Update aosp/master LLVM for rebase to r222494. Change-Id: Ic787f5e0124df789bd26f3f24680f45e678eef2d

commit: 37ed9c199ca639565f6ce88105f9e39e898d82d0 [log] [tgz]
author: Stephen Hines <srhines@google.com> Mon Dec 01 14:51:49 2014 -0800
committer: Stephen Hines <srhines@google.com> Tue Dec 02 16:08:10 2014 -0800
tree: 8fb36d3910e3ee4c4e1b7422f4f017108efc52f5
parent: d2327b22152ced7bc46dc629fc908959e8a52d03 [diff]
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..3186da4
--- /dev/null
+++ b/.clang-tidy

@@ -0,0 +1 @@
+Checks: '-*,clang-diagnostic-*,llvm-*,misc-*'

diff --git a/.gitignore b/.gitignore
index eeebe0d..e806c2c 100644
--- a/.gitignore
+++ b/.gitignore

@@ -49,3 +49,9 @@
 tools/polly
 # Sphinx build tree, if building in-source dir.
 docs/_build
+
+#==============================================================================#
+# Files created in tree by the Go bindings.
+#==============================================================================#
+bindings/go/llvm/llvm_config.go
+bindings/go/llvm/workdir

diff --git a/Android.mk b/Android.mk
index 75dd2a0..52aab8f 100644
--- a/Android.mk
+++ b/Android.mk

@@ -13,7 +13,6 @@
   lib/Bitcode/Writer \
   lib/ExecutionEngine \
   lib/ExecutionEngine/RuntimeDyld \
-  lib/ExecutionEngine/JIT \
   lib/ExecutionEngine/MCJIT \
   lib/ExecutionEngine/Interpreter \
   lib/CodeGen \
@@ -25,7 +24,6 @@
   lib/Linker \
   lib/LTO \
   lib/MC \
-  lib/MC/MCAnalysis \
   lib/MC/MCDisassembler \
   lib/MC/MCParser \
   lib/Object \

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b9fca2a..6691189 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt

@@ -16,8 +16,29 @@
   endif()
 endif()
 
+set(cmake_3_2_USES_TERMINAL)
+if(NOT CMAKE_VERSION VERSION_LESS 3.1.20141117)
+  # Only CMake 3.1.20141117 and later get the USES_TERMINAL keyword.
+  set(cmake_3_2_USES_TERMINAL USES_TERMINAL)
+endif()
+
 project(LLVM)
 
+# The following only works with the Ninja generator in CMake >= 3.0.
+set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
+  "Define the maximum number of concurrent compilation jobs.")
+if(LLVM_PARALLEL_COMPILE_JOBS)
+  set_property(GLOBAL APPEND PROPERTY JOB_POOLS compile_job_pool=${LLVM_PARALLEL_COMPILE_JOBS})
+  set(CMAKE_JOB_POOL_COMPILE compile_job_pool)
+endif()
+
+set(LLVM_PARALLEL_LINK_JOBS "" CACHE STRING
+  "Define the maximum number of concurrent link jobs.")
+if(LLVM_PARALLEL_LINK_JOBS)
+  set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${LLVM_PARALLEL_LINK_JOBS})
+  set(CMAKE_JOB_POOL_LINK link_job_pool)
+endif()
+
 # Add path for custom modules
 set(CMAKE_MODULE_PATH
   ${CMAKE_MODULE_PATH}
@@ -26,7 +47,7 @@
   )
 
 set(LLVM_VERSION_MAJOR 3)
-set(LLVM_VERSION_MINOR 5)
+set(LLVM_VERSION_MINOR 6)
 set(LLVM_VERSION_PATCH 0)
 
 if (NOT PACKAGE_VERSION)
@@ -208,6 +229,7 @@
   option(LLVM_ENABLE_WARNINGS "Enable compiler warnings." ON)
 endif()
 
+option(LLVM_ENABLE_MODULES "Compile with C++ modules enabled." OFF)
 option(LLVM_ENABLE_CXX1Y "Compile with C++1y enabled." OFF)
 option(LLVM_ENABLE_LIBCXX "Use libc++ if available." OFF)
 option(LLVM_ENABLE_PEDANTIC "Compile with pedantic enabled." ON)
@@ -301,6 +323,8 @@
 option (LLVM_BUILD_EXTERNAL_COMPILER_RT
   "Build compiler-rt as an external project." OFF)
 
+option(LLVM_BUILD_LLVM_DYLIB "Build libllvm dynamic library" OFF)
+
 # All options referred to from HandleLLVMOptions have to be specified
 # BEFORE this include, otherwise options will not be correctly set on
 # first cmake run
@@ -461,14 +485,30 @@
     set(CMAKE_INSTALL_RPATH "\$ORIGIN/../lib")
     if (${CMAKE_SYSTEM_NAME} MATCHES FreeBSD)
       set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,origin")
+      set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,origin")
     endif()
   endif(NOT DEFINED CMAKE_INSTALL_RPATH)
 endif()
 
+# Work around a broken bfd ld behavior. When linking a binary with a
+# foo.so library, it will try to find any library that foo.so uses and
+# check its symbols. This is wasteful (the check was done when foo.so
+# was created) and can fail since it is not the dynamic linker and
+# doesn't know how to handle search paths correctly.
+if (UNIX AND NOT APPLE)
+  set(CMAKE_EXE_LINKER_FLAGS
+      "${CMAKE_EXE_LINKER_FLAGS} -Wl,-allow-shlib-undefined")
+endif()
+
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
 include_directories( ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR})
 
+# When cross-compiling, import the executable targets from a file.
+if(CMAKE_CROSSCOMPILING)
+  include(CrossCompile)
+endif()
+
 if( ${CMAKE_SYSTEM_NAME} MATCHES FreeBSD )
   # On FreeBSD, /usr/local/* is not used by default. In order to build LLVM
   # with libxml2, iconv.h, etc., we must add /usr/local paths.

diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index ee91487..071f6c8 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT

@@ -6,7 +6,7 @@
 The list is sorted by surname and formatted to allow easy grepping and
 beautification by scripts.  The fields are: name (N), email (E), web-address
 (W), PGP key ID and fingerprint (P), description (D), and snail-mail address
-(S).
+(S). Each entry should contain at least the (N), (E) and (D) fields.
 
 N: Joe Abbey
 E: jabbey@arxan.com
@@ -38,9 +38,11 @@
 D: Debug Information, autotools/configure/make build, inline assembly
 
 N: Greg Clayton
+E: gclayton@apple.com
 D: LLDB
 
 N: Peter Collingbourne
+E: peter@pcc.me.uk
 D: libclc
 
 N: Anshuman Dasgupta
@@ -49,13 +51,14 @@
 
 N: Hal Finkel
 E: hfinkel@anl.gov
-D: BBVectorize, the loop reroller and the PowerPC target
+D: BBVectorize, the loop reroller, alias analysis and the PowerPC target
 
 N: Venkatraman Govindaraju
 E: venkatra@cs.wisc.edu
 D: Sparc Backend (lib/Target/Sparc/*)
 
 N: Tobias Grosser
+E: tobias@grosser.es
 D: Polly
 
 N: James Grosbach
@@ -70,8 +73,8 @@
 E: jholewinski@nvidia.com
 D: NVPTX Target (lib/Target/NVPTX/*)
 
-N: Andy Kaylor
-E: andrew.kaylor@intel.com
+N: Lang Hames
+E: lhames@gmail.com
 D: MCJIT, RuntimeDyld and JIT event listeners
 
 N: Galina Kistanova
@@ -100,6 +103,7 @@
 D: AArch64 backend
 
 N: Jakob Olesen
+E: stoklund@2pi.dk
 D: Register allocators and TableGen
 
 N: Richard Osborne
@@ -137,7 +141,7 @@
 N: Tom Stellard
 E: thomas.stellard@amd.com
 E: mesa-dev@lists.freedesktop.org
-D: R600 Backend
+D: Release manager for the 3.5 branch, R600 Backend
 
 N: Evgeniy Stepanov
 E: eugenis@google.com

diff --git a/CREDITS.TXT b/CREDITS.TXT
index 0447c40..40d67f4 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT

@@ -119,6 +119,10 @@
 E: hfinkel@anl.gov
 D: Basic-block autovectorization, PowerPC backend improvements
 
+N: Eric Fiselier
+E: eric@efcs.ca
+D: LIT patches and documentation.
+
 N: Ryan Flynn
 E: pizza@parseerror.com
 D: Miscellaneous bug fixes
@@ -281,8 +285,11 @@
 
 N: Bruno Cardoso Lopes
 E: bruno.cardoso@gmail.com
-W: http://www.brunocardoso.org
-D: The Mips backend
+I: bruno
+W: http://brunocardoso.cc
+D: Mips backend
+D: Random ARM integrated assembler and assembly parser improvements
+D: General X86 AVX1 support
 
 N: Duraid Madina
 E: duraid@octopus.com.au
@@ -456,3 +463,4 @@
 N: Bob Wilson
 E: bob.wilson@acm.org
 D: Advanced SIMD (NEON) support in the ARM backend.
+

diff --git a/Makefile.config.in b/Makefile.config.in
index 1c36412..d34a2d5 100644
--- a/Makefile.config.in
+++ b/Makefile.config.in

@@ -202,10 +202,8 @@
 DOXYGEN    := @DOXYGEN@
 GROFF      := @GROFF@
 GZIPBIN    := @GZIPBIN@
-OCAMLC     := @OCAMLC@
-OCAMLOPT   := @OCAMLOPT@
-OCAMLDEP   := @OCAMLDEP@
-OCAMLDOC   := @OCAMLDOC@
+GO         := @GO@
+OCAMLFIND  := @OCAMLFIND@
 GAS        := @GAS@
 POD2HTML   := @POD2HTML@
 POD2MAN    := @POD2MAN@
@@ -217,6 +215,9 @@
 HAVE_PTHREAD := @HAVE_PTHREAD@
 HAVE_TERMINFO := @HAVE_TERMINFO@
 
+HAVE_OCAMLOPT := @HAVE_OCAMLOPT@
+HAVE_OCAML_OUNIT := @HAVE_OCAML_OUNIT@
+
 LIBS       := @LIBS@
 
 # Targets that are possible to build
@@ -259,9 +260,6 @@
 # When ENABLE_CLANG_ARCMT is enabled, clang will have ARCMigrationTool.
 ENABLE_CLANG_ARCMT = @ENABLE_CLANG_ARCMT@
 
-# When ENABLE_CLANG_REWRITER is enabled, clang will have Rewriter.
-ENABLE_CLANG_REWRITER = @ENABLE_CLANG_REWRITER@
-
 # When ENABLE_CLANG_STATIC_ANALYZER is enabled, clang will have StaticAnalyzer.
 ENABLE_CLANG_STATIC_ANALYZER = @ENABLE_CLANG_STATIC_ANALYZER@
 
@@ -373,7 +371,6 @@
 
 # Bindings that we should build
 BINDINGS_TO_BUILD := @BINDINGS_TO_BUILD@
-ALL_BINDINGS      := @ALL_BINDINGS@
 OCAML_LIBDIR      := @OCAML_LIBDIR@
 
 # When compiling under Mingw/Cygwin, executables such as tblgen
@@ -399,6 +396,8 @@
 NO_UNINITIALIZED = @NO_UNINITIALIZED@
 # -Wno-maybe-uninitialized
 NO_MAYBE_UNINITIALIZED = @NO_MAYBE_UNINITIALIZED@
+# -Wno-comment
+NO_COMMENT = @NO_COMMENT@
 
 # Was polly found in tools/polly?
 LLVM_HAS_POLLY = @LLVM_HAS_POLLY@

diff --git a/Makefile.rules b/Makefile.rules
index ebebc0a..c8c971f 100644
--- a/Makefile.rules
+++ b/Makefile.rules

@@ -449,7 +449,6 @@
   endif
 endif
 
-CXX.Flags     += -Woverloaded-virtual
 CPP.BaseFlags += $(CPP.Defines)
 AR.Flags      := cru
 
@@ -680,7 +679,7 @@
 CompileCommonOpts += -Wall -W -Wno-unused-parameter -Wwrite-strings \
                      $(EXTRA_OPTIONS) $(COVERED_SWITCH_DEFAULT) \
                      $(NO_UNINITIALIZED) $(NO_MAYBE_UNINITIALIZED) \
-                     $(NO_MISSING_FIELD_INITIALIZERS)
+                     $(NO_MISSING_FIELD_INITIALIZERS) $(NO_COMMENT)
 # Enable cast-qual for C++; the workaround is to use const_cast.
 CXX.Flags += -Wcast-qual
 
@@ -727,10 +726,6 @@
 CPP.BaseFlags += -include llvm/Support/Solaris.h
 endif
 
-ifeq ($(HOST_OS),AuroraUX)
-CPP.BaseFlags += -include llvm/Support/Solaris.h
-endif # !HOST_OS - AuroraUX.
-
 # On Windows, SharedLibDir != LibDir. The order is important.
 ifeq ($(HOST_OS), $(filter $(HOST_OS), Cygwin MingW))
   LD.Flags    += -L$(SharedLibDir) -L$(LibDir) -L$(LLVMToolDir) -L$(LLVMLibDir)
@@ -1673,18 +1668,13 @@
 $(TARGET:%=$(ObjDir)/%GenMCCodeEmitter.inc.tmp): \
 $(ObjDir)/%GenMCCodeEmitter.inc.tmp: %.td $(ObjDir)/.dir $(LLVM_TBLGEN)
 	$(Echo) "Building $(<F) MC code emitter with tblgen"
-	$(Verb) $(LLVMTableGen) -gen-emitter -mc-emitter -o $(call SYSPATH, $@) $<
+	$(Verb) $(LLVMTableGen) -gen-emitter -o $(call SYSPATH, $@) $<
 
 $(TARGET:%=$(ObjDir)/%GenMCPseudoLowering.inc.tmp): \
 $(ObjDir)/%GenMCPseudoLowering.inc.tmp: %.td $(ObjDir)/.dir $(LLVM_TBLGEN)
 	$(Echo) "Building $(<F) MC Pseudo instruction expander with tblgen"
 	$(Verb) $(LLVMTableGen) -gen-pseudo-lowering -o $(call SYSPATH, $@) $<
 
-$(TARGET:%=$(ObjDir)/%GenCodeEmitter.inc.tmp): \
-$(ObjDir)/%GenCodeEmitter.inc.tmp: %.td $(ObjDir)/.dir $(LLVM_TBLGEN)
-	$(Echo) "Building $(<F) code emitter with tblgen"
-	$(Verb) $(LLVMTableGen) -gen-emitter -o $(call SYSPATH, $@) $<
-
 $(TARGET:%=$(ObjDir)/%GenDAGISel.inc.tmp): \
 $(ObjDir)/%GenDAGISel.inc.tmp : %.td $(ObjDir)/.dir $(LLVM_TBLGEN)
 	$(Echo) "Building $(<F) DAG instruction selector implementation with tblgen"
@@ -1790,7 +1780,7 @@
 # CHECK: Running the test suite
 ###############################################################################
 
-check:: all
+check::
 	$(Verb) if test -d "$(PROJ_OBJ_ROOT)/test" ; then \
 	  if test -f "$(PROJ_OBJ_ROOT)/test/Makefile" ; then \
 	    $(EchoCmd) Running test suite ; \

diff --git a/autoconf/config.sub b/autoconf/config.sub
index a8d8528..673d62b 100755
--- a/autoconf/config.sub
+++ b/autoconf/config.sub

@@ -251,7 +251,7 @@
 	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
 	| am33_2.0 \
 	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
-   | aarch64 \
+   | aarch64 | aarch64_be \
    | be32 | be64 \
 	| bfin \
 	| c4x | clipper \
@@ -360,7 +360,7 @@
 	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
 	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
 	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
-   | aarch64-* \
+   | aarch64-* | aarch64_be-* \
 	| avr-* | avr32-* \
 	| be32-* | be64-* \
 	| bfin-* | bs2000-* \

diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index a1c2ac5..ca6d710 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac

@@ -32,16 +32,17 @@
 dnl Initialize autoconf and define the package name, version number and
 dnl address for reporting bugs.
 
-AC_INIT([LLVM],[3.5.0svn],[http://llvm.org/bugs/])
+AC_INIT([LLVM],[3.6.0svn],[http://llvm.org/bugs/])
 
 LLVM_VERSION_MAJOR=3
-LLVM_VERSION_MINOR=5
+LLVM_VERSION_MINOR=6
 LLVM_VERSION_PATCH=0
 LLVM_VERSION_SUFFIX=svn
 
 AC_DEFINE_UNQUOTED([LLVM_VERSION_MAJOR], $LLVM_VERSION_MAJOR, [Major version of the LLVM API])
 AC_DEFINE_UNQUOTED([LLVM_VERSION_MINOR], $LLVM_VERSION_MINOR, [Minor version of the LLVM API])
 AC_DEFINE_UNQUOTED([LLVM_VERSION_PATCH], $LLVM_VERSION_PATCH, [Patch version of the LLVM API])
+AC_DEFINE_UNQUOTED([LLVM_VERSION_STRING], "$PACKAGE_VERSION", [LLVM version string])
 
 AC_SUBST([LLVM_VERSION_MAJOR])
 AC_SUBST([LLVM_VERSION_MINOR])
@@ -291,11 +292,6 @@
     llvm_cv_no_link_all_option="-Wl,-z,defaultextract"
     llvm_cv_os_type="SunOS"
     llvm_cv_platform_type="Unix" ;;
-  *-*-auroraux*)
-    llvm_cv_link_all_option="-Wl,-z,allextract"
-    llvm_cv_link_all_option="-Wl,-z,defaultextract"
-    llvm_cv_os_type="AuroraUX"
-    llvm_cv_platform_type="Unix" ;;
   *-*-win32*)
     llvm_cv_link_all_option="-Wl,--whole-archive"
     llvm_cv_no_link_all_option="-Wl,--no-whole-archive"
@@ -361,8 +357,6 @@
     llvm_cv_target_os_type="GNU" ;;
   *-*-solaris*)
     llvm_cv_target_os_type="SunOS" ;;
-  *-*-auroraux*)
-    llvm_cv_target_os_type="AuroraUX" ;;
   *-*-win32*)
     llvm_cv_target_os_type="Win32" ;;
   *-*-mingw*)
@@ -665,36 +659,16 @@
                              enableval="yes")
 case "$enableval" in
   yes) AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[1]) ;;
-  no)  
+  no)
     if test ${clang_arcmt} != "no" ; then
       AC_MSG_ERROR([Cannot enable clang ARC Migration Tool while disabling static analyzer.])
     fi
-    AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[0]) 
+    AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[0])
     ;;
   default) AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[1]);;
   *) AC_MSG_ERROR([Invalid setting for --enable-clang-static-analyzer. Use "yes" or "no"]) ;;
 esac
 
-dnl --enable-clang-rewriter: check whether to enable clang rewriter
-AC_ARG_ENABLE(clang-rewriter,
-              AS_HELP_STRING([--enable-clang-rewriter],
-                             [Enable building of clang rewriter (default is YES)]),,
-                             enableval="yes")
-case "$enableval" in
-  yes) AC_SUBST(ENABLE_CLANG_REWRITER,[1]) ;;
-  no)  
-    if test ${clang_arcmt} != "no" ; then
-      AC_MSG_ERROR([Cannot enable clang ARC Migration Tool while disabling rewriter.])
-    fi
-    if test ${clang_static_analyzer} != "no" ; then
-      AC_MSG_ERROR([Cannot enable clang static analyzer while disabling rewriter.])
-    fi
-    AC_SUBST(ENABLE_CLANG_REWRITER,[0]) 
-    ;;
-  default) AC_SUBST(ENABLE_CLANG_REWRITER,[1]);;
-  *) AC_MSG_ERROR([Invalid setting for --enable-clang-rewriter. Use "yes" or "no"]) ;;
-esac
-
 dnl --enable-optimized : check whether they want to do an optimized build:
 AC_ARG_ENABLE(optimized, AS_HELP_STRING(
  --enable-optimized,[Compile with optimizations enabled (default is NO)]),,enableval=$optimize)
@@ -1314,10 +1288,8 @@
 AC_PATH_PROG(GZIPBIN, [gzip])
 AC_PATH_PROG(PDFROFF, [pdfroff])
 AC_PATH_PROG(ZIP, [zip])
-AC_PATH_PROGS(OCAMLC, [ocamlc])
-AC_PATH_PROGS(OCAMLOPT, [ocamlopt])
-AC_PATH_PROGS(OCAMLDEP, [ocamldep])
-AC_PATH_PROGS(OCAMLDOC, [ocamldoc])
+AC_PATH_PROG(GO, [go])
+AC_PATH_PROGS(OCAMLFIND, [ocamlfind])
 AC_PATH_PROGS(GAS, [gas as])
 
 dnl Get the version of the linker in use.
@@ -1415,7 +1387,27 @@
     CXX_FLAG_CHECK(NO_UNINITIALIZED, [-Wno-uninitialized])
   fi
 fi
-AC_MSG_RESULT([$NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $COVERED_SWITCH_DEFAULT $NO_UNINITIALIZED $NO_MAYBE_UNINITIALIZED])
+
+dnl Check for misbehaving -Wcomment (gcc-4.7 has this) and maybe add
+dnl -Wno-comment to the flags.
+no_comment=
+llvm_cv_old_cxxflags="$CXXFLAGS"
+CXXFLAGS="$CXXFLAGS -Wcomment -Werror"
+AC_COMPILE_IFELSE(
+[
+  AC_LANG_SOURCE([[// Comment \o\
+// Another comment
+int main() { return 0; }
+  ]])
+],
+[
+  no_comment=-Wno-comment
+],
+[])
+AC_SUBST(NO_COMMENT, [$no_comment])
+CXXFLAGS="$llvm_cv_old_cxxflags"
+
+AC_MSG_RESULT([$NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $COVERED_SWITCH_DEFAULT $NO_UNINITIALIZED $NO_MAYBE_UNINITIALIZED $NO_COMMENT])
 
 AC_ARG_WITH([python],
             [AS_HELP_STRING([--with-python], [path to python])],
@@ -1517,25 +1509,6 @@
   AC_CHECK_LIB(z, compress2)
 fi
 
-dnl Allow extra x86-disassembler library
-AC_ARG_WITH(udis86,
-  AS_HELP_STRING([--with-udis86=<path>],
-    [Use udis86 external x86 disassembler library]),
-    [
-      AC_SUBST(USE_UDIS86, [1])
-      case "$withval" in
-        /usr/lib|yes) ;;
-        *) LDFLAGS="$LDFLAGS -L${withval}" ;;
-      esac
-      AC_CHECK_LIB(udis86, ud_init, [], [
-        echo "Error! You need to have libudis86 around."
-        exit -1
-      ])
-    ],
-    AC_SUBST(USE_UDIS86, [0]))
-AC_DEFINE_UNQUOTED([USE_UDIS86],$USE_UDIS86,
-                   [Define if use udis86 library])
-
 dnl Allow OProfile support for JIT output.
 AC_ARG_WITH(oprofile,
   AS_HELP_STRING([--with-oprofile=<prefix>],
@@ -1569,7 +1542,7 @@
           fi ;;
         *)
           AC_MSG_ERROR([OProfile support is available on Linux only.]) ;;
-      esac 
+      esac
     ],
     [
       AC_SUBST(USE_OPROFILE, [0])
@@ -1637,8 +1610,12 @@
 AC_HEADER_TIME
 
 AC_LANG_PUSH([C++])
-AC_CHECK_HEADERS([cxxabi.h])
+dnl size_t must be defined before including cxxabi.h on FreeBSD 10.0.
+AC_CHECK_HEADERS([cxxabi.h], [], [],
+[#include <stddef.h>
+])
 AC_LANG_POP([C++])
+
 AC_CHECK_HEADERS([dlfcn.h execinfo.h fcntl.h inttypes.h link.h])
 AC_CHECK_HEADERS([malloc.h setjmp.h signal.h stdint.h termios.h unistd.h])
 AC_CHECK_HEADERS([utime.h])
@@ -1910,37 +1887,52 @@
 dnl Determine which bindings to build.
 if test "$BINDINGS_TO_BUILD" = auto ; then
   BINDINGS_TO_BUILD=""
-  if test "x$OCAMLC" != x -a "x$OCAMLDEP" != x ; then
+  if test "x$OCAMLFIND" != x ; then
     BINDINGS_TO_BUILD="ocaml $BINDINGS_TO_BUILD"
   fi
+  if test "x$GO" != x ; then
+    if $GO run ${srcdir}/bindings/go/conftest.go ; then
+      BINDINGS_TO_BUILD="go $BINDINGS_TO_BUILD"
+    fi
+  fi
 fi
 AC_SUBST(BINDINGS_TO_BUILD,$BINDINGS_TO_BUILD)
 
-dnl This isn't really configurey, but it avoids having to repeat the list in
-dnl other files.
-AC_SUBST(ALL_BINDINGS,ocaml)
-
 dnl Do any work necessary to ensure that bindings have what they need.
 binding_prereqs_failed=0
 for a_binding in $BINDINGS_TO_BUILD ; do
   case "$a_binding" in
   ocaml)
-    if test "x$OCAMLC" = x ; then
-      AC_MSG_WARN([--enable-bindings=ocaml specified, but ocamlc not found. Try configure OCAMLC=/path/to/ocamlc])
+    if test "x$OCAMLFIND" = x ; then
+      AC_MSG_WARN([--enable-bindings=ocaml specified, but ocamlfind not found. Try configure OCAMLFIND=/path/to/ocamlfind])
       binding_prereqs_failed=1
     fi
-    if test "x$OCAMLDEP" = x ; then
-      AC_MSG_WARN([--enable-bindings=ocaml specified, but ocamldep not found. Try configure OCAMLDEP=/path/to/ocamldep])
+
+    if $OCAMLFIND opt -version >/dev/null 2>/dev/null ; then
+      HAVE_OCAMLOPT=1
+    else
+      HAVE_OCAMLOPT=0
+    fi
+    AC_SUBST(HAVE_OCAMLOPT)
+
+    if ! $OCAMLFIND query ctypes >/dev/null 2>/dev/null; then
+      AC_MSG_WARN([--enable-bindings=ocaml specified, but ctypes is not installed])
       binding_prereqs_failed=1
     fi
-    if test "x$OCAMLOPT" = x ; then
-      AC_MSG_WARN([--enable-bindings=ocaml specified, but ocamlopt not found. Try configure OCAMLOPT=/path/to/ocamlopt])
-      dnl ocamlopt is optional!
+
+    if $OCAMLFIND query oUnit >/dev/null 2>/dev/null; then
+      HAVE_OCAML_OUNIT=1
+    else
+      HAVE_OCAML_OUNIT=0
+      AC_MSG_WARN([--enable-bindings=ocaml specified, but OUnit 2 is not installed. Tests will not run])
+      dnl oUnit is optional!
     fi
+    AC_SUBST(HAVE_OCAML_OUNIT)
+
     if test "x$with_ocaml_libdir" != xauto ; then
       AC_SUBST(OCAML_LIBDIR,$with_ocaml_libdir)
     else
-      ocaml_stdlib="`"$OCAMLC" -where`"
+      ocaml_stdlib="`"$OCAMLFIND" ocamlc -where`"
       if test "$LLVM_PREFIX" '<' "$ocaml_stdlib" -a "$ocaml_stdlib" '<' "$LLVM_PREFIX~"
       then
         # ocaml stdlib is beneath our prefix; use stdlib
@@ -1951,6 +1943,19 @@
       fi
     fi
     ;;
+  go)
+    if test "x$GO" = x ; then
+      AC_MSG_WARN([--enable-bindings=go specified, but go not found. Try configure GO=/path/to/go])
+      binding_prereqs_failed=1
+    else
+      if $GO run ${srcdir}/bindings/go/conftest.go ; then
+        :
+      else
+        AC_MSG_WARN([--enable-bindings=go specified, but need at least Go 1.2. Try configure GO=/path/to/go])
+        binding_prereqs_failed=1
+      fi
+    fi
+    ;;
   esac
 done
 if test "$binding_prereqs_failed" = 1 ; then

diff --git a/bindings/Makefile b/bindings/Makefile
index c545b28..70e9e6c 100644
--- a/bindings/Makefile
+++ b/bindings/Makefile

@@ -11,6 +11,10 @@
 
 include $(LEVEL)/Makefile.config
 
-PARALLEL_DIRS = $(BINDINGS_TO_BUILD)
+PARALLEL_DIRS =
+
+ifneq (,$(filter ocaml,$(BINDINGS_TO_BUILD)))
+PARALLEL_DIRS += ocaml
+endif
 
 include $(LEVEL)/Makefile.common

diff --git a/bindings/go/README.txt b/bindings/go/README.txt
new file mode 100644
index 0000000..2fc4afa
--- /dev/null
+++ b/bindings/go/README.txt

@@ -0,0 +1,53 @@
+This directory contains LLVM bindings for the Go programming language
+(http://golang.org).
+
+Prerequisites
+-------------
+
+* Go 1.2+.
+* CMake (to build LLVM).
+
+Using the bindings
+------------------
+
+The package path "llvm.org/llvm/bindings/go/llvm" can be used to
+import the latest development version of LLVM from SVN. Paths such as
+"llvm.org/llvm.v36/bindings/go/llvm" refer to released versions of LLVM.
+
+It is recommended to use the "-d" flag with "go get" to download the
+package or a dependency, as an additional step is required to build LLVM
+(see "Building LLVM" below).
+
+Building LLVM
+-------------
+
+The script "build.sh" in this directory can be used to build LLVM and prepare
+it to be used by the bindings. If you receive an error message from "go build"
+like this:
+
+    ./analysis.go:4:84: fatal error: llvm-c/Analysis.h: No such file or directory
+     #include <llvm-c/Analysis.h> // If you are getting an error here read bindings/go/README.txt
+
+or like this:
+
+    ./llvm_dep.go:5: undefined: run_build_sh
+
+it means that LLVM needs to be built or updated by running the script.
+
+    $ $GOPATH/src/llvm.org/llvm/bindings/go/build.sh
+
+Any command line arguments supplied to the script are passed to LLVM's CMake
+build system. A good set of arguments to use during development is:
+
+    $ $GOPATH/src/llvm.org/llvm/bindings/go/build.sh -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD=host -DBUILD_SHARED_LIBS=ON
+
+Note that CMake keeps a cache of build settings so once you have built
+LLVM there is no need to pass these arguments again after updating.
+
+Alternatively, you can build LLVM yourself, but you must then set the
+CGO_CPPFLAGS, CGO_CXXFLAGS and CGO_LDFLAGS environment variables:
+
+    $ export CGO_CPPFLAGS="`/path/to/llvm-build/bin/llvm-config --cppflags`"
+    $ export CGO_CXXFLAGS=-std=c++11
+    $ export CGO_LDFLAGS="`/path/to/llvm-build/bin/llvm-config --ldflags --libs --system-libs all`"
+    $ go build -tags byollvm

diff --git a/bindings/go/build.sh b/bindings/go/build.sh
new file mode 100755
index 0000000..3177852
--- /dev/null
+++ b/bindings/go/build.sh

@@ -0,0 +1,35 @@
+#!/bin/sh -xe
+
+# Builds LLVM in llvm/workdir/llvm_build (relative to this script) and then
+# writes llvm/llvm_config.go using the llvm-go tool from that build. All
+# command line arguments are forwarded to CMake unchanged.
+
+gollvmdir=$(dirname "$0")/llvm
+
+workdir=$gollvmdir/workdir
+llvm_builddir=$workdir/llvm_build
+
+mkdir -p "$llvm_builddir"
+
+# LLVM source root, relative to $llvm_builddir.
+cmake_srcdir="../../../../.."
+llvm_config="$llvm_builddir/bin/llvm-config"
+llvm_go="$llvm_builddir/bin/llvm-go"
+
+if command -v ninja >/dev/null 2>&1 ; then
+  # If Ninja is available, we can speed up the build by building only the
+  # required subset of LLVM.
+  # "$@" (not a flattened string) keeps arguments containing spaces intact.
+  (cd "$llvm_builddir" && cmake -G Ninja "$cmake_srcdir" "$@")
+  ninja -C "$llvm_builddir" llvm-config llvm-go
+  llvm_components="$("$llvm_go" print-components)"
+  # $llvm_components and $llvm_buildtargets are deliberately unquoted: each
+  # holds a whitespace-separated list that must be split into arguments.
+  llvm_buildtargets="$("$llvm_config" --libs $llvm_components | sed -e 's/-l//g')"
+  ninja -C "$llvm_builddir" $llvm_buildtargets FileCheck
+else
+  (cd "$llvm_builddir" && cmake "$cmake_srcdir" "$@")
+  make -C "$llvm_builddir" -j4
+fi
+
+"$llvm_go" print-config > "$gollvmdir/llvm_config.go"

diff --git a/bindings/go/conftest.go b/bindings/go/conftest.go
new file mode 100644
index 0000000..d97fb89
--- /dev/null
+++ b/bindings/go/conftest.go

@@ -0,0 +1,19 @@
+package main
+
+import (
+	"go/build"
+	"os"
+)
+
+// main reports, through its exit status, whether the Go toolchain is at
+// least version 1.2: 0 if "go1.2" is among the release tags, 1 otherwise.
+func main() {
+	status := 1
+	for _, releaseTag := range build.Default.ReleaseTags {
+		if releaseTag == "go1.2" {
+			status = 0
+			break
+		}
+	}
+	os.Exit(status)
+}

diff --git a/bindings/go/llvm/DIBuilderBindings.cpp b/bindings/go/llvm/DIBuilderBindings.cpp
new file mode 100644
index 0000000..94fa96f
--- /dev/null
+++ b/bindings/go/llvm/DIBuilderBindings.cpp

@@ -0,0 +1,222 @@
+//===- DIBuilderBindings.cpp - Bindings for DIBuilder ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines C bindings for the DIBuilder class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DIBuilderBindings.h"
+
+#include "llvm/IR/Module.h"
+#include "llvm/IR/DIBuilder.h"
+
+using namespace llvm;
+
+namespace {
+template <typename T>
+T unwrapDI(LLVMValueRef v) {
+  return v ? T(unwrap<MDNode>(v)) : T();
+}
+}
+
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(DIBuilder, LLVMDIBuilderRef)
+
+LLVMDIBuilderRef LLVMNewDIBuilder(LLVMModuleRef mref) {
+  Module *m = unwrap(mref);
+  return wrap(new DIBuilder(*m));
+}
+
+void LLVMDIBuilderDestroy(LLVMDIBuilderRef dref) {
+  DIBuilder *d = unwrap(dref);
+  delete d;
+}
+
+void LLVMDIBuilderFinalize(LLVMDIBuilderRef dref) { unwrap(dref)->finalize(); }
+
+LLVMValueRef LLVMDIBuilderCreateCompileUnit(LLVMDIBuilderRef Dref,
+                                            unsigned Lang, const char *File,
+                                            const char *Dir,
+                                            const char *Producer, int Optimized,
+                                            const char *Flags,
+                                            unsigned RuntimeVersion) {
+  DIBuilder *D = unwrap(Dref);
+  DICompileUnit CU = D->createCompileUnit(Lang, File, Dir, Producer, Optimized,
+                                          Flags, RuntimeVersion);
+  return wrap(CU);
+}
+
+LLVMValueRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef Dref, const char *File,
+                                     const char *Dir) {
+  DIBuilder *D = unwrap(Dref);
+  DIFile F = D->createFile(File, Dir);
+  return wrap(F);
+}
+
+LLVMValueRef LLVMDIBuilderCreateLexicalBlock(LLVMDIBuilderRef Dref,
+                                             LLVMValueRef Scope,
+                                             LLVMValueRef File, unsigned Line,
+                                             unsigned Column) {
+  DIBuilder *D = unwrap(Dref);
+  DILexicalBlock LB = D->createLexicalBlock(
+      unwrapDI<DIDescriptor>(Scope), unwrapDI<DIFile>(File), Line, Column);
+  return wrap(LB);
+}
+
+LLVMValueRef LLVMDIBuilderCreateLexicalBlockFile(LLVMDIBuilderRef Dref,
+                                                 LLVMValueRef Scope,
+                                                 LLVMValueRef File,
+                                                 unsigned Discriminator) {
+  DIBuilder *D = unwrap(Dref);
+  DILexicalBlockFile LBF = D->createLexicalBlockFile(
+      unwrapDI<DIDescriptor>(Scope), unwrapDI<DIFile>(File), Discriminator);
+  return wrap(LBF);
+}
+
+LLVMValueRef LLVMDIBuilderCreateFunction(
+    LLVMDIBuilderRef Dref, LLVMValueRef Scope, const char *Name,
+    const char *LinkageName, LLVMValueRef File, unsigned Line,
+    LLVMValueRef CompositeType, int IsLocalToUnit, int IsDefinition,
+    unsigned ScopeLine, unsigned Flags, int IsOptimized, LLVMValueRef Func) {
+  DIBuilder *D = unwrap(Dref);
+  DISubprogram SP = D->createFunction(
+      unwrapDI<DIDescriptor>(Scope), Name, LinkageName, unwrapDI<DIFile>(File),
+      Line, unwrapDI<DICompositeType>(CompositeType), IsLocalToUnit,
+      IsDefinition, ScopeLine, Flags, IsOptimized, unwrap<Function>(Func));
+  return wrap(SP);
+}
+
+LLVMValueRef LLVMDIBuilderCreateLocalVariable(
+    LLVMDIBuilderRef Dref, unsigned Tag, LLVMValueRef Scope, const char *Name,
+    LLVMValueRef File, unsigned Line, LLVMValueRef Ty, int AlwaysPreserve,
+    unsigned Flags, unsigned ArgNo) {
+  DIBuilder *D = unwrap(Dref);
+  DIVariable V = D->createLocalVariable(
+      Tag, unwrapDI<DIDescriptor>(Scope), Name, unwrapDI<DIFile>(File), Line,
+      unwrapDI<DIType>(Ty), AlwaysPreserve, Flags, ArgNo);
+  return wrap(V);
+}
+
+LLVMValueRef LLVMDIBuilderCreateBasicType(LLVMDIBuilderRef Dref,
+                                          const char *Name, uint64_t SizeInBits,
+                                          uint64_t AlignInBits,
+                                          unsigned Encoding) {
+  DIBuilder *D = unwrap(Dref);
+  DIBasicType T = D->createBasicType(Name, SizeInBits, AlignInBits, Encoding);
+  return wrap(T);
+}
+
+LLVMValueRef LLVMDIBuilderCreatePointerType(LLVMDIBuilderRef Dref,
+                                            LLVMValueRef PointeeType,
+                                            uint64_t SizeInBits,
+                                            uint64_t AlignInBits,
+                                            const char *Name) {
+  DIBuilder *D = unwrap(Dref);
+  DIDerivedType T = D->createPointerType(unwrapDI<DIType>(PointeeType),
+                                         SizeInBits, AlignInBits, Name);
+  return wrap(T);
+}
+
+LLVMValueRef LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef Dref,
+                                               LLVMValueRef File,
+                                               LLVMValueRef ParameterTypes) {
+  DIBuilder *D = unwrap(Dref);
+  DICompositeType CT = D->createSubroutineType(
+      unwrapDI<DIFile>(File), unwrapDI<DITypeArray>(ParameterTypes));
+  return wrap(CT);
+}
+
+LLVMValueRef LLVMDIBuilderCreateStructType(
+    LLVMDIBuilderRef Dref, LLVMValueRef Scope, const char *Name,
+    LLVMValueRef File, unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits,
+    unsigned Flags, LLVMValueRef DerivedFrom, LLVMValueRef ElementTypes) {
+  DIBuilder *D = unwrap(Dref);
+  DICompositeType CT = D->createStructType(
+      unwrapDI<DIDescriptor>(Scope), Name, unwrapDI<DIFile>(File), Line,
+      SizeInBits, AlignInBits, Flags, unwrapDI<DIType>(DerivedFrom),
+      unwrapDI<DIArray>(ElementTypes));
+  return wrap(CT);
+}
+
+LLVMValueRef LLVMDIBuilderCreateMemberType(
+    LLVMDIBuilderRef Dref, LLVMValueRef Scope, const char *Name,
+    LLVMValueRef File, unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits,
+    uint64_t OffsetInBits, unsigned Flags, LLVMValueRef Ty) {
+  DIBuilder *D = unwrap(Dref);
+  DIDerivedType DT = D->createMemberType(
+      unwrapDI<DIDescriptor>(Scope), Name, unwrapDI<DIFile>(File), Line,
+      SizeInBits, AlignInBits, OffsetInBits, Flags, unwrapDI<DIType>(Ty));
+  return wrap(DT);
+}
+
+LLVMValueRef LLVMDIBuilderCreateArrayType(LLVMDIBuilderRef Dref,
+                                          uint64_t SizeInBits,
+                                          uint64_t AlignInBits,
+                                          LLVMValueRef ElementType,
+                                          LLVMValueRef Subscripts) {
+  DIBuilder *D = unwrap(Dref);
+  DICompositeType CT =
+      D->createArrayType(SizeInBits, AlignInBits, unwrapDI<DIType>(ElementType),
+                         unwrapDI<DIArray>(Subscripts));
+  return wrap(CT);
+}
+
+LLVMValueRef LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef Dref, LLVMValueRef Ty,
+                                        const char *Name, LLVMValueRef File,
+                                        unsigned Line, LLVMValueRef Context) {
+  DIBuilder *D = unwrap(Dref);
+  DIDerivedType DT =
+      D->createTypedef(unwrapDI<DIType>(Ty), Name, unwrapDI<DIFile>(File), Line,
+                       unwrapDI<DIDescriptor>(Context));
+  return wrap(DT);
+}
+
+LLVMValueRef LLVMDIBuilderGetOrCreateSubrange(LLVMDIBuilderRef Dref, int64_t Lo,
+                                              int64_t Count) {
+  DIBuilder *D = unwrap(Dref);
+  DISubrange S = D->getOrCreateSubrange(Lo, Count);
+  return wrap(S);
+}
+
+LLVMValueRef LLVMDIBuilderGetOrCreateArray(LLVMDIBuilderRef Dref,
+                                           LLVMValueRef *Data, size_t Length) {
+  DIBuilder *D = unwrap(Dref);
+  Value **DataValue = unwrap(Data);
+  ArrayRef<Value *> Elements(DataValue, Length);
+  DIArray A = D->getOrCreateArray(Elements);
+  return wrap(A);
+}
+
+LLVMValueRef LLVMDIBuilderGetOrCreateTypeArray(LLVMDIBuilderRef Dref,
+                                               LLVMValueRef *Data,
+                                               size_t Length) {
+  DIBuilder *D = unwrap(Dref);
+  Value **DataValue = unwrap(Data);
+  ArrayRef<Value *> Elements(DataValue, Length);
+  DITypeArray A = D->getOrCreateTypeArray(Elements);
+  return wrap(A);
+}
+
+LLVMValueRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Dref, int64_t *Addr,
+                                           size_t Length) {
+  DIBuilder *D = unwrap(Dref);
+  DIExpression Expr = D->createExpression(ArrayRef<int64_t>(Addr, Length));
+  return wrap(Expr);
+}
+
+LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef Dref,
+                                             LLVMValueRef Storage,
+                                             LLVMValueRef VarInfo,
+                                             LLVMValueRef Expr,
+                                             LLVMBasicBlockRef Block) {
+  DIBuilder *D = unwrap(Dref);
+  Instruction *Instr =
+      D->insertDeclare(unwrap(Storage), unwrapDI<DIVariable>(VarInfo),
+                       unwrapDI<DIExpression>(Expr), unwrap(Block));
+  return wrap(Instr);
+}

diff --git a/bindings/go/llvm/DIBuilderBindings.h b/bindings/go/llvm/DIBuilderBindings.h
new file mode 100644
index 0000000..e6fe02a
--- /dev/null
+++ b/bindings/go/llvm/DIBuilderBindings.h

@@ -0,0 +1,123 @@
+//===- DIBuilderBindings.h - Bindings for DIBuilder -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines C bindings for the DIBuilder class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BINDINGS_GO_LLVM_DIBUILDERBINDINGS_H
+#define LLVM_BINDINGS_GO_LLVM_DIBUILDERBINDINGS_H
+
+#include "llvm-c/Core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// FIXME: These bindings shouldn't be Go-specific and should eventually move to
+// a (somewhat) less stable collection of C APIs for use in creating bindings of
+// LLVM in other languages.
+
+typedef struct LLVMOpaqueDIBuilder *LLVMDIBuilderRef;
+
+LLVMDIBuilderRef LLVMNewDIBuilder(LLVMModuleRef m);
+
+void LLVMDIBuilderDestroy(LLVMDIBuilderRef d);
+void LLVMDIBuilderFinalize(LLVMDIBuilderRef d);
+
+LLVMValueRef LLVMDIBuilderCreateCompileUnit(LLVMDIBuilderRef D,
+                                            unsigned Language, const char *File,
+                                            const char *Dir,
+                                            const char *Producer, int Optimized,
+                                            const char *Flags,
+                                            unsigned RuntimeVersion);
+
+LLVMValueRef LLVMDIBuilderCreateFile(LLVMDIBuilderRef D, const char *File,
+                                     const char *Dir);
+
+LLVMValueRef LLVMDIBuilderCreateLexicalBlock(LLVMDIBuilderRef D,
+                                             LLVMValueRef Scope,
+                                             LLVMValueRef File, unsigned Line,
+                                             unsigned Column);
+
+LLVMValueRef LLVMDIBuilderCreateLexicalBlockFile(LLVMDIBuilderRef D,
+                                                 LLVMValueRef Scope,
+                                                 LLVMValueRef File,
+                                                 unsigned Discriminator);
+
+LLVMValueRef LLVMDIBuilderCreateFunction(
+    LLVMDIBuilderRef D, LLVMValueRef Scope, const char *Name,
+    const char *LinkageName, LLVMValueRef File, unsigned Line,
+    LLVMValueRef CompositeType, int IsLocalToUnit, int IsDefinition,
+    unsigned ScopeLine, unsigned Flags, int IsOptimized, LLVMValueRef Function);
+
+LLVMValueRef LLVMDIBuilderCreateLocalVariable(
+    LLVMDIBuilderRef D, unsigned Tag, LLVMValueRef Scope, const char *Name,
+    LLVMValueRef File, unsigned Line, LLVMValueRef Ty, int AlwaysPreserve,
+    unsigned Flags, unsigned ArgNo);
+
+LLVMValueRef LLVMDIBuilderCreateBasicType(LLVMDIBuilderRef D, const char *Name,
+                                          uint64_t SizeInBits,
+                                          uint64_t AlignInBits,
+                                          unsigned Encoding);
+
+LLVMValueRef LLVMDIBuilderCreatePointerType(LLVMDIBuilderRef D,
+                                            LLVMValueRef PointeeType,
+                                            uint64_t SizeInBits,
+                                            uint64_t AlignInBits,
+                                            const char *Name);
+
+LLVMValueRef LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef D,
+                                               LLVMValueRef File,
+                                               LLVMValueRef ParameterTypes);
+
+LLVMValueRef LLVMDIBuilderCreateStructType(
+    LLVMDIBuilderRef D, LLVMValueRef Scope, const char *Name, LLVMValueRef File,
+    unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits, unsigned Flags,
+    LLVMValueRef DerivedFrom, LLVMValueRef ElementTypes);
+
+LLVMValueRef LLVMDIBuilderCreateMemberType(
+    LLVMDIBuilderRef D, LLVMValueRef Scope, const char *Name, LLVMValueRef File,
+    unsigned Line, uint64_t SizeInBits, uint64_t AlignInBits,
+    uint64_t OffsetInBits, unsigned Flags, LLVMValueRef Ty);
+
+LLVMValueRef LLVMDIBuilderCreateArrayType(LLVMDIBuilderRef D,
+                                          uint64_t SizeInBits,
+                                          uint64_t AlignInBits,
+                                          LLVMValueRef ElementType,
+                                          LLVMValueRef Subscripts);
+
+LLVMValueRef LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef D, LLVMValueRef Ty,
+                                        const char *Name, LLVMValueRef File,
+                                        unsigned Line, LLVMValueRef Context);
+
+LLVMValueRef LLVMDIBuilderGetOrCreateSubrange(LLVMDIBuilderRef D, int64_t Lo,
+                                              int64_t Count);
+
+LLVMValueRef LLVMDIBuilderGetOrCreateArray(LLVMDIBuilderRef D,
+                                           LLVMValueRef *Data, size_t Length);
+
+LLVMValueRef LLVMDIBuilderGetOrCreateTypeArray(LLVMDIBuilderRef D,
+                                               LLVMValueRef *Data,
+                                               size_t Length);
+
+// Creates an expression node from Length int64_t values starting at Addr.
+LLVMValueRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef D, int64_t *Addr,
+                                           size_t Length);
+
+LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef D,
+                                             LLVMValueRef Storage,
+                                             LLVMValueRef VarInfo,
+                                             LLVMValueRef Expr,
+                                             LLVMBasicBlockRef Block);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif

diff --git a/bindings/go/llvm/IRBindings.cpp b/bindings/go/llvm/IRBindings.cpp
new file mode 100644
index 0000000..67a54a2
--- /dev/null
+++ b/bindings/go/llvm/IRBindings.cpp

@@ -0,0 +1,47 @@
+//===- IRBindings.cpp - Additional bindings for ir ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines additional C bindings for the ir component.
+//
+//===----------------------------------------------------------------------===//
+
+#include "IRBindings.h"
+
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+// Adds the attributes encoded in PA to the function-index attributes of Fn.
+void LLVMAddFunctionAttr2(LLVMValueRef Fn, uint64_t PA) {
+  Function *F = unwrap<Function>(Fn);
+  LLVMContext &Ctx = F->getContext();
+  AttrBuilder Builder(PA);
+  const AttributeSet Added =
+      AttributeSet::get(Ctx, AttributeSet::FunctionIndex, Builder);
+  F->setAttributes(F->getAttributes().addAttributes(
+      Ctx, AttributeSet::FunctionIndex, Added));
+}
+
+// Returns the raw 64-bit value of Fn's function-index attributes.
+uint64_t LLVMGetFunctionAttr2(LLVMValueRef Fn) {
+  return unwrap<Function>(Fn)->getAttributes().Raw(
+      AttributeSet::FunctionIndex);
+}
+
+// Removes the attributes encoded in PA from the function-index attributes of
+// Fn.
+void LLVMRemoveFunctionAttr2(LLVMValueRef Fn, uint64_t PA) {
+  Function *F = unwrap<Function>(Fn);
+  LLVMContext &Ctx = F->getContext();
+  AttrBuilder Builder(PA);
+  const AttributeSet Dropped =
+      AttributeSet::get(Ctx, AttributeSet::FunctionIndex, Builder);
+  F->setAttributes(F->getAttributes().removeAttributes(
+      Ctx, AttributeSet::FunctionIndex, Dropped));
+}

diff --git a/bindings/go/llvm/IRBindings.h b/bindings/go/llvm/IRBindings.h
new file mode 100644
index 0000000..cc63e4e
--- /dev/null
+++ b/bindings/go/llvm/IRBindings.h

@@ -0,0 +1,37 @@
+//===- IRBindings.h - Additional bindings for IR ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines additional C bindings for the IR component.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BINDINGS_GO_LLVM_IRBINDINGS_H
+#define LLVM_BINDINGS_GO_LLVM_IRBINDINGS_H
+
+#include "llvm-c/Core.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// These functions duplicate the LLVM*FunctionAttr functions in the stable C
+// API. We cannot use the existing functions because they take 32-bit attribute
+// values, and the Go bindings expose all of the LLVM attributes, some of which
+// have values >= 1<<32.
+
+void LLVMAddFunctionAttr2(LLVMValueRef Fn, uint64_t PA);
+uint64_t LLVMGetFunctionAttr2(LLVMValueRef Fn);
+void LLVMRemoveFunctionAttr2(LLVMValueRef Fn, uint64_t PA);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

diff --git a/bindings/go/llvm/InstrumentationBindings.cpp b/bindings/go/llvm/InstrumentationBindings.cpp
new file mode 100644
index 0000000..b604abb
--- /dev/null
+++ b/bindings/go/llvm/InstrumentationBindings.cpp

@@ -0,0 +1,42 @@
+//===- InstrumentationBindings.cpp - instrumentation bindings -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines C bindings for the instrumentation component.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstrumentationBindings.h"
+
+#include "llvm-c/Core.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Transforms/Instrumentation.h"
+
+using namespace llvm;
+
+// Adds the pass returned by createAddressSanitizerFunctionPass() to the pass
+// manager behind PM.
+void LLVMAddAddressSanitizerFunctionPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createAddressSanitizerFunctionPass());
+}
+
+// Adds the pass returned by createAddressSanitizerModulePass() to the pass
+// manager behind PM.
+void LLVMAddAddressSanitizerModulePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createAddressSanitizerModulePass());
+}
+
+// Adds the pass returned by createThreadSanitizerPass() to the pass manager
+// behind PM.
+void LLVMAddThreadSanitizerPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createThreadSanitizerPass());
+}
+
+// Adds the pass returned by createMemorySanitizerPass() to the pass manager
+// behind PM.
+void LLVMAddMemorySanitizerPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createMemorySanitizerPass());
+}
+
+// Adds the pass returned by createDataFlowSanitizerPass(ABIListFile) to the
+// pass manager behind PM. ABIListFile is forwarded unchanged.
+void LLVMAddDataFlowSanitizerPass(LLVMPassManagerRef PM,
+                                  const char *ABIListFile) {
+  unwrap(PM)->add(createDataFlowSanitizerPass(ABIListFile));
+}

diff --git a/bindings/go/llvm/InstrumentationBindings.h b/bindings/go/llvm/InstrumentationBindings.h
new file mode 100644
index 0000000..e8dbd59
--- /dev/null
+++ b/bindings/go/llvm/InstrumentationBindings.h

@@ -0,0 +1,38 @@
+//===- InstrumentationBindings.h - instrumentation bindings -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines C bindings for the Transforms/Instrumentation component.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BINDINGS_GO_LLVM_INSTRUMENTATIONBINDINGS_H
+#define LLVM_BINDINGS_GO_LLVM_INSTRUMENTATIONBINDINGS_H
+
+#include "llvm-c/Core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// FIXME: These bindings shouldn't be Go-specific and should eventually move to
+// a (somewhat) less stable collection of C APIs for use in creating bindings of
+// LLVM in other languages.
+
+void LLVMAddAddressSanitizerFunctionPass(LLVMPassManagerRef PM);
+void LLVMAddAddressSanitizerModulePass(LLVMPassManagerRef PM);
+void LLVMAddThreadSanitizerPass(LLVMPassManagerRef PM);
+void LLVMAddMemorySanitizerPass(LLVMPassManagerRef PM);
+void LLVMAddDataFlowSanitizerPass(LLVMPassManagerRef PM,
+                                  const char *ABIListFile);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

diff --git a/bindings/go/llvm/SupportBindings.cpp b/bindings/go/llvm/SupportBindings.cpp
new file mode 100644
index 0000000..df5f865
--- /dev/null
+++ b/bindings/go/llvm/SupportBindings.cpp

@@ -0,0 +1,27 @@
+//===- SupportBindings.cpp - Additional bindings for support --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines additional C bindings for the support component.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SupportBindings.h"
+
+#include "llvm/Support/DynamicLibrary.h"
+#include <stdlib.h>
+#include <string.h>
+
+// Loads the dynamic library named by Filename through
+// llvm::sys::DynamicLibrary::LoadLibraryPermanently.
+//
+// When that call returns true, *ErrMsg receives a malloc'd, NUL-terminated
+// copy of the error text it produced, or null if the copy could not be
+// allocated. When it returns false, *ErrMsg is left untouched.
+void LLVMLoadLibraryPermanently2(const char *Filename, char **ErrMsg) {
+  std::string ErrMsgStr;
+  if (!llvm::sys::DynamicLibrary::LoadLibraryPermanently(Filename, &ErrMsgStr))
+    return;
+
+  const size_t Size = ErrMsgStr.size() + 1; // include the terminating NUL
+  char *Copy = static_cast<char *>(malloc(Size));
+  // malloc can fail; never pass a null destination to memcpy.
+  if (Copy)
+    memcpy(Copy, ErrMsgStr.c_str(), Size);
+  *ErrMsg = Copy;
+}

diff --git a/bindings/go/llvm/SupportBindings.h b/bindings/go/llvm/SupportBindings.h
new file mode 100644
index 0000000..efcd667
--- /dev/null
+++ b/bindings/go/llvm/SupportBindings.h

@@ -0,0 +1,30 @@
+//===- SupportBindings.h - Additional bindings for Support ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines additional C bindings for the Support component.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BINDINGS_GO_LLVM_SUPPORTBINDINGS_H
+#define LLVM_BINDINGS_GO_LLVM_SUPPORTBINDINGS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This function duplicates the LLVMLoadLibraryPermanently function in the
+// stable C API and adds an extra ErrMsg parameter to retrieve the error
+// message.
+void LLVMLoadLibraryPermanently2(const char *Filename, char **ErrMsg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

diff --git a/bindings/go/llvm/analysis.go b/bindings/go/llvm/analysis.go
new file mode 100644
index 0000000..7b0d8e3
--- /dev/null
+++ b/bindings/go/llvm/analysis.go

@@ -0,0 +1,68 @@
+//===- analysis.go - Bindings for analysis --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the analysis component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/Analysis.h" // If you are getting an error here read bindings/go/README.txt
+#include <stdlib.h>
+*/
+import "C"
+import "errors"
+
+type VerifierFailureAction C.LLVMVerifierFailureAction
+
+const (
+	// verifier will print to stderr and abort()
+	AbortProcessAction VerifierFailureAction = C.LLVMAbortProcessAction
+	// verifier will print to stderr and return 1
+	PrintMessageAction VerifierFailureAction = C.LLVMPrintMessageAction
+	// verifier will just return 1
+	ReturnStatusAction VerifierFailureAction = C.LLVMReturnStatusAction
+)
+
+// VerifyModule verifies that module m is valid, taking the specified action a
+// if not. It returns nil when LLVMVerifyModule reports no problem; otherwise
+// it returns an error holding the human-readable description of the invalid
+// constructs.
+func VerifyModule(m Module, a VerifierFailureAction) error {
+	var cmsg *C.char
+	broken := C.LLVMVerifyModule(m.C, C.LLVMVerifierFailureAction(a), &cmsg)
+	// The C API may hand back a message buffer even when verification
+	// succeeds, so release it on every path rather than only on failure.
+	defer C.LLVMDisposeMessage(cmsg)
+
+	// C++'s verifyModule means isModuleBroken, so it returns false if
+	// there are no errors
+	if broken != 0 {
+		// The message is copied into a Go string before the deferred
+		// dispose runs.
+		return errors.New(C.GoString(cmsg))
+	}
+	return nil
+}
+
+var verifyFunctionError = errors.New("Function is broken")
+
+// VerifyFunction verifies the single function f, taking the specified action
+// a. It returns verifyFunctionError when the function is reported broken and
+// nil otherwise. Useful for debugging.
+func VerifyFunction(f Value, a VerifierFailureAction) error {
+	action := C.LLVMVerifierFailureAction(a)
+	// LLVMVerifyFunction answers "is the function broken?", so zero
+	// means there are no errors.
+	if C.LLVMVerifyFunction(f.C, action) == 0 {
+		return nil
+	}
+	return verifyFunctionError
+}
+
+// ViewFunctionCFG and ViewFunctionCFGOnly forward f to the C API functions
+// LLVMViewFunctionCFG and LLVMViewFunctionCFGOnly respectively.
+// They open up a ghostview window that displays the CFG of the given
+// function. Useful for debugging.
+func ViewFunctionCFG(f Value)     { C.LLVMViewFunctionCFG(f.C) }
+func ViewFunctionCFGOnly(f Value) { C.LLVMViewFunctionCFGOnly(f.C) }

diff --git a/bindings/go/llvm/bitreader.go b/bindings/go/llvm/bitreader.go
new file mode 100644
index 0000000..98112a9
--- /dev/null
+++ b/bindings/go/llvm/bitreader.go

@@ -0,0 +1,50 @@
+//===- bitreader.go - Bindings for bitreader ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the bitreader component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/BitReader.h"
+#include <stdlib.h>
+*/
+import "C"
+
+import (
+	"errors"
+	"unsafe"
+)
+
+// ParseBitcodeFile parses the LLVM IR (bitcode) in the file with the
+// specified name, and returns a new LLVM module.
+//
+// If the file cannot be read into a memory buffer, or the buffer cannot be
+// parsed, it returns a zero Module and an error carrying LLVM's message.
+func ParseBitcodeFile(name string) (Module, error) {
+	cfilename := C.CString(name)
+	defer C.free(unsafe.Pointer(cfilename))
+
+	// fail converts an LLVM-allocated message into a Go error and releases
+	// the message with LLVMDisposeMessage, the C API's deallocator for it.
+	fail := func(msg *C.char) (Module, error) {
+		err := errors.New(C.GoString(msg))
+		C.LLVMDisposeMessage(msg)
+		return Module{}, err
+	}
+
+	var buf C.LLVMMemoryBufferRef
+	var errmsg *C.char
+	if C.LLVMCreateMemoryBufferWithContentsOfFile(cfilename, &buf, &errmsg) != 0 {
+		return fail(errmsg)
+	}
+	// The buffer is disposed once parsing has finished, whatever the outcome.
+	defer C.LLVMDisposeMemoryBuffer(buf)
+
+	var m Module
+	if C.LLVMParseBitcode(buf, &m.C, &errmsg) != 0 {
+		return fail(errmsg)
+	}
+	return m, nil
+}

diff --git a/bindings/go/llvm/bitwriter.go b/bindings/go/llvm/bitwriter.go
new file mode 100644
index 0000000..e03699c
--- /dev/null
+++ b/bindings/go/llvm/bitwriter.go

@@ -0,0 +1,39 @@
+//===- bitwriter.go - Bindings for bitwriter ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the bitwriter component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/BitWriter.h"
+#include <stdlib.h>
+*/
+import "C"
+import "os"
+import "errors"
+
+var writeBitcodeToFileErr = errors.New("Failed to write bitcode to file")
+
+// WriteBitcodeToFile writes module m as bitcode to the descriptor of file via
+// LLVMWriteBitcodeToFD. It returns writeBitcodeToFileErr when the C call
+// reports a non-zero status.
+func WriteBitcodeToFile(m Module, file *os.File) error {
+	fd := C.int(file.Fd())
+	// Both trailing arguments of the C call are passed as 0.
+	if C.LLVMWriteBitcodeToFD(m.C, fd, C.int(0), C.int(0)) == 0 {
+		return nil
+	}
+	return writeBitcodeToFileErr
+}
+
+// WriteBitcodeToMemoryBuffer returns the MemoryBuffer produced by
+// LLVMWriteBitcodeToMemoryBuffer for module m.
+func WriteBitcodeToMemoryBuffer(m Module) MemoryBuffer {
+	return MemoryBuffer{C.LLVMWriteBitcodeToMemoryBuffer(m.C)}
+}
+
+// TODO(nsf): Figure out way how to make it work with io.Writer

diff --git a/bindings/go/llvm/dibuilder.go b/bindings/go/llvm/dibuilder.go
new file mode 100644
index 0000000..1d07e98
--- /dev/null
+++ b/bindings/go/llvm/dibuilder.go

@@ -0,0 +1,492 @@
+//===- dibuilder.go - Bindings for DIBuilder ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the DIBuilder class.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "DIBuilderBindings.h"
+#include <stdlib.h>
+*/
+import "C"
+
+import (
+	"debug/dwarf"
+	"unsafe"
+)
+
+type DwarfTag uint32
+
+const (
+	DW_TAG_lexical_block   DwarfTag = 0x0b
+	DW_TAG_compile_unit    DwarfTag = 0x11
+	DW_TAG_variable        DwarfTag = 0x34
+	DW_TAG_base_type       DwarfTag = 0x24
+	DW_TAG_pointer_type    DwarfTag = 0x0F
+	DW_TAG_structure_type  DwarfTag = 0x13
+	DW_TAG_subroutine_type DwarfTag = 0x15
+	DW_TAG_file_type       DwarfTag = 0x29
+	DW_TAG_subprogram      DwarfTag = 0x2E
+	DW_TAG_auto_variable   DwarfTag = 0x100
+	DW_TAG_arg_variable    DwarfTag = 0x101
+)
+
+const (
+	FlagPrivate = 1 << iota
+	FlagProtected
+	FlagFwdDecl
+	FlagAppleBlock
+	FlagBlockByrefStruct
+	FlagVirtual
+	FlagArtificial
+	FlagExplicit
+	FlagPrototyped
+	FlagObjcClassComplete
+	FlagObjectPointer
+	FlagVector
+	FlagStaticMember
+	FlagIndirectVariable
+)
+
+type DwarfLang uint32
+
+const (
+	// http://dwarfstd.org/ShowIssue.php?issue=101014.1&type=open
+	DW_LANG_Go DwarfLang = 0x0016
+)
+
+type DwarfTypeEncoding uint32
+
+const (
+	DW_ATE_address         DwarfTypeEncoding = 0x01
+	DW_ATE_boolean         DwarfTypeEncoding = 0x02
+	DW_ATE_complex_float   DwarfTypeEncoding = 0x03
+	DW_ATE_float           DwarfTypeEncoding = 0x04
+	DW_ATE_signed          DwarfTypeEncoding = 0x05
+	DW_ATE_signed_char     DwarfTypeEncoding = 0x06
+	DW_ATE_unsigned        DwarfTypeEncoding = 0x07
+	DW_ATE_unsigned_char   DwarfTypeEncoding = 0x08
+	DW_ATE_imaginary_float DwarfTypeEncoding = 0x09
+	DW_ATE_packed_decimal  DwarfTypeEncoding = 0x0a
+	DW_ATE_numeric_string  DwarfTypeEncoding = 0x0b
+	DW_ATE_edited          DwarfTypeEncoding = 0x0c
+	DW_ATE_signed_fixed    DwarfTypeEncoding = 0x0d
+	DW_ATE_unsigned_fixed  DwarfTypeEncoding = 0x0e
+	DW_ATE_decimal_float   DwarfTypeEncoding = 0x0f
+	DW_ATE_UTF             DwarfTypeEncoding = 0x10
+	DW_ATE_lo_user         DwarfTypeEncoding = 0x80
+	DW_ATE_hi_user         DwarfTypeEncoding = 0xff
+)
+
+// DIBuilder is a wrapper for the LLVM DIBuilder class.
+type DIBuilder struct {
+	ref C.LLVMDIBuilderRef
+	m   Module
+}
+
+// NewDIBuilder creates a new DIBuilder associated with module m; the
+// returned wrapper keeps both the C-side builder reference and m.
+func NewDIBuilder(m Module) *DIBuilder {
+	return &DIBuilder{
+		ref: C.LLVMNewDIBuilder(m.C),
+		m:   m,
+	}
+}
+
+// Destroy destroys the DIBuilder by handing its underlying reference to
+// LLVMDIBuilderDestroy.
+func (d *DIBuilder) Destroy() {
+	C.LLVMDIBuilderDestroy(d.ref)
+}
+
+// Finalize finalizes the debug information generated by the DIBuilder by
+// calling LLVMDIBuilderFinalize on its underlying reference.
+func (d *DIBuilder) Finalize() {
+	C.LLVMDIBuilderFinalize(d.ref)
+}
+
+// DICompileUnit holds the values for creating compile unit debug metadata.
+type DICompileUnit struct {
+	Language       DwarfLang
+	File           string
+	Dir            string
+	Producer       string
+	Optimized      bool
+	Flags          string
+	RuntimeVersion int
+}
+
+// CreateCompileUnit creates compile unit debug metadata from cu.
+// Each string field is copied into a temporary C string that is freed when
+// the method returns.
+func (d *DIBuilder) CreateCompileUnit(cu DICompileUnit) Value {
+	cFile := C.CString(cu.File)
+	defer C.free(unsafe.Pointer(cFile))
+	cDir := C.CString(cu.Dir)
+	defer C.free(unsafe.Pointer(cDir))
+	cProducer := C.CString(cu.Producer)
+	defer C.free(unsafe.Pointer(cProducer))
+	cFlags := C.CString(cu.Flags)
+	defer C.free(unsafe.Pointer(cFlags))
+
+	language := C.unsigned(cu.Language)
+	optimized := boolToCInt(cu.Optimized)
+	runtimeVersion := C.unsigned(cu.RuntimeVersion)
+	return Value{C: C.LLVMDIBuilderCreateCompileUnit(
+		d.ref, language, cFile, cDir, cProducer, optimized, cFlags, runtimeVersion,
+	)}
+}
+
+// CreateFile creates file debug metadata for the given file name and
+// directory.
+func (d *DIBuilder) CreateFile(filename, dir string) Value {
+	cName := C.CString(filename)
+	defer C.free(unsafe.Pointer(cName))
+	cDirectory := C.CString(dir)
+	defer C.free(unsafe.Pointer(cDirectory))
+	return Value{C: C.LLVMDIBuilderCreateFile(d.ref, cName, cDirectory)}
+}
+
+// DILexicalBlock holds the values for creating lexical block debug metadata.
+type DILexicalBlock struct {
+	File   Value
+	Line   int
+	Column int
+}
+
+// CreateLexicalBlock creates lexical block debug metadata inside diScope,
+// using the file, line and column recorded in b.
+func (d *DIBuilder) CreateLexicalBlock(diScope Value, b DILexicalBlock) Value {
+	line := C.unsigned(b.Line)
+	column := C.unsigned(b.Column)
+	return Value{C: C.LLVMDIBuilderCreateLexicalBlock(d.ref, diScope.C, b.File.C, line, column)}
+}
+
+// CreateLexicalBlockFile forwards diScope, diFile and discriminator to
+// LLVMDIBuilderCreateLexicalBlockFile and wraps the returned metadata.
+func (d *DIBuilder) CreateLexicalBlockFile(diScope Value, diFile Value, discriminator int) Value {
+	disc := C.unsigned(discriminator)
+	return Value{C: C.LLVMDIBuilderCreateLexicalBlockFile(d.ref, diScope.C, diFile.C, disc)}
+}
+
+// DIFunction holds the values for creating function debug metadata.
+type DIFunction struct {
+	Name         string
+	LinkageName  string
+	File         Value
+	Line         int
+	Type         Value
+	LocalToUnit  bool
+	IsDefinition bool
+	ScopeLine    int
+	Flags        int
+	Optimized    bool
+	Function     Value
+}
+
+// CreateCompileUnit creates function debug metadata.
+func (d *DIBuilder) CreateFunction(diScope Value, f DIFunction) Value {
+	name := C.CString(f.Name)
+	defer C.free(unsafe.Pointer(name))
+	linkageName := C.CString(f.LinkageName)
+	defer C.free(unsafe.Pointer(linkageName))
+	result := C.LLVMDIBuilderCreateFunction(
+		d.ref,
+		diScope.C,
+		name,
+		linkageName,
+		f.File.C,
+		C.unsigned(f.Line),
+		f.Type.C,
+		boolToCInt(f.LocalToUnit),
+		boolToCInt(f.IsDefinition),
+		C.unsigned(f.ScopeLine),
+		C.unsigned(f.Flags),
+		boolToCInt(f.Optimized),
+		f.Function.C,
+	)
+	return Value{C: result}
+}
+
+// DILocalVariable holds the values for creating local variable debug metadata.
+type DILocalVariable struct {
+	Tag            dwarf.Tag
+	Name           string
+	File           Value
+	Line           int
+	Type           Value
+	AlwaysPreserve bool
+	Flags          int
+
+	// ArgNo is the 1-based index of the argument in the function's
+	// parameter list if it is an argument, or 0 otherwise.
+	ArgNo int
+}
+
+// CreateLocalVariable creates local variable debug metadata.
+func (d *DIBuilder) CreateLocalVariable(scope Value, v DILocalVariable) Value {
+	name := C.CString(v.Name)
+	defer C.free(unsafe.Pointer(name))
+	result := C.LLVMDIBuilderCreateLocalVariable(
+		d.ref,
+		C.unsigned(v.Tag),
+		scope.C,
+		name,
+		v.File.C,
+		C.unsigned(v.Line),
+		v.Type.C,
+		boolToCInt(v.AlwaysPreserve),
+		C.unsigned(v.Flags),
+		C.unsigned(v.ArgNo),
+	)
+	return Value{C: result}
+}
+
+// DIBasicType holds the values for creating basic type debug metadata.
+type DIBasicType struct {
+	Name        string
+	SizeInBits  uint64
+	AlignInBits uint64
+	Encoding    DwarfTypeEncoding
+}
+
+// CreateBasicType creates basic type debug metadata from the name, size,
+// alignment and encoding held in t.
+func (d *DIBuilder) CreateBasicType(t DIBasicType) Value {
+	cName := C.CString(t.Name)
+	defer C.free(unsafe.Pointer(cName))
+
+	size := C.uint64_t(t.SizeInBits)
+	align := C.uint64_t(t.AlignInBits)
+	encoding := C.unsigned(t.Encoding)
+	return Value{C: C.LLVMDIBuilderCreateBasicType(d.ref, cName, size, align, encoding)}
+}
+
+// DIPointerType holds the values for creating pointer type debug metadata.
+type DIPointerType struct {
+	Pointee     Value
+	SizeInBits  uint64
+	AlignInBits uint64 // optional
+	Name        string // optional
+}
+
+// CreatePointerType creates pointer type debug metadata for the pointee,
+// size, alignment and name held in t.
+func (d *DIBuilder) CreatePointerType(t DIPointerType) Value {
+	cName := C.CString(t.Name)
+	defer C.free(unsafe.Pointer(cName))
+
+	size := C.uint64_t(t.SizeInBits)
+	align := C.uint64_t(t.AlignInBits)
+	return Value{C: C.LLVMDIBuilderCreatePointerType(d.ref, t.Pointee.C, size, align, cName)}
+}
+
+// DISubroutineType holds the values for creating subroutine type debug metadata.
+type DISubroutineType struct {
+	// File is the file in which the subroutine type is defined.
+	File Value
+
+	// Parameters contains the subroutine parameter types,
+	// including the return type at the 0th index.
+	Parameters []Value
+}
+
+// CreateSubroutineType creates subroutine type debug metadata. The parameter
+// types in t.Parameters are first turned into a type array node.
+func (d *DIBuilder) CreateSubroutineType(t DISubroutineType) Value {
+	typeArray := d.getOrCreateTypeArray(t.Parameters)
+	return Value{C: C.LLVMDIBuilderCreateSubroutineType(d.ref, t.File.C, typeArray.C)}
+}
+
+// DIStructType holds the values for creating struct type debug metadata.
+type DIStructType struct {
+	Name        string
+	File        Value
+	Line        int
+	SizeInBits  uint64
+	AlignInBits uint64
+	Flags       int
+	DerivedFrom Value
+	Elements    []Value
+}
+
+// CreateStructType creates struct type debug metadata.
+func (d *DIBuilder) CreateStructType(scope Value, t DIStructType) Value {
+	elements := d.getOrCreateArray(t.Elements)
+	name := C.CString(t.Name)
+	defer C.free(unsafe.Pointer(name))
+	result := C.LLVMDIBuilderCreateStructType(
+		d.ref,
+		scope.C,
+		name,
+		t.File.C,
+		C.unsigned(t.Line),
+		C.uint64_t(t.SizeInBits),
+		C.uint64_t(t.AlignInBits),
+		C.unsigned(t.Flags),
+		t.DerivedFrom.C,
+		elements.C,
+	)
+	return Value{C: result}
+}
+
+// DIMemberType holds the values for creating member type debug metadata.
+type DIMemberType struct {
+	Name         string
+	File         Value
+	Line         int
+	SizeInBits   uint64
+	AlignInBits  uint64
+	OffsetInBits uint64
+	Flags        int
+	Type         Value
+}
+
+// CreateMemberType creates struct type debug metadata.
+func (d *DIBuilder) CreateMemberType(scope Value, t DIMemberType) Value {
+	name := C.CString(t.Name)
+	defer C.free(unsafe.Pointer(name))
+	result := C.LLVMDIBuilderCreateMemberType(
+		d.ref,
+		scope.C,
+		name,
+		t.File.C,
+		C.unsigned(t.Line),
+		C.uint64_t(t.SizeInBits),
+		C.uint64_t(t.AlignInBits),
+		C.uint64_t(t.OffsetInBits),
+		C.unsigned(t.Flags),
+		t.Type.C,
+	)
+	return Value{C: result}
+}
+
+// DISubrange describes an integer value range.
+type DISubrange struct {
+	Lo    int64
+	Count int64
+}
+
+// DIArrayType holds the values for creating array type debug metadata.
+type DIArrayType struct {
+	SizeInBits  uint64
+	AlignInBits uint64
+	ElementType Value
+	Subscripts  []DISubrange
+}
+
+// CreateArrayType creates array type debug metadata. Every entry of
+// t.Subscripts becomes a subrange node, and those nodes are gathered into the
+// array node passed as the subscripts argument.
+func (d *DIBuilder) CreateArrayType(t DIArrayType) Value {
+	ranges := make([]Value, 0, len(t.Subscripts))
+	for _, sub := range t.Subscripts {
+		ranges = append(ranges, d.getOrCreateSubrange(sub.Lo, sub.Count))
+	}
+	subscripts := d.getOrCreateArray(ranges)
+
+	size := C.uint64_t(t.SizeInBits)
+	align := C.uint64_t(t.AlignInBits)
+	return Value{C: C.LLVMDIBuilderCreateArrayType(d.ref, size, align, t.ElementType.C, subscripts.C)}
+}
+
+// DITypedef holds the values for creating typedef type debug metadata.
+type DITypedef struct {
+	Type    Value
+	Name    string
+	File    Value
+	Line    int
+	Context Value
+}
+
+// CreateTypedef creates typedef type debug metadata from the type, name,
+// file, line and context held in t.
+func (d *DIBuilder) CreateTypedef(t DITypedef) Value {
+	cName := C.CString(t.Name)
+	defer C.free(unsafe.Pointer(cName))
+
+	line := C.unsigned(t.Line)
+	return Value{C: C.LLVMDIBuilderCreateTypedef(d.ref, t.Type.C, cName, t.File.C, line, t.Context.C)}
+}
+
+// getOrCreateSubrange gets a metadata node for the subrange described by lo
+// and count, creating it if required.
+func (d *DIBuilder) getOrCreateSubrange(lo, count int64) Value {
+	cLo := C.int64_t(lo)
+	cCount := C.int64_t(count)
+	return Value{C: C.LLVMDIBuilderGetOrCreateSubrange(d.ref, cLo, cCount)}
+}
+
+// getOrCreateArray gets a metadata node containing the specified values,
+// creating if required.
+func (d *DIBuilder) getOrCreateArray(values []Value) Value {
+	if len(values) == 0 {
+		return Value{}
+	}
+	var data *C.LLVMValueRef
+	length := len(values)
+	if length > 0 {
+		data = &values[0].C
+	}
+	result := C.LLVMDIBuilderGetOrCreateArray(d.ref, data, C.size_t(length))
+	return Value{C: result}
+}
+
+// getOrCreateTypeArray gets a metadata node for a type array containing the
+// specified values, creating if required.
+func (d *DIBuilder) getOrCreateTypeArray(values []Value) Value {
+	if len(values) == 0 {
+		return Value{}
+	}
+	var data *C.LLVMValueRef
+	length := len(values)
+	if length > 0 {
+		data = &values[0].C
+	}
+	result := C.LLVMDIBuilderGetOrCreateTypeArray(d.ref, data, C.size_t(length))
+	return Value{C: result}
+}
+
+// CreateExpression creates a new expression descriptor from the int64
+// elements of addr, which describe a complex address expression.
+func (d *DIBuilder) CreateExpression(addr []int64) Value {
+	n := len(addr)
+	var first *C.int64_t // stays nil when addr is empty
+	if n != 0 {
+		first = (*C.int64_t)(unsafe.Pointer(&addr[0]))
+	}
+	return Value{C: C.LLVMDIBuilderCreateExpression(d.ref, first, C.size_t(n))}
+}
+
+// InsertDeclareAtEnd inserts a call to llvm.dbg.declare at the end of basic
+// block bb for value v, described by the variable metadata diVarInfo and the
+// expression metadata expr.
+func (d *DIBuilder) InsertDeclareAtEnd(v, diVarInfo, expr Value, bb BasicBlock) Value {
+	storage, variable, address := v.C, diVarInfo.C, expr.C
+	return Value{C: C.LLVMDIBuilderInsertDeclareAtEnd(d.ref, storage, variable, address, bb.C)}
+}
+
+func boolToCInt(v bool) C.int {
+	if v {
+		return 1
+	}
+	return 0
+}

diff --git a/bindings/go/llvm/executionengine.go b/bindings/go/llvm/executionengine.go
new file mode 100644
index 0000000..26b7524
--- /dev/null
+++ b/bindings/go/llvm/executionengine.go

@@ -0,0 +1,163 @@
+//===- executionengine.go - Bindings for executionengine ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the executionengine component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/ExecutionEngine.h"
+#include <stdlib.h>
+*/
+import "C"
+import "unsafe"
+import "errors"
+
+// LinkInMCJIT calls the C function LLVMLinkInMCJIT.
+func LinkInMCJIT()       { C.LLVMLinkInMCJIT() }
+// LinkInInterpreter calls the C function LLVMLinkInInterpreter.
+func LinkInInterpreter() { C.LLVMLinkInInterpreter() }
+
+// GenericValue wraps an LLVMGenericValueRef handle.
+type GenericValue struct {
+	C C.LLVMGenericValueRef
+}
+// ExecutionEngine wraps an LLVMExecutionEngineRef handle.
+type ExecutionEngine struct {
+	C C.LLVMExecutionEngineRef
+}
+// MCJITCompilerOptions holds the settings that NewMCJITCompiler copies into
+// the C LLVMMCJITCompilerOptions struct before creating the engine.
+type MCJITCompilerOptions struct {
+	OptLevel           uint
+	CodeModel          CodeModel
+	NoFramePointerElim bool
+	EnableFastISel     bool
+}
+
+// helpers
+
+// llvmGenericValueRefPtr reinterprets a pointer to a GenericValue wrapper as
+// a pointer to the C handle it contains. This relies on GenericValue having
+// the handle as its only field. A nil input yields a nil output.
+func llvmGenericValueRefPtr(t *GenericValue) *C.LLVMGenericValueRef {
+	return (*C.LLVMGenericValueRef)(unsafe.Pointer(t))
+}
+
+//-------------------------------------------------------------------------
+// llvm.GenericValue
+//-------------------------------------------------------------------------
+
+func NewGenericValueFromInt(t Type, n uint64, signed bool) (g GenericValue) {
+	g.C = C.LLVMCreateGenericValueOfInt(t.C, C.ulonglong(n), boolToLLVMBool(signed))
+	return
+}
+func NewGenericValueFromPointer(p unsafe.Pointer) (g GenericValue) {
+	g.C = C.LLVMCreateGenericValueOfPointer(p)
+	return
+}
+func NewGenericValueFromFloat(t Type, n float64) (g GenericValue) {
+	g.C = C.LLVMCreateGenericValueOfFloat(t.C, C.double(n))
+	return
+}
+func (g GenericValue) IntWidth() int { return int(C.LLVMGenericValueIntWidth(g.C)) }
+func (g GenericValue) Int(signed bool) uint64 {
+	return uint64(C.LLVMGenericValueToInt(g.C, boolToLLVMBool(signed)))
+}
+func (g GenericValue) Float(t Type) float64 {
+	return float64(C.LLVMGenericValueToFloat(t.C, g.C))
+}
+func (g GenericValue) Pointer() unsafe.Pointer {
+	return C.LLVMGenericValueToPointer(g.C)
+}
+func (g GenericValue) Dispose() { C.LLVMDisposeGenericValue(g.C) }
+
+//-------------------------------------------------------------------------
+// llvm.ExecutionEngine
+//-------------------------------------------------------------------------
+
+func NewExecutionEngine(m Module) (ee ExecutionEngine, err error) {
+	var cmsg *C.char
+	fail := C.LLVMCreateExecutionEngineForModule(&ee.C, m.C, &cmsg)
+	if fail != 0 {
+		ee.C = nil
+		err = errors.New(C.GoString(cmsg))
+		C.LLVMDisposeMessage(cmsg)
+	}
+	return
+}
+
+func NewInterpreter(m Module) (ee ExecutionEngine, err error) {
+	var cmsg *C.char
+	fail := C.LLVMCreateInterpreterForModule(&ee.C, m.C, &cmsg)
+	if fail != 0 {
+		ee.C = nil
+		err = errors.New(C.GoString(cmsg))
+		C.LLVMDisposeMessage(cmsg)
+	}
+	return
+}
+
+func NewMCJITCompiler(m Module, options MCJITCompilerOptions) (ee ExecutionEngine, err error) {
+	var cmsg *C.char
+	copts := C.struct_LLVMMCJITCompilerOptions{
+		OptLevel:           C.unsigned(options.OptLevel),
+		CodeModel:          C.LLVMCodeModel(options.CodeModel),
+		NoFramePointerElim: boolToLLVMBool(options.NoFramePointerElim),
+		EnableFastISel:     boolToLLVMBool(options.EnableFastISel),
+	}
+	fail := C.LLVMCreateMCJITCompilerForModule(&ee.C, m.C, &copts, C.size_t(unsafe.Sizeof(copts)), &cmsg)
+	if fail != 0 {
+		ee.C = nil
+		err = errors.New(C.GoString(cmsg))
+		C.LLVMDisposeMessage(cmsg)
+	}
+	return
+}
+
+// Dispose passes the engine handle to LLVMDisposeExecutionEngine.
+func (ee ExecutionEngine) Dispose()               { C.LLVMDisposeExecutionEngine(ee.C) }
+// RunStaticConstructors calls LLVMRunStaticConstructors on the engine.
+func (ee ExecutionEngine) RunStaticConstructors() { C.LLVMRunStaticConstructors(ee.C) }
+// RunStaticDestructors calls LLVMRunStaticDestructors on the engine.
+func (ee ExecutionEngine) RunStaticDestructors()  { C.LLVMRunStaticDestructors(ee.C) }
+
+func (ee ExecutionEngine) RunFunction(f Value, args []GenericValue) (g GenericValue) {
+	nargs := len(args)
+	var argptr *GenericValue
+	if nargs > 0 {
+		argptr = &args[0]
+	}
+	g.C = C.LLVMRunFunction(ee.C, f.C,
+		C.unsigned(nargs), llvmGenericValueRefPtr(argptr))
+	return
+}
+
+// FreeMachineCodeForFunction calls LLVMFreeMachineCodeForFunction for f.
+func (ee ExecutionEngine) FreeMachineCodeForFunction(f Value) {
+	C.LLVMFreeMachineCodeForFunction(ee.C, f.C)
+}
+// AddModule calls LLVMAddModule to add m to the engine.
+func (ee ExecutionEngine) AddModule(m Module) { C.LLVMAddModule(ee.C, m.C) }
+
+// RemoveModule calls LLVMRemoveModule for m. The out-module written by the C
+// call is discarded, no error string is requested (nil is passed), and the
+// call's return value is ignored.
+func (ee ExecutionEngine) RemoveModule(m Module) {
+	var modtmp C.LLVMModuleRef
+	C.LLVMRemoveModule(ee.C, m.C, &modtmp, nil)
+}
+
+func (ee ExecutionEngine) FindFunction(name string) (f Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	C.LLVMFindFunction(ee.C, cname, &f.C)
+	return
+}
+
+func (ee ExecutionEngine) RecompileAndRelinkFunction(f Value) unsafe.Pointer {
+	return C.LLVMRecompileAndRelinkFunction(ee.C, f.C)
+}
+
+func (ee ExecutionEngine) TargetData() (td TargetData) {
+	td.C = C.LLVMGetExecutionEngineTargetData(ee.C)
+	return
+}
+
+func (ee ExecutionEngine) AddGlobalMapping(global Value, addr unsafe.Pointer) {
+	C.LLVMAddGlobalMapping(ee.C, global.C, addr)
+}
+
+func (ee ExecutionEngine) PointerToGlobal(global Value) unsafe.Pointer {
+	return C.LLVMGetPointerToGlobal(ee.C, global.C)
+}

diff --git a/bindings/go/llvm/executionengine_test.go b/bindings/go/llvm/executionengine_test.go
new file mode 100644
index 0000000..1a3fd45
--- /dev/null
+++ b/bindings/go/llvm/executionengine_test.go

@@ -0,0 +1,93 @@
+//===- executionengine_test.go - Tests for executionengine ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file tests bindings for the executionengine component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+import (
+	"testing"
+)
+
+// TestFactorial builds a recursive factorial function in IR, creates an
+// MCJIT engine for the module, runs a small pass pipeline over it and checks
+// that fac(10) evaluates to 10!.
+func TestFactorial(t *testing.T) {
+	LinkInMCJIT()
+	InitializeNativeTarget()
+	InitializeNativeAsmPrinter()
+
+	mod := NewModule("fac_module")
+
+	// i32 fac(i32 n)
+	fac_args := []Type{Int32Type()}
+	fac_type := FunctionType(Int32Type(), fac_args, false)
+	fac := AddFunction(mod, "fac", fac_type)
+	fac.SetFunctionCallConv(CCallConv)
+	n := fac.Param(0)
+
+	entry := AddBasicBlock(fac, "entry")
+	iftrue := AddBasicBlock(fac, "iftrue")
+	iffalse := AddBasicBlock(fac, "iffalse")
+	end := AddBasicBlock(fac, "end")
+
+	builder := NewBuilder()
+	defer builder.Dispose()
+
+	// entry: branch on n == 0.
+	builder.SetInsertPointAtEnd(entry)
+	If := builder.CreateICmp(IntEQ, n, ConstInt(Int32Type(), 0, false), "cmptmp")
+	builder.CreateCondBr(If, iftrue, iffalse)
+
+	// iftrue: base case, result is the constant 1.
+	builder.SetInsertPointAtEnd(iftrue)
+	res_iftrue := ConstInt(Int32Type(), 1, false)
+	builder.CreateBr(end)
+
+	// iffalse: recursive case, result is n * fac(n - 1).
+	builder.SetInsertPointAtEnd(iffalse)
+	n_minus := builder.CreateSub(n, ConstInt(Int32Type(), 1, false), "subtmp")
+	call_fac_args := []Value{n_minus}
+	call_fac := builder.CreateCall(fac, call_fac_args, "calltmp")
+	res_iffalse := builder.CreateMul(n, call_fac, "multmp")
+	builder.CreateBr(end)
+
+	// end: merge both results with a phi node and return it.
+	builder.SetInsertPointAtEnd(end)
+	res := builder.CreatePHI(Int32Type(), "result")
+	phi_vals := []Value{res_iftrue, res_iffalse}
+	phi_blocks := []BasicBlock{iftrue, iffalse}
+	res.AddIncoming(phi_vals, phi_blocks)
+	builder.CreateRet(res)
+
+	err := VerifyModule(mod, ReturnStatusAction)
+	if err != nil {
+		t.Errorf("Error verifying module: %s", err)
+		return
+	}
+
+	engine, err := NewMCJITCompiler(mod, MCJITCompilerOptions{OptLevel: 2})
+	if err != nil {
+		t.Errorf("Error creating JIT: %s", err)
+		return
+	}
+	defer engine.Dispose()
+
+	pass := NewPassManager()
+	defer pass.Dispose()
+
+	pass.Add(engine.TargetData())
+	pass.AddConstantPropagationPass()
+	pass.AddInstructionCombiningPass()
+	pass.AddPromoteMemoryToRegisterPass()
+	pass.AddGVNPass()
+	pass.AddCFGSimplificationPass()
+	pass.Run(mod)
+
+	// Generic values are separately allocated handles; release both the
+	// argument and the result so the test does not leak them. Deferred calls
+	// run in reverse order, so these run before the engine is disposed.
+	arg := NewGenericValueFromInt(Int32Type(), 10, false)
+	defer arg.Dispose()
+	exec_args := []GenericValue{arg}
+	exec_res := engine.RunFunction(fac, exec_args)
+	defer exec_res.Dispose()
+
+	var fac10 uint64 = 10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1
+	if exec_res.Int(false) != fac10 {
+		t.Errorf("Expected %d, got %d", fac10, exec_res.Int(false))
+	}
+}

diff --git a/bindings/go/llvm/ir.go b/bindings/go/llvm/ir.go
new file mode 100644
index 0000000..7834f5c
--- /dev/null
+++ b/bindings/go/llvm/ir.go

@@ -0,0 +1,1824 @@
+//===- ir.go - Bindings for ir --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the ir component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/Core.h"
+#include "IRBindings.h"
+#include <stdlib.h>
+*/
+import "C"
+import "unsafe"
+import "errors"
+
+// Wrapper types for the LLVM C handles, plus Go-named aliases for the C
+// enumeration types used throughout the bindings.
+type (
+	// We use these weird structs here because *Ref types are pointers and
+	// Go's spec says that a pointer cannot be used as a receiver base type.
+	// Each struct holds exactly one field, the C handle; the pointer-cast
+	// helpers in this file rely on that.
+	Context struct {
+		C C.LLVMContextRef
+	}
+	Module struct {
+		C C.LLVMModuleRef
+	}
+	Type struct {
+		C C.LLVMTypeRef
+	}
+	Value struct {
+		C C.LLVMValueRef
+	}
+	BasicBlock struct {
+		C C.LLVMBasicBlockRef
+	}
+	Builder struct {
+		C C.LLVMBuilderRef
+	}
+	ModuleProvider struct {
+		C C.LLVMModuleProviderRef
+	}
+	MemoryBuffer struct {
+		C C.LLVMMemoryBufferRef
+	}
+	PassManager struct {
+		C C.LLVMPassManagerRef
+	}
+	Use struct {
+		C C.LLVMUseRef
+	}
+	// Attribute is a plain uint64 rather than a C enum type; its constants
+	// reach bit positions above 31.
+	Attribute        uint64
+	Opcode           C.LLVMOpcode
+	TypeKind         C.LLVMTypeKind
+	Linkage          C.LLVMLinkage
+	Visibility       C.LLVMVisibility
+	CallConv         C.LLVMCallConv
+	IntPredicate     C.LLVMIntPredicate
+	FloatPredicate   C.LLVMRealPredicate
+	LandingPadClause C.LLVMLandingPadClauseTy
+)
+
+// IsNil reports whether the wrapped C handle is nil. The same check is
+// provided for every handle wrapper type.
+func (c Context) IsNil() bool        { return c.C == nil }
+func (c Module) IsNil() bool         { return c.C == nil }
+func (c Type) IsNil() bool           { return c.C == nil }
+func (c Value) IsNil() bool          { return c.C == nil }
+func (c BasicBlock) IsNil() bool     { return c.C == nil }
+func (c Builder) IsNil() bool        { return c.C == nil }
+func (c ModuleProvider) IsNil() bool { return c.C == nil }
+func (c MemoryBuffer) IsNil() bool   { return c.C == nil }
+func (c PassManager) IsNil() bool    { return c.C == nil }
+func (c Use) IsNil() bool            { return c.C == nil }
+
+// helpers
+
+// The three casts below reinterpret a pointer to a Go wrapper struct as a
+// pointer to the C handle it contains. They depend on each wrapper having
+// the handle as its only field. A nil input yields a nil output.
+func llvmTypeRefPtr(t *Type) *C.LLVMTypeRef    { return (*C.LLVMTypeRef)(unsafe.Pointer(t)) }
+func llvmValueRefPtr(t *Value) *C.LLVMValueRef { return (*C.LLVMValueRef)(unsafe.Pointer(t)) }
+func llvmBasicBlockRefPtr(t *BasicBlock) *C.LLVMBasicBlockRef {
+	return (*C.LLVMBasicBlockRef)(unsafe.Pointer(t))
+}
+func boolToLLVMBool(b bool) C.LLVMBool {
+	if b {
+		return C.LLVMBool(1)
+	}
+	return C.LLVMBool(0)
+}
+
+func llvmValueRefs(values []Value) (*C.LLVMValueRef, C.unsigned) {
+	var pt *C.LLVMValueRef
+	ptlen := C.unsigned(len(values))
+	if ptlen > 0 {
+		pt = llvmValueRefPtr(&values[0])
+	}
+	return pt, ptlen
+}
+
+//-------------------------------------------------------------------------
+// llvm.Attribute
+//-------------------------------------------------------------------------
+
+// Attribute values. The first group is taken from C enumerators; the second
+// group is written out as literal bit positions.
+const (
+	NoneAttribute               Attribute = 0
+	ZExtAttribute               Attribute = C.LLVMZExtAttribute
+	SExtAttribute               Attribute = C.LLVMSExtAttribute
+	NoReturnAttribute           Attribute = C.LLVMNoReturnAttribute
+	InRegAttribute              Attribute = C.LLVMInRegAttribute
+	StructRetAttribute          Attribute = C.LLVMStructRetAttribute
+	NoUnwindAttribute           Attribute = C.LLVMNoUnwindAttribute
+	NoAliasAttribute            Attribute = C.LLVMNoAliasAttribute
+	ByValAttribute              Attribute = C.LLVMByValAttribute
+	NestAttribute               Attribute = C.LLVMNestAttribute
+	ReadNoneAttribute           Attribute = C.LLVMReadNoneAttribute
+	ReadOnlyAttribute           Attribute = C.LLVMReadOnlyAttribute
+	NoInlineAttribute           Attribute = C.LLVMNoInlineAttribute
+	AlwaysInlineAttribute       Attribute = C.LLVMAlwaysInlineAttribute
+	OptimizeForSizeAttribute    Attribute = C.LLVMOptimizeForSizeAttribute
+	StackProtectAttribute       Attribute = C.LLVMStackProtectAttribute
+	StackProtectReqAttribute    Attribute = C.LLVMStackProtectReqAttribute
+	Alignment                   Attribute = C.LLVMAlignment
+	NoCaptureAttribute          Attribute = C.LLVMNoCaptureAttribute
+	NoRedZoneAttribute          Attribute = C.LLVMNoRedZoneAttribute
+	NoImplicitFloatAttribute    Attribute = C.LLVMNoImplicitFloatAttribute
+	NakedAttribute              Attribute = C.LLVMNakedAttribute
+	InlineHintAttribute         Attribute = C.LLVMInlineHintAttribute
+	StackAlignment              Attribute = C.LLVMStackAlignment
+	ReturnsTwiceAttribute       Attribute = C.LLVMReturnsTwice
+	UWTableAttribute            Attribute = C.LLVMUWTable
+	// From here on the values are literal single-bit masks (bits 31 to 45)
+	// rather than references to C enumerators.
+	NonLazyBindAttribute        Attribute = 1 << 31
+	SanitizeAddressAttribute    Attribute = 1 << 32
+	MinSizeAttribute            Attribute = 1 << 33
+	NoDuplicateAttribute        Attribute = 1 << 34
+	StackProtectStrongAttribute Attribute = 1 << 35
+	SanitizeThreadAttribute     Attribute = 1 << 36
+	SanitizeMemoryAttribute     Attribute = 1 << 37
+	NoBuiltinAttribute          Attribute = 1 << 38
+	ReturnedAttribute           Attribute = 1 << 39
+	ColdAttribute               Attribute = 1 << 40
+	BuiltinAttribute            Attribute = 1 << 41
+	OptimizeNoneAttribute       Attribute = 1 << 42
+	InAllocaAttribute           Attribute = 1 << 43
+	NonNullAttribute            Attribute = 1 << 44
+	JumpTableAttribute          Attribute = 1 << 45
+)
+
+//-------------------------------------------------------------------------
+// llvm.Opcode
+//-------------------------------------------------------------------------
+
+// Opcode values, each taken from the C enumerator of the same name with the
+// LLVM prefix. The first group (Ret through Unreachable) carries no heading
+// in the original; the remaining groups are labelled below.
+const (
+	Ret         Opcode = C.LLVMRet
+	Br          Opcode = C.LLVMBr
+	Switch      Opcode = C.LLVMSwitch
+	IndirectBr  Opcode = C.LLVMIndirectBr
+	Invoke      Opcode = C.LLVMInvoke
+	Unreachable Opcode = C.LLVMUnreachable
+
+	// Standard Binary Operators
+	Add  Opcode = C.LLVMAdd
+	FAdd Opcode = C.LLVMFAdd
+	Sub  Opcode = C.LLVMSub
+	FSub Opcode = C.LLVMFSub
+	Mul  Opcode = C.LLVMMul
+	FMul Opcode = C.LLVMFMul
+	UDiv Opcode = C.LLVMUDiv
+	SDiv Opcode = C.LLVMSDiv
+	FDiv Opcode = C.LLVMFDiv
+	URem Opcode = C.LLVMURem
+	SRem Opcode = C.LLVMSRem
+	FRem Opcode = C.LLVMFRem
+
+	// Logical Operators
+	Shl  Opcode = C.LLVMShl
+	LShr Opcode = C.LLVMLShr
+	AShr Opcode = C.LLVMAShr
+	And  Opcode = C.LLVMAnd
+	Or   Opcode = C.LLVMOr
+	Xor  Opcode = C.LLVMXor
+
+	// Memory Operators
+	Alloca        Opcode = C.LLVMAlloca
+	Load          Opcode = C.LLVMLoad
+	Store         Opcode = C.LLVMStore
+	GetElementPtr Opcode = C.LLVMGetElementPtr
+
+	// Cast Operators
+	Trunc    Opcode = C.LLVMTrunc
+	ZExt     Opcode = C.LLVMZExt
+	SExt     Opcode = C.LLVMSExt
+	FPToUI   Opcode = C.LLVMFPToUI
+	FPToSI   Opcode = C.LLVMFPToSI
+	UIToFP   Opcode = C.LLVMUIToFP
+	SIToFP   Opcode = C.LLVMSIToFP
+	FPTrunc  Opcode = C.LLVMFPTrunc
+	FPExt    Opcode = C.LLVMFPExt
+	PtrToInt Opcode = C.LLVMPtrToInt
+	IntToPtr Opcode = C.LLVMIntToPtr
+	BitCast  Opcode = C.LLVMBitCast
+
+	// Other Operators
+	ICmp   Opcode = C.LLVMICmp
+	FCmp   Opcode = C.LLVMFCmp
+	PHI    Opcode = C.LLVMPHI
+	Call   Opcode = C.LLVMCall
+	Select Opcode = C.LLVMSelect
+	// UserOp1 and UserOp2 are named here but no Go constants are defined
+	// for them.
+	// UserOp1
+	// UserOp2
+	VAArg          Opcode = C.LLVMVAArg
+	ExtractElement Opcode = C.LLVMExtractElement
+	InsertElement  Opcode = C.LLVMInsertElement
+	ShuffleVector  Opcode = C.LLVMShuffleVector
+	ExtractValue   Opcode = C.LLVMExtractValue
+	InsertValue    Opcode = C.LLVMInsertValue
+)
+
+//-------------------------------------------------------------------------
+// llvm.TypeKind
+//-------------------------------------------------------------------------
+
+// TypeKind values, each taken from the C enumerator of the same name with
+// the LLVM prefix. Type.TypeKind returns one of these.
+const (
+	VoidTypeKind      TypeKind = C.LLVMVoidTypeKind
+	FloatTypeKind     TypeKind = C.LLVMFloatTypeKind
+	DoubleTypeKind    TypeKind = C.LLVMDoubleTypeKind
+	X86_FP80TypeKind  TypeKind = C.LLVMX86_FP80TypeKind
+	FP128TypeKind     TypeKind = C.LLVMFP128TypeKind
+	PPC_FP128TypeKind TypeKind = C.LLVMPPC_FP128TypeKind
+	LabelTypeKind     TypeKind = C.LLVMLabelTypeKind
+	IntegerTypeKind   TypeKind = C.LLVMIntegerTypeKind
+	FunctionTypeKind  TypeKind = C.LLVMFunctionTypeKind
+	StructTypeKind    TypeKind = C.LLVMStructTypeKind
+	ArrayTypeKind     TypeKind = C.LLVMArrayTypeKind
+	PointerTypeKind   TypeKind = C.LLVMPointerTypeKind
+	VectorTypeKind    TypeKind = C.LLVMVectorTypeKind
+	MetadataTypeKind  TypeKind = C.LLVMMetadataTypeKind
+)
+
+//-------------------------------------------------------------------------
+// llvm.Linkage
+//-------------------------------------------------------------------------
+
+// Linkage values, each taken from the C enumerator of the same name with the
+// LLVM prefix.
+const (
+	ExternalLinkage            Linkage = C.LLVMExternalLinkage
+	AvailableExternallyLinkage Linkage = C.LLVMAvailableExternallyLinkage
+	LinkOnceAnyLinkage         Linkage = C.LLVMLinkOnceAnyLinkage
+	LinkOnceODRLinkage         Linkage = C.LLVMLinkOnceODRLinkage
+	WeakAnyLinkage             Linkage = C.LLVMWeakAnyLinkage
+	WeakODRLinkage             Linkage = C.LLVMWeakODRLinkage
+	AppendingLinkage           Linkage = C.LLVMAppendingLinkage
+	InternalLinkage            Linkage = C.LLVMInternalLinkage
+	PrivateLinkage             Linkage = C.LLVMPrivateLinkage
+	ExternalWeakLinkage        Linkage = C.LLVMExternalWeakLinkage
+	CommonLinkage              Linkage = C.LLVMCommonLinkage
+)
+
+//-------------------------------------------------------------------------
+// llvm.Visibility
+//-------------------------------------------------------------------------
+
+// Visibility values, each taken from the C enumerator of the same name with
+// the LLVM prefix.
+const (
+	DefaultVisibility   Visibility = C.LLVMDefaultVisibility
+	HiddenVisibility    Visibility = C.LLVMHiddenVisibility
+	ProtectedVisibility Visibility = C.LLVMProtectedVisibility
+)
+
+//-------------------------------------------------------------------------
+// llvm.CallConv
+//-------------------------------------------------------------------------
+
+// CallConv values, each taken from the C enumerator of the same name with
+// the LLVM prefix.
+const (
+	CCallConv           CallConv = C.LLVMCCallConv
+	FastCallConv        CallConv = C.LLVMFastCallConv
+	ColdCallConv        CallConv = C.LLVMColdCallConv
+	X86StdcallCallConv  CallConv = C.LLVMX86StdcallCallConv
+	X86FastcallCallConv CallConv = C.LLVMX86FastcallCallConv
+)
+
+//-------------------------------------------------------------------------
+// llvm.IntPredicate
+//-------------------------------------------------------------------------
+
+// IntPredicate values, each taken from the C enumerator of the same name
+// with the LLVM prefix. The U and S variants presumably denote unsigned and
+// signed comparisons (by LLVM naming convention).
+const (
+	IntEQ  IntPredicate = C.LLVMIntEQ
+	IntNE  IntPredicate = C.LLVMIntNE
+	IntUGT IntPredicate = C.LLVMIntUGT
+	IntUGE IntPredicate = C.LLVMIntUGE
+	IntULT IntPredicate = C.LLVMIntULT
+	IntULE IntPredicate = C.LLVMIntULE
+	IntSGT IntPredicate = C.LLVMIntSGT
+	IntSGE IntPredicate = C.LLVMIntSGE
+	IntSLT IntPredicate = C.LLVMIntSLT
+	IntSLE IntPredicate = C.LLVMIntSLE
+)
+
+//-------------------------------------------------------------------------
+// llvm.FloatPredicate
+//-------------------------------------------------------------------------
+
+// FloatPredicate values. The Go names use the prefix Float where the C
+// enumerators they are taken from use LLVMReal.
+const (
+	FloatPredicateFalse FloatPredicate = C.LLVMRealPredicateFalse
+	FloatOEQ            FloatPredicate = C.LLVMRealOEQ
+	FloatOGT            FloatPredicate = C.LLVMRealOGT
+	FloatOGE            FloatPredicate = C.LLVMRealOGE
+	FloatOLT            FloatPredicate = C.LLVMRealOLT
+	FloatOLE            FloatPredicate = C.LLVMRealOLE
+	FloatONE            FloatPredicate = C.LLVMRealONE
+	FloatORD            FloatPredicate = C.LLVMRealORD
+	FloatUNO            FloatPredicate = C.LLVMRealUNO
+	FloatUEQ            FloatPredicate = C.LLVMRealUEQ
+	FloatUGT            FloatPredicate = C.LLVMRealUGT
+	FloatUGE            FloatPredicate = C.LLVMRealUGE
+	FloatULT            FloatPredicate = C.LLVMRealULT
+	FloatULE            FloatPredicate = C.LLVMRealULE
+	FloatUNE            FloatPredicate = C.LLVMRealUNE
+	FloatPredicateTrue  FloatPredicate = C.LLVMRealPredicateTrue
+)
+
+//-------------------------------------------------------------------------
+// llvm.LandingPadClause
+//-------------------------------------------------------------------------
+
+// LandingPadClause values, each taken from the C enumerator of the same name
+// with the LLVM prefix.
+const (
+	LandingPadCatch  LandingPadClause = C.LLVMLandingPadCatch
+	LandingPadFilter LandingPadClause = C.LLVMLandingPadFilter
+)
+
+//-------------------------------------------------------------------------
+// llvm.Context
+//-------------------------------------------------------------------------
+
+// NewContext wraps the handle returned by LLVMContextCreate.
+func NewContext() Context    { return Context{C.LLVMContextCreate()} }
+// GlobalContext wraps the handle returned by LLVMGetGlobalContext.
+func GlobalContext() Context { return Context{C.LLVMGetGlobalContext()} }
+// Dispose passes the context handle to LLVMContextDispose.
+func (c Context) Dispose()   { C.LLVMContextDispose(c.C) }
+
+func (c Context) MDKindID(name string) (id int) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	id = int(C.LLVMGetMDKindIDInContext(c.C, cname, C.unsigned(len(name))))
+	return
+}
+
+func MDKindID(name string) (id int) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	id = int(C.LLVMGetMDKindID(cname, C.unsigned(len(name))))
+	return
+}
+
+//-------------------------------------------------------------------------
+// llvm.Module
+//-------------------------------------------------------------------------
+
+// Create and destroy modules.
+// See llvm::Module::Module.
+func NewModule(name string) (m Module) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	m.C = C.LLVMModuleCreateWithName(cname)
+	return
+}
+
+func (c Context) NewModule(name string) (m Module) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	m.C = C.LLVMModuleCreateWithNameInContext(cname, c.C)
+	return
+}
+
+// Dispose passes the module handle to LLVMDisposeModule.
+// See llvm::Module::~Module
+func (m Module) Dispose() { C.LLVMDisposeModule(m.C) }
+
+// Data layout. See Module::getDataLayout.
+func (m Module) DataLayout() string {
+	clayout := C.LLVMGetDataLayout(m.C)
+	return C.GoString(clayout)
+}
+
+func (m Module) SetDataLayout(layout string) {
+	clayout := C.CString(layout)
+	defer C.free(unsafe.Pointer(clayout))
+	C.LLVMSetDataLayout(m.C, clayout)
+}
+
+// Target triple. See Module::getTargetTriple.
+func (m Module) Target() string {
+	ctarget := C.LLVMGetTarget(m.C)
+	return C.GoString(ctarget)
+}
+func (m Module) SetTarget(target string) {
+	ctarget := C.CString(target)
+	defer C.free(unsafe.Pointer(ctarget))
+	C.LLVMSetTarget(m.C, ctarget)
+}
+
+func (m Module) GetTypeByName(name string) (t Type) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	t.C = C.LLVMGetTypeByName(m.C, cname)
+	return
+}
+
+// Dump calls LLVMDumpModule on the module.
+// See Module::dump.
+func (m Module) Dump() {
+	C.LLVMDumpModule(m.C)
+}
+
+func (m Module) String() string {
+	cir := C.LLVMPrintModuleToString(m.C)
+	defer C.free(unsafe.Pointer(cir))
+	ir := C.GoString(cir)
+	return ir
+}
+
+// See Module::setModuleInlineAsm.
+func (m Module) SetInlineAsm(asm string) {
+	casm := C.CString(asm)
+	defer C.free(unsafe.Pointer(casm))
+	C.LLVMSetModuleInlineAsm(m.C, casm)
+}
+
+func (m Module) AddNamedMetadataOperand(name string, operand Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	C.LLVMAddNamedMetadataOperand(m.C, cname, operand.C)
+}
+
+func (m Module) Context() (c Context) {
+	c.C = C.LLVMGetModuleContext(m.C)
+	return
+}
+
+//-------------------------------------------------------------------------
+// llvm.Type
+//-------------------------------------------------------------------------
+
+// LLVM types conform to the following hierarchy:
+//
+//   types:
+//     integer type
+//     real type
+//     function type
+//     sequence types:
+//       array type
+//       pointer type
+//       vector type
+//     void type
+//     label type
+//     opaque type
+
+// See llvm::LLVMTypeKind::getTypeID.
+func (t Type) TypeKind() TypeKind { return TypeKind(C.LLVMGetTypeKind(t.C)) }
+
+// See llvm::LLVMType::getContext.
+func (t Type) Context() (c Context) {
+	c.C = C.LLVMGetTypeContext(t.C)
+	return
+}
+
+// Operations on integer types
+func (c Context) Int1Type() (t Type)  { t.C = C.LLVMInt1TypeInContext(c.C); return }
+func (c Context) Int8Type() (t Type)  { t.C = C.LLVMInt8TypeInContext(c.C); return }
+func (c Context) Int16Type() (t Type) { t.C = C.LLVMInt16TypeInContext(c.C); return }
+func (c Context) Int32Type() (t Type) { t.C = C.LLVMInt32TypeInContext(c.C); return }
+func (c Context) Int64Type() (t Type) { t.C = C.LLVMInt64TypeInContext(c.C); return }
+func (c Context) IntType(numbits int) (t Type) {
+	t.C = C.LLVMIntTypeInContext(c.C, C.unsigned(numbits))
+	return
+}
+
+func Int1Type() (t Type)  { t.C = C.LLVMInt1Type(); return }
+func Int8Type() (t Type)  { t.C = C.LLVMInt8Type(); return }
+func Int16Type() (t Type) { t.C = C.LLVMInt16Type(); return }
+func Int32Type() (t Type) { t.C = C.LLVMInt32Type(); return }
+func Int64Type() (t Type) { t.C = C.LLVMInt64Type(); return }
+
+func IntType(numbits int) (t Type) {
+	t.C = C.LLVMIntType(C.unsigned(numbits))
+	return
+}
+
+func (t Type) IntTypeWidth() int {
+	return int(C.LLVMGetIntTypeWidth(t.C))
+}
+
+// Operations on real types
+func (c Context) FloatType() (t Type)    { t.C = C.LLVMFloatTypeInContext(c.C); return }
+func (c Context) DoubleType() (t Type)   { t.C = C.LLVMDoubleTypeInContext(c.C); return }
+func (c Context) X86FP80Type() (t Type)  { t.C = C.LLVMX86FP80TypeInContext(c.C); return }
+func (c Context) FP128Type() (t Type)    { t.C = C.LLVMFP128TypeInContext(c.C); return }
+func (c Context) PPCFP128Type() (t Type) { t.C = C.LLVMPPCFP128TypeInContext(c.C); return }
+
+func FloatType() (t Type)    { t.C = C.LLVMFloatType(); return }
+func DoubleType() (t Type)   { t.C = C.LLVMDoubleType(); return }
+func X86FP80Type() (t Type)  { t.C = C.LLVMX86FP80Type(); return }
+func FP128Type() (t Type)    { t.C = C.LLVMFP128Type(); return }
+func PPCFP128Type() (t Type) { t.C = C.LLVMPPCFP128Type(); return }
+
+// Operations on function types
+func FunctionType(returnType Type, paramTypes []Type, isVarArg bool) (t Type) {
+	var pt *C.LLVMTypeRef
+	var ptlen C.unsigned
+	if len(paramTypes) > 0 {
+		pt = llvmTypeRefPtr(&paramTypes[0])
+		ptlen = C.unsigned(len(paramTypes))
+	}
+	t.C = C.LLVMFunctionType(returnType.C,
+		pt,
+		ptlen,
+		boolToLLVMBool(isVarArg))
+	return
+}
+
+func (t Type) IsFunctionVarArg() bool { return C.LLVMIsFunctionVarArg(t.C) != 0 }
+func (t Type) ReturnType() (rt Type)  { rt.C = C.LLVMGetReturnType(t.C); return }
+func (t Type) ParamTypesCount() int   { return int(C.LLVMCountParamTypes(t.C)) }
+func (t Type) ParamTypes() []Type {
+	count := t.ParamTypesCount()
+	if count > 0 {
+		out := make([]Type, count)
+		C.LLVMGetParamTypes(t.C, llvmTypeRefPtr(&out[0]))
+		return out
+	}
+	return nil
+}
+
+// Operations on struct types
+func (c Context) StructType(elementTypes []Type, packed bool) (t Type) {
+	var pt *C.LLVMTypeRef
+	var ptlen C.unsigned
+	if len(elementTypes) > 0 {
+		pt = llvmTypeRefPtr(&elementTypes[0])
+		ptlen = C.unsigned(len(elementTypes))
+	}
+	t.C = C.LLVMStructTypeInContext(c.C,
+		pt,
+		ptlen,
+		boolToLLVMBool(packed))
+	return
+}
+
+func StructType(elementTypes []Type, packed bool) (t Type) {
+	var pt *C.LLVMTypeRef
+	var ptlen C.unsigned
+	if len(elementTypes) > 0 {
+		pt = llvmTypeRefPtr(&elementTypes[0])
+		ptlen = C.unsigned(len(elementTypes))
+	}
+	t.C = C.LLVMStructType(pt, ptlen, boolToLLVMBool(packed))
+	return
+}
+
+func (c Context) StructCreateNamed(name string) (t Type) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	t.C = C.LLVMStructCreateNamed(c.C, cname)
+	return
+}
+
+func (t Type) StructName() string {
+	return C.GoString(C.LLVMGetStructName(t.C))
+}
+
+func (t Type) StructSetBody(elementTypes []Type, packed bool) {
+	var pt *C.LLVMTypeRef
+	var ptlen C.unsigned
+	if len(elementTypes) > 0 {
+		pt = llvmTypeRefPtr(&elementTypes[0])
+		ptlen = C.unsigned(len(elementTypes))
+	}
+	C.LLVMStructSetBody(t.C, pt, ptlen, boolToLLVMBool(packed))
+}
+
+func (t Type) IsStructPacked() bool         { return C.LLVMIsPackedStruct(t.C) != 0 }
+func (t Type) StructElementTypesCount() int { return int(C.LLVMCountStructElementTypes(t.C)) }
+func (t Type) StructElementTypes() []Type {
+	out := make([]Type, t.StructElementTypesCount())
+	if len(out) > 0 {
+		C.LLVMGetStructElementTypes(t.C, llvmTypeRefPtr(&out[0]))
+	}
+	return out
+}
+
+// Operations on array, pointer, and vector types (sequence types)
+func ArrayType(elementType Type, elementCount int) (t Type) {
+	t.C = C.LLVMArrayType(elementType.C, C.unsigned(elementCount))
+	return
+}
+func PointerType(elementType Type, addressSpace int) (t Type) {
+	t.C = C.LLVMPointerType(elementType.C, C.unsigned(addressSpace))
+	return
+}
+func VectorType(elementType Type, elementCount int) (t Type) {
+	t.C = C.LLVMVectorType(elementType.C, C.unsigned(elementCount))
+	return
+}
+
+func (t Type) ElementType() (rt Type)   { rt.C = C.LLVMGetElementType(t.C); return }
+func (t Type) ArrayLength() int         { return int(C.LLVMGetArrayLength(t.C)) }
+func (t Type) PointerAddressSpace() int { return int(C.LLVMGetPointerAddressSpace(t.C)) }
+func (t Type) VectorSize() int          { return int(C.LLVMGetVectorSize(t.C)) }
+
+// Operations on other types
+func (c Context) VoidType() (t Type)  { t.C = C.LLVMVoidTypeInContext(c.C); return }
+func (c Context) LabelType() (t Type) { t.C = C.LLVMLabelTypeInContext(c.C); return }
+
+func VoidType() (t Type)  { t.C = C.LLVMVoidType(); return }
+func LabelType() (t Type) { t.C = C.LLVMLabelType(); return }
+
+//-------------------------------------------------------------------------
+// llvm.Value
+//-------------------------------------------------------------------------
+
+// Operations on all values
+func (v Value) Type() (t Type) { t.C = C.LLVMTypeOf(v.C); return }
+func (v Value) Name() string   { return C.GoString(C.LLVMGetValueName(v.C)) }
+func (v Value) SetName(name string) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	C.LLVMSetValueName(v.C, cname)
+}
+func (v Value) Dump()                       { C.LLVMDumpValue(v.C) }
+func (v Value) ReplaceAllUsesWith(nv Value) { C.LLVMReplaceAllUsesWith(v.C, nv.C) }
+func (v Value) HasMetadata() bool           { return C.LLVMHasMetadata(v.C) != 0 }
+func (v Value) Metadata(kind int) (rv Value) {
+	rv.C = C.LLVMGetMetadata(v.C, C.unsigned(kind))
+	return
+}
+func (v Value) SetMetadata(kind int, node Value) {
+	C.LLVMSetMetadata(v.C, C.unsigned(kind), node.C)
+}
+
+// Conversion functions.
+// Return the input value if it is an instance of the specified class, otherwise NULL.
+// See llvm::dyn_cast_or_null<>.
+// Each IsA* method returns a Value wrapping the raw result of the matching C.LLVMIsA* call.
+func (v Value) IsAArgument() (rv Value)   { return Value{C: C.LLVMIsAArgument(v.C)} }
+func (v Value) IsABasicBlock() (rv Value) { return Value{C: C.LLVMIsABasicBlock(v.C)} }
+func (v Value) IsAInlineAsm() (rv Value)  { return Value{C: C.LLVMIsAInlineAsm(v.C)} }
+func (v Value) IsAUser() (rv Value)       { return Value{C: C.LLVMIsAUser(v.C)} }
+func (v Value) IsAConstant() (rv Value)   { return Value{C: C.LLVMIsAConstant(v.C)} }
+func (v Value) IsAConstantAggregateZero() (rv Value) {
+	return Value{C: C.LLVMIsAConstantAggregateZero(v.C)}
+}
+func (v Value) IsAConstantArray() (rv Value)       { return Value{C: C.LLVMIsAConstantArray(v.C)} }
+func (v Value) IsAConstantExpr() (rv Value)        { return Value{C: C.LLVMIsAConstantExpr(v.C)} }
+func (v Value) IsAConstantFP() (rv Value)          { return Value{C: C.LLVMIsAConstantFP(v.C)} }
+func (v Value) IsAConstantInt() (rv Value)         { return Value{C: C.LLVMIsAConstantInt(v.C)} }
+func (v Value) IsAConstantPointerNull() (rv Value) { return Value{C: C.LLVMIsAConstantPointerNull(v.C)} }
+func (v Value) IsAConstantStruct() (rv Value)      { return Value{C: C.LLVMIsAConstantStruct(v.C)} }
+func (v Value) IsAConstantVector() (rv Value)      { return Value{C: C.LLVMIsAConstantVector(v.C)} }
+func (v Value) IsAGlobalValue() (rv Value)         { return Value{C: C.LLVMIsAGlobalValue(v.C)} }
+func (v Value) IsAFunction() (rv Value)            { return Value{C: C.LLVMIsAFunction(v.C)} }
+func (v Value) IsAGlobalAlias() (rv Value)         { return Value{C: C.LLVMIsAGlobalAlias(v.C)} }
+func (v Value) IsAGlobalVariable() (rv Value)      { return Value{C: C.LLVMIsAGlobalVariable(v.C)} }
+func (v Value) IsAUndefValue() (rv Value)          { return Value{C: C.LLVMIsAUndefValue(v.C)} }
+func (v Value) IsAInstruction() (rv Value)         { return Value{C: C.LLVMIsAInstruction(v.C)} }
+func (v Value) IsABinaryOperator() (rv Value)      { return Value{C: C.LLVMIsABinaryOperator(v.C)} }
+func (v Value) IsACallInst() (rv Value)            { return Value{C: C.LLVMIsACallInst(v.C)} }
+func (v Value) IsAIntrinsicInst() (rv Value)       { return Value{C: C.LLVMIsAIntrinsicInst(v.C)} }
+func (v Value) IsADbgInfoIntrinsic() (rv Value)    { return Value{C: C.LLVMIsADbgInfoIntrinsic(v.C)} }
+func (v Value) IsADbgDeclareInst() (rv Value)      { return Value{C: C.LLVMIsADbgDeclareInst(v.C)} }
+func (v Value) IsAMemIntrinsic() (rv Value)        { return Value{C: C.LLVMIsAMemIntrinsic(v.C)} }
+func (v Value) IsAMemCpyInst() (rv Value)          { return Value{C: C.LLVMIsAMemCpyInst(v.C)} }
+func (v Value) IsAMemMoveInst() (rv Value)         { return Value{C: C.LLVMIsAMemMoveInst(v.C)} }
+func (v Value) IsAMemSetInst() (rv Value)          { return Value{C: C.LLVMIsAMemSetInst(v.C)} }
+func (v Value) IsACmpInst() (rv Value)             { return Value{C: C.LLVMIsACmpInst(v.C)} }
+func (v Value) IsAFCmpInst() (rv Value)            { return Value{C: C.LLVMIsAFCmpInst(v.C)} }
+func (v Value) IsAICmpInst() (rv Value)            { return Value{C: C.LLVMIsAICmpInst(v.C)} }
+func (v Value) IsAExtractElementInst() (rv Value)  { return Value{C: C.LLVMIsAExtractElementInst(v.C)} }
+func (v Value) IsAGetElementPtrInst() (rv Value)   { return Value{C: C.LLVMIsAGetElementPtrInst(v.C)} }
+func (v Value) IsAInsertElementInst() (rv Value)   { return Value{C: C.LLVMIsAInsertElementInst(v.C)} }
+func (v Value) IsAInsertValueInst() (rv Value)     { return Value{C: C.LLVMIsAInsertValueInst(v.C)} }
+func (v Value) IsAPHINode() (rv Value)             { return Value{C: C.LLVMIsAPHINode(v.C)} }
+func (v Value) IsASelectInst() (rv Value)          { return Value{C: C.LLVMIsASelectInst(v.C)} }
+func (v Value) IsAShuffleVectorInst() (rv Value)   { return Value{C: C.LLVMIsAShuffleVectorInst(v.C)} }
+func (v Value) IsAStoreInst() (rv Value)           { return Value{C: C.LLVMIsAStoreInst(v.C)} }
+func (v Value) IsATerminatorInst() (rv Value)      { return Value{C: C.LLVMIsATerminatorInst(v.C)} }
+func (v Value) IsABranchInst() (rv Value)          { return Value{C: C.LLVMIsABranchInst(v.C)} }
+func (v Value) IsAInvokeInst() (rv Value)          { return Value{C: C.LLVMIsAInvokeInst(v.C)} }
+func (v Value) IsAReturnInst() (rv Value)          { return Value{C: C.LLVMIsAReturnInst(v.C)} }
+func (v Value) IsASwitchInst() (rv Value)          { return Value{C: C.LLVMIsASwitchInst(v.C)} }
+func (v Value) IsAUnreachableInst() (rv Value)     { return Value{C: C.LLVMIsAUnreachableInst(v.C)} }
+func (v Value) IsAUnaryInstruction() (rv Value)    { return Value{C: C.LLVMIsAUnaryInstruction(v.C)} }
+func (v Value) IsAAllocaInst() (rv Value)          { return Value{C: C.LLVMIsAAllocaInst(v.C)} }
+func (v Value) IsACastInst() (rv Value)            { return Value{C: C.LLVMIsACastInst(v.C)} }
+func (v Value) IsABitCastInst() (rv Value)         { return Value{C: C.LLVMIsABitCastInst(v.C)} }
+func (v Value) IsAFPExtInst() (rv Value)           { return Value{C: C.LLVMIsAFPExtInst(v.C)} }
+func (v Value) IsAFPToSIInst() (rv Value)          { return Value{C: C.LLVMIsAFPToSIInst(v.C)} }
+func (v Value) IsAFPToUIInst() (rv Value)          { return Value{C: C.LLVMIsAFPToUIInst(v.C)} }
+func (v Value) IsAFPTruncInst() (rv Value)         { return Value{C: C.LLVMIsAFPTruncInst(v.C)} }
+func (v Value) IsAIntToPtrInst() (rv Value)        { return Value{C: C.LLVMIsAIntToPtrInst(v.C)} }
+func (v Value) IsAPtrToIntInst() (rv Value)        { return Value{C: C.LLVMIsAPtrToIntInst(v.C)} }
+func (v Value) IsASExtInst() (rv Value)            { return Value{C: C.LLVMIsASExtInst(v.C)} }
+func (v Value) IsASIToFPInst() (rv Value)          { return Value{C: C.LLVMIsASIToFPInst(v.C)} }
+func (v Value) IsATruncInst() (rv Value)           { return Value{C: C.LLVMIsATruncInst(v.C)} }
+func (v Value) IsAUIToFPInst() (rv Value)          { return Value{C: C.LLVMIsAUIToFPInst(v.C)} }
+func (v Value) IsAZExtInst() (rv Value)            { return Value{C: C.LLVMIsAZExtInst(v.C)} }
+func (v Value) IsAExtractValueInst() (rv Value)    { return Value{C: C.LLVMIsAExtractValueInst(v.C)} }
+func (v Value) IsALoadInst() (rv Value)            { return Value{C: C.LLVMIsALoadInst(v.C)} }
+func (v Value) IsAVAArgInst() (rv Value)           { return Value{C: C.LLVMIsAVAArgInst(v.C)} }
+
+// Operations on Uses
+// FirstUse wraps C.LLVMGetFirstUse.
+func (v Value) FirstUse() (u Use) { return Use{C: C.LLVMGetFirstUse(v.C)} }
+
+// NextUse wraps C.LLVMGetNextUse.
+func (u Use) NextUse() (ru Use) { return Use{C: C.LLVMGetNextUse(u.C)} }
+
+// User wraps C.LLVMGetUser.
+func (u Use) User() (v Value) { return Value{C: C.LLVMGetUser(u.C)} }
+
+// UsedValue wraps C.LLVMGetUsedValue.
+func (u Use) UsedValue() (v Value) { return Value{C: C.LLVMGetUsedValue(u.C)} }
+
+// Operations on Users
+// Operand wraps C.LLVMGetOperand; i is converted to C.unsigned.
+func (v Value) Operand(i int) (rv Value) {
+	return Value{C: C.LLVMGetOperand(v.C, C.unsigned(i))}
+}
+
+// SetOperand wraps C.LLVMSetOperand; i is converted to C.unsigned.
+func (v Value) SetOperand(i int, op Value) {
+	C.LLVMSetOperand(v.C, C.unsigned(i), op.C)
+}
+
+// OperandsCount converts the result of C.LLVMGetNumOperands to int.
+func (v Value) OperandsCount() int {
+	count := C.LLVMGetNumOperands(v.C)
+	return int(count)
+}
+
+// Operations on constants of any type
+// ConstNull wraps C.LLVMConstNull.
+func ConstNull(t Type) (v Value) { return Value{C: C.LLVMConstNull(t.C)} }
+
+// ConstAllOnes wraps C.LLVMConstAllOnes.
+func ConstAllOnes(t Type) (v Value) { return Value{C: C.LLVMConstAllOnes(t.C)} }
+
+// Undef wraps C.LLVMGetUndef.
+func Undef(t Type) (v Value) { return Value{C: C.LLVMGetUndef(t.C)} }
+
+// IsConstant reports whether C.LLVMIsConstant returns non-zero for v.
+func (v Value) IsConstant() bool { return C.LLVMIsConstant(v.C) != 0 }
+
+// IsNull reports whether C.LLVMIsNull returns non-zero for v.
+func (v Value) IsNull() bool { return C.LLVMIsNull(v.C) != 0 }
+
+// IsUndef reports whether C.LLVMIsUndef returns non-zero for v.
+func (v Value) IsUndef() bool { return C.LLVMIsUndef(v.C) != 0 }
+
+// ConstPointerNull wraps C.LLVMConstPointerNull.
+func ConstPointerNull(t Type) (v Value) { return Value{C: C.LLVMConstPointerNull(t.C)} }
+
+// Operations on metadata
+// MDString copies str into a temporary C string, passes it together with
+// len(str) to C.LLVMMDStringInContext for c, and frees the copy on return.
+func (c Context) MDString(str string) (v Value) {
+	text := C.CString(str)
+	defer C.free(unsafe.Pointer(text))
+	return Value{C: C.LLVMMDStringInContext(c.C, text, C.unsigned(len(str)))}
+}
+
+// MDString copies str into a temporary C string, passes it together with
+// len(str) to C.LLVMMDString, and frees the copy on return.
+func MDString(str string) (v Value) {
+	text := C.CString(str)
+	defer C.free(unsafe.Pointer(text))
+	return Value{C: C.LLVMMDString(text, C.unsigned(len(str)))}
+}
+
+// MDNode passes the pointer/count pair produced by llvmValueRefs(vals) to
+// C.LLVMMDNodeInContext for c.
+func (c Context) MDNode(vals []Value) (v Value) {
+	elems, count := llvmValueRefs(vals)
+	return Value{C: C.LLVMMDNodeInContext(c.C, elems, count)}
+}
+
+// MDNode passes the pointer/count pair produced by llvmValueRefs(vals) to
+// C.LLVMMDNode.
+func MDNode(vals []Value) (v Value) {
+	elems, count := llvmValueRefs(vals)
+	return Value{C: C.LLVMMDNode(elems, count)}
+}
+
+// Operations on scalar constants
+// ConstInt wraps C.LLVMConstInt; signExtend is converted with boolToLLVMBool.
+func ConstInt(t Type, n uint64, signExtend bool) (v Value) {
+	return Value{C: C.LLVMConstInt(t.C, C.ulonglong(n), boolToLLVMBool(signExtend))}
+}
+
+// ConstIntFromString wraps C.LLVMConstIntOfString. str is copied to a
+// temporary C string that is freed on return; radix is converted to C.uint8_t.
+func ConstIntFromString(t Type, str string, radix int) (v Value) {
+	digits := C.CString(str)
+	defer C.free(unsafe.Pointer(digits))
+	return Value{C: C.LLVMConstIntOfString(t.C, digits, C.uint8_t(radix))}
+}
+
+// ConstFloat wraps C.LLVMConstReal; n is converted to C.double.
+func ConstFloat(t Type, n float64) (v Value) {
+	return Value{C: C.LLVMConstReal(t.C, C.double(n))}
+}
+
+// ConstFloatFromString wraps C.LLVMConstRealOfString. str is copied to a
+// temporary C string that is freed on return.
+func ConstFloatFromString(t Type, str string) (v Value) {
+	digits := C.CString(str)
+	defer C.free(unsafe.Pointer(digits))
+	return Value{C: C.LLVMConstRealOfString(t.C, digits)}
+}
+
+// ZExtValue converts the result of C.LLVMConstIntGetZExtValue to uint64.
+func (v Value) ZExtValue() uint64 {
+	raw := C.LLVMConstIntGetZExtValue(v.C)
+	return uint64(raw)
+}
+
+// SExtValue converts the result of C.LLVMConstIntGetSExtValue to int64.
+func (v Value) SExtValue() int64 {
+	raw := C.LLVMConstIntGetSExtValue(v.C)
+	return int64(raw)
+}
+
+// Operations on composite constants
+// ConstString copies str to a temporary C string (freed on return) and calls
+// C.LLVMConstStringInContext for c with len(str). The final C argument is the
+// negation of addnull.
+func (c Context) ConstString(str string, addnull bool) (v Value) {
+	text := C.CString(str)
+	defer C.free(unsafe.Pointer(text))
+	size := C.unsigned(len(str))
+	return Value{C: C.LLVMConstStringInContext(c.C, text, size, boolToLLVMBool(!addnull))}
+}
+
+// ConstStruct wraps C.LLVMConstStructInContext for c, using the pointer/count
+// pair produced by llvmValueRefs(constVals).
+func (c Context) ConstStruct(constVals []Value, packed bool) (v Value) {
+	elems, count := llvmValueRefs(constVals)
+	return Value{C: C.LLVMConstStructInContext(c.C, elems, count, boolToLLVMBool(packed))}
+}
+
+// ConstNamedStruct wraps C.LLVMConstNamedStruct for struct type t.
+func ConstNamedStruct(t Type, constVals []Value) (v Value) {
+	elems, count := llvmValueRefs(constVals)
+	return Value{C: C.LLVMConstNamedStruct(t.C, elems, count)}
+}
+
+// ConstString copies str to a temporary C string (freed on return) and calls
+// C.LLVMConstString with len(str). The final C argument is the negation of
+// addnull.
+func ConstString(str string, addnull bool) (v Value) {
+	text := C.CString(str)
+	defer C.free(unsafe.Pointer(text))
+	size := C.unsigned(len(str))
+	return Value{C: C.LLVMConstString(text, size, boolToLLVMBool(!addnull))}
+}
+
+// ConstArray wraps C.LLVMConstArray; t is passed as the first C argument.
+func ConstArray(t Type, constVals []Value) (v Value) {
+	elems, count := llvmValueRefs(constVals)
+	return Value{C: C.LLVMConstArray(t.C, elems, count)}
+}
+
+// ConstStruct wraps C.LLVMConstStruct.
+func ConstStruct(constVals []Value, packed bool) (v Value) {
+	elems, count := llvmValueRefs(constVals)
+	return Value{C: C.LLVMConstStruct(elems, count, boolToLLVMBool(packed))}
+}
+
+// ConstVector wraps C.LLVMConstVector. The packed parameter is accepted but
+// never read by this wrapper.
+func ConstVector(scalarConstVals []Value, packed bool) (v Value) {
+	elems, count := llvmValueRefs(scalarConstVals)
+	return Value{C: C.LLVMConstVector(elems, count)}
+}
+
+// Constant expressions
+// Opcode converts the result of C.LLVMGetConstOpcode to Opcode.
+func (v Value) Opcode() Opcode { return Opcode(C.LLVMGetConstOpcode(v.C)) }
+
+// InstructionOpcode converts the result of C.LLVMGetInstructionOpcode to Opcode.
+func (v Value) InstructionOpcode() Opcode { return Opcode(C.LLVMGetInstructionOpcode(v.C)) }
+
+// The functions below each wrap the C.LLVM* function of the same name and
+// return its result as a Value.
+func AlignOf(t Type) (v Value)                { return Value{C: C.LLVMAlignOf(t.C)} }
+func SizeOf(t Type) (v Value)                 { return Value{C: C.LLVMSizeOf(t.C)} }
+func ConstNeg(v Value) (rv Value)             { return Value{C: C.LLVMConstNeg(v.C)} }
+func ConstNSWNeg(v Value) (rv Value)          { return Value{C: C.LLVMConstNSWNeg(v.C)} }
+func ConstNUWNeg(v Value) (rv Value)          { return Value{C: C.LLVMConstNUWNeg(v.C)} }
+func ConstFNeg(v Value) (rv Value)            { return Value{C: C.LLVMConstFNeg(v.C)} }
+func ConstNot(v Value) (rv Value)             { return Value{C: C.LLVMConstNot(v.C)} }
+func ConstAdd(lhs, rhs Value) (v Value)       { return Value{C: C.LLVMConstAdd(lhs.C, rhs.C)} }
+func ConstNSWAdd(lhs, rhs Value) (v Value)    { return Value{C: C.LLVMConstNSWAdd(lhs.C, rhs.C)} }
+func ConstNUWAdd(lhs, rhs Value) (v Value)    { return Value{C: C.LLVMConstNUWAdd(lhs.C, rhs.C)} }
+func ConstFAdd(lhs, rhs Value) (v Value)      { return Value{C: C.LLVMConstFAdd(lhs.C, rhs.C)} }
+func ConstSub(lhs, rhs Value) (v Value)       { return Value{C: C.LLVMConstSub(lhs.C, rhs.C)} }
+func ConstNSWSub(lhs, rhs Value) (v Value)    { return Value{C: C.LLVMConstNSWSub(lhs.C, rhs.C)} }
+func ConstNUWSub(lhs, rhs Value) (v Value)    { return Value{C: C.LLVMConstNUWSub(lhs.C, rhs.C)} }
+func ConstFSub(lhs, rhs Value) (v Value)      { return Value{C: C.LLVMConstFSub(lhs.C, rhs.C)} }
+func ConstMul(lhs, rhs Value) (v Value)       { return Value{C: C.LLVMConstMul(lhs.C, rhs.C)} }
+func ConstNSWMul(lhs, rhs Value) (v Value)    { return Value{C: C.LLVMConstNSWMul(lhs.C, rhs.C)} }
+func ConstNUWMul(lhs, rhs Value) (v Value)    { return Value{C: C.LLVMConstNUWMul(lhs.C, rhs.C)} }
+func ConstFMul(lhs, rhs Value) (v Value)      { return Value{C: C.LLVMConstFMul(lhs.C, rhs.C)} }
+func ConstUDiv(lhs, rhs Value) (v Value)      { return Value{C: C.LLVMConstUDiv(lhs.C, rhs.C)} }
+func ConstSDiv(lhs, rhs Value) (v Value)      { return Value{C: C.LLVMConstSDiv(lhs.C, rhs.C)} }
+func ConstExactSDiv(lhs, rhs Value) (v Value) { return Value{C: C.LLVMConstExactSDiv(lhs.C, rhs.C)} }
+func ConstFDiv(lhs, rhs Value) (v Value)      { return Value{C: C.LLVMConstFDiv(lhs.C, rhs.C)} }
+func ConstURem(lhs, rhs Value) (v Value)      { return Value{C: C.LLVMConstURem(lhs.C, rhs.C)} }
+func ConstSRem(lhs, rhs Value) (v Value)      { return Value{C: C.LLVMConstSRem(lhs.C, rhs.C)} }
+func ConstFRem(lhs, rhs Value) (v Value)      { return Value{C: C.LLVMConstFRem(lhs.C, rhs.C)} }
+func ConstAnd(lhs, rhs Value) (v Value)       { return Value{C: C.LLVMConstAnd(lhs.C, rhs.C)} }
+func ConstOr(lhs, rhs Value) (v Value)        { return Value{C: C.LLVMConstOr(lhs.C, rhs.C)} }
+func ConstXor(lhs, rhs Value) (v Value)       { return Value{C: C.LLVMConstXor(lhs.C, rhs.C)} }
+
+// ConstICmp wraps C.LLVMConstICmp; pred is converted to C.LLVMIntPredicate.
+func ConstICmp(pred IntPredicate, lhs, rhs Value) (v Value) {
+	return Value{C: C.LLVMConstICmp(C.LLVMIntPredicate(pred), lhs.C, rhs.C)}
+}
+
+// ConstFCmp wraps C.LLVMConstFCmp; pred is converted to C.LLVMRealPredicate.
+func ConstFCmp(pred FloatPredicate, lhs, rhs Value) (v Value) {
+	return Value{C: C.LLVMConstFCmp(C.LLVMRealPredicate(pred), lhs.C, rhs.C)}
+}
+
+// ConstShl wraps C.LLVMConstShl.
+func ConstShl(lhs, rhs Value) (v Value) { return Value{C: C.LLVMConstShl(lhs.C, rhs.C)} }
+
+// ConstLShr wraps C.LLVMConstLShr.
+func ConstLShr(lhs, rhs Value) (v Value) { return Value{C: C.LLVMConstLShr(lhs.C, rhs.C)} }
+
+// ConstAShr wraps C.LLVMConstAShr.
+func ConstAShr(lhs, rhs Value) (v Value) { return Value{C: C.LLVMConstAShr(lhs.C, rhs.C)} }
+
+// ConstGEP wraps C.LLVMConstGEP, using the pointer/count pair produced by
+// llvmValueRefs(indices).
+func ConstGEP(v Value, indices []Value) (rv Value) {
+	idx, count := llvmValueRefs(indices)
+	return Value{C: C.LLVMConstGEP(v.C, idx, count)}
+}
+
+// ConstInBoundsGEP wraps C.LLVMConstInBoundsGEP, using the pointer/count pair
+// produced by llvmValueRefs(indices).
+func ConstInBoundsGEP(v Value, indices []Value) (rv Value) {
+	idx, count := llvmValueRefs(indices)
+	return Value{C: C.LLVMConstInBoundsGEP(v.C, idx, count)}
+}
+
+// The cast helpers below each wrap the C.LLVMConst* function of the same name,
+// passing v and t through unchanged.
+func ConstTrunc(v Value, t Type) (rv Value)         { return Value{C: C.LLVMConstTrunc(v.C, t.C)} }
+func ConstSExt(v Value, t Type) (rv Value)          { return Value{C: C.LLVMConstSExt(v.C, t.C)} }
+func ConstZExt(v Value, t Type) (rv Value)          { return Value{C: C.LLVMConstZExt(v.C, t.C)} }
+func ConstFPTrunc(v Value, t Type) (rv Value)       { return Value{C: C.LLVMConstFPTrunc(v.C, t.C)} }
+func ConstFPExt(v Value, t Type) (rv Value)         { return Value{C: C.LLVMConstFPExt(v.C, t.C)} }
+func ConstUIToFP(v Value, t Type) (rv Value)        { return Value{C: C.LLVMConstUIToFP(v.C, t.C)} }
+func ConstSIToFP(v Value, t Type) (rv Value)        { return Value{C: C.LLVMConstSIToFP(v.C, t.C)} }
+func ConstFPToUI(v Value, t Type) (rv Value)        { return Value{C: C.LLVMConstFPToUI(v.C, t.C)} }
+func ConstFPToSI(v Value, t Type) (rv Value)        { return Value{C: C.LLVMConstFPToSI(v.C, t.C)} }
+func ConstPtrToInt(v Value, t Type) (rv Value)      { return Value{C: C.LLVMConstPtrToInt(v.C, t.C)} }
+func ConstIntToPtr(v Value, t Type) (rv Value)      { return Value{C: C.LLVMConstIntToPtr(v.C, t.C)} }
+func ConstBitCast(v Value, t Type) (rv Value)       { return Value{C: C.LLVMConstBitCast(v.C, t.C)} }
+func ConstZExtOrBitCast(v Value, t Type) (rv Value) { return Value{C: C.LLVMConstZExtOrBitCast(v.C, t.C)} }
+func ConstSExtOrBitCast(v Value, t Type) (rv Value) { return Value{C: C.LLVMConstSExtOrBitCast(v.C, t.C)} }
+func ConstTruncOrBitCast(v Value, t Type) (rv Value) {
+	return Value{C: C.LLVMConstTruncOrBitCast(v.C, t.C)}
+}
+func ConstPointerCast(v Value, t Type) (rv Value) { return Value{C: C.LLVMConstPointerCast(v.C, t.C)} }
+
+// ConstIntCast wraps C.LLVMConstIntCast; signed is converted with boolToLLVMBool.
+func ConstIntCast(v Value, t Type, signed bool) (rv Value) {
+	return Value{C: C.LLVMConstIntCast(v.C, t.C, boolToLLVMBool(signed))}
+}
+func ConstFPCast(v Value, t Type) (rv Value) { return Value{C: C.LLVMConstFPCast(v.C, t.C)} }
+
+// ConstSelect wraps C.LLVMConstSelect.
+func ConstSelect(cond, iftrue, iffalse Value) (rv Value) {
+	return Value{C: C.LLVMConstSelect(cond.C, iftrue.C, iffalse.C)}
+}
+
+// ConstExtractElement wraps C.LLVMConstExtractElement.
+func ConstExtractElement(vec, i Value) (rv Value) {
+	return Value{C: C.LLVMConstExtractElement(vec.C, i.C)}
+}
+
+// ConstInsertElement wraps C.LLVMConstInsertElement.
+func ConstInsertElement(vec, elem, i Value) (rv Value) {
+	return Value{C: C.LLVMConstInsertElement(vec.C, elem.C, i.C)}
+}
+
+// ConstShuffleVector wraps C.LLVMConstShuffleVector.
+func ConstShuffleVector(veca, vecb, mask Value) (rv Value) {
+	return Value{C: C.LLVMConstShuffleVector(veca.C, vecb.C, mask.C)}
+}
+
+// C prototype wrapped by ConstExtractValue:
+// LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList,
+//                                    unsigned NumIdx);
+
+// ConstExtractValue wraps C.LLVMConstExtractValue. It panics when indices is
+// empty; otherwise the address of indices[0] and len(indices) are handed to C.
+func ConstExtractValue(agg Value, indices []uint32) (rv Value) {
+	if len(indices) == 0 {
+		panic("one or more indices are required")
+	}
+	first := (*C.unsigned)(&indices[0])
+	return Value{C: C.LLVMConstExtractValue(agg.C, first, C.unsigned(len(indices)))}
+}
+
+// ConstInsertValue wraps C.LLVMConstInsertValue. It panics when indices is
+// empty; otherwise the address of indices[0] and len(indices) are handed to C.
+func ConstInsertValue(agg, val Value, indices []uint32) (rv Value) {
+	if len(indices) == 0 {
+		panic("one or more indices are required")
+	}
+	first := (*C.unsigned)(&indices[0])
+	return Value{C: C.LLVMConstInsertValue(agg.C, val.C, first, C.unsigned(len(indices)))}
+}
+
+// BlockAddress wraps C.LLVMBlockAddress for function value f and block bb.
+func BlockAddress(f Value, bb BasicBlock) (v Value) {
+	return Value{C: C.LLVMBlockAddress(f.C, bb.C)}
+}
+
+// Operations on global variables, functions, and aliases (globals)
+// GlobalParent wraps C.LLVMGetGlobalParent.
+func (v Value) GlobalParent() (m Module) { return Module{C: C.LLVMGetGlobalParent(v.C)} }
+
+// IsDeclaration reports whether C.LLVMIsDeclaration returns non-zero for v.
+func (v Value) IsDeclaration() bool { return C.LLVMIsDeclaration(v.C) != 0 }
+
+// Linkage converts the result of C.LLVMGetLinkage to Linkage.
+func (v Value) Linkage() Linkage { return Linkage(C.LLVMGetLinkage(v.C)) }
+
+// SetLinkage wraps C.LLVMSetLinkage; l is converted to C.LLVMLinkage.
+func (v Value) SetLinkage(l Linkage) { C.LLVMSetLinkage(v.C, C.LLVMLinkage(l)) }
+
+// Section copies the C string returned by C.LLVMGetSection into a Go string.
+func (v Value) Section() string { return C.GoString(C.LLVMGetSection(v.C)) }
+
+// SetSection wraps C.LLVMSetSection. str is copied to a temporary C string
+// that is freed on return.
+func (v Value) SetSection(str string) {
+	section := C.CString(str)
+	defer C.free(unsafe.Pointer(section))
+	C.LLVMSetSection(v.C, section)
+}
+
+// Visibility converts the result of C.LLVMGetVisibility to Visibility.
+func (v Value) Visibility() Visibility { return Visibility(C.LLVMGetVisibility(v.C)) }
+
+// SetVisibility wraps C.LLVMSetVisibility; vi is converted to C.LLVMVisibility.
+func (v Value) SetVisibility(vi Visibility) { C.LLVMSetVisibility(v.C, C.LLVMVisibility(vi)) }
+
+// Alignment converts the result of C.LLVMGetAlignment to int.
+func (v Value) Alignment() int { return int(C.LLVMGetAlignment(v.C)) }
+
+// SetAlignment wraps C.LLVMSetAlignment; a is converted to C.unsigned.
+func (v Value) SetAlignment(a int) { C.LLVMSetAlignment(v.C, C.unsigned(a)) }
+
+// SetUnnamedAddr wraps C.LLVMSetUnnamedAddr; ua is converted with boolToLLVMBool.
+func (v Value) SetUnnamedAddr(ua bool) { C.LLVMSetUnnamedAddr(v.C, boolToLLVMBool(ua)) }
+
+// Operations on global variables
+// AddGlobal wraps C.LLVMAddGlobal. name is copied to a temporary C string
+// that is freed on return.
+func AddGlobal(m Module, t Type, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMAddGlobal(m.C, t.C, label)}
+}
+
+// AddGlobalInAddressSpace wraps C.LLVMAddGlobalInAddressSpace. name is copied
+// to a temporary C string that is freed on return; addressSpace is converted
+// to C.unsigned.
+func AddGlobalInAddressSpace(m Module, t Type, name string, addressSpace int) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMAddGlobalInAddressSpace(m.C, t.C, label, C.unsigned(addressSpace))}
+}
+
+// NamedGlobal wraps C.LLVMGetNamedGlobal. name is copied to a temporary C
+// string that is freed on return.
+func (m Module) NamedGlobal(name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMGetNamedGlobal(m.C, label)}
+}
+
+// FirstGlobal wraps C.LLVMGetFirstGlobal.
+func (m Module) FirstGlobal() (v Value) { return Value{C: C.LLVMGetFirstGlobal(m.C)} }
+
+// LastGlobal wraps C.LLVMGetLastGlobal.
+func (m Module) LastGlobal() (v Value) { return Value{C: C.LLVMGetLastGlobal(m.C)} }
+
+// NextGlobal wraps C.LLVMGetNextGlobal.
+func NextGlobal(v Value) (rv Value) { return Value{C: C.LLVMGetNextGlobal(v.C)} }
+
+// PrevGlobal wraps C.LLVMGetPreviousGlobal.
+func PrevGlobal(v Value) (rv Value) { return Value{C: C.LLVMGetPreviousGlobal(v.C)} }
+
+// EraseFromParentAsGlobal wraps C.LLVMDeleteGlobal.
+func (v Value) EraseFromParentAsGlobal() { C.LLVMDeleteGlobal(v.C) }
+
+// Initializer wraps C.LLVMGetInitializer.
+func (v Value) Initializer() (rv Value) { return Value{C: C.LLVMGetInitializer(v.C)} }
+
+// SetInitializer wraps C.LLVMSetInitializer.
+func (v Value) SetInitializer(cv Value) { C.LLVMSetInitializer(v.C, cv.C) }
+
+// IsThreadLocal reports whether C.LLVMIsThreadLocal returns non-zero for v.
+func (v Value) IsThreadLocal() bool { return C.LLVMIsThreadLocal(v.C) != 0 }
+
+// SetThreadLocal wraps C.LLVMSetThreadLocal; tl is converted with boolToLLVMBool.
+func (v Value) SetThreadLocal(tl bool) { C.LLVMSetThreadLocal(v.C, boolToLLVMBool(tl)) }
+
+// IsGlobalConstant reports whether C.LLVMIsGlobalConstant returns non-zero for v.
+func (v Value) IsGlobalConstant() bool { return C.LLVMIsGlobalConstant(v.C) != 0 }
+
+// SetGlobalConstant wraps C.LLVMSetGlobalConstant; gc is converted with boolToLLVMBool.
+func (v Value) SetGlobalConstant(gc bool) { C.LLVMSetGlobalConstant(v.C, boolToLLVMBool(gc)) }
+
+// Operations on aliases
+// AddAlias wraps C.LLVMAddAlias. name is copied to a temporary C string that
+// is freed on return.
+func AddAlias(m Module, t Type, aliasee Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMAddAlias(m.C, t.C, aliasee.C, label)}
+}
+
+// Operations on functions
+
+// AddFunction wraps C.LLVMAddFunction. name is copied to a temporary C string
+// that is freed on return.
+func AddFunction(m Module, name string, ft Type) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMAddFunction(m.C, label, ft.C)}
+}
+
+// NamedFunction wraps C.LLVMGetNamedFunction. name is copied to a temporary C
+// string that is freed on return.
+func (m Module) NamedFunction(name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMGetNamedFunction(m.C, label)}
+}
+
+// FirstFunction wraps C.LLVMGetFirstFunction.
+func (m Module) FirstFunction() (v Value) { return Value{C: C.LLVMGetFirstFunction(m.C)} }
+
+// LastFunction wraps C.LLVMGetLastFunction.
+func (m Module) LastFunction() (v Value) { return Value{C: C.LLVMGetLastFunction(m.C)} }
+
+// NextFunction wraps C.LLVMGetNextFunction.
+func NextFunction(v Value) (rv Value) { return Value{C: C.LLVMGetNextFunction(v.C)} }
+
+// PrevFunction wraps C.LLVMGetPreviousFunction.
+func PrevFunction(v Value) (rv Value) { return Value{C: C.LLVMGetPreviousFunction(v.C)} }
+
+// EraseFromParentAsFunction wraps C.LLVMDeleteFunction.
+func (v Value) EraseFromParentAsFunction() { C.LLVMDeleteFunction(v.C) }
+
+// IntrinsicID converts the result of C.LLVMGetIntrinsicID to int.
+func (v Value) IntrinsicID() int { return int(C.LLVMGetIntrinsicID(v.C)) }
+
+// FunctionCallConv converts the result of C.LLVMGetFunctionCallConv to
+// CallConv by way of C.LLVMCallConv.
+func (v Value) FunctionCallConv() CallConv {
+	raw := C.LLVMGetFunctionCallConv(v.C)
+	return CallConv(C.LLVMCallConv(raw))
+}
+
+// SetFunctionCallConv wraps C.LLVMSetFunctionCallConv; cc is converted to C.unsigned.
+func (v Value) SetFunctionCallConv(cc CallConv) { C.LLVMSetFunctionCallConv(v.C, C.unsigned(cc)) }
+
+// GC copies the C string returned by C.LLVMGetGC into a Go string.
+func (v Value) GC() string { return C.GoString(C.LLVMGetGC(v.C)) }
+
+// SetGC wraps C.LLVMSetGC. name is copied to a temporary C string that is
+// freed on return.
+func (v Value) SetGC(name string) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	C.LLVMSetGC(v.C, label)
+}
+
+// AddFunctionAttr wraps C.LLVMAddFunctionAttr2; a is converted to C.uint64_t.
+func (v Value) AddFunctionAttr(a Attribute) { C.LLVMAddFunctionAttr2(v.C, C.uint64_t(a)) }
+
+// FunctionAttr converts the result of C.LLVMGetFunctionAttr2 to Attribute.
+func (v Value) FunctionAttr() Attribute { return Attribute(C.LLVMGetFunctionAttr2(v.C)) }
+
+// RemoveFunctionAttr wraps C.LLVMRemoveFunctionAttr2; a is converted to C.uint64_t.
+func (v Value) RemoveFunctionAttr(a Attribute) { C.LLVMRemoveFunctionAttr2(v.C, C.uint64_t(a)) }
+
+// AddTargetDependentFunctionAttr wraps C.LLVMAddTargetDependentFunctionAttr.
+// Both strings are copied to temporary C strings that are freed on return.
+func (v Value) AddTargetDependentFunctionAttr(attr, value string) {
+	key := C.CString(attr)
+	defer C.free(unsafe.Pointer(key))
+	val := C.CString(value)
+	defer C.free(unsafe.Pointer(val))
+	C.LLVMAddTargetDependentFunctionAttr(v.C, key, val)
+}
+
+// Operations on parameters
+// ParamsCount converts the result of C.LLVMCountParams to int.
+func (v Value) ParamsCount() int {
+	count := C.LLVMCountParams(v.C)
+	return int(count)
+}
+
+// Params returns a slice of ParamsCount() values filled in by C.LLVMGetParams.
+// The C call is skipped when the count is zero.
+func (v Value) Params() []Value {
+	params := make([]Value, v.ParamsCount())
+	if len(params) == 0 {
+		return params
+	}
+	C.LLVMGetParams(v.C, llvmValueRefPtr(&params[0]))
+	return params
+}
+
+// Param wraps C.LLVMGetParam; i is converted to C.unsigned.
+func (v Value) Param(i int) (rv Value) { return Value{C: C.LLVMGetParam(v.C, C.unsigned(i))} }
+
+// ParamParent wraps C.LLVMGetParamParent.
+func (v Value) ParamParent() (rv Value) { return Value{C: C.LLVMGetParamParent(v.C)} }
+
+// FirstParam wraps C.LLVMGetFirstParam.
+func (v Value) FirstParam() (rv Value) { return Value{C: C.LLVMGetFirstParam(v.C)} }
+
+// LastParam wraps C.LLVMGetLastParam.
+func (v Value) LastParam() (rv Value) { return Value{C: C.LLVMGetLastParam(v.C)} }
+
+// NextParam wraps C.LLVMGetNextParam.
+func NextParam(v Value) (rv Value) { return Value{C: C.LLVMGetNextParam(v.C)} }
+
+// PrevParam wraps C.LLVMGetPreviousParam.
+func PrevParam(v Value) (rv Value) { return Value{C: C.LLVMGetPreviousParam(v.C)} }
+
+// AddAttribute wraps C.LLVMAddAttribute. It panics for attribute values of
+// 1<<32 or above.
+func (v Value) AddAttribute(a Attribute) {
+	if a >= 1<<32 {
+		panic("attribute value currently unsupported")
+	}
+	attr := C.LLVMAttribute(a)
+	C.LLVMAddAttribute(v.C, attr)
+}
+
+// RemoveAttribute wraps C.LLVMRemoveAttribute. It panics for attribute values
+// of 1<<32 or above.
+func (v Value) RemoveAttribute(a Attribute) {
+	if a >= 1<<32 {
+		panic("attribute value currently unsupported")
+	}
+	attr := C.LLVMAttribute(a)
+	C.LLVMRemoveAttribute(v.C, attr)
+}
+
+// Attribute converts the result of C.LLVMGetAttribute to Attribute.
+func (v Value) Attribute() Attribute { return Attribute(C.LLVMGetAttribute(v.C)) }
+
+// SetParamAlignment wraps C.LLVMSetParamAlignment; align is converted to C.unsigned.
+func (v Value) SetParamAlignment(align int) { C.LLVMSetParamAlignment(v.C, C.unsigned(align)) }
+
+// Operations on basic blocks
+// AsValue wraps C.LLVMBasicBlockAsValue.
+func (bb BasicBlock) AsValue() (v Value) { return Value{C: C.LLVMBasicBlockAsValue(bb.C)} }
+
+// IsBasicBlock reports whether C.LLVMValueIsBasicBlock returns non-zero for v.
+func (v Value) IsBasicBlock() bool { return C.LLVMValueIsBasicBlock(v.C) != 0 }
+
+// AsBasicBlock wraps C.LLVMValueAsBasicBlock.
+func (v Value) AsBasicBlock() (bb BasicBlock) { return BasicBlock{C: C.LLVMValueAsBasicBlock(v.C)} }
+
+// Parent wraps C.LLVMGetBasicBlockParent.
+func (bb BasicBlock) Parent() (v Value) { return Value{C: C.LLVMGetBasicBlockParent(bb.C)} }
+
+// BasicBlocksCount converts the result of C.LLVMCountBasicBlocks to int.
+func (v Value) BasicBlocksCount() int { return int(C.LLVMCountBasicBlocks(v.C)) }
+// BasicBlocks returns a slice of BasicBlocksCount() blocks filled in by
+// C.LLVMGetBasicBlocks. When the count is zero the C call is skipped and an
+// empty slice is returned: taking &out[0] of an empty slice would panic with
+// an index-out-of-range error. Params guards the same way.
+func (v Value) BasicBlocks() []BasicBlock {
+	out := make([]BasicBlock, v.BasicBlocksCount())
+	if len(out) > 0 {
+		C.LLVMGetBasicBlocks(v.C, llvmBasicBlockRefPtr(&out[0]))
+	}
+	return out
+}
+// FirstBasicBlock wraps C.LLVMGetFirstBasicBlock.
+func (v Value) FirstBasicBlock() (bb BasicBlock) {
+	return BasicBlock{C: C.LLVMGetFirstBasicBlock(v.C)}
+}
+
+// LastBasicBlock wraps C.LLVMGetLastBasicBlock.
+func (v Value) LastBasicBlock() (bb BasicBlock) {
+	return BasicBlock{C: C.LLVMGetLastBasicBlock(v.C)}
+}
+
+// NextBasicBlock wraps C.LLVMGetNextBasicBlock.
+func NextBasicBlock(bb BasicBlock) (rbb BasicBlock) {
+	return BasicBlock{C: C.LLVMGetNextBasicBlock(bb.C)}
+}
+
+// PrevBasicBlock wraps C.LLVMGetPreviousBasicBlock.
+func PrevBasicBlock(bb BasicBlock) (rbb BasicBlock) {
+	return BasicBlock{C: C.LLVMGetPreviousBasicBlock(bb.C)}
+}
+
+// EntryBasicBlock wraps C.LLVMGetEntryBasicBlock.
+func (v Value) EntryBasicBlock() (bb BasicBlock) {
+	return BasicBlock{C: C.LLVMGetEntryBasicBlock(v.C)}
+}
+
+// AddBasicBlock wraps C.LLVMAppendBasicBlockInContext for c. name is copied
+// to a temporary C string that is freed on return.
+func (c Context) AddBasicBlock(f Value, name string) (bb BasicBlock) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return BasicBlock{C: C.LLVMAppendBasicBlockInContext(c.C, f.C, label)}
+}
+
+// InsertBasicBlock wraps C.LLVMInsertBasicBlockInContext for c. name is copied
+// to a temporary C string that is freed on return.
+func (c Context) InsertBasicBlock(ref BasicBlock, name string) (bb BasicBlock) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return BasicBlock{C: C.LLVMInsertBasicBlockInContext(c.C, ref.C, label)}
+}
+
+// AddBasicBlock wraps C.LLVMAppendBasicBlock. name is copied to a temporary C
+// string that is freed on return.
+func AddBasicBlock(f Value, name string) (bb BasicBlock) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return BasicBlock{C: C.LLVMAppendBasicBlock(f.C, label)}
+}
+
+// InsertBasicBlock wraps C.LLVMInsertBasicBlock. name is copied to a temporary
+// C string that is freed on return.
+func InsertBasicBlock(ref BasicBlock, name string) (bb BasicBlock) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return BasicBlock{C: C.LLVMInsertBasicBlock(ref.C, label)}
+}
+
+// EraseFromParent wraps C.LLVMDeleteBasicBlock.
+func (bb BasicBlock) EraseFromParent() { C.LLVMDeleteBasicBlock(bb.C) }
+
+// MoveBefore wraps C.LLVMMoveBasicBlockBefore.
+func (bb BasicBlock) MoveBefore(pos BasicBlock) { C.LLVMMoveBasicBlockBefore(bb.C, pos.C) }
+
+// MoveAfter wraps C.LLVMMoveBasicBlockAfter.
+func (bb BasicBlock) MoveAfter(pos BasicBlock) { C.LLVMMoveBasicBlockAfter(bb.C, pos.C) }
+
+// Operations on instructions
+// InstructionParent wraps C.LLVMGetInstructionParent.
+func (v Value) InstructionParent() (bb BasicBlock) {
+	return BasicBlock{C: C.LLVMGetInstructionParent(v.C)}
+}
+
+// FirstInstruction wraps C.LLVMGetFirstInstruction.
+func (bb BasicBlock) FirstInstruction() (v Value) {
+	return Value{C: C.LLVMGetFirstInstruction(bb.C)}
+}
+
+// LastInstruction wraps C.LLVMGetLastInstruction.
+func (bb BasicBlock) LastInstruction() (v Value) {
+	return Value{C: C.LLVMGetLastInstruction(bb.C)}
+}
+
+// NextInstruction wraps C.LLVMGetNextInstruction.
+func NextInstruction(v Value) (rv Value) {
+	return Value{C: C.LLVMGetNextInstruction(v.C)}
+}
+
+// PrevInstruction wraps C.LLVMGetPreviousInstruction.
+func PrevInstruction(v Value) (rv Value) {
+	return Value{C: C.LLVMGetPreviousInstruction(v.C)}
+}
+
+// Operations on call sites
+// SetInstructionCallConv wraps C.LLVMSetInstructionCallConv; cc is converted
+// to C.unsigned.
+func (v Value) SetInstructionCallConv(cc CallConv) {
+	conv := C.unsigned(cc)
+	C.LLVMSetInstructionCallConv(v.C, conv)
+}
+
+// InstructionCallConv converts the result of C.LLVMGetInstructionCallConv to
+// CallConv by way of C.LLVMCallConv.
+func (v Value) InstructionCallConv() CallConv {
+	raw := C.LLVMGetInstructionCallConv(v.C)
+	return CallConv(C.LLVMCallConv(raw))
+}
+
+// AddInstrAttribute wraps C.LLVMAddInstrAttribute; i is converted to
+// C.unsigned. It panics for attribute values of 1<<32 or above.
+func (v Value) AddInstrAttribute(i int, a Attribute) {
+	if a >= 1<<32 {
+		panic("attribute value currently unsupported")
+	}
+	attr := C.LLVMAttribute(a)
+	C.LLVMAddInstrAttribute(v.C, C.unsigned(i), attr)
+}
+
+// RemoveInstrAttribute wraps C.LLVMRemoveInstrAttribute; i is converted to
+// C.unsigned. It panics for attribute values of 1<<32 or above.
+func (v Value) RemoveInstrAttribute(i int, a Attribute) {
+	if a >= 1<<32 {
+		panic("attribute value currently unsupported")
+	}
+	attr := C.LLVMAttribute(a)
+	C.LLVMRemoveInstrAttribute(v.C, C.unsigned(i), attr)
+}
+
+// SetInstrParamAlignment wraps C.LLVMSetInstrParamAlignment; both integers are
+// converted to C.unsigned.
+func (v Value) SetInstrParamAlignment(i int, align int) {
+	index, alignment := C.unsigned(i), C.unsigned(align)
+	C.LLVMSetInstrParamAlignment(v.C, index, alignment)
+}
+
+// Operations on call instructions (only)
+
+// IsTailCall reports whether C.LLVMIsTailCall returns non-zero for v.
+func (v Value) IsTailCall() bool { return C.LLVMIsTailCall(v.C) != 0 }
+
+// SetTailCall wraps C.LLVMSetTailCall; is is converted with boolToLLVMBool.
+func (v Value) SetTailCall(is bool) { C.LLVMSetTailCall(v.C, boolToLLVMBool(is)) }
+
+// Operations on phi nodes
+func (v Value) AddIncoming(vals []Value, blocks []BasicBlock) {
+	ptr, nvals := llvmValueRefs(vals)
+	C.LLVMAddIncoming(v.C, ptr, llvmBasicBlockRefPtr(&blocks[0]), nvals)
+}
+// IncomingCount converts the result of C.LLVMCountIncoming to int.
+func (v Value) IncomingCount() int {
+	count := C.LLVMCountIncoming(v.C)
+	return int(count)
+}
+
+// IncomingValue wraps C.LLVMGetIncomingValue; i is converted to C.unsigned.
+func (v Value) IncomingValue(i int) (rv Value) {
+	return Value{C: C.LLVMGetIncomingValue(v.C, C.unsigned(i))}
+}
+
+// IncomingBlock wraps C.LLVMGetIncomingBlock; i is converted to C.unsigned.
+func (v Value) IncomingBlock(i int) (bb BasicBlock) {
+	return BasicBlock{C: C.LLVMGetIncomingBlock(v.C, C.unsigned(i))}
+}
+
+//-------------------------------------------------------------------------
+// llvm.Builder
+//-------------------------------------------------------------------------
+
+// An instruction builder represents a point within a basic block, and is the
+// exclusive means of building instructions using the C interface.
+
+// NewBuilder wraps C.LLVMCreateBuilderInContext for c.
+func (c Context) NewBuilder() (b Builder) { return Builder{C: C.LLVMCreateBuilderInContext(c.C)} }
+
+// NewBuilder wraps C.LLVMCreateBuilder.
+func NewBuilder() (b Builder) { return Builder{C: C.LLVMCreateBuilder()} }
+
+// SetInsertPoint wraps C.LLVMPositionBuilder.
+func (b Builder) SetInsertPoint(block BasicBlock, instr Value) {
+	C.LLVMPositionBuilder(b.C, block.C, instr.C)
+}
+
+// SetInsertPointBefore wraps C.LLVMPositionBuilderBefore.
+func (b Builder) SetInsertPointBefore(instr Value) { C.LLVMPositionBuilderBefore(b.C, instr.C) }
+
+// SetInsertPointAtEnd wraps C.LLVMPositionBuilderAtEnd.
+func (b Builder) SetInsertPointAtEnd(block BasicBlock) { C.LLVMPositionBuilderAtEnd(b.C, block.C) }
+
+// GetInsertBlock wraps C.LLVMGetInsertBlock.
+func (b Builder) GetInsertBlock() (bb BasicBlock) { return BasicBlock{C: C.LLVMGetInsertBlock(b.C)} }
+
+// ClearInsertionPoint wraps C.LLVMClearInsertionPosition.
+func (b Builder) ClearInsertionPoint() { C.LLVMClearInsertionPosition(b.C) }
+
+// Insert wraps C.LLVMInsertIntoBuilder.
+func (b Builder) Insert(instr Value) { C.LLVMInsertIntoBuilder(b.C, instr.C) }
+
+// InsertWithName wraps C.LLVMInsertIntoBuilderWithName. name is copied to a
+// temporary C string that is freed on return.
+func (b Builder) InsertWithName(instr Value, name string) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	C.LLVMInsertIntoBuilderWithName(b.C, instr.C, label)
+}
+
+// Dispose wraps C.LLVMDisposeBuilder.
+func (b Builder) Dispose() { C.LLVMDisposeBuilder(b.C) }
+
+// Metadata
+// SetCurrentDebugLocation wraps C.LLVMSetCurrentDebugLocation.
+func (b Builder) SetCurrentDebugLocation(v Value) { C.LLVMSetCurrentDebugLocation(b.C, v.C) }
+
+// CurrentDebugLocation wraps C.LLVMGetCurrentDebugLocation.
+func (b Builder) CurrentDebugLocation() (v Value) {
+	return Value{C: C.LLVMGetCurrentDebugLocation(b.C)}
+}
+
+// SetInstDebugLocation wraps C.LLVMSetInstDebugLocation.
+func (b Builder) SetInstDebugLocation(v Value) { C.LLVMSetInstDebugLocation(b.C, v.C) }
+
+// InsertDeclare builds a call to the function named "llvm.dbg.declare" with
+// storage and md as arguments. If the module has no function of that name, one
+// is first added with a void return type and the types of storage and md as
+// its parameters.
+func (b Builder) InsertDeclare(module Module, storage Value, md Value) Value {
+	declare := module.NamedFunction("llvm.dbg.declare")
+	if declare.IsNil() {
+		declare = AddFunction(
+			module,
+			"llvm.dbg.declare",
+			FunctionType(VoidType(), []Type{storage.Type(), md.Type()}, false),
+		)
+	}
+	callArgs := []Value{storage, md}
+	return b.CreateCall(declare, callArgs, "")
+}
+
+// Terminators
+// CreateRetVoid wraps C.LLVMBuildRetVoid.
+func (b Builder) CreateRetVoid() (rv Value) { return Value{C: C.LLVMBuildRetVoid(b.C)} }
+
+// CreateRet wraps C.LLVMBuildRet.
+func (b Builder) CreateRet(v Value) (rv Value) { return Value{C: C.LLVMBuildRet(b.C, v.C)} }
+
+// CreateAggregateRet wraps C.LLVMBuildAggregateRet, using the pointer/count
+// pair produced by llvmValueRefs(vs).
+func (b Builder) CreateAggregateRet(vs []Value) (rv Value) {
+	elems, count := llvmValueRefs(vs)
+	return Value{C: C.LLVMBuildAggregateRet(b.C, elems, count)}
+}
+
+// CreateBr wraps C.LLVMBuildBr.
+func (b Builder) CreateBr(bb BasicBlock) (rv Value) { return Value{C: C.LLVMBuildBr(b.C, bb.C)} }
+
+// CreateCondBr wraps C.LLVMBuildCondBr.
+func (b Builder) CreateCondBr(ifv Value, thenb, elseb BasicBlock) (rv Value) {
+	return Value{C: C.LLVMBuildCondBr(b.C, ifv.C, thenb.C, elseb.C)}
+}
+
+// CreateSwitch wraps C.LLVMBuildSwitch; numCases is converted to C.unsigned.
+func (b Builder) CreateSwitch(v Value, elseb BasicBlock, numCases int) (rv Value) {
+	return Value{C: C.LLVMBuildSwitch(b.C, v.C, elseb.C, C.unsigned(numCases))}
+}
+
+// CreateIndirectBr wraps C.LLVMBuildIndirectBr; numDests is converted to C.unsigned.
+func (b Builder) CreateIndirectBr(addr Value, numDests int) (rv Value) {
+	return Value{C: C.LLVMBuildIndirectBr(b.C, addr.C, C.unsigned(numDests))}
+}
+
+// CreateInvoke wraps C.LLVMBuildInvoke. name is copied to a temporary C string
+// that is freed on return; args go through llvmValueRefs.
+func (b Builder) CreateInvoke(fn Value, args []Value, then, catch BasicBlock, name string) (rv Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	argv, argc := llvmValueRefs(args)
+	return Value{C: C.LLVMBuildInvoke(b.C, fn.C, argv, argc, then.C, catch.C, label)}
+}
+
+// CreateUnreachable wraps C.LLVMBuildUnreachable.
+func (b Builder) CreateUnreachable() (rv Value) { return Value{C: C.LLVMBuildUnreachable(b.C)} }
+
+// Add a case to the switch instruction
+func (v Value) AddCase(on Value, dest BasicBlock) {
+	C.LLVMAddCase(v.C, on.C, dest.C)
+}
+
+// Add a destination to the indirectbr instruction
+func (v Value) AddDest(dest BasicBlock) {
+	C.LLVMAddDestination(v.C, dest.C)
+}
+
+// Arithmetic
+// CreateAdd wraps C.LLVMBuildAdd. name is copied to a temporary C string that
+// is freed on return.
+func (b Builder) CreateAdd(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildAdd(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateNSWAdd wraps C.LLVMBuildNSWAdd; name is handled as in CreateAdd.
+func (b Builder) CreateNSWAdd(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildNSWAdd(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateNUWAdd wraps C.LLVMBuildNUWAdd; name is handled as in CreateAdd.
+func (b Builder) CreateNUWAdd(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildNUWAdd(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateFAdd wraps C.LLVMBuildFAdd; name is handled as in CreateAdd.
+func (b Builder) CreateFAdd(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildFAdd(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateSub wraps C.LLVMBuildSub; name is handled as in CreateAdd.
+func (b Builder) CreateSub(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildSub(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateNSWSub wraps C.LLVMBuildNSWSub; name is handled as in CreateAdd.
+func (b Builder) CreateNSWSub(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildNSWSub(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateNUWSub wraps C.LLVMBuildNUWSub; name is handled as in CreateAdd.
+func (b Builder) CreateNUWSub(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildNUWSub(b.C, lhs.C, rhs.C, label)}
+}
+// CreateFSub wraps C.LLVMBuildFSub. name is copied to a temporary C string
+// that is released with a deferred C.free, matching every other Create*
+// wrapper in this file.
+func (b Builder) CreateFSub(lhs, rhs Value, name string) (v Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	v.C = C.LLVMBuildFSub(b.C, lhs.C, rhs.C, cname)
+	return
+}
+// CreateMul wraps C.LLVMBuildMul. name is copied to a temporary C string that
+// is freed on return.
+func (b Builder) CreateMul(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildMul(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateNSWMul wraps C.LLVMBuildNSWMul; name is handled as in CreateMul.
+func (b Builder) CreateNSWMul(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildNSWMul(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateNUWMul wraps C.LLVMBuildNUWMul; name is handled as in CreateMul.
+func (b Builder) CreateNUWMul(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildNUWMul(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateFMul wraps C.LLVMBuildFMul; name is handled as in CreateMul.
+func (b Builder) CreateFMul(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildFMul(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateUDiv wraps C.LLVMBuildUDiv; name is handled as in CreateMul.
+func (b Builder) CreateUDiv(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildUDiv(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateSDiv wraps C.LLVMBuildSDiv; name is handled as in CreateMul.
+func (b Builder) CreateSDiv(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildSDiv(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateExactSDiv wraps C.LLVMBuildExactSDiv; name is handled as in CreateMul.
+func (b Builder) CreateExactSDiv(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildExactSDiv(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateFDiv wraps C.LLVMBuildFDiv; name is handled as in CreateMul.
+func (b Builder) CreateFDiv(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildFDiv(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateURem wraps C.LLVMBuildURem; name is handled as in CreateMul.
+func (b Builder) CreateURem(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildURem(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateSRem wraps C.LLVMBuildSRem; name is handled as in CreateMul.
+func (b Builder) CreateSRem(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildSRem(b.C, lhs.C, rhs.C, label)}
+}
+
+// CreateFRem wraps C.LLVMBuildFRem; name is handled as in CreateMul.
+func (b Builder) CreateFRem(lhs, rhs Value, name string) (v Value) {
+	label := C.CString(name)
+	defer C.free(unsafe.Pointer(label))
+	return Value{C: C.LLVMBuildFRem(b.C, lhs.C, rhs.C, label)}
+}
+func (b Builder) CreateShl(lhs, rhs Value, name string) (v Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	v.C = C.LLVMBuildShl(b.C, lhs.C, rhs.C, cname)
+	return
+}
+func (b Builder) CreateLShr(lhs, rhs Value, name string) (v Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	v.C = C.LLVMBuildLShr(b.C, lhs.C, rhs.C, cname)
+	return
+}
+func (b Builder) CreateAShr(lhs, rhs Value, name string) (v Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	v.C = C.LLVMBuildAShr(b.C, lhs.C, rhs.C, cname)
+	return
+}
+func (b Builder) CreateAnd(lhs, rhs Value, name string) (v Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	v.C = C.LLVMBuildAnd(b.C, lhs.C, rhs.C, cname)
+	return
+}
+func (b Builder) CreateOr(lhs, rhs Value, name string) (v Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	v.C = C.LLVMBuildOr(b.C, lhs.C, rhs.C, cname)
+	return
+}
+func (b Builder) CreateXor(lhs, rhs Value, name string) (v Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	v.C = C.LLVMBuildXor(b.C, lhs.C, rhs.C, cname)
+	return
+}
+func (b Builder) CreateBinOp(op Opcode, lhs, rhs Value, name string) (v Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	v.C = C.LLVMBuildBinOp(b.C, C.LLVMOpcode(op), lhs.C, rhs.C, cname)
+	return
+}
+func (b Builder) CreateNeg(v Value, name string) (rv Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	rv.C = C.LLVMBuildNeg(b.C, v.C, cname)
+	return
+}
+func (b Builder) CreateNSWNeg(v Value, name string) (rv Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	rv.C = C.LLVMBuildNSWNeg(b.C, v.C, cname)
+	return
+}
+func (b Builder) CreateNUWNeg(v Value, name string) (rv Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	rv.C = C.LLVMBuildNUWNeg(b.C, v.C, cname)
+	return
+}
+func (b Builder) CreateFNeg(v Value, name string) (rv Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	rv.C = C.LLVMBuildFNeg(b.C, v.C, cname)
+	return
+}
+func (b Builder) CreateNot(v Value, name string) (rv Value) {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+	rv.C = C.LLVMBuildNot(b.C, v.C, cname)
+	return
+}
+
+// Memory
+
+// CreateMalloc wraps LLVMBuildMalloc for type t; name is handed to C as a
+// temporary string that is released when the method returns.
+func (b Builder) CreateMalloc(t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildMalloc(b.C, t.C, nameC)}
+}
+
+// CreateArrayMalloc wraps LLVMBuildArrayMalloc, forwarding t and val.
+func (b Builder) CreateArrayMalloc(t Type, val Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildArrayMalloc(b.C, t.C, val.C, nameC)}
+}
+
+// CreateAlloca wraps LLVMBuildAlloca for type t.
+func (b Builder) CreateAlloca(t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildAlloca(b.C, t.C, nameC)}
+}
+
+// CreateArrayAlloca wraps LLVMBuildArrayAlloca, forwarding t and val.
+func (b Builder) CreateArrayAlloca(t Type, val Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildArrayAlloca(b.C, t.C, val.C, nameC)}
+}
+
+// CreateFree wraps LLVMBuildFree for the pointer value p.
+func (b Builder) CreateFree(p Value) (v Value) {
+	return Value{C: C.LLVMBuildFree(b.C, p.C)}
+}
+
+// CreateLoad wraps LLVMBuildLoad for the pointer value p.
+func (b Builder) CreateLoad(p Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildLoad(b.C, p.C, nameC)}
+}
+
+// CreateStore wraps LLVMBuildStore, passing val first and the pointer p second.
+func (b Builder) CreateStore(val Value, p Value) (v Value) {
+	return Value{C: C.LLVMBuildStore(b.C, val.C, p.C)}
+}
+
+// CreateGEP wraps LLVMBuildGEP; indices is converted with llvmValueRefs into
+// the pointer/count pair the C call expects.
+func (b Builder) CreateGEP(p Value, indices []Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	idxPtr, idxCount := llvmValueRefs(indices)
+	return Value{C: C.LLVMBuildGEP(b.C, p.C, idxPtr, idxCount, nameC)}
+}
+
+// CreateInBoundsGEP wraps LLVMBuildInBoundsGEP; indices is converted with
+// llvmValueRefs into the pointer/count pair the C call expects.
+func (b Builder) CreateInBoundsGEP(p Value, indices []Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	idxPtr, idxCount := llvmValueRefs(indices)
+	return Value{C: C.LLVMBuildInBoundsGEP(b.C, p.C, idxPtr, idxCount, nameC)}
+}
+
+// CreateStructGEP wraps LLVMBuildStructGEP; i is converted to C.unsigned.
+func (b Builder) CreateStructGEP(p Value, i int, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildStructGEP(b.C, p.C, C.unsigned(i), nameC)}
+}
+
+// CreateGlobalString wraps LLVMBuildGlobalString; both str and name are
+// passed as temporary C strings released on return.
+func (b Builder) CreateGlobalString(str, name string) (v Value) {
+	strC := C.CString(str)
+	defer C.free(unsafe.Pointer(strC))
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildGlobalString(b.C, strC, nameC)}
+}
+
+// CreateGlobalStringPtr wraps LLVMBuildGlobalStringPtr; both str and name are
+// passed as temporary C strings released on return.
+func (b Builder) CreateGlobalStringPtr(str, name string) (v Value) {
+	strC := C.CString(str)
+	defer C.free(unsafe.Pointer(strC))
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildGlobalStringPtr(b.C, strC, nameC)}
+}
+
+// Casts
+// CreateTrunc wraps LLVMBuildTrunc, forwarding val and the destination type
+// t; name is handed to C as a temporary string released on return.
+func (b Builder) CreateTrunc(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildTrunc(b.C, val.C, t.C, nameC)}
+}
+
+// CreateZExt wraps LLVMBuildZExt.
+func (b Builder) CreateZExt(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildZExt(b.C, val.C, t.C, nameC)}
+}
+
+// CreateSExt wraps LLVMBuildSExt.
+func (b Builder) CreateSExt(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildSExt(b.C, val.C, t.C, nameC)}
+}
+
+// CreateFPToUI wraps LLVMBuildFPToUI.
+func (b Builder) CreateFPToUI(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildFPToUI(b.C, val.C, t.C, nameC)}
+}
+
+// CreateFPToSI wraps LLVMBuildFPToSI.
+func (b Builder) CreateFPToSI(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildFPToSI(b.C, val.C, t.C, nameC)}
+}
+
+// CreateUIToFP wraps LLVMBuildUIToFP.
+func (b Builder) CreateUIToFP(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildUIToFP(b.C, val.C, t.C, nameC)}
+}
+
+// CreateSIToFP wraps LLVMBuildSIToFP.
+func (b Builder) CreateSIToFP(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildSIToFP(b.C, val.C, t.C, nameC)}
+}
+
+// CreateFPTrunc wraps LLVMBuildFPTrunc.
+func (b Builder) CreateFPTrunc(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildFPTrunc(b.C, val.C, t.C, nameC)}
+}
+
+// CreateFPExt wraps LLVMBuildFPExt.
+func (b Builder) CreateFPExt(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildFPExt(b.C, val.C, t.C, nameC)}
+}
+
+// CreatePtrToInt wraps LLVMBuildPtrToInt.
+func (b Builder) CreatePtrToInt(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildPtrToInt(b.C, val.C, t.C, nameC)}
+}
+
+// CreateIntToPtr wraps LLVMBuildIntToPtr.
+func (b Builder) CreateIntToPtr(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildIntToPtr(b.C, val.C, t.C, nameC)}
+}
+
+// CreateBitCast wraps LLVMBuildBitCast.
+func (b Builder) CreateBitCast(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildBitCast(b.C, val.C, t.C, nameC)}
+}
+
+// CreateZExtOrBitCast wraps LLVMBuildZExtOrBitCast.
+func (b Builder) CreateZExtOrBitCast(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildZExtOrBitCast(b.C, val.C, t.C, nameC)}
+}
+
+// CreateSExtOrBitCast wraps LLVMBuildSExtOrBitCast.
+func (b Builder) CreateSExtOrBitCast(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildSExtOrBitCast(b.C, val.C, t.C, nameC)}
+}
+
+// CreateTruncOrBitCast wraps LLVMBuildTruncOrBitCast.
+func (b Builder) CreateTruncOrBitCast(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildTruncOrBitCast(b.C, val.C, t.C, nameC)}
+}
+
+// CreateCast wraps LLVMBuildCast; op is converted to C.LLVMOpcode and passed
+// ahead of val and t.
+func (b Builder) CreateCast(val Value, op Opcode, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildCast(b.C, C.LLVMOpcode(op), val.C, t.C, nameC)}
+}
+
+// CreatePointerCast wraps LLVMBuildPointerCast.
+func (b Builder) CreatePointerCast(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildPointerCast(b.C, val.C, t.C, nameC)}
+}
+
+// CreateIntCast wraps LLVMBuildIntCast.
+func (b Builder) CreateIntCast(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildIntCast(b.C, val.C, t.C, nameC)}
+}
+
+// CreateFPCast wraps LLVMBuildFPCast.
+func (b Builder) CreateFPCast(val Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildFPCast(b.C, val.C, t.C, nameC)}
+}
+
+// Comparisons
+// CreateICmp wraps LLVMBuildICmp; pred is converted to C.LLVMIntPredicate and
+// name is handed to C as a temporary string released on return.
+func (b Builder) CreateICmp(pred IntPredicate, lhs, rhs Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildICmp(b.C, C.LLVMIntPredicate(pred), lhs.C, rhs.C, nameC)}
+}
+
+// CreateFCmp wraps LLVMBuildFCmp; pred is converted to C.LLVMRealPredicate and
+// name is handed to C as a temporary string released on return.
+func (b Builder) CreateFCmp(pred FloatPredicate, lhs, rhs Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildFCmp(b.C, C.LLVMRealPredicate(pred), lhs.C, rhs.C, nameC)}
+}
+
+// Miscellaneous instructions
+// CreatePHI wraps LLVMBuildPhi for type t; name is handed to C as a temporary
+// string that is released when the method returns.
+func (b Builder) CreatePHI(t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildPhi(b.C, t.C, nameC)}
+}
+
+// CreateCall wraps LLVMBuildCall; args is converted with llvmValueRefs into
+// the pointer/count pair the C call expects.
+func (b Builder) CreateCall(fn Value, args []Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	argPtr, argCount := llvmValueRefs(args)
+	return Value{C: C.LLVMBuildCall(b.C, fn.C, argPtr, argCount, nameC)}
+}
+
+// CreateSelect wraps LLVMBuildSelect, forwarding ifv, thenv and elsev in
+// that order.
+func (b Builder) CreateSelect(ifv, thenv, elsev Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildSelect(b.C, ifv.C, thenv.C, elsev.C, nameC)}
+}
+
+// CreateVAArg wraps LLVMBuildVAArg, forwarding list and t.
+func (b Builder) CreateVAArg(list Value, t Type, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildVAArg(b.C, list.C, t.C, nameC)}
+}
+
+// CreateExtractElement wraps LLVMBuildExtractElement, forwarding vec and i.
+func (b Builder) CreateExtractElement(vec, i Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildExtractElement(b.C, vec.C, i.C, nameC)}
+}
+
+// CreateInsertElement wraps LLVMBuildInsertElement, forwarding vec, elt and i.
+func (b Builder) CreateInsertElement(vec, elt, i Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildInsertElement(b.C, vec.C, elt.C, i.C, nameC)}
+}
+
+// CreateShuffleVector wraps LLVMBuildShuffleVector, forwarding v1, v2 and mask.
+func (b Builder) CreateShuffleVector(v1, v2, mask Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildShuffleVector(b.C, v1.C, v2.C, mask.C, nameC)}
+}
+
+// CreateExtractValue wraps LLVMBuildExtractValue; i is converted to C.unsigned.
+func (b Builder) CreateExtractValue(agg Value, i int, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildExtractValue(b.C, agg.C, C.unsigned(i), nameC)}
+}
+
+// CreateInsertValue wraps LLVMBuildInsertValue; i is converted to C.unsigned.
+func (b Builder) CreateInsertValue(agg, elt Value, i int, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildInsertValue(b.C, agg.C, elt.C, C.unsigned(i), nameC)}
+}
+
+// CreateIsNull wraps LLVMBuildIsNull for val.
+func (b Builder) CreateIsNull(val Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildIsNull(b.C, val.C, nameC)}
+}
+
+// CreateIsNotNull wraps LLVMBuildIsNotNull for val.
+func (b Builder) CreateIsNotNull(val Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildIsNotNull(b.C, val.C, nameC)}
+}
+
+// CreatePtrDiff wraps LLVMBuildPtrDiff, forwarding lhs and rhs.
+func (b Builder) CreatePtrDiff(lhs, rhs Value, name string) (v Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildPtrDiff(b.C, lhs.C, rhs.C, nameC)}
+}
+
+// CreateLandingPad wraps LLVMBuildLandingPad, forwarding t and personality;
+// nclauses is converted to C.unsigned and name is handed to C as a temporary
+// string released on return.
+func (b Builder) CreateLandingPad(t Type, personality Value, nclauses int, name string) (l Value) {
+	nameC := C.CString(name)
+	defer C.free(unsafe.Pointer(nameC))
+	return Value{C: C.LLVMBuildLandingPad(b.C, t.C, personality.C, C.unsigned(nclauses), nameC)}
+}
+
+// AddClause calls LLVMAddClause with the receiver and the clause value v.
+func (l Value) AddClause(v Value) { C.LLVMAddClause(l.C, v.C) }
+
+// SetCleanup calls LLVMSetCleanup on the receiver with cleanup converted via
+// boolToLLVMBool.
+func (l Value) SetCleanup(cleanup bool) {
+	flag := boolToLLVMBool(cleanup)
+	C.LLVMSetCleanup(l.C, flag)
+}
+
+// CreateResume wraps LLVMBuildResume for the value ex.
+func (b Builder) CreateResume(ex Value) (v Value) {
+	return Value{C: C.LLVMBuildResume(b.C, ex.C)}
+}
+
+//-------------------------------------------------------------------------
+// llvm.ModuleProvider
+//-------------------------------------------------------------------------
+
+// NewModuleProviderForModule changes the type of m so it can be passed to
+// FunctionPassManagers and the JIT, which take ModuleProviders for historical
+// reasons. It wraps LLVMCreateModuleProviderForExistingModule.
+func NewModuleProviderForModule(m Module) (mp ModuleProvider) {
+	return ModuleProvider{C: C.LLVMCreateModuleProviderForExistingModule(m.C)}
+}
+
+// Dispose destroys the module M by calling LLVMDisposeModuleProvider on the
+// wrapped handle.
+func (mp ModuleProvider) Dispose() {
+	C.LLVMDisposeModuleProvider(mp.C)
+}
+
+//-------------------------------------------------------------------------
+// llvm.MemoryBuffer
+//-------------------------------------------------------------------------
+
+// NewMemoryBufferFromFile creates a MemoryBuffer through
+// LLVMCreateMemoryBufferWithContentsOfFile for the given path. When the C
+// call reports failure, the returned buffer handle is nil and err carries the
+// message produced by LLVM (the C-side message is disposed before returning).
+func NewMemoryBufferFromFile(path string) (b MemoryBuffer, err error) {
+	pathC := C.CString(path)
+	defer C.free(unsafe.Pointer(pathC))
+
+	var msg *C.char
+	if C.LLVMCreateMemoryBufferWithContentsOfFile(pathC, &b.C, &msg) == 0 {
+		return
+	}
+
+	// Failure: clear the handle and convert the C message into a Go error.
+	b.C = nil
+	err = errors.New(C.GoString(msg))
+	C.LLVMDisposeMessage(msg)
+	return
+}
+
+// NewMemoryBufferFromStdin creates a MemoryBuffer through
+// LLVMCreateMemoryBufferWithSTDIN. When the C call reports failure, the
+// returned buffer handle is nil and err carries the message produced by LLVM
+// (the C-side message is disposed before returning).
+func NewMemoryBufferFromStdin() (b MemoryBuffer, err error) {
+	var msg *C.char
+	if C.LLVMCreateMemoryBufferWithSTDIN(&b.C, &msg) == 0 {
+		return
+	}
+
+	// Failure: clear the handle and convert the C message into a Go error.
+	b.C = nil
+	err = errors.New(C.GoString(msg))
+	C.LLVMDisposeMessage(msg)
+	return
+}
+
+// Bytes returns a Go byte slice built by C.GoBytes from the buffer's start
+// pointer and size, as reported by LLVMGetBufferStart and LLVMGetBufferSize.
+func (b MemoryBuffer) Bytes() []byte {
+	cstart := C.LLVMGetBufferStart(b.C)
+	csize := C.LLVMGetBufferSize(b.C)
+	// NOTE(review): the size is narrowed to C.int here; presumably
+	// LLVMGetBufferSize returns a wider type (size_t), in which case buffers
+	// larger than a C int can hold would be mishandled — confirm.
+	return C.GoBytes(unsafe.Pointer(cstart), C.int(csize))
+}
+
+// Dispose releases the buffer by calling LLVMDisposeMemoryBuffer on the
+// wrapped handle.
+func (b MemoryBuffer) Dispose() {
+	C.LLVMDisposeMemoryBuffer(b.C)
+}
+
+//-------------------------------------------------------------------------
+// llvm.PassManager
+//-------------------------------------------------------------------------
+
+// NewPassManager constructs a new whole-module pass pipeline. This type of
+// pipeline is suitable for link-time optimization and whole-module
+// transformations.
+// See llvm::PassManager::PassManager.
+func NewPassManager() (pm PassManager) {
+	return PassManager{C: C.LLVMCreatePassManager()}
+}
+
+// NewFunctionPassManagerForModule constructs a new function-by-function pass
+// pipeline over the given module. This type of pipeline is suitable for code
+// generation and JIT compilation tasks.
+// NOTE(review): the upstream wording says the pipeline does not take
+// ownership of the module provider; presumably the same holds for the module
+// passed here — confirm against the C API.
+// See llvm::FunctionPassManager::FunctionPassManager.
+func NewFunctionPassManagerForModule(m Module) (pm PassManager) {
+	return PassManager{C: C.LLVMCreateFunctionPassManagerForModule(m.C)}
+}
+
+// Run initializes, executes on the provided module, and finalizes all of the
+// passes scheduled in the pass manager. It returns true if the underlying C
+// call returns a non-zero value (any of the passes modified the module),
+// false otherwise. See llvm::PassManager::run(Module&).
+func (pm PassManager) Run(m Module) bool {
+	changed := C.LLVMRunPassManager(pm.C, m.C)
+	return changed != 0
+}
+
+// InitializeFunc initializes all of the function passes scheduled in the
+// function pass manager. It returns true if the underlying C call returns a
+// non-zero value (any of the passes modified the module), false otherwise.
+// See llvm::FunctionPassManager::doInitialization.
+func (pm PassManager) InitializeFunc() bool {
+	changed := C.LLVMInitializeFunctionPassManager(pm.C)
+	return changed != 0
+}
+
+// RunFunc executes all of the function passes scheduled in the function pass
+// manager on the provided function. It returns true if the underlying C call
+// returns a non-zero value (any of the passes modified the function), false
+// otherwise.
+// See llvm::FunctionPassManager::run(Function&).
+func (pm PassManager) RunFunc(f Value) bool {
+	changed := C.LLVMRunFunctionPassManager(pm.C, f.C)
+	return changed != 0
+}
+
+// FinalizeFunc finalizes all of the function passes scheduled in the function
+// pass manager. It returns true if the underlying C call returns a non-zero
+// value (any of the passes modified the module), false otherwise.
+// See llvm::FunctionPassManager::doFinalization.
+func (pm PassManager) FinalizeFunc() bool {
+	changed := C.LLVMFinalizeFunctionPassManager(pm.C)
+	return changed != 0
+}
+
+// Dispose frees the memory of a pass pipeline. For function pipelines, it
+// does not free the module provider.
+// See llvm::PassManagerBase::~PassManagerBase.
+func (pm PassManager) Dispose() {
+	C.LLVMDisposePassManager(pm.C)
+}

diff --git a/bindings/go/llvm/ir_test.go b/bindings/go/llvm/ir_test.go
new file mode 100644
index 0000000..981c94a
--- /dev/null
+++ b/bindings/go/llvm/ir_test.go

@@ -0,0 +1,95 @@
+//===- ir_test.go - Tests for ir ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file tests bindings for the ir component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+import (
+	"strings"
+	"testing"
+)
+
+// testAttribute adds attr to a fresh function "foo" in an empty module and
+// checks three things: the attribute mask read back equals attr, the module's
+// textual form contains name surrounded by spaces, and removing attr leaves a
+// zero mask.
+func testAttribute(t *testing.T, attr Attribute, name string) {
+	m := NewModule("")
+	defer m.Dispose()
+
+	fn := AddFunction(m, "foo", FunctionType(VoidType(), nil, false))
+
+	fn.AddFunctionAttr(attr)
+	if got := fn.FunctionAttr(); got != attr {
+		t.Errorf("got attribute mask %d, want %d", got, attr)
+	}
+
+	if ir := m.String(); !strings.Contains(ir, " "+name+" ") {
+		t.Errorf("expected attribute '%s', got:\n%s", name, ir)
+	}
+
+	fn.RemoveFunctionAttr(attr)
+	if got := fn.FunctionAttr(); got != 0 {
+		t.Errorf("got attribute mask %d, want 0", got)
+	}
+}
+
+func TestAttributes(t *testing.T) {
+	// Tests that our attribute constants haven't drifted from LLVM's.
+	attrTests := []struct {
+		attr Attribute
+		name string
+	}{
+		{SanitizeAddressAttribute, "sanitize_address"},
+		{AlwaysInlineAttribute, "alwaysinline"},
+		{BuiltinAttribute, "builtin"},
+		{ByValAttribute, "byval"},
+		{InAllocaAttribute, "inalloca"},
+		{InlineHintAttribute, "inlinehint"},
+		{InRegAttribute, "inreg"},
+		{JumpTableAttribute, "jumptable"},
+		{MinSizeAttribute, "minsize"},
+		{NakedAttribute, "naked"},
+		{NestAttribute, "nest"},
+		{NoAliasAttribute, "noalias"},
+		{NoBuiltinAttribute, "nobuiltin"},
+		{NoCaptureAttribute, "nocapture"},
+		{NoDuplicateAttribute, "noduplicate"},
+		{NoImplicitFloatAttribute, "noimplicitfloat"},
+		{NoInlineAttribute, "noinline"},
+		{NonLazyBindAttribute, "nonlazybind"},
+		{NonNullAttribute, "nonnull"},
+		{NoRedZoneAttribute, "noredzone"},
+		{NoReturnAttribute, "noreturn"},
+		{NoUnwindAttribute, "nounwind"},
+		{OptimizeNoneAttribute, "optnone"},
+		{OptimizeForSizeAttribute, "optsize"},
+		{ReadNoneAttribute, "readnone"},
+		{ReadOnlyAttribute, "readonly"},
+		{ReturnedAttribute, "returned"},
+		{ReturnsTwiceAttribute, "returns_twice"},
+		{SExtAttribute, "signext"},
+		{StackProtectAttribute, "ssp"},
+		{StackProtectReqAttribute, "sspreq"},
+		{StackProtectStrongAttribute, "sspstrong"},
+		{StructRetAttribute, "sret"},
+		{SanitizeThreadAttribute, "sanitize_thread"},
+		{SanitizeMemoryAttribute, "sanitize_memory"},
+		{UWTableAttribute, "uwtable"},
+		{ZExtAttribute, "zeroext"},
+		{ColdAttribute, "cold"},
+	}
+
+	for _, a := range attrTests {
+		testAttribute(t, a.attr, a.name)
+	}
+}

diff --git a/bindings/go/llvm/linker.go b/bindings/go/llvm/linker.go
new file mode 100644
index 0000000..31e9ad2
--- /dev/null
+++ b/bindings/go/llvm/linker.go

@@ -0,0 +1,39 @@
+//===- linker.go - Bindings for linker ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the linker component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/Linker.h"
+#include <stdlib.h>
+*/
+import "C"
+import "errors"
+
+// LinkerMode is the Go-side name for C.LLVMLinkerMode; LinkModules converts
+// it back to the C type before calling into LLVM.
+type LinkerMode C.LLVMLinkerMode
+
+// Linker mode values taken from the C API. Note that they are declared
+// without an explicit LinkerMode type.
+const (
+	LinkerDestroySource  = C.LLVMLinkerDestroySource
+	LinkerPreserveSource = C.LLVMLinkerPreserveSource
+)
+
+// LinkModules calls LLVMLinkModules with Dest, Src and Mode. It returns nil
+// when the C call reports success; otherwise it returns an error built from
+// the message LLVM produced, disposing the C-side message afterwards.
+func LinkModules(Dest, Src Module, Mode LinkerMode) error {
+	var msg *C.char
+	if C.LLVMLinkModules(Dest.C, Src.C, C.LLVMLinkerMode(Mode), &msg) == 0 {
+		return nil
+	}
+	// The deferred dispose runs after the Go error has copied the message.
+	defer C.LLVMDisposeMessage(msg)
+	return errors.New(C.GoString(msg))
+}

diff --git a/bindings/go/llvm/llvm_config.go.in b/bindings/go/llvm/llvm_config.go.in
new file mode 100644
index 0000000..41854fa
--- /dev/null
+++ b/bindings/go/llvm/llvm_config.go.in

@@ -0,0 +1,12 @@
+// +build !byollvm
+
+package llvm
+
+/*
+#cgo CXXFLAGS: -std=c++11
+#cgo CPPFLAGS: @LLVM_CFLAGS@
+#cgo LDFLAGS: @LLVM_LDFLAGS@
+*/
+import "C"
+
+// run_build_sh is a placeholder type declared only in this generated file.
+// NOTE(review): presumably it exists so that code referring to it fails to
+// compile, naming run_build_sh, when this file has not been generated —
+// confirm against the build script.
+type (run_build_sh int)

diff --git a/bindings/go/llvm/llvm_dep.go b/bindings/go/llvm/llvm_dep.go
new file mode 100644
index 0000000..39b4675
--- /dev/null
+++ b/bindings/go/llvm/llvm_dep.go

@@ -0,0 +1,19 @@
+//===- llvm_dep.go - creates LLVM dependency ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file ensures that the LLVM libraries are built before using the
+// bindings.
+//
+//===----------------------------------------------------------------------===//
+
+// +build !byollvm
+
+package llvm
+
+// The blank variable references run_build_sh, a type this file does not
+// declare, so the package only compiles when another file provides it.
+// NOTE(review): presumably that file is the generated llvm_config.go — confirm.
+var _ run_build_sh

diff --git a/bindings/go/llvm/string.go b/bindings/go/llvm/string.go
new file mode 100644
index 0000000..bfe869d
--- /dev/null
+++ b/bindings/go/llvm/string.go

@@ -0,0 +1,105 @@
+//===- string.go - Stringer implementation for Type -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Stringer interface for the Type type.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+import "fmt"
+
+// String returns the Go constant name for t (for example "IntegerTypeKind").
+//
+// Kinds that have no case below previously caused a panic; a Stringer is
+// routinely invoked from formatting and logging paths, so an unrecognised
+// kind now yields a descriptive fallback of the form "Unknown(N)TypeKind"
+// instead. The fallback deliberately keeps the "Kind" suffix because
+// typeString trims exactly that suffix from the result.
+func (t TypeKind) String() string {
+	switch t {
+	case VoidTypeKind:
+		return "VoidTypeKind"
+	case FloatTypeKind:
+		return "FloatTypeKind"
+	case DoubleTypeKind:
+		return "DoubleTypeKind"
+	case X86_FP80TypeKind:
+		return "X86_FP80TypeKind"
+	case FP128TypeKind:
+		return "FP128TypeKind"
+	case PPC_FP128TypeKind:
+		return "PPC_FP128TypeKind"
+	case LabelTypeKind:
+		return "LabelTypeKind"
+	case IntegerTypeKind:
+		return "IntegerTypeKind"
+	case FunctionTypeKind:
+		return "FunctionTypeKind"
+	case StructTypeKind:
+		return "StructTypeKind"
+	case ArrayTypeKind:
+		return "ArrayTypeKind"
+	case PointerTypeKind:
+		return "PointerTypeKind"
+	case VectorTypeKind:
+		return "VectorTypeKind"
+	case MetadataTypeKind:
+		return "MetadataTypeKind"
+	}
+	// int(t) avoids any chance of fmt re-entering this method.
+	return fmt.Sprintf("Unknown(%d)TypeKind", int(t))
+}
+
+// String renders t as text by running typeString with a fresh, empty
+// per-call cache.
+func (t Type) String() string {
+	stringer := &typeStringer{s: map[Type]string{}}
+	return stringer.typeString(t)
+}
+
+// typeStringer carries the state of one Type.String call.
+type typeStringer struct {
+	// s maps each type already visited to the text to use for it; typeString
+	// consults it before doing any work and fills it as it goes.
+	s map[Type]string
+}
+
+// typeString returns the textual form of t, recursing into element,
+// parameter, return and struct member types. Results are cached in ts.s, and
+// a cached entry is returned without recomputation.
+func (ts *typeStringer) typeString(t Type) string {
+	if s, ok := ts.s[t]; ok {
+		return s
+	}
+
+	// Start from the kind name with its trailing "Kind" removed,
+	// e.g. "StructTypeKind" becomes "StructType".
+	k := t.TypeKind()
+	s := k.String()
+	s = s[:len(s)-len("Kind")]
+
+	switch k {
+	case ArrayTypeKind:
+		s += fmt.Sprintf("(%v[%v])", ts.typeString(t.ElementType()), t.ArrayLength())
+	case PointerTypeKind:
+		s += fmt.Sprintf("(%v)", ts.typeString(t.ElementType()))
+	case FunctionTypeKind:
+		// Rendered as "(param, param, ...):return".
+		params := t.ParamTypes()
+		s += "("
+		if len(params) > 0 {
+			s += fmt.Sprintf("%v", ts.typeString(params[0]))
+			for i := 1; i < len(params); i++ {
+				s += fmt.Sprintf(", %v", ts.typeString(params[i]))
+			}
+		}
+		s += fmt.Sprintf("):%v", ts.typeString(t.ReturnType()))
+	case StructTypeKind:
+		if name := t.StructName(); name != "" {
+			// Cache the short "%name" form before visiting the members, so a
+			// struct that refers to itself resolves to its name instead of
+			// recursing without end.
+			ts.s[t] = "%" + name
+			s = fmt.Sprintf("%%%s: %s", name, s)
+		}
+		etypes := t.StructElementTypes()
+		s += "("
+		if n := len(etypes); n > 0 {
+			s += ts.typeString(etypes[0])
+			for i := 1; i < n; i++ {
+				s += fmt.Sprintf(", %v", ts.typeString(etypes[i]))
+			}
+		}
+		s += ")"
+	case IntegerTypeKind:
+		s += fmt.Sprintf("(%d bits)", t.IntTypeWidth())
+	}
+
+	// Store the finished text; for named structs this replaces the short
+	// "%name" entry written above.
+	ts.s[t] = s
+	return s
+}

diff --git a/bindings/go/llvm/string_test.go b/bindings/go/llvm/string_test.go
new file mode 100644
index 0000000..3008f3e
--- /dev/null
+++ b/bindings/go/llvm/string_test.go

@@ -0,0 +1,28 @@
+//===- string_test.go - test Stringer implementation for Type -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file tests the Stringer interface for the Type type.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+import (
+	"testing"
+)
+
+func TestStringRecursiveType(t *testing.T) {
+	ctx := NewContext()
+	defer ctx.Dispose()
+	s := ctx.StructCreateNamed("recursive")
+	s.StructSetBody([]Type{s, s}, false)
+	if str := s.String(); str != "%recursive: StructType(%recursive, %recursive)" {
+		t.Errorf("incorrect string result %q", str)
+	}
+}

diff --git a/bindings/go/llvm/support.go b/bindings/go/llvm/support.go
new file mode 100644
index 0000000..6f20086
--- /dev/null
+++ b/bindings/go/llvm/support.go

@@ -0,0 +1,54 @@
+//===- support.go - Bindings for support ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the support component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/Support.h"
+#include "SupportBindings.h"
+#include <stdlib.h>
+*/
+import "C"
+
+import (
+	"errors"
+	"unsafe"
+)
+
+// Loads a dynamic library such that it may be used as an LLVM plugin.
+// See llvm::sys::DynamicLibrary::LoadLibraryPermanently.
+func LoadLibraryPermanently(lib string) error {
+	libC := C.CString(lib)
+	defer C.free(unsafe.Pointer(libC))
+
+	// The C helper reports failure by storing a message in msg; a nil msg
+	// after the call is treated as success.
+	var msg *C.char
+	C.LLVMLoadLibraryPermanently2(libC, &msg)
+	if msg == nil {
+		return nil
+	}
+
+	// The message is released with C.free once it has been copied into the
+	// Go error.
+	defer C.free(unsafe.Pointer(msg))
+	return errors.New(C.GoString(msg))
+}
+
+// Parse the given arguments using the LLVM command line parser.
+// See llvm::cl::ParseCommandLineOptions.
+func ParseCommandLineOptions(args []string, overview string) {
+	// With no arguments there is nothing to hand to the parser, and taking
+	// &argstrs[0] below would panic with an index-out-of-range error, so
+	// return early instead.
+	if len(args) == 0 {
+		return
+	}
+
+	// Copy every argument into a C string; each copy is freed when this
+	// function returns, after the parser call has completed.
+	argstrs := make([]*C.char, len(args))
+	for i, arg := range args {
+		argstrs[i] = C.CString(arg)
+		defer C.free(unsafe.Pointer(argstrs[i]))
+	}
+	overviewstr := C.CString(overview)
+	defer C.free(unsafe.Pointer(overviewstr))
+	C.LLVMParseCommandLineOptions(C.int(len(args)), &argstrs[0], overviewstr)
+}

diff --git a/bindings/go/llvm/target.go b/bindings/go/llvm/target.go
new file mode 100644
index 0000000..bd1d0f3
--- /dev/null
+++ b/bindings/go/llvm/target.go

@@ -0,0 +1,300 @@
+//===- target.go - Bindings for target ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the target component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/Target.h"
+#include "llvm-c/TargetMachine.h"
+#include <stdlib.h>
+*/
+import "C"
+import "unsafe"
+import "errors"
+
+type (
+	TargetData struct {
+		C C.LLVMTargetDataRef
+	}
+	Target struct {
+		C C.LLVMTargetRef
+	}
+	TargetMachine struct {
+		C C.LLVMTargetMachineRef
+	}
+	ByteOrdering    C.enum_LLVMByteOrdering
+	RelocMode       C.LLVMRelocMode
+	CodeGenOptLevel C.LLVMCodeGenOptLevel
+	CodeGenFileType C.LLVMCodeGenFileType
+	CodeModel       C.LLVMCodeModel
+)
+
+const (
+	BigEndian    ByteOrdering = C.LLVMBigEndian
+	LittleEndian ByteOrdering = C.LLVMLittleEndian
+)
+
+const (
+	RelocDefault      RelocMode = C.LLVMRelocDefault
+	RelocStatic       RelocMode = C.LLVMRelocStatic
+	RelocPIC          RelocMode = C.LLVMRelocPIC
+	RelocDynamicNoPic RelocMode = C.LLVMRelocDynamicNoPic
+)
+
+const (
+	CodeGenLevelNone       CodeGenOptLevel = C.LLVMCodeGenLevelNone
+	CodeGenLevelLess       CodeGenOptLevel = C.LLVMCodeGenLevelLess
+	CodeGenLevelDefault    CodeGenOptLevel = C.LLVMCodeGenLevelDefault
+	CodeGenLevelAggressive CodeGenOptLevel = C.LLVMCodeGenLevelAggressive
+)
+
+const (
+	CodeModelDefault    CodeModel = C.LLVMCodeModelDefault
+	CodeModelJITDefault CodeModel = C.LLVMCodeModelJITDefault
+	CodeModelSmall      CodeModel = C.LLVMCodeModelSmall
+	CodeModelKernel     CodeModel = C.LLVMCodeModelKernel
+	CodeModelMedium     CodeModel = C.LLVMCodeModelMedium
+	CodeModelLarge      CodeModel = C.LLVMCodeModelLarge
+)
+
+const (
+	AssemblyFile CodeGenFileType = C.LLVMAssemblyFile
+	ObjectFile   CodeGenFileType = C.LLVMObjectFile
+)
+
+// InitializeAllTargetInfos - The main program should call this function if it
+// wants access to all available targets that LLVM is configured to support.
+func InitializeAllTargetInfos() { C.LLVMInitializeAllTargetInfos() }
+
+// InitializeAllTargets - The main program should call this function if it wants
+// to link in all available targets that LLVM is configured to support.
+func InitializeAllTargets() { C.LLVMInitializeAllTargets() }
+
+func InitializeAllTargetMCs() { C.LLVMInitializeAllTargetMCs() }
+
+func InitializeAllAsmParsers() { C.LLVMInitializeAllAsmParsers() }
+
+func InitializeAllAsmPrinters() { C.LLVMInitializeAllAsmPrinters() }
+
+var initializeNativeTargetError = errors.New("Failed to initialize native target")
+
+// InitializeNativeTarget - The main program should call this function to
+// initialize the native target corresponding to the host. This is useful
+// for JIT applications to ensure that the target gets linked in correctly.
+func InitializeNativeTarget() error {
+	fail := C.LLVMInitializeNativeTarget()
+	if fail != 0 {
+		return initializeNativeTargetError
+	}
+	return nil
+}
+
+var initializeNativeAsmPrinterError = errors.New("Failed to initialize native asm printer")
+
+// InitializeNativeAsmPrinter - The main program should call this function to
+// initialize the assembly printer for the native target. A non-zero status
+// from LLVMInitializeNativeAsmPrinter is reported as an error that names the
+// asm printer, so it can be told apart from a native target failure.
+func InitializeNativeAsmPrinter() error {
+	fail := C.LLVMInitializeNativeAsmPrinter()
+	if fail != 0 {
+		return initializeNativeAsmPrinterError
+	}
+	return nil
+}
+
+//-------------------------------------------------------------------------
+// llvm.TargetData
+//-------------------------------------------------------------------------
+
+// Creates target data from a target layout string.
+// See the constructor llvm::TargetData::TargetData.
+func NewTargetData(rep string) (td TargetData) {
+	crep := C.CString(rep)
+	defer C.free(unsafe.Pointer(crep))
+	td.C = C.LLVMCreateTargetData(crep)
+	return
+}
+
+// Adds target data information to a pass manager. This does not take ownership
+// of the target data.
+// See the method llvm::PassManagerBase::add.
+func (pm PassManager) Add(td TargetData) {
+	C.LLVMAddTargetData(td.C, pm.C)
+}
+
+// Converts target data to a target layout string. The underlying C string is
+// disposed with LLVMDisposeMessage before the Go string is returned.
+// See the constructor llvm::TargetData::TargetData.
+func (td TargetData) String() (s string) {
+	cmsg := C.LLVMCopyStringRepOfTargetData(td.C)
+	s = C.GoString(cmsg)
+	C.LLVMDisposeMessage(cmsg)
+	return
+}
+
+// Returns the byte order of a target, either BigEndian or LittleEndian.
+// See the method llvm::TargetData::isLittleEndian.
+func (td TargetData) ByteOrder() ByteOrdering { return ByteOrdering(C.LLVMByteOrder(td.C)) }
+
+// Returns the pointer size in bytes for a target.
+// See the method llvm::TargetData::getPointerSize.
+func (td TargetData) PointerSize() int { return int(C.LLVMPointerSize(td.C)) }
+
+// Returns the integer type that is the same size as a pointer on a target.
+// See the method llvm::TargetData::getIntPtrType.
+func (td TargetData) IntPtrType() (t Type) { t.C = C.LLVMIntPtrType(td.C); return }
+
+// Computes the size of a type in bits for a target.
+// See the method llvm::TargetData::getTypeSizeInBits.
+func (td TargetData) TypeSizeInBits(t Type) uint64 {
+	return uint64(C.LLVMSizeOfTypeInBits(td.C, t.C))
+}
+
+// Computes the storage size of a type in bytes for a target.
+// See the method llvm::TargetData::getTypeStoreSize.
+func (td TargetData) TypeStoreSize(t Type) uint64 {
+	return uint64(C.LLVMStoreSizeOfType(td.C, t.C))
+}
+
+// Computes the ABI size of a type in bytes for a target.
+// See the method llvm::TargetData::getTypeAllocSize.
+func (td TargetData) TypeAllocSize(t Type) uint64 {
+	return uint64(C.LLVMABISizeOfType(td.C, t.C))
+}
+
+// Computes the ABI alignment of a type in bytes for a target.
+// See the method llvm::TargetData::getABITypeAlignment.
+func (td TargetData) ABITypeAlignment(t Type) int {
+	return int(C.LLVMABIAlignmentOfType(td.C, t.C))
+}
+
+// Computes the call frame alignment of a type in bytes for a target.
+// See the method llvm::TargetData::getCallFrameTypeAlignment.
+func (td TargetData) CallFrameTypeAlignment(t Type) int {
+	return int(C.LLVMCallFrameAlignmentOfType(td.C, t.C))
+}
+
+// Computes the preferred alignment of a type in bytes for a target.
+// See the method llvm::TargetData::getPrefTypeAlignment.
+func (td TargetData) PrefTypeAlignment(t Type) int {
+	return int(C.LLVMPreferredAlignmentOfType(td.C, t.C))
+}
+
+// Computes the preferred alignment of a global variable in bytes for a target.
+// See the method llvm::TargetData::getPreferredAlignment.
+func (td TargetData) PreferredAlignment(g Value) int {
+	return int(C.LLVMPreferredAlignmentOfGlobal(td.C, g.C))
+}
+
+// Computes the structure element that contains the byte offset for a target.
+// See the method llvm::StructLayout::getElementContainingOffset.
+func (td TargetData) ElementContainingOffset(t Type, offset uint64) int {
+	return int(C.LLVMElementAtOffset(td.C, t.C, C.ulonglong(offset)))
+}
+
+// Computes the byte offset of the indexed struct element for a target.
+// See the method llvm::StructLayout::getElementOffset.
+func (td TargetData) ElementOffset(t Type, element int) uint64 {
+	return uint64(C.LLVMOffsetOfElement(td.C, t.C, C.unsigned(element)))
+}
+
+// Deallocates a TargetData.
+// See the destructor llvm::TargetData::~TargetData.
+func (td TargetData) Dispose() { C.LLVMDisposeTargetData(td.C) }
+
+//-------------------------------------------------------------------------
+// llvm.Target
+//-------------------------------------------------------------------------
+
+func FirstTarget() Target {
+	return Target{C.LLVMGetFirstTarget()}
+}
+
+func (t Target) NextTarget() Target {
+	return Target{C.LLVMGetNextTarget(t.C)}
+}
+
+// GetTargetFromTriple looks up the Target for the given triple string via
+// LLVMGetTargetFromTriple. When the C API reports failure (non-zero status),
+// err carries the error message it produced.
+func GetTargetFromTriple(triple string) (t Target, err error) {
+	var errstr *C.char
+	ctriple := C.CString(triple)
+	defer C.free(unsafe.Pointer(ctriple))
+	fail := C.LLVMGetTargetFromTriple(ctriple, &t.C, &errstr)
+	if fail != 0 {
+		// Copy the message into Go memory before releasing the C string.
+		err = errors.New(C.GoString(errstr))
+		C.free(unsafe.Pointer(errstr))
+	}
+	return
+}
+
+func (t Target) Name() string {
+	return C.GoString(C.LLVMGetTargetName(t.C))
+}
+
+func (t Target) Description() string {
+	return C.GoString(C.LLVMGetTargetDescription(t.C))
+}
+
+//-------------------------------------------------------------------------
+// llvm.TargetMachine
+//-------------------------------------------------------------------------
+
+// CreateTargetMachine creates a new TargetMachine.
+func (t Target) CreateTargetMachine(Triple string, CPU string, Features string,
+	Level CodeGenOptLevel, Reloc RelocMode,
+	CodeModel CodeModel) (tm TargetMachine) {
+	cTriple := C.CString(Triple)
+	defer C.free(unsafe.Pointer(cTriple))
+	cCPU := C.CString(CPU)
+	defer C.free(unsafe.Pointer(cCPU))
+	cFeatures := C.CString(Features)
+	defer C.free(unsafe.Pointer(cFeatures))
+	tm.C = C.LLVMCreateTargetMachine(t.C, cTriple, cCPU, cFeatures,
+		C.LLVMCodeGenOptLevel(Level),
+		C.LLVMRelocMode(Reloc),
+		C.LLVMCodeModel(CodeModel))
+	return
+}
+
+// Triple returns the triple describing the machine (arch-vendor-os).
+func (tm TargetMachine) Triple() string {
+	cstr := C.LLVMGetTargetMachineTriple(tm.C)
+	// The C API hands back a heap-allocated copy that the caller must release
+	// with LLVMDisposeMessage; GoString has copied it by the time this runs.
+	defer C.LLVMDisposeMessage(cstr)
+	return C.GoString(cstr)
+}
+
+// TargetData returns the TargetData for the machine.
+func (tm TargetMachine) TargetData() TargetData {
+	return TargetData{C.LLVMGetTargetMachineData(tm.C)}
+}
+
+// EmitToMemoryBuffer emits module m as the file type ft through
+// LLVMTargetMachineEmitToMemoryBuffer and returns the resulting MemoryBuffer.
+// When the C API reports failure (non-zero status), it returns a zero
+// MemoryBuffer and an error carrying the message produced by the C API.
+func (tm TargetMachine) EmitToMemoryBuffer(m Module, ft CodeGenFileType) (MemoryBuffer, error) {
+	var errstr *C.char
+	var mb MemoryBuffer
+	fail := C.LLVMTargetMachineEmitToMemoryBuffer(tm.C, m.C, C.LLVMCodeGenFileType(ft), &errstr, &mb.C)
+	if fail != 0 {
+		// Copy the message into Go memory before releasing the C string.
+		err := errors.New(C.GoString(errstr))
+		C.free(unsafe.Pointer(errstr))
+		return MemoryBuffer{}, err
+	}
+	return mb, nil
+}
+
+func (tm TargetMachine) AddAnalysisPasses(pm PassManager) {
+	C.LLVMAddAnalysisPasses(tm.C, pm.C)
+}
+
+// Dispose releases resources related to the TargetMachine.
+func (tm TargetMachine) Dispose() {
+	C.LLVMDisposeTargetMachine(tm.C)
+}
+
+// DefaultTargetTriple returns the triple reported by
+// LLVMGetDefaultTargetTriple as a Go string.
+func DefaultTargetTriple() (triple string) {
+	cTriple := C.LLVMGetDefaultTargetTriple()
+	// The deferred free runs after the string has been copied into triple.
+	defer C.free(unsafe.Pointer(cTriple))
+	triple = C.GoString(cTriple)
+	return
+}

diff --git a/bindings/go/llvm/transforms_instrumentation.go b/bindings/go/llvm/transforms_instrumentation.go
new file mode 100644
index 0000000..9b191b2
--- /dev/null
+++ b/bindings/go/llvm/transforms_instrumentation.go

@@ -0,0 +1,43 @@
+//===- transforms_instrumentation.go - Bindings for instrumentation -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the instrumentation component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "InstrumentationBindings.h"
+#include <stdlib.h>
+*/
+import "C"
+import "unsafe"
+
+func (pm PassManager) AddAddressSanitizerFunctionPass() {
+	C.LLVMAddAddressSanitizerFunctionPass(pm.C)
+}
+
+func (pm PassManager) AddAddressSanitizerModulePass() {
+	C.LLVMAddAddressSanitizerModulePass(pm.C)
+}
+
+func (pm PassManager) AddThreadSanitizerPass() {
+	C.LLVMAddThreadSanitizerPass(pm.C)
+}
+
+func (pm PassManager) AddMemorySanitizerPass() {
+	C.LLVMAddMemorySanitizerPass(pm.C)
+}
+
+func (pm PassManager) AddDataFlowSanitizerPass(abilist string) {
+	cabilist := C.CString(abilist)
+	defer C.free(unsafe.Pointer(cabilist))
+	C.LLVMAddDataFlowSanitizerPass(pm.C, cabilist)
+}

diff --git a/bindings/go/llvm/transforms_ipo.go b/bindings/go/llvm/transforms_ipo.go
new file mode 100644
index 0000000..12d972b
--- /dev/null
+++ b/bindings/go/llvm/transforms_ipo.go

@@ -0,0 +1,42 @@
+//===- transforms_ipo.go - Bindings for ipo -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the ipo component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/Transforms/IPO.h"
+*/
+import "C"
+
+// boolToUnsigned is a helper that converts a Go bool into a C unsigned:
+// 1 for true, 0 for false.
+func boolToUnsigned(b bool) C.unsigned {
+	if b {
+		return 1
+	}
+	return 0
+}
+
+func (pm PassManager) AddArgumentPromotionPass()     { C.LLVMAddArgumentPromotionPass(pm.C) }
+func (pm PassManager) AddConstantMergePass()         { C.LLVMAddConstantMergePass(pm.C) }
+func (pm PassManager) AddDeadArgEliminationPass()    { C.LLVMAddDeadArgEliminationPass(pm.C) }
+func (pm PassManager) AddFunctionAttrsPass()         { C.LLVMAddFunctionAttrsPass(pm.C) }
+func (pm PassManager) AddFunctionInliningPass()      { C.LLVMAddFunctionInliningPass(pm.C) }
+func (pm PassManager) AddGlobalDCEPass()             { C.LLVMAddGlobalDCEPass(pm.C) }
+func (pm PassManager) AddGlobalOptimizerPass()       { C.LLVMAddGlobalOptimizerPass(pm.C) }
+func (pm PassManager) AddIPConstantPropagationPass() { C.LLVMAddIPConstantPropagationPass(pm.C) }
+func (pm PassManager) AddPruneEHPass()               { C.LLVMAddPruneEHPass(pm.C) }
+func (pm PassManager) AddIPSCCPPass()                { C.LLVMAddIPSCCPPass(pm.C) }
+func (pm PassManager) AddInternalizePass(allButMain bool) {
+	C.LLVMAddInternalizePass(pm.C, boolToUnsigned(allButMain))
+}
+func (pm PassManager) AddStripDeadPrototypesPass() { C.LLVMAddStripDeadPrototypesPass(pm.C) }

diff --git a/bindings/go/llvm/transforms_pmbuilder.go b/bindings/go/llvm/transforms_pmbuilder.go
new file mode 100644
index 0000000..3d79d6e
--- /dev/null
+++ b/bindings/go/llvm/transforms_pmbuilder.go

@@ -0,0 +1,48 @@
+//===- transforms_pmbuilder.go - Bindings for PassManagerBuilder ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the PassManagerBuilder class.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/Transforms/PassManagerBuilder.h"
+*/
+import "C"
+
+type PassManagerBuilder struct {
+	C C.LLVMPassManagerBuilderRef
+}
+
+func NewPassManagerBuilder() (pmb PassManagerBuilder) {
+	pmb.C = C.LLVMPassManagerBuilderCreate()
+	return
+}
+
+func (pmb PassManagerBuilder) SetOptLevel(level int) {
+	C.LLVMPassManagerBuilderSetOptLevel(pmb.C, C.uint(level))
+}
+
+func (pmb PassManagerBuilder) SetSizeLevel(level int) {
+	C.LLVMPassManagerBuilderSetSizeLevel(pmb.C, C.uint(level))
+}
+
+func (pmb PassManagerBuilder) Populate(pm PassManager) {
+	C.LLVMPassManagerBuilderPopulateModulePassManager(pmb.C, pm.C)
+}
+
+func (pmb PassManagerBuilder) PopulateFunc(pm PassManager) {
+	C.LLVMPassManagerBuilderPopulateFunctionPassManager(pmb.C, pm.C)
+}
+
+func (pmb PassManagerBuilder) Dispose() {
+	C.LLVMPassManagerBuilderDispose(pmb.C)
+}

diff --git a/bindings/go/llvm/transforms_scalar.go b/bindings/go/llvm/transforms_scalar.go
new file mode 100644
index 0000000..6492a85
--- /dev/null
+++ b/bindings/go/llvm/transforms_scalar.go

@@ -0,0 +1,45 @@
+//===- transforms_scalar.go - Bindings for scalaropts ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines bindings for the scalaropts component.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm-c/Transforms/Scalar.h"
+*/
+import "C"
+
+func (pm PassManager) AddAggressiveDCEPass()           { C.LLVMAddAggressiveDCEPass(pm.C) }
+func (pm PassManager) AddCFGSimplificationPass()       { C.LLVMAddCFGSimplificationPass(pm.C) }
+func (pm PassManager) AddDeadStoreEliminationPass()    { C.LLVMAddDeadStoreEliminationPass(pm.C) }
+func (pm PassManager) AddGVNPass()                     { C.LLVMAddGVNPass(pm.C) }
+func (pm PassManager) AddIndVarSimplifyPass()          { C.LLVMAddIndVarSimplifyPass(pm.C) }
+func (pm PassManager) AddInstructionCombiningPass()    { C.LLVMAddInstructionCombiningPass(pm.C) }
+func (pm PassManager) AddJumpThreadingPass()           { C.LLVMAddJumpThreadingPass(pm.C) }
+func (pm PassManager) AddLICMPass()                    { C.LLVMAddLICMPass(pm.C) }
+func (pm PassManager) AddLoopDeletionPass()            { C.LLVMAddLoopDeletionPass(pm.C) }
+func (pm PassManager) AddLoopRotatePass()              { C.LLVMAddLoopRotatePass(pm.C) }
+func (pm PassManager) AddLoopUnrollPass()              { C.LLVMAddLoopUnrollPass(pm.C) }
+func (pm PassManager) AddLoopUnswitchPass()            { C.LLVMAddLoopUnswitchPass(pm.C) }
+func (pm PassManager) AddMemCpyOptPass()               { C.LLVMAddMemCpyOptPass(pm.C) }
+func (pm PassManager) AddPromoteMemoryToRegisterPass() { C.LLVMAddPromoteMemoryToRegisterPass(pm.C) }
+func (pm PassManager) AddReassociatePass()             { C.LLVMAddReassociatePass(pm.C) }
+func (pm PassManager) AddSCCPPass()                    { C.LLVMAddSCCPPass(pm.C) }
+func (pm PassManager) AddScalarReplAggregatesPass()    { C.LLVMAddScalarReplAggregatesPass(pm.C) }
+func (pm PassManager) AddScalarReplAggregatesPassWithThreshold(threshold int) {
+	C.LLVMAddScalarReplAggregatesPassWithThreshold(pm.C, C.int(threshold))
+}
+func (pm PassManager) AddSimplifyLibCallsPass()       { C.LLVMAddSimplifyLibCallsPass(pm.C) }
+func (pm PassManager) AddTailCallEliminationPass()    { C.LLVMAddTailCallEliminationPass(pm.C) }
+func (pm PassManager) AddConstantPropagationPass()    { C.LLVMAddConstantPropagationPass(pm.C) }
+func (pm PassManager) AddDemoteMemoryToRegisterPass() { C.LLVMAddDemoteMemoryToRegisterPass(pm.C) }
+func (pm PassManager) AddVerifierPass()               { C.LLVMAddVerifierPass(pm.C) }

diff --git a/bindings/go/llvm/version.go b/bindings/go/llvm/version.go
new file mode 100644
index 0000000..d0623be
--- /dev/null
+++ b/bindings/go/llvm/version.go

@@ -0,0 +1,21 @@
+//===- version.go - LLVM version info -------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines LLVM version information.
+//
+//===----------------------------------------------------------------------===//
+
+package llvm
+
+/*
+#include "llvm/Config/llvm-config.h"
+*/
+import "C"
+
+const Version = C.LLVM_VERSION_STRING

diff --git a/bindings/ocaml/Makefile b/bindings/ocaml/Makefile
index b0e1f09..2005367 100644
--- a/bindings/ocaml/Makefile
+++ b/bindings/ocaml/Makefile

@@ -1,10 +1,10 @@
 ##===- bindings/ocaml/Makefile -----------------------------*- Makefile -*-===##
-# 
+#
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
-# 
+#
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../..

diff --git a/bindings/ocaml/Makefile.ocaml b/bindings/ocaml/Makefile.ocaml
index 1b964ee..5e00cf5 100644
--- a/bindings/ocaml/Makefile.ocaml
+++ b/bindings/ocaml/Makefile.ocaml

@@ -1,27 +1,30 @@
 ##===- bindings/ocaml/Makefile.ocaml -----------------------*- Makefile -*-===##
-# 
+#
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
-# 
+#
 ##===----------------------------------------------------------------------===##
-# 
+#
 # An OCaml library is a unique project type in the context of LLVM, so rules are
 # here rather than in Makefile.rules.
-# 
+#
 # Reference materials on installing OCaml libraries:
-# 
+#
 #   https://fedoraproject.org/wiki/Packaging/OCaml
 #   http://pkg-ocaml-maint.alioth.debian.org/ocaml_packaging_policy.txt
-# 
+#
 ##===----------------------------------------------------------------------===##
 
 include $(LEVEL)/Makefile.config
 
+# We have our own rules for building static libraries.
+NO_BUILD_ARCHIVE = 1
+
 # CFLAGS needs to be set before Makefile.rules is included.
-CXX.Flags += -I"$(shell $(OCAMLC) -where)"
-C.Flags += -I"$(shell $(OCAMLC) -where)"
+CXX.Flags += -I"$(shell $(OCAMLFIND) c -where)"
+C.Flags += -I"$(shell $(OCAMLFIND) c -where)"
 
 ifeq ($(ENABLE_SHARED),1)
 LINK_COMPONENTS := all
@@ -50,64 +53,55 @@
 #    from toplevels.
 ifneq ($(ObjectsO),)
 ifeq ($(ENABLE_SHARED),1)
-OCAMLSTUBS := 1
+OCAMLSTUBS     := 1
+OCAMLSTUBFLAGS := $(patsubst %,-cclib %, $(LLVMLibsOptions) -l$(LIBRARYNAME))
+endif
+endif
+
+# Avoid the need for LD_LIBRARY_PATH
+ifneq ($(HOST_OS), $(filter $(HOST_OS), Cygwin MingW))
+ifneq ($(HOST_OS),Darwin)
+OCAMLRPATH     := $(RPATH) -Wl,'$$ORIGIN/../../lib'
 endif
 endif
 
 # Tools
-OCAMLCFLAGS += -I $(ObjDir) -I $(OcamlDir)
+OCAMLCFLAGS += -I $(OcamlDir) $(addprefix -package ,$(FindlibPackages))
+
 ifndef IS_CLEANING_TARGET
 ifneq ($(ObjectsO),)
 OCAMLAFLAGS += $(patsubst %,-cclib %, \
                  $(filter-out -L$(LibDir),-l$(LIBRARYNAME) \
                                           $(shell $(LLVM_CONFIG) --ldflags)) \
-                                          $(UsedLibs))
+                                          $(UsedLibs) $(ExtraLibs))
 else
 OCAMLAFLAGS += $(patsubst %,-cclib %, \
                  $(filter-out -L$(LibDir),$(shell $(LLVM_CONFIG) --ldflags)) \
-                                          $(UsedLibs))
+                                          $(UsedLibs) $(ExtraLibs))
 endif
 endif
- 
-# -g was introduced in 3.10.0.
-#ifneq ($(ENABLE_OPTIMIZED),1)
-#  OCAMLDEBUGFLAG := -g
-#endif
 
-Compile.CMI  := $(strip $(OCAMLC) -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
-Compile.CMO  := $(strip $(OCAMLC) -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
-Compile.CMX  := $(strip $(OCAMLOPT) -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
+ifneq ($(DEBUG_SYMBOLS),1)
+  OCAMLDEBUGFLAG := -g
+endif
+
+Compile.CMI  := $(strip $(OCAMLFIND) c -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
+Compile.CMO  := $(strip $(OCAMLFIND) c -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
+Compile.CMX  := $(strip $(OCAMLFIND) opt -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
 
 ifdef OCAMLSTUBS
-# Avoid the need for LD_LIBRARY_PATH
-ifneq ($(HOST_OS), $(filter $(HOST_OS), Cygwin MingW))
-ifneq ($(HOST_OS),Darwin)
-OCAMLRPATH   := $(RPATH) -Wl,'$(SharedLibDir)'
-endif
-endif
+# -dllib is engaged with ocamlc builds, $(OCAMLSTUBFLAGS) in ocamlc -custom builds.
+Archive.CMA  := $(strip $(OCAMLFIND) c -a -dllib -l$(LIBRARYNAME) $(OCAMLSTUBFLAGS) \
+																			 $(OCAMLDEBUGFLAG) -o)
+else
+Archive.CMA  := $(strip $(OCAMLFIND) c -a -custom $(OCAMLAFLAGS) $(OCAMLDEBUGFLAG) \
+                                       -o)
 endif
 
 ifdef OCAMLSTUBS
-Archive.CMA  := $(strip $(OCAMLC) -a -dllib -l$(LIBRARYNAME) $(OCAMLDEBUGFLAG) \
-                                  -o)
+Archive.CMXA := $(strip $(OCAMLFIND) opt -a $(OCAMLSTUBFLAGS) $(OCAMLDEBUGFLAG) -o)
 else
-Archive.CMA  := $(strip $(OCAMLC) -a -custom $(OCAMLAFLAGS) $(OCAMLDEBUGFLAG) \
-                                  -o)
-endif
-
-ifdef OCAMLSTUBS
-Archive.CMXA := $(strip $(OCAMLOPT) -a $(patsubst %,-cclib %, \
-                                    $(LLVMLibsOptions) -l$(LIBRARYNAME) \
-                                    -L$(SharedLibDir) $(OCAMLRPATH)) \
-                                    $(OCAMLDEBUGFLAG) -o)
-else
-Archive.CMXA := $(strip $(OCAMLOPT) -a $(OCAMLAFLAGS) $(OCAMLDEBUGFLAG) -o)
-endif
-
-ifdef OCAMLOPT
-Archive.EXE := $(strip $(OCAMLOPT) -cc $(CXX) $(OCAMLCFLAGS) $(UsedOcamlLibs:%=%.cmxa) $(OCAMLDEBUGFLAG) -o)
-else
-Archive.EXE := $(strip $(OCAMLC) -cc $(CXX) $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG:%=%.cma) -o)
+Archive.CMXA := $(strip $(OCAMLFIND) opt -a $(OCAMLAFLAGS) $(OCAMLDEBUGFLAG) -o)
 endif
 
 # Source files
@@ -191,7 +185,7 @@
 ifdef LIBRARYNAME
 $(ObjDir)/$(LIBRARYNAME).ocamldep: $(OcamlSources) $(OcamlHeaders) \
                                    $(OcamlDir)/.dir $(ObjDir)/.dir
-	$(Verb) $(OCAMLDEP) $(OCAMLCFLAGS) $(OcamlSources) $(OcamlHeaders) > $@
+	$(Verb) $(OCAMLFIND) dep $(OCAMLCFLAGS) $(OcamlSources) $(OcamlHeaders) > $@
 
 -include $(ObjDir)/$(LIBRARYNAME).ocamldep
 endif
@@ -199,7 +193,7 @@
 ifdef TOOLNAME
 $(ObjDir)/$(TOOLNAME).ocamldep: $(OcamlSources) $(OcamlHeaders) \
                                 $(OcamlDir)/.dir $(ObjDir)/.dir
-	$(Verb) $(OCAMLDEP) $(OCAMLCFLAGS) $(OcamlSources) $(OcamlHeaders) > $@
+	$(Verb) $(OCAMLFIND) dep $(OCAMLCFLAGS) $(OcamlSources) $(OcamlHeaders) > $@
 
 -include $(ObjDir)/$(TOOLNAME).ocamldep
 endif
@@ -225,7 +219,7 @@
 	$(Echo) "Installing $(BuildMode) $(DestA)"
 	$(Verb) $(MKDIR) $(PROJ_libocamldir)
 	$(Verb) $(INSTALL) $(LibraryA) $(DestA)
-	$(Verb) 
+	$(Verb)
 
 uninstall-a::
 	$(Echo) "Uninstalling $(DestA)"
@@ -368,8 +362,8 @@
 ##===- Build optimized ocaml archive (.ml's -> .cmx's -> .cmxa, .a) -------===##
 
 # The ocamlopt compiler is supported on a set of targets disjoint from LLVM's.
-# If unavailable, 'configure' will not define OCAMLOPT in Makefile.config.
-ifdef OCAMLOPT
+# If unavailable, 'configure' will set HAVE_OCAMLOPT to 0 in Makefile.config.
+ifeq ($(HAVE_OCAMLOPT),1)
 
 $(OcamlDir)/%.cmx: $(ObjDir)/%.cmx
 	$(Verb) $(CP) -f $< $@
@@ -419,31 +413,11 @@
 endif
 endif
 
-##===- Build executables --------------------------------------------------===##
-
-ifdef TOOLNAME
-all-local:: $(OutputEXE)
-clean-local:: clean-exe
-
-$(OutputEXE): $(ToolEXE) $(OcamlDir)/.dir
-	$(Verb) $(CP) -f $< $@
-
-ifndef OCAMLOPT
-$(ToolEXE): $(ObjectsCMO) $(OcamlDir)/.dir
-	$(Echo) "Archiving $(notdir $@) for $(BuildMode) build"
-	$(Verb) $(Archive.EXE) $@ $(ObjectsCMO)
-else
-$(ToolEXE): $(ObjectsCMX) $(OcamlDir)/.dir
-	$(Echo) "Archiving $(notdir $@) for $(BuildMode) build"
-	$(Verb) $(Archive.EXE) $@ $(ObjectsCMX)
-endif
-endif
-
 ##===- Generate documentation ---------------------------------------------===##
 
 $(ObjDir)/$(LIBRARYNAME).odoc: $(ObjectsCMI)
 	$(Echo) "Documenting $(notdir $@)"
-	$(Verb) $(OCAMLDOC) -I $(ObjDir) -I $(OcamlDir) -dump $@ $(OcamlHeaders)
+	$(Verb) $(OCAMLFIND) doc -I $(ObjDir) -I $(OcamlDir) -dump $@ $(OcamlHeaders)
 
 ocamldoc: $(ObjDir)/$(LIBRARYNAME).odoc
 
@@ -454,15 +428,17 @@
 	$(Echo) "LLVM_CONFIG  : " '$(LLVM_CONFIG)'
 	$(Echo) "OCAMLCFLAGS  : " '$(OCAMLCFLAGS)'
 	$(Echo) "OCAMLAFLAGS  : " '$(OCAMLAFLAGS)'
-	$(Echo) "OCAMLC       : " '$(OCAMLC)'
-	$(Echo) "OCAMLOPT     : " '$(OCAMLOPT)'
-	$(Echo) "OCAMLDEP     : " '$(OCAMLDEP)'
+	$(Echo) "OCAMLRPATH   : " '$(OCAMLRPATH)'
+	$(Echo) "OCAMLSTUBS   : " '$(OCAMLSTUBS)'
+	$(Echo) "OCAMLSTUBFLAGS : " '$(OCAMLSTUBFLAGS)'
+	$(Echo) "OCAMLFIND    : " '$(OCAMLFIND)'
 	$(Echo) "Compile.CMI  : " '$(Compile.CMI)'
 	$(Echo) "Compile.CMO  : " '$(Compile.CMO)'
 	$(Echo) "Archive.CMA  : " '$(Archive.CMA)'
 	$(Echo) "Compile.CMX  : " '$(Compile.CMX)'
 	$(Echo) "Archive.CMXA : " '$(Archive.CMXA)'
 	$(Echo) "CAML_LIBDIR  : " '$(CAML_LIBDIR)'
+	$(Echo) "LibraryA     : " '$(LibraryA)'
 	$(Echo) "LibraryCMA   : " '$(LibraryCMA)'
 	$(Echo) "LibraryCMXA  : " '$(LibraryCMXA)'
 	$(Echo) "SharedLib    : " '$(SharedLib)'
@@ -482,6 +458,7 @@
 	$(Echo) "DestSharedLib: " '$(DestSharedLib)'
 	$(Echo) "UsedLibs     : " '$(UsedLibs)'
 	$(Echo) "UsedLibNames : " '$(UsedLibNames)'
+	$(Echo) "ExtraLibs    : " '$(ExtraLibs)'
 
 .PHONY: printcamlvars   build-cmis \
             clean-a     clean-cmis     clean-cma     clean-cmxa \

diff --git a/bindings/ocaml/all_backends/Makefile b/bindings/ocaml/all_backends/Makefile
index a5ff290..f7c8cdb 100644
--- a/bindings/ocaml/all_backends/Makefile
+++ b/bindings/ocaml/all_backends/Makefile

@@ -1,4 +1,4 @@
-##===- bindings/ocaml/all_backends/Makefile ----------------------*- Makefile -*-===##
+##===- bindings/ocaml/all_backends/Makefile ----------------*- Makefile -*-===##
 #
 #                     The LLVM Compiler Infrastructure
 #
@@ -7,7 +7,7 @@
 #
 ##===----------------------------------------------------------------------===##
 #
-# This is the makefile for the Objective Caml Llvm_backends interface.
+# This is the makefile for the Objective Caml Llvm_all_backends interface.
 #
 ##===----------------------------------------------------------------------===##
 

diff --git a/bindings/ocaml/analysis/Makefile b/bindings/ocaml/analysis/Makefile
index cbfcb24..daff061 100644
--- a/bindings/ocaml/analysis/Makefile
+++ b/bindings/ocaml/analysis/Makefile

@@ -1,14 +1,14 @@
 ##===- bindings/ocaml/analysis/Makefile --------------------*- Makefile -*-===##
-# 
+#
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
-# 
+#
 ##===----------------------------------------------------------------------===##
-# 
+#
 # This is the makefile for the Objective Caml Llvm_analysis interface.
-# 
+#
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../../..

diff --git a/bindings/ocaml/analysis/analysis_ocaml.c b/bindings/ocaml/analysis/analysis_ocaml.c
index 91be2d3..44e3197 100644
--- a/bindings/ocaml/analysis/analysis_ocaml.c
+++ b/bindings/ocaml/analysis/analysis_ocaml.c

@@ -20,15 +20,14 @@
 #include "caml/mlvalues.h"
 #include "caml/memory.h"
 
-
 /* Llvm.llmodule -> string option */
 CAMLprim value llvm_verify_module(LLVMModuleRef M) {
   CAMLparam0();
   CAMLlocal2(String, Option);
-  
+
   char *Message;
   int Result = LLVMVerifyModule(M, LLVMReturnStatusAction, &Message);
-  
+
   if (0 == Result) {
     Option = Val_int(0);
   } else {
@@ -36,9 +35,9 @@
     String = copy_string(Message);
     Store_field(Option, 0, String);
   }
-  
+
   LLVMDisposeMessage(Message);
-  
+
   CAMLreturn(Option);
 }
 

diff --git a/bindings/ocaml/analysis/llvm_analysis.ml b/bindings/ocaml/analysis/llvm_analysis.ml
index 21088ab..8c11a63 100644
--- a/bindings/ocaml/analysis/llvm_analysis.ml
+++ b/bindings/ocaml/analysis/llvm_analysis.ml

@@ -1,4 +1,4 @@
-(*===-- llvm_analysis.ml - LLVM OCaml Interface -----------------*- C++ -*-===*
+(*===-- llvm_analysis.ml - LLVM OCaml Interface ---------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *

diff --git a/bindings/ocaml/analysis/llvm_analysis.mli b/bindings/ocaml/analysis/llvm_analysis.mli
index 1a0af02..03197cd 100644
--- a/bindings/ocaml/analysis/llvm_analysis.mli
+++ b/bindings/ocaml/analysis/llvm_analysis.mli

@@ -1,4 +1,4 @@
-(*===-- llvm_analysis.mli - LLVM OCaml Interface ----------------*- C++ -*-===*
+(*===-- llvm_analysis.mli - LLVM OCaml Interface --------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *

diff --git a/bindings/ocaml/backends/META.llvm_backend.in b/bindings/ocaml/backends/META.llvm_backend.in
index 0d4a6d6..6c1e8c4 100644
--- a/bindings/ocaml/backends/META.llvm_backend.in
+++ b/bindings/ocaml/backends/META.llvm_backend.in

@@ -5,4 +5,3 @@
 archive(byte) = "llvm_@TARGET@.cma"
 archive(native) = "llvm_@TARGET@.cmxa"
 directory = "."
-linkopts = "-ccopt -lstdc++"
\ No newline at end of file

diff --git a/bindings/ocaml/backends/backend_ocaml.c b/bindings/ocaml/backends/backend_ocaml.c
index 2d4ba85..3e1a438 100644
--- a/bindings/ocaml/backends/backend_ocaml.c
+++ b/bindings/ocaml/backends/backend_ocaml.c

@@ -19,10 +19,11 @@
 #include "caml/alloc.h"
 #include "caml/memory.h"
 
-// TODO: Figure out how to call these only for targets which support them.
-// LLVMInitialize ## target ## AsmPrinter();
-// LLVMInitialize ## target ## AsmParser();
-// LLVMInitialize ## target ## Disassembler();
+/* TODO: Figure out how to call these only for targets which support them.
+ * LLVMInitialize ## target ## AsmPrinter();
+ * LLVMInitialize ## target ## AsmParser();
+ * LLVMInitialize ## target ## Disassembler();
+ */
 
 #define INITIALIZER1(target) \
   CAMLprim value llvm_initialize_ ## target(value Unit) {  \

diff --git a/bindings/ocaml/bitreader/Makefile b/bindings/ocaml/bitreader/Makefile
index a1c7de8..dad4e1d 100644
--- a/bindings/ocaml/bitreader/Makefile
+++ b/bindings/ocaml/bitreader/Makefile

@@ -1,14 +1,14 @@
 ##===- bindings/ocaml/bitreader/Makefile -------------------*- Makefile -*-===##
-# 
+#
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
-# 
+#
 ##===----------------------------------------------------------------------===##
-# 
+#
 # This is the makefile for the Objective Caml Llvm_bitreader interface.
-# 
+#
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../../..

diff --git a/bindings/ocaml/bitreader/bitreader_ocaml.c b/bindings/ocaml/bitreader/bitreader_ocaml.c
index 0264e73..15ebd5f 100644
--- a/bindings/ocaml/bitreader/bitreader_ocaml.c
+++ b/bindings/ocaml/bitreader/bitreader_ocaml.c

@@ -16,58 +16,28 @@
 #include "caml/alloc.h"
 #include "caml/fail.h"
 #include "caml/memory.h"
+#include "caml/callback.h"
 
-
-/* Can't use the recommended caml_named_value mechanism for backwards
-   compatibility reasons. This is largely equivalent. */
-static value llvm_bitreader_error_exn;
-
-CAMLprim value llvm_register_bitreader_exns(value Error) {
-  llvm_bitreader_error_exn = Field(Error, 0);
-  register_global_root(&llvm_bitreader_error_exn);
-  return Val_unit;
-}
-
-static void llvm_raise(value Prototype, char *Message) {
-  CAMLparam1(Prototype);
-  CAMLlocal1(CamlMessage);
-  
-  CamlMessage = copy_string(Message);
-  LLVMDisposeMessage(Message);
-  
-  raise_with_arg(Prototype, CamlMessage);
-  abort(); /* NOTREACHED */
-#ifdef CAMLnoreturn
-  CAMLnoreturn; /* Silences warnings, but is missing in some versions. */
-#endif
-}
-
-
-/*===-- Modules -----------------------------------------------------------===*/
+void llvm_raise(value Prototype, char *Message);
 
 /* Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule */
-CAMLprim value llvm_get_module(LLVMContextRef C, LLVMMemoryBufferRef MemBuf) {
-  CAMLparam0();
-  CAMLlocal2(Variant, MessageVal);
-  char *Message;
-  
+CAMLprim LLVMModuleRef llvm_get_module(LLVMContextRef C, LLVMMemoryBufferRef MemBuf) {
   LLVMModuleRef M;
+  char *Message;
+
   if (LLVMGetBitcodeModuleInContext(C, MemBuf, &M, &Message))
-    llvm_raise(llvm_bitreader_error_exn, Message);
-  
-  CAMLreturn((value) M);
+    llvm_raise(*caml_named_value("Llvm_bitreader.Error"), Message);
+
+  return M;
 }
 
 /* Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule */
-CAMLprim value llvm_parse_bitcode(LLVMContextRef C,
-                                  LLVMMemoryBufferRef MemBuf) {
-  CAMLparam0();
-  CAMLlocal2(Variant, MessageVal);
+CAMLprim LLVMModuleRef llvm_parse_bitcode(LLVMContextRef C, LLVMMemoryBufferRef MemBuf) {
   LLVMModuleRef M;
   char *Message;
-  
+
   if (LLVMParseBitcodeInContext(C, MemBuf, &M, &Message))
-    llvm_raise(llvm_bitreader_error_exn, Message);
-  
-  CAMLreturn((value) M);
+    llvm_raise(*caml_named_value("Llvm_bitreader.Error"), Message);
+
+  return M;
 }

diff --git a/bindings/ocaml/bitreader/llvm_bitreader.ml b/bindings/ocaml/bitreader/llvm_bitreader.ml
index 865208c..b26efdd 100644
--- a/bindings/ocaml/bitreader/llvm_bitreader.ml
+++ b/bindings/ocaml/bitreader/llvm_bitreader.ml

@@ -1,4 +1,4 @@
-(*===-- llvm_bitreader.ml - LLVM OCaml Interface ----------------*- C++ -*-===*
+(*===-- llvm_bitreader.ml - LLVM OCaml Interface --------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -7,14 +7,13 @@
  *
  *===----------------------------------------------------------------------===*)
 
-
 exception Error of string
 
-external register_exns : exn -> unit = "llvm_register_bitreader_exns"
-let _ = register_exns (Error "")
+let () = Callback.register_exception "Llvm_bitreader.Error" (Error "")
 
-external get_module : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
-                    = "llvm_get_module"
-
-external parse_bitcode : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
-                       = "llvm_parse_bitcode"
+external get_module
+  : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
+  = "llvm_get_module"
+external parse_bitcode
+  : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
+  = "llvm_parse_bitcode"

diff --git a/bindings/ocaml/bitreader/llvm_bitreader.mli b/bindings/ocaml/bitreader/llvm_bitreader.mli
index ff377b9..4351343 100644
--- a/bindings/ocaml/bitreader/llvm_bitreader.mli
+++ b/bindings/ocaml/bitreader/llvm_bitreader.mli

@@ -1,4 +1,4 @@
-(*===-- llvm_bitreader.mli - LLVM OCaml Interface ---------------*- C++ -*-===*
+(*===-- llvm_bitreader.mli - LLVM OCaml Interface -------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -20,7 +20,6 @@
     encountered. See the function [llvm::getBitcodeModule]. *)
 val get_module : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
 
-
 (** [parse_bitcode context mb] parses the bitcode for a new module [m] from the
     memory buffer [mb] in the context [context]. Returns [m] if successful, or
     raises [Error msg] otherwise, where [msg] is a description of the error

diff --git a/bindings/ocaml/bitwriter/Makefile b/bindings/ocaml/bitwriter/Makefile
index cec0a59..9f0b2c8 100644
--- a/bindings/ocaml/bitwriter/Makefile
+++ b/bindings/ocaml/bitwriter/Makefile

@@ -1,14 +1,14 @@
 ##===- bindings/ocaml/bitwriter/Makefile -------------------*- Makefile -*-===##
-# 
+#
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
-# 
+#
 ##===----------------------------------------------------------------------===##
-# 
+#
 # This is the makefile for the Objective Caml Llvm_bitwriter interface.
-# 
+#
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../../..

diff --git a/bindings/ocaml/bitwriter/bitwriter_ocaml.c b/bindings/ocaml/bitwriter/bitwriter_ocaml.c
index a47f700..04fd619 100644
--- a/bindings/ocaml/bitwriter/bitwriter_ocaml.c
+++ b/bindings/ocaml/bitwriter/bitwriter_ocaml.c

@@ -21,25 +21,28 @@
 #include "caml/mlvalues.h"
 #include "caml/memory.h"
 
-/*===-- Modules -----------------------------------------------------------===*/
-
 /* Llvm.llmodule -> string -> bool */
-CAMLprim value llvm_write_bitcode_file(value M, value Path) {
-  int res = LLVMWriteBitcodeToFile((LLVMModuleRef) M, String_val(Path));
-  return Val_bool(res == 0);
+CAMLprim value llvm_write_bitcode_file(LLVMModuleRef M, value Path) {
+  int Result = LLVMWriteBitcodeToFile(M, String_val(Path));
+  return Val_bool(Result == 0);
 }
 
 /* ?unbuffered:bool -> Llvm.llmodule -> Unix.file_descr -> bool */
-CAMLprim value llvm_write_bitcode_to_fd(value U, value M, value FD) {
+CAMLprim value llvm_write_bitcode_to_fd(value U, LLVMModuleRef M, value FD) {
   int Unbuffered;
-  int res;
+  int Result;
 
   if (U == Val_int(0)) {
     Unbuffered = 0;
   } else {
-    Unbuffered = Bool_val(Field(U,0));
+    Unbuffered = Bool_val(Field(U, 0));
   }
 
-  res = LLVMWriteBitcodeToFD((LLVMModuleRef) M, Int_val(FD), 0, Unbuffered);
-  return Val_bool(res == 0);
+  Result = LLVMWriteBitcodeToFD(M, Int_val(FD), 0, Unbuffered);
+  return Val_bool(Result == 0);
+}
+
+/* Llvm.llmodule -> Llvm.llmemorybuffer */
+CAMLprim LLVMMemoryBufferRef llvm_write_bitcode_to_memory_buffer(LLVMModuleRef M) {
+  return LLVMWriteBitcodeToMemoryBuffer(M);
 }

diff --git a/bindings/ocaml/bitwriter/llvm_bitwriter.ml b/bindings/ocaml/bitwriter/llvm_bitwriter.ml
index fac8553..fca6efa 100644
--- a/bindings/ocaml/bitwriter/llvm_bitwriter.ml
+++ b/bindings/ocaml/bitwriter/llvm_bitwriter.ml

@@ -1,4 +1,4 @@
-(*===-- llvm_bitwriter.ml - LLVM OCaml Interface ----------------*- C++ -*-===*
+(*===-- llvm_bitwriter.ml - LLVM OCaml Interface --------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -12,14 +12,17 @@
  *
  *===----------------------------------------------------------------------===*)
 
+external write_bitcode_file
+  : Llvm.llmodule -> string -> bool
+  = "llvm_write_bitcode_file"
 
-(* Writes the bitcode for module the given path. Returns true if successful. *)
-external write_bitcode_file : Llvm.llmodule -> string -> bool
-                            = "llvm_write_bitcode_file"
+external write_bitcode_to_fd
+  : ?unbuffered:bool -> Llvm.llmodule -> Unix.file_descr -> bool
+  = "llvm_write_bitcode_to_fd"
 
-external write_bitcode_to_fd : ?unbuffered:bool -> Llvm.llmodule
-                               -> Unix.file_descr -> bool
-                             = "llvm_write_bitcode_to_fd"
+external write_bitcode_to_memory_buffer
+  : Llvm.llmodule -> Llvm.llmemorybuffer
+  = "llvm_write_bitcode_to_memory_buffer"
 
 let output_bitcode ?unbuffered channel m =
   write_bitcode_to_fd ?unbuffered m (Unix.descr_of_out_channel channel)

diff --git a/bindings/ocaml/bitwriter/llvm_bitwriter.mli b/bindings/ocaml/bitwriter/llvm_bitwriter.mli
index bb3e3b8..3d0f780 100644
--- a/bindings/ocaml/bitwriter/llvm_bitwriter.mli
+++ b/bindings/ocaml/bitwriter/llvm_bitwriter.mli

@@ -1,4 +1,4 @@
-(*===-- llvm_bitwriter.mli - LLVM OCaml Interface ---------------*- C++ -*-===*
+(*===-- llvm_bitwriter.mli - LLVM OCaml Interface -------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -14,15 +14,22 @@
 
 (** [write_bitcode_file m path] writes the bitcode for module [m] to the file at
     [path]. Returns [true] if successful, [false] otherwise. *)
-external write_bitcode_file : Llvm.llmodule -> string -> bool
-                            = "llvm_write_bitcode_file"
+external write_bitcode_file
+  : Llvm.llmodule -> string -> bool
+  = "llvm_write_bitcode_file"
 
 (** [write_bitcode_to_fd ~unbuffered fd m] writes the bitcode for module
     [m] to the channel [c]. If [unbuffered] is [true], after every write the fd
     will be flushed. Returns [true] if successful, [false] otherwise. *)
-external write_bitcode_to_fd : ?unbuffered:bool -> Llvm.llmodule
-                               -> Unix.file_descr -> bool
-                             = "llvm_write_bitcode_to_fd"
+external write_bitcode_to_fd
+  : ?unbuffered:bool -> Llvm.llmodule -> Unix.file_descr -> bool
+  = "llvm_write_bitcode_to_fd"
+
+(** [write_bitcode_to_memory_buffer m] returns a memory buffer containing
+    the bitcode for module [m]. *)
+external write_bitcode_to_memory_buffer
+  : Llvm.llmodule -> Llvm.llmemorybuffer
+  = "llvm_write_bitcode_to_memory_buffer"
 
 (** [output_bitcode ~unbuffered c m] writes the bitcode for module [m]
     to the channel [c]. If [unbuffered] is [true], after every write the fd

diff --git a/bindings/ocaml/executionengine/Makefile b/bindings/ocaml/executionengine/Makefile
index 5fa3f22..8b5d28f 100644
--- a/bindings/ocaml/executionengine/Makefile
+++ b/bindings/ocaml/executionengine/Makefile

@@ -1,19 +1,20 @@
-##===- bindings/ocaml/executionengine/Makefile --------------*- Makefile -*-===##
-# 
+##===- bindings/ocaml/executionengine/Makefile -------------*- Makefile -*-===##
+#
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
-# 
+#
 ##===----------------------------------------------------------------------===##
-# 
+#
 # This is the makefile for the Objective Caml Llvm_executionengine interface.
-# 
+#
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../../..
 LIBRARYNAME := llvm_executionengine
-UsedComponents := executionengine jit interpreter native
+UsedComponents := executionengine mcjit native
 UsedOcamlInterfaces := llvm llvm_target
+FindlibPackages := ctypes
 
 include ../Makefile.ocaml

diff --git a/bindings/ocaml/executionengine/executionengine_ocaml.c b/bindings/ocaml/executionengine/executionengine_ocaml.c
index 4b44a91..0557efc 100644
--- a/bindings/ocaml/executionengine/executionengine_ocaml.c
+++ b/bindings/ocaml/executionengine/executionengine_ocaml.c

@@ -15,189 +15,48 @@
 |*                                                                            *|
 \*===----------------------------------------------------------------------===*/
 
+#include <string.h>
+#include <assert.h>
 #include "llvm-c/ExecutionEngine.h"
 #include "llvm-c/Target.h"
 #include "caml/alloc.h"
 #include "caml/custom.h"
 #include "caml/fail.h"
 #include "caml/memory.h"
-#include <string.h>
-#include <assert.h>
+#include "caml/callback.h"
 
-/* Force the LLVM interpreter and JIT to be linked in. */
-void llvm_initialize(void) {
-  LLVMLinkInInterpreter();
-  LLVMLinkInJIT();
-}
+void llvm_raise(value Prototype, char *Message);
 
 /* unit -> bool */
-CAMLprim value llvm_initialize_native_target(value Unit) {
-  return Val_bool(LLVMInitializeNativeTarget());
+CAMLprim value llvm_ee_initialize(value Unit) {
+  LLVMLinkInMCJIT();
+
+  return Val_bool(!LLVMInitializeNativeTarget() &&
+                  !LLVMInitializeNativeAsmParser() &&
+                  !LLVMInitializeNativeAsmPrinter());
 }
 
-/* Can't use the recommended caml_named_value mechanism for backwards
-   compatibility reasons. This is largely equivalent. */
-static value llvm_ee_error_exn;
-
-CAMLprim value llvm_register_ee_exns(value Error) {
-  llvm_ee_error_exn = Field(Error, 0);
-  register_global_root(&llvm_ee_error_exn);
-  return Val_unit;
-}
-
-static void llvm_raise(value Prototype, char *Message) {
-  CAMLparam1(Prototype);
-  CAMLlocal1(CamlMessage);
-  
-  CamlMessage = copy_string(Message);
-  LLVMDisposeMessage(Message);
-  
-  raise_with_arg(Prototype, CamlMessage);
-  abort(); /* NOTREACHED */
-#ifdef CAMLnoreturn
-  CAMLnoreturn; /* Silences warnings, but is missing in some versions. */
-#endif
-}
-
-
-/*--... Operations on generic values .......................................--*/
-
-#define Genericvalue_val(v)  (*(LLVMGenericValueRef *)(Data_custom_val(v)))
-
-static void llvm_finalize_generic_value(value GenVal) {
-  LLVMDisposeGenericValue(Genericvalue_val(GenVal));
-}
-
-static struct custom_operations generic_value_ops = {
-  (char *) "LLVMGenericValue",
-  llvm_finalize_generic_value,
-  custom_compare_default,
-  custom_hash_default,
-  custom_serialize_default,
-  custom_deserialize_default
-#ifdef custom_compare_ext_default
-  , custom_compare_ext_default
-#endif
-};
-
-static value alloc_generic_value(LLVMGenericValueRef Ref) {
-  value Val = alloc_custom(&generic_value_ops, sizeof(LLVMGenericValueRef), 0, 1);
-  Genericvalue_val(Val) = Ref;
-  return Val;
-}
-
-/* Llvm.lltype -> float -> t */
-CAMLprim value llvm_genericvalue_of_float(LLVMTypeRef Ty, value N) {
-  CAMLparam1(N);
-  CAMLreturn(alloc_generic_value(
-    LLVMCreateGenericValueOfFloat(Ty, Double_val(N))));
-}
-
-/* 'a -> t */
-CAMLprim value llvm_genericvalue_of_pointer(value V) {
-  CAMLparam1(V);
-  CAMLreturn(alloc_generic_value(LLVMCreateGenericValueOfPointer(Op_val(V))));
-}
-
-/* Llvm.lltype -> int -> t */
-CAMLprim value llvm_genericvalue_of_int(LLVMTypeRef Ty, value Int) {
-  return alloc_generic_value(LLVMCreateGenericValueOfInt(Ty, Int_val(Int), 1));
-}
-
-/* Llvm.lltype -> int32 -> t */
-CAMLprim value llvm_genericvalue_of_int32(LLVMTypeRef Ty, value Int32) {
-  CAMLparam1(Int32);
-  CAMLreturn(alloc_generic_value(
-    LLVMCreateGenericValueOfInt(Ty, Int32_val(Int32), 1)));
-}
-
-/* Llvm.lltype -> nativeint -> t */
-CAMLprim value llvm_genericvalue_of_nativeint(LLVMTypeRef Ty, value NatInt) {
-  CAMLparam1(NatInt);
-  CAMLreturn(alloc_generic_value(
-    LLVMCreateGenericValueOfInt(Ty, Nativeint_val(NatInt), 1)));
-}
-
-/* Llvm.lltype -> int64 -> t */
-CAMLprim value llvm_genericvalue_of_int64(LLVMTypeRef Ty, value Int64) {
-  CAMLparam1(Int64);
-  CAMLreturn(alloc_generic_value(
-    LLVMCreateGenericValueOfInt(Ty, Int64_val(Int64), 1)));
-}
-
-/* Llvm.lltype -> t -> float */
-CAMLprim value llvm_genericvalue_as_float(LLVMTypeRef Ty, value GenVal) {
-  CAMLparam1(GenVal);
-  CAMLreturn(copy_double(
-    LLVMGenericValueToFloat(Ty, Genericvalue_val(GenVal))));
-}
-
-/* t -> 'a */
-CAMLprim value llvm_genericvalue_as_pointer(value GenVal) {
-  return Val_op(LLVMGenericValueToPointer(Genericvalue_val(GenVal)));
-}
-
-/* t -> int */
-CAMLprim value llvm_genericvalue_as_int(value GenVal) {
-  assert(LLVMGenericValueIntWidth(Genericvalue_val(GenVal)) <= 8 * sizeof(value)
-         && "Generic value too wide to treat as an int!");
-  return Val_int(LLVMGenericValueToInt(Genericvalue_val(GenVal), 1));
-}
-
-/* t -> int32 */
-CAMLprim value llvm_genericvalue_as_int32(value GenVal) {
-  CAMLparam1(GenVal);
-  assert(LLVMGenericValueIntWidth(Genericvalue_val(GenVal)) <= 32
-         && "Generic value too wide to treat as an int32!");
-  CAMLreturn(copy_int32(LLVMGenericValueToInt(Genericvalue_val(GenVal), 1)));
-}
-
-/* t -> int64 */
-CAMLprim value llvm_genericvalue_as_int64(value GenVal) {
-  CAMLparam1(GenVal);
-  assert(LLVMGenericValueIntWidth(Genericvalue_val(GenVal)) <= 64
-         && "Generic value too wide to treat as an int64!");
-  CAMLreturn(copy_int64(LLVMGenericValueToInt(Genericvalue_val(GenVal), 1)));
-}
-
-/* t -> nativeint */
-CAMLprim value llvm_genericvalue_as_nativeint(value GenVal) {
-  CAMLparam1(GenVal);
-  assert(LLVMGenericValueIntWidth(Genericvalue_val(GenVal)) <= 8 * sizeof(value)
-         && "Generic value too wide to treat as a nativeint!");
-  CAMLreturn(copy_nativeint(LLVMGenericValueToInt(Genericvalue_val(GenVal),1)));
-}
-
-
-/*--... Operations on execution engines ....................................--*/
-
-/* llmodule -> ExecutionEngine.t */
-CAMLprim LLVMExecutionEngineRef llvm_ee_create(LLVMModuleRef M) {
-  LLVMExecutionEngineRef Interp;
+/* llcompileroptions option -> llmodule -> ExecutionEngine.t */
+CAMLprim LLVMExecutionEngineRef llvm_ee_create(value OptRecordOpt, LLVMModuleRef M) {
+  value OptRecord;
+  LLVMExecutionEngineRef MCJIT;
   char *Error;
-  if (LLVMCreateExecutionEngineForModule(&Interp, M, &Error))
-    llvm_raise(llvm_ee_error_exn, Error);
-  return Interp;
-}
+  struct LLVMMCJITCompilerOptions Options;
 
-/* llmodule -> ExecutionEngine.t */
-CAMLprim LLVMExecutionEngineRef
-llvm_ee_create_interpreter(LLVMModuleRef M) {
-  LLVMExecutionEngineRef Interp;
-  char *Error;
-  if (LLVMCreateInterpreterForModule(&Interp, M, &Error))
-    llvm_raise(llvm_ee_error_exn, Error);
-  return Interp;
-}
+  LLVMInitializeMCJITCompilerOptions(&Options, sizeof(Options));
+  if (OptRecordOpt != Val_int(0)) {
+    OptRecord = Field(OptRecordOpt, 0);
+    Options.OptLevel = Int_val(Field(OptRecord, 0));
+    Options.CodeModel = Int_val(Field(OptRecord, 1));
+    Options.NoFramePointerElim = Int_val(Field(OptRecord, 2));
+    Options.EnableFastISel = Int_val(Field(OptRecord, 3));
+    Options.MCJMM = NULL;
+  }
 
-/* llmodule -> int -> ExecutionEngine.t */
-CAMLprim LLVMExecutionEngineRef
-llvm_ee_create_jit(LLVMModuleRef M, value OptLevel) {
-  LLVMExecutionEngineRef JIT;
-  char *Error;
-  if (LLVMCreateJITCompilerForModule(&JIT, M, Int_val(OptLevel), &Error))
-    llvm_raise(llvm_ee_error_exn, Error);
-  return JIT;
+  if (LLVMCreateMCJITCompilerForModule(&MCJIT, M, &Options,
+                                      sizeof(Options), &Error))
+    llvm_raise(*caml_named_value("Llvm_executionengine.Error"), Error);
+  return MCJIT;
 }
 
 /* ExecutionEngine.t -> unit */
@@ -213,43 +72,12 @@
 }
 
 /* llmodule -> ExecutionEngine.t -> llmodule */
-CAMLprim LLVMModuleRef llvm_ee_remove_module(LLVMModuleRef M,
-                                             LLVMExecutionEngineRef EE) {
+CAMLprim value llvm_ee_remove_module(LLVMModuleRef M, LLVMExecutionEngineRef EE) {
   LLVMModuleRef RemovedModule;
   char *Error;
   if (LLVMRemoveModule(EE, M, &RemovedModule, &Error))
-    llvm_raise(llvm_ee_error_exn, Error);
-  return RemovedModule;
-}
-
-/* string -> ExecutionEngine.t -> llvalue option */
-CAMLprim value llvm_ee_find_function(value Name, LLVMExecutionEngineRef EE) {
-  CAMLparam1(Name);
-  CAMLlocal1(Option);
-  LLVMValueRef Found;
-  if (LLVMFindFunction(EE, String_val(Name), &Found))
-    CAMLreturn(Val_unit);
-  Option = alloc(1, 0);
-  Field(Option, 0) = Val_op(Found);
-  CAMLreturn(Option);
-}
-
-/* llvalue -> GenericValue.t array -> ExecutionEngine.t -> GenericValue.t */
-CAMLprim value llvm_ee_run_function(LLVMValueRef F, value Args,
-                                    LLVMExecutionEngineRef EE) {
-  unsigned NumArgs;
-  LLVMGenericValueRef Result, *GVArgs;
-  unsigned I;
-  
-  NumArgs = Wosize_val(Args);
-  GVArgs = (LLVMGenericValueRef*) malloc(NumArgs * sizeof(LLVMGenericValueRef));
-  for (I = 0; I != NumArgs; ++I)
-    GVArgs[I] = Genericvalue_val(Field(Args, I));
-  
-  Result = LLVMRunFunction(EE, F, NumArgs, GVArgs);
-  
-  free(GVArgs);
-  return alloc_generic_value(Result);
+    llvm_raise(*caml_named_value("Llvm_executionengine.Error"), Error);
+  return Val_unit;
 }
 
 /* ExecutionEngine.t -> unit */
@@ -264,78 +92,31 @@
   return Val_unit;
 }
 
-/* llvalue -> string array -> (string * string) array -> ExecutionEngine.t ->
-   int */
-CAMLprim value llvm_ee_run_function_as_main(LLVMValueRef F,
-                                            value Args, value Env,
-                                            LLVMExecutionEngineRef EE) {
-  CAMLparam2(Args, Env);
-  int I, NumArgs, NumEnv, EnvSize, Result;
-  const char **CArgs, **CEnv;
-  char *CEnvBuf, *Pos;
-  
-  NumArgs = Wosize_val(Args);
-  NumEnv = Wosize_val(Env);
-  
-  /* Build the environment. */
-  CArgs = (const char **) malloc(NumArgs * sizeof(char*));
-  for (I = 0; I != NumArgs; ++I)
-    CArgs[I] = String_val(Field(Args, I));
-  
-  /* Compute the size of the environment string buffer. */
-  for (I = 0, EnvSize = 0; I != NumEnv; ++I) {
-    EnvSize += strlen(String_val(Field(Field(Env, I), 0))) + 1;
-    EnvSize += strlen(String_val(Field(Field(Env, I), 1))) + 1;
-  }
-  
-  /* Build the environment. */
-  CEnv = (const char **) malloc((NumEnv + 1) * sizeof(char*));
-  CEnvBuf = (char*) malloc(EnvSize);
-  Pos = CEnvBuf;
-  for (I = 0; I != NumEnv; ++I) {
-    char *Name  = String_val(Field(Field(Env, I), 0)),
-         *Value = String_val(Field(Field(Env, I), 1));
-    int NameLen  = strlen(Name),
-        ValueLen = strlen(Value);
-    
-    CEnv[I] = Pos;
-    memcpy(Pos, Name, NameLen);
-    Pos += NameLen;
-    *Pos++ = '=';
-    memcpy(Pos, Value, ValueLen);
-    Pos += ValueLen;
-    *Pos++ = '\0';
-  }
-  CEnv[NumEnv] = NULL;
-  
-  Result = LLVMRunFunctionAsMain(EE, F, NumArgs, CArgs, CEnv);
-  
-  free(CArgs);
-  free(CEnv);
-  free(CEnvBuf);
-  
-  CAMLreturn(Val_int(Result));
-}
-
-/* llvalue -> ExecutionEngine.t -> unit */
-CAMLprim value llvm_ee_free_machine_code(LLVMValueRef F,
-                                         LLVMExecutionEngineRef EE) {
-  LLVMFreeMachineCodeForFunction(EE, F);
-  return Val_unit;
-}
-
 extern value llvm_alloc_data_layout(LLVMTargetDataRef TargetData);
 
 /* ExecutionEngine.t -> Llvm_target.DataLayout.t */
 CAMLprim value llvm_ee_get_data_layout(LLVMExecutionEngineRef EE) {
   value DataLayout;
   LLVMTargetDataRef OrigDataLayout;
-  OrigDataLayout = LLVMGetExecutionEngineTargetData(EE);
-
   char* TargetDataCStr;
+
+  OrigDataLayout = LLVMGetExecutionEngineTargetData(EE);
   TargetDataCStr = LLVMCopyStringRepOfTargetData(OrigDataLayout);
   DataLayout = llvm_alloc_data_layout(LLVMCreateTargetData(TargetDataCStr));
   LLVMDisposeMessage(TargetDataCStr);
 
   return DataLayout;
 }
+
+/* Llvm.llvalue -> int64 -> llexecutionengine -> unit */
+CAMLprim value llvm_ee_add_global_mapping(LLVMValueRef Global, value Ptr,
+                                          LLVMExecutionEngineRef EE) {
+  LLVMAddGlobalMapping(EE, Global, (void*) (Int64_val(Ptr)));
+  return Val_unit;
+}
+
+/* Llvm.llvalue -> llexecutionengine -> int64 */
+CAMLprim value llvm_ee_get_pointer_to_global(LLVMValueRef Global,
+                                             LLVMExecutionEngineRef EE) {
+  return caml_copy_int64((int64_t) LLVMGetPointerToGlobal(EE, Global));
+}

diff --git a/bindings/ocaml/executionengine/llvm_executionengine.ml b/bindings/ocaml/executionengine/llvm_executionengine.ml
index a738df7..c0ff330 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.ml
+++ b/bindings/ocaml/executionengine/llvm_executionengine.ml

@@ -1,4 +1,4 @@
-(*===-- llvm_executionengine.ml - LLVM OCaml Interface ----------*- C++ -*-===*
+(*===-- llvm_executionengine.ml - LLVM OCaml Interface --------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -7,105 +7,54 @@
  *
  *===----------------------------------------------------------------------===*)
 
-
 exception Error of string
 
-external register_exns: exn -> unit
-  = "llvm_register_ee_exns"
+let () = Callback.register_exception "Llvm_executionengine.Error" (Error "")
 
+external initialize : unit -> bool
+  = "llvm_ee_initialize"
 
-module GenericValue = struct
-  type t
-  
-  external of_float: Llvm.lltype -> float -> t
-    = "llvm_genericvalue_of_float"
-  external of_pointer: 'a -> t
-    = "llvm_genericvalue_of_pointer"
-  external of_int32: Llvm.lltype -> int32 -> t
-    = "llvm_genericvalue_of_int32"
-  external of_int: Llvm.lltype -> int -> t
-    = "llvm_genericvalue_of_int"
-  external of_nativeint: Llvm.lltype -> nativeint -> t
-    = "llvm_genericvalue_of_nativeint"
-  external of_int64: Llvm.lltype -> int64 -> t
-    = "llvm_genericvalue_of_int64"
-  
-  external as_float: Llvm.lltype -> t -> float
-    = "llvm_genericvalue_as_float"
-  external as_pointer: t -> 'a
-    = "llvm_genericvalue_as_pointer"
-  external as_int32: t -> int32
-    = "llvm_genericvalue_as_int32"
-  external as_int: t -> int
-    = "llvm_genericvalue_as_int"
-  external as_nativeint: t -> nativeint
-    = "llvm_genericvalue_as_nativeint"
-  external as_int64: t -> int64
-    = "llvm_genericvalue_as_int64"
-end
+type llexecutionengine
 
+type llcompileroptions = {
+  opt_level: int;
+  code_model: Llvm_target.CodeModel.t;
+  no_framepointer_elim: bool;
+  enable_fast_isel: bool;
+}
 
-module ExecutionEngine = struct
-  type t
-  
-  (* FIXME: Ocaml is not running this setup code unless we use 'val' in the
-            interface, which causes the emission of a stub for each function;
-            using 'external' in the module allows direct calls into 
-            ocaml_executionengine.c. This is hardly fatal, but it is unnecessary
-            overhead on top of the two stubs that are already invoked for each 
-            call into LLVM. *)
-  let _ = register_exns (Error "")
-  
-  external create: Llvm.llmodule -> t
-    = "llvm_ee_create"
-  external create_interpreter: Llvm.llmodule -> t
-    = "llvm_ee_create_interpreter"
-  external create_jit: Llvm.llmodule -> int -> t
-    = "llvm_ee_create_jit"
-  external dispose: t -> unit
-    = "llvm_ee_dispose"
-  external add_module: Llvm.llmodule -> t -> unit
-    = "llvm_ee_add_module"
-  external remove_module: Llvm.llmodule -> t -> Llvm.llmodule
-    = "llvm_ee_remove_module"
-  external find_function: string -> t -> Llvm.llvalue option
-    = "llvm_ee_find_function"
-  external run_function: Llvm.llvalue -> GenericValue.t array -> t ->
-                         GenericValue.t
-    = "llvm_ee_run_function"
-  external run_static_ctors: t -> unit
-    = "llvm_ee_run_static_ctors"
-  external run_static_dtors: t -> unit
-    = "llvm_ee_run_static_dtors"
-  external run_function_as_main: Llvm.llvalue -> string array ->
-                                 (string * string) array -> t -> int
-    = "llvm_ee_run_function_as_main"
-  external free_machine_code: Llvm.llvalue -> t -> unit
-    = "llvm_ee_free_machine_code"
+let default_compiler_options = {
+  opt_level = 0;
+  code_model = Llvm_target.CodeModel.JITDefault;
+  no_framepointer_elim = false;
+  enable_fast_isel = false }
 
-  external data_layout : t -> Llvm_target.DataLayout.t
-    = "llvm_ee_get_data_layout"
-  
-  (* The following are not bound. Patches are welcome.
-  
-  add_global_mapping: llvalue -> llgenericvalue -> t -> unit
-  clear_all_global_mappings: t -> unit
-  update_global_mapping: llvalue -> llgenericvalue -> t -> unit
-  get_pointer_to_global_if_available: llvalue -> t -> llgenericvalue
-  get_pointer_to_global: llvalue -> t -> llgenericvalue
-  get_pointer_to_function: llvalue -> t -> llgenericvalue
-  get_pointer_to_function_or_stub: llvalue -> t -> llgenericvalue
-  get_global_value_at_address: llgenericvalue -> t -> llvalue option
-  store_value_to_memory: llgenericvalue -> llgenericvalue -> lltype -> unit
-  initialize_memory: llvalue -> llgenericvalue -> t -> unit
-  recompile_and_relink_function: llvalue -> t -> llgenericvalue
-  get_or_emit_global_variable: llvalue -> t -> llgenericvalue
-  disable_lazy_compilation: t -> unit
-  lazy_compilation_enabled: t -> bool
-  install_lazy_function_creator: (string -> llgenericvalue) -> t -> unit
-  
-   *)
-end
+external create : ?options:llcompileroptions -> Llvm.llmodule -> llexecutionengine
+  = "llvm_ee_create"
+external dispose : llexecutionengine -> unit
+  = "llvm_ee_dispose"
+external add_module : Llvm.llmodule -> llexecutionengine -> unit
+  = "llvm_ee_add_module"
+external remove_module : Llvm.llmodule -> llexecutionengine -> unit
+  = "llvm_ee_remove_module"
+external run_static_ctors : llexecutionengine -> unit
+  = "llvm_ee_run_static_ctors"
+external run_static_dtors : llexecutionengine -> unit
+  = "llvm_ee_run_static_dtors"
+external data_layout : llexecutionengine -> Llvm_target.DataLayout.t
+  = "llvm_ee_get_data_layout"
+external add_global_mapping_ : Llvm.llvalue -> int64 -> llexecutionengine -> unit
+  = "llvm_ee_add_global_mapping"
+external get_pointer_to_global_ : Llvm.llvalue -> llexecutionengine -> int64
+  = "llvm_ee_get_pointer_to_global"
 
-external initialize_native_target : unit -> bool
-                                  = "llvm_initialize_native_target"
+let add_global_mapping llval ptr ee =
+  add_global_mapping_ llval (Ctypes.raw_address_of_ptr (Ctypes.to_voidp ptr)) ee
+
+let get_pointer_to_global llval typ ee =
+  Ctypes.coerce (let open Ctypes in ptr void) typ
+                (Ctypes.ptr_of_raw_address (get_pointer_to_global_ llval ee))
+
+(* The following are not bound. Patches are welcome.
+target_machine : llexecutionengine -> Llvm_target.TargetMachine.t
+ *)

diff --git a/bindings/ocaml/executionengine/llvm_executionengine.mli b/bindings/ocaml/executionengine/llvm_executionengine.mli
index 74a6062..b07151d 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.mli
+++ b/bindings/ocaml/executionengine/llvm_executionengine.mli

@@ -1,4 +1,4 @@
-(*===-- llvm_executionengine.mli - LLVM OCaml Interface ---------*- C++ -*-===*
+(*===-- llvm_executionengine.mli - LLVM OCaml Interface -------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -10,147 +10,75 @@
 (** JIT Interpreter.
 
     This interface provides an OCaml API for LLVM execution engine (JIT/
-    interpreter), the classes in the ExecutionEngine library. *)
+    interpreter), the classes in the [ExecutionEngine] library. *)
 
 exception Error of string
 
-module GenericValue: sig
-  (** [GenericValue.t] is a boxed union type used to portably pass arguments to
-      and receive values from the execution engine. It supports only a limited
-      selection of types; for more complex argument types, it is necessary to
-      generate a stub function by hand or to pass parameters by reference.
-      See the struct [llvm::GenericValue]. *)
-  type t
-  
-  (** [of_float fpty n] boxes the float [n] in a float-valued generic value
-      according to the floating point type [fpty]. See the fields
-      [llvm::GenericValue::DoubleVal] and [llvm::GenericValue::FloatVal]. *)
-  val of_float : Llvm.lltype -> float -> t
-  
-  (** [of_pointer v] boxes the pointer value [v] in a generic value. See the
-      field [llvm::GenericValue::PointerVal]. *)
-  val of_pointer : 'a -> t
-  
-  (** [of_int32 n w] boxes the int32 [i] in a generic value with the bitwidth
-      [w]. See the field [llvm::GenericValue::IntVal]. *)
-  val of_int32 : Llvm.lltype -> int32 -> t
-  
-  (** [of_int n w] boxes the int [i] in a generic value with the bitwidth
-      [w]. See the field [llvm::GenericValue::IntVal]. *)
-  val of_int : Llvm.lltype -> int -> t
-  
-  (** [of_natint n w] boxes the native int [i] in a generic value with the
-      bitwidth [w]. See the field [llvm::GenericValue::IntVal]. *)
-  val of_nativeint : Llvm.lltype -> nativeint -> t
+(** [initialize ()] initializes the backend corresponding to the host.
+    Returns [true] if initialization is successful; [false] indicates
+    that there is no such backend or it is unable to emit object code
+    via MCJIT. *)
+val initialize : unit -> bool
 
-  (** [of_int64 n w] boxes the int64 [i] in a generic value with the bitwidth
-      [w]. See the field [llvm::GenericValue::IntVal]. *)
-  val of_int64 : Llvm.lltype -> int64 -> t
+(** An execution engine is either a JIT compiler or an interpreter, capable of
+    directly loading an LLVM module and executing its functions without first
+    invoking a static compiler and generating a native executable. *)
+type llexecutionengine
 
-  (** [as_float fpty gv] unboxes the floating point-valued generic value [gv] of
-      floating point type [fpty]. See the fields [llvm::GenericValue::DoubleVal]
-      and [llvm::GenericValue::FloatVal]. *)
-  val as_float : Llvm.lltype -> t -> float
-  
-  (** [as_pointer gv] unboxes the pointer-valued generic value [gv]. See the
-      field [llvm::GenericValue::PointerVal]. *)
-  val as_pointer : t -> 'a
-  
-  (** [as_int32 gv] unboxes the integer-valued generic value [gv] as an [int32].
-      Is invalid if [gv] has a bitwidth greater than 32 bits. See the field
-      [llvm::GenericValue::IntVal]. *)
-  val as_int32 : t -> int32
-  
-  (** [as_int gv] unboxes the integer-valued generic value [gv] as an [int].
-      Is invalid if [gv] has a bitwidth greater than the host bit width (but the
-      most significant bit may be lost). See the field
-      [llvm::GenericValue::IntVal]. *)
-  val as_int : t -> int
-  
-  (** [as_natint gv] unboxes the integer-valued generic value [gv] as a
-      [nativeint]. Is invalid if [gv] has a bitwidth greater than
-      [nativeint]. See the field [llvm::GenericValue::IntVal]. *)
-  val as_nativeint : t -> nativeint
-  
-  (** [as_int64 gv] returns the integer-valued generic value [gv] as an [int64].
-      Is invalid if [gv] has a bitwidth greater than [int64]. See the field
-      [llvm::GenericValue::IntVal]. *)
-  val as_int64 : t -> int64
-end
+(** MCJIT compiler options. See [llvm::TargetOptions]. *)
+type llcompileroptions = {
+  opt_level: int;
+  code_model: Llvm_target.CodeModel.t;
+  no_framepointer_elim: bool;
+  enable_fast_isel: bool;
+}
 
+(** Default MCJIT compiler options:
+    [{ opt_level = 0; code_model = CodeModel.JIT_default;
+       no_framepointer_elim = false; enable_fast_isel = false }] *)
+val default_compiler_options : llcompileroptions
 
-module ExecutionEngine: sig
-  (** An execution engine is either a JIT compiler or an interpreter, capable of
-      directly loading an LLVM module and executing its functions without first
-      invoking a static compiler and generating a native executable. *)
-  type t
-  
-  (** [create m] creates a new execution engine, taking ownership of the
-      module [m] if successful. Creates a JIT if possible, else falls back to an
-      interpreter. Raises [Error msg] if an error occurrs. The execution engine
-      is not garbage collected and must be destroyed with [dispose ee].
-      See the function [llvm::EngineBuilder::create]. *)
-  val create : Llvm.llmodule -> t
-  
-  (** [create_interpreter m] creates a new interpreter, taking ownership of the
-      module [m] if successful. Raises [Error msg] if an error occurrs. The
-      execution engine is not garbage collected and must be destroyed with
-      [dispose ee].
-      See the function [llvm::EngineBuilder::create]. *)
-  val create_interpreter : Llvm.llmodule -> t
-  
-  (** [create_jit m optlevel] creates a new JIT (just-in-time compiler), taking
-      ownership of the module [m] if successful with the desired optimization
-      level [optlevel]. Raises [Error msg] if an error occurrs. The execution
-      engine is not garbage collected and must be destroyed with [dispose ee].
-      See the function [llvm::EngineBuilder::create]. *)
-  val create_jit : Llvm.llmodule -> int -> t
+(** [create ?options m] creates a new MCJIT just-in-time compiler, taking
+    ownership of the module [m] if successful, using the compiler options
+    [options]. Raises [Error msg] if an error occurs. The execution
+    engine is not garbage collected and must be destroyed with [dispose ee].
 
-  (** [dispose ee] releases the memory used by the execution engine and must be
-      invoked to avoid memory leaks. *)
-  val dispose : t -> unit
+    Run {!initialize} before using this function.
 
-  (** [add_module m ee] adds the module [m] to the execution engine [ee]. *)
-  val add_module : Llvm.llmodule -> t -> unit
-  
-  (** [remove_module m ee] removes the module [m] from the execution engine
-      [ee], disposing of [m] and the module referenced by [mp]. Raises
-      [Error msg] if an error occurs. *)
-  val remove_module : Llvm.llmodule -> t -> Llvm.llmodule
+    See the function [llvm::EngineBuilder::create]. *)
+val create : ?options:llcompileroptions -> Llvm.llmodule -> llexecutionengine
 
-  (** [find_function n ee] finds the function named [n] defined in any of the
-      modules owned by the execution engine [ee]. Returns [None] if the function
-      is not found and [Some f] otherwise. *)
-  val find_function : string -> t -> Llvm.llvalue option
-  
-  (** [run_function f args ee] synchronously executes the function [f] with the
-      arguments [args], which must be compatible with the parameter types. *)
-  val run_function : Llvm.llvalue -> GenericValue.t array -> t ->
-                     GenericValue.t
+(** [dispose ee] releases the memory used by the execution engine and must be
+    invoked to avoid memory leaks. *)
+val dispose : llexecutionengine -> unit
 
-  (** [run_static_ctors ee] executes the static constructors of each module in
-      the execution engine [ee]. *)
-  val run_static_ctors : t -> unit
-  
-  (** [run_static_dtors ee] executes the static destructors of each module in
-      the execution engine [ee]. *)
-  val run_static_dtors : t -> unit
-  
-  (** [run_function_as_main f args env ee] executes the function [f] as a main
-      function, passing it [argv] and [argc] according to the string array
-      [args], and [envp] as specified by the array [env]. Returns the integer
-      return value of the function. *)
-  val run_function_as_main : Llvm.llvalue -> string array ->
-                                  (string * string) array -> t -> int
+(** [add_module m ee] adds the module [m] to the execution engine [ee]. *)
+val add_module : Llvm.llmodule -> llexecutionengine -> unit
 
-  (** [free_machine_code f ee] releases the memory in the execution engine [ee]
-      used to store the machine code for the function [f]. *)
-  val free_machine_code : Llvm.llvalue -> t -> unit
+(** [remove_module m ee] removes the module [m] from the execution engine
+    [ee]. Raises [Error msg] if an error occurs. *)
+val remove_module : Llvm.llmodule -> llexecutionengine -> unit
 
-  (** [data_layout ee] is the data layout of the execution engine [ee]. *)
-  val data_layout : t -> Llvm_target.DataLayout.t
-end
+(** [run_static_ctors ee] executes the static constructors of each module in
+    the execution engine [ee]. *)
+val run_static_ctors : llexecutionengine -> unit
 
-(** [initialize_native_target ()] initializes the native target corresponding
-    to the host. Returns [true] if initialization is {b not} done. *)
-val initialize_native_target : unit -> bool
+(** [run_static_dtors ee] executes the static destructors of each module in
+    the execution engine [ee]. *)
+val run_static_dtors : llexecutionengine -> unit
+
+(** [data_layout ee] is the data layout of the execution engine [ee]. *)
+val data_layout : llexecutionengine -> Llvm_target.DataLayout.t
+
+(** [add_global_mapping gv ptr ee] tells the execution engine [ee] that
+    the global [gv] is at the specified location [ptr], which must outlive
+    [gv] and [ee].
+    All uses of [gv] in the compiled code will refer to [ptr]. *)
+val add_global_mapping : Llvm.llvalue -> 'a Ctypes.ptr -> llexecutionengine -> unit
+
+(** [get_pointer_to_global gv typ ee] returns the value of the global
+    variable [gv] in the execution engine [ee] as type [typ], which may
+    be a pointer type (e.g. [int ptr typ]) for global variables or
+    a function type (e.g. [(int -> int) typ]) for functions, and which
+    will be live as long as [gv] and [ee] are. *)
+val get_pointer_to_global : Llvm.llvalue -> 'a Ctypes.typ -> llexecutionengine -> 'a

diff --git a/bindings/ocaml/irreader/irreader_ocaml.c b/bindings/ocaml/irreader/irreader_ocaml.c
index 30c10c7..ce593db 100644
--- a/bindings/ocaml/irreader/irreader_ocaml.c
+++ b/bindings/ocaml/irreader/irreader_ocaml.c

@@ -16,33 +16,9 @@
 #include "caml/alloc.h"
 #include "caml/fail.h"
 #include "caml/memory.h"
+#include "caml/callback.h"
 
-/* Can't use the recommended caml_named_value mechanism for backwards
-   compatibility reasons. This is largely equivalent. */
-static value llvm_irreader_error_exn;
-
-CAMLprim value llvm_register_irreader_exns(value Error) {
-  llvm_irreader_error_exn = Field(Error, 0);
-  register_global_root(&llvm_irreader_error_exn);
-  return Val_unit;
-}
-
-static void llvm_raise(value Prototype, char *Message) {
-  CAMLparam1(Prototype);
-  CAMLlocal1(CamlMessage);
-
-  CamlMessage = copy_string(Message);
-  LLVMDisposeMessage(Message);
-
-  raise_with_arg(Prototype, CamlMessage);
-  abort(); /* NOTREACHED */
-#ifdef CAMLnoreturn
-  CAMLnoreturn; /* Silences warnings, but is missing in some versions. */
-#endif
-}
-
-
-/*===-- Modules -----------------------------------------------------------===*/
+void llvm_raise(value Prototype, char *Message);
 
 /* Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule */
 CAMLprim value llvm_parse_ir(LLVMContextRef C,
@@ -53,7 +29,7 @@
   char *Message;
 
   if (LLVMParseIRInContext(C, MemBuf, &M, &Message))
-    llvm_raise(llvm_irreader_error_exn, Message);
+    llvm_raise(*caml_named_value("Llvm_irreader.Error"), Message);
 
   CAMLreturn((value) M);
 }

diff --git a/bindings/ocaml/irreader/llvm_irreader.ml b/bindings/ocaml/irreader/llvm_irreader.ml
index 455b1fa..f757d62 100644
--- a/bindings/ocaml/irreader/llvm_irreader.ml
+++ b/bindings/ocaml/irreader/llvm_irreader.ml

@@ -10,8 +10,7 @@
 
 exception Error of string
 
-external register_exns : exn -> unit = "llvm_register_irreader_exns"
-let _ = register_exns (Error "")
+let _ = Callback.register_exception "Llvm_irreader.Error" (Error "")
 
 external parse_ir : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
                   = "llvm_parse_ir"

diff --git a/bindings/ocaml/linker/linker_ocaml.c b/bindings/ocaml/linker/linker_ocaml.c
index 2491e3b..ed37777 100644
--- a/bindings/ocaml/linker/linker_ocaml.c
+++ b/bindings/ocaml/linker/linker_ocaml.c

@@ -1,4 +1,4 @@
-/*===-- linker_ocaml.c - LLVM Ocaml Glue ------------------------*- C++ -*-===*\
+/*===-- linker_ocaml.c - LLVM OCaml Glue ------------------------*- C++ -*-===*\
 |*                                                                            *|
 |*                     The LLVM Compiler Infrastructure                       *|
 |*                                                                            *|
@@ -19,36 +19,16 @@
 #include "caml/alloc.h"
 #include "caml/memory.h"
 #include "caml/fail.h"
+#include "caml/callback.h"
 
-static value llvm_linker_error_exn;
+void llvm_raise(value Prototype, char *Message);
 
-CAMLprim value llvm_register_linker_exns(value Error) {
-  llvm_linker_error_exn = Field(Error, 0);
-  register_global_root(&llvm_linker_error_exn);
-  return Val_unit;
-}
-
-static void llvm_raise(value Prototype, char *Message) {
-  CAMLparam1(Prototype);
-  CAMLlocal1(CamlMessage);
-
-  CamlMessage = copy_string(Message);
-  LLVMDisposeMessage(Message);
-
-  raise_with_arg(Prototype, CamlMessage);
-  abort(); /* NOTREACHED */
-#ifdef CAMLnoreturn
-  CAMLnoreturn; /* Silences warnings, but is missing in some versions. */
-#endif
-}
-
-/* llmodule -> llmodule -> Mode.t -> unit
-   raises Error msg on error */
+/* llmodule -> llmodule -> Mode.t -> unit */
 CAMLprim value llvm_link_modules(LLVMModuleRef Dst, LLVMModuleRef Src, value Mode) {
   char* Message;
 
   if (LLVMLinkModules(Dst, Src, Int_val(Mode), &Message))
-    llvm_raise(llvm_linker_error_exn, Message);
+    llvm_raise(*caml_named_value("Llvm_linker.Error"), Message);
 
   return Val_unit;
 }

diff --git a/bindings/ocaml/linker/llvm_linker.ml b/bindings/ocaml/linker/llvm_linker.ml
index 2b73e2e..5854d70 100644
--- a/bindings/ocaml/linker/llvm_linker.ml
+++ b/bindings/ocaml/linker/llvm_linker.ml

@@ -9,8 +9,7 @@
 
 exception Error of string
 
-external register_exns : exn -> unit = "llvm_register_linker_exns"
-let _ = register_exns (Error "")
+let () = Callback.register_exception "Llvm_linker.Error" (Error "")
 
 module Mode = struct
   type t =
@@ -19,4 +18,4 @@
 end
 
 external link_modules : Llvm.llmodule -> Llvm.llmodule -> Mode.t -> unit
-                      = "llvm_link_modules"
\ No newline at end of file
+                      = "llvm_link_modules"

diff --git a/bindings/ocaml/llvm/META.llvm.in b/bindings/ocaml/llvm/META.llvm.in
index edb84e0..f9808c7 100644
--- a/bindings/ocaml/llvm/META.llvm.in
+++ b/bindings/ocaml/llvm/META.llvm.in

@@ -4,7 +4,6 @@
 archive(byte) = "llvm.cma"
 archive(native) = "llvm.cmxa"
 directory = "."
-linkopts = "-ccopt -lstdc++"
 
 package "analysis" (
     requires = "llvm"
@@ -31,7 +30,7 @@
 )
 
 package "executionengine" (
-    requires = "llvm,llvm.target"
+    requires = "llvm,llvm.target,ctypes.foreign"
     version = "@PACKAGE_VERSION@"
     description = "JIT and Interpreter for LLVM"
     archive(byte) = "llvm_executionengine.cma"

diff --git a/bindings/ocaml/llvm/Makefile b/bindings/ocaml/llvm/Makefile
index 850f564..fb682c7 100644
--- a/bindings/ocaml/llvm/Makefile
+++ b/bindings/ocaml/llvm/Makefile

@@ -1,20 +1,21 @@
 ##===- bindings/ocaml/llvm/Makefile ------------------------*- Makefile -*-===##
-# 
+#
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
-# 
+#
 ##===----------------------------------------------------------------------===##
-# 
+#
 # This is the makefile for the Objective Caml Llvm interface.
-# 
+#
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../../..
 LIBRARYNAME := llvm
-UsedComponents := core
+UsedComponents := core transformutils
 UsedOcamlLibs := llvm
+ExtraLibs := -lstdc++
 
 include ../Makefile.ocaml
 

diff --git a/bindings/ocaml/llvm/llvm.ml b/bindings/ocaml/llvm/llvm.ml
index 39875a5..0df4d40 100644
--- a/bindings/ocaml/llvm/llvm.ml
+++ b/bindings/ocaml/llvm/llvm.ml

@@ -1,4 +1,4 @@
-(*===-- llvm/llvm.ml - LLVM Ocaml Interface --------------------------------===*
+(*===-- llvm/llvm.ml - LLVM OCaml Interface -------------------------------===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -66,6 +66,13 @@
   | Protected
 end
 
+module DLLStorageClass = struct
+  type t =
+  | Default
+  | DLLImport
+  | DLLExport
+end
+
 module CallConv = struct
   let c = 0
   let fast = 8
@@ -278,8 +285,7 @@
 
 exception IoError of string
 
-external register_exns : exn -> unit = "llvm_register_core_exns"
-let _ = register_exns (IoError "")
+let () = Callback.register_exception "Llvm.IoError" (IoError "")
 
 external install_fatal_error_handler : (string -> unit) -> unit
                                      = "llvm_install_fatal_error_handler"
@@ -287,6 +293,8 @@
                                    = "llvm_reset_fatal_error_handler"
 external enable_pretty_stacktrace : unit -> unit
                                   = "llvm_enable_pretty_stacktrace"
+external parse_command_line_options : ?overview:string -> string array -> unit
+                                    = "llvm_parse_command_line_options"
 
 type ('a, 'b) llpos =
 | At_end of 'a
@@ -305,6 +313,7 @@
 (*===-- Modules -----------------------------------------------------------===*)
 external create_module : llcontext -> string -> llmodule = "llvm_create_module"
 external dispose_module : llmodule -> unit = "llvm_dispose_module"
+external clone_module : llmodule -> llmodule = "LLVMCloneModule"
 external target_triple: llmodule -> string
                       = "llvm_target_triple"
 external set_target_triple: string -> llmodule -> unit
@@ -428,6 +437,7 @@
 
 (*--... Operations on users ................................................--*)
 external operand : llvalue -> int -> llvalue = "llvm_operand"
+external operand_use : llvalue -> int -> lluse = "llvm_operand_use"
 external set_operand : llvalue -> int -> llvalue -> unit = "llvm_set_operand"
 external num_operands : llvalue -> int = "llvm_num_operands"
 
@@ -465,6 +475,8 @@
 external const_int_of_string : lltype -> string -> int -> llvalue
                              = "llvm_const_int_of_string"
 external const_float : lltype -> float -> llvalue = "llvm_const_float"
+external float_of_const : llvalue -> float option
+                        = "llvm_float_of_const"
 external const_float_of_string : lltype -> string -> llvalue
                                = "llvm_const_float_of_string"
 
@@ -479,6 +491,8 @@
 external const_packed_struct : llcontext -> llvalue array -> llvalue
                              = "llvm_const_packed_struct"
 external const_vector : llvalue array -> llvalue = "llvm_const_vector"
+external string_of_const : llvalue -> string option = "llvm_string_of_const"
+external const_element : llvalue -> int -> llvalue = "llvm_const_element"
 
 (*--... Constant expressions ...............................................--*)
 external align_of : lltype -> llvalue = "LLVMAlignOf"
@@ -569,6 +583,8 @@
 external set_section : string -> llvalue -> unit = "llvm_set_section"
 external visibility : llvalue -> Visibility.t = "llvm_visibility"
 external set_visibility : Visibility.t -> llvalue -> unit = "llvm_set_visibility"
+external dll_storage_class : llvalue -> DLLStorageClass.t = "llvm_dll_storage_class"
+external set_dll_storage_class : DLLStorageClass.t -> llvalue -> unit = "llvm_set_dll_storage_class"
 external alignment : llvalue -> int = "llvm_alignment"
 external set_alignment : int -> llvalue -> unit = "llvm_set_alignment"
 external is_global_constant : llvalue -> bool = "llvm_is_global_constant"
@@ -952,6 +968,8 @@
 
 external instr_opcode : llvalue -> Opcode.t = "llvm_instr_get_opcode"
 external icmp_predicate : llvalue -> Icmp.t option = "llvm_instr_icmp_predicate"
+external fcmp_predicate : llvalue -> Fcmp.t option = "llvm_instr_fcmp_predicate"
+external instr_clone : llvalue -> llvalue = "llvm_instr_clone"
 
 let rec iter_instrs_range f i e =
   if i = e then () else
@@ -1019,6 +1037,63 @@
 external is_volatile : llvalue -> bool = "llvm_is_volatile"
 external set_volatile : bool -> llvalue -> unit = "llvm_set_volatile"
 
+(*--... Operations on terminators ..........................................--*)
+
+let is_terminator llv =
+  let open ValueKind in
+  let open Opcode in
+  match classify_value llv with
+    | Instruction (Br | IndirectBr | Invoke | Resume | Ret | Switch | Unreachable)
+      -> true
+    | _ -> false
+
+external successor : llvalue -> int -> llbasicblock = "llvm_successor"
+external set_successor : llvalue -> int -> llbasicblock -> unit
+                       = "llvm_set_successor"
+external num_successors : llvalue -> int = "llvm_num_successors"
+
+let successors llv =
+  if not (is_terminator llv) then
+    raise (Invalid_argument "Llvm.successors can only be used on terminators")
+  else
+    Array.init (num_successors llv) (successor llv)
+
+let iter_successors f llv =
+  if not (is_terminator llv) then
+    raise (Invalid_argument "Llvm.iter_successors can only be used on terminators")
+  else
+    for i = 0 to num_successors llv - 1 do
+      f (successor llv i)
+    done
+
+let fold_successors f llv z =
+  if not (is_terminator llv) then
+    raise (Invalid_argument "Llvm.fold_successors can only be used on terminators")
+  else
+    let n = num_successors llv in
+    let rec aux i acc =
+      if i >= n then acc
+      else begin
+        let llb = successor llv i in
+        aux (i+1) (f llb acc)
+      end
+    in aux 0 z
+
+
+(*--... Operations on branches .............................................--*)
+external condition : llvalue -> llvalue = "llvm_condition"
+external set_condition : llvalue -> llvalue -> unit
+                       = "llvm_set_condition"
+external is_conditional : llvalue -> bool = "llvm_is_conditional"
+
+let get_branch llv =
+  if classify_value llv <> ValueKind.Instruction Opcode.Br then
+    None
+  else if is_conditional llv then
+    Some (`Conditional (condition llv, successor llv 0, successor llv 1))
+  else
+    Some (`Unconditional (successor llv 0))
+
 (*--... Operations on phi nodes ............................................--*)
 external add_incoming : (llvalue * llbasicblock) -> llvalue -> unit
                       = "llvm_add_incoming"

diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli
index f5f5b53..e5e90c3 100644
--- a/bindings/ocaml/llvm/llvm.mli
+++ b/bindings/ocaml/llvm/llvm.mli

@@ -105,6 +105,15 @@
   | Protected
 end
 
+(** The DLL storage class of a global value, accessed with {!dll_storage_class} and
+    {!set_dll_storage_class}. See [llvm::GlobalValue::DLLStorageClassTypes]. *)
+module DLLStorageClass : sig
+  type t =
+  | Default
+  | DLLImport
+  | DLLExport
+end
+
 (** The following calling convention values may be accessed with
     {!function_call_conv} and {!set_function_call_conv}. Calling
     conventions are open-ended. *)
@@ -157,16 +166,16 @@
     See the [llvm::ICmpInst::Predicate] enumeration. *)
 module Icmp : sig
   type t =
-  | Eq  (* Equal *)
-  | Ne  (* Not equal *)
-  | Ugt (* Unsigned greater than *)
-  | Uge (* Unsigned greater or equal *)
-  | Ult (* Unsigned less than *)
-  | Ule (* Unsigned less or equal *)
-  | Sgt (* Signed greater than *)
-  | Sge (* Signed greater or equal *)
-  | Slt (* Signed less than *)
-  | Sle (* Signed less or equal *)
+  | Eq  (** Equal *)
+  | Ne  (** Not equal *)
+  | Ugt (** Unsigned greater than *)
+  | Uge (** Unsigned greater or equal *)
+  | Ult (** Unsigned less than *)
+  | Ule (** Unsigned less or equal *)
+  | Sgt (** Signed greater than *)
+  | Sge (** Signed greater or equal *)
+  | Slt (** Signed less than *)
+  | Sle (** Signed less or equal *)
 end
 
 (** The predicate for a floating-point comparison ([fcmp]) instruction.
@@ -175,38 +184,38 @@
     See the [llvm::FCmpInst::Predicate] enumeration. *)
 module Fcmp : sig
   type t =
-  | False (* Always false *)
-  | Oeq   (* Ordered and equal *)
-  | Ogt   (* Ordered and greater than *)
-  | Oge   (* Ordered and greater or equal *)
-  | Olt   (* Ordered and less than *)
-  | Ole   (* Ordered and less or equal *)
-  | One   (* Ordered and not equal *)
-  | Ord   (* Ordered (no operand is NaN) *)
-  | Uno   (* Unordered (one operand at least is NaN) *)
-  | Ueq   (* Unordered and equal *)
-  | Ugt   (* Unordered and greater than *)
-  | Uge   (* Unordered and greater or equal *)
-  | Ult   (* Unordered and less than *)
-  | Ule   (* Unordered and less or equal *)
-  | Une   (* Unordered and not equal *)
-  | True  (* Always true *)
+  | False (** Always false *)
+  | Oeq   (** Ordered and equal *)
+  | Ogt   (** Ordered and greater than *)
+  | Oge   (** Ordered and greater or equal *)
+  | Olt   (** Ordered and less than *)
+  | Ole   (** Ordered and less or equal *)
+  | One   (** Ordered and not equal *)
+  | Ord   (** Ordered (no operand is NaN) *)
+  | Uno   (** Unordered (one operand at least is NaN) *)
+  | Ueq   (** Unordered and equal *)
+  | Ugt   (** Unordered and greater than *)
+  | Uge   (** Unordered and greater or equal *)
+  | Ult   (** Unordered and less than *)
+  | Ule   (** Unordered and less or equal *)
+  | Une   (** Unordered and not equal *)
+  | True  (** Always true *)
 end
 
 (** The opcodes for LLVM instructions and constant expressions. *)
 module Opcode : sig
   type t =
-  | Invalid (* not an instruction *)
-  (* Terminator Instructions *)
-  | Ret
+  | Invalid (** Not an instruction *)
+
+  | Ret (** Terminator Instructions *)
   | Br
   | Switch
   | IndirectBr
   | Invoke
   | Invalid2
   | Unreachable
-  (* Standard Binary Operators *)
-  | Add
+
+  | Add (** Standard Binary Operators *)
   | FAdd
   | Sub
   | FSub
@@ -218,20 +227,20 @@
   | URem
   | SRem
   | FRem
-  (* Logical Operators *)
-  | Shl
+
+  | Shl (** Logical Operators *)
   | LShr
   | AShr
   | And
   | Or
   | Xor
-  (* Memory Operators *)
-  | Alloca
+
+  | Alloca (** Memory Operators *)
   | Load
   | Store
   | GetElementPtr
-  (* Cast Operators *)
-  | Trunc
+
+  | Trunc (** Cast Operators *)
   | ZExt
   | SExt
   | FPToUI
@@ -243,8 +252,8 @@
   | PtrToInt
   | IntToPtr
   | BitCast
-  (* Other Operators *)
-  | ICmp
+
+  | ICmp (** Other Operators *)
   | FCmp
   | PHI
   | Call
@@ -291,7 +300,7 @@
   | NotAtomic
   | Unordered
   | Monotonic
-  | Invalid (* removed due to API changes *)
+  | Invalid (** removed due to API changes *)
   | Acquire
   | Release
   | AcqiureRelease
@@ -381,6 +390,14 @@
 (** [reset_fatal_error_handler ()] resets LLVM's fatal error handler. *)
 val reset_fatal_error_handler : unit -> unit
 
+(** [parse_command_line_options ?overview args] parses [args] using
+    the LLVM command line parser. Note that the only stable thing about this
+    function is its signature; you cannot rely on any particular set of command
+    line arguments being interpreted the same way across LLVM versions.
+
+    See the function [llvm::cl::ParseCommandLineOptions()]. *)
+val parse_command_line_options : ?overview:string -> string array -> unit
+
 (** {6 Contexts} *)
 
 (** [create_context ()] creates a context for storing the "global" state in
@@ -414,6 +431,9 @@
     [llvm::Module::~Module]. *)
 val dispose_module : llmodule -> unit
 
+(** [clone_module m] returns an exact copy of module [m]. *)
+val clone_module : llmodule -> llmodule
+
 (** [target_triple m] is the target specifier for the module [m], something like
     [i686-apple-darwin8]. See the method [llvm::Module::getTargetTriple]. *)
 val target_triple: llmodule -> string
@@ -651,7 +671,7 @@
 val type_by_name : llmodule -> string -> lltype option
 
 
-(* {6 Values} *)
+(** {6 Values} *)
 
 (** [type_of v] returns the type of the value [v].
     See the method [llvm::Value::getType]. *)
@@ -682,7 +702,7 @@
 val replace_all_uses_with : llvalue -> llvalue -> unit
 
 
-(* {6 Uses} *)
+(** {6 Uses} *)
 
 (** [use_begin v] returns the first position in the use list for the value [v].
     [use_begin] and [use_succ] can e used to iterate over the use list in order.
@@ -714,12 +734,17 @@
 val fold_right_uses : (lluse -> 'a -> 'a) -> llvalue -> 'a -> 'a
 
 
-(* {6 Users} *)
+(** {6 Users} *)
 
 (** [operand v i] returns the operand at index [i] for the value [v]. See the
     method [llvm::User::getOperand]. *)
 val operand : llvalue -> int -> llvalue
 
+(** [operand_use v i] returns the use of the operand at index [i] for the value [v]. See the
+    method [llvm::User::getOperandUse]. *)
+val operand_use : llvalue -> int -> lluse
+
+
 (** [set_operand v i o] sets the operand of the value [v] at the index [i] to
     the value [o].
     See the method [llvm::User::setOperand]. *)
@@ -837,15 +862,19 @@
     value [n]. See the method [llvm::ConstantFP::get]. *)
 val const_float : lltype -> float -> llvalue
 
+(** [float_of_const c] returns the float value of the constant float [c].
+    [None] is returned if [c] is not a float constant.
+    See the method [llvm::ConstantFP::getDoubleValue]. *)
+val float_of_const : llvalue -> float option
+
 (** [const_float_of_string ty s] returns the floating point constant of type
     [ty] and value [n]. See the method [llvm::ConstantFP::get]. *)
 val const_float_of_string : lltype -> string -> llvalue
 
-
 (** {7 Operations on composite constants} *)
 
 (** [const_string c s] returns the constant [i8] array with the values of the
-    characters in the string [s] in the context [c]. The array is not 
+    characters in the string [s] in the context [c]. The array is not
     null-terminated (but see {!const_stringz}). This value can in turn be used
     as the initializer for a global variable. See the method
     [llvm::ConstantArray::get]. *)
@@ -887,6 +916,14 @@
     values [elts]. See the method [llvm::ConstantVector::get]. *)
 val const_vector : llvalue array -> llvalue
 
+(** [string_of_const c] returns [Some str] if [c] is a string constant,
+    or [None] if this is not a string constant. *)
+val string_of_const : llvalue -> string option
+
+(** [const_element c i] returns the element at index [i] of the constant [c].
+    See the method [llvm::ConstantDataSequential::getElementAsConstant]. *)
+val const_element : llvalue -> int -> llvalue
+
 
 (** {7 Constant expressions} *)
 
@@ -1234,6 +1271,14 @@
     [v]. See the method [llvm::GlobalValue::setVisibility]. *)
 val set_visibility : Visibility.t -> llvalue -> unit
 
+(** [dll_storage_class g] returns the DLL storage class of the global value [g].
+    See the method [llvm::GlobalValue::getDLLStorageClass]. *)
+val dll_storage_class : llvalue -> DLLStorageClass.t
+
+(** [set_dll_storage_class v g] sets the DLL storage class of the global value [g] to
+    [v]. See the method [llvm::GlobalValue::setDLLStorageClass]. *)
+val set_dll_storage_class : DLLStorageClass.t -> llvalue -> unit
+
 (** [alignment g] returns the required alignment of the global value [g].
     See the method [llvm::GlobalValue::getAlignment]. *)
 val alignment : llvalue -> int
@@ -1687,6 +1732,15 @@
     instruction [i]. *)
 val icmp_predicate : llvalue -> Icmp.t option
 
+(** [fcmp_predicate i] returns the [Fcmp.t] corresponding to an [fcmp]
+    instruction [i]. *)
+val fcmp_predicate : llvalue -> Fcmp.t option
+
+(** [instr_clone i] returns a copy of instruction [i].
+    The copy has no parent and no name.
+    See the method [llvm::Instruction::clone]. *)
+val instr_clone : llvalue -> llvalue
+
 
 (** {7 Operations on call sites} *)
 
@@ -1741,6 +1795,52 @@
     [llvm::StoreInst::setVolatile]. *)
 val set_volatile : bool -> llvalue -> unit
 
+(** {7 Operations on terminators} *)
+
+(** [is_terminator v] returns true if the instruction [v] is a terminator. *)
+val is_terminator : llvalue -> bool
+
+(** [successor v i] returns the successor at index [i] for the value [v].
+    See the method [llvm::TerminatorInst::getSuccessor]. *)
+val successor : llvalue -> int -> llbasicblock
+
+(** [set_successor v i o] sets the successor of the value [v] at the index [i] to
+    the value [o].
+    See the method [llvm::TerminatorInst::setSuccessor]. *)
+val set_successor : llvalue -> int -> llbasicblock -> unit
+
+(** [num_successors v] returns the number of successors for the value [v].
+    See the method [llvm::TerminatorInst::getNumSuccessors]. *)
+val num_successors : llvalue -> int
+
+(** [successors v] returns the successors of [v]. *)
+val successors : llvalue -> llbasicblock array
+
+(** [iter_successors f v] applies function [f] to each successor of [v] in order. Tail recursive. *)
+val iter_successors : (llbasicblock -> unit) -> llvalue -> unit
+
+(** [fold_successors f v init] is [f (... (f init vN) ...) v1] where [v1,...,vN] are the successors of [v]. Tail recursive. *)
+val fold_successors : (llbasicblock -> 'a -> 'a) -> llvalue -> 'a -> 'a
+
+(** {7 Operations on branches} *)
+
+(** [is_conditional v] returns true if the branch instruction [v] is conditional.
+    See the method [llvm::BranchInst::isConditional]. *)
+val is_conditional : llvalue -> bool
+
+(** [condition v] returns the condition of the branch instruction [v].
+    See the method [llvm::BranchInst::getCondition]. *)
+val condition : llvalue -> llvalue
+
+(** [set_condition v c] sets the condition of the branch instruction [v] to the value [c].
+    See the method [llvm::BranchInst::setCondition]. *)
+val set_condition : llvalue -> llvalue -> unit
+
+(** [get_branch c] returns a description of the branch instruction [c]. *)
+val get_branch : llvalue ->
+  [ `Conditional of llvalue * llbasicblock * llbasicblock
+  | `Unconditional of llbasicblock ]
+    option
 
 (** {7 Operations on phi nodes} *)
 
@@ -2402,7 +2502,7 @@
       path [p]. If the file could not be read, then [IoError msg] is
       raised. *)
   val of_file : string -> llmemorybuffer
-  
+
   (** [of_stdin ()] is the memory buffer containing the contents of standard input.
       If standard input is empty, then [IoError msg] is raised. *)
   val of_stdin : unit -> llmemorybuffer
@@ -2413,7 +2513,7 @@
 
   (** [as_string mb] is the string containing the contents of memory buffer [mb]. *)
   val as_string : llmemorybuffer -> string
-  
+
   (** Disposes of a memory buffer. *)
   val dispose : llmemorybuffer -> unit
 end
@@ -2425,13 +2525,13 @@
   (**  *)
   type 'a t
   type any = [ `Module | `Function ]
-  
+
   (** [PassManager.create ()] constructs a new whole-module pass pipeline. This
       type of pipeline is suitable for link-time optimization and whole-module
       transformations.
       See the constructor of [llvm::PassManager]. *)
   val create : unit -> [ `Module ] t
-  
+
   (** [PassManager.create_function m] constructs a new function-by-function
       pass pipeline over the module [m]. It does not take ownership of [m].
       This type of pipeline is suitable for code generation and JIT compilation
@@ -2450,19 +2550,19 @@
       the module, [false] otherwise.
       See the [llvm::FunctionPassManager::doInitialization] method. *)
   val initialize : [ `Function ] t -> bool
-  
+
   (** [run_function f fpm] executes all of the function passes scheduled in the
       function pass manager [fpm] over the function [f]. Returns [true] if any
       of the passes modified [f], [false] otherwise.
       See the [llvm::FunctionPassManager::run] method. *)
   val run_function : llvalue -> [ `Function ] t -> bool
-  
+
   (** [finalize fpm] finalizes all of the function passes scheduled in in the
       function pass manager [fpm]. Returns [true] if any of the passes
       modified the module, [false] otherwise.
       See the [llvm::FunctionPassManager::doFinalization] method. *)
   val finalize : [ `Function ] t -> bool
-  
+
   (** Frees the memory of a pass pipeline. For function pipelines, does not free
       the module.
       See the destructor of [llvm::BasePassManager]. *)

diff --git a/bindings/ocaml/llvm/llvm_ocaml.c b/bindings/ocaml/llvm/llvm_ocaml.c
index d5ebdcd..63c235d 100644
--- a/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/bindings/ocaml/llvm/llvm_ocaml.c

@@ -15,46 +15,33 @@
 |*                                                                            *|
 \*===----------------------------------------------------------------------===*/
 
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
 #include "llvm-c/Core.h"
 #include "caml/alloc.h"
 #include "caml/custom.h"
 #include "caml/memory.h"
 #include "caml/fail.h"
 #include "caml/callback.h"
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
 
+value llvm_string_of_message(char* Message) {
+  value String = caml_copy_string(Message);
+  LLVMDisposeMessage(Message);
 
-/* Can't use the recommended caml_named_value mechanism for backwards
-   compatibility reasons. This is largely equivalent. */
-static value llvm_ioerror_exn;
-
-CAMLprim value llvm_register_core_exns(value IoError) {
-  llvm_ioerror_exn = Field(IoError, 0);
-  register_global_root(&llvm_ioerror_exn);
-
-  return Val_unit;
+  return String;
 }
 
-static void llvm_raise(value Prototype, char *Message) {
+void llvm_raise(value Prototype, char *Message) {
   CAMLparam1(Prototype);
-  CAMLlocal1(CamlMessage);
-  
-  CamlMessage = copy_string(Message);
-  LLVMDisposeMessage(Message);
-  
-  raise_with_arg(Prototype, CamlMessage);
-  abort(); /* NOTREACHED */
-#ifdef CAMLnoreturn
-  CAMLnoreturn; /* Silences warnings, but is missing in some versions. */
-#endif
+  caml_raise_with_arg(Prototype, llvm_string_of_message(Message));
+  CAMLnoreturn;
 }
 
 static value llvm_fatal_error_handler;
 
 static void llvm_fatal_error_trampoline(const char *Reason) {
-  callback(llvm_fatal_error_handler, copy_string(Reason));
+  callback(llvm_fatal_error_handler, caml_copy_string(Reason));
 }
 
 CAMLprim value llvm_install_fatal_error_handler(value Handler) {
@@ -75,6 +62,17 @@
   return Val_unit;
 }
 
+CAMLprim value llvm_parse_command_line_options(value Overview, value Args) {
+  char *COverview;
+  if (Overview == Val_int(0)) {
+    COverview = NULL;
+  } else {
+    COverview = String_val(Field(Overview, 0));
+  }
+  LLVMParseCommandLineOptions(Wosize_val(Args), (const char* const*) Op_val(Args), COverview);
+  return Val_unit;
+}
+
 static value alloc_variant(int tag, void *Value) {
   value Iter = alloc_small(1, tag);
   Field(Iter, 0) = Val_op(Value);
@@ -157,7 +155,7 @@
 
 /* llmodule -> string */
 CAMLprim value llvm_target_triple(LLVMModuleRef M) {
-  return copy_string(LLVMGetTarget(M));
+  return caml_copy_string(LLVMGetTarget(M));
 }
 
 /* string -> llmodule -> unit */
@@ -168,7 +166,7 @@
 
 /* llmodule -> string */
 CAMLprim value llvm_data_layout(LLVMModuleRef M) {
-  return copy_string(LLVMGetDataLayout(M));
+  return caml_copy_string(LLVMGetDataLayout(M));
 }
 
 /* string -> llmodule -> unit */
@@ -186,22 +184,24 @@
 /* string -> llmodule -> unit */
 CAMLprim value llvm_print_module(value Filename, LLVMModuleRef M) {
   char* Message;
-  if(LLVMPrintModuleToFile(M, String_val(Filename), &Message)) {
-    llvm_raise(llvm_ioerror_exn, Message);
-  }
+
+  if(LLVMPrintModuleToFile(M, String_val(Filename), &Message))
+    llvm_raise(*caml_named_value("Llvm.IoError"), Message);
 
   return Val_unit;
 }
 
 /* llmodule -> string */
 CAMLprim value llvm_string_of_llmodule(LLVMModuleRef M) {
+  CAMLparam0();
+  CAMLlocal1(ModuleStr);
   char* ModuleCStr;
-  ModuleCStr = LLVMPrintModuleToString(M);
 
-  value ModuleStr = caml_copy_string(ModuleCStr);
+  ModuleCStr = LLVMPrintModuleToString(M);
+  ModuleStr = caml_copy_string(ModuleCStr);
   LLVMDisposeMessage(ModuleCStr);
 
-  return ModuleStr;
+  CAMLreturn(ModuleStr);
 }
 
 /* llmodule -> string -> unit */
@@ -234,13 +234,15 @@
 
 /* lltype -> string */
 CAMLprim value llvm_string_of_lltype(LLVMTypeRef M) {
+  CAMLparam0();
+  CAMLlocal1(TypeStr);
   char* TypeCStr;
-  TypeCStr = LLVMPrintTypeToString(M);
 
-  value TypeStr = caml_copy_string(TypeCStr);
+  TypeCStr = LLVMPrintTypeToString(M);
+  TypeStr = caml_copy_string(TypeCStr);
   LLVMDisposeMessage(TypeCStr);
 
-  return TypeStr;
+  CAMLreturn(TypeStr);
 }
 
 /*--... Operations on integer types ........................................--*/
@@ -537,7 +539,7 @@
 
 /* llvalue -> string */
 CAMLprim value llvm_value_name(LLVMValueRef Val) {
-  return copy_string(LLVMGetValueName(Val));
+  return caml_copy_string(LLVMGetValueName(Val));
 }
 
 /* string -> llvalue -> unit */
@@ -554,13 +556,15 @@
 
 /* llvalue -> string */
 CAMLprim value llvm_string_of_llvalue(LLVMValueRef M) {
+  CAMLparam0();
+  CAMLlocal1(ValueStr);
   char* ValueCStr;
-  ValueCStr = LLVMPrintValueToString(M);
 
-  value ValueStr = caml_copy_string(ValueCStr);
+  ValueCStr = LLVMPrintValueToString(M);
+  ValueStr = caml_copy_string(ValueCStr);
   LLVMDisposeMessage(ValueCStr);
 
-  return ValueStr;
+  CAMLreturn(ValueStr);
 }
 
 /* llvalue -> llvalue -> unit */
@@ -577,6 +581,11 @@
   return LLVMGetOperand(V, Int_val(I));
 }
 
+/* llvalue -> int -> lluse */
+CAMLprim LLVMUseRef llvm_operand_use(LLVMValueRef V, value I) {
+  return LLVMGetOperandUse(V, Int_val(I));
+}
+
 /* llvalue -> int -> llvalue -> unit */
 CAMLprim value llvm_set_operand(LLVMValueRef U, value I, LLVMValueRef V) {
   LLVMSetOperand(U, Int_val(I), V);
@@ -695,7 +704,7 @@
 
 /* lltype -> int -> llvalue */
 CAMLprim LLVMValueRef llvm_const_int(LLVMTypeRef IntTy, value N) {
-  return LLVMConstInt(IntTy, (long long) Int_val(N), 1);
+  return LLVMConstInt(IntTy, (long long) Long_val(N), 1);
 }
 
 /* lltype -> Int64.t -> bool -> llvalue */
@@ -729,6 +738,28 @@
   return LLVMConstReal(RealTy, Double_val(N));
 }
 
+
+/* llvalue -> float option */
+CAMLprim value llvm_float_of_const(LLVMValueRef Const)
+{
+  CAMLparam0();
+  CAMLlocal1(Option);
+  LLVMBool LosesInfo;
+  double Result;
+
+  if (LLVMIsAConstantFP(Const)) {
+    Result = LLVMConstRealGetDouble(Const, &LosesInfo);
+    if (LosesInfo)
+        CAMLreturn(Val_int(0));
+
+    Option = alloc(1, 0);
+    Field(Option, 0) = caml_copy_double(Result);
+    CAMLreturn(Option);
+  }
+
+  CAMLreturn(Val_int(0));
+}
+
 /* lltype -> string -> llvalue */
 CAMLprim LLVMValueRef llvm_const_float_of_string(LLVMTypeRef RealTy, value S) {
   return LLVMConstRealOfStringAndSize(RealTy, String_val(S),
@@ -782,6 +813,31 @@
                          Wosize_val(ElementVals));
 }
 
+/* llvalue -> string option */
+CAMLprim value llvm_string_of_const(LLVMValueRef Const) {
+  const char *S;
+  size_t Len;
+  CAMLparam0();
+  CAMLlocal2(Option, Str);
+
+  if(LLVMIsAConstantDataSequential(Const) && LLVMIsConstantString(Const)) {
+    S = LLVMGetAsString(Const, &Len);
+    Str = caml_alloc_string(Len);
+    memcpy(String_val(Str), S, Len);
+
+    Option = alloc(1, 0);
+    Field(Option, 0) = Str;
+    CAMLreturn(Option);
+  } else {
+    CAMLreturn(Val_int(0));
+  }
+}
+
+/* llvalue -> int -> llvalue */
+CAMLprim LLVMValueRef llvm_const_element(LLVMValueRef Const, value N) {
+  return LLVMGetElementAsConstant(Const, Int_val(N));
+}
+
 /*--... Constant expressions ...............................................--*/
 
 /* Icmp.t -> llvalue -> llvalue -> llvalue */
@@ -881,7 +937,7 @@
 
 /* llvalue -> string */
 CAMLprim value llvm_section(LLVMValueRef Global) {
-  return copy_string(LLVMGetSection(Global));
+  return caml_copy_string(LLVMGetSection(Global));
 }
 
 /* string -> llvalue -> unit */
@@ -901,6 +957,17 @@
   return Val_unit;
 }
 
+/* llvalue -> DLLStorageClass.t */
+CAMLprim value llvm_dll_storage_class(LLVMValueRef Global) {
+  return Val_int(LLVMGetDLLStorageClass(Global));
+}
+
+/* DLLStorageClass.t -> llvalue -> unit */
+CAMLprim value llvm_set_dll_storage_class(value Viz, LLVMValueRef Global) {
+  LLVMSetDLLStorageClass(Global, Int_val(Viz));
+  return Val_unit;
+}
+
 /* llvalue -> int */
 CAMLprim value llvm_alignment(LLVMValueRef Global) {
   return Val_int(LLVMGetAlignment(Global));
@@ -1151,10 +1218,10 @@
   const char *GC;
   CAMLparam0();
   CAMLlocal2(Name, Option);
-  
+
   if ((GC = LLVMGetGC(Fn))) {
-    Name = copy_string(GC);
-    
+    Name = caml_copy_string(GC);
+
     Option = alloc(1, 0);
     Field(Option, 0) = Name;
     CAMLreturn(Option);
@@ -1328,6 +1395,25 @@
   CAMLreturn(Val_int(0));
 }
 
+/* llvalue -> Fcmp.t option */
+CAMLprim value llvm_instr_fcmp_predicate(LLVMValueRef Val) {
+  CAMLparam0();
+  int x = LLVMGetFCmpPredicate(Val);
+  if (x) {
+    value Option = alloc(1, 0);
+    Field(Option, 0) = Val_int(x - LLVMRealPredicateFalse);
+    CAMLreturn(Option);
+  }
+  CAMLreturn(Val_int(0));
+}
+
+/* llvalue -> llvalue */
+CAMLprim LLVMValueRef llvm_instr_clone(LLVMValueRef Inst) {
+  if (!LLVMIsAInstruction(Inst))
+      failwith("Not an instruction");
+  return LLVMInstructionClone(Inst);
+}
+
 
 /*--... Operations on call sites ...........................................--*/
 
@@ -1386,6 +1472,43 @@
   return Val_unit;
 }
 
+
+/*--.. Operations on terminators ...........................................--*/
+
+/* llvalue -> int -> llbasicblock */
+CAMLprim LLVMBasicBlockRef llvm_successor(LLVMValueRef V, value I) {
+  return LLVMGetSuccessor(V, Int_val(I));
+}
+
+/* llvalue -> int -> llbasicblock -> unit */
+CAMLprim value llvm_set_successor(LLVMValueRef U, value I, LLVMBasicBlockRef B) {
+  LLVMSetSuccessor(U, Int_val(I), B);
+  return Val_unit;
+}
+
+/* llvalue -> int */
+CAMLprim value llvm_num_successors(LLVMValueRef V) {
+  return Val_int(LLVMGetNumSuccessors(V));
+}
+
+/*--.. Operations on branch ................................................--*/
+
+/* llvalue -> llvalue */
+CAMLprim LLVMValueRef llvm_condition(LLVMValueRef V) {
+  return LLVMGetCondition(V);
+}
+
+/* llvalue -> llvalue -> unit */
+CAMLprim value llvm_set_condition(LLVMValueRef B, LLVMValueRef C) {
+  LLVMSetCondition(B, C);
+  return Val_unit;
+}
+
+/* llvalue -> bool */
+CAMLprim value llvm_is_conditional(LLVMValueRef V) {
+  return Val_bool(LLVMIsConditional(V));
+}
+
 /*--... Operations on phi nodes ............................................--*/
 
 /* (llvalue * llbasicblock) -> llvalue -> unit */
@@ -1402,20 +1525,20 @@
   unsigned I;
   CAMLparam0();
   CAMLlocal3(Hd, Tl, Tmp);
-  
+
   /* Build a tuple list of them. */
   Tl = Val_int(0);
   for (I = LLVMCountIncoming(PhiNode); I != 0; ) {
     Hd = alloc(2, 0);
     Store_field(Hd, 0, (value) LLVMGetIncomingValue(PhiNode, --I));
     Store_field(Hd, 1, (value) LLVMGetIncomingBlock(PhiNode, I));
-    
+
     Tmp = alloc(2, 0);
     Store_field(Tmp, 0, Hd);
     Store_field(Tmp, 1, Tl);
     Tl = Tmp;
   }
-  
+
   CAMLreturn(Tl);
 }
 
@@ -1434,15 +1557,13 @@
 }
 
 static struct custom_operations builder_ops = {
-  (char *) "LLVMIRBuilder",
+  (char *) "Llvm.llbuilder",
   llvm_finalize_builder,
   custom_compare_default,
   custom_hash_default,
   custom_serialize_default,
-  custom_deserialize_default
-#ifdef custom_compare_ext_default
-  , custom_compare_ext_default
-#endif
+  custom_deserialize_default,
+  custom_compare_ext_default
 };
 
 static value alloc_builder(LLVMBuilderRef B) {
@@ -1472,7 +1593,7 @@
 CAMLprim LLVMBasicBlockRef llvm_insertion_block(value B) {
   LLVMBasicBlockRef InsertBlock = LLVMGetInsertBlock(Builder_val(B));
   if (!InsertBlock)
-    raise_not_found();
+    caml_raise_not_found();
   return InsertBlock;
 }
 
@@ -2048,9 +2169,9 @@
 CAMLprim LLVMValueRef llvm_build_phi(value Incoming, value Name, value B) {
   value Hd, Tl;
   LLVMValueRef FirstValue, PhiNode;
-  
+
   assert(Incoming != Val_int(0) && "Empty list passed to Llvm.build_phi!");
-  
+
   Hd = Field(Incoming, 0);
   FirstValue = (LLVMValueRef) Field(Hd, 0);
   PhiNode = LLVMBuildPhi(Builder_val(B), LLVMTypeOf(FirstValue),
@@ -2061,7 +2182,7 @@
     LLVMAddIncoming(PhiNode, (LLVMValueRef*) &Field(Hd, 0),
                     (LLVMBasicBlockRef*) &Field(Hd, 1), 1);
   }
-  
+
   return PhiNode;
 }
 
@@ -2097,7 +2218,7 @@
                                                LLVMValueRef Element,
                                                LLVMValueRef Idx,
                                                value Name, value B) {
-  return LLVMBuildInsertElement(Builder_val(B), Vec, Element, Idx, 
+  return LLVMBuildInsertElement(Builder_val(B), Vec, Element, Idx,
                                 String_val(Name));
 }
 
@@ -2149,11 +2270,11 @@
   CAMLparam1(Path);
   char *Message;
   LLVMMemoryBufferRef MemBuf;
-  
+
   if (LLVMCreateMemoryBufferWithContentsOfFile(String_val(Path),
                                                &MemBuf, &Message))
-    llvm_raise(llvm_ioerror_exn, Message);
-  
+    llvm_raise(*caml_named_value("Llvm.IoError"), Message);
+
   CAMLreturn((value) MemBuf);
 }
 
@@ -2162,22 +2283,23 @@
 CAMLprim LLVMMemoryBufferRef llvm_memorybuffer_of_stdin(value Unit) {
   char *Message;
   LLVMMemoryBufferRef MemBuf;
-  
+
   if (LLVMCreateMemoryBufferWithSTDIN(&MemBuf, &Message))
-    llvm_raise(llvm_ioerror_exn, Message);
-  
+    llvm_raise(*caml_named_value("Llvm.IoError"), Message);
+
   return MemBuf;
 }
 
 /* ?name:string -> string -> llmemorybuffer */
 CAMLprim LLVMMemoryBufferRef llvm_memorybuffer_of_string(value Name, value String) {
+  LLVMMemoryBufferRef MemBuf;
   const char *NameCStr;
+
   if(Name == Val_int(0))
     NameCStr = "";
   else
     NameCStr = String_val(Field(Name, 0));
 
-  LLVMMemoryBufferRef MemBuf;
   MemBuf = LLVMCreateMemoryBufferWithMemoryRangeCopy(
                 String_val(String), caml_string_length(String), NameCStr);
 

diff --git a/bindings/ocaml/target/llvm_target.ml b/bindings/ocaml/target/llvm_target.ml
index 974bd49..bd7388e 100644
--- a/bindings/ocaml/target/llvm_target.ml
+++ b/bindings/ocaml/target/llvm_target.ml

@@ -47,8 +47,7 @@
 
 exception Error of string
 
-external register_exns : exn -> unit = "llvm_register_target_exns"
-let _ = register_exns (Error "")
+let () = Callback.register_exception "Llvm_target.Error" (Error "")
 
 module DataLayout = struct
   type t
@@ -127,6 +126,8 @@
                     = "llvm_targetmachine_features"
   external data_layout : t -> DataLayout.t
                        = "llvm_targetmachine_data_layout"
+  external add_analysis_passes : [< Llvm.PassManager.any ] Llvm.PassManager.t -> t -> unit
+                               = "llvm_targetmachine_add_analysis_passes"
   external set_verbose_asm : bool -> t -> unit
                            = "llvm_targetmachine_set_verbose_asm"
   external emit_to_file : Llvm.llmodule -> CodeGenFileType.t -> string ->

diff --git a/bindings/ocaml/target/llvm_target.mli b/bindings/ocaml/target/llvm_target.mli
index 4f5e717..676bc61 100644
--- a/bindings/ocaml/target/llvm_target.mli
+++ b/bindings/ocaml/target/llvm_target.mli

@@ -67,7 +67,7 @@
       See the method [llvm::DataLayout::getStringRepresentation]. *)
   val as_string : t -> string
 
-  (** [add_to_pass_manager dl pm] adds the target data [dl] to
+  (** [add_to_pass_manager pm dl] adds the data layout [dl] to
       the pass manager [pm].
       See the method [llvm::PassManagerBase::add]. *)
   val add_to_pass_manager : [<Llvm.PassManager.any] Llvm.PassManager.t ->
@@ -207,6 +207,10 @@
   (** Returns the data layout of this target machine. *)
   val data_layout : t -> DataLayout.t
 
+  (** Adds the target-specific analysis passes to the pass manager.
+      See [llvm::TargetMachine::addAnalysisPasses]. *)
+  val add_analysis_passes : [< Llvm.PassManager.any ] Llvm.PassManager.t -> t -> unit
+
   (** Sets the assembly verbosity of this target machine.
       See [llvm::TargetMachine::setAsmVerbosity]. *)
   val set_verbose_asm : bool -> t -> unit

diff --git a/bindings/ocaml/target/target_ocaml.c b/bindings/ocaml/target/target_ocaml.c
index 74e8185..8f77cb4 100644
--- a/bindings/ocaml/target/target_ocaml.c
+++ b/bindings/ocaml/target/target_ocaml.c

@@ -21,37 +21,10 @@
 #include "caml/fail.h"
 #include "caml/memory.h"
 #include "caml/custom.h"
+#include "caml/callback.h"
 
-/*===---- Exceptions ------------------------------------------------------===*/
-
-static value llvm_target_error_exn;
-
-CAMLprim value llvm_register_target_exns(value Error) {
-  llvm_target_error_exn = Field(Error, 0);
-  register_global_root(&llvm_target_error_exn);
-  return Val_unit;
-}
-
-static void llvm_raise(value Prototype, char *Message) {
-  CAMLparam1(Prototype);
-  CAMLlocal1(CamlMessage);
-
-  CamlMessage = copy_string(Message);
-  LLVMDisposeMessage(Message);
-
-  raise_with_arg(Prototype, CamlMessage);
-  abort(); /* NOTREACHED */
-#ifdef CAMLnoreturn
-  CAMLnoreturn; /* Silences warnings, but is missing in some versions. */
-#endif
-}
-
-static value llvm_string_of_message(char* Message) {
-  value String = caml_copy_string(Message);
-  LLVMDisposeMessage(Message);
-
-  return String;
-}
+void llvm_raise(value Prototype, char *Message);
+value llvm_string_of_message(char* Message);
 
 /*===---- Data Layout -----------------------------------------------------===*/
 
@@ -62,15 +35,13 @@
 }
 
 static struct custom_operations llvm_data_layout_ops = {
-  (char *) "LLVMDataLayout",
+  (char *) "Llvm_target.DataLayout.t",
   llvm_finalize_data_layout,
   custom_compare_default,
   custom_hash_default,
   custom_serialize_default,
-  custom_deserialize_default
-#ifdef custom_compare_ext_default
-  , custom_compare_ext_default
-#endif
+  custom_deserialize_default,
+  custom_compare_ext_default
 };
 
 value llvm_alloc_data_layout(LLVMTargetDataRef DataLayout) {
@@ -219,7 +190,7 @@
   char *Error;
 
   if(LLVMGetTargetFromTriple(String_val(Triple), &T, &Error))
-    llvm_raise(llvm_target_error_exn, Error);
+    llvm_raise(*caml_named_value("Llvm_target.Error"), Error);
 
   return T;
 }
@@ -258,15 +229,13 @@
 }
 
 static struct custom_operations llvm_target_machine_ops = {
-  (char *) "LLVMTargetMachine",
+  (char *) "Llvm_target.TargetMachine.t",
   llvm_finalize_target_machine,
   custom_compare_default,
   custom_hash_default,
   custom_serialize_default,
-  custom_deserialize_default
-#ifdef custom_compare_ext_default
-  , custom_compare_ext_default
-#endif
+  custom_deserialize_default,
+  custom_compare_ext_default
 };
 
 static value llvm_alloc_targetmachine(LLVMTargetMachineRef Machine) {
@@ -337,6 +306,7 @@
 CAMLprim value llvm_targetmachine_data_layout(value Machine) {
   CAMLparam1(Machine);
   CAMLlocal1(DataLayout);
+  char *TargetDataCStr;
 
   /* LLVMGetTargetMachineData returns a pointer owned by the TargetMachine,
      so it is impossible to wrap it with llvm_alloc_target_data, which assumes
@@ -344,7 +314,6 @@
   LLVMTargetDataRef OrigDataLayout;
   OrigDataLayout = LLVMGetTargetMachineData(TargetMachine_val(Machine));
 
-  char* TargetDataCStr;
   TargetDataCStr = LLVMCopyStringRepOfTargetData(OrigDataLayout);
   DataLayout = llvm_alloc_data_layout(LLVMCreateTargetData(TargetDataCStr));
   LLVMDisposeMessage(TargetDataCStr);
@@ -361,12 +330,12 @@
 /* Llvm.llmodule -> CodeGenFileType.t -> string -> TargetMachine.t -> unit */
 CAMLprim value llvm_targetmachine_emit_to_file(LLVMModuleRef Module,
                             value FileType, value FileName, value Machine) {
-  char* ErrorMessage;
+  char *ErrorMessage;
 
   if(LLVMTargetMachineEmitToFile(TargetMachine_val(Machine), Module,
                                  String_val(FileName), Int_val(FileType),
                                  &ErrorMessage)) {
-    llvm_raise(llvm_target_error_exn, ErrorMessage);
+    llvm_raise(*caml_named_value("Llvm_target.Error"), ErrorMessage);
   }
 
   return Val_unit;
@@ -377,14 +346,21 @@
 CAMLprim LLVMMemoryBufferRef llvm_targetmachine_emit_to_memory_buffer(
                                 LLVMModuleRef Module, value FileType,
                                 value Machine) {
-  char* ErrorMessage;
+  char *ErrorMessage;
   LLVMMemoryBufferRef Buffer;
 
   if(LLVMTargetMachineEmitToMemoryBuffer(TargetMachine_val(Machine), Module,
                                          Int_val(FileType), &ErrorMessage,
                                          &Buffer)) {
-    llvm_raise(llvm_target_error_exn, ErrorMessage);
+    llvm_raise(*caml_named_value("Llvm_target.Error"), ErrorMessage);
   }
 
   return Buffer;
 }
+
+/* Llvm.PassManager.t -> TargetMachine.t -> unit */
+CAMLprim value llvm_targetmachine_add_analysis_passes(LLVMPassManagerRef PM,
+                                                      value Machine) {
+  LLVMAddAnalysisPasses(TargetMachine_val(Machine), PM);
+  return Val_unit;
+}

diff --git a/bindings/ocaml/transforms/Makefile b/bindings/ocaml/transforms/Makefile
index 92c8396..f3637a6 100644
--- a/bindings/ocaml/transforms/Makefile
+++ b/bindings/ocaml/transforms/Makefile

@@ -8,7 +8,7 @@
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../../..
-DIRS = scalar ipo vectorize passmgr_builder
+DIRS = scalar_opts ipo vectorize passmgr_builder
 
 ocamldoc:
 	$(Verb) for i in $(DIRS) ; do \

diff --git a/bindings/ocaml/transforms/ipo/Makefile b/bindings/ocaml/transforms/ipo/Makefile
index ed67a7c..f54bc4e 100644
--- a/bindings/ocaml/transforms/ipo/Makefile
+++ b/bindings/ocaml/transforms/ipo/Makefile

@@ -1,4 +1,4 @@
-##===- bindings/ocaml/transforms/scalar/Makefile -----------*- Makefile -*-===##
+##===- bindings/ocaml/transforms/ipo/Makefile --------------*- Makefile -*-===##
 #
 #                     The LLVM Compiler Infrastructure
 #
@@ -7,7 +7,7 @@
 #
 ##===----------------------------------------------------------------------===##
 #
-# This is the makefile for the Objective Caml Llvm_scalar_opts interface.
+# This is the makefile for the Objective Caml Llvm_ipo interface.
 #
 ##===----------------------------------------------------------------------===##
 

diff --git a/bindings/ocaml/transforms/ipo/ipo_ocaml.c b/bindings/ocaml/transforms/ipo/ipo_ocaml.c
index 4ad8afb..9d8fb1e 100644
--- a/bindings/ocaml/transforms/ipo/ipo_ocaml.c
+++ b/bindings/ocaml/transforms/ipo/ipo_ocaml.c

@@ -56,12 +56,6 @@
 }
 
 /* [`Module] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_always_inliner_pass(LLVMPassManagerRef PM) {
-  LLVMAddAlwaysInlinerPass(PM);
-  return Val_unit;
-}
-
-/* [`Module] Llvm.PassManager.t -> unit */
 CAMLprim value llvm_add_global_dce(LLVMPassManagerRef PM) {
   LLVMAddGlobalDCEPass(PM);
   return Val_unit;
@@ -74,7 +68,7 @@
 }
 
 /* [`Module] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_ipc_propagation(LLVMPassManagerRef PM) {
+CAMLprim value llvm_add_ip_constant_propagation(LLVMPassManagerRef PM) {
   LLVMAddIPConstantPropagationPass(PM);
   return Val_unit;
 }
@@ -91,7 +85,7 @@
   return Val_unit;
 }
 
-/* [`Module] Llvm.PassManager.t -> bool -> unit */
+/* [`Module] Llvm.PassManager.t -> all_but_main:bool -> unit */
 CAMLprim value llvm_add_internalize(LLVMPassManagerRef PM, value AllButMain) {
   LLVMAddInternalizePass(PM, Bool_val(AllButMain));
   return Val_unit;

diff --git a/bindings/ocaml/transforms/ipo/llvm_ipo.ml b/bindings/ocaml/transforms/ipo/llvm_ipo.ml
index 93f564a..1af7d67 100644
--- a/bindings/ocaml/transforms/ipo/llvm_ipo.ml
+++ b/bindings/ocaml/transforms/ipo/llvm_ipo.ml

@@ -7,31 +7,45 @@
  *
  *===----------------------------------------------------------------------===*)
 
-external add_argument_promotion : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_argument_promotion"
-external add_constant_merge : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_constant_merge"
-external add_dead_arg_elimination :
-  [ | `Module ] Llvm.PassManager.t -> unit = "llvm_add_dead_arg_elimination"
-external add_function_attrs : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_function_attrs"
-external add_function_inlining : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_function_inlining"
-external add_always_inliner : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_always_inliner"
-external add_global_dce : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_global_dce"
-external add_global_optimizer : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_global_optimizer"
-external add_ipc_propagation : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_ipc_propagation"
-external add_prune_eh : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_prune_eh"
-external add_ipsccp : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_ipsccp"
-external add_internalize : [ | `Module ] Llvm.PassManager.t -> bool -> unit =
-  "llvm_add_internalize"
-external add_strip_dead_prototypes :
-  [ | `Module ] Llvm.PassManager.t -> unit = "llvm_add_strip_dead_prototypes"
-external add_strip_symbols : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_strip_symbols"
+external add_argument_promotion
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_argument_promotion"
+external add_constant_merge
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_constant_merge"
+external add_dead_arg_elimination
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_dead_arg_elimination"
+external add_function_attrs
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_function_attrs"
+external add_function_inlining
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_function_inlining"
+external add_always_inliner
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_always_inliner"
+external add_global_dce
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_global_dce"
+external add_global_optimizer
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_global_optimizer"
+external add_ipc_propagation
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_ip_constant_propagation"
+external add_prune_eh
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_prune_eh"
+external add_ipsccp
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_ipsccp"
+external add_internalize
+  : [ `Module ] Llvm.PassManager.t -> all_but_main:bool -> unit
+  = "llvm_add_internalize"
+external add_strip_dead_prototypes
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_strip_dead_prototypes"
+external add_strip_symbols
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_strip_symbols"

diff --git a/bindings/ocaml/transforms/ipo/llvm_ipo.mli b/bindings/ocaml/transforms/ipo/llvm_ipo.mli
index 1944c30..09a4860 100644
--- a/bindings/ocaml/transforms/ipo/llvm_ipo.mli
+++ b/bindings/ocaml/transforms/ipo/llvm_ipo.mli

@@ -12,58 +12,72 @@
     This interface provides an OCaml API for LLVM interprocedural optimizations, the
     classes in the [LLVMIPO] library. *)
 
-(** See llvm::createAddArgumentPromotionPass *)
-external add_argument_promotion : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_argument_promotion"
+(** See the [llvm::createArgumentPromotionPass] function. *)
+external add_argument_promotion
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_argument_promotion"
 
-(** See llvm::createConstantMergePass function. *)
-external add_constant_merge : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_constant_merge"
+(** See the [llvm::createConstantMergePass] function. *)
+external add_constant_merge
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_constant_merge"
 
-(**  See llvm::createDeadArgEliminationPass function. *)
-external add_dead_arg_elimination :
-  [ | `Module ] Llvm.PassManager.t -> unit = "llvm_add_dead_arg_elimination"
+(** See the [llvm::createDeadArgEliminationPass] function. *)
+external add_dead_arg_elimination
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_dead_arg_elimination"
 
-(**  See llvm::createFunctionAttrsPass function. *)
-external add_function_attrs : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_function_attrs"
+(** See the [llvm::createFunctionAttrsPass] function. *)
+external add_function_attrs
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_function_attrs"
 
-(**  See llvm::createFunctionInliningPass function. *)
-external add_function_inlining : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_function_inlining"
+(** See the [llvm::createFunctionInliningPass] function. *)
+external add_function_inlining
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_function_inlining"
 
-(**  See llvm::createAlwaysInlinerPass function. *)
-external add_always_inliner : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_always_inliner"
+(** See the [llvm::createAlwaysInlinerPass] function. *)
+external add_always_inliner
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_always_inliner"
 
-(**  See llvm::createGlobalDCEPass function. *)
-external add_global_dce : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_global_dce"
+(** See the [llvm::createGlobalDCEPass] function. *)
+external add_global_dce
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_global_dce"
 
-(**  See llvm::createGlobalOptimizerPass function. *)
-external add_global_optimizer : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_global_optimizer"
+(** See the [llvm::createGlobalOptimizerPass] function. *)
+external add_global_optimizer
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_global_optimizer"
 
-(**  See llvm::createIPConstantPropagationPass function. *)
-external add_ipc_propagation : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_ipc_propagation"
+(** See the [llvm::createIPConstantPropagationPass] function. *)
+external add_ipc_propagation
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_ip_constant_propagation"
 
-(**  See llvm::createPruneEHPass function. *)
-external add_prune_eh : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_prune_eh"
+(** See the [llvm::createPruneEHPass] function. *)
+external add_prune_eh
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_prune_eh"
 
-(**  See llvm::createIPSCCPPass function. *)
-external add_ipsccp : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_ipsccp"
+(** See the [llvm::createIPSCCPPass] function. *)
+external add_ipsccp
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_ipsccp"
 
-(**  See llvm::createInternalizePass function. *)
-external add_internalize : [ | `Module ] Llvm.PassManager.t -> bool -> unit =
-  "llvm_add_internalize"
+(** See the [llvm::createInternalizePass] function. *)
+external add_internalize
+  : [ `Module ] Llvm.PassManager.t -> all_but_main:bool -> unit
+  = "llvm_add_internalize"
 
-(**  See llvm::createStripDeadPrototypesPass function. *)
-external add_strip_dead_prototypes :
-  [ | `Module ] Llvm.PassManager.t -> unit = "llvm_add_strip_dead_prototypes"
+(** See the [llvm::createStripDeadPrototypesPass] function. *)
+external add_strip_dead_prototypes
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_strip_dead_prototypes"
 
-(**  See llvm::createStripSymbolsPass function. *)
-external add_strip_symbols : [ | `Module ] Llvm.PassManager.t -> unit =
-  "llvm_add_strip_symbols"
+(** See the [llvm::createStripSymbolsPass] function. *)
+external add_strip_symbols
+  : [ `Module ] Llvm.PassManager.t -> unit
+  = "llvm_add_strip_symbols"

diff --git a/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.mli b/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.mli
index 66b0981..ce162b1 100644
--- a/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.mli
+++ b/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.mli

@@ -14,41 +14,41 @@
 
 type t
 
-(** See [llvm::PassManagerBuilder]. *)
+(** See the [llvm::PassManagerBuilder] class. *)
 external create : unit -> t
   = "llvm_pmbuilder_create"
 
-(** See [llvm::PassManagerBuilder::OptLevel]. *)
+(** See the [llvm::PassManagerBuilder::OptLevel] field. *)
 external set_opt_level : int -> t -> unit
   = "llvm_pmbuilder_set_opt_level"
 
-(** See [llvm::PassManagerBuilder::SizeLevel]. *)
+(** See the [llvm::PassManagerBuilder::SizeLevel] field. *)
 external set_size_level : int -> t -> unit
   = "llvm_pmbuilder_set_size_level"
 
-(** See [llvm::PassManagerBuilder::DisableUnitAtATime]. *)
+(** See the [llvm::PassManagerBuilder::DisableUnitAtATime] field. *)
 external set_disable_unit_at_a_time : bool -> t -> unit
   = "llvm_pmbuilder_set_disable_unit_at_a_time"
 
-(** See [llvm::PassManagerBuilder::DisableUnrollLoops]. *)
+(** See the [llvm::PassManagerBuilder::DisableUnrollLoops] field. *)
 external set_disable_unroll_loops : bool -> t -> unit
   = "llvm_pmbuilder_set_disable_unroll_loops"
 
-(** See [llvm::PassManagerBuilder::Inliner]. *)
+(** See the [llvm::PassManagerBuilder::Inliner] field. *)
 external use_inliner_with_threshold : int -> t -> unit
   = "llvm_pmbuilder_use_inliner_with_threshold"
 
-(** See [llvm::PassManagerBuilder::populateFunctionPassManager]. *)
+(** See the [llvm::PassManagerBuilder::populateFunctionPassManager] function. *)
 external populate_function_pass_manager
   : [ `Function ] Llvm.PassManager.t -> t -> unit
   = "llvm_pmbuilder_populate_function_pass_manager"
 
-(** See [llvm::PassManagerBuilder::populateModulePassManager]. *)
+(** See the [llvm::PassManagerBuilder::populateModulePassManager] function. *)
 external populate_module_pass_manager
   : [ `Module ] Llvm.PassManager.t -> t -> unit
   = "llvm_pmbuilder_populate_module_pass_manager"
 
-(** See [llvm::PassManagerBuilder::populateLTOPassManager]. *)
+(** See the [llvm::PassManagerBuilder::populateLTOPassManager] function. *)
 external populate_lto_pass_manager
   : [ `Module ] Llvm.PassManager.t -> internalize:bool -> run_inliner:bool -> t -> unit
-  = "llvm_pmbuilder_populate_lto_pass_manager"
\ No newline at end of file
+  = "llvm_pmbuilder_populate_lto_pass_manager"

diff --git a/bindings/ocaml/transforms/passmgr_builder/passmgr_builder_ocaml.c b/bindings/ocaml/transforms/passmgr_builder/passmgr_builder_ocaml.c
index a707856..a43863c 100644
--- a/bindings/ocaml/transforms/passmgr_builder/passmgr_builder_ocaml.c
+++ b/bindings/ocaml/transforms/passmgr_builder/passmgr_builder_ocaml.c

@@ -27,15 +27,13 @@
 }
 
 static struct custom_operations pmbuilder_ops = {
-  (char *) "LLVMPassManagerBuilder",
+  (char *) "Llvm_passmgr_builder.t",
   llvm_finalize_pmbuilder,
   custom_compare_default,
   custom_hash_default,
   custom_serialize_default,
-  custom_deserialize_default
-#ifdef custom_compare_ext_default
-  , custom_compare_ext_default
-#endif
+  custom_deserialize_default,
+  custom_compare_ext_default
 };
 
 static value alloc_pmbuilder(LLVMPassManagerBuilderRef Ref) {

diff --git a/bindings/ocaml/transforms/scalar/Makefile b/bindings/ocaml/transforms/scalar/Makefile
deleted file mode 100644
index 6e250f6..0000000
--- a/bindings/ocaml/transforms/scalar/Makefile
+++ /dev/null

@@ -1,19 +0,0 @@
-##===- bindings/ocaml/transforms/scalar/Makefile -----------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# This is the makefile for the Objective Caml Llvm_scalar_opts interface.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL := ../../../..
-LIBRARYNAME := llvm_scalar_opts
-UsedComponents := scalaropts
-UsedOcamlInterfaces := llvm
-
-include ../../Makefile.ocaml

diff --git a/bindings/ocaml/transforms/scalar/llvm_scalar_opts.ml b/bindings/ocaml/transforms/scalar/llvm_scalar_opts.ml
deleted file mode 100644
index 958939d..0000000
--- a/bindings/ocaml/transforms/scalar/llvm_scalar_opts.ml
+++ /dev/null

@@ -1,114 +0,0 @@
-(*===-- llvm_scalar_opts.ml - LLVM OCaml Interface -------------*- OCaml -*-===*
- *
- *                     The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*)
-
-external add_constant_propagation : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                    -> unit
-                                  = "llvm_add_constant_propagation"
-external add_sccp : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                  = "llvm_add_sccp"
-external add_dead_store_elimination : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                      -> unit
-                                    = "llvm_add_dead_store_elimination"
-external add_aggressive_dce : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_aggressive_dce"
-external
-add_scalar_repl_aggregation : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_scalar_repl_aggregation"
-
-external
-add_scalar_repl_aggregation_ssa : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_scalar_repl_aggregation_ssa"
-
-external
-add_scalar_repl_aggregation_with_threshold : int -> [<Llvm.PassManager.any] Llvm.PassManager.t
-                                             -> unit
-                            = "llvm_add_scalar_repl_aggregation_with_threshold"
-external add_ind_var_simplification : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                      -> unit
-                                    = "llvm_add_ind_var_simplification"
-external
-add_instruction_combination : [<Llvm.PassManager.any] Llvm.PassManager.t
-                              -> unit
-                            = "llvm_add_instruction_combination"
-external add_licm : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_licm"
-external add_loop_unswitch : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_loop_unswitch"
-external add_loop_unroll : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_loop_unroll"
-external add_loop_rotation : [<Llvm.PassManager.any] Llvm.PassManager.t
-                             -> unit
-                           = "llvm_add_loop_rotation"
-external
-add_memory_to_register_promotion : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                   -> unit
-                                 = "llvm_add_memory_to_register_promotion"
-external
-add_memory_to_register_demotion : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                  -> unit
-                                = "llvm_add_memory_to_register_demotion"
-external add_reassociation : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                           = "llvm_add_reassociation"
-external add_jump_threading : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_jump_threading"
-external add_cfg_simplification : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                  -> unit
-                                = "llvm_add_cfg_simplification"
-external
-add_tail_call_elimination : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                          = "llvm_add_tail_call_elimination" 
-external add_gvn : [<Llvm.PassManager.any] Llvm.PassManager.t
-                   -> unit
-                 = "llvm_add_gvn"
-external add_memcpy_opt : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_memcpy_opt"
-external add_loop_deletion : [<Llvm.PassManager.any] Llvm.PassManager.t
-                             -> unit
-                           = "llvm_add_loop_deletion"
-
-external add_loop_idiom : [<Llvm.PassManager.any] Llvm.PassManager.t
-                             -> unit
-                           = "llvm_add_loop_idiom"
-
-external
-add_lib_call_simplification : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_lib_call_simplification"
-
-external
-add_verifier : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_verifier"
-
-external
-add_correlated_value_propagation : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_correlated_value_propagation"
-
-external
-add_early_cse : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_early_cse"
-
-external
-add_lower_expect_intrinsic : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_lower_expect_intrinsic"
-
-external
-add_type_based_alias_analysis : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_type_based_alias_analysis"
-
-external
-add_basic_alias_analysis : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_basic_alias_analysis"
-
-external
-add_partially_inline_lib_calls : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_partially_inline_lib_calls"

diff --git a/bindings/ocaml/transforms/scalar/llvm_scalar_opts.mli b/bindings/ocaml/transforms/scalar/llvm_scalar_opts.mli
deleted file mode 100644
index ab6fa4a..0000000
--- a/bindings/ocaml/transforms/scalar/llvm_scalar_opts.mli
+++ /dev/null

@@ -1,168 +0,0 @@
-(*===-- llvm_scalar_opts.mli - LLVM OCaml Interface ------------*- OCaml -*-===*
- *
- *                     The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*)
-
-(** Scalar Transforms.
-
-    This interface provides an OCaml API for LLVM scalar transforms, the
-    classes in the [LLVMScalarOpts] library. *)
-
-(** See the [llvm::createConstantPropagationPass] function. *)
-external add_constant_propagation : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                    -> unit
-                                  = "llvm_add_constant_propagation"
-
-(** See the [llvm::createSCCPPass] function. *)
-external add_sccp : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                  = "llvm_add_sccp"
-
-(** See [llvm::createDeadStoreEliminationPass] function. *)
-external add_dead_store_elimination : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                      -> unit
-                                    = "llvm_add_dead_store_elimination"
-
-(** See The [llvm::createAggressiveDCEPass] function. *)
-external add_aggressive_dce : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_aggressive_dce"
-
-(** See the [llvm::createScalarReplAggregatesPass] function. *)
-external
-add_scalar_repl_aggregation : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_scalar_repl_aggregation"
-
-(** See the [llvm::createScalarReplAggregatesPassSSA] function. *)
-external
-add_scalar_repl_aggregation_ssa : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_scalar_repl_aggregation_ssa"
-
-(** See the [llvm::createScalarReplAggregatesWithThreshold] function. *)
-external
-add_scalar_repl_aggregation_with_threshold : int -> [<Llvm.PassManager.any] Llvm.PassManager.t
-                                             -> unit
-                            = "llvm_add_scalar_repl_aggregation_with_threshold"
-
-(** See the [llvm::createIndVarSimplifyPass] function. *)
-external add_ind_var_simplification : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                      -> unit
-                                    = "llvm_add_ind_var_simplification"
-
-(** See the [llvm::createInstructionCombiningPass] function. *)
-external
-add_instruction_combination : [<Llvm.PassManager.any] Llvm.PassManager.t
-                              -> unit
-                            = "llvm_add_instruction_combination"
-
-(** See the [llvm::createLICMPass] function. *)
-external add_licm : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_licm"
-
-(** See the [llvm::createLoopUnswitchPass] function. *)
-external add_loop_unswitch : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_loop_unswitch"
-
-(** See the [llvm::createLoopUnrollPass] function. *)
-external add_loop_unroll : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_loop_unroll"
-
-(** See the [llvm::createLoopRotatePass] function. *)
-external add_loop_rotation : [<Llvm.PassManager.any] Llvm.PassManager.t
-                             -> unit
-                           = "llvm_add_loop_rotation"
-
-(** See the [llvm::createPromoteMemoryToRegisterPass] function. *)
-external
-add_memory_to_register_promotion : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                   -> unit
-                                 = "llvm_add_memory_to_register_promotion"
-
-(** See the [llvm::createDemoteMemoryToRegisterPass] function. *)
-external
-add_memory_to_register_demotion : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                  -> unit
-                                = "llvm_add_memory_to_register_demotion"
-
-(** See the [llvm::createReassociatePass] function. *)
-external add_reassociation : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                           = "llvm_add_reassociation"
-
-(** See the [llvm::createJumpThreadingPass] function. *)
-external add_jump_threading : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_jump_threading"
-
-(** See the [llvm::createCFGSimplificationPass] function. *)
-external add_cfg_simplification : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                  -> unit
-                                = "llvm_add_cfg_simplification"
-
-(** See the [llvm::createTailCallEliminationPass] function. *)
-external
-add_tail_call_elimination : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                          = "llvm_add_tail_call_elimination" 
-
-(** See the [llvm::createGVNPass] function. *)
-external add_gvn : [<Llvm.PassManager.any] Llvm.PassManager.t
-                   -> unit
-                 = "llvm_add_gvn"
-
-(** See the [llvm::createMemCpyOptPass] function. *)
-external add_memcpy_opt : [<Llvm.PassManager.any] Llvm.PassManager.t
-                                -> unit
-                              = "llvm_add_memcpy_opt"
-
-(** See the [llvm::createLoopDeletionPass] function. *)
-external add_loop_deletion : [<Llvm.PassManager.any] Llvm.PassManager.t
-                             -> unit
-                           = "llvm_add_loop_deletion"
-
-external add_loop_idiom : [<Llvm.PassManager.any] Llvm.PassManager.t
-                             -> unit
-                           = "llvm_add_loop_idiom"
-
-(** See the [llvm::createSimplifyLibCallsPass] function. *)
-external
-add_lib_call_simplification : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_lib_call_simplification"
-
-(** See the [llvm::createVerifierPass] function. *)
-external
-add_verifier : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_verifier"
-
-(** See the [llvm::createCorrelatedValuePropagationPass] function. *)
-external
-add_correlated_value_propagation : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_correlated_value_propagation"
-
-(** See the [llvm::createEarlyCSE] function. *)
-external
-add_early_cse : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_early_cse"
-
-(** See the [llvm::createLowerExpectIntrinsicPass] function. *)
-external
-add_lower_expect_intrinsic : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_lower_expect_intrinsic"
-
-(** See the [llvm::createTypeBasedAliasAnalysisPass] function. *)
-external
-add_type_based_alias_analysis : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_type_based_alias_analysis"
-
-(** See the [llvm::createBasicAliasAnalysisPass] function. *)
-external
-add_basic_alias_analysis : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_basic_alias_analysis"
-
-(** See the [llvm::createPartiallyInlineLibCallsPass] function. *)
-external
-add_partially_inline_lib_calls : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-        = "llvm_add_partially_inline_lib_calls"

diff --git a/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c b/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
deleted file mode 100644
index 0a71bd7..0000000
--- a/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
+++ /dev/null

@@ -1,207 +0,0 @@
-/*===-- scalar_opts_ocaml.c - LLVM OCaml Glue -------------------*- C++ -*-===*\
-|*                                                                            *|
-|*                     The LLVM Compiler Infrastructure                       *|
-|*                                                                            *|
-|* This file is distributed under the University of Illinois Open Source      *|
-|* License. See LICENSE.TXT for details.                                      *|
-|*                                                                            *|
-|*===----------------------------------------------------------------------===*|
-|*                                                                            *|
-|* This file glues LLVM's OCaml interface to its C interface. These functions *|
-|* are by and large transparent wrappers to the corresponding C functions.    *|
-|*                                                                            *|
-|* Note that these functions intentionally take liberties with the CAMLparamX *|
-|* macros, since most of the parameters are not GC heap objects.              *|
-|*                                                                            *|
-\*===----------------------------------------------------------------------===*/
-
-#include "llvm-c/Transforms/Scalar.h"
-#include "caml/mlvalues.h"
-#include "caml/misc.h"
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_constant_propagation(LLVMPassManagerRef PM) {
-  LLVMAddConstantPropagationPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_sccp(LLVMPassManagerRef PM) {
-  LLVMAddSCCPPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_dead_store_elimination(LLVMPassManagerRef PM) {
-  LLVMAddDeadStoreEliminationPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_aggressive_dce(LLVMPassManagerRef PM) {
-  LLVMAddAggressiveDCEPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_scalar_repl_aggregation(LLVMPassManagerRef PM) {
-  LLVMAddScalarReplAggregatesPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_scalar_repl_aggregation_ssa(LLVMPassManagerRef PM) {
-  LLVMAddScalarReplAggregatesPassSSA(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> int -> unit */
-CAMLprim value llvm_add_scalar_repl_aggregation_with_threshold(value threshold,
-                                                               LLVMPassManagerRef PM) {
-  LLVMAddScalarReplAggregatesPassWithThreshold(PM, Int_val(threshold));
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_ind_var_simplification(LLVMPassManagerRef PM) {
-  LLVMAddIndVarSimplifyPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_instruction_combination(LLVMPassManagerRef PM) {
-  LLVMAddInstructionCombiningPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_licm(LLVMPassManagerRef PM) {
-  LLVMAddLICMPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_loop_unswitch(LLVMPassManagerRef PM) {
-  LLVMAddLoopUnswitchPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_loop_unroll(LLVMPassManagerRef PM) {
-  LLVMAddLoopUnrollPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_loop_rotation(LLVMPassManagerRef PM) {
-  LLVMAddLoopRotatePass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_memory_to_register_promotion(LLVMPassManagerRef PM) {
-  LLVMAddPromoteMemoryToRegisterPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_memory_to_register_demotion(LLVMPassManagerRef PM) {
-  LLVMAddDemoteMemoryToRegisterPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_reassociation(LLVMPassManagerRef PM) {
-  LLVMAddReassociatePass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_jump_threading(LLVMPassManagerRef PM) {
-  LLVMAddJumpThreadingPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_cfg_simplification(LLVMPassManagerRef PM) {
-  LLVMAddCFGSimplificationPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_tail_call_elimination(LLVMPassManagerRef PM) {
-  LLVMAddTailCallEliminationPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_gvn(LLVMPassManagerRef PM) {
-  LLVMAddGVNPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_memcpy_opt(LLVMPassManagerRef PM) {
-  LLVMAddMemCpyOptPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_loop_deletion(LLVMPassManagerRef PM) {
-  LLVMAddLoopDeletionPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_loop_idiom(LLVMPassManagerRef PM) {
-  LLVMAddLoopIdiomPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_lib_call_simplification(LLVMPassManagerRef PM) {
-  LLVMAddSimplifyLibCallsPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_verifier(LLVMPassManagerRef PM) {
-  LLVMAddVerifierPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_correlated_value_propagation(LLVMPassManagerRef PM) {
-  LLVMAddCorrelatedValuePropagationPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_early_cse(LLVMPassManagerRef PM) {
-  LLVMAddEarlyCSEPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_lower_expect_intrinsic(LLVMPassManagerRef PM) {
-  LLVMAddLowerExpectIntrinsicPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_type_based_alias_analysis(LLVMPassManagerRef PM) {
-  LLVMAddTypeBasedAliasAnalysisPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_basic_alias_analysis(LLVMPassManagerRef PM) {
-  LLVMAddBasicAliasAnalysisPass(PM);
-  return Val_unit;
-}
-
-/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_add_partially_inline_lib_calls(LLVMPassManagerRef PM) {
-  LLVMAddPartiallyInlineLibCallsPass(PM);
-  return Val_unit;
-}

diff --git a/bindings/ocaml/transforms/scalar_opts/Makefile b/bindings/ocaml/transforms/scalar_opts/Makefile
new file mode 100644
index 0000000..63d86a6
--- /dev/null
+++ b/bindings/ocaml/transforms/scalar_opts/Makefile

@@ -0,0 +1,19 @@
+##===- bindings/ocaml/transforms/scalar_opts/Makefile ------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# This is the makefile for the Objective Caml Llvm_scalar_opts interface.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../../../..
+LIBRARYNAME := llvm_scalar_opts
+UsedComponents := scalaropts
+UsedOcamlInterfaces := llvm
+
+include ../../Makefile.ocaml

diff --git a/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.ml b/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.ml
new file mode 100644
index 0000000..b90d0ae
--- /dev/null
+++ b/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.ml

@@ -0,0 +1,120 @@
+(*===-- llvm_scalar_opts.ml - LLVM OCaml Interface ------------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+external add_aggressive_dce
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_aggressive_dce"
+external add_alignment_from_assumptions
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_alignment_from_assumptions"
+external add_cfg_simplification
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_cfg_simplification"
+external add_dead_store_elimination
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_dead_store_elimination"
+external add_scalarizer
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scalarizer"
+external add_merged_load_store_motion
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_merged_load_store_motion"
+external add_gvn
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_gvn"
+external add_ind_var_simplification
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_ind_var_simplify"
+external add_instruction_combination
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_instruction_combining"
+external add_jump_threading
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_jump_threading"
+external add_licm
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_licm"
+external add_loop_deletion
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_deletion"
+external add_loop_idiom
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_idiom"
+external add_loop_rotation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_rotate"
+external add_loop_reroll
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_reroll"
+external add_loop_unroll
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_unroll"
+external add_loop_unswitch
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_unswitch"
+external add_memcpy_opt
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_memcpy_opt"
+external add_partially_inline_lib_calls
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_partially_inline_lib_calls"
+external add_lower_switch
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_lower_switch"
+external add_memory_to_register_promotion
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_promote_memory_to_register"
+external add_reassociation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_reassociation"
+external add_sccp
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_sccp"
+external add_scalar_repl_aggregation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scalar_repl_aggregates"
+external add_scalar_repl_aggregation_ssa
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scalar_repl_aggregates_ssa"
+external add_scalar_repl_aggregation_with_threshold
+  : int -> [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scalar_repl_aggregates_with_threshold"
+external add_lib_call_simplification
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_simplify_lib_calls"
+external add_tail_call_elimination
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_tail_call_elimination"
+external add_constant_propagation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_constant_propagation"
+external add_memory_to_register_demotion
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_demote_memory_to_register"
+external add_verifier
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_verifier"
+external add_correlated_value_propagation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_correlated_value_propagation"
+external add_early_cse
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_early_cse"
+external add_lower_expect_intrinsic
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_lower_expect_intrinsic"
+external add_type_based_alias_analysis
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_type_based_alias_analysis"
+external add_scoped_no_alias_alias_analysis
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scoped_no_alias_aa"
+external add_basic_alias_analysis
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_basic_alias_analysis"

diff --git a/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.mli b/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.mli
new file mode 100644
index 0000000..b4cefed
--- /dev/null
+++ b/bindings/ocaml/transforms/scalar_opts/llvm_scalar_opts.mli

@@ -0,0 +1,198 @@
+(*===-- llvm_scalar_opts.mli - LLVM OCaml Interface -----------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+(** Scalar Transforms.
+
+    This interface provides an OCaml API for LLVM scalar transforms, the
+    classes in the [LLVMScalarOpts] library. *)
+
+(** See the [llvm::createAggressiveDCEPass] function. *)
+external add_aggressive_dce
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_aggressive_dce"
+
+(** See the [llvm::createAlignmentFromAssumptionsPass] function. *)
+external add_alignment_from_assumptions
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_alignment_from_assumptions"
+
+(** See the [llvm::createCFGSimplificationPass] function. *)
+external add_cfg_simplification
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_cfg_simplification"
+
+(** See the [llvm::createDeadStoreEliminationPass] function. *)
+external add_dead_store_elimination
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_dead_store_elimination"
+
+(** See the [llvm::createScalarizerPass] function. *)
+external add_scalarizer
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scalarizer"
+
+(** See the [llvm::createMergedLoadStoreMotionPass] function. *)
+external add_merged_load_store_motion
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_merged_load_store_motion"
+
+(** See the [llvm::createGVNPass] function. *)
+external add_gvn
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_gvn"
+
+(** See the [llvm::createIndVarSimplifyPass] function. *)
+external add_ind_var_simplification
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_ind_var_simplify"
+
+(** See the [llvm::createInstructionCombiningPass] function. *)
+external add_instruction_combination
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_instruction_combining"
+
+(** See the [llvm::createJumpThreadingPass] function. *)
+external add_jump_threading
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_jump_threading"
+
+(** See the [llvm::createLICMPass] function. *)
+external add_licm
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_licm"
+
+(** See the [llvm::createLoopDeletionPass] function. *)
+external add_loop_deletion
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_deletion"
+
+(** See the [llvm::createLoopIdiomPass] function. *)
+external add_loop_idiom
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_idiom"
+
+(** See the [llvm::createLoopRotatePass] function. *)
+external add_loop_rotation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_rotate"
+
+(** See the [llvm::createLoopRerollPass] function. *)
+external add_loop_reroll
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_reroll"
+
+(** See the [llvm::createLoopUnrollPass] function. *)
+external add_loop_unroll
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_unroll"
+
+(** See the [llvm::createLoopUnswitchPass] function. *)
+external add_loop_unswitch
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_unswitch"
+
+(** See the [llvm::createMemCpyOptPass] function. *)
+external add_memcpy_opt
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_memcpy_opt"
+
+(** See the [llvm::createPartiallyInlineLibCallsPass] function. *)
+external add_partially_inline_lib_calls
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_partially_inline_lib_calls"
+
+(** See the [llvm::createLowerSwitchPass] function. *)
+external add_lower_switch
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_lower_switch"
+
+(** See the [llvm::createPromoteMemoryToRegisterPass] function. *)
+external add_memory_to_register_promotion
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_promote_memory_to_register"
+
+(** See the [llvm::createReassociatePass] function. *)
+external add_reassociation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_reassociation"
+
+(** See the [llvm::createSCCPPass] function. *)
+external add_sccp
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_sccp"
+
+(** See the [llvm::createScalarReplAggregatesPass] function. *)
+external add_scalar_repl_aggregation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scalar_repl_aggregates"
+
+(** See the [llvm::createScalarReplAggregatesPassSSA] function. *)
+external add_scalar_repl_aggregation_ssa
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scalar_repl_aggregates_ssa"
+
+(** See the [llvm::createScalarReplAggregatesWithThreshold] function. *)
+external add_scalar_repl_aggregation_with_threshold
+  : int -> [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scalar_repl_aggregates_with_threshold"
+
+(** See the [llvm::createSimplifyLibCallsPass] function. *)
+external add_lib_call_simplification
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_simplify_lib_calls"
+
+(** See the [llvm::createTailCallEliminationPass] function. *)
+external add_tail_call_elimination
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_tail_call_elimination"
+
+(** See the [llvm::createConstantPropagationPass] function. *)
+external add_constant_propagation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_constant_propagation"
+
+(** See the [llvm::createDemoteMemoryToRegisterPass] function. *)
+external add_memory_to_register_demotion
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_demote_memory_to_register"
+
+(** See the [llvm::createVerifierPass] function. *)
+external add_verifier
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_verifier"
+
+(** See the [llvm::createCorrelatedValuePropagationPass] function. *)
+external add_correlated_value_propagation
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_correlated_value_propagation"
+
+(** See the [llvm::createEarlyCSEPass] function. *)
+external add_early_cse
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_early_cse"
+
+(** See the [llvm::createLowerExpectIntrinsicPass] function. *)
+external add_lower_expect_intrinsic
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_lower_expect_intrinsic"
+
+(** See the [llvm::createTypeBasedAliasAnalysisPass] function. *)
+external add_type_based_alias_analysis
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_type_based_alias_analysis"
+
+(** See the [llvm::createScopedNoAliasAAPass] function. *)
+external add_scoped_no_alias_alias_analysis
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_scoped_no_alias_aa"
+
+(** See the [llvm::createBasicAliasAnalysisPass] function. *)
+external add_basic_alias_analysis
+  : [< Llvm.PassManager.any ] Llvm.PassManager.t -> unit
+  = "llvm_add_basic_alias_analysis"

diff --git a/bindings/ocaml/transforms/scalar_opts/scalar_opts_ocaml.c b/bindings/ocaml/transforms/scalar_opts/scalar_opts_ocaml.c
new file mode 100644
index 0000000..bae4e31
--- /dev/null
+++ b/bindings/ocaml/transforms/scalar_opts/scalar_opts_ocaml.c

@@ -0,0 +1,243 @@
+/*===-- scalar_opts_ocaml.c - LLVM OCaml Glue -------------------*- C++ -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
+|* are by and large transparent wrappers to the corresponding C functions.    *|
+|*                                                                            *|
+|* Note that these functions intentionally take liberties with the CAMLparamX *|
+|* macros, since most of the parameters are not GC heap objects.              *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c/Transforms/Scalar.h"
+#include "caml/mlvalues.h"
+#include "caml/misc.h"
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_aggressive_dce(LLVMPassManagerRef PM) {
+  LLVMAddAggressiveDCEPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_alignment_from_assumptions(LLVMPassManagerRef PM) {
+  LLVMAddAlignmentFromAssumptionsPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_cfg_simplification(LLVMPassManagerRef PM) {
+  LLVMAddCFGSimplificationPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_dead_store_elimination(LLVMPassManagerRef PM) {
+  LLVMAddDeadStoreEliminationPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_scalarizer(LLVMPassManagerRef PM) {
+  LLVMAddScalarizerPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_merged_load_store_motion(LLVMPassManagerRef PM) {
+  LLVMAddMergedLoadStoreMotionPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_gvn(LLVMPassManagerRef PM) {
+  LLVMAddGVNPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_ind_var_simplify(LLVMPassManagerRef PM) {
+  LLVMAddIndVarSimplifyPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_instruction_combining(LLVMPassManagerRef PM) {
+  LLVMAddInstructionCombiningPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_jump_threading(LLVMPassManagerRef PM) {
+  LLVMAddJumpThreadingPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_licm(LLVMPassManagerRef PM) {
+  LLVMAddLICMPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_loop_deletion(LLVMPassManagerRef PM) {
+  LLVMAddLoopDeletionPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_loop_idiom(LLVMPassManagerRef PM) {
+  LLVMAddLoopIdiomPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_loop_rotate(LLVMPassManagerRef PM) {
+  LLVMAddLoopRotatePass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_loop_reroll(LLVMPassManagerRef PM) {
+  LLVMAddLoopRerollPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_loop_unroll(LLVMPassManagerRef PM) {
+  LLVMAddLoopUnrollPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_loop_unswitch(LLVMPassManagerRef PM) {
+  LLVMAddLoopUnswitchPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_memcpy_opt(LLVMPassManagerRef PM) {
+  LLVMAddMemCpyOptPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_partially_inline_lib_calls(LLVMPassManagerRef PM) {
+  LLVMAddPartiallyInlineLibCallsPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_lower_switch(LLVMPassManagerRef PM) {
+  LLVMAddLowerSwitchPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_promote_memory_to_register(LLVMPassManagerRef PM) {
+  LLVMAddPromoteMemoryToRegisterPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_reassociation(LLVMPassManagerRef PM) {
+  LLVMAddReassociatePass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_sccp(LLVMPassManagerRef PM) {
+  LLVMAddSCCPPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_scalar_repl_aggregates(LLVMPassManagerRef PM) {
+  LLVMAddScalarReplAggregatesPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_scalar_repl_aggregates_ssa(LLVMPassManagerRef PM) {
+  LLVMAddScalarReplAggregatesPassSSA(PM);
+  return Val_unit;
+}
+
+/* int -> [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_scalar_repl_aggregates_with_threshold(value threshold,
+                                                              LLVMPassManagerRef PM) {
+  LLVMAddScalarReplAggregatesPassWithThreshold(PM, Int_val(threshold));
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_simplify_lib_calls(LLVMPassManagerRef PM) {
+  LLVMAddSimplifyLibCallsPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_tail_call_elimination(LLVMPassManagerRef PM) {
+  LLVMAddTailCallEliminationPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_constant_propagation(LLVMPassManagerRef PM) {
+  LLVMAddConstantPropagationPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_demote_memory_to_register(LLVMPassManagerRef PM) {
+  LLVMAddDemoteMemoryToRegisterPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_verifier(LLVMPassManagerRef PM) {
+  LLVMAddVerifierPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_correlated_value_propagation(LLVMPassManagerRef PM) {
+  LLVMAddCorrelatedValuePropagationPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_early_cse(LLVMPassManagerRef PM) {
+  LLVMAddEarlyCSEPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_lower_expect_intrinsic(LLVMPassManagerRef PM) {
+  LLVMAddLowerExpectIntrinsicPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_type_based_alias_analysis(LLVMPassManagerRef PM) {
+  LLVMAddTypeBasedAliasAnalysisPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_scoped_no_alias_aa(LLVMPassManagerRef PM) {
+  LLVMAddScopedNoAliasAAPass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_basic_alias_analysis(LLVMPassManagerRef PM) {
+  LLVMAddBasicAliasAnalysisPass(PM);
+  return Val_unit;
+}

diff --git a/bindings/ocaml/transforms/vectorize/Makefile b/bindings/ocaml/transforms/vectorize/Makefile
index 5a854d1..64ac5c3 100644
--- a/bindings/ocaml/transforms/vectorize/Makefile
+++ b/bindings/ocaml/transforms/vectorize/Makefile

@@ -7,7 +7,7 @@
 #
 ##===----------------------------------------------------------------------===##
 #
-# This is the makefile for the Objective Caml Llvm_vectorize_opts interface.
+# This is the makefile for the Objective Caml Llvm_vectorize interface.
 #
 ##===----------------------------------------------------------------------===##
 

diff --git a/bindings/ocaml/transforms/vectorize/llvm_vectorize.ml b/bindings/ocaml/transforms/vectorize/llvm_vectorize.ml
index 4fc53c6..88831da 100644
--- a/bindings/ocaml/transforms/vectorize/llvm_vectorize.ml
+++ b/bindings/ocaml/transforms/vectorize/llvm_vectorize.ml

@@ -7,9 +7,12 @@
  *
  *===----------------------------------------------------------------------===*)
 
-external add_bb_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                          = "llvm_add_bb_vectorize"
-external add_loop_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_loop_vectorize"
-external add_slp_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                           = "llvm_add_slp_vectorize"
+external add_bb_vectorize
+  : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+  = "llvm_add_bb_vectorize"
+external add_loop_vectorize
+  : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_vectorize"
+external add_slp_vectorize
+  : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+  = "llvm_add_slp_vectorize"

diff --git a/bindings/ocaml/transforms/vectorize/llvm_vectorize.mli b/bindings/ocaml/transforms/vectorize/llvm_vectorize.mli
index 0253039..23a68a2 100644
--- a/bindings/ocaml/transforms/vectorize/llvm_vectorize.mli
+++ b/bindings/ocaml/transforms/vectorize/llvm_vectorize.mli

@@ -13,13 +13,16 @@
     classes in the [LLVMVectorize] library. *)
 
 (** See the [llvm::createBBVectorizePass] function. *)
-external add_bb_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                          = "llvm_add_bb_vectorize"
+external add_bb_vectorize
+  : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+  = "llvm_add_bb_vectorize"
 
 (** See the [llvm::createLoopVectorizePass] function. *)
-external add_loop_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                            = "llvm_add_loop_vectorize"
+external add_loop_vectorize
+  : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+  = "llvm_add_loop_vectorize"
 
-(** See [llvm::createSLPVectorizerPass] function. *)
-external add_slp_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-                           = "llvm_add_slp_vectorize"
+(** See the [llvm::createSLPVectorizerPass] function. *)
+external add_slp_vectorize
+  : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+  = "llvm_add_slp_vectorize"

diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index b862ceb..5204f6c 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake

@@ -42,7 +42,6 @@
 endfunction()
 
 # include checks
-check_include_file_cxx(cxxabi.h HAVE_CXXABI_H)
 check_include_file(dirent.h HAVE_DIRENT_H)
 check_include_file(dlfcn.h HAVE_DLFCN_H)
 check_include_file(errno.h HAVE_ERRNO_H)
@@ -50,6 +49,7 @@
 check_include_file(fcntl.h HAVE_FCNTL_H)
 check_include_file(inttypes.h HAVE_INTTYPES_H)
 check_include_file(limits.h HAVE_LIMITS_H)
+check_include_file(link.h HAVE_LINK_H)
 check_include_file(malloc.h HAVE_MALLOC_H)
 check_include_file(malloc/malloc.h HAVE_MALLOC_MALLOC_H)
 check_include_file(ndir.h HAVE_NDIR_H)
@@ -80,6 +80,13 @@
 check_include_file(mach/mach.h HAVE_MACH_MACH_H)
 check_include_file(mach-o/dyld.h HAVE_MACH_O_DYLD_H)
 
+# size_t must be defined before including cxxabi.h on FreeBSD 10.0.
+check_cxx_source_compiles("
+#include <stddef.h>
+#include <cxxabi.h>
+int main() { return 0; }
+" HAVE_CXXABI_H)
+
 # library checks
 if( NOT PURE_WINDOWS )
   check_library_exists(pthread pthread_create "" HAVE_LIBPTHREAD)
@@ -258,12 +265,12 @@
 
 if( LLVM_ENABLE_FFI )
   find_path(FFI_INCLUDE_PATH ffi.h PATHS ${FFI_INCLUDE_DIR})
-  if( FFI_INCLUDE_PATH )
+  if( EXISTS "${FFI_INCLUDE_PATH}/ffi.h" )
     set(FFI_HEADER ffi.h CACHE INTERNAL "")
     set(HAVE_FFI_H 1 CACHE INTERNAL "")
   else()
     find_path(FFI_INCLUDE_PATH ffi/ffi.h PATHS ${FFI_INCLUDE_DIR})
-    if( FFI_INCLUDE_PATH )
+    if( EXISTS "${FFI_INCLUDE_PATH}/ffi/ffi.h" )
       set(FFI_HEADER ffi/ffi.h CACHE INTERNAL "")
       set(HAVE_FFI_FFI_H 1 CACHE INTERNAL "")
     endif()
@@ -490,3 +497,22 @@
 else()
   message(STATUS "Sphinx disabled.")
 endif()
+
+set(LLVM_BINDINGS "")
+if(WIN32)
+  message(STATUS "Go bindings disabled.")
+else()
+  find_program(GO_EXECUTABLE NAMES go DOC "go executable")
+  if(GO_EXECUTABLE STREQUAL "GO_EXECUTABLE-NOTFOUND")
+    message(STATUS "Go bindings disabled.")
+  else()
+    execute_process(COMMAND ${GO_EXECUTABLE} run ${CMAKE_SOURCE_DIR}/bindings/go/conftest.go
+                    RESULT_VARIABLE GO_CONFTEST)
+    if(GO_CONFTEST STREQUAL "0")
+      set(LLVM_BINDINGS "${LLVM_BINDINGS} go")
+      message(STATUS "Go bindings enabled.")
+    else()
+      message(STATUS "Go bindings disabled, need at least Go 1.2.")
+    endif()
+  endif()
+endif()

diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 31540d9..bc26f06 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake

@@ -8,8 +8,13 @@
     set(update_src_props ON)
   endif()
 
-  if(LLVM_REQUIRES_EH)
-    set(LLVM_REQUIRES_RTTI ON)
+  # LLVM_REQUIRES_EH is an internal flag that individual
+  # targets can use to force EH
+  if(LLVM_REQUIRES_EH OR LLVM_ENABLE_EH)
+    if(NOT (LLVM_REQUIRES_RTTI OR LLVM_ENABLE_RTTI))
+      message(AUTHOR_WARNING "Exception handling requires RTTI. Enabling RTTI for ${name}")
+      set(LLVM_REQUIRES_RTTI ON)
+    endif()
   else()
     if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
       list(APPEND LLVM_COMPILE_FLAGS "-fno-exceptions")
@@ -19,7 +24,9 @@
     endif()
   endif()
 
-  if(NOT LLVM_REQUIRES_RTTI)
+  # LLVM_REQUIRES_RTTI is an internal flag that individual
+  # targets can use to force RTTI
+  if(NOT (LLVM_REQUIRES_RTTI OR LLVM_ENABLE_RTTI))
     list(APPEND LLVM_COMPILE_DEFINITIONS GTEST_HAS_RTTI=0)
     if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
       list(APPEND LLVM_COMPILE_FLAGS "-fno-rtti")
@@ -78,27 +85,29 @@
   else()
     set(native_export_file "${target_name}.def")
 
-    set(CAT "type")
-    if(CYGWIN)
-      set(CAT "cat")
+    set(CAT "cat")
+    set(export_file_nativeslashes ${export_file})
+    if(WIN32 AND NOT CYGWIN)
+      set(CAT "type")
+      # Convert ${export_file} to native format (backslashes) for "type"
+      # Does not use file(TO_NATIVE_PATH) as it doesn't create a native
+      # path but a build-system specific format (see CMake bug
+      # http://public.kitware.com/Bug/print_bug_page.php?bug_id=5939 )
+      string(REPLACE / \\ export_file_nativeslashes ${export_file})
     endif()
 
-    # Using ${export_file} in add_custom_command directly confuses cmd.exe.
-    file(TO_NATIVE_PATH ${export_file} export_file_backslashes)
-
     add_custom_command(OUTPUT ${native_export_file}
       COMMAND ${CMAKE_COMMAND} -E echo "EXPORTS" > ${native_export_file}
-      COMMAND ${CAT} ${export_file_backslashes} >> ${native_export_file}
+      COMMAND ${CAT} ${export_file_nativeslashes} >> ${native_export_file}
       DEPENDS ${export_file}
       VERBATIM
       COMMENT "Creating export file for ${target_name}")
-    if(CYGWIN OR MINGW)
-      set_property(TARGET ${target_name} APPEND_STRING PROPERTY
-                   LINK_FLAGS " ${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}")
-    else()
-      set_property(TARGET ${target_name} APPEND_STRING PROPERTY
-                   LINK_FLAGS " /DEF:${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}")
+    set(export_file_linker_flag "${CMAKE_CURRENT_BINARY_DIR}/${native_export_file}")
+    if(MSVC)
+      set(export_file_linker_flag "/DEF:${export_file_linker_flag}")
     endif()
+    set_property(TARGET ${target_name} APPEND_STRING PROPERTY
+                 LINK_FLAGS " ${export_file_linker_flag}")
   endif()
 
   add_custom_target(${target_name}_exports DEPENDS ${native_export_file})
@@ -133,18 +142,29 @@
   set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)
 endfunction(add_llvm_symbol_exports)
 
-function(add_dead_strip target_name)
+function(add_link_opts target_name)
+  # Pass -O3 to the linker. This enables different optimizations on different
+  # linkers.
+  if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" OR WIN32))
+    set_property(TARGET ${target_name} APPEND_STRING PROPERTY
+                 LINK_FLAGS " -Wl,-O3")
+  endif()
+
   if(NOT LLVM_NO_DEAD_STRIP)
     if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+      # ld64's implementation of -dead_strip breaks tools that use plugins.
       set_property(TARGET ${target_name} APPEND_STRING PROPERTY
                    LINK_FLAGS " -Wl,-dead_strip")
     elseif(NOT WIN32)
       # Object files are compiled with -ffunction-data-sections.
+      # Versions of bfd ld < 2.23.1 have a bug in --gc-sections that breaks
+      # tools that use plugins. Always pass --gc-sections once we require
+      # a newer linker.
       set_property(TARGET ${target_name} APPEND_STRING PROPERTY
                    LINK_FLAGS " -Wl,--gc-sections")
     endif()
   endif()
-endfunction(add_dead_strip)
+endfunction(add_link_opts)
 
 # Set each output directory according to ${CMAKE_CONFIGURATION_TYPES}.
 # Note: Don't set variables CMAKE_*_OUTPUT_DIRECTORY any more,
@@ -155,19 +175,28 @@
     return()
   endif()
 
+  # moddir -- corresponding to LIBRARY_OUTPUT_DIRECTORY.
+  # It affects output of add_library(MODULE).
+  if(WIN32 OR CYGWIN)
+    # DLL platform
+    set(moddir ${bindir})
+  else()
+    set(moddir ${libdir})
+  endif()
   if(NOT "${CMAKE_CFG_INTDIR}" STREQUAL ".")
     foreach(build_mode ${CMAKE_CONFIGURATION_TYPES})
       string(TOUPPER "${build_mode}" CONFIG_SUFFIX)
       string(REPLACE ${CMAKE_CFG_INTDIR} ${build_mode} bi ${bindir})
       string(REPLACE ${CMAKE_CFG_INTDIR} ${build_mode} li ${libdir})
+      string(REPLACE ${CMAKE_CFG_INTDIR} ${build_mode} mi ${moddir})
       set_target_properties(${target} PROPERTIES "RUNTIME_OUTPUT_DIRECTORY_${CONFIG_SUFFIX}" ${bi})
       set_target_properties(${target} PROPERTIES "ARCHIVE_OUTPUT_DIRECTORY_${CONFIG_SUFFIX}" ${li})
-      set_target_properties(${target} PROPERTIES "LIBRARY_OUTPUT_DIRECTORY_${CONFIG_SUFFIX}" ${li})
+      set_target_properties(${target} PROPERTIES "LIBRARY_OUTPUT_DIRECTORY_${CONFIG_SUFFIX}" ${mi})
     endforeach()
   else()
     set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${bindir})
     set_target_properties(${target} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${libdir})
-    set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${libdir})
+    set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${moddir})
   endif()
 endfunction()
 
@@ -265,7 +294,7 @@
   endif()
   set_output_directory(${name} ${LLVM_RUNTIME_OUTPUT_INTDIR} ${LLVM_LIBRARY_OUTPUT_INTDIR})
   llvm_update_compile_flags(${name})
-  add_dead_strip( ${name} )
+  add_link_opts( ${name} )
   if(ARG_OUTPUT_NAME)
     set_target_properties(${name}
       PROPERTIES
@@ -294,6 +323,12 @@
   endif()
 
   if(ARG_MODULE OR ARG_SHARED)
+    # Do not add -Dname_EXPORTS to the command-line when building files in this
+    # target. Doing so is actively harmful for the modules build because it
+    # creates extra module variants, and not useful because we don't use these
+    # macros.
+    set_target_properties( ${name} PROPERTIES DEFINE_SYMBOL "" )
+
     if (LLVM_EXPORTED_SYMBOL_FILE)
       add_llvm_symbol_exports( ${name} ${LLVM_EXPORTED_SYMBOL_FILE} )
     endif()
@@ -324,15 +359,8 @@
       ${lib_deps}
       ${llvm_libs}
       )
-  elseif(ARG_SHARED AND BUILD_SHARED_LIBS)
-    # FIXME: It may be PRIVATE since SO knows its dependent libs.
-    target_link_libraries(${name} PUBLIC
-      ${ARG_LINK_LIBS}
-      ${lib_deps}
-      ${llvm_libs}
-      )
   else()
-    # MODULE|SHARED
+    # We can use PRIVATE since SO knows its dependent libs.
     target_link_libraries(${name} PRIVATE
       ${ARG_LINK_LIBS}
       ${lib_deps}
@@ -364,6 +392,7 @@
     if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "LTO")
       install(TARGETS ${name}
         EXPORT LLVMExports
+        RUNTIME DESTINATION bin
         LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
         ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
     endif()
@@ -382,9 +411,15 @@
       set_target_properties( ${name} PROPERTIES EXCLUDE_FROM_ALL ON)
     else()
       if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+        if(WIN32 OR CYGWIN)
+          # DLL platform
+          set(dlldir "bin")
+        else()
+          set(dlldir "lib${LLVM_LIBDIR_SUFFIX}")
+        endif()
         install(TARGETS ${name}
           EXPORT LLVMExports
-          LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+          LIBRARY DESTINATION ${dlldir}
           ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
       endif()
       set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${name})
@@ -403,7 +438,13 @@
     add_executable(${name} ${ALL_FILES})
   endif()
   llvm_update_compile_flags(${name})
-  add_dead_strip( ${name} )
+  add_link_opts( ${name} )
+
+  # Do not add -Dname_EXPORTS to the command-line when building files in this
+  # target. Doing so is actively harmful for the modules build because it
+  # creates extra module variants, and not useful because we don't use these
+  # macros.
+  set_target_properties( ${name} PROPERTIES DEFINE_SYMBOL "" )
 
   if (LLVM_EXPORTED_SYMBOL_FILE)
     add_llvm_symbol_exports( ${name} ${LLVM_EXPORTED_SYMBOL_FILE} )
@@ -609,21 +650,9 @@
   set(HOST_OS ${CMAKE_SYSTEM_NAME})
   set(HOST_ARCH ${CMAKE_SYSTEM_PROCESSOR})
 
-  if (CLANG_ENABLE_ARCMT)
-    set(ENABLE_CLANG_ARCMT "1")
-  else()
-    set(ENABLE_CLANG_ARCMT "0")
-  endif()
-  if (CLANG_ENABLE_REWRITER)
-    set(ENABLE_CLANG_REWRITER "1")
-  else()
-    set(ENABLE_CLANG_REWRITER "0")
-  endif()
-  if (CLANG_ENABLE_STATIC_ANALYZER)
-    set(ENABLE_CLANG_STATIC_ANALYZER "1")
-  else()
-    set(ENABLE_CLANG_STATIC_ANALYZER "0")
-  endif()
+  set(HOST_CC "${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_ARG1}")
+  set(HOST_CXX "${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER_ARG1}")
+  set(HOST_LDFLAGS "${CMAKE_EXE_LINKER_FLAGS}")
 
   configure_file(${input} ${output} @ONLY)
 endfunction()
@@ -650,6 +679,7 @@
     add_custom_target(${target}
       COMMAND ${LIT_COMMAND} ${ARG_DEFAULT_ARGS}
       COMMENT "${comment}"
+      ${cmake_3_2_USES_TERMINAL}
       )
     add_dependencies(${target} ${ARG_DEPENDS})
   else()

diff --git a/cmake/modules/AddSphinxTarget.cmake b/cmake/modules/AddSphinxTarget.cmake
index fc28a49..045dc23 100644
--- a/cmake/modules/AddSphinxTarget.cmake
+++ b/cmake/modules/AddSphinxTarget.cmake

@@ -8,16 +8,23 @@
   set(SPHINX_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/${builder}")
   set(SPHINX_DOC_TREE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
   set(SPHINX_TARGET_NAME docs-${project}-${builder})
+
+  if (SPHINX_WARNINGS_AS_ERRORS)
+    set(SPHINX_WARNINGS_AS_ERRORS_FLAG "-W")
+  else()
+    set(SPHINX_WARNINGS_AS_ERRORS_FLAG "")
+  endif()
+
   add_custom_target(${SPHINX_TARGET_NAME}
                     COMMAND ${SPHINX_EXECUTABLE}
                             -b ${builder}
                             -d "${SPHINX_DOC_TREE_DIR}"
                             -q # Quiet: no output other than errors and warnings.
-                            -W # Warnings are errors.
+                            ${SPHINX_WARNINGS_AS_ERRORS_FLAG} # Treat warnings as errors if requested
                             "${CMAKE_CURRENT_SOURCE_DIR}" # Source
                             "${SPHINX_BUILD_DIR}" # Output
                     COMMENT
-                    "Generating ${builder} Sphinx documentation for ${project}")
+                    "Generating ${builder} Sphinx documentation for ${project} into \"${SPHINX_BUILD_DIR}\"")
 
   # When "clean" target is run, remove the Sphinx build directory
   set_property(DIRECTORY APPEND PROPERTY

diff --git a/cmake/modules/CMakeLists.txt b/cmake/modules/CMakeLists.txt
index 08aeeb9..c87193d 100644
--- a/cmake/modules/CMakeLists.txt
+++ b/cmake/modules/CMakeLists.txt

@@ -17,11 +17,9 @@
 set(LLVM_CONFIG_CODE "
 # LLVM_BUILD_* values available only from LLVM build tree.
 set(LLVM_BUILD_BINARY_DIR \"${LLVM_BINARY_DIR}\")
-set(LLVM_BUILD_ENABLE_ASSERTIONS \"${LLVM_ENABLE_ASSERTIONS}\")
 set(LLVM_BUILD_LIBRARY_DIR \"${LLVM_LIBRARY_DIR}\")
 set(LLVM_BUILD_MAIN_INCLUDE_DIR \"${LLVM_MAIN_INCLUDE_DIR}\")
 set(LLVM_BUILD_MAIN_SRC_DIR \"${LLVM_MAIN_SRC_DIR}\")
-set(LLVM_BUILD_TOOLS_BINARY_DIR \"${LLVM_TOOLS_BINARY_DIR}\")
 ")
 set(LLVM_CONFIG_INCLUDE_DIRS
   "${LLVM_MAIN_INCLUDE_DIR}"
@@ -31,6 +29,7 @@
   "${LLVM_LIBRARY_DIR}"
   )
 set(LLVM_CONFIG_CMAKE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
+set(LLVM_CONFIG_TOOLS_BINARY_DIR "${LLVM_TOOLS_BINARY_DIR}")
 set(LLVM_CONFIG_EXPORTS_FILE "${llvm_cmake_builddir}/LLVMExports.cmake")
 configure_file(
   LLVMConfig.cmake.in
@@ -61,6 +60,7 @@
 set(LLVM_CONFIG_INCLUDE_DIRS "\${LLVM_INSTALL_PREFIX}/include")
 set(LLVM_CONFIG_LIBRARY_DIRS "\${LLVM_INSTALL_PREFIX}/lib")
 set(LLVM_CONFIG_CMAKE_DIR "\${LLVM_INSTALL_PREFIX}/${LLVM_INSTALL_PACKAGE_DIR}")
+set(LLVM_CONFIG_TOOLS_BINARY_DIR "\${LLVM_INSTALL_PREFIX}/bin")
 set(LLVM_CONFIG_EXPORTS_FILE "\${LLVM_CMAKE_DIR}/LLVMExports.cmake")
 configure_file(
   LLVMConfig.cmake.in

diff --git a/cmake/modules/CheckAtomic.cmake b/cmake/modules/CheckAtomic.cmake
index 0d63a82..2ed4819 100644
--- a/cmake/modules/CheckAtomic.cmake
+++ b/cmake/modules/CheckAtomic.cmake

@@ -2,6 +2,11 @@
 
 INCLUDE(CheckCXXSourceCompiles)
 
+check_library_exists(atomic __atomic_fetch_add_4 "" HAVE_LIBATOMIC)
+if (HAVE_LIBATOMIC)
+  list(APPEND CMAKE_REQUIRED_LIBRARIES "atomic")
+endif()
+
 CHECK_CXX_SOURCE_COMPILES("
 #ifdef _MSC_VER
 #include <windows.h>

diff --git a/cmake/modules/CrossCompile.cmake b/cmake/modules/CrossCompile.cmake
new file mode 100644
index 0000000..400381c
--- /dev/null
+++ b/cmake/modules/CrossCompile.cmake

@@ -0,0 +1,33 @@
+if(NOT DEFINED LLVM_NATIVE_BUILD)
+  set(LLVM_NATIVE_BUILD "${CMAKE_BINARY_DIR}/native")
+  message(STATUS "Setting native build dir to ${LLVM_NATIVE_BUILD}")
+endif(NOT DEFINED LLVM_NATIVE_BUILD)
+
+add_custom_command(OUTPUT ${LLVM_NATIVE_BUILD}
+  COMMAND ${CMAKE_COMMAND} -E make_directory ${LLVM_NATIVE_BUILD}
+  COMMENT "Creating ${LLVM_NATIVE_BUILD}...")
+
+add_custom_command(OUTPUT ${LLVM_NATIVE_BUILD}/CMakeCache.txt
+  COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" ${CMAKE_SOURCE_DIR}
+  WORKING_DIRECTORY ${LLVM_NATIVE_BUILD}
+  DEPENDS ${LLVM_NATIVE_BUILD}
+  COMMENT "Configuring native LLVM...")
+
+add_custom_target(ConfigureNativeLLVM DEPENDS ${LLVM_NATIVE_BUILD}/CMakeCache.txt)
+
+set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES ${LLVM_NATIVE_BUILD})
+
+if(NOT IS_DIRECTORY ${LLVM_NATIVE_BUILD})
+  if(CMAKE_HOST_SYSTEM_NAME MATCHES "Darwin")
+    set(HOST_SYSROOT_FLAGS -DCMAKE_OSX_SYSROOT=macosx)
+  endif()
+
+  message(STATUS "Configuring native build...")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${LLVM_NATIVE_BUILD})
+
+  # Quote the targets argument so a ;-separated list stays one argument.
+  message(STATUS "Configuring native targets...")
+  execute_process(COMMAND ${CMAKE_COMMAND} -DCMAKE_BUILD_TYPE=Release
+      -G "${CMAKE_GENERATOR}" "-DLLVM_TARGETS_TO_BUILD=${LLVM_TARGETS_TO_BUILD}" ${HOST_SYSROOT_FLAGS} ${CMAKE_SOURCE_DIR}
+    WORKING_DIRECTORY ${LLVM_NATIVE_BUILD} )
+endif()

diff --git a/cmake/modules/FindSphinx.cmake b/cmake/modules/FindSphinx.cmake
index a2adcae..9d252e8 100644
--- a/cmake/modules/FindSphinx.cmake
+++ b/cmake/modules/FindSphinx.cmake

@@ -23,3 +23,5 @@
 # Provide options for controlling different types of output
 option(SPHINX_OUTPUT_HTML "Output standalone HTML files" ON)
 option(SPHINX_OUTPUT_MAN "Output man pages" ON)
+
+option(SPHINX_WARNINGS_AS_ERRORS "When building documentation treat warnings as errors" ON)

diff --git a/cmake/modules/GetSVN.cmake b/cmake/modules/GetSVN.cmake
index acccc12..4e32c09 100644
--- a/cmake/modules/GetSVN.cmake
+++ b/cmake/modules/GetSVN.cmake

@@ -2,24 +2,44 @@
 #
 # Input variables:
 #   FIRST_SOURCE_DIR  - First source directory
-#   FIRST_REPOSITORY  - The macro to define to the first revision number.
-#   SECOND_SOURCE_DIR - Second source directory
-#   SECOND_REPOSITORY - The macro to define to the second revision number.
+#   FIRST_NAME        - The macro prefix for the first repository's info
+#   SECOND_SOURCE_DIR - Second source directory (opt)
+#   SECOND_NAME       - The macro prefix for the second repository's info (opt)
 #   HEADER_FILE       - The header file to write
-include(FindSubversion)
-if (Subversion_FOUND AND EXISTS "${FIRST_SOURCE_DIR}/.svn")
-  # Repository information for the first repository.
-  Subversion_WC_INFO(${FIRST_SOURCE_DIR} MY)
-  file(WRITE ${HEADER_FILE}.txt "#define ${FIRST_REPOSITORY} \"${MY_WC_REVISION}\"\n")
+#
+# The output header will contain macros FIRST_REPOSITORY and FIRST_REVISION,
+# and SECOND_REPOSITORY and SECOND_REVISION if requested, where "FIRST" and
+# "SECOND" are substituted with the names specified in the input variables.
 
-  # Repository information for the second repository.
-  if (EXISTS "${SECOND_SOURCE_DIR}/.svn")
-    Subversion_WC_INFO(${SECOND_SOURCE_DIR} MY)
-    file(APPEND ${HEADER_FILE}.txt 
-      "#define ${SECOND_REPOSITORY} \"${MY_WC_REVISION}\"\n")
-  endif ()
+# Chop off cmake/modules/GetSVN.cmake 
+get_filename_component(LLVM_DIR "${CMAKE_SCRIPT_MODE_FILE}" PATH)
+get_filename_component(LLVM_DIR "${LLVM_DIR}" PATH)
+get_filename_component(LLVM_DIR "${LLVM_DIR}" PATH)
 
-  # Copy the file only if it has changed.
-  execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
-    ${HEADER_FILE}.txt ${HEADER_FILE})
+# Handle strange terminals
+set(ENV{TERM} "dumb")
+
+function(append_info name path)
+  # Query the helper programs under ${LLVM_DIR}/utils for ${path}.
+  execute_process(COMMAND "${LLVM_DIR}/utils/GetSourceVersion" "${path}"
+    OUTPUT_VARIABLE revision)
+  execute_process(COMMAND "${LLVM_DIR}/utils/GetRepositoryPath" "${path}"
+    OUTPUT_VARIABLE repository)
+  string(STRIP "${revision}" revision)
+  string(STRIP "${repository}" repository)
+  # Emit <name>_REVISION and <name>_REPOSITORY as string macros.
+  file(APPEND "${HEADER_FILE}.txt"
+    "#define ${name}_REVISION \"${revision}\"\n"
+    "#define ${name}_REPOSITORY \"${repository}\"\n")
+endfunction()
+
+append_info(${FIRST_NAME} "${FIRST_SOURCE_DIR}")
+if(DEFINED SECOND_SOURCE_DIR)
+  append_info(${SECOND_NAME} "${SECOND_SOURCE_DIR}")
 endif()
+
+# Copy the file only if it has changed.
+execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
+  "${HEADER_FILE}.txt" "${HEADER_FILE}")
+file(REMOVE "${HEADER_FILE}.txt")
+

diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index b8577f7..6cc6d65 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake

@@ -2,6 +2,10 @@
 # options and executing the appropriate CMake commands to realize the users'
 # selections.
 
+# This is commonly needed so make sure it's defined before we include anything
+# else.
+string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
+
 include(HandleLLVMStdlib)
 include(AddLLVMDefinitions)
 include(CheckCCompilerFlag)
@@ -25,9 +29,6 @@
       set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
       set(OLD_CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES})
       set(CMAKE_REQUIRED_FLAGS "-std=c++0x")
-      if (ANDROID)
-        set(CMAKE_REQUIRED_LIBRARIES "atomic")
-      endif()
       check_cxx_source_compiles("
 #include <atomic>
 std::atomic<float> x(0.0f);
@@ -131,7 +132,7 @@
 function(add_flag_or_print_warning flag name)
   check_c_compiler_flag("-Werror ${flag}" "C_SUPPORTS_${name}")
   check_cxx_compiler_flag("-Werror ${flag}" "CXX_SUPPORTS_${name}")
-  if ("C_SUPPORTS_${name}" AND "CXX_SUPPORTS_${name}")
+  if (C_SUPPORTS_${name} AND CXX_SUPPORTS_${name})
     message(STATUS "Building with ${flag}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${flag}" PARENT_SCOPE)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${flag}" PARENT_SCOPE)
@@ -170,6 +171,10 @@
   endif( LLVM_BUILD_32_BITS )
 endif( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
 
+if (LLVM_BUILD_STATIC)
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
+endif()
+
 if( XCODE )
   # For Xcode enable several build settings that correspond to
   # many warnings that are on by default in Clang but are
@@ -240,11 +245,16 @@
     -wd4146 # Suppress 'unary minus operator applied to unsigned type, result still unsigned'
     -wd4180 # Suppress 'qualifier applied to function type has no meaning; ignored'
     -wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data'
+    -wd4258 # Suppress ''var' : definition from the for loop is ignored; the definition from the enclosing scope is used'
     -wd4267 # Suppress ''var' : conversion from 'size_t' to 'type', possible loss of data'
     -wd4291 # Suppress ''declaration' : no matching operator delete found; memory will not be freed if initialization throws an exception'
     -wd4345 # Suppress 'behavior change: an object of POD type constructed with an initializer of the form () will be default-initialized'
     -wd4351 # Suppress 'new behavior: elements of array 'array' will be default initialized'
     -wd4355 # Suppress ''this' : used in base member initializer list'
+    -wd4456 # Suppress 'declaration of 'var' hides local variable'
+    -wd4457 # Suppress 'declaration of 'var' hides function parameter'
+    -wd4458 # Suppress 'declaration of 'var' hides class member'
+    -wd4459 # Suppress 'declaration of 'var' hides global declaration'
     -wd4503 # Suppress ''identifier' : decorated name length exceeded, name was truncated'
     -wd4624 # Suppress ''derived class' : destructor could not be generated because a base class destructor is inaccessible'
     -wd4722 # Suppress 'function' : destructor never returns, potential memory leak
@@ -270,6 +280,7 @@
 elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
   if (LLVM_ENABLE_WARNINGS)
     append("-Wall -W -Wno-unused-parameter -Wwrite-strings" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    append("-Wcast-qual" CMAKE_CXX_FLAGS)
 
     # Turn off missing field initializer warnings for gcc to avoid noise from
     # false positives with empty {}. Turn them on otherwise (they're off by
@@ -287,13 +298,25 @@
     add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
     append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
     append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
-    check_cxx_compiler_flag("-Werror -Wnon-virtual-dtor" CXX_SUPPORTS_NON_VIRTUAL_DTOR_FLAG)
-    append_if(CXX_SUPPORTS_NON_VIRTUAL_DTOR_FLAG "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
+
+    # Check if -Wnon-virtual-dtor warns even though the class is marked final.
+    # If it does, don't add it. So it won't be added on clang 3.4 and older.
+    # This also catches cases when -Wnon-virtual-dtor isn't supported by
+    # the compiler at all.
+    set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++11 -Werror=non-virtual-dtor")
+    CHECK_CXX_SOURCE_COMPILES("class base {public: virtual void anchor();protected: ~base();};
+                               class derived final : public base { public: ~derived();};
+                               int main() { return 0; }"
+                              CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR)
+    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+    append_if(CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR
+              "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
 
     # Check if -Wcomment is OK with an // comment ending with '\' if the next
     # line is also a // comment.
     set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
-    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} -Werror -Wcomment)
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror -Wcomment")
     CHECK_C_SOURCE_COMPILES("// \\\\\\n//\\nint main() {return 0;}"
                             C_WCOMMENT_ALLOWS_LINE_WRAP)
     set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
@@ -322,6 +345,25 @@
       message(FATAL_ERROR "LLVM requires C++11 support but the '-std=c++11' flag isn't supported.")
     endif()
   endif()
+  if (LLVM_ENABLE_MODULES)
+    # Probe the compiler with the module flags added only for this one check.
+    set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -fmodules -fcxx-modules")
+    # The probe must build with modules on, and the second <cassert> include
+    # (after NDEBUG is defined) must compile the assert() argument away.
+    CHECK_CXX_SOURCE_COMPILES("#undef NDEBUG
+                               #include <cassert>
+                               #define NDEBUG
+                               #include <cassert>
+                               int main() { assert(this code is not compiled); }"
+                               CXX_SUPPORTS_MODULES)
+    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+    if (NOT CXX_SUPPORTS_MODULES)
+      message(FATAL_ERROR "LLVM_ENABLE_MODULES is not supported by this compiler")
+    endif()
+    append("-fmodules" CMAKE_C_FLAGS)
+    append("-fmodules -fcxx-modules" CMAKE_CXX_FLAGS)
+  endif()
 endif( MSVC )
 
 macro(append_common_sanitizer_flags)
@@ -350,6 +392,13 @@
       if(LLVM_USE_SANITIZER STREQUAL "MemoryWithOrigins")
         append("-fsanitize-memory-track-origins" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
       endif()
+    elseif (LLVM_USE_SANITIZER STREQUAL "Undefined")
+      append_common_sanitizer_flags()
+      append("-fsanitize=undefined -fno-sanitize=vptr,function -fno-sanitize-recover"
+              CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    elseif (LLVM_USE_SANITIZER STREQUAL "Thread")
+      append_common_sanitizer_flags()
+      append("-fsanitize=thread" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
     else()
       message(WARNING "Unsupported value of LLVM_USE_SANITIZER: ${LLVM_USE_SANITIZER}")
     endif()
@@ -360,7 +409,7 @@
 
 # Turn on -gsplit-dwarf if requested
 if(LLVM_USE_SPLIT_DWARF)
-  add_llvm_definitions("-gsplit-dwarf")
+  add_definitions("-gsplit-dwarf")
 endif()
 
 add_llvm_definitions( -D__STDC_CONSTANT_MACROS )
@@ -408,11 +457,21 @@
   string(REGEX REPLACE "(^| ) */GR-? *( |$)" "\\1 \\2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 endif()
 
+# Provide public options to globally control RTTI and EH
+option(LLVM_ENABLE_EH "Enable Exception handling" OFF)
+option(LLVM_ENABLE_RTTI "Enable run time type information" OFF)
+if(LLVM_ENABLE_EH AND NOT LLVM_ENABLE_RTTI)
+  message(FATAL_ERROR "Exception handling requires RTTI. You must set LLVM_ENABLE_RTTI to ON")
+endif()
+
 # Plugin support
 # FIXME: Make this configurable.
 if(WIN32 OR CYGWIN)
-  # DLL platform(s) don't support plugins.
-  set(LLVM_ENABLE_PLUGINS OFF)
+  if(BUILD_SHARED_LIBS)
+    set(LLVM_ENABLE_PLUGINS ON)
+  else()
+    set(LLVM_ENABLE_PLUGINS OFF)
+  endif()
 else()
   set(LLVM_ENABLE_PLUGINS ON)
 endif()

diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake
index 2783af8..e8c42fc 100644
--- a/cmake/modules/LLVM-Config.cmake
+++ b/cmake/modules/LLVM-Config.cmake

@@ -40,9 +40,9 @@
 
   llvm_map_components_to_libnames(LIBRARIES ${link_components})
   get_target_property(t ${executable} TYPE)
-  if("${t}" STREQUAL "STATIC_LIBRARY")
+  if("x${t}" STREQUAL "xSTATIC_LIBRARY")
     target_link_libraries(${executable} ${cmake_2_8_12_INTERFACE} ${LIBRARIES})
-  elseif("${t}" STREQUAL "SHARED_LIBRARY" OR "${t}" STREQUAL "MODULE_LIBRARY")
+  elseif("x${t}" STREQUAL "xSHARED_LIBRARY" OR "x${t}" STREQUAL "xMODULE_LIBRARY")
     target_link_libraries(${executable} ${cmake_2_8_12_PRIVATE} ${LIBRARIES})
   else()
     # Use plain form for legacy user.
@@ -51,12 +51,14 @@
 endfunction(explicit_llvm_config)
 
 
-# This is a variant intended for the final user:
+# Deprecated: use llvm_map_components_to_libnames() instead.
 function(llvm_map_components_to_libraries OUT_VAR)
+  message(AUTHOR_WARNING "Using llvm_map_components_to_libraries() is deprecated. Use llvm_map_components_to_libnames() instead")
   explicit_map_components_to_libraries(result ${ARGN})
   set( ${OUT_VAR} ${result} ${sys_result} PARENT_SCOPE )
 endfunction(llvm_map_components_to_libraries)
 
+# This is a variant intended for the final user:
 # Map LINK_COMPONENTS to actual libnames.
 function(llvm_map_components_to_libnames out_libs)
   set( link_components ${ARGN} )
@@ -105,6 +107,9 @@
       if( TARGET LLVM${c}AsmParser )
         list(APPEND expanded_components "LLVM${c}AsmParser")
       endif()
+      if( TARGET LLVM${c}Desc )
+        list(APPEND expanded_components "LLVM${c}Desc")
+      endif()
       if( TARGET LLVM${c}Info )
         list(APPEND expanded_components "LLVM${c}Info")
       endif()
@@ -115,6 +120,12 @@
       # already processed
     elseif( c STREQUAL "nativecodegen" )
       list(APPEND expanded_components "LLVM${LLVM_NATIVE_ARCH}CodeGen")
+      if( TARGET LLVM${LLVM_NATIVE_ARCH}Desc )
+        list(APPEND expanded_components "LLVM${LLVM_NATIVE_ARCH}Desc")
+      endif()
+      if( TARGET LLVM${LLVM_NATIVE_ARCH}Info )
+        list(APPEND expanded_components "LLVM${LLVM_NATIVE_ARCH}Info")
+      endif()
     elseif( c STREQUAL "backend" )
       # same case as in `native'.
     elseif( c STREQUAL "engine" )

diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in
index 780001a..780a608 100644
--- a/cmake/modules/LLVMConfig.cmake.in
+++ b/cmake/modules/LLVMConfig.cmake.in

@@ -21,6 +21,12 @@
 
 set(TARGET_TRIPLE "@TARGET_TRIPLE@")
 
+set(LLVM_ENABLE_ASSERTIONS @LLVM_ENABLE_ASSERTIONS@)
+
+set(LLVM_ENABLE_EH @LLVM_ENABLE_EH@)
+
+set(LLVM_ENABLE_RTTI @LLVM_ENABLE_RTTI@)
+
 set(LLVM_ENABLE_TERMINFO @LLVM_ENABLE_TERMINFO@)
 
 set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@)
@@ -38,6 +44,7 @@
 set(LLVM_LIBRARY_DIRS "@LLVM_CONFIG_LIBRARY_DIRS@")
 set(LLVM_DEFINITIONS "-D__STDC_LIMIT_MACROS" "-D__STDC_CONSTANT_MACROS")
 set(LLVM_CMAKE_DIR "@LLVM_CONFIG_CMAKE_DIR@")
+set(LLVM_TOOLS_BINARY_DIR "@LLVM_CONFIG_TOOLS_BINARY_DIR@")
 
 if(NOT TARGET LLVMSupport)
   include("@LLVM_CONFIG_EXPORTS_FILE@")

diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index 08b9c8e..64ebce8 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake

@@ -59,12 +59,17 @@
   file(GLOB globbed *.c *.cpp)
   foreach(g ${globbed})
     get_filename_component(fn ${g} NAME)
-    list(FIND LLVM_OPTIONAL_SOURCES ${fn} idx)
-    if( idx LESS 0 )
-      list(FIND listed ${fn} idx)
+
+    # Don't reject hidden files. Some editors create backups in the
+    # same directory as the file.
+    if (NOT "${fn}" MATCHES "^\\.")
+      list(FIND LLVM_OPTIONAL_SOURCES ${fn} idx)
       if( idx LESS 0 )
-        message(SEND_ERROR "Found unknown source file ${g}
+        list(FIND listed ${fn} idx)
+        if( idx LESS 0 )
+          message(SEND_ERROR "Found unknown source file ${g}
 Please update ${CMAKE_CURRENT_LIST_FILE}\n")
+        endif()
       endif()
     endif()
   endforeach()

diff --git a/cmake/modules/Makefile b/cmake/modules/Makefile
index 265c1f8..dd31aa7 100644
--- a/cmake/modules/Makefile
+++ b/cmake/modules/Makefile

@@ -15,6 +15,37 @@
 
 PROJ_cmake := $(DESTDIR)$(PROJ_prefix)/share/llvm/cmake
 
+ifeq ($(DISABLE_ASSERTIONS),1)
+	LLVM_ENABLE_ASSERTIONS := 0
+else
+	LLVM_ENABLE_ASSERTIONS := 1
+endif
+
+ifeq ($(REQUIRES_EH),1)
+	LLVM_ENABLE_EH := 1
+else
+	LLVM_ENABLE_EH := 0
+endif
+
+ifeq ($(REQUIRES_RTTI),1)
+	LLVM_ENABLE_RTTI := 1
+else
+	LLVM_ENABLE_RTTI := 0
+endif
+
+# Don't try to run llvm-config during clean because it won't be available
+ifneq ($(MAKECMDGOALS),clean)
+LLVM_LIBS_TO_EXPORT := $(subst -l,,$(shell $(LLVM_CONFIG) --libs $(LINK_COMPONENTS) || echo Error))
+
+ifeq ($(LLVM_LIBS_TO_EXPORT),Error)
+$(error llvm-config --libs failed)
+endif
+
+ifndef LLVM_LIBS_TO_EXPORT
+$(error LLVM_LIBS_TO_EXPORT cannot be empty)
+endif
+endif
+
 OBJMODS := LLVMConfig.cmake LLVMConfigVersion.cmake LLVMExports.cmake
 
 $(PROJ_OBJ_DIR)/LLVMConfig.cmake: LLVMConfig.cmake.in $(LLVMBuildCMakeFrag)
@@ -27,11 +58,14 @@
 	  -e 's/@LLVM_VERSION_PATCH@/'"$(LLVM_VERSION_PATCH)"'/' \
 	  -e 's/@PACKAGE_VERSION@/'"$(LLVMVersion)"'/' \
 	  -e 's/@LLVM_COMMON_DEPENDS@//' \
-	  -e 's/@LLVM_AVAILABLE_LIBS@/'"$(subst -l,,$(LLVMConfigLibs))"'/' \
+	  -e 's/@LLVM_AVAILABLE_LIBS@/'"$(LLVM_LIBS_TO_EXPORT)"'/' \
 	  -e 's/@LLVM_ALL_TARGETS@/'"$(ALL_TARGETS)"'/' \
 	  -e 's/@LLVM_TARGETS_TO_BUILD@/'"$(TARGETS_TO_BUILD)"'/' \
 	  -e 's/@LLVM_TARGETS_WITH_JIT@/'"$(TARGETS_WITH_JIT)"'/' \
 	  -e 's/@TARGET_TRIPLE@/'"$(TARGET_TRIPLE)"'/' \
+	  -e 's/@LLVM_ENABLE_ASSERTIONS@/'"$(LLVM_ENABLE_ASSERTIONS)"'/' \
+	  -e 's/@LLVM_ENABLE_EH@/'"$(LLVM_ENABLE_EH)"'/' \
+	  -e 's/@LLVM_ENABLE_RTTI@/'"$(LLVM_ENABLE_RTTI)"'/' \
 	  -e 's/@LLVM_ENABLE_TERMINFO@/'"$(ENABLE_TERMINFO)"'/' \
 	  -e 's/@LLVM_ENABLE_THREADS@/'"$(ENABLE_THREADS)"'/' \
 	  -e 's/@LLVM_ENABLE_ZLIB@/'"$(ENABLE_ZLIB)"'/' \
@@ -42,6 +76,7 @@
 	  -e 's/@LLVM_CONFIG_INCLUDE_DIRS@/'"$(subst /,\/,$(PROJ_includedir))"'/' \
 	  -e 's/@LLVM_CONFIG_LIBRARY_DIRS@/'"$(subst /,\/,$(PROJ_libdir))"'/' \
 	  -e 's/@LLVM_CONFIG_CMAKE_DIR@/'"$(subst /,\/,$(PROJ_cmake))"'/' \
+	  -e 's/@LLVM_CONFIG_TOOLS_BINARY_DIR@/'"$(subst /,\/,$(PROJ_bindir))"'/' \
 	  -e 's/@LLVM_CONFIG_EXPORTS_FILE@/$${LLVM_CMAKE_DIR}\/LLVMExports.cmake/' \
 	  -e 's/@all_llvm_lib_deps@//' \
 	 && \
@@ -61,7 +96,7 @@
 	$(Echo) 'Generating LLVM CMake target exports file'
 	$(Verb) ( \
 	  echo '# LLVM CMake target exports.  Do not include directly.' && \
-	  for lib in $(subst -l,,$(LLVMConfigLibs)); do \
+	  for lib in $(LLVM_LIBS_TO_EXPORT); do \
 	    echo 'add_library('"$$lib"' STATIC IMPORTED)' && \
 	    echo 'set_property(TARGET '"$$lib"' PROPERTY IMPORTED_LOCATION "'"$(PROJ_libdir)/lib$$lib.a"'")' ; \
 	  done && \

diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake
index 34b0343..67031a5 100644
--- a/cmake/modules/TableGen.cmake
+++ b/cmake/modules/TableGen.cmake

@@ -70,26 +70,6 @@
   set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} ${target} PARENT_SCOPE)
 endfunction()
 
-if(CMAKE_CROSSCOMPILING)
-  set(CX_NATIVE_TG_DIR "${CMAKE_BINARY_DIR}/native")
-
-  add_custom_command(OUTPUT ${CX_NATIVE_TG_DIR}
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${CX_NATIVE_TG_DIR}
-    COMMENT "Creating ${CX_NATIVE_TG_DIR}...")
-
-  add_custom_command(OUTPUT ${CX_NATIVE_TG_DIR}/CMakeCache.txt
-    COMMAND ${CMAKE_COMMAND} -UMAKE_TOOLCHAIN_FILE -DCMAKE_BUILD_TYPE=Release
-                             -DLLVM_BUILD_POLLY=OFF
-                             -G "${CMAKE_GENERATOR}" ${CMAKE_SOURCE_DIR}
-    WORKING_DIRECTORY ${CX_NATIVE_TG_DIR}
-    DEPENDS ${CX_NATIVE_TG_DIR}
-    COMMENT "Configuring native TableGen...")
-
-  add_custom_target(ConfigureNativeTableGen DEPENDS ${CX_NATIVE_TG_DIR}/CMakeCache.txt)
-
-  set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES ${CX_NATIVE_TG_DIR})
-endif()
-
 macro(add_tablegen target project)
   set(${target}_OLD_LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS})
   set(LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS} TableGen)
@@ -113,16 +93,16 @@
 
   if(CMAKE_CROSSCOMPILING)
     if( ${${project}_TABLEGEN} STREQUAL "${target}" )
-      set(${project}_TABLEGEN_EXE "${CX_NATIVE_TG_DIR}/bin/${target}")
+      set(${project}_TABLEGEN_EXE "${LLVM_NATIVE_BUILD}/bin/${target}")
       set(${project}_TABLEGEN_EXE ${${project}_TABLEGEN_EXE} PARENT_SCOPE)
 
       add_custom_command(OUTPUT ${${project}_TABLEGEN_EXE}
-        COMMAND ${CMAKE_BUILD_TOOL} ${target}
-        DEPENDS ${CX_NATIVE_TG_DIR}/CMakeCache.txt
-        WORKING_DIRECTORY ${CX_NATIVE_TG_DIR}
+        COMMAND ${CMAKE_COMMAND} --build . --target ${target} --config $<CONFIGURATION>
+        DEPENDS ${LLVM_NATIVE_BUILD}/CMakeCache.txt
+        WORKING_DIRECTORY ${LLVM_NATIVE_BUILD}
         COMMENT "Building native TableGen...")
       add_custom_target(${project}NativeTableGen DEPENDS ${${project}_TABLEGEN_EXE})
-      add_dependencies(${project}NativeTableGen ConfigureNativeTableGen)
+      add_dependencies(${project}NativeTableGen ConfigureNativeLLVM)
 
       add_dependencies(${target} ${project}NativeTableGen)
     endif()

diff --git a/cmake/platforms/iOS.cmake b/cmake/platforms/iOS.cmake
new file mode 100644
index 0000000..4973643
--- /dev/null
+++ b/cmake/platforms/iOS.cmake

@@ -0,0 +1,47 @@
+# Toolchain config for iOS.
+#
+# Usage (SDKROOT must be set in the environment; it is passed as -isysroot):
+# mkdir build; cd build
+# cmake ..; make
+# mkdir ios; cd ios
+# cmake -DLLVM_IOS_TOOLCHAIN_DIR=/path/to/ios/ndk \
+#   -DCMAKE_TOOLCHAIN_FILE=../../cmake/platforms/iOS.cmake ../..
+# make <target>
+
+SET(CMAKE_SYSTEM_NAME Darwin)
+SET(CMAKE_SYSTEM_VERSION 13)
+SET(CMAKE_CXX_COMPILER_WORKS True)
+SET(CMAKE_C_COMPILER_WORKS True)
+SET(DARWIN_TARGET_OS_NAME ios)
+
+IF(NOT DEFINED ENV{SDKROOT})
+ MESSAGE(FATAL_ERROR "SDKROOT env var must be set: " $ENV{SDKROOT})
+ENDIF()
+
+# Default the C compiler to the clang that xcrun finds for the iphoneos SDK.
+IF(NOT CMAKE_C_COMPILER)
+  execute_process(COMMAND xcrun -sdk iphoneos -find clang
+    OUTPUT_VARIABLE CMAKE_C_COMPILER
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  message(STATUS "Using c compiler ${CMAKE_C_COMPILER}")
+ENDIF()
+
+# Default the C++ compiler to the clang++ that xcrun finds for the iphoneos SDK.
+IF(NOT CMAKE_CXX_COMPILER)
+  execute_process(COMMAND xcrun -sdk iphoneos -find clang++
+   OUTPUT_VARIABLE CMAKE_CXX_COMPILER
+   ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  message(STATUS "Using c++ compiler ${CMAKE_CXX_COMPILER}")
+ENDIF()
+
+# Unless already defined, use the SDKVersion xcodebuild reports for iphoneos.
+IF(NOT DEFINED IOS_MIN_TARGET)
+  execute_process(COMMAND xcodebuild -sdk iphoneos -version SDKVersion
+    OUTPUT_VARIABLE IOS_MIN_TARGET
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+ENDIF()
+
+SET(IOS_COMMON_FLAGS "-isysroot $ENV{SDKROOT} -mios-version-min=${IOS_MIN_TARGET}")
+SET(CMAKE_C_FLAGS "${IOS_COMMON_FLAGS}" CACHE STRING "toolchain_cflags" FORCE)
+SET(CMAKE_CXX_FLAGS "${IOS_COMMON_FLAGS}" CACHE STRING "toolchain_cxxflags" FORCE)
+SET(CMAKE_LINK_FLAGS "${IOS_COMMON_FLAGS}" CACHE STRING "toolchain_linkflags" FORCE)

diff --git a/configure b/configure
index e9aba06..4bfc2e5 100755
--- a/configure
+++ b/configure

@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.60 for LLVM 3.5.0svn.
+# Generated by GNU Autoconf 2.60 for LLVM 3.6.0svn.
 #
 # Report bugs to <http://llvm.org/bugs/>.
 #
@@ -14,7 +14,7 @@
 ## M4sh Initialization.  ##
 ## --------------------- ##
 
-# Be Bourne compatible.
+# Be Bourne compatible
 if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
   emulate sh
   NULLCMD=:
@@ -561,8 +561,8 @@
 # Identity of this package.
 PACKAGE_NAME='LLVM'
 PACKAGE_TARNAME='llvm'
-PACKAGE_VERSION='3.5.0svn'
-PACKAGE_STRING='LLVM 3.5.0svn'
+PACKAGE_VERSION='3.6.0svn'
+PACKAGE_STRING='LLVM 3.6.0svn'
 PACKAGE_BUGREPORT='http://llvm.org/bugs/'
 
 ac_unique_file="lib/IR/Module.cpp"
@@ -691,7 +691,6 @@
 ENABLE_CLANG_ARCMT
 CLANG_PLUGIN_SUPPORT
 ENABLE_CLANG_STATIC_ANALYZER
-ENABLE_CLANG_REWRITER
 ENABLE_OPTIMIZED
 ENABLE_PROFILING
 DISABLE_ASSERTIONS
@@ -742,15 +741,7 @@
 SED
 TAR
 BINPWD
-GRAPHVIZ
 DOT
-FDP
-NEATO
-TWOPI
-CIRCO
-GV
-DOTTY
-XDOT
 INSTALL_PROGRAM
 INSTALL_SCRIPT
 INSTALL_DATA
@@ -761,10 +752,8 @@
 GZIPBIN
 PDFROFF
 ZIP
-OCAMLC
-OCAMLOPT
-OCAMLDEP
-OCAMLDOC
+GO
+OCAMLFIND
 GAS
 HAVE_LINK_VERSION_SCRIPT
 EGREP
@@ -773,16 +762,15 @@
 COVERED_SWITCH_DEFAULT
 NO_MAYBE_UNINITIALIZED
 NO_UNINITIALIZED
+NO_COMMENT
 PYTHON
 HAVE_DLOPEN
 HAVE_TERMINFO
-USE_UDIS86
 USE_OPROFILE
 USE_INTEL_JITEVENTS
 XML2CONFIG
 LIBXML2_LIBS
 LIBXML2_INC
-CXXCPP
 HAVE_PTHREAD
 HAVE_LIBZ
 HUGE_VAL_SANITY
@@ -798,7 +786,8 @@
 LLVM_MANDIR
 LLVM_CONFIGTIME
 BINDINGS_TO_BUILD
-ALL_BINDINGS
+HAVE_OCAMLOPT
+HAVE_OCAML_OUNIT
 OCAML_LIBDIR
 ENABLE_VISIBILITY_INLINES_HIDDEN
 RPATH
@@ -817,8 +806,7 @@
 CXX
 CXXFLAGS
 CCC
-CPP
-CXXCPP'
+CPP'
 ac_subdirs_all='projects/test-suite
 projects/llvm-test
 projects/poolalloc
@@ -1326,7 +1314,7 @@
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures LLVM 3.5.0svn to adapt to many kinds of systems.
+\`configure' configures LLVM 3.6.0svn to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1392,7 +1380,7 @@
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of LLVM 3.5.0svn:";;
+     short | recursive ) echo "Configuration of LLVM 3.6.0svn:";;
    esac
   cat <<\_ACEOF
 
@@ -1412,7 +1400,6 @@
   --enable-clang-static-analyzer
                           Enable building of clang Static Analyzer (default is
                           YES)
-  --enable-clang-rewriter Enable building of clang rewriter (default is YES)
   --enable-optimized      Compile with optimizations enabled (default is NO)
   --enable-profiling      Compile with profiling enabled (default is NO)
   --enable-assertions     Compile with assertion checks enabled (default is
@@ -1483,7 +1470,6 @@
                           submitted (default=http://llvm.org/bugs/)
   --with-internal-prefix  Installation directory for internal files
   --with-python           path to python
-  --with-udis86=<path>    Use udis86 external x86 disassembler library
   --with-oprofile=<prefix>
                           Tell OProfile >= 0.9.4 how to symbolize JIT output
   --with-intel-jitevents  Notify Intel JIT profiling API of generated code
@@ -1499,7 +1485,6 @@
   CXX         C++ compiler command
   CXXFLAGS    C++ compiler flags
   CPP         C preprocessor
-  CXXCPP      C++ preprocessor
 
 Use these variables to override the choices made by `configure' or to help
 it to find libraries and programs with nonstandard names/locations.
@@ -1565,7 +1550,7 @@
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-LLVM configure 3.5.0svn
+LLVM configure 3.6.0svn
 generated by GNU Autoconf 2.60
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1581,7 +1566,7 @@
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by LLVM $as_me 3.5.0svn, which was
+It was created by LLVM $as_me 3.6.0svn, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   $ $0 $@
@@ -1936,7 +1921,7 @@
 
 
 LLVM_VERSION_MAJOR=3
-LLVM_VERSION_MINOR=5
+LLVM_VERSION_MINOR=6
 LLVM_VERSION_PATCH=0
 LLVM_VERSION_SUFFIX=svn
 
@@ -1956,6 +1941,11 @@
 _ACEOF
 
 
+cat >>confdefs.h <<_ACEOF
+#define LLVM_VERSION_STRING "$PACKAGE_VERSION"
+_ACEOF
+
+
 
 
 
@@ -4001,11 +3991,6 @@
     llvm_cv_no_link_all_option="-Wl,-z,defaultextract"
     llvm_cv_os_type="SunOS"
     llvm_cv_platform_type="Unix" ;;
-  *-*-auroraux*)
-    llvm_cv_link_all_option="-Wl,-z,allextract"
-    llvm_cv_link_all_option="-Wl,-z,defaultextract"
-    llvm_cv_os_type="AuroraUX"
-    llvm_cv_platform_type="Unix" ;;
   *-*-win32*)
     llvm_cv_link_all_option="-Wl,--whole-archive"
     llvm_cv_no_link_all_option="-Wl,--no-whole-archive"
@@ -4077,8 +4062,6 @@
     llvm_cv_target_os_type="GNU" ;;
   *-*-solaris*)
     llvm_cv_target_os_type="SunOS" ;;
-  *-*-auroraux*)
-    llvm_cv_target_os_type="AuroraUX" ;;
   *-*-win32*)
     llvm_cv_target_os_type="Win32" ;;
   *-*-mingw*)
@@ -4921,37 +4904,6 @@
    { (exit 1); exit 1; }; } ;;
 esac
 
-# Check whether --enable-clang-rewriter was given.
-if test "${enable_clang_rewriter+set}" = set; then
-  enableval=$enable_clang_rewriter;
-else
-  enableval="yes"
-fi
-
-case "$enableval" in
-  yes) ENABLE_CLANG_REWRITER=1
- ;;
-  no)
-    if test ${clang_arcmt} != "no" ; then
-      { { echo "$as_me:$LINENO: error: Cannot enable clang ARC Migration Tool while disabling rewriter." >&5
-echo "$as_me: error: Cannot enable clang ARC Migration Tool while disabling rewriter." >&2;}
-   { (exit 1); exit 1; }; }
-    fi
-    if test ${clang_static_analyzer} != "no" ; then
-      { { echo "$as_me:$LINENO: error: Cannot enable clang static analyzer while disabling rewriter." >&5
-echo "$as_me: error: Cannot enable clang static analyzer while disabling rewriter." >&2;}
-   { (exit 1); exit 1; }; }
-    fi
-    ENABLE_CLANG_REWRITER=0
-
-    ;;
-  default) ENABLE_CLANG_REWRITER=1
-;;
-  *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-clang-rewriter. Use \"yes\" or \"no\"" >&5
-echo "$as_me: error: Invalid setting for --enable-clang-rewriter. Use \"yes\" or \"no\"" >&2;}
-   { (exit 1); exit 1; }; } ;;
-esac
-
 # Check whether --enable-optimized was given.
 if test "${enable_optimized+set}" = set; then
   enableval=$enable_optimized;
@@ -6500,62 +6452,6 @@
 
 
 
-# Extract the first word of "Graphviz", so it can be a program name with args.
-set dummy Graphviz; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_GRAPHVIZ+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $GRAPHVIZ in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_GRAPHVIZ="$GRAPHVIZ" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_GRAPHVIZ="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  test -z "$ac_cv_path_GRAPHVIZ" && ac_cv_path_GRAPHVIZ="echo Graphviz"
-  ;;
-esac
-fi
-GRAPHVIZ=$ac_cv_path_GRAPHVIZ
-if test -n "$GRAPHVIZ"; then
-  { echo "$as_me:$LINENO: result: $GRAPHVIZ" >&5
-echo "${ECHO_T}$GRAPHVIZ" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-if test "$GRAPHVIZ" != "echo Graphviz" ; then
-
-cat >>confdefs.h <<\_ACEOF
-#define HAVE_GRAPHVIZ 1
-_ACEOF
-
-    if test "$llvm_cv_os_type" = "MingW" ; then
-    GRAPHVIZ=`echo $GRAPHVIZ | sed 's/^\/\([A-Za-z]\)\//\1:\//' `
-  fi
-
-cat >>confdefs.h <<_ACEOF
-#define LLVM_PATH_GRAPHVIZ "$GRAPHVIZ${EXEEXT}"
-_ACEOF
-
-fi
 # Extract the first word of "dot", so it can be a program name with args.
 set dummy dot; ac_word=$2
 { echo "$as_me:$LINENO: checking for $ac_word" >&5
@@ -6612,408 +6508,6 @@
 _ACEOF
 
 fi
-# Extract the first word of "fdp", so it can be a program name with args.
-set dummy fdp; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_FDP+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $FDP in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_FDP="$FDP" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_FDP="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  test -z "$ac_cv_path_FDP" && ac_cv_path_FDP="echo fdp"
-  ;;
-esac
-fi
-FDP=$ac_cv_path_FDP
-if test -n "$FDP"; then
-  { echo "$as_me:$LINENO: result: $FDP" >&5
-echo "${ECHO_T}$FDP" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-if test "$FDP" != "echo fdp" ; then
-
-cat >>confdefs.h <<\_ACEOF
-#define HAVE_FDP 1
-_ACEOF
-
-    if test "$llvm_cv_os_type" = "MingW" ; then
-    FDP=`echo $FDP | sed 's/^\/\([A-Za-z]\)\//\1:\//' `
-  fi
-
-cat >>confdefs.h <<_ACEOF
-#define LLVM_PATH_FDP "$FDP${EXEEXT}"
-_ACEOF
-
-fi
-# Extract the first word of "neato", so it can be a program name with args.
-set dummy neato; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_NEATO+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $NEATO in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_NEATO="$NEATO" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_NEATO="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  test -z "$ac_cv_path_NEATO" && ac_cv_path_NEATO="echo neato"
-  ;;
-esac
-fi
-NEATO=$ac_cv_path_NEATO
-if test -n "$NEATO"; then
-  { echo "$as_me:$LINENO: result: $NEATO" >&5
-echo "${ECHO_T}$NEATO" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-if test "$NEATO" != "echo neato" ; then
-
-cat >>confdefs.h <<\_ACEOF
-#define HAVE_NEATO 1
-_ACEOF
-
-    if test "$llvm_cv_os_type" = "MingW" ; then
-    NEATO=`echo $NEATO | sed 's/^\/\([A-Za-z]\)\//\1:\//' `
-  fi
-
-cat >>confdefs.h <<_ACEOF
-#define LLVM_PATH_NEATO "$NEATO${EXEEXT}"
-_ACEOF
-
-fi
-# Extract the first word of "twopi", so it can be a program name with args.
-set dummy twopi; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_TWOPI+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $TWOPI in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_TWOPI="$TWOPI" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_TWOPI="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  test -z "$ac_cv_path_TWOPI" && ac_cv_path_TWOPI="echo twopi"
-  ;;
-esac
-fi
-TWOPI=$ac_cv_path_TWOPI
-if test -n "$TWOPI"; then
-  { echo "$as_me:$LINENO: result: $TWOPI" >&5
-echo "${ECHO_T}$TWOPI" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-if test "$TWOPI" != "echo twopi" ; then
-
-cat >>confdefs.h <<\_ACEOF
-#define HAVE_TWOPI 1
-_ACEOF
-
-    if test "$llvm_cv_os_type" = "MingW" ; then
-    TWOPI=`echo $TWOPI | sed 's/^\/\([A-Za-z]\)\//\1:\//' `
-  fi
-
-cat >>confdefs.h <<_ACEOF
-#define LLVM_PATH_TWOPI "$TWOPI${EXEEXT}"
-_ACEOF
-
-fi
-# Extract the first word of "circo", so it can be a program name with args.
-set dummy circo; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_CIRCO+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $CIRCO in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_CIRCO="$CIRCO" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_CIRCO="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  test -z "$ac_cv_path_CIRCO" && ac_cv_path_CIRCO="echo circo"
-  ;;
-esac
-fi
-CIRCO=$ac_cv_path_CIRCO
-if test -n "$CIRCO"; then
-  { echo "$as_me:$LINENO: result: $CIRCO" >&5
-echo "${ECHO_T}$CIRCO" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-if test "$CIRCO" != "echo circo" ; then
-
-cat >>confdefs.h <<\_ACEOF
-#define HAVE_CIRCO 1
-_ACEOF
-
-    if test "$llvm_cv_os_type" = "MingW" ; then
-    CIRCO=`echo $CIRCO | sed 's/^\/\([A-Za-z]\)\//\1:\//' `
-  fi
-
-cat >>confdefs.h <<_ACEOF
-#define LLVM_PATH_CIRCO "$CIRCO${EXEEXT}"
-_ACEOF
-
-fi
-for ac_prog in gv gsview32
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_GV+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $GV in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_GV="$GV" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_GV="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  ;;
-esac
-fi
-GV=$ac_cv_path_GV
-if test -n "$GV"; then
-  { echo "$as_me:$LINENO: result: $GV" >&5
-echo "${ECHO_T}$GV" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-  test -n "$GV" && break
-done
-test -n "$GV" || GV="echo gv"
-
-if test "$GV" != "echo gv" ; then
-
-cat >>confdefs.h <<\_ACEOF
-#define HAVE_GV 1
-_ACEOF
-
-    if test "$llvm_cv_os_type" = "MingW" ; then
-    GV=`echo $GV | sed 's/^\/\([A-Za-z]\)\//\1:\//' `
-  fi
-
-cat >>confdefs.h <<_ACEOF
-#define LLVM_PATH_GV "$GV${EXEEXT}"
-_ACEOF
-
-fi
-# Extract the first word of "dotty", so it can be a program name with args.
-set dummy dotty; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_DOTTY+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $DOTTY in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_DOTTY="$DOTTY" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_DOTTY="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  test -z "$ac_cv_path_DOTTY" && ac_cv_path_DOTTY="echo dotty"
-  ;;
-esac
-fi
-DOTTY=$ac_cv_path_DOTTY
-if test -n "$DOTTY"; then
-  { echo "$as_me:$LINENO: result: $DOTTY" >&5
-echo "${ECHO_T}$DOTTY" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-if test "$DOTTY" != "echo dotty" ; then
-
-cat >>confdefs.h <<\_ACEOF
-#define HAVE_DOTTY 1
-_ACEOF
-
-    if test "$llvm_cv_os_type" = "MingW" ; then
-    DOTTY=`echo $DOTTY | sed 's/^\/\([A-Za-z]\)\//\1:\//' `
-  fi
-
-cat >>confdefs.h <<_ACEOF
-#define LLVM_PATH_DOTTY "$DOTTY${EXEEXT}"
-_ACEOF
-
-fi
-for ac_prog in xdot xdot.py
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_XDOT+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $XDOT in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_XDOT="$XDOT" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_XDOT="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  ;;
-esac
-fi
-XDOT=$ac_cv_path_XDOT
-if test -n "$XDOT"; then
-  { echo "$as_me:$LINENO: result: $XDOT" >&5
-echo "${ECHO_T}$XDOT" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-  test -n "$XDOT" && break
-done
-test -n "$XDOT" || XDOT="echo xdot"
-
-if test "$XDOT" != "echo xdot" ; then
-
-cat >>confdefs.h <<\_ACEOF
-#define HAVE_XDOT 1
-_ACEOF
-
-    if test "$llvm_cv_os_type" = "MingW" ; then
-    XDOT=`echo $XDOT | sed 's/^\/\([A-Za-z]\)\//\1:\//' `
-  fi
-
-cat >>confdefs.h <<_ACEOF
-#define LLVM_PATH_XDOT "$XDOT${EXEEXT}"
-_ACEOF
-
-fi
 
 # Find a good install program.  We prefer a C program (faster),
 # so one script is as good as another.  But avoid the broken or
@@ -7380,18 +6874,16 @@
 fi
 
 
-for ac_prog in ocamlc
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
+# Extract the first word of "go", so it can be a program name with args.
+set dummy go; ac_word=$2
 { echo "$as_me:$LINENO: checking for $ac_word" >&5
 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_OCAMLC+set}" = set; then
+if test "${ac_cv_path_GO+set}" = set; then
   echo $ECHO_N "(cached) $ECHO_C" >&6
 else
-  case $OCAMLC in
+  case $GO in
   [\\/]* | ?:[\\/]*)
-  ac_cv_path_OCAMLC="$OCAMLC" # Let the user override the test with a path.
+  ac_cv_path_GO="$GO" # Let the user override the test with a path.
   ;;
   *)
   as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
@@ -7401,7 +6893,7 @@
   test -z "$as_dir" && as_dir=.
   for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_OCAMLC="$as_dir/$ac_word$ac_exec_ext"
+    ac_cv_path_GO="$as_dir/$ac_word$ac_exec_ext"
     echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
@@ -7412,31 +6904,28 @@
   ;;
 esac
 fi
-OCAMLC=$ac_cv_path_OCAMLC
-if test -n "$OCAMLC"; then
-  { echo "$as_me:$LINENO: result: $OCAMLC" >&5
-echo "${ECHO_T}$OCAMLC" >&6; }
+GO=$ac_cv_path_GO
+if test -n "$GO"; then
+  { echo "$as_me:$LINENO: result: $GO" >&5
+echo "${ECHO_T}$GO" >&6; }
 else
   { echo "$as_me:$LINENO: result: no" >&5
 echo "${ECHO_T}no" >&6; }
 fi
 
 
-  test -n "$OCAMLC" && break
-done
-
-for ac_prog in ocamlopt
+for ac_prog in ocamlfind
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
 { echo "$as_me:$LINENO: checking for $ac_word" >&5
 echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_OCAMLOPT+set}" = set; then
+if test "${ac_cv_path_OCAMLFIND+set}" = set; then
   echo $ECHO_N "(cached) $ECHO_C" >&6
 else
-  case $OCAMLOPT in
+  case $OCAMLFIND in
   [\\/]* | ?:[\\/]*)
-  ac_cv_path_OCAMLOPT="$OCAMLOPT" # Let the user override the test with a path.
+  ac_cv_path_OCAMLFIND="$OCAMLFIND" # Let the user override the test with a path.
   ;;
   *)
   as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
@@ -7446,7 +6935,7 @@
   test -z "$as_dir" && as_dir=.
   for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_OCAMLOPT="$as_dir/$ac_word$ac_exec_ext"
+    ac_cv_path_OCAMLFIND="$as_dir/$ac_word$ac_exec_ext"
     echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
@@ -7457,107 +6946,17 @@
   ;;
 esac
 fi
-OCAMLOPT=$ac_cv_path_OCAMLOPT
-if test -n "$OCAMLOPT"; then
-  { echo "$as_me:$LINENO: result: $OCAMLOPT" >&5
-echo "${ECHO_T}$OCAMLOPT" >&6; }
+OCAMLFIND=$ac_cv_path_OCAMLFIND
+if test -n "$OCAMLFIND"; then
+  { echo "$as_me:$LINENO: result: $OCAMLFIND" >&5
+echo "${ECHO_T}$OCAMLFIND" >&6; }
 else
   { echo "$as_me:$LINENO: result: no" >&5
 echo "${ECHO_T}no" >&6; }
 fi
 
 
-  test -n "$OCAMLOPT" && break
-done
-
-for ac_prog in ocamldep
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_OCAMLDEP+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $OCAMLDEP in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_OCAMLDEP="$OCAMLDEP" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_OCAMLDEP="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  ;;
-esac
-fi
-OCAMLDEP=$ac_cv_path_OCAMLDEP
-if test -n "$OCAMLDEP"; then
-  { echo "$as_me:$LINENO: result: $OCAMLDEP" >&5
-echo "${ECHO_T}$OCAMLDEP" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-  test -n "$OCAMLDEP" && break
-done
-
-for ac_prog in ocamldoc
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ echo "$as_me:$LINENO: checking for $ac_word" >&5
-echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
-if test "${ac_cv_path_OCAMLDOC+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  case $OCAMLDOC in
-  [\\/]* | ?:[\\/]*)
-  ac_cv_path_OCAMLDOC="$OCAMLDOC" # Let the user override the test with a path.
-  ;;
-  *)
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for ac_exec_ext in '' $ac_executable_extensions; do
-  if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then
-    ac_cv_path_OCAMLDOC="$as_dir/$ac_word$ac_exec_ext"
-    echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-done
-IFS=$as_save_IFS
-
-  ;;
-esac
-fi
-OCAMLDOC=$ac_cv_path_OCAMLDOC
-if test -n "$OCAMLDOC"; then
-  { echo "$as_me:$LINENO: result: $OCAMLDOC" >&5
-echo "${ECHO_T}$OCAMLDOC" >&6; }
-else
-  { echo "$as_me:$LINENO: result: no" >&5
-echo "${ECHO_T}no" >&6; }
-fi
-
-
-  test -n "$OCAMLDOC" && break
+  test -n "$OCAMLFIND" && break
 done
 
 for ac_prog in gas as
@@ -8673,8 +8072,74 @@
 
   fi
 fi
-{ echo "$as_me:$LINENO: result: $NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $COVERED_SWITCH_DEFAULT $NO_UNINITIALIZED $NO_MAYBE_UNINITIALIZED" >&5
-echo "${ECHO_T}$NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $COVERED_SWITCH_DEFAULT $NO_UNINITIALIZED $NO_MAYBE_UNINITIALIZED" >&6; }
+
+no_comment=
+llvm_cv_old_cxxflags="$CXXFLAGS"
+CXXFLAGS="$CXXFLAGS -Wcomment -Werror"
+cat >conftest.$ac_ext <<_ACEOF
+
+  /* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+// Comment \o\
+// Another comment
+int main() { return 0; }
+
+
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest.$ac_objext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+
+  no_comment=-Wno-comment
+
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+NO_COMMENT=$no_comment
+
+CXXFLAGS="$llvm_cv_old_cxxflags"
+
+{ echo "$as_me:$LINENO: result: $NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $COVERED_SWITCH_DEFAULT $NO_UNINITIALIZED $NO_MAYBE_UNINITIALIZED $NO_COMMENT" >&5
+echo "${ECHO_T}$NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $COVERED_SWITCH_DEFAULT $NO_UNINITIALIZED $NO_MAYBE_UNINITIALIZED $NO_COMMENT" >&6; }
 
 
 # Check whether --with-python was given.
@@ -10227,120 +9692,6 @@
 fi
 
 
-# Check whether --with-udis86 was given.
-if test "${with_udis86+set}" = set; then
-  withval=$with_udis86;
-      USE_UDIS86=1
-
-      case "$withval" in
-        /usr/lib|yes) ;;
-        *) LDFLAGS="$LDFLAGS -L${withval}" ;;
-      esac
-
-{ echo "$as_me:$LINENO: checking for ud_init in -ludis86" >&5
-echo $ECHO_N "checking for ud_init in -ludis86... $ECHO_C" >&6; }
-if test "${ac_cv_lib_udis86_ud_init+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  ac_check_lib_save_LIBS=$LIBS
-LIBS="-ludis86  $LIBS"
-cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char ud_init ();
-int
-main ()
-{
-return ud_init ();
-  ;
-  return 0;
-}
-_ACEOF
-rm -f conftest.$ac_objext conftest$ac_exeext
-if { (ac_try="$ac_link"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_link") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } &&
-	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
-  { (case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); }; } &&
-	 { ac_try='test -s conftest$ac_exeext'
-  { (case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); }; }; then
-  ac_cv_lib_udis86_ud_init=yes
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	ac_cv_lib_udis86_ud_init=no
-fi
-
-rm -f core conftest.err conftest.$ac_objext \
-      conftest$ac_exeext conftest.$ac_ext
-LIBS=$ac_check_lib_save_LIBS
-fi
-{ echo "$as_me:$LINENO: result: $ac_cv_lib_udis86_ud_init" >&5
-echo "${ECHO_T}$ac_cv_lib_udis86_ud_init" >&6; }
-if test $ac_cv_lib_udis86_ud_init = yes; then
-  cat >>confdefs.h <<_ACEOF
-#define HAVE_LIBUDIS86 1
-_ACEOF
-
-  LIBS="-ludis86 $LIBS"
-
-else
-
-        echo "Error! You need to have libudis86 around."
-        exit -1
-
-fi
-
-
-else
-  USE_UDIS86=0
-
-fi
-
-
-cat >>confdefs.h <<_ACEOF
-#define USE_UDIS86 $USE_UDIS86
-_ACEOF
-
-
-
 # Check whether --with-oprofile was given.
 if test "${with_oprofile+set}" = set; then
   withval=$with_oprofile;
@@ -11535,285 +10886,24 @@
 ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
 
-ac_ext=cpp
-ac_cpp='$CXXCPP $CPPFLAGS'
-ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
-{ echo "$as_me:$LINENO: checking how to run the C++ preprocessor" >&5
-echo $ECHO_N "checking how to run the C++ preprocessor... $ECHO_C" >&6; }
-if test -z "$CXXCPP"; then
-  if test "${ac_cv_prog_CXXCPP+set}" = set; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-      # Double quotes because CXXCPP needs to be expanded
-    for CXXCPP in "$CXX -E" "/lib/cpp"
-    do
-      ac_preproc_ok=false
-for ac_cxx_preproc_warn_flag in '' yes
-do
-  # Use a header file that comes with gcc, so configuring glibc
-  # with a fresh cross-compiler works.
-  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
-  # <limits.h> exists even on freestanding compilers.
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp. "Syntax error" is here to catch this case.
-  cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
-		     Syntax error
-_ACEOF
-if { (ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } >/dev/null; then
-  if test -s conftest.err; then
-    ac_cpp_err=$ac_cxx_preproc_warn_flag
-    ac_cpp_err=$ac_cpp_err$ac_cxx_werror_flag
-  else
-    ac_cpp_err=
-  fi
-else
-  ac_cpp_err=yes
-fi
-if test -z "$ac_cpp_err"; then
-  :
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-  # Broken: fails on valid input.
-continue
-fi
-
-rm -f conftest.err conftest.$ac_ext
-
-  # OK, works on sane cases.  Now check whether nonexistent headers
-  # can be detected and how.
-  cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-#include <ac_nonexistent.h>
-_ACEOF
-if { (ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } >/dev/null; then
-  if test -s conftest.err; then
-    ac_cpp_err=$ac_cxx_preproc_warn_flag
-    ac_cpp_err=$ac_cpp_err$ac_cxx_werror_flag
-  else
-    ac_cpp_err=
-  fi
-else
-  ac_cpp_err=yes
-fi
-if test -z "$ac_cpp_err"; then
-  # Broken: success on invalid input.
-continue
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-  # Passes both tests.
-ac_preproc_ok=:
-break
-fi
-
-rm -f conftest.err conftest.$ac_ext
-
-done
-# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then
-  break
-fi
-
-    done
-    ac_cv_prog_CXXCPP=$CXXCPP
-
-fi
-  CXXCPP=$ac_cv_prog_CXXCPP
-else
-  ac_cv_prog_CXXCPP=$CXXCPP
-fi
-{ echo "$as_me:$LINENO: result: $CXXCPP" >&5
-echo "${ECHO_T}$CXXCPP" >&6; }
-ac_preproc_ok=false
-for ac_cxx_preproc_warn_flag in '' yes
-do
-  # Use a header file that comes with gcc, so configuring glibc
-  # with a fresh cross-compiler works.
-  # Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
-  # <limits.h> exists even on freestanding compilers.
-  # On the NeXT, cc -E runs the code through the compiler's parser,
-  # not just through cpp. "Syntax error" is here to catch this case.
-  cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-#ifdef __STDC__
-# include <limits.h>
-#else
-# include <assert.h>
-#endif
-		     Syntax error
-_ACEOF
-if { (ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } >/dev/null; then
-  if test -s conftest.err; then
-    ac_cpp_err=$ac_cxx_preproc_warn_flag
-    ac_cpp_err=$ac_cpp_err$ac_cxx_werror_flag
-  else
-    ac_cpp_err=
-  fi
-else
-  ac_cpp_err=yes
-fi
-if test -z "$ac_cpp_err"; then
-  :
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-  # Broken: fails on valid input.
-continue
-fi
-
-rm -f conftest.err conftest.$ac_ext
-
-  # OK, works on sane cases.  Now check whether nonexistent headers
-  # can be detected and how.
-  cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-#include <ac_nonexistent.h>
-_ACEOF
-if { (ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } >/dev/null; then
-  if test -s conftest.err; then
-    ac_cpp_err=$ac_cxx_preproc_warn_flag
-    ac_cpp_err=$ac_cpp_err$ac_cxx_werror_flag
-  else
-    ac_cpp_err=
-  fi
-else
-  ac_cpp_err=yes
-fi
-if test -z "$ac_cpp_err"; then
-  # Broken: success on invalid input.
-continue
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-  # Passes both tests.
-ac_preproc_ok=:
-break
-fi
-
-rm -f conftest.err conftest.$ac_ext
-
-done
-# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then
-  :
-else
-  { { echo "$as_me:$LINENO: error: C++ preprocessor \"$CXXCPP\" fails sanity check
-See \`config.log' for more details." >&5
-echo "$as_me: error: C++ preprocessor \"$CXXCPP\" fails sanity check
-See \`config.log' for more details." >&2;}
-   { (exit 1); exit 1; }; }
-fi
-
-ac_ext=cpp
-ac_cpp='$CXXCPP $CPPFLAGS'
-ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
-ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
-ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
-
-
 
 for ac_header in cxxabi.h
 do
 as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh`
-if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
-  { echo "$as_me:$LINENO: checking for $ac_header" >&5
+{ echo "$as_me:$LINENO: checking for $ac_header" >&5
 echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
 if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
   echo $ECHO_N "(cached) $ECHO_C" >&6
-fi
-ac_res=`eval echo '${'$as_ac_Header'}'`
-	       { echo "$as_me:$LINENO: result: $ac_res" >&5
-echo "${ECHO_T}$ac_res" >&6; }
 else
-  # Is the header compilable?
-{ echo "$as_me:$LINENO: checking $ac_header usability" >&5
-echo $ECHO_N "checking $ac_header usability... $ECHO_C" >&6; }
-cat >conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
 /* confdefs.h.  */
 _ACEOF
 cat confdefs.h >>conftest.$ac_ext
 cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
-$ac_includes_default
+#include <stddef.h>
+
+
 #include <$ac_header>
 _ACEOF
 rm -f conftest.$ac_objext
@@ -11850,106 +10940,19 @@
   ac_status=$?
   echo "$as_me:$LINENO: \$? = $ac_status" >&5
   (exit $ac_status); }; }; then
-  ac_header_compiler=yes
+  eval "$as_ac_Header=yes"
 else
   echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-	ac_header_compiler=no
+	eval "$as_ac_Header=no"
 fi
 
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
-echo "${ECHO_T}$ac_header_compiler" >&6; }
-
-# Is the header present?
-{ echo "$as_me:$LINENO: checking $ac_header presence" >&5
-echo $ECHO_N "checking $ac_header presence... $ECHO_C" >&6; }
-cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-#include <$ac_header>
-_ACEOF
-if { (ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } >/dev/null; then
-  if test -s conftest.err; then
-    ac_cpp_err=$ac_cxx_preproc_warn_flag
-    ac_cpp_err=$ac_cpp_err$ac_cxx_werror_flag
-  else
-    ac_cpp_err=
-  fi
-else
-  ac_cpp_err=yes
-fi
-if test -z "$ac_cpp_err"; then
-  ac_header_preproc=yes
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-  ac_header_preproc=no
-fi
-
-rm -f conftest.err conftest.$ac_ext
-{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
-echo "${ECHO_T}$ac_header_preproc" >&6; }
-
-# So?  What about this header?
-case $ac_header_compiler:$ac_header_preproc:$ac_cxx_preproc_warn_flag in
-  yes:no: )
-    { echo "$as_me:$LINENO: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&5
-echo "$as_me: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the compiler's result" >&5
-echo "$as_me: WARNING: $ac_header: proceeding with the compiler's result" >&2;}
-    ac_header_preproc=yes
-    ;;
-  no:yes:* )
-    { echo "$as_me:$LINENO: WARNING: $ac_header: present but cannot be compiled" >&5
-echo "$as_me: WARNING: $ac_header: present but cannot be compiled" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header:     check for missing prerequisite headers?" >&5
-echo "$as_me: WARNING: $ac_header:     check for missing prerequisite headers?" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header: see the Autoconf documentation" >&5
-echo "$as_me: WARNING: $ac_header: see the Autoconf documentation" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header:     section \"Present But Cannot Be Compiled\"" >&5
-echo "$as_me: WARNING: $ac_header:     section \"Present But Cannot Be Compiled\"" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the preprocessor's result" >&5
-echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
-echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
-    ( cat <<\_ASBOX
-## ------------------------------------ ##
-## Report this to http://llvm.org/bugs/ ##
-## ------------------------------------ ##
-_ASBOX
-     ) | sed "s/^/$as_me: WARNING:     /" >&2
-    ;;
-esac
-{ echo "$as_me:$LINENO: checking for $ac_header" >&5
-echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
-if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  eval "$as_ac_Header=\$ac_header_preproc"
 fi
 ac_res=`eval echo '${'$as_ac_Header'}'`
 	       { echo "$as_me:$LINENO: result: $ac_res" >&5
 echo "${ECHO_T}$ac_res" >&6; }
-
-fi
 if test `eval echo '${'$as_ac_Header'}'` = yes; then
   cat >>confdefs.h <<_ACEOF
 #define `echo "HAVE_$ac_header" | $as_tr_cpp` 1
@@ -11970,6 +10973,7 @@
 
 
 
+
 for ac_header in dlfcn.h execinfo.h fcntl.h inttypes.h link.h
 do
 as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh`
@@ -19233,39 +18237,55 @@
 
 if test "$BINDINGS_TO_BUILD" = auto ; then
   BINDINGS_TO_BUILD=""
-  if test "x$OCAMLC" != x -a "x$OCAMLDEP" != x ; then
+  if test "x$OCAMLFIND" != x ; then
     BINDINGS_TO_BUILD="ocaml $BINDINGS_TO_BUILD"
   fi
+  if test "x$GO" != x ; then
+    if $GO run ${srcdir}/bindings/go/conftest.go ; then
+      BINDINGS_TO_BUILD="go $BINDINGS_TO_BUILD"
+    fi
+  fi
 fi
 BINDINGS_TO_BUILD=$BINDINGS_TO_BUILD
 
 
-ALL_BINDINGS=ocaml
-
-
 binding_prereqs_failed=0
 for a_binding in $BINDINGS_TO_BUILD ; do
   case "$a_binding" in
   ocaml)
-    if test "x$OCAMLC" = x ; then
-      { echo "$as_me:$LINENO: WARNING: --enable-bindings=ocaml specified, but ocamlc not found. Try configure OCAMLC=/path/to/ocamlc" >&5
-echo "$as_me: WARNING: --enable-bindings=ocaml specified, but ocamlc not found. Try configure OCAMLC=/path/to/ocamlc" >&2;}
+    if test "x$OCAMLFIND" = x ; then
+      { echo "$as_me:$LINENO: WARNING: --enable-bindings=ocaml specified, but ocamlfind not found. Try configure OCAMLFIND=/path/to/ocamlfind" >&5
+echo "$as_me: WARNING: --enable-bindings=ocaml specified, but ocamlfind not found. Try configure OCAMLFIND=/path/to/ocamlfind" >&2;}
       binding_prereqs_failed=1
     fi
-    if test "x$OCAMLDEP" = x ; then
-      { echo "$as_me:$LINENO: WARNING: --enable-bindings=ocaml specified, but ocamldep not found. Try configure OCAMLDEP=/path/to/ocamldep" >&5
-echo "$as_me: WARNING: --enable-bindings=ocaml specified, but ocamldep not found. Try configure OCAMLDEP=/path/to/ocamldep" >&2;}
+
+    if $OCAMLFIND opt -version >/dev/null 2>/dev/null ; then
+      HAVE_OCAMLOPT=1
+    else
+      HAVE_OCAMLOPT=0
+    fi
+
+
+    if ! $OCAMLFIND query ctypes >/dev/null 2>/dev/null; then
+      { echo "$as_me:$LINENO: WARNING: --enable-bindings=ocaml specified, but ctypes is not installed" >&5
+echo "$as_me: WARNING: --enable-bindings=ocaml specified, but ctypes is not installed" >&2;}
       binding_prereqs_failed=1
     fi
-    if test "x$OCAMLOPT" = x ; then
-      { echo "$as_me:$LINENO: WARNING: --enable-bindings=ocaml specified, but ocamlopt not found. Try configure OCAMLOPT=/path/to/ocamlopt" >&5
-echo "$as_me: WARNING: --enable-bindings=ocaml specified, but ocamlopt not found. Try configure OCAMLOPT=/path/to/ocamlopt" >&2;}
+
+    if $OCAMLFIND query oUnit >/dev/null 2>/dev/null; then
+      HAVE_OCAML_OUNIT=1
+    else
+      HAVE_OCAML_OUNIT=0
+      { echo "$as_me:$LINENO: WARNING: --enable-bindings=ocaml specified, but OUnit 2 is not installed. Tests will not run" >&5
+echo "$as_me: WARNING: --enable-bindings=ocaml specified, but OUnit 2 is not installed. Tests will not run" >&2;}
           fi
+
+
     if test "x$with_ocaml_libdir" != xauto ; then
       OCAML_LIBDIR=$with_ocaml_libdir
 
     else
-      ocaml_stdlib="`"$OCAMLC" -where`"
+      ocaml_stdlib="`"$OCAMLFIND" ocamlc -where`"
       if test "$LLVM_PREFIX" '<' "$ocaml_stdlib" -a "$ocaml_stdlib" '<' "$LLVM_PREFIX~"
       then
         # ocaml stdlib is beneath our prefix; use stdlib
@@ -19278,6 +18298,21 @@
       fi
     fi
     ;;
+  go)
+    if test "x$GO" = x ; then
+      { echo "$as_me:$LINENO: WARNING: --enable-bindings=go specified, but go not found. Try configure GO=/path/to/go" >&5
+echo "$as_me: WARNING: --enable-bindings=go specified, but go not found. Try configure GO=/path/to/go" >&2;}
+      binding_prereqs_failed=1
+    else
+      if $GO run ${srcdir}/bindings/go/conftest.go ; then
+        :
+      else
+        { echo "$as_me:$LINENO: WARNING: --enable-bindings=go specified, but need at least Go 1.2. Try configure GO=/path/to/go" >&5
+echo "$as_me: WARNING: --enable-bindings=go specified, but need at least Go 1.2. Try configure GO=/path/to/go" >&2;}
+        binding_prereqs_failed=1
+      fi
+    fi
+    ;;
   esac
 done
 if test "$binding_prereqs_failed" = 1 ; then
@@ -19860,7 +18895,7 @@
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by LLVM $as_me 3.5.0svn, which was
+This file was extended by LLVM $as_me 3.6.0svn, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -19913,7 +18948,7 @@
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-LLVM config.status 3.5.0svn
+LLVM config.status 3.6.0svn
 configured by $0, generated by GNU Autoconf 2.60,
   with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
@@ -20203,7 +19238,6 @@
 ENABLE_CLANG_ARCMT!$ENABLE_CLANG_ARCMT$ac_delim
 CLANG_PLUGIN_SUPPORT!$CLANG_PLUGIN_SUPPORT$ac_delim
 ENABLE_CLANG_STATIC_ANALYZER!$ENABLE_CLANG_STATIC_ANALYZER$ac_delim
-ENABLE_CLANG_REWRITER!$ENABLE_CLANG_REWRITER$ac_delim
 ENABLE_OPTIMIZED!$ENABLE_OPTIMIZED$ac_delim
 ENABLE_PROFILING!$ENABLE_PROFILING$ac_delim
 DISABLE_ASSERTIONS!$DISABLE_ASSERTIONS$ac_delim
@@ -20211,6 +19245,7 @@
 ENABLE_EXPENSIVE_CHECKS!$ENABLE_EXPENSIVE_CHECKS$ac_delim
 EXPENSIVE_CHECKS!$EXPENSIVE_CHECKS$ac_delim
 DEBUG_RUNTIME!$DEBUG_RUNTIME$ac_delim
+DEBUG_SYMBOLS!$DEBUG_SYMBOLS$ac_delim
 _ACEOF
 
   if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then
@@ -20252,7 +19287,6 @@
 ac_delim='%!_!# '
 for ac_last_try in false false false false false :; do
   cat >conf$$subs.sed <<_ACEOF
-DEBUG_SYMBOLS!$DEBUG_SYMBOLS$ac_delim
 KEEP_SYMBOLS!$KEEP_SYMBOLS$ac_delim
 JIT!$JIT$ac_delim
 TARGET_HAS_JIT!$TARGET_HAS_JIT$ac_delim
@@ -20295,15 +19329,7 @@
 SED!$SED$ac_delim
 TAR!$TAR$ac_delim
 BINPWD!$BINPWD$ac_delim
-GRAPHVIZ!$GRAPHVIZ$ac_delim
 DOT!$DOT$ac_delim
-FDP!$FDP$ac_delim
-NEATO!$NEATO$ac_delim
-TWOPI!$TWOPI$ac_delim
-CIRCO!$CIRCO$ac_delim
-GV!$GV$ac_delim
-DOTTY!$DOTTY$ac_delim
-XDOT!$XDOT$ac_delim
 INSTALL_PROGRAM!$INSTALL_PROGRAM$ac_delim
 INSTALL_SCRIPT!$INSTALL_SCRIPT$ac_delim
 INSTALL_DATA!$INSTALL_DATA$ac_delim
@@ -20314,10 +19340,8 @@
 GZIPBIN!$GZIPBIN$ac_delim
 PDFROFF!$PDFROFF$ac_delim
 ZIP!$ZIP$ac_delim
-OCAMLC!$OCAMLC$ac_delim
-OCAMLOPT!$OCAMLOPT$ac_delim
-OCAMLDEP!$OCAMLDEP$ac_delim
-OCAMLDOC!$OCAMLDOC$ac_delim
+GO!$GO$ac_delim
+OCAMLFIND!$OCAMLFIND$ac_delim
 GAS!$GAS$ac_delim
 HAVE_LINK_VERSION_SCRIPT!$HAVE_LINK_VERSION_SCRIPT$ac_delim
 EGREP!$EGREP$ac_delim
@@ -20326,16 +19350,15 @@
 COVERED_SWITCH_DEFAULT!$COVERED_SWITCH_DEFAULT$ac_delim
 NO_MAYBE_UNINITIALIZED!$NO_MAYBE_UNINITIALIZED$ac_delim
 NO_UNINITIALIZED!$NO_UNINITIALIZED$ac_delim
+NO_COMMENT!$NO_COMMENT$ac_delim
 PYTHON!$PYTHON$ac_delim
 HAVE_DLOPEN!$HAVE_DLOPEN$ac_delim
 HAVE_TERMINFO!$HAVE_TERMINFO$ac_delim
-USE_UDIS86!$USE_UDIS86$ac_delim
 USE_OPROFILE!$USE_OPROFILE$ac_delim
 USE_INTEL_JITEVENTS!$USE_INTEL_JITEVENTS$ac_delim
 XML2CONFIG!$XML2CONFIG$ac_delim
 LIBXML2_LIBS!$LIBXML2_LIBS$ac_delim
 LIBXML2_INC!$LIBXML2_INC$ac_delim
-CXXCPP!$CXXCPP$ac_delim
 HAVE_PTHREAD!$HAVE_PTHREAD$ac_delim
 HAVE_LIBZ!$HAVE_LIBZ$ac_delim
 HUGE_VAL_SANITY!$HUGE_VAL_SANITY$ac_delim
@@ -20349,9 +19372,20 @@
 LLVM_INCLUDEDIR!$LLVM_INCLUDEDIR$ac_delim
 LLVM_INFODIR!$LLVM_INFODIR$ac_delim
 LLVM_MANDIR!$LLVM_MANDIR$ac_delim
+LLVM_CONFIGTIME!$LLVM_CONFIGTIME$ac_delim
+BINDINGS_TO_BUILD!$BINDINGS_TO_BUILD$ac_delim
+HAVE_OCAMLOPT!$HAVE_OCAMLOPT$ac_delim
+HAVE_OCAML_OUNIT!$HAVE_OCAML_OUNIT$ac_delim
+OCAML_LIBDIR!$OCAML_LIBDIR$ac_delim
+ENABLE_VISIBILITY_INLINES_HIDDEN!$ENABLE_VISIBILITY_INLINES_HIDDEN$ac_delim
+RPATH!$RPATH$ac_delim
+RDYNAMIC!$RDYNAMIC$ac_delim
+program_prefix!$program_prefix$ac_delim
+LIBOBJS!$LIBOBJS$ac_delim
+LTLIBOBJS!$LTLIBOBJS$ac_delim
 _ACEOF
 
-  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 96; then
     break
   elif $ac_last_try; then
     { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
@@ -20387,59 +19421,6 @@
 _ACEOF
 
 
-ac_delim='%!_!# '
-for ac_last_try in false false false false false :; do
-  cat >conf$$subs.sed <<_ACEOF
-LLVM_CONFIGTIME!$LLVM_CONFIGTIME$ac_delim
-BINDINGS_TO_BUILD!$BINDINGS_TO_BUILD$ac_delim
-ALL_BINDINGS!$ALL_BINDINGS$ac_delim
-OCAML_LIBDIR!$OCAML_LIBDIR$ac_delim
-ENABLE_VISIBILITY_INLINES_HIDDEN!$ENABLE_VISIBILITY_INLINES_HIDDEN$ac_delim
-RPATH!$RPATH$ac_delim
-RDYNAMIC!$RDYNAMIC$ac_delim
-program_prefix!$program_prefix$ac_delim
-LIBOBJS!$LIBOBJS$ac_delim
-LTLIBOBJS!$LTLIBOBJS$ac_delim
-_ACEOF
-
-  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 10; then
-    break
-  elif $ac_last_try; then
-    { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
-echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
-   { (exit 1); exit 1; }; }
-  else
-    ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
-  fi
-done
-
-ac_eof=`sed -n '/^CEOF[0-9]*$/s/CEOF/0/p' conf$$subs.sed`
-if test -n "$ac_eof"; then
-  ac_eof=`echo "$ac_eof" | sort -nru | sed 1q`
-  ac_eof=`expr $ac_eof + 1`
-fi
-
-cat >>$CONFIG_STATUS <<_ACEOF
-cat >"\$tmp/subs-3.sed" <<\CEOF$ac_eof
-/@[a-zA-Z_][a-zA-Z_0-9]*@/!b end
-_ACEOF
-sed '
-s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g
-s/^/s,@/; s/!/@,|#_!!_#|/
-:n
-t n
-s/'"$ac_delim"'$/,g/; t
-s/$/\\/; p
-N; s/^.*\n//; s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g; b n
-' >>$CONFIG_STATUS <conf$$subs.sed
-rm -f conf$$subs.sed
-cat >>$CONFIG_STATUS <<_ACEOF
-:end
-s/|#_!!_#|//g
-CEOF$ac_eof
-_ACEOF
-
-
 # VPATH may cause trouble with some makes, so we remove $(srcdir),
 # ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and
 # trailing colons and then remove the whole line if VPATH becomes empty
@@ -20683,7 +19664,7 @@
 s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
 s&@INSTALL@&$ac_INSTALL&;t t
 $ac_datarootdir_hack
-" $ac_file_inputs | sed -f "$tmp/subs-1.sed" | sed -f "$tmp/subs-2.sed" | sed -f "$tmp/subs-3.sed" >$tmp/out
+" $ac_file_inputs | sed -f "$tmp/subs-1.sed" | sed -f "$tmp/subs-2.sed" | sed 's/|#_!!_#|//g' >$tmp/out
 
 test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
   { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } &&

diff --git a/device/include/llvm/Config/config.h b/device/include/llvm/Config/config.h
index f42e285..3aa64fa 100644
--- a/device/include/llvm/Config/config.h
+++ b/device/include/llvm/Config/config.h

@@ -646,13 +646,13 @@
 #define PACKAGE_NAME "LLVM"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "LLVM 3.4"
+#define PACKAGE_STRING "LLVM 3.6svn"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "llvm"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "3.4"
+#define PACKAGE_VERSION "3.6"
 
 /* Define as the return type of signal handlers (`int' or `void'). */
 #define RETSIGTYPE void

diff --git a/docs/Atomics.rst b/docs/Atomics.rst
index 5f17c61..6c8303b 100644
--- a/docs/Atomics.rst
+++ b/docs/Atomics.rst

@@ -18,16 +18,16 @@
 The atomic instructions are designed specifically to provide readable IR and
 optimized code generation for the following:
 
-* The new C++0x ``<atomic>`` header.  (`C++0x draft available here
-  <http://www.open-std.org/jtc1/sc22/wg21/>`_.) (`C1x draft available here
+* The new C++11 ``<atomic>`` header.  (`C++11 draft available here
+  <http://www.open-std.org/jtc1/sc22/wg21/>`_.) (`C11 draft available here
   <http://www.open-std.org/jtc1/sc22/wg14/>`_.)
 
 * Proper semantics for Java-style memory, for both ``volatile`` and regular
   shared variables. (`Java Specification
-  <http://java.sun.com/docs/books/jls/third_edition/html/memory.html>`_)
+  <http://docs.oracle.com/javase/specs/jls/se8/html/jls-17.html>`_)
 
 * gcc-compatible ``__sync_*`` builtins. (`Description
-  <http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html>`_)
+  <https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html>`_)
 
 * Other scenarios with atomic semantics, including ``static`` variables with
   non-trivial constructors in C++.
@@ -115,7 +115,10 @@
 A ``fence`` provides Acquire and/or Release ordering which is not part of
 another operation; it is normally used along with Monotonic memory operations.
 A Monotonic load followed by an Acquire fence is roughly equivalent to an
-Acquire load.
+Acquire load, and a Monotonic store following a Release fence is roughly
+equivalent to a Release store. SequentiallyConsistent fences behave as both
+an Acquire and a Release fence, and offer some additional complicated
+guarantees; see the C++11 standard for details.
 
 Frontends generating atomic instructions generally need to be aware of the
 target to some degree; atomic instructions are guaranteed to be lock-free, and
@@ -177,10 +180,10 @@
 
 Unordered is the lowest level of atomicity. It essentially guarantees that races
 produce somewhat sane results instead of having undefined behavior.  It also
-guarantees the operation to be lock-free, so it do not depend on the data being
-part of a special atomic structure or depend on a separate per-process global
-lock.  Note that code generation will fail for unsupported atomic operations; if
-you need such an operation, use explicit locking.
+guarantees the operation to be lock-free, so it does not depend on the data
+being part of a special atomic structure or depend on a separate per-process
+global lock.  Note that code generation will fail for unsupported atomic
+operations; if you need such an operation, use explicit locking.
 
 Relevant standard
   This is intended to match the Java memory model for shared variables.
@@ -221,7 +224,7 @@
 address, a consistent ordering exists.
 
 Relevant standard
-  This corresponds to the C++0x/C1x ``memory_order_relaxed``; see those
+  This corresponds to the C++11/C11 ``memory_order_relaxed``; see those
   standards for the exact definition.
 
 Notes for frontends
@@ -251,8 +254,8 @@
 other memory with normal loads and stores.
 
 Relevant standard
-  This corresponds to the C++0x/C1x ``memory_order_acquire``. It should also be
-  used for C++0x/C1x ``memory_order_consume``.
+  This corresponds to the C++11/C11 ``memory_order_acquire``. It should also be
+  used for C++11/C11 ``memory_order_consume``.
 
 Notes for frontends
   If you are writing a frontend which uses this directly, use with caution.
@@ -281,7 +284,7 @@
 release a lock.
 
 Relevant standard
-  This corresponds to the C++0x/C1x ``memory_order_release``.
+  This corresponds to the C++11/C11 ``memory_order_release``.
 
 Notes for frontends
   If you are writing a frontend which uses this directly, use with caution.
@@ -307,7 +310,7 @@
 barrier (for fences and operations which both read and write memory).
 
 Relevant standard
-  This corresponds to the C++0x/C1x ``memory_order_acq_rel``.
+  This corresponds to the C++11/C11 ``memory_order_acq_rel``.
 
 Notes for frontends
   If you are writing a frontend which uses this directly, use with caution.
@@ -330,7 +333,7 @@
 ordering exists between all SequentiallyConsistent operations.
 
 Relevant standard
-  This corresponds to the C++0x/C1x ``memory_order_seq_cst``, Java volatile, and
+  This corresponds to the C++11/C11 ``memory_order_seq_cst``, Java volatile, and
   the gcc-compatible ``__sync_*`` builtins which do not specify otherwise.
 
 Notes for frontends
@@ -368,6 +371,11 @@
   that they return true for any operation which is volatile or at least
   Monotonic.
 
+* ``isAtLeastAcquire()``/``isAtLeastRelease()``: These are predicates on
+  orderings. They can be useful for passes that are aware of atomics, for
+  example to do DSE across a single atomic access, but not across a
+  release-acquire pair (see MemoryDependencyAnalysis for an example of this).
+
 * Alias analysis: Note that AA will return ModRef for anything Acquire or
   Release, and for the address accessed by any Monotonic operation.
 
@@ -389,7 +397,9 @@
 
 * DSE: Unordered stores can be DSE'ed like normal stores.  Monotonic stores can
   be DSE'ed in some cases, but it's tricky to reason about, and not especially
-  important.
+  important. It is possible in some cases for DSE to operate across a stronger
+  atomic operation, but it is fairly tricky. DSE delegates this reasoning to
+  MemoryDependencyAnalysis (which is also used by other passes like GVN).
 
 * Folding a load: Any atomic load from a constant global can be constant-folded,
   because it cannot be observed.  Similar reasoning allows scalarrepl with
@@ -400,7 +410,8 @@
 
 Atomic operations are represented in the SelectionDAG with ``ATOMIC_*`` opcodes.
 On architectures which use barrier instructions for all atomic ordering (like
-ARM), appropriate fences are split out as the DAG is built.
+ARM), appropriate fences can be emitted by the AtomicExpand Codegen pass if
+``setInsertFencesForAtomic()`` was used.
 
 The MachineMemOperand for all atomic operations is currently marked as volatile;
 this is not correct in the IR sense of volatile, but CodeGen handles anything
@@ -415,11 +426,6 @@
 generator is not very helpful here at the moment, but hopefully that will
 change.)
 
-The implementation of atomics on LL/SC architectures (like ARM) is currently a
-bit of a mess; there is a lot of copy-pasted code across targets, and the
-representation is relatively unsuited to optimization (it would be nice to be
-able to optimize loops involving cmpxchg etc.).
-
 On x86, all atomic loads generate a ``MOV``. SequentiallyConsistent stores
 generate an ``XCHG``, other stores generate a ``MOV``. SequentiallyConsistent
 fences generate an ``MFENCE``, other fences do not cause any code to be
@@ -435,3 +441,19 @@
 ``atomicrmw`` can be represented using a loop with LL/SC-style instructions
 which take some sort of exclusive lock on a cache line (``LDREX`` and ``STREX``
 on ARM, etc.).
+
+It is often easiest for backends to use AtomicExpandPass to lower some of the
+atomic constructs. Here are some lowerings it can do:
+
+* cmpxchg -> loop with load-linked/store-conditional
+  by overriding ``hasLoadLinkedStoreConditional()``, ``emitLoadLinked()``,
+  ``emitStoreConditional()``
+* large loads/stores -> ll-sc/cmpxchg
+  by overriding ``shouldExpandAtomicStoreInIR()``/``shouldExpandAtomicLoadInIR()``
+* strong atomic accesses -> monotonic accesses + fences
+  by using ``setInsertFencesForAtomic()`` and overriding ``emitLeadingFence()``
+  and ``emitTrailingFence()``
+* atomic rmw -> loop with cmpxchg or load-linked/store-conditional
+  by overriding ``expandAtomicRMWInIR()``
+
+For an example of all of these, look at the ARM backend.

diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst
index fce1e37..34485b5 100644
--- a/docs/BitCodeFormat.rst
+++ b/docs/BitCodeFormat.rst

@@ -28,8 +28,9 @@
 provides a mechanism for the file to self-describe "abbreviations", which are
 effectively size optimizations for the content.
 
-LLVM IR files may be optionally embedded into a `wrapper`_ structure that makes
-it easy to embed extra data along with LLVM IR files.
+LLVM IR files may be optionally embedded into a `wrapper`_ structure, or in a
+`native object file`_. Both of these mechanisms make it easy to embed extra
+data along with LLVM IR files.
 
 This document first describes the LLVM bitstream format, describes the wrapper
 format, then describes the record structure used by LLVM IR files.
@@ -460,6 +461,19 @@
 in bytes of the stream. CPUType is a target-specific value that can be used to
 encode the CPU of the target.
 
+.. _native object file:
+
+Native Object File Wrapper Format
+=================================
+
+Bitcode files for LLVM IR may also be wrapped in a native object file
+(i.e. ELF, COFF, Mach-O).  The bitcode must be stored in a section of the
+object file named ``.llvmbc``.  This wrapper format is useful for accommodating
+LTO in compilation pipelines where intermediate objects must be native object
+files which contain metadata in other sections.
+
+Not all tools support this format.
+
 .. _encoding of LLVM IR:
 
 LLVM IR Encoding
@@ -714,7 +728,7 @@
 * *unnamed_addr*: If present and non-zero, indicates that the variable has
   ``unnamed_addr``
 
-.. _dllstorageclass:
+.. _bcdllstorageclass:
 
 * *dllstorageclass*: If present, an encoding of the DLL storage class of this variable:
 
@@ -773,7 +787,8 @@
 * *prefix*: If non-zero, the value index of the prefix data for this function,
   plus 1.
 
-* *dllstorageclass*: An encoding of the `dllstorageclass`_ of this function
+* *dllstorageclass*: An encoding of the
+  :ref:`dllstorageclass<bcdllstorageclass>` of this function
 
 MODULE_CODE_ALIAS Record
 ^^^^^^^^^^^^^^^^^^^^^^^^
@@ -791,7 +806,8 @@
 
 * *visibility*: If present, an encoding of the `visibility`_ of the alias
 
-* *dllstorageclass*: If present, an encoding of the `dllstorageclass`_ of the alias
+* *dllstorageclass*: If present, an encoding of the
+  :ref:`dllstorageclass<bcdllstorageclass>` of the alias
 
 MODULE_CODE_PURGEVALS Record
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/docs/CMake.rst b/docs/CMake.rst
index bfc9cb9..653fa16 100644
--- a/docs/CMake.rst
+++ b/docs/CMake.rst

@@ -218,10 +218,18 @@
   Enables code assertions. Defaults to OFF if and only if ``CMAKE_BUILD_TYPE``
   is *Release*.
 
+**LLVM_ENABLE_EH**:BOOL
+  Build LLVM with exception handling support. This is necessary if you wish to
+  link against LLVM libraries and make use of C++ exceptions in your own code
+  that need to propagate through LLVM code. Defaults to OFF.
+
 **LLVM_ENABLE_PIC**:BOOL
   Add the ``-fPIC`` flag for the compiler command-line, if the compiler supports
   this flag. Some systems, like Windows, do not need this flag. Defaults to ON.
 
+**LLVM_ENABLE_RTTI**:BOOL
+  Build LLVM with run time type information. Defaults to OFF.
+
 **LLVM_ENABLE_WARNINGS**:BOOL
   Enable all compiler warnings. Defaults to ON.
 
@@ -280,8 +288,14 @@
 
 **LLVM_USE_SANITIZER**:STRING
   Define the sanitizer used to build LLVM binaries and tests. Possible values
-  are ``Address``, ``Memory`` and ``MemoryWithOrigins``. Defaults to empty
-  string.
+  are ``Address``, ``Memory``, ``MemoryWithOrigins`` and ``Undefined``.
+  Defaults to empty string.
+
+**LLVM_PARALLEL_COMPILE_JOBS**:STRING
+  Define the maximum number of concurrent compilation jobs.
+
+**LLVM_PARALLEL_LINK_JOBS**:STRING
+  Define the maximum number of concurrent link jobs.
 
 **LLVM_BUILD_DOCS**:BOOL
   Enables all enabled documentation targets (i.e. Doxygen and Sphinx targets) to
@@ -355,6 +369,10 @@
   is enabled). Currently the only target added is ``docs-llvm-man``. Defaults
   to ON.
 
+**SPHINX_WARNINGS_AS_ERRORS**:BOOL
+  If enabled then sphinx documentation warnings will be treated as
+  errors. Defaults to ON.
+
 Executing the test suite
 ========================
 
@@ -384,66 +402,112 @@
 Embedding LLVM in your project
 ==============================
 
-The most difficult part of adding LLVM to the build of a project is to determine
-the set of LLVM libraries corresponding to the set of required LLVM
-features. What follows is an example of how to obtain this information:
+From LLVM 3.5 onwards both the CMake and autoconf/Makefile build systems export
+LLVM libraries as importable CMake targets. This means that clients of LLVM can
+now reliably use CMake to develop their own LLVM based projects against an
+installed version of LLVM regardless of how it was built.
+
+Here is a simple example of a CMakeLists.txt file that imports the LLVM libraries
+and uses them to build a simple application ``simple-tool``.
 
 .. code-block:: cmake
 
-  # A convenience variable:
-  set(LLVM_ROOT "" CACHE PATH "Root of LLVM install.")
+  cmake_minimum_required(VERSION 2.8.8)
+  project(SimpleProject)
 
-  # A bit of a sanity check:
-  if( NOT EXISTS ${LLVM_ROOT}/include/llvm )
-  message(FATAL_ERROR "LLVM_ROOT (${LLVM_ROOT}) is not a valid LLVM install")
-  endif()
+  find_package(LLVM REQUIRED CONFIG)
 
-  # We incorporate the CMake features provided by LLVM:
-  set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${LLVM_ROOT}/share/llvm/cmake")
-  include(LLVMConfig)
+  message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
+  message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
 
-  # Now set the header and library paths:
-  include_directories( ${LLVM_INCLUDE_DIRS} )
-  link_directories( ${LLVM_LIBRARY_DIRS} )
-  add_definitions( ${LLVM_DEFINITIONS} )
+  # Set your project compile flags.
+  # E.g. if using the C++ header files
+  # you will need to enable C++11 support
+  # for your compiler.
 
-  # Let's suppose we want to build a JIT compiler with support for
-  # binary code (no interpreter):
-  llvm_map_components_to_libraries(REQ_LLVM_LIBRARIES jit native)
+  include_directories(${LLVM_INCLUDE_DIRS})
+  add_definitions(${LLVM_DEFINITIONS})
 
-  # Finally, we link the LLVM libraries to our executable:
-  target_link_libraries(mycompiler ${REQ_LLVM_LIBRARIES})
+  # Now build our tools
+  add_executable(simple-tool tool.cpp)
 
-This assumes that LLVM_ROOT points to an install of LLVM. The procedure works
-too for uninstalled builds although we need to take care to add an
-`include_directories` for the location of the headers on the LLVM source
-directory (if we are building out-of-source.)
+  # Find the libraries that correspond to the LLVM components
+  # that we wish to use
+  llvm_map_components_to_libnames(llvm_libs support core irreader)
 
-Alternativaly, you can utilize CMake's ``find_package`` functionality. Here is
-an equivalent variant of snippet shown above:
+  # Link against LLVM libraries
+  target_link_libraries(simple-tool ${llvm_libs})
 
-.. code-block:: cmake
+The ``find_package(...)`` directive when used in CONFIG mode (as in the above
+example) will look for the ``LLVMConfig.cmake`` file in various locations (see
+cmake manual for details).  It creates a ``LLVM_DIR`` cache entry to save the
+directory where ``LLVMConfig.cmake`` is found or allows the user to specify the
+directory (e.g. by passing ``-DLLVM_DIR=/usr/share/llvm/cmake`` to
+the ``cmake`` command or by setting it directly in ``ccmake`` or ``cmake-gui``).
 
-  find_package(LLVM)
+This file is available in two different locations.
 
-  if( NOT LLVM_FOUND )
-    message(FATAL_ERROR "LLVM package can't be found. Set CMAKE_PREFIX_PATH variable to LLVM's installation prefix.")
-  endif()
+* ``<INSTALL_PREFIX>/share/llvm/cmake/LLVMConfig.cmake`` where
+  ``<INSTALL_PREFIX>`` is the install prefix of an installed version of LLVM.
+  On Linux typically this is ``/usr/share/llvm/cmake/LLVMConfig.cmake``.
 
-  include_directories( ${LLVM_INCLUDE_DIRS} )
-  link_directories( ${LLVM_LIBRARY_DIRS} )
+* ``<LLVM_BUILD_ROOT>/share/llvm/cmake/LLVMConfig.cmake`` where
+  ``<LLVM_BUILD_ROOT>`` is the root of the LLVM build tree. **Note: this is only
+  available when building LLVM with CMake.**
 
-  llvm_map_components_to_libraries(REQ_LLVM_LIBRARIES jit native)
+If LLVM is installed in your operating system's normal installation prefix (e.g.
+on Linux this is usually ``/usr/``) ``find_package(LLVM ...)`` will
+automatically find LLVM if it is installed correctly. If LLVM is not installed
+or you wish to build directly against the LLVM build tree you can use
+``LLVM_DIR`` as previously mentioned.
 
-  target_link_libraries(mycompiler ${REQ_LLVM_LIBRARIES})
+The ``LLVMConfig.cmake`` file sets various useful variables. Notable variables
+include
+
+``LLVM_CMAKE_DIR``
+  The path to the LLVM CMake directory (i.e. the directory containing
+  LLVMConfig.cmake).
+
+``LLVM_DEFINITIONS``
+  A list of preprocessor defines that should be used when building against LLVM.
+
+``LLVM_ENABLE_ASSERTIONS``
+  This is set to ON if LLVM was built with assertions, otherwise OFF.
+
+``LLVM_ENABLE_EH``
+  This is set to ON if LLVM was built with exception handling (EH) enabled,
+  otherwise OFF.
+
+``LLVM_ENABLE_RTTI``
+  This is set to ON if LLVM was built with run time type information (RTTI),
+  otherwise OFF.
+
+``LLVM_INCLUDE_DIRS``
+  A list of include paths to directories containing LLVM header files.
+
+``LLVM_PACKAGE_VERSION``
+  The LLVM version. This string can be used with CMake conditionals. E.g. ``if
+  (${LLVM_PACKAGE_VERSION} VERSION_LESS "3.5")``.
+
+``LLVM_TOOLS_BINARY_DIR``
+  The path to the directory containing the LLVM tools (e.g. ``llvm-as``).
+
+Notice that in the above example we link ``simple-tool`` against several LLVM
+libraries. The list of libraries is determined by using the
+``llvm_map_components_to_libnames()`` CMake function. For a list of available
+components look at the output of running ``llvm-config --components``.
+
+Note that for LLVM < 3.5 ``llvm_map_components_to_libraries()`` was
+used instead of ``llvm_map_components_to_libnames()``. This is now deprecated
+and will be removed in a future version of LLVM.
 
 .. _cmake-out-of-source-pass:
 
-Developing LLVM pass out of source
-----------------------------------
+Developing LLVM passes out of source
+------------------------------------
 
-It is possible to develop LLVM passes against installed LLVM.  An example of
-project layout provided below:
+It is possible to develop LLVM passes out of LLVM's source tree (i.e. against an
+installed or built LLVM). An example of a project layout is provided below.
 
 .. code-block:: none
 
@@ -460,14 +524,10 @@
 
 .. code-block:: cmake
 
-  find_package(LLVM)
-
-  # Define add_llvm_* macro's.
-  include(AddLLVM)
+  find_package(LLVM REQUIRED CONFIG)
 
   add_definitions(${LLVM_DEFINITIONS})
   include_directories(${LLVM_INCLUDE_DIRS})
-  link_directories(${LLVM_LIBRARY_DIRS})
 
   add_subdirectory(<pass name>)
 
@@ -475,6 +535,25 @@
 
 .. code-block:: cmake
 
+  add_library(LLVMPassname MODULE Pass.cpp)
+
+Note if you intend for this pass to be merged into the LLVM source tree at some
+point in the future it might make more sense to use LLVM's internal
+add_llvm_loadable_module function instead by...
+
+
+Adding the following to ``<project dir>/CMakeLists.txt`` (after
+``find_package(LLVM ...)``)
+
+.. code-block:: cmake
+
+  list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
+  include(AddLLVM)
+
+And then changing ``<project dir>/<pass name>/CMakeLists.txt`` to
+
+.. code-block:: cmake
+
   add_llvm_loadable_module(LLVMPassname
     Pass.cpp
     )

diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst
index 5736e43..b0a1059 100644
--- a/docs/CodeGenerator.rst
+++ b/docs/CodeGenerator.rst

@@ -290,10 +290,10 @@
 whether the instruction has certain target-independent properties (accesses
 memory, is commutable, etc), and holds any target-specific flags.
 
-The ``TargetFrameInfo`` class
------------------------------
+The ``TargetFrameLowering`` class
+---------------------------------
 
-The ``TargetFrameInfo`` class is used to provide information about the stack
+The ``TargetFrameLowering`` class is used to provide information about the stack
 frame layout of the target. It holds the direction of stack growth, the known
 stack alignment on entry to each function, and the offset to the local area.
 The offset to the local area is the offset from the stack pointer on function
@@ -769,7 +769,9 @@
 calls, returns, etc).  All nodes that have side effects should take a token
 chain as input and produce a new one as output.  By convention, token chain
 inputs are always operand #0, and chain results are always the last value
-produced by an operation.
+produced by an operation. However, after instruction selection, the
+machine nodes have their chain after the instruction's operands, and
+may be followed by glue nodes.
 
 A SelectionDAG has designated "Entry" and "Root" nodes.  The Entry node is
 always a marker node with an Opcode of ``ISD::EntryToken``.  The Root node is

diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index 3cfa1f6..0552c71 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst

@@ -162,6 +162,8 @@
 * ``std::initializer_list`` (and the constructors and functions that take it as
   an argument) are not always available, so you cannot (for example) initialize
   a ``std::vector`` with a braced initializer list.
+* ``std::equal()`` (and other algorithms) incorrectly assert in MSVC when given
+  ``nullptr`` as an iterator.
 
 Other than these areas you should assume the standard library is available and
 working as expected until some build bot tells you otherwise. If you're in an
@@ -174,6 +176,25 @@
 .. _the libstdc++ manual:
   http://gcc.gnu.org/onlinedocs/gcc-4.7.3/libstdc++/manual/manual/status.html#status.iso.2011
 
+Other Languages
+---------------
+
+Any code written in the Go programming language is not subject to the
+formatting rules below. Instead, we adopt the formatting rules enforced by
+the `gofmt`_ tool.
+
+Go code should strive to be idiomatic. Two good sets of guidelines for what
+this means are `Effective Go`_ and `Go Code Review Comments`_.
+
+.. _gofmt:
+  https://golang.org/cmd/gofmt/
+
+.. _Effective Go:
+  https://golang.org/doc/effective_go.html
+
+.. _Go Code Review Comments:
+  https://code.google.com/p/go-wiki/wiki/CodeReviewComments
+
 Mechanical Source Issues
 ========================
 

diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 5a60d60..af01503 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst

@@ -49,6 +49,17 @@
  The :option:`--strict-whitespace` argument disables this behavior. End-of-line
  sequences are canonicalized to UNIX-style ``\n`` in all modes.
 
+.. option:: --implicit-check-not check-pattern
+
+  Adds implicit negative checks for the specified patterns between positive
+  checks. The option allows writing stricter tests without stuffing them with
+  ``CHECK-NOT``\ s.
+
+  For example, "``--implicit-check-not warning:``" can be useful when testing
+  diagnostic messages from tools that don't have an option similar to ``clang
+  -verify``. With this option FileCheck will verify that input does not contain
+  warnings not covered by any ``CHECK:`` patterns.
+
 .. option:: -version
 
  Show the version number of this program.

diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst
index 4d84be6..2708e9d 100644
--- a/docs/CommandGuide/lit.rst
+++ b/docs/CommandGuide/lit.rst

@@ -84,6 +84,14 @@
 
  Do not use curses based progress bar.
 
+.. option:: --show-unsupported
+
+ Show the names of unsupported tests.
+
+.. option:: --show-xfail
+
+ Show the names of tests that were expected to fail.
+
 .. _execution-options:
 
 EXECUTION OPTIONS
@@ -262,7 +270,7 @@
 themselves are Python modules which will be executed.  When the config file is
 executed, two important global variables are predefined:
 
-**lit**
+**lit_config**
 
  The global **lit** configuration object (a *LitConfig* instance), which defines
  the builtin test formats, global configuration parameters, and other helper
@@ -307,14 +315,6 @@
  **root** The root configuration.  This is the top-most :program:`lit` configuration in
  the project.
 
- **on_clone** The config is actually cloned for every subdirectory inside a test
- suite, to allow local configuration on a per-directory basis.  The *on_clone*
- variable can be set to a Python function which will be called whenever a
- configuration is cloned (for a subdirectory).  The function should takes three
- arguments: (1) the parent configuration, (2) the new configuration (which the
- *on_clone* function will generally modify), and (3) the test path to the new
- directory being scanned.
-
  **pipefail** Normally a test using a shell pipe fails if any of the commands
  on the pipe fail. If this is not desired, setting this variable to false
  makes the test fail only if the last command in the pipe fails.

diff --git a/docs/CommandGuide/llvm-config.rst b/docs/CommandGuide/llvm-config.rst
index 0ebb344..34075d0 100644
--- a/docs/CommandGuide/llvm-config.rst
+++ b/docs/CommandGuide/llvm-config.rst

@@ -151,7 +151,7 @@
 
 **all**
 
- Includes all LLVM libaries.  The default if no components are specified.
+ Includes all LLVM libraries.  The default if no components are specified.
 
 
 

diff --git a/docs/CommandGuide/llvm-profdata.rst b/docs/CommandGuide/llvm-profdata.rst
index 6b8e4d7..0762e2c 100644
--- a/docs/CommandGuide/llvm-profdata.rst
+++ b/docs/CommandGuide/llvm-profdata.rst

@@ -1,29 +1,102 @@
-llvm-profdata - work with profile data
-======================================
+llvm-profdata - Profile data tool
+=================================
 
 SYNOPSIS
 --------
 
-:program:`llvm-profdata` [-output=output] file1 file2
+:program:`llvm-profdata` *command* [*args...*]
 
 DESCRIPTION
 -----------
 
-The experimental :program:`llvm-profdata` tool reads two profile data files
-generated by PGO instrumentation and generates a file with merged data.
+The :program:`llvm-profdata` tool is a small utility for working with profile
+data files.
 
-The profile data format itself is currently textual.
+COMMANDS
+--------
+
+* `merge <profdata_merge_>`_
+* `show <profdata_show_>`_
+
+.. program:: llvm-profdata merge
+
+.. _profdata_merge:
+
+MERGE
+-----
+
+SYNOPSIS
+^^^^^^^^
+
+:program:`llvm-profdata merge` [*options*] [*filenames...*]
+
+DESCRIPTION
+^^^^^^^^^^^
+
+:program:`llvm-profdata merge` takes several profile data files
+generated by PGO instrumentation and merges them together into a single
+indexed profile data file.
 
 OPTIONS
--------
+^^^^^^^
 
-.. option:: -output=output
+.. option:: -help
 
- This option selects the output filename.  If not specified, output is to
- stdout.
+ Print a summary of command line options.
+
+.. option:: -output=output, -o=output
+
+ Specify the output file name.  *Output* cannot be ``-`` as the resulting
+ indexed profile data can't be written to standard output.
+
+.. program:: llvm-profdata show
+
+.. _profdata_show:
+
+SHOW
+----
+
+SYNOPSIS
+^^^^^^^^
+
+:program:`llvm-profdata show` [*options*] [*filename*]
+
+DESCRIPTION
+^^^^^^^^^^^
+
+:program:`llvm-profdata show` takes a profile data file and displays the
+information about the profile counters for this file and
+for any of the specified function(s).
+
+If *filename* is omitted or is ``-``, then **llvm-profdata show** reads its
+input from standard input.
+
+OPTIONS
+^^^^^^^
+
+.. option:: -all-functions
+
+ Print details for every function.
+
+.. option:: -counts
+
+ Print the counter values for the displayed functions.
+
+.. option:: -function=string
+
+ Print details for a function if the function's name contains the given string.
+
+.. option:: -help
+
+ Print a summary of command line options.
+
+.. option:: -output=output, -o=output
+
+ Specify the output file name.  If *output* is ``-`` or it isn't specified,
+ then the output is sent to standard output.
 
 EXIT STATUS
 -----------
 
-:program:`llvm-profdata` returns 1 if it cannot read input files or there is a
-mismatch between their data.
+:program:`llvm-profdata` returns 1 if the command is omitted or is invalid,
+if it cannot read input files, or if there is a mismatch between their data.

diff --git a/docs/CommandGuide/llvm-symbolizer.rst b/docs/CommandGuide/llvm-symbolizer.rst
index ce2d9c0..96720e6 100644
--- a/docs/CommandGuide/llvm-symbolizer.rst
+++ b/docs/CommandGuide/llvm-symbolizer.rst

@@ -92,6 +92,13 @@
  input (see example above). If architecture is not specified in either way,
  address will not be symbolized. Defaults to empty string.
 
+.. option:: -dsym-hint=<path/to/file.dSYM>
+
+ (Darwin-only flag). If the debug info for a binary isn't present in the default
+ location, look for the debug info at the .dSYM path provided via the
+ ``-dsym-hint`` flag. This flag can be used multiple times.
+
+
 EXIT STATUS
 -----------
 

diff --git a/docs/CommandGuide/opt.rst b/docs/CommandGuide/opt.rst
index ad5b62c..3a050f7 100644
--- a/docs/CommandGuide/opt.rst
+++ b/docs/CommandGuide/opt.rst

@@ -62,27 +62,14 @@
  available.  The order in which the options occur on the command line is the
  order in which they are executed (within pass constraints).
 
-.. option:: -std-compile-opts
-
- This is short hand for a standard list of *compile time optimization* passes.
- It might be useful for other front end compilers as well.  To discover the
- full set of options available, use the following command:
-
- .. code-block:: sh
-
-     llvm-as < /dev/null | opt -std-compile-opts -disable-output -debug-pass=Arguments
-
 .. option:: -disable-inlining
 
- This option is only meaningful when :option:`-std-compile-opts` is given.  It
- simply removes the inlining pass from the standard list.
+ This option simply removes the inlining pass from the standard list.
 
 .. option:: -disable-opt
 
- This option is only meaningful when :option:`-std-compile-opts` is given.  It
- disables most, but not all, of the :option:`-std-compile-opts`.  The ones that
- remain are :option:`-verify`, :option:`-lower-setjmp`, and
- :option:`-funcresolve`.
+ This option is only meaningful when :option:`-std-link-opts` is given.  It
+ disables most passes.
 
 .. option:: -strip-debug
 
@@ -95,9 +82,7 @@
  This option causes opt to add a verify pass after every pass otherwise
  specified on the command line (including :option:`-verify`).  This is useful
  for cases where it is suspected that a pass is creating an invalid module but
- it is not clear which pass is doing it.  The combination of
- :option:`-std-compile-opts` and :option:`-verify-each` can quickly track down
- this kind of problem.
+ it is not clear which pass is doing it.
 
 .. option:: -stats
 

diff --git a/docs/CommandLine.rst b/docs/CommandLine.rst
index 1b342e3..1d85215 100644
--- a/docs/CommandLine.rst
+++ b/docs/CommandLine.rst

@@ -1630,13 +1630,13 @@
 
 .. code-block:: c++
 
-  struct FileSizeParser : public cl::basic_parser<unsigned> {
+  struct FileSizeParser : public cl::parser<unsigned> {
     // parse - Return true on error.
-    bool parse(cl::Option &O, const char *ArgName, const std::string &ArgValue,
+    bool parse(cl::Option &O, StringRef ArgName, const std::string &ArgValue,
                unsigned &Val);
   };
 
-Our new class inherits from the ``cl::basic_parser`` template class to fill in
+Our new class inherits from the ``cl::parser`` template class to fill in
 the default, boiler plate code for us.  We give it the data type that we parse
 into, the last argument to the ``parse`` method, so that clients of our custom
 parser know what object type to pass in to the parse method.  (Here we declare
@@ -1652,7 +1652,7 @@
 
 .. code-block:: c++
 
-  bool FileSizeParser::parse(cl::Option &O, const char *ArgName,
+  bool FileSizeParser::parse(cl::Option &O, StringRef ArgName,
                              const std::string &Arg, unsigned &Val) {
     const char *ArgStart = Arg.c_str();
     char *End;
@@ -1698,7 +1698,7 @@
   OPTIONS:
     -help                 - display available options (-help-hidden for more)
     ...
-   -max-file-size=<size> - Maximum file size to accept
+    -max-file-size=<size> - Maximum file size to accept
 
 And we can test that our parse works correctly now (the test program just prints
 out the max-file-size argument value):

diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst
index 606b5f5..a012c32 100644
--- a/docs/CompilerWriterInfo.rst
+++ b/docs/CompilerWriterInfo.rst

@@ -74,6 +74,7 @@
 * `AMD Evergreen shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_Evergreen-Family_Instruction_Set_Architecture.pdf>`_
 * `AMD Cayman/Trinity shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_HD_6900_Series_Instruction_Set_Architecture.pdf>`_
 * `AMD Southern Islands Series ISA <http://developer.amd.com/wordpress/media/2012/12/AMD_Southern_Islands_Instruction_Set_Architecture.pdf>`_
+* `AMD Sea Islands Series ISA <http://developer.amd.com/wordpress/media/2013/07/AMD_Sea_Islands_Instruction_Set_Architecture.pdf>`_
 * `AMD GPU Programming Guide <http://developer.amd.com/download/AMD_Accelerated_Parallel_Processing_OpenCL_Programming_Guide.pdf>`_
 * `AMD Compute Resources <http://developer.amd.com/tools/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/documentation/>`_
 

diff --git a/docs/CoverageMappingFormat.rst b/docs/CoverageMappingFormat.rst
new file mode 100644
index 0000000..8fcffb8
--- /dev/null
+++ b/docs/CoverageMappingFormat.rst

@@ -0,0 +1,576 @@
+.. role:: raw-html(raw)
+   :format: html
+
+=================================
+LLVM Code Coverage Mapping Format
+=================================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+LLVM's code coverage mapping format is used to provide code coverage
+analysis using LLVM's and Clang's instrumentation based profiling
+(Clang's ``-fprofile-instr-generate`` option).
+
+This document is aimed at those who use LLVM's code coverage mapping to provide
+code coverage analysis for their own programs, and for those who would like
+to know how it works under the hood. A prior knowledge of how Clang's profile
+guided optimization works is useful, but not required.
+
+We start by showing how to use LLVM and Clang for code coverage analysis,
+then we briefly describe LLVM's code coverage mapping format and the
+way that Clang and LLVM's code coverage tool work with this format. After
+the basics are down, more advanced features of the coverage mapping format
+are discussed - such as the data structures, LLVM IR representation and
+the binary encoding.
+
+Quick Start
+===========
+
+Here's a short story that describes how to generate code coverage overview
+for a sample source file called *test.c*.
+
+* First, compile an instrumented version of your program using Clang's
+  ``-fprofile-instr-generate`` option with the additional ``-fcoverage-mapping``
+  option:
+
+  ``clang -o test -fprofile-instr-generate -fcoverage-mapping test.c``
+* Then, run the instrumented binary. The runtime will produce a file called
+  *default.profraw* containing the raw profile instrumentation data:
+
+  ``./test``
+* After that, merge the profile data using the *llvm-profdata* tool:
+
+  ``llvm-profdata merge -o test.profdata default.profraw``
+* Finally, run LLVM's code coverage tool (*llvm-cov*) to produce the code
+  coverage overview for the sample source file:
+
+  ``llvm-cov show ./test -instr-profile=test.profdata test.c``
+
+High Level Overview
+===================
+
+LLVM's code coverage mapping format is designed to be a self contained
+data format, that can be embedded into the LLVM IR and object files.
+It's described in this document as a **mapping** format because its goal is
+to store the data that is required for a code coverage tool to map between
+the specific source ranges in a file and the execution counts obtained
+after running the instrumented version of the program.
+
+The mapping data is used in two places in the code coverage process:
+
+1. When clang compiles a source file with ``-fcoverage-mapping``, it
+   generates the mapping information that describes the mapping between the
+   source ranges and the profiling instrumentation counters.
+   This information gets embedded into the LLVM IR and conveniently
+   ends up in the final executable file when the program is linked.
+
+2. It is also used by *llvm-cov* - the mapping information is extracted from an
+   object file and is used to associate the execution counts (the values of the
+   profile instrumentation counters), and the source ranges in a file.
+   After that, the tool is able to generate various code coverage reports
+   for the program.
+
+The coverage mapping format aims to be a "universal format" that would be
+suitable for usage by any frontend, and not just by Clang. It also aims to
+provide the frontend the possibility of generating the minimal coverage mapping
+data in order to reduce the size of the IR and object files - for example,
+instead of emitting mapping information for each statement in a function, the
+frontend is allowed to group the statements with the same execution count into
+regions of code, and emit the mapping information only for those regions.
+
+Advanced Concepts
+=================
+
+The remainder of this guide is meant to give you insight into the way the
+coverage mapping format works.
+
+The coverage mapping format operates on a per-function level as the
+profile instrumentation counters are associated with a specific function.
+For each function that requires code coverage, the frontend has to create
+coverage mapping data that can map between the source code ranges and
+the profile instrumentation counters for that function.
+
+Mapping Region
+--------------
+
+The function's coverage mapping data contains an array of mapping regions.
+A mapping region stores the `source code range`_ that is covered by this region,
+the `file id <coverage file id_>`_, the `coverage mapping counter`_ and
+the region's kind.
+There are several kinds of mapping regions:
+
+* Code regions associate portions of source code and `coverage mapping
+  counters`_. They make up the majority of the mapping regions. They are used
+  by the code coverage tool to compute the execution counts for lines,
+  highlight the regions of code that were never executed, and to obtain
+  the various code coverage statistics for a function.
+  For example:
+
+  :raw-html:`<pre class='highlight' style='line-height:initial;'><span>int main(int argc, const char *argv[]) </span><span style='background-color:#4A789C'>{    </span> <span class='c1'>// Code Region from 1:40 to 9:2</span>
+  <span style='background-color:#4A789C'>                                            </span>
+  <span style='background-color:#4A789C'>  if (argc &gt; 1) </span><span style='background-color:#85C1F5'>{                         </span>   <span class='c1'>// Code Region from 3:17 to 5:4</span>
+  <span style='background-color:#85C1F5'>    printf("%s\n", argv[1]);              </span>
+  <span style='background-color:#85C1F5'>  }</span><span style='background-color:#4A789C'> else </span><span style='background-color:#F6D55D'>{                                </span>   <span class='c1'>// Code Region from 5:10 to 7:4</span>
+  <span style='background-color:#F6D55D'>    printf("\n");                         </span>
+  <span style='background-color:#F6D55D'>  }</span><span style='background-color:#4A789C'>                                         </span>
+  <span style='background-color:#4A789C'>  return 0;                                 </span>
+  <span style='background-color:#4A789C'>}</span>
+  </pre>`
+* Skipped regions are used to represent source ranges that were skipped
+  by Clang's preprocessor. They don't associate with
+  `coverage mapping counters`_, as the frontend knows that they are never
+  executed. They are used by the code coverage tool to mark the skipped lines
+  inside a function as non-code lines that don't have execution counts.
+  For example:
+
+  :raw-html:`<pre class='highlight' style='line-height:initial;'><span>int main() </span><span style='background-color:#4A789C'>{               </span> <span class='c1'>// Code Region from 1:12 to 6:2</span>
+  <span style='background-color:#85C1F5'>#ifdef DEBUG             </span>   <span class='c1'>// Skipped Region from 2:1 to 4:2</span>
+  <span style='background-color:#85C1F5'>  printf("Hello world"); </span>
+  <span style='background-color:#85C1F5'>#</span><span style='background-color:#4A789C'>endif                     </span>
+  <span style='background-color:#4A789C'>  return 0;                </span>
+  <span style='background-color:#4A789C'>}</span>
+  </pre>`
+* Expansion regions are used to represent Clang's macro expansions. They
+  have an additional property - *expanded file id*. This property can be
+  used by the code coverage tool to find the mapping regions that are created
+  as a result of this macro expansion, by checking if their file id matches the
+  expanded file id. They don't associate with `coverage mapping counters`_,
+  as the code coverage tool can determine the execution count for this region
+  by looking up the execution count of the first region with a corresponding
+  file id.
+  For example:
+
+  :raw-html:`<pre class='highlight' style='line-height:initial;'><span>int func(int x) </span><span style='background-color:#4A789C'>{                             </span>
+  <span style='background-color:#4A789C'>  #define MAX(x,y) </span><span style='background-color:#85C1F5'>((x) &gt; (y)? </span><span style='background-color:#F6D55D'>(x)</span><span style='background-color:#85C1F5'> : </span><span style='background-color:#F4BA70'>(y)</span><span style='background-color:#85C1F5'>)</span><span style='background-color:#4A789C'>     </span>
+  <span style='background-color:#4A789C'>  return </span><span style='background-color:#7FCA9F'>MAX</span><span style='background-color:#4A789C'>(x, 42);                          </span> <span class='c1'>// Expansion Region from 3:10 to 3:13</span>
+  <span style='background-color:#4A789C'>}</span>
+  </pre>`
+
+.. _source code range:
+
+Source Range:
+^^^^^^^^^^^^^
+
+The source range record contains the starting and ending location of a certain
+mapping region. Both locations include the line and the column numbers.
+
+.. _coverage file id:
+
+File ID:
+^^^^^^^^
+
+The file id is an integer value that tells us
+in which source file or macro expansion this region is located.
+It enables Clang to produce mapping information for the code
+defined inside macros, like this example demonstrates:
+
+:raw-html:`<pre class='highlight' style='line-height:initial;'><span>void func(const char *str) </span><span style='background-color:#4A789C'>{        </span> <span class='c1'>// Code Region from 1:28 to 6:2 with file id 0</span>
+<span style='background-color:#4A789C'>  #define PUT </span><span style='background-color:#85C1F5'>printf("%s\n", str)</span><span style='background-color:#4A789C'>   </span> <span class='c1'>// 2 Code Regions from 2:15 to 2:34 with file ids 1 and 2</span>
+<span style='background-color:#4A789C'>  if(*str)                          </span>
+<span style='background-color:#4A789C'>    </span><span style='background-color:#F6D55D'>PUT</span><span style='background-color:#4A789C'>;                            </span> <span class='c1'>// Expansion Region from 4:5 to 4:8 with file id 0 that expands a macro with file id 1</span>
+<span style='background-color:#4A789C'>  </span><span style='background-color:#F6D55D'>PUT</span><span style='background-color:#4A789C'>;                              </span> <span class='c1'>// Expansion Region from 5:3 to 5:6 with file id 0 that expands a macro with file id 2</span>
+<span style='background-color:#4A789C'>}</span>
+</pre>`
+
+.. _coverage mapping counter:
+.. _coverage mapping counters:
+
+Counter:
+^^^^^^^^
+
+A coverage mapping counter can represent a reference to the profile
+instrumentation counter. The execution count for a region with such a counter
+is determined by looking up the value of the corresponding profile
+instrumentation counter.
+
+It can also represent a binary arithmetical expression that operates on
+coverage mapping counters or other expressions.
+The execution count for a region with an expression counter is determined by
+evaluating the expression's arguments and then adding them together or
+subtracting them from one another.
+In the example below, a subtraction expression is used to compute the execution
+count for the compound statement that follows the *else* keyword:
+
+:raw-html:`<pre class='highlight' style='line-height:initial;'><span>int main(int argc, const char *argv[]) </span><span style='background-color:#4A789C'>{   </span> <span class='c1'>// Region's counter is a reference to the profile counter #0</span>
+<span style='background-color:#4A789C'>                                           </span>
+<span style='background-color:#4A789C'>  if (argc &gt; 1) </span><span style='background-color:#85C1F5'>{                        </span>   <span class='c1'>// Region's counter is a reference to the profile counter #1</span>
+<span style='background-color:#85C1F5'>    printf("%s\n", argv[1]);             </span><span>   </span>
+<span style='background-color:#85C1F5'>  }</span><span style='background-color:#4A789C'> else </span><span style='background-color:#F6D55D'>{                               </span>   <span class='c1'>// Region's counter is an expression (reference to the profile counter #0 - reference to the profile counter #1)</span>
+<span style='background-color:#F6D55D'>    printf("\n");                        </span>
+<span style='background-color:#F6D55D'>  }</span><span style='background-color:#4A789C'>                                        </span>
+<span style='background-color:#4A789C'>  return 0;                                </span>
+<span style='background-color:#4A789C'>}</span>
+</pre>`
+
+Finally, a coverage mapping counter can also represent an execution count of
+zero. The zero counter is used to provide coverage mapping for
+unreachable statements and expressions, like in the example below:
+
+:raw-html:`<pre class='highlight' style='line-height:initial;'><span>int main() </span><span style='background-color:#4A789C'>{                  </span>
+<span style='background-color:#4A789C'>  return 0;                   </span>
+<span style='background-color:#4A789C'>  </span><span style='background-color:#85C1F5'>printf("Hello world!\n")</span><span style='background-color:#4A789C'>;   </span> <span class='c1'>// Unreachable region's counter is zero</span>
+<span style='background-color:#4A789C'>}</span>
+</pre>`
+
+The zero counters allow the code coverage tool to display proper line execution
+counts for the unreachable lines and highlight the unreachable code.
+Without them, the tool would think that those lines and regions were still
+executed, as it doesn't possess the frontend's knowledge.
+
+LLVM IR Representation
+======================
+
+The coverage mapping data is stored in the LLVM IR using a single global
+constant structure variable called *__llvm_coverage_mapping*
+with the *__llvm_covmap* section specifier.
+
+For example, let’s consider a C file and how it gets compiled to LLVM:
+
+.. _coverage mapping sample:
+
+.. code-block:: c
+
+  int foo() {
+    return 42;
+  }
+  int bar() {
+    return 13;
+  }
+
+The coverage mapping variable generated by Clang is:
+
+.. code-block:: llvm
+
+  @__llvm_coverage_mapping = internal constant { i32, i32, i32, i32, [2 x { i8*, i32, i32 }], [40 x i8] }
+  { i32 2,  ; The number of function records
+    i32 20, ; The length of the string that contains the encoded translation unit filenames
+    i32 20, ; The length of the string that contains the encoded coverage mapping data
+    i32 0,  ; Coverage mapping format version
+    [2 x { i8*, i32, i32 }] [ ; Function records
+     { i8*, i32, i32 } { i8* getelementptr inbounds ([3 x i8]* @__llvm_profile_name_foo, i32 0, i32 0), ; Function's name
+       i32 3, ; Function's name length
+       i32 9  ; Function's encoded coverage mapping data string length
+     },
+     { i8*, i32, i32 } { i8* getelementptr inbounds ([3 x i8]* @__llvm_profile_name_bar, i32 0, i32 0), ; Function's name
+       i32 3, ; Function's name length
+       i32 9  ; Function's encoded coverage mapping data string length
+     }],
+   [40 x i8] c"..." ; Encoded data (dissected later)
+  }, section "__llvm_covmap", align 8
+
+Version:
+--------
+
+The coverage mapping version number can have the following values:
+
+* 0 — The first (current) version of the coverage mapping format.
+
+.. _function records:
+
+Function record:
+----------------
+
+A function record is a structure of the following type:
+
+.. code-block:: llvm
+
+  { i8*, i32, i32 }
+
+It contains the pointer to the function's name, function's name length,
+and the length of the encoded mapping data for that function.
+
+Encoded data:
+-------------
+
+The encoded data is stored in a single string that contains
+the encoded filenames used by this translation unit and the encoded coverage
+mapping data for each function in this translation unit.
+
+The encoded data has the following structure:
+
+``[filenames, coverageMappingDataForFunctionRecord0, coverageMappingDataForFunctionRecord1, ..., padding]``
+
+If necessary, the encoded data is padded with zeroes so that the size
+of the data string is rounded up to the nearest multiple of 8 bytes.
+
+Dissecting the sample:
+^^^^^^^^^^^^^^^^^^^^^^
+
+Here's an overview of the encoded data that was stored in the
+IR for the `coverage mapping sample`_ that was shown earlier:
+
+* The IR contains the following string constant that represents the encoded
+  coverage mapping data for the sample translation unit:
+
+  .. code-block:: llvm
+
+    c"\01\12/Users/alex/test.c\01\00\00\01\01\01\0C\02\02\01\00\00\01\01\04\0C\02\02\00\00"
+
+* The string contains values that are encoded in the LEB128 format, which is
+  used throughout for storing integers. It also contains a string value.
+
+* The length of the substring that contains the encoded translation unit
+  filenames is the value of the second field in the *__llvm_coverage_mapping*
+  structure, which is 20, thus the filenames are encoded in this string:
+
+  .. code-block:: llvm
+
+    c"\01\12/Users/alex/test.c"
+
+  This string contains the following data:
+
+  * Its first byte has a value of ``0x01``. It stores the number of filenames
+    contained in this string.
+  * Its second byte stores the length of the first filename in this string.
+  * The remaining 18 bytes are used to store the first filename.
+
+* The length of the substring that contains the encoded coverage mapping data
+  for the first function is the value of the third field in the first
+  structure in an array of `function records`_ stored in the
+  fifth field of the *__llvm_coverage_mapping* structure, which is 9.
+  Therefore, the coverage mapping for the first function record is encoded
+  in this string:
+
+  .. code-block:: llvm
+
+    c"\01\00\00\01\01\01\0C\02\02"
+
+  This string consists of the following bytes:
+
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+  | ``0x01`` | The number of file ids used by this function. There is only one file id used by the mapping data in this function.      |
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+  | ``0x00`` | An index into the filenames array which corresponds to the file "/Users/alex/test.c".                                   |
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+  | ``0x00`` | The number of counter expressions used by this function. This function doesn't use any expressions.                     |
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+  | ``0x01`` | The number of mapping regions that are stored in an array for the function's file id #0.                                |
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+  | ``0x01`` | The coverage mapping counter for the first region in this function. The value of 1 tells us that it's a coverage        |
+  |          | mapping counter that is a reference to the profile instrumentation counter with an index of 0.                          |
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+  | ``0x01`` | The starting line of the first mapping region in this function.                                                         |
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+  | ``0x0C`` | The starting column of the first mapping region in this function.                                                       |
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+  | ``0x02`` | The ending line of the first mapping region in this function.                                                           |
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+  | ``0x02`` | The ending column of the first mapping region in this function.                                                         |
+  +----------+-------------------------------------------------------------------------------------------------------------------------+
+
+* The length of the substring that contains the encoded coverage mapping data
+  for the second function record is also 9. It's structured like the mapping data
+  for the first function record.
+
+* The two trailing bytes are zeroes and are used to pad the coverage mapping
+  data to give it the 8 byte alignment.
+
+Encoding
+========
+
+The per-function coverage mapping data is encoded as a stream of bytes,
+with a simple structure. The structure consists of the encoding
+`types <cvmtypes_>`_ like variable-length unsigned integers, that
+are used to encode `File ID Mapping`_, `Counter Expressions`_ and
+the `Mapping Regions`_.
+
+The format of the structure follows:
+
+  ``[file id mapping, counter expressions, mapping regions]``
+
+The translation unit filenames are encoded using the same encoding
+`types <cvmtypes_>`_ as the per-function coverage mapping data, with the
+following structure:
+
+  ``[numFilenames : LEB128, filename0 : string, filename1 : string, ...]``
+
+.. _cvmtypes:
+
+Types
+-----
+
+This section describes the basic types that are used by the encoding format
+and can appear after ``:`` in the ``[foo : type]`` description.
+
+.. _LEB128:
+
+LEB128
+^^^^^^
+
+LEB128 is an unsigned integer value that is encoded using DWARF's LEB128
+encoding, optimizing for the case where values are small
+(1 byte for values less than 128).
+
+.. _strings:
+
+Strings
+^^^^^^^
+
+``[length : LEB128, characters...]``
+
+String values are encoded with a `LEB value <LEB128_>`_ for the length
+of the string and a sequence of bytes for its characters.
+
+.. _file id mapping:
+
+File ID Mapping
+---------------
+
+``[numIndices : LEB128, filenameIndex0 : LEB128, filenameIndex1 : LEB128, ...]``
+
+File id mapping in a function's coverage mapping stream
+contains the indices into the translation unit's filenames array.
+
+Counter
+-------
+
+``[value : LEB128]``
+
+A `coverage mapping counter`_ is stored in a single `LEB value <LEB128_>`_.
+It is composed of two things --- the `tag <counter-tag_>`_
+which is stored in the lowest 2 bits, and the `counter data`_ which is stored
+in the remaining bits.
+
+.. _counter-tag:
+
+Tag:
+^^^^
+
+The counter's tag encodes the counter's kind
+and, if the counter is an expression, the expression's kind.
+The possible tag values are:
+
+* 0 - The counter is zero.
+
+* 1 - The counter is a reference to the profile instrumentation counter.
+
+* 2 - The counter is a subtraction expression.
+
+* 3 - The counter is an addition expression.
+
+.. _counter data:
+
+Data:
+^^^^^
+
+The counter's data is interpreted in the following manner:
+
+* When the counter is a reference to the profile instrumentation counter,
+  then the counter's data is the id of the profile counter.
+* When the counter is an expression, then the counter's data
+  is the index into the array of counter expressions.
+
+.. _Counter Expressions:
+
+Counter Expressions
+-------------------
+
+``[numExpressions : LEB128, expr0LHS : LEB128, expr0RHS : LEB128, expr1LHS : LEB128, expr1RHS : LEB128, ...]``
+
+Counter expressions consist of two counters as they
+represent binary arithmetic operations.
+The expression's kind is determined from the `tag <counter-tag_>`_ of the
+counter that references this expression.
+
+.. _Mapping Regions:
+
+Mapping Regions
+---------------
+
+``[numRegionArrays : LEB128, regionsForFile0, regionsForFile1, ...]``
+
+The mapping regions are stored in an array of sub-arrays where every
+region in a particular sub-array has the same file id.
+
+The file id for a sub-array of regions is the index of that
+sub-array in the main array, e.g. the first sub-array will have the file id
+of 0.
+
+Sub-Array of Regions
+^^^^^^^^^^^^^^^^^^^^
+
+``[numRegions : LEB128, region0, region1, ...]``
+
+The mapping regions for a specific file id are stored in an array that is
+sorted in an ascending order by the region's starting location.
+
+Mapping Region
+^^^^^^^^^^^^^^
+
+``[header, source range]``
+
+The mapping region record contains two sub-records ---
+the `header`_, which stores the counter and/or the region's kind,
+and the `source range`_ that contains the starting and ending
+location of this region.
+
+.. _header:
+
+Header
+^^^^^^
+
+``[counter]``
+
+or
+
+``[pseudo-counter]``
+
+The header encodes the region's counter and the region's kind.
+
+The value of the counter's tag distinguishes between the counters and
+pseudo-counters --- if the tag is zero, then this header contains a
+pseudo-counter, otherwise this header contains an ordinary counter.
+
+Counter:
+""""""""
+
+A mapping region whose header has a counter with a non-zero tag is
+a code region.
+
+Pseudo-Counter:
+"""""""""""""""
+
+``[value : LEB128]``
+
+A pseudo-counter is stored in a single `LEB value <LEB128_>`_, just like
+the ordinary counter. It has the following interpretation:
+
+* bits 0-1: tag, which is always 0.
+
+* bit 2: expansionRegionTag. If this bit is set, then this mapping region
+  is an expansion region.
+
+* remaining bits: data. If this region is an expansion region, then the data
+  contains the expanded file id of that region.
+
+  Otherwise, the data contains the region's kind. The possible region
+  kind values are:
+
+  * 0 - This mapping region is a code region with a counter of zero.
+  * 2 - This mapping region is a skipped region.
+
+.. _source range:
+
+Source Range
+^^^^^^^^^^^^
+
+``[deltaLineStart : LEB128, columnStart : LEB128, numLines : LEB128, columnEnd : LEB128]``
+
+The source range record contains the following fields:
+
+* *deltaLineStart*: The difference between the starting line of the
+  current mapping region and the starting line of the previous mapping region.
+
+  If the current mapping region is the first region in the current
+  sub-array, then it stores the starting line of that region.
+
+* *columnStart*: The starting column of the mapping region.
+
+* *numLines*: The difference between the ending line and the starting line
+  of the current mapping region.
+
+* *columnEnd*: The ending column of the mapping region.

diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst
index 74a8979..508a04f 100644
--- a/docs/DeveloperPolicy.rst
+++ b/docs/DeveloperPolicy.rst

@@ -436,6 +436,29 @@
 a patch privately, encourage them to submit it to the appropriate list first.
 
 
+IR Backwards Compatibility
+--------------------------
+
+When the IR format has to be changed, keep in mind that we try to maintain some
+backwards compatibility. The rules are intended as a balance between convenience
+for llvm users and not imposing a big burden on llvm developers:
+
+* The textual format is not backwards compatible. We don't change it too often,
+  but there are no specific promises.
+
+* The bitcode format produced by a X.Y release will be readable by all following
+  X.Z releases and the (X+1).0 release.
+
+* Newer releases can ignore features from older releases, but they cannot
+  miscompile them. For example, if nsw is ever replaced with something else,
+  dropping it would be a valid way to upgrade the IR.
+
+* Debug metadata is special in that it is currently dropped during upgrades.
+
+* Non-debug metadata is defined to be safe to drop, so a valid way to upgrade
+  it is to drop it. That is not very user friendly and a bit more effort is
+  expected, but no promises are made.
+
 .. _copyright-license-patents:
 
 Copyright, License, and Patents

diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst
index dc6dab1..49d3496 100644
--- a/docs/GarbageCollection.rst
+++ b/docs/GarbageCollection.rst

@@ -923,7 +923,7 @@
 
   void MyGCPrinter::finishAssembly(AsmPrinter &AP) {
     MCStreamer &OS = AP.OutStreamer;
-    unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
+    unsigned IntPtrSize = AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
 
     // Put this in the data section.
     OS.SwitchSection(AP.getObjFileLowering().getDataSection());

diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index 6de9b90..140210d 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst

@@ -115,7 +115,6 @@
 ================== ===================== =============
 OS                 Arch                  Compilers               
 ================== ===================== =============
-AuroraUX           x86\ :sup:`1`         GCC                     
 Linux              x86\ :sup:`1`         GCC, Clang              
 Linux              amd64                 GCC, Clang              
 Linux              ARM\ :sup:`4`         GCC, Clang              
@@ -331,10 +330,23 @@
 .. _GCC wiki entry:
   http://gcc.gnu.org/wiki/InstallingGCC
 
-Once you have a GCC toolchain, use it as your host compiler. Things should
-generally "just work". You may need to pass a special linker flag,
-``-Wl,-rpath,$HOME/toolchains/lib`` or some variant thereof to get things to
-find the libstdc++ DSO in this toolchain.
+Once you have a GCC toolchain, configure your build of LLVM to use the new
+toolchain for your host compiler and C++ standard library. Because the new
+version of libstdc++ is not on the system library search path, you need to pass
+extra linker flags so that it can be found at link time (``-L``) and at runtime
+(``-rpath``). If you are using CMake, this invocation should produce working
+binaries:
+
+.. code-block:: console
+
+  % mkdir build
+  % cd build
+  % CC=$HOME/toolchains/bin/gcc CXX=$HOME/toolchains/bin/g++ \
+    cmake .. -DCMAKE_CXX_LINK_FLAGS="-Wl,-rpath,$HOME/toolchains/lib64 -L$HOME/toolchains/lib64"
+
+If you fail to set rpath, most LLVM binaries will fail on startup with a message
+from the loader similar to ``libstdc++.so.6: version `GLIBCXX_3.4.20' not
+found``. This means you need to tweak the -rpath linker flag.
 
 When you build Clang, you will need to give *it* access to modern C++11
 standard library in order to use it as your new host in part of a bootstrap.
@@ -713,13 +725,6 @@
   generating the documentation can take a long time and producess 100s of
   megabytes of output.
 
-``--with-udis86``
-
-  LLVM can use external disassembler library for various purposes (now it's used
-  only for examining code produced by JIT). This option will enable usage of
-  `udis86 <http://udis86.sourceforge.net/>`_ x86 (both 32 and 64 bits)
-  disassembler library.
-
 To configure LLVM, follow these steps:
 
 #. Change directory into the object root directory:
@@ -1013,7 +1018,7 @@
 almost all code exists in libraries, making it very easy to share code among the
 different `tools`_.
 
-``llvm/lib/VMCore/``
+``llvm/lib/IR/``
 
   This directory holds the core LLVM source files that implement core classes
   like Instruction and BasicBlock.

diff --git a/docs/GoldPlugin.rst b/docs/GoldPlugin.rst
index 28b202a..6328934 100644
--- a/docs/GoldPlugin.rst
+++ b/docs/GoldPlugin.rst

@@ -44,9 +44,11 @@
   the ``-plugin`` option. Running ``make`` will additionally build
   ``build/binutils/ar`` and ``nm-new`` binaries supporting plugins.
 
-* Build the LLVMgold plugin: Configure LLVM with
-  ``--with-binutils-include=/path/to/binutils/include`` and run
-  ``make``.
+* Build the LLVMgold plugin.  If building with autotools, run configure with
+  ``--with-binutils-include=/path/to/binutils/include`` and run ``make``.
+  If building with CMake, run cmake with
+  ``-DLLVM_BINUTILS_INCDIR=/path/to/binutils/include``.  The correct include
+  path will contain the file ``plugin-api.h``.
 
 Usage
 =====

diff --git a/docs/HowToReleaseLLVM.rst b/docs/HowToReleaseLLVM.rst
index 61aa9e8..26e9f3b 100644
--- a/docs/HowToReleaseLLVM.rst
+++ b/docs/HowToReleaseLLVM.rst

@@ -146,25 +146,25 @@
 
 ::
 
-  $ svn mkdir https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XY
+  $ svn mkdir https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XYZ
   $ svn copy https://llvm.org/svn/llvm-project/llvm/branches/release_XY \
-             https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XY/rc1
+             https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XYZ/rc1
 
-  $ svn mkdir https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XY
+  $ svn mkdir https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XYZ
   $ svn copy https://llvm.org/svn/llvm-project/cfe/branches/release_XY \
-             https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XY/rc1
+             https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XYZ/rc1
 
-  $ svn mkdir https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XY
+  $ svn mkdir https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XYZ
   $ svn copy https://llvm.org/svn/llvm-project/dragonegg/branches/release_XY \
-             https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XY/rc1
+             https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XYZ/rc1
 
-  $ svn mkdir https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XY
+  $ svn mkdir https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XYZ
   $ svn copy https://llvm.org/svn/llvm-project/test-suite/branches/release_XY \
-             https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XY/rc1
+             https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XYZ/rc1
 
 Similarly, **Release Candidate 2** would be named ``RC2`` and so on.  This keeps
 a permanent copy of the release candidate around for people to export and build
-as they wish.  The final released sources will be tagged in the ``RELEASE_XY``
+as they wish.  The final released sources will be tagged in the ``RELEASE_XYZ``
 directory as ``Final`` (c.f. :ref:`tag`).
 
 The Release Manager may supply pre-packaged source tarballs for users.  This can
@@ -172,10 +172,10 @@
 
 ::
 
-  $ svn export https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XY/rc1 llvm-X.Yrc1
-  $ svn export https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XY/rc1 clang-X.Yrc1
-  $ svn export https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XY/rc1 dragonegg-X.Yrc1
-  $ svn export https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XY/rc1 llvm-test-X.Yrc1
+  $ svn export https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XYZ/rc1 llvm-X.Yrc1
+  $ svn export https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XYZ/rc1 clang-X.Yrc1
+  $ svn export https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XYZ/rc1 dragonegg-X.Yrc1
+  $ svn export https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XYZ/rc1 llvm-test-X.Yrc1
 
   $ tar -cvf - llvm-X.Yrc1        | gzip > llvm-X.Yrc1.src.tar.gz
   $ tar -cvf - clang-X.Yrc1       | gzip > clang-X.Yrc1.src.tar.gz
@@ -389,16 +389,16 @@
 ::
 
   $ svn copy https://llvm.org/svn/llvm-project/llvm/branches/release_XY \
-             https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XY/Final
+             https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_XYZ/Final
 
   $ svn copy https://llvm.org/svn/llvm-project/cfe/branches/release_XY \
-             https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XY/Final
+             https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_XYZ/Final
 
   $ svn copy https://llvm.org/svn/llvm-project/dragonegg/branches/release_XY \
-             https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XY/Final
+             https://llvm.org/svn/llvm-project/dragonegg/tags/RELEASE_XYZ/Final
 
   $ svn copy https://llvm.org/svn/llvm-project/test-suite/branches/release_XY \
-             https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XY/Final
+             https://llvm.org/svn/llvm-project/test-suite/tags/RELEASE_XYZ/Final
 
 Update the LLVM Demo Page
 -------------------------

diff --git a/docs/HowToSubmitABug.rst b/docs/HowToSubmitABug.rst
index 702dc0c..9f997d2 100644
--- a/docs/HowToSubmitABug.rst
+++ b/docs/HowToSubmitABug.rst

@@ -89,7 +89,7 @@
 
 .. code-block:: bash
 
-   opt -std-compile-opts -debug-pass=Arguments foo.bc -disable-output
+   opt -O3 -debug-pass=Arguments foo.bc -disable-output
 
 This command should do two things: it should print out a list of passes, and
 then it should crash in the same way as clang.  If it doesn't crash, please

diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index cc9656a..3b7d80b 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst

@@ -75,11 +75,12 @@
 #. Named values are represented as a string of characters with their
    prefix. For example, ``%foo``, ``@DivisionByZero``,
    ``%a.really.long.identifier``. The actual regular expression used is
-   '``[%@][a-zA-Z$._][a-zA-Z$._0-9]*``'. Identifiers which require other
+   '``[%@][a-zA-Z$._][a-zA-Z$._0-9]*``'. Identifiers that require other
    characters in their names can be surrounded with quotes. Special
    characters may be escaped using ``"\xx"`` where ``xx`` is the ASCII
    code for the character in hexadecimal. In this way, any character can
-   be used in a name value, even quotes themselves.
+   be used in a name value, even quotes themselves. The ``"\01"`` prefix
+   can be used on global variables to suppress mangling.
 #. Unnamed values are represented as an unsigned numeric value with
    their prefix. For example, ``%12``, ``@2``, ``%44``.
 #. Constants, which are described in the section  Constants_ below.
@@ -128,9 +129,10 @@
 #. Unnamed temporaries are created when the result of a computation is
    not assigned to a named value.
 #. Unnamed temporaries are numbered sequentially (using a per-function
-   incrementing counter, starting with 0). Note that basic blocks are
-   included in this numbering. For example, if the entry basic block is not
-   given a label name, then it will get number 0.
+   incrementing counter, starting with 0). Note that basic blocks and unnamed
+   function parameters are included in this numbering. For example, if the
+   entry basic block is not given a label name and all function parameters are
+   named, then it will get number 0.
 
 It also shows a convention that we follow in this document. When
 demonstrating instructions, we will follow an instruction with a comment
@@ -168,8 +170,8 @@
     }
 
     ; Named metadata
-    !1 = metadata !{i32 42}
-    !foo = !{!1, null}
+    !0 = metadata !{i32 42, null, metadata !"string"}
+    !foo = !{!0}
 
 This example is made up of a :ref:`global variable <globalvars>` named
 "``.str``", an external declaration of the "``puts``" function, a
@@ -500,7 +502,7 @@
 LLVM IR allows you to specify both "identified" and "literal" :ref:`structure
 types <t_struct>`.  Literal types are uniqued structurally, but identified types
 are never uniqued.  An :ref:`opaque structural type <t_opaque>` can also be used
-to forward declare a type which is not yet available.
+to forward declare a type that is not yet available.
 
 An example of a identified structure specification is:
 
@@ -582,7 +584,7 @@
 case, the extra alignment could be observable: for example, code could
 assume that the globals are densely packed in their section and try to
 iterate over them as an array, alignment padding would break this
-iteration.
+iteration. The maximum alignment is ``1 << 29``.
 
 Globals can also have a :ref:`DLL storage class <dllstorageclass>`.
 
@@ -680,6 +682,14 @@
            [unnamed_addr] [fn Attrs] [section "name"] [comdat $<ComdatName>]
            [align N] [gc] [prefix Constant] { ... }
 
+The argument list is a comma separated sequence of arguments where each
+argument is of the following form
+
+Syntax::
+
+   <type> [parameter Attrs] [name]
+
+
 .. _langref_aliases:
 
 Aliases
@@ -697,7 +707,7 @@
 
 Syntax::
 
-    @<Name> = [Visibility] [DLLStorageClass] [ThreadLocal] [unnamed_addr] alias [Linkage] <AliaseeTy> @<Aliasee>
+    @<Name> = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal] [unnamed_addr] alias <AliaseeTy> @<Aliasee>
 
 The linkage must be one of ``private``, ``internal``, ``linkonce``, ``weak``,
 ``linkonce_odr``, ``weak_odr``, ``external``. Note that some system linkers
@@ -727,7 +737,7 @@
 
 Comdat IR provides access to COFF and ELF object file COMDAT functionality.
 
-Comdats have a name which represents the COMDAT key.  All global objects which
+Comdats have a name which represents the COMDAT key.  All global objects that
 specify this key will only end up in the final object file if the linker chooses
 that key over some other key.  Aliases are placed in the same COMDAT that their
 aliasee computes to, if any.
@@ -773,7 +783,7 @@
 ``IMAGE_COMDAT_SELECT_LARGEST`` containing the contents of the ``@foo`` symbol
 and another COMDAT section with selection kind
 ``IMAGE_COMDAT_SELECT_ASSOCIATIVE`` which is associated with the first COMDAT
-section and contains the contents of the ``@baz`` symbol.
+section and contains the contents of the ``@bar`` symbol.
 
 There are some restrictions on the properties of the global object.
 It, or an alias to it, must have the same name as the COMDAT group when
@@ -891,7 +901,7 @@
     address of outgoing stack arguments.  An ``inalloca`` argument must
     be a pointer to stack memory produced by an ``alloca`` instruction.
     The alloca, or argument allocation, must also be tagged with the
-    inalloca keyword.  Only the past argument may have the ``inalloca``
+    inalloca keyword.  Only the last argument may have the ``inalloca``
     attribute, and that argument is guaranteed to be passed in memory.
 
     An argument allocation may be used by a call at most once because
@@ -921,11 +931,18 @@
     the first parameter. This is not a valid attribute for return
     values.
 
+``align <n>``
+    This indicates that the pointer value may be assumed by the optimizer to
+    have the specified alignment.
+
+    Note that this attribute has additional semantics when combined with the
+    ``byval`` attribute.
+
 .. _noalias:
 
 ``noalias``
     This indicates that pointer values :ref:`based <pointeraliasing>` on
-    the argument or return value do not alias pointer values which are
+    the argument or return value do not alias pointer values that are
     not *based* on it, ignoring certain "irrelevant" dependencies. For a
     call to the parent function, dependencies between memory references
     from before or after the call and from those during the call are
@@ -969,6 +986,17 @@
     passed in is non-null, or the callee must ensure that the returned pointer 
     is non-null.
 
+``dereferenceable(<n>)``
+    This indicates that the parameter or return pointer is dereferenceable. This
+    attribute may only be applied to pointer typed parameters. A pointer that
+    is dereferenceable can be loaded from speculatively without a risk of
+    trapping. The number of bytes known to be dereferenceable must be provided
+    in parentheses. It is legal for the number of bytes to be less than the
+    size of the pointee type. The ``nonnull`` attribute does not imply
+    dereferenceability (consider a pointer to one element past the end of an
+    array), however ``dereferenceable(<n>)`` does imply ``nonnull`` in
+    ``addrspace(0)`` (which is the default address space).
+
 .. _gc:
 
 Garbage Collector Names
@@ -982,7 +1010,7 @@
     define void @f() gc "name" { ... }
 
 The compiler declares the supported values of *name*. Specifying a
-collector which will cause the compiler to alter its output in order to
+collector will cause the compiler to alter its output in order to
 support the named garbage collection algorithm.
 
 .. _prefixdata:
@@ -1098,7 +1126,7 @@
     This indicates that the callee function at a call site should be
     recognized as a built-in function, even though the function's declaration
     uses the ``nobuiltin`` attribute. This is only valid at call sites for
-    direct calls to functions which are declared with the ``nobuiltin``
+    direct calls to functions that are declared with the ``nobuiltin``
     attribute.
 ``cold``
     This attribute indicates that this function is rarely called. When
@@ -1593,7 +1621,7 @@
 
 -  If R is volatile, the result is target-dependent. (Volatile is
    supposed to give guarantees which can support ``sig_atomic_t`` in
-   C/C++, and may be used for accesses to addresses which do not behave
+   C/C++, and may be used for accesses to addresses that do not behave
    like normal memory. It does not generally provide cross-thread
    synchronization.)
 -  Otherwise, if there is no write to the same byte that happens before
@@ -1681,7 +1709,7 @@
     address. This corresponds to the C++0x/C1x ``memory_order_acq_rel``.
 ``seq_cst`` (sequentially consistent)
     In addition to the guarantees of ``acq_rel`` (``acquire`` for an
-    operation which only reads, ``release`` for an operation which only
+    operation that only reads, ``release`` for an operation that only
     writes), there is a global total order on all
     sequentially-consistent operations on all addresses, which is
     consistent with the *happens-before* partial order and with the
@@ -1730,6 +1758,52 @@
    dramatically change results in floating point (e.g. reassociate). This
    flag implies all the others.
 
+.. _uselistorder:
+
+Use-list Order Directives
+-------------------------
+
+Use-list directives encode the in-memory order of each use-list, allowing the
+order to be recreated.  ``<order-indexes>`` is a comma-separated list of
+indexes that are assigned to the referenced value's uses.  The referenced
+value's use-list is immediately sorted by these indexes.
+
+Use-list directives may appear at function scope or global scope.  They are not
+instructions, and have no effect on the semantics of the IR.  When they're at
+function scope, they must appear after the terminator of the final basic block.
+
+If basic blocks have their address taken via ``blockaddress()`` expressions,
+``uselistorder_bb`` can be used to reorder their use-lists from outside their
+function's scope.
+
+:Syntax:
+
+::
+
+    uselistorder <ty> <value>, { <order-indexes> }
+    uselistorder_bb @function, %block, { <order-indexes> }
+
+:Examples:
+
+::
+
+    define void @foo(i32 %arg1, i32 %arg2) {
+    entry:
+      ; ... instructions ...
+    bb:
+      ; ... instructions ...
+
+      ; At function scope.
+      uselistorder i32 %arg1, { 1, 0, 2 }
+      uselistorder label %bb, { 1, 0 }
+    }
+
+    ; At global scope.
+    uselistorder i32* @global, { 1, 2, 0 }
+    uselistorder i32 7, { 1, 0 }
+    uselistorder i32 (i32) @bar, { 1, 0 }
+    uselistorder_bb @foo, %bb, { 5, 1, 3, 2, 0, 4 }
+
 .. _typesystem:
 
 Type System
@@ -1948,8 +2022,8 @@
       < <# elements> x <elementtype> >
 
 The number of elements is a constant integer value larger than 0;
-elementtype may be any integer or floating point type, or a pointer to
-these types. Vectors of size zero are not allowed.
+elementtype may be any integer, floating point or pointer type. Vectors
+of size zero are not allowed.
 
 :Examples:
 
@@ -2202,7 +2276,9 @@
     square brackets (``[]``)). For example:
     "``[ i32 42, i32 11, i32 74 ]``". Array constants must have
     :ref:`array type <t_array>`, and the number and types of elements must
-    match those specified by the type.
+    match those specified by the type. As a special case, character array
+    constants may also be represented as a double-quoted string using the ``c``
+    prefix. For example: "``c"Hello World\0A\00"``".
 **Vector constants**
     Vector constants are represented with notation similar to vector
     type definitions (a comma separated list of elements, surrounded by
@@ -2319,7 +2395,7 @@
       %C = xor %B, %B
 
       %D = undef
-      %E = icmp lt %D, 4
+      %E = icmp slt %D, 4
       %F = icmp gte %D, 4
 
     Safe:
@@ -2384,8 +2460,8 @@
 
 Poison values are similar to :ref:`undef values <undefvalues>`, however
 they also represent the fact that an instruction or constant expression
-which cannot evoke side effects has nevertheless detected a condition
-which results in undefined behavior.
+that cannot evoke side effects has nevertheless detected a condition
+that results in undefined behavior.
 
 There is currently no way of representing a poison value in the IR; they
 only exist when produced by operations such as :ref:`add <i_add>` with
@@ -2422,8 +2498,8 @@
    successor.
 -  Dependence is transitive.
 
-Poison Values have the same behavior as :ref:`undef values <undefvalues>`,
-with the additional affect that any instruction which has a *dependence*
+Poison values have the same behavior as :ref:`undef values <undefvalues>`,
+with the additional effect that any instruction that has a *dependence*
 on a poison value has undefined behavior.
 
 Here are some examples:
@@ -2811,6 +2887,67 @@
 4 byte gap between the two fields. This gap represents padding which
 does not carry useful data and need not be preserved.
 
+'``noalias``' and '``alias.scope``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``noalias`` and ``alias.scope`` metadata provide the ability to specify generic
+noalias memory-access sets. This means that some collection of memory access
+instructions (loads, stores, memory-accessing calls, etc.) that carry
+``noalias`` metadata can specifically be specified not to alias with some other
+collection of memory access instructions that carry ``alias.scope`` metadata.
+Each type of metadata specifies a list of scopes where each scope has an id and
+a domain. When evaluating an aliasing query, if for some domain, the set
+of scopes with that domain in one instruction's ``alias.scope`` list is a
+subset of (or equal to) the set of scopes for that domain in another
+instruction's ``noalias`` list, then the two memory accesses are assumed not to
+alias.
+
+The metadata identifying each domain is itself a list containing one or two
+entries. The first entry is the name of the domain. Note that if the name is a
+string then it can be combined across functions and translation units. A
+self-reference can be used to create globally unique domain names. A
+descriptive string may optionally be provided as a second list entry.
+
+The metadata identifying each scope is also itself a list containing two or
+three entries. The first entry is the name of the scope. Note that if the name
+is a string then it can be combined across functions and translation units. A
+self-reference can be used to create globally unique scope names. A metadata
+reference to the scope's domain is the second entry. A descriptive string may
+optionally be provided as a third list entry.
+
+For example,
+
+.. code-block:: llvm
+
+    ; Two scope domains:
+    !0 = metadata !{metadata !0}
+    !1 = metadata !{metadata !1}
+
+    ; Some scopes in these domains:
+    !2 = metadata !{metadata !2, metadata !0}
+    !3 = metadata !{metadata !3, metadata !0}
+    !4 = metadata !{metadata !4, metadata !1}
+
+    ; Some scope lists:
+    !5 = metadata !{metadata !4} ; A list containing only scope !4
+    !6 = metadata !{metadata !4, metadata !3, metadata !2}
+    !7 = metadata !{metadata !3}
+
+    ; These two instructions don't alias:
+    %0 = load float* %c, align 4, !alias.scope !5
+    store float %0, float* %arrayidx.i, align 4, !noalias !5
+
+    ; These two instructions also don't alias (for domain !1, the set of scopes
+    ; in the !alias.scope equals that in the !noalias list):
+    %2 = load float* %c, align 4, !alias.scope !5
+    store float %2, float* %arrayidx.i2, align 4, !noalias !6
+
+    ; These two instructions may alias (for domain !0, the set of scopes in
+    ; the !noalias list is not a superset of, or equal to, the scopes in the
+    ; !alias.scope list):
+    %2 = load float* %c, align 4, !alias.scope !6
+    store float %0, float* %arrayidx.i, align 4, !noalias !7
+
 '``fpmath``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -2889,17 +3026,121 @@
     !0 = metadata !{ metadata !0 }
     !1 = metadata !{ metadata !1 }
 
-The loop identifier metadata can be used to specify additional per-loop
-metadata. Any operands after the first operand can be treated as user-defined
-metadata. For example the ``llvm.loop.vectorize.unroll`` metadata is understood
-by the loop vectorizer to indicate how many times to unroll the loop:
+The loop identifier metadata can be used to specify additional
+per-loop metadata. Any operands after the first operand can be treated
+as user-defined metadata. For example the ``llvm.loop.unroll.count``
+suggests an unroll factor to the loop unroller:
 
 .. code-block:: llvm
 
       br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !0
     ...
     !0 = metadata !{ metadata !0, metadata !1 }
-    !1 = metadata !{ metadata !"llvm.loop.vectorize.unroll", i32 2 }
+    !1 = metadata !{ metadata !"llvm.loop.unroll.count", i32 4 }
+
+'``llvm.loop.vectorize``' and '``llvm.loop.interleave``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Metadata prefixed with ``llvm.loop.vectorize`` or ``llvm.loop.interleave`` are
+used to control per-loop vectorization and interleaving parameters such as
+vectorization width and interleave count.  These metadata should be used in
+conjunction with ``llvm.loop`` loop identification metadata.  The
+``llvm.loop.vectorize`` and ``llvm.loop.interleave`` metadata are only
+optimization hints and the optimizer will only interleave and vectorize loops if
+it believes it is safe to do so.  The ``llvm.mem.parallel_loop_access`` metadata
+which contains information about loop-carried memory dependencies can be helpful
+in determining the safety of these transformations.
+
+'``llvm.loop.interleave.count``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata suggests an interleave count to the loop interleaver.
+The first operand is the string ``llvm.loop.interleave.count`` and the
+second operand is an integer specifying the interleave count. For
+example:
+
+.. code-block:: llvm
+
+   !0 = metadata !{ metadata !"llvm.loop.interleave.count", i32 4 }
+
+Note that setting ``llvm.loop.interleave.count`` to 1 disables interleaving
+multiple iterations of the loop.  If ``llvm.loop.interleave.count`` is set to 0
+then the interleave count will be determined automatically.
+
+'``llvm.loop.vectorize.enable``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata selectively enables or disables vectorization for the loop. The
+first operand is the string ``llvm.loop.vectorize.enable`` and the second operand
+is a bit.  If the bit operand value is 1 vectorization is enabled. A value of
+0 disables vectorization:
+
+.. code-block:: llvm
+
+   !0 = metadata !{ metadata !"llvm.loop.vectorize.enable", i1 0 }
+   !1 = metadata !{ metadata !"llvm.loop.vectorize.enable", i1 1 }
+
+'``llvm.loop.vectorize.width``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata sets the target width of the vectorizer. The first
+operand is the string ``llvm.loop.vectorize.width`` and the second
+operand is an integer specifying the width. For example:
+
+.. code-block:: llvm
+
+   !0 = metadata !{ metadata !"llvm.loop.vectorize.width", i32 4 }
+
+Note that setting ``llvm.loop.vectorize.width`` to 1 disables
+vectorization of the loop.  If ``llvm.loop.vectorize.width`` is set to
+0 or if the loop does not have this metadata the width will be
+determined automatically.
+
+'``llvm.loop.unroll``'
+^^^^^^^^^^^^^^^^^^^^^^
+
+Metadata prefixed with ``llvm.loop.unroll`` are loop unrolling
+optimization hints such as the unroll factor. ``llvm.loop.unroll``
+metadata should be used in conjunction with ``llvm.loop`` loop
+identification metadata. The ``llvm.loop.unroll`` metadata are only
+optimization hints and the unrolling will only be performed if the
+optimizer believes it is safe to do so.
+
+'``llvm.loop.unroll.count``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata suggests an unroll factor to the loop unroller. The
+first operand is the string ``llvm.loop.unroll.count`` and the second
+operand is a positive integer specifying the unroll factor. For
+example:
+
+.. code-block:: llvm
+
+   !0 = metadata !{ metadata !"llvm.loop.unroll.count", i32 4 }
+
+If the trip count of the loop is less than the unroll count the loop
+will be partially unrolled.
+
+'``llvm.loop.unroll.disable``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata disables loop unrolling. The metadata has a single operand
+which is the string ``llvm.loop.unroll.disable``.  For example:
+
+.. code-block:: llvm
+
+   !0 = metadata !{ metadata !"llvm.loop.unroll.disable" }
+
+'``llvm.loop.unroll.full``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata suggests that the loop should be unrolled fully. The
+metadata has a single operand which is the string ``llvm.loop.unroll.full``.
+For example:
+
+.. code-block:: llvm
+
+   !0 = metadata !{ metadata !"llvm.loop.unroll.full" }
 
 '``llvm.mem``'
 ^^^^^^^^^^^^^^^
@@ -2984,55 +3225,6 @@
    !1 = metadata !{ metadata !1 } ; an identifier for the inner loop
    !2 = metadata !{ metadata !2 } ; an identifier for the outer loop
 
-'``llvm.loop.vectorize``'
-^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Metadata prefixed with ``llvm.loop.vectorize`` is used to control per-loop
-vectorization parameters such as vectorization factor and unroll factor.
-
-``llvm.loop.vectorize`` metadata should be used in conjunction with
-``llvm.loop`` loop identification metadata.
-
-'``llvm.loop.vectorize.unroll``' Metadata
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This metadata instructs the loop vectorizer to unroll the specified
-loop exactly ``N`` times.
-
-The first operand is the string ``llvm.loop.vectorize.unroll`` and the second
-operand is an integer specifying the unroll factor. For example:
-
-.. code-block:: llvm
-
-   !0 = metadata !{ metadata !"llvm.loop.vectorize.unroll", i32 4 }
-
-Note that setting ``llvm.loop.vectorize.unroll`` to 1 disables
-unrolling of the loop.
-
-If ``llvm.loop.vectorize.unroll`` is set to 0 then the amount of
-unrolling will be determined automatically.
-
-'``llvm.loop.vectorize.width``' Metadata
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This metadata sets the target width of the vectorizer to ``N``. Without
-this metadata, the vectorizer will choose a width automatically.
-Regardless of this metadata, the vectorizer will only vectorize loops if
-it believes it is valid to do so.
-
-The first operand is the string ``llvm.loop.vectorize.width`` and the
-second operand is an integer specifying the width. For example:
-
-.. code-block:: llvm
-
-   !0 = metadata !{ metadata !"llvm.loop.vectorize.width", i32 4 }
-
-Note that setting ``llvm.loop.vectorize.width`` to 1 disables
-vectorization of the loop.
-
-If ``llvm.loop.vectorize.width`` is set to 0 then the width will be
-determined automatically.
-
 Module Flags Metadata
 =====================
 
@@ -4885,7 +5077,7 @@
 
       %agg1 = insertvalue {i32, float} undef, i32 1, 0              ; yields {i32 1, float undef}
       %agg2 = insertvalue {i32, float} %agg1, float %val, 1         ; yields {i32 1, float %val}
-      %agg3 = insertvalue {i32, {float}} %agg1, float %val, 1, 0    ; yields {i32 1, float %val}
+      %agg3 = insertvalue {i32, {float}} undef, float %val, 1, 0    ; yields {i32 undef, {float %val}}
 
 .. _memoryops:
 
@@ -4925,9 +5117,10 @@
 appropriate type to the program. If "NumElements" is specified, it is
 the number of elements allocated, otherwise "NumElements" is defaulted
 to be one. If a constant alignment is specified, the value result of the
-allocation is guaranteed to be aligned to at least that boundary. If not
-specified, or if zero, the target can choose to align the allocation on
-any convenient boundary compatible with the type.
+allocation is guaranteed to be aligned to at least that boundary. The
+alignment may not be greater than ``1 << 29``. If not specified, or if
+zero, the target can choose to align the allocation on any convenient
+boundary compatible with the type.
 
 '``type``' may be any sized type.
 
@@ -4964,7 +5157,7 @@
 
 ::
 
-      <result> = load [volatile] <ty>* <pointer>[, align <alignment>][, !nontemporal !<index>][, !invariant.load !<index>]
+      <result> = load [volatile] <ty>* <pointer>[, align <alignment>][, !nontemporal !<index>][, !invariant.load !<index>][, !nonnull !<index>]
       <result> = load atomic [volatile] <ty>* <pointer> [singlethread] <ordering>, align <alignment>
       !<index> = !{ i32 1 }
 
@@ -5001,7 +5194,8 @@
 alignment for the target. It is the responsibility of the code emitter
 to ensure that the alignment information is correct. Overestimating the
 alignment results in undefined behavior. Underestimating the alignment
-may produce less efficient code. An alignment of 1 is always safe.
+may produce less efficient code. An alignment of 1 is always safe. The
+maximum possible alignment is ``1 << 29``.
 
 The optional ``!nontemporal`` metadata must reference a single
 metadata name ``<index>`` corresponding to a metadata node with one
@@ -5019,6 +5213,14 @@
 execution. The optimizer may then move this load around, for example, by
 hoisting it out of loops using loop invariant code motion.
 
+The optional ``!nonnull`` metadata must reference a single
+metadata name ``<index>`` corresponding to a metadata node with no
+entries. The existence of the ``!nonnull`` metadata on the
+instruction tells the optimizer that the value loaded is known to
+never be null.  This is analogous to the ``nonnull`` attribute
+on parameters and return values.  This metadata can only be applied
+to loads of a pointer type.
+
 Semantics:
 """"""""""
 
@@ -5087,7 +5289,7 @@
 to ensure that the alignment information is correct. Overestimating the
 alignment results in undefined behavior. Underestimating the
 alignment may produce less efficient code. An alignment of 1 is always
-safe.
+safe. The maximum possible alignment is ``1 << 29``.
 
 The optional ``!nontemporal`` metadata must reference a single metadata
 name ``<index>`` corresponding to a metadata node with one ``i32`` entry of
@@ -6399,6 +6601,9 @@
    - The calling conventions of the caller and callee must match.
    - All ABI-impacting function attributes, such as sret, byval, inreg,
      returned, and inalloca, must match.
+   - The callee must be varargs iff the caller is varargs. Bitcasting a
+     non-varargs function to the appropriate varargs type is legal so
+     long as the non-varargs prefixes obey the other rules.
 
    Tail call optimization for calls marked ``tail`` is guaranteed to occur if
    the following conditions are met:
@@ -6679,14 +6884,21 @@
 
 .. code-block:: llvm
 
+    ; This struct is different for every platform. For most platforms,
+    ; it is merely an i8*.
+    %struct.va_list = type { i8* }
+
+    ; For Unix x86_64 platforms, va_list is the following struct:
+    ; %struct.va_list = type { i32, i32, i8*, i8* }
+
     define i32 @test(i32 %X, ...) {
       ; Initialize variable argument processing
-      %ap = alloca i8*
-      %ap2 = bitcast i8** %ap to i8*
+      %ap = alloca %struct.va_list
+      %ap2 = bitcast %struct.va_list* %ap to i8*
       call void @llvm.va_start(i8* %ap2)
 
       ; Read a single integer argument
-      %tmp = va_arg i8** %ap, i32
+      %tmp = va_arg i8* %ap2, i32
 
       ; Demonstrate usage of llvm.va_copy and llvm.va_end
       %aq = alloca i8*
@@ -7823,9 +8035,9 @@
 
       declare float     @llvm.fabs.f32(float  %Val)
       declare double    @llvm.fabs.f64(double %Val)
-      declare x86_fp80  @llvm.fabs.f80(x86_fp80  %Val)
+      declare x86_fp80  @llvm.fabs.f80(x86_fp80 %Val)
       declare fp128     @llvm.fabs.f128(fp128 %Val)
-      declare ppc_fp128 @llvm.fabs.ppcf128(ppc_fp128  %Val)
+      declare ppc_fp128 @llvm.fabs.ppcf128(ppc_fp128 %Val)
 
 Overview:
 """""""""
@@ -7845,6 +8057,89 @@
 This function returns the same values as the libm ``fabs`` functions
 would, and handles error conditions in the same way.
 
+'``llvm.minnum.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.minnum`` on any
+floating point or vector of floating point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.minnum.f32(float %Val0, float %Val1)
+      declare double    @llvm.minnum.f64(double %Val0, double %Val1)
+      declare x86_fp80  @llvm.minnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+      declare fp128     @llvm.minnum.f128(fp128 %Val0, fp128 %Val1)
+      declare ppc_fp128 @llvm.minnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1)
+
+Overview:
+"""""""""
+
+The '``llvm.minnum.*``' intrinsics return the minimum of the two
+arguments.
+
+
+Arguments:
+""""""""""
+
+The arguments and return value are floating point numbers of the same
+type.
+
+Semantics:
+""""""""""
+
+Follows the IEEE-754 semantics for minNum, which also match those of libm's
+fmin.
+
+If either operand is a NaN, returns the other non-NaN operand. Returns
+NaN only if both operands are NaN. If the operands compare equal,
+returns a value that compares equal to both operands. This means that
+fmin(+/-0.0, +/-0.0) could return either -0.0 or 0.0.
+
+'``llvm.maxnum.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.maxnum`` on any
+floating point or vector of floating point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.maxnum.f32(float %Val0, float %Val1)
+      declare double    @llvm.maxnum.f64(double %Val0, double %Val1)
+      declare x86_fp80  @llvm.maxnum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+      declare fp128     @llvm.maxnum.f128(fp128 %Val0, fp128 %Val1)
+      declare ppc_fp128 @llvm.maxnum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1)
+
+Overview:
+"""""""""
+
+The '``llvm.maxnum.*``' intrinsics return the maximum of the two
+arguments.
+
+
+Arguments:
+""""""""""
+
+The arguments and return value are floating point numbers of the same
+type.
+
+Semantics:
+""""""""""
+Follows the IEEE-754 semantics for maxNum, which also match those of libm's
+fmax.
+
+If either operand is a NaN, returns the other non-NaN operand. Returns
+NaN only if both operands are NaN. If the operands compare equal,
+returns a value that compares equal to both operands. This means that
+fmax(+/-0.0, +/-0.0) could return either -0.0 or 0.0.
+
 '``llvm.copysign.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -8651,14 +8946,14 @@
 
 ::
 
-      declare i16 @llvm.convert.to.fp16(float %a)
+      declare i16 @llvm.convert.to.fp16.f32(float %a)
+      declare i16 @llvm.convert.to.fp16.f64(double %a)
 
 Overview:
 """""""""
 
-The '``llvm.convert.to.fp16``' intrinsic function performs a conversion
-from single precision floating point format to half precision floating
-point format.
+The '``llvm.convert.to.fp16``' intrinsic function performs a conversion from a
+conventional floating point type to half precision floating point format.
 
 Arguments:
 """"""""""
@@ -8669,17 +8964,16 @@
 Semantics:
 """"""""""
 
-The '``llvm.convert.to.fp16``' intrinsic function performs a conversion
-from single precision floating point format to half precision floating
-point format. The return value is an ``i16`` which contains the
-converted number.
+The '``llvm.convert.to.fp16``' intrinsic function performs a conversion from a
+conventional floating point format to half precision floating point format. The
+return value is an ``i16`` which contains the converted number.
 
 Examples:
 """""""""
 
 .. code-block:: llvm
 
-      %res = call i16 @llvm.convert.to.fp16(float %a)
+      %res = call i16 @llvm.convert.to.fp16.f32(float %a)
       store i16 %res, i16* @x, align 2
 
 .. _int_convert_from_fp16:
@@ -8692,7 +8986,8 @@
 
 ::
 
-      declare float @llvm.convert.from.fp16(i16 %a)
+      declare float @llvm.convert.from.fp16.f32(i16 %a)
+      declare double @llvm.convert.from.fp16.f64(i16 %a)
 
 Overview:
 """""""""
@@ -9291,6 +9586,46 @@
 
 This intrinsic is lowered to the ``val``.
 
+'``llvm.assume``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare void @llvm.assume(i1 %cond)
+
+Overview:
+"""""""""
+
+The ``llvm.assume`` intrinsic allows the optimizer to assume that the provided
+condition is true. This information can then be used in simplifying other parts
+of the code.
+
+Arguments:
+""""""""""
+
+The condition which the optimizer may assume is always true.
+
+Semantics:
+""""""""""
+
+The intrinsic allows the optimizer to assume that the provided condition is
+always true whenever the control flow reaches the intrinsic call. No code is
+generated for this intrinsic, and instructions that contribute only to the
+provided condition are not used for code generation. If the condition is
+violated during execution, the behavior is undefined.
+
+Please note that the optimizer might limit the transformations performed on
+values used by the ``llvm.assume`` intrinsic in order to preserve the
+instructions only used to form the intrinsic's input argument. This might prove
+undesirable if the extra information provided by the ``llvm.assume`` intrinsic
+does not cause sufficient overall improvement in code quality. For this reason,
+``llvm.assume`` should not be used to document basic mathematical invariants
+that the optimizer can otherwise deduce or facts that are of little use to the
+optimizer.
+
 '``llvm.donothing``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -9304,8 +9639,9 @@
 Overview:
 """""""""
 
-The ``llvm.donothing`` intrinsic doesn't perform any operation. It's the
-only intrinsic that can be called with an invoke instruction.
+The ``llvm.donothing`` intrinsic doesn't perform any operation. It's one of only
+two intrinsics (besides ``llvm.experimental.patchpoint``) that can be called
+with an invoke instruction.
 
 Arguments:
 """"""""""

diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index fccfd5f..9a599da 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst

@@ -133,6 +133,15 @@
 **MC**
     Machine Code
 
+N
+-
+
+**NFC**
+  "No functional change". Used in a commit message to indicate that a patch
+  is a pure refactoring/cleanup.
+  Usually used in the first line, so it is visible without opening the
+  actual commit email.
+
 O
 -
 .. _object pointer:
@@ -159,7 +168,7 @@
     ``Constant::replaceUsesOfWithOnConstant()`` implement the replacement of one
     Value with another by iterating over its def/use chain and fixing up all of
     the pointers to point to the new value.  See
-    also `def/use chains <ProgrammersManual.html#iterate_chains>`_.
+    also `def/use chains <ProgrammersManual.html#iterating-over-def-use-use-def-chains>`_.
 
 **Reassociation**
     Rearranging associative expressions to promote better redundancy elimination

diff --git a/docs/MCJITDesignAndImplementation.rst b/docs/MCJITDesignAndImplementation.rst
index 2cb6296..237a5be 100644
--- a/docs/MCJITDesignAndImplementation.rst
+++ b/docs/MCJITDesignAndImplementation.rst

@@ -57,7 +57,7 @@
 has been set.  If a cached object image cannot be retrieved, MCJIT will

 call its emitObject method.  MCJIT::emitObject uses a local PassManager

 instance and creates a new ObjectBufferStream instance, both of which it

-passes to TargetManager::addPassesToEmitMC before calling PassManager::run

+passes to TargetMachine::addPassesToEmitMC before calling PassManager::run

 on the Module with which it was created.

 

 .. image:: MCJIT-load.png


diff --git a/docs/Makefile b/docs/Makefile
index d973af5..690f772 100644
--- a/docs/Makefile
+++ b/docs/Makefile

@@ -1,10 +1,10 @@
 ##===- docs/Makefile ---------------------------------------*- Makefile -*-===##
-# 
+#
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
-# 
+#
 ##===----------------------------------------------------------------------===##
 
 LEVEL      := ..
@@ -121,7 +121,8 @@
 	$(Verb) $(MKDIR) $(PROJ_OBJ_DIR)/ocamldoc/html
 	$(Verb) \
 		$(OCAMLDOC) -d $(PROJ_OBJ_DIR)/ocamldoc/html -sort -colorize-code -html \
-		`$(FIND) $(LEVEL)/bindings/ocaml -name "*.odoc" -exec echo -load '{}' ';'`
+		`$(FIND) $(LEVEL)/bindings/ocaml -name "*.odoc" \
+		         -path "*/$(BuildMode)/*.odoc" -exec echo -load '{}' ';'`
 
 uninstall-local::
 	$(Echo) Uninstalling Documentation

diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst
index a7b28b3..85a4ad8 100644
--- a/docs/ProgrammersManual.rst
+++ b/docs/ProgrammersManual.rst

@@ -298,7 +298,9 @@
 (`doxygen <http://llvm.org/doxygen/classllvm_1_1function_ref.html>`__) class
 template represents a reference to a callable object, templated over the type
 of the callable. This is a good choice for passing a callback to a function,
-if you don't need to hold onto the callback after the function returns.
+if you don't need to hold onto the callback after the function returns. In this
+way, ``function_ref`` is to ``std::function`` as ``StringRef`` is to
+``std::string``.
 
 ``function_ref<Ret(Param1, Param2, ...)>`` can be implicitly constructed from
 any callable object that can be called with arguments of type ``Param1``,
@@ -323,17 +325,11 @@
       return false;
     });
 
-Note that a ``function_ref`` object contains pointers to external memory, so
-it is not generally safe to store an instance of the class (unless you know
-that the external storage will not be freed).
-``function_ref`` is small enough that it should always be passed by value.
-
-``std::function``
-^^^^^^^^^^^^^^^^^
-
-You cannot use ``std::function`` within LLVM code, because it is not supported
-by all our target toolchains.
-
+Note that a ``function_ref`` object contains pointers to external memory, so it
+is not generally safe to store an instance of the class (unless you know that
+the external storage will not be freed). If you need this ability, consider
+using ``std::function``. ``function_ref`` is small enough that it should always
+be passed by value.
 
 .. _DEBUG:
 
@@ -426,9 +422,12 @@
 because there is no system in place to ensure that names do not conflict.  If
 two different modules use the same string, they will all be turned on when the
 name is specified.  This allows, for example, all debug information for
-instruction scheduling to be enabled with ``-debug-type=InstrSched``, even if
+instruction scheduling to be enabled with ``-debug-only=InstrSched``, even if
 the source lives in multiple files.
 
+For performance reasons, ``-debug-only`` is not available in optimized builds
+(``--enable-optimized``) of LLVM.
+
 The ``DEBUG_WITH_TYPE`` macro is also available for situations where you would
 like to set ``DEBUG_TYPE``, but only for one specific ``DEBUG`` statement.  It
 takes an additional first parameter, which is the type to use.  For example, the
@@ -877,7 +876,7 @@
 llvm/ADT/ilist_node.h
 ^^^^^^^^^^^^^^^^^^^^^
 
-``ilist_node<T>`` implements a the forward and backward links that are expected
+``ilist_node<T>`` implements the forward and backward links that are expected
 by the ``ilist<T>`` (and analogous containers) in the default manner.
 
 ``ilist_node<T>``\ s are meant to be embedded in the node type ``T``, usually
@@ -1441,8 +1440,10 @@
 iteration over maps of pointers.
 
 It is implemented by mapping from key to an index in a vector of key,value
-pairs.  This provides fast lookup and iteration, but has two main drawbacks: The
-key is stored twice and it doesn't support removing elements.
+pairs.  This provides fast lookup and iteration, but has two main drawbacks:
+the key is stored twice and removing elements takes linear time.  If it is
+necessary to remove elements, it's best to remove them in bulk using
+``remove_if()``.
 
 .. _dss_inteqclasses:
 

diff --git a/docs/R600Usage.rst b/docs/R600Usage.rst
new file mode 100644
index 0000000..48a30c8
--- /dev/null
+++ b/docs/R600Usage.rst

@@ -0,0 +1,43 @@
+============================
+User Guide for R600 Back-end
+============================
+
+Introduction
+============
+
+The R600 back-end provides ISA code generation for AMD GPUs, starting with
+the R600 family up until the current Sea Islands (GCN Gen 2).
+
+
+Assembler
+=========
+
+The assembler is currently a work in progress and not yet complete.  Below
+are the currently supported features.
+
+SOPP Instructions
+-----------------
+
+Unless otherwise mentioned, all SOPP instructions that have one or more
+operands accept integer operands only.  No verification is performed on the
+operands, so it is up to the programmer to be familiar with the range
+of acceptable values.
+
+s_waitcnt
+^^^^^^^^^
+
+s_waitcnt accepts named arguments to specify which memory counter(s) to
+wait for.
+
+.. code-block:: nasm
+
+   // Wait for all counters to be 0
+   s_waitcnt 0
+
+   // Equivalent to s_waitcnt 0.  Counter names can also be delimited by
+   // '&' or ','.
+   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+
+   // Wait for vmcnt counter to be 1.
+   s_waitcnt vmcnt(1)
+

diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index fb2e248..be2954c 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst

@@ -6,8 +6,8 @@
     :local:
 
 .. warning::
-   These are in-progress notes for the upcoming LLVM 3.5 release.  You may
-   prefer the `LLVM 3.4 Release Notes <http://llvm.org/releases/3.4/docs
+   These are in-progress notes for the upcoming LLVM 3.6 release.  You may
+   prefer the `LLVM 3.5 Release Notes <http://llvm.org/releases/3.5.0/docs
    /ReleaseNotes.html>`_.
 
 
@@ -15,7 +15,7 @@
 ============
 
 This document contains the release notes for the LLVM Compiler Infrastructure,
-release 3.5.  Here we describe the status of LLVM, including major improvements
+release 3.6.  Here we describe the status of LLVM, including major improvements
 from the previous release, improvements in various subprojects of LLVM, and
 some of the current users of the code.  All LLVM releases may be downloaded
 from the `LLVM releases web site <http://llvm.org/releases/>`_.
@@ -34,35 +34,6 @@
 Non-comprehensive list of changes in this release
 =================================================
 
-* All backends have been changed to use the MC asm printer and support for the
-  non MC one has been removed.
-
-* Clang can now successfully self-host itself on Linux/Sparc64 and on
-  FreeBSD/Sparc64.
-
-* LLVM now assumes the assembler supports ``.loc`` for generating debug line
-  numbers. The old support for printing the debug line info directly was only
-  used by ``llc`` and has been removed.
-
-* All inline assembly is parsed by the integrated assembler when it is enabled.
-  Previously this was only the case for object-file output. It is now the case
-  for assembly output as well. The integrated assembler can be disabled with
-  the ``-no-integrated-as`` option,
-
-* llvm-ar now handles IR files like regular object files. In particular, a
-  regular symbol table is created for symbols defined in IR files, including
-  those in file scope inline assembly.
-
-* LLVM now always uses cfi directives for producing most stack
-  unwinding information.
-
-* The prefix for loop vectorizer hint metadata has been changed from
-  ``llvm.vectorizer`` to ``llvm.loop.vectorize``.
-
-* Some backends previously implemented Atomic NAND(x,y) as ``x & ~y``. Now 
-  all backends implement it as ``~(x & y)``, matching the semantics of GCC 4.4
-  and later.
-
 .. NOTE
    For small 1-3 sentence descriptions, just add an entry at the end of
    this list. If your description won't fit comfortably in one bullet
@@ -70,6 +41,11 @@
    functionality, or simply have a lot to talk about), see the `NOTE` below
    for adding a new subsection.
 
+* Support for AuroraUX has been removed.
+
+* Added support for a `native object file-based bitcode wrapper format
+  <BitCodeFormat.html#native-object-file>`_.
+
 * ... next change ...
 
 .. NOTE
@@ -85,40 +61,27 @@
 Changes to the ARM Backend
 --------------------------
 
-Since release 3.3, a lot of new features have been included in the ARM
-back-end but weren't production ready (ie. well tested) on release 3.4.
-Just after the 3.4 release, we started heavily testing two major parts
-of the back-end: the integrated assembler (IAS) and the ARM exception
-handling (EHABI), and now they are enabled by default on LLVM/Clang.
+During this release ...
 
-The IAS received a lot of GNU extensions and directives, as well as some
-specific pre-UAL instructions. Not all remaining directives will be
-implemented, as we made judgement calls on the need versus the complexity,
-and have chosen simplicity and future compatibility where hard decisions
-had to be made. The major difference is, as stated above, the IAS validates
-all inline ASM, not just for object emission, and that cause trouble with
-some uses of inline ASM as pre-processor magic.
 
-So, while the IAS is good enough to compile large projects (including most
-of the Linux kernel), there are a few things that we can't (and probably
-won't) do. For those cases, please use ``-fno-integrated-as`` in Clang.
+Changes to the MIPS Target
+--------------------------
 
-Exception handling is another big change. After extensive testing and
-changes to cooperate with Dwarf unwinding, EHABI is enabled by default.
-The options ``-arm-enable-ehabi`` and ``-arm-enable-ehabi-descriptors``,
-which were used to enable EHABI in the previous releases, are removed now.
+During this release ...
 
-This means all ARM code will emit EH unwind tables, or CFI unwinding (for
-debug/profiling), or both. To avoid run-time inconsistencies, C code will
-also emit EH tables (in case they interoperate with C++ code), as is the
-case for other architectures (ex. x86_64).
+Changes to the PowerPC Target
+-----------------------------
 
-External Open Source Projects Using LLVM 3.5
+During this release ...
+
+External Open Source Projects Using LLVM 3.6
 ============================================
 
 An exciting aspect of LLVM is that it is used as an enabling technology for
 a lot of other language and tools projects. This section lists some of the
-projects that have already been updated to work with LLVM 3.5.
+projects that have already been updated to work with LLVM 3.6.
+
+* A project
 
 
 Additional Information

diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst
index 869d3a3..3a5fa6e 100644
--- a/docs/SourceLevelDebugging.rst
+++ b/docs/SourceLevelDebugging.rst

@@ -186,11 +186,15 @@
     ...
   }
 
-<a name="LLVMDebugVersion">The first field of a descriptor is always an
-``i32`` containing a tag value identifying the content of the descriptor.
-The remaining fields are specific to the descriptor.  The values of tags are
-loosely bound to the tag values of DWARF information entries.  However, that
-does not restrict the use of the information supplied to DWARF targets.
+Most of the string and integer fields in descriptors are packed into a single,
+null-separated ``mdstring``.  The first field of the header is always an
+``i32`` containing the DWARF tag value identifying the content of the
+descriptor.
+
+For clarity of definition in this document, these header fields are described
+below split inside an imaginary ``DIHeader`` construct.  This is invalid
+assembly syntax.  In valid IR, these fields are stringified and concatenated,
+separated by ``\00``.
 
 The details of the various descriptors follow.
 
@@ -200,19 +204,22 @@
 .. code-block:: llvm
 
   !0 = metadata !{
-    i32,       ;; Tag = 17 (DW_TAG_compile_unit)
+    DIHeader(
+      i32,       ;; Tag = 17 (DW_TAG_compile_unit)
+      i32,       ;; DWARF language identifier (ex. DW_LANG_C89)
+      mdstring,  ;; Producer (ex. "4.0.1 LLVM (LLVM research group)")
+      i1,        ;; True if this is optimized.
+      mdstring,  ;; Flags
+      i32,       ;; Runtime version
+      mdstring,  ;; Split debug filename
+      i32        ;; Debug info emission kind (1 = Full Debug Info, 2 = Line Tables Only)
+    ),
     metadata,  ;; Source directory (including trailing slash) & file pair
-    i32,       ;; DWARF language identifier (ex. DW_LANG_C89)
-    metadata   ;; Producer (ex. "4.0.1 LLVM (LLVM research group)")
-    i1,        ;; True if this is optimized.
-    metadata,  ;; Flags
-    i32        ;; Runtime version
-    metadata   ;; List of enums types
-    metadata   ;; List of retained types
-    metadata   ;; List of subprograms
-    metadata   ;; List of global variables
+    metadata,  ;; List of enum types
+    metadata,  ;; List of retained types
+    metadata,  ;; List of subprograms
+    metadata,  ;; List of global variables
     metadata   ;; List of imported entities
-    metadata   ;; Split debug filename
   }
 
 These descriptors contain a source language ID for the file (we use the DWARF
@@ -235,8 +242,10 @@
 .. code-block:: llvm
 
   !0 = metadata !{
-    i32,      ;; Tag = 41 (DW_TAG_file_type)
-    metadata, ;; Source directory (including trailing slash) & file pair
+    DIHeader(
+      i32       ;; Tag = 41 (DW_TAG_file_type)
+    ),
+    metadata  ;; Source directory (including trailing slash) & file pair
   }
 
 These descriptors contain information for a file.  Global variables and top
@@ -254,17 +263,18 @@
 .. code-block:: llvm
 
   !1 = metadata !{
-    i32,      ;; Tag = 52 (DW_TAG_variable)
-    i32,      ;; Unused field.
+    DIHeader(
+      i32,      ;; Tag = 52 (DW_TAG_variable)
+      mdstring, ;; Name
+      mdstring, ;; Display name (fully qualified C++ name)
+      mdstring, ;; MIPS linkage name (for C++)
+      i32,      ;; Line number where defined
+      i1,       ;; True if the global is local to compile unit (static)
+      i1        ;; True if the global is defined in the compile unit (not extern)
+    ),
     metadata, ;; Reference to context descriptor
-    metadata, ;; Name
-    metadata, ;; Display name (fully qualified C++ name)
-    metadata, ;; MIPS linkage name (for C++)
     metadata, ;; Reference to file where defined
-    i32,      ;; Line number where defined
     metadata, ;; Reference to type descriptor
-    i1,       ;; True if the global is local to compile unit (static)
-    i1,       ;; True if the global is defined in the compile unit (not extern)
     {}*,      ;; Reference to the global variable
     metadata, ;; The static member declaration, if any
   }
@@ -281,27 +291,29 @@
 .. code-block:: llvm
 
   !2 = metadata !{
-    i32,      ;; Tag = 46 (DW_TAG_subprogram)
+    DIHeader(
+      i32,      ;; Tag = 46 (DW_TAG_subprogram)
+      mdstring, ;; Name
+      mdstring, ;; Display name (fully qualified C++ name)
+      mdstring, ;; MIPS linkage name (for C++)
+      i32,      ;; Line number where defined
+      i1,       ;; True if the global is local to compile unit (static)
+      i1,       ;; True if the global is defined in the compile unit (not extern)
+      i32,      ;; Virtuality, e.g. dwarf::DW_VIRTUALITY__virtual
+      i32,      ;; Index into a virtual function
+      i32,      ;; Flags - Artificial, Private, Protected, Explicit, Prototyped.
+      i1,       ;; isOptimized
+      i32       ;; Line number where the scope of the subprogram begins
+    ),
     metadata, ;; Source directory (including trailing slash) & file pair
     metadata, ;; Reference to context descriptor
-    metadata, ;; Name
-    metadata, ;; Display name (fully qualified C++ name)
-    metadata, ;; MIPS linkage name (for C++)
-    i32,      ;; Line number where defined
     metadata, ;; Reference to type descriptor
-    i1,       ;; True if the global is local to compile unit (static)
-    i1,       ;; True if the global is defined in the compile unit (not extern)
-    i32,      ;; Virtuality, e.g. dwarf::DW_VIRTUALITY__virtual
-    i32,      ;; Index into a virtual function
     metadata, ;; indicates which base type contains the vtable pointer for the
               ;; derived class
-    i32,      ;; Flags - Artificial, Private, Protected, Explicit, Prototyped.
-    i1,       ;; isOptimized
     {}*,      ;; Reference to the LLVM function
     metadata, ;; Lists function template parameters
     metadata, ;; Function declaration descriptor
-    metadata, ;; List of function variables
-    i32       ;; Line number where the scope of the subprogram begins
+    metadata  ;; List of function variables
   }
 
 These descriptors provide debug information about functions, methods and
@@ -314,13 +326,14 @@
 .. code-block:: llvm
 
   !3 = metadata !{
-    i32,      ;; Tag = 11 (DW_TAG_lexical_block)
+    DIHeader(
+      i32,      ;; Tag = 11 (DW_TAG_lexical_block)
+      i32,      ;; Line number
+      i32,      ;; Column number
+      i32       ;; Unique ID to identify blocks from a template function
+    ),
     metadata, ;; Source directory (including trailing slash) & file pair
-    metadata, ;; Reference to context descriptor
-    i32,      ;; Line number
-    i32,      ;; Column number
-    i32,      ;; DWARF path discriminator value
-    i32       ;; Unique ID to identify blocks from a template function
+    metadata  ;; Reference to context descriptor
   }
 
 This descriptor provides debug information about nested blocks within a
@@ -330,7 +343,10 @@
 .. code-block:: llvm
 
   !3 = metadata !{
-    i32,      ;; Tag = 11 (DW_TAG_lexical_block)
+    DIHeader(
+      i32,      ;; Tag = 11 (DW_TAG_lexical_block)
+      i32       ;; DWARF path discriminator value
+    ),
     metadata, ;; Source directory (including trailing slash) & file pair
     metadata  ;; Reference to the scope we're annotating with a file change
   }
@@ -346,16 +362,18 @@
 .. code-block:: llvm
 
   !4 = metadata !{
-    i32,      ;; Tag = 36 (DW_TAG_base_type)
+    DIHeader(
+      i32,      ;; Tag = 36 (DW_TAG_base_type)
+      mdstring, ;; Name (may be "" for anonymous types)
+      i32,      ;; Line number where defined (may be 0)
+      i64,      ;; Size in bits
+      i64,      ;; Alignment in bits
+      i64,      ;; Offset in bits
+      i32,      ;; Flags
+      i32       ;; DWARF type encoding
+    ),
     metadata, ;; Source directory (including trailing slash) & file pair (may be null)
-    metadata, ;; Reference to context
-    metadata, ;; Name (may be "" for anonymous types)
-    i32,      ;; Line number where defined (may be 0)
-    i64,      ;; Size in bits
-    i64,      ;; Alignment in bits
-    i64,      ;; Offset in bits
-    i32,      ;; Flags
-    i32       ;; DWARF type encoding
+    metadata  ;; Reference to context
   }
 
 These descriptors define primitive types used in the code.  Example ``int``,
@@ -389,22 +407,19 @@
 .. code-block:: llvm
 
   !5 = metadata !{
-    i32,      ;; Tag (see below)
+    DIHeader(
+      i32,      ;; Tag (see below)
+      mdstring, ;; Name (may be "" for anonymous types)
+      i32,      ;; Line number where defined (may be 0)
+      i64,      ;; Size in bits
+      i64,      ;; Alignment in bits
+      i64,      ;; Offset in bits
+      i32       ;; Flags to encode attributes, e.g. private
+    ),
     metadata, ;; Source directory (including trailing slash) & file pair (may be null)
     metadata, ;; Reference to context
-    metadata, ;; Name (may be "" for anonymous types)
-    i32,      ;; Line number where defined (may be 0)
-    i64,      ;; Size in bits
-    i64,      ;; Alignment in bits
-    i64,      ;; Offset in bits
-    i32,      ;; Flags to encode attributes, e.g. private
     metadata, ;; Reference to type derived from
-    metadata, ;; (optional) Name of the Objective C property associated with
-              ;; Objective-C an ivar, or the type of which this
-              ;; pointer-to-member is pointing to members of.
-    metadata, ;; (optional) Name of the Objective C property getter selector.
-    metadata, ;; (optional) Name of the Objective C property setter selector.
-    i32       ;; (optional) Objective C property attributes.
+    metadata  ;; (optional) Objective C property node
   }
 
 These descriptors are used to define types derived from other types.  The value
@@ -452,21 +467,23 @@
 .. code-block:: llvm
 
   !6 = metadata !{
-    i32,      ;; Tag (see below)
+    DIHeader(
+      i32,      ;; Tag (see below)
+      mdstring, ;; Name (may be "" for anonymous types)
+      i32,      ;; Line number where defined (may be 0)
+      i64,      ;; Size in bits
+      i64,      ;; Alignment in bits
+      i64,      ;; Offset in bits
+      i32,      ;; Flags
+      i32       ;; Runtime languages
+    ),
     metadata, ;; Source directory (including trailing slash) & file pair (may be null)
     metadata, ;; Reference to context
-    metadata, ;; Name (may be "" for anonymous types)
-    i32,      ;; Line number where defined (may be 0)
-    i64,      ;; Size in bits
-    i64,      ;; Alignment in bits
-    i64,      ;; Offset in bits
-    i32,      ;; Flags
     metadata, ;; Reference to type derived from
     metadata, ;; Reference to array of member descriptors
-    i32,      ;; Runtime languages
     metadata, ;; Base type containing the vtable pointer for this type
     metadata, ;; Template parameters
-    metadata  ;; A unique identifier for type uniquing purpose (may be null)
+    mdstring  ;; A unique identifier for type uniquing purpose (may be null)
   }
 
 These descriptors are used to define types that are composed of 0 or more
@@ -528,9 +545,11 @@
 .. code-block:: llvm
 
   !42 = metadata !{
-    i32,      ;; Tag = 33 (DW_TAG_subrange_type)
-    i64,      ;; Low value
-    i64       ;; High value
+    DIHeader(
+      i32,      ;; Tag = 33 (DW_TAG_subrange_type)
+      i64,      ;; Low value
+      i64       ;; High value
+    )
   }
 
 These descriptors are used to define ranges of array subscripts for an array
@@ -547,9 +566,11 @@
 .. code-block:: llvm
 
   !6 = metadata !{
-    i32,      ;; Tag = 40 (DW_TAG_enumerator)
-    metadata, ;; Name
-    i64       ;; Value
+    DIHeader(
+      i32,      ;; Tag = 40 (DW_TAG_enumerator)
+      mdstring, ;; Name
+      i64       ;; Value
+    )
   }
 
 These descriptors are used to define members of an enumeration :ref:`composite
@@ -561,16 +582,17 @@
 .. code-block:: llvm
 
   !7 = metadata !{
-    i32,      ;; Tag (see below)
+    DIHeader(
+      i32,      ;; Tag (see below)
+      mdstring, ;; Name
+      i32,      ;; 24 bit - Line number where defined
+                ;; 8 bit - Argument number. 1 indicates 1st argument.
+      i32       ;; flags
+    ),
     metadata, ;; Context
-    metadata, ;; Name
     metadata, ;; Reference to file where defined
-    i32,      ;; 24 bit - Line number where defined
-              ;; 8 bit - Argument number. 1 indicates 1st argument.
     metadata, ;; Reference to the type descriptor
-    i32,      ;; flags
     metadata  ;; (optional) Reference to inline location
-    metadata  ;; (optional) Reference to a complex expression (see below)
   }
 
 These descriptors are used to define variables local to a sub program.  The
@@ -589,6 +611,25 @@
 Name the source variable name.  Context and line indicate where the variable
 was defined.  Type descriptor defines the declared type of the variable.
 
+Complex Expressions
+^^^^^^^^^^^^^^^^^^^
+.. code-block:: llvm
+
+  !8 = metadata !{
+    i32,      ;; DW_TAG_expression
+    ...
+  }
+
+Complex expressions describe variable storage locations in terms of
+prefix-notated DWARF expressions. Currently the only supported
+operators are ``DW_OP_plus``, ``DW_OP_deref``, and ``DW_OP_piece``.
+
+The ``DW_OP_piece`` operator is used for (typically larger aggregate)
+variables that are fragmented across several locations. It takes two
+i32 arguments, an offset and a size in bytes to describe which piece
+of the variable is at this location.
+
+
 .. _format_common_intrinsics:
 
 Debugger intrinsic functions
@@ -726,8 +767,7 @@
   !15 = metadata !{i32 786688, metadata !16, metadata !"Z", metadata !5, i32 5,
                    metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Z] \
                      [line 5]
-  !16 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 0, i32 0,
-                   i32 0} \
+  !16 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 0, i32 0} \
                    ; [ DW_TAG_lexical_block ] [/private/tmp/t.c]
   !17 = metadata !{i32 5, i32 0, metadata !16, null}
   !18 = metadata !{i32 6, i32 0, metadata !16, null}
@@ -779,8 +819,7 @@
 
 .. code-block:: llvm
 
-  !16 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 0, i32 0,
-                   i32 0}
+  !16 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 0, i32 0} \
                    ; [ DW_TAG_lexical_block ] [/private/tmp/t.c]
   !17 = metadata !{i32 5, i32 0, metadata !16, null}
 
@@ -810,73 +849,15 @@
 As support for debugging information gets added to the various LLVM
 source-language front-ends, the information used should be documented here.
 
-The following sections provide examples of various C/C++ constructs and the
-debug information that would best describe those constructs.
+The following sections provide examples of a few C/C++ constructs and the debug
+information that would best describe those constructs.  The canonical
+references are the ``DIDescriptor`` classes defined in
+``include/llvm/IR/DebugInfo.h`` and the implementations of the helper functions
+in ``lib/IR/DIBuilder.cpp``.
 
 C/C++ source file information
 -----------------------------
 
-Given the source files ``MySource.cpp`` and ``MyHeader.h`` located in the
-directory ``/Users/mine/sources``, the following code:
-
-.. code-block:: c
-
-  #include "MyHeader.h"
-
-  int main(int argc, char *argv[]) {
-    return 0;
-  }
-
-a C/C++ front-end would generate the following descriptors:
-
-.. code-block:: llvm
-
-  ...
-  ;;
-  ;; Define the compile unit for the main source file "/Users/mine/sources/MySource.cpp".
-  ;;
-  !0 = metadata !{
-    i32 786449,   ;; Tag
-    metadata !1,  ;; File/directory name
-    i32 4,        ;; Language Id
-    metadata !"clang version 3.4 ",
-    i1 false,     ;; Optimized compile unit
-    metadata !"", ;; Compiler flags
-    i32 0,        ;; Runtime version
-    metadata !2,  ;; Enumeration types
-    metadata !2,  ;; Retained types
-    metadata !3,  ;; Subprograms
-    metadata !2,  ;; Global variables
-    metadata !2,  ;; Imported entities (declarations and namespaces)
-    metadata !""  ;; Split debug filename
-  }
-
-  ;;
-  ;; Define the file for the file "/Users/mine/sources/MySource.cpp".
-  ;;
-  !1 = metadata !{
-    metadata !"MySource.cpp",
-    metadata !"/Users/mine/sources"
-  }
-  !5 = metadata !{
-    i32 786473, ;; Tag
-    metadata !1
-  }
-
-  ;;
-  ;; Define the file for the file "/Users/mine/sources/Myheader.h"
-  ;;
-  !14 = metadata !{
-    i32 786473, ;; Tag
-    metadata !15
-  }
-  !15 = metadata !{
-    metadata !"./MyHeader.h",
-    metadata !"/Users/mine/sources",
-  }
-
-  ...
-
 ``llvm::Instruction`` provides easy access to metadata attached with an
 instruction.  One can extract line number information encoded in LLVM IR using
 ``Instruction::getMetadata()`` and ``DILocation::getLineNumber()``.
@@ -906,7 +887,7 @@
   ;;
   ;; Define the global itself.
   ;;
-  %MyGlobal = global int 100
+  @MyGlobal = global i32 100, align 4
   ...
   ;;
   ;; List of debug info of globals
@@ -915,24 +896,35 @@
 
   ;; Define the compile unit.
   !0 = metadata !{
-    i32 786449,                       ;; Tag
-    i32 0,                            ;; Context
-    i32 4,                            ;; Language
-    metadata !"foo.cpp",              ;; File
-    metadata !"/Volumes/Data/tmp",    ;; Directory
-    metadata !"clang version 3.1 ",   ;; Producer
-    i1 true,                          ;; Deprecated field
-    i1 false,                         ;; "isOptimized"?
-    metadata !"",                     ;; Flags
-    i32 0,                            ;; Runtime Version
-    metadata !1,                      ;; Enum Types
-    metadata !1,                      ;; Retained Types
-    metadata !1,                      ;; Subprograms
-    metadata !3,                      ;; Global Variables
-    metadata !1,                      ;; Imported entities
-    "",                               ;; Split debug filename
+    ; Header(
+    ;   i32 17,                           ;; Tag
+    ;   i32 0,                            ;; Context
+    ;   i32 12,                           ;; Language
+    ;   metadata !"clang version 3.6.0 ", ;; Producer
+    ;   i1 false,                         ;; "isOptimized"?
+    ;   metadata !"",                     ;; Flags
+    ;   i32 0,                            ;; Runtime Version
+    ;   "",                               ;; Split debug filename
+    ;   1                                 ;; Full debug info
+    ; )
+    metadata !"0x11\0012\00clang version 3.6.0 \000\00\000\00\001",
+    metadata !1,                          ;; File
+    metadata !2,                          ;; Enum Types
+    metadata !2,                          ;; Retained Types
+    metadata !2,                          ;; Subprograms
+    metadata !3,                          ;; Global Variables
+    metadata !2                           ;; Imported entities
   } ; [ DW_TAG_compile_unit ]
 
+  ;; The file/directory pair.
+  !1 = metadata !{
+    metadata !"foo.c",                                 ;; Filename
+    metadata !"/Users/dexonsmith/data/llvm/debug-info" ;; Directory
+  }
+
+  ;; An empty array.
+  !2 = metadata !{}
+
   ;; The Array of Global Variables
   !3 = metadata !{
     metadata !4
@@ -942,17 +934,19 @@
   ;; Define the global variable itself.
   ;;
   !4 = metadata !{
-    i32 786484,                        ;; Tag
-    i32 0,                             ;; Unused
+    ; Header(
+    ;   i32 52,                        ;; Tag
+    ;   metadata !"MyGlobal",          ;; Name
+    ;   metadata !"MyGlobal",          ;; Display Name
+    ;   metadata !"",                  ;; Linkage Name
+    ;   i32 1,                         ;; Line
+    ;   i32 0,                         ;; IsLocalToUnit
+    ;   i32 1                          ;; IsDefinition
+    ; )
+    metadata !"0x34\00MyGlobal\00MyGlobal\00\001\000\001",
     null,                              ;; Unused
-    metadata !"MyGlobal",              ;; Name
-    metadata !"MyGlobal",              ;; Display Name
-    metadata !"",                      ;; Linkage Name
-    metadata !6,                       ;; File
-    i32 1,                             ;; Line
-    metadata !7,                       ;; Type
-    i32 0,                             ;; IsLocalToUnit
-    i32 1,                             ;; IsDefinition
+    metadata !5,                       ;; File
+    metadata !6,                       ;; Type
     i32* @MyGlobal,                    ;; LLVM-IR Value
     null                               ;; Static member declaration
   } ; [ DW_TAG_variable ]
@@ -961,28 +955,30 @@
   ;; Define the file
   ;;
   !5 = metadata !{
-    metadata !"foo.cpp",               ;; File
-    metadata !"/Volumes/Data/tmp",     ;; Directory
-  }
-  !6 = metadata !{
-    i32 786473,                        ;; Tag
-    metadata !5                        ;; Unused
+    ; Header(
+    ;   i32 41             ;; Tag
+    ; )
+    metadata !"0x29",
+    metadata !1            ;; File/directory pair
   } ; [ DW_TAG_file_type ]
 
   ;;
   ;; Define the type
   ;;
-  !7 = metadata !{
-    i32 786468,                         ;; Tag
-    null,                               ;; Unused
-    null,                               ;; Unused
-    metadata !"int",                    ;; Name
-    i32 0,                              ;; Line
-    i64 32,                             ;; Size in Bits
-    i64 32,                             ;; Align in Bits
-    i64 0,                              ;; Offset
-    i32 0,                              ;; Flags
-    i32 5                               ;; Encoding
+  !6 = metadata !{
+    ; Header(
+    ;   i32 36,                       ;; Tag
+    ;   metadata !"int",              ;; Name
+    ;   i32 0,                        ;; Line
+    ;   i64 32,                       ;; Size in Bits
+    ;   i64 32,                       ;; Align in Bits
+    ;   i64 0,                        ;; Offset
+    ;   i32 0,                        ;; Flags
+    ;   i32 5                         ;; Encoding
+    ; )
+    metadata !"0x24\00int\000\0032\0032\000\000\005",
+    null,                             ;; Unused
+    null                              ;; Unused
   } ; [ DW_TAG_base_type ]
 
 C/C++ function information
@@ -1004,26 +1000,31 @@
   ;; Define the anchor for subprograms.
   ;;
   !6 = metadata !{
-    i32 786484,        ;; Tag
-    metadata !1,       ;; File
-    metadata !1,       ;; Context
-    metadata !"main",  ;; Name
-    metadata !"main",  ;; Display name
-    metadata !"main",  ;; Linkage name
-    i32 1,             ;; Line number
-    metadata !4,       ;; Type
-    i1 false,          ;; Is local
-    i1 true,           ;; Is definition
-    i32 0,             ;; Virtuality attribute, e.g. pure virtual function
-    i32 0,             ;; Index into virtual table for C++ methods
-    i32 0,             ;; Type that holds virtual table.
-    i32 0,             ;; Flags
-    i1 false,          ;; True if this function is optimized
-    Function *,        ;; Pointer to llvm::Function
-    null,              ;; Function template parameters
-    null,              ;; List of function variables (emitted when optimizing)
-    1                  ;; Line number of the opening '{' of the function
+    ; Header(
+    ;   i32 46,             ;; Tag
+    ;   metadata !"main",   ;; Name
+    ;   metadata !"main",   ;; Display name
+    ;   metadata !"",       ;; Linkage name
+    ;   i32 1,              ;; Line number
+    ;   i1 false,           ;; Is local
+    ;   i1 true,            ;; Is definition
+    ;   i32 0,              ;; Virtuality attribute, e.g. pure virtual function
+    ;   i32 0,              ;; Index into virtual table for C++ methods
+    ;   i32 256,            ;; Flags
+    ;   i1 0,               ;; True if this function is optimized
+    ;   1                   ;; Line number of the opening '{' of the function
+    ; )
+    metadata !"0x2e\00main\00main\00\001\000\001\000\000\00256\000\001",
+    metadata !1,            ;; File
+    metadata !5,            ;; Context
+    metadata !6,            ;; Type
+    null,                   ;; Containing type
+    i32 (i32, i8**)* @main, ;; Pointer to llvm::Function
+    null,                   ;; Function template parameters
+    null,                   ;; Function declaration
+    metadata !2             ;; List of function variables (emitted when optimizing)
   }
+
   ;;
   ;; Define the subprogram itself.
   ;;
@@ -1031,443 +1032,6 @@
   ...
   }
 
-C/C++ basic types
------------------
-
-The following are the basic type descriptors for C/C++ core types:
-
-bool
-^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"bool",  ;; Name
-    i32 0,             ;; Line number
-    i64 8,             ;; Size in Bits
-    i64 8,             ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 2              ;; Encoding
-  }
-
-char
-^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"char",  ;; Name
-    i32 0,             ;; Line number
-    i64 8,             ;; Size in Bits
-    i64 8,             ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 6              ;; Encoding
-  }
-
-unsigned char
-^^^^^^^^^^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"unsigned char",
-    i32 0,             ;; Line number
-    i64 8,             ;; Size in Bits
-    i64 8,             ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 8              ;; Encoding
-  }
-
-short
-^^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"short int",
-    i32 0,             ;; Line number
-    i64 16,            ;; Size in Bits
-    i64 16,            ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 5              ;; Encoding
-  }
-
-unsigned short
-^^^^^^^^^^^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"short unsigned int",
-    i32 0,             ;; Line number
-    i64 16,            ;; Size in Bits
-    i64 16,            ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 7              ;; Encoding
-  }
-
-int
-^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"int",   ;; Name
-    i32 0,             ;; Line number
-    i64 32,            ;; Size in Bits
-    i64 32,            ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 5              ;; Encoding
-  }
-
-unsigned int
-^^^^^^^^^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"unsigned int",
-    i32 0,             ;; Line number
-    i64 32,            ;; Size in Bits
-    i64 32,            ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 7              ;; Encoding
-  }
-
-long long
-^^^^^^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"long long int",
-    i32 0,             ;; Line number
-    i64 64,            ;; Size in Bits
-    i64 64,            ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 5              ;; Encoding
-  }
-
-unsigned long long
-^^^^^^^^^^^^^^^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"long long unsigned int",
-    i32 0,             ;; Line number
-    i64 64,            ;; Size in Bits
-    i64 64,            ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 7              ;; Encoding
-  }
-
-float
-^^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"float",
-    i32 0,             ;; Line number
-    i64 32,            ;; Size in Bits
-    i64 32,            ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 4              ;; Encoding
-  }
-
-double
-^^^^^^
-
-.. code-block:: llvm
-
-  !2 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"double",;; Name
-    i32 0,             ;; Line number
-    i64 64,            ;; Size in Bits
-    i64 64,            ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 4              ;; Encoding
-  }
-
-C/C++ derived types
--------------------
-
-Given the following as an example of C/C++ derived type:
-
-.. code-block:: c
-
-  typedef const int *IntPtr;
-
-a C/C++ front-end would generate the following descriptors:
-
-.. code-block:: llvm
-
-  ;;
-  ;; Define the typedef "IntPtr".
-  ;;
-  !2 = metadata !{
-    i32 786454,          ;; Tag
-    metadata !3,         ;; File
-    metadata !1,         ;; Context
-    metadata !"IntPtr",  ;; Name
-    i32 0,               ;; Line number
-    i64 0,               ;; Size in bits
-    i64 0,               ;; Align in bits
-    i64 0,               ;; Offset in bits
-    i32 0,               ;; Flags
-    metadata !4          ;; Derived From type
-  }
-  ;;
-  ;; Define the pointer type.
-  ;;
-  !4 = metadata !{
-    i32 786447,          ;; Tag
-    null,                ;; File
-    null,                ;; Context
-    metadata !"",        ;; Name
-    i32 0,               ;; Line number
-    i64 64,              ;; Size in bits
-    i64 64,              ;; Align in bits
-    i64 0,               ;; Offset in bits
-    i32 0,               ;; Flags
-    metadata !5          ;; Derived From type
-  }
-  ;;
-  ;; Define the const type.
-  ;;
-  !5 = metadata !{
-    i32 786470,          ;; Tag
-    null,                ;; File
-    null,                ;; Context
-    metadata !"",        ;; Name
-    i32 0,               ;; Line number
-    i64 0,               ;; Size in bits
-    i64 0,               ;; Align in bits
-    i64 0,               ;; Offset in bits
-    i32 0,               ;; Flags
-    metadata !6          ;; Derived From type
-  }
-  ;;
-  ;; Define the int type.
-  ;;
-  !6 = metadata !{
-    i32 786468,          ;; Tag
-    null,                ;; File
-    null,                ;; Context
-    metadata !"int",     ;; Name
-    i32 0,               ;; Line number
-    i64 32,              ;; Size in bits
-    i64 32,              ;; Align in bits
-    i64 0,               ;; Offset in bits
-    i32 0,               ;; Flags
-    i32 5                ;; Encoding
-  }
-
-C/C++ struct/union types
-------------------------
-
-Given the following as an example of C/C++ struct type:
-
-.. code-block:: c
-
-  struct Color {
-    unsigned Red;
-    unsigned Green;
-    unsigned Blue;
-  };
-
-a C/C++ front-end would generate the following descriptors:
-
-.. code-block:: llvm
-
-  ;;
-  ;; Define basic type for unsigned int.
-  ;;
-  !5 = metadata !{
-    i32 786468,        ;; Tag
-    null,              ;; File
-    null,              ;; Context
-    metadata !"unsigned int",
-    i32 0,             ;; Line number
-    i64 32,            ;; Size in Bits
-    i64 32,            ;; Align in Bits
-    i64 0,             ;; Offset in Bits
-    i32 0,             ;; Flags
-    i32 7              ;; Encoding
-  }
-  ;;
-  ;; Define composite type for struct Color.
-  ;;
-  !2 = metadata !{
-    i32 786451,        ;; Tag
-    metadata !1,       ;; Compile unit
-    null,              ;; Context
-    metadata !"Color", ;; Name
-    i32 1,             ;; Line number
-    i64 96,            ;; Size in bits
-    i64 32,            ;; Align in bits
-    i64 0,             ;; Offset in bits
-    i32 0,             ;; Flags
-    null,              ;; Derived From
-    metadata !3,       ;; Elements
-    i32 0,             ;; Runtime Language
-    null,              ;; Base type containing the vtable pointer for this type
-    null               ;; Template parameters
-  }
-
-  ;;
-  ;; Define the Red field.
-  ;;
-  !4 = metadata !{
-    i32 786445,        ;; Tag
-    metadata !1,       ;; File
-    metadata !1,       ;; Context
-    metadata !"Red",   ;; Name
-    i32 2,             ;; Line number
-    i64 32,            ;; Size in bits
-    i64 32,            ;; Align in bits
-    i64 0,             ;; Offset in bits
-    i32 0,             ;; Flags
-    metadata !5        ;; Derived From type
-  }
-
-  ;;
-  ;; Define the Green field.
-  ;;
-  !6 = metadata !{
-    i32 786445,        ;; Tag
-    metadata !1,       ;; File
-    metadata !1,       ;; Context
-    metadata !"Green", ;; Name
-    i32 3,             ;; Line number
-    i64 32,            ;; Size in bits
-    i64 32,            ;; Align in bits
-    i64 32,             ;; Offset in bits
-    i32 0,             ;; Flags
-    metadata !5        ;; Derived From type
-  }
-
-  ;;
-  ;; Define the Blue field.
-  ;;
-  !7 = metadata !{
-    i32 786445,        ;; Tag
-    metadata !1,       ;; File
-    metadata !1,       ;; Context
-    metadata !"Blue",  ;; Name
-    i32 4,             ;; Line number
-    i64 32,            ;; Size in bits
-    i64 32,            ;; Align in bits
-    i64 64,             ;; Offset in bits
-    i32 0,             ;; Flags
-    metadata !5        ;; Derived From type
-  }
-
-  ;;
-  ;; Define the array of fields used by the composite type Color.
-  ;;
-  !3 = metadata !{metadata !4, metadata !6, metadata !7}
-
-C/C++ enumeration types
------------------------
-
-Given the following as an example of C/C++ enumeration type:
-
-.. code-block:: c
-
-  enum Trees {
-    Spruce = 100,
-    Oak = 200,
-    Maple = 300
-  };
-
-a C/C++ front-end would generate the following descriptors:
-
-.. code-block:: llvm
-
-  ;;
-  ;; Define composite type for enum Trees
-  ;;
-  !2 = metadata !{
-    i32 786436,        ;; Tag
-    metadata !1,       ;; File
-    metadata !1,       ;; Context
-    metadata !"Trees", ;; Name
-    i32 1,             ;; Line number
-    i64 32,            ;; Size in bits
-    i64 32,            ;; Align in bits
-    i64 0,             ;; Offset in bits
-    i32 0,             ;; Flags
-    null,              ;; Derived From type
-    metadata !3,       ;; Elements
-    i32 0              ;; Runtime language
-  }
-
-  ;;
-  ;; Define the array of enumerators used by composite type Trees.
-  ;;
-  !3 = metadata !{metadata !4, metadata !5, metadata !6}
-
-  ;;
-  ;; Define Spruce enumerator.
-  ;;
-  !4 = metadata !{i32 786472, metadata !"Spruce", i64 100}
-
-  ;;
-  ;; Define Oak enumerator.
-  ;;
-  !5 = metadata !{i32 786472, metadata !"Oak", i64 200}
-
-  ;;
-  ;; Define Maple enumerator.
-  ;;
-  !6 = metadata !{i32 786472, metadata !"Maple", i64 300}
-
 Debugging information format
 ============================
 
@@ -1650,21 +1214,33 @@
 New DWARF Constants
 ^^^^^^^^^^^^^^^^^^^
 
-+--------------------------------+-------+
-| Name                           | Value |
-+================================+=======+
-| DW_AT_APPLE_PROPERTY_readonly  | 0x1   |
-+--------------------------------+-------+
-| DW_AT_APPLE_PROPERTY_readwrite | 0x2   |
-+--------------------------------+-------+
-| DW_AT_APPLE_PROPERTY_assign    | 0x4   |
-+--------------------------------+-------+
-| DW_AT_APPLE_PROPERTY_retain    | 0x8   |
-+--------------------------------+-------+
-| DW_AT_APPLE_PROPERTY_copy      | 0x10  |
-+--------------------------------+-------+
-| DW_AT_APPLE_PROPERTY_nonatomic | 0x20  |
-+--------------------------------+-------+
++--------------------------------------+-------+
+| Name                                 | Value |
++======================================+=======+
+| DW_APPLE_PROPERTY_readonly           | 0x01  |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_getter             | 0x02  |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_assign             | 0x04  |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_readwrite          | 0x08  |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_retain             | 0x10  |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_copy               | 0x20  |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_nonatomic          | 0x40  |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_setter             | 0x80  |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_atomic             | 0x100 |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_weak               | 0x200 |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_strong             | 0x400 |
++--------------------------------------+-------+
+| DW_APPLE_PROPERTY_unsafe_unretained  | 0x800 |
++--------------------------------------+-------+
 
 Name Accelerator Tables
 -----------------------

diff --git a/docs/TableGen/BackEnds.rst b/docs/TableGen/BackEnds.rst
index 42de41d..e8544b6 100644
--- a/docs/TableGen/BackEnds.rst
+++ b/docs/TableGen/BackEnds.rst

@@ -78,8 +78,7 @@
 **Output**: C++ code, implementing the target's CodeEmitter
 class by overriding the virtual functions as ``<Target>CodeEmitter::function()``.
 
-**Usage**: Used to include directly at the end of ``<Target>CodeEmitter.cpp``, and
-with option `-mc-emitter` to be included in ``<Target>MCCodeEmitter.cpp``.
+**Usage**: Used to include directly at the end of ``<Target>MCCodeEmitter.cpp``.
 
 RegisterInfo
 ------------

diff --git a/docs/TableGen/LangIntro.rst b/docs/TableGen/LangIntro.rst
index 3e74dff..85c74a5 100644
--- a/docs/TableGen/LangIntro.rst
+++ b/docs/TableGen/LangIntro.rst

@@ -94,7 +94,9 @@
     uninitialized field
 
 ``0b1001011``
-    binary integer value
+    binary integer value.
+    Note that this is sized by the number of bits given and will not be
+    silently extended/truncated.
 
 ``07654321``
     octal integer value (indicated by a leading 0)
@@ -116,8 +118,9 @@
     In rare cases, TableGen is unable to deduce the element type in which case
     the user must specify it explicitly.
 
-``{ a, b, c }``
-    initializer for a "bits<3>" value
+``{ a, b, 0b10 }``
+    initializer for a "bits<4>" value.
+    1-bit from "a", 1-bit from "b", 2-bits from 0b10.
 
 ``value``
     value reference
@@ -208,6 +211,9 @@
     on string, int and bit objects.  Use !cast<string> to compare other types of
     objects.
 
+``!shl(a,b)`` ``!srl(a,b)`` ``!sra(a,b)`` ``!add(a,b)`` ``!and(a,b)``
+    The usual binary and arithmetic operators.
+
 Note that all of the values have rules specifying how they convert to values
 for different types.  These rules allow you to assign a value like "``7``"
 to a "``bits<4>``" value, for example.

diff --git a/docs/TableGen/LangRef.rst b/docs/TableGen/LangRef.rst
index 9b074be..134afed 100644
--- a/docs/TableGen/LangRef.rst
+++ b/docs/TableGen/LangRef.rst

@@ -55,6 +55,10 @@
 ``+`` or ``-``, as opposed to having ``+`` and ``-`` be unary operators as
 most languages do.
 
+Also note that :token:`BinInteger` creates a value of type ``bits<n>``
+(where ``n`` is the number of bits).  This will implicitly convert to
+integers when needed.
+
 TableGen has identifier-like tokens:
 
 .. productionlist::
@@ -92,7 +96,7 @@
 .. productionlist::
    BangOperator: one of
                :!eq     !if      !head    !tail      !con
-               :!add    !shl     !sra     !srl
+               :!add    !shl     !sra     !srl       !and
                :!cast   !empty   !subst   !foreach   !listconcat   !strconcat
 
 Syntax

diff --git a/docs/TableGen/index.rst b/docs/TableGen/index.rst
index 0860afa..cda41b5 100644
--- a/docs/TableGen/index.rst
+++ b/docs/TableGen/index.rst

@@ -273,7 +273,7 @@
 in TableGen is, however, to interpret the source files into an internal 
 representation that can be generated into anything you want.
 
-Current usage of TableGen is to create include huge files with tables that you
+Current usage of TableGen is to create huge include files with tables that you
 can either include directly (if the output is in the language you're coding),
 or be used in pre-processing via macros surrounding the include of the file.
 
@@ -292,7 +292,7 @@
 pointed out numerous times. The common theme is that, while TableGen allows
 you to build Domain-Specific-Languages, the final languages that you create
 lack the power of other DSLs, which in turn increase considerably the size
-and complecity of TableGen files.
+and complexity of TableGen files.
 
 At the same time, TableGen allows you to create virtually any meaning of
 the basic concepts via custom-made back-ends, which can pervert the original

diff --git a/docs/TestingGuide.rst b/docs/TestingGuide.rst
index 481be55..fa0b5dd 100644
--- a/docs/TestingGuide.rst
+++ b/docs/TestingGuide.rst

@@ -240,6 +240,58 @@
 the :doc:`FileCheck tool <CommandGuide/FileCheck>`. *[The usage of grep in RUN
 lines is deprecated - please do not send or commit patches that use it.]*
 
+Extra files
+-----------
+
+If your test requires extra files besides the file containing the ``RUN:``
+lines, the idiomatic place to put them is in a subdirectory ``Inputs``.
+You can then refer to the extra files as ``%S/Inputs/foo.bar``.
+
+For example, consider ``test/Linker/ident.ll``. The directory structure is
+as follows::
+
+  test/
+    Linker/
+      ident.ll
+      Inputs/
+        ident.a.ll
+        ident.b.ll
+
+For convenience, these are the contents:
+
+.. code-block:: llvm
+
+  ;;;;; ident.ll:
+
+  ; RUN: llvm-link %S/Inputs/ident.a.ll %S/Inputs/ident.b.ll -S | FileCheck %s
+
+  ; Verify that multiple input llvm.ident metadata are linked together.
+
+  ; CHECK-DAG: !llvm.ident = !{!0, !1, !2}
+  ; CHECK-DAG: "Compiler V1"
+  ; CHECK-DAG: "Compiler V2"
+  ; CHECK-DAG: "Compiler V3"
+
+  ;;;;; Inputs/ident.a.ll:
+
+  !llvm.ident = !{!0, !1}
+  !0 = metadata !{metadata !"Compiler V1"}
+  !1 = metadata !{metadata !"Compiler V2"}
+
+  ;;;;; Inputs/ident.b.ll:
+
+  !llvm.ident = !{!0}
+  !0 = metadata !{metadata !"Compiler V3"}
+
+For symmetry reasons, ``ident.ll`` is just a dummy file that doesn't
+actually participate in the test besides holding the ``RUN:`` lines.
+
+.. note::
+
+  Some existing tests use ``RUN: true`` in extra files instead of just
+  putting the extra files in an ``Inputs/`` directory. This pattern is
+  deprecated.
+
 Fragile tests
 -------------
 

diff --git a/docs/WritingAnLLVMBackend.rst b/docs/WritingAnLLVMBackend.rst
index fb7c16f..fdadbb0 100644
--- a/docs/WritingAnLLVMBackend.rst
+++ b/docs/WritingAnLLVMBackend.rst

@@ -161,7 +161,7 @@
 know about your target when parsing the ``--enable-targets`` option.  Search
 the configure script for ``TARGETS_TO_BUILD``, add your target to the lists
 there (some creativity required), and then reconfigure.  Alternatively, you can
-change ``autotools/configure.ac`` and regenerate configure by running
+change ``autoconf/configure.ac`` and regenerate configure by running
 ``./autoconf/AutoRegen.sh``.
 
 Target Machine

diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst
index cfbda04..ef2b953 100644
--- a/docs/WritingAnLLVMPass.rst
+++ b/docs/WritingAnLLVMPass.rst

@@ -146,7 +146,7 @@
 
 .. code-block:: c++
 
-      virtual bool runOnFunction(Function &F) {
+      bool runOnFunction(Function &F) override {
         errs() << "Hello: ";
         errs().write_escaped(F.getName()) << "\n";
         return false;
@@ -194,7 +194,7 @@
         static char ID;
         Hello() : FunctionPass(ID) {}
 
-        virtual bool runOnFunction(Function &F) {
+        bool runOnFunction(Function &F) override {
           errs() << "Hello: ";
           errs().write_escaped(F.getName()) << '\n';
           return false;
@@ -434,9 +434,8 @@
   virtual bool doFinalization(CallGraph &CG);
 
 The ``doFinalization`` method is an infrequently used method that is called
-when the pass framework has finished calling :ref:`runOnFunction
-<writing-an-llvm-pass-runOnFunction>` for every function in the program being
-compiled.
+when the pass framework has finished calling :ref:`runOnSCC
+<writing-an-llvm-pass-runOnSCC>` for every SCC in the program being compiled.
 
 .. _writing-an-llvm-pass-FunctionPass:
 
@@ -456,7 +455,7 @@
 #. Inspect or modify a ``Function`` other than the one currently being processed.
 #. Add or remove ``Function``\ s from the current ``Module``.
 #. Add or remove global variables from the current ``Module``.
-#. Maintain state across invocations of:ref:`runOnFunction
+#. Maintain state across invocations of :ref:`runOnFunction
    <writing-an-llvm-pass-runOnFunction>` (including global data).
 
 Implementing a ``FunctionPass`` is usually straightforward (See the :ref:`Hello
@@ -1163,7 +1162,7 @@
 .. code-block:: c++
 
   // We don't modify the program, so we preserve all analyses
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
   }
 

diff --git a/docs/conf.py b/docs/conf.py
index 17d21f3..659c3e0 100644
--- a/docs/conf.py
+++ b/docs/conf.py

@@ -47,9 +47,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '3.4'
+version = '3.6'
 # The full version, including alpha/beta/rc tags.
-release = '3.4'
+release = '3.6'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/index.rst b/docs/index.rst
index 1d4fbd9..5ac5443 100644
--- a/docs/index.rst
+++ b/docs/index.rst

@@ -235,9 +235,11 @@
    WritingAnLLVMPass
    HowToUseAttributes
    NVPTXUsage
+   R600Usage
    StackMaps
    InAlloca
    BigEndianNEON
+   CoverageMappingFormat
 
 :doc:`WritingAnLLVMPass`
    Information on how to write LLVM transformations and analyses.
@@ -316,6 +318,9 @@
 :doc:`NVPTXUsage`
    This document describes using the NVPTX back-end to compile GPU kernels.
 
+:doc:`R600Usage`
+   This document describes how to use the R600 back-end.
+
 :doc:`StackMaps`
   LLVM support for mapping instruction addresses to the location of
   values and allowing code to be patched.
@@ -324,6 +329,8 @@
   LLVM's support for generating NEON instructions on big endian ARM targets is
   somewhat nonintuitive. This document explains the implementation and rationale.
 
+:doc:`CoverageMappingFormat`
+  This describes the format and encoding used for LLVM’s code coverage mapping.
 
 Development Process Documentation
 =================================
@@ -340,6 +347,7 @@
    HowToReleaseLLVM
    Packaging
    ReleaseProcess
+   Phabricator
 
 :doc:`DeveloperPolicy`
    The LLVM project's policy towards developers and their contributions.
@@ -361,11 +369,15 @@
   This is a guide to preparing LLVM releases. Most developers can ignore it.
 
 :doc:`ReleaseProcess`
-  This is a validate a new release, during the release process. Most developers can ignore it.
+  This is a guide to validating a new release during the release process. Most developers can ignore it.
 
 :doc:`Packaging`
    Advice on packaging LLVM into a distribution.
 
+:doc:`Phabricator`
+   Describes how to use the Phabricator code review tool hosted on
+   http://reviews.llvm.org/ and its command line interface, Arcanist.
+
 Community
 =========
 

diff --git a/docs/tutorial/LangImpl3.rst b/docs/tutorial/LangImpl3.rst
index 7174c09..b7418cc 100644
--- a/docs/tutorial/LangImpl3.rst
+++ b/docs/tutorial/LangImpl3.rst

@@ -581,7 +581,7 @@
 .. code-block:: bash
 
     # Compile
-    clang++ -g -O3 toy.cpp `llvm-config --cppflags --ldflags --libs core` -o toy
+    clang++ -g -O3 toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core` -o toy
     # Run
     ./toy
 

diff --git a/docs/tutorial/LangImpl4.rst b/docs/tutorial/LangImpl4.rst
index 44e0cc1..aa469ca 100644
--- a/docs/tutorial/LangImpl4.rst
+++ b/docs/tutorial/LangImpl4.rst

@@ -428,7 +428,7 @@
 .. code-block:: bash
 
     # Compile
-    clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3 -o toy
+    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core jit native` -O3 -o toy
     # Run
     ./toy
 

diff --git a/docs/tutorial/LangImpl5.rst b/docs/tutorial/LangImpl5.rst
index ed5b652..2a3a4ce 100644
--- a/docs/tutorial/LangImpl5.rst
+++ b/docs/tutorial/LangImpl5.rst

@@ -736,7 +736,7 @@
 .. code-block:: bash
 
     # Compile
-    clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3 -o toy
+    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core jit native` -O3 -o toy
     # Run
     ./toy
 

diff --git a/docs/tutorial/LangImpl6.rst b/docs/tutorial/LangImpl6.rst
index 42839fb..cdceb03 100644
--- a/docs/tutorial/LangImpl6.rst
+++ b/docs/tutorial/LangImpl6.rst

@@ -729,7 +729,7 @@
 .. code-block:: bash
 
     # Compile
-    clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3 -o toy
+    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core jit native` -O3 -o toy
     # Run
     ./toy
 

diff --git a/docs/tutorial/LangImpl7.rst b/docs/tutorial/LangImpl7.rst
index 849ce50..c4c7233 100644
--- a/docs/tutorial/LangImpl7.rst
+++ b/docs/tutorial/LangImpl7.rst

@@ -847,7 +847,7 @@
 .. code-block:: bash
 
     # Compile
-    clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3 -o toy
+    clang++ -g toy.cpp `llvm-config --cxxflags --ldflags --system-libs --libs core jit native` -O3 -o toy
     # Run
     ./toy
 

diff --git a/examples/BrainF/BrainFDriver.cpp b/examples/BrainF/BrainFDriver.cpp
index e2de6bc..99c8ff3 100644
--- a/examples/BrainF/BrainFDriver.cpp
+++ b/examples/BrainF/BrainFDriver.cpp

@@ -26,8 +26,8 @@
 
 #include "BrainF.h"
 #include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/CommandLine.h"
@@ -107,9 +107,8 @@
       OutputFilename = base+".bc";
     }
     if (OutputFilename != "-") {
-      std::string ErrInfo;
-      out = new raw_fd_ostream(OutputFilename.c_str(), ErrInfo,
-                               sys::fs::F_None);
+      std::error_code EC;
+      out = new raw_fd_ostream(OutputFilename, EC, sys::fs::F_None);
     }
   }
 
@@ -125,13 +124,13 @@
 
   //Read the BrainF program
   BrainF bf;
-  Module *mod = bf.parse(in, 65536, cf, Context); //64 KiB
+  std::unique_ptr<Module> Mod(bf.parse(in, 65536, cf, Context)); // 64 KiB
   if (in != &std::cin)
     delete in;
-  addMainFunction(mod);
+  addMainFunction(Mod.get());
 
   //Verify generated code
-  if (verifyModule(*mod)) {
+  if (verifyModule(*Mod)) {
     errs() << "Error: module failed verification.  This shouldn't happen.\n";
     abort();
   }
@@ -141,18 +140,18 @@
     InitializeNativeTarget();
 
     outs() << "------- Running JIT -------\n";
-    ExecutionEngine *ee = EngineBuilder(mod).create();
+    Module &M = *Mod;
+    ExecutionEngine *ee = EngineBuilder(std::move(Mod)).create();
     std::vector<GenericValue> args;
-    Function *brainf_func = mod->getFunction("brainf");
+    Function *brainf_func = M.getFunction("brainf");
     GenericValue gv = ee->runFunction(brainf_func, args);
   } else {
-    WriteBitcodeToFile(mod, *out);
+    WriteBitcodeToFile(Mod.get(), *out);
   }
 
   //Clean up
   if (out != &outs())
     delete out;
-  delete mod;
 
   llvm_shutdown();
 

diff --git a/examples/BrainF/CMakeLists.txt b/examples/BrainF/CMakeLists.txt
index 025d093..cf1cf1b 100644
--- a/examples/BrainF/CMakeLists.txt
+++ b/examples/BrainF/CMakeLists.txt

@@ -2,7 +2,7 @@
   BitWriter
   Core
   ExecutionEngine
-  JIT
+  MC
   Support
   nativecodegen
   )

diff --git a/examples/BrainF/Makefile b/examples/BrainF/Makefile
index 2c3e066..3e36e07 100644
--- a/examples/BrainF/Makefile
+++ b/examples/BrainF/Makefile

@@ -10,6 +10,6 @@
 TOOLNAME = BrainF
 EXAMPLE_TOOL = 1
 
-LINK_COMPONENTS := jit bitwriter nativecodegen interpreter
+LINK_COMPONENTS := mcjit bitwriter nativecodegen interpreter
 
 include $(LEVEL)/Makefile.common

diff --git a/examples/ExceptionDemo/CMakeLists.txt b/examples/ExceptionDemo/CMakeLists.txt
index 5324acd..9cadd94 100644
--- a/examples/ExceptionDemo/CMakeLists.txt
+++ b/examples/ExceptionDemo/CMakeLists.txt

@@ -1,12 +1,15 @@
 set(LLVM_LINK_COMPONENTS
   Core
   ExecutionEngine
+  MC
   MCJIT
   Support
   nativecodegen
   )
 
+# Enable EH and RTTI for this demo
 set(LLVM_REQUIRES_EH 1)
+set(LLVM_REQUIRES_RTTI 1)
 
 add_llvm_example(ExceptionDemo
   ExceptionDemo.cpp

diff --git a/examples/ExceptionDemo/ExceptionDemo.cpp b/examples/ExceptionDemo/ExceptionDemo.cpp
index 24e538c..17076fa 100644
--- a/examples/ExceptionDemo/ExceptionDemo.cpp
+++ b/examples/ExceptionDemo/ExceptionDemo.cpp

@@ -1957,17 +1957,17 @@
   llvm::IRBuilder<> theBuilder(context);
 
   // Make the module, which holds all the code.
-  llvm::Module *module = new llvm::Module("my cool jit", context);
+  std::unique_ptr<llvm::Module> Owner =
+      llvm::make_unique<llvm::Module>("my cool jit", context);
+  llvm::Module *module = Owner.get();
 
   llvm::RTDyldMemoryManager *MemMgr = new llvm::SectionMemoryManager();
 
   // Build engine with JIT
-  llvm::EngineBuilder factory(module);
+  llvm::EngineBuilder factory(std::move(Owner));
   factory.setEngineKind(llvm::EngineKind::JIT);
-  factory.setAllocateGVsWithCode(false);
   factory.setTargetOptions(Opts);
   factory.setMCJITMemoryManager(MemMgr);
-  factory.setUseMCJIT(true);
   llvm::ExecutionEngine *executionEngine = factory.create();
 
   {
@@ -1977,7 +1977,7 @@
     // Start with registering info about how the
     // target lays out data structures.
     module->setDataLayout(executionEngine->getDataLayout());
-    fpm.add(new llvm::DataLayoutPass(module));
+    fpm.add(new llvm::DataLayoutPass());
 
     // Optimizations turned on
 #ifdef ADD_OPT_PASSES

diff --git a/examples/ExceptionDemo/Makefile b/examples/ExceptionDemo/Makefile
index 58d9def..895b61d 100644
--- a/examples/ExceptionDemo/Makefile
+++ b/examples/ExceptionDemo/Makefile

@@ -11,6 +11,6 @@
 EXAMPLE_TOOL = 1
 REQUIRES_EH = 1
 
-LINK_COMPONENTS := jit mcjit nativecodegen
+LINK_COMPONENTS := mcjit nativecodegen
 
 include $(LEVEL)/Makefile.common

diff --git a/examples/Fibonacci/CMakeLists.txt b/examples/Fibonacci/CMakeLists.txt
index 724a0f6..087ccdd 100644
--- a/examples/Fibonacci/CMakeLists.txt
+++ b/examples/Fibonacci/CMakeLists.txt

@@ -2,7 +2,7 @@
   Core
   ExecutionEngine
   Interpreter
-  JIT
+  MC
   Support
   nativecodegen
   )

diff --git a/examples/Fibonacci/Makefile b/examples/Fibonacci/Makefile
index 71f6ba0..c99110a 100644
--- a/examples/Fibonacci/Makefile
+++ b/examples/Fibonacci/Makefile

@@ -12,6 +12,6 @@
 EXAMPLE_TOOL = 1
 
 # Link in JIT support
-LINK_COMPONENTS := jit interpreter nativecodegen
+LINK_COMPONENTS := interpreter mcjit nativecodegen
 
 include $(LEVEL)/Makefile.common

diff --git a/examples/Fibonacci/fibonacci.cpp b/examples/Fibonacci/fibonacci.cpp
index ba8e953..8092e19 100644
--- a/examples/Fibonacci/fibonacci.cpp
+++ b/examples/Fibonacci/fibonacci.cpp

@@ -26,7 +26,6 @@
 #include "llvm/IR/Verifier.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
 #include "llvm/ExecutionEngine/Interpreter.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
@@ -96,15 +95,16 @@
   LLVMContext Context;
 
   // Create some module to put our function into it.
-  std::unique_ptr<Module> M(new Module("test", Context));
+  std::unique_ptr<Module> Owner(new Module("test", Context));
+  Module *M = Owner.get();
 
   // We are about to create the "fib" function:
-  Function *FibF = CreateFibFunction(M.get(), Context);
+  Function *FibF = CreateFibFunction(M, Context);
 
   // Now we going to create JIT
   std::string errStr;
   ExecutionEngine *EE =
-    EngineBuilder(M.get())
+    EngineBuilder(std::move(Owner))
     .setErrorStr(&errStr)
     .setEngineKind(EngineKind::JIT)
     .create();

diff --git a/examples/HowToUseJIT/CMakeLists.txt b/examples/HowToUseJIT/CMakeLists.txt
index 88aed02..a344ad0 100644
--- a/examples/HowToUseJIT/CMakeLists.txt
+++ b/examples/HowToUseJIT/CMakeLists.txt

@@ -2,7 +2,7 @@
   Core
   ExecutionEngine
   Interpreter
-  JIT
+  MC
   Support
   nativecodegen
   )

diff --git a/examples/HowToUseJIT/HowToUseJIT.cpp b/examples/HowToUseJIT/HowToUseJIT.cpp
index 7125a15..9552240 100644
--- a/examples/HowToUseJIT/HowToUseJIT.cpp
+++ b/examples/HowToUseJIT/HowToUseJIT.cpp

@@ -36,7 +36,6 @@
 
 #include "llvm/ExecutionEngine/GenericValue.h"
 #include "llvm/ExecutionEngine/Interpreter.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
@@ -56,7 +55,8 @@
   LLVMContext Context;
   
   // Create some module to put our function into it.
-  Module *M = new Module("test", Context);
+  std::unique_ptr<Module> Owner = make_unique<Module>("test", Context);
+  Module *M = Owner.get();
 
   // Create the add1 function entry and insert this entry into module M.  The
   // function will have a return type of "int" and take an argument of "int".
@@ -114,7 +114,7 @@
   builder.CreateRet(Add1CallRes);
 
   // Now we create the JIT.
-  ExecutionEngine* EE = EngineBuilder(M).create();
+  ExecutionEngine* EE = EngineBuilder(std::move(Owner)).create();
 
   outs() << "We just constructed this LLVM module:\n\n" << *M;
   outs() << "\n\nRunning foo: ";
@@ -126,7 +126,6 @@
 
   // Import result of execution:
   outs() << "Result: " << gv.IntVal << "\n";
-  EE->freeMachineCodeForFunction(FooF);
   delete EE;
   llvm_shutdown();
   return 0;

diff --git a/examples/HowToUseJIT/Makefile b/examples/HowToUseJIT/Makefile
index c8919db..26a25a1 100644
--- a/examples/HowToUseJIT/Makefile
+++ b/examples/HowToUseJIT/Makefile

@@ -10,6 +10,6 @@
 TOOLNAME = HowToUseJIT
 EXAMPLE_TOOL = 1
 
-LINK_COMPONENTS := jit interpreter nativecodegen
+LINK_COMPONENTS := mcjit interpreter nativecodegen
 
 include $(LEVEL)/Makefile.common

diff --git a/examples/Kaleidoscope/Chapter4/CMakeLists.txt b/examples/Kaleidoscope/Chapter4/CMakeLists.txt
index 72a9f05..2f828dc 100644
--- a/examples/Kaleidoscope/Chapter4/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter4/CMakeLists.txt

@@ -3,7 +3,7 @@
   Core
   ExecutionEngine
   InstCombine
-  JIT
+  MC
   ScalarOpts
   Support
   nativecodegen

diff --git a/examples/Kaleidoscope/Chapter4/Makefile b/examples/Kaleidoscope/Chapter4/Makefile
index 30162d9..6d6a670 100644
--- a/examples/Kaleidoscope/Chapter4/Makefile
+++ b/examples/Kaleidoscope/Chapter4/Makefile

@@ -10,6 +10,6 @@
 TOOLNAME = Kaleidoscope-Ch4
 EXAMPLE_TOOL = 1
 
-LINK_COMPONENTS := core jit native
+LINK_COMPONENTS := core mcjit native
 
 include $(LEVEL)/Makefile.common

diff --git a/examples/Kaleidoscope/Chapter4/toy.cpp b/examples/Kaleidoscope/Chapter4/toy.cpp
index a8f5942..3564d75 100644
--- a/examples/Kaleidoscope/Chapter4/toy.cpp
+++ b/examples/Kaleidoscope/Chapter4/toy.cpp

@@ -1,6 +1,5 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
@@ -572,11 +571,13 @@
   getNextToken();
 
   // Make the module, which holds all the code.
-  TheModule = new Module("my cool jit", Context);
+  std::unique_ptr<Module> Owner = make_unique<Module>("my cool jit", Context);
+  TheModule = Owner.get();
 
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
-  TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&ErrStr).create();
+  TheExecutionEngine =
+      EngineBuilder(std::move(Owner)).setErrorStr(&ErrStr).create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
@@ -587,7 +588,7 @@
   // Set up the optimizer pipeline.  Start with registering info about how the
   // target lays out data structures.
   TheModule->setDataLayout(TheExecutionEngine->getDataLayout());
-  OurFPM.add(new DataLayoutPass(TheModule));
+  OurFPM.add(new DataLayoutPass());
   // Provide basic AliasAnalysis support for GVN.
   OurFPM.add(createBasicAliasAnalysisPass());
   // Do simple "peephole" optimizations and bit-twiddling optzns.

diff --git a/examples/Kaleidoscope/Chapter5/CMakeLists.txt b/examples/Kaleidoscope/Chapter5/CMakeLists.txt
index c7d0276..1912ddc 100644
--- a/examples/Kaleidoscope/Chapter5/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter5/CMakeLists.txt

@@ -3,7 +3,7 @@
   Core
   ExecutionEngine
   InstCombine
-  JIT
+  MC
   ScalarOpts
   Support
   nativecodegen

diff --git a/examples/Kaleidoscope/Chapter5/Makefile b/examples/Kaleidoscope/Chapter5/Makefile
index d1f5e20..d780967 100644
--- a/examples/Kaleidoscope/Chapter5/Makefile
+++ b/examples/Kaleidoscope/Chapter5/Makefile

@@ -10,6 +10,6 @@
 TOOLNAME = Kaleidoscope-Ch5
 EXAMPLE_TOOL = 1
 
-LINK_COMPONENTS := core jit native
+LINK_COMPONENTS := core mcjit native
 
 include $(LEVEL)/Makefile.common

diff --git a/examples/Kaleidoscope/Chapter5/toy.cpp b/examples/Kaleidoscope/Chapter5/toy.cpp
index a31b5b4..4929a20 100644
--- a/examples/Kaleidoscope/Chapter5/toy.cpp
+++ b/examples/Kaleidoscope/Chapter5/toy.cpp

@@ -1,6 +1,5 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
@@ -817,11 +816,13 @@
   getNextToken();
 
   // Make the module, which holds all the code.
-  TheModule = new Module("my cool jit", Context);
+  std::unique_ptr<Module> Owner = make_unique<Module>("my cool jit", Context);
+  TheModule = Owner.get();
 
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
-  TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&ErrStr).create();
+  TheExecutionEngine =
+      EngineBuilder(std::move(Owner)).setErrorStr(&ErrStr).create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
@@ -832,7 +833,7 @@
   // Set up the optimizer pipeline.  Start with registering info about how the
   // target lays out data structures.
   TheModule->setDataLayout(TheExecutionEngine->getDataLayout());
-  OurFPM.add(new DataLayoutPass(TheModule));
+  OurFPM.add(new DataLayoutPass());
   // Provide basic AliasAnalysis support for GVN.
   OurFPM.add(createBasicAliasAnalysisPass());
   // Do simple "peephole" optimizations and bit-twiddling optzns.

diff --git a/examples/Kaleidoscope/Chapter6/CMakeLists.txt b/examples/Kaleidoscope/Chapter6/CMakeLists.txt
index 669c7eb..d36f030 100644
--- a/examples/Kaleidoscope/Chapter6/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter6/CMakeLists.txt

@@ -3,7 +3,7 @@
   Core
   ExecutionEngine
   InstCombine
-  JIT
+  MC
   ScalarOpts
   Support
   nativecodegen

diff --git a/examples/Kaleidoscope/Chapter6/Makefile b/examples/Kaleidoscope/Chapter6/Makefile
index a5fbcbd..8f47ea0 100644
--- a/examples/Kaleidoscope/Chapter6/Makefile
+++ b/examples/Kaleidoscope/Chapter6/Makefile

@@ -10,6 +10,6 @@
 TOOLNAME = Kaleidoscope-Ch6
 EXAMPLE_TOOL = 1
 
-LINK_COMPONENTS := core jit native
+LINK_COMPONENTS := core mcjit native
 
 include $(LEVEL)/Makefile.common

diff --git a/examples/Kaleidoscope/Chapter6/toy.cpp b/examples/Kaleidoscope/Chapter6/toy.cpp
index 5a3bd2e..06da9ac 100644
--- a/examples/Kaleidoscope/Chapter6/toy.cpp
+++ b/examples/Kaleidoscope/Chapter6/toy.cpp

@@ -1,6 +1,5 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
@@ -935,11 +934,13 @@
   getNextToken();
 
   // Make the module, which holds all the code.
-  TheModule = new Module("my cool jit", Context);
+  std::unique_ptr<Module> Owner = make_unique<Module>("my cool jit", Context);
+  TheModule = Owner.get();
 
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
-  TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&ErrStr).create();
+  TheExecutionEngine =
+      EngineBuilder(std::move(Owner)).setErrorStr(&ErrStr).create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
@@ -950,7 +951,7 @@
   // Set up the optimizer pipeline.  Start with registering info about how the
   // target lays out data structures.
   TheModule->setDataLayout(TheExecutionEngine->getDataLayout());
-  OurFPM.add(new DataLayoutPass(TheModule));
+  OurFPM.add(new DataLayoutPass());
   // Provide basic AliasAnalysis support for GVN.
   OurFPM.add(createBasicAliasAnalysisPass());
   // Do simple "peephole" optimizations and bit-twiddling optzns.

diff --git a/examples/Kaleidoscope/Chapter7/CMakeLists.txt b/examples/Kaleidoscope/Chapter7/CMakeLists.txt
index 0a0c8e7..bdc0e55 100644
--- a/examples/Kaleidoscope/Chapter7/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter7/CMakeLists.txt

@@ -3,7 +3,7 @@
   Core
   ExecutionEngine
   InstCombine
-  JIT
+  MC
   ScalarOpts
   Support
   TransformUtils

diff --git a/examples/Kaleidoscope/Chapter7/Makefile b/examples/Kaleidoscope/Chapter7/Makefile
index 6cec323..7abeb3e 100644
--- a/examples/Kaleidoscope/Chapter7/Makefile
+++ b/examples/Kaleidoscope/Chapter7/Makefile

@@ -11,6 +11,6 @@
 EXAMPLE_TOOL = 1
 REQUIRES_RTTI := 1
 
-LINK_COMPONENTS := core jit native
+LINK_COMPONENTS := core mcjit native
 
 include $(LEVEL)/Makefile.common

diff --git a/examples/Kaleidoscope/Chapter7/toy.cpp b/examples/Kaleidoscope/Chapter7/toy.cpp
index c2c337c..56a6fa9 100644
--- a/examples/Kaleidoscope/Chapter7/toy.cpp
+++ b/examples/Kaleidoscope/Chapter7/toy.cpp

@@ -1,6 +1,5 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
@@ -1099,11 +1098,13 @@
   getNextToken();
 
   // Make the module, which holds all the code.
-  TheModule = new Module("my cool jit", Context);
+  std::unique_ptr<Module> Owner = make_unique<Module>("my cool jit", Context);
+  TheModule = Owner.get();
 
   // Create the JIT.  This takes ownership of the module.
   std::string ErrStr;
-  TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&ErrStr).create();
+  TheExecutionEngine =
+      EngineBuilder(std::move(Owner)).setErrorStr(&ErrStr).create();
   if (!TheExecutionEngine) {
     fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
     exit(1);
@@ -1114,7 +1115,7 @@
   // Set up the optimizer pipeline.  Start with registering info about how the
   // target lays out data structures.
   TheModule->setDataLayout(TheExecutionEngine->getDataLayout());
-  OurFPM.add(new DataLayoutPass(TheModule));
+  OurFPM.add(new DataLayoutPass());
   // Provide basic AliasAnalysis support for GVN.
   OurFPM.add(createBasicAliasAnalysisPass());
   // Promote allocas to registers.

diff --git a/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp b/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp
index 9466360..00f5b83 100644
--- a/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp
+++ b/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp

@@ -2,7 +2,6 @@
 
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"

diff --git a/examples/Kaleidoscope/MCJIT/cached/toy.cpp b/examples/Kaleidoscope/MCJIT/cached/toy.cpp
index 16c548c..af51b4a 100644
--- a/examples/Kaleidoscope/MCJIT/cached/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/cached/toy.cpp

@@ -897,7 +897,6 @@
   std::string ErrStr;
   ExecutionEngine *NewEngine = EngineBuilder(M)
                                             .setErrorStr(&ErrStr)
-                                            .setUseMCJIT(true)
                                             .setMCJITMemoryManager(new HelpingMemoryManager(this))
                                             .create();
   if (!NewEngine) {

diff --git a/examples/Kaleidoscope/MCJIT/complete/toy.cpp b/examples/Kaleidoscope/MCJIT/complete/toy.cpp
index 10e7ada..3beb0d8 100644
--- a/examples/Kaleidoscope/MCJIT/complete/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/complete/toy.cpp

@@ -1,6 +1,5 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/ExecutionEngine/ObjectCache.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
@@ -52,10 +51,6 @@
                   cl::desc("Dump IR from modules to stderr on shutdown"),
                   cl::init(false));
 
-  cl::opt<bool> UseMCJIT(
-    "use-mcjit", cl::desc("Use the MCJIT execution engine"),
-    cl::init(true));
-
   cl::opt<bool> EnableLazyCompilation(
     "enable-lazy-compilation", cl::desc("Enable lazy compilation when using the MCJIT engine"),
     cl::init(true));
@@ -793,96 +788,6 @@
 };
 
 //===----------------------------------------------------------------------===//
-// Helper class for JIT execution engine
-//===----------------------------------------------------------------------===//
-
-class JITHelper : public BaseHelper {
-public:
-  JITHelper(LLVMContext &Context) {
-    // Make the module, which holds all the code.
-    if (!InputIR.empty()) {
-      TheModule = parseInputIR(InputIR, Context);
-    } else {
-      TheModule = new Module("my cool jit", Context);
-    }
-
-    // Create the JIT.  This takes ownership of the module.
-    std::string ErrStr;
-    TheExecutionEngine = EngineBuilder(TheModule).setErrorStr(&ErrStr).create();
-    if (!TheExecutionEngine) {
-      fprintf(stderr, "Could not create ExecutionEngine: %s\n", ErrStr.c_str());
-      exit(1);
-    }
-
-    TheFPM = new FunctionPassManager(TheModule);
-
-    // Set up the optimizer pipeline.  Start with registering info about how the
-    // target lays out data structures.
-    TheFPM->add(new DataLayout(*TheExecutionEngine->getDataLayout()));
-    // Provide basic AliasAnalysis support for GVN.
-    TheFPM->add(createBasicAliasAnalysisPass());
-    // Promote allocas to registers.
-    TheFPM->add(createPromoteMemoryToRegisterPass());
-    // Do simple "peephole" optimizations and bit-twiddling optzns.
-    TheFPM->add(createInstructionCombiningPass());
-    // Reassociate expressions.
-    TheFPM->add(createReassociatePass());
-    // Eliminate Common SubExpressions.
-    TheFPM->add(createGVNPass());
-    // Simplify the control flow graph (deleting unreachable blocks, etc).
-    TheFPM->add(createCFGSimplificationPass());
-
-    TheFPM->doInitialization();
-  }
-
-  virtual ~JITHelper() {
-    if (TheFPM)
-      delete TheFPM;
-    if (TheExecutionEngine)
-      delete TheExecutionEngine;
-  }
-
-  virtual Function *getFunction(const std::string FnName) {
-    assert(TheModule);
-    return TheModule->getFunction(FnName);
-  }
-
-  virtual Module *getModuleForNewFunction() {
-    assert(TheModule);
-    return TheModule;
-  }
-
-  virtual void *getPointerToFunction(Function* F) {
-    assert(TheExecutionEngine);
-    return TheExecutionEngine->getPointerToFunction(F);
-  }
-
-  virtual void *getPointerToNamedFunction(const std::string &Name) {
-    return TheExecutionEngine->getPointerToNamedFunction(Name);
-  }
-
-  virtual void runFPM(Function &F) {
-    assert(TheFPM);
-    TheFPM->run(F);
-  }
-
-  virtual void closeCurrentModule() {
-    // This should never be called for JIT
-    assert(false);
-  }
-
-  virtual void dump() {
-    assert(TheModule);
-    TheModule->dump();
-  }
-
-private:
-  Module *TheModule;
-  ExecutionEngine *TheExecutionEngine;
-  FunctionPassManager *TheFPM;
-};
-
-//===----------------------------------------------------------------------===//
 // MCJIT helper class
 //===----------------------------------------------------------------------===//
 
@@ -1034,7 +939,6 @@
   std::string ErrStr;
   ExecutionEngine *EE = EngineBuilder(M)
                             .setErrorStr(&ErrStr)
-                            .setUseMCJIT(true)
                             .setMCJITMemoryManager(new HelpingMemoryManager(this))
                             .create();
   if (!EE) {
@@ -1194,10 +1098,8 @@
   Value *OperandV = Operand->Codegen();
   if (OperandV == 0) return 0;
   Function *F;
-  if (UseMCJIT)
-    F = TheHelper->getFunction(MakeLegalFunctionName(std::string("unary")+Opcode));
-  else
-    F = TheHelper->getFunction(std::string("unary")+Opcode);
+  F = TheHelper->getFunction(
+      MakeLegalFunctionName(std::string("unary") + Opcode));
   if (F == 0)
     return ErrorV("Unknown unary operator");
 
@@ -1246,10 +1148,7 @@
   // If it wasn't a builtin binary operator, it must be a user defined one. Emit
   // a call to it.
   Function *F;
-  if (UseMCJIT)
-    F = TheHelper->getFunction(MakeLegalFunctionName(std::string("binary")+Op));
-  else
-    F = TheHelper->getFunction(std::string("binary")+Op);
+  F = TheHelper->getFunction(MakeLegalFunctionName(std::string("binary")+Op));
   assert(F && "binary operator not found!");
 
   Value *Ops[] = { L, R };
@@ -1482,10 +1381,7 @@
                                        Doubles, false);
 
   std::string FnName;
-  if (UseMCJIT)
-    FnName = MakeLegalFunctionName(Name);
-  else
-    FnName = Name;
+  FnName = MakeLegalFunctionName(Name);
 
   Module* M = TheHelper->getModuleForNewFunction();
   Function *F = Function::Create(FT, Function::ExternalLinkage, FnName, M);
@@ -1560,10 +1456,6 @@
     // Validate the generated code, checking for consistency.
     verifyFunction(*TheFunction);
 
-    // Optimize the function.
-    if (!UseMCJIT)
-      TheHelper->runFPM(*TheFunction);
-
     return TheFunction;
   }
 
@@ -1581,7 +1473,7 @@
 
 static void HandleDefinition() {
   if (FunctionAST *F = ParseDefinition()) {
-    if (UseMCJIT && EnableLazyCompilation)
+    if (EnableLazyCompilation)
       TheHelper->closeCurrentModule();
     Function *LF = F->Codegen();
     if (LF && VerboseOutput) {
@@ -1671,10 +1563,8 @@
 
 int main(int argc, char **argv) {
   InitializeNativeTarget();
-  if (UseMCJIT) {
-    InitializeNativeTargetAsmPrinter();
-    InitializeNativeTargetAsmParser();
-  }
+  InitializeNativeTargetAsmPrinter();
+  InitializeNativeTargetAsmParser();
   LLVMContext &Context = getGlobalContext();
 
   cl::ParseCommandLineOptions(argc, argv,
@@ -1690,10 +1580,7 @@
   BinopPrecedence['*'] = 40;  // highest.
 
   // Make the Helper, which holds all the code.
-  if (UseMCJIT)
-    TheHelper = new MCJITHelper(Context);
-  else
-    TheHelper = new JITHelper(Context);
+  TheHelper = new MCJITHelper(Context);
 
   // Prime the first token.
   if (!SuppressPrompts)

diff --git a/examples/Kaleidoscope/MCJIT/initial/toy.cpp b/examples/Kaleidoscope/MCJIT/initial/toy.cpp
index 4c47113..2c1b297 100644
--- a/examples/Kaleidoscope/MCJIT/initial/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/initial/toy.cpp

@@ -778,7 +778,6 @@
     std::string ErrStr;
     ExecutionEngine *NewEngine = EngineBuilder(OpenModule)
                                               .setErrorStr(&ErrStr)
-                                              .setUseMCJIT(true)
                                               .setMCJITMemoryManager(new HelpingMemoryManager(this))
                                               .create();
     if (!NewEngine) {

diff --git a/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp b/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp
index 2d540dd..98c1001 100644
--- a/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp
+++ b/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp

@@ -2,7 +2,6 @@
 
 #include "llvm/Analysis/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"

diff --git a/examples/Kaleidoscope/MCJIT/lazy/toy.cpp b/examples/Kaleidoscope/MCJIT/lazy/toy.cpp
index ff88e23..9c2a0d4 100644
--- a/examples/Kaleidoscope/MCJIT/lazy/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/lazy/toy.cpp

@@ -808,7 +808,6 @@
   std::string ErrStr;
   ExecutionEngine *NewEngine = EngineBuilder(M)
                                             .setErrorStr(&ErrStr)
-                                            .setUseMCJIT(true)
                                             .setMCJITMemoryManager(new HelpingMemoryManager(this))
                                             .create();
   if (!NewEngine) {

diff --git a/examples/ParallelJIT/CMakeLists.txt b/examples/ParallelJIT/CMakeLists.txt
index 8673917..07c0a08 100644
--- a/examples/ParallelJIT/CMakeLists.txt
+++ b/examples/ParallelJIT/CMakeLists.txt

@@ -2,7 +2,7 @@
   Core
   ExecutionEngine
   Interpreter
-  JIT
+  MC
   Support
   nativecodegen
   )

diff --git a/examples/ParallelJIT/Makefile b/examples/ParallelJIT/Makefile
index 8a49d42..0f2a357 100644
--- a/examples/ParallelJIT/Makefile
+++ b/examples/ParallelJIT/Makefile

@@ -10,7 +10,7 @@
 TOOLNAME = ParallelJIT
 EXAMPLE_TOOL = 1
 
-LINK_COMPONENTS := jit interpreter nativecodegen
+LINK_COMPONENTS := mcjit interpreter nativecodegen
 
 include $(LEVEL)/Makefile.common
 

diff --git a/examples/ParallelJIT/ParallelJIT.cpp b/examples/ParallelJIT/ParallelJIT.cpp
index 2aa63d9..4ebf3d0 100644
--- a/examples/ParallelJIT/ParallelJIT.cpp
+++ b/examples/ParallelJIT/ParallelJIT.cpp

@@ -19,7 +19,6 @@
 
 #include "llvm/ExecutionEngine/GenericValue.h"
 #include "llvm/ExecutionEngine/Interpreter.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
@@ -243,13 +242,14 @@
   LLVMContext Context;
 
   // Create some module to put our function into it.
-  Module *M = new Module("test", Context);
+  std::unique_ptr<Module> Owner = make_unique<Module>("test", Context);
+  Module *M = Owner.get();
 
   Function* add1F = createAdd1( M );
   Function* fibF = CreateFibFunction( M );
 
   // Now we create the JIT.
-  ExecutionEngine* EE = EngineBuilder(M).create();
+  ExecutionEngine* EE = EngineBuilder(std::move(Owner)).create();
 
   //~ std::cout << "We just constructed this LLVM module:\n\n" << *M;
   //~ std::cout << "\n\nRunning foo: " << std::flush;

diff --git a/host/include/llvm/Config/config.h b/host/include/llvm/Config/config.h
index d686e16..cb3d4a3 100644
--- a/host/include/llvm/Config/config.h
+++ b/host/include/llvm/Config/config.h

@@ -225,7 +225,11 @@
 #define HAVE_LINK_EXPORT_DYNAMIC 1
 
 /* Define to 1 if you have the <link.h> header file. */
+#if defined(__APPLE__)
+/* #undef HAVE_LINK_H */
+#else
 #define HAVE_LINK_H 1
+#endif
 
 /* Define if you can use -Wl,-R. to pass -R. to the linker, in order to add
    the current directory to the dynamic linker search path. */
@@ -652,7 +656,7 @@
 #define LLVM_VERSION_MAJOR 3
 
 /* Minor version of the LLVM API */
-#define LLVM_VERSION_MINOR 5
+#define LLVM_VERSION_MINOR 6
 
 /* Define if the OS needs help to load dependent libraries for dlopen(). */
 /* #undef LTDL_DLOPEN_DEPLIBS */
@@ -685,13 +689,13 @@
 #define PACKAGE_NAME "LLVM"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "LLVM 3.5.svn"
+#define PACKAGE_STRING "LLVM 3.6.svn"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "llvm"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "3.5"
+#define PACKAGE_VERSION "3.6"
 
 /* Define as the return type of signal handlers (`int' or `void'). */
 #define RETSIGTYPE void

diff --git a/host/include/llvm/Config/llvm-config.h b/host/include/llvm/Config/llvm-config.h
index 95f85fe..6cb218e 100644
--- a/host/include/llvm/Config/llvm-config.h
+++ b/host/include/llvm/Config/llvm-config.h

@@ -95,7 +95,7 @@
 #define LLVM_VERSION_MAJOR 3
 
 /* Minor version of the LLVM API */
-#define LLVM_VERSION_MINOR 5
+#define LLVM_VERSION_MINOR 6
 
 #include "llvm/Config/llvm-platform-config.h"
 

diff --git a/include/llvm-c/BitReader.h b/include/llvm-c/BitReader.h
index 7af209b..f3b388b 100644
--- a/include/llvm-c/BitReader.h
+++ b/include/llvm-c/BitReader.h

@@ -16,8 +16,8 @@
 |*                                                                            *|
 \*===----------------------------------------------------------------------===*/
 
-#ifndef LLVM_C_BITCODEREADER_H
-#define LLVM_C_BITCODEREADER_H
+#ifndef LLVM_C_BITREADER_H
+#define LLVM_C_BITREADER_H
 
 #include "llvm-c/Core.h"
 

diff --git a/include/llvm-c/BitWriter.h b/include/llvm-c/BitWriter.h
index f605e24..f25ad3a 100644
--- a/include/llvm-c/BitWriter.h
+++ b/include/llvm-c/BitWriter.h

@@ -16,8 +16,8 @@
 |*                                                                            *|
 \*===----------------------------------------------------------------------===*/
 
-#ifndef LLVM_C_BITCODEWRITER_H
-#define LLVM_C_BITCODEWRITER_H
+#ifndef LLVM_C_BITWRITER_H
+#define LLVM_C_BITWRITER_H
 
 #include "llvm-c/Core.h"
 
@@ -45,6 +45,9 @@
     descriptor. Returns 0 on success. Closes the Handle. */
 int LLVMWriteBitcodeToFileHandle(LLVMModuleRef M, int Handle);
 
+/** Writes a module to a new memory buffer and returns it. */
+LLVMMemoryBufferRef LLVMWriteBitcodeToMemoryBuffer(LLVMModuleRef M);
+
 /**
  * @}
  */

diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 8693a30..30c7595 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h

@@ -168,6 +168,7 @@
     LLVMInAllocaAttribute = 1ULL << 36,
     LLVMNonNullAttribute = 1ULL << 37,
     LLVMJumpTableAttribute = 1ULL << 38,
+    LLVMDereferenceableAttribute = 1ULL << 39,
     */
 } LLVMAttribute;
 
@@ -559,6 +560,10 @@
  */
 LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID,
                                                 LLVMContextRef C);
+/**
+ * Return an exact copy of the specified module.
+ */
+LLVMModuleRef LLVMCloneModule(LLVMModuleRef M);
 
 /**
  * Destroy a module instance.
@@ -1377,6 +1382,13 @@
 LLVMValueRef LLVMGetOperand(LLVMValueRef Val, unsigned Index);
 
 /**
+ * Obtain the use of an operand at a specific index in a llvm::User value.
+ *
+ * @see llvm::User::getOperandUse()
+ */
+LLVMUseRef LLVMGetOperandUse(LLVMValueRef Val, unsigned Index);
+
+/**
  * Set an operand at a specific index in a llvm::User value.
  *
  * @see llvm::User::setOperand()
@@ -1537,6 +1549,14 @@
 long long LLVMConstIntGetSExtValue(LLVMValueRef ConstantVal);
 
 /**
+ * Obtain the double value for a floating point constant value.
+ * losesInfo indicates if some precision was lost in the conversion.
+ *
+ * @see llvm::ConstantFP::getDoubleValue
+ */
+double LLVMConstRealGetDouble(LLVMValueRef ConstantVal, LLVMBool *losesInfo);
+
+/**
  * @}
  */
 
@@ -1569,6 +1589,20 @@
                              LLVMBool DontNullTerminate);
 
 /**
+ * Returns true if the specified constant is an array of i8.
+ *
+ * @see ConstantDataSequential::isString()
+ */
+LLVMBool LLVMIsConstantString(LLVMValueRef c);
+
+/**
+ * Get the given constant data sequential as a string.
+ *
+ * @see ConstantDataSequential::getAsString()
+ */
+const char *LLVMGetAsString(LLVMValueRef c, size_t* out);
+
+/**
  * Create an anonymous ConstantStruct with the specified values.
  *
  * @see llvm::ConstantStruct::getAnon()
@@ -1606,6 +1640,13 @@
                                   unsigned Count);
 
 /**
+ * Get an element at specified index as a constant.
+ *
+ * @see ConstantDataSequential::getElementAsConstant()
+ */
+LLVMValueRef LLVMGetElementAsConstant(LLVMValueRef c, unsigned idx);
+
+/**
  * Create a ConstantVector from values.
  *
  * @see llvm::ConstantVector::get()
@@ -2376,6 +2417,26 @@
 LLVMIntPredicate LLVMGetICmpPredicate(LLVMValueRef Inst);
 
 /**
+ * Obtain the float predicate of an instruction.
+ *
+ * This is only valid for instructions that correspond to llvm::FCmpInst
+ * or llvm::ConstantExpr whose opcode is llvm::Instruction::FCmp.
+ *
+ * @see llvm::FCmpInst::getPredicate()
+ */
+LLVMRealPredicate LLVMGetFCmpPredicate(LLVMValueRef Inst);
+
+/**
+ * Create a copy of 'this' instruction that is identical in all ways
+ * except the following:
+ *   * The instruction has no parent
+ *   * The instruction has no name
+ *
+ * @see llvm::Instruction::clone()
+ */
+LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst);
+
+/**
  * @defgroup LLVMCCoreValueInstructionCall Call Sites and Invocations
  *
  * Functions in this group apply to instructions that refer to call
@@ -2436,6 +2497,63 @@
  */
 
 /**
+ * @defgroup LLVMCCoreValueInstructionTerminator Terminators
+ *
+ * Functions in this group only apply to instructions that map to
+ * llvm::TerminatorInst instances.
+ *
+ * @{
+ */
+
+/**
+ * Return the number of successors that this terminator has.
+ *
+ * @see llvm::TerminatorInst::getNumSuccessors
+ */
+unsigned LLVMGetNumSuccessors(LLVMValueRef Term);
+
+/**
+ * Return the specified successor.
+ *
+ * @see llvm::TerminatorInst::getSuccessor
+ */
+LLVMBasicBlockRef LLVMGetSuccessor(LLVMValueRef Term, unsigned i);
+
+/**
+ * Update the specified successor to point at the provided block.
+ *
+ * @see llvm::TerminatorInst::setSuccessor
+ */
+void LLVMSetSuccessor(LLVMValueRef Term, unsigned i, LLVMBasicBlockRef block);
+
+/**
+ * Return whether a branch is conditional.
+ *
+ * This only works on llvm::BranchInst instructions.
+ *
+ * @see llvm::BranchInst::isConditional
+ */
+LLVMBool LLVMIsConditional(LLVMValueRef Branch);
+
+/**
+ * Return the condition of a branch instruction.
+ *
+ * This only works on llvm::BranchInst instructions.
+ *
+ * @see llvm::BranchInst::getCondition
+ */
+LLVMValueRef LLVMGetCondition(LLVMValueRef Branch);
+
+/**
+ * Set the condition of a branch instruction.
+ *
+ * This only works on llvm::BranchInst instructions.
+ *
+ * @see llvm::BranchInst::setCondition
+ */
+void LLVMSetCondition(LLVMValueRef Branch, LLVMValueRef Cond);
+
+/**
  * Obtain the default destination basic block of a switch instruction.
  *
  * This only works on llvm::SwitchInst instructions.
@@ -2445,6 +2563,10 @@
 LLVMBasicBlockRef LLVMGetSwitchDefaultDest(LLVMValueRef SwitchInstr);
 
 /**
+ * @}
+ */
+
+/**
  * @defgroup LLVMCCoreValueInstructionPHINode PHI Nodes
  *
  * Functions in this group only apply to instructions that map to

diff --git a/include/llvm-c/Disassembler.h b/include/llvm-c/Disassembler.h
index 8f31150..d6cbe31 100644
--- a/include/llvm-c/Disassembler.h
+++ b/include/llvm-c/Disassembler.h

@@ -174,8 +174,8 @@
  * by passing a block of information in the DisInfo parameter and specifying the
  * TagType and callback functions as described above.  These can all be passed
  * as NULL.  If successful, this returns a disassembler context.  If not, it
- * returns NULL. This function is equivalent to calling LLVMCreateDisasmCPU()
- * with an empty CPU name.
+ * returns NULL. This function is equivalent to calling
+ * LLVMCreateDisasmCPUFeatures() with an empty CPU name and feature set.
  */
 LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo,
                                       int TagType, LLVMOpInfoCallback GetOpInfo,
@@ -186,7 +186,8 @@
  * disassembly is supported by passing a block of information in the DisInfo
  * parameter and specifying the TagType and callback functions as described
  * above.  These can all be passed * as NULL.  If successful, this returns a
- * disassembler context.  If not, it returns NULL.
+ * disassembler context.  If not, it returns NULL. This function is equivalent
+ * to calling LLVMCreateDisasmCPUFeatures() with an empty feature set.
  */
 LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
                                          void *DisInfo, int TagType,
@@ -194,6 +195,19 @@
                                          LLVMSymbolLookupCallback SymbolLookUp);
 
 /**
+ * Create a disassembler for the TripleName, a specific CPU and specific feature
+ * string.  Symbolic disassembly is supported by passing a block of information
+ * in the DisInfo parameter and specifying the TagType and callback functions as
+ * described above.  These can all be passed as NULL.  If successful, this
+ * returns a disassembler context.  If not, it returns NULL.
+ */
+LLVMDisasmContextRef
+LLVMCreateDisasmCPUFeatures(const char *Triple, const char *CPU,
+                            const char *Features, void *DisInfo, int TagType,
+                            LLVMOpInfoCallback GetOpInfo,
+                            LLVMSymbolLookupCallback SymbolLookUp);
+
+/**
  * Set the disassembler's options.  Returns 1 if it can set the Options and 0
  * otherwise.
  */

diff --git a/include/llvm-c/ExecutionEngine.h b/include/llvm-c/ExecutionEngine.h
index 7cdf0d7..f1f4cad 100644
--- a/include/llvm-c/ExecutionEngine.h
+++ b/include/llvm-c/ExecutionEngine.h

@@ -34,7 +34,6 @@
  * @{
  */
 
-void LLVMLinkInJIT(void);
 void LLVMLinkInMCJIT(void);
 void LLVMLinkInInterpreter(void);
 

diff --git a/include/llvm-c/Initialization.h b/include/llvm-c/Initialization.h
index ada4738..44194f8 100644
--- a/include/llvm-c/Initialization.h
+++ b/include/llvm-c/Initialization.h

@@ -13,8 +13,8 @@
 |*                                                                            *|
 \*===----------------------------------------------------------------------===*/
 
-#ifndef LLVM_C_INITIALIZEPASSES_H
-#define LLVM_C_INITIALIZEPASSES_H
+#ifndef LLVM_C_INITIALIZATION_H
+#define LLVM_C_INITIALIZATION_H
 
 #include "llvm-c/Core.h"
 

diff --git a/include/llvm-c/Support.h b/include/llvm-c/Support.h
index 4e6ff22..a9216d0 100644
--- a/include/llvm-c/Support.h
+++ b/include/llvm-c/Support.h

@@ -47,6 +47,17 @@
   */
 LLVMBool LLVMLoadLibraryPermanently(const char* Filename);
 
+/**
+ * This function parses the given arguments using the LLVM command line parser.
+ * Note that the only stable thing about this function is its signature; you
+ * cannot rely on any particular set of command line arguments being interpreted
+ * the same way across LLVM versions.
+ *
+ * @see llvm::cl::ParseCommandLineOptions()
+ */
+void LLVMParseCommandLineOptions(int argc, const char *const *argv,
+                                 const char *Overview);
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/include/llvm-c/Transforms/Scalar.h b/include/llvm-c/Transforms/Scalar.h
index 9b820b2..7ad1ad1 100644
--- a/include/llvm-c/Transforms/Scalar.h
+++ b/include/llvm-c/Transforms/Scalar.h

@@ -35,6 +35,9 @@
 /** See llvm::createAggressiveDCEPass function. */
 void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM);
 
+/** See llvm::createAlignmentFromAssumptionsPass function. */
+void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM);
+
 /** See llvm::createCFGSimplificationPass function. */
 void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM);
 
@@ -44,6 +47,9 @@
 /** See llvm::createScalarizerPass function. */
 void LLVMAddScalarizerPass(LLVMPassManagerRef PM);
 
+/** See llvm::createMergedLoadStoreMotionPass function. */
+void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM);
+
 /** See llvm::createGVNPass function. */
 void LLVMAddGVNPass(LLVMPassManagerRef PM);
 
@@ -83,6 +89,9 @@
 /** See llvm::createPartiallyInlineLibCallsPass function. */
 void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM);
 
+/** See llvm::createLowerSwitchPass function. */
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM);
+
 /** See llvm::createPromoteMemoryToRegisterPass function. */
 void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM);
 
@@ -129,6 +138,9 @@
 /** See llvm::createTypeBasedAliasAnalysisPass function */
 void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM);
 
+/** See llvm::createScopedNoAliasAAPass function */
+void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM);
+
 /** See llvm::createBasicAliasAnalysisPass function */
 void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM);
 

diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h
index 51079896..3f30d6d 100644
--- a/include/llvm-c/lto.h
+++ b/include/llvm-c/lto.h

@@ -178,6 +178,35 @@
                                         const char *path);
 
 /**
+ * \brief Loads an object file in its own context.
+ *
+ * Loads an object file in its own LLVMContext.  This function call is
+ * thread-safe.  However, modules created this way should not be merged into an
+ * lto_code_gen_t using \a lto_codegen_add_module().
+ *
+ * Returns NULL on error (check lto_get_error_message() for details).
+ *
+ * \since LTO_API_VERSION=11
+ */
+extern lto_module_t
+lto_module_create_in_local_context(const void *mem, size_t length,
+                                   const char *path);
+
+/**
+ * \brief Loads an object file in the codegen context.
+ *
+ * Loads an object file into the same context as \c cg.  The module is safe to
+ * add using \a lto_codegen_add_module().
+ *
+ * Returns NULL on error (check lto_get_error_message() for details).
+ *
+ * \since LTO_API_VERSION=11
+ */
+extern lto_module_t
+lto_module_create_in_codegen_context(const void *mem, size_t length,
+                                     const char *path, lto_code_gen_t cg);
+
+/**
  * Loads an object file from disk. The seek point of fd is not preserved.
  * Returns NULL on error (check lto_get_error_message() for details).
  *
@@ -324,12 +353,27 @@
  * Instantiates a code generator.
  * Returns NULL on error (check lto_get_error_message() for details).
  *
+ * All modules added using \a lto_codegen_add_module() must have been created
+ * in the same context as the codegen.
+ *
  * \since prior to LTO_API_VERSION=3
  */
 extern lto_code_gen_t
 lto_codegen_create(void);
 
 /**
+ * \brief Instantiate a code generator in its own context.
+ *
+ * Instantiates a code generator in its own context.  Modules added via \a
+ * lto_codegen_add_module() must have all been created in the same context,
+ * using \a lto_module_create_in_codegen_context().
+ *
+ * \since LTO_API_VERSION=11
+ */
+extern lto_code_gen_t
+lto_codegen_create_in_local_context(void);
+
+/**
  * Frees all code generator and all memory it internally allocated.
  * Upon return the lto_code_gen_t is no longer valid.
  *
@@ -342,6 +386,10 @@
  * Add an object module to the set of modules for which code will be generated.
  * Returns true on error (check lto_get_error_message() for details).
  *
+ * \c cg and \c mod must both be in the same context.  See \a
+ * lto_codegen_create_in_local_context() and \a
+ * lto_module_create_in_codegen_context().
+ *
  * \since prior to LTO_API_VERSION=3
  */
 extern lto_bool_t
@@ -375,14 +423,6 @@
 extern void
 lto_codegen_set_cpu(lto_code_gen_t cg, const char *cpu);
 
-/**
- * Sets attributes for the cpu to generate code for.
- *
- * \since LTO_API_VERSION=11
- */
-extern void
-lto_codegen_set_attr(lto_code_gen_t cg, const char *attr);
-
 
 /**
  * Sets the location of the assembler tool to run. If not set, libLTO

diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h
index 50f1463..26aae77 100644
--- a/include/llvm/ADT/APFloat.h
+++ b/include/llvm/ADT/APFloat.h

@@ -304,6 +304,38 @@
   /// IEEE-754R 5.3.1: nextUp/nextDown.
   opStatus next(bool nextDown);
 
+  /// \brief Operator+ overload which provides the default
+  /// \c rmNearestTiesToEven rounding mode and *no* error checking.
+  APFloat operator+(const APFloat &RHS) const {
+    APFloat Result = *this;
+    Result.add(RHS, rmNearestTiesToEven);
+    return Result;
+  }
+
+  /// \brief Operator- overload which provides the default
+  /// \c rmNearestTiesToEven rounding mode and *no* error checking.
+  APFloat operator-(const APFloat &RHS) const {
+    APFloat Result = *this;
+    Result.subtract(RHS, rmNearestTiesToEven);
+    return Result;
+  }
+
+  /// \brief Operator* overload which provides the default
+  /// \c rmNearestTiesToEven rounding mode and *no* error checking.
+  APFloat operator*(const APFloat &RHS) const {
+    APFloat Result = *this;
+    Result.multiply(RHS, rmNearestTiesToEven);
+    return Result;
+  }
+
+  /// \brief Operator/ overload which provides the default
+  /// \c rmNearestTiesToEven rounding mode and *no* error checking.
+  APFloat operator/(const APFloat &RHS) const {
+    APFloat Result = *this;
+    Result.divide(RHS, rmNearestTiesToEven);
+    return Result;
+  }
+
   /// @}
 
   /// \name Sign operations.
@@ -313,6 +345,13 @@
   void clearSign();
   void copySign(const APFloat &);
 
+  /// \brief A static helper to produce a copy of an APFloat value with its sign
+  /// copied from some other APFloat.
+  static APFloat copySign(APFloat Value, const APFloat &Sign) {
+    Value.copySign(Sign);
+    return std::move(Value);
+  }
+
   /// @}
 
   /// \name Conversions
@@ -452,6 +491,36 @@
   /// return true.
   bool getExactInverse(APFloat *inv) const;
 
+  /// \brief Enumeration of \c ilogb error results.
+  enum IlogbErrorKinds {
+    IEK_Zero = INT_MIN+1,
+    IEK_NaN = INT_MIN,
+    IEK_Inf = INT_MAX
+  };
+
+  /// \brief Returns the exponent of the internal representation of the APFloat.
+  ///
+  /// Because the radix of APFloat is 2, this is equivalent to floor(log2(x)).
+  /// For special APFloat values, this returns special error codes:
+  ///
+  ///   NaN -> \c IEK_NaN
+  ///   0   -> \c IEK_Zero
+  ///   Inf -> \c IEK_Inf
+  ///
+  friend int ilogb(const APFloat &Arg) {
+    if (Arg.isNaN())
+      return IEK_NaN;
+    if (Arg.isZero())
+      return IEK_Zero;
+    if (Arg.isInfinity())
+      return IEK_Inf;
+
+    return Arg.exponent;
+  }
+
+  /// \brief Returns: X * 2^Exp for integral exponents.
+  friend APFloat scalbn(APFloat X, int Exp);
+
 private:
 
   /// \name Simple Queries
@@ -573,11 +642,41 @@
   unsigned int sign : 1;
 };
 
-/// See friend declaration above.
+/// See friend declarations above.
 ///
-/// This additional declaration is required in order to compile LLVM with IBM
+/// These additional declarations are required in order to compile LLVM with IBM
 /// xlC compiler.
 hash_code hash_value(const APFloat &Arg);
+APFloat scalbn(APFloat X, int Exp);
+
+/// \brief Returns the absolute value of the argument.
+inline APFloat abs(APFloat X) {
+  X.clearSign();
+  return X;
+}
+
+/// Implements IEEE minNum semantics. Returns the smaller of the 2 arguments if
+/// both are not NaN. If either argument is a NaN, returns the other argument.
+LLVM_READONLY
+inline APFloat minnum(const APFloat &A, const APFloat &B) {
+  if (A.isNaN())
+    return B;
+  if (B.isNaN())
+    return A;
+  return (B.compare(A) == APFloat::cmpLessThan) ? B : A;
+}
+
+/// Implements IEEE maxNum semantics. Returns the larger of the 2 arguments if
+/// both are not NaN. If either argument is a NaN, returns the other argument.
+LLVM_READONLY
+inline APFloat maxnum(const APFloat &A, const APFloat &B) {
+  if (A.isNaN())
+    return B;
+  if (B.isNaN())
+    return A;
+  return (A.compare(B) == APFloat::cmpLessThan) ? B : A;
+}
+
 } // namespace llvm
 
 #endif // LLVM_ADT_APFLOAT_H

diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index aa3c3f6..f4e7e3c 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h

@@ -656,13 +656,24 @@
 
   /// @brief Move assignment operator.
   APInt &operator=(APInt &&that) {
-    if (!isSingleWord())
+    if (!isSingleWord()) {
+      // The MSVC STL shipped in 2013 requires that self move assignment be a
+      // no-op.  Otherwise algorithms like stable_sort will produce answers
+      // where half of the output is left in a moved-from state.
+      if (this == &that)
+        return *this;
       delete[] pVal;
+    }
 
-    BitWidth = that.BitWidth;
-    VAL = that.VAL;
+    // Use memcpy so that type based alias analysis sees both VAL and pVal
+    // as modified.
+    memcpy(&VAL, &that.VAL, sizeof(uint64_t));
 
+    // If 'this == &that', avoid zeroing our own bitwidth by storing to 'that'
+    // first.
+    unsigned ThatBitWidth = that.BitWidth;
     that.BitWidth = 0;
+    BitWidth = ThatBitWidth;
 
     return *this;
   }
@@ -936,7 +947,8 @@
   APInt sdiv_ov(const APInt &RHS, bool &Overflow) const;
   APInt smul_ov(const APInt &RHS, bool &Overflow) const;
   APInt umul_ov(const APInt &RHS, bool &Overflow) const;
-  APInt sshl_ov(unsigned Amt, bool &Overflow) const;
+  APInt sshl_ov(const APInt &Amt, bool &Overflow) const;
+  APInt ushl_ov(const APInt &Amt, bool &Overflow) const;
 
   /// \brief Array-indexing support.
   ///

diff --git a/include/llvm/ADT/APSInt.h b/include/llvm/ADT/APSInt.h
index ee34e9b..a6693f7 100644
--- a/include/llvm/ADT/APSInt.h
+++ b/include/llvm/ADT/APSInt.h

@@ -269,19 +269,15 @@
     else if (I2.getBitWidth() > I1.getBitWidth())
       return isSameValue(I1.extend(I2.getBitWidth()), I2);
 
-    // We have a signedness mismatch. Turn the signed value into an unsigned
-    // value.
-    if (I1.isSigned()) {
-      if (I1.isNegative())
-        return false;
+    assert(I1.isSigned() != I2.isSigned());
 
-      return APSInt(I1, true) == I2;
-    }
-
-    if (I2.isNegative())
+    // We have a signedness mismatch. Check for negative values and do an
+    // unsigned compare if signs match.
+    if ((I1.isSigned() && I1.isNegative()) ||
+        (!I1.isSigned() && I2.isNegative()))
       return false;
 
-    return I1 == APSInt(I2, true);
+    return I1.eq(I2);
   }
 
   /// Profile - Used to insert APSInt objects, or objects that contain APSInt

diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index 0fff505..8c14a42 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h

@@ -11,6 +11,7 @@
 #define LLVM_ADT_ARRAYREF_H
 
 #include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include <vector>
 
@@ -43,6 +44,19 @@
     /// The number of elements.
     size_type Length;
 
+    /// \brief A dummy "optional" type that is only created by implicit
+    /// conversion from a reference to T.
+    ///
+    /// This type must *only* be used in a function argument or as a copy of
+    /// a function argument, as otherwise it will hold a pointer to a temporary
+    /// past that temporary's lifetime.
+    struct TRefOrNothing {
+      const T *TPtr;
+
+      TRefOrNothing() : TPtr(nullptr) {}
+      TRefOrNothing(const T &TRef) : TPtr(&TRef) {}
+    };
+
   public:
     /// @name Constructors
     /// @{
@@ -90,6 +104,14 @@
       Length(Vec.size()) {}
 #endif
 
+    /// Construct an ArrayRef<const T*> from ArrayRef<T*>. This uses SFINAE to
+    /// ensure that only ArrayRefs of pointers can be converted.
+    template <typename U>
+    ArrayRef(const ArrayRef<U *> &A,
+             typename std::enable_if<
+                 std::is_convertible<U *const *, T const *>::value>::type* = 0)
+      : Data(A.data()), Length(A.size()) {}
+
     /// @}
     /// @name Simple Operations
     /// @{
@@ -131,7 +153,13 @@
     bool equals(ArrayRef RHS) const {
       if (Length != RHS.Length)
         return false;
-      return std::equal(begin(), end(), RHS.begin());
+      // Don't use std::equal(), since it asserts in MSVC on nullptr iterators.
+      for (auto L = begin(), LE = end(), R = RHS.begin(); L != LE; ++L, ++R)
+        // Match std::equal() in using == (instead of !=) to minimize API
+        // requirements of ArrayRef'ed types.
+        if (!(*L == *R))
+          return false;
+      return true;
     }
 
     /// slice(n) - Chop off the first N elements of the array.
@@ -176,6 +204,47 @@
     }
 
     /// @}
+    /// @name Convenience methods
+    /// @{
+
+    /// @brief Predicate for testing that the array equals the exact sequence of
+    /// arguments.
+    ///
+    /// Will return false if the size is not equal to the exact number of
+    /// arguments given or if the array elements don't equal the argument
+    /// elements in order. Currently supports up to 16 arguments, but can
+    /// easily be extended.
+    bool equals(TRefOrNothing Arg0 = TRefOrNothing(),
+                TRefOrNothing Arg1 = TRefOrNothing(),
+                TRefOrNothing Arg2 = TRefOrNothing(),
+                TRefOrNothing Arg3 = TRefOrNothing(),
+                TRefOrNothing Arg4 = TRefOrNothing(),
+                TRefOrNothing Arg5 = TRefOrNothing(),
+                TRefOrNothing Arg6 = TRefOrNothing(),
+                TRefOrNothing Arg7 = TRefOrNothing(),
+                TRefOrNothing Arg8 = TRefOrNothing(),
+                TRefOrNothing Arg9 = TRefOrNothing(),
+                TRefOrNothing Arg10 = TRefOrNothing(),
+                TRefOrNothing Arg11 = TRefOrNothing(),
+                TRefOrNothing Arg12 = TRefOrNothing(),
+                TRefOrNothing Arg13 = TRefOrNothing(),
+                TRefOrNothing Arg14 = TRefOrNothing(),
+                TRefOrNothing Arg15 = TRefOrNothing()) {
+      TRefOrNothing Args[] = {Arg0,  Arg1,  Arg2,  Arg3, Arg4,  Arg5,
+                              Arg6,  Arg7,  Arg8,  Arg9, Arg10, Arg11,
+                              Arg12, Arg13, Arg14, Arg15};
+      if (size() > array_lengthof(Args))
+        return false;
+
+      for (unsigned i = 0, e = size(); i != e; ++i)
+        if (Args[i].TPtr == nullptr || (*this)[i] != *Args[i].TPtr)
+          return false;
+
+      // Either the size is exactly as many args, or the next arg must be null.
+      return size() == array_lengthof(Args) || Args[size()].TPtr == nullptr;
+    }
+
+    /// @}
   };
 
   /// MutableArrayRef - Represent a mutable reference to an array (0 or more

diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index 85f37b9..c44b67a 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h

@@ -305,6 +305,7 @@
 
   template <typename OtherBaseT>
   void copyFrom(const DenseMapBase<OtherBaseT, KeyT, ValueT, KeyInfoT>& other) {
+    assert(&other != this);
     assert(getNumBuckets() == other.getNumBuckets());
 
     setNumEntries(other.getNumEntries());
@@ -574,7 +575,8 @@
   }
 
   DenseMap& operator=(const DenseMap& other) {
-    copyFrom(other);
+    if (&other != this)
+      copyFrom(other);
     return *this;
   }
 
@@ -799,7 +801,8 @@
   }
 
   SmallDenseMap& operator=(const SmallDenseMap& other) {
-    copyFrom(other);
+    if (&other != this)
+      copyFrom(other);
     return *this;
   }
 

diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h
index 37a81b0..9ab1be2 100644
--- a/include/llvm/ADT/DenseSet.h
+++ b/include/llvm/ADT/DenseSet.h

@@ -29,7 +29,7 @@
 public:
   typedef ValueT key_type;
   typedef ValueT value_type;
-  typedef unsigned size_type;

+  typedef unsigned size_type;
 
   explicit DenseSet(unsigned NumInitBuckets = 0) : TheMap(NumInitBuckets) {}
 
@@ -45,7 +45,7 @@
     TheMap.clear();
   }
 
-  /// Return 1 if the specified key is in the set, 0 otherwise.

+  /// Return 1 if the specified key is in the set, 0 otherwise.
   size_type count(const ValueT &V) const {
     return TheMap.count(V);
   }
@@ -110,6 +110,21 @@
   const_iterator end() const { return ConstIterator(TheMap.end()); }
 
   iterator find(const ValueT &V) { return Iterator(TheMap.find(V)); }
+
+  /// Alternative version of find() which allows a different, and possibly less
+  /// expensive, key type.
+  /// The DenseMapInfo is responsible for supplying methods
+  /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key type
+  /// used.
+  template <class LookupKeyT>
+  iterator find_as(const LookupKeyT &Val) {
+    return Iterator(TheMap.find_as(Val));
+  }
+  template <class LookupKeyT>
+  const_iterator find_as(const LookupKeyT &Val) const {
+    return ConstIterator(TheMap.find_as(Val));
+  }
+
   void erase(Iterator I) { return TheMap.erase(I.I); }
   void erase(ConstIterator CI) { return TheMap.erase(CI.I); }
 

diff --git a/include/llvm/ADT/DepthFirstIterator.h b/include/llvm/ADT/DepthFirstIterator.h
index dfba43f..0f69146 100644
--- a/include/llvm/ADT/DepthFirstIterator.h
+++ b/include/llvm/ADT/DepthFirstIterator.h

@@ -231,6 +231,13 @@
   return df_ext_iterator<T, SetTy>::end(G, S);
 }
 
+template <class T, class SetTy>
+iterator_range<df_ext_iterator<T, SetTy>> depth_first_ext(const T& G,
+                                                          SetTy &S) {
+  return iterator_range<df_ext_iterator<T, SetTy>>(df_ext_begin(G, S),
+                                                   df_ext_end(G, S));
+}
+
 
 // Provide global definitions of inverse depth first iterators...
 template <class T,
@@ -276,6 +283,13 @@
   return idf_ext_iterator<T, SetTy>::end(Inverse<T>(G), S);
 }
 
+template <class T, class SetTy>
+iterator_range<idf_ext_iterator<T, SetTy>> inverse_depth_first_ext(const T& G,
+                                                                   SetTy &S) {
+  return iterator_range<idf_ext_iterator<T, SetTy>>(idf_ext_begin(G, S),
+                                                    idf_ext_end(G, S));
+}
+
 } // End llvm namespace
 
 #endif

diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h
index f9df378..c859c98 100644
--- a/include/llvm/ADT/IntrusiveRefCntPtr.h
+++ b/include/llvm/ADT/IntrusiveRefCntPtr.h

@@ -197,6 +197,9 @@
   private:
     void retain() { if (Obj) IntrusiveRefCntPtrInfo<T>::retain(Obj); }
     void release() { if (Obj) IntrusiveRefCntPtrInfo<T>::release(Obj); }
+
+    template <typename X>
+    friend class IntrusiveRefCntPtr;
   };
 
   template<class T, class U>

diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h
index 2eae22c..14c49c5 100644
--- a/include/llvm/ADT/MapVector.h
+++ b/include/llvm/ADT/MapVector.h

@@ -37,26 +37,20 @@
 public:
   typedef typename VectorType::iterator iterator;
   typedef typename VectorType::const_iterator const_iterator;
+  typedef typename VectorType::reverse_iterator reverse_iterator;
+  typedef typename VectorType::const_reverse_iterator const_reverse_iterator;
 
-  size_type size() const {
-    return Vector.size();
-  }
+  size_type size() const { return Vector.size(); }
 
-  iterator begin() {
-    return Vector.begin();
-  }
+  iterator begin() { return Vector.begin(); }
+  const_iterator begin() const { return Vector.begin(); }
+  iterator end() { return Vector.end(); }
+  const_iterator end() const { return Vector.end(); }
 
-  const_iterator begin() const {
-    return Vector.begin();
-  }
-
-  iterator end() {
-    return Vector.end();
-  }
-
-  const_iterator end() const {
-    return Vector.end();
-  }
+  reverse_iterator rbegin() { return Vector.rbegin(); }
+  const_reverse_iterator rbegin() const { return Vector.rbegin(); }
+  reverse_iterator rend() { return Vector.rend(); }
+  const_reverse_iterator rend() const { return Vector.rend(); }
 
   bool empty() const {
     return Vector.empty();
@@ -125,15 +119,68 @@
   }
 
   /// \brief Remove the element given by Iterator.
+  ///
   /// Returns an iterator to the element following the one which was removed,
   /// which may be end().
+  ///
+  /// \note This is a deceivingly expensive operation (linear time).  It's
+  /// usually better to use \a remove_if() if possible.
   typename VectorType::iterator erase(typename VectorType::iterator Iterator) {
-    typename MapType::iterator MapIterator = Map.find(Iterator->first);
-    Map.erase(MapIterator);
-    return Vector.erase(Iterator);
+    Map.erase(Iterator->first);
+    auto Next = Vector.erase(Iterator);
+    if (Next == Vector.end())
+      return Next;
+
+    // Update indices in the map.
+    size_t Index = Next - Vector.begin();
+    for (auto &I : Map) {
+      assert(I.second != Index && "Index was already erased!");
+      if (I.second > Index)
+        --I.second;
+    }
+    return Next;
   }
+
+  /// \brief Remove all elements with the key value Key.
+  ///
+  /// Returns the number of elements removed.
+  size_type erase(const KeyT &Key) {
+    auto Iterator = find(Key);
+    if (Iterator == end())
+      return 0;
+    erase(Iterator);
+    return 1;
+  }
+
+  /// \brief Remove the elements that match the predicate.
+  ///
+  /// Erase all elements that match \c Pred in a single pass.  Takes linear
+  /// time.
+  template <class Predicate> void remove_if(Predicate Pred);
 };
 
+template <typename KeyT, typename ValueT, typename MapType, typename VectorType>
+template <class Function>
+void MapVector<KeyT, ValueT, MapType, VectorType>::remove_if(Function Pred) {
+  auto O = Vector.begin();
+  for (auto I = O, E = Vector.end(); I != E; ++I) {
+    if (Pred(*I)) {
+      // Erase from the map.
+      Map.erase(I->first);
+      continue;
+    }
+
+    if (I != O) {
+      // Move the value and update the index in the map.
+      *O = std::move(*I);
+      Map[O->first] = O - Vector.begin();
+    }
+    ++O;
+  }
+  // Erase trailing entries in the vector.
+  Vector.erase(O, Vector.end());
 }
 
+} // end namespace llvm
+
 #endif

diff --git a/include/llvm/ADT/Optional.h b/include/llvm/ADT/Optional.h
index ae8344d..591872e 100644
--- a/include/llvm/ADT/Optional.h
+++ b/include/llvm/ADT/Optional.h

@@ -20,6 +20,7 @@
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Compiler.h"
 #include <cassert>
+#include <new>
 #include <utility>
 
 namespace llvm {
@@ -29,6 +30,8 @@
   AlignedCharArrayUnion<T> storage;
   bool hasVal;
 public:
+  typedef T value_type;
+
   Optional(NoneType) : hasVal(false) {}
   explicit Optional() : hasVal(false) {}
   Optional(const T &y) : hasVal(true) {
@@ -67,6 +70,61 @@
     return *this;
   }
 
+#if LLVM_HAS_VARIADIC_TEMPLATES
+
+  /// Create a new object by constructing it in place with the given arguments.
+  template<typename ...ArgTypes>
+  void emplace(ArgTypes &&...Args) {
+    reset();
+    hasVal = true;
+    new (storage.buffer) T(std::forward<ArgTypes>(Args)...);
+  }
+
+#else
+  
+  /// Create a new object by default-constructing it in place.
+  void emplace() {
+    reset();
+    hasVal = true;
+    new (storage.buffer) T();
+  }
+  
+  /// Create a new object by constructing it in place with the given arguments.
+  template<typename T1>
+  void emplace(T1 &&A1) {
+    reset();
+    hasVal = true;
+    new (storage.buffer) T(std::forward<T1>(A1));
+  }
+  
+  /// Create a new object by constructing it in place with the given arguments.
+  template<typename T1, typename T2>
+  void emplace(T1 &&A1, T2 &&A2) {
+    reset();
+    hasVal = true;
+    new (storage.buffer) T(std::forward<T1>(A1), std::forward<T2>(A2));
+  }
+  
+  /// Create a new object by constructing it in place with the given arguments.
+  template<typename T1, typename T2, typename T3>
+  void emplace(T1 &&A1, T2 &&A2, T3 &&A3) {
+    reset();
+    hasVal = true;
+    new (storage.buffer) T(std::forward<T1>(A1), std::forward<T2>(A2),
+        std::forward<T3>(A3));
+  }
+  
+  /// Create a new object by constructing it in place with the given arguments.
+  template<typename T1, typename T2, typename T3, typename T4>
+  void emplace(T1 &&A1, T2 &&A2, T3 &&A3, T4 &&A4) {
+    reset();
+    hasVal = true;
+    new (storage.buffer) T(std::forward<T1>(A1), std::forward<T2>(A2),
+        std::forward<T3>(A3), std::forward<T4>(A4));
+  }
+
+#endif // LLVM_HAS_VARIADIC_TEMPLATES
+
   static inline Optional create(const T* y) {
     return y ? Optional(*y) : Optional();
   }
@@ -117,9 +175,19 @@
   const T& operator*() const LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); }
   T& operator*() LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); }
 
+  template <typename U>
+  LLVM_CONSTEXPR T getValueOr(U &&value) const LLVM_LVALUE_FUNCTION {
+    return hasValue() ? getValue() : std::forward<U>(value);
+  }
+
 #if LLVM_HAS_RVALUE_REFERENCE_THIS
   T&& getValue() && { assert(hasVal); return std::move(*getPointer()); }
   T&& operator*() && { assert(hasVal); return std::move(*getPointer()); }
+
+  template <typename U>
+  T getValueOr(U &&value) && {
+    return hasValue() ? std::move(getValue()) : std::forward<U>(value);
+  }
 #endif
 };
 

diff --git a/include/llvm/ADT/PostOrderIterator.h b/include/llvm/ADT/PostOrderIterator.h
index dd8cc74..dfadc3b 100644
--- a/include/llvm/ADT/PostOrderIterator.h
+++ b/include/llvm/ADT/PostOrderIterator.h

@@ -57,7 +57,7 @@
   // Return true if edge destination should be visited.
   template<typename NodeType>
   bool insertEdge(NodeType *From, NodeType *To) {
-    return Visited.insert(To);
+    return Visited.insert(To).second;
   }
 
   // Called after all children of BB have been visited.
@@ -76,8 +76,9 @@
   // Return true if edge destination should be visited, called with From = 0 for
   // the root node.
   // Graph edges can be pruned by specializing this function.
-  template<class NodeType>
-  bool insertEdge(NodeType *From, NodeType *To) { return Visited.insert(To); }
+  template <class NodeType> bool insertEdge(NodeType *From, NodeType *To) {
+    return Visited.insert(To).second;
+  }
 
   // Called after all children of BB have been visited.
   template<class NodeType>

diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index 1cef393..4e56e4d 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h

@@ -77,8 +77,11 @@
   }
 
 public:
-  template<typename Callable>
-  function_ref(Callable &&callable)
+  template <typename Callable>
+  function_ref(Callable &&callable,
+               typename std::enable_if<
+                   !std::is_same<typename std::remove_reference<Callable>::type,
+                                 function_ref>::value>::type * = nullptr)
       : callback(callback_fn<typename std::remove_reference<Callable>::type>),
         callable(reinterpret_cast<intptr_t>(&callable)) {}
   Ret operator()(Params ...params) const {
@@ -100,7 +103,10 @@
 
 public:
   template<typename Callable>
-  function_ref(Callable &&callable)
+  function_ref(Callable &&callable,
+               typename std::enable_if<
+                   !std::is_same<typename std::remove_reference<Callable>::type,
+                                 function_ref>::value>::type * = nullptr)
       : callback(callback_fn<typename std::remove_reference<Callable>::type>),
         callable(reinterpret_cast<intptr_t>(&callable)) {}
   Ret operator()() const { return callback(callable); }
@@ -119,7 +125,10 @@
 
 public:
   template<typename Callable>
-  function_ref(Callable &&callable)
+  function_ref(Callable &&callable,
+               typename std::enable_if<
+                   !std::is_same<typename std::remove_reference<Callable>::type,
+                                 function_ref>::value>::type * = nullptr)
       : callback(callback_fn<typename std::remove_reference<Callable>::type>),
         callable(reinterpret_cast<intptr_t>(&callable)) {}
   Ret operator()(Param1 param1) {
@@ -141,7 +150,10 @@
 
 public:
   template<typename Callable>
-  function_ref(Callable &&callable)
+  function_ref(Callable &&callable,
+               typename std::enable_if<
+                   !std::is_same<typename std::remove_reference<Callable>::type,
+                                 function_ref>::value>::type * = nullptr)
       : callback(callback_fn<typename std::remove_reference<Callable>::type>),
         callable(reinterpret_cast<intptr_t>(&callable)) {}
   Ret operator()(Param1 param1, Param2 param2) {
@@ -167,7 +179,10 @@
 
 public:
   template<typename Callable>
-  function_ref(Callable &&callable)
+  function_ref(Callable &&callable,
+               typename std::enable_if<
+                   !std::is_same<typename std::remove_reference<Callable>::type,
+                                 function_ref>::value>::type * = nullptr)
       : callback(callback_fn<typename std::remove_reference<Callable>::type>),
         callable(reinterpret_cast<intptr_t>(&callable)) {}
   Ret operator()(Param1 param1, Param2 param2, Param3 param3) {
@@ -530,6 +545,12 @@
 
 #endif
 
+struct FreeDeleter {
+  void operator()(void* v) {
+    ::free(v);
+  }
+};
+
 template<typename First, typename Second>
 struct pair_hash {
   size_t operator()(const std::pair<First, Second> &P) const {

diff --git a/include/llvm/ADT/ScopedHashTable.h b/include/llvm/ADT/ScopedHashTable.h
index 02a6ea3..2f60ecc 100644
--- a/include/llvm/ADT/ScopedHashTable.h
+++ b/include/llvm/ADT/ScopedHashTable.h

@@ -148,7 +148,7 @@
   /// ScopeTy - This is a helpful typedef that allows clients to get easy access
   /// to the name of the scope for this hash table.
   typedef ScopedHashTableScope<K, V, KInfo, AllocatorTy> ScopeTy;
-  typedef unsigned size_type;

+  typedef unsigned size_type;
 private:
   typedef ScopedHashTableVal<K, V> ValTy;
   DenseMap<K, ValTy*, KInfo> TopLevelMap;
@@ -171,7 +171,7 @@
   AllocatorTy &getAllocator() { return Allocator; }
   const AllocatorTy &getAllocator() const { return Allocator; }
 
-  /// Return 1 if the specified key is in the table, 0 otherwise.

+  /// Return 1 if the specified key is in the table, 0 otherwise.
   size_type count(const K &Key) const {
     return TopLevelMap.count(Key);
   }

diff --git a/include/llvm/ADT/SetVector.h b/include/llvm/ADT/SetVector.h
index 1e7d237..a7fd408 100644
--- a/include/llvm/ADT/SetVector.h
+++ b/include/llvm/ADT/SetVector.h

@@ -100,7 +100,7 @@
   /// \brief Insert a new element into the SetVector.
   /// \returns true iff the element was inserted into the SetVector.
   bool insert(const value_type &X) {
-    bool result = set_.insert(X);
+    bool result = set_.insert(X).second;
     if (result)
       vector_.push_back(X);
     return result;
@@ -110,7 +110,7 @@
   template<typename It>
   void insert(It Start, It End) {
     for (; Start != End; ++Start)
-      if (set_.insert(*Start))
+      if (set_.insert(*Start).second)
         vector_.push_back(*Start);
   }
 

diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index 0922017..ababf0f 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h

@@ -54,7 +54,7 @@
   };
 
 public:
-  typedef unsigned size_type;

+  typedef unsigned size_type;
   // Encapsulation of a single bit.
   class reference {
     SmallBitVector &TheVector;

diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index 74f3fd4..b8977fa 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h

@@ -22,6 +22,7 @@
 #include <cstddef>
 #include <cstring>
 #include <iterator>
+#include <utility>
 
 namespace llvm {
 
@@ -100,7 +101,7 @@
   /// insert_imp - This returns true if the pointer was new to the set, false if
   /// it was already in the set.  This is hidden from the client so that the
   /// derived class can check that the right type of pointer is passed in.
-  bool insert_imp(const void * Ptr);
+  std::pair<const void *const *, bool> insert_imp(const void *Ptr);
 
   /// erase_imp - If the set contains the specified pointer, remove it and
   /// return true, otherwise return false.  This is hidden from the client so
@@ -240,6 +241,8 @@
 template <typename PtrType>
 class SmallPtrSetImpl : public SmallPtrSetImplBase {
   typedef PointerLikeTypeTraits<PtrType> PtrTraits;
+
+  SmallPtrSetImpl(const SmallPtrSetImpl&) LLVM_DELETED_FUNCTION;
 protected:
   // Constructors that forward to the base.
   SmallPtrSetImpl(const void **SmallStorage, const SmallPtrSetImpl &that)
@@ -251,10 +254,14 @@
       : SmallPtrSetImplBase(SmallStorage, SmallSize) {}
 
 public:
+  typedef SmallPtrSetIterator<PtrType> iterator;
+  typedef SmallPtrSetIterator<PtrType> const_iterator;
+
   /// insert - This returns true if the pointer was new to the set, false if it
   /// was already in the set.
-  bool insert(PtrType Ptr) {
-    return insert_imp(PtrTraits::getAsVoidPointer(Ptr));
+  std::pair<iterator, bool> insert(PtrType Ptr) {
+    auto p = insert_imp(PtrTraits::getAsVoidPointer(Ptr));
+    return std::make_pair(iterator(p.first, CurArray + CurArraySize), p.second);
   }
 
   /// erase - If the set contains the specified pointer, remove it and return
@@ -274,8 +281,6 @@
       insert(*I);
   }
 
-  typedef SmallPtrSetIterator<PtrType> iterator;
-  typedef SmallPtrSetIterator<PtrType> const_iterator;
   inline iterator begin() const {
     return iterator(CurArray, CurArray+CurArraySize);
   }

diff --git a/include/llvm/ADT/SmallSet.h b/include/llvm/ADT/SmallSet.h
index bb1971e..bc64935 100644
--- a/include/llvm/ADT/SmallSet.h
+++ b/include/llvm/ADT/SmallSet.h

@@ -14,6 +14,7 @@
 #ifndef LLVM_ADT_SMALLSET_H
 #define LLVM_ADT_SMALLSET_H
 
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include <set>
@@ -60,16 +61,21 @@
 
   /// insert - Insert an element into the set if it isn't already there.
   /// Returns true if the element is inserted (it was not in the set before).
-  bool insert(const T &V) {
+  /// The first value of the returned pair is unused and provided for
+  /// partial compatibility with the standard library self-associative container
+  /// concept.
+  // FIXME: Add iterators that abstract over the small and large form, and then
+  // return those here.
+  std::pair<NoneType, bool> insert(const T &V) {
     if (!isSmall())
-      return Set.insert(V).second;
+      return std::make_pair(None, Set.insert(V).second);
 
     VIterator I = vfind(V);
     if (I != Vector.end())    // Don't reinsert if it already exists.
-      return false;
+      return std::make_pair(None, false);
     if (Vector.size() < N) {
       Vector.push_back(V);
-      return true;
+      return std::make_pair(None, true);
     }
 
     // Otherwise, grow from vector to set.
@@ -78,7 +84,7 @@
       Vector.pop_back();
     }
     Set.insert(V);
-    return true;
+    return std::make_pair(None, true);
   }
 
   template <typename IterT>

diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index 82538e9..2117541 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h

@@ -29,8 +29,7 @@
 
 namespace llvm {
 
-/// SmallVectorBase - This is all the non-templated stuff common to all
-/// SmallVectors.
+/// This is all the non-templated stuff common to all SmallVectors.
 class SmallVectorBase {
 protected:
   void *BeginX, *EndX, *CapacityX;
@@ -39,12 +38,12 @@
   SmallVectorBase(void *FirstEl, size_t Size)
     : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {}
 
-  /// grow_pod - This is an implementation of the grow() method which only works
+  /// This is an implementation of the grow() method which only works
   /// on POD-like data types and is out of line to reduce code duplication.
   void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize);
 
 public:
-  /// size_in_bytes - This returns size()*sizeof(T).
+  /// This returns size()*sizeof(T).
   size_t size_in_bytes() const {
     return size_t((char*)EndX - (char*)BeginX);
   }
@@ -59,10 +58,9 @@
 
 template <typename T, unsigned N> struct SmallVectorStorage;
 
-/// SmallVectorTemplateCommon - This is the part of SmallVectorTemplateBase
-/// which does not depend on whether the type T is a POD. The extra dummy
-/// template argument is used by ArrayRef to avoid unnecessarily requiring T
-/// to be complete.
+/// This is the part of SmallVectorTemplateBase which does not depend on whether
+/// the type T is a POD. The extra dummy template argument is used by ArrayRef
+/// to avoid unnecessarily requiring T to be complete.
 template <typename T, typename = void>
 class SmallVectorTemplateCommon : public SmallVectorBase {
 private:
@@ -82,13 +80,13 @@
     SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize);
   }
 
-  /// isSmall - Return true if this is a smallvector which has not had dynamic
+  /// Return true if this is a smallvector which has not had dynamic
   /// memory allocated for it.
   bool isSmall() const {
     return BeginX == static_cast<const void*>(&FirstEl);
   }
 
-  /// resetToSmall - Put this vector in a state of being small.
+  /// Put this vector in a state of being small.
   void resetToSmall() {
     BeginX = EndX = CapacityX = &FirstEl;
   }
@@ -128,20 +126,19 @@
   size_type size() const { return end()-begin(); }
   size_type max_size() const { return size_type(-1) / sizeof(T); }
 
-  /// capacity - Return the total number of elements in the currently allocated
-  /// buffer.
+  /// Return the total number of elements in the currently allocated buffer.
   size_t capacity() const { return capacity_ptr() - begin(); }
 
-  /// data - Return a pointer to the vector's buffer, even if empty().
+  /// Return a pointer to the vector's buffer, even if empty().
   pointer data() { return pointer(begin()); }
-  /// data - Return a pointer to the vector's buffer, even if empty().
+  /// Return a pointer to the vector's buffer, even if empty().
   const_pointer data() const { return const_pointer(begin()); }
 
-  reference operator[](unsigned idx) {
+  reference operator[](size_type idx) {
     assert(begin() + idx < end());
     return begin()[idx];
   }
-  const_reference operator[](unsigned idx) const {
+  const_reference operator[](size_type idx) const {
     assert(begin() + idx < end());
     return begin()[idx];
   }
@@ -179,7 +176,7 @@
     }
   }
 
-  /// move - Use move-assignment to move the range [I, E) onto the
+  /// Use move-assignment to move the range [I, E) onto the
   /// objects starting with "Dest".  This is just <memory>'s
   /// std::move, but not all stdlibs actually provide that.
   template<typename It1, typename It2>
@@ -189,7 +186,7 @@
     return Dest;
   }
 
-  /// move_backward - Use move-assignment to move the range
+  /// Use move-assignment to move the range
   /// [I, E) onto the objects ending at "Dest", moving objects
   /// in reverse order.  This is just <algorithm>'s
   /// std::move_backward, but not all stdlibs actually provide that.
@@ -200,25 +197,24 @@
     return Dest;
   }
 
-  /// uninitialized_move - Move the range [I, E) into the uninitialized
-  /// memory starting with "Dest", constructing elements as needed.
+  /// Move the range [I, E) into the uninitialized memory starting with "Dest",
+  /// constructing elements as needed.
   template<typename It1, typename It2>
   static void uninitialized_move(It1 I, It1 E, It2 Dest) {
     for (; I != E; ++I, ++Dest)
       ::new ((void*) &*Dest) T(::std::move(*I));
   }
 
-  /// uninitialized_copy - Copy the range [I, E) onto the uninitialized
-  /// memory starting with "Dest", constructing elements as needed.
+  /// Copy the range [I, E) onto the uninitialized memory starting with "Dest",
+  /// constructing elements as needed.
   template<typename It1, typename It2>
   static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
     std::uninitialized_copy(I, E, Dest);
   }
 
-  /// grow - Grow the allocated memory (without initializing new
-  /// elements), doubling the size of the allocated memory.
-  /// Guarantees space for at least one more element, or MinSize more
-  /// elements if specified.
+  /// Grow the allocated memory (without initializing new elements), doubling
+  /// the size of the allocated memory. Guarantees space for at least one more
+  /// element, or MinSize more elements if specified.
   void grow(size_t MinSize = 0);
 
 public:
@@ -279,22 +275,21 @@
   // No need to do a destroy loop for POD's.
   static void destroy_range(T *, T *) {}
 
-  /// move - Use move-assignment to move the range [I, E) onto the
+  /// Use move-assignment to move the range [I, E) onto the
   /// objects starting with "Dest".  For PODs, this is just memcpy.
   template<typename It1, typename It2>
   static It2 move(It1 I, It1 E, It2 Dest) {
     return ::std::copy(I, E, Dest);
   }
 
-  /// move_backward - Use move-assignment to move the range
-  /// [I, E) onto the objects ending at "Dest", moving objects
-  /// in reverse order.
+  /// Use move-assignment to move the range [I, E) onto the objects ending at
+  /// "Dest", moving objects in reverse order.
   template<typename It1, typename It2>
   static It2 move_backward(It1 I, It1 E, It2 Dest) {
     return ::std::copy_backward(I, E, Dest);
   }
 
-  /// uninitialized_move - Move the range [I, E) onto the uninitialized memory
+  /// Move the range [I, E) onto the uninitialized memory
   /// starting with "Dest", constructing elements into it as needed.
   template<typename It1, typename It2>
   static void uninitialized_move(It1 I, It1 E, It2 Dest) {
@@ -302,7 +297,7 @@
     uninitialized_copy(I, E, Dest);
   }
 
-  /// uninitialized_copy - Copy the range [I, E) onto the uninitialized memory
+  /// Copy the range [I, E) onto the uninitialized memory
   /// starting with "Dest", constructing elements into it as needed.
   template<typename It1, typename It2>
   static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
@@ -310,7 +305,7 @@
     std::uninitialized_copy(I, E, Dest);
   }
 
-  /// uninitialized_copy - Copy the range [I, E) onto the uninitialized memory
+  /// Copy the range [I, E) onto the uninitialized memory
   /// starting with "Dest", constructing elements into it as needed.
   template<typename T1, typename T2>
   static void uninitialized_copy(T1 *I, T1 *E, T2 *Dest) {
@@ -320,7 +315,7 @@
     memcpy(Dest, I, (E-I)*sizeof(T));
   }
 
-  /// grow - double the size of the allocated memory, guaranteeing space for at
+  /// Double the size of the allocated memory, guaranteeing space for at
   /// least one more element or MinSize if specified.
   void grow(size_t MinSize = 0) {
     this->grow_pod(MinSize*sizeof(T), sizeof(T));
@@ -339,9 +334,8 @@
 };
 
 
-/// SmallVectorImpl - This class consists of common code factored out of the
-/// SmallVector class to reduce code duplication based on the SmallVector 'N'
-/// template parameter.
+/// This class consists of common code factored out of the SmallVector class to
+/// reduce code duplication based on the SmallVector 'N' template parameter.
 template <typename T>
 class SmallVectorImpl : public SmallVectorTemplateBase<T, isPodLike<T>::value> {
   typedef SmallVectorTemplateBase<T, isPodLike<T>::value > SuperClass;
@@ -411,8 +405,7 @@
 
   void swap(SmallVectorImpl &RHS);
 
-  /// append - Add the specified range to the end of the SmallVector.
-  ///
+  /// Add the specified range to the end of the SmallVector.
   template<typename in_iter>
   void append(in_iter in_start, in_iter in_end) {
     size_type NumInputs = std::distance(in_start, in_end);
@@ -427,8 +420,7 @@
     this->setEnd(this->end() + NumInputs);
   }
 
-  /// append - Add the specified range to the end of the SmallVector.
-  ///
+  /// Add the specified range to the end of the SmallVector.
   void append(size_type NumInputs, const T &Elt) {
     // Grow allocated space if needed.
     if (NumInputs > size_type(this->capacity_ptr()-this->end()))
@@ -833,7 +825,7 @@
 template <typename T> struct SmallVectorStorage<T, 1> {};
 template <typename T> struct SmallVectorStorage<T, 0> {};
 
-/// SmallVector - This is a 'vector' (really, a variable-sized array), optimized
+/// This is a 'vector' (really, a variable-sized array), optimized
 /// for the case when the array is small.  It contains some number of elements
 /// in-place, which allows it to avoid heap allocation when the actual number of
 /// elements is below that threshold.  This allows normal "small" cases to be
@@ -843,7 +835,7 @@
 ///
 template <typename T, unsigned N>
 class SmallVector : public SmallVectorImpl<T> {
-  /// Storage - Inline space for elements which aren't stored in the base class.
+  /// Inline space for elements which aren't stored in the base class.
   SmallVectorStorage<T, N> Storage;
 public:
   SmallVector() : SmallVectorImpl<T>(N) {

diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h
index 36754d6..d5bde29 100644
--- a/include/llvm/ADT/SparseBitVector.h
+++ b/include/llvm/ADT/SparseBitVector.h

@@ -45,7 +45,7 @@
   : public ilist_node<SparseBitVectorElement<ElementSize> > {
 public:
   typedef unsigned long BitWord;
-  typedef unsigned size_type;

+  typedef unsigned size_type;
   enum {
     BITWORD_SIZE = sizeof(BitWord) * CHAR_BIT,
     BITWORDS_PER_ELEMENT = (ElementSize + BITWORD_SIZE - 1) / BITWORD_SIZE,

diff --git a/include/llvm/ADT/SparseMultiSet.h b/include/llvm/ADT/SparseMultiSet.h
index dc1273e..f858536 100644
--- a/include/llvm/ADT/SparseMultiSet.h
+++ b/include/llvm/ADT/SparseMultiSet.h

@@ -185,7 +185,7 @@
   typedef const ValueT &const_reference;
   typedef ValueT *pointer;
   typedef const ValueT *const_pointer;
-  typedef unsigned size_type;

+  typedef unsigned size_type;
 
   SparseMultiSet()
     : Sparse(nullptr), Universe(0), FreelistIdx(SMSNode::INVALID), NumFree(0) {}

diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h
index 632d52a..9a13440 100644
--- a/include/llvm/ADT/SparseSet.h
+++ b/include/llvm/ADT/SparseSet.h

@@ -124,7 +124,7 @@
 
   typedef typename KeyFunctorT::argument_type KeyT;
   typedef SmallVector<ValueT, 8> DenseT;
-  typedef unsigned size_type;

+  typedef unsigned size_type;
   DenseT Dense;
   SparseT *Sparse;
   unsigned Universe;

diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index a152f4d..0992f5d 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h

@@ -53,7 +53,7 @@
 /// This should only be used with unsigned types.
 ///
 template<typename IntTy>
-static inline char *utohex_buffer(IntTy X, char *BufferEnd) {
+static inline char *utohex_buffer(IntTy X, char *BufferEnd, bool LowerCase = false) {
   char *BufPtr = BufferEnd;
   *--BufPtr = 0;      // Null terminate buffer.
   if (X == 0) {
@@ -63,15 +63,15 @@
 
   while (X) {
     unsigned char Mod = static_cast<unsigned char>(X) & 15;
-    *--BufPtr = hexdigit(Mod);
+    *--BufPtr = hexdigit(Mod, LowerCase);
     X >>= 4;
   }
   return BufPtr;
 }
 
-static inline std::string utohexstr(uint64_t X) {
+static inline std::string utohexstr(uint64_t X, bool LowerCase = false) {
   char Buffer[17];
-  return utohex_buffer(X, Buffer+17);
+  return utohex_buffer(X, Buffer+17, LowerCase);
 }
 
 static inline std::string utostr_32(uint32_t X, bool isNeg = false) {

diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h
index c40e5e2..2feb2ab 100644
--- a/include/llvm/ADT/StringMap.h
+++ b/include/llvm/ADT/StringMap.h

@@ -117,8 +117,9 @@
 
   explicit StringMapEntry(unsigned strLen)
     : StringMapEntryBase(strLen), second() {}
-  StringMapEntry(unsigned strLen, ValueTy V)
-      : StringMapEntryBase(strLen), second(std::move(V)) {}
+  template <class InitTy>
+  StringMapEntry(unsigned strLen, InitTy &&V)
+      : StringMapEntryBase(strLen), second(std::forward<InitTy>(V)) {}
 
   StringRef getKey() const {
     return StringRef(getKeyData(), getKeyLength());
@@ -138,10 +139,9 @@
 
   /// Create - Create a StringMapEntry for the specified key and default
   /// construct the value.
-  template<typename AllocatorTy, typename InitType>
-  static StringMapEntry *Create(StringRef Key,
-                                AllocatorTy &Allocator,
-                                InitType InitVal) {
+  template <typename AllocatorTy, typename InitType>
+  static StringMapEntry *Create(StringRef Key, AllocatorTy &Allocator,
+                                InitType &&InitVal) {
     unsigned KeyLength = Key.size();
 
     // Allocate a new item with space for the string at the end and a null
@@ -154,7 +154,7 @@
       static_cast<StringMapEntry*>(Allocator.Allocate(AllocSize,Alignment));
 
     // Default construct the value.
-    new (NewItem) StringMapEntry(KeyLength, std::move(InitVal));
+    new (NewItem) StringMapEntry(KeyLength, std::forward<InitType>(InitVal));
 
     // Copy the string information.
     char *StrBuffer = const_cast<char*>(NewItem->getKeyData());
@@ -170,9 +170,9 @@
 
   /// Create - Create a StringMapEntry with normal malloc/free.
   template<typename InitType>
-  static StringMapEntry *Create(StringRef Key, InitType InitVal) {
+  static StringMapEntry *Create(StringRef Key, InitType &&InitVal) {
     MallocAllocator A;
-    return Create(Key, A, std::move(InitVal));
+    return Create(Key, A, std::forward<InitType>(InitVal));
   }
 
   static StringMapEntry *Create(StringRef Key) {
@@ -296,7 +296,7 @@
   }
 
   ValueTy &operator[](StringRef Key) {
-    return GetOrCreateValue(Key).getValue();
+    return insert(std::make_pair(Key, ValueTy())).first->second;
   }
 
   /// count - Return 1 if the element is in the map, 0 otherwise.
@@ -363,18 +363,6 @@
     NumTombstones = 0;
   }
 
-  /// GetOrCreateValue - Look up the specified key in the table.  If a value
-  /// exists, return it.  Otherwise, default construct a value, insert it, and
-  /// return.
-  template <typename InitTy>
-  MapEntryTy &GetOrCreateValue(StringRef Key, InitTy Val) {
-    return *insert(std::make_pair(Key, std::move(Val))).first;
-  }
-
-  MapEntryTy &GetOrCreateValue(StringRef Key) {
-    return GetOrCreateValue(Key, ValueTy());
-  }
-
   /// remove - Remove the specified key/value pair from the map, but do not
   /// erase it.  This aborts if the key is not in the map.
   void remove(MapEntryTy *KeyValue) {

diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index 1f413e8..778fa10 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h

@@ -51,12 +51,6 @@
     /// The length of the string.
     size_t Length;
 
-    // Workaround PR5482: nearly all gcc 4.x miscompile StringRef and std::min()
-    // Changing the arg of min to be an integer, instead of a reference to an
-    // integer works around this bug.
-    static size_t min(size_t a, size_t b) { return a < b ? a : b; }
-    static size_t max(size_t a, size_t b) { return a > b ? a : b; }
-
     // Workaround memcmp issue with null pointers (undefined behavior)
     // by providing a specialized version
     static int compareMemory(const char *Lhs, const char *Rhs, size_t Length) {
@@ -124,7 +118,7 @@
     }
 
     // copy - Allocate copy in Allocator and return StringRef to it.
-    template <typename Allocator> StringRef copy(Allocator &A) {
+    template <typename Allocator> StringRef copy(Allocator &A) const {
       char *S = A.template Allocate<char>(Length);
       std::copy(begin(), end(), S);
       return StringRef(S, Length);
@@ -146,7 +140,7 @@
     /// is lexicographically less than, equal to, or greater than the \p RHS.
     int compare(StringRef RHS) const {
       // Check the prefix for a mismatch.
-      if (int Res = compareMemory(Data, RHS.Data, min(Length, RHS.Length)))
+      if (int Res = compareMemory(Data, RHS.Data, std::min(Length, RHS.Length)))
         return Res < 0 ? -1 : 1;
 
       // Otherwise the prefixes match, so we only need to check the lengths.
@@ -237,7 +231,7 @@
     /// \returns The index of the first occurrence of \p C, or npos if not
     /// found.
     size_t find(char C, size_t From = 0) const {
-      for (size_t i = min(From, Length), e = Length; i != e; ++i)
+      for (size_t i = std::min(From, Length), e = Length; i != e; ++i)
         if (Data[i] == C)
           return i;
       return npos;
@@ -254,7 +248,7 @@
     /// \returns The index of the last occurrence of \p C, or npos if not
     /// found.
     size_t rfind(char C, size_t From = npos) const {
-      From = min(From, Length);
+      From = std::min(From, Length);
       size_t i = From;
       while (i != 0) {
         --i;
@@ -353,8 +347,11 @@
     typename std::enable_if<!std::numeric_limits<T>::is_signed, bool>::type
     getAsInteger(unsigned Radix, T &Result) const {
       unsigned long long ULLVal;
+      // The additional cast to unsigned long long is required to avoid the
+      // Visual C++ warning C4805: '!=' : unsafe mix of type 'bool' and type
+      // 'unsigned __int64' when instantiating getAsInteger with T = bool.
       if (getAsUnsignedInteger(*this, Radix, ULLVal) ||
-            static_cast<T>(ULLVal) != ULLVal)
+          static_cast<unsigned long long>(static_cast<T>(ULLVal)) != ULLVal)
         return true;
       Result = ULLVal;
       return false;
@@ -396,8 +393,8 @@
     /// exceeds the number of characters remaining in the string, the string
     /// suffix (starting with \p Start) will be returned.
     StringRef substr(size_t Start, size_t N = npos) const {
-      Start = min(Start, Length);
-      return StringRef(Data + Start, min(N, Length - Start));
+      Start = std::min(Start, Length);
+      return StringRef(Data + Start, std::min(N, Length - Start));
     }
 
     /// Return a StringRef equal to 'this' but with the first \p N elements
@@ -425,8 +422,8 @@
     /// number of characters remaining in the string, the string suffix
     /// (starting with \p Start) will be returned.
     StringRef slice(size_t Start, size_t End) const {
-      Start = min(Start, Length);
-      End = min(max(Start, End), Length);
+      Start = std::min(Start, Length);
+      End = std::min(std::max(Start, End), Length);
       return StringRef(Data + Start, End - Start);
     }
 

diff --git a/include/llvm/ADT/StringSet.h b/include/llvm/ADT/StringSet.h
index 7bea577..3e0cc20 100644
--- a/include/llvm/ADT/StringSet.h
+++ b/include/llvm/ADT/StringSet.h

@@ -24,20 +24,9 @@
     typedef llvm::StringMap<char, AllocatorTy> base;
   public:
 
-    /// insert - Insert the specified key into the set.  If the key already
-    /// exists in the set, return false and ignore the request, otherwise insert
-    /// it and return true.
-    bool insert(StringRef Key) {
-      // Get or create the map entry for the key; if it doesn't exist the value
-      // type will be default constructed which we use to detect insert.
-      //
-      // We use '+' as the sentinel value in the map.
+    std::pair<typename base::iterator, bool> insert(StringRef Key) {
       assert(!Key.empty());
-      StringMapEntry<char> &Entry = this->GetOrCreateValue(Key);
-      if (Entry.getValue() == '+')
-        return false;
-      Entry.setValue('+');
-      return true;
+      return base::insert(std::make_pair(Key, '\0'));
     }
   };
 }

diff --git a/include/llvm/ADT/TinyPtrVector.h b/include/llvm/ADT/TinyPtrVector.h
index 5669b2a..e158f9d 100644
--- a/include/llvm/ADT/TinyPtrVector.h
+++ b/include/llvm/ADT/TinyPtrVector.h

@@ -99,7 +99,7 @@
   // implicit conversion operator to ArrayRef.
   operator ArrayRef<EltTy>() const {
     if (Val.isNull())
-      return ArrayRef<EltTy>();
+      return None;
     if (Val.template is<EltTy>())
       return *Val.getAddrOfPtr1();
     return *Val.template get<VecTy*>();

diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index 2867a0e..fbc19f8 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h

@@ -48,8 +48,6 @@
 
     arm,        // ARM (little endian): arm, armv.*, xscale
     armeb,      // ARM (big endian): armeb
-    arm64,      // ARM64 (little endian): arm64
-    arm64_be,   // ARM64 (big endian): arm64_be
     aarch64,    // AArch64 (little endian): aarch64
     aarch64_be, // AArch64 (big endian): aarch64_be
     hexagon,    // Hexagon: hexagon
@@ -74,11 +72,34 @@
     nvptx,      // NVPTX: 32-bit
     nvptx64,    // NVPTX: 64-bit
     le32,       // le32: generic little-endian 32-bit CPU (PNaCl / Emscripten)
-    amdil,      // amdil: amd IL
+    le64,       // le64: generic little-endian 64-bit CPU (PNaCl / Emscripten)
+    amdil,      // AMDIL
+    amdil64,    // AMDIL with 64-bit pointers
+    hsail,      // AMD HSAIL
+    hsail64,    // AMD HSAIL with 64-bit pointers
     spir,       // SPIR: standard portable IR for OpenCL 32-bit version
     spir64,     // SPIR: standard portable IR for OpenCL 64-bit version
     kalimba     // Kalimba: generic kalimba
   };
+  enum SubArchType {
+    NoSubArch,
+
+    ARMSubArch_v8,
+    ARMSubArch_v7,
+    ARMSubArch_v7em,
+    ARMSubArch_v7m,
+    ARMSubArch_v7s,
+    ARMSubArch_v6,
+    ARMSubArch_v6m,
+    ARMSubArch_v6t2,
+    ARMSubArch_v5,
+    ARMSubArch_v5te,
+    ARMSubArch_v4t,
+
+    KalimbaSubArch_v3,
+    KalimbaSubArch_v4,
+    KalimbaSubArch_v5
+  };
   enum VendorType {
     UnknownVendor,
 
@@ -90,14 +111,13 @@
     Freescale,
     IBM,
     ImaginationTechnologies,
+    MipsTechnologies,
     NVIDIA,
     CSR
   };
   enum OSType {
     UnknownOS,
 
-    AuroraUX,
-    Cygwin,
     Darwin,
     DragonFly,
     FreeBSD,
@@ -106,7 +126,6 @@
     Linux,
     Lv2,        // PS3
     MacOSX,
-    MinGW32,    // i*86-pc-mingw32, *-w64-mingw32
     NetBSD,
     OpenBSD,
     Solaris,
@@ -151,6 +170,9 @@
   /// The parsed arch type.
   ArchType Arch;
 
+  /// The parsed subarchitecture type.
+  SubArchType SubArch;
+
   /// The parsed vendor type.
   VendorType Vendor;
 
@@ -193,6 +215,9 @@
   /// getArch - Get the parsed architecture type of this triple.
   ArchType getArch() const { return Arch; }
 
+  /// getSubArch - Get the parsed subarchitecture type for this triple.
+  SubArchType getSubArch() const { return SubArch; }
+
   /// getVendor - Get the parsed vendor type of this triple.
   VendorType getVendor() const { return Vendor; }
 
@@ -358,13 +383,11 @@
   }
 
   bool isWindowsCygwinEnvironment() const {
-    return getOS() == Triple::Cygwin ||
-           (getOS() == Triple::Win32 && getEnvironment() == Triple::Cygnus);
+    return getOS() == Triple::Win32 && getEnvironment() == Triple::Cygnus;
   }
 
   bool isWindowsGNUEnvironment() const {
-    return getOS() == Triple::MinGW32 ||
-           (getOS() == Triple::Win32 && getEnvironment() == Triple::GNU);
+    return getOS() == Triple::Win32 && getEnvironment() == Triple::GNU;
   }
 
   /// \brief Tests for either Cygwin or MinGW OS
@@ -374,7 +397,8 @@
 
   /// \brief Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
   bool isOSMSVCRT() const {
-    return isWindowsMSVCEnvironment() || isWindowsGNUEnvironment();
+    return isWindowsMSVCEnvironment() || isWindowsGNUEnvironment() ||
+           isWindowsItaniumEnvironment();
   }
 
   /// \brief Tests whether the OS is Windows.
@@ -453,10 +477,6 @@
   /// environment components with a single string.
   void setOSAndEnvironmentName(StringRef Str);
 
-  /// getArchNameForAssembler - Get an architecture name that is understood by
-  /// the target assembler.
-  const char *getArchNameForAssembler();
-
   /// @}
   /// @name Helpers to build variants of a particular triple.
   /// @{
@@ -477,6 +497,12 @@
   ///          architecture if no such variant can be found.
   llvm::Triple get64BitArchVariant() const;
 
+  /// Get the (LLVM) name of the minimum ARM CPU for the arch we are targeting.
+  ///
+  /// \param Arch the architecture name (e.g., "armv7s"). If it is an empty
+  /// string then the triple's arch name is used.
+  const char* getARMCPUForArch(StringRef Arch = StringRef()) const;
+
   /// @}
   /// @name Static helpers for IDs.
   /// @{

diff --git a/include/llvm/ADT/Twine.h b/include/llvm/ADT/Twine.h
index 4be3ee6..77d92b4 100644
--- a/include/llvm/ADT/Twine.h
+++ b/include/llvm/ADT/Twine.h

@@ -157,7 +157,7 @@
     // don't support specifying the backing type for an enum
     /// LHSKind - The NodeKind of the left hand side, \see getLHSKind().
     unsigned char LHSKind;
-    /// RHSKind - The NodeKind of the left hand side, \see getLHSKind().
+    /// RHSKind - The NodeKind of the right hand side, \see getRHSKind().
     unsigned char RHSKind;
 
   private:

diff --git a/include/llvm/ADT/VariadicFunction.h b/include/llvm/ADT/VariadicFunction.h
index 0497aa7..403130c 100644
--- a/include/llvm/ADT/VariadicFunction.h
+++ b/include/llvm/ADT/VariadicFunction.h

@@ -105,7 +105,7 @@
           ResultT (*Func)(ArrayRef<const ArgT *>)>
 struct VariadicFunction {
   ResultT operator()() const {
-    return Func(ArrayRef<const ArgT *>());
+    return Func(None);
   }
 
 #define LLVM_DEFINE_OVERLOAD(N) \
@@ -152,7 +152,7 @@
           ResultT (*Func)(Param0T, ArrayRef<const ArgT *>)>
 struct VariadicFunction1 {
   ResultT operator()(Param0T P0) const {
-    return Func(P0, ArrayRef<const ArgT *>());
+    return Func(P0, None);
   }
 
 #define LLVM_DEFINE_OVERLOAD(N) \
@@ -199,7 +199,7 @@
           ResultT (*Func)(Param0T, Param1T, ArrayRef<const ArgT *>)>
 struct VariadicFunction2 {
   ResultT operator()(Param0T P0, Param1T P1) const {
-    return Func(P0, P1, ArrayRef<const ArgT *>());
+    return Func(P0, P1, None);
   }
 
 #define LLVM_DEFINE_OVERLOAD(N) \
@@ -248,7 +248,7 @@
           ResultT (*Func)(Param0T, Param1T, Param2T, ArrayRef<const ArgT *>)>
 struct VariadicFunction3 {
   ResultT operator()(Param0T P0, Param1T P1, Param2T P2) const {
-    return Func(P0, P1, P2, ArrayRef<const ArgT *>());
+    return Func(P0, P1, P2, None);
   }
 
 #define LLVM_DEFINE_OVERLOAD(N) \

diff --git a/include/llvm/ADT/ilist.h b/include/llvm/ADT/ilist.h
index bc14845..8c19a6f 100644
--- a/include/llvm/ADT/ilist.h
+++ b/include/llvm/ADT/ilist.h

@@ -579,60 +579,6 @@
   void splice(iterator where, iplist &L2, iterator first, iterator last) {
     if (first != last) transfer(where, L2, first, last);
   }
-
-
-
-  //===----------------------------------------------------------------------===
-  // High-Level Functionality that shouldn't really be here, but is part of list
-  //
-
-  // These two functions are actually called remove/remove_if in list<>, but
-  // they actually do the job of erase, rename them accordingly.
-  //
-  void erase(const NodeTy &val) {
-    for (iterator I = begin(), E = end(); I != E; ) {
-      iterator next = I; ++next;
-      if (*I == val) erase(I);
-      I = next;
-    }
-  }
-  template<class Pr1> void erase_if(Pr1 pred) {
-    for (iterator I = begin(), E = end(); I != E; ) {
-      iterator next = I; ++next;
-      if (pred(*I)) erase(I);
-      I = next;
-    }
-  }
-
-  template<class Pr2> void unique(Pr2 pred) {
-    if (empty()) return;
-    for (iterator I = begin(), E = end(), Next = begin(); ++Next != E;) {
-      if (pred(*I))
-        erase(Next);
-      else
-        I = Next;
-      Next = I;
-    }
-  }
-  void unique() { unique(op_equal); }
-
-  template<class Pr3> void merge(iplist &right, Pr3 pred) {
-    iterator first1 = begin(), last1 = end();
-    iterator first2 = right.begin(), last2 = right.end();
-    while (first1 != last1 && first2 != last2)
-      if (pred(*first2, *first1)) {
-        iterator next = first2;
-        transfer(first1, right, first2, ++next);
-        first2 = next;
-      } else {
-        ++first1;
-      }
-    if (first2 != last2) transfer(last1, right, first2, last2);
-  }
-  void merge(iplist &right) { return merge(right, op_less); }
-
-  template<class Pr3> void sort(Pr3 pred);
-  void sort() { sort(op_less); }
 };
 
 

diff --git a/include/llvm/ADT/ilist_node.h b/include/llvm/ADT/ilist_node.h
index 85aa7a4..26d0b55 100644
--- a/include/llvm/ADT/ilist_node.h
+++ b/include/llvm/ADT/ilist_node.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ADT_ILISTNODE_H
-#define LLVM_ADT_ILISTNODE_H
+#ifndef LLVM_ADT_ILIST_NODE_H
+#define LLVM_ADT_ILIST_NODE_H
 
 namespace llvm {
 

diff --git a/include/llvm/ADT/iterator_range.h b/include/llvm/ADT/iterator_range.h
index dd17d6c..ecaf4a2 100644
--- a/include/llvm/ADT/iterator_range.h
+++ b/include/llvm/ADT/iterator_range.h

@@ -48,6 +48,10 @@
 template <class T> iterator_range<T> make_range(T x, T y) {
   return iterator_range<T>(std::move(x), std::move(y));
 }
+
+template <typename T> iterator_range<T> make_range(std::pair<T, T> p) {
+  return iterator_range<T>(std::move(p.first), std::move(p.second));
+}
 }
 
 #endif

diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 79d52fc..9bfa045 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h

@@ -39,6 +39,7 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/CallSite.h"
+#include "llvm/IR/Metadata.h"
 
 namespace llvm {
 
@@ -112,13 +113,14 @@
     /// there are restrictions on stepping out of one object and into another.
     /// See http://llvm.org/docs/LangRef.html#pointeraliasing
     uint64_t Size;
-    /// TBAATag - The metadata node which describes the TBAA type of
-    /// the location, or null if there is no known unique tag.
-    const MDNode *TBAATag;
+    /// AATags - The metadata nodes which describe the aliasing of the
+    /// location (each member is null if that kind of information is
+    /// unavailable).
+    AAMDNodes AATags;
 
     explicit Location(const Value *P = nullptr, uint64_t S = UnknownSize,
-                      const MDNode *N = nullptr)
-      : Ptr(P), Size(S), TBAATag(N) {}
+                      const AAMDNodes &N = AAMDNodes())
+      : Ptr(P), Size(S), AATags(N) {}
 
     Location getWithNewPtr(const Value *NewPtr) const {
       Location Copy(*this);
@@ -132,9 +134,9 @@
       return Copy;
     }
 
-    Location getWithoutTBAATag() const {
+    Location getWithoutAATags() const {
       Location Copy(*this);
-      Copy.TBAATag = nullptr;
+      Copy.AATags = AAMDNodes();
       return Copy;
     }
   };
@@ -566,25 +568,23 @@
 template<>
 struct DenseMapInfo<AliasAnalysis::Location> {
   static inline AliasAnalysis::Location getEmptyKey() {
-    return
-      AliasAnalysis::Location(DenseMapInfo<const Value *>::getEmptyKey(),
-                              0, nullptr);
+    return AliasAnalysis::Location(DenseMapInfo<const Value *>::getEmptyKey(),
+                                   0);
   }
   static inline AliasAnalysis::Location getTombstoneKey() {
-    return
-      AliasAnalysis::Location(DenseMapInfo<const Value *>::getTombstoneKey(),
-                              0, nullptr);
+    return AliasAnalysis::Location(
+        DenseMapInfo<const Value *>::getTombstoneKey(), 0);
   }
   static unsigned getHashValue(const AliasAnalysis::Location &Val) {
     return DenseMapInfo<const Value *>::getHashValue(Val.Ptr) ^
            DenseMapInfo<uint64_t>::getHashValue(Val.Size) ^
-           DenseMapInfo<const MDNode *>::getHashValue(Val.TBAATag);
+           DenseMapInfo<AAMDNodes>::getHashValue(Val.AATags);
   }
   static bool isEqual(const AliasAnalysis::Location &LHS,
                       const AliasAnalysis::Location &RHS) {
     return LHS.Ptr == RHS.Ptr &&
            LHS.Size == RHS.Size &&
-           LHS.TBAATag == RHS.TBAATag;
+           LHS.AATags == RHS.AATags;
   }
 };
 
@@ -605,6 +605,13 @@
 ///
 bool isIdentifiedObject(const Value *V);
 
+/// isIdentifiedFunctionLocal - Return true if V is unambiguously identified
+/// at the function-level. Different IdentifiedFunctionLocals can't alias.
+/// Further, an IdentifiedFunctionLocal can not alias with any function
+/// arguments other than itself, which is not necessarily true for
+/// IdentifiedObjects.
+bool isIdentifiedFunctionLocal(const Value *V);
+
 } // End llvm namespace
 
 #endif

diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index 6117d91..036d58d 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h

@@ -20,6 +20,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/ilist.h"
 #include "llvm/ADT/ilist_node.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/ValueHandle.h"
 #include <vector>
 
@@ -40,11 +41,11 @@
     PointerRec **PrevInList, *NextInList;
     AliasSet *AS;
     uint64_t Size;
-    const MDNode *TBAAInfo;
+    AAMDNodes AAInfo;
   public:
     PointerRec(Value *V)
       : Val(V), PrevInList(nullptr), NextInList(nullptr), AS(nullptr), Size(0),
-        TBAAInfo(DenseMapInfo<const MDNode *>::getEmptyKey()) {}
+        AAInfo(DenseMapInfo<AAMDNodes>::getEmptyKey()) {}
 
     Value *getValue() const { return Val; }
     
@@ -56,27 +57,27 @@
       return &NextInList;
     }
 
-    void updateSizeAndTBAAInfo(uint64_t NewSize, const MDNode *NewTBAAInfo) {
+    void updateSizeAndAAInfo(uint64_t NewSize, const AAMDNodes &NewAAInfo) {
       if (NewSize > Size) Size = NewSize;
 
-      if (TBAAInfo == DenseMapInfo<const MDNode *>::getEmptyKey())
-        // We don't have a TBAAInfo yet. Set it to NewTBAAInfo.
-        TBAAInfo = NewTBAAInfo;
-      else if (TBAAInfo != NewTBAAInfo)
-        // NewTBAAInfo conflicts with TBAAInfo.
-        TBAAInfo = DenseMapInfo<const MDNode *>::getTombstoneKey();
+      if (AAInfo == DenseMapInfo<AAMDNodes>::getEmptyKey())
+        // We don't have an AAInfo yet. Set it to NewAAInfo.
+        AAInfo = NewAAInfo;
+      else if (AAInfo != NewAAInfo)
+        // NewAAInfo conflicts with AAInfo.
+        AAInfo = DenseMapInfo<AAMDNodes>::getTombstoneKey();
     }
 
     uint64_t getSize() const { return Size; }
 
-    /// getTBAAInfo - Return the TBAAInfo, or null if there is no
+    /// getAAInfo - Return the AAInfo, or AAMDNodes() if there is no
     /// information or conflicting information.
-    const MDNode *getTBAAInfo() const {
-      // If we have missing or conflicting TBAAInfo, return null.
-      if (TBAAInfo == DenseMapInfo<const MDNode *>::getEmptyKey() ||
-          TBAAInfo == DenseMapInfo<const MDNode *>::getTombstoneKey())
-        return nullptr;
-      return TBAAInfo;
+    AAMDNodes getAAInfo() const {
+      // If we have missing or conflicting AAInfo, return AAMDNodes().
+      if (AAInfo == DenseMapInfo<AAMDNodes>::getEmptyKey() ||
+          AAInfo == DenseMapInfo<AAMDNodes>::getTombstoneKey())
+        return AAMDNodes();
+      return AAInfo;
     }
 
     AliasSet *getAliasSet(AliasSetTracker &AST) {
@@ -204,7 +205,7 @@
 
     Value *getPointer() const { return CurNode->getValue(); }
     uint64_t getSize() const { return CurNode->getSize(); }
-    const MDNode *getTBAAInfo() const { return CurNode->getTBAAInfo(); }
+    AAMDNodes getAAInfo() const { return CurNode->getAAInfo(); }
 
     iterator& operator++() {                // Preincrement
       assert(CurNode && "Advancing past AliasSet.end()!");
@@ -250,16 +251,19 @@
   void removeFromTracker(AliasSetTracker &AST);
 
   void addPointer(AliasSetTracker &AST, PointerRec &Entry, uint64_t Size,
-                  const MDNode *TBAAInfo,
+                  const AAMDNodes &AAInfo,
                   bool KnownMustAlias = false);
   void addUnknownInst(Instruction *I, AliasAnalysis &AA);
-  void removeUnknownInst(Instruction *I) {
+  void removeUnknownInst(AliasSetTracker &AST, Instruction *I) {
+    bool WasEmpty = UnknownInsts.empty();
     for (size_t i = 0, e = UnknownInsts.size(); i != e; ++i)
       if (UnknownInsts[i] == I) {
         UnknownInsts[i] = UnknownInsts.back();
         UnknownInsts.pop_back();
         --i; --e;  // Revisit the moved entry.
       }
+    if (!WasEmpty && UnknownInsts.empty())
+      dropRef(AST);
   }
   void setVolatile() { Volatile = true; }
 
@@ -267,7 +271,7 @@
   /// aliasesPointer - Return true if the specified pointer "may" (or must)
   /// alias one of the members in the set.
   ///
-  bool aliasesPointer(const Value *Ptr, uint64_t Size, const MDNode *TBAAInfo,
+  bool aliasesPointer(const Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo,
                       AliasAnalysis &AA) const;
   bool aliasesUnknownInst(Instruction *Inst, AliasAnalysis &AA) const;
 };
@@ -322,7 +326,7 @@
   /// These methods return true if inserting the instruction resulted in the
   /// addition of a new alias set (i.e., the pointer did not alias anything).
   ///
-  bool add(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo); // Add a location
+  bool add(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo); // Add a loc.
   bool add(LoadInst *LI);
   bool add(StoreInst *SI);
   bool add(VAArgInst *VAAI);
@@ -335,7 +339,7 @@
   /// be aliased by the specified instruction.  These methods return true if any
   /// alias sets were eliminated.
   // Remove a location
-  bool remove(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo);
+  bool remove(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo);
   bool remove(LoadInst *LI);
   bool remove(StoreInst *SI);
   bool remove(VAArgInst *VAAI);
@@ -354,20 +358,24 @@
   /// true if a new alias set is created to contain the pointer (because the
   /// pointer didn't alias anything).
   AliasSet &getAliasSetForPointer(Value *P, uint64_t Size,
-                                  const MDNode *TBAAInfo,
+                                  const AAMDNodes &AAInfo,
                                   bool *New = nullptr);
 
   /// getAliasSetForPointerIfExists - Return the alias set containing the
   /// location specified if one exists, otherwise return null.
   AliasSet *getAliasSetForPointerIfExists(Value *P, uint64_t Size,
-                                          const MDNode *TBAAInfo) {
-    return findAliasSetForPointer(P, Size, TBAAInfo);
+                                          const AAMDNodes &AAInfo) {
+    return findAliasSetForPointer(P, Size, AAInfo);
   }
 
   /// containsPointer - Return true if the specified location is represented by
   /// this alias set, false otherwise.  This does not modify the AST object or
   /// alias sets.
-  bool containsPointer(Value *P, uint64_t Size, const MDNode *TBAAInfo) const;
+  bool containsPointer(Value *P, uint64_t Size, const AAMDNodes &AAInfo) const;
+
+  /// Return true if the specified instruction "may" (or must) alias one of the
+  /// members in any of the sets.
+  bool containsUnknown(Instruction *I) const;
 
   /// getAliasAnalysis - Return the underlying alias analysis object used by
   /// this tracker.
@@ -414,16 +422,16 @@
     return *Entry;
   }
 
-  AliasSet &addPointer(Value *P, uint64_t Size, const MDNode *TBAAInfo,
+  AliasSet &addPointer(Value *P, uint64_t Size, const AAMDNodes &AAInfo,
                        AliasSet::AccessType E,
                        bool &NewSet) {
     NewSet = false;
-    AliasSet &AS = getAliasSetForPointer(P, Size, TBAAInfo, &NewSet);
+    AliasSet &AS = getAliasSetForPointer(P, Size, AAInfo, &NewSet);
     AS.AccessTy |= E;
     return AS;
   }
   AliasSet *findAliasSetForPointer(const Value *Ptr, uint64_t Size,
-                                   const MDNode *TBAAInfo);
+                                   const AAMDNodes &AAInfo);
 
   AliasSet *findAliasSetForUnknownInst(Instruction *Inst);
 };

diff --git a/include/llvm/Analysis/AssumptionTracker.h b/include/llvm/Analysis/AssumptionTracker.h
new file mode 100644
index 0000000..5a050a8
--- /dev/null
+++ b/include/llvm/Analysis/AssumptionTracker.h

@@ -0,0 +1,128 @@
+//===- llvm/Analysis/AssumptionTracker.h - Track @llvm.assume ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that keeps track of @llvm.assume intrinsics in
+// the functions of a module (allowing assumptions within any function to be
+// found cheaply by other parts of the optimizer).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_ASSUMPTIONTRACKER_H
+#define LLVM_ANALYSIS_ASSUMPTIONTRACKER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include <memory>
+
+namespace llvm {
+
+/// An immutable pass that tracks @llvm.assume intrinsics in a module.
+class AssumptionTracker : public ImmutablePass {
+  /// A callback value handle applied to function objects, which we use to
+  /// delete our cache of intrinsics for a function when it is deleted.
+  class FunctionCallbackVH : public CallbackVH {
+    AssumptionTracker *AT;
+    void deleted() override;
+
+    public:
+      typedef DenseMapInfo<Value *> DMI;
+
+      FunctionCallbackVH(Value *V, AssumptionTracker *AT = nullptr)
+        : CallbackVH(V), AT(AT) {}
+  };
+
+  /// A callback value handle applied to call instructions, which keeps
+  /// track of the call's parent function so that we can remove an
+  /// assumption intrinsic call from our cache when the instruction is
+  /// deleted.
+  class CallCallbackVH : public CallbackVH {
+    AssumptionTracker *AT;
+    void deleted() override;
+
+    // We store the function here because we need it to lookup the set
+    // containing this handle when the underlying CallInst is being deleted.
+    Function *F;
+
+    public:
+      typedef DenseMapInfo<Instruction *> DMI;
+
+      CallCallbackVH(Instruction *I, AssumptionTracker *AT = nullptr)
+        : CallbackVH(I), AT(AT), F(nullptr) {
+        if (I != DMI::getEmptyKey() && I != DMI::getTombstoneKey())
+          F = I->getParent()->getParent();
+      }
+
+      operator CallInst*() const {
+        Value *V = getValPtr();
+        if (V == DMI::getEmptyKey() || V == DMI::getTombstoneKey())
+          return reinterpret_cast<CallInst*>(V);
+
+        return cast<CallInst>(V);
+      }
+
+      CallInst *operator->() const { return cast<CallInst>(getValPtr()); }
+      CallInst &operator*() const { return *cast<CallInst>(getValPtr()); }
+  };
+
+  friend FunctionCallbackVH;
+  friend CallCallbackVH;
+
+  // FIXME: SmallSet might be better here, but it currently has no iterators.
+  typedef DenseSet<CallCallbackVH, CallCallbackVH::DMI> CallHandleSet;
+  typedef DenseMap<FunctionCallbackVH, std::unique_ptr<CallHandleSet>,
+                   FunctionCallbackVH::DMI> FunctionCallsMap;
+  FunctionCallsMap CachedAssumeCalls;
+
+  /// Scan the provided function for @llvm.assume intrinsic calls. Returns an
+  /// iterator to the set for this function in the CachedAssumeCalls map.
+  FunctionCallsMap::iterator scanFunction(Function *F);
+
+public:
+  /// Remove the cache of @llvm.assume intrinsics for the given function.
+  void forgetCachedAssumptions(Function *F);
+
+  /// Add an @llvm.assume intrinsic to the cache for its parent function.
+  void registerAssumption(CallInst *CI);
+
+  typedef CallHandleSet::iterator assumption_iterator;
+  typedef iterator_range<assumption_iterator> assumption_range;
+
+  inline assumption_range assumptions(Function *F) {
+    FunctionCallsMap::iterator I = CachedAssumeCalls.find_as(F);
+    if (I == CachedAssumeCalls.end()) {
+      I = scanFunction(F);
+    }
+
+    return assumption_range(I->second->begin(), I->second->end());
+  }
+
+  AssumptionTracker();
+  ~AssumptionTracker();
+
+  void releaseMemory() override {
+    CachedAssumeCalls.shrink_and_clear();
+  }
+
+  void verifyAnalysis() const override;
+  bool doFinalization(Module &) override {
+    verifyAnalysis();
+    return false;
+  }
+
+  static char ID; // Pass identification, replacement for typeid
+};
+
+} // end namespace llvm
+
+#endif

diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 7340801..57b5154 100644
--- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h

@@ -31,15 +31,26 @@
 
 #define DEBUG_TYPE "block-freq"
 
-//===----------------------------------------------------------------------===//
-//
-// BlockMass definition.
-//
-// TODO: Make this private to BlockFrequencyInfoImpl or delete.
-//
-//===----------------------------------------------------------------------===//
 namespace llvm {
 
+class BasicBlock;
+class BranchProbabilityInfo;
+class Function;
+class Loop;
+class LoopInfo;
+class MachineBasicBlock;
+class MachineBranchProbabilityInfo;
+class MachineFunction;
+class MachineLoop;
+class MachineLoopInfo;
+
+namespace bfi_detail {
+
+struct IrreducibleGraph;
+
+// This is part of a workaround for a GCC 4.7 crash on lambdas.
+template <class BT> struct BlockEdgesAdder;
+
 /// \brief Mass of a block.
 ///
 /// This class implements a sort of fixed-point fraction always between 0.0 and
@@ -128,35 +139,11 @@
   return X.print(OS);
 }
 
-template <> struct isPodLike<BlockMass> {
+} // end namespace bfi_detail
+
+template <> struct isPodLike<bfi_detail::BlockMass> {
   static const bool value = true;
 };
-}
-
-//===----------------------------------------------------------------------===//
-//
-// BlockFrequencyInfoImpl definition.
-//
-//===----------------------------------------------------------------------===//
-namespace llvm {
-
-class BasicBlock;
-class BranchProbabilityInfo;
-class Function;
-class Loop;
-class LoopInfo;
-class MachineBasicBlock;
-class MachineBranchProbabilityInfo;
-class MachineFunction;
-class MachineLoop;
-class MachineLoopInfo;
-
-namespace bfi_detail {
-struct IrreducibleGraph;
-
-// This is part of a workaround for a GCC 4.7 crash on lambdas.
-template <class BT> struct BlockEdgesAdder;
-}
 
 /// \brief Base class for BlockFrequencyInfoImpl
 ///
@@ -169,6 +156,7 @@
 class BlockFrequencyInfoImplBase {
 public:
   typedef ScaledNumber<uint64_t> Scaled64;
+  typedef bfi_detail::BlockMass BlockMass;
 
   /// \brief Representative of a block.
   ///
@@ -272,7 +260,7 @@
     /// loop.
     ///
     /// This function should only be called when distributing mass.  As long as
-    /// there are no irreducilbe edges to Node, then it will have complexity
+    /// there are no irreducible edges to Node, then it will have complexity
     /// O(1) in this context.
     ///
     /// In general, the complexity is O(L), where L is the number of loop
@@ -334,6 +322,8 @@
     BlockNode TargetNode;
     uint64_t Amount;
     Weight() : Type(Local), Amount(0) {}
+    Weight(DistType Type, BlockNode TargetNode, uint64_t Amount)
+        : Type(Type), TargetNode(TargetNode), Amount(Amount) {}
   };
 
   /// \brief Distribution of unscaled probability weight.
@@ -1183,7 +1173,8 @@
   OS << "\n";
   return OS;
 }
-}
+
+} // end namespace llvm
 
 #undef DEBUG_TYPE
 

diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h
index e6d2ed1..0357648 100644
--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h

@@ -72,13 +72,13 @@
         OutStr.erase(OutStr.begin()+i, OutStr.begin()+Idx);
         --i;
       } else if (ColNum == MaxColumns) {                  // Wrap lines.
-        if (LastSpace) {
-          OutStr.insert(LastSpace, "\\l...");
-          ColNum = i - LastSpace;
-          LastSpace = 0;
-          i += 3; // The loop will advance 'i' again.
-        }
-        // Else keep trying to find a space.
+        // Wrap very long names even though we can't find a space.
+        if (!LastSpace)
+          LastSpace = i;
+        OutStr.insert(LastSpace, "\\l...");
+        ColNum = i - LastSpace;
+        LastSpace = 0;
+        i += 3; // The loop will advance 'i' again.
       }
       else
         ++ColNum;

diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h
index 09101ae..1533b36 100644
--- a/include/llvm/Analysis/CGSCCPassManager.h
+++ b/include/llvm/Analysis/CGSCCPassManager.h

@@ -18,8 +18,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ANALYSIS_CGSCC_PASS_MANAGER_H
-#define LLVM_ANALYSIS_CGSCC_PASS_MANAGER_H
+#ifndef LLVM_ANALYSIS_CGSCCPASSMANAGER_H
+#define LLVM_ANALYSIS_CGSCCPASSMANAGER_H
 
 #include "llvm/IR/PassManager.h"
 #include "llvm/Analysis/LazyCallGraph.h"

diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h
index 9a6a4a7..76d9073 100644
--- a/include/llvm/Analysis/CallGraph.h
+++ b/include/llvm/Analysis/CallGraph.h

@@ -58,7 +58,6 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
-#include "llvm/Support/IncludeFile.h"
 #include <map>
 
 namespace llvm {
@@ -418,13 +417,24 @@
 
 template <> struct GraphTraits<const CallGraphNode *> {
   typedef const CallGraphNode NodeType;
-  typedef NodeType::const_iterator ChildIteratorType;
+
+  typedef CallGraphNode::CallRecord CGNPairTy;
+  typedef std::pointer_to_unary_function<CGNPairTy, const CallGraphNode *>
+      CGNDerefFun;
 
   static NodeType *getEntryNode(const CallGraphNode *CGN) { return CGN; }
+
+  typedef mapped_iterator<NodeType::const_iterator, CGNDerefFun>
+      ChildIteratorType;
+
   static inline ChildIteratorType child_begin(NodeType *N) {
-    return N->begin();
+    return map_iterator(N->begin(), CGNDerefFun(CGNDeref));
   }
-  static inline ChildIteratorType child_end(NodeType *N) { return N->end(); }
+  static inline ChildIteratorType child_end(NodeType *N) {
+    return map_iterator(N->end(), CGNDerefFun(CGNDeref));
+  }
+
+  static const CallGraphNode *CGNDeref(CGNPairTy P) { return P.second; }
 };
 
 template <>
@@ -451,17 +461,24 @@
 struct GraphTraits<const CallGraph *> : public GraphTraits<
                                             const CallGraphNode *> {
   static NodeType *getEntryNode(const CallGraph *CGN) {
-    return CGN->getExternalCallingNode();
+    return CGN->getExternalCallingNode(); // Start at the external node!
   }
+  typedef std::pair<const Function *, const CallGraphNode *> PairTy;
+  typedef std::pointer_to_unary_function<PairTy, const CallGraphNode &>
+      DerefFun;
+
   // nodes_iterator/begin/end - Allow iteration over all nodes in the graph
-  typedef CallGraph::const_iterator nodes_iterator;
-  static nodes_iterator nodes_begin(const CallGraph *CG) { return CG->begin(); }
-  static nodes_iterator nodes_end(const CallGraph *CG) { return CG->end(); }
+  typedef mapped_iterator<CallGraph::const_iterator, DerefFun> nodes_iterator;
+  static nodes_iterator nodes_begin(const CallGraph *CG) {
+    return map_iterator(CG->begin(), DerefFun(CGdereference));
+  }
+  static nodes_iterator nodes_end(const CallGraph *CG) {
+    return map_iterator(CG->end(), DerefFun(CGdereference));
+  }
+
+  static const CallGraphNode &CGdereference(PairTy P) { return *P.second; }
 };
 
 } // End llvm namespace
 
-// Make sure that any clients of this file link in CallGraph.cpp
-FORCE_DEFINING_FILE_TO_BE_LINKED(CallGraph)
-
 #endif

diff --git a/include/llvm/Analysis/CaptureTracking.h b/include/llvm/Analysis/CaptureTracking.h
index eccf1f8..8b7c7a9 100644
--- a/include/llvm/Analysis/CaptureTracking.h
+++ b/include/llvm/Analysis/CaptureTracking.h

@@ -18,6 +18,8 @@
 
   class Value;
   class Use;
+  class Instruction;
+  class DominatorTree;
 
   /// PointerMayBeCaptured - Return true if this pointer value may be captured
   /// by the enclosing function (which is required to exist).  This routine can
@@ -30,6 +32,20 @@
                             bool ReturnCaptures,
                             bool StoreCaptures);
 
+  /// PointerMayBeCapturedBefore - Return true if this pointer value may be
+  /// captured by the enclosing function (which is required to exist). If a
+  /// DominatorTree is provided, only captures which happen before the given
+  /// instruction are considered. This routine can be expensive, so consider
+  /// caching the results.  The boolean ReturnCaptures specifies whether
+  /// returning the value (or part of it) from the function counts as capturing
+  /// it or not.  The boolean StoreCaptures specifies whether storing the value
+  /// (or part of it) into memory anywhere automatically counts as capturing it
+  /// or not. Captures by the provided instruction are considered if the
+  /// final parameter is true.
+  bool PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
+                                  bool StoreCaptures, const Instruction *I,
+                                  DominatorTree *DT, bool IncludeI = false);
+
   /// This callback is used in conjunction with PointerMayBeCaptured. In
   /// addition to the interface here, you'll need to provide your own getters
   /// to see whether anything was captured.

diff --git a/include/llvm/Analysis/CodeMetrics.h b/include/llvm/Analysis/CodeMetrics.h
index 04b39c1..59502df 100644
--- a/include/llvm/Analysis/CodeMetrics.h
+++ b/include/llvm/Analysis/CodeMetrics.h

@@ -16,10 +16,13 @@
 #define LLVM_ANALYSIS_CODEMETRICS_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/CallSite.h"
 
 namespace llvm {
+class AssumptionTracker;
 class BasicBlock;
+class Loop;
 class Function;
 class Instruction;
 class DataLayout;
@@ -85,7 +88,18 @@
         NumInlineCandidates(0), NumVectorInsts(0), NumRets(0) {}
 
   /// \brief Add information about a block to the current state.
-  void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI);
+  void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI,
+                         SmallPtrSetImpl<const Value*> &EphValues);
+
+  /// \brief Collect a loop's ephemeral values (those used only by an assume
+  /// or similar intrinsics in the loop).
+  static void collectEphemeralValues(const Loop *L, AssumptionTracker *AT,
+                                     SmallPtrSetImpl<const Value*> &EphValues);
+
+  /// \brief Collect a function's ephemeral values (those used only by an
+  /// assume or similar intrinsics in the function).
+  static void collectEphemeralValues(const Function *L, AssumptionTracker *AT,
+                                     SmallPtrSetImpl<const Value*> &EphValues);
 };
 
 }

diff --git a/include/llvm/Analysis/DOTGraphTraitsPass.h b/include/llvm/Analysis/DOTGraphTraitsPass.h
index 53c832c..cb74e9f 100644
--- a/include/llvm/Analysis/DOTGraphTraitsPass.h
+++ b/include/llvm/Analysis/DOTGraphTraitsPass.h

@@ -66,15 +66,15 @@
   bool runOnFunction(Function &F) override {
     GraphT Graph = AnalysisGraphTraitsT::getGraph(&getAnalysis<AnalysisT>());
     std::string Filename = Name + "." + F.getName().str() + ".dot";
-    std::string ErrorInfo;
+    std::error_code EC;
 
     errs() << "Writing '" << Filename << "'...";
 
-    raw_fd_ostream File(Filename.c_str(), ErrorInfo, sys::fs::F_Text);
+    raw_fd_ostream File(Filename, EC, sys::fs::F_Text);
     std::string GraphName = DOTGraphTraits<GraphT>::getGraphName(Graph);
     std::string Title = GraphName + " for '" + F.getName().str() + "' function";
 
-    if (ErrorInfo.empty())
+    if (!EC)
       WriteGraph(File, Graph, IsSimple, Title);
     else
       errs() << "  error opening file for writing!";
@@ -129,14 +129,14 @@
   bool runOnModule(Module &M) override {
     GraphT Graph = AnalysisGraphTraitsT::getGraph(&getAnalysis<AnalysisT>());
     std::string Filename = Name + ".dot";
-    std::string ErrorInfo;
+    std::error_code EC;
 
     errs() << "Writing '" << Filename << "'...";
 
-    raw_fd_ostream File(Filename.c_str(), ErrorInfo, sys::fs::F_Text);
+    raw_fd_ostream File(Filename, EC, sys::fs::F_Text);
     std::string Title = DOTGraphTraits<GraphT>::getGraphName(Graph);
 
-    if (ErrorInfo.empty())
+    if (!EC)
       WriteGraph(File, Graph, IsSimple, Title);
     else
       errs() << "  error opening file for writing!";

diff --git a/include/llvm/Analysis/DependenceAnalysis.h b/include/llvm/Analysis/DependenceAnalysis.h
index 279755e..1041e3f 100644
--- a/include/llvm/Analysis/DependenceAnalysis.h
+++ b/include/llvm/Analysis/DependenceAnalysis.h

@@ -287,9 +287,9 @@
     /// The flag PossiblyLoopIndependent should be set by the caller
     /// if it appears that control flow can reach from Src to Dst
     /// without traversing a loop back edge.
-    Dependence *depends(Instruction *Src,
-                        Instruction *Dst,
-                        bool PossiblyLoopIndependent);
+    std::unique_ptr<Dependence> depends(Instruction *Src,
+                                        Instruction *Dst,
+                                        bool PossiblyLoopIndependent);
 
     /// getSplitIteration - Give a dependence that's splittable at some
     /// particular level, return the iteration that should be used to split
@@ -331,7 +331,7 @@
     ///
     /// breaks the dependence and allows us to vectorize/parallelize
     /// both loops.
-    const SCEV *getSplitIteration(const Dependence *Dep, unsigned Level);
+    const SCEV *getSplitIteration(const Dependence &Dep, unsigned Level);
 
   private:
     AliasAnalysis *AA;
@@ -523,6 +523,12 @@
     /// in LoopNest.
     bool isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const;
 
+    /// Makes sure both subscripts (i.e. Pair->Src and Pair->Dst) share the same
+    /// integer type by sign-extending one of them when necessary.
+    /// Sign-extending a subscript is safe because getelementptr assumes the
+    /// array subscripts are signed.
+    void unifySubscriptType(Subscript *Pair);
+
     /// removeMatchingExtensions - Examines a subscript pair.
     /// If the source and destination are identically sign (or zero)
     /// extended, it strips off the extension in an effort to
@@ -911,7 +917,7 @@
 
     bool tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV,
                         SmallVectorImpl<Subscript> &Pair,
-                        const SCEV *ElementSize) const;
+                        const SCEV *ElementSize);
 
   public:
     static char ID; // Class identification, replacement for typeinfo

diff --git a/include/llvm/Analysis/DominanceFrontier.h b/include/llvm/Analysis/DominanceFrontier.h
index 0fbaa13..996700e 100644
--- a/include/llvm/Analysis/DominanceFrontier.h
+++ b/include/llvm/Analysis/DominanceFrontier.h

@@ -23,168 +23,188 @@
 #include <set>
 
 namespace llvm {
-  
+
 //===----------------------------------------------------------------------===//
 /// DominanceFrontierBase - Common base class for computing forward and inverse
 /// dominance frontiers for a function.
 ///
-class DominanceFrontierBase : public FunctionPass {
+template <class BlockT>
+class DominanceFrontierBase {
 public:
-  typedef std::set<BasicBlock*>             DomSetType;    // Dom set for a bb
-  typedef std::map<BasicBlock*, DomSetType> DomSetMapType; // Dom set map
+  typedef std::set<BlockT *> DomSetType;                // Dom set for a bb
+  typedef std::map<BlockT *, DomSetType> DomSetMapType; // Dom set map
+
 protected:
+  typedef GraphTraits<BlockT *> BlockTraits;
+
   DomSetMapType Frontiers;
-  std::vector<BasicBlock*> Roots;
+  std::vector<BlockT *> Roots;
   const bool IsPostDominators;
 
 public:
-  DominanceFrontierBase(char &ID, bool isPostDom)
-    : FunctionPass(ID), IsPostDominators(isPostDom) {}
+  DominanceFrontierBase(bool isPostDom) : IsPostDominators(isPostDom) {}
 
   /// getRoots - Return the root blocks of the current CFG.  This may include
   /// multiple blocks if we are computing post dominators.  For forward
   /// dominators, this will always be a single block (the entry node).
   ///
-  inline const std::vector<BasicBlock*> &getRoots() const { return Roots; }
+  inline const std::vector<BlockT *> &getRoots() const {
+    return Roots;
+  }
+
+  BlockT *getRoot() const {
+    assert(Roots.size() == 1 && "Should always have entry node!");
+    return Roots[0];
+  }
 
   /// isPostDominator - Returns true if analysis is based on postdoms
   ///
-  bool isPostDominator() const { return IsPostDominators; }
+  bool isPostDominator() const {
+    return IsPostDominators;
+  }
 
-  void releaseMemory() override { Frontiers.clear(); }
+  void releaseMemory() {
+    Frontiers.clear();
+  }
 
   // Accessor interface:
-  typedef DomSetMapType::iterator iterator;
-  typedef DomSetMapType::const_iterator const_iterator;
-  iterator       begin()       { return Frontiers.begin(); }
+  typedef typename DomSetMapType::iterator iterator;
+  typedef typename DomSetMapType::const_iterator const_iterator;
+  iterator begin() { return Frontiers.begin(); }
   const_iterator begin() const { return Frontiers.begin(); }
-  iterator       end()         { return Frontiers.end(); }
-  const_iterator end()   const { return Frontiers.end(); }
-  iterator       find(BasicBlock *B)       { return Frontiers.find(B); }
-  const_iterator find(BasicBlock *B) const { return Frontiers.find(B); }
+  iterator end() { return Frontiers.end(); }
+  const_iterator end() const { return Frontiers.end(); }
+  iterator find(BlockT *B) { return Frontiers.find(B); }
+  const_iterator find(BlockT *B) const { return Frontiers.find(B); }
 
-  iterator addBasicBlock(BasicBlock *BB, const DomSetType &frontier) {
+  iterator addBasicBlock(BlockT *BB, const DomSetType &frontier) {
     assert(find(BB) == end() && "Block already in DominanceFrontier!");
     return Frontiers.insert(std::make_pair(BB, frontier)).first;
   }
 
   /// removeBlock - Remove basic block BB's frontier.
-  void removeBlock(BasicBlock *BB) {
-    assert(find(BB) != end() && "Block is not in DominanceFrontier!");
-    for (iterator I = begin(), E = end(); I != E; ++I)
-      I->second.erase(BB);
-    Frontiers.erase(BB);
-  }
+  void removeBlock(BlockT *BB);
 
-  void addToFrontier(iterator I, BasicBlock *Node) {
-    assert(I != end() && "BB is not in DominanceFrontier!");
-    I->second.insert(Node);
-  }
+  void addToFrontier(iterator I, BlockT *Node);
 
-  void removeFromFrontier(iterator I, BasicBlock *Node) {
-    assert(I != end() && "BB is not in DominanceFrontier!");
-    assert(I->second.count(Node) && "Node is not in DominanceFrontier of BB");
-    I->second.erase(Node);
-  }
+  void removeFromFrontier(iterator I, BlockT *Node);
 
   /// compareDomSet - Return false if two domsets match. Otherwise
   /// return true;
-  bool compareDomSet(DomSetType &DS1, const DomSetType &DS2) const {
-    std::set<BasicBlock *> tmpSet;
-    for (DomSetType::const_iterator I = DS2.begin(),
-           E = DS2.end(); I != E; ++I)
-      tmpSet.insert(*I);
-
-    for (DomSetType::const_iterator I = DS1.begin(),
-           E = DS1.end(); I != E; ) {
-      BasicBlock *Node = *I++;
-
-      if (tmpSet.erase(Node) == 0)
-        // Node is in DS1 but not in DS2.
-        return true;
-    }
-
-    if (!tmpSet.empty())
-      // There are nodes that are in DS2 but not in DS1.
-      return true;
-
-    // DS1 and DS2 matches.
-    return false;
-  }
+  bool compareDomSet(DomSetType &DS1, const DomSetType &DS2) const;
 
   /// compare - Return false if the other dominance frontier base matches
   /// this dominance frontier base. Otherwise return true.
-  bool compare(DominanceFrontierBase &Other) const {
-    DomSetMapType tmpFrontiers;
-    for (DomSetMapType::const_iterator I = Other.begin(),
-           E = Other.end(); I != E; ++I)
-      tmpFrontiers.insert(std::make_pair(I->first, I->second));
-
-    for (DomSetMapType::iterator I = tmpFrontiers.begin(),
-           E = tmpFrontiers.end(); I != E; ) {
-      BasicBlock *Node = I->first;
-      const_iterator DFI = find(Node);
-      if (DFI == end())
-        return true;
-
-      if (compareDomSet(I->second, DFI->second))
-        return true;
-
-      ++I;
-      tmpFrontiers.erase(Node);
-    }
-
-    if (!tmpFrontiers.empty())
-      return true;
-
-    return false;
-  }
+  bool compare(DominanceFrontierBase<BlockT> &Other) const;
 
   /// print - Convert to human readable form
   ///
-  void print(raw_ostream &OS, const Module* = nullptr) const override;
+  void print(raw_ostream &OS) const;
 
   /// dump - Dump the dominance frontier to dbgs().
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void dump() const;
+#endif
 };
 
-
 //===-------------------------------------
 /// DominanceFrontier Class - Concrete subclass of DominanceFrontierBase that is
 /// used to compute forward dominator frontiers.
 ///
-class DominanceFrontier : public DominanceFrontierBase {
-  virtual void anchor();
+template <class BlockT>
+class ForwardDominanceFrontierBase : public DominanceFrontierBase<BlockT> {
+private:
+  typedef GraphTraits<BlockT *> BlockTraits;
+
 public:
-  static char ID; // Pass ID, replacement for typeid
-  DominanceFrontier() :
-    DominanceFrontierBase(ID, false) {
-      initializeDominanceFrontierPass(*PassRegistry::getPassRegistry());
-    }
+  typedef DominatorTreeBase<BlockT> DomTreeT;
+  typedef DomTreeNodeBase<BlockT> DomTreeNodeT;
+  typedef typename DominanceFrontierBase<BlockT>::DomSetType DomSetType;
 
-  BasicBlock *getRoot() const {
-    assert(Roots.size() == 1 && "Should always have entry node!");
-    return Roots[0];
+  ForwardDominanceFrontierBase() : DominanceFrontierBase<BlockT>(false) {}
+
+  void analyze(DomTreeT &DT) {
+    this->Roots = DT.getRoots();
+    assert(this->Roots.size() == 1 &&
+           "Only one entry block for forward domfronts!");
+    calculate(DT, DT[this->Roots[0]]);
   }
 
-  bool runOnFunction(Function &) override {
-    Frontiers.clear();
-    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    Roots = DT.getRoots();
-    assert(Roots.size() == 1 && "Only one entry block for forward domfronts!");
-    calculate(DT, DT[Roots[0]]);
-    return false;
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-    AU.addRequired<DominatorTreeWrapperPass>();
-  }
-
-  const DomSetType &calculate(const DominatorTree &DT,
-                              const DomTreeNode *Node);
+  const DomSetType &calculate(const DomTreeT &DT, const DomTreeNodeT *Node);
 };
 
+/// FunctionPass wrapper around ForwardDominanceFrontierBase<BasicBlock>.
+///
+/// The pass owns a single \c Base object and exposes the same accessor
+/// interface by forwarding every inline member below to it.  The pass
+/// plumbing (constructor, releaseMemory, runOnFunction, getAnalysisUsage,
+/// print, dump) is only declared here and defined out of line.
+class DominanceFrontier : public FunctionPass {
+  // The generic frontier implementation all accessors delegate to.
+  ForwardDominanceFrontierBase<BasicBlock> Base;
+
+public:
+  // Convenience aliases mirroring the names used by the generic base.
+  typedef DominatorTreeBase<BasicBlock> DomTreeT;
+  typedef DomTreeNodeBase<BasicBlock> DomTreeNodeT;
+  typedef DominanceFrontierBase<BasicBlock>::DomSetType DomSetType;
+  typedef DominanceFrontierBase<BasicBlock>::iterator iterator;
+  typedef DominanceFrontierBase<BasicBlock>::const_iterator const_iterator;
+
+  static char ID; // Pass ID, replacement for typeid
+
+  DominanceFrontier();
+
+  /// Direct access to the wrapped generic implementation.
+  ForwardDominanceFrontierBase<BasicBlock> &getBase() { return Base; }
+
+  /// Forwards to DominanceFrontierBase::getRoots().
+  inline const std::vector<BasicBlock *> &getRoots() const {
+    return Base.getRoots();
+  }
+
+  /// Forwards to DominanceFrontierBase::getRoot().
+  BasicBlock *getRoot() const { return Base.getRoot(); }
+
+  /// Forwards to DominanceFrontierBase::isPostDominator().
+  bool isPostDominator() const { return Base.isPostDominator(); }
+
+  // Iteration and lookup over the (block -> frontier set) map held by Base.
+  iterator begin() { return Base.begin(); }
+
+  const_iterator begin() const { return Base.begin(); }
+
+  iterator end() { return Base.end(); }
+
+  const_iterator end() const { return Base.end(); }
+
+  iterator find(BasicBlock *B) { return Base.find(B); }
+
+  const_iterator find(BasicBlock *B) const { return Base.find(B); }
+
+  /// Forwards to DominanceFrontierBase::addBasicBlock().
+  iterator addBasicBlock(BasicBlock *BB, const DomSetType &frontier) {
+    return Base.addBasicBlock(BB, frontier);
+  }
+
+  /// Forwards to DominanceFrontierBase::removeBlock().
+  void removeBlock(BasicBlock *BB) { return Base.removeBlock(BB); }
+
+  /// Forwards to DominanceFrontierBase::addToFrontier().
+  void addToFrontier(iterator I, BasicBlock *Node) {
+    return Base.addToFrontier(I, Node);
+  }
+
+  /// Forwards to DominanceFrontierBase::removeFromFrontier().
+  void removeFromFrontier(iterator I, BasicBlock *Node) {
+    return Base.removeFromFrontier(I, Node);
+  }
+
+  /// Forwards to DominanceFrontierBase::compareDomSet().
+  bool compareDomSet(DomSetType &DS1, const DomSetType &DS2) const {
+    return Base.compareDomSet(DS1, DS2);
+  }
+
+  /// Forwards to DominanceFrontierBase::compare().
+  bool compare(DominanceFrontierBase<BasicBlock> &Other) const {
+    return Base.compare(Other);
+  }
+
+  // FunctionPass interface; definitions live outside this header.
+  void releaseMemory() override;
+
+  bool runOnFunction(Function &) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  void print(raw_ostream &OS, const Module * = nullptr) const override;
+
+  void dump() const;
+};
+
+EXTERN_TEMPLATE_INSTANTIATION(class DominanceFrontierBase<BasicBlock>);
+EXTERN_TEMPLATE_INSTANTIATION(class ForwardDominanceFrontierBase<BasicBlock>);
+
 } // End llvm namespace
 
 #endif

diff --git a/include/llvm/Analysis/DominanceFrontierImpl.h b/include/llvm/Analysis/DominanceFrontierImpl.h
new file mode 100644
index 0000000..735bfb8
--- /dev/null
+++ b/include/llvm/Analysis/DominanceFrontierImpl.h

@@ -0,0 +1,226 @@
+//===- llvm/Analysis/DominanceFrontierImpl.h - Dom. Frontiers ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the generic implementation of the DominanceFrontier class, which
+// calculates and holds the dominance frontier for a function.
+//
+// This should be considered deprecated, don't add any more uses of this data
+// structure.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_DOMINANCEFRONTIERIMPL_H
+#define LLVM_ANALYSIS_DOMINANCEFRONTIERIMPL_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/Debug.h"
+
+namespace llvm {
+
+namespace {
+/// One pending entry on the worklist used by
+/// ForwardDominanceFrontierBase::calculate(): a block together with its
+/// dominator-tree node, plus the block/node it was pushed from.
+template <class BlockT>
+class DFCalculateWorkObject {
+public:
+  typedef DomTreeNodeBase<BlockT> DomTreeNodeT;
+
+  /// \param B  block to process
+  /// \param P  block this entry was pushed from (nullptr for the first entry)
+  /// \param N  dominator-tree node of \p B
+  /// \param PN dominator-tree node of \p P (nullptr for the first entry)
+  DFCalculateWorkObject(BlockT *B, BlockT *P, const DomTreeNodeT *N,
+                        const DomTreeNodeT *PN)
+      : currentBB(B), parentBB(P), Node(N), parentNode(PN) {}
+  BlockT *currentBB;
+  BlockT *parentBB;
+  const DomTreeNodeT *Node;
+  const DomTreeNodeT *parentNode;
+};
+}
+
+/// Drop \p BB from the analysis: erase it from every recorded frontier set and
+/// then discard its own map entry.  \p BB must currently have an entry.
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::removeBlock(BlockT *BB) {
+  assert(find(BB) != end() && "Block is not in DominanceFrontier!");
+  for (auto &Entry : Frontiers)
+    Entry.second.erase(BB);
+  Frontiers.erase(BB);
+}
+
+/// Add \p Node to the frontier set that \p I refers to.
+///
+/// \param I    iterator to an existing (block -> frontier set) entry; must not
+///             be end().
+/// \param Node block to record in that frontier set.
+///
+/// The set is a std::set, so adding a block that is already present is a
+/// harmless no-op and needs no membership precondition.
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::addToFrontier(iterator I,
+                                                  BlockT *Node) {
+  assert(I != end() && "BB is not in DominanceFrontier!");
+  // This must insert: the previous body was a copy of removeFromFrontier()
+  // (it asserted membership and then erased), so "adding" removed the node.
+  I->second.insert(Node);
+}
+
+/// Erase \p Node from the frontier set that \p I refers to.  \p I must be a
+/// valid (non-end) iterator and \p Node must already be in that set.
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::removeFromFrontier(iterator I,
+                                                       BlockT *Node) {
+  assert(I != end() && "BB is not in DominanceFrontier!");
+  DomSetType &Frontier = I->second;
+  assert(Frontier.count(Node) && "Node is not in DominanceFrontier of BB");
+  Frontier.erase(Node);
+}
+
+/// Report whether two frontier sets differ.
+///
+/// \returns true if \p DS1 and \p DS2 hold different blocks, false if they
+/// hold exactly the same blocks.
+template <class BlockT>
+bool DominanceFrontierBase<BlockT>::compareDomSet(DomSetType &DS1,
+                                                  const DomSetType &DS2) const {
+  // Work on a scratch copy of DS2 and cross off everything found in DS1.
+  std::set<BlockT *> Unmatched(DS2.begin(), DS2.end());
+
+  for (BlockT *Node : DS1)
+    if (Unmatched.erase(Node) == 0)
+      return true; // Node is in DS1 but not in DS2.
+
+  // Anything left over is in DS2 but not in DS1, i.e. the sets differ.
+  return !Unmatched.empty();
+}
+
+/// Check every entry of \p Other against this object.
+///
+/// \returns true as soon as a block recorded in \p Other is missing here or
+/// its frontier set differs from ours; false if all of \p Other's entries
+/// match.  Entries that exist only in this object are not examined.
+template <class BlockT>
+bool DominanceFrontierBase<BlockT>::compare(
+    DominanceFrontierBase<BlockT> &Other) const {
+  // compareDomSet() takes its first set by non-const reference, so iterate
+  // over a private copy of Other's map rather than Other itself.
+  DomSetMapType Snapshot(Other.begin(), Other.end());
+
+  for (auto &Entry : Snapshot) {
+    const_iterator Mine = find(Entry.first);
+    if (Mine == end())
+      return true;
+
+    if (compareDomSet(Entry.second, Mine->second))
+      return true;
+  }
+
+  return false;
+}
+
+/// Write one line per recorded block to \p OS: the block, followed by every
+/// member of its frontier set.  Null block pointers are rendered as
+/// "<<exit node>>".
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::print(raw_ostream &OS) const {
+  for (const auto &Entry : Frontiers) {
+    BlockT *Owner = Entry.first;
+    const std::set<BlockT *> &Members = Entry.second;
+
+    OS << "  DomFrontier for BB ";
+    if (!Owner)
+      OS << " <<exit node>>";
+    else
+      Owner->printAsOperand(OS, false);
+    OS << " is:\t";
+
+    for (const BlockT *Member : Members) {
+      OS << ' ';
+      if (!Member)
+        OS << "<<exit node>>";
+      else
+        Member->printAsOperand(OS, false);
+    }
+    OS << '\n';
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// Debugging aid: print() this frontier to the dbgs() stream.  Only compiled
+/// when NDEBUG is not defined or LLVM_ENABLE_DUMP is defined.
+template <class BlockT>
+void DominanceFrontierBase<BlockT>::dump() const {
+  print(dbgs());
+}
+#endif
+
+/// Compute the dominance frontier of \p Node's block and of the blocks in the
+/// dominator subtree below it, storing each set in this->Frontiers.
+///
+/// The traversal is iterative: an explicit worklist (used as a stack) replaces
+/// recursion over the dominator tree.
+///
+/// \param DT   dominator tree used for immediate-dominator and
+///             properlyDominates() queries.
+/// \param Node dominator-tree node to start from.
+/// \returns a reference to the frontier set stored for \p Node's block.
+template <class BlockT>
+const typename ForwardDominanceFrontierBase<BlockT>::DomSetType &
+ForwardDominanceFrontierBase<BlockT>::calculate(const DomTreeT &DT,
+                                                const DomTreeNodeT *Node) {
+  BlockT *BB = Node->getBlock();
+  DomSetType *Result = nullptr;
+
+  std::vector<DFCalculateWorkObject<BlockT>> workList;
+  SmallPtrSet<BlockT *, 32> visited;
+
+  // Seed the worklist with the starting block; it has no parent entry.
+  workList.push_back(DFCalculateWorkObject<BlockT>(BB, nullptr, Node, nullptr));
+  do {
+    // Peek at the top entry; it stays on the worklist until all of its
+    // dominator-tree children have been processed.
+    DFCalculateWorkObject<BlockT> *currentW = &workList.back();
+    assert(currentW && "Missing work object.");
+
+    BlockT *currentBB = currentW->currentBB;
+    BlockT *parentBB = currentW->parentBB;
+    const DomTreeNodeT *currentNode = currentW->Node;
+    const DomTreeNodeT *parentNode = currentW->parentNode;
+    assert(currentBB && "Invalid work object. Missing current Basic Block");
+    assert(currentNode && "Invalid work object. Missing current Node");
+    DomSetType &S = this->Frontiers[currentBB];
+
+    // Visit each block only once.
+    if (visited.insert(currentBB).second) {
+      // Loop over CFG successors to calculate DFlocal[currentNode]
+      for (auto SI = BlockTraits::child_begin(currentBB),
+                SE = BlockTraits::child_end(currentBB);
+           SI != SE; ++SI) {
+        // Does Node immediately dominate this successor?
+        if (DT[*SI]->getIDom() != currentNode)
+          S.insert(*SI);
+      }
+    }
+
+    // At this point, S is DFlocal.  Now we union in DFup's of our children...
+    // Loop through and visit the nodes that Node immediately dominates (Node's
+    // children in the IDomTree)
+    bool visitChild = false;
+    for (typename DomTreeNodeT::const_iterator NI = currentNode->begin(),
+                                               NE = currentNode->end();
+         NI != NE; ++NI) {
+      DomTreeNodeT *IDominee = *NI;
+      BlockT *childBB = IDominee->getBlock();
+      if (visited.count(childBB) == 0) {
+        workList.push_back(DFCalculateWorkObject<BlockT>(
+            childBB, currentBB, IDominee, currentNode));
+        visitChild = true;
+      }
+    }
+
+    // If no unvisited child was pushed (every child is already visited, or
+    // there are no children), this block is finished: propagate its set to
+    // the parent and pop it from the workList.
+    if (!visitChild) {
+      if (!parentBB) {
+        // The starting entry has no parent; its set is the final result.
+        Result = &S;
+        break;
+      }
+
+      // DFup: pass on to the parent every frontier member that the parent
+      // node does not properly dominate.
+      typename DomSetType::const_iterator CDFI = S.begin(), CDFE = S.end();
+      DomSetType &parentSet = this->Frontiers[parentBB];
+      for (; CDFI != CDFE; ++CDFI) {
+        if (!DT.properlyDominates(parentNode, DT[*CDFI]))
+          parentSet.insert(*CDFI);
+      }
+      workList.pop_back();
+    }
+
+  } while (!workList.empty());
+
+  return *Result;
+}
+
+} // End llvm namespace
+
+#endif

diff --git a/include/llvm/Analysis/FunctionTargetTransformInfo.h b/include/llvm/Analysis/FunctionTargetTransformInfo.h
new file mode 100644
index 0000000..c1654cc
--- /dev/null
+++ b/include/llvm/Analysis/FunctionTargetTransformInfo.h

@@ -0,0 +1,49 @@
+//===- llvm/Analysis/FunctionTargetTransformInfo.h --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass wraps a TargetTransformInfo in a FunctionPass so that it can
+// forward along the current Function so that we can make target specific
+// decisions based on the particular subtarget specified for each Function.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_FUNCTIONTARGETTRANSFORMINFO_H
+#define LLVM_ANALYSIS_FUNCTIONTARGETTRANSFORMINFO_H
+
+#include "llvm/Pass.h"
+#include "TargetTransformInfo.h"
+
+namespace llvm {
+/// FunctionPass that pairs a TargetTransformInfo with a Function so that
+/// TTI queries can be made with that function as context.
+class FunctionTargetTransformInfo final : public FunctionPass {
+private:
+  // NOTE(review): both pointers are presumably set in runOnFunction() and
+  // reset in releaseMemory(); the definitions are not in this header -- confirm.
+  const Function *Fn;
+  const TargetTransformInfo *TTI;
+
+  // Copying is disabled.
+  FunctionTargetTransformInfo(const FunctionTargetTransformInfo &)
+      LLVM_DELETED_FUNCTION;
+  void operator=(const FunctionTargetTransformInfo &) LLVM_DELETED_FUNCTION;
+
+public:
+  static char ID;
+  FunctionTargetTransformInfo();
+
+  // Implementation boilerplate.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void releaseMemory() override;
+  bool runOnFunction(Function &F) override;
+
+  // Shimmed functions from TargetTransformInfo.
+
+  /// Forward to TargetTransformInfo::getUnrollingPreferences(), supplying the
+  /// stored Function as the first argument.
+  void
+  getUnrollingPreferences(Loop *L,
+                          TargetTransformInfo::UnrollingPreferences &UP) const {
+    TTI->getUnrollingPreferences(Fn, L, UP);
+  }
+};
+}
+#endif

diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h
index 6038872..d1f0370 100644
--- a/include/llvm/Analysis/IVUsers.h
+++ b/include/llvm/Analysis/IVUsers.h

@@ -174,7 +174,7 @@
   /// dump - This method is used for debugging.
   void dump() const;
 protected:
-  bool AddUsersImpl(Instruction *I, SmallPtrSet<Loop*,16> &SimpleLoopNests);
+  bool AddUsersImpl(Instruction *I, SmallPtrSetImpl<Loop*> &SimpleLoopNests);
 };
 
 Pass *createIVUsersPass();

diff --git a/include/llvm/Analysis/InlineCost.h b/include/llvm/Analysis/InlineCost.h
index aaed716..81795ba 100644
--- a/include/llvm/Analysis/InlineCost.h
+++ b/include/llvm/Analysis/InlineCost.h

@@ -19,6 +19,7 @@
 #include <climits>
 
 namespace llvm {
+class AssumptionTracker;
 class CallSite;
 class DataLayout;
 class Function;
@@ -100,6 +101,7 @@
 /// \brief Cost analyzer used by inliner.
 class InlineCostAnalysis : public CallGraphSCCPass {
   const TargetTransformInfo *TTI;
+  AssumptionTracker *AT;
 
 public:
   static char ID;

diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h
index 2367c0b..51f6e85 100644
--- a/include/llvm/Analysis/InstructionSimplify.h
+++ b/include/llvm/Analysis/InstructionSimplify.h

@@ -37,6 +37,7 @@
 namespace llvm {
   template<typename T>
   class ArrayRef;
+  class AssumptionTracker;
   class DominatorTree;
   class Instruction;
   class DataLayout;
@@ -50,28 +51,36 @@
   Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW,
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// SimplifySubInst - Given operands for a Sub, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW,
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// Given operands for an FAdd, see if we can fold the result.  If not, this
   /// returns null.
   Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF,
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// Given operands for an FSub, see if we can fold the result.  If not, this
   /// returns null.
   Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF,
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// Given operands for an FMul, see if we can fold the result.  If not, this
   /// returns null.
@@ -79,121 +88,157 @@
                           FastMathFlags FMF,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifyMulInst - Given operands for a Mul, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyMulInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// SimplifySDivInst - Given operands for an SDiv, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifySDivInst(Value *LHS, Value *RHS,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifyUDivInst - Given operands for a UDiv, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyUDivInst(Value *LHS, Value *RHS,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifyFDivInst - Given operands for an FDiv, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyFDivInst(Value *LHS, Value *RHS,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifySRemInst - Given operands for an SRem, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifySRemInst(Value *LHS, Value *RHS,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifyURemInst - Given operands for a URem, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyURemInst(Value *LHS, Value *RHS,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifyFRemInst - Given operands for an FRem, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyFRemInst(Value *LHS, Value *RHS,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifyShlInst - Given operands for a Shl, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// SimplifyLShrInst - Given operands for a LShr, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifyAShrInst - Given operands for a AShr, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifyAndInst - Given operands for an And, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyAndInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// SimplifyOrInst - Given operands for an Or, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyOrInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
                         const TargetLibraryInfo *TLI = nullptr,
-                        const DominatorTree *DT = nullptr);
+                        const DominatorTree *DT = nullptr,
+                        AssumptionTracker *AT = nullptr,
+                        const Instruction *CxtI = nullptr);
 
   /// SimplifyXorInst - Given operands for a Xor, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyXorInst(Value *LHS, Value *RHS, const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          Instruction *CxtI = nullptr);
 
   /// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                           const DataLayout *TD = nullptr,
                           const TargetLibraryInfo *TLI = nullptr,
-                          const DominatorTree *DT = nullptr);
+                          const DominatorTree *DT = nullptr,
+                          AssumptionTracker *AT = nullptr,
+                          const Instruction *CxtI = nullptr);
 
   /// SimplifySelectInst - Given operands for a SelectInst, see if we can fold
   /// the result.  If not, this returns null.
   Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
                             const DataLayout *TD = nullptr,
                             const TargetLibraryInfo *TLI = nullptr,
-                            const DominatorTree *DT = nullptr);
+                            const DominatorTree *DT = nullptr,
+                            AssumptionTracker *AT = nullptr,
+                            const Instruction *CxtI = nullptr);
 
   /// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we
   /// can fold the result.  If not, this returns null.
@@ -201,13 +246,17 @@
                                  ArrayRef<unsigned> Idxs,
                                  const DataLayout *TD = nullptr,
                                  const TargetLibraryInfo *TLI = nullptr,
-                                 const DominatorTree *DT = nullptr);
+                                 const DominatorTree *DT = nullptr,
+                                 AssumptionTracker *AT = nullptr,
+                                 const Instruction *CxtI = nullptr);
 
   /// SimplifyTruncInst - Given operands for an TruncInst, see if we can fold
   /// the result.  If not, this returns null.
   Value *SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *TD = nullptr,
                            const TargetLibraryInfo *TLI = nullptr,
-                           const DominatorTree *DT = nullptr);
+                           const DominatorTree *DT = nullptr,
+                           AssumptionTracker *AT = nullptr,
+                           const Instruction *CxtI = nullptr);
 
   //=== Helper functions for higher up the class hierarchy.
 
@@ -217,14 +266,18 @@
   Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                          const DataLayout *TD = nullptr,
                          const TargetLibraryInfo *TLI = nullptr,
-                         const DominatorTree *DT = nullptr);
+                         const DominatorTree *DT = nullptr,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr);
 
   /// SimplifyBinOp - Given operands for a BinaryOperator, see if we can
   /// fold the result.  If not, this returns null.
   Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
                        const DataLayout *TD = nullptr,
                        const TargetLibraryInfo *TLI = nullptr,
-                       const DominatorTree *DT = nullptr);
+                       const DominatorTree *DT = nullptr,
+                       AssumptionTracker *AT = nullptr,
+                       const Instruction *CxtI = nullptr);
 
   /// \brief Given a function and iterators over arguments, see if we can fold
   /// the result.
@@ -233,7 +286,9 @@
   Value *SimplifyCall(Value *V, User::op_iterator ArgBegin,
                       User::op_iterator ArgEnd, const DataLayout *TD = nullptr,
                       const TargetLibraryInfo *TLI = nullptr,
-                      const DominatorTree *DT = nullptr);
+                      const DominatorTree *DT = nullptr,
+                      AssumptionTracker *AT = nullptr,
+                      const Instruction *CxtI = nullptr);
 
   /// \brief Given a function and set of arguments, see if we can fold the
   /// result.
@@ -242,13 +297,16 @@
   Value *SimplifyCall(Value *V, ArrayRef<Value *> Args,
                       const DataLayout *TD = nullptr,
                       const TargetLibraryInfo *TLI = nullptr,
-                      const DominatorTree *DT = nullptr);
+                      const DominatorTree *DT = nullptr,
+                      AssumptionTracker *AT = nullptr,
+                      const Instruction *CxtI = nullptr);
 
   /// SimplifyInstruction - See if we can compute a simplified version of this
   /// instruction.  If not, this returns null.
   Value *SimplifyInstruction(Instruction *I, const DataLayout *TD = nullptr,
                              const TargetLibraryInfo *TLI = nullptr,
-                             const DominatorTree *DT = nullptr);
+                             const DominatorTree *DT = nullptr,
+                             AssumptionTracker *AT = nullptr);
 
 
   /// \brief Replace all uses of 'I' with 'SimpleV' and simplify the uses
@@ -262,7 +320,8 @@
   bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV,
                                      const DataLayout *TD = nullptr,
                                      const TargetLibraryInfo *TLI = nullptr,
-                                     const DominatorTree *DT = nullptr);
+                                     const DominatorTree *DT = nullptr,
+                                     AssumptionTracker *AT = nullptr);
 
   /// \brief Recursively attempt to simplify an instruction.
   ///
@@ -273,7 +332,8 @@
   bool recursivelySimplifyInstruction(Instruction *I,
                                       const DataLayout *TD = nullptr,
                                       const TargetLibraryInfo *TLI = nullptr,
-                                      const DominatorTree *DT = nullptr);
+                                      const DominatorTree *DT = nullptr,
+                                      AssumptionTracker *AT = nullptr);
 } // end namespace llvm
 
 #endif

diff --git a/include/llvm/Analysis/IntervalIterator.h b/include/llvm/Analysis/IntervalIterator.h
index 73aff76..3b51d44 100644
--- a/include/llvm/Analysis/IntervalIterator.h
+++ b/include/llvm/Analysis/IntervalIterator.h

@@ -165,10 +165,10 @@
   //
   bool ProcessInterval(NodeTy *Node) {
     BasicBlock *Header = getNodeHeader(Node);
-    if (Visited.count(Header)) return false;
+    if (!Visited.insert(Header).second)
+      return false;
 
     Interval *Int = new Interval(Header);
-    Visited.insert(Header);   // The header has now been visited!
 
     // Check all of our successors to see if they are in the interval...
     for (typename GT::ChildIteratorType I = GT::child_begin(Node),

diff --git a/include/llvm/Analysis/JumpInstrTableInfo.h b/include/llvm/Analysis/JumpInstrTableInfo.h
index 54760aa..5b0176c 100644
--- a/include/llvm/Analysis/JumpInstrTableInfo.h
+++ b/include/llvm/Analysis/JumpInstrTableInfo.h

@@ -37,7 +37,9 @@
 public:
   static char ID;
 
-  JumpInstrTableInfo();
+  /// The default byte alignment for jump tables is 16, which is large but
+  /// usually safe.
+  JumpInstrTableInfo(uint64_t ByteAlign = 16);
   virtual ~JumpInstrTableInfo();
   const char *getPassName() const override {
     return "Jump-Instruction Table Info";
@@ -52,9 +54,19 @@
   /// Gets the tables.
   const JumpTables &getTables() const { return Tables; }
 
+  /// Gets the alignment in bytes of a jumptable entry.
+  uint64_t entryByteAlignment() const { return ByteAlignment; }
 private:
   JumpTables Tables;
+
+  /// A power-of-two alignment of a jumptable entry.
+  uint64_t ByteAlignment;
 };
+
+/// Creates a JumpInstrTableInfo pass with the given bound on entry size. This
+/// bound specifies the maximum number of bytes needed to represent an
+/// unconditional jump or a trap instruction in the back end currently in use.
+ModulePass *createJumpInstrTableInfoPass(unsigned Bound);
 }
 
 #endif /* LLVM_ANALYSIS_JUMPINSTRTABLEINFO_H */

diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h
index 70a4df5..9a59844 100644
--- a/include/llvm/Analysis/LazyCallGraph.h
+++ b/include/llvm/Analysis/LazyCallGraph.h

@@ -32,8 +32,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ANALYSIS_LAZY_CALL_GRAPH
-#define LLVM_ANALYSIS_LAZY_CALL_GRAPH
+#ifndef LLVM_ANALYSIS_LAZYCALLGRAPH_H
+#define LLVM_ANALYSIS_LAZYCALLGRAPH_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PointerUnion.h"
@@ -537,7 +537,7 @@
 
   static void *ID() { return (void *)&PassID; }
 
-  /// \brief Compute the \c LazyCallGraph for a the module \c M.
+  /// \brief Compute the \c LazyCallGraph for the module \c M.
   ///
   /// This just builds the set of entry points to the call graph. The rest is
   /// built lazily as it is walked.

diff --git a/include/llvm/Analysis/LazyValueInfo.h b/include/llvm/Analysis/LazyValueInfo.h
index 2fe7386..52cc0d1 100644
--- a/include/llvm/Analysis/LazyValueInfo.h
+++ b/include/llvm/Analysis/LazyValueInfo.h

@@ -18,16 +18,21 @@
 #include "llvm/Pass.h"
 
 namespace llvm {
+  class AssumptionTracker;
   class Constant;
   class DataLayout;
+  class DominatorTree;
+  class Instruction;
   class TargetLibraryInfo;
   class Value;
   
 /// LazyValueInfo - This pass computes, caches, and vends lazy value constraint
 /// information.
 class LazyValueInfo : public FunctionPass {
+  AssumptionTracker *AT;
   const DataLayout *DL;
   class TargetLibraryInfo *TLI;
+  DominatorTree *DT;
   void *PImpl;
   LazyValueInfo(const LazyValueInfo&) LLVM_DELETED_FUNCTION;
   void operator=(const LazyValueInfo&) LLVM_DELETED_FUNCTION;
@@ -50,16 +55,23 @@
   /// with a constant is known to be true or false on the specified CFG edge.
   /// Pred is a CmpInst predicate.
   Tristate getPredicateOnEdge(unsigned Pred, Value *V, Constant *C,
-                              BasicBlock *FromBB, BasicBlock *ToBB);
+                              BasicBlock *FromBB, BasicBlock *ToBB,
+                              Instruction *CxtI = nullptr);
   
-  
+  /// getPredicateAt - Determine whether the specified value comparison
+  /// with a constant is known to be true or false at the specified instruction
+  /// (from an assume intrinsic). Pred is a CmpInst predicate.
+  Tristate getPredicateAt(unsigned Pred, Value *V, Constant *C,
+                          Instruction *CxtI);
+ 
   /// getConstant - Determine whether the specified value is known to be a
   /// constant at the end of the specified block.  Return null if not.
-  Constant *getConstant(Value *V, BasicBlock *BB);
+  Constant *getConstant(Value *V, BasicBlock *BB, Instruction *CxtI = nullptr);
 
   /// getConstantOnEdge - Determine whether the specified value is known to be a
   /// constant on the specified edge.  Return null if not.
-  Constant *getConstantOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB);
+  Constant *getConstantOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB,
+                              Instruction *CxtI = nullptr);
   
   /// threadEdge - Inform the analysis cache that we have threaded an edge from
   /// PredBB to OldSucc to be from PredBB to NewSucc instead.

diff --git a/include/llvm/Analysis/Loads.h b/include/llvm/Analysis/Loads.h
index 25c5928..0fe3453 100644
--- a/include/llvm/Analysis/Loads.h
+++ b/include/llvm/Analysis/Loads.h

@@ -44,14 +44,14 @@
 /// If it is set to 0, it will scan the whole block. You can also optionally
 /// specify an alias analysis implementation, which makes this more precise.
 ///
-/// If TBAATag is non-null and a load or store is found, the TBAA tag from the
-/// load or store is recorded there.  If there is no TBAA tag or if no access
+/// If AATags is non-null and a load or store is found, the AA tags from the
+/// load or store are recorded there.  If there are no AA tags or if no access
 /// is found, it is left unmodified.
 Value *FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
                                 BasicBlock::iterator &ScanFrom,
                                 unsigned MaxInstsToScan = 6,
                                 AliasAnalysis *AA = nullptr,
-                                MDNode **TBAATag = nullptr);
+                                AAMDNodes *AATags = nullptr);
 
 }
 

diff --git a/include/llvm/Analysis/LoopPass.h b/include/llvm/Analysis/LoopPass.h
index 726e286..8650000 100644
--- a/include/llvm/Analysis/LoopPass.h
+++ b/include/llvm/Analysis/LoopPass.h

@@ -82,6 +82,11 @@
   /// deleteAnalysisValue - Delete analysis info associated with value V.
   virtual void deleteAnalysisValue(Value *V, Loop *L) {}
 
+  /// Delete analysis info associated with Loop L.
+  /// Called to notify a Pass that a loop has been deleted and any
+  /// associated analysis values can be deleted.
+  virtual void deleteAnalysisLoop(Loop *L) {}
+
 protected:
   /// skipOptnoneFunction - Containing function has Attribute::OptimizeNone
   /// and most transformation passes should skip it.
@@ -152,6 +157,10 @@
   /// that implement simple analysis interface.
   void deleteSimpleAnalysisValue(Value *V, Loop *L);
 
+  /// Invoke deleteAnalysisLoop hook for all passes that implement simple
+  /// analysis interface.
+  void deleteSimpleAnalysisLoop(Loop *L);
+
 private:
   std::deque<Loop *> LQ;
   bool skipThisLoop;

diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h
index 1c4441b..4d315d1 100644
--- a/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h

@@ -28,6 +28,7 @@
   class Instruction;
   class CallSite;
   class AliasAnalysis;
+  class AssumptionTracker;
   class DataLayout;
   class MemoryDependenceAnalysis;
   class PredIteratorCache;
@@ -281,12 +282,12 @@
       /// Size - The maximum size of the dereferences of the
       /// pointer. May be UnknownSize if the sizes are unknown.
       uint64_t Size;
-      /// TBAATag - The TBAA tag associated with dereferences of the
-      /// pointer. May be null if there are no tags or conflicting tags.
-      const MDNode *TBAATag;
+      /// AATags - The AA tags associated with dereferences of the
+      /// pointer. The members may be null if there are no tags or
+      /// conflicting tags.
+      AAMDNodes AATags;
 
-      NonLocalPointerInfo()
-        : Size(AliasAnalysis::UnknownSize), TBAATag(nullptr) {}
+      NonLocalPointerInfo() : Size(AliasAnalysis::UnknownSize) {}
     };
 
     /// CachedNonLocalPointerInfo - This map stores the cached results of doing
@@ -325,6 +326,7 @@
     AliasAnalysis *AA;
     const DataLayout *DL;
     DominatorTree *DT;
+    AssumptionTracker *AT;
     std::unique_ptr<PredIteratorCache> PredCache;
 
   public:

diff --git a/include/llvm/Analysis/PHITransAddr.h b/include/llvm/Analysis/PHITransAddr.h
index 69f5907..0790e97 100644
--- a/include/llvm/Analysis/PHITransAddr.h
+++ b/include/llvm/Analysis/PHITransAddr.h

@@ -18,6 +18,7 @@
 #include "llvm/IR/Instruction.h"
 
 namespace llvm {
+  class AssumptionTracker;
   class DominatorTree;
   class DataLayout;
   class TargetLibraryInfo;
@@ -41,12 +42,15 @@
 
   /// TLI - The target library info if known, otherwise null.
   const TargetLibraryInfo *TLI;
+
+  /// A cache of @llvm.assume calls used by SimplifyInstruction.
+  AssumptionTracker *AT;
   
   /// InstInputs - The inputs for our symbolic address.
   SmallVector<Instruction*, 4> InstInputs;
 public:
-  PHITransAddr(Value *addr, const DataLayout *DL)
-      : Addr(addr), DL(DL), TLI(nullptr) {
+  PHITransAddr(Value *addr, const DataLayout *DL, AssumptionTracker *AT)
+      : Addr(addr), DL(DL), TLI(nullptr), AT(AT) {
     // If the address is an instruction, the whole thing is considered an input.
     if (Instruction *I = dyn_cast<Instruction>(Addr))
       InstInputs.push_back(I);

diff --git a/include/llvm/Analysis/Passes.h b/include/llvm/Analysis/Passes.h
index fd65ae5..10a5605 100644
--- a/include/llvm/Analysis/Passes.h
+++ b/include/llvm/Analysis/Passes.h

@@ -66,6 +66,13 @@
 
   //===--------------------------------------------------------------------===//
   //
+  // createCFLAliasAnalysisPass - This pass implements a set-based approach to
+  // alias analysis.
+  //
+  ImmutablePass *createCFLAliasAnalysisPass();
+
+  //===--------------------------------------------------------------------===//
+  //
   /// createLibCallAliasAnalysisPass - Create an alias analysis pass that knows
   /// about the semantics of a set of libcalls specified by LCI.  The newly
   /// constructed pass takes ownership of the pointer that is provided.
@@ -88,11 +95,20 @@
 
   //===--------------------------------------------------------------------===//
   //
+  // createScopedNoAliasAAPass - This pass implements metadata-based
+  // scoped noalias analysis.
+  //
+  ImmutablePass *createScopedNoAliasAAPass();
+
+  //===--------------------------------------------------------------------===//
+  //
   // createObjCARCAliasAnalysisPass - This pass implements ObjC-ARC-based
   // alias analysis.
   //
   ImmutablePass *createObjCARCAliasAnalysisPass();
 
+  FunctionPass *createPAEvalPass();
+
   //===--------------------------------------------------------------------===//
   //
   /// createLazyValueInfoPass - This creates an instance of the LazyValueInfo

diff --git a/include/llvm/Analysis/PostDominators.h b/include/llvm/Analysis/PostDominators.h
index d330755..72cd357 100644
--- a/include/llvm/Analysis/PostDominators.h
+++ b/include/llvm/Analysis/PostDominators.h

@@ -19,7 +19,7 @@
 namespace llvm {
 
 /// PostDominatorTree Class - Concrete subclass of DominatorTree that is used to
-/// compute the a post-dominator tree.
+/// compute the post-dominator tree.
 ///
 struct PostDominatorTree : public FunctionPass {
   static char ID; // Pass identification, replacement for typeid

diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h
index 93a1a48..6ff7f97 100644
--- a/include/llvm/Analysis/RegionInfo.h
+++ b/include/llvm/Analysis/RegionInfo.h

@@ -37,21 +37,62 @@
 #ifndef LLVM_ANALYSIS_REGIONINFO_H
 #define LLVM_ANALYSIS_REGIONINFO_H
 
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/DominanceFrontier.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Support/Allocator.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
 #include <map>
 #include <memory>
+#include <set>
 
 namespace llvm {
 
-class Region;
-class RegionInfo;
-class raw_ostream;
+// RegionTraits - Class to be specialized for different users of RegionInfo
+// (i.e. BasicBlocks or MachineBasicBlocks). This is only to avoid needing to
+// pass around an unreasonable number of template parameters.
+template <class FuncT_>
+struct RegionTraits {
+  // FuncT
+  // BlockT
+  // RegionT
+  // RegionNodeT
+  // RegionInfoT
+  typedef typename FuncT_::UnknownRegionTypeError BrokenT;
+};
+
+class DominatorTree;
+class DominanceFrontier;
 class Loop;
 class LoopInfo;
+struct PostDominatorTree;
+class raw_ostream;
+class Region;
+template <class RegionTr>
+class RegionBase;
+class RegionNode;
+class RegionInfo;
+template <class RegionTr>
+class RegionInfoBase;
+
+template <>
+struct RegionTraits<Function> {
+  typedef Function FuncT;
+  typedef BasicBlock BlockT;
+  typedef Region RegionT;
+  typedef RegionNode RegionNodeT;
+  typedef RegionInfo RegionInfoT;
+  typedef DominatorTree DomTreeT;
+  typedef DomTreeNode DomTreeNodeT;
+  typedef DominanceFrontier DomFrontierT;
+  typedef PostDominatorTree PostDomTreeT;
+  typedef Instruction InstT;
+  typedef Loop LoopT;
+  typedef LoopInfo LoopInfoT;
+
+  static unsigned getNumSuccessors(BasicBlock *BB) {
+    return BB->getTerminator()->getNumSuccessors();
+  }
+};
 
 /// @brief Marker class to iterate over the elements of a Region in flat mode.
 ///
@@ -65,11 +106,18 @@
 
 /// @brief A RegionNode represents a subregion or a BasicBlock that is part of a
 /// Region.
-class RegionNode {
-  RegionNode(const RegionNode &) LLVM_DELETED_FUNCTION;
-  const RegionNode &operator=(const RegionNode &) LLVM_DELETED_FUNCTION;
+template <class Tr>
+class RegionNodeBase {
+  friend class RegionBase<Tr>;
 
-protected:
+public:
+  typedef typename Tr::BlockT BlockT;
+  typedef typename Tr::RegionT RegionT;
+
+private:
+  RegionNodeBase(const RegionNodeBase &) LLVM_DELETED_FUNCTION;
+  const RegionNodeBase &operator=(const RegionNodeBase &) LLVM_DELETED_FUNCTION;
+
   /// This is the entry basic block that starts this region node.  If this is a
   /// BasicBlock RegionNode, then entry is just the basic block, that this
   /// RegionNode represents.  Otherwise it is the entry of this (Sub)RegionNode.
@@ -80,13 +128,13 @@
   /// The node can hold either a Region or a BasicBlock.
   /// Use one bit to save, if this RegionNode is a subregion or BasicBlock
   /// RegionNode.
-  PointerIntPair<BasicBlock*, 1, bool> entry;
+  PointerIntPair<BlockT *, 1, bool> entry;
 
   /// @brief The parent Region of this RegionNode.
   /// @see getParent()
-  Region* parent;
+  RegionT *parent;
 
-public:
+protected:
   /// @brief Create a RegionNode.
   ///
   /// @param Parent      The parent of this RegionNode.
@@ -95,9 +143,11 @@
   ///                    BasicBlock itself.  If it represents a subregion, this
   ///                    is the entry BasicBlock of the subregion.
   /// @param isSubRegion If this RegionNode represents a SubRegion.
-  inline RegionNode(Region* Parent, BasicBlock* Entry, bool isSubRegion = 0)
-    : entry(Entry, isSubRegion), parent(Parent) {}
+  inline RegionNodeBase(RegionT *Parent, BlockT *Entry,
+                        bool isSubRegion = false)
+      : entry(Entry, isSubRegion), parent(Parent) {}
 
+public:
   /// @brief Get the parent Region of this RegionNode.
   ///
   /// The parent Region is the Region this RegionNode belongs to. If for
@@ -106,7 +156,7 @@
   /// pointing to the Region this RegionNode belongs to.
   ///
   /// @return Get the parent Region of this RegionNode.
-  inline Region* getParent() const { return parent; }
+  inline RegionT *getParent() const { return parent; }
 
   /// @brief Get the entry BasicBlock of this RegionNode.
   ///
@@ -114,7 +164,7 @@
   /// itself, otherwise we return the entry BasicBlock of the Subregion
   ///
   /// @return The entry BasicBlock of this RegionNode.
-  inline BasicBlock* getEntry() const { return entry.getPointer(); }
+  inline BlockT *getEntry() const { return entry.getPointer(); }
 
   /// @brief Get the content of this RegionNode.
   ///
@@ -122,33 +172,15 @@
   /// check the type of the content with the isSubRegion() function call.
   ///
   /// @return The content of this RegionNode.
-  template<class T>
-  inline T* getNodeAs() const;
+  template <class T> inline T *getNodeAs() const;
 
   /// @brief Is this RegionNode a subregion?
   ///
   /// @return True if it contains a subregion. False if it contains a
   ///         BasicBlock.
-  inline bool isSubRegion() const {
-    return entry.getInt();
-  }
+  inline bool isSubRegion() const { return entry.getInt(); }
 };
 
-/// Print a RegionNode.
-inline raw_ostream &operator<<(raw_ostream &OS, const RegionNode &Node);
-
-template<>
-inline BasicBlock* RegionNode::getNodeAs<BasicBlock>() const {
-  assert(!isSubRegion() && "This is not a BasicBlock RegionNode!");
-  return getEntry();
-}
-
-template<>
-inline Region* RegionNode::getNodeAs<Region>() const {
-  assert(isSubRegion() && "This is not a subregion RegionNode!");
-  return reinterpret_cast<Region*>(const_cast<RegionNode*>(this));
-}
-
 //===----------------------------------------------------------------------===//
 /// @brief A single entry single exit Region.
 ///
@@ -211,37 +243,53 @@
 ///
 /// The first call returns a textual representation of the program structure
 /// tree, the second one creates a graphical representation using graphviz.
-class Region : public RegionNode {
-  friend class RegionInfo;
-  Region(const Region &) LLVM_DELETED_FUNCTION;
-  const Region &operator=(const Region &) LLVM_DELETED_FUNCTION;
+template <class Tr>
+class RegionBase : public RegionNodeBase<Tr> {
+  typedef typename Tr::FuncT FuncT;
+  typedef typename Tr::BlockT BlockT;
+  typedef typename Tr::RegionInfoT RegionInfoT;
+  typedef typename Tr::RegionT RegionT;
+  typedef typename Tr::RegionNodeT RegionNodeT;
+  typedef typename Tr::DomTreeT DomTreeT;
+  typedef typename Tr::LoopT LoopT;
+  typedef typename Tr::LoopInfoT LoopInfoT;
+  typedef typename Tr::InstT InstT;
+
+  typedef GraphTraits<BlockT *> BlockTraits;
+  typedef GraphTraits<Inverse<BlockT *>> InvBlockTraits;
+  typedef typename BlockTraits::ChildIteratorType SuccIterTy;
+  typedef typename InvBlockTraits::ChildIteratorType PredIterTy;
+
+  friend class RegionInfoBase<Tr>;
+  RegionBase(const RegionBase &) LLVM_DELETED_FUNCTION;
+  const RegionBase &operator=(const RegionBase &) LLVM_DELETED_FUNCTION;
 
   // Information necessary to manage this Region.
-  RegionInfo* RI;
-  DominatorTree *DT;
+  RegionInfoT *RI;
+  DomTreeT *DT;
 
   // The exit BasicBlock of this region.
   // (The entry BasicBlock is part of RegionNode)
-  BasicBlock *exit;
+  BlockT *exit;
 
-  typedef std::vector<std::unique_ptr<Region>> RegionSet;
+  typedef std::vector<std::unique_ptr<RegionT>> RegionSet;
 
   // The subregions of this region.
   RegionSet children;
 
-  typedef std::map<BasicBlock*, RegionNode*> BBNodeMapT;
+  typedef std::map<BlockT *, RegionNodeT *> BBNodeMapT;
 
   // Save the BasicBlock RegionNodes that are element of this Region.
   mutable BBNodeMapT BBNodeMap;
 
   /// verifyBBInRegion - Check if a BB is in this Region. This check also works
   /// if the region is incorrectly built. (EXPENSIVE!)
-  void verifyBBInRegion(BasicBlock* BB) const;
+  void verifyBBInRegion(BlockT *BB) const;
 
   /// verifyWalk - Walk over all the BBs of the region starting from BB and
   /// verify that all reachable basic blocks are elements of the region.
   /// (EXPENSIVE!)
-  void verifyWalk(BasicBlock* BB, std::set<BasicBlock*>* visitedBB) const;
+  void verifyWalk(BlockT *BB, std::set<BlockT *> *visitedBB) const;
 
   /// verifyRegionNest - Verify if the region and its children are valid
   /// regions (EXPENSIVE!)
@@ -256,27 +304,29 @@
   /// @param DT     The dominator tree of the current function.
   /// @param Parent The surrounding region or NULL if this is a top level
   ///               region.
-  Region(BasicBlock *Entry, BasicBlock *Exit, RegionInfo* RI,
-         DominatorTree *DT, Region *Parent = nullptr);
+  RegionBase(BlockT *Entry, BlockT *Exit, RegionInfoT *RI, DomTreeT *DT,
+             RegionT *Parent = nullptr);
 
   /// Delete the Region and all its subregions.
-  ~Region();
+  ~RegionBase();
 
   /// @brief Get the entry BasicBlock of the Region.
   /// @return The entry BasicBlock of the region.
-  BasicBlock *getEntry() const { return RegionNode::getEntry(); }
+  BlockT *getEntry() const {
+    return RegionNodeBase<Tr>::getEntry();
+  }
 
   /// @brief Replace the entry basic block of the region with the new basic
   ///        block.
   ///
   /// @param BB  The new entry basic block of the region.
-  void replaceEntry(BasicBlock *BB);
+  void replaceEntry(BlockT *BB);
 
   /// @brief Replace the exit basic block of the region with the new basic
   ///        block.
   ///
   /// @param BB  The new exit basic block of the region.
-  void replaceExit(BasicBlock *BB);
+  void replaceExit(BlockT *BB);
 
   /// @brief Recursively replace the entry basic block of the region.
   ///
@@ -285,7 +335,7 @@
   /// this region.
   ///
   /// @param NewEntry The new entry basic block.
-  void replaceEntryRecursive(BasicBlock *NewEntry);
+  void replaceEntryRecursive(BlockT *NewEntry);
 
   /// @brief Recursively replace the exit basic block of the region.
   ///
@@ -294,22 +344,25 @@
   /// this region.
   ///
   /// @param NewExit The new exit basic block.
-  void replaceExitRecursive(BasicBlock *NewExit);
+  void replaceExitRecursive(BlockT *NewExit);
 
   /// @brief Get the exit BasicBlock of the Region.
   /// @return The exit BasicBlock of the Region, NULL if this is the TopLevel
   ///         Region.
-  BasicBlock *getExit() const { return exit; }
+  BlockT *getExit() const { return exit; }
 
   /// @brief Get the parent of the Region.
   /// @return The parent of the Region or NULL if this is a top level
   ///         Region.
-  Region *getParent() const { return RegionNode::getParent(); }
+  RegionT *getParent() const {
+    return RegionNodeBase<Tr>::getParent();
+  }
 
   /// @brief Get the RegionNode representing the current Region.
   /// @return The RegionNode representing the current Region.
-  RegionNode* getNode() const {
-    return const_cast<RegionNode*>(reinterpret_cast<const RegionNode*>(this));
+  RegionNodeT *getNode() const {
+    return const_cast<RegionNodeT *>(
+        reinterpret_cast<const RegionNodeT *>(this));
   }
 
   /// @brief Get the nesting level of this Region.
@@ -330,21 +383,21 @@
   /// @return A region also starting at getEntry(), but reaching to the next
   ///         basic block that forms with getEntry() a (non-canonical) region.
   ///         NULL if such a basic block does not exist.
-  Region *getExpandedRegion() const;
+  RegionT *getExpandedRegion() const;
 
   /// @brief Return the first block of this region's single entry edge,
   ///        if existing.
   ///
   /// @return The BasicBlock starting this region's single entry edge,
   ///         else NULL.
-  BasicBlock *getEnteringBlock() const;
+  BlockT *getEnteringBlock() const;
 
   /// @brief Return the first block of this region's single exit edge,
   ///        if existing.
   ///
   /// @return The BasicBlock starting this region's single exit edge,
   ///         else NULL.
-  BasicBlock *getExitingBlock() const;
+  BlockT *getExitingBlock() const;
 
   /// @brief Is this a simple region?
   ///
@@ -358,50 +411,50 @@
   std::string getNameStr() const;
 
   /// @brief Return the RegionInfo object, that belongs to this Region.
-  RegionInfo *getRegionInfo() const {
-    return RI;
-  }
+  RegionInfoT *getRegionInfo() const { return RI; }
 
   /// PrintStyle - Print region in difference ways.
-  enum PrintStyle { PrintNone, PrintBB, PrintRN  };
+  enum PrintStyle { PrintNone, PrintBB, PrintRN };
 
   /// @brief Print the region.
   ///
   /// @param OS The output stream the Region is printed to.
   /// @param printTree Print also the tree of subregions.
   /// @param level The indentation level used for printing.
-  void print(raw_ostream& OS, bool printTree = true, unsigned level = 0,
-             enum PrintStyle Style = PrintNone) const;
+  void print(raw_ostream &OS, bool printTree = true, unsigned level = 0,
+             PrintStyle Style = PrintNone) const;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// @brief Print the region to stderr.
   void dump() const;
+#endif
 
   /// @brief Check if the region contains a BasicBlock.
   ///
   /// @param BB The BasicBlock that might be contained in this Region.
   /// @return True if the block is contained in the region otherwise false.
-  bool contains(const BasicBlock *BB) const;
+  bool contains(const BlockT *BB) const;
 
   /// @brief Check if the region contains another region.
   ///
   /// @param SubRegion The region that might be contained in this Region.
   /// @return True if SubRegion is contained in the region otherwise false.
-  bool contains(const Region *SubRegion) const {
+  bool contains(const RegionT *SubRegion) const {
     // Toplevel Region.
     if (!getExit())
       return true;
 
-    return contains(SubRegion->getEntry())
-      && (contains(SubRegion->getExit()) || SubRegion->getExit() == getExit());
+    return contains(SubRegion->getEntry()) &&
+           (contains(SubRegion->getExit()) ||
+            SubRegion->getExit() == getExit());
   }
 
   /// @brief Check if the region contains an Instruction.
   ///
   /// @param Inst The Instruction that might be contained in this region.
-  /// @return True if the Instruction is contained in the region otherwise false.
-  bool contains(const Instruction *Inst) const {
-    return contains(Inst->getParent());
-  }
+  /// @return True if the Instruction is contained in the region otherwise
+  /// false.
+  bool contains(const InstT *Inst) const { return contains(Inst->getParent()); }
 
   /// @brief Check if the region contains a loop.
   ///
@@ -410,7 +463,7 @@
   ///         In case a NULL pointer is passed to this function the result
   ///         is false, except for the region that describes the whole function.
   ///         In that case true is returned.
-  bool contains(const Loop *L) const;
+  bool contains(const LoopT *L) const;
 
   /// @brief Get the outermost loop in the region that contains a loop.
   ///
@@ -420,7 +473,7 @@
   /// @param L The loop the lookup is started.
   /// @return The outermost loop in the region, NULL if such a loop does not
   ///         exist or if the region describes the whole function.
-  Loop *outermostLoopInRegion(Loop *L) const;
+  LoopT *outermostLoopInRegion(LoopT *L) const;
 
   /// @brief Get the outermost loop in the region that contains a basic block.
   ///
@@ -431,13 +484,13 @@
   /// @param BB The basic block surrounded by the loop.
   /// @return The outermost loop in the region, NULL if such a loop does not
   ///         exist or if the region describes the whole function.
-  Loop *outermostLoopInRegion(LoopInfo *LI, BasicBlock* BB) const;
+  LoopT *outermostLoopInRegion(LoopInfoT *LI, BlockT *BB) const;
 
   /// @brief Get the subregion that starts at a BasicBlock
   ///
   /// @param BB The BasicBlock the subregion should start.
   /// @return The Subregion if available, otherwise NULL.
-  Region* getSubRegionNode(BasicBlock *BB) const;
+  RegionT *getSubRegionNode(BlockT *BB) const;
 
   /// @brief Get the RegionNode for a BasicBlock
   ///
@@ -445,32 +498,32 @@
   /// @return If available, the RegionNode that represents the subregion
   ///         starting at BB. If no subregion starts at BB, the RegionNode
   ///         representing BB.
-  RegionNode* getNode(BasicBlock *BB) const;
+  RegionNodeT *getNode(BlockT *BB) const;
 
   /// @brief Get the BasicBlock RegionNode for a BasicBlock
   ///
   /// @param BB The BasicBlock for which the RegionNode is requested.
   /// @return The RegionNode representing the BB.
-  RegionNode* getBBNode(BasicBlock *BB) const;
+  RegionNodeT *getBBNode(BlockT *BB) const;
 
   /// @brief Add a new subregion to this Region.
   ///
   /// @param SubRegion The new subregion that will be added.
   /// @param moveChildren Move the children of this region, that are also
   ///                     contained in SubRegion into SubRegion.
-  void addSubRegion(Region *SubRegion, bool moveChildren = false);
+  void addSubRegion(RegionT *SubRegion, bool moveChildren = false);
 
   /// @brief Remove a subregion from this Region.
   ///
   /// The subregion is not deleted, as it will probably be inserted into another
   /// region.
   /// @param SubRegion The SubRegion that will be removed.
-  Region *removeSubRegion(Region *SubRegion);
+  RegionT *removeSubRegion(RegionT *SubRegion);
 
   /// @brief Move all direct child nodes of this Region to another Region.
   ///
   /// @param To The Region the child nodes will be transferred to.
-  void transferChildrenTo(Region *To);
+  void transferChildrenTo(RegionT *To);
 
   /// @brief Verify if the region is a correct region.
   ///
@@ -489,8 +542,8 @@
   ///
   /// These iterators iterator over all subregions of this Region.
   //@{
-  typedef RegionSet::iterator iterator;
-  typedef RegionSet::const_iterator const_iterator;
+  typedef typename RegionSet::iterator iterator;
+  typedef typename RegionSet::const_iterator const_iterator;
 
   iterator begin() { return children.begin(); }
   iterator end() { return children.end(); }
@@ -507,18 +560,18 @@
   //@{
   template <bool IsConst>
   class block_iterator_wrapper
-      : public df_iterator<typename std::conditional<IsConst, const BasicBlock,
-                                                     BasicBlock>::type *> {
-    typedef df_iterator<typename std::conditional<IsConst, const BasicBlock,
-                                                  BasicBlock>::type *> super;
+      : public df_iterator<
+            typename std::conditional<IsConst, const BlockT, BlockT>::type *> {
+    typedef df_iterator<
+        typename std::conditional<IsConst, const BlockT, BlockT>::type *> super;
 
   public:
     typedef block_iterator_wrapper<IsConst> Self;
     typedef typename super::pointer pointer;
 
     // Construct the begin iterator.
-    block_iterator_wrapper(pointer Entry, pointer Exit) : super(df_begin(Entry))
-    {
+    block_iterator_wrapper(pointer Entry, pointer Exit)
+        : super(df_begin(Entry)) {
       // Mark the exit of the region as visited, so that the children of the
       // exit and the exit itself, i.e. the block outside the region will never
       // be visited.
@@ -526,35 +579,29 @@
     }
 
     // Construct the end iterator.
-    block_iterator_wrapper() : super(df_end<pointer>((BasicBlock *)nullptr)) {}
+    block_iterator_wrapper() : super(df_end<pointer>((BlockT *)nullptr)) {}
 
     /*implicit*/ block_iterator_wrapper(super I) : super(I) {}
 
     // FIXME: Even a const_iterator returns a non-const BasicBlock pointer.
     //        This was introduced for backwards compatibility, but should
     //        be removed as soon as all users are fixed.
-    BasicBlock *operator*() const {
-      return const_cast<BasicBlock*>(super::operator*());
+    BlockT *operator*() const {
+      return const_cast<BlockT *>(super::operator*());
     }
   };
 
   typedef block_iterator_wrapper<false> block_iterator;
-  typedef block_iterator_wrapper<true>  const_block_iterator;
+  typedef block_iterator_wrapper<true> const_block_iterator;
 
-  block_iterator block_begin() {
-   return block_iterator(getEntry(), getExit());
-  }
+  block_iterator block_begin() { return block_iterator(getEntry(), getExit()); }
 
-  block_iterator block_end() {
-   return block_iterator();
-  }
+  block_iterator block_end() { return block_iterator(); }
 
   const_block_iterator block_begin() const {
     return const_block_iterator(getEntry(), getExit());
   }
-  const_block_iterator block_end() const {
-    return const_block_iterator();
-  }
+  const_block_iterator block_end() const { return const_block_iterator(); }
 
   typedef iterator_range<block_iterator> block_range;
   typedef iterator_range<const_block_iterator> const_block_range;
@@ -578,12 +625,12 @@
   /// are direct children of this Region. It does not iterate over any
   /// RegionNodes that are also element of a subregion of this Region.
   //@{
-  typedef df_iterator<RegionNode*, SmallPtrSet<RegionNode*, 8>, false,
-                      GraphTraits<RegionNode*> > element_iterator;
+  typedef df_iterator<RegionNodeT *, SmallPtrSet<RegionNodeT *, 8>, false,
+                      GraphTraits<RegionNodeT *>> element_iterator;
 
-  typedef df_iterator<const RegionNode*, SmallPtrSet<const RegionNode*, 8>,
-                      false, GraphTraits<const RegionNode*> >
-            const_element_iterator;
+  typedef df_iterator<const RegionNodeT *, SmallPtrSet<const RegionNodeT *, 8>,
+                      false,
+                      GraphTraits<const RegionNodeT *>> const_element_iterator;
 
   element_iterator element_begin();
   element_iterator element_end();
@@ -593,132 +640,145 @@
   //@}
 };
 
+/// Print a RegionNode.
+template <class Tr>
+inline raw_ostream &operator<<(raw_ostream &OS, const RegionNodeBase<Tr> &Node);
+
 //===----------------------------------------------------------------------===//
 /// @brief Analysis that detects all canonical Regions.
 ///
 /// The RegionInfo pass detects all canonical regions in a function. The Regions
 /// are connected using the parent relation. This builds a Program Structure
 /// Tree.
-class RegionInfo : public FunctionPass {
-  typedef DenseMap<BasicBlock*,BasicBlock*> BBtoBBMap;
-  typedef DenseMap<BasicBlock*, Region*> BBtoRegionMap;
-  typedef SmallPtrSet<Region*, 4> RegionSet;
+template <class Tr>
+class RegionInfoBase {
+  typedef typename Tr::BlockT BlockT;
+  typedef typename Tr::FuncT FuncT;
+  typedef typename Tr::RegionT RegionT;
+  typedef typename Tr::RegionInfoT RegionInfoT;
+  typedef typename Tr::DomTreeT DomTreeT;
+  typedef typename Tr::DomTreeNodeT DomTreeNodeT;
+  typedef typename Tr::PostDomTreeT PostDomTreeT;
+  typedef typename Tr::DomFrontierT DomFrontierT;
+  typedef GraphTraits<BlockT *> BlockTraits;
+  typedef GraphTraits<Inverse<BlockT *>> InvBlockTraits;
+  typedef typename BlockTraits::ChildIteratorType SuccIterTy;
+  typedef typename InvBlockTraits::ChildIteratorType PredIterTy;
 
-  RegionInfo(const RegionInfo &) LLVM_DELETED_FUNCTION;
-  const RegionInfo &operator=(const RegionInfo &) LLVM_DELETED_FUNCTION;
+  friend class RegionInfo;
+  friend class MachineRegionInfo;
+  typedef DenseMap<BlockT *, BlockT *> BBtoBBMap;
+  typedef DenseMap<BlockT *, RegionT *> BBtoRegionMap;
+  typedef SmallPtrSet<RegionT *, 4> RegionSet;
 
-  DominatorTree *DT;
-  PostDominatorTree *PDT;
-  DominanceFrontier *DF;
+  RegionInfoBase();
+  virtual ~RegionInfoBase();
+
+  RegionInfoBase(const RegionInfoBase &) LLVM_DELETED_FUNCTION;
+  const RegionInfoBase &operator=(const RegionInfoBase &) LLVM_DELETED_FUNCTION;
+
+  DomTreeT *DT;
+  PostDomTreeT *PDT;
+  DomFrontierT *DF;
 
   /// The top level region.
-  Region *TopLevelRegion;
+  RegionT *TopLevelRegion;
 
+private:
   /// Map every BB to the smallest region, that contains BB.
   BBtoRegionMap BBtoRegion;
 
   // isCommonDomFrontier - Returns true if BB is in the dominance frontier of
   // entry, because it was inherited from exit. In the other case there is an
   // edge going from entry to BB without passing exit.
-  bool isCommonDomFrontier(BasicBlock* BB, BasicBlock* entry,
-                           BasicBlock* exit) const;
+  bool isCommonDomFrontier(BlockT *BB, BlockT *entry, BlockT *exit) const;
 
   // isRegion - Check if entry and exit surround a valid region, based on
   // dominance tree and dominance frontier.
-  bool isRegion(BasicBlock* entry, BasicBlock* exit) const;
+  bool isRegion(BlockT *entry, BlockT *exit) const;
 
   // insertShortCut - Saves a shortcut pointing from entry to exit.
   // This function may extend this shortcut if possible.
-  void insertShortCut(BasicBlock* entry, BasicBlock* exit,
-                      BBtoBBMap* ShortCut) const;
+  void insertShortCut(BlockT *entry, BlockT *exit, BBtoBBMap *ShortCut) const;
 
   // getNextPostDom - Returns the next BB that postdominates N, while skipping
   // all post dominators that cannot finish a canonical region.
-  DomTreeNode *getNextPostDom(DomTreeNode* N, BBtoBBMap *ShortCut) const;
+  DomTreeNodeT *getNextPostDom(DomTreeNodeT *N, BBtoBBMap *ShortCut) const;
 
   // isTrivialRegion - A region is trivial, if it contains only one BB.
-  bool isTrivialRegion(BasicBlock *entry, BasicBlock *exit) const;
+  bool isTrivialRegion(BlockT *entry, BlockT *exit) const;
 
   // createRegion - Creates a single entry single exit region.
-  Region *createRegion(BasicBlock *entry, BasicBlock *exit);
+  RegionT *createRegion(BlockT *entry, BlockT *exit);
 
   // findRegionsWithEntry - Detect all regions starting with bb 'entry'.
-  void findRegionsWithEntry(BasicBlock *entry, BBtoBBMap *ShortCut);
+  void findRegionsWithEntry(BlockT *entry, BBtoBBMap *ShortCut);
 
   // scanForRegions - Detects regions in F.
-  void scanForRegions(Function &F, BBtoBBMap *ShortCut);
+  void scanForRegions(FuncT &F, BBtoBBMap *ShortCut);
 
   // getTopMostParent - Get the top most parent with the same entry block.
-  Region *getTopMostParent(Region *region);
+  RegionT *getTopMostParent(RegionT *region);
 
   // buildRegionsTree - build the region hierarchy after all region detected.
-  void buildRegionsTree(DomTreeNode *N, Region *region);
-
-  // Calculate - detecte all regions in function and build the region tree.
-  void Calculate(Function& F);
-
-  void releaseMemory() override;
+  void buildRegionsTree(DomTreeNodeT *N, RegionT *region);
 
   // updateStatistics - Update statistic about created regions.
-  void updateStatistics(Region *R);
+  virtual void updateStatistics(RegionT *R) = 0;
 
-  // isSimple - Check if a region is a simple region with exactly one entry
-  // edge and exactly one exit edge.
-  bool isSimple(Region* R) const;
+  // calculate - detect all regions in function and build the region tree.
+  void calculate(FuncT &F);
 
 public:
-  static char ID;
-  explicit RegionInfo();
+  static bool VerifyRegionInfo;
+  static typename RegionT::PrintStyle printStyle;
 
-  ~RegionInfo();
+  void print(raw_ostream &OS) const;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() const;
+#endif
 
-  /// @name FunctionPass interface
-  //@{
-  bool runOnFunction(Function &F) override;
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-  void print(raw_ostream &OS, const Module *) const override;
-  void verifyAnalysis() const override;
-  //@}
+  void releaseMemory();
 
   /// @brief Get the smallest region that contains a BasicBlock.
   ///
   /// @param BB The basic block.
   /// @return The smallest region, that contains BB or NULL, if there is no
   /// region containing BB.
-  Region *getRegionFor(BasicBlock *BB) const;
+  RegionT *getRegionFor(BlockT *BB) const;
 
   /// @brief  Set the smallest region that surrounds a basic block.
   ///
   /// @param BB The basic block surrounded by a region.
   /// @param R The smallest region that surrounds BB.
-  void setRegionFor(BasicBlock *BB, Region *R);
+  void setRegionFor(BlockT *BB, RegionT *R);
 
   /// @brief A shortcut for getRegionFor().
   ///
   /// @param BB The basic block.
   /// @return The smallest region, that contains BB or NULL, if there is no
   /// region containing BB.
-  Region *operator[](BasicBlock *BB) const;
+  RegionT *operator[](BlockT *BB) const;
 
   /// @brief Return the exit of the maximal refined region, that starts at a
   /// BasicBlock.
   ///
   /// @param BB The BasicBlock the refined region starts.
-  BasicBlock *getMaxRegionExit(BasicBlock *BB) const;
+  BlockT *getMaxRegionExit(BlockT *BB) const;
 
   /// @brief Find the smallest region that contains two regions.
   ///
   /// @param A The first region.
   /// @param B The second region.
   /// @return The smallest region containing A and B.
-  Region *getCommonRegion(Region* A, Region *B) const;
+  RegionT *getCommonRegion(RegionT *A, RegionT *B) const;
 
   /// @brief Find the smallest region that contains two basic blocks.
   ///
   /// @param A The first basic block.
   /// @param B The second basic block.
   /// @return The smallest region that contains A and B.
-  Region* getCommonRegion(BasicBlock* A, BasicBlock *B) const {
+  RegionT *getCommonRegion(BlockT *A, BlockT *B) const {
     return getCommonRegion(getRegionFor(A), getRegionFor(B));
   }
 
@@ -726,23 +786,21 @@
   ///
   /// @param Regions A vector of regions.
   /// @return The smallest region that contains all regions in Regions.
-  Region* getCommonRegion(SmallVectorImpl<Region*> &Regions) const;
+  RegionT *getCommonRegion(SmallVectorImpl<RegionT *> &Regions) const;
 
   /// @brief Find the smallest region that contains a set of basic blocks.
   ///
   /// @param BBs A vector of basic blocks.
   /// @return The smallest region that contains all basic blocks in BBS.
-  Region* getCommonRegion(SmallVectorImpl<BasicBlock*> &BBs) const;
+  RegionT *getCommonRegion(SmallVectorImpl<BlockT *> &BBs) const;
 
-  Region *getTopLevelRegion() const {
-    return TopLevelRegion;
-  }
+  RegionT *getTopLevelRegion() const { return TopLevelRegion; }
 
   /// @brief Update RegionInfo after a basic block was split.
   ///
   /// @param NewBB The basic block that was created before OldBB.
   /// @param OldBB The old basic block.
-  void splitBlock(BasicBlock* NewBB, BasicBlock *OldBB);
+  void splitBlock(BlockT *NewBB, BlockT *OldBB);
 
   /// @brief Clear the Node Cache for all Regions.
   ///
@@ -751,14 +809,104 @@
     if (TopLevelRegion)
       TopLevelRegion->clearNodeCache();
   }
+
+  void verifyAnalysis() const;
 };
 
-inline raw_ostream &operator<<(raw_ostream &OS, const RegionNode &Node) {
-  if (Node.isSubRegion())
-    return OS << Node.getNodeAs<Region>()->getNameStr();
-  else
-    return OS << Node.getNodeAs<BasicBlock>()->getName();
+class Region;
+
+class RegionNode : public RegionNodeBase<RegionTraits<Function>> {
+public:
+  RegionNode(Region *Parent, BasicBlock *Entry, bool isSubRegion = false)
+      : RegionNodeBase<RegionTraits<Function>>(Parent, Entry, isSubRegion) {}
+  // Empty, user-provided destructor.
+  ~RegionNode() {}
+  // Identity comparison: true only if RN occupies this node's address.
+  bool operator==(const Region &RN) const {
+    return reinterpret_cast<const RegionNode *>(&RN) == this;
+  }
+};
+
+class Region : public RegionBase<RegionTraits<Function>> {
+public:
+  Region(BasicBlock *Entry, BasicBlock *Exit, RegionInfo *RI, DominatorTree *DT,
+         Region *Parent = nullptr);
+  ~Region();
+  // Identity comparison: true only if RN is this very object viewed as a node.
+  bool operator==(const RegionNode &RN) const {
+    return reinterpret_cast<const RegionNode *>(this) == &RN;
+  }
+};
+
+class RegionInfo : public RegionInfoBase<RegionTraits<Function>> {
+public:
+  explicit RegionInfo();
+  // Overrides the virtual destructor declared by RegionInfoBase.
+  ~RegionInfo() override;
+
+  // updateStatistics - Update statistics about created regions.
+  void updateStatistics(Region *R) final;
+  // recalculate - Defined out of line; presumably rebuilds the regions of F.
+  void recalculate(Function &F, DominatorTree *DT, PostDominatorTree *PDT,
+                   DominanceFrontier *DF);
+};
+
+class RegionInfoPass : public FunctionPass {
+  RegionInfo RI; // Held by value; exposed through getRegionInfo().
+
+public:
+  static char ID;
+  explicit RegionInfoPass();
+
+  ~RegionInfoPass();
+  /// @brief Mutable access to the RegionInfo member held by this pass.
+  RegionInfo &getRegionInfo() { return RI; }
+  /// @brief Read-only access to the RegionInfo member held by this pass.
+  const RegionInfo &getRegionInfo() const { return RI; }
+
+  /// @name FunctionPass interface
+  //@{
+  bool runOnFunction(Function &F) override;
+  void releaseMemory() override;
+  void verifyAnalysis() const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void print(raw_ostream &OS, const Module *) const override;
+  void dump() const; // Not marked override, unlike the rest of this group.
+  //@}
+};
+
+template <>
+template <>
+inline BasicBlock *
+RegionNodeBase<RegionTraits<Function>>::getNodeAs<BasicBlock>() const {
+  assert(!isSubRegion() && "This is not a BasicBlock RegionNode!");
+  return entry.getPointer();
 }
+
+template <>
+template <>
+inline Region *
+RegionNodeBase<RegionTraits<Function>>::getNodeAs<Region>() const {
+  assert(isSubRegion() && "This is not a subregion RegionNode!");
+  return reinterpret_cast<Region *>(
+      const_cast<RegionNodeBase<RegionTraits<Function>> *>(this));
+}
+
+template <class Tr>
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const RegionNodeBase<Tr> &Node) {
+  // A node wrapping a single block is printed through the block's getName().
+  if (!Node.isSubRegion())
+    return OS << Node.template getNodeAs<typename Tr::BlockT>()->getName();
+
+  // Otherwise the node is a subregion: print the string that the region's
+  // getNameStr() produces.
+  return OS << Node.template getNodeAs<typename Tr::RegionT>()->getNameStr();
+}
+
+EXTERN_TEMPLATE_INSTANTIATION(class RegionBase<RegionTraits<Function>>);
+EXTERN_TEMPLATE_INSTANTIATION(class RegionNodeBase<RegionTraits<Function>>);
+EXTERN_TEMPLATE_INSTANTIATION(class RegionInfoBase<RegionTraits<Function>>);
+
 } // End llvm namespace
 #endif
-

diff --git a/include/llvm/Analysis/RegionInfoImpl.h b/include/llvm/Analysis/RegionInfoImpl.h
new file mode 100644
index 0000000..b5d0bb3
--- /dev/null
+++ b/include/llvm/Analysis/RegionInfoImpl.h

@@ -0,0 +1,923 @@
+//===- RegionInfoImpl.h - SESE region detection analysis --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Detects single entry single exit regions in the control flow graph.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_REGIONINFOIMPL_H
+#define LLVM_ANALYSIS_REGIONINFOIMPL_H
+
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <iterator>
+#include <set>
+
+namespace llvm {
+
+#define DEBUG_TYPE "region"
+
+//===----------------------------------------------------------------------===//
+/// RegionBase Implementation
+// Builds a region for Entry/Exit below Parent and stores the analysis handle
+// (RInfo) and the dominator tree (dt) in the RI and DT members.
+// NOTE(review): the literal 1 handed to RegionNodeBase is presumably its
+// "is a subregion" flag — confirm against RegionNodeBase's constructor.
+template <class Tr>
+RegionBase<Tr>::RegionBase(BlockT *Entry, BlockT *Exit,
+                           typename Tr::RegionInfoT *RInfo, DomTreeT *dt,
+                           RegionT *Parent)
+    : RegionNodeBase<Tr>(Parent, Entry, 1), RI(RInfo), DT(dt), exit(Exit) {}
+
+template <class Tr>
+RegionBase<Tr>::~RegionBase() {
+  // Release every cached basic-block node held in this region's map.
+  for (auto &Cached : BBNodeMap)
+    delete Cached.second;
+
+  // Child regions empty their own caches when they are deleted, so only this
+  // region's map is cleared here.
+  BBNodeMap.clear();
+}
+
+// Points this region's entry at BB. Only this region is changed; child
+// regions are handled by replaceEntryRecursive().
+template <class Tr>
+void RegionBase<Tr>::replaceEntry(BlockT *BB) {
+  this->entry.setPointer(BB);
+}
+
+// Replaces this region's exit block with BB. Asserts that the region
+// currently has an exit. Only this region is changed.
+template <class Tr>
+void RegionBase<Tr>::replaceExit(BlockT *BB) {
+  assert(exit && "No exit to replace!");
+  exit = BB;
+}
+
+// Replaces the entry of this region and, transitively, of every child region
+// that shared the old entry.
+template <class Tr>
+void RegionBase<Tr>::replaceEntryRecursive(BlockT *NewEntry) {
+  // Only descendants that start at the entry being replaced are rewritten.
+  BlockT *PrevEntry = getEntry();
+
+  std::vector<RegionT *> Worklist;
+  Worklist.push_back(static_cast<RegionT *>(this));
+
+  while (!Worklist.empty()) {
+    RegionT *Cur = Worklist.back();
+    Worklist.pop_back();
+
+    Cur->replaceEntry(NewEntry);
+    for (const std::unique_ptr<RegionT> &Child : *Cur)
+      if (Child->getEntry() == PrevEntry)
+        Worklist.push_back(Child.get());
+  }
+}
+
+// Replaces the exit of this region and, transitively, of every child region
+// that shared the old exit.
+template <class Tr>
+void RegionBase<Tr>::replaceExitRecursive(BlockT *NewExit) {
+  // Only descendants that end at the exit being replaced are rewritten.
+  BlockT *PrevExit = getExit();
+
+  std::vector<RegionT *> Worklist;
+  Worklist.push_back(static_cast<RegionT *>(this));
+
+  while (!Worklist.empty()) {
+    RegionT *Cur = Worklist.back();
+    Worklist.pop_back();
+
+    Cur->replaceExit(NewExit);
+    for (const std::unique_ptr<RegionT> &Child : *Cur)
+      if (Child->getExit() == PrevExit)
+        Worklist.push_back(Child.get());
+  }
+}
+
+// Tells whether block B belongs to this region, using only dominance queries.
+template <class Tr>
+bool RegionBase<Tr>::contains(const BlockT *B) const {
+  BlockT *BB = const_cast<BlockT *>(B);
+
+  // A block without a dominator-tree node is never part of a region.
+  if (!DT->getNode(BB))
+    return false;
+
+  BlockT *EntryBB = getEntry();
+  BlockT *ExitBB = getExit();
+
+  // No exit means this is the top-level region.
+  if (!ExitBB)
+    return true;
+
+  // The entry has to dominate BB ...
+  if (!DT->dominates(EntryBB, BB))
+    return false;
+
+  // ... and BB must not be dominated by an exit that the entry dominates.
+  return !(DT->dominates(ExitBB, BB) && DT->dominates(EntryBB, ExitBB));
+}
+
+// Tells whether loop L lies inside this region.
+template <class Tr>
+bool RegionBase<Tr>::contains(const LoopT *L) const {
+  // The NULL loop stands for all blocks that are in no loop. It belongs to a
+  // region only if that region describes the whole function (no exit).
+  if (!L)
+    return !getExit();
+
+  // The loop header has to be inside the region ...
+  if (!contains(L->getHeader()))
+    return false;
+
+  // ... and so does every exiting block of the loop.
+  SmallVector<BlockT *, 8> Exiting;
+  L->getExitingBlocks(Exiting);
+  for (BlockT *Block : Exiting)
+    if (!contains(Block))
+      return false;
+
+  return true;
+}
+
+// Walks up from L through its parent loops while the parent is still
+// contained in this region and returns the last loop reached. Returns nullptr
+// when L itself is not contained.
+// NOTE(review): contains(nullptr) is true for a region without an exit, so
+// for such a region the walk steps onto the null parent and this returns
+// nullptr — confirm that this is the intended result.
+template <class Tr>
+typename Tr::LoopT *RegionBase<Tr>::outermostLoopInRegion(LoopT *L) const {
+  if (!contains(L))
+    return nullptr;
+
+  while (L && contains(L->getParentLoop())) {
+    L = L->getParentLoop();
+  }
+
+  return L;
+}
+
+// Convenience overload: looks up the loop of BB in LI and forwards to the
+// loop-based overload.
+template <class Tr>
+typename Tr::LoopT *RegionBase<Tr>::outermostLoopInRegion(LoopInfoT *LI,
+                                                          BlockT *BB) const {
+  assert(LI && BB && "LI and BB cannot be null!");
+  LoopT *InnerLoop = LI->getLoopFor(BB);
+  LoopT *Outermost = outermostLoopInRegion(InnerLoop);
+  return Outermost;
+}
+
+// Returns the single predecessor of the entry that lies outside the region,
+// or nullptr when there is none or more than one.
+template <class Tr>
+typename RegionBase<Tr>::BlockT *RegionBase<Tr>::getEnteringBlock() const {
+  BlockT *Header = getEntry();
+  BlockT *Entering = nullptr;
+
+  for (PredIterTy PI = InvBlockTraits::child_begin(Header),
+                  PE = InvBlockTraits::child_end(Header);
+       PI != PE; ++PI) {
+    BlockT *Pred = *PI;
+
+    // Skip predecessors unknown to the dominator tree and those inside the
+    // region.
+    if (!DT->getNode(Pred) || contains(Pred))
+      continue;
+
+    // A second outside predecessor means there is no unique entering block.
+    if (Entering)
+      return nullptr;
+
+    Entering = Pred;
+  }
+
+  return Entering;
+}
+
+// Returns the single predecessor of the exit that lies inside the region, or
+// nullptr when the region has no exit or no unique such predecessor.
+template <class Tr>
+typename RegionBase<Tr>::BlockT *RegionBase<Tr>::getExitingBlock() const {
+  BlockT *ExitBB = getExit();
+  if (!ExitBB)
+    return nullptr;
+
+  BlockT *Exiting = nullptr;
+  for (PredIterTy PI = InvBlockTraits::child_begin(ExitBB),
+                  PE = InvBlockTraits::child_end(ExitBB);
+       PI != PE; ++PI) {
+    BlockT *Pred = *PI;
+
+    // Only predecessors inside the region count.
+    if (!contains(Pred))
+      continue;
+
+    // A second inside predecessor means there is no unique exiting block.
+    if (Exiting)
+      return nullptr;
+
+    Exiting = Pred;
+  }
+
+  return Exiting;
+}
+
+// A region is simple when it is not the top-level region and has both a
+// unique entering block and a unique exiting block.
+template <class Tr>
+bool RegionBase<Tr>::isSimple() const {
+  if (isTopLevelRegion())
+    return false;
+
+  return getEnteringBlock() && getExitingBlock();
+}
+
+// Builds the display name "<entry> => <exit>". Unnamed blocks are rendered
+// with printAsOperand(); a missing exit is shown as "<Function Return>".
+template <class Tr>
+std::string RegionBase<Tr>::getNameStr() const {
+  std::string entryName;
+  if (!getEntry()->getName().empty()) {
+    entryName = getEntry()->getName();
+  } else {
+    // The stream writes into entryName and is flushed when it goes out of
+    // scope at the end of this block.
+    raw_string_ostream OS(entryName);
+
+    getEntry()->printAsOperand(OS, false);
+  }
+
+  std::string exitName;
+  if (!getExit()) {
+    exitName = "<Function Return>";
+  } else if (!getExit()->getName().empty()) {
+    exitName = getExit()->getName();
+  } else {
+    raw_string_ostream OS(exitName);
+
+    getExit()->printAsOperand(OS, false);
+  }
+
+  return entryName + " => " + exitName;
+}
+
+// Checks that BB is a well-formed member of this region and aborts through
+// llvm_unreachable() otherwise.
+template <class Tr>
+void RegionBase<Tr>::verifyBBInRegion(BlockT *BB) const {
+  if (!contains(BB))
+    llvm_unreachable("Broken region found!");
+
+  BlockT *EntryBB = getEntry();
+  BlockT *ExitBB = getExit();
+
+  // Every successor must stay inside the region or be the region's exit.
+  for (SuccIterTy SI = BlockTraits::child_begin(BB),
+                  SE = BlockTraits::child_end(BB);
+       SI != SE; ++SI) {
+    if (contains(*SI) || ExitBB == *SI)
+      continue;
+    llvm_unreachable("Broken region found!");
+  }
+
+  // Predecessors are only checked for blocks other than the entry.
+  if (EntryBB == BB)
+    return;
+
+  for (PredIterTy PI = InvBlockTraits::child_begin(BB),
+                  PE = InvBlockTraits::child_end(BB);
+       PI != PE; ++PI) {
+    if (!contains(*PI))
+      llvm_unreachable("Broken region found!");
+  }
+}
+
+// Depth-first walk over the successors of BB that verifies every block
+// reached; the walk stops at the region's exit and at blocks already visited.
+template <class Tr>
+void RegionBase<Tr>::verifyWalk(BlockT *BB, std::set<BlockT *> *visited) const {
+  BlockT *ExitBB = getExit();
+
+  visited->insert(BB);
+  verifyBBInRegion(BB);
+
+  for (SuccIterTy SI = BlockTraits::child_begin(BB),
+                  SE = BlockTraits::child_end(BB);
+       SI != SE; ++SI) {
+    if (*SI == ExitBB || visited->count(*SI))
+      continue;
+    verifyWalk(*SI, visited);
+  }
+}
+
+// Verifies this region by walking it from its entry block.
+template <class Tr>
+void RegionBase<Tr>::verifyRegion() const {
+  // The walk is expensive and would otherwise run whenever
+  // PMDataManager::verifyPreservedAnalysis is invoked after a region pass
+  // (marked PreservedAll) finishes, so it only runs when the user asked for
+  // verification.
+  if (RegionInfoBase<Tr>::VerifyRegionInfo) {
+    std::set<BlockT *> Seen;
+    verifyWalk(getEntry(), &Seen);
+  }
+}
+
+// Verifies all child regions recursively first, then this region itself.
+template <class Tr>
+void RegionBase<Tr>::verifyRegionNest() const {
+  for (const std::unique_ptr<RegionT> &Child : *this)
+    Child->verifyRegionNest();
+
+  verifyRegion();
+}
+
+// Begin iterator over the region's elements, obtained from the GraphTraits
+// of RegionT.
+template <class Tr>
+typename RegionBase<Tr>::element_iterator RegionBase<Tr>::element_begin() {
+  RegionT *Self = static_cast<RegionT *>(this);
+  return GraphTraits<RegionT *>::nodes_begin(Self);
+}
+
+// End iterator matching element_begin().
+template <class Tr>
+typename RegionBase<Tr>::element_iterator RegionBase<Tr>::element_end() {
+  RegionT *Self = static_cast<RegionT *>(this);
+  return GraphTraits<RegionT *>::nodes_end(Self);
+}
+
+// Const begin iterator over the region's elements.
+template <class Tr>
+typename RegionBase<Tr>::const_element_iterator
+RegionBase<Tr>::element_begin() const {
+  const RegionT *Self = static_cast<const RegionT *>(this);
+  return GraphTraits<const RegionT *>::nodes_begin(Self);
+}
+
+// Const end iterator over the region's elements.
+template <class Tr>
+typename RegionBase<Tr>::const_element_iterator
+RegionBase<Tr>::element_end() const {
+  const RegionT *Self = static_cast<const RegionT *>(this);
+  return GraphTraits<const RegionT *>::nodes_end(Self);
+}
+
+// Returns the direct-child-level region that starts at BB, or nullptr when BB
+// maps to no region, maps to this region, or is not the entry of the region
+// found.
+template <class Tr>
+typename Tr::RegionT *RegionBase<Tr>::getSubRegionNode(BlockT *BB) const {
+  typedef typename Tr::RegionT RegionT;
+  RegionT *Sub = RI->getRegionFor(BB);
+
+  if (!Sub || Sub == this)
+    return nullptr;
+
+  // If we pass the BB out of this region, that means our code is broken.
+  assert(contains(Sub) && "BB not in current region!");
+
+  // Climb until the parent is this region or is no longer contained in it.
+  while (contains(Sub->getParent()) && Sub->getParent() != this)
+    Sub = Sub->getParent();
+
+  return Sub->getEntry() == BB ? Sub : nullptr;
+}
+
+// Returns the region node for basic block BB, creating and caching it in
+// BBNodeMap on first use.
+template <class Tr>
+typename Tr::RegionNodeT *RegionBase<Tr>::getBBNode(BlockT *BB) const {
+  assert(contains(BB) && "Can get BB node out of this region!");
+
+  // Reuse the cached node when one exists for BB.
+  typename BBNodeMapT::const_iterator Cached = BBNodeMap.find(BB);
+  if (Cached != BBNodeMap.end())
+    return Cached->second;
+
+  // The new node takes a non-const parent pointer, hence the const_cast.
+  RegionT *Self = static_cast<RegionT *>(const_cast<RegionBase<Tr> *>(this));
+  RegionNodeT *Fresh = new RegionNodeT(Self, BB);
+  BBNodeMap.insert(std::make_pair(BB, Fresh));
+  return Fresh;
+}
+
+// Returns the node representing BB in this region: the subregion node when a
+// child region starts at BB, the basic-block node otherwise.
+template <class Tr>
+typename Tr::RegionNodeT *RegionBase<Tr>::getNode(BlockT *BB) const {
+  assert(contains(BB) && "Can get BB node out of this region!");
+
+  RegionT *Child = getSubRegionNode(BB);
+  if (!Child)
+    return getBBNode(BB);
+
+  return Child->getNode();
+}
+
+// Moves every child region of this region over to To and leaves this
+// region's child list empty.
+template <class Tr>
+void RegionBase<Tr>::transferChildrenTo(RegionT *To) {
+  for (std::unique_ptr<RegionT> &Child : *this) {
+    // Re-parent first; the unique_ptr is empty once it has been moved from.
+    Child->parent = To;
+    To->children.push_back(std::move(Child));
+  }
+  children.clear();
+}
+
+// Makes SubRegion a child of this region and takes ownership of it (it is
+// wrapped in a std::unique_ptr). With moveChildren set, the blocks and child
+// regions of this region that SubRegion contains are handed down to it.
+template <class Tr>
+void RegionBase<Tr>::addSubRegion(RegionT *SubRegion, bool moveChildren) {
+  assert(!SubRegion->parent && "SubRegion already has a parent!");
+  assert(std::find_if(begin(), end(), [&](const std::unique_ptr<RegionT> &R) {
+           return R.get() == SubRegion;
+         }) == children.end() &&
+         "Subregion already exists!");
+
+  SubRegion->parent = static_cast<RegionT *>(this);
+  children.push_back(std::unique_ptr<RegionT>(SubRegion));
+
+  if (!moveChildren)
+    return;
+
+  assert(SubRegion->children.empty() &&
+         "SubRegions that contain children are not supported");
+
+  // Re-map every basic-block element of this region that SubRegion contains.
+  for (element_iterator I = element_begin(), E = element_end(); I != E; ++I) {
+    if (!(*I)->isSubRegion()) {
+      BlockT *BB = (*I)->template getNodeAs<BlockT>();
+
+      if (SubRegion->contains(BB))
+        RI->setRegionFor(BB, SubRegion);
+    }
+  }
+
+  // Split the children: those inside SubRegion move below it, the others
+  // (including SubRegion itself) are collected in Keep.
+  std::vector<std::unique_ptr<RegionT>> Keep;
+  for (iterator I = begin(), E = end(); I != E; ++I) {
+    if (SubRegion->contains(I->get()) && I->get() != SubRegion) {
+      (*I)->parent = SubRegion;
+      SubRegion->children.push_back(std::move(*I));
+    } else
+      Keep.push_back(std::move(*I));
+  }
+
+  // All slots are moved-from now; rebuild the child list from Keep.
+  children.clear();
+  children.insert(
+      children.begin(),
+      std::move_iterator<typename RegionSet::iterator>(Keep.begin()),
+      std::move_iterator<typename RegionSet::iterator>(Keep.end()));
+}
+
+// Detaches Child from this region and returns it to the caller.
+//
+// The child list stores std::unique_ptr<RegionT>, so erasing a slot that
+// still owns the region would delete it and leave the returned pointer
+// dangling. Ownership is therefore released before the slot is erased; the
+// caller owns the returned region afterwards.
+template <class Tr>
+typename Tr::RegionT *RegionBase<Tr>::removeSubRegion(RegionT *Child) {
+  assert(Child->parent == this && "Child is not a child of this region!");
+  Child->parent = nullptr;
+  typename RegionSet::iterator I = std::find_if(
+      children.begin(), children.end(),
+      [&](const std::unique_ptr<RegionT> &R) { return R.get() == Child; });
+  assert(I != children.end() && "Region does not exist. Unable to remove.");
+  // Give up ownership first; otherwise erase() would destroy *Child.
+  I->release();
+  children.erase(I);
+  return Child;
+}
+
+// Counts the ancestors of this region; a region without a parent has
+// depth 0.
+template <class Tr>
+unsigned RegionBase<Tr>::getDepth() const {
+  unsigned Depth = 0;
+
+  RegionT *Ancestor = getParent();
+  while (Ancestor != nullptr) {
+    ++Depth;
+    Ancestor = Ancestor->getParent();
+  }
+
+  return Depth;
+}
+
+// Tries to build a region that keeps this region's entry but ends further
+// down than its exit. Returns a newly allocated RegionT (not inserted
+// anywhere by this function) or nullptr when none of the cases below apply.
+// NOTE(review): exit is used without a null check — presumably this is never
+// called on a region without an exit; confirm against callers.
+template <class Tr>
+typename Tr::RegionT *RegionBase<Tr>::getExpandedRegion() const {
+  unsigned NumSuccessors = Tr::getNumSuccessors(exit);
+
+  if (NumSuccessors == 0)
+    return nullptr;
+
+  // Every predecessor of the exit has to be dominated by the entry.
+  for (PredIterTy PI = InvBlockTraits::child_begin(getExit()),
+                  PE = InvBlockTraits::child_end(getExit());
+       PI != PE; ++PI) {
+    if (!DT->dominates(getEntry(), *PI))
+      return nullptr;
+  }
+
+  RegionT *R = RI->getRegionFor(exit);
+
+  // No region starts at the exit: expand only over a single successor.
+  if (R->getEntry() != exit) {
+    if (Tr::getNumSuccessors(exit) == 1)
+      return new RegionT(getEntry(), *BlockTraits::child_begin(exit), RI, DT);
+    return nullptr;
+  }
+
+  // Take the largest region that starts at the exit.
+  while (R->getParent() && R->getParent()->getEntry() == exit)
+    R = R->getParent();
+
+  if (!DT->dominates(getEntry(), R->getExit())) {
+    for (PredIterTy PI = InvBlockTraits::child_begin(getExit()),
+                    PE = InvBlockTraits::child_end(getExit());
+         PI != PE; ++PI) {
+      if (!DT->dominates(R->getExit(), *PI))
+        return nullptr;
+    }
+  }
+
+  return new RegionT(getEntry(), R->getExit(), RI, DT);
+}
+
+// Prints this region's name to OS, indented by two spaces per level. With
+// print_tree set, the level is shown in brackets and all child regions are
+// printed recursively one level deeper. Unless Style is PrintNone, the
+// region's blocks (PrintBB) or region nodes (PrintRN) are listed in braces.
+template <class Tr>
+void RegionBase<Tr>::print(raw_ostream &OS, bool print_tree, unsigned level,
+                           PrintStyle Style) const {
+  if (print_tree)
+    OS.indent(level * 2) << '[' << level << "] " << getNameStr();
+  else
+    OS.indent(level * 2) << getNameStr();
+
+  OS << '\n';
+
+  if (Style != PrintNone) {
+    OS.indent(level * 2) << "{\n";
+    OS.indent(level * 2 + 2);
+
+    if (Style == PrintBB) {
+      for (const auto &BB : blocks())
+        OS << BB->getName() << ", "; // TODO: remove the last ","
+    } else if (Style == PrintRN) {
+      for (const_element_iterator I = element_begin(), E = element_end();
+           I != E; ++I) {
+        OS << **I << ", "; // TODO: remove the last ",
+      }
+    }
+
+    OS << '\n';
+  }
+
+  if (print_tree) {
+    for (const_iterator RI = begin(), RE = end(); RI != RE; ++RI)
+      (*RI)->print(OS, print_tree, level + 1, Style);
+  }
+
+  // The closing brace comes after the children, so nested regions appear
+  // inside their parent's braces.
+  if (Style != PrintNone)
+    OS.indent(level * 2) << "} \n";
+}
+
+// Only compiled when NDEBUG is not defined or LLVM_ENABLE_DUMP is defined.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+// Prints this region as a tree to dbgs(), starting at the region's own depth
+// and using the print style configured on RegionInfoBase.
+template <class Tr>
+void RegionBase<Tr>::dump() const {
+  print(dbgs(), true, getDepth(), RegionInfoBase<Tr>::printStyle);
+}
+#endif
+
+// Frees the cached basic-block nodes of this region and, recursively, of all
+// child regions.
+template <class Tr>
+void RegionBase<Tr>::clearNodeCache() {
+  // Delete the nodes this region cached, then drop the stale map entries.
+  for (auto &Cached : BBNodeMap)
+    delete Cached.second;
+  BBNodeMap.clear();
+
+  for (std::unique_ptr<RegionT> &Child : *this)
+    Child->clearNodeCache();
+}
+
+//===----------------------------------------------------------------------===//
+// RegionInfoBase implementation
+//
+
+// Starts without a top-level region; TopLevelRegion is null until it is set
+// elsewhere.
+template <class Tr>
+RegionInfoBase<Tr>::RegionInfoBase()
+    : TopLevelRegion(nullptr) {}
+
+// Frees the region tree and the block-to-region map via releaseMemory().
+template <class Tr>
+RegionInfoBase<Tr>::~RegionInfoBase() {
+  releaseMemory();
+}
+
+// Returns false as soon as BB has a predecessor that is dominated by entry
+// but not by exit; returns true when no such predecessor exists.
+template <class Tr>
+bool RegionInfoBase<Tr>::isCommonDomFrontier(BlockT *BB, BlockT *entry,
+                                             BlockT *exit) const {
+  for (PredIterTy PI = InvBlockTraits::child_begin(BB),
+                  PE = InvBlockTraits::child_end(BB);
+       PI != PE; ++PI) {
+    BlockT *Pred = *PI;
+
+    // Predecessors not dominated by entry, or dominated by exit, are fine.
+    if (!DT->dominates(entry, Pred) || DT->dominates(exit, Pred))
+      continue;
+
+    return false;
+  }
+
+  return true;
+}
+
+// Decides whether (entry, exit) forms a region by comparing the dominance
+// frontiers of entry and exit.
+// NOTE(review): DF->find() results are dereferenced without comparing against
+// end() — presumably both blocks always have a frontier entry; confirm.
+template <class Tr>
+bool RegionInfoBase<Tr>::isRegion(BlockT *entry, BlockT *exit) const {
+  assert(entry && exit && "entry and exit must not be null!");
+  typedef typename DomFrontierT::DomSetType DST;
+
+  DST *entrySuccs = &DF->find(entry)->second;
+
+  // Exit is the header of a loop that contains the entry. In this case,
+  // the dominance frontier must only contain the exit.
+  if (!DT->dominates(entry, exit)) {
+    for (typename DST::iterator SI = entrySuccs->begin(),
+                                SE = entrySuccs->end();
+         SI != SE; ++SI) {
+      if (*SI != exit && *SI != entry)
+        return false;
+    }
+
+    return true;
+  }
+
+  DST *exitSuccs = &DF->find(exit)->second;
+
+  // Do not allow edges leaving the region.
+  for (typename DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end();
+       SI != SE; ++SI) {
+    if (*SI == exit || *SI == entry)
+      continue;
+    if (exitSuccs->find(*SI) == exitSuccs->end())
+      return false;
+    if (!isCommonDomFrontier(*SI, entry, exit))
+      return false;
+  }
+
+  // Do not allow edges pointing into the region.
+  for (typename DST::iterator SI = exitSuccs->begin(), SE = exitSuccs->end();
+       SI != SE; ++SI) {
+    if (DT->properlyDominates(entry, *SI) && *SI != exit)
+      return false;
+  }
+
+  return true;
+}
+
+// Records in ShortCut where a walk starting at entry may jump to.
+template <class Tr>
+void RegionInfoBase<Tr>::insertShortCut(BlockT *entry, BlockT *exit,
+                                        BBtoBBMap *ShortCut) const {
+  assert(entry && exit && "entry and exit must not be null!");
+
+  typename BBtoBBMap::iterator Next = ShortCut->find(exit);
+
+  // Without a further region at exit the shortcut ends at exit itself.
+  // Otherwise a region starts at exit, so (entry, Next->second) is a larger
+  // region than (entry, exit) and the larger one is recorded.
+  // The target is read before the map is modified below.
+  BlockT *Target = (Next == ShortCut->end()) ? exit : Next->second;
+  (*ShortCut)[entry] = Target;
+}
+
+// Returns the next node to visit when walking up the post-dominator tree
+// from N. If ShortCut has an entry for N's block, the walk continues from
+// the immediate dominator of that entry's target node instead.
+template <class Tr>
+typename Tr::DomTreeNodeT *
+RegionInfoBase<Tr>::getNextPostDom(DomTreeNodeT *N, BBtoBBMap *ShortCut) const {
+  typename BBtoBBMap::iterator Hit = ShortCut->find(N->getBlock());
+
+  if (Hit != ShortCut->end())
+    return PDT->getNode(Hit->second)->getIDom();
+
+  return N->getIDom();
+}
+
+// Returns true when (entry, exit) is a trivial region, i.e. entry has
+// exactly one successor and that successor is exit.
+template <class Tr>
+bool RegionInfoBase<Tr>::isTrivialRegion(BlockT *entry, BlockT *exit) const {
+  assert(entry && exit && "entry and exit must not be null!");
+
+  unsigned num_successors =
+      BlockTraits::child_end(entry) - BlockTraits::child_begin(entry);
+
+  // Require a successor before reading it: for a block without successors
+  // child_begin() equals child_end() and must not be dereferenced.
+  return num_successors == 1 && exit == *(BlockTraits::child_begin(entry));
+}
+
+// Allocates a region for (entry, exit), registers it in BBtoRegion under its
+// entry block and updates the statistics. Returns nullptr for trivial
+// regions, for which nothing is created.
+template <class Tr>
+typename Tr::RegionT *RegionInfoBase<Tr>::createRegion(BlockT *entry,
+                                                       BlockT *exit) {
+  assert(entry && exit && "entry and exit must not be null!");
+
+  if (isTrivialRegion(entry, exit))
+    return nullptr;
+
+  RegionT *region =
+      new RegionT(entry, exit, static_cast<RegionInfoT *>(this), DT);
+  BBtoRegion.insert(std::make_pair(entry, region));
+
+  // XDEBUG builds call verifyRegion() directly; other builds wrap the call in
+  // the DEBUG() macro.
+#ifdef XDEBUG
+  region->verifyRegion();
+#else
+  DEBUG(region->verifyRegion());
+#endif
+
+  updateStatistics(region);
+  return region;
+}
+
+// Finds the regions that start at entry by walking up the post-dominator
+// tree. Each region found becomes the parent of the previous one, and a
+// shortcut from entry to the last exit found is recorded in ShortCut.
+template <class Tr>
+void RegionInfoBase<Tr>::findRegionsWithEntry(BlockT *entry,
+                                              BBtoBBMap *ShortCut) {
+  assert(entry);
+
+  DomTreeNodeT *N = PDT->getNode(entry);
+  if (!N)
+    return;
+
+  RegionT *lastRegion = nullptr;
+  BlockT *lastExit = entry;
+
+  // As only a BasicBlock that postdominates entry can finish a region, walk the
+  // post dominance tree upwards.
+  while ((N = getNextPostDom(N, ShortCut))) {
+    BlockT *exit = N->getBlock();
+
+    if (!exit)
+      break;
+
+    if (isRegion(entry, exit)) {
+      // createRegion() yields nullptr for a trivial region; lastRegion is then
+      // reset to nullptr while lastExit still advances.
+      RegionT *newRegion = createRegion(entry, exit);
+
+      if (lastRegion)
+        newRegion->addSubRegion(lastRegion);
+
+      lastRegion = newRegion;
+      lastExit = exit;
+    }
+
+    // This can never be a region, so stop the search.
+    if (!DT->dominates(entry, exit))
+      break;
+  }
+
+  // Tried to create regions from entry to lastExit.  Next time take a
+  // shortcut from entry to lastExit.
+  if (lastExit != entry)
+    insertShortCut(entry, lastExit, ShortCut);
+}
+
+// Runs findRegionsWithEntry() for every block of F reachable in the
+// dominator tree from the function's entry node.
+template <class Tr>
+void RegionInfoBase<Tr>::scanForRegions(FuncT &F, BBtoBBMap *ShortCut) {
+  typedef typename std::add_pointer<FuncT>::type FuncPtrT;
+  BlockT *entry = GraphTraits<FuncPtrT>::getEntryNode(&F);
+  DomTreeNodeT *N = DT->getNode(entry);
+
+  // Iterate over the dominance tree in post order to start with the small
+  // regions from the bottom of the dominance tree.  If the small regions are
+  // detected first, detection of bigger regions is faster, as we can jump
+  // over the small regions.
+  for (po_iterator<DomTreeNodeT *> FI = po_begin(N), FE = po_end(N); FI != FE;
+       ++FI) {
+    findRegionsWithEntry(FI->getBlock(), ShortCut);
+  }
+}
+
+// Follows parent links from region until a region without a parent is
+// reached and returns that region.
+template <class Tr>
+typename Tr::RegionT *RegionInfoBase<Tr>::getTopMostParent(RegionT *region) {
+  RegionT *Top = region;
+
+  while (RegionT *Up = Top->getParent())
+    Top = Up;
+
+  return Top;
+}
+
+// Recursively walks the dominator tree below N. Regions already registered
+// in BBtoRegion are attached below the current region, and every other block
+// is mapped to the region it falls into.
+template <class Tr>
+void RegionInfoBase<Tr>::buildRegionsTree(DomTreeNodeT *N, RegionT *region) {
+  BlockT *BB = N->getBlock();
+
+  // Passed region exit
+  while (BB == region->getExit())
+    region = region->getParent();
+
+  typename BBtoRegionMap::iterator it = BBtoRegion.find(BB);
+
+  // This basic block is a start block of a region. It is already in the
+  // BBtoRegion relation. Only the child basic blocks have to be updated.
+  if (it != BBtoRegion.end()) {
+    RegionT *newRegion = it->second;
+    // Attach the outermost ancestor of newRegion, but continue the walk
+    // inside newRegion itself.
+    region->addSubRegion(getTopMostParent(newRegion));
+    region = newRegion;
+  } else {
+    BBtoRegion[BB] = region;
+  }
+
+  for (typename DomTreeNodeT::iterator CI = N->begin(), CE = N->end(); CI != CE;
+       ++CI) {
+    buildRegionsTree(*CI, region);
+  }
+}
+
+// Initial value of the VerifyRegionInfo flag read by
+// RegionBase::verifyRegion(): true in XDEBUG builds, false otherwise.
+#ifdef XDEBUG
+template <class Tr>
+bool RegionInfoBase<Tr>::VerifyRegionInfo = true;
+#else
+template <class Tr>
+bool RegionInfoBase<Tr>::VerifyRegionInfo = false;
+#endif
+
+// Initial print style used by print() and dump(): PrintNone.
+template <class Tr>
+typename Tr::RegionT::PrintStyle RegionInfoBase<Tr>::printStyle =
+    RegionBase<Tr>::PrintNone;
+
+// Prints the whole region tree, starting at TopLevelRegion, between a header
+// and a footer line. TopLevelRegion is dereferenced without a null check.
+template <class Tr>
+void RegionInfoBase<Tr>::print(raw_ostream &OS) const {
+  OS << "Region tree:\n";
+  TopLevelRegion->print(OS, true, 0, printStyle);
+  OS << "End region tree\n";
+}
+
+// Only compiled when NDEBUG is not defined or LLVM_ENABLE_DUMP is defined.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+// Prints the region tree to dbgs().
+template <class Tr>
+void RegionInfoBase<Tr>::dump() const { print(dbgs()); }
+#endif
+
+// Drops the block-to-region map and deletes the top-level region.
+template <class Tr>
+void RegionInfoBase<Tr>::releaseMemory() {
+  BBtoRegion.clear();
+
+  // delete on a null pointer is a no-op, so no guard is needed.
+  delete TopLevelRegion;
+  TopLevelRegion = nullptr;
+}
+
+// Verifies the whole region tree, starting at TopLevelRegion.
+template <class Tr>
+void RegionInfoBase<Tr>::verifyAnalysis() const {
+  TopLevelRegion->verifyRegionNest();
+}
+
+// Region pass manager support.
+// Looks BB up in BBtoRegion; returns nullptr when BB has no entry.
+template <class Tr>
+typename Tr::RegionT *RegionInfoBase<Tr>::getRegionFor(BlockT *BB) const {
+  typename BBtoRegionMap::const_iterator Pos = BBtoRegion.find(BB);
+
+  if (Pos == BBtoRegion.end())
+    return nullptr;
+
+  return Pos->second;
+}
+
+// Maps BB to region R in BBtoRegion, replacing any existing mapping.
+template <class Tr>
+void RegionInfoBase<Tr>::setRegionFor(BlockT *BB, RegionT *R) {
+  BBtoRegion[BB] = R;
+}
+
+// Shorthand for getRegionFor(BB).
+template <class Tr>
+typename Tr::RegionT *RegionInfoBase<Tr>::operator[](BlockT *BB) const {
+  return getRegionFor(BB);
+}
+
+// Starting at BB, repeatedly steps to the exit of the largest region that
+// starts at the current block (or to its only successor) and returns the
+// last exit reached. Returns nullptr if not even the first step is possible.
+template <class Tr>
+typename RegionInfoBase<Tr>::BlockT *
+RegionInfoBase<Tr>::getMaxRegionExit(BlockT *BB) const {
+  BlockT *Exit = nullptr;
+
+  while (true) {
+    // Get largest region that starts at BB.
+    RegionT *R = getRegionFor(BB);
+    while (R && R->getParent() && R->getParent()->getEntry() == BB)
+      R = R->getParent();
+
+    // Get the single exit of BB.
+    if (R && R->getEntry() == BB)
+      Exit = R->getExit();
+    else if (++BlockTraits::child_begin(BB) == BlockTraits::child_end(BB))
+      Exit = *BlockTraits::child_begin(BB);
+    else // No single exit exists.
+      return Exit;
+
+    // Get largest region that starts at Exit.
+    RegionT *ExitR = getRegionFor(Exit);
+    while (ExitR && ExitR->getParent() &&
+           ExitR->getParent()->getEntry() == Exit)
+      ExitR = ExitR->getParent();
+
+    // NOTE(review): the break below only leaves this for loop, so the loop
+    // does not influence the result. R and ExitR are also dereferenced
+    // without a null check here. Presumably the intent was to stop the
+    // outer walk — confirm before changing.
+    for (PredIterTy PI = InvBlockTraits::child_begin(Exit),
+                    PE = InvBlockTraits::child_end(Exit);
+         PI != PE; ++PI) {
+      if (!R->contains(*PI) && !ExitR->contains(*PI))
+        break;
+    }
+
+    // This stops infinite cycles.
+    if (DT->dominates(Exit, BB))
+      break;
+
+    BB = Exit;
+  }
+
+  return Exit;
+}
+
+// Returns A when A contains B; otherwise climbs from B through its parents
+// until a region containing A is found.
+template <class Tr>
+typename Tr::RegionT *RegionInfoBase<Tr>::getCommonRegion(RegionT *A,
+                                                          RegionT *B) const {
+  assert(A && B && "One of the Regions is NULL");
+
+  if (A->contains(B))
+    return A;
+
+  RegionT *Candidate = B;
+  while (!Candidate->contains(A))
+    Candidate = Candidate->getParent();
+
+  return Candidate;
+}
+
+// Folds the pairwise getCommonRegion() over all regions in Regions.
+// The last element is removed from Regions as a side effect; back() is
+// called without an emptiness check.
+template <class Tr>
+typename Tr::RegionT *
+RegionInfoBase<Tr>::getCommonRegion(SmallVectorImpl<RegionT *> &Regions) const {
+  RegionT *ret = Regions.back();
+  Regions.pop_back();
+
+  for (RegionT *R : Regions)
+    ret = getCommonRegion(ret, R);
+
+  return ret;
+}
+
+// Folds the pairwise getCommonRegion() over the regions of all blocks in BBs.
+// The last element is removed from BBs as a side effect; back() is called
+// without an emptiness check.
+template <class Tr>
+typename Tr::RegionT *
+RegionInfoBase<Tr>::getCommonRegion(SmallVectorImpl<BlockT *> &BBs) const {
+  RegionT *ret = getRegionFor(BBs.back());
+  BBs.pop_back();
+
+  for (BlockT *BB : BBs)
+    ret = getCommonRegion(ret, getRegionFor(BB));
+
+  return ret;
+}
+
+// Updates the block-to-region mapping for NewBB and OldBB. NewBB is mapped to
+// OldBB's region and becomes the entry of every non-top-level region that
+// started at OldBB; OldBB is then mapped to the region where that walk ended.
+template <class Tr>
+void RegionInfoBase<Tr>::splitBlock(BlockT *NewBB, BlockT *OldBB) {
+  RegionT *R = getRegionFor(OldBB);
+
+  setRegionFor(NewBB, R);
+
+  // Climb while the current region starts at OldBB and is not the top level.
+  while (R->getEntry() == OldBB && !R->isTopLevelRegion()) {
+    R->replaceEntry(NewBB);
+    R = R->getParent();
+  }
+
+  setRegionFor(OldBB, R);
+}
+
+// Computes the regions of F: first detects them with scanForRegions(), then
+// builds the region tree below TopLevelRegion from the dominator tree.
+template <class Tr>
+void RegionInfoBase<Tr>::calculate(FuncT &F) {
+  typedef typename std::add_pointer<FuncT>::type FuncPtrT;
+
+  // ShortCut is a map that stores, for every BB, the exit of the largest
+  // region starting with BB. These regions can be treated as single BBs.
+  // This improves performance on linear CFGs.
+  BBtoBBMap ShortCut;
+
+  scanForRegions(F, &ShortCut);
+  BlockT *BB = GraphTraits<FuncPtrT>::getEntryNode(&F);
+  buildRegionsTree(DT->getNode(BB), TopLevelRegion);
+}
+
+#undef DEBUG_TYPE
+
+} // end namespace llvm
+
+#endif

diff --git a/include/llvm/Analysis/RegionIterator.h b/include/llvm/Analysis/RegionIterator.h
index ab4d0e0..0daff58 100644
--- a/include/llvm/Analysis/RegionIterator.h
+++ b/include/llvm/Analysis/RegionIterator.h

@@ -30,13 +30,16 @@
 ///
 /// For a subregion RegionNode there is just one successor. The RegionNode
 /// representing the exit of the subregion.
-template<class NodeType>
+template<class NodeType, class BlockT, class RegionT>
 class RNSuccIterator : public std::iterator<std::forward_iterator_tag,
-                                           NodeType, ptrdiff_t>
-{
+                                           NodeType, ptrdiff_t> {
   typedef std::iterator<std::forward_iterator_tag, NodeType, ptrdiff_t> super;
+
+  typedef GraphTraits<BlockT*> BlockTraits;
+  typedef typename BlockTraits::ChildIteratorType SuccIterTy;
+
   // The iterator works in two modes, bb mode or region mode.
-  enum ItMode{
+  enum ItMode {
     // In BB mode it returns all successors of this BasicBlock as its
     // successors.
     ItBB,
@@ -47,10 +50,10 @@
   };
 
   // Use two bit to represent the mode iterator.
-  PointerIntPair<NodeType*, 2, enum ItMode> Node;
+  PointerIntPair<NodeType*, 2, ItMode> Node;
 
   // The block successor iterator.
-  succ_iterator BItor;
+  SuccIterTy BItor;
 
   // advanceRegionSucc - A region node has only one successor. It reaches end
   // once we advance it.
@@ -66,37 +69,36 @@
 
   // Get the immediate successor. This function may return a Basic Block
   // RegionNode or a subregion RegionNode.
-  RegionNode* getISucc(BasicBlock* BB) const {
-    RegionNode *succ;
+  NodeType* getISucc(BlockT* BB) const {
+    NodeType *succ;
     succ = getNode()->getParent()->getNode(BB);
     assert(succ && "BB not in Region or entered subregion!");
     return succ;
   }
 
   // getRegionSucc - Return the successor basic block of a SubRegion RegionNode.
-  inline BasicBlock* getRegionSucc() const {
+  inline BlockT* getRegionSucc() const {
     assert(Node.getInt() == ItRgBegin && "Cannot get the region successor!");
-    return getNode()->template getNodeAs<Region>()->getExit();
+    return getNode()->template getNodeAs<RegionT>()->getExit();
   }
 
   // isExit - Is this the exit BB of the Region?
-  inline bool isExit(BasicBlock* BB) const {
+  inline bool isExit(BlockT* BB) const {
     return getNode()->getParent()->getExit() == BB;
   }
 public:
-  typedef RNSuccIterator<NodeType> Self;
+  typedef RNSuccIterator<NodeType, BlockT, RegionT> Self;
 
   typedef typename super::pointer pointer;
 
   /// @brief Create begin iterator of a RegionNode.
   inline RNSuccIterator(NodeType* node)
     : Node(node, node->isSubRegion() ? ItRgBegin : ItBB),
-    BItor(succ_begin(node->getEntry())) {
-
+      BItor(BlockTraits::child_begin(node->getEntry())) {
 
     // Skip the exit block
     if (!isRegionMode())
-      while (succ_end(node->getEntry()) != BItor && isExit(*BItor))
+      while (BlockTraits::child_end(node->getEntry()) != BItor && isExit(*BItor))
         ++BItor;
 
     if (isRegionMode() && isExit(getRegionSucc()))
@@ -106,7 +108,7 @@
   /// @brief Create an end iterator.
   inline RNSuccIterator(NodeType* node, bool)
     : Node(node, node->isSubRegion() ? ItRgEnd : ItBB),
-    BItor(succ_end(node->getEntry())) {}
+      BItor(BlockTraits::child_end(node->getEntry())) {}
 
   inline bool operator==(const Self& x) const {
     assert(isRegionMode() == x.isRegionMode() && "Broken iterator!");
@@ -119,7 +121,7 @@
   inline bool operator!=(const Self& x) const { return !operator==(x); }
 
   inline pointer operator*() const {
-    BasicBlock* BB = isRegionMode() ? getRegionSucc() : *BItor;
+    BlockT *BB = isRegionMode() ? getRegionSucc() : *BItor;
     assert(!isExit(BB) && "Iterator out of range!");
     return getISucc(BB);
   }
@@ -132,7 +134,7 @@
       // Skip the exit.
       do
         ++BItor;
-      while (BItor != succ_end(getNode()->getEntry())
+      while (BItor != BlockTraits::child_end(getNode()->getEntry())
           && isExit(*BItor));
     }
     return *this;
@@ -162,36 +164,41 @@
 /// The Flat Region iterator will iterate over all BasicBlock RegionNodes that
 /// are contained in the Region and its subregions. This is close to a virtual
 /// control flow graph of the Region.
-template<class NodeType>
-class RNSuccIterator<FlatIt<NodeType> >
-  : public std::iterator<std::forward_iterator_tag, NodeType, ptrdiff_t>
-{
+template<class NodeType, class BlockT, class RegionT>
+class RNSuccIterator<FlatIt<NodeType>, BlockT, RegionT>
+  : public std::iterator<std::forward_iterator_tag, NodeType, ptrdiff_t> {
   typedef std::iterator<std::forward_iterator_tag, NodeType, ptrdiff_t> super;
+  typedef GraphTraits<BlockT*> BlockTraits;
+  typedef typename BlockTraits::ChildIteratorType SuccIterTy;
+
   NodeType* Node;
-  succ_iterator Itor;
+  SuccIterTy Itor;
 
 public:
-  typedef RNSuccIterator<FlatIt<NodeType> > Self;
+  typedef RNSuccIterator<FlatIt<NodeType>, BlockT, RegionT> Self;
   typedef typename super::pointer pointer;
 
   /// @brief Create the iterator from a RegionNode.
   ///
   /// Note that the incoming node must be a bb node, otherwise it will trigger
   /// an assertion when we try to get a BasicBlock.
-  inline RNSuccIterator(NodeType* node) : Node(node),
-    Itor(succ_begin(node->getEntry())) {
+  inline RNSuccIterator(NodeType* node) :
+    Node(node),
+    Itor(BlockTraits::child_begin(node->getEntry())) {
       assert(!Node->isSubRegion()
              && "Subregion node not allowed in flat iterating mode!");
       assert(Node->getParent() && "A BB node must have a parent!");
 
       // Skip the exit block of the iterating region.
-      while (succ_end(Node->getEntry()) != Itor
+      while (BlockTraits::child_end(Node->getEntry()) != Itor
           && Node->getParent()->getExit() == *Itor)
         ++Itor;
   }
+
   /// @brief Create an end iterator
-  inline RNSuccIterator(NodeType* node, bool) : Node(node),
-    Itor(succ_end(node->getEntry())) {
+  inline RNSuccIterator(NodeType* node, bool) :
+    Node(node),
+    Itor(BlockTraits::child_end(node->getEntry())) {
       assert(!Node->isSubRegion()
              && "Subregion node not allowed in flat iterating mode!");
   }
@@ -206,10 +213,10 @@
   inline bool operator!=(const Self& x) const { return !operator==(x); }
 
   inline pointer operator*() const {
-    BasicBlock* BB = *Itor;
+    BlockT *BB = *Itor;
 
     // Get the iterating region.
-    Region* Parent = Node->getParent();
+    RegionT *Parent = Node->getParent();
 
     // The only case that the successor reaches out of the region is it reaches
     // the exit of the region.
@@ -245,14 +252,14 @@
   }
 };
 
-template<class NodeType>
-inline RNSuccIterator<NodeType> succ_begin(NodeType* Node) {
-  return RNSuccIterator<NodeType>(Node);
+template<class NodeType, class BlockT, class RegionT>
+inline RNSuccIterator<NodeType, BlockT, RegionT> succ_begin(NodeType* Node) {
+  return RNSuccIterator<NodeType, BlockT, RegionT>(Node);
 }
 
-template<class NodeType>
-inline RNSuccIterator<NodeType> succ_end(NodeType* Node) {
-  return RNSuccIterator<NodeType>(Node, true);
+template<class NodeType, class BlockT, class RegionT>
+inline RNSuccIterator<NodeType, BlockT, RegionT> succ_end(NodeType* Node) {
+  return RNSuccIterator<NodeType, BlockT, RegionT>(Node, true);
 }
 
 //===--------------------------------------------------------------------===//
@@ -262,27 +269,27 @@
 // NodeT can either be region node or const region node, otherwise child_begin
 // and child_end fail.
 
-#define RegionNodeGraphTraits(NodeT) \
-  template<> struct GraphTraits<NodeT*> { \
+#define RegionNodeGraphTraits(NodeT, BlockT, RegionT)   \
+  template<> struct GraphTraits<NodeT*> {      \
   typedef NodeT NodeType; \
-  typedef RNSuccIterator<NodeType> ChildIteratorType; \
+  typedef RNSuccIterator<NodeType, BlockT, RegionT> ChildIteratorType;  \
   static NodeType *getEntryNode(NodeType* N) { return N; } \
   static inline ChildIteratorType child_begin(NodeType *N) { \
-    return RNSuccIterator<NodeType>(N); \
+    return RNSuccIterator<NodeType, BlockT, RegionT>(N);             \
   } \
   static inline ChildIteratorType child_end(NodeType *N) { \
-    return RNSuccIterator<NodeType>(N, true); \
+    return RNSuccIterator<NodeType, BlockT, RegionT>(N, true);     \
   } \
 }; \
-template<> struct GraphTraits<FlatIt<NodeT*> > { \
+template<> struct GraphTraits<FlatIt<NodeT*>> {  \
   typedef NodeT NodeType; \
-  typedef RNSuccIterator<FlatIt<NodeT> > ChildIteratorType; \
+  typedef RNSuccIterator<FlatIt<NodeT>, BlockT, RegionT > ChildIteratorType;    \
   static NodeType *getEntryNode(NodeType* N) { return N; } \
   static inline ChildIteratorType child_begin(NodeType *N) { \
-    return RNSuccIterator<FlatIt<NodeType> >(N); \
+    return RNSuccIterator<FlatIt<NodeType>, BlockT, RegionT>(N); \
   } \
   static inline ChildIteratorType child_end(NodeType *N) { \
-    return RNSuccIterator<FlatIt<NodeType> >(N, true); \
+    return RNSuccIterator<FlatIt<NodeType>, BlockT, RegionT>(N, true); \
   } \
 }
 
@@ -315,8 +322,8 @@
   } \
 }
 
-RegionNodeGraphTraits(RegionNode);
-RegionNodeGraphTraits(const RegionNode);
+RegionNodeGraphTraits(RegionNode, BasicBlock, Region);
+RegionNodeGraphTraits(const RegionNode, BasicBlock, Region);
 
 RegionGraphTraits(Region, RegionNode);
 RegionGraphTraits(const Region, const RegionNode);
@@ -337,6 +344,22 @@
   }
 };
 
+template <> struct GraphTraits<RegionInfoPass*>
+  : public GraphTraits<RegionInfo *> {
+  typedef df_iterator<NodeType*, SmallPtrSet<NodeType*, 8>, false,
+                      GraphTraits<FlatIt<NodeType*> > > nodes_iterator;
+
+  static NodeType *getEntryNode(RegionInfoPass *RI) {
+    return GraphTraits<RegionInfo*>::getEntryNode(&RI->getRegionInfo());
+  }
+  static nodes_iterator nodes_begin(RegionInfoPass* RI) {
+    return GraphTraits<RegionInfo*>::nodes_begin(&RI->getRegionInfo());
+  }
+  static nodes_iterator nodes_end(RegionInfoPass *RI) {
+    return GraphTraits<RegionInfo*>::nodes_end(&RI->getRegionInfo());
+  }
+};
+
 } // End namespace llvm
 
 #endif

diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 0570826..893402e 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h

@@ -35,6 +35,7 @@
 
 namespace llvm {
   class APInt;
+  class AssumptionTracker;
   class Constant;
   class ConstantInt;
   class DominatorTree;
@@ -128,9 +129,11 @@
     /// purposes.
     void print(raw_ostream &OS) const;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     /// dump - This method is used for debugging.
     ///
     void dump() const;
+#endif
   };
 
   // Specialize FoldingSetTrait for SCEV to avoid needing to compute
@@ -221,6 +224,9 @@
     ///
     Function *F;
 
+    /// The tracker for @llvm.assume intrinsics in this function.
+    AssumptionTracker *AT;
+
     /// LI - The loop information for the function we are currently analyzing.
     ///
     LoopInfo *LI;
@@ -257,24 +263,13 @@
     /// loop exit's branch condition evaluates to the not-taken path.  This is a
     /// temporary pair of exact and max expressions that are eventually
     /// summarized in ExitNotTakenInfo and BackedgeTakenInfo.
-    ///
-    /// If MustExit is true, then the exit must be taken when the BECount
-    /// reaches Exact (and before surpassing Max). If MustExit is false, then
-    /// BECount may exceed Exact or Max if the loop exits via another branch. In
-    /// either case, the loop may exit early via another branch.
-    ///
-    /// MustExit is true for most cases. However, an exit guarded by an
-    /// (in)equality on a nonunit stride may be skipped.
     struct ExitLimit {
       const SCEV *Exact;
       const SCEV *Max;
-      bool MustExit;
 
-      /*implicit*/ ExitLimit(const SCEV *E)
-        : Exact(E), Max(E), MustExit(true) {}
+      /*implicit*/ ExitLimit(const SCEV *E) : Exact(E), Max(E) {}
 
-      ExitLimit(const SCEV *E, const SCEV *M, bool MustExit)
-        : Exact(E), Max(M), MustExit(MustExit) {}
+      ExitLimit(const SCEV *E, const SCEV *M) : Exact(E), Max(M) {}
 
       /// hasAnyInfo - Test whether this ExitLimit contains any computed
       /// information, or whether it's all SCEVCouldNotCompute values.
@@ -749,6 +744,13 @@
     bool isLoopBackedgeGuardedByCond(const Loop *L, ICmpInst::Predicate Pred,
                                      const SCEV *LHS, const SCEV *RHS);
 
+    /// \brief Returns the maximum trip count of the loop if it is a single-exit
+    /// loop and we can compute a small maximum for that loop.
+    ///
+    /// Implemented in terms of the \c getSmallConstantTripCount overload with
+    /// the single exiting block passed to it. See that routine for details.
+    unsigned getSmallConstantTripCount(Loop *L);
+
     /// getSmallConstantTripCount - Returns the maximum trip count of this loop
     /// as a normal unsigned value. Returns 0 if the trip count is unknown or
     /// not constant. This "trip count" assumes that control exits via
@@ -758,6 +760,14 @@
     /// the loop exits prematurely via another branch.
     unsigned getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock);
 
+    /// \brief Returns the largest constant divisor of the trip count of the
+    /// loop if it is a single-exit loop and we can compute a small maximum for
+    /// that loop.
+    ///
+    /// Implemented in terms of the \c getSmallConstantTripMultiple overload with
+    /// the single exiting block passed to it. See that routine for details.
+    unsigned getSmallConstantTripMultiple(Loop *L);
+
     /// getSmallConstantTripMultiple - Returns the largest constant divisor of
     /// the trip count of this loop as a normal unsigned value, if
     /// possible. This means that the actual trip count is always a multiple of
@@ -795,7 +805,8 @@
 
     /// forgetLoop - This method should be called by the client when it has
     /// changed a loop in a way that may effect ScalarEvolution's ability to
-    /// compute a trip count, or if the loop is deleted.
+    /// compute a trip count, or if the loop is deleted.  This call is
+    /// potentially expensive for large loop bodies.
     void forgetLoop(const Loop *L);
 
     /// forgetValue - This method should be called by the client when it has

diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h
index 01b034f..94e665f 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h

@@ -309,17 +309,17 @@
                               getLoop(), FlagAnyWrap);
     }
 
-    /// isAffine - Return true if this is an affine AddRec (i.e., it represents
-    /// an expressions A+B*x where A and B are loop invariant values.
+    /// isAffine - Return true if this represents an expression
+    /// A + B*x where A and B are loop invariant values.
     bool isAffine() const {
       // We know that the start value is invariant.  This expression is thus
       // affine iff the step is also invariant.
       return getNumOperands() == 2;
     }
 
-    /// isQuadratic - Return true if this is an quadratic AddRec (i.e., it
-    /// represents an expressions A+B*x+C*x^2 where A, B and C are loop
-    /// invariant values.  This corresponds to an addrec of the form {L,+,M,+,N}
+    /// isQuadratic - Return true if this represents an expression
+    /// A + B*x + C*x^2 where A, B and C are loop invariant values.
+    /// This corresponds to an addrec of the form {L,+,M,+,N}
     bool isQuadratic() const {
       return getNumOperands() == 3;
     }
@@ -577,7 +577,7 @@
     SmallPtrSet<const SCEV *, 8> Visited;
 
     void push(const SCEV *S) {
-      if (Visited.insert(S) && Visitor.follow(S))
+      if (Visited.insert(S).second && Visitor.follow(S))
         Worklist.push_back(S);
     }
   public:
@@ -624,7 +624,7 @@
     }
   };
 
-  /// Use SCEVTraversal to visit all nodes in the givien expression tree.
+  /// Use SCEVTraversal to visit all nodes in the given expression tree.
   template<typename SV>
   void visitAll(const SCEV *Root, SV& Visitor) {
     SCEVTraversal<SV> T(Visitor);

diff --git a/include/llvm/Analysis/TargetFolder.h b/include/llvm/Analysis/TargetFolder.h
index 8a7fc7c..587a7ef 100644
--- a/include/llvm/Analysis/TargetFolder.h
+++ b/include/llvm/Analysis/TargetFolder.h

@@ -211,6 +211,13 @@
     return Fold(ConstantExpr::getTruncOrBitCast(C, DestTy));
   }
 
+  Constant *CreatePointerBitCastOrAddrSpaceCast(Constant *C,
+                                                Type *DestTy) const {
+    if (C->getType() == DestTy)
+      return C; // avoid calling Fold
+    return Fold(ConstantExpr::getPointerBitCastOrAddrSpaceCast(C, DestTy));
+  }
+
   //===--------------------------------------------------------------------===//
   // Compare Instructions
   //===--------------------------------------------------------------------===//

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index f57f3eb..9acaaa6 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h

@@ -28,6 +28,7 @@
 
 namespace llvm {
 
+class Function;
 class GlobalValue;
 class Loop;
 class Type;
@@ -183,7 +184,7 @@
   /// should probably move to simpler cost metrics using the above.
   /// Alternatively, we could split the cost interface into distinct code-size
   /// and execution-speed costs. This would allow modelling the core of this
-  /// query more accurately as the a call is a single small instruction, but
+  /// query more accurately as a call is a single small instruction, but
   /// incurs significant execution cost.
   virtual bool isLoweredToCall(const Function *F) const;
 
@@ -227,7 +228,8 @@
   /// \brief Get target-customized preferences for the generic loop unrolling
   /// transformation. The caller will initialize UP with the current
   /// target-independent defaults.
-  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
+  virtual void getUnrollingPreferences(const Function *F, Loop *L,
+                                       UnrollingPreferences &UP) const;
 
   /// @}
 
@@ -335,6 +337,9 @@
     OK_NonUniformConstantValue   // Operand is a non uniform constant value.
   };
 
+  /// \brief Additional properties of an operand's values.
+  enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
@@ -343,15 +348,18 @@
   /// \return The width of the largest scalar or vector register type.
   virtual unsigned getRegisterBitWidth(bool Vector) const;
 
-  /// \return The maximum unroll factor that the vectorizer should try to
+  /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
   /// and the number of execution units in the CPU.
-  virtual unsigned getMaximumUnrollFactor() const;
+  virtual unsigned getMaxInterleaveFactor() const;
 
   /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
-  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                  OperandValueKind Opd1Info = OK_AnyValue,
-                                  OperandValueKind Opd2Info = OK_AnyValue) const;
+  virtual unsigned
+  getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                         OperandValueKind Opd1Info = OK_AnyValue,
+                         OperandValueKind Opd2Info = OK_AnyValue,
+                         OperandValueProperties Opd1PropInfo = OP_None,
+                         OperandValueProperties Opd2PropInfo = OP_None) const;
 
   /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
   /// The index and subtype parameters are used by the subvector insertion and
@@ -416,6 +424,13 @@
   virtual unsigned getAddressComputationCost(Type *Ty,
                                              bool IsComplex = false) const;
 
+  /// \returns The cost, if any, of keeping values of the given types alive
+  /// over a callsite.
+  ///
+  /// Some types may require the use of register classes that do not have
+  /// any callee-saved registers, so would require a spill and fill.
+  virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const;
+
   /// @}
 
   /// Analysis group identification.

diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index 83b5408..6bbf4f4 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h

@@ -25,6 +25,8 @@
   class DataLayout;
   class StringRef;
   class MDNode;
+  class AssumptionTracker;
+  class DominatorTree;
   class TargetLibraryInfo;
 
   /// Determine which bits of V are known to be either zero or one and return
@@ -36,7 +38,10 @@
   /// same width as the vector element, and the bit is set only if it is true
   /// for all of the elements in the vector.
   void computeKnownBits(Value *V,  APInt &KnownZero, APInt &KnownOne,
-                        const DataLayout *TD = nullptr, unsigned Depth = 0);
+                        const DataLayout *TD = nullptr, unsigned Depth = 0,
+                        AssumptionTracker *AT = nullptr,
+                        const Instruction *CxtI = nullptr,
+                        const DominatorTree *DT = nullptr);
   /// Compute known bits from the range metadata.
   /// \p KnownZero the set of bits that are known to be zero
   void computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
@@ -45,21 +50,29 @@
   /// ComputeSignBit - Determine whether the sign bit is known to be zero or
   /// one.  Convenience wrapper around computeKnownBits.
   void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
-                      const DataLayout *TD = nullptr, unsigned Depth = 0);
+                      const DataLayout *TD = nullptr, unsigned Depth = 0,
+                      AssumptionTracker *AT = nullptr,
+                      const Instruction *CxtI = nullptr,
+                      const DominatorTree *DT = nullptr);
 
   /// isKnownToBeAPowerOfTwo - Return true if the given value is known to have
   /// exactly one bit set when defined. For vectors return true if every
   /// element is known to be a power of two when defined.  Supports values with
   /// integer or pointer type and vectors of integers.  If 'OrZero' is set then
   /// returns true if the given value is either a power of two or zero.
-  bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero = false, unsigned Depth = 0);
+  bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero = false, unsigned Depth = 0,
+                              AssumptionTracker *AT = nullptr,
+                              const Instruction *CxtI = nullptr,
+                              const DominatorTree *DT = nullptr);
 
   /// isKnownNonZero - Return true if the given value is known to be non-zero
   /// when defined.  For vectors return true if every element is known to be
   /// non-zero when defined.  Supports values with integer or pointer type and
   /// vectors of integers.
   bool isKnownNonZero(Value *V, const DataLayout *TD = nullptr,
-                      unsigned Depth = 0);
+                      unsigned Depth = 0, AssumptionTracker *AT = nullptr,
+                      const Instruction *CxtI = nullptr,
+                      const DominatorTree *DT = nullptr);
 
   /// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero.  We use
   /// this predicate to simplify operations downstream.  Mask is known to be
@@ -71,7 +84,10 @@
   /// same width as the vector element, and the bit is set only if it is true
   /// for all of the elements in the vector.
   bool MaskedValueIsZero(Value *V, const APInt &Mask, 
-                         const DataLayout *TD = nullptr, unsigned Depth = 0);
+                         const DataLayout *TD = nullptr, unsigned Depth = 0,
+                         AssumptionTracker *AT = nullptr,
+                         const Instruction *CxtI = nullptr,
+                         const DominatorTree *DT = nullptr);
 
   
   /// ComputeNumSignBits - Return the number of times the sign bit of the
@@ -83,7 +99,10 @@
   /// 'Op' must have a scalar integer type.
   ///
   unsigned ComputeNumSignBits(Value *Op, const DataLayout *TD = nullptr,
-                              unsigned Depth = 0);
+                              unsigned Depth = 0,
+                              AssumptionTracker *AT = nullptr,
+                              const Instruction *CxtI = nullptr,
+                              const DominatorTree *DT = nullptr);
 
   /// ComputeMultiple - This function computes the integer multiple of Base that
   /// equals V.  If successful, it returns true and returns the multiple in
@@ -191,6 +210,13 @@
   /// and byval arguments.
   bool isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI = nullptr);
 
+  /// Return true if it is valid to use the assumptions provided by an
+  /// assume intrinsic, I, at the point in the control-flow identified by the
+  /// context instruction, CxtI.
+  bool isValidAssumeForContext(const Instruction *I, const Instruction *CxtI,
+                               const DataLayout *DL = nullptr,
+                               const DominatorTree *DT = nullptr);
+
 } // end namespace llvm
 
 #endif

diff --git a/include/llvm/AsmParser/Parser.h b/include/llvm/AsmParser/Parser.h
index 165c46d..7ef78d7 100644
--- a/include/llvm/AsmParser/Parser.h
+++ b/include/llvm/AsmParser/Parser.h

@@ -14,12 +14,11 @@
 #ifndef LLVM_ASMPARSER_PARSER_H
 #define LLVM_ASMPARSER_PARSER_H
 
-#include <string>
+#include "llvm/Support/MemoryBuffer.h"
 
 namespace llvm {
 
 class Module;
-class MemoryBuffer;
 class SMDiagnostic;
 class LLVMContext;
 
@@ -29,11 +28,12 @@
 /// that this does not verify that the generated Module is valid, so you should
 /// run the verifier after parsing the file to check that it is okay.
 /// @brief Parse LLVM Assembly from a file
-Module *ParseAssemblyFile(
-  const std::string &Filename, ///< The name of the file to parse
-  SMDiagnostic &Error,         ///< Error result info.
-  LLVMContext &Context         ///< Context in which to allocate globals info.
-);
+/// @param Filename The name of the file to parse
+/// @param Error Error result info.
+/// @param Context Context in which to allocate globals info.
+std::unique_ptr<Module> parseAssemblyFile(StringRef Filename,
+                                          SMDiagnostic &Error,
+                                          LLVMContext &Context);
 
 /// The function is a secondary interface to the LLVM Assembly Parser. It parses
 /// an ASCII string that (presumably) contains LLVM Assembly code. It returns a
@@ -41,23 +41,31 @@
 /// that this does not verify that the generated Module is valid, so you should
 /// run the verifier after parsing the file to check that it is okay.
 /// @brief Parse LLVM Assembly from a string
-Module *ParseAssemblyString(
-  const char *AsmString, ///< The string containing assembly
-  Module *M,             ///< A module to add the assembly too.
-  SMDiagnostic &Error,   ///< Error result info.
-  LLVMContext &Context
-);
+/// @param AsmString The string containing assembly
+/// @param Error Error result info.
+/// @param Context Context in which to allocate globals info.
+std::unique_ptr<Module> parseAssemblyString(StringRef AsmString,
+                                            SMDiagnostic &Error,
+                                            LLVMContext &Context);
+
+/// parseAssemblyFile and parseAssemblyString are wrappers around this function.
+/// @brief Parse LLVM Assembly from a MemoryBuffer.
+/// @param F The MemoryBuffer containing assembly
+/// @param Err Error result info.
+/// @param Context Context in which to allocate globals info.
+std::unique_ptr<Module> parseAssembly(MemoryBufferRef F, SMDiagnostic &Err,
+                                      LLVMContext &Context);
 
 /// This function is the low-level interface to the LLVM Assembly Parser.
-/// ParseAssemblyFile and ParseAssemblyString are wrappers around this function.
-/// @brief Parse LLVM Assembly from a MemoryBuffer. This function *always*
-/// takes ownership of the MemoryBuffer.
-Module *ParseAssembly(
-    MemoryBuffer *F,     ///< The MemoryBuffer containing assembly
-    Module *M,           ///< A module to add the assembly too.
-    SMDiagnostic &Err,   ///< Error result info.
-    LLVMContext &Context
-);
+/// This is kept as an independent function instead of being inlined into
+/// parseAssembly for the convenience of interactive users that want to add
+/// recently parsed bits to an existing module.
+///
+/// @param F The MemoryBuffer containing assembly
+/// @param M The module to add data to.
+/// @param Err Error result info.
+/// @return true on error.
+bool parseAssemblyInto(MemoryBufferRef F, Module &M, SMDiagnostic &Err);
 
 } // End llvm namespace
 

diff --git a/include/llvm/Bitcode/BitCodes.h b/include/llvm/Bitcode/BitCodes.h
index b510daf..ed2dcf8 100644
--- a/include/llvm/Bitcode/BitCodes.h
+++ b/include/llvm/Bitcode/BitCodes.h

@@ -18,6 +18,7 @@
 #ifndef LLVM_BITCODE_BITCODES_H
 #define LLVM_BITCODE_BITCODES_H
 
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -161,16 +162,13 @@
 /// BitCodeAbbrev - This class represents an abbreviation record.  An
 /// abbreviation allows a complex record that has redundancy to be stored in a
 /// specialized format instead of the fully-general, fully-vbr, format.
-class BitCodeAbbrev {
+class BitCodeAbbrev : public RefCountedBase<BitCodeAbbrev> {
   SmallVector<BitCodeAbbrevOp, 32> OperandList;
-  unsigned char RefCount; // Number of things using this.
   ~BitCodeAbbrev() {}
+  // Only RefCountedBase is allowed to delete.
+  friend class RefCountedBase<BitCodeAbbrev>;
+
 public:
-  BitCodeAbbrev() : RefCount(1) {}
-
-  void addRef() { ++RefCount; }
-  void dropRef() { if (--RefCount == 0) delete this; }
-
   unsigned getNumOperandInfos() const {
     return static_cast<unsigned>(OperandList.size());
   }

diff --git a/include/llvm/Bitcode/BitcodeWriterPass.h b/include/llvm/Bitcode/BitcodeWriterPass.h
index 898cd52..eb85548 100644
--- a/include/llvm/Bitcode/BitcodeWriterPass.h
+++ b/include/llvm/Bitcode/BitcodeWriterPass.h

@@ -12,8 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_BITCODE_BITCODE_WRITER_PASS_H
-#define LLVM_BITCODE_BITCODE_WRITER_PASS_H
+#ifndef LLVM_BITCODE_BITCODEWRITERPASS_H
+#define LLVM_BITCODE_BITCODEWRITERPASS_H
 
 #include "llvm/ADT/StringRef.h"
 

diff --git a/include/llvm/Bitcode/BitstreamReader.h b/include/llvm/Bitcode/BitstreamReader.h
index 6f478b7..ecf8235 100644
--- a/include/llvm/Bitcode/BitstreamReader.h
+++ b/include/llvm/Bitcode/BitstreamReader.h

@@ -17,39 +17,37 @@
 
 #include "llvm/Bitcode/BitCodes.h"
 #include "llvm/Support/Endian.h"
-#include "llvm/Support/StreamableMemoryObject.h"
+#include "llvm/Support/StreamingMemoryObject.h"
 #include <climits>
 #include <string>
 #include <vector>
 
 namespace llvm {
 
-  class Deserializer;
+class Deserializer;
 
-/// BitstreamReader - This class is used to read from an LLVM bitcode stream,
-/// maintaining information that is global to decoding the entire file.  While
-/// a file is being read, multiple cursors can be independently advanced or
-/// skipped around within the file.  These are represented by the
-/// BitstreamCursor class.
+/// This class is used to read from an LLVM bitcode stream, maintaining
+/// information that is global to decoding the entire file. While a file is
+/// being read, multiple cursors can be independently advanced or skipped around
+/// within the file.  These are represented by the BitstreamCursor class.
 class BitstreamReader {
 public:
-  /// BlockInfo - This contains information emitted to BLOCKINFO_BLOCK blocks.
-  /// These describe abbreviations that all blocks of the specified ID inherit.
+  /// This contains information emitted to BLOCKINFO_BLOCK blocks. These
+  /// describe abbreviations that all blocks of the specified ID inherit.
   struct BlockInfo {
     unsigned BlockID;
-    std::vector<BitCodeAbbrev*> Abbrevs;
+    std::vector<IntrusiveRefCntPtr<BitCodeAbbrev>> Abbrevs;
     std::string Name;
 
     std::vector<std::pair<unsigned, std::string> > RecordNames;
   };
 private:
-  std::unique_ptr<StreamableMemoryObject> BitcodeBytes;
+  std::unique_ptr<MemoryObject> BitcodeBytes;
 
   std::vector<BlockInfo> BlockInfoRecords;
 
-  /// IgnoreBlockInfoNames - This is set to true if we don't care about the
-  /// block/record name information in the BlockInfo block. Only llvm-bcanalyzer
-  /// uses this.
+  /// This is set to true if we don't care about the block/record name
+  /// information in the BlockInfo block. Only llvm-bcanalyzer uses this.
   bool IgnoreBlockInfoNames;
 
   BitstreamReader(const BitstreamReader&) LLVM_DELETED_FUNCTION;
@@ -58,36 +56,35 @@
   BitstreamReader() : IgnoreBlockInfoNames(true) {
   }
 
-  BitstreamReader(const unsigned char *Start, const unsigned char *End) {
-    IgnoreBlockInfoNames = true;
+  BitstreamReader(const unsigned char *Start, const unsigned char *End)
+      : IgnoreBlockInfoNames(true) {
     init(Start, End);
   }
 
-  BitstreamReader(StreamableMemoryObject *bytes) {
+  BitstreamReader(MemoryObject *bytes) : IgnoreBlockInfoNames(true) {
     BitcodeBytes.reset(bytes);
   }
 
+  BitstreamReader(BitstreamReader &&Other) {
+    *this = std::move(Other);
+  }
+
+  BitstreamReader &operator=(BitstreamReader &&Other) {
+    BitcodeBytes = std::move(Other.BitcodeBytes);
+    // Explicitly swap block info, so that nothing gets destroyed twice.
+    std::swap(BlockInfoRecords, Other.BlockInfoRecords);
+    IgnoreBlockInfoNames = Other.IgnoreBlockInfoNames;
+    return *this;
+  }
+
   void init(const unsigned char *Start, const unsigned char *End) {
     assert(((End-Start) & 3) == 0 &&"Bitcode stream not a multiple of 4 bytes");
     BitcodeBytes.reset(getNonStreamedMemoryObject(Start, End));
   }
 
-  StreamableMemoryObject &getBitcodeBytes() { return *BitcodeBytes; }
+  MemoryObject &getBitcodeBytes() { return *BitcodeBytes; }
 
-  ~BitstreamReader() {
-    // Free the BlockInfoRecords.
-    while (!BlockInfoRecords.empty()) {
-      BlockInfo &Info = BlockInfoRecords.back();
-      // Free blockinfo abbrev info.
-      for (unsigned i = 0, e = static_cast<unsigned>(Info.Abbrevs.size());
-           i != e; ++i)
-        Info.Abbrevs[i]->dropRef();
-      BlockInfoRecords.pop_back();
-    }
-  }
-
-  /// CollectBlockInfoNames - This is called by clients that want block/record
-  /// name information.
+  /// This is called by clients that want block/record name information.
   void CollectBlockInfoNames() { IgnoreBlockInfoNames = false; }
   bool isIgnoringBlockInfoNames() { return IgnoreBlockInfoNames; }
 
@@ -95,13 +92,13 @@
   // Block Manipulation
   //===--------------------------------------------------------------------===//
 
-  /// hasBlockInfoRecords - Return true if we've already read and processed the
-  /// block info block for this Bitstream.  We only process it for the first
-  /// cursor that walks over it.
+  /// Return true if we've already read and processed the block info block for
+  /// this Bitstream. We only process it for the first cursor that walks over
+  /// it.
   bool hasBlockInfoRecords() const { return !BlockInfoRecords.empty(); }
 
-  /// getBlockInfo - If there is block info for the specified ID, return it,
-  /// otherwise return null.
+  /// If there is block info for the specified ID, return it, otherwise return
+  /// null.
   const BlockInfo *getBlockInfo(unsigned BlockID) const {
     // Common case, the most recent entry matches BlockID.
     if (!BlockInfoRecords.empty() && BlockInfoRecords.back().BlockID == BlockID)
@@ -123,23 +120,26 @@
     BlockInfoRecords.back().BlockID = BlockID;
     return BlockInfoRecords.back();
   }
+
+  /// Takes block info from the other bitstream reader.
+  ///
+  /// This is a "take" operation because BlockInfo records are non-trivial, and
+  /// indeed rather expensive to copy.
+  void takeBlockInfo(BitstreamReader &&Other) {
+    assert(!hasBlockInfoRecords());
+    BlockInfoRecords = std::move(Other.BlockInfoRecords);
+  }
 };
 
-
-/// BitstreamEntry - When advancing through a bitstream cursor, each advance can
-/// discover a few different kinds of entries:
-///   Error    - Malformed bitcode was found.
-///   EndBlock - We've reached the end of the current block, (or the end of the
-///              file, which is treated like a series of EndBlock records.
-///   SubBlock - This is the start of a new subblock of a specific ID.
-///   Record   - This is a record with a specific AbbrevID.
-///
+/// When advancing through a bitstream cursor, each advance can discover a few
+/// different kinds of entries:
 struct BitstreamEntry {
   enum {
-    Error,
-    EndBlock,
-    SubBlock,
-    Record
+    Error,    // Malformed bitcode was found.
+    EndBlock, // We've reached the end of the current block (or the end of the
+              // file, which is treated like a series of EndBlock records).
+    SubBlock, // This is the start of a new subblock of a specific ID.
+    Record    // This is a record with a specific AbbrevID.
   } Kind;
 
   unsigned ID;
@@ -158,9 +158,9 @@
   }
 };
 
-/// BitstreamCursor - This represents a position within a bitcode file.  There
-/// may be multiple independent cursors reading within one bitstream, each
-/// maintaining their own local state.
+/// This represents a position within a bitcode file. There may be multiple
+/// independent cursors reading within one bitstream, each maintaining their own
+/// local state.
 ///
 /// Unlike iterators, BitstreamCursors are heavy-weight objects that should not
 /// be passed by value.
@@ -169,92 +169,74 @@
   BitstreamReader *BitStream;
   size_t NextChar;
 
+  // The size of the bitcode. 0 if we don't know it yet.
+  size_t Size;
 
-  /// CurWord/word_t - This is the current data we have pulled from the stream
-  /// but have not returned to the client.  This is specifically and
-  /// intentionally defined to follow the word size of the host machine for
-  /// efficiency.  We use word_t in places that are aware of this to make it
-  /// perfectly explicit what is going on.
-  typedef uint32_t word_t;
+  /// This is the current data we have pulled from the stream but have not
+  /// returned to the client. This is specifically and intentionally defined to
+  /// follow the word size of the host machine for efficiency. We use word_t in
+  /// places that are aware of this to make it perfectly explicit what is going
+  /// on.
+  typedef size_t word_t;
   word_t CurWord;
 
-  /// BitsInCurWord - This is the number of bits in CurWord that are valid. This
-  /// is always from [0...31/63] inclusive (depending on word size).
+  /// This is the number of bits in CurWord that are valid. This is always from
+  /// [0...bits_of(size_t)-1] inclusive.
   unsigned BitsInCurWord;
 
-  // CurCodeSize - This is the declared size of code values used for the current
-  // block, in bits.
+  // This is the declared size of code values used for the current block, in
+  // bits.
   unsigned CurCodeSize;
 
-  /// CurAbbrevs - Abbrevs installed at in this block.
-  std::vector<BitCodeAbbrev*> CurAbbrevs;
+  /// Abbrevs installed in this block.
+  std::vector<IntrusiveRefCntPtr<BitCodeAbbrev>> CurAbbrevs;
 
   struct Block {
     unsigned PrevCodeSize;
-    std::vector<BitCodeAbbrev*> PrevAbbrevs;
+    std::vector<IntrusiveRefCntPtr<BitCodeAbbrev>> PrevAbbrevs;
     explicit Block(unsigned PCS) : PrevCodeSize(PCS) {}
   };
 
-  /// BlockScope - This tracks the codesize of parent blocks.
+  /// This tracks the codesize of parent blocks.
   SmallVector<Block, 8> BlockScope;
 
 
 public:
-  BitstreamCursor() : BitStream(nullptr), NextChar(0) {}
-  BitstreamCursor(const BitstreamCursor &RHS)
-      : BitStream(nullptr), NextChar(0) {
-    operator=(RHS);
-  }
+  BitstreamCursor() { init(nullptr); }
 
-  explicit BitstreamCursor(BitstreamReader &R) : BitStream(&R) {
+  explicit BitstreamCursor(BitstreamReader &R) { init(&R); }
+
+  void init(BitstreamReader *R) {
+    freeState();
+
+    BitStream = R;
     NextChar = 0;
-    CurWord = 0;
+    Size = 0;
     BitsInCurWord = 0;
     CurCodeSize = 2;
   }
 
-  void init(BitstreamReader &R) {
-    freeState();
-
-    BitStream = &R;
-    NextChar = 0;
-    CurWord = 0;
-    BitsInCurWord = 0;
-    CurCodeSize = 2;
-  }
-
-  ~BitstreamCursor() {
-    freeState();
-  }
-
-  void operator=(const BitstreamCursor &RHS);
-
   void freeState();
 
-  bool isEndPos(size_t pos) {
-    return BitStream->getBitcodeBytes().isObjectEnd(static_cast<uint64_t>(pos));
-  }
-
   bool canSkipToPos(size_t pos) const {
     // pos can be skipped to if it is a valid address or one byte past the end.
     return pos == 0 || BitStream->getBitcodeBytes().isValidAddress(
         static_cast<uint64_t>(pos - 1));
   }
 
-  uint32_t getWord(size_t pos) {
-    uint8_t buf[4] = { 0xFF, 0xFF, 0xFF, 0xFF };
-    BitStream->getBitcodeBytes().readBytes(pos, sizeof(buf), buf);
-    return *reinterpret_cast<support::ulittle32_t *>(buf);
-  }
-
   bool AtEndOfStream() {
-    return BitsInCurWord == 0 && isEndPos(NextChar);
+    if (BitsInCurWord != 0)
+      return false;
+    if (Size != 0)
+      return Size == NextChar;
+    fillCurWord();
+    return BitsInCurWord == 0;
   }
 
-  /// getAbbrevIDWidth - Return the number of bits used to encode an abbrev #.
+  /// Return the number of bits used to encode an abbrev #.
   unsigned getAbbrevIDWidth() const { return CurCodeSize; }
 
-  /// GetCurrentBitNo - Return the bit # of the bit we are reading.
+  /// Return the bit # of the bit we are reading.
   uint64_t GetCurrentBitNo() const {
     return NextChar*CHAR_BIT - BitsInCurWord;
   }
@@ -268,19 +250,17 @@
 
   /// Flags that modify the behavior of advance().
   enum {
-    /// AF_DontPopBlockAtEnd - If this flag is used, the advance() method does
-    /// not automatically pop the block scope when the end of a block is
-    /// reached.
+    /// If this flag is used, the advance() method does not automatically pop
+    /// the block scope when the end of a block is reached.
     AF_DontPopBlockAtEnd = 1,
 
-    /// AF_DontAutoprocessAbbrevs - If this flag is used, abbrev entries are
-    /// returned just like normal records.
+    /// If this flag is used, abbrev entries are returned just like normal
+    /// records.
     AF_DontAutoprocessAbbrevs = 2
   };
 
-  /// advance - Advance the current bitstream, returning the next entry in the
-  /// stream.
-  BitstreamEntry advance(unsigned Flags = 0) {
+      /// Advance the current bitstream, returning the next entry in the stream.
+      BitstreamEntry advance(unsigned Flags = 0) {
     while (1) {
       unsigned Code = ReadCode();
       if (Code == bitc::END_BLOCK) {
@@ -305,8 +285,8 @@
     }
   }
 
-  /// advanceSkippingSubblocks - This is a convenience function for clients that
-  /// don't expect any subblocks.  This just skips over them automatically.
+  /// This is a convenience function for clients that don't expect any
+  /// subblocks. This just skips over them automatically.
   BitstreamEntry advanceSkippingSubblocks(unsigned Flags = 0) {
     while (1) {
       // If we found a normal entry, return it.
@@ -320,7 +300,7 @@
     }
   }
 
-  /// JumpToBit - Reset the stream to the specified bit number.
+  /// Reset the stream to the specified bit number.
   void JumpToBit(uint64_t BitNo) {
     uintptr_t ByteNo = uintptr_t(BitNo/8) & ~(sizeof(word_t)-1);
     unsigned WordBitNo = unsigned(BitNo & (sizeof(word_t)*8-1));
@@ -329,75 +309,72 @@
     // Move the cursor to the right word.
     NextChar = ByteNo;
     BitsInCurWord = 0;
-    CurWord = 0;
 
     // Skip over any bits that are already consumed.
-    if (WordBitNo) {
-      if (sizeof(word_t) > 4)
-        Read64(WordBitNo);
-      else
-        Read(WordBitNo);
-    }
+    if (WordBitNo)
+      Read(WordBitNo);
   }
 
-
-  uint32_t Read(unsigned NumBits) {
-    assert(NumBits && NumBits <= 32 &&
-           "Cannot return zero or more than 32 bits!");
-
-    // If the field is fully contained by CurWord, return it quickly.
-    if (BitsInCurWord >= NumBits) {
-      uint32_t R = uint32_t(CurWord) & (~0U >> (32-NumBits));
-      CurWord >>= NumBits;
-      BitsInCurWord -= NumBits;
-      return R;
-    }
-
-    // If we run out of data, stop at the end of the stream.
-    if (isEndPos(NextChar)) {
-      CurWord = 0;
-      BitsInCurWord = 0;
-      return 0;
-    }
-
-    uint32_t R = uint32_t(CurWord);
+  void fillCurWord() {
+    assert(Size == 0 || NextChar < (unsigned)Size);
 
     // Read the next word from the stream.
     uint8_t Array[sizeof(word_t)] = {0};
 
-    BitStream->getBitcodeBytes().readBytes(NextChar, sizeof(Array), Array);
+    uint64_t BytesRead =
+        BitStream->getBitcodeBytes().readBytes(Array, sizeof(Array), NextChar);
 
-    // Handle big-endian byte-swapping if necessary.
-    support::detail::packed_endian_specific_integral
-      <word_t, support::little, support::unaligned> EndianValue;
-    memcpy(&EndianValue, Array, sizeof(Array));
+    // If we run out of data, stop at the end of the stream.
+    if (BytesRead == 0) {
+      Size = NextChar;
+      return;
+    }
 
-    CurWord = EndianValue;
-
-    NextChar += sizeof(word_t);
-
-    // Extract NumBits-BitsInCurWord from what we just read.
-    unsigned BitsLeft = NumBits-BitsInCurWord;
-
-    // Be careful here, BitsLeft is in the range [1..32]/[1..64] inclusive.
-    R |= uint32_t((CurWord & (word_t(~0ULL) >> (sizeof(word_t)*8-BitsLeft)))
-                    << BitsInCurWord);
-
-    // BitsLeft bits have just been used up from CurWord.  BitsLeft is in the
-    // range [1..32]/[1..64] so be careful how we shift.
-    if (BitsLeft != sizeof(word_t)*8)
-      CurWord >>= BitsLeft;
-    else
-      CurWord = 0;
-    BitsInCurWord = sizeof(word_t)*8-BitsLeft;
-    return R;
+    CurWord =
+        support::endian::read<word_t, support::little, support::unaligned>(
+            Array);
+    NextChar += BytesRead;
+    BitsInCurWord = BytesRead * 8;
   }
 
-  uint64_t Read64(unsigned NumBits) {
-    if (NumBits <= 32) return Read(NumBits);
+  word_t Read(unsigned NumBits) {
+    static const unsigned BitsInWord = sizeof(word_t) * 8;
 
-    uint64_t V = Read(32);
-    return V | (uint64_t)Read(NumBits-32) << 32;
+    assert(NumBits && NumBits <= BitsInWord &&
+           "Cannot return zero or more than BitsInWord bits!");
+
+    static const unsigned Mask = sizeof(word_t) > 4 ? 0x3f : 0x1f;
+
+    // If the field is fully contained by CurWord, return it quickly.
+    if (BitsInCurWord >= NumBits) {
+      word_t R = CurWord & (~word_t(0) >> (BitsInWord - NumBits));
+
+      // Use a mask to avoid undefined behavior.
+      CurWord >>= (NumBits & Mask);
+
+      BitsInCurWord -= NumBits;
+      return R;
+    }
+
+    word_t R = BitsInCurWord ? CurWord : 0;
+    unsigned BitsLeft = NumBits - BitsInCurWord;
+
+    fillCurWord();
+
+    // If we run out of data, stop at the end of the stream.
+    if (BitsLeft > BitsInCurWord)
+      return 0;
+
+    word_t R2 = CurWord & (~word_t(0) >> (BitsInWord - BitsLeft));
+
+    // Use a mask to avoid undefined behavior.
+    CurWord >>= (BitsLeft & Mask);
+
+    BitsInCurWord -= BitsLeft;
+
+    R |= R2 << (NumBits - BitsLeft);
+
+    return R;
   }
 
   uint32_t ReadVBR(unsigned NumBits) {
@@ -418,8 +395,8 @@
     }
   }
 
-  // ReadVBR64 - Read a VBR that may have a value up to 64-bits in size.  The
-  // chunk size of the VBR must still be <= 32 bits though.
+  // Read a VBR that may have a value up to 64-bits in size. The chunk size of
+  // the VBR must still be <= 32 bits though.
   uint64_t ReadVBR64(unsigned NumBits) {
     uint32_t Piece = Read(NumBits);
     if ((Piece & (1U << (NumBits-1))) == 0)
@@ -450,7 +427,6 @@
     }
 
     BitsInCurWord = 0;
-    CurWord = 0;
   }
 public:
 
@@ -462,15 +438,13 @@
   // Block header:
   //    [ENTER_SUBBLOCK, blockid, newcodelen, <align4bytes>, blocklen]
 
-  /// ReadSubBlockID - Having read the ENTER_SUBBLOCK code, read the BlockID for
-  /// the block.
+  /// Having read the ENTER_SUBBLOCK code, read the BlockID for the block.
   unsigned ReadSubBlockID() {
     return ReadVBR(bitc::BlockIDWidth);
   }
 
-  /// SkipBlock - Having read the ENTER_SUBBLOCK abbrevid and a BlockID, skip
-  /// over the body of this block.  If the block record is malformed, return
-  /// true.
+  /// Having read the ENTER_SUBBLOCK abbrevid and a BlockID, skip over the body
+  /// of this block. If the block record is malformed, return true.
   bool SkipBlock() {
     // Read and ignore the codelen value.  Since we are skipping this block, we
     // don't care what code widths are used inside of it.
@@ -488,8 +462,8 @@
     return false;
   }
 
-  /// EnterSubBlock - Having read the ENTER_SUBBLOCK abbrevid, enter
-  /// the block, and return true if the block has an error.
+  /// Having read the ENTER_SUBBLOCK abbrevid, enter the block, and return true
+  /// if the block has an error.
   bool EnterSubBlock(unsigned BlockID, unsigned *NumWordsP = nullptr);
 
   bool ReadBlockEnd() {
@@ -508,12 +482,7 @@
   void popBlockScope() {
     CurCodeSize = BlockScope.back().PrevCodeSize;
 
-    // Delete abbrevs from popped scope.
-    for (unsigned i = 0, e = static_cast<unsigned>(CurAbbrevs.size());
-         i != e; ++i)
-      CurAbbrevs[i]->dropRef();
-
-    BlockScope.back().PrevAbbrevs.swap(CurAbbrevs);
+    CurAbbrevs = std::move(BlockScope.back().PrevAbbrevs);
     BlockScope.pop_back();
   }
 
@@ -521,23 +490,16 @@
   // Record Processing
   //===--------------------------------------------------------------------===//
 
-private:
-  void readAbbreviatedLiteral(const BitCodeAbbrevOp &Op,
-                              SmallVectorImpl<uint64_t> &Vals);
-  void readAbbreviatedField(const BitCodeAbbrevOp &Op,
-                            SmallVectorImpl<uint64_t> &Vals);
-  void skipAbbreviatedField(const BitCodeAbbrevOp &Op);
-
 public:
 
-  /// getAbbrev - Return the abbreviation for the specified AbbrevId.
+  /// Return the abbreviation for the specified AbbrevId.
   const BitCodeAbbrev *getAbbrev(unsigned AbbrevID) {
     unsigned AbbrevNo = AbbrevID-bitc::FIRST_APPLICATION_ABBREV;
     assert(AbbrevNo < CurAbbrevs.size() && "Invalid abbrev #!");
-    return CurAbbrevs[AbbrevNo];
+    return CurAbbrevs[AbbrevNo].get();
   }
 
-  /// skipRecord - Read the current record and discard it.
+  /// Read the current record and discard it.
   void skipRecord(unsigned AbbrevID);
 
   unsigned readRecord(unsigned AbbrevID, SmallVectorImpl<uint64_t> &Vals,

diff --git a/include/llvm/Bitcode/BitstreamWriter.h b/include/llvm/Bitcode/BitstreamWriter.h
index dcfebd9..9e2c2fa 100644
--- a/include/llvm/Bitcode/BitstreamWriter.h
+++ b/include/llvm/Bitcode/BitstreamWriter.h

@@ -40,12 +40,12 @@
   unsigned BlockInfoCurBID;
 
   /// CurAbbrevs - Abbrevs installed at in this block.
-  std::vector<BitCodeAbbrev*> CurAbbrevs;
+  std::vector<IntrusiveRefCntPtr<BitCodeAbbrev>> CurAbbrevs;
 
   struct Block {
     unsigned PrevCodeSize;
     unsigned StartSizeWord;
-    std::vector<BitCodeAbbrev*> PrevAbbrevs;
+    std::vector<IntrusiveRefCntPtr<BitCodeAbbrev>> PrevAbbrevs;
     Block(unsigned PCS, unsigned SSW) : PrevCodeSize(PCS), StartSizeWord(SSW) {}
   };
 
@@ -56,7 +56,7 @@
   /// These describe abbreviations that all blocks of the specified ID inherit.
   struct BlockInfo {
     unsigned BlockID;
-    std::vector<BitCodeAbbrev*> Abbrevs;
+    std::vector<IntrusiveRefCntPtr<BitCodeAbbrev>> Abbrevs;
   };
   std::vector<BlockInfo> BlockInfoRecords;
 
@@ -99,16 +99,6 @@
   ~BitstreamWriter() {
     assert(CurBit == 0 && "Unflushed data remaining");
     assert(BlockScope.empty() && CurAbbrevs.empty() && "Block imbalance");
-
-    // Free the BlockInfoRecords.
-    while (!BlockInfoRecords.empty()) {
-      BlockInfo &Info = BlockInfoRecords.back();
-      // Free blockinfo abbrev info.
-      for (unsigned i = 0, e = static_cast<unsigned>(Info.Abbrevs.size());
-           i != e; ++i)
-        Info.Abbrevs[i]->dropRef();
-      BlockInfoRecords.pop_back();
-    }
   }
 
   /// \brief Retrieve the current position in the stream, in bits.
@@ -231,22 +221,13 @@
     // If there is a blockinfo for this BlockID, add all the predefined abbrevs
     // to the abbrev list.
     if (BlockInfo *Info = getBlockInfo(BlockID)) {
-      for (unsigned i = 0, e = static_cast<unsigned>(Info->Abbrevs.size());
-           i != e; ++i) {
-        CurAbbrevs.push_back(Info->Abbrevs[i]);
-        Info->Abbrevs[i]->addRef();
-      }
+      CurAbbrevs.insert(CurAbbrevs.end(), Info->Abbrevs.begin(),
+                        Info->Abbrevs.end());
     }
   }
 
   void ExitBlock() {
     assert(!BlockScope.empty() && "Block scope imbalance!");
-
-    // Delete all abbrevs.
-    for (unsigned i = 0, e = static_cast<unsigned>(CurAbbrevs.size());
-         i != e; ++i)
-      CurAbbrevs[i]->dropRef();
-
     const Block &B = BlockScope.back();
 
     // Block tail:
@@ -263,7 +244,7 @@
 
     // Restore the inner block's code size and abbrev table.
     CurCodeSize = B.PrevCodeSize;
-    BlockScope.back().PrevAbbrevs.swap(CurAbbrevs);
+    CurAbbrevs = std::move(B.PrevAbbrevs);
     BlockScope.pop_back();
   }
 
@@ -317,7 +298,7 @@
     unsigned BlobLen = (unsigned) Blob.size();
     unsigned AbbrevNo = Abbrev-bitc::FIRST_APPLICATION_ABBREV;
     assert(AbbrevNo < CurAbbrevs.size() && "Invalid abbrev #!");
-    BitCodeAbbrev *Abbv = CurAbbrevs[AbbrevNo];
+    const BitCodeAbbrev *Abbv = CurAbbrevs[AbbrevNo].get();
 
     EmitCode(Abbrev);
 

diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index f7e30ef..c42ecfe 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h

@@ -290,7 +290,7 @@
     FUNC_CODE_INST_PHI         = 16, // PHI:        [ty, val0,bb0, ...]
     // 17 is unused.
     // 18 is unused.
-    FUNC_CODE_INST_ALLOCA      = 19, // ALLOCA:     [instty, op, align]
+    FUNC_CODE_INST_ALLOCA      = 19, // ALLOCA:     [instty, opty, op, align]
     FUNC_CODE_INST_LOAD        = 20, // LOAD:       [opty, op, align, vol]
     // 21 is unused.
     // 22 is unused.
@@ -330,7 +330,8 @@
   };
 
   enum UseListCodes {
-    USELIST_CODE_ENTRY = 1   // USELIST_CODE_ENTRY: TBD.
+    USELIST_CODE_DEFAULT = 1, // DEFAULT: [index..., value-id]
+    USELIST_CODE_BB      = 2  // BB: [index..., bb-id]
   };
 
   enum AttributeKindCodes {
@@ -374,7 +375,8 @@
     ATTR_KIND_OPTIMIZE_NONE = 37,
     ATTR_KIND_IN_ALLOCA = 38,
     ATTR_KIND_NON_NULL = 39,
-    ATTR_KIND_JUMP_TABLE = 40
+    ATTR_KIND_JUMP_TABLE = 40,
+    ATTR_KIND_DEREFERENCEABLE = 41
   };
 
   enum ComdatSelectionKindCodes {

diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h
index 8cf5735..2e8cdc7 100644
--- a/include/llvm/Bitcode/ReaderWriter.h
+++ b/include/llvm/Bitcode/ReaderWriter.h

@@ -15,11 +15,12 @@
 #define LLVM_BITCODE_READERWRITER_H
 
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <memory>
 #include <string>
 
 namespace llvm {
   class BitstreamWriter;
-  class MemoryBuffer;
   class DataStreamer;
   class LLVMContext;
   class Module;
@@ -27,9 +28,9 @@
   class raw_ostream;
 
   /// Read the header of the specified bitcode buffer and prepare for lazy
-  /// deserialization of function bodies.  If successful, this takes ownership
-  /// of 'buffer. On error, this *does not* take ownership of Buffer.
-  ErrorOr<Module *> getLazyBitcodeModule(MemoryBuffer *Buffer,
+  /// deserialization of function bodies.  If successful, this moves Buffer. On
+  /// error, this *does not* move Buffer.
+  ErrorOr<Module *> getLazyBitcodeModule(std::unique_ptr<MemoryBuffer> &&Buffer,
                                          LLVMContext &Context);
 
   /// getStreamedBitcodeModule - Read the header of the specified stream
@@ -42,14 +43,13 @@
                                    std::string *ErrMsg = nullptr);
 
   /// Read the header of the specified bitcode buffer and extract just the
-  /// triple information. If successful, this returns a string and *does not*
-  /// take ownership of 'buffer'. On error, this returns "".
-  std::string getBitcodeTargetTriple(MemoryBuffer *Buffer,
+  /// triple information. If successful, this returns a string. On error, this
+  /// returns "".
+  std::string getBitcodeTargetTriple(MemoryBufferRef Buffer,
                                      LLVMContext &Context);
 
   /// Read the specified bitcode file, returning the module.
-  /// This method *never* takes ownership of Buffer.
-  ErrorOr<Module *> parseBitcodeFile(MemoryBuffer *Buffer,
+  ErrorOr<Module *> parseBitcodeFile(MemoryBufferRef Buffer,
                                      LLVMContext &Context);
 
   /// WriteBitcodeToFile - Write the specified module to the specified
@@ -139,6 +139,38 @@
     BufEnd = BufPtr+Size;
     return false;
   }
+
+  const std::error_category &BitcodeErrorCategory();
+  enum class BitcodeError {
+    ConflictingMETADATA_KINDRecords,
+    CouldNotFindFunctionInStream,
+    ExpectedConstant,
+    InsufficientFunctionProtos,
+    InvalidBitcodeSignature,
+    InvalidBitcodeWrapperHeader,
+    InvalidConstantReference,
+    InvalidID, // A read identifier is not found in the table it should be in.
+    InvalidInstructionWithNoBB,
+    InvalidRecord, // A read record doesn't have the expected size or structure
+    InvalidTypeForValue, // Type read OK, but is invalid for its use
+    InvalidTYPETable,
+    InvalidType,    // We were unable to read a type
+    MalformedBlock, // We are unable to advance in the stream.
+    MalformedGlobalInitializerSet,
+    InvalidMultipleBlocks, // We found multiple blocks of a kind that should
+                           // have only one
+    NeverResolvedValueFoundInFunction,
+    NeverResolvedFunctionFromBlockAddress,
+    InvalidValue // Invalid version, inst number, attr number, etc
+  };
+  inline std::error_code make_error_code(BitcodeError E) {
+    return std::error_code(static_cast<int>(E), BitcodeErrorCategory());
+  }
+
 } // End llvm namespace
 
+namespace std {
+template <> struct is_error_code_enum<llvm::BitcodeError> : std::true_type {};
+}
+
 #endif

diff --git a/include/llvm/CMakeLists.txt b/include/llvm/CMakeLists.txt
index ca4fd13..ff80539 100644
--- a/include/llvm/CMakeLists.txt
+++ b/include/llvm/CMakeLists.txt

@@ -1,18 +1,5 @@
 add_subdirectory(IR)
 
-if( MSVC_IDE OR XCODE )
-  # Creates a dummy target containing all headers for the benefit of
-  # XCode and Visual Studio users.
-  file(GLOB_RECURSE headers *.h)
-  add_td_sources(headers)
-  add_library(llvm_headers_do_not_build EXCLUDE_FROM_ALL
-    # We need at least one source file:
-    ${LLVM_MAIN_SRC_DIR}/lib/Transforms/Hello/Hello.cpp
-    ${headers})
-  set_target_properties(llvm_headers_do_not_build PROPERTIES FOLDER "Misc"
-                        EXCLUDE_FROM_DEFAULT_BUILD ON)
-endif()
-
 # If we're doing an out-of-tree build, copy a module map for generated
 # header files into the build area.
 if (NOT "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")

diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h
index c5060fb..3132999 100644
--- a/include/llvm/CodeGen/Analysis.h
+++ b/include/llvm/CodeGen/Analysis.h

@@ -22,12 +22,13 @@
 #include "llvm/IR/Instructions.h"
 
 namespace llvm {
-class GlobalVariable;
+class GlobalValue;
 class TargetLoweringBase;
+class TargetLowering;
+class TargetMachine;
 class SDNode;
 class SDValue;
 class SelectionDAG;
-class TargetLowering;
 struct EVT;
 
 /// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence
@@ -58,7 +59,7 @@
                      uint64_t StartingOffset = 0);
 
 /// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V.
-GlobalVariable *ExtractTypeInfo(Value *V);
+GlobalValue *ExtractTypeInfo(Value *V);
 
 /// hasInlineAsmMemConstraint - Return true if the inline asm instruction being
 /// processed uses a memory 'm' constraint.
@@ -86,7 +87,7 @@
 /// between it and the return.
 ///
 /// This function only tests target-independent requirements.
-bool isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG);
+bool isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM);
 
 /// Test if given that the input instruction is in the tail call position if the
 /// return type or any attributes of the function will inhibit tail call
@@ -96,6 +97,13 @@
                                      const ReturnInst *Ret,
                                      const TargetLoweringBase &TLI);
 
+// True if GV can be left out of the object symbol table. This is the case
+// for linkonce_odr values whose address is not significant. While legal, it is
+// not normally profitable to omit them from the .o symbol table. Using this
+// analysis makes sense when the information can be passed down to the linker
+// or we are in LTO.
+bool canBeOmittedFromSymbolTable(const GlobalValue *GV);
+
 } // End llvm namespace
 
 #endif

diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index e1c9a14..25b99a2 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h

@@ -264,6 +264,9 @@
   /// function.
   virtual void EmitFunctionBodyEnd() {}
 
+  /// Targets can override this to emit stuff at the end of a basic block.
+  virtual void EmitBasicBlockEnd(const MachineBasicBlock &MBB) {}
+
   /// Targets should implement this to emit instructions.
   virtual void EmitInstruction(const MachineInstr *) {
     llvm_unreachable("EmitInstruction not implemented");
@@ -346,12 +349,6 @@
   void EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
                            unsigned Size) const;
 
-  /// Emit something like ".long Hi+Offset-Lo" where the size in bytes of the
-  /// directive is specified by Size and Hi/Lo specify the labels.  This
-  /// implicitly uses .set if it is available.
-  void EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
-                                 const MCSymbol *Lo, unsigned Size) const;
-
   /// Emit something like ".long Label+Offset" where the size in bytes of the
   /// directive is specified by Size and Label specifies the label.  This
   /// implicitly uses .set if it is available.
@@ -402,6 +399,13 @@
   /// Get the value for DW_AT_APPLE_isa. Zero if no isa encoding specified.
   virtual unsigned getISAEncoding() { return 0; }
 
+  /// Emit a dwarf register operation for describing
+  /// - a small value occupying only part of a register or
+  /// - a register representing only part of a value.
+  void EmitDwarfOpPiece(ByteStreamer &Streamer, unsigned SizeInBits,
+                        unsigned OffsetInBits = 0) const;
+
+
   /// \brief Emit a partial DWARF register operation.
   /// \param MLoc             the register
   /// \param PieceSize        size and
@@ -418,7 +422,7 @@
                            unsigned PieceSize = 0,
                            unsigned PieceOffset = 0) const;
 
-  /// Emit dwarf register operation.
+  /// EmitDwarfRegOp - Emit a dwarf register operation.
   /// \param Indirect   whether this is a register-indirect address
   virtual void EmitDwarfRegOp(ByteStreamer &BS, const MachineLocation &MLoc,
                               bool Indirect) const;

diff --git a/include/llvm/CodeGen/CalcSpillWeights.h b/include/llvm/CodeGen/CalcSpillWeights.h
index 0d79b1d..91fb0a9 100644
--- a/include/llvm/CodeGen/CalcSpillWeights.h
+++ b/include/llvm/CodeGen/CalcSpillWeights.h

@@ -30,8 +30,10 @@
   /// @param UseDefFreq Expected number of executed use and def instructions
   ///                   per function call. Derived from block frequencies.
   /// @param Size       Size of live interval as returnexd by getSize()
+  /// @param NumInstr   Number of instructions using this live interval
   ///
-  static inline float normalizeSpillWeight(float UseDefFreq, unsigned Size) {
+  static inline float normalizeSpillWeight(float UseDefFreq, unsigned Size,
+                                           unsigned NumInstr) {
     // The constant 25 instructions is added to avoid depending too much on
     // accidental SlotIndex gaps for small intervals. The effect is that small
     // intervals have a spill weight that is mostly proportional to the number
@@ -44,7 +46,7 @@
   /// spill weight and allocation hint.
   class VirtRegAuxInfo {
   public:
-    typedef float (*NormalizingFn)(float, unsigned);
+    typedef float (*NormalizingFn)(float, unsigned, unsigned);
 
   private:
     MachineFunction &MF;

diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h
index 04af4bd..0b2ccc6 100644
--- a/include/llvm/CodeGen/CallingConvLower.h
+++ b/include/llvm/CodeGen/CallingConvLower.h

@@ -31,18 +31,25 @@
 class CCValAssign {
 public:
   enum LocInfo {
-    Full,   // The value fills the full location.
-    SExt,   // The value is sign extended in the location.
-    ZExt,   // The value is zero extended in the location.
-    AExt,   // The value is extended with undefined upper bits.
-    BCvt,   // The value is bit-converted in the location.
-    VExt,   // The value is vector-widened in the location.
-            // FIXME: Not implemented yet. Code that uses AExt to mean
-            // vector-widen should be fixed to use VExt instead.
-    FPExt,  // The floating-point value is fp-extended in the location.
-    Indirect // The location contains pointer to the value.
+    Full,      // The value fills the full location.
+    SExt,      // The value is sign extended in the location.
+    ZExt,      // The value is zero extended in the location.
+    AExt,      // The value is extended with undefined upper bits.
+    SExtUpper, // The value is in the upper bits of the location and should be
+               // sign extended when retrieved.
+    ZExtUpper, // The value is in the upper bits of the location and should be
+               // zero extended when retrieved.
+    AExtUpper, // The value is in the upper bits of the location and should be
+               // extended with undefined upper bits when retrieved.
+    BCvt,      // The value is bit-converted in the location.
+    VExt,      // The value is vector-widened in the location.
+               // FIXME: Not implemented yet. Code that uses AExt to mean
+               // vector-widen should be fixed to use VExt instead.
+    FPExt,     // The floating-point value is fp-extended in the location.
+    Indirect   // The location contains pointer to the value.
     // TODO: a subset of the value is in the location.
   };
+
 private:
   /// ValNo - This is the value number begin assigned (e.g. an argument number).
   unsigned ValNo;
@@ -146,6 +153,9 @@
     return (HTP == AExt || HTP == SExt || HTP == ZExt);
   }
 
+  bool isUpperBitsInLoc() const {
+    return HTP == AExtUpper || HTP == SExtUpper || HTP == ZExtUpper;
+  }
 };
 
 /// CCAssignFn - This function assigns a location for Val, updating State to
@@ -174,7 +184,6 @@
   CallingConv::ID CallingConv;
   bool IsVarArg;
   MachineFunction &MF;
-  const TargetMachine &TM;
   const TargetRegisterInfo &TRI;
   SmallVectorImpl<CCValAssign> &Locs;
   LLVMContext &Context;
@@ -208,10 +217,10 @@
   // while "%t" goes to the stack: it wouldn't be described in ByValRegs.
   //
   // Supposed use-case for this collection:
-  // 1. Initially ByValRegs is empty, InRegsParamsProceed is 0.
+  // 1. Initially ByValRegs is empty, InRegsParamsProcessed is 0.
   // 2. HandleByVal fillups ByValRegs.
   // 3. Argument analysis (LowerFormatArguments, for example). After
-  // some byval argument was analyzed, InRegsParamsProceed is increased.
+  // some byval argument was analyzed, InRegsParamsProcessed is increased.
   struct ByValInfo {
     ByValInfo(unsigned B, unsigned E, bool IsWaste = false) :
       Begin(B), End(E), Waste(IsWaste) {}
@@ -229,24 +238,22 @@
   };
   SmallVector<ByValInfo, 4 > ByValRegs;
 
-  // InRegsParamsProceed - shows how many instances of ByValRegs was proceed
+  // InRegsParamsProcessed - shows how many ByValRegs entries were processed
   // during argument analysis.
-  unsigned InRegsParamsProceed;
+  unsigned InRegsParamsProcessed;
 
 protected:
   ParmContext CallOrPrologue;
 
 public:
   CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
-          const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
-          LLVMContext &C);
+          SmallVectorImpl<CCValAssign> &locs, LLVMContext &C);
 
   void addLoc(const CCValAssign &V) {
     Locs.push_back(V);
   }
 
   LLVMContext &getContext() const { return Context; }
-  const TargetMachine &getTarget() const { return TM; }
   MachineFunction &getMachineFunction() const { return MF; }
   CallingConv::ID getCallingConv() const { return CallingConv; }
   bool isVarArg() const { return IsVarArg; }
@@ -377,8 +384,8 @@
   /// AllocateStack - Allocate a chunk of stack space with the specified size
   /// and alignment.
   unsigned AllocateStack(unsigned Size, unsigned Align) {
-    assert(Align && ((Align-1) & Align) == 0); // Align is power of 2.
-    StackOffset = ((StackOffset + Align-1) & ~(Align-1));
+    assert(Align && ((Align - 1) & Align) == 0); // Align is power of 2.
+    StackOffset = ((StackOffset + Align - 1) & ~(Align - 1));
     unsigned Result = StackOffset;
     StackOffset += Size;
     MF.getFrameInfo()->ensureMaxAlignment(Align);
@@ -412,7 +419,7 @@
   unsigned getInRegsParamsCount() const { return ByValRegs.size(); }
 
   // Returns count of byval in-regs arguments proceed.
-  unsigned getInRegsParamsProceed() const { return InRegsParamsProceed; }
+  unsigned getInRegsParamsProcessed() const { return InRegsParamsProcessed; }
 
   // Get information about N-th byval parameter that is stored in registers.
   // Here "ByValParamIndex" is N.
@@ -436,20 +443,20 @@
   // Returns false, if end is reached.
   bool nextInRegsParam() {
     unsigned e = ByValRegs.size();
-    if (InRegsParamsProceed < e)
-      ++InRegsParamsProceed;
-    return InRegsParamsProceed < e;
+    if (InRegsParamsProcessed < e)
+      ++InRegsParamsProcessed;
+    return InRegsParamsProcessed < e;
   }
 
   // Clear byval registers tracking info.
   void clearByValRegsInfo() {
-    InRegsParamsProceed = 0;
+    InRegsParamsProcessed = 0;
     ByValRegs.clear();
   }
 
   // Rewind byval registers tracking info.
   void rewindByValRegsInfo() {
-    InRegsParamsProceed = 0;
+    InRegsParamsProcessed = 0;
   }
 
   ParmContext getCallOrPrologue() const { return CallOrPrologue; }

diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h
index 449d934..973c595 100644
--- a/include/llvm/CodeGen/CommandFlags.h
+++ b/include/llvm/CodeGen/CommandFlags.h

@@ -54,6 +54,16 @@
                       "Relocatable external references, non-relocatable code"),
               clEnumValEnd));
 
+cl::opt<ThreadModel::Model>
+TMModel("thread-model",
+        cl::desc("Choose threading model"),
+        cl::init(ThreadModel::POSIX),
+        cl::values(clEnumValN(ThreadModel::POSIX, "posix",
+                              "POSIX thread model"),
+                   clEnumValN(ThreadModel::Single, "single",
+                              "Single thread model"),
+                   clEnumValEnd));
+
 cl::opt<llvm::CodeModel::Model>
 CMModel("code-model",
         cl::desc("Choose code model"),
@@ -83,11 +93,6 @@
              clEnumValEnd));
 
 cl::opt<bool>
-DisableRedZone("disable-red-zone",
-               cl::desc("Do not emit code that uses the red zone."),
-               cl::init(false));
-
-cl::opt<bool>
 EnableFPMAD("enable-fp-mad",
             cl::desc("Enable less precise MAD instructions to be generated"),
             cl::init(false));
@@ -180,8 +185,8 @@
           cl::init(false));
 
 cl::opt<bool>
-UseInitArray("use-init-array",
-             cl::desc("Use .init_array instead of .ctors."),
+UseCtors("use-ctors",
+             cl::desc("Use .ctors instead of .init_array."),
              cl::init(false));
 
 cl::opt<std::string> StopAfter("stop-after",
@@ -217,6 +222,44 @@
                          "Create one table per unique function type."),
               clEnumValEnd));
 
+cl::opt<bool>
+FCFI("fcfi",
+     cl::desc("Apply forward-edge control-flow integrity"),
+     cl::init(false));
+
+cl::opt<llvm::CFIntegrity>
+CFIType("cfi-type",
+        cl::desc("Choose the type of Control-Flow Integrity check to add"),
+        cl::init(CFIntegrity::Sub),
+        cl::values(
+            clEnumValN(CFIntegrity::Sub, "sub",
+                       "Subtract the pointer from the table base, then mask."),
+            clEnumValN(CFIntegrity::Ror, "ror",
+                       "Use rotate to check the offset from a table base."),
+            clEnumValN(CFIntegrity::Add, "add",
+                       "Mask out the high bits and add to an aligned base."),
+            clEnumValEnd));
+
+cl::opt<bool>
+CFIEnforcing("cfi-enforcing",
+             cl::desc("Enforce CFI or pass the violation to a function."),
+             cl::init(false));
+
+// Note that this option is linked to the cfi-enforcing option above: if
+// cfi-enforcing is set, then the cfi-func-name option is entirely ignored. If
+// cfi-enforcing is false and no cfi-func-name is set, then a default function
+// will be generated that ignores all CFI violations. The expected signature for
+// functions called with CFI violations is
+//
+// void (i8*, i8*)
+//
+// The first pointer is a C string containing the name of the function in which
+// the violation occurs, and the second pointer is the pointer that violated
+// CFI.
+cl::opt<std::string>
+CFIFuncName("cfi-func-name", cl::desc("The name of the CFI function to call"),
+            cl::init(""));
+
 // Common utility function tightly tied to the options listed here. Initializes
 // a TargetOptions object with CodeGen flags and returns it.
 static inline TargetOptions InitTargetOptionsFromCodeGenFlags() {
@@ -238,12 +281,18 @@
   Options.StackAlignmentOverride = OverrideStackAlignment;
   Options.TrapFuncName = TrapFuncName;
   Options.PositionIndependentExecutable = EnablePIE;
-  Options.UseInitArray = UseInitArray;
+  Options.UseInitArray = !UseCtors;
   Options.DataSections = DataSections;
   Options.FunctionSections = FunctionSections;
 
   Options.MCOptions = InitMCTargetOptionsFromFlags();
   Options.JTType = JTableType;
+  Options.FCFI = FCFI;
+  Options.CFIType = CFIType;
+  Options.CFIEnforcing = CFIEnforcing;
+  Options.CFIFuncName = CFIFuncName;
+
+  Options.ThreadModel = TMModel;
 
   return Options;
 }

diff --git a/include/llvm/CodeGen/DFAPacketizer.h b/include/llvm/CodeGen/DFAPacketizer.h
index 9d25fd3..f9cdc2a 100644
--- a/include/llvm/CodeGen/DFAPacketizer.h
+++ b/include/llvm/CodeGen/DFAPacketizer.h

@@ -91,7 +91,6 @@
 // API call is made to prune the dependence.
 class VLIWPacketizerList {
 protected:
-  const TargetMachine &TM;
   const MachineFunction &MF;
   const TargetInstrInfo *TII;
 
@@ -107,9 +106,7 @@
   std::map<MachineInstr*, SUnit*> MIToSUnit;
 
 public:
-  VLIWPacketizerList(
-    MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
-    bool IsPostRA);
+  VLIWPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, bool IsPostRA);
 
   virtual ~VLIWPacketizerList();
 

diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h
index 2bebae6..b5405f9 100644
--- a/include/llvm/CodeGen/FastISel.h
+++ b/include/llvm/CodeGen/FastISel.h

@@ -16,36 +16,166 @@
 #define LLVM_CODEGEN_FASTISEL_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/IntrinsicInst.h"
 
 namespace llvm {
 
-class AllocaInst;
-class Constant;
-class ConstantFP;
-class CallInst;
-class DataLayout;
-class FunctionLoweringInfo;
-class Instruction;
-class LoadInst;
-class MVT;
-class MachineConstantPool;
-class MachineFrameInfo;
-class MachineFunction;
-class MachineInstr;
-class MachineRegisterInfo;
-class TargetInstrInfo;
-class TargetLibraryInfo;
-class TargetLowering;
-class TargetMachine;
-class TargetRegisterClass;
-class TargetRegisterInfo;
-class User;
-class Value;
-
-/// This is a fast-path instruction selection class that generates poor code and
-/// doesn't support illegal types or non-trivial lowering, but runs quickly.
+/// \brief This is a fast-path instruction selection class that generates poor
+/// code and doesn't support illegal types or non-trivial lowering, but runs
+/// quickly.
 class FastISel {
+public:
+  struct ArgListEntry {
+    Value *Val;
+    Type *Ty;
+    bool IsSExt : 1;
+    bool IsZExt : 1;
+    bool IsInReg : 1;
+    bool IsSRet : 1;
+    bool IsNest : 1;
+    bool IsByVal : 1;
+    bool IsInAlloca : 1;
+    bool IsReturned : 1;
+    uint16_t Alignment;
+
+    ArgListEntry()
+        : Val(nullptr), Ty(nullptr), IsSExt(false), IsZExt(false),
+          IsInReg(false), IsSRet(false), IsNest(false), IsByVal(false),
+          IsInAlloca(false), IsReturned(false), Alignment(0) {}
+
+    /// \brief Set CallLoweringInfo attribute flags based on a call instruction
+    /// and called function attributes.
+    void setAttributes(ImmutableCallSite *CS, unsigned AttrIdx);
+  };
+  typedef std::vector<ArgListEntry> ArgListTy;
+
+  struct CallLoweringInfo {
+    Type *RetTy;
+    bool RetSExt : 1;
+    bool RetZExt : 1;
+    bool IsVarArg : 1;
+    bool IsInReg : 1;
+    bool DoesNotReturn : 1;
+    bool IsReturnValueUsed : 1;
+
+    /// \brief IsTailCall should be modified by implementations of
+    /// fastLowerCall that perform tail call conversions.
+    bool IsTailCall;
+
+    unsigned NumFixedArgs;
+    CallingConv::ID CallConv;
+    const Value *Callee;
+    const char *SymName;
+    ArgListTy Args;
+    ImmutableCallSite *CS;
+    MachineInstr *Call;
+    unsigned ResultReg;
+    unsigned NumResultRegs;
+
+    SmallVector<Value *, 16> OutVals;
+    SmallVector<ISD::ArgFlagsTy, 16> OutFlags;
+    SmallVector<unsigned, 16> OutRegs;
+    SmallVector<ISD::InputArg, 4> Ins;
+    SmallVector<unsigned, 4> InRegs;
+
+    CallLoweringInfo()
+        : RetTy(nullptr), RetSExt(false), RetZExt(false), IsVarArg(false),
+          IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true),
+          IsTailCall(false), NumFixedArgs(-1), CallConv(CallingConv::C),
+          Callee(nullptr), SymName(nullptr), CS(nullptr), Call(nullptr),
+          ResultReg(0), NumResultRegs(0) {}
+
+    CallLoweringInfo &setCallee(Type *ResultTy, FunctionType *FuncTy,
+                                const Value *Target, ArgListTy &&ArgsList,
+                                ImmutableCallSite &Call) {
+      RetTy = ResultTy;
+      Callee = Target;
+
+      IsInReg = Call.paramHasAttr(0, Attribute::InReg);
+      DoesNotReturn = Call.doesNotReturn();
+      IsVarArg = FuncTy->isVarArg();
+      IsReturnValueUsed = !Call.getInstruction()->use_empty();
+      RetSExt = Call.paramHasAttr(0, Attribute::SExt);
+      RetZExt = Call.paramHasAttr(0, Attribute::ZExt);
+
+      CallConv = Call.getCallingConv();
+      Args = std::move(ArgsList);
+      NumFixedArgs = FuncTy->getNumParams();
+
+      CS = &Call;
+
+      return *this;
+    }
+
+    CallLoweringInfo &setCallee(Type *ResultTy, FunctionType *FuncTy,
+                                const char *Target, ArgListTy &&ArgsList,
+                                ImmutableCallSite &Call,
+                                unsigned FixedArgs = ~0U) {
+      RetTy = ResultTy;
+      Callee = Call.getCalledValue();
+      SymName = Target;
+
+      IsInReg = Call.paramHasAttr(0, Attribute::InReg);
+      DoesNotReturn = Call.doesNotReturn();
+      IsVarArg = FuncTy->isVarArg();
+      IsReturnValueUsed = !Call.getInstruction()->use_empty();
+      RetSExt = Call.paramHasAttr(0, Attribute::SExt);
+      RetZExt = Call.paramHasAttr(0, Attribute::ZExt);
+
+      CallConv = Call.getCallingConv();
+      Args = std::move(ArgsList);
+      NumFixedArgs = (FixedArgs == ~0U) ? FuncTy->getNumParams() : FixedArgs;
+
+      CS = &Call;
+
+      return *this;
+    }
+
+    CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultTy,
+                                const Value *Target, ArgListTy &&ArgsList,
+                                unsigned FixedArgs = ~0U) {
+      RetTy = ResultTy;
+      Callee = Target;
+      CallConv = CC;
+      Args = std::move(ArgsList);
+      NumFixedArgs = (FixedArgs == ~0U) ? Args.size() : FixedArgs;
+      return *this;
+    }
+
+    CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultTy,
+                                const char *Target, ArgListTy &&ArgsList,
+                                unsigned FixedArgs = ~0U) {
+      RetTy = ResultTy;
+      SymName = Target;
+      CallConv = CC;
+      Args = std::move(ArgsList);
+      NumFixedArgs = (FixedArgs == ~0U) ? Args.size() : FixedArgs;
+      return *this;
+    }
+
+    CallLoweringInfo &setTailCall(bool Value = true) {
+      IsTailCall = Value;
+      return *this;
+    }
+
+    ArgListTy &getArgs() { return Args; }
+
+    void clearOuts() {
+      OutVals.clear();
+      OutFlags.clear();
+      OutRegs.clear();
+    }
+
+    void clearIns() {
+      Ins.clear();
+      InRegs.clear();
+    }
+  };
+
 protected:
   DenseMap<const Value *, unsigned> LocalValueMap;
   FunctionLoweringInfo &FuncInfo;
@@ -60,61 +190,64 @@
   const TargetLowering &TLI;
   const TargetRegisterInfo &TRI;
   const TargetLibraryInfo *LibInfo;
+  bool SkipTargetIndependentISel;
 
-  /// The position of the last instruction for materializing constants for use
-  /// in the current block. It resets to EmitStartPt when it makes sense (for
-  /// example, it's usually profitable to avoid function calls between the
+  /// \brief The position of the last instruction for materializing constants
+  /// for use in the current block. It resets to EmitStartPt when it makes sense
+  /// (for example, it's usually profitable to avoid function calls between the
   /// definition and the use)
   MachineInstr *LastLocalValue;
 
-  /// The top most instruction in the current block that is allowed for emitting
-  /// local variables. LastLocalValue resets to EmitStartPt when it makes sense
-  /// (for example, on function calls)
+  /// \brief The top most instruction in the current block that is allowed for
+  /// emitting local variables. LastLocalValue resets to EmitStartPt when it
+  /// makes sense (for example, on function calls)
   MachineInstr *EmitStartPt;
 
 public:
-  /// Return the position of the last instruction emitted for materializing
-  /// constants for use in the current block.
+  /// \brief Return the position of the last instruction emitted for
+  /// materializing constants for use in the current block.
   MachineInstr *getLastLocalValue() { return LastLocalValue; }
 
-  /// Update the position of the last instruction emitted for materializing
-  /// constants for use in the current block.
+  /// \brief Update the position of the last instruction emitted for
+  /// materializing constants for use in the current block.
   void setLastLocalValue(MachineInstr *I) {
     EmitStartPt = I;
     LastLocalValue = I;
   }
 
-  /// Set the current block to which generated machine instructions will be
-  /// appended, and clear the local CSE map.
+  /// \brief Set the current block to which generated machine instructions will
+  /// be appended, and clear the local CSE map.
   void startNewBlock();
 
-  /// Return current debug location information.
+  /// \brief Return current debug location information.
   DebugLoc getCurDebugLoc() const { return DbgLoc; }
-  
-  /// Do "fast" instruction selection for function arguments and append machine
-  /// instructions to the current block. Return true if it is successful.
-  bool LowerArguments();
 
-  /// Do "fast" instruction selection for the given LLVM IR instruction, and
-  /// append generated machine instructions to the current block. Return true if
-  /// selection was successful.
-  bool SelectInstruction(const Instruction *I);
+  /// \brief Do "fast" instruction selection for function arguments and append
+  /// the machine instructions to the current block. Returns true when
+  /// successful.
+  bool lowerArguments();
 
-  /// Do "fast" instruction selection for the given LLVM IR operator
+  /// \brief Do "fast" instruction selection for the given LLVM IR instruction
+  /// and append the generated machine instructions to the current block.
+  /// Returns true if selection was successful.
+  bool selectInstruction(const Instruction *I);
+
+  /// \brief Do "fast" instruction selection for the given LLVM IR operator
   /// (Instruction or ConstantExpr), and append generated machine instructions
   /// to the current block. Return true if selection was successful.
-  bool SelectOperator(const User *I, unsigned Opcode);
+  bool selectOperator(const User *I, unsigned Opcode);
 
-  /// Create a virtual register and arrange for it to be assigned the value for
-  /// the given LLVM value.
+  /// \brief Create a virtual register and arrange for it to be assigned the
+  /// value for the given LLVM value.
   unsigned getRegForValue(const Value *V);
 
-  /// Look up the value to see if its value is already cached in a register. It
-  /// may be defined by instructions across blocks or defined locally.
+  /// \brief Look up the value to see if its value is already cached in a
+  /// register. It may be defined by instructions across blocks or defined
+  /// locally.
   unsigned lookUpRegForValue(const Value *V);
 
-  /// This is a wrapper around getRegForValue that also takes care of truncating
-  /// or sign-extending the given getelementptr index value.
+  /// \brief This is a wrapper around getRegForValue that also takes care of
+  /// truncating or sign-extending the given getelementptr index value.
   std::pair<unsigned, bool> getRegForGEPIndex(const Value *V);
 
   /// \brief We're checking to see if we can fold \p LI into \p FoldInst. Note
@@ -142,11 +275,11 @@
     return false;
   }
 
-  /// Reset InsertPt to prepare for inserting instructions into the current
-  /// block.
+  /// \brief Reset InsertPt to prepare for inserting instructions into the
+  /// current block.
   void recomputeInsertPt();
 
-  /// Remove all dead instructions between the I and E.
+  /// \brief Remove all dead instructions between the I and E.
   void removeDeadCode(MachineBasicBlock::iterator I,
                       MachineBasicBlock::iterator E);
 
@@ -155,214 +288,195 @@
     DebugLoc DL;
   };
 
-  /// Prepare InsertPt to begin inserting instructions into the local value area
-  /// and return the old insert position.
+  /// \brief Prepare InsertPt to begin inserting instructions into the local
+  /// value area and return the old insert position.
   SavePoint enterLocalValueArea();
 
-  /// Reset InsertPt to the given old insert position.
+  /// \brief Reset InsertPt to the given old insert position.
   void leaveLocalValueArea(SavePoint Old);
 
   virtual ~FastISel();
 
 protected:
-  explicit FastISel(FunctionLoweringInfo &funcInfo,
-                    const TargetLibraryInfo *libInfo);
+  explicit FastISel(FunctionLoweringInfo &FuncInfo,
+                    const TargetLibraryInfo *LibInfo,
+                    bool SkipTargetIndependentISel = false);
 
-  /// This method is called by target-independent code when the normal FastISel
-  /// process fails to select an instruction.  This gives targets a chance to
-  /// emit code for anything that doesn't fit into FastISel's framework. It
-  /// returns true if it was successful.
-  virtual bool
-  TargetSelectInstruction(const Instruction *I) = 0;
-  
-  /// This method is called by target-independent code to do target specific
-  /// argument lowering. It returns true if it was successful.
-  virtual bool FastLowerArguments();
+  /// \brief This method is called by target-independent code when the normal
+  /// FastISel process fails to select an instruction. This gives targets a
+  /// chance to emit code for anything that doesn't fit into FastISel's
+  /// framework. It returns true if it was successful.
+  virtual bool fastSelectInstruction(const Instruction *I) = 0;
 
-  /// This method is called by target-independent code to request that an
+  /// \brief This method is called by target-independent code to do target-
+  /// specific argument lowering. It returns true if it was successful.
+  virtual bool fastLowerArguments();
+
+  /// \brief This method is called by target-independent code to do target-
+  /// specific call lowering. It returns true if it was successful.
+  virtual bool fastLowerCall(CallLoweringInfo &CLI);
+
+  /// \brief This method is called by target-independent code to do target-
+  /// specific intrinsic lowering. It returns true if it was successful.
+  virtual bool fastLowerIntrinsicCall(const IntrinsicInst *II);
+
+  /// \brief This method is called by target-independent code to request that an
   /// instruction with the given type and opcode be emitted.
-  virtual unsigned FastEmit_(MVT VT,
-                             MVT RetVT,
-                             unsigned Opcode);
+  virtual unsigned fastEmit_(MVT VT, MVT RetVT, unsigned Opcode);
 
-  /// This method is called by target-independent code to request that an
+  /// \brief This method is called by target-independent code to request that an
   /// instruction with the given type, opcode, and register operand be emitted.
-  virtual unsigned FastEmit_r(MVT VT,
-                              MVT RetVT,
-                              unsigned Opcode,
-                              unsigned Op0, bool Op0IsKill);
+  virtual unsigned fastEmit_r(MVT VT, MVT RetVT, unsigned Opcode, unsigned Op0,
+                              bool Op0IsKill);
 
-  /// This method is called by target-independent code to request that an
+  /// \brief This method is called by target-independent code to request that an
   /// instruction with the given type, opcode, and register operands be emitted.
-  virtual unsigned FastEmit_rr(MVT VT,
-                               MVT RetVT,
-                               unsigned Opcode,
-                               unsigned Op0, bool Op0IsKill,
-                               unsigned Op1, bool Op1IsKill);
+  virtual unsigned fastEmit_rr(MVT VT, MVT RetVT, unsigned Opcode, unsigned Op0,
+                               bool Op0IsKill, unsigned Op1, bool Op1IsKill);
 
-  /// This method is called by target-independent code to request that an
+  /// \brief This method is called by target-independent code to request that an
   /// instruction with the given type, opcode, and register and immediate
-  /// operands be emitted.
-  virtual unsigned FastEmit_ri(MVT VT,
-                               MVT RetVT,
-                               unsigned Opcode,
-                               unsigned Op0, bool Op0IsKill,
-                               uint64_t Imm);
+  /// operands be emitted.
+  virtual unsigned fastEmit_ri(MVT VT, MVT RetVT, unsigned Opcode, unsigned Op0,
+                               bool Op0IsKill, uint64_t Imm);
 
-  /// This method is called by target-independent code to request that an
+  /// \brief This method is called by target-independent code to request that an
   /// instruction with the given type, opcode, and register and floating-point
   /// immediate operands be emitted.
-  virtual unsigned FastEmit_rf(MVT VT,
-                               MVT RetVT,
-                               unsigned Opcode,
-                               unsigned Op0, bool Op0IsKill,
-                               const ConstantFP *FPImm);
+  virtual unsigned fastEmit_rf(MVT VT, MVT RetVT, unsigned Opcode, unsigned Op0,
+                               bool Op0IsKill, const ConstantFP *FPImm);
 
-  /// This method is called by target-independent code to request that an
+  /// \brief This method is called by target-independent code to request that an
   /// instruction with the given type, opcode, and register and immediate
   /// operands be emitted.
-  virtual unsigned FastEmit_rri(MVT VT,
-                                MVT RetVT,
-                                unsigned Opcode,
-                                unsigned Op0, bool Op0IsKill,
-                                unsigned Op1, bool Op1IsKill,
-                                uint64_t Imm);
+  virtual unsigned fastEmit_rri(MVT VT, MVT RetVT, unsigned Opcode,
+                                unsigned Op0, bool Op0IsKill, unsigned Op1,
+                                bool Op1IsKill, uint64_t Imm);
 
-  /// \brief This method is a wrapper of FastEmit_ri.
-  /// 
+  /// \brief This method is a wrapper of fastEmit_ri.
+  ///
   /// It first tries to emit an instruction with an immediate operand using
-  /// FastEmit_ri.  If that fails, it materializes the immediate into a register
-  /// and try FastEmit_rr instead.
-  unsigned FastEmit_ri_(MVT VT,
-                        unsigned Opcode,
-                        unsigned Op0, bool Op0IsKill,
+  /// fastEmit_ri.  If that fails, it materializes the immediate into a register
+  /// and tries fastEmit_rr instead.
+  unsigned fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0, bool Op0IsKill,
                         uint64_t Imm, MVT ImmType);
 
-  /// This method is called by target-independent code to request that an
+  /// \brief This method is called by target-independent code to request that an
   /// instruction with the given type, opcode, and immediate operand be emitted.
-  virtual unsigned FastEmit_i(MVT VT,
-                              MVT RetVT,
-                              unsigned Opcode,
-                              uint64_t Imm);
+  virtual unsigned fastEmit_i(MVT VT, MVT RetVT, unsigned Opcode, uint64_t Imm);
 
-  /// This method is called by target-independent code to request that an
+  /// \brief This method is called by target-independent code to request that an
   /// instruction with the given type, opcode, and floating-point immediate
   /// operand be emitted.
-  virtual unsigned FastEmit_f(MVT VT,
-                              MVT RetVT,
-                              unsigned Opcode,
+  virtual unsigned fastEmit_f(MVT VT, MVT RetVT, unsigned Opcode,
                               const ConstantFP *FPImm);
 
-  /// Emit a MachineInstr with no operands and a result register in the given
-  /// register class.
-  unsigned FastEmitInst_(unsigned MachineInstOpcode,
+  /// \brief Emit a MachineInstr with no operands and a result register in the
+  /// given register class.
+  unsigned fastEmitInst_(unsigned MachineInstOpcode,
                          const TargetRegisterClass *RC);
 
-  /// Emit a MachineInstr with one register operand and a result register in the
-  /// given register class.
-  unsigned FastEmitInst_r(unsigned MachineInstOpcode,
-                          const TargetRegisterClass *RC,
-                          unsigned Op0, bool Op0IsKill);
+  /// \brief Emit a MachineInstr with one register operand and a result register
+  /// in the given register class.
+  unsigned fastEmitInst_r(unsigned MachineInstOpcode,
+                          const TargetRegisterClass *RC, unsigned Op0,
+                          bool Op0IsKill);
 
-  /// Emit a MachineInstr with two register operands and a result register in
-  /// the given register class.
-  unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
-                           const TargetRegisterClass *RC,
-                           unsigned Op0, bool Op0IsKill,
-                           unsigned Op1, bool Op1IsKill);
-
-  /// Emit a MachineInstr with three register operands and a result register in
-  /// the given register class.
-  unsigned FastEmitInst_rrr(unsigned MachineInstOpcode,
-                           const TargetRegisterClass *RC,
-                           unsigned Op0, bool Op0IsKill,
-                           unsigned Op1, bool Op1IsKill,
-                           unsigned Op2, bool Op2IsKill);
-
-  /// Emit a MachineInstr with a register operand, an immediate, and a result
+  /// \brief Emit a MachineInstr with two register operands and a result
   /// register in the given register class.
-  unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
-                           const TargetRegisterClass *RC,
-                           unsigned Op0, bool Op0IsKill,
-                           uint64_t Imm);
+  unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
+                           const TargetRegisterClass *RC, unsigned Op0,
+                           bool Op0IsKill, unsigned Op1, bool Op1IsKill);
 
-  /// Emit a MachineInstr with one register operand and two immediate operands.
-  unsigned FastEmitInst_rii(unsigned MachineInstOpcode,
-                           const TargetRegisterClass *RC,
-                           unsigned Op0, bool Op0IsKill,
-                           uint64_t Imm1, uint64_t Imm2);
-
-  /// Emit a MachineInstr with two register operands and a result register in
-  /// the given register class.
-  unsigned FastEmitInst_rf(unsigned MachineInstOpcode,
-                           const TargetRegisterClass *RC,
-                           unsigned Op0, bool Op0IsKill,
-                           const ConstantFP *FPImm);
-
-  /// Emit a MachineInstr with two register operands, an immediate, and a result
+  /// \brief Emit a MachineInstr with three register operands and a result
   /// register in the given register class.
-  unsigned FastEmitInst_rri(unsigned MachineInstOpcode,
-                            const TargetRegisterClass *RC,
-                            unsigned Op0, bool Op0IsKill,
-                            unsigned Op1, bool Op1IsKill,
+  unsigned fastEmitInst_rrr(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC, unsigned Op0,
+                            bool Op0IsKill, unsigned Op1, bool Op1IsKill,
+                            unsigned Op2, bool Op2IsKill);
+
+  /// \brief Emit a MachineInstr with a register operand, an immediate, and a
+  /// result register in the given register class.
+  unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
+                           const TargetRegisterClass *RC, unsigned Op0,
+                           bool Op0IsKill, uint64_t Imm);
+
+  /// \brief Emit a MachineInstr with one register operand and two immediate
+  /// operands.
+  unsigned fastEmitInst_rii(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC, unsigned Op0,
+                            bool Op0IsKill, uint64_t Imm1, uint64_t Imm2);
+
+  /// \brief Emit a MachineInstr with a register operand, a floating-point
+  /// immediate, and a result register in the given register class.
+  unsigned fastEmitInst_rf(unsigned MachineInstOpcode,
+                           const TargetRegisterClass *RC, unsigned Op0,
+                           bool Op0IsKill, const ConstantFP *FPImm);
+
+  /// \brief Emit a MachineInstr with two register operands, an immediate, and a
+  /// result register in the given register class.
+  unsigned fastEmitInst_rri(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC, unsigned Op0,
+                            bool Op0IsKill, unsigned Op1, bool Op1IsKill,
                             uint64_t Imm);
 
-  /// Emit a MachineInstr with two register operands, two immediates operands,
-  /// and a result register in the given register class.
-  unsigned FastEmitInst_rrii(unsigned MachineInstOpcode,
-                             const TargetRegisterClass *RC,
-                             unsigned Op0, bool Op0IsKill,
-                             unsigned Op1, bool Op1IsKill,
+  /// \brief Emit a MachineInstr with two register operands, two immediate
+  /// operands, and a result register in the given register class.
+  unsigned fastEmitInst_rrii(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC, unsigned Op0,
+                             bool Op0IsKill, unsigned Op1, bool Op1IsKill,
                              uint64_t Imm1, uint64_t Imm2);
 
-  /// Emit a MachineInstr with a single immediate operand, and a result register
-  /// in the given register class.
-  unsigned FastEmitInst_i(unsigned MachineInstrOpcode,
-                          const TargetRegisterClass *RC,
-                          uint64_t Imm);
+  /// \brief Emit a MachineInstr with a single immediate operand, and a result
+  /// register in the given register class.
+  unsigned fastEmitInst_i(unsigned MachineInstrOpcode,
+                          const TargetRegisterClass *RC, uint64_t Imm);
 
-  /// Emit a MachineInstr with a two immediate operands.
-  unsigned FastEmitInst_ii(unsigned MachineInstrOpcode,
-                          const TargetRegisterClass *RC,
-                          uint64_t Imm1, uint64_t Imm2);
+  /// \brief Emit a MachineInstr with two immediate operands.
+  unsigned fastEmitInst_ii(unsigned MachineInstrOpcode,
+                           const TargetRegisterClass *RC, uint64_t Imm1,
+                           uint64_t Imm2);
 
-  /// Emit a MachineInstr for an extract_subreg from a specified index of a
-  /// superregister to a specified type.
-  unsigned FastEmitInst_extractsubreg(MVT RetVT,
-                                      unsigned Op0, bool Op0IsKill,
+  /// \brief Emit a MachineInstr for an extract_subreg from a specified index of
+  /// a superregister to a specified type.
+  unsigned fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0, bool Op0IsKill,
                                       uint32_t Idx);
 
-  /// Emit MachineInstrs to compute the value of Op with all but the least
-  /// significant bit set to zero.
-  unsigned FastEmitZExtFromI1(MVT VT,
-                              unsigned Op0, bool Op0IsKill);
+  /// \brief Emit MachineInstrs to compute the value of Op with all but the
+  /// least significant bit set to zero.
+  unsigned fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill);
 
-  /// Emit an unconditional branch to the given block, unless it is the
+  /// \brief Emit an unconditional branch to the given block, unless it is the
   /// immediate (fall-through) successor, and update the CFG.
-  void FastEmitBranch(MachineBasicBlock *MBB, DebugLoc DL);
+  void fastEmitBranch(MachineBasicBlock *MBB, DebugLoc DL);
 
-  void UpdateValueMap(const Value* I, unsigned Reg, unsigned NumRegs = 1);
+  /// \brief Update the value map to include the new mapping for this
+  /// instruction, or insert an extra copy to get the result in a previously
+  /// determined register.
+  ///
+  /// NOTE: This is only necessary because we might select a block that uses a
+  /// value before we select the block that defines the value. It might be
+  /// possible to fix this by selecting blocks in reverse postorder.
+  void updateValueMap(const Value *I, unsigned Reg, unsigned NumRegs = 1);
 
   unsigned createResultReg(const TargetRegisterClass *RC);
 
-  /// Try to constrain Op so that it is usable by argument OpNum of the provided
-  /// MCInstrDesc. If this fails, create a new virtual register in the correct
-  /// class and COPY the value there.
+  /// \brief Try to constrain Op so that it is usable by argument OpNum of the
+  /// provided MCInstrDesc. If this fails, create a new virtual register in the
+  /// correct class and COPY the value there.
   unsigned constrainOperandRegClass(const MCInstrDesc &II, unsigned Op,
                                     unsigned OpNum);
 
-  /// Emit a constant in a register using target-specific logic, such as
+  /// \brief Emit a constant in a register using target-specific logic, such as
   /// constant pool loads.
-  virtual unsigned TargetMaterializeConstant(const Constant* C) {
-    return 0;
-  }
+  virtual unsigned fastMaterializeConstant(const Constant *C) { return 0; }
 
-  /// Emit an alloca address in a register using target-specific logic.
-  virtual unsigned TargetMaterializeAlloca(const AllocaInst* C) {
-    return 0;
-  }
+  /// \brief Emit an alloca address in a register using target-specific logic.
+  virtual unsigned fastMaterializeAlloca(const AllocaInst *C) { return 0; }
 
-  virtual unsigned TargetMaterializeFloatZero(const ConstantFP* CF) {
+  /// \brief Emit the floating-point constant +0.0 in a register using target-
+  /// specific logic.
+  virtual unsigned fastMaterializeFloatZero(const ConstantFP *CF) {
     return 0;
   }
 
@@ -375,30 +489,46 @@
   /// - \c Add has a constant operand.
   bool canFoldAddIntoGEP(const User *GEP, const Value *Add);
 
-  /// Test whether the given value has exactly one use.
-  bool hasTrivialKill(const Value *V) const;
+  /// \brief Test whether the given value has exactly one use.
+  bool hasTrivialKill(const Value *V);
 
   /// \brief Create a machine mem operand from the given instruction.
   MachineMemOperand *createMachineMemOperandFor(const Instruction *I) const;
 
+  CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) const;
+
+  bool lowerCallTo(const CallInst *CI, const char *SymName, unsigned NumArgs);
+  bool lowerCallTo(CallLoweringInfo &CLI);
+
+  bool isCommutativeIntrinsic(IntrinsicInst const *II) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::sadd_with_overflow:
+    case Intrinsic::uadd_with_overflow:
+    case Intrinsic::smul_with_overflow:
+    case Intrinsic::umul_with_overflow:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+
+  bool lowerCall(const CallInst *I);
+  /// \brief Select and emit code for a binary operator instruction, which has
+  /// an opcode which directly corresponds to the given ISD opcode.
+  bool selectBinaryOp(const User *I, unsigned ISDOpcode);
+  bool selectFNeg(const User *I);
+  bool selectGetElementPtr(const User *I);
+  bool selectStackmap(const CallInst *I);
+  bool selectPatchpoint(const CallInst *I);
+  bool selectCall(const User *Call);
+  bool selectIntrinsicCall(const IntrinsicInst *II);
+  bool selectBitCast(const User *I);
+  bool selectCast(const User *I, unsigned Opcode);
+  bool selectExtractValue(const User *I);
+  bool selectInsertValue(const User *I);
+
 private:
-  bool SelectBinaryOp(const User *I, unsigned ISDOpcode);
-
-  bool SelectFNeg(const User *I);
-
-  bool SelectGetElementPtr(const User *I);
-
-  bool SelectStackmap(const CallInst *I);
-  bool SelectCall(const User *I);
-
-  bool SelectBitCast(const User *I);
-
-  bool SelectCast(const User *I, unsigned Opcode);
-
-  bool SelectExtractValue(const User *I);
-
-  bool SelectInsertValue(const User *I);
-
   /// \brief Handle PHI nodes in successor blocks.
   ///
   /// Emit code to ensure constants are copied into registers when needed.
@@ -406,22 +536,34 @@
   /// nodes as input.  We cannot just directly add them, because expansion might
   /// result in multiple MBB's for one BB.  As such, the start of the BB might
   /// correspond to a different MBB than the end.
-  bool HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
+  bool handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
 
-  /// Helper for getRegForVale. This function is called when the value isn't
-  /// already available in a register and must be materialized with new
+  /// \brief Helper for materializeRegForValue to materialize a constant in a
+  /// target-independent way.
+  unsigned materializeConstant(const Value *V, MVT VT);
+
+  /// \brief Helper for getRegForValue. This function is called when the value
+  /// isn't already available in a register and must be materialized with new
   /// instructions.
   unsigned materializeRegForValue(const Value *V, MVT VT);
 
-  /// Clears LocalValueMap and moves the area for the new local variables to the
-  /// beginning of the block. It helps to avoid spilling cached variables across
-  /// heavy instructions like calls.
+  /// \brief Clears LocalValueMap and moves the area for the new local variables
+  /// to the beginning of the block. It helps to avoid spilling cached variables
+  /// across heavy instructions like calls.
   void flushLocalValueMap();
 
+  /// \brief Insertion point before trying to select the current instruction.
+  MachineBasicBlock::iterator SavedInsertPt;
+
+  /// \brief Add a stackmap or patchpoint intrinsic call's live variable
+  /// operands to a stackmap or patchpoint machine instruction.
   bool addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops,
                            const CallInst *CI, unsigned StartIdx);
+  bool lowerCallOperands(const CallInst *CI, unsigned ArgIdx, unsigned NumArgs,
+                         const Value *Callee, bool ForceRetVoidTy,
+                         CallLoweringInfo &CLI);
 };
 
-}
+} // end namespace llvm
 
 #endif

diff --git a/include/llvm/CodeGen/ForwardControlFlowIntegrity.h b/include/llvm/CodeGen/ForwardControlFlowIntegrity.h
new file mode 100644
index 0000000..a6232c5
--- /dev/null
+++ b/include/llvm/CodeGen/ForwardControlFlowIntegrity.h

@@ -0,0 +1,123 @@
+//===-- ForwardControlFlowIntegrity.h: Forward-Edge CFI ---------*- C++ -*-===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass instruments indirect calls with checks to ensure that these calls
+// pass through the appropriate jump-instruction table generated by
+// JumpInstrTables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_FORWARDCONTROLFLOWINTEGRITY_H
+#define LLVM_CODEGEN_FORWARDCONTROLFLOWINTEGRITY_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+
+#include <string>
+
+namespace llvm {
+
+class AnalysisUsage;
+class BasicBlock;
+class Constant;
+class Function;
+class Instruction;
+class Module;
+class Value;
+
+/// ForwardControlFlowIntegrity uses the information from JumpInstrTableInfo to
+/// prepend checks to indirect calls to make sure that these calls target valid
+/// locations.
+class ForwardControlFlowIntegrity : public ModulePass {
+public:
+  static char ID;
+
+  ForwardControlFlowIntegrity();
+  ForwardControlFlowIntegrity(JumpTable::JumpTableType JTT,
+                              CFIntegrity CFIType,
+                              bool CFIEnforcing, std::string CFIFuncName);
+  ~ForwardControlFlowIntegrity() override;
+
+  /// Runs the CFI pass on a given module. This works best if the module in
+  /// question is the result of link-time optimization (see lib/LTO).
+  bool runOnModule(Module &M) override;
+  const char *getPassName() const override {
+    return "Forward Control-Flow Integrity";
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  typedef SmallVector<Instruction *, 64> CallSet;
+
+  /// A structure that is used to keep track of constant table information.
+  struct CFIConstants {
+    Constant *StartValue;
+    Constant *MaskValue;
+    Constant *Size;
+  };
+
+  /// A map from function type to the base of the table for this type and a mask
+  /// for the table.
+  typedef DenseMap<FunctionType *, CFIConstants> CFITables;
+
+  CallSet IndirectCalls;
+
+  /// The type of jumptable implementation.
+  JumpTable::JumpTableType JTType;
+
+  /// The type of CFI check to add before each indirect call.
+  CFIntegrity CFIType;
+
+  /// A value that controls whether or not CFI violations cause a halt.
+  bool CFIEnforcing;
+
+  /// The name of the function to call in case of a CFI violation when
+  /// CFIEnforcing is false. There is a default function that ignores
+  /// violations.
+  std::string CFIFuncName;
+
+  /// The alignment of each entry in the table, from JumpInstrTableInfo. The
+  /// JumpInstrTableInfo class always makes this a power of two.
+  uint64_t ByteAlignment;
+
+  /// The base-2 logarithm of ByteAlignment, needed for some of the transforms
+  /// (like CFIntegrity::Ror).
+  unsigned LogByteAlignment;
+
+  /// Adds checks to each indirect call site to make sure that it is calling a
+  /// function in our jump table.
+  void updateIndirectCalls(Module &M, CFITables &CFIT);
+
+  /// Walks the instructions to find all the indirect calls.
+  void getIndirectCalls(Module &M);
+
+  /// Adds a function that handles violations in non-enforcing mode
+  /// (!CFIEnforcing). The default warning function simply returns, since the
+  /// exact details of how to handle CFI violations depend on the application.
+  void addWarningFunction(Module &M);
+
+  /// Rewrites a function pointer in a call/invoke instruction to force it into
+  /// a table.
+  void rewriteFunctionPointer(Module &M, Instruction *I, Value *FunPtr,
+                              Constant *JumpTableStart, Constant *JumpTableMask,
+                              Constant *JumpTableSize);
+
+  /// Inserts a check and a call to a warning function at a given instruction
+  /// that must be an indirect call.
+  void insertWarning(Module &M, BasicBlock *Block, Instruction *I,
+                     Value *FunPtr);
+};
+
+ModulePass *
+createForwardControlFlowIntegrityPass(JumpTable::JumpTableType JTT,
+                                      CFIntegrity CFIType,
+                                      bool CFIEnforcing, StringRef CFIFuncName);
+}
+
+#endif // LLVM_CODEGEN_FORWARDCONTROLFLOWINTEGRITY_H

diff --git a/include/llvm/CodeGen/FunctionLoweringInfo.h b/include/llvm/CodeGen/FunctionLoweringInfo.h
index 9636b51..91f20d0 100644
--- a/include/llvm/CodeGen/FunctionLoweringInfo.h
+++ b/include/llvm/CodeGen/FunctionLoweringInfo.h

@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -50,10 +51,10 @@
 /// function that is used when lowering a region of the function.
 ///
 class FunctionLoweringInfo {
-  const TargetMachine &TM;
 public:
   const Function *Fn;
   MachineFunction *MF;
+  const TargetLowering *TLI;
   MachineRegisterInfo *RegInfo;
   BranchProbabilityInfo *BPI;
   /// CanLowerReturn - true iff the function's return value can be lowered to
@@ -106,6 +107,10 @@
                     KnownZero(1, 0) {}
   };
 
+  /// Record the preferred extend type (ISD::SIGN_EXTEND or ISD::ZERO_EXTEND)
+  /// for a value.
+  DenseMap<const Value *, ISD::NodeType> PreferredExtendType;
+
   /// VisitedBBs - The set of basic blocks visited thus far by instruction
   /// selection.
   SmallPtrSet<const BasicBlock*, 4> VisitedBBs;
@@ -115,14 +120,13 @@
   /// TODO: This isn't per-function state, it's per-basic-block state. But
   /// there's no other convenient place for it to live right now.
   std::vector<std::pair<MachineInstr*, unsigned> > PHINodesToUpdate;
+  unsigned OrigNumPHINodesToUpdate;
 
   /// If the current MBB is a landing pad, the exception pointer and exception
   /// selector registers are copied into these virtual registers by
   /// SelectionDAGISel::PrepareEHLandingPad().
   unsigned ExceptionPointerVirtReg, ExceptionSelectorVirtReg;
 
-  explicit FunctionLoweringInfo(const TargetMachine &TM) : TM(TM) {}
-
   /// set - Initialize this FunctionLoweringInfo with the given Function
   /// and its associated MachineFunction.
   ///

diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index a8f2368..bbf0ad3 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h

@@ -472,11 +472,11 @@
     ///   5) ISD::CvtCode indicating the type of conversion to do
     CONVERT_RNDSAT,
 
-    /// FP16_TO_FP32, FP32_TO_FP16 - These operators are used to perform
-    /// promotions and truncation for half-precision (16 bit) floating
-    /// numbers. We need special nodes since FP16 is a storage-only type with
-    /// special semantics of operations.
-    FP16_TO_FP32, FP32_TO_FP16,
+    /// FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions
+    /// and truncation for half-precision (16-bit) floating-point numbers.
+    /// These nodes form a semi-softened interface for dealing with f16 (as an
+    /// i16), which is often a storage-only type but has native conversions.
+    FP16_TO_FP, FP_TO_FP16,
 
     /// FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW,
     /// FLOG, FLOG2, FLOG10, FEXP, FEXP2,
@@ -485,7 +485,8 @@
     FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW,
     FLOG, FLOG2, FLOG10, FEXP, FEXP2,
     FCEIL, FTRUNC, FRINT, FNEARBYINT, FROUND, FFLOOR,
-    
+    FMINNUM, FMAXNUM,
+
     /// FSINCOS - Compute both fsin and fcos as a single operation.
     FSINCOS,
 

diff --git a/include/llvm/CodeGen/JITCodeEmitter.h b/include/llvm/CodeGen/JITCodeEmitter.h
deleted file mode 100644
index dc2a027..0000000
--- a/include/llvm/CodeGen/JITCodeEmitter.h
+++ /dev/null

@@ -1,344 +0,0 @@
-//===-- llvm/CodeGen/JITCodeEmitter.h - Code emission ----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an abstract interface that is used by the machine code
-// emission framework to output the code.  This allows machine code emission to
-// be separated from concerns such as resolution of call targets, and where the
-// machine code will be written (memory or disk, f.e.).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_JITCODEEMITTER_H
-#define LLVM_CODEGEN_JITCODEEMITTER_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/CodeGen/MachineCodeEmitter.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/MathExtras.h"
-#include <string>
-
-namespace llvm {
-
-class MachineBasicBlock;
-class MachineConstantPool;
-class MachineJumpTableInfo;
-class MachineFunction;
-class MachineModuleInfo;
-class MachineRelocation;
-class Value;
-class GlobalValue;
-class Function;
-  
-/// JITCodeEmitter - This class defines two sorts of methods: those for
-/// emitting the actual bytes of machine code, and those for emitting auxiliary
-/// structures, such as jump tables, relocations, etc.
-///
-/// Emission of machine code is complicated by the fact that we don't (in
-/// general) know the size of the machine code that we're about to emit before
-/// we emit it.  As such, we preallocate a certain amount of memory, and set the
-/// BufferBegin/BufferEnd pointers to the start and end of the buffer.  As we
-/// emit machine instructions, we advance the CurBufferPtr to indicate the
-/// location of the next byte to emit.  In the case of a buffer overflow (we
-/// need to emit more machine code than we have allocated space for), the
-/// CurBufferPtr will saturate to BufferEnd and ignore stores.  Once the entire
-/// function has been emitted, the overflow condition is checked, and if it has
-/// occurred, more memory is allocated, and we reemit the code into it.
-/// 
-class JITCodeEmitter : public MachineCodeEmitter {
-  void anchor() override;
-public:
-  virtual ~JITCodeEmitter() {}
-
-  /// startFunction - This callback is invoked when the specified function is
-  /// about to be code generated.  This initializes the BufferBegin/End/Ptr
-  /// fields.
-  ///
-  void startFunction(MachineFunction &F) override = 0;
-
-  /// finishFunction - This callback is invoked when the specified function has
-  /// finished code generation.  If a buffer overflow has occurred, this method
-  /// returns true (the callee is required to try again), otherwise it returns
-  /// false.
-  ///
-  bool finishFunction(MachineFunction &F) override = 0;
-
-  /// allocIndirectGV - Allocates and fills storage for an indirect
-  /// GlobalValue, and returns the address.
-  virtual void *allocIndirectGV(const GlobalValue *GV,
-                                const uint8_t *Buffer, size_t Size,
-                                unsigned Alignment) = 0;
-
-  /// emitByte - This callback is invoked when a byte needs to be written to the
-  /// output stream.
-  ///
-  void emitByte(uint8_t B) {
-    if (CurBufferPtr != BufferEnd)
-      *CurBufferPtr++ = B;
-  }
-
-  /// emitWordLE - This callback is invoked when a 32-bit word needs to be
-  /// written to the output stream in little-endian format.
-  ///
-  void emitWordLE(uint32_t W) {
-    if (4 <= BufferEnd-CurBufferPtr) {
-      *CurBufferPtr++ = (uint8_t)(W >>  0);
-      *CurBufferPtr++ = (uint8_t)(W >>  8);
-      *CurBufferPtr++ = (uint8_t)(W >> 16);
-      *CurBufferPtr++ = (uint8_t)(W >> 24);
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-  
-  /// emitWordBE - This callback is invoked when a 32-bit word needs to be
-  /// written to the output stream in big-endian format.
-  ///
-  void emitWordBE(uint32_t W) {
-    if (4 <= BufferEnd-CurBufferPtr) {
-      *CurBufferPtr++ = (uint8_t)(W >> 24);
-      *CurBufferPtr++ = (uint8_t)(W >> 16);
-      *CurBufferPtr++ = (uint8_t)(W >>  8);
-      *CurBufferPtr++ = (uint8_t)(W >>  0);
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-
-  /// emitDWordLE - This callback is invoked when a 64-bit word needs to be
-  /// written to the output stream in little-endian format.
-  ///
-  void emitDWordLE(uint64_t W) {
-    if (8 <= BufferEnd-CurBufferPtr) {
-      *CurBufferPtr++ = (uint8_t)(W >>  0);
-      *CurBufferPtr++ = (uint8_t)(W >>  8);
-      *CurBufferPtr++ = (uint8_t)(W >> 16);
-      *CurBufferPtr++ = (uint8_t)(W >> 24);
-      *CurBufferPtr++ = (uint8_t)(W >> 32);
-      *CurBufferPtr++ = (uint8_t)(W >> 40);
-      *CurBufferPtr++ = (uint8_t)(W >> 48);
-      *CurBufferPtr++ = (uint8_t)(W >> 56);
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-  
-  /// emitDWordBE - This callback is invoked when a 64-bit word needs to be
-  /// written to the output stream in big-endian format.
-  ///
-  void emitDWordBE(uint64_t W) {
-    if (8 <= BufferEnd-CurBufferPtr) {
-      *CurBufferPtr++ = (uint8_t)(W >> 56);
-      *CurBufferPtr++ = (uint8_t)(W >> 48);
-      *CurBufferPtr++ = (uint8_t)(W >> 40);
-      *CurBufferPtr++ = (uint8_t)(W >> 32);
-      *CurBufferPtr++ = (uint8_t)(W >> 24);
-      *CurBufferPtr++ = (uint8_t)(W >> 16);
-      *CurBufferPtr++ = (uint8_t)(W >>  8);
-      *CurBufferPtr++ = (uint8_t)(W >>  0);
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-
-  /// emitAlignment - Move the CurBufferPtr pointer up to the specified
-  /// alignment (saturated to BufferEnd of course).
-  void emitAlignment(unsigned Alignment) {
-    if (Alignment == 0) Alignment = 1;
-    uint8_t *NewPtr = (uint8_t*)RoundUpToAlignment((uintptr_t)CurBufferPtr,
-                                                   Alignment);
-    CurBufferPtr = std::min(NewPtr, BufferEnd);
-  }
-
-  /// emitAlignmentWithFill - Similar to emitAlignment, except that the
-  /// extra bytes are filled with the provided byte.
-  void emitAlignmentWithFill(unsigned Alignment, uint8_t Fill) {
-    if (Alignment == 0) Alignment = 1;
-    uint8_t *NewPtr = (uint8_t*)RoundUpToAlignment((uintptr_t)CurBufferPtr,
-                                                   Alignment);
-    // Fail if we don't have room.
-    if (NewPtr > BufferEnd) {
-      CurBufferPtr = BufferEnd;
-      return;
-    }
-    while (CurBufferPtr < NewPtr) {
-      *CurBufferPtr++ = Fill;
-    }
-  }
-
-  /// emitULEB128Bytes - This callback is invoked when a ULEB128 needs to be
-  /// written to the output stream.
-  void emitULEB128Bytes(uint64_t Value, unsigned PadTo = 0) {
-    do {
-      uint8_t Byte = Value & 0x7f;
-      Value >>= 7;
-      if (Value || PadTo != 0) Byte |= 0x80;
-      emitByte(Byte);
-    } while (Value);
-
-    if (PadTo) {
-      do {
-        uint8_t Byte = (PadTo > 1) ? 0x80 : 0x0;
-        emitByte(Byte);
-      } while (--PadTo);
-    }
-  }
-  
-  /// emitSLEB128Bytes - This callback is invoked when a SLEB128 needs to be
-  /// written to the output stream.
-  void emitSLEB128Bytes(int64_t Value) {
-    int32_t Sign = Value >> (8 * sizeof(Value) - 1);
-    bool IsMore;
-  
-    do {
-      uint8_t Byte = Value & 0x7f;
-      Value >>= 7;
-      IsMore = Value != Sign || ((Byte ^ Sign) & 0x40) != 0;
-      if (IsMore) Byte |= 0x80;
-      emitByte(Byte);
-    } while (IsMore);
-  }
-
-  /// emitString - This callback is invoked when a String needs to be
-  /// written to the output stream.
-  void emitString(const std::string &String) {
-    for (size_t i = 0, N = String.size(); i < N; ++i) {
-      uint8_t C = String[i];
-      emitByte(C);
-    }
-    emitByte(0);
-  }
-  
-  /// emitInt32 - Emit a int32 directive.
-  void emitInt32(uint32_t Value) {
-    if (4 <= BufferEnd-CurBufferPtr) {
-      *((uint32_t*)CurBufferPtr) = Value;
-      CurBufferPtr += 4;
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-
-  /// emitInt64 - Emit a int64 directive.
-  void emitInt64(uint64_t Value) {
-    if (8 <= BufferEnd-CurBufferPtr) {
-      *((uint64_t*)CurBufferPtr) = Value;
-      CurBufferPtr += 8;
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-  
-  /// emitInt32At - Emit the Int32 Value in Addr.
-  void emitInt32At(uintptr_t *Addr, uintptr_t Value) {
-    if (Addr >= (uintptr_t*)BufferBegin && Addr < (uintptr_t*)BufferEnd)
-      (*(uint32_t*)Addr) = (uint32_t)Value;
-  }
-  
-  /// emitInt64At - Emit the Int64 Value in Addr.
-  void emitInt64At(uintptr_t *Addr, uintptr_t Value) {
-    if (Addr >= (uintptr_t*)BufferBegin && Addr < (uintptr_t*)BufferEnd)
-      (*(uint64_t*)Addr) = (uint64_t)Value;
-  }
-  
-  
-  /// emitLabel - Emits a label
-  void emitLabel(MCSymbol *Label) override = 0;
-
-  /// allocateSpace - Allocate a block of space in the current output buffer,
-  /// returning null (and setting conditions to indicate buffer overflow) on
-  /// failure.  Alignment is the alignment in bytes of the buffer desired.
-  void *allocateSpace(uintptr_t Size, unsigned Alignment) override {
-    emitAlignment(Alignment);
-    void *Result;
-    
-    // Check for buffer overflow.
-    if (Size >= (uintptr_t)(BufferEnd-CurBufferPtr)) {
-      CurBufferPtr = BufferEnd;
-      Result = nullptr;
-    } else {
-      // Allocate the space.
-      Result = CurBufferPtr;
-      CurBufferPtr += Size;
-    }
-    
-    return Result;
-  }
-
-  /// allocateGlobal - Allocate memory for a global.  Unlike allocateSpace,
-  /// this method does not allocate memory in the current output buffer,
-  /// because a global may live longer than the current function.
-  virtual void *allocateGlobal(uintptr_t Size, unsigned Alignment) = 0;
-
-  /// StartMachineBasicBlock - This should be called by the target when a new
-  /// basic block is about to be emitted.  This way the MCE knows where the
-  /// start of the block is, and can implement getMachineBasicBlockAddress.
-  void StartMachineBasicBlock(MachineBasicBlock *MBB) override = 0;
-
-  /// getCurrentPCValue - This returns the address that the next emitted byte
-  /// will be output to.
-  ///
-  uintptr_t getCurrentPCValue() const override {
-    return (uintptr_t)CurBufferPtr;
-  }
-
-  /// getCurrentPCOffset - Return the offset from the start of the emitted
-  /// buffer that we are currently writing to.
-  uintptr_t getCurrentPCOffset() const override {
-    return CurBufferPtr-BufferBegin;
-  }
-
-  /// earlyResolveAddresses - True if the code emitter can use symbol addresses 
-  /// during code emission time. The JIT is capable of doing this because it
-  /// creates jump tables or constant pools in memory on the fly while the
-  /// object code emitters rely on a linker to have real addresses and should
-  /// use relocations instead.
-  bool earlyResolveAddresses() const override { return true; }
-
-  /// addRelocation - Whenever a relocatable address is needed, it should be
-  /// noted with this interface.
-  void addRelocation(const MachineRelocation &MR) override = 0;
-
-  /// FIXME: These should all be handled with relocations!
-  
-  /// getConstantPoolEntryAddress - Return the address of the 'Index' entry in
-  /// the constant pool that was last emitted with the emitConstantPool method.
-  ///
-  uintptr_t getConstantPoolEntryAddress(unsigned Index) const override = 0;
-
-  /// getJumpTableEntryAddress - Return the address of the jump table with index
-  /// 'Index' in the function that last called initJumpTableInfo.
-  ///
-  uintptr_t getJumpTableEntryAddress(unsigned Index) const override = 0;
-
-  /// getMachineBasicBlockAddress - Return the address of the specified
-  /// MachineBasicBlock, only usable after the label for the MBB has been
-  /// emitted.
-  ///
-  uintptr_t
-    getMachineBasicBlockAddress(MachineBasicBlock *MBB) const override = 0;
-
-  /// getLabelAddress - Return the address of the specified Label, only usable
-  /// after the Label has been emitted.
-  ///
-  uintptr_t getLabelAddress(MCSymbol *Label) const override = 0;
-
-  /// Specifies the MachineModuleInfo object. This is used for exception handling
-  /// purposes.
-  void setModuleInfo(MachineModuleInfo* Info) override = 0;
-
-  /// getLabelLocations - Return the label locations map of the label IDs to
-  /// their address.
-  virtual DenseMap<MCSymbol*, uintptr_t> *getLabelLocations() {
-    return nullptr;
-  }
-};
-
-} // End llvm namespace
-
-#endif

diff --git a/include/llvm/CodeGen/JumpInstrTables.h b/include/llvm/CodeGen/JumpInstrTables.h
index 6ca3d7d..005bc1e 100644
--- a/include/llvm/CodeGen/JumpInstrTables.h
+++ b/include/llvm/CodeGen/JumpInstrTables.h

@@ -39,13 +39,14 @@
 ///   jmp f_orig@PLT
 /// \endverbatim
 ///
-/// Support for an architecture depends on two functions in TargetInstrInfo:
-/// getUnconditionalBranch, and getTrap. AsmPrinter uses these to generate the
-/// appropriate instructions for the jump statement (an unconditional branch)
-/// and for padding to make the table have a size that is a power of two. This
-/// padding uses a trap instruction to ensure that calls to this area halt the
-/// program. The default implementations of these functions call
-/// llvm_unreachable.
+/// Support for an architecture depends on three functions in TargetInstrInfo:
+/// getUnconditionalBranch, getTrap, and getJumpInstrTableEntryBound. AsmPrinter
+/// uses these to generate the appropriate instructions for the jump statement
+/// (an unconditional branch) and for padding to make the table have a size that
+/// is a power of two. This padding uses a trap instruction to ensure that calls
+/// to this area halt the program. The default implementations of these
+/// functions call llvm_unreachable, except for getJumpInstrTableEntryBound,
+/// which returns 0 by default.
 class JumpInstrTables : public ModulePass {
 public:
   static char ID;
@@ -64,6 +65,14 @@
   /// Checks to see if there is already a table for the given FunctionType.
   bool hasTable(FunctionType *FunTy);
 
+  /// Maps the function into a subset of function types, depending on the
+  /// jump-instruction table style selected from JumpTableTypes in
+  /// JumpInstrTables.cpp. The choice of mapping determines the number of
+  /// jump-instruction tables generated by this pass. E.g., the simplest mapping
+  /// converts every function type into void f(); so, all functions end up in a
+  /// single table.
+  static FunctionType *transformType(JumpTable::JumpTableType JTT,
+                                     FunctionType *FunTy);
 private:
   /// The metadata used while a jump table is being built
   struct TableMeta {
@@ -76,14 +85,6 @@
 
   typedef DenseMap<FunctionType *, struct TableMeta> JumpMap;
 
-  /// Maps the function into a subset of function types, depending on the
-  /// jump-instruction table style selected from JumpTableTypes in
-  /// JumpInstrTables.cpp. The choice of mapping determines the number of
-  /// jump-instruction tables generated by this pass. E.g., the simplest mapping
-  /// converts every function type into void f(); so, all functions end up in a
-  /// single table.
-  FunctionType *transformType(FunctionType *FunTy);
-
   /// The current state of functions and jump entries in the table(s).
   JumpMap Metadata;
 

diff --git a/include/llvm/CodeGen/LexicalScopes.h b/include/llvm/CodeGen/LexicalScopes.h
index 036aea3..021fd98 100644
--- a/include/llvm/CodeGen/LexicalScopes.h
+++ b/include/llvm/CodeGen/LexicalScopes.h

@@ -148,12 +148,6 @@
   /// empty - Return true if there is any lexical scope information available.
   bool empty() { return CurrentFnLexicalScope == nullptr; }
 
-  /// isCurrentFunctionScope - Return true if given lexical scope represents
-  /// current function.
-  bool isCurrentFunctionScope(const LexicalScope *LS) {
-    return LS == CurrentFnLexicalScope;
-  }
-
   /// getCurrentFunctionScope - Return lexical scope for the current function.
   LexicalScope *getCurrentFunctionScope() const {
     return CurrentFnLexicalScope;
@@ -163,7 +157,7 @@
   /// which have machine instructions that belong to lexical scope identified by
   /// DebugLoc.
   void getMachineBasicBlocks(DebugLoc DL,
-                             SmallPtrSet<const MachineBasicBlock *, 4> &MBBs);
+                             SmallPtrSetImpl<const MachineBasicBlock *> &MBBs);
 
   /// dominates - Return true if DebugLoc's lexical scope dominates at least one
   /// machine instruction's lexical scope in a given machine basic block.

diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index 176665b..f9bd317 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h

@@ -17,8 +17,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_LIVEINTERVAL_ANALYSIS_H
-#define LLVM_CODEGEN_LIVEINTERVAL_ANALYSIS_H
+#ifndef LLVM_CODEGEN_LIVEINTERVALANALYSIS_H
+#define LLVM_CODEGEN_LIVEINTERVALANALYSIS_H
 
 #include "llvm/ADT/IndexedMap.h"
 #include "llvm/ADT/SmallVector.h"
@@ -50,7 +50,6 @@
   class LiveIntervals : public MachineFunctionPass {
     MachineFunction* MF;
     MachineRegisterInfo* MRI;
-    const TargetMachine* TM;
     const TargetRegisterInfo* TRI;
     const TargetInstrInfo* TII;
     AliasAnalysis *AA;

diff --git a/include/llvm/CodeGen/LivePhysRegs.h b/include/llvm/CodeGen/LivePhysRegs.h
index 847092b..91e4ddc 100644
--- a/include/llvm/CodeGen/LivePhysRegs.h
+++ b/include/llvm/CodeGen/LivePhysRegs.h

@@ -26,8 +26,8 @@
 // %XMM0<def> = ..., %YMM0<imp-use> (%YMM0 and all its sub-registers are alive)
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_LIVE_PHYS_REGS_H
-#define LLVM_CODEGEN_LIVE_PHYS_REGS_H
+#ifndef LLVM_CODEGEN_LIVEPHYSREGS_H
+#define LLVM_CODEGEN_LIVEPHYSREGS_H
 
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -143,4 +143,4 @@
 
 } // namespace llvm
 
-#endif // LLVM_CODEGEN_LIVE_PHYS_REGS_H
+#endif

diff --git a/include/llvm/CodeGen/LiveRangeEdit.h b/include/llvm/CodeGen/LiveRangeEdit.h
index 5767cab..44c3c4e 100644
--- a/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/include/llvm/CodeGen/LiveRangeEdit.h

@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 namespace llvm {
 
@@ -111,18 +112,15 @@
   /// @param vrm Map of virtual registers to physical registers for this
   ///            function.  If NULL, no virtual register map updates will
   ///            be done.  This could be the case if called before Regalloc.
-  LiveRangeEdit(LiveInterval *parent,
-                SmallVectorImpl<unsigned> &newRegs,
-                MachineFunction &MF,
-                LiveIntervals &lis,
-                VirtRegMap *vrm,
+  LiveRangeEdit(LiveInterval *parent, SmallVectorImpl<unsigned> &newRegs,
+                MachineFunction &MF, LiveIntervals &lis, VirtRegMap *vrm,
                 Delegate *delegate = nullptr)
-    : Parent(parent), NewRegs(newRegs),
-      MRI(MF.getRegInfo()), LIS(lis), VRM(vrm),
-      TII(*MF.getTarget().getInstrInfo()),
-      TheDelegate(delegate),
-      FirstNew(newRegs.size()),
-      ScannedRemattable(false) { MRI.setDelegate(this); }
+      : Parent(parent), NewRegs(newRegs), MRI(MF.getRegInfo()), LIS(lis),
+        VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()),
+        TheDelegate(delegate), FirstNew(newRegs.size()),
+        ScannedRemattable(false) {
+    MRI.setDelegate(this);
+  }
 
   ~LiveRangeEdit() { MRI.resetDelegate(this); }
 

diff --git a/include/llvm/CodeGen/LiveVariables.h b/include/llvm/CodeGen/LiveVariables.h
index a4a5fcc..55b97dc 100644
--- a/include/llvm/CodeGen/LiveVariables.h
+++ b/include/llvm/CodeGen/LiveVariables.h

@@ -134,14 +134,14 @@
   // PhysRegInfo - Keep track of which instruction was the last def of a
   // physical register. This is a purely local property, because all physical
   // register references are presumed dead across basic blocks.
-  MachineInstr **PhysRegDef;
+  std::vector<MachineInstr *> PhysRegDef;
 
   // PhysRegInfo - Keep track of which instruction was the last use of a
   // physical register. This is a purely local property, because all physical
   // register references are presumed dead across basic blocks.
-  MachineInstr **PhysRegUse;
+  std::vector<MachineInstr *> PhysRegUse;
 
-  SmallVector<unsigned, 4> *PHIVarInfo;
+  std::vector<SmallVector<unsigned, 4>> PHIVarInfo;
 
   // DistanceMap - Keep track the distance of a MI from the start of the
   // current basic block.
@@ -175,6 +175,10 @@
   /// register which is used in a PHI node. We map that to the BB the vreg
   /// is coming from.
   void analyzePHINodes(const MachineFunction& Fn);
+
+  void runOnInstr(MachineInstr *MI, SmallVectorImpl<unsigned> &Defs);
+
+  void runOnBlock(MachineBasicBlock *MBB, unsigned NumRegs);
 public:
 
   bool runOnMachineFunction(MachineFunction &MF) override;

diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index a08cc2e..1440b96 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h

@@ -486,11 +486,15 @@
   /// Insert a range of instructions into the instruction list before I.
   template<typename IT>
   void insert(iterator I, IT S, IT E) {
+    assert((I == end() || I->getParent() == this) &&
+           "iterator points outside of basic block");
     Insts.insert(I.getInstrIterator(), S, E);
   }
 
   /// Insert MI into the instruction list before I.
   iterator insert(iterator I, MachineInstr *MI) {
+    assert((I == end() || I->getParent() == this) &&
+           "iterator points outside of basic block");
     assert(!MI->isBundledWithPred() && !MI->isBundledWithSucc() &&
            "Cannot insert instruction with bundle flags");
     return Insts.insert(I.getInstrIterator(), MI);
@@ -498,6 +502,8 @@
 
   /// Insert MI into the instruction list after I.
   iterator insertAfter(iterator I, MachineInstr *MI) {
+    assert((I == end() || I->getParent() == this) &&
+           "iterator points outside of basic block");
     assert(!MI->isBundledWithPred() && !MI->isBundledWithSucc() &&
            "Cannot insert instruction with bundle flags");
     return Insts.insertAfter(I.getInstrIterator(), MI);

diff --git a/include/llvm/CodeGen/MachineCodeEmitter.h b/include/llvm/CodeGen/MachineCodeEmitter.h
deleted file mode 100644
index 81b0ba1..0000000
--- a/include/llvm/CodeGen/MachineCodeEmitter.h
+++ /dev/null

@@ -1,334 +0,0 @@
-//===-- llvm/CodeGen/MachineCodeEmitter.h - Code emission -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an abstract interface that is used by the machine code
-// emission framework to output the code.  This allows machine code emission to
-// be separated from concerns such as resolution of call targets, and where the
-// machine code will be written (memory or disk, f.e.).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_MACHINECODEEMITTER_H
-#define LLVM_CODEGEN_MACHINECODEEMITTER_H
-
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/Support/DataTypes.h"
-#include <string>
-
-namespace llvm {
-
-class MachineBasicBlock;
-class MachineConstantPool;
-class MachineJumpTableInfo;
-class MachineFunction;
-class MachineModuleInfo;
-class MachineRelocation;
-class Value;
-class GlobalValue;
-class Function;
-class MCSymbol;
-
-/// MachineCodeEmitter - This class defines two sorts of methods: those for
-/// emitting the actual bytes of machine code, and those for emitting auxiliary
-/// structures, such as jump tables, relocations, etc.
-///
-/// Emission of machine code is complicated by the fact that we don't (in
-/// general) know the size of the machine code that we're about to emit before
-/// we emit it.  As such, we preallocate a certain amount of memory, and set the
-/// BufferBegin/BufferEnd pointers to the start and end of the buffer.  As we
-/// emit machine instructions, we advance the CurBufferPtr to indicate the
-/// location of the next byte to emit.  In the case of a buffer overflow (we
-/// need to emit more machine code than we have allocated space for), the
-/// CurBufferPtr will saturate to BufferEnd and ignore stores.  Once the entire
-/// function has been emitted, the overflow condition is checked, and if it has
-/// occurred, more memory is allocated, and we reemit the code into it.
-/// 
-class MachineCodeEmitter {
-  virtual void anchor();
-protected:
-  /// BufferBegin/BufferEnd - Pointers to the start and end of the memory
-  /// allocated for this code buffer.
-  uint8_t *BufferBegin, *BufferEnd;
-  /// CurBufferPtr - Pointer to the next byte of memory to fill when emitting
-  /// code.  This is guaranteed to be in the range [BufferBegin,BufferEnd].  If
-  /// this pointer is at BufferEnd, it will never move due to code emission, and
-  /// all code emission requests will be ignored (this is the buffer overflow
-  /// condition).
-  uint8_t *CurBufferPtr;
-
-public:
-  virtual ~MachineCodeEmitter() {}
-
-  /// startFunction - This callback is invoked when the specified function is
-  /// about to be code generated.  This initializes the BufferBegin/End/Ptr
-  /// fields.
-  ///
-  virtual void startFunction(MachineFunction &F) = 0;
-
-  /// finishFunction - This callback is invoked when the specified function has
-  /// finished code generation.  If a buffer overflow has occurred, this method
-  /// returns true (the callee is required to try again), otherwise it returns
-  /// false.
-  ///
-  virtual bool finishFunction(MachineFunction &F) = 0;
-
-  /// emitByte - This callback is invoked when a byte needs to be written to the
-  /// output stream.
-  ///
-  void emitByte(uint8_t B) {
-    if (CurBufferPtr != BufferEnd)
-      *CurBufferPtr++ = B;
-  }
-
-  /// emitWordLE - This callback is invoked when a 32-bit word needs to be
-  /// written to the output stream in little-endian format.
-  ///
-  void emitWordLE(uint32_t W) {
-    if (4 <= BufferEnd-CurBufferPtr) {
-      emitWordLEInto(CurBufferPtr, W);
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-
-  /// emitWordLEInto - This callback is invoked when a 32-bit word needs to be
-  /// written to an arbitrary buffer in little-endian format.  Buf must have at
-  /// least 4 bytes of available space.
-  ///
-  static void emitWordLEInto(uint8_t *&Buf, uint32_t W) {
-    *Buf++ = (uint8_t)(W >>  0);
-    *Buf++ = (uint8_t)(W >>  8);
-    *Buf++ = (uint8_t)(W >> 16);
-    *Buf++ = (uint8_t)(W >> 24);
-  }
-
-  /// emitWordBE - This callback is invoked when a 32-bit word needs to be
-  /// written to the output stream in big-endian format.
-  ///
-  void emitWordBE(uint32_t W) {
-    if (4 <= BufferEnd-CurBufferPtr) {
-      *CurBufferPtr++ = (uint8_t)(W >> 24);
-      *CurBufferPtr++ = (uint8_t)(W >> 16);
-      *CurBufferPtr++ = (uint8_t)(W >>  8);
-      *CurBufferPtr++ = (uint8_t)(W >>  0);
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-
-  /// emitDWordLE - This callback is invoked when a 64-bit word needs to be
-  /// written to the output stream in little-endian format.
-  ///
-  void emitDWordLE(uint64_t W) {
-    if (8 <= BufferEnd-CurBufferPtr) {
-      *CurBufferPtr++ = (uint8_t)(W >>  0);
-      *CurBufferPtr++ = (uint8_t)(W >>  8);
-      *CurBufferPtr++ = (uint8_t)(W >> 16);
-      *CurBufferPtr++ = (uint8_t)(W >> 24);
-      *CurBufferPtr++ = (uint8_t)(W >> 32);
-      *CurBufferPtr++ = (uint8_t)(W >> 40);
-      *CurBufferPtr++ = (uint8_t)(W >> 48);
-      *CurBufferPtr++ = (uint8_t)(W >> 56);
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-  
-  /// emitDWordBE - This callback is invoked when a 64-bit word needs to be
-  /// written to the output stream in big-endian format.
-  ///
-  void emitDWordBE(uint64_t W) {
-    if (8 <= BufferEnd-CurBufferPtr) {
-      *CurBufferPtr++ = (uint8_t)(W >> 56);
-      *CurBufferPtr++ = (uint8_t)(W >> 48);
-      *CurBufferPtr++ = (uint8_t)(W >> 40);
-      *CurBufferPtr++ = (uint8_t)(W >> 32);
-      *CurBufferPtr++ = (uint8_t)(W >> 24);
-      *CurBufferPtr++ = (uint8_t)(W >> 16);
-      *CurBufferPtr++ = (uint8_t)(W >>  8);
-      *CurBufferPtr++ = (uint8_t)(W >>  0);
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-
-  /// emitAlignment - Move the CurBufferPtr pointer up to the specified
-  /// alignment (saturated to BufferEnd of course).
-  void emitAlignment(unsigned Alignment) {
-    if (Alignment == 0) Alignment = 1;
-
-    if(Alignment <= (uintptr_t)(BufferEnd-CurBufferPtr)) {
-      // Move the current buffer ptr up to the specified alignment.
-      CurBufferPtr =
-        (uint8_t*)(((uintptr_t)CurBufferPtr+Alignment-1) &
-                   ~(uintptr_t)(Alignment-1));
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-  
-
-  /// emitULEB128Bytes - This callback is invoked when a ULEB128 needs to be
-  /// written to the output stream.
-  void emitULEB128Bytes(uint64_t Value) {
-    do {
-      uint8_t Byte = Value & 0x7f;
-      Value >>= 7;
-      if (Value) Byte |= 0x80;
-      emitByte(Byte);
-    } while (Value);
-  }
-  
-  /// emitSLEB128Bytes - This callback is invoked when a SLEB128 needs to be
-  /// written to the output stream.
-  void emitSLEB128Bytes(uint64_t Value) {
-    uint64_t Sign = Value >> (8 * sizeof(Value) - 1);
-    bool IsMore;
-  
-    do {
-      uint8_t Byte = Value & 0x7f;
-      Value >>= 7;
-      IsMore = Value != Sign || ((Byte ^ Sign) & 0x40) != 0;
-      if (IsMore) Byte |= 0x80;
-      emitByte(Byte);
-    } while (IsMore);
-  }
-
-  /// emitString - This callback is invoked when a String needs to be
-  /// written to the output stream.
-  void emitString(const std::string &String) {
-    for (unsigned i = 0, N = static_cast<unsigned>(String.size());
-         i < N; ++i) {
-      uint8_t C = String[i];
-      emitByte(C);
-    }
-    emitByte(0);
-  }
-  
-  /// emitInt32 - Emit a int32 directive.
-  void emitInt32(int32_t Value) {
-    if (4 <= BufferEnd-CurBufferPtr) {
-      *((uint32_t*)CurBufferPtr) = Value;
-      CurBufferPtr += 4;
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-
-  /// emitInt64 - Emit a int64 directive.
-  void emitInt64(uint64_t Value) {
-    if (8 <= BufferEnd-CurBufferPtr) {
-      *((uint64_t*)CurBufferPtr) = Value;
-      CurBufferPtr += 8;
-    } else {
-      CurBufferPtr = BufferEnd;
-    }
-  }
-  
-  /// emitInt32At - Emit the Int32 Value in Addr.
-  void emitInt32At(uintptr_t *Addr, uintptr_t Value) {
-    if (Addr >= (uintptr_t*)BufferBegin && Addr < (uintptr_t*)BufferEnd)
-      (*(uint32_t*)Addr) = (uint32_t)Value;
-  }
-  
-  /// emitInt64At - Emit the Int64 Value in Addr.
-  void emitInt64At(uintptr_t *Addr, uintptr_t Value) {
-    if (Addr >= (uintptr_t*)BufferBegin && Addr < (uintptr_t*)BufferEnd)
-      (*(uint64_t*)Addr) = (uint64_t)Value;
-  }
-  
-  /// processDebugLoc - Records debug location information about a
-  /// MachineInstruction.  This is called before emitting any bytes associated
-  /// with the instruction.  Even if successive instructions have the same debug
-  /// location, this method will be called for each one.
-  virtual void processDebugLoc(DebugLoc DL, bool BeforePrintintInsn) {}
-
-  /// emitLabel - Emits a label
-  virtual void emitLabel(MCSymbol *Label) = 0;
-
-  /// allocateSpace - Allocate a block of space in the current output buffer,
-  /// returning null (and setting conditions to indicate buffer overflow) on
-  /// failure.  Alignment is the alignment in bytes of the buffer desired.
-  virtual void *allocateSpace(uintptr_t Size, unsigned Alignment) {
-    emitAlignment(Alignment);
-    void *Result;
-    
-    // Check for buffer overflow.
-    if (Size >= (uintptr_t)(BufferEnd-CurBufferPtr)) {
-      CurBufferPtr = BufferEnd;
-      Result = nullptr;
-    } else {
-      // Allocate the space.
-      Result = CurBufferPtr;
-      CurBufferPtr += Size;
-    }
-    
-    return Result;
-  }
-
-  /// StartMachineBasicBlock - This should be called by the target when a new
-  /// basic block is about to be emitted.  This way the MCE knows where the
-  /// start of the block is, and can implement getMachineBasicBlockAddress.
-  virtual void StartMachineBasicBlock(MachineBasicBlock *MBB) = 0;
-  
-  /// getCurrentPCValue - This returns the address that the next emitted byte
-  /// will be output to.
-  ///
-  virtual uintptr_t getCurrentPCValue() const {
-    return (uintptr_t)CurBufferPtr;
-  }
-
-  /// getCurrentPCOffset - Return the offset from the start of the emitted
-  /// buffer that we are currently writing to.
-  virtual uintptr_t getCurrentPCOffset() const {
-    return CurBufferPtr-BufferBegin;
-  }
-
-  /// earlyResolveAddresses - True if the code emitter can use symbol addresses 
-  /// during code emission time. The JIT is capable of doing this because it
-  /// creates jump tables or constant pools in memory on the fly while the
-  /// object code emitters rely on a linker to have real addresses and should
-  /// use relocations instead.
-  virtual bool earlyResolveAddresses() const = 0;
-
-  /// addRelocation - Whenever a relocatable address is needed, it should be
-  /// noted with this interface.
-  virtual void addRelocation(const MachineRelocation &MR) = 0;
-  
-  /// FIXME: These should all be handled with relocations!
-  
-  /// getConstantPoolEntryAddress - Return the address of the 'Index' entry in
-  /// the constant pool that was last emitted with the emitConstantPool method.
-  ///
-  virtual uintptr_t getConstantPoolEntryAddress(unsigned Index) const = 0;
-
-  /// getJumpTableEntryAddress - Return the address of the jump table with index
-  /// 'Index' in the function that last called initJumpTableInfo.
-  ///
-  virtual uintptr_t getJumpTableEntryAddress(unsigned Index) const = 0;
-  
-  /// getMachineBasicBlockAddress - Return the address of the specified
-  /// MachineBasicBlock, only usable after the label for the MBB has been
-  /// emitted.
-  ///
-  virtual uintptr_t getMachineBasicBlockAddress(MachineBasicBlock *MBB) const= 0;
-
-  /// getLabelAddress - Return the address of the specified Label, only usable
-  /// after the LabelID has been emitted.
-  ///
-  virtual uintptr_t getLabelAddress(MCSymbol *Label) const = 0;
-  
-  /// Specifies the MachineModuleInfo object. This is used for exception handling
-  /// purposes.
-  virtual void setModuleInfo(MachineModuleInfo* Info) = 0;
-};
-
-} // End llvm namespace
-
-#endif

diff --git a/include/llvm/CodeGen/MachineCodeInfo.h b/include/llvm/CodeGen/MachineCodeInfo.h
deleted file mode 100644
index 820bc87..0000000
--- a/include/llvm/CodeGen/MachineCodeInfo.h
+++ /dev/null

@@ -1,53 +0,0 @@
-//===-- MachineCodeInfo.h - Class used to report JIT info -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines MachineCodeInfo, a class used by the JIT ExecutionEngine
-// to report information about the generated machine code.
-//
-// See JIT::runJITOnFunction for usage.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_MACHINECODEINFO_H
-#define LLVM_CODEGEN_MACHINECODEINFO_H
-
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-
-class MachineCodeInfo {
-private:
-  size_t Size;   // Number of bytes in memory used
-  void *Address; // The address of the function in memory
-
-public:
-  MachineCodeInfo() : Size(0), Address(nullptr) {}
-
-  void setSize(size_t s) {
-    Size = s;
-  }
-
-  void setAddress(void *a) {
-    Address = a;
-  }
-
-  size_t size() const {
-    return Size;
-  }
-
-  void *address() const {
-    return Address;
-  }
-
-};
-
-}
-
-#endif
-

diff --git a/include/llvm/CodeGen/MachineCombinerPattern.h b/include/llvm/CodeGen/MachineCombinerPattern.h
new file mode 100644
index 0000000..176af14
--- /dev/null
+++ b/include/llvm/CodeGen/MachineCombinerPattern.h

@@ -0,0 +1,29 @@
+//===-- llvm/CodeGen/MachineCombinerPattern.h - Instruction pattern supported by
+// combiner  ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the instruction patterns supported by the machine combiner.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINECOMBINERPATTERN_H
+#define LLVM_CODEGEN_MACHINECOMBINERPATTERN_H
+
+namespace llvm {
+
+/// Enumeration of the instruction patterns supported by the machine combiner.
+///
+///
+namespace MachineCombinerPattern {
+// Forward declaration
+enum MC_PATTERN : int;
+} // end namespace MachineCombinerPattern
+} // end namespace llvm
+
+#endif

diff --git a/include/llvm/CodeGen/MachineConstantPool.h b/include/llvm/CodeGen/MachineConstantPool.h
index 912ce89..c619afb 100644
--- a/include/llvm/CodeGen/MachineConstantPool.h
+++ b/include/llvm/CodeGen/MachineConstantPool.h

@@ -17,6 +17,7 @@
 #define LLVM_CODEGEN_MACHINECONSTANTPOOL_H
 
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/MC/SectionKind.h"
 #include <cassert>
 #include <climits>
 #include <vector>
@@ -119,6 +120,8 @@
   ///     them.
   ///  2: This entry may have arbitrary relocations. 
   unsigned getRelocationInfo() const;
+
+  SectionKind getSectionKind(const DataLayout *DL) const;
 };
   
 /// The MachineConstantPool class keeps track of constants referenced by a

diff --git a/include/llvm/CodeGen/MachineDominanceFrontier.h b/include/llvm/CodeGen/MachineDominanceFrontier.h
new file mode 100644
index 0000000..e099e71
--- /dev/null
+++ b/include/llvm/CodeGen/MachineDominanceFrontier.h

@@ -0,0 +1,109 @@
+//===- llvm/CodeGen/MachineDominanceFrontier.h ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEDOMINANCEFRONTIER_H
+#define LLVM_CODEGEN_MACHINEDOMINANCEFRONTIER_H
+
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+
+namespace llvm {
+
+class MachineDominanceFrontier : public MachineFunctionPass {
+  ForwardDominanceFrontierBase<MachineBasicBlock> Base;
+public:
+  typedef DominatorTreeBase<MachineBasicBlock> DomTreeT;
+  typedef DomTreeNodeBase<MachineBasicBlock> DomTreeNodeT;
+  typedef DominanceFrontierBase<MachineBasicBlock>::DomSetType DomSetType;
+  typedef DominanceFrontierBase<MachineBasicBlock>::iterator iterator;
+  typedef DominanceFrontierBase<MachineBasicBlock>::const_iterator const_iterator;
+
+  void operator=(const MachineDominanceFrontier &) LLVM_DELETED_FUNCTION;
+  MachineDominanceFrontier(const MachineDominanceFrontier &) LLVM_DELETED_FUNCTION;
+
+  static char ID;
+
+  MachineDominanceFrontier();
+
+  DominanceFrontierBase<MachineBasicBlock> &getBase() {
+    return Base;
+  }
+
+  inline const std::vector<MachineBasicBlock*> &getRoots() const {
+    return Base.getRoots();
+  }
+
+  MachineBasicBlock *getRoot() const {
+    return Base.getRoot();
+  }
+
+  bool isPostDominator() const {
+    return Base.isPostDominator();
+  }
+
+  iterator begin() {
+    return Base.begin();
+  }
+
+  const_iterator begin() const {
+    return Base.begin();
+  }
+
+  iterator end() {
+    return Base.end();
+  }
+
+  const_iterator end() const {
+    return Base.end();
+  }
+
+  iterator find(MachineBasicBlock *B) {
+    return Base.find(B);
+  }
+
+  const_iterator find(MachineBasicBlock *B) const {
+    return Base.find(B);
+  }
+
+  iterator addBasicBlock(MachineBasicBlock *BB, const DomSetType &frontier) {
+    return Base.addBasicBlock(BB, frontier);
+  }
+
+  void removeBlock(MachineBasicBlock *BB) {
+    return Base.removeBlock(BB);
+  }
+
+  void addToFrontier(iterator I, MachineBasicBlock *Node) {
+    return Base.addToFrontier(I, Node);
+  }
+
+  void removeFromFrontier(iterator I, MachineBasicBlock *Node) {
+    return Base.removeFromFrontier(I, Node);
+  }
+
+  bool compareDomSet(DomSetType &DS1, const DomSetType &DS2) const {
+    return Base.compareDomSet(DS1, DS2);
+  }
+
+  bool compare(DominanceFrontierBase<MachineBasicBlock> &Other) const {
+    return Base.compare(Other);
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  void releaseMemory() override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+}
+
+#endif

diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h
index f1ae0bf..a6980a6 100644
--- a/include/llvm/CodeGen/MachineDominators.h
+++ b/include/llvm/CodeGen/MachineDominators.h

@@ -15,6 +15,7 @@
 #ifndef LLVM_CODEGEN_MACHINEDOMINATORS_H
 #define LLVM_CODEGEN_MACHINEDOMINATORS_H
 
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -38,6 +39,103 @@
 /// compute a normal dominator tree.
 ///
 class MachineDominatorTree : public MachineFunctionPass {
+  /// \brief Helper structure used to hold all the basic blocks
+  /// involved in the split of a critical edge.
+  struct CriticalEdge {
+    MachineBasicBlock *FromBB;
+    MachineBasicBlock *ToBB;
+    MachineBasicBlock *NewBB;
+    CriticalEdge(MachineBasicBlock *FromBB, MachineBasicBlock *ToBB,
+                 MachineBasicBlock *NewBB)
+        : FromBB(FromBB), ToBB(ToBB), NewBB(NewBB) {}
+  };
+
+  /// \brief Pile up all the critical edges to be split.
+  /// The splitting of a critical edge is local and thus, it is possible
+  /// to apply several of those changes at the same time.
+  mutable SmallVector<CriticalEdge, 32> CriticalEdgesToSplit;
+  /// \brief Remember all the basic blocks that are inserted during
+  /// edge splitting.
+  /// Invariant: NewBBs == all the basic blocks contained in the NewBB
+  /// field of all the elements of CriticalEdgesToSplit.
+  /// I.e., for all elt in CriticalEdgesToSplit, there exists a BB in NewBBs
+  /// such that BB == elt.NewBB.
+  mutable SmallSet<MachineBasicBlock *, 32> NewBBs;
+
+  /// \brief Apply all the recorded critical edges to the DT.
+  /// This updates the underlying DT information in a way that uses
+  /// the fast query path of DT as much as possible.
+  ///
+  /// \post CriticalEdgesToSplit.empty().
+  void applySplitCriticalEdges() const {
+    // Bail out early if there is nothing to do.
+    if (CriticalEdgesToSplit.empty())
+      return;
+
+    // For each element in CriticalEdgesToSplit, remember whether or
+    // not the element is the new immediate dominator of its successor.
+    // The mapping is done by index, i.e., the information for the ith
+    // element of CriticalEdgesToSplit is the ith element of IsNewIDom.
+    SmallVector<bool, 32> IsNewIDom;
+    IsNewIDom.resize(CriticalEdgesToSplit.size());
+    size_t Idx = 0;
+
+    // Collect all the dominance properties info, before invalidating
+    // the underlying DT.
+    for (CriticalEdge &Edge : CriticalEdgesToSplit) {
+      // Update dominator information.
+      MachineBasicBlock *Succ = Edge.ToBB;
+      MachineDomTreeNode *SucccDTNode = DT->getNode(Succ);
+
+      IsNewIDom[Idx] = true;
+      for (MachineBasicBlock *PredBB : Succ->predecessors()) {
+        if (PredBB == Edge.NewBB)
+          continue;
+        // If we are in this situation:
+        // FromBB1        FromBB2
+        //    +              +
+        //   + +            + +
+        //  +   +          +   +
+        // ...  Split1  Split2 ...
+        //           +   +
+        //            + +
+        //             +
+        //            Succ
+        // Instead of checking the dominance property with Split2, we
+        // check it with FromBB2 since Split2 is still unknown to the
+        // underlying DT structure.
+        if (NewBBs.count(PredBB)) {
+          assert(PredBB->pred_size() == 1 && "A basic block resulting from a "
+                                             "critical edge split has more "
+                                             "than one predecessor!");
+          PredBB = *PredBB->pred_begin();
+        }
+        if (!DT->dominates(SucccDTNode, DT->getNode(PredBB))) {
+          IsNewIDom[Idx] = false;
+          break;
+        }
+      }
+      ++Idx;
+    }
+
+    // Now, update DT with the collected dominance properties info.
+    Idx = 0;
+    for (CriticalEdge &Edge : CriticalEdgesToSplit) {
+      // We know FromBB dominates NewBB.
+      MachineDomTreeNode *NewDTNode = DT->addNewBlock(Edge.NewBB, Edge.FromBB);
+      MachineDomTreeNode *SucccDTNode = DT->getNode(Edge.ToBB);
+
+      // If all the other predecessors of "Succ" are dominated by "Succ" itself
+      // then the new block is the new immediate dominator of "Succ". Otherwise,
+      // the new block doesn't dominate anything.
+      if (IsNewIDom[Idx])
+        DT->changeImmediateDominator(SucccDTNode, NewDTNode);
+      ++Idx;
+    }
+    NewBBs.clear();
+    CriticalEdgesToSplit.clear();
+  }
+
 public:
   static char ID; // Pass ID, replacement for typeid
   DominatorTreeBase<MachineBasicBlock>* DT;
@@ -46,7 +144,10 @@
 
   ~MachineDominatorTree();
 
-  DominatorTreeBase<MachineBasicBlock>& getBase() { return *DT; }
+  DominatorTreeBase<MachineBasicBlock> &getBase() {
+    applySplitCriticalEdges();
+    return *DT;
+  }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 
@@ -55,14 +156,17 @@
   /// dominators, this will always be a single block (the entry node).
   ///
   inline const std::vector<MachineBasicBlock*> &getRoots() const {
+    applySplitCriticalEdges();
     return DT->getRoots();
   }
 
   inline MachineBasicBlock *getRoot() const {
+    applySplitCriticalEdges();
     return DT->getRoot();
   }
 
   inline MachineDomTreeNode *getRootNode() const {
+    applySplitCriticalEdges();
     return DT->getRootNode();
   }
 
@@ -70,17 +174,20 @@
 
   inline bool dominates(const MachineDomTreeNode* A,
                         const MachineDomTreeNode* B) const {
+    applySplitCriticalEdges();
     return DT->dominates(A, B);
   }
 
   inline bool dominates(const MachineBasicBlock* A,
                         const MachineBasicBlock* B) const {
+    applySplitCriticalEdges();
     return DT->dominates(A, B);
   }
 
   // dominates - Return true if A dominates B. This performs the
   // special checks necessary if A and B are in the same basic block.
   bool dominates(const MachineInstr *A, const MachineInstr *B) const {
+    applySplitCriticalEdges();
     const MachineBasicBlock *BBA = A->getParent(), *BBB = B->getParent();
     if (BBA != BBB) return DT->dominates(BBA, BBB);
 
@@ -100,11 +207,13 @@
 
   inline bool properlyDominates(const MachineDomTreeNode* A,
                                 const MachineDomTreeNode* B) const {
+    applySplitCriticalEdges();
     return DT->properlyDominates(A, B);
   }
 
   inline bool properlyDominates(const MachineBasicBlock* A,
                                 const MachineBasicBlock* B) const {
+    applySplitCriticalEdges();
     return DT->properlyDominates(A, B);
   }
 
@@ -112,10 +221,12 @@
   /// for basic block A and B. If there is no such block then return NULL.
   inline MachineBasicBlock *findNearestCommonDominator(MachineBasicBlock *A,
                                                        MachineBasicBlock *B) {
+    applySplitCriticalEdges();
     return DT->findNearestCommonDominator(A, B);
   }
 
   inline MachineDomTreeNode *operator[](MachineBasicBlock *BB) const {
+    applySplitCriticalEdges();
     return DT->getNode(BB);
   }
 
@@ -123,6 +234,7 @@
   /// block.  This is the same as using operator[] on this class.
   ///
   inline MachineDomTreeNode *getNode(MachineBasicBlock *BB) const {
+    applySplitCriticalEdges();
     return DT->getNode(BB);
   }
 
@@ -131,6 +243,7 @@
   /// the children list of the immediate dominator.
   inline MachineDomTreeNode *addNewBlock(MachineBasicBlock *BB,
                                          MachineBasicBlock *DomBB) {
+    applySplitCriticalEdges();
     return DT->addNewBlock(BB, DomBB);
   }
 
@@ -139,11 +252,13 @@
   ///
   inline void changeImmediateDominator(MachineBasicBlock *N,
                                        MachineBasicBlock* NewIDom) {
+    applySplitCriticalEdges();
     DT->changeImmediateDominator(N, NewIDom);
   }
 
   inline void changeImmediateDominator(MachineDomTreeNode *N,
                                        MachineDomTreeNode* NewIDom) {
+    applySplitCriticalEdges();
     DT->changeImmediateDominator(N, NewIDom);
   }
 
@@ -151,24 +266,49 @@
   /// dominate any other blocks. Removes node from its immediate dominator's
   /// children list. Deletes dominator node associated with basic block BB.
   inline void eraseNode(MachineBasicBlock *BB) {
+    applySplitCriticalEdges();
     DT->eraseNode(BB);
   }
 
   /// splitBlock - BB is split and now it has one successor. Update dominator
   /// tree to reflect this change.
   inline void splitBlock(MachineBasicBlock* NewBB) {
+    applySplitCriticalEdges();
     DT->splitBlock(NewBB);
   }
 
   /// isReachableFromEntry - Return true if A is dominated by the entry
   /// block of the function containing it.
   bool isReachableFromEntry(const MachineBasicBlock *A) {
+    applySplitCriticalEdges();
     return DT->isReachableFromEntry(A);
   }
 
   void releaseMemory() override;
 
   void print(raw_ostream &OS, const Module*) const override;
+
+  /// \brief Record that the critical edge (FromBB, ToBB) has been
+  /// split with NewBB.
+  /// It is best to use this method instead of directly updating the
+  /// underlying information, because this helps reduce the number
+  /// of times the DT information is invalidated.
+  ///
+  /// \note Do not use this method with regular edges.
+  ///
+  /// \note To benefit from the compile-time improvement provided by this
+  /// method, users have to limit the queries to the DT interface between
+  /// two edge splits. In other words, they have to batch the splitting of
+  /// critical edges as much as possible.
+  void recordSplitCriticalEdge(MachineBasicBlock *FromBB,
+                              MachineBasicBlock *ToBB,
+                              MachineBasicBlock *NewBB) {
+    bool Inserted = NewBBs.insert(NewBB).second;
+    (void)Inserted;
+    assert(Inserted &&
+           "A basic block inserted via edge splitting cannot appear twice");
+    CriticalEdgesToSplit.push_back(CriticalEdge(FromBB, ToBB, NewBB));
+  }
 };
 
 //===-------------------------------------

diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index c51f8fe..1e7fee6 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h

@@ -109,13 +109,23 @@
     // block and doesn't need additional handling for allocation beyond that.
     bool PreAllocated;
 
+    // If true, an LLVM IR value might point to this object.
+    // Normally, spill slots and fixed-offset objects don't alias IR-accessible
+    // objects, but there are exceptions (on PowerPC, for example, some byval
+    // arguments have ABI-prescribed offsets).
+    bool isAliased;
+
     StackObject(uint64_t Sz, unsigned Al, int64_t SP, bool IM,
-                bool isSS, const AllocaInst *Val)
+                bool isSS, const AllocaInst *Val, bool A)
       : SPOffset(SP), Size(Sz), Alignment(Al), isImmutable(IM),
-        isSpillSlot(isSS), Alloca(Val), PreAllocated(false) {}
+        isSpillSlot(isSS), Alloca(Val), PreAllocated(false), isAliased(A) {}
   };
 
-  const TargetMachine &TM;
+  /// StackAlignment - The alignment of the stack.
+  unsigned StackAlignment;
+
+  /// StackRealignable - Can the stack be realigned.
+  bool StackRealignable;
 
   /// Objects - The list of stack objects allocated...
   ///
@@ -230,10 +240,17 @@
   /// pointer.
   bool HasInlineAsmWithSPAdjust;
 
-  const TargetFrameLowering *getFrameLowering() const;
+  /// True if the function contains a call to the llvm.va_start intrinsic.
+  bool HasVAStart;
+
+  /// True if this is a varargs function that contains a musttail call.
+  bool HasMustTailInVarArgFunc;
+
 public:
-    explicit MachineFrameInfo(const TargetMachine &TM, bool RealignOpt)
-    : TM(TM), RealignOption(RealignOpt) {
+  explicit MachineFrameInfo(unsigned StackAlign, bool isStackRealign,
+                            bool RealignOpt)
+      : StackAlignment(StackAlign), StackRealignable(isStackRealign),
+        RealignOption(RealignOpt) {
     StackSize = NumFixedObjects = OffsetAdjustment = MaxAlignment = 0;
     HasVarSizedObjects = false;
     FrameAddressTaken = false;
@@ -250,6 +267,8 @@
     LocalFrameMaxAlign = 0;
     UseLocalStackAllocationBlock = false;
     HasInlineAsmWithSPAdjust = false;
+    HasVAStart = false;
+    HasMustTailInVarArgFunc = false;
   }
 
   /// hasStackObjects - Return true if there are any stack objects in this
@@ -469,6 +488,14 @@
   bool hasInlineAsmWithSPAdjust() const { return HasInlineAsmWithSPAdjust; }
   void setHasInlineAsmWithSPAdjust(bool B) { HasInlineAsmWithSPAdjust = B; }
 
+  /// Returns true if the function calls the llvm.va_start intrinsic.
+  bool hasVAStart() const { return HasVAStart; }
+  void setHasVAStart(bool B) { HasVAStart = B; }
+
+  /// Returns true if the function is variadic and contains a musttail call.
+  bool hasMustTailInVarArgFunc() const { return HasMustTailInVarArgFunc; }
+  void setHasMustTailInVarArgFunc(bool B) { HasMustTailInVarArgFunc = B; }
+
   /// getMaxCallFrameSize - Return the maximum size of a call frame that must be
   /// allocated for an outgoing function call.  This is only available if
   /// CallFrameSetup/Destroy pseudo instructions are used by the target, and
@@ -479,10 +506,11 @@
 
   /// CreateFixedObject - Create a new object at a fixed location on the stack.
   /// All fixed objects should be created before other objects are created for
-  /// efficiency. By default, fixed objects are immutable. This returns an
-  /// index with a negative value.
+  /// efficiency. By default, fixed objects are not pointed to by LLVM IR
+  /// values. This returns an index with a negative value.
   ///
-  int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool Immutable);
+  int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool Immutable,
+                        bool isAliased = false);
 
   /// CreateFixedSpillStackObject - Create a spill slot at a fixed location
   /// on the stack.  Returns an index with a negative value.
@@ -494,6 +522,14 @@
     return ObjectIdx < 0 && (ObjectIdx >= -(int)NumFixedObjects);
   }
 
+  /// isAliasedObjectIndex - Returns true if the specified index corresponds
+  /// to an object that might be pointed to by an LLVM IR value.
+  bool isAliasedObjectIndex(int ObjectIdx) const {
+    assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
+           "Invalid Object Idx!");
+    return Objects[ObjectIdx+NumFixedObjects].isAliased;
+  }
+
   /// isImmutableObjectIndex - Returns true if the specified index corresponds
   /// to an immutable object.
   bool isImmutableObjectIndex(int ObjectIdx) const {

diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index f4c2542..3271410 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h

@@ -21,6 +21,7 @@
 #include "llvm/ADT/ilist.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/ArrayRecycler.h"
 #include "llvm/Support/Recycler.h"
@@ -38,6 +39,7 @@
 class MCContext;
 class Pass;
 class TargetMachine;
+class TargetSubtargetInfo;
 class TargetRegisterClass;
 struct MachinePointerInfo;
 
@@ -75,10 +77,10 @@
 class MachineFunction {
   const Function *Fn;
   const TargetMachine &Target;
+  const TargetSubtargetInfo *STI;
   MCContext &Ctx;
   MachineModuleInfo &MMI;
-  GCModuleInfo *GMI;
-  
+
   // RegInfo - Information about each register in use in the function.
   MachineRegisterInfo *RegInfo;
 
@@ -138,12 +140,10 @@
   void operator=(const MachineFunction&) LLVM_DELETED_FUNCTION;
 public:
   MachineFunction(const Function *Fn, const TargetMachine &TM,
-                  unsigned FunctionNum, MachineModuleInfo &MMI,
-                  GCModuleInfo* GMI);
+                  unsigned FunctionNum, MachineModuleInfo &MMI);
   ~MachineFunction();
 
   MachineModuleInfo &getMMI() const { return MMI; }
-  GCModuleInfo *getGMI() const { return GMI; }
   MCContext &getContext() const { return Ctx; }
 
   /// getFunction - Return the LLVM function that this machine code represents
@@ -162,6 +162,11 @@
   ///
   const TargetMachine &getTarget() const { return Target; }
 
+  /// getSubtarget - Return the subtarget for which this machine code is being
+  /// compiled.
+  const TargetSubtargetInfo &getSubtarget() const { return *STI; }
+  void setSubtarget(const TargetSubtargetInfo *ST) { STI = ST; }
+
   /// getRegInfo - Return information about the registers currently in use.
   ///
   MachineRegisterInfo &getRegInfo() { return *RegInfo; }
@@ -227,19 +232,14 @@
   void setHasInlineAsm(bool B) {
     HasInlineAsm = B;
   }
-  
+
   /// getInfo - Keep track of various per-function pieces of information for
   /// backends that would like to do so.
   ///
   template<typename Ty>
   Ty *getInfo() {
-    if (!MFInfo) {
-        // This should be just `new (Allocator.Allocate<Ty>()) Ty(*this)', but
-        // that apparently breaks GCC 3.3.
-        Ty *Loc = static_cast<Ty*>(Allocator.Allocate(sizeof(Ty),
-                                                      AlignOf<Ty>::Alignment));
-        MFInfo = new (Loc) Ty(*this);
-    }
+    if (!MFInfo)
+      MFInfo = new (Allocator.Allocate<Ty>()) Ty(*this);
     return static_cast<Ty*>(MFInfo);
   }
 
@@ -404,7 +404,7 @@
   MachineMemOperand *getMachineMemOperand(MachinePointerInfo PtrInfo,
                                           unsigned f, uint64_t s,
                                           unsigned base_alignment,
-                                          const MDNode *TBAAInfo = nullptr,
+                                          const AAMDNodes &AAInfo = AAMDNodes(),
                                           const MDNode *Ranges = nullptr);
   
   /// getMachineMemOperand - Allocate a new MachineMemOperand by copying

diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 1e2db7c..d20b45b 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h

@@ -244,12 +244,22 @@
   ///
   DebugLoc getDebugLoc() const { return debugLoc; }
 
-  /// getDebugVariable() - Return the debug variable referenced by
+  /// \brief Return the debug variable referenced by
   /// this DBG_VALUE instruction.
   DIVariable getDebugVariable() const {
     assert(isDebugValue() && "not a DBG_VALUE");
-    const MDNode *Var = getOperand(getNumOperands() - 1).getMetadata();
-    return DIVariable(Var);
+    DIVariable Var(getOperand(2).getMetadata());
+    assert(Var.Verify() && "not a DIVariable");
+    return Var;
+  }
+
+  /// \brief Return the complex address expression referenced by
+  /// this DBG_VALUE instruction.
+  DIExpression getDebugExpression() const {
+    assert(isDebugValue() && "not a DBG_VALUE");
+    DIExpression Expr(getOperand(3).getMetadata());
+    assert(Expr.Verify() && "not a DIExpression");
+    return Expr;
   }
 
   /// emitError - Emit an error referring to the source location of this
@@ -510,6 +520,49 @@
     return hasProperty(MCID::FoldableAsLoad, Type);
   }
 
+  /// \brief Return true if this instruction behaves
+  /// the same way as the generic REG_SEQUENCE instructions.
+  /// E.g., on ARM,
+  /// dX = VMOVDRR rY, rZ
+  /// is equivalent to
+  /// dX = REG_SEQUENCE rY, ssub_0, rZ, ssub_1.
+  ///
+  /// Note that for the optimizers to be able to take advantage of
+  /// this property, TargetInstrInfo::getRegSequenceLikeInputs has to be
+  /// overridden accordingly.
+  bool isRegSequenceLike(QueryType Type = IgnoreBundle) const {
+    return hasProperty(MCID::RegSequence, Type);
+  }
+
+  /// \brief Return true if this instruction behaves
+  /// the same way as the generic EXTRACT_SUBREG instructions.
+  /// E.g., on ARM,
+  /// rX, rY = VMOVRRD dZ
+  /// is equivalent to two EXTRACT_SUBREG:
+  /// rX = EXTRACT_SUBREG dZ, ssub_0
+  /// rY = EXTRACT_SUBREG dZ, ssub_1
+  ///
+  /// Note that for the optimizers to be able to take advantage of
+  /// this property, TargetInstrInfo::getExtractSubregLikeInputs has to be
+  /// overridden accordingly.
+  bool isExtractSubregLike(QueryType Type = IgnoreBundle) const {
+    return hasProperty(MCID::ExtractSubreg, Type);
+  }
+
+  /// \brief Return true if this instruction behaves
+  /// the same way as the generic INSERT_SUBREG instructions.
+  /// E.g., on ARM,
+  /// dX = VSETLNi32 dY, rZ, Imm
+  /// is equivalent to an INSERT_SUBREG:
+  /// dX = INSERT_SUBREG dY, rZ, translateImmToSubIdx(Imm)
+  ///
+  /// Note that for the optimizers to be able to take advantage of
+  /// this property, TargetInstrInfo::getInsertSubregLikeInputs has to be
+  /// overridden accordingly.
+  bool isInsertSubregLike(QueryType Type = IgnoreBundle) const {
+    return hasProperty(MCID::InsertSubreg, Type);
+  }
+
   //===--------------------------------------------------------------------===//
   // Side Effect Analysis
   //===--------------------------------------------------------------------===//
@@ -671,6 +724,12 @@
   /// eraseFromBundle() to erase individual bundled instructions.
   void eraseFromParent();
 
+  /// Unlink 'this' from the containing basic block and delete it.
+  ///
+  /// For all definitions mark their uses in DBG_VALUE nodes
+  /// as undefined. Otherwise like eraseFromParent().
+  void eraseFromParentAndMarkDBGValuesForRemoval();
+
   /// Unlink 'this' form its basic block and delete it.
   ///
   /// If the instruction is part of a bundle, the other instructions in the

diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h
index 21a482c..8859b6a 100644
--- a/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/include/llvm/CodeGen/MachineInstrBuilder.h

@@ -58,6 +58,10 @@
   MachineInstr *operator->() const { return MI; }
   operator MachineBasicBlock::iterator() const { return MI; }
 
+  /// If conversion operators fail, use this method to get the MachineInstr
+  /// explicitly.
+  MachineInstr *getInstr() const { return MI; }
+
   /// addReg - Add a new virtual register operand...
   ///
   const
@@ -170,6 +174,8 @@
 
   const MachineInstrBuilder &addMetadata(const MDNode *MD) const {
     MI->addOperand(*MF, MachineOperand::CreateMetadata(MD));
+    assert((MI->isDebugValue() ? MI->getDebugVariable().Verify() : true) &&
+           "first MDNode argument of a DBG_VALUE not a DIVariable");
     return *this;
   }
 
@@ -345,24 +351,25 @@
 /// address.  The convention is that a DBG_VALUE is indirect iff the
 /// second operand is an immediate.
 ///
-inline MachineInstrBuilder BuildMI(MachineFunction &MF,
-                                   DebugLoc DL,
-                                   const MCInstrDesc &MCID,
-                                   bool IsIndirect,
-                                   unsigned Reg,
-                                   unsigned Offset,
-                                   const MDNode *MD) {
+inline MachineInstrBuilder BuildMI(MachineFunction &MF, DebugLoc DL,
+                                   const MCInstrDesc &MCID, bool IsIndirect,
+                                   unsigned Reg, unsigned Offset,
+                                   const MDNode *Variable, const MDNode *Expr) {
+  assert(DIVariable(Variable).Verify() && "not a DIVariable");
+  assert(DIExpression(Expr).Verify() && "not a DIExpression");
   if (IsIndirect)
     return BuildMI(MF, DL, MCID)
-      .addReg(Reg, RegState::Debug)
-      .addImm(Offset)
-      .addMetadata(MD);
+        .addReg(Reg, RegState::Debug)
+        .addImm(Offset)
+        .addMetadata(Variable)
+        .addMetadata(Expr);
   else {
     assert(Offset == 0 && "A direct address cannot have an offset.");
     return BuildMI(MF, DL, MCID)
-      .addReg(Reg, RegState::Debug)
-      .addReg(0U, RegState::Debug)
-      .addMetadata(MD);
+        .addReg(Reg, RegState::Debug)
+        .addReg(0U, RegState::Debug)
+        .addMetadata(Variable)
+        .addMetadata(Expr);
   }
 }
 
@@ -371,15 +378,15 @@
 /// address and inserts it at position I.
 ///
 inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB,
-                                   MachineBasicBlock::iterator I,
-                                   DebugLoc DL,
-                                   const MCInstrDesc &MCID,
-                                   bool IsIndirect,
-                                   unsigned Reg,
-                                   unsigned Offset,
-                                   const MDNode *MD) {
+                                   MachineBasicBlock::iterator I, DebugLoc DL,
+                                   const MCInstrDesc &MCID, bool IsIndirect,
+                                   unsigned Reg, unsigned Offset,
+                                   const MDNode *Variable, const MDNode *Expr) {
+  assert(DIVariable(Variable).Verify() && "not a DIVariable");
+  assert(DIExpression(Expr).Verify() && "not a DIExpression");
   MachineFunction &MF = *BB.getParent();
-  MachineInstr *MI = BuildMI(MF, DL, MCID, IsIndirect, Reg, Offset, MD);
+  MachineInstr *MI =
+      BuildMI(MF, DL, MCID, IsIndirect, Reg, Offset, Variable, Expr);
   BB.insert(I, MI);
   return MachineInstrBuilder(MF, MI);
 }

diff --git a/include/llvm/CodeGen/MachineMemOperand.h b/include/llvm/CodeGen/MachineMemOperand.h
index 2532c16..eb5086c 100644
--- a/include/llvm/CodeGen/MachineMemOperand.h
+++ b/include/llvm/CodeGen/MachineMemOperand.h

@@ -18,6 +18,7 @@
 
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Value.h"  // PointerLikeTypeTraits<Value*>
 #include "llvm/Support/DataTypes.h"
 
@@ -91,7 +92,7 @@
   MachinePointerInfo PtrInfo;
   uint64_t Size;
   unsigned Flags;
-  const MDNode *TBAAInfo;
+  AAMDNodes AAInfo;
   const MDNode *Ranges;
 
 public:
@@ -117,7 +118,8 @@
   /// MachineMemOperand - Construct an MachineMemOperand object with the
   /// specified PtrInfo, flags, size, and base alignment.
   MachineMemOperand(MachinePointerInfo PtrInfo, unsigned flags, uint64_t s,
-                    unsigned base_alignment, const MDNode *TBAAInfo = nullptr,
+                    unsigned base_alignment,
+                    const AAMDNodes &AAInfo = AAMDNodes(),
                     const MDNode *Ranges = nullptr);
 
   const MachinePointerInfo &getPointerInfo() const { return PtrInfo; }
@@ -161,8 +163,8 @@
   /// base address, without the offset.
   uint64_t getBaseAlignment() const { return (1u << (Flags >> MOMaxBits)) >> 1; }
 
-  /// getTBAAInfo - Return the TBAA tag for the memory reference.
-  const MDNode *getTBAAInfo() const { return TBAAInfo; }
+  /// getAAInfo - Return the AA tags for the memory reference.
+  AAMDNodes getAAInfo() const { return AAInfo; }
 
   /// getRanges - Return the range tag for the memory reference.
   const MDNode *getRanges() const { return Ranges; }

diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h
index 6d8d056..6653333 100644
--- a/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/include/llvm/CodeGen/MachineModuleInfo.h

@@ -110,10 +110,6 @@
   /// by debug and exception handling consumers.
   std::vector<MCCFIInstruction> FrameInstructions;
 
-  /// CompactUnwindEncoding - If the target supports it, this is the compact
-  /// unwind encoding. It replaces a function's CIE and FDE.
-  uint32_t CompactUnwindEncoding;
-
   /// LandingPads - List of LandingPadInfo describing the landing pad
   /// information in the current function.
   std::vector<LandingPadInfo> LandingPads;
@@ -131,7 +127,7 @@
   unsigned CurCallSite;
 
   /// TypeInfos - List of C++ TypeInfo used in the current function.
-  std::vector<const GlobalVariable *> TypeInfos;
+  std::vector<const GlobalValue *> TypeInfos;
 
   /// FilterIds - List of typeids encoding filters used in the current function.
   std::vector<unsigned> FilterIds;
@@ -170,6 +166,7 @@
 
   struct VariableDbgInfo {
     TrackingVH<MDNode> Var;
+    TrackingVH<MDNode> Expr;
     unsigned Slot;
     DebugLoc Loc;
   };
@@ -247,15 +244,6 @@
     return FrameInstructions.size() - 1;
   }
 
-  /// getCompactUnwindEncoding - Returns the compact unwind encoding for a
-  /// function if the target supports the encoding. This encoding replaces a
-  /// function's CIE and FDE.
-  uint32_t getCompactUnwindEncoding() const { return CompactUnwindEncoding; }
-
-  /// setCompactUnwindEncoding - Set the compact unwind encoding for a function
-  /// if the target supports the encoding.
-  void setCompactUnwindEncoding(uint32_t Enc) { CompactUnwindEncoding = Enc; }
-
   /// getAddrLabelSymbol - Return the symbol to be used for the specified basic
   /// block when its address is taken.  This cannot be its normal LBB label
   /// because the block may be accessed outside its containing function.
@@ -313,12 +301,12 @@
   /// addCatchTypeInfo - Provide the catch typeinfo for a landing pad.
   ///
   void addCatchTypeInfo(MachineBasicBlock *LandingPad,
-                        ArrayRef<const GlobalVariable *> TyInfo);
+                        ArrayRef<const GlobalValue *> TyInfo);
 
   /// addFilterTypeInfo - Provide the filter typeinfo for a landing pad.
   ///
   void addFilterTypeInfo(MachineBasicBlock *LandingPad,
-                         ArrayRef<const GlobalVariable *> TyInfo);
+                         ArrayRef<const GlobalValue *> TyInfo);
 
   /// addCleanup - Add a cleanup action for a landing pad.
   ///
@@ -326,7 +314,7 @@
 
   /// getTypeIDFor - Return the type id for the specified typeinfo.  This is
   /// function wide.
-  unsigned getTypeIDFor(const GlobalVariable *TI);
+  unsigned getTypeIDFor(const GlobalValue *TI);
 
   /// getFilterIDFor - Return the id of the filter encoded by TyIds.  This is
   /// function wide.
@@ -387,7 +375,7 @@
 
   /// getTypeInfos - Return a reference to the C++ typeinfo for the current
   /// function.
-  const std::vector<const GlobalVariable *> &getTypeInfos() const {
+  const std::vector<const GlobalValue *> &getTypeInfos() const {
     return TypeInfos;
   }
 
@@ -403,8 +391,9 @@
 
   /// setVariableDbgInfo - Collect information used to emit debugging
   /// information of a variable.
-  void setVariableDbgInfo(MDNode *N, unsigned Slot, DebugLoc Loc) {
-    VariableDbgInfo Info = { N, Slot, Loc };
+  void setVariableDbgInfo(MDNode *Var, MDNode *Expr, unsigned Slot,
+                          DebugLoc Loc) {
+    VariableDbgInfo Info = {Var, Expr, Slot, Loc};
     VariableDbgInfos.push_back(std::move(Info));
   }
 

diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index 22969bc8..eed1e57 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h

@@ -506,6 +506,11 @@
     Contents.ImmVal = immVal;
   }
 
+  void setFPImm(const ConstantFP *CFP) {
+    assert(isFPImm() && "Wrong MachineOperand mutator");
+    Contents.CFP = CFP;
+  }
+
   void setOffset(int64_t Offset) {
     assert((isGlobal() || isSymbol() || isCPI() || isTargetIndex() ||
             isBlockAddress()) && "Wrong MachineOperand accessor");
@@ -544,6 +549,11 @@
   /// the setImm method should be used.
   void ChangeToImmediate(int64_t ImmVal);
 
+  /// ChangeToFPImmediate - Replace this operand with a new FP immediate operand
+  /// of the specified value.  If an operand is known to be an FP immediate
+  /// already, the setFPImm method should be used.
+  void ChangeToFPImmediate(const ConstantFP *FPImm);
+
   /// ChangeToRegister - Replace this operand with a new register operand of
   /// the specified value.  If an operand is known to be an register already,
   /// the setReg method should be used.
@@ -702,6 +712,8 @@
   friend class MachineInstr;
   friend class MachineRegisterInfo;
 private:
+  void removeRegFromUses();
+
   //===--------------------------------------------------------------------===//
   // Methods for handling register use/def lists.
   //===--------------------------------------------------------------------===//

diff --git a/include/llvm/CodeGen/MachinePostDominators.h b/include/llvm/CodeGen/MachinePostDominators.h
index beb2c4f..aab5c40 100644
--- a/include/llvm/CodeGen/MachinePostDominators.h
+++ b/include/llvm/CodeGen/MachinePostDominators.h

@@ -22,7 +22,7 @@
 
 ///
 /// PostDominatorTree Class - Concrete subclass of DominatorTree that is used
-/// to compute the a post-dominator tree.
+/// to compute the post-dominator tree.
 ///
 struct MachinePostDominatorTree : public MachineFunctionPass {
 private:

diff --git a/include/llvm/CodeGen/MachineRegionInfo.h b/include/llvm/CodeGen/MachineRegionInfo.h
new file mode 100644
index 0000000..43499db
--- /dev/null
+++ b/include/llvm/CodeGen/MachineRegionInfo.h

@@ -0,0 +1,183 @@
+//===- llvm/CodeGen/MachineRegionInfo.h -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEREGIONINFO_H
+#define LLVM_CODEGEN_MACHINEREGIONINFO_H
+
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+
+namespace llvm {
+
+class MachineDominatorTree;
+struct MachinePostDominatorTree;
+class MachineRegion;
+class MachineRegionNode;
+class MachineRegionInfo;
+
+template<>
+struct RegionTraits<MachineFunction> {
+  typedef MachineFunction FuncT;
+  typedef MachineBasicBlock BlockT;
+  typedef MachineRegion RegionT;
+  typedef MachineRegionNode RegionNodeT;
+  typedef MachineRegionInfo RegionInfoT;
+  typedef MachineDominatorTree DomTreeT;
+  typedef MachineDomTreeNode DomTreeNodeT;
+  typedef MachinePostDominatorTree PostDomTreeT;
+  typedef MachineDominanceFrontier DomFrontierT;
+  typedef MachineInstr InstT;
+  typedef MachineLoop LoopT;
+  typedef MachineLoopInfo LoopInfoT;
+
+  static unsigned getNumSuccessors(MachineBasicBlock *BB) {
+    return BB->succ_size();
+  }
+};
+
+
+class MachineRegionNode : public RegionNodeBase<RegionTraits<MachineFunction>> {
+public:
+  inline MachineRegionNode(MachineRegion *Parent,
+                           MachineBasicBlock *Entry,
+                           bool isSubRegion = false)
+    : RegionNodeBase<RegionTraits<MachineFunction>>(Parent, Entry, isSubRegion) {
+
+  }
+
+  ~MachineRegionNode() { }
+
+  bool operator==(const MachineRegion &RN) const {
+    return this == reinterpret_cast<const MachineRegionNode*>(&RN);
+  }
+};
+
+class MachineRegion : public RegionBase<RegionTraits<MachineFunction>> {
+public:
+  MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit,
+                MachineRegionInfo* RI,
+                MachineDominatorTree *DT, MachineRegion *Parent = nullptr);
+  ~MachineRegion();
+
+  bool operator==(const MachineRegionNode &RN) const {
+    return &RN == reinterpret_cast<const MachineRegionNode*>(this);
+  }
+};
+
+class MachineRegionInfo : public RegionInfoBase<RegionTraits<MachineFunction>> {
+public:
+  explicit MachineRegionInfo();
+
+  virtual ~MachineRegionInfo();
+
+  // updateStatistics - Update statistics about created regions.
+  void updateStatistics(MachineRegion *R) final;
+
+  void recalculate(MachineFunction &F,
+                   MachineDominatorTree *DT,
+                   MachinePostDominatorTree *PDT,
+                   MachineDominanceFrontier *DF);
+};
+
+class MachineRegionInfoPass : public MachineFunctionPass {
+  MachineRegionInfo RI;
+
+public:
+  static char ID;
+  explicit MachineRegionInfoPass();
+
+  ~MachineRegionInfoPass();
+
+  MachineRegionInfo &getRegionInfo() {
+    return RI;
+  }
+
+  const MachineRegionInfo &getRegionInfo() const {
+    return RI;
+  }
+
+  /// @name MachineFunctionPass interface
+  //@{
+  bool runOnMachineFunction(MachineFunction &F) override;
+  void releaseMemory() override;
+  void verifyAnalysis() const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void print(raw_ostream &OS, const Module *) const override;
+  void dump() const;
+  //@}
+};
+
+
+template <>
+template <>
+inline MachineBasicBlock* RegionNodeBase<RegionTraits<MachineFunction>>::getNodeAs<MachineBasicBlock>() const {
+  assert(!isSubRegion() && "This is not a MachineBasicBlock RegionNode!");
+  return getEntry();
+}
+
+template<>
+template<>
+inline MachineRegion* RegionNodeBase<RegionTraits<MachineFunction>>::getNodeAs<MachineRegion>() const {
+  assert(isSubRegion() && "This is not a subregion RegionNode!");
+  auto Unconst = const_cast<RegionNodeBase<RegionTraits<MachineFunction>>*>(this);
+  return reinterpret_cast<MachineRegion*>(Unconst);
+}
+
+
+RegionNodeGraphTraits(MachineRegionNode, MachineBasicBlock, MachineRegion);
+RegionNodeGraphTraits(const MachineRegionNode, MachineBasicBlock, MachineRegion);
+
+RegionGraphTraits(MachineRegion, MachineRegionNode);
+RegionGraphTraits(const MachineRegion, const MachineRegionNode);
+
+template <> struct GraphTraits<MachineRegionInfo*>
+  : public GraphTraits<FlatIt<MachineRegionNode*> > {
+  typedef df_iterator<NodeType*, SmallPtrSet<NodeType*, 8>, false,
+                      GraphTraits<FlatIt<NodeType*> > > nodes_iterator;
+
+  static NodeType *getEntryNode(MachineRegionInfo *RI) {
+    return GraphTraits<FlatIt<MachineRegion*> >::getEntryNode(RI->getTopLevelRegion());
+  }
+  static nodes_iterator nodes_begin(MachineRegionInfo* RI) {
+    return nodes_iterator::begin(getEntryNode(RI));
+  }
+  static nodes_iterator nodes_end(MachineRegionInfo *RI) {
+    return nodes_iterator::end(getEntryNode(RI));
+  }
+};
+
+template <> struct GraphTraits<MachineRegionInfoPass*>
+  : public GraphTraits<MachineRegionInfo *> {
+  typedef df_iterator<NodeType*, SmallPtrSet<NodeType*, 8>, false,
+                      GraphTraits<FlatIt<NodeType*> > > nodes_iterator;
+
+  static NodeType *getEntryNode(MachineRegionInfoPass *RI) {
+    return GraphTraits<MachineRegionInfo*>::getEntryNode(&RI->getRegionInfo());
+  }
+  static nodes_iterator nodes_begin(MachineRegionInfoPass* RI) {
+    return GraphTraits<MachineRegionInfo*>::nodes_begin(&RI->getRegionInfo());
+  }
+  static nodes_iterator nodes_end(MachineRegionInfoPass *RI) {
+    return GraphTraits<MachineRegionInfo*>::nodes_end(&RI->getRegionInfo());
+  }
+};
+
+EXTERN_TEMPLATE_INSTANTIATION(class RegionBase<RegionTraits<MachineFunction>>);
+EXTERN_TEMPLATE_INSTANTIATION(class RegionNodeBase<RegionTraits<MachineFunction>>);
+EXTERN_TEMPLATE_INSTANTIATION(class RegionInfoBase<RegionTraits<MachineFunction>>);
+
+}
+
+#endif

diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index 51139f7..2e7f034 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h

@@ -17,9 +17,10 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/IndexedMap.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <vector>
 
 namespace llvm {
@@ -39,7 +40,7 @@
   };
 
 private:
-  const TargetMachine &TM;
+  const MachineFunction *MF;
   Delegate *TheDelegate;
 
   /// IsSSA - True when the machine function is in SSA form and virtual
@@ -69,7 +70,7 @@
 
   /// PhysRegUseDefLists - This is an array of the head of the use/def list for
   /// physical registers.
-  MachineOperand **PhysRegUseDefLists;
+  std::vector<MachineOperand *> PhysRegUseDefLists;
 
   /// getRegUseDefListHead - Return the head pointer for the register use/def
   /// list for the specified virtual or physical register.
@@ -122,11 +123,10 @@
   MachineRegisterInfo(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION;
   void operator=(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION;
 public:
-  explicit MachineRegisterInfo(const TargetMachine &TM);
-  ~MachineRegisterInfo();
+  explicit MachineRegisterInfo(const MachineFunction *MF);
 
   const TargetRegisterInfo *getTargetRegisterInfo() const {
-    return TM.getRegisterInfo();
+    return MF->getSubtarget().getRegisterInfo();
   }
 
   void resetDelegate(Delegate *delegate) {
@@ -515,8 +515,12 @@
   ///
   /// That function will return NULL if the virtual registers have incompatible
   /// constraints.
+  ///
+  /// Note that if ToReg is a physical register the function will replace and
+  /// apply sub registers to ToReg in order to obtain a final/proper physical
+  /// register.
   void replaceRegWith(unsigned FromReg, unsigned ToReg);
-
+  
   /// getVRegDef - Return the machine instr that defines the specified virtual
   /// register or null if none is found.  This assumes that the code is in SSA
   /// form, so there should only be one definition.

diff --git a/include/llvm/CodeGen/MachineRelocation.h b/include/llvm/CodeGen/MachineRelocation.h
deleted file mode 100644
index e778457..0000000
--- a/include/llvm/CodeGen/MachineRelocation.h
+++ /dev/null

@@ -1,342 +0,0 @@
-//===-- llvm/CodeGen/MachineRelocation.h - Target Relocation ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the MachineRelocation class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_MACHINERELOCATION_H
-#define LLVM_CODEGEN_MACHINERELOCATION_H
-
-#include "llvm/Support/DataTypes.h"
-#include <cassert>
-
-namespace llvm {
-class GlobalValue;
-class MachineBasicBlock;
-
-/// MachineRelocation - This represents a target-specific relocation value,
-/// produced by the code emitter.  This relocation is resolved after the has
-/// been emitted, either to an object file or to memory, when the target of the
-/// relocation can be resolved.
-///
-/// A relocation is made up of the following logical portions:
-///   1. An offset in the machine code buffer, the location to modify.
-///   2. A target specific relocation type (a number from 0 to 63).
-///   3. A symbol being referenced, either as a GlobalValue* or as a string.
-///   4. An optional constant value to be added to the reference.
-///   5. A bit, CanRewrite, which indicates to the JIT that a function stub is
-///      not needed for the relocation.
-///   6. An index into the GOT, if the target uses a GOT
-///
-class MachineRelocation {
-  enum AddressType {
-    isResult,         // Relocation has be transformed into its result pointer.
-    isGV,             // The Target.GV field is valid.
-    isIndirectSym,    // Relocation of an indirect symbol.
-    isBB,             // Relocation of BB address.
-    isExtSym,         // The Target.ExtSym field is valid.
-    isConstPool,      // Relocation of constant pool address.
-    isJumpTable,      // Relocation of jump table address.
-    isGOTIndex        // The Target.GOTIndex field is valid.
-  };
-  
-  /// Offset - This is the offset from the start of the code buffer of the
-  /// relocation to perform.
-  uintptr_t Offset;
-  
-  /// ConstantVal - A field that may be used by the target relocation type.
-  intptr_t ConstantVal;
-
-  union {
-    void *Result;           // If this has been resolved to a resolved pointer
-    GlobalValue *GV;        // If this is a pointer to a GV or an indirect ref.
-    MachineBasicBlock *MBB; // If this is a pointer to an LLVM BB
-    const char *ExtSym;     // If this is a pointer to a named symbol
-    unsigned Index;         // Constant pool / jump table index
-    unsigned GOTIndex;      // Index in the GOT of this symbol/global
-  } Target;
-
-  unsigned TargetReloType : 6; // The target relocation ID
-  AddressType AddrType    : 4; // The field of Target to use
-  bool MayNeedFarStub     : 1; // True if this relocation may require a far-stub
-  bool GOTRelative        : 1; // Should this relocation be relative to the GOT?
-  bool TargetResolve      : 1; // True if target should resolve the address
-
-public:
- // Relocation types used in a generic implementation.  Currently, relocation
- // entries for all things use the generic VANILLA type until they are refined
- // into target relocation types.
-  enum RelocationType {
-    VANILLA
-  };
-  
-  /// MachineRelocation::getGV - Return a relocation entry for a GlobalValue.
-  ///
-  static MachineRelocation getGV(uintptr_t offset, unsigned RelocationType, 
-                                 GlobalValue *GV, intptr_t cst = 0,
-                                 bool MayNeedFarStub = 0,
-                                 bool GOTrelative = 0) {
-    assert((RelocationType & ~63) == 0 && "Relocation type too large!");
-    MachineRelocation Result;
-    Result.Offset = offset;
-    Result.ConstantVal = cst;
-    Result.TargetReloType = RelocationType;
-    Result.AddrType = isGV;
-    Result.MayNeedFarStub = MayNeedFarStub;
-    Result.GOTRelative = GOTrelative;
-    Result.TargetResolve = false;
-    Result.Target.GV = GV;
-    return Result;
-  }
-
-  /// MachineRelocation::getIndirectSymbol - Return a relocation entry for an
-  /// indirect symbol.
-  static MachineRelocation getIndirectSymbol(uintptr_t offset,
-                                             unsigned RelocationType, 
-                                             GlobalValue *GV, intptr_t cst = 0,
-                                             bool MayNeedFarStub = 0,
-                                             bool GOTrelative = 0) {
-    assert((RelocationType & ~63) == 0 && "Relocation type too large!");
-    MachineRelocation Result;
-    Result.Offset = offset;
-    Result.ConstantVal = cst;
-    Result.TargetReloType = RelocationType;
-    Result.AddrType = isIndirectSym;
-    Result.MayNeedFarStub = MayNeedFarStub;
-    Result.GOTRelative = GOTrelative;
-    Result.TargetResolve = false;
-    Result.Target.GV = GV;
-    return Result;
-  }
-
-  /// MachineRelocation::getBB - Return a relocation entry for a BB.
-  ///
-  static MachineRelocation getBB(uintptr_t offset,unsigned RelocationType,
-                                 MachineBasicBlock *MBB, intptr_t cst = 0) {
-    assert((RelocationType & ~63) == 0 && "Relocation type too large!");
-    MachineRelocation Result;
-    Result.Offset = offset;
-    Result.ConstantVal = cst;
-    Result.TargetReloType = RelocationType;
-    Result.AddrType = isBB;
-    Result.MayNeedFarStub = false;
-    Result.GOTRelative = false;
-    Result.TargetResolve = false;
-    Result.Target.MBB = MBB;
-    return Result;
-  }
-
-  /// MachineRelocation::getExtSym - Return a relocation entry for an external
-  /// symbol, like "free".
-  ///
-  static MachineRelocation getExtSym(uintptr_t offset, unsigned RelocationType, 
-                                     const char *ES, intptr_t cst = 0,
-                                     bool GOTrelative = 0,
-                                     bool NeedStub = true) {
-    assert((RelocationType & ~63) == 0 && "Relocation type too large!");
-    MachineRelocation Result;
-    Result.Offset = offset;
-    Result.ConstantVal = cst;
-    Result.TargetReloType = RelocationType;
-    Result.AddrType = isExtSym;
-    Result.MayNeedFarStub = NeedStub;
-    Result.GOTRelative = GOTrelative;
-    Result.TargetResolve = false;
-    Result.Target.ExtSym = ES;
-    return Result;
-  }
-
-  /// MachineRelocation::getConstPool - Return a relocation entry for a constant
-  /// pool entry.
-  ///
-  static MachineRelocation getConstPool(uintptr_t offset,unsigned RelocationType,
-                                        unsigned CPI, intptr_t cst = 0,
-                                        bool letTargetResolve = false) {
-    assert((RelocationType & ~63) == 0 && "Relocation type too large!");
-    MachineRelocation Result;
-    Result.Offset = offset;
-    Result.ConstantVal = cst;
-    Result.TargetReloType = RelocationType;
-    Result.AddrType = isConstPool;
-    Result.MayNeedFarStub = false;
-    Result.GOTRelative = false;
-    Result.TargetResolve = letTargetResolve;
-    Result.Target.Index = CPI;
-    return Result;
-  }
-
-  /// MachineRelocation::getJumpTable - Return a relocation entry for a jump
-  /// table entry.
-  ///
-  static MachineRelocation getJumpTable(uintptr_t offset,unsigned RelocationType,
-                                        unsigned JTI, intptr_t cst = 0,
-                                        bool letTargetResolve = false) {
-    assert((RelocationType & ~63) == 0 && "Relocation type too large!");
-    MachineRelocation Result;
-    Result.Offset = offset;
-    Result.ConstantVal = cst;
-    Result.TargetReloType = RelocationType;
-    Result.AddrType = isJumpTable;
-    Result.MayNeedFarStub = false;
-    Result.GOTRelative = false;
-    Result.TargetResolve = letTargetResolve;
-    Result.Target.Index = JTI;
-    return Result;
-  }
-
-  /// getMachineCodeOffset - Return the offset into the code buffer that the
-  /// relocation should be performed.
-  intptr_t getMachineCodeOffset() const {
-    return Offset;
-  }
-
-  /// getRelocationType - Return the target-specific relocation ID for this
-  /// relocation.
-  unsigned getRelocationType() const {
-    return TargetReloType;
-  }
-
-  /// getConstantVal - Get the constant value associated with this relocation.
-  /// This is often an offset from the symbol.
-  ///
-  intptr_t getConstantVal() const {
-    return ConstantVal;
-  }
-
-  /// setConstantVal - Set the constant value associated with this relocation.
-  /// This is often an offset from the symbol.
-  ///
-  void setConstantVal(intptr_t val) {
-    ConstantVal = val;
-  }
-
-  /// isGlobalValue - Return true if this relocation is a GlobalValue, as
-  /// opposed to a constant string.
-  bool isGlobalValue() const {
-    return AddrType == isGV;
-  }
-
-  /// isIndirectSymbol - Return true if this relocation is the address an
-  /// indirect symbol
-  bool isIndirectSymbol() const {
-    return AddrType == isIndirectSym;
-  }
-
-  /// isBasicBlock - Return true if this relocation is a basic block reference.
-  ///
-  bool isBasicBlock() const {
-    return AddrType == isBB;
-  }
-
-  /// isExternalSymbol - Return true if this is a constant string.
-  ///
-  bool isExternalSymbol() const {
-    return AddrType == isExtSym;
-  }
-
-  /// isConstantPoolIndex - Return true if this is a constant pool reference.
-  ///
-  bool isConstantPoolIndex() const {
-    return AddrType == isConstPool;
-  }
-
-  /// isJumpTableIndex - Return true if this is a jump table reference.
-  ///
-  bool isJumpTableIndex() const {
-    return AddrType == isJumpTable;
-  }
-
-  /// isGOTRelative - Return true the target wants the index into the GOT of
-  /// the symbol rather than the address of the symbol.
-  bool isGOTRelative() const {
-    return GOTRelative;
-  }
-
-  /// mayNeedFarStub - This function returns true if the JIT for this target may
-  /// need either a stub function or an indirect global-variable load to handle
-  /// the relocated GlobalValue reference.  For example, the x86-64 call
-  /// instruction can only call functions within +/-2GB of the call site.
-  /// Anything farther away needs a longer mov+call sequence, which can't just
-  /// be written on top of the existing call.
-  bool mayNeedFarStub() const {
-    return MayNeedFarStub;
-  }
-
-  /// letTargetResolve - Return true if the target JITInfo is usually
-  /// responsible for resolving the address of this relocation.
-  bool letTargetResolve() const {
-    return TargetResolve;
-  }
-
-  /// getGlobalValue - If this is a global value reference, return the
-  /// referenced global.
-  GlobalValue *getGlobalValue() const {
-    assert((isGlobalValue() || isIndirectSymbol()) &&
-           "This is not a global value reference!");
-    return Target.GV;
-  }
-
-  MachineBasicBlock *getBasicBlock() const {
-    assert(isBasicBlock() && "This is not a basic block reference!");
-    return Target.MBB;
-  }
-
-  /// getString - If this is a string value, return the string reference.
-  ///
-  const char *getExternalSymbol() const {
-    assert(isExternalSymbol() && "This is not an external symbol reference!");
-    return Target.ExtSym;
-  }
-
-  /// getConstantPoolIndex - If this is a const pool reference, return
-  /// the index into the constant pool.
-  unsigned getConstantPoolIndex() const {
-    assert(isConstantPoolIndex() && "This is not a constant pool reference!");
-    return Target.Index;
-  }
-
-  /// getJumpTableIndex - If this is a jump table reference, return
-  /// the index into the jump table.
-  unsigned getJumpTableIndex() const {
-    assert(isJumpTableIndex() && "This is not a jump table reference!");
-    return Target.Index;
-  }
-
-  /// getResultPointer - Once this has been resolved to point to an actual
-  /// address, this returns the pointer.
-  void *getResultPointer() const {
-    assert(AddrType == isResult && "Result pointer isn't set yet!");
-    return Target.Result;
-  }
-
-  /// setResultPointer - Set the result to the specified pointer value.
-  ///
-  void setResultPointer(void *Ptr) {
-    Target.Result = Ptr;
-    AddrType = isResult;
-  }
-
-  /// setGOTIndex - Set the GOT index to a specific value.
-  void setGOTIndex(unsigned idx) {
-    AddrType = isGOTIndex;
-    Target.GOTIndex = idx;
-  }
-
-  /// getGOTIndex - Once this has been resolved to an entry in the GOT,
-  /// this returns that index.  The index is from the lowest address entry
-  /// in the GOT.
-  unsigned getGOTIndex() const {
-    assert(AddrType == isGOTIndex);
-    return Target.GOTIndex;
-  }
-};
-}
-
-#endif

diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index 7d85432..c5f66a8 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h

@@ -250,7 +250,7 @@
 public:
   ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
                 bool IsPostRA)
-      : ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, IsPostRA,
+      : ScheduleDAGInstrs(*C->MF, C->MLI, IsPostRA,
                           /*RemoveKillFlags=*/IsPostRA, C->LIS),
         AA(C->AA), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU), CurrentTop(),
         CurrentBottom(), NextClusterPred(nullptr), NextClusterSucc(nullptr) {

diff --git a/include/llvm/CodeGen/MachineTraceMetrics.h b/include/llvm/CodeGen/MachineTraceMetrics.h
index 323b694..bfe6e94 100644
--- a/include/llvm/CodeGen/MachineTraceMetrics.h
+++ b/include/llvm/CodeGen/MachineTraceMetrics.h

@@ -44,8 +44,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_MACHINE_TRACE_METRICS_H
-#define LLVM_CODEGEN_MACHINE_TRACE_METRICS_H
+#ifndef LLVM_CODEGEN_MACHINETRACEMETRICS_H
+#define LLVM_CODEGEN_MACHINETRACEMETRICS_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -264,8 +264,9 @@
     /// classes are included. For the caller to account for extra machine
     /// instructions, it must first resolve each instruction's scheduling class.
     unsigned getResourceLength(
-                ArrayRef<const MachineBasicBlock*> Extrablocks = None,
-                ArrayRef<const MCSchedClassDesc*> ExtraInstrs = None) const;
+        ArrayRef<const MachineBasicBlock *> Extrablocks = None,
+        ArrayRef<const MCSchedClassDesc *> ExtraInstrs = None,
+        ArrayRef<const MCSchedClassDesc *> RemoveInstrs = None) const;
 
     /// Return the length of the (data dependency) critical path through the
     /// trace.
@@ -286,6 +287,12 @@
     /// Return the Depth of a PHI instruction in a trace center block successor.
     /// The PHI does not have to be part of the trace.
     unsigned getPHIDepth(const MachineInstr *PHI) const;
+
+    /// A dependence is useful if the basic block of the defining instruction
+    /// is part of the trace of the user instruction. It is assumed that DefMI
+    /// dominates UseMI (see also isUsefulDominator).
+    bool isDepInTrace(const MachineInstr *DefMI,
+                      const MachineInstr *UseMI) const;
   };
 
   /// A trace ensemble is a collection of traces selected using the same

diff --git a/include/llvm/CodeGen/MachineValueType.h b/include/llvm/CodeGen/MachineValueType.h
index ad215ec..affacb0 100644
--- a/include/llvm/CodeGen/MachineValueType.h
+++ b/include/llvm/CodeGen/MachineValueType.h

@@ -196,21 +196,24 @@
     /// is32BitVector - Return true if this is a 32-bit vector type.
     bool is32BitVector() const {
       return (SimpleTy == MVT::v4i8  || SimpleTy == MVT::v2i16 ||
-              SimpleTy == MVT::v1i32);
+              SimpleTy == MVT::v1i32 || SimpleTy == MVT::v2f16 ||
+              SimpleTy == MVT::v1f32);
     }
 
     /// is64BitVector - Return true if this is a 64-bit vector type.
     bool is64BitVector() const {
       return (SimpleTy == MVT::v8i8  || SimpleTy == MVT::v4i16 ||
               SimpleTy == MVT::v2i32 || SimpleTy == MVT::v1i64 ||
-              SimpleTy == MVT::v1f64 || SimpleTy == MVT::v2f32);
+              SimpleTy == MVT::v4f16 || SimpleTy == MVT::v2f32 ||
+              SimpleTy == MVT::v1f64);
     }
 
     /// is128BitVector - Return true if this is a 128-bit vector type.
     bool is128BitVector() const {
       return (SimpleTy == MVT::v16i8 || SimpleTy == MVT::v8i16 ||
               SimpleTy == MVT::v4i32 || SimpleTy == MVT::v2i64 ||
-              SimpleTy == MVT::v4f32 || SimpleTy == MVT::v2f64);
+              SimpleTy == MVT::v8f16 || SimpleTy == MVT::v4f32 ||
+              SimpleTy == MVT::v2f64);
     }
 
     /// is256BitVector - Return true if this is a 256-bit vector type.

diff --git a/include/llvm/CodeGen/PBQP/CostAllocator.h b/include/llvm/CodeGen/PBQP/CostAllocator.h
index ff62c09..02d39fe 100644
--- a/include/llvm/CodeGen/PBQP/CostAllocator.h
+++ b/include/llvm/CodeGen/PBQP/CostAllocator.h

@@ -15,117 +15,101 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_COSTALLOCATOR_H
-#define LLVM_COSTALLOCATOR_H
+#ifndef LLVM_CODEGEN_PBQP_COSTALLOCATOR_H
+#define LLVM_CODEGEN_PBQP_COSTALLOCATOR_H
 
-#include <set>
+#include "llvm/ADT/DenseSet.h"
+#include <memory>
 #include <type_traits>
 
+namespace llvm {
 namespace PBQP {
 
-template <typename CostT,
-          typename CostKeyTComparator>
-class CostPool {
+template <typename ValueT>
+class ValuePool {
 public:
-
-  class PoolEntry {
-  public:
-    template <typename CostKeyT>
-    PoolEntry(CostPool &pool, CostKeyT cost)
-      : pool(pool), cost(std::move(cost)), refCount(0) {}
-    ~PoolEntry() { pool.removeEntry(this); }
-    void incRef() { ++refCount; }
-    bool decRef() { --refCount; return (refCount == 0); }
-    CostT& getCost() { return cost; }
-    const CostT& getCost() const { return cost; }
-  private:
-    CostPool &pool;
-    CostT cost;
-    std::size_t refCount;
-  };
-
-  class PoolRef {
-  public:
-    PoolRef(PoolEntry *entry) : entry(entry) {
-      this->entry->incRef();
-    }
-    PoolRef(const PoolRef &r) {
-      entry = r.entry;
-      entry->incRef();
-    }
-    PoolRef& operator=(const PoolRef &r) {
-      assert(entry != nullptr && "entry should not be null.");
-      PoolEntry *temp = r.entry;
-      temp->incRef();
-      entry->decRef();
-      entry = temp;
-      return *this;
-    }
-
-    ~PoolRef() {
-      if (entry->decRef())
-        delete entry;
-    }
-    void reset(PoolEntry *entry) {
-      entry->incRef();
-      this->entry->decRef();
-      this->entry = entry;
-    }
-    CostT& operator*() { return entry->getCost(); }
-    const CostT& operator*() const { return entry->getCost(); }
-    CostT* operator->() { return &entry->getCost(); }
-    const CostT* operator->() const { return &entry->getCost(); }
-  private:
-    PoolEntry *entry;
-  };
+  typedef std::shared_ptr<const ValueT> PoolRef;
 
 private:
-  class EntryComparator {
+
+  class PoolEntry : public std::enable_shared_from_this<PoolEntry> {
   public:
-    template <typename CostKeyT>
-    typename std::enable_if<
-               !std::is_same<PoolEntry*,
-                             typename std::remove_const<CostKeyT>::type>::value,
-               bool>::type
-    operator()(const PoolEntry* a, const CostKeyT &b) {
-      return compare(a->getCost(), b);
-    }
-    bool operator()(const PoolEntry* a, const PoolEntry* b) {
-      return compare(a->getCost(), b->getCost());
-    }
+    template <typename ValueKeyT>
+    PoolEntry(ValuePool &Pool, ValueKeyT Value)
+        : Pool(Pool), Value(std::move(Value)) {}
+    ~PoolEntry() { Pool.removeEntry(this); }
+    const ValueT& getValue() const { return Value; }
   private:
-    CostKeyTComparator compare;
+    ValuePool &Pool;
+    ValueT Value;
   };
 
-  typedef std::set<PoolEntry*, EntryComparator> EntrySet;
+  class PoolEntryDSInfo {
+  public:
+    static inline PoolEntry* getEmptyKey() { return nullptr; }
 
-  EntrySet entrySet;
+    static inline PoolEntry* getTombstoneKey() {
+      return reinterpret_cast<PoolEntry*>(static_cast<uintptr_t>(1));
+    }
 
-  void removeEntry(PoolEntry *p) { entrySet.erase(p); }
+    template <typename ValueKeyT>
+    static unsigned getHashValue(const ValueKeyT &C) {
+      return hash_value(C);
+    }
+
+    static unsigned getHashValue(PoolEntry *P) {
+      return getHashValue(P->getValue());
+    }
+
+    static unsigned getHashValue(const PoolEntry *P) {
+      return getHashValue(P->getValue());
+    }
+
+    template <typename ValueKeyT1, typename ValueKeyT2>
+    static
+    bool isEqual(const ValueKeyT1 &C1, const ValueKeyT2 &C2) {
+      return C1 == C2;
+    }
+
+    template <typename ValueKeyT>
+    static bool isEqual(const ValueKeyT &C, PoolEntry *P) {
+      if (P == getEmptyKey() || P == getTombstoneKey())
+        return false;
+      return isEqual(C, P->getValue());
+    }
+
+    static bool isEqual(PoolEntry *P1, PoolEntry *P2) {
+      if (P1 == getEmptyKey() || P1 == getTombstoneKey())
+        return P1 == P2;
+      return isEqual(P1->getValue(), P2);
+    }
+
+  };
+
+  typedef DenseSet<PoolEntry*, PoolEntryDSInfo> EntrySetT;
+
+  EntrySetT EntrySet;
+
+  void removeEntry(PoolEntry *P) { EntrySet.erase(P); }
 
 public:
+  template <typename ValueKeyT> PoolRef getValue(ValueKeyT ValueKey) {
+    typename EntrySetT::iterator I = EntrySet.find_as(ValueKey);
 
-  template <typename CostKeyT>
-  PoolRef getCost(CostKeyT costKey) {
-    typename EntrySet::iterator itr =
-      std::lower_bound(entrySet.begin(), entrySet.end(), costKey,
-                       EntryComparator());
+    if (I != EntrySet.end())
+      return PoolRef((*I)->shared_from_this(), &(*I)->getValue());
 
-    if (itr != entrySet.end() && costKey == (*itr)->getCost())
-      return PoolRef(*itr);
-
-    PoolEntry *p = new PoolEntry(*this, std::move(costKey));
-    entrySet.insert(itr, p);
-    return PoolRef(p);
+    auto P = std::make_shared<PoolEntry>(*this, std::move(ValueKey));
+    EntrySet.insert(P.get());
+    return PoolRef(std::move(P), &P->getValue());
   }
 };
 
-template <typename VectorT, typename VectorTComparator,
-          typename MatrixT, typename MatrixTComparator>
+template <typename VectorT, typename MatrixT>
 class PoolCostAllocator {
 private:
-  typedef CostPool<VectorT, VectorTComparator> VectorCostPool;
-  typedef CostPool<MatrixT, MatrixTComparator> MatrixCostPool;
+  typedef ValuePool<VectorT> VectorCostPool;
+  typedef ValuePool<MatrixT> MatrixCostPool;
 public:
   typedef VectorT Vector;
   typedef MatrixT Matrix;
@@ -133,15 +117,16 @@
   typedef typename MatrixCostPool::PoolRef MatrixPtr;
 
   template <typename VectorKeyT>
-  VectorPtr getVector(VectorKeyT v) { return vectorPool.getCost(std::move(v)); }
+  VectorPtr getVector(VectorKeyT v) { return VectorPool.getValue(std::move(v)); }
 
   template <typename MatrixKeyT>
-  MatrixPtr getMatrix(MatrixKeyT m) { return matrixPool.getCost(std::move(m)); }
+  MatrixPtr getMatrix(MatrixKeyT m) { return MatrixPool.getValue(std::move(m)); }
 private:
-  VectorCostPool vectorPool;
-  MatrixCostPool matrixPool;
+  VectorCostPool VectorPool;
+  MatrixCostPool MatrixPool;
 };
 
-}
+} // namespace PBQP
+} // namespace llvm
 
-#endif // LLVM_COSTALLOCATOR_H
+#endif

diff --git a/include/llvm/CodeGen/PBQP/Graph.h b/include/llvm/CodeGen/PBQP/Graph.h
index a55f0ea..4dc5674 100644
--- a/include/llvm/CodeGen/PBQP/Graph.h
+++ b/include/llvm/CodeGen/PBQP/Graph.h

@@ -17,11 +17,12 @@
 
 #include "llvm/ADT/ilist.h"
 #include "llvm/ADT/ilist_node.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
 #include <list>
 #include <map>
 #include <set>
 
+namespace llvm {
 namespace PBQP {
 
   class GraphBase {
@@ -29,12 +30,12 @@
     typedef unsigned NodeId;
     typedef unsigned EdgeId;
 
-    /// \brief Returns a value representing an invalid (non-existent) node.
+    /// @brief Returns a value representing an invalid (non-existent) node.
     static NodeId invalidNodeId() {
       return std::numeric_limits<NodeId>::max();
     }
 
-    /// \brief Returns a value representing an invalid (non-existent) edge.
+    /// @brief Returns a value representing an invalid (non-existent) edge.
     static EdgeId invalidEdgeId() {
       return std::numeric_limits<EdgeId>::max();
     }
@@ -56,6 +57,7 @@
     typedef typename CostAllocator::MatrixPtr MatrixPtr;
     typedef typename SolverT::NodeMetadata NodeMetadata;
     typedef typename SolverT::EdgeMetadata EdgeMetadata;
+    typedef typename SolverT::GraphMetadata GraphMetadata;
 
   private:
 
@@ -172,6 +174,7 @@
 
     // ----- MEMBERS -----
 
+    GraphMetadata Metadata;
     CostAllocator CostAlloc;
     SolverT *Solver;
 
@@ -187,13 +190,19 @@
 
     // ----- INTERNAL METHODS -----
 
-    NodeEntry& getNode(NodeId NId) { return Nodes[NId]; }
-    const NodeEntry& getNode(NodeId NId) const { return Nodes[NId]; }
+    NodeEntry &getNode(NodeId NId) {
+      assert(NId < Nodes.size() && "Out of bound NodeId");
+      return Nodes[NId];
+    }
+    const NodeEntry &getNode(NodeId NId) const {
+      assert(NId < Nodes.size() && "Out of bound NodeId");
+      return Nodes[NId];
+    }
 
     EdgeEntry& getEdge(EdgeId EId) { return Edges[EId]; }
     const EdgeEntry& getEdge(EdgeId EId) const { return Edges[EId]; }
 
-    NodeId addConstructedNode(const NodeEntry &N) {
+    NodeId addConstructedNode(NodeEntry N) {
       NodeId NId = 0;
       if (!FreeNodeIds.empty()) {
         NId = FreeNodeIds.back();
@@ -206,7 +215,7 @@
       return NId;
     }
 
-    EdgeId addConstructedEdge(const EdgeEntry &E) {
+    EdgeId addConstructedEdge(EdgeEntry E) {
       assert(findEdge(E.getN1Id(), E.getN2Id()) == invalidEdgeId() &&
              "Attempt to add duplicate edge.");
       EdgeId EId = 0;
@@ -235,6 +244,12 @@
 
     class NodeItr {
     public:
+      typedef std::forward_iterator_tag iterator_category;
+      typedef NodeId value_type;
+      typedef int difference_type;
+      typedef NodeId* pointer;
+      typedef NodeId& reference;
+
       NodeItr(NodeId CurNId, const Graph &G)
         : CurNId(CurNId), EndNId(G.Nodes.size()), FreeNodeIds(G.FreeNodeIds) {
         this->CurNId = findNextInUse(CurNId); // Move to first in-use node id
@@ -249,7 +264,7 @@
       NodeId findNextInUse(NodeId NId) const {
         while (NId < EndNId &&
                std::find(FreeNodeIds.begin(), FreeNodeIds.end(), NId) !=
-                 FreeNodeIds.end()) {
+               FreeNodeIds.end()) {
           ++NId;
         }
         return NId;
@@ -328,10 +343,19 @@
       const NodeEntry &NE;
     };
 
-    /// \brief Construct an empty PBQP graph.
-    Graph() : Solver(nullptr) { }
+    /// @brief Construct an empty PBQP graph.
+    Graph() : Solver(nullptr) {}
 
-    /// \brief Lock this graph to the given solver instance in preparation
+    /// @brief Construct an empty PBQP graph with the given graph metadata.
+    Graph(GraphMetadata Metadata) : Metadata(Metadata), Solver(nullptr) {}
+
+    /// @brief Get a reference to the graph metadata.
+    GraphMetadata& getMetadata() { return Metadata; }
+
+    /// @brief Get a const-reference to the graph metadata.
+    const GraphMetadata& getMetadata() const { return Metadata; }
+
+    /// @brief Lock this graph to the given solver instance in preparation
     /// for running the solver. This method will call solver.handleAddNode for
     /// each node in the graph, and handleAddEdge for each edge, to give the
     /// solver an opportunity to set up any requried metadata.
@@ -344,13 +368,13 @@
         Solver->handleAddEdge(EId);
     }
 
-    /// \brief Release from solver instance.
+    /// @brief Release from solver instance.
     void unsetSolver() {
       assert(Solver && "Solver not set.");
       Solver = nullptr;
     }
 
-    /// \brief Add a node with the given costs.
+    /// @brief Add a node with the given costs.
     /// @param Costs Cost vector for the new node.
     /// @return Node iterator for the added node.
     template <typename OtherVectorT>
@@ -363,9 +387,29 @@
       return NId;
     }
 
-    /// \brief Add an edge between the given nodes with the given costs.
+    /// @brief Add a node bypassing the cost allocator.
+    /// @param Costs Cost vector ptr for the new node (must be convertible to
+    ///        VectorPtr).
+    /// @return Node id of the added node.
+    ///
+    ///   This method allows for fast addition of a node whose costs don't need
+    /// to be passed through the cost allocator. The most common use case for
+    /// this is when duplicating costs from an existing node (when using a
+    /// pooling allocator). These have already been uniqued, so we can avoid
+    /// re-constructing and re-uniquing them by attaching them directly to the
+    /// new node.
+    template <typename OtherVectorPtrT>
+    NodeId addNodeBypassingCostAllocator(OtherVectorPtrT Costs) {
+      NodeId NId = addConstructedNode(NodeEntry(Costs));
+      if (Solver)
+        Solver->handleAddNode(NId);
+      return NId;
+    }
+
+    /// @brief Add an edge between the given nodes with the given costs.
     /// @param N1Id First node.
     /// @param N2Id Second node.
+    /// @param Costs Cost matrix for new edge.
     /// @return Edge iterator for the added edge.
     template <typename OtherVectorT>
     EdgeId addEdge(NodeId N1Id, NodeId N2Id, OtherVectorT Costs) {
@@ -380,7 +424,32 @@
       return EId;
     }
 
-    /// \brief Returns true if the graph is empty.
+    /// @brief Add an edge bypassing the cost allocator.
+    /// @param N1Id First node.
+    /// @param N2Id Second node.
+    /// @param Costs Cost matrix for new edge.
+    /// @return Edge id of the added edge.
+    ///
+    ///   This method allows for fast addition of an edge whose costs don't need
+    /// to be passed through the cost allocator. The most common use case for
+    /// this is when duplicating costs from an existing edge (when using a
+    /// pooling allocator). These have already been uniqued, so we can avoid
+    /// re-constructing and re-uniquing them by attaching them directly to the
+    /// new edge.
+    template <typename OtherMatrixPtrT>
+    NodeId addEdgeBypassingCostAllocator(NodeId N1Id, NodeId N2Id,
+                                         OtherMatrixPtrT Costs) {
+      assert(getNodeCosts(N1Id).getLength() == Costs->getRows() &&
+             getNodeCosts(N2Id).getLength() == Costs->getCols() &&
+             "Matrix dimensions mismatch.");
+      // Get cost matrix from the problem domain.
+      EdgeId EId = addConstructedEdge(EdgeEntry(N1Id, N2Id, Costs));
+      if (Solver)
+        Solver->handleAddEdge(EId);
+      return EId;
+    }
+
+    /// @brief Returns true if the graph is empty.
     bool empty() const { return NodeIdSet(*this).empty(); }
 
     NodeIdSet nodeIds() const { return NodeIdSet(*this); }
@@ -388,15 +457,15 @@
 
     AdjEdgeIdSet adjEdgeIds(NodeId NId) { return AdjEdgeIdSet(getNode(NId)); }
 
-    /// \brief Get the number of nodes in the graph.
+    /// @brief Get the number of nodes in the graph.
     /// @return Number of nodes in the graph.
     unsigned getNumNodes() const { return NodeIdSet(*this).size(); }
 
-    /// \brief Get the number of edges in the graph.
+    /// @brief Get the number of edges in the graph.
     /// @return Number of edges in the graph.
     unsigned getNumEdges() const { return EdgeIdSet(*this).size(); }
 
-    /// \brief Set a node's cost vector.
+    /// @brief Set a node's cost vector.
     /// @param NId Node to update.
     /// @param Costs New costs to set.
     template <typename OtherVectorT>
@@ -407,11 +476,23 @@
       getNode(NId).Costs = AllocatedCosts;
     }
 
-    /// \brief Get a node's cost vector (const version).
+    /// @brief Get a VectorPtr to a node's cost vector. Rarely useful - use
+    ///        getNodeCosts where possible.
+    /// @param NId Node id.
+    /// @return VectorPtr to node cost vector.
+    ///
+    ///   This method is primarily useful for duplicating costs quickly by
+    /// bypassing the cost allocator. See addNodeBypassingCostAllocator. Prefer
+    /// getNodeCosts when dealing with node cost values.
+    const VectorPtr& getNodeCostsPtr(NodeId NId) const {
+      return getNode(NId).Costs;
+    }
+
+    /// @brief Get a node's cost vector.
     /// @param NId Node id.
     /// @return Node cost vector.
     const Vector& getNodeCosts(NodeId NId) const {
-      return *getNode(NId).Costs;
+      return *getNodeCostsPtr(NId);
     }
 
     NodeMetadata& getNodeMetadata(NodeId NId) {
@@ -426,7 +507,7 @@
       return getNode(NId).getAdjEdgeIds().size();
     }
 
-    /// \brief Set an edge's cost matrix.
+    /// @brief Set an edge's cost matrix.
     /// @param EId Edge id.
     /// @param Costs New cost matrix.
     template <typename OtherMatrixT>
@@ -437,34 +518,48 @@
       getEdge(EId).Costs = AllocatedCosts;
     }
 
-    /// \brief Get an edge's cost matrix (const version).
+    /// @brief Get a MatrixPtr to an edge's cost matrix. Rarely useful - use
+    ///        getEdgeCosts where possible.
+    /// @param EId Edge id.
+    /// @return MatrixPtr to edge cost matrix.
+    ///
+    ///   This method is primarily useful for duplicating costs quickly by
+    /// bypassing the cost allocator. See addEdgeBypassingCostAllocator. Prefer
+    /// getEdgeCosts when dealing with edge cost values.
+    const MatrixPtr& getEdgeCostsPtr(EdgeId EId) const {
+      return getEdge(EId).Costs;
+    }
+
+    /// @brief Get an edge's cost matrix.
     /// @param EId Edge id.
     /// @return Edge cost matrix.
-    const Matrix& getEdgeCosts(EdgeId EId) const { return *getEdge(EId).Costs; }
-
-    EdgeMetadata& getEdgeMetadata(EdgeId NId) {
-      return getEdge(NId).Metadata;
+    const Matrix& getEdgeCosts(EdgeId EId) const {
+      return *getEdge(EId).Costs;
     }
 
-    const EdgeMetadata& getEdgeMetadata(EdgeId NId) const {
-      return getEdge(NId).Metadata;
+    EdgeMetadata& getEdgeMetadata(EdgeId EId) {
+      return getEdge(EId).Metadata;
     }
 
-    /// \brief Get the first node connected to this edge.
+    const EdgeMetadata& getEdgeMetadata(EdgeId EId) const {
+      return getEdge(EId).Metadata;
+    }
+
+    /// @brief Get the first node connected to this edge.
     /// @param EId Edge id.
     /// @return The first node connected to the given edge.
     NodeId getEdgeNode1Id(EdgeId EId) {
       return getEdge(EId).getN1Id();
     }
 
-    /// \brief Get the second node connected to this edge.
+    /// @brief Get the second node connected to this edge.
     /// @param EId Edge id.
     /// @return The second node connected to the given edge.
     NodeId getEdgeNode2Id(EdgeId EId) {
       return getEdge(EId).getN2Id();
     }
 
-    /// \brief Get the "other" node connected to this edge.
+    /// @brief Get the "other" node connected to this edge.
     /// @param EId Edge id.
     /// @param NId Node id for the "given" node.
     /// @return The iterator for the "other" node connected to this edge.
@@ -476,7 +571,7 @@
       return E.getN1Id();
     }
 
-    /// \brief Get the edge connecting two nodes.
+    /// @brief Get the edge connecting two nodes.
     /// @param N1Id First node id.
     /// @param N2Id Second node id.
     /// @return An id for edge (N1Id, N2Id) if such an edge exists,
@@ -491,7 +586,7 @@
       return invalidEdgeId();
     }
 
-    /// \brief Remove a node from the graph.
+    /// @brief Remove a node from the graph.
     /// @param NId Node id.
     void removeNode(NodeId NId) {
       if (Solver)
@@ -499,7 +594,7 @@
       NodeEntry &N = getNode(NId);
       // TODO: Can this be for-each'd?
       for (AdjEdgeItr AEItr = N.adjEdgesBegin(),
-                      AEEnd = N.adjEdgesEnd();
+             AEEnd = N.adjEdgesEnd();
            AEItr != AEEnd;) {
         EdgeId EId = *AEItr;
         ++AEItr;
@@ -508,7 +603,7 @@
       FreeNodeIds.push_back(NId);
     }
 
-    /// \brief Disconnect an edge from the given node.
+    /// @brief Disconnect an edge from the given node.
     ///
     /// Removes the given edge from the adjacency list of the given node.
     /// This operation leaves the edge in an 'asymmetric' state: It will no
@@ -541,14 +636,14 @@
       E.disconnectFrom(*this, NId);
     }
 
-    /// \brief Convenience method to disconnect all neighbours from the given
+    /// @brief Convenience method to disconnect all neighbours from the given
     ///        node.
     void disconnectAllNeighborsFromNode(NodeId NId) {
       for (auto AEId : adjEdgeIds(NId))
         disconnectEdge(AEId, getEdgeOtherNodeId(AEId, NId));
     }
 
-    /// \brief Re-attach an edge to its nodes.
+    /// @brief Re-attach an edge to its nodes.
     ///
     /// Adds an edge that had been previously disconnected back into the
     /// adjacency set of the nodes that the edge connects.
@@ -559,7 +654,7 @@
         Solver->handleReconnectEdge(EId, NId);
     }
 
-    /// \brief Remove an edge from the graph.
+    /// @brief Remove an edge from the graph.
     /// @param EId Edge id.
     void removeEdge(EdgeId EId) {
       if (Solver)
@@ -570,7 +665,7 @@
       Edges[EId].invalidate();
     }
 
-    /// \brief Remove all nodes and edges from the graph.
+    /// @brief Remove all nodes and edges from the graph.
     void clear() {
       Nodes.clear();
       FreeNodeIds.clear();
@@ -578,9 +673,9 @@
       FreeEdgeIds.clear();
     }
 
-    /// \brief Dump a graph to an output stream.
+    /// @brief Dump a graph to an output stream.
     template <typename OStream>
-    void dump(OStream &OS) {
+    void dumpToStream(OStream &OS) {
       OS << nodeIds().size() << " " << edgeIds().size() << "\n";
 
       for (auto NId : nodeIds()) {
@@ -613,7 +708,12 @@
       }
     }
 
-    /// \brief Print a representation of this graph in DOT format.
+    /// @brief Dump this graph to dbgs().
+    void dump() {
+      dumpToStream(dbgs());
+    }
+
+    /// @brief Print a representation of this graph in DOT format.
     /// @param OS Output stream to print on.
     template <typename OStream>
     void printDot(OStream &OS) {
@@ -637,6 +737,7 @@
     }
   };
 
-}
+}  // namespace PBQP
+}  // namespace llvm
 
 #endif // LLVM_CODEGEN_PBQP_GRAPH_HPP

diff --git a/include/llvm/CodeGen/PBQP/Math.h b/include/llvm/CodeGen/PBQP/Math.h
index 69a9d83..2792608 100644
--- a/include/llvm/CodeGen/PBQP/Math.h
+++ b/include/llvm/CodeGen/PBQP/Math.h

@@ -10,17 +10,19 @@
 #ifndef LLVM_CODEGEN_PBQP_MATH_H
 #define LLVM_CODEGEN_PBQP_MATH_H
 
+#include "llvm/ADT/Hashing.h"
 #include <algorithm>
 #include <cassert>
 #include <functional>
 
+namespace llvm {
 namespace PBQP {
 
 typedef float PBQPNum;
 
 /// \brief PBQP Vector class.
 class Vector {
-  friend class VectorComparator;
+  friend hash_code hash_value(const Vector &);
 public:
 
   /// \brief Construct a PBQP vector of the given size.
@@ -136,21 +138,12 @@
   PBQPNum *Data;
 };
 
-class VectorComparator {
-public:
-  bool operator()(const Vector &A, const Vector &B) {
-    if (A.Length < B.Length)
-      return true;
-    if (B.Length < A.Length)
-      return false;
-    char *AData = reinterpret_cast<char*>(A.Data);
-    char *BData = reinterpret_cast<char*>(B.Data);
-    return std::lexicographical_compare(AData,
-                                        AData + A.Length * sizeof(PBQPNum),
-                                        BData,
-                                        BData + A.Length * sizeof(PBQPNum));
-  }
-};
+/// \brief Return a hash_code for the given vector.
+inline hash_code hash_value(const Vector &V) {
+  unsigned *VBegin = reinterpret_cast<unsigned*>(V.Data);
+  unsigned *VEnd = reinterpret_cast<unsigned*>(V.Data + V.Length);
+  return hash_combine(V.Length, hash_combine_range(VBegin, VEnd));
+}
 
 /// \brief Output a textual representation of the given vector on the given
 ///        output stream.
@@ -166,11 +159,10 @@
   return OS;
 }
 
-
 /// \brief PBQP Matrix class
 class Matrix {
 private:
-  friend class MatrixComparator;
+  friend hash_code hash_value(const Matrix &);
 public:
 
   /// \brief Construct a PBQP Matrix with the given dimensions.
@@ -384,24 +376,12 @@
   PBQPNum *Data;
 };
 
-class MatrixComparator {
-public:
-  bool operator()(const Matrix &A, const Matrix &B) {
-    if (A.Rows < B.Rows)
-      return true;
-    if (B.Rows < A.Rows)
-      return false;
-    if (A.Cols < B.Cols)
-      return true;
-    if (B.Cols < A.Cols)
-      return false;
-    char *AData = reinterpret_cast<char*>(A.Data);
-    char *BData = reinterpret_cast<char*>(B.Data);
-    return std::lexicographical_compare(
-             AData, AData + (A.Rows * A.Cols * sizeof(PBQPNum)),
-             BData, BData + (A.Rows * A.Cols * sizeof(PBQPNum)));
-  }
-};
+/// \brief Return a hash_code for the given matrix.
+inline hash_code hash_value(const Matrix &M) {
+  unsigned *MBegin = reinterpret_cast<unsigned*>(M.Data);
+  unsigned *MEnd = reinterpret_cast<unsigned*>(M.Data + (M.Rows * M.Cols));
+  return hash_combine(M.Rows, M.Cols, hash_combine_range(MBegin, MEnd));
+}
 
 /// \brief Output a textual representation of the given matrix on the given
 ///        output stream.
@@ -409,7 +389,7 @@
 OStream& operator<<(OStream &OS, const Matrix &M) {
   assert((M.getRows() != 0) && "Zero-row matrix badness.");
   for (unsigned i = 0; i < M.getRows(); ++i)
-    OS << M.getRowAsVector(i);
+    OS << M.getRowAsVector(i) << "\n";
   return OS;
 }
 
@@ -424,6 +404,11 @@
 };
 
 template <typename Metadata>
+inline hash_code hash_value(const MDVector<Metadata> &V) {
+  return hash_value(static_cast<const Vector&>(V));
+}
+
+template <typename Metadata>
 class MDMatrix : public Matrix {
 public:
   MDMatrix(const Matrix &m) : Matrix(m), md(*this) { }
@@ -433,6 +418,12 @@
   Metadata md;
 };
 
+template <typename Metadata>
+inline hash_code hash_value(const MDMatrix<Metadata> &M) {
+  return hash_value(static_cast<const Matrix&>(M));
 }
 
+} // namespace PBQP
+} // namespace llvm
+
 #endif // LLVM_CODEGEN_PBQP_MATH_H

diff --git a/include/llvm/CodeGen/PBQP/ReductionRules.h b/include/llvm/CodeGen/PBQP/ReductionRules.h
index a55a060..21fde4d 100644
--- a/include/llvm/CodeGen/PBQP/ReductionRules.h
+++ b/include/llvm/CodeGen/PBQP/ReductionRules.h

@@ -11,13 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_REDUCTIONRULES_H
-#define LLVM_REDUCTIONRULES_H
+#ifndef LLVM_CODEGEN_PBQP_REDUCTIONRULES_H
+#define LLVM_CODEGEN_PBQP_REDUCTIONRULES_H
 
 #include "Graph.h"
 #include "Math.h"
 #include "Solution.h"
 
+namespace llvm {
 namespace PBQP {
 
   /// \brief Reduce a node of degree one.
@@ -186,6 +187,7 @@
     return s;
   }
 
-}
+} // namespace PBQP
+} // namespace llvm
 
-#endif // LLVM_REDUCTIONRULES_H
+#endif

diff --git a/include/llvm/CodeGen/PBQP/RegAllocSolver.h b/include/llvm/CodeGen/PBQP/RegAllocSolver.h
deleted file mode 100644
index 977c348..0000000
--- a/include/llvm/CodeGen/PBQP/RegAllocSolver.h
+++ /dev/null

@@ -1,359 +0,0 @@
-//===-- RegAllocSolver.h - Heuristic PBQP Solver for reg alloc --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Heuristic PBQP solver for register allocation problems. This solver uses a
-// graph reduction approach. Nodes of degree 0, 1 and 2 are eliminated with
-// optimality-preserving rules (see ReductionRules.h). When no low-degree (<3)
-// nodes are present, a heuristic derived from Brigg's graph coloring approach
-// is used.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_PBQP_REGALLOCSOLVER_H
-#define LLVM_CODEGEN_PBQP_REGALLOCSOLVER_H
-
-#include "CostAllocator.h"
-#include "Graph.h"
-#include "ReductionRules.h"
-#include "Solution.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <limits>
-#include <vector>
-
-namespace PBQP {
-
-  namespace RegAlloc {
-
-    /// \brief Metadata to speed allocatability test.
-    ///
-    /// Keeps track of the number of infinities in each row and column.
-    class MatrixMetadata {
-    private:
-      MatrixMetadata(const MatrixMetadata&);
-      void operator=(const MatrixMetadata&);
-    public:
-      MatrixMetadata(const PBQP::Matrix& M)
-        : WorstRow(0), WorstCol(0),
-          UnsafeRows(new bool[M.getRows() - 1]()),
-          UnsafeCols(new bool[M.getCols() - 1]()) {
-
-        unsigned* ColCounts = new unsigned[M.getCols() - 1]();
-
-        for (unsigned i = 1; i < M.getRows(); ++i) {
-          unsigned RowCount = 0;
-          for (unsigned j = 1; j < M.getCols(); ++j) {
-            if (M[i][j] == std::numeric_limits<PBQP::PBQPNum>::infinity()) {
-              ++RowCount;
-              ++ColCounts[j - 1];
-              UnsafeRows[i - 1] = true;
-              UnsafeCols[j - 1] = true;
-            }
-          }
-          WorstRow = std::max(WorstRow, RowCount);
-        }
-        unsigned WorstColCountForCurRow =
-          *std::max_element(ColCounts, ColCounts + M.getCols() - 1);
-        WorstCol = std::max(WorstCol, WorstColCountForCurRow);
-        delete[] ColCounts;
-      }
-
-      ~MatrixMetadata() {
-        delete[] UnsafeRows;
-        delete[] UnsafeCols;
-      }
-
-      unsigned getWorstRow() const { return WorstRow; }
-      unsigned getWorstCol() const { return WorstCol; }
-      const bool* getUnsafeRows() const { return UnsafeRows; }
-      const bool* getUnsafeCols() const { return UnsafeCols; }
-
-    private:
-      unsigned WorstRow, WorstCol;
-      bool* UnsafeRows;
-      bool* UnsafeCols;
-    };
-
-    class NodeMetadata {
-    public:
-      typedef enum { Unprocessed,
-                     OptimallyReducible,
-                     ConservativelyAllocatable,
-                     NotProvablyAllocatable } ReductionState;
-
-      NodeMetadata() : RS(Unprocessed), DeniedOpts(0), OptUnsafeEdges(nullptr){}
-      ~NodeMetadata() { delete[] OptUnsafeEdges; }
-
-      void setup(const Vector& Costs) {
-        NumOpts = Costs.getLength() - 1;
-        OptUnsafeEdges = new unsigned[NumOpts]();
-      }
-
-      ReductionState getReductionState() const { return RS; }
-      void setReductionState(ReductionState RS) { this->RS = RS; }
-
-      void handleAddEdge(const MatrixMetadata& MD, bool Transpose) {
-        DeniedOpts += Transpose ? MD.getWorstCol() : MD.getWorstRow();
-        const bool* UnsafeOpts =
-          Transpose ? MD.getUnsafeCols() : MD.getUnsafeRows();
-        for (unsigned i = 0; i < NumOpts; ++i)
-          OptUnsafeEdges[i] += UnsafeOpts[i];
-      }
-
-      void handleRemoveEdge(const MatrixMetadata& MD, bool Transpose) {
-        DeniedOpts -= Transpose ? MD.getWorstCol() : MD.getWorstRow();
-        const bool* UnsafeOpts =
-          Transpose ? MD.getUnsafeCols() : MD.getUnsafeRows();
-        for (unsigned i = 0; i < NumOpts; ++i)
-          OptUnsafeEdges[i] -= UnsafeOpts[i];
-      }
-
-      bool isConservativelyAllocatable() const {
-        return (DeniedOpts < NumOpts) ||
-               (std::find(OptUnsafeEdges, OptUnsafeEdges + NumOpts, 0) !=
-                  OptUnsafeEdges + NumOpts);
-      }
-
-    private:
-      ReductionState RS;
-      unsigned NumOpts;
-      unsigned DeniedOpts;
-      unsigned* OptUnsafeEdges;
-    };
-
-    class RegAllocSolverImpl {
-    private:
-      typedef PBQP::MDMatrix<MatrixMetadata> RAMatrix;
-    public:
-      typedef PBQP::Vector RawVector;
-      typedef PBQP::Matrix RawMatrix;
-      typedef PBQP::Vector Vector;
-      typedef RAMatrix     Matrix;
-      typedef PBQP::PoolCostAllocator<
-                Vector, PBQP::VectorComparator,
-                Matrix, PBQP::MatrixComparator> CostAllocator;
-
-      typedef PBQP::GraphBase::NodeId NodeId;
-      typedef PBQP::GraphBase::EdgeId EdgeId;
-
-      typedef RegAlloc::NodeMetadata NodeMetadata;
-
-      struct EdgeMetadata { };
-
-      typedef PBQP::Graph<RegAllocSolverImpl> Graph;
-
-      RegAllocSolverImpl(Graph &G) : G(G) {}
-
-      Solution solve() {
-        G.setSolver(*this);
-        Solution S;
-        setup();
-        S = backpropagate(G, reduce());
-        G.unsetSolver();
-        return S;
-      }
-
-      void handleAddNode(NodeId NId) {
-        G.getNodeMetadata(NId).setup(G.getNodeCosts(NId));
-      }
-      void handleRemoveNode(NodeId NId) {}
-      void handleSetNodeCosts(NodeId NId, const Vector& newCosts) {}
-
-      void handleAddEdge(EdgeId EId) {
-        handleReconnectEdge(EId, G.getEdgeNode1Id(EId));
-        handleReconnectEdge(EId, G.getEdgeNode2Id(EId));
-      }
-
-      void handleRemoveEdge(EdgeId EId) {
-        handleDisconnectEdge(EId, G.getEdgeNode1Id(EId));
-        handleDisconnectEdge(EId, G.getEdgeNode2Id(EId));
-      }
-
-      void handleDisconnectEdge(EdgeId EId, NodeId NId) {
-        NodeMetadata& NMd = G.getNodeMetadata(NId);
-        const MatrixMetadata& MMd = G.getEdgeCosts(EId).getMetadata();
-        NMd.handleRemoveEdge(MMd, NId == G.getEdgeNode2Id(EId));
-        if (G.getNodeDegree(NId) == 3) {
-          // This node is becoming optimally reducible.
-          moveToOptimallyReducibleNodes(NId);
-        } else if (NMd.getReductionState() ==
-                     NodeMetadata::NotProvablyAllocatable &&
-                   NMd.isConservativelyAllocatable()) {
-          // This node just became conservatively allocatable.
-          moveToConservativelyAllocatableNodes(NId);
-        }
-      }
-
-      void handleReconnectEdge(EdgeId EId, NodeId NId) {
-        NodeMetadata& NMd = G.getNodeMetadata(NId);
-        const MatrixMetadata& MMd = G.getEdgeCosts(EId).getMetadata();
-        NMd.handleAddEdge(MMd, NId == G.getEdgeNode2Id(EId));
-      }
-
-      void handleSetEdgeCosts(EdgeId EId, const Matrix& NewCosts) {
-        handleRemoveEdge(EId);
-
-        NodeId N1Id = G.getEdgeNode1Id(EId);
-        NodeId N2Id = G.getEdgeNode2Id(EId);
-        NodeMetadata& N1Md = G.getNodeMetadata(N1Id);
-        NodeMetadata& N2Md = G.getNodeMetadata(N2Id);
-        const MatrixMetadata& MMd = NewCosts.getMetadata();
-        N1Md.handleAddEdge(MMd, N1Id != G.getEdgeNode1Id(EId));
-        N2Md.handleAddEdge(MMd, N2Id != G.getEdgeNode1Id(EId));
-      }
-
-    private:
-
-      void removeFromCurrentSet(NodeId NId) {
-        switch (G.getNodeMetadata(NId).getReductionState()) {
-          case NodeMetadata::Unprocessed: break;
-          case NodeMetadata::OptimallyReducible:
-            assert(OptimallyReducibleNodes.find(NId) !=
-                     OptimallyReducibleNodes.end() &&
-                   "Node not in optimally reducible set.");
-            OptimallyReducibleNodes.erase(NId);
-            break;
-          case NodeMetadata::ConservativelyAllocatable:
-            assert(ConservativelyAllocatableNodes.find(NId) !=
-                     ConservativelyAllocatableNodes.end() &&
-                   "Node not in conservatively allocatable set.");
-            ConservativelyAllocatableNodes.erase(NId);
-            break;
-          case NodeMetadata::NotProvablyAllocatable:
-            assert(NotProvablyAllocatableNodes.find(NId) !=
-                     NotProvablyAllocatableNodes.end() &&
-                   "Node not in not-provably-allocatable set.");
-            NotProvablyAllocatableNodes.erase(NId);
-            break;
-        }
-      }
-
-      void moveToOptimallyReducibleNodes(NodeId NId) {
-        removeFromCurrentSet(NId);
-        OptimallyReducibleNodes.insert(NId);
-        G.getNodeMetadata(NId).setReductionState(
-          NodeMetadata::OptimallyReducible);
-      }
-
-      void moveToConservativelyAllocatableNodes(NodeId NId) {
-        removeFromCurrentSet(NId);
-        ConservativelyAllocatableNodes.insert(NId);
-        G.getNodeMetadata(NId).setReductionState(
-          NodeMetadata::ConservativelyAllocatable);
-      }
-
-      void moveToNotProvablyAllocatableNodes(NodeId NId) {
-        removeFromCurrentSet(NId);
-        NotProvablyAllocatableNodes.insert(NId);
-        G.getNodeMetadata(NId).setReductionState(
-          NodeMetadata::NotProvablyAllocatable);
-      }
-
-      void setup() {
-        // Set up worklists.
-        for (auto NId : G.nodeIds()) {
-          if (G.getNodeDegree(NId) < 3)
-            moveToOptimallyReducibleNodes(NId);
-          else if (G.getNodeMetadata(NId).isConservativelyAllocatable())
-            moveToConservativelyAllocatableNodes(NId);
-          else
-            moveToNotProvablyAllocatableNodes(NId);
-        }
-      }
-
-      // Compute a reduction order for the graph by iteratively applying PBQP
-      // reduction rules. Locally optimal rules are applied whenever possible (R0,
-      // R1, R2). If no locally-optimal rules apply then any conservatively
-      // allocatable node is reduced. Finally, if no conservatively allocatable
-      // node exists then the node with the lowest spill-cost:degree ratio is
-      // selected.
-      std::vector<GraphBase::NodeId> reduce() {
-        assert(!G.empty() && "Cannot reduce empty graph.");
-
-        typedef GraphBase::NodeId NodeId;
-        std::vector<NodeId> NodeStack;
-
-        // Consume worklists.
-        while (true) {
-          if (!OptimallyReducibleNodes.empty()) {
-            NodeSet::iterator NItr = OptimallyReducibleNodes.begin();
-            NodeId NId = *NItr;
-            OptimallyReducibleNodes.erase(NItr);
-            NodeStack.push_back(NId);
-            switch (G.getNodeDegree(NId)) {
-              case 0:
-                break;
-              case 1:
-                applyR1(G, NId);
-                break;
-              case 2:
-                applyR2(G, NId);
-                break;
-              default: llvm_unreachable("Not an optimally reducible node.");
-            }
-          } else if (!ConservativelyAllocatableNodes.empty()) {
-            // Conservatively allocatable nodes will never spill. For now just
-            // take the first node in the set and push it on the stack. When we
-            // start optimizing more heavily for register preferencing, it may
-            // would be better to push nodes with lower 'expected' or worst-case
-            // register costs first (since early nodes are the most
-            // constrained).
-            NodeSet::iterator NItr = ConservativelyAllocatableNodes.begin();
-            NodeId NId = *NItr;
-            ConservativelyAllocatableNodes.erase(NItr);
-            NodeStack.push_back(NId);
-            G.disconnectAllNeighborsFromNode(NId);
-
-          } else if (!NotProvablyAllocatableNodes.empty()) {
-            NodeSet::iterator NItr =
-              std::min_element(NotProvablyAllocatableNodes.begin(),
-                               NotProvablyAllocatableNodes.end(),
-                               SpillCostComparator(G));
-            NodeId NId = *NItr;
-            NotProvablyAllocatableNodes.erase(NItr);
-            NodeStack.push_back(NId);
-            G.disconnectAllNeighborsFromNode(NId);
-          } else
-            break;
-        }
-
-        return NodeStack;
-      }
-
-      class SpillCostComparator {
-      public:
-        SpillCostComparator(const Graph& G) : G(G) {}
-        bool operator()(NodeId N1Id, NodeId N2Id) {
-          PBQPNum N1SC = G.getNodeCosts(N1Id)[0] / G.getNodeDegree(N1Id);
-          PBQPNum N2SC = G.getNodeCosts(N2Id)[0] / G.getNodeDegree(N2Id);
-          return N1SC < N2SC;
-        }
-      private:
-        const Graph& G;
-      };
-
-      Graph& G;
-      typedef std::set<NodeId> NodeSet;
-      NodeSet OptimallyReducibleNodes;
-      NodeSet ConservativelyAllocatableNodes;
-      NodeSet NotProvablyAllocatableNodes;
-    };
-
-    typedef Graph<RegAllocSolverImpl> Graph;
-
-    inline Solution solve(Graph& G) {
-      if (G.empty())
-        return Solution();
-      RegAllocSolverImpl RegAllocSolver(G);
-      return RegAllocSolver.solve();
-    }
-
-  }
-}
-
-#endif // LLVM_CODEGEN_PBQP_REGALLOCSOLVER_H

diff --git a/include/llvm/CodeGen/PBQP/Solution.h b/include/llvm/CodeGen/PBQP/Solution.h
index 3556e60..a3bfaeb 100644
--- a/include/llvm/CodeGen/PBQP/Solution.h
+++ b/include/llvm/CodeGen/PBQP/Solution.h

@@ -18,6 +18,7 @@
 #include "Math.h"
 #include <map>
 
+namespace llvm {
 namespace PBQP {
 
   /// \brief Represents a solution to a PBQP problem.
@@ -87,6 +88,7 @@
 
   };
 
-}
+} // namespace PBQP
+} // namespace llvm
 
 #endif // LLVM_CODEGEN_PBQP_SOLUTION_H

diff --git a/include/llvm/CodeGen/PBQPRAConstraint.h b/include/llvm/CodeGen/PBQPRAConstraint.h
new file mode 100644
index 0000000..833b9ba
--- /dev/null
+++ b/include/llvm/CodeGen/PBQPRAConstraint.h

@@ -0,0 +1,69 @@
+//===-- PBQPRAConstraint.h --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PBQPRAConstraint interface, for classes which apply
+// register allocation constraints to a PBQP graph, and PBQPRAConstraintList,
+// which applies a sequence of such constraints in the order they were added.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PBQPRACONSTRAINT_H
+#define LLVM_CODEGEN_PBQPRACONSTRAINT_H
+
+#include <memory>
+#include <vector>
+
+namespace llvm {
+namespace PBQP {
+namespace RegAlloc {
+// Forward declare PBQP graph class.
+class PBQPRAGraph;
+}
+}
+
+class LiveIntervals;
+class MachineBlockFrequencyInfo;
+class MachineFunction;
+class TargetRegisterInfo;
+
+typedef PBQP::RegAlloc::PBQPRAGraph PBQPRAGraph;
+
+/// @brief Abstract base for classes implementing PBQP register allocation
+///        constraints (e.g. Spill-costs, interference, coalescing).
+class PBQPRAConstraint {
+public:
+  virtual ~PBQPRAConstraint() = 0;
+  virtual void apply(PBQPRAGraph &G) = 0;
+private:
+  virtual void anchor();
+};
+
+/// @brief PBQP register allocation constraint composer.
+///
+///   Constraints added to this list will be applied, in the order that they are
+/// added, to the PBQP graph.
+class PBQPRAConstraintList : public PBQPRAConstraint {
+public:
+  void apply(PBQPRAGraph &G) override {
+    for (auto &C : Constraints)
+      C->apply(G);
+  }
+
+  void addConstraint(std::unique_ptr<PBQPRAConstraint> C) {
+    if (C)
+      Constraints.push_back(std::move(C));
+  }
+private:
+  std::vector<std::unique_ptr<PBQPRAConstraint>> Constraints;
+  void anchor() override;
+};
+
+}
+
+#endif /* LLVM_CODEGEN_PBQPRACONSTRAINT_H */

diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 17477fe..b672d9d 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h

@@ -178,6 +178,10 @@
   /// Return true if the optimized regalloc pipeline is enabled.
   bool getOptimizeRegAlloc() const;
 
+  /// Return true if the default global register allocator is in use and
+  /// has not been overridden on the command line with '-regalloc=...'
+  bool usingDefaultRegAlloc() const;
+
   /// Add common target configurable passes that perform LLVM IR to IR
   /// transforms following machine independent optimization.
   virtual void addIRPasses();
@@ -345,7 +349,7 @@
 
 /// List of target independent CodeGen pass IDs.
 namespace llvm {
-  FunctionPass *createAtomicExpandLoadLinkedPass(const TargetMachine *TM);
+  FunctionPass *createAtomicExpandPass(const TargetMachine *TM);
 
   /// \brief Create a basic TargetTransformInfo analysis pass.
   ///
@@ -372,8 +376,9 @@
   /// matching during instruction selection.
   FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr);
 
-  /// AtomicExpandLoadLinkedID -- FIXME
-  extern char &AtomicExpandLoadLinkedID;
+  /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg
+  /// or load-linked/store-conditional loops.
+  extern char &AtomicExpandID;
 
   /// MachineLoopInfo - This pass is a loop analysis pass.
   extern char &MachineLoopInfoID;
@@ -381,6 +386,9 @@
   /// MachineDominators - This pass is a machine dominators analysis pass.
   extern char &MachineDominatorsID;
 
+  /// MachineDominanceFrontier - Machine dominance frontier analysis pass.
+  extern char &MachineDominanceFrontierID;
+
   /// EdgeBundles analysis - Bundle machine CFG edges.
   extern char &EdgeBundlesID;
 
@@ -486,6 +494,10 @@
   /// inserting cmov instructions.
   extern char &EarlyIfConverterID;
 
+  /// This pass performs instruction combining using trace metrics to estimate
+  /// critical-path and resource depth.
+  extern char &MachineCombinerID;
+
   /// StackSlotColoring - This pass performs stack coloring and merging.
   /// It merges disjoint allocas to reduce the stack size.
   extern char &StackColoringID;
@@ -590,6 +602,10 @@
 
   /// createJumpInstrTables - This pass creates jump-instruction tables.
   ModulePass *createJumpInstrTablesPass();
+
+  /// createForwardControlFlowIntegrityPass - This pass adds control-flow
+  /// integrity.
+  ModulePass *createForwardControlFlowIntegrityPass();
 } // End llvm namespace
 
 /// This initializer registers TargetMachine constructor, so the pass being

diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index 6343bb7..540af08 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h

@@ -16,150 +16,503 @@
 #ifndef LLVM_CODEGEN_REGALLOCPBQP_H
 #define LLVM_CODEGEN_REGALLOCPBQP_H
 
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/PBQP/RegAllocSolver.h"
-#include <map>
-#include <set>
+#include "llvm/CodeGen/PBQPRAConstraint.h"
+#include "llvm/CodeGen/PBQP/CostAllocator.h"
+#include "llvm/CodeGen/PBQP/ReductionRules.h"
+#include "llvm/Support/ErrorHandling.h"
 
 namespace llvm {
+namespace PBQP {
+namespace RegAlloc {
 
-  class LiveIntervals;
-  class MachineBlockFrequencyInfo;
-  class MachineFunction;
-  class TargetRegisterInfo;
+/// @brief Spill option index.
+inline unsigned getSpillOptionIdx() { return 0; }
 
-  typedef PBQP::RegAlloc::Graph PBQPRAGraph;
+/// \brief Metadata to speed allocatability test.
+///
+/// Keeps track of the number of infinities in each row and column.
+class MatrixMetadata {
+private:
+  MatrixMetadata(const MatrixMetadata&);
+  void operator=(const MatrixMetadata&);
+public:
+  MatrixMetadata(const Matrix& M)
+    : WorstRow(0), WorstCol(0),
+      UnsafeRows(new bool[M.getRows() - 1]()),
+      UnsafeCols(new bool[M.getCols() - 1]()) {
 
-  /// This class wraps up a PBQP instance representing a register allocation
-  /// problem, plus the structures necessary to map back from the PBQP solution
-  /// to a register allocation solution. (i.e. The PBQP-node <--> vreg map,
-  /// and the PBQP option <--> storage location map).
-  class PBQPRAProblem {
-  public:
+    unsigned* ColCounts = new unsigned[M.getCols() - 1]();
 
-    typedef SmallVector<unsigned, 16> AllowedSet;
-
-    PBQPRAGraph& getGraph() { return graph; }
-
-    const PBQPRAGraph& getGraph() const { return graph; }
-
-    /// Record the mapping between the given virtual register and PBQP node,
-    /// and the set of allowed pregs for the vreg.
-    ///
-    /// If you are extending
-    /// PBQPBuilder you are unlikely to need this: Nodes and options for all
-    /// vregs will already have been set up for you by the base class.
-    template <typename AllowedRegsItr>
-    void recordVReg(unsigned vreg, PBQPRAGraph::NodeId nodeId,
-                    AllowedRegsItr arBegin, AllowedRegsItr arEnd) {
-      assert(node2VReg.find(nodeId) == node2VReg.end() && "Re-mapping node.");
-      assert(vreg2Node.find(vreg) == vreg2Node.end() && "Re-mapping vreg.");
-      assert(allowedSets[vreg].empty() && "vreg already has pregs.");
-
-      node2VReg[nodeId] = vreg;
-      vreg2Node[vreg] = nodeId;
-      std::copy(arBegin, arEnd, std::back_inserter(allowedSets[vreg]));
+    for (unsigned i = 1; i < M.getRows(); ++i) {
+      unsigned RowCount = 0;
+      for (unsigned j = 1; j < M.getCols(); ++j) {
+        if (M[i][j] == std::numeric_limits<PBQPNum>::infinity()) {
+          ++RowCount;
+          ++ColCounts[j - 1];
+          UnsafeRows[i - 1] = true;
+          UnsafeCols[j - 1] = true;
+        }
+      }
+      WorstRow = std::max(WorstRow, RowCount);
     }
+    unsigned WorstColCountForCurRow =
+      *std::max_element(ColCounts, ColCounts + M.getCols() - 1);
+    WorstCol = std::max(WorstCol, WorstColCountForCurRow);
+    delete[] ColCounts;
+  }
 
-    /// Get the virtual register corresponding to the given PBQP node.
-    unsigned getVRegForNode(PBQPRAGraph::NodeId nodeId) const;
+  unsigned getWorstRow() const { return WorstRow; }
+  unsigned getWorstCol() const { return WorstCol; }
+  const bool* getUnsafeRows() const { return UnsafeRows.get(); }
+  const bool* getUnsafeCols() const { return UnsafeCols.get(); }
 
-    /// Get the PBQP node corresponding to the given virtual register.
-    PBQPRAGraph::NodeId getNodeForVReg(unsigned vreg) const;
+private:
+  unsigned WorstRow, WorstCol;
+  std::unique_ptr<bool[]> UnsafeRows;
+  std::unique_ptr<bool[]> UnsafeCols;
+};
 
-    /// Returns true if the given PBQP option represents a physical register,
-    /// false otherwise.
-    bool isPRegOption(unsigned vreg, unsigned option) const {
-      // At present we only have spills or pregs, so anything that's not a
-      // spill is a preg. (This might be extended one day to support remat).
-      return !isSpillOption(vreg, option);
-    }
+/// \brief Holds a vector of the allowed physical regs for a vreg.
+class AllowedRegVector {
+  friend hash_code hash_value(const AllowedRegVector &);
+public:
 
-    /// Returns true if the given PBQP option represents spilling, false
-    /// otherwise.
-    bool isSpillOption(unsigned vreg, unsigned option) const {
-      // We hardcode option zero as the spill option.
-      return option == 0;
-    }
+  AllowedRegVector() : NumOpts(0), Opts(nullptr) {}
 
-    /// Returns the allowed set for the given virtual register.
-    const AllowedSet& getAllowedSet(unsigned vreg) const;
+  AllowedRegVector(const std::vector<unsigned> &OptVec)
+    : NumOpts(OptVec.size()), Opts(new unsigned[NumOpts]) {
+    std::copy(OptVec.begin(), OptVec.end(), Opts.get());
+  }
 
-    /// Get PReg for option.
-    unsigned getPRegForOption(unsigned vreg, unsigned option) const;
+  AllowedRegVector(const AllowedRegVector &Other)
+    : NumOpts(Other.NumOpts), Opts(new unsigned[NumOpts]) {
+    std::copy(Other.Opts.get(), Other.Opts.get() + NumOpts, Opts.get());
+  }
 
-  private:
+  AllowedRegVector(AllowedRegVector &&Other)
+    : NumOpts(std::move(Other.NumOpts)), Opts(std::move(Other.Opts)) {}
 
-    typedef std::map<PBQPRAGraph::NodeId, unsigned>  Node2VReg;
-    typedef DenseMap<unsigned, PBQPRAGraph::NodeId> VReg2Node;
-    typedef DenseMap<unsigned, AllowedSet> AllowedSetMap;
+  AllowedRegVector& operator=(const AllowedRegVector &Other) {
+    NumOpts = Other.NumOpts;
+    Opts.reset(new unsigned[NumOpts]);
+    std::copy(Other.Opts.get(), Other.Opts.get() + NumOpts, Opts.get());
+    return *this;
+  }
 
-    PBQPRAGraph graph;
-    Node2VReg node2VReg;
-    VReg2Node vreg2Node;
+  AllowedRegVector& operator=(AllowedRegVector &&Other) {
+    NumOpts = std::move(Other.NumOpts);
+    Opts = std::move(Other.Opts);
+    return *this;
+  }
 
-    AllowedSetMap allowedSets;
+  unsigned size() const { return NumOpts; }
+  unsigned operator[](size_t I) const { return Opts[I]; }
 
-  };
+  bool operator==(const AllowedRegVector &Other) const {
+    if (NumOpts != Other.NumOpts)
+      return false;
+    return std::equal(Opts.get(), Opts.get() + NumOpts, Other.Opts.get());
+  }
 
-  /// Builds PBQP instances to represent register allocation problems. Includes
-  /// spill, interference and coalescing costs by default. You can extend this
-  /// class to support additional constraints for your architecture.
-  class PBQPBuilder {
-  private:
-    PBQPBuilder(const PBQPBuilder&) LLVM_DELETED_FUNCTION;
-    void operator=(const PBQPBuilder&) LLVM_DELETED_FUNCTION;
-  public:
+  bool operator!=(const AllowedRegVector &Other) const {
+    return !(*this == Other);
+  }
 
-    typedef std::set<unsigned> RegSet;
+private:
+  unsigned NumOpts;
+  std::unique_ptr<unsigned[]> Opts;
+};
 
-    /// Default constructor.
-    PBQPBuilder() {}
-
-    /// Clean up a PBQPBuilder.
-    virtual ~PBQPBuilder() {}
-
-    /// Build a PBQP instance to represent the register allocation problem for
-    /// the given MachineFunction.
-    virtual PBQPRAProblem *build(MachineFunction *mf, const LiveIntervals *lis,
-                                 const MachineBlockFrequencyInfo *mbfi,
-                                 const RegSet &vregs);
-  private:
-
-    void addSpillCosts(PBQP::Vector &costVec, PBQP::PBQPNum spillCost);
-
-    void addInterferenceCosts(PBQP::Matrix &costMat,
-                              const PBQPRAProblem::AllowedSet &vr1Allowed,
-                              const PBQPRAProblem::AllowedSet &vr2Allowed,
-                              const TargetRegisterInfo *tri);
-  };
-
-  /// Extended builder which adds coalescing constraints to a problem.
-  class PBQPBuilderWithCoalescing : public PBQPBuilder {
-  public:
-
-    /// Build a PBQP instance to represent the register allocation problem for
-    /// the given MachineFunction.
-    PBQPRAProblem *build(MachineFunction *mf, const LiveIntervals *lis,
-                         const MachineBlockFrequencyInfo *mbfi,
-                         const RegSet &vregs) override;
-
-  private:
-
-    void addPhysRegCoalesce(PBQP::Vector &costVec, unsigned pregOption,
-                            PBQP::PBQPNum benefit);
-
-    void addVirtRegCoalesce(PBQP::Matrix &costMat,
-                            const PBQPRAProblem::AllowedSet &vr1Allowed,
-                            const PBQPRAProblem::AllowedSet &vr2Allowed,
-                            PBQP::PBQPNum benefit);
-  };
-
-  FunctionPass *
-  createPBQPRegisterAllocator(std::unique_ptr<PBQPBuilder> &builder,
-                              char *customPassID = nullptr);
+inline hash_code hash_value(const AllowedRegVector &OptRegs) {
+  unsigned *OStart = OptRegs.Opts.get();
+  unsigned *OEnd = OptRegs.Opts.get() + OptRegs.NumOpts;
+  return hash_combine(OptRegs.NumOpts,
+                      hash_combine_range(OStart, OEnd));
 }
 
+/// \brief Holds graph-level metadata relevant to PBQP RA problems.
+class GraphMetadata {
+private:
+  typedef ValuePool<AllowedRegVector> AllowedRegVecPool;
+public:
+
+  typedef AllowedRegVecPool::PoolRef AllowedRegVecRef;
+
+  GraphMetadata(MachineFunction &MF,
+                LiveIntervals &LIS,
+                MachineBlockFrequencyInfo &MBFI)
+    : MF(MF), LIS(LIS), MBFI(MBFI) {}
+
+  MachineFunction &MF;
+  LiveIntervals &LIS;
+  MachineBlockFrequencyInfo &MBFI;
+
+  void setNodeIdForVReg(unsigned VReg, GraphBase::NodeId NId) {
+    VRegToNodeId[VReg] = NId;
+  }
+
+  GraphBase::NodeId getNodeIdForVReg(unsigned VReg) const {
+    auto VRegItr = VRegToNodeId.find(VReg);
+    if (VRegItr == VRegToNodeId.end())
+      return GraphBase::invalidNodeId();
+    return VRegItr->second;
+  }
+
+  void eraseNodeIdForVReg(unsigned VReg) {
+    VRegToNodeId.erase(VReg);
+  }
+
+  AllowedRegVecRef getAllowedRegs(AllowedRegVector Allowed) {
+    return AllowedRegVecs.getValue(std::move(Allowed));
+  }
+
+private:
+  DenseMap<unsigned, GraphBase::NodeId> VRegToNodeId;
+  AllowedRegVecPool AllowedRegVecs;
+};
+
+/// \brief Holds solver state and other metadata relevant to each PBQP RA node.
+class NodeMetadata {
+public:
+  typedef RegAlloc::AllowedRegVector AllowedRegVector;
+
+  typedef enum { Unprocessed,
+                 OptimallyReducible,
+                 ConservativelyAllocatable,
+                 NotProvablyAllocatable } ReductionState;
+
+  NodeMetadata()
+    : RS(Unprocessed), NumOpts(0), DeniedOpts(0), OptUnsafeEdges(nullptr),
+      VReg(0) {}
+
+  // FIXME: Re-implementing default behavior to work around MSVC. Remove once
+  // MSVC synthesizes move constructors properly.
+  NodeMetadata(const NodeMetadata &Other)
+    : RS(Other.RS), NumOpts(Other.NumOpts), DeniedOpts(Other.DeniedOpts),
+      OptUnsafeEdges(new unsigned[NumOpts]), VReg(Other.VReg),
+      AllowedRegs(Other.AllowedRegs) {
+    std::copy(&Other.OptUnsafeEdges[0], &Other.OptUnsafeEdges[NumOpts],
+              &OptUnsafeEdges[0]);
+  }
+
+  // FIXME: Re-implementing default behavior to work around MSVC. Remove once
+  // MSVC synthesizes move constructors properly.
+  NodeMetadata(NodeMetadata &&Other)
+    : RS(Other.RS), NumOpts(Other.NumOpts), DeniedOpts(Other.DeniedOpts),
+      OptUnsafeEdges(std::move(Other.OptUnsafeEdges)), VReg(Other.VReg),
+      AllowedRegs(std::move(Other.AllowedRegs)) {}
+
+  // FIXME: Re-implementing default behavior to work around MSVC. Remove once
+  // MSVC synthesizes move constructors properly.
+  NodeMetadata& operator=(const NodeMetadata &Other) {
+    RS = Other.RS;
+    NumOpts = Other.NumOpts;
+    DeniedOpts = Other.DeniedOpts;
+    OptUnsafeEdges.reset(new unsigned[NumOpts]);
+    std::copy(Other.OptUnsafeEdges.get(), Other.OptUnsafeEdges.get() + NumOpts,
+              OptUnsafeEdges.get());
+    VReg = Other.VReg;
+    AllowedRegs = Other.AllowedRegs;
+    return *this;
+  }
+
+  // FIXME: Re-implementing default behavior to work around MSVC. Remove once
+  // MSVC synthesizes move constructors properly.
+  NodeMetadata& operator=(NodeMetadata &&Other) {
+    RS = Other.RS;
+    NumOpts = Other.NumOpts;
+    DeniedOpts = Other.DeniedOpts;
+    OptUnsafeEdges = std::move(Other.OptUnsafeEdges);
+    VReg = Other.VReg;
+    AllowedRegs = std::move(Other.AllowedRegs);
+    return *this;
+  }
+
+  void setVReg(unsigned VReg) { this->VReg = VReg; }
+  unsigned getVReg() const { return VReg; }
+
+  void setAllowedRegs(GraphMetadata::AllowedRegVecRef AllowedRegs) {
+    this->AllowedRegs = std::move(AllowedRegs);
+  }
+  const AllowedRegVector& getAllowedRegs() const { return *AllowedRegs; }
+
+  void setup(const Vector& Costs) {
+    NumOpts = Costs.getLength() - 1;
+    OptUnsafeEdges = std::unique_ptr<unsigned[]>(new unsigned[NumOpts]());
+  }
+
+  ReductionState getReductionState() const { return RS; }
+  void setReductionState(ReductionState RS) { this->RS = RS; }
+
+  void handleAddEdge(const MatrixMetadata& MD, bool Transpose) {
+    DeniedOpts += Transpose ? MD.getWorstCol() : MD.getWorstRow();
+    const bool* UnsafeOpts =
+      Transpose ? MD.getUnsafeCols() : MD.getUnsafeRows();
+    for (unsigned i = 0; i < NumOpts; ++i)
+      OptUnsafeEdges[i] += UnsafeOpts[i];
+  }
+
+  void handleRemoveEdge(const MatrixMetadata& MD, bool Transpose) {
+    DeniedOpts -= Transpose ? MD.getWorstCol() : MD.getWorstRow();
+    const bool* UnsafeOpts =
+      Transpose ? MD.getUnsafeCols() : MD.getUnsafeRows();
+    for (unsigned i = 0; i < NumOpts; ++i)
+      OptUnsafeEdges[i] -= UnsafeOpts[i];
+  }
+
+  bool isConservativelyAllocatable() const {
+    return (DeniedOpts < NumOpts) ||
+      (std::find(&OptUnsafeEdges[0], &OptUnsafeEdges[NumOpts], 0) !=
+       &OptUnsafeEdges[NumOpts]);
+  }
+
+private:
+  ReductionState RS;
+  unsigned NumOpts;
+  unsigned DeniedOpts;
+  std::unique_ptr<unsigned[]> OptUnsafeEdges;
+  unsigned VReg;
+  GraphMetadata::AllowedRegVecRef AllowedRegs;
+};
+
+class RegAllocSolverImpl {
+private:
+  typedef MDMatrix<MatrixMetadata> RAMatrix;
+public:
+  typedef PBQP::Vector RawVector;
+  typedef PBQP::Matrix RawMatrix;
+  typedef PBQP::Vector Vector;
+  typedef RAMatrix     Matrix;
+  typedef PBQP::PoolCostAllocator<Vector, Matrix> CostAllocator;
+
+  typedef GraphBase::NodeId NodeId;
+  typedef GraphBase::EdgeId EdgeId;
+
+  typedef RegAlloc::NodeMetadata NodeMetadata;
+  struct EdgeMetadata { };
+  typedef RegAlloc::GraphMetadata GraphMetadata;
+
+  typedef PBQP::Graph<RegAllocSolverImpl> Graph;
+
+  RegAllocSolverImpl(Graph &G) : G(G) {}
+
+  Solution solve() {
+    G.setSolver(*this);
+    Solution S;
+    setup();
+    S = backpropagate(G, reduce());
+    G.unsetSolver();
+    return S;
+  }
+
+  void handleAddNode(NodeId NId) {
+    G.getNodeMetadata(NId).setup(G.getNodeCosts(NId));
+  }
+  void handleRemoveNode(NodeId NId) {}
+  void handleSetNodeCosts(NodeId NId, const Vector& newCosts) {}
+
+  void handleAddEdge(EdgeId EId) {
+    handleReconnectEdge(EId, G.getEdgeNode1Id(EId));
+    handleReconnectEdge(EId, G.getEdgeNode2Id(EId));
+  }
+
+  void handleRemoveEdge(EdgeId EId) {
+    handleDisconnectEdge(EId, G.getEdgeNode1Id(EId));
+    handleDisconnectEdge(EId, G.getEdgeNode2Id(EId));
+  }
+
+  void handleDisconnectEdge(EdgeId EId, NodeId NId) {
+    NodeMetadata& NMd = G.getNodeMetadata(NId);
+    const MatrixMetadata& MMd = G.getEdgeCosts(EId).getMetadata();
+    NMd.handleRemoveEdge(MMd, NId == G.getEdgeNode2Id(EId));
+    if (G.getNodeDegree(NId) == 3) {
+      // This node is becoming optimally reducible.
+      moveToOptimallyReducibleNodes(NId);
+    } else if (NMd.getReductionState() ==
+               NodeMetadata::NotProvablyAllocatable &&
+               NMd.isConservativelyAllocatable()) {
+      // This node just became conservatively allocatable.
+      moveToConservativelyAllocatableNodes(NId);
+    }
+  }
+
+  void handleReconnectEdge(EdgeId EId, NodeId NId) {
+    NodeMetadata& NMd = G.getNodeMetadata(NId);
+    const MatrixMetadata& MMd = G.getEdgeCosts(EId).getMetadata();
+    NMd.handleAddEdge(MMd, NId == G.getEdgeNode2Id(EId));
+  }
+
+  void handleSetEdgeCosts(EdgeId EId, const Matrix& NewCosts) {
+    handleRemoveEdge(EId);
+
+    NodeId N1Id = G.getEdgeNode1Id(EId);
+    NodeId N2Id = G.getEdgeNode2Id(EId);
+    NodeMetadata& N1Md = G.getNodeMetadata(N1Id);
+    NodeMetadata& N2Md = G.getNodeMetadata(N2Id);
+    const MatrixMetadata& MMd = NewCosts.getMetadata();
+    N1Md.handleAddEdge(MMd, N1Id != G.getEdgeNode1Id(EId));
+    N2Md.handleAddEdge(MMd, N2Id != G.getEdgeNode1Id(EId));
+  }
+
+private:
+
+  void removeFromCurrentSet(NodeId NId) {
+    switch (G.getNodeMetadata(NId).getReductionState()) {
+    case NodeMetadata::Unprocessed: break;
+    case NodeMetadata::OptimallyReducible:
+      assert(OptimallyReducibleNodes.find(NId) !=
+             OptimallyReducibleNodes.end() &&
+             "Node not in optimally reducible set.");
+      OptimallyReducibleNodes.erase(NId);
+      break;
+    case NodeMetadata::ConservativelyAllocatable:
+      assert(ConservativelyAllocatableNodes.find(NId) !=
+             ConservativelyAllocatableNodes.end() &&
+             "Node not in conservatively allocatable set.");
+      ConservativelyAllocatableNodes.erase(NId);
+      break;
+    case NodeMetadata::NotProvablyAllocatable:
+      assert(NotProvablyAllocatableNodes.find(NId) !=
+             NotProvablyAllocatableNodes.end() &&
+             "Node not in not-provably-allocatable set.");
+      NotProvablyAllocatableNodes.erase(NId);
+      break;
+    }
+  }
+
+  void moveToOptimallyReducibleNodes(NodeId NId) {
+    removeFromCurrentSet(NId);
+    OptimallyReducibleNodes.insert(NId);
+    G.getNodeMetadata(NId).setReductionState(
+      NodeMetadata::OptimallyReducible);
+  }
+
+  void moveToConservativelyAllocatableNodes(NodeId NId) {
+    removeFromCurrentSet(NId);
+    ConservativelyAllocatableNodes.insert(NId);
+    G.getNodeMetadata(NId).setReductionState(
+      NodeMetadata::ConservativelyAllocatable);
+  }
+
+  void moveToNotProvablyAllocatableNodes(NodeId NId) {
+    removeFromCurrentSet(NId);
+    NotProvablyAllocatableNodes.insert(NId);
+    G.getNodeMetadata(NId).setReductionState(
+      NodeMetadata::NotProvablyAllocatable);
+  }
+
+  void setup() {
+    // Set up worklists.
+    for (auto NId : G.nodeIds()) {
+      if (G.getNodeDegree(NId) < 3)
+        moveToOptimallyReducibleNodes(NId);
+      else if (G.getNodeMetadata(NId).isConservativelyAllocatable())
+        moveToConservativelyAllocatableNodes(NId);
+      else
+        moveToNotProvablyAllocatableNodes(NId);
+    }
+  }
+
+  // Compute a reduction order for the graph by iteratively applying PBQP
+  // reduction rules. Locally optimal rules are applied whenever possible (R0,
+  // R1, R2). If no locally-optimal rules apply then any conservatively
+  // allocatable node is reduced. Finally, if no conservatively allocatable
+  // node exists then the node with the lowest spill-cost:degree ratio is
+  // selected.
+  std::vector<GraphBase::NodeId> reduce() {
+    assert(!G.empty() && "Cannot reduce empty graph.");
+
+    typedef GraphBase::NodeId NodeId;
+    std::vector<NodeId> NodeStack;
+
+    // Consume worklists.
+    while (true) {
+      if (!OptimallyReducibleNodes.empty()) {
+        NodeSet::iterator NItr = OptimallyReducibleNodes.begin();
+        NodeId NId = *NItr;
+        OptimallyReducibleNodes.erase(NItr);
+        NodeStack.push_back(NId);
+        switch (G.getNodeDegree(NId)) {
+        case 0:
+          break;
+        case 1:
+          applyR1(G, NId);
+          break;
+        case 2:
+          applyR2(G, NId);
+          break;
+        default: llvm_unreachable("Not an optimally reducible node.");
+        }
+      } else if (!ConservativelyAllocatableNodes.empty()) {
+        // Conservatively allocatable nodes will never spill. For now just
+        // take the first node in the set and push it on the stack. When we
+        // start optimizing more heavily for register preferencing, it may
+        // be better to push nodes with lower 'expected' or worst-case
+        // register costs first (since early nodes are the most
+        // constrained).
+        NodeSet::iterator NItr = ConservativelyAllocatableNodes.begin();
+        NodeId NId = *NItr;
+        ConservativelyAllocatableNodes.erase(NItr);
+        NodeStack.push_back(NId);
+        G.disconnectAllNeighborsFromNode(NId);
+
+      } else if (!NotProvablyAllocatableNodes.empty()) {
+        NodeSet::iterator NItr =
+          std::min_element(NotProvablyAllocatableNodes.begin(),
+                           NotProvablyAllocatableNodes.end(),
+                           SpillCostComparator(G));
+        NodeId NId = *NItr;
+        NotProvablyAllocatableNodes.erase(NItr);
+        NodeStack.push_back(NId);
+        G.disconnectAllNeighborsFromNode(NId);
+      } else
+        break;
+    }
+
+    return NodeStack;
+  }
+
+  class SpillCostComparator {
+  public:
+    SpillCostComparator(const Graph& G) : G(G) {}
+    bool operator()(NodeId N1Id, NodeId N2Id) {
+      PBQPNum N1SC = G.getNodeCosts(N1Id)[0] / G.getNodeDegree(N1Id);
+      PBQPNum N2SC = G.getNodeCosts(N2Id)[0] / G.getNodeDegree(N2Id);
+      return N1SC < N2SC;
+    }
+  private:
+    const Graph& G;
+  };
+
+  Graph& G;
+  typedef std::set<NodeId> NodeSet;
+  NodeSet OptimallyReducibleNodes;
+  NodeSet ConservativelyAllocatableNodes;
+  NodeSet NotProvablyAllocatableNodes;
+};
+
+class PBQPRAGraph : public PBQP::Graph<RegAllocSolverImpl> {
+private:
+  typedef PBQP::Graph<RegAllocSolverImpl> BaseT;
+public:
+  PBQPRAGraph(GraphMetadata Metadata) : BaseT(Metadata) {}
+};
+
+inline Solution solve(PBQPRAGraph& G) {
+  if (G.empty())
+    return Solution();
+  RegAllocSolverImpl RegAllocSolver(G);
+  return RegAllocSolver.solve();
+}
+
+} // namespace RegAlloc
+} // namespace PBQP
+
+/// @brief Create a PBQP register allocator instance.
+FunctionPass *
+createPBQPRegisterAllocator(char *customPassID = nullptr);
+
+} // namespace llvm
+
 #endif /* LLVM_CODEGEN_REGALLOCPBQP_H */

diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h
index 335dd7f..474861e 100644
--- a/include/llvm/CodeGen/RegisterScavenging.h
+++ b/include/llvm/CodeGen/RegisterScavenging.h

@@ -34,7 +34,7 @@
   MachineRegisterInfo* MRI;
   MachineBasicBlock *MBB;
   MachineBasicBlock::iterator MBBI;
-  unsigned NumPhysRegs;
+  unsigned NumRegUnits;
 
   /// Tracking - True if RegScavenger is currently tracking the liveness of 
   /// registers.
@@ -58,22 +58,19 @@
   /// A vector of information on scavenged registers.
   SmallVector<ScavengedInfo, 2> Scavenged;
 
-  /// CalleeSavedrRegs - A bitvector of callee saved registers for the target.
-  ///
-  BitVector CalleeSavedRegs;
-
-  /// RegsAvailable - The current state of all the physical registers immediately
-  /// before MBBI. One bit per physical register. If bit is set that means it's
-  /// available, unset means the register is currently being used.
-  BitVector RegsAvailable;
+  /// RegUnitsAvailable - The current state of each reg unit immediately
+  /// before MBBI. One bit per register unit. If bit is not set it means any
+  /// register containing that register unit is currently being used.
+  BitVector RegUnitsAvailable;
 
   // These BitVectors are only used internally to forward(). They are members
   // to avoid frequent reallocations.
-  BitVector KillRegs, DefRegs;
+  BitVector KillRegUnits, DefRegUnits;
+  BitVector TmpRegUnits;
 
 public:
   RegScavenger()
-    : MBB(nullptr), NumPhysRegs(0), Tracking(false) {}
+    : MBB(nullptr), NumRegUnits(0), Tracking(false) {}
 
   /// enterBasicBlock - Start tracking liveness from the begin of the specific
   /// basic block.
@@ -112,9 +109,9 @@
   MachineBasicBlock::iterator getCurrentPosition() const {
     return MBBI;
   }
-
-  /// getRegsUsed - return all registers currently in use in used.
-  void getRegsUsed(BitVector &used, bool includeReserved);
+  
+  /// isRegUsed - return if a specific register is currently used.
+  bool isRegUsed(unsigned Reg, bool includeReserved = true) const;
 
   /// getRegsAvailable - Return all available registers in the register class
   /// in Mask.
@@ -157,40 +154,29 @@
     return scavengeRegister(RegClass, MBBI, SPAdj);
   }
 
-  /// setUsed - Tell the scavenger a register is used.
+  /// setRegUsed - Tell the scavenger a register is used.
   ///
-  void setUsed(unsigned Reg);
+  void setRegUsed(unsigned Reg);
 private:
   /// isReserved - Returns true if a register is reserved. It is never "unused".
   bool isReserved(unsigned Reg) const { return MRI->isReserved(Reg); }
 
-  /// isUsed - Test if a register is currently being used.  When called by the
-  /// isAliasUsed function, we only check isReserved if this is the original
-  /// register, not an alias register.
+  /// setUsed / setUnused - Mark the state of one or a number of register units.
   ///
-  bool isUsed(unsigned Reg, bool CheckReserved = true) const   {
-    return !RegsAvailable.test(Reg) || (CheckReserved && isReserved(Reg));
+  void setUsed(BitVector &RegUnits) {
+    RegUnitsAvailable.reset(RegUnits);
+  }
+  void setUnused(BitVector &RegUnits) {
+    RegUnitsAvailable |= RegUnits;
   }
 
-  /// isAliasUsed - Is Reg or an alias currently in use?
-  bool isAliasUsed(unsigned Reg) const;
-
-  /// setUsed / setUnused - Mark the state of one or a number of registers.
-  ///
-  void setUsed(BitVector &Regs) {
-    RegsAvailable.reset(Regs);
-  }
-  void setUnused(BitVector &Regs) {
-    RegsAvailable |= Regs;
-  }
-
-  /// Processes the current instruction and fill the KillRegs and DefRegs bit
-  /// vectors.
+  /// Processes the current instruction and fill the KillRegUnits and
+  /// DefRegUnits bit vectors.
   void determineKillsAndDefs();
-
-  /// Add Reg and all its sub-registers to BV.
-  void addRegWithSubRegs(BitVector &BV, unsigned Reg);
-
+  
+  /// Add all Reg Units that Reg contains to BV.
+  void addRegUnits(BitVector &BV, unsigned Reg);
+  
   /// findSurvivorReg - Return the candidate register that is unused for the
   /// longest after StartMI. UseMI is set to the instruction where the search
   /// stopped.

diff --git a/include/llvm/CodeGen/RuntimeLibcalls.h b/include/llvm/CodeGen/RuntimeLibcalls.h
index 009b8a0..64c9c47 100644
--- a/include/llvm/CodeGen/RuntimeLibcalls.h
+++ b/include/llvm/CodeGen/RuntimeLibcalls.h

@@ -203,6 +203,16 @@
     COPYSIGN_F80,
     COPYSIGN_F128,
     COPYSIGN_PPCF128,
+    FMIN_F32,
+    FMIN_F64,
+    FMIN_F80,
+    FMIN_F128,
+    FMIN_PPCF128,
+    FMAX_F32,
+    FMAX_F64,
+    FMAX_F80,
+    FMAX_F128,
+    FMAX_PPCF128,
 
     // CONVERSION
     FPEXT_F64_F128,
@@ -210,6 +220,10 @@
     FPEXT_F32_F64,
     FPEXT_F16_F32,
     FPROUND_F32_F16,
+    FPROUND_F64_F16,
+    FPROUND_F80_F16,
+    FPROUND_F128_F16,
+    FPROUND_PPCF128_F16,
     FPROUND_F64_F32,
     FPROUND_F80_F32,
     FPROUND_F128_F32,

diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index e6754a2..00dd8f9 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h

@@ -75,8 +75,7 @@
   /// MachineInstrs.
   class ScheduleDAGInstrs : public ScheduleDAG {
   protected:
-    const MachineLoopInfo &MLI;
-    const MachineDominatorTree &MDT;
+    const MachineLoopInfo *MLI;
     const MachineFrameInfo *MFI;
 
     /// Live Intervals provides reaching defs in preRA scheduling.
@@ -154,8 +153,7 @@
 
   public:
     explicit ScheduleDAGInstrs(MachineFunction &mf,
-                               const MachineLoopInfo &mli,
-                               const MachineDominatorTree &mdt,
+                               const MachineLoopInfo *mli,
                                bool IsPostRAFlag,
                                bool RemoveKillFlags = false,
                                LiveIntervals *LIS = nullptr);

diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 5effb82..fbdaf0d 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h

@@ -16,9 +16,11 @@
 #define LLVM_CODEGEN_SELECTIONDAG_H
 
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/ilist.h"
 #include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/Support/RecyclingAllocator.h"
 #include "llvm/Target/TargetMachine.h"
@@ -126,6 +128,10 @@
       DbgValMap[Node].push_back(V);
   }
 
+  /// \brief Invalidate all DbgValues attached to the node and remove
+  /// it from the Node-to-DbgValues map.
+  void erase(const SDNode *Node);
+
   void clear() {
     DbgValMap.clear();
     DbgValues.clear();
@@ -166,7 +172,7 @@
 ///
 class SelectionDAG {
   const TargetMachine &TM;
-  const TargetSelectionDAGInfo &TSI;
+  const TargetSelectionDAGInfo *TSI;
   const TargetLowering *TLI;
   MachineFunction *MF;
   LLVMContext *Context;
@@ -266,7 +272,7 @@
   /// init - Prepare this SelectionDAG to process code in the given
   /// MachineFunction.
   ///
-  void init(MachineFunction &mf, const TargetLowering *TLI);
+  void init(MachineFunction &mf);
 
   /// clear - Clear state and free memory necessary to make this
   /// SelectionDAG ready to process a new block.
@@ -275,8 +281,9 @@
 
   MachineFunction &getMachineFunction() const { return *MF; }
   const TargetMachine &getTarget() const { return TM; }
+  const TargetSubtargetInfo &getSubtarget() const { return MF->getSubtarget(); }
   const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
-  const TargetSelectionDAGInfo &getSelectionDAGInfo() const { return TSI; }
+  const TargetSelectionDAGInfo &getSelectionDAGInfo() const { return *TSI; }
   LLVMContext *getContext() const {return Context; }
 
   /// viewGraph - Pop up a GraphViz/gv window with the DAG rendered using 'dot'.
@@ -364,6 +371,27 @@
   /// the graph.
   void Legalize();
 
+  /// \brief Transforms a SelectionDAG node and any operands to it into a node
+  /// that is compatible with the target instruction selector, as indicated by
+  /// the TargetLowering object.
+  ///
+  /// \returns true if \c N is a valid, legal node after calling this.
+  ///
+  /// This essentially runs a single recursive walk of the \c Legalize process
+  /// over the given node (and its operands). This can be used to incrementally
+  /// legalize the DAG. All of the nodes which are directly replaced,
+  /// potentially including N, are added to the output parameter \c
+  /// UpdatedNodes so that the delta to the DAG can be understood by the
+  /// caller.
+  ///
+  /// When this returns false, N has been legalized in a way that makes the
+  /// pointer passed in no longer valid. It may have even been deleted from the
+  /// DAG, and so it shouldn't be used further. When this returns true, the
+  /// N passed in is a legal node, and can be immediately processed as such.
+  /// This may still have done some work on the DAG, and will still populate
+  /// UpdatedNodes with any new nodes replacing those originally in the DAG.
+  bool LegalizeOp(SDNode *N, SmallSetVector<SDNode *, 16> &UpdatedNodes);
+
   /// LegalizeVectors - This transforms the SelectionDAG into a SelectionDAG
   /// that only uses vector math operations supported by the target.  This is
   /// necessary as a separate step from Legalize because unrolling a vector
@@ -546,6 +574,12 @@
     return getVectorShuffle(VT, dl, N1, N2, MaskElts.data());
   }
 
+  /// \brief Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to
+  /// the shuffle node in input but with swapped operands.
+  ///
+  /// Example: shuffle A, B, <0,5,2,7> -> shuffle B, A, <4,1,6,3>
+  SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV);
+
   /// getAnyExtOrTrunc - Convert Op, which must be of integer type, to the
   /// integer type VT, by either any-extending or truncating it.
   SDValue getAnyExtOrTrunc(SDValue Op, SDLoc DL, EVT VT);
@@ -719,7 +753,7 @@
                    SDValue SV, unsigned Align);
 
   /// getAtomicCmpSwap - Gets a node for an atomic cmpxchg op. There are two
-  /// valid Opcodes. ISD::ATOMIC_CMO_SWAP produces a the value loaded and a
+  /// valid Opcodes. ISD::ATOMIC_CMP_SWAP produces the value loaded and a
   /// chain result. ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS produces the value loaded,
   /// a success flag (initially i1), and a chain.
   SDValue getAtomicCmpSwap(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTs,
@@ -772,7 +806,8 @@
                               ArrayRef<SDValue> Ops,
                               EVT MemVT, MachinePointerInfo PtrInfo,
                               unsigned Align = 0, bool Vol = false,
-                              bool ReadMem = true, bool WriteMem = true);
+                              bool ReadMem = true, bool WriteMem = true,
+                              unsigned Size = 0);
 
   SDValue getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
                               ArrayRef<SDValue> Ops,
@@ -787,15 +822,15 @@
   SDValue getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
                   MachinePointerInfo PtrInfo, bool isVolatile,
                   bool isNonTemporal, bool isInvariant, unsigned Alignment,
-                  const MDNode *TBAAInfo = nullptr,
+                  const AAMDNodes &AAInfo = AAMDNodes(),
                   const MDNode *Ranges = nullptr);
   SDValue getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
                   MachineMemOperand *MMO);
   SDValue getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
                      SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo,
                      EVT MemVT, bool isVolatile,
-                     bool isNonTemporal, unsigned Alignment,
-                     const MDNode *TBAAInfo = nullptr);
+                     bool isNonTemporal, bool isInvariant, unsigned Alignment,
+                     const AAMDNodes &AAInfo = AAMDNodes());
   SDValue getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
                      SDValue Chain, SDValue Ptr, EVT MemVT,
                      MachineMemOperand *MMO);
@@ -806,7 +841,7 @@
                   SDValue Chain, SDValue Ptr, SDValue Offset,
                   MachinePointerInfo PtrInfo, EVT MemVT,
                   bool isVolatile, bool isNonTemporal, bool isInvariant,
-                  unsigned Alignment, const MDNode *TBAAInfo = nullptr,
+                  unsigned Alignment, const AAMDNodes &AAInfo = AAMDNodes(),
                   const MDNode *Ranges = nullptr);
   SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
                   EVT VT, SDLoc dl,
@@ -818,14 +853,14 @@
   SDValue getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr,
                    MachinePointerInfo PtrInfo, bool isVolatile,
                    bool isNonTemporal, unsigned Alignment,
-                   const MDNode *TBAAInfo = nullptr);
+                   const AAMDNodes &AAInfo = AAMDNodes());
   SDValue getStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr,
                    MachineMemOperand *MMO);
   SDValue getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr,
                         MachinePointerInfo PtrInfo, EVT TVT,
                         bool isNonTemporal, bool isVolatile,
                         unsigned Alignment,
-                        const MDNode *TBAAInfo = nullptr);
+                        const AAMDNodes &AAInfo = AAMDNodes());
   SDValue getTruncStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr,
                         EVT TVT, MachineMemOperand *MMO);
   SDValue getIndexedStore(SDValue OrigStoe, SDLoc dl, SDValue Base,
@@ -953,15 +988,18 @@
 
   /// getDbgValue - Creates a SDDbgValue node.
   ///
-  SDDbgValue *getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R,
-			  bool IsIndirect, uint64_t Off,
-                          DebugLoc DL, unsigned O);
-  /// Constant.
-  SDDbgValue *getConstantDbgValue(MDNode *MDPtr, const Value *C, uint64_t Off,
-				  DebugLoc DL, unsigned O);
-  /// Frame index.
-  SDDbgValue *getFrameIndexDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off,
-				    DebugLoc DL, unsigned O);
+  /// SDNode
+  SDDbgValue *getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, unsigned R,
+                          bool IsIndirect, uint64_t Off, DebugLoc DL,
+                          unsigned O);
+
+  /// Constant
+  SDDbgValue *getConstantDbgValue(MDNode *Var, MDNode *Expr, const Value *C,
+                                  uint64_t Off, DebugLoc DL, unsigned O);
+
+  /// FrameIndex
+  SDDbgValue *getFrameIndexDbgValue(MDNode *Var, MDNode *Expr, unsigned FI,
+                                    uint64_t Off, DebugLoc DL, unsigned O);
 
   /// RemoveDeadNode - Remove the specified node from the system. If any of its
   /// operands then becomes dead, remove them as well. Inform UpdateListener
@@ -1033,7 +1071,10 @@
     case ISD::SADDO:
     case ISD::UADDO:
     case ISD::ADDC:
-    case ISD::ADDE: return true;
+    case ISD::ADDE:
+    case ISD::FMINNUM:
+    case ISD::FMAXNUM:
+      return true;
     default: return false;
     }
   }
@@ -1192,6 +1233,7 @@
   unsigned getEVTAlignment(EVT MemoryVT) const;
 
 private:
+  void InsertNode(SDNode *N);
   bool RemoveNodeFromCSEMaps(SDNode *N);
   void AddModifiedNodeToCSEMaps(SDNode *N);
   SDNode *FindModifiedNodeSlot(SDNode *N, SDValue Op, void *&InsertPos);

diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h
index 520be40..2639402 100644
--- a/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/include/llvm/CodeGen/SelectionDAGISel.h

@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Pass.h"
 
 namespace llvm {
@@ -50,15 +51,16 @@
   AliasAnalysis *AA;
   GCFunctionInfo *GFI;
   CodeGenOpt::Level OptLevel;
+  const TargetInstrInfo *TII;
+  const TargetLowering *TLI;
+
   static char ID;
 
   explicit SelectionDAGISel(TargetMachine &tm,
                             CodeGenOpt::Level OL = CodeGenOpt::Default);
   virtual ~SelectionDAGISel();
 
-  const TargetLowering *getTargetLowering() const {
-    return TM.getTargetLowering();
-  }
+  const TargetLowering *getTargetLowering() const { return TLI; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 
@@ -238,6 +240,12 @@
                            const unsigned char *MatcherTable,
                            unsigned TableSize);
 
+  /// \brief Return true if complex patterns for this target can mutate the
+  /// DAG.
+  virtual bool ComplexPatternFuncMutatesDAG() const {
+    return false;
+  }
+
 private:
 
   // Calls to these functions are generated by tblgen.

diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 2231511..4715827 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h

@@ -117,11 +117,13 @@
 /// of information is represented with the SDValue value type.
 ///
 class SDValue {
+  friend struct DenseMapInfo<SDValue>;
+
   SDNode *Node;       // The node defining the value we are using.
   unsigned ResNo;     // Which return value of the node we are using.
 public:
   SDValue() : Node(nullptr), ResNo(0) {}
-  SDValue(SDNode *node, unsigned resno) : Node(node), ResNo(resno) {}
+  SDValue(SDNode *node, unsigned resno);
 
   /// get the index which selects a specific result in the SDNode
   unsigned getResNo() const { return ResNo; }
@@ -208,10 +210,14 @@
 
 template<> struct DenseMapInfo<SDValue> {
   static inline SDValue getEmptyKey() {
-    return SDValue((SDNode*)-1, -1U);
+    SDValue V;
+    V.ResNo = -1U;
+    return V;
   }
   static inline SDValue getTombstoneKey() {
-    return SDValue((SDNode*)-1, 0);
+    SDValue V;
+    V.ResNo = -2U;
+    return V;
   }
   static unsigned getHashValue(const SDValue &Val) {
     return ((unsigned)((uintptr_t)Val.getNode() >> 4) ^
@@ -411,6 +417,16 @@
     return NodeType >= ISD::FIRST_TARGET_MEMORY_OPCODE;
   }
 
+  /// Test if this node is a memory intrinsic (with valid pointer information).
+  /// INTRINSIC_W_CHAIN and INTRINSIC_VOID nodes are sometimes created for
+  /// non-memory intrinsics (with chains) that are not really instances of
+  /// MemSDNode. For such nodes, we need some extra state to determine the
+  /// proper classof relationship.
+  bool isMemIntrinsic() const {
+    return (NodeType == ISD::INTRINSIC_W_CHAIN ||
+            NodeType == ISD::INTRINSIC_VOID) && ((SubclassData >> 13) & 1);
+  }
+
   /// isMachineOpcode - Test if this node has a post-isel opcode, directly
   /// corresponding to a MachineInstr opcode.
   bool isMachineOpcode() const { return NodeType < 0; }
@@ -578,7 +594,7 @@
   /// changes.
   /// NOTE: This is still very expensive. Use carefully.
   bool hasPredecessorHelper(const SDNode *N,
-                            SmallPtrSet<const SDNode *, 32> &Visited,
+                            SmallPtrSetImpl<const SDNode *> &Visited,
                             SmallVectorImpl<const SDNode *> &Worklist) const;
 
   /// getNumOperands - Return the number of values used by this operation.
@@ -746,6 +762,10 @@
       ValueList(VTs.VTs), UseList(nullptr),
       NumOperands(Ops.size()), NumValues(VTs.NumVTs),
       debugLoc(dl), IROrder(Order) {
+    assert(NumOperands == Ops.size() &&
+           "NumOperands wasn't wide enough for its operands!");
+    assert(NumValues == VTs.NumVTs &&
+           "NumValues wasn't wide enough for its operands!");
     for (unsigned i = 0; i != Ops.size(); ++i) {
       OperandList[i].setUser(this);
       OperandList[i].setInitial(Ops[i]);
@@ -759,7 +779,10 @@
     : NodeType(Opc), OperandsNeedDelete(false), HasDebugValue(false),
       SubclassData(0), NodeId(-1), OperandList(nullptr), ValueList(VTs.VTs),
       UseList(nullptr), NumOperands(0), NumValues(VTs.NumVTs), debugLoc(dl),
-      IROrder(Order) {}
+      IROrder(Order) {
+    assert(NumValues == VTs.NumVTs &&
+           "NumValues wasn't wide enough for its operands!");
+  }
 
   /// InitOperands - Initialize the operands list of this with 1 operand.
   void InitOperands(SDUse *Ops, const SDValue &Op0) {
@@ -818,6 +841,8 @@
       Ops[i].setInitial(Vals[i]);
     }
     NumOperands = N;
+    assert(NumOperands == N &&
+           "NumOperands wasn't wide enough for its operands!");
     OperandList = Ops;
     checkForCycles(this);
   }
@@ -877,6 +902,13 @@
 
 // Define inline functions from the SDValue class.
 
+inline SDValue::SDValue(SDNode *node, unsigned resno)
+    : Node(node), ResNo(resno) {
+  assert((!Node || ResNo < Node->getNumValues()) &&
+         "Invalid result number for the given node!");
+  assert(ResNo < -2U && "Cannot use result numbers reserved for DenseMaps.");
+}
+
 inline unsigned SDValue::getOpcode() const {
   return Node->getOpcode();
 }
@@ -1088,8 +1120,8 @@
   // Returns the offset from the location of the access.
   int64_t getSrcValueOffset() const { return MMO->getOffset(); }
 
-  /// Returns the TBAAInfo that describes the dereference.
-  const MDNode *getTBAAInfo() const { return MMO->getTBAAInfo(); }
+  /// Returns the AA info that describes the dereference.
+  AAMDNodes getAAInfo() const { return MMO->getAAInfo(); }
 
   /// Returns the Ranges that describes the dereference.
   const MDNode *getRanges() const { return MMO->getRanges(); }
@@ -1145,6 +1177,7 @@
            N->getOpcode() == ISD::ATOMIC_LOAD_UMAX    ||
            N->getOpcode() == ISD::ATOMIC_LOAD         ||
            N->getOpcode() == ISD::ATOMIC_STORE        ||
+           N->isMemIntrinsic()                        ||
            N->isTargetMemoryOpcode();
   }
 };
@@ -1273,14 +1306,14 @@
                      ArrayRef<SDValue> Ops, EVT MemoryVT,
                      MachineMemOperand *MMO)
     : MemSDNode(Opc, Order, dl, VTs, Ops, MemoryVT, MMO) {
+    SubclassData |= 1u << 13;
   }
 
   // Methods to support isa and dyn_cast
   static bool classof(const SDNode *N) {
     // We lower some target intrinsics to their target opcode
     // early a node with a target opcode can be of this class
-    return N->getOpcode() == ISD::INTRINSIC_W_CHAIN ||
-           N->getOpcode() == ISD::INTRINSIC_VOID ||
+    return N->isMemIntrinsic()             ||
            N->getOpcode() == ISD::PREFETCH ||
            N->isTargetMemoryOpcode();
   }

diff --git a/include/llvm/CodeGen/StackMapLivenessAnalysis.h b/include/llvm/CodeGen/StackMapLivenessAnalysis.h
index 6f07546..f67a6e9 100644
--- a/include/llvm/CodeGen/StackMapLivenessAnalysis.h
+++ b/include/llvm/CodeGen/StackMapLivenessAnalysis.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_STACKMAP_LIVENESS_ANALYSIS_H
-#define LLVM_CODEGEN_STACKMAP_LIVENESS_ANALYSIS_H
+#ifndef LLVM_CODEGEN_STACKMAPLIVENESSANALYSIS_H
+#define LLVM_CODEGEN_STACKMAPLIVENESSANALYSIS_H
 
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -61,4 +61,4 @@
 
 } // llvm namespace
 
-#endif // LLVM_CODEGEN_STACKMAP_LIVENESS_ANALYSIS_H
+#endif

diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
index 5eddbb6..e343980 100644
--- a/include/llvm/CodeGen/StackMaps.h
+++ b/include/llvm/CodeGen/StackMaps.h

@@ -8,8 +8,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_STACKMAPS
-#define LLVM_STACKMAPS
+#ifndef LLVM_CODEGEN_STACKMAPS_H
+#define LLVM_CODEGEN_STACKMAPS_H
 
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
@@ -118,6 +118,12 @@
 
   StackMaps(AsmPrinter &AP);
 
+  void reset() {
+    CSInfos.clear();
+    ConstPool.clear();
+    FnStackSize.clear();
+  }
+
   /// \brief Generate a stackmap record for a stackmap instruction.
   ///
   /// MI must be a raw STACKMAP, not a PATCHPOINT.
@@ -136,7 +142,7 @@
 
   typedef SmallVector<Location, 8> LocationVec;
   typedef SmallVector<LiveOutReg, 8> LiveOutVec;
-  typedef MapVector<int64_t, int64_t> ConstantPool;
+  typedef MapVector<uint64_t, uint64_t> ConstantPool;
   typedef MapVector<const MCSymbol *, uint64_t> FnStackSizeMap;
 
   struct CallsiteInfo {
@@ -146,9 +152,9 @@
     LiveOutVec LiveOuts;
     CallsiteInfo() : CSOffsetExpr(nullptr), ID(0) {}
     CallsiteInfo(const MCExpr *CSOffsetExpr, uint64_t ID,
-                 LocationVec &Locations, LiveOutVec &LiveOuts)
-      : CSOffsetExpr(CSOffsetExpr), ID(ID), Locations(Locations),
-        LiveOuts(LiveOuts) {}
+                 LocationVec &&Locations, LiveOutVec &&LiveOuts)
+      : CSOffsetExpr(CSOffsetExpr), ID(ID), Locations(std::move(Locations)),
+        LiveOuts(std::move(LiveOuts)) {}
   };
 
   typedef std::vector<CallsiteInfo> CallsiteInfoList;
@@ -196,4 +202,4 @@
 
 }
 
-#endif // LLVM_STACKMAPS
+#endif

diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index 230d1ed..87f1401 100644
--- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h

@@ -43,7 +43,8 @@
 
   /// Given a constant with the SectionKind, return a section that it should be
   /// placed in.
-  const MCSection *getSectionForConstant(SectionKind Kind) const override;
+  const MCSection *getSectionForConstant(SectionKind Kind,
+                                         const Constant *C) const override;
 
   const MCSection *getExplicitSectionGlobal(const GlobalValue *GV,
                                         SectionKind Kind, Mangler &Mang,
@@ -100,7 +101,8 @@
                              SectionKind Kind, Mangler &Mang,
                              const TargetMachine &TM) const override;
 
-  const MCSection *getSectionForConstant(SectionKind Kind) const override;
+  const MCSection *getSectionForConstant(SectionKind Kind,
+                                         const Constant *C) const override;
 
   /// The mach-o version of this method defaults to returning a stub reference.
   const MCExpr *

diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h
index 690b70f..b613666 100644
--- a/include/llvm/CodeGen/TargetSchedule.h
+++ b/include/llvm/CodeGen/TargetSchedule.h

@@ -41,7 +41,7 @@
   unsigned MicroOpFactor; // Multiply to normalize microops to resource units.
   unsigned ResourceLCM;   // Resource units per cycle. Latency normalization factor.
 public:
-  TargetSchedModel(): STI(nullptr), TII(nullptr) {}
+  TargetSchedModel(): SchedModel(MCSchedModel::GetDefaultSchedModel()), STI(nullptr), TII(nullptr) {}
 
   /// \brief Initialize the machine model for instruction scheduling.
   ///
@@ -167,6 +167,7 @@
   /// if converter after moving it to TargetSchedModel).
   unsigned computeInstrLatency(const MachineInstr *MI,
                                bool UseDefaultDefLatency = true) const;
+  unsigned computeInstrLatency(unsigned Opcode) const;
 
   /// \brief Output dependency latency of a pair of defs of the same register.
   ///

diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index e9f6702..37696eb 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake

@@ -25,7 +25,7 @@
 #cmakedefine ENABLE_TIMESTAMPS ${ENABLE_TIMESTAMPS}
 
 /* Define to 1 if you have the `arc4random' function. */
-#cmakedefine HAVE_ARC4RANDOM
+#cmakedefine HAVE_DECL_ARC4RANDOM ${HAVE_DECL_ARC4RANDOM}
 
 /* Define to 1 if you have the `backtrace' function. */
 #cmakedefine HAVE_BACKTRACE ${HAVE_BACKTRACE}
@@ -179,9 +179,6 @@
 /* Define to 1 if you have the `shell32' library (-lshell32). */
 #cmakedefine HAVE_LIBSHELL32 ${HAVE_LIBSHELL32}
 
-/* Define to 1 if you have the `udis86' library (-ludis86). */
-#undef HAVE_LIBUDIS86
-
 /* Define to 1 if you have the 'z' library (-lz). */
 #cmakedefine HAVE_LIBZ ${HAVE_LIBZ}
 
@@ -191,6 +188,9 @@
 /* Define to 1 if you have the <limits.h> header file. */
 #cmakedefine HAVE_LIMITS_H ${HAVE_LIMITS_H}
 
+/* Define to 1 if you have the <link.h> header file. */
+#cmakedefine HAVE_LINK_H ${HAVE_LINK_H}
+
 /* Define if you can use -rdynamic. */
 #define HAVE_LINK_EXPORT_DYNAMIC 1
 
@@ -459,9 +459,6 @@
 /* Have host's ___chkstk */
 #cmakedefine HAVE____CHKSTK ${HAVE____CHKSTK}
 
-/* Linker version detected at compile time. */
-#undef HOST_LINK_VERSION
-
 /* Define if we link Polly to the tools */
 #cmakedefine LINK_POLLY_INTO_TOOLS
 
@@ -518,9 +515,6 @@
 /* Define to 1 if your <sys/time.h> declares `struct tm'. */
 #undef TM_IN_SYS_TIME
 
-/* Define if use udis86 library */
-#undef USE_UDIS86
-
 /* Type of 1st arg on ELM Callback */
 #cmakedefine WIN32_ELMCB_PCSTR ${WIN32_ELMCB_PCSTR}
 

diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in
index b5f7297..8fcf145 100644
--- a/include/llvm/Config/config.h.in
+++ b/include/llvm/Config/config.h.in

@@ -3,15 +3,21 @@
 #ifndef CONFIG_H
 #define CONFIG_H
 
-/* Exported configuration */
-#include "llvm/Config/llvm-config.h"
-
-/* Patch version of the LLVM API */
-#undef LLVM_VERSION_PATCH
-
 /* Bug report URL. */
 #undef BUG_REPORT_URL
 
+/* Define if we have libxml2 */
+#undef CLANG_HAVE_LIBXML
+
+/* Relative directory for resource files */
+#undef CLANG_RESOURCE_DIR
+
+/* Directories clang will search for headers */
+#undef C_INCLUDE_DIRS
+
+/* Default <path> to all compiler invocations for --sysroot=<path>. */
+#undef DEFAULT_SYSROOT
+
 /* Define if you want backtraces on crash */
 #undef ENABLE_BACKTRACES
 
@@ -24,6 +30,9 @@
 /* Define if timestamp information (e.g., __DATE__) is allowed */
 #undef ENABLE_TIMESTAMPS
 
+/* Directory where gcc is installed. */
+#undef GCC_INSTALL_PREFIX
+
 /* Define to 1 if you have the `backtrace' function. */
 #undef HAVE_BACKTRACE
 
@@ -65,6 +74,9 @@
 /* Define if dlopen() is available on this platform. */
 #undef HAVE_DLOPEN
 
+/* Define if the dot program is available */
+#undef HAVE_DOT
+
 /* Define to 1 if you have the <errno.h> header file. */
 #undef HAVE_ERRNO_H
 
@@ -161,9 +173,6 @@
 /* Define to 1 if you have the `shell32' library (-lshell32). */
 #undef HAVE_LIBSHELL32
 
-/* Define to 1 if you have the `udis86' library (-ludis86). */
-#undef HAVE_LIBUDIS86
-
 /* Define to 1 if you have the `z' library (-lz). */
 #undef HAVE_LIBZ
 
@@ -447,9 +456,96 @@
 /* Linker version detected at compile time. */
 #undef HOST_LINK_VERSION
 
+/* Installation directory for binary executables */
+#undef LLVM_BINDIR
+
+/* Time at which LLVM was configured */
+#undef LLVM_CONFIGTIME
+
+/* Installation directory for data files */
+#undef LLVM_DATADIR
+
+/* Target triple LLVM will generate code for by default */
+#undef LLVM_DEFAULT_TARGET_TRIPLE
+
+/* Installation directory for documentation */
+#undef LLVM_DOCSDIR
+
+/* Define if threads enabled */
+#undef LLVM_ENABLE_THREADS
+
 /* Define if zlib is enabled */
 #undef LLVM_ENABLE_ZLIB
 
+/* Installation directory for config files */
+#undef LLVM_ETCDIR
+
+/* Has gcc/MSVC atomic intrinsics */
+#undef LLVM_HAS_ATOMICS
+
+/* Host triple LLVM will be executed on */
+#undef LLVM_HOST_TRIPLE
+
+/* Installation directory for include files */
+#undef LLVM_INCLUDEDIR
+
+/* Installation directory for .info files */
+#undef LLVM_INFODIR
+
+/* Installation directory for man pages */
+#undef LLVM_MANDIR
+
+/* LLVM architecture name for the native architecture, if available */
+#undef LLVM_NATIVE_ARCH
+
+/* LLVM name for the native AsmParser init function, if available */
+#undef LLVM_NATIVE_ASMPARSER
+
+/* LLVM name for the native AsmPrinter init function, if available */
+#undef LLVM_NATIVE_ASMPRINTER
+
+/* LLVM name for the native Disassembler init function, if available */
+#undef LLVM_NATIVE_DISASSEMBLER
+
+/* LLVM name for the native Target init function, if available */
+#undef LLVM_NATIVE_TARGET
+
+/* LLVM name for the native TargetInfo init function, if available */
+#undef LLVM_NATIVE_TARGETINFO
+
+/* LLVM name for the native target MC init function, if available */
+#undef LLVM_NATIVE_TARGETMC
+
+/* Define if this is Unixish platform */
+#undef LLVM_ON_UNIX
+
+/* Define if this is Win32ish platform */
+#undef LLVM_ON_WIN32
+
+/* Define to path to dot program if found or 'echo dot' otherwise */
+#undef LLVM_PATH_DOT
+
+/* Installation prefix directory */
+#undef LLVM_PREFIX
+
+/* Define if we have the Intel JIT API runtime support library */
+#undef LLVM_USE_INTEL_JITEVENTS
+
+/* Define if we have the oprofile JIT-support library */
+#undef LLVM_USE_OPROFILE
+
+/* Major version of the LLVM API */
+#undef LLVM_VERSION_MAJOR
+
+/* Minor version of the LLVM API */
+#undef LLVM_VERSION_MINOR
+
+/* Patch version of the LLVM API */
+#undef LLVM_VERSION_PATCH
+
+/* LLVM version string */
+#undef LLVM_VERSION_STRING
+
 /* The shared library extension */
 #undef LTDL_SHLIB_EXT
 
@@ -487,9 +583,6 @@
 /* Define to 1 if your <sys/time.h> declares `struct tm'. */
 #undef TM_IN_SYS_TIME
 
-/* Define if use udis86 library */
-#undef USE_UDIS86
-
 /* Type of 1st arg on ELM Callback */
 #undef WIN32_ELMCB_PCSTR
 

diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake
index 5811164..77201e6 100644
--- a/include/llvm/Config/llvm-config.h.cmake
+++ b/include/llvm/Config/llvm-config.h.cmake

@@ -92,6 +92,9 @@
 /* Minor version of the LLVM API */
 #cmakedefine LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR}
 
+/* LLVM version string */
+#define LLVM_VERSION_STRING "${PACKAGE_VERSION}"
+
 /* Define if we link Polly to the tools */
 #cmakedefine LINK_POLLY_INTO_TOOLS
 

diff --git a/include/llvm/Config/llvm-config.h.in b/include/llvm/Config/llvm-config.h.in
index 5656240..2d6add7 100644
--- a/include/llvm/Config/llvm-config.h.in
+++ b/include/llvm/Config/llvm-config.h.in

@@ -92,4 +92,7 @@
 /* Minor version of the LLVM API */
 #undef LLVM_VERSION_MINOR
 
+/* LLVM version string */
+#undef LLVM_VERSION_STRING
+
 #endif

diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h
index c1aba01..3aa098d 100644
--- a/include/llvm/DebugInfo/DIContext.h
+++ b/include/llvm/DebugInfo/DIContext.h

@@ -66,11 +66,15 @@
   }
 };
 
+/// A DINameKind is passed to name search methods to specify a
+/// preference regarding the type of name resolution the caller wants.
+enum class DINameKind { None, ShortName, LinkageName };
+
 /// DILineInfoSpecifier - controls which fields of DILineInfo container
 /// should be filled with data.
 struct DILineInfoSpecifier {
   enum class FileLineInfoKind { None, Default, AbsoluteFilePath };
-  enum class FunctionNameKind { None, ShortName, LinkageName };
+  typedef DINameKind FunctionNameKind;
 
   FileLineInfoKind FLIKind;
   FunctionNameKind FNKind;
@@ -103,7 +107,11 @@
   DIDT_GnuPubtypes,
   DIDT_Str,
   DIDT_StrDwo,
-  DIDT_StrOffsetsDwo
+  DIDT_StrOffsetsDwo,
+  DIDT_AppleNames,
+  DIDT_AppleTypes,
+  DIDT_AppleNamespaces,
+  DIDT_AppleObjC
 };
 
 // In place of applying the relocations to the data we've read from disk we use
@@ -124,7 +132,7 @@
   virtual ~DIContext();
 
   /// getDWARFContext - get a context for binary DWARF data.
-  static DIContext *getDWARFContext(object::ObjectFile *);
+  static DIContext *getDWARFContext(const object::ObjectFile &Obj);
 
   virtual void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All) = 0;
 

diff --git a/include/llvm/DebugInfo/DWARFFormValue.h b/include/llvm/DebugInfo/DWARFFormValue.h
index d517a72..5bb6d1b 100644
--- a/include/llvm/DebugInfo/DWARFFormValue.h
+++ b/include/llvm/DebugInfo/DWARFFormValue.h

@@ -57,6 +57,13 @@
   bool isFormClass(FormClass FC) const;
 
   void dump(raw_ostream &OS, const DWARFUnit *U) const;
+
+  /// \brief Extracts a value in data at offset *offset_ptr.
+  ///
+  /// The passed DWARFUnit is allowed to be nullptr, in which
+  /// case no relocation processing will be performed and some
+  /// kinds of forms that depend on Unit information are disallowed.
+  /// \returns whether the extraction succeeded.
   bool extractValue(DataExtractor data, uint32_t *offset_ptr,
                     const DWARFUnit *u);
   bool isInlinedCStr() const {
@@ -70,6 +77,7 @@
   Optional<const char *> getAsCString(const DWARFUnit *U) const;
   Optional<uint64_t> getAsAddress(const DWARFUnit *U) const;
   Optional<uint64_t> getAsSectionOffset() const;
+  Optional<ArrayRef<uint8_t>> getAsBlock() const;
 
   bool skipValue(DataExtractor debug_info_data, uint32_t *offset_ptr,
                  const DWARFUnit *u) const;

diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index e5dab61..b9c0b61 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h

@@ -18,9 +18,11 @@
 #include "llvm-c/ExecutionEngine.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/IR/ValueMap.h"
 #include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/Object/Binary.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Target/TargetMachine.h"
@@ -39,9 +41,7 @@
 class GlobalVariable;
 class GlobalValue;
 class JITEventListener;
-class JITMemoryManager;
 class MachineCodeInfo;
-class Module;
 class MutexGuard;
 class ObjectCache;
 class RTDyldMemoryManager;
@@ -131,29 +131,19 @@
 protected:
   /// The list of Modules that we are JIT'ing from.  We use a SmallVector to
   /// optimize for the case where there is only one module.
-  SmallVector<Module*, 1> Modules;
+  SmallVector<std::unique_ptr<Module>, 1> Modules;
 
   void setDataLayout(const DataLayout *Val) { DL = Val; }
 
   /// getMemoryforGV - Allocate memory for a global variable.
   virtual char *getMemoryForGV(const GlobalVariable *GV);
 
-  // To avoid having libexecutionengine depend on the JIT and interpreter
-  // libraries, the execution engine implementations set these functions to ctor
-  // pointers at startup time if they are linked in.
-  static ExecutionEngine *(*JITCtor)(
-    Module *M,
-    std::string *ErrorStr,
-    JITMemoryManager *JMM,
-    bool GVsWithCode,
-    TargetMachine *TM);
-  static ExecutionEngine *(*MCJITCtor)(
-    Module *M,
-    std::string *ErrorStr,
-    RTDyldMemoryManager *MCJMM,
-    bool GVsWithCode,
-    TargetMachine *TM);
-  static ExecutionEngine *(*InterpCtor)(Module *M, std::string *ErrorStr);
+  static ExecutionEngine *(*MCJITCtor)(std::unique_ptr<Module> M,
+                                       std::string *ErrorStr,
+                                       RTDyldMemoryManager *MCJMM,
+                                       std::unique_ptr<TargetMachine> TM);
+  static ExecutionEngine *(*InterpCtor)(std::unique_ptr<Module> M,
+                                        std::string *ErrorStr);
 
   /// LazyFunctionCreator - If an unknown function is needed, this function
   /// pointer is invoked to create it.  If this returns null, the JIT will
@@ -161,9 +151,8 @@
   void *(*LazyFunctionCreator)(const std::string &);
 
 public:
-  /// lock - This lock protects the ExecutionEngine, MCJIT, JIT, JITResolver and
-  /// JITEmitter classes.  It must be held while changing the internal state of
-  /// any of those classes.
+  /// lock - This lock protects the ExecutionEngine and MCJIT classes. It must
+  /// be held while changing the internal state of any of those classes.
   sys::Mutex lock;
 
   //===--------------------------------------------------------------------===//
@@ -172,44 +161,9 @@
 
   virtual ~ExecutionEngine();
 
-  /// create - This is the factory method for creating an execution engine which
-  /// is appropriate for the current machine.  This takes ownership of the
-  /// module.
-  ///
-  /// \param GVsWithCode - Allocating globals with code breaks
-  /// freeMachineCodeForFunction and is probably unsafe and bad for performance.
-  /// However, we have clients who depend on this behavior, so we must support
-  /// it.  Eventually, when we're willing to break some backwards compatibility,
-  /// this flag should be flipped to false, so that by default
-  /// freeMachineCodeForFunction works.
-  static ExecutionEngine *create(Module *M,
-                                 bool ForceInterpreter = false,
-                                 std::string *ErrorStr = nullptr,
-                                 CodeGenOpt::Level OptLevel =
-                                 CodeGenOpt::Default,
-                                 bool GVsWithCode = true);
-
-  /// createJIT - This is the factory method for creating a JIT for the current
-  /// machine, it does not fall back to the interpreter.  This takes ownership
-  /// of the Module and JITMemoryManager if successful.
-  ///
-  /// Clients should make sure to initialize targets prior to calling this
-  /// function.
-  static ExecutionEngine *createJIT(Module *M,
-                                    std::string *ErrorStr = nullptr,
-                                    JITMemoryManager *JMM = nullptr,
-                                    CodeGenOpt::Level OptLevel =
-                                    CodeGenOpt::Default,
-                                    bool GVsWithCode = true,
-                                    Reloc::Model RM = Reloc::Default,
-                                    CodeModel::Model CMM =
-                                    CodeModel::JITDefault);
-
-  /// addModule - Add a Module to the list of modules that we can JIT from.
-  /// Note that this takes ownership of the Module: when the ExecutionEngine is
-  /// destroyed, it destroys the Module as well.
-  virtual void addModule(Module *M) {
-    Modules.push_back(M);
+  /// Add a Module to the list of modules that we can JIT from.
+  virtual void addModule(std::unique_ptr<Module> M) {
+    Modules.push_back(std::move(M));
   }
 
   /// addObjectFile - Add an ObjectFile to the execution engine.
@@ -223,6 +177,7 @@
   ///
   /// MCJIT will take ownership of the ObjectFile.
   virtual void addObjectFile(std::unique_ptr<object::ObjectFile> O);
+  virtual void addObjectFile(object::OwningBinary<object::ObjectFile> O);
 
   /// addArchive - Add an Archive to the execution engine.
   ///
@@ -230,11 +185,7 @@
   /// resolve external symbols in objects it is loading.  If a symbol is found
   /// in the Archive the contained object file will be extracted (in memory)
   /// and loaded for possible execution.
-  ///
-  /// MCJIT will take ownership of the Archive.
-  virtual void addArchive(object::Archive *A) {
-    llvm_unreachable("ExecutionEngine subclass doesn't implement addArchive.");
-  }
+  virtual void addArchive(object::OwningBinary<object::Archive> A);
 
   //===--------------------------------------------------------------------===//
 
@@ -263,11 +214,7 @@
   /// it prints a message to stderr and aborts.
   ///
   /// This function is deprecated for the MCJIT execution engine.
-  ///
-  /// FIXME: the JIT and MCJIT interfaces should be disentangled or united
-  /// again, if possible.
-  ///
-  virtual void *getPointerToNamedFunction(const std::string &Name,
+  virtual void *getPointerToNamedFunction(StringRef Name,
                                           bool AbortOnFailure = true) = 0;
 
   /// mapSectionAddress - map a section to its target address space value.
@@ -279,7 +226,7 @@
                      "EE!");
   }
 
-  /// generateCodeForModule - Run code generationen for the specified module and
+  /// generateCodeForModule - Run code generation for the specified module and
   /// load it into memory.
   ///
   /// When this function has completed, all code and data for the specified
@@ -293,7 +240,7 @@
   /// locally can use the getFunctionAddress call, which will generate code
   /// and apply final preparations all in one step.
   ///
-  /// This method has no effect for the legacy JIT engine or the interpeter.
+  /// This method has no effect for the interpreter.
   virtual void generateCodeForModule(Module *M) {}
 
   /// finalizeObject - ensure the module is fully processed and is usable.
@@ -302,8 +249,7 @@
   /// object usable for execution.  It should be called after sections within an
   /// object have been relocated using mapSectionAddress.  When this method is
   /// called the MCJIT execution engine will reapply relocations for a loaded
-  /// object.  This method has no effect for the legacy JIT engine or the
-  /// interpeter.
+  /// object.  This method has no effect for the interpreter.
   virtual void finalizeObject() {}
 
   /// runStaticConstructorsDestructors - This method is used to execute all of
@@ -312,11 +258,11 @@
   /// \param isDtors - Run the destructors instead of constructors.
   virtual void runStaticConstructorsDestructors(bool isDtors);
 
-  /// runStaticConstructorsDestructors - This method is used to execute all of
-  /// the static constructors or destructors for a particular module.
+  /// This method is used to execute all of the static constructors or
+  /// destructors for a particular module.
   ///
   /// \param isDtors - Run the destructors instead of constructors.
-  void runStaticConstructorsDestructors(Module *module, bool isDtors);
+  void runStaticConstructorsDestructors(Module &module, bool isDtors);
 
 
   /// runFunctionAsMain - This is a helper function which wraps runFunction to
@@ -373,13 +319,6 @@
   /// getFunctionAddress instead.
   virtual void *getPointerToFunction(Function *F) = 0;
 
-  /// getPointerToBasicBlock - The different EE's represent basic blocks in
-  /// different ways.  Return the representation for a blockaddress of the
-  /// specified block.
-  ///
-  /// This function will not be implemented for the MCJIT execution engine.
-  virtual void *getPointerToBasicBlock(BasicBlock *BB) = 0;
-
   /// getPointerToFunctionOrStub - If the specified function has been
   /// code-gen'd, return a pointer to the function.  If not, compile it, or use
   /// a stub to implement lazy compilation if available.  See
@@ -395,9 +334,9 @@
   /// getGlobalValueAddress - Return the address of the specified global
   /// value. This may involve code generation.
   ///
-  /// This function should not be called with the JIT or interpreter engines.
+  /// This function should not be called with the interpreter engine.
   virtual uint64_t getGlobalValueAddress(const std::string &Name) {
-    // Default implementation for JIT and interpreter.  MCJIT will override this.
+    // Default implementation for the interpreter.  MCJIT will override this.
     // JIT and interpreter clients should use getPointerToGlobal instead.
     return 0;
   }
@@ -405,14 +344,11 @@
   /// getFunctionAddress - Return the address of the specified function.
   /// This may involve code generation.
   virtual uint64_t getFunctionAddress(const std::string &Name) {
-    // Default implementation for JIT and interpreter.  MCJIT will override this.
-    // JIT and interpreter clients should use getPointerToFunction instead.
+    // Default implementation for the interpreter.  MCJIT will override this.
+    // Interpreter clients should use getPointerToFunction instead.
     return 0;
   }
 
-  // The JIT overrides a version that actually does this.
-  virtual void runJITOnFunction(Function *, MachineCodeInfo * = nullptr) { }
-
   /// getGlobalValueAtAddress - Return the LLVM global value object that starts
   /// at the specified address.
   ///
@@ -427,18 +363,6 @@
 
   void InitializeMemory(const Constant *Init, void *Addr);
 
-  /// recompileAndRelinkFunction - This method is used to force a function which
-  /// has already been compiled to be compiled again, possibly after it has been
-  /// modified.  Then the entry to the old copy is overwritten with a branch to
-  /// the new copy.  If there was no old copy, this acts just like
-  /// VM::getPointerToFunction().
-  virtual void *recompileAndRelinkFunction(Function *F) = 0;
-
-  /// freeMachineCodeForFunction - Release memory in the ExecutionEngine
-  /// corresponding to the machine code emitted to execute this function, useful
-  /// for garbage-collecting generated code.
-  virtual void freeMachineCodeForFunction(Function *F) = 0;
-
   /// getOrEmitGlobalVariable - Return the address of the specified global
   /// variable, possibly emitting it to memory if needed.  This is used by the
   /// Emitter.
@@ -457,7 +381,7 @@
   virtual void UnregisterJITEventListener(JITEventListener *) {}
 
   /// Sets the pre-compiled object cache.  The ownership of the ObjectCache is
-  /// not changed.  Supported by MCJIT but not JIT.
+  /// not changed.  Supported by MCJIT but not the interpreter.
   virtual void setObjectCache(ObjectCache *) {
     llvm_unreachable("No support for an object cache");
   }
@@ -499,11 +423,6 @@
   bool isCompilingLazily() const {
     return CompilingLazily;
   }
-  // Deprecated in favor of isCompilingLazily (to reduce double-negatives).
-  // Remove this in LLVM 2.8.
-  bool isLazyCompilationDisabled() const {
-    return !CompilingLazily;
-  }
 
   /// DisableGVCompilation - If called, the JIT will abort if it's asked to
   /// allocate space and populate a GlobalVariable that is not internal to
@@ -544,7 +463,7 @@
   }
 
 protected:
-  explicit ExecutionEngine(Module *M);
+  explicit ExecutionEngine(std::unique_ptr<Module> M);
 
   void emitGlobals();
 
@@ -564,34 +483,30 @@
   const static Kind Either = (Kind)(JIT | Interpreter);
 }
 
-/// EngineBuilder - Builder class for ExecutionEngines.  Use this by
-/// stack-allocating a builder, chaining the various set* methods, and
-/// terminating it with a .create() call.
+/// Builder class for ExecutionEngines. Use this by stack-allocating a builder,
+/// chaining the various set* methods, and terminating it with a .create()
+/// call.
 class EngineBuilder {
 private:
-  Module *M;
+  std::unique_ptr<Module> M;
   EngineKind::Kind WhichEngine;
   std::string *ErrorStr;
   CodeGenOpt::Level OptLevel;
   RTDyldMemoryManager *MCJMM;
-  JITMemoryManager *JMM;
-  bool AllocateGVsWithCode;
   TargetOptions Options;
   Reloc::Model RelocModel;
   CodeModel::Model CMModel;
   std::string MArch;
   std::string MCPU;
   SmallVector<std::string, 4> MAttrs;
-  bool UseMCJIT;
   bool VerifyModules;
 
   /// InitEngine - Does the common initialization of default options.
   void InitEngine();
 
 public:
-  /// EngineBuilder - Constructor for EngineBuilder.  If create() is called and
-  /// is successful, the created engine takes ownership of the module.
-  EngineBuilder(Module *m) : M(m) {
+  /// Constructor for EngineBuilder.
+  EngineBuilder(std::unique_ptr<Module> M) : M(std::move(M)) {
     InitEngine();
   }
 
@@ -607,24 +522,9 @@
   /// is only appropriate for the MCJIT; setting this and configuring the builder
   /// to create anything other than MCJIT will cause a runtime error. If create()
   /// is called and is successful, the created engine takes ownership of the
-  /// memory manager. This option defaults to NULL. Using this option nullifies
-  /// the setJITMemoryManager() option.
+  /// memory manager. This option defaults to NULL.
   EngineBuilder &setMCJITMemoryManager(RTDyldMemoryManager *mcjmm) {
     MCJMM = mcjmm;
-    JMM = nullptr;
-    return *this;
-  }
-
-  /// setJITMemoryManager - Sets the JIT memory manager to use.  This allows
-  /// clients to customize their memory allocation policies.  This is only
-  /// appropriate for either JIT or MCJIT; setting this and configuring the
-  /// builder to create an interpreter will cause a runtime error. If create()
-  /// is called and is successful, the created engine takes ownership of the
-  /// memory manager.  This option defaults to NULL. This option overrides
-  /// setMCJITMemoryManager() as well.
-  EngineBuilder &setJITMemoryManager(JITMemoryManager *jmm) {
-    MCJMM = nullptr;
-    JMM = jmm;
     return *this;
   }
 
@@ -664,18 +564,6 @@
     return *this;
   }
 
-  /// setAllocateGVsWithCode - Sets whether global values should be allocated
-  /// into the same buffer as code.  For most applications this should be set
-  /// to false.  Allocating globals with code breaks freeMachineCodeForFunction
-  /// and is probably unsafe and bad for performance.  However, we have clients
-  /// who depend on this behavior, so we must support it.  This option defaults
-  /// to false so that users of the new API can safely use the new memory
-  /// manager and free machine code.
-  EngineBuilder &setAllocateGVsWithCode(bool a) {
-    AllocateGVsWithCode = a;
-    return *this;
-  }
-
   /// setMArch - Override the architecture set by the Module's triple.
   EngineBuilder &setMArch(StringRef march) {
     MArch.assign(march.begin(), march.end());
@@ -688,13 +576,6 @@
     return *this;
   }
 
-  /// setUseMCJIT - Set whether the MC-JIT implementation should be used
-  /// (experimental).
-  EngineBuilder &setUseMCJIT(bool Value) {
-    UseMCJIT = Value;
-    return *this;
-  }
-
   /// setVerifyModules - Set whether the JIT implementation should verify
   /// IR modules during compilation.
   EngineBuilder &setVerifyModules(bool Verify) {

diff --git a/include/llvm/ExecutionEngine/JIT.h b/include/llvm/ExecutionEngine/JIT.h
deleted file mode 100644
index 581d6e6..0000000
--- a/include/llvm/ExecutionEngine/JIT.h
+++ /dev/null

@@ -1,38 +0,0 @@
-//===-- JIT.h - Abstract Execution Engine Interface -------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file forces the JIT to link in on certain operating systems.
-// (Windows).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_JIT_H
-#define LLVM_EXECUTIONENGINE_JIT_H
-
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include <cstdlib>
-
-extern "C" void LLVMLinkInJIT();
-
-namespace {
-  struct ForceJITLinking {
-    ForceJITLinking() {
-      // We must reference JIT in such a way that compilers will not
-      // delete it all as dead code, even with whole program optimization,
-      // yet is effectively a NO-OP. As the compiler isn't smart enough
-      // to know that getenv() never returns -1, this will do the job.
-      if (std::getenv("bar") != (char*) -1)
-        return;
-
-      LLVMLinkInJIT();
-    }
-  } ForceJITLinking;
-}
-
-#endif

diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h
index 99fe36c..cef3aa2 100644
--- a/include/llvm/ExecutionEngine/JITEventListener.h
+++ b/include/llvm/ExecutionEngine/JITEventListener.h

@@ -59,23 +59,6 @@
   JITEventListener() {}
   virtual ~JITEventListener();
 
-  /// NotifyFunctionEmitted - Called after a function has been successfully
-  /// emitted to memory.  The function still has its MachineFunction attached,
-  /// if you should happen to need that.
-  virtual void NotifyFunctionEmitted(const Function &,
-                                     void *, size_t,
-                                     const EmittedFunctionDetails &) {}
-
-  /// NotifyFreeingMachineCode - Called from freeMachineCodeForFunction(), after
-  /// the global mapping is removed, but before the machine code is returned to
-  /// the allocator.
-  ///
-  /// OldPtr is the address of the machine code and will be the same as the Code
-  /// parameter to a previous NotifyFunctionEmitted call.  The Function passed
-  /// to NotifyFunctionEmitted may have been destroyed by the time of the
-  /// matching NotifyFreeingMachineCode call.
-  virtual void NotifyFreeingMachineCode(void *) {}
-
   /// NotifyObjectEmitted - Called after an object has been successfully
   /// emitted to memory.  NotifyFunctionEmitted will not be called for
   /// individual functions in the object.

diff --git a/include/llvm/ExecutionEngine/JITMemoryManager.h b/include/llvm/ExecutionEngine/JITMemoryManager.h
deleted file mode 100644
index b22d899..0000000
--- a/include/llvm/ExecutionEngine/JITMemoryManager.h
+++ /dev/null

@@ -1,164 +0,0 @@
-//===-- JITMemoryManager.h - Interface JIT uses to Allocate Mem -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTIONENGINE_JITMEMORYMANAGER_H
-#define LLVM_EXECUTIONENGINE_JITMEMORYMANAGER_H
-
-#include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "llvm/Support/DataTypes.h"
-#include <string>
-
-namespace llvm {
-
-  class Function;
-  class GlobalValue;
-
-/// JITMemoryManager - This interface is used by the JIT to allocate and manage
-/// memory for the code generated by the JIT.  This can be reimplemented by
-/// clients that have a strong desire to control how the layout of JIT'd memory
-/// works.
-class JITMemoryManager : public RTDyldMemoryManager {
-protected:
-  bool HasGOT;
-
-public:
-  JITMemoryManager() : HasGOT(false) {}
-  virtual ~JITMemoryManager();
-
-  /// CreateDefaultMemManager - This is used to create the default
-  /// JIT Memory Manager if the client does not provide one to the JIT.
-  static JITMemoryManager *CreateDefaultMemManager();
-
-  /// setMemoryWritable - When code generation is in progress,
-  /// the code pages may need permissions changed.
-  virtual void setMemoryWritable() = 0;
-
-  /// setMemoryExecutable - When code generation is done and we're ready to
-  /// start execution, the code pages may need permissions changed.
-  virtual void setMemoryExecutable() = 0;
-
-  /// setPoisonMemory - Setting this flag to true makes the memory manager
-  /// garbage values over freed memory.  This is useful for testing and
-  /// debugging, and may be turned on by default in debug mode.
-  virtual void setPoisonMemory(bool poison) = 0;
-
-  //===--------------------------------------------------------------------===//
-  // Global Offset Table Management
-  //===--------------------------------------------------------------------===//
-
-  /// AllocateGOT - If the current table requires a Global Offset Table, this
-  /// method is invoked to allocate it.  This method is required to set HasGOT
-  /// to true.
-  virtual void AllocateGOT() = 0;
-
-  /// isManagingGOT - Return true if the AllocateGOT method is called.
-  bool isManagingGOT() const {
-    return HasGOT;
-  }
-
-  /// getGOTBase - If this is managing a Global Offset Table, this method should
-  /// return a pointer to its base.
-  virtual uint8_t *getGOTBase() const = 0;
-
-  //===--------------------------------------------------------------------===//
-  // Main Allocation Functions
-  //===--------------------------------------------------------------------===//
-
-  /// startFunctionBody - When we start JITing a function, the JIT calls this
-  /// method to allocate a block of free RWX memory, which returns a pointer to
-  /// it.  If the JIT wants to request a block of memory of at least a certain
-  /// size, it passes that value as ActualSize, and this method returns a block
-  /// with at least that much space.  If the JIT doesn't know ahead of time how
-  /// much space it will need to emit the function, it passes 0 for the
-  /// ActualSize.  In either case, this method is required to pass back the size
-  /// of the allocated block through ActualSize.  The JIT will be careful to
-  /// not write more than the returned ActualSize bytes of memory.
-  virtual uint8_t *startFunctionBody(const Function *F,
-                                     uintptr_t &ActualSize) = 0;
-
-  /// allocateStub - This method is called by the JIT to allocate space for a
-  /// function stub (used to handle limited branch displacements) while it is
-  /// JIT compiling a function.  For example, if foo calls bar, and if bar
-  /// either needs to be lazily compiled or is a native function that exists too
-  /// far away from the call site to work, this method will be used to make a
-  /// thunk for it.  The stub should be "close" to the current function body,
-  /// but should not be included in the 'actualsize' returned by
-  /// startFunctionBody.
-  virtual uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
-                                unsigned Alignment) = 0;
-
-  /// endFunctionBody - This method is called when the JIT is done codegen'ing
-  /// the specified function.  At this point we know the size of the JIT
-  /// compiled function.  This passes in FunctionStart (which was returned by
-  /// the startFunctionBody method) and FunctionEnd which is a pointer to the
-  /// actual end of the function.  This method should mark the space allocated
-  /// and remember where it is in case the client wants to deallocate it.
-  virtual void endFunctionBody(const Function *F, uint8_t *FunctionStart,
-                               uint8_t *FunctionEnd) = 0;
-
-  /// allocateSpace - Allocate a memory block of the given size.  This method
-  /// cannot be called between calls to startFunctionBody and endFunctionBody.
-  virtual uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) = 0;
-
-  /// allocateGlobal - Allocate memory for a global.
-  virtual uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment) = 0;
-
-  /// deallocateFunctionBody - Free the specified function body.  The argument
-  /// must be the return value from a call to startFunctionBody() that hasn't
-  /// been deallocated yet.  This is never called when the JIT is currently
-  /// emitting a function.
-  virtual void deallocateFunctionBody(void *Body) = 0;
-
-  /// CheckInvariants - For testing only.  Return true if all internal
-  /// invariants are preserved, or return false and set ErrorStr to a helpful
-  /// error message.
-  virtual bool CheckInvariants(std::string &) {
-    return true;
-  }
-
-  /// GetDefaultCodeSlabSize - For testing only.  Returns DefaultCodeSlabSize
-  /// from DefaultJITMemoryManager.
-  virtual size_t GetDefaultCodeSlabSize() {
-    return 0;
-  }
-
-  /// GetDefaultDataSlabSize - For testing only.  Returns DefaultCodeSlabSize
-  /// from DefaultJITMemoryManager.
-  virtual size_t GetDefaultDataSlabSize() {
-    return 0;
-  }
-
-  /// GetDefaultStubSlabSize - For testing only.  Returns DefaultCodeSlabSize
-  /// from DefaultJITMemoryManager.
-  virtual size_t GetDefaultStubSlabSize() {
-    return 0;
-  }
-
-  /// GetNumCodeSlabs - For testing only.  Returns the number of MemoryBlocks
-  /// allocated for code.
-  virtual unsigned GetNumCodeSlabs() {
-    return 0;
-  }
-
-  /// GetNumDataSlabs - For testing only.  Returns the number of MemoryBlocks
-  /// allocated for data.
-  virtual unsigned GetNumDataSlabs() {
-    return 0;
-  }
-
-  /// GetNumStubSlabs - For testing only.  Returns the number of MemoryBlocks
-  /// allocated for function stubs.
-  virtual unsigned GetNumStubSlabs() {
-    return 0;
-  }
-};
-
-} // end namespace llvm.
-
-#endif

diff --git a/include/llvm/ExecutionEngine/ObjectBuffer.h b/include/llvm/ExecutionEngine/ObjectBuffer.h
index 6221d3b..ee4820a 100644
--- a/include/llvm/ExecutionEngine/ObjectBuffer.h
+++ b/include/llvm/ExecutionEngine/ObjectBuffer.h

@@ -21,41 +21,35 @@
 
 namespace llvm {
 
-/// ObjectBuffer - This class acts as a container for the memory buffer used during
-/// generation and loading of executable objects using MCJIT and RuntimeDyld.  The
+/// This class acts as a container for the memory buffer used during generation
+/// and loading of executable objects using MCJIT and RuntimeDyld. The
 /// underlying memory for the object will be owned by the ObjectBuffer instance
-/// throughout its lifetime.  The getMemBuffer() method provides a way to create a
-/// MemoryBuffer wrapper object instance to be owned by other classes (such as
-/// ObjectFile) as needed, but the MemoryBuffer instance returned does not own the
-/// actual memory it points to.
+/// throughout its lifetime.
 class ObjectBuffer {
   virtual void anchor();
 public:
   ObjectBuffer() {}
-  ObjectBuffer(MemoryBuffer* Buf) : Buffer(Buf) {}
+  ObjectBuffer(std::unique_ptr<MemoryBuffer> Buf) : Buffer(std::move(Buf)) {}
   virtual ~ObjectBuffer() {}
 
-  /// getMemBuffer - Like MemoryBuffer::getMemBuffer() this function
-  /// returns a pointer to an object that is owned by the caller. However,
-  /// the caller does not take ownership of the underlying memory.
-  MemoryBuffer *getMemBuffer() const {
-    return MemoryBuffer::getMemBuffer(Buffer->getBuffer(),
-                                      Buffer->getBufferIdentifier(), false);
-  }
+  MemoryBufferRef getMemBuffer() const { return Buffer->getMemBufferRef(); }
 
   const char *getBufferStart() const { return Buffer->getBufferStart(); }
   size_t getBufferSize() const { return Buffer->getBufferSize(); }
   StringRef getBuffer() const { return Buffer->getBuffer(); }
+  StringRef getBufferIdentifier() const {
+    return Buffer->getBufferIdentifier();
+  }
 
 protected:
   // The memory contained in an ObjectBuffer
   std::unique_ptr<MemoryBuffer> Buffer;
 };
 
-/// ObjectBufferStream - This class encapsulates the SmallVector and
-/// raw_svector_ostream needed to generate an object using MC code emission
-/// while providing a common ObjectBuffer interface for access to the
-/// memory once the object has been generated.
+/// This class encapsulates the SmallVector and raw_svector_ostream needed to
+/// generate an object using MC code emission while providing a common
+/// ObjectBuffer interface for access to the memory once the object has been
+/// generated.
 class ObjectBufferStream : public ObjectBuffer {
   void anchor() override;
 public:
@@ -68,9 +62,8 @@
     OS.flush();
 
     // Make the data accessible via the ObjectBuffer::Buffer
-    Buffer.reset(MemoryBuffer::getMemBuffer(StringRef(SV.data(), SV.size()),
-                                            "",
-                                            false));
+    Buffer =
+        MemoryBuffer::getMemBuffer(StringRef(SV.data(), SV.size()), "", false);
   }
 
 protected:

diff --git a/include/llvm/ExecutionEngine/ObjectCache.h b/include/llvm/ExecutionEngine/ObjectCache.h
index d1849df..cc01a4e 100644
--- a/include/llvm/ExecutionEngine/ObjectCache.h
+++ b/include/llvm/ExecutionEngine/ObjectCache.h

@@ -27,13 +27,12 @@
   virtual ~ObjectCache() { }
 
   /// notifyObjectCompiled - Provides a pointer to compiled code for Module M.
-  virtual void notifyObjectCompiled(const Module *M, const MemoryBuffer *Obj) = 0;
+  virtual void notifyObjectCompiled(const Module *M, MemoryBufferRef Obj) = 0;
 
-  /// getObjectCopy - Returns a pointer to a newly allocated MemoryBuffer that
-  /// contains the object which corresponds with Module M, or 0 if an object is
-  /// not available. The caller owns both the MemoryBuffer returned by this
-  /// and the memory it references.
-  virtual MemoryBuffer* getObject(const Module* M) = 0;
+  /// Returns a pointer to a newly allocated MemoryBuffer that contains the
+  /// object which corresponds with Module M, or 0 if an object is not
+  /// available.
+  virtual std::unique_ptr<MemoryBuffer> getObject(const Module* M) = 0;
 };
 
 }

diff --git a/include/llvm/ExecutionEngine/ObjectImage.h b/include/llvm/ExecutionEngine/ObjectImage.h
index 1fcedd8..dc142bd 100644
--- a/include/llvm/ExecutionEngine/ObjectImage.h
+++ b/include/llvm/ExecutionEngine/ObjectImage.h

@@ -31,7 +31,7 @@
   std::unique_ptr<ObjectBuffer> Buffer;
 
 public:
-  ObjectImage(ObjectBuffer *Input) : Buffer(Input) {}
+  ObjectImage(std::unique_ptr<ObjectBuffer> Input) : Buffer(std::move(Input)) {}
   virtual ~ObjectImage() {}
 
   virtual object::symbol_iterator begin_symbols() const = 0;
@@ -50,6 +50,11 @@
 
   virtual /* Triple::ArchType */ unsigned getArch() const = 0;
 
+  // Return the name associated with this ObjectImage.
+  // This is usually the name of the file or MemoryBuffer that the
+  // ObjectBuffer was constructed from.
+  StringRef getImageName() const { return Buffer->getBufferIdentifier(); }
+
   // Subclasses can override these methods to update the image with loaded
   // addresses for sections and common symbols
   virtual void updateSectionAddress(const object::SectionRef &Sec,

diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
index b1d6810..b941efc 100644
--- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
+++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_EXECUTIONENGINE_RT_DYLD_MEMORY_MANAGER_H
-#define LLVM_EXECUTIONENGINE_RT_DYLD_MEMORY_MANAGER_H
+#ifndef LLVM_EXECUTIONENGINE_RTDYLDMEMORYMANAGER_H
+#define LLVM_EXECUTIONENGINE_RTDYLDMEMORYMANAGER_H
 
 #include "llvm-c/ExecutionEngine.h"
 #include "llvm/ADT/StringRef.h"
@@ -76,9 +76,15 @@
 
   virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size);
 
+  /// This method returns the address of the specified function or variable in
+  /// the current process.
+  static uint64_t getSymbolAddressInProcess(const std::string &Name);
+
   /// This method returns the address of the specified function or variable.
   /// It is used to resolve symbols during module linking.
-  virtual uint64_t getSymbolAddress(const std::string &Name);
+  virtual uint64_t getSymbolAddress(const std::string &Name) {
+    return getSymbolAddressInProcess(Name);
+  }
 
   /// This method returns the address of the specified function. As such it is
   /// only useful for resolving library symbols, not code generated symbols.
@@ -123,4 +129,4 @@
 
 } // namespace llvm
 
-#endif // LLVM_EXECUTIONENGINE_RT_DYLD_MEMORY_MANAGER_H
+#endif

diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index f123ffb..3605b9e 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h

@@ -26,19 +26,21 @@
 }
 
 class RuntimeDyldImpl;
+class RuntimeDyldCheckerImpl;
 class ObjectImage;
 
 class RuntimeDyld {
-  friend class RuntimeDyldChecker;
+  friend class RuntimeDyldCheckerImpl;
 
   RuntimeDyld(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
   void operator=(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
 
   // RuntimeDyldImpl is the actual class. RuntimeDyld is just the public
   // interface.
-  RuntimeDyldImpl *Dyld;
+  std::unique_ptr<RuntimeDyldImpl> Dyld;
   RTDyldMemoryManager *MM;
   bool ProcessAllSections;
+  RuntimeDyldCheckerImpl *Checker;
 protected:
   // Change the address associated with a section when resolving relocations.
   // Any relocations already associated with the symbol will be re-resolved.
@@ -51,22 +53,24 @@
   /// Ownership of the input buffer is transferred to the ObjectImage
   /// instance returned from this function if successful. In the case of load
   /// failure, the input buffer will be deleted.
-  ObjectImage *loadObject(ObjectBuffer *InputBuffer);
+  std::unique_ptr<ObjectImage>
+  loadObject(std::unique_ptr<ObjectBuffer> InputBuffer);
 
   /// Prepare the referenced object file for execution.
   /// Ownership of the input object is transferred to the ObjectImage
   /// instance returned from this function if successful. In the case of load
   /// failure, the input object will be deleted.
-  ObjectImage *loadObject(std::unique_ptr<object::ObjectFile> InputObject);
+  std::unique_ptr<ObjectImage>
+  loadObject(std::unique_ptr<object::ObjectFile> InputObject);
 
   /// Get the address of our local copy of the symbol. This may or may not
   /// be the address used for relocation (clients can copy the data around
   /// and resolve relocatons based on where they put it).
-  void *getSymbolAddress(StringRef Name);
+  void *getSymbolAddress(StringRef Name) const;
 
   /// Get the address of the target copy of the symbol. This is the address
   /// used for relocation.
-  uint64_t getSymbolLoadAddress(StringRef Name);
+  uint64_t getSymbolLoadAddress(StringRef Name) const;
 
   /// Resolve the relocations for all symbols we currently know about.
   void resolveRelocations();

diff --git a/include/llvm/ExecutionEngine/RuntimeDyldChecker.h b/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
index 38a4ea1..35ceba2 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyldChecker.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyldChecker.h

@@ -7,18 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_RUNTIMEDYLDCHECKER_H
-#define LLVM_RUNTIMEDYLDCHECKER_H
+#ifndef LLVM_EXECUTIONENGINE_RUNTIMEDYLDCHECKER_H
+#define LLVM_EXECUTIONENGINE_RUNTIMEDYLDCHECKER_H
 
-#include "RuntimeDyld.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <map>
+#include "llvm/ADT/StringRef.h"
 
 namespace llvm {
 
 class MCDisassembler;
+class MemoryBuffer;
 class MCInstPrinter;
+class RuntimeDyld;
+class RuntimeDyldCheckerImpl;
+class raw_ostream;
 
 /// \brief RuntimeDyld invariant checker for verifying that RuntimeDyld has
 ///        correctly applied relocations.
@@ -61,14 +62,16 @@
 ///             | expr '>>' expr
 ///
 class RuntimeDyldChecker {
-  friend class RuntimeDyldCheckerExprEval;
 public:
-  RuntimeDyldChecker(RuntimeDyld &RTDyld,
-                     MCDisassembler *Disassembler,
-                     MCInstPrinter *InstPrinter,
-                     llvm::raw_ostream &ErrStream)
-    : RTDyld(*RTDyld.Dyld), Disassembler(Disassembler),
-      InstPrinter(InstPrinter), ErrStream(ErrStream) {}
+  RuntimeDyldChecker(RuntimeDyld &RTDyld, MCDisassembler *Disassembler,
+                     MCInstPrinter *InstPrinter, raw_ostream &ErrStream);
+  ~RuntimeDyldChecker();
+
+  /// \brief Get the associated RTDyld instance.
+  RuntimeDyld& getRTDyld();
+
+  /// \brief Get the associated RTDyld instance.
+  const RuntimeDyld& getRTDyld() const;
 
   /// \brief Check a single expression against the attached RuntimeDyld
   ///        instance.
@@ -79,20 +82,20 @@
   ///        method to be evaluated as an expression.
   bool checkAllRulesInBuffer(StringRef RulePrefix, MemoryBuffer *MemBuf) const;
 
+  /// \brief Returns the address of the requested section (or an error message
+  ///        in the second element of the pair if the address cannot be found).
+  ///
+  /// If 'LinkerAddress' is true, this returns the address of the section
+  /// within the linker's memory. If 'LinkerAddress' is false it returns the
+  /// address within the target process (i.e. the load address).
+  std::pair<uint64_t, std::string> getSectionAddr(StringRef FileName,
+                                                  StringRef SectionName,
+                                                  bool LinkerAddress);
+
 private:
-
-  bool checkSymbolIsValidForLoad(StringRef Symbol) const;
-  uint64_t getSymbolAddress(StringRef Symbol) const;
-  uint64_t readMemoryAtSymbol(StringRef Symbol, int64_t Offset,
-                              unsigned Size) const;
-  StringRef getSubsectionStartingAt(StringRef Name) const;
-
-  RuntimeDyldImpl &RTDyld;
-  MCDisassembler *Disassembler;
-  MCInstPrinter *InstPrinter;
-  llvm::raw_ostream &ErrStream;
+  std::unique_ptr<RuntimeDyldCheckerImpl> Impl;
 };
 
 } // end namespace llvm
 
-#endif // LLVM_RUNTIMEDYLDCHECKER_H
+#endif

diff --git a/include/llvm/IR/Argument.h b/include/llvm/IR/Argument.h
index 3a63e1a..dd76a90 100644
--- a/include/llvm/IR/Argument.h
+++ b/include/llvm/IR/Argument.h

@@ -56,9 +56,15 @@
   unsigned getArgNo() const;
 
   /// \brief Return true if this argument has the nonnull attribute on it in
-  /// its containing function.
+  /// its containing function. Also returns true if at least one byte is known
+  /// to be dereferenceable and the pointer is in addrspace(0).
   bool hasNonNullAttr() const;
 
+  /// \brief If this argument has the dereferenceable attribute on it in its
+  /// containing function, return the number of bytes known to be
+  /// dereferenceable. Otherwise, zero is returned.
+  uint64_t getDereferenceableBytes() const;
+
   /// \brief Return true if this argument has the byval attribute on it in its
   /// containing function.
   bool hasByValAttr() const;
@@ -99,6 +105,14 @@
   /// its containing function.
   bool hasInAllocaAttr() const;
 
+  /// \brief Return true if this argument has the zext attribute on it in its
+  /// containing function.
+  bool hasZExtAttr() const;
+
+  /// \brief Return true if this argument has the sext attribute on it in its
+  /// containing function.
+  bool hasSExtAttr() const;
+
   /// \brief Add a Attribute to an argument.
   void addAttr(AttributeSet AS);
 

diff --git a/include/llvm/IR/AssemblyAnnotationWriter.h b/include/llvm/IR/AssemblyAnnotationWriter.h
index a8d52f6..19e32a2 100644
--- a/include/llvm/IR/AssemblyAnnotationWriter.h
+++ b/include/llvm/IR/AssemblyAnnotationWriter.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_IR_ASMANNOTATIONWRITER_H
-#define LLVM_IR_ASMANNOTATIONWRITER_H
+#ifndef LLVM_IR_ASSEMBLYANNOTATIONWRITER_H
+#define LLVM_IR_ASSEMBLYANNOTATIONWRITER_H
 
 namespace llvm {
 

diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index e34dc83..5ff48d6 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h

@@ -88,6 +88,7 @@
     NonLazyBind,           ///< Function is called early and/or
                            ///< often, so lazy binding isn't worthwhile
     NonNull,               ///< Pointer is known to be not null
+    Dereferenceable,       ///< Pointer is known to be dereferenceable
     NoRedZone,             ///< Disable redzone
     NoReturn,              ///< Mark the function as not returning
     NoUnwind,              ///< Function doesn't unwind stack
@@ -133,6 +134,8 @@
   /// alignment set.
   static Attribute getWithAlignment(LLVMContext &Context, uint64_t Align);
   static Attribute getWithStackAlignment(LLVMContext &Context, uint64_t Align);
+  static Attribute getWithDereferenceableBytes(LLVMContext &Context,
+                                              uint64_t Bytes);
 
   //===--------------------------------------------------------------------===//
   // Attribute Accessors
@@ -141,8 +144,8 @@
   /// \brief Return true if the attribute is an Attribute::AttrKind type.
   bool isEnumAttribute() const;
 
-  /// \brief Return true if the attribute is an alignment attribute.
-  bool isAlignAttribute() const;
+  /// \brief Return true if the attribute is an integer attribute.
+  bool isIntAttribute() const;
 
   /// \brief Return true if the attribute is a string (target-dependent)
   /// attribute.
@@ -178,6 +181,10 @@
   /// alignment value.
   unsigned getStackAlignment() const;
 
+  /// \brief Returns the number of dereferenceable bytes from the
+  /// dereferenceable attribute (or zero if unknown).
+  uint64_t getDereferenceableBytes() const;
+
   /// \brief The Attribute is converted to a string of equivalent mnemonic. This
   /// is, presumably, for writing out the mnemonics for the assembly writer.
   std::string getAsString(bool InAttrGrp = false) const;
@@ -316,6 +323,9 @@
   /// \brief Get the stack alignment.
   unsigned getStackAlignment(unsigned Index) const;
 
+  /// \brief Get the number of dereferenceable bytes (or zero if unknown).
+  uint64_t getDereferenceableBytes(unsigned Index) const;
+
   /// \brief Return the attributes at the index as a string.
   std::string getAsString(unsigned Index, bool InAttrGrp = false) const;
 
@@ -395,13 +405,15 @@
   std::map<std::string, std::string> TargetDepAttrs;
   uint64_t Alignment;
   uint64_t StackAlignment;
+  uint64_t DerefBytes;
 public:
-  AttrBuilder() : Attrs(0), Alignment(0), StackAlignment(0) {}
+  AttrBuilder() : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0) {}
   explicit AttrBuilder(uint64_t Val)
-    : Attrs(0), Alignment(0), StackAlignment(0) {
+    : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0) {
     addRawValue(Val);
   }
-  AttrBuilder(const Attribute &A) : Attrs(0), Alignment(0), StackAlignment(0) {
+  AttrBuilder(const Attribute &A)
+    : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0) {
     addAttribute(A);
   }
   AttrBuilder(AttributeSet AS, unsigned Idx);
@@ -455,6 +467,10 @@
   /// \brief Retrieve the stack alignment attribute, if it exists.
   uint64_t getStackAlignment() const { return StackAlignment; }
 
+  /// \brief Retrieve the number of dereferenceable bytes, if the dereferenceable
+  /// attribute exists (zero is returned otherwise).
+  uint64_t getDereferenceableBytes() const { return DerefBytes; }
+
   /// \brief This turns an int alignment (which must be a power of 2) into the
   /// form used internally in Attribute.
   AttrBuilder &addAlignmentAttr(unsigned Align);
@@ -463,6 +479,10 @@
   /// the form used internally in Attribute.
   AttrBuilder &addStackAlignmentAttr(unsigned Align);
 
+  /// \brief This turns the number of dereferenceable bytes into the form used
+  /// internally in Attribute.
+  AttrBuilder &addDereferenceableAttr(uint64_t Bytes);
+
   /// \brief Return true if the builder contains no target-independent
   /// attributes.
   bool empty() const { return Attrs.none(); }

diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h
index a19489a..7c7dd2c 100644
--- a/include/llvm/IR/BasicBlock.h
+++ b/include/llvm/IR/BasicBlock.h

@@ -23,6 +23,7 @@
 
 namespace llvm {
 
+class CallInst;
 class LandingPadInst;
 class TerminatorInst;
 class LLVMContext;
@@ -125,6 +126,14 @@
   TerminatorInst *getTerminator();
   const TerminatorInst *getTerminator() const;
 
+  /// \brief Returns the call instruction marked 'musttail' prior to the
+  /// terminating return instruction of this basic block, if such a call is
+  /// present.  Otherwise, returns null.
+  CallInst *getTerminatingMustTailCall();
+  const CallInst *getTerminatingMustTailCall() const {
+    return const_cast<BasicBlock *>(this)->getTerminatingMustTailCall();
+  }
+
   /// \brief Returns a pointer to the first instruction in this block that is
   /// not a PHINode instruction.
   ///
@@ -173,6 +182,13 @@
   /// right after \p MovePos in the function \p MovePos lives in.
   void moveAfter(BasicBlock *MovePos);
 
+  /// \brief Insert unlinked basic block into a function.
+  ///
+  /// Inserts an unlinked basic block into \c Parent.  If \c InsertBefore is
+  /// provided, inserts before that basic block, otherwise inserts at the end.
+  ///
+  /// \pre \a getParent() is \c nullptr.
+  void insertInto(Function *Parent, BasicBlock *InsertBefore = nullptr);
 
   /// \brief Return the predecessor of this block if it has a single predecessor
   /// block. Otherwise return a null pointer.

diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h
index deea415..df08257 100644
--- a/include/llvm/IR/CallSite.h
+++ b/include/llvm/IR/CallSite.h

@@ -217,6 +217,12 @@
     CALLSITE_DELEGATE_GETTER(getParamAlignment(i));
   }
 
+  /// @brief Extract the number of dereferenceable bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableBytes(uint16_t i) const {
+    CALLSITE_DELEGATE_GETTER(getDereferenceableBytes(i));
+  }
+
   /// \brief Return true if the call should not be treated as a call to a
   /// builtin.
   bool isNoBuiltin() const {
@@ -302,6 +308,19 @@
            paramHasAttr(ArgNo + 1, Attribute::ReadNone);
   }
 
+  /// @brief Return true if the return value is known to be not null.
+  /// This may be because it has the nonnull attribute, or because at least
+  /// one byte is dereferenceable and the pointer is in addrspace(0).
+  bool isReturnNonNull() const {
+    if (paramHasAttr(0, Attribute::NonNull))
+      return true;
+    else if (getDereferenceableBytes(0) > 0 &&
+             getType()->getPointerAddressSpace() == 0)
+      return true;
+
+    return false;
+  }
+
   /// hasArgument - Returns true if this CallSite passes the given Value* as an
   /// argument to the called function.
   bool hasArgument(const Value *Arg) const {

diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h
index 1eaf4f7..9872e6e 100644
--- a/include/llvm/IR/CallingConv.h
+++ b/include/llvm/IR/CallingConv.h

@@ -20,10 +20,13 @@
 /// the well-known calling conventions.
 ///
 namespace CallingConv {
+  /// LLVM IR allows arbitrary numbers as calling convention identifiers.
+  typedef unsigned ID;
+
   /// A set of enums which specify the assigned numeric values for known llvm
   /// calling conventions.
   /// @brief LLVM Calling Convention Representation
-  enum ID {
+  enum {
     /// C - The default llvm calling convention, compatible with C.  This
     /// convention is the only calling convention that supports varargs calls.
     /// As with typical C calling conventions, the callee/caller have to
@@ -137,7 +140,11 @@
     /// convention differs from the more common \c X86_64_SysV convention
     /// in a number of ways, most notably in that XMM registers used to pass
     /// arguments are shadowed by GPRs, and vice versa.
-    X86_64_Win64 = 79
+    X86_64_Win64 = 79,
+
+    /// \brief MSVC calling convention that passes vectors and vector aggregates
+    /// in SSE registers.
+    X86_VectorCall = 80
   };
 } // End CallingConv namespace
 

diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h
index 82ad9fc..d26991e 100644
--- a/include/llvm/IR/Constant.h
+++ b/include/llvm/IR/Constant.h

@@ -48,11 +48,16 @@
     : User(ty, vty, Ops, NumOps) {}
 
   void destroyConstantImpl();
+  void replaceUsesOfWithOnConstantImpl(Constant *Replacement);
+
 public:
   /// isNullValue - Return true if this is the value that would be returned by
   /// getNullValue.
   bool isNullValue() const;
 
+  /// \brief Returns true if the value is one.
+  bool isOneValue() const;
+
   /// isAllOnesValue - Return true if this is the value that would be returned by
   /// getAllOnesValue.
   bool isAllOnesValue() const;
@@ -64,6 +69,9 @@
   /// Return true if the value is negative zero or null value.
   bool isZeroValue() const;
 
+  /// \brief Return true if the value is not the smallest signed value.
+  bool isNotMinSignedValue() const;
+
   /// \brief Return true if the value is the smallest signed value.
   bool isMinSignedValue() const;
 

diff --git a/include/llvm/IR/ConstantFolder.h b/include/llvm/IR/ConstantFolder.h
index 86668f7..e271a14 100644
--- a/include/llvm/IR/ConstantFolder.h
+++ b/include/llvm/IR/ConstantFolder.h

@@ -159,6 +159,12 @@
   Constant *CreatePointerCast(Constant *C, Type *DestTy) const {
     return ConstantExpr::getPointerCast(C, DestTy);
   }
+
+  Constant *CreatePointerBitCastOrAddrSpaceCast(Constant *C,
+                                                Type *DestTy) const {
+    return ConstantExpr::getPointerBitCastOrAddrSpaceCast(C, DestTy);
+  }
+
   Constant *CreateIntCast(Constant *C, Type *DestTy,
                           bool isSigned) const {
     return ConstantExpr::getIntegerCast(C, DestTy, isSigned);

diff --git a/include/llvm/IR/ConstantRange.h b/include/llvm/IR/ConstantRange.h
index 342422c..3d39289 100644
--- a/include/llvm/IR/ConstantRange.h
+++ b/include/llvm/IR/ConstantRange.h

@@ -29,8 +29,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_CONSTANTRANGE_H
-#define LLVM_SUPPORT_CONSTANTRANGE_H
+#ifndef LLVM_IR_CONSTANTRANGE_H
+#define LLVM_IR_CONSTANTRANGE_H
 
 #include "llvm/ADT/APInt.h"
 #include "llvm/Support/DataTypes.h"

diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h
index 0e72f04..1b0e1b7 100644
--- a/include/llvm/IR/Constants.h
+++ b/include/llvm/IR/Constants.h

@@ -37,12 +37,8 @@
 class VectorType;
 class SequentialType;
 
-template<class ConstantClass, class TypeClass, class ValType>
-struct ConstantCreator;
-template<class ConstantClass, class TypeClass>
-struct ConstantArrayCreator;
-template<class ConstantClass, class TypeClass>
-struct ConvertConstantType;
+struct ConstantExprKeyType;
+template <class ConstantClass> struct ConstantAggrKeyType;
 
 //===----------------------------------------------------------------------===//
 /// This is the shared class of boolean and integer constants. This class
@@ -268,6 +264,9 @@
   /// isNegative - Return true if the sign bit is set.
   bool isNegative() const { return Val.isNegative(); }
 
+  /// isInfinity - Return true if the value is infinity
+  bool isInfinity() const { return Val.isInfinity(); }
+
   /// isNaN - Return true if the value is a NaN.
   bool isNaN() const { return Val.isNaN(); }
 
@@ -338,7 +337,7 @@
 /// ConstantArray - Constant Array Declarations
 ///
 class ConstantArray : public Constant {
-  friend struct ConstantArrayCreator<ConstantArray, ArrayType>;
+  friend struct ConstantAggrKeyType<ConstantArray>;
   ConstantArray(const ConstantArray &) LLVM_DELETED_FUNCTION;
 protected:
   ConstantArray(ArrayType *T, ArrayRef<Constant *> Val);
@@ -346,6 +345,10 @@
   // ConstantArray accessors
   static Constant *get(ArrayType *T, ArrayRef<Constant*> V);
 
+private:
+  static Constant *getImpl(ArrayType *T, ArrayRef<Constant *> V);
+
+public:
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
 
@@ -376,14 +379,14 @@
 // ConstantStruct - Constant Struct Declarations
 //
 class ConstantStruct : public Constant {
-  friend struct ConstantArrayCreator<ConstantStruct, StructType>;
+  friend struct ConstantAggrKeyType<ConstantStruct>;
   ConstantStruct(const ConstantStruct &) LLVM_DELETED_FUNCTION;
 protected:
   ConstantStruct(StructType *T, ArrayRef<Constant *> Val);
 public:
   // ConstantStruct accessors
   static Constant *get(StructType *T, ArrayRef<Constant*> V);
-  static Constant *get(StructType *T, ...) END_WITH_NULL;
+  static Constant *get(StructType *T, ...) LLVM_END_WITH_NULL;
 
   /// getAnon - Return an anonymous struct that has the specified
   /// elements.  If the struct is possibly empty, then you must specify a
@@ -435,7 +438,7 @@
 /// ConstantVector - Constant Vector Declarations
 ///
 class ConstantVector : public Constant {
-  friend struct ConstantArrayCreator<ConstantVector, VectorType>;
+  friend struct ConstantAggrKeyType<ConstantVector>;
   ConstantVector(const ConstantVector &) LLVM_DELETED_FUNCTION;
 protected:
   ConstantVector(VectorType *T, ArrayRef<Constant *> Val);
@@ -443,6 +446,10 @@
   // ConstantVector accessors
   static Constant *get(ArrayRef<Constant*> V);
 
+private:
+  static Constant *getImpl(ArrayRef<Constant *> V);
+
+public:
   /// getSplat - Return a ConstantVector with the specified constant in each
   /// element.
   static Constant *getSplat(unsigned NumElts, Constant *Elt);
@@ -794,9 +801,7 @@
 /// constant expressions.  The Opcode field for the ConstantExpr class is
 /// maintained in the Value::SubclassData field.
 class ConstantExpr : public Constant {
-  friend struct ConstantCreator<ConstantExpr,Type,
-                            std::pair<unsigned, std::vector<Constant*> > >;
-  friend struct ConvertConstantType<ConstantExpr, Type>;
+  friend struct ConstantExprKeyType;
 
 protected:
   ConstantExpr(Type *ty, unsigned Opcode, Use *Ops, unsigned NumOps)
@@ -856,19 +861,25 @@
                           bool HasNUW = false, bool HasNSW = false);
   static Constant *getLShr(Constant *C1, Constant *C2, bool isExact = false);
   static Constant *getAShr(Constant *C1, Constant *C2, bool isExact = false);
-  static Constant *getTrunc   (Constant *C, Type *Ty);
-  static Constant *getSExt    (Constant *C, Type *Ty);
-  static Constant *getZExt    (Constant *C, Type *Ty);
-  static Constant *getFPTrunc (Constant *C, Type *Ty);
-  static Constant *getFPExtend(Constant *C, Type *Ty);
-  static Constant *getUIToFP  (Constant *C, Type *Ty);
-  static Constant *getSIToFP  (Constant *C, Type *Ty);
-  static Constant *getFPToUI  (Constant *C, Type *Ty);
-  static Constant *getFPToSI  (Constant *C, Type *Ty);
-  static Constant *getPtrToInt(Constant *C, Type *Ty);
-  static Constant *getIntToPtr(Constant *C, Type *Ty);
-  static Constant *getBitCast (Constant *C, Type *Ty);
-  static Constant *getAddrSpaceCast(Constant *C, Type *Ty);
+  static Constant *getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced = false);
+  static Constant *getSExt(Constant *C, Type *Ty, bool OnlyIfReduced = false);
+  static Constant *getZExt(Constant *C, Type *Ty, bool OnlyIfReduced = false);
+  static Constant *getFPTrunc(Constant *C, Type *Ty,
+                              bool OnlyIfReduced = false);
+  static Constant *getFPExtend(Constant *C, Type *Ty,
+                               bool OnlyIfReduced = false);
+  static Constant *getUIToFP(Constant *C, Type *Ty, bool OnlyIfReduced = false);
+  static Constant *getSIToFP(Constant *C, Type *Ty, bool OnlyIfReduced = false);
+  static Constant *getFPToUI(Constant *C, Type *Ty, bool OnlyIfReduced = false);
+  static Constant *getFPToSI(Constant *C, Type *Ty, bool OnlyIfReduced = false);
+  static Constant *getPtrToInt(Constant *C, Type *Ty,
+                               bool OnlyIfReduced = false);
+  static Constant *getIntToPtr(Constant *C, Type *Ty,
+                               bool OnlyIfReduced = false);
+  static Constant *getBitCast(Constant *C, Type *Ty,
+                              bool OnlyIfReduced = false);
+  static Constant *getAddrSpaceCast(Constant *C, Type *Ty,
+                                    bool OnlyIfReduced = false);
 
   static Constant *getNSWNeg(Constant *C) { return getNeg(C, false, true); }
   static Constant *getNUWNeg(Constant *C) { return getNeg(C, true, false); }
@@ -923,13 +934,14 @@
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Constant);
 
-  // @brief Convenience function for getting one of the casting operations
-  // using a CastOps opcode.
-  static Constant *getCast(
-    unsigned ops,  ///< The opcode for the conversion
-    Constant *C,   ///< The constant to be converted
-    Type *Ty ///< The type to which the constant is converted
-  );
+  /// \brief Convenience function for getting a Cast operation.
+  ///
+  /// \param ops The opcode for the conversion
+  /// \param C  The constant to be converted
+  /// \param Ty The type to which the constant is converted
+  /// \param OnlyIfReduced see \a getWithOperands() docs.
+  static Constant *getCast(unsigned ops, Constant *C, Type *Ty,
+                           bool OnlyIfReduced = false);
 
   // @brief Create a ZExt or BitCast cast constant expression
   static Constant *getZExtOrBitCast(
@@ -995,44 +1007,53 @@
 
   /// Select constant expr
   ///
-  static Constant *getSelect(Constant *C, Constant *V1, Constant *V2);
+  /// \param OnlyIfReducedTy see \a getWithOperands() docs.
+  static Constant *getSelect(Constant *C, Constant *V1, Constant *V2,
+                             Type *OnlyIfReducedTy = nullptr);
 
   /// get - Return a binary or shift operator constant expression,
   /// folding if possible.
   ///
+  /// \param OnlyIfReducedTy see \a getWithOperands() docs.
   static Constant *get(unsigned Opcode, Constant *C1, Constant *C2,
-                       unsigned Flags = 0);
+                       unsigned Flags = 0, Type *OnlyIfReducedTy = nullptr);
 
-  /// @brief Return an ICmp or FCmp comparison operator constant expression.
-  static Constant *getCompare(unsigned short pred, Constant *C1, Constant *C2);
+  /// \brief Return an ICmp or FCmp comparison operator constant expression.
+  ///
+  /// \param OnlyIfReduced see \a getWithOperands() docs.
+  static Constant *getCompare(unsigned short pred, Constant *C1, Constant *C2,
+                              bool OnlyIfReduced = false);
 
   /// get* - Return some common constants without having to
   /// specify the full Instruction::OPCODE identifier.
   ///
-  static Constant *getICmp(unsigned short pred, Constant *LHS, Constant *RHS);
-  static Constant *getFCmp(unsigned short pred, Constant *LHS, Constant *RHS);
+  static Constant *getICmp(unsigned short pred, Constant *LHS, Constant *RHS,
+                           bool OnlyIfReduced = false);
+  static Constant *getFCmp(unsigned short pred, Constant *LHS, Constant *RHS,
+                           bool OnlyIfReduced = false);
 
   /// Getelementptr form.  Value* is only accepted for convenience;
   /// all elements must be Constant's.
   ///
-  static Constant *getGetElementPtr(Constant *C,
-                                    ArrayRef<Constant *> IdxList,
-                                    bool InBounds = false) {
-    return getGetElementPtr(C, makeArrayRef((Value * const *)IdxList.data(),
-                                            IdxList.size()),
-                            InBounds);
+  /// \param OnlyIfReducedTy see \a getWithOperands() docs.
+  static Constant *getGetElementPtr(Constant *C, ArrayRef<Constant *> IdxList,
+                                    bool InBounds = false,
+                                    Type *OnlyIfReducedTy = nullptr) {
+    return getGetElementPtr(
+        C, makeArrayRef((Value * const *)IdxList.data(), IdxList.size()),
+        InBounds, OnlyIfReducedTy);
   }
-  static Constant *getGetElementPtr(Constant *C,
-                                    Constant *Idx,
-                                    bool InBounds = false) {
+  static Constant *getGetElementPtr(Constant *C, Constant *Idx,
+                                    bool InBounds = false,
+                                    Type *OnlyIfReducedTy = nullptr) {
     // This form of the function only exists to avoid ambiguous overload
     // warnings about whether to convert Idx to ArrayRef<Constant *> or
     // ArrayRef<Value *>.
-    return getGetElementPtr(C, cast<Value>(Idx), InBounds);
+    return getGetElementPtr(C, cast<Value>(Idx), InBounds, OnlyIfReducedTy);
   }
-  static Constant *getGetElementPtr(Constant *C,
-                                    ArrayRef<Value *> IdxList,
-                                    bool InBounds = false);
+  static Constant *getGetElementPtr(Constant *C, ArrayRef<Value *> IdxList,
+                                    bool InBounds = false,
+                                    Type *OnlyIfReducedTy = nullptr);
 
   /// Create an "inbounds" getelementptr. See the documentation for the
   /// "inbounds" flag in LangRef.html for details.
@@ -1052,12 +1073,17 @@
     return getGetElementPtr(C, IdxList, true);
   }
 
-  static Constant *getExtractElement(Constant *Vec, Constant *Idx);
-  static Constant *getInsertElement(Constant *Vec, Constant *Elt,Constant *Idx);
-  static Constant *getShuffleVector(Constant *V1, Constant *V2, Constant *Mask);
-  static Constant *getExtractValue(Constant *Agg, ArrayRef<unsigned> Idxs);
+  static Constant *getExtractElement(Constant *Vec, Constant *Idx,
+                                     Type *OnlyIfReducedTy = nullptr);
+  static Constant *getInsertElement(Constant *Vec, Constant *Elt, Constant *Idx,
+                                    Type *OnlyIfReducedTy = nullptr);
+  static Constant *getShuffleVector(Constant *V1, Constant *V2, Constant *Mask,
+                                    Type *OnlyIfReducedTy = nullptr);
+  static Constant *getExtractValue(Constant *Agg, ArrayRef<unsigned> Idxs,
+                                   Type *OnlyIfReducedTy = nullptr);
   static Constant *getInsertValue(Constant *Agg, Constant *Val,
-                                  ArrayRef<unsigned> Idxs);
+                                  ArrayRef<unsigned> Idxs,
+                                  Type *OnlyIfReducedTy = nullptr);
 
   /// getOpcode - Return the opcode at the root of this constant expression
   unsigned getOpcode() const { return getSubclassDataFromValue(); }
@@ -1084,11 +1110,17 @@
     return getWithOperands(Ops, getType());
   }
 
-  /// getWithOperands - This returns the current constant expression with the
-  /// operands replaced with the specified values and with the specified result
-  /// type.  The specified array must have the same number of operands as our
-  /// current one.
-  Constant *getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const;
+  /// \brief Get the current expression with the operands replaced.
+  ///
+  /// Return the current constant expression with its operands replaced by
+  /// \c Ops and its type replaced by \c Ty.  \c Ops must contain the same
+  /// number of operands as the current expression.
+  ///
+  /// If \c OnlyIfReduced is \c true, nullptr will be returned unless something
+  /// gets constant-folded, the type changes, or the expression is otherwise
+  /// canonicalized.  This parameter should almost always be \c false.
+  Constant *getWithOperands(ArrayRef<Constant *> Ops, Type *Ty,
+                            bool OnlyIfReduced = false) const;
 
   /// getAsInstruction - Returns an Instruction which implements the same operation
   /// as this ConstantExpr. The instruction is not linked to any basic block.

diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h
index 2673504..3a50609 100644
--- a/include/llvm/IR/DIBuilder.h
+++ b/include/llvm/IR/DIBuilder.h

@@ -27,6 +27,7 @@
   class Function;
   class Module;
   class Value;
+  class Constant;
   class LLVMContext;
   class MDNode;
   class StringRef;
@@ -38,7 +39,6 @@
   class DIFile;
   class DIEnumerator;
   class DIType;
-  class DIArray;
   class DIGlobalVariable;
   class DIImportedEntity;
   class DINameSpace;
@@ -53,7 +53,6 @@
   class DIObjCProperty;
 
   class DIBuilder {
-    private:
     Module &M;
     LLVMContext &VMContext;
 
@@ -74,19 +73,14 @@
     SmallVector<Value *, 4> AllGVs;
     SmallVector<TrackingVH<MDNode>, 4> AllImportedModules;
 
-    // Private use for multiple types of template parameters.
-    DITemplateValueParameter
-    createTemplateValueParameter(unsigned Tag, DIDescriptor Scope,
-                                 StringRef Name, DIType Ty, Value *Val,
-                                 MDNode *File = nullptr, unsigned LineNo = 0,
-                                 unsigned ColumnNo = 0);
+    /// Each subprogram's preserved local variables.
+    DenseMap<MDNode *, std::vector<TrackingVH<MDNode>>> PreservedVariables;
 
     DIBuilder(const DIBuilder &) LLVM_DELETED_FUNCTION;
     void operator=(const DIBuilder &) LLVM_DELETED_FUNCTION;
 
-    public:
+  public:
     explicit DIBuilder(Module &M);
-    enum ComplexAddrKind { OpPlus=1, OpDeref };
     enum DebugEmissionKind { FullDebug=1, LineTablesOnly };
 
     /// finalize - Construct any deferred debug info descriptors.
@@ -218,36 +212,10 @@
     /// @param Ty         Type of the static member.
     /// @param Flags      Flags to encode member attribute, e.g. private.
     /// @param Val        Const initializer of the member.
-    DIDerivedType
-    createStaticMemberType(DIDescriptor Scope, StringRef Name,
-                           DIFile File, unsigned LineNo, DIType Ty,
-                           unsigned Flags, llvm::Value *Val);
-
-    /// createObjCIVar - Create debugging information entry for Objective-C
-    /// instance variable.
-    /// @param Name         Member name.
-    /// @param File         File where this member is defined.
-    /// @param LineNo       Line number.
-    /// @param SizeInBits   Member size.
-    /// @param AlignInBits  Member alignment.
-    /// @param OffsetInBits Member offset.
-    /// @param Flags        Flags to encode member attribute, e.g. private
-    /// @param Ty           Parent type.
-    /// @param PropertyName Name of the Objective C property associated with
-    ///                     this ivar.
-    /// @param PropertyGetterName Name of the Objective C property getter
-    ///                           selector.
-    /// @param PropertySetterName Name of the Objective C property setter
-    ///                           selector.
-    /// @param PropertyAttributes Objective C property attributes.
-    DIDerivedType createObjCIVar(StringRef Name, DIFile File,
-                                 unsigned LineNo, uint64_t SizeInBits,
-                                 uint64_t AlignInBits, uint64_t OffsetInBits,
-                                 unsigned Flags, DIType Ty,
-                                 StringRef PropertyName = StringRef(),
-                                 StringRef PropertyGetterName = StringRef(),
-                                 StringRef PropertySetterName = StringRef(),
-                                 unsigned PropertyAttributes = 0);
+    DIDerivedType createStaticMemberType(DIDescriptor Scope, StringRef Name,
+                                         DIFile File, unsigned LineNo,
+                                         DIType Ty, unsigned Flags,
+                                         llvm::Constant *Val);
 
     /// createObjCIVar - Create debugging information entry for Objective-C
     /// instance variable.
@@ -366,8 +334,8 @@
     /// @param LineNo       Line number.
     /// @param ColumnNo     Column Number.
     DITemplateValueParameter
-    createTemplateValueParameter(DIDescriptor Scope, StringRef Name,
-                                 DIType Ty, Value *Val, MDNode *File = nullptr,
+    createTemplateValueParameter(DIDescriptor Scope, StringRef Name, DIType Ty,
+                                 Constant *Val, MDNode *File = nullptr,
                                  unsigned LineNo = 0, unsigned ColumnNo = 0);
 
     /// \brief Create debugging information for a template template parameter.
@@ -435,8 +403,9 @@
     ///                        includes return type at 0th index.
     /// @param Flags           E.g.: LValueReference.
     ///                        These flags are used to emit dwarf attributes.
-    DICompositeType createSubroutineType(DIFile File, DIArray ParameterTypes,
-                                         unsigned Flags = 0);
+    DISubroutineType createSubroutineType(DIFile File,
+                                          DITypeArray ParameterTypes,
+                                          unsigned Flags = 0);
 
     /// createArtificialType - Create a new DIType with "artificial" flag set.
     DIType createArtificialType(DIType Ty);
@@ -463,44 +432,22 @@
     /// through debug info anchors.
     void retainType(DIType T);
 
-    /// createUnspecifiedParameter - Create unspecified type descriptor
+    /// createUnspecifiedParameter - Create unspecified parameter type
     /// for a subroutine type.
-    DIDescriptor createUnspecifiedParameter();
+    DIBasicType createUnspecifiedParameter();
 
     /// getOrCreateArray - Get a DIArray, create one if required.
     DIArray getOrCreateArray(ArrayRef<Value *> Elements);
 
+    /// getOrCreateTypeArray - Get a DITypeArray, create one if required.
+    DITypeArray getOrCreateTypeArray(ArrayRef<Value *> Elements);
+
     /// getOrCreateSubrange - Create a descriptor for a value range.  This
     /// implicitly uniques the values returned.
     DISubrange getOrCreateSubrange(int64_t Lo, int64_t Count);
 
-    /// createGlobalVariable - Create a new descriptor for the specified global.
-    /// @param Name        Name of the variable.
-    /// @param File        File where this variable is defined.
-    /// @param LineNo      Line number.
-    /// @param Ty          Variable Type.
-    /// @param isLocalToUnit Boolean flag indicate whether this variable is
-    ///                      externally visible or not.
-    /// @param Val         llvm::Value of the variable.
-    DIGlobalVariable
-    createGlobalVariable(StringRef Name, DIFile File, unsigned LineNo,
-                         DITypeRef Ty, bool isLocalToUnit, llvm::Value *Val);
 
-    /// \brief Create a new descriptor for the specified global.
-    /// @param Name        Name of the variable.
-    /// @param LinkageName Mangled variable name.
-    /// @param File        File where this variable is defined.
-    /// @param LineNo      Line number.
-    /// @param Ty          Variable Type.
-    /// @param isLocalToUnit Boolean flag indicate whether this variable is
-    ///                      externally visible or not.
-    /// @param Val         llvm::Value of the variable.
-    DIGlobalVariable
-    createGlobalVariable(StringRef Name, StringRef LinkageName, DIFile File,
-                         unsigned LineNo, DITypeRef Ty, bool isLocalToUnit,
-                         llvm::Value *Val);
-
-    /// createStaticVariable - Create a new descriptor for the specified
+    /// createGlobalVariable - Create a new descriptor for the specified
     /// variable.
     /// @param Context     Variable scope.
     /// @param Name        Name of the variable.
@@ -512,12 +459,19 @@
     ///                      externally visible or not.
     /// @param Val         llvm::Value of the variable.
     /// @param Decl        Reference to the corresponding declaration.
-    DIGlobalVariable
-    createStaticVariable(DIDescriptor Context, StringRef Name,
-                         StringRef LinkageName, DIFile File, unsigned LineNo,
-                         DITypeRef Ty, bool isLocalToUnit, llvm::Value *Val,
-                         MDNode *Decl = nullptr);
+    DIGlobalVariable createGlobalVariable(DIDescriptor Context, StringRef Name,
+                                          StringRef LinkageName, DIFile File,
+                                          unsigned LineNo, DITypeRef Ty,
+                                          bool isLocalToUnit,
+                                          llvm::Constant *Val,
+                                          MDNode *Decl = nullptr);
 
+    /// createTempGlobalVariableFwdDecl - Identical to createGlobalVariable
+    /// except that the resulting DbgNode is temporary and meant to be RAUWed.
+    DIGlobalVariable createTempGlobalVariableFwdDecl(
+        DIDescriptor Context, StringRef Name, StringRef LinkageName,
+        DIFile File, unsigned LineNo, DITypeRef Ty, bool isLocalToUnit,
+        llvm::Constant *Val, MDNode *Decl = nullptr);
 
     /// createLocalVariable - Create a new descriptor for the specified
     /// local variable.
@@ -540,23 +494,18 @@
                                    unsigned Flags = 0,
                                    unsigned ArgNo = 0);
 
-
-    /// createComplexVariable - Create a new descriptor for the specified
+    /// createExpression - Create a new descriptor for the specified
     /// variable which has a complex address expression for its address.
-    /// @param Tag         Dwarf TAG. Usually DW_TAG_auto_variable or
-    ///                    DW_TAG_arg_variable.
-    /// @param Scope       Variable scope.
-    /// @param Name        Variable name.
-    /// @param F           File where this variable is defined.
-    /// @param LineNo      Line number.
-    /// @param Ty          Variable Type
     /// @param Addr        An array of complex address operations.
-    /// @param ArgNo       If this variable is an argument then this argument's
-    ///                    number. 1 indicates 1st argument.
-    DIVariable createComplexVariable(unsigned Tag, DIDescriptor Scope,
-                                     StringRef Name, DIFile F, unsigned LineNo,
-                                     DITypeRef Ty, ArrayRef<Value *> Addr,
-                                     unsigned ArgNo = 0);
+    DIExpression createExpression(ArrayRef<int64_t> Addr = None);
+
+    /// createPieceExpression - Create a descriptor to describe one part
+    /// of an aggregate variable that is fragmented across multiple Values.
+    ///
+    /// @param OffsetInBytes Offset of the piece in bytes.
+    /// @param SizeInBytes   Size of the piece in bytes.
+    DIExpression createPieceExpression(unsigned OffsetInBytes,
+                                       unsigned SizeInBytes);
 
     /// createFunction - Create a new descriptor for the specified subprogram.
     /// See comments in DISubprogram for descriptions of these fields.
@@ -586,6 +535,21 @@
                                 MDNode *TParam = nullptr,
                                 MDNode *Decl = nullptr);
 
+    /// createTempFunctionFwdDecl - Identical to createFunction,
+    /// except that the resulting DbgNode is meant to be RAUWed.
+    DISubprogram createTempFunctionFwdDecl(DIDescriptor Scope, StringRef Name,
+                                           StringRef LinkageName,
+                                           DIFile File, unsigned LineNo,
+                                           DICompositeType Ty, bool isLocalToUnit,
+                                           bool isDefinition,
+                                           unsigned ScopeLine,
+                                           unsigned Flags = 0,
+                                           bool isOptimized = false,
+                                           Function *Fn = nullptr,
+                                           MDNode *TParam = nullptr,
+                                           MDNode *Decl = nullptr);
+
+
     /// FIXME: this is added for dragonegg. Once we update dragonegg
     /// to call resolve function, this will be removed.
     DISubprogram createFunction(DIScopeRef Scope, StringRef Name,
@@ -646,8 +610,9 @@
     /// lexical block as it crosses a file.
     /// @param Scope       Lexical block.
     /// @param File        Source file.
-    DILexicalBlockFile createLexicalBlockFile(DIDescriptor Scope,
-                                              DIFile File);
+    /// @param Discriminator DWARF path discriminator value.
+    DILexicalBlockFile createLexicalBlockFile(DIDescriptor Scope, DIFile File,
+                                              unsigned Discriminator = 0);
 
     /// createLexicalBlock - This creates a descriptor for a lexical block
     /// with the specified parent context.
@@ -655,10 +620,8 @@
     /// @param File          Source file.
     /// @param Line          Line number.
     /// @param Col           Column number.
-    /// @param Discriminator DWARF path discriminator value.
     DILexicalBlock createLexicalBlock(DIDescriptor Scope, DIFile File,
-                                      unsigned Line, unsigned Col,
-                                      unsigned Discriminator);
+                                      unsigned Line, unsigned Col);
 
     /// \brief Create a descriptor for an imported module.
     /// @param Context The scope this module is imported into
@@ -679,7 +642,7 @@
     /// @param Decl The declaration (or definition) of a function, type, or
     ///             variable
     /// @param Line Line number
-    DIImportedEntity createImportedDeclaration(DIScope Context, DIScope Decl,
+    DIImportedEntity createImportedDeclaration(DIScope Context, DIDescriptor Decl,
                                                unsigned Line,
                                                StringRef Name = StringRef());
     DIImportedEntity createImportedDeclaration(DIScope Context,
@@ -690,36 +653,38 @@
     /// insertDeclare - Insert a new llvm.dbg.declare intrinsic call.
     /// @param Storage     llvm::Value of the variable
     /// @param VarInfo     Variable's debug info descriptor.
+    /// @param Expr        A complex location expression.
     /// @param InsertAtEnd Location for the new intrinsic.
     Instruction *insertDeclare(llvm::Value *Storage, DIVariable VarInfo,
-                               BasicBlock *InsertAtEnd);
+                               DIExpression Expr, BasicBlock *InsertAtEnd);
 
     /// insertDeclare - Insert a new llvm.dbg.declare intrinsic call.
     /// @param Storage      llvm::Value of the variable
     /// @param VarInfo      Variable's debug info descriptor.
+    /// @param Expr         A complex location expression.
     /// @param InsertBefore Location for the new intrinsic.
     Instruction *insertDeclare(llvm::Value *Storage, DIVariable VarInfo,
-                               Instruction *InsertBefore);
-
+                               DIExpression Expr, Instruction *InsertBefore);
 
     /// insertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
     /// @param Val          llvm::Value of the variable
     /// @param Offset       Offset
     /// @param VarInfo      Variable's debug info descriptor.
+    /// @param Expr         A complex location expression.
     /// @param InsertAtEnd Location for the new intrinsic.
     Instruction *insertDbgValueIntrinsic(llvm::Value *Val, uint64_t Offset,
-                                         DIVariable VarInfo,
+                                         DIVariable VarInfo, DIExpression Expr,
                                          BasicBlock *InsertAtEnd);
 
     /// insertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
     /// @param Val          llvm::Value of the variable
     /// @param Offset       Offset
     /// @param VarInfo      Variable's debug info descriptor.
+    /// @param Expr         A complex location expression.
     /// @param InsertBefore Location for the new intrinsic.
     Instruction *insertDbgValueIntrinsic(llvm::Value *Val, uint64_t Offset,
-                                         DIVariable VarInfo,
+                                         DIVariable VarInfo, DIExpression Expr,
                                          Instruction *InsertBefore);
-
   };
 } // end namespace llvm
 

diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h
index 877029f..4580a4f 100644
--- a/include/llvm/IR/DataLayout.h
+++ b/include/llvm/IR/DataLayout.h

@@ -27,7 +27,8 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/DataTypes.h"
 
-// this needs to be outside of the namespace, to avoid conflict with llvm-c decl
+// This needs to be outside of the namespace, to avoid conflict with llvm-c
+// decl.
 typedef struct LLVMOpaqueTargetData *LLVMTargetDataRef;
 
 namespace llvm {
@@ -45,79 +46,71 @@
 
 /// Enum used to categorize the alignment types stored by LayoutAlignElem
 enum AlignTypeEnum {
-  INVALID_ALIGN = 0,                 ///< An invalid alignment
-  INTEGER_ALIGN = 'i',               ///< Integer type alignment
-  VECTOR_ALIGN = 'v',                ///< Vector type alignment
-  FLOAT_ALIGN = 'f',                 ///< Floating point type alignment
-  AGGREGATE_ALIGN = 'a'              ///< Aggregate alignment
+  INVALID_ALIGN = 0,
+  INTEGER_ALIGN = 'i',
+  VECTOR_ALIGN = 'v',
+  FLOAT_ALIGN = 'f',
+  AGGREGATE_ALIGN = 'a'
 };
 
-/// Layout alignment element.
+/// \brief Layout alignment element.
 ///
 /// Stores the alignment data associated with a given alignment type (integer,
 /// vector, float) and type bit width.
 ///
-/// @note The unusual order of elements in the structure attempts to reduce
+/// \note The unusual order of elements in the structure attempts to reduce
 /// padding and make the structure slightly more cache friendly.
 struct LayoutAlignElem {
-  unsigned AlignType    : 8;  ///< Alignment type (AlignTypeEnum)
-  unsigned TypeBitWidth : 24; ///< Type bit width
-  unsigned ABIAlign     : 16; ///< ABI alignment for this type/bitw
-  unsigned PrefAlign    : 16; ///< Pref. alignment for this type/bitw
+  /// \brief Alignment type from \c AlignTypeEnum
+  unsigned AlignType : 8;
+  unsigned TypeBitWidth : 24;
+  unsigned ABIAlign : 16;
+  unsigned PrefAlign : 16;
 
-  /// Initializer
   static LayoutAlignElem get(AlignTypeEnum align_type, unsigned abi_align,
                              unsigned pref_align, uint32_t bit_width);
-  /// Equality predicate
   bool operator==(const LayoutAlignElem &rhs) const;
 };
 
-/// Layout pointer alignment element.
+/// \brief Layout pointer alignment element.
 ///
 /// Stores the alignment data associated with a given pointer and address space.
 ///
-/// @note The unusual order of elements in the structure attempts to reduce
+/// \note The unusual order of elements in the structure attempts to reduce
 /// padding and make the structure slightly more cache friendly.
 struct PointerAlignElem {
-  unsigned            ABIAlign;       ///< ABI alignment for this type/bitw
-  unsigned            PrefAlign;      ///< Pref. alignment for this type/bitw
-  uint32_t            TypeByteWidth;  ///< Type byte width
-  uint32_t            AddressSpace;   ///< Address space for the pointer type
+  unsigned ABIAlign;
+  unsigned PrefAlign;
+  uint32_t TypeByteWidth;
+  uint32_t AddressSpace;
 
   /// Initializer
   static PointerAlignElem get(uint32_t AddressSpace, unsigned ABIAlign,
-                             unsigned PrefAlign, uint32_t TypeByteWidth);
-  /// Equality predicate
+                              unsigned PrefAlign, uint32_t TypeByteWidth);
   bool operator==(const PointerAlignElem &rhs) const;
 };
 
-/// This class holds a parsed version of the target data layout string in a
-/// module and provides methods for querying it. The target data layout string
-/// is specified *by the target* - a frontend generating LLVM IR is required to
-/// generate the right target data for the target being codegen'd to.
+/// \brief A parsed version of the target data layout string in a module, and
+/// methods for querying it.
+///
+/// The target data layout string is specified *by the target* - a frontend
+/// generating LLVM IR is required to generate the right target data for the
+/// target being codegen'd to.
 class DataLayout {
 private:
-  bool          LittleEndian;          ///< Defaults to false
-  unsigned      StackNaturalAlign;     ///< Stack natural alignment
+  /// Defaults to false.
+  bool BigEndian;
 
-  enum ManglingModeT {
-    MM_None,
-    MM_ELF,
-    MM_MachO,
-    MM_WINCOFF,
-    MM_Mips
-  };
+  unsigned StackNaturalAlign;
+
+  enum ManglingModeT { MM_None, MM_ELF, MM_MachO, MM_WINCOFF, MM_Mips };
   ManglingModeT ManglingMode;
 
-  SmallVector<unsigned char, 8> LegalIntWidths; ///< Legal Integers.
+  SmallVector<unsigned char, 8> LegalIntWidths;
 
-  /// Alignments - Where the primitive type alignment data is stored.
-  ///
-  /// @sa reset().
-  /// @note Could support multiple size pointer alignments, e.g., 32-bit
-  /// pointers vs. 64-bit pointers by extending LayoutAlignment, but for now,
-  /// we don't.
+  /// \brief Primitive type alignment data.
   SmallVector<LayoutAlignElem, 16> Alignments;
+
   typedef SmallVector<PointerAlignElem, 8> PointersTy;
   PointersTy Pointers;
 
@@ -128,31 +121,28 @@
 
   PointersTy::iterator findPointerLowerBound(uint32_t AddressSpace);
 
-  /// InvalidAlignmentElem - This member is a signal that a requested alignment
-  /// type and bit width were not found in the SmallVector.
+  /// This member is a signal that a requested alignment type and bit width were
+  /// not found in the SmallVector.
   static const LayoutAlignElem InvalidAlignmentElem;
 
-  /// InvalidPointerElem - This member is a signal that a requested pointer
-  /// type and bit width were not found in the DenseSet.
+  /// This member is a signal that a requested pointer type and bit width were
+  /// not found in the SmallVector.
   static const PointerAlignElem InvalidPointerElem;
 
   // The StructType -> StructLayout map.
   mutable void *LayoutMap;
 
-  //! Set/initialize target alignments
   void setAlignment(AlignTypeEnum align_type, unsigned abi_align,
                     unsigned pref_align, uint32_t bit_width);
   unsigned getAlignmentInfo(AlignTypeEnum align_type, uint32_t bit_width,
                             bool ABIAlign, Type *Ty) const;
-
-  //! Set/initialize pointer alignments
   void setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign,
                            unsigned PrefAlign, uint32_t TypeByteWidth);
 
-  //! Internal helper method that returns requested alignment for type.
+  /// Internal helper method that returns requested alignment for type.
   unsigned getAlignment(Type *Ty, bool abi_or_pref) const;
 
-  /// Valid alignment predicate.
+  /// \brief Valid alignment predicate.
   ///
   /// Predicate that tests a LayoutAlignElem reference returned by get() against
   /// InvalidAlignmentElem.
@@ -160,10 +150,10 @@
     return &align != &InvalidAlignmentElem;
   }
 
-  /// Valid pointer predicate.
+  /// \brief Valid pointer predicate.
   ///
-  /// Predicate that tests a PointerAlignElem reference returned by get() against
-  /// InvalidPointerElem.
+  /// Predicate that tests a PointerAlignElem reference returned by get()
+  /// against \c InvalidPointerElem.
   bool validPointer(const PointerAlignElem &align) const {
     return &align != &InvalidPointerElem;
   }
@@ -184,11 +174,13 @@
   /// Initialize target data from properties stored in the module.
   explicit DataLayout(const Module *M);
 
+  void init(const Module *M);
+
   DataLayout(const DataLayout &DL) : LayoutMap(nullptr) { *this = DL; }
 
   DataLayout &operator=(const DataLayout &DL) {
     clear();
-    LittleEndian = DL.isLittleEndian();
+    BigEndian = DL.isBigEndian();
     StackNaturalAlign = DL.StackNaturalAlign;
     ManglingMode = DL.ManglingMode;
     LegalIntWidths = DL.LegalIntWidths;
@@ -200,27 +192,28 @@
   bool operator==(const DataLayout &Other) const;
   bool operator!=(const DataLayout &Other) const { return !(*this == Other); }
 
-  ~DataLayout();  // Not virtual, do not subclass this class
+  ~DataLayout(); // Not virtual, do not subclass this class
 
   /// Parse a data layout string (with fallback to default values).
   void reset(StringRef LayoutDescription);
 
   /// Layout endianness...
-  bool isLittleEndian() const { return LittleEndian; }
-  bool isBigEndian() const { return !LittleEndian; }
+  bool isLittleEndian() const { return !BigEndian; }
+  bool isBigEndian() const { return BigEndian; }
 
-  /// getStringRepresentation - Return the string representation of the
-  /// DataLayout.  This representation is in the same format accepted by the
-  /// string constructor above.
+  /// \brief Returns the string representation of the DataLayout.
+  ///
+  /// This representation is in the same format accepted by the string
+  /// constructor above.
   std::string getStringRepresentation() const;
 
-  /// isLegalInteger - This function returns true if the specified type is
-  /// known to be a native integer type supported by the CPU.  For example,
-  /// i64 is not native on most 32-bit CPUs and i37 is not native on any known
-  /// one.  This returns false if the integer width is not legal.
+  /// \brief Returns true if the specified type is known to be a native integer
+  /// type supported by the CPU.
+  ///
+  /// For example, i64 is not native on most 32-bit CPUs and i37 is not native
+  /// on any known one. This returns false if the integer width is not legal.
   ///
   /// The width is specified in bits.
-  ///
   bool isLegalInteger(unsigned Width) const {
     for (unsigned LegalIntWidth : LegalIntWidths)
       if (LegalIntWidth == Width)
@@ -228,9 +221,7 @@
     return false;
   }
 
-  bool isIllegalInteger(unsigned Width) const {
-    return !isLegalInteger(Width);
-  }
+  bool isIllegalInteger(unsigned Width) const { return !isLegalInteger(Width); }
 
   /// Returns true if the given alignment exceeds the natural stack alignment.
   bool exceedsNaturalStackAlignment(unsigned Align) const {
@@ -241,9 +232,7 @@
     return ManglingMode == MM_WINCOFF;
   }
 
-  bool hasLinkerPrivateGlobalPrefix() const {
-    return ManglingMode == MM_MachO;
-  }
+  bool hasLinkerPrivateGlobalPrefix() const { return ManglingMode == MM_MachO; }
 
   const char *getLinkerPrivateGlobalPrefix() const {
     if (ManglingMode == MM_MachO)
@@ -281,10 +270,11 @@
 
   static const char *getManglingComponent(const Triple &T);
 
-  /// fitsInLegalInteger - This function returns true if the specified type fits
-  /// in a native integer type supported by the CPU.  For example, if the CPU
-  /// only supports i32 as a native integer type, then i27 fits in a legal
-  /// integer type but i45 does not.
+  /// \brief Returns true if the specified type fits in a native integer type
+  /// supported by the CPU.
+  ///
+  /// For example, if the CPU only supports i32 as a native integer type, then
+  /// i27 fits in a legal integer type but i45 does not.
   bool fitsInLegalInteger(unsigned Width) const {
     for (unsigned LegalIntWidth : LegalIntWidths)
       if (Width <= LegalIntWidth)
@@ -342,118 +332,116 @@
   /// [*] The alloc size depends on the alignment, and thus on the target.
   ///     These values are for x86-32 linux.
 
-  /// getTypeSizeInBits - Return the number of bits necessary to hold the
-  /// specified type.  For example, returns 36 for i36 and 80 for x86_fp80.
-  /// The type passed must have a size (Type::isSized() must return true).
+  /// \brief Returns the number of bits necessary to hold the specified type.
+  ///
+  /// For example, returns 36 for i36 and 80 for x86_fp80. The type passed must
+  /// have a size (Type::isSized() must return true).
   uint64_t getTypeSizeInBits(Type *Ty) const;
 
-  /// getTypeStoreSize - Return the maximum number of bytes that may be
-  /// overwritten by storing the specified type.  For example, returns 5
-  /// for i36 and 10 for x86_fp80.
+  /// \brief Returns the maximum number of bytes that may be overwritten by
+  /// storing the specified type.
+  ///
+  /// For example, returns 5 for i36 and 10 for x86_fp80.
   uint64_t getTypeStoreSize(Type *Ty) const {
-    return (getTypeSizeInBits(Ty)+7)/8;
+    return (getTypeSizeInBits(Ty) + 7) / 8;
   }
 
-  /// getTypeStoreSizeInBits - Return the maximum number of bits that may be
-  /// overwritten by storing the specified type; always a multiple of 8.  For
-  /// example, returns 40 for i36 and 80 for x86_fp80.
+  /// \brief Returns the maximum number of bits that may be overwritten by
+  /// storing the specified type; always a multiple of 8.
+  ///
+  /// For example, returns 40 for i36 and 80 for x86_fp80.
   uint64_t getTypeStoreSizeInBits(Type *Ty) const {
-    return 8*getTypeStoreSize(Ty);
+    return 8 * getTypeStoreSize(Ty);
   }
 
-  /// getTypeAllocSize - Return the offset in bytes between successive objects
-  /// of the specified type, including alignment padding.  This is the amount
-  /// that alloca reserves for this type.  For example, returns 12 or 16 for
-  /// x86_fp80, depending on alignment.
+  /// \brief Returns the offset in bytes between successive objects of the
+  /// specified type, including alignment padding.
+  ///
+  /// This is the amount that alloca reserves for this type. For example,
+  /// returns 12 or 16 for x86_fp80, depending on alignment.
   uint64_t getTypeAllocSize(Type *Ty) const {
     // Round up to the next alignment boundary.
-    return RoundUpAlignment(getTypeStoreSize(Ty), getABITypeAlignment(Ty));
+    return RoundUpToAlignment(getTypeStoreSize(Ty), getABITypeAlignment(Ty));
   }
 
-  /// getTypeAllocSizeInBits - Return the offset in bits between successive
-  /// objects of the specified type, including alignment padding; always a
-  /// multiple of 8.  This is the amount that alloca reserves for this type.
-  /// For example, returns 96 or 128 for x86_fp80, depending on alignment.
+  /// \brief Returns the offset in bits between successive objects of the
+  /// specified type, including alignment padding; always a multiple of 8.
+  ///
+  /// This is the amount that alloca reserves for this type. For example,
+  /// returns 96 or 128 for x86_fp80, depending on alignment.
   uint64_t getTypeAllocSizeInBits(Type *Ty) const {
-    return 8*getTypeAllocSize(Ty);
+    return 8 * getTypeAllocSize(Ty);
   }
 
-  /// getABITypeAlignment - Return the minimum ABI-required alignment for the
-  /// specified type.
+  /// \brief Returns the minimum ABI-required alignment for the specified type.
   unsigned getABITypeAlignment(Type *Ty) const;
 
-  /// getABIIntegerTypeAlignment - Return the minimum ABI-required alignment for
-  /// an integer type of the specified bitwidth.
+  /// \brief Returns the minimum ABI-required alignment for an integer type of
+  /// the specified bitwidth.
   unsigned getABIIntegerTypeAlignment(unsigned BitWidth) const;
 
-  /// getPrefTypeAlignment - Return the preferred stack/global alignment for
-  /// the specified type.  This is always at least as good as the ABI alignment.
+  /// \brief Returns the preferred stack/global alignment for the specified
+  /// type.
+  ///
+  /// This is always at least as good as the ABI alignment.
   unsigned getPrefTypeAlignment(Type *Ty) const;
 
-  /// getPreferredTypeAlignmentShift - Return the preferred alignment for the
-  /// specified type, returned as log2 of the value (a shift amount).
+  /// \brief Returns the preferred alignment for the specified type, returned as
+  /// log2 of the value (a shift amount).
   unsigned getPreferredTypeAlignmentShift(Type *Ty) const;
 
-  /// getIntPtrType - Return an integer type with size at least as big as that
-  /// of a pointer in the given address space.
+  /// \brief Returns an integer type with size at least as big as that of a
+  /// pointer in the given address space.
   IntegerType *getIntPtrType(LLVMContext &C, unsigned AddressSpace = 0) const;
 
-  /// getIntPtrType - Return an integer (vector of integer) type with size at
-  /// least as big as that of a pointer of the given pointer (vector of pointer)
-  /// type.
+  /// \brief Returns an integer (vector of integer) type with size at least as
+  /// big as that of a pointer of the given pointer (vector of pointer) type.
   Type *getIntPtrType(Type *) const;
 
-  /// getSmallestLegalIntType - Return the smallest integer type with size at
-  /// least as big as Width bits.
+  /// \brief Returns the smallest integer type with size at least as big as
+  /// Width bits.
   Type *getSmallestLegalIntType(LLVMContext &C, unsigned Width = 0) const;
 
-  /// getLargestLegalIntType - Return the largest legal integer type, or null if
-  /// none are set.
+  /// \brief Returns the largest legal integer type, or null if none are set.
   Type *getLargestLegalIntType(LLVMContext &C) const {
     unsigned LargestSize = getLargestLegalIntTypeSize();
     return (LargestSize == 0) ? nullptr : Type::getIntNTy(C, LargestSize);
   }
 
-  /// getLargestLegalIntTypeSize - Return the size of largest legal integer
-  /// type size, or 0 if none are set.
+  /// \brief Returns the size of the largest legal integer type, or 0 if none
+  /// are set.
   unsigned getLargestLegalIntTypeSize() const;
 
-  /// getIndexedOffset - return the offset from the beginning of the type for
-  /// the specified indices.  This is used to implement getelementptr.
+  /// \brief Returns the offset from the beginning of the type for the specified
+  /// indices.
+  ///
+  /// This is used to implement getelementptr.
   uint64_t getIndexedOffset(Type *Ty, ArrayRef<Value *> Indices) const;
 
-  /// getStructLayout - Return a StructLayout object, indicating the alignment
-  /// of the struct, its size, and the offsets of its fields.  Note that this
-  /// information is lazily cached.
+  /// \brief Returns a StructLayout object, indicating the alignment of the
+  /// struct, its size, and the offsets of its fields.
+  ///
+  /// Note that this information is lazily cached.
   const StructLayout *getStructLayout(StructType *Ty) const;
 
-  /// getPreferredAlignment - Return the preferred alignment of the specified
-  /// global.  This includes an explicitly requested alignment (if the global
-  /// has one).
+  /// \brief Returns the preferred alignment of the specified global.
+  ///
+  /// This includes an explicitly requested alignment (if the global has one).
   unsigned getPreferredAlignment(const GlobalVariable *GV) const;
 
-  /// getPreferredAlignmentLog - Return the preferred alignment of the
-  /// specified global, returned in log form.  This includes an explicitly
-  /// requested alignment (if the global has one).
+  /// \brief Returns the preferred alignment of the specified global, returned
+  /// in log form.
+  ///
+  /// This includes an explicitly requested alignment (if the global has one).
   unsigned getPreferredAlignmentLog(const GlobalVariable *GV) const;
-
-  /// RoundUpAlignment - Round the specified value up to the next alignment
-  /// boundary specified by Alignment.  For example, 7 rounded up to an
-  /// alignment boundary of 4 is 8.  8 rounded up to the alignment boundary of 4
-  /// is 8 because it is already aligned.
-  template <typename UIntTy>
-  static UIntTy RoundUpAlignment(UIntTy Val, unsigned Alignment) {
-    assert((Alignment & (Alignment-1)) == 0 && "Alignment must be power of 2!");
-    return (Val + (Alignment-1)) & ~UIntTy(Alignment-1);
-  }
 };
 
 inline DataLayout *unwrap(LLVMTargetDataRef P) {
-   return reinterpret_cast<DataLayout*>(P);
+  return reinterpret_cast<DataLayout *>(P);
 }
 
 inline LLVMTargetDataRef wrap(const DataLayout *P) {
-   return reinterpret_cast<LLVMTargetDataRef>(const_cast<DataLayout*>(P));
+  return reinterpret_cast<LLVMTargetDataRef>(const_cast<DataLayout *>(P));
 }
 
 class DataLayoutPass : public ImmutablePass {
@@ -466,40 +454,28 @@
 
   const DataLayout &getDataLayout() const { return DL; }
 
-  // For use with the C API. C++ code should always use the constructor that
-  // takes a module.
-  explicit DataLayoutPass(const DataLayout &DL);
-
-  explicit DataLayoutPass(const Module *M);
-
   static char ID; // Pass identification, replacement for typeid
+
+  bool doFinalization(Module &M) override;
+  bool doInitialization(Module &M) override;
 };
 
-/// StructLayout - used to lazily calculate structure layout information for a
-/// target machine, based on the DataLayout structure.
-///
+/// Used to lazily calculate structure layout information for a target machine,
+/// based on the DataLayout structure.
 class StructLayout {
   uint64_t StructSize;
   unsigned StructAlignment;
   unsigned NumElements;
-  uint64_t MemberOffsets[1];  // variable sized array!
+  uint64_t MemberOffsets[1]; // variable sized array!
 public:
+  uint64_t getSizeInBytes() const { return StructSize; }
 
-  uint64_t getSizeInBytes() const {
-    return StructSize;
-  }
+  uint64_t getSizeInBits() const { return 8 * StructSize; }
 
-  uint64_t getSizeInBits() const {
-    return 8*StructSize;
-  }
+  unsigned getAlignment() const { return StructAlignment; }
 
-  unsigned getAlignment() const {
-    return StructAlignment;
-  }
-
-  /// getElementContainingOffset - Given a valid byte offset into the structure,
-  /// return the structure index that contains it.
-  ///
+  /// \brief Given a valid byte offset into the structure, returns the structure
+  /// index that contains it.
   unsigned getElementContainingOffset(uint64_t Offset) const;
 
   uint64_t getElementOffset(unsigned Idx) const {
@@ -508,15 +484,14 @@
   }
 
   uint64_t getElementOffsetInBits(unsigned Idx) const {
-    return getElementOffset(Idx)*8;
+    return getElementOffset(Idx) * 8;
   }
 
 private:
-  friend class DataLayout;   // Only DataLayout can create this class
+  friend class DataLayout; // Only DataLayout can create this class
   StructLayout(StructType *ST, const DataLayout &DL);
 };
 
-
 // The implementation of this method is provided inline as it is particularly
 // well suited to constant folding when called on a specific Type subclass.
 inline uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const {
@@ -546,7 +521,7 @@
   case Type::PPC_FP128TyID:
   case Type::FP128TyID:
     return 128;
-    // In memory objects this is always aligned to a higher boundary, but
+  // In memory objects this is always aligned to a higher boundary, but
   // only 80 bits contain information.
   case Type::X86_FP80TyID:
     return 80;

diff --git a/include/llvm/IR/DebugInfo.h b/include/llvm/IR/DebugInfo.h
index 088eb9f..22a2138 100644
--- a/include/llvm/IR/DebugInfo.h
+++ b/include/llvm/IR/DebugInfo.h

@@ -25,6 +25,8 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <iterator>
 
 namespace llvm {
 class BasicBlock;
@@ -37,6 +39,7 @@
 class DbgDeclareInst;
 class DbgValueInst;
 class Instruction;
+class Metadata;
 class MDNode;
 class MDString;
 class NamedMDNode;
@@ -52,21 +55,78 @@
 class DIScope;
 class DIObjCProperty;
 
-/// Maps from type identifier to the actual MDNode.
+/// \brief Maps from type identifier to the actual MDNode.
 typedef DenseMap<const MDString *, MDNode *> DITypeIdentifierMap;
 
-/// DIDescriptor - A thin wraper around MDNode to access encoded debug info.
-/// This should not be stored in a container, because the underlying MDNode
-/// may change in certain situations.
+class DIHeaderFieldIterator
+    : public std::iterator<std::input_iterator_tag, StringRef, std::ptrdiff_t,
+                           const StringRef *, StringRef> {
+  StringRef Header;
+  StringRef Current;
+
+public:
+  DIHeaderFieldIterator() {}
+  DIHeaderFieldIterator(StringRef Header)
+      : Header(Header), Current(Header.slice(0, Header.find('\0'))) {}
+  StringRef operator*() const { return Current; }
+  const StringRef * operator->() const { return &Current; }
+  DIHeaderFieldIterator &operator++() {
+    increment();
+    return *this;
+  }
+  DIHeaderFieldIterator operator++(int) {
+    DIHeaderFieldIterator X(*this);
+    increment();
+    return X;
+  }
+  bool operator==(const DIHeaderFieldIterator &X) const {
+    return Current.data() == X.Current.data();
+  }
+  bool operator!=(const DIHeaderFieldIterator &X) const {
+    return !(*this == X);
+  }
+
+  StringRef getHeader() const { return Header; }
+  StringRef getCurrent() const { return Current; }
+  StringRef getPrefix() const {
+    if (Current.begin() == Header.begin())
+      return StringRef();
+    return Header.slice(0, Current.begin() - Header.begin() - 1);
+  }
+  StringRef getSuffix() const {
+    if (Current.end() == Header.end())
+      return StringRef();
+    return Header.slice(Current.end() - Header.begin() + 1, StringRef::npos);
+  }
+
+private:
+  void increment() {
+    assert(Current.data() != nullptr && "Cannot increment past the end");
+    StringRef Suffix = getSuffix();
+    Current = Suffix.slice(0, Suffix.find('\0'));
+  }
+};
+
+/// \brief A thin wrapper around MDNode to access encoded debug info.
+///
+/// This should not be stored in a container, because the underlying MDNode may
+/// change in certain situations.
 class DIDescriptor {
   // Befriends DIRef so DIRef can befriend the protected member
   // function: getFieldAs<DIRef>.
   template <typename T> friend class DIRef;
 
 public:
+  /// \brief Accessibility flags.
+  ///
+  /// The three accessibility flags are mutually exclusive and rolled together
+  /// in the first two bits.
   enum {
-    FlagPrivate           = 1 << 0,
-    FlagProtected         = 1 << 1,
+    FlagAccessibility     = 1 << 0 | 1 << 1,
+    FlagPrivate           = 1,
+    FlagProtected         = 2,
+    FlagPublic            = 3,
+
     FlagFwdDecl           = 1 << 2,
     FlagAppleBlock        = 1 << 3,
     FlagBlockByrefStruct  = 1 << 4,
@@ -121,12 +181,36 @@
   bool operator==(DIDescriptor Other) const { return DbgNode == Other.DbgNode; }
   bool operator!=(DIDescriptor Other) const { return !operator==(Other); }
 
-  uint16_t getTag() const {
-    return getUnsignedField(0) & ~LLVMDebugVersionMask;
+  StringRef getHeader() const {
+    return getStringField(0);
   }
 
+  size_t getNumHeaderFields() const {
+    return std::distance(DIHeaderFieldIterator(getHeader()),
+                         DIHeaderFieldIterator());
+  }
+
+  StringRef getHeaderField(unsigned Index) const {
+    // Since callers expect an empty string for out-of-range accesses, we can't
+    // use std::advance() here.
+    for (DIHeaderFieldIterator I(getHeader()), E; I != E; ++I, --Index)
+      if (!Index)
+        return *I;
+    return StringRef();
+  }
+
+  template <class T> T getHeaderFieldAs(unsigned Index) const {
+    T Int;
+    if (getHeaderField(Index).getAsInteger(0, Int))
+      return 0;
+    return Int;
+  }
+
+  uint16_t getTag() const { return getHeaderFieldAs<uint16_t>(0); }
+
   bool isDerivedType() const;
   bool isCompositeType() const;
+  bool isSubroutineType() const;
   bool isBasicType() const;
   bool isVariable() const;
   bool isSubprogram() const;
@@ -140,20 +224,21 @@
   bool isSubrange() const;
   bool isEnumerator() const;
   bool isType() const;
-  bool isUnspecifiedParameter() const;
   bool isTemplateTypeParameter() const;
   bool isTemplateValueParameter() const;
   bool isObjCProperty() const;
   bool isImportedEntity() const;
+  bool isExpression() const;
 
-  /// print - print descriptor.
   void print(raw_ostream &OS) const;
-
-  /// dump - print descriptor to dbgs() with a newline.
   void dump() const;
+
+  /// \brief Replace all uses of debug info referenced by this descriptor.
+  void replaceAllUsesWith(LLVMContext &VMContext, DIDescriptor D);
+  void replaceAllUsesWith(MDNode *D);
 };
 
-/// DISubrange - This is used to represent ranges, for array bounds.
+/// \brief This is used to represent ranges, for array bounds.
 class DISubrange : public DIDescriptor {
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
@@ -161,23 +246,27 @@
 public:
   explicit DISubrange(const MDNode *N = nullptr) : DIDescriptor(N) {}
 
-  int64_t getLo() const { return getInt64Field(1); }
-  int64_t getCount() const { return getInt64Field(2); }
+  int64_t getLo() const { return getHeaderFieldAs<int64_t>(1); }
+  int64_t getCount() const { return getHeaderFieldAs<int64_t>(2); }
   bool Verify() const;
 };
 
-/// DIArray - This descriptor holds an array of descriptors.
-class DIArray : public DIDescriptor {
+/// \brief This descriptor holds an array of nodes with type T.
+template <typename T> class DITypedArray : public DIDescriptor {
 public:
-  explicit DIArray(const MDNode *N = nullptr) : DIDescriptor(N) {}
-
-  unsigned getNumElements() const;
-  DIDescriptor getElement(unsigned Idx) const {
-    return getDescriptorField(Idx);
+  explicit DITypedArray(const MDNode *N = nullptr) : DIDescriptor(N) {}
+  unsigned getNumElements() const {
+    return DbgNode ? DbgNode->getNumOperands() : 0;
+  }
+  T getElement(unsigned Idx) const {
+    return getFieldAs<T>(Idx);
   }
 };
 
-/// DIEnumerator - A wrapper for an enumerator (e.g. X and Y in 'enum {X,Y}').
+typedef DITypedArray<DIDescriptor> DIArray;
+
+/// \brief A wrapper for an enumerator (e.g. X and Y in 'enum {X,Y}').
+///
 /// FIXME: it seems strange that this doesn't have either a reference to the
 /// type/precision or a file/line pair for location info.
 class DIEnumerator : public DIDescriptor {
@@ -187,16 +276,17 @@
 public:
   explicit DIEnumerator(const MDNode *N = nullptr) : DIDescriptor(N) {}
 
-  StringRef getName() const { return getStringField(1); }
-  int64_t getEnumValue() const { return getInt64Field(2); }
+  StringRef getName() const { return getHeaderField(1); }
+  int64_t getEnumValue() const { return getHeaderFieldAs<int64_t>(2); }
   bool Verify() const;
 };
 
 template <typename T> class DIRef;
 typedef DIRef<DIScope> DIScopeRef;
 typedef DIRef<DIType> DITypeRef;
+typedef DITypedArray<DITypeRef> DITypeArray;
 
-/// DIScope - A base class for various scopes.
+/// \brief A base class for various scopes.
 ///
 /// Although, implementation-wise, DIScope is the parent class of most
 /// other DIxxx classes, including DIType and its descendants, most of
@@ -212,21 +302,28 @@
 public:
   explicit DIScope(const MDNode *N = nullptr) : DIDescriptor(N) {}
 
-  /// Gets the parent scope for this scope node or returns a
-  /// default constructed scope.
+  /// \brief Get the parent scope.
+  ///
+  /// Gets the parent scope for this scope node or returns a default
+  /// constructed scope.
   DIScopeRef getContext() const;
+  /// \brief Get the scope name.
+  ///
   /// If the scope node has a name, return that, else return an empty string.
   StringRef getName() const;
   StringRef getFilename() const;
   StringRef getDirectory() const;
 
-  /// Generate a reference to this DIScope. Uses the type identifier instead
-  /// of the actual MDNode if possible, to help type uniquing.
+  /// \brief Generate a reference to this DIScope.
+  ///
+  /// Uses the type identifier instead of the actual MDNode if possible, to
+  /// help type uniquing.
   DIScopeRef getRef() const;
 };
 
-/// Represents reference to a DIDescriptor, abstracts over direct and
-/// identifier-based metadata references.
+/// \brief Represents reference to a DIDescriptor.
+///
+/// Abstracts over direct and identifier-based metadata references.
 template <typename T> class DIRef {
   template <typename DescTy>
   friend DescTy DIDescriptor::getFieldAs(unsigned Elt) const;
@@ -234,15 +331,16 @@
   friend DIScopeRef DIScope::getRef() const;
   friend class DIType;
 
-  /// Val can be either a MDNode or a MDString, in the latter,
-  /// MDString specifies the type identifier.
-  const Value *Val;
-  explicit DIRef(const Value *V);
+  /// \brief Val can be either a MDNode or a MDString.
+  ///
+  /// In the latter, MDString specifies the type identifier.
+  const Metadata *Val;
+  explicit DIRef(const Metadata *V);
 
 public:
   T resolve(const DITypeIdentifierMap &Map) const;
   StringRef getName() const;
-  operator Value *() const { return const_cast<Value *>(Val); }
+  operator Metadata *() const { return const_cast<Metadata *>(Val); }
 };
 
 template <typename T>
@@ -273,17 +371,18 @@
   return MS->getString();
 }
 
-/// Specialize getFieldAs to handle fields that are references to DIScopes.
+/// \brief Handle fields that are references to DIScopes.
 template <> DIScopeRef DIDescriptor::getFieldAs<DIScopeRef>(unsigned Elt) const;
-/// Specialize DIRef constructor for DIScopeRef.
-template <> DIRef<DIScope>::DIRef(const Value *V);
+/// \brief Specialize DIRef constructor for DIScopeRef.
+template <> DIRef<DIScope>::DIRef(const Metadata *V);
 
-/// Specialize getFieldAs to handle fields that are references to DITypes.
+/// \brief Handle fields that are references to DITypes.
 template <> DITypeRef DIDescriptor::getFieldAs<DITypeRef>(unsigned Elt) const;
-/// Specialize DIRef constructor for DITypeRef.
-template <> DIRef<DIType>::DIRef(const Value *V);
+/// \brief Specialize DIRef constructor for DITypeRef.
+template <> DIRef<DIType>::DIRef(const Metadata *V);
 
-/// DIType - This is a wrapper for a type.
+/// \brief This is a wrapper for a type.
+///
 /// FIXME: Types should be factored much better so that CV qualifiers and
 /// others do not require a huge and empty descriptor full of zeros.
 class DIType : public DIScope {
@@ -299,22 +398,35 @@
     return DITypeRef(&*getRef());
   }
 
-  /// Verify - Verify that a type descriptor is well formed.
   bool Verify() const;
 
   DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(2); }
-  StringRef getName() const { return getStringField(3); }
-  unsigned getLineNumber() const { return getUnsignedField(4); }
-  uint64_t getSizeInBits() const { return getUInt64Field(5); }
-  uint64_t getAlignInBits() const { return getUInt64Field(6); }
+  StringRef getName() const { return getHeaderField(1); }
+  unsigned getLineNumber() const {
+    return getHeaderFieldAs<unsigned>(2);
+  }
+  uint64_t getSizeInBits() const {
+    return getHeaderFieldAs<uint64_t>(3); // Parse at full 64-bit width.
+  }
+  uint64_t getAlignInBits() const {
+    return getHeaderFieldAs<uint64_t>(4); // Parse at full 64-bit width.
+  }
   // FIXME: Offset is only used for DW_TAG_member nodes.  Making every type
   // carry this is just plain insane.
-  uint64_t getOffsetInBits() const { return getUInt64Field(7); }
-  unsigned getFlags() const { return getUnsignedField(8); }
-  bool isPrivate() const { return (getFlags() & FlagPrivate) != 0; }
-  bool isProtected() const { return (getFlags() & FlagProtected) != 0; }
+  uint64_t getOffsetInBits() const {
+    return getHeaderFieldAs<uint64_t>(5); // Parse at full 64-bit width.
+  }
+  unsigned getFlags() const { return getHeaderFieldAs<unsigned>(6); }
+  bool isPrivate() const {
+    return (getFlags() & FlagAccessibility) == FlagPrivate;
+  }
+  bool isProtected() const {
+    return (getFlags() & FlagAccessibility) == FlagProtected;
+  }
+  bool isPublic() const {
+    return (getFlags() & FlagAccessibility) == FlagPublic;
+  }
   bool isForwardDecl() const { return (getFlags() & FlagFwdDecl) != 0; }
-  // isAppleBlock - Return true if this is the Apple Blocks extension.
   bool isAppleBlockExtension() const {
     return (getFlags() & FlagAppleBlock) != 0;
   }
@@ -336,27 +448,22 @@
     return (getFlags() & FlagRValueReference) != 0;
   }
   bool isValid() const { return DbgNode && isType(); }
-
-  /// replaceAllUsesWith - Replace all uses of debug info referenced by
-  /// this descriptor.
-  void replaceAllUsesWith(LLVMContext &VMContext, DIDescriptor D);
-  void replaceAllUsesWith(MDNode *D);
 };
 
-/// DIBasicType - A basic type, like 'int' or 'float'.
+/// \brief A basic type, like 'int' or 'float'.
 class DIBasicType : public DIType {
 public:
   explicit DIBasicType(const MDNode *N = nullptr) : DIType(N) {}
 
-  unsigned getEncoding() const { return getUnsignedField(9); }
+  unsigned getEncoding() const { return getHeaderFieldAs<unsigned>(7); }
 
-  /// Verify - Verify that a basic type descriptor is well formed.
   bool Verify() const;
 };
 
-/// DIDerivedType - A simple derived type, like a const qualified type,
-/// a typedef, a pointer or reference, et cetera.  Or, a data member of
-/// a class/struct/union.
+/// \brief A simple derived type.
+///
+/// Like a const qualified type, a typedef, a pointer or reference, et cetera.
+/// Or, a data member of a class/struct/union.
 class DIDerivedType : public DIType {
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
@@ -364,28 +471,29 @@
 public:
   explicit DIDerivedType(const MDNode *N = nullptr) : DIType(N) {}
 
-  DITypeRef getTypeDerivedFrom() const { return getFieldAs<DITypeRef>(9); }
+  DITypeRef getTypeDerivedFrom() const { return getFieldAs<DITypeRef>(3); }
 
-  /// getObjCProperty - Return property node, if this ivar is
-  /// associated with one.
+  /// \brief Return property node, if this ivar is associated with one.
   MDNode *getObjCProperty() const;
 
   DITypeRef getClassType() const {
     assert(getTag() == dwarf::DW_TAG_ptr_to_member_type);
-    return getFieldAs<DITypeRef>(10);
+    return getFieldAs<DITypeRef>(4);
   }
 
   Constant *getConstant() const {
     assert((getTag() == dwarf::DW_TAG_member) && isStaticMember());
-    return getConstantField(10);
+    return getConstantField(4);
   }
 
-  /// Verify - Verify that a derived type descriptor is well formed.
   bool Verify() const;
 };
 
-/// DICompositeType - This descriptor holds a type that can refer to multiple
-/// other types, like a function or struct.
+/// \brief Types that refer to multiple other types.
+///
+/// This descriptor holds a type that can refer to multiple other types, like a
+/// function or struct.
+///
 /// DICompositeType is derived from DIDerivedType because some
 /// composite types (such as enums) can be derived from basic types
 // FIXME: Make this derive from DIType directly & just store the
@@ -394,32 +502,57 @@
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
 
+  /// \brief Set the array of member DITypes.
+  void setArraysHelper(MDNode *Elements, MDNode *TParams);
+
 public:
   explicit DICompositeType(const MDNode *N = nullptr) : DIDerivedType(N) {}
 
-  DIArray getTypeArray() const { return getFieldAs<DIArray>(10); }
-  void setTypeArray(DIArray Elements, DIArray TParams = DIArray());
-  unsigned getRunTimeLang() const { return getUnsignedField(11); }
-  DITypeRef getContainingType() const { return getFieldAs<DITypeRef>(12); }
+  DIArray getElements() const {
+    assert(!isSubroutineType() && "no elements for DISubroutineType");
+    return getFieldAs<DIArray>(4);
+  }
+  template <typename T>
+  void setArrays(DITypedArray<T> Elements, DIArray TParams = DIArray()) {
+    assert((!TParams || DbgNode->getNumOperands() == 8) &&
+           "If you're setting the template parameters this should include a slot "
+           "for that!");
+    setArraysHelper(Elements, TParams);
+  }
+  unsigned getRunTimeLang() const {
+    return getHeaderFieldAs<unsigned>(7);
+  }
+  DITypeRef getContainingType() const { return getFieldAs<DITypeRef>(5); }
+
+  /// \brief Set the containing type.
   void setContainingType(DICompositeType ContainingType);
-  DIArray getTemplateParams() const { return getFieldAs<DIArray>(13); }
+  DIArray getTemplateParams() const { return getFieldAs<DIArray>(6); }
   MDString *getIdentifier() const;
 
-  /// Verify - Verify that a composite type descriptor is well formed.
   bool Verify() const;
 };
 
-/// DIFile - This is a wrapper for a file.
+class DISubroutineType : public DICompositeType {
+public:
+  explicit DISubroutineType(const MDNode *N = nullptr) : DICompositeType(N) {}
+  DITypedArray<DITypeRef> getTypeArray() const {
+    return getFieldAs<DITypedArray<DITypeRef>>(4);
+  }
+};
+
+/// \brief This is a wrapper for a file.
 class DIFile : public DIScope {
   friend class DIDescriptor;
 
 public:
   explicit DIFile(const MDNode *N = nullptr) : DIScope(N) {}
+
+  /// \brief Retrieve the MDNode for the directory/file pair.
   MDNode *getFileNode() const;
   bool Verify() const;
 };
 
-/// DICompileUnit - A wrapper for a compile unit.
+/// \brief A wrapper for a compile unit.
 class DICompileUnit : public DIScope {
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
@@ -428,13 +561,13 @@
   explicit DICompileUnit(const MDNode *N = nullptr) : DIScope(N) {}
 
   dwarf::SourceLanguage getLanguage() const {
-    return static_cast<dwarf::SourceLanguage>(getUnsignedField(2));
+    return static_cast<dwarf::SourceLanguage>(getHeaderFieldAs<unsigned>(1));
   }
-  StringRef getProducer() const { return getStringField(3); }
+  StringRef getProducer() const { return getHeaderField(2); }
 
-  bool isOptimized() const { return getUnsignedField(4) != 0; }
-  StringRef getFlags() const { return getStringField(5); }
-  unsigned getRunTimeVersion() const { return getUnsignedField(6); }
+  bool isOptimized() const { return getHeaderFieldAs<bool>(3) != 0; }
+  StringRef getFlags() const { return getHeaderField(4); }
+  unsigned getRunTimeVersion() const { return getHeaderFieldAs<unsigned>(5); }
 
   DIArray getEnumTypes() const;
   DIArray getRetainedTypes() const;
@@ -442,14 +575,16 @@
   DIArray getGlobalVariables() const;
   DIArray getImportedEntities() const;
 
-  StringRef getSplitDebugFilename() const { return getStringField(12); }
-  unsigned getEmissionKind() const { return getUnsignedField(13); }
+  void replaceSubprograms(DIArray Subprograms);
+  void replaceGlobalVariables(DIArray GlobalVariables);
 
-  /// Verify - Verify that a compile unit is well formed.
+  StringRef getSplitDebugFilename() const { return getHeaderField(6); }
+  unsigned getEmissionKind() const { return getHeaderFieldAs<unsigned>(7); }
+
   bool Verify() const;
 };
 
-/// DISubprogram - This is a wrapper for a subprogram (e.g. a function).
+/// \brief This is a wrapper for a subprogram (e.g. a function).
 class DISubprogram : public DIScope {
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
@@ -457,93 +592,95 @@
 public:
   explicit DISubprogram(const MDNode *N = nullptr) : DIScope(N) {}
 
+  StringRef getName() const { return getHeaderField(1); }
+  StringRef getDisplayName() const { return getHeaderField(2); }
+  StringRef getLinkageName() const { return getHeaderField(3); }
+  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(4); }
+
+  /// \brief Check if this is local (like 'static' in C).
+  unsigned isLocalToUnit() const { return getHeaderFieldAs<unsigned>(5); }
+  unsigned isDefinition() const { return getHeaderFieldAs<unsigned>(6); }
+
+  unsigned getVirtuality() const { return getHeaderFieldAs<unsigned>(7); }
+  unsigned getVirtualIndex() const { return getHeaderFieldAs<unsigned>(8); }
+
+  unsigned getFlags() const { return getHeaderFieldAs<unsigned>(9); }
+
+  unsigned isOptimized() const { return getHeaderFieldAs<bool>(10); }
+
+  /// \brief Get the beginning of the scope of the function (not the name).
+  unsigned getScopeLineNumber() const { return getHeaderFieldAs<unsigned>(11); }
+
   DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(2); }
-  StringRef getName() const { return getStringField(3); }
-  StringRef getDisplayName() const { return getStringField(4); }
-  StringRef getLinkageName() const { return getStringField(5); }
-  unsigned getLineNumber() const { return getUnsignedField(6); }
-  DICompositeType getType() const { return getFieldAs<DICompositeType>(7); }
+  DISubroutineType getType() const { return getFieldAs<DISubroutineType>(3); }
 
-  /// isLocalToUnit - Return true if this subprogram is local to the current
-  /// compile unit, like 'static' in C.
-  unsigned isLocalToUnit() const { return getUnsignedField(8); }
-  unsigned isDefinition() const { return getUnsignedField(9); }
+  DITypeRef getContainingType() const { return getFieldAs<DITypeRef>(4); }
 
-  unsigned getVirtuality() const { return getUnsignedField(10); }
-  unsigned getVirtualIndex() const { return getUnsignedField(11); }
-
-  DITypeRef getContainingType() const { return getFieldAs<DITypeRef>(12); }
-
-  unsigned getFlags() const { return getUnsignedField(13); }
-
-  unsigned isArtificial() const {
-    return (getUnsignedField(13) & FlagArtificial) != 0;
-  }
-  /// isPrivate - Return true if this subprogram has "private"
-  /// access specifier.
-  bool isPrivate() const { return (getUnsignedField(13) & FlagPrivate) != 0; }
-  /// isProtected - Return true if this subprogram has "protected"
-  /// access specifier.
-  bool isProtected() const {
-    return (getUnsignedField(13) & FlagProtected) != 0;
-  }
-  /// isExplicit - Return true if this subprogram is marked as explicit.
-  bool isExplicit() const { return (getUnsignedField(13) & FlagExplicit) != 0; }
-  /// isPrototyped - Return true if this subprogram is prototyped.
-  bool isPrototyped() const {
-    return (getUnsignedField(13) & FlagPrototyped) != 0;
-  }
-
-  /// Return true if this subprogram is a C++11 reference-qualified
-  /// non-static member function (void foo() &).
-  unsigned isLValueReference() const {
-    return (getUnsignedField(13) & FlagLValueReference) != 0;
-  }
-
-  /// Return true if this subprogram is a C++11
-  /// rvalue-reference-qualified non-static member function
-  /// (void foo() &&).
-  unsigned isRValueReference() const {
-    return (getUnsignedField(13) & FlagRValueReference) != 0;
-  }
-
-  unsigned isOptimized() const;
-
-  /// Verify - Verify that a subprogram descriptor is well formed.
   bool Verify() const;
 
-  /// describes - Return true if this subprogram provides debugging
-  /// information for the function F.
+  /// \brief Check if this provides debugging information for the function F.
   bool describes(const Function *F);
 
-  Function *getFunction() const { return getFunctionField(15); }
-  void replaceFunction(Function *F) { replaceFunctionField(15, F); }
-  DIArray getTemplateParams() const { return getFieldAs<DIArray>(16); }
+  Function *getFunction() const { return getFunctionField(5); }
+  void replaceFunction(Function *F) { replaceFunctionField(5, F); }
+  DIArray getTemplateParams() const { return getFieldAs<DIArray>(6); }
   DISubprogram getFunctionDeclaration() const {
-    return getFieldAs<DISubprogram>(17);
+    return getFieldAs<DISubprogram>(7);
   }
   MDNode *getVariablesNodes() const;
   DIArray getVariables() const;
 
-  /// getScopeLineNumber - Get the beginning of the scope of the
-  /// function, not necessarily where the name of the program
-  /// starts.
-  unsigned getScopeLineNumber() const { return getUnsignedField(19); }
+  unsigned isArtificial() const { return (getFlags() & FlagArtificial) != 0; }
+  /// \brief Check for the "private" access specifier.
+  bool isPrivate() const {
+    return (getFlags() & FlagAccessibility) == FlagPrivate;
+  }
+  /// \brief Check for the "protected" access specifier.
+  bool isProtected() const {
+    return (getFlags() & FlagAccessibility) == FlagProtected;
+  }
+  /// \brief Check for the "public" access specifier.
+  bool isPublic() const {
+    return (getFlags() & FlagAccessibility) == FlagPublic;
+  }
+  /// \brief Check for "explicit".
+  bool isExplicit() const { return (getFlags() & FlagExplicit) != 0; }
+  /// \brief Check if this is prototyped.
+  bool isPrototyped() const { return (getFlags() & FlagPrototyped) != 0; }
+
+  /// \brief Check if this is reference-qualified.
+  ///
+  /// Return true if this subprogram is a C++11 reference-qualified non-static
+  /// member function (void foo() &).
+  unsigned isLValueReference() const {
+    return (getFlags() & FlagLValueReference) != 0;
+  }
+
+  /// \brief Check if this is rvalue-reference-qualified.
+  ///
+  /// Return true if this subprogram is a C++11 rvalue-reference-qualified
+  /// non-static member function (void foo() &&).
+  unsigned isRValueReference() const {
+    return (getFlags() & FlagRValueReference) != 0;
+  }
+
 };
 
-/// DILexicalBlock - This is a wrapper for a lexical block.
+/// \brief This is a wrapper for a lexical block.
 class DILexicalBlock : public DIScope {
 public:
   explicit DILexicalBlock(const MDNode *N = nullptr) : DIScope(N) {}
   DIScope getContext() const { return getFieldAs<DIScope>(2); }
-  unsigned getLineNumber() const { return getUnsignedField(3); }
-  unsigned getColumnNumber() const { return getUnsignedField(4); }
-  unsigned getDiscriminator() const { return getUnsignedField(5); }
+  unsigned getLineNumber() const {
+    return getHeaderFieldAs<unsigned>(1);
+  }
+  unsigned getColumnNumber() const {
+    return getHeaderFieldAs<unsigned>(2);
+  }
   bool Verify() const;
 };
 
-/// DILexicalBlockFile - This is a wrapper for a lexical block with
-/// a filename change.
+/// \brief This is a wrapper for a lexical block with a filename change.
 class DILexicalBlockFile : public DIScope {
 public:
   explicit DILexicalBlockFile(const MDNode *N = nullptr) : DIScope(N) {}
@@ -555,68 +692,63 @@
   unsigned getLineNumber() const { return getScope().getLineNumber(); }
   unsigned getColumnNumber() const { return getScope().getColumnNumber(); }
   DILexicalBlock getScope() const { return getFieldAs<DILexicalBlock>(2); }
+  unsigned getDiscriminator() const { return getHeaderFieldAs<unsigned>(1); }
   bool Verify() const;
 };
 
-/// DINameSpace - A wrapper for a C++ style name space.
+/// \brief A wrapper for a C++ style name space.
 class DINameSpace : public DIScope {
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
 
 public:
   explicit DINameSpace(const MDNode *N = nullptr) : DIScope(N) {}
+  StringRef getName() const { return getHeaderField(1); }
+  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(2); }
   DIScope getContext() const { return getFieldAs<DIScope>(2); }
-  StringRef getName() const { return getStringField(3); }
-  unsigned getLineNumber() const { return getUnsignedField(4); }
   bool Verify() const;
 };
 
-/// DIUnspecifiedParameter - This is a wrapper for unspecified parameters.
-class DIUnspecifiedParameter : public DIDescriptor {
-public:
-  explicit DIUnspecifiedParameter(const MDNode *N = nullptr)
-    : DIDescriptor(N) {}
-  bool Verify() const;
-};
-
-/// DITemplateTypeParameter - This is a wrapper for template type parameter.
+/// \brief This is a wrapper for a template type parameter.
 class DITemplateTypeParameter : public DIDescriptor {
 public:
   explicit DITemplateTypeParameter(const MDNode *N = nullptr)
     : DIDescriptor(N) {}
 
+  StringRef getName() const { return getHeaderField(1); }
+  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(2); }
+  unsigned getColumnNumber() const { return getHeaderFieldAs<unsigned>(3); }
+
   DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(1); }
-  StringRef getName() const { return getStringField(2); }
-  DITypeRef getType() const { return getFieldAs<DITypeRef>(3); }
-  StringRef getFilename() const { return getFieldAs<DIFile>(4).getFilename(); }
+  DITypeRef getType() const { return getFieldAs<DITypeRef>(2); }
+  StringRef getFilename() const { return getFieldAs<DIFile>(3).getFilename(); }
   StringRef getDirectory() const {
-    return getFieldAs<DIFile>(4).getDirectory();
+    return getFieldAs<DIFile>(3).getDirectory();
   }
-  unsigned getLineNumber() const { return getUnsignedField(5); }
-  unsigned getColumnNumber() const { return getUnsignedField(6); }
   bool Verify() const;
 };
 
-/// DITemplateValueParameter - This is a wrapper for template value parameter.
+/// \brief This is a wrapper for a template value parameter.
 class DITemplateValueParameter : public DIDescriptor {
 public:
   explicit DITemplateValueParameter(const MDNode *N = nullptr)
     : DIDescriptor(N) {}
 
+  StringRef getName() const { return getHeaderField(1); }
+  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(2); }
+  unsigned getColumnNumber() const { return getHeaderFieldAs<unsigned>(3); }
+
   DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(1); }
-  StringRef getName() const { return getStringField(2); }
-  DITypeRef getType() const { return getFieldAs<DITypeRef>(3); }
+  DITypeRef getType() const { return getFieldAs<DITypeRef>(2); }
   Value *getValue() const;
-  StringRef getFilename() const { return getFieldAs<DIFile>(5).getFilename(); }
+  StringRef getFilename() const { return getFieldAs<DIFile>(4).getFilename(); }
   StringRef getDirectory() const {
-    return getFieldAs<DIFile>(5).getDirectory();
+    return getFieldAs<DIFile>(4).getDirectory();
   }
-  unsigned getLineNumber() const { return getUnsignedField(6); }
-  unsigned getColumnNumber() const { return getUnsignedField(7); }
   bool Verify() const;
 };
 
-/// DIGlobalVariable - This is a wrapper for a global variable.
+/// \brief This is a wrapper for a global variable.
 class DIGlobalVariable : public DIDescriptor {
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
@@ -624,32 +756,30 @@
 public:
   explicit DIGlobalVariable(const MDNode *N = nullptr) : DIDescriptor(N) {}
 
-  DIScope getContext() const { return getFieldAs<DIScope>(2); }
-  StringRef getName() const { return getStringField(3); }
-  StringRef getDisplayName() const { return getStringField(4); }
-  StringRef getLinkageName() const { return getStringField(5); }
-  StringRef getFilename() const { return getFieldAs<DIFile>(6).getFilename(); }
+  StringRef getName() const { return getHeaderField(1); }
+  StringRef getDisplayName() const { return getHeaderField(2); }
+  StringRef getLinkageName() const { return getHeaderField(3); }
+  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(4); }
+  unsigned isLocalToUnit() const { return getHeaderFieldAs<bool>(5); }
+  unsigned isDefinition() const { return getHeaderFieldAs<bool>(6); }
+
+  DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(1); }
+  StringRef getFilename() const { return getFieldAs<DIFile>(2).getFilename(); }
   StringRef getDirectory() const {
-    return getFieldAs<DIFile>(6).getDirectory();
+    return getFieldAs<DIFile>(2).getDirectory();
   }
+  DITypeRef getType() const { return getFieldAs<DITypeRef>(3); }
 
-  unsigned getLineNumber() const { return getUnsignedField(7); }
-  DITypeRef getType() const { return getFieldAs<DITypeRef>(8); }
-  unsigned isLocalToUnit() const { return getUnsignedField(9); }
-  unsigned isDefinition() const { return getUnsignedField(10); }
-
-  GlobalVariable *getGlobal() const { return getGlobalVariableField(11); }
-  Constant *getConstant() const { return getConstantField(11); }
+  GlobalVariable *getGlobal() const { return getGlobalVariableField(4); }
+  Constant *getConstant() const { return getConstantField(4); }
   DIDerivedType getStaticDataMemberDeclaration() const {
-    return getFieldAs<DIDerivedType>(12);
+    return getFieldAs<DIDerivedType>(5);
   }
 
-  /// Verify - Verify that a global variable descriptor is well formed.
   bool Verify() const;
 };
 
-/// DIVariable - This is a wrapper for a variable (e.g. parameter, local,
-/// global etc).
+/// \brief This is a wrapper for a variable (e.g. parameter, local, global etc).
 class DIVariable : public DIDescriptor {
   friend class DIDescriptor;
   void printInternal(raw_ostream &OS) const;
@@ -657,65 +787,83 @@
 public:
   explicit DIVariable(const MDNode *N = nullptr) : DIDescriptor(N) {}
 
-  DIScope getContext() const { return getFieldAs<DIScope>(1); }
-  StringRef getName() const { return getStringField(2); }
-  DIFile getFile() const { return getFieldAs<DIFile>(3); }
-  unsigned getLineNumber() const { return (getUnsignedField(4) << 8) >> 8; }
-  unsigned getArgNumber() const {
-    unsigned L = getUnsignedField(4);
-    return L >> 24;
+  StringRef getName() const { return getHeaderField(1); }
+  unsigned getLineNumber() const {
+    // FIXME: Line number and arg number shouldn't be merged together like this.
+    return (getHeaderFieldAs<unsigned>(2) << 8) >> 8;
   }
-  DITypeRef getType() const { return getFieldAs<DITypeRef>(5); }
+  unsigned getArgNumber() const { return getHeaderFieldAs<unsigned>(2) >> 24; }
 
-  /// isArtificial - Return true if this variable is marked as "artificial".
+  DIScope getContext() const { return getFieldAs<DIScope>(1); }
+  DIFile getFile() const { return getFieldAs<DIFile>(2); }
+  DITypeRef getType() const { return getFieldAs<DITypeRef>(3); }
+
+  /// \brief Return true if this variable is marked as "artificial".
   bool isArtificial() const {
-    return (getUnsignedField(6) & FlagArtificial) != 0;
+    return (getHeaderFieldAs<unsigned>(3) & FlagArtificial) != 0;
   }
 
   bool isObjectPointer() const {
-    return (getUnsignedField(6) & FlagObjectPointer) != 0;
+    return (getHeaderFieldAs<unsigned>(3) & FlagObjectPointer) != 0;
   }
 
   /// \brief Return true if this variable is represented as a pointer.
   bool isIndirect() const {
-    return (getUnsignedField(6) & FlagIndirectVariable) != 0;
+    return (getHeaderFieldAs<unsigned>(3) & FlagIndirectVariable) != 0;
   }
 
-  /// getInlinedAt - If this variable is inlined then return inline location.
+  /// \brief If this variable is inlined then return inline location.
   MDNode *getInlinedAt() const;
 
-  /// Verify - Verify that a variable descriptor is well formed.
   bool Verify() const;
 
-  /// HasComplexAddr - Return true if the variable has a complex address.
-  bool hasComplexAddress() const { return getNumAddrElements() > 0; }
-
-  /// \brief Return the size of this variable's complex address or
-  /// zero if there is none.
-  unsigned getNumAddrElements() const {
-    if (DbgNode->getNumOperands() < 9)
-      return 0;
-    return getDescriptorField(8)->getNumOperands();
-  }
-
-  /// \brief return the Idx'th complex address element.
-  uint64_t getAddrElement(unsigned Idx) const;
-
-  /// isBlockByrefVariable - Return true if the variable was declared as
-  /// a "__block" variable (Apple Blocks).
+  /// \brief Check if this is a "__block" variable (Apple Blocks).
   bool isBlockByrefVariable(const DITypeIdentifierMap &Map) const {
     return (getType().resolve(Map)).isBlockByrefStruct();
   }
 
-  /// isInlinedFnArgument - Return true if this variable provides debugging
-  /// information for an inlined function arguments.
+  /// \brief Check if this is an inlined function argument.
   bool isInlinedFnArgument(const Function *CurFn);
 
+  /// \brief Return the size reported by the variable's type.
+  unsigned getSizeInBits(const DITypeIdentifierMap &Map);
+
   void printExtendedName(raw_ostream &OS) const;
 };
 
-/// DILocation - This object holds location information. This object
-/// is not associated with any DWARF tag.
+/// \brief A complex location expression.
+class DIExpression : public DIDescriptor {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DIExpression(const MDNode *N = nullptr) : DIDescriptor(N) {}
+
+  bool Verify() const;
+
+  /// \brief Return the number of elements in the complex expression.
+  unsigned getNumElements() const {
+    if (!DbgNode)
+      return 0;
+    unsigned N = getNumHeaderFields();
+    assert(N > 0 && "missing tag");
+    return N - 1;
+  }
+
+  /// \brief Return the Idx'th element of the complex expression.
+  uint64_t getElement(unsigned Idx) const;
+
+  /// \brief Return whether this is a piece of an aggregate variable.
+  bool isVariablePiece() const;
+  /// \brief Return the offset of this piece in bytes.
+  uint64_t getPieceOffset() const;
+  /// \brief Return the size of this piece in bytes.
+  uint64_t getPieceSize() const;
+};
+
+/// \brief This object holds location information.
+///
+/// This object is not associated with any DWARF tag.
 class DILocation : public DIDescriptor {
 public:
   explicit DILocation(const MDNode *N) : DIDescriptor(N) {}
@@ -731,23 +879,28 @@
     return (getLineNumber() == Other.getLineNumber() &&
             getFilename() == Other.getFilename());
   }
-  /// getDiscriminator - DWARF discriminators are used to distinguish
-  /// identical file locations for instructions that are on different
-  /// basic blocks. If two instructions are inside the same lexical block
-  /// and are in different basic blocks, we create a new lexical block
-  /// with identical location as the original but with a different
-  /// discriminator value (lib/Transforms/Util/AddDiscriminators.cpp
-  /// for details).
+  /// \brief Get the DWARF discriminator.
+  ///
+  /// DWARF discriminators are used to distinguish identical file locations for
+  /// instructions that are on different basic blocks. If two instructions are
+  /// inside the same lexical block and are in different basic blocks, we
+  /// create a new lexical block with identical location as the original but
+  /// with a different discriminator value
+  /// (see lib/Transforms/Util/AddDiscriminators.cpp for details).
   unsigned getDiscriminator() const {
     // Since discriminators are associated with lexical blocks, make
     // sure this location is a lexical block before retrieving its
     // value.
-    return getScope().isLexicalBlock()
-               ? getFieldAs<DILexicalBlock>(2).getDiscriminator()
+    return getScope().isLexicalBlockFile()
+               ? getFieldAs<DILexicalBlockFile>(2).getDiscriminator()
                : 0;
   }
+
+  /// \brief Generate a new discriminator value for this location.
   unsigned computeNewDiscriminator(LLVMContext &Ctx);
-  DILocation copyWithNewScope(LLVMContext &Ctx, DILexicalBlock NewScope);
+
+  /// \brief Return a copy of this location with a different scope.
+  DILocation copyWithNewScope(LLVMContext &Ctx, DILexicalBlockFile NewScope);
 };
 
 class DIObjCProperty : public DIDescriptor {
@@ -757,36 +910,38 @@
 public:
   explicit DIObjCProperty(const MDNode *N) : DIDescriptor(N) {}
 
-  StringRef getObjCPropertyName() const { return getStringField(1); }
-  DIFile getFile() const { return getFieldAs<DIFile>(2); }
-  unsigned getLineNumber() const { return getUnsignedField(3); }
+  StringRef getObjCPropertyName() const { return getHeaderField(1); }
+  DIFile getFile() const { return getFieldAs<DIFile>(1); }
+  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(2); }
 
-  StringRef getObjCPropertyGetterName() const { return getStringField(4); }
-  StringRef getObjCPropertySetterName() const { return getStringField(5); }
+  StringRef getObjCPropertyGetterName() const { return getHeaderField(3); }
+  StringRef getObjCPropertySetterName() const { return getHeaderField(4); }
+  unsigned getAttributes() const { return getHeaderFieldAs<unsigned>(5); }
   bool isReadOnlyObjCProperty() const {
-    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_readonly) != 0;
+    return (getAttributes() & dwarf::DW_APPLE_PROPERTY_readonly) != 0;
   }
   bool isReadWriteObjCProperty() const {
-    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_readwrite) != 0;
+    return (getAttributes() & dwarf::DW_APPLE_PROPERTY_readwrite) != 0;
   }
   bool isAssignObjCProperty() const {
-    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_assign) != 0;
+    return (getAttributes() & dwarf::DW_APPLE_PROPERTY_assign) != 0;
   }
   bool isRetainObjCProperty() const {
-    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_retain) != 0;
+    return (getAttributes() & dwarf::DW_APPLE_PROPERTY_retain) != 0;
   }
   bool isCopyObjCProperty() const {
-    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_copy) != 0;
+    return (getAttributes() & dwarf::DW_APPLE_PROPERTY_copy) != 0;
   }
   bool isNonAtomicObjCProperty() const {
-    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_nonatomic) != 0;
+    return (getAttributes() & dwarf::DW_APPLE_PROPERTY_nonatomic) != 0;
   }
 
-  /// Objective-C doesn't have an ODR, so there is no benefit in storing
+  /// \brief Get the type.
+  ///
+  /// \note Objective-C doesn't have an ODR, so there is no benefit in storing
   /// the type as a DITypeRef here.
-  DIType getType() const { return getFieldAs<DIType>(7); }
+  DIType getType() const { return getFieldAs<DIType>(2); }
 
-  /// Verify - Verify that a derived type descriptor is well formed.
   bool Verify() const;
 };
 
@@ -799,47 +954,47 @@
   explicit DIImportedEntity(const MDNode *N) : DIDescriptor(N) {}
   DIScope getContext() const { return getFieldAs<DIScope>(1); }
   DIScopeRef getEntity() const { return getFieldAs<DIScopeRef>(2); }
-  unsigned getLineNumber() const { return getUnsignedField(3); }
-  StringRef getName() const { return getStringField(4); }
+  unsigned getLineNumber() const { return getHeaderFieldAs<unsigned>(1); }
+  StringRef getName() const { return getHeaderField(2); }
   bool Verify() const;
 };
 
-/// getDISubprogram - Find subprogram that is enclosing this scope.
+/// \brief Find subprogram that is enclosing this scope.
 DISubprogram getDISubprogram(const MDNode *Scope);
 
-/// getDICompositeType - Find underlying composite type.
+/// \brief Find debug info for a given function.
+/// \returns a valid DISubprogram, if found. Otherwise, it returns an empty
+/// DISubprogram.
+DISubprogram getDISubprogram(const Function *F);
+
+/// \brief Find underlying composite type.
 DICompositeType getDICompositeType(DIType T);
 
-/// getOrInsertFnSpecificMDNode - Return a NameMDNode that is suitable
-/// to hold function specific information.
-NamedMDNode *getOrInsertFnSpecificMDNode(Module &M, DISubprogram SP);
-
-/// getFnSpecificMDNode - Return a NameMDNode, if available, that is
-/// suitable to hold function specific information.
-NamedMDNode *getFnSpecificMDNode(const Module &M, DISubprogram SP);
-
-/// createInlinedVariable - Create a new inlined variable based on current
-/// variable.
+/// \brief Create a new inlined variable based on current variable.
+///
 /// @param DV            Current Variable.
 /// @param InlinedScope  Location at current variable is inlined.
 DIVariable createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
                                  LLVMContext &VMContext);
 
-/// cleanseInlinedVariable - Remove inlined scope from the variable.
+/// \brief Remove inlined scope from the variable.
 DIVariable cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext);
 
-/// Construct DITypeIdentifierMap by going through retained types of each CU.
+/// \brief Generate map by visiting all retained types.
 DITypeIdentifierMap generateDITypeIdentifierMap(const NamedMDNode *CU_Nodes);
 
-/// Strip debug info in the module if it exists.
+/// \brief Strip debug info in the module if it exists.
+///
 /// To do this, we remove all calls to the debugger intrinsics and any named
 /// metadata for debugging. We also remove debug locations for instructions.
 /// Return true if module is modified.
 bool StripDebugInfo(Module &M);
 
-/// Return Debug Info Metadata Version by checking module flags.
+/// \brief Return Debug Info Metadata Version by checking module flags.
 unsigned getDebugMetadataVersionFromModule(const Module &M);
 
+/// \brief Utility to find all debug info in a module.
+///
 /// DebugInfoFinder tries to list all debug info MDNodes used in a module. To
 /// list debug info MDNodes used by an instruction, DebugInfoFinder uses
 /// processDeclare, processValue and processLocation to handle DbgDeclareInst,
@@ -850,44 +1005,29 @@
 public:
   DebugInfoFinder() : TypeMapInitialized(false) {}
 
-  /// processModule - Process entire module and collect debug info
-  /// anchors.
+  /// \brief Process entire module and collect debug info anchors.
   void processModule(const Module &M);
 
-  /// processDeclare - Process DbgDeclareInst.
+  /// \brief Process DbgDeclareInst.
   void processDeclare(const Module &M, const DbgDeclareInst *DDI);
-  /// Process DbgValueInst.
+  /// \brief Process DbgValueInst.
   void processValue(const Module &M, const DbgValueInst *DVI);
-  /// processLocation - Process DILocation.
+  /// \brief Process DILocation.
   void processLocation(const Module &M, DILocation Loc);
 
-  /// Clear all lists.
+  /// \brief Clear all lists.
   void reset();
 
 private:
-  /// Initialize TypeIdentifierMap.
   void InitializeTypeMap(const Module &M);
 
-  /// processType - Process DIType.
   void processType(DIType DT);
-
-  /// processSubprogram - Process DISubprogram.
   void processSubprogram(DISubprogram SP);
-
   void processScope(DIScope Scope);
-
-  /// addCompileUnit - Add compile unit into CUs.
   bool addCompileUnit(DICompileUnit CU);
-
-  /// addGlobalVariable - Add global variable into GVs.
   bool addGlobalVariable(DIGlobalVariable DIG);
-
-  // addSubprogram - Add subprogram into SPs.
   bool addSubprogram(DISubprogram SP);
-
-  /// addType - Add type into Tys.
   bool addType(DIType DT);
-
   bool addScope(DIScope Scope);
 
 public:
@@ -924,14 +1064,15 @@
   unsigned scope_count() const { return Scopes.size(); }
 
 private:
-  SmallVector<DICompileUnit, 8> CUs;    // Compile Units
-  SmallVector<DISubprogram, 8> SPs;    // Subprograms
-  SmallVector<DIGlobalVariable, 8> GVs;    // Global Variables;
-  SmallVector<DIType, 8> TYs;    // Types
-  SmallVector<DIScope, 8> Scopes; // Scopes
+  SmallVector<DICompileUnit, 8> CUs;
+  SmallVector<DISubprogram, 8> SPs;
+  SmallVector<DIGlobalVariable, 8> GVs;
+  SmallVector<DIType, 8> TYs;
+  SmallVector<DIScope, 8> Scopes;
   SmallPtrSet<MDNode *, 64> NodesSeen;
   DITypeIdentifierMap TypeIdentifierMap;
-  /// Specify if TypeIdentifierMap is initialized.
+
+  /// \brief Specify if TypeIdentifierMap is initialized.
   bool TypeMapInitialized;
 };
 

diff --git a/include/llvm/IR/DerivedTypes.h b/include/llvm/IR/DerivedTypes.h
index ff15087..534d1e5 100644
--- a/include/llvm/IR/DerivedTypes.h
+++ b/include/llvm/IR/DerivedTypes.h

@@ -204,9 +204,6 @@
   /// 
   void *SymbolTableEntry;
 public:
-  ~StructType() {
-    delete [] ContainedTys; // Delete the body.
-  }
 
   /// StructType::create - This creates an identified struct.
   static StructType *create(LLVMContext &Context, StringRef Name);
@@ -221,7 +218,7 @@
                             StringRef Name,
                             bool isPacked = false);
   static StructType *create(LLVMContext &Context, ArrayRef<Type*> Elements);
-  static StructType *create(StringRef Name, Type *elt1, ...) END_WITH_NULL;
+  static StructType *create(StringRef Name, Type *elt1, ...) LLVM_END_WITH_NULL;
 
   /// StructType::get - This static method is the primary way to create a
   /// literal StructType.
@@ -236,7 +233,7 @@
   /// structure types by specifying the elements as arguments.  Note that this
   /// method always returns a non-packed struct, and requires at least one
   /// element type.
-  static StructType *get(Type *elt1, ...) END_WITH_NULL;
+  static StructType *get(Type *elt1, ...) LLVM_END_WITH_NULL;
 
   bool isPacked() const { return (getSubclassData() & SCDB_Packed) != 0; }
   
@@ -249,7 +246,7 @@
   bool isOpaque() const { return (getSubclassData() & SCDB_HasBody) == 0; }
 
   /// isSized - Return true if this is a sized type.
-  bool isSized(SmallPtrSet<const Type*, 4> *Visited = nullptr) const;
+  bool isSized(SmallPtrSetImpl<const Type*> *Visited = nullptr) const;
   
   /// hasName - Return true if this is a named struct that has a non-empty name.
   bool hasName() const { return SymbolTableEntry != nullptr; }
@@ -266,7 +263,7 @@
 
   /// setBody - Specify a body for an opaque identified type.
   void setBody(ArrayRef<Type*> Elements, bool isPacked = false);
-  void setBody(Type *elt1, ...) END_WITH_NULL;
+  void setBody(Type *elt1, ...) LLVM_END_WITH_NULL;
   
   /// isValidElementType - Return true if the specified type is valid as a
   /// element type.

diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h
index de38d07..b592f89 100644
--- a/include/llvm/IR/DiagnosticInfo.h
+++ b/include/llvm/IR/DiagnosticInfo.h

@@ -12,12 +12,13 @@
 // Diagnostics reporting is still done as part of the LLVMContext.
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_DIAGNOSTICINFO_H
-#define LLVM_SUPPORT_DIAGNOSTICINFO_H
+#ifndef LLVM_IR_DIAGNOSTICINFO_H
+#define LLVM_IR_DIAGNOSTICINFO_H
 
 #include "llvm-c/Core.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/Casting.h"
 
 namespace llvm {
@@ -46,11 +47,13 @@
 enum DiagnosticKind {
   DK_InlineAsm,
   DK_StackSize,
+  DK_Linker,
   DK_DebugMetadataVersion,
   DK_SampleProfile,
   DK_OptimizationRemark,
   DK_OptimizationRemarkMissed,
   DK_OptimizationRemarkAnalysis,
+  DK_OptimizationFailure,
   DK_FirstPluginKind
 };
 
@@ -239,7 +242,7 @@
 };
 
 /// Common features for diagnostics dealing with optimization remarks.
-class DiagnosticInfoOptimizationRemarkBase : public DiagnosticInfo {
+class DiagnosticInfoOptimizationBase : public DiagnosticInfo {
 public:
   /// \p PassName is the name of the pass emitting this diagnostic.
   /// \p Fn is the function where the diagnostic is being emitted. \p DLoc is
@@ -248,10 +251,11 @@
   /// location. \p Msg is the message to show. Note that this class does not
   /// copy this message, so this reference must be valid for the whole life time
   /// of the diagnostic.
-  DiagnosticInfoOptimizationRemarkBase(enum DiagnosticKind Kind,
-                                       const char *PassName, const Function &Fn,
-                                       const DebugLoc &DLoc, const Twine &Msg)
-      : DiagnosticInfo(Kind, DS_Remark), PassName(PassName), Fn(Fn), DLoc(DLoc),
+  DiagnosticInfoOptimizationBase(enum DiagnosticKind Kind,
+                                 enum DiagnosticSeverity Severity,
+                                 const char *PassName, const Function &Fn,
+                                 const DebugLoc &DLoc, const Twine &Msg)
+      : DiagnosticInfo(Kind, Severity), PassName(PassName), Fn(Fn), DLoc(DLoc),
         Msg(Msg) {}
 
   /// \see DiagnosticInfo::print.
@@ -302,8 +306,7 @@
 };
 
 /// Diagnostic information for applied optimization remarks.
-class DiagnosticInfoOptimizationRemark
-    : public DiagnosticInfoOptimizationRemarkBase {
+class DiagnosticInfoOptimizationRemark : public DiagnosticInfoOptimizationBase {
 public:
   /// \p PassName is the name of the pass emitting this diagnostic. If
   /// this name matches the regular expression given in -Rpass=, then the
@@ -315,20 +318,20 @@
   /// must be valid for the whole life time of the diagnostic.
   DiagnosticInfoOptimizationRemark(const char *PassName, const Function &Fn,
                                    const DebugLoc &DLoc, const Twine &Msg)
-      : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemark, PassName,
-                                             Fn, DLoc, Msg) {}
+      : DiagnosticInfoOptimizationBase(DK_OptimizationRemark, DS_Remark,
+                                       PassName, Fn, DLoc, Msg) {}
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationRemark;
   }
 
-  /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled.
-  virtual bool isEnabled() const override;
+  /// \see DiagnosticInfoOptimizationBase::isEnabled.
+  bool isEnabled() const override;
 };
 
 /// Diagnostic information for missed-optimization remarks.
 class DiagnosticInfoOptimizationRemarkMissed
-    : public DiagnosticInfoOptimizationRemarkBase {
+    : public DiagnosticInfoOptimizationBase {
 public:
   /// \p PassName is the name of the pass emitting this diagnostic. If
   /// this name matches the regular expression given in -Rpass-missed=, then the
@@ -341,20 +344,20 @@
   DiagnosticInfoOptimizationRemarkMissed(const char *PassName,
                                          const Function &Fn,
                                          const DebugLoc &DLoc, const Twine &Msg)
-      : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkMissed,
-                                             PassName, Fn, DLoc, Msg) {}
+      : DiagnosticInfoOptimizationBase(DK_OptimizationRemarkMissed, DS_Remark,
+                                       PassName, Fn, DLoc, Msg) {}
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationRemarkMissed;
   }
 
-  /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled.
-  virtual bool isEnabled() const override;
+  /// \see DiagnosticInfoOptimizationBase::isEnabled.
+  bool isEnabled() const override;
 };
 
 /// Diagnostic information for optimization analysis remarks.
 class DiagnosticInfoOptimizationRemarkAnalysis
-    : public DiagnosticInfoOptimizationRemarkBase {
+    : public DiagnosticInfoOptimizationBase {
 public:
   /// \p PassName is the name of the pass emitting this diagnostic. If
   /// this name matches the regular expression given in -Rpass-analysis=, then
@@ -368,15 +371,15 @@
                                            const Function &Fn,
                                            const DebugLoc &DLoc,
                                            const Twine &Msg)
-      : DiagnosticInfoOptimizationRemarkBase(DK_OptimizationRemarkAnalysis,
-                                             PassName, Fn, DLoc, Msg) {}
+      : DiagnosticInfoOptimizationBase(DK_OptimizationRemarkAnalysis, DS_Remark,
+                                       PassName, Fn, DLoc, Msg) {}
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationRemarkAnalysis;
   }
 
-  /// \see DiagnosticInfoOptimizationRemarkBase::isEnabled.
-  virtual bool isEnabled() const override;
+  /// \see DiagnosticInfoOptimizationBase::isEnabled.
+  bool isEnabled() const override;
 };
 
 // Create wrappers for C Binding types (see CBindingWrapping.h).
@@ -411,6 +414,41 @@
                                     const Function &Fn, const DebugLoc &DLoc,
                                     const Twine &Msg);
 
+/// Diagnostic information for optimization failures.
+class DiagnosticInfoOptimizationFailure
+    : public DiagnosticInfoOptimizationBase {
+public:
+  /// \p Fn is the function where the diagnostic is being emitted. \p DLoc is
+  /// the location information to use in the diagnostic. If line table
+  /// information is available, the diagnostic will include the source code
+  /// location. \p Msg is the message to show. Note that this class does not
+  /// copy this message, so this reference must be valid for the whole life time
+  /// of the diagnostic.
+  DiagnosticInfoOptimizationFailure(const Function &Fn, const DebugLoc &DLoc,
+                                    const Twine &Msg)
+      : DiagnosticInfoOptimizationBase(DK_OptimizationFailure, DS_Warning,
+                                       nullptr, Fn, DLoc, Msg) {}
+
+  static bool classof(const DiagnosticInfo *DI) {
+    return DI->getKind() == DK_OptimizationFailure;
+  }
+
+  /// \see DiagnosticInfoOptimizationBase::isEnabled.
+  bool isEnabled() const override;
+};
+
+/// Emit a warning when loop vectorization is specified but fails. \p Fn is the
+/// function triggering the warning, \p DLoc is the debug location where the
+/// diagnostic is generated. \p Msg is the message string to use.
+void emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn,
+                              const DebugLoc &DLoc, const Twine &Msg);
+
+/// Emit a warning when loop interleaving is specified but fails. \p Fn is the
+/// function triggering the warning, \p DLoc is the debug location where the
+/// diagnostic is generated. \p Msg is the message string to use.
+void emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn,
+                               const DebugLoc &DLoc, const Twine &Msg);
+
 } // End namespace llvm
 
 #endif

diff --git a/include/llvm/IR/DiagnosticPrinter.h b/include/llvm/IR/DiagnosticPrinter.h
index 411c781..db5779a 100644
--- a/include/llvm/IR/DiagnosticPrinter.h
+++ b/include/llvm/IR/DiagnosticPrinter.h

@@ -13,8 +13,8 @@
 // on their needs.
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_DIAGNOSTICPRINTER_H
-#define LLVM_SUPPORT_DIAGNOSTICPRINTER_H
+#ifndef LLVM_IR_DIAGNOSTICPRINTER_H
+#define LLVM_IR_DIAGNOSTICPRINTER_H
 
 #include <string>
 

diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index 22444bd..26d893b 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h

@@ -143,6 +143,9 @@
   /// arguments.
   bool isVarArg() const;
 
+  bool isMaterializable() const;
+  void setIsMaterializable(bool V);
+
   /// getIntrinsicID - This method returns the ID number of the specified
   /// function, or Intrinsic::not_intrinsic if the function is not an
   /// intrinsic, or if the pointer is null.  This value is always defined to be
@@ -233,6 +236,12 @@
     return AttributeSets.getParamAlignment(i);
   }
 
+  /// @brief Extract the number of dereferenceable bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableBytes(unsigned i) const {
+    return AttributeSets.getDereferenceableBytes(i);
+  }
+
   /// @brief Determine if the function does not access memory.
   bool doesNotAccessMemory() const {
     return AttributeSets.hasAttribute(AttributeSet::FunctionIndex,

diff --git a/include/llvm/IR/GVMaterializer.h b/include/llvm/IR/GVMaterializer.h
index a1216a1..a7d68ec 100644
--- a/include/llvm/IR/GVMaterializer.h
+++ b/include/llvm/IR/GVMaterializer.h

@@ -32,17 +32,13 @@
 public:
   virtual ~GVMaterializer();
 
-  /// True if GV can be materialized from whatever backing store this
-  /// GVMaterializer uses and has not been materialized yet.
-  virtual bool isMaterializable(const GlobalValue *GV) const = 0;
-
   /// True if GV has been materialized and can be dematerialized back to
   /// whatever backing store this GVMaterializer uses.
   virtual bool isDematerializable(const GlobalValue *GV) const = 0;
 
   /// Make sure the given GlobalValue is fully read.
   ///
-  virtual std::error_code Materialize(GlobalValue *GV) = 0;
+  virtual std::error_code materialize(GlobalValue *GV) = 0;
 
   /// If the given GlobalValue is read in, and if the GVMaterializer supports
   /// it, release the memory for the GV, and set it up to be materialized
@@ -54,8 +50,6 @@
   /// Make sure the entire Module has been completely read.
   ///
   virtual std::error_code MaterializeModule(Module *M) = 0;
-
-  virtual void releaseBuffer() = 0;
 };
 
 } // End llvm namespace

diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h
index 2e042f4..546fea2 100644
--- a/include/llvm/IR/GlobalObject.h
+++ b/include/llvm/IR/GlobalObject.h

@@ -1,4 +1,4 @@
-//===-- llvm/GlobalObject.h - Class to represent a global object *- C++ -*-===//
+//===-- llvm/GlobalObject.h - Class to represent global objects -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -35,12 +35,24 @@
 
   std::string Section;     // Section to emit this into, empty means default
   Comdat *ObjComdat;
+  static const unsigned AlignmentBits = 5;
+  static const unsigned GlobalObjectSubClassDataBits =
+      GlobalValueSubClassDataBits - AlignmentBits;
+
+private:
+  static const unsigned AlignmentMask = (1 << AlignmentBits) - 1;
+
 public:
   unsigned getAlignment() const {
-    return (1u << getGlobalValueSubClassData()) >> 1;
+    unsigned Data = getGlobalValueSubClassData();
+    unsigned AlignmentData = Data & AlignmentMask;
+    return (1u << AlignmentData) >> 1;
   }
   void setAlignment(unsigned Align);
 
+  unsigned getGlobalObjectSubClassData() const;
+  void setGlobalObjectSubClassData(unsigned Val);
+
   bool hasSection() const { return !StringRef(getSection()).empty(); }
   const char *getSection() const { return Section.c_str(); }
   void setSection(StringRef S);

diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index 68e410b..e7b5d58 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h

@@ -21,6 +21,8 @@
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DerivedTypes.h"
 
+#include <system_error>
+
 namespace llvm {
 
 class Comdat;
@@ -84,6 +86,7 @@
   // (19 + 3 + 2 + 1 + 2 + 5) == 32.
   unsigned SubClassData : 19;
 protected:
+  static const unsigned GlobalValueSubClassDataBits = 19;
   unsigned getGlobalValueSubClassData() const {
     return SubClassData;
   }
@@ -246,6 +249,7 @@
   bool hasLinkOnceLinkage() const {
     return isLinkOnceLinkage(Linkage);
   }
+  bool hasLinkOnceODRLinkage() const { return isLinkOnceODRLinkage(Linkage); }
   bool hasWeakLinkage() const {
     return isWeakLinkage(Linkage);
   }
@@ -309,7 +313,7 @@
   /// Make sure this GlobalValue is fully read. If the module is corrupt, this
   /// returns true and fills in the optional string with information about the
   /// problem.  If successful, this returns false.
-  bool Materialize(std::string *ErrInfo = nullptr);
+  std::error_code materialize();
 
   /// If this GlobalValue is read in, and if the GVMaterializer supports it,
   /// release the memory for the function, and set it up to be materialized
@@ -325,6 +329,13 @@
   /// the current translation unit.
   bool isDeclaration() const;
 
+  /// Returns true when this value has available_externally linkage or when
+  /// isDeclaration() reports true.
+  bool isDeclarationForLinker() const {
+    // Short-circuit: isDeclaration() is only consulted for other linkages.
+    return hasAvailableExternallyLinkage() || isDeclaration();
+  }
+
   /// This method unlinks 'this' from the containing module, but does not delete
   /// it.
   virtual void removeFromParent() = 0;

diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index 00d3684..088c7b4 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h

@@ -28,7 +28,7 @@
 #include "llvm/Support/CBindingWrapping.h"
 
 namespace llvm {
-  class MDNode;
+class MDNode;
 
 /// \brief This provides the default implementation of the IRBuilder
 /// 'InsertHelper' method that is called whenever an instruction is created by
@@ -364,43 +364,60 @@
   /// \brief Create and insert a memset to the specified pointer and the
   /// specified value.
   ///
-  /// If the pointer isn't an i8*, it will be converted.  If a TBAA tag is
-  /// specified, it will be added to the instruction.
+  /// If the pointer isn't an i8*, it will be converted. If a TBAA tag is
+  /// specified, it will be added to the instruction. Likewise with alias.scope
+  /// and noalias tags.
   CallInst *CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, unsigned Align,
-                         bool isVolatile = false, MDNode *TBAATag = nullptr) {
-    return CreateMemSet(Ptr, Val, getInt64(Size), Align, isVolatile, TBAATag);
+                         bool isVolatile = false, MDNode *TBAATag = nullptr,
+                         MDNode *ScopeTag = nullptr,
+                         MDNode *NoAliasTag = nullptr) {
+    return CreateMemSet(Ptr, Val, getInt64(Size), Align, isVolatile,
+                        TBAATag, ScopeTag, NoAliasTag);
   }
 
   CallInst *CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
-                         bool isVolatile = false, MDNode *TBAATag = nullptr);
+                         bool isVolatile = false, MDNode *TBAATag = nullptr,
+                         MDNode *ScopeTag = nullptr,
+                         MDNode *NoAliasTag = nullptr);
 
   /// \brief Create and insert a memcpy between the specified pointers.
   ///
   /// If the pointers aren't i8*, they will be converted.  If a TBAA tag is
-  /// specified, it will be added to the instruction.
+  /// specified, it will be added to the instruction. Likewise with alias.scope
+  /// and noalias tags.
   CallInst *CreateMemCpy(Value *Dst, Value *Src, uint64_t Size, unsigned Align,
                          bool isVolatile = false, MDNode *TBAATag = nullptr,
-                         MDNode *TBAAStructTag = nullptr) {
+                         MDNode *TBAAStructTag = nullptr,
+                         MDNode *ScopeTag = nullptr,
+                         MDNode *NoAliasTag = nullptr) {
     return CreateMemCpy(Dst, Src, getInt64(Size), Align, isVolatile, TBAATag,
-                        TBAAStructTag);
+                        TBAAStructTag, ScopeTag, NoAliasTag);
   }
 
   CallInst *CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
                          bool isVolatile = false, MDNode *TBAATag = nullptr,
-                         MDNode *TBAAStructTag = nullptr);
+                         MDNode *TBAAStructTag = nullptr,
+                         MDNode *ScopeTag = nullptr,
+                         MDNode *NoAliasTag = nullptr);
 
   /// \brief Create and insert a memmove between the specified
   /// pointers.
   ///
   /// If the pointers aren't i8*, they will be converted.  If a TBAA tag is
-  /// specified, it will be added to the instruction.
+  /// specified, it will be added to the instruction. Likewise with alias.scope
+  /// and noalias tags.
   CallInst *CreateMemMove(Value *Dst, Value *Src, uint64_t Size, unsigned Align,
-                          bool isVolatile = false, MDNode *TBAATag = nullptr) {
-    return CreateMemMove(Dst, Src, getInt64(Size), Align, isVolatile, TBAATag);
+                          bool isVolatile = false, MDNode *TBAATag = nullptr,
+                          MDNode *ScopeTag = nullptr,
+                          MDNode *NoAliasTag = nullptr) {
+    return CreateMemMove(Dst, Src, getInt64(Size), Align, isVolatile,
+                         TBAATag, ScopeTag, NoAliasTag);
   }
 
   CallInst *CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
-                          bool isVolatile = false, MDNode *TBAATag = nullptr);
+                          bool isVolatile = false, MDNode *TBAATag = nullptr,
+                          MDNode *ScopeTag = nullptr,
+                          MDNode *NoAliasTag = nullptr);
 
   /// \brief Create a lifetime.start intrinsic.
   ///
@@ -412,6 +429,10 @@
   /// If the pointer isn't i8* it will be converted.
   CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = nullptr);
 
+  /// \brief Create an assume intrinsic call that allows the optimizer to
+  /// assume that the provided condition will be true.
+  CallInst *CreateAssumption(Value *Cond);
+
 private:
   Value *getCastedInt8PtrValue(Value *Ptr);
 };
@@ -429,7 +450,7 @@
 /// The first template argument handles whether or not to preserve names in the
 /// final instruction output. This defaults to on.  The second template argument
 /// specifies a class to use for creating constants.  This defaults to creating
-/// minimally folded constants.  The fourth template argument allows clients to
+/// minimally folded constants.  The third template argument allows clients to
 /// specify custom insertion hooks that are called on every newly created
 /// insertion.
 template<bool preserveNames = true, typename T = ConstantFolder,
@@ -570,8 +591,7 @@
 
   InvokeInst *CreateInvoke(Value *Callee, BasicBlock *NormalDest,
                            BasicBlock *UnwindDest, const Twine &Name = "") {
-    return Insert(InvokeInst::Create(Callee, NormalDest, UnwindDest,
-                                     ArrayRef<Value *>()),
+    return Insert(InvokeInst::Create(Callee, NormalDest, UnwindDest, None),
                   Name);
   }
   InvokeInst *CreateInvoke(Value *Callee, BasicBlock *NormalDest,
@@ -1203,6 +1223,21 @@
       return Insert(Folder.CreatePointerCast(VC, DestTy), Name);
     return Insert(CastInst::CreatePointerCast(V, DestTy), Name);
   }
+
+  /// Cast V to DestTy. V is returned untouched when it already has type
+  /// DestTy; a Constant is routed through the folder, anything else gets a
+  /// CastInst, and either result is passed to Insert together with Name.
+  Value *CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy,
+                                             const Twine &Name = "") {
+    if (V->getType() == DestTy)
+      return V;
+    if (Constant *C = dyn_cast<Constant>(V))
+      return Insert(Folder.CreatePointerBitCastOrAddrSpaceCast(C, DestTy),
+                    Name);
+    return Insert(CastInst::CreatePointerBitCastOrAddrSpaceCast(V, DestTy),
+                  Name);
+  }
+
   Value *CreateIntCast(Value *V, Type *DestTy, bool isSigned,
                        const Twine &Name = "") {
     if (V->getType() == DestTy)
@@ -1493,6 +1528,44 @@
     }
     return V;
   }
+
+  /// \brief Emit an assumption (via CreateAssumption) that a pointer is
+  /// aligned.
+  ///
+  /// The pointer is converted to a pointer-width integer; a non-null
+  /// OffsetValue that is not a constant zero is subtracted from it first. The
+  /// bits selected by (Alignment - 1) are then compared against zero.
+  CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue,
+                                      unsigned Alignment,
+                                      Value *OffsetValue = nullptr) {
+    assert(isa<PointerType>(PtrValue->getType()) &&
+           "trying to create an alignment assumption on a non-pointer?");
+
+    unsigned AS = cast<PointerType>(PtrValue->getType())->getAddressSpace();
+    Type *IntPtrTy = getIntPtrTy(&DL, AS);
+    Value *Addr = CreatePtrToInt(PtrValue, IntPtrTy, "ptrint");
+
+    // An alignment of zero yields an all-zero mask.
+    Value *LowBits = ConstantInt::get(IntPtrTy, Alignment ? Alignment - 1 : 0);
+    // A constant-zero offset needs no subtraction at all.
+    ConstantInt *ConstOff =
+        OffsetValue ? dyn_cast<ConstantInt>(OffsetValue) : nullptr;
+    bool SkipOffset = !OffsetValue || (ConstOff && ConstOff->isZero());
+
+    if (!SkipOffset) {
+      // Bring the offset to pointer width (signed cast) before subtracting.
+      if (OffsetValue->getType() != IntPtrTy)
+        OffsetValue = CreateIntCast(OffsetValue, IntPtrTy, /*isSigned*/ true,
+                                    "offsetcast");
+      Addr = CreateSub(Addr, OffsetValue, "offsetptr");
+    }
+
+    Value *Masked = CreateAnd(Addr, LowBits, "maskedptr");
+    Value *IsAligned =
+        CreateICmpEQ(Masked, ConstantInt::get(IntPtrTy, 0), "maskcond");
+
+    return CreateAssumption(IsAligned);
+  }
 };
 
 // Create wrappers for C Binding types (see CBindingWrapping.h).

diff --git a/include/llvm/IR/IRPrintingPasses.h b/include/llvm/IR/IRPrintingPasses.h
index 2f78c83..afea0c3 100644
--- a/include/llvm/IR/IRPrintingPasses.h
+++ b/include/llvm/IR/IRPrintingPasses.h

@@ -16,8 +16,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_IR_IR_PRINTING_PASSES_H
-#define LLVM_IR_IR_PRINTING_PASSES_H
+#ifndef LLVM_IR_IRPRINTINGPASSES_H
+#define LLVM_IR_IRPRINTINGPASSES_H
 
 #include "llvm/ADT/StringRef.h"
 #include <string>

diff --git a/include/llvm/IR/InlineAsm.h b/include/llvm/IR/InlineAsm.h
index ac19089..b2d79d0 100644
--- a/include/llvm/IR/InlineAsm.h
+++ b/include/llvm/IR/InlineAsm.h

@@ -25,12 +25,9 @@
 class PointerType;
 class FunctionType;
 class Module;
+
 struct InlineAsmKeyType;
-template<class ValType, class ValRefType, class TypeClass, class ConstantClass,
-         bool HasLargeKey>
-class ConstantUniqueMap;
-template<class ConstantClass, class TypeClass, class ValType>
-struct ConstantCreator;
+template <class ConstantClass> class ConstantUniqueMap;
 
 class InlineAsm : public Value {
 public:
@@ -40,9 +37,8 @@
   };
 
 private:
-  friend struct ConstantCreator<InlineAsm, PointerType, InlineAsmKeyType>;
-  friend class ConstantUniqueMap<InlineAsmKeyType, const InlineAsmKeyType&,
-                                 PointerType, InlineAsm, false>;
+  friend struct InlineAsmKeyType;
+  friend class ConstantUniqueMap<InlineAsm>;
 
   InlineAsm(const InlineAsm &) LLVM_DELETED_FUNCTION;
   void operator=(const InlineAsm&) LLVM_DELETED_FUNCTION;

diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index a27859e..186fc88 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h

@@ -29,8 +29,8 @@
 //                            TerminatorInst Class
 //===----------------------------------------------------------------------===//
 
-/// TerminatorInst - Subclasses of this class are all able to terminate a basic
-/// block.  Thus, these are all the flow control type of operations.
+/// Subclasses of this class are all able to terminate a basic
+/// block. Thus, these are all the flow control type of operations.
 ///
 class TerminatorInst : public Instruction {
 protected:
@@ -51,23 +51,19 @@
   virtual BasicBlock *getSuccessorV(unsigned idx) const = 0;
   virtual unsigned getNumSuccessorsV() const = 0;
   virtual void setSuccessorV(unsigned idx, BasicBlock *B) = 0;
-  TerminatorInst *clone_impl() const override = 0;
 public:
 
-  /// getNumSuccessors - Return the number of successors that this terminator
-  /// has.
+  /// Return the number of successors that this terminator has.
   unsigned getNumSuccessors() const {
     return getNumSuccessorsV();
   }
 
-  /// getSuccessor - Return the specified successor.
-  ///
+  /// Return the specified successor.
   BasicBlock *getSuccessor(unsigned idx) const {
     return getSuccessorV(idx);
   }
 
-  /// setSuccessor - Update the specified successor to point at the provided
-  /// block.
+  /// Update the specified successor to point at the provided block.
   void setSuccessor(unsigned idx, BasicBlock *B) {
     setSuccessorV(idx, B);
   }
@@ -153,7 +149,7 @@
   /// Transparently provide more efficient getOperand methods.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 
-  /// Create() - Construct a binary instruction, given the opcode and the two
+  /// Construct a binary instruction, given the opcode and the two
   /// operands.  Optionally (if InstBefore is specified) insert the instruction
   /// into a BasicBlock right before the specified instruction.  The specified
   /// Instruction is allowed to be a dereferenced end iterator.
@@ -162,14 +158,14 @@
                                 const Twine &Name = Twine(),
                                 Instruction *InsertBefore = nullptr);
 
-  /// Create() - Construct a binary instruction, given the opcode and the two
+  /// Construct a binary instruction, given the opcode and the two
   /// operands.  Also automatically insert this instruction to the end of the
   /// BasicBlock specified.
   ///
   static BinaryOperator *Create(BinaryOps Op, Value *S1, Value *S2,
                                 const Twine &Name, BasicBlock *InsertAtEnd);
 
-  /// Create* - These methods just forward to Create, and are useful when you
+  /// These methods just forward to Create, and are useful when you
   /// statically know what type of instruction you're going to create.  These
   /// helpers just save some typing.
 #define HANDLE_BINARY_INST(N, OPC, CLASS) \
@@ -281,8 +277,7 @@
   /// Helper functions to construct and inspect unary operations (NEG and NOT)
   /// via binary operators SUB and XOR:
   ///
-  /// CreateNeg, CreateNot - Create the NEG and NOT
-  ///     instructions out of SUB and XOR instructions.
+  /// Create the NEG and NOT instructions out of SUB and XOR instructions.
   ///
   static BinaryOperator *CreateNeg(Value *Op, const Twine &Name = "",
                                    Instruction *InsertBefore = nullptr);
@@ -305,16 +300,14 @@
   static BinaryOperator *CreateNot(Value *Op, const Twine &Name,
                                    BasicBlock *InsertAtEnd);
 
-  /// isNeg, isFNeg, isNot - Check if the given Value is a
-  /// NEG, FNeg, or NOT instruction.
+  /// Check if the given Value is a NEG, FNeg, or NOT instruction.
   ///
   static bool isNeg(const Value *V);
   static bool isFNeg(const Value *V, bool IgnoreZeroSign=false);
   static bool isNot(const Value *V);
 
-  /// getNegArgument, getNotArgument - Helper functions to extract the
-  ///     unary argument of a NEG, FNEG or NOT operation implemented via
-  ///     Sub, FSub, or Xor.
+  /// Helper functions to extract the unary argument of a NEG, FNEG or NOT
+  /// operation implemented via Sub, FSub, or Xor.
   ///
   static const Value *getNegArgument(const Value *BinOp);
   static       Value *getNegArgument(      Value *BinOp);
@@ -327,37 +320,42 @@
     return static_cast<BinaryOps>(Instruction::getOpcode());
   }
 
-  /// swapOperands - Exchange the two operands to this instruction.
+  /// Exchange the two operands to this instruction.
   /// This instruction is safe to use on any binary instruction and
   /// does not modify the semantics of the instruction.  If the instruction
   /// cannot be reversed (ie, it's a Div), then return true.
   ///
   bool swapOperands();
 
-  /// setHasNoUnsignedWrap - Set or clear the nsw flag on this instruction,
-  /// which must be an operator which supports this flag. See LangRef.html
-  /// for the meaning of this flag.
+  /// Set or clear the nuw flag on this instruction, which must be an operator
+  /// which supports this flag. See LangRef.html for the meaning of this flag.
   void setHasNoUnsignedWrap(bool b = true);
 
-  /// setHasNoSignedWrap - Set or clear the nsw flag on this instruction,
-  /// which must be an operator which supports this flag. See LangRef.html
-  /// for the meaning of this flag.
+  /// Set or clear the nsw flag on this instruction, which must be an operator
+  /// which supports this flag. See LangRef.html for the meaning of this flag.
   void setHasNoSignedWrap(bool b = true);
 
-  /// setIsExact - Set or clear the exact flag on this instruction,
-  /// which must be an operator which supports this flag. See LangRef.html
-  /// for the meaning of this flag.
+  /// Set or clear the exact flag on this instruction, which must be an operator
+  /// which supports this flag. See LangRef.html for the meaning of this flag.
   void setIsExact(bool b = true);
 
-  /// hasNoUnsignedWrap - Determine whether the no unsigned wrap flag is set.
+  /// Determine whether the no unsigned wrap flag is set.
   bool hasNoUnsignedWrap() const;
 
-  /// hasNoSignedWrap - Determine whether the no signed wrap flag is set.
+  /// Determine whether the no signed wrap flag is set.
   bool hasNoSignedWrap() const;
 
-  /// isExact - Determine whether the exact flag is set.
+  /// Determine whether the exact flag is set.
   bool isExact() const;
 
+  /// Convenience method to copy supported wrapping, exact, and fast-math flags
+  /// from V to this instruction.
+  void copyIRFlags(const Value *V);
+  
+  /// Logical 'and' of any supported wrapping, exact, and fast-math flags of
+  /// V and this instruction.
+  void andIRFlags(const Value *V);
+
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static inline bool classof(const Instruction *I) {
     return I->isBinaryOp();
@@ -378,7 +376,7 @@
 //                               CastInst Class
 //===----------------------------------------------------------------------===//
 
-/// CastInst - This is the base class for all instructions that perform data
+/// This is the base class for all instructions that perform data
 /// casts. It is simply provided so that instruction category testing
 /// can be performed with code like:
 ///
@@ -459,7 +457,7 @@
     BasicBlock *InsertAtEnd  ///< The block to insert the instruction into
   );
 
-  /// @brief Create a BitCast or a PtrToInt cast instruction
+  /// @brief Create a BitCast, AddrSpaceCast, or a PtrToInt cast instruction.
   static CastInst *CreatePointerCast(
     Value *S,                ///< The pointer value to be casted (operand 0)
     Type *Ty,          ///< The type to which operand is casted
@@ -467,7 +465,7 @@
     BasicBlock *InsertAtEnd  ///< The block to insert the instruction into
   );
 
-  /// @brief Create a BitCast or a PtrToInt cast instruction
+  /// @brief Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
   static CastInst *CreatePointerCast(
     Value *S,                ///< The pointer value to be casted (operand 0)
     Type *Ty,          ///< The type to which cast should be made
@@ -475,6 +473,22 @@
     Instruction *InsertBefore = nullptr ///< Place to insert the instruction
   );
 
+  /// @brief Create a BitCast or an AddrSpaceCast cast instruction.
+  static CastInst *CreatePointerBitCastOrAddrSpaceCast(
+    Value *S,                ///< The pointer value to be casted (operand 0)
+    Type *Ty,          ///< The type to which operand is casted
+    const Twine &Name, ///< The name for the instruction
+    BasicBlock *InsertAtEnd  ///< The block to insert the instruction into
+  );
+
+  /// @brief Create a BitCast or an AddrSpaceCast cast instruction.
+  static CastInst *CreatePointerBitCastOrAddrSpaceCast(
+    Value *S,                ///< The pointer value to be casted (operand 0)
+    Type *Ty,          ///< The type to which cast should be made
+    const Twine &Name = "", ///< Name for the instruction
+    Instruction *InsertBefore = nullptr ///< Place to insert the instruction
+  );
+
   /// @brief Create a ZExt, BitCast, or Trunc for int -> int casts.
   static CastInst *CreateIntegerCast(
     Value *S,                ///< The pointer value to be casted (operand 0)

diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index bac6a95..ba7791c 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h

@@ -25,6 +25,7 @@
 class FastMathFlags;
 class LLVMContext;
 class MDNode;
+struct AAMDNodes;
 
 template<typename ValueSubClass, typename ItemParentClass>
   class SymbolTableListTraits;
@@ -155,19 +156,25 @@
   /// getAllMetadata - Get all metadata attached to this Instruction.  The first
   /// element of each pair returned is the KindID, the second element is the
   /// metadata value.  This list is returned sorted by the KindID.
-  void getAllMetadata(SmallVectorImpl<std::pair<unsigned, MDNode*> > &MDs)const{
+  void
+  getAllMetadata(SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs) const {
     if (hasMetadata())
       getAllMetadataImpl(MDs);
   }
 
   /// getAllMetadataOtherThanDebugLoc - This does the same thing as
   /// getAllMetadata, except that it filters out the debug location.
-  void getAllMetadataOtherThanDebugLoc(SmallVectorImpl<std::pair<unsigned,
-                                       MDNode*> > &MDs) const {
+  void getAllMetadataOtherThanDebugLoc(
+      SmallVectorImpl<std::pair<unsigned, MDNode *>> &MDs) const {
     if (hasMetadataOtherThanDebugLoc())
       getAllMetadataOtherThanDebugLocImpl(MDs);
   }
 
+  /// getAAMetadata - Fills the AAMDNodes structure with AA metadata from
+  /// this instruction. When Merge is true, the existing AA metadata is
+  /// merged with that from this instruction providing the most-general result.
+  void getAAMetadata(AAMDNodes &N, bool Merge = false) const;
+
   /// setMetadata - Set the metadata of the specified kind to the specified
   /// node.  This updates/replaces metadata if already present, or removes it if
   /// Node is null.
@@ -179,7 +186,7 @@
   /// convenience method for passes to do so.
   void dropUnknownMetadata(ArrayRef<unsigned> KnownIDs);
   void dropUnknownMetadata() {
-    return dropUnknownMetadata(ArrayRef<unsigned>());
+    return dropUnknownMetadata(None);
   }
   void dropUnknownMetadata(unsigned ID1) {
     return dropUnknownMetadata(makeArrayRef(ID1));
@@ -189,6 +196,10 @@
     return dropUnknownMetadata(IDs);
   }
 
+  /// setAAMetadata - Sets the metadata on this instruction from the
+  /// AAMDNodes structure.
+  void setAAMetadata(const AAMDNodes &N);
+
   /// setDebugLoc - Set the debug location information for this instruction.
   void setDebugLoc(const DebugLoc &Loc) { DbgLoc = Loc; }
 
@@ -220,11 +231,16 @@
   /// this flag.
   void setHasAllowReciprocal(bool B);
 
-  /// Convenience function for setting all the fast-math flags on this
+  /// Convenience function for setting multiple fast-math flags on this
   /// instruction, which must be an operator which supports these flags. See
-  /// LangRef.html for the meaning of these flats.
+  /// LangRef.html for the meaning of these flags.
   void setFastMathFlags(FastMathFlags FMF);
 
+  /// Convenience function for transferring all fast-math flag values to this
+  /// instruction, which must be an operator which supports these flags. See
+  /// LangRef.html for the meaning of these flags.
+  void copyFastMathFlags(FastMathFlags FMF);
+
   /// Determine whether the unsafe-algebra flag is set.
   bool hasUnsafeAlgebra() const;
 
@@ -242,7 +258,7 @@
 
   /// Convenience function for getting all the fast-math flags, which must be an
   /// operator which supports these flags. See LangRef.html for the meaning of
-  /// these flats.
+  /// these flags.
   FastMathFlags getFastMathFlags() const;
 
   /// Copy I's fast-math flags
@@ -258,9 +274,10 @@
   // These are all implemented in Metadata.cpp.
   MDNode *getMetadataImpl(unsigned KindID) const;
   MDNode *getMetadataImpl(StringRef Kind) const;
-  void getAllMetadataImpl(SmallVectorImpl<std::pair<unsigned,MDNode*> > &)const;
-  void getAllMetadataOtherThanDebugLocImpl(SmallVectorImpl<std::pair<unsigned,
-                                           MDNode*> > &) const;
+  void
+  getAllMetadataImpl(SmallVectorImpl<std::pair<unsigned, MDNode *>> &) const;
+  void getAllMetadataOtherThanDebugLocImpl(
+      SmallVectorImpl<std::pair<unsigned, MDNode *>> &) const;
   void clearMetadataHashEntries();
 public:
   //===--------------------------------------------------------------------===//
@@ -323,6 +340,11 @@
     return mayReadFromMemory() || mayWriteToMemory();
   }
 
+  /// isAtomic - Return true if this instruction has an
+  /// AtomicOrdering of unordered or higher.
+  ///
+  bool isAtomic() const;
+
   /// mayThrow - Return true if this instruction may throw an exception.
   ///
   bool mayThrow() const;

diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index a590f5a..dcf19e0 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h

@@ -50,6 +50,22 @@
   CrossThread = 1
 };
 
+/// Returns true if the ordering is at least as strong as acquire
+/// (i.e. acquire, acq_rel or seq_cst).
+inline bool isAtLeastAcquire(AtomicOrdering Ord) {
+  if (Ord == SequentiallyConsistent || Ord == AcquireRelease)
+    return true;
+  return Ord == Acquire;
+}
+
+/// Returns true if the ordering is at least as strong as release
+/// (i.e. release, acq_rel or seq_cst).
+inline bool isAtLeastRelease(AtomicOrdering Ord) {
+  if (Ord == SequentiallyConsistent || Ord == AcquireRelease)
+    return true;
+  return Ord == Release;
+}
+
 //===----------------------------------------------------------------------===//
 //                                AllocaInst Class
 //===----------------------------------------------------------------------===//
@@ -119,7 +135,7 @@
     return getSubclassDataFromInstruction() & 32;
   }
 
-  /// \brief Specify whether this alloca is used to represent a the arguments to
+  /// \brief Specify whether this alloca is used to represent the arguments to
   /// a call.
   void setUsedWithInAlloca(bool V) {
     setInstructionSubclassData((getSubclassDataFromInstruction() & ~32) |
@@ -225,7 +241,6 @@
                                (xthread << 6));
   }
 
-  bool isAtomic() const { return getOrdering() != NotAtomic; }
   void setAtomic(AtomicOrdering Ordering,
                  SynchronizationScope SynchScope = CrossThread) {
     setOrdering(Ordering);
@@ -345,7 +360,6 @@
                                (xthread << 6));
   }
 
-  bool isAtomic() const { return getOrdering() != NotAtomic; }
   void setAtomic(AtomicOrdering Ordering,
                  SynchronizationScope SynchScope = CrossThread) {
     setOrdering(Ordering);
@@ -637,7 +651,7 @@
     Sub,
     /// *p = old & v
     And,
-    /// *p = ~old & v
+    /// *p = ~(old & v)
     Nand,
     /// *p = old | v
     Or,
@@ -1376,6 +1390,12 @@
     return AttributeList.getParamAlignment(i);
   }
 
+  /// \brief Extract the number of dereferenceable bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableBytes(unsigned i) const {
+    return AttributeList.getDereferenceableBytes(i);
+  }
+
   /// \brief Return true if the call should not be treated as a call to a
   /// builtin.
   bool isNoBuiltin() const {
@@ -3051,6 +3071,12 @@
     return AttributeList.getParamAlignment(i);
   }
 
+  /// \brief Extract the number of dereferenceable bytes for a call or
+  /// parameter (0=unknown).
+  uint64_t getDereferenceableBytes(unsigned i) const {
+    return AttributeList.getDereferenceableBytes(i);
+  }
+
   /// \brief Return true if the call should not be treated as a call to a
   /// builtin.
   bool isNoBuiltin() const {

diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h
index e053f78..e3d7999 100644
--- a/include/llvm/IR/IntrinsicInst.h
+++ b/include/llvm/IR/IntrinsicInst.h

@@ -82,6 +82,7 @@
   public:
     Value *getAddress() const;
     MDNode *getVariable() const { return cast<MDNode>(getArgOperand(1)); }
+    MDNode *getExpression() const { return cast<MDNode>(getArgOperand(2)); }
 
     // Methods for support type inquiry through isa, cast, and dyn_cast:
     static inline bool classof(const IntrinsicInst *I) {
@@ -103,6 +104,7 @@
                           const_cast<Value*>(getArgOperand(1)))->getZExtValue();
     }
     MDNode *getVariable() const { return cast<MDNode>(getArgOperand(2)); }
+    MDNode *getExpression() const { return cast<MDNode>(getArgOperand(3)); }
 
     // Methods for support type inquiry through isa, cast, and dyn_cast:
     static inline bool classof(const IntrinsicInst *I) {

diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h
index b0d746b..acc0e9e 100644
--- a/include/llvm/IR/Intrinsics.h
+++ b/include/llvm/IR/Intrinsics.h

@@ -28,10 +28,9 @@
 class Module;
 class AttributeSet;
 
-/// Intrinsic Namespace - This namespace contains an enum with a value for
-/// every intrinsic/builtin function known by LLVM.  These enum values are
-/// returned by Function::getIntrinsicID().
-///
+/// This namespace contains an enum with a value for every intrinsic/builtin
+/// function known by LLVM. The enum values are returned by
+/// Function::getIntrinsicID().
 namespace Intrinsic {
   enum ID {
     not_intrinsic = 0,   // Must be zero
@@ -43,25 +42,21 @@
     , num_intrinsics
   };
   
-  /// Intrinsic::getName(ID) - Return the LLVM name for an intrinsic, such as
-  /// "llvm.ppc.altivec.lvx".
+  /// Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
   std::string getName(ID id, ArrayRef<Type*> Tys = None);
 
-  /// Intrinsic::getType(ID) - Return the function type for an intrinsic.
-  ///
+  /// Return the function type for an intrinsic.
   FunctionType *getType(LLVMContext &Context, ID id,
                         ArrayRef<Type*> Tys = None);
 
-  /// Intrinsic::isOverloaded(ID) - Returns true if the intrinsic can be
-  /// overloaded.
+  /// Returns true if the intrinsic can be overloaded.
   bool isOverloaded(ID id);
 
-  /// Intrinsic::getAttributes(ID) - Return the attributes for an intrinsic.
-  ///
+  /// Return the attributes for an intrinsic.
   AttributeSet getAttributes(LLVMContext &C, ID id);
 
-  /// Intrinsic::getDeclaration(M, ID) - Create or insert an LLVM Function
-  /// declaration for an intrinsic, and return it.
+  /// Create or insert an LLVM Function declaration for an intrinsic, and return
+  /// it.
   ///
   /// The Tys parameter is for intrinsics with overloaded types (e.g., those
   /// using iAny, fAny, vAny, or iPTRAny).  For a declaration of an overloaded
@@ -75,9 +70,8 @@
   /// Map a MS builtin name to an intrinsic ID.
   ID getIntrinsicForMSBuiltin(const char *Prefix, const char *BuiltinName);
   
-  /// IITDescriptor - This is a type descriptor which explains the type
-  /// requirements of an intrinsic.  This is returned by
-  /// getIntrinsicInfoTableEntries.
+  /// This is a type descriptor which explains the type requirements of an
+  /// intrinsic. This is returned by getIntrinsicInfoTableEntries.
   struct IITDescriptor {
     enum IITDescriptorKind {
       Void, VarArg, MMX, Metadata, Half, Float, Double,
@@ -117,9 +111,8 @@
     }
   };
   
-  /// getIntrinsicInfoTableEntries - Return the IIT table descriptor for the
-  /// specified intrinsic into an array of IITDescriptors.
-  /// 
+  /// Return the IIT table descriptor for the specified intrinsic into an array
+  /// of IITDescriptors.
   void getIntrinsicInfoTableEntries(ID id, SmallVectorImpl<IITDescriptor> &T);
   
 } // End Intrinsic namespace

diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index ae2a90c..98d48de 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td

@@ -277,6 +277,10 @@
 
 def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>;
 
+// The assume intrinsic is marked as arbitrarily writing so that proper
+// control dependencies will be maintained.
+def int_assume        : Intrinsic<[], [llvm_i1_ty], []>;
+
 // Stack Protector Intrinsic - The stackprotector intrinsic writes the stack
 // guard to the correct place on the stack frame.
 def int_stackprotector : Intrinsic<[], [llvm_ptr_ty, llvm_ptrptr_ty], []>;
@@ -324,6 +328,8 @@
   def int_exp  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_exp2 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_fabs : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_minnum : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
+  def int_maxnum : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
   def int_copysign : Intrinsic<[llvm_anyfloat_ty],
                                [LLVMMatchType<0>, LLVMMatchType<0>]>;
   def int_floor : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
@@ -369,9 +375,12 @@
 // places.
 let Properties = [IntrNoMem] in {
   def int_dbg_declare      : Intrinsic<[],
-                                       [llvm_metadata_ty, llvm_metadata_ty]>;
+                                       [llvm_metadata_ty,
+                                       llvm_metadata_ty,
+                                       llvm_metadata_ty]>;
   def int_dbg_value        : Intrinsic<[],
                                        [llvm_metadata_ty, llvm_i64_ty,
+                                        llvm_metadata_ty,
                                         llvm_metadata_ty]>;
 }
 
@@ -476,11 +485,13 @@
 def int_experimental_patchpoint_void : Intrinsic<[],
                                                  [llvm_i64_ty, llvm_i32_ty,
                                                   llvm_ptr_ty, llvm_i32_ty,
-                                                  llvm_vararg_ty]>;
+                                                  llvm_vararg_ty],
+                                                  [Throws]>;
 def int_experimental_patchpoint_i64 : Intrinsic<[llvm_i64_ty],
                                                 [llvm_i64_ty, llvm_i32_ty,
                                                  llvm_ptr_ty, llvm_i32_ty,
-                                                 llvm_vararg_ty]>;
+                                                 llvm_vararg_ty],
+                                                 [Throws]>;
 
 //===-------------------------- Other Intrinsics --------------------------===//
 //
@@ -496,10 +507,8 @@
 
 // Intrisics to support half precision floating point format
 let Properties = [IntrNoMem] in {
-def int_convert_to_fp16   : Intrinsic<[llvm_i16_ty], [llvm_float_ty]>,
-                            GCCBuiltin<"__gnu_f2h_ieee">;
-def int_convert_from_fp16 : Intrinsic<[llvm_float_ty], [llvm_i16_ty]>,
-                            GCCBuiltin<"__gnu_h2f_ieee">;
+def int_convert_to_fp16   : Intrinsic<[llvm_i16_ty], [llvm_anyfloat_ty]>;
+def int_convert_from_fp16 : Intrinsic<[llvm_anyfloat_ty], [llvm_i16_ty]>;
 }
 
 // These convert intrinsics are to support various conversions between

diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index e3c0fb3..7d69ed5 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td

@@ -33,11 +33,23 @@
                                 LLVMMatchType<0>], [IntrNoMem]>;
 
 //===----------------------------------------------------------------------===//
+// HINT
+
+def int_aarch64_hint : Intrinsic<[], [llvm_i32_ty]>;
+
+//===----------------------------------------------------------------------===//
 // RBIT
 
 def int_aarch64_rbit : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
                                  [IntrNoMem]>;
 
+//===----------------------------------------------------------------------===//
+// Data Barrier Instructions
+
+def int_aarch64_dmb : GCCBuiltin<"__builtin_arm_dmb">, Intrinsic<[], [llvm_i32_ty]>;
+def int_aarch64_dsb : GCCBuiltin<"__builtin_arm_dsb">, Intrinsic<[], [llvm_i32_ty]>;
+def int_aarch64_isb : GCCBuiltin<"__builtin_arm_isb">, Intrinsic<[], [llvm_i32_ty]>;
+
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td
index a02d707..ce758e2 100644
--- a/include/llvm/IR/IntrinsicsARM.td
+++ b/include/llvm/IR/IntrinsicsARM.td

@@ -20,8 +20,13 @@
 def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
             Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 
+// A space-consuming intrinsic primarily for testing ARMConstantIslands. The
+// first argument is the number of bytes this "instruction" takes up; the second
+// argument and return value are essentially chains used to force ISel ordering.
+def int_arm_space : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+
 //===----------------------------------------------------------------------===//
-// Saturating Arithmentic
+// Saturating Arithmetic
 
 def int_arm_qadd : GCCBuiltin<"__builtin_arm_qadd">,
     Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
@@ -132,6 +137,7 @@
 // HINT
 
 def int_arm_hint : Intrinsic<[], [llvm_i32_ty]>;
+def int_arm_dbg : Intrinsic<[], [llvm_i32_ty]>;
 
 //===----------------------------------------------------------------------===//
 // RBIT
@@ -340,10 +346,6 @@
 
 // Vector Count Leading Sign/Zero Bits.
 def int_arm_neon_vcls : Neon_1Arg_Intrinsic;
-def int_arm_neon_vclz : Neon_1Arg_Intrinsic;
-
-// Vector Count One Bits.
-def int_arm_neon_vcnt : Neon_1Arg_Intrinsic;
 
 // Vector Reciprocal Estimate.
 def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic;

diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td
index 6baf018..9deed41 100644
--- a/include/llvm/IR/IntrinsicsNVVM.td
+++ b/include/llvm/IR/IntrinsicsNVVM.td

@@ -797,24 +797,30 @@
 
 // Generated within nvvm. Use for ldu on sm_20 or later
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldu.global.i">;
 def int_nvvm_ldu_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldu.global.f">;
 def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldu.global.p">;
 
 // Generated within nvvm. Use for ldg on sm_35 or later
 def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldg.global.i">;
 def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldg.global.f">;
 def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty],
-  [LLVMAnyPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>],
+  [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty],
+  [IntrReadMem, NoCapture<0>],
   "llvm.nvvm.ldg.global.p">;
 
 // Use for generic pointers
@@ -1041,10 +1047,11 @@
 
 
 // Texture Fetch
-def int_nvvm_tex_1d_v4f32_i32
+// texmode_independent
+def int_nvvm_tex_1d_v4f32_s32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [],
-              "llvm.nvvm.tex.1d.v4f32.i32">;
+              "llvm.nvvm.tex.1d.v4f32.s32">;
 def int_nvvm_tex_1d_v4f32_f32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], [],
@@ -1058,28 +1065,45 @@
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty], [],
               "llvm.nvvm.tex.1d.grad.v4f32.f32">;
-def int_nvvm_tex_1d_v4i32_i32
+def int_nvvm_tex_1d_v4s32_s32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [],
-              "llvm.nvvm.tex.1d.v4i32.i32">;
-def int_nvvm_tex_1d_v4i32_f32
+              "llvm.nvvm.tex.1d.v4s32.s32">;
+def int_nvvm_tex_1d_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], [],
-              "llvm.nvvm.tex.1d.v4i32.f32">;
-def int_nvvm_tex_1d_level_v4i32_f32
+              "llvm.nvvm.tex.1d.v4s32.f32">;
+def int_nvvm_tex_1d_level_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
-              "llvm.nvvm.tex.1d.level.v4i32.f32.level">;
-def int_nvvm_tex_1d_grad_v4i32_f32
+              "llvm.nvvm.tex.1d.level.v4s32.f32">;
+def int_nvvm_tex_1d_grad_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty], [],
-              "llvm.nvvm.tex.1d.grad.v4i32.f32">;
+              "llvm.nvvm.tex.1d.grad.v4s32.f32">;
+def int_nvvm_tex_1d_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.1d.v4u32.s32">;
+def int_nvvm_tex_1d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.1d.v4u32.f32">;
+def int_nvvm_tex_1d_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.1d.level.v4u32.f32">;
+def int_nvvm_tex_1d_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.1d.grad.v4u32.f32">;
 
-def int_nvvm_tex_1d_array_v4f32_i32
+def int_nvvm_tex_1d_array_v4f32_s32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
-              "llvm.nvvm.tex.1d.array.v4f32.i32">;
+              "llvm.nvvm.tex.1d.array.v4f32.s32">;
 def int_nvvm_tex_1d_array_v4f32_f32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [],
@@ -1094,29 +1118,47 @@
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty], [],
               "llvm.nvvm.tex.1d.array.grad.v4f32.f32">;
-def int_nvvm_tex_1d_array_v4i32_i32
+def int_nvvm_tex_1d_array_v4s32_s32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
-              "llvm.nvvm.tex.1d.array.v4i32.i32">;
-def int_nvvm_tex_1d_array_v4i32_f32
+              "llvm.nvvm.tex.1d.array.v4s32.s32">;
+def int_nvvm_tex_1d_array_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [],
-              "llvm.nvvm.tex.1d.array.v4i32.f32">;
-def int_nvvm_tex_1d_array_level_v4i32_f32
+              "llvm.nvvm.tex.1d.array.v4s32.f32">;
+def int_nvvm_tex_1d_array_level_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
                llvm_float_ty], [],
-              "llvm.nvvm.tex.1d.array.level.v4i32.f32">;
-def int_nvvm_tex_1d_array_grad_v4i32_f32
+              "llvm.nvvm.tex.1d.array.level.v4s32.f32">;
+def int_nvvm_tex_1d_array_grad_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty], [],
-              "llvm.nvvm.tex.1d.array.grad.v4i32.f32">;
+              "llvm.nvvm.tex.1d.array.grad.v4s32.f32">;
+def int_nvvm_tex_1d_array_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.1d.array.v4u32.s32">;
+def int_nvvm_tex_1d_array_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.1d.array.v4u32.f32">;
+def int_nvvm_tex_1d_array_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.1d.array.level.v4u32.f32">;
+def int_nvvm_tex_1d_array_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.1d.array.grad.v4u32.f32">;
 
-def int_nvvm_tex_2d_v4f32_i32
+def int_nvvm_tex_2d_v4f32_s32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
-              "llvm.nvvm.tex.2d.v4f32.i32">;
+              "llvm.nvvm.tex.2d.v4f32.s32">;
 def int_nvvm_tex_2d_v4f32_f32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
@@ -1131,30 +1173,48 @@
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
               "llvm.nvvm.tex.2d.grad.v4f32.f32">;
-def int_nvvm_tex_2d_v4i32_i32
+def int_nvvm_tex_2d_v4s32_s32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
-              "llvm.nvvm.tex.2d.v4i32.i32">;
-def int_nvvm_tex_2d_v4i32_f32
+              "llvm.nvvm.tex.2d.v4s32.s32">;
+def int_nvvm_tex_2d_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
-              "llvm.nvvm.tex.2d.v4i32.f32">;
-def int_nvvm_tex_2d_level_v4i32_f32
+              "llvm.nvvm.tex.2d.v4s32.f32">;
+def int_nvvm_tex_2d_level_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty], [],
-              "llvm.nvvm.tex.2d.level.v4i32.f32">;
-def int_nvvm_tex_2d_grad_v4i32_f32
+              "llvm.nvvm.tex.2d.level.v4s32.f32">;
+def int_nvvm_tex_2d_grad_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
-              "llvm.nvvm.tex.2d.grad.v4i32.f32">;
+              "llvm.nvvm.tex.2d.grad.v4s32.f32">;
+def int_nvvm_tex_2d_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.2d.v4u32.s32">;
+def int_nvvm_tex_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.2d.v4u32.f32">;
+def int_nvvm_tex_2d_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.2d.level.v4u32.f32">;
+def int_nvvm_tex_2d_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.2d.grad.v4u32.f32">;
 
-def int_nvvm_tex_2d_array_v4f32_i32
+def int_nvvm_tex_2d_array_v4f32_s32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
                llvm_i32_ty], [],
-              "llvm.nvvm.tex.2d.array.v4f32.i32">;
+              "llvm.nvvm.tex.2d.array.v4f32.s32">;
 def int_nvvm_tex_2d_array_v4f32_f32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
@@ -1171,32 +1231,53 @@
                llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty], [],
               "llvm.nvvm.tex.2d.array.grad.v4f32.f32">;
-def int_nvvm_tex_2d_array_v4i32_i32
+def int_nvvm_tex_2d_array_v4s32_s32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
                llvm_i32_ty], [],
-              "llvm.nvvm.tex.2d.array.v4i32.i32">;
-def int_nvvm_tex_2d_array_v4i32_f32
+              "llvm.nvvm.tex.2d.array.v4s32.s32">;
+def int_nvvm_tex_2d_array_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
                llvm_float_ty], [],
-              "llvm.nvvm.tex.2d.array.v4i32.f32">;
-def int_nvvm_tex_2d_array_level_v4i32_f32
+              "llvm.nvvm.tex.2d.array.v4s32.f32">;
+def int_nvvm_tex_2d_array_level_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty], [],
-              "llvm.nvvm.tex.2d.array.level.v4i32.f32">;
-def int_nvvm_tex_2d_array_grad_v4i32_f32
+              "llvm.nvvm.tex.2d.array.level.v4s32.f32">;
+def int_nvvm_tex_2d_array_grad_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty], [],
-              "llvm.nvvm.tex.2d.array.grad.v4i32.f32">;
+              "llvm.nvvm.tex.2d.array.grad.v4s32.f32">;
+def int_nvvm_tex_2d_array_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+               llvm_i32_ty], [],
+              "llvm.nvvm.tex.2d.array.v4u32.s32">;
+def int_nvvm_tex_2d_array_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.2d.array.v4u32.f32">;
+def int_nvvm_tex_2d_array_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.2d.array.level.v4u32.f32">;
+def int_nvvm_tex_2d_array_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.2d.array.grad.v4u32.f32">;
 
-def int_nvvm_tex_3d_v4f32_i32
+def int_nvvm_tex_3d_v4f32_s32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-              [], "llvm.nvvm.tex.3d.v4f32.i32">;
+              [], "llvm.nvvm.tex.3d.v4f32.s32">;
 def int_nvvm_tex_3d_v4f32_f32
   : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
@@ -1213,28 +1294,787 @@
                llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
               "llvm.nvvm.tex.3d.grad.v4f32.f32">;
-def int_nvvm_tex_3d_v4i32_i32
+def int_nvvm_tex_3d_v4s32_s32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
-              [], "llvm.nvvm.tex.3d.v4i32.i32">;
-def int_nvvm_tex_3d_v4i32_f32
+              [], "llvm.nvvm.tex.3d.v4s32.s32">;
+def int_nvvm_tex_3d_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty], [],
-              "llvm.nvvm.tex.3d.v4i32.f32">;
-def int_nvvm_tex_3d_level_v4i32_f32
+              "llvm.nvvm.tex.3d.v4s32.f32">;
+def int_nvvm_tex_3d_level_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty], [],
-              "llvm.nvvm.tex.3d.level.v4i32.f32">;
-def int_nvvm_tex_3d_grad_v4i32_f32
+              "llvm.nvvm.tex.3d.level.v4s32.f32">;
+def int_nvvm_tex_3d_grad_v4s32_f32
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
                llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
-              "llvm.nvvm.tex.3d.grad.v4i32.f32">;
+              "llvm.nvvm.tex.3d.grad.v4s32.f32">;
+def int_nvvm_tex_3d_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [], "llvm.nvvm.tex.3d.v4u32.s32">;
+def int_nvvm_tex_3d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.3d.v4u32.f32">;
+def int_nvvm_tex_3d_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.3d.level.v4u32.f32">;
+def int_nvvm_tex_3d_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.3d.grad.v4u32.f32">;
 
-// Surface Load
+def int_nvvm_tex_cube_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.v4f32.f32">;
+def int_nvvm_tex_cube_level_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.level.v4f32.f32">;
+def int_nvvm_tex_cube_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.v4s32.f32">;
+def int_nvvm_tex_cube_level_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.level.v4s32.f32">;
+def int_nvvm_tex_cube_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.v4u32.f32">;
+def int_nvvm_tex_cube_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.level.v4u32.f32">;
+
+def int_nvvm_tex_cube_array_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.array.v4f32.f32">;
+def int_nvvm_tex_cube_array_level_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.array.level.v4f32.f32">;
+def int_nvvm_tex_cube_array_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.array.v4s32.f32">;
+def int_nvvm_tex_cube_array_level_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.array.level.v4s32.f32">;
+def int_nvvm_tex_cube_array_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.array.v4u32.f32">;
+def int_nvvm_tex_cube_array_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.cube.array.level.v4u32.f32">;
+
+def int_nvvm_tld4_r_2d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.r.2d.v4f32.f32">;
+def int_nvvm_tld4_g_2d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.g.2d.v4f32.f32">;
+def int_nvvm_tld4_b_2d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.b.2d.v4f32.f32">;
+def int_nvvm_tld4_a_2d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.a.2d.v4f32.f32">;
+def int_nvvm_tld4_r_2d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.r.2d.v4s32.f32">;
+def int_nvvm_tld4_g_2d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.g.2d.v4s32.f32">;
+def int_nvvm_tld4_b_2d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.b.2d.v4s32.f32">;
+def int_nvvm_tld4_a_2d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.a.2d.v4s32.f32">;
+def int_nvvm_tld4_r_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.r.2d.v4u32.f32">;
+def int_nvvm_tld4_g_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.g.2d.v4u32.f32">;
+def int_nvvm_tld4_b_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.b.2d.v4u32.f32">;
+def int_nvvm_tld4_a_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.a.2d.v4u32.f32">;
+
+
+// texmode_unified
+def int_nvvm_tex_unified_1d_v4f32_s32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.1d.v4f32.s32">;
+def int_nvvm_tex_unified_1d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.v4f32.f32">;
+def int_nvvm_tex_unified_1d_level_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.level.v4f32.f32">;
+def int_nvvm_tex_unified_1d_grad_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.grad.v4f32.f32">;
+def int_nvvm_tex_unified_1d_v4s32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.1d.v4s32.s32">;
+def int_nvvm_tex_unified_1d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.v4s32.f32">;
+def int_nvvm_tex_unified_1d_level_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.level.v4s32.f32">;
+def int_nvvm_tex_unified_1d_grad_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.grad.v4s32.f32">;
+def int_nvvm_tex_unified_1d_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.1d.v4u32.s32">;
+def int_nvvm_tex_unified_1d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.v4u32.f32">;
+def int_nvvm_tex_unified_1d_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.level.v4u32.f32">;
+def int_nvvm_tex_unified_1d_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.grad.v4u32.f32">;
+
+def int_nvvm_tex_unified_1d_array_v4f32_s32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.v4f32.s32">;
+def int_nvvm_tex_unified_1d_array_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.v4f32.f32">;
+def int_nvvm_tex_unified_1d_array_level_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.level.v4f32.f32">;
+def int_nvvm_tex_unified_1d_array_grad_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.grad.v4f32.f32">;
+def int_nvvm_tex_unified_1d_array_v4s32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.v4s32.s32">;
+def int_nvvm_tex_unified_1d_array_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.v4s32.f32">;
+def int_nvvm_tex_unified_1d_array_level_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.level.v4s32.f32">;
+def int_nvvm_tex_unified_1d_array_grad_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.grad.v4s32.f32">;
+def int_nvvm_tex_unified_1d_array_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.v4u32.s32">;
+def int_nvvm_tex_unified_1d_array_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.v4u32.f32">;
+def int_nvvm_tex_unified_1d_array_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.level.v4u32.f32">;
+def int_nvvm_tex_unified_1d_array_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.1d.array.grad.v4u32.f32">;
+
+def int_nvvm_tex_unified_2d_v4f32_s32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.2d.v4f32.s32">;
+def int_nvvm_tex_unified_2d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.v4f32.f32">;
+def int_nvvm_tex_unified_2d_level_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.level.v4f32.f32">;
+def int_nvvm_tex_unified_2d_grad_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.grad.v4f32.f32">;
+def int_nvvm_tex_unified_2d_v4s32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.2d.v4s32.s32">;
+def int_nvvm_tex_unified_2d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.v4s32.f32">;
+def int_nvvm_tex_unified_2d_level_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.level.v4s32.f32">;
+def int_nvvm_tex_unified_2d_grad_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.grad.v4s32.f32">;
+def int_nvvm_tex_unified_2d_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.2d.v4u32.s32">;
+def int_nvvm_tex_unified_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.v4u32.f32">;
+def int_nvvm_tex_unified_2d_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.level.v4u32.f32">;
+def int_nvvm_tex_unified_2d_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.grad.v4u32.f32">;
+
+def int_nvvm_tex_unified_2d_array_v4f32_s32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+               llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.v4f32.s32">;
+def int_nvvm_tex_unified_2d_array_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.v4f32.f32">;
+def int_nvvm_tex_unified_2d_array_level_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.level.v4f32.f32">;
+def int_nvvm_tex_unified_2d_array_grad_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.grad.v4f32.f32">;
+def int_nvvm_tex_unified_2d_array_v4s32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+               llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.v4s32.s32">;
+def int_nvvm_tex_unified_2d_array_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.v4s32.f32">;
+def int_nvvm_tex_unified_2d_array_level_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.level.v4s32.f32">;
+def int_nvvm_tex_unified_2d_array_grad_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.grad.v4s32.f32">;
+def int_nvvm_tex_unified_2d_array_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+               llvm_i32_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.v4u32.s32">;
+def int_nvvm_tex_unified_2d_array_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.v4u32.f32">;
+def int_nvvm_tex_unified_2d_array_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.level.v4u32.f32">;
+def int_nvvm_tex_unified_2d_array_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.2d.array.grad.v4u32.f32">;
+
+def int_nvvm_tex_unified_3d_v4f32_s32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [], "llvm.nvvm.tex.unified.3d.v4f32.s32">;
+def int_nvvm_tex_unified_3d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.3d.v4f32.f32">;
+def int_nvvm_tex_unified_3d_level_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.3d.level.v4f32.f32">;
+def int_nvvm_tex_unified_3d_grad_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.3d.grad.v4f32.f32">;
+def int_nvvm_tex_unified_3d_v4s32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [], "llvm.nvvm.tex.unified.3d.v4s32.s32">;
+def int_nvvm_tex_unified_3d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.3d.v4s32.f32">;
+def int_nvvm_tex_unified_3d_level_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.3d.level.v4s32.f32">;
+def int_nvvm_tex_unified_3d_grad_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.3d.grad.v4s32.f32">;
+def int_nvvm_tex_unified_3d_v4u32_s32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [], "llvm.nvvm.tex.unified.3d.v4u32.s32">;
+def int_nvvm_tex_unified_3d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.3d.v4u32.f32">;
+def int_nvvm_tex_unified_3d_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.3d.level.v4u32.f32">;
+def int_nvvm_tex_unified_3d_grad_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.3d.grad.v4u32.f32">;
+
+def int_nvvm_tex_unified_cube_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.v4f32.f32">;
+def int_nvvm_tex_unified_cube_level_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.level.v4f32.f32">;
+def int_nvvm_tex_unified_cube_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.v4s32.f32">;
+def int_nvvm_tex_unified_cube_level_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.level.v4s32.f32">;
+def int_nvvm_tex_unified_cube_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.v4u32.f32">;
+def int_nvvm_tex_unified_cube_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.level.v4u32.f32">;
+
+def int_nvvm_tex_unified_cube_array_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.array.v4f32.f32">;
+def int_nvvm_tex_unified_cube_array_level_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.array.level.v4f32.f32">;
+def int_nvvm_tex_unified_cube_array_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.array.v4s32.f32">;
+def int_nvvm_tex_unified_cube_array_level_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.array.level.v4s32.f32">;
+def int_nvvm_tex_unified_cube_array_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.array.v4u32.f32">;
+def int_nvvm_tex_unified_cube_array_level_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty,
+               llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tex.unified.cube.array.level.v4u32.f32">;
+
+def int_nvvm_tld4_unified_r_2d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.r.2d.v4f32.f32">;
+def int_nvvm_tld4_unified_g_2d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.g.2d.v4f32.f32">;
+def int_nvvm_tld4_unified_b_2d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.b.2d.v4f32.f32">;
+def int_nvvm_tld4_unified_a_2d_v4f32_f32
+  : Intrinsic<[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.a.2d.v4f32.f32">;
+def int_nvvm_tld4_unified_r_2d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.r.2d.v4s32.f32">;
+def int_nvvm_tld4_unified_g_2d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.g.2d.v4s32.f32">;
+def int_nvvm_tld4_unified_b_2d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.b.2d.v4s32.f32">;
+def int_nvvm_tld4_unified_a_2d_v4s32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.a.2d.v4s32.f32">;
+def int_nvvm_tld4_unified_r_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.r.2d.v4u32.f32">;
+def int_nvvm_tld4_unified_g_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.g.2d.v4u32.f32">;
+def int_nvvm_tld4_unified_b_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.b.2d.v4u32.f32">;
+def int_nvvm_tld4_unified_a_2d_v4u32_f32
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_float_ty, llvm_float_ty], [],
+              "llvm.nvvm.tld4.unified.a.2d.v4u32.f32">;
+
+
+//=== Surface Load
+// .clamp variants
+def int_nvvm_suld_1d_i8_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.i8.clamp">;
+def int_nvvm_suld_1d_i16_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.i16.clamp">;
+def int_nvvm_suld_1d_i32_clamp
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.i32.clamp">;
+def int_nvvm_suld_1d_i64_clamp
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.i64.clamp">;
+def int_nvvm_suld_1d_v2i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v2i8.clamp">;
+def int_nvvm_suld_1d_v2i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v2i16.clamp">;
+def int_nvvm_suld_1d_v2i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v2i32.clamp">;
+def int_nvvm_suld_1d_v2i64_clamp
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v2i64.clamp">;
+def int_nvvm_suld_1d_v4i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v4i8.clamp">;
+def int_nvvm_suld_1d_v4i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v4i16.clamp">;
+def int_nvvm_suld_1d_v4i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v4i32.clamp">;
+
+def int_nvvm_suld_1d_array_i8_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.i8.clamp">;
+def int_nvvm_suld_1d_array_i16_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.i16.clamp">;
+def int_nvvm_suld_1d_array_i32_clamp
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.i32.clamp">;
+def int_nvvm_suld_1d_array_i64_clamp
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.i64.clamp">;
+def int_nvvm_suld_1d_array_v2i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v2i8.clamp">;
+def int_nvvm_suld_1d_array_v2i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v2i16.clamp">;
+def int_nvvm_suld_1d_array_v2i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v2i32.clamp">;
+def int_nvvm_suld_1d_array_v2i64_clamp
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v2i64.clamp">;
+def int_nvvm_suld_1d_array_v4i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v4i8.clamp">;
+def int_nvvm_suld_1d_array_v4i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v4i16.clamp">;
+def int_nvvm_suld_1d_array_v4i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v4i32.clamp">;
+
+def int_nvvm_suld_2d_i8_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.i8.clamp">;
+def int_nvvm_suld_2d_i16_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.i16.clamp">;
+def int_nvvm_suld_2d_i32_clamp
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.i32.clamp">;
+def int_nvvm_suld_2d_i64_clamp
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.i64.clamp">;
+def int_nvvm_suld_2d_v2i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v2i8.clamp">;
+def int_nvvm_suld_2d_v2i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v2i16.clamp">;
+def int_nvvm_suld_2d_v2i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v2i32.clamp">;
+def int_nvvm_suld_2d_v2i64_clamp
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v2i64.clamp">;
+def int_nvvm_suld_2d_v4i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v4i8.clamp">;
+def int_nvvm_suld_2d_v4i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v4i16.clamp">;
+def int_nvvm_suld_2d_v4i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v4i32.clamp">;
+
+def int_nvvm_suld_2d_array_i8_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.i8.clamp">;
+def int_nvvm_suld_2d_array_i16_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.i16.clamp">;
+def int_nvvm_suld_2d_array_i32_clamp
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.i32.clamp">;
+def int_nvvm_suld_2d_array_i64_clamp
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.i64.clamp">;
+def int_nvvm_suld_2d_array_v2i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v2i8.clamp">;
+def int_nvvm_suld_2d_array_v2i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v2i16.clamp">;
+def int_nvvm_suld_2d_array_v2i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v2i32.clamp">;
+def int_nvvm_suld_2d_array_v2i64_clamp
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v2i64.clamp">;
+def int_nvvm_suld_2d_array_v4i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v4i8.clamp">;
+def int_nvvm_suld_2d_array_v4i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v4i16.clamp">;
+def int_nvvm_suld_2d_array_v4i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v4i32.clamp">;
+
+def int_nvvm_suld_3d_i8_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.i8.clamp">;
+def int_nvvm_suld_3d_i16_clamp
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.i16.clamp">;
+def int_nvvm_suld_3d_i32_clamp
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.i32.clamp">;
+def int_nvvm_suld_3d_i64_clamp
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.i64.clamp">;
+def int_nvvm_suld_3d_v2i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v2i8.clamp">;
+def int_nvvm_suld_3d_v2i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v2i16.clamp">;
+def int_nvvm_suld_3d_v2i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v2i32.clamp">;
+def int_nvvm_suld_3d_v2i64_clamp
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v2i64.clamp">;
+def int_nvvm_suld_3d_v4i8_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v4i8.clamp">;
+def int_nvvm_suld_3d_v4i16_clamp
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v4i16.clamp">;
+def int_nvvm_suld_3d_v4i32_clamp
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v4i32.clamp">;
+
+// .trap variants
 def int_nvvm_suld_1d_i8_trap
   : Intrinsic<[llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty], [],
@@ -1247,6 +2087,10 @@
   : Intrinsic<[llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.1d.i32.trap">;
+def int_nvvm_suld_1d_i64_trap
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.i64.trap">;
 def int_nvvm_suld_1d_v2i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty], [],
@@ -1259,6 +2103,10 @@
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.1d.v2i32.trap">;
+def int_nvvm_suld_1d_v2i64_trap
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v2i64.trap">;
 def int_nvvm_suld_1d_v4i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty], [],
@@ -1284,6 +2132,10 @@
   : Intrinsic<[llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.1d.array.i32.trap">;
+def int_nvvm_suld_1d_array_i64_trap
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.i64.trap">;
 def int_nvvm_suld_1d_array_v2i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
@@ -1296,6 +2148,10 @@
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.1d.array.v2i32.trap">;
+def int_nvvm_suld_1d_array_v2i64_trap
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v2i64.trap">;
 def int_nvvm_suld_1d_array_v4i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
@@ -1321,6 +2177,10 @@
   : Intrinsic<[llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.2d.i32.trap">;
+def int_nvvm_suld_2d_i64_trap
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.i64.trap">;
 def int_nvvm_suld_2d_v2i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
@@ -1333,6 +2193,10 @@
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.2d.v2i32.trap">;
+def int_nvvm_suld_2d_v2i64_trap
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v2i64.trap">;
 def int_nvvm_suld_2d_v4i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
@@ -1358,6 +2222,10 @@
   : Intrinsic<[llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.2d.array.i32.trap">;
+def int_nvvm_suld_2d_array_i64_trap
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.i64.trap">;
 def int_nvvm_suld_2d_array_v2i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
@@ -1370,6 +2238,10 @@
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.2d.array.v2i32.trap">;
+def int_nvvm_suld_2d_array_v2i64_trap
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v2i64.trap">;
 def int_nvvm_suld_2d_array_v4i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
@@ -1395,6 +2267,10 @@
   : Intrinsic<[llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.3d.i32.trap">;
+def int_nvvm_suld_3d_i64_trap
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.i64.trap">;
 def int_nvvm_suld_3d_v2i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
@@ -1407,6 +2283,10 @@
   : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.3d.v2i32.trap">;
+def int_nvvm_suld_3d_v2i64_trap
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v2i64.trap">;
 def int_nvvm_suld_3d_v4i8_trap
   : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
@@ -1420,6 +2300,232 @@
               [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.suld.3d.v4i32.trap">;
 
+// .zero variants
+def int_nvvm_suld_1d_i8_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.i8.zero">;
+def int_nvvm_suld_1d_i16_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.i16.zero">;
+def int_nvvm_suld_1d_i32_zero
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.i32.zero">;
+def int_nvvm_suld_1d_i64_zero
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.i64.zero">;
+def int_nvvm_suld_1d_v2i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v2i8.zero">;
+def int_nvvm_suld_1d_v2i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v2i16.zero">;
+def int_nvvm_suld_1d_v2i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v2i32.zero">;
+def int_nvvm_suld_1d_v2i64_zero
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v2i64.zero">;
+def int_nvvm_suld_1d_v4i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v4i8.zero">;
+def int_nvvm_suld_1d_v4i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v4i16.zero">;
+def int_nvvm_suld_1d_v4i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.v4i32.zero">;
+
+def int_nvvm_suld_1d_array_i8_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.i8.zero">;
+def int_nvvm_suld_1d_array_i16_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.i16.zero">;
+def int_nvvm_suld_1d_array_i32_zero
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.i32.zero">;
+def int_nvvm_suld_1d_array_i64_zero
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.i64.zero">;
+def int_nvvm_suld_1d_array_v2i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v2i8.zero">;
+def int_nvvm_suld_1d_array_v2i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v2i16.zero">;
+def int_nvvm_suld_1d_array_v2i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v2i32.zero">;
+def int_nvvm_suld_1d_array_v2i64_zero
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v2i64.zero">;
+def int_nvvm_suld_1d_array_v4i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v4i8.zero">;
+def int_nvvm_suld_1d_array_v4i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v4i16.zero">;
+def int_nvvm_suld_1d_array_v4i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.1d.array.v4i32.zero">;
+
+def int_nvvm_suld_2d_i8_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.i8.zero">;
+def int_nvvm_suld_2d_i16_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.i16.zero">;
+def int_nvvm_suld_2d_i32_zero
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.i32.zero">;
+def int_nvvm_suld_2d_i64_zero
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.i64.zero">;
+def int_nvvm_suld_2d_v2i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v2i8.zero">;
+def int_nvvm_suld_2d_v2i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v2i16.zero">;
+def int_nvvm_suld_2d_v2i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v2i32.zero">;
+def int_nvvm_suld_2d_v2i64_zero
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v2i64.zero">;
+def int_nvvm_suld_2d_v4i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v4i8.zero">;
+def int_nvvm_suld_2d_v4i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v4i16.zero">;
+def int_nvvm_suld_2d_v4i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.v4i32.zero">;
+
+def int_nvvm_suld_2d_array_i8_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.i8.zero">;
+def int_nvvm_suld_2d_array_i16_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.i16.zero">;
+def int_nvvm_suld_2d_array_i32_zero
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.i32.zero">;
+def int_nvvm_suld_2d_array_i64_zero
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.i64.zero">;
+def int_nvvm_suld_2d_array_v2i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v2i8.zero">;
+def int_nvvm_suld_2d_array_v2i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v2i16.zero">;
+def int_nvvm_suld_2d_array_v2i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v2i32.zero">;
+def int_nvvm_suld_2d_array_v2i64_zero
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v2i64.zero">;
+def int_nvvm_suld_2d_array_v4i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v4i8.zero">;
+def int_nvvm_suld_2d_array_v4i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v4i16.zero">;
+def int_nvvm_suld_2d_array_v4i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.2d.array.v4i32.zero">;
+
+def int_nvvm_suld_3d_i8_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.i8.zero">;
+def int_nvvm_suld_3d_i16_zero
+  : Intrinsic<[llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.i16.zero">;
+def int_nvvm_suld_3d_i32_zero
+  : Intrinsic<[llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.i32.zero">;
+def int_nvvm_suld_3d_i64_zero
+  : Intrinsic<[llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.i64.zero">;
+def int_nvvm_suld_3d_v2i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v2i8.zero">;
+def int_nvvm_suld_3d_v2i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v2i16.zero">;
+def int_nvvm_suld_3d_v2i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v2i32.zero">;
+def int_nvvm_suld_3d_v2i64_zero
+  : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v2i64.zero">;
+def int_nvvm_suld_3d_v4i8_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v4i8.zero">;
+def int_nvvm_suld_3d_v4i16_zero
+  : Intrinsic<[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v4i16.zero">;
+def int_nvvm_suld_3d_v4i32_zero
+  : Intrinsic<[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+              [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.suld.3d.v4i32.zero">;
+
 //===- Texture Query ------------------------------------------------------===//
 
 def int_nvvm_txq_channel_order
@@ -1503,7 +2609,277 @@
 //===- Surface Stores -----------------------------------------------------===//
 
 // Unformatted
+// .clamp variant
+def int_nvvm_sust_b_1d_i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_i8_clamp">;
+def int_nvvm_sust_b_1d_i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_i16_clamp">;
+def int_nvvm_sust_b_1d_i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_i32_clamp">;
+def int_nvvm_sust_b_1d_i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_i64_clamp">;
+def int_nvvm_sust_b_1d_v2i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.v2i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v2i8_clamp">;
+def int_nvvm_sust_b_1d_v2i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.v2i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v2i16_clamp">;
+def int_nvvm_sust_b_1d_v2i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.v2i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v2i32_clamp">;
+def int_nvvm_sust_b_1d_v2i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.v2i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v2i64_clamp">;
+def int_nvvm_sust_b_1d_v4i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.v4i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v4i8_clamp">;
+def int_nvvm_sust_b_1d_v4i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.v4i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v4i16_clamp">;
+def int_nvvm_sust_b_1d_v4i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.v4i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v4i32_clamp">;
 
+
+def int_nvvm_sust_b_1d_array_i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_i8_clamp">;
+def int_nvvm_sust_b_1d_array_i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_i16_clamp">;
+def int_nvvm_sust_b_1d_array_i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.array.i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_i32_clamp">;
+def int_nvvm_sust_b_1d_array_i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.array.i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_i64_clamp">;
+def int_nvvm_sust_b_1d_array_v2i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v2i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v2i8_clamp">;
+def int_nvvm_sust_b_1d_array_v2i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v2i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v2i16_clamp">;
+def int_nvvm_sust_b_1d_array_v2i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v2i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_clamp">;
+def int_nvvm_sust_b_1d_array_v2i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v2i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v2i64_clamp">;
+def int_nvvm_sust_b_1d_array_v4i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v4i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v4i8_clamp">;
+def int_nvvm_sust_b_1d_array_v4i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v4i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v4i16_clamp">;
+def int_nvvm_sust_b_1d_array_v4i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v4i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v4i32_clamp">;
+
+
+def int_nvvm_sust_b_2d_i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_i8_clamp">;
+def int_nvvm_sust_b_2d_i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_i16_clamp">;
+def int_nvvm_sust_b_2d_i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_i32_clamp">;
+def int_nvvm_sust_b_2d_i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_i64_clamp">;
+def int_nvvm_sust_b_2d_v2i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.v2i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v2i8_clamp">;
+def int_nvvm_sust_b_2d_v2i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.v2i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v2i16_clamp">;
+def int_nvvm_sust_b_2d_v2i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.v2i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v2i32_clamp">;
+def int_nvvm_sust_b_2d_v2i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.v2i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v2i64_clamp">;
+def int_nvvm_sust_b_2d_v4i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.v4i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v4i8_clamp">;
+def int_nvvm_sust_b_2d_v4i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.v4i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v4i16_clamp">;
+def int_nvvm_sust_b_2d_v4i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.v4i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v4i32_clamp">;
+
+
+def int_nvvm_sust_b_2d_array_i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_i8_clamp">;
+def int_nvvm_sust_b_2d_array_i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_i16_clamp">;
+def int_nvvm_sust_b_2d_array_i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.array.i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_i32_clamp">;
+def int_nvvm_sust_b_2d_array_i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.array.i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_i64_clamp">;
+def int_nvvm_sust_b_2d_array_v2i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v2i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v2i8_clamp">;
+def int_nvvm_sust_b_2d_array_v2i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v2i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v2i16_clamp">;
+def int_nvvm_sust_b_2d_array_v2i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v2i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_clamp">;
+def int_nvvm_sust_b_2d_array_v2i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v2i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v2i64_clamp">;
+def int_nvvm_sust_b_2d_array_v4i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v4i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v4i8_clamp">;
+def int_nvvm_sust_b_2d_array_v4i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v4i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v4i16_clamp">;
+def int_nvvm_sust_b_2d_array_v4i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v4i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v4i32_clamp">;
+
+
+def int_nvvm_sust_b_3d_i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_i8_clamp">;
+def int_nvvm_sust_b_3d_i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_i16_clamp">;
+def int_nvvm_sust_b_3d_i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.3d.i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_i32_clamp">;
+def int_nvvm_sust_b_3d_i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.3d.i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_i64_clamp">;
+def int_nvvm_sust_b_3d_v2i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.v2i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v2i8_clamp">;
+def int_nvvm_sust_b_3d_v2i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.v2i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v2i16_clamp">;
+def int_nvvm_sust_b_3d_v2i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.3d.v2i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v2i32_clamp">;
+def int_nvvm_sust_b_3d_v2i64_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.3d.v2i64.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v2i64_clamp">;
+def int_nvvm_sust_b_3d_v4i8_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.v4i8.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v4i8_clamp">;
+def int_nvvm_sust_b_3d_v4i16_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.v4i16.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v4i16_clamp">;
+def int_nvvm_sust_b_3d_v4i32_clamp
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.3d.v4i32.clamp">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v4i32_clamp">;
+
+
+// .trap variant
 def int_nvvm_sust_b_1d_i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [],
               "llvm.nvvm.sust.b.1d.i8.trap">,
@@ -1516,6 +2892,10 @@
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.1d.i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_1d_i32_trap">;
+def int_nvvm_sust_b_1d_i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_1d_i64_trap">;
 def int_nvvm_sust_b_1d_v2i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [],
               "llvm.nvvm.sust.b.1d.v2i8.trap">,
@@ -1528,6 +2908,10 @@
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.1d.v2i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_1d_v2i32_trap">;
+def int_nvvm_sust_b_1d_v2i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.v2i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v2i64_trap">;
 def int_nvvm_sust_b_1d_v4i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty,
                    llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
@@ -1557,6 +2941,10 @@
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.1d.array.i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_1d_array_i32_trap">;
+def int_nvvm_sust_b_1d_array_i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.array.i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_i64_trap">;
 def int_nvvm_sust_b_1d_array_v2i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
                    llvm_i16_ty, llvm_i16_ty], [],
@@ -1572,6 +2960,11 @@
                    llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.1d.array.v2i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_trap">;
+def int_nvvm_sust_b_1d_array_v2i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v2i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v2i64_trap">;
 def int_nvvm_sust_b_1d_array_v4i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
                    llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
@@ -1601,6 +2994,10 @@
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.2d.i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_2d_i32_trap">;
+def int_nvvm_sust_b_2d_i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_2d_i64_trap">;
 def int_nvvm_sust_b_2d_v2i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
                    llvm_i16_ty, llvm_i16_ty], [],
@@ -1616,6 +3013,11 @@
                    llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.2d.v2i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_2d_v2i32_trap">;
+def int_nvvm_sust_b_2d_v2i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.v2i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v2i64_trap">;
 def int_nvvm_sust_b_2d_v4i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
                    llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
@@ -1648,6 +3050,11 @@
                    llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.2d.array.i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_2d_array_i32_trap">;
+def int_nvvm_sust_b_2d_array_i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.array.i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_i64_trap">;
 def int_nvvm_sust_b_2d_array_v2i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
                    llvm_i16_ty, llvm_i16_ty], [],
@@ -1663,6 +3070,11 @@
                    llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.2d.array.v2i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_trap">;
+def int_nvvm_sust_b_2d_array_v2i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v2i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v2i64_trap">;
 def int_nvvm_sust_b_2d_array_v4i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
                    llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
@@ -1695,6 +3107,11 @@
                    llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.3d.i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_3d_i32_trap">;
+def int_nvvm_sust_b_3d_i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.3d.i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_3d_i64_trap">;
 def int_nvvm_sust_b_3d_v2i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
                    llvm_i16_ty, llvm_i16_ty], [],
@@ -1710,6 +3127,11 @@
                    llvm_i32_ty, llvm_i32_ty], [],
               "llvm.nvvm.sust.b.3d.v2i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_3d_v2i32_trap">;
+def int_nvvm_sust_b_3d_v2i64_trap
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.3d.v2i64.trap">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v2i64_trap">;
 def int_nvvm_sust_b_3d_v4i8_trap
   : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
                    llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
@@ -1726,6 +3148,278 @@
               "llvm.nvvm.sust.b.3d.v4i32.trap">,
     GCCBuiltin<"__nvvm_sust_b_3d_v4i32_trap">;
 
+
+// .zero variant (same operand lists as the corresponding .trap intrinsics)
+def int_nvvm_sust_b_1d_i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_i8_zero">;
+def int_nvvm_sust_b_1d_i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_i16_zero">;
+def int_nvvm_sust_b_1d_i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_i32_zero">;
+def int_nvvm_sust_b_1d_i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_i64_zero">;
+def int_nvvm_sust_b_1d_v2i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.v2i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v2i8_zero">;
+def int_nvvm_sust_b_1d_v2i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.v2i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v2i16_zero">;
+def int_nvvm_sust_b_1d_v2i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.v2i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v2i32_zero">;
+def int_nvvm_sust_b_1d_v2i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.v2i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v2i64_zero">;
+def int_nvvm_sust_b_1d_v4i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.v4i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v4i8_zero">;
+def int_nvvm_sust_b_1d_v4i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.v4i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v4i16_zero">;
+def int_nvvm_sust_b_1d_v4i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.v4i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_v4i32_zero">;
+
+
+def int_nvvm_sust_b_1d_array_i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_i8_zero">;
+def int_nvvm_sust_b_1d_array_i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_i16_zero">;
+def int_nvvm_sust_b_1d_array_i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.array.i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_i32_zero">;
+def int_nvvm_sust_b_1d_array_i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.array.i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_i64_zero">;
+def int_nvvm_sust_b_1d_array_v2i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v2i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v2i8_zero">;
+def int_nvvm_sust_b_1d_array_v2i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v2i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v2i16_zero">;
+def int_nvvm_sust_b_1d_array_v2i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v2i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v2i32_zero">;
+def int_nvvm_sust_b_1d_array_v2i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v2i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v2i64_zero">;
+def int_nvvm_sust_b_1d_array_v4i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v4i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v4i8_zero">;
+def int_nvvm_sust_b_1d_array_v4i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v4i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v4i16_zero">;
+def int_nvvm_sust_b_1d_array_v4i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.1d.array.v4i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_1d_array_v4i32_zero">;
+
+
+def int_nvvm_sust_b_2d_i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_i8_zero">;
+def int_nvvm_sust_b_2d_i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_i16_zero">;
+def int_nvvm_sust_b_2d_i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_i32_zero">;
+def int_nvvm_sust_b_2d_i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_i64_zero">;
+def int_nvvm_sust_b_2d_v2i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.v2i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v2i8_zero">;
+def int_nvvm_sust_b_2d_v2i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.v2i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v2i16_zero">;
+def int_nvvm_sust_b_2d_v2i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.v2i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v2i32_zero">;
+def int_nvvm_sust_b_2d_v2i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.v2i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v2i64_zero">;
+def int_nvvm_sust_b_2d_v4i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.v4i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v4i8_zero">;
+def int_nvvm_sust_b_2d_v4i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i16_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.v4i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v4i16_zero">;
+def int_nvvm_sust_b_2d_v4i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.v4i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_v4i32_zero">;
+
+
+def int_nvvm_sust_b_2d_array_i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_i8_zero">;
+def int_nvvm_sust_b_2d_array_i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_i16_zero">;
+def int_nvvm_sust_b_2d_array_i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.array.i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_i32_zero">;
+def int_nvvm_sust_b_2d_array_i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.array.i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_i64_zero">;
+def int_nvvm_sust_b_2d_array_v2i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v2i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v2i8_zero">;
+def int_nvvm_sust_b_2d_array_v2i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v2i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v2i16_zero">;
+def int_nvvm_sust_b_2d_array_v2i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v2i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v2i32_zero">;
+def int_nvvm_sust_b_2d_array_v2i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v2i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v2i64_zero">;
+def int_nvvm_sust_b_2d_array_v4i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v4i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v4i8_zero">;
+def int_nvvm_sust_b_2d_array_v4i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v4i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v4i16_zero">;
+def int_nvvm_sust_b_2d_array_v4i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.2d.array.v4i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_2d_array_v4i32_zero">;
+
+
+def int_nvvm_sust_b_3d_i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_i8_zero">;
+def int_nvvm_sust_b_3d_i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_i16_zero">;
+def int_nvvm_sust_b_3d_i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.3d.i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_i32_zero">;
+def int_nvvm_sust_b_3d_i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.3d.i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_i64_zero">;
+def int_nvvm_sust_b_3d_v2i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.v2i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v2i8_zero">;
+def int_nvvm_sust_b_3d_v2i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.v2i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v2i16_zero">;
+def int_nvvm_sust_b_3d_v2i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.3d.v2i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v2i32_zero">;
+def int_nvvm_sust_b_3d_v2i64_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i64_ty, llvm_i64_ty], [],
+              "llvm.nvvm.sust.b.3d.v2i64.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v2i64_zero">;
+def int_nvvm_sust_b_3d_v4i8_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.v4i8.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v4i8_zero">;
+def int_nvvm_sust_b_3d_v4i16_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_i16_ty], [],
+              "llvm.nvvm.sust.b.3d.v4i16.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v4i16_zero">;
+def int_nvvm_sust_b_3d_v4i32_zero
+  : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [],
+              "llvm.nvvm.sust.b.3d.v4i32.zero">,
+    GCCBuiltin<"__nvvm_sust_b_3d_v4i32_zero">;
+
+
+
 // Formatted
 
 def int_nvvm_sust_p_1d_i8_trap
@@ -1950,6 +3644,7 @@
               "llvm.nvvm.sust.p.3d.v4i32.trap">,
     GCCBuiltin<"__nvvm_sust_p_3d_v4i32_trap">;
 
+
 def int_nvvm_rotate_b32
   : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
               [IntrNoMem], "llvm.nvvm.rotate.b32">,

diff --git a/include/llvm/IR/IntrinsicsPowerPC.td b/include/llvm/IR/IntrinsicsPowerPC.td
index 49ddfb8..5cdabde 100644
--- a/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/include/llvm/IR/IntrinsicsPowerPC.td

@@ -28,8 +28,10 @@
   def int_ppc_dcbz  : Intrinsic<[], [llvm_ptr_ty], []>;
   def int_ppc_dcbzl : Intrinsic<[], [llvm_ptr_ty], []>;
 
-  // sync instruction
+  // sync instruction (i.e. sync 0, a.k.a. hwsync)
   def int_ppc_sync : Intrinsic<[], [], []>;
+  // lwsync instruction (i.e. sync 1)
+  def int_ppc_lwsync : Intrinsic<[], [], []>;
 
   // Intrinsics used to generate ctr-based loops. These should only be
   // generated by the PowerPC backend!
@@ -45,6 +47,13 @@
                               list<IntrinsicProperty> properties>
     : GCCBuiltin<!strconcat("__builtin_altivec_", GCCIntSuffix)>,
       Intrinsic<ret_types, param_types, properties>;
+
+  /// PowerPC_VSX_Intrinsic - Base class for all VSX intrinsics.
+  class PowerPC_VSX_Intrinsic<string GCCIntSuffix, list<LLVMType> ret_types,
+                              list<LLVMType> param_types,
+                              list<IntrinsicProperty> properties>
+    : GCCBuiltin<!strconcat("__builtin_vsx_", GCCIntSuffix)>,
+      Intrinsic<ret_types, param_types, properties>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -87,6 +96,32 @@
 
 
 //===----------------------------------------------------------------------===//
+// PowerPC VSX Intrinsic Class Definitions.
+//
+
+/// PowerPC_VSX_Vec_DDD_Intrinsic - A PowerPC intrinsic that takes two v2f64
+/// vectors and returns one.  These intrinsics have no side effects.
+class PowerPC_VSX_Vec_DDD_Intrinsic<string GCCIntSuffix>
+  : PowerPC_VSX_Intrinsic<GCCIntSuffix,
+                          [llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty],
+                          [IntrNoMem]>;
+
+/// PowerPC_VSX_Vec_FFF_Intrinsic - A PowerPC intrinsic that takes two v4f32
+/// vectors and returns one.  These intrinsics have no side effects.
+class PowerPC_VSX_Vec_FFF_Intrinsic<string GCCIntSuffix>
+  : PowerPC_VSX_Intrinsic<GCCIntSuffix,
+                          [llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
+                          [IntrNoMem]>;
+
+/// PowerPC_VSX_Sca_DDD_Intrinsic - A PowerPC intrinsic that takes two f64
+/// scalars and returns one.  These intrinsics have no side effects.
+class PowerPC_VSX_Sca_DDD_Intrinsic<string GCCIntSuffix>
+  : PowerPC_VSX_Intrinsic<GCCIntSuffix,
+                          [llvm_double_ty], [llvm_double_ty, llvm_double_ty],
+                          [IntrNoMem]>;
+
+
+//===----------------------------------------------------------------------===//
 // PowerPC Altivec Intrinsic Definitions.
 
 let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
@@ -474,3 +509,36 @@
 def int_ppc_altivec_vlogefp   : PowerPC_Vec_FF_Intrinsic<"vlogefp">;
 def int_ppc_altivec_vrefp     : PowerPC_Vec_FF_Intrinsic<"vrefp">;
 def int_ppc_altivec_vrsqrtefp : PowerPC_Vec_FF_Intrinsic<"vrsqrtefp">;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC VSX Intrinsic Definitions.
+
+let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
+
+// Vector load.
+def int_ppc_vsx_lxvw4x :
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+def int_ppc_vsx_lxvd2x :
+      Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+
+// Vector store.
+def int_ppc_vsx_stxvw4x :
+      Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+def int_ppc_vsx_stxvd2x :
+      Intrinsic<[], [llvm_v2f64_ty, llvm_ptr_ty], [IntrReadWriteArgMem]>;
+
+// Vector and scalar maximum.
+def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">;
+def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">;
+def int_ppc_vsx_xsmaxdp : PowerPC_VSX_Sca_DDD_Intrinsic<"xsmaxdp">;
+
+// Vector and scalar minimum.
+def int_ppc_vsx_xvmindp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmindp">;
+def int_ppc_vsx_xvminsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvminsp">;
+def int_ppc_vsx_xsmindp : PowerPC_VSX_Sca_DDD_Intrinsic<"xsmindp">;
+
+// Vector divide.
+def int_ppc_vsx_xvdivdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvdivdp">;
+def int_ppc_vsx_xvdivsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvdivsp">;
+}

diff --git a/include/llvm/IR/IntrinsicsR600.td b/include/llvm/IR/IntrinsicsR600.td
index ba69eaa..d99c42d 100644
--- a/include/llvm/IR/IntrinsicsR600.td
+++ b/include/llvm/IR/IntrinsicsR600.td

@@ -33,10 +33,14 @@
                                        "__builtin_r600_read_tgid">;
 defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
                                        "__builtin_r600_read_tidig">;
-
 } // End TargetPrefix = "r600"
 
 let TargetPrefix = "AMDGPU" in {
+
+class AMDGPUReadPreloadRegisterIntrinsic<string name>
+  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+    GCCBuiltin<name>;
+
 def int_AMDGPU_div_scale : GCCBuiltin<"__builtin_amdgpu_div_scale">,
   // 1st parameter: Numerator
   // 2nd parameter: Denominator
@@ -48,7 +52,7 @@
 
 def int_AMDGPU_div_fmas : GCCBuiltin<"__builtin_amdgpu_div_fmas">,
   Intrinsic<[llvm_anyfloat_ty],
-            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
             [IntrNoMem]>;
 
 def int_AMDGPU_div_fixup : GCCBuiltin<"__builtin_amdgpu_div_fixup">,
@@ -69,4 +73,10 @@
 def int_AMDGPU_rsq_clamped : GCCBuiltin<"__builtin_amdgpu_rsq_clamped">,
   Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 
+def int_AMDGPU_ldexp : GCCBuiltin<"__builtin_amdgpu_ldexp">,
+  Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>;
+
+def int_AMDGPU_read_workdim : AMDGPUReadPreloadRegisterIntrinsic <
+                                       "__builtin_amdgpu_read_workdim">;
+
 } // End TargetPrefix = "AMDGPU"

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 5de9508..59ff946 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td

@@ -886,7 +886,7 @@
 // Vector insert
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_insertps       : GCCBuiltin<"__builtin_ia32_insertps128">,
-          Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,llvm_i32_ty],
+          Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
                     [IntrNoMem]>;
 }
 
@@ -896,13 +896,13 @@
         Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_v16i8_ty],
                   [IntrNoMem]>;
   def int_x86_sse41_pblendw          : GCCBuiltin<"__builtin_ia32_pblendw128">,
-        Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty],
+        Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
                   [IntrNoMem]>;
   def int_x86_sse41_blendpd          : GCCBuiltin<"__builtin_ia32_blendpd">,
-        Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty],
+        Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
                   [IntrNoMem]>;
   def int_x86_sse41_blendps          : GCCBuiltin<"__builtin_ia32_blendps">,
-        Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty],
+        Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
                   [IntrNoMem]>;
   def int_x86_sse41_blendvpd         : GCCBuiltin<"__builtin_ia32_blendvpd">,
         Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,llvm_v2f64_ty],
@@ -915,17 +915,17 @@
 // Vector dot product
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_dppd            : GCCBuiltin<"__builtin_ia32_dppd">,
-          Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,llvm_i32_ty],
+          Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
                     [IntrNoMem, Commutative]>;
   def int_x86_sse41_dpps            : GCCBuiltin<"__builtin_ia32_dpps">,
-          Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,llvm_i32_ty],
+          Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
                     [IntrNoMem, Commutative]>;
 }
 
 // Vector sum of absolute differences
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_mpsadbw         : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
-          Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_i32_ty],
-                    [IntrNoMem, Commutative]>;
+          Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
+                    [IntrNoMem]>;  // Operand roles differ: not Commutative.
 }
 
@@ -1171,10 +1171,10 @@
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx_blend_pd_256 : GCCBuiltin<"__builtin_ia32_blendpd256">,
         Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
-                  llvm_v4f64_ty, llvm_i32_ty], [IntrNoMem]>;
+                  llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx_blend_ps_256 : GCCBuiltin<"__builtin_ia32_blendps256">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                  llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem]>;
+                  llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx_blendv_pd_256 : GCCBuiltin<"__builtin_ia32_blendvpd256">,
         Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
                   llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
@@ -1187,7 +1187,7 @@
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx_dp_ps_256 : GCCBuiltin<"__builtin_ia32_dpps256">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                  llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem]>;
+                  llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
 }
 
 // Vector compare
@@ -1389,6 +1389,10 @@
         GCCBuiltin<"__builtin_ia32_storeupd512_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
                   [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_store_ss :
+        GCCBuiltin<"__builtin_ia32_storess_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1580,6 +1584,25 @@
   def int_x86_avx2_psrl_dq_bs : GCCBuiltin<"__builtin_ia32_psrldqi256_byteshift">,
               Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
                          llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi512">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi512">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi512">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi512">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrai_d : GCCBuiltin<"__builtin_ia32_psradi512">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrai_q : GCCBuiltin<"__builtin_ia32_psraqi512">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
 }
 
 // Pack ops.
@@ -1706,13 +1729,13 @@
                          llvm_v32i8_ty], [IntrNoMem]>;
   def int_x86_avx2_pblendw : GCCBuiltin<"__builtin_ia32_pblendw256">,
               Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
+                         llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx2_pblendd_128 : GCCBuiltin<"__builtin_ia32_pblendd128">,
               Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
+                         llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx2_pblendd_256 : GCCBuiltin<"__builtin_ia32_pblendd256">,
               Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
-                         llvm_i32_ty], [IntrNoMem]>;
+                         llvm_i8_ty], [IntrNoMem]>;
 }
 
 // Vector load with broadcast
@@ -1787,6 +1810,23 @@
   def int_x86_avx2_vinserti128 : GCCBuiltin<"__builtin_ia32_insert128i256">,
               Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
                          llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_vextractf32x4_512 :
+      GCCBuiltin<"__builtin_ia32_extractf32x4_mask">,
+                 Intrinsic<[llvm_v4f32_ty], [llvm_v16f32_ty, llvm_i8_ty,
+                           llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_vextracti32x4_512 :
+      GCCBuiltin<"__builtin_ia32_extracti32x4_mask">,
+                 Intrinsic<[llvm_v4i32_ty], [llvm_v16i32_ty, llvm_i8_ty,
+                           llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_vextractf64x4_512 :
+      GCCBuiltin<"__builtin_ia32_extractf64x4_mask">,
+                 Intrinsic<[llvm_v4f64_ty], [llvm_v8f64_ty, llvm_i8_ty,
+                           llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_vextracti64x4_512 :
+      GCCBuiltin<"__builtin_ia32_extracti64x4_mask">,
+                 Intrinsic<[llvm_v4i64_ty], [llvm_v8i64_ty, llvm_i8_ty,
+                           llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
 }
 
 // Conditional load ops
@@ -1951,11 +1991,9 @@
                          llvm_v32i8_ty], [IntrNoMem]>;
   def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,
               Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
-                         llvm_i32_ty], [IntrNoMem, Commutative]>;
+                         llvm_i8_ty], [IntrNoMem]>;  // Not commutative.
   def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">,
               Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
-  def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1986,13 +2024,15 @@
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfmadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfmaddps512">,
+  def int_x86_fma_mask_vfmadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfmaddps512_mask">,
               Intrinsic<[llvm_v16f32_ty],
-                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
+                         llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddpd512">,
+  def int_x86_fma_mask_vfmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
-                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
+                         llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
   def int_x86_fma_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">,
               Intrinsic<[llvm_v4f32_ty],
@@ -2018,13 +2058,15 @@
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfmsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfmsubps512">,
+  def int_x86_fma_mask_vfmsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfmsubps512_mask">,
               Intrinsic<[llvm_v16f32_ty],
-                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
+                         llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubpd512">,
+  def int_x86_fma_mask_vfmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
-                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
+                         llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
   def int_x86_fma_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">,
               Intrinsic<[llvm_v4f32_ty],
@@ -2050,13 +2092,15 @@
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfnmadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfnmaddps512">,
+  def int_x86_fma_mask_vfnmadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfnmaddps512_mask">,
               Intrinsic<[llvm_v16f32_ty],
-                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
+                         llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfnmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmaddpd512">,
+  def int_x86_fma_mask_vfnmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmaddpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
-                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
+                         llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
   def int_x86_fma_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">,
               Intrinsic<[llvm_v4f32_ty],
@@ -2082,13 +2126,15 @@
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfnmsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfnmsubps512">,
+  def int_x86_fma_mask_vfnmsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfnmsubps512_mask">,
               Intrinsic<[llvm_v16f32_ty],
-                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
+                         llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfnmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmsubpd512">,
+  def int_x86_fma_mask_vfnmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmsubpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
-                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
+                         llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
   def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
               Intrinsic<[llvm_v4f32_ty],
@@ -2108,13 +2154,15 @@
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfmaddsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubps512">,
+  def int_x86_fma_mask_vfmaddsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubps512_mask">,
               Intrinsic<[llvm_v16f32_ty],
-                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
+                         llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfmaddsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd512">,
+  def int_x86_fma_mask_vfmaddsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
-                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
+                         llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
   def int_x86_fma_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">,
               Intrinsic<[llvm_v4f32_ty],
@@ -2134,13 +2182,15 @@
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfmsubadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddps512">,
+  def int_x86_fma_mask_vfmsubadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddps512_mask">,
               Intrinsic<[llvm_v16f32_ty],
-                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,
+                         llvm_i16_ty, llvm_i32_ty],
                         [IntrNoMem]>;
-  def int_x86_fma_vfmsubadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd512">,
+  def int_x86_fma_mask_vfmsubadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd512_mask">,
               Intrinsic<[llvm_v8f64_ty],
-                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty,
+                         llvm_i8_ty, llvm_i32_ty],
                         [IntrNoMem]>;
 }
 
@@ -2749,6 +2799,30 @@
 }
 
 //===----------------------------------------------------------------------===//
+// ADX
+
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_addcarryx_u32: GCCBuiltin<"__builtin_ia32_addcarryx_u32">,
+        Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty,
+                                 llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_x86_addcarryx_u64: GCCBuiltin<"__builtin_ia32_addcarryx_u64">,
+        Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty,
+                                 llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_x86_addcarry_u32: GCCBuiltin<"__builtin_ia32_addcarry_u32">,
+        Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty,
+                                 llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_x86_addcarry_u64: GCCBuiltin<"__builtin_ia32_addcarry_u64">,
+        Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty,
+                                 llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_x86_subborrow_u32: GCCBuiltin<"__builtin_ia32_subborrow_u32">,
+        Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty,
+                                 llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_x86_subborrow_u64: GCCBuiltin<"__builtin_ia32_subborrow_u64">,
+        Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty,
+                                 llvm_ptr_ty], [IntrReadWriteArgMem]>;
+}
+
+//===----------------------------------------------------------------------===//
 // RTM intrinsics. Transactional Memory support.
 
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
@@ -2955,10 +3029,12 @@
               Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty],
                         [IntrNoMem]>;
 
-  def int_x86_avx512_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512">,
-        Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty], [IntrNoMem]>;
-  def int_x86_avx512_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512">,
-        Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty], [IntrNoMem]>;
+  def int_x86_avx512_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512_mask">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                                    llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512_mask">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                                     llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
 
   def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss_mask">,
         Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
@@ -2993,6 +3069,13 @@
   def int_x86_avx512_rcp28_pd : GCCBuiltin<"__builtin_ia32_rcp28pd_mask">,
             Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                                         llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_exp2_ps : GCCBuiltin<"__builtin_ia32_exp2ps_mask">,
+            Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                                         llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_exp2_pd : GCCBuiltin<"__builtin_ia32_exp2pd_mask">,
+            Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                                        llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
   def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_mask">,
             Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                                         llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
@@ -3182,6 +3265,180 @@
                   [IntrNoMem]>;
 }
 
+let TargetPrefix = "x86" in {
+  def int_x86_avx512_mask_valign_q_512 : GCCBuiltin<"__builtin_ia32_alignq512_mask">,
+        Intrinsic<[llvm_v8i64_ty],
+                  [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+
+  def int_x86_avx512_mask_valign_d_512 : GCCBuiltin<"__builtin_ia32_alignd512_mask">,
+        Intrinsic<[llvm_v16i32_ty],
+                  [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i8_ty, llvm_v16i32_ty, llvm_i16_ty],
+                  [IntrNoMem]>;
+}
+
+// Compares
+let TargetPrefix = "x86" in {
+  // 512-bit
+  def int_x86_avx512_mask_pcmpeq_b_512 : GCCBuiltin<"__builtin_ia32_pcmpeqb512_mask">,
+        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpeq_w_512 : GCCBuiltin<"__builtin_ia32_pcmpeqw512_mask">,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpeq_d_512 : GCCBuiltin<"__builtin_ia32_pcmpeqd512_mask">,
+            Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpeq_q_512 : GCCBuiltin<"__builtin_ia32_pcmpeqq512_mask">,
+            Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
+                      [IntrNoMem]>;
+
+  def int_x86_avx512_mask_pcmpgt_b_512: GCCBuiltin<"__builtin_ia32_pcmpgtb512_mask">,
+        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpgt_w_512: GCCBuiltin<"__builtin_ia32_pcmpgtw512_mask">,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpgt_d_512: GCCBuiltin<"__builtin_ia32_pcmpgtd512_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpgt_q_512: GCCBuiltin<"__builtin_ia32_pcmpgtq512_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+
+  def int_x86_avx512_mask_cmp_b_512: GCCBuiltin<"__builtin_ia32_cmpb512_mask">,
+        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty,
+                  llvm_i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_cmp_w_512: GCCBuiltin<"__builtin_ia32_cmpw512_mask">,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty,
+                  llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_cmp_d_512: GCCBuiltin<"__builtin_ia32_cmpd512_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
+                  llvm_i16_ty], [IntrNoMem ]>;
+  def int_x86_avx512_mask_cmp_q_512: GCCBuiltin<"__builtin_ia32_cmpq512_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_ucmp_b_512: GCCBuiltin<"__builtin_ia32_ucmpb512_mask">,
+        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty,
+                  llvm_i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_ucmp_w_512: GCCBuiltin<"__builtin_ia32_ucmpw512_mask">,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty,
+                  llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_ucmp_d_512: GCCBuiltin<"__builtin_ia32_ucmpd512_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
+                  llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_ucmp_q_512: GCCBuiltin<"__builtin_ia32_ucmpq512_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+
+  // 256-bit
+  def int_x86_avx512_mask_pcmpeq_b_256 : GCCBuiltin<"__builtin_ia32_pcmpeqb256_mask">,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpeq_w_256 : GCCBuiltin<"__builtin_ia32_pcmpeqw256_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpeq_d_256 : GCCBuiltin<"__builtin_ia32_pcmpeqd256_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpeq_q_256 : GCCBuiltin<"__builtin_ia32_pcmpeqq256_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+
+  def int_x86_avx512_mask_pcmpgt_b_256: GCCBuiltin<"__builtin_ia32_pcmpgtb256_mask">,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpgt_w_256: GCCBuiltin<"__builtin_ia32_pcmpgtw256_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpgt_d_256: GCCBuiltin<"__builtin_ia32_pcmpgtd256_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpgt_q_256: GCCBuiltin<"__builtin_ia32_pcmpgtq256_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+
+  def int_x86_avx512_mask_cmp_b_256: GCCBuiltin<"__builtin_ia32_cmpb256_mask">,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty,
+                  llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_cmp_w_256: GCCBuiltin<"__builtin_ia32_cmpw256_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty,
+                  llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_cmp_d_256: GCCBuiltin<"__builtin_ia32_cmpd256_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_cmp_q_256: GCCBuiltin<"__builtin_ia32_cmpq256_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_ucmp_b_256: GCCBuiltin<"__builtin_ia32_ucmpb256_mask">,
+        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty,
+                  llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_ucmp_w_256: GCCBuiltin<"__builtin_ia32_ucmpw256_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty,
+                  llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_ucmp_d_256: GCCBuiltin<"__builtin_ia32_ucmpd256_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_ucmp_q_256: GCCBuiltin<"__builtin_ia32_ucmpq256_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+
+  // 128-bit
+  def int_x86_avx512_mask_pcmpeq_b_128 : GCCBuiltin<"__builtin_ia32_pcmpeqb128_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpeq_w_128 : GCCBuiltin<"__builtin_ia32_pcmpeqw128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpeq_d_128 : GCCBuiltin<"__builtin_ia32_pcmpeqd128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpeq_q_128 : GCCBuiltin<"__builtin_ia32_pcmpeqq128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+
+  def int_x86_avx512_mask_pcmpgt_b_128: GCCBuiltin<"__builtin_ia32_pcmpgtb128_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpgt_w_128: GCCBuiltin<"__builtin_ia32_pcmpgtw128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpgt_d_128: GCCBuiltin<"__builtin_ia32_pcmpgtd128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mask_pcmpgt_q_128: GCCBuiltin<"__builtin_ia32_pcmpgtq128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+
+  def int_x86_avx512_mask_cmp_b_128: GCCBuiltin<"__builtin_ia32_cmpb128_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
+                  llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_cmp_w_128: GCCBuiltin<"__builtin_ia32_cmpw128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_cmp_d_128: GCCBuiltin<"__builtin_ia32_cmpd128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_cmp_q_128: GCCBuiltin<"__builtin_ia32_cmpq128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_ucmp_b_128: GCCBuiltin<"__builtin_ia32_ucmpb128_mask">,
+        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
+                  llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_ucmp_w_128: GCCBuiltin<"__builtin_ia32_ucmpw128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_ucmp_d_128: GCCBuiltin<"__builtin_ia32_ucmpd128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_ucmp_q_128: GCCBuiltin<"__builtin_ia32_ucmpq128_mask">,
+        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
+                  llvm_i8_ty], [IntrNoMem]>;
+}
+
 // Misc.
 let TargetPrefix = "x86" in {
   def int_x86_avx512_mask_cmp_ps_512 : GCCBuiltin<"__builtin_ia32_cmpps512_mask">,
@@ -3190,13 +3447,6 @@
   def int_x86_avx512_mask_cmp_pd_512 : GCCBuiltin<"__builtin_ia32_cmppd512_mask">,
             Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty,
                                       llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  
-  def int_x86_avx512_mask_pcmpeq_d_512 : GCCBuiltin<"__builtin_ia32_pcmpeqd512_mask">,
-            Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
-                      [IntrNoMem]>;
-  def int_x86_avx512_mask_pcmpeq_q_512 : GCCBuiltin<"__builtin_ia32_pcmpeqq512_mask">,
-            Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
-                      [IntrNoMem]>;
   def int_x86_avx512_mask_pand_d_512 : GCCBuiltin<"__builtin_ia32_pandd512_mask">,
             Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
                                          llvm_v16i32_ty, llvm_i16_ty],
@@ -3205,6 +3455,8 @@
             Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
                                         llvm_v8i64_ty, llvm_i8_ty],
                       [IntrNoMem]>;
+  def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">,
+            Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h
index 4d940d5..2f18782 100644
--- a/include/llvm/IR/LLVMContext.h
+++ b/include/llvm/IR/LLVMContext.h

@@ -18,6 +18,7 @@
 #include "llvm-c/Core.h"
 #include "llvm/Support/CBindingWrapping.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/Options.h"
 
 namespace llvm {
 
@@ -52,7 +53,12 @@
     MD_fpmath = 3,  // "fpmath"
     MD_range = 4, // "range"
     MD_tbaa_struct = 5, // "tbaa.struct"
-    MD_invariant_load = 6 // "invariant.load"
+    MD_invariant_load = 6, // "invariant.load"
+    MD_alias_scope = 7, // "alias.scope"
+    MD_noalias = 8, // "noalias"
+    MD_nontemporal = 9, // "nontemporal"
+    MD_mem_parallel_loop_access = 10, // "llvm.mem.parallel_loop_access"
+    MD_nonnull = 11 // "nonnull"
   };
 
   /// getMDKindID - Return a unique non-zero ID for the specified metadata kind.
@@ -97,12 +103,14 @@
   /// setDiagnosticHandler - This method sets a handler that is invoked
   /// when the backend needs to report anything to the user.  The first
   /// argument is a function pointer and the second is a context pointer that
-  /// gets passed into the DiagHandler.
+  /// gets passed into the DiagHandler.  The third argument should be set to
+  /// true if the handler only expects enabled diagnostics.
   ///
   /// LLVMContext doesn't take ownership or interpret either of these
   /// pointers.
   void setDiagnosticHandler(DiagnosticHandlerTy DiagHandler,
-                            void *DiagContext = nullptr);
+                            void *DiagContext = nullptr,
+                            bool RespectFilters = false);
 
   /// getDiagnosticHandler - Return the diagnostic handler set by
   /// setDiagnosticHandler.
@@ -112,14 +120,16 @@
   /// setDiagnosticContext.
   void *getDiagnosticContext() const;
 
-  /// diagnose - Report a message to the currently installed diagnostic handler.
+  /// \brief Report a message to the currently installed diagnostic handler.
+  ///
   /// This function returns, in particular in the case of error reporting
-  /// (DI.Severity == RS_Error), so the caller should leave the compilation
+  /// (DI.Severity == \a DS_Error), so the caller should leave the compilation
   /// process in a self-consistent state, even though the generated code
   /// need not be correct.
-  /// The diagnostic message will be implicitly prefixed with a severity
-  /// keyword according to \p DI.getSeverity(), i.e., "error: "
-  /// for RS_Error, "warning: " for RS_Warning, and "note: " for RS_Note.
+  ///
+  /// The diagnostic message will be implicitly prefixed with a severity keyword
+  /// according to \p DI.getSeverity(), i.e., "error: " for \a DS_Error,
+  /// "warning: " for \a DS_Warning, and "note: " for \a DS_Note.
   void diagnose(const DiagnosticInfo &DI);
 
   /// \brief Registers a yield callback with the given context.
@@ -157,6 +167,14 @@
   void emitError(const Instruction *I, const Twine &ErrorStr);
   void emitError(const Twine &ErrorStr);
 
+  /// \brief Query for a debug option's value.
+  ///
+  /// This function returns typed data populated from command line parsing.
+  template <typename ValT, typename Base, ValT(Base::*Mem)>
+  ValT getOption() const {
+    return OptionRegistry::instance().template get<ValT, Base, Mem>();
+  }
+
 private:
   LLVMContext(LLVMContext&) LLVM_DELETED_FUNCTION;
   void operator=(LLVMContext&) LLVM_DELETED_FUNCTION;

diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h
index f6065a4..ab500a1 100644
--- a/include/llvm/IR/LegacyPassManagers.h
+++ b/include/llvm/IR/LegacyPassManagers.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PASSMANAGERS_H
-#define LLVM_PASSMANAGERS_H
+#ifndef LLVM_IR_LEGACYPASSMANAGERS_H
+#define LLVM_IR_LEGACYPASSMANAGERS_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -61,7 +61,7 @@
 //
 // [o] class FunctionPassManager;
 //
-// This is a external interface used by JIT to manage FunctionPasses. This
+// This is an external interface used to manage FunctionPasses. This
 // interface relies on FunctionPassManagerImpl to do all the tasks.
 //
 // [o] class FunctionPassManagerImpl : public ModulePass, PMDataManager,
@@ -248,7 +248,7 @@
   DenseMap<Pass *, SmallPtrSet<Pass *, 8> > InversedLastUser;
 
   /// Immutable passes are managed by top level manager.
-  SmallVector<ImmutablePass *, 8> ImmutablePasses;
+  SmallVector<ImmutablePass *, 16> ImmutablePasses;
 
   DenseMap<Pass *, AnalysisUsage *> AnUsageMap;
 };
@@ -393,7 +393,7 @@
 
   // Collection of higher level analysis used by the pass managed by
   // this manager.
-  SmallVector<Pass *, 8> HigherLevelAnalysis;
+  SmallVector<Pass *, 16> HigherLevelAnalysis;
 
   unsigned Depth;
 };

diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h
index 37d263b..d29512c 100644
--- a/include/llvm/IR/MDBuilder.h
+++ b/include/llvm/IR/MDBuilder.h

@@ -15,6 +15,7 @@
 #ifndef LLVM_IR_MDBUILDER_H
 #define LLVM_IR_MDBUILDER_H
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataTypes.h"
 #include <utility>
 
@@ -25,7 +26,6 @@
 class LLVMContext;
 class MDNode;
 class MDString;
-class StringRef;
 
 class MDBuilder {
   LLVMContext &Context;
@@ -63,19 +63,54 @@
   MDNode *createRange(const APInt &Lo, const APInt &Hi);
 
   //===------------------------------------------------------------------===//
-  // TBAA metadata.
+  // AA metadata.
   //===------------------------------------------------------------------===//
 
-  /// \brief Return metadata appropriate for a TBAA root node.  Each returned
+protected:
+  /// \brief Return metadata appropriate for an AA root node (scope or TBAA).
+  /// Each returned node is distinct from all other metadata and will never
+  /// be identified (uniqued) with anything else.
+  MDNode *createAnonymousAARoot(StringRef Name = StringRef(),
+                                MDNode *Extra = nullptr);
+
+public:
+  /// \brief Return metadata appropriate for a TBAA root node. Each returned
   /// node is distinct from all other metadata and will never be identified
   /// (uniqued) with anything else.
-  MDNode *createAnonymousTBAARoot();
+  MDNode *createAnonymousTBAARoot() {
+    return createAnonymousAARoot();
+  }
+
+  /// \brief Return metadata appropriate for an alias scope domain node.
+  /// Each returned node is distinct from all other metadata and will never
+  /// be identified (uniqued) with anything else.
+  MDNode *createAnonymousAliasScopeDomain(StringRef Name = StringRef()) {
+    return createAnonymousAARoot(Name);
+  }
+
+  /// \brief Return metadata appropriate for an alias scope root node.
+  /// Each returned node is distinct from all other metadata and will never
+  /// be identified (uniqued) with anything else.
+  MDNode *createAnonymousAliasScope(MDNode *Domain,
+                                    StringRef Name = StringRef()) {
+    return createAnonymousAARoot(Name, Domain);
+  }
 
   /// \brief Return metadata appropriate for a TBAA root node with the given
   /// name.  This may be identified (uniqued) with other roots with the same
   /// name.
   MDNode *createTBAARoot(StringRef Name);
 
+  /// \brief Return metadata appropriate for an alias scope domain node with
+  /// the given name. This may be identified (uniqued) with other roots with
+  /// the same name.
+  MDNode *createAliasScopeDomain(StringRef Name);
+
+  /// \brief Return metadata appropriate for an alias scope node with
+  /// the given name. This may be identified (uniqued) with other scopes with
+  /// the same name and domain.
+  MDNode *createAliasScope(StringRef Name, MDNode *Domain);
+
   /// \brief Return metadata for a non-root TBAA node with the given name,
   /// parent in the TBAA tree, and value for 'pointsToConstantMemory'.
   MDNode *createTBAANode(StringRef Name, MDNode *Parent,

diff --git a/include/llvm/IR/Mangler.h b/include/llvm/IR/Mangler.h
index c1ba585..1e6b5b1 100644
--- a/include/llvm/IR/Mangler.h
+++ b/include/llvm/IR/Mangler.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_MANGLER_H
-#define LLVM_TARGET_MANGLER_H
+#ifndef LLVM_IR_MANGLER_H
+#define LLVM_IR_MANGLER_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/raw_ostream.h"
@@ -66,4 +66,4 @@
 
 } // End llvm namespace
 
-#endif // LLVM_TARGET_MANGLER_H
+#endif

diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h
index 7a0ca88..a056b0d 100644
--- a/include/llvm/IR/Metadata.h
+++ b/include/llvm/IR/Metadata.h

@@ -17,10 +17,12 @@
 #define LLVM_IR_METADATA_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/ErrorHandling.h"
 
 namespace llvm {
 class LLVMContext;
@@ -30,62 +32,139 @@
 
 
 enum LLVMConstants : uint32_t {
-  DEBUG_METADATA_VERSION = 1  // Current debug info version number.
+  DEBUG_METADATA_VERSION = 2  // Current debug info version number.
+};
+
+/// \brief Root of the metadata hierarchy.
+///
+/// This is a root class for typeless data in the IR.  classof() accepts the
+/// value IDs GenericMDNodeVal, MDNodeFwdDeclVal and MDStringVal.
+/// TODO: Detach from the Value hierarchy.
+class Metadata : public Value {
+protected:
+  Metadata(LLVMContext &Context, unsigned ID);
+
+public:
+  static bool classof(const Value *V) {
+    return V->getValueID() == GenericMDNodeVal ||
+           V->getValueID() == MDNodeFwdDeclVal ||
+           V->getValueID() == MDStringVal;
+  }
 };
 
 //===----------------------------------------------------------------------===//
-/// MDString - a single uniqued string.
+/// \brief A single uniqued string.
+///
 /// These are used to efficiently contain a byte sequence for metadata.
 /// MDString is always unnamed.
-class MDString : public Value {
+class MDString : public Metadata {
+  friend class StringMapEntry<MDString>;
+
   virtual void anchor();
   MDString(const MDString &) LLVM_DELETED_FUNCTION;
 
-  explicit MDString(LLVMContext &C);
+  explicit MDString(LLVMContext &Context)
+      : Metadata(Context, Value::MDStringVal) {}
+
+  /// \brief Shadow Value::getName() to prevent its use.
+  StringRef getName() const LLVM_DELETED_FUNCTION;
+
 public:
   static MDString *get(LLVMContext &Context, StringRef Str);
   static MDString *get(LLVMContext &Context, const char *Str) {
     return get(Context, Str ? StringRef(Str) : StringRef());
   }
 
-  StringRef getString() const { return getName(); }
+  StringRef getString() const;
 
-  unsigned getLength() const { return (unsigned)getName().size(); }
+  unsigned getLength() const { return (unsigned)getString().size(); }
 
   typedef StringRef::iterator iterator;
 
-  /// begin() - Pointer to the first byte of the string.
-  iterator begin() const { return getName().begin(); }
+  /// \brief Pointer to the first byte of the string.
+  iterator begin() const { return getString().begin(); }
 
-  /// end() - Pointer to one byte past the end of the string.
-  iterator end() const { return getName().end(); }
+  /// \brief Pointer to one byte past the end of the string.
+  iterator end() const { return getString().end(); }
 
-  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  /// \brief Methods for support type inquiry through isa, cast, and dyn_cast.
   static bool classof(const Value *V) {
     return V->getValueID() == MDStringVal;
   }
 };
 
+/// \brief Alias-analysis metadata (TBAA, alias scope, noalias) that might be
+/// attached to a memory access. Converts to true if any tag is non-null.
+struct AAMDNodes {
+  explicit AAMDNodes(MDNode *T = nullptr, MDNode *S = nullptr,
+                     MDNode *N = nullptr)
+      : TBAA(T), Scope(S), NoAlias(N) {}
+
+  bool operator==(const AAMDNodes &A) const {
+    return TBAA == A.TBAA && Scope == A.Scope && NoAlias == A.NoAlias;
+  }
+
+  bool operator!=(const AAMDNodes &A) const { return !(*this == A); }
+
+  LLVM_EXPLICIT operator bool() const { return TBAA || Scope || NoAlias; }
+
+  /// \brief The tag for type-based alias analysis.
+  MDNode *TBAA;
+
+  /// \brief The tag for alias scope specification (used with noalias).
+  MDNode *Scope;
+
+  /// \brief The tag specifying the noalias scope.
+  MDNode *NoAlias;
+};
+
+// DenseMapInfo specialization: allows AAMDNodes to be used as a DenseMap key.
+template <> struct DenseMapInfo<AAMDNodes> {
+  static inline AAMDNodes getEmptyKey() {
+    return AAMDNodes(DenseMapInfo<MDNode *>::getEmptyKey(), nullptr, nullptr);
+  }
+  static inline AAMDNodes getTombstoneKey() {
+    return AAMDNodes(DenseMapInfo<MDNode *>::getTombstoneKey(), nullptr,
+                     nullptr);
+  }
+  static unsigned getHashValue(const AAMDNodes &Val) {
+    return DenseMapInfo<MDNode *>::getHashValue(Val.TBAA) ^
+           DenseMapInfo<MDNode *>::getHashValue(Val.Scope) ^
+           DenseMapInfo<MDNode *>::getHashValue(Val.NoAlias);
+  }
+  static bool isEqual(const AAMDNodes &LHS, const AAMDNodes &RHS) {
+    return LHS == RHS;
+  }
+};
 
 class MDNodeOperand;
 
 //===----------------------------------------------------------------------===//
-/// MDNode - a tuple of other values.
-class MDNode : public Value, public FoldingSetNode {
+/// \brief Tuple of metadata.
+class MDNode : public Metadata {
   MDNode(const MDNode &) LLVM_DELETED_FUNCTION;
   void operator=(const MDNode &) LLVM_DELETED_FUNCTION;
   friend class MDNodeOperand;
   friend class LLVMContextImpl;
-  friend struct FoldingSetTrait<MDNode>;
+  void *operator new(size_t) LLVM_DELETED_FUNCTION;
 
-  /// Hash - If the MDNode is uniqued cache the hash to speed up lookup.
-  unsigned Hash;
+protected:
+  void *operator new(size_t Size, unsigned NumOps);
 
-  /// NumOperands - This many 'MDNodeOperand' items are co-allocated onto the
-  /// end of this MDNode.
-  unsigned NumOperands;
+  /// \brief Required by std, but never called.
+  void operator delete(void *Mem);
 
-  // Subclass data enums.
+  /// \brief Required by std, but never called.
+  void operator delete(void *, unsigned) {
+    llvm_unreachable("Constructor throws?");
+  }
+
+  /// \brief Required by std, but never called.
+  void operator delete(void *, unsigned, bool) {
+    llvm_unreachable("Constructor throws?");
+  }
+
+  /// \brief Subclass data enums.
   enum {
     /// FunctionLocalBit - This bit is set if this MDNode is function local.
     /// This is true when it (potentially transitively) contains a reference to
@@ -94,89 +173,88 @@
 
     /// NotUniquedBit - This is set on MDNodes that are not uniqued because they
     /// have a null operand.
-    NotUniquedBit    = 1 << 1,
-
-    /// DestroyFlag - This bit is set by destroy() so the destructor can assert
-    /// that the node isn't being destroyed with a plain 'delete'.
-    DestroyFlag      = 1 << 2
+    NotUniquedBit    = 1 << 1
   };
 
-  // FunctionLocal enums.
+  /// \brief FunctionLocal enums.
   enum FunctionLocalness {
     FL_Unknown = -1,
     FL_No = 0,
     FL_Yes = 1
   };
 
-  /// replaceOperand - Replace each instance of F from the operand list of this
-  /// node with T.
+  /// \brief Replace each instance of the given operand with a new value.
   void replaceOperand(MDNodeOperand *Op, Value *NewVal);
-  ~MDNode();
 
-  MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal);
+  MDNode(LLVMContext &C, unsigned ID, ArrayRef<Value *> Vals,
+         bool isFunctionLocal);
+  ~MDNode() {}
 
   static MDNode *getMDNode(LLVMContext &C, ArrayRef<Value*> Vals,
                            FunctionLocalness FL, bool Insert = true);
 public:
-  // Constructors and destructors.
   static MDNode *get(LLVMContext &Context, ArrayRef<Value*> Vals);
-  // getWhenValsUnresolved - Construct MDNode determining function-localness
-  // from isFunctionLocal argument, not by analyzing Vals.
+  /// \brief Construct MDNode with an explicit function-localness.
+  ///
+  /// Don't analyze Vals; trust isFunctionLocal.
   static MDNode *getWhenValsUnresolved(LLVMContext &Context,
                                        ArrayRef<Value*> Vals,
                                        bool isFunctionLocal);
 
   static MDNode *getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals);
 
-  /// getTemporary - Return a temporary MDNode, for use in constructing
-  /// cyclic MDNode structures. A temporary MDNode is not uniqued,
-  /// may be RAUW'd, and must be manually deleted with deleteTemporary.
+  /// \brief Return a temporary MDNode.
+  ///
+  /// For use in constructing cyclic MDNode structures. A temporary MDNode is
+  /// not uniqued, may be RAUW'd, and must be manually deleted with
+  /// deleteTemporary.
   static MDNode *getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals);
 
-  /// deleteTemporary - Deallocate a node created by getTemporary. The
-  /// node must not have any users.
+  /// \brief Deallocate a node created by getTemporary.
+  ///
+  /// The node must not have any users.
   static void deleteTemporary(MDNode *N);
 
-  /// replaceOperandWith - Replace a specific operand.
+  /// \brief Replace a specific operand.
   void replaceOperandWith(unsigned i, Value *NewVal);
 
-  /// getOperand - Return specified operand.
+  /// \brief Return specified operand.
   Value *getOperand(unsigned i) const LLVM_READONLY;
 
-  /// getNumOperands - Return number of MDNode operands.
+  /// \brief Return number of MDNode operands.
   unsigned getNumOperands() const { return NumOperands; }
 
-  /// isFunctionLocal - Return whether MDNode is local to a function.
+  /// \brief Return whether MDNode is local to a function.
   bool isFunctionLocal() const {
     return (getSubclassDataFromValue() & FunctionLocalBit) != 0;
   }
 
-  // getFunction - If this metadata is function-local and recursively has a
-  // function-local operand, return the first such operand's parent function.
-  // Otherwise, return null. getFunction() should not be used for performance-
-  // critical code because it recursively visits all the MDNode's operands.
+  /// \brief Return the first function-local operand's function.
+  ///
+  /// If this metadata is function-local and recursively has a function-local
+  /// operand, return the first such operand's parent function.  Otherwise,
+  /// return null. getFunction() should not be used for performance-critical
+  /// code because it recursively visits all the MDNode's operands.
   const Function *getFunction() const;
 
-  /// Profile - calculate a unique identifier for this MDNode to collapse
-  /// duplicates
-  void Profile(FoldingSetNodeID &ID) const;
-
-  /// Methods for support type inquiry through isa, cast, and dyn_cast:
+  /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
   static bool classof(const Value *V) {
-    return V->getValueID() == MDNodeVal;
+    return V->getValueID() == GenericMDNodeVal ||
+           V->getValueID() == MDNodeFwdDeclVal;
   }
 
-  /// Check whether MDNode is a vtable access.
+  /// \brief Check whether MDNode is a vtable access.
   bool isTBAAVtableAccess() const;
 
-  /// Methods for metadata merging.
+  /// \brief Methods for metadata merging.
+  static MDNode *concatenate(MDNode *A, MDNode *B);
+  static MDNode *intersect(MDNode *A, MDNode *B);
   static MDNode *getMostGenericTBAA(MDNode *A, MDNode *B);
+  static AAMDNodes getMostGenericAA(const AAMDNodes &A, const AAMDNodes &B);
   static MDNode *getMostGenericFPMath(MDNode *A, MDNode *B);
   static MDNode *getMostGenericRange(MDNode *A, MDNode *B);
-private:
-  // destroy - Delete this node.  Only when there are no uses.
-  void destroy();
 
+protected:
   bool isNotUniqued() const {
     return (getSubclassDataFromValue() & NotUniquedBit) != 0;
   }
@@ -189,10 +267,62 @@
   }
 };
 
+/// \brief Generic metadata node.
+///
+/// Generic metadata nodes, with opt-out support for uniquing.
+///
+/// Although nodes are uniqued by default, \a GenericMDNode has no support for
+/// RAUW.  If an operand change (due to RAUW or otherwise) causes a uniquing
+/// collision, the uniquing bit is dropped.
+///
+/// TODO: Make uniquing opt-out (status: mandatory, sometimes dropped).
+/// TODO: Drop support for RAUW.
+class GenericMDNode : public MDNode {
+  friend class MDNode;
+  friend class LLVMContextImpl;
+
+  unsigned Hash;
+
+  GenericMDNode(LLVMContext &C, ArrayRef<Value *> Vals, bool isFunctionLocal)
+      : MDNode(C, GenericMDNodeVal, Vals, isFunctionLocal), Hash(0) {}
+  ~GenericMDNode();
+
+  void dropAllReferences();
+
+public:
+  /// \brief Get the hash; the constructor initializes it to 0.
+  unsigned getHash() const { return Hash; }
+
+  static bool classof(const Value *V) {
+    return V->getValueID() == GenericMDNodeVal;
+  }
+};
+
+/// \brief Forward declaration of metadata.
+///
+/// Forward declaration of metadata, in the form of a metadata node.  Unlike
+/// \a GenericMDNode, this class has support for RAUW and is suitable for
+/// forward references.
+class MDNodeFwdDecl : public MDNode {
+  friend class MDNode;
+
+  MDNodeFwdDecl(LLVMContext &C, ArrayRef<Value *> Vals, bool isFunctionLocal)
+      : MDNode(C, MDNodeFwdDeclVal, Vals, isFunctionLocal) {}
+  ~MDNodeFwdDecl() {}
+
+public:
+  static bool classof(const Value *V) {
+    return V->getValueID() == MDNodeFwdDeclVal;
+  }
+};
+
 //===----------------------------------------------------------------------===//
-/// NamedMDNode - a tuple of MDNodes. Despite its name, a NamedMDNode isn't
-/// itself an MDNode. NamedMDNodes belong to modules, have names, and contain
-/// lists of MDNodes.
+/// \brief A tuple of MDNodes.
+///
+/// Despite its name, a NamedMDNode isn't itself an MDNode. NamedMDNodes belong
+/// to modules, have names, and contain lists of MDNodes.
+///
+/// TODO: Inherit from Metadata.
 class NamedMDNode : public ilist_node<NamedMDNode> {
   friend class SymbolTableListTraits<NamedMDNode, Module>;
   friend struct ilist_traits<NamedMDNode>;
@@ -245,46 +375,33 @@
   };
 
 public:
-  /// eraseFromParent - Drop all references and remove the node from parent
-  /// module.
+  /// \brief Drop all references and remove the node from parent module.
   void eraseFromParent();
 
-  /// dropAllReferences - Remove all uses and clear node vector.
+  /// \brief Remove all uses and clear node vector.
   void dropAllReferences();
 
-  /// ~NamedMDNode - Destroy NamedMDNode.
   ~NamedMDNode();
 
-  /// getParent - Get the module that holds this named metadata collection.
+  /// \brief Get the module that holds this named metadata collection.
   inline Module *getParent() { return Parent; }
   inline const Module *getParent() const { return Parent; }
 
-  /// getOperand - Return specified operand.
   MDNode *getOperand(unsigned i) const;
-
-  /// getNumOperands - Return the number of NamedMDNode operands.
   unsigned getNumOperands() const;
-
-  /// addOperand - Add metadata operand.
   void addOperand(MDNode *M);
-
-  /// getName - Return a constant reference to this named metadata's name.
   StringRef getName() const;
-
-  /// print - Implement operator<< on NamedMDNode.
   void print(raw_ostream &ROS) const;
-
-  /// dump() - Allow printing of NamedMDNodes from the debugger.
   void dump() const;
 
   // ---------------------------------------------------------------------------
   // Operand Iterator interface...
   //
-  typedef op_iterator_impl<MDNode*, MDNode> op_iterator;
+  typedef op_iterator_impl<MDNode *, MDNode> op_iterator;
   op_iterator op_begin() { return op_iterator(this, 0); }
   op_iterator op_end()   { return op_iterator(this, getNumOperands()); }
 
-  typedef op_iterator_impl<const MDNode*, MDNode> const_op_iterator;
+  typedef op_iterator_impl<const MDNode *, MDNode> const_op_iterator;
   const_op_iterator op_begin() const { return const_op_iterator(this, 0); }
   const_op_iterator op_end()   const { return const_op_iterator(this, getNumOperands()); }
 

diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h
index 26f62db..7fff80a 100644
--- a/include/llvm/IR/Module.h
+++ b/include/llvm/IR/Module.h

@@ -23,6 +23,7 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/CBindingWrapping.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/DataTypes.h"
 #include <system_error>
 
@@ -137,6 +138,11 @@
   /// The Function constant iterator
   typedef FunctionListType::const_iterator               const_iterator;
 
+  /// The Function reverse iterator.
+  typedef FunctionListType::reverse_iterator             reverse_iterator;
+  /// The Function constant reverse iterator.
+  typedef FunctionListType::const_reverse_iterator const_reverse_iterator;
+
   /// The Global Alias iterators.
   typedef AliasListType::iterator                        alias_iterator;
   /// The Global Alias constant iterator
@@ -177,9 +183,17 @@
     /// Appends the two values, which are required to be metadata
     /// nodes. However, duplicate entries in the second list are dropped
     /// during the append operation.
-    AppendUnique = 6
+    AppendUnique = 6,
+
+    // Markers:
+    ModFlagBehaviorFirstVal = Error,
+    ModFlagBehaviorLastVal = AppendUnique
   };
 
+  /// Checks if Value represents a valid ModFlagBehavior, and stores the
+  /// converted result in MFB.
+  static bool isValidModFlagBehavior(Value *V, ModFlagBehavior &MFB);
+
   struct ModuleFlagEntry {
     ModFlagBehavior Behavior;
     MDString *Key;
@@ -339,11 +353,11 @@
   /// function arguments, which makes it easier for clients to use.
   Constant *getOrInsertFunction(StringRef Name,
                                 AttributeSet AttributeList,
-                                Type *RetTy, ...)  END_WITH_NULL;
+                                Type *RetTy, ...) LLVM_END_WITH_NULL;
 
   /// Same as above, but without the attributes.
   Constant *getOrInsertFunction(StringRef Name, Type *RetTy, ...)
-    END_WITH_NULL;
+    LLVM_END_WITH_NULL;
 
   /// Look up the specified function in the module symbol table. If it does not
   /// exist, return null.
@@ -357,8 +371,11 @@
   /// does not exist, return null. If AllowInternal is set to true, this
   /// function will return types that have InternalLinkage. By default, these
   /// types are not returned.
-  const GlobalVariable *getGlobalVariable(StringRef Name,
-                                          bool AllowInternal = false) const {
+  GlobalVariable *getGlobalVariable(StringRef Name) const {
+    return getGlobalVariable(Name, false);
+  }
+
+  GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal) const {
     return const_cast<Module *>(this)->getGlobalVariable(Name, AllowInternal);
   }
 
@@ -456,9 +473,6 @@
   /// Retrieves the GVMaterializer, if any, for this Module.
   GVMaterializer *getMaterializer() const { return Materializer.get(); }
 
-  /// True if the definition of GV has yet to be materializedfrom the
-  /// GVMaterializer.
-  bool isMaterializable(const GlobalValue *GV) const;
   /// Returns true if this GV was loaded from this Module's GVMaterializer and
   /// the GVMaterializer knows how to dematerialize the GV.
   bool isDematerializable(const GlobalValue *GV) const;
@@ -466,7 +480,7 @@
   /// Make sure the GlobalValue is fully read. If the module is corrupt, this
   /// returns true and fills in the optional string with information about the
   /// problem. If successful, this returns false.
-  bool Materialize(GlobalValue *GV, std::string *ErrInfo = nullptr);
+  std::error_code materialize(GlobalValue *GV);
   /// If the GlobalValue is read in, and if the GVMaterializer supports it,
   /// release the memory for the function, and set it up to be materialized
   /// lazily. If !isDematerializable(), this method is a noop.
@@ -478,7 +492,7 @@
   /// Make sure all GlobalValues in this Module are fully read and clear the
   /// Materializer. If the module is corrupt, this DOES NOT clear the old
   /// Materializer.
-  std::error_code materializeAllPermanently(bool ReleaseBuffer = false);
+  std::error_code materializeAllPermanently();
 
 /// @}
 /// @name Direct access to the globals list, functions list, and symbol table
@@ -546,9 +560,20 @@
   const_iterator          begin() const { return FunctionList.begin(); }
   iterator                end  ()       { return FunctionList.end();   }
   const_iterator          end  () const { return FunctionList.end();   }
+  reverse_iterator        rbegin()      { return FunctionList.rbegin(); }
+  const_reverse_iterator  rbegin() const{ return FunctionList.rbegin(); }
+  reverse_iterator        rend()        { return FunctionList.rend(); }
+  const_reverse_iterator  rend() const  { return FunctionList.rend(); }
   size_t                  size() const  { return FunctionList.size(); }
   bool                    empty() const { return FunctionList.empty(); }
 
+  iterator_range<iterator> functions() {
+    return iterator_range<iterator>(begin(), end());
+  }
+  iterator_range<const_iterator> functions() const {
+    return iterator_range<const_iterator>(begin(), end());
+  }
+
 /// @}
 /// @name Alias Iteration
 /// @{
@@ -620,6 +645,15 @@
   unsigned getDwarfVersion() const;
 
 /// @}
+/// @name Utility functions for querying and setting PIC level
+/// @{
+
+  /// \brief Returns the PIC level (small or large model)
+  PICLevel::Level getPICLevel() const;
+
+  /// \brief Set the PIC level (small or large model)
+  void setPICLevel(PICLevel::Level PL);
+/// @}
 };
 
 /// An raw_ostream inserter for modules.

diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h
index 888cabf..0933f21 100644
--- a/include/llvm/IR/Operator.h
+++ b/include/llvm/IR/Operator.h

@@ -28,9 +28,8 @@
 class BinaryOperator;
 class ConstantExpr;
 
-/// Operator - This is a utility class that provides an abstraction for the
-/// common functionality between Instructions and ConstantExprs.
-///
+/// This is a utility class that provides an abstraction for the common
+/// functionality between Instructions and ConstantExprs.
 class Operator : public User {
 private:
   // The Operator class is intended to be used as a utility, and is never itself
@@ -46,17 +45,15 @@
   ~Operator();
 
 public:
-  /// getOpcode - Return the opcode for this Instruction or ConstantExpr.
-  ///
+  /// Return the opcode for this Instruction or ConstantExpr.
   unsigned getOpcode() const {
     if (const Instruction *I = dyn_cast<Instruction>(this))
       return I->getOpcode();
     return cast<ConstantExpr>(this)->getOpcode();
   }
 
-  /// getOpcode - If V is an Instruction or ConstantExpr, return its
-  /// opcode. Otherwise return UserOp1.
-  ///
+  /// If V is an Instruction or ConstantExpr, return its opcode.
+  /// Otherwise return UserOp1.
   static unsigned getOpcode(const Value *V) {
     if (const Instruction *I = dyn_cast<Instruction>(V))
       return I->getOpcode();
@@ -72,10 +69,9 @@
   }
 };
 
-/// OverflowingBinaryOperator - Utility class for integer arithmetic operators
-/// which may exhibit overflow - Add, Sub, and Mul. It does not include SDiv,
-/// despite that operator having the potential for overflow.
-///
+/// Utility class for integer arithmetic operators which may exhibit overflow -
+/// Add, Sub, and Mul. It does not include SDiv, despite that operator having
+/// the potential for overflow.
 class OverflowingBinaryOperator : public Operator {
 public:
   enum {
@@ -96,13 +92,13 @@
   }
 
 public:
-  /// hasNoUnsignedWrap - Test whether this operation is known to never
+  /// Test whether this operation is known to never
   /// undergo unsigned overflow, aka the nuw property.
   bool hasNoUnsignedWrap() const {
     return SubclassOptionalData & NoUnsignedWrap;
   }
 
-  /// hasNoSignedWrap - Test whether this operation is known to never
+  /// Test whether this operation is known to never
   /// undergo signed overflow, aka the nsw property.
   bool hasNoSignedWrap() const {
     return (SubclassOptionalData & NoSignedWrap) != 0;
@@ -126,8 +122,8 @@
   }
 };
 
-/// PossiblyExactOperator - A udiv or sdiv instruction, which can be marked as
-/// "exact", indicating that no bits are destroyed.
+/// A udiv or sdiv instruction, which can be marked as "exact",
+/// indicating that no bits are destroyed.
 class PossiblyExactOperator : public Operator {
 public:
   enum {
@@ -142,8 +138,7 @@
   }
 
 public:
-  /// isExact - Test whether this division is known to be exact, with
-  /// zero remainder.
+  /// Test whether this division is known to be exact, with zero remainder.
   bool isExact() const {
     return SubclassOptionalData & IsExact;
   }
@@ -217,7 +212,7 @@
 };
 
 
-/// FPMathOperator - Utility class for floating point operations which can have
+/// Utility class for floating point operations which can have
 /// information about relaxed accuracy requirements attached to them.
 class FPMathOperator : public Operator {
 private:
@@ -257,11 +252,18 @@
       (B * FastMathFlags::AllowReciprocal);
   }
 
-  /// Convenience function for setting all the fast-math flags
+  /// Convenience function for setting multiple fast-math flags.
+  /// FMF is a mask of the bits to set.
   void setFastMathFlags(FastMathFlags FMF) {
     SubclassOptionalData |= FMF.Flags;
   }
 
+  /// Convenience function for copying all fast-math flags.
+  /// Overwrites SubclassOptionalData with FMF.Flags instead of OR-ing them in.
+  void copyFastMathFlags(FastMathFlags FMF) {
+    SubclassOptionalData = FMF.Flags;
+  }
+
 public:
   /// Test whether this operation is permitted to be
   /// algebraically transformed, aka the 'A' fast-math property.
@@ -312,8 +314,7 @@
 };
 
 
-/// ConcreteOperator - A helper template for defining operators for individual
-/// opcodes.
+/// A helper template for defining operators for individual opcodes.
 template<typename SuperClass, unsigned Opc>
 class ConcreteOperator : public SuperClass {
 public:
@@ -357,6 +358,8 @@
 };
 
 
+class ZExtOperator : public ConcreteOperator<Operator, Instruction::ZExt> {};
+
 
 class GEPOperator
   : public ConcreteOperator<Operator, Instruction::GetElementPtr> {
@@ -372,8 +375,7 @@
   }
 
 public:
-  /// isInBounds - Test whether this is an inbounds GEP, as defined
-  /// by LangRef.html.
+  /// Test whether this is an inbounds GEP, as defined by LangRef.html.
   bool isInBounds() const {
     return SubclassOptionalData & IsInBounds;
   }
@@ -393,16 +395,14 @@
     return 0U;                      // get index for modifying correct operand
   }
 
-  /// getPointerOperandType - Method to return the pointer operand as a
-  /// PointerType.
+  /// Method to return the pointer operand as a PointerType.
   Type *getPointerOperandType() const {
     return getPointerOperand()->getType();
   }
 
-  /// getPointerAddressSpace - Method to return the address space of the
-  /// pointer operand.
+  /// Method to return the address space of the pointer operand.
   unsigned getPointerAddressSpace() const {
-    return cast<PointerType>(getPointerOperandType())->getAddressSpace();
+    return getPointerOperandType()->getPointerAddressSpace();
   }
 
   unsigned getNumIndices() const {  // Note: always non-negative
@@ -413,8 +413,8 @@
     return getNumOperands() > 1;
   }
 
-  /// hasAllZeroIndices - Return true if all of the indices of this GEP are
-  /// zeros.  If so, the result pointer and the first operand have the same
+  /// Return true if all of the indices of this GEP are zeros.
+  /// If so, the result pointer and the first operand have the same
   /// value, just potentially different types.
   bool hasAllZeroIndices() const {
     for (const_op_iterator I = idx_begin(), E = idx_end(); I != E; ++I) {
@@ -426,8 +426,8 @@
     return true;
   }
 
-  /// hasAllConstantIndices - Return true if all of the indices of this GEP are
-  /// constant integers.  If so, the result pointer and the first operand have
+  /// Return true if all of the indices of this GEP are constant integers.
+  /// If so, the result pointer and the first operand have
   /// a constant offset between them.
   bool hasAllConstantIndices() const {
     for (const_op_iterator I = idx_begin(), E = idx_end(); I != E; ++I) {
@@ -493,14 +493,12 @@
     return 0U;                      // get index for modifying correct operand
   }
 
-  /// getPointerOperandType - Method to return the pointer operand as a
-  /// PointerType.
+  /// Method to return the pointer operand as a PointerType.
   Type *getPointerOperandType() const {
     return getPointerOperand()->getType();
   }
 
-  /// getPointerAddressSpace - Method to return the address space of the
-  /// pointer operand.
+  /// Method to return the address space of the pointer operand.
   unsigned getPointerAddressSpace() const {
     return cast<PointerType>(getPointerOperandType())->getAddressSpace();
   }

diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h
index cc2a80b..45985e1 100644
--- a/include/llvm/IR/PassManager.h
+++ b/include/llvm/IR/PassManager.h

@@ -35,8 +35,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_IR_PASS_MANAGER_H
-#define LLVM_IR_PASS_MANAGER_H
+#ifndef LLVM_IR_PASSMANAGER_H
+#define LLVM_IR_PASSMANAGER_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
@@ -107,11 +107,9 @@
       PreservedPassIDs = Arg.PreservedPassIDs;
       return;
     }
-    for (SmallPtrSet<void *, 2>::const_iterator I = PreservedPassIDs.begin(),
-                                                E = PreservedPassIDs.end();
-         I != E; ++I)
-      if (!Arg.PreservedPassIDs.count(*I))
-        PreservedPassIDs.erase(*I);
+    for (void *P : PreservedPassIDs)
+      if (!Arg.PreservedPassIDs.count(P))
+        PreservedPassIDs.erase(P);
   }
 
   /// \brief Intersect this set with a temporary other set in place.
@@ -125,11 +123,9 @@
       PreservedPassIDs = std::move(Arg.PreservedPassIDs);
       return;
     }
-    for (SmallPtrSet<void *, 2>::const_iterator I = PreservedPassIDs.begin(),
-                                                E = PreservedPassIDs.end();
-         I != E; ++I)
-      if (!Arg.PreservedPassIDs.count(*I))
-        PreservedPassIDs.erase(*I);
+    for (void *P : PreservedPassIDs)
+      if (!Arg.PreservedPassIDs.count(P))
+        PreservedPassIDs.erase(P);
   }
 
   /// \brief Query whether a pass is marked as preserved by this set.

diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h
index 2efb294..4783062 100644
--- a/include/llvm/IR/PatternMatch.h
+++ b/include/llvm/IR/PatternMatch.h

@@ -32,7 +32,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Operator.h"
 
 namespace llvm {
@@ -362,6 +362,29 @@
   }
 };
 
+/// Match a specified integer value or vector of all elements of that value.
+struct specific_intval {
+  uint64_t Val;
+  specific_intval(uint64_t V) : Val(V) {}
+
+  template<typename ITy>
+  bool match(ITy *V) {
+    const ConstantInt *CI = dyn_cast<ConstantInt>(V); // const: ITy may be const
+    if (!CI && V->getType()->isVectorTy())
+      if (const auto *C = dyn_cast<Constant>(V))
+        CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue());
+
+    if (CI && CI->getBitWidth() <= 64) // wider constants never match
+      return CI->getZExtValue() == Val;
+
+    return false;
+  }
+};
+
+/// Match a specific integer value or vector with all elements equal to the
+/// value.
+inline specific_intval m_SpecificInt(uint64_t V) { return specific_intval(V); }
+
 /// m_ConstantInt - Match a ConstantInt and bind to its value.  This does not
 /// match ConstantInts wider than 64-bits.
 inline bind_const_intval_ty m_ConstantInt(uint64_t &V) { return V; }
@@ -1135,8 +1158,10 @@
 
   template<typename OpTy>
   bool match(OpTy *V) {
-    IntrinsicInst *II = dyn_cast<IntrinsicInst>(V);
-    return II && II->getIntrinsicID() == ID;
+    if (const CallInst *CI = dyn_cast<CallInst>(V))
+      if (const Function *F = CI->getCalledFunction())
+        return F->getIntrinsicID() == ID;
+    return false;
   }
 };
 
@@ -1205,6 +1230,18 @@
   return m_Intrinsic<Intrinsic::bswap>(Op0);
 }
 
+template<typename Opnd0, typename Opnd1>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
+m_FMin(const Opnd0 &Op0, const Opnd1 &Op1) {
+  return m_Intrinsic<Intrinsic::minnum>(Op0, Op1);
+}
+
+template<typename Opnd0, typename Opnd1>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
+m_FMax(const Opnd0 &Op0, const Opnd1 &Op1) {
+  return m_Intrinsic<Intrinsic::maxnum>(Op0, Op1);
+}
+
 } // end namespace PatternMatch
 } // end namespace llvm
 

diff --git a/include/llvm/IR/PredIteratorCache.h b/include/llvm/IR/PredIteratorCache.h
index 02bc583..5e1be37 100644
--- a/include/llvm/IR/PredIteratorCache.h
+++ b/include/llvm/IR/PredIteratorCache.h

@@ -11,14 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_IR_PREDITERATORCACHE_H
+#define LLVM_IR_PREDITERATORCACHE_H
+
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/Support/Allocator.h"
 
-#ifndef LLVM_IR_PREDITERATORCACHE_H
-#define LLVM_IR_PREDITERATORCACHE_H
-
 namespace llvm {
 
   /// PredIteratorCache - This class is an extremely trivial cache for

diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h
index 7955587..a36fb0f 100644
--- a/include/llvm/IR/Type.h
+++ b/include/llvm/IR/Type.h

@@ -265,7 +265,7 @@
   /// get the actual size for a particular target, it is reasonable to use the
   /// DataLayout subsystem to do this.
   ///
-  bool isSized(SmallPtrSet<const Type*, 4> *Visited = nullptr) const {
+  bool isSized(SmallPtrSetImpl<const Type*> *Visited = nullptr) const {
     // If it's a primitive, it is always sized.
     if (getTypeID() == IntegerTyID || isFloatingPointTy() ||
         getTypeID() == PointerTyID ||
@@ -323,7 +323,7 @@
   }
 
   /// getContainedType - This method is used to implement the type iterator
-  /// (defined a the end of the file).  For derived types, this returns the
+  /// (defined at the end of the file).  For derived types, this returns the
   /// types 'contained' in the derived type.
   ///
   Type *getContainedType(unsigned i) const {
@@ -419,7 +419,7 @@
   /// isSizedDerivedType - Derived types like structures and arrays are sized
   /// iff all of the members of the type are sized as well.  Since asking for
   /// their size is relatively uncommon, move this operation out of line.
-  bool isSizedDerivedType(SmallPtrSet<const Type*, 4> *Visited = nullptr) const;
+  bool isSizedDerivedType(SmallPtrSetImpl<const Type*> *Visited = nullptr) const;
 };
 
 // Printing of types.

diff --git a/include/llvm/IR/UseListOrder.h b/include/llvm/IR/UseListOrder.h
new file mode 100644
index 0000000..5df459b
--- /dev/null
+++ b/include/llvm/IR/UseListOrder.h

@@ -0,0 +1,62 @@
+//===- llvm/IR/UseListOrder.h - LLVM Use List Order -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file has structures and command-line options for preserving use-list
+// order.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_USELISTORDER_H
+#define LLVM_IR_USELISTORDER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include <vector>
+
+namespace llvm {
+
+class Module;
+class Function;
+class Value;
+
+/// \brief Structure to hold a use-list order.
+struct UseListOrder {
+  const Value *V;
+  const Function *F;
+  std::vector<unsigned> Shuffle;
+
+  UseListOrder(const Value *V, const Function *F, size_t ShuffleSize)
+      : V(V), F(F), Shuffle(ShuffleSize) {}
+
+  UseListOrder() : V(0), F(0) {}
+  UseListOrder(UseListOrder &&X)
+      : V(X.V), F(X.F), Shuffle(std::move(X.Shuffle)) {}
+  UseListOrder &operator=(UseListOrder &&X) {
+    V = X.V;
+    F = X.F;
+    Shuffle = std::move(X.Shuffle);
+    return *this;
+  }
+
+private:
+  UseListOrder(const UseListOrder &X) LLVM_DELETED_FUNCTION;
+  UseListOrder &operator=(const UseListOrder &X) LLVM_DELETED_FUNCTION;
+};
+
+typedef std::vector<UseListOrder> UseListOrderStack;
+
+/// \brief Whether to preserve use-list ordering.
+bool shouldPreserveBitcodeUseListOrder();
+bool shouldPreserveAssemblyUseListOrder();
+void setPreserveBitcodeUseListOrder(bool ShouldPreserve);
+void setPreserveAssemblyUseListOrder(bool ShouldPreserve);
+
+} // end namespace llvm
+
+#endif

diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h
index 848adae..f578227 100644
--- a/include/llvm/IR/User.h
+++ b/include/llvm/IR/User.h

@@ -26,9 +26,9 @@
 
 namespace llvm {
 
-/// OperandTraits - Compile-time customization of
-/// operand-related allocators and accessors
-/// for use of the User class
+/// \brief Compile-time customization of User operands.
+///
+/// Customizes operand-related allocators and accessors.
 template <class>
 struct OperandTraits;
 
@@ -39,11 +39,8 @@
   friend struct HungoffOperandTraits;
   virtual void anchor();
 protected:
-  /// NumOperands - The number of values used by this User.
+  /// \brief This is a pointer to the array of Uses for this User.
   ///
-  unsigned NumOperands;
-
-  /// OperandList - This is a pointer to the array of Uses for this User.
   /// For nodes of fixed arity (e.g. a binary operator) this array will live
   /// prefixed to some derived class instance.  For nodes of resizable variable
   /// arity (e.g. PHINodes, SwitchInst etc.), this memory will be dynamically
@@ -52,7 +49,9 @@
 
   void *operator new(size_t s, unsigned Us);
   User(Type *ty, unsigned vty, Use *OpList, unsigned NumOps)
-    : Value(ty, vty), NumOperands(NumOps), OperandList(OpList) {}
+      : Value(ty, vty), OperandList(OpList) {
+    NumOperands = NumOps;
+  }
   Use *allocHungoffUses(unsigned) const;
   void dropHungoffUses() {
     Use::zap(OperandList, OperandList + NumOperands, true);
@@ -64,13 +63,13 @@
   ~User() {
     Use::zap(OperandList, OperandList + NumOperands);
   }
-  /// operator delete - free memory allocated for User and Use objects
+  /// \brief Free memory allocated for User and Use objects.
   void operator delete(void *Usr);
-  /// placement delete - required by std, but never called.
+  /// \brief Placement delete - required by std, but never called.
   void operator delete(void*, unsigned) {
     llvm_unreachable("Constructor throws?");
   }
-  /// placement delete - required by std, but never called.
+  /// \brief Placement delete - required by std, but never called.
   void operator delete(void*, unsigned, bool) {
     llvm_unreachable("Constructor throws?");
   }
@@ -128,8 +127,7 @@
     return const_op_range(op_begin(), op_end());
   }
 
-  /// Convenience iterator for directly iterating over the Values in the
-  /// OperandList
+  /// \brief Iterator for directly iterating over the operand Values.
   struct value_op_iterator
       : iterator_adaptor_base<value_op_iterator, op_iterator,
                               std::random_access_iterator_tag, Value *,
@@ -150,22 +148,23 @@
     return iterator_range<value_op_iterator>(value_op_begin(), value_op_end());
   }
 
-  // dropAllReferences() - This function is in charge of "letting go" of all
-  // objects that this User refers to.  This allows one to
-  // 'delete' a whole class at a time, even though there may be circular
-  // references...  First all references are dropped, and all use counts go to
-  // zero.  Then everything is deleted for real.  Note that no operations are
-  // valid on an object that has "dropped all references", except operator
-  // delete.
-  //
+  /// \brief Drop all references to operands.
+  ///
+  /// This function is in charge of "letting go" of all objects that this User
+  /// refers to.  This allows one to 'delete' a whole class at a time, even
+  /// though there may be circular references...  First all references are
+  /// dropped, and all use counts go to zero.  Then everything is deleted for
+  /// real.  Note that no operations are valid on an object that has "dropped
+  /// all references", except operator delete.
   void dropAllReferences() {
     for (Use &U : operands())
       U.set(nullptr);
   }
 
-  /// replaceUsesOfWith - Replaces all references to the "From" definition with
-  /// references to the "To" definition.
+  /// \brief Replace uses of one Value with another.
   ///
+  /// Replaces all references to the "From" definition with references to the
+  /// "To" definition.
   void replaceUsesOfWith(Value *From, Value *To);
 
   // Methods for support type inquiry through isa, cast, and dyn_cast:

diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index b5bbc96..67665be 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h

@@ -53,6 +53,8 @@
 //                                 Value Class
 //===----------------------------------------------------------------------===//
 
+/// \brief LLVM Value Representation
+///
 /// This is a very important LLVM class. It is the base class of all values
 /// computed by a program that may be used as operands to other values. Value is
 /// the super class of other important classes such as Instruction and Function.
@@ -64,8 +66,6 @@
 /// using this Value.  A Value can also have an arbitrary number of ValueHandle
 /// objects that watch it and listen to RAUW and Destroy events.  See
 /// llvm/IR/ValueHandle.h for details.
-///
-/// @brief LLVM Value Representation
 class Value {
   Type *VTy;
   Use *UseList;
@@ -77,18 +77,34 @@
   const unsigned char SubclassID;   // Subclass identifier (for isa/dyn_cast)
   unsigned char HasValueHandle : 1; // Has a ValueHandle pointing to this?
 protected:
-  /// SubclassOptionalData - This member is similar to SubclassData, however it
-  /// is for holding information which may be used to aid optimization, but
-  /// which may be cleared to zero without affecting conservative
-  /// interpretation.
+  /// \brief Hold subclass data that can be dropped.
+  ///
+  /// This member is similar to SubclassData, however it is for holding
+  /// information which may be used to aid optimization, but which may be
+  /// cleared to zero without affecting conservative interpretation.
   unsigned char SubclassOptionalData : 7;
 
 private:
-  /// SubclassData - This member is defined by this class, but is not used for
-  /// anything.  Subclasses can use it to hold whatever state they find useful.
-  /// This field is initialized to zero by the ctor.
+  /// \brief Hold arbitrary subclass data.
+  ///
+  /// This member is defined by this class, but is not used for anything.
+  /// Subclasses can use it to hold whatever state they find useful.  This
+  /// field is initialized to zero by the ctor.
   unsigned short SubclassData;
 
+protected:
+  /// \brief The number of operands in the subclass.
+  ///
+  /// This member is defined by this class, but not used for anything.
+  /// Subclasses can use it to store their number of operands, if they have
+  /// any.
+  ///
+  /// This is stored here to save space in User on 64-bit hosts.  Since most
+  /// instances of Value have operands, 32-bit hosts aren't significantly
+  /// affected.
+  unsigned NumOperands;
+
+private:
   template <typename UseT> // UseT == 'Use' or 'const Use'
   class use_iterator_impl
       : public std::iterator<std::forward_iterator_tag, UseT *, ptrdiff_t> {
@@ -175,6 +191,7 @@
     Use &getUse() const { return *UI; }
 
     /// \brief Return the operand # of this use in its User.
+    ///
     /// FIXME: Replace all callers with a direct call to Use::getOperandNo.
     unsigned getOperandNo() const { return UI->getOperandNo(); }
   };
@@ -187,15 +204,14 @@
 public:
   virtual ~Value();
 
-  /// dump - Support for debugging, callable in GDB: V->dump()
-  //
+  /// \brief Support for debugging, callable in GDB: V->dump()
   void dump() const;
 
-  /// print - Implement operator<< on Value.
-  ///
+  /// \brief Implement operator<< on Value.
   void print(raw_ostream &O) const;
 
   /// \brief Print the name of this Value out to the specified raw_ostream.
+  ///
   /// This is useful when you just want to print 'int %reg126', not the
   /// instruction that generated it. If you specify a Module for context, then
   /// even constanst get pretty-printed; for example, the type of a null
@@ -203,38 +219,43 @@
   void printAsOperand(raw_ostream &O, bool PrintType = true,
                       const Module *M = nullptr) const;
 
-  /// All values are typed, get the type of this value.
-  ///
+  /// \brief All values are typed, get the type of this value.
   Type *getType() const { return VTy; }
 
-  /// All values hold a context through their type.
+  /// \brief All values hold a context through their type.
   LLVMContext &getContext() const;
 
-  // All values can potentially be named.
-  bool hasName() const { return Name != nullptr && SubclassID != MDStringVal; }
+  /// \brief All values can potentially be named.
+  bool hasName() const { return Name != nullptr; }
   ValueName *getValueName() const { return Name; }
   void setValueName(ValueName *VN) { Name = VN; }
 
-  /// getName() - Return a constant reference to the value's name. This is cheap
-  /// and guaranteed to return the same reference as long as the value is not
-  /// modified.
+  /// \brief Return a constant reference to the value's name.
+  ///
+  /// This is cheap and guaranteed to return the same reference as long as the
+  /// value is not modified.
   StringRef getName() const;
 
-  /// setName() - Change the name of the value, choosing a new unique name if
-  /// the provided name is taken.
+  /// \brief Change the name of the value.
+  ///
+  /// Choose a new unique name if the provided name is taken.
   ///
   /// \param Name The new name; or "" if the value's name should be removed.
   void setName(const Twine &Name);
 
 
-  /// takeName - transfer the name from V to this value, setting V's name to
-  /// empty.  It is an error to call V->takeName(V).
+  /// \brief Transfer the name from V to this value.
+  ///
+  /// After taking V's name, sets V's name to empty.
+  ///
+  /// \note It is an error to call V->takeName(V).
   void takeName(Value *V);
 
-  /// replaceAllUsesWith - Go through the uses list for this definition and make
-  /// each use point to "V" instead of "this".  After this completes, 'this's
-  /// use list is guaranteed to be empty.
+  /// \brief Change all uses of this to point to a new Value.
   ///
+  /// Go through the uses list for this definition and make each use point to
+  /// "V" instead of "this".  After this completes, 'this's use list is
+  /// guaranteed to be empty.
   void replaceAllUsesWith(Value *V);
 
   //----------------------------------------------------------------------
@@ -270,36 +291,38 @@
     return iterator_range<const_user_iterator>(user_begin(), user_end());
   }
 
-  /// hasOneUse - Return true if there is exactly one user of this value.  This
-  /// is specialized because it is a common request and does not require
-  /// traversing the whole use list.
+  /// \brief Return true if there is exactly one user of this value.
   ///
+  /// This is specialized because it is a common request and does not require
+  /// traversing the whole use list.
   bool hasOneUse() const {
     const_use_iterator I = use_begin(), E = use_end();
     if (I == E) return false;
     return ++I == E;
   }
 
-  /// hasNUses - Return true if this Value has exactly N users.
-  ///
+  /// \brief Return true if this Value has exactly N users.
   bool hasNUses(unsigned N) const;
 
-  /// hasNUsesOrMore - Return true if this value has N users or more.  This is
-  /// logically equivalent to getNumUses() >= N.
+  /// \brief Return true if this value has N users or more.
   ///
+  /// This is logically equivalent to getNumUses() >= N.
   bool hasNUsesOrMore(unsigned N) const;
 
+  /// \brief Check if this value is used in the specified basic block.
   bool isUsedInBasicBlock(const BasicBlock *BB) const;
 
-  /// getNumUses - This method computes the number of uses of this Value.  This
-  /// is a linear time operation.  Use hasOneUse, hasNUses, or hasNUsesOrMore
-  /// to check for specific values.
+  /// \brief This method computes the number of uses of this Value.
+  ///
+  /// This is a linear time operation.  Use hasOneUse, hasNUses, or
+  /// hasNUsesOrMore to check for specific values.
   unsigned getNumUses() const;
 
-  /// addUse - This method should only be used by the Use class.
-  ///
+  /// \brief This method should only be used by the Use class.
   void addUse(Use &U) { U.addToList(&UseList); }
 
+  /// \brief Concrete subclass of this.
+  ///
   /// An enumeration for keeping track of the concrete subclass of Value that
   /// is actually instantiated. Values of this enumeration are kept in the
   /// Value classes SubclassID field. They are used for concrete type
@@ -322,7 +345,8 @@
     ConstantStructVal,        // This is an instance of ConstantStruct
     ConstantVectorVal,        // This is an instance of ConstantVector
     ConstantPointerNullVal,   // This is an instance of ConstantPointerNull
-    MDNodeVal,                // This is an instance of MDNode
+    GenericMDNodeVal,         // This is an instance of GenericMDNode
+    MDNodeFwdDeclVal,         // This is an instance of MDNodeFwdDecl
     MDStringVal,              // This is an instance of MDString
     InlineAsmVal,             // This is an instance of InlineAsm
     InstructionVal,           // This is an instance of Instruction
@@ -334,11 +358,12 @@
     ConstantLastVal  = ConstantPointerNullVal
   };
 
-  /// getValueID - Return an ID for the concrete type of this object.  This is
-  /// used to implement the classof checks.  This should not be used for any
-  /// other purpose, as the values may change as LLVM evolves.  Also, note that
-  /// for instructions, the Instruction's opcode is added to InstructionVal. So
-  /// this means three things:
+  /// \brief Return an ID for the concrete type of this object.
+  ///
+  /// This is used to implement the classof checks.  This should not be used
+  /// for any other purpose, as the values may change as LLVM evolves.  Also,
+  /// note that for instructions, the Instruction's opcode is added to
+  /// InstructionVal. So this means three things:
   /// # there is no value with code InstructionVal (no opcode==0).
   /// # there are more possible values for the value type than in ValueTy enum.
   /// # the InstructionVal enumerator must be the highest valued enumerator in
@@ -347,64 +372,59 @@
     return SubclassID;
   }
 
-  /// getRawSubclassOptionalData - Return the raw optional flags value
-  /// contained in this value. This should only be used when testing two
-  /// Values for equivalence.
+  /// \brief Return the raw optional flags value contained in this value.
+  ///
+  /// This should only be used when testing two Values for equivalence.
   unsigned getRawSubclassOptionalData() const {
     return SubclassOptionalData;
   }
 
-  /// clearSubclassOptionalData - Clear the optional flags contained in
-  /// this value.
+  /// \brief Clear the optional flags contained in this value.
   void clearSubclassOptionalData() {
     SubclassOptionalData = 0;
   }
 
-  /// hasSameSubclassOptionalData - Test whether the optional flags contained
-  /// in this value are equal to the optional flags in the given value.
+  /// \brief Check the optional flags for equality.
   bool hasSameSubclassOptionalData(const Value *V) const {
     return SubclassOptionalData == V->SubclassOptionalData;
   }
 
-  /// intersectOptionalDataWith - Clear any optional flags in this value
-  /// that are not also set in the given value.
+  /// \brief Clear any optional flags not set in the given Value.
   void intersectOptionalDataWith(const Value *V) {
     SubclassOptionalData &= V->SubclassOptionalData;
   }
 
-  /// hasValueHandle - Return true if there is a value handle associated with
-  /// this value.
+  /// \brief Return true if there is a value handle associated with this value.
   bool hasValueHandle() const { return HasValueHandle; }
 
-  /// \brief Strips off any unneeded pointer casts, all-zero GEPs and aliases
-  /// from the specified value, returning the original uncasted value.
+  /// \brief Strip off pointer casts, all-zero GEPs, and aliases.
   ///
-  /// If this is called on a non-pointer value, it returns 'this'.
+  /// Returns the original uncasted value.  If this is called on a non-pointer
+  /// value, it returns 'this'.
   Value *stripPointerCasts();
   const Value *stripPointerCasts() const {
     return const_cast<Value*>(this)->stripPointerCasts();
   }
 
-  /// \brief Strips off any unneeded pointer casts and all-zero GEPs from the
-  /// specified value, returning the original uncasted value.
+  /// \brief Strip off pointer casts and all-zero GEPs.
   ///
-  /// If this is called on a non-pointer value, it returns 'this'.
+  /// Returns the original uncasted value.  If this is called on a non-pointer
+  /// value, it returns 'this'.
   Value *stripPointerCastsNoFollowAliases();
   const Value *stripPointerCastsNoFollowAliases() const {
     return const_cast<Value*>(this)->stripPointerCastsNoFollowAliases();
   }
 
-  /// \brief Strips off unneeded pointer casts and all-constant GEPs from the
-  /// specified value, returning the original pointer value.
+  /// \brief Strip off pointer casts and all-constant inbounds GEPs.
   ///
-  /// If this is called on a non-pointer value, it returns 'this'.
+  /// Returns the original pointer value.  If this is called on a non-pointer
+  /// value, it returns 'this'.
   Value *stripInBoundsConstantOffsets();
   const Value *stripInBoundsConstantOffsets() const {
     return const_cast<Value*>(this)->stripInBoundsConstantOffsets();
   }
 
-  /// \brief Strips like \c stripInBoundsConstantOffsets but also accumulates
-  /// the constant offset stripped.
+  /// \brief Accumulate offsets from \a stripInBoundsConstantOffsets().
   ///
   /// Stores the resulting constant offset stripped into the APInt provided.
   /// The provided APInt will be extended or truncated as needed to be the
@@ -419,23 +439,27 @@
         ->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
   }
 
-  /// \brief Strips off unneeded pointer casts and any in-bounds offsets from
-  /// the specified value, returning the original pointer value.
+  /// \brief Strip off pointer casts and inbounds GEPs.
   ///
-  /// If this is called on a non-pointer value, it returns 'this'.
+  /// Returns the original pointer value.  If this is called on a non-pointer
+  /// value, it returns 'this'.
   Value *stripInBoundsOffsets();
   const Value *stripInBoundsOffsets() const {
     return const_cast<Value*>(this)->stripInBoundsOffsets();
   }
 
-  /// isDereferenceablePointer - Test if this value is always a pointer to
-  /// allocated and suitably aligned memory for a simple load or store.
+  /// \brief Check if this is always a dereferenceable pointer.
+  ///
+  /// Test if this value is always a pointer to allocated and suitably aligned
+  /// memory for a simple load or store.
   bool isDereferenceablePointer(const DataLayout *DL = nullptr) const;
 
-  /// DoPHITranslation - If this value is a PHI node with CurBB as its parent,
-  /// return the value in the PHI node corresponding to PredBB.  If not, return
-  /// ourself.  This is useful if you want to know the value something has in a
-  /// predecessor block.
+  /// \brief Translate PHI node to its predecessor from the given basic block.
+  ///
+  /// If this value is a PHI node with CurBB as its parent, return the value in
+  /// the PHI node corresponding to PredBB.  If not, return ourself.  This is
+  /// useful if you want to know the value something has in a predecessor
+  /// block.
   Value *DoPHITranslation(const BasicBlock *CurBB, const BasicBlock *PredBB);
 
   const Value *DoPHITranslation(const BasicBlock *CurBB,
@@ -443,11 +467,14 @@
     return const_cast<Value*>(this)->DoPHITranslation(CurBB, PredBB);
   }
 
-  /// MaximumAlignment - This is the greatest alignment value supported by
-  /// load, store, and alloca instructions, and global values.
+  /// \brief The maximum alignment for instructions.
+  ///
+  /// This is the greatest alignment value supported by load, store, and alloca
+  /// instructions, and global values.
   static const unsigned MaximumAlignment = 1u << 29;
 
-  /// mutateType - Mutate the type of this Value to be of the specified type.
+  /// \brief Mutate the type of this Value to be of the specified type.
+  ///
   /// Note that this is an extremely dangerous operation which can create
   /// completely invalid IR very easily.  It is strongly recommended that you
   /// recreate IR objects with the right types instead of mutating them in
@@ -456,6 +483,37 @@
     VTy = Ty;
   }
 
+  /// \brief Sort the use-list.
+  ///
+  /// Sorts the Value's use-list by Cmp using a stable mergesort.  Cmp is
+  /// expected to compare two \a Use references.
+  template <class Compare> void sortUseList(Compare Cmp);
+
+  /// \brief Reverse the use-list.
+  void reverseUseList();
+
+private:
+  /// \brief Merge two lists together.
+  ///
+  /// Merges \c L and \c R using \c Cmp.  To enable stable sorts, always pushes
+  /// "equal" items from L before items from R.
+  ///
+  /// \return the first element in the list.
+  ///
+  /// \note Completely ignores \a Use::Prev (doesn't read, doesn't update).
+  template <class Compare>
+  static Use *mergeUseLists(Use *L, Use *R, Compare Cmp) {
+    Use *Merged;
+    mergeUseListsImpl(L, R, &Merged, Cmp);
+    return Merged;
+  }
+
+  /// \brief Tail-recursive helper for \a mergeUseLists().
+  ///
+  /// \param[out] Next the first element in the list.
+  template <class Compare>
+  static void mergeUseListsImpl(Use *L, Use *R, Use **Next, Compare Cmp);
+
 protected:
   unsigned short getSubclassDataFromValue() const { return SubclassData; }
   void setValueSubclassData(unsigned short D) { SubclassData = D; }
@@ -472,6 +530,91 @@
   if (V) V->addUse(*this);
 }
 
+template <class Compare> void Value::sortUseList(Compare Cmp) {
+  if (!UseList || !UseList->Next)
+    // No need to sort 0 or 1 uses.
+    return;
+
+  // Note: this function completely ignores Prev pointers until the end when
+  // they're fixed en masse.
+
+  // Create a binomial vector of sorted lists, visiting uses one at a time and
+  // merging lists as necessary.
+  const unsigned MaxSlots = 32;
+  Use *Slots[MaxSlots];
+
+  // Collect the first use, turning it into a single-item list.
+  Use *Next = UseList->Next;
+  UseList->Next = nullptr;
+  unsigned NumSlots = 1;
+  Slots[0] = UseList;
+
+  // Collect all but the last use.
+  while (Next->Next) {
+    Use *Current = Next;
+    Next = Current->Next;
+
+    // Turn Current into a single-item list.
+    Current->Next = nullptr;
+
+    // Save Current in the first available slot, merging on collisions.
+    unsigned I;
+    for (I = 0; I < NumSlots; ++I) {
+      if (!Slots[I])
+        break;
+
+      // Merge two lists, doubling the size of Current and emptying slot I.
+      //
+      // Since the uses in Slots[I] originally preceded those in Current, send
+      // Slots[I] in as the left parameter to maintain a stable sort.
+      Current = mergeUseLists(Slots[I], Current, Cmp);
+      Slots[I] = nullptr;
+    }
+    // Check if this is a new slot.
+    if (I == NumSlots) {
+      ++NumSlots;
+      assert(NumSlots <= MaxSlots && "Use list bigger than 2^32");
+    }
+
+    // Found an open slot.
+    Slots[I] = Current;
+  }
+
+  // Merge all the lists together.
+  assert(Next && "Expected one more Use");
+  assert(!Next->Next && "Expected only one Use");
+  UseList = Next;
+  for (unsigned I = 0; I < NumSlots; ++I)
+    if (Slots[I])
+      // Since the uses in Slots[I] originally preceded those in UseList, send
+      // Slots[I] in as the left parameter to maintain a stable sort.
+      UseList = mergeUseLists(Slots[I], UseList, Cmp);
+
+  // Fix the Prev pointers.
+  for (Use *I = UseList, **Prev = &UseList; I; I = I->Next) {
+    I->setPrev(Prev);
+    Prev = &I->Next;
+  }
+}
+
+template <class Compare>
+void Value::mergeUseListsImpl(Use *L, Use *R, Use **Next, Compare Cmp) {
+  if (!L) {
+    *Next = R;
+    return;
+  }
+  if (!R) {
+    *Next = L;
+    return;
+  }
+  if (Cmp(*R, *L)) {
+    *Next = R;
+    mergeUseListsImpl(L, R->Next, &R->Next, Cmp);
+    return;
+  }
+  *Next = L;
+  mergeUseListsImpl(L->Next, R, &L->Next, Cmp);
+}
 
 // isa - Provide some specializations of isa so that we don't have to include
 // the subtype header files to test to see if the value is a subclass...
@@ -539,7 +682,8 @@
 
 template <> struct isa_impl<MDNode, Value> {
   static inline bool doit(const Value &Val) {
-    return Val.getValueID() == Value::MDNodeVal;
+    return Val.getValueID() == Value::GenericMDNodeVal ||
+           Val.getValueID() == Value::MDNodeFwdDeclVal;
   }
 };
 

diff --git a/include/llvm/IR/ValueHandle.h b/include/llvm/IR/ValueHandle.h
index aa29b2e..460210e 100644
--- a/include/llvm/IR/ValueHandle.h
+++ b/include/llvm/IR/ValueHandle.h

@@ -33,15 +33,16 @@
   enum { NumLowBitsAvailable = 2 };
 };
 
-/// ValueHandleBase - This is the common base class of value handles.
+/// \brief This is the common base class of value handles.
+///
 /// ValueHandle's are smart pointers to Value's that have special behavior when
 /// the value is deleted or ReplaceAllUsesWith'd.  See the specific handles
 /// below for details.
-///
 class ValueHandleBase {
   friend class Value;
 protected:
-  /// HandleBaseKind - This indicates what sub class the handle actually is.
+  /// \brief This indicates what sub class the handle actually is.
+  ///
   /// This is to avoid having a vtable for the light-weight handle pointers. The
   /// fully general Callback version does have a vtable.
   enum HandleBaseKind {
@@ -122,26 +123,28 @@
   HandleBaseKind getKind() const { return PrevPair.getInt(); }
   void setPrevPtr(ValueHandleBase **Ptr) { PrevPair.setPointer(Ptr); }
 
-  /// AddToExistingUseList - Add this ValueHandle to the use list for VP, where
+  /// \brief Add this ValueHandle to the use list for VP.
+  ///
   /// List is the address of either the head of the list or a Next node within
   /// the existing use list.
   void AddToExistingUseList(ValueHandleBase **List);
 
-  /// AddToExistingUseListAfter - Add this ValueHandle to the use list after
-  /// Node.
+  /// \brief Add this ValueHandle to the use list after Node.
   void AddToExistingUseListAfter(ValueHandleBase *Node);
 
-  /// AddToUseList - Add this ValueHandle to the use list for VP.
+  /// \brief Add this ValueHandle to the use list for VP.
   void AddToUseList();
-  /// RemoveFromUseList - Remove this ValueHandle from its current use list.
+  /// \brief Remove this ValueHandle from its current use list.
   void RemoveFromUseList();
 };
 
-/// WeakVH - This is a value handle that tries hard to point to a Value, even
-/// across RAUW operations, but will null itself out if the value is destroyed.
-/// this is useful for advisory sorts of information, but should not be used as
-/// the key of a map (since the map would have to rearrange itself when the
-/// pointer changes).
+/// \brief Value handle that is nullable, but tries to track the Value.
+///
+/// This is a value handle that tries hard to point to a Value, even across
+/// RAUW operations, but will null itself out if the value is destroyed.  This
+/// is useful for advisory sorts of information, but should not be used as the
+/// key of a map (since the map would have to rearrange itself when the pointer
+/// changes).
 class WeakVH : public ValueHandleBase {
 public:
   WeakVH() : ValueHandleBase(Weak) {}
@@ -170,14 +173,16 @@
   }
 };
 
-/// AssertingVH - This is a Value Handle that points to a value and asserts out
-/// if the value is destroyed while the handle is still live.  This is very
-/// useful for catching dangling pointer bugs and other things which can be
-/// non-obvious.  One particularly useful place to use this is as the Key of a
-/// map.  Dangling pointer bugs often lead to really subtle bugs that only occur
-/// if another object happens to get allocated to the same address as the old
-/// one.  Using an AssertingVH ensures that an assert is triggered as soon as
-/// the bad delete occurs.
+/// \brief Value handle that asserts if the Value is deleted.
+///
+/// This is a Value Handle that points to a value and asserts out if the value
+/// is destroyed while the handle is still live.  This is very useful for
+/// catching dangling pointer bugs and other things which can be non-obvious.
+/// One particularly useful place to use this is as the Key of a map.  Dangling
+/// pointer bugs often lead to really subtle bugs that only occur if another
+/// object happens to get allocated to the same address as the old one.  Using
+/// an AssertingVH ensures that an assert is triggered as soon as the bad
+/// delete occurs.
 ///
 /// Note that an AssertingVH handle does *not* follow values across RAUW
 /// operations.  This means that RAUW's need to explicitly update the
@@ -189,6 +194,7 @@
   : public ValueHandleBase
 #endif
   {
+  friend struct DenseMapInfo<AssertingVH<ValueTy> >;
 
 #ifndef NDEBUG
   ValueTy *getValPtr() const {
@@ -248,11 +254,19 @@
   static unsigned getHashValue(const AssertingVH<T> &Val) {
     return PointerInfo::getHashValue(Val);
   }
+#ifndef NDEBUG
+  static bool isEqual(const AssertingVH<T> &LHS, const AssertingVH<T> &RHS) {
+    // Avoid downcasting AssertingVH<T> to T*, as empty/tombstone keys may not
+    // be properly aligned pointers to T*.
+    return LHS.ValueHandleBase::getValPtr() == RHS.ValueHandleBase::getValPtr();
+  }
+#else
   static bool isEqual(const AssertingVH<T> &LHS, const AssertingVH<T> &RHS) {
     return LHS == RHS;
   }
+#endif
 };
-  
+
 template <typename T>
 struct isPodLike<AssertingVH<T> > {
 #ifdef NDEBUG
@@ -263,8 +277,7 @@
 };
 
 
-/// TrackingVH - This is a value handle that tracks a Value (or Value subclass),
-/// even across RAUW operations.
+/// \brief Value handle that tracks a Value across RAUW.
 ///
 /// TrackingVH is designed for situations where a client needs to hold a handle
 /// to a Value (or subclass) across some operations which may move that value,
@@ -332,12 +345,14 @@
   ValueTy &operator*() const { return *getValPtr(); }
 };
 
-/// CallbackVH - This is a value handle that allows subclasses to define
-/// callbacks that run when the underlying Value has RAUW called on it or is
-/// destroyed.  This class can be used as the key of a map, as long as the user
-/// takes it out of the map before calling setValPtr() (since the map has to
-/// rearrange itself when the pointer changes).  Unlike ValueHandleBase, this
-/// class has a vtable and a virtual destructor.
+/// \brief Value handle with callbacks on RAUW and destruction.
+///
+/// This is a value handle that allows subclasses to define callbacks that run
+/// when the underlying Value has RAUW called on it or is destroyed.  This
+/// class can be used as the key of a map, as long as the user takes it out of
+/// the map before calling setValPtr() (since the map has to rearrange itself
+/// when the pointer changes).  Unlike ValueHandleBase, this class has a vtable
+/// and a virtual destructor.
 class CallbackVH : public ValueHandleBase {
   virtual void anchor();
 protected:
@@ -358,16 +373,20 @@
     return getValPtr();
   }
 
-  /// Called when this->getValPtr() is destroyed, inside ~Value(), so you may
-  /// call any non-virtual Value method on getValPtr(), but no subclass methods.
-  /// If WeakVH were implemented as a CallbackVH, it would use this method to
-  /// call setValPtr(NULL).  AssertingVH would use this method to cause an
-  /// assertion failure.
+  /// \brief Callback for Value destruction.
+  ///
+  /// Called when this->getValPtr() is destroyed, inside ~Value(), so you
+  /// may call any non-virtual Value method on getValPtr(), but no subclass
+  /// methods.  If WeakVH were implemented as a CallbackVH, it would use this
+  /// method to call setValPtr(NULL).  AssertingVH would use this method to
+  /// cause an assertion failure.
   ///
   /// All implementations must remove the reference from this object to the
   /// Value that's being destroyed.
   virtual void deleted() { setValPtr(nullptr); }
 
+  /// \brief Callback for Value RAUW.
+  ///
   /// Called when this->getValPtr()->replaceAllUsesWith(new_value) is called,
   /// _before_ any of the uses have actually been replaced.  If WeakVH were
   /// implemented as a CallbackVH, it would use this method to call

diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h
index 43a79c7..aa8a29d 100644
--- a/include/llvm/IR/ValueMap.h
+++ b/include/llvm/IR/ValueMap.h

@@ -29,6 +29,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Mutex.h"
+#include "llvm/Support/UniqueLock.h"
 #include "llvm/Support/type_traits.h"
 #include <iterator>
 
@@ -111,7 +112,7 @@
 
   void clear() { Map.clear(); }
 
-  /// Return 1 if the specified key is in the map, 0 otherwise.

+  /// Return 1 if the specified key is in the map, 0 otherwise.
   size_type count(const KeyT &Val) const {
     return Map.find_as(Val) == Map.end() ? 0 : 1;
   }
@@ -216,12 +217,11 @@
     // Make a copy that won't get changed even when *this is destroyed.
     ValueMapCallbackVH Copy(*this);
     typename Config::mutex_type *M = Config::getMutex(Copy.Map->Data);
+    unique_lock<typename Config::mutex_type> Guard;
     if (M)
-      M->acquire();
+      Guard = unique_lock<typename Config::mutex_type>(*M);
     Config::onDelete(Copy.Map->Data, Copy.Unwrap());  // May destroy *this.
     Copy.Map->Map.erase(Copy);  // Definitely destroys *this.
-    if (M)
-      M->release();
   }
   void allUsesReplacedWith(Value *new_key) override {
     assert(isa<KeySansPointerT>(new_key) &&
@@ -229,8 +229,9 @@
     // Make a copy that won't get changed even when *this is destroyed.
     ValueMapCallbackVH Copy(*this);
     typename Config::mutex_type *M = Config::getMutex(Copy.Map->Data);
+    unique_lock<typename Config::mutex_type> Guard;
     if (M)
-      M->acquire();
+      Guard = unique_lock<typename Config::mutex_type>(*M);
 
     KeyT typed_new_key = cast<KeySansPointerT>(new_key);
     // Can destroy *this:
@@ -245,8 +246,6 @@
         Copy.Map->insert(std::make_pair(typed_new_key, Target));
       }
     }
-    if (M)
-      M->release();
   }
 };
 

diff --git a/include/llvm/IRReader/IRReader.h b/include/llvm/IRReader/IRReader.h
index 59ffc09..2d9ace0 100644
--- a/include/llvm/IRReader/IRReader.h
+++ b/include/llvm/IRReader/IRReader.h

@@ -15,12 +15,12 @@
 #ifndef LLVM_IRREADER_IRREADER_H
 #define LLVM_IRREADER_IRREADER_H
 
+#include "llvm/Support/MemoryBuffer.h"
 #include <string>
 
 namespace llvm {
 
 class Module;
-class MemoryBuffer;
 class SMDiagnostic;
 class LLVMContext;
 
@@ -28,20 +28,21 @@
 /// for it which does lazy deserialization of function bodies.  Otherwise,
 /// attempt to parse it as LLVM Assembly and return a fully populated
 /// Module.
-Module *getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err,
-                            LLVMContext &Context);
+std::unique_ptr<Module> getLazyIRFileModule(StringRef Filename,
+                                            SMDiagnostic &Err,
+                                            LLVMContext &Context);
 
 /// If the given MemoryBuffer holds a bitcode image, return a Module
 /// for it.  Otherwise, attempt to parse it as LLVM Assembly and return
-/// a Module for it. This function *never* takes ownership of Buffer.
-Module *ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err, LLVMContext &Context);
+/// a Module for it.
+std::unique_ptr<Module> parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
+                                LLVMContext &Context);
 
 /// If the given file holds a bitcode image, return a Module for it.
 /// Otherwise, attempt to parse it as LLVM Assembly and return a Module
 /// for it.
-Module *ParseIRFile(const std::string &Filename, SMDiagnostic &Err,
-                    LLVMContext &Context);
-
+std::unique_ptr<Module> parseIRFile(StringRef Filename, SMDiagnostic &Err,
+                                    LLVMContext &Context);
 }
 
 #endif

diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 0c840f3..a4bc598 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h

@@ -71,8 +71,9 @@
 void initializeAliasSetPrinterPass(PassRegistry&);
 void initializeAlwaysInlinerPass(PassRegistry&);
 void initializeArgPromotionPass(PassRegistry&);
-void initializeAtomicExpandLoadLinkedPass(PassRegistry&);
+void initializeAtomicExpandPass(PassRegistry&);
 void initializeSampleProfileLoaderPass(PassRegistry&);
+void initializeAlignmentFromAssumptionsPass(PassRegistry&);
 void initializeBarrierNoopPass(PassRegistry&);
 void initializeBasicAliasAnalysisPass(PassRegistry&);
 void initializeCallGraphWrapperPassPass(PassRegistry &);
@@ -89,6 +90,8 @@
 void initializeCFGOnlyViewerPass(PassRegistry&);
 void initializeCFGPrinterPass(PassRegistry&);
 void initializeCFGSimplifyPassPass(PassRegistry&);
+void initializeCFLAliasAnalysisPass(PassRegistry&);
+void initializeForwardControlFlowIntegrityPass(PassRegistry&);
 void initializeFlattenCFGPassPass(PassRegistry&);
 void initializeStructurizeCFGPass(PassRegistry&);
 void initializeCFGViewerPass(PassRegistry&);
@@ -123,6 +126,7 @@
 void initializeAddressSanitizerModulePass(PassRegistry&);
 void initializeMemorySanitizerPass(PassRegistry&);
 void initializeThreadSanitizerPass(PassRegistry&);
+void initializeSanitizerCoverageModulePass(PassRegistry&);
 void initializeDataFlowSanitizerPass(PassRegistry&);
 void initializeScalarizerPass(PassRegistry&);
 void initializeEarlyCSEPass(PassRegistry&);
@@ -184,10 +188,12 @@
 void initializeMachineBranchProbabilityInfoPass(PassRegistry&);
 void initializeMachineCSEPass(PassRegistry&);
 void initializeMachineDominatorTreePass(PassRegistry&);
+void initializeMachineDominanceFrontierPass(PassRegistry&);
 void initializeMachinePostDominatorTreePass(PassRegistry&);
 void initializeMachineLICMPass(PassRegistry&);
 void initializeMachineLoopInfoPass(PassRegistry&);
 void initializeMachineModuleInfoPass(PassRegistry&);
+void initializeMachineRegionInfoPassPass(PassRegistry&);
 void initializeMachineSchedulerPass(PassRegistry&);
 void initializeMachineSinkingPass(PassRegistry&);
 void initializeMachineTraceMetricsPass(PassRegistry&);
@@ -195,6 +201,7 @@
 void initializeMemCpyOptPass(PassRegistry&);
 void initializeMemDepPrinterPass(PassRegistry&);
 void initializeMemoryDependenceAnalysisPass(PassRegistry&);
+void initializeMergedLoadStoreMotionPass(PassRegistry &);
 void initializeMetaRenamerPass(PassRegistry&);
 void initializeMergeFunctionsPass(PassRegistry&);
 void initializeModuleDebugInfoPrinterPass(PassRegistry&);
@@ -204,6 +211,7 @@
 void initializeObjCARCExpandPass(PassRegistry&);
 void initializeObjCARCContractPass(PassRegistry&);
 void initializeObjCARCOptPass(PassRegistry&);
+void initializePAEvalPass(PassRegistry &);
 void initializeOptimizePHIsPass(PassRegistry&);
 void initializePartiallyInlineLibCallsPass(PassRegistry&);
 void initializePEIPass(PassRegistry&);
@@ -225,7 +233,7 @@
 void initializePruneEHPass(PassRegistry&);
 void initializeReassociatePass(PassRegistry&);
 void initializeRegToMemPass(PassRegistry&);
-void initializeRegionInfoPass(PassRegistry&);
+void initializeRegionInfoPassPass(PassRegistry&);
 void initializeRegionOnlyPrinterPass(PassRegistry&);
 void initializeRegionOnlyViewerPass(PassRegistry&);
 void initializeRegionPrinterPass(PassRegistry&);
@@ -256,10 +264,13 @@
 void initializeTargetPassConfigPass(PassRegistry&);
 void initializeDataLayoutPassPass(PassRegistry &);
 void initializeTargetTransformInfoAnalysisGroup(PassRegistry&);
+void initializeFunctionTargetTransformInfoPass(PassRegistry &);
 void initializeNoTTIPass(PassRegistry&);
 void initializeTargetLibraryInfoPass(PassRegistry&);
+void initializeAssumptionTrackerPass(PassRegistry &);
 void initializeTwoAddressInstructionPassPass(PassRegistry&);
 void initializeTypeBasedAliasAnalysisPass(PassRegistry&);
+void initializeScopedNoAliasAAPass(PassRegistry&);
 void initializeUnifyFunctionExitNodesPass(PassRegistry&);
 void initializeUnreachableBlockElimPass(PassRegistry&);
 void initializeUnreachableMachineBlockElimPass(PassRegistry&);
@@ -274,7 +285,9 @@
 void initializeBBVectorizePass(PassRegistry&);
 void initializeMachineFunctionPrinterPassPass(PassRegistry&);
 void initializeStackMapLivenessPass(PassRegistry&);
+void initializeMachineCombinerPass(PassRegistry &);
 void initializeLoadCombinePass(PassRegistry&);
+void initializeRewriteSymbolsPass(PassRegistry&);
 }
 
 #endif

diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h
index b19b232..0c9ce4a 100644
--- a/include/llvm/LTO/LTOCodeGenerator.h
+++ b/include/llvm/LTO/LTOCodeGenerator.h

@@ -32,8 +32,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LTO_CODE_GENERATOR_H
-#define LTO_CODE_GENERATOR_H
+#ifndef LLVM_LTO_LTOCODEGENERATOR_H
+#define LLVM_LTO_LTOCODEGENERATOR_H
 
 #include "llvm-c/lto.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -61,10 +61,11 @@
   static const char *getVersionString();
 
   LTOCodeGenerator();
+  LTOCodeGenerator(std::unique_ptr<LLVMContext> Context);
   ~LTOCodeGenerator();
 
   // Merge given module, return true on success.
-  bool addModule(struct LTOModule*, std::string &errMsg);
+  bool addModule(struct LTOModule *);
 
   void setTargetOptions(TargetOptions options);
   void setDebugInfo(lto_debug_model);
@@ -101,6 +102,7 @@
                        bool disableOpt,
                        bool disableInline,
                        bool disableGVNLoadPRE,
+                       bool disableVectorization,
                        std::string &errMsg);
 
   // As with compile_to_file(), this function compiles the merged module into
@@ -112,19 +114,23 @@
                       bool disableOpt,
                       bool disableInline,
                       bool disableGVNLoadPRE,
+                      bool disableVectorization,
                       std::string &errMsg);
 
   void setDiagnosticHandler(lto_diagnostic_handler_t, void *);
 
+  LLVMContext &getContext() { return Context; }
+
 private:
   void initializeLTOPasses();
 
   bool generateObjectFile(raw_ostream &out, bool disableOpt, bool disableInline,
-                          bool disableGVNLoadPRE, std::string &errMsg);
+                          bool disableGVNLoadPRE, bool disableVectorization,
+                          std::string &errMsg);
   void applyScopeRestrictions();
-  void applyRestriction(GlobalValue &GV, const ArrayRef<StringRef> &Libcalls,
+  void applyRestriction(GlobalValue &GV, ArrayRef<StringRef> Libcalls,
                         std::vector<const char *> &MustPreserveList,
-                        SmallPtrSet<GlobalValue *, 8> &AsmUsed,
+                        SmallPtrSetImpl<GlobalValue *> &AsmUsed,
                         Mangler &Mangler);
   bool determineTarget(std::string &errMsg);
 
@@ -134,6 +140,8 @@
 
   typedef StringMap<uint8_t> StringSet;
 
+  void initialize();
+  std::unique_ptr<LLVMContext> OwnedContext;
   LLVMContext &Context;
   Linker IRLinker;
   TargetMachine *TargetMach;
@@ -142,7 +150,7 @@
   lto_codegen_model CodeModel;
   StringSet MustPreserveSymbols;
   StringSet AsmUndefinedRefs;
-  MemoryBuffer *NativeObjectFile;
+  std::unique_ptr<MemoryBuffer> NativeObjectFile;
   std::vector<char *> CodegenOptions;
   std::string MCpu;
   std::string MAttr;
@@ -152,4 +160,4 @@
   void *DiagContext;
 };
 }
-#endif // LTO_CODE_GENERATOR_H
+#endif

diff --git a/include/llvm/LTO/LTOModule.h b/include/llvm/LTO/LTOModule.h
index c43846a..53c2b8e 100644
--- a/include/llvm/LTO/LTOModule.h
+++ b/include/llvm/LTO/LTOModule.h

@@ -11,11 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LTO_MODULE_H
-#define LTO_MODULE_H
+#ifndef LLVM_LTO_LTOMODULE_H
+#define LLVM_LTO_LTOMODULE_H
 
 #include "llvm-c/lto.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCObjectFileInfo.h"
@@ -37,8 +38,6 @@
 ///
 struct LTOModule {
 private:
-  typedef StringMap<uint8_t> StringSet;
-
   struct NameAndAttributes {
     const char        *name;
     uint32_t           attributes;
@@ -46,21 +45,27 @@
     const GlobalValue *symbol;
   };
 
+  std::unique_ptr<LLVMContext> OwnedContext;
+
   std::unique_ptr<object::IRObjectFile> IRFile;
   std::unique_ptr<TargetMachine> _target;
-  StringSet                               _linkeropt_strings;
+  StringSet<>                             _linkeropt_strings;
   std::vector<const char *>               _deplibs;
   std::vector<const char *>               _linkeropts;
   std::vector<NameAndAttributes>          _symbols;
 
   // _defines and _undefines only needed to disambiguate tentative definitions
-  StringSet                               _defines;
+  StringSet<>                             _defines;
   StringMap<NameAndAttributes> _undefines;
   std::vector<const char*>                _asm_undefines;
 
   LTOModule(std::unique_ptr<object::IRObjectFile> Obj, TargetMachine *TM);
+  LTOModule(std::unique_ptr<object::IRObjectFile> Obj, TargetMachine *TM,
+            std::unique_ptr<LLVMContext> Context);
 
 public:
+  ~LTOModule();
+
   /// Returns 'true' if the file or memory contents is LLVM bitcode.
   static bool isBitcodeFile(const void *mem, size_t length);
   static bool isBitcodeFile(const char *path);
@@ -71,8 +76,8 @@
                                  StringRef triplePrefix);
 
   /// Create a MemoryBuffer from a memory range with an optional name.
-  static MemoryBuffer *makeBuffer(const void *mem, size_t length,
-                                  StringRef name = "");
+  static std::unique_ptr<MemoryBuffer>
+  makeBuffer(const void *mem, size_t length, StringRef name = "");
 
   /// Create an LTOModule. N.B. These methods take ownership of the buffer. The
   /// caller must have initialized the Targets, the TargetMCs, the AsmPrinters,
@@ -95,6 +100,13 @@
                                      TargetOptions options, std::string &errMsg,
                                      StringRef path = "");
 
+  static LTOModule *createInLocalContext(const void *mem, size_t length,
+                                         TargetOptions options,
+                                         std::string &errMsg, StringRef path);
+  static LTOModule *createInContext(const void *mem, size_t length,
+                                    TargetOptions options, std::string &errMsg,
+                                    StringRef path, LLVMContext *Context);
+
   const Module &getModule() const {
     return const_cast<LTOModule*>(this)->getModule();
   }
@@ -202,10 +214,9 @@
   /// Get string that the data pointer points to.
   bool objcClassNameFromExpression(const Constant *c, std::string &name);
 
-  /// Create an LTOModule (private version). N.B. This method takes ownership of
-  /// the buffer.
-  static LTOModule *makeLTOModule(std::unique_ptr<MemoryBuffer> Buffer,
-                                  TargetOptions options, std::string &errMsg);
+  /// Create an LTOModule (private version).
+  static LTOModule *makeLTOModule(MemoryBufferRef Buffer, TargetOptions options,
+                                  std::string &errMsg, LLVMContext *Context);
 };
 }
-#endif // LTO_MODULE_H
+#endif

diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index b2309ff..66e4e9c 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h

@@ -34,6 +34,7 @@
 #include "llvm/Transforms/ObjCARC.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Transforms/Utils/SymbolRewriter.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <cstdlib>
 
@@ -52,15 +53,18 @@
       (void) llvm::createAliasAnalysisCounterPass();
       (void) llvm::createAliasDebugger();
       (void) llvm::createArgumentPromotionPass();
+      (void) llvm::createAlignmentFromAssumptionsPass();
       (void) llvm::createBasicAliasAnalysisPass();
       (void) llvm::createLibCallAliasAnalysisPass(nullptr);
       (void) llvm::createScalarEvolutionAliasAnalysisPass();
       (void) llvm::createTypeBasedAliasAnalysisPass();
+      (void) llvm::createScopedNoAliasAAPass();
       (void) llvm::createBoundsCheckingPass();
       (void) llvm::createBreakCriticalEdgesPass();
       (void) llvm::createCallGraphPrinterPass();
       (void) llvm::createCallGraphViewerPass();
       (void) llvm::createCFGSimplificationPass();
+      (void) llvm::createCFLAliasAnalysisPass();
       (void) llvm::createStructurizeCFGPass();
       (void) llvm::createConstantMergePass();
       (void) llvm::createConstantPropagationPass();
@@ -107,6 +111,7 @@
       (void) llvm::createObjCARCExpandPass();
       (void) llvm::createObjCARCContractPass();
       (void) llvm::createObjCARCOptPass();
+      (void) llvm::createPAEvalPass();
       (void) llvm::createPromoteMemoryToRegisterPass();
       (void) llvm::createDemoteRegisterToMemoryPass();
       (void) llvm::createPruneEHPass();
@@ -134,6 +139,7 @@
       (void) llvm::createConstantHoistingPass();
       (void) llvm::createCodeGenPreparePass();
       (void) llvm::createEarlyCSEPass();
+      (void) llvm::createMergedLoadStoreMotionPass();
       (void) llvm::createGVNPass();
       (void) llvm::createMemCpyOptPass();
       (void) llvm::createLoopDeletionPass();
@@ -159,6 +165,7 @@
       (void) llvm::createPartiallyInlineLibCallsPass();
       (void) llvm::createScalarizerPass();
       (void) llvm::createSeparateConstOffsetFromGEPPass();
+      (void) llvm::createRewriteSymbolsPass();
 
       (void)new llvm::IntervalPartition();
       (void)new llvm::FindUsedTypes();
@@ -167,7 +174,7 @@
       llvm::RGPassManager RGM;
       ((llvm::RegionPass*)nullptr)->runOnRegion((llvm::Region*)nullptr, RGM);
       llvm::AliasSetTracker X(*(llvm::AliasAnalysis*)nullptr);
-      X.add((llvm::Value*)nullptr, 0, nullptr);  // for -print-alias-sets
+      X.add(nullptr, 0, llvm::AAMDNodes()); // for -print-alias-sets
     }
   } ForcePassLinking; // Force link by creating a global definition.
 }

diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index 6254bbb..c957cc2 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h

@@ -11,14 +11,12 @@
 #define LLVM_LINKER_LINKER_H
 
 #include "llvm/ADT/SmallPtrSet.h"
-#include <string>
+
+#include <functional>
 
 namespace llvm {
-
-class Comdat;
-class GlobalValue;
+class DiagnosticInfo;
 class Module;
-class StringRef;
 class StructType;
 
 /// This class provides the core functionality of linking in LLVM. It keeps a
@@ -27,35 +25,30 @@
 /// something with it after the linking.
 class Linker {
   public:
-    enum LinkerMode {
-      DestroySource = 0, // Allow source module to be destroyed.
-      PreserveSource = 1 // Preserve the source module.
-    };
+    typedef std::function<void(const DiagnosticInfo &)>
+        DiagnosticHandlerFunction;
 
-    Linker(Module *M, bool SuppressWarnings=false);
+    Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler);
+    Linker(Module *M);
     ~Linker();
 
     Module *getModule() const { return Composite; }
     void deleteModule();
 
-    /// \brief Link \p Src into the composite. The source is destroyed if
-    /// \p Mode is DestroySource and preserved if it is PreserveSource.
-    /// If \p ErrorMsg is not null, information about any error is written
-    /// to it.
+    /// \brief Link \p Src into the composite. The source is destroyed.
     /// Returns true on error.
-    bool linkInModule(Module *Src, unsigned Mode, std::string *ErrorMsg);
-    bool linkInModule(Module *Src, std::string *ErrorMsg) {
-      return linkInModule(Src, Linker::DestroySource, ErrorMsg);
-    }
+    bool linkInModule(Module *Src);
 
-    static bool LinkModules(Module *Dest, Module *Src, unsigned Mode,
-                            std::string *ErrorMsg);
+    static bool LinkModules(Module *Dest, Module *Src,
+                            DiagnosticHandlerFunction DiagnosticHandler);
+
+    static bool LinkModules(Module *Dest, Module *Src);
 
   private:
+    void init(Module *M, DiagnosticHandlerFunction DiagnosticHandler);
     Module *Composite;
     SmallPtrSet<StructType*, 32> IdentifiedStructTypes;
-
-    bool SuppressWarnings;
+    DiagnosticHandlerFunction DiagnosticHandler;
 };
 
 } // End llvm namespace

diff --git a/include/llvm/MC/ConstantPools.h b/include/llvm/MC/ConstantPools.h
index 2819b75..1fc0332 100644
--- a/include/llvm/MC/ConstantPools.h
+++ b/include/llvm/MC/ConstantPools.h

@@ -12,20 +12,31 @@
 //===----------------------------------------------------------------------===//
 
 
-#ifndef LLVM_MC_CONSTANTPOOL_H
-#define LLVM_MC_CONSTANTPOOL_H
+#ifndef LLVM_MC_CONSTANTPOOLS_H
+#define LLVM_MC_CONSTANTPOOLS_H
 
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
+
 namespace llvm {
 class MCContext;
 class MCExpr;
 class MCSection;
 class MCStreamer;
 class MCSymbol;
+
+struct ConstantPoolEntry {
+  ConstantPoolEntry(MCSymbol *L, const MCExpr *Val, unsigned Sz)
+    : Label(L), Value(Val), Size(Sz) {}
+  MCSymbol *Label;
+  const MCExpr *Value;
+  unsigned Size;
+};
+
 // A class to keep track of assembler-generated constant pools that are use to
 // implement the ldr-pseudo.
 class ConstantPool {
-  typedef SmallVector<std::pair<MCSymbol *, const MCExpr *>, 4> EntryVecTy;
+  typedef SmallVector<ConstantPoolEntry, 4> EntryVecTy;
   EntryVecTy Entries;
 
 public:
@@ -34,9 +45,11 @@
 
   // Add a new entry to the constant pool in the next slot.
   // \param Value is the new entry to put in the constant pool.
+  // \param Size is the size in bytes of the entry.
   //
   // \returns a MCExpr that references the newly inserted value
-  const MCExpr *addEntry(const MCExpr *Value, MCContext &Context);
+  const MCExpr *addEntry(const MCExpr *Value, MCContext &Context,
+                         unsigned Size);
 
   // Emit the contents of the constant pool using the provided streamer.
   void emitEntries(MCStreamer &Streamer);
@@ -69,7 +82,8 @@
 
   void emitAll(MCStreamer &Streamer);
   void emitForCurrentSection(MCStreamer &Streamer);
-  const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr);
+  const MCExpr *addEntry(MCStreamer &Streamer, const MCExpr *Expr,
+                         unsigned Size);
 
 private:
   ConstantPool *getConstantPool(const MCSection *Section);

diff --git a/include/llvm/MC/MCAnalysis/MCAtom.h b/include/llvm/MC/MCAnalysis/MCAtom.h
deleted file mode 100644
index 33f3431..0000000
--- a/include/llvm/MC/MCAnalysis/MCAtom.h
+++ /dev/null

@@ -1,199 +0,0 @@
-//===-- MCAtom.h ------------------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the MCAtom class, which is used to
-// represent a contiguous region in a decoded object that is uniformly data or
-// instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_MC_MCANALYSIS_MCATOM_H
-#define LLVM_MC_MCANALYSIS_MCATOM_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Support/DataTypes.h"
-#include <vector>
-
-namespace llvm {
-
-class MCModule;
-
-class MCAtom;
-class MCTextAtom;
-class MCDataAtom;
-
-/// \brief Represents a contiguous range of either instructions (a TextAtom)
-/// or data (a DataAtom).  Address ranges are expressed as _closed_ intervals.
-class MCAtom {
-  virtual void anchor();
-public:
-  virtual ~MCAtom() {}
-
-  enum AtomKind { TextAtom, DataAtom };
-  AtomKind getKind() const { return Kind; }
-
-  /// \brief Get the start address of the atom.
-  uint64_t getBeginAddr() const { return Begin; }
-  /// \brief Get the end address, i.e. the last one inside the atom.
-  uint64_t getEndAddr() const { return End; }
-
-  /// \name Atom modification methods:
-  /// When modifying a TextAtom, keep instruction boundaries in mind.
-  /// For instance, split must me given the start address of an instruction.
-  /// @{
-
-  /// \brief Splits the atom in two at a given address.
-  /// \param SplitPt Address at which to start a new atom, splitting this one.
-  /// \returns The newly created atom starting at \p SplitPt.
-  virtual MCAtom *split(uint64_t SplitPt) = 0;
-
-  /// \brief Truncates an atom, discarding everything after \p TruncPt.
-  /// \param TruncPt Last byte address to be contained in this atom.
-  virtual void truncate(uint64_t TruncPt) = 0;
-  /// @}
-
-  /// \name Naming:
-  ///
-  /// This is mostly for display purposes, and may contain anything that hints
-  /// at what the atom contains: section or symbol name, BB start address, ..
-  /// @{
-  StringRef getName() const { return Name; }
-  void setName(StringRef NewName) { Name = NewName.str(); }
-  /// @}
-
-protected:
-  const AtomKind Kind;
-  std::string Name;
-  MCModule *Parent;
-  uint64_t Begin, End;
-
-  friend class MCModule;
-  MCAtom(AtomKind K, MCModule *P, uint64_t B, uint64_t E)
-    : Kind(K), Name("(unknown)"), Parent(P), Begin(B), End(E) { }
-
-  /// \name Atom remapping helpers
-  /// @{
-
-  /// \brief Remap the atom, using the given range, updating Begin/End.
-  /// One or both of the bounds can remain the same, but overlapping with other
-  /// atoms in the module is still forbidden.
-  void remap(uint64_t NewBegin, uint64_t NewEnd);
-
-  /// \brief Remap the atom to prepare for a truncation at TruncPt.
-  /// Equivalent to:
-  /// \code
-  ///   // Bound checks
-  ///   remap(Begin, TruncPt);
-  /// \endcode
-  void remapForTruncate(uint64_t TruncPt);
-
-  /// \brief Remap the atom to prepare for a split at SplitPt.
-  /// The bounds for the resulting atoms are returned in {L,R}{Begin,End}.
-  /// The current atom is truncated to \p LEnd.
-  void remapForSplit(uint64_t SplitPt,
-                     uint64_t &LBegin, uint64_t &LEnd,
-                     uint64_t &RBegin, uint64_t &REnd);
-  /// @}
-};
-
-/// \name Text atom
-/// @{
-
-/// \brief An entry in an MCTextAtom: a disassembled instruction.
-/// NOTE: Both the Address and Size field are actually redundant when taken in
-/// the context of the text atom, and may better be exposed in an iterator
-/// instead of stored in the atom, which would replace this class.
-class MCDecodedInst {
-public:
-  MCInst Inst;
-  uint64_t Address;
-  uint64_t Size;
-  MCDecodedInst(const MCInst &Inst, uint64_t Address, uint64_t Size)
-    : Inst(Inst), Address(Address), Size(Size) {}
-};
-
-/// \brief An atom consisting of disassembled instructions.
-class MCTextAtom : public MCAtom {
-private:
-  typedef std::vector<MCDecodedInst> InstListTy;
-  InstListTy Insts;
-
-  /// \brief The address of the next appended instruction, i.e., the
-  /// address immediately after the last instruction in the atom.
-  uint64_t NextInstAddress;
-public:
-  /// Append an instruction, expanding the atom if necessary.
-  void addInst(const MCInst &Inst, uint64_t Size);
-
-  /// \name Instruction list access
-  /// @{
-  typedef InstListTy::const_iterator const_iterator;
-  const_iterator begin() const { return Insts.begin(); }
-  const_iterator end()   const { return Insts.end(); }
-
-  const MCDecodedInst &back() const { return Insts.back(); }
-  const MCDecodedInst &at(size_t n) const { return Insts.at(n); }
-  size_t size() const { return Insts.size(); }
-  /// @}
-
-  /// \name Atom type specific split/truncate logic.
-  /// @{
-  MCTextAtom *split(uint64_t SplitPt) override;
-  void     truncate(uint64_t TruncPt) override;
-  /// @}
-
-  // Class hierarchy.
-  static bool classof(const MCAtom *A) { return A->getKind() == TextAtom; }
-private:
-  friend class MCModule;
-  // Private constructor - only callable by MCModule
-  MCTextAtom(MCModule *P, uint64_t Begin, uint64_t End)
-    : MCAtom(TextAtom, P, Begin, End), NextInstAddress(Begin) {}
-};
-/// @}
-
-/// \name Data atom
-/// @{
-
-/// \brief An entry in an MCDataAtom.
-// NOTE: This may change to a more complex type in the future.
-typedef uint8_t MCData;
-
-/// \brief An atom consising of a sequence of bytes.
-class MCDataAtom : public MCAtom {
-  std::vector<MCData> Data;
-
-public:
-  /// Append a data entry, expanding the atom if necessary.
-  void addData(const MCData &D);
-
-  /// Get a reference to the data in this atom.
-  ArrayRef<MCData> getData() const { return Data; }
-
-  /// \name Atom type specific split/truncate logic.
-  /// @{
-  MCDataAtom *split(uint64_t SplitPt) override;
-  void     truncate(uint64_t TruncPt) override;
-  /// @}
-
-  // Class hierarchy.
-  static bool classof(const MCAtom *A) { return A->getKind() == DataAtom; }
-private:
-  friend class MCModule;
-  // Private constructor - only callable by MCModule
-  MCDataAtom(MCModule *P, uint64_t Begin, uint64_t End)
-    : MCAtom(DataAtom, P, Begin, End) {
-    Data.reserve(End + 1 - Begin);
-  }
-};
-
-}
-
-#endif

diff --git a/include/llvm/MC/MCAnalysis/MCFunction.h b/include/llvm/MC/MCAnalysis/MCFunction.h
deleted file mode 100644
index 44fa450..0000000
--- a/include/llvm/MC/MCAnalysis/MCFunction.h
+++ /dev/null

@@ -1,142 +0,0 @@
-//===-- MCFunction.h --------------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the data structures to hold a CFG reconstructed from
-// machine code.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_MC_MCANALYSIS_MCFUNCTION_H
-#define LLVM_MC_MCANALYSIS_MCFUNCTION_H
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/MC/MCInst.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace llvm {
-
-class MCFunction;
-class MCModule;
-class MCTextAtom;
-
-/// \brief Basic block containing a sequence of disassembled instructions.
-/// The basic block is backed by an MCTextAtom, which holds the instructions,
-/// and the address range it covers.
-/// Create a basic block using MCFunction::createBlock.
-class MCBasicBlock {
-  const MCTextAtom *Insts;
-
-  // MCFunction owns the basic block.
-  MCFunction *Parent;
-  friend class MCFunction;
-  MCBasicBlock(const MCTextAtom &Insts, MCFunction *Parent);
-
-  /// \name Predecessors/Successors, to represent the CFG.
-  /// @{
-  typedef std::vector<const MCBasicBlock *> BasicBlockListTy;
-  BasicBlockListTy Successors;
-  BasicBlockListTy Predecessors;
-  /// @}
-public:
-
-  /// \brief Get the backing MCTextAtom, containing the instruction sequence.
-  const MCTextAtom *getInsts() const { return Insts; }
-
-  /// \name Get the owning MCFunction.
-  /// @{
-  const MCFunction *getParent() const { return Parent; }
-        MCFunction *getParent()       { return Parent; }
-  /// @}
-
-  /// MC CFG access: Predecessors/Successors.
-  /// @{
-  typedef BasicBlockListTy::const_iterator succ_const_iterator;
-  succ_const_iterator succ_begin() const { return Successors.begin(); }
-  succ_const_iterator succ_end()   const { return Successors.end(); }
-
-  typedef BasicBlockListTy::const_iterator pred_const_iterator;
-  pred_const_iterator pred_begin() const { return Predecessors.begin(); }
-  pred_const_iterator pred_end()   const { return Predecessors.end(); }
-
-  void addSuccessor(const MCBasicBlock *MCBB);
-  bool isSuccessor(const MCBasicBlock *MCBB) const;
-
-  void addPredecessor(const MCBasicBlock *MCBB);
-  bool isPredecessor(const MCBasicBlock *MCBB) const;
-
-  /// \brief Split block, mirrorring NewAtom = Insts->split(..).
-  /// This moves all successors to \p SplitBB, and
-  /// adds a fallthrough to it.
-  /// \p SplitBB The result of splitting Insts, a basic block directly following
-  /// this basic block.
-  void splitBasicBlock(MCBasicBlock *SplitBB);
-  /// @}
-};
-
-/// \brief Represents a function in machine code, containing MCBasicBlocks.
-/// MCFunctions are created by MCModule.
-class MCFunction {
-  MCFunction           (const MCFunction&) LLVM_DELETED_FUNCTION;
-  MCFunction& operator=(const MCFunction&) LLVM_DELETED_FUNCTION;
-
-  std::string Name;
-  MCModule *ParentModule;
-  typedef std::vector<std::unique_ptr<MCBasicBlock>> BasicBlockListTy;
-  BasicBlockListTy Blocks;
-
-  // MCModule owns the function.
-  friend class MCModule;
-  MCFunction(StringRef Name, MCModule *Parent);
-
-public:
-  /// \brief Create an MCBasicBlock backed by Insts and add it to this function.
-  /// \param Insts Sequence of straight-line code backing the basic block.
-  /// \returns The newly created basic block.
-  MCBasicBlock &createBlock(const MCTextAtom &Insts);
-
-  StringRef getName() const { return Name; }
-
-  /// \name Get the owning MC Module.
-  /// @{
-  const MCModule *getParent() const { return ParentModule; }
-        MCModule *getParent()       { return ParentModule; }
-  /// @}
-
-  /// \name Access to the function's basic blocks. No ordering is enforced,
-  /// except that the first block is the entry block.
-  /// @{
-  /// \brief Get the entry point basic block.
-  const MCBasicBlock *getEntryBlock() const { return front(); }
-        MCBasicBlock *getEntryBlock()       { return front(); }
-
-  bool empty() const { return Blocks.empty(); }
-
-  typedef BasicBlockListTy::const_iterator const_iterator;
-  typedef BasicBlockListTy::      iterator       iterator;
-  const_iterator begin() const { return Blocks.begin(); }
-        iterator begin()       { return Blocks.begin(); }
-  const_iterator   end() const { return Blocks.end(); }
-        iterator   end()       { return Blocks.end(); }
-
-  const MCBasicBlock* front() const { return Blocks.front().get(); }
-        MCBasicBlock* front()       { return Blocks.front().get(); }
-  const MCBasicBlock*  back() const { return Blocks.back().get(); }
-        MCBasicBlock*  back()       { return Blocks.back().get(); }
-
-  /// \brief Find the basic block, if any, that starts at \p StartAddr.
-  const MCBasicBlock *find(uint64_t StartAddr) const;
-        MCBasicBlock *find(uint64_t StartAddr);
-  /// @}
-};
-
-}
-
-#endif

diff --git a/include/llvm/MC/MCAnalysis/MCModule.h b/include/llvm/MC/MCAnalysis/MCModule.h
deleted file mode 100644
index cf7e2c0..0000000
--- a/include/llvm/MC/MCAnalysis/MCModule.h
+++ /dev/null

@@ -1,134 +0,0 @@
-//===-- MCModule.h - MCModule class -----------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the MCModule class, which is used to
-// represent a complete, disassembled object file or executable.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_MC_MCANALYSIS_MCMODULE_H
-#define LLVM_MC_MCANALYSIS_MCMODULE_H
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataTypes.h"
-#include <memory>
-#include <vector>
-
-namespace llvm {
-
-class MCAtom;
-class MCBasicBlock;
-class MCDataAtom;
-class MCFunction;
-class MCObjectDisassembler;
-class MCTextAtom;
-
-/// \brief A completely disassembled object file or executable.
-/// It comprises a list of MCAtom's, each representing a contiguous range of
-/// either instructions or data.
-/// An MCModule is created using MCObjectDisassembler::buildModule.
-class MCModule {
-  /// \name Atom tracking
-  /// @{
-
-  /// \brief Atoms in this module, sorted by begin address.
-  /// FIXME: This doesn't handle overlapping atoms (which happen when a basic
-  /// block starts in the middle of an instruction of another basic block.)
-  typedef std::vector<MCAtom*> AtomListTy;
-  AtomListTy Atoms;
-
-  // For access to map/remap.
-  friend class MCAtom;
-
-  /// \brief Remap \p Atom to the given range, and update its Begin/End fields.
-  /// \param Atom An atom belonging to this module.
-  /// An atom should always use this method to update its bounds, because this
-  /// enables the owning MCModule to keep track of its atoms.
-  void remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd);
-
-  /// \brief Insert an atom in the module, using its Begin and End addresses.
-  void map(MCAtom *NewAtom);
-  /// @}
-
-  /// \name Basic block tracking
-  /// @{
-  typedef std::vector<MCBasicBlock*> BBsByAtomTy;
-  BBsByAtomTy BBsByAtom;
-
-  // For access to basic block > atom tracking.
-  friend class MCBasicBlock;
-  friend class MCTextAtom;
-
-  /// \brief Keep track of \p BBBackedByAtom as being backed by \p Atom.
-  /// This is used to update succs/preds when \p Atom is split.
-  void trackBBForAtom(const MCTextAtom *Atom, MCBasicBlock *BBBackedByAtom);
-  void splitBasicBlocksForAtom(const MCTextAtom *TA, const MCTextAtom *NewTA);
-  /// @}
-
-  /// \name Function tracking
-  /// @{
-  typedef std::vector<std::unique_ptr<MCFunction>> FunctionListTy;
-  FunctionListTy Functions;
-  /// @}
-
-  /// The address of the entrypoint function.
-  uint64_t Entrypoint;
-
-  MCModule           (const MCModule &) LLVM_DELETED_FUNCTION;
-  MCModule& operator=(const MCModule &) LLVM_DELETED_FUNCTION;
-
-  // MCObjectDisassembler creates MCModules.
-  friend class MCObjectDisassembler;
-
-public:
-  MCModule();
-  ~MCModule();
-
-  /// \name Create a new MCAtom covering the specified offset range.
-  /// @{
-  MCTextAtom *createTextAtom(uint64_t Begin, uint64_t End);
-  MCDataAtom *createDataAtom(uint64_t Begin, uint64_t End);
-  /// @}
-
-  /// \name Access to the owned atom list, ordered by begin address.
-  /// @{
-  const MCAtom *findAtomContaining(uint64_t Addr) const;
-        MCAtom *findAtomContaining(uint64_t Addr);
-  const MCAtom *findFirstAtomAfter(uint64_t Addr) const;
-        MCAtom *findFirstAtomAfter(uint64_t Addr);
-
-  typedef AtomListTy::const_iterator const_atom_iterator;
-  typedef AtomListTy::      iterator       atom_iterator;
-  const_atom_iterator atom_begin() const { return Atoms.begin(); }
-        atom_iterator atom_begin()       { return Atoms.begin(); }
-  const_atom_iterator atom_end()   const { return Atoms.end(); }
-        atom_iterator atom_end()         { return Atoms.end(); }
-  /// @}
-
-  /// \brief Create a new MCFunction.
-  MCFunction *createFunction(StringRef Name);
-
-  /// \name Access to the owned function list.
-  /// @{
-  typedef FunctionListTy::const_iterator const_func_iterator;
-  typedef FunctionListTy::      iterator       func_iterator;
-  const_func_iterator func_begin() const { return Functions.begin(); }
-        func_iterator func_begin()       { return Functions.begin(); }
-  const_func_iterator func_end()   const { return Functions.end(); }
-        func_iterator func_end()         { return Functions.end(); }
-  /// @}
-
-  /// \brief Get the address of the entrypoint function, or 0 if there is none.
-  uint64_t getEntrypoint() const { return Entrypoint; }
-};
-
-}
-
-#endif

diff --git a/include/llvm/MC/MCAnalysis/MCModuleYAML.h b/include/llvm/MC/MCAnalysis/MCModuleYAML.h
deleted file mode 100644
index 4856277..0000000
--- a/include/llvm/MC/MCAnalysis/MCModuleYAML.h
+++ /dev/null

@@ -1,40 +0,0 @@
-//===- MCModuleYAML.h - MCModule YAMLIO implementation ----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This file declares classes for handling the YAML representation
-/// of MCModule.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_MC_MCANALYSIS_MCMODULEYAML_H
-#define LLVM_MC_MCANALYSIS_MCMODULEYAML_H
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/MC/MCAnalysis/MCModule.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-
-class MCInstrInfo;
-class MCRegisterInfo;
-
-/// \brief Dump a YAML representation of the MCModule \p MCM to \p OS.
-/// \returns The empty string on success, an error message on failure.
-StringRef mcmodule2yaml(raw_ostream &OS, const MCModule &MCM,
-                        const MCInstrInfo &MII, const MCRegisterInfo &MRI);
-
-/// \brief Creates a new module and returns it in \p MCM.
-/// \returns The empty string on success, an error message on failure.
-StringRef yaml2mcmodule(std::unique_ptr<MCModule> &MCM, StringRef YamlContent,
-                        const MCInstrInfo &MII, const MCRegisterInfo &MRI);
-
-} // end namespace llvm
-
-#endif

diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index 82b65fd..15a956b 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h

@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCAsmBack.h - MC Asm Backend --------------------*- C++ -*-===//
+//===-- llvm/MC/MCAsmBackend.h - MC Asm Backend -----------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -146,7 +146,7 @@
 
   /// \brief Generate the compact unwind encoding for the CFI instructions.
   virtual uint32_t
-  generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction>) const {
+      generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction>) const {
     return 0;
   }
 };

diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index 06e473d..4f38aac 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h

@@ -31,22 +31,22 @@
 
 namespace WinEH {
 enum class EncodingType {
-  ET_Invalid, /// Invalid
-  ET_Alpha,   /// Windows Alpha
-  ET_Alpha64, /// Windows AXP64
-  ET_ARM,     /// Windows NT (Windows on ARM)
-  ET_CE,      /// Windows CE ARM, PowerPC, SH3, SH4
-  ET_Itanium, /// Windows x64, Windows Itanium (IA-64)
-  ET_MIPS = ET_Alpha,
+  Invalid, /// Invalid
+  Alpha,   /// Windows Alpha
+  Alpha64, /// Windows AXP64
+  ARM,     /// Windows NT (Windows on ARM)
+  CE,      /// Windows CE ARM, PowerPC, SH3, SH4
+  Itanium, /// Windows x64, Windows Itanium (IA-64)
+  MIPS = Alpha,
 };
 }
 
 enum class ExceptionHandling {
-  None,     /// No exception support
-  DwarfCFI, /// DWARF-like instruction based exceptions
-  SjLj,     /// setjmp/longjmp based exceptions
-  ARM,      /// ARM EHABI
-  WinEH,    /// Windows Exception Handling
+  None,         /// No exception support
+  DwarfCFI,     /// DWARF-like instruction based exceptions
+  SjLj,         /// setjmp/longjmp based exceptions
+  ARM,          /// ARM EHABI
+  ItaniumWinEH, /// Itanium EH built on Windows unwind info (.pdata and .xdata)
 };
 
 namespace LCOMM {
@@ -87,16 +87,11 @@
   bool HasMachoTBSSDirective;
 
   /// True if the compiler should emit a ".reference .constructors_used" or
-  /// ".reference .destructors_used" directive after the a static ctor/dtor
+  /// ".reference .destructors_used" directive after the static ctor/dtor
   /// list.  This directive is only emitted in Static relocation model.  Default
   /// is false.
   bool HasStaticCtorDtorReferenceInStaticMode;
 
-  /// True if the linker has a bug and requires that the debug_line section be
-  /// of a minimum size. In practice such a linker requires a non-empty line
-  /// sequence if a file is present.  Default to false.
-  bool LinkerRequiresNonEmptyDwarfLines;
-
   /// This is the maximum possible length of an instruction, which is needed to
   /// compute the size of an inline asm.  Defaults to 4.
   unsigned MaxInstLength;
@@ -223,8 +218,12 @@
   /// This is the directive used to declare a global entity.  Defaults to NULL.
   const char *GlobalDirective;
 
-  /// True if the assembler supports the .set directive.  Defaults to true.
-  bool HasSetDirective;
+  /// True if the expression
+  ///   .long f - g
+  /// uses a relocation but it can be suppressed by writing
+  ///   a = f - g
+  ///   .long a
+  bool SetDirectiveSuppressesReloc;
 
   /// False if the assembler requires that we use
   /// \code
@@ -295,9 +294,6 @@
 
   //===--- Dwarf Emission Directives -----------------------------------===//
 
-  /// True if target asm supports leb128 directives.  Defaults to false.
-  bool HasLEB128;
-
   /// True if target supports emission of debugging information.  Defaults to
   /// false.
   bool SupportsDebugInformation;
@@ -404,9 +400,6 @@
   bool hasStaticCtorDtorReferenceInStaticMode() const {
     return HasStaticCtorDtorReferenceInStaticMode;
   }
-  bool getLinkerRequiresNonEmptyDwarfLines() const {
-    return LinkerRequiresNonEmptyDwarfLines;
-  }
   unsigned getMaxInstLength() const { return MaxInstLength; }
   unsigned getMinInstAlignment() const { return MinInstAlignment; }
   bool getDollarIsPC() const { return DollarIsPC; }
@@ -445,7 +438,9 @@
   bool getAlignmentIsInBytes() const { return AlignmentIsInBytes; }
   unsigned getTextAlignFillValue() const { return TextAlignFillValue; }
   const char *getGlobalDirective() const { return GlobalDirective; }
-  bool hasSetDirective() const { return HasSetDirective; }
+  bool doesSetDirectiveSuppressesReloc() const {
+    return SetDirectiveSuppressesReloc;
+  }
   bool hasAggressiveSymbolFolding() const { return HasAggressiveSymbolFolding; }
   bool getCOMMDirectiveAlignmentIsInBytes() const {
     return COMMDirectiveAlignmentIsInBytes;
@@ -471,19 +466,22 @@
   MCSymbolAttr getProtectedVisibilityAttr() const {
     return ProtectedVisibilityAttr;
   }
-  bool hasLEB128() const { return HasLEB128; }
   bool doesSupportDebugInformation() const { return SupportsDebugInformation; }
   bool doesSupportExceptionHandling() const {
     return ExceptionsType != ExceptionHandling::None;
   }
   ExceptionHandling getExceptionHandlingType() const { return ExceptionsType; }
   WinEH::EncodingType getWinEHEncodingType() const { return WinEHEncodingType; }
-  bool isExceptionHandlingDwarf() const {
+
+  /// Return true if the exception handling type uses the language-specific data
+  /// area (LSDA) format specified by the Itanium C++ ABI.
+  bool usesItaniumLSDAForExceptions() const {
     return (ExceptionsType == ExceptionHandling::DwarfCFI ||
             ExceptionsType == ExceptionHandling::ARM ||
-            // Windows handler data still uses DWARF LSDA encoding.
-            ExceptionsType == ExceptionHandling::WinEH);
+            // This Windows EH type uses the Itanium LSDA encoding.
+            ExceptionsType == ExceptionHandling::ItaniumWinEH);
   }
+
   bool doesDwarfUseRelocationsAcrossSections() const {
     return DwarfUsesRelocationsAcrossSections;
   }

diff --git a/include/llvm/MC/MCAsmInfoELF.h b/include/llvm/MC/MCAsmInfoELF.h
index 27fea84..7bd2460 100644
--- a/include/llvm/MC/MCAsmInfoELF.h
+++ b/include/llvm/MC/MCAsmInfoELF.h

@@ -15,6 +15,9 @@
 namespace llvm {
 class MCAsmInfoELF : public MCAsmInfo {
   virtual void anchor();
+  const MCSection *
+  getNonexecutableStackSection(MCContext &Ctx) const override final;
+
 protected:
   MCAsmInfoELF();
 };

diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index 1cb34c2..681a317 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h

@@ -11,6 +11,7 @@
 #define LLVM_MC_MCASSEMBLER_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/ilist.h"
@@ -593,7 +594,10 @@
   unsigned Alignment;
 
   /// \brief Keeping track of bundle-locked state.
-  BundleLockStateType BundleLockState; 
+  BundleLockStateType BundleLockState;
+
+  /// \brief Current nesting depth of bundle_lock directives.
+  unsigned BundleLockNestingDepth;
 
   /// \brief We've seen a bundle_lock directive but not its first instruction
   /// yet.
@@ -665,9 +669,7 @@
     return BundleLockState;
   }
 
-  void setBundleLockState(BundleLockStateType NewState) {
-    BundleLockState = NewState;
-  }
+  void setBundleLockState(BundleLockStateType NewState);
 
   bool isBundleGroupBeforeFirstInst() const {
     return BundleGroupBeforeFirstInst;
@@ -684,34 +686,27 @@
 
 // FIXME: Same concerns as with SectionData.
 class MCSymbolData : public ilist_node<MCSymbolData> {
-public:
   const MCSymbol *Symbol;
 
-  /// Fragment - The fragment this symbol's value is relative to, if any.
-  MCFragment *Fragment;
+  /// Fragment - The fragment this symbol's value is relative to, if any. Also
+  /// records whether this symbol is visible outside this translation unit
+  /// (bit 0) and whether it is private extern (bit 1).
+  PointerIntPair<MCFragment *, 2> Fragment;
 
-  /// Offset - The offset to apply to the fragment address to form this symbol's
-  /// value.
-  uint64_t Offset;
+  union {
+    /// Offset - The offset to apply to the fragment address to form this
+    /// symbol's value.
+    uint64_t Offset;
 
-  /// IsExternal - True if this symbol is visible outside this translation
-  /// unit.
-  unsigned IsExternal : 1;
-
-  /// IsPrivateExtern - True if this symbol is private extern.
-  unsigned IsPrivateExtern : 1;
-
-  /// CommonSize - The size of the symbol, if it is 'common', or 0.
-  //
-  // FIXME: Pack this in with other fields? We could put it in offset, since a
-  // common symbol can never get a definition.
-  uint64_t CommonSize;
+    /// CommonSize - The size of the symbol, if it is 'common'.
+    uint64_t CommonSize;
+  };
 
   /// SymbolSize - An expression describing how to calculate the size of
   /// a symbol. If a symbol has no size this field will be NULL.
   const MCExpr *SymbolSize;
 
-  /// CommonAlign - The alignment of the symbol, if it is 'common'.
+  /// CommonAlign - The alignment of the symbol, if it is 'common', or -1.
   //
   // FIXME: Pack this in with other fields?
   unsigned CommonAlign;
@@ -734,30 +729,41 @@
 
   const MCSymbol &getSymbol() const { return *Symbol; }
 
-  MCFragment *getFragment() const { return Fragment; }
-  void setFragment(MCFragment *Value) { Fragment = Value; }
+  MCFragment *getFragment() const { return Fragment.getPointer(); }
+  void setFragment(MCFragment *Value) { Fragment.setPointer(Value); }
 
-  uint64_t getOffset() const { return Offset; }
-  void setOffset(uint64_t Value) { Offset = Value; }
+  uint64_t getOffset() const {
+    assert(!isCommon());
+    return Offset;
+  }
+  void setOffset(uint64_t Value) {
+    assert(!isCommon());
+    Offset = Value;
+  }
 
   /// @}
   /// @name Symbol Attributes
   /// @{
 
-  bool isExternal() const { return IsExternal; }
-  void setExternal(bool Value) { IsExternal = Value; }
+  bool isExternal() const { return Fragment.getInt() & 1; }
+  void setExternal(bool Value) {
+    Fragment.setInt((Fragment.getInt() & ~1) | unsigned(Value));
+  }
 
-  bool isPrivateExtern() const { return IsPrivateExtern; }
-  void setPrivateExtern(bool Value) { IsPrivateExtern = Value; }
+  bool isPrivateExtern() const { return Fragment.getInt() & 2; }
+  void setPrivateExtern(bool Value) {
+    Fragment.setInt((Fragment.getInt() & ~2) | (unsigned(Value) << 1));
+  }
 
   /// isCommon - Is this a 'common' symbol.
-  bool isCommon() const { return CommonSize != 0; }
+  bool isCommon() const { return CommonAlign != -1U; }
 
   /// setCommon - Mark this symbol as being 'common'.
   ///
   /// \param Size - The size of the symbol.
   /// \param Align - The alignment of the symbol.
   void setCommon(uint64_t Size, unsigned Align) {
+    assert(getOffset() == 0);
     CommonSize = Size;
     CommonAlign = Align;
   }
@@ -910,7 +916,6 @@
   unsigned BundleAlignSize;
 
   unsigned RelaxAll : 1;
-  unsigned NoExecStack : 1;
   unsigned SubsectionsViaSymbols : 1;
 
   /// ELF specific e_header flags
@@ -1056,9 +1061,6 @@
   bool getRelaxAll() const { return RelaxAll; }
   void setRelaxAll(bool Value) { RelaxAll = Value; }
 
-  bool getNoExecStack() const { return NoExecStack; }
-  void setNoExecStack(bool Value) { NoExecStack = Value; }
-
   bool isBundlingEnabled() const {
     return BundleAlignSize != 0;
   }

diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index eb0340f..f209448 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h

@@ -73,6 +73,10 @@
     /// Symbols - Bindings of names to symbols.
     SymbolTable Symbols;
 
+    /// ELF sections can have a corresponding symbol. This maps one to the
+    /// other.
+    DenseMap<const MCSectionELF*, MCSymbol*> SectionSymbols;
+
     /// A maping from a local label number and an instance count to a symbol.
     /// For example, in the assembly
     ///     1:
@@ -231,6 +235,8 @@
     MCSymbol *GetOrCreateSymbol(StringRef Name);
     MCSymbol *GetOrCreateSymbol(const Twine &Name);
 
+    MCSymbol *getOrCreateSectionSymbol(const MCSectionELF &Section);
+
     /// LookupSymbol - Get the symbol for \p Name, or null.
     MCSymbol *LookupSymbol(StringRef Name) const;
     MCSymbol *LookupSymbol(const Twine &Name) const;
@@ -284,6 +290,13 @@
 
     const MCSectionCOFF *getCOFFSection(StringRef Section);
 
+    /// Gets or creates a section equivalent to Sec that is associated with the
+    /// section containing KeySym. For example, to create a debug info section
+    /// associated with an inline function, pass the normal debug info section
+    /// as Sec and the function symbol as KeySym.
+    const MCSectionCOFF *getAssociativeCOFFSection(const MCSectionCOFF *Sec,
+                                                   const MCSymbol *KeySym);
+
     /// @}
 
     /// @name Dwarf Management

diff --git a/include/llvm/MC/MCDisassembler.h b/include/llvm/MC/MCDisassembler.h
index 9d441bb..d6b0a30 100644
--- a/include/llvm/MC/MCDisassembler.h
+++ b/include/llvm/MC/MCDisassembler.h

@@ -10,6 +10,7 @@
 #define LLVM_MC_MCDISASSEMBLER_H
 
 #include "llvm-c/Disassembler.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/MC/MCRelocationInfo.h"
 #include "llvm/MC/MCSymbolizer.h"
 #include "llvm/Support/DataTypes.h"
@@ -18,12 +19,11 @@
 
 class MCInst;
 class MCSubtargetInfo;
-class MemoryObject;
 class raw_ostream;
 class MCContext;
 
-/// MCDisassembler - Superclass for all disassemblers.  Consumes a memory region
-///   and provides an array of assembly instructions.
+/// Superclass for all disassemblers. Consumes a memory region and provides an
+/// array of assembly instructions.
 class MCDisassembler {
 public:
   /// Ternary decode status. Most backends will just use Fail and
@@ -54,34 +54,31 @@
     Success = 3
   };
 
-  /// Constructor     - Performs initial setup for the disassembler.
   MCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
     : Ctx(Ctx), STI(STI), Symbolizer(), CommentStream(nullptr) {}
 
   virtual ~MCDisassembler();
 
-  /// getInstruction  - Returns the disassembly of a single instruction.
+  /// Returns the disassembly of a single instruction.
   ///
-  /// @param instr    - An MCInst to populate with the contents of the
+  /// @param Instr    - An MCInst to populate with the contents of the
   ///                   instruction.
-  /// @param size     - A value to populate with the size of the instruction, or
+  /// @param Size     - A value to populate with the size of the instruction, or
   ///                   the number of bytes consumed while attempting to decode
   ///                   an invalid instruction.
-  /// @param region   - The memory object to use as a source for machine code.
-  /// @param address  - The address, in the memory space of region, of the first
+  /// @param Address  - The address, in the memory space of region, of the first
   ///                   byte of the instruction.
-  /// @param vStream  - The stream to print warnings and diagnostic messages on.
-  /// @param cStream  - The stream to print comments and annotations on.
+  /// @param VStream  - The stream to print warnings and diagnostic messages on.
+  /// @param CStream  - The stream to print comments and annotations on.
   /// @return         - MCDisassembler::Success if the instruction is valid,
   ///                   MCDisassembler::SoftFail if the instruction was
   ///                                            disassemblable but invalid,
   ///                   MCDisassembler::Fail if the instruction was invalid.
-  virtual DecodeStatus  getInstruction(MCInst& instr,
-                                       uint64_t& size,
-                                       const MemoryObject &region,
-                                       uint64_t address,
-                                       raw_ostream &vStream,
-                                       raw_ostream &cStream) const = 0;
+  virtual DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                                      ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      raw_ostream &VStream,
+                                      raw_ostream &CStream) const = 0;
+
 private:
   MCContext &Ctx;
 

diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index 6cd9a9a..a221d26 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h

@@ -457,7 +457,7 @@
     return Offset;
   }
 
-  const StringRef getValues() const {
+  StringRef getValues() const {
     assert(Operation == OpEscape);
     return StringRef(&Values[0], Values.size());
   }
@@ -466,13 +466,15 @@
 struct MCDwarfFrameInfo {
   MCDwarfFrameInfo()
       : Begin(nullptr), End(nullptr), Personality(nullptr), Lsda(nullptr),
-        Instructions(), PersonalityEncoding(), LsdaEncoding(0),
-        CompactUnwindEncoding(0), IsSignalFrame(false), IsSimple(false) {}
+        Instructions(), CurrentCfaRegister(0), PersonalityEncoding(),
+        LsdaEncoding(0), CompactUnwindEncoding(0), IsSignalFrame(false),
+        IsSimple(false) {}
   MCSymbol *Begin;
   MCSymbol *End;
   const MCSymbol *Personality;
   const MCSymbol *Lsda;
   std::vector<MCCFIInstruction> Instructions;
+  unsigned CurrentCfaRegister;
   unsigned PersonalityEncoding;
   unsigned LsdaEncoding;
   uint32_t CompactUnwindEncoding;

diff --git a/include/llvm/MC/MCELF.h b/include/llvm/MC/MCELF.h
index 7e59911..294a51b 100644
--- a/include/llvm/MC/MCELF.h
+++ b/include/llvm/MC/MCELF.h

@@ -27,9 +27,9 @@
   static void SetType(MCSymbolData &SD, unsigned Type);
   static unsigned GetType(const MCSymbolData &SD);
   static void SetVisibility(MCSymbolData &SD, unsigned Visibility);
-  static unsigned GetVisibility(MCSymbolData &SD);
+  static unsigned GetVisibility(const MCSymbolData &SD);
   static void setOther(MCSymbolData &SD, unsigned Other);
-  static unsigned getOther(MCSymbolData &SD);
+  static unsigned getOther(const MCSymbolData &SD);
 };
 
 }

diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h
index 127f162..421e7a0 100644
--- a/include/llvm/MC/MCELFObjectWriter.h
+++ b/include/llvm/MC/MCELFObjectWriter.h

@@ -22,6 +22,7 @@
 class MCObjectWriter;
 class MCSectionData;
 class MCSymbol;
+class MCSymbolData;
 class MCValue;
 
 class MCELFObjectTargetWriter {
@@ -54,7 +55,8 @@
   virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                                 bool IsPCRel) const = 0;
 
-  virtual bool needsRelocateWithSymbol(unsigned Type) const;
+  virtual bool needsRelocateWithSymbol(const MCSymbolData &SD,
+                                       unsigned Type) const;
 
   /// @name Accessors
   /// @{

diff --git a/include/llvm/MC/MCELFStreamer.h b/include/llvm/MC/MCELFStreamer.h
index 66729fe..ab6c5e3 100644
--- a/include/llvm/MC/MCELFStreamer.h
+++ b/include/llvm/MC/MCELFStreamer.h

@@ -41,10 +41,18 @@
 
   virtual ~MCELFStreamer();
 
+  /// state management
+  void reset() override {
+    LocalCommons.clear();
+    BindingExplicitlySet.clear();
+    SeenIdent = false;
+    MCObjectStreamer::reset();
+  }
+
   /// @name MCStreamer Interface
   /// @{
 
-  void InitSections() override;
+  void InitSections(bool NoExecStack) override;
   void ChangeSection(const MCSection *Section,
                      const MCExpr *Subsection) override;
   void EmitLabel(MCSymbol *Symbol) override;
@@ -107,8 +115,7 @@
 
 MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
                                     raw_ostream &OS, MCCodeEmitter *Emitter,
-                                    bool RelaxAll, bool NoExecStack,
-                                    bool IsThumb);
+                                    bool RelaxAll, bool IsThumb);
 
 } // end namespace llvm
 

diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index e96ecb4..f0e8611 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h

@@ -19,6 +19,7 @@
 class MCAsmLayout;
 class MCAssembler;
 class MCContext;
+class MCFixup;
 class MCSection;
 class MCSectionData;
 class MCStreamer;
@@ -49,11 +50,17 @@
   bool EvaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
                           const MCAsmLayout *Layout,
                           const SectionAddrMap *Addrs) const;
+
+  bool evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
+                          const MCAsmLayout *Layout,
+                          const SectionAddrMap *Addrs, bool InSet) const;
+
 protected:
   explicit MCExpr(ExprKind _Kind) : Kind(_Kind) {}
 
   bool EvaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
                                  const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup,
                                  const SectionAddrMap *Addrs, bool InSet,
                                  bool ForceVarExpansion) const;
 
@@ -87,13 +94,17 @@
   bool EvaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const;
   bool EvaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout) const;
 
+  int64_t evaluateKnownAbsolute(const MCAsmLayout &Layout) const;
+
   /// EvaluateAsRelocatable - Try to evaluate the expression to a relocatable
   /// value, i.e. an expression of the fixed form (a - b + constant).
   ///
   /// @param Res - The relocatable value, if evaluation succeeds.
   /// @param Layout - The assembler layout object to use for evaluating values.
+  /// @param Fixup - The Fixup object if available.
   /// @result - True on success.
-  bool EvaluateAsRelocatable(MCValue &Res, const MCAsmLayout *Layout) const;
+  bool EvaluateAsRelocatable(MCValue &Res, const MCAsmLayout *Layout,
+                             const MCFixup *Fixup) const;
 
   /// \brief Try to evaluate the expression to the form (a - b + constant) where
   /// neither a nor b are variables.
@@ -101,7 +112,8 @@
   /// This is a more aggressive variant of EvaluateAsRelocatable. The intended
   /// use is for when relocations are not available, like the symbol value in
   /// the symbol table.
-  bool EvaluateAsValue(MCValue &Res, const MCAsmLayout *Layout) const;
+  bool EvaluateAsValue(MCValue &Res, const MCAsmLayout *Layout,
+                       const MCFixup *Fixup) const;
 
   /// FindAssociatedSection - Find the "associated section" for this expression,
   /// which is currently defined as the absolute section for constants, or
@@ -238,6 +250,7 @@
     VK_PPC_GOT_TLSLD_HI,   // symbol@got@tlsld@h
     VK_PPC_GOT_TLSLD_HA,   // symbol@got@tlsld@ha
     VK_PPC_TLSLD,          // symbol@tlsld
+    VK_PPC_LOCAL,          // symbol@local
 
     VK_Mips_GPREL,
     VK_Mips_GOT_CALL,
@@ -270,21 +283,20 @@
   };
 
 private:
+  /// The symbol reference modifier.
+  const unsigned Kind : 16;
+
+  /// Specifies how the variant kind should be printed.
+  const unsigned UseParensForSymbolVariant : 1;
+
+  // FIXME: Remove this bit.
+  const unsigned HasSubsectionsViaSymbols : 1;
+
   /// The symbol being referenced.
   const MCSymbol *Symbol;
 
-  /// The symbol reference modifier.
-  const VariantKind Kind;
-
-  /// MCAsmInfo that is used to print symbol variants correctly.
-  const MCAsmInfo *MAI;
-
-  explicit MCSymbolRefExpr(const MCSymbol *_Symbol, VariantKind _Kind,
-                           const MCAsmInfo *_MAI)
-    : MCExpr(MCExpr::SymbolRef), Symbol(_Symbol), Kind(_Kind), MAI(_MAI) {
-    assert(Symbol);
-    assert(MAI);
-  }
+  explicit MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind,
+                           const MCAsmInfo *MAI);
 
 public:
   /// @name Construction
@@ -304,9 +316,12 @@
   /// @{
 
   const MCSymbol &getSymbol() const { return *Symbol; }
-  const MCAsmInfo &getMCAsmInfo() const { return *MAI; }
 
-  VariantKind getKind() const { return Kind; }
+  VariantKind getKind() const { return static_cast<VariantKind>(Kind); }
+
+  void printVariantKind(raw_ostream &OS) const;
+
+  bool hasSubsectionsViaSymbols() const { return HasSubsectionsViaSymbols; }
 
   /// @}
   /// @name Static Utility Functions
@@ -524,7 +539,8 @@
 
   virtual void PrintImpl(raw_ostream &OS) const = 0;
   virtual bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                         const MCAsmLayout *Layout) const = 0;
+                                         const MCAsmLayout *Layout,
+                                         const MCFixup *Fixup) const = 0;
   virtual void visitUsedExpr(MCStreamer& Streamer) const = 0;
   virtual const MCSection *FindAssociatedSection() const = 0;
 

diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h
index 7f55b29..95124c3 100644
--- a/include/llvm/MC/MCInstPrinter.h
+++ b/include/llvm/MC/MCInstPrinter.h

@@ -1,4 +1,4 @@
-//===-- MCInstPrinter.h - Convert an MCInst to target assembly syntax -----===//
+//===- MCInstPrinter.h - MCInst to target assembly syntax -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //

diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h
index 101778e..d4f93c1 100644
--- a/include/llvm/MC/MCInstrDesc.h
+++ b/include/llvm/MC/MCInstrDesc.h

@@ -125,7 +125,10 @@
     Rematerializable,
     CheapAsAMove,
     ExtraSrcRegAllocReq,
-    ExtraDefRegAllocReq
+    ExtraDefRegAllocReq,
+    RegSequence,
+    ExtractSubreg,
+    InsertSubreg
   };
 }
 
@@ -357,6 +360,47 @@
     return Flags & (1 << MCID::FoldableAsLoad);
   }
 
+  /// \brief Return true if this instruction behaves
+  /// the same way as the generic REG_SEQUENCE instructions.
+  /// E.g., on ARM,
+  /// dX = VMOVDRR rY, rZ
+  /// is equivalent to
+  /// dX = REG_SEQUENCE rY, ssub_0, rZ, ssub_1.
+  ///
+  /// Note that for the optimizers to be able to take advantage of
+  /// this property, TargetInstrInfo::getRegSequenceLikeInputs has to be
+  /// overridden accordingly.
+  bool isRegSequenceLike() const { return Flags & (1 << MCID::RegSequence); }
+
+  /// \brief Return true if this instruction behaves
+  /// the same way as the generic EXTRACT_SUBREG instructions.
+  /// E.g., on ARM,
+  /// rX, rY = VMOVRRD dZ
+  /// is equivalent to two EXTRACT_SUBREG:
+  /// rX = EXTRACT_SUBREG dZ, ssub_0
+  /// rY = EXTRACT_SUBREG dZ, ssub_1
+  ///
+  /// Note that for the optimizers to be able to take advantage of
+  /// this property, TargetInstrInfo::getExtractSubregLikeInputs has to be
+  /// overridden accordingly.
+  bool isExtractSubregLike() const {
+    return Flags & (1 << MCID::ExtractSubreg);
+  }
+
+  /// \brief Return true if this instruction behaves
+  /// the same way as the generic INSERT_SUBREG instructions.
+  /// E.g., on ARM,
+  /// dX = VSETLNi32 dY, rZ, Imm
+  /// is equivalent to an INSERT_SUBREG:
+  /// dX = INSERT_SUBREG dY, rZ, translateImmToSubIdx(Imm)
+  ///
+  /// Note that for the optimizers to be able to take advantage of
+  /// this property, TargetInstrInfo::getInsertSubregLikeInputs has to be
+  /// overridden accordingly.
+  bool isInsertSubregLike() const {
+    return Flags & (1 << MCID::InsertSubreg);
+  }
+
   //===--------------------------------------------------------------------===//
   // Side Effect Analysis
   //===--------------------------------------------------------------------===//

diff --git a/include/llvm/MC/MCInstrItineraries.h b/include/llvm/MC/MCInstrItineraries.h
index 5104345..94d599f 100644
--- a/include/llvm/MC/MCInstrItineraries.h
+++ b/include/llvm/MC/MCInstrItineraries.h

@@ -22,7 +22,7 @@
 namespace llvm {
 
 //===----------------------------------------------------------------------===//
-/// Instruction stage - These values represent a non-pipelined step in
+/// These values represent a non-pipelined step in
 /// the execution of an instruction.  Cycles represents the number of
 /// discrete time slots needed to complete the stage.  Units represent
 /// the choice of functional units that can be used to complete the
@@ -67,12 +67,12 @@
   int NextCycles_;   ///< Number of machine cycles to next stage
   ReservationKinds Kind_; ///< Kind of the FU reservation
 
-  /// getCycles - returns the number of cycles the stage is occupied
+  /// Returns the number of cycles the stage is occupied.
   unsigned getCycles() const {
     return Cycles_;
   }
 
-  /// getUnits - returns the choice of FUs
+  /// Returns the choice of FUs.
   unsigned getUnits() const {
     return Units_;
   }
@@ -81,7 +81,7 @@
     return Kind_;
   }
 
-  /// getNextCycles - returns the number of cycles from the start of
+  /// Returns the number of cycles from the start of
   /// this stage to the start of the next stage in the itinerary
   unsigned getNextCycles() const {
     return (NextCycles_ >= 0) ? (unsigned)NextCycles_ : Cycles_;
@@ -90,10 +90,9 @@
 
 
 //===----------------------------------------------------------------------===//
-/// Instruction itinerary - An itinerary represents the scheduling
-/// information for an instruction. This includes a set of stages
-/// occupies by the instruction, and the pipeline cycle in which
-/// operands are read and written.
+/// An itinerary represents the scheduling information for an instruction.
+/// This includes a set of stages occupied by the instruction and the pipeline
+/// cycle in which operands are read and written.
 ///
 struct InstrItinerary {
   int      NumMicroOps;        ///< # of micro-ops, -1 means it's variable
@@ -105,12 +104,11 @@
 
 
 //===----------------------------------------------------------------------===//
-/// Instruction itinerary Data - Itinerary data supplied by a subtarget to be
-/// used by a target.
+/// Itinerary data supplied by a subtarget to be used by a target.
 ///
 class InstrItineraryData {
 public:
-  const MCSchedModel   *SchedModel;     ///< Basic machine properties.
+  MCSchedModel          SchedModel;     ///< Basic machine properties.
   const InstrStage     *Stages;         ///< Array of stages selected
   const unsigned       *OperandCycles;  ///< Array of operand cycles selected
   const unsigned       *Forwardings;    ///< Array of pipeline forwarding pathes
@@ -118,45 +116,38 @@
 
   /// Ctors.
   ///
-  InstrItineraryData() : SchedModel(&MCSchedModel::DefaultSchedModel),
+  InstrItineraryData() : SchedModel(MCSchedModel::GetDefaultSchedModel()),
                          Stages(nullptr), OperandCycles(nullptr),
                          Forwardings(nullptr), Itineraries(nullptr) {}
 
-  InstrItineraryData(const MCSchedModel *SM, const InstrStage *S,
+  InstrItineraryData(const MCSchedModel &SM, const InstrStage *S,
                      const unsigned *OS, const unsigned *F)
     : SchedModel(SM), Stages(S), OperandCycles(OS), Forwardings(F),
-      Itineraries(SchedModel->InstrItineraries) {}
+      Itineraries(SchedModel.InstrItineraries) {}
 
-  /// isEmpty - Returns true if there are no itineraries.
-  ///
+  /// Returns true if there are no itineraries.
   bool isEmpty() const { return Itineraries == nullptr; }
 
-  /// isEndMarker - Returns true if the index is for the end marker
-  /// itinerary.
-  ///
+  /// Returns true if the index is for the end marker itinerary.
   bool isEndMarker(unsigned ItinClassIndx) const {
     return ((Itineraries[ItinClassIndx].FirstStage == ~0U) &&
             (Itineraries[ItinClassIndx].LastStage == ~0U));
   }
 
-  /// beginStage - Return the first stage of the itinerary.
-  ///
+  /// Return the first stage of the itinerary.
   const InstrStage *beginStage(unsigned ItinClassIndx) const {
     unsigned StageIdx = Itineraries[ItinClassIndx].FirstStage;
     return Stages + StageIdx;
   }
 
-  /// endStage - Return the last+1 stage of the itinerary.
-  ///
+  /// Return the last+1 stage of the itinerary.
   const InstrStage *endStage(unsigned ItinClassIndx) const {
     unsigned StageIdx = Itineraries[ItinClassIndx].LastStage;
     return Stages + StageIdx;
   }
 
-  /// getStageLatency - Return the total stage latency of the given
-  /// class.  The latency is the maximum completion time for any stage
-  /// in the itinerary.
-  ///
+  /// Return the total stage latency of the given class.
+  /// The latency is the maximum completion time for any stage in the itinerary.
   /// If no stages exist, it defaults to one cycle.
   unsigned getStageLatency(unsigned ItinClassIndx) const {
     // If the target doesn't provide itinerary information, use a simple
@@ -174,9 +165,8 @@
     return Latency;
   }
 
-  /// getOperandCycle - Return the cycle for the given class and
-  /// operand. Return -1 if no cycle is specified for the operand.
-  ///
+  /// Return the cycle for the given class and operand.
+  /// Return -1 if no cycle is specified for the operand.
   int getOperandCycle(unsigned ItinClassIndx, unsigned OperandIdx) const {
     if (isEmpty())
       return -1;
@@ -189,7 +179,7 @@
     return (int)OperandCycles[FirstIdx + OperandIdx];
   }
 
-  /// hasPipelineForwarding - Return true if there is a pipeline forwarding
+  /// Return true if there is a pipeline forwarding
   /// between instructions of itinerary classes DefClass and UseClasses so that
   /// value produced by an instruction of itinerary class DefClass, operand
   /// index DefIdx can be bypassed when it's read by an instruction of
@@ -212,7 +202,7 @@
       Forwardings[FirstUseIdx + UseIdx];
   }
 
-  /// getOperandLatency - Compute and return the use operand latency of a given
+  /// Compute and return the use operand latency of a given
   /// itinerary class and operand index if the value is produced by an
   /// instruction of the specified itinerary class and def operand index.
   int getOperandLatency(unsigned DefClass, unsigned DefIdx,
@@ -236,9 +226,8 @@
     return UseCycle;
   }
 
-  /// getNumMicroOps - Return the number of micro-ops that the given class
-  /// decodes to. Return -1 for classes that require dynamic lookup via
-  /// TargetInstrInfo.
+  /// Return the number of micro-ops that the given class decodes to.
+  /// Return -1 for classes that require dynamic lookup via TargetInstrInfo.
   int getNumMicroOps(unsigned ItinClassIndx) const {
     if (isEmpty())
       return 1;

diff --git a/include/llvm/MC/MCLinkerOptimizationHint.h b/include/llvm/MC/MCLinkerOptimizationHint.h
index 50fd527..1f91b0d 100644
--- a/include/llvm/MC/MCLinkerOptimizationHint.h
+++ b/include/llvm/MC/MCLinkerOptimizationHint.h

@@ -45,7 +45,7 @@
   return StringRef(".loh");
 }
 
-static inline bool isValidMCLOHType(MCLOHType Kind) {
+static inline bool isValidMCLOHType(unsigned Kind) {
   return Kind >= MCLOH_AdrpAdrp && Kind <= MCLOH_AdrpLdrGot;
 }
 

diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h
index 12a7f0e..0c5aa8a 100644
--- a/include/llvm/MC/MCMachObjectWriter.h
+++ b/include/llvm/MC/MCMachObjectWriter.h

@@ -14,6 +14,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/MachO.h"
 #include <vector>
@@ -104,7 +105,7 @@
   /// @name Symbol Table Data
   /// @{
 
-  SmallString<256> StringTable;
+  StringTableBuilder StringTable;
   std::vector<MachSymbolData> LocalSymbolData;
   std::vector<MachSymbolData> ExternalSymbolData;
   std::vector<MachSymbolData> UndefinedSymbolData;
@@ -239,8 +240,7 @@
 
   /// ComputeSymbolTable - Compute the symbol table data
   ///
-  /// \param StringTable [out] - The string table data.
-  void ComputeSymbolTable(MCAssembler &Asm, SmallString<256> &StringTable,
+  void ComputeSymbolTable(MCAssembler &Asm,
                           std::vector<MachSymbolData> &LocalSymbolData,
                           std::vector<MachSymbolData> &ExternalSymbolData,
                           std::vector<MachSymbolData> &UndefinedSymbolData);

diff --git a/include/llvm/MC/MCObjectDisassembler.h b/include/llvm/MC/MCObjectDisassembler.h
deleted file mode 100644
index 5b935db..0000000
--- a/include/llvm/MC/MCObjectDisassembler.h
+++ /dev/null

@@ -1,174 +0,0 @@
-//===-- llvm/MC/MCObjectDisassembler.h --------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the MCObjectDisassembler class, which
-// can be used to construct an MCModule and an MC CFG from an ObjectFile.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_MC_MCOBJECTDISASSEMBLER_H
-#define LLVM_MC_MCOBJECTDISASSEMBLER_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/MemoryObject.h"
-#include <vector>
-
-namespace llvm {
-
-namespace object {
-  class ObjectFile;
-  class MachOObjectFile;
-}
-
-class MCBasicBlock;
-class MCDisassembler;
-class MCFunction;
-class MCInstrAnalysis;
-class MCModule;
-class MCObjectSymbolizer;
-
-/// \brief Disassemble an ObjectFile to an MCModule and MCFunctions.
-/// This class builds on MCDisassembler to disassemble whole sections, creating
-/// MCAtom (MCTextAtom for disassembled sections and MCDataAtom for raw data).
-/// It can also be used to create a control flow graph consisting of MCFunctions
-/// and MCBasicBlocks.
-class MCObjectDisassembler {
-public:
-  MCObjectDisassembler(const object::ObjectFile &Obj,
-                       const MCDisassembler &Dis,
-                       const MCInstrAnalysis &MIA);
-  virtual ~MCObjectDisassembler() {}
-
-  /// \brief Build an MCModule, creating atoms and optionally functions.
-  /// \param withCFG Also build a CFG by adding MCFunctions to the Module.
-  /// If withCFG is false, the MCModule built only contains atoms, representing
-  /// what was found in the object file. If withCFG is true, MCFunctions are
-  /// created, containing MCBasicBlocks. All text atoms are split to form basic
-  /// block atoms, which then each back an MCBasicBlock.
-  MCModule *buildModule(bool withCFG = false);
-
-  MCModule *buildEmptyModule();
-
-  typedef std::vector<uint64_t> AddressSetTy;
-  /// \name Create a new MCFunction.
-  MCFunction *createFunction(MCModule *Module, uint64_t BeginAddr,
-                             AddressSetTy &CallTargets,
-                             AddressSetTy &TailCallTargets);
-
-  /// \brief Set the region on which to fallback if disassembly was requested
-  /// somewhere not accessible in the object file.
-  /// This is used for dynamic disassembly (see RawMemoryObject).
-  void setFallbackRegion(std::unique_ptr<MemoryObject> &Region) {
-    FallbackRegion.reset(Region.release());
-  }
-
-  /// \brief Set the symbolizer to use to get information on external functions.
-  /// Note that this isn't used to do instruction-level symbolization (that is,
-  /// plugged into MCDisassembler), but to symbolize function call targets.
-  void setSymbolizer(MCObjectSymbolizer *ObjectSymbolizer) {
-    MOS = ObjectSymbolizer;
-  }
-
-  /// \brief Get the effective address of the entrypoint, or 0 if there is none.
-  virtual uint64_t getEntrypoint();
-
-  /// \name Get the addresses of static constructors/destructors in the object.
-  /// The caller is expected to know how to interpret the addresses;
-  /// for example, Mach-O init functions expect 5 arguments, not for ELF.
-  /// The addresses are original object file load addresses, not effective.
-  /// @{
-  virtual ArrayRef<uint64_t> getStaticInitFunctions();
-  virtual ArrayRef<uint64_t> getStaticExitFunctions();
-  /// @}
-
-  /// \name Translation between effective and objectfile load address.
-  /// @{
-  /// \brief Compute the effective load address, from an objectfile virtual
-  /// address. This is implemented in a format-specific way, to take into
-  /// account things like PIE/ASLR when doing dynamic disassembly.
-  /// For example, on Mach-O this would be done by adding the VM addr slide,
-  /// on glibc ELF by keeping a map between segment load addresses, filled
-  /// using dl_iterate_phdr, etc..
-  /// In most static situations and in the default impl., this returns \p Addr.
-  virtual uint64_t getEffectiveLoadAddr(uint64_t Addr);
-
-  /// \brief Compute the original load address, as specified in the objectfile.
-  /// This is the inverse of getEffectiveLoadAddr.
-  virtual uint64_t getOriginalLoadAddr(uint64_t EffectiveAddr);
-  /// @}
-
-protected:
-  const object::ObjectFile &Obj;
-  const MCDisassembler &Dis;
-  const MCInstrAnalysis &MIA;
-  MCObjectSymbolizer *MOS;
-
-  /// \brief The fallback memory region, outside the object file.
-  std::unique_ptr<MemoryObject> FallbackRegion;
-
-  /// \brief Return a memory region suitable for reading starting at \p Addr.
-  /// In most cases, this returns a StringRefMemoryObject backed by the
-  /// containing section. When no section was found, this returns the
-  /// FallbackRegion, if it is suitable.
-  /// If it is not, or if there is no fallback region, this returns 0.
-  MemoryObject *getRegionFor(uint64_t Addr);
-
-private:
-  /// \brief Fill \p Module by creating an atom for each section.
-  /// This could be made much smarter, using information like symbols, but also
-  /// format-specific features, like mach-o function_start or data_in_code LCs.
-  void buildSectionAtoms(MCModule *Module);
-
-  /// \brief Enrich \p Module with a CFG consisting of MCFunctions.
-  /// \param Module An MCModule returned by buildModule, with no CFG.
-  /// NOTE: Each MCBasicBlock in a MCFunction is backed by a single MCTextAtom.
-  /// When the CFG is built, contiguous instructions that were previously in a
-  /// single MCTextAtom will be split in multiple basic block atoms.
-  void buildCFG(MCModule *Module);
-
-  MCBasicBlock *getBBAt(MCModule *Module, MCFunction *MCFN, uint64_t BeginAddr,
-                        AddressSetTy &CallTargets,
-                        AddressSetTy &TailCallTargets);
-};
-
-class MCMachOObjectDisassembler : public MCObjectDisassembler {
-  const object::MachOObjectFile &MOOF;
-
-  uint64_t VMAddrSlide;
-  uint64_t HeaderLoadAddress;
-
-  // __DATA;__mod_init_func support.
-  llvm::StringRef ModInitContents;
-  // __DATA;__mod_exit_func support.
-  llvm::StringRef ModExitContents;
-
-public:
-  /// \brief Construct a Mach-O specific object disassembler.
-  /// \param VMAddrSlide The virtual address slide applied by dyld.
-  /// \param HeaderLoadAddress The load address of the mach_header for this
-  /// object.
-  MCMachOObjectDisassembler(const object::MachOObjectFile &MOOF,
-                            const MCDisassembler &Dis,
-                            const MCInstrAnalysis &MIA, uint64_t VMAddrSlide,
-                            uint64_t HeaderLoadAddress);
-
-protected:
-  uint64_t getEffectiveLoadAddr(uint64_t Addr) override;
-  uint64_t getOriginalLoadAddr(uint64_t EffectiveAddr) override;
-  uint64_t getEntrypoint() override;
-
-  ArrayRef<uint64_t> getStaticInitFunctions() override;
-  ArrayRef<uint64_t> getStaticExitFunctions() override;
-};
-
-}
-
-#endif

diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index 4d1715e..321043c 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_MC_MCBJECTFILEINFO_H
-#define LLVM_MC_MCBJECTFILEINFO_H
+#ifndef LLVM_MC_MCOBJECTFILEINFO_H
+#define LLVM_MC_MCOBJECTFILEINFO_H
 
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/CodeGen.h"
@@ -116,6 +116,7 @@
 
   /// These are used for the Fission separate debug information files.
   const MCSection *DwarfInfoDWOSection;
+  const MCSection *DwarfTypesDWOSection;
   const MCSection *DwarfAbbrevDWOSection;
   const MCSection *DwarfStrDWOSection;
   const MCSection *DwarfLineDWOSection;
@@ -261,7 +262,9 @@
     return DwarfInfoDWOSection;
   }
   const MCSection *getDwarfTypesSection(uint64_t Hash) const;
-  const MCSection *getDwarfTypesDWOSection(uint64_t Hash) const;
+  const MCSection *getDwarfTypesDWOSection() const {
+    return DwarfTypesDWOSection;
+  }
   const MCSection *getDwarfAbbrevDWOSection() const {
     return DwarfAbbrevDWOSection;
   }

diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index 8d37c85..0866ff5 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h

@@ -10,6 +10,7 @@
 #ifndef LLVM_MC_MCOBJECTSTREAMER_H
 #define LLVM_MC_MCOBJECTSTREAMER_H
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCStreamer.h"
 
@@ -37,11 +38,16 @@
   MCSectionData::iterator CurInsertionPoint;
   bool EmitEHFrame;
   bool EmitDebugFrame;
+  SmallVector<MCSymbolData *, 2> PendingLabels;
 
   virtual void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo&) = 0;
   void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
   void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
 
+  // If any labels have been emitted but not assigned fragments, ensure that
+  // they get assigned, either to F if possible or to a new data fragment.
+  void flushPendingLabels(MCFragment *F);
+
 protected:
   MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &_OS,
                    MCCodeEmitter *_Emitter);
@@ -69,14 +75,15 @@
 
   MCFragment *getCurrentFragment() const;
 
-  void insert(MCFragment *F) const {
+  void insert(MCFragment *F) {
+    flushPendingLabels(F);
     CurSectionData->getFragmentList().insert(CurInsertionPoint, F);
     F->setParent(CurSectionData);
   }
 
   /// Get a data fragment to write into, creating a new one if the current
   /// fragment is not a data fragment.
-  MCDataFragment *getOrCreateDataFragment() const;
+  MCDataFragment *getOrCreateDataFragment();
 
 public:
   void visitUsedSymbol(const MCSymbol &Sym) override;
@@ -126,7 +133,7 @@
   void EmitZeros(uint64_t NumBytes) override;
   void FinishImpl() override;
 
-  virtual bool mayHaveInstructions() const {
+  bool mayHaveInstructions() const override {
     return getCurrentSectionData()->hasInstructions();
   }
 };

diff --git a/include/llvm/MC/MCObjectSymbolizer.h b/include/llvm/MC/MCObjectSymbolizer.h
deleted file mode 100644
index f75b7f5..0000000
--- a/include/llvm/MC/MCObjectSymbolizer.h
+++ /dev/null

@@ -1,83 +0,0 @@
-//===-- llvm/MC/MCObjectSymbolizer.h --------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the MCObjectSymbolizer class, an MCSymbolizer that is
-// backed by an object::ObjectFile.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_MC_MCOBJECTSYMBOLIZER_H
-#define LLVM_MC_MCOBJECTSYMBOLIZER_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/MC/MCSymbolizer.h"
-#include "llvm/Object/ObjectFile.h"
-#include <vector>
-
-namespace llvm {
-
-class MCExpr;
-class MCInst;
-class MCRelocationInfo;
-class raw_ostream;
-
-/// \brief An ObjectFile-backed symbolizer.
-class MCObjectSymbolizer : public MCSymbolizer {
-protected:
-  const object::ObjectFile *Obj;
-
-  // Map a load address to the first relocation that applies there. As far as I
-  // know, if there are several relocations at the exact same address, they are
-  // related and the others can be determined from the first that was found in
-  // the relocation table. For instance, on x86-64 mach-o, a SUBTRACTOR
-  // relocation (referencing the minuend symbol) is followed by an UNSIGNED
-  // relocation (referencing the subtrahend symbol).
-  const object::RelocationRef *findRelocationAt(uint64_t Addr);
-  const object::SectionRef *findSectionContaining(uint64_t Addr);
-
-  MCObjectSymbolizer(MCContext &Ctx, std::unique_ptr<MCRelocationInfo> RelInfo,
-                     const object::ObjectFile *Obj);
-
-public:
-  /// \name Overridden MCSymbolizer methods:
-  /// @{
-  bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &cStream,
-                                int64_t Value, uint64_t Address,
-                                bool IsBranch, uint64_t Offset,
-                                uint64_t InstSize) override;
-
-  void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
-                                       int64_t Value,
-                                       uint64_t Address) override;
-  /// @}
-
-  /// \brief Look for an external function symbol at \p Addr.
-  /// (References through the ELF PLT, Mach-O stubs, and similar).
-  /// \returns An MCExpr representing the external symbol, or 0 if not found.
-  virtual StringRef findExternalFunctionAt(uint64_t Addr);
-
-  /// \brief Create an object symbolizer for \p Obj.
-  static MCObjectSymbolizer *
-  createObjectSymbolizer(MCContext &Ctx,
-                         std::unique_ptr<MCRelocationInfo> RelInfo,
-                         const object::ObjectFile *Obj);
-
-private:
-  typedef DenseMap<uint64_t, object::RelocationRef> AddrToRelocMap;
-  typedef std::vector<object::SectionRef> SortedSectionList;
-  SortedSectionList SortedSections;
-  AddrToRelocMap AddrToReloc;
-
-  void buildSectionList();
-  void buildRelocationByAddrMap();
-};
-
-}
-
-#endif

diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
index 0b550ba..a9a30f1 100644
--- a/include/llvm/MC/MCParser/AsmLexer.h
+++ b/include/llvm/MC/MCParser/AsmLexer.h

@@ -49,7 +49,7 @@
 
   const AsmToken peekTok(bool ShouldSkipSpace = true) override;
 
-  bool isAtStartOfComment(char Char);
+  bool isAtStartOfComment(const char *Ptr);
   bool isAtStatementSeparator(const char *Ptr);
 
   const MCAsmInfo &getMAI() const { return MAI; }

diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index e3d4181..b05891c 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h

@@ -18,7 +18,7 @@
 
 namespace llvm {
 
-/// AsmToken - Target independent representation for an assembler token.
+/// Target independent representation for an assembler token.
 class AsmToken {
 public:
   enum TokenKind {
@@ -74,25 +74,26 @@
 
   SMLoc getLoc() const;
   SMLoc getEndLoc() const;
+  SMRange getLocRange() const;
 
-  /// getStringContents - Get the contents of a string token (without quotes).
+  /// Get the contents of a string token (without quotes).
   StringRef getStringContents() const {
     assert(Kind == String && "This token isn't a string!");
     return Str.slice(1, Str.size() - 1);
   }
 
-  /// getIdentifier - Get the identifier string for the current token, which
-  /// should be an identifier or a string. This gets the portion of the string
-  /// which should be used as the identifier, e.g., it does not include the
-  /// quotes on strings.
+  /// Get the identifier string for the current token, which should be an
+  /// identifier or a string. This gets the portion of the string which should
+  /// be used as the identifier, e.g., it does not include the quotes on
+  /// strings.
   StringRef getIdentifier() const {
     if (Kind == Identifier)
       return getString();
     return getStringContents();
   }
 
-  /// getString - Get the string for the current token, this includes all
-  /// characters (for example, the quotes on strings) in the token.
+  /// Get the string for the current token, this includes all characters (for
+  /// example, the quotes on strings) in the token.
   ///
   /// The returned StringRef points into the source manager's memory buffer, and
   /// is safe to store across calls to Lex().
@@ -113,8 +114,8 @@
   }
 };
 
-/// MCAsmLexer - Generic assembler lexer interface, for use by target specific
-/// assembly lexers.
+/// Generic assembler lexer interface, for use by target specific assembly
+/// lexers.
 class MCAsmLexer {
   /// The current token, stored in the base class for faster access.
   AsmToken CurTok;
@@ -142,7 +143,7 @@
 public:
   virtual ~MCAsmLexer();
 
-  /// Lex - Consume the next token from the input stream and return it.
+  /// Consume the next token from the input stream and return it.
   ///
   /// The lexer will continuosly return the end-of-file token once the end of
   /// the main input file has been reached.
@@ -152,37 +153,37 @@
 
   virtual StringRef LexUntilEndOfStatement() = 0;
 
-  /// getLoc - Get the current source location.
+  /// Get the current source location.
   SMLoc getLoc() const;
 
-  /// getTok - Get the current (last) lexed token.
-  const AsmToken &getTok() {
+  /// Get the current (last) lexed token.
+  const AsmToken &getTok() const {
     return CurTok;
   }
 
-  /// peekTok - Look ahead at the next token to be lexed.
+  /// Look ahead at the next token to be lexed.
   virtual const AsmToken peekTok(bool ShouldSkipSpace = true) = 0;
 
-  /// getErrLoc - Get the current error location
+  /// Get the current error location
   const SMLoc &getErrLoc() {
     return ErrLoc;
   }
 
-  /// getErr - Get the current error string
+  /// Get the current error string
   const std::string &getErr() {
     return Err;
   }
 
-  /// getKind - Get the kind of current token.
+  /// Get the kind of current token.
   AsmToken::TokenKind getKind() const { return CurTok.getKind(); }
 
-  /// is - Check if the current token has kind \p K.
+  /// Check if the current token has kind \p K.
   bool is(AsmToken::TokenKind K) const { return CurTok.is(K); }
 
-  /// isNot - Check if the current token has kind \p K.
+  /// Check if the current token does not have kind \p K.
   bool isNot(AsmToken::TokenKind K) const { return CurTok.isNot(K); }
 
-  /// setSkipSpace - Set whether spaces should be ignored by the lexer
+  /// Set whether spaces should be ignored by the lexer
   void setSkipSpace(bool val) { SkipSpace = val; }
 
   bool getAllowAtInIdentifier() { return AllowAtInIdentifier; }

diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 9836795..34188e6 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h

@@ -45,20 +45,22 @@
   }
 };
 
-/// MCAsmParserSemaCallback - Generic Sema callback for assembly parser.
+/// Generic Sema callback for assembly parser.
 class MCAsmParserSemaCallback {
 public:
   virtual ~MCAsmParserSemaCallback();
   virtual void *LookupInlineAsmIdentifier(StringRef &LineBuf,
                                           InlineAsmIdentifierInfo &Info,
                                           bool IsUnevaluatedContext) = 0;
+  virtual StringRef LookupInlineAsmLabel(StringRef Identifier, SourceMgr &SM,
+                                         SMLoc Location, bool Create) = 0;
 
   virtual bool LookupInlineAsmField(StringRef Base, StringRef Member,
                                     unsigned &Offset) = 0;
 };
 
-/// MCAsmParser - Generic assembler parser interface, for use by target specific
-/// assembly parsers.
+/// Generic assembler parser interface, for use by target specific assembly
+/// parsers.
 class MCAsmParser {
 public:
   typedef bool (*DirectiveHandler)(MCAsmParserExtension*, StringRef, SMLoc);
@@ -85,10 +87,13 @@
   virtual SourceMgr &getSourceManager() = 0;
 
   virtual MCAsmLexer &getLexer() = 0;
+  const MCAsmLexer &getLexer() const {
+    return const_cast<MCAsmParser*>(this)->getLexer();
+  }
 
   virtual MCContext &getContext() = 0;
 
-  /// getStreamer - Return the output streamer for the assembler.
+  /// Return the output streamer for the assembler.
   virtual MCStreamer &getStreamer() = 0;
 
   MCTargetAsmParser &getTargetParser() const { return *TargetParser; }
@@ -100,51 +105,49 @@
   bool getShowParsedOperands() const { return ShowParsedOperands; }
   void setShowParsedOperands(bool Value) { ShowParsedOperands = Value; }
 
-  /// Run - Run the parser on the input source buffer.
+  /// Run the parser on the input source buffer.
   virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false) = 0;
 
   virtual void setParsingInlineAsm(bool V) = 0;
   virtual bool isParsingInlineAsm() = 0;
 
-  /// parseMSInlineAsm - Parse ms-style inline assembly.
-  virtual bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
-                                unsigned &NumOutputs, unsigned &NumInputs,
-                                SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
-                                SmallVectorImpl<std::string> &Constraints,
-                                SmallVectorImpl<std::string> &Clobbers,
-                                const MCInstrInfo *MII,
-                                const MCInstPrinter *IP,
-                                MCAsmParserSemaCallback &SI) = 0;
+  /// Parse ms-style inline assembly.
+  virtual bool parseMSInlineAsm(
+      void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
+      unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool>> &OpDecls,
+      SmallVectorImpl<std::string> &Constraints,
+      SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
+      const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) = 0;
 
-  /// Note - Emit a note at the location \p L, with the message \p Msg.
+  /// Emit a note at the location \p L, with the message \p Msg.
   virtual void Note(SMLoc L, const Twine &Msg,
                     ArrayRef<SMRange> Ranges = None) = 0;
 
-  /// Warning - Emit a warning at the location \p L, with the message \p Msg.
+  /// Emit a warning at the location \p L, with the message \p Msg.
   ///
   /// \return The return value is true, if warnings are fatal.
   virtual bool Warning(SMLoc L, const Twine &Msg,
                        ArrayRef<SMRange> Ranges = None) = 0;
 
-  /// Error - Emit an error at the location \p L, with the message \p Msg.
+  /// Emit an error at the location \p L, with the message \p Msg.
   ///
   /// \return The return value is always true, as an idiomatic convenience to
   /// clients.
   virtual bool Error(SMLoc L, const Twine &Msg,
                      ArrayRef<SMRange> Ranges = None) = 0;
 
-  /// Lex - Get the next AsmToken in the stream, possibly handling file
-  /// inclusion first.
+  /// Get the next AsmToken in the stream, possibly handling file inclusion
+  /// first.
   virtual const AsmToken &Lex() = 0;
 
-  /// getTok - Get the current AsmToken from the stream.
-  const AsmToken &getTok();
+  /// Get the current AsmToken from the stream.
+  const AsmToken &getTok() const;
 
   /// \brief Report an error at the current lexer location.
   bool TokError(const Twine &Msg, ArrayRef<SMRange> Ranges = None);
 
-  /// parseIdentifier - Parse an identifier or string (as a quoted identifier)
-  /// and set \p Res to the identifier contents.
+  /// Parse an identifier or string (as a quoted identifier) and set \p Res to
+  /// the identifier contents.
   virtual bool parseIdentifier(StringRef &Res) = 0;
 
   /// \brief Parse up to the end of statement and return the contents from the
@@ -152,15 +155,14 @@
   /// will be either the EndOfStatement or EOF.
   virtual StringRef parseStringToEndOfStatement() = 0;
 
-  /// parseEscapedString - Parse the current token as a string which may include
-  /// escaped characters and return the string contents.
+  /// Parse the current token as a string which may include escaped characters
+  /// and return the string contents.
   virtual bool parseEscapedString(std::string &Data) = 0;
 
-  /// eatToEndOfStatement - Skip to the end of the current statement, for error
-  /// recovery.
+  /// Skip to the end of the current statement, for error recovery.
   virtual void eatToEndOfStatement() = 0;
 
-  /// parseExpression - Parse an arbitrary expression.
+  /// Parse an arbitrary expression.
   ///
   /// @param Res - The value of the expression. The result is undefined
   /// on error.
@@ -168,31 +170,30 @@
   virtual bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc) = 0;
   bool parseExpression(const MCExpr *&Res);
 
-  /// parsePrimaryExpr - Parse a primary expression.
+  /// Parse a primary expression.
   ///
   /// @param Res - The value of the expression. The result is undefined
   /// on error.
   /// @result - False on success.
   virtual bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) = 0;
 
-  /// parseParenExpression - Parse an arbitrary expression, assuming that an
-  /// initial '(' has already been consumed.
+  /// Parse an arbitrary expression, assuming that an initial '(' has already
+  /// been consumed.
   ///
   /// @param Res - The value of the expression. The result is undefined
   /// on error.
   /// @result - False on success.
   virtual bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) = 0;
 
-  /// parseAbsoluteExpression - Parse an expression which must evaluate to an
-  /// absolute value.
+  /// Parse an expression which must evaluate to an absolute value.
   ///
   /// @param Res - The value of the absolute expression. The result is undefined
   /// on error.
   /// @result - False on success.
   virtual bool parseAbsoluteExpression(int64_t &Res) = 0;
 
-  /// checkForValidSection - Ensure that we have a valid section set in the
-  /// streamer. Otherwise, report an error and switch to .text.
+  /// Ensure that we have a valid section set in the streamer. Otherwise, report
+  /// an error and switch to .text.
   virtual void checkForValidSection() = 0;
 };
 

diff --git a/include/llvm/MC/MCParser/MCAsmParserExtension.h b/include/llvm/MC/MCParser/MCAsmParserExtension.h
index 2eda3a9..bfc0afa 100644
--- a/include/llvm/MC/MCParser/MCAsmParserExtension.h
+++ b/include/llvm/MC/MCParser/MCAsmParserExtension.h

@@ -52,8 +52,17 @@
   /// @{
 
   MCContext &getContext() { return getParser().getContext(); }
+
   MCAsmLexer &getLexer() { return getParser().getLexer(); }
+  const MCAsmLexer &getLexer() const {
+    return const_cast<MCAsmParserExtension *>(this)->getLexer();
+  }
+
   MCAsmParser &getParser() { return *Parser; }
+  const MCAsmParser &getParser() const {
+    return const_cast<MCAsmParserExtension*>(this)->getParser();
+  }
+
   SourceMgr &getSourceManager() { return getParser().getSourceManager(); }
   MCStreamer &getStreamer() { return getParser().getStreamer(); }
   bool Warning(SMLoc L, const Twine &Msg) {

diff --git a/include/llvm/MC/MCRegisterInfo.h b/include/llvm/MC/MCRegisterInfo.h
index 766f631..df556e7 100644
--- a/include/llvm/MC/MCRegisterInfo.h
+++ b/include/llvm/MC/MCRegisterInfo.h

@@ -32,9 +32,9 @@
   typedef const MCPhysReg* iterator;
   typedef const MCPhysReg* const_iterator;
 
-  const char *Name;
   const iterator RegsBegin;
   const uint8_t *const RegSet;
+  const uint32_t NameIdx;
   const uint16_t RegsSize;
   const uint16_t RegSetSize;
   const uint16_t ID;
@@ -46,10 +46,6 @@
   ///
   unsigned getID() const { return ID; }
 
-  /// getName() - Return the register class name for debugging.
-  ///
-  const char *getName() const { return Name; }
-
   /// begin/end - Return all of the registers in this class.
   ///
   iterator       begin() const { return RegsBegin; }
@@ -162,6 +158,7 @@
   const MCPhysReg (*RegUnitRoots)[2];         // Pointer to regunit root table.
   const MCPhysReg *DiffLists;                 // Pointer to the difflists array
   const char *RegStrings;                     // Pointer to the string table.
+  const char *RegClassStrings;                // Pointer to the class strings.
   const uint16_t *SubRegIndices;              // Pointer to the subreg lookup
                                               // array.
   const SubRegCoveredBits *SubRegIdxRanges;   // Pointer to the subreg covered
@@ -243,6 +240,7 @@
                           unsigned NRU,
                           const MCPhysReg *DL,
                           const char *Strings,
+                          const char *ClassStrings,
                           const uint16_t *SubIndices,
                           unsigned NumIndices,
                           const SubRegCoveredBits *SubIdxRanges,
@@ -254,6 +252,7 @@
     Classes = C;
     DiffLists = DL;
     RegStrings = Strings;
+    RegClassStrings = ClassStrings;
     NumClasses = NC;
     RegUnitRoots = RURoots;
     NumRegUnits = NRU;
@@ -401,6 +400,10 @@
     return Classes[i];
   }
 
+  const char *getRegClassName(const MCRegisterClass *Class) const {
+    return RegClassStrings + Class->NameIdx;
+  }
+
    /// \brief Returns the encoding for RegNo
   uint16_t getEncodingValue(unsigned RegNo) const {
     assert(RegNo < NumRegs &&

diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 862a0fd..1adfedd 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h

@@ -133,10 +133,7 @@
 /// provides a detailed reservation table describing each cycle of instruction
 /// execution. Subtargets may define any or all of the above categories of data
 /// depending on the type of CPU and selected scheduler.
-class MCSchedModel {
-public:
-  static MCSchedModel DefaultSchedModel; // For unknown processors.
-
+struct MCSchedModel {
   // IssueWidth is the maximum number of instructions that may be scheduled in
   // the same per-cycle group.
   unsigned IssueWidth;
@@ -186,10 +183,11 @@
   // takes to recover from a branch misprediction.
   unsigned MispredictPenalty;
   static const unsigned DefaultMispredictPenalty = 10;
+  
+  bool PostRAScheduler; // default value is false
 
   bool CompleteModel;
 
-private:
   unsigned ProcID;
   const MCProcResourceDesc *ProcResourceTable;
   const MCSchedClassDesc *SchedClassTable;
@@ -199,35 +197,6 @@
   friend class InstrItineraryData;
   const InstrItinerary *InstrItineraries;
 
-public:
-  // Default's must be specified as static const literals so that tablegenerated
-  // target code can use it in static initializers. The defaults need to be
-  // initialized in this default ctor because some clients directly instantiate
-  // MCSchedModel instead of using a generated itinerary.
-  MCSchedModel(): IssueWidth(DefaultIssueWidth),
-                  MicroOpBufferSize(DefaultMicroOpBufferSize),
-                  LoopMicroOpBufferSize(DefaultLoopMicroOpBufferSize),
-                  LoadLatency(DefaultLoadLatency),
-                  HighLatency(DefaultHighLatency),
-                  MispredictPenalty(DefaultMispredictPenalty),
-                  CompleteModel(true), ProcID(0), ProcResourceTable(nullptr),
-                  SchedClassTable(nullptr), NumProcResourceKinds(0),
-                  NumSchedClasses(0), InstrItineraries(nullptr) {
-    (void)NumProcResourceKinds;
-    (void)NumSchedClasses;
-  }
-
-  // Table-gen driven ctor.
-  MCSchedModel(unsigned iw, int mbs, int lmbs, unsigned ll, unsigned hl,
-               unsigned mp, bool cm, unsigned pi, const MCProcResourceDesc *pr,
-               const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
-               const InstrItinerary *ii):
-    IssueWidth(iw), MicroOpBufferSize(mbs), LoopMicroOpBufferSize(lmbs),
-    LoadLatency(ll), HighLatency(hl),
-    MispredictPenalty(mp), CompleteModel(cm), ProcID(pi),
-    ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr),
-    NumSchedClasses(nsc), InstrItineraries(ii) {}
-
   unsigned getProcessorID() const { return ProcID; }
 
   /// Does this machine model include instruction-level scheduling.
@@ -254,6 +223,26 @@
     assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx");
     return &SchedClassTable[SchedClassIdx];
   }
+
+  /// \brief Returns a default initialized model. Used for unknown processors.
+  static MCSchedModel GetDefaultSchedModel() {
+    MCSchedModel Ret = { DefaultIssueWidth,
+                         DefaultMicroOpBufferSize,
+                         DefaultLoopMicroOpBufferSize,
+                         DefaultLoadLatency,
+                         DefaultHighLatency,
+                         DefaultMispredictPenalty,
+                         false,
+                         true,
+                         0,
+                         nullptr,
+                         nullptr,
+                         0,
+                         0,
+                         nullptr
+                       };
+    return Ret;
+  }
 };
 
 } // End llvm namespace

diff --git a/include/llvm/MC/MCSectionCOFF.h b/include/llvm/MC/MCSectionCOFF.h
index d205e2a..0bbf369 100644
--- a/include/llvm/MC/MCSectionCOFF.h
+++ b/include/llvm/MC/MCSectionCOFF.h

@@ -36,7 +36,7 @@
     /// The COMDAT symbol of this section. Only valid if this is a COMDAT
     /// section. Two COMDAT sections are merged if they have the same
     /// COMDAT symbol.
-    const MCSymbol *COMDATSymbol;
+    MCSymbol *COMDATSymbol;
 
     /// Selection - This is the Selection field for the section symbol, if
     /// it is a COMDAT section (Characteristics & IMAGE_SCN_LNK_COMDAT) != 0
@@ -45,7 +45,7 @@
   private:
     friend class MCContext;
     MCSectionCOFF(StringRef Section, unsigned Characteristics,
-                  const MCSymbol *COMDATSymbol, int Selection, SectionKind K)
+                  MCSymbol *COMDATSymbol, int Selection, SectionKind K)
         : MCSection(SV_COFF, K), SectionName(Section),
           Characteristics(Characteristics), COMDATSymbol(COMDATSymbol),
           Selection(Selection) {
@@ -67,7 +67,7 @@
       return SectionName.str() + "_end";
     }
     unsigned getCharacteristics() const { return Characteristics; }
-    const MCSymbol *getCOMDATSymbol() const { return COMDATSymbol; }
+    MCSymbol *getCOMDATSymbol() const { return COMDATSymbol; }
     int getSelection() const { return Selection; }
 
     void setSelection(int Selection) const;

diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 216de75..df896a6 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h

@@ -20,7 +20,7 @@
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCLinkerOptimizationHint.h"
-#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCWinEH.h"
 #include "llvm/Support/DataTypes.h"
 #include <string>
 
@@ -91,18 +91,20 @@
   AArch64TargetStreamer(MCStreamer &S);
   ~AArch64TargetStreamer();
 
-
   void finish() override;
 
   /// Callback used to implement the ldr= pseudo.
   /// Add a new entry to the constant pool for the current section and return an
   /// MCExpr that can be used to refer to the constant pool location.
-  const MCExpr *addConstantPoolEntry(const MCExpr *);
+  const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size);
 
   /// Callback used to implemnt the .ltorg directive.
   /// Emit contents of constant pool for the current section.
   void emitCurrentConstantPool();
 
+  /// Callback used to implement the .inst directive.
+  virtual void emitInst(uint32_t Inst);
+
 private:
   std::unique_ptr<AssemblerConstantPools> ConstantPools;
 };
@@ -175,15 +177,15 @@
   MCStreamer(const MCStreamer &) LLVM_DELETED_FUNCTION;
   MCStreamer &operator=(const MCStreamer &) LLVM_DELETED_FUNCTION;
 
-  std::vector<MCDwarfFrameInfo> FrameInfos;
-  MCDwarfFrameInfo *getCurrentFrameInfo();
-  MCSymbol *EmitCFICommon();
-  void EnsureValidFrame();
+  std::vector<MCDwarfFrameInfo> DwarfFrameInfos;
+  MCDwarfFrameInfo *getCurrentDwarfFrameInfo();
+  void EnsureValidDwarfFrame();
 
-  std::vector<MCWin64EHUnwindInfo *> W64UnwindInfos;
-  MCWin64EHUnwindInfo *CurrentW64UnwindInfo;
-  void setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame);
-  void EnsureValidW64UnwindInfo();
+  MCSymbol *EmitCFICommon();
+
+  std::vector<WinEH::FrameInfo *> WinFrameInfos;
+  WinEH::FrameInfo *CurrentWinFrameInfo;
+  void EnsureValidWinFrameInfo();
 
   // SymbolOrdering - Tracks an index to represent the order
   // a symbol was emitted in. Zero means we did not emit that symbol.
@@ -196,18 +198,14 @@
 protected:
   MCStreamer(MCContext &Ctx);
 
-  const MCExpr *BuildSymbolDiff(MCContext &Context, const MCSymbol *A,
-                                const MCSymbol *B);
-
-  const MCExpr *ForceExpAbs(const MCExpr *Expr);
-
   virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame);
   virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &CurFrame);
 
-  MCWin64EHUnwindInfo *getCurrentW64UnwindInfo() {
-    return CurrentW64UnwindInfo;
+  WinEH::FrameInfo *getCurrentWinFrameInfo() {
+    return CurrentWinFrameInfo;
   }
-  void EmitW64Tables();
+
+  virtual void EmitWindowsUnwindTables();
 
   virtual void EmitRawTextImpl(StringRef String);
 
@@ -231,20 +229,14 @@
     return TargetStreamer.get();
   }
 
-  unsigned getNumFrameInfos() { return FrameInfos.size(); }
-
-  const MCDwarfFrameInfo &getFrameInfo(unsigned i) { return FrameInfos[i]; }
-
-  ArrayRef<MCDwarfFrameInfo> getFrameInfos() const { return FrameInfos; }
-
-  unsigned getNumW64UnwindInfos() { return W64UnwindInfos.size(); }
-
-  MCWin64EHUnwindInfo &getW64UnwindInfo(unsigned i) {
-    return *W64UnwindInfos[i];
+  unsigned getNumFrameInfos() { return DwarfFrameInfos.size(); }
+  ArrayRef<MCDwarfFrameInfo> getDwarfFrameInfos() const {
+    return DwarfFrameInfos;
   }
 
-  ArrayRef<MCWin64EHUnwindInfo *> getW64UnwindInfos() const {
-    return W64UnwindInfos;
+  unsigned getNumWinFrameInfos() { return WinFrameInfos.size(); }
+  ArrayRef<WinEH::FrameInfo *> getWinFrameInfos() const {
+    return WinFrameInfos;
   }
 
   void generateCompactUnwindEncodings(MCAsmBackend *MAB);
@@ -354,8 +346,8 @@
   /// @p Section.  This is required to update CurSection.
   ///
   /// This corresponds to assembler directives like .section, .text, etc.
-  void SwitchSection(const MCSection *Section,
-                     const MCExpr *Subsection = nullptr) {
+  virtual void SwitchSection(const MCSection *Section,
+                             const MCExpr *Subsection = nullptr) {
     assert(Section && "Cannot switch to a null section!");
     MCSectionSubPair curSection = SectionStack.back().first;
     SectionStack.back().second = curSection;
@@ -378,7 +370,7 @@
   }
 
   /// Create the default sections and set the initial one.
-  virtual void InitSections();
+  virtual void InitSections(bool NoExecStack);
 
   /// AssignSection - Sets the symbol's section.
   ///
@@ -557,12 +549,6 @@
   /// to pass in a MCExpr for constant integers.
   virtual void EmitIntValue(uint64_t Value, unsigned Size);
 
-  /// EmitAbsValue - Emit the Value, but try to avoid relocations. On MachO
-  /// this is done by producing
-  /// foo = value
-  /// .long foo
-  void EmitAbsValue(const MCExpr *Value, unsigned Size);
-
   virtual void EmitULEB128Value(const MCExpr *Value);
 
   virtual void EmitSLEB128Value(const MCExpr *Value);
@@ -577,7 +563,8 @@
 
   /// EmitSymbolValue - Special case of EmitValue that avoids the client
   /// having to pass in a MCExpr for MCSymbols.
-  void EmitSymbolValue(const MCSymbol *Sym, unsigned Size);
+  void EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
+                       bool IsSectionRelative = false);
 
   /// EmitGPRel64Value - Emit the expression @p Value into the output as a
   /// gprel64 (64-bit GP relative) value.
@@ -673,11 +660,6 @@
                                      StringRef FileName);
 
   virtual MCSymbol *getDwarfLineTableSymbol(unsigned CUID);
-
-  void EmitDwarfSetLineAddr(int64_t LineDelta, const MCSymbol *Label,
-                            int PointerSize);
-
-  virtual void EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding);
   virtual void EmitCFISections(bool EH, bool Debug);
   void EmitCFIStartProc(bool IsSimple);
   void EmitCFIEndProc();
@@ -786,8 +768,8 @@
 /// createELFStreamer - Create a machine code streamer which will generate
 /// ELF format object files.
 MCStreamer *createELFStreamer(MCContext &Ctx, MCAsmBackend &TAB,
-                              raw_ostream &OS, MCCodeEmitter *CE, bool RelaxAll,
-                              bool NoExecStack);
+                              raw_ostream &OS, MCCodeEmitter *CE,
+                              bool RelaxAll);
 
 } // end namespace llvm
 

diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h
index 088c5e7..9d09bd8 100644
--- a/include/llvm/MC/MCSubtargetInfo.h
+++ b/include/llvm/MC/MCSubtargetInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_MC_MCSUBTARGET_H
-#define LLVM_MC_MCSUBTARGET_H
+#ifndef LLVM_MC_MCSUBTARGETINFO_H
+#define LLVM_MC_MCSUBTARGETINFO_H
 
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/MC/SubtargetFeature.h"
@@ -36,7 +36,7 @@
   const MCWriteProcResEntry *WriteProcResTable;
   const MCWriteLatencyEntry *WriteLatencyTable;
   const MCReadAdvanceEntry *ReadAdvanceTable;
-  const MCSchedModel *CPUSchedModel;
+  MCSchedModel CPUSchedModel;
 
   const InstrStage *Stages;            // Instruction itinerary stages
   const unsigned *OperandCycles;       // Itinerary operand cycles
@@ -65,6 +65,10 @@
     return FeatureBits;
   }
 
+  /// setFeatureBits - Set the feature bits.
+  ///
+  void setFeatureBits(uint64_t FeatureBits_) { FeatureBits = FeatureBits_; }
+
   /// InitMCProcessorInfo - Set or change the CPU (optionally supplemented with
   /// feature string). Recompute feature bits and scheduling model.
   void InitMCProcessorInfo(StringRef CPU, StringRef FS);
@@ -82,11 +86,11 @@
 
   /// getSchedModelForCPU - Get the machine model of a CPU.
   ///
-  const MCSchedModel *getSchedModelForCPU(StringRef CPU) const;
+  MCSchedModel getSchedModelForCPU(StringRef CPU) const;
 
   /// getSchedModel - Get the machine model for this subtarget's CPU.
   ///
-  const MCSchedModel *getSchedModel() const { return CPUSchedModel; }
+  const MCSchedModel &getSchedModel() const { return CPUSchedModel; }
 
   /// Return an iterator at the first process resource consumed by the given
   /// scheduling class.

diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index 384cc1b..cf92307 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_MC_TARGETPARSER_H
-#define LLVM_MC_TARGETPARSER_H
+#ifndef LLVM_MC_MCTARGETASMPARSER_H
+#define LLVM_MC_MCTARGETASMPARSER_H
 
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
@@ -38,20 +38,22 @@
   AOK_Input,          // Rewrite in terms of $N.
   AOK_Output,         // Rewrite in terms of $N.
   AOK_SizeDirective,  // Add a sizing directive (e.g., dword ptr).
+  AOK_Label,          // Rewrite local labels.
   AOK_Skip            // Skip emission (e.g., offset/type operators).
 };
 
 const char AsmRewritePrecedence [] = {
   0, // AOK_Delete
-  1, // AOK_Align
-  1, // AOK_DotOperator
-  1, // AOK_Emit
-  3, // AOK_Imm
-  3, // AOK_ImmPrefix
-  2, // AOK_Input
-  2, // AOK_Output
-  4, // AOK_SizeDirective
-  1  // AOK_Skip
+  2, // AOK_Align
+  2, // AOK_DotOperator
+  2, // AOK_Emit
+  4, // AOK_Imm
+  4, // AOK_ImmPrefix
+  3, // AOK_Input
+  3, // AOK_Output
+  5, // AOK_SizeDirective
+  1, // AOK_Label
+  2  // AOK_Skip
 };
 
 struct AsmRewrite {
@@ -59,9 +61,12 @@
   SMLoc Loc;
   unsigned Len;
   unsigned Val;
+  StringRef Label;
 public:
   AsmRewrite(AsmRewriteKind kind, SMLoc loc, unsigned len = 0, unsigned val = 0)
     : Kind(kind), Loc(loc), Len(len), Val(val) {}
+  AsmRewrite(AsmRewriteKind kind, SMLoc loc, unsigned len, StringRef label)
+    : Kind(kind), Loc(loc), Len(len), Val(0), Label(label) {}
 };
 
 struct ParseInstructionInfo {
@@ -93,7 +98,7 @@
   MCTargetAsmParser();
 
   /// AvailableFeatures - The current set of available features.
-  unsigned AvailableFeatures;
+  uint64_t AvailableFeatures;
 
   /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
   bool ParsingInlineAsm;
@@ -108,12 +113,14 @@
 public:
   virtual ~MCTargetAsmParser();
 
-  unsigned getAvailableFeatures() const { return AvailableFeatures; }
-  void setAvailableFeatures(unsigned Value) { AvailableFeatures = Value; }
+  uint64_t getAvailableFeatures() const { return AvailableFeatures; }
+  void setAvailableFeatures(uint64_t Value) { AvailableFeatures = Value; }
 
   bool isParsingInlineAsm () { return ParsingInlineAsm; }
   void setParsingInlineAsm (bool Value) { ParsingInlineAsm = Value; }
 
+  MCTargetOptions getTargetOptions() const { return MCOptions; }
+
   void setSemaCallback(MCAsmParserSemaCallback *Callback) {
     SemaCallback = Callback;
   }
@@ -121,6 +128,9 @@
   virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
                              SMLoc &EndLoc) = 0;
 
+  /// Sets frame register corresponding to the current MachineFunction.
+  virtual void SetFrameRegister(unsigned RegNo) {}
+
   /// ParseInstruction - Parse one assembly instruction.
   ///
   /// The parser is positioned following the instruction name. The target
@@ -161,9 +171,12 @@
   /// explaining the match failure.
   virtual bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                        OperandVector &Operands, MCStreamer &Out,
-                                       unsigned &ErrorInfo,
+                                       uint64_t &ErrorInfo,
                                        bool MatchingInlineAsm) = 0;
 
+  /// Allows targets to let registers opt out of clobber lists.
+  virtual bool OmitRegisterFromClobberLists(unsigned RegNo) { return false; }
+
   /// Allow a target to add special case operand matching for things that
   /// tblgen doesn't/can't handle effectively. For example, literal
   /// immediates on ARM. TableGen expects a token operand, but the parser

diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h
index eb4348e..de79bae 100644
--- a/include/llvm/MC/MCTargetOptions.h
+++ b/include/llvm/MC/MCTargetOptions.h

@@ -24,6 +24,7 @@
 
   bool MCRelaxAll : 1;
   bool MCNoExecStack : 1;
+  bool MCFatalWarnings : 1;
   bool MCSaveTempLabels : 1;
   bool MCUseDwarfDirectory : 1;
   bool ShowMCEncoding : 1;
@@ -38,12 +39,13 @@
   return (ARE_EQUAL(SanitizeAddress) &&
           ARE_EQUAL(MCRelaxAll) &&
           ARE_EQUAL(MCNoExecStack) &&
+          ARE_EQUAL(MCFatalWarnings) &&
           ARE_EQUAL(MCSaveTempLabels) &&
           ARE_EQUAL(MCUseDwarfDirectory) &&
           ARE_EQUAL(ShowMCEncoding) &&
           ARE_EQUAL(ShowMCInst) &&
           ARE_EQUAL(AsmVerbose) &&
-	  ARE_EQUAL(DwarfVersion));
+          ARE_EQUAL(DwarfVersion));
 #undef ARE_EQUAL
 }
 

diff --git a/include/llvm/MC/MCWin64EH.h b/include/llvm/MC/MCWin64EH.h
index d21e762..0e81a19 100644
--- a/include/llvm/MC/MCWin64EH.h
+++ b/include/llvm/MC/MCWin64EH.h

@@ -15,79 +15,49 @@
 #ifndef LLVM_MC_MCWIN64EH_H
 #define LLVM_MC_MCWIN64EH_H
 
+#include "llvm/MC/MCWinEH.h"
 #include "llvm/Support/Win64EH.h"
-#include <cassert>
 #include <vector>
 
 namespace llvm {
-  class StringRef;
-  class MCStreamer;
-  class MCSymbol;
+class MCStreamer;
+class MCSymbol;
 
-  class MCWin64EHInstruction {
-  public:
-    typedef Win64EH::UnwindOpcodes OpType;
-  private:
-    OpType Operation;
-    MCSymbol *Label;
-    unsigned Offset;
-    unsigned Register;
-  public:
-    MCWin64EHInstruction(OpType Op, MCSymbol *L, unsigned Reg)
-      : Operation(Op), Label(L), Offset(0), Register(Reg) {
-     assert(Op == Win64EH::UOP_PushNonVol);
-    }
-    MCWin64EHInstruction(MCSymbol *L, unsigned Size)
-      : Operation(Size>128 ? Win64EH::UOP_AllocLarge : Win64EH::UOP_AllocSmall),
-        Label(L), Offset(Size) { }
-    MCWin64EHInstruction(OpType Op, MCSymbol *L, unsigned Reg, unsigned Off)
-      : Operation(Op), Label(L), Offset(Off), Register(Reg) {
-      assert(Op == Win64EH::UOP_SetFPReg ||
-             Op == Win64EH::UOP_SaveNonVol ||
-             Op == Win64EH::UOP_SaveNonVolBig ||
-             Op == Win64EH::UOP_SaveXMM128 ||
-             Op == Win64EH::UOP_SaveXMM128Big);
-    }
-    MCWin64EHInstruction(OpType Op, MCSymbol *L, bool Code)
-      : Operation(Op), Label(L), Offset(Code ? 1 : 0) {
-      assert(Op == Win64EH::UOP_PushMachFrame);
-    }
-    OpType getOperation() const { return Operation; }
-    MCSymbol *getLabel() const { return Label; }
-    unsigned getOffset() const { return Offset; }
-    unsigned getSize() const { return Offset; }
-    unsigned getRegister() const { return Register; }
-    bool isPushCodeFrame() const { return Offset == 1; }
-  };
+namespace Win64EH {
+struct Instruction {
+  static WinEH::Instruction PushNonVol(MCSymbol *L, unsigned Reg) {
+    return WinEH::Instruction(Win64EH::UOP_PushNonVol, L, Reg, -1);
+  }
+  static WinEH::Instruction Alloc(MCSymbol *L, unsigned Size) {
+    return WinEH::Instruction(Size > 128 ? UOP_AllocLarge : UOP_AllocSmall, L,
+                              -1, Size);
+  }
+  static WinEH::Instruction PushMachFrame(MCSymbol *L, bool Code) {
+    return WinEH::Instruction(UOP_PushMachFrame, L, -1, Code ? 1 : 0);
+  }
+  static WinEH::Instruction SaveNonVol(MCSymbol *L, unsigned Reg,
+                                       unsigned Offset) {
+    return WinEH::Instruction(Offset > 512 * 1024 - 8 ? UOP_SaveNonVolBig
+                                                      : UOP_SaveNonVol,
+                              L, Reg, Offset);
+  }
+  static WinEH::Instruction SaveXMM(MCSymbol *L, unsigned Reg,
+                                    unsigned Offset) {
+    return WinEH::Instruction(Offset > 512 * 1024 - 8 ? UOP_SaveXMM128Big
+                                                      : UOP_SaveXMM128,
+                              L, Reg, Offset);
+  }
+  static WinEH::Instruction SetFPReg(MCSymbol *L, unsigned Reg, unsigned Off) {
+    return WinEH::Instruction(UOP_SetFPReg, L, Reg, Off);
+  }
+};
 
-  struct MCWin64EHUnwindInfo {
-    MCWin64EHUnwindInfo()
-      : Begin(nullptr), End(nullptr),ExceptionHandler(nullptr),
-        Function(nullptr), PrologEnd(nullptr), Symbol(nullptr),
-        HandlesUnwind(false), HandlesExceptions(false), LastFrameInst(-1),
-        ChainedParent(nullptr), Instructions() {}
-    MCSymbol *Begin;
-    MCSymbol *End;
-    const MCSymbol *ExceptionHandler;
-    const MCSymbol *Function;
-    MCSymbol *PrologEnd;
-    MCSymbol *Symbol;
-    bool HandlesUnwind;
-    bool HandlesExceptions;
-    int LastFrameInst;
-    MCWin64EHUnwindInfo *ChainedParent;
-    std::vector<MCWin64EHInstruction> Instructions;
-  };
-
-  class MCWin64EHUnwindEmitter {
-  public:
-    static StringRef GetSectionSuffix(const MCSymbol *func);
-    //
-    // This emits the unwind info sections (.pdata and .xdata in PE/COFF).
-    //
-    static void Emit(MCStreamer &streamer);
-    static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info);
-  };
+class UnwindEmitter : public WinEH::UnwindEmitter {
+public:
+  void Emit(MCStreamer &Streamer) const override;
+  void EmitUnwindInfo(MCStreamer &Streamer, WinEH::FrameInfo *FI) const override;
+};
+}
 } // end namespace llvm
 
 #endif

diff --git a/include/llvm/MC/MCWinCOFFStreamer.h b/include/llvm/MC/MCWinCOFFStreamer.h
index 7d2d0e4..57a75ce 100644
--- a/include/llvm/MC/MCWinCOFFStreamer.h
+++ b/include/llvm/MC/MCWinCOFFStreamer.h

@@ -30,10 +30,16 @@
   MCWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, MCCodeEmitter &CE,
                     raw_ostream &OS);
 
+  /// state management
+  void reset() override {
+    CurSymbol = nullptr;
+    MCObjectStreamer::reset();
+  }
+
   /// \name MCStreamer interface
   /// \{
 
-  void InitSections() override;
+  void InitSections(bool NoExecStack) override;
   void EmitLabel(MCSymbol *Symbol) override;
   void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
   void EmitThumbFunc(MCSymbol *Func) override;

diff --git a/include/llvm/MC/MCWinEH.h b/include/llvm/MC/MCWinEH.h
new file mode 100644
index 0000000..05b58c7
--- /dev/null
+++ b/include/llvm/MC/MCWinEH.h

@@ -0,0 +1,84 @@
+//===- MCWinEH.h - Windows Unwinding Support --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCWINEH_H
+#define LLVM_MC_MCWINEH_H
+
+#include <vector>
+
+namespace llvm {
+class MCContext;
+class MCSection;
+class MCStreamer;
+class MCSymbol;
+class StringRef;
+
+namespace WinEH {
+struct Instruction {
+  const MCSymbol *Label;
+  const unsigned Offset;
+  const unsigned Register;
+  const unsigned Operation;
+
+  Instruction(unsigned Op, MCSymbol *L, unsigned Reg, unsigned Off)
+    : Label(L), Offset(Off), Register(Reg), Operation(Op) {}
+};
+
+struct FrameInfo {
+  const MCSymbol *Begin;
+  const MCSymbol *End;
+  const MCSymbol *ExceptionHandler;
+  const MCSymbol *Function;
+  const MCSymbol *PrologEnd;
+  const MCSymbol *Symbol;
+
+  bool HandlesUnwind;
+  bool HandlesExceptions;
+
+  int LastFrameInst;
+  const FrameInfo *ChainedParent;
+  std::vector<Instruction> Instructions;
+
+  FrameInfo()
+    : Begin(nullptr), End(nullptr), ExceptionHandler(nullptr),
+      Function(nullptr), PrologEnd(nullptr), Symbol(nullptr),
+      HandlesUnwind(false), HandlesExceptions(false), LastFrameInst(-1),
+      ChainedParent(nullptr), Instructions() {}
+  FrameInfo(const MCSymbol *Function, const MCSymbol *BeginFuncEHLabel)
+    : Begin(BeginFuncEHLabel), End(nullptr), ExceptionHandler(nullptr),
+      Function(Function), PrologEnd(nullptr), Symbol(nullptr),
+      HandlesUnwind(false), HandlesExceptions(false), LastFrameInst(-1),
+      ChainedParent(nullptr), Instructions() {}
+  FrameInfo(const MCSymbol *Function, const MCSymbol *BeginFuncEHLabel,
+            const FrameInfo *ChainedParent)
+    : Begin(BeginFuncEHLabel), End(nullptr), ExceptionHandler(nullptr),
+      Function(Function), PrologEnd(nullptr), Symbol(nullptr),
+      HandlesUnwind(false), HandlesExceptions(false), LastFrameInst(-1),
+      ChainedParent(ChainedParent), Instructions() {}
+};
+
+class UnwindEmitter {
+public:
+  static const MCSection *getPDataSection(const MCSymbol *Function,
+                                          MCContext &Context);
+  static const MCSection *getXDataSection(const MCSymbol *Function,
+                                          MCContext &Context);
+
+  virtual ~UnwindEmitter() { }
+
+  //
+  // This emits the unwind info sections (.pdata and .xdata in PE/COFF).
+  //
+  virtual void Emit(MCStreamer &Streamer) const = 0;
+  virtual void EmitUnwindInfo(MCStreamer &Streamer, FrameInfo *FI) const = 0;
+};
+}
+}
+
+#endif

diff --git a/include/llvm/MC/StringTableBuilder.h b/include/llvm/MC/StringTableBuilder.h
index 065e9e0..897d449 100644
--- a/include/llvm/MC/StringTableBuilder.h
+++ b/include/llvm/MC/StringTableBuilder.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_MC_STRINGTABLE_BUILDER_H
-#define LLVM_MC_STRINGTABLE_BUILDER_H
+#ifndef LLVM_MC_STRINGTABLEBUILDER_H
+#define LLVM_MC_STRINGTABLEBUILDER_H
 
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
@@ -26,12 +26,18 @@
   /// copy of s. Can only be used before the table is finalized.
   StringRef add(StringRef s) {
     assert(!isFinalized());
-    return StringIndexMap.GetOrCreateValue(s, 0).getKey();
+    return StringIndexMap.insert(std::make_pair(s, 0)).first->first();
   }
 
+  enum Kind {
+    ELF,
+    WinCOFF,
+    MachO
+  };
+
   /// \brief Analyze the strings and build the final table. No more strings can
   /// be added after this point.
-  void finalize();
+  void finalize(Kind kind);
 
   /// \brief Retrieve the string table data. Can only be used after the table
   /// is finalized.
@@ -48,6 +54,8 @@
     return StringIndexMap[s];
   }
 
+  void clear();
+
 private:
   bool isFinalized() {
     return !StringTable.empty();

diff --git a/include/llvm/MC/SubtargetFeature.h b/include/llvm/MC/SubtargetFeature.h
index c5d62a6..bfecb8b 100644
--- a/include/llvm/MC/SubtargetFeature.h
+++ b/include/llvm/MC/SubtargetFeature.h

@@ -72,21 +72,21 @@
 class SubtargetFeatures {
   std::vector<std::string> Features;    // Subtarget features as a vector
 public:
-  explicit SubtargetFeatures(const StringRef Initial = "");
+  explicit SubtargetFeatures(StringRef Initial = "");
 
   /// Features string accessors.
   std::string getString() const;
 
   /// Adding Features.
-  void AddFeature(const StringRef String);
+  void AddFeature(StringRef String);
 
   /// ToggleFeature - Toggle a feature and returns the newly updated feature
   /// bits.
-  uint64_t ToggleFeature(uint64_t Bits, const StringRef String,
+  uint64_t ToggleFeature(uint64_t Bits, StringRef String,
                          ArrayRef<SubtargetFeatureKV> FeatureTable);
 
   /// Get feature bits of a CPU.
-  uint64_t getFeatureBits(const StringRef CPU,
+  uint64_t getFeatureBits(StringRef CPU,
                           ArrayRef<SubtargetFeatureKV> CPUTable,
                           ArrayRef<SubtargetFeatureKV> FeatureTable);
 

diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h
index af6c995..7c03dcd 100644
--- a/include/llvm/Object/Archive.h
+++ b/include/llvm/Object/Archive.h

@@ -14,6 +14,7 @@
 #ifndef LLVM_OBJECT_ARCHIVE_H
 #define LLVM_OBJECT_ARCHIVE_H
 
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Object/Binary.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -89,8 +90,7 @@
       return StringRef(Data.data() + StartOfFile, getSize());
     }
 
-    ErrorOr<std::unique_ptr<MemoryBuffer>>
-    getMemoryBuffer(bool FullPath = false) const;
+    ErrorOr<MemoryBufferRef> getMemoryBufferRef() const;
 
     ErrorOr<std::unique_ptr<Binary>>
     getAsBinary(LLVMContext *Context = nullptr) const;
@@ -98,12 +98,12 @@
 
   class child_iterator {
     Child child;
+
   public:
     child_iterator() : child(Child(nullptr, nullptr)) {}
     child_iterator(const Child &c) : child(c) {}
-    const Child* operator->() const {
-      return &child;
-    }
+    const Child *operator->() const { return &child; }
+    const Child &operator*() const { return child; }
 
     bool operator==(const child_iterator &other) const {
       return child == other.child;
@@ -113,11 +113,11 @@
       return !(*this == other);
     }
 
-    bool operator <(const child_iterator &other) const {
+    bool operator<(const child_iterator &other) const {
       return child < other.child;
     }
 
-    child_iterator& operator++() {  // Preincrement
+    child_iterator &operator++() { // Preincrement
       child = child.getNext();
       return *this;
     }
@@ -164,8 +164,8 @@
     }
   };
 
-  Archive(std::unique_ptr<MemoryBuffer> Source, std::error_code &EC);
-  static ErrorOr<Archive *> create(std::unique_ptr<MemoryBuffer> Source);
+  Archive(MemoryBufferRef Source, std::error_code &EC);
+  static ErrorOr<std::unique_ptr<Archive>> create(MemoryBufferRef Source);
 
   enum Kind {
     K_GNU,
@@ -179,6 +179,10 @@
 
   child_iterator child_begin(bool SkipInternal = true) const;
   child_iterator child_end() const;
+  iterator_range<child_iterator> children(bool SkipInternal = true) const {
+    return iterator_range<child_iterator>(child_begin(SkipInternal),
+                                          child_end());
+  }
 
   symbol_iterator symbol_begin() const;
   symbol_iterator symbol_end() const;

diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h
index 9be2fbe..4b2b7e6 100644
--- a/include/llvm/Object/Binary.h
+++ b/include/llvm/Object/Binary.h

@@ -17,11 +17,11 @@
 #include "llvm/Object/Error.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
 
 namespace llvm {
 
 class LLVMContext;
-class MemoryBuffer;
 class StringRef;
 
 namespace object {
@@ -34,9 +34,9 @@
   unsigned int TypeID;
 
 protected:
-  std::unique_ptr<MemoryBuffer> Data;
+  MemoryBufferRef Data;
 
-  Binary(unsigned int Type, std::unique_ptr<MemoryBuffer> Source);
+  Binary(unsigned int Type, MemoryBufferRef Source);
 
   enum {
     ID_Archive,
@@ -78,8 +78,8 @@
   virtual ~Binary();
 
   StringRef getData() const;
-  MemoryBuffer *releaseBuffer() { return Data.release(); }
   StringRef getFileName() const;
+  MemoryBufferRef getMemoryBufferRef() const;
 
   // Cast methods.
   unsigned int getType() const { return TypeID; }
@@ -125,13 +125,59 @@
 
 /// @brief Create a Binary from Source, autodetecting the file type.
 ///
-/// @param Source The data to create the Binary from. Ownership is transferred
-///        to the Binary if successful. If an error is returned,
-///        Source is destroyed by createBinary before returning.
-ErrorOr<Binary *> createBinary(std::unique_ptr<MemoryBuffer> &Source,
-                               LLVMContext *Context = nullptr);
+/// @param Source The data to create the Binary from.
+ErrorOr<std::unique_ptr<Binary>> createBinary(MemoryBufferRef Source,
+                                              LLVMContext *Context = nullptr);
 
-ErrorOr<Binary *> createBinary(StringRef Path);
+template <typename T> class OwningBinary {
+  std::unique_ptr<T> Bin;
+  std::unique_ptr<MemoryBuffer> Buf;
+
+public:
+  OwningBinary();
+  OwningBinary(std::unique_ptr<T> Bin, std::unique_ptr<MemoryBuffer> Buf);
+  OwningBinary(OwningBinary<T>&& Other);
+  OwningBinary<T> &operator=(OwningBinary<T> &&Other);
+
+  std::pair<std::unique_ptr<T>, std::unique_ptr<MemoryBuffer>> takeBinary();
+
+  T* getBinary();
+  const T* getBinary() const;
+};
+
+template <typename T>
+OwningBinary<T>::OwningBinary(std::unique_ptr<T> Bin,
+                              std::unique_ptr<MemoryBuffer> Buf)
+    : Bin(std::move(Bin)), Buf(std::move(Buf)) {}
+
+template <typename T> OwningBinary<T>::OwningBinary() {}
+
+template <typename T>
+OwningBinary<T>::OwningBinary(OwningBinary &&Other)
+    : Bin(std::move(Other.Bin)), Buf(std::move(Other.Buf)) {}
+
+template <typename T>
+OwningBinary<T> &OwningBinary<T>::operator=(OwningBinary &&Other) {
+  Bin = std::move(Other.Bin);
+  Buf = std::move(Other.Buf);
+  return *this;
+}
+
+template <typename T>
+std::pair<std::unique_ptr<T>, std::unique_ptr<MemoryBuffer>>
+OwningBinary<T>::takeBinary() {
+  return std::make_pair(std::move(Bin), std::move(Buf));
+}
+
+template <typename T> T* OwningBinary<T>::getBinary() {
+  return Bin.get();
+}
+
+template <typename T> const T* OwningBinary<T>::getBinary() const {
+  return Bin.get();
+}
+
+ErrorOr<OwningBinary<Binary>> createBinary(StringRef Path);
 }
 }
 

diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h
index e2da070..3368d68 100644
--- a/include/llvm/Object/COFF.h
+++ b/include/llvm/Object/COFF.h

@@ -14,22 +14,31 @@
 #ifndef LLVM_OBJECT_COFF_H
 #define LLVM_OBJECT_COFF_H
 
+#include "llvm/ADT/PointerUnion.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/ErrorOr.h"
 
 namespace llvm {
 template <typename T> class ArrayRef;
 
 namespace object {
 class ImportDirectoryEntryRef;
+class DelayImportDirectoryEntryRef;
 class ExportDirectoryEntryRef;
+class ImportedSymbolRef;
+class BaseRelocRef;
 typedef content_iterator<ImportDirectoryEntryRef> import_directory_iterator;
+typedef content_iterator<DelayImportDirectoryEntryRef>
+    delay_import_directory_iterator;
 typedef content_iterator<ExportDirectoryEntryRef> export_directory_iterator;
+typedef content_iterator<ImportedSymbolRef> imported_symbol_iterator;
+typedef content_iterator<BaseRelocRef> base_reloc_iterator;
 
 /// The DOS compatible header at the front of all PE/COFF executables.
 struct dos_header {
-  support::ulittle16_t Magic;
+  char                 Magic[2];
   support::ulittle16_t UsedBytesInTheLastPage;
   support::ulittle16_t FileSizeInPages;
   support::ulittle16_t NumberOfRelocationItems;
@@ -62,6 +71,22 @@
   bool isImportLibrary() const { return NumberOfSections == 0xffff; }
 };
 
+struct coff_bigobj_file_header {
+  support::ulittle16_t Sig1;
+  support::ulittle16_t Sig2;
+  support::ulittle16_t Version;
+  support::ulittle16_t Machine;
+  support::ulittle32_t TimeDateStamp;
+  uint8_t              UUID[16];
+  support::ulittle32_t unused1;
+  support::ulittle32_t unused2;
+  support::ulittle32_t unused3;
+  support::ulittle32_t unused4;
+  support::ulittle32_t NumberOfSections;
+  support::ulittle32_t PointerToSymbolTable;
+  support::ulittle32_t NumberOfSymbols;
+};
+
 /// The 32-bit PE header that follows the COFF header.
 struct pe32_header {
   support::ulittle16_t Magic;
@@ -87,12 +112,14 @@
   support::ulittle32_t SizeOfHeaders;
   support::ulittle32_t CheckSum;
   support::ulittle16_t Subsystem;
+  // FIXME: This should be DllCharacteristics.
   support::ulittle16_t DLLCharacteristics;
   support::ulittle32_t SizeOfStackReserve;
   support::ulittle32_t SizeOfStackCommit;
   support::ulittle32_t SizeOfHeapReserve;
   support::ulittle32_t SizeOfHeapCommit;
   support::ulittle32_t LoaderFlags;
+  // FIXME: This should be NumberOfRvaAndSizes.
   support::ulittle32_t NumberOfRvaAndSize;
 };
 
@@ -142,22 +169,40 @@
   support::ulittle32_t ImportAddressTableRVA;
 };
 
-struct import_lookup_table_entry32 {
-  support::ulittle32_t data;
+template <typename IntTy>
+struct import_lookup_table_entry {
+  IntTy Data;
 
-  bool isOrdinal() const { return data & 0x80000000; }
+  bool isOrdinal() const { return Data < 0; }
 
   uint16_t getOrdinal() const {
     assert(isOrdinal() && "ILT entry is not an ordinal!");
-    return data & 0xFFFF;
+    return Data & 0xFFFF;
   }
 
   uint32_t getHintNameRVA() const {
     assert(!isOrdinal() && "ILT entry is not a Hint/Name RVA!");
-    return data;
+    return Data & 0xFFFFFFFF;
   }
 };
 
+typedef import_lookup_table_entry<support::little32_t>
+    import_lookup_table_entry32;
+typedef import_lookup_table_entry<support::little64_t>
+    import_lookup_table_entry64;
+
+struct delay_import_directory_table_entry {
+  // dumpbin reports this field as "Characteristics" instead of "Attributes".
+  support::ulittle32_t Attributes;
+  support::ulittle32_t Name;
+  support::ulittle32_t ModuleHandle;
+  support::ulittle32_t DelayImportAddressTable;
+  support::ulittle32_t DelayImportNameTable;
+  support::ulittle32_t BoundDelayImportTable;
+  support::ulittle32_t UnloadDelayImportTable;
+  support::ulittle32_t TimeStamp;
+};
+
 struct export_directory_table_entry {
   support::ulittle32_t ExportFlags;
   support::ulittle32_t TimeDateStamp;
@@ -180,67 +225,147 @@
 typedef support::ulittle32_t export_name_pointer_table_entry;
 typedef support::ulittle16_t export_ordinal_table_entry;
 
-struct coff_symbol {
-  struct StringTableOffset {
-    support::ulittle32_t Zeroes;
-    support::ulittle32_t Offset;
-  };
+struct StringTableOffset {
+  support::ulittle32_t Zeroes;
+  support::ulittle32_t Offset;
+};
 
+template <typename SectionNumberType>
+struct coff_symbol {
   union {
-    char ShortName[8];
+    char ShortName[COFF::NameSize];
     StringTableOffset Offset;
   } Name;
 
   support::ulittle32_t Value;
-  support::ulittle16_t SectionNumber;
+  SectionNumberType SectionNumber;
 
   support::ulittle16_t Type;
 
-  support::ulittle8_t StorageClass;
-  support::ulittle8_t NumberOfAuxSymbols;
+  uint8_t StorageClass;
+  uint8_t NumberOfAuxSymbols;
+};
 
-  uint8_t getBaseType() const { return Type & 0x0F; }
+typedef coff_symbol<support::ulittle16_t> coff_symbol16;
+typedef coff_symbol<support::ulittle32_t> coff_symbol32;
 
-  uint8_t getComplexType() const { return (Type & 0xF0) >> 4; }
+class COFFSymbolRef {
+public:
+  COFFSymbolRef(const coff_symbol16 *CS) : CS16(CS), CS32(nullptr) {}
+  COFFSymbolRef(const coff_symbol32 *CS) : CS16(nullptr), CS32(CS) {}
+  COFFSymbolRef() : CS16(nullptr), CS32(nullptr) {}
 
-  bool isFunctionDefinition() const {
-    return StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL &&
-           getBaseType() == COFF::IMAGE_SYM_TYPE_NULL &&
-           getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION &&
-           !COFF::isReservedSectionNumber(SectionNumber);
+  const void *getRawPtr() const {
+    return CS16 ? static_cast<const void *>(CS16) : CS32;
   }
 
-  bool isFunctionLineInfo() const {
-    return StorageClass == COFF::IMAGE_SYM_CLASS_FUNCTION;
+  friend bool operator<(COFFSymbolRef A, COFFSymbolRef B) {
+    return A.getRawPtr() < B.getRawPtr();
+  }
+
+  bool isBigObj() const {
+    if (CS16)
+      return false;
+    if (CS32)
+      return true;
+    llvm_unreachable("COFFSymbolRef points to nothing!");
+  }
+
+  const char *getShortName() const {
+    return CS16 ? CS16->Name.ShortName : CS32->Name.ShortName;
+  }
+
+  const StringTableOffset &getStringTableOffset() const {
+    return CS16 ? CS16->Name.Offset : CS32->Name.Offset;
+  }
+
+  uint32_t getValue() const { return CS16 ? CS16->Value : CS32->Value; }
+
+  int32_t getSectionNumber() const {
+    if (CS16) {
+      // Reserved sections are returned as negative numbers.
+      if (CS16->SectionNumber <= COFF::MaxNumberOfSections16)
+        return CS16->SectionNumber;
+      return static_cast<int16_t>(CS16->SectionNumber);
+    }
+    return static_cast<int32_t>(CS32->SectionNumber);
+  }
+
+  uint16_t getType() const { return CS16 ? CS16->Type : CS32->Type; }
+
+  uint8_t getStorageClass() const {
+    return CS16 ? CS16->StorageClass : CS32->StorageClass;
+  }
+
+  uint8_t getNumberOfAuxSymbols() const {
+    return CS16 ? CS16->NumberOfAuxSymbols : CS32->NumberOfAuxSymbols;
+  }
+
+  uint8_t getBaseType() const { return getType() & 0x0F; }
+
+  uint8_t getComplexType() const {
+    return (getType() & 0xF0) >> COFF::SCT_COMPLEX_TYPE_SHIFT;
+  }
+
+  bool isExternal() const {
+    return getStorageClass() == COFF::IMAGE_SYM_CLASS_EXTERNAL;
+  }
+
+  bool isCommon() const {
+    return isExternal() && getSectionNumber() == COFF::IMAGE_SYM_UNDEFINED &&
+           getValue() != 0;
+  }
+
+  bool isUndefined() const {
+    return isExternal() && getSectionNumber() == COFF::IMAGE_SYM_UNDEFINED &&
+           getValue() == 0;
   }
 
   bool isWeakExternal() const {
-    return StorageClass == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL ||
-           (StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL &&
-            SectionNumber == COFF::IMAGE_SYM_UNDEFINED && Value == 0);
+    return getStorageClass() == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
+  }
+
+  bool isFunctionDefinition() const {
+    return isExternal() && getBaseType() == COFF::IMAGE_SYM_TYPE_NULL &&
+           getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION &&
+           !COFF::isReservedSectionNumber(getSectionNumber());
+  }
+
+  bool isFunctionLineInfo() const {
+    return getStorageClass() == COFF::IMAGE_SYM_CLASS_FUNCTION;
+  }
+
+  bool isAnyUndefined() const {
+    return isUndefined() || isWeakExternal();
   }
 
   bool isFileRecord() const {
-    return StorageClass == COFF::IMAGE_SYM_CLASS_FILE;
+    return getStorageClass() == COFF::IMAGE_SYM_CLASS_FILE;
   }
 
   bool isSectionDefinition() const {
     // C++/CLI creates external ABS symbols for non-const appdomain globals.
     // These are also followed by an auxiliary section definition.
-    bool isAppdomainGlobal = StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL &&
-                             SectionNumber == COFF::IMAGE_SYM_ABSOLUTE;
-    bool isOrdinarySection =
-        StorageClass == COFF::IMAGE_SYM_CLASS_STATIC && Value == 0;
+    bool isAppdomainGlobal =
+        getStorageClass() == COFF::IMAGE_SYM_CLASS_EXTERNAL &&
+        getSectionNumber() == COFF::IMAGE_SYM_ABSOLUTE;
+    bool isOrdinarySection = getStorageClass() == COFF::IMAGE_SYM_CLASS_STATIC;
+    if (!getNumberOfAuxSymbols())
+      return false;
     return isAppdomainGlobal || isOrdinarySection;
   }
 
   bool isCLRToken() const {
-    return StorageClass == COFF::IMAGE_SYM_CLASS_CLR_TOKEN;
+    return getStorageClass() == COFF::IMAGE_SYM_CLASS_CLR_TOKEN;
   }
+
+private:
+  const coff_symbol16 *CS16;
+  const coff_symbol32 *CS32;
 };
 
 struct coff_section {
-  char Name[8];
+  char Name[COFF::NameSize];
   support::ulittle32_t VirtualSize;
   support::ulittle32_t VirtualAddress;
   support::ulittle32_t SizeOfRawData;
@@ -254,9 +379,9 @@
   // Returns true if the actual number of relocations is stored in
   // VirtualAddress field of the first relocation table entry.
   bool hasExtendedRelocations() const {
-    return Characteristics & COFF::IMAGE_SCN_LNK_NRELOC_OVFL &&
-        NumberOfRelocations == UINT16_MAX;
-  };
+    return (Characteristics & COFF::IMAGE_SCN_LNK_NRELOC_OVFL) &&
+           NumberOfRelocations == UINT16_MAX;
+  }
 };
 
 struct coff_relocation {
@@ -270,7 +395,6 @@
   support::ulittle32_t TotalSize;
   support::ulittle32_t PointerToLinenumber;
   support::ulittle32_t PointerToNextFunction;
-  char Unused[2];
 };
 
 struct coff_aux_bf_and_ef_symbol {
@@ -278,17 +402,11 @@
   support::ulittle16_t Linenumber;
   char Unused2[6];
   support::ulittle32_t PointerToNextFunction;
-  char Unused3[2];
 };
 
 struct coff_aux_weak_external {
   support::ulittle32_t TagIndex;
   support::ulittle32_t Characteristics;
-  char Unused[10];
-};
-
-struct coff_aux_file {
-  char FileName[18];
 };
 
 struct coff_aux_section_definition {
@@ -296,16 +414,22 @@
   support::ulittle16_t NumberOfRelocations;
   support::ulittle16_t NumberOfLinenumbers;
   support::ulittle32_t CheckSum;
-  support::ulittle16_t Number;
-  support::ulittle8_t Selection;
-  char Unused[3];
+  support::ulittle16_t NumberLowPart;
+  uint8_t              Selection;
+  uint8_t              Unused;
+  support::ulittle16_t NumberHighPart;
+  int32_t getNumber(bool IsBigObj) const {
+    uint32_t Number = static_cast<uint32_t>(NumberLowPart);
+    if (IsBigObj)
+      Number |= static_cast<uint32_t>(NumberHighPart) << 16;
+    return static_cast<int32_t>(Number);
+  }
 };
 
 struct coff_aux_clr_token {
-  support::ulittle8_t AuxType;
-  support::ulittle8_t Reserved;
+  uint8_t              AuxType;
+  uint8_t              Reserved;
   support::ulittle32_t SymbolTableIndex;
-  char Unused[12];
 };
 
 struct coff_load_configuration32 {
@@ -324,7 +448,7 @@
   support::ulittle32_t ProcessAffinityMask;
   support::ulittle32_t ProcessHeapFlags;
   support::ulittle16_t CSDVersion;
-  uint16_t Reserved;
+  support::ulittle16_t Reserved;
   support::ulittle32_t EditList;
   support::ulittle32_t SecurityCookie;
   support::ulittle32_t SEHandlerTable;
@@ -337,32 +461,114 @@
   support::ulittle32_t UnwindInformation;
 };
 
+struct coff_base_reloc_block_header {
+  support::ulittle32_t PageRVA;
+  support::ulittle32_t BlockSize;
+};
+
+struct coff_base_reloc_block_entry {
+  support::ulittle16_t Data;
+  int getType() const { return Data >> 12; }
+  int getOffset() const { return Data & ((1 << 12) - 1); }
+};
+
 class COFFObjectFile : public ObjectFile {
 private:
   friend class ImportDirectoryEntryRef;
   friend class ExportDirectoryEntryRef;
   const coff_file_header *COFFHeader;
+  const coff_bigobj_file_header *COFFBigObjHeader;
   const pe32_header *PE32Header;
   const pe32plus_header *PE32PlusHeader;
   const data_directory *DataDirectory;
   const coff_section *SectionTable;
-  const coff_symbol *SymbolTable;
+  const coff_symbol16 *SymbolTable16;
+  const coff_symbol32 *SymbolTable32;
   const char *StringTable;
   uint32_t StringTableSize;
   const import_directory_table_entry *ImportDirectory;
   uint32_t NumberOfImportDirectory;
+  const delay_import_directory_table_entry *DelayImportDirectory;
+  uint32_t NumberOfDelayImportDirectory;
   const export_directory_table_entry *ExportDirectory;
+  const coff_base_reloc_block_header *BaseRelocHeader;
+  const coff_base_reloc_block_header *BaseRelocEnd;
 
   std::error_code getString(uint32_t offset, StringRef &Res) const;
 
-  const coff_symbol *toSymb(DataRefImpl Symb) const;
+  template <typename coff_symbol_type>
+  const coff_symbol_type *toSymb(DataRefImpl Symb) const;
   const coff_section *toSec(DataRefImpl Sec) const;
   const coff_relocation *toRel(DataRefImpl Rel) const;
 
   std::error_code initSymbolTablePtr();
   std::error_code initImportTablePtr();
+  std::error_code initDelayImportTablePtr();
   std::error_code initExportTablePtr();
+  std::error_code initBaseRelocPtr();
 
+public:
+  uintptr_t getSymbolTable() const {
+    if (SymbolTable16)
+      return reinterpret_cast<uintptr_t>(SymbolTable16);
+    if (SymbolTable32)
+      return reinterpret_cast<uintptr_t>(SymbolTable32);
+    return uintptr_t(0);
+  }
+  uint16_t getMachine() const {
+    if (COFFHeader)
+      return COFFHeader->Machine;
+    if (COFFBigObjHeader)
+      return COFFBigObjHeader->Machine;
+    llvm_unreachable("no COFF header!");
+  }
+  uint16_t getSizeOfOptionalHeader() const {
+    if (COFFHeader)
+      return COFFHeader->isImportLibrary() ? 0
+                                           : COFFHeader->SizeOfOptionalHeader;
+    // bigobj doesn't have this field.
+    if (COFFBigObjHeader)
+      return 0;
+    llvm_unreachable("no COFF header!");
+  }
+  uint16_t getCharacteristics() const {
+    if (COFFHeader)
+      return COFFHeader->isImportLibrary() ? 0 : COFFHeader->Characteristics;
+    // bigobj doesn't have characteristics to speak of,
+    // editbin will silently lie to you if you attempt to set any.
+    if (COFFBigObjHeader)
+      return 0;
+    llvm_unreachable("no COFF header!");
+  }
+  uint32_t getTimeDateStamp() const {
+    if (COFFHeader)
+      return COFFHeader->TimeDateStamp;
+    if (COFFBigObjHeader)
+      return COFFBigObjHeader->TimeDateStamp;
+    llvm_unreachable("no COFF header!");
+  }
+  uint32_t getNumberOfSections() const {
+    if (COFFHeader)
+      return COFFHeader->isImportLibrary() ? 0 : COFFHeader->NumberOfSections;
+    if (COFFBigObjHeader)
+      return COFFBigObjHeader->NumberOfSections;
+    llvm_unreachable("no COFF header!");
+  }
+  uint32_t getPointerToSymbolTable() const {
+    if (COFFHeader)
+      return COFFHeader->isImportLibrary() ? 0
+                                           : COFFHeader->PointerToSymbolTable;
+    if (COFFBigObjHeader)
+      return COFFBigObjHeader->PointerToSymbolTable;
+    llvm_unreachable("no COFF header!");
+  }
+  uint32_t getNumberOfSymbols() const {
+    if (COFFHeader)
+      return COFFHeader->isImportLibrary() ? 0 : COFFHeader->NumberOfSymbols;
+    if (COFFBigObjHeader)
+      return COFFBigObjHeader->NumberOfSymbols;
+    llvm_unreachable("no COFF header!");
+  }
 protected:
   void moveSymbolNext(DataRefImpl &Symb) const override;
   std::error_code getSymbolName(DataRefImpl Symb,
@@ -378,24 +584,19 @@
   void moveSectionNext(DataRefImpl &Sec) const override;
   std::error_code getSectionName(DataRefImpl Sec,
                                  StringRef &Res) const override;
-  std::error_code getSectionAddress(DataRefImpl Sec,
-                                    uint64_t &Res) const override;
-  std::error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override;
+  uint64_t getSectionAddress(DataRefImpl Sec) const override;
+  uint64_t getSectionSize(DataRefImpl Sec) const override;
   std::error_code getSectionContents(DataRefImpl Sec,
                                      StringRef &Res) const override;
-  std::error_code getSectionAlignment(DataRefImpl Sec,
-                                      uint64_t &Res) const override;
-  std::error_code isSectionText(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionData(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionReadOnlyData(DataRefImpl Sec,
-                                        bool &Res) const override;
-  std::error_code isSectionRequiredForExecution(DataRefImpl Sec,
-                                                bool &Res) const override;
-  std::error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
-                                        bool &Result) const override;
+  uint64_t getSectionAlignment(DataRefImpl Sec) const override;
+  bool isSectionText(DataRefImpl Sec) const override;
+  bool isSectionData(DataRefImpl Sec) const override;
+  bool isSectionBSS(DataRefImpl Sec) const override;
+  bool isSectionVirtual(DataRefImpl Sec) const override;
+  bool isSectionZeroInit(DataRefImpl Sec) const override;
+  bool isSectionReadOnlyData(DataRefImpl Sec) const override;
+  bool isSectionRequiredForExecution(DataRefImpl Sec) const override;
+  bool sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb) const override;
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
 
@@ -414,54 +615,93 @@
   getRelocationValueString(DataRefImpl Rel,
                            SmallVectorImpl<char> &Result) const override;
 
-  std::error_code getLibraryNext(DataRefImpl LibData,
-                                 LibraryRef &Result) const override;
-  std::error_code getLibraryPath(DataRefImpl LibData,
-                                 StringRef &Result) const override;
-
 public:
-  COFFObjectFile(std::unique_ptr<MemoryBuffer> Object, std::error_code &EC);
+  COFFObjectFile(MemoryBufferRef Object, std::error_code &EC);
   basic_symbol_iterator symbol_begin_impl() const override;
   basic_symbol_iterator symbol_end_impl() const override;
-  library_iterator needed_library_begin() const override;
-  library_iterator needed_library_end() const override;
   section_iterator section_begin() const override;
   section_iterator section_end() const override;
 
   const coff_section *getCOFFSection(const SectionRef &Section) const;
-  const coff_symbol *getCOFFSymbol(const SymbolRef &Symbol) const;
+  COFFSymbolRef getCOFFSymbol(const DataRefImpl &Ref) const;
+  COFFSymbolRef getCOFFSymbol(const SymbolRef &Symbol) const;
   const coff_relocation *getCOFFRelocation(const RelocationRef &Reloc) const;
 
   uint8_t getBytesInAddress() const override;
   StringRef getFileFormatName() const override;
   unsigned getArch() const override;
-  StringRef getLoadName() const override;
 
   import_directory_iterator import_directory_begin() const;
   import_directory_iterator import_directory_end() const;
+  delay_import_directory_iterator delay_import_directory_begin() const;
+  delay_import_directory_iterator delay_import_directory_end() const;
   export_directory_iterator export_directory_begin() const;
   export_directory_iterator export_directory_end() const;
+  base_reloc_iterator base_reloc_begin() const;
+  base_reloc_iterator base_reloc_end() const;
 
-  std::error_code getHeader(const coff_file_header *&Res) const;
-  std::error_code getCOFFHeader(const coff_file_header *&Res) const;
+  iterator_range<import_directory_iterator> import_directories() const;
+  iterator_range<delay_import_directory_iterator>
+      delay_import_directories() const;
+  iterator_range<export_directory_iterator> export_directories() const;
+  iterator_range<base_reloc_iterator> base_relocs() const;
+
+  const dos_header *getDOSHeader() const { // nullptr unless a PE32/PE32+ header is set.
+    if (!PE32Header && !PE32PlusHeader)
+      return nullptr;
+    return reinterpret_cast<const dos_header *>(base()); // View the start of the buffer as the DOS header.
+  }
   std::error_code getPE32Header(const pe32_header *&Res) const;
   std::error_code getPE32PlusHeader(const pe32plus_header *&Res) const;
   std::error_code getDataDirectory(uint32_t index,
                                    const data_directory *&Res) const;
   std::error_code getSection(int32_t index, const coff_section *&Res) const;
-  std::error_code getSymbol(uint32_t index, const coff_symbol *&Res) const;
+  template <typename coff_symbol_type> // Typed lookup of symbol-table entry Index.
+  std::error_code getSymbol(uint32_t Index,
+                            const coff_symbol_type *&Res) const {
+    if (Index >= getNumberOfSymbols()) // Out-of-range index is a parse failure; Res is untouched.
+      return object_error::parse_failed;
+
+    Res = reinterpret_cast<coff_symbol_type *>(getSymbolTable()) + Index; // Entry Index of the table viewed as coff_symbol_type.
+    return object_error::success;
+  }
+  ErrorOr<COFFSymbolRef> getSymbol(uint32_t index) const { // Width-agnostic lookup wrapping the typed overload.
+    if (SymbolTable16) { // Table of coff_symbol16 entries.
+      const coff_symbol16 *Symb = nullptr;
+      if (std::error_code EC = getSymbol(index, Symb))
+        return EC;
+      return COFFSymbolRef(Symb);
+    }
+    if (SymbolTable32) { // Table of coff_symbol32 entries.
+      const coff_symbol32 *Symb = nullptr;
+      if (std::error_code EC = getSymbol(index, Symb))
+        return EC;
+      return COFFSymbolRef(Symb);
+    }
+    return object_error::parse_failed; // Neither symbol table pointer is set.
+  }
   template <typename T>
   std::error_code getAuxSymbol(uint32_t index, const T *&Res) const {
-    const coff_symbol *s;
-    std::error_code ec = getSymbol(index, s);
-    Res = reinterpret_cast<const T *>(s);
-    return ec;
+    ErrorOr<COFFSymbolRef> s = getSymbol(index);
+    if (std::error_code EC = s.getError())
+      return EC;
+    Res = reinterpret_cast<const T *>(s->getRawPtr());
+    return object_error::success;
   }
-  std::error_code getSymbolName(const coff_symbol *symbol,
-                                StringRef &Res) const;
-  ArrayRef<uint8_t> getSymbolAuxData(const coff_symbol *symbol) const;
+  std::error_code getSymbolName(COFFSymbolRef Symbol, StringRef &Res) const;
+
+  ArrayRef<uint8_t> getSymbolAuxData(COFFSymbolRef Symbol) const;
+
+  size_t getSymbolTableEntrySize() const { // Entry size is chosen by which header is present.
+    if (COFFHeader)
+      return sizeof(coff_symbol16);
+    if (COFFBigObjHeader)
+      return sizeof(coff_symbol32);
+    llvm_unreachable("null symbol table pointer!"); // Reached only when neither header pointer is set.
+  }
 
   std::error_code getSectionName(const coff_section *Sec, StringRef &Res) const;
+  uint64_t getSectionSize(const coff_section *Sec) const;
   std::error_code getSectionContents(const coff_section *Sec,
                                      ArrayRef<uint8_t> &Res) const;
 
@@ -470,6 +710,9 @@
   std::error_code getHintName(uint32_t Rva, uint16_t &Hint,
                               StringRef &Name) const;
 
+  bool isRelocatableObject() const override;
+  bool is64() const { return PE32PlusHeader; } // True when PE32PlusHeader is set.
+
   static inline bool classof(const Binary *v) { return v->isCOFF(); }
 };
 
@@ -483,7 +726,14 @@
 
   bool operator==(const ImportDirectoryEntryRef &Other) const;
   void moveNext();
+
+  imported_symbol_iterator imported_symbol_begin() const;
+  imported_symbol_iterator imported_symbol_end() const;
+  iterator_range<imported_symbol_iterator> imported_symbols() const;
+
   std::error_code getName(StringRef &Result) const;
+  std::error_code getImportLookupTableRVA(uint32_t &Result) const;
+  std::error_code getImportAddressTableRVA(uint32_t &Result) const;
 
   std::error_code
   getImportTableEntry(const import_directory_table_entry *&Result) const;
@@ -497,6 +747,31 @@
   const COFFObjectFile *OwningObject;
 };
 
+class DelayImportDirectoryEntryRef { // Iterator payload: a (Table, Index, OwningObject) triple.
+public:
+  DelayImportDirectoryEntryRef() : OwningObject(nullptr) {} // NOTE(review): Table and Index are left uninitialized here.
+  DelayImportDirectoryEntryRef(const delay_import_directory_table_entry *T,
+                               uint32_t I, const COFFObjectFile *Owner)
+      : Table(T), Index(I), OwningObject(Owner) {}
+
+  bool operator==(const DelayImportDirectoryEntryRef &Other) const;
+  void moveNext();
+
+  imported_symbol_iterator imported_symbol_begin() const;
+  imported_symbol_iterator imported_symbol_end() const;
+  iterator_range<imported_symbol_iterator> imported_symbols() const;
+
+  std::error_code getName(StringRef &Result) const;
+  std::error_code getDelayImportTable(
+      const delay_import_directory_table_entry *&Result) const;
+  std::error_code getImportAddress(int AddrIndex, uint64_t &Result) const;
+
+private:
+  const delay_import_directory_table_entry *Table; // presumably the first entry of the table; confirm in COFFObjectFile.cpp
+  uint32_t Index; // presumably the position of this entry within Table; confirm
+  const COFFObjectFile *OwningObject; // Not owned; nullptr when default-constructed.
+};
+
 // The iterator for the export directory table entry.
 class ExportDirectoryEntryRef {
 public:
@@ -519,6 +794,49 @@
   uint32_t Index;
   const COFFObjectFile *OwningObject;
 };
+
+class ImportedSymbolRef { // Iterator payload over 32-bit or 64-bit import lookup entries.
+public:
+  ImportedSymbolRef() : OwningObject(nullptr) {} // NOTE(review): Entry32, Entry64 and Index are left uninitialized here.
+  ImportedSymbolRef(const import_lookup_table_entry32 *Entry, uint32_t I,
+                    const COFFObjectFile *Owner)
+      : Entry32(Entry), Entry64(nullptr), Index(I), OwningObject(Owner) {} // 32-bit form: Entry64 stays null.
+  ImportedSymbolRef(const import_lookup_table_entry64 *Entry, uint32_t I,
+                    const COFFObjectFile *Owner)
+      : Entry32(nullptr), Entry64(Entry), Index(I), OwningObject(Owner) {} // 64-bit form: Entry32 stays null.
+
+  bool operator==(const ImportedSymbolRef &Other) const;
+  void moveNext();
+
+  std::error_code getSymbolName(StringRef &Result) const;
+  std::error_code getOrdinal(uint16_t &Result) const;
+
+private:
+  const import_lookup_table_entry32 *Entry32; // Set only by the 32-bit constructor.
+  const import_lookup_table_entry64 *Entry64; // Set only by the 64-bit constructor.
+  uint32_t Index;
+  const COFFObjectFile *OwningObject; // Not owned; nullptr when default-constructed.
+};
+
+class BaseRelocRef { // Iterator payload: a block header plus an index, starting at 0.
+public:
+  BaseRelocRef() : OwningObject(nullptr) {} // NOTE(review): Header and Index are left uninitialized here.
+  BaseRelocRef(const coff_base_reloc_block_header *Header,
+               const COFFObjectFile *Owner)
+      : Header(Header), Index(0), OwningObject(Owner) {} // Index always starts at 0.
+
+  bool operator==(const BaseRelocRef &Other) const;
+  void moveNext();
+
+  std::error_code getType(uint8_t &Type) const;
+  std::error_code getRVA(uint32_t &Result) const;
+
+private:
+  const coff_base_reloc_block_header *Header;
+  uint32_t Index; // presumably the entry position within the current block; confirm in COFFObjectFile.cpp
+  const COFFObjectFile *OwningObject; // Not owned; nullptr when default-constructed.
+};
+
 } // end namespace object
 } // end namespace llvm
 

diff --git a/include/llvm/Object/COFFYAML.h b/include/llvm/Object/COFFYAML.h
index 4aba08f..12a2522 100644
--- a/include/llvm/Object/COFFYAML.h
+++ b/include/llvm/Object/COFFYAML.h

@@ -31,6 +31,12 @@
   uint32_t Ret = static_cast<uint32_t>(a) | static_cast<uint32_t>(b);
   return static_cast<SectionCharacteristics>(Ret);
 }
+
+inline DLLCharacteristics operator|(DLLCharacteristics a, // Bitwise OR of two flag values.
+                                    DLLCharacteristics b) {
+  uint16_t Ret = static_cast<uint16_t>(a) | static_cast<uint16_t>(b); // OR is done on uint16_t values.
+  return static_cast<DLLCharacteristics>(Ret);
+}
 }
 
 // The structure of the yaml files is not an exact 1:1 match to COFF. In order
@@ -69,7 +75,13 @@
     Symbol();
   };
 
+  struct PEHeader { // PE32 header plus optional data directory entries.
+    COFF::PE32Header Header;
+    Optional<COFF::DataDirectory> DataDirectories[COFF::NUM_DATA_DIRECTORIES]; // One optional slot per directory index.
+  };
+
   struct Object {
+    Optional<PEHeader> OptionalHeader;
     COFF::header Header;
     std::vector<Section> Sections;
     std::vector<Symbol> Symbols;
@@ -131,6 +143,11 @@
 };
 
 template <>
+struct ScalarEnumerationTraits<COFF::WindowsSubsystem> {
+  static void enumeration(IO &IO, COFF::WindowsSubsystem &Value);
+};
+
+template <>
 struct ScalarBitSetTraits<COFF::Characteristics> {
   static void bitset(IO &IO, COFF::Characteristics &Value);
 };
@@ -141,11 +158,26 @@
 };
 
 template <>
+struct ScalarBitSetTraits<COFF::DLLCharacteristics> {
+  static void bitset(IO &IO, COFF::DLLCharacteristics &Value);
+};
+
+template <>
 struct MappingTraits<COFFYAML::Relocation> {
   static void mapping(IO &IO, COFFYAML::Relocation &Rel);
 };
 
 template <>
+struct MappingTraits<COFFYAML::PEHeader> {
+  static void mapping(IO &IO, COFFYAML::PEHeader &PH);
+};
+
+template <>
+struct MappingTraits<COFF::DataDirectory> {
+  static void mapping(IO &IO, COFF::DataDirectory &DD);
+};
+
+template <>
 struct MappingTraits<COFF::header> {
   static void mapping(IO &IO, COFF::header &H);
 };

diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index fbc48e6..7c10bbf 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h

@@ -540,7 +540,7 @@
   if (Sec->sh_offset + Sec->sh_size > Buf.size())
     return object_error::parse_failed;
   const uint8_t *Start = base() + Sec->sh_offset;
-  return ArrayRef<uint8_t>(Start, Sec->sh_size);
+  return makeArrayRef(Start, Sec->sh_size);
 }
 
 template <class ELFT>

diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
index cfb6b08..3fcd98d 100644
--- a/include/llvm/Object/ELFObjectFile.h
+++ b/include/llvm/Object/ELFObjectFile.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_OBJECT_ELF_OBJECT_FILE_H
-#define LLVM_OBJECT_ELF_OBJECT_FILE_H
+#ifndef LLVM_OBJECT_ELFOBJECTFILE_H
+#define LLVM_OBJECT_ELFOBJECTFILE_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PointerIntPair.h"
@@ -35,8 +35,23 @@
 namespace llvm {
 namespace object {
 
-template <class ELFT>
-class ELFObjectFile : public ObjectFile {
+class ELFObjectFileBase : public ObjectFile { // Non-template abstract base of ELFObjectFile<ELFT>.
+protected:
+  ELFObjectFileBase(unsigned int Type, MemoryBufferRef Source);
+
+public:
+  virtual std::error_code getRelocationAddend(DataRefImpl Rel,
+                                              int64_t &Res) const = 0;
+  virtual std::pair<symbol_iterator, symbol_iterator>
+  getELFDynamicSymbolIterators() const = 0;
+
+  virtual std::error_code getSymbolVersion(SymbolRef Symb, StringRef &Version,
+                                           bool &IsDefault) const = 0;
+
+  static inline bool classof(const Binary *v) { return v->isELF(); } // Matches any Binary for which isELF() is true.
+};
+
+template <class ELFT> class ELFObjectFile : public ELFObjectFileBase {
 public:
   LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
 
@@ -65,37 +80,28 @@
                                      uint32_t &Res) const override;
   std::error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const override;
   uint32_t getSymbolFlags(DataRefImpl Symb) const override;
+  std::error_code getSymbolOther(DataRefImpl Symb, uint8_t &Res) const override;
   std::error_code getSymbolType(DataRefImpl Symb,
                                 SymbolRef::Type &Res) const override;
   std::error_code getSymbolSection(DataRefImpl Symb,
                                    section_iterator &Res) const override;
 
-  std::error_code getLibraryNext(DataRefImpl Data,
-                                 LibraryRef &Result) const override;
-  std::error_code getLibraryPath(DataRefImpl Data,
-                                 StringRef &Res) const override;
-
   void moveSectionNext(DataRefImpl &Sec) const override;
   std::error_code getSectionName(DataRefImpl Sec,
                                  StringRef &Res) const override;
-  std::error_code getSectionAddress(DataRefImpl Sec,
-                                    uint64_t &Res) const override;
-  std::error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override;
+  uint64_t getSectionAddress(DataRefImpl Sec) const override;
+  uint64_t getSectionSize(DataRefImpl Sec) const override;
   std::error_code getSectionContents(DataRefImpl Sec,
                                      StringRef &Res) const override;
-  std::error_code getSectionAlignment(DataRefImpl Sec,
-                                      uint64_t &Res) const override;
-  std::error_code isSectionText(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionData(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionRequiredForExecution(DataRefImpl Sec,
-                                                bool &Res) const override;
-  std::error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionReadOnlyData(DataRefImpl Sec,
-                                        bool &Res) const override;
-  std::error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
-                                        bool &Result) const override;
+  uint64_t getSectionAlignment(DataRefImpl Sec) const override;
+  bool isSectionText(DataRefImpl Sec) const override;
+  bool isSectionData(DataRefImpl Sec) const override;
+  bool isSectionBSS(DataRefImpl Sec) const override;
+  bool isSectionRequiredForExecution(DataRefImpl Sec) const override;
+  bool isSectionVirtual(DataRefImpl Sec) const override;
+  bool isSectionZeroInit(DataRefImpl Sec) const override;
+  bool isSectionReadOnlyData(DataRefImpl Sec) const override;
+  bool sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb) const override;
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
   section_iterator getRelocatedSection(DataRefImpl Sec) const override;
@@ -177,7 +183,7 @@
   bool isDyldELFObject;
 
 public:
-  ELFObjectFile(std::unique_ptr<MemoryBuffer> Object, std::error_code &EC);
+  ELFObjectFile(MemoryBufferRef Object, std::error_code &EC);
 
   const Elf_Sym *getSymbol(DataRefImpl Symb) const;
 
@@ -190,17 +196,20 @@
   section_iterator section_begin() const override;
   section_iterator section_end() const override;
 
-  library_iterator needed_library_begin() const override;
-  library_iterator needed_library_end() const override;
-
-  std::error_code getRelocationAddend(DataRefImpl Rel, int64_t &Res) const;
+  std::error_code getRelocationAddend(DataRefImpl Rel,
+                                      int64_t &Res) const override;
   std::error_code getSymbolVersion(SymbolRef Symb, StringRef &Version,
-                                   bool &IsDefault) const;
+                                   bool &IsDefault) const override;
 
   uint8_t getBytesInAddress() const override;
   StringRef getFileFormatName() const override;
   unsigned getArch() const override;
-  StringRef getLoadName() const override;
+  StringRef getLoadName() const;
+
+  std::error_code getPlatformFlags(unsigned &Result) const override { // Always returns success.
+    Result = EF.getHeader()->e_flags; // e_flags taken from the ELF file header.
+    return object_error::success;
+  }
 
   const ELFFile<ELFT> *getELFFile() const { return &EF; }
 
@@ -209,6 +218,11 @@
     return v->getType() == getELFType(ELFT::TargetEndianness == support::little,
                                       ELFT::Is64Bits);
   }
+
+  std::pair<symbol_iterator, symbol_iterator>
+  getELFDynamicSymbolIterators() const override;
+
+  bool isRelocatableObject() const override;
 };
 
 // Use an alignment of 2 for the typedefs since that is the worst case for
@@ -295,6 +309,13 @@
 }
 
 template <class ELFT>
+std::error_code ELFObjectFile<ELFT>::getSymbolOther(DataRefImpl Symb, // Always returns success.
+                                                    uint8_t &Result) const {
+  Result = toELFSymIter(Symb)->st_other; // The symbol's st_other value, copied as-is.
+  return object_error::success;
+}
+
+template <class ELFT>
 std::error_code
 ELFObjectFile<ELFT>::getSymbolType(DataRefImpl Symb,
                                    SymbolRef::Type &Result) const {
@@ -387,17 +408,13 @@
 }
 
 template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::getSectionAddress(DataRefImpl Sec,
-                                                       uint64_t &Result) const {
-  Result = toELFShdrIter(Sec)->sh_addr;
-  return object_error::success;
+uint64_t ELFObjectFile<ELFT>::getSectionAddress(DataRefImpl Sec) const {
+  return toELFShdrIter(Sec)->sh_addr;
 }
 
 template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::getSectionSize(DataRefImpl Sec,
-                                                    uint64_t &Result) const {
-  Result = toELFShdrIter(Sec)->sh_size;
-  return object_error::success;
+uint64_t ELFObjectFile<ELFT>::getSectionSize(DataRefImpl Sec) const {
+  return toELFShdrIter(Sec)->sh_size;
 }
 
 template <class ELFT>
@@ -410,79 +427,59 @@
 }
 
 template <class ELFT>
-std::error_code
-ELFObjectFile<ELFT>::getSectionAlignment(DataRefImpl Sec,
-                                         uint64_t &Result) const {
-  Result = toELFShdrIter(Sec)->sh_addralign;
-  return object_error::success;
+uint64_t ELFObjectFile<ELFT>::getSectionAlignment(DataRefImpl Sec) const {
+  return toELFShdrIter(Sec)->sh_addralign;
 }
 
 template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::isSectionText(DataRefImpl Sec,
-                                                   bool &Result) const {
-  Result = toELFShdrIter(Sec)->sh_flags & ELF::SHF_EXECINSTR;
-  return object_error::success;
+bool ELFObjectFile<ELFT>::isSectionText(DataRefImpl Sec) const {
+  return toELFShdrIter(Sec)->sh_flags & ELF::SHF_EXECINSTR;
 }
 
 template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::isSectionData(DataRefImpl Sec,
-                                                   bool &Result) const {
+bool ELFObjectFile<ELFT>::isSectionData(DataRefImpl Sec) const {
   Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
-  Result = EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) &&
-           EShdr->sh_type == ELF::SHT_PROGBITS;
-  return object_error::success;
+  return EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) &&
+         EShdr->sh_type == ELF::SHT_PROGBITS;
 }
 
 template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec,
-                                                  bool &Result) const {
+bool ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec) const {
   Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
-  Result = EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) &&
-           EShdr->sh_type == ELF::SHT_NOBITS;
-  return object_error::success;
+  return EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) &&
+         EShdr->sh_type == ELF::SHT_NOBITS;
 }
 
 template <class ELFT>
-std::error_code
-ELFObjectFile<ELFT>::isSectionRequiredForExecution(DataRefImpl Sec,
-                                                   bool &Result) const {
-  Result = toELFShdrIter(Sec)->sh_flags & ELF::SHF_ALLOC;
-  return object_error::success;
+bool ELFObjectFile<ELFT>::isSectionRequiredForExecution(DataRefImpl Sec) const {
+  return toELFShdrIter(Sec)->sh_flags & ELF::SHF_ALLOC;
 }
 
 template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec,
-                                                      bool &Result) const {
-  Result = toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
-  return object_error::success;
+bool ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec) const {
+  return toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
 }
 
 template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::isSectionZeroInit(DataRefImpl Sec,
-                                                       bool &Result) const {
-  Result = toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
-  return object_error::success;
+bool ELFObjectFile<ELFT>::isSectionZeroInit(DataRefImpl Sec) const {
+  return toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
 }
 
 template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::isSectionReadOnlyData(DataRefImpl Sec,
-                                                           bool &Result) const {
+bool ELFObjectFile<ELFT>::isSectionReadOnlyData(DataRefImpl Sec) const {
   Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
-  Result = !(EShdr->sh_flags & (ELF::SHF_WRITE | ELF::SHF_EXECINSTR));
-  return object_error::success;
+  return !(EShdr->sh_flags & (ELF::SHF_WRITE | ELF::SHF_EXECINSTR));
 }
 
 template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::sectionContainsSymbol(DataRefImpl Sec,
-                                                           DataRefImpl Symb,
-                                                           bool &Result) const {
+bool ELFObjectFile<ELFT>::sectionContainsSymbol(DataRefImpl Sec,
+                                                DataRefImpl Symb) const {
   Elf_Sym_Iter ESym = toELFSymIter(Symb);
 
   uintX_t Index = ESym->st_shndx;
   bool Reserved = Index >= ELF::SHN_LORESERVE && Index <= ELF::SHN_HIRESERVE;
 
-  Result = !Reserved && (&*toELFShdrIter(Sec) == EF.getSection(ESym->st_shndx));
-  return object_error::success;
+  return !Reserved && (&*toELFShdrIter(Sec) == EF.getSection(ESym->st_shndx));
 }
 
 template <class ELFT>
@@ -741,6 +738,7 @@
     Result.append(fmtbuf.begin(), fmtbuf.end());
     break;
   }
+  case ELF::EM_386:
   case ELF::EM_ARM:
   case ELF::EM_HEXAGON:
   case ELF::EM_MIPS:
@@ -773,13 +771,13 @@
 }
 
 template <class ELFT>
-ELFObjectFile<ELFT>::ELFObjectFile(std::unique_ptr<MemoryBuffer> Object,
-                                   std::error_code &EC)
-    : ObjectFile(getELFType(static_cast<endianness>(ELFT::TargetEndianness) ==
-                                support::little,
-                            ELFT::Is64Bits),
-                 std::move(Object)),
-      EF(Data->getBuffer(), EC) {}
+ELFObjectFile<ELFT>::ELFObjectFile(MemoryBufferRef Object, std::error_code &EC)
+    : ELFObjectFileBase(
+          getELFType(static_cast<endianness>(ELFT::TargetEndianness) ==
+                         support::little,
+                     ELFT::Is64Bits),
+          Object),
+      EF(Data.getBuffer(), EC) {}
 
 template <class ELFT>
 basic_symbol_iterator ELFObjectFile<ELFT>::symbol_begin_impl() const {
@@ -825,50 +823,13 @@
 }
 
 template <class ELFT>
-library_iterator ELFObjectFile<ELFT>::needed_library_begin() const {
-  Elf_Dyn_Iter DI = EF.begin_dynamic_table();
-  Elf_Dyn_Iter DE = EF.end_dynamic_table();
-
-  while (DI != DE && DI->getTag() != ELF::DT_SONAME)
-    ++DI;
-
-  return library_iterator(LibraryRef(toDRI(DI), this));
-}
-
-template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::getLibraryNext(DataRefImpl Data,
-                                                    LibraryRef &Result) const {
-  Elf_Dyn_Iter DI = toELFDynIter(Data);
-  Elf_Dyn_Iter DE = EF.end_dynamic_table();
-
-  // Skip to the next DT_NEEDED entry.
-  do
-    ++DI;
-  while (DI != DE && DI->getTag() != ELF::DT_NEEDED);
-
-  Result = LibraryRef(toDRI(DI), this);
-  return object_error::success;
-}
-
-template <class ELFT>
-std::error_code ELFObjectFile<ELFT>::getLibraryPath(DataRefImpl Data,
-                                                    StringRef &Res) const {
-  Res = EF.getDynamicString(toELFDynIter(Data)->getVal());
-  return object_error::success;
-}
-
-template <class ELFT>
-library_iterator ELFObjectFile<ELFT>::needed_library_end() const {
-  return library_iterator(LibraryRef(toDRI(EF.end_dynamic_table()), this));
-}
-
-template <class ELFT>
 uint8_t ELFObjectFile<ELFT>::getBytesInAddress() const {
   return ELFT::Is64Bits ? 8 : 4;
 }
 
 template <class ELFT>
 StringRef ELFObjectFile<ELFT>::getFileFormatName() const {
+  bool IsLittleEndian = ELFT::TargetEndianness == support::little;
   switch (EF.getHeader()->e_ident[ELF::EI_CLASS]) {
   case ELF::ELFCLASS32:
     switch (EF.getHeader()->e_machine) {
@@ -877,7 +838,7 @@
     case ELF::EM_X86_64:
       return "ELF32-x86-64";
     case ELF::EM_ARM:
-      return "ELF32-arm";
+      return (IsLittleEndian ? "ELF32-arm-little" : "ELF32-arm-big");
     case ELF::EM_HEXAGON:
       return "ELF32-hexagon";
     case ELF::EM_MIPS:
@@ -897,7 +858,7 @@
     case ELF::EM_X86_64:
       return "ELF64-x86-64";
     case ELF::EM_AARCH64:
-      return "ELF64-aarch64";
+      return (IsLittleEndian ? "ELF64-aarch64-little" : "ELF64-aarch64-big");
     case ELF::EM_PPC64:
       return "ELF64-ppc64";
     case ELF::EM_S390:
@@ -938,6 +899,8 @@
     default:
       report_fatal_error("Invalid ELFCLASS!");
     }
+  case ELF::EM_PPC:
+    return Triple::ppc;
   case ELF::EM_PPC64:
     return IsLittleEndian ? Triple::ppc64le : Triple::ppc64;
   case ELF::EM_S390:
@@ -954,73 +917,34 @@
   }
 }
 
-/// FIXME: Maybe we should have a base ElfObjectFile that is not a template
-/// and make these member functions?
+template <class ELFT> // Override of the ELFObjectFileBase pure virtual.
+std::pair<symbol_iterator, symbol_iterator>
+ELFObjectFile<ELFT>::getELFDynamicSymbolIterators() const {
+  return std::make_pair(dynamic_symbol_begin(), dynamic_symbol_end()); // (begin, end) pair of the dynamic symbols.
+}
+
+template <class ELFT> bool ELFObjectFile<ELFT>::isRelocatableObject() const { // True only for ET_REL files.
+  return EF.getHeader()->e_type == ELF::ET_REL;
+}
+
 inline std::error_code getELFRelocationAddend(const RelocationRef R,
                                               int64_t &Addend) {
   const ObjectFile *Obj = R.getObjectFile();
   DataRefImpl DRI = R.getRawDataRefImpl();
-  // Little-endian 32-bit
-  if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
-    return ELFObj->getRelocationAddend(DRI, Addend);
-
-  // Big-endian 32-bit
-  if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
-    return ELFObj->getRelocationAddend(DRI, Addend);
-
-  // Little-endian 64-bit
-  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
-    return ELFObj->getRelocationAddend(DRI, Addend);
-
-  // Big-endian 64-bit
-  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
-    return ELFObj->getRelocationAddend(DRI, Addend);
-
-  llvm_unreachable("Object passed to getELFRelocationAddend() is not ELF");
+  return cast<ELFObjectFileBase>(Obj)->getRelocationAddend(DRI, Addend);
 }
 
 inline std::pair<symbol_iterator, symbol_iterator>
-getELFDynamicSymbolIterators(SymbolicFile *Obj) {
-  if (const ELF32LEObjectFile *ELF = dyn_cast<ELF32LEObjectFile>(Obj))
-    return std::make_pair(ELF->dynamic_symbol_begin(),
-                          ELF->dynamic_symbol_end());
-  if (const ELF64LEObjectFile *ELF = dyn_cast<ELF64LEObjectFile>(Obj))
-    return std::make_pair(ELF->dynamic_symbol_begin(),
-                          ELF->dynamic_symbol_end());
-  if (const ELF32BEObjectFile *ELF = dyn_cast<ELF32BEObjectFile>(Obj))
-    return std::make_pair(ELF->dynamic_symbol_begin(),
-                          ELF->dynamic_symbol_end());
-  if (const ELF64BEObjectFile *ELF = cast<ELF64BEObjectFile>(Obj))
-    return std::make_pair(ELF->dynamic_symbol_begin(),
-                          ELF->dynamic_symbol_end());
-
-  llvm_unreachable(
-      "Object passed to getELFDynamicSymbolIterators() is not ELF");
+getELFDynamicSymbolIterators(const SymbolicFile *Obj) {
+  return cast<ELFObjectFileBase>(Obj)->getELFDynamicSymbolIterators();
 }
 
-/// This is a generic interface for retrieving GNU symbol version
-/// information from an ELFObjectFile.
 inline std::error_code GetELFSymbolVersion(const ObjectFile *Obj,
                                            const SymbolRef &Sym,
                                            StringRef &Version,
                                            bool &IsDefault) {
-  // Little-endian 32-bit
-  if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
-    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
-
-  // Big-endian 32-bit
-  if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
-    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
-
-  // Little-endian 64-bit
-  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
-    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
-
-  // Big-endian 64-bit
-  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
-    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
-
-  llvm_unreachable("Object passed to GetELFSymbolVersion() is not ELF");
+  return cast<ELFObjectFileBase>(Obj)
+      ->getSymbolVersion(Sym, Version, IsDefault);
 }
 }
 }

diff --git a/include/llvm/Object/ELFTypes.h b/include/llvm/Object/ELFTypes.h
index 84b6031..4bc0c7c 100644
--- a/include/llvm/Object/ELFTypes.h
+++ b/include/llvm/Object/ELFTypes.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_OBJECT_ELF_TYPES_H
-#define LLVM_OBJECT_ELF_TYPES_H
+#ifndef LLVM_OBJECT_ELFTYPES_H
+#define LLVM_OBJECT_ELFTYPES_H
 
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/DataTypes.h"
@@ -176,6 +176,7 @@
 template <class ELFT>
 struct Elf_Sym_Impl : Elf_Sym_Base<ELFT> {
   using Elf_Sym_Base<ELFT>::st_info;
+  using Elf_Sym_Base<ELFT>::st_other;
 
   // These accessors and mutators correspond to the ELF32_ST_BIND,
   // ELF32_ST_TYPE, and ELF32_ST_INFO macros defined in the ELF specification:
@@ -186,6 +187,9 @@
   void setBindingAndType(unsigned char b, unsigned char t) {
     st_info = (b << 4) + (t & 0x0f);
   }
+
+  /// Access to the STV_xxx flag stored in the first two bits of st_other.
+  unsigned char getVisibility() const { return st_other & 0x3; }
 };
 
 /// Elf_Versym: This is the structure of entries in the SHT_GNU_versym section

diff --git a/include/llvm/Object/ELFYAML.h b/include/llvm/Object/ELFYAML.h
index fc8cc95..687611d 100644
--- a/include/llvm/Object/ELFYAML.h
+++ b/include/llvm/Object/ELFYAML.h

@@ -45,6 +45,7 @@
 LLVM_YAML_STRONG_TYPEDEF(uint64_t, ELF_SHF)
 LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STT)
 LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STV)
+LLVM_YAML_STRONG_TYPEDEF(uint8_t, ELF_STO)
 
 // For now, hardcode 64 bits everywhere that 32 or 64 would be needed
 // since 64-bit can hold 32-bit values too.
@@ -63,7 +64,7 @@
   StringRef Section;
   llvm::yaml::Hex64 Value;
   llvm::yaml::Hex64 Size;
-  ELF_STV Visibility;
+  uint8_t Other;
 };
 struct LocalGlobalWeakSymbols {
   std::vector<Symbol> Local;
@@ -175,6 +176,11 @@
 };
 
 template <>
+struct ScalarBitSetTraits<ELFYAML::ELF_STO> {
+  static void bitset(IO &IO, ELFYAML::ELF_STO &Value);
+};
+
+template <>
 struct ScalarEnumerationTraits<ELFYAML::ELF_REL> {
   static void enumeration(IO &IO, ELFYAML::ELF_REL &Value);
 };

diff --git a/include/llvm/Object/Error.h b/include/llvm/Object/Error.h
index 701da12..90c2bd7 100644
--- a/include/llvm/Object/Error.h
+++ b/include/llvm/Object/Error.h

@@ -26,7 +26,8 @@
   arch_not_found,
   invalid_file_type,
   parse_failed,
-  unexpected_eof
+  unexpected_eof,
+  bitcode_section_not_found,
 };
 
 inline std::error_code make_error_code(object_error e) {

diff --git a/include/llvm/Object/IRObjectFile.h b/include/llvm/Object/IRObjectFile.h
index b33cc26..b650d5d 100644
--- a/include/llvm/Object/IRObjectFile.h
+++ b/include/llvm/Object/IRObjectFile.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_OBJECT_IR_OBJECT_FILE_H
-#define LLVM_OBJECT_IR_OBJECT_FILE_H
+#ifndef LLVM_OBJECT_IROBJECTFILE_H
+#define LLVM_OBJECT_IROBJECTFILE_H
 
 #include "llvm/Object/SymbolicFile.h"
 
@@ -22,13 +22,15 @@
 class GlobalValue;
 
 namespace object {
+class ObjectFile;
+
 class IRObjectFile : public SymbolicFile {
   std::unique_ptr<Module> M;
   std::unique_ptr<Mangler> Mang;
   std::vector<std::pair<std::string, uint32_t>> AsmSymbols;
 
 public:
-  IRObjectFile(std::unique_ptr<MemoryBuffer> Object, std::unique_ptr<Module> M);
+  IRObjectFile(MemoryBufferRef Object, std::unique_ptr<Module> M);
   ~IRObjectFile();
   void moveSymbolNext(DataRefImpl &Symb) const override;
   std::error_code printSymbolName(raw_ostream &OS,
@@ -49,9 +51,18 @@
     return v->isIR();
   }
 
-  static ErrorOr<IRObjectFile *>
-  createIRObjectFile(std::unique_ptr<MemoryBuffer> Object,
-                     LLVMContext &Context);
+  /// \brief Finds and returns bitcode embedded in the given object file, or an
+  /// error code if not found.
+  static ErrorOr<MemoryBufferRef> findBitcodeInObject(const ObjectFile &Obj);
+
+  /// \brief Finds and returns bitcode in the given memory buffer (which may
+  /// be either a bitcode file or a native object file with embedded bitcode),
+  /// or an error code if not found.
+  static ErrorOr<MemoryBufferRef>
+  findBitcodeInMemBuffer(MemoryBufferRef Object);
+
+  static ErrorOr<std::unique_ptr<IRObjectFile>>
+  createIRObjectFile(MemoryBufferRef Object, LLVMContext &Context);
 };
 }
 }

diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
index e93ebb8..768cda6 100644
--- a/include/llvm/Object/MachO.h
+++ b/include/llvm/Object/MachO.h

@@ -49,6 +49,141 @@
 };
 typedef content_iterator<DiceRef> dice_iterator;
 
+/// ExportEntry encapsulates the current-state-of-the-walk used when doing a
+/// non-recursive walk of the trie data structure.  This allows you to iterate
+/// across all exported symbols using:
+///      for (const llvm::object::ExportEntry &AnExport : Obj->exports()) {
+///      }
+class ExportEntry {
+public:
+  ExportEntry(ArrayRef<uint8_t> Trie);
+
+  StringRef name() const;
+  uint64_t flags() const;
+  uint64_t address() const;
+  uint64_t other() const;
+  StringRef otherName() const;
+  uint32_t nodeOffset() const;
+
+  bool operator==(const ExportEntry &) const;
+
+  void moveNext();
+
+private:
+  friend class MachOObjectFile;
+  void moveToFirst();
+  void moveToEnd();
+  uint64_t readULEB128(const uint8_t *&p);
+  void pushDownUntilBottom();
+  void pushNode(uint64_t Offset);
+
+  // Represents a node in the mach-o exports trie.
+  struct NodeState {
+    NodeState(const uint8_t *Ptr);
+    const uint8_t *Start;
+    const uint8_t *Current;
+    uint64_t Flags;
+    uint64_t Address;
+    uint64_t Other;
+    const char *ImportName;
+    unsigned ChildCount;
+    unsigned NextChildIndex;
+    unsigned ParentStringLength;
+    bool IsExportNode;
+  };
+
+  ArrayRef<uint8_t> Trie;
+  SmallString<256> CumulativeString;
+  SmallVector<NodeState, 16> Stack;
+  bool Malformed;
+  bool Done;
+};
+typedef content_iterator<ExportEntry> export_iterator;
+
+/// MachORebaseEntry encapsulates the current state in the decompression of   
+/// rebasing opcodes. This allows you to iterate through the compressed table of
+/// rebasing using:
+///    for (const llvm::object::MachORebaseEntry &Entry : Obj->rebaseTable()) {
+///    }
+class MachORebaseEntry {
+public:
+  MachORebaseEntry(ArrayRef<uint8_t> opcodes, bool is64Bit);
+
+  uint32_t segmentIndex() const;
+  uint64_t segmentOffset() const;
+  StringRef typeName() const;
+
+  bool operator==(const MachORebaseEntry &) const;
+
+  void moveNext();
+  
+private:
+  friend class MachOObjectFile;
+  void moveToFirst();
+  void moveToEnd();
+  uint64_t readULEB128();
+
+  ArrayRef<uint8_t> Opcodes;
+  const uint8_t *Ptr;
+  uint64_t SegmentOffset;
+  uint32_t SegmentIndex;
+  uint64_t RemainingLoopCount;
+  uint64_t AdvanceAmount;
+  uint8_t  RebaseType;
+  uint8_t  PointerSize;
+  bool     Malformed;
+  bool     Done;
+};
+typedef content_iterator<MachORebaseEntry> rebase_iterator;
+
+/// MachOBindEntry encapsulates the current state in the decompression of
+/// binding opcodes. This allows you to iterate through the compressed table of
+/// bindings using:
+///    for (const llvm::object::MachOBindEntry &Entry : Obj->bindTable()) {
+///    }
+class MachOBindEntry {
+public:
+  enum class Kind { Regular, Lazy, Weak };
+
+  MachOBindEntry(ArrayRef<uint8_t> Opcodes, bool is64Bit, MachOBindEntry::Kind);
+
+  uint32_t segmentIndex() const;
+  uint64_t segmentOffset() const;
+  StringRef typeName() const;
+  StringRef symbolName() const;
+  uint32_t flags() const;
+  int64_t addend() const;
+  int ordinal() const;
+
+  bool operator==(const MachOBindEntry &) const;
+
+  void moveNext();
+
+private:
+  friend class MachOObjectFile;
+  void moveToFirst();
+  void moveToEnd();
+  uint64_t readULEB128();
+  int64_t readSLEB128();
+
+  ArrayRef<uint8_t> Opcodes;
+  const uint8_t *Ptr;
+  uint64_t SegmentOffset;
+  uint32_t SegmentIndex;
+  StringRef SymbolName;
+  int      Ordinal;
+  uint32_t Flags;
+  int64_t  Addend;
+  uint64_t RemainingLoopCount;
+  uint64_t AdvanceAmount;
+  uint8_t  BindType;
+  uint8_t  PointerSize;
+  Kind     TableKind;
+  bool     Malformed;
+  bool     Done;
+};
+typedef content_iterator<MachOBindEntry> bind_iterator;
+
 class MachOObjectFile : public ObjectFile {
 public:
   struct LoadCommandInfo {
@@ -56,8 +191,8 @@
     MachO::load_command C; // The command itself.
   };
 
-  MachOObjectFile(std::unique_ptr<MemoryBuffer> Object, bool IsLittleEndian,
-                  bool Is64Bits, std::error_code &EC);
+  MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian, bool Is64Bits,
+                  std::error_code &EC);
 
   void moveSymbolNext(DataRefImpl &Symb) const override;
   std::error_code getSymbolName(DataRefImpl Symb,
@@ -80,24 +215,19 @@
   void moveSectionNext(DataRefImpl &Sec) const override;
   std::error_code getSectionName(DataRefImpl Sec,
                                  StringRef &Res) const override;
-  std::error_code getSectionAddress(DataRefImpl Sec,
-                                    uint64_t &Res) const override;
-  std::error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const override;
+  uint64_t getSectionAddress(DataRefImpl Sec) const override;
+  uint64_t getSectionSize(DataRefImpl Sec) const override;
   std::error_code getSectionContents(DataRefImpl Sec,
                                      StringRef &Res) const override;
-  std::error_code getSectionAlignment(DataRefImpl Sec,
-                                      uint64_t &Res) const override;
-  std::error_code isSectionText(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionData(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionRequiredForExecution(DataRefImpl Sec,
-                                                bool &Res) const override;
-  std::error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const override;
-  std::error_code isSectionReadOnlyData(DataRefImpl Sec,
-                                        bool &Res) const override;
-  std::error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
-                                        bool &Result) const override;
+  uint64_t getSectionAlignment(DataRefImpl Sec) const override;
+  bool isSectionText(DataRefImpl Sec) const override;
+  bool isSectionData(DataRefImpl Sec) const override;
+  bool isSectionBSS(DataRefImpl Sec) const override;
+  bool isSectionRequiredForExecution(DataRefImpl Sec) const override;
+  bool isSectionVirtual(DataRefImpl Sec) const override;
+  bool isSectionZeroInit(DataRefImpl Sec) const override;
+  bool isSectionReadOnlyData(DataRefImpl Sec) const override;
+  bool sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb) const override;
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
 
@@ -118,13 +248,8 @@
   std::error_code getRelocationHidden(DataRefImpl Rel,
                                       bool &Result) const override;
 
-  std::error_code getLibraryNext(DataRefImpl LibData,
-                                 LibraryRef &Res) const override;
-  std::error_code getLibraryPath(DataRefImpl LibData,
-                                 StringRef &Res) const override;
-
   // MachO specific.
-  std::error_code getLibraryShortNameByIndex(unsigned Index, StringRef &Res);
+  std::error_code getLibraryShortNameByIndex(unsigned Index, StringRef &) const;
 
   // TODO: Would be useful to have an iterator based version
   // of the load command interface too.
@@ -138,21 +263,45 @@
   section_iterator section_begin() const override;
   section_iterator section_end() const override;
 
-  library_iterator needed_library_begin() const override;
-  library_iterator needed_library_end() const override;
-
   uint8_t getBytesInAddress() const override;
 
   StringRef getFileFormatName() const override;
   unsigned getArch() const override;
-
-  StringRef getLoadName() const override;
+  Triple getArch(const char **McpuDefault, Triple *ThumbTriple) const;
 
   relocation_iterator section_rel_begin(unsigned Index) const;
   relocation_iterator section_rel_end(unsigned Index) const;
 
   dice_iterator begin_dices() const;
   dice_iterator end_dices() const;
+  
+  /// For use iterating over all exported symbols.
+  iterator_range<export_iterator> exports() const;
+  
+  /// For use examining a trie not in a MachOObjectFile.
+  static iterator_range<export_iterator> exports(ArrayRef<uint8_t> Trie);
+
+  /// For use iterating over all rebase table entries.
+  iterator_range<rebase_iterator> rebaseTable() const;
+
+  /// For use examining rebase opcodes not in a MachOObjectFile.
+  static iterator_range<rebase_iterator> rebaseTable(ArrayRef<uint8_t> Opcodes,
+                                                     bool is64);
+
+  /// For use iterating over all bind table entries.
+  iterator_range<bind_iterator> bindTable() const;
+
+  /// For use iterating over all lazy bind table entries.
+  iterator_range<bind_iterator> lazyBindTable() const;
+
+  /// For use iterating over all weak bind table entries.
+  iterator_range<bind_iterator> weakBindTable() const;
+
+  /// For use examining bind opcodes not in a MachOObjectFile.
+  static iterator_range<bind_iterator> bindTable(ArrayRef<uint8_t> Opcodes,
+                                                 bool is64,
+                                                 MachOBindEntry::Kind);
+
 
   // In a MachO file, sections have a segment name. This is used in the .o
   // files. They have a single segment, but this field specifies which segment
@@ -173,6 +322,8 @@
                                     const MachO::any_relocation_info &RE) const;
   uint32_t getScatteredRelocationValue(
                                     const MachO::any_relocation_info &RE) const;
+  uint32_t getScatteredRelocationType(
+                                    const MachO::any_relocation_info &RE) const;
   unsigned getAnyRelocationAddress(const MachO::any_relocation_info &RE) const;
   unsigned getAnyRelocationPCRel(const MachO::any_relocation_info &RE) const;
   unsigned getAnyRelocationLength(const MachO::any_relocation_info &RE) const;
@@ -203,6 +354,16 @@
   getVersionMinLoadCommand(const LoadCommandInfo &L) const;
   MachO::dylib_command
   getDylibIDLoadCommand(const LoadCommandInfo &L) const;
+  MachO::dyld_info_command
+  getDyldInfoLoadCommand(const LoadCommandInfo &L) const;
+  MachO::dylinker_command
+  getDylinkerCommand(const LoadCommandInfo &L) const;
+  MachO::uuid_command
+  getUuidCommand(const LoadCommandInfo &L) const;
+  MachO::source_version_command
+  getSourceVersionCommand(const LoadCommandInfo &L) const;
+  MachO::entry_point_command
+  getEntryPointCommand(const LoadCommandInfo &L) const;
 
   MachO::any_relocation_info getRelocation(DataRefImpl Rel) const;
   MachO::data_in_code_entry getDice(DataRefImpl Rel) const;
@@ -216,6 +377,12 @@
   MachO::symtab_command getSymtabLoadCommand() const;
   MachO::dysymtab_command getDysymtabLoadCommand() const;
   MachO::linkedit_data_command getDataInCodeLoadCommand() const;
+  ArrayRef<uint8_t> getDyldInfoRebaseOpcodes() const;
+  ArrayRef<uint8_t> getDyldInfoBindOpcodes() const;
+  ArrayRef<uint8_t> getDyldInfoWeakBindOpcodes() const;
+  ArrayRef<uint8_t> getDyldInfoLazyBindOpcodes() const;
+  ArrayRef<uint8_t> getDyldInfoExportsTrie() const;
+  ArrayRef<uint8_t> getUuid() const;
 
   StringRef getStringTableData() const;
   bool is64Bit() const;
@@ -225,26 +392,36 @@
                                          StringRef &Suffix);
 
   static Triple::ArchType getArch(uint32_t CPUType);
-  static Triple getArch(uint32_t CPUType, uint32_t CPUSubType);
-  static Triple getArch(StringRef ArchFlag);
+  static Triple getArch(uint32_t CPUType, uint32_t CPUSubType,
+                        const char **McpuDefault = nullptr);
+  static Triple getThumbArch(uint32_t CPUType, uint32_t CPUSubType,
+                             const char **McpuDefault = nullptr);
+  static Triple getArch(uint32_t CPUType, uint32_t CPUSubType,
+                        const char **McpuDefault, Triple *ThumbTriple);
+  static bool isValidArch(StringRef ArchFlag);
   static Triple getHostArch();
 
+  bool isRelocatableObject() const override;
+
+  bool hasPageZeroSegment() const { return HasPageZeroSegment; }
+
   static bool classof(const Binary *v) {
     return v->isMachO();
   }
 
-  const char *getSectionPointer(DataRefImpl Rel) const;
-
 private:
-  typedef SmallVector<const char *, 1> SectionList;
+  typedef SmallVector<const char*, 1> SectionList;
   SectionList Sections;
-  typedef SmallVector<const char *, 1> LibraryList;
+  typedef SmallVector<const char*, 1> LibraryList;
   LibraryList Libraries;
   typedef SmallVector<StringRef, 1> LibraryShortName;
-  LibraryShortName LibrariesShortNames;
+  mutable LibraryShortName LibrariesShortNames;
   const char *SymtabLoadCmd;
   const char *DysymtabLoadCmd;
   const char *DataInCodeLoadCmd;
+  const char *DyldInfoLoadCmd;
+  const char *UuidLoadCmd;
+  bool HasPageZeroSegment;
 };
 
 /// DiceRef

diff --git a/include/llvm/Object/MachOUniversal.h b/include/llvm/Object/MachOUniversal.h
index e6677f5..46cf3fb 100644
--- a/include/llvm/Object/MachOUniversal.h
+++ b/include/llvm/Object/MachOUniversal.h

@@ -25,8 +25,6 @@
 namespace llvm {
 namespace object {
 
-class ObjectFile;
-
 class MachOUniversalBinary : public Binary {
   virtual void anchor();
 
@@ -58,7 +56,7 @@
       return T.getArchName();
     }
 
-    ErrorOr<std::unique_ptr<ObjectFile>> getAsObjectFile() const;
+    ErrorOr<std::unique_ptr<MachOObjectFile>> getAsObjectFile() const;
 
     std::error_code getAsArchive(std::unique_ptr<Archive> &Result) const;
   };
@@ -84,10 +82,9 @@
     }
   };
 
-  MachOUniversalBinary(std::unique_ptr<MemoryBuffer> Source,
-                       std::error_code &ec);
-  static ErrorOr<MachOUniversalBinary *>
-  create(std::unique_ptr<MemoryBuffer> Source);
+  MachOUniversalBinary(MemoryBufferRef Souce, std::error_code &EC);
+  static ErrorOr<std::unique_ptr<MachOUniversalBinary>>
+  create(MemoryBufferRef Source);
 
   object_iterator begin_objects() const {
     return ObjectForArch(this, 0);
@@ -103,7 +100,7 @@
     return V->isMachOUniversalBinary();
   }
 
-  ErrorOr<std::unique_ptr<ObjectFile>>
+  ErrorOr<std::unique_ptr<MachOObjectFile>>
   getObjectForArch(Triple::ArchType Arch) const;
 };
 

diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 646abf8..68b873a 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h

@@ -27,6 +27,8 @@
 namespace object {
 
 class ObjectFile;
+class COFFObjectFile;
+class MachOObjectFile;
 
 class SymbolRef;
 class symbol_iterator;
@@ -93,23 +95,22 @@
   void moveNext();
 
   std::error_code getName(StringRef &Result) const;
-  std::error_code getAddress(uint64_t &Result) const;
-  std::error_code getSize(uint64_t &Result) const;
+  uint64_t getAddress() const;
+  uint64_t getSize() const;
   std::error_code getContents(StringRef &Result) const;
 
   /// @brief Get the alignment of this section as the actual value (not log 2).
-  std::error_code getAlignment(uint64_t &Result) const;
+  uint64_t getAlignment() const;
 
-  // FIXME: Move to the normalization layer when it's created.
-  std::error_code isText(bool &Result) const;
-  std::error_code isData(bool &Result) const;
-  std::error_code isBSS(bool &Result) const;
-  std::error_code isRequiredForExecution(bool &Result) const;
-  std::error_code isVirtual(bool &Result) const;
-  std::error_code isZeroInit(bool &Result) const;
-  std::error_code isReadOnlyData(bool &Result) const;
+  bool isText() const;
+  bool isData() const;
+  bool isBSS() const;
+  bool isRequiredForExecution() const;
+  bool isVirtual() const;
+  bool isZeroInit() const;
+  bool isReadOnlyData() const;
 
-  std::error_code containsSymbol(SymbolRef S, bool &Result) const;
+  bool containsSymbol(SymbolRef S) const;
 
   relocation_iterator relocation_begin() const;
   relocation_iterator relocation_end() const;
@@ -149,6 +150,7 @@
   std::error_code getAlignment(uint32_t &Result) const;
   std::error_code getSize(uint64_t &Result) const;
   std::error_code getType(SymbolRef::Type &Result) const;
+  std::error_code getOther(uint8_t &Result) const;
 
   /// @brief Get section this symbol is defined in reference to. Result is
   /// end_sections() if it is undefined or is an absolute symbol.
@@ -175,30 +177,6 @@
   }
 };
 
-/// LibraryRef - This is a value type class that represents a single library in
-/// the list of libraries needed by a shared or dynamic object.
-class LibraryRef {
-  friend class SectionRef;
-  DataRefImpl LibraryPimpl;
-  const ObjectFile *OwningObject;
-
-public:
-  LibraryRef() : OwningObject(nullptr) { }
-
-  LibraryRef(DataRefImpl LibraryP, const ObjectFile *Owner);
-
-  bool operator==(const LibraryRef &Other) const;
-  bool operator<(const LibraryRef &Other) const;
-
-  std::error_code getNext(LibraryRef &Result) const;
-
-  // Get the path to this library, as stored in the object file.
-  std::error_code getPath(StringRef &Result) const;
-
-  DataRefImpl getRawDataRefImpl() const;
-};
-typedef content_iterator<LibraryRef> library_iterator;
-
 /// ObjectFile - This class is the base class for all object file types.
 /// Concrete instances of this object are created by createObjectFile, which
 /// figures out which type to create.
@@ -208,10 +186,10 @@
   ObjectFile(const ObjectFile &other) LLVM_DELETED_FUNCTION;
 
 protected:
-  ObjectFile(unsigned int Type, std::unique_ptr<MemoryBuffer> Source);
+  ObjectFile(unsigned int Type, MemoryBufferRef Source);
 
   const uint8_t *base() const {
-    return reinterpret_cast<const uint8_t *>(Data->getBufferStart());
+    return reinterpret_cast<const uint8_t *>(Data.getBufferStart());
   }
 
   // These functions are for SymbolRef to call internally. The main goal of
@@ -237,35 +215,31 @@
                                         SymbolRef::Type &Res) const = 0;
   virtual std::error_code getSymbolSection(DataRefImpl Symb,
                                            section_iterator &Res) const = 0;
+  virtual std::error_code getSymbolOther(DataRefImpl Symb,
+                                         uint8_t &Res) const {
+    return object_error::invalid_file_type;
+  }
 
   // Same as above for SectionRef.
   friend class SectionRef;
   virtual void moveSectionNext(DataRefImpl &Sec) const = 0;
   virtual std::error_code getSectionName(DataRefImpl Sec,
                                          StringRef &Res) const = 0;
-  virtual std::error_code getSectionAddress(DataRefImpl Sec,
-                                            uint64_t &Res) const = 0;
-  virtual std::error_code getSectionSize(DataRefImpl Sec,
-                                         uint64_t &Res) const = 0;
+  virtual uint64_t getSectionAddress(DataRefImpl Sec) const = 0;
+  virtual uint64_t getSectionSize(DataRefImpl Sec) const = 0;
   virtual std::error_code getSectionContents(DataRefImpl Sec,
                                              StringRef &Res) const = 0;
-  virtual std::error_code getSectionAlignment(DataRefImpl Sec,
-                                              uint64_t &Res) const = 0;
-  virtual std::error_code isSectionText(DataRefImpl Sec, bool &Res) const = 0;
-  virtual std::error_code isSectionData(DataRefImpl Sec, bool &Res) const = 0;
-  virtual std::error_code isSectionBSS(DataRefImpl Sec, bool &Res) const = 0;
-  virtual std::error_code isSectionRequiredForExecution(DataRefImpl Sec,
-                                                        bool &Res) const = 0;
+  virtual uint64_t getSectionAlignment(DataRefImpl Sec) const = 0;
+  virtual bool isSectionText(DataRefImpl Sec) const = 0;
+  virtual bool isSectionData(DataRefImpl Sec) const = 0;
+  virtual bool isSectionBSS(DataRefImpl Sec) const = 0;
+  virtual bool isSectionRequiredForExecution(DataRefImpl Sec) const = 0;
   // A section is 'virtual' if its contents aren't present in the object image.
-  virtual std::error_code isSectionVirtual(DataRefImpl Sec,
-                                           bool &Res) const = 0;
-  virtual std::error_code isSectionZeroInit(DataRefImpl Sec,
-                                            bool &Res) const = 0;
-  virtual std::error_code isSectionReadOnlyData(DataRefImpl Sec,
-                                                bool &Res) const = 0;
-  virtual std::error_code sectionContainsSymbol(DataRefImpl Sec,
-                                                DataRefImpl Symb,
-                                                bool &Result) const = 0;
+  virtual bool isSectionVirtual(DataRefImpl Sec) const = 0;
+  virtual bool isSectionZeroInit(DataRefImpl Sec) const = 0;
+  virtual bool isSectionReadOnlyData(DataRefImpl Sec) const = 0;
+  virtual bool sectionContainsSymbol(DataRefImpl Sec,
+                                     DataRefImpl Symb) const = 0;
   virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const = 0;
   virtual relocation_iterator section_rel_end(DataRefImpl Sec) const = 0;
   virtual section_iterator getRelocatedSection(DataRefImpl Sec) const;
@@ -292,13 +266,6 @@
     return object_error::success;
   }
 
-  // Same for LibraryRef
-  friend class LibraryRef;
-  virtual std::error_code getLibraryNext(DataRefImpl Lib,
-                                         LibraryRef &Res) const = 0;
-  virtual std::error_code getLibraryPath(DataRefImpl Lib,
-                                         StringRef &Res) const = 0;
-
 public:
   typedef iterator_range<symbol_iterator> symbol_iterator_range;
   symbol_iterator_range symbols() const {
@@ -313,9 +280,6 @@
     return section_iterator_range(section_begin(), section_end());
   }
 
-  virtual library_iterator needed_library_begin() const = 0;
-  virtual library_iterator needed_library_end() const = 0;
-
   /// @brief The number of bytes used to represent an address in this object
   ///        file format.
   virtual uint8_t getBytesInAddress() const = 0;
@@ -323,21 +287,26 @@
   virtual StringRef getFileFormatName() const = 0;
   virtual /* Triple::ArchType */ unsigned getArch() const = 0;
 
-  /// For shared objects, returns the name which this object should be
-  /// loaded from at runtime. This corresponds to DT_SONAME on ELF and
-  /// LC_ID_DYLIB (install name) on MachO.
-  virtual StringRef getLoadName() const = 0;
+  /// Returns platform-specific object flags, if any.
+  virtual std::error_code getPlatformFlags(unsigned &Result) const {
+    Result = 0;
+    return object_error::invalid_file_type;
+  }
+
+  /// True if this is a relocatable object (.o/.obj).
+  virtual bool isRelocatableObject() const = 0;
 
   /// @returns Pointer to ObjectFile subclass to handle this type of object.
   /// @param ObjectPath The path to the object file. ObjectPath.isObject must
   ///        return true.
   /// @brief Create ObjectFile from path.
-  static ErrorOr<ObjectFile *> createObjectFile(StringRef ObjectPath);
-  static ErrorOr<ObjectFile *>
-  createObjectFile(std::unique_ptr<MemoryBuffer> &Object,
-                   sys::fs::file_magic Type);
-  static ErrorOr<ObjectFile *>
-  createObjectFile(std::unique_ptr<MemoryBuffer> &Object) {
+  static ErrorOr<OwningBinary<ObjectFile>>
+  createObjectFile(StringRef ObjectPath);
+
+  static ErrorOr<std::unique_ptr<ObjectFile>>
+  createObjectFile(MemoryBufferRef Object, sys::fs::file_magic Type);
+  static ErrorOr<std::unique_ptr<ObjectFile>>
+  createObjectFile(MemoryBufferRef Object) {
     return createObjectFile(Object, sys::fs::file_magic::unknown);
   }
 
@@ -346,13 +315,14 @@
     return v->isObject();
   }
 
-public:
-  static ErrorOr<ObjectFile *>
-  createCOFFObjectFile(std::unique_ptr<MemoryBuffer> Object);
-  static ErrorOr<ObjectFile *>
-  createELFObjectFile(std::unique_ptr<MemoryBuffer> &Object);
-  static ErrorOr<ObjectFile *>
-  createMachOObjectFile(std::unique_ptr<MemoryBuffer> &Object);
+  static ErrorOr<std::unique_ptr<COFFObjectFile>>
+  createCOFFObjectFile(MemoryBufferRef Object);
+
+  static ErrorOr<std::unique_ptr<ObjectFile>>
+  createELFObjectFile(MemoryBufferRef Object);
+
+  static ErrorOr<std::unique_ptr<MachOObjectFile>>
+  createMachOObjectFile(MemoryBufferRef Object);
 };
 
 // Inline function definitions.
@@ -383,6 +353,10 @@
   return getObject()->getSymbolType(getRawDataRefImpl(), Result);
 }
 
+inline std::error_code SymbolRef::getOther(uint8_t &Result) const {
+  return getObject()->getSymbolOther(getRawDataRefImpl(), Result);
+}
+
 inline const ObjectFile *SymbolRef::getObject() const {
   const SymbolicFile *O = BasicSymbolRef::getObject();
   return cast<ObjectFile>(O);
@@ -415,54 +389,53 @@
   return OwningObject->getSectionName(SectionPimpl, Result);
 }
 
-inline std::error_code SectionRef::getAddress(uint64_t &Result) const {
-  return OwningObject->getSectionAddress(SectionPimpl, Result);
+inline uint64_t SectionRef::getAddress() const {
+  return OwningObject->getSectionAddress(SectionPimpl);
 }
 
-inline std::error_code SectionRef::getSize(uint64_t &Result) const {
-  return OwningObject->getSectionSize(SectionPimpl, Result);
+inline uint64_t SectionRef::getSize() const {
+  return OwningObject->getSectionSize(SectionPimpl);
 }
 
 inline std::error_code SectionRef::getContents(StringRef &Result) const {
   return OwningObject->getSectionContents(SectionPimpl, Result);
 }
 
-inline std::error_code SectionRef::getAlignment(uint64_t &Result) const {
-  return OwningObject->getSectionAlignment(SectionPimpl, Result);
+inline uint64_t SectionRef::getAlignment() const {
+  return OwningObject->getSectionAlignment(SectionPimpl);
 }
 
-inline std::error_code SectionRef::isText(bool &Result) const {
-  return OwningObject->isSectionText(SectionPimpl, Result);
+inline bool SectionRef::isText() const {
+  return OwningObject->isSectionText(SectionPimpl);
 }
 
-inline std::error_code SectionRef::isData(bool &Result) const {
-  return OwningObject->isSectionData(SectionPimpl, Result);
+inline bool SectionRef::isData() const {
+  return OwningObject->isSectionData(SectionPimpl);
 }
 
-inline std::error_code SectionRef::isBSS(bool &Result) const {
-  return OwningObject->isSectionBSS(SectionPimpl, Result);
+inline bool SectionRef::isBSS() const {
+  return OwningObject->isSectionBSS(SectionPimpl);
 }
 
-inline std::error_code SectionRef::isRequiredForExecution(bool &Result) const {
-  return OwningObject->isSectionRequiredForExecution(SectionPimpl, Result);
+inline bool SectionRef::isRequiredForExecution() const {
+  return OwningObject->isSectionRequiredForExecution(SectionPimpl);
 }
 
-inline std::error_code SectionRef::isVirtual(bool &Result) const {
-  return OwningObject->isSectionVirtual(SectionPimpl, Result);
+inline bool SectionRef::isVirtual() const {
+  return OwningObject->isSectionVirtual(SectionPimpl);
 }
 
-inline std::error_code SectionRef::isZeroInit(bool &Result) const {
-  return OwningObject->isSectionZeroInit(SectionPimpl, Result);
+inline bool SectionRef::isZeroInit() const {
+  return OwningObject->isSectionZeroInit(SectionPimpl);
 }
 
-inline std::error_code SectionRef::isReadOnlyData(bool &Result) const {
-  return OwningObject->isSectionReadOnlyData(SectionPimpl, Result);
+inline bool SectionRef::isReadOnlyData() const {
+  return OwningObject->isSectionReadOnlyData(SectionPimpl);
 }
 
-inline std::error_code SectionRef::containsSymbol(SymbolRef S,
-                                                  bool &Result) const {
+inline bool SectionRef::containsSymbol(SymbolRef S) const {
   return OwningObject->sectionContainsSymbol(SectionPimpl,
-                                             S.getRawDataRefImpl(), Result);
+                                             S.getRawDataRefImpl());
 }
 
 inline relocation_iterator SectionRef::relocation_begin() const {
@@ -533,26 +506,6 @@
   return OwningObject;
 }
 
-// Inline function definitions.
-inline LibraryRef::LibraryRef(DataRefImpl LibraryP, const ObjectFile *Owner)
-  : LibraryPimpl(LibraryP)
-  , OwningObject(Owner) {}
-
-inline bool LibraryRef::operator==(const LibraryRef &Other) const {
-  return LibraryPimpl == Other.LibraryPimpl;
-}
-
-inline bool LibraryRef::operator<(const LibraryRef &Other) const {
-  return LibraryPimpl < Other.LibraryPimpl;
-}
-
-inline std::error_code LibraryRef::getNext(LibraryRef &Result) const {
-  return OwningObject->getLibraryNext(LibraryPimpl, Result);
-}
-
-inline std::error_code LibraryRef::getPath(StringRef &Result) const {
-  return OwningObject->getLibraryPath(LibraryPimpl, Result);
-}
 
 } // end namespace object
 } // end namespace llvm

diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h
index 5ca2450..91eafd5 100644
--- a/include/llvm/Object/RelocVisitor.h
+++ b/include/llvm/Object/RelocVisitor.h

@@ -17,6 +17,7 @@
 #define LLVM_OBJECT_RELOCVISITOR_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Object/COFF.h"
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Debug.h"
@@ -40,128 +41,18 @@
 /// @brief Base class for object file relocation visitors.
 class RelocVisitor {
 public:
-  explicit RelocVisitor(StringRef FileFormat)
-    : FileFormat(FileFormat), HasError(false) {}
+  explicit RelocVisitor(const ObjectFile &Obj)
+    : ObjToVisit(Obj), HasError(false) {}
 
   // TODO: Should handle multiple applied relocations via either passing in the
   // previously computed value or just count paired relocations as a single
   // visit.
-  RelocToApply visit(uint32_t RelocType, RelocationRef R, uint64_t SecAddr = 0,
-                     uint64_t Value = 0) {
-    if (FileFormat == "ELF64-x86-64") {
-      switch (RelocType) {
-        case llvm::ELF::R_X86_64_NONE:
-          return visitELF_X86_64_NONE(R);
-        case llvm::ELF::R_X86_64_64:
-          return visitELF_X86_64_64(R, Value);
-        case llvm::ELF::R_X86_64_PC32:
-          return visitELF_X86_64_PC32(R, Value, SecAddr);
-        case llvm::ELF::R_X86_64_32:
-          return visitELF_X86_64_32(R, Value);
-        case llvm::ELF::R_X86_64_32S:
-          return visitELF_X86_64_32S(R, Value);
-        default:
-          HasError = true;
-          return RelocToApply();
-      }
-    } else if (FileFormat == "ELF32-i386") {
-      switch (RelocType) {
-      case llvm::ELF::R_386_NONE:
-        return visitELF_386_NONE(R);
-      case llvm::ELF::R_386_32:
-        return visitELF_386_32(R, Value);
-      case llvm::ELF::R_386_PC32:
-        return visitELF_386_PC32(R, Value, SecAddr);
-      default:
-        HasError = true;
-        return RelocToApply();
-      }
-    } else if (FileFormat == "ELF64-ppc64") {
-      switch (RelocType) {
-      case llvm::ELF::R_PPC64_ADDR32:
-        return visitELF_PPC64_ADDR32(R, Value);
-      case llvm::ELF::R_PPC64_ADDR64:
-        return visitELF_PPC64_ADDR64(R, Value);
-      default:
-        HasError = true;
-        return RelocToApply();
-      }
-    } else if (FileFormat == "ELF32-ppc") {
-      switch (RelocType) {
-      case llvm::ELF::R_PPC_ADDR32:
-        return visitELF_PPC_ADDR32(R, Value);
-      default:
-        HasError = true;
-        return RelocToApply();
-      }
-    } else if (FileFormat == "ELF32-mips") {
-      switch (RelocType) {
-      case llvm::ELF::R_MIPS_32:
-        return visitELF_MIPS_32(R, Value);
-      default:
-        HasError = true;
-        return RelocToApply();
-      }
-    } else if (FileFormat == "ELF64-mips") {
-      switch (RelocType) {
-      case llvm::ELF::R_MIPS_32:
-        return visitELF_MIPS_32(R, Value);
-      case llvm::ELF::R_MIPS_64:
-        return visitELF_MIPS_64(R, Value);
-      default:
-        HasError = true;
-        return RelocToApply();
-      }
-    } else if (FileFormat == "ELF64-aarch64") {
-      switch (RelocType) {
-      case llvm::ELF::R_AARCH64_ABS32:
-        return visitELF_AARCH64_ABS32(R, Value);
-      case llvm::ELF::R_AARCH64_ABS64:
-        return visitELF_AARCH64_ABS64(R, Value);
-      default:
-        HasError = true;
-        return RelocToApply();
-      }
-    } else if (FileFormat == "ELF64-s390") {
-      switch (RelocType) {
-      case llvm::ELF::R_390_32:
-        return visitELF_390_32(R, Value);
-      case llvm::ELF::R_390_64:
-        return visitELF_390_64(R, Value);
-      default:
-        HasError = true;
-        return RelocToApply();
-      }
-    } else if (FileFormat == "ELF32-sparc") {
-      switch (RelocType) {
-      case llvm::ELF::R_SPARC_32:
-      case llvm::ELF::R_SPARC_UA32:
-        return visitELF_SPARC_32(R, Value);
-      default:
-        HasError = true;
-        return RelocToApply();
-      }
-    } else if (FileFormat == "ELF64-sparc") {
-      switch (RelocType) {
-      case llvm::ELF::R_SPARC_32:
-      case llvm::ELF::R_SPARC_UA32:
-        return visitELF_SPARCV9_32(R, Value);
-      case llvm::ELF::R_SPARC_64:
-      case llvm::ELF::R_SPARC_UA64:
-        return visitELF_SPARCV9_64(R, Value);
-      default:
-        HasError = true;
-        return RelocToApply();
-      }
-    } else if (FileFormat == "ELF32-arm") {
-      switch (RelocType) {
-      default:
-        HasError = true;
-        return RelocToApply();
-      case llvm::ELF::R_ARM_ABS32:
-        return visitELF_ARM_ABS32(R, Value);
-      }
-    }
+  RelocToApply visit(uint32_t RelocType, RelocationRef R, uint64_t Value = 0) {
+    if (isa<ELFObjectFileBase>(ObjToVisit))
+      return visitELF(RelocType, R, Value);
+    if (isa<COFFObjectFile>(ObjToVisit))
+      return visitCOFF(RelocType, R, Value);
+
     HasError = true;
     return RelocToApply();
   }
@@ -169,10 +60,168 @@
   bool error() { return HasError; }
 
 private:
-  StringRef FileFormat;
+  const ObjectFile &ObjToVisit;
   bool HasError;
 
-  int64_t getAddend32LE(RelocationRef R) {
+  RelocToApply visitELF(uint32_t RelocType, RelocationRef R, uint64_t Value) {
+    if (ObjToVisit.getBytesInAddress() == 8) { // 64-bit object file
+      switch (ObjToVisit.getArch()) {
+      case Triple::x86_64:
+        switch (RelocType) {
+        case llvm::ELF::R_X86_64_NONE:
+          return visitELF_X86_64_NONE(R);
+        case llvm::ELF::R_X86_64_64:
+          return visitELF_X86_64_64(R, Value);
+        case llvm::ELF::R_X86_64_PC32:
+          return visitELF_X86_64_PC32(R, Value);
+        case llvm::ELF::R_X86_64_32:
+          return visitELF_X86_64_32(R, Value);
+        case llvm::ELF::R_X86_64_32S:
+          return visitELF_X86_64_32S(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      case Triple::aarch64:
+        switch (RelocType) {
+        case llvm::ELF::R_AARCH64_ABS32:
+          return visitELF_AARCH64_ABS32(R, Value);
+        case llvm::ELF::R_AARCH64_ABS64:
+          return visitELF_AARCH64_ABS64(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      case Triple::mips64el:
+      case Triple::mips64:
+        switch (RelocType) {
+        case llvm::ELF::R_MIPS_32:
+          return visitELF_MIPS_32(R, Value);
+        case llvm::ELF::R_MIPS_64:
+          return visitELF_MIPS_64(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      case Triple::ppc64le:
+      case Triple::ppc64:
+        switch (RelocType) {
+        case llvm::ELF::R_PPC64_ADDR32:
+          return visitELF_PPC64_ADDR32(R, Value);
+        case llvm::ELF::R_PPC64_ADDR64:
+          return visitELF_PPC64_ADDR64(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      case Triple::systemz:
+        switch (RelocType) {
+        case llvm::ELF::R_390_32:
+          return visitELF_390_32(R, Value);
+        case llvm::ELF::R_390_64:
+          return visitELF_390_64(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      case Triple::sparcv9:
+        switch (RelocType) {
+        case llvm::ELF::R_SPARC_32:
+        case llvm::ELF::R_SPARC_UA32:
+          return visitELF_SPARCV9_32(R, Value);
+        case llvm::ELF::R_SPARC_64:
+        case llvm::ELF::R_SPARC_UA64:
+          return visitELF_SPARCV9_64(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      default:
+        HasError = true;
+        return RelocToApply();
+      }
+    } else if (ObjToVisit.getBytesInAddress() == 4) { // 32-bit object file
+      switch (ObjToVisit.getArch()) {
+      case Triple::x86:
+        switch (RelocType) {
+        case llvm::ELF::R_386_NONE:
+          return visitELF_386_NONE(R);
+        case llvm::ELF::R_386_32:
+          return visitELF_386_32(R, Value);
+        case llvm::ELF::R_386_PC32:
+          return visitELF_386_PC32(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      case Triple::ppc:
+        switch (RelocType) {
+        case llvm::ELF::R_PPC_ADDR32:
+          return visitELF_PPC_ADDR32(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      case Triple::arm:
+      case Triple::armeb:
+        switch (RelocType) {
+        default:
+          HasError = true;
+          return RelocToApply();
+        case llvm::ELF::R_ARM_ABS32:
+          return visitELF_ARM_ABS32(R, Value);
+        }
+      case Triple::mipsel:
+      case Triple::mips:
+        switch (RelocType) {
+        case llvm::ELF::R_MIPS_32:
+          return visitELF_MIPS_32(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      case Triple::sparc:
+        switch (RelocType) {
+        case llvm::ELF::R_SPARC_32:
+        case llvm::ELF::R_SPARC_UA32:
+          return visitELF_SPARC_32(R, Value);
+        default:
+          HasError = true;
+          return RelocToApply();
+        }
+      default:
+        HasError = true;
+        return RelocToApply();
+      }
+    } else {
+      report_fatal_error("Invalid word size in object file");
+    }
+  }
+
+  RelocToApply visitCOFF(uint32_t RelocType, RelocationRef R, uint64_t Value) {
+    switch (ObjToVisit.getArch()) {
+    case Triple::x86:
+      switch (RelocType) {
+      case COFF::IMAGE_REL_I386_SECREL:
+        return visitCOFF_I386_SECREL(R, Value);
+      case COFF::IMAGE_REL_I386_DIR32:
+        return visitCOFF_I386_DIR32(R, Value);
+      }
+      break;
+    case Triple::x86_64:
+      switch (RelocType) {
+      case COFF::IMAGE_REL_AMD64_SECREL:
+        return visitCOFF_AMD64_SECREL(R, Value);
+      case COFF::IMAGE_REL_AMD64_ADDR64:
+        return visitCOFF_AMD64_ADDR64(R, Value);
+      }
+      break;
+    }
+    HasError = true;
+    return RelocToApply();
+  }
+
+  int64_t getELFAddend32LE(RelocationRef R) {
     const ELF32LEObjectFile *Obj = cast<ELF32LEObjectFile>(R.getObjectFile());
     DataRefImpl DRI = R.getRawDataRefImpl();
     int64_t Addend;
@@ -180,7 +229,7 @@
     return Addend;
   }
 
-  int64_t getAddend64LE(RelocationRef R) {
+  int64_t getELFAddend64LE(RelocationRef R) {
     const ELF64LEObjectFile *Obj = cast<ELF64LEObjectFile>(R.getObjectFile());
     DataRefImpl DRI = R.getRawDataRefImpl();
     int64_t Addend;
@@ -188,7 +237,7 @@
     return Addend;
   }
 
-  int64_t getAddend32BE(RelocationRef R) {
+  int64_t getELFAddend32BE(RelocationRef R) {
     const ELF32BEObjectFile *Obj = cast<ELF32BEObjectFile>(R.getObjectFile());
     DataRefImpl DRI = R.getRawDataRefImpl();
     int64_t Addend;
@@ -196,7 +245,7 @@
     return Addend;
   }
 
-  int64_t getAddend64BE(RelocationRef R) {
+  int64_t getELFAddend64BE(RelocationRef R) {
     const ELF64BEObjectFile *Obj = cast<ELF64BEObjectFile>(R.getObjectFile());
     DataRefImpl DRI = R.getRawDataRefImpl();
     int64_t Addend;
@@ -213,13 +262,12 @@
   // Ideally the Addend here will be the addend in the data for
   // the relocation. It's not actually the case for Rel relocations.
   RelocToApply visitELF_386_32(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend32LE(R);
+    int64_t Addend = getELFAddend32LE(R);
     return RelocToApply(Value + Addend, 4);
   }
 
-  RelocToApply visitELF_386_PC32(RelocationRef R, uint64_t Value,
-                                 uint64_t SecAddr) {
-    int64_t Addend = getAddend32LE(R);
+  RelocToApply visitELF_386_PC32(RelocationRef R, uint64_t Value) {
+    int64_t Addend = getELFAddend32LE(R);
     uint64_t Address;
     R.getOffset(Address);
     return RelocToApply(Value + Addend - Address, 4);
@@ -230,23 +278,22 @@
     return RelocToApply(0, 0);
   }
   RelocToApply visitELF_X86_64_64(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend64LE(R);
+    int64_t Addend = getELFAddend64LE(R);
     return RelocToApply(Value + Addend, 8);
   }
-  RelocToApply visitELF_X86_64_PC32(RelocationRef R, uint64_t Value,
-                                    uint64_t SecAddr) {
-    int64_t Addend = getAddend64LE(R);
+  RelocToApply visitELF_X86_64_PC32(RelocationRef R, uint64_t Value) {
+    int64_t Addend = getELFAddend64LE(R);
     uint64_t Address;
     R.getOffset(Address);
     return RelocToApply(Value + Addend - Address, 4);
   }
   RelocToApply visitELF_X86_64_32(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend64LE(R);
+    int64_t Addend = getELFAddend64LE(R);
     uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
     return RelocToApply(Res, 4);
   }
   RelocToApply visitELF_X86_64_32S(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend64LE(R);
+    int64_t Addend = getELFAddend64LE(R);
     int32_t Res = (Value + Addend) & 0xFFFFFFFF;
     return RelocToApply(Res, 4);
   }
@@ -266,7 +313,7 @@
 
   /// PPC32 ELF
   RelocToApply visitELF_PPC_ADDR32(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend32BE(R);
+    int64_t Addend = getELFAddend32BE(R);
     uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
     return RelocToApply(Res, 4);
   }
@@ -288,7 +335,8 @@
 
   // AArch64 ELF
   RelocToApply visitELF_AARCH64_ABS32(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend64LE(R);
+    int64_t Addend;
+    getELFRelocationAddend(R, Addend);
     int64_t Res =  Value + Addend;
 
     // Overflow check allows for both signed and unsigned interpretation.
@@ -299,13 +347,14 @@
   }
 
   RelocToApply visitELF_AARCH64_ABS64(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend64LE(R);
+    int64_t Addend = 0; // Defined even if the call below leaves it unset.
+    getELFRelocationAddend(R, Addend);
     return RelocToApply(Value + Addend, 8);
   }
 
   // SystemZ ELF
   RelocToApply visitELF_390_32(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend64BE(R);
+    int64_t Addend = getELFAddend64BE(R);
     int64_t Res = Value + Addend;
 
     // Overflow check allows for both signed and unsigned interpretation.
@@ -316,30 +365,54 @@
   }
 
   RelocToApply visitELF_390_64(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend64BE(R);
+    int64_t Addend = getELFAddend64BE(R);
     return RelocToApply(Value + Addend, 8);
   }
 
   RelocToApply visitELF_SPARC_32(RelocationRef R, uint32_t Value) {
-    int32_t Addend = getAddend32BE(R);
+    int32_t Addend = getELFAddend32BE(R);
     return RelocToApply(Value + Addend, 4);
   }
 
   RelocToApply visitELF_SPARCV9_32(RelocationRef R, uint64_t Value) {
-    int32_t Addend = getAddend64BE(R);
+    int32_t Addend = getELFAddend64BE(R);
     return RelocToApply(Value + Addend, 4);
   }
 
   RelocToApply visitELF_SPARCV9_64(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend64BE(R);
+    int64_t Addend = getELFAddend64BE(R);
     return RelocToApply(Value + Addend, 8);
   }
 
   RelocToApply visitELF_ARM_ABS32(RelocationRef R, uint64_t Value) {
-    int64_t Addend = getAddend32LE(R);
-    return RelocToApply(Value + Addend, 4);
+    int64_t Addend = 0; // Defined even if the call below leaves it unset.
+    getELFRelocationAddend(R, Addend);
+    int64_t Res = Value + Addend;
+
+    // Overflow check allows for both signed and unsigned interpretation.
+    if (Res < INT32_MIN || Res > UINT32_MAX)
+      HasError = true;
+
+    return RelocToApply(static_cast<uint32_t>(Res), 4);
   }
 
+  /// I386 COFF
+  RelocToApply visitCOFF_I386_SECREL(RelocationRef R, uint64_t Value) {
+    return RelocToApply(static_cast<uint32_t>(Value), /*Width=*/4);
+  }
+
+  RelocToApply visitCOFF_I386_DIR32(RelocationRef R, uint64_t Value) {
+    return RelocToApply(static_cast<uint32_t>(Value), /*Width=*/4);
+  }
+
+  /// AMD64 COFF
+  RelocToApply visitCOFF_AMD64_SECREL(RelocationRef R, uint64_t Value) {
+    return RelocToApply(static_cast<uint32_t>(Value), /*Width=*/4);
+  }
+
+  RelocToApply visitCOFF_AMD64_ADDR64(RelocationRef R, uint64_t Value) {
+    return RelocToApply(Value, /*Width=*/8);
+  }
 };
 
 }

diff --git a/include/llvm/Object/SymbolicFile.h b/include/llvm/Object/SymbolicFile.h
index 77eef4a..435799a 100644
--- a/include/llvm/Object/SymbolicFile.h
+++ b/include/llvm/Object/SymbolicFile.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_OBJECT_SYMBOLIC_FILE_H
-#define LLVM_OBJECT_SYMBOLIC_FILE_H
+#ifndef LLVM_OBJECT_SYMBOLICFILE_H
+#define LLVM_OBJECT_SYMBOLICFILE_H
 
 #include "llvm/Object/Binary.h"
 
@@ -87,8 +87,9 @@
     SF_Absolute = 1U << 3,       // Absolute symbol
     SF_Common = 1U << 4,         // Symbol has common linkage
     SF_Indirect = 1U << 5,       // Symbol is an alias to another symbol
-    SF_FormatSpecific = 1U << 6  // Specific to the object file format
+    SF_FormatSpecific = 1U << 6, // Specific to the object file format
                                  // (e.g. section symbols)
+    SF_Thumb = 1U << 7           // Thumb symbol in a 32-bit ARM binary
   };
 
   BasicSymbolRef() : OwningObject(nullptr) { }
@@ -115,7 +116,7 @@
 class SymbolicFile : public Binary {
 public:
   virtual ~SymbolicFile();
-  SymbolicFile(unsigned int Type, std::unique_ptr<MemoryBuffer> Source);
+  SymbolicFile(unsigned int Type, MemoryBufferRef Source);
 
   // virtual interface.
   virtual void moveSymbolNext(DataRefImpl &Symb) const = 0;
@@ -142,15 +143,16 @@
   }
 
   // construction aux.
-  static ErrorOr<SymbolicFile *>
-  createSymbolicFile(std::unique_ptr<MemoryBuffer> &Object,
-                     sys::fs::file_magic Type, LLVMContext *Context);
+  static ErrorOr<std::unique_ptr<SymbolicFile>>
+  createSymbolicFile(MemoryBufferRef Object, sys::fs::file_magic Type,
+                     LLVMContext *Context);
 
-  static ErrorOr<SymbolicFile *>
-  createSymbolicFile(std::unique_ptr<MemoryBuffer> &Object) {
+  static ErrorOr<std::unique_ptr<SymbolicFile>>
+  createSymbolicFile(MemoryBufferRef Object) {
     return createSymbolicFile(Object, sys::fs::file_magic::unknown, nullptr);
   }
-  static ErrorOr<SymbolicFile *> createSymbolicFile(StringRef ObjectPath);
+  static ErrorOr<OwningBinary<SymbolicFile>>
+  createSymbolicFile(StringRef ObjectPath);
 
   static inline bool classof(const Binary *v) {
     return v->isSymbolic();

diff --git a/include/llvm/Option/ArgList.h b/include/llvm/Option/ArgList.h
index d46b0e8..3f8547e 100644
--- a/include/llvm/Option/ArgList.h
+++ b/include/llvm/Option/ArgList.h

@@ -187,6 +187,7 @@
   ///
   /// \p Claim Whether the argument should be claimed, if it exists.
   Arg *getLastArgNoClaim(OptSpecifier Id) const;
+  Arg *getLastArgNoClaim(OptSpecifier Id0, OptSpecifier Id1) const;
   Arg *getLastArg(OptSpecifier Id) const;
   Arg *getLastArg(OptSpecifier Id0, OptSpecifier Id1) const;
   Arg *getLastArg(OptSpecifier Id0, OptSpecifier Id1, OptSpecifier Id2) const;

diff --git a/include/llvm/Option/OptParser.td b/include/llvm/Option/OptParser.td
index 963389f..dbf240d 100644
--- a/include/llvm/Option/OptParser.td
+++ b/include/llvm/Option/OptParser.td

@@ -75,6 +75,7 @@
   string Name = name;
   string HelpText = ?;
   OptionGroup Group = ?;
+  list<OptionFlag> Flags = [];
 }
 
 // Define the option class.

diff --git a/include/llvm/PassRegistry.h b/include/llvm/PassRegistry.h
index 1558c51..8c28ef5 100644
--- a/include/llvm/PassRegistry.h
+++ b/include/llvm/PassRegistry.h

@@ -42,61 +42,51 @@
   mutable sys::SmartRWMutex<true> Lock;
 
   /// PassInfoMap - Keep track of the PassInfo object for each registered pass.
-  typedef DenseMap<const void*, const PassInfo*> MapType;
+  typedef DenseMap<const void *, const PassInfo *> MapType;
   MapType PassInfoMap;
-  
-  typedef StringMap<const PassInfo*> StringMapType;
+
+  typedef StringMap<const PassInfo *> StringMapType;
   StringMapType PassInfoStringMap;
-  
-  /// AnalysisGroupInfo - Keep track of information for each analysis group.
-  struct AnalysisGroupInfo {
-    SmallPtrSet<const PassInfo *, 8> Implementations;
-  };
-  DenseMap<const PassInfo*, AnalysisGroupInfo> AnalysisGroupInfoMap;
-  
+
   std::vector<std::unique_ptr<const PassInfo>> ToFree;
-  std::vector<PassRegistrationListener*> Listeners;
-   
+  std::vector<PassRegistrationListener *> Listeners;
+
 public:
-  PassRegistry() { }
+  PassRegistry() {}
   ~PassRegistry();
-  
-  /// getPassRegistry - Access the global registry object, which is 
+
+  /// getPassRegistry - Access the global registry object, which is
   /// automatically initialized at application launch and destroyed by
   /// llvm_shutdown.
   static PassRegistry *getPassRegistry();
-  
+
   /// getPassInfo - Look up a pass' corresponding PassInfo, indexed by the pass'
   /// type identifier (&MyPass::ID).
   const PassInfo *getPassInfo(const void *TI) const;
-  
+
   /// getPassInfo - Look up a pass' corresponding PassInfo, indexed by the pass'
   /// argument string.
   const PassInfo *getPassInfo(StringRef Arg) const;
-  
-  /// registerPass - Register a pass (by means of its PassInfo) with the 
+
+  /// registerPass - Register a pass (by means of its PassInfo) with the
   /// registry.  Required in order to use the pass with a PassManager.
   void registerPass(const PassInfo &PI, bool ShouldFree = false);
-  
-  /// registerPass - Unregister a pass (by means of its PassInfo) with the 
-  /// registry.
-  void unregisterPass(const PassInfo &PI);
-  
+
   /// registerAnalysisGroup - Register an analysis group (or a pass implementing
-  // an analysis group) with the registry.  Like registerPass, this is required 
+  // an analysis group) with the registry.  Like registerPass, this is required
   // in order for a PassManager to be able to use this group/pass.
   void registerAnalysisGroup(const void *InterfaceID, const void *PassID,
-                             PassInfo& Registeree, bool isDefault,
+                             PassInfo &Registeree, bool isDefault,
                              bool ShouldFree = false);
-  
+
   /// enumerateWith - Enumerate the registered passes, calling the provided
   /// PassRegistrationListener's passEnumerate() callback on each of them.
   void enumerateWith(PassRegistrationListener *L);
-  
+
   /// addRegistrationListener - Register the given PassRegistrationListener
   /// to receive passRegistered() callbacks whenever a new pass is registered.
   void addRegistrationListener(PassRegistrationListener *L);
-  
+
   /// removeRegistrationListener - Unregister a PassRegistrationListener so that
   /// it no longer receives passRegistered() callbacks.
   void removeRegistrationListener(PassRegistrationListener *L);

diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h
index 449bc92..6cb6516 100644
--- a/include/llvm/PassSupport.h
+++ b/include/llvm/PassSupport.h

@@ -82,6 +82,15 @@
     CALL_ONCE_INITIALIZATION(initialize##passName##PassOnce) \
   }
 
+#define INITIALIZE_PASS_WITH_OPTIONS(PassName, Arg, Name, Cfg, Analysis) \
+  INITIALIZE_PASS_BEGIN(PassName, Arg, Name, Cfg, Analysis) \
+  PassName::registerOptions(); \
+  INITIALIZE_PASS_END(PassName, Arg, Name, Cfg, Analysis)
+
+#define INITIALIZE_PASS_WITH_OPTIONS_BEGIN(PassName, Arg, Name, Cfg, Analysis) \
+  INITIALIZE_PASS_BEGIN(PassName, Arg, Name, Cfg, Analysis) \
+  PassName::registerOptions();
+
 template<typename PassName>
 Pass *callDefaultCtor() { return new PassName(); }
 

diff --git a/include/llvm/ProfileData/CoverageMapping.h b/include/llvm/ProfileData/CoverageMapping.h
new file mode 100644
index 0000000..38fc8ca
--- /dev/null
+++ b/include/llvm/ProfileData/CoverageMapping.h

@@ -0,0 +1,448 @@
+//=-- CoverageMapping.h - Code coverage mapping support ---------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Code coverage mapping data is generated by clang and read by
+// llvm-cov to show code coverage statistics for a file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PROFILEDATA_COVERAGEMAPPING_H_
+#define LLVM_PROFILEDATA_COVERAGEMAPPING_H_
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/raw_ostream.h"
+#include <system_error>
+
+namespace llvm {
+class IndexedInstrProfReader;
+namespace coverage {
+
+class ObjectFileCoverageMappingReader;
+
+class CoverageMapping;
+struct CounterExpressions;
+
+enum CoverageMappingVersion { CoverageMappingVersion1 };
+
+/// \brief A Counter is an abstract value that describes how to compute the
+/// execution count for a region of code using the collected profile count data.
+struct Counter {
+  enum CounterKind { Zero, CounterValueReference, Expression };
+  static const unsigned EncodingTagBits = 2;
+  static const unsigned EncodingTagMask = 0x3;
+  static const unsigned EncodingCounterTagAndExpansionRegionTagBits =
+      EncodingTagBits + 1;
+
+private:
+  CounterKind Kind;
+  unsigned ID;
+
+  Counter(CounterKind Kind, unsigned ID) : Kind(Kind), ID(ID) {}
+
+public:
+  Counter() : Kind(Zero), ID(0) {}
+
+  CounterKind getKind() const { return Kind; }
+
+  bool isZero() const { return Kind == Zero; }
+
+  bool isExpression() const { return Kind == Expression; }
+
+  unsigned getCounterID() const { return ID; }
+
+  unsigned getExpressionID() const { return ID; }
+
+  bool operator==(const Counter &Other) const {
+    return Kind == Other.Kind && ID == Other.ID;
+  }
+
+  friend bool operator<(const Counter &LHS, const Counter &RHS) {
+    return std::tie(LHS.Kind, LHS.ID) < std::tie(RHS.Kind, RHS.ID);
+  }
+
+  /// \brief Return the counter that represents the number zero.
+  static Counter getZero() { return Counter(); }
+
+  /// \brief Return the counter that corresponds to a specific profile counter.
+  static Counter getCounter(unsigned CounterId) {
+    return Counter(CounterValueReference, CounterId);
+  }
+
+  /// \brief Return the counter that corresponds to a specific
+  /// addition counter expression.
+  static Counter getExpression(unsigned ExpressionId) {
+    return Counter(Expression, ExpressionId);
+  }
+};
+
+/// \brief A Counter expression is a value that represents an arithmetic
+/// operation with two counters.
+struct CounterExpression {
+  enum ExprKind { Subtract, Add };
+  ExprKind Kind;
+  Counter LHS, RHS;
+
+  CounterExpression(ExprKind Kind, Counter LHS, Counter RHS)
+      : Kind(Kind), LHS(LHS), RHS(RHS) {}
+};
+
+/// \brief A Counter expression builder is used to construct the
+/// counter expressions. It avoids unnecessary duplication
+/// and simplifies algebraic expressions.
+class CounterExpressionBuilder {
+  /// \brief A list of all the counter expressions
+  std::vector<CounterExpression> Expressions;
+  /// \brief A lookup table for the index of a given expression.
+  llvm::DenseMap<CounterExpression, unsigned> ExpressionIndices;
+
+  /// \brief Return the counter which corresponds to the given expression.
+  ///
+  /// If the given expression is already stored in the builder, a counter
+  /// that references that expression is returned. Otherwise, the given
+  /// expression is added to the builder's collection of expressions.
+  Counter get(const CounterExpression &E);
+
+  /// \brief Gather the terms of the expression tree for processing.
+  ///
+  /// This collects each addition and subtraction referenced by the counter into
+  /// a sequence that can be sorted and combined to build a simplified counter
+  /// expression.
+  void extractTerms(Counter C, int Sign,
+                    SmallVectorImpl<std::pair<unsigned, int>> &Terms);
+
+  /// \brief Simplifies the given expression tree
+  /// by getting rid of algebraically redundant operations.
+  Counter simplify(Counter ExpressionTree);
+
+public:
+  ArrayRef<CounterExpression> getExpressions() const { return Expressions; }
+
+  /// \brief Return a counter that represents the expression
+  /// that adds LHS and RHS.
+  Counter add(Counter LHS, Counter RHS);
+
+  /// \brief Return a counter that represents the expression
+  /// that subtracts RHS from LHS.
+  Counter subtract(Counter LHS, Counter RHS);
+};
+
+/// \brief A Counter mapping region associates a source range with
+/// a specific counter.
+struct CounterMappingRegion {
+  enum RegionKind {
+    /// \brief A CodeRegion associates some code with a counter
+    CodeRegion,
+
+    /// \brief An ExpansionRegion represents a file expansion region that
+    /// associates a source range with the expansion of a virtual source file,
+    /// such as for a macro instantiation or #include file.
+    ExpansionRegion,
+
+    /// \brief A SkippedRegion represents a source range with code that
+    /// was skipped by a preprocessor or similar means.
+    SkippedRegion
+  };
+
+  static const unsigned EncodingHasCodeBeforeBits = 1;
+
+  Counter Count;
+  unsigned FileID, ExpandedFileID;
+  unsigned LineStart, ColumnStart, LineEnd, ColumnEnd;
+  RegionKind Kind;
+  /// \brief A flag that is set to true when there is already code before
+  /// this region on the same line.
+  /// This is useful to accurately compute the execution counts for a line.
+  bool HasCodeBefore;
+
+  CounterMappingRegion(Counter Count, unsigned FileID, unsigned LineStart,
+                       unsigned ColumnStart, unsigned LineEnd,
+                       unsigned ColumnEnd, bool HasCodeBefore = false,
+                       RegionKind Kind = CodeRegion)
+      : Count(Count), FileID(FileID), ExpandedFileID(0), LineStart(LineStart),
+        ColumnStart(ColumnStart), LineEnd(LineEnd), ColumnEnd(ColumnEnd),
+        Kind(Kind), HasCodeBefore(HasCodeBefore) {}
+
+  inline std::pair<unsigned, unsigned> startLoc() const {
+    return std::pair<unsigned, unsigned>(LineStart, ColumnStart);
+  }
+
+  inline std::pair<unsigned, unsigned> endLoc() const {
+    return std::pair<unsigned, unsigned>(LineEnd, ColumnEnd);
+  }
+
+  bool operator<(const CounterMappingRegion &Other) const {
+    if (FileID != Other.FileID)
+      return FileID < Other.FileID;
+    return startLoc() < Other.startLoc();
+  }
+
+  bool contains(const CounterMappingRegion &Other) const {
+    if (FileID != Other.FileID)
+      return false;
+    if (startLoc() > Other.startLoc())
+      return false;
+    if (endLoc() < Other.endLoc())
+      return false;
+    return true;
+  }
+};
+
+/// \brief Associates a source range with an execution count.
+struct CountedRegion : public CounterMappingRegion {
+  uint64_t ExecutionCount;
+
+  CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount)
+      : CounterMappingRegion(R), ExecutionCount(ExecutionCount) {}
+};
+
+/// \brief Connects counters and counter expressions with the counter values
+/// that were obtained for them.
+class CounterMappingContext {
+  ArrayRef<CounterExpression> Expressions;
+  ArrayRef<uint64_t> CounterValues;
+
+public:
+  CounterMappingContext(ArrayRef<CounterExpression> Exprs,
+                        ArrayRef<uint64_t> Values = ArrayRef<uint64_t>())
+      : Expressions(Exprs), CounterValues(Values) {}
+
+  void dump(const Counter &C, llvm::raw_ostream &OS) const;
+  void dump(const Counter &C) const { dump(C, llvm::outs()); }
+
+  /// \brief Compute how many times the region of code associated with
+  /// counter \p C was executed.
+  ErrorOr<int64_t> evaluate(const Counter &C) const;
+};
+
+/// \brief Code coverage information for a single function.
+struct FunctionRecord {
+  /// \brief The function's raw name.
+  std::string Name;
+  /// \brief Files associated with the function.
+  std::vector<std::string> Filenames;
+  /// \brief The function's regions together with their counts.
+  std::vector<CountedRegion> CountedRegions;
+  /// \brief How many times the function was executed.
+  uint64_t ExecutionCount;
+
+  FunctionRecord(StringRef FuncName, ArrayRef<StringRef> SourceFiles,
+                 uint64_t Count)
+      : Name(FuncName), Filenames(SourceFiles.begin(), SourceFiles.end()),
+        ExecutionCount(Count) {}
+};
+
+/// \brief Walks a list of FunctionRecords, optionally limited to one file.
+class FunctionRecordIterator
+    : public iterator_facade_base<FunctionRecordIterator,
+                                  std::forward_iterator_tag, FunctionRecord> {
+  ArrayRef<FunctionRecord> Records;
+  ArrayRef<FunctionRecord>::iterator Current;
+  StringRef Filename;
+
+  /// \brief Advance past records whose primary file is not \c Filename.
+  void skipOtherFiles();
+
+public:
+  FunctionRecordIterator(ArrayRef<FunctionRecord> Records_,
+                         StringRef File = "")
+      : Records(Records_), Current(Records_.begin()), Filename(File) {
+    skipOtherFiles();
+  }
+
+  FunctionRecordIterator() : Records(), Current(Records.begin()), Filename() {}
+
+  bool operator==(const FunctionRecordIterator &RHS) const {
+    return !(Current != RHS.Current || Filename != RHS.Filename);
+  }
+
+  const FunctionRecord &operator*() const { return *Current; }
+
+  FunctionRecordIterator &operator++() {
+    assert(Current != Records.end() && "incremented past end");
+    ++Current;
+    skipOtherFiles();
+    return *this;
+  }
+};
+
+/// \brief Coverage information for a macro expansion or #included file.
+///
+/// Covered code may contain pieces that can be expanded for more detail, such
+/// as a preprocessor macro use and its definition. These are represented as
+/// expansions whose coverage can be looked up independently.
+struct ExpansionRecord {
+  /// \brief The abstract file this expansion covers.
+  unsigned FileID;
+  /// \brief The region that expands to this record.
+  const CountedRegion &Region;
+  /// \brief Coverage for the expansion.
+  const FunctionRecord &Function;
+
+  /// \brief Bind to \p R and \p F by reference; neither is copied.
+  ExpansionRecord(const CountedRegion &R, const FunctionRecord &F)
+      : FileID(R.ExpandedFileID), Region(R), Function(F) {}
+};
+
+/// \brief The execution count information starting at a point in a file.
+///
+/// A sequence of CoverageSegments gives execution counts for a file in a
+/// format that is simple to iterate through for processing.
+struct CoverageSegment {
+  /// \brief Line on which this segment starts.
+  unsigned Line;
+  /// \brief Column at which this segment starts.
+  unsigned Col;
+  /// \brief The execution count; stays zero until a count is recorded.
+  uint64_t Count;
+  /// \brief False when the segment was uninstrumented or skipped.
+  bool HasCount;
+  /// \brief Whether this enters a new region or returns to a previous count.
+  bool IsRegionEntry;
+
+  CoverageSegment(unsigned StartLine, unsigned StartCol, bool IsEntry)
+      : Line(StartLine), Col(StartCol), Count(0), HasCount(false),
+        IsRegionEntry(IsEntry) {}
+  void setCount(uint64_t NewCount) {
+    HasCount = true;
+    Count = NewCount;
+  }
+  void addCount(uint64_t NewCount) { Count += NewCount; HasCount = true; }
+};
+
+/// \brief Coverage information to be processed or displayed.
+///
+/// This represents the coverage of an entire file, expansion, or function. It
+/// provides a sequence of CoverageSegments to iterate through, as well as the
+/// list of expansions that can be further processed.
+class CoverageData {
+  std::string Filename;
+  std::vector<CoverageSegment> Segments;
+  std::vector<ExpansionRecord> Expansions;
+  friend class CoverageMapping;
+
+public:
+  CoverageData() {}
+
+  CoverageData(StringRef Filename) : Filename(Filename) {}
+
+  CoverageData(CoverageData &&RHS)
+      : Filename(std::move(RHS.Filename)), Segments(std::move(RHS.Segments)),
+        Expansions(std::move(RHS.Expansions)) {}
+
+  /// \brief Get the name of the file this data covers.
+  StringRef getFilename() const { return Filename; }
+
+  std::vector<CoverageSegment>::iterator begin() { return Segments.begin(); }
+  std::vector<CoverageSegment>::iterator end() { return Segments.end(); }
+  bool empty() const { return Segments.empty(); }
+
+  /// \brief Expansions that can be further processed (returned as a copy).
+  std::vector<ExpansionRecord> getExpansions() const { return Expansions; }
+};
+
+/// \brief The mapping of profile information to coverage data.
+///
+/// This is the main interface to get coverage information, using a profile to
+/// fill out execution counts.
+class CoverageMapping {
+  std::vector<FunctionRecord> Functions;
+  unsigned MismatchedFunctionCount;
+
+  CoverageMapping() : MismatchedFunctionCount(0) {}
+
+public:
+  /// \brief Load the coverage mapping using the given readers.
+  static ErrorOr<std::unique_ptr<CoverageMapping>>
+  load(ObjectFileCoverageMappingReader &CoverageReader,
+       IndexedInstrProfReader &ProfileReader);
+
+  /// \brief Load the coverage mapping from the given files.
+  static ErrorOr<std::unique_ptr<CoverageMapping>>
+  load(StringRef ObjectFilename, StringRef ProfileFilename);
+
+  /// \brief The number of functions that couldn't have their profiles mapped.
+  ///
+  /// This is a count of functions whose profile is out of date or otherwise
+  /// can't be associated with any coverage information.
+  unsigned getMismatchedCount() const { return MismatchedFunctionCount; }
+
+  /// \brief Returns the list of files that are covered.
+  std::vector<StringRef> getUniqueSourceFiles() const;
+
+  /// \brief Get the coverage for a particular file.
+  ///
+  /// The given filename must be the name as recorded in the coverage
+  /// information. That is, only names returned from getUniqueSourceFiles will
+  /// yield a result.
+  CoverageData getCoverageForFile(StringRef Filename);
+
+  /// \brief Gets all of the functions covered by this profile.
+  iterator_range<FunctionRecordIterator> getCoveredFunctions() const {
+    return make_range(FunctionRecordIterator(Functions),
+                      FunctionRecordIterator());
+  }
+
+  /// \brief Gets all of the functions in a particular file.
+  iterator_range<FunctionRecordIterator>
+  getCoveredFunctions(StringRef Filename) const {
+    return make_range(FunctionRecordIterator(Functions, Filename),
+                      FunctionRecordIterator());
+  }
+
+  /// \brief Get the list of function instantiations in the file.
+  ///
+  /// Functions that are instantiated more than once, such as C++ template
+  /// specializations, have distinct coverage records for each instantiation.
+  std::vector<const FunctionRecord *> getInstantiations(StringRef Filename);
+
+  /// \brief Get the coverage for a particular function.
+  CoverageData getCoverageForFunction(const FunctionRecord &Function);
+
+  /// \brief Get the coverage for an expansion within a coverage set.
+  CoverageData getCoverageForExpansion(const ExpansionRecord &Expansion);
+};
+
+} // end namespace coverage
+
+/// \brief Provide DenseMapInfo for CounterExpression
+template<> struct DenseMapInfo<coverage::CounterExpression> {
+  static inline coverage::CounterExpression getEmptyKey() {
+    // Subtract over the counter with ID ~0U serves as the empty key.
+    typedef coverage::CounterExpression Expr;
+    const coverage::Counter Sentinel = coverage::Counter::getCounter(~0U);
+    return Expr(Expr::ExprKind::Subtract, Sentinel, Sentinel);
+  }
+
+  static inline coverage::CounterExpression getTombstoneKey() {
+    // Add over the counter with ID ~0U serves as the tombstone key.
+    typedef coverage::CounterExpression Expr;
+    const coverage::Counter Sentinel = coverage::Counter::getCounter(~0U);
+    return Expr(Expr::ExprKind::Add, Sentinel, Sentinel);
+  }
+
+  static unsigned getHashValue(const coverage::CounterExpression &V) {
+    auto Hash = hash_combine(V.Kind, V.LHS.getKind(), V.LHS.getCounterID(),
+                             V.RHS.getKind(), V.RHS.getCounterID());
+    return static_cast<unsigned>(Hash);
+  }
+
+  static bool isEqual(const coverage::CounterExpression &A,
+                      const coverage::CounterExpression &B) {
+    return A.Kind == B.Kind && A.LHS == B.LHS && A.RHS == B.RHS;
+  }
+};
+
+
+} // end namespace llvm
+
+#endif // LLVM_PROFILEDATA_COVERAGEMAPPING_H_

diff --git a/include/llvm/ProfileData/CoverageMappingReader.h b/include/llvm/ProfileData/CoverageMappingReader.h
new file mode 100644
index 0000000..73b0248
--- /dev/null
+++ b/include/llvm/ProfileData/CoverageMappingReader.h

@@ -0,0 +1,209 @@
+//=-- CoverageMappingReader.h - Code coverage mapping reader ------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading coverage mapping data for
+// instrumentation based coverage.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PROFILEDATA_COVERAGEMAPPINGREADER_H
+#define LLVM_PROFILEDATA_COVERAGEMAPPINGREADER_H
+
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/CoverageMapping.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/FileSystem.h"
+
+#include <iterator>
+
+namespace llvm {
+namespace coverage {
+
+class ObjectFileCoverageMappingReader;
+
+/// \brief Coverage mapping information for a single function.
+struct CoverageMappingRecord {
+  StringRef FunctionName; // Non-owning reference to the name's storage.
+  uint64_t FunctionHash;
+  ArrayRef<StringRef> Filenames; // Non-owning view.
+  ArrayRef<CounterExpression> Expressions; // Non-owning view.
+  ArrayRef<CounterMappingRegion> MappingRegions; // Non-owning view.
+};
+
+/// \brief A file format agnostic input iterator over coverage mapping data.
+class CoverageMappingIterator
+    : public std::iterator<std::input_iterator_tag, CoverageMappingRecord> {
+  ObjectFileCoverageMappingReader *Reader;
+  CoverageMappingRecord Record;
+
+  void increment();
+
+public:
+  CoverageMappingIterator() : Reader(nullptr) {}
+  CoverageMappingIterator(ObjectFileCoverageMappingReader *Reader)
+      : Reader(Reader) {
+    increment();
+  }
+
+  CoverageMappingIterator &operator++() {
+    increment();
+    return *this;
+  }
+  bool operator==(const CoverageMappingIterator &RHS) const {
+    return Reader == RHS.Reader;
+  }
+  bool operator!=(const CoverageMappingIterator &RHS) const {
+    return Reader != RHS.Reader;
+  }
+  CoverageMappingRecord &operator*() { return Record; }
+  CoverageMappingRecord *operator->() { return &Record; }
+};
+
+/// \brief Base class for the raw coverage mapping and filenames data readers.
+class RawCoverageReader {
+protected:
+  StringRef Data;
+
+  /// \brief Return \p EC unchanged; this class stores no error state.
+  std::error_code error(std::error_code EC) { return EC; }
+
+  /// \brief Return the instrprof_error::success error code.
+  std::error_code success() { return error(instrprof_error::success); }
+
+  RawCoverageReader(StringRef Data) : Data(Data) {}
+
+  std::error_code readULEB128(uint64_t &Result);
+  std::error_code readIntMax(uint64_t &Result, uint64_t MaxPlus1);
+  std::error_code readSize(uint64_t &Result);
+  std::error_code readString(StringRef &Result);
+};
+
+/// \brief Reads the raw coverage filenames into a vector held by reference.
+class RawCoverageFilenamesReader : public RawCoverageReader {
+  std::vector<StringRef> &Filenames;
+
+  RawCoverageFilenamesReader(const RawCoverageFilenamesReader &)
+      LLVM_DELETED_FUNCTION;
+  RawCoverageFilenamesReader &
+  operator=(const RawCoverageFilenamesReader &) LLVM_DELETED_FUNCTION;
+
+public:
+  RawCoverageFilenamesReader(StringRef RawData, std::vector<StringRef> &Names)
+      : RawCoverageReader(RawData), Filenames(Names) {}
+
+  std::error_code read();
+};
+
+/// \brief Reader for the raw coverage mapping data of a function.
+class RawCoverageMappingReader : public RawCoverageReader {
+  StringRef FunctionName;
+  ArrayRef<StringRef> TranslationUnitFilenames;
+  std::vector<StringRef> &Filenames;
+  std::vector<CounterExpression> &Expressions;
+  std::vector<CounterMappingRegion> &MappingRegions;
+
+  RawCoverageMappingReader(const RawCoverageMappingReader &)
+      LLVM_DELETED_FUNCTION;
+  RawCoverageMappingReader &
+  operator=(const RawCoverageMappingReader &) LLVM_DELETED_FUNCTION;
+
+public:
+  /// \brief Wrap \p MappingData; the three vectors are held by reference.
+  RawCoverageMappingReader(StringRef FuncName, StringRef MappingData,
+                           ArrayRef<StringRef> TUFilenames,
+                           std::vector<StringRef> &Files,
+                           std::vector<CounterExpression> &Exprs,
+                           std::vector<CounterMappingRegion> &Regions)
+      : RawCoverageReader(MappingData), FunctionName(FuncName),
+        TranslationUnitFilenames(TUFilenames), Filenames(Files),
+        Expressions(Exprs), MappingRegions(Regions) {}
+
+  std::error_code read(CoverageMappingRecord &Record);
+
+private:
+  std::error_code decodeCounter(unsigned Value, Counter &C);
+  std::error_code readCounter(Counter &C);
+  std::error_code
+  readMappingRegionsSubArray(std::vector<CounterMappingRegion> &Regions,
+                             unsigned InferredFileID, size_t NumFileIDs);
+};
+
+/// \brief Reader for the coverage mapping data that is emitted by the
+/// frontend and stored in an object file.
+class ObjectFileCoverageMappingReader {
+public:
+  struct ProfileMappingRecord {
+    CoverageMappingVersion Version;
+    StringRef FunctionName;
+    uint64_t FunctionHash;
+    StringRef CoverageMapping;
+    size_t FilenamesBegin;
+    size_t FilenamesSize;
+
+    ProfileMappingRecord(CoverageMappingVersion Version, StringRef FunctionName,
+                         uint64_t FunctionHash, StringRef CoverageMapping,
+                         size_t FilenamesBegin, size_t FilenamesSize)
+        : Version(Version), FunctionName(FunctionName),
+          FunctionHash(FunctionHash), CoverageMapping(CoverageMapping),
+          FilenamesBegin(FilenamesBegin), FilenamesSize(FilenamesSize) {}
+  };
+
+private:
+  std::error_code LastError;
+  object::OwningBinary<object::ObjectFile> Object;
+  std::vector<StringRef> Filenames;
+  std::vector<ProfileMappingRecord> MappingRecords;
+  size_t CurrentRecord;
+  std::vector<StringRef> FunctionsFilenames;
+  std::vector<CounterExpression> Expressions;
+  std::vector<CounterMappingRegion> MappingRegions;
+
+  ObjectFileCoverageMappingReader(const ObjectFileCoverageMappingReader &)
+      LLVM_DELETED_FUNCTION;
+  ObjectFileCoverageMappingReader &
+  operator=(const ObjectFileCoverageMappingReader &) LLVM_DELETED_FUNCTION;
+
+  /// \brief Set the current error_code and return same.
+  std::error_code error(std::error_code EC) {
+    LastError = EC;
+    return EC;
+  }
+
+  /// \brief Clear the current error code and return a successful one.
+  std::error_code success() { return error(instrprof_error::success); }
+
+public:
+  ObjectFileCoverageMappingReader(StringRef FileName);
+  ObjectFileCoverageMappingReader(
+      std::unique_ptr<MemoryBuffer> &ObjectBuffer,
+      sys::fs::file_magic Type = sys::fs::file_magic::unknown);
+
+  std::error_code readHeader();
+  std::error_code readNextRecord(CoverageMappingRecord &Record);
+
+  /// Iterators over the coverage mapping records.
+  CoverageMappingIterator begin() { return CoverageMappingIterator(this); }
+  CoverageMappingIterator end() { return CoverageMappingIterator(); }
+
+  /// \brief Return true if the last recorded error is instrprof_error::eof.
+  bool isEOF() const { return LastError == instrprof_error::eof; }
+  /// \brief Return true if the reader encountered an error other than
+  /// reaching the end of the data.
+  bool hasError() const { return LastError && !isEOF(); }
+  /// \brief Get the current error code.
+  std::error_code getError() const { return LastError; }
+};
+
+} // end namespace coverage
+} // end namespace llvm
+
+#endif

diff --git a/include/llvm/ProfileData/CoverageMappingWriter.h b/include/llvm/ProfileData/CoverageMappingWriter.h
new file mode 100644
index 0000000..cf16140
--- /dev/null
+++ b/include/llvm/ProfileData/CoverageMappingWriter.h

@@ -0,0 +1,63 @@
+//=-- CoverageMappingWriter.h - Code coverage mapping writer ------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing coverage mapping data for
+// instrumentation based coverage.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PROFILEDATA_COVERAGEMAPPINGWRITER_H
+#define LLVM_PROFILEDATA_COVERAGEMAPPINGWRITER_H
+
+#include "llvm/ProfileData/CoverageMapping.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+namespace coverage {
+
+/// \brief Emits the filenames section used by instrumentation based
+/// code coverage.
+class CoverageFilenamesSectionWriter {
+  ArrayRef<StringRef> Filenames;
+
+public:
+  CoverageFilenamesSectionWriter(ArrayRef<StringRef> Names)
+      : Filenames(Names) {}
+
+  /// \brief Write the encoded filenames to the output stream \p OS.
+  void write(raw_ostream &OS);
+};
+
+/// \brief Emits the coverage mapping data for instrumentation based coverage.
+class CoverageMappingWriter {
+  ArrayRef<unsigned> VirtualFileMapping;
+  ArrayRef<CounterExpression> Expressions;
+  MutableArrayRef<CounterMappingRegion> MappingRegions;
+
+public:
+  CoverageMappingWriter(ArrayRef<unsigned> FileMapping,
+                        ArrayRef<CounterExpression> Exprs,
+                        MutableArrayRef<CounterMappingRegion> Regions)
+      : VirtualFileMapping(FileMapping), Expressions(Exprs),
+        MappingRegions(Regions) {}
+
+  CoverageMappingWriter(ArrayRef<CounterExpression> Exprs,
+                        MutableArrayRef<CounterMappingRegion> Regions)
+      : Expressions(Exprs), MappingRegions(Regions) {}
+
+  /// \brief Write the encoded coverage mapping data to the stream \p OS.
+  void write(raw_ostream &OS);
+};
+
+} // end namespace coverage
+} // end namespace llvm
+
+#endif

diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h
index 7a5a71d..38c5310 100644
--- a/include/llvm/ProfileData/InstrProfReader.h
+++ b/include/llvm/ProfileData/InstrProfReader.h

@@ -12,12 +12,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PROFILEDATA_INSTRPROF_READER_H_
-#define LLVM_PROFILEDATA_INSTRPROF_READER_H_
+#ifndef LLVM_PROFILEDATA_INSTRPROFREADER_H
+#define LLVM_PROFILEDATA_INSTRPROFREADER_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/EndianStream.h"
@@ -94,8 +95,7 @@
 
   /// Factory method to create an appropriately typed reader for the given
   /// instrprof file.
-  static std::error_code create(std::string Path,
-                                std::unique_ptr<InstrProfReader> &Result);
+  static ErrorOr<std::unique_ptr<InstrProfReader>> create(std::string Path);
 };
 
 /// Reader for the simple text based instrprof format.
@@ -120,7 +120,7 @@
     LLVM_DELETED_FUNCTION;
 public:
   TextInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer_)
-      : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, '#') {}
+      : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, true, '#') {}
 
   /// Read the header.
   std::error_code readHeader() override { return success(); }
@@ -206,12 +206,17 @@
 /// Trait for lookups into the on-disk hash table for the binary instrprof
 /// format.
 class InstrProfLookupTrait {
-  std::vector<uint64_t> CountBuffer;
+  std::vector<uint64_t> DataBuffer;
   IndexedInstrProf::HashT HashType;
 public:
   InstrProfLookupTrait(IndexedInstrProf::HashT HashType) : HashType(HashType) {}
 
-  typedef InstrProfRecord data_type;
+  struct data_type {
+    data_type(StringRef Name, ArrayRef<uint64_t> Data)
+        : Name(Name), Data(Data) {}
+    StringRef Name;
+    ArrayRef<uint64_t> Data;
+  };
   typedef StringRef internal_key_type;
   typedef StringRef external_key_type;
   typedef uint64_t hash_value_type;
@@ -234,25 +239,20 @@
     return StringRef((const char *)D, N);
   }
 
-  InstrProfRecord ReadData(StringRef K, const unsigned char *D, offset_type N) {
-    if (N < 2 * sizeof(uint64_t) || N % sizeof(uint64_t)) {
+  data_type ReadData(StringRef K, const unsigned char *D, offset_type N) {
+    DataBuffer.clear();
+    if (N % sizeof(uint64_t))
       // The data is corrupt, don't try to read it.
-      CountBuffer.clear();
-      return InstrProfRecord("", 0, CountBuffer);
-    }
+      return data_type("", DataBuffer);
 
     using namespace support;
-
-    // The first stored value is the hash.
-    uint64_t Hash = endian::readNext<uint64_t, little, unaligned>(D);
-    // Each counter follows.
-    unsigned NumCounters = N / sizeof(uint64_t) - 1;
-    CountBuffer.clear();
-    CountBuffer.reserve(NumCounters - 1);
-    for (unsigned I = 0; I < NumCounters; ++I)
-      CountBuffer.push_back(endian::readNext<uint64_t, little, unaligned>(D));
-
-    return InstrProfRecord(K, Hash, CountBuffer);
+    // We just treat the data as opaque here. It's simpler to handle in
+    // IndexedInstrProfReader.
+    unsigned NumEntries = N / sizeof(uint64_t);
+    DataBuffer.reserve(NumEntries);
+    for (unsigned I = 0; I < NumEntries; ++I)
+      DataBuffer.push_back(endian::readNext<uint64_t, little, unaligned>(D));
+    return data_type(K, DataBuffer);
   }
 };
 typedef OnDiskIterableChainedHashTable<InstrProfLookupTrait>
@@ -267,7 +267,11 @@
   std::unique_ptr<InstrProfReaderIndex> Index;
   /// Iterator over the profile data.
   InstrProfReaderIndex::data_iterator RecordIterator;
-  /// The maximal execution count among all fucntions.
+  /// Offset into our current data set.
+  size_t CurrentOffset;
+  /// The file format version of the profile data.
+  uint64_t FormatVersion;
+  /// The maximal execution count among all functions.
   uint64_t MaxFunctionCount;
 
   IndexedInstrProfReader(const IndexedInstrProfReader &) LLVM_DELETED_FUNCTION;
@@ -275,8 +279,7 @@
     LLVM_DELETED_FUNCTION;
 public:
   IndexedInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
-      : DataBuffer(std::move(DataBuffer)), Index(nullptr),
-        RecordIterator(InstrProfReaderIndex::data_iterator()) {}
+      : DataBuffer(std::move(DataBuffer)), Index(nullptr), CurrentOffset(0) {}
 
   /// Return true if the given buffer is in an indexed instrprof format.
   static bool hasFormat(const MemoryBuffer &DataBuffer);
@@ -287,7 +290,7 @@
   std::error_code readNextRecord(InstrProfRecord &Record) override;
 
   /// Fill Counts with the profile data for the given function name.
-  std::error_code getFunctionCounts(StringRef FuncName, uint64_t &FuncHash,
+  std::error_code getFunctionCounts(StringRef FuncName, uint64_t FuncHash,
                                     std::vector<uint64_t> &Counts);
   /// Return the maximum of all known function counts.
   uint64_t getMaximumFunctionCount() { return MaxFunctionCount; }
@@ -299,4 +302,4 @@
 
 } // end namespace llvm
 
-#endif // LLVM_PROFILEDATA_INSTRPROF_READER_H_
+#endif

diff --git a/include/llvm/ProfileData/InstrProfWriter.h b/include/llvm/ProfileData/InstrProfWriter.h
index 6e68bee..e76f668 100644
--- a/include/llvm/ProfileData/InstrProfWriter.h
+++ b/include/llvm/ProfileData/InstrProfWriter.h

@@ -12,10 +12,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PROFILEDATA_INSTRPROF_WRITER_H_
-#define LLVM_PROFILEDATA_INSTRPROF_WRITER_H_
+#ifndef LLVM_PROFILEDATA_INSTRPROFWRITER_H
+#define LLVM_PROFILEDATA_INSTRPROFWRITER_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/DataTypes.h"
@@ -28,13 +29,13 @@
 /// Writer for instrumentation based profile data.
 class InstrProfWriter {
 public:
-  struct CounterData {
-    uint64_t Hash;
-    std::vector<uint64_t> Counts;
-  };
+  typedef SmallDenseMap<uint64_t, std::vector<uint64_t>, 1> CounterData;
 private:
   StringMap<CounterData> FunctionData;
+  uint64_t MaxFunctionCount;
 public:
+  InstrProfWriter() : MaxFunctionCount(0) {}
+
   /// Add function counts for the given function. If there are already counts
   /// for this function and the hash and number of counts match, each counter is
   /// summed.
@@ -47,4 +48,4 @@
 
 } // end namespace llvm
 
-#endif // LLVM_PROFILE_INSTRPROF_WRITER_H_
+#endif

diff --git a/include/llvm/ProfileData/SampleProf.h b/include/llvm/ProfileData/SampleProf.h
new file mode 100644
index 0000000..5c70b31
--- /dev/null
+++ b/include/llvm/ProfileData/SampleProf.h

@@ -0,0 +1,248 @@
+//=-- SampleProf.h - Sampling profiling format support --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains common definitions used in the reading and writing of
+// sample profile data.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_PROFILEDATA_SAMPLEPROF_H_
+#define LLVM_PROFILEDATA_SAMPLEPROF_H_
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <system_error>
+
+namespace llvm {
+
+const std::error_category &sampleprof_category();
+
+enum class sampleprof_error {
+  success = 0, // Zero, so the resulting std::error_code tests as false.
+  bad_magic,
+  unsupported_version,
+  too_large,
+  truncated,
+  malformed,
+  unrecognized_format
+};
+
+inline std::error_code make_error_code(sampleprof_error E) {
+  return std::error_code(static_cast<int>(E), sampleprof_category());
+}
+
+} // end namespace llvm
+
+namespace std {
+template <>
+struct is_error_code_enum<llvm::sampleprof_error> : std::true_type {};
+} // end namespace std
+
+namespace llvm {
+
+namespace sampleprof {
+
+static inline uint64_t SPMagic() {
+  uint64_t Magic = 0;
+  for (const char *C = "SPROF42"; *C; ++C)
+    Magic = (Magic << 8) | uint64_t(*C);
+  return (Magic << 8) | uint64_t(0xff);
+}
+
+static inline uint64_t SPVersion() { return 100; }
+
+/// \brief Represents the relative location of an instruction.
+///
+/// A location is the line offset from the beginning of the function
+/// (the line holding the function header) together with the
+/// discriminator value within that line.
+///
+/// Discriminators distinguish instructions on the same line that belong to
+/// different basic blocks (e.g., both increments in "if (p) x++; else y++;").
+struct LineLocation {
+  LineLocation(int Offset, unsigned Disc)
+      : LineOffset(Offset), Discriminator(Disc) {}
+  int LineOffset;
+  unsigned Discriminator;
+};
+
+} // End namespace sampleprof
+
+template <> struct DenseMapInfo<sampleprof::LineLocation> {
+  typedef DenseMapInfo<int> OffsetInfo;
+  typedef DenseMapInfo<unsigned> DiscriminatorInfo;
+  static inline sampleprof::LineLocation getEmptyKey() {
+    return sampleprof::LineLocation(OffsetInfo::getEmptyKey(),
+                                    DiscriminatorInfo::getEmptyKey());
+  }
+  static inline sampleprof::LineLocation getTombstoneKey() {
+    return sampleprof::LineLocation(OffsetInfo::getTombstoneKey(),
+                                    DiscriminatorInfo::getTombstoneKey());
+  }
+  static inline unsigned getHashValue(sampleprof::LineLocation Val) {
+    typedef std::pair<int, unsigned> KeyPair;
+    KeyPair Key(Val.LineOffset, Val.Discriminator);
+    return DenseMapInfo<KeyPair>::getHashValue(Key);
+  }
+  static inline bool isEqual(sampleprof::LineLocation A,
+                             sampleprof::LineLocation B) {
+    return A.LineOffset == B.LineOffset && A.Discriminator == B.Discriminator;
+  }
+};
+
+namespace sampleprof {
+
+/// \brief Representation of a single sample record.
+///
+/// A sample record is represented by an integer value, which indicates
+/// how frequently the associated line location was executed.
+///
+/// Additionally, if the associated location contains a function call,
+/// the record will hold a list of all the possible called targets. For
+/// direct calls, this will be the exact function being invoked. For
+/// indirect calls (function pointers, virtual table dispatch), this
+/// will be a list of one or more functions.
+class SampleRecord {
+public:
+  typedef StringMap<unsigned> CallTargetMap;
+
+  SampleRecord() : NumSamples(0) {}
+
+  /// \brief Increment the number of samples for this record by \p S.
+  ///
+  /// The count saturates at the largest unsigned value instead of
+  /// wrapping around.
+  void addSamples(unsigned S) {
+    const unsigned Max = std::numeric_limits<unsigned>::max();
+    // Clamp rather than wrap when the sum would exceed Max.
+    const bool Overflows = S > Max - NumSamples;
+    NumSamples = Overflows ? Max : NumSamples + S;
+  }
+
+  /// \brief Add called function \p F with samples \p S.
+  ///
+  /// The per-target count saturates at the largest unsigned value instead
+  /// of wrapping around.
+  void addCalledTarget(StringRef F, unsigned S) {
+    const unsigned Max = std::numeric_limits<unsigned>::max();
+    // Looking up F through operator[] adds an entry for a new target.
+    unsigned &Count = CallTargets[F];
+    const bool Overflows = S > Max - Count;
+    Count = Overflows ? Max : Count + S;
+  }
+
+  /// \brief Return true if at least one call target has been recorded.
+  bool hasCalls() const { return !CallTargets.empty(); }
+
+  unsigned getSamples() const { return NumSamples; }
+  const CallTargetMap &getCallTargets() const { return CallTargets; }
+
+  /// \brief Fold the samples and call targets of \p Other into this record.
+  void merge(const SampleRecord &Other) {
+    addSamples(Other.getSamples());
+    for (const auto &Target : Other.getCallTargets())
+      addCalledTarget(Target.first(), Target.second);
+  }
+
+private:
+  unsigned NumSamples;
+  CallTargetMap CallTargets;
+};
+
+typedef DenseMap<LineLocation, SampleRecord> BodySampleMap;
+
+/// \brief Representation of the samples collected for a function.
+///
+/// This data structure contains all the collected samples for the body
+/// of a function. Each sample corresponds to a LineLocation instance
+/// within the body of the function.
+class FunctionSamples {
+public:
+  FunctionSamples() : TotalSamples(0), TotalHeadSamples(0) {}
+  void print(raw_ostream &OS = dbgs());
+  // Both running totals saturate at the maximum value instead of wrapping.
+  void addTotalSamples(unsigned Num) { saturatingAdd(TotalSamples, Num); }
+  void addHeadSamples(unsigned Num) { saturatingAdd(TotalHeadSamples, Num); }
+  void addBodySamples(int LineOffset, unsigned Discriminator, unsigned Num) {
+    assert(LineOffset >= 0);
+    // Instruction weights use zero to mean "no sample". If the profile
+    // file contains an actual zero, record 1 instead so the two cases
+    // cannot be confused later on.
+    if (Num == 0)
+      Num = 1;
+    BodySamples[LineLocation(LineOffset, Discriminator)].addSamples(Num);
+  }
+  void addCalledTargetSamples(int LineOffset, unsigned Discriminator,
+                              std::string FName, unsigned Num) {
+    assert(LineOffset >= 0);
+    BodySamples[LineLocation(LineOffset, Discriminator)].addCalledTarget(FName,
+                                                                         Num);
+  }
+
+  /// \brief Return the sample record at the given location.
+  /// An empty record is created if \p Loc has not been seen before.
+  SampleRecord &sampleRecordAt(const LineLocation &Loc) {
+    return BodySamples[Loc];
+  }
+
+  /// \brief Return the number of samples collected at the given location.
+  /// Each location is specified by \p LineOffset and \p Discriminator.
+  unsigned samplesAt(int LineOffset, unsigned Discriminator) {
+    return sampleRecordAt(LineLocation(LineOffset, Discriminator)).getSamples();
+  }
+
+  bool empty() const { return BodySamples.empty(); }
+
+  /// \brief Return the total number of samples collected inside the function.
+  unsigned getTotalSamples() const { return TotalSamples; }
+
+  /// \brief Return the number of samples collected at the function head.
+  unsigned getHeadSamples() const { return TotalHeadSamples; }
+
+  /// \brief Return all the samples collected in the body of the function.
+  const BodySampleMap &getBodySamples() const { return BodySamples; }
+
+  /// \brief Merge the samples in \p Other into this one.
+  void merge(const FunctionSamples &Other) {
+    addTotalSamples(Other.getTotalSamples());
+    addHeadSamples(Other.getHeadSamples());
+    for (const auto &I : Other.getBodySamples())
+      sampleRecordAt(I.first).merge(I.second);
+  }
+
+private:
+  /// \brief Add \p Num to \p Total, clamping at the largest unsigned value.
+  static void saturatingAdd(unsigned &Total, unsigned Num) {
+    const unsigned Max = std::numeric_limits<unsigned>::max();
+    Total = (Num > Max - Total) ? Max : Total + Num;
+  }
+
+  /// \brief Total number of samples collected inside this function,
+  /// cumulative over this function and all of its inlined callees.
+  unsigned TotalSamples;
+
+  /// \brief Total number of samples collected at the head of the function.
+  unsigned TotalHeadSamples;
+
+  /// \brief Map instruction locations to collected samples.
+  ///
+  /// Each entry in this map contains the number of samples
+  /// collected at the corresponding line offset. All line locations
+  /// are an offset from the start of the function.
+  BodySampleMap BodySamples;
+};
+
+} // End namespace sampleprof
+
+} // End namespace llvm
+
+#endif // LLVM_PROFILEDATA_SAMPLEPROF_H_

diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h
new file mode 100644
index 0000000..c20b815
--- /dev/null
+++ b/include/llvm/ProfileData/SampleProfReader.h

@@ -0,0 +1,170 @@
+//===- SampleProfReader.h - Read LLVM sample profile data -----------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains definitions needed for reading sample profiles.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_PROFILEDATA_SAMPLEPROFREADER_H
+#define LLVM_PROFILEDATA_SAMPLEPROFREADER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+namespace sampleprof {
+
+/// \brief Sample-based profile reader.
+///
+/// Each profile contains sample counts for all the functions
+/// executed. Inside each function, statements are annotated with the
+/// collected samples on all the instructions associated with that
+/// statement.
+///
+/// For this to produce meaningful data, the program needs to be
+/// compiled with some debug information (at minimum, line numbers:
+/// -gline-tables-only). Otherwise, it will be impossible to match IR
+/// instructions to the line numbers collected by the profiler.
+///
+/// From the profile file, we are interested in collecting the
+/// following information:
+///
+/// * A list of functions included in the profile (mangled names).
+///
+/// * For each function F:
+///   1. The total number of samples collected in F.
+///
+///   2. The samples collected at each line in F. To provide some
+///      protection against source code shuffling, line numbers should
+///      be relative to the start of the function.
+///
+/// The reader supports two file formats: text and binary. The text format
+/// is useful for debugging and testing, while the binary format is more
+/// compact. They can both be used interchangeably.
+class SampleProfileReader {
+public:
+  SampleProfileReader(std::unique_ptr<MemoryBuffer> B, LLVMContext &C)
+      : Profiles(0), Ctx(C), Buffer(std::move(B)) {}
+
+  virtual ~SampleProfileReader() {}
+
+  /// \brief Read and validate the file header.
+  virtual std::error_code readHeader() = 0;
+
+  /// \brief Read sample profiles from the associated file.
+  virtual std::error_code read() = 0;
+
+  /// \brief Print the profile for \p FName on stream \p OS.
+  void dumpFunctionProfile(StringRef FName, raw_ostream &OS = dbgs());
+
+  /// \brief Print all the profiles on stream \p OS.
+  void dump(raw_ostream &OS = dbgs());
+
+  /// \brief Return the samples for \p F, keyed by name (added if not present).
+  FunctionSamples *getSamplesFor(const Function &F) {
+    return &Profiles[F.getName()];
+  }
+
+  /// \brief Return all the profiles.
+  StringMap<FunctionSamples> &getProfiles() { return Profiles; }
+
+  /// \brief Report parse error \p Msg at \p LineNumber as a diagnostic on Ctx.
+  void reportParseError(int64_t LineNumber, Twine Msg) const {
+    Ctx.diagnose(DiagnosticInfoSampleProfile(Buffer->getBufferIdentifier(),
+                                             LineNumber, Msg));
+  }
+
+  /// \brief Create a sample profile reader appropriate to the file format.
+  static ErrorOr<std::unique_ptr<SampleProfileReader>>
+  create(StringRef Filename, LLVMContext &C);
+
+protected:
+  /// \brief Map every function to its associated profile.
+  ///
+  /// The profile of every function executed at runtime is collected
+  /// in the structure FunctionSamples. This maps function names
+  /// to their corresponding profiles.
+  StringMap<FunctionSamples> Profiles;
+
+  /// \brief LLVM context used to emit diagnostics.
+  LLVMContext &Ctx;
+
+  /// \brief Memory buffer holding the profile file.
+  std::unique_ptr<MemoryBuffer> Buffer;
+};
+
+class SampleProfileReaderText : public SampleProfileReader {
+public:
+  SampleProfileReaderText(std::unique_ptr<MemoryBuffer> B, LLVMContext &C)
+      : SampleProfileReader(std::move(B), C) {}
+
+  /// \brief Header hook for the text reader: always reports success.
+  std::error_code readHeader() override { return sampleprof_error::success; }
+
+  /// \brief Read sample profiles from the associated file.
+  std::error_code read() override;
+};
+
+class SampleProfileReaderBinary : public SampleProfileReader {
+public:
+  SampleProfileReaderBinary(std::unique_ptr<MemoryBuffer> B, LLVMContext &C)
+      : SampleProfileReader(std::move(B), C), Data(nullptr), End(nullptr) {}
+
+  /// \brief Read and validate the file header.
+  std::error_code readHeader() override;
+
+  /// \brief Read sample profiles from the associated file.
+  std::error_code read() override;
+
+  /// \brief Return true if \p Buffer is in the format supported by this class.
+  static bool hasFormat(const MemoryBuffer &Buffer);
+
+protected:
+  /// \brief Read a numeric value of type T from the profile.
+  ///
+  /// If an error occurs during decoding, a diagnostic message is emitted and
+  /// the failure is carried by the returned ErrorOr (there is no EC argument).
+  ///
+  /// \returns the read value, or an error code if decoding failed.
+  template <typename T> ErrorOr<T> readNumber();
+
+  /// \brief Read a string from the profile.
+  ///
+  /// If an error occurs during decoding, a diagnostic message is emitted and
+  /// the failure is carried by the returned ErrorOr (there is no EC argument).
+  ///
+  /// \returns the read value, or an error code if decoding failed.
+  ErrorOr<StringRef> readString();
+
+  /// \brief Return true once Data has reached (or moved past) End.
+  bool at_eof() const { return Data >= End; }
+
+  /// \brief Points to the current location in the buffer.
+  const uint8_t *Data;
+
+  /// \brief Points to the end of the buffer.
+  const uint8_t *End;
+};
+
+} // End namespace sampleprof
+
+} // End namespace llvm
+
+#endif // LLVM_PROFILEDATA_SAMPLEPROFREADER_H

diff --git a/include/llvm/ProfileData/SampleProfWriter.h b/include/llvm/ProfileData/SampleProfWriter.h
new file mode 100644
index 0000000..302a82d
--- /dev/null
+++ b/include/llvm/ProfileData/SampleProfWriter.h

@@ -0,0 +1,110 @@
+//===- SampleProfWriter.h - Write LLVM sample profile data ----------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains definitions needed for writing sample profiles.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_PROFILEDATA_SAMPLEPROFWRITER_H
+#define LLVM_PROFILEDATA_SAMPLEPROFWRITER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+namespace sampleprof {
+
+enum SampleProfileFormat { SPF_None = 0, SPF_Text, SPF_Binary, SPF_GCC };
+
+/// \brief Sample-based profile writer. Base class.
+class SampleProfileWriter {
+public:
+  SampleProfileWriter(StringRef Filename, std::error_code &EC,
+                      sys::fs::OpenFlags Flags)
+      : OS(Filename, EC, Flags) {}
+  virtual ~SampleProfileWriter() {}
+
+  /// \brief Emit the samples \p S recorded for the function named \p FName.
+  ///
+  /// \returns true on success, false if the profile could not be written.
+  virtual bool write(StringRef FName, const FunctionSamples &S) = 0;
+
+  /// \brief Emit the samples \p S recorded for function \p F.
+  ///
+  /// Forwards to the name-based overload using F.getName().
+  bool write(const Function &F, const FunctionSamples &S) {
+    StringRef FName = F.getName();
+    return write(FName, S);
+  }
+
+  /// \brief Emit the profile of every function in \p M, looked up in \p P.
+  ///
+  /// \returns true if every write succeeded; stops at the first failure.
+  bool write(const Module &M, StringMap<FunctionSamples> &P) {
+    for (const auto &F : M)
+      if (!write(F.getName(), P[F.getName()]))
+        return false;
+    return true;
+  }
+
+  /// \brief Emit every profile stored in \p ProfileMap.
+  ///
+  /// \returns true if every write succeeded; stops at the first failure.
+  bool write(StringMap<FunctionSamples> &ProfileMap) {
+    for (auto &Entry : ProfileMap)
+      if (!write(Entry.first(), Entry.second))
+        return false;
+    return true;
+  }
+
+  /// \brief Profile writer factory. Create a new writer based on the value of
+  /// \p Format.
+  static ErrorOr<std::unique_ptr<SampleProfileWriter>>
+  create(StringRef Filename, SampleProfileFormat Format);
+
+protected:
+  /// \brief Stream the profile is written to.
+  ///
+  /// Opened by the constructor from the given file name and flags.
+  raw_fd_ostream OS;
+};
+
+/// \brief Sample-based profile writer (text format).
+class SampleProfileWriterText : public SampleProfileWriter {
+public:
+  SampleProfileWriterText(StringRef F, std::error_code &EC)
+      : SampleProfileWriter(F, EC, sys::fs::F_Text) {}
+
+  bool write(StringRef FName, const FunctionSamples &S) override;
+
+  // Re-expose the base-class overloads that the override above hides.
+  using SampleProfileWriter::write;
+};
+
+/// \brief Sample-based profile writer (binary format).
+class SampleProfileWriterBinary : public SampleProfileWriter {
+public:
+  SampleProfileWriterBinary(StringRef F, std::error_code &EC);
+
+  bool write(StringRef F, const FunctionSamples &S) override;
+
+  // Re-expose the base-class overloads that the override above hides.
+  using SampleProfileWriter::write;
+};
+
+} // End namespace sampleprof
+
+} // End namespace llvm
+
+#endif // LLVM_PROFILEDATA_SAMPLEPROFWRITER_H

diff --git a/include/llvm/Support/ARMBuildAttributes.h b/include/llvm/Support/ARMBuildAttributes.h
index f63e0a6..07340de 100644
--- a/include/llvm/Support/ARMBuildAttributes.h
+++ b/include/llvm/Support/ARMBuildAttributes.h

@@ -16,8 +16,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_ARM_BUILD_ATTRIBUTES_H
-#define LLVM_SUPPORT_ARM_BUILD_ATTRIBUTES_H
+#ifndef LLVM_SUPPORT_ARMBUILDATTRIBUTES_H
+#define LLVM_SUPPORT_ARMBUILDATTRIBUTES_H
 
 namespace llvm {
 class StringRef;
@@ -146,6 +146,12 @@
   AllowNeon2 = 2,     // SIMDv2 was permitted (Half-precision FP, MAC operations)
   AllowNeonARMv8 = 3, // ARM v8-A SIMD was permitted
 
+  // Tag_ABI_PCS_R9_use, (=14), uleb128
+  R9IsGPR = 0,        // R9 used as v6 (just another callee-saved register)
+  R9IsSB = 1,         // R9 used as a global static base register
+  R9IsTLSPointer = 2, // R9 used as a thread local storage pointer
+  R9Reserved = 3,     // R9 not used by code associated with attributed entity
+
   // Tag_ABI_PCS_RW_data, (=15), uleb128
   AddressRWPCRel = 1, // Address RW static data PC-relative
   AddressRWSBRel = 2, // Address RW static data SB-relative
@@ -214,4 +220,4 @@
 } // namespace ARMBuildAttrs
 } // namespace llvm
 
-#endif // LLVM_SUPPORT_ARM_BUILD_ATTRIBUTES_H
+#endif

diff --git a/include/llvm/Support/ARMEHABI.h b/include/llvm/Support/ARMEHABI.h
index c7ac54a..9b052df 100644
--- a/include/llvm/Support/ARMEHABI.h
+++ b/include/llvm/Support/ARMEHABI.h

@@ -19,8 +19,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_ARM_EHABI_H
-#define LLVM_SUPPORT_ARM_EHABI_H
+#ifndef LLVM_SUPPORT_ARMEHABI_H
+#define LLVM_SUPPORT_ARMEHABI_H
 
 namespace llvm {
 namespace ARM {
@@ -131,4 +131,4 @@
 }
 }
 
-#endif // ARM_UNWIND_OP_H
+#endif

diff --git a/include/llvm/Support/ARMWinEH.h b/include/llvm/Support/ARMWinEH.h
index 78deb8d..1463629 100644
--- a/include/llvm/Support/ARMWinEH.h
+++ b/include/llvm/Support/ARMWinEH.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_WINARMEH_H
-#define LLVM_SUPPORT_WINARMEH_H
+#ifndef LLVM_SUPPORT_ARMWINEH_H
+#define LLVM_SUPPORT_ARMWINEH_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/Endian.h"
@@ -350,16 +350,15 @@
   ArrayRef<support::ulittle32_t> EpilogueScopes() const {
     assert(E() == 0 && "epilogue scopes are only present when the E bit is 0");
     size_t Offset = HeaderWords(*this);
-    return ArrayRef<support::ulittle32_t>(&Data[Offset], EpilogueCount());
+    return makeArrayRef(&Data[Offset], EpilogueCount());
   }
 
-  ArrayRef<support::ulittle8_t> UnwindByteCode() const {
+  ArrayRef<uint8_t> UnwindByteCode() const {
     const size_t Offset = HeaderWords(*this)
                         + (E() ? 0 :  EpilogueCount());
-    const support::ulittle8_t *ByteCode =
-      reinterpret_cast<const support::ulittle8_t *>(&Data[Offset]);
-    return ArrayRef<support::ulittle8_t>(ByteCode,
-                                         CodeWords() * sizeof(uint32_t));
+    const uint8_t *ByteCode =
+      reinterpret_cast<const uint8_t *>(&Data[Offset]);
+    return makeArrayRef(ByteCode, CodeWords() * sizeof(uint32_t));
   }
 
   uint32_t ExceptionHandlerRVA() const {
@@ -381,4 +380,3 @@
 }
 
 #endif
-

diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h
index 7a7e4c0..de31771 100644
--- a/include/llvm/Support/Allocator.h
+++ b/include/llvm/Support/Allocator.h

@@ -90,7 +90,10 @@
 public:
   void Reset() {}
 
-  void *Allocate(size_t Size, size_t /*Alignment*/) { return malloc(Size); }
+  LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t Size,
+                                                size_t /*Alignment*/) {
+    return malloc(Size);
+  }
 
   // Pull in base class overloads.
   using AllocatorBase<MallocAllocator>::Allocate;
@@ -116,8 +119,8 @@
 /// \brief Allocate memory in an ever growing pool, as if by bump-pointer.
 ///
 /// This isn't strictly a bump-pointer allocator as it uses backing slabs of
-/// memory rather than relying on boundless contiguous heap. However, it has
-/// bump-pointer semantics in that is a monotonically growing pool of memory
+/// memory rather than relying on a boundless contiguous heap. However, it has
+/// bump-pointer semantics in that it is a monotonically growing pool of memory
 /// where every allocation is found by merely allocating the next N bytes in
 /// the slab, or the next N bytes in the next slab.
 ///
@@ -200,28 +203,24 @@
   }
 
   /// \brief Allocate space at the specified alignment.
-  void *Allocate(size_t Size, size_t Alignment) {
-    if (!CurPtr) // Start a new slab if we haven't allocated one already.
-      StartNewSlab();
+  LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t Size, size_t Alignment) {
+    assert(Alignment > 0 && "0-byte alignnment is not allowed. Use 1 instead.");
 
     // Keep track of how many bytes we've allocated.
     BytesAllocated += Size;
 
-    // 0-byte alignment means 1-byte alignment.
-    if (Alignment == 0)
-      Alignment = 1;
+    size_t Adjustment = alignmentAdjustment(CurPtr, Alignment);
+    assert(Adjustment + Size >= Size && "Adjustment + Size must not overflow");
 
-    // Allocate the aligned space, going forwards from CurPtr.
-    char *Ptr = alignPtr(CurPtr, Alignment);
-
-    // Check if we can hold it.
-    if (Ptr + Size <= End) {
-      CurPtr = Ptr + Size;
+    // Check if we have enough space.
+    if (Adjustment + Size <= size_t(End - CurPtr)) {
+      char *AlignedPtr = CurPtr + Adjustment;
+      CurPtr = AlignedPtr + Size;
       // Update the allocation point of this memory block in MemorySanitizer.
       // Without this, MemorySanitizer messages for values originated from here
       // will point to the allocation of the entire slab.
-      __msan_allocated_memory(Ptr, Size);
-      return Ptr;
+      __msan_allocated_memory(AlignedPtr, Size);
+      return AlignedPtr;
     }
 
     // If Size is really big, allocate a separate slab for it.
@@ -230,19 +229,22 @@
       void *NewSlab = Allocator.Allocate(PaddedSize, 0);
       CustomSizedSlabs.push_back(std::make_pair(NewSlab, PaddedSize));
 
-      Ptr = alignPtr((char *)NewSlab, Alignment);
-      assert((uintptr_t)Ptr + Size <= (uintptr_t)NewSlab + PaddedSize);
-      __msan_allocated_memory(Ptr, Size);
-      return Ptr;
+      uintptr_t AlignedAddr = alignAddr(NewSlab, Alignment);
+      assert(AlignedAddr + Size <= (uintptr_t)NewSlab + PaddedSize);
+      char *AlignedPtr = (char*)AlignedAddr;
+      __msan_allocated_memory(AlignedPtr, Size);
+      return AlignedPtr;
     }
 
     // Otherwise, start a new slab and try again.
     StartNewSlab();
-    Ptr = alignPtr(CurPtr, Alignment);
-    CurPtr = Ptr + Size;
-    assert(CurPtr <= End && "Unable to allocate memory!");
-    __msan_allocated_memory(Ptr, Size);
-    return Ptr;
+    uintptr_t AlignedAddr = alignAddr(CurPtr, Alignment);
+    assert(AlignedAddr + Size <= (uintptr_t)End &&
+           "Unable to allocate memory!");
+    char *AlignedPtr = (char*)AlignedAddr;
+    CurPtr = AlignedPtr + Size;
+    __msan_allocated_memory(AlignedPtr, Size);
+    return AlignedPtr;
   }
 
   // Pull in base class overloads.
@@ -320,8 +322,10 @@
 #ifndef NDEBUG
       // Poison the memory so stale pointers crash sooner.  Note we must
       // preserve the Size and NextPtr fields at the beginning.
-      sys::Memory::setRangeWritable(*I, AllocatedSlabSize);
-      memset(*I, 0xCD, AllocatedSlabSize);
+      if (AllocatedSlabSize != 0) {
+        sys::Memory::setRangeWritable(*I, AllocatedSlabSize);
+        memset(*I, 0xCD, AllocatedSlabSize);
+      }
 #endif
       Allocator.Deallocate(*I, AllocatedSlabSize);
     }
@@ -373,7 +377,7 @@
   /// all memory allocated so far.
   void DestroyAll() {
     auto DestroyElements = [](char *Begin, char *End) {
-      assert(Begin == alignPtr(Begin, alignOf<T>()));
+      assert(Begin == (char*)alignAddr(Begin, alignOf<T>()));
       for (char *Ptr = Begin; Ptr + sizeof(T) <= End; Ptr += sizeof(T))
         reinterpret_cast<T *>(Ptr)->~T();
     };
@@ -382,7 +386,7 @@
          ++I) {
       size_t AllocatedSlabSize = BumpPtrAllocator::computeSlabSize(
           std::distance(Allocator.Slabs.begin(), I));
-      char *Begin = alignPtr((char *)*I, alignOf<T>());
+      char *Begin = (char*)alignAddr(*I, alignOf<T>());
       char *End = *I == Allocator.Slabs.back() ? Allocator.CurPtr
                                                : (char *)*I + AllocatedSlabSize;
 
@@ -392,7 +396,7 @@
     for (auto &PtrAndSize : Allocator.CustomSizedSlabs) {
       void *Ptr = PtrAndSize.first;
       size_t Size = PtrAndSize.second;
-      DestroyElements(alignPtr((char *)Ptr, alignOf<T>()), (char *)Ptr + Size);
+      DestroyElements((char*)alignAddr(Ptr, alignOf<T>()), (char *)Ptr + Size);
     }
 
     Allocator.Reset();

diff --git a/include/llvm/Support/CBindingWrapping.h b/include/llvm/Support/CBindingWrapping.h
index 51097b8..786ba18 100644
--- a/include/llvm/Support/CBindingWrapping.h
+++ b/include/llvm/Support/CBindingWrapping.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_C_BINDING_WRAPPING_H
-#define LLVM_C_BINDING_WRAPPING_H
+#ifndef LLVM_SUPPORT_CBINDINGWRAPPING_H
+#define LLVM_SUPPORT_CBINDINGWRAPPING_H
 
 #include "llvm/Support/Casting.h"
 

diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h
index e09ef07..150bce5 100644
--- a/include/llvm/Support/COFF.h
+++ b/include/llvm/Support/COFF.h

@@ -31,23 +31,30 @@
 namespace COFF {
 
   // The maximum number of sections that a COFF object can have (inclusive).
-  const int MaxNumberOfSections = 65299;
+  const int32_t MaxNumberOfSections16 = 65279;
 
   // The PE signature bytes that follows the DOS stub header.
   static const char PEMagic[] = { 'P', 'E', '\0', '\0' };
 
+  static const char BigObjMagic[] = {
+      '\xc7', '\xa1', '\xba', '\xd1', '\xee', '\xba', '\xa9', '\x4b',
+      '\xaf', '\x20', '\xfa', '\xf6', '\x6a', '\xa4', '\xdc', '\xb8',
+  };
+
   // Sizes in bytes of various things in the COFF format.
   enum {
-    HeaderSize     = 20,
+    Header16Size   = 20,
+    Header32Size   = 56,
     NameSize       = 8,
-    SymbolSize     = 18,
+    Symbol16Size   = 18,
+    Symbol32Size   = 20,
     SectionSize    = 40,
     RelocationSize = 10
   };
 
   struct header {
     uint16_t Machine;
-    uint16_t NumberOfSections;
+    int32_t  NumberOfSections;
     uint32_t TimeDateStamp;
     uint32_t PointerToSymbolTable;
     uint32_t NumberOfSymbols;
@@ -55,6 +62,24 @@
     uint16_t Characteristics;
   };
 
+  struct BigObjHeader {
+    enum : uint16_t { MinBigObjectVersion = 2 };
+
+    uint16_t Sig1; ///< Must be IMAGE_FILE_MACHINE_UNKNOWN (0).
+    uint16_t Sig2; ///< Must be 0xFFFF.
+    uint16_t Version;
+    uint16_t Machine;
+    uint32_t TimeDateStamp;
+    uint8_t  UUID[16];
+    uint32_t unused1;
+    uint32_t unused2;
+    uint32_t unused3;
+    uint32_t unused4;
+    uint32_t NumberOfSections;
+    uint32_t PointerToSymbolTable;
+    uint32_t NumberOfSymbols;
+  };
+
   enum MachineTypes {
     MT_Invalid = 0xffff,
 
@@ -124,7 +149,7 @@
   struct symbol {
     char     Name[NameSize];
     uint32_t Value;
-    uint16_t SectionNumber;
+    int32_t  SectionNumber;
     uint16_t Type;
     uint8_t  StorageClass;
     uint8_t  NumberOfAuxSymbols;
@@ -140,9 +165,9 @@
     SF_WeakExternal = 0x01000000
   };
 
-  enum SymbolSectionNumber {
-    IMAGE_SYM_DEBUG     = 0xFFFE,
-    IMAGE_SYM_ABSOLUTE  = 0xFFFF,
+  enum SymbolSectionNumber : int32_t {
+    IMAGE_SYM_DEBUG     = -2,
+    IMAGE_SYM_ABSOLUTE  = -1,
     IMAGE_SYM_UNDEFINED = 0
   };
 
@@ -367,18 +392,14 @@
     IMAGE_WEAK_EXTERN_SEARCH_ALIAS     = 3
   };
 
-  struct AuxiliaryFile {
-    uint8_t FileName[18];
-  };
-
   struct AuxiliarySectionDefinition {
     uint32_t Length;
     uint16_t NumberOfRelocations;
     uint16_t NumberOfLinenumbers;
     uint32_t CheckSum;
-    uint16_t Number;
+    uint32_t Number;
     uint8_t  Selection;
-    char     unused[3];
+    char     unused;
   };
 
   struct AuxiliaryCLRToken {
@@ -392,7 +413,6 @@
     AuxiliaryFunctionDefinition FunctionDefinition;
     AuxiliarybfAndefSymbol      bfAndefSymbol;
     AuxiliaryWeakExternal       WeakExternal;
-    AuxiliaryFile               File;
     AuxiliarySectionDefinition  SectionDefinition;
   };
 
@@ -495,12 +515,14 @@
     uint32_t SizeOfHeaders;
     uint32_t CheckSum;
     uint16_t Subsystem;
+    // FIXME: This should be DllCharacteristics to match the COFF spec.
     uint16_t DLLCharacteristics;
     uint32_t SizeOfStackReserve;
     uint32_t SizeOfStackCommit;
     uint32_t SizeOfHeapReserve;
     uint32_t SizeOfHeapCommit;
     uint32_t LoaderFlags;
+    // FIXME: This should be NumberOfRvaAndSizes to match the COFF spec.
     uint32_t NumberOfRvaAndSize;
   };
 
@@ -524,7 +546,9 @@
     BOUND_IMPORT,
     IAT,
     DELAY_IMPORT_DESCRIPTOR,
-    CLR_RUNTIME_HEADER
+    CLR_RUNTIME_HEADER,
+
+    NUM_DATA_DIRECTORIES
   };
 
   enum WindowsSubsystem {
@@ -642,13 +666,18 @@
 
   enum CodeViewLineTableIdentifiers {
     DEBUG_SECTION_MAGIC           = 0x4,
+    DEBUG_SYMBOL_SUBSECTION       = 0xF1,
     DEBUG_LINE_TABLE_SUBSECTION   = 0xF2,
     DEBUG_STRING_TABLE_SUBSECTION = 0xF3,
-    DEBUG_INDEX_SUBSECTION        = 0xF4
+    DEBUG_INDEX_SUBSECTION        = 0xF4,
+
+    // Symbol subsections are split into records of different types.
+    DEBUG_SYMBOL_TYPE_PROC_START = 0x1147,
+    DEBUG_SYMBOL_TYPE_PROC_END   = 0x114F
   };
 
-  inline bool isReservedSectionNumber(int N) {
-    return N == IMAGE_SYM_UNDEFINED || N > MaxNumberOfSections;
+  inline bool isReservedSectionNumber(int32_t SectionNumber) {
+    return SectionNumber <= 0;
   }
 
 } // End namespace COFF.

diff --git a/include/llvm/Support/CodeGen.h b/include/llvm/Support/CodeGen.h
index 240eba6..243f2dd 100644
--- a/include/llvm/Support/CodeGen.h
+++ b/include/llvm/Support/CodeGen.h

@@ -30,6 +30,10 @@
     enum Model { Default, JITDefault, Small, Kernel, Medium, Large };
   }
 
+  namespace PICLevel {
+    enum Level { Default=0, Small=1, Large=2 };
+  }
+
   // TLS models.
   namespace TLSModel {
     enum Model {

diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index 5cb5501..2b5c9c5 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h

@@ -270,8 +270,8 @@
 
   // addOccurrence - Wrapper around handleOccurrence that enforces Flags.
   //
-  bool addOccurrence(unsigned pos, StringRef ArgName,
-                     StringRef Value, bool MultiArg = false);
+  virtual bool addOccurrence(unsigned pos, StringRef ArgName,
+                             StringRef Value, bool MultiArg = false);
 
   // Prints option name followed by message.  Always returns true.
   bool error(const Twine &Message, StringRef ArgName = StringRef());
@@ -513,9 +513,9 @@
   }
 };
 
-template<class DataType>
-ValuesClass<DataType> END_WITH_NULL values(const char *Arg, DataType Val,
-                                           const char *Desc, ...) {
+template <class DataType>
+ValuesClass<DataType> LLVM_END_WITH_NULL
+values(const char *Arg, DataType Val, const char *Desc, ...) {
     va_list ValueArgs;
     va_start(ValueArgs, Desc);
     ValuesClass<DataType> Vals(Arg, Val, Desc, ValueArgs);
@@ -1649,6 +1649,10 @@
                                 StringRef Arg) override {
     return AliasFor->handleOccurrence(pos, AliasFor->ArgStr, Arg);
   }
+  bool addOccurrence(unsigned pos, StringRef /*ArgName*/,
+                     StringRef Value, bool MultiArg = false) override {
+    return AliasFor->addOccurrence(pos, AliasFor->ArgStr, Value, MultiArg);
+  }
   // Handle printing stuff...
   size_t getOptionWidth() const override;
   void printOptionInfo(size_t GlobalWidth) const override;
@@ -1786,9 +1790,12 @@
 ///
 /// \param [in] Source The string to be split on whitespace with quotes.
 /// \param [in] Saver Delegates back to the caller for saving parsed strings.
+/// \param [in] MarkEOLs true if tokenizing a response file and you want end of
+/// lines and end of the response file to be marked with a nullptr string.
 /// \param [out] NewArgv All parsed strings are appended to NewArgv.
 void TokenizeGNUCommandLine(StringRef Source, StringSaver &Saver,
-                            SmallVectorImpl<const char *> &NewArgv);
+                            SmallVectorImpl<const char *> &NewArgv,
+                            bool MarkEOLs = false);
 
 /// \brief Tokenizes a Windows command line which may contain quotes and escaped
 /// quotes.
@@ -1798,25 +1805,36 @@
 ///
 /// \param [in] Source The string to be split on whitespace with quotes.
 /// \param [in] Saver Delegates back to the caller for saving parsed strings.
+/// \param [in] MarkEOLs true if tokenizing a response file and you want end of
+/// lines and end of the response file to be marked with a nullptr string.
 /// \param [out] NewArgv All parsed strings are appended to NewArgv.
 void TokenizeWindowsCommandLine(StringRef Source, StringSaver &Saver,
-                                SmallVectorImpl<const char *> &NewArgv);
+                                SmallVectorImpl<const char *> &NewArgv,
+                                bool MarkEOLs = false);
 
 /// \brief String tokenization function type.  Should be compatible with either
 /// Windows or Unix command line tokenizers.
 typedef void (*TokenizerCallback)(StringRef Source, StringSaver &Saver,
-                                  SmallVectorImpl<const char *> &NewArgv);
+                                  SmallVectorImpl<const char *> &NewArgv,
+                                  bool MarkEOLs);
 
 /// \brief Expand response files on a command line recursively using the given
 /// StringSaver and tokenization strategy.  Argv should contain the command line
-/// before expansion and will be modified in place.
+/// before expansion and will be modified in place. If requested, Argv will
+/// also be populated with nullptrs indicating where each response file line
+/// ends, which is useful for the "/link" argument that needs to consume all
+/// remaining arguments only until the next end of line, when in a response
+/// file.
 ///
 /// \param [in] Saver Delegates back to the caller for saving parsed strings.
 /// \param [in] Tokenizer Tokenization strategy. Typically Unix or Windows.
 /// \param [in,out] Argv Command line into which to expand response files.
+/// \param [in] MarkEOLs Mark end of lines and the end of the response file
+/// with nullptrs in the Argv vector.
 /// \return true if all @files were expanded successfully or there were none.
 bool ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
-                         SmallVectorImpl<const char *> &Argv);
+                         SmallVectorImpl<const char *> &Argv,
+                         bool MarkEOLs = false);
 
 } // End namespace cl
 

diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index 25bf32a..d008fec 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h

@@ -33,14 +33,19 @@
 # define __has_builtin(x) 0
 #endif
 
-/// \macro __GNUC_PREREQ
-/// \brief Defines __GNUC_PREREQ if glibc's features.h isn't available.
-#ifndef __GNUC_PREREQ
-# if defined(__GNUC__) && defined(__GNUC_MINOR__)
-#  define __GNUC_PREREQ(maj, min) \
-    ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
+/// \macro LLVM_GNUC_PREREQ
+/// \brief Extend the default __GNUC_PREREQ even if glibc's features.h isn't
+/// available.
+#ifndef LLVM_GNUC_PREREQ
+# if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#  define LLVM_GNUC_PREREQ(maj, min, patch) \
+    ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) + __GNUC_PATCHLEVEL__ >= \
+     ((maj) << 20) + ((min) << 10) + (patch))
+# elif defined(__GNUC__) && defined(__GNUC_MINOR__)
+#  define LLVM_GNUC_PREREQ(maj, min, patch) \
+    ((__GNUC__ << 20) + (__GNUC_MINOR__ << 10) >= ((maj) << 20) + ((min) << 10))
 # else
-#  define __GNUC_PREREQ(maj, min) 0
+#  define LLVM_GNUC_PREREQ(maj, min, patch) 0
 # endif
 #endif
 
@@ -61,7 +66,7 @@
 #define LLVM_MSC_PREREQ(version) 0
 #endif
 
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) || defined(__clang__) || LLVM_MSC_PREREQ(1900)
 #define LLVM_NOEXCEPT noexcept
 #else
 #define LLVM_NOEXCEPT
@@ -70,10 +75,8 @@
 /// \brief Does the compiler support r-value reference *this?
 ///
 /// Sadly, this is separate from just r-value reference support because GCC
-/// implemented everything but this thus far. No release of GCC yet has support
-/// for this feature so it is enabled with Clang only.
-/// FIXME: This should change to a version check when GCC grows support for it.
-#if __has_feature(cxx_rvalue_references)
+/// implemented this later than everything else.
+#if __has_feature(cxx_rvalue_references) || LLVM_GNUC_PREREQ(4, 8, 1)
 #define LLVM_HAS_RVALUE_REFERENCE_THIS 1
 #else
 #define LLVM_HAS_RVALUE_REFERENCE_THIS 0
@@ -128,20 +131,26 @@
 /// not accessible from outside it.  Can also be used to mark variables and
 /// functions, making them private to any shared library they are linked into.
 /// On PE/COFF targets, library visibility is the default, so this isn't needed.
-#if (__has_attribute(visibility) || __GNUC_PREREQ(4, 0)) &&                    \
+#if (__has_attribute(visibility) || LLVM_GNUC_PREREQ(4, 0, 0)) &&              \
     !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(LLVM_ON_WIN32)
 #define LLVM_LIBRARY_VISIBILITY __attribute__ ((visibility("hidden")))
 #else
 #define LLVM_LIBRARY_VISIBILITY
 #endif
 
-#if __has_attribute(used) || __GNUC_PREREQ(3, 1)
+#if __has_attribute(sentinel) || LLVM_GNUC_PREREQ(3, 0, 0)
+#define LLVM_END_WITH_NULL __attribute__((sentinel))
+#else
+#define LLVM_END_WITH_NULL
+#endif
+
+#if __has_attribute(used) || LLVM_GNUC_PREREQ(3, 1, 0)
 #define LLVM_ATTRIBUTE_USED __attribute__((__used__))
 #else
 #define LLVM_ATTRIBUTE_USED
 #endif
 
-#if __has_attribute(warn_unused_result) || __GNUC_PREREQ(3, 4)
+#if __has_attribute(warn_unused_result) || LLVM_GNUC_PREREQ(3, 4, 0)
 #define LLVM_ATTRIBUTE_UNUSED_RESULT __attribute__((__warn_unused_result__))
 #else
 #define LLVM_ATTRIBUTE_UNUSED_RESULT
@@ -155,14 +164,14 @@
 // more portable solution:
 //   (void)unused_var_name;
 // Prefer cast-to-void wherever it is sufficient.
-#if __has_attribute(unused) || __GNUC_PREREQ(3, 1)
+#if __has_attribute(unused) || LLVM_GNUC_PREREQ(3, 1, 0)
 #define LLVM_ATTRIBUTE_UNUSED __attribute__((__unused__))
 #else
 #define LLVM_ATTRIBUTE_UNUSED
 #endif
 
 // FIXME: Provide this for PE/COFF targets.
-#if (__has_attribute(weak) || __GNUC_PREREQ(4, 0)) &&                          \
+#if (__has_attribute(weak) || LLVM_GNUC_PREREQ(4, 0, 0)) &&                    \
     (!defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(LLVM_ON_WIN32))
 #define LLVM_ATTRIBUTE_WEAK __attribute__((__weak__))
 #else
@@ -185,7 +194,7 @@
 #define LLVM_READONLY
 #endif
 
-#if __has_builtin(__builtin_expect) || __GNUC_PREREQ(4, 0)
+#if __has_builtin(__builtin_expect) || LLVM_GNUC_PREREQ(4, 0, 0)
 #define LLVM_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
 #define LLVM_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
 #else
@@ -208,7 +217,7 @@
 
 /// LLVM_ATTRIBUTE_NOINLINE - On compilers where we have a directive to do so,
 /// mark a method "not for inlining".
-#if __has_attribute(noinline) || __GNUC_PREREQ(3, 4)
+#if __has_attribute(noinline) || LLVM_GNUC_PREREQ(3, 4, 0)
 #define LLVM_ATTRIBUTE_NOINLINE __attribute__((noinline))
 #elif defined(_MSC_VER)
 #define LLVM_ATTRIBUTE_NOINLINE __declspec(noinline)
@@ -220,7 +229,7 @@
 /// so, mark a method "always inline" because it is performance sensitive. GCC
 /// 3.4 supported this but is buggy in various cases and produces unimplemented
 /// errors, just use it in GCC 4.0 and later.
-#if __has_attribute(always_inline) || __GNUC_PREREQ(4, 0)
+#if __has_attribute(always_inline) || LLVM_GNUC_PREREQ(4, 0, 0)
 #define LLVM_ATTRIBUTE_ALWAYS_INLINE inline __attribute__((always_inline))
 #elif defined(_MSC_VER)
 #define LLVM_ATTRIBUTE_ALWAYS_INLINE __forceinline
@@ -236,6 +245,12 @@
 #define LLVM_ATTRIBUTE_NORETURN
 #endif
 
+#if __has_attribute(returns_nonnull) || LLVM_GNUC_PREREQ(4, 9, 0)
+#define LLVM_ATTRIBUTE_RETURNS_NONNULL __attribute__((returns_nonnull))
+#else
+#define LLVM_ATTRIBUTE_RETURNS_NONNULL
+#endif
+
 /// LLVM_EXTENSION - Support compilers where we have a keyword to suppress
 /// pedantic diagnostics.
 #ifdef __GNUC__
@@ -262,7 +277,7 @@
 /// LLVM_BUILTIN_UNREACHABLE - On compilers which support it, expands
 /// to an expression which states that it is undefined behavior for the
 /// compiler to reach this point.  Otherwise is not defined.
-#if __has_builtin(__builtin_unreachable) || __GNUC_PREREQ(4, 5)
+#if __has_builtin(__builtin_unreachable) || LLVM_GNUC_PREREQ(4, 5, 0)
 # define LLVM_BUILTIN_UNREACHABLE __builtin_unreachable()
 #elif defined(_MSC_VER)
 # define LLVM_BUILTIN_UNREACHABLE __assume(false)
@@ -270,7 +285,7 @@
 
 /// LLVM_BUILTIN_TRAP - On compilers which support it, expands to an expression
 /// which causes the program to exit abnormally.
-#if __has_builtin(__builtin_trap) || __GNUC_PREREQ(4, 3)
+#if __has_builtin(__builtin_trap) || LLVM_GNUC_PREREQ(4, 3, 0)
 # define LLVM_BUILTIN_TRAP __builtin_trap()
 #else
 # define LLVM_BUILTIN_TRAP *(volatile int*)0x11 = 0
@@ -278,7 +293,7 @@
 
 /// \macro LLVM_ASSUME_ALIGNED
 /// \brief Returns a pointer with an assumed alignment.
-#if __has_builtin(__builtin_assume_aligned) && __GNUC_PREREQ(4, 7)
+#if __has_builtin(__builtin_assume_aligned) || LLVM_GNUC_PREREQ(4, 7, 0)
 # define LLVM_ASSUME_ALIGNED(p, a) __builtin_assume_aligned(p, a)
 #elif defined(LLVM_BUILTIN_UNREACHABLE)
 // As of today, clang does not support __builtin_assume_aligned.

diff --git a/include/llvm/Support/CrashRecoveryContext.h b/include/llvm/Support/CrashRecoveryContext.h
index 3869ebd..f1e636d 100644
--- a/include/llvm/Support/CrashRecoveryContext.h
+++ b/include/llvm/Support/CrashRecoveryContext.h

@@ -166,9 +166,7 @@
     : CrashRecoveryContextCleanupBase<
         CrashRecoveryContextDeleteCleanup<T>, T>(context, resource) {}
 
-  virtual void recoverResources() {
-    delete this->resource;
-  }  
+  void recoverResources() override { delete this->resource; }
 };
 
 template <typename T>
@@ -181,9 +179,7 @@
     : CrashRecoveryContextCleanupBase<CrashRecoveryContextReleaseRefCleanup<T>,
           T>(context, resource) {}
 
-  virtual void recoverResources() {
-    this->resource->Release();
-  }
+  void recoverResources() override { this->resource->Release(); }
 };
 
 template <typename T, typename Cleanup = CrashRecoveryContextDeleteCleanup<T> >

diff --git a/include/llvm/Support/DataExtractor.h b/include/llvm/Support/DataExtractor.h
index e8a19cd..48235d4 100644
--- a/include/llvm/Support/DataExtractor.h
+++ b/include/llvm/Support/DataExtractor.h

@@ -348,6 +348,17 @@
   bool isValidOffsetForDataOfSize(uint32_t offset, uint32_t length) const {
     return offset + length >= offset && isValidOffset(offset + length - 1);
   }
+
+  /// Test the availability of enough bytes of data for a pointer from
+  /// \a offset. The size of a pointer is \a getAddressSize().
+  ///
+  /// @return
+  ///     \b true if \a offset is a valid offset and there are enough
+  ///     bytes for a pointer available at that offset, \b false
+  ///     otherwise.
+  bool isValidOffsetForAddress(uint32_t offset) const {
+    return isValidOffsetForDataOfSize(offset, AddressSize);
+  }
 };
 
 } // namespace llvm

diff --git a/include/llvm/Support/DataTypes.h.cmake b/include/llvm/Support/DataTypes.h.cmake
index 1f0c8eb..c90bf51 100644
--- a/include/llvm/Support/DataTypes.h.cmake
+++ b/include/llvm/Support/DataTypes.h.cmake

@@ -101,6 +101,13 @@
 #define PRIu64 "I64u"
 #define PRIx64 "I64x"
 #define PRIX64 "I64X"
+
+#define PRId32 "d"
+#define PRIi32 "i"
+#define PRIo32 "o"
+#define PRIu32 "u"
+#define PRIx32 "x"
+#define PRIX32 "X"
 #endif /* HAVE_INTTYPES_H */
 
 #endif /* _MSC_VER */
@@ -116,12 +123,6 @@
 # define UINT64_MAX 0xffffffffffffffffULL
 #endif
 
-#if __GNUC__ > 3
-#define END_WITH_NULL __attribute__((sentinel))
-#else
-#define END_WITH_NULL
-#endif
-
 #ifndef HUGE_VALF
 #define HUGE_VALF (float)HUGE_VAL
 #endif

diff --git a/include/llvm/Support/DataTypes.h.in b/include/llvm/Support/DataTypes.h.in
index 09cfcdf..b8b2ba5 100644
--- a/include/llvm/Support/DataTypes.h.in
+++ b/include/llvm/Support/DataTypes.h.in

@@ -116,12 +116,6 @@
 # define UINT64_MAX 0xffffffffffffffffULL
 #endif
 
-#if __GNUC__ > 3
-#define END_WITH_NULL __attribute__((sentinel))
-#else
-#define END_WITH_NULL
-#endif
-
 #ifndef HUGE_VALF
 #define HUGE_VALF (float)HUGE_VAL
 #endif

diff --git a/include/llvm/Support/Disassembler.h b/include/llvm/Support/Disassembler.h
deleted file mode 100644
index 6d1cc0f..0000000
--- a/include/llvm/Support/Disassembler.h
+++ /dev/null

@@ -1,35 +0,0 @@
-//===- llvm/Support/Disassembler.h ------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the necessary glue to call external disassembler
-// libraries.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SYSTEM_DISASSEMBLER_H
-#define LLVM_SYSTEM_DISASSEMBLER_H
-
-#include "llvm/Support/DataTypes.h"
-#include <string>
-
-namespace llvm {
-namespace sys {
-
-/// This function returns true, if there is possible to use some external
-/// disassembler library. False otherwise.
-bool hasDisassembler();
-
-/// This function provides some "glue" code to call external disassembler
-/// libraries.
-std::string disassembleBuffer(uint8_t* start, size_t length, uint64_t pc = 0);
-
-}
-}
-
-#endif // LLVM_SYSTEM_DISASSEMBLER_H

diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h
index cd9f756..47b00b1 100644
--- a/include/llvm/Support/Dwarf.h
+++ b/include/llvm/Support/Dwarf.h

@@ -7,9 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains constants used for implementing Dwarf debug support.  For
-// Details on the Dwarf 3 specfication see DWARF Debugging Information Format
-// V.3 reference manual http://dwarf.freestandards.org ,
+// \file
+// \brief This file contains constants used for implementing Dwarf
+// debug support.
+//
+// For details on the Dwarf specification see the latest DWARF Debugging
+// Information Format standard document on http://www.dwarfstd.org. This
+// file often includes support for non-released standard features.
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,22 +25,6 @@
 
 namespace llvm {
 
-//===----------------------------------------------------------------------===//
-// Debug info constants.
-
-enum : uint32_t {
-  LLVMDebugVersion = (12 << 16),    // Current version of debug information.
-  LLVMDebugVersion11 = (11 << 16),  // Constant for version 11.
-  LLVMDebugVersion10 = (10 << 16),  // Constant for version 10.
-  LLVMDebugVersion9 = (9 << 16),    // Constant for version 9.
-  LLVMDebugVersion8 = (8 << 16),    // Constant for version 8.
-  LLVMDebugVersion7 = (7 << 16),    // Constant for version 7.
-  LLVMDebugVersion6 = (6 << 16),    // Constant for version 6.
-  LLVMDebugVersion5 = (5 << 16),    // Constant for version 5.
-  LLVMDebugVersion4 = (4 << 16),    // Constant for version 4.
-  LLVMDebugVersionMask = 0xffff0000 // Mask for version number.
-};
-
 namespace dwarf {
 
 //===----------------------------------------------------------------------===//
@@ -53,6 +41,7 @@
 
   DW_TAG_auto_variable = 0x100, // Tag for local (auto) variables.
   DW_TAG_arg_variable = 0x101,  // Tag for argument variables.
+  DW_TAG_expression = 0x102,    // Tag for complex address expressions.
 
   DW_TAG_user_base = 0x1000, // Recommended base for user tags.
 
@@ -779,100 +768,24 @@
   DW_LLE_offset_pair_entry
 };
 
+/// Constants for the DW_APPLE_PROPERTY_attributes attribute.
+/// Keep this list in sync with clang's DeclSpec.h ObjCPropertyAttributeKind.
 enum ApplePropertyAttributes {
   // Apple Objective-C Property Attributes
   DW_APPLE_PROPERTY_readonly = 0x01,
-  DW_APPLE_PROPERTY_readwrite = 0x02,
+  DW_APPLE_PROPERTY_getter = 0x02,
   DW_APPLE_PROPERTY_assign = 0x04,
-  DW_APPLE_PROPERTY_retain = 0x08,
-  DW_APPLE_PROPERTY_copy = 0x10,
-  DW_APPLE_PROPERTY_nonatomic = 0x20
+  DW_APPLE_PROPERTY_readwrite = 0x08,
+  DW_APPLE_PROPERTY_retain = 0x10,
+  DW_APPLE_PROPERTY_copy = 0x20,
+  DW_APPLE_PROPERTY_nonatomic = 0x40,
+  DW_APPLE_PROPERTY_setter = 0x80,
+  DW_APPLE_PROPERTY_atomic = 0x100,
+  DW_APPLE_PROPERTY_weak =   0x200,
+  DW_APPLE_PROPERTY_strong = 0x400,
+  DW_APPLE_PROPERTY_unsafe_unretained = 0x800
 };
 
-/// TagString - Return the string for the specified tag.
-///
-const char *TagString(unsigned Tag);
-
-/// ChildrenString - Return the string for the specified children flag.
-///
-const char *ChildrenString(unsigned Children);
-
-/// AttributeString - Return the string for the specified attribute.
-///
-const char *AttributeString(unsigned Attribute);
-
-/// FormEncodingString - Return the string for the specified form encoding.
-///
-const char *FormEncodingString(unsigned Encoding);
-
-/// OperationEncodingString - Return the string for the specified operation
-/// encoding.
-const char *OperationEncodingString(unsigned Encoding);
-
-/// AttributeEncodingString - Return the string for the specified attribute
-/// encoding.
-const char *AttributeEncodingString(unsigned Encoding);
-
-/// DecimalSignString - Return the string for the specified decimal sign
-/// attribute.
-const char *DecimalSignString(unsigned Sign);
-
-/// EndianityString - Return the string for the specified endianity.
-///
-const char *EndianityString(unsigned Endian);
-
-/// AccessibilityString - Return the string for the specified accessibility.
-///
-const char *AccessibilityString(unsigned Access);
-
-/// VisibilityString - Return the string for the specified visibility.
-///
-const char *VisibilityString(unsigned Visibility);
-
-/// VirtualityString - Return the string for the specified virtuality.
-///
-const char *VirtualityString(unsigned Virtuality);
-
-/// LanguageString - Return the string for the specified language.
-///
-const char *LanguageString(unsigned Language);
-
-/// CaseString - Return the string for the specified identifier case.
-///
-const char *CaseString(unsigned Case);
-
-/// ConventionString - Return the string for the specified calling convention.
-///
-const char *ConventionString(unsigned Convention);
-
-/// InlineCodeString - Return the string for the specified inline code.
-///
-const char *InlineCodeString(unsigned Code);
-
-/// ArrayOrderString - Return the string for the specified array order.
-///
-const char *ArrayOrderString(unsigned Order);
-
-/// DiscriminantString - Return the string for the specified discriminant
-/// descriptor.
-const char *DiscriminantString(unsigned Discriminant);
-
-/// LNStandardString - Return the string for the specified line number standard.
-///
-const char *LNStandardString(unsigned Standard);
-
-/// LNExtendedString - Return the string for the specified line number extended
-/// opcode encodings.
-const char *LNExtendedString(unsigned Encoding);
-
-/// MacinfoString - Return the string for the specified macinfo type encodings.
-///
-const char *MacinfoString(unsigned Encoding);
-
-/// CallFrameString - Return the string for the specified call frame instruction
-/// encodings.
-const char *CallFrameString(unsigned Encoding);
-
 // Constants for the DWARF5 Accelerator Table Proposal
 enum AcceleratorTable {
   // Data layout descriptors.
@@ -895,9 +808,6 @@
   DW_hash_function_djb = 0u
 };
 
-/// AtomTypeString - Return the string for the specified Atom type.
-const char *AtomTypeString(unsigned Atom);
-
 // Constants for the GNU pubnames/pubtypes extensions supporting gdb index.
 enum GDBIndexEntryKind {
   GIEK_NONE,
@@ -910,15 +820,51 @@
   GIEK_UNUSED7
 };
 
-const char *GDBIndexEntryKindString(GDBIndexEntryKind Kind);
-
 enum GDBIndexEntryLinkage {
   GIEL_EXTERNAL,
   GIEL_STATIC
 };
 
+/// \defgroup DwarfConstantsDumping Dwarf constants dumping functions
+///
+/// All these functions map their argument's value back to the
+/// corresponding enumerator name or return nullptr if the value isn't
+/// known.
+///
+/// @{
+const char *TagString(unsigned Tag);
+const char *ChildrenString(unsigned Children);
+const char *AttributeString(unsigned Attribute);
+const char *FormEncodingString(unsigned Encoding);
+const char *OperationEncodingString(unsigned Encoding);
+const char *AttributeEncodingString(unsigned Encoding);
+const char *DecimalSignString(unsigned Sign);
+const char *EndianityString(unsigned Endian);
+const char *AccessibilityString(unsigned Access);
+const char *VisibilityString(unsigned Visibility);
+const char *VirtualityString(unsigned Virtuality);
+const char *LanguageString(unsigned Language);
+const char *CaseString(unsigned Case);
+const char *ConventionString(unsigned Convention);
+const char *InlineCodeString(unsigned Code);
+const char *ArrayOrderString(unsigned Order);
+const char *DiscriminantString(unsigned Discriminant);
+const char *LNStandardString(unsigned Standard);
+const char *LNExtendedString(unsigned Encoding);
+const char *MacinfoString(unsigned Encoding);
+const char *CallFrameString(unsigned Encoding);
+const char *ApplePropertyString(unsigned);
+const char *AtomTypeString(unsigned Atom);
+const char *GDBIndexEntryKindString(GDBIndexEntryKind Kind);
 const char *GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage);
+/// @}
 
+/// \brief Returns the symbolic string representing Val when used as a value
+/// for attribute Attr.
+const char *AttributeValueString(uint16_t Attr, unsigned Val);
+
+/// \brief Describes an entry of the various gnu_pub* debug sections.
+///
 /// The gnu_pub* kind looks like:
 ///
 /// 0-3  reserved
@@ -950,6 +896,7 @@
   };
 };
 
+
 } // End of namespace dwarf
 
 } // End of namespace llvm

diff --git a/include/llvm/Support/DynamicLibrary.h b/include/llvm/Support/DynamicLibrary.h
index de47be6..a7d2221 100644
--- a/include/llvm/Support/DynamicLibrary.h
+++ b/include/llvm/Support/DynamicLibrary.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SYSTEM_DYNAMICLIBRARY_H
-#define LLVM_SYSTEM_DYNAMICLIBRARY_H
+#ifndef LLVM_SUPPORT_DYNAMICLIBRARY_H
+#define LLVM_SUPPORT_DYNAMICLIBRARY_H
 
 #include <string>
 
@@ -43,10 +43,11 @@
     // Opaque data used to interface with OS-specific dynamic library handling.
     void *Data;
 
-    explicit DynamicLibrary(void *data = &Invalid) : Data(data) {}
   public:
+    explicit DynamicLibrary(void *data = &Invalid) : Data(data) {}
+
     /// Returns true if the object refers to a valid library.
-    bool isValid() { return Data != &Invalid; }
+    bool isValid() const { return Data != &Invalid; }
 
     /// Searches through the library for the symbol \p symbolName. If it is
     /// found, the address of that symbol is returned. If not, NULL is returned.
@@ -101,4 +102,4 @@
 } // End sys namespace
 } // End llvm namespace
 
-#endif // LLVM_SYSTEM_DYNAMIC_LIBRARY_H
+#endif

diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index 67cc651..5f78cc2 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h

@@ -458,6 +458,9 @@
   R_PPC_GOT16_LO              = 15,
   R_PPC_GOT16_HI              = 16,
   R_PPC_GOT16_HA              = 17,
+  R_PPC_PLTREL24              = 18,
+  R_PPC_JMP_SLOT              = 21,
+  R_PPC_LOCAL24PC             = 23,
   R_PPC_REL32                 = 26,
   R_PPC_TLS                   = 67,
   R_PPC_DTPMOD32              = 68,
@@ -495,6 +498,37 @@
   R_PPC_REL16_HA              = 252
 };
 
+// Specific e_flags for PPC64
+enum {
+  // e_flags bits specifying ABI:
+  // 1 for original ABI using function descriptors,
+  // 2 for revised ABI without function descriptors,
+  // 0 for unspecified or not using any features affected by the differences.
+  EF_PPC64_ABI = 3
+};
+
+// Special values for the st_other field in the symbol table entry for PPC64.
+enum {
+  STO_PPC64_LOCAL_BIT = 5,
+  STO_PPC64_LOCAL_MASK = (7 << STO_PPC64_LOCAL_BIT)
+};
+static inline int64_t
+decodePPC64LocalEntryOffset(unsigned Other) {
+  unsigned Val = (Other & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT;
+  return ((1 << Val) >> 2) << 2;
+}
+static inline unsigned
+encodePPC64LocalEntryOffset(int64_t Offset) {
+  unsigned Val = (Offset >= 4 * 4
+                  ? (Offset >= 8 * 4
+                     ? (Offset >= 16 * 4 ? 6 : 5)
+                     : 4)
+                  : (Offset >= 2 * 4
+                     ? 3
+                     : (Offset >= 1 * 4 ? 2 : 0)));
+  return Val << STO_PPC64_LOCAL_BIT;
+}
+
 // ELF Relocation types for PPC64
 enum {
   R_PPC64_NONE                = 0,
@@ -515,6 +549,7 @@
   R_PPC64_GOT16_LO            = 15,
   R_PPC64_GOT16_HI            = 16,
   R_PPC64_GOT16_HA            = 17,
+  R_PPC64_JMP_SLOT            = 21,
   R_PPC64_REL32               = 26,
   R_PPC64_ADDR64              = 38,
   R_PPC64_ADDR16_HIGHER       = 39,
@@ -621,6 +656,9 @@
 
   R_AARCH64_LDST128_ABS_LO12_NC         = 0x12b,
 
+  R_AARCH64_GOTREL64                    = 0x133,
+  R_AARCH64_GOTREL32                    = 0x134,
+
   R_AARCH64_ADR_GOT_PAGE                = 0x137,
   R_AARCH64_LD64_GOT_LO12_NC            = 0x138,
 
@@ -668,7 +706,17 @@
   R_AARCH64_TLSDESC_LD64_LO12_NC        = 0x233,
   R_AARCH64_TLSDESC_ADD_LO12_NC         = 0x234,
 
-  R_AARCH64_TLSDESC_CALL                = 0x239
+  R_AARCH64_TLSDESC_CALL                = 0x239,
+
+  R_AARCH64_COPY                        = 0x400,
+  R_AARCH64_GLOB_DAT                    = 0x401,
+  R_AARCH64_JUMP_SLOT                   = 0x402,
+  R_AARCH64_RELATIVE                    = 0x403,
+  R_AARCH64_TLS_DTPREL64                = 0x404,
+  R_AARCH64_TLS_DTPMOD64                = 0x405,
+  R_AARCH64_TLS_TPREL64                 = 0x406,
+  R_AARCH64_TLSDESC                     = 0x407,
+  R_AARCH64_IRELATIVE                   = 0x408
 };
 
 // ARM Specific e_flags
@@ -829,7 +877,13 @@
   EF_MIPS_ABI2      = 0x00000020,
   EF_MIPS_32BITMODE = 0x00000100,
   EF_MIPS_NAN2008   = 0x00000400, // Uses IEE 754-2008 NaN encoding
+
+  // ABI flags
   EF_MIPS_ABI_O32   = 0x00001000, // This file follows the first MIPS 32 bit ABI
+  EF_MIPS_ABI_O64    = 0x00002000, // O32 ABI extended for 64-bit architecture.
+  EF_MIPS_ABI_EABI32 = 0x00003000, // EABI in 32 bit mode.
+  EF_MIPS_ABI_EABI64 = 0x00004000, // EABI in 64 bit mode.
+  EF_MIPS_ABI        = 0x0000f000, // Mask for selecting EF_MIPS_ABI_ variant.
 
   //ARCH_ASE
   EF_MIPS_MICROMIPS = 0x02000000, // microMIPS

diff --git a/include/llvm/Support/Endian.h b/include/llvm/Support/Endian.h
index 455d0fc..47b82fd 100644
--- a/include/llvm/Support/Endian.h
+++ b/include/llvm/Support/Endian.h

@@ -93,15 +93,40 @@
       (void*)Value.buffer, newValue);
   }
 
+  packed_endian_specific_integral &operator+=(value_type newValue) {
+    *this = *this + newValue;
+    return *this;
+  }
+
+  packed_endian_specific_integral &operator-=(value_type newValue) {
+    *this = *this - newValue;
+    return *this;
+  }
+
 private:
   AlignedCharArray<PickAlignment<value_type, alignment>::value,
                    sizeof(value_type)> Value;
+
+public:
+  struct ref {
+    explicit ref(void *Ptr) : Ptr(Ptr) {}
+
+    operator value_type() const {
+      return endian::read<value_type, endian, alignment>(Ptr);
+    }
+
+    void operator=(value_type NewValue) {
+      endian::write<value_type, endian, alignment>(Ptr, NewValue);
+    }
+
+  private:
+    void *Ptr;
+  };
 };
+
 } // end namespace detail
 
 typedef detail::packed_endian_specific_integral
-                  <uint8_t, little, unaligned>  ulittle8_t;
-typedef detail::packed_endian_specific_integral
                   <uint16_t, little, unaligned> ulittle16_t;
 typedef detail::packed_endian_specific_integral
                   <uint32_t, little, unaligned> ulittle32_t;
@@ -109,8 +134,6 @@
                   <uint64_t, little, unaligned> ulittle64_t;
 
 typedef detail::packed_endian_specific_integral
-                   <int8_t, little, unaligned>  little8_t;
-typedef detail::packed_endian_specific_integral
                    <int16_t, little, unaligned> little16_t;
 typedef detail::packed_endian_specific_integral
                    <int32_t, little, unaligned> little32_t;
@@ -118,8 +141,6 @@
                    <int64_t, little, unaligned> little64_t;
 
 typedef detail::packed_endian_specific_integral
-                    <uint8_t, little, aligned>  aligned_ulittle8_t;
-typedef detail::packed_endian_specific_integral
                     <uint16_t, little, aligned> aligned_ulittle16_t;
 typedef detail::packed_endian_specific_integral
                     <uint32_t, little, aligned> aligned_ulittle32_t;
@@ -127,8 +148,6 @@
                     <uint64_t, little, aligned> aligned_ulittle64_t;
 
 typedef detail::packed_endian_specific_integral
-                     <int8_t, little, aligned>  aligned_little8_t;
-typedef detail::packed_endian_specific_integral
                      <int16_t, little, aligned> aligned_little16_t;
 typedef detail::packed_endian_specific_integral
                      <int32_t, little, aligned> aligned_little32_t;
@@ -136,8 +155,6 @@
                      <int64_t, little, aligned> aligned_little64_t;
 
 typedef detail::packed_endian_specific_integral
-                  <uint8_t, big, unaligned>     ubig8_t;
-typedef detail::packed_endian_specific_integral
                   <uint16_t, big, unaligned>    ubig16_t;
 typedef detail::packed_endian_specific_integral
                   <uint32_t, big, unaligned>    ubig32_t;
@@ -145,8 +162,6 @@
                   <uint64_t, big, unaligned>    ubig64_t;
 
 typedef detail::packed_endian_specific_integral
-                   <int8_t, big, unaligned>     big8_t;
-typedef detail::packed_endian_specific_integral
                    <int16_t, big, unaligned>    big16_t;
 typedef detail::packed_endian_specific_integral
                    <int32_t, big, unaligned>    big32_t;
@@ -154,8 +169,6 @@
                    <int64_t, big, unaligned>    big64_t;
 
 typedef detail::packed_endian_specific_integral
-                    <uint8_t, big, aligned>     aligned_ubig8_t;
-typedef detail::packed_endian_specific_integral
                     <uint16_t, big, aligned>    aligned_ubig16_t;
 typedef detail::packed_endian_specific_integral
                     <uint32_t, big, aligned>    aligned_ubig32_t;
@@ -163,8 +176,6 @@
                     <uint64_t, big, aligned>    aligned_ubig64_t;
 
 typedef detail::packed_endian_specific_integral
-                     <int8_t, big, aligned>     aligned_big8_t;
-typedef detail::packed_endian_specific_integral
                      <int16_t, big, aligned>    aligned_big16_t;
 typedef detail::packed_endian_specific_integral
                      <int32_t, big, aligned>    aligned_big32_t;

diff --git a/include/llvm/Support/EndianStream.h b/include/llvm/Support/EndianStream.h
index 89c66d3..94f372f 100644
--- a/include/llvm/Support/EndianStream.h
+++ b/include/llvm/Support/EndianStream.h

@@ -12,11 +12,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LLVM_SUPPORT_ENDIAN_STREAM_H_
-#define _LLVM_SUPPORT_ENDIAN_STREAM_H_
+#ifndef LLVM_SUPPORT_ENDIANSTREAM_H
+#define LLVM_SUPPORT_ENDIANSTREAM_H
 
-#include <llvm/Support/Endian.h>
-#include <llvm/Support/raw_ostream.h>
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 namespace support {
@@ -36,4 +36,4 @@
 } // end namespace support
 } // end namespace llvm
 
-#endif // _LLVM_SUPPORT_ENDIAN_STREAM_H_
+#endif

diff --git a/include/llvm/Support/ErrorOr.h b/include/llvm/Support/ErrorOr.h
index 0742a2d..84763de 100644
--- a/include/llvm/Support/ErrorOr.h
+++ b/include/llvm/Support/ErrorOr.h

@@ -13,8 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_ERROR_OR_H
-#define LLVM_SUPPORT_ERROR_OR_H
+#ifndef LLVM_SUPPORT_ERROROR_H
+#define LLVM_SUPPORT_ERROROR_H
 
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/Support/AlignOf.h"
@@ -68,9 +68,9 @@
 /// \endcode
 ///
 ///
-/// An implicit conversion to bool provides a way to check if there was an
-/// error. The unary * and -> operators provide pointer like access to the
-/// value. Accessing the value when there is an error has undefined behavior.
+/// Implicit conversion to bool returns true if there is a usable value. The
+/// unary * and -> operators provide pointer like access to the value. Accessing
+/// the value when there is an error has undefined behavior.
 ///
 /// When T is a reference type the behaivor is slightly different. The reference
 /// is held in a std::reference_wrapper<std::remove_reference<T>::type>, and
@@ -115,19 +115,19 @@
   }
 
   template <class OtherT>
-  ErrorOr(const ErrorOr<OtherT> &Other) {
+  ErrorOr(
+      const ErrorOr<OtherT> &Other,
+      typename std::enable_if<std::is_convertible<OtherT, T>::value>::type * =
+          nullptr) {
     copyConstruct(Other);
   }
 
-  ErrorOr &operator =(const ErrorOr &Other) {
-    copyAssign(Other);
-    return *this;
-  }
-
   template <class OtherT>
-  ErrorOr &operator =(const ErrorOr<OtherT> &Other) {
-    copyAssign(Other);
-    return *this;
+  explicit ErrorOr(
+      const ErrorOr<OtherT> &Other,
+      typename std::enable_if<
+          !std::is_convertible<OtherT, const T &>::value>::type * = nullptr) {
+    copyConstruct(Other);
   }
 
   ErrorOr(ErrorOr &&Other) {
@@ -135,17 +135,29 @@
   }
 
   template <class OtherT>
-  ErrorOr(ErrorOr<OtherT> &&Other) {
+  ErrorOr(
+      ErrorOr<OtherT> &&Other,
+      typename std::enable_if<std::is_convertible<OtherT, T>::value>::type * =
+          nullptr) {
     moveConstruct(std::move(Other));
   }
 
-  ErrorOr &operator =(ErrorOr &&Other) {
-    moveAssign(std::move(Other));
+  // This might eventually need SFINAE but it's more complex than is_convertible
+  // & I'm too lazy to write it right now.
+  template <class OtherT>
+  explicit ErrorOr(
+      ErrorOr<OtherT> &&Other,
+      typename std::enable_if<!std::is_convertible<OtherT, T>::value>::type * =
+          nullptr) {
+    moveConstruct(std::move(Other));
+  }
+
+  ErrorOr &operator=(const ErrorOr &Other) {
+    copyAssign(Other);
     return *this;
   }
 
-  template <class OtherT>
-  ErrorOr &operator =(ErrorOr<OtherT> &&Other) {
+  ErrorOr &operator=(ErrorOr &&Other) {
     moveAssign(std::move(Other));
     return *this;
   }
@@ -161,7 +173,7 @@
   }
 
   reference get() { return *getStorage(); }
-  const_reference get() const { return const_cast<ErrorOr<T> >(this)->get(); }
+  const_reference get() const { return const_cast<ErrorOr<T> *>(this)->get(); }
 
   std::error_code getError() const {
     return HasError ? *getErrorStorage() : std::error_code();

diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h
index 0a9a979..a7cfacd 100644
--- a/include/llvm/Support/FileOutputBuffer.h
+++ b/include/llvm/Support/FileOutputBuffer.h

@@ -77,7 +77,7 @@
   FileOutputBuffer(const FileOutputBuffer &) LLVM_DELETED_FUNCTION;
   FileOutputBuffer &operator=(const FileOutputBuffer &) LLVM_DELETED_FUNCTION;
 
-  FileOutputBuffer(llvm::sys::fs::mapped_file_region *R,
+  FileOutputBuffer(std::unique_ptr<llvm::sys::fs::mapped_file_region> R,
                    StringRef Path, StringRef TempPath);
 
   std::unique_ptr<llvm::sys::fs::mapped_file_region> Region;

diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index 556701c..63c9ed5 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h

@@ -226,6 +226,7 @@
     unknown = 0,              ///< Unrecognized file
     bitcode,                  ///< Bitcode file
     archive,                  ///< ar style archive file
+    elf,                      ///< ELF Unknown type
     elf_relocatable,          ///< ELF Relocatable object file
     elf_executable,           ///< ELF Executable image
     elf_shared_object,        ///< ELF dynamically linked shared lib
@@ -276,14 +277,6 @@
 ///          platform-specific error_code.
 std::error_code make_absolute(SmallVectorImpl<char> &path);
 
-/// @brief Normalize path separators in \a Path
-///
-/// If the path contains any '\' separators, they are transformed into '/'.
-/// This is particularly useful when cross-compiling Windows on Linux, but is
-/// safe to invoke on Windows, which accepts both characters as a path
-/// separator.
-std::error_code normalize_separators(SmallVectorImpl<char> &Path);
-
 /// @brief Create all the non-existent directories in path.
 ///
 /// @param path Directories to create.
@@ -360,33 +353,38 @@
 ///          not.
 bool exists(file_status status);
 
+enum class AccessMode { Exist, Write, Execute };
+
+/// @brief Can the file be accessed?
+///
+/// @param Path Input path.
+/// @returns errc::success if the path can be accessed, otherwise a
+///          platform-specific error_code.
+std::error_code access(const Twine &Path, AccessMode Mode);
+
 /// @brief Does file exist?
 ///
-/// @param path Input path.
-/// @param result Set to true if the file represented by status exists, false if
-///               it does not. Undefined otherwise.
-/// @returns errc::success if result has been successfully set, otherwise a
-///          platform-specific error_code.
-std::error_code exists(const Twine &path, bool &result);
-
-/// @brief Simpler version of exists for clients that don't need to
-///        differentiate between an error and false.
-inline bool exists(const Twine &path) {
-  bool result;
-  return !exists(path, result) && result;
+/// @param Path Input path.
+/// @returns True if it exists, false otherwise.
+inline bool exists(const Twine &Path) {
+  return !access(Path, AccessMode::Exist);
 }
 
 /// @brief Can we execute this file?
 ///
 /// @param Path Input path.
 /// @returns True if we can execute it, false otherwise.
-bool can_execute(const Twine &Path);
+inline bool can_execute(const Twine &Path) {
+  return !access(Path, AccessMode::Execute);
+}
 
 /// @brief Can we write this file?
 ///
 /// @param Path Input path.
 /// @returns True if we can write to it, false otherwise.
-bool can_write(const Twine &Path);
+inline bool can_write(const Twine &Path) {
+  return !access(Path, AccessMode::Write);
+}
 
 /// @brief Do file_status's represent the same thing?
 ///

diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h
index b713cc7..8e163dd 100644
--- a/include/llvm/Support/Format.h
+++ b/include/llvm/Support/Format.h

@@ -23,6 +23,9 @@
 #ifndef LLVM_SUPPORT_FORMAT_H
 #define LLVM_SUPPORT_FORMAT_H
 
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+
 #include <cassert>
 #include <cstdio>
 #ifdef _MSC_VER
@@ -41,6 +44,7 @@
 class format_object_base {
 protected:
   const char *Fmt;
+  ~format_object_base() {} // Disallow polymorphic deletion.
   virtual void home(); // Out of line virtual method.
 
   /// Call snprintf() for this object, on the given buffer and size.
@@ -48,7 +52,6 @@
 
 public:
   format_object_base(const char *fmt) : Fmt(fmt) {}
-  virtual ~format_object_base() {}
 
   /// Format the object into the specified buffer.  On success, this returns
   /// the length of the formatted string.  If the buffer is too small, this
@@ -79,7 +82,7 @@
 /// returns whether or not it is big enough.
 
 template <typename T>
-class format_object1 : public format_object_base {
+class format_object1 final : public format_object_base {
   T Val;
 public:
   format_object1(const char *fmt, const T &val)
@@ -92,7 +95,7 @@
 };
 
 template <typename T1, typename T2>
-class format_object2 : public format_object_base {
+class format_object2 final : public format_object_base {
   T1 Val1;
   T2 Val2;
 public:
@@ -106,7 +109,7 @@
 };
 
 template <typename T1, typename T2, typename T3>
-class format_object3 : public format_object_base {
+class format_object3 final : public format_object_base {
   T1 Val1;
   T2 Val2;
   T3 Val3;
@@ -121,7 +124,7 @@
 };
 
 template <typename T1, typename T2, typename T3, typename T4>
-class format_object4 : public format_object_base {
+class format_object4 final : public format_object_base {
   T1 Val1;
   T2 Val2;
   T3 Val3;
@@ -138,7 +141,7 @@
 };
 
 template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class format_object5 : public format_object_base {
+class format_object5 final : public format_object_base {
   T1 Val1;
   T2 Val2;
   T3 Val3;
@@ -158,7 +161,7 @@
 
 template <typename T1, typename T2, typename T3, typename T4, typename T5,
           typename T6>
-class format_object6 : public format_object_base {
+class format_object6 final : public format_object_base {
   T1 Val1;
   T2 Val2;
   T3 Val3;
@@ -225,6 +228,66 @@
                                                 Val5, Val6);
 }
 
+/// This is a helper class used for left_justify() and right_justify().
+class FormattedString {
+  StringRef Str;
+  unsigned Width;
+  bool RightJustify;
+  friend class raw_ostream;
+public:
+    FormattedString(StringRef S, unsigned W, bool R)
+      : Str(S), Width(W), RightJustify(R) { }
+};
+
+/// left_justify - append spaces after string so total output is
+/// \p Width characters.  If \p Str is larger than \p Width, full string
+/// is written with no padding.
+inline FormattedString left_justify(StringRef Str, unsigned Width) {
+  return FormattedString(Str, Width, false);
+}
+
+/// right_justify - add spaces before string so total output is
+/// \p Width characters.  If \p Str is larger than \p Width, full string
+/// is written with no padding.
+inline FormattedString right_justify(StringRef Str, unsigned Width) {
+  return FormattedString(Str, Width, true);
+}
+
+/// This is a helper class used for format_hex() and format_decimal().
+class FormattedNumber {
+  uint64_t HexValue;
+  int64_t DecValue;
+  unsigned Width;
+  bool Hex;
+  bool Upper;
+  friend class raw_ostream;
+public:
+    FormattedNumber(uint64_t HV, int64_t DV, unsigned W, bool H, bool U)
+      : HexValue(HV), DecValue(DV), Width(W), Hex(H), Upper(U) { }
+};
+
+/// format_hex - Output \p N as a fixed width hexadecimal. If number will not
+/// fit in width, full number is still printed.  Examples:
+///   OS << format_hex(255, 4)        => 0xff
+///   OS << format_hex(255, 4, true)  => 0xFF
+///   OS << format_hex(255, 6)        => 0x00ff
+///   OS << format_hex(255, 2)        => 0xff
+inline FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false) {
+  assert(Width <= 18 && "hex width must be <= 18");
+  return FormattedNumber(N, 0, Width, true, Upper);
+}
+
+/// format_decimal - Output \p N as a right justified, fixed-width decimal. If 
+/// number will not fit in width, full number is still printed.  Examples:
+///   OS << format_decimal(0, 5)     => "    0"
+///   OS << format_decimal(255, 5)   => "  255"
+///   OS << format_decimal(-1, 3)    => " -1"
+///   OS << format_decimal(12345, 3) => "12345"
+inline FormattedNumber format_decimal(int64_t N, unsigned Width) {
+  return FormattedNumber(0, N, Width, false, false);
+}
+
+
 } // end namespace llvm
 
 #endif

diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h
index 0cb6cfd..e378602 100644
--- a/include/llvm/Support/GCOV.h
+++ b/include/llvm/Support/GCOV.h

@@ -100,7 +100,7 @@
   /// cursor and return true otherwise return false.
   bool readFunctionTag() {
     StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() || 
+    if (Tag.empty() ||
         Tag[0] != '\0' || Tag[1] != '\0' ||
         Tag[2] != '\0' || Tag[3] != '\1') {
       return false;
@@ -113,7 +113,7 @@
   /// cursor and return true otherwise return false.
   bool readBlockTag() {
     StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() || 
+    if (Tag.empty() ||
         Tag[0] != '\0' || Tag[1] != '\0' ||
         Tag[2] != '\x41' || Tag[3] != '\x01') {
       return false;
@@ -126,7 +126,7 @@
   /// cursor and return true otherwise return false.
   bool readEdgeTag() {
     StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() || 
+    if (Tag.empty() ||
         Tag[0] != '\0' || Tag[1] != '\0' ||
         Tag[2] != '\x43' || Tag[3] != '\x01') {
       return false;
@@ -139,7 +139,7 @@
   /// cursor and return true otherwise return false.
   bool readLineTag() {
     StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() || 
+    if (Tag.empty() ||
         Tag[0] != '\0' || Tag[1] != '\0' ||
         Tag[2] != '\x45' || Tag[3] != '\x01') {
       return false;
@@ -152,7 +152,7 @@
   /// cursor and return true otherwise return false.
   bool readArcTag() {
     StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    if (Tag.empty() || 
+    if (Tag.empty() ||
         Tag[0] != '\0' || Tag[1] != '\0' ||
         Tag[2] != '\xa1' || Tag[3] != '\1') {
       return false;

diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h
index 876ab6e..6bc4b44 100644
--- a/include/llvm/Support/GenericDomTree.h
+++ b/include/llvm/Support/GenericDomTree.h

@@ -15,8 +15,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_GENERIC_DOM_TREE_H
-#define LLVM_SUPPORT_GENERIC_DOM_TREE_H
+#ifndef LLVM_SUPPORT_GENERICDOMTREE_H
+#define LLVM_SUPPORT_GENERICDOMTREE_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"

diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index bcba5e0..ad4f8a9 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h

@@ -22,8 +22,8 @@
 //===----------------------------------------------------------------------===//
 
 
-#ifndef LLVM_SUPPORT_GENERIC_DOM_TREE_CONSTRUCTION_H
-#define LLVM_SUPPORT_GENERIC_DOM_TREE_CONSTRUCTION_H
+#ifndef LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H
+#define LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H
 
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/GenericDomTree.h"
@@ -125,7 +125,7 @@
     typename GraphT::NodeType* VAncestor = DT.Vertex[VInfo.Parent];
 
     // Process Ancestor first
-    if (Visited.insert(VAncestor) && VInfo.Parent >= LastLinked) {
+    if (Visited.insert(VAncestor).second && VInfo.Parent >= LastLinked) {
       Work.push_back(VAncestor);
       continue;
     } 

diff --git a/include/llvm/Support/IncludeFile.h b/include/llvm/Support/IncludeFile.h
deleted file mode 100644
index 2067e34..0000000
--- a/include/llvm/Support/IncludeFile.h
+++ /dev/null

@@ -1,79 +0,0 @@
-//===- llvm/Support/IncludeFile.h - Ensure Linking Of Library ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the FORCE_DEFINING_FILE_TO_BE_LINKED and DEFINE_FILE_FOR
-// macros.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_INCLUDEFILE_H
-#define LLVM_SUPPORT_INCLUDEFILE_H
-
-/// This macro is the public interface that IncludeFile.h exports. This gives
-/// us the option to implement the "link the definition" capability in any
-/// manner that we choose. All header files that depend on a specific .cpp
-/// file being linked at run time should use this macro instead of the
-/// IncludeFile class directly.
-///
-/// For example, foo.h would use:<br/>
-/// <tt>FORCE_DEFINING_FILE_TO_BE_LINKED(foo)</tt><br/>
-///
-/// And, foo.cp would use:<br/>
-/// <tt>DEFINING_FILE_FOR(foo)</tt><br/>
-#ifdef __GNUC__
-// If the `used' attribute is available, use it to create a variable
-// with an initializer that will force the linking of the defining file.
-#define FORCE_DEFINING_FILE_TO_BE_LINKED(name) \
-  namespace llvm { \
-    extern const char name ## LinkVar; \
-    __attribute__((used)) static const char *const name ## LinkObj = \
-      &name ## LinkVar; \
-  }
-#else
-// Otherwise use a constructor call.
-#define FORCE_DEFINING_FILE_TO_BE_LINKED(name) \
-  namespace llvm { \
-    extern const char name ## LinkVar; \
-    static const IncludeFile name ## LinkObj ( &name ## LinkVar ); \
-  }
-#endif
-
-/// This macro is the counterpart to FORCE_DEFINING_FILE_TO_BE_LINKED. It should
-/// be used in a .cpp file to define the name referenced in a header file that
-/// will cause linkage of the .cpp file. It should only be used at extern level.
-#define DEFINING_FILE_FOR(name) \
-  namespace llvm { const char name ## LinkVar = 0; }
-
-namespace llvm {
-
-/// This class is used in the implementation of FORCE_DEFINING_FILE_TO_BE_LINKED
-/// macro to make sure that the implementation of a header file is included
-/// into a tool that uses the header.  This is solely
-/// to overcome problems linking .a files and not getting the implementation
-/// of compilation units we need. This is commonly an issue with the various
-/// Passes but also occurs elsewhere in LLVM. We like to use .a files because
-/// they link faster and provide the smallest executables. However, sometimes
-/// those executables are too small, if the program doesn't reference something
-/// that might be needed, especially by a loaded share object. This little class
-/// helps to resolve that problem. The basic strategy is to use this class in
-/// a header file and pass the address of a variable to the constructor. If the
-/// variable is defined in the header file's corresponding .cpp file then all
-/// tools/libraries that \#include the header file will require the .cpp as
-/// well.
-/// For example:<br/>
-/// <tt>extern int LinkMyCodeStub;</tt><br/>
-/// <tt>static IncludeFile LinkMyModule(&LinkMyCodeStub);</tt><br/>
-/// @brief Class to ensure linking of corresponding object file.
-struct IncludeFile {
-  explicit IncludeFile(const void *);
-};
-
-}
-
-#endif

diff --git a/include/llvm/Support/LEB128.h b/include/llvm/Support/LEB128.h
index ea76c9b..6a95432 100644
--- a/include/llvm/Support/LEB128.h
+++ b/include/llvm/Support/LEB128.h

@@ -82,7 +82,7 @@
   uint64_t Value = 0;
   unsigned Shift = 0;
   do {
-    Value += (*p & 0x7f) << Shift;
+    Value += uint64_t(*p & 0x7f) << Shift;
     Shift += 7;
   } while (*p++ >= 128);
   if (n)
@@ -90,6 +90,33 @@
   return Value;
 }
 
+/// Utility function to decode a SLEB128 value.
+///
+/// \param p Pointer to the first byte of the encoded value.
+/// \param n If non-null, receives the number of bytes consumed.
+/// \returns The decoded value, sign-extended to 64 bits.
+inline int64_t decodeSLEB128(const uint8_t *p, unsigned *n = nullptr) {
+  const uint8_t *orig_p = p;
+  int64_t Value = 0;
+  unsigned Shift = 0;
+  uint8_t Byte;
+  do {
+    Byte = *p++;
+    // Widen to 64 bits before shifting: `Byte & 0x7f` is only promoted to
+    // int, which cannot hold a chunk shifted past bit 31.
+    Value |= uint64_t(Byte & 0x7f) << Shift;
+    Shift += 7;
+  } while (Byte >= 128);
+  // Sign extend negative numbers. A maximal 10-byte encoding leaves Shift at
+  // 70; all 64 bits are already populated and shifting by >= 64 is undefined.
+  if (Shift < 64 && (Byte & 0x40))
+    Value |= (-1ULL) << Shift;
+  if (n)
+    *n = (unsigned)(p - orig_p);
+  return Value;
+}
+
+
 /// Utility function to get the size of the ULEB128-encoded value.
 extern unsigned getULEB128Size(uint64_t Value);
 

diff --git a/include/llvm/Support/LineIterator.h b/include/llvm/Support/LineIterator.h
index 2a58262..9d4cd3b 100644
--- a/include/llvm/Support/LineIterator.h
+++ b/include/llvm/Support/LineIterator.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_LINEITERATOR_H__
-#define LLVM_SUPPORT_LINEITERATOR_H__
+#ifndef LLVM_SUPPORT_LINEITERATOR_H
+#define LLVM_SUPPORT_LINEITERATOR_H
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataTypes.h"
@@ -18,20 +18,22 @@
 
 class MemoryBuffer;
 
-/// \brief A forward iterator which reads non-blank text lines from a buffer.
+/// \brief A forward iterator which reads text lines from a buffer.
 ///
 /// This class provides a forward iterator interface for reading one line at
 /// a time from a buffer. When default constructed the iterator will be the
 /// "end" iterator.
 ///
-/// The iterator also is aware of what line number it is currently processing
-/// and can strip comment lines given the comment-starting character.
+/// The iterator is aware of what line number it is currently processing. It
+/// strips blank lines by default, and comment lines given a comment-starting
+/// character.
 ///
 /// Note that this iterator requires the buffer to be nul terminated.
 class line_iterator
     : public std::iterator<std::forward_iterator_tag, StringRef> {
   const MemoryBuffer *Buffer;
   char CommentMarker;
+  bool SkipBlanks;
 
   unsigned LineNumber;
   StringRef CurrentLine;
@@ -41,7 +43,8 @@
   line_iterator() : Buffer(nullptr) {}
 
   /// \brief Construct a new iterator around some memory buffer.
-  explicit line_iterator(const MemoryBuffer &Buffer, char CommentMarker = '\0');
+  explicit line_iterator(const MemoryBuffer &Buffer, bool SkipBlanks = true,
+                         char CommentMarker = '\0');
 
   /// \brief Return true if we've reached EOF or are an "end" iterator.
   bool is_at_eof() const { return !Buffer; }
@@ -82,4 +85,4 @@
 };
 }
 
-#endif // LLVM_SUPPORT_LINEITERATOR_H__
+#endif

diff --git a/include/llvm/Support/MD5.h b/include/llvm/Support/MD5.h
index 4eb8507..f6e1e92 100644
--- a/include/llvm/Support/MD5.h
+++ b/include/llvm/Support/MD5.h

@@ -25,8 +25,8 @@
  * See md5.c for more information.
  */
 
-#ifndef LLVM_SYSTEM_MD5_H
-#define LLVM_SYSTEM_MD5_H
+#ifndef LLVM_SUPPORT_MD5_H
+#define LLVM_SUPPORT_MD5_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"
@@ -55,11 +55,11 @@
   void update(StringRef Str);
 
   /// \brief Finishes off the hash and puts the result in result.
-  void final(MD5Result &result);
+  void final(MD5Result &Result);
 
-  /// \brief Translates the bytes in \p Res to a hex string that is
+  /// \brief Translates the bytes in \p Result to a hex string that is
   /// deposited into \p Str. The result will be of length 32.
-  static void stringifyResult(MD5Result &Res, SmallString<32> &Str);
+  static void stringifyResult(MD5Result &Result, SmallString<32> &Str);
 
 private:
   const uint8_t *body(ArrayRef<uint8_t> Data);

diff --git a/include/llvm/Support/MachO.h b/include/llvm/Support/MachO.h
index bd4dc2f..c07bd88 100644
--- a/include/llvm/Support/MachO.h
+++ b/include/llvm/Support/MachO.h

@@ -73,7 +73,10 @@
       MH_SETUID_SAFE             = 0x00080000u,
       MH_NO_REEXPORTED_DYLIBS    = 0x00100000u,
       MH_PIE                     = 0x00200000u,
-      MH_DEAD_STRIPPABLE_DYLIB   = 0x00400000u
+      MH_DEAD_STRIPPABLE_DYLIB   = 0x00400000u,
+      MH_HAS_TLV_DESCRIPTORS     = 0x00800000u,
+      MH_NO_HEAP_EXECUTION       = 0x01000000u,
+      MH_APP_EXTENSION_SAFE      = 0x02000000u
     };
 
     enum : uint32_t {
@@ -327,7 +330,8 @@
 
     enum ExportSymbolKind {
       EXPORT_SYMBOL_FLAGS_KIND_REGULAR        = 0x00u,
-      EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL   = 0x01u
+      EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL   = 0x01u,
+      EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE       = 0x02u
     };
 
 
@@ -386,13 +390,15 @@
 
     enum StabType {
       // Constant values for the "n_type" field in llvm::MachO::nlist and
-      // llvm::MachO::nlist_64 when "(n_type & NlistMaskStab) != 0"
+      // llvm::MachO::nlist_64 when "(n_type & N_STAB) != 0"
       N_GSYM    = 0x20u,
       N_FNAME   = 0x22u,
       N_FUN     = 0x24u,
       N_STSYM   = 0x26u,
       N_LCSYM   = 0x28u,
       N_BNSYM   = 0x2Eu,
+      N_PC      = 0x30u,
+      N_AST     = 0x32u,
       N_OPT     = 0x3Cu,
       N_RSYM    = 0x40u,
       N_SLINE   = 0x44u,
@@ -841,7 +847,7 @@
                           // LC_VERSION_MIN_IPHONEOS
       uint32_t cmdsize;   // sizeof(struct version_min_command)
       uint32_t version;   // X.Y.Z is encoded in nibbles xxxx.yy.zz
-      uint32_t reserved;
+      uint32_t sdk;       // X.Y.Z is encoded in nibbles xxxx.yy.zz
     };
 
     struct dyld_info_command {
@@ -957,6 +963,13 @@
     };
 
     // Structs from <mach-o/nlist.h>
+    struct nlist_base {
+      uint32_t n_strx;
+      uint8_t n_type;
+      uint8_t n_sect;
+      uint16_t n_desc;
+    };
+
     struct nlist {
       uint32_t n_strx;
       uint8_t n_type;
@@ -973,6 +986,217 @@
       uint64_t n_value;
     };
 
+
+    // Byte order swapping functions for MachO structs
+
+    inline void swapStruct(mach_header &mh) {
+      sys::swapByteOrder(mh.magic);
+      sys::swapByteOrder(mh.cputype);
+      sys::swapByteOrder(mh.cpusubtype);
+      sys::swapByteOrder(mh.filetype);
+      sys::swapByteOrder(mh.ncmds);
+      sys::swapByteOrder(mh.sizeofcmds);
+      sys::swapByteOrder(mh.flags);
+    }
+
+    inline void swapStruct(mach_header_64 &H) {
+      sys::swapByteOrder(H.magic);
+      sys::swapByteOrder(H.cputype);
+      sys::swapByteOrder(H.cpusubtype);
+      sys::swapByteOrder(H.filetype);
+      sys::swapByteOrder(H.ncmds);
+      sys::swapByteOrder(H.sizeofcmds);
+      sys::swapByteOrder(H.flags);
+      sys::swapByteOrder(H.reserved);
+    }
+
+    inline void swapStruct(load_command &lc) {
+      sys::swapByteOrder(lc.cmd);
+      sys::swapByteOrder(lc.cmdsize);
+    }
+
+    inline void swapStruct(symtab_command &lc) {
+      sys::swapByteOrder(lc.cmd);
+      sys::swapByteOrder(lc.cmdsize);
+      sys::swapByteOrder(lc.symoff);
+      sys::swapByteOrder(lc.nsyms);
+      sys::swapByteOrder(lc.stroff);
+      sys::swapByteOrder(lc.strsize);
+    }
+
+    inline void swapStruct(segment_command_64 &seg) {
+      sys::swapByteOrder(seg.cmd);
+      sys::swapByteOrder(seg.cmdsize);
+      sys::swapByteOrder(seg.vmaddr);
+      sys::swapByteOrder(seg.vmsize);
+      sys::swapByteOrder(seg.fileoff);
+      sys::swapByteOrder(seg.filesize);
+      sys::swapByteOrder(seg.maxprot);
+      sys::swapByteOrder(seg.initprot);
+      sys::swapByteOrder(seg.nsects);
+      sys::swapByteOrder(seg.flags);
+    }
+
+    inline void swapStruct(segment_command &seg) {
+      sys::swapByteOrder(seg.cmd);
+      sys::swapByteOrder(seg.cmdsize);
+      sys::swapByteOrder(seg.vmaddr);
+      sys::swapByteOrder(seg.vmsize);
+      sys::swapByteOrder(seg.fileoff);
+      sys::swapByteOrder(seg.filesize);
+      sys::swapByteOrder(seg.maxprot);
+      sys::swapByteOrder(seg.initprot);
+      sys::swapByteOrder(seg.nsects);
+      sys::swapByteOrder(seg.flags);
+    }
+
+    inline void swapStruct(section_64 &sect) {
+      sys::swapByteOrder(sect.addr);
+      sys::swapByteOrder(sect.size);
+      sys::swapByteOrder(sect.offset);
+      sys::swapByteOrder(sect.align);
+      sys::swapByteOrder(sect.reloff);
+      sys::swapByteOrder(sect.nreloc);
+      sys::swapByteOrder(sect.flags);
+      sys::swapByteOrder(sect.reserved1);
+      sys::swapByteOrder(sect.reserved2);
+    }
+
+    inline void swapStruct(section &sect) {
+      sys::swapByteOrder(sect.addr);
+      sys::swapByteOrder(sect.size);
+      sys::swapByteOrder(sect.offset);
+      sys::swapByteOrder(sect.align);
+      sys::swapByteOrder(sect.reloff);
+      sys::swapByteOrder(sect.nreloc);
+      sys::swapByteOrder(sect.flags);
+      sys::swapByteOrder(sect.reserved1);
+      sys::swapByteOrder(sect.reserved2);
+    }
+
+    inline void swapStruct(dyld_info_command &info) {
+      sys::swapByteOrder(info.cmd);
+      sys::swapByteOrder(info.cmdsize);
+      sys::swapByteOrder(info.rebase_off);
+      sys::swapByteOrder(info.rebase_size);
+      sys::swapByteOrder(info.bind_off);
+      sys::swapByteOrder(info.bind_size);
+      sys::swapByteOrder(info.weak_bind_off);
+      sys::swapByteOrder(info.weak_bind_size);
+      sys::swapByteOrder(info.lazy_bind_off);
+      sys::swapByteOrder(info.lazy_bind_size);
+      sys::swapByteOrder(info.export_off);
+      sys::swapByteOrder(info.export_size);
+    }
+
+    inline void swapStruct(dylib_command &d) {
+      sys::swapByteOrder(d.cmd);
+      sys::swapByteOrder(d.cmdsize);
+      sys::swapByteOrder(d.dylib.name);
+      sys::swapByteOrder(d.dylib.timestamp);
+      sys::swapByteOrder(d.dylib.current_version);
+      sys::swapByteOrder(d.dylib.compatibility_version);
+    }
+
+    inline void swapStruct(dylinker_command &d) {
+      sys::swapByteOrder(d.cmd);
+      sys::swapByteOrder(d.cmdsize);
+      sys::swapByteOrder(d.name);
+    }
+
+    inline void swapStruct(uuid_command &u) {
+      sys::swapByteOrder(u.cmd);
+      sys::swapByteOrder(u.cmdsize);
+    }
+
+    inline void swapStruct(source_version_command &s) {
+      sys::swapByteOrder(s.cmd);
+      sys::swapByteOrder(s.cmdsize);
+      sys::swapByteOrder(s.version);
+    }
+
+    inline void swapStruct(entry_point_command &e) {
+      sys::swapByteOrder(e.cmd);
+      sys::swapByteOrder(e.cmdsize);
+      sys::swapByteOrder(e.entryoff);
+      sys::swapByteOrder(e.stacksize);
+    }
+
+    inline void swapStruct(dysymtab_command &dst) {
+      sys::swapByteOrder(dst.cmd);
+      sys::swapByteOrder(dst.cmdsize);
+      sys::swapByteOrder(dst.ilocalsym);
+      sys::swapByteOrder(dst.nlocalsym);
+      sys::swapByteOrder(dst.iextdefsym);
+      sys::swapByteOrder(dst.nextdefsym);
+      sys::swapByteOrder(dst.iundefsym);
+      sys::swapByteOrder(dst.nundefsym);
+      sys::swapByteOrder(dst.tocoff);
+      sys::swapByteOrder(dst.ntoc);
+      sys::swapByteOrder(dst.modtaboff);
+      sys::swapByteOrder(dst.nmodtab);
+      sys::swapByteOrder(dst.extrefsymoff);
+      sys::swapByteOrder(dst.nextrefsyms);
+      sys::swapByteOrder(dst.indirectsymoff);
+      sys::swapByteOrder(dst.nindirectsyms);
+      sys::swapByteOrder(dst.extreloff);
+      sys::swapByteOrder(dst.nextrel);
+      sys::swapByteOrder(dst.locreloff);
+      sys::swapByteOrder(dst.nlocrel);
+    }
+
+    inline void swapStruct(any_relocation_info &reloc) {
+      sys::swapByteOrder(reloc.r_word0);
+      sys::swapByteOrder(reloc.r_word1);
+    }
+
+    inline void swapStruct(nlist_base &S) {
+      sys::swapByteOrder(S.n_strx);
+      sys::swapByteOrder(S.n_desc);
+    }
+
+    inline void swapStruct(nlist &sym) {
+      sys::swapByteOrder(sym.n_strx);
+      sys::swapByteOrder(sym.n_desc);
+      sys::swapByteOrder(sym.n_value);
+    }
+
+    inline void swapStruct(nlist_64 &sym) {
+      sys::swapByteOrder(sym.n_strx);
+      sys::swapByteOrder(sym.n_desc);
+      sys::swapByteOrder(sym.n_value);
+    }
+
+    inline void swapStruct(linkedit_data_command &C) {
+      sys::swapByteOrder(C.cmd);
+      sys::swapByteOrder(C.cmdsize);
+      sys::swapByteOrder(C.dataoff);
+      sys::swapByteOrder(C.datasize);
+    }
+
+    inline void swapStruct(linker_options_command &C) {
+      sys::swapByteOrder(C.cmd);
+      sys::swapByteOrder(C.cmdsize);
+      sys::swapByteOrder(C.count);
+    }
+
+    inline void swapStruct(version_min_command&C) {
+      sys::swapByteOrder(C.cmd);
+      sys::swapByteOrder(C.cmdsize);
+      sys::swapByteOrder(C.version);
+      sys::swapByteOrder(C.sdk);
+    }
+
+    inline void swapStruct(data_in_code_entry &C) {
+      sys::swapByteOrder(C.offset);
+      sys::swapByteOrder(C.length);
+      sys::swapByteOrder(C.kind);
+    }
+
+    inline void swapStruct(uint32_t &C) {
+      sys::swapByteOrder(C);
+    }
+
     // Get/Set functions from <mach-o/nlist.h>
 
     static inline uint16_t GET_LIBRARY_ORDINAL(uint16_t n_desc) {

diff --git a/include/llvm/Support/ManagedStatic.h b/include/llvm/Support/ManagedStatic.h
index d8fbfeb..addd34e 100644
--- a/include/llvm/Support/ManagedStatic.h
+++ b/include/llvm/Support/ManagedStatic.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_MANAGED_STATIC_H
-#define LLVM_SUPPORT_MANAGED_STATIC_H
+#ifndef LLVM_SUPPORT_MANAGEDSTATIC_H
+#define LLVM_SUPPORT_MANAGEDSTATIC_H
 
 #include "llvm/Support/Atomic.h"
 #include "llvm/Support/Threading.h"

diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index 0abba62..9d16182 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h

@@ -22,7 +22,6 @@
 
 #ifdef _MSC_VER
 #include <intrin.h>
-#include <limits>
 #endif
 
 namespace llvm {
@@ -81,7 +80,7 @@
   if (ZB != ZB_Undefined && Val == 0)
     return 32;
 
-#if __has_builtin(__builtin_ctz) || __GNUC_PREREQ(4, 0)
+#if __has_builtin(__builtin_ctz) || LLVM_GNUC_PREREQ(4, 0, 0)
   return __builtin_ctz(Val);
 #elif _MSC_VER
   unsigned long Index;
@@ -96,7 +95,7 @@
   if (ZB != ZB_Undefined && Val == 0)
     return 64;
 
-#if __has_builtin(__builtin_ctzll) || __GNUC_PREREQ(4, 0)
+#if __has_builtin(__builtin_ctzll) || LLVM_GNUC_PREREQ(4, 0, 0)
   return __builtin_ctzll(Val);
 #elif _MSC_VER
   unsigned long Index;
@@ -147,7 +146,7 @@
   if (ZB != ZB_Undefined && Val == 0)
     return 32;
 
-#if __has_builtin(__builtin_clz) || __GNUC_PREREQ(4, 0)
+#if __has_builtin(__builtin_clz) || LLVM_GNUC_PREREQ(4, 0, 0)
   return __builtin_clz(Val);
 #elif _MSC_VER
   unsigned long Index;
@@ -162,7 +161,7 @@
   if (ZB != ZB_Undefined && Val == 0)
     return 64;
 
-#if __has_builtin(__builtin_clzll) || __GNUC_PREREQ(4, 0)
+#if __has_builtin(__builtin_clzll) || LLVM_GNUC_PREREQ(4, 0, 0)
   return __builtin_clzll(Val);
 #elif _MSC_VER
   unsigned long Index;
@@ -550,16 +549,23 @@
   return (A | B) & (1 + ~(A | B));
 }
 
-/// \brief Aligns \c Ptr to \c Alignment bytes, rounding up.
+/// \brief Aligns \c Addr to \c Alignment bytes, rounding up.
 ///
 /// Alignment should be a power of two.  This method rounds up, so
-/// AlignPtr(7, 4) == 8 and AlignPtr(8, 4) == 8.
-inline char *alignPtr(char *Ptr, size_t Alignment) {
+/// alignAddr(7, 4) == 8 and alignAddr(8, 4) == 8.
+inline uintptr_t alignAddr(void *Addr, size_t Alignment) {
   assert(Alignment && isPowerOf2_64((uint64_t)Alignment) &&
          "Alignment is not a power of two!");
 
-  return (char *)(((uintptr_t)Ptr + Alignment - 1) &
-                  ~(uintptr_t)(Alignment - 1));
+  assert((uintptr_t)Addr + Alignment - 1 >= (uintptr_t)Addr);
+
+  return (((uintptr_t)Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1));
+}
+
+/// \brief Returns the necessary adjustment for aligning \c Ptr to \c Alignment
+/// bytes, rounding up.
+inline size_t alignmentAdjustment(void *Ptr, size_t Alignment) {
+  return alignAddr(Ptr, Alignment) - (uintptr_t)Ptr;
 }
 
 /// NextPowerOf2 - Returns the next power of two (in 64-bits)
@@ -589,9 +595,10 @@
 ///   RoundUpToAlignment(5, 8) = 8
 ///   RoundUpToAlignment(17, 8) = 24
 ///   RoundUpToAlignment(~0LL, 8) = 0
+///   RoundUpToAlignment(321, 255) = 510
 /// \endcode
 inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align) {
-  return ((Value + Align - 1) / Align) * Align;
+  return (Value + Align - 1) / Align * Align;
 }
 
 /// Returns the offset to the next integer (mod 2**64) that is greater than
@@ -632,13 +639,7 @@
   return int64_t(X << (64 - B)) >> (64 - B);
 }
 
-#if defined(_MSC_VER)
-  // Visual Studio defines the HUGE_VAL class of macros using purposeful
-  // constant arithmetic overflow, which it then warns on when encountered.
-  const float huge_valf = std::numeric_limits<float>::infinity();
-#else
-  const float huge_valf = HUGE_VALF;
-#endif
+extern const float huge_valf;
 } // End llvm namespace
 
 #endif

diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index 147be47..e2f8d7e 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h

@@ -24,11 +24,13 @@
 #include <system_error>
 
 namespace llvm {
-/// MemoryBuffer - This interface provides simple read-only access to a block
-/// of memory, and provides simple methods for reading files and standard input
-/// into a memory buffer.  In addition to basic access to the characters in the
-/// file, this interface guarantees you can read one character past the end of
-/// the file, and that this character will read as '\0'.
+class MemoryBufferRef;
+
+/// This interface provides simple read-only access to a block of memory, and
+/// provides simple methods for reading files and standard input into a memory
+/// buffer.  In addition to basic access to the characters in the file, this
+/// interface guarantees you can read one character past the end of the file,
+/// and that this character will read as '\0'.
 ///
 /// The '\0' guarantee is needed to support an optimization -- it's intended to
 /// be more efficient for clients which are reading all the data to stop
@@ -55,8 +57,8 @@
     return StringRef(BufferStart, getBufferSize());
   }
 
-  /// getBufferIdentifier - Return an identifier for this buffer, typically the
-  /// filename it was read from.
+  /// Return an identifier for this buffer, typically the filename it was read
+  /// from.
   virtual const char *getBufferIdentifier() const {
     return "Unknown buffer";
   }
@@ -70,19 +72,15 @@
   /// changing, e.g. when libclang tries to parse while the user is
   /// editing/updating the file.
   static ErrorOr<std::unique_ptr<MemoryBuffer>>
-  getFile(Twine Filename, int64_t FileSize = -1,
+  getFile(const Twine &Filename, int64_t FileSize = -1,
           bool RequiresNullTerminator = true, bool IsVolatileSize = false);
 
   /// Given an already-open file descriptor, map some slice of it into a
   /// MemoryBuffer. The slice is specified by an \p Offset and \p MapSize.
   /// Since this is in the middle of a file, the buffer is not null terminated.
-  ///
-  /// \param IsVolatileSize Set to true to indicate that the file size may be
-  /// changing, e.g. when libclang tries to parse while the user is
-  /// editing/updating the file.
   static ErrorOr<std::unique_ptr<MemoryBuffer>>
-  getOpenFileSlice(int FD, const char *Filename, uint64_t MapSize,
-                   int64_t Offset, bool IsVolatileSize = false);
+  getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,
+                   int64_t Offset);
 
   /// Given an already-open file descriptor, read the file and return a
   /// MemoryBuffer.
@@ -91,33 +89,34 @@
   /// changing, e.g. when libclang tries to parse while the user is
   /// editing/updating the file.
   static ErrorOr<std::unique_ptr<MemoryBuffer>>
-  getOpenFile(int FD, const char *Filename, uint64_t FileSize,
+  getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,
               bool RequiresNullTerminator = true, bool IsVolatileSize = false);
 
-  /// getMemBuffer - Open the specified memory range as a MemoryBuffer.  Note
-  /// that InputData must be null terminated if RequiresNullTerminator is true.
-  static MemoryBuffer *getMemBuffer(StringRef InputData,
-                                    StringRef BufferName = "",
-                                    bool RequiresNullTerminator = true);
+  /// Open the specified memory range as a MemoryBuffer. Note that InputData
+  /// must be null terminated if RequiresNullTerminator is true.
+  static std::unique_ptr<MemoryBuffer>
+  getMemBuffer(StringRef InputData, StringRef BufferName = "",
+               bool RequiresNullTerminator = true);
 
-  /// getMemBufferCopy - Open the specified memory range as a MemoryBuffer,
-  /// copying the contents and taking ownership of it.  InputData does not
-  /// have to be null terminated.
-  static MemoryBuffer *getMemBufferCopy(StringRef InputData,
-                                        StringRef BufferName = "");
+  static std::unique_ptr<MemoryBuffer>
+  getMemBuffer(MemoryBufferRef Ref, bool RequiresNullTerminator = true);
 
-  /// getNewMemBuffer - Allocate a new MemoryBuffer of the specified size that
-  /// is completely initialized to zeros.  Note that the caller should
-  /// initialize the memory allocated by this method.  The memory is owned by
-  /// the MemoryBuffer object.
-  static MemoryBuffer *getNewMemBuffer(size_t Size, StringRef BufferName = "");
+  /// Open the specified memory range as a MemoryBuffer, copying the contents
+  /// and taking ownership of it. InputData does not have to be null terminated.
+  static std::unique_ptr<MemoryBuffer>
+  getMemBufferCopy(StringRef InputData, const Twine &BufferName = "");
 
-  /// getNewUninitMemBuffer - Allocate a new MemoryBuffer of the specified size
-  /// that is not initialized.  Note that the caller should initialize the
-  /// memory allocated by this method.  The memory is owned by the MemoryBuffer
-  /// object.
-  static MemoryBuffer *getNewUninitMemBuffer(size_t Size,
-                                             StringRef BufferName = "");
+  /// Allocate a new zero-initialized MemoryBuffer of the specified size. Note
+  /// that the caller need not initialize the memory allocated by this method.
+  /// The memory is owned by the MemoryBuffer object.
+  static std::unique_ptr<MemoryBuffer>
+  getNewMemBuffer(size_t Size, StringRef BufferName = "");
+
+  /// Allocate a new MemoryBuffer of the specified size that is not initialized.
+  /// Note that the caller should initialize the memory allocated by this
+  /// method. The memory is owned by the MemoryBuffer object.
+  static std::unique_ptr<MemoryBuffer>
+  getNewUninitMemBuffer(size_t Size, const Twine &BufferName = "");
 
   /// Read all of stdin into a file buffer, and return it.
   static ErrorOr<std::unique_ptr<MemoryBuffer>> getSTDIN();
@@ -125,7 +124,11 @@
   /// Open the specified file as a MemoryBuffer, or open stdin if the Filename
   /// is "-".
   static ErrorOr<std::unique_ptr<MemoryBuffer>>
-  getFileOrSTDIN(StringRef Filename, int64_t FileSize = -1);
+  getFileOrSTDIN(const Twine &Filename, int64_t FileSize = -1);
+
+  /// Map a subrange of the specified file as a MemoryBuffer.
+  static ErrorOr<std::unique_ptr<MemoryBuffer>>
+  getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset);
 
   //===--------------------------------------------------------------------===//
   // Provided for performance analysis.
@@ -139,7 +142,27 @@
 
   /// Return information on the memory mechanism used to support the
   /// MemoryBuffer.
-  virtual BufferKind getBufferKind() const = 0;  
+  virtual BufferKind getBufferKind() const = 0;
+
+  MemoryBufferRef getMemBufferRef() const;
+};
+
+class MemoryBufferRef { // A buffer's contents paired with its identifier.
+  StringRef Buffer;     ///< The buffer contents; see getBuffer().
+  StringRef Identifier; ///< The buffer's name; see getBufferIdentifier().
+
+public:
+  MemoryBufferRef() {} // Both StringRefs are default-constructed.
+  MemoryBufferRef(StringRef Buffer, StringRef Identifier)
+      : Buffer(Buffer), Identifier(Identifier) {}
+
+  StringRef getBuffer() const { return Buffer; }
+
+  StringRef getBufferIdentifier() const { return Identifier; }
+
+  const char *getBufferStart() const { return Buffer.begin(); }
+  const char *getBufferEnd() const { return Buffer.end(); }
+  size_t getBufferSize() const { return Buffer.size(); }
 };
 
 // Create wrappers for C Binding types (see CBindingWrapping.h).

diff --git a/include/llvm/Support/MemoryObject.h b/include/llvm/Support/MemoryObject.h
index 17aa9d2..e0c8749 100644
--- a/include/llvm/Support/MemoryObject.h
+++ b/include/llvm/Support/MemoryObject.h

@@ -14,49 +14,53 @@
 
 namespace llvm {
 
-/// MemoryObject - Abstract base class for contiguous addressable memory.
-///   Necessary for cases in which the memory is in another process, in a
-///   file, or on a remote machine.
-///   All size and offset parameters are uint64_ts, to allow 32-bit processes
-///   access to 64-bit address spaces.
+/// Interface to data which might be streamed. Streamability has 2 important
+/// implications/restrictions. First, the data might not yet exist in memory
+/// when the request is made. This just means that readByte/readBytes might have
+/// to block or do some work to get it. More significantly, the exact size of
+/// the object might not be known until it has all been fetched. This means that
+/// to return the right result, getExtent must also wait for all the data to
+/// arrive; therefore it should not be called on objects which are actually
+/// streamed (this would defeat the purpose of streaming). Instead,
+/// isValidAddress can be used to test addresses without knowing the exact size
+/// of the stream. Finally, getPointer can be used instead of readBytes to avoid
+/// extra copying.
 class MemoryObject {
 public:
-  /// Destructor      - Override as necessary.
   virtual ~MemoryObject();
 
-  /// getBase         - Returns the lowest valid address in the region.
-  ///
-  /// @result         - The lowest valid address.
-  virtual uint64_t getBase() const = 0;
-
-  /// getExtent       - Returns the size of the region in bytes.  (The region is
-  ///                   contiguous, so the highest valid address of the region
-  ///                   is getBase() + getExtent() - 1).
+  /// Returns the size of the region in bytes.  (The region is contiguous, so
+  /// the highest valid address of the region is getExtent() - 1).
   ///
   /// @result         - The size of the region.
   virtual uint64_t getExtent() const = 0;
 
-  /// readByte        - Tries to read a single byte from the region.
+  /// Tries to read a contiguous range of bytes from the region, up to the end
+  /// of the region.
   ///
-  /// @param address  - The address of the byte, in the same space as getBase().
-  /// @param ptr      - A pointer to a byte to be filled in.  Must be non-NULL.
-  /// @result         - 0 if successful; -1 if not.  Failure may be due to a
-  ///                   bounds violation or an implementation-specific error.
-  virtual int readByte(uint64_t address, uint8_t *ptr) const = 0;
-
-  /// readBytes       - Tries to read a contiguous range of bytes from the
-  ///                   region, up to the end of the region.
-  ///                   You should override this function if there is a quicker
-  ///                   way than going back and forth with individual bytes.
-  ///
-  /// @param address  - The address of the first byte, in the same space as 
-  ///                   getBase().
-  /// @param size     - The number of bytes to copy.
-  /// @param buf      - A pointer to a buffer to be filled in.  Must be non-NULL
+  /// @param Buf      - A pointer to a buffer to be filled in.  Must be non-NULL
   ///                   and large enough to hold size bytes.
-  /// @result         - 0 if successful; -1 if not.  Failure may be due to a
-  ///                   bounds violation or an implementation-specific error.
-  virtual int readBytes(uint64_t address, uint64_t size, uint8_t *buf) const;
+  /// @param Size     - The number of bytes to copy.
+  /// @param Address  - The address of the first byte, as a zero-based offset
+  ///                   into the region.
+  /// @result         - The number of bytes read.
+  virtual uint64_t readBytes(uint8_t *Buf, uint64_t Size,
+                             uint64_t Address) const = 0;
+
+  /// Ensures that the requested data is in memory, and returns a pointer to it.
+  /// More efficient than using readBytes if the data is already in memory. May
+  /// block until (address + size) bytes have been read.
+  /// @param address - address of the byte, as a zero-based offset
+  /// @param size    - amount of data that must be available on return
+  /// @result        - valid pointer to the requested data
+  virtual const uint8_t *getPointer(uint64_t address, uint64_t size) const = 0;
+
+  /// Returns true if the address is within the object (i.e. between 0 and
+  /// getExtent() - 1 inclusive). May block until address bytes have been
+  /// read.
+  /// @param address - address of the byte, as a zero-based offset
+  /// @result        - true if the address may be read with readBytes()
+  virtual bool isValidAddress(uint64_t address) const = 0;
 };
 
 }

diff --git a/include/llvm/Support/Mutex.h b/include/llvm/Support/Mutex.h
index 496a438..97dd501 100644
--- a/include/llvm/Support/Mutex.h
+++ b/include/llvm/Support/Mutex.h

@@ -86,16 +86,17 @@
     /// indicates whether this mutex should become a no-op when we're not
     /// running in multithreaded mode.
     template<bool mt_only>
-    class SmartMutex : public MutexImpl {
+    class SmartMutex {
+      MutexImpl impl;
       unsigned acquired;
       bool recursive;
     public:
       explicit SmartMutex(bool rec = true) :
-        MutexImpl(rec), acquired(0), recursive(rec) { }
+        impl(rec), acquired(0), recursive(rec) { }
 
-      bool acquire() {
+      bool lock() {
         if (!mt_only || llvm_is_multithreaded()) {
-          return MutexImpl::acquire();
+          return impl.acquire();
         } else {
           // Single-threaded debugging code.  This would be racy in
           // multithreaded mode, but provides not sanity checks in single
@@ -106,9 +107,9 @@
         }
       }
 
-      bool release() {
+      bool unlock() {
         if (!mt_only || llvm_is_multithreaded()) {
-          return MutexImpl::release();
+          return impl.release();
         } else {
           // Single-threaded debugging code.  This would be racy in
           // multithreaded mode, but provides not sanity checks in single
@@ -120,9 +121,9 @@
         }
       }
 
-      bool tryacquire() {
+      bool try_lock() {
         if (!mt_only || llvm_is_multithreaded())
-          return MutexImpl::tryacquire();
+          return impl.tryacquire();
         else return true;
       }
 
@@ -140,11 +141,11 @@
 
     public:
       SmartScopedLock(SmartMutex<mt_only>& m) : mtx(m) {
-        mtx.acquire();
+        mtx.lock();
       }
 
       ~SmartScopedLock() {
-        mtx.release();
+        mtx.unlock();
       }
     };
 

diff --git a/include/llvm/Support/MutexGuard.h b/include/llvm/Support/MutexGuard.h
index 6bb1622..b9f941d 100644
--- a/include/llvm/Support/MutexGuard.h
+++ b/include/llvm/Support/MutexGuard.h

@@ -29,8 +29,8 @@
     MutexGuard(const MutexGuard &) LLVM_DELETED_FUNCTION;
     void operator=(const MutexGuard &) LLVM_DELETED_FUNCTION;
   public:
-    MutexGuard(sys::Mutex &m) : M(m) { M.acquire(); }
-    ~MutexGuard() { M.release(); }
+    MutexGuard(sys::Mutex &m) : M(m) { M.lock(); }
+    ~MutexGuard() { M.unlock(); }
     /// holds - Returns true if this locker instance holds the specified lock.
     /// This is mostly used in assertions to validate that the correct mutex
     /// is held.

diff --git a/include/llvm/Support/OnDiskHashTable.h b/include/llvm/Support/OnDiskHashTable.h
index f6d43a4..b039fae 100644
--- a/include/llvm/Support/OnDiskHashTable.h
+++ b/include/llvm/Support/OnDiskHashTable.h

@@ -11,8 +11,8 @@
 /// \brief Defines facilities for reading and writing on-disk hash tables.
 ///
 //===----------------------------------------------------------------------===//
-#ifndef LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
-#define LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
+#ifndef LLVM_SUPPORT_ONDISKHASHTABLE_H
+#define LLVM_SUPPORT_ONDISKHASHTABLE_H
 
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/AlignOf.h"
@@ -568,4 +568,4 @@
 
 } // end namespace llvm
 
-#endif // LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
+#endif

diff --git a/include/llvm/Support/Options.h b/include/llvm/Support/Options.h
new file mode 100644
index 0000000..4fd1bff
--- /dev/null
+++ b/include/llvm/Support/Options.h

@@ -0,0 +1,120 @@
+//===- llvm/Support/Options.h - Debug options support -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares helper objects for defining debug options that can be
+/// configured via the command line. The new API currently builds on the cl::opt
+/// API, but does not require the use of static globals.
+///
+/// With this API options are registered during initialization. For passes, this
+/// happens during pass initialization. Passes with options will call a static
+/// registerOptions method during initialization that registers options with the
+/// OptionRegistry. An example implementation of registerOptions is:
+///
+/// static void registerOptions() {
+///   OptionRegistry::registerOption<bool, Scalarizer,
+///                                &Scalarizer::ScalarizeLoadStore>(
+///       "scalarize-load-store",
+///       "Allow the scalarizer pass to scalarize loads and store", false);
+/// }
+///
+/// When reading data for options the interface is via the LLVMContext. Option
+/// data for passes should be read from the context during doInitialization. An
+/// example of reading the above option would be:
+///
+/// ScalarizeLoadStore =
+///   M.getContext().getOption<bool,
+///                            Scalarizer,
+///                            &Scalarizer::ScalarizeLoadStore>();
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_OPTIONS_H
+#define LLVM_SUPPORT_OPTIONS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace llvm {
+
+namespace detail {
+
+// Options are keyed off the unique address of a static character synthesized
+// based on template arguments.
+template <typename ValT, typename Base, ValT(Base::*Mem)> class OptionKey {
+public:
+  static char ID; // Its address serves as the option's key.
+};
+
+template <typename ValT, typename Base, ValT(Base::*Mem)>
+char OptionKey<ValT, Base, Mem>::ID = 0;
+
+} // namespace detail
+
+/// \brief Singleton class used to register debug options.
+///
+/// The OptionRegistry is responsible for managing lifetimes of the options and
+/// provides interfaces for option registration and reading values from options.
+/// This object is a singleton, only one instance should ever exist so that all
+/// options are registered in the same place.
+class OptionRegistry {
+private:
+  DenseMap<void *, cl::Option *> Options;
+
+  /// \brief Adds a cl::Option to the registry.
+  ///
+  /// \param Key unique key for option
+  /// \param O option to map to \p Key
+  ///
+  /// Allocated cl::Options are owned by the OptionRegistry and are deallocated
+  /// on destruction or removal.
+  void addOption(void *Key, cl::Option *O);
+
+public:
+  ~OptionRegistry();
+  OptionRegistry() {}
+
+  /// \brief Returns a reference to the singleton instance.
+  static OptionRegistry &instance();
+
+  /// \brief Registers an option with the OptionRegistry singleton.
+  ///
+  /// \tparam ValT type of the option's data
+  /// \tparam Base class used to key the option
+  /// \tparam Mem member of \p Base used for keying the option
+  ///
+  /// Options are keyed off the template parameters to generate unique static
+  /// characters. The template parameters are (1) the type of the data the
+  /// option stores (\p ValT), (2) the class that will read the option
+  /// (\p Base), and (3) the member the class will store the data into (\p Mem).
+  template <typename ValT, typename Base, ValT(Base::*Mem)>
+  static void registerOption(const char *ArgStr, const char *Desc,
+                             const ValT &InitValue) {
+    cl::opt<ValT> *Option = new cl::opt<ValT>(ArgStr, cl::desc(Desc),
+                                              cl::Hidden, cl::init(InitValue));
+    instance().addOption(&detail::OptionKey<ValT, Base, Mem>::ID, Option);
+  }
+
+  /// \brief Returns the value of the option.
+  ///
+  /// \tparam ValT type of the option's data
+  /// \tparam Base class used to key the option
+  /// \tparam Mem member of \p Base used for keying the option
+  ///
+  /// Reads option values based on the key generated by the template parameters.
+  /// Keying for get() is the same as keying for registerOption.
+  template <typename ValT, typename Base, ValT(Base::*Mem)> ValT get() const {
+    auto It = Options.find(&detail::OptionKey<ValT, Base, Mem>::ID);
+    assert(It != Options.end() && "Option not in OptionRegistry");
+    return *(cl::opt<ValT> *)It->second; // cl::opt<ValT> converts to ValT.
+  }
+};
+
+} // namespace llvm
+
+#endif

diff --git a/include/llvm/Support/Path.h b/include/llvm/Support/Path.h
index cf821f0..8fae853 100644
--- a/include/llvm/Support/Path.h
+++ b/include/llvm/Support/Path.h

@@ -30,13 +30,13 @@
 
 /// @brief Path iterator.
 ///
-/// This is a bidirectional iterator that iterates over the individual
-/// components in \a path. The forward traversal order is as follows:
+/// This is an input iterator that iterates over the individual components in
+/// \a path. The traversal order is as follows:
 /// * The root-name element, if present.
 /// * The root-directory element, if present.
 /// * Each successive filename element, if present.
 /// * Dot, if one or more trailing non-root slash characters are present.
-/// The backwards traversal order is the reverse of forward traversal.
+/// Traversing backwards is possible with \a reverse_iterator
 ///
 /// Iteration examples. Each component is separated by ',':
 /// @code
@@ -47,7 +47,8 @@
 ///   ../        => ..,.
 ///   C:\foo\bar => C:,/,foo,bar
 /// @endcode
-class const_iterator {
+class const_iterator
+    : public std::iterator<std::input_iterator_tag, const StringRef> {
   StringRef Path;      ///< The entire path.
   StringRef Component; ///< The current component. Not necessarily in Path.
   size_t    Position;  ///< The iterators current position within Path.
@@ -57,26 +58,39 @@
   friend const_iterator end(StringRef path);
 
 public:
-  typedef const StringRef value_type;
-  typedef ptrdiff_t difference_type;
-  typedef value_type &reference;
-  typedef value_type *pointer;
-  typedef std::bidirectional_iterator_tag iterator_category;
-
   reference operator*() const { return Component; }
   pointer   operator->() const { return &Component; }
   const_iterator &operator++();    // preincrement
   const_iterator &operator++(int); // postincrement
-  const_iterator &operator--();    // predecrement
-  const_iterator &operator--(int); // postdecrement
   bool operator==(const const_iterator &RHS) const;
-  bool operator!=(const const_iterator &RHS) const;
+  bool operator!=(const const_iterator &RHS) const { return !(*this == RHS); }
 
   /// @brief Difference in bytes between this and RHS.
   ptrdiff_t operator-(const const_iterator &RHS) const;
 };
 
-typedef std::reverse_iterator<const_iterator> reverse_iterator;
+/// @brief Reverse path iterator.
+///
+/// This is an input iterator that iterates over the individual components in
+/// \a path in reverse order. The traversal order is exactly reversed from that
+/// of \a const_iterator.
+class reverse_iterator
+    : public std::iterator<std::input_iterator_tag, const StringRef> {
+  StringRef Path;      ///< The entire path.
+  StringRef Component; ///< The current component. Not necessarily in Path.
+  size_t    Position;  ///< The iterator's current position within Path.
+
+  friend reverse_iterator rbegin(StringRef path);
+  friend reverse_iterator rend(StringRef path);
+
+public:
+  reference operator*() const { return Component; }
+  pointer   operator->() const { return &Component; }
+  reverse_iterator &operator++();    // preincrement
+  reverse_iterator &operator++(int); // postincrement
+  bool operator==(const reverse_iterator &RHS) const;
+  bool operator!=(const reverse_iterator &RHS) const { return !(*this == RHS); }
+};
 
 /// @brief Get begin iterator over \a path.
 /// @param path Input path.
@@ -91,16 +105,12 @@
 /// @brief Get reverse begin iterator over \a path.
 /// @param path Input path.
 /// @returns Iterator initialized with the first reverse component of \a path.
-inline reverse_iterator rbegin(StringRef path) {
-  return reverse_iterator(end(path));
-}
+reverse_iterator rbegin(StringRef path);
 
 /// @brief Get reverse end iterator over \a path.
 /// @param path Input path.
 /// @returns Iterator initialized to the reverse end of \a path.
-inline reverse_iterator rend(StringRef path) {
-  return reverse_iterator(begin(path));
-}
+reverse_iterator rend(StringRef path);
 
 /// @}
 /// @name Lexical Modifiers
@@ -194,7 +204,7 @@
 ///
 /// @param path Input path.
 /// @result The root name of \a path if it has one, otherwise "".
-const StringRef root_name(StringRef path);
+StringRef root_name(StringRef path);
 
 /// @brief Get root directory.
 ///
@@ -207,7 +217,7 @@
 /// @param path Input path.
 /// @result The root directory of \a path if it has one, otherwise
 ///               "".
-const StringRef root_directory(StringRef path);
+StringRef root_directory(StringRef path);
   
 /// @brief Get root path.
 ///
@@ -215,7 +225,7 @@
 ///
 /// @param path Input path.
 /// @result The root path of \a path if it has one, otherwise "".
-const StringRef root_path(StringRef path);
+StringRef root_path(StringRef path);
 
 /// @brief Get relative path.
 ///
@@ -227,7 +237,7 @@
 ///
 /// @param path Input path.
 /// @result The path starting after root_path if one exists, otherwise "".
-const StringRef relative_path(StringRef path);
+StringRef relative_path(StringRef path);
 
 /// @brief Get parent path.
 ///
@@ -239,7 +249,7 @@
 ///
 /// @param path Input path.
 /// @result The parent path of \a path if one exists, otherwise "".
-const StringRef parent_path(StringRef path);
+StringRef parent_path(StringRef path);
 
 /// @brief Get filename.
 ///
@@ -253,7 +263,7 @@
 /// @param path Input path.
 /// @result The filename part of \a path. This is defined as the last component
 ///         of \a path.
-const StringRef filename(StringRef path);
+StringRef filename(StringRef path);
 
 /// @brief Get stem.
 ///
@@ -271,7 +281,7 @@
 ///
 /// @param path Input path.
 /// @result The stem of \a path.
-const StringRef stem(StringRef path);
+StringRef stem(StringRef path);
 
 /// @brief Get extension.
 ///
@@ -287,7 +297,7 @@
 ///
 /// @param path Input path.
 /// @result The extension of \a path.
-const StringRef extension(StringRef path);
+StringRef extension(StringRef path);
 
 /// @brief Check whether the given char is a path separator on the host OS.
 ///
@@ -298,7 +308,7 @@
 /// @brief Return the preferred separator for this platform.
 ///
 /// @result StringRef of the preferred separator, null-terminated.
-const StringRef get_separator();
+StringRef get_separator();
 
 /// @brief Get the typical temporary directory for the system, e.g., 
 /// "/var/tmp" or "C:/TEMP"

diff --git a/include/llvm/Support/Process.h b/include/llvm/Support/Process.h
index 30973de..8616679 100644
--- a/include/llvm/Support/Process.h
+++ b/include/llvm/Support/Process.h

@@ -186,6 +186,21 @@
                     ArrayRef<const char *> ArgsFromMain,
                     SpecificBumpPtrAllocator<char> &ArgAllocator);
 
+  // This function ensures that the standard file descriptors (input, output,
+  // and error) are properly mapped to a file descriptor before we use any of
+  // them.  This should only be called by standalone programs, library
+  // components should not call this.
+  static std::error_code FixupStandardFileDescriptors();
+
+  // This function safely closes a file descriptor.  It is not safe to retry
+  // close(2) when it returns with errno equivalent to EINTR; this is because
+  // *nixen cannot agree if the file descriptor is, in fact, closed when this
+  // occurs.
+  //
+  // N.B. Some operating systems, due to thread cancellation, cannot properly
+  // guarantee that it will or will not be closed one way or the other!
+  static std::error_code SafelyCloseFileDescriptor(int FD);
+
   /// This function determines if the standard input is connected directly
   /// to a user's input (keyboard probably), rather than coming from a file
   /// or pipe.

diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h
index 51279a9..40dc60f 100644
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h

@@ -15,6 +15,7 @@
 #define LLVM_SUPPORT_PROGRAM_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/Path.h"
 #include <system_error>
 
@@ -51,17 +52,22 @@
   ProcessInfo();
 };
 
-  /// This function attempts to locate a program in the operating
-  /// system's file system using some pre-determined set of locations to search
-  /// (e.g. the PATH on Unix). Paths with slashes are returned unmodified.
+  /// \brief Find the first executable file \p Name in \p Paths.
   ///
-  /// It does not perform hashing as a shell would but instead stats each PATH
+  /// This does not perform hashing as a shell would but instead stats each PATH
   /// entry individually so should generally be avoided. Core LLVM library
   /// functions and options should instead require fully specified paths.
   ///
-  /// @returns A string containing the path of the program or an empty string if
-  /// the program could not be found.
-  std::string FindProgramByName(const std::string& name);
+  /// \param Name name of the executable to find. If it contains any system
+  ///   slashes, it will be returned as is.
+  /// \param Paths optional list of paths to search for \p Name. If empty it
+  ///   will use the system PATH environment instead.
+  ///
+  /// \returns The fully qualified path to the first \p Name in \p Paths if it
+  ///   exists. \p Name if \p Name has slashes in it. Otherwise an error.
+  ErrorOr<std::string>
+  findProgramByName(StringRef Name,
+                    ArrayRef<StringRef> Paths = ArrayRef<StringRef>());
 
   // These functions change the specified standard stream (stdin or stdout) to
   // binary mode. They return errc::success if the specified stream
@@ -82,7 +88,7 @@
   /// -2 indicates a crash during execution or timeout
   int ExecuteAndWait(
       StringRef Program, ///< Path of the program to be executed. It is
-      /// presumed this is the result of the FindProgramByName method.
+      /// presumed this is the result of the findProgramByName method.
       const char **args, ///< A vector of strings that are passed to the
       ///< program.  The first element should be the name of the program.
       ///< The list *must* be terminated by a null char* entry.
@@ -126,6 +132,40 @@
   /// argument length limits.
   bool argumentsFitWithinSystemLimits(ArrayRef<const char*> Args);
 
+  /// File encoding options when writing contents that a non-UTF8 tool will
+  /// read (on Windows systems). For UNIX, we always use UTF-8.
+  enum WindowsEncodingMethod {
+    /// UTF-8 is the LLVM native encoding, being the same as "do not perform
+    /// encoding conversion".
+    WEM_UTF8,
+    WEM_CurrentCodePage, ///< For tools that only read legacy code pages.
+    WEM_UTF16            ///< For tools that only read UTF-16.
+  };
+
+  /// Saves the UTF8-encoded \p Contents string into the file \p FileName
+  /// using a specific encoding.
+  ///
+  /// This write file function adds the possibility to choose which encoding
+  /// to use when writing a text file. On Windows, this is important when
+  /// writing files with internationalization support with an encoding that is
+  /// different from the one used in LLVM (UTF-8). We use this when writing
+  /// response files, since GCC tools on MinGW only understand legacy code
+  /// pages, and VisualStudio tools only understand UTF-16.
+  /// For UNIX, using different encodings is silently ignored, since all tools
+  /// work well with UTF-8.
+  /// This function assumes that you only use UTF-8 *text* data and will convert
+  /// it to your desired encoding before writing to the file.
+  ///
+  /// FIXME: We use WEM_CurrentCodePage to write response files for GNU tools in
+  /// a MinGW/MinGW-w64 environment, which has serious flaws but currently is
+  /// our best shot to make gcc/ld understand international characters. This
+  /// should be changed as soon as binutils fix this to support UTF16 on mingw.
+  ///
+  /// \returns non-zero error_code if failed
+  std::error_code
+  writeFileWithEncoding(StringRef FileName, StringRef Contents,
+                        WindowsEncodingMethod Encoding = WEM_UTF8);
+
   /// This function waits for the process specified by \p PI to finish.
   /// \returns A \see ProcessInfo struct with Pid set to:
   /// \li The process id of the child process if the child process has changed

diff --git a/include/llvm/Support/RWMutex.h b/include/llvm/Support/RWMutex.h
index 935b307..b80b855 100644
--- a/include/llvm/Support/RWMutex.h
+++ b/include/llvm/Support/RWMutex.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SYSTEM_RWMUTEX_H
-#define LLVM_SYSTEM_RWMUTEX_H
+#ifndef LLVM_SUPPORT_RWMUTEX_H
+#define LLVM_SUPPORT_RWMUTEX_H
 
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Threading.h"
@@ -85,14 +85,15 @@
     /// indicates whether this mutex should become a no-op when we're not
     /// running in multithreaded mode.
     template<bool mt_only>
-    class SmartRWMutex : public RWMutexImpl {
+    class SmartRWMutex {
+      RWMutexImpl impl;
       unsigned readers, writers;
     public:
-      explicit SmartRWMutex() : RWMutexImpl(), readers(0), writers(0) { }
+      explicit SmartRWMutex() : impl(), readers(0), writers(0) { }
 
-      bool reader_acquire() {
+      bool lock_shared() {
         if (!mt_only || llvm_is_multithreaded())
-          return RWMutexImpl::reader_acquire();
+          return impl.reader_acquire();
 
         // Single-threaded debugging code.  This would be racy in multithreaded
         // mode, but provides not sanity checks in single threaded mode.
@@ -100,9 +101,9 @@
         return true;
       }
 
-      bool reader_release() {
+      bool unlock_shared() {
         if (!mt_only || llvm_is_multithreaded())
-          return RWMutexImpl::reader_release();
+          return impl.reader_release();
 
         // Single-threaded debugging code.  This would be racy in multithreaded
         // mode, but provides not sanity checks in single threaded mode.
@@ -111,9 +112,9 @@
         return true;
       }
 
-      bool writer_acquire() {
+      bool lock() {
         if (!mt_only || llvm_is_multithreaded())
-          return RWMutexImpl::writer_acquire();
+          return impl.writer_acquire();
 
         // Single-threaded debugging code.  This would be racy in multithreaded
         // mode, but provides not sanity checks in single threaded mode.
@@ -122,9 +123,9 @@
         return true;
       }
 
-      bool writer_release() {
+      bool unlock() {
         if (!mt_only || llvm_is_multithreaded())
-          return RWMutexImpl::writer_release();
+          return impl.writer_release();
 
         // Single-threaded debugging code.  This would be racy in multithreaded
         // mode, but provides not sanity checks in single threaded mode.
@@ -145,11 +146,11 @@
       SmartRWMutex<mt_only>& mutex;
 
       explicit SmartScopedReader(SmartRWMutex<mt_only>& m) : mutex(m) {
-        mutex.reader_acquire();
+        mutex.lock_shared();
       }
 
       ~SmartScopedReader() {
-        mutex.reader_release();
+        mutex.unlock_shared();
       }
     };
     typedef SmartScopedReader<false> ScopedReader;
@@ -160,11 +161,11 @@
       SmartRWMutex<mt_only>& mutex;
 
       explicit SmartScopedWriter(SmartRWMutex<mt_only>& m) : mutex(m) {
-        mutex.writer_acquire();
+        mutex.lock();
       }
 
       ~SmartScopedWriter() {
-        mutex.writer_release();
+        mutex.unlock();
       }
     };
     typedef SmartScopedWriter<false> ScopedWriter;

diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 4717553..f9e114b 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h

@@ -19,11 +19,11 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SMLoc.h"
 #include <string>
 
 namespace llvm {
-  class MemoryBuffer;
   class SourceMgr;
   class SMDiagnostic;
   class SMFixIt;
@@ -47,10 +47,15 @@
 private:
   struct SrcBuffer {
     /// The memory buffer for the file.
-    MemoryBuffer *Buffer;
+    std::unique_ptr<MemoryBuffer> Buffer;
 
     /// This is the location of the parent include, or null if at the top level.
     SMLoc IncludeLoc;
+
+    SrcBuffer() {}
+
+    SrcBuffer(SrcBuffer &&O)
+        : Buffer(std::move(O.Buffer)), IncludeLoc(O.IncludeLoc) {}
   };
 
   /// This is all of the buffers that we are reading from.
@@ -96,7 +101,7 @@
 
   const MemoryBuffer *getMemoryBuffer(unsigned i) const {
     assert(isValidBufferID(i));
-    return Buffers[i - 1].Buffer;
+    return Buffers[i - 1].Buffer.get();
   }
 
   unsigned getNumBuffers() const {
@@ -115,11 +120,12 @@
 
   /// Add a new source buffer to this source manager. This takes ownership of
   /// the memory buffer.
-  unsigned AddNewSourceBuffer(MemoryBuffer *F, SMLoc IncludeLoc) {
+  unsigned AddNewSourceBuffer(std::unique_ptr<MemoryBuffer> F,
+                              SMLoc IncludeLoc) {
     SrcBuffer NB;
-    NB.Buffer = F;
+    NB.Buffer = std::move(F);
     NB.IncludeLoc = IncludeLoc;
-    Buffers.push_back(NB);
+    Buffers.push_back(std::move(NB));
     return Buffers.size();
   }
 

diff --git a/include/llvm/Support/SpecialCaseList.h b/include/llvm/Support/SpecialCaseList.h
index 098b9c7..313212e 100644
--- a/include/llvm/Support/SpecialCaseList.h
+++ b/include/llvm/Support/SpecialCaseList.h

@@ -56,17 +56,19 @@
 class StringRef;
 
 class SpecialCaseList {
- public:
+public:
   /// Parses the special case list from a file. If Path is empty, returns
   /// an empty special case list. On failure, returns 0 and writes an error
   /// message to string.
-  static SpecialCaseList *create(const StringRef Path, std::string &Error);
+  static std::unique_ptr<SpecialCaseList> create(StringRef Path,
+                                                  std::string &Error);
   /// Parses the special case list from a memory buffer. On failure, returns
   /// 0 and writes an error message to string.
-  static SpecialCaseList *create(const MemoryBuffer *MB, std::string &Error);
+  static std::unique_ptr<SpecialCaseList> create(const MemoryBuffer *MB,
+                                                  std::string &Error);
   /// Parses the special case list from a file. On failure, reports a fatal
   /// error.
-  static SpecialCaseList *createOrDie(const StringRef Path);
+  static std::unique_ptr<SpecialCaseList> createOrDie(StringRef Path);
 
   ~SpecialCaseList();
 
@@ -75,10 +77,10 @@
   ///   @Section:<E>=@Category
   /// \endcode
   /// and @Query satisfies a wildcard expression <E>.
-  bool inSection(const StringRef Section, const StringRef Query,
-                 const StringRef Category = StringRef()) const;
+  bool inSection(StringRef Section, StringRef Query,
+                 StringRef Category = StringRef()) const;
 
- private:
+private:
   SpecialCaseList(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
   SpecialCaseList &operator=(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
 

diff --git a/include/llvm/Support/StreamableMemoryObject.h b/include/llvm/Support/StreamableMemoryObject.h
deleted file mode 100644
index 6e71ad4..0000000
--- a/include/llvm/Support/StreamableMemoryObject.h
+++ /dev/null

@@ -1,178 +0,0 @@
-//===- StreamableMemoryObject.h - Streamable data interface -----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef LLVM_SUPPORT_STREAMABLEMEMORYOBJECT_H
-#define LLVM_SUPPORT_STREAMABLEMEMORYOBJECT_H
-
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataStream.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MemoryObject.h"
-#include <cassert>
-#include <memory>
-#include <vector>
-
-namespace llvm {
-
-/// StreamableMemoryObject - Interface to data which might be streamed.
-/// Streamability has 2 important implications/restrictions. First, the data
-/// might not yet exist in memory when the request is made. This just means
-/// that readByte/readBytes might have to block or do some work to get it.
-/// More significantly, the exact size of the object might not be known until
-/// it has all been fetched. This means that to return the right result,
-/// getExtent must also wait for all the data to arrive; therefore it should
-/// not be called on objects which are actually streamed (this would defeat
-/// the purpose of streaming). Instead, isValidAddress and isObjectEnd can be
-/// used to test addresses without knowing the exact size of the stream.
-/// Finally, getPointer can be used instead of readBytes to avoid extra copying.
-class StreamableMemoryObject : public MemoryObject {
- public:
-  /// Destructor      - Override as necessary.
-  virtual ~StreamableMemoryObject();
-
-  /// getBase         - Returns the lowest valid address in the region.
-  ///
-  /// @result         - The lowest valid address.
-  uint64_t getBase() const override = 0;
-
-  /// getExtent       - Returns the size of the region in bytes.  (The region is
-  ///                   contiguous, so the highest valid address of the region
-  ///                   is getBase() + getExtent() - 1).
-  ///                   May block until all bytes in the stream have been read
-  ///
-  /// @result         - The size of the region.
-  uint64_t getExtent() const override = 0;
-
-  /// readByte        - Tries to read a single byte from the region.
-  ///                   May block until (address - base) bytes have been read
-  /// @param address  - The address of the byte, in the same space as getBase().
-  /// @param ptr      - A pointer to a byte to be filled in.  Must be non-NULL.
-  /// @result         - 0 if successful; -1 if not.  Failure may be due to a
-  ///                   bounds violation or an implementation-specific error.
-  int readByte(uint64_t address, uint8_t *ptr) const override = 0;
-
-  /// readBytes       - Tries to read a contiguous range of bytes from the
-  ///                   region, up to the end of the region.
-  ///                   May block until (address - base + size) bytes have
-  ///                   been read. Additionally, StreamableMemoryObjects will
-  ///                   not do partial reads - if size bytes cannot be read,
-  ///                   readBytes will fail.
-  ///
-  /// @param address  - The address of the first byte, in the same space as
-  ///                   getBase().
-  /// @param size     - The number of bytes to copy.
-  /// @param buf      - A pointer to a buffer to be filled in.  Must be non-NULL
-  ///                   and large enough to hold size bytes.
-  /// @result         - 0 if successful; -1 if not.  Failure may be due to a
-  ///                   bounds violation or an implementation-specific error.
-  int readBytes(uint64_t address, uint64_t size,
-                uint8_t *buf) const override = 0;
-
-  /// getPointer  - Ensures that the requested data is in memory, and returns
-  ///               A pointer to it. More efficient than using readBytes if the
-  ///               data is already in memory.
-  ///               May block until (address - base + size) bytes have been read
-  /// @param address - address of the byte, in the same space as getBase()
-  /// @param size    - amount of data that must be available on return
-  /// @result        - valid pointer to the requested data
-  virtual const uint8_t *getPointer(uint64_t address, uint64_t size) const = 0;
-
-  /// isValidAddress - Returns true if the address is within the object
-  ///                  (i.e. between base and base + extent - 1 inclusive)
-  ///                  May block until (address - base) bytes have been read
-  /// @param address - address of the byte, in the same space as getBase()
-  /// @result        - true if the address may be read with readByte()
-  virtual bool isValidAddress(uint64_t address) const = 0;
-
-  /// isObjectEnd    - Returns true if the address is one past the end of the
-  ///                  object (i.e. if it is equal to base + extent)
-  ///                  May block until (address - base) bytes have been read
-  /// @param address - address of the byte, in the same space as getBase()
-  /// @result        - true if the address is equal to base + extent
-  virtual bool isObjectEnd(uint64_t address) const = 0;
-};
-
-/// StreamingMemoryObject - interface to data which is actually streamed from
-/// a DataStreamer. In addition to inherited members, it has the
-/// dropLeadingBytes and setKnownObjectSize methods which are not applicable
-/// to non-streamed objects.
-class StreamingMemoryObject : public StreamableMemoryObject {
-public:
-  StreamingMemoryObject(DataStreamer *streamer);
-  uint64_t getBase() const override { return 0; }
-  uint64_t getExtent() const override;
-  int readByte(uint64_t address, uint8_t *ptr) const override;
-  int readBytes(uint64_t address, uint64_t size,
-                uint8_t *buf) const override;
-  const uint8_t *getPointer(uint64_t address, uint64_t size) const override {
-    // This could be fixed by ensuring the bytes are fetched and making a copy,
-    // requiring that the bitcode size be known, or otherwise ensuring that
-    // the memory doesn't go away/get reallocated, but it's
-    // not currently necessary. Users that need the pointer don't stream.
-    llvm_unreachable("getPointer in streaming memory objects not allowed");
-    return nullptr;
-  }
-  bool isValidAddress(uint64_t address) const override;
-  bool isObjectEnd(uint64_t address) const override;
-
-  /// Drop s bytes from the front of the stream, pushing the positions of the
-  /// remaining bytes down by s. This is used to skip past the bitcode header,
-  /// since we don't know a priori if it's present, and we can't put bytes
-  /// back into the stream once we've read them.
-  bool dropLeadingBytes(size_t s);
-
-  /// If the data object size is known in advance, many of the operations can
-  /// be made more efficient, so this method should be called before reading
-  /// starts (although it can be called anytime).
-  void setKnownObjectSize(size_t size);
-
-private:
-  const static uint32_t kChunkSize = 4096 * 4;
-  mutable std::vector<unsigned char> Bytes;
-  std::unique_ptr<DataStreamer> Streamer;
-  mutable size_t BytesRead;   // Bytes read from stream
-  size_t BytesSkipped;// Bytes skipped at start of stream (e.g. wrapper/header)
-  mutable size_t ObjectSize; // 0 if unknown, set if wrapper seen or EOF reached
-  mutable bool EOFReached;
-
-  // Fetch enough bytes such that Pos can be read or EOF is reached
-  // (i.e. BytesRead > Pos). Return true if Pos can be read.
-  // Unlike most of the functions in BitcodeReader, returns true on success.
-  // Most of the requests will be small, but we fetch at kChunkSize bytes
-  // at a time to avoid making too many potentially expensive GetBytes calls
-  bool fetchToPos(size_t Pos) const {
-    if (EOFReached) return Pos < ObjectSize;
-    while (Pos >= BytesRead) {
-      Bytes.resize(BytesRead + BytesSkipped + kChunkSize);
-      size_t bytes = Streamer->GetBytes(&Bytes[BytesRead + BytesSkipped],
-                                        kChunkSize);
-      BytesRead += bytes;
-      if (bytes < kChunkSize) {
-        assert((!ObjectSize || BytesRead >= Pos) &&
-               "Unexpected short read fetching bitcode");
-        if (BytesRead <= Pos) { // reached EOF/ran out of bytes
-          ObjectSize = BytesRead;
-          EOFReached = true;
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  StreamingMemoryObject(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
-  void operator=(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
-};
-
-StreamableMemoryObject *getNonStreamedMemoryObject(
-    const unsigned char *Start, const unsigned char *End);
-
-}
-#endif  // STREAMABLEMEMORYOBJECT_H_

diff --git a/include/llvm/Support/StreamingMemoryObject.h b/include/llvm/Support/StreamingMemoryObject.h
new file mode 100644
index 0000000..6957c6e
--- /dev/null
+++ b/include/llvm/Support/StreamingMemoryObject.h

@@ -0,0 +1,91 @@
+//===- StreamingMemoryObject.h - Streamable data interface -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_STREAMINGMEMORYOBJECT_H
+#define LLVM_SUPPORT_STREAMINGMEMORYOBJECT_H
+
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataStream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryObject.h"
+#include <cassert>
+#include <memory>
+#include <vector>
+
+namespace llvm {
+
+/// Interface to data which is actually streamed from a DataStreamer. In
+/// addition to inherited members, it has the dropLeadingBytes and
+/// setKnownObjectSize methods which are not applicable to non-streamed objects.
+class StreamingMemoryObject : public MemoryObject {
+public:
+  StreamingMemoryObject(DataStreamer *streamer);
+  uint64_t getExtent() const override;
+  uint64_t readBytes(uint8_t *Buf, uint64_t Size,
+                     uint64_t Address) const override;
+  const uint8_t *getPointer(uint64_t address, uint64_t size) const override {
+    // This could be fixed by ensuring the bytes are fetched and making a copy,
+    // requiring that the bitcode size be known, or otherwise ensuring that
+    // the memory doesn't go away/get reallocated, but it's
+    // not currently necessary. Users that need the pointer don't stream.
+    llvm_unreachable("getPointer in streaming memory objects not allowed");
+    return nullptr;
+  }
+  bool isValidAddress(uint64_t address) const override;
+
+  /// Drop s bytes from the front of the stream, pushing the positions of the
+  /// remaining bytes down by s. This is used to skip past the bitcode header,
+  /// since we don't know a priori if it's present, and we can't put bytes
+  /// back into the stream once we've read them.
+  bool dropLeadingBytes(size_t s);
+
+  /// If the data object size is known in advance, many of the operations can
+  /// be made more efficient, so this method should be called before reading
+  /// starts (although it can be called anytime).
+  void setKnownObjectSize(size_t size);
+
+private:
+  const static uint32_t kChunkSize = 4096 * 4;
+  mutable std::vector<unsigned char> Bytes;
+  std::unique_ptr<DataStreamer> Streamer;
+  mutable size_t BytesRead;   // Bytes read from stream
+  size_t BytesSkipped;// Bytes skipped at start of stream (e.g. wrapper/header)
+  mutable size_t ObjectSize; // 0 if unknown, set if wrapper seen or EOF reached
+  mutable bool EOFReached;
+
+  // Fetch enough bytes such that Pos can be read or EOF is reached
+  // (i.e. BytesRead > Pos). Return true if Pos can be read.
+  // Unlike most of the functions in BitcodeReader, returns true on success.
+  // Most of the requests will be small, but we fetch at kChunkSize bytes
+  // at a time to avoid making too many potentially expensive GetBytes calls
+  bool fetchToPos(size_t Pos) const {
+    if (EOFReached) return Pos < ObjectSize;
+    while (Pos >= BytesRead) {
+      Bytes.resize(BytesRead + BytesSkipped + kChunkSize);
+      size_t bytes = Streamer->GetBytes(&Bytes[BytesRead + BytesSkipped],
+                                        kChunkSize);
+      BytesRead += bytes;
+      if (BytesRead <= Pos) { // reached EOF/ran out of bytes
+        ObjectSize = BytesRead;
+        EOFReached = true;
+        return false;
+      }
+    }
+    return true;
+  }
+
+  StreamingMemoryObject(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
+  void operator=(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
+};
+
+MemoryObject *getNonStreamedMemoryObject(
+    const unsigned char *Start, const unsigned char *End);
+
+}
+#endif  // LLVM_SUPPORT_STREAMINGMEMORYOBJECT_H

diff --git a/include/llvm/Support/StringRefMemoryObject.h b/include/llvm/Support/StringRefMemoryObject.h
deleted file mode 100644
index 8a349ea..0000000
--- a/include/llvm/Support/StringRefMemoryObject.h
+++ /dev/null

@@ -1,41 +0,0 @@
-//===- llvm/Support/StringRefMemoryObject.h ---------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the StringRefMemObject class, a simple
-// wrapper around StringRef implementing the MemoryObject interface.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_STRINGREFMEMORYOBJECT_H
-#define LLVM_SUPPORT_STRINGREFMEMORYOBJECT_H
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/MemoryObject.h"
-
-namespace llvm {
-
-/// StringRefMemoryObject - Simple StringRef-backed MemoryObject
-class StringRefMemoryObject : public MemoryObject {
-  StringRef Bytes;
-  uint64_t Base;
-public:
-  StringRefMemoryObject(StringRef Bytes, uint64_t Base = 0)
-    : Bytes(Bytes), Base(Base) {}
-
-  uint64_t getBase() const override { return Base; }
-  uint64_t getExtent() const override { return Bytes.size(); }
-
-  int readByte(uint64_t Addr, uint8_t *Byte) const override;
-  int readBytes(uint64_t Addr, uint64_t Size, uint8_t *Buf) const override;
-};
-
-}
-
-#endif

diff --git a/include/llvm/Support/SwapByteOrder.h b/include/llvm/Support/SwapByteOrder.h
index 340954f..9c5a3c5 100644
--- a/include/llvm/Support/SwapByteOrder.h
+++ b/include/llvm/Support/SwapByteOrder.h

@@ -15,6 +15,7 @@
 #ifndef LLVM_SUPPORT_SWAPBYTEORDER_H
 #define LLVM_SUPPORT_SWAPBYTEORDER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
 #include <cstddef>
 #include <limits>
@@ -39,8 +40,7 @@
 /// SwapByteOrder_32 - This function returns a byte-swapped representation of
 /// the 32-bit argument.
 inline uint32_t SwapByteOrder_32(uint32_t value) {
-#if defined(__llvm__) || \
-(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) && !defined(__ICC)
+#if defined(__llvm__) || (LLVM_GNUC_PREREQ(4, 3, 0) && !defined(__ICC))
   return __builtin_bswap32(value);
 #elif defined(_MSC_VER) && !defined(_DEBUG)
   return _byteswap_ulong(value);
@@ -56,8 +56,7 @@
 /// SwapByteOrder_64 - This function returns a byte-swapped representation of
 /// the 64-bit argument.
 inline uint64_t SwapByteOrder_64(uint64_t value) {
-#if defined(__llvm__) || \
-(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) && !defined(__ICC)
+#if defined(__llvm__) || (LLVM_GNUC_PREREQ(4, 3, 0) && !defined(__ICC))
   return __builtin_bswap64(value);
 #elif defined(_MSC_VER) && !defined(_DEBUG)
   return _byteswap_uint64(value);

diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index 5d5b86a..8ac4b90 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h

@@ -123,15 +123,10 @@
                                                   const MCRegisterInfo &MRI,
                                                   const MCSubtargetInfo &STI,
                                                   MCContext &Ctx);
-    typedef MCStreamer *(*MCObjectStreamerCtorTy)(const Target &T,
-                                                  StringRef TT,
-                                                  MCContext &Ctx,
-                                                  MCAsmBackend &TAB,
-                                                  raw_ostream &_OS,
-                                                  MCCodeEmitter *_Emitter,
-                                                  const MCSubtargetInfo &STI,
-                                                  bool RelaxAll,
-                                                  bool NoExecStack);
+    typedef MCStreamer *(*MCObjectStreamerCtorTy)(
+        const Target &T, StringRef TT, MCContext &Ctx, MCAsmBackend &TAB,
+        raw_ostream &_OS, MCCodeEmitter *_Emitter, const MCSubtargetInfo &STI,
+        bool RelaxAll);
     typedef MCStreamer *(*AsmStreamerCtorTy)(MCContext &Ctx,
                                              formatted_raw_ostream &OS,
                                              bool isVerboseAsm,
@@ -423,18 +418,15 @@
     /// \param _OS The stream object.
     /// \param _Emitter The target independent assembler object.Takes ownership.
     /// \param RelaxAll Relax all fixups?
-    /// \param NoExecStack Mark file as not needing a executable stack.
     MCStreamer *createMCObjectStreamer(StringRef TT, MCContext &Ctx,
-                                       MCAsmBackend &TAB,
-                                       raw_ostream &_OS,
+                                       MCAsmBackend &TAB, raw_ostream &_OS,
                                        MCCodeEmitter *_Emitter,
                                        const MCSubtargetInfo &STI,
-                                       bool RelaxAll,
-                                       bool NoExecStack) const {
+                                       bool RelaxAll) const {
       if (!MCObjectStreamerCtorFn)
         return nullptr;
       return MCObjectStreamerCtorFn(*this, TT, Ctx, TAB, _OS, _Emitter, STI,
-                                    RelaxAll, NoExecStack);
+                                    RelaxAll);
     }
 
     /// createAsmStreamer - Create a target specific MCStreamer.

diff --git a/include/llvm/Support/TimeValue.h b/include/llvm/Support/TimeValue.h
index ee0e286..6bca58b 100644
--- a/include/llvm/Support/TimeValue.h
+++ b/include/llvm/Support/TimeValue.h

@@ -38,28 +38,38 @@
     /// value permissible by the class. MinTime is some point
     /// in the distant past, about 300 billion years BCE.
     /// @brief The smallest possible time value.
-    static const TimeValue MinTime;
+    static TimeValue MinTime() {
+      return TimeValue ( INT64_MIN,0 );
+    }
 
     /// A constant TimeValue representing the largest time
     /// value permissible by the class. MaxTime is some point
     /// in the distant future, about 300 billion years AD.
     /// @brief The largest possible time value.
-    static const TimeValue MaxTime;
+    static TimeValue MaxTime() {
+      return TimeValue ( INT64_MAX,0 );
+    }
 
     /// A constant TimeValue representing the base time,
     /// or zero time of 00:00:00 (midnight) January 1st, 2000.
     /// @brief 00:00:00 Jan 1, 2000 UTC.
-    static const TimeValue ZeroTime;
+    static TimeValue ZeroTime() {
+      return TimeValue ( 0,0 );
+    }
 
     /// A constant TimeValue for the Posix base time which is
     /// 00:00:00 (midnight) January 1st, 1970.
     /// @brief 00:00:00 Jan 1, 1970 UTC.
-    static const TimeValue PosixZeroTime;
+    static TimeValue PosixZeroTime() {
+      return TimeValue ( PosixZeroTimeSeconds,0 );
+    }
 
     /// A constant TimeValue for the Win32 base time which is
     /// 00:00:00 (midnight) January 1st, 1601.
     /// @brief 00:00:00 Jan 1, 1601 UTC.
-    static const TimeValue Win32ZeroTime;
+    static TimeValue Win32ZeroTime() {
+      return TimeValue ( Win32ZeroTimeSeconds,0 );
+    }
 
   /// @}
   /// @name Types

diff --git a/include/llvm/Support/ToolOutputFile.h b/include/llvm/Support/ToolOutputFile.h
index 88f8ccc..d98e7bb 100644
--- a/include/llvm/Support/ToolOutputFile.h
+++ b/include/llvm/Support/ToolOutputFile.h

@@ -29,13 +29,13 @@
   /// destructed after the raw_fd_ostream is destructed. It installs
   /// cleanups in its constructor and uninstalls them in its destructor.
   class CleanupInstaller {
-    /// Filename - The name of the file.
+    /// The name of the file.
     std::string Filename;
   public:
-    /// Keep - The flag which indicates whether we should not delete the file.
+    /// The flag which indicates whether we should not delete the file.
     bool Keep;
 
-    explicit CleanupInstaller(const char *filename);
+    explicit CleanupInstaller(StringRef ilename);
     ~CleanupInstaller();
   } Installer;
 
@@ -44,12 +44,12 @@
   raw_fd_ostream OS;
 
 public:
-  /// tool_output_file - This constructor's arguments are passed to
-  /// to raw_fd_ostream's constructor.
-  tool_output_file(const char *filename, std::string &ErrorInfo,
+  /// This constructor's arguments are passed to raw_fd_ostream's
+  /// constructor.
+  tool_output_file(StringRef Filename, std::error_code &EC,
                    sys::fs::OpenFlags Flags);
 
-  tool_output_file(const char *Filename, int FD);
+  tool_output_file(StringRef Filename, int FD);
 
   /// os - Return the contained raw_fd_ostream.
   raw_fd_ostream &os() { return OS; }

diff --git a/include/llvm/Support/UniqueLock.h b/include/llvm/Support/UniqueLock.h
new file mode 100644
index 0000000..5a4c273
--- /dev/null
+++ b/include/llvm/Support/UniqueLock.h

@@ -0,0 +1,67 @@
+//===-- Support/UniqueLock.h - Acquire/Release Mutex In Scope ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a guard for a block of code that ensures a Mutex is locked
+// upon construction and released upon destruction.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_UNIQUE_LOCK_H
+#define LLVM_SUPPORT_UNIQUE_LOCK_H
+
+#include "llvm/Support/Mutex.h"
+
+namespace llvm {
+  /// A pared-down imitation of std::unique_lock from C++11. Contrary to the
+  /// name, it's really more of a wrapper for a lock. It may or may not have
+  /// an associated mutex; when constructed with one, the mutex is locked on
+  /// construction and, if still held, unlocked on destruction. unique_lock
+  /// can also unlock the mutex and re-lock it freely during its lifetime.
+  /// @brief Guard a section of code with a mutex.
+  template<typename MutexT>
+  class unique_lock {
+    MutexT *M;
+    bool locked;
+
+    unique_lock(const unique_lock &) LLVM_DELETED_FUNCTION;
+    void operator=(const unique_lock &) LLVM_DELETED_FUNCTION;
+  public:
+    unique_lock() : M(nullptr), locked(false) {}
+    explicit unique_lock(MutexT &m) : M(&m), locked(true) { M->lock(); }
+
+    void operator=(unique_lock &&o) {
+      if (owns_lock())
+        M->unlock();
+      M = o.M;
+      locked = o.locked;
+      o.M = nullptr;
+      o.locked = false;
+    }
+
+    ~unique_lock() { if (owns_lock()) M->unlock(); }
+
+    void lock() {
+      assert(!locked && "mutex already locked!");
+      assert(M && "no associated mutex!");
+      M->lock();
+      locked = true;
+    }
+
+    void unlock() {
+      assert(locked && "unlocking a mutex that isn't locked!");
+      assert(M && "no associated mutex!");
+      M->unlock();
+      locked = false;
+    }
+
+    bool owns_lock() { return locked; }
+  };
+}
+
+#endif // LLVM_SUPPORT_UNIQUE_LOCK_H

diff --git a/include/llvm/Support/Win64EH.h b/include/llvm/Support/Win64EH.h
index 7ca218e..f6c4927 100644
--- a/include/llvm/Support/Win64EH.h
+++ b/include/llvm/Support/Win64EH.h

@@ -40,8 +40,8 @@
 /// or part thereof.
 union UnwindCode {
   struct {
-    support::ulittle8_t CodeOffset;
-    support::ulittle8_t UnwindOpAndOpInfo;
+    uint8_t CodeOffset;
+    uint8_t UnwindOpAndOpInfo;
   } u;
   support::ulittle16_t FrameOffset;
 
@@ -74,10 +74,10 @@
 
 /// UnwindInfo - An entry in the exception table.
 struct UnwindInfo {
-  support::ulittle8_t VersionAndFlags;
-  support::ulittle8_t PrologSize;
-  support::ulittle8_t NumCodes;
-  support::ulittle8_t FrameRegisterAndOffset;
+  uint8_t VersionAndFlags;
+  uint8_t PrologSize;
+  uint8_t NumCodes;
+  uint8_t FrameRegisterAndOffset;
   UnwindCode UnwindCodes[1];
 
   uint8_t getVersion() const {

diff --git a/include/llvm/Support/WindowsError.h b/include/llvm/Support/WindowsError.h
index 0e909a0..63bfe59 100644
--- a/include/llvm/Support/WindowsError.h
+++ b/include/llvm/Support/WindowsError.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_WINDOWS_ERROR_H
-#define LLVM_SUPPORT_WINDOWS_ERROR_H
+#ifndef LLVM_SUPPORT_WINDOWSERROR_H
+#define LLVM_SUPPORT_WINDOWSERROR_H
 
 #include <system_error>
 

diff --git a/include/llvm/Support/YAMLParser.h b/include/llvm/Support/YAMLParser.h
index c39874c..de6e654 100644
--- a/include/llvm/Support/YAMLParser.h
+++ b/include/llvm/Support/YAMLParser.h

@@ -41,13 +41,13 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SMLoc.h"
 #include <limits>
 #include <map>
 #include <utility>
 
 namespace llvm {
-class MemoryBuffer;
 class SourceMgr;
 class raw_ostream;
 class Twine;
@@ -79,8 +79,7 @@
   /// \brief This keeps a reference to the string referenced by \p Input.
   Stream(StringRef Input, SourceMgr &);
 
-  /// \brief This takes ownership of \p InputBuffer.
-  Stream(MemoryBuffer *InputBuffer, SourceMgr &);
+  Stream(MemoryBufferRef InputBuffer, SourceMgr &);
   ~Stream();
 
   document_iterator begin();

diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index a23faf6..023dcee7 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h

@@ -943,16 +943,17 @@
   };
 
   class MapHNode : public HNode {
+    virtual void anchor();
+
   public:
     MapHNode(Node *n) : HNode(n) { }
-    virtual ~MapHNode();
 
     static inline bool classof(const HNode *n) {
       return MappingNode::classof(n->_node);
     }
     static inline bool classof(const MapHNode *) { return true; }
 
-    typedef llvm::StringMap<HNode*> NameToNode;
+    typedef llvm::StringMap<std::unique_ptr<HNode>> NameToNode;
 
     bool isValidKey(StringRef key);
 
@@ -961,19 +962,20 @@
   };
 
   class SequenceHNode : public HNode {
+    virtual void anchor();
+
   public:
     SequenceHNode(Node *n) : HNode(n) { }
-    virtual ~SequenceHNode();
 
     static inline bool classof(const HNode *n) {
       return SequenceNode::classof(n->_node);
     }
     static inline bool classof(const SequenceHNode *) { return true; }
 
-    std::vector<HNode*> Entries;
+    std::vector<std::unique_ptr<HNode>> Entries;
   };
 
-  Input::HNode *createHNodes(Node *node);
+  std::unique_ptr<Input::HNode> createHNodes(Node *node);
   void setError(HNode *hnode, const Twine &message);
   void setError(Node *node, const Twine &message);
 

diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h
index 34fbe08..c9ef637 100644
--- a/include/llvm/Support/raw_ostream.h
+++ b/include/llvm/Support/raw_ostream.h

@@ -17,9 +17,12 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
+#include <system_error>
 
 namespace llvm {
   class format_object_base;
+  class FormattedString;
+  class FormattedNumber;
   template <typename T>
   class SmallVectorImpl;
 
@@ -210,6 +213,12 @@
   // Formatted output, see the format() function in Support/Format.h.
   raw_ostream &operator<<(const format_object_base &Fmt);
 
+  // Formatted output, see the leftJustify() function in Support/Format.h.
+  raw_ostream &operator<<(const FormattedString &);
+  
+  // Formatted output, see the formatHex() function in Support/Format.h.
+  raw_ostream &operator<<(const FormattedNumber &);
+  
   /// indent - Insert 'NumSpaces' spaces.
   raw_ostream &indent(unsigned NumSpaces);
 
@@ -341,17 +350,17 @@
   void error_detected() { Error = true; }
 
 public:
-  /// raw_fd_ostream - Open the specified file for writing. If an error occurs,
-  /// information about the error is put into ErrorInfo, and the stream should
-  /// be immediately destroyed; the string will be empty if no error occurred.
-  /// This allows optional flags to control how the file will be opened.
+  /// Open the specified file for writing. If an error occurs, information
+  /// about the error is put into EC, and the stream should be immediately
+  /// destroyed.
+  /// \p Flags allows optional flags to control how the file will be opened.
   ///
   /// As a special case, if Filename is "-", then the stream will use
   /// STDOUT_FILENO instead of opening a file. Note that it will still consider
   /// itself to own the file descriptor. In particular, it will close the
   /// file descriptor when it is done (this is necessary to detect
   /// output errors).
-  raw_fd_ostream(const char *Filename, std::string &ErrorInfo,
+  raw_fd_ostream(StringRef Filename, std::error_code &EC,
                  sys::fs::OpenFlags Flags);
 
   /// raw_fd_ostream ctor - FD is the file descriptor that this writes to.  If

diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index 36464d7..8c5452e 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h

@@ -432,8 +432,8 @@
   /// readability for really no benefit.
   enum InitKind {
     IK_BitInit,
-    IK_BitsInit,
     IK_FirstTypedInit,
+    IK_BitsInit,
     IK_DagInit,
     IK_DefInit,
     IK_FieldInit,
@@ -651,11 +651,12 @@
 /// BitsInit - { a, b, c } - Represents an initializer for a BitsRecTy value.
 /// It contains a vector of bits, whose size is determined by the type.
 ///
-class BitsInit : public Init, public FoldingSetNode {
+class BitsInit : public TypedInit, public FoldingSetNode {
   std::vector<Init*> Bits;
 
   BitsInit(ArrayRef<Init *> Range)
-    : Init(IK_BitsInit), Bits(Range.begin(), Range.end()) {}
+    : TypedInit(IK_BitsInit, BitsRecTy::get(Range.size())),
+      Bits(Range.begin(), Range.end()) {}
 
   BitsInit(const BitsInit &Other) LLVM_DELETED_FUNCTION;
   BitsInit &operator=(const BitsInit &Other) LLVM_DELETED_FUNCTION;
@@ -688,6 +689,14 @@
   }
   std::string getAsString() const override;
 
+  /// resolveListElementReference - This method is used to implement
+  /// VarListElementInit::resolveReferences.  If the list element is resolvable
+  /// now, we return the resolved value, otherwise we return null.
+  Init *resolveListElementReference(Record &R, const RecordVal *RV,
+                                    unsigned Elt) const override {
+    llvm_unreachable("Illegal element reference off bits<n>");
+  }
+
   Init *resolveReferences(Record &R, const RecordVal *RV) const override;
 
   Init *getBit(unsigned Bit) const override {
@@ -928,7 +937,7 @@
 ///
 class BinOpInit : public OpInit {
 public:
-  enum BinaryOp { ADD, SHL, SRA, SRL, LISTCONCAT, STRCONCAT, CONCAT, EQ };
+  enum BinaryOp { ADD, AND, SHL, SRA, SRL, LISTCONCAT, STRCONCAT, CONCAT, EQ };
 
 private:
   BinaryOp Opc;
@@ -1392,6 +1401,18 @@
   DefInit *TheInit;
   bool IsAnonymous;
 
+  // Class-instance values can be used by other defs.  For example, Struct<i>
+  // is used here as a template argument to another class:
+  //
+  //   multiclass MultiClass<int i> {
+  //     def Def : Class<Struct<i>>;
+  //
+  // These need to get fully resolved before instantiating any other
+  // definitions that use them (e.g. Def).  However, inside a multiclass they
+  // can't be immediately resolved so we mark them ResolveFirst to fully
+  // resolve them later as soon as the multiclass is instantiated.
+  bool ResolveFirst;
+
   void init();
   void checkName();
 
@@ -1400,13 +1421,15 @@
   explicit Record(const std::string &N, ArrayRef<SMLoc> locs,
                   RecordKeeper &records, bool Anonymous = false) :
     ID(LastID++), Name(StringInit::get(N)), Locs(locs.begin(), locs.end()),
-    TrackedRecords(records), TheInit(nullptr), IsAnonymous(Anonymous) {
+    TrackedRecords(records), TheInit(nullptr), IsAnonymous(Anonymous),
+    ResolveFirst(false) {
     init();
   }
   explicit Record(Init *N, ArrayRef<SMLoc> locs, RecordKeeper &records,
                   bool Anonymous = false) :
     ID(LastID++), Name(N), Locs(locs.begin(), locs.end()),
-    TrackedRecords(records), TheInit(nullptr), IsAnonymous(Anonymous) {
+    TrackedRecords(records), TheInit(nullptr), IsAnonymous(Anonymous),
+    ResolveFirst(false) {
     init();
   }
 
@@ -1416,7 +1439,8 @@
     ID(LastID++), Name(O.Name), Locs(O.Locs), TemplateArgs(O.TemplateArgs),
     Values(O.Values), SuperClasses(O.SuperClasses),
     SuperClassRanges(O.SuperClassRanges), TrackedRecords(O.TrackedRecords),
-    TheInit(O.TheInit), IsAnonymous(O.IsAnonymous) { }
+    TheInit(O.TheInit), IsAnonymous(O.IsAnonymous),
+    ResolveFirst(O.ResolveFirst) { }
 
   ~Record() {}
 
@@ -1544,6 +1568,14 @@
     return IsAnonymous;
   }
 
+  bool isResolveFirst() const {
+    return ResolveFirst;
+  }
+
+  void setResolveFirst(bool b) {
+    ResolveFirst = b;
+  }
+
   void dump() const;
 
   //===--------------------------------------------------------------------===//
@@ -1641,53 +1673,36 @@
 };
 
 class RecordKeeper {
-  std::map<std::string, Record*> Classes, Defs;
+  typedef std::map<std::string, std::unique_ptr<Record>> RecordMap;
+  RecordMap Classes, Defs;
 
 public:
-  ~RecordKeeper() {
-    for (std::map<std::string, Record*>::iterator I = Classes.begin(),
-           E = Classes.end(); I != E; ++I)
-      delete I->second;
-    for (std::map<std::string, Record*>::iterator I = Defs.begin(),
-           E = Defs.end(); I != E; ++I)
-      delete I->second;
-  }
-
-  const std::map<std::string, Record*> &getClasses() const { return Classes; }
-  const std::map<std::string, Record*> &getDefs() const { return Defs; }
+  const RecordMap &getClasses() const { return Classes; }
+  const RecordMap &getDefs() const { return Defs; }
 
   Record *getClass(const std::string &Name) const {
-    std::map<std::string, Record*>::const_iterator I = Classes.find(Name);
-    return I == Classes.end() ? nullptr : I->second;
+    auto I = Classes.find(Name);
+    return I == Classes.end() ? nullptr : I->second.get();
   }
   Record *getDef(const std::string &Name) const {
-    std::map<std::string, Record*>::const_iterator I = Defs.find(Name);
-    return I == Defs.end() ? nullptr : I->second;
+    auto I = Defs.find(Name);
+    return I == Defs.end() ? nullptr : I->second.get();
   }
-  void addClass(Record *R) {
-    bool Ins = Classes.insert(std::make_pair(R->getName(), R)).second;
+  void addClass(Record *_R) {
+    std::unique_ptr<Record> R(_R);
+    bool Ins = Classes.insert(std::make_pair(R->getName(),
+                                             std::move(R))).second;
     (void)Ins;
     assert(Ins && "Class already exists");
   }
-  void addDef(Record *R) {
-    bool Ins = Defs.insert(std::make_pair(R->getName(), R)).second;
+  void addDef(Record *_R) {
+    std::unique_ptr<Record> R(_R);
+    bool Ins = Defs.insert(std::make_pair(R->getName(),
+                                          std::move(R))).second;
     (void)Ins;
     assert(Ins && "Record already exists");
   }
 
-  /// removeClass - Remove, but do not delete, the specified record.
-  ///
-  void removeClass(const std::string &Name) {
-    assert(Classes.count(Name) && "Class does not exist!");
-    Classes.erase(Name);
-  }
-  /// removeDef - Remove, but do not delete, the specified record.
-  ///
-  void removeDef(const std::string &Name) {
-    assert(Defs.count(Name) && "Def does not exist!");
-    Defs.erase(Name);
-  }
-
   //===--------------------------------------------------------------------===//
   // High-level helper methods, useful for tablegen backends...
 

diff --git a/include/llvm/TableGen/SetTheory.h b/include/llvm/TableGen/SetTheory.h
index 5baed79..76e56ec 100644
--- a/include/llvm/TableGen/SetTheory.h
+++ b/include/llvm/TableGen/SetTheory.h

@@ -44,8 +44,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SETTHEORY_H
-#define SETTHEORY_H
+#ifndef LLVM_TABLEGEN_SETTHEORY_H
+#define LLVM_TABLEGEN_SETTHEORY_H
 
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringMap.h"

diff --git a/include/llvm/TableGen/StringToOffsetTable.h b/include/llvm/TableGen/StringToOffsetTable.h
index c924bd8..e327703 100644
--- a/include/llvm/TableGen/StringToOffsetTable.h
+++ b/include/llvm/TableGen/StringToOffsetTable.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TBLGEN_STRING_TO_OFFSET_TABLE_H
-#define TBLGEN_STRING_TO_OFFSET_TABLE_H
+#ifndef LLVM_TABLEGEN_STRINGTOOFFSETTABLE_H
+#define LLVM_TABLEGEN_STRINGTOOFFSETTABLE_H
 
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
@@ -28,16 +28,16 @@
 
 public:
   unsigned GetOrAddStringOffset(StringRef Str, bool appendZero = true) {
-    StringMapEntry<unsigned> &Entry = StringOffset.GetOrCreateValue(Str, -1U);
-    if (Entry.getValue() == -1U) {
+    auto IterBool =
+        StringOffset.insert(std::make_pair(Str, AggregateString.size()));
+    if (IterBool.second) {
       // Add the string to the aggregate if this is the first time found.
-      Entry.setValue(AggregateString.size());
       AggregateString.append(Str.begin(), Str.end());
       if (appendZero)
         AggregateString += '\0';
     }
 
-    return Entry.getValue();
+    return IterBool.first->second;
   }
 
   void EmitString(raw_ostream &O) {

diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index f77cc7a..902647e 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td

@@ -378,9 +378,18 @@
   bit isAsCheapAsAMove = 0; // As cheap (or cheaper) than a move instruction.
   bit hasExtraSrcRegAllocReq = 0; // Sources have special regalloc requirement?
   bit hasExtraDefRegAllocReq = 0; // Defs have special regalloc requirement?
+  bit isRegSequence = 0;    // Is this instruction a kind of reg sequence?
+                            // If so, make sure to override
+                            // TargetInstrInfo::getRegSequenceLikeInputs.
   bit isPseudo     = 0;     // Is this instruction a pseudo-instruction?
                             // If so, won't have encoding information for
                             // the [MC]CodeEmitter stuff.
+  bit isExtractSubreg = 0;  // Is this instruction a kind of extract subreg?
+                             // If so, make sure to override
+                             // TargetInstrInfo::getExtractSubregLikeInputs.
+  bit isInsertSubreg = 0;   // Is this instruction a kind of insert subreg?
+                            // If so, make sure to override
+                            // TargetInstrInfo::getInsertSubregLikeInputs.
 
   // Side effect flags - When set, the flags have these meanings:
   //
@@ -583,7 +592,6 @@
   string PrintMethod = "printOperand";
   string EncoderMethod = "";
   string DecoderMethod = "";
-  string AsmOperandLowerMethod = ?;
   string OperandType = "OPERAND_UNKNOWN";
   dag MIOperandInfo = (ops);
 
@@ -797,7 +805,7 @@
 }
 def REG_SEQUENCE : Instruction {
   let OutOperandList = (outs unknown:$dst);
-  let InOperandList = (ins variable_ops);
+  let InOperandList = (ins unknown:$supersrc, variable_ops);
   let AsmString = "";
   let neverHasSideEffects = 1;
   let isAsCheapAsAMove = 1;
@@ -841,6 +849,14 @@
   let mayLoad = 1;
   let usesCustomInserter = 1;
 }
+def LOAD_STACK_GUARD : Instruction {
+  let OutOperandList = (outs ptr_rc:$dst);
+  let InOperandList = (ins);
+  let mayLoad = 1;
+  bit isReMaterializable = 1;
+  let hasSideEffects = 0;
+  bit isPseudo = 1;
+}
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/include/llvm/Target/TargetCallingConv.td b/include/llvm/Target/TargetCallingConv.td
index 8f31e08..2e766c4 100644
--- a/include/llvm/Target/TargetCallingConv.td
+++ b/include/llvm/Target/TargetCallingConv.td

@@ -67,6 +67,9 @@
 /// the specified action.
 class CCIfSRet<CCAction A> : CCIf<"ArgFlags.isSRet()", A> {}
 
+/// CCIfVarArg - If the current function is vararg - apply the action
+class CCIfVarArg<CCAction A> : CCIf<"State.isVarArg()", A> {}
+
 /// CCIfNotVarArg - If the current function is not vararg - apply the action
 class CCIfNotVarArg<CCAction A> : CCIf<"!State.isVarArg()", A> {}
 
@@ -119,6 +122,12 @@
   ValueType DestTy = destTy;
 }
 
+/// CCPromoteToUpperBitsInType - If applied, this promotes the specified current
+/// value to the specified type and shifts the value into the upper bits.
+class CCPromoteToUpperBitsInType<ValueType destTy> : CCAction {
+  ValueType DestTy = destTy;
+}
+
 /// CCBitConvertToType - If applied, this bitconverts the specified current
 /// value to the specified type.
 class CCBitConvertToType<ValueType destTy> : CCAction {
@@ -141,6 +150,13 @@
 /// that the target supports.
 class CallingConv<list<CCAction> actions> {
   list<CCAction> Actions = actions;
+  bit Custom = 0;
+}
+
+/// CustomCallingConv - An instance of this is used to declare calling
+/// conventions that are implemented using a custom function of the same name.
+class CustomCallingConv : CallingConv<[]> {
+  let Custom = 1;
 }
 
 /// CalleeSavedRegs - A list of callee saved registers for a given calling

diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index a589d0e..a37a7f9 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h

@@ -15,9 +15,11 @@
 #define LLVM_TARGET_TARGETINSTRINFO_H
 
 #include "llvm/ADT/SmallSet.h"
-#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
 
 namespace llvm {
 
@@ -28,7 +30,7 @@
 class MachineRegisterInfo;
 class MDNode;
 class MCInst;
-class MCSchedModel;
+struct MCSchedModel;
 class MCSymbolRefExpr;
 class SDNode;
 class ScheduleHazardRecognizer;
@@ -38,6 +40,7 @@
 class TargetRegisterInfo;
 class BranchProbability;
 class TargetSubtargetInfo;
+class DFAPacketizer;
 
 template<class T> class SmallVectorImpl;
 
@@ -261,6 +264,85 @@
   virtual bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                                      unsigned &SrcOpIdx2) const;
 
+  /// A pair composed of a register and a sub-register index.
+  /// Used to give some type checking when modeling Reg:SubReg.
+  struct RegSubRegPair {
+    unsigned Reg;
+    unsigned SubReg;
+    RegSubRegPair(unsigned Reg = 0, unsigned SubReg = 0)
+        : Reg(Reg), SubReg(SubReg) {}
+  };
+  /// A pair composed of a pair of a register and a sub-register index,
+  /// and another sub-register index.
+  /// Used to give some type checking when modeling Reg:SubReg1, SubReg2.
+  struct RegSubRegPairAndIdx : RegSubRegPair {
+    unsigned SubIdx;
+    RegSubRegPairAndIdx(unsigned Reg = 0, unsigned SubReg = 0,
+                        unsigned SubIdx = 0)
+        : RegSubRegPair(Reg, SubReg), SubIdx(SubIdx) {}
+  };
+
+  /// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
+  /// and \p DefIdx.
+  /// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of
+  /// the list is modeled as <Reg:SubReg, SubIdx>.
+  /// E.g., REG_SEQUENCE vreg1:sub1, sub0, vreg2, sub1 would produce
+  /// two elements:
+  /// - vreg1:sub1, sub0
+  /// - vreg2<:0>, sub1
+  ///
+  /// \returns true if it is possible to build such an input sequence
+  /// with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isRegSequence() or MI.isRegSequenceLike().
+  ///
+  /// \note The generic implementation does not provide any support for
+  /// MI.isRegSequenceLike(). In other words, one has to override
+  /// getRegSequenceLikeInputs for target specific instructions.
+  bool
+  getRegSequenceInputs(const MachineInstr &MI, unsigned DefIdx,
+                       SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const;
+
+  /// Build the equivalent inputs of an EXTRACT_SUBREG for the given \p MI
+  /// and \p DefIdx.
+  /// \p [out] InputReg of the equivalent EXTRACT_SUBREG.
+  /// E.g., EXTRACT_SUBREG vreg1:sub1, sub0, sub1 would produce:
+  /// - vreg1:sub1, sub0
+  ///
+  /// \returns true if it is possible to build such an input sequence
+  /// with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isExtractSubreg() or MI.isExtractSubregLike().
+  ///
+  /// \note The generic implementation does not provide any support for
+  /// MI.isExtractSubregLike(). In other words, one has to override
+  /// getExtractSubregLikeInputs for target specific instructions.
+  bool
+  getExtractSubregInputs(const MachineInstr &MI, unsigned DefIdx,
+                         RegSubRegPairAndIdx &InputReg) const;
+
+  /// Build the equivalent inputs of an INSERT_SUBREG for the given \p MI
+  /// and \p DefIdx.
+  /// \p [out] BaseReg and \p [out] InsertedReg contain
+  /// the equivalent inputs of INSERT_SUBREG.
+  /// E.g., INSERT_SUBREG vreg0:sub0, vreg1:sub1, sub3 would produce:
+  /// - BaseReg: vreg0:sub0
+  /// - InsertedReg: vreg1:sub1, sub3
+  ///
+  /// \returns true if it is possible to build such an input sequence
+  /// with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isInsertSubreg() or MI.isInsertSubregLike().
+  ///
+  /// \note The generic implementation does not provide any support for
+  /// MI.isInsertSubregLike(). In other words, one has to override
+  /// getInsertSubregLikeInputs for target specific instructions.
+  bool
+  getInsertSubregInputs(const MachineInstr &MI, unsigned DefIdx,
+                        RegSubRegPair &BaseReg,
+                        RegSubRegPairAndIdx &InsertedReg) const;
+
+
   /// produceSameValue - Return true if two machine instructions would produce
   /// identical values. By default, this is only true when the two instructions
   /// are deemed identical except for defs. If this function is called when the
@@ -346,6 +428,26 @@
     llvm_unreachable("Target didn't implement TargetInstrInfo::getTrap!");
   }
 
+  /// getJumpInstrTableEntryBound - Get a number of bytes that suffices to hold
+  /// either the instruction returned by getUnconditionalBranch or the
+  /// instruction returned by getTrap. This only makes sense because
+  /// getUnconditionalBranch returns a single, specific instruction. This
+  /// information is needed by the jumptable construction code, since it must
+  /// decide how many bytes to use for a jumptable entry so it can generate the
+  /// right mask.
+  ///
+  /// Note that if the jumptable instruction requires alignment, then that
+  /// alignment should be factored into this required bound so that the
+  /// resulting bound gives the right alignment for the instruction.
+  virtual unsigned getJumpInstrTableEntryBound() const {
+    // This method gets called by LLVMTargetMachine always, so it can't fail
+    // just because there happens to be no implementation for this target.
+    // Any code that tries to use a jumptable annotation without defining
+    // getUnconditionalBranch on the appropriate Target will fail anyway, and
+    // the value returned here won't matter in that case.
+    return 0;
+  }
+
   /// isLegalToSplitMBBAt - Return true if it's legal to split the given basic
   /// block at the specified instruction (i.e. instruction would be the start
   /// of a new basic block).
@@ -572,6 +674,42 @@
                                   const SmallVectorImpl<unsigned> &Ops,
                                   MachineInstr* LoadMI) const;
 
+  /// hasPattern - return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in \p Root. All potential patterns are
+  /// returned in the \p Pattern vector. Patterns should be sorted in priority
+  /// order since the pattern evaluator stops checking as soon as it finds a
+  /// faster sequence.
+  /// \param Root - Instruction that could be combined with one of its operands
+  /// \param Pattern - Vector of possible combination patterns
+
+  virtual bool hasPattern(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
+    return false;
+  }
+
+  /// genAlternativeCodeSequence - when hasPattern() finds a pattern this
+  /// function generates the instructions that could replace the original code
+  /// sequence. The client has to decide whether the actual replacement is
+  /// beneficial or not.
+  /// \param Root - Instruction that could be combined with one of its operands
+  /// \param P - Combination pattern for Root
+  /// \param InsInstrs - Vector of new instructions that implement P
+  /// \param DelInstrs - Old instructions, including Root, that could be replaced
+  /// by InsInstr
+  /// \param InstrIdxForVirtReg - map of virtual register to instruction in
+  /// InsInstr that defines it
+  virtual void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+    return;
+  }
+
+  /// useMachineCombiner - return true when a target supports MachineCombiner
+  virtual bool useMachineCombiner() const { return false; }
+
 protected:
   /// foldMemoryOperandImpl - Target-dependent implementation for
   /// foldMemoryOperand. Target-independent code in foldMemoryOperand will
@@ -593,6 +731,49 @@
     return nullptr;
   }
 
+  /// \brief Target-dependent implementation of getRegSequenceInputs.
+  ///
+  /// \returns true if it is possible to build the equivalent
+  /// REG_SEQUENCE inputs with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isRegSequenceLike().
+  ///
+  /// \see TargetInstrInfo::getRegSequenceInputs.
+  virtual bool getRegSequenceLikeInputs(
+      const MachineInstr &MI, unsigned DefIdx,
+      SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
+    return false;
+  }
+
+  /// \brief Target-dependent implementation of getExtractSubregInputs.
+  ///
+  /// \returns true if it is possible to build the equivalent
+  /// EXTRACT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isExtractSubregLike().
+  ///
+  /// \see TargetInstrInfo::getExtractSubregInputs.
+  virtual bool getExtractSubregLikeInputs(
+      const MachineInstr &MI, unsigned DefIdx,
+      RegSubRegPairAndIdx &InputReg) const {
+    return false;
+  }
+
+  /// \brief Target-dependent implementation of getInsertSubregInputs.
+  ///
+  /// \returns true if it is possible to build the equivalent
+  /// INSERT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isInsertSubregLike().
+  ///
+  /// \see TargetInstrInfo::getInsertSubregInputs.
+  virtual bool
+  getInsertSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
+                            RegSubRegPair &BaseReg,
+                            RegSubRegPairAndIdx &InsertedReg) const {
+    return false;
+  }
+
 public:
   /// canFoldMemoryOperand - Returns true for the specified load / store if
   /// folding is possible.
@@ -686,10 +867,8 @@
                           MachineBasicBlock::iterator MI) const;
 
 
-  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
-  virtual void getNoopForMachoTarget(MCInst &NopInst) const {
-    // Default to just using 'nop' string.
-  }
+  /// Return the noop instruction to use for a noop.
+  virtual void getNoopForMachoTarget(MCInst &NopInst) const;
 
 
   /// isPredicated - Returns true if the instruction is already predicated.
@@ -793,6 +972,7 @@
                                     const MachineRegisterInfo *MRI) const {
     return false;
   }
+  virtual bool optimizeCondBranch(MachineInstr *MI) const { return false; }
 
   /// optimizeLoadInstr - Try to remove the load by folding it to a register
   /// operand at the use. We fold the load instructions if and only if the
@@ -871,7 +1051,7 @@
                               SDNode *Node) const;
 
   /// Return the default expected latency for a def based on it's opcode.
-  unsigned defaultDefLatency(const MCSchedModel *SchedModel,
+  unsigned defaultDefLatency(const MCSchedModel &SchedModel,
                              const MachineInstr *DefMI) const;
 
   int computeDefOperandLatency(const InstrItineraryData *ItinData,
@@ -1026,11 +1206,25 @@
                             const TargetRegisterInfo *TRI) const {}
 
   /// Create machine specific model for scheduling.
-  virtual DFAPacketizer*
-    CreateTargetScheduleState(const TargetMachine*, const ScheduleDAG*) const {
+  virtual DFAPacketizer *
+  CreateTargetScheduleState(const TargetSubtargetInfo &) const {
     return nullptr;
   }
 
+  // areMemAccessesTriviallyDisjoint - Sometimes, it is possible for the target
+  // to tell, even without aliasing information, that two MIs access different
+  // memory addresses. This function returns true if two MIs access different
+  // memory addresses, and false otherwise.
+  virtual bool
+  areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb,
+                                  AliasAnalysis *AA = nullptr) const {
+    assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+           "MIa must load from or modify a memory location");
+    assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+           "MIb must load from or modify a memory location");
+    return false;
+  }
+
 private:
   int CallFrameSetupOpcode, CallFrameDestroyOpcode;
 };

diff --git a/include/llvm/Target/TargetIntrinsicInfo.h b/include/llvm/Target/TargetIntrinsicInfo.h
index 6de264e..71c0166 100644
--- a/include/llvm/Target/TargetIntrinsicInfo.h
+++ b/include/llvm/Target/TargetIntrinsicInfo.h

@@ -52,7 +52,7 @@
 
   /// Returns true if the intrinsic can be overloaded.
   virtual bool isOverloaded(unsigned IID) const = 0;
-  
+
   /// Create or insert an LLVM Function declaration for an intrinsic,
   /// and return it. The Tys and numTys are for intrinsics with overloaded
   /// types. See above for more information.

diff --git a/include/llvm/Target/TargetJITInfo.h b/include/llvm/Target/TargetJITInfo.h
deleted file mode 100644
index f9bd0fb..0000000
--- a/include/llvm/Target/TargetJITInfo.h
+++ /dev/null

@@ -1,137 +0,0 @@
-//===- Target/TargetJITInfo.h - Target Information for JIT ------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file exposes an abstract interface used by the Just-In-Time code
-// generator to perform target-specific activities, such as emitting stubs.  If
-// a TargetMachine supports JIT code generation, it should provide one of these
-// objects through the getJITInfo() method.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_TARGETJITINFO_H
-#define LLVM_TARGET_TARGETJITINFO_H
-
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
-
-namespace llvm {
-  class Function;
-  class GlobalValue;
-  class JITCodeEmitter;
-  class MachineRelocation;
-
-  /// TargetJITInfo - Target specific information required by the Just-In-Time
-  /// code generator.
-  class TargetJITInfo {
-    virtual void anchor();
-  public:
-    virtual ~TargetJITInfo() {}
-
-    /// replaceMachineCodeForFunction - Make it so that calling the function
-    /// whose machine code is at OLD turns into a call to NEW, perhaps by
-    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
-    /// code.
-    ///
-    virtual void replaceMachineCodeForFunction(void *Old, void *New) = 0;
-
-    /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
-    /// to emit an indirect symbol which contains the address of the specified
-    /// ptr.
-    virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
-                                             JITCodeEmitter &JCE) {
-      llvm_unreachable("This target doesn't implement "
-                       "emitGlobalValueIndirectSym!");
-    }
-
-    /// Records the required size and alignment for a call stub in bytes.
-    struct StubLayout {
-      size_t Size;
-      size_t Alignment;
-    };
-    /// Returns the maximum size and alignment for a call stub on this target.
-    virtual StubLayout getStubLayout() {
-      llvm_unreachable("This target doesn't implement getStubLayout!");
-    }
-
-    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
-    /// small native function that simply calls the function at the specified
-    /// address.  The JITCodeEmitter must already have storage allocated for the
-    /// stub.  Return the address of the resultant function, which may have been
-    /// aligned from the address the JCE was set up to emit at.
-    virtual void *emitFunctionStub(const Function* F, void *Target,
-                                   JITCodeEmitter &JCE) {
-      llvm_unreachable("This target doesn't implement emitFunctionStub!");
-    }
-
-    /// getPICJumpTableEntry - Returns the value of the jumptable entry for the
-    /// specific basic block.
-    virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase) {
-      llvm_unreachable("This target doesn't implement getPICJumpTableEntry!");
-    }
-
-    /// LazyResolverFn - This typedef is used to represent the function that
-    /// unresolved call points should invoke.  This is a target specific
-    /// function that knows how to walk the stack and find out which stub the
-    /// call is coming from.
-    typedef void (*LazyResolverFn)();
-
-    /// JITCompilerFn - This typedef is used to represent the JIT function that
-    /// lazily compiles the function corresponding to a stub.  The JIT keeps
-    /// track of the mapping between stubs and LLVM Functions, the target
-    /// provides the ability to figure out the address of a stub that is called
-    /// by the LazyResolverFn.
-    typedef void* (*JITCompilerFn)(void *);
-
-    /// getLazyResolverFunction - This method is used to initialize the JIT,
-    /// giving the target the function that should be used to compile a
-    /// function, and giving the JIT the target function used to do the lazy
-    /// resolving.
-    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn) {
-      llvm_unreachable("Not implemented for this target!");
-    }
-
-    /// relocate - Before the JIT can run a block of code that has been emitted,
-    /// it must rewrite the code to contain the actual addresses of any
-    /// referenced global symbols.
-    virtual void relocate(void *Function, MachineRelocation *MR,
-                          unsigned NumRelocs, unsigned char* GOTBase) {
-      assert(NumRelocs == 0 && "This target does not have relocations!");
-    }
-    
-
-    /// allocateThreadLocalMemory - Each target has its own way of
-    /// handling thread local variables. This method returns a value only
-    /// meaningful to the target.
-    virtual char* allocateThreadLocalMemory(size_t size) {
-      llvm_unreachable("This target does not implement thread local storage!");
-    }
-
-    /// needsGOT - Allows a target to specify that it would like the
-    /// JIT to manage a GOT for it.
-    bool needsGOT() const { return useGOT; }
-
-    /// hasCustomConstantPool - Allows a target to specify that constant
-    /// pool address resolution is handled by the target.
-    virtual bool hasCustomConstantPool() const { return false; }
-
-    /// hasCustomJumpTables - Allows a target to specify that jumptables
-    /// are emitted by the target.
-    virtual bool hasCustomJumpTables() const { return false; }
-
-    /// allocateSeparateGVMemory - If true, globals should be placed in
-    /// separately allocated heap memory rather than in the same
-    /// code memory allocated by JITCodeEmitter.
-    virtual bool allocateSeparateGVMemory() const { return false; }
-  protected:
-    bool useGOT;
-  };
-} // End llvm namespace
-
-#endif

diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
index 93c9aa3..249849b 100644
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ b/include/llvm/Target/TargetLibraryInfo.h

@@ -37,10 +37,18 @@
       ZdaPv,
       /// void operator delete[](void*, nothrow);
       ZdaPvRKSt9nothrow_t,
+      /// void operator delete[](void*, unsigned int);
+      ZdaPvj,
+      /// void operator delete[](void*, unsigned long);
+      ZdaPvm,
       /// void operator delete(void*);
       ZdlPv,
       /// void operator delete(void*, nothrow);
       ZdlPvRKSt9nothrow_t,
+      /// void operator delete(void*, unsigned int);
+      ZdlPvj,
+      /// void operator delete(void*, unsigned long);
+      ZdlPvm,
       /// void *new[](unsigned int);
       Znaj,
       /// void *new[](unsigned int, nothrow);
@@ -65,7 +73,7 @@
       cxa_atexit,
       /// void __cxa_guard_abort(guard_t *guard);
       /// guard_t is int64_t in Itanium ABI or int32_t on ARM eabi.
-      cxa_guard_abort,      
+      cxa_guard_abort,
       /// int __cxa_guard_acquire(guard_t *guard);
       cxa_guard_acquire,
       /// void __cxa_guard_release(guard_t *guard);
@@ -76,6 +84,11 @@
       dunder_isoc99_sscanf,
       /// void *__memcpy_chk(void *s1, const void *s2, size_t n, size_t s1size);
       memcpy_chk,
+      /// void *__memmove_chk(void *s1, const void *s2, size_t n,
+      ///                     size_t s1size);
+      memmove_chk,
+      /// void *__memset_chk(void *s, char v, size_t n, size_t s1size);
+      memset_chk,
       /// double __sincospi_stret(double x);
       sincospi_stret,
       /// float __sincospif_stret(float x);
@@ -90,8 +103,18 @@
       sqrtf_finite,
       /// long double __sqrt_finite(long double x);
       sqrtl_finite,
+      /// char *__stpcpy_chk(char *s1, const char *s2, size_t s1size);
+      stpcpy_chk,
+      /// char *__stpncpy_chk(char *s1, const char *s2, size_t n,
+      ///                     size_t s1size);
+      stpncpy_chk,
+      /// char *__strcpy_chk(char *s1, const char *s2, size_t s1size);
+      strcpy_chk,
       /// char * __strdup(const char *s);
       dunder_strdup,
+      /// char *__strncpy_chk(char *s1, const char *s2, size_t n,
+      ///                     size_t s1size);
+      strncpy_chk,
       /// char *__strndup(const char *s, size_t n);
       dunder_strndup,
       /// char * __strtok_r(char *s, const char *delim, char **save_ptr);
@@ -707,7 +730,7 @@
   TargetLibraryInfo();
   TargetLibraryInfo(const Triple &T);
   explicit TargetLibraryInfo(const TargetLibraryInfo &TLI);
-  
+
   /// getLibFunc - Search for a particular function name.  If it is one of the
   /// known library functions, return true and set F to the corresponding value.
   bool getLibFunc(StringRef funcName, LibFunc::Func &F) const;

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 5e9978d..882dab4 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h

@@ -31,6 +31,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Target/TargetCallingConv.h"
@@ -136,10 +137,9 @@
     llvm_unreachable("Invalid content kind");
   }
 
-  /// NOTE: The constructor takes ownership of TLOF.
-  explicit TargetLoweringBase(const TargetMachine &TM,
-                              const TargetLoweringObjectFile *TLOF);
-  virtual ~TargetLoweringBase();
+  /// NOTE: The TargetMachine owns TLOF.
+  explicit TargetLoweringBase(const TargetMachine &TM);
+  virtual ~TargetLoweringBase() {}
 
 protected:
   /// \brief Initialize all of the actions to default values.
@@ -148,7 +148,9 @@
 public:
   const TargetMachine &getTargetMachine() const { return TM; }
   const DataLayout *getDataLayout() const { return DL; }
-  const TargetLoweringObjectFile &getObjFileLowering() const { return TLOF; }
+  const TargetLoweringObjectFile &getObjFileLowering() const {
+    return *TM.getObjFileLowering();
+  }
 
   bool isBigEndian() const { return !IsLittleEndian; }
   bool isLittleEndian() const { return IsLittleEndian; }
@@ -223,8 +225,8 @@
     return BypassSlowDivWidths;
   }
 
-  /// Return true if pow2 div is cheaper than a chain of srl/add/sra.
-  bool isPow2DivCheap() const { return Pow2DivIsCheap; }
+  /// Return true if pow2 sdiv is cheaper than a chain of sra/srl/add/sra.
+  bool isPow2SDivCheap() const { return Pow2SDivIsCheap; }
 
   /// Return true if Flow Control is an expensive operation that should be
   /// avoided.
@@ -262,10 +264,27 @@
     return MaskAndBranchFoldingIsLegal;
   }
 
-  /// Return the ValueType of the result of SETCC operations.  Also used to
-  /// obtain the target's preferred type for the condition operand of SELECT and
-  /// BRCOND nodes.  In the case of BRCOND the argument passed is MVT::Other
-  /// since there are no other operands to get a type hint from.
+  /// Return true if the target can combine store(extractelement VectorTy,
+  /// Idx).
+  /// \p Cost[out] gives the cost of that transformation when this is true.
+  virtual bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                         unsigned &Cost) const {
+    return false;
+  }
+
+  /// Return true if target supports floating point exceptions.
+  bool hasFloatingPointExceptions() const {
+    return HasFloatingPointExceptions;
+  }
+
+  /// Return true if target always benefits from combining into FMA for a
+  /// given value type. This must typically return false on targets where FMA
+  /// takes more cycles to execute than FADD.
+  virtual bool enableAggressiveFMAFusion(EVT VT) const {
+    return false;
+  }
+
+  /// Return the ValueType of the result of SETCC operations.
   virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
 
   /// Return the ValueType for comparison libcalls. Comparions libcalls include
@@ -426,10 +445,15 @@
     EVT          memVT;       // memory VT
     const Value* ptrVal;      // value representing memory location
     int          offset;      // offset off of ptrVal
+    unsigned     size;        // the size of the memory location
+                              // (taken from memVT if zero)
     unsigned     align;       // alignment
     bool         vol;         // is volatile?
     bool         readMem;     // reads memory?
     bool         writeMem;    // writes memory?
+
+    IntrinsicInfo() : opc(0), ptrVal(nullptr), offset(0), size(0), align(1),
+                      vol(false), readMem(false), writeMem(false) {}
   };
 
   /// Given an intrinsic, checks if on the target the intrinsic will need to map
@@ -517,10 +541,12 @@
   /// Return how this load with extension should be treated: either it is legal,
   /// needs to be promoted to a larger size, needs to be expanded to some other
   /// code sequence, or the target has a custom expander for it.
-  LegalizeAction getLoadExtAction(unsigned ExtType, MVT VT) const {
-    assert(ExtType < ISD::LAST_LOADEXT_TYPE && VT < MVT::LAST_VALUETYPE &&
+  LegalizeAction getLoadExtAction(unsigned ExtType, EVT VT) const {
+    if (VT.isExtended()) return Expand;
+    unsigned I = (unsigned) VT.getSimpleVT().SimpleTy;
+    assert(ExtType < ISD::LAST_LOADEXT_TYPE && I < MVT::LAST_VALUETYPE &&
            "Table isn't big enough!");
-    return (LegalizeAction)LoadExtActions[VT.SimpleTy][ExtType];
+    return (LegalizeAction)LoadExtActions[I][ExtType];
   }
 
   /// Return true if the specified load with extension is legal on this target.
@@ -532,11 +558,13 @@
   /// Return how this store with truncation should be treated: either it is
   /// legal, needs to be promoted to a larger size, needs to be expanded to some
   /// other code sequence, or the target has a custom expander for it.
-  LegalizeAction getTruncStoreAction(MVT ValVT, MVT MemVT) const {
-    assert(ValVT < MVT::LAST_VALUETYPE && MemVT < MVT::LAST_VALUETYPE &&
+  LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const {
+    if (ValVT.isExtended() || MemVT.isExtended()) return Expand;
+    unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy;
+    unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy;
+    assert(ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE &&
            "Table isn't big enough!");
-    return (LegalizeAction)TruncStoreActions[ValVT.SimpleTy]
-                                            [MemVT.SimpleTy];
+    return (LegalizeAction)TruncStoreActions[ValI][MemI];
   }
 
   /// Return true if the specified store with truncation is legal on this
@@ -773,14 +801,15 @@
   ///
   /// This function returns true if the target allows unaligned memory accesses
   /// of the specified type in the given address space. If true, it also returns
-  /// whether the unaligned memory access is "fast" in the third argument by
+  /// whether the unaligned memory access is "fast" in the last argument by
   /// reference. This is used, for example, in situations where an array
   /// copy/move/set is converted to a sequence of store operations. Its use
   /// helps to ensure that such replacements don't generate code that causes an
   /// alignment error (trap) on the target machine.
-  virtual bool allowsUnalignedMemoryAccesses(EVT,
-                                             unsigned AddrSpace = 0,
-                                             bool * /*Fast*/ = nullptr) const {
+  virtual bool allowsMisalignedMemoryAccesses(EVT,
+                                              unsigned AddrSpace = 0,
+                                              unsigned Align = 1,
+                                              bool * /*Fast*/ = nullptr) const {
     return false;
   }
 
@@ -823,11 +852,6 @@
     return UseUnderscoreLongJmp;
   }
 
-  /// Return whether the target can generate code for jump tables.
-  bool supportJumpTables() const {
-    return SupportJumpTables;
-  }
-
   /// Return integer threshold on number of blocks to use jump tables rather
   /// than if sequence.
   int getMinimumJumpTableEntries() const {
@@ -922,9 +946,13 @@
   /// @}
 
   //===--------------------------------------------------------------------===//
-  /// \name Helpers for load-linked/store-conditional atomic expansion.
+  /// \name Helpers for atomic expansion.
   /// @{
 
+  /// True if AtomicExpandPass should use emitLoadLinked/emitStoreConditional
+  /// and expand AtomicCmpXchgInst.
+  virtual bool hasLoadLinkedStoreConditional() const { return false; }
+
   /// Perform a load-linked operation on Addr, returning a "Value *" with the
   /// corresponding pointee type. This may entail some non-trivial operations to
   /// truncate or reconstruct types that will be illegal in the backend. See
@@ -941,15 +969,90 @@
     llvm_unreachable("Store conditional unimplemented on this target");
   }
 
-  /// Return true if the given (atomic) instruction should be expanded by the
-  /// IR-level AtomicExpandLoadLinked pass into a loop involving
-  /// load-linked/store-conditional pairs. Atomic stores will be expanded in the
-  /// same way as "atomic xchg" operations which ignore their output if needed.
-  virtual bool shouldExpandAtomicInIR(Instruction *Inst) const {
+  /// Inserts in the IR a target-specific intrinsic specifying a fence.
+  /// It is called by AtomicExpandPass before expanding an
+  ///   AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad.
+  /// RMW and CmpXchg set both IsStore and IsLoad to true.
+  /// This function should either return a nullptr, or a pointer to an IR-level
+  ///   Instruction*. Even complex fence sequences can be represented by a
+  ///   single Instruction* through an intrinsic to be lowered later.
+  /// Backends with !getInsertFencesForAtomic() should keep a no-op here.
+  /// Backends should override this method to produce target-specific intrinsic
+  ///   for their fences.
+  /// FIXME: Please note that the default implementation here in terms of
+  ///   IR-level fences exists for historical/compatibility reasons and is
+  ///   *unsound* ! Fences cannot, in general, be used to restore sequential
+  ///   consistency. For example, consider the following program:
+  /// atomic<int> x = y = 0;
+  /// int r1, r2, r3, r4;
+  /// Thread 0:
+  ///   x.store(1);
+  /// Thread 1:
+  ///   y.store(1);
+  /// Thread 2:
+  ///   r1 = x.load();
+  ///   r2 = y.load();
+  /// Thread 3:
+  ///   r3 = y.load();
+  ///   r4 = x.load();
+  ///  r1 = r3 = 1 and r2 = r4 = 0 is impossible as long as the accesses are all
+  ///  seq_cst. But if they are lowered to monotonic accesses, no amount of
+  ///  IR-level fences can prevent it.
+  /// @{
+  virtual Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+          bool IsStore, bool IsLoad) const {
+    if (!getInsertFencesForAtomic())
+      return nullptr;
+
+    if (isAtLeastRelease(Ord) && IsStore)
+      return Builder.CreateFence(Ord);
+    else
+      return nullptr;
+  }
+
+  virtual Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+          bool IsStore, bool IsLoad) const {
+    if (!getInsertFencesForAtomic())
+      return nullptr;
+
+    if (isAtLeastAcquire(Ord))
+      return Builder.CreateFence(Ord);
+    else
+      return nullptr;
+  }
+  /// @}
+
+  /// Returns true if the given (atomic) store should be expanded by the
+  /// IR-level AtomicExpand pass into an "atomic xchg" which ignores its output.
+  virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const {
     return false;
   }
 
+  /// Returns true if the given (atomic) load should be expanded by the
+  /// IR-level AtomicExpand pass into a load-linked instruction
+  /// (through emitLoadLinked()).
+  virtual bool shouldExpandAtomicLoadInIR(LoadInst *LI) const { return false; }
 
+  /// Returns true if the given AtomicRMW should be expanded by the
+  /// IR-level AtomicExpand pass into a loop using LoadLinked/StoreConditional.
+  virtual bool shouldExpandAtomicRMWInIR(AtomicRMWInst *RMWI) const {
+    return false;
+  }
+
+  /// On some platforms, an AtomicRMW that never actually modifies the value
+  /// (such as fetch_add of 0) can be turned into a fence followed by an
+  /// atomic load. This may sound useless, but it makes it possible for the
+  /// processor to keep the cacheline shared, dramatically improving
+  /// performance. And such idempotent RMWs are useful for implementing some
+  /// kinds of locks, see for example (justification + benchmarks):
+  /// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+  /// This method tries doing that transformation, returning the atomic load if
+  /// it succeeds, and nullptr otherwise.
+  /// If shouldExpandAtomicLoadInIR returns true on that load, it will undergo
+  /// another round of expansion.
+  virtual LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *RMWI) const {
+    return nullptr;
+  }
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
@@ -996,11 +1099,6 @@
     UseUnderscoreLongJmp = Val;
   }
 
-  /// Indicate whether the target can generate code for jump tables.
-  void setSupportJumpTables(bool Val) {
-    SupportJumpTables = Val;
-  }
-
   /// Indicate the number of blocks to generate jump tables rather than if
   /// sequence.
   void setMinimumJumpTableEntries(int Val) {
@@ -1058,15 +1156,21 @@
   /// possible, should be replaced by an alternate sequence of instructions not
   /// containing an integer divide.
   void setIntDivIsCheap(bool isCheap = true) { IntDivIsCheap = isCheap; }
+  
+  /// Tells the code generator that this target supports floating point
+  /// exceptions and cares about preserving floating point exception behavior.
+  void setHasFloatingPointExceptions(bool FPExceptions = true) {
+    HasFloatingPointExceptions = FPExceptions;
+  }
 
   /// Tells the code generator which bitwidths to bypass.
   void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth) {
     BypassSlowDivWidths[SlowBitWidth] = FastBitWidth;
   }
 
-  /// Tells the code generator that it shouldn't generate srl/add/sra for a
-  /// signed divide by power of two, and let the target handle it.
-  void setPow2DivIsCheap(bool isCheap = true) { Pow2DivIsCheap = isCheap; }
+  /// Tells the code generator that it shouldn't generate sra/srl/add/sra for a
+  /// signed divide by power of two; let the target handle it.
+  void setPow2SDivIsCheap(bool isCheap = true) { Pow2SDivIsCheap = isCheap; }
 
   /// Add the specified register class as an available regclass for the
   /// specified value type. This indicates the selector can handle values of
@@ -1451,7 +1555,6 @@
 private:
   const TargetMachine &TM;
   const DataLayout *DL;
-  const TargetLoweringObjectFile &TLOF;
 
   /// True if this is a little endian target.
   bool IsLittleEndian;
@@ -1485,15 +1588,19 @@
   /// div/rem when the operands are positive and less than 256.
   DenseMap <unsigned int, unsigned int> BypassSlowDivWidths;
 
-  /// Tells the code generator that it shouldn't generate srl/add/sra for a
-  /// signed divide by power of two, and let the target handle it.
-  bool Pow2DivIsCheap;
+  /// Tells the code generator that it shouldn't generate sra/srl/add/sra for a
+  /// signed divide by power of two; let the target handle it.
+  bool Pow2SDivIsCheap;
 
   /// Tells the code generator that it shouldn't generate extra flow control
   /// instructions and should attempt to combine flow control instructions via
   /// predication.
   bool JumpIsExpensive;
 
+  /// Whether the target supports or cares about preserving floating point
+  /// exception behavior.
+  bool HasFloatingPointExceptions;
+
   /// This target prefers to use _setjmp to implement llvm.setjmp.
   ///
   /// Defaults to false.
@@ -1504,10 +1611,6 @@
   /// Defaults to false.
   bool UseUnderscoreLongJmp;
 
-  /// Whether the target can generate code for jumptables.  If it's not true,
-  /// then each jumptable must be lowered into if-then-else's.
-  bool SupportJumpTables;
-
   /// Number of blocks threshold to use jump tables.
   int MinimumJumpTableEntries;
 
@@ -1635,7 +1738,7 @@
       LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
 
       assert(
-        (LA == TypeLegal ||
+        (LA == TypeLegal || LA == TypeSoftenFloat ||
          ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger)
          && "Promote may not follow Expand or Promote");
 
@@ -1861,9 +1964,8 @@
   void operator=(const TargetLowering&) LLVM_DELETED_FUNCTION;
 
 public:
-  /// NOTE: The constructor takes ownership of TLOF.
-  explicit TargetLowering(const TargetMachine &TM,
-                          const TargetLoweringObjectFile *TLOF);
+  /// NOTE: The TargetMachine owns TLOF.
+  explicit TargetLowering(const TargetMachine &TM);
 
   /// Returns true by value, base pointer and offset pointer and addressing mode
   /// by reference if the node's address can be legally represented as
@@ -2324,9 +2426,9 @@
   /// all the time, e.g. i1 on x86-64. It is also not necessary for non-C
   /// calling conventions. The frontend should handle this and include all of
   /// the necessary information.
-  virtual MVT getTypeForExtArgOrReturn(MVT VT,
+  virtual EVT getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
                                        ISD::NodeType /*ExtendKind*/) const {
-    MVT MinVT = getRegisterType(MVT::i32);
+    EVT MinVT = getRegisterType(Context, MVT::i32);
     return VT.bitsLT(MinVT) ? MinVT : VT;
   }
 
@@ -2474,11 +2576,10 @@
     unsigned getMatchedOperand() const;
 
     /// Copy constructor for copying from a ConstraintInfo.
-    AsmOperandInfo(const InlineAsm::ConstraintInfo &info)
-      : InlineAsm::ConstraintInfo(info),
-        ConstraintType(TargetLowering::C_Unknown),
-        CallOperandVal(nullptr), ConstraintVT(MVT::Other) {
-    }
+    AsmOperandInfo(InlineAsm::ConstraintInfo Info)
+        : InlineAsm::ConstraintInfo(std::move(Info)),
+          ConstraintType(TargetLowering::C_Unknown), CallOperandVal(nullptr),
+          ConstraintVT(MVT::Other) {}
   };
 
   typedef std::vector<AsmOperandInfo> AsmOperandInfoVector;
@@ -2545,6 +2646,45 @@
   SDValue BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                     bool IsAfterLegalization,
                     std::vector<SDNode *> *Created) const;
+  virtual SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                SelectionDAG &DAG,
+                                std::vector<SDNode *> *Created) const {
+    return SDValue();
+  }
+
+  /// Hooks for building estimates in place of slower divisions and square
+  /// roots.
+  
+  /// Return a reciprocal square root estimate value for the input operand.
+  /// The RefinementSteps output is the number of Newton-Raphson refinement
+  /// iterations required to generate a sufficient (though not necessarily
+  /// IEEE-754 compliant) estimate for the value type.
+  /// The boolean UseOneConstNR output is used to select a Newton-Raphson
+  /// algorithm implementation that uses one constant or two constants.
+  /// A target may choose to implement its own refinement within this function.
+  /// If that's true, then return '0' as the number of RefinementSteps to avoid
+  /// any further refinement of the estimate.
+  /// An empty SDValue return means no estimate sequence can be created.
+  virtual SDValue getRsqrtEstimate(SDValue Operand,
+                              DAGCombinerInfo &DCI,
+                              unsigned &RefinementSteps,
+                              bool &UseOneConstNR) const {
+    return SDValue();
+  }
+
+  /// Return a reciprocal estimate value for the input operand.
+  /// The RefinementSteps output is the number of Newton-Raphson refinement
+  /// iterations required to generate a sufficient (though not necessarily
+  /// IEEE-754 compliant) estimate for the value type.
+  /// A target may choose to implement its own refinement within this function.
+  /// If that's true, then return '0' as the number of RefinementSteps to avoid
+  /// any further refinement of the estimate.
+  /// An empty SDValue return means no estimate sequence can be created.
+  virtual SDValue getRecipEstimate(SDValue Operand,
+                                   DAGCombinerInfo &DCI,
+                                   unsigned &RefinementSteps) const {
+    return SDValue();
+  }
 
   //===--------------------------------------------------------------------===//
   // Legalization utility functions
@@ -2564,6 +2704,12 @@
                  SDValue LH = SDValue(), SDValue RL = SDValue(),
                  SDValue RH = SDValue()) const;
 
+  /// Expand float(f32) to SINT(i64) conversion
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
   //
@@ -2583,6 +2729,12 @@
   /// ARM 's' setting instructions.
   virtual void
   AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const;
+
+  /// If this function returns true, SelectionDAGBuilder emits a
+  /// LOAD_STACK_GUARD node when it is lowering Intrinsic::stackprotector.
+  virtual bool useLoadStackGuardNode() const {
+    return false;
+  }
 };
 
 /// Given an LLVM IR type and return type attributes, compute the return value

diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index 419eced..7fcb171 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h

@@ -70,7 +70,8 @@
 
   /// Given a constant with the SectionKind, return a section that it should be
   /// placed in.
-  virtual const MCSection *getSectionForConstant(SectionKind Kind) const;
+  virtual const MCSection *getSectionForConstant(SectionKind Kind,
+                                                 const Constant *C) const;
 
   /// Classify the specified global variable into a set of target independent
   /// categories embodied in SectionKind.
@@ -159,7 +160,7 @@
 protected:
   virtual const MCSection *
   SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
-                         Mangler &Mang, const TargetMachine &TM) const;
+                         Mangler &Mang, const TargetMachine &TM) const = 0;
 };
 
 } // end namespace llvm

diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index b263c57..a4f95c0 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h

@@ -24,7 +24,6 @@
 namespace llvm {
 
 class InstrItineraryData;
-class JITCodeEmitter;
 class GlobalValue;
 class Mangler;
 class MCAsmInfo;
@@ -35,9 +34,7 @@
 class DataLayout;
 class TargetLibraryInfo;
 class TargetFrameLowering;
-class TargetInstrInfo;
 class TargetIntrinsicInfo;
-class TargetJITInfo;
 class TargetLowering;
 class TargetPassConfig;
 class TargetRegisterInfo;
@@ -47,6 +44,7 @@
 class VectorTargetTransformInfo;
 class formatted_raw_ostream;
 class raw_ostream;
+class TargetLoweringObjectFile;
 
 // The old pass manager infrastructure is hidden in a legacy namespace now.
 namespace legacy {
@@ -87,47 +85,27 @@
   unsigned RequireStructuredCFG : 1;
 
 public:
+  mutable TargetOptions Options;
+
   virtual ~TargetMachine();
 
   const Target &getTarget() const { return TheTarget; }
 
-  const StringRef getTargetTriple() const { return TargetTriple; }
-  const StringRef getTargetCPU() const { return TargetCPU; }
-  const StringRef getTargetFeatureString() const { return TargetFS; }
+  StringRef getTargetTriple() const { return TargetTriple; }
+  StringRef getTargetCPU() const { return TargetCPU; }
+  StringRef getTargetFeatureString() const { return TargetFS; }
 
   /// getSubtargetImpl - virtual method implemented by subclasses that returns
   /// a reference to that target's TargetSubtargetInfo-derived member variable.
   virtual const TargetSubtargetInfo *getSubtargetImpl() const {
     return nullptr;
   }
-
-  mutable TargetOptions Options;
-
-  /// \brief Reset the target options based on the function's attributes.
-  void resetTargetOptions(const MachineFunction *MF) const;
-
-  // Interfaces to the major aspects of target machine information:
-  //
-  // -- Instruction opcode and operand information
-  // -- Pipelines and scheduling information
-  // -- Stack frame information
-  // -- Selection DAG lowering information
-  //
-  // N.B. These objects may change during compilation. It's not safe to cache
-  // them between functions.
-  virtual const TargetInstrInfo  *getInstrInfo() const { return nullptr; }
-  virtual const TargetFrameLowering *getFrameLowering() const {
+  virtual const TargetSubtargetInfo *getSubtargetImpl(const Function &) const {
+    return getSubtargetImpl();
+  }
+  virtual TargetLoweringObjectFile *getObjFileLowering() const {
     return nullptr;
   }
-  virtual const TargetLowering *getTargetLowering() const { return nullptr; }
-  virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const {
-    return nullptr;
-  }
-  virtual const DataLayout *getDataLayout() const { return nullptr; }
-
-  /// getMCAsmInfo - Return target specific asm information.
-  ///
-  const MCAsmInfo *getMCAsmInfo() const { return AsmInfo; }
 
   /// getSubtarget - This method returns a pointer to the specified type of
   /// TargetSubtargetInfo.  In debug builds, it verifies that the object being
@@ -135,27 +113,23 @@
   template<typename STC> const STC &getSubtarget() const {
     return *static_cast<const STC*>(getSubtargetImpl());
   }
+  template <typename STC> const STC &getSubtarget(const Function *) const {
+    return *static_cast<const STC*>(getSubtargetImpl());
+  }
 
-  /// getRegisterInfo - If register information is available, return it.  If
-  /// not, return null.  This is kept separate from RegInfo until RegInfo has
-  /// details of graph coloring register allocation removed from it.
+  /// \brief Reset the target options based on the function's attributes.
+  // FIXME: Remove TargetOptions that affect per-function code generation
+  // from TargetMachine.
+  void resetTargetOptions(const Function &F) const;
+
+  /// getMCAsmInfo - Return target specific asm information.
   ///
-  virtual const TargetRegisterInfo *getRegisterInfo() const { return nullptr; }
+  const MCAsmInfo *getMCAsmInfo() const { return AsmInfo; }
 
   /// getIntrinsicInfo - If intrinsic information is available, return it.  If
   /// not, return null.
   ///
-  virtual const TargetIntrinsicInfo *getIntrinsicInfo() const { return nullptr;}
-
-  /// getJITInfo - If this target supports a JIT, return information for it,
-  /// otherwise return null.
-  ///
-  virtual TargetJITInfo *getJITInfo() { return nullptr; }
-
-  /// getInstrItineraryData - Returns instruction itinerary data for the target
-  /// or specific subtarget.
-  ///
-  virtual const InstrItineraryData *getInstrItineraryData() const {
+  virtual const TargetIntrinsicInfo *getIntrinsicInfo() const {
     return nullptr;
   }
 
@@ -233,18 +207,6 @@
     return true;
   }
 
-  /// addPassesToEmitMachineCode - Add passes to the specified pass manager to
-  /// get machine code emitted.  This uses a JITCodeEmitter object to handle
-  /// actually outputting the machine code and resolving things like the address
-  /// of functions.  This method returns true if machine code emission is
-  /// not supported.
-  ///
-  virtual bool addPassesToEmitMachineCode(PassManagerBase &,
-                                          JITCodeEmitter &,
-                                          bool /*DisableVerify*/ = true) {
-    return true;
-  }
-
   /// addPassesToEmitMC - Add passes to the specified pass manager to get
   /// machine code emitted with the MCJIT. This method returns true if machine
   /// code is not supported. It fills the MCContext Ctx pointer which can be
@@ -291,15 +253,6 @@
                            AnalysisID StartAfter = nullptr,
                            AnalysisID StopAfter = nullptr) override;
 
-  /// addPassesToEmitMachineCode - Add passes to the specified pass manager to
-  /// get machine code emitted.  This uses a JITCodeEmitter object to handle
-  /// actually outputting the machine code and resolving things like the address
-  /// of functions.  This method returns true if machine code emission is
-  /// not supported.
-  ///
-  bool addPassesToEmitMachineCode(PassManagerBase &PM, JITCodeEmitter &MCE,
-                                  bool DisableVerify = true) override;
-
   /// addPassesToEmitMC - Add passes to the specified pass manager to get
   /// machine code emitted with the MCJIT. This method returns true if machine
   /// code is not supported. It fills the MCContext Ctx pointer which can be
@@ -307,14 +260,6 @@
   ///
   bool addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
                          raw_ostream &OS, bool DisableVerify = true) override;
-
-  /// addCodeEmitter - This pass should be overridden by the target to add a
-  /// code emitter, if supported.  If this is not supported, 'true' should be
-  /// returned.
-  virtual bool addCodeEmitter(PassManagerBase &,
-                              JITCodeEmitter &) {
-    return true;
-  }
 };
 
 } // End llvm namespace

diff --git a/include/llvm/Target/TargetOpcodes.h b/include/llvm/Target/TargetOpcodes.h
index abb3eca..1fbd2ae 100644
--- a/include/llvm/Target/TargetOpcodes.h
+++ b/include/llvm/Target/TargetOpcodes.h

@@ -104,7 +104,13 @@
   /// support optimizations for dynamic languages (such as javascript) that
   /// rewrite calls to runtimes with more efficient code sequences.
   /// This also implies a stack map.
-  PATCHPOINT = 18
+  PATCHPOINT = 18,
+
+  /// This pseudo-instruction loads the stack guard value. Targets which need
+  /// to prevent the stack guard value or address from being spilled to the
+  /// stack should override TargetLowering::emitLoadStackGuardNode and
+  /// additionally expand this pseudo after register allocation.
+  LOAD_STACK_GUARD = 19
 };
 } // end namespace TargetOpcode
 } // end namespace llvm

diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index 922fae5..73014d8 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h

@@ -50,6 +50,21 @@
     };
   }
 
+  namespace ThreadModel {
+    enum Model {
+      POSIX,  // POSIX Threads
+      Single  // Single Threaded Environment
+    };
+  }
+
+  enum class CFIntegrity {
+    Sub,             // Use subtraction-based checks.
+    Ror,             // Use rotation-based checks.
+    Add              // Use addition-based checks. This depends on having
+                     // sufficient alignment in the code and is usually not
+                     // feasible.
+  };
+
   class TargetOptions {
   public:
     TargetOptions()
@@ -63,9 +78,11 @@
           EnableFastISel(false), PositionIndependentExecutable(false),
           UseInitArray(false), DisableIntegratedAS(false),
           CompressDebugSections(false), FunctionSections(false),
-          DataSections(false), TrapUnreachable(false), TrapFuncName(""),
+          DataSections(false), TrapUnreachable(false), TrapFuncName(),
           FloatABIType(FloatABI::Default),
-          AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single) {}
+          AllowFPOpFusion(FPOpFusion::Standard), JTType(JumpTable::Single),
+          FCFI(false), ThreadModel(ThreadModel::POSIX),
+          CFIType(CFIntegrity::Sub), CFIEnforcing(false), CFIFuncName() {}
 
     /// PrintMachineCode - This flag is enabled when the -print-machineinstrs
     /// option is specified on the command line, and should enable debugging
@@ -220,6 +237,28 @@
     /// create for functions that have the jumptable attribute.
     JumpTable::JumpTableType JTType;
 
+    /// FCFI - This flag controls whether or not forward-edge control-flow
+    /// integrity is applied.
+    bool FCFI;
+
+    /// ThreadModel - This flag specifies the type of threading model to assume
+    /// for things like atomics.
+    ThreadModel::Model ThreadModel;
+
+    /// CFIType - This flag specifies the type of control-flow integrity check
+    /// to add as a preamble to indirect calls.
+    CFIntegrity CFIType;
+
+    /// CFIEnforcing - This flag controls whether or not CFI violations cause
+    /// the program to halt.
+    bool CFIEnforcing;
+
+    /// getCFIFuncName - If this returns a non-empty string, then this is the
+    /// name of the function that will be called for each CFI violation in
+    /// non-enforcing mode.
+    std::string CFIFuncName;
+    StringRef getCFIFuncName() const;
+
     /// Machine level options.
     MCTargetOptions MCOptions;
   };

diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index c6f3fbf..16b72a9 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h

@@ -18,7 +18,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include <cassert>
@@ -52,10 +52,6 @@
   ///
   unsigned getID() const { return MC->getID(); }
 
-  /// getName() - Return the register class name for debugging.
-  ///
-  const char *getName() const { return MC->getName(); }
-
   /// begin/end - Return all of the registers in this class.
   ///
   iterator       begin() const { return MC->begin(); }
@@ -101,9 +97,9 @@
 
   /// hasType - return true if this TargetRegisterClass has the ValueType vt.
   ///
-  bool hasType(EVT vt) const {
+  bool hasType(MVT vt) const {
     for(int i = 0; VTs[i] != MVT::Other; ++i)
-      if (EVT(VTs[i]) == vt)
+      if (MVT(VTs[i]) == vt)
         return true;
     return false;
   }
@@ -306,7 +302,7 @@
   /// register of the given type, picking the most sub register class of
   /// the right type that contains this physreg.
   const TargetRegisterClass *
-    getMinimalPhysRegClass(unsigned Reg, EVT VT = MVT::Other) const;
+    getMinimalPhysRegClass(unsigned Reg, MVT VT = MVT::Other) const;
 
   /// getAllocatableClass - Return the maximal subclass of the given register
   /// class that is alloctable, or NULL.
@@ -506,6 +502,10 @@
     return composeSubRegIndicesImpl(a, b);
   }
 
+  /// Debugging helper: dump register in human readable form to dbgs() stream.
+  static void dumpReg(unsigned Reg, unsigned SubRegIndex = 0,
+                      const TargetRegisterInfo* TRI = nullptr);
+
 protected:
   /// Overridden by TableGen in targets that have sub-registers.
   virtual unsigned composeSubRegIndicesImpl(unsigned, unsigned) const {
@@ -561,6 +561,11 @@
     return RegClassBegin[i];
   }
 
+  /// getRegClassName - Returns the name of the register class.
+  const char *getRegClassName(const TargetRegisterClass *Class) const {
+    return MCRegisterInfo::getRegClassName(Class->MC);
+  }
+
   /// getCommonSubClass - find the largest common subclass of A and B. Return
   /// NULL if there is no common subclass.
   const TargetRegisterClass *
@@ -683,12 +688,6 @@
   /// (3) Bottom-up allocation is no longer guaranteed to optimally color.
   virtual bool reverseLocalAssignment() const { return false; }
 
-  /// Allow the target to override register assignment heuristics based on the
-  /// live range size. If this returns false, then local live ranges are always
-  /// assigned in order regardless of their size. This is a temporary hook for
-  /// debugging downstream codegen failures exposed by regalloc.
-  virtual bool mayOverrideLocalAssignment() const { return true; }
-
   /// Allow the target to override the cost of using a callee-saved register for
   /// the first time. Default value of 0 means we will use a callee-saved
   /// register if it is available.
@@ -808,6 +807,18 @@
                                    RegScavenger *RS = nullptr) const = 0;
 
   //===--------------------------------------------------------------------===//
+  /// Subtarget Hooks
+
+  /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true.
+  virtual bool shouldCoalesce(MachineInstr *MI,
+                              const TargetRegisterClass *SrcRC,
+                              unsigned SubReg,
+                              const TargetRegisterClass *DstRC,
+                              unsigned DstSubReg,
+                              const TargetRegisterClass *NewRC) const
+  { return true; }
+
+  //===--------------------------------------------------------------------===//
   /// Debug information queries.
 
   /// getFrameRegister - This method should return the register used as a base

diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index e6eeb88..89db37c 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td

@@ -88,6 +88,8 @@
   // Per-cycle resources tables.
   ProcessorItineraries Itineraries = NoItineraries;
 
+  bit PostRAScheduler = 0; // Enable Post RegAlloc Scheduler pass.
+
   // Subtargets that define a model for only a subset of instructions
   // that have a scheduling class (itinerary class or SchedRW list)
   // and may actually be generated for that subtarget must clear this

diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 16cfff1..f63afd7 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td

@@ -162,6 +162,10 @@
   SDTCisVT<0, OtherVT>
 ]>;
 
+def SDTBrCC : SDTypeProfile<0, 4, [       // brcc
+  SDTCisVT<0, OtherVT>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
+]>;
+
 def SDTBrcond : SDTypeProfile<0, 2, [       // brcond
   SDTCisInt<0>, SDTCisVT<1, OtherVT>
 ]>;
@@ -205,7 +209,7 @@
   SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, SDTCisInt<1>
 ]>;
 
-def SDTMemBarrier : SDTypeProfile<0, 5, [   // memory barier
+def SDTMemBarrier : SDTypeProfile<0, 5, [   // memory barrier
   SDTCisSameAs<0,1>,  SDTCisSameAs<0,2>,  SDTCisSameAs<0,3>, SDTCisSameAs<0,4>,
   SDTCisInt<0>
 ]>;
@@ -369,6 +373,8 @@
 def frem       : SDNode<"ISD::FREM"       , SDTFPBinOp>;
 def fma        : SDNode<"ISD::FMA"        , SDTFPTernaryOp>;
 def fabs       : SDNode<"ISD::FABS"       , SDTFPUnaryOp>;
+def fminnum    : SDNode<"ISD::FMINNUM"    , SDTFPBinOp>;
+def fmaxnum    : SDNode<"ISD::FMAXNUM"    , SDTFPBinOp>;
 def fgetsign   : SDNode<"ISD::FGETSIGN"   , SDTFPToIntOp>;
 def fneg       : SDNode<"ISD::FNEG"       , SDTFPUnaryOp>;
 def fsqrt      : SDNode<"ISD::FSQRT"      , SDTFPUnaryOp>;
@@ -392,14 +398,15 @@
 def uint_to_fp : SDNode<"ISD::UINT_TO_FP" , SDTIntToFPOp>;
 def fp_to_sint : SDNode<"ISD::FP_TO_SINT" , SDTFPToIntOp>;
 def fp_to_uint : SDNode<"ISD::FP_TO_UINT" , SDTFPToIntOp>;
-def f16_to_f32 : SDNode<"ISD::FP16_TO_FP32", SDTIntToFPOp>;
-def f32_to_f16 : SDNode<"ISD::FP32_TO_FP16", SDTFPToIntOp>;
+def f16_to_fp  : SDNode<"ISD::FP16_TO_FP" , SDTIntToFPOp>;
+def fp_to_f16  : SDNode<"ISD::FP_TO_FP16" , SDTFPToIntOp>;
 
 def setcc      : SDNode<"ISD::SETCC"      , SDTSetCC>;
 def select     : SDNode<"ISD::SELECT"     , SDTSelect>;
 def vselect    : SDNode<"ISD::VSELECT"    , SDTVSelect>;
 def selectcc   : SDNode<"ISD::SELECT_CC"  , SDTSelectCC>;
 
+def brcc       : SDNode<"ISD::BR_CC"      , SDTBrCC,   [SDNPHasChain]>;
 def brcond     : SDNode<"ISD::BRCOND"     , SDTBrcond, [SDNPHasChain]>;
 def brind      : SDNode<"ISD::BRIND"      , SDTBrind,  [SDNPHasChain]>;
 def br         : SDNode<"ISD::BR"         , SDTBr,     [SDNPHasChain]>;

diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h
index 78a2db1..d1a3fcf 100644
--- a/include/llvm/Target/TargetSelectionDAGInfo.h
+++ b/include/llvm/Target/TargetSelectionDAGInfo.h

@@ -46,7 +46,7 @@
   /// more efficient than using a library call. This function can return a null
   /// SDValue if the target declines to use custom code and a different
   /// lowering strategy should be used.
-  /// 
+  ///
   /// If AlwaysInline is true, the size is constant and the target should not
   /// emit any calls and is strongly encouraged to attempt to emit inline code
   /// even if it is beyond the usual threshold because this intrinsic is being

diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h
index bbb83ef..80ff9e3 100644
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h

@@ -14,17 +14,24 @@
 #ifndef LLVM_TARGET_TARGETSUBTARGETINFO_H
 #define LLVM_TARGET_TARGETSUBTARGETINFO_H
 
+#include "llvm/CodeGen/PBQPRAConstraint.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/CodeGen.h"
 
 namespace llvm {
 
+class DataLayout;
 class MachineFunction;
 class MachineInstr;
 class SDep;
 class SUnit;
+class TargetFrameLowering;
+class TargetInstrInfo;
+class TargetLowering;
 class TargetRegisterClass;
+class TargetRegisterInfo;
 class TargetSchedModel;
+class TargetSelectionDAGInfo;
 struct MachineSchedPolicy;
 template <typename T> class SmallVectorImpl;
 
@@ -47,6 +54,38 @@
 
   virtual ~TargetSubtargetInfo();
 
+  // Interfaces to the major aspects of target machine information:
+  //
+  // -- Instruction opcode and operand information
+  // -- Pipelines and scheduling information
+  // -- Stack frame information
+  // -- Selection DAG lowering information
+  //
+  // N.B. These objects may change during compilation. It's not safe to cache
+  // them between functions.
+  virtual const TargetInstrInfo *getInstrInfo() const { return nullptr; }
+  virtual const TargetFrameLowering *getFrameLowering() const {
+    return nullptr;
+  }
+  virtual const TargetLowering *getTargetLowering() const { return nullptr; }
+  virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const {
+    return nullptr;
+  }
+  virtual const DataLayout *getDataLayout() const { return nullptr; }
+
+  /// getRegisterInfo - If register information is available, return it.  If
+  /// not, return null.  This is kept separate from RegInfo until RegInfo has
+  /// details of graph coloring register allocation removed from it.
+  ///
+  virtual const TargetRegisterInfo *getRegisterInfo() const { return nullptr; }
+
+  /// getInstrItineraryData - Returns instruction itinerary data for the target
+  /// or specific subtarget.
+  ///
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return nullptr;
+  }
+
   /// Resolve a SchedClass at runtime, where SchedClass identifies an
   /// MCSchedClassDesc with the isVariant property. This may return the ID of
   /// another variant SchedClass, but repeated invocation must quickly terminate
@@ -74,7 +113,7 @@
   virtual bool enablePostMachineScheduler() const;
 
   /// \brief True if the subtarget should run the atomic expansion pass.
-  virtual bool enableAtomicExpandLoadLinked() const;
+  virtual bool enableAtomicExpand() const;
 
   /// \brief Override generic scheduling policy within a region.
   ///
@@ -91,14 +130,24 @@
   virtual void adjustSchedDependency(SUnit *def, SUnit *use,
                                      SDep& dep) const { }
 
-  // enablePostRAScheduler - If the target can benefit from post-regalloc
-  // scheduling and the specified optimization level meets the requirement
-  // return true to enable post-register-allocation scheduling. In
-  // CriticalPathRCs return any register classes that should only be broken
-  // if on the critical path.
-  virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                                     AntiDepBreakMode& Mode,
-                                     RegClassVector& CriticalPathRCs) const;
+  // For use with PostRAScheduling: get the anti-dependence breaking that should
+  // be performed before post-RA scheduling.
+  virtual AntiDepBreakMode getAntiDepBreakMode() const {
+    return ANTIDEP_NONE;
+  }
+
+  // For use with PostRAScheduling: in CriticalPathRCs, return any register
+  // classes that should only be considered for anti-dependence breaking if they
+  // are on the critical path.
+  virtual void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
+    return CriticalPathRCs.clear();
+  }
+
+  // For use with PostRAScheduling: get the minimum optimization level needed
+  // to enable post-RA scheduling.
+  virtual CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const {
+    return CodeGenOpt::Default;
+  }
 
   /// \brief True if the subtarget should run the local reassignment
   /// heuristic of the register allocator.
@@ -113,8 +162,12 @@
   /// \brief Enable the use of the early if conversion pass.
   virtual bool enableEarlyIfConversion() const { return false; }
 
-  /// \brief Reset the features for the subtarget.
-  virtual void resetSubtargetFeatures(const MachineFunction *MF) { }
+  /// \brief Return PBQPConstraint(s) for the target.
+  ///
+  /// Override to provide custom PBQP constraints.
+  virtual std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const {
+    return nullptr;
+  }
 };
 
 } // End llvm namespace

diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 50877d0..b1426b4 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h

@@ -18,16 +18,17 @@
 #include <vector>
 
 namespace llvm {
-class TargetLibraryInfo;
 class Pass;
+class TargetLibraryInfo;
+class TargetMachine;
 
 // The old pass manager infrastructure is hidden in a legacy namespace now.
 namespace legacy {
-class PassManagerBase;
 class FunctionPassManager;
+class PassManagerBase;
 }
-using legacy::PassManagerBase;
 using legacy::FunctionPassManager;
+using legacy::PassManagerBase;
 
 /// PassManagerBuilder - This class is used to set up a standard optimization
 /// sequence for languages like C and C++, allowing some APIs to customize the
@@ -118,6 +119,11 @@
   bool LoopVectorize;
   bool RerollLoops;
   bool LoadCombine;
+  bool DisableGVNLoadPRE;
+  bool VerifyInput;
+  bool VerifyOutput;
+  bool StripDebug;
+  bool MergeFunctions;
 
 private:
   /// ExtensionList - This is list of all of the extensions that are registered.
@@ -135,6 +141,7 @@
 private:
   void addExtensionsToPM(ExtensionPointTy ETy, PassManagerBase &PM) const;
   void addInitialAliasAnalysisPasses(PassManagerBase &PM) const;
+  void addLTOOptimizationPasses(PassManagerBase &PM);
 
 public:
   /// populateFunctionPassManager - This fills in the function pass manager,
@@ -144,8 +151,7 @@
 
   /// populateModulePassManager - This sets up the primary pass manager.
   void populateModulePassManager(PassManagerBase &MPM);
-  void populateLTOPassManager(PassManagerBase &PM, bool Internalize,
-                              bool RunInliner, bool DisableGVNLoadPRE = false);
+  void populateLTOPassManager(PassManagerBase &PM, TargetMachine *TM = nullptr);
 };
 
 /// Registers a function for adding a standard set of passes.  This should be

diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index c6a339b..87422df 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h

@@ -78,6 +78,9 @@
                                         void *(*getArgTLS)() = nullptr,
                                         void *(*getRetValTLS)() = nullptr);
 
+// Insert SanitizerCoverage instrumentation.
+ModulePass *createSanitizerCoverageModulePass(int CoverageLevel);
+
 #if defined(__GNUC__) && defined(__linux__) && !defined(ANDROID)
 inline ModulePass *createDataFlowSanitizerPassForJIT(StringRef ABIListFile =
                                                          StringRef()) {

diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 8ecfd80..5dcd899 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h

@@ -36,6 +36,13 @@
 
 //===----------------------------------------------------------------------===//
 //
+// AlignmentFromAssumptions - Use assume intrinsics to set load/store
+// alignments.
+//
+FunctionPass *createAlignmentFromAssumptionsPass();
+
+//===----------------------------------------------------------------------===//
+//
 // SCCP - Sparse conditional constant propagation.
 //
 FunctionPass *createSCCPPass();
@@ -199,16 +206,17 @@
 //===----------------------------------------------------------------------===//
 //
 // JumpThreading - Thread control through mult-pred/multi-succ blocks where some
-// preds always go to some succ.
+// preds always go to some succ. Thresholds other than minus one override the
+// internal BB duplication default threshold.
 //
-FunctionPass *createJumpThreadingPass();
+FunctionPass *createJumpThreadingPass(int Threshold = -1);
 
 //===----------------------------------------------------------------------===//
 //
 // CFGSimplification - Merge basic blocks, eliminate unreachable blocks,
 // simplify terminator instructions, etc...
 //
-FunctionPass *createCFGSimplificationPass();
+FunctionPass *createCFGSimplificationPass(int Threshold = -1);
 
 //===----------------------------------------------------------------------===//
 //
@@ -288,6 +296,13 @@
 
 //===----------------------------------------------------------------------===//
 //
+// MergedLoadStoreMotion - This pass merges loads and stores in diamonds. Loads
+// are hoisted into the header, while stores sink into the footer.
+//
+FunctionPass *createMergedLoadStoreMotionPass();
+
+//===----------------------------------------------------------------------===//
+//
 // GVN - This pass performs global value numbering and redundant load
 // elimination cotemporaneously.
 //
@@ -380,7 +395,9 @@
 //
 // SeparateConstOffsetFromGEP - Split GEPs for better CSE
 //
-FunctionPass *createSeparateConstOffsetFromGEPPass();
+FunctionPass *
+createSeparateConstOffsetFromGEPPass(const TargetMachine *TM = nullptr,
+                                     bool LowerGEP = false);
 
 //===----------------------------------------------------------------------===//
 //

diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 7309f69..19acf5b 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h

@@ -23,6 +23,7 @@
 namespace llvm {
 
 class AliasAnalysis;
+class DominatorTree;
 class Instruction;
 class MDNode;
 class Pass;
@@ -132,6 +133,11 @@
   }
 }
 
+// SplitAllCriticalEdges - Loop over all of the edges in the CFG,
+// breaking critical edges as they are found. Pass P must not be NULL.
+// Returns the number of broken edges.
+unsigned SplitAllCriticalEdges(Function &F, Pass *P);
+
 /// SplitEdge -  Split the edge connecting specified block. Pass P must
 /// not be NULL.
 BasicBlock *SplitEdge(BasicBlock *From, BasicBlock *To, Pass *P);
@@ -202,9 +208,12 @@
 /// If Unreachable is true, then ThenBlock ends with
 /// UnreachableInst, otherwise it branches to Tail.
 /// Returns the NewBasicBlock's terminator.
+///
+/// Updates DT if given.
 TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
                                           bool Unreachable,
-                                          MDNode *BranchWeights = nullptr);
+                                          MDNode *BranchWeights = nullptr,
+                                          DominatorTree *DT = nullptr);
 
 /// SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen,
 /// but also creates the ElseBlock.

diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index bdf50dd..740d725 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h

@@ -43,6 +43,8 @@
 class Loop;
 class LoopInfo;
 class AllocaInst;
+class AliasAnalysis;
+class AssumptionTracker;
 
 /// CloneModule - Return an exact copy of the specified module
 ///
@@ -157,13 +159,18 @@
 /// InlineFunction call, and records the auxiliary results produced by it.
 class InlineFunctionInfo {
 public:
-  explicit InlineFunctionInfo(CallGraph *cg = nullptr, const DataLayout *DL = nullptr)
-    : CG(cg), DL(DL) {}
+  explicit InlineFunctionInfo(CallGraph *cg = nullptr,
+                              const DataLayout *DL = nullptr,
+                              AliasAnalysis *AA = nullptr,
+                              AssumptionTracker *AT = nullptr)
+    : CG(cg), DL(DL), AA(AA), AT(AT) {}
 
   /// CG - If non-null, InlineFunction will update the callgraph to reflect the
   /// changes it makes.
   CallGraph *CG;
   const DataLayout *DL;
+  AliasAnalysis *AA;
+  AssumptionTracker *AT;
 
   /// StaticAllocas - InlineFunction fills this in with all static allocas that
   /// get copied into the caller.

diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h
index 6b41e82..3a96d95 100644
--- a/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/include/llvm/Transforms/Utils/CodeExtractor.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_UTILS_CODE_EXTRACTOR_H
-#define LLVM_TRANSFORMS_UTILS_CODE_EXTRACTOR_H
+#ifndef LLVM_TRANSFORMS_UTILS_CODEEXTRACTOR_H
+#define LLVM_TRANSFORMS_UTILS_CODEEXTRACTOR_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SetVector.h"

diff --git a/include/llvm/Transforms/Utils/CtorUtils.h b/include/llvm/Transforms/Utils/CtorUtils.h
index 81e7b95..63e564d 100644
--- a/include/llvm/Transforms/Utils/CtorUtils.h
+++ b/include/llvm/Transforms/Utils/CtorUtils.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_UTILS_CTOR_UTILS_H
-#define LLVM_TRANSFORMS_UTILS_CTOR_UTILS_H
+#ifndef LLVM_TRANSFORMS_UTILS_CTORUTILS_H
+#define LLVM_TRANSFORMS_UTILS_CTORUTILS_H
 
 #include "llvm/ADT/STLExtras.h"
 

diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index c0c6906..e89e5e5 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h

@@ -34,12 +34,14 @@
 class Pass;
 class PHINode;
 class AllocaInst;
+class AssumptionTracker;
 class ConstantExpr;
 class DataLayout;
 class TargetLibraryInfo;
 class TargetTransformInfo;
 class DIBuilder;
 class AliasAnalysis;
+class DominatorTree;
 
 template<typename T> class SmallVectorImpl;
 
@@ -136,7 +138,9 @@
 /// the basic block that was pointed to.
 ///
 bool SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
-                 const DataLayout *TD = nullptr);
+                 unsigned BonusInstThreshold,
+                 const DataLayout *TD = nullptr,
+                 AssumptionTracker *AT = nullptr);
 
 /// FlatternCFG - This function is used to flatten a CFG.  For
 /// example, it uses parallel-and and parallel-or mode to collapse
@@ -148,7 +152,8 @@
 /// and if a predecessor branches to us and one of our successors, fold the
 /// setcc into the predecessor and use logical operations to pick the right
 /// destination.
-bool FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL = nullptr);
+bool FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL = nullptr,
+                            unsigned BonusInstThreshold = 1);
 
 /// DemoteRegToStack - This function takes a virtual register computed by an
 /// Instruction and replaces it with a slot in the stack frame, allocated via
@@ -170,12 +175,18 @@
 /// and it is more than the alignment of the ultimate object, see if we can
 /// increase the alignment of the ultimate object, making this check succeed.
 unsigned getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
-                                    const DataLayout *TD = nullptr);
+                                    const DataLayout *TD = nullptr,
+                                    AssumptionTracker *AT = nullptr,
+                                    const Instruction *CxtI = nullptr,
+                                    const DominatorTree *DT = nullptr);
 
 /// getKnownAlignment - Try to infer an alignment for the specified pointer.
 static inline unsigned getKnownAlignment(Value *V,
-                                         const DataLayout *TD = nullptr) {
-  return getOrEnforceKnownAlignment(V, 0, TD);
+                                         const DataLayout *TD = nullptr,
+                                         AssumptionTracker *AT = nullptr,
+                                         const Instruction *CxtI = nullptr,
+                                         const DominatorTree *DT = nullptr) {
+  return getOrEnforceKnownAlignment(V, 0, TD, AT, CxtI, DT);
 }
 
 /// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the
@@ -275,6 +286,11 @@
 /// Returns true if any basic block was removed.
 bool removeUnreachableBlocks(Function &F);
 
+/// \brief Combine the metadata of two instructions so that K can replace J
+///
+/// Metadata not listed as known via KnownIDs is removed
+void combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsigned> KnownIDs);
+
 } // End llvm namespace
 
 #endif

diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index 7e3a74a..fdae80d 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h

@@ -16,6 +16,7 @@
 
 namespace llvm {
 class AliasAnalysis;
+class AssumptionTracker;
 class BasicBlock;
 class DataLayout;
 class DominatorTree;
@@ -34,7 +35,8 @@
 /// passed into it.
 bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
                   AliasAnalysis *AA = nullptr, ScalarEvolution *SE = nullptr,
-                  const DataLayout *DL = nullptr);
+                  const DataLayout *DL = nullptr,
+                  AssumptionTracker *AT = nullptr);
 
 /// \brief Put loop into LCSSA form.
 ///

diff --git a/include/llvm/Transforms/Utils/ModuleUtils.h b/include/llvm/Transforms/Utils/ModuleUtils.h
index 98a19ed..16904f1 100644
--- a/include/llvm/Transforms/Utils/ModuleUtils.h
+++ b/include/llvm/Transforms/Utils/ModuleUtils.h

@@ -20,7 +20,7 @@
 class Function;
 class GlobalValue;
 class GlobalVariable;
-template <class PtrType, unsigned SmallSize> class SmallPtrSet;
+template <class PtrType> class SmallPtrSetImpl;
 
 /// Append F to the list of global ctors of module M with the given Priority.
 /// This wraps the function in the appropriate structure and stores it along
@@ -34,7 +34,7 @@
 /// \brief Given "llvm.used" or "llvm.compiler.used" as a global name, collect
 /// the initializer elements of that global in Set and return the global itself.
 GlobalVariable *collectUsedGlobalVariables(Module &M,
-                                           SmallPtrSet<GlobalValue *, 8> &Set,
+                                           SmallPtrSetImpl<GlobalValue *> &Set,
                                            bool CompilerUsed);
 } // End llvm namespace
 

diff --git a/include/llvm/Transforms/Utils/PromoteMemToReg.h b/include/llvm/Transforms/Utils/PromoteMemToReg.h
index c83fedb..3fdd5e9 100644
--- a/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/include/llvm/Transforms/Utils/PromoteMemToReg.h

@@ -22,6 +22,7 @@
 class AllocaInst;
 class DominatorTree;
 class AliasSetTracker;
+class AssumptionTracker;
 
 /// \brief Return true if this alloca is legal for promotion.
 ///
@@ -41,7 +42,8 @@
 /// If AST is specified, the specified tracker is updated to reflect changes
 /// made to the IR.
 void PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                     AliasSetTracker *AST = nullptr);
+                     AliasSetTracker *AST = nullptr,
+                     AssumptionTracker *AT = nullptr);
 
 } // End llvm namespace
 

diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index a2a5f9a..6765ac1 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h

@@ -15,40 +15,118 @@
 #ifndef LLVM_TRANSFORMS_UTILS_SIMPLIFYLIBCALLS_H
 #define LLVM_TRANSFORMS_UTILS_SIMPLIFYLIBCALLS_H
 
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/IRBuilder.h"
+
 namespace llvm {
-  class Value;
-  class CallInst;
-  class DataLayout;
-  class Instruction;
-  class TargetLibraryInfo;
-  class LibCallSimplifierImpl;
+class Value;
+class CallInst;
+class DataLayout;
+class Instruction;
+class TargetLibraryInfo;
+class BasicBlock;
+class Function;
 
-  /// LibCallSimplifier - This class implements a collection of optimizations
-  /// that replace well formed calls to library functions with a more optimal
-  /// form.  For example, replacing 'printf("Hello!")' with 'puts("Hello!")'.
-  class LibCallSimplifier {
-    /// Impl - A pointer to the actual implementation of the library call
-    /// simplifier.
-    LibCallSimplifierImpl *Impl;
+/// LibCallSimplifier - This class implements a collection of optimizations
+/// that replace well formed calls to library functions with a more optimal
+/// form.  For example, replacing 'printf("Hello!")' with 'puts("Hello!")'.
+class LibCallSimplifier {
+private:
+  const DataLayout *DL;
+  const TargetLibraryInfo *TLI;
+  bool UnsafeFPShrink;
 
-  public:
-    LibCallSimplifier(const DataLayout *TD, const TargetLibraryInfo *TLI,
-                      bool UnsafeFPShrink);
-    virtual ~LibCallSimplifier();
+protected:
+  ~LibCallSimplifier() {}
 
-    /// optimizeCall - Take the given call instruction and return a more
-    /// optimal value to replace the instruction with or 0 if a more
-    /// optimal form can't be found.  Note that the returned value may
-    /// be equal to the instruction being optimized.  In this case all
-    /// other instructions that use the given instruction were modified
-    /// and the given instruction is dead.
-    Value *optimizeCall(CallInst *CI);
+public:
+  LibCallSimplifier(const DataLayout *TD, const TargetLibraryInfo *TLI);
 
-    /// replaceAllUsesWith - This method is used when the library call
-    /// simplifier needs to replace instructions other than the library
-    /// call being modified.
-    virtual void replaceAllUsesWith(Instruction *I, Value *With) const;
-  };
+  /// optimizeCall - Take the given call instruction and return a more
+  /// optimal value to replace the instruction with or 0 if a more
+  /// optimal form can't be found.  Note that the returned value may
+  /// be equal to the instruction being optimized.  In this case all
+  /// other instructions that use the given instruction were modified
+  /// and the given instruction is dead.
+  Value *optimizeCall(CallInst *CI);
+
+  /// replaceAllUsesWith - This method is used when the library call
+  /// simplifier needs to replace instructions other than the library
+  /// call being modified.
+  virtual void replaceAllUsesWith(Instruction *I, Value *With) const;
+
+private:
+  // Fortified Library Call Optimizations
+  Value *optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeMemSetChk(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrCpyChk(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStpCpyChk(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrNCpyChk(CallInst *CI, IRBuilder<> &B);
+
+  // String and Memory Library Call Optimizations
+  Value *optimizeStrCat(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrNCat(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrChr(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrRChr(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrCmp(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrNCmp(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrCpy(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStpCpy(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrNCpy(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrLen(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrPBrk(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrTo(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrSpn(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrCSpn(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeStrStr(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeMemCmp(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeMemCpy(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeMemMove(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeMemSet(CallInst *CI, IRBuilder<> &B);
+
+  // Math Library Optimizations
+  Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B, bool CheckRetType);
+  Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeCos(CallInst *CI, IRBuilder<> &B);
+  Value *optimizePow(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeExp2(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeFabs(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeSqrt(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeSinCosPi(CallInst *CI, IRBuilder<> &B);
+
+  // Integer Library Call Optimizations
+  Value *optimizeFFS(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeAbs(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeIsDigit(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeIsAscii(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeToAscii(CallInst *CI, IRBuilder<> &B);
+
+  // Formatting and IO Library Call Optimizations
+  Value *optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
+                                int StreamArg = -1);
+  Value *optimizePrintF(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeSPrintF(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeFPrintF(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeFWrite(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeFPuts(CallInst *CI, IRBuilder<> &B);
+  Value *optimizePuts(CallInst *CI, IRBuilder<> &B);
+
+  // Helper methods
+  Value *emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B);
+  void classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat,
+                      SmallVectorImpl<CallInst *> &SinCalls,
+                      SmallVectorImpl<CallInst *> &CosCalls,
+                      SmallVectorImpl<CallInst *> &SinCosCalls);
+  void replaceTrigInsts(SmallVectorImpl<CallInst *> &Calls, Value *Res);
+  Value *optimizePrintFString(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeSPrintFString(CallInst *CI, IRBuilder<> &B);
+  Value *optimizeFPrintFString(CallInst *CI, IRBuilder<> &B);
+
+  /// hasFloatVersion - Checks if there is a float version of the specified
+  /// function by checking for an existing function with name FuncName + f
+  bool hasFloatVersion(StringRef FuncName);
+};
 } // End llvm namespace
 
 #endif

diff --git a/include/llvm/Transforms/Utils/SymbolRewriter.h b/include/llvm/Transforms/Utils/SymbolRewriter.h
new file mode 100644
index 0000000..af79372
--- /dev/null
+++ b/include/llvm/Transforms/Utils/SymbolRewriter.h

@@ -0,0 +1,155 @@
+//===-- SymbolRewriter.h - Symbol Rewriting Pass ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the prototypes and definitions related to the Symbol
+// Rewriter pass.
+//
+// The Symbol Rewriter pass takes a set of rewrite descriptors which define
+// transformations for symbol names.  These can be either single name to name
+// transformation or more broad regular expression based transformations.
+//
+// All the functions are re-written at the IR level.  The Symbol Rewriter itself
+// is exposed as a module level pass.  All symbols at the module level are
+// iterated.  For any matching symbol, the requested transformation is applied,
+// updating references to it as well (a la RAUW).  The resulting binary will
+// only contain the rewritten symbols.
+//
+// By performing this operation in the compiler, we are able to catch symbols
+// that would otherwise not be possible to catch (e.g. inlined symbols).
+//
+// This makes it possible to cleanly transform symbols without resorting to
+// overly-complex macro tricks and the pre-processor.  An example of where this
+// is useful is the sanitizers where we would like to intercept a well-defined
+// set of functions across the module.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_SYMBOL_REWRITER_H
+#define LLVM_TRANSFORMS_UTILS_SYMBOL_REWRITER_H
+
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/IR/Module.h"
+
+namespace llvm {
+class MemoryBuffer;
+
+namespace yaml {
+class KeyValueNode;
+class MappingNode;
+class ScalarNode;
+class Stream;
+}
+
+namespace SymbolRewriter {
+/// The basic entity representing a rewrite operation.  It serves as the base
+/// class for any rewrite descriptor.  It has a certain set of specializations
+/// which describe a particular rewrite.
+///
+/// The RewriteMapParser can be used to parse a mapping file that provides the
+/// mapping for rewriting the symbols.  The descriptors individually describe
+/// whether to rewrite a function, global variable, or global alias.  Each of
+/// these can be selected either by explicitly providing a name for the ones to
+/// be rewritten or providing a (POSIX-compatible) regular expression that will
+/// select the symbols to rewrite.  This descriptor list is passed to the
+/// SymbolRewriter pass.
+class RewriteDescriptor : public ilist_node<RewriteDescriptor> {
+  RewriteDescriptor(const RewriteDescriptor &) LLVM_DELETED_FUNCTION;
+
+  const RewriteDescriptor &
+  operator=(const RewriteDescriptor &) LLVM_DELETED_FUNCTION;
+
+public:
+  enum class Type {
+    Invalid,        /// invalid
+    Function,       /// function - descriptor rewrites a function
+    GlobalVariable, /// global variable - descriptor rewrites a global variable
+    NamedAlias,     /// named alias - descriptor rewrites a global alias
+  };
+
+  virtual ~RewriteDescriptor() {}
+
+  Type getType() const { return Kind; }
+
+  virtual bool performOnModule(Module &M) = 0;
+
+protected:
+  explicit RewriteDescriptor(Type T) : Kind(T) {}
+
+private:
+  const Type Kind;
+};
+
+typedef iplist<RewriteDescriptor> RewriteDescriptorList;
+
+class RewriteMapParser {
+public:
+  RewriteMapParser() {}
+  ~RewriteMapParser() {}
+
+  bool parse(const std::string &MapFile, RewriteDescriptorList *Descriptors);
+
+private:
+  bool parse(std::unique_ptr<MemoryBuffer> &MapFile, RewriteDescriptorList *DL);
+  bool parseEntry(yaml::Stream &Stream, yaml::KeyValueNode &Entry,
+                  RewriteDescriptorList *DL);
+  bool parseRewriteFunctionDescriptor(yaml::Stream &Stream,
+                                      yaml::ScalarNode *Key,
+                                      yaml::MappingNode *Value,
+                                      RewriteDescriptorList *DL);
+  bool parseRewriteGlobalVariableDescriptor(yaml::Stream &Stream,
+                                            yaml::ScalarNode *Key,
+                                            yaml::MappingNode *Value,
+                                            RewriteDescriptorList *DL);
+  bool parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+                                         yaml::MappingNode *V,
+                                         RewriteDescriptorList *DL);
+};
+}
+
+template <>
+struct ilist_traits<SymbolRewriter::RewriteDescriptor>
+    : public ilist_default_traits<SymbolRewriter::RewriteDescriptor> {
+  mutable ilist_half_node<SymbolRewriter::RewriteDescriptor> Sentinel;
+
+public:
+  // createSentinel is used to get a reference to a node marking the end of
+  // the list.  Because the sentinel is relative to this instance, use a
+  // non-static method.
+  SymbolRewriter::RewriteDescriptor *createSentinel() const {
+    // since i[p] lists always publicly derive from the corresponding
+    // traits, placing a data member in this class will augment the
+    // i[p]list.  Since the NodeTy is expected to publicly derive from
+    // ilist_node<NodeTy>, there is a legal viable downcast from it to
+    // NodeTy.  We use this trick to superpose i[p]list with a "ghostly"
+    // NodeTy, which becomes the sentinel.  Dereferencing the sentinel is
+    // forbidden (save the ilist_node<NodeTy>) so no one will ever notice
+    // the superposition.
+    return static_cast<SymbolRewriter::RewriteDescriptor *>(&Sentinel);
+  }
+  void destroySentinel(SymbolRewriter::RewriteDescriptor *) {}
+
+  SymbolRewriter::RewriteDescriptor *provideInitialHead() const {
+    return createSentinel();
+  }
+
+  SymbolRewriter::RewriteDescriptor *
+  ensureHead(SymbolRewriter::RewriteDescriptor *&) const {
+    return createSentinel();
+  }
+
+  static void noteHead(SymbolRewriter::RewriteDescriptor *,
+                       SymbolRewriter::RewriteDescriptor *) {}
+};
+
+ModulePass *createRewriteSymbolsPass();
+ModulePass *createRewriteSymbolsPass(SymbolRewriter::RewriteDescriptorList &);
+}
+
+#endif

diff --git a/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h b/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h
index 7ac2572..550292f 100644
--- a/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h
+++ b/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h

@@ -15,8 +15,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_UNIFYFUNCTIONEXITNODES_H
-#define LLVM_TRANSFORMS_UNIFYFUNCTIONEXITNODES_H
+#ifndef LLVM_TRANSFORMS_UTILS_UNIFYFUNCTIONEXITNODES_H
+#define LLVM_TRANSFORMS_UTILS_UNIFYFUNCTIONEXITNODES_H
 
 #include "llvm/Pass.h"
 

diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h
index aaadd7d..0b88d25 100644
--- a/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/include/llvm/Transforms/Utils/UnrollLoop.h

@@ -18,6 +18,7 @@
 
 namespace llvm {
 
+class AssumptionTracker;
 class Loop;
 class LoopInfo;
 class LPPassManager;
@@ -25,7 +26,7 @@
 
 bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool AllowRuntime,
                 unsigned TripMultiple, LoopInfo *LI, Pass *PP,
-                LPPassManager *LPM);
+                LPPassManager *LPM, AssumptionTracker *AT);
 
 bool UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
                              LPPassManager* LPM);

diff --git a/include/llvm/Transforms/Utils/VectorUtils.h b/include/llvm/Transforms/Utils/VectorUtils.h
index 44a7149..83871fc 100644
--- a/include/llvm/Transforms/Utils/VectorUtils.h
+++ b/include/llvm/Transforms/Utils/VectorUtils.h

@@ -36,6 +36,8 @@
   case Intrinsic::log10:
   case Intrinsic::log2:
   case Intrinsic::fabs:
+  case Intrinsic::minnum:
+  case Intrinsic::maxnum:
   case Intrinsic::copysign:
   case Intrinsic::floor:
   case Intrinsic::ceil:
@@ -99,7 +101,7 @@
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
     Intrinsic::ID ID = II->getIntrinsicID();
     if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
-        ID == Intrinsic::lifetime_end)
+        ID == Intrinsic::lifetime_end || ID == Intrinsic::assume)
       return ID;
     else
       return Intrinsic::not_intrinsic;
@@ -153,6 +155,14 @@
   case LibFunc::fabsf:
   case LibFunc::fabsl:
     return checkUnaryFloatSignature(*CI, Intrinsic::fabs);
+  case LibFunc::fmin:
+  case LibFunc::fminf:
+  case LibFunc::fminl:
+    return checkBinaryFloatSignature(*CI, Intrinsic::minnum);
+  case LibFunc::fmax:
+  case LibFunc::fmaxf:
+  case LibFunc::fmaxl:
+    return checkBinaryFloatSignature(*CI, Intrinsic::maxnum);
   case LibFunc::copysign:
   case LibFunc::copysignf:
   case LibFunc::copysignl:

diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index 1790a72..46f6e40 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap

@@ -112,7 +112,12 @@
   exclude header "MC/MCTargetOptionsCommandFlags.h"
 }
 
-module LLVM_Object { requires cplusplus umbrella "Object" module * { export * } }
+module LLVM_Object {
+  requires cplusplus
+  umbrella "Object"
+  module * { export * }
+}
+
 module LLVM_Option { requires cplusplus umbrella "Option" module * { export * } }
 module LLVM_TableGen { requires cplusplus umbrella "TableGen" module * { export * } }
 
@@ -148,9 +153,6 @@
     exclude header "Support/AIXDataTypesFix.h"
 
     // Exclude this; it's fundamentally non-modular.
-    exclude header "Support/Debug.h"
-
-    // Exclude this; it's fundamentally non-modular.
     exclude header "Support/PluginLoader.h"
 
     // Exclude this; it's a weirdly-factored part of llvm-gcov and conflicts

diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 5cde979..5171a45 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp

@@ -196,17 +196,21 @@
         if (!Arg->getType()->isPointerTy())
           continue;
         ModRefResult ArgMask;
-        Location CS1Loc =
-          getArgLocation(CS1, (unsigned) std::distance(CS1.arg_begin(), I),
-                         ArgMask);
-        if ((getModRefInfo(CS2, CS1Loc) & ArgMask) != NoModRef) {
-          R = Mask;
+        Location CS1Loc = getArgLocation(
+            CS1, (unsigned)std::distance(CS1.arg_begin(), I), ArgMask);
+        // ArgMask indicates what CS1 might do to CS1Loc; if CS1 might Mod
+        // CS1Loc, then we care about either a Mod or a Ref by CS2. If CS1
+        // might Ref, then we care only about a Mod by CS2.
+        ModRefResult ArgR = getModRefInfo(CS2, CS1Loc);
+        if (((ArgMask & Mod) != NoModRef && (ArgR & ModRef) != NoModRef) ||
+            ((ArgMask & Ref) != NoModRef && (ArgR & Mod)    != NoModRef))
+          R = ModRefResult((R | ArgMask) & Mask);
+
+        if (R == Mask)
           break;
-        }
       }
     }
-    if (R == NoModRef)
-      return R;
+    return R;
   }
 
   // If this is the end of the chain, don't forward.
@@ -247,61 +251,73 @@
 //===----------------------------------------------------------------------===//
 
 AliasAnalysis::Location AliasAnalysis::getLocation(const LoadInst *LI) {
+  AAMDNodes AATags;
+  LI->getAAMetadata(AATags);
+
   return Location(LI->getPointerOperand(),
-                  getTypeStoreSize(LI->getType()),
-                  LI->getMetadata(LLVMContext::MD_tbaa));
+                  getTypeStoreSize(LI->getType()), AATags);
 }
 
 AliasAnalysis::Location AliasAnalysis::getLocation(const StoreInst *SI) {
+  AAMDNodes AATags;
+  SI->getAAMetadata(AATags);
+
   return Location(SI->getPointerOperand(),
-                  getTypeStoreSize(SI->getValueOperand()->getType()),
-                  SI->getMetadata(LLVMContext::MD_tbaa));
+                  getTypeStoreSize(SI->getValueOperand()->getType()), AATags);
 }
 
 AliasAnalysis::Location AliasAnalysis::getLocation(const VAArgInst *VI) {
-  return Location(VI->getPointerOperand(),
-                  UnknownSize,
-                  VI->getMetadata(LLVMContext::MD_tbaa));
+  AAMDNodes AATags;
+  VI->getAAMetadata(AATags);
+
+  return Location(VI->getPointerOperand(), UnknownSize, AATags);
 }
 
 AliasAnalysis::Location
 AliasAnalysis::getLocation(const AtomicCmpXchgInst *CXI) {
+  AAMDNodes AATags;
+  CXI->getAAMetadata(AATags);
+
   return Location(CXI->getPointerOperand(),
                   getTypeStoreSize(CXI->getCompareOperand()->getType()),
-                  CXI->getMetadata(LLVMContext::MD_tbaa));
+                  AATags);
 }
 
 AliasAnalysis::Location
 AliasAnalysis::getLocation(const AtomicRMWInst *RMWI) {
+  AAMDNodes AATags;
+  RMWI->getAAMetadata(AATags);
+
   return Location(RMWI->getPointerOperand(),
-                  getTypeStoreSize(RMWI->getValOperand()->getType()),
-                  RMWI->getMetadata(LLVMContext::MD_tbaa));
+                  getTypeStoreSize(RMWI->getValOperand()->getType()), AATags);
 }
 
-AliasAnalysis::Location 
+AliasAnalysis::Location
 AliasAnalysis::getLocationForSource(const MemTransferInst *MTI) {
   uint64_t Size = UnknownSize;
   if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength()))
     Size = C->getValue().getZExtValue();
 
-  // memcpy/memmove can have TBAA tags. For memcpy, they apply
+  // memcpy/memmove can have AA tags. For memcpy, they apply
   // to both the source and the destination.
-  MDNode *TBAATag = MTI->getMetadata(LLVMContext::MD_tbaa);
+  AAMDNodes AATags;
+  MTI->getAAMetadata(AATags);
 
-  return Location(MTI->getRawSource(), Size, TBAATag);
+  return Location(MTI->getRawSource(), Size, AATags);
 }
 
-AliasAnalysis::Location 
+AliasAnalysis::Location
 AliasAnalysis::getLocationForDest(const MemIntrinsic *MTI) {
   uint64_t Size = UnknownSize;
   if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength()))
     Size = C->getValue().getZExtValue();
 
-  // memcpy/memmove can have TBAA tags. For memcpy, they apply
+  // memcpy/memmove can have AA tags. For memcpy, they apply
   // to both the source and the destination.
-  MDNode *TBAATag = MTI->getMetadata(LLVMContext::MD_tbaa);
-  
-  return Location(MTI->getRawDest(), Size, TBAATag);
+  AAMDNodes AATags;
+  MTI->getAAMetadata(AATags);
+
+  return Location(MTI->getRawDest(), Size, AATags);
 }
 
 
@@ -383,53 +399,6 @@
   return ModRef;
 }
 
-namespace {
-  /// Only find pointer captures which happen before the given instruction. Uses
-  /// the dominator tree to determine whether one instruction is before another.
-  /// Only support the case where the Value is defined in the same basic block
-  /// as the given instruction and the use.
-  struct CapturesBefore : public CaptureTracker {
-    CapturesBefore(const Instruction *I, DominatorTree *DT)
-      : BeforeHere(I), DT(DT), Captured(false) {}
-
-    void tooManyUses() override { Captured = true; }
-
-    bool shouldExplore(const Use *U) override {
-      Instruction *I = cast<Instruction>(U->getUser());
-      BasicBlock *BB = I->getParent();
-      // We explore this usage only if the usage can reach "BeforeHere".
-      // If use is not reachable from entry, there is no need to explore.
-      if (BeforeHere != I && !DT->isReachableFromEntry(BB))
-        return false;
-      // If the value is defined in the same basic block as use and BeforeHere,
-      // there is no need to explore the use if BeforeHere dominates use.
-      // Check whether there is a path from I to BeforeHere.
-      if (BeforeHere != I && DT->dominates(BeforeHere, I) &&
-          !isPotentiallyReachable(I, BeforeHere, DT))
-        return false;
-      return true;
-    }
-
-    bool captured(const Use *U) override {
-      Instruction *I = cast<Instruction>(U->getUser());
-      BasicBlock *BB = I->getParent();
-      // Same logic as in shouldExplore.
-      if (BeforeHere != I && !DT->isReachableFromEntry(BB))
-        return false;
-      if (BeforeHere != I && DT->dominates(BeforeHere, I) &&
-          !isPotentiallyReachable(I, BeforeHere, DT))
-        return false;
-      Captured = true;
-      return true;
-    }
-
-    const Instruction *BeforeHere;
-    DominatorTree *DT;
-
-    bool Captured;
-  };
-}
-
 // FIXME: this is really just shoring-up a deficiency in alias analysis.
 // BasicAA isn't willing to spend linear time determining whether an alloca
 // was captured before or after this particular call, while we are. However,
@@ -449,9 +418,9 @@
   if (!CS.getInstruction() || CS.getInstruction() == Object)
     return AliasAnalysis::ModRef;
 
-  CapturesBefore CB(I, DT);
-  llvm::PointerMayBeCaptured(Object, &CB);
-  if (CB.Captured)
+  if (llvm::PointerMayBeCapturedBefore(Object, /* ReturnCaptures */ true,
+                                       /* StoreCaptures */ true, I, DT,
+                                       /* include Object */ true))
     return AliasAnalysis::ModRef;
 
   unsigned ArgNo = 0;
@@ -470,7 +439,7 @@
     // assume that the call could touch the pointer, even though it doesn't
     // escape.
     if (isNoAlias(AliasAnalysis::Location(*CI),
-		  AliasAnalysis::Location(Object)))
+                  AliasAnalysis::Location(Object)))
       continue;
     if (CS.doesNotAccessMemory(ArgNo))
       continue;
@@ -577,3 +546,13 @@
     return A->hasNoAliasAttr() || A->hasByValAttr();
   return false;
 }
+
+/// isIdentifiedFunctionLocal - Return true if V is unambiguously identified
+/// at the function-level. Different IdentifiedFunctionLocals can't alias.
+/// Further, an IdentifiedFunctionLocal cannot alias with any function
+/// arguments other than itself, which is not necessarily true for
+/// IdentifiedObjects.
+bool llvm::isIdentifiedFunctionLocal(const Value *V)
+{
+  return isa<AllocaInst>(V) || isNoAliasCall(V) || isNoAliasArgument(V);
+}

diff --git a/lib/Analysis/AliasAnalysisEvaluator.cpp b/lib/Analysis/AliasAnalysisEvaluator.cpp
index d9fa5a5..fe4bd4c 100644
--- a/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/lib/Analysis/AliasAnalysisEvaluator.cpp

@@ -43,7 +43,7 @@
 static cl::opt<bool> PrintRef("print-ref", cl::ReallyHidden);
 static cl::opt<bool> PrintModRef("print-modref", cl::ReallyHidden);
 
-static cl::opt<bool> EvalTBAA("evaluate-tbaa", cl::ReallyHidden);
+static cl::opt<bool> EvalAAMD("evaluate-aa-metadata", cl::ReallyHidden);
 
 namespace {
   class AAEval : public FunctionPass {
@@ -153,9 +153,9 @@
   for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
     if (I->getType()->isPointerTy()) // Add all pointer instructions.
       Pointers.insert(&*I);
-    if (EvalTBAA && isa<LoadInst>(&*I))
+    if (EvalAAMD && isa<LoadInst>(&*I))
       Loads.insert(&*I);
-    if (EvalTBAA && isa<StoreInst>(&*I))
+    if (EvalAAMD && isa<StoreInst>(&*I))
       Stores.insert(&*I);
     Instruction &Inst = *I;
     if (CallSite CS = cast<Value>(&Inst)) {
@@ -213,7 +213,7 @@
     }
   }
 
-  if (EvalTBAA) {
+  if (EvalAAMD) {
     // iterate over all pairs of load, store
     for (SetVector<Value *>::iterator I1 = Loads.begin(), E = Loads.end();
          I1 != E; ++I1) {

diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index a45fe23..45442b0 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp

@@ -47,18 +47,21 @@
     // If the pointers are not a must-alias pair, this set becomes a may alias.
     if (AA.alias(AliasAnalysis::Location(L->getValue(),
                                          L->getSize(),
-                                         L->getTBAAInfo()),
+                                         L->getAAInfo()),
                  AliasAnalysis::Location(R->getValue(),
                                          R->getSize(),
-                                         R->getTBAAInfo()))
+                                         R->getAAInfo()))
         != AliasAnalysis::MustAlias)
       AliasTy = MayAlias;
   }
 
+  bool ASHadUnknownInsts = !AS.UnknownInsts.empty();
   if (UnknownInsts.empty()) {            // Merge call sites...
-    if (!AS.UnknownInsts.empty())
+    if (ASHadUnknownInsts) {
       std::swap(UnknownInsts, AS.UnknownInsts);
-  } else if (!AS.UnknownInsts.empty()) {
+      addRef();
+    }
+  } else if (ASHadUnknownInsts) {
     UnknownInsts.insert(UnknownInsts.end(), AS.UnknownInsts.begin(), AS.UnknownInsts.end());
     AS.UnknownInsts.clear();
   }
@@ -76,6 +79,8 @@
     AS.PtrListEnd = &AS.PtrList;
     assert(*AS.PtrListEnd == nullptr && "End of list is not null?");
   }
+  if (ASHadUnknownInsts)
+    AS.dropRef(AST);
 }
 
 void AliasSetTracker::removeAliasSet(AliasSet *AS) {
@@ -92,7 +97,7 @@
 }
 
 void AliasSet::addPointer(AliasSetTracker &AST, PointerRec &Entry,
-                          uint64_t Size, const MDNode *TBAAInfo,
+                          uint64_t Size, const AAMDNodes &AAInfo,
                           bool KnownMustAlias) {
   assert(!Entry.hasAliasSet() && "Entry already in set!");
 
@@ -102,17 +107,17 @@
       AliasAnalysis &AA = AST.getAliasAnalysis();
       AliasAnalysis::AliasResult Result =
         AA.alias(AliasAnalysis::Location(P->getValue(), P->getSize(),
-                                         P->getTBAAInfo()),
-                 AliasAnalysis::Location(Entry.getValue(), Size, TBAAInfo));
+                                         P->getAAInfo()),
+                 AliasAnalysis::Location(Entry.getValue(), Size, AAInfo));
       if (Result != AliasAnalysis::MustAlias)
         AliasTy = MayAlias;
       else                  // First entry of must alias must have maximum size!
-        P->updateSizeAndTBAAInfo(Size, TBAAInfo);
+        P->updateSizeAndAAInfo(Size, AAInfo);
       assert(Result != AliasAnalysis::NoAlias && "Cannot be part of must set!");
     }
 
   Entry.setAliasSet(this);
-  Entry.updateSizeAndTBAAInfo(Size, TBAAInfo);
+  Entry.updateSizeAndAAInfo(Size, AAInfo);
 
   // Add it to the end of the list...
   assert(*PtrListEnd == nullptr && "End of list is not null?");
@@ -123,6 +128,8 @@
 }
 
 void AliasSet::addUnknownInst(Instruction *I, AliasAnalysis &AA) {
+  if (UnknownInsts.empty())
+    addRef();
   UnknownInsts.push_back(I);
 
   if (!I->mayWriteToMemory()) {
@@ -140,7 +147,7 @@
 /// alias one of the members in the set.
 ///
 bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size,
-                              const MDNode *TBAAInfo,
+                              const AAMDNodes &AAInfo,
                               AliasAnalysis &AA) const {
   if (AliasTy == MustAlias) {
     assert(UnknownInsts.empty() && "Illegal must alias set!");
@@ -151,23 +158,23 @@
     assert(SomePtr && "Empty must-alias set??");
     return AA.alias(AliasAnalysis::Location(SomePtr->getValue(),
                                             SomePtr->getSize(),
-                                            SomePtr->getTBAAInfo()),
-                    AliasAnalysis::Location(Ptr, Size, TBAAInfo));
+                                            SomePtr->getAAInfo()),
+                    AliasAnalysis::Location(Ptr, Size, AAInfo));
   }
 
   // If this is a may-alias set, we have to check all of the pointers in the set
   // to be sure it doesn't alias the set...
   for (iterator I = begin(), E = end(); I != E; ++I)
-    if (AA.alias(AliasAnalysis::Location(Ptr, Size, TBAAInfo),
+    if (AA.alias(AliasAnalysis::Location(Ptr, Size, AAInfo),
                  AliasAnalysis::Location(I.getPointer(), I.getSize(),
-                                         I.getTBAAInfo())))
+                                         I.getAAInfo())))
       return true;
 
   // Check the unknown instructions...
   if (!UnknownInsts.empty()) {
     for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i)
       if (AA.getModRefInfo(UnknownInsts[i],
-                           AliasAnalysis::Location(Ptr, Size, TBAAInfo)) !=
+                           AliasAnalysis::Location(Ptr, Size, AAInfo)) !=
             AliasAnalysis::NoModRef)
         return true;
   }
@@ -190,7 +197,7 @@
   for (iterator I = begin(), E = end(); I != E; ++I)
     if (AA.getModRefInfo(Inst, AliasAnalysis::Location(I.getPointer(),
                                                        I.getSize(),
-                                                       I.getTBAAInfo())) !=
+                                                       I.getAAInfo())) !=
            AliasAnalysis::NoModRef)
       return true;
 
@@ -216,15 +223,16 @@
 ///
 AliasSet *AliasSetTracker::findAliasSetForPointer(const Value *Ptr,
                                                   uint64_t Size,
-                                                  const MDNode *TBAAInfo) {
+                                                  const AAMDNodes &AAInfo) {
   AliasSet *FoundSet = nullptr;
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    if (I->Forward || !I->aliasesPointer(Ptr, Size, TBAAInfo, AA)) continue;
+  for (iterator I = begin(), E = end(); I != E;) {
+    iterator Cur = I++;
+    if (Cur->Forward || !Cur->aliasesPointer(Ptr, Size, AAInfo, AA)) continue;
     
     if (!FoundSet) {      // If this is the first alias set ptr can go into.
-      FoundSet = I;       // Remember it.
+      FoundSet = Cur;     // Remember it.
     } else {              // Otherwise, we must merge the sets.
-      FoundSet->mergeSetIn(*I, *this);     // Merge in contents.
+      FoundSet->mergeSetIn(*Cur, *this);     // Merge in contents.
     }
   }
 
@@ -235,25 +243,30 @@
 /// this alias set, false otherwise.  This does not modify the AST object or
 /// alias sets.
 bool AliasSetTracker::containsPointer(Value *Ptr, uint64_t Size,
-                                      const MDNode *TBAAInfo) const {
+                                      const AAMDNodes &AAInfo) const {
   for (const_iterator I = begin(), E = end(); I != E; ++I)
-    if (!I->Forward && I->aliasesPointer(Ptr, Size, TBAAInfo, AA))
+    if (!I->Forward && I->aliasesPointer(Ptr, Size, AAInfo, AA))
       return true;
   return false;
 }
 
-
+/// Return true if aliasesUnknownInst(Inst) holds for any non-forwarding set.
+bool AliasSetTracker::containsUnknown(Instruction *Inst) const {
+  for (const AliasSet &AS : *this)
+    if (!AS.Forward && AS.aliasesUnknownInst(Inst, AA)) return true;
+  return false;
+}
 
 AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) {
   AliasSet *FoundSet = nullptr;
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    if (I->Forward || !I->aliasesUnknownInst(Inst, AA))
+  for (iterator I = begin(), E = end(); I != E;) {
+    iterator Cur = I++;
+    if (Cur->Forward || !Cur->aliasesUnknownInst(Inst, AA))
       continue;
-    
     if (!FoundSet)            // If this is the first alias set ptr can go into.
-      FoundSet = I;           // Remember it.
-    else if (!I->Forward)     // Otherwise, we must merge the sets.
-      FoundSet->mergeSetIn(*I, *this);     // Merge in contents.
+      FoundSet = Cur;         // Remember it.
+    else if (!Cur->Forward)   // Otherwise, we must merge the sets.
+      FoundSet->mergeSetIn(*Cur, *this);     // Merge in contents.
   }
   return FoundSet;
 }
@@ -264,67 +277,75 @@
 /// getAliasSetForPointer - Return the alias set that the specified pointer
 /// lives in.
 AliasSet &AliasSetTracker::getAliasSetForPointer(Value *Pointer, uint64_t Size,
-                                                 const MDNode *TBAAInfo,
+                                                 const AAMDNodes &AAInfo,
                                                  bool *New) {
   AliasSet::PointerRec &Entry = getEntryFor(Pointer);
 
   // Check to see if the pointer is already known.
   if (Entry.hasAliasSet()) {
-    Entry.updateSizeAndTBAAInfo(Size, TBAAInfo);
+    Entry.updateSizeAndAAInfo(Size, AAInfo);
     // Return the set!
     return *Entry.getAliasSet(*this)->getForwardedTarget(*this);
   }
   
-  if (AliasSet *AS = findAliasSetForPointer(Pointer, Size, TBAAInfo)) {
+  if (AliasSet *AS = findAliasSetForPointer(Pointer, Size, AAInfo)) {
     // Add it to the alias set it aliases.
-    AS->addPointer(*this, Entry, Size, TBAAInfo);
+    AS->addPointer(*this, Entry, Size, AAInfo);
     return *AS;
   }
   
   if (New) *New = true;
   // Otherwise create a new alias set to hold the loaded pointer.
   AliasSets.push_back(new AliasSet());
-  AliasSets.back().addPointer(*this, Entry, Size, TBAAInfo);
+  AliasSets.back().addPointer(*this, Entry, Size, AAInfo);
   return AliasSets.back();
 }
 
-bool AliasSetTracker::add(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo) {
+bool AliasSetTracker::add(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo) {
   bool NewPtr;
-  addPointer(Ptr, Size, TBAAInfo, AliasSet::NoModRef, NewPtr);
+  addPointer(Ptr, Size, AAInfo, AliasSet::NoModRef, NewPtr);
   return NewPtr;
 }
 
 
 bool AliasSetTracker::add(LoadInst *LI) {
   if (LI->getOrdering() > Monotonic) return addUnknown(LI);
+
+  AAMDNodes AAInfo;
+  LI->getAAMetadata(AAInfo);
+
   AliasSet::AccessType ATy = AliasSet::Refs;
   bool NewPtr;
   AliasSet &AS = addPointer(LI->getOperand(0),
                             AA.getTypeStoreSize(LI->getType()),
-                            LI->getMetadata(LLVMContext::MD_tbaa),
-                            ATy, NewPtr);
+                            AAInfo, ATy, NewPtr);
   if (LI->isVolatile()) AS.setVolatile();
   return NewPtr;
 }
 
 bool AliasSetTracker::add(StoreInst *SI) {
   if (SI->getOrdering() > Monotonic) return addUnknown(SI);
+
+  AAMDNodes AAInfo;
+  SI->getAAMetadata(AAInfo);
+
   AliasSet::AccessType ATy = AliasSet::Mods;
   bool NewPtr;
   Value *Val = SI->getOperand(0);
   AliasSet &AS = addPointer(SI->getOperand(1),
                             AA.getTypeStoreSize(Val->getType()),
-                            SI->getMetadata(LLVMContext::MD_tbaa),
-                            ATy, NewPtr);
+                            AAInfo, ATy, NewPtr);
   if (SI->isVolatile()) AS.setVolatile();
   return NewPtr;
 }
 
 bool AliasSetTracker::add(VAArgInst *VAAI) {
+  AAMDNodes AAInfo;
+  VAAI->getAAMetadata(AAInfo);
+
   bool NewPtr;
   addPointer(VAAI->getOperand(0), AliasAnalysis::UnknownSize, 
-             VAAI->getMetadata(LLVMContext::MD_tbaa),
-             AliasSet::ModRef, NewPtr);
+             AAInfo, AliasSet::ModRef, NewPtr);
   return NewPtr;
 }
 
@@ -382,7 +403,7 @@
     bool X;
     for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
       AliasSet &NewAS = addPointer(ASI.getPointer(), ASI.getSize(),
-                                   ASI.getTBAAInfo(),
+                                   ASI.getAAInfo(),
                                    (AliasSet::AccessType)AS.AccessTy, X);
       if (AS.isVolatile()) NewAS.setVolatile();
     }
@@ -393,6 +414,8 @@
 /// tracker.
 void AliasSetTracker::remove(AliasSet &AS) {
   // Drop all call sites.
+  if (!AS.UnknownInsts.empty())
+    AS.dropRef(*this);
   AS.UnknownInsts.clear();
   
   // Clear the alias set.
@@ -419,8 +442,8 @@
 }
 
 bool
-AliasSetTracker::remove(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo) {
-  AliasSet *AS = findAliasSetForPointer(Ptr, Size, TBAAInfo);
+AliasSetTracker::remove(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo) {
+  AliasSet *AS = findAliasSetForPointer(Ptr, Size, AAInfo);
   if (!AS) return false;
   remove(*AS);
   return true;
@@ -428,8 +451,11 @@
 
 bool AliasSetTracker::remove(LoadInst *LI) {
   uint64_t Size = AA.getTypeStoreSize(LI->getType());
-  const MDNode *TBAAInfo = LI->getMetadata(LLVMContext::MD_tbaa);
-  AliasSet *AS = findAliasSetForPointer(LI->getOperand(0), Size, TBAAInfo);
+
+  AAMDNodes AAInfo;
+  LI->getAAMetadata(AAInfo);
+
+  AliasSet *AS = findAliasSetForPointer(LI->getOperand(0), Size, AAInfo);
   if (!AS) return false;
   remove(*AS);
   return true;
@@ -437,17 +463,22 @@
 
 bool AliasSetTracker::remove(StoreInst *SI) {
   uint64_t Size = AA.getTypeStoreSize(SI->getOperand(0)->getType());
-  const MDNode *TBAAInfo = SI->getMetadata(LLVMContext::MD_tbaa);
-  AliasSet *AS = findAliasSetForPointer(SI->getOperand(1), Size, TBAAInfo);
+
+  AAMDNodes AAInfo;
+  SI->getAAMetadata(AAInfo);
+
+  AliasSet *AS = findAliasSetForPointer(SI->getOperand(1), Size, AAInfo);
   if (!AS) return false;
   remove(*AS);
   return true;
 }
 
 bool AliasSetTracker::remove(VAArgInst *VAAI) {
+  AAMDNodes AAInfo;
+  VAAI->getAAMetadata(AAInfo);
+
   AliasSet *AS = findAliasSetForPointer(VAAI->getOperand(0),
-                                        AliasAnalysis::UnknownSize,
-                                        VAAI->getMetadata(LLVMContext::MD_tbaa));
+                                        AliasAnalysis::UnknownSize, AAInfo);
   if (!AS) return false;
   remove(*AS);
   return true;
@@ -489,10 +520,10 @@
   if (Instruction *Inst = dyn_cast<Instruction>(PtrVal)) {
     if (Inst->mayReadOrWriteMemory()) {
       // Scan all the alias sets to see if this call site is contained.
-      for (iterator I = begin(), E = end(); I != E; ++I) {
-        if (I->Forward) continue;
-        
-        I->removeUnknownInst(Inst);
+      for (iterator I = begin(), E = end(); I != E;) {
+        iterator Cur = I++;
+        if (!Cur->Forward)
+          Cur->removeUnknownInst(*this, Inst);
       }
     }
   }
@@ -536,7 +567,7 @@
   I = PointerMap.find_as(From);
   AliasSet *AS = I->second->getAliasSet(*this);
   AS->addPointer(*this, Entry, I->second->getSize(),
-                 I->second->getTBAAInfo(),
+                 I->second->getAAInfo(),
                  true);
 }
 

diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index ade940a..f64bf0e 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp

@@ -34,6 +34,7 @@
   initializeCFGPrinterPass(Registry);
   initializeCFGOnlyViewerPass(Registry);
   initializeCFGOnlyPrinterPass(Registry);
+  initializeCFLAliasAnalysisPass(Registry);
   initializeDependenceAnalysisPass(Registry);
   initializeDelinearizationPass(Registry);
   initializeDominanceFrontierPass(Registry);
@@ -57,7 +58,7 @@
   initializeMemoryDependenceAnalysisPass(Registry);
   initializeModuleDebugInfoPrinterPass(Registry);
   initializePostDominatorTreePass(Registry);
-  initializeRegionInfoPass(Registry);
+  initializeRegionInfoPassPass(Registry);
   initializeRegionViewerPass(Registry);
   initializeRegionPrinterPass(Registry);
   initializeRegionOnlyViewerPass(Registry);
@@ -66,6 +67,7 @@
   initializeScalarEvolutionAliasAnalysisPass(Registry);
   initializeTargetTransformInfoAnalysisGroup(Registry);
   initializeTypeBasedAliasAnalysisPass(Registry);
+  initializeScopedNoAliasAAPass(Registry);
 }
 
 void LLVMInitializeAnalysis(LLVMPassRegistryRef R) {

diff --git a/lib/Analysis/Android.mk b/lib/Analysis/Android.mk
index 4e435a1..8770fa7 100644
--- a/lib/Analysis/Android.mk
+++ b/lib/Analysis/Android.mk

@@ -7,12 +7,15 @@
   AliasDebugger.cpp \
   AliasSetTracker.cpp \
   Analysis.cpp \
+  AssumptionTracker.cpp \
   BasicAliasAnalysis.cpp \
   BlockFrequencyInfo.cpp \
   BlockFrequencyInfoImpl.cpp \
   BranchProbabilityInfo.cpp \
   CFG.cpp \
   CFGPrinter.cpp \
+  CFLAliasAnalysis.cpp \
+  CGSCCPassManager.cpp \
   CaptureTracking.cpp \
   CodeMetrics.cpp \
   ConstantFolding.cpp \
@@ -21,7 +24,7 @@
   DependenceAnalysis.cpp \
   DomPrinter.cpp \
   DominanceFrontier.cpp \
-  CGSCCPassManager.cpp \
+  FunctionTargetTransformInfo.cpp \
   IVUsers.cpp \
   InstCount.cpp \
   InstructionSimplify.cpp \
@@ -51,6 +54,7 @@
   ScalarEvolutionAliasAnalysis.cpp \
   ScalarEvolutionExpander.cpp \
   ScalarEvolutionNormalization.cpp \
+  ScopedNoAliasAA.cpp \
   SparsePropagation.cpp \
   TargetTransformInfo.cpp \
   Trace.cpp \

diff --git a/lib/Analysis/AssumptionTracker.cpp b/lib/Analysis/AssumptionTracker.cpp
new file mode 100644
index 0000000..775ce1d
--- /dev/null
+++ b/lib/Analysis/AssumptionTracker.cpp

@@ -0,0 +1,110 @@
+//===- AssumptionTracker.cpp - Track @llvm.assume -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that keeps track of @llvm.assume intrinsics in
+// the functions of a module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+void AssumptionTracker::FunctionCallbackVH::deleted() {
+  Function *Dead = cast<Function>(getValPtr());
+  AT->forgetCachedAssumptions(Dead); // 'this' now dangles!
+}
+
+void AssumptionTracker::forgetCachedAssumptions(Function *F) {
+  auto Entry = CachedAssumeCalls.find_as(F);
+  if (Entry == CachedAssumeCalls.end()) return; // Nothing cached for F.
+  CachedAssumeCalls.erase(Entry);
+}
+
+void AssumptionTracker::CallCallbackVH::deleted() {
+  assert(F && "delete callback called on dummy handle");
+  // Look up the handle set cached under this handle's function F.
+  auto I = AT->CachedAssumeCalls.find_as(F);
+  assert(I != AT->CachedAssumeCalls.end() &&
+         "Function cleared from the map without removing the values?");
+  auto &Calls = *I->second;
+  Calls.erase(*this); // 'this' now dangles!
+}
+
+AssumptionTracker::FunctionCallsMap::iterator
+AssumptionTracker::scanFunction(Function *F) {
+  // Create an empty handle set for F; F must not already be in the map.
+  auto IP = CachedAssumeCalls.insert(std::make_pair(
+      FunctionCallbackVH(F, this), llvm::make_unique<CallHandleSet>()));
+  assert(IP.second && "Scanning function already in the map?");
+  FunctionCallsMap::iterator Slot = IP.first;
+
+  // Walk every instruction of every block and cache each @llvm.assume call.
+  for (BasicBlock &BB : *F)
+    for (Instruction &Inst : BB) {
+      if (!match(&Inst, m_Intrinsic<Intrinsic::assume>()))
+        continue;
+      Slot->second->insert(CallCallbackVH(&Inst, this));
+    }
+  return Slot;
+}
+
+void AssumptionTracker::verifyAnalysis() const {
+#ifndef NDEBUG
+  // Every @llvm.assume in a cached function must appear in its handle set.
+  for (const auto &I : CachedAssumeCalls)
+    for (const BasicBlock &BB : cast<Function>(*I.first))
+      for (const Instruction &II : BB) {
+        if (!match(&II, m_Intrinsic<Intrinsic::assume>()))
+          continue;
+        assert(I.second->find_as(&II) != I.second->end() &&
+               "Assumption in scanned function not in cache");
+      }
+#endif
+}
+
+/// Add the @llvm.assume call \p CI to the cached handle set of its enclosing
+/// function, but only when that function already has a cache entry.
+void AssumptionTracker::registerAssumption(CallInst *CI) {
+  assert(match(CI, m_Intrinsic<Intrinsic::assume>()) &&
+         "Registered call does not call @llvm.assume");
+  assert(CI->getParent() &&
+         "Cannot register @llvm.assume call not in a basic block");
+
+  Function *F = CI->getParent()->getParent();
+  assert(F && "Cannot register @llvm.assume call not in a function");
+
+  // Only a function that has already been scanned has a cache entry to
+  // update. An unscanned function is deliberately left alone: this call will
+  // be found, if it still exists, when the list of assumptions in that
+  // function is requested at some later point. This maintains the following
+  // invariant: if a function is present in the cache, then its list of
+  // assumption intrinsic calls is complete.
+  auto Cached = CachedAssumeCalls.find_as(F);
+  if (Cached != CachedAssumeCalls.end())
+    Cached->second->insert(CallCallbackVH(CI, this));
+}
+
+AssumptionTracker::AssumptionTracker() : ImmutablePass(ID) {
+  initializeAssumptionTrackerPass(*PassRegistry::getPassRegistry());
+}
+
+AssumptionTracker::~AssumptionTracker() {} // Empty body: no explicit cleanup.
+
+INITIALIZE_PASS(AssumptionTracker, "assumption-tracker", "Assumption Tracker",
+                false, true)
+char AssumptionTracker::ID = 0; // Passed to the ImmutablePass constructor.
+

diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index c50dd4a..9aba0d3 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp

@@ -17,6 +17,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -156,17 +157,6 @@
   return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize == Size;
 }
 
-/// isIdentifiedFunctionLocal - Return true if V is umabigously identified
-/// at the function-level. Different IdentifiedFunctionLocals can't alias.
-/// Further, an IdentifiedFunctionLocal can not alias with any function
-/// arguments other than itself, which is not necessarily true for
-/// IdentifiedObjects.
-static bool isIdentifiedFunctionLocal(const Value *V)
-{
-  return isa<AllocaInst>(V) || isNoAliasCall(V) || isNoAliasArgument(V);
-}
-
-
 //===----------------------------------------------------------------------===//
 // GetElementPtr Instruction Decomposition and Analysis
 //===----------------------------------------------------------------------===//
@@ -205,7 +195,9 @@
 /// represented in the result.
 static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
                                   ExtensionKind &Extension,
-                                  const DataLayout &DL, unsigned Depth) {
+                                  const DataLayout &DL, unsigned Depth,
+                                  AssumptionTracker *AT,
+                                  DominatorTree *DT) {
   assert(V->getType()->isIntegerTy() && "Not an integer value");
 
   // Limit our recursion depth.
@@ -215,6 +207,14 @@
     return V;
   }
 
+  if (ConstantInt *Const = dyn_cast<ConstantInt>(V)) {
+    // if it's a constant, just convert it to an offset
+    // and remove the variable.
+    Offset += Const->getValue();
+    assert(Scale == 0 && "Constant values don't have a scale");
+    return V;
+  }
+
   if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
     if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
       switch (BOp->getOpcode()) {
@@ -222,23 +222,24 @@
       case Instruction::Or:
         // X|C == X+C if all the bits in C are unset in X.  Otherwise we can't
         // analyze it.
-        if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), &DL))
+        if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), &DL, 0,
+                               AT, BOp, DT))
           break;
         // FALL THROUGH.
       case Instruction::Add:
         V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension,
-                                DL, Depth+1);
+                                DL, Depth+1, AT, DT);
         Offset += RHSC->getValue();
         return V;
       case Instruction::Mul:
         V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension,
-                                DL, Depth+1);
+                                DL, Depth+1, AT, DT);
         Offset *= RHSC->getValue();
         Scale *= RHSC->getValue();
         return V;
       case Instruction::Shl:
         V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension,
-                                DL, Depth+1);
+                                DL, Depth+1, AT, DT);
         Offset <<= RHSC->getValue().getLimitedValue();
         Scale <<= RHSC->getValue().getLimitedValue();
         return V;
@@ -259,9 +260,12 @@
     Extension = isa<SExtInst>(V) ? EK_SignExt : EK_ZeroExt;
 
     Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension,
-                                        DL, Depth+1);
+                                        DL, Depth+1, AT, DT);
     Scale = Scale.zext(OldWidth);
-    Offset = Offset.zext(OldWidth);
+
+    // We have to sign-extend even if Extension == EK_ZeroExt as we can't
+    // decompose a sign extension (i.e. zext(x - 1) != zext(x) - zext(-1)).
+    Offset = Offset.sext(OldWidth);
 
     return Result;
   }
@@ -289,7 +293,8 @@
 static const Value *
 DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
                        SmallVectorImpl<VariableGEPIndex> &VarIndices,
-                       bool &MaxLookupReached, const DataLayout *DL) {
+                       bool &MaxLookupReached, const DataLayout *DL,
+                       AssumptionTracker *AT, DominatorTree *DT) {
   // Limit recursion depth to limit compile time in crazy cases.
   unsigned MaxLookup = MaxLookupSearchDepth;
   MaxLookupReached = false;
@@ -309,7 +314,8 @@
       return V;
     }
 
-    if (Op->getOpcode() == Instruction::BitCast) {
+    if (Op->getOpcode() == Instruction::BitCast ||
+        Op->getOpcode() == Instruction::AddrSpaceCast) {
       V = Op->getOperand(0);
       continue;
     }
@@ -319,7 +325,10 @@
       // If it's not a GEP, hand it off to SimplifyInstruction to see if it
       // can come up with something. This matches what GetUnderlyingObject does.
       if (const Instruction *I = dyn_cast<Instruction>(V))
-        // TODO: Get a DominatorTree and use it here.
+        // TODO: Get a DominatorTree and AssumptionTracker and use them here
+        // (these are both now available in this function, but this should be
+        // updated when GetUnderlyingObject is updated). TLI should be
+        // provided also.
         if (const Value *Simplified =
               SimplifyInstruction(const_cast<Instruction *>(I), DL)) {
           V = Simplified;
@@ -378,7 +387,7 @@
       // Use GetLinearExpression to decompose the index into a C1*V+C2 form.
       APInt IndexScale(Width, 0), IndexOffset(Width, 0);
       Index = GetLinearExpression(Index, IndexScale, IndexOffset, Extension,
-                                  *DL, 0);
+                                  *DL, 0, AT, DT);
 
       // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
       // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
@@ -459,6 +468,7 @@
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AliasAnalysis>();
+      AU.addRequired<AssumptionTracker>();
       AU.addRequired<TargetLibraryInfo>();
     }
 
@@ -466,8 +476,8 @@
       assert(AliasCache.empty() && "AliasCache must be cleared after use!");
       assert(notDifferentParent(LocA.Ptr, LocB.Ptr) &&
              "BasicAliasAnalysis doesn't support interprocedural queries.");
-      AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.TBAATag,
-                                     LocB.Ptr, LocB.Size, LocB.TBAATag);
+      AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.AATags,
+                                     LocB.Ptr, LocB.Size, LocB.AATags);
       // AliasCache rarely has more than 1 or 2 elements, always use
       // shrink_and_clear so it quickly returns to the inline capacity of the
       // SmallDenseMap if it ever grows larger.
@@ -481,10 +491,7 @@
                                const Location &Loc) override;
 
     ModRefResult getModRefInfo(ImmutableCallSite CS1,
-                               ImmutableCallSite CS2) override {
-      // The AliasAnalysis base class has some smarts, lets use them.
-      return AliasAnalysis::getModRefInfo(CS1, CS2);
-    }
+                               ImmutableCallSite CS2) override;
 
     /// pointsToConstantMemory - Chase pointers until we find a (constant
     /// global) or not.
@@ -554,28 +561,28 @@
     // aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP
     // instruction against another.
     AliasResult aliasGEP(const GEPOperator *V1, uint64_t V1Size,
-                         const MDNode *V1TBAAInfo,
+                         const AAMDNodes &V1AAInfo,
                          const Value *V2, uint64_t V2Size,
-                         const MDNode *V2TBAAInfo,
+                         const AAMDNodes &V2AAInfo,
                          const Value *UnderlyingV1, const Value *UnderlyingV2);
 
     // aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI
     // instruction against another.
     AliasResult aliasPHI(const PHINode *PN, uint64_t PNSize,
-                         const MDNode *PNTBAAInfo,
+                         const AAMDNodes &PNAAInfo,
                          const Value *V2, uint64_t V2Size,
-                         const MDNode *V2TBAAInfo);
+                         const AAMDNodes &V2AAInfo);
 
     /// aliasSelect - Disambiguate a Select instruction against another value.
     AliasResult aliasSelect(const SelectInst *SI, uint64_t SISize,
-                            const MDNode *SITBAAInfo,
+                            const AAMDNodes &SIAAInfo,
                             const Value *V2, uint64_t V2Size,
-                            const MDNode *V2TBAAInfo);
+                            const AAMDNodes &V2AAInfo);
 
     AliasResult aliasCheck(const Value *V1, uint64_t V1Size,
-                           const MDNode *V1TBAATag,
+                           AAMDNodes V1AATag,
                            const Value *V2, uint64_t V2Size,
-                           const MDNode *V2TBAATag);
+                           AAMDNodes V2AATag);
   };
 }  // End of anonymous namespace
 
@@ -584,6 +591,7 @@
 INITIALIZE_AG_PASS_BEGIN(BasicAliasAnalysis, AliasAnalysis, "basicaa",
                    "Basic Alias Analysis (stateless AA impl)",
                    false, true, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_AG_PASS_END(BasicAliasAnalysis, AliasAnalysis, "basicaa",
                    "Basic Alias Analysis (stateless AA impl)",
@@ -606,7 +614,7 @@
   Worklist.push_back(Loc.Ptr);
   do {
     const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), DL);
-    if (!Visited.insert(V)) {
+    if (!Visited.insert(V).second) {
       Visited.clear();
       return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
     }
@@ -798,6 +806,14 @@
   return Loc;
 }
 
+/// Return true if the instruction of \p CS is the @llvm.assume intrinsic.
+static bool isAssumeIntrinsic(ImmutableCallSite CS) {
+  if (const auto *Intr = dyn_cast<IntrinsicInst>(CS.getInstruction()))
+    return Intr->getIntrinsicID() == Intrinsic::assume;
+
+  return false;
+}
+
 /// getModRefInfo - Check to see if the specified callsite can clobber the
 /// specified memory object.  Since we only look at local properties of this
 /// function, we really can't say much about this query.  We do, however, use
@@ -850,10 +866,29 @@
       return NoModRef;
   }
 
+  // While the assume intrinsic is marked as arbitrarily writing so that
+  // proper control dependencies will be maintained, it never aliases any
+  // particular memory location.
+  if (isAssumeIntrinsic(CS))
+    return NoModRef;
+
   // The AliasAnalysis base class has some smarts, lets use them.
   return AliasAnalysis::getModRefInfo(CS, Loc);
 }
 
+AliasAnalysis::ModRefResult
+BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
+                                  ImmutableCallSite CS2) {
+  // The assume intrinsic is marked as arbitrarily writing only so that
+  // proper control dependencies are maintained; it never aliases any
+  // particular memory location, so a pair involving one is NoModRef.
+  const bool EitherIsAssume = isAssumeIntrinsic(CS1) || isAssumeIntrinsic(CS2);
+
+  // Otherwise defer to the generic logic in the AliasAnalysis base class.
+  return EitherIsAssume ? NoModRef
+                        : AliasAnalysis::getModRefInfo(CS1, CS2);
+}
+
 /// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction
 /// against another pointer.  We know that V1 is a GEP, but we don't know
 /// anything about V2.  UnderlyingV1 is GetUnderlyingObject(GEP1, DL),
@@ -861,30 +896,35 @@
 ///
 AliasAnalysis::AliasResult
 BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
-                             const MDNode *V1TBAAInfo,
+                             const AAMDNodes &V1AAInfo,
                              const Value *V2, uint64_t V2Size,
-                             const MDNode *V2TBAAInfo,
+                             const AAMDNodes &V2AAInfo,
                              const Value *UnderlyingV1,
                              const Value *UnderlyingV2) {
   int64_t GEP1BaseOffset;
   bool GEP1MaxLookupReached;
   SmallVector<VariableGEPIndex, 4> GEP1VariableIndices;
 
+  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+  DominatorTreeWrapperPass *DTWP =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
   // If we have two gep instructions with must-alias or not-alias'ing base
   // pointers, figure out if the indexes to the GEP tell us anything about the
   // derived pointer.
   if (const GEPOperator *GEP2 = dyn_cast<GEPOperator>(V2)) {
     // Do the base pointers alias?
-    AliasResult BaseAlias = aliasCheck(UnderlyingV1, UnknownSize, nullptr,
-                                       UnderlyingV2, UnknownSize, nullptr);
+    AliasResult BaseAlias = aliasCheck(UnderlyingV1, UnknownSize, AAMDNodes(),
+                                       UnderlyingV2, UnknownSize, AAMDNodes());
 
     // Check for geps of non-aliasing underlying pointers where the offsets are
     // identical.
     if ((BaseAlias == MayAlias) && V1Size == V2Size) {
       // Do the base pointers alias assuming type and size.
       AliasResult PreciseBaseAlias = aliasCheck(UnderlyingV1, V1Size,
-                                                V1TBAAInfo, UnderlyingV2,
-                                                V2Size, V2TBAAInfo);
+                                                V1AAInfo, UnderlyingV2,
+                                                V2Size, V2AAInfo);
       if (PreciseBaseAlias == NoAlias) {
         // See if the computed offset from the common pointer tells us about the
         // relation of the resulting pointer.
@@ -893,10 +933,10 @@
         SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
         const Value *GEP2BasePtr =
           DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices,
-                                 GEP2MaxLookupReached, DL);
+                                 GEP2MaxLookupReached, DL, AT, DT);
         const Value *GEP1BasePtr =
           DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
-                                 GEP1MaxLookupReached, DL);
+                                 GEP1MaxLookupReached, DL, AT, DT);
         // DecomposeGEPExpression and GetUnderlyingObject should return the
         // same result except when DecomposeGEPExpression has no DataLayout.
         if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
@@ -925,14 +965,14 @@
     // about the relation of the resulting pointer.
     const Value *GEP1BasePtr =
       DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
-                             GEP1MaxLookupReached, DL);
+                             GEP1MaxLookupReached, DL, AT, DT);
 
     int64_t GEP2BaseOffset;
     bool GEP2MaxLookupReached;
     SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
     const Value *GEP2BasePtr =
       DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices,
-                             GEP2MaxLookupReached, DL);
+                             GEP2MaxLookupReached, DL, AT, DT);
 
     // DecomposeGEPExpression and GetUnderlyingObject should return the
     // same result except when DecomposeGEPExpression has no DataLayout.
@@ -959,8 +999,8 @@
     if (V1Size == UnknownSize && V2Size == UnknownSize)
       return MayAlias;
 
-    AliasResult R = aliasCheck(UnderlyingV1, UnknownSize, nullptr,
-                               V2, V2Size, V2TBAAInfo);
+    AliasResult R = aliasCheck(UnderlyingV1, UnknownSize, AAMDNodes(),
+                               V2, V2Size, V2AAInfo);
     if (R != MustAlias)
       // If V2 may alias GEP base pointer, conservatively returns MayAlias.
       // If V2 is known not to alias GEP base pointer, then the two values
@@ -971,7 +1011,7 @@
 
     const Value *GEP1BasePtr =
       DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices,
-                             GEP1MaxLookupReached, DL);
+                             GEP1MaxLookupReached, DL, AT, DT);
 
     // DecomposeGEPExpression and GetUnderlyingObject should return the
     // same result except when DecomposeGEPExpression has no DataLayout.
@@ -1022,12 +1062,45 @@
     }
   }
 
-  // Try to distinguish something like &A[i][1] against &A[42][0].
-  // Grab the least significant bit set in any of the scales.
   if (!GEP1VariableIndices.empty()) {
     uint64_t Modulo = 0;
-    for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i)
-      Modulo |= (uint64_t)GEP1VariableIndices[i].Scale;
+    bool AllPositive = true;
+    for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i) {
+
+      // Try to distinguish something like &A[i][1] against &A[42][0].
+      // Grab the least significant bit set in any of the scales. We
+      // don't need std::abs here (even if the scale's negative) as we'll
+      // be ^'ing Modulo with itself later.
+      Modulo |= (uint64_t) GEP1VariableIndices[i].Scale;
+
+      if (AllPositive) {
+        // If the Value could change between cycles, then any reasoning about
+        // the Value this cycle may not hold in the next cycle. We'll just
+        // give up if we can't determine conditions that hold for every cycle:
+        const Value *V = GEP1VariableIndices[i].V;
+
+        bool SignKnownZero, SignKnownOne;
+        ComputeSignBit(
+          const_cast<Value *>(V),
+          SignKnownZero, SignKnownOne,
+          DL, 0, AT, nullptr, DT);
+
+        // Zero-extension widens the variable, and so forces the sign
+        // bit to zero.
+        bool IsZExt = GEP1VariableIndices[i].Extension == EK_ZeroExt;
+        SignKnownZero |= IsZExt;
+        SignKnownOne &= !IsZExt;
+
+        // If the variable begins with a zero then we know it's
+        // positive, regardless of whether the value is signed or
+        // unsigned.
+        int64_t Scale = GEP1VariableIndices[i].Scale;
+        AllPositive =
+          (SignKnownZero && Scale >= 0) ||
+          (SignKnownOne && Scale < 0);
+      }
+    }
+
     Modulo = Modulo ^ (Modulo & (Modulo - 1));
 
     // We can compute the difference between the two addresses
@@ -1037,6 +1110,12 @@
     if (V1Size != UnknownSize && V2Size != UnknownSize &&
         ModOffset >= V2Size && V1Size <= Modulo - ModOffset)
       return NoAlias;
+
+    // If we know all the variables are positive, then GEP1 >= GEP1BasePtr.
+    // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers
+    // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr.
+    if (AllPositive && GEP1BaseOffset > 0 && V2Size <= (uint64_t) GEP1BaseOffset)
+      return NoAlias;
   }
 
   // Statically, we can see that the base objects are the same, but the
@@ -1066,33 +1145,33 @@
 /// instruction against another.
 AliasAnalysis::AliasResult
 BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize,
-                                const MDNode *SITBAAInfo,
+                                const AAMDNodes &SIAAInfo,
                                 const Value *V2, uint64_t V2Size,
-                                const MDNode *V2TBAAInfo) {
+                                const AAMDNodes &V2AAInfo) {
   // If the values are Selects with the same condition, we can do a more precise
   // check: just check for aliases between the values on corresponding arms.
   if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2))
     if (SI->getCondition() == SI2->getCondition()) {
       AliasResult Alias =
-        aliasCheck(SI->getTrueValue(), SISize, SITBAAInfo,
-                   SI2->getTrueValue(), V2Size, V2TBAAInfo);
+        aliasCheck(SI->getTrueValue(), SISize, SIAAInfo,
+                   SI2->getTrueValue(), V2Size, V2AAInfo);
       if (Alias == MayAlias)
         return MayAlias;
       AliasResult ThisAlias =
-        aliasCheck(SI->getFalseValue(), SISize, SITBAAInfo,
-                   SI2->getFalseValue(), V2Size, V2TBAAInfo);
+        aliasCheck(SI->getFalseValue(), SISize, SIAAInfo,
+                   SI2->getFalseValue(), V2Size, V2AAInfo);
       return MergeAliasResults(ThisAlias, Alias);
     }
 
   // If both arms of the Select node NoAlias or MustAlias V2, then returns
   // NoAlias / MustAlias. Otherwise, returns MayAlias.
   AliasResult Alias =
-    aliasCheck(V2, V2Size, V2TBAAInfo, SI->getTrueValue(), SISize, SITBAAInfo);
+    aliasCheck(V2, V2Size, V2AAInfo, SI->getTrueValue(), SISize, SIAAInfo);
   if (Alias == MayAlias)
     return MayAlias;
 
   AliasResult ThisAlias =
-    aliasCheck(V2, V2Size, V2TBAAInfo, SI->getFalseValue(), SISize, SITBAAInfo);
+    aliasCheck(V2, V2Size, V2AAInfo, SI->getFalseValue(), SISize, SIAAInfo);
   return MergeAliasResults(ThisAlias, Alias);
 }
 
@@ -1100,9 +1179,9 @@
 // against another.
 AliasAnalysis::AliasResult
 BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
-                             const MDNode *PNTBAAInfo,
+                             const AAMDNodes &PNAAInfo,
                              const Value *V2, uint64_t V2Size,
-                             const MDNode *V2TBAAInfo) {
+                             const AAMDNodes &V2AAInfo) {
   // Track phi nodes we have visited. We use this information when we determine
   // value equivalence.
   VisitedPhiBBs.insert(PN->getParent());
@@ -1112,8 +1191,8 @@
   // on corresponding edges.
   if (const PHINode *PN2 = dyn_cast<PHINode>(V2))
     if (PN2->getParent() == PN->getParent()) {
-      LocPair Locs(Location(PN, PNSize, PNTBAAInfo),
-                   Location(V2, V2Size, V2TBAAInfo));
+      LocPair Locs(Location(PN, PNSize, PNAAInfo),
+                   Location(V2, V2Size, V2AAInfo));
       if (PN > V2)
         std::swap(Locs.first, Locs.second);
       // Analyse the PHIs' inputs under the assumption that the PHIs are
@@ -1131,9 +1210,9 @@
 
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
         AliasResult ThisAlias =
-          aliasCheck(PN->getIncomingValue(i), PNSize, PNTBAAInfo,
+          aliasCheck(PN->getIncomingValue(i), PNSize, PNAAInfo,
                      PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)),
-                     V2Size, V2TBAAInfo);
+                     V2Size, V2AAInfo);
         Alias = MergeAliasResults(ThisAlias, Alias);
         if (Alias == MayAlias)
           break;
@@ -1156,12 +1235,12 @@
       // sides are PHI nodes. In which case, this is O(m x n) time where 'm'
       // and 'n' are the number of PHI sources.
       return MayAlias;
-    if (UniqueSrc.insert(PV1))
+    if (UniqueSrc.insert(PV1).second)
       V1Srcs.push_back(PV1);
   }
 
-  AliasResult Alias = aliasCheck(V2, V2Size, V2TBAAInfo,
-                                 V1Srcs[0], PNSize, PNTBAAInfo);
+  AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo,
+                                 V1Srcs[0], PNSize, PNAAInfo);
   // Early exit if the check of the first PHI source against V2 is MayAlias.
   // Other results are not possible.
   if (Alias == MayAlias)
@@ -1172,8 +1251,8 @@
   for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) {
     Value *V = V1Srcs[i];
 
-    AliasResult ThisAlias = aliasCheck(V2, V2Size, V2TBAAInfo,
-                                       V, PNSize, PNTBAAInfo);
+    AliasResult ThisAlias = aliasCheck(V2, V2Size, V2AAInfo,
+                                       V, PNSize, PNAAInfo);
     Alias = MergeAliasResults(ThisAlias, Alias);
     if (Alias == MayAlias)
       break;
@@ -1187,9 +1266,9 @@
 //
 AliasAnalysis::AliasResult
 BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
-                               const MDNode *V1TBAAInfo,
+                               AAMDNodes V1AAInfo,
                                const Value *V2, uint64_t V2Size,
-                               const MDNode *V2TBAAInfo) {
+                               AAMDNodes V2AAInfo) {
   // If either of the memory references is empty, it doesn't matter what the
   // pointer values are.
   if (V1Size == 0 || V2Size == 0)
@@ -1269,8 +1348,8 @@
 
   // Check the cache before climbing up use-def chains. This also terminates
   // otherwise infinitely recursive queries.
-  LocPair Locs(Location(V1, V1Size, V1TBAAInfo),
-               Location(V2, V2Size, V2TBAAInfo));
+  LocPair Locs(Location(V1, V1Size, V1AAInfo),
+               Location(V2, V2Size, V2AAInfo));
   if (V1 > V2)
     std::swap(Locs.first, Locs.second);
   std::pair<AliasCacheTy::iterator, bool> Pair =
@@ -1284,32 +1363,32 @@
     std::swap(V1, V2);
     std::swap(V1Size, V2Size);
     std::swap(O1, O2);
-    std::swap(V1TBAAInfo, V2TBAAInfo);
+    std::swap(V1AAInfo, V2AAInfo);
   }
   if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) {
-    AliasResult Result = aliasGEP(GV1, V1Size, V1TBAAInfo, V2, V2Size, V2TBAAInfo, O1, O2);
+    AliasResult Result = aliasGEP(GV1, V1Size, V1AAInfo, V2, V2Size, V2AAInfo, O1, O2);
     if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   if (isa<PHINode>(V2) && !isa<PHINode>(V1)) {
     std::swap(V1, V2);
     std::swap(V1Size, V2Size);
-    std::swap(V1TBAAInfo, V2TBAAInfo);
+    std::swap(V1AAInfo, V2AAInfo);
   }
   if (const PHINode *PN = dyn_cast<PHINode>(V1)) {
-    AliasResult Result = aliasPHI(PN, V1Size, V1TBAAInfo,
-                                  V2, V2Size, V2TBAAInfo);
+    AliasResult Result = aliasPHI(PN, V1Size, V1AAInfo,
+                                  V2, V2Size, V2AAInfo);
     if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) {
     std::swap(V1, V2);
     std::swap(V1Size, V2Size);
-    std::swap(V1TBAAInfo, V2TBAAInfo);
+    std::swap(V1AAInfo, V2AAInfo);
   }
   if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) {
-    AliasResult Result = aliasSelect(S1, V1Size, V1TBAAInfo,
-                                     V2, V2Size, V2TBAAInfo);
+    AliasResult Result = aliasSelect(S1, V1Size, V1AAInfo,
+                                     V2, V2Size, V2AAInfo);
     if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
@@ -1322,8 +1401,8 @@
       return AliasCache[Locs] = PartialAlias;
 
   AliasResult Result =
-    AliasAnalysis::alias(Location(V1, V1Size, V1TBAAInfo),
-                         Location(V2, V2Size, V2TBAAInfo));
+    AliasAnalysis::alias(Location(V1, V1Size, V1AAInfo),
+                         Location(V2, V2Size, V2AAInfo));
   return AliasCache[Locs] = Result;
 }
 
@@ -1348,10 +1427,8 @@
   // Make sure that the visited phis cannot reach the Value. This ensures that
   // the Values cannot come from different iterations of a potential cycle the
   // phi nodes could be involved in.
-  for (SmallPtrSet<const BasicBlock *, 8>::iterator PI = VisitedPhiBBs.begin(),
-                                                    PE = VisitedPhiBBs.end();
-       PI != PE; ++PI)
-    if (isPotentiallyReachable((*PI)->begin(), Inst, DT, LI))
+  for (auto *P : VisitedPhiBBs)
+    if (isPotentiallyReachable(P->begin(), Inst, DT, LI))
       return false;
 
   return true;

diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 4fd2c11..06b8acd 100644
--- a/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp

@@ -14,18 +14,12 @@
 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/Support/raw_ostream.h"
-#include <deque>
 
 using namespace llvm;
 using namespace llvm::bfi_detail;
 
 #define DEBUG_TYPE "block-freq"
 
-//===----------------------------------------------------------------------===//
-//
-// BlockMass implementation.
-//
-//===----------------------------------------------------------------------===//
 ScaledNumber<uint64_t> BlockMass::toScaled() const {
   if (isFull())
     return ScaledNumber<uint64_t>(1, 0);
@@ -46,11 +40,6 @@
   return OS;
 }
 
-//===----------------------------------------------------------------------===//
-//
-// BlockFrequencyInfoImpl implementation.
-//
-//===----------------------------------------------------------------------===//
 namespace {
 
 typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
@@ -87,7 +76,8 @@
 
   BlockMass takeMass(uint32_t Weight);
 };
-}
+
+} // end namespace
 
 DitheringDistributer::DitheringDistributer(Distribution &Dist,
                                            const BlockMass &Mass) {
@@ -121,11 +111,7 @@
   Total = NewTotal;
 
   // Save the weight.
-  Weight W;
-  W.TargetNode = Node;
-  W.Amount = Amount;
-  W.Type = Type;
-  Weights.push_back(W);
+  Weights.push_back(Weight(Type, Node, Amount));
 }
 
 static void combineWeight(Weight &W, const Weight &OtherW) {
@@ -615,7 +601,8 @@
       break;
     }
   }
-  assert(Headers.size() >= 2 && "Should be irreducible");
+  assert(Headers.size() >= 2 &&
+         "Expected irreducible CFG; -loop-info is likely invalid");
   if (Headers.size() == InSCC.size()) {
     // Every block is a header.
     std::sort(Headers.begin(), Headers.end());

diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
index 8ef5302..25e7bc0 100644
--- a/lib/Analysis/CFG.cpp
+++ b/lib/Analysis/CFG.cpp

@@ -45,7 +45,7 @@
     bool FoundNew = false;
     while (I != succ_end(ParentBB)) {
       BB = *I++;
-      if (Visited.insert(BB)) {
+      if (Visited.insert(BB).second) {
         FoundNew = true;
         break;
       }
@@ -141,7 +141,7 @@
   SmallSet<const BasicBlock*, 64> Visited;
   do {
     BasicBlock *BB = Worklist.pop_back_val();
-    if (!Visited.insert(BB))
+    if (!Visited.insert(BB).second)
       continue;
     if (BB == StopBB)
       return true;

diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp
index c2c19d6..89787f82 100644
--- a/lib/Analysis/CFGPrinter.cpp
+++ b/lib/Analysis/CFGPrinter.cpp

@@ -79,11 +79,11 @@
     bool runOnFunction(Function &F) override {
       std::string Filename = "cfg." + F.getName().str() + ".dot";
       errs() << "Writing '" << Filename << "'...";
-      
-      std::string ErrorInfo;
-      raw_fd_ostream File(Filename.c_str(), ErrorInfo, sys::fs::F_Text);
 
-      if (ErrorInfo.empty())
+      std::error_code EC;
+      raw_fd_ostream File(Filename, EC, sys::fs::F_Text);
+
+      if (!EC)
         WriteGraph(File, (const Function*)&F);
       else
         errs() << "  error opening file for writing!";
@@ -114,10 +114,10 @@
       std::string Filename = "cfg." + F.getName().str() + ".dot";
       errs() << "Writing '" << Filename << "'...";
 
-      std::string ErrorInfo;
-      raw_fd_ostream File(Filename.c_str(), ErrorInfo, sys::fs::F_Text);
-      
-      if (ErrorInfo.empty())
+      std::error_code EC;
+      raw_fd_ostream File(Filename, EC, sys::fs::F_Text);
+
+      if (!EC)
         WriteGraph(File, (const Function*)&F, true);
       else
         errs() << "  error opening file for writing!";

diff --git a/lib/Analysis/CFLAliasAnalysis.cpp b/lib/Analysis/CFLAliasAnalysis.cpp
new file mode 100644
index 0000000..5f1b3d3
--- /dev/null
+++ b/lib/Analysis/CFLAliasAnalysis.cpp

@@ -0,0 +1,1013 @@
+//===- CFLAliasAnalysis.cpp - CFL-Based Alias Analysis Implementation ------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a CFL-based context-insensitive alias analysis
+// algorithm. It does not depend on types. The algorithm is a mixture of the one
+// described in "Demand-driven alias analysis for C" by Xin Zheng and Radu
+// Rugina, and "Fast algorithms for Dyck-CFL-reachability with applications to
+// Alias Analysis" by Zhang Q, Lyu M R, Yuan H, and Su Z. -- to summarize the
+// papers, we build a graph of the uses of a variable, where each node is a
+// memory location, and each edge is an action that happened on that memory
+// location.  The "actions" can be one of Dereference, Reference, or Assign
+// (see EdgeType below).
+//
+// Two variables are considered as aliasing iff you can reach one value's node
+// from the other value's node and the language formed by concatenating all of
+// the edge labels (actions) conforms to a context-free grammar.
+//
+// Because this algorithm requires a graph search on each query, we execute the
+// algorithm outlined in "Fast algorithms..." (mentioned above)
+// in order to transform the graph into sets of variables that may alias in
+// ~nlogn time (n = number of variables.), which makes queries take constant
+// time.
+//===----------------------------------------------------------------------===//
+
+#include "StratifiedSets.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/None.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <forward_list>
+#include <tuple>
+
+using namespace llvm;
+
+// Try to go from a Value* to a Function*. Never returns nullptr.
+static Optional<Function *> parentFunctionOfValue(Value *);
+
+// Returns possible functions called by the Inst* into the given
+// SmallVectorImpl. Returns true if targets found, false otherwise.
+// This is templated because InvokeInst/CallInst give us the same
+// set of functions that we care about, and I don't like repeating
+// myself.
+template <typename Inst>
+static bool getPossibleTargets(Inst *, SmallVectorImpl<Function *> &);
+
+// Some instructions need to have their users tracked. Instructions like
+// `add` require you to get the users of the Instruction* itself, other
+// instructions like `store` require you to get the users of the first
+// operand. This function gets the "proper" value to track for each
+// type of instruction we support.
+static Optional<Value *> getTargetValue(Instruction *);
+
+// There are certain instructions (i.e. FenceInst, etc.) that we ignore.
+// This notes that we should ignore those.
+static bool hasUsefulEdges(Instruction *);
+
+const StratifiedIndex StratifiedLink::SetSentinel =
+  std::numeric_limits<StratifiedIndex>::max();
+
+namespace {
+// StratifiedInfo Attribute things.
+typedef unsigned StratifiedAttr;
+LLVM_CONSTEXPR unsigned MaxStratifiedAttrIndex = NumStratifiedAttrs;
+LLVM_CONSTEXPR unsigned AttrAllIndex = 0;
+LLVM_CONSTEXPR unsigned AttrGlobalIndex = 1;
+LLVM_CONSTEXPR unsigned AttrFirstArgIndex = 2;
+LLVM_CONSTEXPR unsigned AttrLastArgIndex = MaxStratifiedAttrIndex;
+LLVM_CONSTEXPR unsigned AttrMaxNumArgs = AttrLastArgIndex - AttrFirstArgIndex;
+
+LLVM_CONSTEXPR StratifiedAttr AttrNone = 0;
+LLVM_CONSTEXPR StratifiedAttr AttrAll = ~AttrNone;
+
+// \brief StratifiedSets call for knowledge of "direction", so this is how we
+// represent that locally.
+enum class Level { Same, Above, Below };
+
+/// \brief Edges can be one of three "weights" -- each weight must have an
+/// inverse weight (Assign has Assign; Reference has Dereference).
+enum class EdgeType {
+  // The weight assigned when assigning from or to a value. For example, in:
+  // %b = getelementptr %a, 0
+  // ...The relationships are %b assign %a, and %a assign %b. This used to be
+  // two edges, but having a distinction bought us nothing.
+  Assign,
+
+  // The edge used when we have an edge going from some handle to a Value.
+  // Examples of this include:
+  // %b = load %a              (%b Dereference %a)
+  // %b = extractelement %a, 0 (%a Dereference %b)
+  Dereference,
+
+  // The edge used when our edge goes from a value to a handle that may have
+  // contained it at some point. Examples:
+  // %b = load %a              (%a Reference %b)
+  // %b = extractelement %a, 0 (%b Reference %a)
+  Reference
+};
+
+/// \brief Encodes the notion of a "use"
+struct Edge {
+  /// \brief Which value the edge is coming from
+  Value *From;
+
+  /// \brief Which value the edge is pointing to
+  Value *To;
+
+  /// \brief Edge weight
+  EdgeType Weight;
+
+  /// \brief Whether we aliased any external values along the way that may be
+  /// invisible to the analysis (e.g. landingpad for exceptions, calls for
+  /// interprocedural analysis, etc.)
+  StratifiedAttrs AdditionalAttrs;
+
+  Edge(Value *From, Value *To, EdgeType W, StratifiedAttrs A)
+      : From(From), To(To), Weight(W), AdditionalAttrs(A) {}
+};
+
+/// \brief Information we have about a function and would like to keep around
+struct FunctionInfo {
+  StratifiedSets<Value *> Sets; // Looked up by Value* via Sets.find().
+  // Lots of functions have < 4 returns. Adjust as necessary.
+  SmallVector<Value *, 4> ReturnedValues;
+
+  FunctionInfo(StratifiedSets<Value *> &&S,
+               SmallVector<Value *, 4> &&RV)
+    : Sets(std::move(S)), ReturnedValues(std::move(RV)) {} // Both moved in.
+};
+
+struct CFLAliasAnalysis;
+
+struct FunctionHandle : public CallbackVH { // Evicts Fn from CFLAA's cache.
+  FunctionHandle(Function *Fn, CFLAliasAnalysis *CFLAA)
+      : CallbackVH(Fn), CFLAA(CFLAA) {
+    assert(Fn != nullptr);
+    assert(CFLAA != nullptr);
+  }
+
+  virtual ~FunctionHandle() {}
+
+  void deleted() override { removeSelfFromCache(); } // Function went away.
+  void allUsesReplacedWith(Value *) override { removeSelfFromCache(); }
+
+private:
+  CFLAliasAnalysis *CFLAA; // Analysis whose cache holds the tracked Function.
+
+  void removeSelfFromCache(); // Defined once CFLAliasAnalysis is complete.
+};
+
+struct CFLAliasAnalysis : public ImmutablePass, public AliasAnalysis {
+private:
+  /// \brief Per-function cache of StratifiedSets.
+  /// While a function's sets are still being built, its entry holds an
+  /// Optional without a value. That makes recursion distinguishable
+  /// from a function whose sets have been built and are simply
+  /// empty.
+  DenseMap<Function *, Optional<FunctionInfo>> Cache;
+  std::forward_list<FunctionHandle> Handles;
+
+public:
+  static char ID;
+
+  CFLAliasAnalysis() : ImmutablePass(ID) {
+    initializeCFLAliasAnalysisPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual ~CFLAliasAnalysis() {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AliasAnalysis::getAnalysisUsage(AU);
+  }
+
+  /// \brief Returns the AliasAnalysis base subobject when that interface is
+  /// requested, and this object otherwise.
+  void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID != &AliasAnalysis::ID)
+      return this;
+    return static_cast<AliasAnalysis *>(this);
+  }
+
+  /// \brief Inserts the given Function into the cache.
+  void scan(Function *Fn);
+
+  /// \brief Drops whatever the cache holds for the given Function.
+  void evict(Function *Fn) { Cache.erase(Fn); }
+
+  /// \brief Ensures that the given function is available in the cache.
+  /// Returns the appropriate entry from the cache.
+  const Optional<FunctionInfo> &ensureCached(Function *Fn) {
+    auto Entry = Cache.find(Fn);
+    if (Entry != Cache.end())
+      return Entry->second;
+    // Not cached yet: scan the function, then look it up again.
+    scan(Fn);
+    Entry = Cache.find(Fn);
+    assert(Entry != Cache.end());
+    assert(Entry->second.hasValue());
+    return Entry->second;
+  }
+
+  AliasResult query(const Location &LocA, const Location &LocB);
+
+  AliasResult alias(const Location &LocA, const Location &LocB) override {
+    // Identical pointers: the answer depends only on the access sizes.
+    if (LocA.Ptr == LocB.Ptr)
+      return LocA.Size == LocB.Size ? MustAlias : PartialAlias;
+
+    // Comparisons between global variables and other constants should be
+    // handled by BasicAA.
+    if (isa<Constant>(LocA.Ptr) && isa<Constant>(LocB.Ptr))
+      return MayAlias;
+
+    // Everything else is delegated to query().
+    return query(LocA, LocB);
+  }
+
+  void initializePass() override { InitializeAliasAnalysis(this); }
+};
+
+// Evicts the tracked Function from the cache, then stops tracking it.
+void FunctionHandle::removeSelfFromCache() {
+  assert(CFLAA != nullptr);
+  CFLAA->evict(cast<Function>(getValPtr()));
+  setValPtr(nullptr);
+}
+
+// \brief Gets the edges our graph should have, based on an Instruction*
+class GetEdgesVisitor : public InstVisitor<GetEdgesVisitor, void> {
+  CFLAliasAnalysis &AA;
+  SmallVectorImpl<Edge> &Output;
+
+public:
+  GetEdgesVisitor(CFLAliasAnalysis &AA, SmallVectorImpl<Edge> &Output)
+      : AA(AA), Output(Output) {}
+
+  void visitInstruction(Instruction &) {
+    llvm_unreachable("Unsupported instruction encountered");
+  }
+
+  void visitCastInst(CastInst &Inst) {
+    auto *Src = Inst.getOperand(0);
+    Output.push_back(Edge(&Inst, Src, EdgeType::Assign, AttrNone));
+  }
+
+  void visitBinaryOperator(BinaryOperator &Inst) {
+    // The result gets an Assign edge to operand 0, then to operand 1.
+    for (unsigned I = 0; I != 2; ++I)
+      Output.push_back(
+          Edge(&Inst, Inst.getOperand(I), EdgeType::Assign, AttrNone));
+  }
+
+  void visitAtomicCmpXchgInst(AtomicCmpXchgInst &Inst) {
+    // Same edge shape as a store: pointer Dereference new value.
+    Output.push_back(Edge(Inst.getPointerOperand(), Inst.getNewValOperand(),
+                          EdgeType::Dereference, AttrNone));
+  }
+
+  void visitAtomicRMWInst(AtomicRMWInst &Inst) {
+    // Same edge shape as a store: pointer Dereference value operand.
+    Output.push_back(Edge(Inst.getPointerOperand(), Inst.getValOperand(),
+                          EdgeType::Dereference, AttrNone));
+  }
+
+  void visitPHINode(PHINode &Inst) {
+    const unsigned NumIncoming = Inst.getNumIncomingValues();
+    for (unsigned Idx = 0; Idx != NumIncoming; ++Idx)
+      Output.push_back(
+          Edge(&Inst, Inst.getIncomingValue(Idx), EdgeType::Assign, AttrNone));
+  }
+
+  void visitGetElementPtrInst(GetElementPtrInst &Inst) {
+    Output.push_back(
+        Edge(&Inst, Inst.getPointerOperand(), EdgeType::Assign, AttrNone));
+    for (auto Idx = Inst.idx_begin(), End = Inst.idx_end(); Idx != End; ++Idx)
+      Output.push_back(Edge(&Inst, *Idx, EdgeType::Assign, AttrNone));
+  }
+
+  void visitSelectInst(SelectInst &Inst) {
+    // The condition, the true value and the false value each get an
+    // Assign edge from the select, in that order.
+    Value *Operands[] = {Inst.getCondition(), Inst.getTrueValue(),
+                         Inst.getFalseValue()};
+    for (Value *Op : Operands)
+      Output.push_back(Edge(&Inst, Op, EdgeType::Assign, AttrNone));
+  }
+
+  void visitAllocaInst(AllocaInst &) {} // Allocas contribute no edges.
+
+  void visitLoadInst(LoadInst &Inst) {
+    // The loaded value is the edge's From; the pointer is its To.
+    Output.push_back(
+        Edge(&Inst, Inst.getPointerOperand(), EdgeType::Reference, AttrNone));
+  }
+
+  void visitStoreInst(StoreInst &Inst) {
+    // The pointer is the edge's From; the stored value is its To.
+    Output.push_back(Edge(Inst.getPointerOperand(), Inst.getValueOperand(),
+                          EdgeType::Dereference, AttrNone));
+  }
+
+  void visitVAArgInst(VAArgInst &Inst) {
+    // va_arg cannot be modelled fully here. With Ptr = Inst.getOperand(0),
+    // it does two things:
+    //  1. Loads a value from *((T*)*Ptr).
+    //  2. Increments (stores to) *Ptr by some target-specific amount.
+    // For now it is handled the way a landingpad instruction is: the result
+    // goes in its own group (a self-edge), and that group is tagged with
+    // AttrAll so that it aliases externals.
+    Output.push_back(Edge(&Inst, &Inst, EdgeType::Assign, AttrAll));
+  }
+
+  static bool isFunctionExternal(Function *Fn) {
+    return !Fn->hasLocalLinkage() || Fn->isDeclaration();
+  }
+
+  // Relates two sets of the same chain: Level::Same if the indices are equal,
+  // Level::Below / Level::Above if Index2 is reached from Index1 by following
+  // Below / Above links. Returns None if they are not in the same set chain.
+  static Optional<Level> getIndexRelation(const StratifiedSets<Value *> &Sets,
+                                          StratifiedIndex Index1,
+                                          StratifiedIndex Index2) {
+    if (Index1 == Index2)
+      return Level::Same;
+
+    // Walk down the chain from Index1 looking for Index2.
+    for (const auto *Link = &Sets.getLink(Index1); Link->hasBelow();
+         Link = &Sets.getLink(Link->Below))
+      if (Link->Below == Index2)
+        return Level::Below;
+
+    // Not found below; walk up the chain from Index1 instead.
+    for (const auto *Link = &Sets.getLink(Index1); Link->hasAbove();
+         Link = &Sets.getLink(Link->Above))
+      if (Link->Above == Index2)
+        return Level::Above;
+
+    // Index2 was not reached from Index1 in either direction.
+    return NoneType();
+  }
+
+  // Tries to model a call to any of Fns using the callees' cached sets, adding
+  // the resulting edges to Output. Returns false if that is not possible; the
+  // caller must then discard whatever was added to Output and fall back.
+  bool
+  tryInterproceduralAnalysis(const SmallVectorImpl<Function *> &Fns,
+                             Value *FuncValue,
+                             const iterator_range<User::op_iterator> &Args) {
+    const unsigned ExpectedMaxArgs = 8;
+    const unsigned MaxSupportedArgs = 50;
+    assert(Fns.size() > 0);
+
+    // This gives an upper bound on the time taken by IPA; the argument-pair
+    // loop below is quadratic in the number of arguments.
+    if (std::distance(Args.begin(), Args.end()) > (int) MaxSupportedArgs)
+      return false;
+
+    // Exit early if we'll fail anyway
+    for (auto *Fn : Fns) {
+      if (isFunctionExternal(Fn) || Fn->isVarArg())
+        return false;
+      auto &MaybeInfo = AA.ensureCached(Fn);
+      if (!MaybeInfo.hasValue())
+        return false;
+    }
+
+    SmallVector<Value *, ExpectedMaxArgs> Arguments(Args.begin(), Args.end());
+    SmallVector<StratifiedInfo, ExpectedMaxArgs> Parameters;
+    for (auto *Fn : Fns) {
+      auto &Info = *AA.ensureCached(Fn);
+      auto &Sets = Info.Sets;
+      auto &RetVals = Info.ReturnedValues;
+
+      Parameters.clear();
+      for (auto &Param : Fn->args()) {
+        auto MaybeInfo = Sets.find(&Param);
+        // Did a new parameter somehow get added to the function/slip by?
+        if (!MaybeInfo.hasValue())
+          return false;
+        Parameters.push_back(*MaybeInfo);
+      }
+
+      // Arguments is indexed by parameter position below, so bail out before
+      // doing so if the call site and the callee disagree on the count.
+      if (Parameters.size() != Arguments.size())
+        return false;
+
+      // Adding an edge from argument -> return value for each parameter that
+      // may alias the return value
+      for (unsigned I = 0, E = Parameters.size(); I != E; ++I) {
+        auto &ParamInfo = Parameters[I];
+        auto &ArgVal = Arguments[I];
+        bool AddEdge = false;
+        for (unsigned X = 0, XE = RetVals.size(); X != XE; ++X) {
+          auto MaybeInfo = Sets.find(RetVals[X]);
+          if (!MaybeInfo.hasValue())
+            return false;
+
+          auto &RetInfo = *MaybeInfo;
+          auto MaybeRelation =
+              getIndexRelation(Sets, ParamInfo.Index, RetInfo.Index);
+          if (MaybeRelation.hasValue())
+            AddEdge = true;
+        }
+        if (AddEdge)
+          Output.push_back(Edge(FuncValue, ArgVal, EdgeType::Assign,
+                            StratifiedAttrs().flip()));
+      }
+
+      // Adding edges between arguments for arguments that may end up aliasing
+      // each other. This is necessary for functions such as
+      // void foo(int** a, int** b) { *a = *b; }
+      // (Technically, the proper sets for this would be those below
+      // Arguments[I] and Arguments[X], but our algorithm will produce
+      // extremely similar, and equally correct, results either way)
+      for (unsigned I = 0, E = Arguments.size(); I != E; ++I) {
+        auto &MainVal = Arguments[I];
+        auto &MainInfo = Parameters[I];
+        auto &MainAttrs = Sets.getLink(MainInfo.Index).Attrs;
+        for (unsigned X = I + 1; X != E; ++X) {
+          auto &SubInfo = Parameters[X];
+          auto &SubVal = Arguments[X];
+          auto &SubAttrs = Sets.getLink(SubInfo.Index).Attrs;
+          auto MaybeRelation =
+              getIndexRelation(Sets, MainInfo.Index, SubInfo.Index);
+
+          if (!MaybeRelation.hasValue())
+            continue;
+
+          auto NewAttrs = SubAttrs | MainAttrs;
+          Output.push_back(Edge(MainVal, SubVal, EdgeType::Assign, NewAttrs));
+        }
+      }
+    }
+    return true;
+  }
+
+  template <typename InstT> void visitCallLikeInst(InstT &Inst) {
+    SmallVector<Function *, 4> Targets;
+    if (getPossibleTargets(&Inst, Targets)) {
+      if (tryInterproceduralAnalysis(Targets, &Inst, Inst.arg_operands()))
+        return;
+      // IPA gave up, possibly part-way: drop any edges it already emitted.
+      Output.clear();
+    }
+
+    for (Value *V : Inst.arg_operands()) // Fallback: AttrAll edge per argument.
+      Output.push_back(Edge(&Inst, V, EdgeType::Assign, AttrAll));
+  }
+
+  void visitCallInst(CallInst &Inst) { visitCallLikeInst(Inst); }
+
+  void visitInvokeInst(InvokeInst &Inst) { visitCallLikeInst(Inst); }
+
+  // Vectors and aggregates are immutable and unaddressable, so the
+  // only way to coax a value out of one is Extract{Element,Value}.
+  // That lets us treat them as pointers to arbitrary memory
+  // locations that the Insert* instructions store in and the
+  // Extract* instructions load from.
+  void visitExtractElementInst(ExtractElementInst &Inst) {
+    // The extracted value is the edge's From; the vector is its To.
+    Output.push_back(
+        Edge(&Inst, Inst.getVectorOperand(), EdgeType::Reference, AttrNone));
+  }
+
+  void visitInsertElementInst(InsertElementInst &Inst) {
+    auto *SrcVec = Inst.getOperand(0);
+    auto *Elt = Inst.getOperand(1);
+    Output.push_back(Edge(&Inst, SrcVec, EdgeType::Assign, AttrNone));
+    Output.push_back(Edge(&Inst, Elt, EdgeType::Dereference, AttrNone));
+  }
+
+  void visitLandingPadInst(LandingPadInst &Inst) {
+    // From this analysis' point of view an exception comes from "nowhere",
+    // so the instruction is placed in its own group (a self-edge) and that
+    // group is noted as possibly aliasing externals.
+    Output.push_back(Edge(&Inst, &Inst, EdgeType::Assign, AttrAll));
+  }
+
+  void visitInsertValueInst(InsertValueInst &Inst) {
+    auto *Aggregate = Inst.getOperand(0);
+    auto *Inserted = Inst.getOperand(1);
+    Output.push_back(Edge(&Inst, Aggregate, EdgeType::Assign, AttrNone));
+    Output.push_back(Edge(&Inst, Inserted, EdgeType::Dereference, AttrNone));
+  }
+
+  void visitExtractValueInst(ExtractValueInst &Inst) {
+    Output.push_back(Edge(&Inst, Inst.getAggregateOperand(),
+                          EdgeType::Reference, AttrNone));
+  }
+
+  void visitShuffleVectorInst(ShuffleVectorInst &Inst) {
+    // Only operands 0 and 1 get an Assign edge, in that order.
+    for (unsigned I = 0; I != 2; ++I)
+      Output.push_back(
+          Edge(&Inst, Inst.getOperand(I), EdgeType::Assign, AttrNone));
+  }
+};
+
+// Picks, for a given instruction, the Value* whose users must be followed
+// when building the graph. For some instructions (e.g. add) that is simply
+// the Instruction* itself. For others (e.g. store) the users of the
+// Instruction* are useless and an operand has to be followed instead.
+//
+// Note: this *must* stay in sync with GetEdgesVisitor. Whatever is added to
+// or removed from GetEdgesVisitor has to be added or removed here as well.
+class GetTargetValueVisitor
+    : public InstVisitor<GetTargetValueVisitor, Value *> {
+public:
+  // Default: the instruction is its own target.
+  Value *visitInstruction(Instruction &Inst) {
+    return &Inst;
+  }
+
+  // Memory-writing instructions target the pointer they write through.
+  Value *visitStoreInst(StoreInst &Inst) {
+    return Inst.getPointerOperand();
+  }
+
+  Value *visitAtomicCmpXchgInst(AtomicCmpXchgInst &Inst) {
+    return Inst.getPointerOperand();
+  }
+
+  Value *visitAtomicRMWInst(AtomicRMWInst &Inst) {
+    return Inst.getPointerOperand();
+  }
+
+  // Insertions target the vector/aggregate being inserted into.
+  Value *visitInsertElementInst(InsertElementInst &Inst) {
+    return Inst.getOperand(0);
+  }
+
+  Value *visitInsertValueInst(InsertValueInst &Inst) {
+    return Inst.getAggregateOperand();
+  }
+};
+
+// Set building requires a weighted bidirectional graph.
+//
+// Nodes are dense indices handed out by addNode(). Every addEdge() call
+// records the edge on both endpoints, each side with its own weight, so the
+// edge list of either endpoint leads to the other.
+template <typename EdgeTypeT> class WeightedBidirectionalGraph {
+public:
+  typedef std::size_t Node;
+
+private:
+  // Node returned by getEntryNode(); only valid once a node has been added.
+  const static Node StartNode = Node(0);
+
+  // One direction of an edge, stored on the node it leaves from.
+  struct Edge {
+    EdgeTypeT Weight;
+    Node Other;
+
+    Edge(const EdgeTypeT &W, const Node &N)
+      : Weight(W), Other(N) {}
+
+    bool operator==(const Edge &E) const {
+      return Weight == E.Weight && Other == E.Other;
+    }
+
+    bool operator!=(const Edge &E) const { return !operator==(E); }
+  };
+
+  // Per-node storage: the list of edges leaving the node.
+  struct NodeImpl {
+    std::vector<Edge> Edges;
+  };
+
+  std::vector<NodeImpl> NodeImpls;
+
+  bool inbounds(Node NodeIndex) const { return NodeIndex < NodeImpls.size(); }
+
+  const NodeImpl &getNode(Node N) const { return NodeImpls[N]; }
+  NodeImpl &getNode(Node N) { return NodeImpls[N]; }
+
+public:
+  // ----- Various Edge iterators for the graph ----- //
+
+  // \brief Iterator for edges. Because this graph is bidirected, we don't
+  // allow modification of the edges using this iterator. Additionally, the
+  // iterator becomes invalid if you add edges to or from the node you're
+  // getting the edges of.
+  //
+  // Dereferencing yields a (weight, other node) tuple that lives inside the
+  // iterator and is overwritten by the next dereference.
+  struct EdgeIterator : public std::iterator<std::forward_iterator_tag,
+                                             std::tuple<EdgeTypeT, Node>> {
+    EdgeIterator(const typename std::vector<Edge>::const_iterator &Iter)
+        : Current(Iter) {}
+
+    // Starts at the first edge of the given node.
+    EdgeIterator(NodeImpl &Impl) : Current(Impl.Edges.begin()) {}
+
+    EdgeIterator &operator++() {
+      ++Current;
+      return *this;
+    }
+
+    EdgeIterator operator++(int) {
+      EdgeIterator Copy(Current);
+      operator++();
+      return Copy;
+    }
+
+    std::tuple<EdgeTypeT, Node> &operator*() {
+      Store = std::make_tuple(Current->Weight, Current->Other);
+      return Store;
+    }
+
+    bool operator==(const EdgeIterator &Other) const {
+      return Current == Other.Current;
+    }
+
+    bool operator!=(const EdgeIterator &Other) const {
+      return !operator==(Other);
+    }
+
+  private:
+    typename std::vector<Edge>::const_iterator Current;
+    // Backing storage for the tuple handed out by operator*.
+    std::tuple<EdgeTypeT, Node> Store;
+  };
+
+  // Wrapper for EdgeIterator with begin()/end() calls.
+  struct EdgeIterable {
+    EdgeIterable(const std::vector<Edge> &Edges)
+        : BeginIter(Edges.begin()), EndIter(Edges.end()) {}
+
+    EdgeIterator begin() { return EdgeIterator(BeginIter); }
+
+    EdgeIterator end() { return EdgeIterator(EndIter); }
+
+  private:
+    typename std::vector<Edge>::const_iterator BeginIter;
+    typename std::vector<Edge>::const_iterator EndIter;
+  };
+
+  // ----- Actual graph-related things ----- //
+
+  WeightedBidirectionalGraph() {}
+
+  WeightedBidirectionalGraph(WeightedBidirectionalGraph<EdgeTypeT> &&Other)
+      : NodeImpls(std::move(Other.NodeImpls)) {}
+
+  WeightedBidirectionalGraph<EdgeTypeT> &
+  operator=(WeightedBidirectionalGraph<EdgeTypeT> &&Other) {
+    NodeImpls = std::move(Other.NodeImpls);
+    return *this;
+  }
+
+  // Appends a node with no edges and returns its index.
+  Node addNode() {
+    auto Index = NodeImpls.size();
+    auto NewNode = Node(Index);
+    NodeImpls.push_back(NodeImpl());
+    return NewNode;
+  }
+
+  // Records an edge From -> To carrying Weight together with the matching
+  // edge To -> From carrying ReverseWeight. Both nodes must already exist.
+  void addEdge(Node From, Node To, const EdgeTypeT &Weight,
+               const EdgeTypeT &ReverseWeight) {
+    assert(inbounds(From));
+    assert(inbounds(To));
+    auto &FromNode = getNode(From);
+    auto &ToNode = getNode(To);
+    FromNode.Edges.push_back(Edge(Weight, To));
+    ToNode.Edges.push_back(Edge(ReverseWeight, From));
+  }
+
+  // Returns an iterable over the edges leaving N.
+  EdgeIterable edgesFor(const Node &N) const {
+    const auto &Node = getNode(N);
+    return EdgeIterable(Node.Edges);
+  }
+
+  bool empty() const { return NodeImpls.empty(); }
+  std::size_t size() const { return NodeImpls.size(); }
+
+  // \brief Gets an arbitrary node in the graph as a starting point for
+  // traversal.
+  Node getEntryNode() {
+    assert(inbounds(StartNode));
+    return StartNode;
+  }
+};
+
+typedef WeightedBidirectionalGraph<std::pair<EdgeType, StratifiedAttrs>> GraphT;
+typedef DenseMap<Value *, GraphT::Node> NodeMapT;
+}
+
+// -- Setting up/registering CFLAA pass -- //
+char CFLAliasAnalysis::ID = 0;
+
+INITIALIZE_AG_PASS(CFLAliasAnalysis, AliasAnalysis, "cfl-aa",
+                   "CFL-Based AA implementation", false, true, false)
+
+// Factory for the CFL alias analysis pass; the caller receives a freshly
+// allocated instance.
+ImmutablePass *llvm::createCFLAliasAnalysisPass() {
+  ImmutablePass *Pass = new CFLAliasAnalysis();
+  return Pass;
+}
+
+//===----------------------------------------------------------------------===//
+// Function declarations that require types defined in the namespace above
+//===----------------------------------------------------------------------===//
+
+// Given an argument number, returns the appropriate Attr index to set.
+static StratifiedAttr argNumberToAttrIndex(StratifiedAttr);
+
+// Given a Value, potentially return which AttrIndex it maps to.
+static Optional<StratifiedAttr> valueToAttrIndex(Value *Val);
+
+// Gets the inverse of a given EdgeType.
+static EdgeType flipWeight(EdgeType);
+
+// Gets the edges of the given Instruction*, appending them to the given
+// SmallVectorImpl.
+static void argsToEdges(CFLAliasAnalysis &, Instruction *,
+                        SmallVectorImpl<Edge> &);
+
+// Gets the "Level" that one should travel in StratifiedSets
+// given an EdgeType.
+static Level directionOfEdgeType(EdgeType);
+
+// Builds the graph needed for constructing the StratifiedSets for the
+// given function
+static void buildGraphFrom(CFLAliasAnalysis &, Function *,
+                           SmallVectorImpl<Value *> &, NodeMapT &, GraphT &);
+
+// Builds the graph + StratifiedSets for a function.
+static FunctionInfo buildSetsFrom(CFLAliasAnalysis &, Function *);
+
+// Returns the function that owns Val when Val is an argument or an
+// instruction; any other kind of value yields an empty Optional.
+static Optional<Function *> parentFunctionOfValue(Value *Val) {
+  if (auto *Arg = dyn_cast<Argument>(Val))
+    return Arg->getParent();
+
+  auto *Inst = dyn_cast<Instruction>(Val);
+  if (!Inst)
+    return NoneType();
+
+  // Instruction -> enclosing basic block -> enclosing function.
+  return Inst->getParent()->getParent();
+}
+
+// Appends the functions that Call may invoke to Output. Returns true when
+// the targets could be determined, which currently means the callee is
+// known directly; otherwise Output is left untouched and false is returned.
+template <typename Inst>
+static bool getPossibleTargets(Inst *Call,
+                               SmallVectorImpl<Function *> &Output) {
+  auto *Callee = Call->getCalledFunction();
+
+  // TODO: If the call is indirect, we might be able to enumerate all potential
+  // targets of the call and return them, rather than just failing.
+  if (!Callee)
+    return false;
+
+  Output.push_back(Callee);
+  return true;
+}
+
+// Returns the value GetTargetValueVisitor selects for Inst.
+static Optional<Value *> getTargetValue(Instruction *Inst) {
+  return GetTargetValueVisitor().visit(Inst);
+}
+
+// Returns false for instructions that are skipped during graph
+// construction: comparisons, fences, and every terminator except invoke.
+static bool hasUsefulEdges(Instruction *Inst) {
+  if (isa<CmpInst>(Inst) || isa<FenceInst>(Inst))
+    return false;
+
+  // Invoke is the only terminator that is kept.
+  return !isa<TerminatorInst>(Inst) || isa<InvokeInst>(Inst);
+}
+
+// Maps a value to the attribute index that represents it: globals share
+// AttrGlobalIndex and arguments without the noalias attribute get a
+// per-argument index. Everything else has no index.
+static Optional<StratifiedAttr> valueToAttrIndex(Value *Val) {
+  if (isa<GlobalValue>(Val))
+    return AttrGlobalIndex;
+
+  auto *Arg = dyn_cast<Argument>(Val);
+  if (!Arg || Arg->hasNoAliasAttr())
+    return NoneType();
+
+  return argNumberToAttrIndex(Arg->getArgNo());
+}
+
+// Maps a zero-based argument number to its attribute index. Argument
+// numbers that do not fit in the per-argument slots collapse onto
+// AttrAllIndex.
+static StratifiedAttr argNumberToAttrIndex(unsigned ArgNum) {
+  // Valid argument numbers are [0, AttrMaxNumArgs); ArgNum == AttrMaxNumArgs
+  // would land one past the last per-argument slot, hence >= and not >.
+  // NOTE(review): assumes AttrMaxNumArgs is the number of per-argument slots
+  // starting at AttrFirstArgIndex -- confirm against its definition.
+  if (ArgNum >= AttrMaxNumArgs)
+    return AttrAllIndex;
+  return ArgNum + AttrFirstArgIndex;
+}
+
+// Returns the label an edge carries when walked in the opposite direction:
+// Reference and Dereference swap, Assign is its own inverse.
+static EdgeType flipWeight(EdgeType Initial) {
+  switch (Initial) {
+  case EdgeType::Reference:
+    return EdgeType::Dereference;
+  case EdgeType::Dereference:
+    return EdgeType::Reference;
+  case EdgeType::Assign:
+    return EdgeType::Assign;
+  }
+  llvm_unreachable("Incomplete coverage of EdgeType enum");
+}
+
+// Runs GetEdgesVisitor over Inst, letting it write the instruction's edges
+// to Output.
+static void argsToEdges(CFLAliasAnalysis &Analysis, Instruction *Inst,
+                        SmallVectorImpl<Edge> &Output) {
+  GetEdgesVisitor Visitor(Analysis, Output);
+  Visitor.visit(Inst);
+}
+
+// Translates an edge label into the StratifiedSets level to move to when
+// following that edge: Assign stays on the same level, Reference moves
+// above and Dereference moves below.
+static Level directionOfEdgeType(EdgeType Weight) {
+  switch (Weight) {
+  case EdgeType::Assign:
+    return Level::Same;
+  case EdgeType::Reference:
+    return Level::Above;
+  case EdgeType::Dereference:
+    return Level::Below;
+  }
+  llvm_unreachable("Incomplete switch coverage");
+}
+
+// Walks every instruction of Fn, turning the edges reported for it into
+// graph edges (creating nodes on demand and recording them in Map), and
+// collects the function's return instructions in ReturnedValues.
+//
+// Aside: We may remove graph construction entirely, because it doesn't really
+// buy us much that we don't already have. I'd like to add interprocedural
+// analysis prior to this however, in case that somehow requires the graph
+// produced by this for efficient execution
+static void buildGraphFrom(CFLAliasAnalysis &Analysis, Function *Fn,
+                           SmallVectorImpl<Value *> &ReturnedValues,
+                           NodeMapT &Map, GraphT &Graph) {
+  // Returns the node mapped to Val, adding a fresh node on first sight.
+  const auto findOrInsertNode = [&Map, &Graph](Value *Val) {
+    auto Inserted = Map.insert(std::make_pair(Val, GraphT::Node()));
+    if (Inserted.second)
+      Inserted.first->second = Graph.addNode();
+    return Inserted.first->second;
+  };
+
+  SmallVector<Edge, 8> Edges;
+  for (auto &Block : Fn->getBasicBlockList()) {
+    for (auto &Inst : Block.getInstList()) {
+      // We don't want the edges of most "return" instructions, but we *do* want
+      // to know what can be returned.
+      if (isa<ReturnInst>(&Inst))
+        ReturnedValues.push_back(&Inst);
+
+      if (!hasUsefulEdges(&Inst))
+        continue;
+
+      Edges.clear();
+      argsToEdges(Analysis, &Inst, Edges);
+
+      // In the case of an unused alloca (or similar), edges may be empty. Note
+      // that it exists so we can potentially answer NoAlias.
+      if (Edges.empty()) {
+        auto MaybeTarget = getTargetValue(&Inst);
+        assert(MaybeTarget.hasValue());
+        findOrInsertNode(*MaybeTarget);
+        continue;
+      }
+
+      for (const Edge &E : Edges) {
+        // The destination is looked up before the source; node numbering
+        // depends on this order.
+        auto ToNode = findOrInsertNode(E.To);
+        auto FromNode = findOrInsertNode(E.From);
+        auto Forward = std::make_pair(E.Weight, E.AdditionalAttrs);
+        auto Backward =
+            std::make_pair(flipWeight(E.Weight), E.AdditionalAttrs);
+        Graph.addEdge(FromNode, ToNode, Forward, Backward);
+      }
+    }
+  }
+}
+
+// Builds the graph for Fn and folds it into StratifiedSets by walking
+// outward along the graph's edges from every value that has a node.
+static FunctionInfo buildSetsFrom(CFLAliasAnalysis &Analysis, Function *Fn) {
+  NodeMapT Map;
+  GraphT Graph;
+  SmallVector<Value *, 4> ReturnedValues;
+
+  buildGraphFrom(Analysis, Fn, ReturnedValues, Map, Graph);
+
+  // Invert the value -> node map so that edges can be resolved to values.
+  DenseMap<GraphT::Node, Value *> NodeValueMap;
+  NodeValueMap.resize(Map.size());
+  for (const auto &Entry : Map)
+    NodeValueMap.insert(std::make_pair(Entry.second, Entry.first));
+
+  const auto findValueOrDie = [&NodeValueMap](GraphT::Node Node) {
+    auto Found = NodeValueMap.find(Node);
+    assert(Found != NodeValueMap.end());
+    return Found->second;
+  };
+
+  // Constants that are not globals are skipped during the walk.
+  const auto isSkippedConstant = [](Value *V) {
+    return isa<Constant>(V) && !isa<GlobalValue>(V);
+  };
+
+  StratifiedSetsBuilder<Value *> Builder;
+
+  SmallVector<GraphT::Node, 16> Worklist;
+  for (auto &Entry : Map) {
+    Worklist.clear();
+
+    Builder.add(Entry.first);
+    Worklist.push_back(Entry.second);
+    while (!Worklist.empty()) {
+      auto CurNode = Worklist.pop_back_val();
+      auto *CurValue = findValueOrDie(CurNode);
+      if (isSkippedConstant(CurValue))
+        continue;
+
+      for (const auto &EdgeTuple : Graph.edgesFor(CurNode)) {
+        auto Weight = std::get<0>(EdgeTuple);
+        auto OtherNode = std::get<1>(EdgeTuple);
+        auto *OtherValue = findValueOrDie(OtherNode);
+
+        if (isSkippedConstant(OtherValue))
+          continue;
+
+        // Place OtherValue relative to CurValue as the edge label dictates.
+        bool Added;
+        switch (directionOfEdgeType(Weight.first)) {
+        case Level::Above:
+          Added = Builder.addAbove(CurValue, OtherValue);
+          break;
+        case Level::Below:
+          Added = Builder.addBelow(CurValue, OtherValue);
+          break;
+        case Level::Same:
+          Added = Builder.addWith(CurValue, OtherValue);
+          break;
+        }
+
+        if (!Added)
+          continue;
+
+        // Newly linked values take the edge's attributes plus the
+        // attribute index of either endpoint that has one.
+        auto Aliasing = Weight.second;
+        if (auto MaybeCurIndex = valueToAttrIndex(CurValue))
+          Aliasing.set(*MaybeCurIndex);
+        if (auto MaybeOtherIndex = valueToAttrIndex(OtherValue))
+          Aliasing.set(*MaybeOtherIndex);
+        Builder.noteAttributes(CurValue, Aliasing);
+        Builder.noteAttributes(OtherValue, Aliasing);
+        Worklist.push_back(OtherNode);
+      }
+    }
+  }
+
+  // There are times when we end up with parameters not in our graph (i.e. if
+  // it's only used as the condition of a branch). Other bits of code depend on
+  // things that were present during construction being present in the graph.
+  // So, we add all present arguments here.
+  for (auto &Arg : Fn->args())
+    Builder.add(&Arg);
+
+  return FunctionInfo(Builder.build(), std::move(ReturnedValues));
+}
+
+// Computes the FunctionInfo for Fn, stores it in the cache and registers a
+// FunctionHandle for Fn. Must not be called for a function that already has
+// a cache entry.
+void CFLAliasAnalysis::scan(Function *Fn) {
+  auto Inserted = Cache.insert(std::make_pair(Fn, Optional<FunctionInfo>()));
+  (void)Inserted;
+  assert(Inserted.second &&
+         "Trying to scan a function that has already been cached");
+
+  // Finish building the sets before the cache slot is looked up again.
+  FunctionInfo Built(buildSetsFrom(*this, Fn));
+  Cache[Fn] = std::move(Built);
+  Handles.push_front(FunctionHandle(Fn, this));
+}
+
+// Answers an alias query for two locations. At least one of the values must
+// belong to a function (as an instruction or an argument) for the stratified
+// sets to be consulted; otherwise the conservative MayAlias is returned.
+// Values in the same set are reported as PartialAlias, values in different
+// sets as NoAlias unless both sets carry attributes.
+AliasAnalysis::AliasResult
+CFLAliasAnalysis::query(const AliasAnalysis::Location &LocA,
+                        const AliasAnalysis::Location &LocB) {
+  auto *ValA = const_cast<Value *>(LocA.Ptr);
+  auto *ValB = const_cast<Value *>(LocB.Ptr);
+
+  Function *Fn = nullptr;
+  auto MaybeFnA = parentFunctionOfValue(ValA);
+  auto MaybeFnB = parentFunctionOfValue(ValB);
+  if (!MaybeFnA.hasValue() && !MaybeFnB.hasValue()) {
+    // Neither value is an instruction or an argument, so there is no
+    // function whose sets could be consulted. Such queries are legal input,
+    // so answer conservatively rather than treating them as unreachable.
+    return AliasAnalysis::MayAlias;
+  }
+
+  if (MaybeFnA.hasValue()) {
+    Fn = *MaybeFnA;
+    assert((!MaybeFnB.hasValue() || *MaybeFnB == *MaybeFnA) &&
+           "Interprocedural queries not supported");
+  } else {
+    Fn = *MaybeFnB;
+  }
+
+  assert(Fn != nullptr);
+  auto &MaybeInfo = ensureCached(Fn);
+  assert(MaybeInfo.hasValue());
+
+  // A value that is missing from the sets cannot be reasoned about.
+  auto &Sets = MaybeInfo->Sets;
+  auto MaybeA = Sets.find(ValA);
+  if (!MaybeA.hasValue())
+    return AliasAnalysis::MayAlias;
+
+  auto MaybeB = Sets.find(ValB);
+  if (!MaybeB.hasValue())
+    return AliasAnalysis::MayAlias;
+
+  auto SetA = *MaybeA;
+  auto SetB = *MaybeB;
+
+  if (SetA.Index == SetB.Index)
+    return AliasAnalysis::PartialAlias;
+
+  auto AttrsA = Sets.getLink(SetA.Index).Attrs;
+  auto AttrsB = Sets.getLink(SetB.Index).Attrs;
+  // Stratified set attributes are used as markers to signify whether a member
+  // of a StratifiedSet (or a member of a set above the current set) has
+  // interacted with either arguments or globals. "Interacted with" meaning
+  // its value may be different depending on the value of an argument or
+  // global. The thought behind this is that, because arguments and globals
+  // may alias each other, if AttrsA and AttrsB have touched args/globals,
+  // we must conservatively say that they alias. However, if at least one of
+  // the sets has no values that could legally be altered by changing the value
+  // of an argument or global, then we don't have to be as conservative.
+  if (AttrsA.any() && AttrsB.any())
+    return AliasAnalysis::MayAlias;
+
+  return AliasAnalysis::NoAlias;
+}

diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index d1632fd..4e9664f 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt

@@ -5,12 +5,14 @@
   AliasDebugger.cpp
   AliasSetTracker.cpp
   Analysis.cpp
+  AssumptionTracker.cpp
   BasicAliasAnalysis.cpp
   BlockFrequencyInfo.cpp
   BlockFrequencyInfoImpl.cpp
   BranchProbabilityInfo.cpp
   CFG.cpp
   CFGPrinter.cpp
+  CFLAliasAnalysis.cpp
   CGSCCPassManager.cpp
   CaptureTracking.cpp
   CostModel.cpp
@@ -20,6 +22,7 @@
   DependenceAnalysis.cpp
   DomPrinter.cpp
   DominanceFrontier.cpp
+  FunctionTargetTransformInfo.cpp
   IVUsers.cpp
   InstCount.cpp
   InstructionSimplify.cpp
@@ -53,6 +56,7 @@
   TargetTransformInfo.cpp
   Trace.cpp
   TypeBasedAliasAnalysis.cpp
+  ScopedNoAliasAA.cpp
   ValueTracking.cpp
   )
 

diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index 3708e60..a271729 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp

@@ -20,8 +20,10 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 
 using namespace llvm;
@@ -49,6 +51,65 @@
 
     bool Captured;
   };
+
+  /// Only find pointer captures which happen before the given instruction. Uses
+  /// the dominator tree to determine whether one instruction is before another.
+  /// Only support the case where the Value is defined in the same basic block
+  /// as the given instruction and the use.
+  struct CapturesBefore : public CaptureTracker {
+    CapturesBefore(bool ReturnCaptures, const Instruction *I, DominatorTree *DT,
+                   bool IncludeI)
+      : BeforeHere(I), DT(DT), ReturnCaptures(ReturnCaptures),
+        IncludeI(IncludeI), Captured(false) {}
+
+    void tooManyUses() override { Captured = true; }
+
+    /// Returns true when a use by instruction I can be ignored because it
+    /// cannot happen before BeforeHere.
+    bool isSafeToPrune(Instruction *I) {
+      // A use by BeforeHere itself only counts when IncludeI is set.
+      if (BeforeHere == I)
+        return !IncludeI;
+
+      // We explore this usage only if the usage can reach "BeforeHere".
+      // If use is not reachable from entry, there is no need to explore.
+      BasicBlock *BB = I->getParent();
+      if (!DT->isReachableFromEntry(BB))
+        return true;
+
+      // If the value is defined in the same basic block as use and BeforeHere,
+      // there is no need to explore the use if BeforeHere dominates use.
+      // Check whether there is a path from I to BeforeHere.
+      return DT->dominates(BeforeHere, I) &&
+             !isPotentiallyReachable(I, BeforeHere, DT);
+    }
+
+    bool shouldExplore(const Use *U) override {
+      Instruction *I = cast<Instruction>(U->getUser());
+      return !isSafeToPrune(I);
+    }
+
+    bool captured(const Use *U) override {
+      if (isa<ReturnInst>(U->getUser()) && !ReturnCaptures)
+        return false;
+
+      // Same pruning as in shouldExplore.
+      Instruction *I = cast<Instruction>(U->getUser());
+      if (isSafeToPrune(I))
+        return false;
+
+      Captured = true;
+      return true;
+    }
+
+    const Instruction *BeforeHere;
+    DominatorTree *DT;
+
+    bool ReturnCaptures;
+    bool IncludeI;
+
+    bool Captured;
+  };
 }
 
 /// PointerMayBeCaptured - Return true if this pointer value may be captured
@@ -74,6 +135,32 @@
   return SCT.Captured;
 }
 
+/// PointerMayBeCapturedBefore - Return true if this pointer value may be
+/// captured by the enclosing function (which is required to exist). If a
+/// DominatorTree is provided, only captures which happen before the given
+/// instruction are considered. This routine can be expensive, so consider
+/// caching the results.  The boolean ReturnCaptures specifies whether
+/// returning the value (or part of it) from the function counts as capturing
+/// it or not.  The boolean StoreCaptures specifies whether storing the value
+/// (or part of it) into memory anywhere automatically counts as capturing it
+/// or not.
+bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
+                                      bool StoreCaptures, const Instruction *I,
+                                      DominatorTree *DT, bool IncludeI) {
+  assert(!isa<GlobalValue>(V) &&
+         "It doesn't make sense to ask whether a global is captured.");
+
+  // Without a dominator tree "before I" cannot be decided, so fall back to
+  // the query over the whole function.
+  if (!DT)
+    return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures);
+
+  // TODO: See comment in PointerMayBeCaptured regarding what could be done
+  // with StoreCaptures.
+
+  CapturesBefore Tracker(ReturnCaptures, I, DT, IncludeI);
+  PointerMayBeCaptured(V, &Tracker);
+  return Tracker.Captured;
+}
+
 /// TODO: Write a new FunctionPass AliasAnalysis so that it can keep
 /// a cache. Then we can move the code from BasicAliasAnalysis into
 /// that path, and remove this threshold.
@@ -152,7 +239,7 @@
         if (Count++ >= Threshold)
           return Tracker->tooManyUses();
 
-        if (Visited.insert(&UU))
+        if (Visited.insert(&UU).second)
           if (Tracker->shouldExplore(&UU))
             Worklist.push_back(&UU);
       }

diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp
index 4c8a093..f29e4a2 100644
--- a/lib/Analysis/CodeMetrics.cpp
+++ b/lib/Analysis/CodeMetrics.cpp

@@ -11,23 +11,101 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "code-metrics"
 
 using namespace llvm;
 
+// Extends EphValues from the seed values in WorkSet: a value whose users are
+// all ephemeral is marked ephemeral itself, and those of its operands that
+// are safe to speculatively execute are queued for the same check. WorkSet
+// is consumed and is empty on return.
+static void completeEphemeralValues(SmallVectorImpl<const Value *> &WorkSet,
+                                    SmallPtrSetImpl<const Value*> &EphValues) {
+  SmallPtrSet<const Value *, 32> Visited;
+
+  // Make sure that all of the items in WorkSet are in our EphValues set.
+  EphValues.insert(WorkSet.begin(), WorkSet.end());
+
+  // Note: We don't speculate PHIs here, so we'll miss instruction chains kept
+  // alive only by ephemeral values.
+
+  // Walk the worklist by index and re-read its size on every iteration so
+  // that entries can be appended while it is processed. This keeps the queue
+  // order without paying for an erase from the front at every step.
+  for (size_t Idx = 0; Idx != WorkSet.size(); ++Idx) {
+    const Value *V = WorkSet[Idx];
+
+    if (!Visited.insert(V).second)
+      continue;
+
+    // If all uses of this value are ephemeral, then so is this value.
+    bool FoundNEUse = false;
+    for (const User *I : V->users())
+      if (!EphValues.count(I)) {
+        FoundNEUse = true;
+        break;
+      }
+
+    if (FoundNEUse)
+      continue;
+
+    EphValues.insert(V);
+    DEBUG(dbgs() << "Ephemeral Value: " << *V << "\n");
+
+    if (const User *U = dyn_cast<User>(V))
+      for (const Value *J : U->operands()) {
+        if (isSafeToSpeculativelyExecute(J))
+          WorkSet.push_back(J);
+      }
+  }
+
+  // Every entry has been processed; hand the worklist back empty.
+  WorkSet.clear();
+}
+
+// Find all ephemeral values that stem from the assumptions located inside
+// loop L.
+void CodeMetrics::collectEphemeralValues(const Loop *L, AssumptionTracker *AT,
+                                         SmallPtrSetImpl<const Value*> &EphValues) {
+  SmallVector<const Value *, 16> Seeds;
+
+  // Filter out call sites outside of the loop so we don't do a function's
+  // worth of work for each of its loops (and, in the common case, ephemeral
+  // values in the loop are likely due to @llvm.assume calls in the loop).
+  for (auto &I : AT->assumptions(L->getHeader()->getParent()))
+    if (L->contains(I->getParent()))
+      Seeds.push_back(I);
+
+  completeEphemeralValues(Seeds, EphValues);
+}
+
+// Find all ephemeral values that stem from any assumption in function F.
+void CodeMetrics::collectEphemeralValues(const Function *F, AssumptionTracker *AT,
+                                         SmallPtrSetImpl<const Value*> &EphValues) {
+  SmallVector<const Value *, 16> Seeds;
+
+  // The tracker is queried through a non-const function pointer.
+  Function *MutableF = const_cast<Function*>(F);
+  for (auto &I : AT->assumptions(MutableF))
+    Seeds.push_back(I);
+
+  completeEphemeralValues(Seeds, EphValues);
+}
+
 /// analyzeBasicBlock - Fill in the current structure with information gleaned
 /// from the specified block.
 void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
-                                    const TargetTransformInfo &TTI) {
+                                    const TargetTransformInfo &TTI,
+                                    SmallPtrSetImpl<const Value*> &EphValues) {
   ++NumBlocks;
   unsigned NumInstsBeforeThisBB = NumInsts;
   for (BasicBlock::const_iterator II = BB->begin(), E = BB->end();
        II != E; ++II) {
+    // Skip ephemeral values.
+    if (EphValues.count(II))
+      continue;
+
     // Special handling for calls.
     if (isa<CallInst>(II) || isa<InvokeInst>(II)) {
       ImmutableCallSite CS(cast<Instruction>(II));

diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index eb3e2c6..fd8f2ae 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp

@@ -47,15 +47,16 @@
 // Constant Folding internal helper functions
 //===----------------------------------------------------------------------===//
 
-/// FoldBitCast - Constant fold bitcast, symbolically evaluating it with
-/// DataLayout.  This always returns a non-null constant, but it may be a
+/// Constant fold bitcast, symbolically evaluating it with DataLayout.
+/// This always returns a non-null constant, but it may be a
 /// ConstantExpr if unfoldable.
 static Constant *FoldBitCast(Constant *C, Type *DestTy,
                              const DataLayout &TD) {
   // Catch the obvious splat cases.
   if (C->isNullValue() && !DestTy->isX86_MMXTy())
     return Constant::getNullValue(DestTy);
-  if (C->isAllOnesValue() && !DestTy->isX86_MMXTy())
+  if (C->isAllOnesValue() && !DestTy->isX86_MMXTy() &&
+      !DestTy->isPtrOrPtrVectorTy()) // Don't get ones for ptr types!
     return Constant::getAllOnesValue(DestTy);
 
   // Handle a vector->integer cast.
@@ -197,7 +198,7 @@
 
   // Handle: bitcast (<2 x i64> <i64 0, i64 1> to <4 x i32>)
   unsigned Ratio = NumDstElt/NumSrcElt;
-  unsigned DstBitSize = DstEltTy->getPrimitiveSizeInBits();
+  unsigned DstBitSize = TD.getTypeSizeInBits(DstEltTy);
 
   // Loop over each source value, expanding into multiple results.
   for (unsigned i = 0; i != NumSrcElt; ++i) {
@@ -213,6 +214,15 @@
                                   ConstantInt::get(Src->getType(), ShiftAmt));
       ShiftAmt += isLittleEndian ? DstBitSize : -DstBitSize;
 
+      // Truncate the element to an integer with the same pointer size and
+      // convert the element back to a pointer using a inttoptr.
+      if (DstEltTy->isPointerTy()) {
+        IntegerType *DstIntTy = Type::getIntNTy(C->getContext(), DstBitSize);
+        Constant *CE = ConstantExpr::getTrunc(Elt, DstIntTy);
+        Result.push_back(ConstantExpr::getIntToPtr(CE, DstEltTy));
+        continue;
+      }
+
       // Truncate and remember this piece.
       Result.push_back(ConstantExpr::getTrunc(Elt, DstEltTy));
     }
@@ -222,9 +232,8 @@
 }
 
 
-/// IsConstantOffsetFromGlobal - If this constant is actually a constant offset
-/// from a global, return the global and the constant.  Because of
-/// constantexprs, this function is recursive.
+/// If this constant is a constant offset from a global, return the global and
+/// the constant. Because of constantexprs, this function is recursive.
 static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
                                        APInt &Offset, const DataLayout &TD) {
   // Trivial case, constant is the global.
@@ -240,7 +249,8 @@
 
   // Look through ptr->int and ptr->ptr casts.
   if (CE->getOpcode() == Instruction::PtrToInt ||
-      CE->getOpcode() == Instruction::BitCast)
+      CE->getOpcode() == Instruction::BitCast ||
+      CE->getOpcode() == Instruction::AddrSpaceCast)
     return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD);
 
   // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
@@ -263,10 +273,10 @@
   return true;
 }
 
-/// ReadDataFromGlobal - Recursive helper to read bits out of global.  C is the
-/// constant being copied out of. ByteOffset is an offset into C.  CurPtr is the
-/// pointer to copy results into and BytesLeft is the number of bytes left in
-/// the CurPtr buffer.  TD is the target data.
+/// Recursive helper to read bits out of global. C is the constant being copied
+/// out of. ByteOffset is an offset into C. CurPtr is the pointer to copy
+/// results into and BytesLeft is the number of bytes left in
+/// the CurPtr buffer. TD is the target data.
 static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
                                unsigned char *CurPtr, unsigned BytesLeft,
                                const DataLayout &TD) {
@@ -517,9 +527,8 @@
   return nullptr;
 }
 
-/// ConstantFoldLoadFromConstPtr - Return the value that a load from C would
-/// produce if it is constant and determinable.  If this is not determinable,
-/// return null.
+/// Return the value that a load from C would produce if it is constant and
+/// determinable. If this is not determinable, return null.
 Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
                                              const DataLayout *TD) {
   // First, try the easy cases:
@@ -609,7 +618,7 @@
   return nullptr;
 }
 
-/// SymbolicallyEvaluateBinop - One of Op0/Op1 is a constant expression.
+/// One of Op0/Op1 is a constant expression.
 /// Attempt to symbolically evaluate the result of a binary operator merging
 /// these together.  If target data info is available, it is provided as DL,
 /// otherwise DL is null.
@@ -666,9 +675,8 @@
   return nullptr;
 }
 
-/// CastGEPIndices - If array indices are not pointer-sized integers,
-/// explicitly cast them so that they aren't implicitly casted by the
-/// getelementptr.
+/// If array indices are not pointer-sized integers, explicitly cast them so
+/// that they aren't implicitly casted by the getelementptr.
 static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
                                 Type *ResultTy, const DataLayout *TD,
                                 const TargetLibraryInfo *TLI) {
@@ -723,8 +731,7 @@
   return Ptr;
 }
 
-/// SymbolicallyEvaluateGEP - If we can symbolically evaluate the specified GEP
-/// constant expression, do so.
+/// If we can symbolically evaluate the GEP constant expression, do so.
 static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
                                          Type *ResultTy, const DataLayout *TD,
                                          const TargetLibraryInfo *TLI) {
@@ -886,7 +893,7 @@
 // Constant Folding public APIs
 //===----------------------------------------------------------------------===//
 
-/// ConstantFoldInstruction - Try to constant fold the specified instruction.
+/// Try to constant fold the specified instruction.
 /// If successful, the constant result is returned, if not, null is returned.
 /// Note that this fails if not all of the operands are constant.  Otherwise,
 /// this function can only fail when attempting to fold instructions like loads
@@ -966,7 +973,7 @@
 static Constant *
 ConstantFoldConstantExpressionImpl(const ConstantExpr *CE, const DataLayout *TD,
                                    const TargetLibraryInfo *TLI,
-                                   SmallPtrSet<ConstantExpr *, 4> &FoldedOps) {
+                                   SmallPtrSetImpl<ConstantExpr *> &FoldedOps) {
   SmallVector<Constant *, 8> Ops;
   for (User::const_op_iterator i = CE->op_begin(), e = CE->op_end(); i != e;
        ++i) {
@@ -974,7 +981,7 @@
     // Recursively fold the ConstantExpr's operands. If we have already folded
     // a ConstantExpr, we don't have to process it again.
     if (ConstantExpr *NewCE = dyn_cast<ConstantExpr>(NewC)) {
-      if (FoldedOps.insert(NewCE))
+      if (FoldedOps.insert(NewCE).second)
         NewC = ConstantFoldConstantExpressionImpl(NewCE, TD, TLI, FoldedOps);
     }
     Ops.push_back(NewC);
@@ -986,7 +993,7 @@
   return ConstantFoldInstOperands(CE->getOpcode(), CE->getType(), Ops, TD, TLI);
 }
 
-/// ConstantFoldConstantExpression - Attempt to fold the constant expression
+/// Attempt to fold the constant expression
 /// using the specified DataLayout.  If successful, the constant result is
 /// result is returned, if not, null is returned.
 Constant *llvm::ConstantFoldConstantExpression(const ConstantExpr *CE,
@@ -996,7 +1003,7 @@
   return ConstantFoldConstantExpressionImpl(CE, TD, TLI, FoldedOps);
 }
 
-/// ConstantFoldInstOperands - Attempt to constant fold an instruction with the
+/// Attempt to constant fold an instruction with the
 /// specified opcode and operands.  If successful, the constant result is
 /// returned, if not, null is returned.  Note that this function can fail when
 /// attempting to fold instructions like loads and stores, which have no
@@ -1101,10 +1108,9 @@
   }
 }
 
-/// ConstantFoldCompareInstOperands - Attempt to constant fold a compare
+/// Attempt to constant fold a compare
 /// instruction (icmp/fcmp) with the specified operands.  If it fails, it
 /// returns a constant expression of the specified operands.
-///
 Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
                                                 Constant *Ops0, Constant *Ops1,
                                                 const DataLayout *TD,
@@ -1191,9 +1197,9 @@
 }
 
 
-/// ConstantFoldLoadThroughGEPConstantExpr - Given a constant and a
-/// getelementptr constantexpr, return the constant value being addressed by the
-/// constant expression, or null if something is funny and we can't decide.
+/// Given a constant and a getelementptr constantexpr, return the constant value
+/// being addressed by the constant expression, or null if something is funny
+/// and we can't decide.
 Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
                                                        ConstantExpr *CE) {
   if (!CE->getOperand(1)->isNullValue())
@@ -1209,10 +1215,9 @@
   return C;
 }
 
-/// ConstantFoldLoadThroughGEPIndices - Given a constant and getelementptr
-/// indices (with an *implied* zero pointer index that is not in the list),
-/// return the constant value being addressed by a virtual load, or null if
-/// something is funny and we can't decide.
+/// Given a constant and getelementptr indices (with an *implied* zero pointer
+/// index that is not in the list), return the constant value being addressed by
+/// a virtual load, or null if something is funny and we can't decide.
 Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
                                                   ArrayRef<Constant*> Indices) {
   // Loop over all of the operands, tracking down which value we are
@@ -1230,11 +1235,12 @@
 //  Constant Folding for Calls
 //
 
-/// canConstantFoldCallTo - Return true if its even possible to fold a call to
-/// the specified function.
+/// Return true if it's even possible to fold a call to the specified function.
 bool llvm::canConstantFoldCallTo(const Function *F) {
   switch (F->getIntrinsicID()) {
   case Intrinsic::fabs:
+  case Intrinsic::minnum:
+  case Intrinsic::maxnum:
   case Intrinsic::log:
   case Intrinsic::log2:
   case Intrinsic::log10:
@@ -1320,7 +1326,7 @@
 }
 
 namespace {
-/// llvm_fenv_clearexcept - Clear the floating-point exception state.
+/// Clear the floating-point exception state.
 static inline void llvm_fenv_clearexcept() {
 #if defined(HAVE_FENV_H) && HAVE_DECL_FE_ALL_EXCEPT
   feclearexcept(FE_ALL_EXCEPT);
@@ -1328,7 +1334,7 @@
   errno = 0;
 }
 
-/// llvm_fenv_testexcept - Test if a floating-point exception was raised.
+/// Test if a floating-point exception was raised.
 static inline bool llvm_fenv_testexcept() {
   int errno_val = errno;
   if (errno_val == ERANGE || errno_val == EDOM)
@@ -1365,14 +1371,13 @@
   return GetConstantFoldFPValue(V, Ty);
 }
 
-/// ConstantFoldConvertToInt - Attempt to an SSE floating point to integer
-/// conversion of a constant floating point. If roundTowardZero is false, the
-/// default IEEE rounding is used (toward nearest, ties to even). This matches
-/// the behavior of the non-truncating SSE instructions in the default rounding
-/// mode. The desired integer type Ty is used to select how many bits are
-/// available for the result. Returns null if the conversion cannot be
-/// performed, otherwise returns the Constant value resulting from the
-/// conversion.
+/// Attempt to fold an SSE floating point to integer conversion of a constant
+/// floating point. If roundTowardZero is false, the default IEEE rounding is
+/// used (toward nearest, ties to even). This matches the behavior of the
+/// non-truncating SSE instructions in the default rounding mode. The desired
+/// integer type Ty is used to select how many bits are available for the
+/// result. Returns null if the conversion cannot be performed, otherwise
+/// returns the Constant value resulting from the conversion.
 static Constant *ConstantFoldConvertToInt(const APFloat &Val,
                                           bool roundTowardZero, Type *Ty) {
   // All of these conversion intrinsics form an integer of at most 64bits.
@@ -1519,8 +1524,14 @@
                  (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())) {
           if (V >= -0.0)
             return ConstantFoldFP(sqrt, V, Ty);
-          else // Undefined
-            return Constant::getNullValue(Ty);
+          else {
+            // Unlike the sqrt definitions in C/C++, POSIX, and IEEE-754 - which
+            // all guarantee or favor returning NaN - the square root of a
+            // negative number is not defined for the LLVM sqrt intrinsic.
+            // This is because the intrinsic should only be emitted in place of
+            // libm's sqrt function when using "no-nans-fp-math".
+            return UndefValue::get(Ty);
+          }
         }
         break;
       case 's':
@@ -1626,6 +1637,19 @@
           V1.copySign(V2);
           return ConstantFP::get(Ty->getContext(), V1);
         }
+
+        if (IntrinsicID == Intrinsic::minnum) {
+          const APFloat &C1 = Op1->getValueAPF();
+          const APFloat &C2 = Op2->getValueAPF();
+          return ConstantFP::get(Ty->getContext(), minnum(C1, C2));
+        }
+
+        if (IntrinsicID == Intrinsic::maxnum) {
+          const APFloat &C1 = Op1->getValueAPF();
+          const APFloat &C2 = Op2->getValueAPF();
+          return ConstantFP::get(Ty->getContext(), maxnum(C1, C2));
+        }
+
         if (!TLI)
           return nullptr;
         if (Name == "pow" && TLI->has(LibFunc::pow))
@@ -1761,7 +1785,7 @@
   return ConstantVector::get(Result);
 }
 
-/// ConstantFoldCall - Attempt to constant fold a call to the specified function
+/// Attempt to constant fold a call to the specified function
 /// with the specified arguments, returning null if unsuccessful.
 Constant *
 llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,

diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index d0784f1..092df5c 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp

@@ -163,16 +163,15 @@
            DstI != DstE; ++DstI) {
         if (isa<StoreInst>(*DstI) || isa<LoadInst>(*DstI)) {
           OS << "da analyze - ";
-          if (Dependence *D = DA->depends(&*SrcI, &*DstI, true)) {
+          if (auto D = DA->depends(&*SrcI, &*DstI, true)) {
             D->dump(OS);
             for (unsigned Level = 1; Level <= D->getLevels(); Level++) {
               if (D->isSplitable(Level)) {
                 OS << "da analyze - split level = " << Level;
-                OS << ", iteration = " << *DA->getSplitIteration(D, Level);
+                OS << ", iteration = " << *DA->getSplitIteration(*D, Level);
                 OS << "!\n";
               }
             }
-            delete D;
           }
           else
             OS << "none!\n";
@@ -782,6 +781,25 @@
   }
 }
 
+void DependenceAnalysis::unifySubscriptType(Subscript *Pair) {
+  const SCEV *Src = Pair->Src;
+  const SCEV *Dst = Pair->Dst;
+  IntegerType *SrcTy = dyn_cast<IntegerType>(Src->getType());
+  IntegerType *DstTy = dyn_cast<IntegerType>(Dst->getType());
+  if (SrcTy == nullptr || DstTy == nullptr) {
+    assert(SrcTy == DstTy && "This function only unify integer types and "
+                             "expect Src and Dst share the same type "
+                             "otherwise.");
+    return;
+  }
+  if (SrcTy->getBitWidth() > DstTy->getBitWidth()) {
+    // Sign-extend Dst to typeof(Src) if typeof(Src) is wider than typeof(Dst).
+    Pair->Dst = SE->getSignExtendExpr(Dst, SrcTy);
+  } else if (SrcTy->getBitWidth() < DstTy->getBitWidth()) {
+    // Sign-extend Src to typeof(Dst) if typeof(Dst) is wider than typeof(Src).
+    Pair->Src = SE->getSignExtendExpr(Src, DstTy);
+  }
+}
 
 // removeMatchingExtensions - Examines a subscript pair.
 // If the source and destination are identically sign (or zero)
@@ -794,9 +812,11 @@
       (isa<SCEVSignExtendExpr>(Src) && isa<SCEVSignExtendExpr>(Dst))) {
     const SCEVCastExpr *SrcCast = cast<SCEVCastExpr>(Src);
     const SCEVCastExpr *DstCast = cast<SCEVCastExpr>(Dst);
-    if (SrcCast->getType() == DstCast->getType()) {
-      Pair->Src = SrcCast->getOperand();
-      Pair->Dst = DstCast->getOperand();
+    const SCEV *SrcCastOp = SrcCast->getOperand();
+    const SCEV *DstCastOp = DstCast->getOperand();
+    if (SrcCastOp->getType() == DstCastOp->getType()) {
+      Pair->Src = SrcCastOp;
+      Pair->Dst = DstCastOp;
     }
   }
 }
@@ -2957,15 +2977,11 @@
                              AddRec->getNoWrapFlags());
   }
   if (SE->isLoopInvariant(AddRec, TargetLoop))
-    return SE->getAddRecExpr(AddRec,
-			     Value,
-			     TargetLoop,
-			     SCEV::FlagAnyWrap);
-  return SE->getAddRecExpr(addToCoefficient(AddRec->getStart(),
-                                            TargetLoop, Value),
-                           AddRec->getStepRecurrence(*SE),
-                           AddRec->getLoop(),
-                           AddRec->getNoWrapFlags());
+    return SE->getAddRecExpr(AddRec, Value, TargetLoop, SCEV::FlagAnyWrap);
+  return SE->getAddRecExpr(
+      addToCoefficient(AddRec->getStart(), TargetLoop, Value),
+      AddRec->getStepRecurrence(*SE), AddRec->getLoop(),
+      AddRec->getNoWrapFlags());
 }
 
 
@@ -3183,7 +3199,7 @@
 bool DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV,
                                         const SCEV *DstSCEV,
                                         SmallVectorImpl<Subscript> &Pair,
-                                        const SCEV *ElementSize) const {
+                                        const SCEV *ElementSize) {
   const SCEVUnknown *SrcBase =
       dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcSCEV));
   const SCEVUnknown *DstBase =
@@ -3238,6 +3254,7 @@
   for (int i = 0; i < size; ++i) {
     Pair[i].Src = SrcSubscripts[i];
     Pair[i].Dst = DstSubscripts[i];
+    unifySubscriptType(&Pair[i]);
 
     // FIXME: we should record the bounds SrcSizes[i] and DstSizes[i] that the
     // delinearization has found, and add these constraints to the dependence
@@ -3277,9 +3294,9 @@
 //
 // Care is required to keep the routine below, getSplitIteration(),
 // up to date with respect to this routine.
-Dependence *DependenceAnalysis::depends(Instruction *Src,
-                                        Instruction *Dst,
-                                        bool PossiblyLoopIndependent) {
+std::unique_ptr<Dependence>
+DependenceAnalysis::depends(Instruction *Src, Instruction *Dst,
+                            bool PossiblyLoopIndependent) {
   if (Src == Dst)
     PossiblyLoopIndependent = false;
 
@@ -3291,7 +3308,7 @@
   if (!isLoadOrStore(Src) || !isLoadOrStore(Dst)) {
     // can only analyze simple loads and stores, i.e., no calls, invokes, etc.
     DEBUG(dbgs() << "can only handle simple loads and stores\n");
-    return new Dependence(Src, Dst);
+    return make_unique<Dependence>(Src, Dst);
   }
 
   Value *SrcPtr = getPointerOperand(Src);
@@ -3302,7 +3319,7 @@
   case AliasAnalysis::PartialAlias:
     // cannot analyse objects if we don't understand their aliasing.
     DEBUG(dbgs() << "can't analyze may or partial alias\n");
-    return new Dependence(Src, Dst);
+    return make_unique<Dependence>(Src, Dst);
   case AliasAnalysis::NoAlias:
     // If the objects noalias, they are distinct, accesses are independent.
     DEBUG(dbgs() << "no alias\n");
@@ -3346,6 +3363,7 @@
          ++SrcIdx, ++DstIdx, ++P) {
       Pair[P].Src = SE->getSCEV(*SrcIdx);
       Pair[P].Dst = SE->getSCEV(*DstIdx);
+      unifySubscriptType(&Pair[P]);
     }
   }
   else {
@@ -3675,9 +3693,9 @@
       return nullptr;
   }
 
-  FullDependence *Final = new FullDependence(Result);
+  auto Final = make_unique<FullDependence>(Result);
   Result.DV = nullptr;
-  return Final;
+  return std::move(Final);
 }
 
 
@@ -3729,13 +3747,12 @@
 //
 // breaks the dependence and allows us to vectorize/parallelize
 // both loops.
-const  SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep,
+const  SCEV *DependenceAnalysis::getSplitIteration(const Dependence &Dep,
                                                    unsigned SplitLevel) {
-  assert(Dep && "expected a pointer to a Dependence");
-  assert(Dep->isSplitable(SplitLevel) &&
+  assert(Dep.isSplitable(SplitLevel) &&
          "Dep should be splitable at SplitLevel");
-  Instruction *Src = Dep->getSrc();
-  Instruction *Dst = Dep->getDst();
+  Instruction *Src = Dep.getSrc();
+  Instruction *Dst = Dep.getDst();
   assert(Src->mayReadFromMemory() || Src->mayWriteToMemory());
   assert(Dst->mayReadFromMemory() || Dst->mayWriteToMemory());
   assert(isLoadOrStore(Src));

diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp
index 74594f8..7ba91bc 100644
--- a/lib/Analysis/DominanceFrontier.cpp
+++ b/lib/Analysis/DominanceFrontier.cpp

@@ -8,133 +8,50 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/DominanceFrontier.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/DominanceFrontierImpl.h"
+
 using namespace llvm;
 
+namespace llvm {
+template class DominanceFrontierBase<BasicBlock>;
+template class ForwardDominanceFrontierBase<BasicBlock>;
+}
+
 char DominanceFrontier::ID = 0;
+
 INITIALIZE_PASS_BEGIN(DominanceFrontier, "domfrontier",
                 "Dominance Frontier Construction", true, true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(DominanceFrontier, "domfrontier",
                 "Dominance Frontier Construction", true, true)
 
-namespace {
-  class DFCalculateWorkObject {
-  public:
-    DFCalculateWorkObject(BasicBlock *B, BasicBlock *P, 
-                          const DomTreeNode *N,
-                          const DomTreeNode *PN)
-    : currentBB(B), parentBB(P), Node(N), parentNode(PN) {}
-    BasicBlock *currentBB;
-    BasicBlock *parentBB;
-    const DomTreeNode *Node;
-    const DomTreeNode *parentNode;
-  };
+DominanceFrontier::DominanceFrontier()
+  : FunctionPass(ID),
+    Base() {
+  initializeDominanceFrontierPass(*PassRegistry::getPassRegistry());
 }
 
-void DominanceFrontier::anchor() { }
-
-const DominanceFrontier::DomSetType &
-DominanceFrontier::calculate(const DominatorTree &DT,
-                             const DomTreeNode *Node) {
-  BasicBlock *BB = Node->getBlock();
-  DomSetType *Result = nullptr;
-
-  std::vector<DFCalculateWorkObject> workList;
-  SmallPtrSet<BasicBlock *, 32> visited;
-
-  workList.push_back(DFCalculateWorkObject(BB, nullptr, Node, nullptr));
-  do {
-    DFCalculateWorkObject *currentW = &workList.back();
-    assert (currentW && "Missing work object.");
-
-    BasicBlock *currentBB = currentW->currentBB;
-    BasicBlock *parentBB = currentW->parentBB;
-    const DomTreeNode *currentNode = currentW->Node;
-    const DomTreeNode *parentNode = currentW->parentNode;
-    assert (currentBB && "Invalid work object. Missing current Basic Block");
-    assert (currentNode && "Invalid work object. Missing current Node");
-    DomSetType &S = Frontiers[currentBB];
-
-    // Visit each block only once.
-    if (visited.count(currentBB) == 0) {
-      visited.insert(currentBB);
-
-      // Loop over CFG successors to calculate DFlocal[currentNode]
-      for (succ_iterator SI = succ_begin(currentBB), SE = succ_end(currentBB);
-           SI != SE; ++SI) {
-        // Does Node immediately dominate this successor?
-        if (DT[*SI]->getIDom() != currentNode)
-          S.insert(*SI);
-      }
-    }
-
-    // At this point, S is DFlocal.  Now we union in DFup's of our children...
-    // Loop through and visit the nodes that Node immediately dominates (Node's
-    // children in the IDomTree)
-    bool visitChild = false;
-    for (DomTreeNode::const_iterator NI = currentNode->begin(), 
-           NE = currentNode->end(); NI != NE; ++NI) {
-      DomTreeNode *IDominee = *NI;
-      BasicBlock *childBB = IDominee->getBlock();
-      if (visited.count(childBB) == 0) {
-        workList.push_back(DFCalculateWorkObject(childBB, currentBB,
-                                                 IDominee, currentNode));
-        visitChild = true;
-      }
-    }
-
-    // If all children are visited or there is any child then pop this block
-    // from the workList.
-    if (!visitChild) {
-
-      if (!parentBB) {
-        Result = &S;
-        break;
-      }
-
-      DomSetType::const_iterator CDFI = S.begin(), CDFE = S.end();
-      DomSetType &parentSet = Frontiers[parentBB];
-      for (; CDFI != CDFE; ++CDFI) {
-        if (!DT.properlyDominates(parentNode, DT[*CDFI]))
-          parentSet.insert(*CDFI);
-      }
-      workList.pop_back();
-    }
-
-  } while (!workList.empty());
-
-  return *Result;
+void DominanceFrontier::releaseMemory() {
+  Base.releaseMemory();
 }
 
-void DominanceFrontierBase::print(raw_ostream &OS, const Module* ) const {
-  for (const_iterator I = begin(), E = end(); I != E; ++I) {
-    OS << "  DomFrontier for BB ";
-    if (I->first)
-      I->first->printAsOperand(OS, false);
-    else
-      OS << " <<exit node>>";
-    OS << " is:\t";
-    
-    const std::set<BasicBlock*> &BBs = I->second;
-    
-    for (std::set<BasicBlock*>::const_iterator I = BBs.begin(), E = BBs.end();
-         I != E; ++I) {
-      OS << ' ';
-      if (*I)
-        (*I)->printAsOperand(OS, false);
-      else
-        OS << "<<exit node>>";
-    }
-    OS << "\n";
-  }
+bool DominanceFrontier::runOnFunction(Function &) {
+  releaseMemory();
+  Base.analyze(getAnalysis<DominatorTreeWrapperPass>().getDomTree());
+  return false;
+}
+
+void DominanceFrontier::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<DominatorTreeWrapperPass>();
+}
+
+void DominanceFrontier::print(raw_ostream &OS, const Module *) const {
+  Base.print(OS);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void DominanceFrontierBase::dump() const {
+void DominanceFrontier::dump() const {
   print(dbgs());
 }
 #endif
-

diff --git a/lib/Analysis/FunctionTargetTransformInfo.cpp b/lib/Analysis/FunctionTargetTransformInfo.cpp
new file mode 100644
index 0000000..a686bec
--- /dev/null
+++ b/lib/Analysis/FunctionTargetTransformInfo.cpp

@@ -0,0 +1,50 @@
+//===- FunctionTargetTransformInfo.cpp - Function TTI wrapper pass --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass wraps a TargetTransformInfo in a FunctionPass so that it can
+// forward along the current Function so that we can make target specific
+// decisions based on the particular subtarget specified for each Function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Analysis/FunctionTargetTransformInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "function-tti"
+static const char ftti_name[] = "Function TargetTransformInfo";
+INITIALIZE_PASS_BEGIN(FunctionTargetTransformInfo, "function_tti", ftti_name, false, true)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_END(FunctionTargetTransformInfo, "function_tti", ftti_name, false, true)
+char FunctionTargetTransformInfo::ID = 0;
+
+namespace llvm {
+FunctionPass *createFunctionTargetTransformInfoPass() {
+  return new FunctionTargetTransformInfo();
+}
+}
+
+FunctionTargetTransformInfo::FunctionTargetTransformInfo()
+  : FunctionPass(ID), Fn(nullptr), TTI(nullptr) {
+  initializeFunctionTargetTransformInfoPass(*PassRegistry::getPassRegistry());
+}
+
+void FunctionTargetTransformInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<TargetTransformInfo>();
+}
+
+void FunctionTargetTransformInfo::releaseMemory() {}
+
+bool FunctionTargetTransformInfo::runOnFunction(Function &F) {
+  Fn = &F;
+  TTI = &getAnalysis<TargetTransformInfo>();
+  return false;
+}

diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index caec253..67cf7f8 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp

@@ -267,7 +267,7 @@
 
 char CallGraphWrapperPass::ID = 0;
 
-void CallGraphWrapperPass::releaseMemory() { G.reset(nullptr); }
+void CallGraphWrapperPass::releaseMemory() { G.reset(); }
 
 void CallGraphWrapperPass::print(raw_ostream &OS, const Module *) const {
   if (!G) {
@@ -282,6 +282,3 @@
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void CallGraphWrapperPass::dump() const { print(dbgs(), nullptr); }
 #endif
-
-// Enuse that users of CallGraph.h also link with this file
-DEFINING_FILE_FOR(CallGraph)

diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index c27edbf..665aa7f 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp

@@ -243,7 +243,14 @@
       
       assert(!CallSites.count(I->first) &&
              "Call site occurs in node multiple times");
-      CallSites.insert(std::make_pair(I->first, I->second));
+      
+      CallSite CS(I->first);
+      if (CS) {
+        Function *Callee = CS.getCalledFunction();
+        // Ignore intrinsics because they're not really function calls.
+        if (!Callee || !(Callee->isIntrinsic()))
+          CallSites.insert(std::make_pair(I->first, I->second));
+      }
       ++I;
     }
     

diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp
index 8807529..85db278 100644
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp

@@ -17,7 +17,9 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
@@ -49,6 +51,9 @@
   /// The TargetTransformInfo available for this compilation.
   const TargetTransformInfo &TTI;
 
+  /// The cache of @llvm.assume intrinsics.
+  AssumptionTracker *AT;
+
   // The called function.
   Function &F;
 
@@ -104,7 +109,7 @@
   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
 
   // Custom analysis routines.
-  bool analyzeBlock(BasicBlock *BB);
+  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
 
   // Disable several entry points to the visitor so we don't accidentally use
   // them by declaring but not defining them here.
@@ -141,8 +146,8 @@
 
 public:
   CallAnalyzer(const DataLayout *DL, const TargetTransformInfo &TTI,
-               Function &Callee, int Threshold)
-      : DL(DL), TTI(TTI), F(Callee), Threshold(Threshold), Cost(0),
+               AssumptionTracker *AT, Function &Callee, int Threshold)
+      : DL(DL), TTI(TTI), AT(AT), F(Callee), Threshold(Threshold), Cost(0),
         IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
@@ -778,7 +783,7 @@
   // during devirtualization and so we want to give it a hefty bonus for
   // inlining, but cap that bonus in the event that inlining wouldn't pan
   // out. Pretend to inline the function, with a custom threshold.
-  CallAnalyzer CA(DL, TTI, *F, InlineConstants::IndirectCallThreshold);
+  CallAnalyzer CA(DL, TTI, AT, *F, InlineConstants::IndirectCallThreshold);
   if (CA.analyzeCall(CS)) {
     // We were able to inline the indirect call! Subtract the cost from the
     // bonus we want to apply, but don't go below zero.
@@ -881,7 +886,8 @@
 /// aborts early if the threshold has been exceeded or an impossible to inline
 /// construct has been detected. It returns false if inlining is no longer
 /// viable, and true if inlining remains viable.
-bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
+bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
+                                SmallPtrSetImpl<const Value *> &EphValues) {
   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
     // FIXME: Currently, the number of instructions in a function regardless of
     // our ability to simplify them during inline to constants or dead code,
@@ -893,6 +899,10 @@
     if (isa<DbgInfoIntrinsic>(I))
       continue;
 
+    // Skip ephemeral values.
+    if (EphValues.count(I))
+      continue;
+
     ++NumInstructions;
     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
       ++NumVectorInstructions;
@@ -967,7 +977,7 @@
       break;
     }
     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
-  } while (Visited.insert(V));
+  } while (Visited.insert(V).second);
 
   Type *IntPtrTy = DL->getIntPtrType(V->getContext());
   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
@@ -1096,6 +1106,12 @@
   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
   NumAllocaArgs = SROAArgValues.size();
 
+  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
+  // the ephemeral values multiple times (and they're completely determined by
+  // the callee, so this is purely duplicate work).
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(&F, AT, EphValues);
+
   // The worklist of live basic blocks in the callee *after* inlining. We avoid
   // adding basic blocks of the callee which can be proven to be dead for this
   // particular call site in order to get more accurate cost estimates. This
@@ -1129,7 +1145,7 @@
 
     // Analyze the cost of this block. If we blow through the threshold, this
     // returns false, and we can bail on out.
-    if (!analyzeBlock(BB)) {
+    if (!analyzeBlock(BB, EphValues)) {
       if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
           HasIndirectBr)
         return false;
@@ -1217,6 +1233,7 @@
 INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                       true, true)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
                     true, true)
 
@@ -1228,12 +1245,14 @@
 
 void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
+  AU.addRequired<AssumptionTracker>();
   AU.addRequired<TargetTransformInfo>();
   CallGraphSCCPass::getAnalysisUsage(AU);
 }
 
 bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
   TTI = &getAnalysis<TargetTransformInfo>();
+  AT = &getAnalysis<AssumptionTracker>();
   return false;
 }
 
@@ -1290,7 +1309,7 @@
   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
         << "...\n");
 
-  CallAnalyzer CA(Callee->getDataLayout(), *TTI, *Callee, Threshold);
+  CallAnalyzer CA(Callee->getDataLayout(), *TTI, AT, *Callee, Threshold);
   bool ShouldInline = CA.analyzeCall(CS);
 
   DEBUG(CA.dump());

diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 24655aa..6b5f370 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp

@@ -84,7 +84,7 @@
 /// form.
 static bool isSimplifiedLoopNest(BasicBlock *BB, const DominatorTree *DT,
                                  const LoopInfo *LI,
-                                 SmallPtrSet<Loop*,16> &SimpleLoopNests) {
+                                 SmallPtrSetImpl<Loop*> &SimpleLoopNests) {
   Loop *NearestLoop = nullptr;
   for (DomTreeNode *Rung = DT->getNode(BB);
        Rung; Rung = Rung->getIDom()) {
@@ -112,10 +112,10 @@
 /// reducible SCEV, recursively add its users to the IVUsesByStride set and
 /// return true.  Otherwise, return false.
 bool IVUsers::AddUsersImpl(Instruction *I,
-                           SmallPtrSet<Loop*,16> &SimpleLoopNests) {
+                           SmallPtrSetImpl<Loop*> &SimpleLoopNests) {
   // Add this IV user to the Processed set before returning false to ensure that
   // all IV users are members of the set. See IVUsers::isIVUserOrOperand.
-  if (!Processed.insert(I))
+  if (!Processed.insert(I).second)
     return true;    // Instruction already handled.
 
   if (!SE->isSCEVable(I->getType()))
@@ -145,7 +145,7 @@
   SmallPtrSet<Instruction *, 4> UniqueUsers;
   for (Use &U : I->uses()) {
     Instruction *User = cast<Instruction>(U.getUser());
-    if (!UniqueUsers.insert(User))
+    if (!UniqueUsers.insert(User).second)
       continue;
 
     // Do not infinitely recurse on PHI nodes.

diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index bd42af1..f151a3a 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp

@@ -41,14 +41,20 @@
 STATISTIC(NumExpand,  "Number of expansions");
 STATISTIC(NumReassoc, "Number of reassociations");
 
+namespace {
 struct Query {
   const DataLayout *DL;
   const TargetLibraryInfo *TLI;
   const DominatorTree *DT;
+  AssumptionTracker *AT;
+  const Instruction *CxtI;
 
   Query(const DataLayout *DL, const TargetLibraryInfo *tli,
-        const DominatorTree *dt) : DL(DL), TLI(tli), DT(dt) {}
+        const DominatorTree *dt, AssumptionTracker *at = nullptr,
+        const Instruction *cxti = nullptr)
+    : DL(DL), TLI(tli), DT(dt), AT(at), CxtI(cxti) {}
 };
+} // end anonymous namespace
 
 static Value *SimplifyAndInst(Value *, Value *, const Query &, unsigned);
 static Value *SimplifyBinOp(unsigned, Value *, Value *, const Query &,
@@ -575,9 +581,10 @@
 
 Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query (DL, TLI, DT),
-                           RecursionLimit);
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW,
+                           Query (DL, TLI, DT, AT, CxtI), RecursionLimit);
 }
 
 /// \brief Compute the base pointer and cumulative constant offsets for V.
@@ -624,7 +631,7 @@
     }
     assert(V->getType()->getScalarType()->isPointerTy() &&
            "Unexpected operand type!");
-  } while (Visited.insert(V));
+  } while (Visited.insert(V).second);
 
   Constant *OffsetIntPtr = ConstantInt::get(IntPtrTy, Offset);
   if (V->getType()->isVectorTy())
@@ -676,6 +683,18 @@
   if (Op0 == Op1)
     return Constant::getNullValue(Op0->getType());
 
+  // X - (0 - Y) -> X if the second sub is NUW.
+  // If Y != 0, 0 - Y is a poison value.
+  // If Y == 0, 0 - Y simplifies to 0.
+  if (BinaryOperator::isNeg(Op1)) {
+    if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) {
+      assert(BO->getOpcode() == Instruction::Sub &&
+             "Expected a subtraction operator!");
+      if (BO->hasNoUnsignedWrap())
+        return Op0;
+    }
+  }
+
   // (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies.
   // For example, (X + Y) - Y -> X; (Y + X) - Y -> X
   Value *X = nullptr, *Y = nullptr, *Z = Op1;
@@ -769,9 +788,10 @@
 
 Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Query (DL, TLI, DT),
-                           RecursionLimit);
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifySubInst(Op0, Op1, isNSW, isNUW,
+                           Query (DL, TLI, DT, AT, CxtI), RecursionLimit);
 }
 
 /// Given operands for an FAdd, see if we can fold the result.  If not, this
@@ -947,28 +967,37 @@
 
 Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifyFAddInst(Op0, Op1, FMF, Query (DL, TLI, DT), RecursionLimit);
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifyFAddInst(Op0, Op1, FMF, Query (DL, TLI, DT, AT, CxtI),
+                            RecursionLimit);
 }
 
 Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifyFSubInst(Op0, Op1, FMF, Query (DL, TLI, DT), RecursionLimit);
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifyFSubInst(Op0, Op1, FMF, Query (DL, TLI, DT, AT, CxtI),
+                            RecursionLimit);
 }
 
 Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1,
                               FastMathFlags FMF,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifyFMulInst(Op0, Op1, FMF, Query (DL, TLI, DT), RecursionLimit);
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifyFMulInst(Op0, Op1, FMF, Query (DL, TLI, DT, AT, CxtI),
+                            RecursionLimit);
 }
 
 Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout *DL,
                              const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifyMulInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifyMulInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                           RecursionLimit);
 }
 
 /// SimplifyDiv - Given operands for an SDiv or UDiv, see if we can
@@ -1028,6 +1057,16 @@
       (!isSigned && match(Op0, m_URem(m_Value(), m_Specific(Op1)))))
     return Constant::getNullValue(Op0->getType());
 
+  // (X /u C1) /u C2 -> 0 if C1 * C2 overflows
+  ConstantInt *C1, *C2;
+  if (!isSigned && match(Op0, m_UDiv(m_Value(X), m_ConstantInt(C1))) &&
+      match(Op1, m_ConstantInt(C2))) {
+    bool Overflow;
+    C1->getValue().umul_ov(C2->getValue(), Overflow);
+    if (Overflow)
+      return Constant::getNullValue(Op0->getType());
+  }
+
   // If the operation is with the result of a select instruction, check whether
   // operating on either branch of the select always yields the same value.
   if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
@@ -1055,8 +1094,11 @@
 
 Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifySDivInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifySDivInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                            RecursionLimit);
 }
 
 /// SimplifyUDivInst - Given operands for a UDiv, see if we can
@@ -1071,8 +1113,11 @@
 
 Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifyUDivInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifyUDivInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                            RecursionLimit);
 }
 
 static Value *SimplifyFDivInst(Value *Op0, Value *Op1, const Query &Q,
@@ -1090,8 +1135,11 @@
 
 Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifyFDivInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifyFDivInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                            RecursionLimit);
 }
 
 /// SimplifyRem - Given operands for an SRem or URem, see if we can
@@ -1133,6 +1181,13 @@
   if (Op0 == Op1)
     return Constant::getNullValue(Op0->getType());
 
+  // (X % Y) % Y -> X % Y
+  if ((Opcode == Instruction::SRem &&
+       match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) ||
+      (Opcode == Instruction::URem &&
+       match(Op0, m_URem(m_Value(), m_Specific(Op1)))))
+    return Op0;
+
   // If the operation is with the result of a select instruction, check whether
   // operating on either branch of the select always yields the same value.
   if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
@@ -1160,8 +1215,11 @@
 
 Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifySRemInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifySRemInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                            RecursionLimit);
 }
 
 /// SimplifyURemInst - Given operands for a URem, see if we can
@@ -1176,8 +1234,11 @@
 
 Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifyURemInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifyURemInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                            RecursionLimit);
 }
 
 static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const Query &,
@@ -1195,8 +1256,11 @@
 
 Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifyFRemInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifyFRemInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                            RecursionLimit);
 }
 
 /// isUndefShift - Returns true if a shift by \c Amount always yields undef.
@@ -1264,6 +1328,32 @@
   return nullptr;
 }
 
+/// \brief Given operands for an LShr or AShr, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyRightShift(unsigned Opcode, Value *Op0, Value *Op1,
+                                 bool isExact, const Query &Q,
+                                 unsigned MaxRecurse) {
+  if (Value *V = SimplifyShift(Opcode, Op0, Op1, Q, MaxRecurse))
+    return V;
+
+  // X >> X -> 0
+  if (Op0 == Op1)
+    return Constant::getNullValue(Op0->getType());
+
+  // The low bit cannot be shifted out of an exact shift if it is set.
+  if (isExact) {
+    unsigned BitWidth = Op0->getType()->getScalarSizeInBits();
+    APInt Op0KnownZero(BitWidth, 0);
+    APInt Op0KnownOne(BitWidth, 0);
+    computeKnownBits(Op0, Op0KnownZero, Op0KnownOne, Q.DL, /*Depth=*/0, Q.AT, Q.CxtI,
+                     Q.DT);
+    if (Op0KnownOne[0])
+      return Op0;
+  }
+
+  return nullptr;
+}
+
 /// SimplifyShlInst - Given operands for an Shl, see if we can
 /// fold the result.  If not, this returns null.
 static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
@@ -1284,8 +1374,9 @@
 
 Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Query (DL, TLI, DT),
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Query (DL, TLI, DT, AT, CxtI),
                            RecursionLimit);
 }
 
@@ -1293,12 +1384,9 @@
 /// fold the result.  If not, this returns null.
 static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
                                const Query &Q, unsigned MaxRecurse) {
-  if (Value *V = SimplifyShift(Instruction::LShr, Op0, Op1, Q, MaxRecurse))
-    return V;
-
-  // X >> X -> 0
-  if (Op0 == Op1)
-    return Constant::getNullValue(Op0->getType());
+  if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q,
+                                    MaxRecurse))
+      return V;
 
   // undef >>l X -> 0
   if (match(Op0, m_Undef()))
@@ -1306,8 +1394,7 @@
 
   // (X << A) >> A -> X
   Value *X;
-  if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1))) &&
-      cast<OverflowingBinaryOperator>(Op0)->hasNoUnsignedWrap())
+  if (match(Op0, m_NUWShl(m_Value(X), m_Specific(Op1))))
     return X;
 
   return nullptr;
@@ -1316,8 +1403,10 @@
 Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifyLShrInst(Op0, Op1, isExact, Query (DL, TLI, DT),
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifyLShrInst(Op0, Op1, isExact, Query (DL, TLI, DT, AT, CxtI),
                             RecursionLimit);
 }
 
@@ -1325,13 +1414,10 @@
 /// fold the result.  If not, this returns null.
 static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
                                const Query &Q, unsigned MaxRecurse) {
-  if (Value *V = SimplifyShift(Instruction::AShr, Op0, Op1, Q, MaxRecurse))
+  if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q,
+                                    MaxRecurse))
     return V;
 
-  // X >> X -> 0
-  if (Op0 == Op1)
-    return Constant::getNullValue(Op0->getType());
-
   // all ones >>a X -> all ones
   if (match(Op0, m_AllOnes()))
     return Op0;
@@ -1342,21 +1428,75 @@
 
   // (X << A) >> A -> X
   Value *X;
-  if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1))) &&
-      cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap())
+  if (match(Op0, m_NSWShl(m_Value(X), m_Specific(Op1))))
     return X;
 
+  // Arithmetic shifting an all-sign-bit value is a no-op.
+  unsigned NumSignBits = ComputeNumSignBits(Op0, Q.DL, 0, Q.AT, Q.CxtI, Q.DT);
+  if (NumSignBits == Op0->getType()->getScalarSizeInBits())
+    return Op0;
+
   return nullptr;
 }
 
 Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifyAShrInst(Op0, Op1, isExact, Query (DL, TLI, DT),
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifyAShrInst(Op0, Op1, isExact, Query (DL, TLI, DT, AT, CxtI),
                             RecursionLimit);
 }
 
+// Simplify (and (icmp ...) (icmp ...)) to false when we can tell that the range
+// of possible values cannot be satisfied.
+static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
+  ICmpInst::Predicate Pred0, Pred1;
+  ConstantInt *CI1, *CI2;
+  Value *V;
+  if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_ConstantInt(CI1)),
+                         m_ConstantInt(CI2))))
+   return nullptr;
+
+  if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Specific(CI1))))
+    return nullptr;
+
+  Type *ITy = Op0->getType();
+
+  auto *AddInst = cast<BinaryOperator>(Op0->getOperand(0));
+  bool isNSW = AddInst->hasNoSignedWrap();
+  bool isNUW = AddInst->hasNoUnsignedWrap();
+
+  const APInt &CI1V = CI1->getValue();
+  const APInt &CI2V = CI2->getValue();
+  const APInt Delta = CI2V - CI1V;
+  if (CI1V.isStrictlyPositive()) {
+    if (Delta == 2) {
+      if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_SGT)
+        return getFalse(ITy);
+      if (Pred0 == ICmpInst::ICMP_SLT && Pred1 == ICmpInst::ICMP_SGT && isNSW)
+        return getFalse(ITy);
+    }
+    if (Delta == 1) {
+      if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_SGT)
+        return getFalse(ITy);
+      if (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGT && isNSW)
+        return getFalse(ITy);
+    }
+  }
+  if (CI1V.getBoolValue() && isNUW) {
+    if (Delta == 2)
+      if (Pred0 == ICmpInst::ICMP_ULT && Pred1 == ICmpInst::ICMP_UGT)
+        return getFalse(ITy);
+    if (Delta == 1)
+      if (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGT)
+        return getFalse(ITy);
+  }
+
+  return nullptr;
+}
+
 /// SimplifyAndInst - Given operands for an And, see if we can
 /// fold the result.  If not, this returns null.
 static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
@@ -1407,12 +1547,21 @@
   // A & (-A) = A if A is a power of two or zero.
   if (match(Op0, m_Neg(m_Specific(Op1))) ||
       match(Op1, m_Neg(m_Specific(Op0)))) {
-    if (isKnownToBeAPowerOfTwo(Op0, /*OrZero*/true))
+    if (isKnownToBeAPowerOfTwo(Op0, /*OrZero*/true, 0, Q.AT, Q.CxtI, Q.DT))
       return Op0;
-    if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true))
+    if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true, 0, Q.AT, Q.CxtI, Q.DT))
       return Op1;
   }
 
+  if (auto *ICILHS = dyn_cast<ICmpInst>(Op0)) {
+    if (auto *ICIRHS = dyn_cast<ICmpInst>(Op1)) {
+      if (Value *V = SimplifyAndOfICmps(ICILHS, ICIRHS))
+        return V;
+      if (Value *V = SimplifyAndOfICmps(ICIRHS, ICILHS))
+        return V;
+    }
+  }
+
   // Try some generic simplifications for associative operations.
   if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q,
                                           MaxRecurse))
@@ -1447,8 +1596,58 @@
 
 Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const DataLayout *DL,
                              const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifyAndInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifyAndInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                           RecursionLimit);
+}
+
+// Simplify (or (icmp ...) (icmp ...)) to true when we can tell that the union
+// contains all possible values.
+static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
+  ICmpInst::Predicate Pred0, Pred1;
+  ConstantInt *CI1, *CI2;
+  Value *V;
+  if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_ConstantInt(CI1)),
+                         m_ConstantInt(CI2))))
+   return nullptr;
+
+  if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Specific(CI1))))
+    return nullptr;
+
+  Type *ITy = Op0->getType();
+
+  auto *AddInst = cast<BinaryOperator>(Op0->getOperand(0));
+  bool isNSW = AddInst->hasNoSignedWrap();
+  bool isNUW = AddInst->hasNoUnsignedWrap();
+
+  const APInt &CI1V = CI1->getValue();
+  const APInt &CI2V = CI2->getValue();
+  const APInt Delta = CI2V - CI1V;
+  if (CI1V.isStrictlyPositive()) {
+    if (Delta == 2) {
+      if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_SLE)
+        return getTrue(ITy);
+      if (Pred0 == ICmpInst::ICMP_SGE && Pred1 == ICmpInst::ICMP_SLE && isNSW)
+        return getTrue(ITy);
+    }
+    if (Delta == 1) {
+      if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_SLE)
+        return getTrue(ITy);
+      if (Pred0 == ICmpInst::ICMP_SGT && Pred1 == ICmpInst::ICMP_SLE && isNSW)
+        return getTrue(ITy);
+    }
+  }
+  if (CI1V.getBoolValue() && isNUW) {
+    if (Delta == 2)
+      if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_ULE)
+        return getTrue(ITy);
+    if (Delta == 1)
+      if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_ULE)
+        return getTrue(ITy);
+  }
+
+  return nullptr;
 }
 
 /// SimplifyOrInst - Given operands for an Or, see if we can
@@ -1508,6 +1707,15 @@
       (A == Op0 || B == Op0))
     return Constant::getAllOnesValue(Op0->getType());
 
+  if (auto *ICILHS = dyn_cast<ICmpInst>(Op0)) {
+    if (auto *ICIRHS = dyn_cast<ICmpInst>(Op1)) {
+      if (Value *V = SimplifyOrOfICmps(ICILHS, ICIRHS))
+        return V;
+      if (Value *V = SimplifyOrOfICmps(ICIRHS, ICILHS))
+        return V;
+    }
+  }
+
   // Try some generic simplifications for associative operations.
   if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q,
                                           MaxRecurse))
@@ -1540,18 +1748,22 @@
       if ((C2->getValue() & (C2->getValue() + 1)) == 0 && // C2 == 0+1+
           match(A, m_Add(m_Value(V1), m_Value(V2)))) {
         // Add commutes, try both ways.
-        if (V1 == B && MaskedValueIsZero(V2, C2->getValue()))
+        if (V1 == B && MaskedValueIsZero(V2, C2->getValue(), Q.DL,
+                                         0, Q.AT, Q.CxtI, Q.DT))
           return A;
-        if (V2 == B && MaskedValueIsZero(V1, C2->getValue()))
+        if (V2 == B && MaskedValueIsZero(V1, C2->getValue(), Q.DL,
+                                         0, Q.AT, Q.CxtI, Q.DT))
           return A;
       }
       // Or commutes, try both ways.
       if ((C1->getValue() & (C1->getValue() + 1)) == 0 &&
           match(B, m_Add(m_Value(V1), m_Value(V2)))) {
         // Add commutes, try both ways.
-        if (V1 == A && MaskedValueIsZero(V2, C1->getValue()))
+        if (V1 == A && MaskedValueIsZero(V2, C1->getValue(), Q.DL,
+                                         0, Q.AT, Q.CxtI, Q.DT))
           return B;
-        if (V2 == A && MaskedValueIsZero(V1, C1->getValue()))
+        if (V2 == A && MaskedValueIsZero(V1, C1->getValue(), Q.DL,
+                                         0, Q.AT, Q.CxtI, Q.DT))
           return B;
       }
     }
@@ -1568,8 +1780,10 @@
 
 Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout *DL,
                             const TargetLibraryInfo *TLI,
-                            const DominatorTree *DT) {
-  return ::SimplifyOrInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                            const DominatorTree *DT, AssumptionTracker *AT,
+                            const Instruction *CxtI) {
+  return ::SimplifyOrInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                          RecursionLimit);
 }
 
 /// SimplifyXorInst - Given operands for a Xor, see if we can
@@ -1623,8 +1837,10 @@
 
 Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const DataLayout *DL,
                              const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifyXorInst(Op0, Op1, Query (DL, TLI, DT), RecursionLimit);
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifyXorInst(Op0, Op1, Query (DL, TLI, DT, AT, CxtI),
+                           RecursionLimit);
 }
 
 static Type *GetCompareTy(Value *Op) {
@@ -1878,40 +2094,46 @@
       return getTrue(ITy);
     case ICmpInst::ICMP_EQ:
     case ICmpInst::ICMP_ULE:
-      if (isKnownNonZero(LHS, Q.DL))
+      if (isKnownNonZero(LHS, Q.DL, 0, Q.AT, Q.CxtI, Q.DT))
         return getFalse(ITy);
       break;
     case ICmpInst::ICMP_NE:
     case ICmpInst::ICMP_UGT:
-      if (isKnownNonZero(LHS, Q.DL))
+      if (isKnownNonZero(LHS, Q.DL, 0, Q.AT, Q.CxtI, Q.DT))
         return getTrue(ITy);
       break;
     case ICmpInst::ICMP_SLT:
-      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL);
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL,
+                     0, Q.AT, Q.CxtI, Q.DT);
       if (LHSKnownNegative)
         return getTrue(ITy);
       if (LHSKnownNonNegative)
         return getFalse(ITy);
       break;
     case ICmpInst::ICMP_SLE:
-      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL);
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL,
+                     0, Q.AT, Q.CxtI, Q.DT);
       if (LHSKnownNegative)
         return getTrue(ITy);
-      if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL))
+      if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL,
+                                                0, Q.AT, Q.CxtI, Q.DT))
         return getFalse(ITy);
       break;
     case ICmpInst::ICMP_SGE:
-      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL);
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL,
+                     0, Q.AT, Q.CxtI, Q.DT);
       if (LHSKnownNegative)
         return getFalse(ITy);
       if (LHSKnownNonNegative)
         return getTrue(ITy);
       break;
     case ICmpInst::ICMP_SGT:
-      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL);
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL,
+                     0, Q.AT, Q.CxtI, Q.DT);
       if (LHSKnownNegative)
         return getFalse(ITy);
-      if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL))
+      if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 
+                                                0, Q.AT, Q.CxtI, Q.DT))
         return getTrue(ITy);
       break;
     }
@@ -1958,13 +2180,39 @@
         Lower = (-Upper) + 1;
       }
     } else if (match(LHS, m_SDiv(m_Value(), m_ConstantInt(CI2)))) {
-      // 'sdiv x, CI2' produces [INT_MIN / CI2, INT_MAX / CI2].
       APInt IntMin = APInt::getSignedMinValue(Width);
       APInt IntMax = APInt::getSignedMaxValue(Width);
-      APInt Val = CI2->getValue().abs();
-      if (!Val.isMinValue()) {
+      APInt Val = CI2->getValue();
+      if (Val.isAllOnesValue()) {
+        // 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX]
+        //    (Upper is exclusive, so IntMax + 1 wraps around to IntMin)
+        Lower = IntMin + 1;
+        Upper = IntMax + 1;
+      } else if (Val.countLeadingZeros() < Width - 1) {
+        // 'sdiv x, CI2' produces [INT_MIN / CI2, INT_MAX / CI2]
+        //    where CI2 != -1 and CI2 != 0 and CI2 != 1
         Lower = IntMin.sdiv(Val);
-        Upper = IntMax.sdiv(Val) + 1;
+        Upper = IntMax.sdiv(Val);
+        if (Lower.sgt(Upper))
+          std::swap(Lower, Upper);
+        Upper = Upper + 1;
+        assert(Upper != Lower && "Upper part of range has wrapped!");
+      }
+    } else if (match(LHS, m_NUWShl(m_ConstantInt(CI2), m_Value()))) {
+      // 'shl nuw CI2, x' produces [CI2, CI2 << CLZ(CI2)]
+      Lower = CI2->getValue();
+      Upper = Lower.shl(Lower.countLeadingZeros()) + 1;
+    } else if (match(LHS, m_NSWShl(m_ConstantInt(CI2), m_Value()))) {
+      if (CI2->isNegative()) {
+        // 'shl nsw CI2, x' produces [CI2 << CLO(CI2)-1, CI2]
+        unsigned ShiftAmount = CI2->getValue().countLeadingOnes() - 1;
+        Lower = CI2->getValue().shl(ShiftAmount);
+        Upper = CI2->getValue() + 1;
+      } else {
+        // 'shl nsw CI2, x' produces [CI2, CI2 << CLZ(CI2)-1]
+        unsigned ShiftAmount = CI2->getValue().countLeadingZeros() - 1;
+        Lower = CI2->getValue();
+        Upper = CI2->getValue().shl(ShiftAmount) + 1;
       }
     } else if (match(LHS, m_LShr(m_Value(), m_ConstantInt(CI2)))) {
       // 'lshr x, CI2' produces [0, UINT_MAX >> CI2].
@@ -2174,25 +2422,6 @@
     }
   }
 
-  // If a bit is known to be zero for A and known to be one for B,
-  // then A and B cannot be equal.
-  if (ICmpInst::isEquality(Pred)) {
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
-      uint32_t BitWidth = CI->getBitWidth();
-      APInt LHSKnownZero(BitWidth, 0);
-      APInt LHSKnownOne(BitWidth, 0);
-      computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
-      APInt RHSKnownZero(BitWidth, 0);
-      APInt RHSKnownOne(BitWidth, 0);
-      computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
-      if (((LHSKnownOne & RHSKnownZero) != 0) ||
-          ((LHSKnownZero & RHSKnownOne) != 0))
-        return (Pred == ICmpInst::ICMP_EQ)
-                   ? ConstantInt::getFalse(CI->getContext())
-                   : ConstantInt::getTrue(CI->getContext());
-    }
-  }
-
   // Special logic for binary operators.
   BinaryOperator *LBO = dyn_cast<BinaryOperator>(LHS);
   BinaryOperator *RBO = dyn_cast<BinaryOperator>(RHS);
@@ -2286,7 +2515,8 @@
       break;
     case ICmpInst::ICMP_SGT:
     case ICmpInst::ICMP_SGE:
-      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL);
+      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL,
+                     0, Q.AT, Q.CxtI, Q.DT);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2296,7 +2526,8 @@
       return getFalse(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
-      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL);
+      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL,
+                     0, Q.AT, Q.CxtI, Q.DT);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2315,7 +2546,8 @@
       break;
     case ICmpInst::ICMP_SGT:
     case ICmpInst::ICMP_SGE:
-      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL);
+      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL,
+                     0, Q.AT, Q.CxtI, Q.DT);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2325,7 +2557,8 @@
       return getTrue(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
-      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL);
+      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL,
+                     0, Q.AT, Q.CxtI, Q.DT);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2345,6 +2578,41 @@
       return getTrue(ITy);
   }
 
+  // handle:
+  //   CI2 << X == CI
+  //   CI2 << X != CI
+  //
+  //   where CI2 is a power of 2 and CI isn't
+  if (auto *CI = dyn_cast<ConstantInt>(RHS)) {
+    const APInt *CI2Val, *CIVal = &CI->getValue();
+    if (LBO && match(LBO, m_Shl(m_APInt(CI2Val), m_Value())) &&
+        CI2Val->isPowerOf2()) {
+      if (!CIVal->isPowerOf2()) {
+        // CI2 << X can equal zero in some circumstances,
+        // this simplification is unsafe if CI is zero.
+        //
+        // We know it is safe if:
+        // - The shift is nsw, we can't shift out the one bit.
+        // - The shift is nuw, we can't shift out the one bit.
+        // - CI2 is one
+        // - CI isn't zero
+        if (LBO->hasNoSignedWrap() || LBO->hasNoUnsignedWrap() ||
+            *CI2Val == 1 || !CI->isZero()) {
+          if (Pred == ICmpInst::ICMP_EQ)
+            return ConstantInt::getFalse(RHS->getContext());
+          if (Pred == ICmpInst::ICMP_NE)
+            return ConstantInt::getTrue(RHS->getContext());
+        }
+      }
+      if (CIVal->isSignBit() && *CI2Val == 1) {
+        if (Pred == ICmpInst::ICMP_UGT)
+          return ConstantInt::getFalse(RHS->getContext());
+        if (Pred == ICmpInst::ICMP_ULE)
+          return ConstantInt::getTrue(RHS->getContext());
+      }
+    }
+  }
+
   if (MaxRecurse && LBO && RBO && LBO->getOpcode() == RBO->getOpcode() &&
       LBO->getOperand(1) == RBO->getOperand(1)) {
     switch (LBO->getOpcode()) {
@@ -2592,6 +2860,23 @@
     }
   }
 
+  // If a bit is known to be zero for A and known to be one for B,
+  // then A and B cannot be equal.
+  if (ICmpInst::isEquality(Pred)) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+      uint32_t BitWidth = CI->getBitWidth();
+      APInt LHSKnownZero(BitWidth, 0);
+      APInt LHSKnownOne(BitWidth, 0);
+      computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, Q.DL, /*Depth=*/0, Q.AT,
+                       Q.CxtI, Q.DT);
+      const APInt &RHSVal = CI->getValue();
+      if (((LHSKnownZero & RHSVal) != 0) || ((LHSKnownOne & ~RHSVal) != 0))
+        return Pred == ICmpInst::ICMP_EQ
+                   ? ConstantInt::getFalse(CI->getContext())
+                   : ConstantInt::getTrue(CI->getContext());
+    }
+  }
+
   // If the comparison is with the result of a select instruction, check whether
   // comparing with either branch of the select always yields the same value.
   if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
@@ -2610,8 +2895,10 @@
 Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifyICmpInst(Predicate, LHS, RHS, Query (DL, TLI, DT),
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              Instruction *CxtI) {
+  return ::SimplifyICmpInst(Predicate, LHS, RHS, Query (DL, TLI, DT, AT, CxtI),
                             RecursionLimit);
 }
 
@@ -2707,8 +2994,10 @@
 Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI,
-                              const DominatorTree *DT) {
-  return ::SimplifyFCmpInst(Predicate, LHS, RHS, Query (DL, TLI, DT),
+                              const DominatorTree *DT,
+                              AssumptionTracker *AT,
+                              const Instruction *CxtI) {
+  return ::SimplifyFCmpInst(Predicate, LHS, RHS, Query (DL, TLI, DT, AT, CxtI),
                             RecursionLimit);
 }
 
@@ -2746,9 +3035,11 @@
 Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
                                 const DataLayout *DL,
                                 const TargetLibraryInfo *TLI,
-                                const DominatorTree *DT) {
-  return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Query (DL, TLI, DT),
-                              RecursionLimit);
+                                const DominatorTree *DT,
+                                AssumptionTracker *AT,
+                                const Instruction *CxtI) {
+  return ::SimplifySelectInst(Cond, TrueVal, FalseVal,
+                              Query (DL, TLI, DT, AT, CxtI), RecursionLimit);
 }
 
 /// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can
@@ -2756,29 +3047,72 @@
 static Value *SimplifyGEPInst(ArrayRef<Value *> Ops, const Query &Q, unsigned) {
   // The type of the GEP pointer operand.
   PointerType *PtrTy = cast<PointerType>(Ops[0]->getType()->getScalarType());
+  unsigned AS = PtrTy->getAddressSpace();
 
   // getelementptr P -> P.
   if (Ops.size() == 1)
     return Ops[0];
 
-  if (isa<UndefValue>(Ops[0])) {
-    // Compute the (pointer) type returned by the GEP instruction.
-    Type *LastType = GetElementPtrInst::getIndexedType(PtrTy, Ops.slice(1));
-    Type *GEPTy = PointerType::get(LastType, PtrTy->getAddressSpace());
-    if (VectorType *VT = dyn_cast<VectorType>(Ops[0]->getType()))
-      GEPTy = VectorType::get(GEPTy, VT->getNumElements());
+  // Compute the (pointer) type returned by the GEP instruction.
+  Type *LastType = GetElementPtrInst::getIndexedType(PtrTy, Ops.slice(1));
+  Type *GEPTy = PointerType::get(LastType, AS);
+  if (VectorType *VT = dyn_cast<VectorType>(Ops[0]->getType()))
+    GEPTy = VectorType::get(GEPTy, VT->getNumElements());
+
+  if (isa<UndefValue>(Ops[0]))
     return UndefValue::get(GEPTy);
-  }
 
   if (Ops.size() == 2) {
     // getelementptr P, 0 -> P.
     if (match(Ops[1], m_Zero()))
       return Ops[0];
-    // getelementptr P, N -> P if P points to a type of zero size.
-    if (Q.DL) {
-      Type *Ty = PtrTy->getElementType();
-      if (Ty->isSized() && Q.DL->getTypeAllocSize(Ty) == 0)
+
+    Type *Ty = PtrTy->getElementType();
+    if (Q.DL && Ty->isSized()) {
+      Value *P;
+      uint64_t C;
+      uint64_t TyAllocSize = Q.DL->getTypeAllocSize(Ty);
+      // getelementptr P, N -> P if P points to a type of zero size.
+      if (TyAllocSize == 0)
         return Ops[0];
+
+      // The following transforms are only safe if the ptrtoint cast
+      // doesn't truncate the pointers.
+      if (Ops[1]->getType()->getScalarSizeInBits() ==
+          Q.DL->getPointerSizeInBits(AS)) {
+        auto PtrToIntOrZero = [GEPTy](Value *P) -> Value * {
+          if (match(P, m_Zero()))
+            return Constant::getNullValue(GEPTy);
+          Value *Temp;
+          if (match(P, m_PtrToInt(m_Value(Temp))))
+            if (Temp->getType() == GEPTy)
+              return Temp;
+          return nullptr;
+        };
+
+        // getelementptr V, (sub P, V) -> P if P points to a type of size 1.
+        if (TyAllocSize == 1 &&
+            match(Ops[1], m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0])))))
+          if (Value *R = PtrToIntOrZero(P))
+            return R;
+
+        // getelementptr V, (ashr (sub P, V), C) -> P
+        // if P points to a type of size 1 << C.
+        if (match(Ops[1],
+                  m_AShr(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))),
+                         m_ConstantInt(C))) &&
+            TyAllocSize == 1ULL << C)
+          if (Value *R = PtrToIntOrZero(P))
+            return R;
+
+        // getelementptr V, (sdiv (sub P, V), C) -> P
+        // if P points to a type of size C.
+        if (match(Ops[1],
+                  m_SDiv(m_Sub(m_Value(P), m_PtrToInt(m_Specific(Ops[0]))),
+                         m_SpecificInt(TyAllocSize))))
+          if (Value *R = PtrToIntOrZero(P))
+            return R;
+      }
     }
   }
 
@@ -2792,8 +3126,9 @@
 
 Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout *DL,
                              const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifyGEPInst(Ops, Query (DL, TLI, DT), RecursionLimit);
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifyGEPInst(Ops, Query (DL, TLI, DT, AT, CxtI), RecursionLimit);
 }
 
 /// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we
@@ -2829,8 +3164,11 @@
                                      ArrayRef<unsigned> Idxs,
                                      const DataLayout *DL,
                                      const TargetLibraryInfo *TLI,
-                                     const DominatorTree *DT) {
-  return ::SimplifyInsertValueInst(Agg, Val, Idxs, Query (DL, TLI, DT),
+                                     const DominatorTree *DT,
+                                     AssumptionTracker *AT,
+                                     const Instruction *CxtI) {
+  return ::SimplifyInsertValueInst(Agg, Val, Idxs,
+                                   Query (DL, TLI, DT, AT, CxtI),
                                    RecursionLimit);
 }
 
@@ -2877,8 +3215,11 @@
 
 Value *llvm::SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *DL,
                                const TargetLibraryInfo *TLI,
-                               const DominatorTree *DT) {
-  return ::SimplifyTruncInst(Op, Ty, Query (DL, TLI, DT), RecursionLimit);
+                               const DominatorTree *DT,
+                               AssumptionTracker *AT,
+                               const Instruction *CxtI) {
+  return ::SimplifyTruncInst(Op, Ty, Query (DL, TLI, DT, AT, CxtI),
+                             RecursionLimit);
 }
 
 //=== Helper functions for higher up the class hierarchy.
@@ -2950,8 +3291,10 @@
 
 Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
                            const DataLayout *DL, const TargetLibraryInfo *TLI,
-                           const DominatorTree *DT) {
-  return ::SimplifyBinOp(Opcode, LHS, RHS, Query (DL, TLI, DT), RecursionLimit);
+                           const DominatorTree *DT, AssumptionTracker *AT,
+                           const Instruction *CxtI) {
+  return ::SimplifyBinOp(Opcode, LHS, RHS, Query (DL, TLI, DT, AT, CxtI),
+                         RecursionLimit);
 }
 
 /// SimplifyCmpInst - Given operands for a CmpInst, see if we can
@@ -2965,8 +3308,9 @@
 
 Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const DominatorTree *DT) {
-  return ::SimplifyCmpInst(Predicate, LHS, RHS, Query (DL, TLI, DT),
+                             const DominatorTree *DT, AssumptionTracker *AT,
+                             const Instruction *CxtI) {
+  return ::SimplifyCmpInst(Predicate, LHS, RHS, Query (DL, TLI, DT, AT, CxtI),
                            RecursionLimit);
 }
 
@@ -3041,23 +3385,26 @@
 Value *llvm::SimplifyCall(Value *V, User::op_iterator ArgBegin,
                           User::op_iterator ArgEnd, const DataLayout *DL,
                           const TargetLibraryInfo *TLI,
-                          const DominatorTree *DT) {
-  return ::SimplifyCall(V, ArgBegin, ArgEnd, Query(DL, TLI, DT),
+                          const DominatorTree *DT, AssumptionTracker *AT,
+                          const Instruction *CxtI) {
+  return ::SimplifyCall(V, ArgBegin, ArgEnd, Query(DL, TLI, DT, AT, CxtI),
                         RecursionLimit);
 }
 
 Value *llvm::SimplifyCall(Value *V, ArrayRef<Value *> Args,
                           const DataLayout *DL, const TargetLibraryInfo *TLI,
-                          const DominatorTree *DT) {
-  return ::SimplifyCall(V, Args.begin(), Args.end(), Query(DL, TLI, DT),
-                        RecursionLimit);
+                          const DominatorTree *DT, AssumptionTracker *AT,
+                          const Instruction *CxtI) {
+  return ::SimplifyCall(V, Args.begin(), Args.end(),
+                        Query(DL, TLI, DT, AT, CxtI), RecursionLimit);
 }
 
 /// SimplifyInstruction - See if we can compute a simplified version of this
 /// instruction.  If not, this returns null.
 Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout *DL,
                                  const TargetLibraryInfo *TLI,
-                                 const DominatorTree *DT) {
+                                 const DominatorTree *DT,
+                                 AssumptionTracker *AT) {
   Value *Result;
 
   switch (I->getOpcode()) {
@@ -3066,109 +3413,122 @@
     break;
   case Instruction::FAdd:
     Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1),
-                              I->getFastMathFlags(), DL, TLI, DT);
+                              I->getFastMathFlags(), DL, TLI, DT, AT, I);
     break;
   case Instruction::Add:
     Result = SimplifyAddInst(I->getOperand(0), I->getOperand(1),
                              cast<BinaryOperator>(I)->hasNoSignedWrap(),
                              cast<BinaryOperator>(I)->hasNoUnsignedWrap(),
-                             DL, TLI, DT);
+                             DL, TLI, DT, AT, I);
     break;
   case Instruction::FSub:
     Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1),
-                              I->getFastMathFlags(), DL, TLI, DT);
+                              I->getFastMathFlags(), DL, TLI, DT, AT, I);
     break;
   case Instruction::Sub:
     Result = SimplifySubInst(I->getOperand(0), I->getOperand(1),
                              cast<BinaryOperator>(I)->hasNoSignedWrap(),
                              cast<BinaryOperator>(I)->hasNoUnsignedWrap(),
-                             DL, TLI, DT);
+                             DL, TLI, DT, AT, I);
     break;
   case Instruction::FMul:
     Result = SimplifyFMulInst(I->getOperand(0), I->getOperand(1),
-                              I->getFastMathFlags(), DL, TLI, DT);
+                              I->getFastMathFlags(), DL, TLI, DT, AT, I);
     break;
   case Instruction::Mul:
-    Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1),
+                             DL, TLI, DT, AT, I);
     break;
   case Instruction::SDiv:
-    Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1),
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::UDiv:
-    Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1),
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::FDiv:
-    Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1),
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::SRem:
-    Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1),
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::URem:
-    Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1),
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::FRem:
-    Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1),
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::Shl:
     Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1),
                              cast<BinaryOperator>(I)->hasNoSignedWrap(),
                              cast<BinaryOperator>(I)->hasNoUnsignedWrap(),
-                             DL, TLI, DT);
+                             DL, TLI, DT, AT, I);
     break;
   case Instruction::LShr:
     Result = SimplifyLShrInst(I->getOperand(0), I->getOperand(1),
                               cast<BinaryOperator>(I)->isExact(),
-                              DL, TLI, DT);
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::AShr:
     Result = SimplifyAShrInst(I->getOperand(0), I->getOperand(1),
                               cast<BinaryOperator>(I)->isExact(),
-                              DL, TLI, DT);
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::And:
-    Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1),
+                             DL, TLI, DT, AT, I);
     break;
   case Instruction::Or:
-    Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
+                            AT, I);
     break;
   case Instruction::Xor:
-    Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+    Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1),
+                             DL, TLI, DT, AT, I);
     break;
   case Instruction::ICmp:
     Result = SimplifyICmpInst(cast<ICmpInst>(I)->getPredicate(),
-                              I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+                              I->getOperand(0), I->getOperand(1),
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::FCmp:
     Result = SimplifyFCmpInst(cast<FCmpInst>(I)->getPredicate(),
-                              I->getOperand(0), I->getOperand(1), DL, TLI, DT);
+                              I->getOperand(0), I->getOperand(1),
+                              DL, TLI, DT, AT, I);
     break;
   case Instruction::Select:
     Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1),
-                                I->getOperand(2), DL, TLI, DT);
+                                I->getOperand(2), DL, TLI, DT, AT, I);
     break;
   case Instruction::GetElementPtr: {
     SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
-    Result = SimplifyGEPInst(Ops, DL, TLI, DT);
+    Result = SimplifyGEPInst(Ops, DL, TLI, DT, AT, I);
     break;
   }
   case Instruction::InsertValue: {
     InsertValueInst *IV = cast<InsertValueInst>(I);
     Result = SimplifyInsertValueInst(IV->getAggregateOperand(),
                                      IV->getInsertedValueOperand(),
-                                     IV->getIndices(), DL, TLI, DT);
+                                     IV->getIndices(), DL, TLI, DT, AT, I);
     break;
   }
   case Instruction::PHI:
-    Result = SimplifyPHINode(cast<PHINode>(I), Query (DL, TLI, DT));
+    Result = SimplifyPHINode(cast<PHINode>(I), Query (DL, TLI, DT, AT, I));
     break;
   case Instruction::Call: {
     CallSite CS(cast<CallInst>(I));
     Result = SimplifyCall(CS.getCalledValue(), CS.arg_begin(), CS.arg_end(),
-                          DL, TLI, DT);
+                          DL, TLI, DT, AT, I);
     break;
   }
   case Instruction::Trunc:
-    Result = SimplifyTruncInst(I->getOperand(0), I->getType(), DL, TLI, DT);
+    Result = SimplifyTruncInst(I->getOperand(0), I->getType(), DL, TLI, DT,
+                               AT, I);
     break;
   }
 
@@ -3192,7 +3552,8 @@
 static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
                                               const DataLayout *DL,
                                               const TargetLibraryInfo *TLI,
-                                              const DominatorTree *DT) {
+                                              const DominatorTree *DT,
+                                              AssumptionTracker *AT) {
   bool Simplified = false;
   SmallSetVector<Instruction *, 8> Worklist;
 
@@ -3219,7 +3580,7 @@
     I = Worklist[Idx];
 
     // See if this instruction simplifies.
-    SimpleV = SimplifyInstruction(I, DL, TLI, DT);
+    SimpleV = SimplifyInstruction(I, DL, TLI, DT, AT);
     if (!SimpleV)
       continue;
 
@@ -3245,15 +3606,17 @@
 bool llvm::recursivelySimplifyInstruction(Instruction *I,
                                           const DataLayout *DL,
                                           const TargetLibraryInfo *TLI,
-                                          const DominatorTree *DT) {
-  return replaceAndRecursivelySimplifyImpl(I, nullptr, DL, TLI, DT);
+                                          const DominatorTree *DT,
+                                          AssumptionTracker *AT) {
+  return replaceAndRecursivelySimplifyImpl(I, nullptr, DL, TLI, DT, AT);
 }
 
 bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV,
                                          const DataLayout *DL,
                                          const TargetLibraryInfo *TLI,
-                                         const DominatorTree *DT) {
+                                         const DominatorTree *DT,
+                                         AssumptionTracker *AT) {
   assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!");
   assert(SimpleV && "Must provide a simplified value.");
-  return replaceAndRecursivelySimplifyImpl(I, SimpleV, DL, TLI, DT);
+  return replaceAndRecursivelySimplifyImpl(I, SimpleV, DL, TLI, DT, AT);
 }

diff --git a/lib/Analysis/JumpInstrTableInfo.cpp b/lib/Analysis/JumpInstrTableInfo.cpp
index b5b4265..7aae2a5 100644
--- a/lib/Analysis/JumpInstrTableInfo.cpp
+++ b/lib/Analysis/JumpInstrTableInfo.cpp

@@ -17,6 +17,7 @@
 #include "llvm/Analysis/Passes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/MathExtras.h"
 
 using namespace llvm;
 
@@ -28,7 +29,21 @@
   return new JumpInstrTableInfo();
 }
 
-JumpInstrTableInfo::JumpInstrTableInfo() : ImmutablePass(ID), Tables() {
+ModulePass *llvm::createJumpInstrTableInfoPass(unsigned Bound) {
+  // This cast is always safe, since Bound is always in a subset of uint64_t.
+  uint64_t B = static_cast<uint64_t>(Bound);
+  return new JumpInstrTableInfo(B);
+}
+
+JumpInstrTableInfo::JumpInstrTableInfo(uint64_t ByteAlign)
+    : ImmutablePass(ID), Tables(), ByteAlignment(ByteAlign) {
+  if (!llvm::isPowerOf2_64(ByteAlign)) {
+    // Note that we don't explicitly handle overflow here, since we handle the 0
+    // case explicitly when a caller actually tries to create jumptable entries,
+    // and this is the return value on overflow.
+    ByteAlignment = llvm::NextPowerOf2(ByteAlign);
+  }
+
   initializeJumpInstrTableInfoPass(*PassRegistry::getPassRegistry());
 }
 

diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp
index e073616..767da4e 100644
--- a/lib/Analysis/LazyCallGraph.cpp
+++ b/lib/Analysis/LazyCallGraph.cpp

@@ -48,7 +48,7 @@
     }
 
     for (Value *Op : C->operand_values())
-      if (Visited.insert(cast<Constant>(Op)))
+      if (Visited.insert(cast<Constant>(Op)).second)
         Worklist.push_back(cast<Constant>(Op));
   }
 }
@@ -66,7 +66,7 @@
     for (Instruction &I : BB)
       for (Value *Op : I.operand_values())
         if (Constant *C = dyn_cast<Constant>(Op))
-          if (Visited.insert(C))
+          if (Visited.insert(C).second)
             Worklist.push_back(C);
 
   // We've collected all the constant (and thus potentially function or
@@ -113,7 +113,7 @@
   SmallPtrSet<Constant *, 16> Visited;
   for (GlobalVariable &GV : M.globals())
     if (GV.hasInitializer())
-      if (Visited.insert(GV.getInitializer()))
+      if (Visited.insert(GV.getInitializer()).second)
         Worklist.push_back(GV.getInitializer());
 
   DEBUG(dbgs() << "  Adding functions referenced by global initializers to the "
@@ -688,7 +688,7 @@
                        SmallPtrSetImpl<LazyCallGraph::Node *> &Printed) {
   // Recurse depth first through the nodes.
   for (LazyCallGraph::Node &ChildN : N)
-    if (Printed.insert(&ChildN))
+    if (Printed.insert(&ChildN).second)
       printNodes(OS, ChildN, Printed);
 
   OS << "  Call edges in function: " << N.getFunction().getName() << "\n";
@@ -717,7 +717,7 @@
 
   SmallPtrSet<LazyCallGraph::Node *, 16> Printed;
   for (LazyCallGraph::Node &N : G)
-    if (Printed.insert(&N))
+    if (Printed.insert(&N).second)
       printNodes(OS, N, Printed);
 
   for (LazyCallGraph::SCC &SCC : G.postorder_sccs())

diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index 9f919f7..c712c9f 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp

@@ -15,12 +15,14 @@
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
@@ -38,6 +40,7 @@
 char LazyValueInfo::ID = 0;
 INITIALIZE_PASS_BEGIN(LazyValueInfo, "lazy-value-info",
                 "Lazy Value Information Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(LazyValueInfo, "lazy-value-info",
                 "Lazy Value Information Analysis", false, true)
@@ -338,6 +341,13 @@
     /// during a query.  It basically emulates the callstack of the naive
     /// recursive value lookup process.
     std::stack<std::pair<BasicBlock*, Value*> > BlockValueStack;
+
+    /// A pointer to the cache of @llvm.assume calls.
+    AssumptionTracker *AT;
+    /// An optional DL pointer.
+    const DataLayout *DL;
+    /// An optional DT pointer.
+    DominatorTree *DT;
     
     friend struct LVIValueHandle;
     
@@ -364,7 +374,8 @@
 
     LVILatticeVal getBlockValue(Value *Val, BasicBlock *BB);
     bool getEdgeValue(Value *V, BasicBlock *F, BasicBlock *T,
-                      LVILatticeVal &Result);
+                      LVILatticeVal &Result,
+                      Instruction *CxtI = nullptr);
     bool hasBlockValue(Value *Val, BasicBlock *BB);
 
     // These methods process one work item and may add more. A false value
@@ -377,6 +388,8 @@
                                 PHINode *PN, BasicBlock *BB);
     bool solveBlockValueConstantRange(LVILatticeVal &BBLV,
                                       Instruction *BBI, BasicBlock *BB);
+    void mergeAssumeBlockValueConstantRange(Value *Val, LVILatticeVal &BBLV,
+                                            Instruction *BBI);
 
     void solve();
     
@@ -387,11 +400,18 @@
   public:
     /// getValueInBlock - This is the query interface to determine the lattice
     /// value for the specified Value* at the end of the specified block.
-    LVILatticeVal getValueInBlock(Value *V, BasicBlock *BB);
+    LVILatticeVal getValueInBlock(Value *V, BasicBlock *BB,
+                                  Instruction *CxtI = nullptr);
+
+    /// getValueAt - This is the query interface to determine the lattice
+    /// value for the specified Value* at the specified instruction (generally
+    /// from an assume intrinsic).
+    LVILatticeVal getValueAt(Value *V, Instruction *CxtI);
 
     /// getValueOnEdge - This is the query interface to determine the lattice
     /// value for the specified Value* that is true on the specified edge.
-    LVILatticeVal getValueOnEdge(Value *V, BasicBlock *FromBB,BasicBlock *ToBB);
+    LVILatticeVal getValueOnEdge(Value *V, BasicBlock *FromBB,BasicBlock *ToBB,
+                                 Instruction *CxtI = nullptr);
     
     /// threadEdge - This is the update interface to inform the cache that an
     /// edge from PredBB to OldSucc has been threaded to be from PredBB to
@@ -408,6 +428,10 @@
       ValueCache.clear();
       OverDefinedCache.clear();
     }
+
+    LazyValueInfoCache(AssumptionTracker *AT,
+                       const DataLayout *DL = nullptr,
+                       DominatorTree *DT = nullptr) : AT(AT), DL(DL), DT(DT) {}
   };
 } // end anonymous namespace
 
@@ -500,7 +524,6 @@
   // cache needs updating, i.e. if we have solve a new value or not.
   OverDefinedCacheUpdater ODCacheUpdater(Val, BB, BBLV, this);
 
-  // If we've already computed this block's value, return it.
   if (!BBLV.isUndefined()) {
     DEBUG(dbgs() << "  reuse BB '" << BB->getName() << "' val=" << BBLV <<'\n');
     
@@ -669,7 +692,10 @@
     BasicBlock *PhiBB = PN->getIncomingBlock(i);
     Value *PhiVal = PN->getIncomingValue(i);
     LVILatticeVal EdgeResult;
-    EdgesMissing |= !getEdgeValue(PhiVal, PhiBB, BB, EdgeResult);
+    // Note that we can provide PN as the context value to getEdgeValue, even
+    // though the results will be cached, because PN is the value being used as
+    // the cache key in the caller.
+    EdgesMissing |= !getEdgeValue(PhiVal, PhiBB, BB, EdgeResult, PN);
     if (EdgesMissing)
       continue;
 
@@ -694,6 +720,36 @@
   return true;
 }
 
+static bool getValueFromFromCondition(Value *Val, ICmpInst *ICI,
+                                      LVILatticeVal &Result,
+                                      bool isTrueDest = true);
+
+// If we can determine a constant range for the value Val at the context
+// provided by the instruction BBI, then merge it into BBLV. Nothing is
+// returned; BBLV is updated in place when a range is found.
+void LazyValueInfoCache::mergeAssumeBlockValueConstantRange(
+  Value *Val, LVILatticeVal &BBLV, Instruction *BBI) {
+  BBI = BBI ? BBI : dyn_cast<Instruction>(Val);
+  if (!BBI)
+    return;
+
+  for (auto &I : AT->assumptions(BBI->getParent()->getParent())) {
+    if (!isValidAssumeForContext(I, BBI, DL, DT))
+      continue;
+
+    Value *C = I->getArgOperand(0);
+    if (ICmpInst *ICI = dyn_cast<ICmpInst>(C)) {
+      LVILatticeVal Result;
+      if (getValueFromFromCondition(Val, ICI, Result)) {
+        if (BBLV.isOverdefined())
+          BBLV = Result;
+        else
+          BBLV.mergeIn(Result);
+      }
+    }
+  }
+}
+
 bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV,
                                                       Instruction *BBI,
                                                       BasicBlock *BB) {
@@ -704,6 +760,7 @@
   }
 
   LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
+  mergeAssumeBlockValueConstantRange(BBI->getOperand(0), LHSVal, BBI);
   if (!LHSVal.isConstantRange()) {
     BBLV.markOverdefined();
     return true;
@@ -775,6 +832,47 @@
   return true;
 }
 
+bool getValueFromFromCondition(Value *Val, ICmpInst *ICI,
+                               LVILatticeVal &Result, bool isTrueDest) {
+  if (ICI && isa<Constant>(ICI->getOperand(1))) {
+    if (ICI->isEquality() && ICI->getOperand(0) == Val) {
+      // We know that V has the RHS constant if this is a true SETEQ or
+      // false SETNE. 
+      if (isTrueDest == (ICI->getPredicate() == ICmpInst::ICMP_EQ))
+        Result = LVILatticeVal::get(cast<Constant>(ICI->getOperand(1)));
+      else
+        Result = LVILatticeVal::getNot(cast<Constant>(ICI->getOperand(1)));
+      return true;
+    }
+
+    // Recognize the range checking idiom that InstCombine produces.
+    // (X-C1) u< C2 --> [C1, C1+C2)
+    ConstantInt *NegOffset = nullptr;
+    if (ICI->getPredicate() == ICmpInst::ICMP_ULT)
+      match(ICI->getOperand(0), m_Add(m_Specific(Val),
+                                      m_ConstantInt(NegOffset)));
+
+    ConstantInt *CI = dyn_cast<ConstantInt>(ICI->getOperand(1));
+    if (CI && (ICI->getOperand(0) == Val || NegOffset)) {
+      // Calculate the range of values that would satisfy the comparison.
+      ConstantRange CmpRange(CI->getValue());
+      ConstantRange TrueValues =
+        ConstantRange::makeICmpRegion(ICI->getPredicate(), CmpRange);
+
+      if (NegOffset) // Apply the offset from above.
+        TrueValues = TrueValues.subtract(NegOffset->getValue());
+
+      // If we're interested in the false dest, invert the condition.
+      if (!isTrueDest) TrueValues = TrueValues.inverse();
+
+      Result = LVILatticeVal::getRange(TrueValues);
+      return true;
+    }
+  }
+
+  return false;
+}
+
 /// \brief Compute the value of Val on the edge BBFrom -> BBTo. Returns false if
 /// Val is not constrained on the edge.
 static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
@@ -801,41 +899,8 @@
       // If the condition of the branch is an equality comparison, we may be
       // able to infer the value.
       ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition());
-      if (ICI && isa<Constant>(ICI->getOperand(1))) {
-        if (ICI->isEquality() && ICI->getOperand(0) == Val) {
-          // We know that V has the RHS constant if this is a true SETEQ or
-          // false SETNE. 
-          if (isTrueDest == (ICI->getPredicate() == ICmpInst::ICMP_EQ))
-            Result = LVILatticeVal::get(cast<Constant>(ICI->getOperand(1)));
-          else
-            Result = LVILatticeVal::getNot(cast<Constant>(ICI->getOperand(1)));
-          return true;
-        }
-
-        // Recognize the range checking idiom that InstCombine produces.
-        // (X-C1) u< C2 --> [C1, C1+C2)
-        ConstantInt *NegOffset = nullptr;
-        if (ICI->getPredicate() == ICmpInst::ICMP_ULT)
-          match(ICI->getOperand(0), m_Add(m_Specific(Val),
-                                          m_ConstantInt(NegOffset)));
-
-        ConstantInt *CI = dyn_cast<ConstantInt>(ICI->getOperand(1));
-        if (CI && (ICI->getOperand(0) == Val || NegOffset)) {
-          // Calculate the range of values that would satisfy the comparison.
-          ConstantRange CmpRange(CI->getValue());
-          ConstantRange TrueValues =
-            ConstantRange::makeICmpRegion(ICI->getPredicate(), CmpRange);
-
-          if (NegOffset) // Apply the offset from above.
-            TrueValues = TrueValues.subtract(NegOffset->getValue());
-
-          // If we're interested in the false dest, invert the condition.
-          if (!isTrueDest) TrueValues = TrueValues.inverse();
-
-          Result = LVILatticeVal::getRange(TrueValues);
-          return true;
-        }
-      }
+      if (getValueFromFromCondition(Val, ICI, Result, isTrueDest))
+        return true;
     }
   }
 
@@ -869,7 +934,8 @@
 /// \brief Compute the value of Val on the edge BBFrom -> BBTo, or the value at
 /// the basic block if the edge does not constraint Val.
 bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
-                                      BasicBlock *BBTo, LVILatticeVal &Result) {
+                                      BasicBlock *BBTo, LVILatticeVal &Result,
+                                      Instruction *CxtI) {
   // If already a constant, there is nothing to compute.
   if (Constant *VC = dyn_cast<Constant>(Val)) {
     Result = LVILatticeVal::get(VC);
@@ -891,6 +957,10 @@
 
     // Try to intersect ranges of the BB and the constraint on the edge.
     LVILatticeVal InBlock = getBlockValue(Val, BBFrom);
+    mergeAssumeBlockValueConstantRange(Val, InBlock, BBFrom->getTerminator());
+    // See note on the use of the CxtI with mergeAssumeBlockValueConstantRange,
+    // and caching, below.
+    mergeAssumeBlockValueConstantRange(Val, InBlock, CxtI);
     if (!InBlock.isConstantRange())
       return true;
 
@@ -907,30 +977,54 @@
 
   // if we couldn't compute the value on the edge, use the value from the BB
   Result = getBlockValue(Val, BBFrom);
+  mergeAssumeBlockValueConstantRange(Val, Result, BBFrom->getTerminator());
+  // We can use the context instruction (generically the ultimate instruction
+  // the calling pass is trying to simplify) here, even though the result of
+  // this function is generally cached when called from the solve* functions
+  // (and that cached result might be used with queries using a different
+  // context instruction), because when this function is called from the solve*
+  // functions, the context instruction is not provided. When called from
+  // LazyValueInfoCache::getValueOnEdge, the context instruction is provided,
+  // but then the result is not cached.
+  mergeAssumeBlockValueConstantRange(Val, Result, CxtI);
   return true;
 }
 
-LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB) {
+LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB,
+                                                  Instruction *CxtI) {
   DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '"
         << BB->getName() << "'\n");
   
   BlockValueStack.push(std::make_pair(BB, V));
   solve();
   LVILatticeVal Result = getBlockValue(V, BB);
+  mergeAssumeBlockValueConstantRange(V, Result, CxtI);
+
+  DEBUG(dbgs() << "  Result = " << Result << "\n");
+  return Result;
+}
+
+LVILatticeVal LazyValueInfoCache::getValueAt(Value *V, Instruction *CxtI) {
+  DEBUG(dbgs() << "LVI Getting value " << *V << " at '"
+        << CxtI->getName() << "'\n");
+
+  LVILatticeVal Result;
+  mergeAssumeBlockValueConstantRange(V, Result, CxtI);
 
   DEBUG(dbgs() << "  Result = " << Result << "\n");
   return Result;
 }
 
 LVILatticeVal LazyValueInfoCache::
-getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB) {
+getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB,
+               Instruction *CxtI) {
   DEBUG(dbgs() << "LVI Getting edge value " << *V << " from '"
         << FromBB->getName() << "' to '" << ToBB->getName() << "'\n");
   
   LVILatticeVal Result;
-  if (!getEdgeValue(V, FromBB, ToBB, Result)) {
+  if (!getEdgeValue(V, FromBB, ToBB, Result, CxtI)) {
     solve();
-    bool WasFastQuery = getEdgeValue(V, FromBB, ToBB, Result);
+    bool WasFastQuery = getEdgeValue(V, FromBB, ToBB, Result, CxtI);
     (void)WasFastQuery;
     assert(WasFastQuery && "More work to do after problem solved?");
   }
@@ -1004,39 +1098,51 @@
 //===----------------------------------------------------------------------===//
 
 /// getCache - This lazily constructs the LazyValueInfoCache.
-static LazyValueInfoCache &getCache(void *&PImpl) {
+static LazyValueInfoCache &getCache(void *&PImpl,
+                                    AssumptionTracker *AT,
+                                    const DataLayout *DL = nullptr,
+                                    DominatorTree *DT = nullptr) {
   if (!PImpl)
-    PImpl = new LazyValueInfoCache();
+    PImpl = new LazyValueInfoCache(AT, DL, DT);
   return *static_cast<LazyValueInfoCache*>(PImpl);
 }
 
 bool LazyValueInfo::runOnFunction(Function &F) {
-  if (PImpl)
-    getCache(PImpl).clear();
+  AT = &getAnalysis<AssumptionTracker>();
+
+  DominatorTreeWrapperPass *DTWP =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
 
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
   TLI = &getAnalysis<TargetLibraryInfo>();
 
+  if (PImpl)
+    getCache(PImpl, AT, DL, DT).clear();
+
   // Fully lazy.
   return false;
 }
 
 void LazyValueInfo::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
+  AU.addRequired<AssumptionTracker>();
   AU.addRequired<TargetLibraryInfo>();
 }
 
 void LazyValueInfo::releaseMemory() {
   // If the cache was allocated, free it.
   if (PImpl) {
-    delete &getCache(PImpl);
+    delete &getCache(PImpl, AT);
     PImpl = nullptr;
   }
 }
 
-Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB) {
-  LVILatticeVal Result = getCache(PImpl).getValueInBlock(V, BB);
+Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB,
+                                     Instruction *CxtI) {
+  LVILatticeVal Result =
+    getCache(PImpl, AT, DL, DT).getValueInBlock(V, BB, CxtI);
   
   if (Result.isConstant())
     return Result.getConstant();
@@ -1051,8 +1157,10 @@
 /// getConstantOnEdge - Determine whether the specified value is known to be a
 /// constant on the specified edge.  Return null if not.
 Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB,
-                                           BasicBlock *ToBB) {
-  LVILatticeVal Result = getCache(PImpl).getValueOnEdge(V, FromBB, ToBB);
+                                           BasicBlock *ToBB,
+                                           Instruction *CxtI) {
+  LVILatticeVal Result =
+    getCache(PImpl, AT, DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
   
   if (Result.isConstant())
     return Result.getConstant();
@@ -1064,51 +1172,47 @@
   return nullptr;
 }
 
-/// getPredicateOnEdge - Determine whether the specified value comparison
-/// with a constant is known to be true or false on the specified CFG edge.
-/// Pred is a CmpInst predicate.
-LazyValueInfo::Tristate
-LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C,
-                                  BasicBlock *FromBB, BasicBlock *ToBB) {
-  LVILatticeVal Result = getCache(PImpl).getValueOnEdge(V, FromBB, ToBB);
-  
+static LazyValueInfo::Tristate
+getPredicateResult(unsigned Pred, Constant *C, LVILatticeVal &Result,
+                   const DataLayout *DL, TargetLibraryInfo *TLI) {
+
   // If we know the value is a constant, evaluate the conditional.
   Constant *Res = nullptr;
   if (Result.isConstant()) {
     Res = ConstantFoldCompareInstOperands(Pred, Result.getConstant(), C, DL,
                                           TLI);
     if (ConstantInt *ResCI = dyn_cast<ConstantInt>(Res))
-      return ResCI->isZero() ? False : True;
-    return Unknown;
+      return ResCI->isZero() ? LazyValueInfo::False : LazyValueInfo::True;
+    return LazyValueInfo::Unknown;
   }
   
   if (Result.isConstantRange()) {
     ConstantInt *CI = dyn_cast<ConstantInt>(C);
-    if (!CI) return Unknown;
+    if (!CI) return LazyValueInfo::Unknown;
     
     ConstantRange CR = Result.getConstantRange();
     if (Pred == ICmpInst::ICMP_EQ) {
       if (!CR.contains(CI->getValue()))
-        return False;
+        return LazyValueInfo::False;
       
       if (CR.isSingleElement() && CR.contains(CI->getValue()))
-        return True;
+        return LazyValueInfo::True;
     } else if (Pred == ICmpInst::ICMP_NE) {
       if (!CR.contains(CI->getValue()))
-        return True;
+        return LazyValueInfo::True;
       
       if (CR.isSingleElement() && CR.contains(CI->getValue()))
-        return False;
+        return LazyValueInfo::False;
     }
     
     // Handle more complex predicates.
     ConstantRange TrueValues =
         ICmpInst::makeConstantRange((ICmpInst::Predicate)Pred, CI->getValue());
     if (TrueValues.contains(CR))
-      return True;
+      return LazyValueInfo::True;
     if (TrueValues.inverse().contains(CR))
-      return False;
-    return Unknown;
+      return LazyValueInfo::False;
+    return LazyValueInfo::Unknown;
   }
   
   if (Result.isNotConstant()) {
@@ -1120,26 +1224,48 @@
                                             Result.getNotConstant(), C, DL,
                                             TLI);
       if (Res->isNullValue())
-        return False;
+        return LazyValueInfo::False;
     } else if (Pred == ICmpInst::ICMP_NE) {
       // !C1 != C -> true iff C1 == C.
       Res = ConstantFoldCompareInstOperands(ICmpInst::ICMP_NE,
                                             Result.getNotConstant(), C, DL,
                                             TLI);
       if (Res->isNullValue())
-        return True;
+        return LazyValueInfo::True;
     }
-    return Unknown;
+    return LazyValueInfo::Unknown;
   }
   
-  return Unknown;
+  return LazyValueInfo::Unknown;
+}
+
+/// getPredicateOnEdge - Determine whether the specified value comparison
+/// with a constant is known to be true or false on the specified CFG edge.
+/// Pred is a CmpInst predicate.
+LazyValueInfo::Tristate
+LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C,
+                                  BasicBlock *FromBB, BasicBlock *ToBB,
+                                  Instruction *CxtI) {
+  LVILatticeVal Result =
+    getCache(PImpl, AT, DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
+
+  return getPredicateResult(Pred, C, Result, DL, TLI);
+}
+
+LazyValueInfo::Tristate
+LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C,
+                              Instruction *CxtI) {
+  LVILatticeVal Result =
+    getCache(PImpl, AT, DL, DT).getValueAt(V, CxtI);
+
+  return getPredicateResult(Pred, C, Result, DL, TLI);
 }
 
 void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
                                BasicBlock *NewSucc) {
-  if (PImpl) getCache(PImpl).threadEdge(PredBB, OldSucc, NewSucc);
+  if (PImpl) getCache(PImpl, AT, DL, DT).threadEdge(PredBB, OldSucc, NewSucc);
 }
 
 void LazyValueInfo::eraseBlock(BasicBlock *BB) {
-  if (PImpl) getCache(PImpl).eraseBlock(BB);
+  if (PImpl) getCache(PImpl, AT, DL, DT).eraseBlock(BB);
 }

diff --git a/lib/Analysis/LibCallSemantics.cpp b/lib/Analysis/LibCallSemantics.cpp
index 7d4e254..23639e7 100644
--- a/lib/Analysis/LibCallSemantics.cpp
+++ b/lib/Analysis/LibCallSemantics.cpp

@@ -18,7 +18,7 @@
 #include "llvm/IR/Function.h"
 using namespace llvm;
 
-/// getMap - This impl pointer in ~LibCallInfo is actually a StringMap.  This
+/// This impl pointer in ~LibCallInfo is actually a StringMap.  This
 /// helper does the cast.
 static StringMap<const LibCallFunctionInfo*> *getMap(void *Ptr) {
   return static_cast<StringMap<const LibCallFunctionInfo*> *>(Ptr);
@@ -38,7 +38,7 @@
 }
 
 
-/// getFunctionInfo - Return the LibCallFunctionInfo object corresponding to
+/// Return the LibCallFunctionInfo object corresponding to
 /// the specified function if we have it.  If not, return null.
 const LibCallFunctionInfo *
 LibCallInfo::getFunctionInfo(const Function *F) const {

diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index b14f329..8ee9b8a 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp

@@ -37,6 +37,7 @@
 #include "llvm/Analysis/Lint.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/Loads.h"
@@ -96,11 +97,12 @@
 
     Value *findValue(Value *V, bool OffsetOk) const;
     Value *findValueImpl(Value *V, bool OffsetOk,
-                         SmallPtrSet<Value *, 4> &Visited) const;
+                         SmallPtrSetImpl<Value *> &Visited) const;
 
   public:
     Module *Mod;
     AliasAnalysis *AA;
+    AssumptionTracker *AT;
     DominatorTree *DT;
     const DataLayout *DL;
     TargetLibraryInfo *TLI;
@@ -118,6 +120,7 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesAll();
       AU.addRequired<AliasAnalysis>();
+      AU.addRequired<AssumptionTracker>();
       AU.addRequired<TargetLibraryInfo>();
       AU.addRequired<DominatorTreeWrapperPass>();
     }
@@ -151,6 +154,7 @@
 char Lint::ID = 0;
 INITIALIZE_PASS_BEGIN(Lint, "lint", "Statically lint-checks LLVM IR",
                       false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
@@ -175,6 +179,7 @@
 bool Lint::runOnFunction(Function &F) {
   Mod = F.getParent();
   AA = &getAnalysis<AliasAnalysis>();
+  AT = &getAnalysis<AssumptionTracker>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
@@ -504,7 +509,8 @@
             "Undefined result: Shift count out of range", &I);
 }
 
-static bool isZero(Value *V, const DataLayout *DL) {
+static bool isZero(Value *V, const DataLayout *DL, DominatorTree *DT,
+                   AssumptionTracker *AT) {
   // Assume undef could be zero.
   if (isa<UndefValue>(V))
     return true;
@@ -513,7 +519,8 @@
   if (!VecTy) {
     unsigned BitWidth = V->getType()->getIntegerBitWidth();
     APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-    computeKnownBits(V, KnownZero, KnownOne, DL);
+    computeKnownBits(V, KnownZero, KnownOne, DL,
+                     0, AT, dyn_cast<Instruction>(V), DT);
     return KnownZero.isAllOnesValue();
   }
 
@@ -543,22 +550,22 @@
 }
 
 void Lint::visitSDiv(BinaryOperator &I) {
-  Assert1(!isZero(I.getOperand(1), DL),
+  Assert1(!isZero(I.getOperand(1), DL, DT, AT),
           "Undefined behavior: Division by zero", &I);
 }
 
 void Lint::visitUDiv(BinaryOperator &I) {
-  Assert1(!isZero(I.getOperand(1), DL),
+  Assert1(!isZero(I.getOperand(1), DL, DT, AT),
           "Undefined behavior: Division by zero", &I);
 }
 
 void Lint::visitSRem(BinaryOperator &I) {
-  Assert1(!isZero(I.getOperand(1), DL),
+  Assert1(!isZero(I.getOperand(1), DL, DT, AT),
           "Undefined behavior: Division by zero", &I);
 }
 
 void Lint::visitURem(BinaryOperator &I) {
-  Assert1(!isZero(I.getOperand(1), DL),
+  Assert1(!isZero(I.getOperand(1), DL, DT, AT),
           "Undefined behavior: Division by zero", &I);
 }
 
@@ -622,9 +629,9 @@
 
 /// findValueImpl - Implementation helper for findValue.
 Value *Lint::findValueImpl(Value *V, bool OffsetOk,
-                           SmallPtrSet<Value *, 4> &Visited) const {
+                           SmallPtrSetImpl<Value *> &Visited) const {
   // Detect self-referential values.
-  if (!Visited.insert(V))
+  if (!Visited.insert(V).second)
     return UndefValue::get(V->getType());
 
   // TODO: Look through sext or zext cast, when the result is known to
@@ -638,7 +645,8 @@
     BasicBlock *BB = L->getParent();
     SmallPtrSet<BasicBlock *, 4> VisitedBlocks;
     for (;;) {
-      if (!VisitedBlocks.insert(BB)) break;
+      if (!VisitedBlocks.insert(BB).second)
+        break;
       if (Value *U = FindAvailableLoadedValue(L->getPointerOperand(),
                                               BB, BBI, 6, AA))
         return findValueImpl(U, OffsetOk, Visited);
@@ -678,7 +686,7 @@
 
   // As a last resort, try SimplifyInstruction or constant folding.
   if (Instruction *Inst = dyn_cast<Instruction>(V)) {
-    if (Value *W = SimplifyInstruction(Inst, DL, TLI, DT))
+    if (Value *W = SimplifyInstruction(Inst, DL, TLI, DT, AT))
       return findValueImpl(W, OffsetOk, Visited);
   } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
     if (Value *W = ConstantFoldConstantExpression(CE, DL, TLI))

diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
index 005d309..bb0d60e 100644
--- a/lib/Analysis/Loads.cpp
+++ b/lib/Analysis/Loads.cpp

@@ -22,25 +22,29 @@
 #include "llvm/IR/Operator.h"
 using namespace llvm;
 
-/// AreEquivalentAddressValues - Test if A and B will obviously have the same
-/// value. This includes recognizing that %t0 and %t1 will have the same
+/// \brief Test if A and B will obviously have the same value.
+///
+/// This includes recognizing that %t0 and %t1 will have the same
 /// value in code like this:
+/// \code
 ///   %t0 = getelementptr \@a, 0, 3
 ///   store i32 0, i32* %t0
 ///   %t1 = getelementptr \@a, 0, 3
 ///   %t2 = load i32* %t1
+/// \endcode
 ///
 static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
   // Test if the values are trivially equivalent.
-  if (A == B) return true;
+  if (A == B)
+    return true;
 
   // Test if the values come from identical arithmetic instructions.
   // Use isIdenticalToWhenDefined instead of isIdenticalTo because
   // this function is only used when one address use dominates the
   // other, which means that they'll always either have the same
   // value or one of them will have an undefined value.
-  if (isa<BinaryOperator>(A) || isa<CastInst>(A) ||
-      isa<PHINode>(A) || isa<GetElementPtrInst>(A))
+  if (isa<BinaryOperator>(A) || isa<CastInst>(A) || isa<PHINode>(A) ||
+      isa<GetElementPtrInst>(A))
     if (const Instruction *BI = dyn_cast<Instruction>(B))
       if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
         return true;
@@ -49,15 +53,19 @@
   return false;
 }
 
-/// isSafeToLoadUnconditionally - Return true if we know that executing a load
-/// from this value cannot trap.  If it is not obviously safe to load from the
-/// specified pointer, we do a quick local scan of the basic block containing
-/// ScanFrom, to determine if the address is already accessed.
+/// \brief Check if executing a load of this pointer value cannot trap.
+///
+/// If it is not obviously safe to load from the specified pointer, we do
+/// a quick local scan of the basic block containing \c ScanFrom, to determine
+/// if the address is already accessed.
+///
+/// This uses the pointee type to determine how many bytes need to be safe to
+/// load from the pointer.
 bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
-                                       unsigned Align, const DataLayout *TD) {
+                                       unsigned Align, const DataLayout *DL) {
   int64_t ByteOffset = 0;
   Value *Base = V;
-  Base = GetPointerBaseWithConstantOffset(V, ByteOffset, TD);
+  Base = GetPointerBaseWithConstantOffset(V, ByteOffset, DL);
 
   if (ByteOffset < 0) // out of bounds
     return false;
@@ -69,26 +77,29 @@
     BaseType = AI->getAllocatedType();
     BaseAlign = AI->getAlignment();
   } else if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
-    // Global variables are safe to load from but their size cannot be
-    // guaranteed if they are overridden.
+    // Global variables are not necessarily safe to load from if they are
+    // overridden. Their size may change or they may be weak and require a test
+    // to determine if they were in fact provided.
     if (!GV->mayBeOverridden()) {
       BaseType = GV->getType()->getElementType();
       BaseAlign = GV->getAlignment();
     }
   }
 
-  if (BaseType && BaseType->isSized()) {
-    if (TD && BaseAlign == 0)
-      BaseAlign = TD->getPrefTypeAlignment(BaseType);
+  PointerType *AddrTy = cast<PointerType>(V->getType());
+  uint64_t LoadSize = DL ? DL->getTypeStoreSize(AddrTy->getElementType()) : 0;
+
+  // If we found a base allocated type from either an alloca or global variable,
+  // try to see if we are definitively within the allocated region. We need to
+  // know the size of the base type and the loaded type to do anything in this
+  // case, so only try this when we have the DataLayout available.
+  if (BaseType && BaseType->isSized() && DL) {
+    if (BaseAlign == 0)
+      BaseAlign = DL->getPrefTypeAlignment(BaseType);
 
     if (Align <= BaseAlign) {
-      if (!TD)
-        return true; // Loading directly from an alloca or global is OK.
-
       // Check if the load is within the bounds of the underlying object.
-      PointerType *AddrTy = cast<PointerType>(V->getType());
-      uint64_t LoadSize = TD->getTypeStoreSize(AddrTy->getElementType());
-      if (ByteOffset + LoadSize <= TD->getTypeAllocSize(BaseType) &&
+      if (ByteOffset + LoadSize <= DL->getTypeAllocSize(BaseType) &&
           (Align == 0 || (ByteOffset % Align) == 0))
         return true;
     }
@@ -101,6 +112,10 @@
   // the load entirely).
   BasicBlock::iterator BBI = ScanFrom, E = ScanFrom->getParent()->begin();
 
+  // We can at least always strip pointer casts even though we can't use the
+  // base here.
+  V = V->stripPointerCasts();
+
   while (BBI != E) {
     --BBI;
 
@@ -110,46 +125,62 @@
         !isa<DbgInfoIntrinsic>(BBI))
       return false;
 
-    if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
-      if (AreEquivalentAddressValues(LI->getOperand(0), V)) return true;
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
-      if (AreEquivalentAddressValues(SI->getOperand(1), V)) return true;
-    }
+    Value *AccessedPtr;
+    if (LoadInst *LI = dyn_cast<LoadInst>(BBI))
+      AccessedPtr = LI->getPointerOperand();
+    else if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
+      AccessedPtr = SI->getPointerOperand();
+    else
+      continue;
+
+    // Handle trivial cases even w/o DataLayout or other work.
+    if (AccessedPtr == V)
+      return true;
+
+    if (!DL)
+      continue;
+
+    auto *AccessedTy = cast<PointerType>(AccessedPtr->getType());
+    if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) &&
+        LoadSize <= DL->getTypeStoreSize(AccessedTy->getElementType()))
+      return true;
   }
   return false;
 }
 
-/// FindAvailableLoadedValue - Scan the ScanBB block backwards (starting at the
-/// instruction before ScanFrom) checking to see if we have the value at the
+/// \brief Scan the ScanBB block backwards to see if we have the value at the
 /// memory address *Ptr locally available within a small number of instructions.
-/// If the value is available, return it.
 ///
-/// If not, return the iterator for the last validated instruction that the 
-/// value would be live through.  If we scanned the entire block and didn't find
-/// something that invalidates *Ptr or provides it, ScanFrom would be left at
-/// begin() and this returns null.  ScanFrom could also be left 
+/// The scan starts from \c ScanFrom. \c MaxInstsToScan specifies the maximum
+/// instructions to scan in the block. If it is set to \c 0, it will scan the whole
+/// block.
 ///
-/// MaxInstsToScan specifies the maximum instructions to scan in the block.  If
-/// it is set to 0, it will scan the whole block. You can also optionally
-/// specify an alias analysis implementation, which makes this more precise.
+/// If the value is available, this function returns it. If not, it returns the
+/// iterator for the last validated instruction that the value would be live
+/// through. If we scanned the entire block and didn't find something that
+/// invalidates \c *Ptr or provides it, \c ScanFrom is left at the last
+/// instruction processed and this returns null.
 ///
-/// If TBAATag is non-null and a load or store is found, the TBAA tag from the
-/// load or store is recorded there.  If there is no TBAA tag or if no access
-/// is found, it is left unmodified.
+/// You can also optionally specify an alias analysis implementation, which
+/// makes this more precise.
+///
+/// If \c AATags is non-null and a load or store is found, the AA tags from the
+/// load or store are recorded there. If there are no AA tags or if no access is
+/// found, it is left unmodified.
 Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
                                       BasicBlock::iterator &ScanFrom,
                                       unsigned MaxInstsToScan,
-                                      AliasAnalysis *AA,
-                                      MDNode **TBAATag) {
-  if (MaxInstsToScan == 0) MaxInstsToScan = ~0U;
+                                      AliasAnalysis *AA, AAMDNodes *AATags) {
+  if (MaxInstsToScan == 0)
+    MaxInstsToScan = ~0U;
+
+  Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
 
   // If we're using alias analysis to disambiguate get the size of *Ptr.
-  uint64_t AccessSize = 0;
-  if (AA) {
-    Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
-    AccessSize = AA->getTypeStoreSize(AccessTy);
-  }
-  
+  uint64_t AccessSize = AA ? AA->getTypeStoreSize(AccessTy) : 0;
+
+  Value *StrippedPtr = Ptr->stripPointerCasts();
+
   while (ScanFrom != ScanBB->begin()) {
     // We must ignore debug info directives when counting (otherwise they
     // would affect codegen).
@@ -159,62 +190,71 @@
 
     // Restore ScanFrom to expected value in case next test succeeds
     ScanFrom++;
-   
+
     // Don't scan huge blocks.
-    if (MaxInstsToScan-- == 0) return nullptr;
-    
+    if (MaxInstsToScan-- == 0)
+      return nullptr;
+
     --ScanFrom;
     // If this is a load of Ptr, the loaded value is available.
     // (This is true even if the load is volatile or atomic, although
     // those cases are unlikely.)
     if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
-      if (AreEquivalentAddressValues(LI->getOperand(0), Ptr)) {
-        if (TBAATag) *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa);
+      if (AreEquivalentAddressValues(
+              LI->getPointerOperand()->stripPointerCasts(), StrippedPtr) &&
+          CastInst::isBitCastable(LI->getType(), AccessTy)) {
+        if (AATags)
+          LI->getAAMetadata(*AATags);
         return LI;
       }
-    
+
     if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      Value *StorePtr = SI->getPointerOperand()->stripPointerCasts();
       // If this is a store through Ptr, the value is available!
       // (This is true even if the store is volatile or atomic, although
       // those cases are unlikely.)
-      if (AreEquivalentAddressValues(SI->getOperand(1), Ptr)) {
-        if (TBAATag) *TBAATag = SI->getMetadata(LLVMContext::MD_tbaa);
+      if (AreEquivalentAddressValues(StorePtr, StrippedPtr) &&
+          CastInst::isBitCastable(SI->getValueOperand()->getType(), AccessTy)) {
+        if (AATags)
+          SI->getAAMetadata(*AATags);
         return SI->getOperand(0);
       }
-      
-      // If Ptr is an alloca and this is a store to a different alloca, ignore
-      // the store.  This is a trivial form of alias analysis that is important
-      // for reg2mem'd code.
-      if ((isa<AllocaInst>(Ptr) || isa<GlobalVariable>(Ptr)) &&
-          (isa<AllocaInst>(SI->getOperand(1)) ||
-           isa<GlobalVariable>(SI->getOperand(1))))
+
+      // If both StrippedPtr and StorePtr reach all the way to an alloca or
+      // global and they are different, ignore the store. This is a trivial form
+      // of alias analysis that is important for reg2mem'd code.
+      if ((isa<AllocaInst>(StrippedPtr) || isa<GlobalVariable>(StrippedPtr)) &&
+          (isa<AllocaInst>(StorePtr) || isa<GlobalVariable>(StorePtr)) &&
+          StrippedPtr != StorePtr)
         continue;
-      
+
       // If we have alias analysis and it says the store won't modify the loaded
       // value, ignore the store.
       if (AA &&
-          (AA->getModRefInfo(SI, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+          (AA->getModRefInfo(SI, StrippedPtr, AccessSize) &
+           AliasAnalysis::Mod) == 0)
         continue;
-      
+
       // Otherwise the store that may or may not alias the pointer, bail out.
       ++ScanFrom;
       return nullptr;
     }
-    
+
     // If this is some other instruction that may clobber Ptr, bail out.
     if (Inst->mayWriteToMemory()) {
       // If alias analysis claims that it really won't modify the load,
       // ignore it.
       if (AA &&
-          (AA->getModRefInfo(Inst, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+          (AA->getModRefInfo(Inst, StrippedPtr, AccessSize) &
+           AliasAnalysis::Mod) == 0)
         continue;
-      
+
       // May modify the pointer, bail out.
       ++ScanFrom;
       return nullptr;
     }
   }
-  
+
   // Got to the start of the block, we didn't find it, but are done for this
   // block.
   return nullptr;

diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 46c0eaa..b1f62c4 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp

@@ -24,6 +24,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -307,7 +308,8 @@
       // directly or indirectly through another list metadata (in case of
       // nested parallel loops). The loop identifier metadata refers to
       // itself so we can check both cases with the same routine.
-      MDNode *loopIdMD = II->getMetadata("llvm.mem.parallel_loop_access");
+      MDNode *loopIdMD =
+          II->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
 
       if (!loopIdMD)
         return false;

diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 7bd866e..190abc7 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp

@@ -76,6 +76,9 @@
 
   LI->updateUnloop(L);
 
+  // Notify passes that the loop is being deleted.
+  deleteSimpleAnalysisLoop(L);
+
   // If L is current loop then skip rest of the passes and let
   // runOnFunction remove L from LQ. Otherwise, remove L from LQ now
   // and continue applying other passes on CurrentLoop.
@@ -164,6 +167,14 @@
   }
 }
 
+/// Invoke deleteAnalysisLoop hook for all passes.
+void LPPassManager::deleteSimpleAnalysisLoop(Loop *L) {
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    LoopPass *LP = getContainedPass(Index);
+    LP->deleteAnalysisLoop(L);
+  }
+}
+
 
 // Recurse through all subloops and all loops  into LQ.
 static void addLoopIntoQueue(Loop *L, std::deque<Loop *> &LQ) {

diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index 64d339f..08b41fe 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp

@@ -332,7 +332,11 @@
       TLIFn == LibFunc::ZdlPv || // operator delete(void*)
       TLIFn == LibFunc::ZdaPv)   // operator delete[](void*)
     ExpectedNumParams = 1;
-  else if (TLIFn == LibFunc::ZdlPvRKSt9nothrow_t || // delete(void*, nothrow)
+  else if (TLIFn == LibFunc::ZdlPvj ||              // delete(void*, uint)
+           TLIFn == LibFunc::ZdlPvm ||              // delete(void*, ulong)
+           TLIFn == LibFunc::ZdlPvRKSt9nothrow_t || // delete(void*, nothrow)
+           TLIFn == LibFunc::ZdaPvj ||              // delete[](void*, uint)
+           TLIFn == LibFunc::ZdaPvm ||              // delete[](void*, ulong)
            TLIFn == LibFunc::ZdaPvRKSt9nothrow_t)   // delete[](void*, nothrow)
     ExpectedNumParams = 2;
   else
@@ -412,7 +416,7 @@
   if (Instruction *I = dyn_cast<Instruction>(V)) {
     // If we have already seen this instruction, bail out. Cycles can happen in
     // unreachable code after constant propagation.
-    if (!SeenInsts.insert(I))
+    if (!SeenInsts.insert(I).second)
       return unknown();
 
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
@@ -648,7 +652,7 @@
   // Record the pointers that were handled in this run, so that they can be
   // cleaned later if something fails. We also use this set to break cycles that
   // can occur in dead code.
-  if (!SeenVals.insert(V)) {
+  if (!SeenVals.insert(V).second) {
     Result = unknown();
   } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
     Result = visitGEPOperator(*GEP);

diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 9eaf109..187eada 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp

@@ -18,6 +18,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/PHITransAddr.h"
@@ -48,13 +49,17 @@
           "Number of block queries that were completely cached");
 
 // Limit for the number of instructions to scan in a block.
-static const int BlockScanLimit = 100;
+static const unsigned int BlockScanLimit = 100;
+
+// Limit on the number of memdep results to process.
+static const unsigned int NumResultsLimit = 100;
 
 char MemoryDependenceAnalysis::ID = 0;
 
 // Register this pass...
 INITIALIZE_PASS_BEGIN(MemoryDependenceAnalysis, "memdep",
                 "Memory Dependence Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(MemoryDependenceAnalysis, "memdep",
                       "Memory Dependence Analysis", false, true)
@@ -83,11 +88,13 @@
 ///
 void MemoryDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
+  AU.addRequired<AssumptionTracker>();
   AU.addRequiredTransitive<AliasAnalysis>();
 }
 
 bool MemoryDependenceAnalysis::runOnFunction(Function &) {
   AA = &getAnalysis<AliasAnalysis>();
+  AT = &getAnalysis<AssumptionTracker>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
   DominatorTreeWrapperPass *DTWP =
@@ -158,29 +165,32 @@
     return AliasAnalysis::Mod;
   }
 
-  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    AAMDNodes AAInfo;
+
     switch (II->getIntrinsicID()) {
     case Intrinsic::lifetime_start:
     case Intrinsic::lifetime_end:
     case Intrinsic::invariant_start:
+      II->getAAMetadata(AAInfo);
       Loc = AliasAnalysis::Location(II->getArgOperand(1),
                                     cast<ConstantInt>(II->getArgOperand(0))
-                                      ->getZExtValue(),
-                                    II->getMetadata(LLVMContext::MD_tbaa));
+                                      ->getZExtValue(), AAInfo);
       // These intrinsics don't really modify the memory, but returning Mod
       // will allow them to be handled conservatively.
       return AliasAnalysis::Mod;
     case Intrinsic::invariant_end:
+      II->getAAMetadata(AAInfo);
       Loc = AliasAnalysis::Location(II->getArgOperand(2),
                                     cast<ConstantInt>(II->getArgOperand(1))
-                                      ->getZExtValue(),
-                                    II->getMetadata(LLVMContext::MD_tbaa));
+                                      ->getZExtValue(), AAInfo);
       // These intrinsics don't really modify the memory, but returning Mod
       // will allow them to be handled conservatively.
       return AliasAnalysis::Mod;
     default:
       break;
     }
+  }
 
   // Otherwise, just do the coarse-grained thing that always works.
   if (Inst->mayWriteToMemory())
@@ -367,6 +377,36 @@
   int64_t MemLocOffset = 0;
   unsigned Limit = BlockScanLimit;
   bool isInvariantLoad = false;
+
+  // We must be careful with atomic accesses, as they may allow another thread
+  //   to touch this location, clobbering it. We are conservative: if the
+  //   QueryInst is not a simple (non-atomic) memory access, we automatically
+  //   return getClobber.
+  // If it is simple, we know based on the results of
+  // "Compiler testing via a theory of sound optimisations in the C11/C++11
+  //   memory model" in PLDI 2013, that a non-atomic location can only be
+  //   clobbered between a pair of a release and an acquire action, with no
+  //   access to the location in between.
+  // Here is an example for giving the general intuition behind this rule.
+  // In the following code:
+  //   store x 0;
+  //   release action; [1]
+  //   acquire action; [4]
+  //   %val = load x;
+  // It is unsafe to replace %val by 0 because another thread may be running:
+  //   acquire action; [2]
+  //   store x 42;
+  //   release action; [3]
+  // with synchronization from 1 to 2 and from 3 to 4, resulting in %val
+  // being 42. A key property of this program however is that if either
+  // 1 or 4 were missing, there would be a race between the store of 42 and
+  // either the store of 0 or the load (making the whole program racy).
+  // The paper mentioned above shows that the same property is respected
+  // by every program that can detect any optimisation of that kind: either
+  // it is racy (undefined) or there is a release followed by an acquire
+  // between the pair of accesses under consideration.
+  bool HasSeenAcquire = false;
+
   if (isLoad && QueryInst) {
     LoadInst *LI = dyn_cast<LoadInst>(QueryInst);
     if (LI && LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr)
@@ -404,10 +444,37 @@
 
     // Values depend on loads if the pointers are must aliased.  This means that
     // a load depends on another must aliased load from the same value.
+    // One exception is atomic loads: a value can depend on an atomic load that it
+    // does not alias with when this atomic load indicates that another thread may
+    // be accessing the location.
     if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
       // Atomic loads have complications involved.
+      // A Monotonic (or higher) load is OK if the query inst is itself not atomic.
+      // An Acquire (or higher) load sets the HasSeenAcquire flag, so that any
+      //   release store will know to return getClobber.
       // FIXME: This is overly conservative.
-      if (!LI->isUnordered())
+      if (!LI->isUnordered()) {
+        if (!QueryInst)
+          return MemDepResult::getClobber(LI);
+        if (auto *QueryLI = dyn_cast<LoadInst>(QueryInst)) {
+          if (!QueryLI->isSimple())
+            return MemDepResult::getClobber(LI);
+        } else if (auto *QuerySI = dyn_cast<StoreInst>(QueryInst)) {
+          if (!QuerySI->isSimple())
+            return MemDepResult::getClobber(LI);
+        } else if (QueryInst->mayReadOrWriteMemory()) {
+          return MemDepResult::getClobber(LI);
+        }
+
+        if (isAtLeastAcquire(LI->getOrdering()))
+          HasSeenAcquire = true;
+      }
+
+      // FIXME: this is overly conservative.
+      // While volatile accesses cannot be eliminated, they do not have to
+      // clobber non-aliasing locations, as normal accesses can for example be
+      // reordered with volatile accesses.
+      if (LI->isVolatile())
         return MemDepResult::getClobber(LI);
 
       AliasAnalysis::Location LoadLoc = AA->getLocation(LI);
@@ -466,8 +533,32 @@
 
     if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
       // Atomic stores have complications involved.
+      // A Monotonic store is OK if the query inst is itself not atomic.
+      // A Release (or higher) store further requires that no acquire load
+      //   has been seen.
       // FIXME: This is overly conservative.
-      if (!SI->isUnordered())
+      if (!SI->isUnordered()) {
+        if (!QueryInst)
+          return MemDepResult::getClobber(SI);
+        if (auto *QueryLI = dyn_cast<LoadInst>(QueryInst)) {
+          if (!QueryLI->isSimple())
+            return MemDepResult::getClobber(SI);
+        } else if (auto *QuerySI = dyn_cast<StoreInst>(QueryInst)) {
+          if (!QuerySI->isSimple())
+            return MemDepResult::getClobber(SI);
+        } else if (QueryInst->mayReadOrWriteMemory()) {
+          return MemDepResult::getClobber(SI);
+        }
+
+        if (HasSeenAcquire && isAtLeastRelease(SI->getOrdering()))
+          return MemDepResult::getClobber(SI);
+      }
+
+      // FIXME: this is overly conservative.
+      // While volatile accesses cannot be eliminated, they do not have to
+      // clobber non-aliasing locations, as normal accesses can for example be
+      // reordered with volatile accesses.
+      if (SI->isVolatile())
         return MemDepResult::getClobber(SI);
 
       // If alias analysis can tell that this store is guaranteed to not modify
@@ -685,7 +776,7 @@
     DirtyBlocks.pop_back();
 
     // Already processed this block?
-    if (!Visited.insert(DirtyBB))
+    if (!Visited.insert(DirtyBB).second)
       continue;
 
     // Do a binary search to see if we already have an entry for this block in
@@ -775,7 +866,7 @@
          "Can't get pointer deps of a non-pointer!");
   Result.clear();
 
-  PHITransAddr Address(const_cast<Value *>(Loc.Ptr), DL);
+  PHITransAddr Address(const_cast<Value *>(Loc.Ptr), DL, AT);
 
   // This is the set of blocks we've inspected, and the pointer we consider in
   // each block.  Because of critical edges, we currently bail out if querying
@@ -861,7 +952,7 @@
   return Dep;
 }
 
-/// SortNonLocalDepInfoCache - Sort the a NonLocalDepInfo cache, given a certain
+/// SortNonLocalDepInfoCache - Sort the NonLocalDepInfo cache, given a certain
 /// number of elements in the array that are already properly ordered.  This is
 /// optimized for the case when only a few entries are added.
 static void
@@ -922,10 +1013,10 @@
   // Set up a temporary NLPI value. If the map doesn't yet have an entry for
   // CacheKey, this value will be inserted as the associated value. Otherwise,
   // it'll be ignored, and we'll have to check to see if the cached size and
-  // tbaa tag are consistent with the current query.
+  // aa tags are consistent with the current query.
   NonLocalPointerInfo InitialNLPI;
   InitialNLPI.Size = Loc.Size;
-  InitialNLPI.TBAATag = Loc.TBAATag;
+  InitialNLPI.AATags = Loc.AATags;
 
   // Get the NLPI for CacheKey, inserting one into the map if it doesn't
   // already have one.
@@ -955,21 +1046,21 @@
                                          SkipFirstBlock);
     }
 
-    // If the query's TBAATag is inconsistent with the cached one,
+    // If the query's AATags are inconsistent with the cached one,
     // conservatively throw out the cached data and restart the query with
     // no tag if needed.
-    if (CacheInfo->TBAATag != Loc.TBAATag) {
-      if (CacheInfo->TBAATag) {
+    if (CacheInfo->AATags != Loc.AATags) {
+      if (CacheInfo->AATags) {
         CacheInfo->Pair = BBSkipFirstBlockPair();
-        CacheInfo->TBAATag = nullptr;
+        CacheInfo->AATags = AAMDNodes();
         for (NonLocalDepInfo::iterator DI = CacheInfo->NonLocalDeps.begin(),
              DE = CacheInfo->NonLocalDeps.end(); DI != DE; ++DI)
           if (Instruction *Inst = DI->getResult().getInst())
             RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
         CacheInfo->NonLocalDeps.clear();
       }
-      if (Loc.TBAATag)
-        return getNonLocalPointerDepFromBB(Pointer, Loc.getWithoutTBAATag(),
+      if (Loc.AATags)
+        return getNonLocalPointerDepFromBB(Pointer, Loc.getWithoutAATags(),
                                            isLoad, StartBB, Result, Visited,
                                            SkipFirstBlock);
     }
@@ -1045,6 +1136,25 @@
   while (!Worklist.empty()) {
     BasicBlock *BB = Worklist.pop_back_val();
 
+    // If we do process a large number of blocks it becomes very expensive and
+    // likely it isn't worth worrying about
+    if (Result.size() > NumResultsLimit) {
+      Worklist.clear();
+      // Sort it now (if needed) so that recursive invocations of
+      // getNonLocalPointerDepFromBB and other routines that could reuse the
+      // cache value will only see properly sorted cache arrays.
+      if (Cache && NumSortedEntries != Cache->size()) {
+        SortNonLocalDepInfoCache(*Cache, NumSortedEntries);
+        NumSortedEntries = Cache->size();
+      }
+      // Since we bail out, the "Cache" set won't contain all of the
+      // results for the query.  This is ok (we can still use it to accelerate
+      // specific block queries) but we can't do the fastpath "return all
+      // results from the set".  Clear out the indicator for this.
+      CacheInfo->Pair = BBSkipFirstBlockPair();
+      return true;
+    }
+
     // Skip the first block if we have it.
     if (!SkipFirstBlock) {
       // Analyze the dependency of *Pointer in FromBB.  See if we already have
@@ -1369,14 +1479,11 @@
 
   ReverseDepMapType::iterator ReverseDepIt = ReverseLocalDeps.find(RemInst);
   if (ReverseDepIt != ReverseLocalDeps.end()) {
-    SmallPtrSet<Instruction*, 4> &ReverseDeps = ReverseDepIt->second;
     // RemInst can't be the terminator if it has local stuff depending on it.
-    assert(!ReverseDeps.empty() && !isa<TerminatorInst>(RemInst) &&
+    assert(!ReverseDepIt->second.empty() && !isa<TerminatorInst>(RemInst) &&
            "Nothing can locally depend on a terminator");
 
-    for (SmallPtrSet<Instruction*, 4>::iterator I = ReverseDeps.begin(),
-         E = ReverseDeps.end(); I != E; ++I) {
-      Instruction *InstDependingOnRemInst = *I;
+    for (Instruction *InstDependingOnRemInst : ReverseDepIt->second) {
       assert(InstDependingOnRemInst != RemInst &&
              "Already removed our local dep info");
 
@@ -1402,12 +1509,10 @@
 
   ReverseDepIt = ReverseNonLocalDeps.find(RemInst);
   if (ReverseDepIt != ReverseNonLocalDeps.end()) {
-    SmallPtrSet<Instruction*, 4> &Set = ReverseDepIt->second;
-    for (SmallPtrSet<Instruction*, 4>::iterator I = Set.begin(), E = Set.end();
-         I != E; ++I) {
-      assert(*I != RemInst && "Already removed NonLocalDep info for RemInst");
+    for (Instruction *I : ReverseDepIt->second) {
+      assert(I != RemInst && "Already removed NonLocalDep info for RemInst");
 
-      PerInstNLInfo &INLD = NonLocalDeps[*I];
+      PerInstNLInfo &INLD = NonLocalDeps[I];
       // The information is now dirty!
       INLD.second = true;
 
@@ -1419,7 +1524,7 @@
         DI->setResult(NewDirtyVal);
 
         if (Instruction *NextI = NewDirtyVal.getInst())
-          ReverseDepsToAdd.push_back(std::make_pair(NextI, *I));
+          ReverseDepsToAdd.push_back(std::make_pair(NextI, I));
       }
     }
 
@@ -1438,12 +1543,9 @@
   ReverseNonLocalPtrDepTy::iterator ReversePtrDepIt =
     ReverseNonLocalPtrDeps.find(RemInst);
   if (ReversePtrDepIt != ReverseNonLocalPtrDeps.end()) {
-    SmallPtrSet<ValueIsLoadPair, 4> &Set = ReversePtrDepIt->second;
     SmallVector<std::pair<Instruction*, ValueIsLoadPair>,8> ReversePtrDepsToAdd;
 
-    for (SmallPtrSet<ValueIsLoadPair, 4>::iterator I = Set.begin(),
-         E = Set.end(); I != E; ++I) {
-      ValueIsLoadPair P = *I;
+    for (ValueIsLoadPair P : ReversePtrDepIt->second) {
       assert(P.getPointer() != RemInst &&
              "Already removed NonLocalPointerDeps info for RemInst");
 
@@ -1484,8 +1586,10 @@
   DEBUG(verifyRemoved(RemInst));
 }
 /// verifyRemoved - Verify that the specified instruction does not occur
-/// in our internal data structures.
+/// in our internal data structures. This function verifies by asserting in
+/// debug builds.
 void MemoryDependenceAnalysis::verifyRemoved(Instruction *D) const {
+#ifndef NDEBUG
   for (LocalDepMapType::const_iterator I = LocalDeps.begin(),
        E = LocalDeps.end(); I != E; ++I) {
     assert(I->first != D && "Inst occurs in data structures");
@@ -1514,18 +1618,16 @@
   for (ReverseDepMapType::const_iterator I = ReverseLocalDeps.begin(),
        E = ReverseLocalDeps.end(); I != E; ++I) {
     assert(I->first != D && "Inst occurs in data structures");
-    for (SmallPtrSet<Instruction*, 4>::const_iterator II = I->second.begin(),
-         EE = I->second.end(); II != EE; ++II)
-      assert(*II != D && "Inst occurs in data structures");
+    for (Instruction *Inst : I->second)
+      assert(Inst != D && "Inst occurs in data structures");
   }
 
   for (ReverseDepMapType::const_iterator I = ReverseNonLocalDeps.begin(),
        E = ReverseNonLocalDeps.end();
        I != E; ++I) {
     assert(I->first != D && "Inst occurs in data structures");
-    for (SmallPtrSet<Instruction*, 4>::const_iterator II = I->second.begin(),
-         EE = I->second.end(); II != EE; ++II)
-      assert(*II != D && "Inst occurs in data structures");
+    for (Instruction *Inst : I->second)
+      assert(Inst != D && "Inst occurs in data structures");
   }
 
   for (ReverseNonLocalPtrDepTy::const_iterator
@@ -1533,11 +1635,10 @@
        E = ReverseNonLocalPtrDeps.end(); I != E; ++I) {
     assert(I->first != D && "Inst occurs in rev NLPD map");
 
-    for (SmallPtrSet<ValueIsLoadPair, 4>::const_iterator II = I->second.begin(),
-         E = I->second.end(); II != E; ++II)
-      assert(*II != ValueIsLoadPair(D, false) &&
-             *II != ValueIsLoadPair(D, true) &&
+    for (ValueIsLoadPair P : I->second)
+      assert(P != ValueIsLoadPair(D, false) &&
+             P != ValueIsLoadPair(D, true) &&
              "Inst occurs in ReverseNonLocalPtrDeps map");
   }
-
+#endif
 }

diff --git a/lib/Analysis/NoAliasAnalysis.cpp b/lib/Analysis/NoAliasAnalysis.cpp
index 139fa38..c214d3c 100644
--- a/lib/Analysis/NoAliasAnalysis.cpp
+++ b/lib/Analysis/NoAliasAnalysis.cpp

@@ -57,8 +57,9 @@
     Location getArgLocation(ImmutableCallSite CS, unsigned ArgIdx,
                             ModRefResult &Mask) override {
       Mask = ModRef;
-      return Location(CS.getArgument(ArgIdx), UnknownSize,
-                      CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa));
+      AAMDNodes AATags;
+      CS->getAAMetadata(AATags);
+      return Location(CS.getArgument(ArgIdx), UnknownSize, AATags);
     }
 
     ModRefResult getModRefInfo(ImmutableCallSite CS,

diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
index bfe8642..b3d060a 100644
--- a/lib/Analysis/PHITransAddr.cpp
+++ b/lib/Analysis/PHITransAddr.cpp

@@ -228,7 +228,7 @@
       return GEP;
 
     // Simplify the GEP to handle 'gep x, 0' -> x etc.
-    if (Value *V = SimplifyGEPInst(GEPOps, DL, TLI, DT)) {
+    if (Value *V = SimplifyGEPInst(GEPOps, DL, TLI, DT, AT)) {
       for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
         RemoveInstInputs(GEPOps[i], InstInputs);
 
@@ -283,7 +283,7 @@
         }
 
     // See if the add simplifies away.
-    if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, DL, TLI, DT)) {
+    if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, DL, TLI, DT, AT)) {
       // If we simplified the operands, the LHS is no longer an input, but Res
       // is.
       RemoveInstInputs(LHS, InstInputs);
@@ -369,7 +369,7 @@
                            SmallVectorImpl<Instruction*> &NewInsts) {
   // See if we have a version of this value already available and dominating
   // PredBB.  If so, there is no need to insert a new instance of it.
-  PHITransAddr Tmp(InVal, DL);
+  PHITransAddr Tmp(InVal, DL, AT);
   if (!Tmp.PHITranslateValue(CurBB, PredBB, &DT))
     return Tmp.getAddr();
 

diff --git a/lib/Analysis/PtrUseVisitor.cpp b/lib/Analysis/PtrUseVisitor.cpp
index 1b0f359..68c7535 100644
--- a/lib/Analysis/PtrUseVisitor.cpp
+++ b/lib/Analysis/PtrUseVisitor.cpp

@@ -17,7 +17,7 @@
 
 void detail::PtrUseVisitorBase::enqueueUsers(Instruction &I) {
   for (Use &U : I.uses()) {
-    if (VisitedUses.insert(&U)) {
+    if (VisitedUses.insert(&U).second) {
       UseToVisit NewU = {
         UseToVisit::UseAndIsOffsetKnownPair(&U, IsOffsetKnown),
         Offset

diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 7f88ae1..08ebf0d 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp

@@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionInfoImpl.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -25,21 +26,26 @@
 
 #define DEBUG_TYPE "region"
 
-// Always verify if expensive checking is enabled.
-#ifdef XDEBUG
-static bool VerifyRegionInfo = true;
-#else
-static bool VerifyRegionInfo = false;
-#endif
-
-static cl::opt<bool,true>
-VerifyRegionInfoX("verify-region-info", cl::location(VerifyRegionInfo),
-                cl::desc("Verify region info (time consuming)"));
+namespace llvm {
+template class RegionBase<RegionTraits<Function>>;
+template class RegionNodeBase<RegionTraits<Function>>;
+template class RegionInfoBase<RegionTraits<Function>>;
+}
 
 STATISTIC(numRegions,       "The # of regions");
 STATISTIC(numSimpleRegions, "The # of simple regions");
 
-static cl::opt<enum Region::PrintStyle> printStyle("print-region-style",
+// Always verify if expensive checking is enabled.
+
+static cl::opt<bool,true>
+VerifyRegionInfoX(
+  "verify-region-info",
+  cl::location(RegionInfoBase<RegionTraits<Function>>::VerifyRegionInfo),
+  cl::desc("Verify region info (time consuming)"));
+
+
+static cl::opt<Region::PrintStyle, true> printStyleX("print-region-style",
+  cl::location(RegionInfo::printStyle),
   cl::Hidden,
   cl::desc("style of printing regions"),
   cl::values(
@@ -49,812 +55,110 @@
     clEnumValN(Region::PrintRN, "rn",
                "print regions in detail with element_iterator"),
     clEnumValEnd));
+
+
 //===----------------------------------------------------------------------===//
-/// Region Implementation
-Region::Region(BasicBlock *Entry, BasicBlock *Exit, RegionInfo* RInfo,
-               DominatorTree *dt, Region *Parent)
-               : RegionNode(Parent, Entry, 1), RI(RInfo), DT(dt), exit(Exit) {}
+// Region implementation
+//
 
-Region::~Region() {
-  // Free the cached nodes.
-  for (BBNodeMapT::iterator it = BBNodeMap.begin(),
-         ie = BBNodeMap.end(); it != ie; ++it)
-    delete it->second;
+Region::Region(BasicBlock *Entry, BasicBlock *Exit,
+               RegionInfo* RI,
+               DominatorTree *DT, Region *Parent) :
+  RegionBase<RegionTraits<Function>>(Entry, Exit, RI, DT, Parent) {
 
-  // Only clean the cache for this Region. Caches of child Regions will be
-  // cleaned when the child Regions are deleted.
-  BBNodeMap.clear();
 }
 
-void Region::replaceEntry(BasicBlock *BB) {
-  entry.setPointer(BB);
-}
-
-void Region::replaceExit(BasicBlock *BB) {
-  assert(exit && "No exit to replace!");
-  exit = BB;
-}
-
-void Region::replaceEntryRecursive(BasicBlock *NewEntry) {
-  std::vector<Region *> RegionQueue;
-  BasicBlock *OldEntry = getEntry();
-
-  RegionQueue.push_back(this);
-  while (!RegionQueue.empty()) {
-    Region *R = RegionQueue.back();
-    RegionQueue.pop_back();
-
-    R->replaceEntry(NewEntry);
-    for (Region::const_iterator RI = R->begin(), RE = R->end(); RI != RE; ++RI)
-      if ((*RI)->getEntry() == OldEntry)
-        RegionQueue.push_back(RI->get());
-  }
-}
-
-void Region::replaceExitRecursive(BasicBlock *NewExit) {
-  std::vector<Region *> RegionQueue;
-  BasicBlock *OldExit = getExit();
-
-  RegionQueue.push_back(this);
-  while (!RegionQueue.empty()) {
-    Region *R = RegionQueue.back();
-    RegionQueue.pop_back();
-
-    R->replaceExit(NewExit);
-    for (Region::const_iterator RI = R->begin(), RE = R->end(); RI != RE; ++RI)
-      if ((*RI)->getExit() == OldExit)
-        RegionQueue.push_back(RI->get());
-  }
-}
-
-bool Region::contains(const BasicBlock *B) const {
-  BasicBlock *BB = const_cast<BasicBlock*>(B);
-
-  if (!DT->getNode(BB))
-    return false;
-
-  BasicBlock *entry = getEntry(), *exit = getExit();
-
-  // Toplevel region.
-  if (!exit)
-    return true;
-
-  return (DT->dominates(entry, BB)
-    && !(DT->dominates(exit, BB) && DT->dominates(entry, exit)));
-}
-
-bool Region::contains(const Loop *L) const {
-  // BBs that are not part of any loop are element of the Loop
-  // described by the NULL pointer. This loop is not part of any region,
-  // except if the region describes the whole function.
-  if (!L)
-    return getExit() == nullptr;
-
-  if (!contains(L->getHeader()))
-    return false;
-
-  SmallVector<BasicBlock *, 8> ExitingBlocks;
-  L->getExitingBlocks(ExitingBlocks);
-
-  for (SmallVectorImpl<BasicBlock*>::iterator BI = ExitingBlocks.begin(),
-       BE = ExitingBlocks.end(); BI != BE; ++BI)
-    if (!contains(*BI))
-      return false;
-
-  return true;
-}
-
-Loop *Region::outermostLoopInRegion(Loop *L) const {
-  if (!contains(L))
-    return nullptr;
-
-  while (L && contains(L->getParentLoop())) {
-    L = L->getParentLoop();
-  }
-
-  return L;
-}
-
-Loop *Region::outermostLoopInRegion(LoopInfo *LI, BasicBlock* BB) const {
-  assert(LI && BB && "LI and BB cannot be null!");
-  Loop *L = LI->getLoopFor(BB);
-  return outermostLoopInRegion(L);
-}
-
-BasicBlock *Region::getEnteringBlock() const {
-  BasicBlock *entry = getEntry();
-  BasicBlock *Pred;
-  BasicBlock *enteringBlock = nullptr;
-
-  for (pred_iterator PI = pred_begin(entry), PE = pred_end(entry); PI != PE;
-       ++PI) {
-    Pred = *PI;
-    if (DT->getNode(Pred) && !contains(Pred)) {
-      if (enteringBlock)
-        return nullptr;
-
-      enteringBlock = Pred;
-    }
-  }
-
-  return enteringBlock;
-}
-
-BasicBlock *Region::getExitingBlock() const {
-  BasicBlock *exit = getExit();
-  BasicBlock *Pred;
-  BasicBlock *exitingBlock = nullptr;
-
-  if (!exit)
-    return nullptr;
-
-  for (pred_iterator PI = pred_begin(exit), PE = pred_end(exit); PI != PE;
-       ++PI) {
-    Pred = *PI;
-    if (contains(Pred)) {
-      if (exitingBlock)
-        return nullptr;
-
-      exitingBlock = Pred;
-    }
-  }
-
-  return exitingBlock;
-}
-
-bool Region::isSimple() const {
-  return !isTopLevelRegion() && getEnteringBlock() && getExitingBlock();
-}
-
-std::string Region::getNameStr() const {
-  std::string exitName;
-  std::string entryName;
-
-  if (getEntry()->getName().empty()) {
-    raw_string_ostream OS(entryName);
-
-    getEntry()->printAsOperand(OS, false);
-  } else
-    entryName = getEntry()->getName();
-
-  if (getExit()) {
-    if (getExit()->getName().empty()) {
-      raw_string_ostream OS(exitName);
-
-      getExit()->printAsOperand(OS, false);
-    } else
-      exitName = getExit()->getName();
-  } else
-    exitName = "<Function Return>";
-
-  return entryName + " => " + exitName;
-}
-
-void Region::verifyBBInRegion(BasicBlock *BB) const {
-  if (!contains(BB))
-    llvm_unreachable("Broken region found!");
-
-  BasicBlock *entry = getEntry(), *exit = getExit();
-
-  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-    if (!contains(*SI) && exit != *SI)
-      llvm_unreachable("Broken region found!");
-
-  if (entry != BB)
-    for (pred_iterator SI = pred_begin(BB), SE = pred_end(BB); SI != SE; ++SI)
-      if (!contains(*SI))
-        llvm_unreachable("Broken region found!");
-}
-
-void Region::verifyWalk(BasicBlock *BB, std::set<BasicBlock*> *visited) const {
-  BasicBlock *exit = getExit();
-
-  visited->insert(BB);
-
-  verifyBBInRegion(BB);
-
-  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-    if (*SI != exit && visited->find(*SI) == visited->end())
-        verifyWalk(*SI, visited);
-}
-
-void Region::verifyRegion() const {
-  // Only do verification when user wants to, otherwise this expensive
-  // check will be invoked by PassManager.
-  if (!VerifyRegionInfo) return;
-
-  std::set<BasicBlock*> visited;
-  verifyWalk(getEntry(), &visited);
-}
-
-void Region::verifyRegionNest() const {
-  for (Region::const_iterator RI = begin(), RE = end(); RI != RE; ++RI)
-    (*RI)->verifyRegionNest();
-
-  verifyRegion();
-}
-
-Region::element_iterator Region::element_begin() {
-  return GraphTraits<Region*>::nodes_begin(this);
-}
-
-Region::element_iterator Region::element_end() {
-  return GraphTraits<Region*>::nodes_end(this);
-}
-
-Region::const_element_iterator Region::element_begin() const {
-  return GraphTraits<const Region*>::nodes_begin(this);
-}
-
-Region::const_element_iterator Region::element_end() const {
-  return GraphTraits<const Region*>::nodes_end(this);
-}
-
-Region* Region::getSubRegionNode(BasicBlock *BB) const {
-  Region *R = RI->getRegionFor(BB);
-
-  if (!R || R == this)
-    return nullptr;
-
-  // If we pass the BB out of this region, that means our code is broken.
-  assert(contains(R) && "BB not in current region!");
-
-  while (contains(R->getParent()) && R->getParent() != this)
-    R = R->getParent();
-
-  if (R->getEntry() != BB)
-    return nullptr;
-
-  return R;
-}
-
-RegionNode* Region::getBBNode(BasicBlock *BB) const {
-  assert(contains(BB) && "Can get BB node out of this region!");
-
-  BBNodeMapT::const_iterator at = BBNodeMap.find(BB);
-
-  if (at != BBNodeMap.end())
-    return at->second;
-
-  RegionNode *NewNode = new RegionNode(const_cast<Region*>(this), BB);
-  BBNodeMap.insert(std::make_pair(BB, NewNode));
-  return NewNode;
-}
-
-RegionNode* Region::getNode(BasicBlock *BB) const {
-  assert(contains(BB) && "Can get BB node out of this region!");
-  if (Region* Child = getSubRegionNode(BB))
-    return Child->getNode();
-
-  return getBBNode(BB);
-}
-
-void Region::transferChildrenTo(Region *To) {
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    (*I)->parent = To;
-    To->children.push_back(std::move(*I));
-  }
-  children.clear();
-}
-
-void Region::addSubRegion(Region *SubRegion, bool moveChildren) {
-  assert(!SubRegion->parent && "SubRegion already has a parent!");
-  assert(std::find_if(begin(), end(), [&](const std::unique_ptr<Region> &R) {
-           return R.get() == SubRegion;
-         }) == children.end() &&
-         "Subregion already exists!");
-
-  SubRegion->parent = this;
-  children.push_back(std::unique_ptr<Region>(SubRegion));
-
-  if (!moveChildren)
-    return;
-
-  assert(SubRegion->children.size() == 0
-         && "SubRegions that contain children are not supported");
-
-  for (element_iterator I = element_begin(), E = element_end(); I != E; ++I)
-    if (!(*I)->isSubRegion()) {
-      BasicBlock *BB = (*I)->getNodeAs<BasicBlock>();
-
-      if (SubRegion->contains(BB))
-        RI->setRegionFor(BB, SubRegion);
-    }
-
-  std::vector<std::unique_ptr<Region>> Keep;
-  for (iterator I = begin(), E = end(); I != E; ++I)
-    if (SubRegion->contains(I->get()) && I->get() != SubRegion) {
-      (*I)->parent = SubRegion;
-      SubRegion->children.push_back(std::move(*I));
-    } else
-      Keep.push_back(std::move(*I));
-
-  children.clear();
-  children.insert(children.begin(),
-                  std::move_iterator<RegionSet::iterator>(Keep.begin()),
-                  std::move_iterator<RegionSet::iterator>(Keep.end()));
-}
-
-
-Region *Region::removeSubRegion(Region *Child) {
-  assert(Child->parent == this && "Child is not a child of this region!");
-  Child->parent = nullptr;
-  RegionSet::iterator I = std::find_if(
-      children.begin(), children.end(),
-      [&](const std::unique_ptr<Region> &R) { return R.get() == Child; });
-  assert(I != children.end() && "Region does not exit. Unable to remove.");
-  children.erase(children.begin()+(I-begin()));
-  return Child;
-}
-
-unsigned Region::getDepth() const {
-  unsigned Depth = 0;
-
-  for (Region *R = parent; R != nullptr; R = R->parent)
-    ++Depth;
-
-  return Depth;
-}
-
-Region *Region::getExpandedRegion() const {
-  unsigned NumSuccessors = exit->getTerminator()->getNumSuccessors();
-
-  if (NumSuccessors == 0)
-    return nullptr;
-
-  for (pred_iterator PI = pred_begin(getExit()), PE = pred_end(getExit());
-       PI != PE; ++PI)
-    if (!DT->dominates(getEntry(), *PI))
-      return nullptr;
-
-  Region *R = RI->getRegionFor(exit);
-
-  if (R->getEntry() != exit) {
-    if (exit->getTerminator()->getNumSuccessors() == 1)
-      return new Region(getEntry(), *succ_begin(exit), RI, DT);
-    else
-      return nullptr;
-  }
-
-  while (R->getParent() && R->getParent()->getEntry() == exit)
-    R = R->getParent();
-
-  if (!DT->dominates(getEntry(), R->getExit()))
-    for (pred_iterator PI = pred_begin(getExit()), PE = pred_end(getExit());
-         PI != PE; ++PI)
-    if (!DT->dominates(R->getExit(), *PI))
-      return nullptr;
-
-  return new Region(getEntry(), R->getExit(), RI, DT);
-}
-
-void Region::print(raw_ostream &OS, bool print_tree, unsigned level,
-                   enum PrintStyle Style) const {
-  if (print_tree)
-    OS.indent(level*2) << "[" << level << "] " << getNameStr();
-  else
-    OS.indent(level*2) << getNameStr();
-
-  OS << "\n";
-
-
-  if (Style != PrintNone) {
-    OS.indent(level*2) << "{\n";
-    OS.indent(level*2 + 2);
-
-    if (Style == PrintBB) {
-      for (const auto &BB : blocks())
-        OS << BB->getName() << ", "; // TODO: remove the last ","
-    } else if (Style == PrintRN) {
-      for (const_element_iterator I = element_begin(), E = element_end(); I!=E; ++I)
-        OS << **I << ", "; // TODO: remove the last ",
-    }
-
-    OS << "\n";
-  }
-
-  if (print_tree)
-    for (const_iterator RI = begin(), RE = end(); RI != RE; ++RI)
-      (*RI)->print(OS, print_tree, level+1, Style);
-
-  if (Style != PrintNone)
-    OS.indent(level*2) << "} \n";
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void Region::dump() const {
-  print(dbgs(), true, getDepth(), printStyle.getValue());
-}
-#endif
-
-void Region::clearNodeCache() {
-  // Free the cached nodes.
-  for (BBNodeMapT::iterator I = BBNodeMap.begin(),
-       IE = BBNodeMap.end(); I != IE; ++I)
-    delete I->second;
-
-  BBNodeMap.clear();
-  for (Region::iterator RI = begin(), RE = end(); RI != RE; ++RI)
-    (*RI)->clearNodeCache();
-}
+Region::~Region() { }
 
 //===----------------------------------------------------------------------===//
 // RegionInfo implementation
 //
 
-bool RegionInfo::isCommonDomFrontier(BasicBlock *BB, BasicBlock *entry,
-                                     BasicBlock *exit) const {
-  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
-    BasicBlock *P = *PI;
-    if (DT->dominates(entry, P) && !DT->dominates(exit, P))
-      return false;
-  }
-  return true;
+RegionInfo::RegionInfo() :
+  RegionInfoBase<RegionTraits<Function>>() {
+
 }
 
-bool RegionInfo::isRegion(BasicBlock *entry, BasicBlock *exit) const {
-  assert(entry && exit && "entry and exit must not be null!");
-  typedef DominanceFrontier::DomSetType DST;
+RegionInfo::~RegionInfo() {
 
-  DST *entrySuccs = &DF->find(entry)->second;
-
-  // Exit is the header of a loop that contains the entry. In this case,
-  // the dominance frontier must only contain the exit.
-  if (!DT->dominates(entry, exit)) {
-    for (DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end();
-         SI != SE; ++SI)
-      if (*SI != exit && *SI != entry)
-        return false;
-
-    return true;
-  }
-
-  DST *exitSuccs = &DF->find(exit)->second;
-
-  // Do not allow edges leaving the region.
-  for (DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end();
-       SI != SE; ++SI) {
-    if (*SI == exit || *SI == entry)
-      continue;
-    if (exitSuccs->find(*SI) == exitSuccs->end())
-      return false;
-    if (!isCommonDomFrontier(*SI, entry, exit))
-      return false;
-  }
-
-  // Do not allow edges pointing into the region.
-  for (DST::iterator SI = exitSuccs->begin(), SE = exitSuccs->end();
-       SI != SE; ++SI)
-    if (DT->properlyDominates(entry, *SI) && *SI != exit)
-      return false;
-
-
-  return true;
-}
-
-void RegionInfo::insertShortCut(BasicBlock *entry, BasicBlock *exit,
-                             BBtoBBMap *ShortCut) const {
-  assert(entry && exit && "entry and exit must not be null!");
-
-  BBtoBBMap::iterator e = ShortCut->find(exit);
-
-  if (e == ShortCut->end())
-    // No further region at exit available.
-    (*ShortCut)[entry] = exit;
-  else {
-    // We found a region e that starts at exit. Therefore (entry, e->second)
-    // is also a region, that is larger than (entry, exit). Insert the
-    // larger one.
-    BasicBlock *BB = e->second;
-    (*ShortCut)[entry] = BB;
-  }
-}
-
-DomTreeNode* RegionInfo::getNextPostDom(DomTreeNode* N,
-                                        BBtoBBMap *ShortCut) const {
-  BBtoBBMap::iterator e = ShortCut->find(N->getBlock());
-
-  if (e == ShortCut->end())
-    return N->getIDom();
-
-  return PDT->getNode(e->second)->getIDom();
-}
-
-bool RegionInfo::isTrivialRegion(BasicBlock *entry, BasicBlock *exit) const {
-  assert(entry && exit && "entry and exit must not be null!");
-
-  unsigned num_successors = succ_end(entry) - succ_begin(entry);
-
-  if (num_successors <= 1 && exit == *(succ_begin(entry)))
-    return true;
-
-  return false;
 }
 
 void RegionInfo::updateStatistics(Region *R) {
   ++numRegions;
 
   // TODO: Slow. Should only be enabled if -stats is used.
-  if (R->isSimple()) ++numSimpleRegions;
+  if (R->isSimple())
+    ++numSimpleRegions;
 }
 
-Region *RegionInfo::createRegion(BasicBlock *entry, BasicBlock *exit) {
-  assert(entry && exit && "entry and exit must not be null!");
+void RegionInfo::recalculate(Function &F, DominatorTree *DT_,
+                             PostDominatorTree *PDT_, DominanceFrontier *DF_) {
+  DT = DT_;
+  PDT = PDT_;
+  DF = DF_;
 
-  if (isTrivialRegion(entry, exit))
-    return nullptr;
-
-  Region *region = new Region(entry, exit, this, DT);
-  BBtoRegion.insert(std::make_pair(entry, region));
-
- #ifdef XDEBUG
-    region->verifyRegion();
- #else
-    DEBUG(region->verifyRegion());
- #endif
-
-  updateStatistics(region);
-  return region;
-}
-
-void RegionInfo::findRegionsWithEntry(BasicBlock *entry, BBtoBBMap *ShortCut) {
-  assert(entry);
-
-  DomTreeNode *N = PDT->getNode(entry);
-
-  if (!N)
-    return;
-
-  Region *lastRegion= nullptr;
-  BasicBlock *lastExit = entry;
-
-  // As only a BasicBlock that postdominates entry can finish a region, walk the
-  // post dominance tree upwards.
-  while ((N = getNextPostDom(N, ShortCut))) {
-    BasicBlock *exit = N->getBlock();
-
-    if (!exit)
-      break;
-
-    if (isRegion(entry, exit)) {
-      Region *newRegion = createRegion(entry, exit);
-
-      if (lastRegion)
-        newRegion->addSubRegion(lastRegion);
-
-      lastRegion = newRegion;
-      lastExit = exit;
-    }
-
-    // This can never be a region, so stop the search.
-    if (!DT->dominates(entry, exit))
-      break;
-  }
-
-  // Tried to create regions from entry to lastExit.  Next time take a
-  // shortcut from entry to lastExit.
-  if (lastExit != entry)
-    insertShortCut(entry, lastExit, ShortCut);
-}
-
-void RegionInfo::scanForRegions(Function &F, BBtoBBMap *ShortCut) {
-  BasicBlock *entry = &(F.getEntryBlock());
-  DomTreeNode *N = DT->getNode(entry);
-
-  // Iterate over the dominance tree in post order to start with the small
-  // regions from the bottom of the dominance tree.  If the small regions are
-  // detected first, detection of bigger regions is faster, as we can jump
-  // over the small regions.
-  for (po_iterator<DomTreeNode*> FI = po_begin(N), FE = po_end(N); FI != FE;
-    ++FI) {
-    findRegionsWithEntry(FI->getBlock(), ShortCut);
-  }
-}
-
-Region *RegionInfo::getTopMostParent(Region *region) {
-  while (region->parent)
-    region = region->getParent();
-
-  return region;
-}
-
-void RegionInfo::buildRegionsTree(DomTreeNode *N, Region *region) {
-  BasicBlock *BB = N->getBlock();
-
-  // Passed region exit
-  while (BB == region->getExit())
-    region = region->getParent();
-
-  BBtoRegionMap::iterator it = BBtoRegion.find(BB);
-
-  // This basic block is a start block of a region. It is already in the
-  // BBtoRegion relation. Only the child basic blocks have to be updated.
-  if (it != BBtoRegion.end()) {
-    Region *newRegion = it->second;
-    region->addSubRegion(getTopMostParent(newRegion));
-    region = newRegion;
-  } else {
-    BBtoRegion[BB] = region;
-  }
-
-  for (DomTreeNode::iterator CI = N->begin(), CE = N->end(); CI != CE; ++CI)
-    buildRegionsTree(*CI, region);
-}
-
-void RegionInfo::releaseMemory() {
-  BBtoRegion.clear();
-  if (TopLevelRegion)
-    delete TopLevelRegion;
-  TopLevelRegion = nullptr;
-}
-
-RegionInfo::RegionInfo() : FunctionPass(ID) {
-  initializeRegionInfoPass(*PassRegistry::getPassRegistry());
-  TopLevelRegion = nullptr;
-}
-
-RegionInfo::~RegionInfo() {
-  releaseMemory();
-}
-
-void RegionInfo::Calculate(Function &F) {
-  // ShortCut a function where for every BB the exit of the largest region
-  // starting with BB is stored. These regions can be threated as single BBS.
-  // This improves performance on linear CFGs.
-  BBtoBBMap ShortCut;
-
-  scanForRegions(F, &ShortCut);
-  BasicBlock *BB = &F.getEntryBlock();
-  buildRegionsTree(DT->getNode(BB), TopLevelRegion);
-}
-
-bool RegionInfo::runOnFunction(Function &F) {
-  releaseMemory();
-
-  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  PDT = &getAnalysis<PostDominatorTree>();
-  DF = &getAnalysis<DominanceFrontier>();
-
-  TopLevelRegion = new Region(&F.getEntryBlock(), nullptr, this, DT, nullptr);
+  TopLevelRegion = new Region(&F.getEntryBlock(), nullptr,
+                              this, DT, nullptr);
   updateStatistics(TopLevelRegion);
+  calculate(F);
+}
 
-  Calculate(F);
+//===----------------------------------------------------------------------===//
+// RegionInfoPass implementation
+//
 
+RegionInfoPass::RegionInfoPass() : FunctionPass(ID) {
+  initializeRegionInfoPassPass(*PassRegistry::getPassRegistry());
+}
+
+RegionInfoPass::~RegionInfoPass() {
+
+}
+
+bool RegionInfoPass::runOnFunction(Function &F) {
+  releaseMemory();
+
+  auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto PDT = &getAnalysis<PostDominatorTree>();
+  auto DF = &getAnalysis<DominanceFrontier>();
+
+  RI.recalculate(F, DT, PDT, DF);
   return false;
 }
 
-void RegionInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+void RegionInfoPass::releaseMemory() {
+  RI.releaseMemory();
+}
+
+void RegionInfoPass::verifyAnalysis() const {
+    RI.verifyAnalysis();
+}
+
+void RegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequiredTransitive<DominatorTreeWrapperPass>();
   AU.addRequired<PostDominatorTree>();
   AU.addRequired<DominanceFrontier>();
 }
 
-void RegionInfo::print(raw_ostream &OS, const Module *) const {
-  OS << "Region tree:\n";
-  TopLevelRegion->print(OS, true, 0, printStyle.getValue());
-  OS << "End region tree\n";
+void RegionInfoPass::print(raw_ostream &OS, const Module *) const {
+  RI.print(OS);
 }
 
-void RegionInfo::verifyAnalysis() const {
-  // Only do verification when user wants to, otherwise this expensive check
-  // will be invoked by PMDataManager::verifyPreservedAnalysis when
-  // a regionpass (marked PreservedAll) finish.
-  if (!VerifyRegionInfo) return;
-
-  TopLevelRegion->verifyRegionNest();
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void RegionInfoPass::dump() const {
+  RI.dump();
 }
+#endif
 
-// Region pass manager support.
-Region *RegionInfo::getRegionFor(BasicBlock *BB) const {
-  BBtoRegionMap::const_iterator I=
-    BBtoRegion.find(BB);
-  return I != BBtoRegion.end() ? I->second : nullptr;
-}
+char RegionInfoPass::ID = 0;
 
-void RegionInfo::setRegionFor(BasicBlock *BB, Region *R) {
-  BBtoRegion[BB] = R;
-}
-
-Region *RegionInfo::operator[](BasicBlock *BB) const {
-  return getRegionFor(BB);
-}
-
-BasicBlock *RegionInfo::getMaxRegionExit(BasicBlock *BB) const {
-  BasicBlock *Exit = nullptr;
-
-  while (true) {
-    // Get largest region that starts at BB.
-    Region *R = getRegionFor(BB);
-    while (R && R->getParent() && R->getParent()->getEntry() == BB)
-      R = R->getParent();
-
-    // Get the single exit of BB.
-    if (R && R->getEntry() == BB)
-      Exit = R->getExit();
-    else if (++succ_begin(BB) == succ_end(BB))
-      Exit = *succ_begin(BB);
-    else // No single exit exists.
-      return Exit;
-
-    // Get largest region that starts at Exit.
-    Region *ExitR = getRegionFor(Exit);
-    while (ExitR && ExitR->getParent()
-           && ExitR->getParent()->getEntry() == Exit)
-      ExitR = ExitR->getParent();
-
-    for (pred_iterator PI = pred_begin(Exit), PE = pred_end(Exit); PI != PE;
-         ++PI)
-      if (!R->contains(*PI) && !ExitR->contains(*PI))
-        break;
-
-    // This stops infinite cycles.
-    if (DT->dominates(Exit, BB))
-      break;
-
-    BB = Exit;
-  }
-
-  return Exit;
-}
-
-Region*
-RegionInfo::getCommonRegion(Region *A, Region *B) const {
-  assert (A && B && "One of the Regions is NULL");
-
-  if (A->contains(B)) return A;
-
-  while (!B->contains(A))
-    B = B->getParent();
-
-  return B;
-}
-
-Region*
-RegionInfo::getCommonRegion(SmallVectorImpl<Region*> &Regions) const {
-  Region* ret = Regions.back();
-  Regions.pop_back();
-
-  for (SmallVectorImpl<Region*>::const_iterator I = Regions.begin(),
-       E = Regions.end(); I != E; ++I)
-      ret = getCommonRegion(ret, *I);
-
-  return ret;
-}
-
-Region*
-RegionInfo::getCommonRegion(SmallVectorImpl<BasicBlock*> &BBs) const {
-  Region* ret = getRegionFor(BBs.back());
-  BBs.pop_back();
-
-  for (SmallVectorImpl<BasicBlock*>::const_iterator I = BBs.begin(),
-       E = BBs.end(); I != E; ++I)
-      ret = getCommonRegion(ret, getRegionFor(*I));
-
-  return ret;
-}
-
-void RegionInfo::splitBlock(BasicBlock* NewBB, BasicBlock *OldBB)
-{
-  Region *R = getRegionFor(OldBB);
-
-  setRegionFor(NewBB, R);
-
-  while (R->getEntry() == OldBB && !R->isTopLevelRegion()) {
-    R->replaceEntry(NewBB);
-    R = R->getParent();
-  }
-
-  setRegionFor(OldBB, R);
-}
-
-char RegionInfo::ID = 0;
-INITIALIZE_PASS_BEGIN(RegionInfo, "regions",
+INITIALIZE_PASS_BEGIN(RegionInfoPass, "regions",
                 "Detect single entry single exit regions", true, true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(DominanceFrontier)
-INITIALIZE_PASS_END(RegionInfo, "regions",
+INITIALIZE_PASS_END(RegionInfoPass, "regions",
                 "Detect single entry single exit regions", true, true)
 
 // Create methods available outside of this file, to use them
@@ -863,7 +167,7 @@
 
 namespace llvm {
   FunctionPass *createRegionInfoPass() {
-    return new RegionInfo();
+    return new RegionInfoPass();
   }
 }
 

diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 71de144..de34b72 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp

@@ -45,14 +45,14 @@
 
 /// Pass Manager itself does not invalidate any analysis info.
 void RGPassManager::getAnalysisUsage(AnalysisUsage &Info) const {
-  Info.addRequired<RegionInfo>();
+  Info.addRequired<RegionInfoPass>();
   Info.setPreservesAll();
 }
 
 /// run - Execute all of the passes scheduled for execution.  Keep track of
 /// whether any of the passes modifies the function, and if so, return true.
 bool RGPassManager::runOnFunction(Function &F) {
-  RI = &getAnalysis<RegionInfo>();
+  RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
   bool Changed = false;
 
   // Collect inherited analysis from Module level pass manager.

diff --git a/lib/Analysis/RegionPrinter.cpp b/lib/Analysis/RegionPrinter.cpp
index 893210a..ad83113 100644
--- a/lib/Analysis/RegionPrinter.cpp
+++ b/lib/Analysis/RegionPrinter.cpp

@@ -56,23 +56,24 @@
 };
 
 template<>
-struct DOTGraphTraits<RegionInfo*> : public DOTGraphTraits<RegionNode*> {
+struct DOTGraphTraits<RegionInfoPass*> : public DOTGraphTraits<RegionNode*> {
 
-  DOTGraphTraits (bool isSimple=false)
+  DOTGraphTraits (bool isSimple = false)
     : DOTGraphTraits<RegionNode*>(isSimple) {}
 
-  static std::string getGraphName(RegionInfo *DT) {
+  static std::string getGraphName(RegionInfoPass *DT) {
     return "Region Graph";
   }
 
-  std::string getNodeLabel(RegionNode *Node, RegionInfo *G) {
+  std::string getNodeLabel(RegionNode *Node, RegionInfoPass *G) {
+    RegionInfo &RI = G->getRegionInfo();
     return DOTGraphTraits<RegionNode*>::getNodeLabel(Node,
-                                                     G->getTopLevelRegion());
+                                                     reinterpret_cast<RegionNode*>(RI.getTopLevelRegion()));
   }
 
   std::string getEdgeAttributes(RegionNode *srcNode,
-    GraphTraits<RegionInfo*>::ChildIteratorType CI, RegionInfo *RI) {
-
+    GraphTraits<RegionInfo*>::ChildIteratorType CI, RegionInfoPass *G) {
+    RegionInfo &RI = G->getRegionInfo();
     RegionNode *destNode = *CI;
 
     if (srcNode->isSubRegion() || destNode->isSubRegion())
@@ -82,7 +83,7 @@
     BasicBlock *srcBB = srcNode->getNodeAs<BasicBlock>();
     BasicBlock *destBB = destNode->getNodeAs<BasicBlock>();
 
-    Region *R = RI->getRegionFor(destBB);
+    Region *R = RI.getRegionFor(destBB);
 
     while (R && R->getParent())
       if (R->getParent()->getEntry() == destBB)
@@ -98,7 +99,8 @@
 
   // Print the cluster of the subregions. This groups the single basic blocks
   // and adds a different background color for each group.
-  static void printRegionCluster(const Region &R, GraphWriter<RegionInfo*> &GW,
+  static void printRegionCluster(const Region &R,
+                                 GraphWriter<RegionInfoPass*> &GW,
                                  unsigned depth = 0) {
     raw_ostream &O = GW.getOStream();
     O.indent(2 * depth) << "subgraph cluster_" << static_cast<const void*>(&R)
@@ -119,22 +121,23 @@
     for (Region::const_iterator RI = R.begin(), RE = R.end(); RI != RE; ++RI)
       printRegionCluster(**RI, GW, depth + 1);
 
-    RegionInfo *RI = R.getRegionInfo();
+    const RegionInfo &RI = *static_cast<const RegionInfo*>(R.getRegionInfo());
 
     for (const auto &BB : R.blocks())
-      if (RI->getRegionFor(BB) == &R)
+      if (RI.getRegionFor(BB) == &R)
         O.indent(2 * (depth + 1)) << "Node"
-          << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(BB))
+          << static_cast<const void*>(RI.getTopLevelRegion()->getBBNode(BB))
           << ";\n";
 
     O.indent(2 * depth) << "}\n";
   }
 
-  static void addCustomGraphFeatures(const RegionInfo* RI,
-                                     GraphWriter<RegionInfo*> &GW) {
+  static void addCustomGraphFeatures(const RegionInfoPass* RIP,
+                                     GraphWriter<RegionInfoPass*> &GW) {
+    const RegionInfo &RI = RIP->getRegionInfo();
     raw_ostream &O = GW.getOStream();
     O << "\tcolorscheme = \"paired12\"\n";
-    printRegionCluster(*RI->getTopLevelRegion(), GW, 4);
+    printRegionCluster(*RI.getTopLevelRegion(), GW, 4);
   }
 };
 } //end namespace llvm
@@ -142,28 +145,28 @@
 namespace {
 
 struct RegionViewer
-  : public DOTGraphTraitsViewer<RegionInfo, false> {
+  : public DOTGraphTraitsViewer<RegionInfoPass, false> {
   static char ID;
-  RegionViewer() : DOTGraphTraitsViewer<RegionInfo, false>("reg", ID){
+  RegionViewer() : DOTGraphTraitsViewer<RegionInfoPass, false>("reg", ID){
     initializeRegionViewerPass(*PassRegistry::getPassRegistry());
   }
 };
 char RegionViewer::ID = 0;
 
 struct RegionOnlyViewer
-  : public DOTGraphTraitsViewer<RegionInfo, true> {
+  : public DOTGraphTraitsViewer<RegionInfoPass, true> {
   static char ID;
-  RegionOnlyViewer() : DOTGraphTraitsViewer<RegionInfo, true>("regonly", ID) {
+  RegionOnlyViewer() : DOTGraphTraitsViewer<RegionInfoPass, true>("regonly", ID) {
     initializeRegionOnlyViewerPass(*PassRegistry::getPassRegistry());
   }
 };
 char RegionOnlyViewer::ID = 0;
 
 struct RegionPrinter
-  : public DOTGraphTraitsPrinter<RegionInfo, false> {
+  : public DOTGraphTraitsPrinter<RegionInfoPass, false> {
   static char ID;
   RegionPrinter() :
-    DOTGraphTraitsPrinter<RegionInfo, false>("reg", ID) {
+    DOTGraphTraitsPrinter<RegionInfoPass, false>("reg", ID) {
       initializeRegionPrinterPass(*PassRegistry::getPassRegistry());
     }
 };
@@ -175,7 +178,7 @@
 
 INITIALIZE_PASS(RegionViewer, "view-regions", "View regions of function",
                 true, true)
-                
+
 INITIALIZE_PASS(RegionOnlyViewer, "view-regions-only",
                 "View regions of function (with no function bodies)",
                 true, true)
@@ -183,10 +186,10 @@
 namespace {
 
 struct RegionOnlyPrinter
-  : public DOTGraphTraitsPrinter<RegionInfo, true> {
+  : public DOTGraphTraitsPrinter<RegionInfoPass, true> {
   static char ID;
   RegionOnlyPrinter() :
-    DOTGraphTraitsPrinter<RegionInfo, true>("reg", ID) {
+    DOTGraphTraitsPrinter<RegionInfoPass, true>("reg", ID) {
       initializeRegionOnlyPrinterPass(*PassRegistry::getPassRegistry());
     }
 };

diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 06dbde5..68549ef 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp

@@ -1,4 +1,4 @@
-//===- ScalarEvolution.cpp - Scalar Evolution Analysis ----------*- C++ -*-===//
+//===- ScalarEvolution.cpp - Scalar Evolution Analysis --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -59,9 +59,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -78,6 +80,7 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -113,6 +116,7 @@
 
 INITIALIZE_PASS_BEGIN(ScalarEvolution, "scalar-evolution",
                 "Scalar Evolution Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
@@ -671,7 +675,321 @@
   }
 }
 
+static const APInt srem(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue();
+  APInt B = C2->getValue()->getValue();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
 
+  if (ABW > BBW)
+    B = B.sext(ABW);
+  else if (ABW < BBW)
+    A = A.sext(BBW);
+
+  return APIntOps::srem(A, B);
+}
+
+static const APInt sdiv(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue();
+  APInt B = C2->getValue()->getValue();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
+
+  if (ABW > BBW)
+    B = B.sext(ABW);
+  else if (ABW < BBW)
+    A = A.sext(BBW);
+
+  return APIntOps::sdiv(A, B);
+}
+
+static const APInt urem(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue();
+  APInt B = C2->getValue()->getValue();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
+
+  if (ABW > BBW)
+    B = B.zext(ABW);
+  else if (ABW < BBW)
+    A = A.zext(BBW);
+
+  return APIntOps::urem(A, B);
+}
+
+static const APInt udiv(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue();
+  APInt B = C2->getValue()->getValue();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
+
+  if (ABW > BBW)
+    B = B.zext(ABW);
+  else if (ABW < BBW)
+    A = A.zext(BBW);
+
+  return APIntOps::udiv(A, B);
+}
+
+namespace {
+struct FindSCEVSize {
+  int Size;
+  FindSCEVSize() : Size(0) {}
+
+  bool follow(const SCEV *S) {
+    ++Size;
+    // Keep looking at all operands of S.
+    return true;
+  }
+  bool isDone() const {
+    return false;
+  }
+};
+}
+
+// Returns the size of the SCEV S.
+static inline int sizeOfSCEV(const SCEV *S) {
+  FindSCEVSize F;
+  SCEVTraversal<FindSCEVSize> ST(F);
+  ST.visitAll(S);
+  return F.Size;
+}
+
+namespace {
+
+template <typename Derived>
+struct SCEVDivision : public SCEVVisitor<Derived, void> {
+public:
+  // Computes the Quotient and Remainder of the division of Numerator by
+  // Denominator.
+  static void divide(ScalarEvolution &SE, const SCEV *Numerator,
+                     const SCEV *Denominator, const SCEV **Quotient,
+                     const SCEV **Remainder) {
+    assert(Numerator && Denominator && "Uninitialized SCEV");
+
+    Derived D(SE, Numerator, Denominator);
+
+    // Check for the trivial case here to avoid having to check for it in the
+    // rest of the code.
+    if (Numerator == Denominator) {
+      *Quotient = D.One;
+      *Remainder = D.Zero;
+      return;
+    }
+
+    if (Numerator->isZero()) {
+      *Quotient = D.Zero;
+      *Remainder = D.Zero;
+      return;
+    }
+
+    // Split the Denominator when it is a product.
+    if (const SCEVMulExpr *T = dyn_cast<const SCEVMulExpr>(Denominator)) {
+      const SCEV *Q, *R;
+      *Quotient = Numerator;
+      for (const SCEV *Op : T->operands()) {
+        divide(SE, *Quotient, Op, &Q, &R);
+        *Quotient = Q;
+
+        // Bail out when the Numerator is not divisible by one of the terms of
+        // the Denominator.
+        if (!R->isZero()) {
+          *Quotient = D.Zero;
+          *Remainder = Numerator;
+          return;
+        }
+      }
+      *Remainder = D.Zero;
+      return;
+    }
+
+    D.visit(Numerator);
+    *Quotient = D.Quotient;
+    *Remainder = D.Remainder;
+  }
+
+  // Except in the trivial case described above, we do not know how to divide
+  // Expr by Denominator for the following functions with empty implementation.
+  void visitTruncateExpr(const SCEVTruncateExpr *Numerator) {}
+  void visitZeroExtendExpr(const SCEVZeroExtendExpr *Numerator) {}
+  void visitSignExtendExpr(const SCEVSignExtendExpr *Numerator) {}
+  void visitUDivExpr(const SCEVUDivExpr *Numerator) {}
+  void visitSMaxExpr(const SCEVSMaxExpr *Numerator) {}
+  void visitUMaxExpr(const SCEVUMaxExpr *Numerator) {}
+  void visitUnknown(const SCEVUnknown *Numerator) {}
+  void visitCouldNotCompute(const SCEVCouldNotCompute *Numerator) {}
+
+  void visitAddRecExpr(const SCEVAddRecExpr *Numerator) {
+    const SCEV *StartQ, *StartR, *StepQ, *StepR;
+    assert(Numerator->isAffine() && "Numerator should be affine");
+    divide(SE, Numerator->getStart(), Denominator, &StartQ, &StartR);
+    divide(SE, Numerator->getStepRecurrence(SE), Denominator, &StepQ, &StepR);
+    Quotient = SE.getAddRecExpr(StartQ, StepQ, Numerator->getLoop(),
+                                Numerator->getNoWrapFlags());
+    Remainder = SE.getAddRecExpr(StartR, StepR, Numerator->getLoop(),
+                                 Numerator->getNoWrapFlags());
+  }
+
+  void visitAddExpr(const SCEVAddExpr *Numerator) {
+    SmallVector<const SCEV *, 2> Qs, Rs;
+    Type *Ty = Denominator->getType();
+
+    for (const SCEV *Op : Numerator->operands()) {
+      const SCEV *Q, *R;
+      divide(SE, Op, Denominator, &Q, &R);
+
+      // Bail out if types do not match.
+      if (Ty != Q->getType() || Ty != R->getType()) {
+        Quotient = Zero;
+        Remainder = Numerator;
+        return;
+      }
+
+      Qs.push_back(Q);
+      Rs.push_back(R);
+    }
+
+    if (Qs.size() == 1) {
+      Quotient = Qs[0];
+      Remainder = Rs[0];
+      return;
+    }
+
+    Quotient = SE.getAddExpr(Qs);
+    Remainder = SE.getAddExpr(Rs);
+  }
+
+  void visitMulExpr(const SCEVMulExpr *Numerator) {
+    SmallVector<const SCEV *, 2> Qs;
+    Type *Ty = Denominator->getType();
+
+    bool FoundDenominatorTerm = false;
+    for (const SCEV *Op : Numerator->operands()) {
+      // Bail out if types do not match.
+      if (Ty != Op->getType()) {
+        Quotient = Zero;
+        Remainder = Numerator;
+        return;
+      }
+
+      if (FoundDenominatorTerm) {
+        Qs.push_back(Op);
+        continue;
+      }
+
+      // Check whether Denominator divides one of the product operands.
+      const SCEV *Q, *R;
+      divide(SE, Op, Denominator, &Q, &R);
+      if (!R->isZero()) {
+        Qs.push_back(Op);
+        continue;
+      }
+
+      // Bail out if types do not match.
+      if (Ty != Q->getType()) {
+        Quotient = Zero;
+        Remainder = Numerator;
+        return;
+      }
+
+      FoundDenominatorTerm = true;
+      Qs.push_back(Q);
+    }
+
+    if (FoundDenominatorTerm) {
+      Remainder = Zero;
+      if (Qs.size() == 1)
+        Quotient = Qs[0];
+      else
+        Quotient = SE.getMulExpr(Qs);
+      return;
+    }
+
+    if (!isa<SCEVUnknown>(Denominator)) {
+      Quotient = Zero;
+      Remainder = Numerator;
+      return;
+    }
+
+    // The Remainder is obtained by replacing Denominator by 0 in Numerator.
+    ValueToValueMap RewriteMap;
+    RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
+        cast<SCEVConstant>(Zero)->getValue();
+    Remainder = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
+
+    if (Remainder->isZero()) {
+      // The Quotient is obtained by replacing Denominator by 1 in Numerator.
+      RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
+          cast<SCEVConstant>(One)->getValue();
+      Quotient =
+          SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
+      return;
+    }
+
+    // Quotient is (Numerator - Remainder) divided by Denominator.
+    const SCEV *Q, *R;
+    const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder);
+    if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator)) {
+      // This SCEV does not seem to simplify: fail the division here.
+      Quotient = Zero;
+      Remainder = Numerator;
+      return;
+    }
+    divide(SE, Diff, Denominator, &Q, &R);
+    assert(R == Zero &&
+           "(Numerator - Remainder) should evenly divide Denominator");
+    Quotient = Q;
+  }
+
+private:
+  SCEVDivision(ScalarEvolution &S, const SCEV *Numerator,
+               const SCEV *Denominator)
+      : SE(S), Denominator(Denominator) {
+    Zero = SE.getConstant(Denominator->getType(), 0);
+    One = SE.getConstant(Denominator->getType(), 1);
+
+    // By default, we don't know how to divide Expr by Denominator.
+    // Providing the default here simplifies the rest of the code.
+    Quotient = Zero;
+    Remainder = Numerator;
+  }
+
+  ScalarEvolution &SE;
+  const SCEV *Denominator, *Quotient, *Remainder, *Zero, *One;
+
+  friend struct SCEVSDivision;
+  friend struct SCEVUDivision;
+};
+
+struct SCEVSDivision : public SCEVDivision<SCEVSDivision> {
+  SCEVSDivision(ScalarEvolution &S, const SCEV *Numerator,
+                const SCEV *Denominator)
+      : SCEVDivision(S, Numerator, Denominator) {}
+
+  void visitConstant(const SCEVConstant *Numerator) {
+    if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) {
+      Quotient = SE.getConstant(sdiv(Numerator, D));
+      Remainder = SE.getConstant(srem(Numerator, D));
+      return;
+    }
+  }
+};
+
+struct SCEVUDivision : public SCEVDivision<SCEVUDivision> {
+  SCEVUDivision(ScalarEvolution &S, const SCEV *Numerator,
+                const SCEV *Denominator)
+      : SCEVDivision(S, Numerator, Denominator) {}
+
+  void visitConstant(const SCEVConstant *Numerator) {
+    if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) {
+      Quotient = SE.getConstant(udiv(Numerator, D));
+      Remainder = SE.getConstant(urem(Numerator, D));
+      return;
+    }
+  }
+};
+
+}
 
 //===----------------------------------------------------------------------===//
 //                      Simple SCEV method implementations
@@ -2061,71 +2379,66 @@
     // Okay, if there weren't any loop invariants to be folded, check to see if
     // there are multiple AddRec's with the same loop induction variable being
     // multiplied together.  If so, we can fold them.
+
+    // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L>
+    // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [
+    //       choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z
+    //   ]]],+,...up to x=2n}.
+    // Note that the arguments to choose() are always integers with values
+    // known at compile time, never SCEV objects.
+    //
+    // The implementation avoids pointless extra computations when the two
+    // addrec's are of different length (mathematically, it's equivalent to
+    // an infinite stream of zeros on the right).
+    bool OpsModified = false;
     for (unsigned OtherIdx = Idx+1;
-         OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
+         OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
          ++OtherIdx) {
-      if (AddRecLoop != cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop())
+      const SCEVAddRecExpr *OtherAddRec =
+        dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]);
+      if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop)
         continue;
 
-      // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L>
-      // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [
-      //       choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z
-      //   ]]],+,...up to x=2n}.
-      // Note that the arguments to choose() are always integers with values
-      // known at compile time, never SCEV objects.
-      //
-      // The implementation avoids pointless extra computations when the two
-      // addrec's are of different length (mathematically, it's equivalent to
-      // an infinite stream of zeros on the right).
-      bool OpsModified = false;
-      for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
-           ++OtherIdx) {
-        const SCEVAddRecExpr *OtherAddRec =
-          dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]);
-        if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop)
-          continue;
-
-        bool Overflow = false;
-        Type *Ty = AddRec->getType();
-        bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64;
-        SmallVector<const SCEV*, 7> AddRecOps;
-        for (int x = 0, xe = AddRec->getNumOperands() +
-               OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) {
-          const SCEV *Term = getConstant(Ty, 0);
-          for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) {
-            uint64_t Coeff1 = Choose(x, 2*x - y, Overflow);
-            for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1),
-                   ze = std::min(x+1, (int)OtherAddRec->getNumOperands());
-                 z < ze && !Overflow; ++z) {
-              uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow);
-              uint64_t Coeff;
-              if (LargerThan64Bits)
-                Coeff = umul_ov(Coeff1, Coeff2, Overflow);
-              else
-                Coeff = Coeff1*Coeff2;
-              const SCEV *CoeffTerm = getConstant(Ty, Coeff);
-              const SCEV *Term1 = AddRec->getOperand(y-z);
-              const SCEV *Term2 = OtherAddRec->getOperand(z);
-              Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2));
-            }
+      bool Overflow = false;
+      Type *Ty = AddRec->getType();
+      bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64;
+      SmallVector<const SCEV*, 7> AddRecOps;
+      for (int x = 0, xe = AddRec->getNumOperands() +
+             OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) {
+        const SCEV *Term = getConstant(Ty, 0);
+        for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) {
+          uint64_t Coeff1 = Choose(x, 2*x - y, Overflow);
+          for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1),
+                 ze = std::min(x+1, (int)OtherAddRec->getNumOperands());
+               z < ze && !Overflow; ++z) {
+            uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow);
+            uint64_t Coeff;
+            if (LargerThan64Bits)
+              Coeff = umul_ov(Coeff1, Coeff2, Overflow);
+            else
+              Coeff = Coeff1*Coeff2;
+            const SCEV *CoeffTerm = getConstant(Ty, Coeff);
+            const SCEV *Term1 = AddRec->getOperand(y-z);
+            const SCEV *Term2 = OtherAddRec->getOperand(z);
+            Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2));
           }
-          AddRecOps.push_back(Term);
         }
-        if (!Overflow) {
-          const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(),
-                                                SCEV::FlagAnyWrap);
-          if (Ops.size() == 2) return NewAddRec;
-          Ops[Idx] = NewAddRec;
-          Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
-          OpsModified = true;
-          AddRec = dyn_cast<SCEVAddRecExpr>(NewAddRec);
-          if (!AddRec)
-            break;
-        }
+        AddRecOps.push_back(Term);
       }
-      if (OpsModified)
-        return getMulExpr(Ops);
+      if (!Overflow) {
+        const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(),
+                                              SCEV::FlagAnyWrap);
+        if (Ops.size() == 2) return NewAddRec;
+        Ops[Idx] = NewAddRec;
+        Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+        OpsModified = true;
+        AddRec = dyn_cast<SCEVAddRecExpr>(NewAddRec);
+        if (!AddRec)
+          break;
+      }
     }
+    if (OpsModified)
+      return getMulExpr(Ops);
 
     // Otherwise couldn't fold anything into this recurrence.  Move onto the
     // next one.
@@ -3082,7 +3395,8 @@
   Visited.insert(PN);
   while (!Worklist.empty()) {
     Instruction *I = Worklist.pop_back_val();
-    if (!Visited.insert(I)) continue;
+    if (!Visited.insert(I).second)
+      continue;
 
     ValueExprMapType::iterator It =
       ValueExprMap.find_as(static_cast<Value *>(I));
@@ -3263,7 +3577,7 @@
   // PHI's incoming blocks are in a different loop, in which case doing so
   // risks breaking LCSSA form. Instcombine would normally zap these, but
   // it doesn't have DominatorTree information, so it may miss cases.
-  if (Value *V = SimplifyInstruction(PN, DL, TLI, DT))
+  if (Value *V = SimplifyInstruction(PN, DL, TLI, DT, AT))
     if (LI->replacementPreservesLCSSAForm(PN, V))
       return getSCEV(V);
 
@@ -3395,7 +3709,7 @@
     // For a SCEVUnknown, ask ValueTracking.
     unsigned BitWidth = getTypeSizeInBits(U->getType());
     APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
-    computeKnownBits(U->getValue(), Zeros, Ones);
+    computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, AT, nullptr, DT);
     return Zeros.countTrailingOnes();
   }
 
@@ -3403,6 +3717,31 @@
   return 0;
 }
 
+/// GetRangeFromMetadata - Return the union of every (Lower, Upper) pair in the
+/// !range metadata attached to V, or None when V has no such metadata.
+static Optional<ConstantRange> GetRangeFromMetadata(Value *V) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return None;
+
+  MDNode *MD = I->getMetadata(LLVMContext::MD_range);
+  if (!MD)
+    return None;
+
+  // Start from the empty set; each operand pair is unioned into it below.
+  unsigned BitWidth = cast<IntegerType>(I->getType())->getBitWidth();
+  ConstantRange TotalRange(BitWidth, /*isFullSet=*/false);
+  unsigned NumRanges = MD->getNumOperands() / 2;
+  assert(NumRanges >= 1);
+  for (unsigned Idx = 0; Idx != NumRanges; ++Idx) {
+    ConstantInt *Lower = cast<ConstantInt>(MD->getOperand(2 * Idx));
+    ConstantInt *Upper = cast<ConstantInt>(MD->getOperand(2 * Idx + 1));
+    ConstantRange Range(Lower->getValue(), Upper->getValue());
+    TotalRange = TotalRange.unionWith(Range);
+  }
+  return TotalRange;
+}
+
 /// getUnsignedRange - Determine the unsigned range for a particular SCEV.
 ///
 ConstantRange
@@ -3532,9 +3871,14 @@
   }
 
   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    // Check if the IR explicitly contains !range metadata.
+    Optional<ConstantRange> MDRange = GetRangeFromMetadata(U->getValue());
+    if (MDRange.hasValue())
+      ConservativeResult = ConservativeResult.intersectWith(MDRange.getValue());
+
     // For a SCEVUnknown, ask ValueTracking.
     APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
-    computeKnownBits(U->getValue(), Zeros, Ones, DL);
+    computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, AT, nullptr, DT);
     if (Ones == ~Zeros + 1)
       return setUnsignedRange(U, ConservativeResult);
     return setUnsignedRange(U,
@@ -3683,10 +4027,15 @@
   }
 
   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    // Check if the IR explicitly contains !range metadata.
+    Optional<ConstantRange> MDRange = GetRangeFromMetadata(U->getValue());
+    if (MDRange.hasValue())
+      ConservativeResult = ConservativeResult.intersectWith(MDRange.getValue());
+
     // For a SCEVUnknown, ask ValueTracking.
     if (!U->getValue()->getType()->isIntegerTy() && !DL)
       return setSignedRange(U, ConservativeResult);
-    unsigned NS = ComputeNumSignBits(U->getValue(), DL);
+    unsigned NS = ComputeNumSignBits(U->getValue(), DL, 0, AT, nullptr, DT);
     if (NS <= 1)
       return setSignedRange(U, ConservativeResult);
     return setSignedRange(U, ConservativeResult.intersectWith(
@@ -3793,7 +4142,8 @@
       unsigned TZ = A.countTrailingZeros();
       unsigned BitWidth = A.getBitWidth();
       APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-      computeKnownBits(U->getOperand(0), KnownZero, KnownOne, DL);
+      computeKnownBits(U->getOperand(0), KnownZero, KnownOne, DL,
+                       0, AT, nullptr, DT);
 
       APInt EffectiveMask =
           APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ);
@@ -4070,6 +4420,14 @@
 //                   Iteration Count Computation Code
 //
 
+unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) {
+  BasicBlock *ExitingBB = L->getExitingBlock();
+  // No trip count information for multiple exits.
+  if (!ExitingBB)
+    return 0;
+  return getSmallConstantTripCount(L, ExitingBB);
+}
+
 /// getSmallConstantTripCount - Returns the maximum trip count of this loop as a
 /// normal unsigned value. Returns 0 if the trip count is unknown or not
 /// constant. Will also return 0 if the maximum trip count is very large (>=
@@ -4080,19 +4438,13 @@
 /// before taking the branch. For loops with multiple exits, it may not be the
 /// number times that the loop header executes because the loop may exit
 /// prematurely via another branch.
-///
-/// FIXME: We conservatively call getBackedgeTakenCount(L) instead of
-/// getExitCount(L, ExitingBlock) to compute a safe trip count considering all
-/// loop exits. getExitCount() may return an exact count for this branch
-/// assuming no-signed-wrap. The number of well-defined iterations may actually
-/// be higher than this trip count if this exit test is skipped and the loop
-/// exits via a different branch. Ideally, getExitCount() would know whether it
-/// depends on a NSW assumption, and we would only fall back to a conservative
-/// trip count in that case.
-unsigned ScalarEvolution::
-getSmallConstantTripCount(Loop *L, BasicBlock * /*ExitingBlock*/) {
+unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
+                                                    BasicBlock *ExitingBlock) {
+  assert(ExitingBlock && "Must pass a non-null exiting block!");
+  assert(L->isLoopExiting(ExitingBlock) &&
+         "Exiting block must actually branch out of the loop!");
   const SCEVConstant *ExitCount =
-    dyn_cast<SCEVConstant>(getBackedgeTakenCount(L));
+      dyn_cast<SCEVConstant>(getExitCount(L, ExitingBlock));
   if (!ExitCount)
     return 0;
 
@@ -4106,6 +4458,14 @@
   return ((unsigned)ExitConst->getZExtValue()) + 1;
 }
 
+unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) {
+  BasicBlock *ExitingBB = L->getExitingBlock();
+  // No trip multiple information for multiple exits.
+  if (!ExitingBB)
+    return 0;
+  return getSmallConstantTripMultiple(L, ExitingBB);
+}
+
 /// getSmallConstantTripMultiple - Returns the largest constant divisor of the
 /// trip count of this loop as a normal unsigned value, if possible. This
 /// means that the actual trip count is always a multiple of the returned
@@ -4118,9 +4478,13 @@
 ///
 /// As explained in the comments for getSmallConstantTripCount, this assumes
 /// that control exits the loop via ExitingBlock.
-unsigned ScalarEvolution::
-getSmallConstantTripMultiple(Loop *L, BasicBlock * /*ExitingBlock*/) {
-  const SCEV *ExitCount = getBackedgeTakenCount(L);
+unsigned
+ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
+                                              BasicBlock *ExitingBlock) {
+  assert(ExitingBlock && "Must pass a non-null exiting block!");
+  assert(L->isLoopExiting(ExitingBlock) &&
+         "Exiting block must actually branch out of the loop!");
+  const SCEV *ExitCount = getExitCount(L, ExitingBlock);
   if (ExitCount == getCouldNotCompute())
     return 1;
 
@@ -4230,7 +4594,8 @@
     SmallPtrSet<Instruction *, 8> Visited;
     while (!Worklist.empty()) {
       Instruction *I = Worklist.pop_back_val();
-      if (!Visited.insert(I)) continue;
+      if (!Visited.insert(I).second)
+        continue;
 
       ValueExprMapType::iterator It =
         ValueExprMap.find_as(static_cast<Value *>(I));
@@ -4282,7 +4647,8 @@
   SmallPtrSet<Instruction *, 8> Visited;
   while (!Worklist.empty()) {
     Instruction *I = Worklist.pop_back_val();
-    if (!Visited.insert(I)) continue;
+    if (!Visited.insert(I).second)
+      continue;
 
     ValueExprMapType::iterator It =
       ValueExprMap.find_as(static_cast<Value *>(I));
@@ -4316,7 +4682,8 @@
   SmallPtrSet<Instruction *, 8> Visited;
   while (!Worklist.empty()) {
     I = Worklist.pop_back_val();
-    if (!Visited.insert(I)) continue;
+    if (!Visited.insert(I).second)
+      continue;
 
     ValueExprMapType::iterator It =
       ValueExprMap.find_as(static_cast<Value *>(I));
@@ -4467,20 +4834,12 @@
     // non-exiting iterations. Partition the loop exits into two kinds:
     // LoopMustExits and LoopMayExits.
     //
-    // A LoopMustExit meets two requirements:
-    //
-    // (a) Its ExitLimit.MustExit flag must be set which indicates that the exit
-    // test condition cannot be skipped (the tested variable has unit stride or
-    // the test is less-than or greater-than, rather than a strict inequality).
-    //
-    // (b) It must dominate the loop latch, hence must be tested on every loop
-    // iteration.
-    //
-    // If any computable LoopMustExit is found, then MaxBECount is the minimum
-    // EL.Max of computable LoopMustExits. Otherwise, MaxBECount is
-    // conservatively the maximum EL.Max, where CouldNotCompute is considered
-    // greater than any computable EL.Max.
-    if (EL.MustExit && EL.Max != getCouldNotCompute() && Latch &&
+    // If the exit dominates the loop latch, it is a LoopMustExit; otherwise it
+    // is a LoopMayExit.  If any computable LoopMustExit is found, then
+    // MaxBECount is the minimum EL.Max of computable LoopMustExits. Otherwise,
+    // MaxBECount is conservatively the maximum EL.Max, where CouldNotCompute is
+    // considered greater than any computable EL.Max.
+    if (EL.Max != getCouldNotCompute() && Latch &&
         DT->dominates(ExitBB, Latch)) {
       if (!MustExitMaxBECount)
         MustExitMaxBECount = EL.Max;
@@ -4567,18 +4926,19 @@
       return getCouldNotCompute();
   }
 
+  bool IsOnlyExit = (L->getExitingBlock() != nullptr);
   TerminatorInst *Term = ExitingBlock->getTerminator();
   if (BranchInst *BI = dyn_cast<BranchInst>(Term)) {
     assert(BI->isConditional() && "If unconditional, it can't be in loop!");
     // Proceed to the next level to examine the exit condition expression.
     return ComputeExitLimitFromCond(L, BI->getCondition(), BI->getSuccessor(0),
                                     BI->getSuccessor(1),
-                                    /*IsSubExpr=*/false);
+                                    /*ControlsExit=*/IsOnlyExit);
   }
 
   if (SwitchInst *SI = dyn_cast<SwitchInst>(Term))
     return ComputeExitLimitFromSingleExitSwitch(L, SI, Exit,
-                                                /*IsSubExpr=*/false);
+                                                /*ControlsExit=*/IsOnlyExit);
 
   return getCouldNotCompute();
 }
@@ -4587,28 +4947,27 @@
 /// backedge of the specified loop will execute if its exit condition
 /// were a conditional branch of ExitCond, TBB, and FBB.
 ///
-/// @param IsSubExpr is true if ExitCond does not directly control the exit
-/// branch. In this case, we cannot assume that the loop only exits when the
-/// condition is true and cannot infer that failing to meet the condition prior
-/// to integer wraparound results in undefined behavior.
+/// @param ControlsExit is true if ExitCond directly controls the exit
+/// branch. In this case, we can assume that the loop exits only if the
+/// condition is true and can infer that failing to meet the condition prior to
+/// integer wraparound results in undefined behavior.
 ScalarEvolution::ExitLimit
 ScalarEvolution::ComputeExitLimitFromCond(const Loop *L,
                                           Value *ExitCond,
                                           BasicBlock *TBB,
                                           BasicBlock *FBB,
-                                          bool IsSubExpr) {
+                                          bool ControlsExit) {
   // Check if the controlling expression for this loop is an And or Or.
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) {
     if (BO->getOpcode() == Instruction::And) {
       // Recurse on the operands of the and.
       bool EitherMayExit = L->contains(TBB);
       ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB,
-                                               IsSubExpr || EitherMayExit);
+                                               ControlsExit && !EitherMayExit);
       ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB,
-                                               IsSubExpr || EitherMayExit);
+                                               ControlsExit && !EitherMayExit);
       const SCEV *BECount = getCouldNotCompute();
       const SCEV *MaxBECount = getCouldNotCompute();
-      bool MustExit = false;
       if (EitherMayExit) {
         // Both conditions must be true for the loop to continue executing.
         // Choose the less conservative count.
@@ -4623,7 +4982,6 @@
           MaxBECount = EL0.Max;
         else
           MaxBECount = getUMinFromMismatchedTypes(EL0.Max, EL1.Max);
-        MustExit = EL0.MustExit || EL1.MustExit;
       } else {
         // Both conditions must be true at the same time for the loop to exit.
         // For now, be conservative.
@@ -4632,21 +4990,19 @@
           MaxBECount = EL0.Max;
         if (EL0.Exact == EL1.Exact)
           BECount = EL0.Exact;
-        MustExit = EL0.MustExit && EL1.MustExit;
       }
 
-      return ExitLimit(BECount, MaxBECount, MustExit);
+      return ExitLimit(BECount, MaxBECount);
     }
     if (BO->getOpcode() == Instruction::Or) {
       // Recurse on the operands of the or.
       bool EitherMayExit = L->contains(FBB);
       ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB,
-                                               IsSubExpr || EitherMayExit);
+                                               ControlsExit && !EitherMayExit);
       ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB,
-                                               IsSubExpr || EitherMayExit);
+                                               ControlsExit && !EitherMayExit);
       const SCEV *BECount = getCouldNotCompute();
       const SCEV *MaxBECount = getCouldNotCompute();
-      bool MustExit = false;
       if (EitherMayExit) {
         // Both conditions must be false for the loop to continue executing.
         // Choose the less conservative count.
@@ -4661,7 +5017,6 @@
           MaxBECount = EL0.Max;
         else
           MaxBECount = getUMinFromMismatchedTypes(EL0.Max, EL1.Max);
-        MustExit = EL0.MustExit || EL1.MustExit;
       } else {
         // Both conditions must be false at the same time for the loop to exit.
         // For now, be conservative.
@@ -4670,17 +5025,16 @@
           MaxBECount = EL0.Max;
         if (EL0.Exact == EL1.Exact)
           BECount = EL0.Exact;
-        MustExit = EL0.MustExit && EL1.MustExit;
       }
 
-      return ExitLimit(BECount, MaxBECount, MustExit);
+      return ExitLimit(BECount, MaxBECount);
     }
   }
 
   // With an icmp, it may be feasible to compute an exact backedge-taken count.
   // Proceed to the next level to examine the icmp.
   if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond))
-    return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, IsSubExpr);
+    return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, ControlsExit);
 
   // Check for a constant condition. These are normally stripped out by
   // SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to
@@ -4707,7 +5061,7 @@
                                           ICmpInst *ExitCond,
                                           BasicBlock *TBB,
                                           BasicBlock *FBB,
-                                          bool IsSubExpr) {
+                                          bool ControlsExit) {
 
   // If the condition was exit on true, convert the condition to exit on false
   ICmpInst::Predicate Cond;
@@ -4759,7 +5113,7 @@
   switch (Cond) {
   case ICmpInst::ICMP_NE: {                     // while (X != Y)
     // Convert to: while (X-Y != 0)
-    ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L, IsSubExpr);
+    ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit);
     if (EL.hasAnyInfo()) return EL;
     break;
   }
@@ -4772,14 +5126,14 @@
   case ICmpInst::ICMP_SLT:
   case ICmpInst::ICMP_ULT: {                    // while (X < Y)
     bool IsSigned = Cond == ICmpInst::ICMP_SLT;
-    ExitLimit EL = HowManyLessThans(LHS, RHS, L, IsSigned, IsSubExpr);
+    ExitLimit EL = HowManyLessThans(LHS, RHS, L, IsSigned, ControlsExit);
     if (EL.hasAnyInfo()) return EL;
     break;
   }
   case ICmpInst::ICMP_SGT:
   case ICmpInst::ICMP_UGT: {                    // while (X > Y)
     bool IsSigned = Cond == ICmpInst::ICMP_SGT;
-    ExitLimit EL = HowManyGreaterThans(LHS, RHS, L, IsSigned, IsSubExpr);
+    ExitLimit EL = HowManyGreaterThans(LHS, RHS, L, IsSigned, ControlsExit);
     if (EL.hasAnyInfo()) return EL;
     break;
   }
@@ -4801,7 +5155,7 @@
 ScalarEvolution::ComputeExitLimitFromSingleExitSwitch(const Loop *L,
                                                       SwitchInst *Switch,
                                                       BasicBlock *ExitingBlock,
-                                                      bool IsSubExpr) {
+                                                      bool ControlsExit) {
   assert(!L->contains(ExitingBlock) && "Not an exiting block!");
 
   // Give up if the exit is the default dest of a switch.
@@ -4814,7 +5168,7 @@
   const SCEV *RHS = getConstant(Switch->findCaseDest(ExitingBlock));
 
   // while (X != Y) --> while (X-Y != 0)
-  ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L, IsSubExpr);
+  ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L, ControlsExit);
   if (EL.hasAnyInfo())
     return EL;
 
@@ -5687,7 +6041,7 @@
 /// effectively V != 0.  We know and take advantage of the fact that this
 /// expression only being used in a comparison by zero context.
 ScalarEvolution::ExitLimit
-ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool IsSubExpr) {
+ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool ControlsExit) {
   // If the value is a constant
   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
     // If the value is already zero, the branch will execute zero times.
@@ -5781,37 +6135,30 @@
     else
       MaxBECount = getConstant(CountDown ? CR.getUnsignedMax()
                                          : -CR.getUnsignedMin());
-    return ExitLimit(Distance, MaxBECount, /*MustExit=*/true);
+    return ExitLimit(Distance, MaxBECount);
   }
 
-  // If the recurrence is known not to wraparound, unsigned divide computes the
-  // back edge count. (Ideally we would have an "isexact" bit for udiv). We know
-  // that the value will either become zero (and thus the loop terminates), that
-  // the loop will terminate through some other exit condition first, or that
-  // the loop has undefined behavior.  This means we can't "miss" the exit
-  // value, even with nonunit stride, and exit later via the same branch. Note
-  // that we can skip this exit if loop later exits via a different
-  // branch. Hence MustExit=false.
-  //
-  // This is only valid for expressions that directly compute the loop exit. It
-  // is invalid for subexpressions in which the loop may exit through this
-  // branch even if this subexpression is false. In that case, the trip count
-  // computed by this udiv could be smaller than the number of well-defined
-  // iterations.
-  if (!IsSubExpr && AddRec->getNoWrapFlags(SCEV::FlagNW)) {
+  // If the step exactly divides the distance then unsigned divide computes the
+  // backedge count.
+  const SCEV *Q, *R;
+  ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
+  SCEVUDivision::divide(SE, Distance, Step, &Q, &R);
+  if (R->isZero()) {
     const SCEV *Exact =
-      getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
-    return ExitLimit(Exact, Exact, /*MustExit=*/false);
+        getUDivExactExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
+    return ExitLimit(Exact, Exact);
   }
 
-  // If Step is a power of two that evenly divides Start we know that the loop
-  // will always terminate.  Start may not be a constant so we just have the
-  // number of trailing zeros available.  This is safe even in presence of
-  // overflow as the recurrence will overflow to exactly 0.
-  const APInt &StepV = StepC->getValue()->getValue();
-  if (StepV.isPowerOf2() &&
-      GetMinTrailingZeros(getNegativeSCEV(Start)) >= StepV.countTrailingZeros())
-    return getUDivExactExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
+  // If the condition controls loop exit (the loop exits only if the expression
+  // is true) and the addition is no-wrap we can use unsigned divide to
+  // compute the backedge count.  In this case, the step may not divide the
+  // distance, but we don't care because if the condition is "missed" the loop
+  // will have undefined behavior due to wrapping.
+  if (ControlsExit && AddRec->getNoWrapFlags(SCEV::FlagNW)) {
+    const SCEV *Exact =
+        getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
+    return ExitLimit(Exact, Exact);
+  }
 
   // Then, try to solve the above equation provided that Start is constant.
   if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start))
@@ -6309,19 +6656,30 @@
   // (interprocedural conditions notwithstanding).
   if (!L) return true;
 
+  if (isKnownPredicateWithRanges(Pred, LHS, RHS)) return true;
+
   BasicBlock *Latch = L->getLoopLatch();
   if (!Latch)
     return false;
 
   BranchInst *LoopContinuePredicate =
     dyn_cast<BranchInst>(Latch->getTerminator());
-  if (!LoopContinuePredicate ||
-      LoopContinuePredicate->isUnconditional())
-    return false;
+  if (LoopContinuePredicate && LoopContinuePredicate->isConditional() &&
+      isImpliedCond(Pred, LHS, RHS,
+                    LoopContinuePredicate->getCondition(),
+                    LoopContinuePredicate->getSuccessor(0) != L->getHeader()))
+    return true;
 
-  return isImpliedCond(Pred, LHS, RHS,
-                       LoopContinuePredicate->getCondition(),
-                       LoopContinuePredicate->getSuccessor(0) != L->getHeader());
+  // Check conditions due to any @llvm.assume intrinsics.
+  for (auto &CI : AT->assumptions(F)) {
+    if (!DT->dominates(CI, Latch->getTerminator()))
+      continue;
+
+    if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false))
+      return true;
+  }
+
+  return false;
 }
 
 /// isLoopEntryGuardedByCond - Test whether entry to the loop is protected
@@ -6335,6 +6693,8 @@
   // (interprocedural conditions notwithstanding).
   if (!L) return false;
 
+  if (isKnownPredicateWithRanges(Pred, LHS, RHS)) return true;
+
   // Starting at the loop predecessor, climb up the predecessor chain, as long
   // as there are predecessors that can be found that have unique successors
   // leading to the original header.
@@ -6355,6 +6715,15 @@
       return true;
   }
 
+  // Check conditions due to any @llvm.assume intrinsics.
+  for (auto &CI : AT->assumptions(F)) {
+    if (!DT->dominates(CI, L->getHeader()))
+      continue;
+
+    if (isImpliedCond(Pred, LHS, RHS, CI->getArgOperand(0), false))
+      return true;
+  }
+
   return false;
 }
 
@@ -6469,6 +6838,66 @@
                                    RHS, LHS, FoundLHS, FoundRHS);
   }
 
+  // Check if we can make progress by sharpening ranges.
+  if (FoundPred == ICmpInst::ICMP_NE &&
+      (isa<SCEVConstant>(FoundLHS) || isa<SCEVConstant>(FoundRHS))) {
+
+    const SCEVConstant *C = nullptr;
+    const SCEV *V = nullptr;
+
+    if (isa<SCEVConstant>(FoundLHS)) {
+      C = cast<SCEVConstant>(FoundLHS);
+      V = FoundRHS;
+    } else {
+      C = cast<SCEVConstant>(FoundRHS);
+      V = FoundLHS;
+    }
+
+    // The guarding predicate tells us that C != V. If the known range
+    // of V is [C, t), we can sharpen the range to [C + 1, t).  The
+    // range we consider has to correspond to same signedness as the
+    // predicate we're interested in folding.
+
+    APInt Min = ICmpInst::isSigned(Pred) ?
+        getSignedRange(V).getSignedMin() : getUnsignedRange(V).getUnsignedMin();
+
+    if (Min == C->getValue()->getValue()) {
+      // Given (V >= Min && V != Min) we conclude V >= (Min + 1).
+      // This is true even if (Min + 1) wraps around -- in case of
+      // wraparound, (Min + 1) < Min, so (V >= Min => V >= (Min + 1)).
+
+      APInt SharperMin = Min + 1;
+
+      switch (Pred) {
+        case ICmpInst::ICMP_SGE:
+        case ICmpInst::ICMP_UGE:
+          // We know V `Pred` SharperMin.  If this implies LHS `Pred`
+          // RHS, we're done.
+          if (isImpliedCondOperands(Pred, LHS, RHS, V,
+                                    getConstant(SharperMin)))
+            return true;
+
+        case ICmpInst::ICMP_SGT:
+        case ICmpInst::ICMP_UGT:
+          // We know from the range information that (V `Pred` Min ||
+          // V == Min).  We know from the guarding condition that !(V
+          // == Min).  This gives us
+          //
+          //       V `Pred` Min || V == Min && !(V == Min)
+          //   =>  V `Pred` Min
+          //
+          // If V `Pred` Min implies LHS `Pred` RHS, we're done.
+
+          if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min)))
+            return true;
+
+        default:
+          // No change
+          break;
+      }
+    }
+  }
+
   // Check whether the actual condition is beyond sufficient.
   if (FoundPred == ICmpInst::ICMP_EQ)
     if (ICmpInst::isTrueWhenEqual(Pred))
@@ -6614,13 +7043,13 @@
 /// specified less-than comparison will execute.  If not computable, return
 /// CouldNotCompute.
 ///
-/// @param IsSubExpr is true when the LHS < RHS condition does not directly
-/// control the branch. In this case, we can only compute an iteration count for
-/// a subexpression that cannot overflow before evaluating true.
+/// @param ControlsExit is true when the LHS < RHS condition directly controls
+/// the branch (loop exits only if condition is true). In this case, we can use
+/// NoWrapFlags to skip overflow checks.
 ScalarEvolution::ExitLimit
 ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
                                   const Loop *L, bool IsSigned,
-                                  bool IsSubExpr) {
+                                  bool ControlsExit) {
   // We handle only IV < Invariant
   if (!isLoopInvariant(RHS, L))
     return getCouldNotCompute();
@@ -6631,7 +7060,7 @@
   if (!IV || IV->getLoop() != L || !IV->isAffine())
     return getCouldNotCompute();
 
-  bool NoWrap = !IsSubExpr &&
+  bool NoWrap = ControlsExit &&
                 IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
 
   const SCEV *Stride = IV->getStepRecurrence(*this);
@@ -6651,9 +7080,19 @@
                                       : ICmpInst::ICMP_ULT;
   const SCEV *Start = IV->getStart();
   const SCEV *End = RHS;
-  if (!isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(Start, Stride), RHS))
-    End = IsSigned ? getSMaxExpr(RHS, Start)
-                   : getUMaxExpr(RHS, Start);
+  if (!isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(Start, Stride), RHS)) {
+    const SCEV *Diff = getMinusSCEV(RHS, Start);
+    // If we have NoWrap set, then we can assume that the increment won't
+    // overflow, in which case if RHS - Start is a constant, we don't need to
+    // do a max operation since we can just figure it out statically
+    if (NoWrap && isa<SCEVConstant>(Diff)) {
+      APInt D = dyn_cast<const SCEVConstant>(Diff)->getValue()->getValue();
+      if (D.isNegative())
+        End = Start;
+    } else
+      End = IsSigned ? getSMaxExpr(RHS, Start)
+                     : getUMaxExpr(RHS, Start);
+  }
 
   const SCEV *BECount = computeBECount(getMinusSCEV(End, Start), Stride, false);
 
@@ -6684,13 +7123,13 @@
   if (isa<SCEVCouldNotCompute>(MaxBECount))
     MaxBECount = BECount;
 
-  return ExitLimit(BECount, MaxBECount, /*MustExit=*/true);
+  return ExitLimit(BECount, MaxBECount);
 }
 
 ScalarEvolution::ExitLimit
 ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
                                      const Loop *L, bool IsSigned,
-                                     bool IsSubExpr) {
+                                     bool ControlsExit) {
   // We handle only IV > Invariant
   if (!isLoopInvariant(RHS, L))
     return getCouldNotCompute();
@@ -6701,7 +7140,7 @@
   if (!IV || IV->getLoop() != L || !IV->isAffine())
     return getCouldNotCompute();
 
-  bool NoWrap = !IsSubExpr &&
+  bool NoWrap = ControlsExit &&
                 IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
 
   const SCEV *Stride = getNegativeSCEV(IV->getStepRecurrence(*this));
@@ -6722,9 +7161,19 @@
 
   const SCEV *Start = IV->getStart();
   const SCEV *End = RHS;
-  if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS))
-    End = IsSigned ? getSMinExpr(RHS, Start)
-                   : getUMinExpr(RHS, Start);
+  if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS)) {
+    const SCEV *Diff = getMinusSCEV(RHS, Start);
+    // If we have NoWrap set, then we can assume that the increment won't
+    // overflow, in which case if RHS - Start is a constant, we don't need to
+    // do a min operation since we can just figure it out statically
+    if (NoWrap && isa<SCEVConstant>(Diff)) {
+      APInt D = dyn_cast<const SCEVConstant>(Diff)->getValue()->getValue();
+      if (!D.isNegative())
+        End = Start;
+    } else
+      End = IsSigned ? getSMinExpr(RHS, Start)
+                     : getUMinExpr(RHS, Start);
+  }
 
   const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false);
 
@@ -6756,7 +7205,7 @@
   if (isa<SCEVCouldNotCompute>(MaxBECount))
     MaxBECount = BECount;
 
-  return ExitLimit(BECount, MaxBECount, /*MustExit=*/true);
+  return ExitLimit(BECount, MaxBECount);
 }
 
 /// getNumIterationsInRange - Return the number of iterations of this loop that
@@ -6984,268 +7433,6 @@
     });
 }
 
-static const APInt srem(const SCEVConstant *C1, const SCEVConstant *C2) {
-  APInt A = C1->getValue()->getValue();
-  APInt B = C2->getValue()->getValue();
-  uint32_t ABW = A.getBitWidth();
-  uint32_t BBW = B.getBitWidth();
-
-  if (ABW > BBW)
-    B = B.sext(ABW);
-  else if (ABW < BBW)
-    A = A.sext(BBW);
-
-  return APIntOps::srem(A, B);
-}
-
-static const APInt sdiv(const SCEVConstant *C1, const SCEVConstant *C2) {
-  APInt A = C1->getValue()->getValue();
-  APInt B = C2->getValue()->getValue();
-  uint32_t ABW = A.getBitWidth();
-  uint32_t BBW = B.getBitWidth();
-
-  if (ABW > BBW)
-    B = B.sext(ABW);
-  else if (ABW < BBW)
-    A = A.sext(BBW);
-
-  return APIntOps::sdiv(A, B);
-}
-
-namespace {
-struct FindSCEVSize {
-  int Size;
-  FindSCEVSize() : Size(0) {}
-
-  bool follow(const SCEV *S) {
-    ++Size;
-    // Keep looking at all operands of S.
-    return true;
-  }
-  bool isDone() const {
-    return false;
-  }
-};
-}
-
-// Returns the size of the SCEV S.
-static inline int sizeOfSCEV(const SCEV *S) {
-  FindSCEVSize F;
-  SCEVTraversal<FindSCEVSize> ST(F);
-  ST.visitAll(S);
-  return F.Size;
-}
-
-namespace {
-
-struct SCEVDivision : public SCEVVisitor<SCEVDivision, void> {
-public:
-  // Computes the Quotient and Remainder of the division of Numerator by
-  // Denominator.
-  static void divide(ScalarEvolution &SE, const SCEV *Numerator,
-                     const SCEV *Denominator, const SCEV **Quotient,
-                     const SCEV **Remainder) {
-    assert(Numerator && Denominator && "Uninitialized SCEV");
-
-    SCEVDivision D(SE, Numerator, Denominator);
-
-    // Check for the trivial case here to avoid having to check for it in the
-    // rest of the code.
-    if (Numerator == Denominator) {
-      *Quotient = D.One;
-      *Remainder = D.Zero;
-      return;
-    }
-
-    if (Numerator->isZero()) {
-      *Quotient = D.Zero;
-      *Remainder = D.Zero;
-      return;
-    }
-
-    // Split the Denominator when it is a product.
-    if (const SCEVMulExpr *T = dyn_cast<const SCEVMulExpr>(Denominator)) {
-      const SCEV *Q, *R;
-      *Quotient = Numerator;
-      for (const SCEV *Op : T->operands()) {
-        divide(SE, *Quotient, Op, &Q, &R);
-        *Quotient = Q;
-
-        // Bail out when the Numerator is not divisible by one of the terms of
-        // the Denominator.
-        if (!R->isZero()) {
-          *Quotient = D.Zero;
-          *Remainder = Numerator;
-          return;
-        }
-      }
-      *Remainder = D.Zero;
-      return;
-    }
-
-    D.visit(Numerator);
-    *Quotient = D.Quotient;
-    *Remainder = D.Remainder;
-  }
-
-  SCEVDivision(ScalarEvolution &S, const SCEV *Numerator, const SCEV *Denominator)
-      : SE(S), Denominator(Denominator) {
-    Zero = SE.getConstant(Denominator->getType(), 0);
-    One = SE.getConstant(Denominator->getType(), 1);
-
-    // By default, we don't know how to divide Expr by Denominator.
-    // Providing the default here simplifies the rest of the code.
-    Quotient = Zero;
-    Remainder = Numerator;
-  }
-
-  // Except in the trivial case described above, we do not know how to divide
-  // Expr by Denominator for the following functions with empty implementation.
-  void visitTruncateExpr(const SCEVTruncateExpr *Numerator) {}
-  void visitZeroExtendExpr(const SCEVZeroExtendExpr *Numerator) {}
-  void visitSignExtendExpr(const SCEVSignExtendExpr *Numerator) {}
-  void visitUDivExpr(const SCEVUDivExpr *Numerator) {}
-  void visitSMaxExpr(const SCEVSMaxExpr *Numerator) {}
-  void visitUMaxExpr(const SCEVUMaxExpr *Numerator) {}
-  void visitUnknown(const SCEVUnknown *Numerator) {}
-  void visitCouldNotCompute(const SCEVCouldNotCompute *Numerator) {}
-
-  void visitConstant(const SCEVConstant *Numerator) {
-    if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) {
-      Quotient = SE.getConstant(sdiv(Numerator, D));
-      Remainder = SE.getConstant(srem(Numerator, D));
-      return;
-    }
-  }
-
-  void visitAddRecExpr(const SCEVAddRecExpr *Numerator) {
-    const SCEV *StartQ, *StartR, *StepQ, *StepR;
-    assert(Numerator->isAffine() && "Numerator should be affine");
-    divide(SE, Numerator->getStart(), Denominator, &StartQ, &StartR);
-    divide(SE, Numerator->getStepRecurrence(SE), Denominator, &StepQ, &StepR);
-    Quotient = SE.getAddRecExpr(StartQ, StepQ, Numerator->getLoop(),
-                                Numerator->getNoWrapFlags());
-    Remainder = SE.getAddRecExpr(StartR, StepR, Numerator->getLoop(),
-                                 Numerator->getNoWrapFlags());
-  }
-
-  void visitAddExpr(const SCEVAddExpr *Numerator) {
-    SmallVector<const SCEV *, 2> Qs, Rs;
-    Type *Ty = Denominator->getType();
-
-    for (const SCEV *Op : Numerator->operands()) {
-      const SCEV *Q, *R;
-      divide(SE, Op, Denominator, &Q, &R);
-
-      // Bail out if types do not match.
-      if (Ty != Q->getType() || Ty != R->getType()) {
-        Quotient = Zero;
-        Remainder = Numerator;
-        return;
-      }
-
-      Qs.push_back(Q);
-      Rs.push_back(R);
-    }
-
-    if (Qs.size() == 1) {
-      Quotient = Qs[0];
-      Remainder = Rs[0];
-      return;
-    }
-
-    Quotient = SE.getAddExpr(Qs);
-    Remainder = SE.getAddExpr(Rs);
-  }
-
-  void visitMulExpr(const SCEVMulExpr *Numerator) {
-    SmallVector<const SCEV *, 2> Qs;
-    Type *Ty = Denominator->getType();
-
-    bool FoundDenominatorTerm = false;
-    for (const SCEV *Op : Numerator->operands()) {
-      // Bail out if types do not match.
-      if (Ty != Op->getType()) {
-        Quotient = Zero;
-        Remainder = Numerator;
-        return;
-      }
-
-      if (FoundDenominatorTerm) {
-        Qs.push_back(Op);
-        continue;
-      }
-
-      // Check whether Denominator divides one of the product operands.
-      const SCEV *Q, *R;
-      divide(SE, Op, Denominator, &Q, &R);
-      if (!R->isZero()) {
-        Qs.push_back(Op);
-        continue;
-      }
-
-      // Bail out if types do not match.
-      if (Ty != Q->getType()) {
-        Quotient = Zero;
-        Remainder = Numerator;
-        return;
-      }
-
-      FoundDenominatorTerm = true;
-      Qs.push_back(Q);
-    }
-
-    if (FoundDenominatorTerm) {
-      Remainder = Zero;
-      if (Qs.size() == 1)
-        Quotient = Qs[0];
-      else
-        Quotient = SE.getMulExpr(Qs);
-      return;
-    }
-
-    if (!isa<SCEVUnknown>(Denominator)) {
-      Quotient = Zero;
-      Remainder = Numerator;
-      return;
-    }
-
-    // The Remainder is obtained by replacing Denominator by 0 in Numerator.
-    ValueToValueMap RewriteMap;
-    RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
-        cast<SCEVConstant>(Zero)->getValue();
-    Remainder = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
-
-    if (Remainder->isZero()) {
-      // The Quotient is obtained by replacing Denominator by 1 in Numerator.
-      RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
-          cast<SCEVConstant>(One)->getValue();
-      Quotient =
-          SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
-      return;
-    }
-
-    // Quotient is (Numerator - Remainder) divided by Denominator.
-    const SCEV *Q, *R;
-    const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder);
-    if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator)) {
-      // This SCEV does not seem to simplify: fail the division here.
-      Quotient = Zero;
-      Remainder = Numerator;
-      return;
-    }
-    divide(SE, Diff, Denominator, &Q, &R);
-    assert(R == Zero &&
-           "(Numerator - Remainder) should evenly divide Denominator");
-    Quotient = Q;
-  }
-
-private:
-  ScalarEvolution &SE;
-  const SCEV *Denominator, *Quotient, *Remainder, *Zero, *One;
-};
-}
-
 static bool findArrayDimensionsRec(ScalarEvolution &SE,
                                    SmallVectorImpl<const SCEV *> &Terms,
                                    SmallVectorImpl<const SCEV *> &Sizes) {
@@ -7270,7 +7457,7 @@
   for (const SCEV *&Term : Terms) {
     // Normalize the terms before the next call to findArrayDimensionsRec.
     const SCEV *Q, *R;
-    SCEVDivision::divide(SE, Term, Step, &Q, &R);
+    SCEVSDivision::divide(SE, Term, Step, &Q, &R);
 
     // Bail out when GCD does not evenly divide one of the terms.
     if (!R->isZero())
@@ -7407,7 +7594,7 @@
   // Divide all terms by the element size.
   for (const SCEV *&Term : Terms) {
     const SCEV *Q, *R;
-    SCEVDivision::divide(SE, Term, ElementSize, &Q, &R);
+    SCEVSDivision::divide(SE, Term, ElementSize, &Q, &R);
     Term = Q;
   }
 
@@ -7454,7 +7641,7 @@
   int Last = Sizes.size() - 1;
   for (int i = Last; i >= 0; i--) {
     const SCEV *Q, *R;
-    SCEVDivision::divide(SE, Res, Sizes[i], &Q, &R);
+    SCEVSDivision::divide(SE, Res, Sizes[i], &Q, &R);
 
     DEBUG({
         dbgs() << "Res: " << *Res << "\n";
@@ -7609,7 +7796,7 @@
     // that until everything else is done.
     if (U == Old)
       continue;
-    if (!Visited.insert(U))
+    if (!Visited.insert(U).second)
       continue;
     if (PHINode *PN = dyn_cast<PHINode>(U))
       SE->ConstantEvolutionLoopExitValue.erase(PN);
@@ -7638,6 +7825,7 @@
 
 bool ScalarEvolution::runOnFunction(Function &F) {
   this->F = &F;
+  AT = &getAnalysis<AssumptionTracker>();
   LI = &getAnalysis<LoopInfo>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
@@ -7678,6 +7866,7 @@
 
 void ScalarEvolution::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
+  AU.addRequired<AssumptionTracker>();
   AU.addRequiredTransitive<LoopInfo>();
   AU.addRequiredTransitive<DominatorTreeWrapperPass>();
   AU.addRequired<TargetLibraryInfo>();

diff --git a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
index 6933f74..5c339ee 100644
--- a/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
+++ b/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp

@@ -162,10 +162,10 @@
   if ((AO && AO != LocA.Ptr) || (BO && BO != LocB.Ptr))
     if (alias(Location(AO ? AO : LocA.Ptr,
                        AO ? +UnknownSize : LocA.Size,
-                       AO ? nullptr : LocA.TBAATag),
+                       AO ? AAMDNodes() : LocA.AATags),
               Location(BO ? BO : LocB.Ptr,
                        BO ? +UnknownSize : LocB.Size,
-                       BO ? nullptr : LocB.TBAATag)) == NoAlias)
+                       BO ? AAMDNodes() : LocB.AATags)) == NoAlias)
       return NoAlias;
 
   // Forward the query to the next analysis.

diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index 8c75b0d..bee3685 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp

@@ -1443,8 +1443,12 @@
     Constant *One = ConstantInt::get(Ty, 1);
     for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
       BasicBlock *HP = *HPI;
-      if (!PredSeen.insert(HP))
+      if (!PredSeen.insert(HP).second) {
+        // There must be an incoming value for each predecessor, even the
+        // duplicates!
+        CanonicalIV->addIncoming(CanonicalIV->getIncomingValueForBlock(HP), HP);
         continue;
+      }
 
       if (L->contains(HP)) {
         // Insert a unit add instruction right before the terminator
@@ -1707,7 +1711,7 @@
 
     // Fold constant phis. They may be congruent to other constant phis and
     // would confuse the logic below that expects proper IVs.
-    if (Value *V = SimplifyInstruction(Phi, SE.DL, SE.TLI, SE.DT)) {
+    if (Value *V = SimplifyInstruction(Phi, SE.DL, SE.TLI, SE.DT, SE.AT)) {
       Phi->replaceAllUsesWith(V);
       DeadInsts.push_back(Phi);
       ++NumElim;

diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp
index 3ccefb0..b238fe4 100644
--- a/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/lib/Analysis/ScalarEvolutionNormalization.cpp

@@ -126,7 +126,7 @@
       // Normalized form:   {-2,+,1,+,2}
       // Denormalized form: {1,+,3,+,2}
       //
-      // However, denormalization would use the a different step expression than
+      // However, denormalization would use a different step expression than
       // normalization (see getPostIncExpr), generating the wrong final
       // expression: {-2,+,1,+,2} + {1,+,2} => {-1,+,3,+,2}
       if (AR->isAffine() &&

diff --git a/lib/Analysis/ScopedNoAliasAA.cpp b/lib/Analysis/ScopedNoAliasAA.cpp
new file mode 100644
index 0000000..f6c300a
--- /dev/null
+++ b/lib/Analysis/ScopedNoAliasAA.cpp

@@ -0,0 +1,245 @@
+//===- ScopedNoAliasAA.cpp - Scoped No-Alias Alias Analysis ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ScopedNoAlias alias-analysis pass, which implements
+// metadata-based scoped no-alias support.
+//
+// Alias-analysis scopes are defined by an id (which can be a string or some
+// other metadata node), a domain node, and an optional descriptive string.
+// A domain is defined by an id (which can be a string or some other metadata
+// node), and an optional descriptive string.
+//
+// !dom0 =   metadata !{ metadata !"domain of foo()" }
+// !scope1 = metadata !{ metadata !scope1, metadata !dom0, metadata !"scope 1" }
+// !scope2 = metadata !{ metadata !scope2, metadata !dom0, metadata !"scope 2" }
+//
+// Loads and stores can be tagged with an alias-analysis scope, and also, with
+// a noalias tag for a specific scope:
+//
+// ... = load %ptr1, !alias.scope !{ !scope1 }
+// ... = load %ptr2, !alias.scope !{ !scope1, !scope2 }, !noalias !{ !scope1 }
+//
+// When evaluating an aliasing query, if one of the instructions has a set of
+// noalias scopes in some domain that is a superset of the alias scopes in
+// that domain of some other instruction, then the two memory accesses are
+// assumed not to alias.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+// A handy option for disabling scoped no-alias functionality. The same effect
+// can also be achieved by stripping the associated metadata tags from IR, but
+// this option is sometimes more convenient.
+static cl::opt<bool>
+EnableScopedNoAlias("enable-scoped-noalias", cl::init(true));
+
+namespace {
+/// AliasScopeNode - This is a simple wrapper around an MDNode which provides
+/// a higher-level interface by hiding the details of how alias analysis
+/// information is encoded in its operands.
+class AliasScopeNode {
+  const MDNode *Node;
+
+public:
+  AliasScopeNode() : Node(0) {}
+  explicit AliasScopeNode(const MDNode *N) : Node(N) {}
+
+  /// getNode - Get the MDNode for this AliasScopeNode.
+  const MDNode *getNode() const { return Node; }
+
+  /// getDomain - Get the MDNode for this AliasScopeNode's domain.
+  const MDNode *getDomain() const {
+    if (Node->getNumOperands() < 2)
+      return nullptr;
+    return dyn_cast_or_null<MDNode>(Node->getOperand(1));
+  }
+};
+
+/// ScopedNoAliasAA - This is a simple alias analysis
+/// implementation that uses scoped-noalias metadata to answer queries.
+class ScopedNoAliasAA : public ImmutablePass, public AliasAnalysis {
+public:
+  static char ID; // Class identification, replacement for typeinfo
+  ScopedNoAliasAA() : ImmutablePass(ID) {
+    initializeScopedNoAliasAAPass(*PassRegistry::getPassRegistry());
+  }
+
+  void initializePass() override { InitializeAliasAnalysis(this); }
+
+  /// getAdjustedAnalysisPointer - This method is used when a pass implements
+  /// an analysis interface through multiple inheritance.  If needed, it
+  /// should override this to adjust the this pointer as needed for the
+  /// specified pass info.
+  void *getAdjustedAnalysisPointer(const void *PI) override {
+    if (PI == &AliasAnalysis::ID)
+      return (AliasAnalysis*)this;
+    return this;
+  }
+
+protected:
+  bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias) const;
+  void collectMDInDomain(const MDNode *List, const MDNode *Domain,
+                         SmallPtrSetImpl<const MDNode *> &Nodes) const;
+
+private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  AliasResult alias(const Location &LocA, const Location &LocB) override;
+  bool pointsToConstantMemory(const Location &Loc, bool OrLocal) override;
+  ModRefBehavior getModRefBehavior(ImmutableCallSite CS) override;
+  ModRefBehavior getModRefBehavior(const Function *F) override;
+  ModRefResult getModRefInfo(ImmutableCallSite CS,
+                             const Location &Loc) override;
+  ModRefResult getModRefInfo(ImmutableCallSite CS1,
+                             ImmutableCallSite CS2) override;
+};
+}  // End of anonymous namespace
+
+// Register this pass...
+char ScopedNoAliasAA::ID = 0;
+INITIALIZE_AG_PASS(ScopedNoAliasAA, AliasAnalysis, "scoped-noalias",
+                   "Scoped NoAlias Alias Analysis", false, true, false)
+
+ImmutablePass *llvm::createScopedNoAliasAAPass() {
+  return new ScopedNoAliasAA();
+}
+
+void
+ScopedNoAliasAA::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AliasAnalysis::getAnalysisUsage(AU);
+}
+
+void
+ScopedNoAliasAA::collectMDInDomain(const MDNode *List, const MDNode *Domain,
+                   SmallPtrSetImpl<const MDNode *> &Nodes) const {
+  for (unsigned i = 0, ie = List->getNumOperands(); i != ie; ++i)
+    if (const MDNode *MD = dyn_cast<MDNode>(List->getOperand(i)))
+      if (AliasScopeNode(MD).getDomain() == Domain)
+        Nodes.insert(MD);
+}
+
+bool
+ScopedNoAliasAA::mayAliasInScopes(const MDNode *Scopes,
+                                  const MDNode *NoAlias) const {
+  if (!Scopes || !NoAlias)
+    return true;
+
+  // Collect the set of scope domains relevant to the noalias scopes.
+  SmallPtrSet<const MDNode *, 16> Domains;
+  for (unsigned i = 0, ie = NoAlias->getNumOperands(); i != ie; ++i)
+    if (const MDNode *NAMD = dyn_cast<MDNode>(NoAlias->getOperand(i)))
+      if (const MDNode *Domain = AliasScopeNode(NAMD).getDomain())
+        Domains.insert(Domain);
+
+  // We alias unless, for some domain, the set of noalias scopes in that domain
+  // is a superset of the set of alias scopes in that domain.
+  for (const MDNode *Domain : Domains) {
+    SmallPtrSet<const MDNode *, 16> NANodes, ScopeNodes;
+    collectMDInDomain(NoAlias, Domain, NANodes);
+    collectMDInDomain(Scopes, Domain, ScopeNodes);
+    if (!ScopeNodes.size())
+      continue;
+
+    // To not alias, all of the nodes in ScopeNodes must be in NANodes.
+    bool FoundAll = true;
+    for (const MDNode *SMD : ScopeNodes)
+      if (!NANodes.count(SMD)) {
+        FoundAll = false;
+        break;
+      }
+
+    if (FoundAll)
+      return false;
+  }
+
+  return true;
+}
+
+AliasAnalysis::AliasResult
+ScopedNoAliasAA::alias(const Location &LocA, const Location &LocB) {
+  if (!EnableScopedNoAlias)
+    return AliasAnalysis::alias(LocA, LocB);
+
+  // Get the attached MDNodes.
+  const MDNode *AScopes = LocA.AATags.Scope,
+               *BScopes = LocB.AATags.Scope;
+
+  const MDNode *ANoAlias = LocA.AATags.NoAlias,
+               *BNoAlias = LocB.AATags.NoAlias;
+
+  if (!mayAliasInScopes(AScopes, BNoAlias))
+    return NoAlias;
+
+  if (!mayAliasInScopes(BScopes, ANoAlias))
+    return NoAlias;
+
+  // If they may alias, chain to the next AliasAnalysis.
+  return AliasAnalysis::alias(LocA, LocB);
+}
+
+bool ScopedNoAliasAA::pointsToConstantMemory(const Location &Loc,
+                                             bool OrLocal) {
+  return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+}
+
+AliasAnalysis::ModRefBehavior
+ScopedNoAliasAA::getModRefBehavior(ImmutableCallSite CS) {
+  return AliasAnalysis::getModRefBehavior(CS);
+}
+
+AliasAnalysis::ModRefBehavior
+ScopedNoAliasAA::getModRefBehavior(const Function *F) {
+  return AliasAnalysis::getModRefBehavior(F);
+}
+
+AliasAnalysis::ModRefResult
+ScopedNoAliasAA::getModRefInfo(ImmutableCallSite CS, const Location &Loc) {
+  if (!EnableScopedNoAlias)
+    return AliasAnalysis::getModRefInfo(CS, Loc);
+
+  if (!mayAliasInScopes(Loc.AATags.Scope, CS.getInstruction()->getMetadata(
+                                              LLVMContext::MD_noalias)))
+    return NoModRef;
+
+  if (!mayAliasInScopes(
+          CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope),
+          Loc.AATags.NoAlias))
+    return NoModRef;
+
+  return AliasAnalysis::getModRefInfo(CS, Loc);
+}
+
+AliasAnalysis::ModRefResult
+ScopedNoAliasAA::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) {
+  if (!EnableScopedNoAlias)
+    return AliasAnalysis::getModRefInfo(CS1, CS2);
+
+  if (!mayAliasInScopes(
+          CS1.getInstruction()->getMetadata(LLVMContext::MD_alias_scope),
+          CS2.getInstruction()->getMetadata(LLVMContext::MD_noalias)))
+    return NoModRef;
+
+  if (!mayAliasInScopes(
+          CS2.getInstruction()->getMetadata(LLVMContext::MD_alias_scope),
+          CS1.getInstruction()->getMetadata(LLVMContext::MD_noalias)))
+    return NoModRef;
+
+  return AliasAnalysis::getModRefInfo(CS1, CS2);
+}
+

diff --git a/lib/Analysis/StratifiedSets.h b/lib/Analysis/StratifiedSets.h
new file mode 100644
index 0000000..fd3fbc0
--- /dev/null
+++ b/lib/Analysis/StratifiedSets.h

@@ -0,0 +1,692 @@
+//===- StratifiedSets.h - Abstract stratified sets implementation. --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_STRATIFIEDSETS_H
+#define LLVM_ADT_STRATIFIEDSETS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+#include <bitset>
+#include <cassert>
+#include <cmath>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+// \brief An index into Stratified Sets.
+typedef unsigned StratifiedIndex;
+// NOTE: ^ This can't be a short -- bootstrapping clang has a case where
+// ~1M sets exist.
+
+// \brief Container of information related to a value in a StratifiedSet.
+struct StratifiedInfo {
+  StratifiedIndex Index;
+  // For field sensitivity, etc. we can tack attributes on to this struct.
+};
+
+// The number of attributes that StratifiedAttrs should contain. Attributes are
+// described below, and 32 was an arbitrary choice because it fits nicely in 32
+// bits (because we use a bitset for StratifiedAttrs).
+static const unsigned NumStratifiedAttrs = 32;
+
+// These are attributes that the users of StratifiedSets/StratifiedSetBuilders
+// may use for various purposes. These also have the special property of that
+// they are merged down. So, if set A is above set B, and one decides to set an
+// attribute in set A, then the attribute will automatically be set in set B.
+typedef std::bitset<NumStratifiedAttrs> StratifiedAttrs;
+
+// \brief A "link" between two StratifiedSets.
+struct StratifiedLink {
+  // \brief This is a value used to signify "does not exist" where
+  // the StratifiedIndex type is used. This is used instead of
+  // Optional<StratifiedIndex> because Optional<StratifiedIndex> would
+  // eat up a considerable amount of extra memory, after struct
+  // padding/alignment is taken into account.
+  static const StratifiedIndex SetSentinel;
+
+  // \brief The index for the set "above" current
+  StratifiedIndex Above;
+
+  // \brief The link for the set "below" current
+  StratifiedIndex Below;
+
+  // \brief Attributes for these StratifiedSets.
+  StratifiedAttrs Attrs;
+
+  StratifiedLink() : Above(SetSentinel), Below(SetSentinel) {}
+
+  bool hasBelow() const { return Below != SetSentinel; }
+  bool hasAbove() const { return Above != SetSentinel; }
+
+  void clearBelow() { Below = SetSentinel; }
+  void clearAbove() { Above = SetSentinel; }
+};
+
+// \brief These are stratified sets, as described in "Fast algorithms for
+// Dyck-CFL-reachability with applications to Alias Analysis" by Zhang Q, Lyu M
+// R, Yuan H, and Su Z. -- in short, this is meant to represent different sets
+// of Value*s. If two Value*s are in the same set, or if both sets have 
+// overlapping attributes, then the Value*s are said to alias.
+//
+// Sets may be related by position, meaning that one set may be considered as
+// above or below another. In CFL Alias Analysis, this gives us an indication
+// of how two variables are related; if the set of variable A is below a set
+// containing variable B, then at some point, a variable that has interacted
+// with B (or B itself) was either used in order to extract the variable A, or
+// was used as storage of variable A.
+//
+// Sets may also have attributes (as noted above). These attributes are
+// generally used for noting whether a variable in the set has interacted with
+// a variable whose origins we don't quite know (i.e. globals/arguments), or if
+// the variable may have had operations performed on it (modified in a function
+// call). All attributes that exist in a set A must exist in all sets marked as
+// below set A.
+template <typename T> class StratifiedSets {
+public:
+  StratifiedSets() {}
+
+  StratifiedSets(DenseMap<T, StratifiedInfo> Map,
+                 std::vector<StratifiedLink> Links)
+      : Values(std::move(Map)), Links(std::move(Links)) {}
+
+  StratifiedSets(StratifiedSets<T> &&Other) { *this = std::move(Other); }
+
+  StratifiedSets &operator=(StratifiedSets<T> &&Other) {
+    Values = std::move(Other.Values);
+    Links = std::move(Other.Links);
+    return *this;
+  }
+
+  Optional<StratifiedInfo> find(const T &Elem) const {
+    auto Iter = Values.find(Elem);
+    if (Iter == Values.end()) {
+      return NoneType();
+    }
+    return Iter->second;
+  }
+
+  const StratifiedLink &getLink(StratifiedIndex Index) const {
+    assert(inbounds(Index));
+    return Links[Index];
+  }
+
+private:
+  DenseMap<T, StratifiedInfo> Values;
+  std::vector<StratifiedLink> Links;
+
+  bool inbounds(StratifiedIndex Idx) const { return Idx < Links.size(); }
+};
+
+// \brief Generic Builder class that produces StratifiedSets instances.
+//
+// The goal of this builder is to efficiently produce correct StratifiedSets
+// instances. To this end, we use a few tricks:
+//   > Set chains (A method for linking sets together)
+//   > Set remaps (A method for marking a set as an alias [irony?] of another)
+//
+// ==== Set chains ====
+// This builder has a notion of some value A being above, below, or with some
+// other value B:
+//   > The `A above B` relationship implies that there is a reference edge going
+//   from A to B. Namely, it notes that A can store anything in B's set.
+//   > The `A below B` relationship is the opposite of `A above B`. It implies
+//   that there's a dereference edge going from A to B.
+//   > The `A with B` relationship states that there's an assignment edge going
+//   from A to B, and that A and B should be treated as equals.
+//
+// As an example, take the following code snippet:
+//
+// %a = alloca i32, align 4
+// %ap = alloca i32*, align 8
+// %app = alloca i32**, align 8
+// store %a, %ap
+// store %ap, %app
+// %aw = getelementptr %ap, 0
+//
+// Given this, the following relations exist:
+//   - %a below %ap & %ap above %a
+//   - %ap below %app & %app above %ap
+//   - %aw with %ap & %ap with %aw
+//
+// These relations produce the following sets:
+//   [{%a}, {%ap, %aw}, {%app}]
+//
+// ...Which states that the only MayAlias relationship in the above program is
+// between %ap and %aw.
+//
+// Life gets more complicated when we actually have logic in our programs. So,
+// we either must remove this logic from our programs, or make concessions for
+// it in our AA algorithms. In this case, we have decided to select the latter
+// option.
+//
+// First complication: Conditionals
+// Motivation:
+//  %ad = alloca int, align 4
+//  %a = alloca int*, align 8
+//  %b = alloca int*, align 8
+//  %bp = alloca int**, align 8
+//  %c = call i1 @SomeFunc()
+//  %k = select %c, %a, %b
+//  store %ad, %a
+//  store %b, %bp
+//
+// %k has 'with' edges to both %a and %b, which ordinarily would not be linked
+// together. So, we merge the set that contains %a with the set that contains
+// %b. We then recursively merge the set above %a with the set above %b, and
+// the set below %a with the set below %b, etc. Ultimately, the sets for this
+// program would end up like: {%ad}, {%a, %b, %k}, {%bp}, where {%ad} is below
+// {%a, %b, %k}, which is below {%bp}.
+//
+// Second complication: Arbitrary casts
+// Motivation:
+//  %ip = alloca int*, align 8
+//  %ipp = alloca int**, align 8
+//  %i = bitcast ipp to int
+//  store %ip, %ipp
+//  store %i, %ip
+//
+// This is impossible to construct with any of the rules above, because a set
+// containing both {%i, %ipp} is supposed to exist, the set with %i is supposed
+// to be below the set with %ip, and the set with %ip is supposed to be below
+// the set with %ipp. Because we don't allow circular relationships like this,
+// we merge all concerned sets into one. So, the above code would generate a
+// single StratifiedSet: {%ip, %ipp, %i}.
+//
+// ==== Set remaps ====
+// More of an implementation detail than anything -- when merging sets, we need
+// to update the numbers of all of the elements mapped to those sets. Rather
+// than doing this at each merge, we note in the BuilderLink structure that a
+// remap has occurred, and use this information so we can defer renumbering set
+// elements until build time.
+template <typename T> class StratifiedSetsBuilder {
+  // \brief Represents a Stratified Set, with information about the Stratified
+  // Set above it, the set below it, and whether the current set has been
+  // remapped to another.
+  struct BuilderLink {
+    const StratifiedIndex Number;
+
+    BuilderLink(StratifiedIndex N) : Number(N) {
+      Remap = StratifiedLink::SetSentinel;
+    }
+
+    bool hasAbove() const {
+      assert(!isRemapped());
+      return Link.hasAbove();
+    }
+
+    bool hasBelow() const {
+      assert(!isRemapped());
+      return Link.hasBelow();
+    }
+
+    void setBelow(StratifiedIndex I) {
+      assert(!isRemapped());
+      Link.Below = I;
+    }
+
+    void setAbove(StratifiedIndex I) {
+      assert(!isRemapped());
+      Link.Above = I;
+    }
+
+    void clearBelow() {
+      assert(!isRemapped());
+      Link.clearBelow();
+    }
+
+    void clearAbove() {
+      assert(!isRemapped());
+      Link.clearAbove();
+    }
+
+    StratifiedIndex getBelow() const {
+      assert(!isRemapped());
+      assert(hasBelow());
+      return Link.Below;
+    }
+
+    StratifiedIndex getAbove() const {
+      assert(!isRemapped());
+      assert(hasAbove());
+      return Link.Above;
+    }
+
+    StratifiedAttrs &getAttrs() {
+      assert(!isRemapped());
+      return Link.Attrs;
+    }
+
+    void setAttr(unsigned index) {
+      assert(!isRemapped());
+      assert(index < NumStratifiedAttrs);
+      Link.Attrs.set(index);
+    }
+
+    void setAttrs(const StratifiedAttrs &other) {
+      assert(!isRemapped());
+      Link.Attrs |= other;
+    }
+
+    bool isRemapped() const { return Remap != StratifiedLink::SetSentinel; }
+
+    // \brief For initial remapping to another set
+    void remapTo(StratifiedIndex Other) {
+      assert(!isRemapped());
+      Remap = Other;
+    }
+
+    StratifiedIndex getRemapIndex() const {
+      assert(isRemapped());
+      return Remap;
+    }
+
+    // \brief Should only be called when we're already remapped.
+    void updateRemap(StratifiedIndex Other) {
+      assert(isRemapped());
+      Remap = Other;
+    }
+
+    // \brief Prefer the above functions to calling things directly on what's
+    // returned from this -- they guard against unexpected calls when the
+    // current BuilderLink is remapped.
+    const StratifiedLink &getLink() const { return Link; }
+
+  private:
+    StratifiedLink Link;
+    StratifiedIndex Remap;
+  };
+
+  // \brief This function performs all of the set unioning/value renumbering
+  // that we've been putting off, and generates a vector<StratifiedLink> that
+  // may be placed in a StratifiedSets instance.
+  void finalizeSets(std::vector<StratifiedLink> &StratLinks) {
+    DenseMap<StratifiedIndex, StratifiedIndex> Remaps;
+    for (auto &Link : Links) {
+      if (Link.isRemapped()) {
+        continue;
+      }
+
+      StratifiedIndex Number = StratLinks.size();
+      Remaps.insert(std::make_pair(Link.Number, Number));
+      StratLinks.push_back(Link.getLink());
+    }
+
+    for (auto &Link : StratLinks) {
+      if (Link.hasAbove()) {
+        auto &Above = linksAt(Link.Above);
+        auto Iter = Remaps.find(Above.Number);
+        assert(Iter != Remaps.end());
+        Link.Above = Iter->second;
+      }
+
+      if (Link.hasBelow()) {
+        auto &Below = linksAt(Link.Below);
+        auto Iter = Remaps.find(Below.Number);
+        assert(Iter != Remaps.end());
+        Link.Below = Iter->second;
+      }
+    }
+
+    for (auto &Pair : Values) {
+      auto &Info = Pair.second;
+      auto &Link = linksAt(Info.Index);
+      auto Iter = Remaps.find(Link.Number);
+      assert(Iter != Remaps.end());
+      Info.Index = Iter->second;
+    }
+  }
+
+  // \brief There's a guarantee in StratifiedLink where all bits set in a
+  // Link.Attrs will be set in all Link.Attrs "below" it.
+  static void propagateAttrs(std::vector<StratifiedLink> &Links) {
+    const auto getHighestParentAbove = [&Links](StratifiedIndex Idx) {
+      const auto *Link = &Links[Idx];
+      while (Link->hasAbove()) {
+        Idx = Link->Above;
+        Link = &Links[Idx];
+      }
+      return Idx;
+    };
+
+    SmallSet<StratifiedIndex, 16> Visited;
+    for (unsigned I = 0, E = Links.size(); I < E; ++I) {
+      auto CurrentIndex = getHighestParentAbove(I);
+      if (!Visited.insert(CurrentIndex).second) {
+        continue;
+      }
+
+      while (Links[CurrentIndex].hasBelow()) {
+        auto &CurrentBits = Links[CurrentIndex].Attrs;
+        auto NextIndex = Links[CurrentIndex].Below;
+        auto &NextBits = Links[NextIndex].Attrs;
+        NextBits |= CurrentBits;
+        CurrentIndex = NextIndex;
+      }
+    }
+  }
+
+public:
+  // \brief Builds a StratifiedSet from the information we've been given since
+  // either construction or the prior build() call.
+  StratifiedSets<T> build() {
+    std::vector<StratifiedLink> StratLinks;
+    finalizeSets(StratLinks);
+    propagateAttrs(StratLinks);
+    Links.clear();
+    return StratifiedSets<T>(std::move(Values), std::move(StratLinks));
+  }
+
+  std::size_t size() const { return Values.size(); }
+  std::size_t numSets() const { return Links.size(); }
+
+  bool has(const T &Elem) const { return get(Elem).hasValue(); }
+
+  bool add(const T &Main) {
+    if (get(Main).hasValue())
+      return false;
+
+    auto NewIndex = getNewUnlinkedIndex();
+    return addAtMerging(Main, NewIndex);
+  }
+
+  // \brief Restructures the stratified sets as necessary to make "ToAdd" in a
+  // set above "Main". There are some cases where this is not possible (see
+  // above), so we merge them such that ToAdd and Main are in the same set.
+  bool addAbove(const T &Main, const T &ToAdd) {
+    assert(has(Main));
+    auto Index = *indexOf(Main);
+    if (!linksAt(Index).hasAbove())
+      addLinkAbove(Index);
+
+    auto Above = linksAt(Index).getAbove();
+    return addAtMerging(ToAdd, Above);
+  }
+
+  // \brief Restructures the stratified sets as necessary to make "ToAdd" in a
+  // set below "Main". There are some cases where this is not possible (see
+  // above), so we merge them such that ToAdd and Main are in the same set.
+  bool addBelow(const T &Main, const T &ToAdd) {
+    assert(has(Main));
+    auto Index = *indexOf(Main);
+    if (!linksAt(Index).hasBelow())
+      addLinkBelow(Index);
+
+    auto Below = linksAt(Index).getBelow();
+    return addAtMerging(ToAdd, Below);
+  }
+
+  bool addWith(const T &Main, const T &ToAdd) {
+    assert(has(Main));
+    auto MainIndex = *indexOf(Main);
+    return addAtMerging(ToAdd, MainIndex);
+  }
+
+  void noteAttribute(const T &Main, unsigned AttrNum) {
+    assert(has(Main));
+    assert(AttrNum < StratifiedLink::SetSentinel);
+    auto *Info = *get(Main);
+    auto &Link = linksAt(Info->Index);
+    Link.setAttr(AttrNum);
+  }
+
+  void noteAttributes(const T &Main, const StratifiedAttrs &NewAttrs) {
+    assert(has(Main));
+    auto *Info = *get(Main);
+    auto &Link = linksAt(Info->Index);
+    Link.setAttrs(NewAttrs);
+  }
+
+  StratifiedAttrs getAttributes(const T &Main) {
+    assert(has(Main));
+    auto *Info = *get(Main);
+    auto *Link = &linksAt(Info->Index);
+    auto Attrs = Link->getAttrs();
+    while (Link->hasAbove()) {
+      Link = &linksAt(Link->getAbove());
+      Attrs |= Link->getAttrs();
+    }
+
+    return Attrs;
+  }
+
+  bool getAttribute(const T &Main, unsigned AttrNum) {
+    assert(AttrNum < StratifiedLink::SetSentinel);
+    auto Attrs = getAttributes(Main);
+    return Attrs[AttrNum];
+  }
+
+  // \brief Gets the attributes that have been applied to the set that Main
+  // belongs to. It ignores attributes in any sets above the one that Main
+  // resides in.
+  StratifiedAttrs getRawAttributes(const T &Main) {
+    assert(has(Main));
+    auto *Info = *get(Main);
+    auto &Link = linksAt(Info->Index);
+    return Link.getAttrs();
+  }
+
+  // \brief Gets an attribute from the attributes that have been applied to the
+  // set that Main belongs to. It ignores attributes in any sets above the one
+  // that Main resides in.
+  bool getRawAttribute(const T &Main, unsigned AttrNum) {
+    assert(AttrNum < StratifiedLink::SetSentinel);
+    auto Attrs = getRawAttributes(Main);
+    return Attrs[AttrNum];
+  }
+
+private:
+  DenseMap<T, StratifiedInfo> Values;
+  std::vector<BuilderLink> Links;
+
+  // \brief Adds the given element at the given index, merging sets if
+  // necessary.
+  bool addAtMerging(const T &ToAdd, StratifiedIndex Index) {
+    StratifiedInfo Info = {Index};
+    auto Pair = Values.insert(std::make_pair(ToAdd, Info));
+    if (Pair.second)
+      return true;
+
+    auto &Iter = Pair.first;
+    auto &IterSet = linksAt(Iter->second.Index);
+    auto &ReqSet = linksAt(Index);
+
+    // Failed to add where we wanted to. Merge the sets.
+    if (&IterSet != &ReqSet)
+      merge(IterSet.Number, ReqSet.Number);
+
+    return false;
+  }
+
+  // \brief Gets the BuilderLink at the given index, taking set remapping into
+  // account.
+  BuilderLink &linksAt(StratifiedIndex Index) {
+    auto *Start = &Links[Index];
+    if (!Start->isRemapped())
+      return *Start;
+
+    auto *Current = Start;
+    while (Current->isRemapped())
+      Current = &Links[Current->getRemapIndex()];
+
+    auto NewRemap = Current->Number;
+
+    // Run through everything that has yet to be updated, and update them to
+    // remap to NewRemap
+    Current = Start;
+    while (Current->isRemapped()) {
+      auto *Next = &Links[Current->getRemapIndex()];
+      Current->updateRemap(NewRemap);
+      Current = Next;
+    }
+
+    return *Current;
+  }
+
+  // \brief Merges two sets into one another. Assumes that these sets are not
+  // already one and the same.
+  void merge(StratifiedIndex Idx1, StratifiedIndex Idx2) {
+    assert(inbounds(Idx1) && inbounds(Idx2));
+    assert(&linksAt(Idx1) != &linksAt(Idx2) &&
+           "Merging a set into itself is not allowed");
+
+    // CASE 1: If the set at `Idx1` is above or below `Idx2`, we need to merge
+    // both of the given sets, and all of the sets between them, into a single
+    // set.
+    if (tryMergeUpwards(Idx1, Idx2))
+      return;
+
+    if (tryMergeUpwards(Idx2, Idx1))
+      return;
+
+    // CASE 2: The set at `Idx1` is not in the same chain as the set at `Idx2`.
+    // We therefore need to merge the two chains together.
+    mergeDirect(Idx1, Idx2);
+  }
+
+  // \brief Merges two sets assuming that the set at `Idx1` is unreachable from
+  // traversing above or below the set at `Idx2`.
+  void mergeDirect(StratifiedIndex Idx1, StratifiedIndex Idx2) {
+    assert(inbounds(Idx1) && inbounds(Idx2));
+
+    auto *LinksInto = &linksAt(Idx1);
+    auto *LinksFrom = &linksAt(Idx2);
+    // Merging everything above LinksInto then proceeding to merge everything
+    // below LinksInto becomes problematic, so we go as far "up" as possible!
+    while (LinksInto->hasAbove() && LinksFrom->hasAbove()) {
+      LinksInto = &linksAt(LinksInto->getAbove());
+      LinksFrom = &linksAt(LinksFrom->getAbove());
+    }
+
+    if (LinksFrom->hasAbove()) {
+      LinksInto->setAbove(LinksFrom->getAbove());
+      auto &NewAbove = linksAt(LinksInto->getAbove());
+      NewAbove.setBelow(LinksInto->Number);
+    }
+
+    // Merging strategy:
+    //  > If neither has links below, stop.
+    //  > If only `LinksInto` has links below, stop.
+    //  > If only `LinksFrom` has links below, reset `LinksInto.Below` to
+    //  match `LinksFrom.Below`
+    //  > If both have links below, deal with those next.
+    while (LinksInto->hasBelow() && LinksFrom->hasBelow()) {
+      auto &FromAttrs = LinksFrom->getAttrs();
+      LinksInto->setAttrs(FromAttrs);
+
+      // Remap needs to happen after getBelow(), but before
+      // assignment of LinksFrom
+      auto *NewLinksFrom = &linksAt(LinksFrom->getBelow());
+      LinksFrom->remapTo(LinksInto->Number);
+      LinksFrom = NewLinksFrom;
+      LinksInto = &linksAt(LinksInto->getBelow());
+    }
+
+    if (LinksFrom->hasBelow()) {
+      LinksInto->setBelow(LinksFrom->getBelow());
+      auto &NewBelow = linksAt(LinksInto->getBelow());
+      NewBelow.setAbove(LinksInto->Number);
+    }
+
+    LinksFrom->remapTo(LinksInto->Number);
+  }
+
+  // \brief Checks to see if LowerIndex is at a level lower than UpperIndex.
+  // If so, it will merge LowerIndex with UpperIndex (and all of the sets
+  // between) and return true. Otherwise, it will return false.
+  bool tryMergeUpwards(StratifiedIndex LowerIndex, StratifiedIndex UpperIndex) {
+    assert(inbounds(LowerIndex) && inbounds(UpperIndex));
+    auto *Lower = &linksAt(LowerIndex);
+    auto *Upper = &linksAt(UpperIndex);
+    if (Lower == Upper)
+      return true;
+
+    SmallVector<BuilderLink *, 8> Found;
+    auto *Current = Lower;
+    auto Attrs = Current->getAttrs();
+    while (Current->hasAbove() && Current != Upper) {
+      Found.push_back(Current);
+      Attrs |= Current->getAttrs();
+      Current = &linksAt(Current->getAbove());
+    }
+
+    if (Current != Upper)
+      return false;
+
+    Upper->setAttrs(Attrs);
+
+    if (Lower->hasBelow()) {
+      auto NewBelowIndex = Lower->getBelow();
+      Upper->setBelow(NewBelowIndex);
+      auto &NewBelow = linksAt(NewBelowIndex);
+      NewBelow.setAbove(UpperIndex);
+    } else {
+      Upper->clearBelow();
+    }
+
+    for (const auto &Ptr : Found)
+      Ptr->remapTo(Upper->Number);
+
+    return true;
+  }
+
+  Optional<const StratifiedInfo *> get(const T &Val) const {
+    auto Result = Values.find(Val);
+    if (Result == Values.end())
+      return NoneType();
+    return &Result->second;
+  }
+
+  Optional<StratifiedInfo *> get(const T &Val) {
+    auto Result = Values.find(Val);
+    if (Result == Values.end())
+      return NoneType();
+    return &Result->second;
+  }
+
+  Optional<StratifiedIndex> indexOf(const T &Val) {
+    auto MaybeVal = get(Val);
+    if (!MaybeVal.hasValue())
+      return NoneType();
+    auto *Info = *MaybeVal;
+    auto &Link = linksAt(Info->Index);
+    return Link.Number;
+  }
+
+  StratifiedIndex addLinkBelow(StratifiedIndex Set) {
+    auto At = addLinks();
+    Links[Set].setBelow(At);
+    Links[At].setAbove(Set);
+    return At;
+  }
+
+  StratifiedIndex addLinkAbove(StratifiedIndex Set) {
+    auto At = addLinks();
+    Links[At].setBelow(Set);
+    Links[Set].setAbove(At);
+    return At;
+  }
+
+  StratifiedIndex getNewUnlinkedIndex() { return addLinks(); }
+
+  StratifiedIndex addLinks() {
+    auto Link = Links.size();
+    Links.push_back(BuilderLink(Link));
+    return Link;
+  }
+
+  bool inbounds(StratifiedIndex N) const { return N < Links.size(); }
+};
+}
+#endif // LLVM_ADT_STRATIFIEDSETS_H

diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index cdb0b79..c1ffb9d 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp

@@ -87,9 +87,10 @@
   return PrevTTI->isLoweredToCall(F);
 }
 
-void TargetTransformInfo::getUnrollingPreferences(Loop *L,
-                            UnrollingPreferences &UP) const {
-  PrevTTI->getUnrollingPreferences(L, UP);
+void
+TargetTransformInfo::getUnrollingPreferences(const Function *F, Loop *L,
+                                             UnrollingPreferences &UP) const {
+  PrevTTI->getUnrollingPreferences(F, L, UP);
 }
 
 bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
@@ -167,15 +168,16 @@
   return PrevTTI->getRegisterBitWidth(Vector);
 }
 
-unsigned TargetTransformInfo::getMaximumUnrollFactor() const {
-  return PrevTTI->getMaximumUnrollFactor();
+unsigned TargetTransformInfo::getMaxInterleaveFactor() const {
+  return PrevTTI->getMaxInterleaveFactor();
 }
 
-unsigned TargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
-                                                Type *Ty,
-                                                OperandValueKind Op1Info,
-                                                OperandValueKind Op2Info) const {
-  return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+unsigned TargetTransformInfo::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
+    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
+    OperandValueProperties Opd2PropInfo) const {
+  return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
 }
 
 unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Tp,
@@ -230,6 +232,11 @@
   return PrevTTI->getReductionCost(Opcode, Ty, IsPairwise);
 }
 
+unsigned TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys)
+  const {
+  return PrevTTI->getCostOfKeepingLiveOverCall(Tys);
+}
+
 namespace {
 
 struct NoTTI final : ImmutablePass, TargetTransformInfo {
@@ -239,7 +246,7 @@
     initializeNoTTIPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual void initializePass() override {
+  void initializePass() override {
     // Note that this subclass is special, and must *not* call initializeTTI as
     // it does not chain.
     TopTTI = this;
@@ -248,7 +255,7 @@
     DL = DLP ? &DLP->getDataLayout() : nullptr;
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     // Note that this subclass is special, and must *not* call
     // TTI::getAnalysisUsage as it breaks the recursion.
   }
@@ -257,7 +264,7 @@
   static char ID;
 
   /// Provide necessary pointer adjustments for the two base classes.
-  virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+  void *getAdjustedAnalysisPointer(const void *ID) override {
     if (ID == &TargetTransformInfo::ID)
       return (TargetTransformInfo*)this;
     return this;
@@ -385,6 +392,8 @@
       // FIXME: This is wrong for libc intrinsics.
       return TCC_Basic;
 
+    case Intrinsic::annotation:
+    case Intrinsic::assume:
     case Intrinsic::dbg_declare:
     case Intrinsic::dbg_value:
     case Intrinsic::invariant_start:
@@ -466,6 +475,8 @@
     // These will all likely lower to a single selection DAG node.
     if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" ||
         Name == "fabs" || Name == "fabsf" || Name == "fabsl" || Name == "sin" ||
+        Name == "fmin" || Name == "fminf" || Name == "fminl" ||
+        Name == "fmax" || Name == "fmaxf" || Name == "fmaxl" ||
         Name == "sinf" || Name == "sinl" || Name == "cos" || Name == "cosf" ||
         Name == "cosl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
       return false;
@@ -480,8 +491,8 @@
     return true;
   }
 
-  void getUnrollingPreferences(Loop *, UnrollingPreferences &) const override {
-  }
+  void getUnrollingPreferences(const Function *, Loop *,
+                               UnrollingPreferences &) const override {}
 
   bool isLegalAddImmediate(int64_t Imm) const override {
     return false;
@@ -558,12 +569,13 @@
     return 32;
   }
 
-  unsigned getMaximumUnrollFactor() const override {
+  unsigned getMaxInterleaveFactor() const override {
     return 1;
   }
 
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
-                                  OperandValueKind) const override {
+                                  OperandValueKind, OperandValueProperties,
+                                  OperandValueProperties) const override {
     return 1;
   }
 
@@ -612,6 +624,11 @@
   unsigned getReductionCost(unsigned, Type *, bool) const override {
     return 1;
   }
+
+  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override {
+    return 0;
+  }
+
 };
 
 } // end anonymous namespace

diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index f36f6f8..f347eb5 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp

@@ -454,9 +454,9 @@
 
   // Get the attached MDNodes. If either value lacks a tbaa MDNode, we must
   // be conservative.
-  const MDNode *AM = LocA.TBAATag;
+  const MDNode *AM = LocA.AATags.TBAA;
   if (!AM) return AliasAnalysis::alias(LocA, LocB);
-  const MDNode *BM = LocB.TBAATag;
+  const MDNode *BM = LocB.AATags.TBAA;
   if (!BM) return AliasAnalysis::alias(LocA, LocB);
 
   // If they may alias, chain to the next AliasAnalysis.
@@ -472,7 +472,7 @@
   if (!EnableTBAA)
     return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
 
-  const MDNode *M = Loc.TBAATag;
+  const MDNode *M = Loc.AATags.TBAA;
   if (!M) return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
 
   // If this is an "immutable" type, we can assume the pointer is pointing
@@ -513,9 +513,9 @@
   if (!EnableTBAA)
     return AliasAnalysis::getModRefInfo(CS, Loc);
 
-  if (const MDNode *L = Loc.TBAATag)
+  if (const MDNode *L = Loc.AATags.TBAA)
     if (const MDNode *M =
-          CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
+            CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
       if (!Aliases(L, M))
         return NoModRef;
 
@@ -529,9 +529,9 @@
     return AliasAnalysis::getModRefInfo(CS1, CS2);
 
   if (const MDNode *M1 =
-        CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
+          CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
     if (const MDNode *M2 =
-          CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
+            CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
       if (!Aliases(M1, M2))
         return NoModRef;
 
@@ -611,3 +611,24 @@
   Value *Ops[3] = { Ret, Ret, ConstantInt::get(Int64, 0) };
   return MDNode::get(A->getContext(), Ops);
 }
+
+void Instruction::getAAMetadata(AAMDNodes &N, bool Merge) const {
+  if (Merge)
+    N.TBAA =
+        MDNode::getMostGenericTBAA(N.TBAA, getMetadata(LLVMContext::MD_tbaa));
+  else
+    N.TBAA = getMetadata(LLVMContext::MD_tbaa);
+
+  if (Merge)
+    N.Scope =
+        MDNode::intersect(N.Scope, getMetadata(LLVMContext::MD_alias_scope));
+  else
+    N.Scope = getMetadata(LLVMContext::MD_alias_scope);
+
+  if (Merge)
+    N.NoAlias =
+        MDNode::intersect(N.NoAlias, getMetadata(LLVMContext::MD_noalias));
+  else
+    N.NoAlias = getMetadata(LLVMContext::MD_noalias);
+}
+

diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 5264745..e9bbf83 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp

@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
@@ -20,6 +21,7 @@
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -29,6 +31,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include <cstring>
 using namespace llvm;
@@ -36,8 +39,8 @@
 
 const unsigned MaxDepth = 6;
 
-/// getBitWidth - Returns the bitwidth of the given scalar or pointer type (if
-/// unknown returns 0).  For vector types, returns the element type's bitwidth.
+/// Returns the bitwidth of the given scalar or pointer type (if unknown returns
+/// 0). For vector types, returns the element type's bitwidth.
 static unsigned getBitWidth(Type *Ty, const DataLayout *TD) {
   if (unsigned BitWidth = Ty->getScalarSizeInBits())
     return BitWidth;
@@ -45,10 +48,125 @@
   return TD ? TD->getPointerTypeSizeInBits(Ty) : 0;
 }
 
+// Many of these functions have internal versions that take an assumption
+// exclusion set. This is because of the potential for mutual recursion to
+// cause computeKnownBits to repeatedly visit the same assume intrinsic. The
+// classic case of this is assume(x = y), which will attempt to determine
+// bits in x from bits in y, which will attempt to determine bits in y from
+// bits in x, etc. Regarding the mutual recursion, computeKnownBits can call
+// isKnownNonZero, which calls computeKnownBits and ComputeSignBit and
+// isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so on.
+typedef SmallPtrSet<const Value *, 8> ExclInvsSet;
+
+namespace {
+// Simplifying using an assume can only be done in a particular control-flow
+// context (the context instruction provides that context). If an assume and
+// the context instruction are not in the same block then the DT helps in
+// figuring out if we can use it.
+struct Query {
+  ExclInvsSet ExclInvs;
+  AssumptionTracker *AT;
+  const Instruction *CxtI;
+  const DominatorTree *DT;
+
+  Query(AssumptionTracker *AT = nullptr, const Instruction *CxtI = nullptr,
+        const DominatorTree *DT = nullptr)
+    : AT(AT), CxtI(CxtI), DT(DT) {}
+
+  Query(const Query &Q, const Value *NewExcl)
+    : ExclInvs(Q.ExclInvs), AT(Q.AT), CxtI(Q.CxtI), DT(Q.DT) {
+    ExclInvs.insert(NewExcl);
+  }
+};
+} // end anonymous namespace
+
+// Given the provided Value and, potentially, a context instruction, return
+// the preferred context instruction (if any).
+static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) {
+  // If we've been provided with a context instruction, then use that (provided
+  // it has been inserted).
+  if (CxtI && CxtI->getParent())
+    return CxtI;
+
+  // If the value is really an already-inserted instruction, then use that.
+  CxtI = dyn_cast<Instruction>(V);
+  if (CxtI && CxtI->getParent())
+    return CxtI;
+
+  return nullptr;
+}
+
+static void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
+                            const DataLayout *TD, unsigned Depth,
+                            const Query &Q);
+
+void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
+                            const DataLayout *TD, unsigned Depth,
+                            AssumptionTracker *AT, const Instruction *CxtI,
+                            const DominatorTree *DT) {
+  ::computeKnownBits(V, KnownZero, KnownOne, TD, Depth,
+                     Query(AT, safeCxtI(V, CxtI), DT));
+}
+
+static void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
+                          const DataLayout *TD, unsigned Depth,
+                          const Query &Q);
+
+void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
+                          const DataLayout *TD, unsigned Depth,
+                          AssumptionTracker *AT, const Instruction *CxtI,
+                          const DominatorTree *DT) {
+  ::ComputeSignBit(V, KnownZero, KnownOne, TD, Depth,
+                   Query(AT, safeCxtI(V, CxtI), DT));
+}
+
+static bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
+                                   const Query &Q);
+
+bool llvm::isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
+                                  AssumptionTracker *AT,
+                                  const Instruction *CxtI,
+                                  const DominatorTree *DT) {
+  return ::isKnownToBeAPowerOfTwo(V, OrZero, Depth,
+                                  Query(AT, safeCxtI(V, CxtI), DT));
+}
+
+static bool isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth,
+                           const Query &Q);
+
+bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth,
+                          AssumptionTracker *AT, const Instruction *CxtI,
+                          const DominatorTree *DT) {
+  return ::isKnownNonZero(V, TD, Depth, Query(AT, safeCxtI(V, CxtI), DT));
+}
+
+static bool MaskedValueIsZero(Value *V, const APInt &Mask,
+                              const DataLayout *TD, unsigned Depth,
+                              const Query &Q);
+
+bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask,
+                             const DataLayout *TD, unsigned Depth,
+                             AssumptionTracker *AT, const Instruction *CxtI,
+                             const DominatorTree *DT) {
+  return ::MaskedValueIsZero(V, Mask, TD, Depth,
+                             Query(AT, safeCxtI(V, CxtI), DT));
+}
+
+static unsigned ComputeNumSignBits(Value *V, const DataLayout *TD,
+                                   unsigned Depth, const Query &Q);
+
+unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD,
+                                  unsigned Depth, AssumptionTracker *AT,
+                                  const Instruction *CxtI,
+                                  const DominatorTree *DT) {
+  return ::ComputeNumSignBits(V, TD, Depth, Query(AT, safeCxtI(V, CxtI), DT));
+}
+
 static void computeKnownBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
                                    APInt &KnownZero, APInt &KnownOne,
                                    APInt &KnownZero2, APInt &KnownOne2,
-                                   const DataLayout *TD, unsigned Depth) {
+                                   const DataLayout *TD, unsigned Depth,
+                                   const Query &Q) {
   if (!Add) {
     if (ConstantInt *CLHS = dyn_cast<ConstantInt>(Op0)) {
       // We know that the top bits of C-X are clear if X contains less bits
@@ -59,7 +177,7 @@
         unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros();
         // NLZ can't be BitWidth with no sign bit
         APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
-        llvm::computeKnownBits(Op1, KnownZero2, KnownOne2, TD, Depth+1);
+        computeKnownBits(Op1, KnownZero2, KnownOne2, TD, Depth+1, Q);
 
         // If all of the MaskV bits are known to be zero, then we know the
         // output top bits are zero, because we now know that the output is
@@ -75,55 +193,51 @@
 
   unsigned BitWidth = KnownZero.getBitWidth();
 
-  // If one of the operands has trailing zeros, then the bits that the
-  // other operand has in those bit positions will be preserved in the
-  // result. For an add, this works with either operand. For a subtract,
-  // this only works if the known zeros are in the right operand.
+  // If an initial sequence of bits in the result is not needed, the
+  // corresponding bits in the operands are not needed.
   APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
-  llvm::computeKnownBits(Op0, LHSKnownZero, LHSKnownOne, TD, Depth+1);
-  unsigned LHSKnownZeroOut = LHSKnownZero.countTrailingOnes();
+  computeKnownBits(Op0, LHSKnownZero, LHSKnownOne, TD, Depth+1, Q);
+  computeKnownBits(Op1, KnownZero2, KnownOne2, TD, Depth+1, Q);
 
-  llvm::computeKnownBits(Op1, KnownZero2, KnownOne2, TD, Depth+1);
-  unsigned RHSKnownZeroOut = KnownZero2.countTrailingOnes();
-
-  // Determine which operand has more trailing zeros, and use that
-  // many bits from the other operand.
-  if (LHSKnownZeroOut > RHSKnownZeroOut) {
-    if (Add) {
-      APInt Mask = APInt::getLowBitsSet(BitWidth, LHSKnownZeroOut);
-      KnownZero |= KnownZero2 & Mask;
-      KnownOne  |= KnownOne2 & Mask;
-    } else {
-      // If the known zeros are in the left operand for a subtract,
-      // fall back to the minimum known zeros in both operands.
-      KnownZero |= APInt::getLowBitsSet(BitWidth,
-                                        std::min(LHSKnownZeroOut,
-                                                 RHSKnownZeroOut));
-    }
-  } else if (RHSKnownZeroOut >= LHSKnownZeroOut) {
-    APInt Mask = APInt::getLowBitsSet(BitWidth, RHSKnownZeroOut);
-    KnownZero |= LHSKnownZero & Mask;
-    KnownOne  |= LHSKnownOne & Mask;
+  // Carry in a 1 for a subtract, rather than a 0.
+  APInt CarryIn(BitWidth, 0);
+  if (!Add) {
+    // Sum = LHS + ~RHS + 1
+    std::swap(KnownZero2, KnownOne2);
+    CarryIn.setBit(0);
   }
 
+  APInt PossibleSumZero = ~LHSKnownZero + ~KnownZero2 + CarryIn;
+  APInt PossibleSumOne = LHSKnownOne + KnownOne2 + CarryIn;
+
+  // Compute known bits of the carry.
+  APInt CarryKnownZero = ~(PossibleSumZero ^ LHSKnownZero ^ KnownZero2);
+  APInt CarryKnownOne = PossibleSumOne ^ LHSKnownOne ^ KnownOne2;
+
+  // Compute set of known bits (where all three relevant bits are known).
+  APInt LHSKnown = LHSKnownZero | LHSKnownOne;
+  APInt RHSKnown = KnownZero2 | KnownOne2;
+  APInt CarryKnown = CarryKnownZero | CarryKnownOne;
+  APInt Known = LHSKnown & RHSKnown & CarryKnown;
+
+  assert((PossibleSumZero & Known) == (PossibleSumOne & Known) &&
+         "known bits of sum differ");
+
+  // Compute known bits of the result.
+  KnownZero = ~PossibleSumOne & Known;
+  KnownOne = PossibleSumOne & Known;
+
   // Are we still trying to solve for the sign bit?
-  if (!KnownZero.isNegative() && !KnownOne.isNegative()) {
+  if (!Known.isNegative()) {
     if (NSW) {
-      if (Add) {
-        // Adding two positive numbers can't wrap into negative
-        if (LHSKnownZero.isNegative() && KnownZero2.isNegative())
-          KnownZero |= APInt::getSignBit(BitWidth);
-        // and adding two negative numbers can't wrap into positive.
-        else if (LHSKnownOne.isNegative() && KnownOne2.isNegative())
-          KnownOne |= APInt::getSignBit(BitWidth);
-      } else {
-        // Subtracting a negative number from a positive one can't wrap
-        if (LHSKnownZero.isNegative() && KnownOne2.isNegative())
-          KnownZero |= APInt::getSignBit(BitWidth);
-        // neither can subtracting a positive number from a negative one.
-        else if (LHSKnownOne.isNegative() && KnownZero2.isNegative())
-          KnownOne |= APInt::getSignBit(BitWidth);
-      }
+      // Adding two non-negative numbers, or subtracting a negative number from
+      // a non-negative one, can't wrap into negative.
+      if (LHSKnownZero.isNegative() && KnownZero2.isNegative())
+        KnownZero |= APInt::getSignBit(BitWidth);
+      // Adding two negative numbers, or subtracting a non-negative number from
+      // a negative one, can't wrap into non-negative.
+      else if (LHSKnownOne.isNegative() && KnownOne2.isNegative())
+        KnownOne |= APInt::getSignBit(BitWidth);
     }
   }
 }
@@ -131,10 +245,11 @@
 static void computeKnownBitsMul(Value *Op0, Value *Op1, bool NSW,
                                 APInt &KnownZero, APInt &KnownOne,
                                 APInt &KnownZero2, APInt &KnownOne2,
-                                const DataLayout *TD, unsigned Depth) {
+                                const DataLayout *TD, unsigned Depth,
+                                const Query &Q) {
   unsigned BitWidth = KnownZero.getBitWidth();
-  computeKnownBits(Op1, KnownZero, KnownOne, TD, Depth+1);
-  computeKnownBits(Op0, KnownZero2, KnownOne2, TD, Depth+1);
+  computeKnownBits(Op1, KnownZero, KnownOne, TD, Depth+1, Q);
+  computeKnownBits(Op0, KnownZero2, KnownOne2, TD, Depth+1, Q);
 
   bool isKnownNegative = false;
   bool isKnownNonNegative = false;
@@ -155,9 +270,9 @@
       // negative or zero.
       if (!isKnownNonNegative)
         isKnownNegative = (isKnownNegativeOp1 && isKnownNonNegativeOp0 &&
-                           isKnownNonZero(Op0, TD, Depth)) ||
+                           isKnownNonZero(Op0, TD, Depth, Q)) ||
                           (isKnownNegativeOp0 && isKnownNonNegativeOp1 &&
-                           isKnownNonZero(Op1, TD, Depth));
+                           isKnownNonZero(Op1, TD, Depth, Q));
     }
   }
 
@@ -209,6 +324,410 @@
   KnownZero = APInt::getHighBitsSet(BitWidth, MinLeadingZeros);
 }
 
+/// Worklist search starting at I: a value is recorded as ephemeral when every
+/// one of its users has already been recorded as ephemeral (vacuously true for
+/// a value with no users). Returns true as soon as E is found to be such a
+/// value, and false once the worklist is exhausted.
+static bool isEphemeralValueOf(Instruction *I, const Value *E) {
+  SmallVector<const Value *, 16> Pending(1, I);
+  SmallPtrSet<const Value *, 32> Seen;
+  SmallPtrSet<const Value *, 16> Ephemeral;
+
+  while (!Pending.empty()) {
+    const Value *Cur = Pending.pop_back_val();
+    const bool FirstVisit = Seen.insert(Cur).second;
+    if (!FirstVisit)
+      continue;
+
+    // A value qualifies only when each of its users already qualified.
+    bool AllUsersEphemeral = true;
+    for (const User *Usr : Cur->users()) {
+      if (Ephemeral.count(Usr))
+        continue;
+      AllUsersEphemeral = false;
+      break;
+    }
+    if (!AllUsersEphemeral)
+      continue;
+
+    if (Cur == E)
+      return true;
+
+    Ephemeral.insert(Cur);
+
+    // Only operands that are safe to speculatively execute are explored
+    // further.
+    const User *AsUser = dyn_cast<User>(Cur);
+    if (!AsUser)
+      continue;
+    for (User::const_op_iterator Op = AsUser->op_begin(),
+                                 OpEnd = AsUser->op_end();
+         Op != OpEnd; ++Op)
+      if (isSafeToSpeculativelyExecute(*Op))
+        Pending.push_back(*Op);
+  }
+
+  return false;
+}
+
+// Is this an intrinsic that cannot be speculated but also cannot trap?
+// Returns true only when I is a call whose directly called function has one
+// of the intrinsic IDs listed below.
+static bool isAssumeLikeIntrinsic(const Instruction *I) {
+  const CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI)
+    return false;
+
+  const Function *Callee = CI->getCalledFunction();
+  if (!Callee)
+    return false;
+
+  switch (Callee->getIntrinsicID()) {
+  // FIXME: This list is repeated from NoTTI::getIntrinsicCost.
+  case Intrinsic::assume:
+  case Intrinsic::dbg_declare:
+  case Intrinsic::dbg_value:
+  case Intrinsic::invariant_start:
+  case Intrinsic::invariant_end:
+  case Intrinsic::lifetime_start:
+  case Intrinsic::lifetime_end:
+  case Intrinsic::objectsize:
+  case Intrinsic::ptr_annotation:
+  case Intrinsic::var_annotation:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Return true if the assume instruction V may be used to reason about the
+/// context instruction Q.CxtI.
+///
+/// With a dominator tree (Q.DT) the assume is accepted when it dominates the
+/// context. Without one, a limited search is done: the assume is accepted when
+/// its block is the single predecessor of the context's block, or when it
+/// precedes the context inside the same block. In both modes, an assume that
+/// comes after the context in the same block is still accepted provided every
+/// instruction strictly between them is either safe to speculatively execute
+/// or an assume-like intrinsic, and the context is not an ephemeral value of
+/// the assume.
+///
+/// \param V  the assume; must be an Instruction (enforced by cast<>).
+/// \param Q  query carrying the context instruction and optional DT. Q.CxtI
+///           is dereferenced unconditionally, so it must be non-null.
+/// \param DL optional data layout forwarded to isSafeToSpeculativelyExecute.
+static bool isValidAssumeForContext(Value *V, const Query &Q,
+                                    const DataLayout *DL) {
+  Instruction *Inv = cast<Instruction>(V);
+
+  // There are two restrictions on the use of an assume:
+  //  1. The assume must dominate the context (or the control flow must
+  //     reach the assume whenever it reaches the context).
+  //  2. The context must not be in the assume's set of ephemeral values
+  //     (otherwise we will use the assume to prove that the condition
+  //     feeding the assume is trivially true, thus causing the removal of
+  //     the assume).
+
+  if (Q.DT) {
+    if (Q.DT->dominates(Inv, Q.CxtI)) {
+      return true;
+    } else if (Inv->getParent() == Q.CxtI->getParent()) {
+      // Never let an assume justify itself. Besides defeating restriction 2,
+      // the scan below starts just after the context and stops at the assume,
+      // so with Inv == CxtI it would run off the end of the block.
+      if (Inv == Q.CxtI)
+        return false;
+
+      // The context comes first, but they're both in the same block. Make sure
+      // there is nothing in between that might interrupt the control flow.
+      for (BasicBlock::const_iterator I =
+             std::next(BasicBlock::const_iterator(Q.CxtI)),
+                                      IE(Inv); I != IE; ++I)
+        if (!isSafeToSpeculativelyExecute(I, DL) &&
+            !isAssumeLikeIntrinsic(I))
+          return false;
+
+      return !isEphemeralValueOf(Inv, Q.CxtI);
+    }
+
+    return false;
+  }
+
+  // When we don't have a DT, we do a limited search...
+  if (Inv->getParent() == Q.CxtI->getParent()->getSinglePredecessor()) {
+    return true;
+  } else if (Inv->getParent() == Q.CxtI->getParent()) {
+    // Same self-reference guard as in the DT path: the forward search below
+    // cannot find the assume itself, and the second scan would overrun the
+    // block when Inv == CxtI.
+    if (Inv == Q.CxtI)
+      return false;
+
+    // Search forward from the assume until we reach the context (or the end
+    // of the block); the common case is that the assume will come first.
+    for (BasicBlock::iterator I = std::next(BasicBlock::iterator(Inv)),
+         IE = Inv->getParent()->end(); I != IE; ++I)
+      if (I == Q.CxtI)
+        return true;
+
+    // The context must come first...
+    for (BasicBlock::const_iterator I =
+           std::next(BasicBlock::const_iterator(Q.CxtI)),
+                                    IE(Inv); I != IE; ++I)
+      if (!isSafeToSpeculativelyExecute(I, DL) &&
+          !isAssumeLikeIntrinsic(I))
+        return false;
+
+    return !isEphemeralValueOf(Inv, Q.CxtI);
+  }
+
+  return false;
+}
+
+/// Public entry point. Builds a Query that carries only the context
+/// instruction and the dominator tree (no assumption tracker) and forwards to
+/// the file-local overload.
+bool llvm::isValidAssumeForContext(const Instruction *I,
+                                   const Instruction *CxtI,
+                                   const DataLayout *DL,
+                                   const DominatorTree *DT) {
+  const Query Q(nullptr, CxtI, DT);
+  // The file-local overload takes a non-const Value*.
+  Instruction *Assume = const_cast<Instruction*>(I);
+  return ::isValidAssumeForContext(Assume, Q, DL);
+}
+
+/// Matcher for an icmp whose operands satisfy L and R in either order. Pred
+/// is forwarded to both underlying m_ICmp matchers.
+template<typename LHS, typename RHS>
+inline match_combine_or<CmpClass_match<LHS, RHS, ICmpInst, ICmpInst::Predicate>,
+                        CmpClass_match<RHS, LHS, ICmpInst, ICmpInst::Predicate>>
+m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
+  auto AsWritten = m_ICmp(Pred, L, R);
+  auto Swapped = m_ICmp(Pred, R, L);
+  return m_CombineOr(AsWritten, Swapped);
+}
+
+/// Matcher for an 'and' whose operands satisfy L and R in either order.
+template<typename LHS, typename RHS>
+inline match_combine_or<BinaryOp_match<LHS, RHS, Instruction::And>,
+                        BinaryOp_match<RHS, LHS, Instruction::And>>
+m_c_And(const LHS &L, const RHS &R) {
+  auto AsWritten = m_And(L, R);
+  auto Swapped = m_And(R, L);
+  return m_CombineOr(AsWritten, Swapped);
+}
+
+/// Matcher for an 'or' whose operands satisfy L and R in either order.
+template<typename LHS, typename RHS>
+inline match_combine_or<BinaryOp_match<LHS, RHS, Instruction::Or>,
+                        BinaryOp_match<RHS, LHS, Instruction::Or>>
+m_c_Or(const LHS &L, const RHS &R) {
+  auto AsWritten = m_Or(L, R);
+  auto Swapped = m_Or(R, L);
+  return m_CombineOr(AsWritten, Swapped);
+}
+
+/// Matcher for an 'xor' whose operands satisfy L and R in either order.
+template<typename LHS, typename RHS>
+inline match_combine_or<BinaryOp_match<LHS, RHS, Instruction::Xor>,
+                        BinaryOp_match<RHS, LHS, Instruction::Xor>>
+m_c_Xor(const LHS &L, const RHS &R) {
+  auto AsWritten = m_Xor(L, R);
+  auto Swapped = m_Xor(R, L);
+  return m_CombineOr(AsWritten, Swapped);
+}
+
+/// Refine KnownZero/KnownOne for V using the llvm.assume calls that Q.AT
+/// reports for the function containing the context instruction Q.CxtI.
+///
+/// Each assumption (except those in Q.ExclInvs) is matched against a fixed set
+/// of patterns: assume(v) itself, equalities between a value and v combined
+/// with and/or/xor/shl/lshr/ashr (optionally bitwise-negated), and
+/// signed/unsigned orderings of v. Here "v" means V itself, ptrtoint(V) or
+/// bitcast(V). When a pattern matches and the assume is valid for the context,
+/// the bits it implies are OR'ed into KnownZero/KnownOne.
+///
+/// \param V         value being analyzed.
+/// \param KnownZero in/out mask of bits known to be zero; its width defines
+///                  the bit width used for all temporaries.
+/// \param KnownOne  in/out mask of bits known to be one.
+/// \param DL        optional data layout forwarded to recursive queries.
+/// \param Depth     current recursion depth of the caller.
+/// \param Q         query context; nothing is done unless both Q.AT and
+///                  Q.CxtI are set.
+static void computeKnownBitsFromAssume(Value *V, APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const DataLayout *DL,
+                                       unsigned Depth, const Query &Q) {
+  // Use of assumptions is context-sensitive. If we don't have a context, we
+  // cannot use them!
+  if (!Q.AT || !Q.CxtI)
+    return;
+
+  unsigned BitWidth = KnownZero.getBitWidth();
+
+  Function *F = const_cast<Function*>(Q.CxtI->getParent()->getParent());
+  for (auto &CI : Q.AT->assumptions(F)) {
+    CallInst *I = CI;
+    if (Q.ExclInvs.count(I))
+      continue;
+
+    if (match(I, m_Intrinsic<Intrinsic::assume>(m_Specific(V))) &&
+        isValidAssumeForContext(I, Q, DL)) {
+      assert(BitWidth == 1 && "assume operand is not i1?");
+      KnownZero.clearAllBits();
+      KnownOne.setAllBits();
+      return;
+    }
+
+    // The remaining patterns all recurse with Depth+1. A caller may invoke
+    // this function with Depth already at the limit, and recursing from there
+    // would violate computeKnownBits' "Depth <= MaxDepth" assertion.
+    if (Depth == MaxDepth)
+      continue;
+
+    Value *A, *B;
+    auto m_V = m_CombineOr(m_Specific(V),
+                           m_CombineOr(m_PtrToInt(m_Specific(V)),
+                           m_BitCast(m_Specific(V))));
+
+    CmpInst::Predicate Pred;
+    ConstantInt *C;
+    // assume(v = a)
+    if (match(I, m_Intrinsic<Intrinsic::assume>(
+                   m_c_ICmp(Pred, m_V, m_Value(A)))) &&
+        Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      KnownZero |= RHSKnownZero;
+      KnownOne  |= RHSKnownOne;
+    // assume(v & b = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0);
+      computeKnownBits(B, MaskKnownZero, MaskKnownOne, DL, Depth+1, Query(Q, I));
+
+      // For those bits in the mask that are known to be one, we can propagate
+      // known bits from the RHS to V.
+      KnownZero |= RHSKnownZero & MaskKnownOne;
+      KnownOne  |= RHSKnownOne  & MaskKnownOne;
+    // assume(~(v & b) = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))),
+                                m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0);
+      computeKnownBits(B, MaskKnownZero, MaskKnownOne, DL, Depth+1, Query(Q, I));
+
+      // For those bits in the mask that are known to be one, we can propagate
+      // inverted known bits from the RHS to V.
+      KnownZero |= RHSKnownOne  & MaskKnownOne;
+      KnownOne  |= RHSKnownZero & MaskKnownOne;
+    // assume(v | b = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
+      computeKnownBits(B, BKnownZero, BKnownOne, DL, Depth+1, Query(Q, I));
+
+      // For those bits in B that are known to be zero, we can propagate known
+      // bits from the RHS to V.
+      KnownZero |= RHSKnownZero & BKnownZero;
+      KnownOne  |= RHSKnownOne  & BKnownZero;
+    // assume(~(v | b) = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))),
+                                m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
+      computeKnownBits(B, BKnownZero, BKnownOne, DL, Depth+1, Query(Q, I));
+
+      // For those bits in B that are known to be zero, we can propagate
+      // inverted known bits from the RHS to V.
+      KnownZero |= RHSKnownOne  & BKnownZero;
+      KnownOne  |= RHSKnownZero & BKnownZero;
+    // assume(v ^ b = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
+      computeKnownBits(B, BKnownZero, BKnownOne, DL, Depth+1, Query(Q, I));
+
+      // For those bits in B that are known to be zero, we can propagate known
+      // bits from the RHS to V. For those bits in B that are known to be one,
+      // we can propagate inverted known bits from the RHS to V.
+      KnownZero |= RHSKnownZero & BKnownZero;
+      KnownOne  |= RHSKnownOne  & BKnownZero;
+      KnownZero |= RHSKnownOne  & BKnownOne;
+      KnownOne  |= RHSKnownZero & BKnownOne;
+    // assume(~(v ^ b) = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))),
+                                m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
+      computeKnownBits(B, BKnownZero, BKnownOne, DL, Depth+1, Query(Q, I));
+
+      // For those bits in B that are known to be zero, we can propagate
+      // inverted known bits from the RHS to V. For those bits in B that are
+      // known to be one, we can propagate known bits from the RHS to V.
+      KnownZero |= RHSKnownOne  & BKnownZero;
+      KnownOne  |= RHSKnownZero & BKnownZero;
+      KnownZero |= RHSKnownZero & BKnownOne;
+      KnownOne  |= RHSKnownOne  & BKnownOne;
+    // assume(v << c = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)),
+                                      m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      // For those bits in RHS that are known, we can propagate them to known
+      // bits in V shifted to the right by C.
+      KnownZero |= RHSKnownZero.lshr(C->getZExtValue());
+      KnownOne  |= RHSKnownOne.lshr(C->getZExtValue());
+    // assume(~(v << c) = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))),
+                                      m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      // For those bits in RHS that are known, we can propagate them inverted
+      // to known bits in V shifted to the right by C.
+      KnownZero |= RHSKnownOne.lshr(C->getZExtValue());
+      KnownOne  |= RHSKnownZero.lshr(C->getZExtValue());
+    // assume(v >> c = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_CombineOr(m_LShr(m_V, m_ConstantInt(C)),
+                                                  m_AShr(m_V,
+                                                         m_ConstantInt(C))),
+                                     m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      // For those bits in RHS that are known, we can propagate them to known
+      // bits in V shifted to the left by C.
+      KnownZero |= RHSKnownZero << C->getZExtValue();
+      KnownOne  |= RHSKnownOne  << C->getZExtValue();
+    // assume(~(v >> c) = a)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_c_ICmp(Pred, m_Not(m_CombineOr(
+                                              m_LShr(m_V, m_ConstantInt(C)),
+                                              m_AShr(m_V, m_ConstantInt(C)))),
+                                     m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+      // For those bits in RHS that are known, we can propagate them inverted
+      // to known bits in V shifted to the left by C.
+      KnownZero |= RHSKnownOne  << C->getZExtValue();
+      KnownOne  |= RHSKnownZero << C->getZExtValue();
+    // assume(v >=_s c) where c is non-negative
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_SGE &&
+               isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+
+      if (RHSKnownZero.isNegative()) {
+        // We know that the sign bit is zero.
+        KnownZero |= APInt::getSignBit(BitWidth);
+      }
+    // assume(v >_s c) where c is at least -1.
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_SGT &&
+               isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+
+      if (RHSKnownOne.isAllOnesValue() || RHSKnownZero.isNegative()) {
+        // We know that the sign bit is zero.
+        KnownZero |= APInt::getSignBit(BitWidth);
+      }
+    // assume(v <=_s c) where c is negative
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_SLE &&
+               isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+
+      if (RHSKnownOne.isNegative()) {
+        // We know that the sign bit is one.
+        KnownOne |= APInt::getSignBit(BitWidth);
+      }
+    // assume(v <_s c) where c is non-positive
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_SLT &&
+               isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+
+      if (RHSKnownZero.isAllOnesValue() || RHSKnownOne.isNegative()) {
+        // We know that the sign bit is one.
+        KnownOne |= APInt::getSignBit(BitWidth);
+      }
+    // assume(v <=_u c)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_ULE &&
+               isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+
+      // Whatever high bits in c are zero are known to be zero.
+      KnownZero |=
+        APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
+    // assume(v <_u c)
+    } else if (match(I, m_Intrinsic<Intrinsic::assume>(
+                       m_ICmp(Pred, m_V, m_Value(A)))) &&
+               Pred == ICmpInst::ICMP_ULT &&
+               isValidAssumeForContext(I, Q, DL)) {
+      APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+      computeKnownBits(A, RHSKnownZero, RHSKnownOne, DL, Depth+1, Query(Q, I));
+
+      // Whatever high bits in c are zero are known to be zero (if c is a power
+      // of 2, then one more).
+      if (isKnownToBeAPowerOfTwo(A, false, Depth+1, Query(Q, I)))
+        KnownZero |=
+          APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes()+1);
+      else
+        KnownZero |=
+          APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
+    }
+  }
+}
+
 /// Determine which bits of V are known to be either zero or one and return
 /// them in the KnownZero/KnownOne bit sets.
 ///
@@ -224,8 +743,9 @@
 /// where V is a vector, known zero, and known one values are the
 /// same width as the vector element, and the bit is set only if it is true
 /// for all of the elements in the vector.
-void llvm::computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
-                            const DataLayout *TD, unsigned Depth) {
+void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
+                      const DataLayout *TD, unsigned Depth,
+                      const Query &Q) {
   assert(V && "No Value?");
   assert(Depth <= MaxDepth && "Limit Search Depth");
   unsigned BitWidth = KnownZero.getBitWidth();
@@ -270,6 +790,17 @@
     return;
   }
 
+  // A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has
+  // the bits of its aliasee.
+  if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+    if (GA->mayBeOverridden()) {
+      KnownZero.clearAllBits(); KnownOne.clearAllBits();
+    } else {
+      computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, TD, Depth+1, Q);
+    }
+    return;
+  }
+
   // The address of an aligned GlobalValue has trailing zeros.
   if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
     unsigned Align = GV->getAlignment();
@@ -295,25 +826,11 @@
     KnownOne.clearAllBits();
     return;
   }
-  // A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has
-  // the bits of its aliasee.
-  if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
-    if (GA->mayBeOverridden()) {
-      KnownZero.clearAllBits(); KnownOne.clearAllBits();
-    } else {
-      computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, TD, Depth+1);
-    }
-    return;
-  }
 
   if (Argument *A = dyn_cast<Argument>(V)) {
-    unsigned Align = 0;
+    unsigned Align = A->getType()->isPointerTy() ? A->getParamAlignment() : 0;
 
-    if (A->hasByValOrInAllocaAttr()) {
-      // Get alignment information off byval/inalloca arguments if specified in
-      // the IR.
-      Align = A->getParamAlignment();
-    } else if (TD && A->hasStructRetAttr()) {
+    if (!Align && TD && A->hasStructRetAttr()) {
       // An sret parameter has at least the ABI alignment of the return type.
       Type *EltTy = cast<PointerType>(A->getType())->getElementType();
       if (EltTy->isSized())
@@ -322,6 +839,10 @@
 
     if (Align)
       KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
+
+    // Don't give up yet... there might be an assumption that provides more
+    // information...
+    computeKnownBitsFromAssume(V, KnownZero, KnownOne, TD, Depth, Q);
     return;
   }
 
@@ -331,6 +852,9 @@
   if (Depth == MaxDepth)
     return;  // Limit search depth.
 
+  // Check whether a nearby assume intrinsic can determine some known bits.
+  computeKnownBitsFromAssume(V, KnownZero, KnownOne, TD, Depth, Q);
+
   Operator *I = dyn_cast<Operator>(V);
   if (!I) return;
 
@@ -343,8 +867,8 @@
     break;
   case Instruction::And: {
     // If either the LHS or the RHS are Zero, the result is zero.
-    computeKnownBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1);
-    computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+    computeKnownBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1, Q);
+    computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1, Q);
 
     // Output known-1 bits are only known if set in both the LHS & RHS.
     KnownOne &= KnownOne2;
@@ -353,8 +877,8 @@
     break;
   }
   case Instruction::Or: {
-    computeKnownBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1);
-    computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+    computeKnownBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1, Q);
+    computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1, Q);
 
     // Output known-0 bits are only known if clear in both the LHS & RHS.
     KnownZero &= KnownZero2;
@@ -363,8 +887,8 @@
     break;
   }
   case Instruction::Xor: {
-    computeKnownBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1);
-    computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+    computeKnownBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1, Q);
+    computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1, Q);
 
     // Output known-0 bits are known if clear or set in both the LHS & RHS.
     APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
@@ -376,19 +900,20 @@
   case Instruction::Mul: {
     bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
     computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW,
-                         KnownZero, KnownOne, KnownZero2, KnownOne2, TD, Depth);
+                         KnownZero, KnownOne, KnownZero2, KnownOne2, TD,
+                         Depth, Q);
     break;
   }
   case Instruction::UDiv: {
     // For the purposes of computing leading zeros we can conservatively
     // treat a udiv as a logical right shift by the power of 2 known to
     // be less than the denominator.
-    computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+    computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1, Q);
     unsigned LeadZ = KnownZero2.countLeadingOnes();
 
     KnownOne2.clearAllBits();
     KnownZero2.clearAllBits();
-    computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1);
+    computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1, Q);
     unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
     if (RHSUnknownLeadingOnes != BitWidth)
       LeadZ = std::min(BitWidth,
@@ -398,9 +923,8 @@
     break;
   }
   case Instruction::Select:
-    computeKnownBits(I->getOperand(2), KnownZero, KnownOne, TD, Depth+1);
-    computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, TD,
-                      Depth+1);
+    computeKnownBits(I->getOperand(2), KnownZero, KnownOne, TD, Depth+1, Q);
+    computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1, Q);
 
     // Only known if known in both the LHS and RHS.
     KnownOne &= KnownOne2;
@@ -415,6 +939,7 @@
     break; // Can't work with floating point.
   case Instruction::PtrToInt:
   case Instruction::IntToPtr:
+  case Instruction::AddrSpaceCast: // Pointers could be different sizes.
     // We can't handle these if we don't know the pointer size.
     if (!TD) break;
     // FALL THROUGH and handle them the same as zext/trunc.
@@ -435,7 +960,7 @@
     assert(SrcBitWidth && "SrcBitWidth can't be zero");
     KnownZero = KnownZero.zextOrTrunc(SrcBitWidth);
     KnownOne = KnownOne.zextOrTrunc(SrcBitWidth);
-    computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+    computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1, Q);
     KnownZero = KnownZero.zextOrTrunc(BitWidth);
     KnownOne = KnownOne.zextOrTrunc(BitWidth);
     // Any top bits are known to be zero.
@@ -449,7 +974,7 @@
         // TODO: For now, not handling conversions like:
         // (bitcast i64 %x to <2 x i32>)
         !I->getType()->isVectorTy()) {
-      computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+      computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1, Q);
       break;
     }
     break;
@@ -460,7 +985,7 @@
 
     KnownZero = KnownZero.trunc(SrcBitWidth);
     KnownOne = KnownOne.trunc(SrcBitWidth);
-    computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+    computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1, Q);
     KnownZero = KnownZero.zext(BitWidth);
     KnownOne = KnownOne.zext(BitWidth);
 
@@ -476,11 +1001,10 @@
     // (shl X, C1) & C2 == 0   iff   (X & C2 >>u C1) == 0
     if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
       uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
-      computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+      computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1, Q);
       KnownZero <<= ShiftAmt;
       KnownOne  <<= ShiftAmt;
       KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt); // low bits known 0
-      break;
     }
     break;
   case Instruction::LShr:
@@ -490,12 +1014,11 @@
       uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
 
       // Unsigned shift right.
-      computeKnownBits(I->getOperand(0), KnownZero,KnownOne, TD, Depth+1);
+      computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1, Q);
       KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
       KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
       // high bits known zero.
       KnownZero |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
-      break;
     }
     break;
   case Instruction::AShr:
@@ -505,7 +1028,7 @@
       uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
 
       // Signed shift right.
-      computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+      computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1, Q);
       KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
       KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
 
@@ -514,21 +1037,20 @@
         KnownZero |= HighBits;
       else if (KnownOne[BitWidth-ShiftAmt-1])  // New bits are known one.
         KnownOne |= HighBits;
-      break;
     }
     break;
   case Instruction::Sub: {
     bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
     computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW,
                             KnownZero, KnownOne, KnownZero2, KnownOne2, TD,
-                            Depth);
+                            Depth, Q);
     break;
   }
   case Instruction::Add: {
     bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
     computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW,
                             KnownZero, KnownOne, KnownZero2, KnownOne2, TD,
-                            Depth);
+                            Depth, Q);
     break;
   }
   case Instruction::SRem:
@@ -536,7 +1058,8 @@
       APInt RA = Rem->getValue().abs();
       if (RA.isPowerOf2()) {
         APInt LowBits = RA - 1;
-        computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+        computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, TD,
+                         Depth+1, Q);
 
         // The low bits of the first operand are unchanged by the srem.
         KnownZero = KnownZero2 & LowBits;
@@ -561,7 +1084,7 @@
     if (KnownZero.isNonNegative()) {
       APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
       computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, TD,
-                       Depth+1);
+                       Depth+1, Q);
       // If it's known zero, our sign bit is also zero.
       if (LHSKnownZero.isNegative())
         KnownZero.setBit(BitWidth - 1);
@@ -574,7 +1097,7 @@
       if (RA.isPowerOf2()) {
         APInt LowBits = (RA - 1);
         computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD,
-                         Depth+1);
+                         Depth+1, Q);
         KnownZero |= ~LowBits;
         KnownOne &= LowBits;
         break;
@@ -583,8 +1106,8 @@
 
     // Since the result is less than or equal to either operand, any leading
     // zero bits in either operand must also exist in the result.
-    computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
-    computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1);
+    computeKnownBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1, Q);
+    computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1, Q);
 
     unsigned Leaders = std::max(KnownZero.countLeadingOnes(),
                                 KnownZero2.countLeadingOnes());
@@ -608,7 +1131,7 @@
     // to determine if we can prove known low zero bits.
     APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0);
     computeKnownBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, TD,
-                     Depth+1);
+                     Depth+1, Q);
     unsigned TrailZ = LocalKnownZero.countTrailingOnes();
 
     gep_type_iterator GTI = gep_type_begin(I);
@@ -644,7 +1167,7 @@
         unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits();
         uint64_t TypeSize = TD ? TD->getTypeAllocSize(IndexedTy) : 1;
         LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0);
-        computeKnownBits(Index, LocalKnownZero, LocalKnownOne, TD, Depth+1);
+        computeKnownBits(Index, LocalKnownZero, LocalKnownOne, TD, Depth+1, Q);
         TrailZ = std::min(TrailZ,
                           unsigned(countTrailingZeros(TypeSize) +
                                    LocalKnownZero.countTrailingOnes()));
@@ -686,11 +1209,11 @@
             break;
           // Ok, we have a PHI of the form L op= R. Check for low
           // zero bits.
-          computeKnownBits(R, KnownZero2, KnownOne2, TD, Depth+1);
+          computeKnownBits(R, KnownZero2, KnownOne2, TD, Depth+1, Q);
 
           // We need to take the minimum number of known bits
           APInt KnownZero3(KnownZero), KnownOne3(KnownOne);
-          computeKnownBits(L, KnownZero3, KnownOne3, TD, Depth+1);
+          computeKnownBits(L, KnownZero3, KnownOne3, TD, Depth+1, Q);
 
           KnownZero = APInt::getLowBitsSet(BitWidth,
                                            std::min(KnownZero2.countTrailingOnes(),
@@ -722,7 +1245,7 @@
         // Recurse, but cap the recursion to one level, because we don't
         // want to waste time spinning around in loops.
         computeKnownBits(P->getIncomingValue(i), KnownZero2, KnownOne2, TD,
-                         MaxDepth-1);
+                         MaxDepth-1, Q);
         KnownZero &= KnownZero2;
         KnownOne &= KnownOne2;
         // If all bits have been ruled out, there's no need to check
@@ -774,19 +1297,19 @@
         case Intrinsic::sadd_with_overflow:
           computeKnownBitsAddSub(true, II->getArgOperand(0),
                                  II->getArgOperand(1), false, KnownZero,
-                                 KnownOne, KnownZero2, KnownOne2, TD, Depth);
+                                 KnownOne, KnownZero2, KnownOne2, TD, Depth, Q);
           break;
         case Intrinsic::usub_with_overflow:
         case Intrinsic::ssub_with_overflow:
           computeKnownBitsAddSub(false, II->getArgOperand(0),
                                  II->getArgOperand(1), false, KnownZero,
-                                 KnownOne, KnownZero2, KnownOne2, TD, Depth);
+                                 KnownOne, KnownZero2, KnownOne2, TD, Depth, Q);
           break;
         case Intrinsic::umul_with_overflow:
         case Intrinsic::smul_with_overflow:
           computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1),
                               false, KnownZero, KnownOne,
-                              KnownZero2, KnownOne2, TD, Depth);
+                              KnownZero2, KnownOne2, TD, Depth, Q);
           break;
         }
       }
@@ -796,10 +1319,11 @@
   assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
 }
 
-/// ComputeSignBit - Determine whether the sign bit is known to be zero or
-/// one.  Convenience wrapper around computeKnownBits.
-void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
-                          const DataLayout *TD, unsigned Depth) {
+/// Determine whether the sign bit is known to be zero or one.
+/// Convenience wrapper around computeKnownBits.
+void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
+                    const DataLayout *TD, unsigned Depth,
+                    const Query &Q) {
   unsigned BitWidth = getBitWidth(V->getType(), TD);
   if (!BitWidth) {
     KnownZero = false;
@@ -808,16 +1332,17 @@
   }
   APInt ZeroBits(BitWidth, 0);
   APInt OneBits(BitWidth, 0);
-  computeKnownBits(V, ZeroBits, OneBits, TD, Depth);
+  computeKnownBits(V, ZeroBits, OneBits, TD, Depth, Q);
   KnownOne = OneBits[BitWidth - 1];
   KnownZero = ZeroBits[BitWidth - 1];
 }
 
-/// isKnownToBeAPowerOfTwo - Return true if the given value is known to have exactly one
+/// Return true if the given value is known to have exactly one
 /// bit set when defined. For vectors return true if every element is known to
-/// be a power of two when defined.  Supports values with integer or pointer
+/// be a power of two when defined. Supports values with integer or pointer
 /// types and vectors of integers.
-bool llvm::isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth) {
+bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
+                            const Query &Q) {
   if (Constant *C = dyn_cast<Constant>(V)) {
     if (C->isNullValue())
       return OrZero;
@@ -844,19 +1369,20 @@
   // A shift of a power of two is a power of two or zero.
   if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) ||
                  match(V, m_Shr(m_Value(X), m_Value()))))
-    return isKnownToBeAPowerOfTwo(X, /*OrZero*/true, Depth);
+    return isKnownToBeAPowerOfTwo(X, /*OrZero*/true, Depth, Q);
 
   if (ZExtInst *ZI = dyn_cast<ZExtInst>(V))
-    return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth);
+    return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth, Q);
 
   if (SelectInst *SI = dyn_cast<SelectInst>(V))
-    return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth) &&
-      isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth);
+    return
+      isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth, Q) &&
+      isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth, Q);
 
   if (OrZero && match(V, m_And(m_Value(X), m_Value(Y)))) {
     // A power of two and'd with anything is a power of two or zero.
-    if (isKnownToBeAPowerOfTwo(X, /*OrZero*/true, Depth) ||
-        isKnownToBeAPowerOfTwo(Y, /*OrZero*/true, Depth))
+    if (isKnownToBeAPowerOfTwo(X, /*OrZero*/true, Depth, Q) ||
+        isKnownToBeAPowerOfTwo(Y, /*OrZero*/true, Depth, Q))
       return true;
     // X & (-X) is always a power of two or zero.
     if (match(X, m_Neg(m_Specific(Y))) || match(Y, m_Neg(m_Specific(X))))
@@ -871,19 +1397,19 @@
     if (OrZero || VOBO->hasNoUnsignedWrap() || VOBO->hasNoSignedWrap()) {
       if (match(X, m_And(m_Specific(Y), m_Value())) ||
           match(X, m_And(m_Value(), m_Specific(Y))))
-        if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth))
+        if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth, Q))
           return true;
       if (match(Y, m_And(m_Specific(X), m_Value())) ||
           match(Y, m_And(m_Value(), m_Specific(X))))
-        if (isKnownToBeAPowerOfTwo(X, OrZero, Depth))
+        if (isKnownToBeAPowerOfTwo(X, OrZero, Depth, Q))
           return true;
 
       unsigned BitWidth = V->getType()->getScalarSizeInBits();
       APInt LHSZeroBits(BitWidth, 0), LHSOneBits(BitWidth, 0);
-      computeKnownBits(X, LHSZeroBits, LHSOneBits, nullptr, Depth);
+      computeKnownBits(X, LHSZeroBits, LHSOneBits, nullptr, Depth, Q);
 
       APInt RHSZeroBits(BitWidth, 0), RHSOneBits(BitWidth, 0);
-      computeKnownBits(Y, RHSZeroBits, RHSOneBits, nullptr, Depth);
+      computeKnownBits(Y, RHSZeroBits, RHSOneBits, nullptr, Depth, Q);
       // If i8 V is a power of two or zero:
       //  ZeroBits: 1 1 1 0 1 1 1 1
       // ~ZeroBits: 0 0 0 1 0 0 0 0
@@ -900,7 +1426,8 @@
   // copying a sign bit (sdiv int_min, 2).
   if (match(V, m_Exact(m_LShr(m_Value(), m_Value()))) ||
       match(V, m_Exact(m_UDiv(m_Value(), m_Value())))) {
-    return isKnownToBeAPowerOfTwo(cast<Operator>(V)->getOperand(0), OrZero, Depth);
+    return isKnownToBeAPowerOfTwo(cast<Operator>(V)->getOperand(0), OrZero,
+                                  Depth, Q);
   }
 
   return false;
@@ -913,7 +1440,7 @@
 ///
 /// Currently this routine does not support vector GEPs.
 static bool isGEPKnownNonNull(GEPOperator *GEP, const DataLayout *DL,
-                              unsigned Depth) {
+                              unsigned Depth, const Query &Q) {
   if (!GEP->isInBounds() || GEP->getPointerAddressSpace() != 0)
     return false;
 
@@ -922,7 +1449,7 @@
 
   // If the base pointer is non-null, we cannot walk to a null address with an
   // inbounds GEP in address space zero.
-  if (isKnownNonZero(GEP->getPointerOperand(), DL, Depth))
+  if (isKnownNonZero(GEP->getPointerOperand(), DL, Depth, Q))
     return true;
 
   // Past this, if we don't have DataLayout, we can't do much.
@@ -965,18 +1492,36 @@
     if (Depth++ >= MaxDepth)
       continue;
 
-    if (isKnownNonZero(GTI.getOperand(), DL, Depth))
+    if (isKnownNonZero(GTI.getOperand(), DL, Depth, Q))
       return true;
   }
 
   return false;
 }
 
-/// isKnownNonZero - Return true if the given value is known to be non-zero
-/// when defined.  For vectors return true if every element is known to be
-/// non-zero when defined.  Supports values with integer or pointer type and
-/// vectors of integers.
-bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) {
+/// Does the 'Ranges' metadata (which must be a valid MD_range operand list)
+/// ensure that the value it's attached to is never Value?  Each consecutive
+/// operand pair (Lower, Upper) is turned into one ConstantRange and tested.
+static bool rangeMetadataExcludesValue(MDNode* Ranges,
+                                       const APInt& Value) {
+  const unsigned NumRanges = Ranges->getNumOperands() / 2;
+  assert(NumRanges >= 1);
+  for (unsigned i = 0; i < NumRanges; ++i) {
+    ConstantInt *Lower = cast<ConstantInt>(Ranges->getOperand(2*i + 0));
+    ConstantInt *Upper = cast<ConstantInt>(Ranges->getOperand(2*i + 1));
+    ConstantRange Range(Lower->getValue(), Upper->getValue());
+    if (Range.contains(Value))
+      return false; // One of the ranges admits Value, so it is not excluded.
+  }
+  return true; // No range contained Value.
+}
+
+/// Return true if the given value is known to be non-zero when defined.
+/// For vectors return true if every element is known to be non-zero when
+/// defined. Supports values with integer or pointer type and vectors of
+/// integers.
+bool isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth,
+                    const Query &Q) {
   if (Constant *C = dyn_cast<Constant>(V)) {
     if (C->isNullValue())
       return false;
@@ -987,6 +1532,18 @@
     return false;
   }
 
+  if (Instruction* I = dyn_cast<Instruction>(V)) {
+    if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range)) {
+      // If the possible ranges don't contain zero, then the value is
+      // definitely non-zero.
+      if (IntegerType* Ty = dyn_cast<IntegerType>(V->getType())) {
+        const APInt ZeroValue(Ty->getBitWidth(), 0);
+        if (rangeMetadataExcludesValue(Ranges, ZeroValue))
+          return true;
+      }
+    }
+  }
+
   // The remaining tests are all recursive, so bail out if we hit the limit.
   if (Depth++ >= MaxDepth)
     return false;
@@ -996,7 +1553,7 @@
     if (isKnownNonNull(V))
       return true; 
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
-      if (isGEPKnownNonNull(GEP, TD, Depth))
+      if (isGEPKnownNonNull(GEP, TD, Depth, Q))
         return true;
   }
 
@@ -1005,11 +1562,12 @@
   // X | Y != 0 if X != 0 or Y != 0.
   Value *X = nullptr, *Y = nullptr;
   if (match(V, m_Or(m_Value(X), m_Value(Y))))
-    return isKnownNonZero(X, TD, Depth) || isKnownNonZero(Y, TD, Depth);
+    return isKnownNonZero(X, TD, Depth, Q) ||
+           isKnownNonZero(Y, TD, Depth, Q);
 
   // ext X != 0 if X != 0.
   if (isa<SExtInst>(V) || isa<ZExtInst>(V))
-    return isKnownNonZero(cast<Instruction>(V)->getOperand(0), TD, Depth);
+    return isKnownNonZero(cast<Instruction>(V)->getOperand(0), TD, Depth, Q);
 
   // shl X, Y != 0 if X is odd.  Note that the value of the shift is undefined
   // if the lowest bit is shifted off the end.
@@ -1017,11 +1575,11 @@
     // shl nuw can't remove any non-zero bits.
     OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(V);
     if (BO->hasNoUnsignedWrap())
-      return isKnownNonZero(X, TD, Depth);
+      return isKnownNonZero(X, TD, Depth, Q);
 
     APInt KnownZero(BitWidth, 0);
     APInt KnownOne(BitWidth, 0);
-    computeKnownBits(X, KnownZero, KnownOne, TD, Depth);
+    computeKnownBits(X, KnownZero, KnownOne, TD, Depth, Q);
     if (KnownOne[0])
       return true;
   }
@@ -1031,28 +1589,29 @@
     // shr exact can only shift out zero bits.
     PossiblyExactOperator *BO = cast<PossiblyExactOperator>(V);
     if (BO->isExact())
-      return isKnownNonZero(X, TD, Depth);
+      return isKnownNonZero(X, TD, Depth, Q);
 
     bool XKnownNonNegative, XKnownNegative;
-    ComputeSignBit(X, XKnownNonNegative, XKnownNegative, TD, Depth);
+    ComputeSignBit(X, XKnownNonNegative, XKnownNegative, TD, Depth, Q);
     if (XKnownNegative)
       return true;
   }
   // div exact can only produce a zero if the dividend is zero.
   else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) {
-    return isKnownNonZero(X, TD, Depth);
+    return isKnownNonZero(X, TD, Depth, Q);
   }
   // X + Y.
   else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
     bool XKnownNonNegative, XKnownNegative;
     bool YKnownNonNegative, YKnownNegative;
-    ComputeSignBit(X, XKnownNonNegative, XKnownNegative, TD, Depth);
-    ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, TD, Depth);
+    ComputeSignBit(X, XKnownNonNegative, XKnownNegative, TD, Depth, Q);
+    ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, TD, Depth, Q);
 
     // If X and Y are both non-negative (as signed values) then their sum is not
     // zero unless both X and Y are zero.
     if (XKnownNonNegative && YKnownNonNegative)
-      if (isKnownNonZero(X, TD, Depth) || isKnownNonZero(Y, TD, Depth))
+      if (isKnownNonZero(X, TD, Depth, Q) ||
+          isKnownNonZero(Y, TD, Depth, Q))
         return true;
 
     // If X and Y are both negative (as signed values) then their sum is not
@@ -1063,20 +1622,22 @@
       APInt Mask = APInt::getSignedMaxValue(BitWidth);
       // The sign bit of X is set.  If some other bit is set then X is not equal
       // to INT_MIN.
-      computeKnownBits(X, KnownZero, KnownOne, TD, Depth);
+      computeKnownBits(X, KnownZero, KnownOne, TD, Depth, Q);
       if ((KnownOne & Mask) != 0)
         return true;
       // The sign bit of Y is set.  If some other bit is set then Y is not equal
       // to INT_MIN.
-      computeKnownBits(Y, KnownZero, KnownOne, TD, Depth);
+      computeKnownBits(Y, KnownZero, KnownOne, TD, Depth, Q);
       if ((KnownOne & Mask) != 0)
         return true;
     }
 
     // The sum of a non-negative number and a power of two is not zero.
-    if (XKnownNonNegative && isKnownToBeAPowerOfTwo(Y, /*OrZero*/false, Depth))
+    if (XKnownNonNegative &&
+        isKnownToBeAPowerOfTwo(Y, /*OrZero*/false, Depth, Q))
       return true;
-    if (YKnownNonNegative && isKnownToBeAPowerOfTwo(X, /*OrZero*/false, Depth))
+    if (YKnownNonNegative &&
+        isKnownToBeAPowerOfTwo(X, /*OrZero*/false, Depth, Q))
       return true;
   }
   // X * Y.
@@ -1085,51 +1646,53 @@
     // If X and Y are non-zero then so is X * Y as long as the multiplication
     // does not overflow.
     if ((BO->hasNoSignedWrap() || BO->hasNoUnsignedWrap()) &&
-        isKnownNonZero(X, TD, Depth) && isKnownNonZero(Y, TD, Depth))
+        isKnownNonZero(X, TD, Depth, Q) &&
+        isKnownNonZero(Y, TD, Depth, Q))
       return true;
   }
   // (C ? X : Y) != 0 if X != 0 and Y != 0.
   else if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
-    if (isKnownNonZero(SI->getTrueValue(), TD, Depth) &&
-        isKnownNonZero(SI->getFalseValue(), TD, Depth))
+    if (isKnownNonZero(SI->getTrueValue(), TD, Depth, Q) &&
+        isKnownNonZero(SI->getFalseValue(), TD, Depth, Q))
       return true;
   }
 
   if (!BitWidth) return false;
   APInt KnownZero(BitWidth, 0);
   APInt KnownOne(BitWidth, 0);
-  computeKnownBits(V, KnownZero, KnownOne, TD, Depth);
+  computeKnownBits(V, KnownZero, KnownOne, TD, Depth, Q);
   return KnownOne != 0;
 }
 
-/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero.  We use
-/// this predicate to simplify operations downstream.  Mask is known to be zero
-/// for bits that V cannot have.
+/// Return true if 'V & Mask' is known to be zero.  We use this predicate to
+/// simplify operations downstream. Mask is known to be zero for bits that V
+/// cannot have.
 ///
 /// This function is defined on values with integer type, values with pointer
 /// type (but only if TD is non-null), and vectors of integers.  In the case
 /// where V is a vector, the mask, known zero, and known one values are the
 /// same width as the vector element, and the bit is set only if it is true
 /// for all of the elements in the vector.
-bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask,
-                             const DataLayout *TD, unsigned Depth) {
+bool MaskedValueIsZero(Value *V, const APInt &Mask,
+                       const DataLayout *TD, unsigned Depth,
+                       const Query &Q) {
   APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0);
-  computeKnownBits(V, KnownZero, KnownOne, TD, Depth);
+  computeKnownBits(V, KnownZero, KnownOne, TD, Depth, Q);
   return (KnownZero & Mask) == Mask;
 }
 
 
 
-/// ComputeNumSignBits - Return the number of times the sign bit of the
-/// register is replicated into the other bits.  We know that at least 1 bit
-/// is always equal to the sign bit (itself), but other cases can give us
-/// information.  For example, immediately after an "ashr X, 2", we know that
-/// the top 3 bits are all equal to each other, so we return 3.
+/// Return the number of times the sign bit of the register is replicated into
+/// the other bits. We know that at least 1 bit is always equal to the sign bit
+/// (itself), but other cases can give us information. For example, immediately
+/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
+/// other, so we return 3.
 ///
 /// 'Op' must have a scalar integer type.
 ///
-unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD,
-                                  unsigned Depth) {
+unsigned ComputeNumSignBits(Value *V, const DataLayout *TD,
+                            unsigned Depth, const Query &Q) {
   assert((TD || V->getType()->isIntOrIntVectorTy()) &&
          "ComputeNumSignBits requires a DataLayout object to operate "
          "on non-integer values!");
@@ -1150,10 +1713,10 @@
   default: break;
   case Instruction::SExt:
     Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits();
-    return ComputeNumSignBits(U->getOperand(0), TD, Depth+1) + Tmp;
+    return ComputeNumSignBits(U->getOperand(0), TD, Depth+1, Q) + Tmp;
 
   case Instruction::AShr: {
-    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1, Q);
     // ashr X, C   -> adds C sign bits.  Vectors too.
     const APInt *ShAmt;
     if (match(U->getOperand(1), m_APInt(ShAmt))) {
@@ -1166,7 +1729,7 @@
     const APInt *ShAmt;
     if (match(U->getOperand(1), m_APInt(ShAmt))) {
       // shl destroys sign bits.
-      Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+      Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1, Q);
       Tmp2 = ShAmt->getZExtValue();
       if (Tmp2 >= TyBits ||      // Bad shift.
           Tmp2 >= Tmp) break;    // Shifted all sign bits out.
@@ -1178,9 +1741,9 @@
   case Instruction::Or:
   case Instruction::Xor:    // NOT is handled here.
     // Logical binary ops preserve the number of sign bits at the worst.
-    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1, Q);
     if (Tmp != 1) {
-      Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+      Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1, Q);
       FirstAnswer = std::min(Tmp, Tmp2);
       // We computed what we know about the sign bits as our first
       // answer. Now proceed to the generic code that uses
@@ -1189,22 +1752,22 @@
     break;
 
   case Instruction::Select:
-    Tmp = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+    Tmp = ComputeNumSignBits(U->getOperand(1), TD, Depth+1, Q);
     if (Tmp == 1) return 1;  // Early out.
-    Tmp2 = ComputeNumSignBits(U->getOperand(2), TD, Depth+1);
+    Tmp2 = ComputeNumSignBits(U->getOperand(2), TD, Depth+1, Q);
     return std::min(Tmp, Tmp2);
 
   case Instruction::Add:
     // Add can have at most one carry bit.  Thus we know that the output
     // is, at worst, one more bit than the inputs.
-    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1, Q);
     if (Tmp == 1) return 1;  // Early out.
 
     // Special case decrementing a value (ADD X, -1):
     if (ConstantInt *CRHS = dyn_cast<ConstantInt>(U->getOperand(1)))
       if (CRHS->isAllOnesValue()) {
         APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
-        computeKnownBits(U->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+        computeKnownBits(U->getOperand(0), KnownZero, KnownOne, TD, Depth+1, Q);
 
         // If the input is known to be 0 or 1, the output is 0/-1, which is all
         // sign bits set.
@@ -1217,19 +1780,19 @@
           return Tmp;
       }
 
-    Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+    Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1, Q);
     if (Tmp2 == 1) return 1;
     return std::min(Tmp, Tmp2)-1;
 
   case Instruction::Sub:
-    Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+    Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1, Q);
     if (Tmp2 == 1) return 1;
 
     // Handle NEG.
     if (ConstantInt *CLHS = dyn_cast<ConstantInt>(U->getOperand(0)))
       if (CLHS->isNullValue()) {
         APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
-        computeKnownBits(U->getOperand(1), KnownZero, KnownOne, TD, Depth+1);
+        computeKnownBits(U->getOperand(1), KnownZero, KnownOne, TD, Depth+1, Q);
         // If the input is known to be 0 or 1, the output is 0/-1, which is all
         // sign bits set.
         if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue())
@@ -1245,7 +1808,7 @@
 
     // Sub can have at most one carry bit.  Thus we know that the output
     // is, at worst, one more bit than the inputs.
-    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1, Q);
     if (Tmp == 1) return 1;  // Early out.
     return std::min(Tmp, Tmp2)-1;
 
@@ -1256,11 +1819,12 @@
 
     // Take the minimum of all incoming values.  This can't infinitely loop
     // because of our depth threshold.
-    Tmp = ComputeNumSignBits(PN->getIncomingValue(0), TD, Depth+1);
+    Tmp = ComputeNumSignBits(PN->getIncomingValue(0), TD, Depth+1, Q);
     for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
       if (Tmp == 1) return Tmp;
       Tmp = std::min(Tmp,
-                     ComputeNumSignBits(PN->getIncomingValue(i), TD, Depth+1));
+                     ComputeNumSignBits(PN->getIncomingValue(i), TD,
+                                        Depth+1, Q));
     }
     return Tmp;
   }
@@ -1275,7 +1839,7 @@
   // use this information.
   APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
   APInt Mask;
-  computeKnownBits(V, KnownZero, KnownOne, TD, Depth);
+  computeKnownBits(V, KnownZero, KnownOne, TD, Depth, Q);
 
   if (KnownZero.isNegative()) {        // sign bit is 0
     Mask = KnownZero;
@@ -1295,9 +1859,9 @@
   return std::max(FirstAnswer, std::min(TyBits, Mask.countLeadingZeros()));
 }
 
-/// ComputeMultiple - This function computes the integer multiple of Base that
-/// equals V.  If successful, it returns true and returns the multiple in
-/// Multiple.  If unsuccessful, it returns false. It looks
+/// This function computes the integer multiple of Base that equals V.
+/// If successful, it returns true and returns the multiple in
+/// Multiple. If unsuccessful, it returns false. It looks
 /// through SExt instructions only if LookThroughSExt is true.
 bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple,
                            bool LookThroughSExt, unsigned Depth) {
@@ -1415,8 +1979,8 @@
   return false;
 }
 
-/// CannotBeNegativeZero - Return true if we can prove that the specified FP
-/// value is never equal to -0.0.
+/// Return true if we can prove that the specified FP value is never equal to
+/// -0.0.
 ///
 /// NOTE: this function will need to be revisited when we support non-default
 /// rounding modes!
@@ -1469,8 +2033,8 @@
   return false;
 }
 
-/// isBytewiseValue - If the specified value can be set by repeating the same
-/// byte in memory, return the i8 value that it is represented with.  This is
+/// If the specified value can be set by repeating the same byte in memory,
+/// return the i8 value that it is represented with.  This is
 /// true for all i8 values obviously, but is also true for i32 0, i32 -1,
 /// i16 0xF0F0, double 0.0 etc.  If the value can't be handled with a repeated
 /// byte store (e.g. i16 0x1234), return null.
@@ -1618,7 +2182,7 @@
   return BuildSubAggregate(From, To, IndexedType, Idxs, IdxSkip, InsertBefore);
 }
 
-/// FindInsertedValue - Given an aggregrate and an sequence of indices, see if
+/// Given an aggregate and a sequence of indices, see if
 /// the scalar value indexed is already around as a register, for example if it
 /// were inserted directly into the aggregrate.
 ///
@@ -1708,9 +2272,8 @@
   return nullptr;
 }
 
-/// GetPointerBaseWithConstantOffset - Analyze the specified pointer to see if
-/// it can be expressed as a base pointer plus a constant offset.  Return the
-/// base and offset to the caller.
+/// Analyze the specified pointer to see if it can be expressed as a base
+/// pointer plus a constant offset. Return the base and offset to the caller.
 Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
                                               const DataLayout *DL) {
   // Without DataLayout, conservatively assume 64-bit offsets, which is
@@ -1731,7 +2294,8 @@
       }
 
       Ptr = GEP->getPointerOperand();
-    } else if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+    } else if (Operator::getOpcode(Ptr) == Instruction::BitCast ||
+               Operator::getOpcode(Ptr) == Instruction::AddrSpaceCast) {
       Ptr = cast<Operator>(Ptr)->getOperand(0);
     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
       if (GA->mayBeOverridden())
@@ -1746,9 +2310,9 @@
 }
 
 
-/// getConstantStringInfo - This function computes the length of a
-/// null-terminated C string pointed to by V.  If successful, it returns true
-/// and returns the string in Str.  If unsuccessful, it returns false.
+/// This function computes the length of a null-terminated C string pointed to
+/// by V. If successful, it returns true and returns the string in Str.
+/// If unsuccessful, it returns false.
 bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
                                  uint64_t Offset, bool TrimAtNul) {
   assert(V);
@@ -1832,16 +2396,16 @@
 // nodes.
 // TODO: See if we can integrate these two together.
 
-/// GetStringLengthH - If we can compute the length of the string pointed to by
+/// If we can compute the length of the string pointed to by
 /// the specified pointer, return 'len+1'.  If we can't, return 0.
-static uint64_t GetStringLengthH(Value *V, SmallPtrSet<PHINode*, 32> &PHIs) {
+static uint64_t GetStringLengthH(Value *V, SmallPtrSetImpl<PHINode*> &PHIs) {
   // Look through noop bitcast instructions.
   V = V->stripPointerCasts();
 
   // If this is a PHI node, there are two cases: either we have already seen it
   // or we haven't.
   if (PHINode *PN = dyn_cast<PHINode>(V)) {
-    if (!PHIs.insert(PN))
+    if (!PHIs.insert(PN).second)
       return ~0ULL;  // already in the set.
 
     // If it was new, see if all the input strings are the same length.
@@ -1881,7 +2445,7 @@
   return StrData.size()+1;
 }
 
-/// GetStringLength - If we can compute the length of the string pointed to by
+/// If we can compute the length of the string pointed to by
 /// the specified pointer, return 'len+1'.  If we can't, return 0.
 uint64_t llvm::GetStringLength(Value *V) {
   if (!V->getType()->isPointerTy()) return 0;
@@ -1900,7 +2464,8 @@
   for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) {
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
       V = GEP->getPointerOperand();
-    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+    } else if (Operator::getOpcode(V) == Instruction::BitCast ||
+               Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
       V = cast<Operator>(V)->getOperand(0);
     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
       if (GA->mayBeOverridden())
@@ -1909,7 +2474,7 @@
     } else {
       // See if InstructionSimplify knows any relevant tricks.
       if (Instruction *I = dyn_cast<Instruction>(V))
-        // TODO: Acquire a DominatorTree and use it.
+        // TODO: Acquire a DominatorTree and AssumptionTracker and use them.
         if (Value *Simplified = SimplifyInstruction(I, TD, nullptr)) {
           V = Simplified;
           continue;
@@ -1934,7 +2499,7 @@
     Value *P = Worklist.pop_back_val();
     P = GetUnderlyingObject(P, TD, MaxLookup);
 
-    if (!Visited.insert(P))
+    if (!Visited.insert(P).second)
       continue;
 
     if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
@@ -1953,9 +2518,7 @@
   } while (!Worklist.empty());
 }
 
-/// onlyUsedByLifetimeMarkers - Return true if the only users of this pointer
-/// are lifetime markers.
-///
+/// Return true if the only users of this pointer are lifetime markers.
 bool llvm::onlyUsedByLifetimeMarkers(const Value *V) {
   for (const User *U : V->users()) {
     const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
@@ -1983,23 +2546,31 @@
   default:
     return true;
   case Instruction::UDiv:
-  case Instruction::URem:
-    // x / y is undefined if y == 0, but calculations like x / 3 are safe.
-    return isKnownNonZero(Inst->getOperand(1), TD);
+  case Instruction::URem: {
+    // x / y is undefined if y == 0.
+    const APInt *V;
+    if (match(Inst->getOperand(1), m_APInt(V)))
+      return *V != 0;
+    return false;
+  }
   case Instruction::SDiv:
   case Instruction::SRem: {
-    Value *Op = Inst->getOperand(1);
-    // x / y is undefined if y == 0
-    if (!isKnownNonZero(Op, TD))
-      return false;
-    // x / y might be undefined if y == -1
-    unsigned BitWidth = getBitWidth(Op->getType(), TD);
-    if (BitWidth == 0)
-      return false;
-    APInt KnownZero(BitWidth, 0);
-    APInt KnownOne(BitWidth, 0);
-    computeKnownBits(Op, KnownZero, KnownOne, TD);
-    return !!KnownZero;
+    // x / y is undefined if y == 0 or x == INT_MIN and y == -1
+    const APInt *X, *Y;
+    if (match(Inst->getOperand(1), m_APInt(Y))) {
+      if (*Y != 0) {
+        if (Y->isAllOnesValue()) { // Width-independent test for y == -1.
+          // With a -1 denominator only a MinSignedValue numerator overflows.
+          if (match(Inst->getOperand(0), m_APInt(X)))
+            return !X->isMinSignedValue();
+          // The numerator *might* be MinSignedValue.
+          return false;
+        }
+        // The denominator is not 0 or -1, it's safe to proceed.
+        return true;
+      }
+    }
+    return false;
   }
   case Instruction::Load: {
     const LoadInst *LI = cast<LoadInst>(Inst);
@@ -2010,41 +2581,44 @@
     return LI->getPointerOperand()->isDereferenceablePointer(TD);
   }
   case Instruction::Call: {
-   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
-     switch (II->getIntrinsicID()) {
-       // These synthetic intrinsics have no side-effects and just mark
-       // information about their operands.
-       // FIXME: There are other no-op synthetic instructions that potentially
-       // should be considered at least *safe* to speculate...
-       case Intrinsic::dbg_declare:
-       case Intrinsic::dbg_value:
-         return true;
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+      switch (II->getIntrinsicID()) {
+      // These synthetic intrinsics have no side-effects and just mark
+      // information about their operands.
+      // FIXME: There are other no-op synthetic instructions that potentially
+      // should be considered at least *safe* to speculate...
+      case Intrinsic::dbg_declare:
+      case Intrinsic::dbg_value:
+        return true;
 
-       case Intrinsic::bswap:
-       case Intrinsic::ctlz:
-       case Intrinsic::ctpop:
-       case Intrinsic::cttz:
-       case Intrinsic::objectsize:
-       case Intrinsic::sadd_with_overflow:
-       case Intrinsic::smul_with_overflow:
-       case Intrinsic::ssub_with_overflow:
-       case Intrinsic::uadd_with_overflow:
-       case Intrinsic::umul_with_overflow:
-       case Intrinsic::usub_with_overflow:
-         return true;
-       // Sqrt should be OK, since the llvm sqrt intrinsic isn't defined to set
-       // errno like libm sqrt would.
-       case Intrinsic::sqrt:
-       case Intrinsic::fma:
-       case Intrinsic::fmuladd:
-         return true;
-       // TODO: some fp intrinsics are marked as having the same error handling
-       // as libm. They're safe to speculate when they won't error.
-       // TODO: are convert_{from,to}_fp16 safe?
-       // TODO: can we list target-specific intrinsics here?
-       default: break;
-     }
-   }
+      case Intrinsic::bswap:
+      case Intrinsic::ctlz:
+      case Intrinsic::ctpop:
+      case Intrinsic::cttz:
+      case Intrinsic::objectsize:
+      case Intrinsic::sadd_with_overflow:
+      case Intrinsic::smul_with_overflow:
+      case Intrinsic::ssub_with_overflow:
+      case Intrinsic::uadd_with_overflow:
+      case Intrinsic::umul_with_overflow:
+      case Intrinsic::usub_with_overflow:
+        return true;
+      // Sqrt should be OK, since the llvm sqrt intrinsic isn't defined to set
+      // errno like libm sqrt would.
+      case Intrinsic::sqrt:
+      case Intrinsic::fma:
+      case Intrinsic::fmuladd:
+      case Intrinsic::fabs:
+      case Intrinsic::minnum:
+      case Intrinsic::maxnum:
+        return true;
+      // TODO: some fp intrinsics are marked as having the same error handling
+      // as libm. They're safe to speculate when they won't error.
+      // TODO: are convert_{from,to}_fp16 safe?
+      // TODO: can we list target-specific intrinsics here?
+      default: break;
+      }
+    }
     return false; // The called function could have undefined behavior or
                   // side-effects, even if marked readnone nounwind.
   }
@@ -2067,8 +2641,7 @@
   }
 }
 
-/// isKnownNonNull - Return true if we know that the specified value is never
-/// null.
+/// Return true if we know that the specified value is never null.
 bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) {
   // Alloca never returns null, malloc might.
   if (isa<AllocaInst>(V)) return true;
@@ -2081,8 +2654,12 @@
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
     return !GV->hasExternalWeakLinkage();
 
+  // A load tagged with nonnull metadata is never null.
+  if (const LoadInst *LI = dyn_cast<LoadInst>(V))
+    return LI->getMetadata(LLVMContext::MD_nonnull);
+
   if (ImmutableCallSite CS = V)
-    if (CS.paramHasAttr(0, Attribute::NonNull))
+    if (CS.isReturnNonNull())
       return true;
 
   // operator new never returns null.

diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 1e5bcdd..6523bce 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp

@@ -161,14 +161,10 @@
 // Lexer definition.
 //===----------------------------------------------------------------------===//
 
-LLLexer::LLLexer(MemoryBuffer *StartBuf, SourceMgr &sm, SMDiagnostic &Err,
+LLLexer::LLLexer(StringRef StartBuf, SourceMgr &sm, SMDiagnostic &Err,
                  LLVMContext &C)
   : CurBuf(StartBuf), ErrorInfo(Err), SM(sm), Context(C), APFloatVal(0.0) {
-  CurPtr = CurBuf->getBufferStart();
-}
-
-std::string LLLexer::getFilename() const {
-  return CurBuf->getBufferIdentifier();
+  CurPtr = CurBuf.begin();
 }
 
 int LLLexer::getNextChar() {
@@ -178,7 +174,7 @@
   case 0:
     // A nul character in the stream is either the end of the current buffer or
     // a random nul in the file.  Disambiguate that here.
-    if (CurPtr-1 != CurBuf->getBufferEnd())
+    if (CurPtr-1 != CurBuf.end())
       return 0;  // Just whitespace.
 
     // Otherwise, return end of file.
@@ -516,8 +512,6 @@
 
   KEYWORD(private);
   KEYWORD(internal);
-  KEYWORD(linker_private);        // NOTE: deprecated, for parser compatibility
-  KEYWORD(linker_private_weak);   // NOTE: deprecated, for parser compatibility
   KEYWORD(available_externally);
   KEYWORD(linkonce);
   KEYWORD(linkonce_odr);
@@ -586,6 +580,7 @@
   KEYWORD(x86_stdcallcc);
   KEYWORD(x86_fastcallcc);
   KEYWORD(x86_thiscallcc);
+  KEYWORD(x86_vectorcallcc);
   KEYWORD(arm_apcscc);
   KEYWORD(arm_aapcscc);
   KEYWORD(arm_aapcs_vfpcc);
@@ -612,6 +607,7 @@
   KEYWORD(byval);
   KEYWORD(inalloca);
   KEYWORD(cold);
+  KEYWORD(dereferenceable);
   KEYWORD(inlinehint);
   KEYWORD(inreg);
   KEYWORD(jumptable);
@@ -669,6 +665,10 @@
   KEYWORD(x);
   KEYWORD(blockaddress);
 
+  // Use-list order directives.
+  KEYWORD(uselistorder);
+  KEYWORD(uselistorder_bb);
+
   KEYWORD(personality);
   KEYWORD(cleanup);
   KEYWORD(catch);

diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h
index d42de57..219827f 100644
--- a/lib/AsmParser/LLLexer.h
+++ b/lib/AsmParser/LLLexer.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LIB_ASMPARSER_LLLEXER_H
-#define LIB_ASMPARSER_LLLEXER_H
+#ifndef LLVM_LIB_ASMPARSER_LLLEXER_H
+#define LLVM_LIB_ASMPARSER_LLLEXER_H
 
 #include "LLToken.h"
 #include "llvm/ADT/APFloat.h"
@@ -28,7 +28,7 @@
 
   class LLLexer {
     const char *CurPtr;
-    MemoryBuffer *CurBuf;
+    StringRef CurBuf;
     SMDiagnostic &ErrorInfo;
     SourceMgr &SM;
     LLVMContext &Context;
@@ -43,7 +43,7 @@
     APSInt  APSIntVal;
 
   public:
-    explicit LLLexer(MemoryBuffer *StartBuf, SourceMgr &SM, SMDiagnostic &,
+    explicit LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &,
                      LLVMContext &C);
     ~LLLexer() {}
 
@@ -67,8 +67,6 @@
     void Warning(LocTy WarningLoc, const Twine &Msg) const;
     void Warning(const Twine &Msg) const { return Warning(getLoc(), Msg); }
 
-    std::string getFilename() const;
-
   private:
     lltok::Kind LexToken();
 

diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index be55ac6..2c835f9 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp

@@ -24,6 +24,7 @@
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -129,28 +130,11 @@
     }
   }
 
-  // If there are entries in ForwardRefBlockAddresses at this point, they are
-  // references after the function was defined.  Resolve those now.
-  while (!ForwardRefBlockAddresses.empty()) {
-    // Okay, we are referencing an already-parsed function, resolve them now.
-    Function *TheFn = nullptr;
-    const ValID &Fn = ForwardRefBlockAddresses.begin()->first;
-    if (Fn.Kind == ValID::t_GlobalName)
-      TheFn = M->getFunction(Fn.StrVal);
-    else if (Fn.UIntVal < NumberedVals.size())
-      TheFn = dyn_cast<Function>(NumberedVals[Fn.UIntVal]);
-
-    if (!TheFn)
-      return Error(Fn.Loc, "unknown function referenced by blockaddress");
-
-    // Resolve all these references.
-    if (ResolveForwardRefBlockAddresses(TheFn,
-                                      ForwardRefBlockAddresses.begin()->second,
-                                        nullptr))
-      return true;
-
-    ForwardRefBlockAddresses.erase(ForwardRefBlockAddresses.begin());
-  }
+  // If there are entries in ForwardRefBlockAddresses at this point, the
+  // function was never defined.
+  if (!ForwardRefBlockAddresses.empty())
+    return Error(ForwardRefBlockAddresses.begin()->first.Loc,
+                 "expected function name in blockaddress");
 
   for (unsigned i = 0, e = NumberedTypes.size(); i != e; ++i)
     if (NumberedTypes[i].second.isValid())
@@ -193,38 +177,6 @@
   return false;
 }
 
-bool LLParser::ResolveForwardRefBlockAddresses(Function *TheFn,
-                             std::vector<std::pair<ValID, GlobalValue*> > &Refs,
-                                               PerFunctionState *PFS) {
-  // Loop over all the references, resolving them.
-  for (unsigned i = 0, e = Refs.size(); i != e; ++i) {
-    BasicBlock *Res;
-    if (PFS) {
-      if (Refs[i].first.Kind == ValID::t_LocalName)
-        Res = PFS->GetBB(Refs[i].first.StrVal, Refs[i].first.Loc);
-      else
-        Res = PFS->GetBB(Refs[i].first.UIntVal, Refs[i].first.Loc);
-    } else if (Refs[i].first.Kind == ValID::t_LocalID) {
-      return Error(Refs[i].first.Loc,
-       "cannot take address of numeric label after the function is defined");
-    } else {
-      Res = dyn_cast_or_null<BasicBlock>(
-                     TheFn->getValueSymbolTable().lookup(Refs[i].first.StrVal));
-    }
-
-    if (!Res)
-      return Error(Refs[i].first.Loc,
-                   "referenced value is not a basic block");
-
-    // Get the BlockAddress for this and update references to use it.
-    BlockAddress *BA = BlockAddress::get(TheFn, Res);
-    Refs[i].second->replaceAllUsesWith(BA);
-    Refs[i].second->eraseFromParent();
-  }
-  return false;
-}
-
-
 //===----------------------------------------------------------------------===//
 // Top-Level Entities
 //===----------------------------------------------------------------------===//
@@ -254,8 +206,6 @@
     //               ('constant'|'global') ...
     case lltok::kw_private:             // OptionalLinkage
     case lltok::kw_internal:            // OptionalLinkage
-    case lltok::kw_linker_private:      // Obsolete OptionalLinkage
-    case lltok::kw_linker_private_weak: // Obsolete OptionalLinkage
     case lltok::kw_weak:                // OptionalLinkage
     case lltok::kw_weak_odr:            // OptionalLinkage
     case lltok::kw_linkonce:            // OptionalLinkage
@@ -289,6 +239,9 @@
     }
 
     case lltok::kw_attributes: if (ParseUnnamedAttrGrp()) return true; break;
+    case lltok::kw_uselistorder: if (ParseUseListOrder()) return true; break;
+    case lltok::kw_uselistorder_bb:
+                                 if (ParseUseListOrderBB()) return true; break;
     }
   }
 }
@@ -483,10 +436,10 @@
       parseOptionalUnnamedAddr(UnnamedAddr))
     return true;
 
-  if (HasLinkage || Lex.getKind() != lltok::kw_alias)
+  if (Lex.getKind() != lltok::kw_alias)
     return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility,
                        DLLStorageClass, TLM, UnnamedAddr);
-  return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass, TLM,
+  return ParseAlias(Name, NameLoc, Linkage, Visibility, DLLStorageClass, TLM,
                     UnnamedAddr);
 }
 
@@ -512,10 +465,11 @@
       parseOptionalUnnamedAddr(UnnamedAddr))
     return true;
 
-  if (HasLinkage || Lex.getKind() != lltok::kw_alias)
+  if (Lex.getKind() != lltok::kw_alias)
     return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility,
                        DLLStorageClass, TLM, UnnamedAddr);
-  return ParseAlias(Name, NameLoc, Visibility, DLLStorageClass, TLM,
+
+  return ParseAlias(Name, NameLoc, Linkage, Visibility, DLLStorageClass, TLM,
                     UnnamedAddr);
 }
 
@@ -693,33 +647,29 @@
 }
 
 /// ParseAlias:
-///   ::= GlobalVar '=' OptionalVisibility OptionalDLLStorageClass
-///                     OptionalThreadLocal OptionalUnNammedAddr 'alias'
-///                     OptionalLinkage Aliasee
+///   ::= GlobalVar '=' OptionalLinkage OptionalVisibility
+///                     OptionalDLLStorageClass OptionalThreadLocal
+///                     OptionalUnNammedAddr 'alias' Aliasee
 ///
 /// Aliasee
 ///   ::= TypeAndValue
 ///
 /// Everything through OptionalUnNammedAddr has already been parsed.
 ///
-bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
+bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, unsigned L,
                           unsigned Visibility, unsigned DLLStorageClass,
                           GlobalVariable::ThreadLocalMode TLM,
                           bool UnnamedAddr) {
   assert(Lex.getKind() == lltok::kw_alias);
   Lex.Lex();
-  LocTy LinkageLoc = Lex.getLoc();
-  unsigned L;
-  if (ParseOptionalLinkage(L))
-    return true;
 
   GlobalValue::LinkageTypes Linkage = (GlobalValue::LinkageTypes) L;
 
   if(!GlobalAlias::isValidLinkage(Linkage))
-    return Error(LinkageLoc, "invalid linkage type for alias");
+    return Error(NameLoc, "invalid linkage type for alias");
 
   if (!isValidVisibilityForLinkage(Visibility, L))
-    return Error(LinkageLoc,
+    return Error(NameLoc,
                  "symbol with local linkage must have default visibility");
 
   Constant *Aliasee;
@@ -1052,6 +1002,7 @@
               "invalid use of attribute on a function");
       break;
     case lltok::kw_byval:
+    case lltok::kw_dereferenceable:
     case lltok::kw_inalloca:
     case lltok::kw_nest:
     case lltok::kw_noalias:
@@ -1212,6 +1163,16 @@
   return false;
 }
 
+/// ParseUInt64 - Parse an unsigned integer token into Val.
+///   ::= uint64
+bool LLParser::ParseUInt64(uint64_t &Val) {
+  if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned())
+    return TokError("expected integer"); // Not an unsigned integer token.
+  Val = Lex.getAPSIntVal().getLimitedValue(); // presumably saturates; confirm
+  Lex.Lex(); // Consume the integer token.
+  return false;
+}
+
 /// ParseTLSModel
 ///   := 'localdynamic'
 ///   := 'initialexec'
@@ -1284,6 +1245,13 @@
       continue;
     }
     case lltok::kw_byval:           B.addAttribute(Attribute::ByVal); break;
+    case lltok::kw_dereferenceable: {
+      uint64_t Bytes;
+      if (ParseOptionalDereferenceableBytes(Bytes))
+        return true;
+      B.addDereferenceableAttr(Bytes);
+      continue;
+    }
     case lltok::kw_inalloca:        B.addAttribute(Attribute::InAlloca); break;
     case lltok::kw_inreg:           B.addAttribute(Attribute::InReg); break;
     case lltok::kw_nest:            B.addAttribute(Attribute::Nest); break;
@@ -1341,6 +1309,13 @@
     switch (Token) {
     default:  // End of attributes.
       return HaveError;
+    case lltok::kw_dereferenceable: {
+      uint64_t Bytes;
+      if (ParseOptionalDereferenceableBytes(Bytes))
+        return true;
+      B.addDereferenceableAttr(Bytes);
+      continue;
+    }
     case lltok::kw_inreg:           B.addAttribute(Attribute::InReg); break;
     case lltok::kw_noalias:         B.addAttribute(Attribute::NoAlias); break;
     case lltok::kw_nonnull:         B.addAttribute(Attribute::NonNull); break;
@@ -1409,10 +1384,6 @@
 ///   ::= 'common'
 ///   ::= 'extern_weak'
 ///   ::= 'external'
-///
-///   Deprecated Values:
-///     ::= 'linker_private'
-///     ::= 'linker_private_weak'
 bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) {
   HasLinkage = false;
   switch (Lex.getKind()) {
@@ -1430,15 +1401,6 @@
   case lltok::kw_common:         Res = GlobalValue::CommonLinkage;        break;
   case lltok::kw_extern_weak:    Res = GlobalValue::ExternalWeakLinkage;  break;
   case lltok::kw_external:       Res = GlobalValue::ExternalLinkage;      break;
-
-  case lltok::kw_linker_private:
-  case lltok::kw_linker_private_weak:
-    Lex.Warning("'" + Lex.getStrVal() + "' is deprecated, treating as"
-                " PrivateLinkage");
-    Lex.Lex();
-    // treat linker_private and linker_private_weak as PrivateLinkage
-    Res = GlobalValue::PrivateLinkage;
-    return false;
   }
   Lex.Lex();
   HasLinkage = true;
@@ -1486,6 +1448,7 @@
 ///   ::= 'x86_stdcallcc'
 ///   ::= 'x86_fastcallcc'
 ///   ::= 'x86_thiscallcc'
+///   ::= 'x86_vectorcallcc'
 ///   ::= 'arm_apcscc'
 ///   ::= 'arm_aapcscc'
 ///   ::= 'arm_aapcs_vfpcc'
@@ -1502,7 +1465,7 @@
 ///   ::= 'preserve_allcc'
 ///   ::= 'cc' UINT
 ///
-bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
+bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
   switch (Lex.getKind()) {
   default:                       CC = CallingConv::C; return false;
   case lltok::kw_ccc:            CC = CallingConv::C; break;
@@ -1511,6 +1474,7 @@
   case lltok::kw_x86_stdcallcc:  CC = CallingConv::X86_StdCall; break;
   case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break;
   case lltok::kw_x86_thiscallcc: CC = CallingConv::X86_ThisCall; break;
+  case lltok::kw_x86_vectorcallcc:CC = CallingConv::X86_VectorCall; break;
   case lltok::kw_arm_apcscc:     CC = CallingConv::ARM_APCS; break;
   case lltok::kw_arm_aapcscc:    CC = CallingConv::ARM_AAPCS; break;
   case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;
@@ -1527,12 +1491,8 @@
   case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break;
   case lltok::kw_preserve_allcc: CC = CallingConv::PreserveAll; break;
   case lltok::kw_cc: {
-      unsigned ArbitraryCC;
       Lex.Lex();
-      if (ParseUInt32(ArbitraryCC))
-        return true;
-      CC = static_cast<CallingConv::ID>(ArbitraryCC);
-      return false;
+      return ParseUInt32(CC);
     }
   }
 
@@ -1606,6 +1566,26 @@
   return false;
 }
 
+/// ParseOptionalDereferenceableBytes
+///   ::= /* empty */                        -- Bytes is set to 0
+///   ::= 'dereferenceable' '(' 4 ')'        -- Bytes is the parenthesized count
+bool LLParser::ParseOptionalDereferenceableBytes(uint64_t &Bytes) {
+  Bytes = 0; // Result when the attribute is absent.
+  if (!EatIfPresent(lltok::kw_dereferenceable))
+    return false;
+  LocTy ParenLoc = Lex.getLoc();
+  if (!EatIfPresent(lltok::lparen))
+    return Error(ParenLoc, "expected '('");
+  LocTy DerefLoc = Lex.getLoc(); // Location of the byte count token.
+  if (ParseUInt64(Bytes)) return true;
+  ParenLoc = Lex.getLoc();
+  if (!EatIfPresent(lltok::rparen))
+    return Error(ParenLoc, "expected ')'");
+  if (!Bytes) // A present attribute must name at least one byte.
+    return Error(DerefLoc, "dereferenceable bytes must be non-zero");
+  return false;
+}
+
 /// ParseOptionalCommaAlign
 ///   ::=
 ///   ::= ',' align 4
@@ -1837,7 +1817,8 @@
 ///  Arg
 ///    ::= Type OptionalAttributes Value OptionalAttributes
 bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
-                                  PerFunctionState &PFS) {
+                                  PerFunctionState &PFS, bool IsMustTailCall,
+                                  bool InVarArgsFunc) {
   if (ParseToken(lltok::lparen, "expected '(' in call"))
     return true;
 
@@ -1848,6 +1829,17 @@
         ParseToken(lltok::comma, "expected ',' in argument list"))
       return true;
 
+    // Parse an ellipsis if this is a musttail call in a variadic function.
+    if (Lex.getKind() == lltok::dotdotdot) {
+      const char *Msg = "unexpected ellipsis in argument list for ";
+      if (!IsMustTailCall)
+        return TokError(Twine(Msg) + "non-musttail call");
+      if (!InVarArgsFunc)
+        return TokError(Twine(Msg) + "musttail call in non-varargs function");
+      Lex.Lex();  // Lex the '...', it is purely for readability.
+      return ParseToken(lltok::rparen, "expected ')' at end of argument list");
+    }
+
     // Parse the argument.
     LocTy ArgLoc;
     Type *ArgTy = nullptr;
@@ -1864,6 +1856,10 @@
                                                              ArgAttrs)));
   }
 
+  if (IsMustTailCall && InVarArgsFunc)
+    return TokError("expected '...' at end of argument list for musttail call "
+                    "in varargs function");
+
   Lex.Lex();  // Lex the ')'.
   return false;
 }
@@ -2159,28 +2155,6 @@
 }
 
 bool LLParser::PerFunctionState::FinishFunction() {
-  // Check to see if someone took the address of labels in this block.
-  if (!P.ForwardRefBlockAddresses.empty()) {
-    ValID FunctionID;
-    if (!F.getName().empty()) {
-      FunctionID.Kind = ValID::t_GlobalName;
-      FunctionID.StrVal = F.getName();
-    } else {
-      FunctionID.Kind = ValID::t_GlobalID;
-      FunctionID.UIntVal = FunctionNumber;
-    }
-
-    std::map<ValID, std::vector<std::pair<ValID, GlobalValue*> > >::iterator
-      FRBAI = P.ForwardRefBlockAddresses.find(FunctionID);
-    if (FRBAI != P.ForwardRefBlockAddresses.end()) {
-      // Resolve all these references.
-      if (P.ResolveForwardRefBlockAddresses(&F, FRBAI->second, this))
-        return true;
-
-      P.ForwardRefBlockAddresses.erase(FRBAI);
-    }
-  }
-
   if (!ForwardRefVals.empty())
     return P.Error(ForwardRefVals.begin()->second.second,
                    "use of undefined value '%" + ForwardRefVals.begin()->first +
@@ -2222,7 +2196,7 @@
   }
 
   // Don't make placeholders with invalid type.
-  if (!Ty->isFirstClassType() && !Ty->isLabelTy()) {
+  if (!Ty->isFirstClassType()) {
     P.Error(Loc, "invalid use of a non-first-class type");
     return nullptr;
   }
@@ -2263,7 +2237,7 @@
     return nullptr;
   }
 
-  if (!Ty->isFirstClassType() && !Ty->isLabelTy()) {
+  if (!Ty->isFirstClassType()) {
     P.Error(Loc, "invalid use of a non-first-class type");
     return nullptr;
   }
@@ -2566,12 +2540,56 @@
     if (Label.Kind != ValID::t_LocalID && Label.Kind != ValID::t_LocalName)
       return Error(Label.Loc, "expected basic block name in blockaddress");
 
-    // Make a global variable as a placeholder for this reference.
-    GlobalVariable *FwdRef = new GlobalVariable(*M, Type::getInt8Ty(Context),
-                                           false, GlobalValue::InternalLinkage,
-                                                nullptr, "");
-    ForwardRefBlockAddresses[Fn].push_back(std::make_pair(Label, FwdRef));
-    ID.ConstantVal = FwdRef;
+    // Try to find the function (but skip it if it's forward-referenced).
+    GlobalValue *GV = nullptr;
+    if (Fn.Kind == ValID::t_GlobalID) {
+      if (Fn.UIntVal < NumberedVals.size())
+        GV = NumberedVals[Fn.UIntVal];
+    } else if (!ForwardRefVals.count(Fn.StrVal)) {
+      GV = M->getNamedValue(Fn.StrVal);
+    }
+    Function *F = nullptr;
+    if (GV) {
+      // Confirm that it's actually a function with a definition.
+      if (!isa<Function>(GV))
+        return Error(Fn.Loc, "expected function name in blockaddress");
+      F = cast<Function>(GV);
+      if (F->isDeclaration())
+        return Error(Fn.Loc, "cannot take blockaddress inside a declaration");
+    }
+
+    if (!F) {
+      // Make a global variable as a placeholder for this reference.
+      GlobalValue *&FwdRef = ForwardRefBlockAddresses[Fn][Label];
+      if (!FwdRef)
+        FwdRef = new GlobalVariable(*M, Type::getInt8Ty(Context), false,
+                                    GlobalValue::InternalLinkage, nullptr, "");
+      ID.ConstantVal = FwdRef;
+      ID.Kind = ValID::t_Constant;
+      return false;
+    }
+
+    // We found the function; now find the basic block.  Don't use PFS, since we
+    // might be inside a constant expression.
+    BasicBlock *BB;
+    if (BlockAddressPFS && F == &BlockAddressPFS->getFunction()) {
+      if (Label.Kind == ValID::t_LocalID)
+        BB = BlockAddressPFS->GetBB(Label.UIntVal, Label.Loc);
+      else
+        BB = BlockAddressPFS->GetBB(Label.StrVal, Label.Loc);
+      if (!BB)
+        return Error(Label.Loc, "referenced value is not a basic block");
+    } else {
+      if (Label.Kind == ValID::t_LocalID)
+        return Error(Label.Loc, "cannot take address of numeric label after "
+                                "the function is defined");
+      BB = dyn_cast_or_null<BasicBlock>(
+          F->getValueSymbolTable().lookup(Label.StrVal));
+      if (!BB)
+        return Error(Label.Loc, "referenced value is not a basic block");
+    }
+
+    ID.ConstantVal = BlockAddress::get(F, BB);
     ID.Kind = ValID::t_Constant;
     return false;
   }
@@ -2886,7 +2904,7 @@
 /// ParseGlobalValueVector
 ///   ::= /*empty*/
 ///   ::= TypeAndValue (',' TypeAndValue)*
-bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts) {
+bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts) {
   // Empty list.
   if (Lex.getKind() == lltok::rbrace ||
       Lex.getKind() == lltok::rsquare ||
@@ -3111,7 +3129,7 @@
   unsigned Visibility;
   unsigned DLLStorageClass;
   AttrBuilder RetAttrs;
-  CallingConv::ID CC;
+  unsigned CC;
   Type *RetType = nullptr;
   LocTy RetTypeLoc = Lex.getLoc();
   if (ParseOptionalLinkage(Linkage) ||
@@ -3314,13 +3332,63 @@
                    ArgList[i].Name + "'");
   }
 
+  if (isDefine)
+    return false;
+
+  // Check the declaration has no block address forward references.
+  ValID ID;
+  if (FunctionName.empty()) {
+    ID.Kind = ValID::t_GlobalID;
+    ID.UIntVal = NumberedVals.size() - 1;
+  } else {
+    ID.Kind = ValID::t_GlobalName;
+    ID.StrVal = FunctionName;
+  }
+  auto Blocks = ForwardRefBlockAddresses.find(ID);
+  if (Blocks != ForwardRefBlockAddresses.end())
+    return Error(Blocks->first.Loc,
+                 "cannot take blockaddress inside a declaration");
   return false;
 }
 
+bool LLParser::PerFunctionState::resolveForwardRefBlockAddresses() {
+  ValID ID; // Key of this function's pending blockaddress placeholders.
+  if (FunctionNumber == -1) {
+    ID.Kind = ValID::t_GlobalName;
+    ID.StrVal = F.getName();
+  } else {
+    ID.Kind = ValID::t_GlobalID;
+    ID.UIntVal = FunctionNumber;
+  }
+
+  auto Blocks = P.ForwardRefBlockAddresses.find(ID);
+  if (Blocks == P.ForwardRefBlockAddresses.end())
+    return false; // No pending blockaddress references to this function.
+
+  for (const auto &I : Blocks->second) {
+    const ValID &BBID = I.first;
+    GlobalValue *GV = I.second; // Placeholder standing in for the address.
+
+    assert((BBID.Kind == ValID::t_LocalID || BBID.Kind == ValID::t_LocalName) &&
+           "Expected local id or name");
+    BasicBlock *BB;
+    if (BBID.Kind == ValID::t_LocalName)
+      BB = GetBB(BBID.StrVal, BBID.Loc);
+    else
+      BB = GetBB(BBID.UIntVal, BBID.Loc);
+    if (!BB)
+      return P.Error(BBID.Loc, "referenced value is not a basic block");
+
+    GV->replaceAllUsesWith(BlockAddress::get(&F, BB));
+    GV->eraseFromParent(); // The placeholder is no longer referenced.
+  }
+
+  P.ForwardRefBlockAddresses.erase(Blocks); // Every entry was resolved above.
+  return false;
+}
 
 /// ParseFunctionBody
-///   ::= '{' BasicBlock+ '}'
-///
+///   ::= '{' BasicBlock+ UseListOrderDirective* '}'
 bool LLParser::ParseFunctionBody(Function &Fn) {
   if (Lex.getKind() != lltok::lbrace)
     return TokError("expected '{' in function body");
@@ -3331,13 +3399,24 @@
 
   PerFunctionState PFS(*this, Fn, FunctionNumber);
 
+  // Resolve block addresses and allow basic blocks to be forward-declared
+  // within this function.
+  if (PFS.resolveForwardRefBlockAddresses())
+    return true;
+  SaveAndRestore<PerFunctionState *> ScopeExit(BlockAddressPFS, &PFS);
+
   // We need at least one basic block.
-  if (Lex.getKind() == lltok::rbrace)
+  if (Lex.getKind() == lltok::rbrace || Lex.getKind() == lltok::kw_uselistorder)
     return TokError("function body requires at least one basic block");
 
-  while (Lex.getKind() != lltok::rbrace)
+  while (Lex.getKind() != lltok::rbrace &&
+         Lex.getKind() != lltok::kw_uselistorder)
     if (ParseBasicBlock(PFS)) return true;
 
+  while (Lex.getKind() != lltok::rbrace)
+    if (ParseUseListOrder(&PFS))
+      return true;
+
   // Eat the }.
   Lex.Lex();
 
@@ -3656,7 +3735,7 @@
         ParseTypeAndBasicBlock(DestBB, PFS))
       return true;
 
-    if (!SeenCases.insert(Constant))
+    if (!SeenCases.insert(Constant).second)
       return Error(CondLoc, "duplicate case value in switch");
     if (!isa<ConstantInt>(Constant))
       return Error(CondLoc, "case value is not a constant integer");
@@ -3722,7 +3801,7 @@
   AttrBuilder RetAttrs, FnAttrs;
   std::vector<unsigned> FwdRefAttrGrps;
   LocTy NoBuiltinLoc;
-  CallingConv::ID CC;
+  unsigned CC;
   Type *RetType = nullptr;
   LocTy RetTypeLoc;
   ValID CalleeID;
@@ -4136,7 +4215,7 @@
   AttrBuilder RetAttrs, FnAttrs;
   std::vector<unsigned> FwdRefAttrGrps;
   LocTy BuiltinLoc;
-  CallingConv::ID CC;
+  unsigned CC;
   Type *RetType = nullptr;
   LocTy RetTypeLoc;
   ValID CalleeID;
@@ -4149,7 +4228,8 @@
       ParseOptionalReturnAttrs(RetAttrs) ||
       ParseType(RetType, RetTypeLoc, true /*void allowed*/) ||
       ParseValID(CalleeID) ||
-      ParseParameterList(ArgList, PFS) ||
+      ParseParameterList(ArgList, PFS, TCK == CallInst::TCK_MustTail,
+                         PFS.getFunction().isVarArg()) ||
       ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false,
                                  BuiltinLoc))
     return true;
@@ -4596,3 +4676,135 @@
 
   return false;
 }
+
+//===----------------------------------------------------------------------===//
+// Use-list order directives.
+//===----------------------------------------------------------------------===//
+bool LLParser::sortUseListOrder(Value *V, ArrayRef<unsigned> Indexes,
+                                SMLoc Loc) {
+  // Sort V's use-list; the I-th use currently listed gets key Indexes[I].
+  if (V->use_empty())
+    return Error(Loc, "value has no uses");
+  unsigned Seen = 0;
+  SmallDenseMap<const Use *, unsigned, 16> Rank;
+  for (const Use &U : V->uses()) {
+    if (++Seen > Indexes.size())
+      break; // More uses than indexes; diagnosed below.
+    Rank[&U] = Indexes[Seen - 1];
+  }
+  if (Seen < 2)
+    return Error(Loc, "value only has one use");
+  if (Seen > Indexes.size() || Rank.size() != Indexes.size())
+    return Error(Loc, "wrong number of indexes, expected " +
+                          Twine(std::distance(V->use_begin(), V->use_end())));
+  auto ByRank = [&](const Use &L, const Use &R) {
+    return Rank.lookup(&L) < Rank.lookup(&R);
+  };
+  V->sortUseList(ByRank);
+  return false;
+}
+
+/// ParseUseListOrderIndexes
+///   ::= '{' uint32 (',' uint32)+ '}'
+/// Fills Indexes with a non-identity permutation of [0, size).  Any other
+/// list is reported as a parse error.
+bool LLParser::ParseUseListOrderIndexes(SmallVectorImpl<unsigned> &Indexes) {
+  SMLoc Loc = Lex.getLoc();
+  if (ParseToken(lltok::lbrace, "expected '{' here"))
+    return true;
+  if (Lex.getKind() == lltok::rbrace)
+    return Lex.Error("expected non-empty list of uselistorder indexes");
+
+  // IsOrdered stays true only while each index equals its own position.
+  bool IsOrdered = true;
+  assert(Indexes.empty() && "Expected empty order vector");
+  do {
+    unsigned Index;
+    if (ParseUInt32(Index))
+      return true;
+    IsOrdered &= Index == Indexes.size();
+    Indexes.push_back(Index);
+  } while (EatIfPresent(lltok::comma));
+
+  if (ParseToken(lltok::rbrace, "expected '}' here"))
+    return true;
+
+  if (Indexes.size() < 2)
+    return Error(Loc, "expected >= 2 uselistorder indexes");
+  // The indexes must form a permutation of [0, size).  Comparing only the sum
+  // and the maximum would accept duplicates such as { 0, 0, 3, 3 }, so mark
+  // each index as it is seen and reject out-of-range or repeated values.
+  SmallVector<bool, 16> Seen(Indexes.size(), false);
+  for (unsigned Index : Indexes) {
+    if (Index >= Seen.size() || Seen[Index])
+      return Error(Loc, "expected distinct uselistorder indexes in range [0, size)");
+    Seen[Index] = true;
+  }
+  if (IsOrdered)
+    return Error(Loc, "expected uselistorder indexes to change the order");
+
+  return false;
+}
+
+/// ParseUseListOrder
+///   ::= 'uselistorder' Type Value ',' UseListOrderIndexes
+bool LLParser::ParseUseListOrder(PerFunctionState *PFS) {
+  // The location of the directive keyword is used for later diagnostics.
+  SMLoc DirectiveLoc = Lex.getLoc();
+  if (ParseToken(lltok::kw_uselistorder, "expected uselistorder directive"))
+    return true;
+  Value *Target;
+  SmallVector<unsigned, 16> Order;
+  if (ParseTypeAndValue(Target, PFS))
+    return true;
+  if (ParseToken(lltok::comma, "expected comma in uselistorder directive"))
+    return true;
+  return ParseUseListOrderIndexes(Order) ||
+         sortUseListOrder(Target, Order, DirectiveLoc);
+}
+
+/// ParseUseListOrderBB
+///   ::= 'uselistorder_bb' @foo ',' %bar ',' UseListOrderIndexes
+bool LLParser::ParseUseListOrderBB() {
+  assert(Lex.getKind() == lltok::kw_uselistorder_bb);
+  SMLoc DirectiveLoc = Lex.getLoc();
+  Lex.Lex(); // Consume the 'uselistorder_bb' keyword.
+
+  ValID FnID, BBID;
+  SmallVector<unsigned, 16> Order;
+  if (ParseValID(FnID) ||
+      ParseToken(lltok::comma, "expected comma in uselistorder_bb directive") ||
+      ParseValID(BBID) ||
+      ParseToken(lltok::comma, "expected comma in uselistorder_bb directive") ||
+      ParseUseListOrderIndexes(Order))
+    return true;
+
+  // Resolve the function operand, referenced either by name or by number.
+  if (FnID.Kind != ValID::t_GlobalName && FnID.Kind != ValID::t_GlobalID)
+    return Error(FnID.Loc, "expected function name in uselistorder_bb");
+  GlobalValue *Global = nullptr;
+  if (FnID.Kind == ValID::t_GlobalName)
+    Global = M->getNamedValue(FnID.StrVal);
+  else if (FnID.UIntVal < NumberedVals.size())
+    Global = NumberedVals[FnID.UIntVal];
+  if (!Global)
+    return Error(FnID.Loc, "invalid function forward reference in uselistorder_bb");
+  auto *Func = dyn_cast<Function>(Global);
+  if (!Func)
+    return Error(FnID.Loc, "expected function name in uselistorder_bb");
+  if (Func->isDeclaration())
+    return Error(FnID.Loc, "invalid declaration in uselistorder_bb");
+
+  // Resolve the label operand; only a named block of that function is valid.
+  if (BBID.Kind == ValID::t_LocalID)
+    return Error(BBID.Loc, "invalid numeric label in uselistorder_bb");
+  if (BBID.Kind != ValID::t_LocalName)
+    return Error(BBID.Loc, "expected basic block name in uselistorder_bb");
+  Value *Block = Func->getValueSymbolTable().lookup(BBID.StrVal);
+  if (!Block)
+    return Error(BBID.Loc, "invalid basic block in uselistorder_bb");
+  if (!isa<BasicBlock>(Block))
+    return Error(BBID.Loc, "expected basic block in uselistorder_bb");
+
+  return sortUseListOrder(Block, Order, DirectiveLoc);
+}

diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 2efb260..aa62bcc 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ASMPARSER_LLPARSER_H
-#define LLVM_ASMPARSER_LLPARSER_H
+#ifndef LLVM_LIB_ASMPARSER_LLPARSER_H
+#define LLVM_LIB_ASMPARSER_LLPARSER_H
 
 #include "LLLexer.h"
 #include "llvm/ADT/DenseMap.h"
@@ -128,17 +128,21 @@
 
     // References to blockaddress.  The key is the function ValID, the value is
     // a list of references to blocks in that function.
-    std::map<ValID, std::vector<std::pair<ValID, GlobalValue*> > >
-      ForwardRefBlockAddresses;
+    std::map<ValID, std::map<ValID, GlobalValue *>> ForwardRefBlockAddresses;
+    class PerFunctionState;
+    /// Reference to per-function state to allow basic blocks to be
+    /// forward-referenced by blockaddress instructions within the same
+    /// function.
+    PerFunctionState *BlockAddressPFS;
 
     // Attribute builder reference information.
     std::map<Value*, std::vector<unsigned> > ForwardRefAttrGroups;
     std::map<unsigned, AttrBuilder> NumberedAttrBuilders;
 
   public:
-    LLParser(MemoryBuffer *F, SourceMgr &SM, SMDiagnostic &Err, Module *m) :
-      Context(m->getContext()), Lex(F, SM, Err, m->getContext()),
-      M(m) {}
+    LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *m)
+        : Context(m->getContext()), Lex(F, SM, Err, m->getContext()), M(m),
+          BlockAddressPFS(nullptr) {}
     bool Run();
 
     LLVMContext &getContext() { return Context; }
@@ -202,6 +206,11 @@
       Loc = Lex.getLoc();
       return ParseUInt32(Val);
     }
+    bool ParseUInt64(uint64_t &Val);
+    bool ParseUInt64(uint64_t &Val, LocTy &Loc) {
+      Loc = Lex.getLoc();
+      return ParseUInt64(Val);
+    }
 
     bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM);
     bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM);
@@ -217,8 +226,9 @@
     }
     bool ParseOptionalVisibility(unsigned &Visibility);
     bool ParseOptionalDLLStorageClass(unsigned &DLLStorageClass);
-    bool ParseOptionalCallingConv(CallingConv::ID &CC);
+    bool ParseOptionalCallingConv(unsigned &CC);
     bool ParseOptionalAlignment(unsigned &Alignment);
+    bool ParseOptionalDereferenceableBytes(uint64_t &Bytes);
     bool ParseScopeAndOrdering(bool isAtomic, SynchronizationScope &Scope,
                                AtomicOrdering &Ordering);
     bool ParseOrdering(AtomicOrdering &Ordering);
@@ -252,8 +262,8 @@
                      bool HasLinkage, unsigned Visibility,
                      unsigned DLLStorageClass,
                      GlobalVariable::ThreadLocalMode TLM, bool UnnamedAddr);
-    bool ParseAlias(const std::string &Name, LocTy Loc, unsigned Visibility,
-                    unsigned DLLStorageClass,
+    bool ParseAlias(const std::string &Name, LocTy Loc, unsigned Linkage,
+                    unsigned Visibility, unsigned DLLStorageClass,
                     GlobalVariable::ThreadLocalMode TLM, bool UnnamedAddr);
     bool parseComdat();
     bool ParseStandaloneMetadata();
@@ -321,6 +331,8 @@
       /// unnamed.  If there is an error, this returns null otherwise it returns
       /// the block being defined.
       BasicBlock *DefineBB(const std::string &Name, LocTy Loc);
+
+      bool resolveForwardRefBlockAddresses();
     };
 
     bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
@@ -360,13 +372,15 @@
         : Loc(loc), V(v), Attrs(attrs) {}
     };
     bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
-                            PerFunctionState &PFS);
+                            PerFunctionState &PFS,
+                            bool IsMustTailCall = false,
+                            bool InVarArgsFunc = false);
 
     // Constant Parsing.
     bool ParseValID(ValID &ID, PerFunctionState *PFS = nullptr);
     bool ParseGlobalValue(Type *Ty, Constant *&V);
     bool ParseGlobalTypeAndValue(Constant *&V);
-    bool ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts);
+    bool ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts);
     bool parseOptionalComdat(Comdat *&C);
     bool ParseMetadataListValue(ValID &ID, PerFunctionState *PFS);
     bool ParseMetadataValue(ValID &ID, PerFunctionState *PFS);
@@ -427,9 +441,11 @@
     int ParseExtractValue(Instruction *&I, PerFunctionState &PFS);
     int ParseInsertValue(Instruction *&I, PerFunctionState &PFS);
 
-    bool ResolveForwardRefBlockAddresses(Function *TheFn,
-                             std::vector<std::pair<ValID, GlobalValue*> > &Refs,
-                                         PerFunctionState *PFS);
+    // Use-list order directives.
+    bool ParseUseListOrder(PerFunctionState *PFS = nullptr);
+    bool ParseUseListOrderBB();
+    bool ParseUseListOrderIndexes(SmallVectorImpl<unsigned> &Indexes);
+    bool sortUseListOrder(Value *V, ArrayRef<unsigned> Indexes, SMLoc Loc);
   };
 } // End llvm namespace
 

diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 534d824..f9821f7 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LIBS_ASMPARSER_LLTOKEN_H
-#define LIBS_ASMPARSER_LLTOKEN_H
+#ifndef LLVM_LIB_ASMPARSER_LLTOKEN_H
+#define LLVM_LIB_ASMPARSER_LLTOKEN_H
 
 namespace llvm {
 namespace lltok {
@@ -39,8 +39,6 @@
 
     kw_private,
     kw_internal,
-    kw_linker_private,          // NOTE: deprecated, for parser compatibility
-    kw_linker_private_weak,     // NOTE: deprecated, for parser compatibility
     kw_linkonce, kw_linkonce_odr,
     kw_weak, // Used as a linkage, and a modifier for "cmpxchg".
     kw_weak_odr, kw_appending,
@@ -89,7 +87,7 @@
 
     kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
     kw_intel_ocl_bicc,
-    kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
+    kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, kw_x86_vectorcallcc,
     kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
     kw_msp430_intrcc,
     kw_ptx_kernel, kw_ptx_device,
@@ -106,6 +104,7 @@
     kw_byval,
     kw_inalloca,
     kw_cold,
+    kw_dereferenceable,
     kw_inlinehint,
     kw_inreg,
     kw_jumptable,
@@ -181,6 +180,9 @@
     kw_extractelement, kw_insertelement, kw_shufflevector,
     kw_extractvalue, kw_insertvalue, kw_blockaddress,
 
+    // Use-list order directives.
+    kw_uselistorder, kw_uselistorder_bb,
+
     // Unsigned Valued tokens (UIntVal).
     GlobalID,          // @42
     LocalVarID,        // %42

diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp
index 91bb51c..0815907 100644
--- a/lib/AsmParser/Parser.cpp
+++ b/lib/AsmParser/Parser.cpp

@@ -21,26 +21,29 @@
 #include <system_error>
 using namespace llvm;
 
-Module *llvm::ParseAssembly(MemoryBuffer *F,
-                            Module *M,
-                            SMDiagnostic &Err,
-                            LLVMContext &Context) {
+bool llvm::parseAssemblyInto(MemoryBufferRef F, Module &M, SMDiagnostic &Err) {
   SourceMgr SM;
-  SM.AddNewSourceBuffer(F, SMLoc());
+  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(F, false);
+  SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
 
-  // If we are parsing into an existing module, do it.
-  if (M)
-    return LLParser(F, SM, Err, M).Run() ? nullptr : M;
-
-  // Otherwise create a new module.
-  std::unique_ptr<Module> M2(new Module(F->getBufferIdentifier(), Context));
-  if (LLParser(F, SM, Err, M2.get()).Run())
-    return nullptr;
-  return M2.release();
+  return LLParser(F.getBuffer(), SM, Err, &M).Run();
 }
 
-Module *llvm::ParseAssemblyFile(const std::string &Filename, SMDiagnostic &Err,
-                                LLVMContext &Context) {
+std::unique_ptr<Module> llvm::parseAssembly(MemoryBufferRef F,
+                                            SMDiagnostic &Err,
+                                            LLVMContext &Context) {
+  // Parse into a fresh module named after the buffer's identifier.
+  std::unique_ptr<Module> M =
+      make_unique<Module>(F.getBufferIdentifier(), Context);
+  if (parseAssemblyInto(F, *M, Err))
+    return nullptr; // Parsing failed; the partial module is discarded.
+  // Return the local by name: it has the return type, so no std::move needed.
+  return M;
+}
+
+std::unique_ptr<Module> llvm::parseAssemblyFile(StringRef Filename,
+                                                SMDiagnostic &Err,
+                                                LLVMContext &Context) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFileOrSTDIN(Filename);
   if (std::error_code EC = FileOrErr.getError()) {
@@ -49,13 +52,12 @@
     return nullptr;
   }
 
-  return ParseAssembly(FileOrErr.get().release(), nullptr, Err, Context);
+  return parseAssembly(FileOrErr.get()->getMemBufferRef(), Err, Context);
 }
 
-Module *llvm::ParseAssemblyString(const char *AsmString, Module *M,
-                                  SMDiagnostic &Err, LLVMContext &Context) {
-  MemoryBuffer *F =
-      MemoryBuffer::getMemBuffer(StringRef(AsmString), "<string>");
-
-  return ParseAssembly(F, M, Err, Context);
+std::unique_ptr<Module> llvm::parseAssemblyString(StringRef AsmString,
+                                                  SMDiagnostic &Err,
+                                                  LLVMContext &Context) {
+  // Wrap the text in a MemoryBufferRef identified as "<string>" and parse it.
+  return parseAssembly(MemoryBufferRef(AsmString, "<string>"), Err, Context);
+}

diff --git a/lib/Bitcode/Reader/BitReader.cpp b/lib/Bitcode/Reader/BitReader.cpp
index b5886c1..9b3acb5 100644
--- a/lib/Bitcode/Reader/BitReader.cpp
+++ b/lib/Bitcode/Reader/BitReader.cpp

@@ -31,7 +31,7 @@
                                    LLVMModuleRef *OutModule,
                                    char **OutMessage) {
   ErrorOr<Module *> ModuleOrErr =
-      parseBitcodeFile(unwrap(MemBuf), *unwrap(ContextRef));
+      parseBitcodeFile(unwrap(MemBuf)->getMemBufferRef(), *unwrap(ContextRef));
   if (std::error_code EC = ModuleOrErr.getError()) {
     if (OutMessage)
       *OutMessage = strdup(EC.message().c_str());
@@ -51,8 +51,11 @@
                                        LLVMModuleRef *OutM,
                                        char **OutMessage) {
   std::string Message;
+  std::unique_ptr<MemoryBuffer> Owner(unwrap(MemBuf));
+
   ErrorOr<Module *> ModuleOrErr =
-      getLazyBitcodeModule(unwrap(MemBuf), *unwrap(ContextRef));
+      getLazyBitcodeModule(std::move(Owner), *unwrap(ContextRef));
+  Owner.release();
 
   if (std::error_code EC = ModuleOrErr.getError()) {
     *OutM = wrap((Module *)nullptr);

diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 192f753..b2ca22c 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp

@@ -25,17 +25,45 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ManagedStatic.h"
+
 using namespace llvm;
 
 enum {
   SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex
 };
 
-void BitcodeReader::materializeForwardReferencedFunctions() {
-  while (!BlockAddrFwdRefs.empty()) {
-    Function *F = BlockAddrFwdRefs.begin()->first;
-    F->Materialize();
+std::error_code BitcodeReader::materializeForwardReferencedFunctions() {
+  if (WillMaterializeAllForwardRefs)
+    return std::error_code();
+
+  // Prevent recursion.
+  WillMaterializeAllForwardRefs = true;
+
+  while (!BasicBlockFwdRefQueue.empty()) {
+    Function *F = BasicBlockFwdRefQueue.front();
+    BasicBlockFwdRefQueue.pop_front();
+    assert(F && "Expected valid function");
+    if (!BasicBlockFwdRefs.count(F))
+      // Already materialized.
+      continue;
+
+    // Check for a function that isn't materializable to prevent an infinite
+    // loop.  When parsing a blockaddress stored in a global variable, there
+    // isn't a trivial way to check if a function will have a body without a
+    // linear search through FunctionsWithBodies, so just check it here.
+    if (!F->isMaterializable())
+      return Error(BitcodeError::NeverResolvedFunctionFromBlockAddress);
+
+    // Try to materialize F.
+    if (std::error_code EC = materialize(F))
+      return EC;
   }
+  assert(BasicBlockFwdRefs.empty() && "Function missing from queue");
+
+  // Reset state.
+  WillMaterializeAllForwardRefs = false;
+  return std::error_code();
 }
 
 void BitcodeReader::FreeState() {
@@ -51,7 +79,8 @@
   DeferredFunctionInfo.clear();
   MDKindMap.clear();
 
-  assert(BlockAddrFwdRefs.empty() && "Unresolved blockaddress fwd references");
+  assert(BasicBlockFwdRefs.empty() && "Unresolved blockaddress fwd references");
+  BasicBlockFwdRefQueue.clear();
 }
 
 //===----------------------------------------------------------------------===//
@@ -487,10 +516,10 @@
 
 std::error_code BitcodeReader::ParseAttributeBlock() {
   if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   if (!MAttributes.empty())
-    return Error(InvalidMultipleBlocks);
+    return Error(BitcodeError::InvalidMultipleBlocks);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -503,7 +532,7 @@
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -519,7 +548,7 @@
     case bitc::PARAMATTR_CODE_ENTRY_OLD: { // ENTRY: [paramidx0, attr0, ...]
       // FIXME: Remove in 4.0.
       if (Record.size() & 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         AttrBuilder B;
@@ -588,6 +617,8 @@
     return Attribute::NonLazyBind;
   case bitc::ATTR_KIND_NON_NULL:
     return Attribute::NonNull;
+  case bitc::ATTR_KIND_DEREFERENCEABLE:
+    return Attribute::Dereferenceable;
   case bitc::ATTR_KIND_NO_RED_ZONE:
     return Attribute::NoRedZone;
   case bitc::ATTR_KIND_NO_RETURN:
@@ -635,16 +666,16 @@
                                              Attribute::AttrKind *Kind) {
   *Kind = GetAttrFromCode(Code);
   if (*Kind == Attribute::None)
-    return Error(InvalidValue);
+    return Error(BitcodeError::InvalidValue);
   return std::error_code();
 }
 
 std::error_code BitcodeReader::ParseAttributeGroupBlock() {
   if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   if (!MAttributeGroups.empty())
-    return Error(InvalidMultipleBlocks);
+    return Error(BitcodeError::InvalidMultipleBlocks);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -655,7 +686,7 @@
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -670,7 +701,7 @@
       break;
     case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...]
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       uint64_t GrpID = Record[0];
       uint64_t Idx = Record[1]; // Index of the object this attribute refers to.
@@ -683,14 +714,16 @@
             return EC;
 
           B.addAttribute(Kind);
-        } else if (Record[i] == 1) { // Align attribute
+        } else if (Record[i] == 1) { // Integer attribute
           Attribute::AttrKind Kind;
           if (std::error_code EC = ParseAttrKind(Record[++i], &Kind))
             return EC;
           if (Kind == Attribute::Alignment)
             B.addAlignmentAttr(Record[++i]);
-          else
+          else if (Kind == Attribute::StackAlignment)
             B.addStackAlignmentAttr(Record[++i]);
+          else if (Kind == Attribute::Dereferenceable)
+            B.addDereferenceableAttr(Record[++i]);
         } else {                     // String attribute
           assert((Record[i] == 3 || Record[i] == 4) &&
                  "Invalid attribute group entry");
@@ -723,14 +756,14 @@
 
 std::error_code BitcodeReader::ParseTypeTable() {
   if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   return ParseTypeTableBody();
 }
 
 std::error_code BitcodeReader::ParseTypeTableBody() {
   if (!TypeList.empty())
-    return Error(InvalidMultipleBlocks);
+    return Error(BitcodeError::InvalidMultipleBlocks);
 
   SmallVector<uint64_t, 64> Record;
   unsigned NumRecords = 0;
@@ -744,10 +777,10 @@
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       if (NumRecords != TypeList.size())
-        return Error(MalformedBlock);
+        return Error(BitcodeError::MalformedBlock);
       return std::error_code();
     case BitstreamEntry::Record:
       // The interesting case.
@@ -759,12 +792,12 @@
     Type *ResultTy = nullptr;
     switch (Stream.readRecord(Entry.ID, Record)) {
     default:
-      return Error(InvalidValue);
+      return Error(BitcodeError::InvalidValue);
     case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries]
       // TYPE_CODE_NUMENTRY contains a count of the number of types in the
       // type list.  This allows us to reserve space.
       if (Record.size() < 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       TypeList.resize(Record[0]);
       continue;
     case bitc::TYPE_CODE_VOID:      // VOID
@@ -799,20 +832,20 @@
       break;
     case bitc::TYPE_CODE_INTEGER:   // INTEGER: [width]
       if (Record.size() < 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       ResultTy = IntegerType::get(Context, Record[0]);
       break;
     case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or
                                     //          [pointee type, address space]
       if (Record.size() < 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       unsigned AddressSpace = 0;
       if (Record.size() == 2)
         AddressSpace = Record[1];
       ResultTy = getTypeByID(Record[0]);
       if (!ResultTy)
-        return Error(InvalidType);
+        return Error(BitcodeError::InvalidType);
       ResultTy = PointerType::get(ResultTy, AddressSpace);
       break;
     }
@@ -820,7 +853,7 @@
       // FIXME: attrid is dead, remove it in LLVM 4.0
       // FUNCTION: [vararg, attrid, retty, paramty x N]
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SmallVector<Type*, 8> ArgTys;
       for (unsigned i = 3, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -831,7 +864,7 @@
 
       ResultTy = getTypeByID(Record[2]);
       if (!ResultTy || ArgTys.size() < Record.size()-3)
-        return Error(InvalidType);
+        return Error(BitcodeError::InvalidType);
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
       break;
@@ -839,7 +872,7 @@
     case bitc::TYPE_CODE_FUNCTION: {
       // FUNCTION: [vararg, retty, paramty x N]
       if (Record.size() < 2)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SmallVector<Type*, 8> ArgTys;
       for (unsigned i = 2, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -850,14 +883,14 @@
 
       ResultTy = getTypeByID(Record[1]);
       if (!ResultTy || ArgTys.size() < Record.size()-2)
-        return Error(InvalidType);
+        return Error(BitcodeError::InvalidType);
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
       break;
     }
     case bitc::TYPE_CODE_STRUCT_ANON: {  // STRUCT: [ispacked, eltty x N]
       if (Record.size() < 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SmallVector<Type*, 8> EltTys;
       for (unsigned i = 1, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -866,21 +899,21 @@
           break;
       }
       if (EltTys.size() != Record.size()-1)
-        return Error(InvalidType);
+        return Error(BitcodeError::InvalidType);
       ResultTy = StructType::get(Context, EltTys, Record[0]);
       break;
     }
     case bitc::TYPE_CODE_STRUCT_NAME:   // STRUCT_NAME: [strchr x N]
       if (ConvertToString(Record, 0, TypeName))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       continue;
 
     case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N]
       if (Record.size() < 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       if (NumRecords >= TypeList.size())
-        return Error(InvalidTYPETable);
+        return Error(BitcodeError::InvalidTYPETable);
 
       // Check to see if this was forward referenced, if so fill in the temp.
       StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -899,17 +932,17 @@
           break;
       }
       if (EltTys.size() != Record.size()-1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Res->setBody(EltTys, Record[0]);
       ResultTy = Res;
       break;
     }
     case bitc::TYPE_CODE_OPAQUE: {       // OPAQUE: []
       if (Record.size() != 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       if (NumRecords >= TypeList.size())
-        return Error(InvalidTYPETable);
+        return Error(BitcodeError::InvalidTYPETable);
 
       // Check to see if this was forward referenced, if so fill in the temp.
       StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -924,24 +957,24 @@
     }
     case bitc::TYPE_CODE_ARRAY:     // ARRAY: [numelts, eltty]
       if (Record.size() < 2)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       if ((ResultTy = getTypeByID(Record[1])))
         ResultTy = ArrayType::get(ResultTy, Record[0]);
       else
-        return Error(InvalidType);
+        return Error(BitcodeError::InvalidType);
       break;
     case bitc::TYPE_CODE_VECTOR:    // VECTOR: [numelts, eltty]
       if (Record.size() < 2)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       if ((ResultTy = getTypeByID(Record[1])))
         ResultTy = VectorType::get(ResultTy, Record[0]);
       else
-        return Error(InvalidType);
+        return Error(BitcodeError::InvalidType);
       break;
     }
 
     if (NumRecords >= TypeList.size())
-      return Error(InvalidTYPETable);
+      return Error(BitcodeError::InvalidTYPETable);
     assert(ResultTy && "Didn't read a type?");
     assert(!TypeList[NumRecords] && "Already read type?");
     TypeList[NumRecords++] = ResultTy;
@@ -950,7 +983,7 @@
 
 std::error_code BitcodeReader::ParseValueSymbolTable() {
   if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -962,7 +995,7 @@
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -977,10 +1010,10 @@
       break;
     case bitc::VST_CODE_ENTRY: {  // VST_ENTRY: [valueid, namechar x N]
       if (ConvertToString(Record, 1, ValueName))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       unsigned ValueID = Record[0];
       if (ValueID >= ValueList.size() || !ValueList[ValueID])
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Value *V = ValueList[ValueID];
 
       V->setName(StringRef(ValueName.data(), ValueName.size()));
@@ -989,10 +1022,10 @@
     }
     case bitc::VST_CODE_BBENTRY: {
       if (ConvertToString(Record, 1, ValueName))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       BasicBlock *BB = getBasicBlock(Record[0]);
       if (!BB)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       BB->setName(StringRef(ValueName.data(), ValueName.size()));
       ValueName.clear();
@@ -1006,7 +1039,7 @@
   unsigned NextMDValueNo = MDValueList.size();
 
   if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1017,7 +1050,7 @@
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -1048,7 +1081,7 @@
       for (unsigned i = 0; i != Size; ++i) {
         MDNode *MD = dyn_cast_or_null<MDNode>(MDValueList.getValueFwdRef(Record[i]));
         if (!MD)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         NMD->addOperand(MD);
       }
       break;
@@ -1058,14 +1091,14 @@
       // fall-through
     case bitc::METADATA_NODE: {
       if (Record.size() % 2 == 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       unsigned Size = Record.size();
       SmallVector<Value*, 8> Elts;
       for (unsigned i = 0; i != Size; i += 2) {
         Type *Ty = getTypeByID(Record[i]);
         if (!Ty)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         if (Ty->isMetadataTy())
           Elts.push_back(MDValueList.getValueFwdRef(Record[i+1]));
         else if (!Ty->isVoidTy())
@@ -1087,14 +1120,14 @@
     }
     case bitc::METADATA_KIND: {
       if (Record.size() < 2)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       unsigned Kind = Record[0];
       SmallString<8> Name(Record.begin()+1, Record.end());
 
       unsigned NewKind = TheModule->getMDKindID(Name.str());
       if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second)
-        return Error(ConflictingMETADATA_KINDRecords);
+        return Error(BitcodeError::ConflictingMETADATA_KINDRecords);
       break;
     }
     }
@@ -1132,7 +1165,7 @@
       if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
         GlobalInitWorklist.back().first->setInitializer(C);
       else
-        return Error(ExpectedConstant);
+        return Error(BitcodeError::ExpectedConstant);
     }
     GlobalInitWorklist.pop_back();
   }
@@ -1145,7 +1178,7 @@
       if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
         AliasInitWorklist.back().first->setAliasee(C);
       else
-        return Error(ExpectedConstant);
+        return Error(BitcodeError::ExpectedConstant);
     }
     AliasInitWorklist.pop_back();
   }
@@ -1158,7 +1191,7 @@
       if (Constant *C = dyn_cast_or_null<Constant>(ValueList[ValID]))
         FunctionPrefixWorklist.back().first->setPrefixData(C);
       else
-        return Error(ExpectedConstant);
+        return Error(BitcodeError::ExpectedConstant);
     }
     FunctionPrefixWorklist.pop_back();
   }
@@ -1176,7 +1209,7 @@
 
 std::error_code BitcodeReader::ParseConstants() {
   if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1189,10 +1222,10 @@
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       if (NextCstNo != ValueList.size())
-        return Error(InvalidConstantReference);
+        return Error(BitcodeError::InvalidConstantReference);
 
       // Once all the constants have been read, go through and resolve forward
       // references.
@@ -1214,9 +1247,9 @@
       break;
     case bitc::CST_CODE_SETTYPE:   // SETTYPE: [typeid]
       if (Record.empty())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       if (Record[0] >= TypeList.size() || !TypeList[Record[0]])
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       CurTy = TypeList[Record[0]];
       continue;  // Skip the ValueList manipulation.
     case bitc::CST_CODE_NULL:      // NULL
@@ -1224,12 +1257,12 @@
       break;
     case bitc::CST_CODE_INTEGER:   // INTEGER: [intval]
       if (!CurTy->isIntegerTy() || Record.empty())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
       break;
     case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
       if (!CurTy->isIntegerTy() || Record.empty())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       APInt VInt = ReadWideAPInt(Record,
                                  cast<IntegerType>(CurTy)->getBitWidth());
@@ -1239,7 +1272,7 @@
     }
     case bitc::CST_CODE_FLOAT: {    // FLOAT: [fpval]
       if (Record.empty())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       if (CurTy->isHalfTy())
         V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf,
                                              APInt(16, (uint16_t)Record[0])));
@@ -1269,7 +1302,7 @@
 
     case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number]
       if (Record.empty())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       unsigned Size = Record.size();
       SmallVector<Constant*, 16> Elts;
@@ -1297,7 +1330,7 @@
     case bitc::CST_CODE_STRING:    // STRING: [values]
     case bitc::CST_CODE_CSTRING: { // CSTRING: [values]
       if (Record.empty())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       SmallString<16> Elts(Record.begin(), Record.end());
       V = ConstantDataArray::getString(Context, Elts,
@@ -1306,7 +1339,7 @@
     }
     case bitc::CST_CODE_DATA: {// DATA: [n x value]
       if (Record.empty())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       Type *EltTy = cast<SequentialType>(CurTy)->getElementType();
       unsigned Size = Record.size();
@@ -1351,14 +1384,14 @@
         else
           V = ConstantDataArray::get(Context, Elts);
       } else {
-        return Error(InvalidTypeForValue);
+        return Error(BitcodeError::InvalidTypeForValue);
       }
       break;
     }
 
     case bitc::CST_CODE_CE_BINOP: {  // CE_BINOP: [opcode, opval, opval]
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       int Opc = GetDecodedBinaryOpcode(Record[0], CurTy);
       if (Opc < 0) {
         V = UndefValue::get(CurTy);  // Unknown binop.
@@ -1389,14 +1422,14 @@
     }
     case bitc::CST_CODE_CE_CAST: {  // CE_CAST: [opcode, opty, opval]
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       int Opc = GetDecodedCastOpcode(Record[0]);
       if (Opc < 0) {
         V = UndefValue::get(CurTy);  // Unknown cast.
       } else {
         Type *OpTy = getTypeByID(Record[1]);
         if (!OpTy)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy);
         V = UpgradeBitCastExpr(Opc, Op, CurTy);
         if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy);
@@ -1406,12 +1439,12 @@
     case bitc::CST_CODE_CE_INBOUNDS_GEP:
     case bitc::CST_CODE_CE_GEP: {  // CE_GEP:        [n x operands]
       if (Record.size() & 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SmallVector<Constant*, 16> Elts;
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         Type *ElTy = getTypeByID(Record[i]);
         if (!ElTy)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         Elts.push_back(ValueList.getConstantFwdRef(Record[i+1], ElTy));
       }
       ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
@@ -1422,7 +1455,7 @@
     }
     case bitc::CST_CODE_CE_SELECT: {  // CE_SELECT: [opval#, opval#, opval#]
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       Type *SelectorTy = Type::getInt1Ty(Context);
 
@@ -1441,22 +1474,22 @@
     case bitc::CST_CODE_CE_EXTRACTELT
         : { // CE_EXTRACTELT: [opty, opval, opty, opval]
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       VectorType *OpTy =
         dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
       if (!OpTy)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = nullptr;
       if (Record.size() == 4) {
         Type *IdxTy = getTypeByID(Record[2]);
         if (!IdxTy)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         Op1 = ValueList.getConstantFwdRef(Record[3], IdxTy);
       } else // TODO: Remove with llvm 4.0
         Op1 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context));
       if (!Op1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       V = ConstantExpr::getExtractElement(Op0, Op1);
       break;
     }
@@ -1464,7 +1497,7 @@
         : { // CE_INSERTELT: [opval, opval, opty, opval]
       VectorType *OpTy = dyn_cast<VectorType>(CurTy);
       if (Record.size() < 3 || !OpTy)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[1],
                                                   OpTy->getElementType());
@@ -1472,19 +1505,19 @@
       if (Record.size() == 4) {
         Type *IdxTy = getTypeByID(Record[2]);
         if (!IdxTy)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         Op2 = ValueList.getConstantFwdRef(Record[3], IdxTy);
       } else // TODO: Remove with llvm 4.0
         Op2 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context));
       if (!Op2)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       V = ConstantExpr::getInsertElement(Op0, Op1, Op2);
       break;
     }
     case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval]
       VectorType *OpTy = dyn_cast<VectorType>(CurTy);
       if (Record.size() < 3 || !OpTy)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1498,7 +1531,7 @@
       VectorType *OpTy =
         dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
       if (Record.size() < 4 || !RTy || !OpTy)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
       Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1509,10 +1542,10 @@
     }
     case bitc::CST_CODE_CE_CMP: {     // CE_CMP: [opty, opval, opval, pred]
       if (Record.size() < 4)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       if (!OpTy)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
 
@@ -1526,16 +1559,16 @@
     // FIXME: Remove with the 4.0 release.
     case bitc::CST_CODE_INLINEASM_OLD: {
       if (Record.size() < 2)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       std::string AsmStr, ConstrStr;
       bool HasSideEffects = Record[0] & 1;
       bool IsAlignStack = Record[0] >> 1;
       unsigned AsmStrSize = Record[1];
       if (2+AsmStrSize >= Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       unsigned ConstStrSize = Record[2+AsmStrSize];
       if (3+AsmStrSize+ConstStrSize > Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
         AsmStr += (char)Record[2+i];
@@ -1550,17 +1583,17 @@
     // inteldialect).
     case bitc::CST_CODE_INLINEASM: {
       if (Record.size() < 2)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       std::string AsmStr, ConstrStr;
       bool HasSideEffects = Record[0] & 1;
       bool IsAlignStack = (Record[0] >> 1) & 1;
       unsigned AsmDialect = Record[0] >> 2;
       unsigned AsmStrSize = Record[1];
       if (2+AsmStrSize >= Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       unsigned ConstStrSize = Record[2+AsmStrSize];
       if (3+AsmStrSize+ConstStrSize > Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
         AsmStr += (char)Record[2+i];
@@ -1574,35 +1607,46 @@
     }
     case bitc::CST_CODE_BLOCKADDRESS:{
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *FnTy = getTypeByID(Record[0]);
       if (!FnTy)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Function *Fn =
         dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
       if (!Fn)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
+
+      // Don't let Fn get dematerialized.
+      BlockAddressesTaken.insert(Fn);
 
       // If the function is already parsed we can insert the block address right
       // away.
+      BasicBlock *BB;
+      unsigned BBID = Record[2];
+      if (!BBID)
+        // Invalid reference to entry block.
+        return Error(BitcodeError::InvalidID);
       if (!Fn->empty()) {
         Function::iterator BBI = Fn->begin(), BBE = Fn->end();
-        for (size_t I = 0, E = Record[2]; I != E; ++I) {
+        for (size_t I = 0, E = BBID; I != E; ++I) {
           if (BBI == BBE)
-            return Error(InvalidID);
+            return Error(BitcodeError::InvalidID);
           ++BBI;
         }
-        V = BlockAddress::get(Fn, BBI);
+        BB = BBI;
       } else {
         // Otherwise insert a placeholder and remember it so it can be inserted
         // when the function is parsed.
-        GlobalVariable *FwdRef = new GlobalVariable(*Fn->getParent(),
-                                                    Type::getInt8Ty(Context),
-                                            false, GlobalValue::InternalLinkage,
-                                                    nullptr, "");
-        BlockAddrFwdRefs[Fn].push_back(std::make_pair(Record[2], FwdRef));
-        V = FwdRef;
+        auto &FwdBBs = BasicBlockFwdRefs[Fn];
+        if (FwdBBs.empty())
+          BasicBlockFwdRefQueue.push_back(Fn);
+        if (FwdBBs.size() < BBID + 1)
+          FwdBBs.resize(BBID + 1);
+        if (!FwdBBs[BBID])
+          FwdBBs[BBID] = BasicBlock::Create(Context);
+        BB = FwdBBs[BBID];
       }
+      V = BlockAddress::get(Fn, BB);
       break;
     }
     }
@@ -1614,18 +1658,17 @@
 
 std::error_code BitcodeReader::ParseUseLists() {
   if (Stream.EnterSubBlock(bitc::USELIST_BLOCK_ID))
-    return Error(InvalidRecord);
-
-  SmallVector<uint64_t, 64> Record;
+    return Error(BitcodeError::InvalidRecord);
 
   // Read all the records.
+  SmallVector<uint64_t, 64> Record;
   while (1) {
     BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
 
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -1635,14 +1678,42 @@
 
     // Read a use list record.
     Record.clear();
+    bool IsBB = false;
     switch (Stream.readRecord(Entry.ID, Record)) {
     default:  // Default behavior: unknown type.
       break;
-    case bitc::USELIST_CODE_ENTRY: { // USELIST_CODE_ENTRY: TBD.
+    case bitc::USELIST_CODE_BB:
+      IsBB = true;
+      // fallthrough
+    case bitc::USELIST_CODE_DEFAULT: {
       unsigned RecordLength = Record.size();
-      if (RecordLength < 1)
-        return Error(InvalidRecord);
-      UseListRecords.push_back(Record);
+      if (RecordLength < 3)
+        // Records should have at least an ID and two indexes.
+        return Error(BitcodeError::InvalidRecord);
+      unsigned ID = Record.back();
+      Record.pop_back();
+
+      Value *V;
+      if (IsBB) {
+        assert(ID < FunctionBBs.size() && "Basic block not found");
+        V = FunctionBBs[ID];
+      } else
+        V = ValueList[ID];
+      unsigned NumUses = 0;
+      SmallDenseMap<const Use *, unsigned, 16> Order;
+      for (const Use &U : V->uses()) {
+        if (++NumUses > Record.size())
+          break;
+        Order[&U] = Record[NumUses - 1];
+      }
+      if (Order.size() != Record.size() || NumUses > Record.size())
+        // Mismatches can happen if the functions are being materialized lazily
+        // (out-of-order), or a value has been upgraded.
+        break;
+
+      V->sortUseList([&](const Use &L, const Use &R) {
+        return Order.lookup(&L) < Order.lookup(&R);
+      });
       break;
     }
     }
@@ -1655,7 +1726,7 @@
 std::error_code BitcodeReader::RememberAndSkipFunctionBody() {
   // Get the function we are talking about.
   if (FunctionsWithBodies.empty())
-    return Error(InsufficientFunctionProtos);
+    return Error(BitcodeError::InsufficientFunctionProtos);
 
   Function *Fn = FunctionsWithBodies.back();
   FunctionsWithBodies.pop_back();
@@ -1666,7 +1737,7 @@
 
   // Skip over the function block for now.
   if (Stream.SkipBlock())
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
   return std::error_code();
 }
 
@@ -1674,7 +1745,7 @@
   // Patch the initializers for globals and aliases up.
   ResolveGlobalAndAliasInits();
   if (!GlobalInits.empty() || !AliasInits.empty())
-    return Error(MalformedGlobalInitializerSet);
+    return Error(BitcodeError::MalformedGlobalInitializerSet);
 
   // Look for intrinsic functions which need to be upgraded at some point
   for (Module::iterator FI = TheModule->begin(), FE = TheModule->end();
@@ -1703,7 +1774,7 @@
   if (Resume)
     Stream.JumpToBit(NextUnreadBit);
   else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
   std::vector<std::string> SectionTable;
@@ -1715,7 +1786,7 @@
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return GlobalCleanup();
 
@@ -1723,11 +1794,11 @@
       switch (Entry.ID) {
       default:  // Skip unknown content.
         if (Stream.SkipBlock())
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         break;
       case bitc::BLOCKINFO_BLOCK_ID:
         if (Stream.ReadBlockInfoBlock())
-          return Error(MalformedBlock);
+          return Error(BitcodeError::MalformedBlock);
         break;
       case bitc::PARAMATTR_BLOCK_ID:
         if (std::error_code EC = ParseAttributeBlock())
@@ -1797,12 +1868,12 @@
     default: break;  // Default behavior, ignore unknown content.
     case bitc::MODULE_CODE_VERSION: {  // VERSION: [version#]
       if (Record.size() < 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       // Only version #0 and #1 are supported so far.
       unsigned module_version = Record[0];
       switch (module_version) {
         default:
-          return Error(InvalidValue);
+          return Error(BitcodeError::InvalidValue);
         case 0:
           UseRelativeIDs = false;
           break;
@@ -1815,21 +1886,21 @@
     case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       TheModule->setTargetTriple(S);
       break;
     }
     case bitc::MODULE_CODE_DATALAYOUT: {  // DATALAYOUT: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       TheModule->setDataLayout(S);
       break;
     }
     case bitc::MODULE_CODE_ASM: {  // ASM: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       TheModule->setModuleInlineAsm(S);
       break;
     }
@@ -1837,27 +1908,27 @@
       // FIXME: Remove in 4.0.
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       // Ignore value.
       break;
     }
     case bitc::MODULE_CODE_SECTIONNAME: {  // SECTIONNAME: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SectionTable.push_back(S);
       break;
     }
     case bitc::MODULE_CODE_GCNAME: {  // SECTIONNAME: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       GCTable.push_back(S);
       break;
     }
     case bitc::MODULE_CODE_COMDAT: { // COMDAT: [selection_kind, name]
       if (Record.size() < 2)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Comdat::SelectionKind SK = getDecodedComdatSelectionKind(Record[0]);
       unsigned ComdatNameSize = Record[1];
       std::string ComdatName;
@@ -1874,12 +1945,12 @@
     //             unnamed_addr, dllstorageclass]
     case bitc::MODULE_CODE_GLOBALVAR: {
       if (Record.size() < 6)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
       if (!Ty)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       if (!Ty->isPointerTy())
-        return Error(InvalidTypeForValue);
+        return Error(BitcodeError::InvalidTypeForValue);
       unsigned AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
       Ty = cast<PointerType>(Ty)->getElementType();
 
@@ -1889,7 +1960,7 @@
       std::string Section;
       if (Record[5]) {
         if (Record[5]-1 >= SectionTable.size())
-          return Error(InvalidID);
+          return Error(BitcodeError::InvalidID);
         Section = SectionTable[Record[5]-1];
       }
       GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
@@ -1942,16 +2013,16 @@
     //             dllstorageclass]
     case bitc::MODULE_CODE_FUNCTION: {
       if (Record.size() < 8)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
       if (!Ty)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       if (!Ty->isPointerTy())
-        return Error(InvalidTypeForValue);
+        return Error(BitcodeError::InvalidTypeForValue);
       FunctionType *FTy =
         dyn_cast<FunctionType>(cast<PointerType>(Ty)->getElementType());
       if (!FTy)
-        return Error(InvalidTypeForValue);
+        return Error(BitcodeError::InvalidTypeForValue);
 
       Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage,
                                         "", TheModule);
@@ -1964,7 +2035,7 @@
       Func->setAlignment((1 << Record[5]) >> 1);
       if (Record[6]) {
         if (Record[6]-1 >= SectionTable.size())
-          return Error(InvalidID);
+          return Error(BitcodeError::InvalidID);
         Func->setSection(SectionTable[Record[6]-1]);
       }
       // Local linkage must have default visibility.
@@ -1973,7 +2044,7 @@
         Func->setVisibility(GetDecodedVisibility(Record[7]));
       if (Record.size() > 8 && Record[8]) {
         if (Record[8]-1 > GCTable.size())
-          return Error(InvalidID);
+          return Error(BitcodeError::InvalidID);
         Func->setGC(GCTable[Record[8]-1].c_str());
       }
       bool UnnamedAddr = false;
@@ -1999,8 +2070,10 @@
       // If this is a function with a body, remember the prototype we are
       // creating now, so that we can match up the body with them later.
       if (!isProto) {
+        Func->setIsMaterializable(true);
         FunctionsWithBodies.push_back(Func);
-        if (LazyStreamer) DeferredFunctionInfo[Func] = 0;
+        if (LazyStreamer)
+          DeferredFunctionInfo[Func] = 0;
       }
       break;
     }
@@ -2008,13 +2081,13 @@
     // ALIAS: [alias type, aliasee val#, linkage, visibility, dllstorageclass]
     case bitc::MODULE_CODE_ALIAS: {
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
       if (!Ty)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       auto *PTy = dyn_cast<PointerType>(Ty);
       if (!PTy)
-        return Error(InvalidTypeForValue);
+        return Error(BitcodeError::InvalidTypeForValue);
 
       auto *NewGA =
           GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
@@ -2029,9 +2102,9 @@
       else
         UpgradeDLLImportExportLinkage(NewGA, Record[2]);
       if (Record.size() > 5)
-	NewGA->setThreadLocalMode(GetDecodedThreadLocalMode(Record[5]));
+        NewGA->setThreadLocalMode(GetDecodedThreadLocalMode(Record[5]));
       if (Record.size() > 6)
-	NewGA->setUnnamedAddr(Record[6]);
+        NewGA->setUnnamedAddr(Record[6]);
       ValueList.push_back(NewGA);
       AliasInits.push_back(std::make_pair(NewGA, Record[1]));
       break;
@@ -2040,7 +2113,7 @@
     case bitc::MODULE_CODE_PURGEVALS:
       // Trim down the value list to the specified size.
       if (Record.size() < 1 || Record[0] > ValueList.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       ValueList.shrinkTo(Record[0]);
       break;
     }
@@ -2061,7 +2134,7 @@
       Stream.Read(4) != 0xC ||
       Stream.Read(4) != 0xE ||
       Stream.Read(4) != 0xD)
-    return Error(InvalidBitcodeSignature);
+    return Error(BitcodeError::InvalidBitcodeSignature);
 
   // We expect a number of well-defined blocks, though we don't necessarily
   // need to understand them all.
@@ -2074,7 +2147,7 @@
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return std::error_code();
 
@@ -2082,12 +2155,12 @@
       switch (Entry.ID) {
       case bitc::BLOCKINFO_BLOCK_ID:
         if (Stream.ReadBlockInfoBlock())
-          return Error(MalformedBlock);
+          return Error(BitcodeError::MalformedBlock);
         break;
       case bitc::MODULE_BLOCK_ID:
         // Reject multiple MODULE_BLOCK's in a single bitstream.
         if (TheModule)
-          return Error(InvalidMultipleBlocks);
+          return Error(BitcodeError::InvalidMultipleBlocks);
         TheModule = M;
         if (std::error_code EC = ParseModule(false))
           return EC;
@@ -2096,7 +2169,7 @@
         break;
       default:
         if (Stream.SkipBlock())
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         break;
       }
       continue;
@@ -2111,14 +2184,14 @@
           Stream.AtEndOfStream())
         return std::error_code();
 
-      return Error(InvalidRecord);
+      return Error(BitcodeError::InvalidRecord);
     }
   }
 }
 
 ErrorOr<std::string> BitcodeReader::parseModuleTriple() {
   if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -2130,7 +2203,7 @@
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return Triple;
     case BitstreamEntry::Record:
@@ -2144,7 +2217,7 @@
     case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Triple = S;
       break;
     }
@@ -2165,7 +2238,7 @@
       Stream.Read(4) != 0xC ||
       Stream.Read(4) != 0xE ||
       Stream.Read(4) != 0xD)
-    return Error(InvalidBitcodeSignature);
+    return Error(BitcodeError::InvalidBitcodeSignature);
 
   // We expect a number of well-defined blocks, though we don't necessarily
   // need to understand them all.
@@ -2174,7 +2247,7 @@
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return std::error_code();
 
@@ -2184,7 +2257,7 @@
 
       // Ignore other sub-blocks.
       if (Stream.SkipBlock())
-        return Error(MalformedBlock);
+        return Error(BitcodeError::MalformedBlock);
       continue;
 
     case BitstreamEntry::Record:
@@ -2197,7 +2270,7 @@
 /// ParseMetadataAttachment - Parse metadata attachments.
 std::error_code BitcodeReader::ParseMetadataAttachment() {
   if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
   while (1) {
@@ -2206,7 +2279,7 @@
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       return std::error_code();
     case BitstreamEntry::Record:
@@ -2222,14 +2295,14 @@
     case bitc::METADATA_ATTACHMENT: {
       unsigned RecordLength = Record.size();
       if (Record.empty() || (RecordLength - 1) % 2 == 1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Instruction *Inst = InstructionList[Record[0]];
       for (unsigned i = 1; i != RecordLength; i = i+2) {
         unsigned Kind = Record[i];
         DenseMap<unsigned, unsigned>::iterator I =
           MDKindMap.find(Kind);
         if (I == MDKindMap.end())
-          return Error(InvalidID);
+          return Error(BitcodeError::InvalidID);
         Value *Node = MDValueList.getValueFwdRef(Record[i+1]);
         Inst->setMetadata(I->second, cast<MDNode>(Node));
         if (I->second == LLVMContext::MD_tbaa)
@@ -2244,7 +2317,7 @@
 /// ParseFunctionBody - Lazily parse the specified function body block.
 std::error_code BitcodeReader::ParseFunctionBody(Function *F) {
   if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID))
-    return Error(InvalidRecord);
+    return Error(BitcodeError::InvalidRecord);
 
   InstructionList.clear();
   unsigned ModuleValueListSize = ValueList.size();
@@ -2267,7 +2340,7 @@
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error(MalformedBlock);
+      return Error(BitcodeError::MalformedBlock);
     case BitstreamEntry::EndBlock:
       goto OutOfRecordLoop;
 
@@ -2275,7 +2348,7 @@
       switch (Entry.ID) {
       default:  // Skip unknown content.
         if (Stream.SkipBlock())
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         break;
       case bitc::CONSTANTS_BLOCK_ID:
         if (std::error_code EC = ParseConstants())
@@ -2294,6 +2367,10 @@
         if (std::error_code EC = ParseMetadata())
           return EC;
         break;
+      case bitc::USELIST_BLOCK_ID:
+        if (std::error_code EC = ParseUseLists())
+          return EC;
+        break;
       }
       continue;
 
@@ -2308,16 +2385,41 @@
     unsigned BitCode = Stream.readRecord(Entry.ID, Record);
     switch (BitCode) {
     default: // Default behavior: reject
-      return Error(InvalidValue);
-    case bitc::FUNC_CODE_DECLAREBLOCKS:     // DECLAREBLOCKS: [nblocks]
+      return Error(BitcodeError::InvalidValue);
+    case bitc::FUNC_CODE_DECLAREBLOCKS: {   // DECLAREBLOCKS: [nblocks]
       if (Record.size() < 1 || Record[0] == 0)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       // Create all the basic blocks for the function.
       FunctionBBs.resize(Record[0]);
-      for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i)
-        FunctionBBs[i] = BasicBlock::Create(Context, "", F);
+
+      // See if anything took the address of blocks in this function.
+      auto BBFRI = BasicBlockFwdRefs.find(F);
+      if (BBFRI == BasicBlockFwdRefs.end()) {
+        for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i)
+          FunctionBBs[i] = BasicBlock::Create(Context, "", F);
+      } else {
+        auto &BBRefs = BBFRI->second;
+        // Check for invalid basic block references.
+        if (BBRefs.size() > FunctionBBs.size())
+          return Error(BitcodeError::InvalidID);
+        assert(!BBRefs.empty() && "Unexpected empty array");
+        assert(!BBRefs.front() && "Invalid reference to entry block");
+        for (unsigned I = 0, E = FunctionBBs.size(), RE = BBRefs.size(); I != E;
+             ++I)
+          if (I < RE && BBRefs[I]) {
+            BBRefs[I]->insertInto(F);
+            FunctionBBs[I] = BBRefs[I];
+          } else {
+            FunctionBBs[I] = BasicBlock::Create(Context, "", F);
+          }
+
+        // Erase from the table.
+        BasicBlockFwdRefs.erase(BBFRI);
+      }
+
       CurBB = FunctionBBs[0];
       continue;
+    }
 
     case bitc::FUNC_CODE_DEBUG_LOC_AGAIN:  // DEBUG_LOC_AGAIN
       // This record indicates that the last instruction is at the same
@@ -2332,7 +2434,7 @@
         I = &FunctionBBs[CurBBNo-1]->back();
 
       if (!I)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       I->setDebugLoc(LastLoc);
       I = nullptr;
       continue;
@@ -2345,7 +2447,7 @@
                !FunctionBBs[CurBBNo-1]->empty())
         I = &FunctionBBs[CurBBNo-1]->back();
       if (!I || Record.size() < 4)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       unsigned Line = Record[0], Col = Record[1];
       unsigned ScopeID = Record[2], IAID = Record[3];
@@ -2365,11 +2467,11 @@
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
           popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
           OpNum+1 > Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       int Opc = GetDecodedBinaryOpcode(Record[OpNum++], LHS->getType());
       if (Opc == -1)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
       InstructionList.push_back(I);
       if (OpNum < Record.size()) {
@@ -2411,12 +2513,12 @@
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+2 != Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       Type *ResTy = getTypeByID(Record[OpNum]);
       int Opc = GetDecodedCastOpcode(Record[OpNum+1]);
       if (Opc == -1 || !ResTy)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Instruction *Temp = nullptr;
       if ((I = UpgradeBitCastInst(Opc, Op, ResTy, Temp))) {
         if (Temp) {
@@ -2434,13 +2536,13 @@
       unsigned OpNum = 0;
       Value *BasePtr;
       if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       SmallVector<Value*, 16> GEPIdx;
       while (OpNum != Record.size()) {
         Value *Op;
         if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         GEPIdx.push_back(Op);
       }
 
@@ -2456,14 +2558,14 @@
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       SmallVector<unsigned, 4> EXTRACTVALIdx;
       for (unsigned RecSize = Record.size();
            OpNum != RecSize; ++OpNum) {
         uint64_t Index = Record[OpNum];
         if ((unsigned)Index != Index)
-          return Error(InvalidValue);
+          return Error(BitcodeError::InvalidValue);
         EXTRACTVALIdx.push_back((unsigned)Index);
       }
 
@@ -2477,17 +2579,17 @@
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Value *Val;
       if (getValueTypePair(Record, OpNum, NextValueNo, Val))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       SmallVector<unsigned, 4> INSERTVALIdx;
       for (unsigned RecSize = Record.size();
            OpNum != RecSize; ++OpNum) {
         uint64_t Index = Record[OpNum];
         if ((unsigned)Index != Index)
-          return Error(InvalidValue);
+          return Error(BitcodeError::InvalidValue);
         INSERTVALIdx.push_back((unsigned)Index);
       }
 
@@ -2504,7 +2606,7 @@
       if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
           popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
           popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       I = SelectInst::Create(Cond, TrueVal, FalseVal);
       InstructionList.push_back(I);
@@ -2519,18 +2621,18 @@
       if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
           popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
           getValueTypePair(Record, OpNum, NextValueNo, Cond))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       // select condition can be either i1 or [N x i1]
       if (VectorType* vector_type =
           dyn_cast<VectorType>(Cond->getType())) {
         // expect <n x i1>
         if (vector_type->getElementType() != Type::getInt1Ty(Context))
-          return Error(InvalidTypeForValue);
+          return Error(BitcodeError::InvalidTypeForValue);
       } else {
         // expect i1
         if (Cond->getType() != Type::getInt1Ty(Context))
-          return Error(InvalidTypeForValue);
+          return Error(BitcodeError::InvalidTypeForValue);
       }
 
       I = SelectInst::Create(Cond, TrueVal, FalseVal);
@@ -2543,7 +2645,7 @@
       Value *Vec, *Idx;
       if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
           getValueTypePair(Record, OpNum, NextValueNo, Idx))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       I = ExtractElementInst::Create(Vec, Idx);
       InstructionList.push_back(I);
       break;
@@ -2556,7 +2658,7 @@
           popValue(Record, OpNum, NextValueNo,
                    cast<VectorType>(Vec->getType())->getElementType(), Elt) ||
           getValueTypePair(Record, OpNum, NextValueNo, Idx))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       I = InsertElementInst::Create(Vec, Elt, Idx);
       InstructionList.push_back(I);
       break;
@@ -2567,10 +2669,10 @@
       Value *Vec1, *Vec2, *Mask;
       if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) ||
           popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       if (getValueTypePair(Record, OpNum, NextValueNo, Mask))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       I = new ShuffleVectorInst(Vec1, Vec2, Mask);
       InstructionList.push_back(I);
       break;
@@ -2588,7 +2690,7 @@
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
           popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
           OpNum+1 != Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       if (LHS->getType()->isFPOrFPVectorTy())
         I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
@@ -2610,9 +2712,9 @@
         unsigned OpNum = 0;
         Value *Op = nullptr;
         if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         if (OpNum != Record.size())
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
 
         I = ReturnInst::Create(Context, Op);
         InstructionList.push_back(I);
@@ -2620,10 +2722,10 @@
       }
     case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#]
       if (Record.size() != 1 && Record.size() != 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       BasicBlock *TrueDest = getBasicBlock(Record[0]);
       if (!TrueDest)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       if (Record.size() == 1) {
         I = BranchInst::Create(TrueDest);
@@ -2634,7 +2736,7 @@
         Value *Cond = getValue(Record, 2, NextValueNo,
                                Type::getInt1Ty(Context));
         if (!FalseDest || !Cond)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         I = BranchInst::Create(TrueDest, FalseDest, Cond);
         InstructionList.push_back(I);
       }
@@ -2654,7 +2756,7 @@
         Value *Cond = getValue(Record, 2, NextValueNo, OpTy);
         BasicBlock *Default = getBasicBlock(Record[3]);
         if (!OpTy || !Cond || !Default)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
 
         unsigned NumCases = Record[4];
 
@@ -2706,12 +2808,12 @@
       // Old SwitchInst format without case ranges.
 
       if (Record.size() < 3 || (Record.size() & 1) == 0)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       Value *Cond = getValue(Record, 1, NextValueNo, OpTy);
       BasicBlock *Default = getBasicBlock(Record[2]);
       if (!OpTy || !Cond || !Default)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       unsigned NumCases = (Record.size()-3)/2;
       SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
       InstructionList.push_back(SI);
@@ -2721,7 +2823,7 @@
         BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]);
         if (!CaseVal || !DestBB) {
           delete SI;
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         }
         SI->addCase(CaseVal, DestBB);
       }
@@ -2730,11 +2832,11 @@
     }
     case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...]
       if (Record.size() < 2)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       Value *Address = getValue(Record, 1, NextValueNo, OpTy);
       if (!OpTy || !Address)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       unsigned NumDests = Record.size()-2;
       IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests);
       InstructionList.push_back(IBI);
@@ -2743,7 +2845,7 @@
           IBI->addDestination(DestBB);
         } else {
           delete IBI;
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         }
       }
       I = IBI;
@@ -2753,7 +2855,7 @@
     case bitc::FUNC_CODE_INST_INVOKE: {
       // INVOKE: [attrs, cc, normBB, unwindBB, fnty, op0,op1,op2, ...]
       if (Record.size() < 4)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       AttributeSet PAL = getAttributes(Record[0]);
       unsigned CCInfo = Record[1];
       BasicBlock *NormalBB = getBasicBlock(Record[2]);
@@ -2762,7 +2864,7 @@
       unsigned OpNum = 4;
       Value *Callee;
       if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       PointerType *CalleeTy = dyn_cast<PointerType>(Callee->getType());
       FunctionType *FTy = !CalleeTy ? nullptr :
@@ -2771,25 +2873,25 @@
       // Check that the right number of fixed parameters are here.
       if (!FTy || !NormalBB || !UnwindBB ||
           Record.size() < OpNum+FTy->getNumParams())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       SmallVector<Value*, 16> Ops;
       for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
         Ops.push_back(getValue(Record, OpNum, NextValueNo,
                                FTy->getParamType(i)));
         if (!Ops.back())
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
       }
 
       if (!FTy->isVarArg()) {
         if (Record.size() != OpNum)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
       } else {
         // Read type/value pairs for varargs params.
         while (OpNum != Record.size()) {
           Value *Op;
           if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-            return Error(InvalidRecord);
+            return Error(BitcodeError::InvalidRecord);
           Ops.push_back(Op);
         }
       }
@@ -2805,7 +2907,7 @@
       unsigned Idx = 0;
       Value *Val = nullptr;
       if (getValueTypePair(Record, Idx, NextValueNo, Val))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       I = ResumeInst::Create(Val);
       InstructionList.push_back(I);
       break;
@@ -2816,10 +2918,10 @@
       break;
     case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
       if (Record.size() < 1 || ((Record.size()-1)&1))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
       if (!Ty)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2);
       InstructionList.push_back(PN);
@@ -2835,7 +2937,7 @@
           V = getValue(Record, 1+i, NextValueNo, Ty);
         BasicBlock *BB = getBasicBlock(Record[2+i]);
         if (!V || !BB)
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         PN->addIncoming(V, BB);
       }
       I = PN;
@@ -2846,13 +2948,13 @@
       // LANDINGPAD: [ty, val, val, num, (id0,val0 ...)?]
       unsigned Idx = 0;
       if (Record.size() < 4)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *Ty = getTypeByID(Record[Idx++]);
       if (!Ty)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Value *PersFn = nullptr;
       if (getValueTypePair(Record, Idx, NextValueNo, PersFn))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       bool IsCleanup = !!Record[Idx++];
       unsigned NumClauses = Record[Idx++];
@@ -2865,7 +2967,7 @@
 
         if (getValueTypePair(Record, Idx, NextValueNo, Val)) {
           delete LP;
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
         }
 
         assert((CT != LandingPadInst::Catch ||
@@ -2884,15 +2986,19 @@
 
     case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align]
       if (Record.size() != 4)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       PointerType *Ty =
         dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
       Type *OpTy = getTypeByID(Record[1]);
       Value *Size = getFnValueByID(Record[2], OpTy);
-      unsigned Align = Record[3];
+      unsigned AlignRecord = Record[3];
+      bool InAlloca = AlignRecord & (1 << 5);
+      unsigned Align = AlignRecord & ((1 << 5) - 1);
       if (!Ty || !Size)
-        return Error(InvalidRecord);
-      I = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1);
+        return Error(BitcodeError::InvalidRecord);
+      AllocaInst *AI = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1);
+      AI->setUsedWithInAlloca(InAlloca);
+      I = AI;
       InstructionList.push_back(I);
       break;
     }
@@ -2901,7 +3007,7 @@
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+2 != Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1);
       InstructionList.push_back(I);
@@ -2913,15 +3019,14 @@
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+4 != Record.size())
-        return Error(InvalidRecord);
-
+        return Error(BitcodeError::InvalidRecord);
 
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Release ||
           Ordering == AcquireRelease)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       if (Ordering != NotAtomic && Record[OpNum] == 0)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
 
       I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1,
@@ -2936,7 +3041,7 @@
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+2 != Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1);
       InstructionList.push_back(I);
@@ -2950,15 +3055,15 @@
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+4 != Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Acquire ||
           Ordering == AcquireRelease)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
       if (Ordering != NotAtomic && Record[OpNum] == 0)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1,
                         Ordering, SynchScope);
@@ -2976,10 +3081,10 @@
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), New) ||
           (Record.size() < OpNum + 3 || Record.size() > OpNum + 5))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       AtomicOrdering SuccessOrdering = GetDecodedOrdering(Record[OpNum+1]);
       if (SuccessOrdering == NotAtomic || SuccessOrdering == Unordered)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+2]);
 
       AtomicOrdering FailureOrdering;
@@ -3014,14 +3119,14 @@
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+4 != Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       AtomicRMWInst::BinOp Operation = GetDecodedRMWOperation(Record[OpNum]);
       if (Operation < AtomicRMWInst::FIRST_BINOP ||
           Operation > AtomicRMWInst::LAST_BINOP)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Unordered)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
       I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope);
       cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
@@ -3030,11 +3135,11 @@
     }
     case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, synchscope]
       if (2 != Record.size())
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       AtomicOrdering Ordering = GetDecodedOrdering(Record[0]);
       if (Ordering == NotAtomic || Ordering == Unordered ||
           Ordering == Monotonic)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[1]);
       I = new FenceInst(Context, Ordering, SynchScope);
       InstructionList.push_back(I);
@@ -3043,7 +3148,7 @@
     case bitc::FUNC_CODE_INST_CALL: {
       // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...]
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       AttributeSet PAL = getAttributes(Record[0]);
       unsigned CCInfo = Record[1];
@@ -3051,13 +3156,13 @@
       unsigned OpNum = 2;
       Value *Callee;
       if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       PointerType *OpTy = dyn_cast<PointerType>(Callee->getType());
       FunctionType *FTy = nullptr;
       if (OpTy) FTy = dyn_cast<FunctionType>(OpTy->getElementType());
       if (!FTy || Record.size() < FTy->getNumParams()+OpNum)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
 
       SmallVector<Value*, 16> Args;
       // Read the fixed params.
@@ -3068,18 +3173,18 @@
           Args.push_back(getValue(Record, OpNum, NextValueNo,
                                   FTy->getParamType(i)));
         if (!Args.back())
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
       }
 
       // Read type/value pairs for varargs params.
       if (!FTy->isVarArg()) {
         if (OpNum != Record.size())
-          return Error(InvalidRecord);
+          return Error(BitcodeError::InvalidRecord);
       } else {
         while (OpNum != Record.size()) {
           Value *Op;
           if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-            return Error(InvalidRecord);
+            return Error(BitcodeError::InvalidRecord);
           Args.push_back(Op);
         }
       }
@@ -3099,12 +3204,12 @@
     }
     case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty]
       if (Record.size() < 3)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       Value *Op = getValue(Record, 1, NextValueNo, OpTy);
       Type *ResTy = getTypeByID(Record[2]);
       if (!OpTy || !Op || !ResTy)
-        return Error(InvalidRecord);
+        return Error(BitcodeError::InvalidRecord);
       I = new VAArgInst(Op, ResTy);
       InstructionList.push_back(I);
       break;
@@ -3115,7 +3220,7 @@
     // this file.
     if (!CurBB) {
       delete I;
-      return Error(InvalidInstructionWithNoBB);
+      return Error(BitcodeError::InvalidInstructionWithNoBB);
     }
     CurBB->getInstList().push_back(I);
 
@@ -3142,32 +3247,13 @@
           delete A;
         }
       }
-      return Error(NeverResolvedValueFoundInFunction);
+      return Error(BitcodeError::NeverResolvedValueFoundInFunction);
     }
   }
 
   // FIXME: Check for unresolved forward-declared metadata references
   // and clean up leaks.
 
-  // See if anything took the address of blocks in this function.  If so,
-  // resolve them now.
-  DenseMap<Function*, std::vector<BlockAddrRefTy> >::iterator BAFRI =
-    BlockAddrFwdRefs.find(F);
-  if (BAFRI != BlockAddrFwdRefs.end()) {
-    std::vector<BlockAddrRefTy> &RefList = BAFRI->second;
-    for (unsigned i = 0, e = RefList.size(); i != e; ++i) {
-      unsigned BlockIdx = RefList[i].first;
-      if (BlockIdx >= FunctionBBs.size())
-        return Error(InvalidID);
-
-      GlobalVariable *FwdRef = RefList[i].second;
-      FwdRef->replaceAllUsesWith(BlockAddress::get(F, FunctionBBs[BlockIdx]));
-      FwdRef->eraseFromParent();
-    }
-
-    BlockAddrFwdRefs.erase(BAFRI);
-  }
-
   // Trim the value list down to the size it was before we parsed this function.
   ValueList.shrinkTo(ModuleValueListSize);
   MDValueList.shrinkTo(ModuleMDValueListSize);
@@ -3181,7 +3267,7 @@
     DenseMap<Function *, uint64_t>::iterator DeferredFunctionInfoIterator) {
   while (DeferredFunctionInfoIterator->second == 0) {
     if (Stream.AtEndOfStream())
-      return Error(CouldNotFindFunctionInStream);
+      return Error(BitcodeError::CouldNotFindFunctionInStream);
     // ParseModule will parse the next body in the stream and set its
     // position in the DeferredFunctionInfo map.
     if (std::error_code EC = ParseModule(true))
@@ -3196,15 +3282,7 @@
 
 void BitcodeReader::releaseBuffer() { Buffer.release(); }
 
-bool BitcodeReader::isMaterializable(const GlobalValue *GV) const {
-  if (const Function *F = dyn_cast<Function>(GV)) {
-    return F->isDeclaration() &&
-      DeferredFunctionInfo.count(const_cast<Function*>(F));
-  }
-  return false;
-}
-
-std::error_code BitcodeReader::Materialize(GlobalValue *GV) {
+std::error_code BitcodeReader::materialize(GlobalValue *GV) {
   Function *F = dyn_cast<Function>(GV);
   // If it's not a function or is already material, ignore the request.
   if (!F || !F->isMaterializable())
@@ -3223,6 +3301,7 @@
 
   if (std::error_code EC = ParseFunctionBody(F))
     return EC;
+  F->setIsMaterializable(false);
 
   // Upgrade any old intrinsic calls in the function.
   for (UpgradedIntrinsicMap::iterator I = UpgradedIntrinsics.begin(),
@@ -3236,13 +3315,21 @@
     }
   }
 
-  return std::error_code();
+  // Bring in any functions that this function forward-referenced via
+  // blockaddresses.
+  return materializeForwardReferencedFunctions();
 }
 
 bool BitcodeReader::isDematerializable(const GlobalValue *GV) const {
   const Function *F = dyn_cast<Function>(GV);
   if (!F || F->isDeclaration())
     return false;
+
+  // Dematerializing F would leave dangling references that wouldn't be
+  // reconnected on re-materialization.
+  if (BlockAddressesTaken.count(F))
+    return false;
+
   return DeferredFunctionInfo.count(const_cast<Function*>(F));
 }
 
@@ -3255,20 +3342,23 @@
   assert(DeferredFunctionInfo.count(F) && "No info to read function later?");
 
   // Just forget the function body, we can remat it later.
-  F->deleteBody();
+  F->dropAllReferences();
+  F->setIsMaterializable(true);
 }
 
 std::error_code BitcodeReader::MaterializeModule(Module *M) {
   assert(M == TheModule &&
          "Can only Materialize the Module this BitcodeReader is attached to.");
+
+  // Promise to materialize all forward references.
+  WillMaterializeAllForwardRefs = true;
+
   // Iterate over the module, deserializing any functions that are still on
   // disk.
   for (Module::iterator F = TheModule->begin(), E = TheModule->end();
        F != E; ++F) {
-    if (F->isMaterializable()) {
-      if (std::error_code EC = Materialize(F))
-        return EC;
-    }
+    if (std::error_code EC = materialize(F))
+      return EC;
   }
   // At this point, if there are any function bodies, the current bit is
   // pointing to the END_BLOCK record after them. Now make sure the rest
@@ -3276,6 +3366,11 @@
   if (NextUnreadBit)
     ParseModule(true);
 
+  // Check that all block address forward references got resolved (as we
+  // promised above).
+  if (!BasicBlockFwdRefs.empty())
+    return Error(BitcodeError::NeverResolvedFunctionFromBlockAddress);
+
   // Upgrade any intrinsic calls that slipped through (should not happen!) and
   // delete the old functions to clean up. We can't do this unless the entire
   // module is materialized because there could always be another function body
@@ -3312,21 +3407,17 @@
   const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart();
   const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
 
-  if (Buffer->getBufferSize() & 3) {
-    if (!isRawBitcode(BufPtr, BufEnd) && !isBitcodeWrapper(BufPtr, BufEnd))
-      return Error(InvalidBitcodeSignature);
-    else
-      return Error(BitcodeStreamInvalidSize);
-  }
+  if (Buffer->getBufferSize() & 3)
+    return Error(BitcodeError::InvalidBitcodeSignature);
 
   // If we have a wrapper header, parse it and ignore the non-bc file contents.
   // The magic number is 0x0B17C0DE stored in little endian.
   if (isBitcodeWrapper(BufPtr, BufEnd))
     if (SkipBitcodeWrapperHeader(BufPtr, BufEnd, true))
-      return Error(InvalidBitcodeWrapperHeader);
+      return Error(BitcodeError::InvalidBitcodeWrapperHeader);
 
   StreamFile.reset(new BitstreamReader(BufPtr, BufEnd));
-  Stream.init(*StreamFile);
+  Stream.init(&*StreamFile);
 
   return std::error_code();
 }
@@ -3336,14 +3427,14 @@
   // see it.
   StreamingMemoryObject *Bytes = new StreamingMemoryObject(LazyStreamer);
   StreamFile.reset(new BitstreamReader(Bytes));
-  Stream.init(*StreamFile);
+  Stream.init(&*StreamFile);
 
   unsigned char buf[16];
-  if (Bytes->readBytes(0, 16, buf) == -1)
-    return Error(BitcodeStreamInvalidSize);
+  if (Bytes->readBytes(buf, 16, 0) != 16)
+    return Error(BitcodeError::InvalidBitcodeSignature);
 
   if (!isBitcode(buf, buf + 16))
-    return Error(InvalidBitcodeSignature);
+    return Error(BitcodeError::InvalidBitcodeSignature);
 
   if (isBitcodeWrapper(buf, buf + 4)) {
     const unsigned char *bitcodeStart = buf;
@@ -3361,45 +3452,45 @@
     return "llvm.bitcode";
   }
   std::string message(int IE) const override {
-    BitcodeReader::ErrorType E = static_cast<BitcodeReader::ErrorType>(IE);
+    BitcodeError E = static_cast<BitcodeError>(IE);
     switch (E) {
-    case BitcodeReader::BitcodeStreamInvalidSize:
-      return "Bitcode stream length should be >= 16 bytes and a multiple of 4";
-    case BitcodeReader::ConflictingMETADATA_KINDRecords:
+    case BitcodeError::ConflictingMETADATA_KINDRecords:
       return "Conflicting METADATA_KIND records";
-    case BitcodeReader::CouldNotFindFunctionInStream:
+    case BitcodeError::CouldNotFindFunctionInStream:
       return "Could not find function in stream";
-    case BitcodeReader::ExpectedConstant:
+    case BitcodeError::ExpectedConstant:
       return "Expected a constant";
-    case BitcodeReader::InsufficientFunctionProtos:
+    case BitcodeError::InsufficientFunctionProtos:
       return "Insufficient function protos";
-    case BitcodeReader::InvalidBitcodeSignature:
+    case BitcodeError::InvalidBitcodeSignature:
       return "Invalid bitcode signature";
-    case BitcodeReader::InvalidBitcodeWrapperHeader:
+    case BitcodeError::InvalidBitcodeWrapperHeader:
       return "Invalid bitcode wrapper header";
-    case BitcodeReader::InvalidConstantReference:
+    case BitcodeError::InvalidConstantReference:
       return "Invalid ronstant reference";
-    case BitcodeReader::InvalidID:
+    case BitcodeError::InvalidID:
       return "Invalid ID";
-    case BitcodeReader::InvalidInstructionWithNoBB:
+    case BitcodeError::InvalidInstructionWithNoBB:
       return "Invalid instruction with no BB";
-    case BitcodeReader::InvalidRecord:
+    case BitcodeError::InvalidRecord:
       return "Invalid record";
-    case BitcodeReader::InvalidTypeForValue:
+    case BitcodeError::InvalidTypeForValue:
       return "Invalid type for value";
-    case BitcodeReader::InvalidTYPETable:
+    case BitcodeError::InvalidTYPETable:
       return "Invalid TYPE table";
-    case BitcodeReader::InvalidType:
+    case BitcodeError::InvalidType:
       return "Invalid type";
-    case BitcodeReader::MalformedBlock:
+    case BitcodeError::MalformedBlock:
       return "Malformed block";
-    case BitcodeReader::MalformedGlobalInitializerSet:
+    case BitcodeError::MalformedGlobalInitializerSet:
       return "Malformed global initializer set";
-    case BitcodeReader::InvalidMultipleBlocks:
+    case BitcodeError::InvalidMultipleBlocks:
       return "Invalid multiple blocks";
-    case BitcodeReader::NeverResolvedValueFoundInFunction:
+    case BitcodeError::NeverResolvedValueFoundInFunction:
       return "Never resolved value found in function";
-    case BitcodeReader::InvalidValue:
+    case BitcodeError::NeverResolvedFunctionFromBlockAddress:
+      return "Never resolved function from blockaddress";
+    case BitcodeError::InvalidValue:
       return "Invalid value";
     }
     llvm_unreachable("Unknown error type!");
@@ -3407,33 +3498,54 @@
 };
 }
 
-const std::error_category &BitcodeReader::BitcodeErrorCategory() {
-  static BitcodeErrorCategoryType O;
-  return O;
+static ManagedStatic<BitcodeErrorCategoryType> ErrorCategory;
+
+const std::error_category &llvm::BitcodeErrorCategory() {
+  return *ErrorCategory;
 }
 
 //===----------------------------------------------------------------------===//
 // External interface
 //===----------------------------------------------------------------------===//
 
-/// getLazyBitcodeModule - lazy function-at-a-time loading from a file.
+/// \brief Get a lazy one-at-a-time loading module from bitcode.
 ///
-ErrorOr<Module *> llvm::getLazyBitcodeModule(MemoryBuffer *Buffer,
-                                             LLVMContext &Context) {
+/// This isn't always used in a lazy context.  In particular, it's also used by
+/// \a parseBitcodeFile().  If this is truly lazy, then we need to eagerly pull
+/// in forward-referenced functions from block address references.
+///
+/// \param[in] WillMaterializeAll Set to \c true if the caller promises to
+/// materialize everything -- in particular, if this isn't truly lazy.
+static ErrorOr<Module *>
+getLazyBitcodeModuleImpl(std::unique_ptr<MemoryBuffer> &&Buffer,
+                         LLVMContext &Context, bool WillMaterializeAll) {
   Module *M = new Module(Buffer->getBufferIdentifier(), Context);
-  BitcodeReader *R = new BitcodeReader(Buffer, Context);
+  BitcodeReader *R = new BitcodeReader(Buffer.get(), Context);
   M->setMaterializer(R);
-  if (std::error_code EC = R->ParseBitcodeInto(M)) {
+
+  auto cleanupOnError = [&](std::error_code EC) {
     R->releaseBuffer(); // Never take ownership on error.
     delete M;  // Also deletes R.
     return EC;
-  }
+  };
 
-  R->materializeForwardReferencedFunctions();
+  if (std::error_code EC = R->ParseBitcodeInto(M))
+    return cleanupOnError(EC);
 
+  if (!WillMaterializeAll)
+    // Resolve forward references from blockaddresses.
+    if (std::error_code EC = R->materializeForwardReferencedFunctions())
+      return cleanupOnError(EC);
+
+  Buffer.release(); // The BitcodeReader owns it now.
   return M;
 }
 
+ErrorOr<Module *>
+llvm::getLazyBitcodeModule(std::unique_ptr<MemoryBuffer> &&Buffer,
+                           LLVMContext &Context) {
+  return getLazyBitcodeModuleImpl(std::move(Buffer), Context, false);
+}
 
 Module *llvm::getStreamedBitcodeModule(const std::string &name,
                                        DataStreamer *streamer,
@@ -3451,14 +3563,16 @@
   return M;
 }
 
-ErrorOr<Module *> llvm::parseBitcodeFile(MemoryBuffer *Buffer,
+ErrorOr<Module *> llvm::parseBitcodeFile(MemoryBufferRef Buffer,
                                          LLVMContext &Context) {
-  ErrorOr<Module *> ModuleOrErr = getLazyBitcodeModule(Buffer, Context);
+  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
+  ErrorOr<Module *> ModuleOrErr =
+      getLazyBitcodeModuleImpl(std::move(Buf), Context, true);
   if (!ModuleOrErr)
     return ModuleOrErr;
   Module *M = ModuleOrErr.get();
   // Read in the entire module, and destroy the BitcodeReader.
-  if (std::error_code EC = M->materializeAllPermanently(true)) {
+  if (std::error_code EC = M->materializeAllPermanently()) {
     delete M;
     return EC;
   }
@@ -3469,12 +3583,11 @@
   return M;
 }
 
-std::string llvm::getBitcodeTargetTriple(MemoryBuffer *Buffer,
+std::string llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer,
                                          LLVMContext &Context) {
-  BitcodeReader *R = new BitcodeReader(Buffer, Context);
+  std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(Buffer, false);
+  auto R = llvm::make_unique<BitcodeReader>(Buf.release(), Context);
   ErrorOr<std::string> Triple = R->parseTriple();
-  R->releaseBuffer();
-  delete R;
   if (Triple.getError())
     return "";
   return Triple.get();

diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h
index 1d4869a..047fef8 100644
--- a/lib/Bitcode/Reader/BitcodeReader.h
+++ b/lib/Bitcode/Reader/BitcodeReader.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef BITCODE_READER_H
-#define BITCODE_READER_H
+#ifndef LLVM_LIB_BITCODE_READER_BITCODEREADER_H
+#define LLVM_LIB_BITCODE_READER_BITCODEREADER_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Bitcode/BitstreamReader.h"
@@ -22,6 +22,7 @@
 #include "llvm/IR/OperandTraits.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/ValueHandle.h"
+#include <deque>
 #include <system_error>
 #include <vector>
 
@@ -138,7 +139,6 @@
   BitcodeReaderMDValueList MDValueList;
   std::vector<Comdat *> ComdatList;
   SmallVector<Instruction *, 64> InstructionList;
-  SmallVector<SmallVector<uint64_t, 64>, 64> UseListRecords;
 
   std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInits;
   std::vector<std::pair<GlobalAlias*, unsigned> > AliasInits;
@@ -180,10 +180,11 @@
   /// stream.
   DenseMap<Function*, uint64_t> DeferredFunctionInfo;
 
-  /// BlockAddrFwdRefs - These are blockaddr references to basic blocks.  These
-  /// are resolved lazily when functions are loaded.
-  typedef std::pair<unsigned, GlobalVariable*> BlockAddrRefTy;
-  DenseMap<Function*, std::vector<BlockAddrRefTy> > BlockAddrFwdRefs;
+  /// These are basic blocks forward-referenced by block addresses.  They are
+  /// inserted lazily into functions when they're loaded.  The basic block ID is
+  /// its index into the vector.
+  DenseMap<Function *, std::vector<BasicBlock *>> BasicBlockFwdRefs;
+  std::deque<Function *> BasicBlockFwdRefQueue;
 
   /// UseRelativeIDs - Indicates that we are using a new encoding for
   /// instruction operands where most operands in the current
@@ -194,55 +195,36 @@
   /// not need this flag.
   bool UseRelativeIDs;
 
-  static const std::error_category &BitcodeErrorCategory();
+  /// True if all functions will be materialized, negating the need to process
+  /// (e.g.) blockaddress forward references.
+  bool WillMaterializeAllForwardRefs;
+
+  /// Functions that have block addresses taken.  This is usually empty.
+  SmallPtrSet<const Function *, 4> BlockAddressesTaken;
 
 public:
-  enum ErrorType {
-    BitcodeStreamInvalidSize,
-    ConflictingMETADATA_KINDRecords,
-    CouldNotFindFunctionInStream,
-    ExpectedConstant,
-    InsufficientFunctionProtos,
-    InvalidBitcodeSignature,
-    InvalidBitcodeWrapperHeader,
-    InvalidConstantReference,
-    InvalidID, // A read identifier is not found in the table it should be in.
-    InvalidInstructionWithNoBB,
-    InvalidRecord, // A read record doesn't have the expected size or structure
-    InvalidTypeForValue, // Type read OK, but is invalid for its use
-    InvalidTYPETable,
-    InvalidType, // We were unable to read a type
-    MalformedBlock, // We are unable to advance in the stream.
-    MalformedGlobalInitializerSet,
-    InvalidMultipleBlocks, // We found multiple blocks of a kind that should
-                           // have only one
-    NeverResolvedValueFoundInFunction,
-    InvalidValue // Invalid version, inst number, attr number, etc
-  };
-
-  std::error_code Error(ErrorType E) {
-    return std::error_code(E, BitcodeErrorCategory());
-  }
+  std::error_code Error(BitcodeError E) { return make_error_code(E); }
 
   explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C)
       : Context(C), TheModule(nullptr), Buffer(buffer), LazyStreamer(nullptr),
         NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C),
-        MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false) {}
+        MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false),
+        WillMaterializeAllForwardRefs(false) {}
   explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C)
       : Context(C), TheModule(nullptr), Buffer(nullptr), LazyStreamer(streamer),
         NextUnreadBit(0), SeenValueSymbolTable(false), ValueList(C),
-        MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false) {}
+        MDValueList(C), SeenFirstFunctionBody(false), UseRelativeIDs(false),
+        WillMaterializeAllForwardRefs(false) {}
   ~BitcodeReader() { FreeState(); }
 
-  void materializeForwardReferencedFunctions();
+  std::error_code materializeForwardReferencedFunctions();
 
   void FreeState();
 
-  void releaseBuffer() override;
+  void releaseBuffer();
 
-  bool isMaterializable(const GlobalValue *GV) const override;
   bool isDematerializable(const GlobalValue *GV) const override;
-  std::error_code Materialize(GlobalValue *GV) override;
+  std::error_code materialize(GlobalValue *GV) override;
   std::error_code MaterializeModule(Module *M) override;
   void Dematerialize(GlobalValue *GV) override;
 

diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp
index 72451ec..5e3232e 100644
--- a/lib/Bitcode/Reader/BitstreamReader.cpp
+++ b/lib/Bitcode/Reader/BitstreamReader.cpp

@@ -15,41 +15,11 @@
 //  BitstreamCursor implementation
 //===----------------------------------------------------------------------===//
 
-void BitstreamCursor::operator=(const BitstreamCursor &RHS) {
-  freeState();
-
-  BitStream = RHS.BitStream;
-  NextChar = RHS.NextChar;
-  CurWord = RHS.CurWord;
-  BitsInCurWord = RHS.BitsInCurWord;
-  CurCodeSize = RHS.CurCodeSize;
-
-  // Copy abbreviations, and bump ref counts.
-  CurAbbrevs = RHS.CurAbbrevs;
-  for (size_t i = 0, e = CurAbbrevs.size(); i != e; ++i)
-    CurAbbrevs[i]->addRef();
-
-  // Copy block scope and bump ref counts.
-  BlockScope = RHS.BlockScope;
-  for (size_t S = 0, e = BlockScope.size(); S != e; ++S) {
-    std::vector<BitCodeAbbrev*> &Abbrevs = BlockScope[S].PrevAbbrevs;
-    for (size_t i = 0, e = Abbrevs.size(); i != e; ++i)
-      Abbrevs[i]->addRef();
-  }
-}
-
 void BitstreamCursor::freeState() {
   // Free all the Abbrevs.
-  for (size_t i = 0, e = CurAbbrevs.size(); i != e; ++i)
-    CurAbbrevs[i]->dropRef();
   CurAbbrevs.clear();
 
   // Free all the Abbrevs in the block scope.
-  for (size_t S = 0, e = BlockScope.size(); S != e; ++S) {
-    std::vector<BitCodeAbbrev*> &Abbrevs = BlockScope[S].PrevAbbrevs;
-    for (size_t i = 0, e = Abbrevs.size(); i != e; ++i)
-      Abbrevs[i]->dropRef();
-  }
   BlockScope.clear();
 }
 
@@ -63,10 +33,8 @@
   // Add the abbrevs specific to this block to the CurAbbrevs list.
   if (const BitstreamReader::BlockInfo *Info =
       BitStream->getBlockInfo(BlockID)) {
-    for (size_t i = 0, e = Info->Abbrevs.size(); i != e; ++i) {
-      CurAbbrevs.push_back(Info->Abbrevs[i]);
-      CurAbbrevs.back()->addRef();
-    }
+    CurAbbrevs.insert(CurAbbrevs.end(), Info->Abbrevs.begin(),
+                      Info->Abbrevs.end());
   }
 
   // Get the codesize of this block.
@@ -82,16 +50,9 @@
   return false;
 }
 
-void BitstreamCursor::readAbbreviatedLiteral(const BitCodeAbbrevOp &Op,
-                                             SmallVectorImpl<uint64_t> &Vals) {
-  assert(Op.isLiteral() && "Not a literal");
-  // If the abbrev specifies the literal value to use, use it.
-  Vals.push_back(Op.getLiteralValue());
-}
-
-void BitstreamCursor::readAbbreviatedField(const BitCodeAbbrevOp &Op,
-                                           SmallVectorImpl<uint64_t> &Vals) {
-  assert(!Op.isLiteral() && "Use ReadAbbreviatedLiteral for literals!");
+static uint64_t readAbbreviatedField(BitstreamCursor &Cursor,
+                                     const BitCodeAbbrevOp &Op) {
+  assert(!Op.isLiteral() && "Not to be used with literals!");
 
   // Decode the value as we are commanded.
   switch (Op.getEncoding()) {
@@ -99,19 +60,18 @@
   case BitCodeAbbrevOp::Blob:
     llvm_unreachable("Should not reach here");
   case BitCodeAbbrevOp::Fixed:
-    Vals.push_back(Read((unsigned)Op.getEncodingData()));
-    break;
+    return Cursor.Read((unsigned)Op.getEncodingData());
   case BitCodeAbbrevOp::VBR:
-    Vals.push_back(ReadVBR64((unsigned)Op.getEncodingData()));
-    break;
+    return Cursor.ReadVBR64((unsigned)Op.getEncodingData());
   case BitCodeAbbrevOp::Char6:
-    Vals.push_back(BitCodeAbbrevOp::DecodeChar6(Read(6)));
-    break;
+    return BitCodeAbbrevOp::DecodeChar6(Cursor.Read(6));
   }
+  llvm_unreachable("invalid abbreviation encoding");
 }
 
-void BitstreamCursor::skipAbbreviatedField(const BitCodeAbbrevOp &Op) {
-  assert(!Op.isLiteral() && "Use ReadAbbreviatedLiteral for literals!");
+static void skipAbbreviatedField(BitstreamCursor &Cursor,
+                                 const BitCodeAbbrevOp &Op) {
+  assert(!Op.isLiteral() && "Not to be used with literals!");
 
   // Decode the value as we are commanded.
   switch (Op.getEncoding()) {
@@ -119,13 +79,13 @@
   case BitCodeAbbrevOp::Blob:
     llvm_unreachable("Should not reach here");
   case BitCodeAbbrevOp::Fixed:
-    (void)Read((unsigned)Op.getEncodingData());
+    Cursor.Read((unsigned)Op.getEncodingData());
     break;
   case BitCodeAbbrevOp::VBR:
-    (void)ReadVBR64((unsigned)Op.getEncodingData());
+    Cursor.ReadVBR64((unsigned)Op.getEncodingData());
     break;
   case BitCodeAbbrevOp::Char6:
-    (void)Read(6);
+    Cursor.Read(6);
     break;
   }
 }
@@ -153,7 +113,7 @@
 
     if (Op.getEncoding() != BitCodeAbbrevOp::Array &&
         Op.getEncoding() != BitCodeAbbrevOp::Blob) {
-      skipAbbreviatedField(Op);
+      skipAbbreviatedField(*this, Op);
       continue;
     }
 
@@ -167,7 +127,7 @@
 
       // Read all the elements.
       for (; NumElts; --NumElts)
-        skipAbbreviatedField(EltEnc);
+        skipAbbreviatedField(*this, EltEnc);
       continue;
     }
 
@@ -207,22 +167,22 @@
   // Read the record code first.
   assert(Abbv->getNumOperandInfos() != 0 && "no record code in abbreviation?");
   const BitCodeAbbrevOp &CodeOp = Abbv->getOperandInfo(0);
+  unsigned Code;
   if (CodeOp.isLiteral())
-    readAbbreviatedLiteral(CodeOp, Vals);
+    Code = CodeOp.getLiteralValue();
   else
-    readAbbreviatedField(CodeOp, Vals);
-  unsigned Code = (unsigned)Vals.pop_back_val();
+    Code = readAbbreviatedField(*this, CodeOp);
 
   for (unsigned i = 1, e = Abbv->getNumOperandInfos(); i != e; ++i) {
     const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i);
     if (Op.isLiteral()) {
-      readAbbreviatedLiteral(Op, Vals);
+      Vals.push_back(Op.getLiteralValue());
       continue;
     }
 
     if (Op.getEncoding() != BitCodeAbbrevOp::Array &&
         Op.getEncoding() != BitCodeAbbrevOp::Blob) {
-      readAbbreviatedField(Op, Vals);
+      Vals.push_back(readAbbreviatedField(*this, Op));
       continue;
     }
 
@@ -236,7 +196,7 @@
 
       // Read all the elements.
       for (; NumElts; --NumElts)
-        readAbbreviatedField(EltEnc, Vals);
+        Vals.push_back(readAbbreviatedField(*this, EltEnc));
       continue;
     }
 
@@ -339,9 +299,8 @@
 
       // ReadAbbrevRecord installs the abbrev in CurAbbrevs.  Move it to the
       // appropriate BlockInfo.
-      BitCodeAbbrev *Abbv = CurAbbrevs.back();
+      CurBlockInfo->Abbrevs.push_back(std::move(CurAbbrevs.back()));
       CurAbbrevs.pop_back();
-      CurBlockInfo->Abbrevs.push_back(Abbv);
       continue;
     }
 

diff --git a/lib/Bitcode/Writer/BitWriter.cpp b/lib/Bitcode/Writer/BitWriter.cpp
index 3747122..7218ea0 100644
--- a/lib/Bitcode/Writer/BitWriter.cpp
+++ b/lib/Bitcode/Writer/BitWriter.cpp

@@ -18,10 +18,10 @@
 /*===-- Operations on modules ---------------------------------------------===*/
 
 int LLVMWriteBitcodeToFile(LLVMModuleRef M, const char *Path) {
-  std::string ErrorInfo;
-  raw_fd_ostream OS(Path, ErrorInfo, sys::fs::F_None);
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC, sys::fs::F_None);
 
-  if (!ErrorInfo.empty())
+  if (EC)
     return -1;
 
   WriteBitcodeToFile(unwrap(M), OS);
@@ -39,3 +39,11 @@
 int LLVMWriteBitcodeToFileHandle(LLVMModuleRef M, int FileHandle) {
   return LLVMWriteBitcodeToFD(M, FileHandle, true, false);
 }
+
+LLVMMemoryBufferRef LLVMWriteBitcodeToMemoryBuffer(LLVMModuleRef M) {
+  std::string Data;
+  raw_string_ostream OS(Data);
+
+  WriteBitcodeToFile(unwrap(M), OS);
+  return wrap(MemoryBuffer::getMemBufferCopy(OS.str()).release());
+}

diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index dd9282a..6cfc357 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp

@@ -22,6 +22,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/UseListOrder.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -32,12 +33,6 @@
 #include <map>
 using namespace llvm;
 
-static cl::opt<bool>
-EnablePreserveUseListOrdering("enable-bc-uselist-preserve",
-                              cl::desc("Turn on experimental support for "
-                                       "use-list order preservation."),
-                              cl::init(false), cl::Hidden);
-
 /// These are manifest constants used by the bitcode writer. They do not need to
 /// be kept in sync with the reader, but need to be consistent within this file.
 enum {
@@ -201,6 +196,8 @@
     return bitc::ATTR_KIND_NON_LAZY_BIND;
   case Attribute::NonNull:
     return bitc::ATTR_KIND_NON_NULL;
+  case Attribute::Dereferenceable:
+    return bitc::ATTR_KIND_DEREFERENCEABLE;
   case Attribute::NoRedZone:
     return bitc::ATTR_KIND_NO_RED_ZONE;
   case Attribute::NoReturn:
@@ -272,7 +269,7 @@
         if (Attr.isEnumAttribute()) {
           Record.push_back(0);
           Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum()));
-        } else if (Attr.isAlignAttribute()) {
+        } else if (Attr.isIntAttribute()) {
           Record.push_back(1);
           Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum()));
           Record.push_back(Attr.getValueAsInt());
@@ -713,18 +710,15 @@
 static uint64_t GetOptimizationFlags(const Value *V) {
   uint64_t Flags = 0;
 
-  if (const OverflowingBinaryOperator *OBO =
-        dyn_cast<OverflowingBinaryOperator>(V)) {
+  if (const auto *OBO = dyn_cast<OverflowingBinaryOperator>(V)) {
     if (OBO->hasNoSignedWrap())
       Flags |= 1 << bitc::OBO_NO_SIGNED_WRAP;
     if (OBO->hasNoUnsignedWrap())
       Flags |= 1 << bitc::OBO_NO_UNSIGNED_WRAP;
-  } else if (const PossiblyExactOperator *PEO =
-               dyn_cast<PossiblyExactOperator>(V)) {
+  } else if (const auto *PEO = dyn_cast<PossiblyExactOperator>(V)) {
     if (PEO->isExact())
       Flags |= 1 << bitc::PEO_EXACT;
-  } else if (const FPMathOperator *FPMO =
-             dyn_cast<const FPMathOperator>(V)) {
+  } else if (const auto *FPMO = dyn_cast<FPMathOperator>(V)) {
     if (FPMO->hasUnsafeAlgebra())
       Flags |= FastMathFlags::UnsafeAlgebra;
     if (FPMO->hasNoNaNs())
@@ -762,13 +756,13 @@
 static void WriteModuleMetadata(const Module *M,
                                 const ValueEnumerator &VE,
                                 BitstreamWriter &Stream) {
-  const ValueEnumerator::ValueList &Vals = VE.getMDValues();
+  const auto &Vals = VE.getMDValues();
   bool StartedMetadataBlock = false;
   unsigned MDSAbbrev = 0;
   SmallVector<uint64_t, 64> Record;
   for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
 
-    if (const MDNode *N = dyn_cast<MDNode>(Vals[i].first)) {
+    if (const MDNode *N = dyn_cast<MDNode>(Vals[i])) {
       if (!N->isFunctionLocal() || !N->getFunction()) {
         if (!StartedMetadataBlock) {
           Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
@@ -776,7 +770,7 @@
         }
         WriteMDNode(N, VE, Stream, Record);
       }
-    } else if (const MDString *MDS = dyn_cast<MDString>(Vals[i].first)) {
+    } else if (const MDString *MDS = dyn_cast<MDString>(Vals[i])) {
       if (!StartedMetadataBlock)  {
         Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
 
@@ -854,7 +848,7 @@
 
   // Write metadata attachments
   // METADATA_ATTACHMENT - [m x [value, [n x [id, mdnode]]]
-  SmallVector<std::pair<unsigned, MDNode*>, 4> MDs;
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
 
   for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
     for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
@@ -1431,13 +1425,20 @@
     break;
   }
 
-  case Instruction::Alloca:
+  case Instruction::Alloca: {
     Code = bitc::FUNC_CODE_INST_ALLOCA;
     Vals.push_back(VE.getTypeID(I.getType()));
     Vals.push_back(VE.getTypeID(I.getOperand(0)->getType()));
     Vals.push_back(VE.getValueID(I.getOperand(0))); // size.
-    Vals.push_back(Log2_32(cast<AllocaInst>(I).getAlignment())+1);
+    const AllocaInst &AI = cast<AllocaInst>(I);
+    unsigned AlignRecord = Log2_32(AI.getAlignment()) + 1;
+    assert(Log2_32(Value::MaximumAlignment) + 1 < 1 << 5 &&
+           "not enough bits for maximum alignment");
+    assert(AlignRecord < 1 << 5 && "alignment greater than 1 << 64");
+    AlignRecord |= AI.isUsedWithInAlloca() << 5;
+    Vals.push_back(AlignRecord);
     break;
+  }
 
   case Instruction::Load:
     if (cast<LoadInst>(I).isAtomic()) {
@@ -1598,6 +1599,39 @@
   Stream.ExitBlock();
 }
 
+static void WriteUseList(ValueEnumerator &VE, UseListOrder &&Order,
+                         BitstreamWriter &Stream) {
+  assert(Order.Shuffle.size() >= 2 && "Shuffle too small");
+  unsigned Code;
+  if (isa<BasicBlock>(Order.V))
+    Code = bitc::USELIST_CODE_BB;
+  else
+    Code = bitc::USELIST_CODE_DEFAULT;
+
+  SmallVector<uint64_t, 64> Record;
+  for (unsigned I : Order.Shuffle)
+    Record.push_back(I);
+  Record.push_back(VE.getValueID(Order.V));
+  Stream.EmitRecord(Code, Record);
+}
+
+static void WriteUseListBlock(const Function *F, ValueEnumerator &VE,
+                              BitstreamWriter &Stream) {
+  auto hasMore = [&]() {
+    return !VE.UseListOrders.empty() && VE.UseListOrders.back().F == F;
+  };
+  if (!hasMore())
+    // Nothing to do.
+    return;
+
+  Stream.EnterSubblock(bitc::USELIST_BLOCK_ID, 3);
+  while (hasMore()) {
+    WriteUseList(VE, std::move(VE.UseListOrders.back()), Stream);
+    VE.UseListOrders.pop_back();
+  }
+  Stream.ExitBlock();
+}
+
 /// WriteFunction - Emit a function body to the module stream.
 static void WriteFunction(const Function &F, ValueEnumerator &VE,
                           BitstreamWriter &Stream) {
@@ -1666,6 +1700,8 @@
 
   if (NeedsMetadataAttachment)
     WriteMetadataAttachment(F, VE, Stream);
+  if (shouldPreserveBitcodeUseListOrder())
+    WriteUseListBlock(&F, VE, Stream);
   VE.purgeFunction();
   Stream.ExitBlock();
 }
@@ -1831,98 +1867,6 @@
   Stream.ExitBlock();
 }
 
-// Sort the Users based on the order in which the reader parses the bitcode
-// file.
-static bool bitcodereader_order(const User *lhs, const User *rhs) {
-  // TODO: Implement.
-  return true;
-}
-
-static void WriteUseList(const Value *V, const ValueEnumerator &VE,
-                         BitstreamWriter &Stream) {
-
-  // One or zero uses can't get out of order.
-  if (V->use_empty() || V->hasNUses(1))
-    return;
-
-  // Make a copy of the in-memory use-list for sorting.
-  SmallVector<const User*, 8> UserList(V->user_begin(), V->user_end());
-
-  // Sort the copy based on the order read by the BitcodeReader.
-  std::sort(UserList.begin(), UserList.end(), bitcodereader_order);
-
-  // TODO: Generate a diff between the BitcodeWriter in-memory use-list and the
-  // sorted list (i.e., the expected BitcodeReader in-memory use-list).
-
-  // TODO: Emit the USELIST_CODE_ENTRYs.
-}
-
-static void WriteFunctionUseList(const Function *F, ValueEnumerator &VE,
-                                 BitstreamWriter &Stream) {
-  VE.incorporateFunction(*F);
-
-  for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
-       AI != AE; ++AI)
-    WriteUseList(AI, VE, Stream);
-  for (Function::const_iterator BB = F->begin(), FE = F->end(); BB != FE;
-       ++BB) {
-    WriteUseList(BB, VE, Stream);
-    for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); II != IE;
-         ++II) {
-      WriteUseList(II, VE, Stream);
-      for (User::const_op_iterator OI = II->op_begin(), E = II->op_end();
-           OI != E; ++OI) {
-        if ((isa<Constant>(*OI) && !isa<GlobalValue>(*OI)) ||
-            isa<InlineAsm>(*OI))
-          WriteUseList(*OI, VE, Stream);
-      }
-    }
-  }
-  VE.purgeFunction();
-}
-
-// Emit use-lists.
-static void WriteModuleUseLists(const Module *M, ValueEnumerator &VE,
-                                BitstreamWriter &Stream) {
-  Stream.EnterSubblock(bitc::USELIST_BLOCK_ID, 3);
-
-  // XXX: this modifies the module, but in a way that should never change the
-  // behavior of any pass or codegen in LLVM. The problem is that GVs may
-  // contain entries in the use_list that do not exist in the Module and are
-  // not stored in the .bc file.
-  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
-       I != E; ++I)
-    I->removeDeadConstantUsers();
-
-  // Write the global variables.
-  for (Module::const_global_iterator GI = M->global_begin(),
-         GE = M->global_end(); GI != GE; ++GI) {
-    WriteUseList(GI, VE, Stream);
-
-    // Write the global variable initializers.
-    if (GI->hasInitializer())
-      WriteUseList(GI->getInitializer(), VE, Stream);
-  }
-
-  // Write the functions.
-  for (Module::const_iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI) {
-    WriteUseList(FI, VE, Stream);
-    if (!FI->isDeclaration())
-      WriteFunctionUseList(FI, VE, Stream);
-    if (FI->hasPrefixData())
-      WriteUseList(FI->getPrefixData(), VE, Stream);
-  }
-
-  // Write the aliases.
-  for (Module::const_alias_iterator AI = M->alias_begin(), AE = M->alias_end();
-       AI != AE; ++AI) {
-    WriteUseList(AI, VE, Stream);
-    WriteUseList(AI->getAliasee(), VE, Stream);
-  }
-
-  Stream.ExitBlock();
-}
-
 /// WriteModule - Emit the specified module to the bitstream.
 static void WriteModule(const Module *M, BitstreamWriter &Stream) {
   Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3);
@@ -1933,7 +1877,7 @@
   Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals);
 
   // Analyze the module, enumerating globals, functions, etc.
-  ValueEnumerator VE(M);
+  ValueEnumerator VE(*M);
 
   // Emit blockinfo, which defines the standard abbreviations etc.
   WriteBlockInfo(VE, Stream);
@@ -1965,9 +1909,9 @@
   // Emit names for globals/functions etc.
   WriteValueSymbolTable(M->getValueSymbolTable(), VE, Stream);
 
-  // Emit use-lists.
-  if (EnablePreserveUseListOrdering)
-    WriteModuleUseLists(M, VE, Stream);
+  // Emit module-level use-lists.
+  if (shouldPreserveBitcodeUseListOrder())
+    WriteUseListBlock(nullptr, VE, Stream);
 
   // Emit function bodies.
   for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F)

diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 15f8034..f065c83 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp

@@ -18,31 +18,280 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/UseListOrder.h"
 #include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 using namespace llvm;
 
+namespace {
+struct OrderMap {
+  DenseMap<const Value *, std::pair<unsigned, bool>> IDs;
+  unsigned LastGlobalConstantID;
+  unsigned LastGlobalValueID;
+
+  OrderMap() : LastGlobalConstantID(0), LastGlobalValueID(0) {}
+
+  bool isGlobalConstant(unsigned ID) const {
+    return ID <= LastGlobalConstantID;
+  }
+  bool isGlobalValue(unsigned ID) const {
+    return ID <= LastGlobalValueID && !isGlobalConstant(ID);
+  }
+
+  unsigned size() const { return IDs.size(); }
+  std::pair<unsigned, bool> &operator[](const Value *V) { return IDs[V]; }
+  std::pair<unsigned, bool> lookup(const Value *V) const {
+    return IDs.lookup(V);
+  }
+  void index(const Value *V) {
+    // Explicitly sequence get-size and insert-value operations to avoid UB.
+    unsigned ID = IDs.size() + 1;
+    IDs[V].first = ID;
+  }
+};
+}
+
+static void orderValue(const Value *V, OrderMap &OM) {
+  if (OM.lookup(V).first)
+    return;
+
+  if (const Constant *C = dyn_cast<Constant>(V))
+    if (C->getNumOperands() && !isa<GlobalValue>(C))
+      for (const Value *Op : C->operands())
+        if (!isa<BasicBlock>(Op) && !isa<GlobalValue>(Op))
+          orderValue(Op, OM);
+
+  // Note: we cannot cache this lookup above, since inserting into the map
+  // changes the map's size, and thus affects the other IDs.
+  OM.index(V);
+}
+
+static OrderMap orderModule(const Module &M) {
+  // This needs to match the order used by ValueEnumerator::ValueEnumerator()
+  // and ValueEnumerator::incorporateFunction().
+  OrderMap OM;
+
+  // In the reader, initializers of GlobalValues are set *after* all the
+  // globals have been read.  Rather than awkwardly modeling this behaviour
+  // directly in predictValueUseListOrderImpl(), just assign IDs to
+  // initializers of GlobalValues before GlobalValues themselves to model this
+  // implicitly.
+  for (const GlobalVariable &G : M.globals())
+    if (G.hasInitializer())
+      if (!isa<GlobalValue>(G.getInitializer()))
+        orderValue(G.getInitializer(), OM);
+  for (const GlobalAlias &A : M.aliases())
+    if (!isa<GlobalValue>(A.getAliasee()))
+      orderValue(A.getAliasee(), OM);
+  for (const Function &F : M)
+    if (F.hasPrefixData())
+      if (!isa<GlobalValue>(F.getPrefixData()))
+        orderValue(F.getPrefixData(), OM);
+  OM.LastGlobalConstantID = OM.size();
+
+  // Initializers of GlobalValues are processed in
+  // BitcodeReader::ResolveGlobalAndAliasInits().  Match the order there rather
+  // than ValueEnumerator, and match the code in predictValueUseListOrderImpl()
+  // by giving IDs in reverse order.
+  //
+  // Since GlobalValues never reference each other directly (just through
+  // initializers), their relative IDs only matter for determining order of
+  // uses in their initializers.
+  for (const Function &F : M)
+    orderValue(&F, OM);
+  for (const GlobalAlias &A : M.aliases())
+    orderValue(&A, OM);
+  for (const GlobalVariable &G : M.globals())
+    orderValue(&G, OM);
+  OM.LastGlobalValueID = OM.size();
+
+  for (const Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    // Here we need to match the union of ValueEnumerator::incorporateFunction()
+    // and WriteFunction().  Basic blocks are implicitly declared before
+    // anything else (by declaring their size).
+    for (const BasicBlock &BB : F)
+      orderValue(&BB, OM);
+    for (const Argument &A : F.args())
+      orderValue(&A, OM);
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB)
+        for (const Value *Op : I.operands())
+          if ((isa<Constant>(*Op) && !isa<GlobalValue>(*Op)) ||
+              isa<InlineAsm>(*Op))
+            orderValue(Op, OM);
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB)
+        orderValue(&I, OM);
+  }
+  return OM;
+}
+
+static void predictValueUseListOrderImpl(const Value *V, const Function *F,
+                                         unsigned ID, const OrderMap &OM,
+                                         UseListOrderStack &Stack) {
+  // Predict use-list order for this one.
+  typedef std::pair<const Use *, unsigned> Entry;
+  SmallVector<Entry, 64> List;
+  for (const Use &U : V->uses())
+    // Check if this user will be serialized.
+    if (OM.lookup(U.getUser()).first)
+      List.push_back(std::make_pair(&U, List.size()));
+
+  if (List.size() < 2)
+    // We may have lost some users.
+    return;
+
+  bool IsGlobalValue = OM.isGlobalValue(ID);
+  std::sort(List.begin(), List.end(), [&](const Entry &L, const Entry &R) {
+    const Use *LU = L.first;
+    const Use *RU = R.first;
+    if (LU == RU)
+      return false;
+
+    auto LID = OM.lookup(LU->getUser()).first;
+    auto RID = OM.lookup(RU->getUser()).first;
+
+    // Global values are processed in reverse order.
+    //
+    // Moreover, initializers of GlobalValues are set *after* all the globals
+    // have been read (despite having earlier IDs).  Rather than awkwardly
+    // modeling this behaviour here, orderModule() has assigned IDs to
+    // initializers of GlobalValues before GlobalValues themselves.
+    if (OM.isGlobalValue(LID) && OM.isGlobalValue(RID))
+      return LID < RID;
+
+    // If ID is 4, then expect: 7 6 5 1 2 3.
+    if (LID < RID) {
+      if (RID <= ID)
+        if (!IsGlobalValue) // GlobalValue uses don't get reversed.
+          return true;
+      return false;
+    }
+    if (RID < LID) {
+      if (LID <= ID)
+        if (!IsGlobalValue) // GlobalValue uses don't get reversed.
+          return false;
+      return true;
+    }
+
+    // LID and RID are equal, so we have different operands of the same user.
+    // Assume operands are added in order for all instructions.
+    if (LID <= ID)
+      if (!IsGlobalValue) // GlobalValue uses don't get reversed.
+        return LU->getOperandNo() < RU->getOperandNo();
+    return LU->getOperandNo() > RU->getOperandNo();
+  });
+
+  if (std::is_sorted(
+          List.begin(), List.end(),
+          [](const Entry &L, const Entry &R) { return L.second < R.second; }))
+    // Order is already correct.
+    return;
+
+  // Store the shuffle.
+  Stack.emplace_back(V, F, List.size());
+  assert(List.size() == Stack.back().Shuffle.size() && "Wrong size");
+  for (size_t I = 0, E = List.size(); I != E; ++I)
+    Stack.back().Shuffle[I] = List[I].second;
+}
+
+static void predictValueUseListOrder(const Value *V, const Function *F,
+                                     OrderMap &OM, UseListOrderStack &Stack) {
+  auto &IDPair = OM[V];
+  assert(IDPair.first && "Unmapped value");
+  if (IDPair.second)
+    // Already predicted.
+    return;
+
+  // Do the actual prediction.
+  IDPair.second = true;
+  if (!V->use_empty() && std::next(V->use_begin()) != V->use_end())
+    predictValueUseListOrderImpl(V, F, IDPair.first, OM, Stack);
+
+  // Recursive descent into constants.
+  if (const Constant *C = dyn_cast<Constant>(V))
+    if (C->getNumOperands()) // Visit GlobalValues.
+      for (const Value *Op : C->operands())
+        if (isa<Constant>(Op)) // Visit GlobalValues.
+          predictValueUseListOrder(Op, F, OM, Stack);
+}
+
+static UseListOrderStack predictUseListOrder(const Module &M) {
+  OrderMap OM = orderModule(M);
+
+  // Use-list orders need to be serialized after all the users have been added
+  // to a value, or else the shuffles will be incomplete.  Store them per
+  // function in a stack.
+  //
+  // Aside from function order, the order of values doesn't matter much here.
+  UseListOrderStack Stack;
+
+  // We want to visit the functions backward now so we can list function-local
+  // constants in the last Function they're used in.  Module-level constants
+  // have already been visited above.
+  for (auto I = M.rbegin(), E = M.rend(); I != E; ++I) {
+    const Function &F = *I;
+    if (F.isDeclaration())
+      continue;
+    for (const BasicBlock &BB : F)
+      predictValueUseListOrder(&BB, &F, OM, Stack);
+    for (const Argument &A : F.args())
+      predictValueUseListOrder(&A, &F, OM, Stack);
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB)
+        for (const Value *Op : I.operands())
+          if (isa<Constant>(*Op) || isa<InlineAsm>(*Op)) // Visit GlobalValues.
+            predictValueUseListOrder(Op, &F, OM, Stack);
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB)
+        predictValueUseListOrder(&I, &F, OM, Stack);
+  }
+
+  // Visit globals last, since the module-level use-list block will be seen
+  // before the function bodies are processed.
+  for (const GlobalVariable &G : M.globals())
+    predictValueUseListOrder(&G, nullptr, OM, Stack);
+  for (const Function &F : M)
+    predictValueUseListOrder(&F, nullptr, OM, Stack);
+  for (const GlobalAlias &A : M.aliases())
+    predictValueUseListOrder(&A, nullptr, OM, Stack);
+  for (const GlobalVariable &G : M.globals())
+    if (G.hasInitializer())
+      predictValueUseListOrder(G.getInitializer(), nullptr, OM, Stack);
+  for (const GlobalAlias &A : M.aliases())
+    predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack);
+  for (const Function &F : M)
+    if (F.hasPrefixData())
+      predictValueUseListOrder(F.getPrefixData(), nullptr, OM, Stack);
+
+  return Stack;
+}
+
 static bool isIntOrIntVectorValue(const std::pair<const Value*, unsigned> &V) {
   return V.first->getType()->isIntOrIntVectorTy();
 }
 
-/// ValueEnumerator - Enumerate module-level information.
-ValueEnumerator::ValueEnumerator(const Module *M) {
+ValueEnumerator::ValueEnumerator(const Module &M) {
+  if (shouldPreserveBitcodeUseListOrder())
+    UseListOrders = predictUseListOrder(M);
+
   // Enumerate the global variables.
-  for (Module::const_global_iterator I = M->global_begin(),
-         E = M->global_end(); I != E; ++I)
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
     EnumerateValue(I);
 
   // Enumerate the functions.
-  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+  for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
     EnumerateValue(I);
     EnumerateAttributes(cast<Function>(I)->getAttributes());
   }
 
   // Enumerate the aliases.
-  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+  for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
        I != E; ++I)
     EnumerateValue(I);
 
@@ -50,30 +299,30 @@
   unsigned FirstConstant = Values.size();
 
   // Enumerate the global variable initializers.
-  for (Module::const_global_iterator I = M->global_begin(),
-         E = M->global_end(); I != E; ++I)
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
     if (I->hasInitializer())
       EnumerateValue(I->getInitializer());
 
   // Enumerate the aliasees.
-  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+  for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
        I != E; ++I)
     EnumerateValue(I->getAliasee());
 
   // Enumerate the prefix data constants.
-  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
+  for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
     if (I->hasPrefixData())
       EnumerateValue(I->getPrefixData());
 
   // Insert constants and metadata that are named at module level into the slot
   // pool so that the module symbol table can refer to them...
-  EnumerateValueSymbolTable(M->getValueSymbolTable());
+  EnumerateValueSymbolTable(M.getValueSymbolTable());
   EnumerateNamedMetadata(M);
 
-  SmallVector<std::pair<unsigned, MDNode*>, 8> MDs;
+  SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
 
   // Enumerate types used by function bodies and argument lists.
-  for (const Function &F : *M) {
+  for (const Function &F : M) {
     for (const Argument &A : F.args())
       EnumerateType(A.getType());
 
@@ -179,6 +428,11 @@
 void ValueEnumerator::OptimizeConstants(unsigned CstStart, unsigned CstEnd) {
   if (CstStart == CstEnd || CstStart+1 == CstEnd) return;
 
+  if (shouldPreserveBitcodeUseListOrder())
+    // Optimizing constants makes the use-list order difficult to predict.
+    // Disable it for now when trying to preserve the order.
+    return;
+
   std::stable_sort(Values.begin() + CstStart, Values.begin() + CstEnd,
                    [this](const std::pair<const Value *, unsigned> &LHS,
                           const std::pair<const Value *, unsigned> &RHS) {
@@ -209,11 +463,12 @@
     EnumerateValue(VI->getValue());
 }
 
-/// EnumerateNamedMetadata - Insert all of the values referenced by
-/// named metadata in the specified module.
-void ValueEnumerator::EnumerateNamedMetadata(const Module *M) {
-  for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
-       E = M->named_metadata_end(); I != E; ++I)
+/// Insert all of the values referenced by named metadata in the specified
+/// module.
+void ValueEnumerator::EnumerateNamedMetadata(const Module &M) {
+  for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
+                                             E = M.named_metadata_end();
+       I != E; ++I)
     EnumerateNamedMDNode(I);
 }
 
@@ -239,31 +494,31 @@
 void ValueEnumerator::EnumerateMetadata(const Value *MD) {
   assert((isa<MDNode>(MD) || isa<MDString>(MD)) && "Invalid metadata kind");
 
-  // Enumerate the type of this value.
-  EnumerateType(MD->getType());
-
+  // Skip function-local nodes themselves, but walk their operands.
   const MDNode *N = dyn_cast<MDNode>(MD);
-
-  // In the module-level pass, skip function-local nodes themselves, but
-  // do walk their operands.
   if (N && N->isFunctionLocal() && N->getFunction()) {
     EnumerateMDNodeOperands(N);
     return;
   }
 
-  // Check to see if it's already in!
-  unsigned &MDValueID = MDValueMap[MD];
-  if (MDValueID) {
-    // Increment use count.
-    MDValues[MDValueID-1].second++;
+  // Insert a dummy ID to block the co-recursive call to
+  // EnumerateMDNodeOperands() from re-visiting MD in a cyclic graph.
+  //
+  // Return early if there's already an ID.
+  if (!MDValueMap.insert(std::make_pair(MD, 0)).second)
     return;
-  }
-  MDValues.push_back(std::make_pair(MD, 1U));
-  MDValueID = MDValues.size();
 
-  // Enumerate all non-function-local operands.
+  // Enumerate the type of this value.
+  EnumerateType(MD->getType());
+
+  // Visit operands first to minimize RAUW.
   if (N)
     EnumerateMDNodeOperands(N);
+
+  // Replace the dummy ID inserted above with the correct one.  MDValueMap may
+  // have changed by inserting operands, so we need a fresh lookup here.
+  MDValues.push_back(MD);
+  MDValueMap[MD] = MDValues.size();
 }
 
 /// EnumerateFunctionLocalMetadataa - Incorporate function-local metadata
@@ -277,12 +532,10 @@
 
   // Check to see if it's already in!
   unsigned &MDValueID = MDValueMap[N];
-  if (MDValueID) {
-    // Increment use count.
-    MDValues[MDValueID-1].second++;
+  if (MDValueID)
     return;
-  }
-  MDValues.push_back(std::make_pair(N, 1U));
+
+  MDValues.push_back(N);
   MDValueID = MDValues.size();
 
   // To incoroporate function-local information visit all function-local
@@ -487,7 +740,7 @@
             FnLocalMDVector.push_back(MD);
       }
 
-      SmallVector<std::pair<unsigned, MDNode*>, 8> MDs;
+      SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
       I->getAllMetadataOtherThanDebugLoc(MDs);
       for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
         MDNode *N = MDs[i].second;
@@ -510,7 +763,7 @@
   for (unsigned i = NumModuleValues, e = Values.size(); i != e; ++i)
     ValueMap.erase(Values[i].first);
   for (unsigned i = NumModuleMDValues, e = MDValues.size(); i != e; ++i)
-    MDValueMap.erase(MDValues[i].first);
+    MDValueMap.erase(MDValues[i]);
   for (unsigned i = 0, e = BasicBlocks.size(); i != e; ++i)
     ValueMap.erase(BasicBlocks[i]);
 

diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index 1c9f38e..563c214 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h

@@ -11,13 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef VALUE_ENUMERATOR_H
-#define VALUE_ENUMERATOR_H
+#ifndef LLVM_LIB_BITCODE_WRITER_VALUEENUMERATOR_H
+#define LLVM_LIB_BITCODE_WRITER_VALUEENUMERATOR_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/UniqueVector.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/UseListOrder.h"
 #include <vector>
 
 namespace llvm {
@@ -42,6 +43,9 @@
 
   // For each value, we remember its Value* and occurrence frequency.
   typedef std::vector<std::pair<const Value*, unsigned> > ValueList;
+
+  UseListOrderStack UseListOrders;
+
 private:
   typedef DenseMap<Type*, unsigned> TypeMapType;
   TypeMapType TypeMap;
@@ -54,7 +58,7 @@
   typedef UniqueVector<const Comdat *> ComdatSetType;
   ComdatSetType Comdats;
 
-  ValueList MDValues;
+  std::vector<const Value *> MDValues;
   SmallVector<const MDNode *, 8> FunctionLocalMDs;
   ValueMapType MDValueMap;
 
@@ -92,7 +96,7 @@
   ValueEnumerator(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
   void operator=(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
 public:
-  ValueEnumerator(const Module *M);
+  ValueEnumerator(const Module &M);
 
   void dump() const;
   void print(raw_ostream &OS, const ValueMapType &Map, const char *Name) const;
@@ -130,7 +134,7 @@
   }
 
   const ValueList &getValues() const { return Values; }
-  const ValueList &getMDValues() const { return MDValues; }
+  const std::vector<const Value *> &getMDValues() const { return MDValues; }
   const SmallVectorImpl<const MDNode *> &getFunctionLocalMDValues() const {
     return FunctionLocalMDs;
   }
@@ -172,7 +176,7 @@
   void EnumerateAttributes(AttributeSet PAL);
 
   void EnumerateValueSymbolTable(const ValueSymbolTable &ST);
-  void EnumerateNamedMetadata(const Module *M);
+  void EnumerateNamedMetadata(const Module &M);
 };
 
 } // End llvm namespace

diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 0f38c64..91c1314 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp

@@ -24,7 +24,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
@@ -111,18 +110,13 @@
   return((KillIndices[Reg] != ~0u) && (DefIndices[Reg] == ~0u));
 }
 
-
-
-AggressiveAntiDepBreaker::
-AggressiveAntiDepBreaker(MachineFunction& MFi,
-                         const RegisterClassInfo &RCI,
-                         TargetSubtargetInfo::RegClassVector& CriticalPathRCs) :
-  AntiDepBreaker(), MF(MFi),
-  MRI(MF.getRegInfo()),
-  TII(MF.getTarget().getInstrInfo()),
-  TRI(MF.getTarget().getRegisterInfo()),
-  RegClassInfo(RCI),
-  State(nullptr) {
+AggressiveAntiDepBreaker::AggressiveAntiDepBreaker(
+    MachineFunction &MFi, const RegisterClassInfo &RCI,
+    TargetSubtargetInfo::RegClassVector &CriticalPathRCs)
+    : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()),
+      TII(MF.getSubtarget().getInstrInfo()),
+      TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI),
+      State(nullptr) {
   /* Collect a bitset of all registers that are only broken if they
      are on the critical path. */
   for (unsigned i = 0, e = CriticalPathRCs.size(); i < e; ++i) {
@@ -262,11 +256,8 @@
   for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
        P != PE; ++P) {
     if ((P->getKind() == SDep::Anti) || (P->getKind() == SDep::Output)) {
-      unsigned Reg = P->getReg();
-      if (RegSet.count(Reg) == 0) {
+      if (RegSet.insert(P->getReg()).second)
         Edges.push_back(&*P);
-        RegSet.insert(Reg);
-      }
     }
   }
 }
@@ -527,7 +518,7 @@
       BV &= RCBV;
     }
 
-    DEBUG(dbgs() << " " << RC->getName());
+    DEBUG(dbgs() << " " << TRI->getRegClassName(RC));
   }
 
   return BV;
@@ -582,7 +573,9 @@
     unsigned Reg = Regs[i];
     if (Reg == SuperReg) continue;
     bool IsSub = TRI->isSubRegister(SuperReg, Reg);
-    assert(IsSub && "Expecting group subregister");
+    // FIXME: remove this once PR18663 has been properly fixed. For now,
+    // return a conservative answer:
+    // assert(IsSub && "Expecting group subregister");
     if (!IsSub)
       return false;
   }
@@ -618,8 +611,7 @@
 
   DEBUG(dbgs() << "\tFind Registers:");
 
-  if (RenameOrder.count(SuperRC) == 0)
-    RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size()));
+  RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size()));
 
   unsigned OrigR = RenameOrder[SuperRC];
   unsigned EndR = ((OrigR == Order.size()) ? 0 : OrigR);

diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h
index 2ab9d89..12cf95b 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.h
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
-#define LLVM_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+#ifndef LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
 
 #include "AntiDepBreaker.h"
 #include "llvm/ADT/BitVector.h"
@@ -32,88 +32,83 @@
 namespace llvm {
 class RegisterClassInfo;
 
-  /// Class AggressiveAntiDepState
   /// Contains all the state necessary for anti-dep breaking.
   class AggressiveAntiDepState {
   public:
-    /// RegisterReference - Information about a register reference
-    /// within a liverange
+    /// Information about a register reference within a live range
     typedef struct {
-      /// Operand - The registers operand
+      /// The register's operand
       MachineOperand *Operand;
-      /// RC - The register class
+      /// The register class
       const TargetRegisterClass *RC;
     } RegisterReference;
 
   private:
-    /// NumTargetRegs - Number of non-virtual target registers
-    /// (i.e. TRI->getNumRegs()).
+    /// Number of non-virtual target registers (i.e. TRI->getNumRegs()).
     const unsigned NumTargetRegs;
 
-    /// GroupNodes - Implements a disjoint-union data structure to
+    /// Implements a disjoint-union data structure to
     /// form register groups. A node is represented by an index into
     /// the vector. A node can "point to" itself to indicate that it
     /// is the parent of a group, or point to another node to indicate
     /// that it is a member of the same group as that node.
     std::vector<unsigned> GroupNodes;
 
-    /// GroupNodeIndices - For each register, the index of the GroupNode
+    /// For each register, the index of the GroupNode
     /// currently representing the group that the register belongs to.
     /// Register 0 is always represented by the 0 group, a group
     /// composed of registers that are not eligible for anti-aliasing.
     std::vector<unsigned> GroupNodeIndices;
 
-    /// RegRefs - Map registers to all their references within a live range.
+    /// Map registers to all their references within a live range.
     std::multimap<unsigned, RegisterReference> RegRefs;
 
-    /// KillIndices - The index of the most recent kill (proceding bottom-up),
+    /// The index of the most recent kill (proceeding bottom-up),
     /// or ~0u if the register is not live.
     std::vector<unsigned> KillIndices;
 
-    /// DefIndices - The index of the most recent complete def (proceding bottom
+    /// The index of the most recent complete def (proceeding bottom
     /// up), or ~0u if the register is live.
     std::vector<unsigned> DefIndices;
 
   public:
     AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB);
 
-    /// GetKillIndices - Return the kill indices.
+    /// Return the kill indices.
     std::vector<unsigned> &GetKillIndices() { return KillIndices; }
 
-    /// GetDefIndices - Return the define indices.
+    /// Return the define indices.
     std::vector<unsigned> &GetDefIndices() { return DefIndices; }
 
-    /// GetRegRefs - Return the RegRefs map.
+    /// Return the RegRefs map.
     std::multimap<unsigned, RegisterReference>& GetRegRefs() { return RegRefs; }
 
-    // GetGroup - Get the group for a register. The returned value is
+    // Get the group for a register. The returned value is
     // the index of the GroupNode representing the group.
     unsigned GetGroup(unsigned Reg);
 
-    // GetGroupRegs - Return a vector of the registers belonging to a
-    // group. If RegRefs is non-NULL then only included referenced registers.
+    // Return a vector of the registers belonging to a group.
+    // If RegRefs is non-NULL then only include referenced registers.
     void GetGroupRegs(
        unsigned Group,
        std::vector<unsigned> &Regs,
        std::multimap<unsigned,
          AggressiveAntiDepState::RegisterReference> *RegRefs);
 
-    // UnionGroups - Union Reg1's and Reg2's groups to form a new
-    // group. Return the index of the GroupNode representing the
-    // group.
+    // Union Reg1's and Reg2's groups to form a new group.
+    // Return the index of the GroupNode representing the group.
     unsigned UnionGroups(unsigned Reg1, unsigned Reg2);
 
-    // LeaveGroup - Remove a register from its current group and place
+    // Remove a register from its current group and place
     // it alone in its own group. Return the index of the GroupNode
     // representing the registers new group.
     unsigned LeaveGroup(unsigned Reg);
 
-    /// IsLive - Return true if Reg is live
+    /// Return true if Reg is live.
     bool IsLive(unsigned Reg);
   };
 
 
-  /// Class AggressiveAntiDepBreaker
   class AggressiveAntiDepBreaker : public AntiDepBreaker {
     MachineFunction& MF;
     MachineRegisterInfo &MRI;
@@ -121,12 +116,11 @@
     const TargetRegisterInfo *TRI;
     const RegisterClassInfo &RegClassInfo;
 
-    /// CriticalPathSet - The set of registers that should only be
+    /// The set of registers that should only be
     /// renamed if they are on the critical path.
     BitVector CriticalPathSet;
 
-    /// State - The state used to identify and rename anti-dependence
-    /// registers.
+    /// The state used to identify and rename anti-dependence registers.
     AggressiveAntiDepState *State;
 
   public:
@@ -135,11 +129,10 @@
                           TargetSubtargetInfo::RegClassVector& CriticalPathRCs);
     ~AggressiveAntiDepBreaker();
 
-    /// Start - Initialize anti-dep breaking for a new basic block.
+    /// Initialize anti-dep breaking for a new basic block.
     void StartBlock(MachineBasicBlock *BB) override;
 
-    /// BreakAntiDependencies - Identifiy anti-dependencies along the critical
-    /// path
+    /// Identify anti-dependencies along the critical path
     /// of the ScheduleDAG and break them by renaming registers.
     ///
     unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
@@ -148,24 +141,24 @@
                                    unsigned InsertPosIndex,
                                    DbgValueVector &DbgValues) override;
 
-    /// Observe - Update liveness information to account for the current
+    /// Update liveness information to account for the current
     /// instruction, which will not be scheduled.
     ///
     void Observe(MachineInstr *MI, unsigned Count,
                  unsigned InsertPosIndex) override;
 
-    /// Finish - Finish anti-dep breaking for a basic block.
+    /// Finish anti-dep breaking for a basic block.
     void FinishBlock() override;
 
   private:
     /// Keep track of a position in the allocation order for each regclass.
     typedef std::map<const TargetRegisterClass *, unsigned> RenameOrderType;
 
-    /// IsImplicitDefUse - Return true if MO represents a register
+    /// Return true if MO represents a register
     /// that is both implicitly used and defined in MI
     bool IsImplicitDefUse(MachineInstr *MI, MachineOperand& MO);
 
-    /// GetPassthruRegs - If MI implicitly def/uses a register, then
+    /// If MI implicitly def/uses a register, then
     /// return that register and all subregisters.
     void GetPassthruRegs(MachineInstr *MI, std::set<unsigned>& PassthruRegs);
 

diff --git a/lib/CodeGen/AllocationOrder.h b/lib/CodeGen/AllocationOrder.h
index 64ff2a7..1e4eaa7 100644
--- a/lib/CodeGen/AllocationOrder.h
+++ b/lib/CodeGen/AllocationOrder.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_ALLOCATIONORDER_H
-#define LLVM_CODEGEN_ALLOCATIONORDER_H
+#ifndef LLVM_LIB_CODEGEN_ALLOCATIONORDER_H
+#define LLVM_LIB_CODEGEN_ALLOCATIONORDER_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/MC/MCRegisterInfo.h"

diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 1bdf312..9a3b790 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp

@@ -25,6 +25,9 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
 using namespace llvm;
 
 /// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence
@@ -106,15 +109,16 @@
 }
 
 /// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V.
-GlobalVariable *llvm::ExtractTypeInfo(Value *V) {
+GlobalValue *llvm::ExtractTypeInfo(Value *V) {
   V = V->stripPointerCasts();
-  GlobalVariable *GV = dyn_cast<GlobalVariable>(V);
+  GlobalValue *GV = dyn_cast<GlobalValue>(V);
+  GlobalVariable *Var = dyn_cast<GlobalVariable>(V);
 
-  if (GV && GV->getName() == "llvm.eh.catch.all.value") {
-    assert(GV->hasInitializer() &&
+  if (Var && Var->getName() == "llvm.eh.catch.all.value") {
+    assert(Var->hasInitializer() &&
            "The EH catch-all value must have an initializer");
-    Value *Init = GV->getInitializer();
-    GV = dyn_cast<GlobalVariable>(Init);
+    Value *Init = Var->getInitializer();
+    GV = dyn_cast<GlobalValue>(Init);
     if (!GV) V = cast<ConstantPointerNull>(Init);
   }
 
@@ -475,7 +479,7 @@
 /// between it and the return.
 ///
 /// This function only tests target-independent requirements.
-bool llvm::isInTailCallPosition(ImmutableCallSite CS, const SelectionDAG &DAG) {
+bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
   const Instruction *I = CS.getInstruction();
   const BasicBlock *ExitBB = I->getParent();
   const TerminatorInst *Term = ExitBB->getTerminator();
@@ -490,8 +494,7 @@
   // longjmp on x86), it can end up causing miscompilation that has not
   // been fully understood.
   if (!Ret &&
-      (!DAG.getTarget().Options.GuaranteedTailCallOpt ||
-       !isa<UnreachableInst>(Term)))
+      (!TM.Options.GuaranteedTailCallOpt || !isa<UnreachableInst>(Term)))
     return false;
 
   // If I will have a chain, make sure no other instruction that will have a
@@ -509,8 +512,8 @@
         return false;
     }
 
-  return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret,
-                                         *DAG.getTarget().getTargetLowering());
+  return returnTypeIsEligibleForTailCall(
+      ExitBB->getParent(), I, Ret, *TM.getSubtargetImpl()->getTargetLowering());
 }
 
 bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
@@ -607,3 +610,29 @@
 
   return true;
 }
+
+bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) {
+  if (!GV->hasLinkOnceODRLinkage())
+    return false;
+
+  if (GV->hasUnnamedAddr())
+    return true;
+
+  // If it is a non constant variable, it needs to be uniqued across shared
+  // objects.
+  if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
+    if (!Var->isConstant())
+      return false;
+  }
+
+  // An alias can point to a variable. We could try to resolve the alias to
+  // decide, but for now just don't hide them.
+  if (isa<GlobalAlias>(GV))
+    return false;
+
+  GlobalStatus GS;
+  if (GlobalStatus::analyzeGlobal(GV, GS))
+    return false;
+
+  return !GS.IsCompared;
+}

diff --git a/lib/CodeGen/Android.mk b/lib/CodeGen/Android.mk
index 05e5c45..5cb351d 100644
--- a/lib/CodeGen/Android.mk
+++ b/lib/CodeGen/Android.mk

@@ -4,7 +4,7 @@
   AggressiveAntiDepBreaker.cpp \
   AllocationOrder.cpp \
   Analysis.cpp \
-  AtomicExpandLoadLinkedPass.cpp \
+  AtomicExpandPass.cpp \
   BasicTargetTransformInfo.cpp \
   BranchFolding.cpp \
   CalcSpillWeights.cpp \
@@ -21,6 +21,7 @@
   ExecutionDepsFix.cpp \
   ExpandISelPseudos.cpp \
   ExpandPostRAPseudos.cpp \
+  ForwardControlFlowIntegrity.cpp \
   GCMetadata.cpp \
   GCMetadataPrinter.cpp \
   GCStrategy.cpp \
@@ -29,7 +30,6 @@
   InlineSpiller.cpp \
   InterferenceCache.cpp \
   IntrinsicLowering.cpp \
-  JITCodeEmitter.cpp \
   JumpInstrTables.cpp \
   LatencyPriorityQueue.cpp \
   LexicalScopes.cpp \
@@ -49,7 +49,7 @@
   MachineBlockFrequencyInfo.cpp \
   MachineBlockPlacement.cpp \
   MachineBranchProbabilityInfo.cpp \
-  MachineCodeEmitter.cpp \
+  MachineCombiner.cpp \
   MachineCopyPropagation.cpp \
   MachineCSE.cpp \
   MachineDominators.cpp \
@@ -97,7 +97,6 @@
   ShadowStackGC.cpp \
   SjLjEHPrepare.cpp \
   SlotIndexes.cpp \
-  Spiller.cpp \
   SpillPlacement.cpp \
   SplitKit.cpp \
   StackColoring.cpp \

diff --git a/lib/CodeGen/AntiDepBreaker.h b/lib/CodeGen/AntiDepBreaker.h
index df47f98..a61a8ef 100644
--- a/lib/CodeGen/AntiDepBreaker.h
+++ b/lib/CodeGen/AntiDepBreaker.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_ANTIDEPBREAKER_H
-#define LLVM_CODEGEN_ANTIDEPBREAKER_H
+#ifndef LLVM_LIB_CODEGEN_ANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_ANTIDEPBREAKER_H
 
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -25,9 +25,8 @@
 
 namespace llvm {
 
-/// AntiDepBreaker - This class works into conjunction with the
-/// post-RA scheduler to rename registers to break register
-/// anti-dependencies.
+/// This class works in conjunction with the post-RA scheduler to rename
+/// registers to break register anti-dependencies (WAR hazards).
 class AntiDepBreaker {
 public:
   typedef std::vector<std::pair<MachineInstr *, MachineInstr *> > 
@@ -35,29 +34,26 @@
 
   virtual ~AntiDepBreaker();
 
-  /// Start - Initialize anti-dep breaking for a new basic block.
+  /// Initialize anti-dep breaking for a new basic block.
   virtual void StartBlock(MachineBasicBlock *BB) =0;
 
-  /// BreakAntiDependencies - Identifiy anti-dependencies within a
-  /// basic-block region and break them by renaming registers. Return
-  /// the number of anti-dependencies broken.
-  ///
+  /// Identify anti-dependencies within a basic-block region and break them by
+  /// renaming registers. Return the number of anti-dependencies broken.
   virtual unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                                          MachineBasicBlock::iterator Begin,
                                          MachineBasicBlock::iterator End,
                                          unsigned InsertPosIndex,
                                          DbgValueVector &DbgValues) = 0;
   
-  /// Observe - Update liveness information to account for the current
+  /// Update liveness information to account for the current
   /// instruction, which will not be scheduled.
-  ///
   virtual void Observe(MachineInstr *MI, unsigned Count,
                        unsigned InsertPosIndex) =0;
   
-  /// Finish - Finish anti-dep breaking for a basic block.
+  /// Finish anti-dep breaking for a basic block.
   virtual void FinishBlock() =0;
 
-  /// UpdateDbgValue - Update DBG_VALUE if dependency breaker is updating
+  /// Update DBG_VALUE if dependency breaker is updating
   /// other machine instruction to use NewReg.
   void UpdateDbgValue(MachineInstr *MI, unsigned OldReg, unsigned NewReg) {
     assert (MI->isDebugValue() && "MI is not DBG_VALUE!");

diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 251f5ef..66c6c63 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp

@@ -108,7 +108,7 @@
 }
 
 void ARMException::emitTypeInfos(unsigned TTypeEncoding) {
-  const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+  const std::vector<const GlobalValue *> &TypeInfos = MMI->getTypeInfos();
   const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
 
   bool VerboseAsm = Asm->OutStreamer.isVerboseAsm();
@@ -121,9 +121,9 @@
     Entry = TypeInfos.size();
   }
 
-  for (std::vector<const GlobalVariable *>::const_reverse_iterator
+  for (std::vector<const GlobalValue *>::const_reverse_iterator
          I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) {
-    const GlobalVariable *GV = *I;
+    const GlobalValue *GV = *I;
     if (VerboseAsm)
       Asm->OutStreamer.AddComment("TypeInfo " + Twine(Entry--));
     Asm->EmitTTypeReference(GV, TTypeEncoding);

diff --git a/lib/CodeGen/AsmPrinter/AddressPool.h b/lib/CodeGen/AsmPrinter/AddressPool.h
index 42757d7..802e050 100644
--- a/lib/CodeGen/AsmPrinter/AddressPool.h
+++ b/lib/CodeGen/AsmPrinter/AddressPool.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_ADDRESSPOOL_H__
-#define CODEGEN_ASMPRINTER_ADDRESSPOOL_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
 
 #include "llvm/ADT/DenseMap.h"
 

diff --git a/lib/CodeGen/AsmPrinter/Android.mk b/lib/CodeGen/AsmPrinter/Android.mk
index 083cc0d..cb8e96a 100644
--- a/lib/CodeGen/AsmPrinter/Android.mk
+++ b/lib/CodeGen/AsmPrinter/Android.mk

@@ -11,6 +11,7 @@
   DIEHash.cpp \
   DwarfAccelTable.cpp \
   DwarfCFIException.cpp \
+  DwarfCompileUnit.cpp \
   DwarfDebug.cpp \
   DwarfFile.cpp \
   DwarfStringPool.cpp \

diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index f80fdea..8a32713 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp

@@ -14,11 +14,13 @@
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "DwarfDebug.h"
 #include "DwarfException.h"
+#include "Win64Exception.h"
 #include "WinCodeViewLineTables.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/JumpInstrTableInfo.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -49,7 +51,6 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
@@ -98,11 +99,10 @@
 }
 
 AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer)
-  : MachineFunctionPass(ID),
-    TM(tm), MAI(tm.getMCAsmInfo()), MII(tm.getInstrInfo()),
-    OutContext(Streamer.getContext()),
-    OutStreamer(Streamer),
-    LastMI(nullptr), LastFn(0), Counter(~0U), SetCounter(0) {
+    : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
+      MII(tm.getSubtargetImpl()->getInstrInfo()),
+      OutContext(Streamer.getContext()), OutStreamer(Streamer), LastMI(nullptr),
+      LastFn(0), Counter(~0U), SetCounter(0) {
   DD = nullptr; MMI = nullptr; LI = nullptr; MF = nullptr;
   CurrentFnSym = CurrentFnSymForSize = nullptr;
   GCMetadataPrinters = nullptr;
@@ -129,12 +129,12 @@
 }
 
 const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const {
-  return TM.getTargetLowering()->getObjFileLowering();
+  return TM.getSubtargetImpl()->getTargetLowering()->getObjFileLowering();
 }
 
 /// getDataLayout - Return information about data layout.
 const DataLayout &AsmPrinter::getDataLayout() const {
-  return *TM.getDataLayout();
+  return *TM.getSubtargetImpl()->getDataLayout();
 }
 
 const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const {
@@ -173,9 +173,9 @@
   const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
     .Initialize(OutContext, TM);
 
-  OutStreamer.InitSections();
+  OutStreamer.InitSections(false);
 
-  Mang = new Mangler(TM.getDataLayout());
+  Mang = new Mangler(TM.getSubtargetImpl()->getDataLayout());
 
   // Emit the version-min deplyment target directive if needed.
   //
@@ -222,14 +222,12 @@
   }
 
   if (MAI->doesSupportDebugInformation()) {
-    if (Triple(TM.getTargetTriple()).isKnownWindowsMSVCEnvironment()) {
+    if (Triple(TM.getTargetTriple()).isKnownWindowsMSVCEnvironment())
       Handlers.push_back(HandlerInfo(new WinCodeViewLineTables(this),
                                      DbgTimerName,
                                      CodeViewLineTablesGroupName));
-    } else {
-      DD = new DwarfDebug(this, &M);
-      Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName));
-    }
+    DD = new DwarfDebug(this, &M);
+    Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName));
   }
 
   EHStreamer *ES = nullptr;
@@ -243,8 +241,13 @@
   case ExceptionHandling::ARM:
     ES = new ARMException(this);
     break;
-  case ExceptionHandling::WinEH:
-    ES = new Win64Exception(this);
+  case ExceptionHandling::ItaniumWinEH:
+    switch (MAI->getWinEHEncodingType()) {
+    default: llvm_unreachable("unsupported unwinding information encoding");
+    case WinEH::EncodingType::Itanium:
+      ES = new Win64Exception(this);
+      break;
+    }
     break;
   }
   if (ES)
@@ -253,33 +256,10 @@
 }
 
 static bool canBeHidden(const GlobalValue *GV, const MCAsmInfo &MAI) {
-  GlobalValue::LinkageTypes Linkage = GV->getLinkage();
-  if (Linkage != GlobalValue::LinkOnceODRLinkage)
-    return false;
-
   if (!MAI.hasWeakDefCanBeHiddenDirective())
     return false;
 
-  if (GV->hasUnnamedAddr())
-    return true;
-
-  // This is only used for MachO, so right now it doesn't really matter how
-  // we handle alias. Revisit this once the MachO linker implements aliases.
-  if (isa<GlobalAlias>(GV))
-    return false;
-
-  // If it is a non constant variable, it needs to be uniqued across shared
-  // objects.
-  if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
-    if (!Var->isConstant())
-      return false;
-  }
-
-  GlobalStatus GS;
-  if (!GlobalStatus::analyzeGlobal(GV, GS) && !GS.IsCompared)
-    return true;
-
-  return false;
+  return canBeOmittedFromSymbolTable(GV);
 }
 
 void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
@@ -361,7 +341,7 @@
 
   SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
 
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   uint64_t Size = DL->getTypeAllocSize(GV->getType()->getElementType());
 
   // If the alignment is specified, we *must* obey it.  Overaligning a global
@@ -578,20 +558,24 @@
   // We assume a single instruction only has a spill or reload, not
   // both.
   const MachineMemOperand *MMO;
-  if (TM.getInstrInfo()->isLoadFromStackSlotPostFE(&MI, FI)) {
+  if (TM.getSubtargetImpl()->getInstrInfo()->isLoadFromStackSlotPostFE(&MI,
+                                                                       FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
       CommentOS << MMO->getSize() << "-byte Reload\n";
     }
-  } else if (TM.getInstrInfo()->hasLoadFromStackSlot(&MI, MMO, FI)) {
+  } else if (TM.getSubtargetImpl()->getInstrInfo()->hasLoadFromStackSlot(
+                 &MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI))
       CommentOS << MMO->getSize() << "-byte Folded Reload\n";
-  } else if (TM.getInstrInfo()->isStoreToStackSlotPostFE(&MI, FI)) {
+  } else if (TM.getSubtargetImpl()->getInstrInfo()->isStoreToStackSlotPostFE(
+                 &MI, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
       CommentOS << MMO->getSize() << "-byte Spill\n";
     }
-  } else if (TM.getInstrInfo()->hasStoreToStackSlot(&MI, MMO, FI)) {
+  } else if (TM.getSubtargetImpl()->getInstrInfo()->hasStoreToStackSlot(
+                 &MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI))
       CommentOS << MMO->getSize() << "-byte Folded Spill\n";
   }
@@ -605,8 +589,9 @@
 /// that is an implicit def.
 void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
   unsigned RegNo = MI->getOperand(0).getReg();
-  OutStreamer.AddComment(Twine("implicit-def: ") +
-                         TM.getRegisterInfo()->getName(RegNo));
+  OutStreamer.AddComment(
+      Twine("implicit-def: ") +
+      TM.getSubtargetImpl()->getRegisterInfo()->getName(RegNo));
   OutStreamer.AddBlankLine();
 }
 
@@ -616,7 +601,7 @@
     const MachineOperand &Op = MI->getOperand(i);
     assert(Op.isReg() && "KILL instruction must have only register operands");
     Str += ' ';
-    Str += AP.TM.getRegisterInfo()->getName(Op.getReg());
+    Str += AP.TM.getSubtargetImpl()->getRegisterInfo()->getName(Op.getReg());
     Str += (Op.isDef() ? "<def>" : "<kill>");
   }
   AP.OutStreamer.AddComment(Str);
@@ -627,21 +612,27 @@
 /// of DBG_VALUE, returning true if it was able to do so.  A false return
 /// means the target will need to handle MI in EmitInstruction.
 static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
-  // This code handles only the 3-operand target-independent form.
-  if (MI->getNumOperands() != 3)
+  // This code handles only the 4-operand target-independent form.
+  if (MI->getNumOperands() != 4)
     return false;
 
   SmallString<128> Str;
   raw_svector_ostream OS(Str);
   OS << "DEBUG_VALUE: ";
 
-  DIVariable V(MI->getOperand(2).getMetadata());
+  DIVariable V = MI->getDebugVariable();
   if (V.getContext().isSubprogram()) {
     StringRef Name = DISubprogram(V.getContext()).getDisplayName();
     if (!Name.empty())
       OS << Name << ":";
   }
-  OS << V.getName() << " <- ";
+  OS << V.getName();
+
+  DIExpression Expr = MI->getDebugExpression();
+  if (Expr.isVariablePiece())
+    OS << " [piece offset=" << Expr.getPieceOffset()
+       << " size=" << Expr.getPieceSize() << "]";
+  OS << " <- ";
 
   // The second operand is only an offset if it's an immediate.
   bool Deref = MI->getOperand(0).isReg() && MI->getOperand(1).isImm();
@@ -672,7 +663,8 @@
       Reg = MI->getOperand(0).getReg();
     } else {
       assert(MI->getOperand(0).isFI() && "Unknown operand type");
-      const TargetFrameLowering *TFI = AP.TM.getFrameLowering();
+      const TargetFrameLowering *TFI =
+          AP.TM.getSubtargetImpl()->getFrameLowering();
       Offset += TFI->getFrameIndexReference(*AP.MF,
                                             MI->getOperand(0).getIndex(), Reg);
       Deref = true;
@@ -686,7 +678,7 @@
     }
     if (Deref)
       OS << '[';
-    OS << AP.TM.getRegisterInfo()->getName(Reg);
+    OS << AP.TM.getSubtargetImpl()->getRegisterInfo()->getName(Reg);
   }
 
   if (Deref)
@@ -709,8 +701,8 @@
 }
 
 bool AsmPrinter::needsSEHMoves() {
-  return MAI->getExceptionHandlingType() == ExceptionHandling::WinEH &&
-    MF->getFunction()->needsUnwindTableEntry();
+  return MAI->getExceptionHandlingType() == ExceptionHandling::ItaniumWinEH &&
+         MF->getFunction()->needsUnwindTableEntry();
 }
 
 void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
@@ -722,9 +714,6 @@
   if (needsCFIMoves() == CFI_M_None)
     return;
 
-  if (MMI->getCompactUnwindEncoding() != 0)
-    OutStreamer.EmitCompactUnwindEncoding(MMI->getCompactUnwindEncoding());
-
   const MachineModuleInfo &MMI = MF->getMMI();
   const std::vector<MCCFIInstruction> &Instrs = MMI.getFrameInstructions();
   unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
@@ -742,12 +731,10 @@
 
   // Print out code for the function.
   bool HasAnyRealCode = false;
-  const MachineInstr *LastMI = nullptr;
   for (auto &MBB : *MF) {
     // Print a label for the basic block.
     EmitBasicBlockStart(MBB);
     for (auto &MI : MBB) {
-      LastMI = &MI;
 
       // Print the assembly for the instruction.
       if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
@@ -804,26 +791,22 @@
         }
       }
     }
-  }
 
-  // If the last instruction was a prolog label, then we have a situation where
-  // we emitted a prolog but no function body. This results in the ending prolog
-  // label equaling the end of function label and an invalid "row" in the
-  // FDE. We need to emit a noop in this situation so that the FDE's rows are
-  // valid.
-  bool RequiresNoop = LastMI && LastMI->isCFIInstruction();
+    EmitBasicBlockEnd(MBB);
+  }
 
   // If the function is empty and the object file uses .subsections_via_symbols,
   // then we need to emit *something* to the function body to prevent the
   // labels from collapsing together.  Just emit a noop.
-  if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode) || RequiresNoop) {
+  if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode)) {
     MCInst Noop;
-    TM.getInstrInfo()->getNoopForMachoTarget(Noop);
-    if (Noop.getOpcode()) {
-      OutStreamer.AddComment("avoids zero-length function");
+    TM.getSubtargetImpl()->getInstrInfo()->getNoopForMachoTarget(Noop);
+    OutStreamer.AddComment("avoids zero-length function");
+
+    // Targets can opt-out of emitting the noop here by leaving the opcode
+    // unspecified.
+    if (Noop.getOpcode())
       OutStreamer.EmitInstruction(Noop, getSubtargetInfo());
-    } else  // Target not mc-ized yet.
-      OutStreamer.EmitRawText(StringRef("\tnop\n"));
   }
 
   const Function *F = MF->getFunction();
@@ -895,17 +878,18 @@
     unsigned Arch = Triple(getTargetTriple()).getArch();
     bool IsThumb = (Arch == Triple::thumb || Arch == Triple::thumbeb);
     MCInst TrapInst;
-    TM.getInstrInfo()->getTrap(TrapInst);
+    TM.getSubtargetImpl()->getInstrInfo()->getTrap(TrapInst);
+    unsigned LogAlignment = llvm::Log2_64(JITI->entryByteAlignment());
+
+    // Emit the right section for these functions.
+    OutStreamer.SwitchSection(OutContext.getObjectFileInfo()->getTextSection());
     for (const auto &KV : JITI->getTables()) {
       uint64_t Count = 0;
       for (const auto &FunPair : KV.second) {
         // Emit the function labels to make this be a function entry point.
         MCSymbol *FunSym =
           OutContext.GetOrCreateSymbol(FunPair.second->getName());
-        OutStreamer.EmitSymbolAttribute(FunSym, MCSA_Global);
-        // FIXME: JumpTableInstrInfo should store information about the required
-        // alignment of table entries and the size of the padding instruction.
-        EmitAlignment(3);
+        EmitAlignment(LogAlignment);
         if (IsThumb)
           OutStreamer.EmitThumbFunc(FunSym);
         if (MAI->hasDotTypeDotSizeDirective())
@@ -920,16 +904,16 @@
         const MCSymbolRefExpr *TargetSymRef =
           MCSymbolRefExpr::Create(TargetSymbol, MCSymbolRefExpr::VK_PLT,
                                   OutContext);
-        TM.getInstrInfo()->getUnconditionalBranch(JumpToFun, TargetSymRef);
+        TM.getSubtargetImpl()->getInstrInfo()->getUnconditionalBranch(
+            JumpToFun, TargetSymRef);
         OutStreamer.EmitInstruction(JumpToFun, getSubtargetInfo());
         ++Count;
       }
 
       // Emit enough padding instructions to fill up to the next power of two.
-      // This assumes that the trap instruction takes 8 bytes or fewer.
       uint64_t Remaining = NextPowerOf2(Count) - Count;
       for (uint64_t C = 0; C < Remaining; ++C) {
-        EmitAlignment(3);
+        EmitAlignment(LogAlignment);
         OutStreamer.EmitInstruction(TrapInst, getSubtargetInfo());
       }
 
@@ -976,24 +960,21 @@
     }
   }
 
-  if (MAI->hasSetDirective()) {
-    OutStreamer.AddBlankLine();
-    for (const auto &Alias : M.aliases()) {
-      MCSymbol *Name = getSymbol(&Alias);
+  OutStreamer.AddBlankLine();
+  for (const auto &Alias : M.aliases()) {
+    MCSymbol *Name = getSymbol(&Alias);
 
-      if (Alias.hasExternalLinkage() || !MAI->getWeakRefDirective())
-        OutStreamer.EmitSymbolAttribute(Name, MCSA_Global);
-      else if (Alias.hasWeakLinkage() || Alias.hasLinkOnceLinkage())
-        OutStreamer.EmitSymbolAttribute(Name, MCSA_WeakReference);
-      else
-        assert(Alias.hasLocalLinkage() && "Invalid alias linkage");
+    if (Alias.hasExternalLinkage() || !MAI->getWeakRefDirective())
+      OutStreamer.EmitSymbolAttribute(Name, MCSA_Global);
+    else if (Alias.hasWeakLinkage() || Alias.hasLinkOnceLinkage())
+      OutStreamer.EmitSymbolAttribute(Name, MCSA_WeakReference);
+    else
+      assert(Alias.hasLocalLinkage() && "Invalid alias linkage");
 
-      EmitVisibility(Name, Alias.getVisibility());
+    EmitVisibility(Name, Alias.getVisibility());
 
-      // Emit the directives as assignments aka .set:
-      OutStreamer.EmitAssignment(Name,
-                                 lowerConstant(Alias.getAliasee(), *this));
-    }
+    // Emit the directives as assignments aka .set:
+    OutStreamer.EmitAssignment(Name, lowerConstant(Alias.getAliasee(), *this));
   }
 
   GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
@@ -1062,23 +1043,14 @@
     const MachineConstantPoolEntry &CPE = CP[i];
     unsigned Align = CPE.getAlignment();
 
-    SectionKind Kind;
-    switch (CPE.getRelocationInfo()) {
-    default: llvm_unreachable("Unknown section kind");
-    case 2: Kind = SectionKind::getReadOnlyWithRel(); break;
-    case 1:
-      Kind = SectionKind::getReadOnlyWithRelLocal();
-      break;
-    case 0:
-    switch (TM.getDataLayout()->getTypeAllocSize(CPE.getType())) {
-    case 4:  Kind = SectionKind::getMergeableConst4(); break;
-    case 8:  Kind = SectionKind::getMergeableConst8(); break;
-    case 16: Kind = SectionKind::getMergeableConst16();break;
-    default: Kind = SectionKind::getMergeableConst(); break;
-    }
-    }
+    SectionKind Kind =
+        CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout());
 
-    const MCSection *S = getObjFileLowering().getSectionForConstant(Kind);
+    const Constant *C = nullptr;
+    if (!CPE.isMachineConstantPoolEntry())
+      C = CPE.Val.ConstVal;
+
+    const MCSection *S = getObjFileLowering().getSectionForConstant(Kind, C);
 
     // The number of sections are small, just do a linear search from the
     // last section to the first.
@@ -1101,13 +1073,22 @@
   }
 
   // Now print stuff into the calculated sections.
+  const MCSection *CurSection = nullptr;
+  unsigned Offset = 0;
   for (unsigned i = 0, e = CPSections.size(); i != e; ++i) {
-    OutStreamer.SwitchSection(CPSections[i].S);
-    EmitAlignment(Log2_32(CPSections[i].Alignment));
-
-    unsigned Offset = 0;
     for (unsigned j = 0, ee = CPSections[i].CPEs.size(); j != ee; ++j) {
       unsigned CPI = CPSections[i].CPEs[j];
+      MCSymbol *Sym = GetCPISymbol(CPI);
+      if (!Sym->isUndefined())
+        continue;
+
+      if (CurSection != CPSections[i].S) {
+        OutStreamer.SwitchSection(CPSections[i].S);
+        EmitAlignment(Log2_32(CPSections[i].Alignment));
+        CurSection = CPSections[i].S;
+        Offset = 0;
+      }
+
       MachineConstantPoolEntry CPE = CP[CPI];
 
       // Emit inter-object padding for alignment.
@@ -1116,9 +1097,10 @@
       OutStreamer.EmitZeros(NewOffset - Offset);
 
       Type *Ty = CPE.getType();
-      Offset = NewOffset + TM.getDataLayout()->getTypeAllocSize(Ty);
-      OutStreamer.EmitLabel(GetCPISymbol(CPI));
+      Offset = NewOffset +
+               TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(Ty);
 
+      OutStreamer.EmitLabel(Sym);
       if (CPE.isMachineConstantPoolEntry())
         EmitMachineConstantPoolValue(CPE.Val.MachineCPVal);
       else
@@ -1131,7 +1113,7 @@
 /// by the current function to the current output stream.
 ///
 void AsmPrinter::EmitJumpTableInfo() {
-  const DataLayout *DL = MF->getTarget().getDataLayout();
+  const DataLayout *DL = MF->getSubtarget().getDataLayout();
   const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
   if (!MJTI) return;
   if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return;
@@ -1156,12 +1138,14 @@
   } else {
     // Otherwise, drop it in the readonly section.
     const MCSection *ReadOnlySection =
-      getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly());
+        getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly(),
+                                                   /*C=*/nullptr);
     OutStreamer.SwitchSection(ReadOnlySection);
     JTInDiffSection = true;
   }
 
-  EmitAlignment(Log2_32(MJTI->getEntryAlignment(*TM.getDataLayout())));
+  EmitAlignment(Log2_32(
+      MJTI->getEntryAlignment(*TM.getSubtargetImpl()->getDataLayout())));
 
   // Jump tables in code sections are marked with a data_region directive
   // where that's supported.
@@ -1174,17 +1158,17 @@
     // If this jump table was deleted, ignore it.
     if (JTBBs.empty()) continue;
 
-    // For the EK_LabelDifference32 entry, if the target supports .set, emit a
-    // .set directive for each unique entry.  This reduces the number of
-    // relocations the assembler will generate for the jump table.
+    // For the EK_LabelDifference32 entry, if using .set avoids a relocation,
+    // emit a .set directive for each unique entry.
     if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 &&
-        MAI->hasSetDirective()) {
+        MAI->doesSetDirectiveSuppressesReloc()) {
       SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets;
-      const TargetLowering *TLI = TM.getTargetLowering();
+      const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
       const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext);
       for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) {
         const MachineBasicBlock *MBB = JTBBs[ii];
-        if (!EmittedSets.insert(MBB)) continue;
+        if (!EmittedSets.insert(MBB).second)
+          continue;
 
         // .set LJTSet, LBB32-base
         const MCExpr *LHS =
@@ -1223,8 +1207,9 @@
   case MachineJumpTableInfo::EK_Inline:
     llvm_unreachable("Cannot emit EK_Inline jump table entry");
   case MachineJumpTableInfo::EK_Custom32:
-    Value = TM.getTargetLowering()->LowerCustomJumpTableEntry(MJTI, MBB, UID,
-                                                              OutContext);
+    Value =
+        TM.getSubtargetImpl()->getTargetLowering()->LowerCustomJumpTableEntry(
+            MJTI, MBB, UID, OutContext);
     break;
   case MachineJumpTableInfo::EK_BlockAddress:
     // EK_BlockAddress - Each entry is a plain address of block, e.g.:
@@ -1250,34 +1235,30 @@
   }
 
   case MachineJumpTableInfo::EK_LabelDifference32: {
-    // EK_LabelDifference32 - Each entry is the address of the block minus
-    // the address of the jump table.  This is used for PIC jump tables where
-    // gprel32 is not supported.  e.g.:
+    // Each entry is the address of the block minus the address of the jump
+    // table. This is used for PIC jump tables where gprel32 is not supported.
+    // e.g.:
     //      .word LBB123 - LJTI1_2
-    // If the .set directive is supported, this is emitted as:
+    // If the .set directive avoids relocations, this is emitted as:
     //      .set L4_5_set_123, LBB123 - LJTI1_2
     //      .word L4_5_set_123
-
-    // If we have emitted set directives for the jump table entries, print
-    // them rather than the entries themselves.  If we're emitting PIC, then
-    // emit the table entries as differences between two text section labels.
-    if (MAI->hasSetDirective()) {
-      // If we used .set, reference the .set's symbol.
+    if (MAI->doesSetDirectiveSuppressesReloc()) {
       Value = MCSymbolRefExpr::Create(GetJTSetSymbol(UID, MBB->getNumber()),
                                       OutContext);
       break;
     }
-    // Otherwise, use the difference as the jump table entry.
     Value = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
-    const MCExpr *JTI = MCSymbolRefExpr::Create(GetJTISymbol(UID), OutContext);
-    Value = MCBinaryExpr::CreateSub(Value, JTI, OutContext);
+    const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
+    const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, UID, OutContext);
+    Value = MCBinaryExpr::CreateSub(Value, Base, OutContext);
     break;
   }
   }
 
   assert(Value && "Unknown entry kind!");
 
-  unsigned EntrySize = MJTI->getEntrySize(*TM.getDataLayout());
+  unsigned EntrySize =
+      MJTI->getEntrySize(*TM.getSubtargetImpl()->getDataLayout());
   OutStreamer.EmitValue(Value, EntrySize);
 }
 
@@ -1387,7 +1368,7 @@
   }
 
   // Emit the function pointers in the target-specific order
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   unsigned Align = Log2_32(DL->getPointerPrefAlignment());
   std::stable_sort(Structors.begin(), Structors.end(),
                    [](const Structor &L,
@@ -1450,9 +1431,9 @@
   OutStreamer.EmitIntValue(Value, 4);
 }
 
-/// EmitLabelDifference - Emit something like ".long Hi-Lo" where the size
-/// in bytes of the directive is specified by Size and Hi/Lo specify the
-/// labels.  This implicitly uses .set if it is available.
+/// Emit something like ".long Hi-Lo" where the size in bytes of the directive
+/// is specified by Size and Hi/Lo specify the labels. This implicitly uses
+/// .set if it avoids relocations.
 void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
                                      unsigned Size) const {
   // Get the Hi-Lo expression.
@@ -1461,7 +1442,7 @@
                             MCSymbolRefExpr::Create(Lo, OutContext),
                             OutContext);
 
-  if (!MAI->hasSetDirective()) {
+  if (!MAI->doesSetDirectiveSuppressesReloc()) {
     OutStreamer.EmitValue(Diff, Size);
     return;
   }
@@ -1472,36 +1453,6 @@
   OutStreamer.EmitSymbolValue(SetLabel, Size);
 }
 
-/// EmitLabelOffsetDifference - Emit something like ".long Hi+Offset-Lo"
-/// where the size in bytes of the directive is specified by Size and Hi/Lo
-/// specify the labels.  This implicitly uses .set if it is available.
-void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
-                                           const MCSymbol *Lo,
-                                           unsigned Size) const {
-
-  // Emit Hi+Offset - Lo
-  // Get the Hi+Offset expression.
-  const MCExpr *Plus =
-    MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Hi, OutContext),
-                            MCConstantExpr::Create(Offset, OutContext),
-                            OutContext);
-
-  // Get the Hi+Offset-Lo expression.
-  const MCExpr *Diff =
-    MCBinaryExpr::CreateSub(Plus,
-                            MCSymbolRefExpr::Create(Lo, OutContext),
-                            OutContext);
-
-  if (!MAI->hasSetDirective())
-    OutStreamer.EmitValue(Diff, Size);
-  else {
-    // Otherwise, emit with .set (aka assignment).
-    MCSymbol *SetLabel = GetTempSymbol("set", SetCounter++);
-    OutStreamer.EmitAssignment(SetLabel, Diff);
-    OutStreamer.EmitSymbolValue(SetLabel, Size);
-  }
-}
-
 /// EmitLabelPlusOffset - Emit something like ".long Label+Offset"
 /// where the size in bytes of the directive is specified by Size and Label
 /// specifies the label.  This implicitly uses .set if it is available.
@@ -1531,7 +1482,9 @@
 // if required for correctness.
 //
 void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const {
-  if (GV) NumBits = getGVAlignmentLog2(GV, *TM.getDataLayout(), NumBits);
+  if (GV)
+    NumBits = getGVAlignmentLog2(GV, *TM.getSubtargetImpl()->getDataLayout(),
+                                 NumBits);
 
   if (NumBits == 0) return;   // 1-byte aligned: no need to emit alignment.
 
@@ -1577,8 +1530,8 @@
     // If the code isn't optimized, there may be outstanding folding
     // opportunities. Attempt to fold the expression using DataLayout as a
     // last resort before giving up.
-    if (Constant *C =
-          ConstantFoldConstantExpression(CE, AP.TM.getDataLayout()))
+    if (Constant *C = ConstantFoldConstantExpression(
+            CE, AP.TM.getSubtargetImpl()->getDataLayout()))
       if (C != CE)
         return lowerConstant(C, AP);
 
@@ -1592,7 +1545,7 @@
       report_fatal_error(OS.str());
     }
   case Instruction::GetElementPtr: {
-    const DataLayout &DL = *AP.TM.getDataLayout();
+    const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
     // Generate a symbolic expression for the byte address
     APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
     cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI);
@@ -1616,7 +1569,7 @@
     return lowerConstant(CE->getOperand(0), AP);
 
   case Instruction::IntToPtr: {
-    const DataLayout &DL = *AP.TM.getDataLayout();
+    const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
     // Handle casts to pointers by changing them into casts to the appropriate
     // integer type.  This promotes constant folding and simplifies this code.
     Constant *Op = CE->getOperand(0);
@@ -1626,7 +1579,7 @@
   }
 
   case Instruction::PtrToInt: {
-    const DataLayout &DL = *AP.TM.getDataLayout();
+    const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
     // Support only foldable casts to/from pointers that can be eliminated by
     // changing the pointer to the appropriately sized integer type.
     Constant *Op = CE->getOperand(0);
@@ -1699,7 +1652,8 @@
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
     if (CI->getBitWidth() > 64) return -1;
 
-    uint64_t Size = TM.getDataLayout()->getTypeAllocSize(V->getType());
+    uint64_t Size =
+        TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(V->getType());
     uint64_t Value = CI->getZExtValue();
 
     // Make sure the constant is at least 8 bits long and has a power
@@ -1743,7 +1697,9 @@
   // See if we can aggregate this into a .fill, if so, emit it as such.
   int Value = isRepeatedByteSequence(CDS, AP.TM);
   if (Value != -1) {
-    uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CDS->getType());
+    uint64_t Bytes =
+        AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+            CDS->getType());
     // Don't emit a 1-byte object as a .fill.
     if (Bytes > 1)
       return AP.OutStreamer.EmitFill(Bytes, Value);
@@ -1793,7 +1749,7 @@
     }
   }
 
-  const DataLayout &DL = *AP.TM.getDataLayout();
+  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
   unsigned Size = DL.getTypeAllocSize(CDS->getType());
   unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
                         CDS->getNumElements();
@@ -1808,7 +1764,9 @@
   int Value = isRepeatedByteSequence(CA, AP.TM);
 
   if (Value != -1) {
-    uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CA->getType());
+    uint64_t Bytes =
+        AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+            CA->getType());
     AP.OutStreamer.EmitFill(Bytes, Value);
   }
   else {
@@ -1821,7 +1779,7 @@
   for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
     emitGlobalConstantImpl(CV->getOperand(i), AP);
 
-  const DataLayout &DL = *AP.TM.getDataLayout();
+  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
   unsigned Size = DL.getTypeAllocSize(CV->getType());
   unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) *
                          CV->getType()->getNumElements();
@@ -1831,7 +1789,7 @@
 
 static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP) {
   // Print the fields in successive locations. Pad to align if needed!
-  const DataLayout *DL = AP.TM.getDataLayout();
+  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
   unsigned Size = DL->getTypeAllocSize(CS->getType());
   const StructLayout *Layout = DL->getStructLayout(CS->getType());
   uint64_t SizeSoFar = 0;
@@ -1881,7 +1839,7 @@
 
   // PPC's long double has odd notions of endianness compared to how LLVM
   // handles it: p[0] goes first for *big* endian on PPC.
-  if (AP.TM.getDataLayout()->isBigEndian() &&
+  if (AP.TM.getSubtargetImpl()->getDataLayout()->isBigEndian() &&
       !CFP->getType()->isPPC_FP128Ty()) {
     int Chunk = API.getNumWords() - 1;
 
@@ -1900,13 +1858,13 @@
   }
 
   // Emit the tail padding for the long double.
-  const DataLayout &DL = *AP.TM.getDataLayout();
+  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
   AP.OutStreamer.EmitZeros(DL.getTypeAllocSize(CFP->getType()) -
                            DL.getTypeStoreSize(CFP->getType()));
 }
 
 static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
-  const DataLayout *DL = AP.TM.getDataLayout();
+  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
   unsigned BitWidth = CI->getBitWidth();
 
   // Copy the value as we may massage the layout for constants whose bit width
@@ -1952,7 +1910,8 @@
     // Emit the extra bits after the 64-bits chunks.
 
     // Emit a directive that fills the expected size.
-    uint64_t Size = AP.TM.getDataLayout()->getTypeAllocSize(CI->getType());
+    uint64_t Size = AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+        CI->getType());
     Size -= (BitWidth / 64) * 8;
     assert(Size && Size * 8 >= ExtraBitsSize &&
            (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize)))
@@ -1962,7 +1921,7 @@
 }
 
 static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
-  const DataLayout *DL = AP.TM.getDataLayout();
+  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
   uint64_t Size = DL->getTypeAllocSize(CV->getType());
   if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
     return AP.OutStreamer.EmitZeros(Size);
@@ -2027,7 +1986,8 @@
 
 /// EmitGlobalConstant - Print a general LLVM constant to the .s file.
 void AsmPrinter::EmitGlobalConstant(const Constant *CV) {
-  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
+  uint64_t Size =
+      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(CV->getType());
   if (Size)
     emitGlobalConstantImpl(CV, *this);
   else if (MAI->hasSubsectionsViaSymbols()) {
@@ -2056,7 +2016,7 @@
 /// GetTempSymbol - Return the MCSymbol corresponding to the assembler
 /// temporary label with the specified stem and unique ID.
 MCSymbol *AsmPrinter::GetTempSymbol(Twine Name, unsigned ID) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   return OutContext.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix()) +
                                       Name + Twine(ID));
 }
@@ -2064,7 +2024,7 @@
 /// GetTempSymbol - Return an assembler temporary label with the specified
 /// stem.
 MCSymbol *AsmPrinter::GetTempSymbol(Twine Name) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   return OutContext.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
                                       Name);
 }
@@ -2080,7 +2040,7 @@
 
 /// GetCPISymbol - Return the symbol for the specified constant pool entry.
 MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   return OutContext.GetOrCreateSymbol
     (Twine(DL->getPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber())
      + "_" + Twine(CPID));
@@ -2094,7 +2054,7 @@
 /// GetJTSetSymbol - Return the symbol for the specified jump table .set
 /// FIXME: privatize to AsmPrinter.
 MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   return OutContext.GetOrCreateSymbol
   (Twine(DL->getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "_" +
    Twine(UID) + "_set_" + Twine(MBBID));

diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 02cd12b..05f6a68 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp

@@ -27,6 +27,7 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
@@ -130,7 +131,7 @@
   default:
     llvm_unreachable("Invalid encoded value.");
   case dwarf::DW_EH_PE_absptr:
-    return TM.getDataLayout()->getPointerSize();
+    return TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
   case dwarf::DW_EH_PE_udata2:
     return 2;
   case dwarf::DW_EH_PE_udata4:
@@ -214,13 +215,10 @@
     Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref");
 }
 
-/// Emit a dwarf register operation for describing
-/// - a small value occupying only part of a register or
-/// - a small register representing only part of a value.
-static void emitDwarfOpPiece(ByteStreamer &Streamer, unsigned SizeInBits,
-                             unsigned OffsetInBits) {
-  assert(SizeInBits > 0 && "zero-sized piece");
-  unsigned SizeOfByte = 8;
+void AsmPrinter::EmitDwarfOpPiece(ByteStreamer &Streamer, unsigned SizeInBits,
+                                  unsigned OffsetInBits) const {
+  assert(SizeInBits > 0 && "piece has size zero");
+  const unsigned SizeOfByte = 8;
   if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
     Streamer.EmitInt8(dwarf::DW_OP_bit_piece, "DW_OP_bit_piece");
     Streamer.EmitULEB128(SizeInBits, Twine(SizeInBits));
@@ -249,13 +247,13 @@
                                      unsigned PieceSizeInBits,
                                      unsigned PieceOffsetInBits) const {
   assert(MLoc.isReg() && "MLoc must be a register");
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
   int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
 
   // If this is a valid register number, emit it.
   if (Reg >= 0) {
     emitDwarfRegOp(Streamer, Reg);
-    emitDwarfOpPiece(Streamer, PieceSizeInBits, PieceOffsetInBits);
+    EmitDwarfOpPiece(Streamer, PieceSizeInBits, PieceOffsetInBits);
     return;
   }
 
@@ -266,19 +264,19 @@
     if (Reg >= 0) {
       unsigned Idx = TRI->getSubRegIndex(*SR, MLoc.getReg());
       unsigned Size = TRI->getSubRegIdxSize(Idx);
-      unsigned Offset = TRI->getSubRegIdxOffset(Idx);
+      unsigned RegOffset = TRI->getSubRegIdxOffset(Idx);
       OutStreamer.AddComment("super-register");
       emitDwarfRegOp(Streamer, Reg);
-      if (PieceOffsetInBits == Offset) {
-        emitDwarfOpPiece(Streamer, Size, Offset);
+      if (PieceOffsetInBits == RegOffset) {
+        EmitDwarfOpPiece(Streamer, Size, RegOffset);
       } else {
         // If this is part of a variable in a sub-register at a
         // non-zero offset, we need to manually shift the value into
         // place, since the DW_OP_piece describes the part of the
         // variable, not the position of the subregister.
-        emitDwarfOpPiece(Streamer, Size, PieceOffsetInBits);
-        if (Offset)
-          emitDwarfOpShr(Streamer, Offset);
+        if (RegOffset)
+          emitDwarfOpShr(Streamer, RegOffset);
+        EmitDwarfOpPiece(Streamer, Size, PieceOffsetInBits);
       }
       return;
     }
@@ -312,7 +310,7 @@
     if (Reg >= 0 && Intersection.any()) {
       OutStreamer.AddComment("sub-register");
       emitDwarfRegOp(Streamer, Reg);
-      emitDwarfOpPiece(Streamer, Size, Offset == CurPos ? 0 : Offset);
+      EmitDwarfOpPiece(Streamer, Size, Offset == CurPos ? 0 : Offset);
       CurPos = Offset + Size;
 
       // Mark it as emitted.
@@ -331,7 +329,7 @@
 void AsmPrinter::EmitDwarfRegOp(ByteStreamer &Streamer,
                                 const MachineLocation &MLoc,
                                 bool Indirect) const {
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
   int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
   if (Reg < 0) {
     // We assume that pointers are always in an addressable register.

diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h b/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
index 2825367..31867dd 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H__
-#define CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H
 
 #include "llvm/Support/DataTypes.h"
 

diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 46ee0c8..cca5f22 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp

@@ -33,6 +33,7 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
@@ -110,14 +111,14 @@
     HasDiagHandler = true;
   }
 
-  MemoryBuffer *Buffer;
+  std::unique_ptr<MemoryBuffer> Buffer;
   if (isNullTerminated)
     Buffer = MemoryBuffer::getMemBuffer(Str, "<inline asm>");
   else
     Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>");
 
   // Tell SrcMgr about this buffer, it takes ownership of the buffer.
-  SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
+  SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
 
   std::unique_ptr<MCAsmParser> Parser(
       createMCAsmParser(SrcMgr, OutContext, OutStreamer, *MAI));
@@ -146,6 +147,10 @@
                        " we don't have an asm parser for this target\n");
   Parser->setAssemblerDialect(Dialect);
   Parser->setTargetParser(*TAP.get());
+  if (MF) {
+    const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+    TAP->SetFrameRegister(TRI->getFrameRegister(*MF));
+  }
 
   // Don't implicitly switch to the text section before the asm.
   int Res = Parser->Run(/*NoInitialTextSection*/ true,
@@ -500,7 +505,7 @@
 /// for their own strange codes.
 void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
                               const char *Code) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   if (!strcmp(Code, "private")) {
     OS << DL->getPrivateGlobalPrefix();
   } else if (!strcmp(Code, "comment")) {

diff --git a/lib/CodeGen/AsmPrinter/ByteStreamer.h b/lib/CodeGen/AsmPrinter/ByteStreamer.h
index 6c01d65..0cc8353 100644
--- a/lib/CodeGen/AsmPrinter/ByteStreamer.h
+++ b/lib/CodeGen/AsmPrinter/ByteStreamer.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_BYTESTREAMER_H
-#define LLVM_CODEGEN_BYTESTREAMER_H
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/AsmPrinter.h"

diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index f555f21..e6b7d64 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt

@@ -9,6 +9,7 @@
   DIEHash.cpp
   DwarfAccelTable.cpp
   DwarfCFIException.cpp
+  DwarfCompileUnit.cpp
   DwarfDebug.cpp
   DwarfFile.cpp
   DwarfStringPool.cpp

diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index c3dcd9c..50ea369 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp

@@ -12,12 +12,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "DIE.h"
+
+#include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
 #include "DwarfUnit.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Debug.h"
@@ -370,6 +373,29 @@
 // DIEEntry Implementation
 //===----------------------------------------------------------------------===//
 
+/// Emit something like ".long Hi+Offset-Lo" where the size in bytes of the
+/// directive is specified by Size and Hi/Lo specify the labels.
+static void emitLabelOffsetDifference(MCStreamer &Streamer, const MCSymbol *Hi,
+                                      uint64_t Offset, const MCSymbol *Lo,
+                                      unsigned Size) {
+  MCContext &Context = Streamer.getContext();
+
+  // Emit Hi+Offset - Lo
+  // Get the Hi+Offset expression.
+  const MCExpr *Plus =
+      MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Hi, Context),
+                              MCConstantExpr::Create(Offset, Context), Context);
+
+  // Get the Hi+Offset-Lo expression.
+  const MCExpr *Diff = MCBinaryExpr::CreateSub(
+      Plus, MCSymbolRefExpr::Create(Lo, Context), Context);
+
+  // Emit via a temporary symbol defined with .set (aka assignment).
+  MCSymbol *SetLabel = Context.CreateTempSymbol();
+  Streamer.EmitAssignment(SetLabel, Diff);
+  Streamer.EmitSymbolValue(SetLabel, Size);
+}
+
 /// EmitValue - Emit debug information entry offset.
 ///
 void DIEEntry::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
@@ -388,9 +414,9 @@
       AP->EmitLabelPlusOffset(CU->getSectionSym(), Addr,
                               DIEEntry::getRefAddrSize(AP));
     else
-      AP->EmitLabelOffsetDifference(CU->getSectionSym(), Addr,
-                                    CU->getSectionSym(),
-                                    DIEEntry::getRefAddrSize(AP));
+      emitLabelOffsetDifference(AP->OutStreamer, CU->getSectionSym(), Addr,
+                                CU->getSectionSym(),
+                                DIEEntry::getRefAddrSize(AP));
   } else
     AP->EmitInt32(Entry.getOffset());
 }

diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
index ef05f17..e310aef 100644
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ b/lib/CodeGen/AsmPrinter/DIE.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DIE_H__
-#define CODEGEN_ASMPRINTER_DIE_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DIE_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DIE_H
 
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -381,10 +381,10 @@
 ///
 class DIEString : public DIEValue {
   const DIEValue *Access;
-  const StringRef Str;
+  StringRef Str;
 
 public:
-  DIEString(const DIEValue *Acc, const StringRef S)
+  DIEString(const DIEValue *Acc, StringRef S)
       : DIEValue(isString), Access(Acc), Str(S) {}
 
   /// getString - Grab the string out of the object.

diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
index c2fad59..b2a3ba8 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp

@@ -261,7 +261,7 @@
     return;
   }
 
-  // otherwise, b) use the letter 'T' as a the marker, ...
+  // otherwise, b) use the letter 'T' as the marker, ...
   addULEB128('T');
 
   addULEB128(Attribute);

diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h
index 175d660..872aa0e 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.h
+++ b/lib/CodeGen/AsmPrinter/DIEHash.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DIEHASH_H__
-#define CODEGEN_ASMPRINTER_DIEHASH_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
 
 #include "DIE.h"
 #include "llvm/ADT/DenseMap.h"

diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
index a66d08e..0c2a5e5 100644
--- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
+++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp

@@ -8,25 +8,25 @@
 //===----------------------------------------------------------------------===//
 
 #include "DbgValueHistoryCalculator.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <algorithm>
 #include <map>
-#include <set>
+using namespace llvm;
 
 #define DEBUG_TYPE "dwarfdebug"
 
-namespace llvm {
-
 // \brief If @MI is a DBG_VALUE with debug value described by a
 // defined register, returns the number of this register.
 // In the other case, returns 0.
 static unsigned isDescribedByReg(const MachineInstr &MI) {
   assert(MI.isDebugValue());
-  assert(MI.getNumOperands() == 3);
+  assert(MI.getNumOperands() == 4);
   // If location of variable is described using a register (directly or
   // indirecltly), this register is always a first operand.
   return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0;
@@ -36,7 +36,7 @@
                                          const MachineInstr &MI) {
   // Instruction range should start with a DBG_VALUE instruction for the
   // variable.
-  assert(MI.isDebugValue() && MI.getDebugVariable() == Var);
+  assert(MI.isDebugValue() && "not a DBG_VALUE");
   auto &Ranges = VarInstrRanges[Var];
   if (!Ranges.empty() && Ranges.back().second == nullptr &&
       Ranges.back().first->isIdenticalTo(&MI)) {
@@ -96,6 +96,19 @@
   VarSet.push_back(Var);
 }
 
+// \brief Terminate the location range for variables described by register at
+// @I by inserting @ClobberingInstr to their history.
+static void clobberRegisterUses(RegDescribedVarsMap &RegVars,
+                                RegDescribedVarsMap::iterator I,
+                                DbgValueHistoryMap &HistMap,
+                                const MachineInstr &ClobberingInstr) {
+  // Iterate over all variables described by this register and add this
+  // instruction to their history, clobbering it.
+  for (const auto &Var : I->second)
+    HistMap.endInstrRange(Var, ClobberingInstr);
+  RegVars.erase(I);
+}
+
 // \brief Terminate the location range for variables described by register
 // @RegNo by inserting @ClobberingInstr to their history.
 static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo,
@@ -104,22 +117,26 @@
   const auto &I = RegVars.find(RegNo);
   if (I == RegVars.end())
     return;
-  // Iterate over all variables described by this register and add this
-  // instruction to their history, clobbering it.
-  for (const auto &Var : I->second)
-    HistMap.endInstrRange(Var, ClobberingInstr);
-  RegVars.erase(I);
+  clobberRegisterUses(RegVars, I, HistMap, ClobberingInstr);
 }
 
-// \brief Collect all registers clobbered by @MI and insert them to @Regs.
-static void collectClobberedRegisters(const MachineInstr &MI,
+// \brief Collect all registers clobbered by @MI and apply the functor
+// @Func to their RegNo.
+// @Func should be a functor with a void(unsigned) signature. We're
+// not using std::function here for performance reasons. It has a
+// small but measurable impact. By using a functor instead of a
+// std::set& here, we can avoid the overhead of constructing
+// temporaries in calculateDbgValueHistory, which has a significant
+// performance impact.
+template<typename Callable>
+static void applyToClobberedRegisters(const MachineInstr &MI,
                                       const TargetRegisterInfo *TRI,
-                                      std::set<unsigned> &Regs) {
+                                      Callable Func) {
   for (const MachineOperand &MO : MI.operands()) {
     if (!MO.isReg() || !MO.isDef() || !MO.getReg())
       continue;
     for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
-      Regs.insert(*AI);
+      Func(*AI);
   }
 }
 
@@ -133,11 +150,12 @@
   // as the return instruction.
   DebugLoc LastLoc = LastMI->getDebugLoc();
   auto Res = LastMI;
-  for (MachineBasicBlock::const_reverse_iterator I(std::next(LastMI)); I != MBB.rend();
-       ++I) {
+  for (MachineBasicBlock::const_reverse_iterator I(std::next(LastMI)),
+       E = MBB.rend();
+       I != E; ++I) {
     if (I->getDebugLoc() != LastLoc)
       return Res;
-    Res = std::prev(I.base());
+    Res = &*I;
   }
   // If all instructions have the same debug location, assume whole MBB is
   // an epilogue.
@@ -145,25 +163,26 @@
 }
 
 // \brief Collect registers that are modified in the function body (their
-// contents is changed only in the prologue and epilogue).
+// contents are changed outside of the prologue and epilogue).
 static void collectChangingRegs(const MachineFunction *MF,
                                 const TargetRegisterInfo *TRI,
-                                std::set<unsigned> &Regs) {
+                                BitVector &Regs) {
   for (const auto &MBB : *MF) {
     auto FirstEpilogueInst = getFirstEpilogueInst(MBB);
-    bool IsInEpilogue = false;
+
     for (const auto &MI : MBB) {
-      IsInEpilogue |= &MI == FirstEpilogueInst;
-      if (!MI.getFlag(MachineInstr::FrameSetup) && !IsInEpilogue)
-        collectClobberedRegisters(MI, TRI, Regs);
+      if (&MI == FirstEpilogueInst)
+        break;
+      if (!MI.getFlag(MachineInstr::FrameSetup))
+        applyToClobberedRegisters(MI, TRI, [&](unsigned r) { Regs.set(r); });
     }
   }
 }
 
-void calculateDbgValueHistory(const MachineFunction *MF,
-                              const TargetRegisterInfo *TRI,
-                              DbgValueHistoryMap &Result) {
-  std::set<unsigned> ChangingRegs;
+void llvm::calculateDbgValueHistory(const MachineFunction *MF,
+                                    const TargetRegisterInfo *TRI,
+                                    DbgValueHistoryMap &Result) {
+  BitVector ChangingRegs(TRI->getNumRegs());
   collectChangingRegs(MF, TRI, ChangingRegs);
 
   RegDescribedVarsMap RegVars;
@@ -172,17 +191,18 @@
       if (!MI.isDebugValue()) {
         // Not a DBG_VALUE instruction. It may clobber registers which describe
         // some variables.
-        std::set<unsigned> MIClobberedRegs;
-        collectClobberedRegisters(MI, TRI, MIClobberedRegs);
-        for (unsigned RegNo : MIClobberedRegs) {
-          if (ChangingRegs.count(RegNo))
+        applyToClobberedRegisters(MI, TRI, [&](unsigned RegNo) {
+          if (ChangingRegs.test(RegNo))
             clobberRegisterUses(RegVars, RegNo, Result, MI);
-        }
+        });
         continue;
       }
 
       assert(MI.getNumOperands() > 1 && "Invalid DBG_VALUE instruction!");
-      const MDNode *Var = MI.getDebugVariable();
+      // Use the base variable (without any DW_OP_piece expressions)
+      // as index into History. The full variables including the
+      // piece expressions are attached to the MI.
+      DIVariable Var = MI.getDebugVariable();
 
       if (unsigned PrevReg = Result.getRegisterForVar(Var))
         dropRegDescribedVar(RegVars, PrevReg, Var);
@@ -196,11 +216,12 @@
     // Make sure locations for register-described variables are valid only
     // until the end of the basic block (unless it's the last basic block, in
     // which case let their liveness run off to the end of the function).
-    if (!MBB.empty() &&  &MBB != &MF->back()) {
-      for (unsigned RegNo : ChangingRegs)
-        clobberRegisterUses(RegVars, RegNo, Result, MBB.back());
+    if (!MBB.empty() && &MBB != &MF->back()) {
+      for (auto I = RegVars.begin(), E = RegVars.end(); I != E;) {
+        auto CurElem = I++; // CurElem can be erased below.
+        if (ChangingRegs.test(CurElem->first))
+          clobberRegisterUses(RegVars, CurElem, Result, MBB.back());
+      }
     }
   }
 }
-
-}

diff --git a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
index b9177f0..4b62007 100644
--- a/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
+++ b/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H_
-#define CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H_
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H
 
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
@@ -28,9 +28,11 @@
   // range. If end is not specified, location is valid until the start
   // instruction of the next instruction range, or until the end of the
   // function.
+public:
   typedef std::pair<const MachineInstr *, const MachineInstr *> InstrRange;
   typedef SmallVector<InstrRange, 4> InstrRanges;
   typedef MapVector<const MDNode *, InstrRanges> InstrRangesMap;
+private:
   InstrRangesMap VarInstrRanges;
 
 public:

diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index 3beb799..6cca985 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h

@@ -7,14 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H__
-#define CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/MC/MCSymbol.h"
 
 namespace llvm {
-class DwarfCompileUnit;
 class MDNode;
 /// \brief This struct describes location entries emitted in the .debug_loc
 /// section.
@@ -26,25 +26,30 @@
 public:
   /// A single location or constant.
   struct Value {
-    Value(const MDNode *Var, int64_t i)
-      : Variable(Var), EntryKind(E_Integer) {
+    Value(const MDNode *Var, const MDNode *Expr, int64_t i)
+        : Variable(Var), Expression(Expr), EntryKind(E_Integer) {
       Constant.Int = i;
     }
-    Value(const MDNode *Var, const ConstantFP *CFP)
-      : Variable(Var), EntryKind(E_ConstantFP) {
+    Value(const MDNode *Var, const MDNode *Expr, const ConstantFP *CFP)
+        : Variable(Var), Expression(Expr), EntryKind(E_ConstantFP) {
       Constant.CFP = CFP;
     }
-    Value(const MDNode *Var, const ConstantInt *CIP)
-      : Variable(Var), EntryKind(E_ConstantInt) {
+    Value(const MDNode *Var, const MDNode *Expr, const ConstantInt *CIP)
+        : Variable(Var), Expression(Expr), EntryKind(E_ConstantInt) {
       Constant.CIP = CIP;
     }
-    Value(const MDNode *Var, MachineLocation Loc)
-      : Variable(Var), EntryKind(E_Location), Loc(Loc) {
+    Value(const MDNode *Var, const MDNode *Expr, MachineLocation Loc)
+        : Variable(Var), Expression(Expr), EntryKind(E_Location), Loc(Loc) {
+      assert(DIVariable(Var).Verify());
+      assert(DIExpression(Expr).Verify());
     }
 
     // The variable to which this location entry corresponds.
     const MDNode *Variable;
 
+    // Any complex address location expression for this Value.
+    const MDNode *Expression;
+
     // Type of entry that this represents.
     enum EntryType { E_Location, E_Integer, E_ConstantFP, E_ConstantInt };
     enum EntryType EntryKind;
@@ -59,23 +64,6 @@
     // Or a location in the machine frame.
     MachineLocation Loc;
 
-    bool operator==(const Value &other) const {
-      if (EntryKind != other.EntryKind)
-        return false;
-
-      switch (EntryKind) {
-      case E_Location:
-        return Loc == other.Loc;
-      case E_Integer:
-        return Constant.Int == other.Constant.Int;
-      case E_ConstantFP:
-        return Constant.CFP == other.Constant.CFP;
-      case E_ConstantInt:
-        return Constant.CIP == other.Constant.CIP;
-      }
-      llvm_unreachable("unhandled EntryKind");
-    }
-
     bool isLocation() const { return EntryKind == E_Location; }
     bool isInt() const { return EntryKind == E_Integer; }
     bool isConstantFP() const { return EntryKind == E_ConstantFP; }
@@ -84,40 +72,114 @@
     const ConstantFP *getConstantFP() const { return Constant.CFP; }
     const ConstantInt *getConstantInt() const { return Constant.CIP; }
     MachineLocation getLoc() const { return Loc; }
-    const MDNode *getVariable() const { return Variable; }
+    const MDNode *getVariableNode() const { return Variable; }
+    DIVariable getVariable() const { return DIVariable(Variable); }
+    bool isVariablePiece() const { return getExpression().isVariablePiece(); }
+    DIExpression getExpression() const { return DIExpression(Expression); }
+    friend bool operator==(const Value &, const Value &);
+    friend bool operator<(const Value &, const Value &);
   };
+
 private:
-  /// A list of locations/constants belonging to this entry.
+  /// A nonempty list of locations/constants belonging to this entry,
+  /// sorted by offset.
   SmallVector<Value, 1> Values;
 
-  /// The compile unit that this location entry is referenced by.
-  const DwarfCompileUnit *Unit;
-
 public:
-  DebugLocEntry() : Begin(nullptr), End(nullptr), Unit(nullptr) {}
-  DebugLocEntry(const MCSymbol *B, const MCSymbol *E,
-                Value Val, const DwarfCompileUnit *U)
-      : Begin(B), End(E), Unit(U) {
+  DebugLocEntry(const MCSymbol *B, const MCSymbol *E, Value Val)
+      : Begin(B), End(E) {
     Values.push_back(std::move(Val));
   }
 
+  /// \brief If this and Next are describing different pieces of the same
+  /// variable, merge them by appending Next's values to the current
+  /// list of values.
+  /// Return true if the merge was successful.
+  bool MergeValues(const DebugLocEntry &Next) {
+    if (Begin == Next.Begin) {
+      DIExpression Expr(Values[0].Expression);
+      DIVariable Var(Values[0].Variable);
+      DIExpression NextExpr(Next.Values[0].Expression);
+      DIVariable NextVar(Next.Values[0].Variable);
+      if (Var == NextVar && Expr.isVariablePiece() &&
+          NextExpr.isVariablePiece()) {
+        addValues(Next.Values);
+        End = Next.End;
+        return true;
+      }
+    }
+    return false;
+  }
+
   /// \brief Attempt to merge this DebugLocEntry with Next and return
   /// true if the merge was successful. Entries can be merged if they
   /// share the same Loc/Constant and if Next immediately follows this
   /// Entry.
-  bool Merge(const DebugLocEntry &Next) {
+  bool MergeRanges(const DebugLocEntry &Next) {
+    // If this and Next are describing the same variable, merge them.
     if ((End == Next.Begin && Values == Next.Values)) {
       End = Next.End;
       return true;
     }
     return false;
   }
+
   const MCSymbol *getBeginSym() const { return Begin; }
   const MCSymbol *getEndSym() const { return End; }
-  const DwarfCompileUnit *getCU() const { return Unit; }
-  const ArrayRef<Value> getValues() const { return Values; }
-  void addValue(Value Val) { Values.push_back(Val); }
+  ArrayRef<Value> getValues() const { return Values; }
+  void addValues(ArrayRef<DebugLocEntry::Value> Vals) {
+    Values.append(Vals.begin(), Vals.end());
+    sortUniqueValues();
+    assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value V){
+          return V.isVariablePiece();
+        }) && "value must be a piece");
+  }
+
+  // Sort the pieces by offset.
+  // Remove any duplicate entries by dropping all but the first.
+  void sortUniqueValues() {
+    std::sort(Values.begin(), Values.end());
+    Values.erase(std::unique(Values.begin(), Values.end(),
+                             [](const Value &A, const Value &B) {
+                   return A.getVariable() == B.getVariable() &&
+                          A.getExpression() == B.getExpression();
+                 }),
+                 Values.end());
+  }
 };
 
+/// Compare two Values for equality.
+inline bool operator==(const DebugLocEntry::Value &A,
+                       const DebugLocEntry::Value &B) {
+  if (A.EntryKind != B.EntryKind)
+    return false;
+
+  if (A.Expression != B.Expression)
+    return false;
+
+  if (A.Variable != B.Variable)
+    return false;
+
+  switch (A.EntryKind) {
+  case DebugLocEntry::Value::E_Location:
+    return A.Loc == B.Loc;
+  case DebugLocEntry::Value::E_Integer:
+    return A.Constant.Int == B.Constant.Int;
+  case DebugLocEntry::Value::E_ConstantFP:
+    return A.Constant.CFP == B.Constant.CFP;
+  case DebugLocEntry::Value::E_ConstantInt:
+    return A.Constant.CIP == B.Constant.CIP;
+  }
+  llvm_unreachable("unhandled EntryKind");
 }
+
+/// Compare two pieces based on their offset.
+inline bool operator<(const DebugLocEntry::Value &A,
+                      const DebugLocEntry::Value &B) {
+  return A.getExpression().getPieceOffset() <
+         B.getExpression().getPieceOffset();
+}
+
+}
+
 #endif

diff --git a/lib/CodeGen/AsmPrinter/DebugLocList.h b/lib/CodeGen/AsmPrinter/DebugLocList.h
index 7a51c7b..2a4f58f 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocList.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocList.h

@@ -7,16 +7,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DEBUGLOCLIST_H__
-#define CODEGEN_ASMPRINTER_DEBUGLOCLIST_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCLIST_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCLIST_H
 
-#include "llvm/MC/MCSymbol.h"
 #include "llvm/ADT/SmallVector.h"
 #include "DebugLocEntry.h"
 
 namespace llvm {
+class DwarfCompileUnit;
+class MCSymbol;
 struct DebugLocList {
   MCSymbol *Label;
+  DwarfCompileUnit *CU;
   SmallVector<DebugLocEntry, 4> List;
 };
 }

diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
index e9527c4..7e87566 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp

@@ -13,6 +13,7 @@
 
 #include "DwarfAccelTable.h"
 #include "DIE.h"
+#include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Twine.h"
@@ -174,7 +175,8 @@
 // Walk through the buckets and emit the full data for each element in
 // the bucket. For the string case emit the dies and the various offsets.
 // Terminate each HashData bucket with 0.
-void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfFile *D) {
+void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfDebug *D,
+                               MCSymbol *StrSym) {
   uint64_t PrevHash = UINT64_MAX;
   for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
     for (HashList::const_iterator HI = Buckets[i].begin(),
@@ -183,13 +185,14 @@
       // Remember to emit the label for our offset.
       Asm->OutStreamer.EmitLabel((*HI)->Sym);
       Asm->OutStreamer.AddComment((*HI)->Str);
-      Asm->EmitSectionOffset((*HI)->Data.StrSym,
-                             D->getStringPool().getSectionSymbol());
+      Asm->EmitSectionOffset((*HI)->Data.StrSym, StrSym);
       Asm->OutStreamer.AddComment("Num DIEs");
       Asm->EmitInt32((*HI)->Data.Values.size());
       for (HashDataContents *HD : (*HI)->Data.Values) {
         // Emit the DIE offset
-        Asm->EmitInt32(HD->Die->getOffset());
+        DwarfCompileUnit *CU = D->lookupUnit(HD->Die->getUnit());
+        assert(CU && "Accelerated DIE should belong to a CU.");
+        Asm->EmitInt32(HD->Die->getOffset() + CU->getDebugInfoOffset());
         // If we have multiple Atoms emit that info too.
         // FIXME: A bit of a hack, we either emit only one atom or all info.
         if (HeaderData.Atoms.size() > 1) {
@@ -206,7 +209,8 @@
 }
 
 // Emit the entire data structure to the output file.
-void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin, DwarfFile *D) {
+void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin, DwarfDebug *D,
+                           MCSymbol *StrSym) {
   // Emit the header.
   EmitHeader(Asm);
 
@@ -220,7 +224,7 @@
   EmitOffsets(Asm, SecBegin);
 
   // Emit the hash data.
-  EmitData(Asm, D);
+  EmitData(Asm, D, StrSym);
 }
 
 #ifndef NDEBUG

diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
index a3cc95f..3cdf678 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DWARFACCELTABLE_H__
-#define CODEGEN_ASMPRINTER_DWARFACCELTABLE_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
 
 #include "DIE.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -62,7 +62,7 @@
 namespace llvm {
 
 class AsmPrinter;
-class DwarfFile;
+class DwarfDebug;
 
 class DwarfAccelTable {
 
@@ -140,7 +140,7 @@
 private:
   struct TableHeaderData {
     uint32_t die_offset_base;
-    SmallVector<Atom, 1> Atoms;
+    SmallVector<Atom, 3> Atoms;
 
     TableHeaderData(ArrayRef<Atom> AtomList, uint32_t offset = 0)
         : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) {}
@@ -223,7 +223,7 @@
   void EmitBuckets(AsmPrinter *);
   void EmitHashes(AsmPrinter *);
   void EmitOffsets(AsmPrinter *, MCSymbol *);
-  void EmitData(AsmPrinter *, DwarfFile *D);
+  void EmitData(AsmPrinter *, DwarfDebug *D, MCSymbol *StrSym);
 
   // Allocator for HashData and HashDataContents.
   BumpPtrAllocator Allocator;
@@ -248,7 +248,7 @@
   void AddName(StringRef Name, MCSymbol *StrSym, const DIE *Die,
                char Flags = 0);
   void FinalizeTable(AsmPrinter *, StringRef);
-  void Emit(AsmPrinter *, MCSymbol *, DwarfFile *);
+  void Emit(AsmPrinter *, MCSymbol *, DwarfDebug *, MCSymbol *StrSym);
 #ifndef NDEBUG
   void print(raw_ostream &O);
   void dump() { print(dbgs()); }

diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 74215aa..0dc52da 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp

@@ -51,7 +51,7 @@
   if (moveTypeModule == AsmPrinter::CFI_M_Debug)
     Asm->OutStreamer.EmitCFISections(false, true);
 
-  if (!Asm->MAI->isExceptionHandlingDwarf())
+  if (!Asm->MAI->usesItaniumLSDAForExceptions())
     return;
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();

diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
new file mode 100644
index 0000000..2f1b0e5
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp

@@ -0,0 +1,862 @@
+#include "DwarfCompileUnit.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+
+DwarfCompileUnit::DwarfCompileUnit(unsigned UID, DICompileUnit Node,
+                                   AsmPrinter *A, DwarfDebug *DW,
+                                   DwarfFile *DWU)
+    : DwarfUnit(UID, dwarf::DW_TAG_compile_unit, Node, A, DW, DWU),
+      Skeleton(nullptr), LabelBegin(nullptr), BaseAddress(nullptr) {
+  insertDIE(Node, &getUnitDie());
+}
+
+/// addLabelAddress - Add a dwarf label attribute data and value using
+/// DW_FORM_addr or DW_FORM_GNU_addr_index.
+///
+void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
+                                       const MCSymbol *Label) {
+
+  // Don't use the address pool in non-fission or in the skeleton unit itself.
+  // FIXME: Once GDB supports this, it's probably worthwhile using the address
+  // pool from the skeleton - maybe even in non-fission (possibly fewer
+  // relocations by sharing them in the pool, but we have other ideas about how
+  // to reduce the number of relocations as well/instead).
+  if (!DD->useSplitDwarf() || !Skeleton)
+    return addLocalLabelAddress(Die, Attribute, Label);
+
+  if (Label)
+    DD->addArangeLabel(SymbolCU(this, Label));
+
+  unsigned idx = DD->getAddressPool().getIndex(Label);
+  DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
+  Die.addValue(Attribute, dwarf::DW_FORM_GNU_addr_index, Value);
+}
+
+void DwarfCompileUnit::addLocalLabelAddress(DIE &Die,
+                                            dwarf::Attribute Attribute,
+                                            const MCSymbol *Label) {
+  if (Label)
+    DD->addArangeLabel(SymbolCU(this, Label));
+
+  Die.addValue(Attribute, dwarf::DW_FORM_addr,
+               Label ? (DIEValue *)new (DIEValueAllocator) DIELabel(Label)
+                     : new (DIEValueAllocator) DIEInteger(0));
+}
+
+unsigned DwarfCompileUnit::getOrCreateSourceID(StringRef FileName,
+                                               StringRef DirName) {
+  // If we print assembly, we can't separate .file entries according to
+  // compile units. Thus all files will belong to the default compile unit.
+
+  // FIXME: add a better feature test than hasRawTextSupport. Even better,
+  // extend .file to support this.
+  return Asm->OutStreamer.EmitDwarfFileDirective(
+      0, DirName, FileName,
+      Asm->OutStreamer.hasRawTextSupport() ? 0 : getUniqueID());
+}
+
+// Return const expression if value is a GEP to access merged global
+// constant. e.g.
+// i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 0)
+static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
+  const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(V);
+  if (!CE || CE->getNumOperands() != 3 ||
+      CE->getOpcode() != Instruction::GetElementPtr)
+    return nullptr;
+
+  // First operand points to a global struct.
+  Value *Ptr = CE->getOperand(0);
+  if (!isa<GlobalValue>(Ptr) ||
+      !isa<StructType>(cast<PointerType>(Ptr->getType())->getElementType()))
+    return nullptr;
+
+  // Second operand is zero.
+  const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CE->getOperand(1));
+  if (!CI || !CI->isZero())
+    return nullptr;
+
+  // Third operand is offset.
+  if (!isa<ConstantInt>(CE->getOperand(2)))
+    return nullptr;
+
+  return CE;
+}
+
+/// getOrCreateGlobalVariableDIE - get or create global variable DIE.
+DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(DIGlobalVariable GV) {
+  // Check for pre-existence.
+  if (DIE *Die = getDIE(GV))
+    return Die;
+
+  assert(GV.isGlobalVariable());
+
+  DIScope GVContext = DD->resolve(GV.getContext());
+  DIType GTy = DD->resolve(GV.getType());
+
+  // Construct the context before querying for the existence of the DIE in
+  // case such construction creates the DIE.
+  DIE *ContextDIE = getOrCreateContextDIE(GVContext);
+
+  // Add to map.
+  DIE *VariableDIE = &createAndAddDIE(GV.getTag(), *ContextDIE, GV);
+  DIScope DeclContext;
+
+  if (DIDerivedType SDMDecl = GV.getStaticDataMemberDeclaration()) {
+    DeclContext = resolve(SDMDecl.getContext());
+    assert(SDMDecl.isStaticMember() && "Expected static member decl");
+    assert(GV.isDefinition());
+    // We need the declaration DIE that is in the static member's class.
+    DIE *VariableSpecDIE = getOrCreateStaticMemberDIE(SDMDecl);
+    addDIEEntry(*VariableDIE, dwarf::DW_AT_specification, *VariableSpecDIE);
+  } else {
+    DeclContext = resolve(GV.getContext());
+    // Add name and type.
+    addString(*VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
+    addType(*VariableDIE, GTy);
+
+    // Add scoping info.
+    if (!GV.isLocalToUnit())
+      addFlag(*VariableDIE, dwarf::DW_AT_external);
+
+    // Add line number info.
+    addSourceLine(*VariableDIE, GV);
+  }
+
+  if (!GV.isDefinition())
+    addFlag(*VariableDIE, dwarf::DW_AT_declaration);
+
+  // Add location.
+  bool addToAccelTable = false;
+  bool isGlobalVariable = GV.getGlobal() != nullptr;
+  if (isGlobalVariable) {
+    addToAccelTable = true;
+    DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+    const MCSymbol *Sym = Asm->getSymbol(GV.getGlobal());
+    if (GV.getGlobal()->isThreadLocal()) {
+      // FIXME: Make this work with -gsplit-dwarf.
+      unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+      assert((PointerSize == 4 || PointerSize == 8) &&
+             "Add support for other sizes if necessary");
+      // Based on GCC's support for TLS:
+      if (!DD->useSplitDwarf()) {
+        // 1) Start with a constNu of the appropriate pointer size
+        addUInt(*Loc, dwarf::DW_FORM_data1,
+                PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u);
+        // 2) containing the (relocated) offset of the TLS variable
+        //    within the module's TLS block.
+        addExpr(*Loc, dwarf::DW_FORM_udata,
+                Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
+      } else {
+        addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+        addUInt(*Loc, dwarf::DW_FORM_udata,
+                DD->getAddressPool().getIndex(Sym, /* TLS */ true));
+      }
+      // 3) followed by a custom OP to make the debugger do a TLS lookup.
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_push_tls_address);
+    } else {
+      DD->addArangeLabel(SymbolCU(this, Sym));
+      addOpAddress(*Loc, Sym);
+    }
+
+    addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
+    // Add the linkage name.
+    StringRef LinkageName = GV.getLinkageName();
+    if (!LinkageName.empty())
+      // From DWARF4: DIEs to which DW_AT_linkage_name may apply include:
+      // TAG_common_block, TAG_constant, TAG_entry_point, TAG_subprogram and
+      // TAG_variable.
+      addString(*VariableDIE,
+                DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
+                                           : dwarf::DW_AT_MIPS_linkage_name,
+                GlobalValue::getRealLinkageName(LinkageName));
+  } else if (const ConstantInt *CI =
+                 dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
+    addConstantValue(*VariableDIE, CI, GTy);
+  } else if (const ConstantExpr *CE = getMergedGlobalExpr(GV.getConstant())) {
+    addToAccelTable = true;
+    // GV is a merged global.
+    DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+    Value *Ptr = CE->getOperand(0);
+    MCSymbol *Sym = Asm->getSymbol(cast<GlobalValue>(Ptr));
+    DD->addArangeLabel(SymbolCU(this, Sym));
+    addOpAddress(*Loc, Sym);
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
+    addUInt(*Loc, dwarf::DW_FORM_udata,
+            Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
+  }
+
+  if (addToAccelTable) {
+    DD->addAccelName(GV.getName(), *VariableDIE);
+
+    // If the linkage name is different than the name, go ahead and output
+    // that as well into the name table.
+    if (GV.getLinkageName() != "" && GV.getName() != GV.getLinkageName())
+      DD->addAccelName(GV.getLinkageName(), *VariableDIE);
+  }
+
+  addGlobalName(GV.getName(), *VariableDIE, DeclContext);
+  return VariableDIE;
+}
+
+void DwarfCompileUnit::addRange(RangeSpan Range) {
+  bool SameAsPrevCU = this == DD->getPrevCU();
+  DD->setPrevCU(this);
+  // If we have no current ranges just add the range and return, otherwise,
+  // check the current section and CU against the previous section and CU we
+  // emitted into and the subprogram was contained within. If these are the
+  // same then extend our current range, otherwise add this as a new range.
+  if (CURanges.empty() || !SameAsPrevCU ||
+      (&CURanges.back().getEnd()->getSection() !=
+       &Range.getEnd()->getSection())) {
+    CURanges.push_back(Range);
+    return;
+  }
+
+  CURanges.back().setEnd(Range.getEnd());
+}
+
+void DwarfCompileUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
+                                       const MCSymbol *Label,
+                                       const MCSymbol *Sec) {
+  if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+    addLabel(Die, Attribute,
+             DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+                                        : dwarf::DW_FORM_data4,
+             Label);
+  else
+    addSectionDelta(Die, Attribute, Label, Sec);
+}
+
+void DwarfCompileUnit::initStmtList(MCSymbol *DwarfLineSectionSym) {
+  // Define start line table label for each Compile Unit.
+  MCSymbol *LineTableStartSym =
+      Asm->OutStreamer.getDwarfLineTableSymbol(getUniqueID());
+
+  stmtListIndex = UnitDie.getValues().size();
+
+  // DW_AT_stmt_list is an offset of line number information for this
+  // compile unit in debug_line section. For split dwarf this is
+  // left in the skeleton CU and so not included.
+  // The line table entries are not always emitted in assembly, so it
+  // is not okay to use line_table_start here.
+  addSectionLabel(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym,
+                  DwarfLineSectionSym);
+}
+
+void DwarfCompileUnit::applyStmtList(DIE &D) {
+  D.addValue(dwarf::DW_AT_stmt_list,
+             UnitDie.getAbbrev().getData()[stmtListIndex].getForm(),
+             UnitDie.getValues()[stmtListIndex]);
+}
+
+void DwarfCompileUnit::attachLowHighPC(DIE &D, const MCSymbol *Begin,
+                                       const MCSymbol *End) {
+  assert(Begin && "Begin label should not be null!");
+  assert(End && "End label should not be null!");
+  assert(Begin->isDefined() && "Invalid starting label");
+  assert(End->isDefined() && "Invalid end label");
+
+  addLabelAddress(D, dwarf::DW_AT_low_pc, Begin);
+  if (DD->getDwarfVersion() < 4)
+    addLabelAddress(D, dwarf::DW_AT_high_pc, End);
+  else
+    addLabelDelta(D, dwarf::DW_AT_high_pc, End, Begin);
+}
+
+// Find DIE for the given subprogram and attach appropriate DW_AT_low_pc
+// and DW_AT_high_pc attributes. If there are global variables in this
+// scope then create and insert DIEs for these variables.
+DIE &DwarfCompileUnit::updateSubprogramScopeDIE(DISubprogram SP) {
+  DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes());
+
+  attachLowHighPC(*SPDie, DD->getFunctionBeginSym(), DD->getFunctionEndSym());
+  if (!DD->getCurrentFunction()->getTarget().Options.DisableFramePointerElim(
+          *DD->getCurrentFunction()))
+    addFlag(*SPDie, dwarf::DW_AT_APPLE_omit_frame_ptr);
+
+  // Only include DW_AT_frame_base in full debug info
+  if (!includeMinimalInlineScopes()) {
+    const TargetRegisterInfo *RI =
+        Asm->TM.getSubtargetImpl()->getRegisterInfo();
+    MachineLocation Location(RI->getFrameRegister(*Asm->MF));
+    addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
+  }
+
+  // Add name to the name table, we do this here because we're guaranteed
+  // to have concrete versions of our DW_TAG_subprogram nodes.
+  DD->addSubprogramNames(SP, *SPDie);
+
+  return *SPDie;
+}
+
+// Construct a DIE for this scope.
+void DwarfCompileUnit::constructScopeDIE(
+    LexicalScope *Scope, SmallVectorImpl<std::unique_ptr<DIE>> &FinalChildren) {
+  if (!Scope || !Scope->getScopeNode())
+    return;
+
+  DIScope DS(Scope->getScopeNode());
+
+  assert((Scope->getInlinedAt() || !DS.isSubprogram()) &&
+         "Only handle inlined subprograms here, use "
+         "constructSubprogramScopeDIE for non-inlined "
+         "subprograms");
+
+  SmallVector<std::unique_ptr<DIE>, 8> Children;
+
+  // We try to create the scope DIE first, then the children DIEs. This will
+  // avoid creating un-used children then removing them later when we find out
+  // the scope DIE is null.
+  std::unique_ptr<DIE> ScopeDIE;
+  if (Scope->getParent() && DS.isSubprogram()) {
+    ScopeDIE = constructInlinedScopeDIE(Scope);
+    if (!ScopeDIE)
+      return;
+    // We create children when the scope DIE is not null.
+    createScopeChildrenDIE(Scope, Children);
+  } else {
+    // Early exit when we know the scope DIE is going to be null.
+    if (DD->isLexicalScopeDIENull(Scope))
+      return;
+
+    unsigned ChildScopeCount;
+
+    // We create children here when we know the scope DIE is not going to be
+    // null and the children will be added to the scope DIE.
+    createScopeChildrenDIE(Scope, Children, &ChildScopeCount);
+
+    // Skip imported directives in gmlt-like data.
+    if (!includeMinimalInlineScopes()) {
+      // There is no need to emit empty lexical block DIE.
+      for (const auto &E : DD->findImportedEntitiesForScope(DS))
+        Children.push_back(
+            constructImportedEntityDIE(DIImportedEntity(E.second)));
+    }
+
+    // If there are only other scopes as children, put them directly in the
+    // parent instead, as this scope would serve no purpose.
+    if (Children.size() == ChildScopeCount) {
+      FinalChildren.insert(FinalChildren.end(),
+                           std::make_move_iterator(Children.begin()),
+                           std::make_move_iterator(Children.end()));
+      return;
+    }
+    ScopeDIE = constructLexicalScopeDIE(Scope);
+    assert(ScopeDIE && "Scope DIE should not be null.");
+  }
+
+  // Add children
+  for (auto &I : Children)
+    ScopeDIE->addChild(std::move(I));
+
+  FinalChildren.push_back(std::move(ScopeDIE));
+}
+
+void DwarfCompileUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
+                                       const MCSymbol *Hi, const MCSymbol *Lo) {
+  DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
+  Die.addValue(Attribute, DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+                                                     : dwarf::DW_FORM_data4,
+               Value);
+}
+
+void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
+                                         SmallVector<RangeSpan, 2> Range) {
+  // Emit offset in .debug_ranges as a relocatable label. emitDIE will handle
+  // emitting it appropriately.
+  auto *RangeSectionSym = DD->getRangeSectionSym();
+
+  RangeSpanList List(
+      Asm->GetTempSymbol("debug_ranges", DD->getNextRangeNumber()),
+      std::move(Range));
+
+  // Under fission, ranges are specified by constant offsets relative to the
+  // CU's DW_AT_GNU_ranges_base.
+  if (isDwoUnit())
+    addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
+                    RangeSectionSym);
+  else
+    addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
+                    RangeSectionSym);
+
+  // Add the range list to the set of ranges to be emitted.
+  (Skeleton ? Skeleton : this)->CURangeLists.push_back(std::move(List));
+}
+
+void DwarfCompileUnit::attachRangesOrLowHighPC(
+    DIE &Die, SmallVector<RangeSpan, 2> Ranges) {
+  if (Ranges.size() == 1) {
+    const auto &single = Ranges.front();
+    attachLowHighPC(Die, single.getStart(), single.getEnd());
+  } else
+    addScopeRangeList(Die, std::move(Ranges));
+}
+
+void DwarfCompileUnit::attachRangesOrLowHighPC(
+    DIE &Die, const SmallVectorImpl<InsnRange> &Ranges) {
+  SmallVector<RangeSpan, 2> List;
+  List.reserve(Ranges.size());
+  for (const InsnRange &R : Ranges)
+    List.push_back(RangeSpan(DD->getLabelBeforeInsn(R.first),
+                             DD->getLabelAfterInsn(R.second)));
+  attachRangesOrLowHighPC(Die, std::move(List));
+}
+
+// This scope represents inlined body of a function. Construct DIE to
+// represent this concrete inlined copy of the function.
+std::unique_ptr<DIE>
+DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) {
+  assert(Scope->getScopeNode());
+  DIScope DS(Scope->getScopeNode());
+  DISubprogram InlinedSP = getDISubprogram(DS);
+  // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
+  // was inlined from another compile unit.
+  DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP];
+  assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
+
+  auto ScopeDIE = make_unique<DIE>(dwarf::DW_TAG_inlined_subroutine);
+  addDIEEntry(*ScopeDIE, dwarf::DW_AT_abstract_origin, *OriginDIE);
+
+  attachRangesOrLowHighPC(*ScopeDIE, Scope->getRanges());
+
+  // Add the call site information to the DIE.
+  DILocation DL(Scope->getInlinedAt());
+  addUInt(*ScopeDIE, dwarf::DW_AT_call_file, None,
+          getOrCreateSourceID(DL.getFilename(), DL.getDirectory()));
+  addUInt(*ScopeDIE, dwarf::DW_AT_call_line, None, DL.getLineNumber());
+
+  // Add name to the name table, we do this here because we're guaranteed
+  // to have concrete versions of our DW_TAG_inlined_subroutine nodes.
+  DD->addSubprogramNames(InlinedSP, *ScopeDIE);
+
+  return ScopeDIE;
+}
+
+// Construct new DW_TAG_lexical_block for this scope and attach
+// DW_AT_low_pc/DW_AT_high_pc labels.
+std::unique_ptr<DIE>
+DwarfCompileUnit::constructLexicalScopeDIE(LexicalScope *Scope) {
+  if (DD->isLexicalScopeDIENull(Scope))
+    return nullptr;
+
+  auto ScopeDIE = make_unique<DIE>(dwarf::DW_TAG_lexical_block);
+  if (Scope->isAbstractScope())
+    return ScopeDIE;
+
+  attachRangesOrLowHighPC(*ScopeDIE, Scope->getRanges());
+
+  return ScopeDIE;
+}
+
+/// constructVariableDIE - Construct a DIE for the given DbgVariable.
+std::unique_ptr<DIE> DwarfCompileUnit::constructVariableDIE(DbgVariable &DV,
+                                                            bool Abstract) {
+  auto D = constructVariableDIEImpl(DV, Abstract);
+  DV.setDIE(*D);
+  return D;
+}
+
+std::unique_ptr<DIE>
+DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
+                                           bool Abstract) {
+  // Create the DIE for this variable, tagged with the variable's own tag.
+  auto VariableDie = make_unique<DIE>(DV.getTag());
+
+  if (Abstract) { // abstract DIEs get the descriptive attributes, no location
+    applyVariableAttributes(DV, *VariableDie);
+    return VariableDie;
+  }
+
+  // Add variable address. An offset other than ~0U becomes a location list.
+
+  unsigned Offset = DV.getDotDebugLocOffset();
+  if (Offset != ~0U) {
+    addLocationList(*VariableDie, dwarf::DW_AT_location, Offset);
+    return VariableDie;
+  }
+
+  // Check if variable is described by a DBG_VALUE instruction.
+  if (const MachineInstr *DVInsn = DV.getMInsn()) {
+    assert(DVInsn->getNumOperands() == 4);
+    if (DVInsn->getOperand(0).isReg()) {
+      const MachineOperand RegOp = DVInsn->getOperand(0);
+      // If the second operand is an immediate, this is an indirect value.
+      if (DVInsn->getOperand(1).isImm()) {
+        MachineLocation Location(RegOp.getReg(),
+                                 DVInsn->getOperand(1).getImm());
+        addVariableAddress(DV, *VariableDie, Location);
+      } else if (RegOp.getReg()) // register 0 gets no location attribute
+        addVariableAddress(DV, *VariableDie, MachineLocation(RegOp.getReg()));
+    } else if (DVInsn->getOperand(0).isImm()) // integer immediate
+      addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType());
+    else if (DVInsn->getOperand(0).isFPImm()) // floating-point immediate
+      addConstantFPValue(*VariableDie, DVInsn->getOperand(0));
+    else if (DVInsn->getOperand(0).isCImm()) // ConstantInt operand
+      addConstantValue(*VariableDie, DVInsn->getOperand(0).getCImm(),
+                       DV.getType());
+
+    return VariableDie;
+  }
+
+  // Otherwise describe the variable by its frame index, unless that is ~0.
+  int FI = DV.getFrameIndex();
+  if (FI != ~0) {
+    unsigned FrameReg = 0; // passed by name to getFrameIndexReference below
+    const TargetFrameLowering *TFI =
+        Asm->TM.getSubtargetImpl()->getFrameLowering();
+    int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
+    MachineLocation Location(FrameReg, Offset);
+    addVariableAddress(DV, *VariableDie, Location);
+  }
+
+  return VariableDie;
+}
+
+std::unique_ptr<DIE> DwarfCompileUnit::constructVariableDIE(
+    DbgVariable &DV, const LexicalScope &Scope, DIE *&ObjectPointer) {
+  auto Var = constructVariableDIE(DV, Scope.isAbstractScope());
+  if (DV.isObjectPointer()) // report the object-pointer variable to the caller
+    ObjectPointer = Var.get();
+  return Var;
+}
+
+DIE *DwarfCompileUnit::createScopeChildrenDIE(
+    LexicalScope *Scope, SmallVectorImpl<std::unique_ptr<DIE>> &Children,
+    unsigned *ChildScopeCount) {
+  DIE *ObjectPointer = nullptr; // stays null unless a variable below sets it
+
+  for (DbgVariable *DV : DU->getScopeVariables().lookup(Scope)) // variables
+    Children.push_back(constructVariableDIE(*DV, *Scope, ObjectPointer));
+
+  unsigned ChildCountWithoutScopes = Children.size(); // size before scopes
+
+  for (LexicalScope *LS : Scope->getChildren()) // then the nested scopes
+    constructScopeDIE(LS, Children);
+
+  if (ChildScopeCount) // optional out-parameter: entries added by the scopes
+    *ChildScopeCount = Children.size() - ChildCountWithoutScopes;
+
+  return ObjectPointer;
+}
+
+void DwarfCompileUnit::constructSubprogramScopeDIE(LexicalScope *Scope) {
+  assert(Scope && Scope->getScopeNode());
+  assert(!Scope->getInlinedAt());
+  assert(!Scope->isAbstractScope());
+  DISubprogram Sub(Scope->getScopeNode());
+
+  assert(Sub.isSubprogram());
+
+  DD->getProcessedSPNodes().insert(Sub);
+
+  DIE &ScopeDIE = updateSubprogramScopeDIE(Sub);
+
+  // Fetch the function's type array; it is used below to detect variadics.
+  DITypeArray FnArgs = Sub.getType().getTypeArray();
+
+  // Collect lexical scope children first.
+  // ObjectPointer might be a local (non-argument) variable if it's a
+  // block's synthetic this pointer.
+  if (DIE *ObjectPointer = createAndAddScopeChildren(Scope, ScopeDIE))
+    addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, *ObjectPointer);
+
+  // If we have a single element of null, it is a function that returns void.
+  // If we have more than one element and the last one is null, it is a
+  // variadic function: add an unspecified-parameters child (not in minimal).
+  if (FnArgs.getNumElements() > 1 &&
+      !FnArgs.getElement(FnArgs.getNumElements() - 1) &&
+      !includeMinimalInlineScopes())
+    ScopeDIE.addChild(make_unique<DIE>(dwarf::DW_TAG_unspecified_parameters));
+}
+
+DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
+                                                 DIE &ScopeDIE) {
+  // Build the children into a temporary list, then attach them to ScopeDIE.
+  SmallVector<std::unique_ptr<DIE>, 8> Children;
+  DIE *ObjectPointer = createScopeChildrenDIE(Scope, Children);
+
+  // Add children, transferring ownership of each DIE to ScopeDIE.
+  for (auto &I : Children)
+    ScopeDIE.addChild(std::move(I));
+
+  return ObjectPointer;
+}
+
+void
+DwarfCompileUnit::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) {
+  DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()];
+  if (AbsDef) // an abstract definition was already built for this node
+    return;
+
+  DISubprogram SP(Scope->getScopeNode());
+
+  DIE *ContextDIE; // assigned on every branch of the chain below
+
+  if (includeMinimalInlineScopes())
+    ContextDIE = &getUnitDie();
+  // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with
+  // the important distinction that the DIDescriptor is not associated with the
+  // DIE (since the DIDescriptor will be associated with the concrete DIE, if
+  // any). It could be refactored to some common utility function.
+  else if (DISubprogram SPDecl = SP.getFunctionDeclaration()) {
+    ContextDIE = &getUnitDie();
+    getOrCreateSubprogramDIE(SPDecl); // result unused; called for its effect
+  } else
+    ContextDIE = getOrCreateContextDIE(resolve(SP.getContext()));
+
+  // Passing null as the associated DIDescriptor because the abstract definition
+  // shouldn't be found by lookup.
+  AbsDef =
+      &createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, DIDescriptor());
+  applySubprogramAttributesToDefinition(SP, *AbsDef);
+
+  if (!includeMinimalInlineScopes()) // DW_AT_inline is skipped in minimal mode
+    addUInt(*AbsDef, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
+  if (DIE *ObjectPointer = createAndAddScopeChildren(Scope, *AbsDef))
+    addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer);
+}
+
+std::unique_ptr<DIE>
+DwarfCompileUnit::constructImportedEntityDIE(const DIImportedEntity &Module) {
+  assert(Module.Verify() &&
+         "Use one of the MDNode * overloads to handle invalid metadata");
+  std::unique_ptr<DIE> IMDie = make_unique<DIE>((dwarf::Tag)Module.getTag());
+  insertDIE(Module, IMDie.get()); // register the DIE under its descriptor
+  DIE *EntityDie; // the DIE of the imported entity; set by the chain below
+  DIDescriptor Entity = resolve(Module.getEntity());
+  if (Entity.isNameSpace())
+    EntityDie = getOrCreateNameSpace(DINameSpace(Entity));
+  else if (Entity.isSubprogram())
+    EntityDie = getOrCreateSubprogramDIE(DISubprogram(Entity));
+  else if (Entity.isType())
+    EntityDie = getOrCreateTypeDIE(DIType(Entity));
+  else if (Entity.isGlobalVariable())
+    EntityDie = getOrCreateGlobalVariableDIE(DIGlobalVariable(Entity));
+  else // any other kind must already have a DIE (asserted just below)
+    EntityDie = getDIE(Entity);
+  assert(EntityDie);
+  addSourceLine(*IMDie, Module.getLineNumber(),
+                Module.getContext().getFilename(),
+                Module.getContext().getDirectory());
+  addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie);
+  StringRef Name = Module.getName();
+  if (!Name.empty()) // DW_AT_name is only emitted for a non-empty name
+    addString(*IMDie, dwarf::DW_AT_name, Name);
+
+  return IMDie;
+}
+
+void DwarfCompileUnit::finishSubprogramDefinition(DISubprogram SP) {
+  DIE *D = getDIE(SP); // may be null if no DIE was created for SP so far
+  if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) {
+    if (D)
+      // If this subprogram has an abstract definition, reference that
+      addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
+  } else {
+    if (!D && !includeMinimalInlineScopes())
+      // Lazily construct the subprogram if we didn't see either concrete or
+      // inlined versions during codegen (except in -gmlt, where we want
+      // to omit these entirely).
+      D = getOrCreateSubprogramDIE(SP);
+    if (D)
+      // And attach the attributes
+      applySubprogramAttributesToDefinition(SP, *D);
+  }
+}
+void DwarfCompileUnit::collectDeadVariables(DISubprogram SP) {
+  assert(SP.isSubprogram() && "CU's subprogram list contains a non-subprogram");
+  assert(SP.isDefinition() &&
+         "CU's subprogram list contains a subprogram declaration");
+  DIArray Variables = SP.getVariables();
+  if (Variables.getNumElements() == 0) // nothing to add for this subprogram
+    return;
+
+  DIE *SPDIE = DU->getAbstractSPDies().lookup(SP); // prefer the abstract DIE
+  if (!SPDIE)
+    SPDIE = getDIE(SP); // otherwise fall back to the DIE registered for SP
+  assert(SPDIE);
+  for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
+    DIVariable DV(Variables.getElement(vi));
+    assert(DV.isVariable());
+    DbgVariable NewVar(DV, DIExpression(nullptr), DD); // no expression attached
+    auto VariableDie = constructVariableDIE(NewVar);
+    applyVariableAttributes(NewVar, *VariableDie);
+    SPDIE->addChild(std::move(VariableDie)); // SPDIE takes ownership
+  }
+}
+
+void DwarfCompileUnit::emitHeader(const MCSymbol *ASectionSym) const {
+  // Don't bother labeling the .dwo unit, as its offset isn't used.
+  if (!Skeleton) // a unit that has a skeleton skips the begin label
+    Asm->OutStreamer.EmitLabel(LabelBegin);
+
+  DwarfUnit::emitHeader(ASectionSym); // the base class emits the header itself
+}
+
+/// addGlobalName - Add a new global name to the compile unit.
+void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die,
+                                     DIScope Context) {
+  if (includeMinimalInlineScopes()) // nothing is recorded in minimal mode
+    return;
+  std::string FullName = getParentContextString(Context) + Name.str();
+  GlobalNames[FullName] = &Die; // keyed by the context-prefixed name
+}
+
+/// Add a new global type to the unit.
+void DwarfCompileUnit::addGlobalType(DIType Ty, const DIE &Die,
+                                     DIScope Context) {
+  if (includeMinimalInlineScopes()) // nothing is recorded in minimal mode
+    return;
+  std::string FullName = getParentContextString(Context) + Ty.getName().str();
+  GlobalTypes[FullName] = &Die; // keyed by the context-prefixed type name
+}
+
+/// addVariableAddress - Add DW_AT_location attribute for a
+/// DbgVariable based on provided MachineLocation.
+void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
+                                          MachineLocation Location) {
+  if (DV.variableHasComplexAddress()) // complex address takes precedence
+    addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
+  else if (DV.isBlockByrefVariable())
+    addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
+  else // plain location; indirection comes from the variable descriptor
+    addAddress(Die, dwarf::DW_AT_location, Location,
+               DV.getVariable().isIndirect());
+}
+
+/// Add an address attribute to a die based on the location provided.
+void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
+                                  const MachineLocation &Location,
+                                  bool Indirect) {
+  DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+
+  if (Location.isReg() && !Indirect) // direct register location
+    addRegisterOpPiece(*Loc, Location.getReg());
+  else {
+    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+    if (Indirect && !Location.isReg()) { // only non-register gets a deref
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    }
+  }
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, Loc);
+}
+
+/// Start with the address based on the location provided, and generate the
+/// DWARF information necessary to find the actual variable given the extra
+/// address elements carried by the DbgVariable, starting from the starting
+/// location.  Add the DWARF information to the die.
+void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
+                                         dwarf::Attribute Attribute,
+                                         const MachineLocation &Location) {
+  DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+  unsigned N = DV.getNumAddrElements();
+  unsigned i = 0; // index of the first element left for the loop below
+  if (Location.isReg()) {
+    if (N >= 2 && DV.getAddrElement(0) == dwarf::DW_OP_plus) {
+      assert(!DV.getVariable().isIndirect() &&
+             "double indirection not handled");
+      // If first address element is OpPlus then emit
+      // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
+      addRegisterOffset(*Loc, Location.getReg(), DV.getAddrElement(1));
+      i = 2; // opcode and its operand are consumed
+    } else if (N >= 2 && DV.getAddrElement(0) == dwarf::DW_OP_deref) {
+      assert(!DV.getVariable().isIndirect() &&
+             "double indirection not handled");
+      addRegisterOpPiece(*Loc, Location.getReg(),
+                         DV.getExpression().getPieceSize(),
+                         DV.getExpression().getPieceOffset());
+      i = 3; // NOTE(review): tests deref but skips a piece-sized triple; confirm
+    } else
+      addRegisterOpPiece(*Loc, Location.getReg());
+  } else
+    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+
+  for (; i < N; ++i) { // translate each remaining address element
+    uint64_t Element = DV.getAddrElement(i);
+    if (Element == dwarf::DW_OP_plus) {
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+      addUInt(*Loc, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
+
+    } else if (Element == dwarf::DW_OP_deref) {
+      if (!Location.isReg()) // for a register location the deref is dropped
+        addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+
+    } else if (Element == dwarf::DW_OP_piece) {
+      const unsigned SizeOfByte = 8;
+      unsigned PieceOffsetInBits = DV.getAddrElement(++i) * SizeOfByte;
+      unsigned PieceSizeInBits = DV.getAddrElement(++i) * SizeOfByte;
+      // Emit DW_OP_bit_piece Size Offset.
+      assert(PieceSizeInBits > 0 && "piece has zero size");
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_bit_piece);
+      addUInt(*Loc, dwarf::DW_FORM_udata, PieceSizeInBits);
+      addUInt(*Loc, dwarf::DW_FORM_udata, PieceOffsetInBits);
+    } else
+      llvm_unreachable("unknown DIBuilder Opcode");
+  }
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, Loc);
+}
+
+/// Add a Dwarf loclistptr attribute data and value.
+void DwarfCompileUnit::addLocationList(DIE &Die, dwarf::Attribute Attribute,
+                                       unsigned Index) {
+  DIEValue *Value = new (DIEValueAllocator) DIELocList(Index);
+  dwarf::Form Form = DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+                                                : dwarf::DW_FORM_data4;
+  Die.addValue(Attribute, Form, Value); // form depends on the DWARF version
+}
+
+void DwarfCompileUnit::applyVariableAttributes(const DbgVariable &Var,
+                                               DIE &VariableDie) {
+  StringRef Name = Var.getName();
+  if (!Name.empty()) // DW_AT_name is only emitted for a non-empty name
+    addString(VariableDie, dwarf::DW_AT_name, Name);
+  addSourceLine(VariableDie, Var.getVariable());
+  addType(VariableDie, Var.getType());
+  if (Var.isArtificial()) // flag artificial variables as such
+    addFlag(VariableDie, dwarf::DW_AT_artificial);
+}
+
+/// Add a Dwarf expression attribute data and value.
+void DwarfCompileUnit::addExpr(DIELoc &Die, dwarf::Form Form,
+                               const MCExpr *Expr) {
+  DIEValue *Value = new (DIEValueAllocator) DIEExpr(Expr);
+  Die.addValue((dwarf::Attribute)0, Form, Value); // attribute passed as 0
+}
+
+void DwarfCompileUnit::applySubprogramAttributesToDefinition(DISubprogram SP,
+                                                             DIE &SPDie) {
+  DISubprogram SPDecl = SP.getFunctionDeclaration();
+  DIScope Context = resolve(SPDecl ? SPDecl.getContext() : SP.getContext());
+  applySubprogramAttributes(SP, SPDie, includeMinimalInlineScopes());
+  addGlobalName(SP.getName(), SPDie, Context); // declaration's context if any
+}
+
+bool DwarfCompileUnit::isDwoUnit() const {
+  return DD->useSplitDwarf() && Skeleton; // split DWARF and a skeleton is set
+}
+
+bool DwarfCompileUnit::includeMinimalInlineScopes() const {
+  return getCUNode().getEmissionKind() == DIBuilder::LineTablesOnly ||
+         (DD->useSplitDwarf() && !Skeleton); // or: split DWARF, no skeleton set
+}
+} // end llvm namespace

diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
new file mode 100644
index 0000000..e521f39
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h

@@ -0,0 +1,250 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+
+#include "DwarfUnit.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/DebugInfo.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class DIE;
+class DwarfDebug;
+class DwarfFile;
+class MCSymbol;
+class LexicalScope;
+
+class DwarfCompileUnit : public DwarfUnit {
+  /// The attribute index of DW_AT_stmt_list in the compile unit DIE, avoiding
+  /// the need to search for it in applyStmtList.
+  unsigned stmtListIndex;
+
+  /// Skeleton unit associated with this unit.
+  DwarfCompileUnit *Skeleton;
+
+  /// A label at the start of the non-dwo section related to this unit.
+  MCSymbol *SectionSym;
+
+  /// The start of the unit within its section.
+  MCSymbol *LabelBegin;
+
+  /// GlobalNames - A map of globally visible named entities for this unit.
+  StringMap<const DIE *> GlobalNames;
+
+  /// GlobalTypes - A map of globally visible types for this unit.
+  StringMap<const DIE *> GlobalTypes;
+
+  // List of range lists for a given compile unit, separate from the ranges for
+  // the CU itself.
+  SmallVector<RangeSpanList, 1> CURangeLists;
+
+  // List of ranges for a given compile unit.
+  SmallVector<RangeSpan, 2> CURanges;
+
+  // The base address of this unit, if any. Used for relative references in
+  // ranges/locs.
+  const MCSymbol *BaseAddress;
+
+  /// \brief Construct a DIE for the given DbgVariable without initializing the
+  /// DbgVariable's DIE reference.
+  std::unique_ptr<DIE> constructVariableDIEImpl(const DbgVariable &DV,
+                                                bool Abstract);
+
+  bool isDwoUnit() const override;
+
+  bool includeMinimalInlineScopes() const;
+
+public:
+  DwarfCompileUnit(unsigned UID, DICompileUnit Node, AsmPrinter *A,
+                   DwarfDebug *DW, DwarfFile *DWU);
+
+  DwarfCompileUnit *getSkeleton() const { // the stored skeleton pointer
+    return Skeleton;
+  }
+
+  void initStmtList(MCSymbol *DwarfLineSectionSym);
+
+  /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE.
+  void applyStmtList(DIE &D);
+
+  /// getOrCreateGlobalVariableDIE - get or create global variable DIE.
+  DIE *getOrCreateGlobalVariableDIE(DIGlobalVariable GV);
+
+  /// addLabelAddress - Add a dwarf label attribute data and value using
+  /// either DW_FORM_addr or DW_FORM_GNU_addr_index.
+  void addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
+                       const MCSymbol *Label);
+
+  /// addLocalLabelAddress - Add a dwarf label attribute data and value using
+  /// DW_FORM_addr only.
+  void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute,
+                            const MCSymbol *Label);
+
+  /// addSectionDelta - Add a label delta attribute data and value.
+  void addSectionDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi,
+                       const MCSymbol *Lo);
+
+  DwarfCompileUnit &getCU() override { return *this; }
+
+  unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override;
+
+  /// addRange - Add an address range to the list of ranges for this unit.
+  void addRange(RangeSpan Range);
+
+  void attachLowHighPC(DIE &D, const MCSymbol *Begin, const MCSymbol *End);
+
+  /// addSectionLabel - Add a Dwarf section label attribute data and value.
+  ///
+  void addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
+                       const MCSymbol *Label, const MCSymbol *Sec);
+
+  /// \brief Find DIE for the given subprogram and attach appropriate
+  /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
+  /// variables in this scope then create and insert DIEs for these
+  /// variables.
+  DIE &updateSubprogramScopeDIE(DISubprogram SP);
+
+  void constructScopeDIE(LexicalScope *Scope,
+                         SmallVectorImpl<std::unique_ptr<DIE>> &FinalChildren);
+
+  /// \brief A helper function to construct a RangeSpanList for a given
+  /// lexical scope.
+  void addScopeRangeList(DIE &ScopeDIE, SmallVector<RangeSpan, 2> Range);
+
+  void attachRangesOrLowHighPC(DIE &D, SmallVector<RangeSpan, 2> Ranges);
+
+  void attachRangesOrLowHighPC(DIE &D,
+                               const SmallVectorImpl<InsnRange> &Ranges);
+  /// \brief This scope represents inlined body of a function. Construct
+  /// DIE to represent this concrete inlined copy of the function.
+  std::unique_ptr<DIE> constructInlinedScopeDIE(LexicalScope *Scope);
+
+  /// \brief Construct new DW_TAG_lexical_block for this scope and
+  /// attach DW_AT_low_pc/DW_AT_high_pc labels.
+  std::unique_ptr<DIE> constructLexicalScopeDIE(LexicalScope *Scope);
+
+  /// constructVariableDIE - Construct a DIE for DV and set it as DV's DIE.
+  std::unique_ptr<DIE> constructVariableDIE(DbgVariable &DV,
+                                            bool Abstract = false);
+
+  std::unique_ptr<DIE> constructVariableDIE(DbgVariable &DV,
+                                            const LexicalScope &Scope,
+                                            DIE *&ObjectPointer);
+
+  /// A helper function to create children of a Scope DIE.
+  DIE *createScopeChildrenDIE(LexicalScope *Scope,
+                              SmallVectorImpl<std::unique_ptr<DIE>> &Children,
+                              unsigned *ChildScopeCount = nullptr);
+
+  /// \brief Construct a DIE for this subprogram scope.
+  void constructSubprogramScopeDIE(LexicalScope *Scope);
+
+  DIE *createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE);
+
+  void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+
+  /// \brief Construct the DIE for an imported entity (tag taken from Module).
+  std::unique_ptr<DIE>
+  constructImportedEntityDIE(const DIImportedEntity &Module);
+
+  void finishSubprogramDefinition(DISubprogram SP);
+
+  void collectDeadVariables(DISubprogram SP);
+
+  /// Set the skeleton unit associated with this unit.
+  void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; }
+
+  MCSymbol *getSectionSym() const {
+    assert(Section); // presumably set via DwarfUnit::initSection; confirm
+    return SectionSym;
+  }
+
+  /// Pass in the SectionSym even though we could recreate it in every compile
+  /// unit (type units will have actually distinct symbols once they're in
+  /// comdat sections).
+  void initSection(const MCSection *Section, MCSymbol *SectionSym) {
+    DwarfUnit::initSection(Section);
+    this->SectionSym = SectionSym;
+
+    // Don't bother labeling the .dwo unit, as its offset isn't used.
+    if (!Skeleton)
+      LabelBegin =
+          Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID());
+  }
+
+  unsigned getLength() {
+    return sizeof(uint32_t) + // Length field
+        getHeaderSize() + UnitDie.getSize();
+  }
+
+  void emitHeader(const MCSymbol *ASectionSym) const override;
+
+  MCSymbol *getLabelBegin() const {
+    assert(Section); // presumably set via DwarfUnit::initSection; confirm
+    return LabelBegin;
+  }
+
+  /// Add a new global name to the compile unit.
+  void addGlobalName(StringRef Name, DIE &Die, DIScope Context) override;
+
+  /// Add a new global type to the compile unit.
+  void addGlobalType(DIType Ty, const DIE &Die, DIScope Context) override;
+
+  const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; }
+  const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; }
+
+  /// Add DW_AT_location attribute for a DbgVariable based on provided
+  /// MachineLocation.
+  void addVariableAddress(const DbgVariable &DV, DIE &Die,
+                          MachineLocation Location);
+  /// Add an address attribute to a die based on the location provided.
+  void addAddress(DIE &Die, dwarf::Attribute Attribute,
+                  const MachineLocation &Location, bool Indirect = false);
+
+  /// Start with the address based on the location provided, and generate the
+  /// DWARF information necessary to find the actual variable (navigating the
+  /// extra address elements carried by the DbgVariable) based on the starting
+  /// location.  Add the DWARF information to the die.
+  void addComplexAddress(const DbgVariable &DV, DIE &Die,
+                         dwarf::Attribute Attribute,
+                         const MachineLocation &Location);
+
+  /// Add a Dwarf loclistptr attribute data and value.
+  void addLocationList(DIE &Die, dwarf::Attribute Attribute, unsigned Index);
+  void applyVariableAttributes(const DbgVariable &Var, DIE &VariableDie);
+
+  /// Add a Dwarf expression attribute data and value.
+  void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr);
+
+  void applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie);
+
+  /// getRangeLists - Get the range lists, from the skeleton unit if one is set.
+  const SmallVectorImpl<RangeSpanList> &getRangeLists() const {
+    return (Skeleton ? Skeleton : this)->CURangeLists;
+  }
+
+  /// getRanges - Get the list of ranges for this unit.
+  const SmallVectorImpl<RangeSpan> &getRanges() const { return CURanges; }
+  SmallVector<RangeSpan, 2> takeRanges() { return std::move(CURanges); }
+
+  void setBaseAddress(const MCSymbol *Base) { BaseAddress = Base; }
+  const MCSymbol *getBaseAddress() const { return BaseAddress; }
+};
+
+} // end llvm namespace
+
+#endif

diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 77860c0..230ea46 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp

@@ -11,8 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ByteStreamer.h"
 #include "DwarfDebug.h"
+
+#include "ByteStreamer.h"
+#include "DwarfCompileUnit.h"
 #include "DIE.h"
 #include "DIEHash.h"
 #include "DwarfUnit.h"
@@ -36,6 +38,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/LEB128.h"
@@ -47,6 +50,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "dwarfdebug"
@@ -149,7 +153,7 @@
     if (tag == dwarf::DW_TAG_pointer_type)
       subType = resolve(DIDerivedType(Ty).getTypeDerivedFrom());
 
-    DIArray Elements = DICompositeType(subType).getTypeArray();
+    DIArray Elements = DICompositeType(subType).getElements();
     for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
       DIDerivedType DT(Elements.getElement(i));
       if (getName() == DT.getName())
@@ -165,10 +169,11 @@
     DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags, dwarf::DW_FORM_data1)};
 
 DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
-    : Asm(A), MMI(Asm->MMI), FirstCU(nullptr), PrevLabel(nullptr),
-      GlobalRangeCount(0), InfoHolder(A, "info_string", DIEValueAllocator),
+    : Asm(A), MMI(Asm->MMI), PrevLabel(nullptr), GlobalRangeCount(0),
+      InfoHolder(A, *this, "info_string", DIEValueAllocator),
       UsedNonDefaultText(false),
-      SkeletonHolder(A, "skel_string", DIEValueAllocator),
+      SkeletonHolder(A, *this, "skel_string", DIEValueAllocator),
+      IsDarwin(Triple(A->getTargetTriple()).isOSDarwin()),
       AccelNames(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
                                        dwarf::DW_FORM_data4)),
       AccelObjC(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
@@ -188,8 +193,6 @@
 
   // Turn on accelerator tables for Darwin by default, pubnames by
   // default for non-Darwin, and handle split dwarf.
-  bool IsDarwin = Triple(A->getTargetTriple()).isOSDarwin();
-
   if (DwarfAccelTables == Default)
     HasDwarfAccelTables = IsDarwin;
   else
@@ -308,26 +311,6 @@
   return false;
 }
 
-// Find DIE for the given subprogram and attach appropriate DW_AT_low_pc
-// and DW_AT_high_pc attributes. If there are global variables in this
-// scope then create and insert DIEs for these variables.
-DIE &DwarfDebug::updateSubprogramScopeDIE(DwarfCompileUnit &SPCU,
-                                          DISubprogram SP) {
-  DIE *SPDie = SPCU.getOrCreateSubprogramDIE(SP);
-
-  attachLowHighPC(SPCU, *SPDie, FunctionBeginSym, FunctionEndSym);
-
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  MachineLocation Location(RI->getFrameRegister(*Asm->MF));
-  SPCU.addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
-
-  // Add name to the name table, we do this here because we're guaranteed
-  // to have concrete versions of our DW_TAG_subprogram nodes.
-  addSubprogramNames(SP, *SPDie);
-
-  return *SPDie;
-}
-
 /// Check whether we should create a DIE for the given Scope, return true
 /// if we don't create a DIE (the corresponding DIE is null).
 bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) {
@@ -344,271 +327,30 @@
 
   // We don't create a DIE if we have a single Range and the end label
   // is null.
-  SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin();
-  MCSymbol *End = getLabelAfterInsn(RI->second);
-  return !End;
+  return !getLabelAfterInsn(Ranges.front().second);
 }
 
-static void addSectionLabel(AsmPrinter &Asm, DwarfUnit &U, DIE &D,
-                            dwarf::Attribute A, const MCSymbol *L,
-                            const MCSymbol *Sec) {
-  if (Asm.MAI->doesDwarfUseRelocationsAcrossSections())
-    U.addSectionLabel(D, A, L);
-  else
-    U.addSectionDelta(D, A, L, Sec);
+template <typename Func> void forBothCUs(DwarfCompileUnit &CU, Func F) {
+  F(CU);
+  if (auto *SkelCU = CU.getSkeleton())
+    F(*SkelCU);
 }
 
-void DwarfDebug::addScopeRangeList(DwarfCompileUnit &TheCU, DIE &ScopeDIE,
-                                   const SmallVectorImpl<InsnRange> &Range) {
-  // Emit offset in .debug_range as a relocatable label. emitDIE will handle
-  // emitting it appropriately.
-  MCSymbol *RangeSym = Asm->GetTempSymbol("debug_ranges", GlobalRangeCount++);
-
-  // Under fission, ranges are specified by constant offsets relative to the
-  // CU's DW_AT_GNU_ranges_base.
-  if (useSplitDwarf())
-    TheCU.addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, RangeSym,
-                          DwarfDebugRangeSectionSym);
-  else
-    addSectionLabel(*Asm, TheCU, ScopeDIE, dwarf::DW_AT_ranges, RangeSym,
-                    DwarfDebugRangeSectionSym);
-
-  RangeSpanList List(RangeSym);
-  for (const InsnRange &R : Range) {
-    RangeSpan Span(getLabelBeforeInsn(R.first), getLabelAfterInsn(R.second));
-    List.addRange(std::move(Span));
-  }
-
-  // Add the range list to the set of ranges to be emitted.
-  TheCU.addRangeList(std::move(List));
-}
-
-void DwarfDebug::attachRangesOrLowHighPC(DwarfCompileUnit &TheCU, DIE &Die,
-                                    const SmallVectorImpl<InsnRange> &Ranges) {
-  assert(!Ranges.empty());
-  if (Ranges.size() == 1)
-    attachLowHighPC(TheCU, Die, getLabelBeforeInsn(Ranges.front().first),
-                    getLabelAfterInsn(Ranges.front().second));
-  else
-    addScopeRangeList(TheCU, Die, Ranges);
-}
-
-// Construct new DW_TAG_lexical_block for this scope and attach
-// DW_AT_low_pc/DW_AT_high_pc labels.
-std::unique_ptr<DIE>
-DwarfDebug::constructLexicalScopeDIE(DwarfCompileUnit &TheCU,
-                                     LexicalScope *Scope) {
-  if (isLexicalScopeDIENull(Scope))
-    return nullptr;
-
-  auto ScopeDIE = make_unique<DIE>(dwarf::DW_TAG_lexical_block);
-  if (Scope->isAbstractScope())
-    return ScopeDIE;
-
-  attachRangesOrLowHighPC(TheCU, *ScopeDIE, Scope->getRanges());
-
-  return ScopeDIE;
-}
-
-// This scope represents inlined body of a function. Construct DIE to
-// represent this concrete inlined copy of the function.
-std::unique_ptr<DIE>
-DwarfDebug::constructInlinedScopeDIE(DwarfCompileUnit &TheCU,
-                                     LexicalScope *Scope) {
-  assert(Scope->getScopeNode());
-  DIScope DS(Scope->getScopeNode());
-  DISubprogram InlinedSP = getDISubprogram(DS);
-  // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
-  // was inlined from another compile unit.
-  DIE *OriginDIE = AbstractSPDies[InlinedSP];
-  assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
-
-  auto ScopeDIE = make_unique<DIE>(dwarf::DW_TAG_inlined_subroutine);
-  TheCU.addDIEEntry(*ScopeDIE, dwarf::DW_AT_abstract_origin, *OriginDIE);
-
-  attachRangesOrLowHighPC(TheCU, *ScopeDIE, Scope->getRanges());
-
-  InlinedSubprogramDIEs.insert(OriginDIE);
-
-  // Add the call site information to the DIE.
-  DILocation DL(Scope->getInlinedAt());
-  TheCU.addUInt(*ScopeDIE, dwarf::DW_AT_call_file, None,
-                TheCU.getOrCreateSourceID(DL.getFilename(), DL.getDirectory()));
-  TheCU.addUInt(*ScopeDIE, dwarf::DW_AT_call_line, None, DL.getLineNumber());
-
-  // Add name to the name table, we do this here because we're guaranteed
-  // to have concrete versions of our DW_TAG_inlined_subprogram nodes.
-  addSubprogramNames(InlinedSP, *ScopeDIE);
-
-  return ScopeDIE;
-}
-
-static std::unique_ptr<DIE> constructVariableDIE(DwarfCompileUnit &TheCU,
-                                                 DbgVariable &DV,
-                                                 const LexicalScope &Scope,
-                                                 DIE *&ObjectPointer) {
-  auto Var = TheCU.constructVariableDIE(DV, Scope.isAbstractScope());
-  if (DV.isObjectPointer())
-    ObjectPointer = Var.get();
-  return Var;
-}
-
-DIE *DwarfDebug::createScopeChildrenDIE(
-    DwarfCompileUnit &TheCU, LexicalScope *Scope,
-    SmallVectorImpl<std::unique_ptr<DIE>> &Children) {
-  DIE *ObjectPointer = nullptr;
-
-  // Collect arguments for current function.
-  if (LScopes.isCurrentFunctionScope(Scope)) {
-    for (DbgVariable *ArgDV : CurrentFnArguments)
-      if (ArgDV)
-        Children.push_back(
-            constructVariableDIE(TheCU, *ArgDV, *Scope, ObjectPointer));
-
-    // If this is a variadic function, add an unspecified parameter.
-    DISubprogram SP(Scope->getScopeNode());
-    DIArray FnArgs = SP.getType().getTypeArray();
-    if (FnArgs.getElement(FnArgs.getNumElements() - 1)
-            .isUnspecifiedParameter()) {
-      Children.push_back(
-          make_unique<DIE>(dwarf::DW_TAG_unspecified_parameters));
-    }
-  }
-
-  // Collect lexical scope children first.
-  for (DbgVariable *DV : ScopeVariables.lookup(Scope))
-    Children.push_back(constructVariableDIE(TheCU, *DV, *Scope, ObjectPointer));
-
-  for (LexicalScope *LS : Scope->getChildren())
-    if (std::unique_ptr<DIE> Nested = constructScopeDIE(TheCU, LS))
-      Children.push_back(std::move(Nested));
-  return ObjectPointer;
-}
-
-void DwarfDebug::createAndAddScopeChildren(DwarfCompileUnit &TheCU,
-                                           LexicalScope *Scope, DIE &ScopeDIE) {
-  // We create children when the scope DIE is not null.
-  SmallVector<std::unique_ptr<DIE>, 8> Children;
-  if (DIE *ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children))
-    TheCU.addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, *ObjectPointer);
-
-  // Add children
-  for (auto &I : Children)
-    ScopeDIE.addChild(std::move(I));
-}
-
-void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &TheCU,
-                                                     LexicalScope *Scope) {
+void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) {
   assert(Scope && Scope->getScopeNode());
   assert(Scope->isAbstractScope());
   assert(!Scope->getInlinedAt());
 
-  DISubprogram SP(Scope->getScopeNode());
+  const MDNode *SP = Scope->getScopeNode();
 
   ProcessedSPNodes.insert(SP);
 
-  DIE *&AbsDef = AbstractSPDies[SP];
-  if (AbsDef)
-    return;
-
   // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
   // was inlined from another compile unit.
-  DwarfCompileUnit &SPCU = *SPMap[SP];
-  DIE *ContextDIE;
-
-  // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with
-  // the important distinction that the DIDescriptor is not associated with the
-  // DIE (since the DIDescriptor will be associated with the concrete DIE, if
-  // any). It could be refactored to some common utility function.
-  if (DISubprogram SPDecl = SP.getFunctionDeclaration()) {
-    ContextDIE = &SPCU.getUnitDie();
-    SPCU.getOrCreateSubprogramDIE(SPDecl);
-  } else
-    ContextDIE = SPCU.getOrCreateContextDIE(resolve(SP.getContext()));
-
-  // Passing null as the associated DIDescriptor because the abstract definition
-  // shouldn't be found by lookup.
-  AbsDef = &SPCU.createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE,
-                                 DIDescriptor());
-  SPCU.applySubprogramAttributesToDefinition(SP, *AbsDef);
-
-  SPCU.addUInt(*AbsDef, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
-  createAndAddScopeChildren(SPCU, Scope, *AbsDef);
-}
-
-DIE &DwarfDebug::constructSubprogramScopeDIE(DwarfCompileUnit &TheCU,
-                                             LexicalScope *Scope) {
-  assert(Scope && Scope->getScopeNode());
-  assert(!Scope->getInlinedAt());
-  assert(!Scope->isAbstractScope());
-  DISubprogram Sub(Scope->getScopeNode());
-
-  assert(Sub.isSubprogram());
-
-  ProcessedSPNodes.insert(Sub);
-
-  DIE &ScopeDIE = updateSubprogramScopeDIE(TheCU, Sub);
-
-  createAndAddScopeChildren(TheCU, Scope, ScopeDIE);
-
-  return ScopeDIE;
-}
-
-// Construct a DIE for this scope.
-std::unique_ptr<DIE> DwarfDebug::constructScopeDIE(DwarfCompileUnit &TheCU,
-                                                   LexicalScope *Scope) {
-  if (!Scope || !Scope->getScopeNode())
-    return nullptr;
-
-  DIScope DS(Scope->getScopeNode());
-
-  assert((Scope->getInlinedAt() || !DS.isSubprogram()) &&
-         "Only handle inlined subprograms here, use "
-         "constructSubprogramScopeDIE for non-inlined "
-         "subprograms");
-
-  SmallVector<std::unique_ptr<DIE>, 8> Children;
-
-  // We try to create the scope DIE first, then the children DIEs. This will
-  // avoid creating un-used children then removing them later when we find out
-  // the scope DIE is null.
-  std::unique_ptr<DIE> ScopeDIE;
-  if (Scope->getParent() && DS.isSubprogram()) {
-    ScopeDIE = constructInlinedScopeDIE(TheCU, Scope);
-    if (!ScopeDIE)
-      return nullptr;
-    // We create children when the scope DIE is not null.
-    createScopeChildrenDIE(TheCU, Scope, Children);
-  } else {
-    // Early exit when we know the scope DIE is going to be null.
-    if (isLexicalScopeDIENull(Scope))
-      return nullptr;
-
-    // We create children here when we know the scope DIE is not going to be
-    // null and the children will be added to the scope DIE.
-    createScopeChildrenDIE(TheCU, Scope, Children);
-
-    // There is no need to emit empty lexical block DIE.
-    std::pair<ImportedEntityMap::const_iterator,
-              ImportedEntityMap::const_iterator> Range =
-        std::equal_range(ScopesWithImportedEntities.begin(),
-                         ScopesWithImportedEntities.end(),
-                         std::pair<const MDNode *, const MDNode *>(DS, nullptr),
-                         less_first());
-    if (Children.empty() && Range.first == Range.second)
-      return nullptr;
-    ScopeDIE = constructLexicalScopeDIE(TheCU, Scope);
-    assert(ScopeDIE && "Scope DIE should not be null.");
-    for (ImportedEntityMap::const_iterator i = Range.first; i != Range.second;
-         ++i)
-      constructImportedEntityDIE(TheCU, i->second, *ScopeDIE);
-  }
-
-  // Add children
-  for (auto &I : Children)
-    ScopeDIE->addChild(std::move(I));
-
-  return ScopeDIE;
+  auto &CU = SPMap[SP];
+  forBothCUs(*CU, [&](DwarfCompileUnit &CU) {
+    CU.constructAbstractSubprogramScopeDIE(Scope);
+  });
 }
 
 void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const {
@@ -629,6 +371,8 @@
   DwarfCompileUnit &NewCU = *OwnedUnit;
   DIE &Die = NewCU.getUnitDie();
   InfoHolder.addUnit(std::move(OwnedUnit));
+  if (useSplitDwarf())
+    NewCU.setSkeleton(constructSkeletonCU(NewCU));
 
   // LTO with assembly output shares a single line table amongst multiple CUs.
   // To avoid the compilation directory being ambiguous, let the line table
@@ -665,14 +409,10 @@
     NewCU.addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers,
                   dwarf::DW_FORM_data1, RVer);
 
-  if (!FirstCU)
-    FirstCU = &NewCU;
-
-  if (useSplitDwarf()) {
+  if (useSplitDwarf())
     NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoDWOSection(),
                       DwarfInfoDWOSectionSym);
-    NewCU.setSkeleton(constructSkeletonCU(NewCU));
-  } else
+  else
     NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoSection(),
                       DwarfInfoSectionSym);
 
@@ -681,44 +421,12 @@
   return NewCU;
 }
 
-void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU,
-                                            const MDNode *N) {
+void DwarfDebug::constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
+                                                  const MDNode *N) {
   DIImportedEntity Module(N);
   assert(Module.Verify());
   if (DIE *D = TheCU.getOrCreateContextDIE(Module.getContext()))
-    constructImportedEntityDIE(TheCU, Module, *D);
-}
-
-void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU,
-                                            const MDNode *N, DIE &Context) {
-  DIImportedEntity Module(N);
-  assert(Module.Verify());
-  return constructImportedEntityDIE(TheCU, Module, Context);
-}
-
-void DwarfDebug::constructImportedEntityDIE(DwarfCompileUnit &TheCU,
-                                            const DIImportedEntity &Module,
-                                            DIE &Context) {
-  assert(Module.Verify() &&
-         "Use one of the MDNode * overloads to handle invalid metadata");
-  DIE &IMDie = TheCU.createAndAddDIE(Module.getTag(), Context, Module);
-  DIE *EntityDie;
-  DIDescriptor Entity = resolve(Module.getEntity());
-  if (Entity.isNameSpace())
-    EntityDie = TheCU.getOrCreateNameSpace(DINameSpace(Entity));
-  else if (Entity.isSubprogram())
-    EntityDie = TheCU.getOrCreateSubprogramDIE(DISubprogram(Entity));
-  else if (Entity.isType())
-    EntityDie = TheCU.getOrCreateTypeDIE(DIType(Entity));
-  else
-    EntityDie = TheCU.getDIE(Entity);
-  TheCU.addSourceLine(IMDie, Module.getLineNumber(),
-                      Module.getContext().getFilename(),
-                      Module.getContext().getDirectory());
-  TheCU.addDIEEntry(IMDie, dwarf::DW_AT_import, *EntityDie);
-  StringRef Name = Module.getName();
-  if (!Name.empty())
-    TheCU.addString(IMDie, dwarf::DW_AT_name, Name);
+    D->addChild(TheCU.constructImportedEntityDIE(Module));
 }
 
 // Emit all Dwarf sections that should come prior to the content. Create
@@ -732,8 +440,6 @@
 
   FunctionDIs = makeSubprogramMap(*M);
 
-  // If module has named metadata anchors then use them, otherwise scan the
-  // module using debug info finder to collect debug info.
   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
   if (!CU_Nodes)
     return;
@@ -756,13 +462,18 @@
               ScopesWithImportedEntities.end(), less_first());
     DIArray GVs = CUNode.getGlobalVariables();
     for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i)
-      CU.createGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i)));
+      CU.getOrCreateGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i)));
     DIArray SPs = CUNode.getSubprograms();
     for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i)
       SPMap.insert(std::make_pair(SPs.getElement(i), &CU));
     DIArray EnumTypes = CUNode.getEnumTypes();
-    for (unsigned i = 0, e = EnumTypes.getNumElements(); i != e; ++i)
-      CU.getOrCreateTypeDIE(EnumTypes.getElement(i));
+    for (unsigned i = 0, e = EnumTypes.getNumElements(); i != e; ++i) {
+      DIType Ty(EnumTypes.getElement(i));
+      // The enum types array by design contains pointers to
+      // MDNodes rather than DIRefs. Unique them here.
+      DIType UniqueTy(resolve(Ty.getRef()));
+      CU.getOrCreateTypeDIE(UniqueTy);
+    }
     DIArray RetainedTypes = CUNode.getRetainedTypes();
     for (unsigned i = 0, e = RetainedTypes.getNumElements(); i != e; ++i) {
       DIType Ty(RetainedTypes.getElement(i));
@@ -774,7 +485,7 @@
     // Emit imported_modules last so that the relevant context is already
     // available.
     for (unsigned i = 0, e = ImportedEntities.getNumElements(); i != e; ++i)
-      constructImportedEntityDIE(CU, ImportedEntities.getElement(i));
+      constructAndAddImportedEntityDIE(CU, ImportedEntities.getElement(i));
   }
 
   // Tell MMI that we have debug info.
@@ -787,9 +498,7 @@
 void DwarfDebug::finishVariableDefinitions() {
   for (const auto &Var : ConcreteVariables) {
     DIE *VariableDie = Var->getDIE();
-    // FIXME: There shouldn't be any variables without DIEs.
-    if (!VariableDie)
-      continue;
+    assert(VariableDie);
     // FIXME: Consider the time-space tradeoff of just storing the unit pointer
     // in the ConcreteVariables list, rather than looking it up again here.
     // DIE::getUnit isn't simple - it walks parent pointers, etc.
@@ -805,36 +514,10 @@
 }
 
 void DwarfDebug::finishSubprogramDefinitions() {
-  const Module *M = MMI->getModule();
-
-  NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
-  for (MDNode *N : CU_Nodes->operands()) {
-    DICompileUnit TheCU(N);
-    // Construct subprogram DIE and add variables DIEs.
-    DwarfCompileUnit *SPCU =
-        static_cast<DwarfCompileUnit *>(CUMap.lookup(TheCU));
-    DIArray Subprograms = TheCU.getSubprograms();
-    for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) {
-      DISubprogram SP(Subprograms.getElement(i));
-      // Perhaps the subprogram is in another CU (such as due to comdat
-      // folding, etc), in which case ignore it here.
-      if (SPMap[SP] != SPCU)
-        continue;
-      DIE *D = SPCU->getDIE(SP);
-      if (DIE *AbsSPDIE = AbstractSPDies.lookup(SP)) {
-        if (D)
-          // If this subprogram has an abstract definition, reference that
-          SPCU->addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
-      } else {
-        if (!D)
-          // Lazily construct the subprogram if we didn't see either concrete or
-          // inlined versions during codegen.
-          D = SPCU->getOrCreateSubprogramDIE(SP);
-        // And attach the attributes
-        SPCU->applySubprogramAttributesToDefinition(SP, *D);
-      }
-    }
-  }
+  for (const auto &P : SPMap)
+    forBothCUs(*P.second, [&](DwarfCompileUnit &CU) {
+      CU.finishSubprogramDefinition(DISubprogram(P.first));
+    });
 }
 
 
@@ -854,26 +537,7 @@
         DISubprogram SP(Subprograms.getElement(i));
         if (ProcessedSPNodes.count(SP) != 0)
           continue;
-        assert(SP.isSubprogram() &&
-               "CU's subprogram list contains a non-subprogram");
-        assert(SP.isDefinition() &&
-               "CU's subprogram list contains a subprogram declaration");
-        DIArray Variables = SP.getVariables();
-        if (Variables.getNumElements() == 0)
-          continue;
-
-        DIE *SPDIE = AbstractSPDies.lookup(SP);
-        if (!SPDIE)
-          SPDIE = SPCU->getDIE(SP);
-        assert(SPDIE);
-        for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
-          DIVariable DV(Variables.getElement(vi));
-          assert(DV.isVariable());
-          DbgVariable NewVar(DV, this);
-          auto VariableDie = SPCU->constructVariableDIE(NewVar);
-          SPCU->applyVariableAttributes(NewVar, *VariableDie);
-          SPDIE->addChild(std::move(VariableDie));
-        }
+        SPCU->collectDeadVariables(SP);
       }
     }
   }
@@ -889,66 +553,52 @@
 
   // Handle anything that needs to be done on a per-unit basis after
   // all other generation.
-  for (const auto &TheU : getUnits()) {
+  for (const auto &P : CUMap) {
+    auto &TheCU = *P.second;
     // Emit DW_AT_containing_type attribute to connect types with their
     // vtable holding type.
-    TheU->constructContainingTypeDIEs();
+    TheCU.constructContainingTypeDIEs();
 
     // Add CU specific attributes if we need to add any.
-    if (TheU->getUnitDie().getTag() == dwarf::DW_TAG_compile_unit) {
-      // If we're splitting the dwarf out now that we've got the entire
-      // CU then add the dwo id to it.
-      DwarfCompileUnit *SkCU =
-          static_cast<DwarfCompileUnit *>(TheU->getSkeleton());
-      if (useSplitDwarf()) {
-        // Emit a unique identifier for this CU.
-        uint64_t ID = DIEHash(Asm).computeCUSignature(TheU->getUnitDie());
-        TheU->addUInt(TheU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
-                      dwarf::DW_FORM_data8, ID);
-        SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
-                      dwarf::DW_FORM_data8, ID);
+    // If we're splitting the dwarf out now that we've got the entire
+    // CU then add the dwo id to it.
+    auto *SkCU = TheCU.getSkeleton();
+    if (useSplitDwarf()) {
+      // Emit a unique identifier for this CU.
+      uint64_t ID = DIEHash(Asm).computeCUSignature(TheCU.getUnitDie());
+      TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
+                    dwarf::DW_FORM_data8, ID);
+      SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
+                    dwarf::DW_FORM_data8, ID);
 
-        // We don't keep track of which addresses are used in which CU so this
-        // is a bit pessimistic under LTO.
-        if (!AddrPool.isEmpty())
-          addSectionLabel(*Asm, *SkCU, SkCU->getUnitDie(),
-                          dwarf::DW_AT_GNU_addr_base, DwarfAddrSectionSym,
-                          DwarfAddrSectionSym);
-        if (!TheU->getRangeLists().empty())
-          addSectionLabel(*Asm, *SkCU, SkCU->getUnitDie(),
-                          dwarf::DW_AT_GNU_ranges_base,
-                          DwarfDebugRangeSectionSym, DwarfDebugRangeSectionSym);
-      }
+      // We don't keep track of which addresses are used in which CU so this
+      // is a bit pessimistic under LTO.
+      if (!AddrPool.isEmpty())
+        SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_addr_base,
+                              DwarfAddrSectionSym, DwarfAddrSectionSym);
+      if (!SkCU->getRangeLists().empty())
+        SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base,
+                              DwarfDebugRangeSectionSym,
+                              DwarfDebugRangeSectionSym);
+    }
 
-      // If we have code split among multiple sections or non-contiguous
-      // ranges of code then emit a DW_AT_ranges attribute on the unit that will
-      // remain in the .o file, otherwise add a DW_AT_low_pc.
-      // FIXME: We should use ranges allow reordering of code ala
-      // .subsections_via_symbols in mach-o. This would mean turning on
-      // ranges for all subprogram DIEs for mach-o.
-      DwarfCompileUnit &U =
-          SkCU ? *SkCU : static_cast<DwarfCompileUnit &>(*TheU);
-      unsigned NumRanges = TheU->getRanges().size();
-      if (NumRanges) {
-        if (NumRanges > 1) {
-          addSectionLabel(*Asm, U, U.getUnitDie(), dwarf::DW_AT_ranges,
-                          Asm->GetTempSymbol("cu_ranges", U.getUniqueID()),
-                          DwarfDebugRangeSectionSym);
-
-          // A DW_AT_low_pc attribute may also be specified in combination with
-          // DW_AT_ranges to specify the default base address for use in
-          // location lists (see Section 2.6.2) and range lists (see Section
-          // 2.17.3).
-          U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
-                    0);
-        } else {
-          RangeSpan &Range = TheU->getRanges().back();
-          U.addLocalLabelAddress(U.getUnitDie(), dwarf::DW_AT_low_pc,
-                                 Range.getStart());
-          U.addLabelDelta(U.getUnitDie(), dwarf::DW_AT_high_pc, Range.getEnd(),
-                          Range.getStart());
-        }
-      }
+    // If we have code split among multiple sections or non-contiguous
+    // ranges of code then emit a DW_AT_ranges attribute on the unit that will
+    // remain in the .o file, otherwise add a DW_AT_low_pc.
+    // FIXME: We should use ranges to allow reordering of code ala
+    // .subsections_via_symbols in mach-o. This would mean turning on
+    // ranges for all subprogram DIEs for mach-o.
+    DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
+    if (unsigned NumRanges = TheCU.getRanges().size()) {
+      if (NumRanges > 1)
+        // A DW_AT_low_pc attribute may also be specified in combination with
+        // DW_AT_ranges to specify the default base address for use in
+        // location lists (see Section 2.6.2) and range lists (see Section
+        // 2.17.3).
+        U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
+      else
+        TheCU.setBaseAddress(TheCU.getRanges().front().getStart());
+      U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
     }
   }
 
@@ -1010,7 +660,10 @@
   assert(CurFn == nullptr);
   assert(CurMI == nullptr);
 
-  if (!FirstCU)
+  // If we aren't actually generating debug info (check beginModule -
+  // conditionalized on !DisableDebugInfoPrinting and the presence of the
+  // llvm.dbg.cu metadata node), there is nothing to emit.
+  if (!DwarfInfoSectionSym)
     return;
 
   // End any existing sections.
@@ -1064,9 +717,6 @@
   // clean up.
   SPMap.clear();
   AbstractVariables.clear();
-
-  // Reset these for the next Module if we have one.
-  FirstCU = nullptr;
 }
 
 // Find abstract variable, if any, associated with Var.
@@ -1092,8 +742,8 @@
 
 void DwarfDebug::createAbstractVariable(const DIVariable &Var,
                                         LexicalScope *Scope) {
-  auto AbsDbgVariable = make_unique<DbgVariable>(Var, this);
-  addScopeVariable(Scope, AbsDbgVariable.get());
+  auto AbsDbgVariable = make_unique<DbgVariable>(Var, DIExpression(), this);
+  InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());
   AbstractVariables[Var] = std::move(AbsDbgVariable);
 }
 
@@ -1117,55 +767,35 @@
     createAbstractVariable(Cleansed, Scope);
 }
 
-// If Var is a current function argument then add it to CurrentFnArguments list.
-bool DwarfDebug::addCurrentFnArgument(DbgVariable *Var, LexicalScope *Scope) {
-  if (!LScopes.isCurrentFunctionScope(Scope))
-    return false;
-  DIVariable DV = Var->getVariable();
-  if (DV.getTag() != dwarf::DW_TAG_arg_variable)
-    return false;
-  unsigned ArgNo = DV.getArgNumber();
-  if (ArgNo == 0)
-    return false;
-
-  size_t Size = CurrentFnArguments.size();
-  if (Size == 0)
-    CurrentFnArguments.resize(CurFn->getFunction()->arg_size());
-  // llvm::Function argument size is not good indicator of how many
-  // arguments does the function have at source level.
-  if (ArgNo > Size)
-    CurrentFnArguments.resize(ArgNo * 2);
-  CurrentFnArguments[ArgNo - 1] = Var;
-  return true;
-}
-
 // Collect variable information from side table maintained by MMI.
 void DwarfDebug::collectVariableInfoFromMMITable(
-    SmallPtrSet<const MDNode *, 16> &Processed) {
+    SmallPtrSetImpl<const MDNode *> &Processed) {
   for (const auto &VI : MMI->getVariableDbgInfo()) {
     if (!VI.Var)
       continue;
     Processed.insert(VI.Var);
-    DIVariable DV(VI.Var);
     LexicalScope *Scope = LScopes.findLexicalScope(VI.Loc);
 
     // If variable scope is not found then skip this variable.
     if (!Scope)
       continue;
 
+    DIVariable DV(VI.Var);
+    DIExpression Expr(VI.Expr);
     ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode());
-    ConcreteVariables.push_back(make_unique<DbgVariable>(DV, this));
+    ConcreteVariables.push_back(make_unique<DbgVariable>(DV, Expr, this));
     DbgVariable *RegVar = ConcreteVariables.back().get();
     RegVar->setFrameIndex(VI.Slot);
-    addScopeVariable(Scope, RegVar);
+    InfoHolder.addScopeVariable(Scope, RegVar);
   }
 }
 
 // Get .debug_loc entry for the instruction range starting at MI.
 static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) {
+  const MDNode *Expr = MI->getDebugExpression();
   const MDNode *Var = MI->getDebugVariable();
 
-  assert(MI->getNumOperands() == 3);
+  assert(MI->getNumOperands() == 4);
   if (MI->getOperand(0).isReg()) {
     MachineLocation MLoc;
     // If the second operand is an immediate, this is a
@@ -1174,24 +804,138 @@
       MLoc.set(MI->getOperand(0).getReg());
     else
       MLoc.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
-    return DebugLocEntry::Value(Var, MLoc);
+    return DebugLocEntry::Value(Var, Expr, MLoc);
   }
   if (MI->getOperand(0).isImm())
-    return DebugLocEntry::Value(Var, MI->getOperand(0).getImm());
+    return DebugLocEntry::Value(Var, Expr, MI->getOperand(0).getImm());
   if (MI->getOperand(0).isFPImm())
-    return DebugLocEntry::Value(Var, MI->getOperand(0).getFPImm());
+    return DebugLocEntry::Value(Var, Expr, MI->getOperand(0).getFPImm());
   if (MI->getOperand(0).isCImm())
-    return DebugLocEntry::Value(Var, MI->getOperand(0).getCImm());
+    return DebugLocEntry::Value(Var, Expr, MI->getOperand(0).getCImm());
 
-  llvm_unreachable("Unexpected 3 operand DBG_VALUE instruction!");
+  llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
 }
 
+/// Determine whether two pieces overlap; non-piece expressions always do.
+static bool piecesOverlap(DIExpression P1, DIExpression P2) {
+  if (!P1.isVariablePiece() || !P2.isVariablePiece())
+    return true;
+  unsigned l1 = P1.getPieceOffset();
+  unsigned l2 = P2.getPieceOffset();
+  unsigned r1 = l1 + P1.getPieceSize();
+  unsigned r2 = l2 + P2.getPieceSize();
+  // True where the half-open intervals [l1,r1[ and [l2,r2[ overlap.
+  return (l1 < r2) && (l2 < r1);
+}
+
+/// Build the location list for all DBG_VALUEs in the function that
+/// describe the same variable.  If the ranges of several independent
+/// pieces of the same variable overlap partially, split them up and
+/// combine the ranges. The resulting DebugLocEntries will have
+/// strictly monotonically increasing begin addresses and will never
+/// overlap.
+//
+// Input:
+//
+//   Ranges History [var, loc, piece ofs size]
+// 0 |      [x, (reg0, piece 0, 32)]
+// 1 | |    [x, (reg1, piece 32, 32)] <- IsPieceOfPrevEntry
+// 2 | |    ...
+// 3   |    [clobber reg0]
+// 4        [x, (mem, piece 0, 64)] <- overlapping with both previous pieces of x.
+//
+// Output:
+//
+// [0-1]    [x, (reg0, piece  0, 32)]
+// [1-3]    [x, (reg0, piece  0, 32), (reg1, piece 32, 32)]
+// [3-4]    [x, (reg1, piece 32, 32)]
+// [4- ]    [x, (mem,  piece  0, 64)]
+void
+DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
+                              const DbgValueHistoryMap::InstrRanges &Ranges) {
+  SmallVector<DebugLocEntry::Value, 4> OpenRanges;
+
+  for (auto I = Ranges.begin(), E = Ranges.end(); I != E; ++I) {
+    const MachineInstr *Begin = I->first;
+    const MachineInstr *End = I->second;
+    assert(Begin->isDebugValue() && "Invalid History entry");
+
+    // Check if a variable is inaccessible in this range.
+    if (Begin->getNumOperands() > 1 &&
+        Begin->getOperand(0).isReg() && !Begin->getOperand(0).getReg()) {
+      OpenRanges.clear();
+      continue;
+    }
+
+    // If this piece overlaps with any open ranges, truncate them.
+    DIExpression DIExpr = Begin->getDebugExpression();
+    auto Last = std::remove_if(OpenRanges.begin(), OpenRanges.end(),
+                               [&](DebugLocEntry::Value R) {
+      return piecesOverlap(DIExpr, R.getExpression());
+    });
+    OpenRanges.erase(Last, OpenRanges.end());
+
+    const MCSymbol *StartLabel = getLabelBeforeInsn(Begin);
+    assert(StartLabel && "Forgot label before DBG_VALUE starting a range!");
+
+    const MCSymbol *EndLabel;
+    if (End != nullptr)
+      EndLabel = getLabelAfterInsn(End);
+    else if (std::next(I) == Ranges.end())
+      EndLabel = FunctionEndSym;
+    else
+      EndLabel = getLabelBeforeInsn(std::next(I)->first);
+    assert(EndLabel && "Forgot label after instruction ending a range!");
+
+    DEBUG(dbgs() << "DotDebugLoc: " << *Begin << "\n");
+
+    auto Value = getDebugLocValue(Begin);
+    DebugLocEntry Loc(StartLabel, EndLabel, Value);
+    bool couldMerge = false;
+
+    // If this is a piece, it may belong to the current DebugLocEntry.
+    if (DIExpr.isVariablePiece()) {
+      // Add this value to the list of open ranges.
+      OpenRanges.push_back(Value);
+
+      // Attempt to add the piece to the last entry.
+      if (!DebugLoc.empty())
+        if (DebugLoc.back().MergeValues(Loc))
+          couldMerge = true;
+    }
+
+    if (!couldMerge) {
+      // Need to add a new DebugLocEntry. Add all values from still
+      // valid non-overlapping pieces.
+      if (OpenRanges.size())
+        Loc.addValues(OpenRanges);
+
+      DebugLoc.push_back(std::move(Loc));
+    }
+
+    // Dump the current entry first: the coalescing step below may pop it,
+    // which would invalidate CurEntry.
+    auto CurEntry = DebugLoc.rbegin();
+    DEBUG({
+      dbgs() << CurEntry->getValues().size() << " Values:\n";
+      for (auto Value : CurEntry->getValues()) {
+        Value.getVariable()->dump();
+        Value.getExpression()->dump();
+      }
+      dbgs() << "-----\n";
+    });
+    // Coalesce the ranges of two otherwise identical DebugLocEntries.
+    auto PrevEntry = std::next(CurEntry);
+    if (PrevEntry != DebugLoc.rend() && PrevEntry->MergeRanges(*CurEntry))
+      DebugLoc.pop_back();
+  }
+}
+
+
 // Find variables for each lexical scope.
 void
-DwarfDebug::collectVariableInfo(SmallPtrSet<const MDNode *, 16> &Processed) {
-  LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
-  DwarfCompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
-
+DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, DISubprogram SP,
+                                SmallPtrSetImpl<const MDNode *> &Processed) {
   // Grab the variable info that was squirreled away in the MMI side-table.
   collectVariableInfoFromMMITable(Processed);
 
@@ -1206,10 +950,7 @@
       continue;
 
     LexicalScope *Scope = nullptr;
-    if (DV.getTag() == dwarf::DW_TAG_arg_variable &&
-        DISubprogram(DV.getContext()).describes(CurFn->getFunction()))
-      Scope = LScopes.getCurrentFunctionScope();
-    else if (MDNode *IA = DV.getInlinedAt()) {
+    if (MDNode *IA = DV.getInlinedAt()) {
       DebugLoc DL = DebugLoc::getFromDILocation(IA);
       Scope = LScopes.findInlinedScope(DebugLoc::get(
           DL.getLine(), DL.getCol(), DV.getContext(), IA));
@@ -1225,7 +966,7 @@
     ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode());
     ConcreteVariables.push_back(make_unique<DbgVariable>(MInsn, this));
     DbgVariable *RegVar = ConcreteVariables.back().get();
-    addScopeVariable(Scope, RegVar);
+    InfoHolder.addScopeVariable(Scope, RegVar);
 
     // Check if the first DBG_VALUE is valid for the rest of the function.
     if (Ranges.size() == 1 && Ranges.front().second == nullptr)
@@ -1236,53 +977,26 @@
 
     DotDebugLocEntries.resize(DotDebugLocEntries.size() + 1);
     DebugLocList &LocList = DotDebugLocEntries.back();
+    LocList.CU = &TheCU;
     LocList.Label =
         Asm->GetTempSymbol("debug_loc", DotDebugLocEntries.size() - 1);
-    SmallVector<DebugLocEntry, 4> &DebugLoc = LocList.List;
-    for (auto I = Ranges.begin(), E = Ranges.end(); I != E; ++I) {
-      const MachineInstr *Begin = I->first;
-      const MachineInstr *End = I->second;
-      assert(Begin->isDebugValue() && "Invalid History entry");
 
-      // Check if a variable is unaccessible in this range.
-      if (Begin->getNumOperands() > 1 && Begin->getOperand(0).isReg() &&
-          !Begin->getOperand(0).getReg())
-        continue;
-      DEBUG(dbgs() << "DotDebugLoc Pair:\n" << "\t" << *Begin);
-      if (End != nullptr)
-        DEBUG(dbgs() << "\t" << *End);
-      else
-        DEBUG(dbgs() << "\tNULL\n");
-
-      const MCSymbol *StartLabel = getLabelBeforeInsn(Begin);
-      assert(StartLabel && "Forgot label before DBG_VALUE starting a range!");
-
-      const MCSymbol *EndLabel;
-      if (End != nullptr)
-        EndLabel = getLabelAfterInsn(End);
-      else if (std::next(I) == Ranges.end())
-        EndLabel = FunctionEndSym;
-      else
-        EndLabel = getLabelBeforeInsn(std::next(I)->first);
-      assert(EndLabel && "Forgot label after instruction ending a range!");
-
-      DebugLocEntry Loc(StartLabel, EndLabel, getDebugLocValue(Begin), TheCU);
-      if (DebugLoc.empty() || !DebugLoc.back().Merge(Loc))
-        DebugLoc.push_back(std::move(Loc));
-    }
+    // Build the location list for this variable.
+    buildLocationList(LocList.List, Ranges);
   }
 
   // Collect info for variables that were optimized out.
-  DIArray Variables = DISubprogram(FnScope->getScopeNode()).getVariables();
+  DIArray Variables = SP.getVariables();
   for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) {
     DIVariable DV(Variables.getElement(i));
     assert(DV.isVariable());
-    if (!Processed.insert(DV))
+    if (!Processed.insert(DV).second)
       continue;
     if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext())) {
       ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode());
-      ConcreteVariables.push_back(make_unique<DbgVariable>(DV, this));
-      addScopeVariable(Scope, ConcreteVariables.back().get());
+      DIExpression NoExpr;
+      ConcreteVariables.push_back(make_unique<DbgVariable>(DV, NoExpr, this));
+      InfoHolder.addScopeVariable(Scope, ConcreteVariables.back().get());
     }
   }
 }
@@ -1458,7 +1172,8 @@
   Asm->OutStreamer.EmitLabel(FunctionBeginSym);
 
   // Calculate history for local variables.
-  calculateDbgValueHistory(MF, Asm->TM.getRegisterInfo(), DbgValues);
+  calculateDbgValueHistory(MF, Asm->TM.getSubtargetImpl()->getRegisterInfo(),
+                           DbgValues);
 
   // Request labels for the full history.
   for (const auto &I : DbgValues) {
@@ -1468,10 +1183,24 @@
 
     // The first mention of a function argument gets the FunctionBeginSym
     // label, so arguments are visible when breaking at function entry.
-    DIVariable DV(I.first);
-    if (DV.isVariable() && DV.getTag() == dwarf::DW_TAG_arg_variable &&
-        getDISubprogram(DV.getContext()).describes(MF->getFunction()))
+    DIVariable DIVar(Ranges.front().first->getDebugVariable());
+    if (DIVar.isVariable() && DIVar.getTag() == dwarf::DW_TAG_arg_variable &&
+        getDISubprogram(DIVar.getContext()).describes(MF->getFunction())) {
       LabelsBeforeInsn[Ranges.front().first] = FunctionBeginSym;
+      if (Ranges.front().first->getDebugExpression().isVariablePiece()) {
+        // Mark all non-overlapping initial pieces.
+        for (auto I = Ranges.begin(); I != Ranges.end(); ++I) {
+          DIExpression Piece = I->first->getDebugExpression();
+          if (std::all_of(Ranges.begin(), I,
+                          [&](DbgValueHistoryMap::InstrRange Pred) {
+                return !piecesOverlap(Piece, Pred.first->getDebugExpression());
+              }))
+            LabelsBeforeInsn[I->first] = FunctionBeginSym;
+          else
+            break;
+        }
+      }
+    }
 
     for (const auto &Range : Ranges) {
       requestLabelBeforeInsn(Range.first);
@@ -1497,56 +1226,16 @@
   }
 }
 
-void DwarfDebug::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
-  if (addCurrentFnArgument(Var, LS))
-    return;
-  SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS];
-  DIVariable DV = Var->getVariable();
-  // Variables with positive arg numbers are parameters.
-  if (unsigned ArgNum = DV.getArgNumber()) {
-    // Keep all parameters in order at the start of the variable list to ensure
-    // function types are correct (no out-of-order parameters)
-    //
-    // This could be improved by only doing it for optimized builds (unoptimized
-    // builds have the right order to begin with), searching from the back (this
-    // would catch the unoptimized case quickly), or doing a binary search
-    // rather than linear search.
-    SmallVectorImpl<DbgVariable *>::iterator I = Vars.begin();
-    while (I != Vars.end()) {
-      unsigned CurNum = (*I)->getVariable().getArgNumber();
-      // A local (non-parameter) variable has been found, insert immediately
-      // before it.
-      if (CurNum == 0)
-        break;
-      // A later indexed parameter has been found, insert immediately before it.
-      if (CurNum > ArgNum)
-        break;
-      ++I;
-    }
-    Vars.insert(I, Var);
-    return;
-  }
-
-  Vars.push_back(Var);
-}
-
 // Gather and emit post-function debug information.
 void DwarfDebug::endFunction(const MachineFunction *MF) {
-  // Every beginFunction(MF) call should be followed by an endFunction(MF) call,
-  // though the beginFunction may not be called at all.
-  // We should handle both cases.
-  if (!CurFn)
-    CurFn = MF;
-  else
-    assert(CurFn == MF);
-  assert(CurFn != nullptr);
+  assert(CurFn == MF &&
+      "endFunction should be called with the same function as beginFunction");
 
   if (!MMI->hasDebugInfo() || LScopes.empty() ||
       !FunctionDIs.count(MF->getFunction())) {
     // If we don't have a lexical scope for this function then there will
     // be a hole in the range information. Keep note of this by setting the
     // previously used section to nullptr.
-    PrevSection = nullptr;
     PrevCU = nullptr;
     CurFn = nullptr;
     return;
@@ -1560,45 +1249,63 @@
   // Set DwarfDwarfCompileUnitID in MCContext to default value.
   Asm->OutStreamer.getContext().setDwarfCompileUnitID(0);
 
-  SmallPtrSet<const MDNode *, 16> ProcessedVars;
-  collectVariableInfo(ProcessedVars);
-
   LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
-  DwarfCompileUnit &TheCU = *SPMap.lookup(FnScope->getScopeNode());
+  DISubprogram SP(FnScope->getScopeNode());
+  DwarfCompileUnit &TheCU = *SPMap.lookup(SP);
 
+  SmallPtrSet<const MDNode *, 16> ProcessedVars;
+  collectVariableInfo(TheCU, SP, ProcessedVars);
+
+  // Add the range of this function to the list of ranges for the CU.
+  TheCU.addRange(RangeSpan(FunctionBeginSym, FunctionEndSym));
+
+  // Under -gmlt, skip building the subprogram if there are no inlined
+  // subroutines inside it.
+  if (TheCU.getCUNode().getEmissionKind() == DIBuilder::LineTablesOnly &&
+      LScopes.getAbstractScopesList().empty() && !IsDarwin) {
+    assert(InfoHolder.getScopeVariables().empty());
+    assert(DbgValues.empty());
+    // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed
+    // by a -gmlt CU. Add a test and remove this assertion.
+    assert(AbstractVariables.empty());
+    LabelsBeforeInsn.clear();
+    LabelsAfterInsn.clear();
+    PrevLabel = nullptr;
+    CurFn = nullptr;
+    return;
+  }
+
+#ifndef NDEBUG
+  size_t NumAbstractScopes = LScopes.getAbstractScopesList().size();
+#endif
   // Construct abstract scopes.
   for (LexicalScope *AScope : LScopes.getAbstractScopesList()) {
     DISubprogram SP(AScope->getScopeNode());
-    if (!SP.isSubprogram())
-      continue;
+    assert(SP.isSubprogram());
     // Collect info for variables that were optimized out.
     DIArray Variables = SP.getVariables();
     for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) {
       DIVariable DV(Variables.getElement(i));
       assert(DV && DV.isVariable());
-      if (!ProcessedVars.insert(DV))
+      if (!ProcessedVars.insert(DV).second)
         continue;
       ensureAbstractVariableIsCreated(DV, DV.getContext());
+      assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes
+             && "ensureAbstractVariableIsCreated inserted abstract scopes");
     }
-    constructAbstractSubprogramScopeDIE(TheCU, AScope);
+    constructAbstractSubprogramScopeDIE(AScope);
   }
 
-  DIE &CurFnDIE = constructSubprogramScopeDIE(TheCU, FnScope);
-  if (!CurFn->getTarget().Options.DisableFramePointerElim(*CurFn))
-    TheCU.addFlag(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr);
-
-  // Add the range of this function to the list of ranges for the CU.
-  RangeSpan Span(FunctionBeginSym, FunctionEndSym);
-  TheCU.addRange(std::move(Span));
-  PrevSection = Asm->getCurrentSection();
-  PrevCU = &TheCU;
+  TheCU.constructSubprogramScopeDIE(FnScope);
+  if (auto *SkelCU = TheCU.getSkeleton())
+    if (!LScopes.getAbstractScopesList().empty())
+      SkelCU->constructSubprogramScopeDIE(FnScope);
 
   // Clear debug info
   // Ownership of DbgVariables is a bit subtle - ScopeVariables owns all the
   // DbgVariables except those that are also in AbstractVariables (since they
   // can be used cross-function)
-  ScopeVariables.clear();
-  CurrentFnArguments.clear();
+  InfoHolder.getScopeVariables().clear();
   DbgValues.clear();
   LabelsBeforeInsn.clear();
   LabelsAfterInsn.clear();
@@ -1618,8 +1325,8 @@
     assert(Scope.isScope());
     Fn = Scope.getFilename();
     Dir = Scope.getDirectory();
-    if (Scope.isLexicalBlock())
-      Discriminator = DILexicalBlock(S).getDiscriminator();
+    if (Scope.isLexicalBlockFile())
+      Discriminator = DILexicalBlockFile(S).getDiscriminator();
 
     unsigned CUID = Asm->OutStreamer.getContext().getDwarfCompileUnitID();
     Src = static_cast<DwarfCompileUnit &>(*InfoHolder.getUnits()[CUID])
@@ -1640,9 +1347,12 @@
   // Dwarf sections base addresses.
   DwarfInfoSectionSym =
       emitSectionSym(Asm, TLOF.getDwarfInfoSection(), "section_info");
-  if (useSplitDwarf())
+  if (useSplitDwarf()) {
     DwarfInfoDWOSectionSym =
         emitSectionSym(Asm, TLOF.getDwarfInfoDWOSection(), "section_info_dwo");
+    DwarfTypesDWOSectionSym =
+        emitSectionSym(Asm, TLOF.getDwarfTypesDWOSection(), "section_types_dwo");
+  }
   DwarfAbbrevSectionSym =
       emitSectionSym(Asm, TLOF.getDwarfAbbrevSection(), "section_abbrev");
   if (useSplitDwarf())
@@ -1726,7 +1436,7 @@
 void DwarfDebug::emitDebugInfo() {
   DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
 
-  Holder.emitUnits(this, DwarfAbbrevSectionSym);
+  Holder.emitUnits(DwarfAbbrevSectionSym);
 }
 
 // Emit the abbreviation section.
@@ -1760,54 +1470,41 @@
   Asm->EmitInt8(1);
 }
 
-// Emit visible names into a hashed accelerator table section.
-void DwarfDebug::emitAccelNames() {
-  AccelNames.FinalizeTable(Asm, "Names");
-  Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfAccelNamesSection());
-  MCSymbol *SectionBegin = Asm->GetTempSymbol("names_begin");
+void DwarfDebug::emitAccel(DwarfAccelTable &Accel, const MCSection *Section,
+                           StringRef TableName, StringRef SymName) {
+  Accel.FinalizeTable(Asm, TableName);
+  Asm->OutStreamer.SwitchSection(Section);
+  auto *SectionBegin = Asm->GetTempSymbol(SymName);
   Asm->OutStreamer.EmitLabel(SectionBegin);
 
   // Emit the full data.
-  AccelNames.Emit(Asm, SectionBegin, &InfoHolder);
+  Accel.Emit(Asm, SectionBegin, this, DwarfStrSectionSym);
+}
+
+// Emit visible names into a hashed accelerator table section.
+void DwarfDebug::emitAccelNames() {
+  emitAccel(AccelNames, Asm->getObjFileLowering().getDwarfAccelNamesSection(),
+            "Names", "names_begin");
 }
 
 // Emit objective C classes and categories into a hashed accelerator table
 // section.
 void DwarfDebug::emitAccelObjC() {
-  AccelObjC.FinalizeTable(Asm, "ObjC");
-  Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfAccelObjCSection());
-  MCSymbol *SectionBegin = Asm->GetTempSymbol("objc_begin");
-  Asm->OutStreamer.EmitLabel(SectionBegin);
-
-  // Emit the full data.
-  AccelObjC.Emit(Asm, SectionBegin, &InfoHolder);
+  emitAccel(AccelObjC, Asm->getObjFileLowering().getDwarfAccelObjCSection(),
+            "ObjC", "objc_begin");
 }
 
 // Emit namespace dies into a hashed accelerator table.
 void DwarfDebug::emitAccelNamespaces() {
-  AccelNamespace.FinalizeTable(Asm, "namespac");
-  Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfAccelNamespaceSection());
-  MCSymbol *SectionBegin = Asm->GetTempSymbol("namespac_begin");
-  Asm->OutStreamer.EmitLabel(SectionBegin);
-
-  // Emit the full data.
-  AccelNamespace.Emit(Asm, SectionBegin, &InfoHolder);
+  emitAccel(AccelNamespace,
+            Asm->getObjFileLowering().getDwarfAccelNamespaceSection(),
+            "namespac", "namespac_begin");
 }
 
 // Emit type dies into a hashed accelerator table.
 void DwarfDebug::emitAccelTypes() {
-
-  AccelTypes.FinalizeTable(Asm, "types");
-  Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfAccelTypesSection());
-  MCSymbol *SectionBegin = Asm->GetTempSymbol("types_begin");
-  Asm->OutStreamer.EmitLabel(SectionBegin);
-
-  // Emit the full data.
-  AccelTypes.Emit(Asm, SectionBegin, &InfoHolder);
+  emitAccel(AccelTypes, Asm->getObjFileLowering().getDwarfAccelTypesSection(),
+            "types", "types_begin");
 }
 
 // Public name handling.
@@ -1874,12 +1571,13 @@
       GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubNamesSection()
                : Asm->getObjFileLowering().getDwarfPubNamesSection();
 
-  emitDebugPubSection(GnuStyle, PSec, "Names", &DwarfUnit::getGlobalNames);
+  emitDebugPubSection(GnuStyle, PSec, "Names",
+                      &DwarfCompileUnit::getGlobalNames);
 }
 
 void DwarfDebug::emitDebugPubSection(
     bool GnuStyle, const MCSection *PSec, StringRef Name,
-    const StringMap<const DIE *> &(DwarfUnit::*Accessor)() const) {
+    const StringMap<const DIE *> &(DwarfCompileUnit::*Accessor)() const) {
   for (const auto &NU : CUMap) {
     DwarfCompileUnit *TheU = NU.second;
 
@@ -1888,7 +1586,7 @@
     if (Globals.empty())
       continue;
 
-    if (auto Skeleton = static_cast<DwarfCompileUnit *>(TheU->getSkeleton()))
+    if (auto *Skeleton = TheU->getSkeleton())
       TheU = Skeleton;
     unsigned ID = TheU->getUniqueID();
 
@@ -1910,7 +1608,7 @@
     Asm->EmitSectionOffset(TheU->getLabelBegin(), TheU->getSectionSym());
 
     Asm->OutStreamer.AddComment("Compilation Unit Length");
-    Asm->EmitLabelDifference(TheU->getLabelEnd(), TheU->getLabelBegin(), 4);
+    Asm->EmitInt32(TheU->getLength());
 
     // Emit the pubnames for this compilation unit.
     for (const auto &GI : Globals) {
@@ -1943,7 +1641,8 @@
       GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubTypesSection()
                : Asm->getObjFileLowering().getDwarfPubTypesSection();
 
-  emitDebugPubSection(GnuStyle, PSec, "Types", &DwarfUnit::getGlobalTypes);
+  emitDebugPubSection(GnuStyle, PSec, "Types",
+                      &DwarfCompileUnit::getGlobalTypes);
 }
 
 // Emit visible names into a debug str section.
@@ -1952,12 +1651,67 @@
   Holder.emitStrings(Asm->getObjFileLowering().getDwarfStrSection());
 }
 
+/// Emits an optimal (=sorted) sequence of DW_OP_pieces.
+void DwarfDebug::emitLocPieces(ByteStreamer &Streamer,
+                               const DITypeIdentifierMap &Map,
+                               ArrayRef<DebugLocEntry::Value> Values) {
+  assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value P) {
+        return P.isVariablePiece();
+      }) && "all values are expected to be pieces");
+  assert(std::is_sorted(Values.begin(), Values.end()) &&
+         "pieces are expected to be sorted");
+
+  unsigned Offset = 0;
+  for (auto Piece : Values) {
+    DIExpression Expr = Piece.getExpression();
+    unsigned PieceOffset = Expr.getPieceOffset();
+    unsigned PieceSize = Expr.getPieceSize();
+    assert(Offset <= PieceOffset && "overlapping or duplicate pieces");
+    if (Offset < PieceOffset) {
+      // The DWARF spec seriously mandates pieces with no locations for gaps.
+      Asm->EmitDwarfOpPiece(Streamer, (PieceOffset-Offset)*8);
+      Offset += PieceOffset-Offset;
+    }
+
+    Offset += PieceSize;
+
+    const unsigned SizeOfByte = 8;
+#ifndef NDEBUG
+    DIVariable Var = Piece.getVariable();
+    assert(!Var.isIndirect() && "indirect address for piece");
+    unsigned VarSize = Var.getSizeInBits(Map);
+    assert(PieceSize+PieceOffset <= VarSize/SizeOfByte
+           && "piece is larger than or outside of variable");
+    assert(PieceSize*SizeOfByte != VarSize
+           && "piece covers entire variable");
+#endif
+    if (Piece.isLocation() && Piece.getLoc().isReg())
+      Asm->EmitDwarfRegOpPiece(Streamer,
+                               Piece.getLoc(),
+                               PieceSize*SizeOfByte);
+    else {
+      emitDebugLocValue(Streamer, Piece);
+      Asm->EmitDwarfOpPiece(Streamer, PieceSize*SizeOfByte);
+    }
+  }
+}
+
+
 void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
                                    const DebugLocEntry &Entry) {
-  assert(Entry.getValues().size() == 1 &&
-         "multi-value entries are not supported yet.");
   const DebugLocEntry::Value Value = Entry.getValues()[0];
-  DIVariable DV(Value.getVariable());
+  if (Value.isVariablePiece())
+    // Emit all pieces that belong to the same variable and range.
+    return emitLocPieces(Streamer, TypeIdentifierMap, Entry.getValues());
+
+  assert(Entry.getValues().size() == 1 && "only pieces may have >1 value");
+  emitDebugLocValue(Streamer, Value);
+}
+
+void DwarfDebug::emitDebugLocValue(ByteStreamer &Streamer,
+                                   const DebugLocEntry::Value &Value) {
+  DIVariable DV = Value.getVariable();
+  // Regular entry.
   if (Value.isInt()) {
     DIBasicType BTy(resolve(DV.getType()));
     if (BTy.Verify() && (BTy.getEncoding() == dwarf::DW_ATE_signed ||
@@ -1970,24 +1724,25 @@
     }
   } else if (Value.isLocation()) {
     MachineLocation Loc = Value.getLoc();
-    if (!DV.hasComplexAddress())
+    DIExpression Expr = Value.getExpression();
+    if (!Expr)
       // Regular entry.
       Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect());
     else {
       // Complex address entry.
-      unsigned N = DV.getNumAddrElements();
+      unsigned N = Expr.getNumElements();
       unsigned i = 0;
-      if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) {
+      if (N >= 2 && Expr.getElement(0) == dwarf::DW_OP_plus) {
         if (Loc.getOffset()) {
           i = 2;
           Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect());
           Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref");
           Streamer.EmitInt8(dwarf::DW_OP_plus_uconst, "DW_OP_plus_uconst");
-          Streamer.EmitSLEB128(DV.getAddrElement(1));
+          Streamer.EmitSLEB128(Expr.getElement(1));
         } else {
           // If first address element is OpPlus then emit
           // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-          MachineLocation TLoc(Loc.getReg(), DV.getAddrElement(1));
+          MachineLocation TLoc(Loc.getReg(), Expr.getElement(1));
           Asm->EmitDwarfRegOp(Streamer, TLoc, DV.isIndirect());
           i = 2;
         }
@@ -1997,13 +1752,16 @@
 
       // Emit remaining complex address elements.
       for (; i < N; ++i) {
-        uint64_t Element = DV.getAddrElement(i);
-        if (Element == DIBuilder::OpPlus) {
+        uint64_t Element = Expr.getElement(i);
+        if (Element == dwarf::DW_OP_plus) {
           Streamer.EmitInt8(dwarf::DW_OP_plus_uconst, "DW_OP_plus_uconst");
-          Streamer.EmitULEB128(DV.getAddrElement(++i));
-        } else if (Element == DIBuilder::OpDeref) {
+          Streamer.EmitULEB128(Expr.getElement(++i));
+        } else if (Element == dwarf::DW_OP_deref) {
           if (!Loc.isReg())
             Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref");
+        } else if (Element == dwarf::DW_OP_piece) {
+          i += 3;
+          // handled in emitDebugLocEntry.
         } else
           llvm_unreachable("unknown Opcode found in complex address");
       }
@@ -2035,14 +1793,12 @@
   unsigned char Size = Asm->getDataLayout().getPointerSize();
   for (const auto &DebugLoc : DotDebugLocEntries) {
     Asm->OutStreamer.EmitLabel(DebugLoc.Label);
+    const DwarfCompileUnit *CU = DebugLoc.CU;
     for (const auto &Entry : DebugLoc.List) {
       // Set up the range. This range is relative to the entry point of the
       // compile unit. This is a hard coded 0 for low_pc when we're emitting
       // ranges, or the DW_AT_low_pc on the compile unit otherwise.
-      const DwarfCompileUnit *CU = Entry.getCU();
-      if (CU->getRanges().size() == 1) {
-        // Grab the begin symbol from the first range as our base.
-        const MCSymbol *Base = CU->getRanges()[0].getStart();
+      if (auto *Base = CU->getBaseAddress()) {
         Asm->EmitLabelDifference(Entry.getBeginSym(), Base, Size);
         Asm->EmitLabelDifference(Entry.getEndSym(), Base, Size);
       } else {
@@ -2172,6 +1928,10 @@
   for (DwarfCompileUnit *CU : CUs) {
     std::vector<ArangeSpan> &List = Spans[CU];
 
+    // Describe the skeleton CU's offset and length, not the dwo file's.
+    if (auto *Skel = CU->getSkeleton())
+      CU = Skel;
+
     // Emit size of content not including length itself.
     unsigned ContentSize =
         sizeof(int16_t) + // DWARF ARange version number
@@ -2194,7 +1954,7 @@
     Asm->OutStreamer.AddComment("DWARF Arange version number");
     Asm->EmitInt16(dwarf::DW_ARANGES_VERSION);
     Asm->OutStreamer.AddComment("Offset Into Debug Info Section");
-    Asm->EmitSectionOffset(CU->getLocalLabelBegin(), CU->getLocalSectionSym());
+    Asm->EmitSectionOffset(CU->getLabelBegin(), CU->getSectionSym());
     Asm->OutStreamer.AddComment("Address Size (in bytes)");
     Asm->EmitInt8(PtrSize);
     Asm->OutStreamer.AddComment("Segment Size (in bytes)");
@@ -2238,6 +1998,9 @@
   for (const auto &I : CUMap) {
     DwarfCompileUnit *TheCU = I.second;
 
+    if (auto *Skel = TheCU->getSkeleton())
+      TheCU = Skel;
+
     // Iterate over the misc ranges for the compile units in the module.
     for (const RangeSpanList &List : TheCU->getRangeLists()) {
       // Emit our symbol so we can find the beginning of the range.
@@ -2248,9 +2011,7 @@
         const MCSymbol *End = Range.getEnd();
         assert(Begin && "Range without a begin symbol?");
         assert(End && "Range without an end symbol?");
-        if (TheCU->getRanges().size() == 1) {
-          // Grab the begin symbol from the first range as our base.
-          const MCSymbol *Base = TheCU->getRanges()[0].getStart();
+        if (auto *Base = TheCU->getBaseAddress()) {
           Asm->EmitLabelDifference(Begin, Base, Size);
           Asm->EmitLabelDifference(End, Base, Size);
         } else {
@@ -2263,23 +2024,6 @@
       Asm->OutStreamer.EmitIntValue(0, Size);
       Asm->OutStreamer.EmitIntValue(0, Size);
     }
-
-    // Now emit a range for the CU itself.
-    if (TheCU->getRanges().size() > 1) {
-      Asm->OutStreamer.EmitLabel(
-          Asm->GetTempSymbol("cu_ranges", TheCU->getUniqueID()));
-      for (const RangeSpan &Range : TheCU->getRanges()) {
-        const MCSymbol *Begin = Range.getStart();
-        const MCSymbol *End = Range.getEnd();
-        assert(Begin && "Range without a begin symbol?");
-        assert(End && "Range without an end symbol?");
-        Asm->OutStreamer.EmitSymbolValue(Begin, Size);
-        Asm->OutStreamer.EmitSymbolValue(End, Size);
-      }
-      // And terminate the list with two 0 values.
-      Asm->OutStreamer.EmitIntValue(0, Size);
-      Asm->OutStreamer.EmitIntValue(0, Size);
-    }
   }
 }
 
@@ -2287,11 +2031,11 @@
 
 void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
                                   std::unique_ptr<DwarfUnit> NewU) {
-  NewU->addLocalString(Die, dwarf::DW_AT_GNU_dwo_name,
-                       U.getCUNode().getSplitDebugFilename());
+  NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name,
+                  U.getCUNode().getSplitDebugFilename());
 
   if (!CompilationDir.empty())
-    NewU->addLocalString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+    NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
 
   addGnuPubAttributes(*NewU, Die);
 
@@ -2316,31 +2060,13 @@
   return NewCU;
 }
 
-// This DIE has the following attributes: DW_AT_comp_dir, DW_AT_dwo_name,
-// DW_AT_addr_base.
-DwarfTypeUnit &DwarfDebug::constructSkeletonTU(DwarfTypeUnit &TU) {
-  DwarfCompileUnit &CU = static_cast<DwarfCompileUnit &>(
-      *SkeletonHolder.getUnits()[TU.getCU().getUniqueID()]);
-
-  auto OwnedUnit = make_unique<DwarfTypeUnit>(TU.getUniqueID(), CU, Asm, this,
-                                              &SkeletonHolder);
-  DwarfTypeUnit &NewTU = *OwnedUnit;
-  NewTU.setTypeSignature(TU.getTypeSignature());
-  NewTU.setType(nullptr);
-  NewTU.initSection(
-      Asm->getObjFileLowering().getDwarfTypesSection(TU.getTypeSignature()));
-
-  initSkeletonUnit(TU, NewTU.getUnitDie(), std::move(OwnedUnit));
-  return NewTU;
-}
-
 // Emit the .debug_info.dwo section for separated dwarf. This contains the
 // compile units that would normally be in debug_info.
 void DwarfDebug::emitDebugInfoDWO() {
   assert(useSplitDwarf() && "No split dwarf debug info?");
   // Don't pass an abbrev symbol, using a constant zero instead so as not to
   // emit relocations into the dwo file.
-  InfoHolder.emitUnits(this, /* AbbrevSymbol */ nullptr);
+  InfoHolder.emitUnits(/* AbbrevSymbol */ nullptr);
 }
 
 // Emit the .debug_abbrev.dwo section for separated dwarf. This contains the
@@ -2364,9 +2090,8 @@
   assert(useSplitDwarf() && "No split dwarf?");
   const MCSection *OffSec =
       Asm->getObjFileLowering().getDwarfStrOffDWOSection();
-  const MCSymbol *StrSym = DwarfStrSectionSym;
   InfoHolder.emitStrings(Asm->getObjFileLowering().getDwarfStrDWOSection(),
-                         OffSec, StrSym);
+                         OffSec);
 }
 
 MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) {
@@ -2406,9 +2131,9 @@
   bool TopLevelType = TypeUnitsUnderConstruction.empty();
   AddrPool.resetUsedFlag();
 
-  auto OwnedUnit =
-      make_unique<DwarfTypeUnit>(InfoHolder.getUnits().size(), CU, Asm, this,
-                                 &InfoHolder, getDwoLineTable(CU));
+  auto OwnedUnit = make_unique<DwarfTypeUnit>(
+      InfoHolder.getUnits().size() + TypeUnitsUnderConstruction.size(), CU, Asm,
+      this, &InfoHolder, getDwoLineTable(CU));
   DwarfTypeUnit &NewTU = *OwnedUnit;
   DIE &UnitDie = NewTU.getUnitDie();
   TU = &NewTU;
@@ -2421,15 +2146,13 @@
   uint64_t Signature = makeTypeSignature(Identifier);
   NewTU.setTypeSignature(Signature);
 
-  if (!useSplitDwarf())
+  if (useSplitDwarf())
+    NewTU.initSection(Asm->getObjFileLowering().getDwarfTypesDWOSection());
+  else {
     CU.applyStmtList(UnitDie);
-
-  // FIXME: Skip using COMDAT groups for type units in the .dwo file once tools
-  // such as DWP ( http://gcc.gnu.org/wiki/DebugFissionDWP ) can cope with it.
-  NewTU.initSection(
-      useSplitDwarf()
-          ? Asm->getObjFileLowering().getDwarfTypesDWOSection(Signature)
-          : Asm->getObjFileLowering().getDwarfTypesSection(Signature));
+    NewTU.initSection(
+        Asm->getObjFileLowering().getDwarfTypesSection(Signature));
+  }
 
   NewTU.setType(NewTU.createTypeDIE(CTy));
 
@@ -2457,29 +2180,12 @@
 
     // If the type wasn't dependent on fission addresses, finish adding the type
     // and all its dependent types.
-    for (auto &TU : TypeUnitsToAdd) {
-      if (useSplitDwarf())
-        TU.first->setSkeleton(constructSkeletonTU(*TU.first));
+    for (auto &TU : TypeUnitsToAdd)
       InfoHolder.addUnit(std::move(TU.first));
-    }
   }
   CU.addDIETypeSignature(RefDie, NewTU);
 }
 
-void DwarfDebug::attachLowHighPC(DwarfCompileUnit &Unit, DIE &D,
-                                 MCSymbol *Begin, MCSymbol *End) {
-  assert(Begin && "Begin label should not be null!");
-  assert(End && "End label should not be null!");
-  assert(Begin->isDefined() && "Invalid starting label");
-  assert(End->isDefined() && "Invalid end label");
-
-  Unit.addLabelAddress(D, dwarf::DW_AT_low_pc, Begin);
-  if (DwarfVersion < 4)
-    Unit.addLabelAddress(D, dwarf::DW_AT_high_pc, End);
-  else
-    Unit.addLabelDelta(D, dwarf::DW_AT_high_pc, End, Begin);
-}
-
 // Accelerator table mutators - add each name along with its companion
 // DIE to the proper table while ensuring that the name that we're going
 // to reference is in the string table. We do this since the names we

diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index ffe4843..48c2809 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DWARFDEBUG_H__
-#define CODEGEN_ASMPRINTER_DWARFDEBUG_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
 
 #include "DwarfFile.h"
 #include "AsmPrinterHandler.h"
@@ -70,6 +70,7 @@
 /// \brief This class is used to track local variable information.
 class DbgVariable {
   DIVariable Var;             // Variable Descriptor.
+  DIExpression Expr;          // Complex address location expression.
   DIE *TheDIE;                // Variable DIE.
   unsigned DotDebugLocOffset; // Offset in DotDebugLocEntries.
   const MachineInstr *MInsn;  // DBG_VALUE instruction of the variable.
@@ -78,18 +79,22 @@
 
 public:
   /// Construct a DbgVariable from a DIVariable.
-  DbgVariable(DIVariable V, DwarfDebug *DD)
-      : Var(V), TheDIE(nullptr), DotDebugLocOffset(~0U), MInsn(nullptr),
-        FrameIndex(~0), DD(DD) {}
+  DbgVariable(DIVariable V, DIExpression E, DwarfDebug *DD)
+      : Var(V), Expr(E), TheDIE(nullptr), DotDebugLocOffset(~0U),
+        MInsn(nullptr), FrameIndex(~0), DD(DD) {
+    assert(Var.Verify() && Expr.Verify());
+  }
 
   /// Construct a DbgVariable from a DEBUG_VALUE.
   /// AbstractVar may be NULL.
   DbgVariable(const MachineInstr *DbgValue, DwarfDebug *DD)
-      : Var(DbgValue->getDebugVariable()), TheDIE(nullptr),
-        DotDebugLocOffset(~0U), MInsn(DbgValue), FrameIndex(~0), DD(DD) {}
+      : Var(DbgValue->getDebugVariable()), Expr(DbgValue->getDebugExpression()),
+        TheDIE(nullptr), DotDebugLocOffset(~0U), MInsn(DbgValue),
+        FrameIndex(~0), DD(DD) {}
 
   // Accessors.
   DIVariable getVariable() const { return Var; }
+  DIExpression getExpression() const { return Expr; }
   void setDIE(DIE &D) { TheDIE = &D; }
   DIE *getDIE() const { return TheDIE; }
   void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; }
@@ -124,14 +129,14 @@
 
   bool variableHasComplexAddress() const {
     assert(Var.isVariable() && "Invalid complex DbgVariable!");
-    return Var.hasComplexAddress();
+    return Expr.getNumElements() > 0;
   }
   bool isBlockByrefVariable() const;
   unsigned getNumAddrElements() const {
     assert(Var.isVariable() && "Invalid complex DbgVariable!");
-    return Var.getNumAddrElements();
+    return Expr.getNumElements();
   }
-  uint64_t getAddrElement(unsigned i) const { return Var.getAddrElement(i); }
+  uint64_t getAddrElement(unsigned i) const { return Expr.getElement(i); }
   DIType getType() const;
 
 private:
@@ -159,26 +164,15 @@
   // All DIEValues are allocated through this allocator.
   BumpPtrAllocator DIEValueAllocator;
 
-  // Handle to the compile unit used for the inline extension handling,
-  // this is just so that the DIEValue allocator has a place to store
-  // the particular elements.
-  // FIXME: Store these off of DwarfDebug instead?
-  DwarfCompileUnit *FirstCU;
-
   // Maps MDNode with its corresponding DwarfCompileUnit.
   MapVector<const MDNode *, DwarfCompileUnit *> CUMap;
 
   // Maps subprogram MDNode with its corresponding DwarfCompileUnit.
-  DenseMap<const MDNode *, DwarfCompileUnit *> SPMap;
+  MapVector<const MDNode *, DwarfCompileUnit *> SPMap;
 
   // Maps a CU DIE with its corresponding DwarfCompileUnit.
   DenseMap<const DIE *, DwarfCompileUnit *> CUDieMap;
 
-  /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
-  /// be shared across CUs, that is why we keep the map here instead
-  /// of in DwarfCompileUnit.
-  DenseMap<const MDNode *, DIE *> MDTypeNodeToDieMap;
-
   // List of all labels used in aranges generation.
   std::vector<SymbolCU> ArangeLabels;
 
@@ -189,19 +183,8 @@
   typedef DenseMap<const MCSection *, SmallVector<SymbolCU, 8> > SectionMapType;
   SectionMapType SectionMap;
 
-  // List of arguments for current function.
-  SmallVector<DbgVariable *, 8> CurrentFnArguments;
-
   LexicalScopes LScopes;
 
-  // Collection of abstract subprogram DIEs.
-  DenseMap<const MDNode *, DIE *> AbstractSPDies;
-
-  // Collection of dbg variables of a scope.
-  typedef DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> >
-  ScopeVariablesMap;
-  ScopeVariablesMap ScopeVariables;
-
   // Collection of abstract variables.
   DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
   SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
@@ -210,10 +193,6 @@
   // can refer to them in spite of insertions into this list.
   SmallVector<DebugLocList, 4> DotDebugLocEntries;
 
-  // Collection of subprogram DIEs that are marked (at the end of the module)
-  // as DW_AT_inline.
-  SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs;
-
   // This is a collection of subprogram MDNodes that are processed to
   // create DIEs.
   SmallPtrSet<const MDNode *, 16> ProcessedSPNodes;
@@ -243,10 +222,6 @@
   // If nonnull, stores the current machine instruction we're processing.
   const MachineInstr *CurMI;
 
-  // If nonnull, stores the section that the previous function was allocated to
-  // emitting.
-  const MCSection *PrevSection;
-
   // If nonnull, stores the CU in which the previous subprogram was contained.
   const DwarfCompileUnit *PrevCU;
 
@@ -258,6 +233,7 @@
   MCSymbol *DwarfDebugLocSectionSym, *DwarfLineSectionSym, *DwarfAddrSectionSym;
   MCSymbol *FunctionBeginSym, *FunctionEndSym;
   MCSymbol *DwarfInfoDWOSectionSym, *DwarfAbbrevDWOSectionSym;
+  MCSymbol *DwarfTypesDWOSectionSym;
   MCSymbol *DwarfStrDWOSectionSym;
   MCSymbol *DwarfGnuPubNamesSectionSym, *DwarfGnuPubTypesSectionSym;
 
@@ -322,6 +298,7 @@
 
   // True iff there are multiple CUs in this module.
   bool SingleCU;
+  bool IsDarwin;
 
   AddressPool AddrPool;
 
@@ -334,8 +311,6 @@
 
   MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &);
 
-  void addScopeVariable(LexicalScope *LS, DbgVariable *Var);
-
   const SmallVectorImpl<std::unique_ptr<DwarfUnit>> &getUnits() {
     return InfoHolder.getUnits();
   }
@@ -350,45 +325,8 @@
   void ensureAbstractVariableIsCreatedIfScoped(const DIVariable &Var,
                                                const MDNode *Scope);
 
-  /// \brief Find DIE for the given subprogram and attach appropriate
-  /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
-  /// variables in this scope then create and insert DIEs for these
-  /// variables.
-  DIE &updateSubprogramScopeDIE(DwarfCompileUnit &SPCU, DISubprogram SP);
-
-  /// \brief A helper function to check whether the DIE for a given Scope is
-  /// going to be null.
-  bool isLexicalScopeDIENull(LexicalScope *Scope);
-
-  /// \brief A helper function to construct a RangeSpanList for a given
-  /// lexical scope.
-  void addScopeRangeList(DwarfCompileUnit &TheCU, DIE &ScopeDIE,
-                         const SmallVectorImpl<InsnRange> &Range);
-
-  /// \brief Construct new DW_TAG_lexical_block for this scope and
-  /// attach DW_AT_low_pc/DW_AT_high_pc labels.
-  std::unique_ptr<DIE> constructLexicalScopeDIE(DwarfCompileUnit &TheCU,
-                                                LexicalScope *Scope);
-
-  /// \brief This scope represents inlined body of a function. Construct
-  /// DIE to represent this concrete inlined copy of the function.
-  std::unique_ptr<DIE> constructInlinedScopeDIE(DwarfCompileUnit &TheCU,
-                                                LexicalScope *Scope);
-
-  /// \brief Construct a DIE for this scope.
-  std::unique_ptr<DIE> constructScopeDIE(DwarfCompileUnit &TheCU,
-                                         LexicalScope *Scope);
-  void createAndAddScopeChildren(DwarfCompileUnit &TheCU, LexicalScope *Scope,
-                                 DIE &ScopeDIE);
   /// \brief Construct a DIE for this abstract scope.
-  void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &TheCU,
-                                           LexicalScope *Scope);
-  /// \brief Construct a DIE for this subprogram scope.
-  DIE &constructSubprogramScopeDIE(DwarfCompileUnit &TheCU,
-                                   LexicalScope *Scope);
-  /// A helper function to create children of a Scope DIE.
-  DIE *createScopeChildrenDIE(DwarfCompileUnit &TheCU, LexicalScope *Scope,
-                              SmallVectorImpl<std::unique_ptr<DIE>> &Children);
+  void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
 
   /// \brief Emit initial Dwarf sections with a label at the start of each one.
   void emitSectionLabels();
@@ -424,6 +362,10 @@
   /// the line matrix.
   void emitEndOfLineMatrix(unsigned SectionEnd);
 
+  /// \brief Emit a specified accelerator table.
+  void emitAccel(DwarfAccelTable &Accel, const MCSection *Section,
+                 StringRef TableName, StringRef SymName);
+
   /// \brief Emit visible names into a hashed accelerator table section.
   void emitAccelNames();
 
@@ -449,10 +391,9 @@
   /// index.
   void emitDebugPubTypes(bool GnuStyle = false);
 
-  void
-  emitDebugPubSection(bool GnuStyle, const MCSection *PSec, StringRef Name,
-                      const StringMap<const DIE *> &(DwarfUnit::*Accessor)()
-                      const);
+  void emitDebugPubSection(
+      bool GnuStyle, const MCSection *PSec, StringRef Name,
+      const StringMap<const DIE *> &(DwarfCompileUnit::*Accessor)() const);
 
   /// \brief Emit visible names into a debug str section.
   void emitDebugStr();
@@ -507,15 +448,8 @@
   DwarfCompileUnit &constructDwarfCompileUnit(DICompileUnit DIUnit);
 
   /// \brief Construct imported_module or imported_declaration DIE.
-  void constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N);
-
-  /// \brief Construct import_module DIE.
-  void constructImportedEntityDIE(DwarfCompileUnit &TheCU, const MDNode *N,
-                                  DIE &Context);
-
-  /// \brief Construct import_module DIE.
-  void constructImportedEntityDIE(DwarfCompileUnit &TheCU,
-                                  const DIImportedEntity &Module, DIE &Context);
+  void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
+                                        const MDNode *N);
 
   /// \brief Register a source line with debug info. Returns the unique
   /// label that was emitted and which provides correspondence to the
@@ -527,38 +461,29 @@
   /// ending of a scope.
   void identifyScopeMarkers();
 
-  /// \brief If Var is an current function argument that add it in
-  /// CurrentFnArguments list.
-  bool addCurrentFnArgument(DbgVariable *Var, LexicalScope *Scope);
-
   /// \brief Populate LexicalScope entries with variables' info.
-  void collectVariableInfo(SmallPtrSet<const MDNode *, 16> &ProcessedVars);
+  void collectVariableInfo(DwarfCompileUnit &TheCU, DISubprogram SP,
+                           SmallPtrSetImpl<const MDNode *> &ProcessedVars);
+
+  /// \brief Build the location list for all DBG_VALUEs in the
+  /// function that describe the same variable.
+  void buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
+                         const DbgValueHistoryMap::InstrRanges &Ranges);
 
   /// \brief Collect variable information from the side table maintained
   /// by MMI.
-  void collectVariableInfoFromMMITable(SmallPtrSet<const MDNode *, 16> &P);
+  void collectVariableInfoFromMMITable(SmallPtrSetImpl<const MDNode *> &P);
 
   /// \brief Ensure that a label will be emitted before MI.
   void requestLabelBeforeInsn(const MachineInstr *MI) {
     LabelsBeforeInsn.insert(std::make_pair(MI, nullptr));
   }
 
-  /// \brief Return Label preceding the instruction.
-  MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
-
   /// \brief Ensure that a label will be emitted after MI.
   void requestLabelAfterInsn(const MachineInstr *MI) {
     LabelsAfterInsn.insert(std::make_pair(MI, nullptr));
   }
 
-  /// \brief Return Label immediately following the instruction.
-  MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
-
-  void attachRangesOrLowHighPC(DwarfCompileUnit &Unit, DIE &D,
-                               const SmallVectorImpl<InsnRange> &Ranges);
-  void attachLowHighPC(DwarfCompileUnit &Unit, DIE &D, MCSymbol *Begin,
-                       MCSymbol *End);
-
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
@@ -567,13 +492,6 @@
 
   ~DwarfDebug() override;
 
-  void insertDIE(const MDNode *TypeMD, DIE *Die) {
-    MDTypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
-  }
-  DIE *getDIE(const MDNode *TypeMD) {
-    return MDTypeNodeToDieMap.lookup(TypeMD);
-  }
-
   /// \brief Emit all Dwarf sections that should come prior to the
   /// content.
   void beginModule();
@@ -626,11 +544,15 @@
   /// Returns the section symbol for the .debug_loc section.
   MCSymbol *getDebugLocSym() const { return DwarfDebugLocSectionSym; }
 
-  /// Returns the previous section that was emitted into.
-  const MCSection *getPrevSection() const { return PrevSection; }
+  /// Returns the section symbol for the .debug_str section.
+  MCSymbol *getDebugStrSym() const { return DwarfStrSectionSym; }
+
+  /// Returns the section symbol for the .debug_ranges section.
+  MCSymbol *getRangeSectionSym() const { return DwarfDebugRangeSectionSym; }
 
   /// Returns the previous CU that was being updated
   const DwarfCompileUnit *getPrevCU() const { return PrevCU; }
+  void setPrevCU(const DwarfCompileUnit *PrevCU) { this->PrevCU = PrevCU; }
 
   /// Returns the entries for the .debug_loc section.
   const SmallVectorImpl<DebugLocList> &
@@ -641,6 +563,13 @@
   /// \brief Emit an entry for the debug loc section. This can be used to
   /// handle an entry that's going to be emitted into the debug loc section.
   void emitDebugLocEntry(ByteStreamer &Streamer, const DebugLocEntry &Entry);
+  /// \brief Emit a single value for the debug loc section.
+  void emitDebugLocValue(ByteStreamer &Streamer,
+                         const DebugLocEntry::Value &Value);
+  /// Emits an optimal (=sorted) sequence of DW_OP_pieces.
+  void emitLocPieces(ByteStreamer &Streamer,
+                     const DITypeIdentifierMap &Map,
+                     ArrayRef<DebugLocEntry::Value> Values);
 
   /// Emit the location for a debug loc entry, including the size header.
   void emitDebugLocEntryLocation(const DebugLocEntry &Entry);
@@ -674,6 +603,40 @@
   void addAccelNamespace(StringRef Name, const DIE &Die);
 
   void addAccelType(StringRef Name, const DIE &Die, char Flags);
+
+  const MachineFunction *getCurrentFunction() const { return CurFn; }
+  const MCSymbol *getFunctionBeginSym() const { return FunctionBeginSym; }
+  const MCSymbol *getFunctionEndSym() const { return FunctionEndSym; }
+
+  iterator_range<ImportedEntityMap::const_iterator>
+  findImportedEntitiesForScope(const MDNode *Scope) const {
+    return make_range(std::equal_range(
+        ScopesWithImportedEntities.begin(), ScopesWithImportedEntities.end(),
+        std::pair<const MDNode *, const MDNode *>(Scope, nullptr),
+        less_first()));
+  }
+
+  /// \brief A helper function to check whether the DIE for a given Scope is
+  /// going to be null.
+  bool isLexicalScopeDIENull(LexicalScope *Scope);
+
+  /// \brief Return Label preceding the instruction.
+  MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
+
+  /// \brief Return Label immediately following the instruction.
+  MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
+
+  // FIXME: Consider rolling ranges up into DwarfDebug since we use a single
+  // range_base anyway, so there's no need to keep them as separate per-CU range
+  // lists. (though one day we might end up with a range.dwo section, in which
+  // case it'd go to DwarfFile)
+  unsigned getNextRangeNumber() { return GlobalRangeCount++; }
+
+  // FIXME: Sink these functions down into DwarfFile/Dwarf*Unit.
+
+  SmallPtrSet<const MDNode *, 16> &getProcessedSPNodes() {
+    return ProcessedSPNodes;
+  }
 };
 } // End of namespace llvm
 

diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index 0440fce..e8867c0 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
-#define LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
 
 #include "EHStreamer.h"
 #include "llvm/CodeGen/AsmPrinter.h"
@@ -81,39 +81,6 @@
   /// endFunction - Gather and emit post-function exception information.
   void endFunction(const MachineFunction *) override;
 };
-
-class Win64Exception : public EHStreamer {
-  /// shouldEmitPersonality - Per-function flag to indicate if personality
-  /// info should be emitted.
-  bool shouldEmitPersonality;
-
-  /// shouldEmitLSDA - Per-function flag to indicate if the LSDA
-  /// should be emitted.
-  bool shouldEmitLSDA;
-
-  /// shouldEmitMoves - Per-function flag to indicate if frame moves info
-  /// should be emitted.
-  bool shouldEmitMoves;
-
-public:
-  //===--------------------------------------------------------------------===//
-  // Main entry points.
-  //
-  Win64Exception(AsmPrinter *A);
-  virtual ~Win64Exception();
-
-  /// endModule - Emit all exception information that should come after the
-  /// content.
-  void endModule() override;
-
-  /// beginFunction - Gather pre-function exception information.  Assumes being
-  /// emitted immediately after the function entry point.
-  void beginFunction(const MachineFunction *MF) override;
-
-  /// endFunction - Gather and emit post-function exception information.
-  void endFunction(const MachineFunction *) override;
-};
-
 } // End of namespace llvm
 
 #endif

diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 737ee54..50180ea 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp

@@ -18,8 +18,9 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 
 namespace llvm {
-DwarfFile::DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA)
-    : Asm(AP), StrPool(DA, *Asm, Pref) {}
+DwarfFile::DwarfFile(AsmPrinter *AP, DwarfDebug &DD, StringRef Pref,
+                     BumpPtrAllocator &DA)
+    : Asm(AP), DD(DD), StrPool(DA, *Asm, Pref) {}
 
 DwarfFile::~DwarfFile() {}
 
@@ -48,25 +49,18 @@
 
 // Emit the various dwarf units to the unit section USection with
 // the abbreviations going into ASection.
-void DwarfFile::emitUnits(DwarfDebug *DD, const MCSymbol *ASectionSym) {
+void DwarfFile::emitUnits(const MCSymbol *ASectionSym) {
   for (const auto &TheU : CUs) {
     DIE &Die = TheU->getUnitDie();
     const MCSection *USection = TheU->getSection();
     Asm->OutStreamer.SwitchSection(USection);
 
-    // Emit the compile units header.
-    Asm->OutStreamer.EmitLabel(TheU->getLabelBegin());
-
-    // Emit size of content not including length itself
-    Asm->OutStreamer.AddComment("Length of Unit");
-    Asm->EmitInt32(TheU->getHeaderSize() + Die.getSize());
-
     TheU->emitHeader(ASectionSym);
 
-    DD->emitDIE(Die);
-    Asm->OutStreamer.EmitLabel(TheU->getLabelEnd());
+    DD.emitDIE(Die);
   }
 }
+
 // Compute the size and offset for each DIE.
 void DwarfFile::computeSizeAndOffsets() {
   // Offset from the first CU in the debug info section is 0 initially.
@@ -149,8 +143,44 @@
 
 // Emit strings into a string section.
 void DwarfFile::emitStrings(const MCSection *StrSection,
-                            const MCSection *OffsetSection,
-                            const MCSymbol *StrSecSym) {
-  StrPool.emit(*Asm, StrSection, OffsetSection, StrSecSym);
+                            const MCSection *OffsetSection) {
+  StrPool.emit(*Asm, StrSection, OffsetSection);
+}
+
+void DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
+  SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS];
+  DIVariable DV = Var->getVariable();
+  // Variables with positive arg numbers are parameters.
+  if (unsigned ArgNum = DV.getArgNumber()) {
+    // Keep all parameters in order at the start of the variable list to ensure
+    // function types are correct (no out-of-order parameters).
+    //
+    // This could be improved by only doing it for optimized builds (unoptimized
+    // builds have the right order to begin with), searching from the back (this
+    // would catch the unoptimized case quickly), or doing a binary search
+    // rather than linear search.
+    auto I = Vars.begin();
+    while (I != Vars.end()) {
+      unsigned CurNum = (*I)->getVariable().getArgNumber();
+      // A local (non-parameter) variable has been found, insert immediately
+      // before it.
+      if (CurNum == 0)
+        break;
+      // A later indexed parameter has been found, insert immediately before it.
+      if (CurNum > ArgNum)
+        break;
+      // FIXME: There are still some cases where two inlined functions are
+      // conflated together (two calls to the same function at the same
+      // location (e.g. via a macro, or without column info, etc.)) and then
+      // their arguments are conflated as well.
+      assert((LS->getParent() || CurNum != ArgNum) &&
+             "Duplicate argument for top level (non-inlined) function");
+      ++I;
+    }
+    Vars.insert(I, Var);
+    return;
+  }
+
+  Vars.push_back(Var);
 }
 }

diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 3985eb2..9d64bfc 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DWARFFILE_H__
-#define CODEGEN_ASMPRINTER_DWARFFILE_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
@@ -24,10 +24,13 @@
 
 namespace llvm {
 class AsmPrinter;
+class DbgVariable;
 class DwarfUnit;
 class DIEAbbrev;
 class MCSymbol;
 class DIE;
+class DISubprogram;
+class LexicalScope;
 class StringRef;
 class DwarfDebug;
 class MCSection;
@@ -35,6 +38,8 @@
   // Target of Dwarf emission, used for sizing of abbreviations.
   AsmPrinter *Asm;
 
+  DwarfDebug &DD;
+
   // Used to uniquely define abbreviations.
   FoldingSet<DIEAbbrev> AbbreviationsSet;
 
@@ -46,8 +51,20 @@
 
   DwarfStringPool StrPool;
 
+  // Collection of dbg variables of a scope.
+  DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> ScopeVariables;
+
+  // Collection of abstract subprogram DIEs.
+  DenseMap<const MDNode *, DIE *> AbstractSPDies;
+
+  /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
+  /// be shared across CUs, that is why we keep the map here instead
+  /// of in DwarfCompileUnit.
+  DenseMap<const MDNode *, DIE *> MDTypeNodeToDieMap;
+
 public:
-  DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA);
+  DwarfFile(AsmPrinter *AP, DwarfDebug &DD, StringRef Pref,
+            BumpPtrAllocator &DA);
 
   ~DwarfFile();
 
@@ -67,18 +84,34 @@
 
   /// \brief Emit all of the units to the section listed with the given
   /// abbreviation section.
-  void emitUnits(DwarfDebug *DD, const MCSymbol *ASectionSym);
+  void emitUnits(const MCSymbol *ASectionSym);
 
   /// \brief Emit a set of abbreviations to the specific section.
   void emitAbbrevs(const MCSection *);
 
   /// \brief Emit all of the strings to the section given.
   void emitStrings(const MCSection *StrSection,
-                   const MCSection *OffsetSection = nullptr,
-                   const MCSymbol *StrSecSym = nullptr);
+                   const MCSection *OffsetSection = nullptr);
 
   /// \brief Returns the string pool.
   DwarfStringPool &getStringPool() { return StrPool; }
+
+  void addScopeVariable(LexicalScope *LS, DbgVariable *Var);
+
+  DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> &getScopeVariables() {
+    return ScopeVariables;
+  }
+
+  DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
+    return AbstractSPDies;
+  }
+
+  void insertDIE(const MDNode *TypeMD, DIE *Die) {
+    MDTypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
+  }
+  DIE *getDIE(const MDNode *TypeMD) {
+    return MDTypeNodeToDieMap.lookup(TypeMD);
+  }
 };
 }
 #endif

diff --git a/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
index 72cab60..d76b66c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp

@@ -12,14 +12,11 @@
 
 using namespace llvm;
 
-MCSymbol *DwarfStringPool::getSectionSymbol() { return SectionSymbol; }
-
 static std::pair<MCSymbol *, unsigned> &
 getEntry(AsmPrinter &Asm,
          StringMap<std::pair<MCSymbol *, unsigned>, BumpPtrAllocator &> &Pool,
          StringRef Prefix, StringRef Str) {
-  std::pair<MCSymbol *, unsigned> &Entry =
-      Pool.GetOrCreateValue(Str).getValue();
+  std::pair<MCSymbol *, unsigned> &Entry = Pool[Str];
   if (!Entry.first) {
     Entry.second = Pool.size() - 1;
     Entry.first = Asm.GetTempSymbol(Prefix, Entry.second);
@@ -36,8 +33,7 @@
 }
 
 void DwarfStringPool::emit(AsmPrinter &Asm, const MCSection *StrSection,
-                           const MCSection *OffsetSection,
-                           const MCSymbol *StrSecSym) {
+                           const MCSection *OffsetSection) {
   if (Pool.empty())
     return;
 

diff --git a/lib/CodeGen/AsmPrinter/DwarfStringPool.h b/lib/CodeGen/AsmPrinter/DwarfStringPool.h
index c1615fb..ab32c1b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfStringPool.h
+++ b/lib/CodeGen/AsmPrinter/DwarfStringPool.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_STRINGPOOL_H__
-#define CODEGEN_ASMPRINTER_STRINGPOOL_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFSTRINGPOOL_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFSTRINGPOOL_H
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/CodeGen/AsmPrinter.h"
@@ -28,18 +28,13 @@
 class DwarfStringPool {
   StringMap<std::pair<MCSymbol *, unsigned>, BumpPtrAllocator &> Pool;
   StringRef Prefix;
-  MCSymbol *SectionSymbol;
 
 public:
   DwarfStringPool(BumpPtrAllocator &A, AsmPrinter &Asm, StringRef Prefix)
-      : Pool(A), Prefix(Prefix), SectionSymbol(Asm.GetTempSymbol(Prefix)) {}
+      : Pool(A), Prefix(Prefix) {}
 
   void emit(AsmPrinter &Asm, const MCSection *StrSection,
-            const MCSection *OffsetSection = nullptr,
-            const MCSymbol *StrSecSym = nullptr);
-
-  /// \brief Returns the entry into the start of the pool.
-  MCSymbol *getSectionSymbol();
+            const MCSection *OffsetSection = nullptr);
 
   /// \brief Returns an entry into the string pool with the given
   /// string text.

diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 9538bee..919d9d2 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp

@@ -12,7 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfUnit.h"
+
 #include "DwarfAccelTable.h"
+#include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/IR/Constants.h"
@@ -30,6 +32,7 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -44,20 +47,12 @@
 DwarfUnit::DwarfUnit(unsigned UID, dwarf::Tag UnitTag, DICompileUnit Node,
                      AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU)
     : UniqueID(UID), CUNode(Node), UnitDie(UnitTag), DebugInfoOffset(0), Asm(A),
-      DD(DW), DU(DWU), IndexTyDie(nullptr), Section(nullptr),
-      Skeleton(nullptr) {
+      DD(DW), DU(DWU), IndexTyDie(nullptr), Section(nullptr) {
   assert(UnitTag == dwarf::DW_TAG_compile_unit ||
          UnitTag == dwarf::DW_TAG_type_unit);
   DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
 }
 
-DwarfCompileUnit::DwarfCompileUnit(unsigned UID, DICompileUnit Node,
-                                   AsmPrinter *A, DwarfDebug *DW,
-                                   DwarfFile *DWU)
-    : DwarfUnit(UID, dwarf::DW_TAG_compile_unit, Node, A, DW, DWU) {
-  insertDIE(Node, &getUnitDie());
-}
-
 DwarfTypeUnit::DwarfTypeUnit(unsigned UID, DwarfCompileUnit &CU, AsmPrinter *A,
                              DwarfDebug *DW, DwarfFile *DWU,
                              MCDwarfDwoLineTable *SplitLineTable)
@@ -146,7 +141,7 @@
 /// will be kept in DwarfDebug for shareable DIEs.
 DIE *DwarfUnit::getDIE(DIDescriptor D) const {
   if (isShareableAcrossCUs(D))
-    return DD->getDIE(D);
+    return DU->getDIE(D);
   return MDNodeToDieMap.lookup(D);
 }
 
@@ -155,7 +150,7 @@
 /// will be kept in DwarfDebug for shareable DIEs.
 void DwarfUnit::insertDIE(DIDescriptor Desc, DIE *D) {
   if (isShareableAcrossCUs(Desc)) {
-    DD->insertDIE(Desc, D);
+    DU->insertDIE(Desc, D);
     return;
   }
   MDNodeToDieMap.insert(std::make_pair(Desc, D));
@@ -206,10 +201,14 @@
 /// table.
 void DwarfUnit::addString(DIE &Die, dwarf::Attribute Attribute,
                           StringRef String) {
-
-  if (!DD->useSplitDwarf())
+  if (!isDwoUnit())
     return addLocalString(Die, Attribute, String);
 
+  addIndexedString(Die, Attribute, String);
+}
+
+void DwarfUnit::addIndexedString(DIE &Die, dwarf::Attribute Attribute,
+                                 StringRef String) {
   unsigned idx = DU->getStringPool().getIndex(*Asm, String);
   DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
   DIEValue *Str = new (DIEValueAllocator) DIEString(Value, String);
@@ -224,31 +223,12 @@
   DIEValue *Value;
   if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
     Value = new (DIEValueAllocator) DIELabel(Symb);
-  else {
-    MCSymbol *StringPool = DU->getStringPool().getSectionSymbol();
-    Value = new (DIEValueAllocator) DIEDelta(Symb, StringPool);
-  }
+  else
+    Value = new (DIEValueAllocator) DIEDelta(Symb, DD->getDebugStrSym());
   DIEValue *Str = new (DIEValueAllocator) DIEString(Value, String);
   Die.addValue(Attribute, dwarf::DW_FORM_strp, Str);
 }
 
-/// addExpr - Add a Dwarf expression attribute data and value.
-///
-void DwarfUnit::addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr) {
-  DIEValue *Value = new (DIEValueAllocator) DIEExpr(Expr);
-  Die.addValue((dwarf::Attribute)0, Form, Value);
-}
-
-/// addLocationList - Add a Dwarf loclistptr attribute data and value.
-///
-void DwarfUnit::addLocationList(DIE &Die, dwarf::Attribute Attribute,
-                                unsigned Index) {
-  DIEValue *Value = new (DIEValueAllocator) DIELocList(Index);
-  dwarf::Form Form = DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
-                                                : dwarf::DW_FORM_data4;
-  Die.addValue(Attribute, Form, Value);
-}
-
 /// addLabel - Add a Dwarf label attribute data and value.
 ///
 void DwarfUnit::addLabel(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form,
@@ -261,16 +241,6 @@
   addLabel(Die, (dwarf::Attribute)0, Form, Label);
 }
 
-/// addSectionLabel - Add a Dwarf section label attribute data and value.
-///
-void DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
-                                const MCSymbol *Label) {
-  if (DD->getDwarfVersion() >= 4)
-    addLabel(Die, Attribute, dwarf::DW_FORM_sec_offset, Label);
-  else
-    addLabel(Die, Attribute, dwarf::DW_FORM_data4, Label);
-}
-
 /// addSectionOffset - Add an offset into a section attribute data and value.
 ///
 void DwarfUnit::addSectionOffset(DIE &Die, dwarf::Attribute Attribute,
@@ -281,45 +251,6 @@
     addUInt(Die, Attribute, dwarf::DW_FORM_data4, Integer);
 }
 
-/// addLabelAddress - Add a dwarf label attribute data and value using
-/// DW_FORM_addr or DW_FORM_GNU_addr_index.
-///
-void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
-                                       const MCSymbol *Label) {
-
-  if (!DD->useSplitDwarf())
-    return addLocalLabelAddress(Die, Attribute, Label);
-
-  if (Label)
-    DD->addArangeLabel(SymbolCU(this, Label));
-
-  unsigned idx = DD->getAddressPool().getIndex(Label);
-  DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
-  Die.addValue(Attribute, dwarf::DW_FORM_GNU_addr_index, Value);
-}
-
-void DwarfCompileUnit::addLocalLabelAddress(DIE &Die,
-                                            dwarf::Attribute Attribute,
-                                            const MCSymbol *Label) {
-  if (Label)
-    DD->addArangeLabel(SymbolCU(this, Label));
-
-  Die.addValue(Attribute, dwarf::DW_FORM_addr,
-               Label ? (DIEValue *)new (DIEValueAllocator) DIELabel(Label)
-                     : new (DIEValueAllocator) DIEInteger(0));
-}
-
-unsigned DwarfCompileUnit::getOrCreateSourceID(StringRef FileName, StringRef DirName) {
-  // If we print assembly, we can't separate .file entries according to
-  // compile units. Thus all files will belong to the default compile unit.
-
-  // FIXME: add a better feature test than hasRawTextSupport. Even better,
-  // extend .file to support this.
-  return Asm->OutStreamer.EmitDwarfFileDirective(
-      0, DirName, FileName,
-      Asm->OutStreamer.hasRawTextSupport() ? 0 : getUniqueID());
-}
-
 unsigned DwarfTypeUnit::getOrCreateSourceID(StringRef FileName, StringRef DirName) {
   return SplitLineTable ? SplitLineTable->getFile(DirName, FileName)
                         : getCU().getOrCreateSourceID(FileName, DirName);
@@ -339,16 +270,6 @@
   }
 }
 
-/// addSectionDelta - Add a section label delta attribute data and value.
-///
-void DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
-                                const MCSymbol *Hi, const MCSymbol *Lo) {
-  DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
-  Die.addValue(Attribute, DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
-                                                     : dwarf::DW_FORM_data4,
-               Value);
-}
-
 void DwarfUnit::addLabelDelta(DIE &Die, dwarf::Attribute Attribute,
                               const MCSymbol *Hi, const MCSymbol *Lo) {
   DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
@@ -477,22 +398,12 @@
   addSourceLine(Die, NS.getLineNumber(), NS.getFilename(), NS.getDirectory());
 }
 
-/// addVariableAddress - Add DW_AT_location attribute for a
-/// DbgVariable based on provided MachineLocation.
-void DwarfUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
-                                   MachineLocation Location) {
-  if (DV.variableHasComplexAddress())
-    addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
-  else if (DV.isBlockByrefVariable())
-    addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
-  else
-    addAddress(Die, dwarf::DW_AT_location, Location,
-               DV.getVariable().isIndirect());
-}
-
 /// addRegisterOp - Add register operand.
-void DwarfUnit::addRegisterOp(DIELoc &TheDie, unsigned Reg) {
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
+// FIXME: Ideally, this would share the implementation with
+// AsmPrinter::EmitDwarfRegOpPiece.
+void DwarfUnit::addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
+                                   unsigned SizeInBits, unsigned OffsetInBits) {
+  const TargetRegisterInfo *RI = Asm->TM.getSubtargetImpl()->getRegisterInfo();
   int DWReg = RI->getDwarfRegNum(Reg, false);
   bool isSubRegister = DWReg < 0;
 
@@ -511,7 +422,7 @@
     return;
   }
 
-  // Emit register
+  // Emit register.
   if (DWReg < 32)
     addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
   else {
@@ -519,18 +430,34 @@
     addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
   }
 
-  // Emit Mask
-  if (isSubRegister) {
-    unsigned Size = RI->getSubRegIdxSize(Idx);
-    unsigned Offset = RI->getSubRegIdxOffset(Idx);
-    if (Offset > 0) {
+  // Emit mask.
+  bool isPiece = SizeInBits > 0;
+  if (isSubRegister || isPiece) {
+    const unsigned SizeOfByte = 8;
+    unsigned RegSizeInBits = RI->getSubRegIdxSize(Idx);
+    unsigned RegOffsetInBits = RI->getSubRegIdxOffset(Idx);
+    unsigned PieceSizeInBits = std::max(SizeInBits, RegSizeInBits);
+    unsigned PieceOffsetInBits = OffsetInBits ? OffsetInBits : RegOffsetInBits;
+    assert(RegSizeInBits >= SizeInBits && "register smaller than value");
+
+    if (RegOffsetInBits != PieceOffsetInBits) {
+      // Manually shift the value into place, since the DW_OP_piece
+      // describes the part of the variable, not the position of the
+      // subregister.
+      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+      addUInt(TheDie, dwarf::DW_FORM_data1, RegOffsetInBits);
+      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_shr);
+    }
+
+    if (PieceOffsetInBits > 0 || PieceSizeInBits % SizeOfByte) {
+      assert(PieceSizeInBits > 0 && "piece has zero size");
       addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bit_piece);
-      addUInt(TheDie, dwarf::DW_FORM_data1, Size);
-      addUInt(TheDie, dwarf::DW_FORM_data1, Offset);
-    } else {
-      unsigned ByteSize = Size / 8; // Assuming 8 bits per byte.
+      addUInt(TheDie, dwarf::DW_FORM_data1, PieceSizeInBits);
+      addUInt(TheDie, dwarf::DW_FORM_data1, PieceOffsetInBits);
+     } else {
+      assert(PieceSizeInBits > 0 && "piece has zero size");
       addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_piece);
-      addUInt(TheDie, dwarf::DW_FORM_data1, ByteSize);
+      addUInt(TheDie, dwarf::DW_FORM_data1, PieceSizeInBits/SizeOfByte);
     }
   }
 }
@@ -538,9 +465,9 @@
 /// addRegisterOffset - Add register offset.
 void DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg,
                                   int64_t Offset) {
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
+  const TargetRegisterInfo *RI = Asm->TM.getSubtargetImpl()->getRegisterInfo();
   unsigned DWReg = RI->getDwarfRegNum(Reg, false);
-  const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = Asm->TM.getSubtargetImpl()->getRegisterInfo();
   if (Reg == TRI->getFrameRegister(*Asm->MF))
     // If variable offset is based in frame register then use fbreg.
     addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
@@ -553,63 +480,6 @@
   addSInt(TheDie, dwarf::DW_FORM_sdata, Offset);
 }
 
-/// addAddress - Add an address attribute to a die based on the location
-/// provided.
-void DwarfUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
-                           const MachineLocation &Location, bool Indirect) {
-  DIELoc *Loc = new (DIEValueAllocator) DIELoc();
-
-  if (Location.isReg() && !Indirect)
-    addRegisterOp(*Loc, Location.getReg());
-  else {
-    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
-    if (Indirect && !Location.isReg()) {
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    }
-  }
-
-  // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, Loc);
-}
-
-/// addComplexAddress - Start with the address based on the location provided,
-/// and generate the DWARF information necessary to find the actual variable
-/// given the extra address information encoded in the DbgVariable, starting
-/// from the starting location.  Add the DWARF information to the die.
-///
-void DwarfUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
-                                  dwarf::Attribute Attribute,
-                                  const MachineLocation &Location) {
-  DIELoc *Loc = new (DIEValueAllocator) DIELoc();
-  unsigned N = DV.getNumAddrElements();
-  unsigned i = 0;
-  if (Location.isReg()) {
-    if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) {
-      // If first address element is OpPlus then emit
-      // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-      addRegisterOffset(*Loc, Location.getReg(), DV.getAddrElement(1));
-      i = 2;
-    } else
-      addRegisterOp(*Loc, Location.getReg());
-  } else
-    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
-
-  for (; i < N; ++i) {
-    uint64_t Element = DV.getAddrElement(i);
-    if (Element == DIBuilder::OpPlus) {
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-      addUInt(*Loc, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
-    } else if (Element == DIBuilder::OpDeref) {
-      if (!Location.isReg())
-        addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    } else
-      llvm_unreachable("unknown DIBuilder Opcode");
-  }
-
-  // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, Loc);
-}
-
 /* Byref variables, in Blocks, are declared by the programmer as "SomeType
    VarName;", but the compiler creates a __Block_byref_x_VarName struct, and
    gives the variable VarName either the struct, or a pointer to the struct, as
@@ -690,7 +560,7 @@
 
   // Find the __forwarding field and the variable field in the __Block_byref
   // struct.
-  DIArray Fields = blockStruct.getTypeArray();
+  DIArray Fields = blockStruct.getElements();
   DIDerivedType varField;
   DIDerivedType forwardingField;
 
@@ -712,7 +582,7 @@
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
 
   if (Location.isReg())
-    addRegisterOp(*Loc, Location.getReg());
+    addRegisterOpPiece(*Loc, Location.getReg());
   else
     addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
 
@@ -1002,11 +872,9 @@
     unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
     DD->addAccelType(Ty.getName(), TyDIE, Flags);
 
-    if ((!Context || Context.isCompileUnit() || Context.isFile() ||
-         Context.isNameSpace()) &&
-        getCUNode().getEmissionKind() != DIBuilder::LineTablesOnly)
-      GlobalTypes[getParentContextString(Context) + Ty.getName().str()] =
-          &TyDIE;
+    if (!Context || Context.isCompileUnit() || Context.isFile() ||
+        Context.isNameSpace())
+      addGlobalType(Ty, TyDIE, Context);
   }
 }
 
@@ -1031,14 +899,6 @@
   addDIEEntry(Entity, Attribute, Entry);
 }
 
-/// addGlobalName - Add a new global name to the compile unit.
-void DwarfUnit::addGlobalName(StringRef Name, DIE &Die, DIScope Context) {
-  if (getCUNode().getEmissionKind() == DIBuilder::LineTablesOnly)
-    return;
-  std::string FullName = getParentContextString(Context) + Name.str();
-  GlobalNames[FullName] = &Die;
-}
-
 /// getParentContextString - Walks the metadata parent chain in a language
 /// specific manner (using the compile unit language) and returns
 /// it as a string. This is done at the metadata level because DIEs may
@@ -1129,16 +989,16 @@
 }
 
 /// constructSubprogramArguments - Construct function argument DIEs.
-void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DIArray Args) {
+void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeArray Args) {
   for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
-    DIDescriptor Ty = Args.getElement(i);
-    if (Ty.isUnspecifiedParameter()) {
+    DIType Ty = resolve(Args.getElement(i));
+    if (!Ty) {
       assert(i == N-1 && "Unspecified parameter must be the last argument");
       createAndAddDIE(dwarf::DW_TAG_unspecified_parameters, Buffer);
     } else {
       DIE &Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer);
-      addType(Arg, DIType(Ty));
-      if (DIType(Ty).isArtificial())
+      addType(Arg, Ty);
+      if (Ty.isArtificial())
         addFlag(Arg, dwarf::DW_AT_artificial);
     }
   }
@@ -1161,14 +1021,14 @@
     break;
   case dwarf::DW_TAG_subroutine_type: {
     // Add return type. A void return won't have a type.
-    DIArray Elements = CTy.getTypeArray();
-    DIType RTy(Elements.getElement(0));
+    DITypeArray Elements = DISubroutineType(CTy).getTypeArray();
+    DIType RTy(resolve(Elements.getElement(0)));
     if (RTy)
       addType(Buffer, RTy);
 
     bool isPrototyped = true;
     if (Elements.getNumElements() == 2 &&
-        Elements.getElement(1).isUnspecifiedParameter())
+        !Elements.getElement(1))
       isPrototyped = false;
 
     constructSubprogramArguments(Buffer, Elements);
@@ -1191,7 +1051,7 @@
   case dwarf::DW_TAG_union_type:
   case dwarf::DW_TAG_class_type: {
     // Add elements to structure type.
-    DIArray Elements = CTy.getTypeArray();
+    DIArray Elements = CTy.getElements();
     for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
       DIDescriptor Element = Elements.getElement(i);
       if (Element.isSubprogram())
@@ -1373,20 +1233,23 @@
 }
 
 /// getOrCreateSubprogramDIE - Create new DIE using SP.
-DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
+DIE *DwarfUnit::getOrCreateSubprogramDIE(DISubprogram SP, bool Minimal) {
   // Construct the context before querying for the existence of the DIE in case
   // such construction creates the DIE (as is the case for member function
   // declarations).
-  DIE *ContextDIE = getOrCreateContextDIE(resolve(SP.getContext()));
+  DIE *ContextDIE =
+      Minimal ? &getUnitDie() : getOrCreateContextDIE(resolve(SP.getContext()));
 
   if (DIE *SPDie = getDIE(SP))
     return SPDie;
 
   if (DISubprogram SPDecl = SP.getFunctionDeclaration()) {
-    // Add subprogram definitions to the CU die directly.
-    ContextDIE = &getUnitDie();
-    // Build the decl now to ensure it precedes the definition.
-    getOrCreateSubprogramDIE(SPDecl);
+    if (!Minimal) {
+      // Add subprogram definitions to the CU die directly.
+      ContextDIE = &getUnitDie();
+      // Build the decl now to ensure it precedes the definition.
+      getOrCreateSubprogramDIE(SPDecl);
+    }
   }
 
   // DW_TAG_inlined_subroutine may refer to this DIE.
@@ -1401,14 +1264,8 @@
   return &SPDie;
 }
 
-void DwarfUnit::applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie) {
-  DISubprogram SPDecl = SP.getFunctionDeclaration();
-  DIScope Context = resolve(SPDecl ? SPDecl.getContext() : SP.getContext());
-  applySubprogramAttributes(SP, SPDie);
-  addGlobalName(SP.getName(), SPDie, Context);
-}
-
-void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie) {
+bool DwarfUnit::applySubprogramDefinitionAttributes(DISubprogram SP,
+                                                    DIE &SPDie) {
   DIE *DeclDie = nullptr;
   StringRef DeclLinkageName;
   if (DISubprogram SPDecl = SP.getFunctionDeclaration()) {
@@ -1431,17 +1288,29 @@
     addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
               GlobalValue::getRealLinkageName(LinkageName));
 
-  if (DeclDie) {
-    // Refer to the function declaration where all the other attributes will be
-    // found.
-    addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie);
-    return;
-  }
+  if (!DeclDie)
+    return false;
+
+  // Refer to the function declaration where all the other attributes will be
+  // found.
+  addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie);
+  return true;
+}
+
+void DwarfUnit::applySubprogramAttributes(DISubprogram SP, DIE &SPDie,
+                                          bool Minimal) {
+  if (!Minimal)
+    if (applySubprogramDefinitionAttributes(SP, SPDie))
+      return;
 
   // Constructors and operators for anonymous aggregates do not have names.
   if (!SP.getName().empty())
     addString(SPDie, dwarf::DW_AT_name, SP.getName());
 
+  // Skip the rest of the attributes under -gmlt to save space.
+  if (Minimal)
+    return;
+
   addSourceLine(SPDie, SP);
 
   // Add the prototype if we have a prototype and we have a C like
@@ -1452,15 +1321,15 @@
        Language == dwarf::DW_LANG_ObjC))
     addFlag(SPDie, dwarf::DW_AT_prototyped);
 
-  DICompositeType SPTy = SP.getType();
+  DISubroutineType SPTy = SP.getType();
   assert(SPTy.getTag() == dwarf::DW_TAG_subroutine_type &&
          "the type of a subprogram should be a subroutine");
 
-  DIArray Args = SPTy.getTypeArray();
+  DITypeArray Args = SPTy.getTypeArray();
   // Add a return type. If this is a type like a C/C++ void type we don't add a
   // return type.
-  if (Args.getElement(0))
-    addType(SPDie, DIType(Args.getElement(0)));
+  if (resolve(Args.getElement(0)))
+    addType(SPDie, DIType(resolve(Args.getElement(0))));
 
   unsigned VK = SP.getVirtuality();
   if (VK) {
@@ -1506,7 +1375,7 @@
   else if (SP.isPrivate())
     addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
             dwarf::DW_ACCESS_private);
-  else
+  else if (SP.isPublic())
     addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
             dwarf::DW_ACCESS_public);
 
@@ -1514,184 +1383,6 @@
     addFlag(SPDie, dwarf::DW_AT_explicit);
 }
 
-void DwarfUnit::applyVariableAttributes(const DbgVariable &Var,
-                                        DIE &VariableDie) {
-  StringRef Name = Var.getName();
-  if (!Name.empty())
-    addString(VariableDie, dwarf::DW_AT_name, Name);
-  addSourceLine(VariableDie, Var.getVariable());
-  addType(VariableDie, Var.getType());
-  if (Var.isArtificial())
-    addFlag(VariableDie, dwarf::DW_AT_artificial);
-}
-
-// Return const expression if value is a GEP to access merged global
-// constant. e.g.
-// i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 0)
-static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
-  const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(V);
-  if (!CE || CE->getNumOperands() != 3 ||
-      CE->getOpcode() != Instruction::GetElementPtr)
-    return nullptr;
-
-  // First operand points to a global struct.
-  Value *Ptr = CE->getOperand(0);
-  if (!isa<GlobalValue>(Ptr) ||
-      !isa<StructType>(cast<PointerType>(Ptr->getType())->getElementType()))
-    return nullptr;
-
-  // Second operand is zero.
-  const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CE->getOperand(1));
-  if (!CI || !CI->isZero())
-    return nullptr;
-
-  // Third operand is offset.
-  if (!isa<ConstantInt>(CE->getOperand(2)))
-    return nullptr;
-
-  return CE;
-}
-
-/// createGlobalVariableDIE - create global variable DIE.
-void DwarfCompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
-  // Check for pre-existence.
-  if (getDIE(GV))
-    return;
-
-  assert(GV.isGlobalVariable());
-
-  DIScope GVContext = GV.getContext();
-  DIType GTy = DD->resolve(GV.getType());
-
-  // If this is a static data member definition, some attributes belong
-  // to the declaration DIE.
-  DIE *VariableDIE = nullptr;
-  bool IsStaticMember = false;
-  DIDerivedType SDMDecl = GV.getStaticDataMemberDeclaration();
-  if (SDMDecl.Verify()) {
-    assert(SDMDecl.isStaticMember() && "Expected static member decl");
-    // We need the declaration DIE that is in the static member's class.
-    VariableDIE = getOrCreateStaticMemberDIE(SDMDecl);
-    IsStaticMember = true;
-  }
-
-  // If this is not a static data member definition, create the variable
-  // DIE and add the initial set of attributes to it.
-  if (!VariableDIE) {
-    // Construct the context before querying for the existence of the DIE in
-    // case such construction creates the DIE.
-    DIE *ContextDIE = getOrCreateContextDIE(GVContext);
-
-    // Add to map.
-    VariableDIE = &createAndAddDIE(GV.getTag(), *ContextDIE, GV);
-
-    // Add name and type.
-    addString(*VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
-    addType(*VariableDIE, GTy);
-
-    // Add scoping info.
-    if (!GV.isLocalToUnit())
-      addFlag(*VariableDIE, dwarf::DW_AT_external);
-
-    // Add line number info.
-    addSourceLine(*VariableDIE, GV);
-  }
-
-  // Add location.
-  bool addToAccelTable = false;
-  DIE *VariableSpecDIE = nullptr;
-  bool isGlobalVariable = GV.getGlobal() != nullptr;
-  if (isGlobalVariable) {
-    addToAccelTable = true;
-    DIELoc *Loc = new (DIEValueAllocator) DIELoc();
-    const MCSymbol *Sym = Asm->getSymbol(GV.getGlobal());
-    if (GV.getGlobal()->isThreadLocal()) {
-      // FIXME: Make this work with -gsplit-dwarf.
-      unsigned PointerSize = Asm->getDataLayout().getPointerSize();
-      assert((PointerSize == 4 || PointerSize == 8) &&
-             "Add support for other sizes if necessary");
-      // Based on GCC's support for TLS:
-      if (!DD->useSplitDwarf()) {
-        // 1) Start with a constNu of the appropriate pointer size
-        addUInt(*Loc, dwarf::DW_FORM_data1,
-                PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u);
-        // 2) containing the (relocated) offset of the TLS variable
-        //    within the module's TLS block.
-        addExpr(*Loc, dwarf::DW_FORM_udata,
-                Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
-      } else {
-        addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
-        addUInt(*Loc, dwarf::DW_FORM_udata,
-                DD->getAddressPool().getIndex(Sym, /* TLS */ true));
-      }
-      // 3) followed by a custom OP to make the debugger do a TLS lookup.
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_push_tls_address);
-    } else {
-      DD->addArangeLabel(SymbolCU(this, Sym));
-      addOpAddress(*Loc, Sym);
-    }
-    // Do not create specification DIE if context is either compile unit
-    // or a subprogram.
-    if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() &&
-        !GVContext.isFile() && !DD->isSubprogramContext(GVContext)) {
-      // Create specification DIE.
-      VariableSpecDIE = &createAndAddDIE(dwarf::DW_TAG_variable, UnitDie);
-      addDIEEntry(*VariableSpecDIE, dwarf::DW_AT_specification, *VariableDIE);
-      addBlock(*VariableSpecDIE, dwarf::DW_AT_location, Loc);
-      // A static member's declaration is already flagged as such.
-      if (!SDMDecl.Verify())
-        addFlag(*VariableDIE, dwarf::DW_AT_declaration);
-    } else {
-      addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
-    }
-    // Add the linkage name.
-    StringRef LinkageName = GV.getLinkageName();
-    if (!LinkageName.empty())
-      // From DWARF4: DIEs to which DW_AT_linkage_name may apply include:
-      // TAG_common_block, TAG_constant, TAG_entry_point, TAG_subprogram and
-      // TAG_variable.
-      addString(IsStaticMember && VariableSpecDIE ? *VariableSpecDIE
-                                                  : *VariableDIE,
-                DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
-                                           : dwarf::DW_AT_MIPS_linkage_name,
-                GlobalValue::getRealLinkageName(LinkageName));
-  } else if (const ConstantInt *CI =
-                 dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
-    // AT_const_value was added when the static member was created. To avoid
-    // emitting AT_const_value multiple times, we only add AT_const_value when
-    // it is not a static member.
-    if (!IsStaticMember)
-      addConstantValue(*VariableDIE, CI, GTy);
-  } else if (const ConstantExpr *CE = getMergedGlobalExpr(GV->getOperand(11))) {
-    addToAccelTable = true;
-    // GV is a merged global.
-    DIELoc *Loc = new (DIEValueAllocator) DIELoc();
-    Value *Ptr = CE->getOperand(0);
-    MCSymbol *Sym = Asm->getSymbol(cast<GlobalValue>(Ptr));
-    DD->addArangeLabel(SymbolCU(this, Sym));
-    addOpAddress(*Loc, Sym);
-    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
-    addUInt(*Loc, dwarf::DW_FORM_udata,
-            Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
-    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-    addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
-  }
-
-  if (addToAccelTable) {
-    DIE &AddrDIE = VariableSpecDIE ? *VariableSpecDIE : *VariableDIE;
-    DD->addAccelName(GV.getName(), AddrDIE);
-
-    // If the linkage name is different than the name, go ahead and output
-    // that as well into the name table.
-    if (GV.getLinkageName() != "" && GV.getName() != GV.getLinkageName())
-      DD->addAccelName(GV.getLinkageName(), AddrDIE);
-  }
-
-  addGlobalName(GV.getName(), VariableSpecDIE ? *VariableSpecDIE : *VariableDIE,
-                GV.getContext());
-}
-
 /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
 void DwarfUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy) {
   DIE &DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
@@ -1700,9 +1391,7 @@
   // The LowerBound value defines the lower bounds which is typically zero for
   // C/C++. The Count value is the number of elements.  Values are 64 bit. If
   // Count == -1 then the array is unbounded and we do not emit
-  // DW_AT_lower_bound and DW_AT_upper_bound attributes. If LowerBound == 0 and
-  // Count == 0, then the array has zero elements in which case we do not emit
-  // an upper bound.
+  // DW_AT_lower_bound and DW_AT_count attributes.
   int64_t LowerBound = SR.getLo();
   int64_t DefaultLowerBound = getDefaultLowerBound();
   int64_t Count = SR.getCount();
@@ -1710,11 +1399,22 @@
   if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
     addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);
 
-  if (Count != -1 && Count != 0)
+  if (Count != -1)
     // FIXME: An unbounded array should reference the expression that defines
     // the array.
-    addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, None,
-            LowerBound + Count - 1);
+    addUInt(DW_Subrange, dwarf::DW_AT_count, None, Count);
+}
+
+DIE *DwarfUnit::getIndexTyDie() {
+  if (IndexTyDie)
+    return IndexTyDie;
+  // Construct an integer type to use for indexes.
+  IndexTyDie = &createAndAddDIE(dwarf::DW_TAG_base_type, UnitDie);
+  addString(*IndexTyDie, dwarf::DW_AT_name, "sizetype");
+  addUInt(*IndexTyDie, dwarf::DW_AT_byte_size, None, sizeof(int64_t));
+  addUInt(*IndexTyDie, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+          dwarf::DW_ATE_unsigned);
+  return IndexTyDie;
 }
 
 /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
@@ -1729,18 +1429,9 @@
   // FIXME: This type should be passed down from the front end
   // as different languages may have different sizes for indexes.
   DIE *IdxTy = getIndexTyDie();
-  if (!IdxTy) {
-    // Construct an integer type to use for indexes.
-    IdxTy = &createAndAddDIE(dwarf::DW_TAG_base_type, UnitDie);
-    addString(*IdxTy, dwarf::DW_AT_name, "sizetype");
-    addUInt(*IdxTy, dwarf::DW_AT_byte_size, None, sizeof(int64_t));
-    addUInt(*IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
-            dwarf::DW_ATE_unsigned);
-    setIndexTyDie(IdxTy);
-  }
 
   // Add subranges to array type.
-  DIArray Elements = CTy.getTypeArray();
+  DIArray Elements = CTy.getElements();
   for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
     DIDescriptor Element = Elements.getElement(i);
     if (Element.getTag() == dwarf::DW_TAG_subrange_type)
@@ -1750,7 +1441,7 @@
 
 /// constructEnumTypeDIE - Construct an enum type DIE from DICompositeType.
 void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy) {
-  DIArray Elements = CTy.getTypeArray();
+  DIArray Elements = CTy.getElements();
 
   // Add enumerators to enumeration type.
   for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
@@ -1788,68 +1479,6 @@
   }
 }
 
-/// constructVariableDIE - Construct a DIE for the given DbgVariable.
-std::unique_ptr<DIE> DwarfUnit::constructVariableDIE(DbgVariable &DV,
-                                                     bool Abstract) {
-  auto D = constructVariableDIEImpl(DV, Abstract);
-  DV.setDIE(*D);
-  return D;
-}
-
-std::unique_ptr<DIE> DwarfUnit::constructVariableDIEImpl(const DbgVariable &DV,
-                                                         bool Abstract) {
-  // Define variable debug information entry.
-  auto VariableDie = make_unique<DIE>(DV.getTag());
-
-  if (Abstract) {
-    applyVariableAttributes(DV, *VariableDie);
-    return VariableDie;
-  }
-
-  // Add variable address.
-
-  unsigned Offset = DV.getDotDebugLocOffset();
-  if (Offset != ~0U) {
-    addLocationList(*VariableDie, dwarf::DW_AT_location, Offset);
-    return VariableDie;
-  }
-
-  // Check if variable is described by a DBG_VALUE instruction.
-  if (const MachineInstr *DVInsn = DV.getMInsn()) {
-    assert(DVInsn->getNumOperands() == 3);
-    if (DVInsn->getOperand(0).isReg()) {
-      const MachineOperand RegOp = DVInsn->getOperand(0);
-      // If the second operand is an immediate, this is an indirect value.
-      if (DVInsn->getOperand(1).isImm()) {
-        MachineLocation Location(RegOp.getReg(),
-                                 DVInsn->getOperand(1).getImm());
-        addVariableAddress(DV, *VariableDie, Location);
-      } else if (RegOp.getReg())
-        addVariableAddress(DV, *VariableDie, MachineLocation(RegOp.getReg()));
-    } else if (DVInsn->getOperand(0).isImm())
-      addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType());
-    else if (DVInsn->getOperand(0).isFPImm())
-      addConstantFPValue(*VariableDie, DVInsn->getOperand(0));
-    else if (DVInsn->getOperand(0).isCImm())
-      addConstantValue(*VariableDie, DVInsn->getOperand(0).getCImm(),
-                       DV.getType());
-
-    return VariableDie;
-  }
-
-  // .. else use frame index.
-  int FI = DV.getFrameIndex();
-  if (FI != ~0) {
-    unsigned FrameReg = 0;
-    const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-    int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
-    MachineLocation Location(FrameReg, Offset);
-    addVariableAddress(DV, *VariableDie, Location);
-  }
-
-  return VariableDie;
-}
-
 /// constructMemberDIE - Construct member DIE from DIDerivedType.
 void DwarfUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) {
   DIE &MemberDie = createAndAddDIE(DT.getTag(), Buffer);
@@ -1922,7 +1551,7 @@
     addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
             dwarf::DW_ACCESS_private);
   // Otherwise C++ member and base classes are considered public.
-  else
+  else if (DT.isPublic())
     addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
             dwarf::DW_ACCESS_public);
   if (DT.isVirtual())
@@ -1971,7 +1600,7 @@
   else if (DT.isPrivate())
     addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
             dwarf::DW_ACCESS_private);
-  else
+  else if (DT.isPublic())
     addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
             dwarf::DW_ACCESS_public);
 
@@ -1984,6 +1613,10 @@
 }
 
 void DwarfUnit::emitHeader(const MCSymbol *ASectionSym) const {
+  // Emit size of content not including length itself
+  Asm->OutStreamer.AddComment("Length of Unit");
+  Asm->EmitInt32(getHeaderSize() + UnitDie.getSize());
+
   Asm->OutStreamer.AddComment("DWARF version number");
   Asm->EmitInt16(DD->getDwarfVersion());
   Asm->OutStreamer.AddComment("Offset Into Abbrev. Section");
@@ -1999,50 +1632,9 @@
   Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
 }
 
-void DwarfUnit::addRange(RangeSpan Range) {
-  // Only add a range for this unit if we're emitting full debug.
-  if (getCUNode().getEmissionKind() == DIBuilder::FullDebug) {
-    // If we have no current ranges just add the range and return, otherwise,
-    // check the current section and CU against the previous section and CU we
-    // emitted into and the subprogram was contained within. If these are the
-    // same then extend our current range, otherwise add this as a new range.
-    if (CURanges.size() == 0 ||
-        this != DD->getPrevCU() ||
-        Asm->getCurrentSection() != DD->getPrevSection()) {
-      CURanges.push_back(Range);
-      return;
-    }
-
-    assert(&(CURanges.back().getEnd()->getSection()) ==
-               &(Range.getEnd()->getSection()) &&
-           "We can only append to a range in the same section!");
-    CURanges.back().setEnd(Range.getEnd());
-  }
-}
-
-void DwarfCompileUnit::initStmtList(MCSymbol *DwarfLineSectionSym) {
-  // Define start line table label for each Compile Unit.
-  MCSymbol *LineTableStartSym =
-      Asm->OutStreamer.getDwarfLineTableSymbol(getUniqueID());
-
-  stmtListIndex = UnitDie.getValues().size();
-
-  // DW_AT_stmt_list is a offset of line number information for this
-  // compile unit in debug_line section. For split dwarf this is
-  // left in the skeleton CU and so not included.
-  // The line table entries are not always emitted in assembly, so it
-  // is not okay to use line_table_start here.
-  if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
-    addSectionLabel(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym);
-  else
-    addSectionDelta(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym,
-                    DwarfLineSectionSym);
-}
-
-void DwarfCompileUnit::applyStmtList(DIE &D) {
-  D.addValue(dwarf::DW_AT_stmt_list,
-             UnitDie.getAbbrev().getData()[stmtListIndex].getForm(),
-             UnitDie.getValues()[stmtListIndex]);
+void DwarfUnit::initSection(const MCSection *Section) {
+  assert(!this->Section);
+  this->Section = Section;
 }
 
 void DwarfTypeUnit::emitHeader(const MCSymbol *ASectionSym) const {
@@ -2055,16 +1647,8 @@
                                 sizeof(Ty->getOffset()));
 }
 
-void DwarfTypeUnit::initSection(const MCSection *Section) {
-  assert(!this->Section);
-  this->Section = Section;
-  // Since each type unit is contained in its own COMDAT section, the begin
-  // label and the section label are the same. Using the begin label emission in
-  // DwarfDebug to emit the section label as well is slightly subtle/sneaky, but
-  // the only other alternative of lazily constructing start-of-section labels
-  // and storing a mapping in DwarfDebug (or AsmPrinter).
-  this->SectionSym = this->LabelBegin =
-      Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID());
-  this->LabelEnd =
-      Asm->GetTempSymbol(Section->getLabelEndName(), getUniqueID());
+bool DwarfTypeUnit::isDwoUnit() const {
+  // Since there are no skeleton type units, all type units are dwo type units
+  // when split DWARF is being used.
+  return DD->useSplitDwarf();
 }

diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index b7b83b2..f40c937 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
-#define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
 
 #include "DIE.h"
 #include "DwarfDebug.h"
@@ -55,7 +55,8 @@
   SmallVector<RangeSpan, 2> Ranges;
 
 public:
-  RangeSpanList(MCSymbol *Sym) : RangeSym(Sym) {}
+  RangeSpanList(MCSymbol *Sym, SmallVector<RangeSpan, 2> Ranges)
+      : RangeSym(Sym), Ranges(std::move(Ranges)) {}
   MCSymbol *getSym() const { return RangeSym; }
   const SmallVectorImpl<RangeSpan> &getRanges() const { return Ranges; }
   void addRange(RangeSpan Range) { Ranges.push_back(Range); }
@@ -96,12 +97,6 @@
   /// descriptors to debug information entries using a DIEEntry proxy.
   DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap;
 
-  /// GlobalNames - A map of globally visible named entities for this unit.
-  StringMap<const DIE *> GlobalNames;
-
-  /// GlobalTypes - A map of globally visible types for this unit.
-  StringMap<const DIE *> GlobalTypes;
-
   /// DIEBlocks - A list of all the DIEBlocks in use.
   std::vector<DIEBlock *> DIEBlocks;
   
@@ -113,13 +108,6 @@
   /// corresponds to the MDNode mapped with the subprogram DIE.
   DenseMap<DIE *, const MDNode *> ContainingTypeMap;
 
-  // List of ranges for a given compile unit.
-  SmallVector<RangeSpan, 1> CURanges;
-
-  // List of range lists for a given compile unit, separate from the ranges for
-  // the CU itself.
-  SmallVector<RangeSpanList, 1> CURangeLists;
-
   // DIEValueAllocator - All DIEValues are allocated through this allocator.
   BumpPtrAllocator DIEValueAllocator;
 
@@ -129,86 +117,31 @@
   /// The section this unit will be emitted in.
   const MCSection *Section;
 
-  /// A label at the start of the non-dwo section related to this unit.
-  MCSymbol *SectionSym;
-
-  /// The start of the unit within its section.
-  MCSymbol *LabelBegin;
-
-  /// The end of the unit within its section.
-  MCSymbol *LabelEnd;
-
-  /// Skeleton unit associated with this unit.
-  DwarfUnit *Skeleton;
-
   DwarfUnit(unsigned UID, dwarf::Tag, DICompileUnit CU, AsmPrinter *A,
             DwarfDebug *DW, DwarfFile *DWU);
 
+  void initSection(const MCSection *Section);
+
+  /// Add a string attribute data and value.
+  void addLocalString(DIE &Die, dwarf::Attribute Attribute, StringRef Str);
+
+  void addIndexedString(DIE &Die, dwarf::Attribute Attribute, StringRef Str);
+
+  bool applySubprogramDefinitionAttributes(DISubprogram SP, DIE &SPDie);
+
 public:
   virtual ~DwarfUnit();
 
-  /// Set the skeleton unit associated with this unit.
-  void setSkeleton(DwarfUnit &Skel) { Skeleton = &Skel; }
-
-  /// Get the skeleton unit associated with this unit.
-  DwarfUnit *getSkeleton() const { return Skeleton; }
-
-  /// Pass in the SectionSym even though we could recreate it in every compile
-  /// unit (type units will have actually distinct symbols once they're in
-  /// comdat sections).
-  void initSection(const MCSection *Section, MCSymbol *SectionSym) {
-    assert(!this->Section);
-    this->Section = Section;
-    this->SectionSym = SectionSym;
-    this->LabelBegin =
-        Asm->GetTempSymbol(Section->getLabelBeginName(), getUniqueID());
-    this->LabelEnd =
-        Asm->GetTempSymbol(Section->getLabelEndName(), getUniqueID());
-  }
-
   const MCSection *getSection() const {
     assert(Section);
     return Section;
   }
 
-  /// If there's a skeleton then return the section symbol for the skeleton
-  /// unit, otherwise return the section symbol for this unit.
-  MCSymbol *getLocalSectionSym() const {
-    if (Skeleton)
-      return Skeleton->getSectionSym();
-    return getSectionSym();
-  }
-
-  MCSymbol *getSectionSym() const {
-    assert(Section);
-    return SectionSym;
-  }
-
-  /// If there's a skeleton then return the begin label for the skeleton unit,
-  /// otherwise return the local label for this unit.
-  MCSymbol *getLocalLabelBegin() const {
-    if (Skeleton)
-      return Skeleton->getLabelBegin();
-    return getLabelBegin();
-  }
-
-  MCSymbol *getLabelBegin() const {
-    assert(Section);
-    return LabelBegin;
-  }
-
-  MCSymbol *getLabelEnd() const {
-    assert(Section);
-    return LabelEnd;
-  }
-
   // Accessors.
   unsigned getUniqueID() const { return UniqueID; }
   uint16_t getLanguage() const { return CUNode.getLanguage(); }
   DICompileUnit getCUNode() const { return CUNode; }
   DIE &getUnitDie() { return UnitDie; }
-  const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; }
-  const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; }
 
   unsigned getDebugInfoOffset() const { return DebugInfoOffset; }
   void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; }
@@ -216,29 +149,15 @@
   /// hasContent - Return true if this compile unit has something to write out.
   bool hasContent() const { return !UnitDie.getChildren().empty(); }
 
-  /// addRange - Add an address range to the list of ranges for this unit.
-  void addRange(RangeSpan Range);
-
-  /// getRanges - Get the list of ranges for this unit.
-  const SmallVectorImpl<RangeSpan> &getRanges() const { return CURanges; }
-  SmallVectorImpl<RangeSpan> &getRanges() { return CURanges; }
-
-  /// addRangeList - Add an address range list to the list of range lists.
-  void addRangeList(RangeSpanList Ranges) { CURangeLists.push_back(Ranges); }
-
-  /// getRangeLists - Get the vector of range lists.
-  const SmallVectorImpl<RangeSpanList> &getRangeLists() const {
-    return CURangeLists;
-  }
-  SmallVectorImpl<RangeSpanList> &getRangeLists() { return CURangeLists; }
-
   /// getParentContextString - Get a string containing the language specific
   /// context for a global name.
   std::string getParentContextString(DIScope Context) const;
 
-  /// addGlobalName - Add a new global entity to the compile unit.
-  ///
-  void addGlobalName(StringRef Name, DIE &Die, DIScope Context);
+  /// Add a new global name to the compile unit.
+  virtual void addGlobalName(StringRef Name, DIE &Die, DIScope Context) {}
+
+  /// Add a new global type to the compile unit.
+  virtual void addGlobalType(DIType Ty, const DIE &Die, DIScope Context) {}
 
   /// addAccelNamespace - Add a new name to the namespace accelerator table.
   void addAccelNamespace(StringRef Name, const DIE &Die);
@@ -275,14 +194,7 @@
   void addSInt(DIELoc &Die, Optional<dwarf::Form> Form, int64_t Integer);
 
   /// addString - Add a string attribute data and value.
-  void addString(DIE &Die, dwarf::Attribute Attribute, const StringRef Str);
-
-  /// addLocalString - Add a string attribute data and value.
-  void addLocalString(DIE &Die, dwarf::Attribute Attribute,
-                      const StringRef Str);
-
-  /// addExpr - Add a Dwarf expression attribute data and value.
-  void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr);
+  void addString(DIE &Die, dwarf::Attribute Attribute, StringRef Str);
 
   /// addLabel - Add a Dwarf label attribute data and value.
   void addLabel(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form,
@@ -290,14 +202,6 @@
 
   void addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label);
 
-  /// addLocationList - Add a Dwarf loclistptr attribute data and value.
-  void addLocationList(DIE &Die, dwarf::Attribute Attribute, unsigned Index);
-
-  /// addSectionLabel - Add a Dwarf section label attribute data and value.
-  ///
-  void addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
-                       const MCSymbol *Label);
-
   /// addSectionOffset - Add an offset into a section attribute data and value.
   ///
   void addSectionOffset(DIE &Die, dwarf::Attribute Attribute, uint64_t Integer);
@@ -306,10 +210,6 @@
   /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
   void addOpAddress(DIELoc &Die, const MCSymbol *Label);
 
-  /// addSectionDelta - Add a label delta attribute data and value.
-  void addSectionDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi,
-                       const MCSymbol *Lo);
-
   /// addLabelDelta - Add a label delta attribute data and value.
   void addLabelDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi,
                      const MCSymbol *Lo);
@@ -339,11 +239,6 @@
   void addSourceLine(DIE &Die, DINameSpace NS);
   void addSourceLine(DIE &Die, DIObjCProperty Ty);
 
-  /// addAddress - Add an address attribute to a die based on the location
-  /// provided.
-  void addAddress(DIE &Die, dwarf::Attribute Attribute,
-                  const MachineLocation &Location, bool Indirect = false);
-
   /// addConstantValue - Add constant value entry in variable DIE.
   void addConstantValue(DIE &Die, const MachineOperand &MO, DIType Ty);
   void addConstantValue(DIE &Die, const ConstantInt *CI, DIType Ty);
@@ -359,19 +254,12 @@
   void addTemplateParams(DIE &Buffer, DIArray TParams);
 
   /// addRegisterOp - Add register operand.
-  void addRegisterOp(DIELoc &TheDie, unsigned Reg);
+  void addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
+                          unsigned SizeInBits = 0, unsigned OffsetInBits = 0);
 
   /// addRegisterOffset - Add register offset.
   void addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset);
 
-  /// addComplexAddress - Start with the address based on the location provided,
-  /// and generate the DWARF information necessary to find the actual variable
-  /// (navigating the extra location information encoded in the type) based on
-  /// the starting location.  Add the DWARF information to the die.
-  void addComplexAddress(const DbgVariable &DV, DIE &Die,
-                         dwarf::Attribute Attribute,
-                         const MachineLocation &Location);
-
   // FIXME: Should be reformulated in terms of addComplexAddress.
   /// addBlockByrefAddress - Start with the address based on the location
   /// provided, and generate the DWARF information necessary to find the
@@ -382,11 +270,6 @@
                             dwarf::Attribute Attribute,
                             const MachineLocation &Location);
 
-  /// addVariableAddress - Add DW_AT_location attribute for a
-  /// DbgVariable based on provided MachineLocation.
-  void addVariableAddress(const DbgVariable &DV, DIE &Die,
-                          MachineLocation Location);
-
   /// addType - Add a new type attribute to the specified entity. This takes
   /// and attribute parameter because DW_AT_friend attributes are also
   /// type references.
@@ -397,11 +280,10 @@
   DIE *getOrCreateNameSpace(DINameSpace NS);
 
   /// getOrCreateSubprogramDIE - Create new DIE using SP.
-  DIE *getOrCreateSubprogramDIE(DISubprogram SP);
+  DIE *getOrCreateSubprogramDIE(DISubprogram SP, bool Minimal = false);
 
-  void applySubprogramAttributes(DISubprogram SP, DIE &SPDie);
-  void applySubprogramAttributesToDefinition(DISubprogram SP, DIE &SPDie);
-  void applyVariableAttributes(const DbgVariable &Var, DIE &VariableDie);
+  void applySubprogramAttributes(DISubprogram SP, DIE &SPDie,
+                                 bool Minimal = false);
 
   /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
   /// given DIType.
@@ -417,12 +299,8 @@
   /// vtables.
   void constructContainingTypeDIEs();
 
-  /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-  std::unique_ptr<DIE> constructVariableDIE(DbgVariable &DV,
-                                            bool Abstract = false);
-
   /// constructSubprogramArguments - Construct function argument DIEs.
-  void constructSubprogramArguments(DIE &Buffer, DIArray Args);
+  void constructSubprogramArguments(DIE &Buffer, DITypeArray Args);
 
   /// Create a DIE with the given Tag, add the DIE to its parent, and
   /// call insertDIE if MD is not null.
@@ -453,12 +331,13 @@
   /// none currently exists, create a new ID and insert it in the line table.
   virtual unsigned getOrCreateSourceID(StringRef File, StringRef Directory) = 0;
 
-private:
-  /// \brief Construct a DIE for the given DbgVariable without initializing the
-  /// DbgVariable's DIE reference.
-  std::unique_ptr<DIE> constructVariableDIEImpl(const DbgVariable &DV,
-                                                bool Abstract);
+  /// resolve - Look in the DwarfDebug map for the MDNode that
+  /// corresponds to the reference.
+  template <typename T> T resolve(DIRef<T> Ref) const {
+    return DD->resolve(Ref);
+  }
 
+private:
   /// constructTypeDIE - Construct basic type die from DIBasicType.
   void constructTypeDIE(DIE &Buffer, DIBasicType BTy);
 
@@ -503,7 +382,7 @@
   }
 
   // getIndexTyDie - Get an anonymous type for index type.
-  DIE *getIndexTyDie() { return IndexTyDie; }
+  DIE *getIndexTyDie();
 
   // setIndexTyDie - Set D as anonymous type for index which can be reused
   // later.
@@ -513,56 +392,22 @@
   /// information entry.
   DIEEntry *createDIEEntry(DIE &Entry);
 
-  /// resolve - Look in the DwarfDebug map for the MDNode that
-  /// corresponds to the reference.
-  template <typename T> T resolve(DIRef<T> Ref) const {
-    return DD->resolve(Ref);
-  }
-
   /// If this is a named finished type then include it in the list of types for
   /// the accelerator tables.
   void updateAcceleratorTables(DIScope Context, DIType Ty, const DIE &TyDIE);
-};
 
-class DwarfCompileUnit : public DwarfUnit {
-  /// The attribute index of DW_AT_stmt_list in the compile unit DIE, avoiding
-  /// the need to search for it in applyStmtList.
-  unsigned stmtListIndex;
-
-public:
-  DwarfCompileUnit(unsigned UID, DICompileUnit Node, AsmPrinter *A,
-                   DwarfDebug *DW, DwarfFile *DWU);
-
-  void initStmtList(MCSymbol *DwarfLineSectionSym);
-
-  /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE.
-  void applyStmtList(DIE &D);
-
-  /// createGlobalVariableDIE - create global variable DIE.
-  void createGlobalVariableDIE(DIGlobalVariable GV);
-
-  /// addLabelAddress - Add a dwarf label attribute data and value using
-  /// either DW_FORM_addr or DW_FORM_GNU_addr_index.
-  void addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
-                       const MCSymbol *Label);
-
-  /// addLocalLabelAddress - Add a dwarf label attribute data and value using
-  /// DW_FORM_addr only.
-  void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute,
-                            const MCSymbol *Label);
-
-  DwarfCompileUnit &getCU() override { return *this; }
-
-  unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override;
+  virtual bool isDwoUnit() const = 0;
 };
 
 class DwarfTypeUnit : public DwarfUnit {
-private:
   uint64_t TypeSignature;
   const DIE *Ty;
   DwarfCompileUnit &CU;
   MCDwarfDwoLineTable *SplitLineTable;
 
+  unsigned getOrCreateSourceID(StringRef File, StringRef Directory) override;
+  bool isDwoUnit() const override;
+
 public:
   DwarfTypeUnit(unsigned UID, DwarfCompileUnit &CU, AsmPrinter *A,
                 DwarfDebug *DW, DwarfFile *DWU,
@@ -578,11 +423,8 @@
     return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature
            sizeof(uint32_t);                               // Type DIE Offset
   }
-  void initSection(const MCSection *Section);
+  using DwarfUnit::initSection;
   DwarfCompileUnit &getCU() override { return CU; }
-
-protected:
-  unsigned getOrCreateSourceID(StringRef File, StringRef Directory) override;
 };
 } // end llvm namespace
 #endif

diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 73f62bf..2bbffb3 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp

@@ -237,7 +237,7 @@
       // instruction between the previous try-range and this one may throw,
       // create a call-site entry with no landing pad for the region between the
       // try-ranges.
-      if (SawPotentiallyThrowing && Asm->MAI->isExceptionHandlingDwarf()) {
+      if (SawPotentiallyThrowing && Asm->MAI->usesItaniumLSDAForExceptions()) {
         CallSiteEntry Site = { LastLabel, BeginLabel, nullptr, 0 };
         CallSites.push_back(Site);
         PreviousIsInvoke = false;
@@ -259,7 +259,7 @@
         };
 
         // Try to merge with the previous call-site. SJLJ doesn't do this
-        if (PreviousIsInvoke && Asm->MAI->isExceptionHandlingDwarf()) {
+        if (PreviousIsInvoke && Asm->MAI->usesItaniumLSDAForExceptions()) {
           CallSiteEntry &Prev = CallSites.back();
           if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
             // Extend the range of the previous entry.
@@ -269,7 +269,7 @@
         }
 
         // Otherwise, create a new call-site.
-        if (Asm->MAI->isExceptionHandlingDwarf())
+        if (Asm->MAI->usesItaniumLSDAForExceptions())
           CallSites.push_back(Site);
         else {
           // SjLj EH must maintain the call sites in the order assigned
@@ -287,7 +287,7 @@
   // If some instruction between the previous try-range and the end of the
   // function may throw, create a call-site entry with no landing pad for the
   // region following the try-range.
-  if (SawPotentiallyThrowing && Asm->MAI->isExceptionHandlingDwarf()) {
+  if (SawPotentiallyThrowing && Asm->MAI->usesItaniumLSDAForExceptions()) {
     CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 };
     CallSites.push_back(Site);
   }
@@ -314,7 +314,7 @@
 ///  3. Type ID table contains references to all the C++ typeinfo for all
 ///     catches in the function.  This tables is reverse indexed base 1.
 void EHStreamer::emitExceptionTable() {
-  const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+  const std::vector<const GlobalValue *> &TypeInfos = MMI->getTypeInfos();
   const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
   const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
 
@@ -520,7 +520,7 @@
     }
   } else {
     // DWARF Exception handling
-    assert(Asm->MAI->isExceptionHandlingDwarf());
+    assert(Asm->MAI->usesItaniumLSDAForExceptions());
 
     // The call-site table is a list of all call sites that may throw an
     // exception (including C++ 'throw' statements) in the procedure
@@ -649,7 +649,7 @@
 }
 
 void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) {
-  const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+  const std::vector<const GlobalValue *> &TypeInfos = MMI->getTypeInfos();
   const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
 
   bool VerboseAsm = Asm->OutStreamer.isVerboseAsm();
@@ -662,9 +662,9 @@
     Entry = TypeInfos.size();
   }
 
-  for (std::vector<const GlobalVariable *>::const_reverse_iterator
+  for (std::vector<const GlobalValue *>::const_reverse_iterator
          I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) {
-    const GlobalVariable *GV = *I;
+    const GlobalValue *GV = *I;
     if (VerboseAsm)
       Asm->OutStreamer.AddComment("TypeInfo " + Twine(Entry--));
     Asm->EmitTTypeReference(GV, TTypeEncoding);

diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
index 2b6ba78..7e9549d 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_ASMPRINTER_EHSTREAMER_H
-#define LLVM_CODEGEN_ASMPRINTER_EHSTREAMER_H
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H
 
 #include "AsmPrinterHandler.h"
 #include "llvm/ADT/DenseMap.h"

diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index bfcbe6b..5bda5a9 100644
--- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp

@@ -28,6 +28,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -50,7 +51,8 @@
 
 void ErlangGCPrinter::finishAssembly(AsmPrinter &AP) {
   MCStreamer &OS = AP.OutStreamer;
-  unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
+  unsigned IntPtrSize =
+      AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
 
   // Put this in a custom .note section.
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getContext()

diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
index 5a9ecd7..6480d048 100644
--- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp

@@ -26,6 +26,7 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <cctype>
 using namespace llvm;
 
@@ -91,7 +92,8 @@
 /// either condition is detected in a function which uses the GC.
 ///
 void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
-  unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
+  unsigned IntPtrSize =
+      AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
 
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection());
   EmitCamlGlobal(getModule(), AP, "code_end");

diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
index 81285d5..0f0ad75 100644
--- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp

@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "DwarfException.h"
+#include "Win64Exception.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"

diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.h b/lib/CodeGen/AsmPrinter/Win64Exception.h
new file mode 100644
index 0000000..538e132
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.h

@@ -0,0 +1,52 @@
+//===-- Win64Exception.h - Windows Exception Handling ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Windows exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WIN64EXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WIN64EXCEPTION_H
+
+#include "EHStreamer.h"
+
+namespace llvm {
+class MachineFunction;
+
+class Win64Exception : public EHStreamer {
+  /// Per-function flag to indicate if personality info should be emitted.
+  bool shouldEmitPersonality;
+
+  /// Per-function flag to indicate if the LSDA should be emitted.
+  bool shouldEmitLSDA;
+
+  /// Per-function flag to indicate if frame moves info should be emitted.
+  bool shouldEmitMoves;
+
+public:
+  //===--------------------------------------------------------------------===//
+  // Main entry points.
+  //
+  Win64Exception(AsmPrinter *A);
+  virtual ~Win64Exception();
+
+  /// Emit all exception information that should come after the content.
+  void endModule() override;
+
+  /// Gather pre-function exception information.  Assumes being emitted
+  /// immediately after the function entry point.
+  void beginFunction(const MachineFunction *MF) override;
+
+  /// Gather and emit post-function exception information.
+  void endFunction(const MachineFunction *) override;
+};
+}
+
+#endif
+

diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
index 6a5c431..b5e0929 100644
--- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp
+++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.cpp

@@ -116,15 +116,67 @@
   Asm = AP;
 }
 
+void WinCodeViewLineTables::endModule() {
+  if (FnDebugInfo.empty())
+    return;
+
+  assert(Asm != nullptr);
+  Asm->OutStreamer.SwitchSection(
+      Asm->getObjFileLowering().getCOFFDebugSymbolsSection());
+  Asm->EmitInt32(COFF::DEBUG_SECTION_MAGIC);
+
+  // The COFF .debug$S section consists of several subsections, each starting
+  // with a 4-byte control code (e.g. 0xF1, 0xF2, etc) and then a 4-byte length
+  // of the payload followed by the payload itself.  The subsections are 4-byte
+  // aligned.
+
+  // Emit per-function debug information.  This code is extracted into a
+  // separate function for readability.
+  for (size_t I = 0, E = VisitedFunctions.size(); I != E; ++I)
+    emitDebugInfoForFunction(VisitedFunctions[I]);
+
+  // This subsection maps each file index to its offset in the string table.
+  Asm->OutStreamer.AddComment("File index to string table offset subsection");
+  Asm->EmitInt32(COFF::DEBUG_INDEX_SUBSECTION);
+  size_t NumFilenames = FileNameRegistry.Infos.size();
+  Asm->EmitInt32(8 * NumFilenames);
+  for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) {
+    StringRef Filename = FileNameRegistry.Filenames[I];
+    // For each unique filename, just write its offset in the string table.
+    Asm->EmitInt32(FileNameRegistry.Infos[Filename].StartOffset);
+    // The filename offset is not followed by any additional data.
+    Asm->EmitInt32(0);
+  }
+
+  // This subsection holds the string table.
+  Asm->OutStreamer.AddComment("String table");
+  Asm->EmitInt32(COFF::DEBUG_STRING_TABLE_SUBSECTION);
+  Asm->EmitInt32(FileNameRegistry.LastOffset);
+  // The payload starts with a null character.
+  Asm->EmitInt8(0);
+
+  for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) {
+    // Just emit unique filenames one by one, separated by a null character.
+    Asm->OutStreamer.EmitBytes(FileNameRegistry.Filenames[I]);
+    Asm->EmitInt8(0);
+  }
+
+  // No more subsections. Fill with zeros to align the end of the section by 4.
+  Asm->OutStreamer.EmitFill((-FileNameRegistry.LastOffset) % 4, 0);
+
+  clear();
+}
+
 static void EmitLabelDiff(MCStreamer &Streamer,
-                          const MCSymbol *From, const MCSymbol *To) {
+                          const MCSymbol *From, const MCSymbol *To,
+                          unsigned int Size = 4) {
   MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
   MCContext &Context = Streamer.getContext();
   const MCExpr *FromRef = MCSymbolRefExpr::Create(From, Variant, Context),
                *ToRef   = MCSymbolRefExpr::Create(To, Variant, Context);
   const MCExpr *AddrDelta =
       MCBinaryExpr::Create(MCBinaryExpr::Sub, ToRef, FromRef, Context);
-  Streamer.EmitValue(AddrDelta, 4);
+  Streamer.EmitValue(AddrDelta, Size);
 }
 
 void WinCodeViewLineTables::emitDebugInfoForFunction(const Function *GV) {
@@ -138,6 +190,51 @@
     return;
   assert(FI.End && "Don't know where the function ends?");
 
+  StringRef FuncName = getDISubprogram(GV).getDisplayName(),
+            GVName = GV->getName();
+  // FIXME Clang currently sets DisplayName to "bar" for a C++
+  // "namespace_foo::bar" function, see PR21528.  Luckily, dbghelp.dll is trying
+  // to demangle display names anyways, so let's just put a mangled name into
+  // the symbols subsection until Clang gives us what we need.
+  if (GVName.startswith("\01?"))
+    FuncName = GVName.substr(1);
+  // Emit a symbol subsection, required by VS2012+ to find function boundaries.
+  MCSymbol *SymbolsBegin = Asm->MMI->getContext().CreateTempSymbol(),
+           *SymbolsEnd = Asm->MMI->getContext().CreateTempSymbol();
+  Asm->OutStreamer.AddComment("Symbol subsection for " + Twine(FuncName));
+  Asm->EmitInt32(COFF::DEBUG_SYMBOL_SUBSECTION);
+  EmitLabelDiff(Asm->OutStreamer, SymbolsBegin, SymbolsEnd);
+  Asm->OutStreamer.EmitLabel(SymbolsBegin);
+  {
+    MCSymbol *ProcSegmentBegin = Asm->MMI->getContext().CreateTempSymbol(),
+             *ProcSegmentEnd = Asm->MMI->getContext().CreateTempSymbol();
+    EmitLabelDiff(Asm->OutStreamer, ProcSegmentBegin, ProcSegmentEnd, 2);
+    Asm->OutStreamer.EmitLabel(ProcSegmentBegin);
+
+    Asm->EmitInt16(COFF::DEBUG_SYMBOL_TYPE_PROC_START);
+    // Some bytes of this segment don't seem to be required for basic debugging,
+    // so just fill them with zeroes.
+    Asm->OutStreamer.EmitFill(12, 0);
+    // This is the important bit that tells the debugger where the function
+    // code is located and what its size is:
+    EmitLabelDiff(Asm->OutStreamer, Fn, FI.End);
+    Asm->OutStreamer.EmitFill(12, 0);
+    Asm->OutStreamer.EmitCOFFSecRel32(Fn);
+    Asm->OutStreamer.EmitCOFFSectionIndex(Fn);
+    Asm->EmitInt8(0);
+    // Emit the function display name as a null-terminated string.
+    Asm->OutStreamer.EmitBytes(FuncName);
+    Asm->EmitInt8(0);
+    Asm->OutStreamer.EmitLabel(ProcSegmentEnd);
+
+    // We're done with this function.
+    Asm->EmitInt16(0x0002);
+    Asm->EmitInt16(COFF::DEBUG_SYMBOL_TYPE_PROC_END);
+  }
+  Asm->OutStreamer.EmitLabel(SymbolsEnd);
+  // Every subsection must be aligned to a 4-byte boundary.
+  Asm->OutStreamer.EmitFill((-FuncName.size()) % 4, 0);
+
   // PCs/Instructions are grouped into segments sharing the same filename.
   // Pre-calculate the lengths (in instructions) of these segments and store
   // them in a map for convenience.  Each index in the map is the sequential
@@ -154,18 +251,19 @@
   }
   FilenameSegmentLengths[LastSegmentEnd] = FI.Instrs.size() - LastSegmentEnd;
 
-  // Emit the control code of the subsection followed by the payload size.
-  Asm->OutStreamer.AddComment(
-      "Linetable subsection for " + Twine(Fn->getName()));
+  // Emit a line table subsection, required to do PC-to-file:line lookup.
+  Asm->OutStreamer.AddComment("Line table subsection for " + Twine(FuncName));
   Asm->EmitInt32(COFF::DEBUG_LINE_TABLE_SUBSECTION);
-  MCSymbol *SubsectionBegin = Asm->MMI->getContext().CreateTempSymbol(),
-           *SubsectionEnd = Asm->MMI->getContext().CreateTempSymbol();
-  EmitLabelDiff(Asm->OutStreamer, SubsectionBegin, SubsectionEnd);
-  Asm->OutStreamer.EmitLabel(SubsectionBegin);
+  MCSymbol *LineTableBegin = Asm->MMI->getContext().CreateTempSymbol(),
+           *LineTableEnd = Asm->MMI->getContext().CreateTempSymbol();
+  EmitLabelDiff(Asm->OutStreamer, LineTableBegin, LineTableEnd);
+  Asm->OutStreamer.EmitLabel(LineTableBegin);
 
   // Identify the function this subsection is for.
   Asm->OutStreamer.EmitCOFFSecRel32(Fn);
   Asm->OutStreamer.EmitCOFFSectionIndex(Fn);
+  // Insert padding after a 16-bit section index.
+  Asm->EmitInt16(0);
 
   // Length of the function's code, in bytes.
   EmitLabelDiff(Asm->OutStreamer, Fn, FI.End);
@@ -209,56 +307,7 @@
 
   if (FileSegmentEnd)
     Asm->OutStreamer.EmitLabel(FileSegmentEnd);
-  Asm->OutStreamer.EmitLabel(SubsectionEnd);
-}
-
-void WinCodeViewLineTables::endModule() {
-  if (FnDebugInfo.empty())
-    return;
-
-  assert(Asm != nullptr);
-  Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getCOFFDebugSymbolsSection());
-  Asm->EmitInt32(COFF::DEBUG_SECTION_MAGIC);
-
-  // The COFF .debug$S section consists of several subsections, each starting
-  // with a 4-byte control code (e.g. 0xF1, 0xF2, etc) and then a 4-byte length
-  // of the payload followed by the payload itself.  The subsections are 4-byte
-  // aligned.
-
-  for (size_t I = 0, E = VisitedFunctions.size(); I != E; ++I)
-    emitDebugInfoForFunction(VisitedFunctions[I]);
-
-  // This subsection holds a file index to offset in string table table.
-  Asm->OutStreamer.AddComment("File index to string table offset subsection");
-  Asm->EmitInt32(COFF::DEBUG_INDEX_SUBSECTION);
-  size_t NumFilenames = FileNameRegistry.Infos.size();
-  Asm->EmitInt32(8 * NumFilenames);
-  for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) {
-    StringRef Filename = FileNameRegistry.Filenames[I];
-    // For each unique filename, just write it's offset in the string table.
-    Asm->EmitInt32(FileNameRegistry.Infos[Filename].StartOffset);
-    // The function name offset is not followed by any additional data.
-    Asm->EmitInt32(0);
-  }
-
-  // This subsection holds the string table.
-  Asm->OutStreamer.AddComment("String table");
-  Asm->EmitInt32(COFF::DEBUG_STRING_TABLE_SUBSECTION);
-  Asm->EmitInt32(FileNameRegistry.LastOffset);
-  // The payload starts with a null character.
-  Asm->EmitInt8(0);
-
-  for (size_t I = 0, E = FileNameRegistry.Filenames.size(); I != E; ++I) {
-    // Just emit unique filenames one by one, separated by a null character.
-    Asm->OutStreamer.EmitBytes(FileNameRegistry.Filenames[I]);
-    Asm->EmitInt8(0);
-  }
-
-  // No more subsections. Fill with zeros to align the end of the section by 4.
-  Asm->OutStreamer.EmitFill((-FileNameRegistry.LastOffset) % 4, 0);
-
-  clear();
+  Asm->OutStreamer.EmitLabel(LineTableEnd);
 }
 
 void WinCodeViewLineTables::beginFunction(const MachineFunction *MF) {

diff --git a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
index 0734d97..8492eac 100644
--- a/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h
+++ b/lib/CodeGen/AsmPrinter/WinCodeViewLineTables.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H__
-#define CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H__
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WINCODEVIEWLINETABLES_H
 
 #include "AsmPrinterHandler.h"
 #include "llvm/ADT/DenseMap.h"

diff --git a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp b/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp
deleted file mode 100644
index 421946d..0000000
--- a/lib/CodeGen/AtomicExpandLoadLinkedPass.cpp
+++ /dev/null

@@ -1,380 +0,0 @@
-//===-- AtomicExpandLoadLinkedPass.cpp - Expand atomic instructions -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a pass (at IR level) to replace atomic instructions with
-// appropriate (intrinsic-based) ldrex/strex loops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "arm-atomic-expand"
-
-namespace {
-  class AtomicExpandLoadLinked : public FunctionPass {
-    const TargetMachine *TM;
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    explicit AtomicExpandLoadLinked(const TargetMachine *TM = nullptr)
-      : FunctionPass(ID), TM(TM) {
-      initializeAtomicExpandLoadLinkedPass(*PassRegistry::getPassRegistry());
-    }
-
-    bool runOnFunction(Function &F) override;
-    bool expandAtomicInsts(Function &F);
-
-    bool expandAtomicLoad(LoadInst *LI);
-    bool expandAtomicStore(StoreInst *LI);
-    bool expandAtomicRMW(AtomicRMWInst *AI);
-    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
-
-    AtomicOrdering insertLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord);
-    void insertTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord);
-  };
-}
-
-char AtomicExpandLoadLinked::ID = 0;
-char &llvm::AtomicExpandLoadLinkedID = AtomicExpandLoadLinked::ID;
-INITIALIZE_TM_PASS(AtomicExpandLoadLinked, "atomic-ll-sc",
-    "Expand Atomic calls in terms of load-linked & store-conditional",
-    false, false)
-
-FunctionPass *llvm::createAtomicExpandLoadLinkedPass(const TargetMachine *TM) {
-  return new AtomicExpandLoadLinked(TM);
-}
-
-bool AtomicExpandLoadLinked::runOnFunction(Function &F) {
-  if (!TM || !TM->getSubtargetImpl()->enableAtomicExpandLoadLinked())
-    return false;
-
-  SmallVector<Instruction *, 1> AtomicInsts;
-
-  // Changing control-flow while iterating through it is a bad idea, so gather a
-  // list of all atomic instructions before we start.
-  for (BasicBlock &BB : F)
-    for (Instruction &Inst : BB) {
-      if (isa<AtomicRMWInst>(&Inst) || isa<AtomicCmpXchgInst>(&Inst) ||
-          (isa<LoadInst>(&Inst) && cast<LoadInst>(&Inst)->isAtomic()) ||
-          (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic()))
-        AtomicInsts.push_back(&Inst);
-    }
-
-  bool MadeChange = false;
-  for (Instruction *Inst : AtomicInsts) {
-    if (!TM->getTargetLowering()->shouldExpandAtomicInIR(Inst))
-      continue;
-
-    if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
-      MadeChange |= expandAtomicRMW(AI);
-    else if (AtomicCmpXchgInst *CI = dyn_cast<AtomicCmpXchgInst>(Inst))
-      MadeChange |= expandAtomicCmpXchg(CI);
-    else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
-      MadeChange |= expandAtomicLoad(LI);
-    else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
-      MadeChange |= expandAtomicStore(SI);
-    else
-      llvm_unreachable("Unknown atomic instruction");
-  }
-
-  return MadeChange;
-}
-
-bool AtomicExpandLoadLinked::expandAtomicLoad(LoadInst *LI) {
-  // Load instructions don't actually need a leading fence, even in the
-  // SequentiallyConsistent case.
-  AtomicOrdering MemOpOrder =
-      TM->getTargetLowering()->getInsertFencesForAtomic() ? Monotonic
-                                                          : LI->getOrdering();
-
-  // The only 64-bit load guaranteed to be single-copy atomic by the ARM ARM is
-  // an ldrexd (A3.5.3).
-  IRBuilder<> Builder(LI);
-  Value *Val = TM->getTargetLowering()->emitLoadLinked(
-      Builder, LI->getPointerOperand(), MemOpOrder);
-
-  insertTrailingFence(Builder, LI->getOrdering());
-
-  LI->replaceAllUsesWith(Val);
-  LI->eraseFromParent();
-
-  return true;
-}
-
-bool AtomicExpandLoadLinked::expandAtomicStore(StoreInst *SI) {
-  // The only atomic 64-bit store on ARM is an strexd that succeeds, which means
-  // we need a loop and the entire instruction is essentially an "atomicrmw
-  // xchg" that ignores the value loaded.
-  IRBuilder<> Builder(SI);
-  AtomicRMWInst *AI =
-      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
-                              SI->getValueOperand(), SI->getOrdering());
-  SI->eraseFromParent();
-
-  // Now we have an appropriate swap instruction, lower it as usual.
-  return expandAtomicRMW(AI);
-}
-
-bool AtomicExpandLoadLinked::expandAtomicRMW(AtomicRMWInst *AI) {
-  AtomicOrdering Order = AI->getOrdering();
-  Value *Addr = AI->getPointerOperand();
-  BasicBlock *BB = AI->getParent();
-  Function *F = BB->getParent();
-  LLVMContext &Ctx = F->getContext();
-
-  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
-  //
-  // The standard expansion we produce is:
-  //     [...]
-  //     fence?
-  // atomicrmw.start:
-  //     %loaded = @load.linked(%addr)
-  //     %new = some_op iN %loaded, %incr
-  //     %stored = @store_conditional(%new, %addr)
-  //     %try_again = icmp i32 ne %stored, 0
-  //     br i1 %try_again, label %loop, label %atomicrmw.end
-  // atomicrmw.end:
-  //     fence?
-  //     [...]
-  BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
-  BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
-
-  // This grabs the DebugLoc from AI.
-  IRBuilder<> Builder(AI);
-
-  // The split call above "helpfully" added a branch at the end of BB (to the
-  // wrong place), but we might want a fence too. It's easiest to just remove
-  // the branch entirely.
-  std::prev(BB->end())->eraseFromParent();
-  Builder.SetInsertPoint(BB);
-  AtomicOrdering MemOpOrder = insertLeadingFence(Builder, Order);
-  Builder.CreateBr(LoopBB);
-
-  // Start the main loop block now that we've taken care of the preliminaries.
-  Builder.SetInsertPoint(LoopBB);
-  Value *Loaded =
-      TM->getTargetLowering()->emitLoadLinked(Builder, Addr, MemOpOrder);
-
-  Value *NewVal;
-  switch (AI->getOperation()) {
-  case AtomicRMWInst::Xchg:
-    NewVal = AI->getValOperand();
-    break;
-  case AtomicRMWInst::Add:
-    NewVal = Builder.CreateAdd(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Sub:
-    NewVal = Builder.CreateSub(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::And:
-    NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Nand:
-    NewVal = Builder.CreateNot(Builder.CreateAnd(Loaded, AI->getValOperand()),
-                               "new");
-    break;
-  case AtomicRMWInst::Or:
-    NewVal = Builder.CreateOr(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Xor:
-    NewVal = Builder.CreateXor(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Max:
-    NewVal = Builder.CreateICmpSGT(Loaded, AI->getValOperand());
-    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Min:
-    NewVal = Builder.CreateICmpSLE(Loaded, AI->getValOperand());
-    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::UMax:
-    NewVal = Builder.CreateICmpUGT(Loaded, AI->getValOperand());
-    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::UMin:
-    NewVal = Builder.CreateICmpULE(Loaded, AI->getValOperand());
-    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
-    break;
-  default:
-    llvm_unreachable("Unknown atomic op");
-  }
-
-  Value *StoreSuccess = TM->getTargetLowering()->emitStoreConditional(
-      Builder, NewVal, Addr, MemOpOrder);
-  Value *TryAgain = Builder.CreateICmpNE(
-      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
-  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
-
-  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
-  insertTrailingFence(Builder, Order);
-
-  AI->replaceAllUsesWith(Loaded);
-  AI->eraseFromParent();
-
-  return true;
-}
-
-bool AtomicExpandLoadLinked::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
-  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
-  AtomicOrdering FailureOrder = CI->getFailureOrdering();
-  Value *Addr = CI->getPointerOperand();
-  BasicBlock *BB = CI->getParent();
-  Function *F = BB->getParent();
-  LLVMContext &Ctx = F->getContext();
-
-  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
-  //
-  // The full expansion we produce is:
-  //     [...]
-  //     fence?
-  // cmpxchg.start:
-  //     %loaded = @load.linked(%addr)
-  //     %should_store = icmp eq %loaded, %desired
-  //     br i1 %should_store, label %cmpxchg.trystore,
-  //                          label %cmpxchg.failure
-  // cmpxchg.trystore:
-  //     %stored = @store_conditional(%new, %addr)
-  //     %success = icmp eq i32 %stored, 0
-  //     br i1 %success, label %cmpxchg.success, label %loop/%cmpxchg.failure
-  // cmpxchg.success:
-  //     fence?
-  //     br label %cmpxchg.end
-  // cmpxchg.failure:
-  //     fence?
-  //     br label %cmpxchg.end
-  // cmpxchg.end:
-  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
-  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
-  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
-  //     [...]
-  BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end");
-  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
-  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB);
-  auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB);
-  auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);
-
-  // This grabs the DebugLoc from CI
-  IRBuilder<> Builder(CI);
-
-  // The split call above "helpfully" added a branch at the end of BB (to the
-  // wrong place), but we might want a fence too. It's easiest to just remove
-  // the branch entirely.
-  std::prev(BB->end())->eraseFromParent();
-  Builder.SetInsertPoint(BB);
-  AtomicOrdering MemOpOrder = insertLeadingFence(Builder, SuccessOrder);
-  Builder.CreateBr(LoopBB);
-
-  // Start the main loop block now that we've taken care of the preliminaries.
-  Builder.SetInsertPoint(LoopBB);
-  Value *Loaded =
-      TM->getTargetLowering()->emitLoadLinked(Builder, Addr, MemOpOrder);
-  Value *ShouldStore =
-      Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store");
-
-  // If the the cmpxchg doesn't actually need any ordering when it fails, we can
-  // jump straight past that fence instruction (if it exists).
-  Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB);
-
-  Builder.SetInsertPoint(TryStoreBB);
-  Value *StoreSuccess = TM->getTargetLowering()->emitStoreConditional(
-      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
-  StoreSuccess = Builder.CreateICmpEQ(
-      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
-  Builder.CreateCondBr(StoreSuccess, SuccessBB,
-                       CI->isWeak() ? FailureBB : LoopBB);
-
-  // Make sure later instructions don't get reordered with a fence if necessary.
-  Builder.SetInsertPoint(SuccessBB);
-  insertTrailingFence(Builder, SuccessOrder);
-  Builder.CreateBr(ExitBB);
-
-  Builder.SetInsertPoint(FailureBB);
-  insertTrailingFence(Builder, FailureOrder);
-  Builder.CreateBr(ExitBB);
-
-  // Finally, we have control-flow based knowledge of whether the cmpxchg
-  // succeeded or not. We expose this to later passes by converting any
-  // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate PHI.
-
-  // Setup the builder so we can create any PHIs we need.
-  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
-  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
-  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
-  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
-
-  // Look for any users of the cmpxchg that are just comparing the loaded value
-  // against the desired one, and replace them with the CFG-derived version.
-  SmallVector<ExtractValueInst *, 2> PrunedInsts;
-  for (auto User : CI->users()) {
-    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
-    if (!EV)
-      continue;
-
-    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
-           "weird extraction from { iN, i1 }");
-
-    if (EV->getIndices()[0] == 0)
-      EV->replaceAllUsesWith(Loaded);
-    else
-      EV->replaceAllUsesWith(Success);
-
-    PrunedInsts.push_back(EV);
-  }
-
-  // We can remove the instructions now we're no longer iterating through them.
-  for (auto EV : PrunedInsts)
-    EV->eraseFromParent();
-
-  if (!CI->use_empty()) {
-    // Some use of the full struct return that we don't understand has happened,
-    // so we've got to reconstruct it properly.
-    Value *Res;
-    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
-    Res = Builder.CreateInsertValue(Res, Success, 1);
-
-    CI->replaceAllUsesWith(Res);
-  }
-
-  CI->eraseFromParent();
-  return true;
-}
-
-AtomicOrdering AtomicExpandLoadLinked::insertLeadingFence(IRBuilder<> &Builder,
-                                                       AtomicOrdering Ord) {
-  if (!TM->getTargetLowering()->getInsertFencesForAtomic())
-    return Ord;
-
-  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
-    Builder.CreateFence(Release);
-
-  // The exclusive operations don't need any barrier if we're adding separate
-  // fences.
-  return Monotonic;
-}
-
-void AtomicExpandLoadLinked::insertTrailingFence(IRBuilder<> &Builder,
-                                              AtomicOrdering Ord) {
-  if (!TM->getTargetLowering()->getInsertFencesForAtomic())
-    return;
-
-  if (Ord == Acquire || Ord == AcquireRelease)
-    Builder.CreateFence(Acquire);
-  else if (Ord == SequentiallyConsistent)
-    Builder.CreateFence(SequentiallyConsistent);
-}

diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
new file mode 100644
index 0000000..12f6bd7
--- /dev/null
+++ b/lib/CodeGen/AtomicExpandPass.cpp

@@ -0,0 +1,563 @@
+//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass (at IR level) to replace atomic instructions with
+// either (intrinsic-based) load-linked/store-conditional loops or AtomicCmpXchg.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "atomic-expand"
+
+namespace {
+  class AtomicExpand: public FunctionPass {
+    const TargetMachine *TM;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit AtomicExpand(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {
+      initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F) override;
+
+  private:
+    bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+                               bool IsStore, bool IsLoad);
+    bool expandAtomicLoad(LoadInst *LI);
+    bool expandAtomicLoadToLL(LoadInst *LI);
+    bool expandAtomicLoadToCmpXchg(LoadInst *LI);
+    bool expandAtomicStore(StoreInst *SI);
+    bool expandAtomicRMW(AtomicRMWInst *AI);
+    bool expandAtomicRMWToLLSC(AtomicRMWInst *AI);
+    bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI);
+    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
+    bool isIdempotentRMW(AtomicRMWInst *AI);
+    bool simplifyIdempotentRMW(AtomicRMWInst *AI);
+  };
+}
+
+char AtomicExpand::ID = 0;
+char &llvm::AtomicExpandID = AtomicExpand::ID;
+INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand",
+    "Expand Atomic calls in terms of either load-linked & store-conditional or cmpxchg",
+    false, false)
+
+FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
+  return new AtomicExpand(TM);
+}
+
+bool AtomicExpand::runOnFunction(Function &F) {
+  if (!TM || !TM->getSubtargetImpl()->enableAtomicExpand())
+    return false;
+  auto TargetLowering = TM->getSubtargetImpl()->getTargetLowering();
+
+  SmallVector<Instruction *, 1> AtomicInsts;
+
+  // Changing control-flow while iterating through it is a bad idea, so gather a
+  // list of all atomic instructions before we start.
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+    if (I->isAtomic())
+      AtomicInsts.push_back(&*I);
+  }
+
+  bool MadeChange = false;
+  for (auto I : AtomicInsts) {
+    auto LI = dyn_cast<LoadInst>(I);
+    auto SI = dyn_cast<StoreInst>(I);
+    auto RMWI = dyn_cast<AtomicRMWInst>(I);
+    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
+    assert((LI || SI || RMWI || CASI || isa<FenceInst>(I)) &&
+           "Unknown atomic instruction");
+
+    auto FenceOrdering = Monotonic;
+    bool IsStore, IsLoad;
+    if (TargetLowering->getInsertFencesForAtomic()) {
+      if (LI && isAtLeastAcquire(LI->getOrdering())) {
+        FenceOrdering = LI->getOrdering();
+        LI->setOrdering(Monotonic);
+        IsStore = false;
+        IsLoad = true;
+      } else if (SI && isAtLeastRelease(SI->getOrdering())) {
+        FenceOrdering = SI->getOrdering();
+        SI->setOrdering(Monotonic);
+        IsStore = true;
+        IsLoad = false;
+      } else if (RMWI && (isAtLeastRelease(RMWI->getOrdering()) ||
+                          isAtLeastAcquire(RMWI->getOrdering()))) {
+        FenceOrdering = RMWI->getOrdering();
+        RMWI->setOrdering(Monotonic);
+        IsStore = IsLoad = true;
+      } else if (CASI && !TargetLowering->hasLoadLinkedStoreConditional() &&
+                    (isAtLeastRelease(CASI->getSuccessOrdering()) ||
+                     isAtLeastAcquire(CASI->getSuccessOrdering()))) {
+        // If a compare and swap is lowered to LL/SC, we can do smarter fence
+        // insertion, with a stronger one on the success path than on the
+        // failure path. As a result, fence insertion is directly done by
+        // expandAtomicCmpXchg in that case.
+        FenceOrdering = CASI->getSuccessOrdering();
+        CASI->setSuccessOrdering(Monotonic);
+        CASI->setFailureOrdering(Monotonic);
+        IsStore = IsLoad = true;
+      }
+
+      if (FenceOrdering != Monotonic) {
+        MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
+      }
+    }
+
+    if (LI && TargetLowering->shouldExpandAtomicLoadInIR(LI)) {
+      MadeChange |= expandAtomicLoad(LI);
+    } else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) {
+      MadeChange |= expandAtomicStore(SI);
+    } else if (RMWI) {
+      // There are two different ways of expanding RMW instructions:
+      // - into a load if it is idempotent
+      // - into a Cmpxchg/LL-SC loop otherwise
+      // we try them in that order.
+      MadeChange |= (isIdempotentRMW(RMWI) &&
+                        simplifyIdempotentRMW(RMWI)) ||
+                    (TargetLowering->shouldExpandAtomicRMWInIR(RMWI) &&
+                        expandAtomicRMW(RMWI));
+    } else if (CASI && TargetLowering->hasLoadLinkedStoreConditional()) {
+      MadeChange |= expandAtomicCmpXchg(CASI);
+    }
+  }
+  return MadeChange;
+}
+
+bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+                                         bool IsStore, bool IsLoad) {
+  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
+  IRBuilder<> Builder(I);
+
+  // Both fences are requested from the target through the same builder, so
+  // whatever gets emitted initially lands in front of the instruction.
+  auto Leading = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
+  auto Trailing = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
+
+  // There is no easy way to point the builder just past an instruction, which
+  // is why the trailing fence was emitted before I. Move it to its proper
+  // place now: unlink it from the block and re-insert it right after I.
+  // Not every atomic operation produces a trailing fence, hence the null
+  // check.
+  if (Trailing) {
+    Trailing->removeFromParent();
+    Trailing->insertAfter(I);
+  }
+
+  // Report a change if at least one fence was emitted.
+  const bool EmittedAnyFence = Leading || Trailing;
+  return EmittedAnyFence;
+}
+
+bool AtomicExpand::expandAtomicLoad(LoadInst *LI) {
+  // Use a load-linked when the target has LL/SC; otherwise go through the
+  // cmpxchg-based expansion.
+  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
+  if (!TLI->hasLoadLinkedStoreConditional())
+    return expandAtomicLoadToCmpXchg(LI);
+  return expandAtomicLoadToLL(LI);
+}
+
+bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
+  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
+  IRBuilder<> Builder(LI);
+  Value *Addr = LI->getPointerOperand();
+  AtomicOrdering Order = LI->getOrdering();
+
+  // A load-linked can be atomic at wider sizes than an ordinary load on some
+  // architectures: ARM, for instance, only guarantees single-copy atomicity
+  // for 64-bit loads performed with ldrexd (A3.5.3).
+  Value *Linked = TLI->emitLoadLinked(Builder, Addr, Order);
+
+  LI->replaceAllUsesWith(Linked);
+  LI->eraseFromParent();
+  return true;
+}
+
+bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
+  Value *Ptr = LI->getPointerOperand();
+  AtomicOrdering SuccessOrd = LI->getOrdering();
+  AtomicOrdering FailureOrd =
+      AtomicCmpXchgInst::getStrongestFailureOrdering(SuccessOrd);
+  // The same null constant is both the expected and the replacement value, so
+  // the cmpxchg never changes what is stored; only its loaded result is used.
+  Type *ValTy = cast<PointerType>(Ptr->getType())->getElementType();
+  Constant *Zero = Constant::getNullValue(ValTy);
+  IRBuilder<> Builder(LI);
+  Value *Result =
+      Builder.CreateAtomicCmpXchg(Ptr, Zero, Zero, SuccessOrd, FailureOrd);
+  Value *Loaded = Builder.CreateExtractValue(Result, 0, "loaded");
+  LI->replaceAllUsesWith(Loaded);
+  LI->eraseFromParent();
+  return true;
+}
+
+bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
+  // Only atomic stores too wide to be atomic as a native store reach this
+  // function. They are rewritten as an atomic swap, which can be implemented
+  // as e.g. ldrex/strex on ARM or lock cmpxchg8/16b on X86, since those stay
+  // atomic at larger sizes. The target is responsible for only returning true
+  // from shouldExpandAtomicRMW when this is both required and possible.
+  Value *Addr = SI->getPointerOperand();
+  Value *NewVal = SI->getValueOperand();
+  AtomicOrdering Order = SI->getOrdering();
+  IRBuilder<> Builder(SI);
+  AtomicRMWInst *Swap =
+      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, Addr, NewVal, Order);
+  SI->eraseFromParent();
+  // The swap is now an ordinary atomicrmw; expand it like any other.
+  return expandAtomicRMW(Swap);
+}
+
+bool AtomicExpand::expandAtomicRMW(AtomicRMWInst *AI) {
+  // Targets with load-linked/store-conditional get an LL/SC loop; all others
+  // get a cmpxchg loop.
+  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
+  if (!TLI->hasLoadLinkedStoreConditional())
+    return expandAtomicRMWToCmpXchg(AI);
+  return expandAtomicRMWToLLSC(AI);
+}
+
+/// Build, with \p Builder, the IR that computes the value an atomicrmw of kind
+/// \p Op produces from the \p Loaded value and its operand \p Inc.
+static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
+                              Value *Loaded, Value *Inc) {
+  switch (Op) {
+  case AtomicRMWInst::Xchg:
+    return Inc;
+  case AtomicRMWInst::Add:
+    return Builder.CreateAdd(Loaded, Inc, "new");
+  case AtomicRMWInst::Sub:
+    return Builder.CreateSub(Loaded, Inc, "new");
+  case AtomicRMWInst::And:
+    return Builder.CreateAnd(Loaded, Inc, "new");
+  case AtomicRMWInst::Nand:
+    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
+  case AtomicRMWInst::Or:
+    return Builder.CreateOr(Loaded, Inc, "new");
+  case AtomicRMWInst::Xor:
+    return Builder.CreateXor(Loaded, Inc, "new");
+  // Min/max family: the comparison states when the loaded value is kept.
+  case AtomicRMWInst::Max:
+    return Builder.CreateSelect(Builder.CreateICmpSGT(Loaded, Inc), Loaded,
+                                Inc, "new");
+  case AtomicRMWInst::Min:
+    return Builder.CreateSelect(Builder.CreateICmpSLE(Loaded, Inc), Loaded,
+                                Inc, "new");
+  case AtomicRMWInst::UMax:
+    return Builder.CreateSelect(Builder.CreateICmpUGT(Loaded, Inc), Loaded,
+                                Inc, "new");
+  case AtomicRMWInst::UMin:
+    return Builder.CreateSelect(Builder.CreateICmpULE(Loaded, Inc), Loaded,
+                                Inc, "new");
+  default:
+    llvm_unreachable("Unknown atomic op");
+  }
+}
+
+bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) {
+  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
+  AtomicOrdering MemOpOrder = AI->getOrdering();
+  Value *Addr = AI->getPointerOperand();
+  BasicBlock *BB = AI->getParent();
+  Function *F = BB->getParent();
+  LLVMContext &Ctx = F->getContext();
+
+  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+  //
+  // The standard expansion we produce is:
+  //     [...]
+  //     fence?
+  // atomicrmw.start:
+  //     %loaded = @load.linked(%addr)
+  //     %new = some_op iN %loaded, %incr
+  //     %stored = @store_conditional(%new, %addr)
+  //     %try_again = icmp i32 ne %stored, 0
+  //     br i1 %try_again, label %loop, label %atomicrmw.end
+  // atomicrmw.end:
+  //     fence?
+  //     [...]
+  BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
+  BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+  // This grabs the DebugLoc from AI.
+  IRBuilder<> Builder(AI);
+
+  // The split call above "helpfully" added a branch at the end of BB (to the
+  // wrong place). It's easiest to just remove that branch entirely and create
+  // the correct one, to the loop block, ourselves.
+  std::prev(BB->end())->eraseFromParent();
+  Builder.SetInsertPoint(BB);
+  Builder.CreateBr(LoopBB);
+
+  // Start the main loop block now that we've taken care of the preliminaries.
+  Builder.SetInsertPoint(LoopBB);
+  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+
+  Value *NewVal =
+      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());
+
+  Value *StoreSuccess =
+      TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
+  Value *TryAgain = Builder.CreateICmpNE(
+      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
+  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+
+  return true;
+}
+
+bool AtomicExpand::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI) {
+  AtomicOrdering MemOpOrder =
+      AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
+  Value *Addr = AI->getPointerOperand();
+  BasicBlock *BB = AI->getParent();
+  Function *F = BB->getParent();
+  LLVMContext &Ctx = F->getContext();
+
+  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+  //
+  // The standard expansion we produce is:
+  //     [...]
+  //     %init_loaded = load atomic iN* %addr
+  //     br label %loop
+  // loop:
+  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
+  //     %new = some_op iN %loaded, %incr
+  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
+  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
+  //     %success = extractvalue { iN, i1 } %pair, 1
+  //     br i1 %success, label %atomicrmw.end, label %loop
+  // atomicrmw.end:
+  //     [...]
+  BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
+  BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+  // This grabs the DebugLoc from AI.
+  IRBuilder<> Builder(AI);
+
+  // The split call above "helpfully" added a branch at the end of BB (to the
+  // wrong place), but we want a load. It's easiest to just remove
+  // the branch entirely.
+  std::prev(BB->end())->eraseFromParent();
+  Builder.SetInsertPoint(BB);
+  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
+  // Atomics require at least natural alignment; setAlignment() takes bytes.
+  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits() / 8);
+  Builder.CreateBr(LoopBB);
+
+  // Start the main loop block now that we've taken care of the preliminaries.
+  Builder.SetInsertPoint(LoopBB);
+  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
+  Loaded->addIncoming(InitLoaded, BB);
+
+  Value *NewVal =
+      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());
+
+  Value *Pair = Builder.CreateAtomicCmpXchg(
+      Addr, Loaded, NewVal, MemOpOrder,
+      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
+  Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+  Loaded->addIncoming(NewLoaded, LoopBB);
+
+  Value *Success = Builder.CreateExtractValue(Pair, 1, "success");
+  Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+  // ExitBB is only reached once the cmpxchg succeeded, so NewLoaded is the
+  // value that was in memory right before the update, i.e. AI's result.
+  AI->replaceAllUsesWith(NewLoaded);
+  AI->eraseFromParent();
+
+  return true;
+}
+
+bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
+  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
+  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
+  AtomicOrdering FailureOrder = CI->getFailureOrdering();
+  Value *Addr = CI->getPointerOperand();
+  BasicBlock *BB = CI->getParent();
+  Function *F = BB->getParent();
+  LLVMContext &Ctx = F->getContext();
+  // If getInsertFencesForAtomic() returns true, then the target does not want
+  // to deal with memory orders, and emitLeading/TrailingFence should take care
+  // of everything. Otherwise, emitLeading/TrailingFence are no-op and we
+  // should preserve the ordering.
+  AtomicOrdering MemOpOrder =
+      TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder;
+
+  // Given: cmpxchg iN* %addr, iN %desired, iN %new success_ord fail_ord
+  //
+  // The full expansion we produce is:
+  //     [...]
+  //     fence?
+  // cmpxchg.start:
+  //     %loaded = @load.linked(%addr)
+  //     %should_store = icmp eq %loaded, %desired
+  //     br i1 %should_store, label %cmpxchg.trystore,
+  //                          label %cmpxchg.failure
+  // cmpxchg.trystore:
+  //     %stored = @store_conditional(%new, %addr)
+  //     %success = icmp eq i32 %stored, 0
+  //     br i1 %success, label %cmpxchg.success, label %loop/%cmpxchg.failure
+  // cmpxchg.success:
+  //     fence?
+  //     br label %cmpxchg.end
+  // cmpxchg.failure:
+  //     fence?
+  //     br label %cmpxchg.end
+  // cmpxchg.end:
+  //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
+  //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
+  //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
+  //     [...]
+  BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end");
+  auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
+  auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, FailureBB);
+  auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, SuccessBB);
+  auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);
+
+  // This grabs the DebugLoc from CI.
+  IRBuilder<> Builder(CI);
+
+  // The split call above "helpfully" added a branch at the end of BB (to the
+  // wrong place), but we might want a fence too. It's easiest to just remove
+  // the branch entirely.
+  std::prev(BB->end())->eraseFromParent();
+  Builder.SetInsertPoint(BB);
+  TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
+                        /*IsLoad=*/true);
+  Builder.CreateBr(LoopBB);
+
+  // Start the main loop block now that we've taken care of the preliminaries.
+  Builder.SetInsertPoint(LoopBB);
+  Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+  Value *ShouldStore =
+      Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store");
+
+  // If the loaded value does not match the expected one, skip the store
+  // attempt and branch straight to the failure block.
+  Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB);
+
+  Builder.SetInsertPoint(TryStoreBB);
+  Value *StoreSuccess = TLI->emitStoreConditional(
+      Builder, CI->getNewValOperand(), Addr, MemOpOrder);
+  StoreSuccess = Builder.CreateICmpEQ(
+      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
+  Builder.CreateCondBr(StoreSuccess, SuccessBB,
+                       CI->isWeak() ? FailureBB : LoopBB);
+
+  // Make sure later instructions don't get reordered with a fence if necessary.
+  Builder.SetInsertPoint(SuccessBB);
+  TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
+                         /*IsLoad=*/true);
+  Builder.CreateBr(ExitBB);
+
+  Builder.SetInsertPoint(FailureBB);
+  TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
+                         /*IsLoad=*/true);
+  Builder.CreateBr(ExitBB);
+
+  // Finally, we have control-flow based knowledge of whether the cmpxchg
+  // succeeded or not. We expose this to later passes by replacing uses of the
+  // cmpxchg's i1 success result with a PHI over the success/failure blocks.
+
+  // Setup the builder so we can create any PHIs we need.
+  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+  PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
+  Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
+  Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
+
+  // Look for extractvalue users of the cmpxchg: element 0 is replaced by the
+  // loaded value and element 1 by the CFG-derived success PHI.
+  SmallVector<ExtractValueInst *, 2> PrunedInsts;
+  for (auto User : CI->users()) {
+    ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
+    if (!EV)
+      continue;
+
+    assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
+           "weird extraction from { iN, i1 }");
+
+    if (EV->getIndices()[0] == 0)
+      EV->replaceAllUsesWith(Loaded);
+    else
+      EV->replaceAllUsesWith(Success);
+
+    PrunedInsts.push_back(EV);
+  }
+
+  // We can remove the instructions now we're no longer iterating through them.
+  for (auto EV : PrunedInsts)
+    EV->eraseFromParent();
+
+  if (!CI->use_empty()) {
+    // Some use of the full struct return that we don't understand has happened,
+    // so we've got to reconstruct it properly.
+    Value *Res;
+    Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
+    Res = Builder.CreateInsertValue(Res, Success, 1);
+
+    CI->replaceAllUsesWith(Res);
+  }
+
+  CI->eraseFromParent();
+  return true;
+}
+
+/// Return true when \p RMWI applies an operation whose constant integer
+/// operand is that operation's identity value, so the stored value cannot
+/// change.
+bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
+  // Only constant integer operands are recognised.
+  auto Operand = dyn_cast<ConstantInt>(RMWI->getValOperand());
+  if (Operand == nullptr)
+    return false;
+
+  switch (RMWI->getOperation()) {
+    case AtomicRMWInst::And:
+      // x & all-ones leaves x unchanged.
+      return Operand->isMinusOne();
+    case AtomicRMWInst::Add:
+    case AtomicRMWInst::Sub:
+    case AtomicRMWInst::Or:
+    case AtomicRMWInst::Xor:
+      // x + 0, x - 0, x | 0 and x ^ 0 all leave x unchanged.
+      return Operand->isZero();
+    // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
+    default:
+      return false;
+  }
+}
+
+/// Ask the target to lower \p RMWI into a fenced load and, if the target also
+/// wants that load expanded in IR, expand it immediately.
+///
+/// \returns true if the target produced a replacement load, false otherwise.
+bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
+  auto Lowering = TM->getSubtargetImpl()->getTargetLowering();
+
+  auto FencedLoad = Lowering->lowerIdempotentRMWIntoFencedLoad(RMWI);
+  if (!FencedLoad)
+    return false;
+
+  if (Lowering->shouldExpandAtomicLoadInIR(FencedLoad))
+    expandAtomicLoad(FencedLoad);
+  return true;
+}

diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index b2737bf..b9b1fd8 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp

@@ -42,7 +42,9 @@
   /// Estimate the cost overhead of SK_Alternate shuffle.
   unsigned getAltShuffleOverhead(Type *Ty) const;
 
-  const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
+  const TargetLoweringBase *getTLI() const {
+    return TM->getSubtargetImpl()->getTargetLowering();
+  }
 
 public:
   BasicTTI() : ImmutablePass(ID), TM(nullptr) {
@@ -90,7 +92,7 @@
   unsigned getJumpBufSize() const override;
   bool shouldBuildLookupTables() const override;
   bool haveFastSqrt(Type *Ty) const override;
-  void getUnrollingPreferences(Loop *L,
+  void getUnrollingPreferences(const Function *F, Loop *L,
                                UnrollingPreferences &UP) const override;
 
   /// @}
@@ -99,10 +101,11 @@
   /// @{
 
   unsigned getNumberOfRegisters(bool Vector) const override;
-  unsigned getMaximumUnrollFactor() const override;
+  unsigned getMaxInterleaveFactor() const override;
   unsigned getRegisterBitWidth(bool Vector) const override;
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
-                                  OperandValueKind) const override;
+                                  OperandValueKind, OperandValueProperties,
+                                  OperandValueProperties) const override;
   unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                           int Index, Type *SubTp) const override;
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
@@ -186,9 +189,8 @@
 
 bool BasicTTI::shouldBuildLookupTables() const {
   const TargetLoweringBase *TLI = getTLI();
-  return TLI->supportJumpTables() &&
-      (TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
-       TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
+  return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+         TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
 }
 
 bool BasicTTI::haveFastSqrt(Type *Ty) const {
@@ -197,7 +199,7 @@
   return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
 }
 
-void BasicTTI::getUnrollingPreferences(Loop *L,
+void BasicTTI::getUnrollingPreferences(const Function *F, Loop *L,
                                        UnrollingPreferences &UP) const {
   // This unrolling functionality is target independent, but to provide some
   // motivation for its intended use, for x86:
@@ -223,11 +225,11 @@
   // until someone finds a case where it matters in practice.
 
   unsigned MaxOps;
-  const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>(F);
   if (PartialUnrollingThreshold.getNumOccurrences() > 0)
     MaxOps = PartialUnrollingThreshold;
-  else if (ST->getSchedModel()->LoopMicroOpBufferSize > 0)
-    MaxOps = ST->getSchedModel()->LoopMicroOpBufferSize;
+  else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
+    MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
   else
     return;
 
@@ -282,13 +284,14 @@
   return 32;
 }
 
-unsigned BasicTTI::getMaximumUnrollFactor() const {
+unsigned BasicTTI::getMaxInterleaveFactor() const {
   return 1;
 }
 
 unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                          OperandValueKind,
-                                          OperandValueKind) const {
+                                          OperandValueKind, OperandValueKind,
+                                          OperandValueProperties,
+                                          OperandValueProperties) const {
   // Check if any of the operands are vector operands.
   const TargetLoweringBase *TLI = getTLI();
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -465,7 +468,8 @@
 
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
 
-  if (!TLI->isOperationExpand(ISD, LT.second)) {
+  if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
+      !TLI->isOperationExpand(ISD, LT.second)) {
     // The operation is legal. Assume it costs 1. Multiply
     // by the type-legalization overhead.
     return LT.first * 1;
@@ -561,6 +565,8 @@
   case Intrinsic::log10:   ISD = ISD::FLOG10; break;
   case Intrinsic::log2:    ISD = ISD::FLOG2;  break;
   case Intrinsic::fabs:    ISD = ISD::FABS;   break;
+  case Intrinsic::minnum:  ISD = ISD::FMINNUM; break;
+  case Intrinsic::maxnum:  ISD = ISD::FMAXNUM; break;
   case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break;
   case Intrinsic::floor:   ISD = ISD::FFLOOR; break;
   case Intrinsic::ceil:    ISD = ISD::FCEIL;  break;
@@ -572,6 +578,7 @@
   case Intrinsic::pow:     ISD = ISD::FPOW;   break;
   case Intrinsic::fma:     ISD = ISD::FMA;    break;
   case Intrinsic::fmuladd: ISD = ISD::FMA;    break;
+  // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
   case Intrinsic::lifetime_start:
   case Intrinsic::lifetime_end:
     return 0;
@@ -582,7 +589,7 @@
 
   if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
     // The operation is legal. Assume it costs 1.
-    // If the type is split to multiple registers, assume that thre is some
+    // If the type is split to multiple registers, assume that there is some
     // overhead to this.
     // TODO: Once we have extract/insert subvector cost we need to use them.
     if (LT.first > 1)

diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 7503e57..2128da1 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp

@@ -20,6 +20,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
@@ -32,8 +34,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -70,6 +72,8 @@
     bool runOnMachineFunction(MachineFunction &MF) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineBlockFrequencyInfo>();
+      AU.addRequired<MachineBranchProbabilityInfo>();
       AU.addRequired<TargetPassConfig>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -91,22 +95,24 @@
   // HW that requires structurized CFG.
   bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
       PassConfig->getEnableTailMerge();
-  BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true);
-  return Folder.OptimizeFunction(MF,
-                                 MF.getTarget().getInstrInfo(),
-                                 MF.getTarget().getRegisterInfo(),
+  BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true,
+                      getAnalysis<MachineBlockFrequencyInfo>(),
+                      getAnalysis<MachineBranchProbabilityInfo>());
+  return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
+                                 MF.getSubtarget().getRegisterInfo(),
                                  getAnalysisIfAvailable<MachineModuleInfo>());
 }
 
-
-BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist) {
+BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
+                           const MachineBlockFrequencyInfo &FreqInfo,
+                           const MachineBranchProbabilityInfo &ProbInfo)
+    : EnableHoistCommonCode(CommonHoist), MBBFreqInfo(FreqInfo),
+      MBPI(ProbInfo) {
   switch (FlagEnableTailMerge) {
   case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break;
   case cl::BOU_TRUE: EnableTailMerge = true; break;
   case cl::BOU_FALSE: EnableTailMerge = false; break;
   }
-
-  EnableHoistCommonCode = CommonHoist;
 }
 
 /// RemoveDeadBlock - Remove the specified dead machine basic block from the
@@ -388,10 +394,8 @@
     RS->enterBasicBlock(CurMBB);
     if (!CurMBB->empty())
       RS->forward(std::prev(CurMBB->end()));
-    BitVector RegsLiveAtExit(TRI->getNumRegs());
-    RS->getRegsUsed(RegsLiveAtExit, false);
-    for (unsigned int i = 0, e = TRI->getNumRegs(); i != e; i++)
-      if (RegsLiveAtExit[i])
+    for (unsigned int i = 1, e = TRI->getNumRegs(); i != e; i++)
+      if (RS->isRegUsed(i, false))
         NewMBB->addLiveIn(i);
   }
 }
@@ -435,6 +439,9 @@
   // Splice the code over.
   NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
 
+  // NewMBB inherits CurMBB's block frequency.
+  MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB));
+
   // For targets that use the register scavenger, we must maintain LiveIns.
   MaintainLiveIns(&CurMBB, NewMBB);
 
@@ -504,6 +511,21 @@
 #endif
 }
 
+BlockFrequency
+BranchFolder::MBFIWrapper::getBlockFreq(const MachineBasicBlock *MBB) const {
+  // A frequency stored in MergedBBFreq takes precedence over the value
+  // reported by the wrapped MBFI.
+  auto Override = MergedBBFreq.find(MBB);
+  if (Override == MergedBBFreq.end())
+    return MBFI.getBlockFreq(MBB);
+
+  return Override->second;
+}
+
+void BranchFolder::MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
+                                             BlockFrequency F) {
+  // Create the entry for MBB if needed, then store (or overwrite) its value.
+  auto &Slot = MergedBBFreq[MBB];
+  Slot = F;
+}
+
 /// CountTerminators - Count the number of terminators in the given
 /// block and set I to the position of the first non-terminator, if there
 /// is one, or MBB->end() otherwise.
@@ -806,6 +828,10 @@
     }
 
     MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+    // Recompute common tail MBB's edge weights and block frequency.
+    setCommonTailEdgeWeights(*MBB);
+
     // MBB is common tail.  Adjust all other BB's to jump to this one.
     // Traversal must be forwards so erases work.
     DEBUG(dbgs() << "\nUsing common tail in BB#" << MBB->getNumber()
@@ -890,7 +916,7 @@
         continue;
 
       // Visit each predecessor only once.
-      if (!UniquePreds.insert(PBB))
+      if (!UniquePreds.insert(PBB).second)
         continue;
 
       // Skip blocks which may jump to a landing pad. Can't tail merge these.
@@ -968,6 +994,44 @@
   return MadeChange;
 }
 
+/// Set the block frequency of \p TailMBB to the sum of the frequencies of the
+/// blocks in SameTails and, when it has more than one successor, recompute
+/// its successor edge weights from those blocks' edge probabilities.
+void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
+  // Per-edge weights are only recomputed when the tail has two or more
+  // successors; the block frequency is always updated.
+  const bool HasMultipleSuccs = TailMBB.succ_size() > 1;
+  SmallVector<BlockFrequency, 2> SuccFreqs(TailMBB.succ_size());
+  BlockFrequency TotalFreq;
+
+  // Walk every block bb in SameTails, accumulating
+  //   TotalFreq    += freq(bb)
+  //   SuccFreqs[j] += freq(bb) * edgeProb(bb, successor j of TailMBB)
+  for (const auto &Entry : SameTails) {
+    const MachineBasicBlock *MergedMBB = Entry.getBlock();
+    BlockFrequency MergedFreq = MBBFreqInfo.getBlockFreq(MergedMBB);
+    TotalFreq += MergedFreq;
+
+    if (HasMultipleSuccs) {
+      auto Slot = SuccFreqs.begin();
+      for (auto SI = TailMBB.succ_begin(), SE = TailMBB.succ_end(); SI != SE;
+           ++SI, ++Slot)
+        *Slot += MergedFreq * MBPI.getEdgeProbability(MergedMBB, *SI);
+    }
+  }
+
+  MBBFreqInfo.setBlockFreq(&TailMBB, TotalFreq);
+
+  if (!HasMultipleSuccs)
+    return;
+
+  // Divide every edge frequency by a common factor derived from the largest
+  // one, so that no resulting weight exceeds UINT32_MAX.
+  BlockFrequency Largest =
+      *std::max_element(SuccFreqs.begin(), SuccFreqs.end());
+  uint64_t Divisor = Largest.getFrequency() / UINT32_MAX + 1;
+
+  auto Slot = SuccFreqs.begin();
+  for (auto SI = TailMBB.succ_begin(), SE = TailMBB.succ_end(); SI != SE;
+       ++SI, ++Slot)
+    TailMBB.setSuccWeight(SI, Slot->getFrequency() / Divisor);
+}
+
 //===----------------------------------------------------------------------===//
 //  Branch Optimization
 //===----------------------------------------------------------------------===//

diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h
index 0d15ed7..3653a2c 100644
--- a/lib/CodeGen/BranchFolding.h
+++ b/lib/CodeGen/BranchFolding.h

@@ -7,14 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_BRANCHFOLDING_HPP
-#define LLVM_CODEGEN_BRANCHFOLDING_HPP
+#ifndef LLVM_LIB_CODEGEN_BRANCHFOLDING_H
+#define LLVM_LIB_CODEGEN_BRANCHFOLDING_H
 
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Support/BlockFrequency.h"
 #include <vector>
 
 namespace llvm {
+  class MachineBlockFrequencyInfo;
+  class MachineBranchProbabilityInfo;
   class MachineFunction;
   class MachineModuleInfo;
   class RegScavenger;
@@ -23,7 +26,9 @@
 
   class BranchFolder {
   public:
-    explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist);
+    explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
+                          const MachineBlockFrequencyInfo &MBFI,
+                          const MachineBranchProbabilityInfo &MBPI);
 
     bool OptimizeFunction(MachineFunction &MF,
                           const TargetInstrInfo *tii,
@@ -92,9 +97,26 @@
     MachineModuleInfo *MMI;
     RegScavenger *RS;
 
+    /// \brief Wrapper around MachineBlockFrequencyInfo that additionally
+    /// stores frequencies for newly created blocks and tail-merged blocks.
+    class MBFIWrapper {
+    public:
+      MBFIWrapper(const MachineBlockFrequencyInfo &Info) : MBFI(Info) {}
+
+      /// Return the block frequency of \p MBB.
+      BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const;
+
+      /// Record \p F as the block frequency of \p MBB.
+      void setBlockFreq(const MachineBasicBlock *MBB, BlockFrequency F);
+
+    private:
+      /// The wrapped analysis, held by const reference.
+      const MachineBlockFrequencyInfo &MBFI;
+      /// Per-block frequencies keyed by basic block.
+      DenseMap<const MachineBasicBlock *, BlockFrequency> MergedBBFreq;
+    };
+
+    MBFIWrapper MBBFreqInfo;
+    const MachineBranchProbabilityInfo &MBPI;
+
     bool TailMergeBlocks(MachineFunction &MF);
     bool TryTailMergeBlocks(MachineBasicBlock* SuccBB,
                        MachineBasicBlock* PredBB);
+    void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB);
     void MaintainLiveIns(MachineBasicBlock *CurMBB,
                          MachineBasicBlock *NewMBB);
     void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,

diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 57c24e8..092346b 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt

@@ -2,7 +2,7 @@
   AggressiveAntiDepBreaker.cpp
   AllocationOrder.cpp
   Analysis.cpp
-  AtomicExpandLoadLinkedPass.cpp
+  AtomicExpandPass.cpp
   BasicTargetTransformInfo.cpp
   BranchFolding.cpp
   CalcSpillWeights.cpp
@@ -19,6 +19,7 @@
   ExecutionDepsFix.cpp
   ExpandISelPseudos.cpp
   ExpandPostRAPseudos.cpp
+  ForwardControlFlowIntegrity.cpp
   GCMetadata.cpp
   GCMetadataPrinter.cpp
   GCStrategy.cpp
@@ -27,7 +28,6 @@
   InlineSpiller.cpp
   InterferenceCache.cpp
   IntrinsicLowering.cpp
-  JITCodeEmitter.cpp
   JumpInstrTables.cpp
   LLVMTargetMachine.cpp
   LatencyPriorityQueue.cpp
@@ -48,9 +48,10 @@
   MachineBlockPlacement.cpp
   MachineBranchProbabilityInfo.cpp
   MachineCSE.cpp
-  MachineCodeEmitter.cpp
+  MachineCombiner.cpp
   MachineCopyPropagation.cpp
   MachineDominators.cpp
+  MachineDominanceFrontier.cpp
   MachineFunction.cpp
   MachineFunctionAnalysis.cpp
   MachineFunctionPass.cpp
@@ -64,6 +65,7 @@
   MachinePassRegistry.cpp
   MachinePostDominators.cpp
   MachineRegisterInfo.cpp
+  MachineRegionInfo.cpp
   MachineSSAUpdater.cpp
   MachineScheduler.cpp
   MachineSink.cpp
@@ -96,7 +98,6 @@
   SjLjEHPrepare.cpp
   SlotIndexes.cpp
   SpillPlacement.cpp
-  Spiller.cpp
   SplitKit.cpp
   StackColoring.cpp
   StackProtector.cpp

diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index bc033f9..d08fae0 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp

@@ -16,8 +16,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "calcspillweights"
@@ -95,11 +95,12 @@
 void
 VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) {
   MachineRegisterInfo &mri = MF.getRegInfo();
-  const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo &tri = *MF.getSubtarget().getRegisterInfo();
   MachineBasicBlock *mbb = nullptr;
   MachineLoop *loop = nullptr;
   bool isExiting = false;
   float totalWeight = 0;
+  unsigned numInstr = 0; // Number of instructions using li
   SmallPtrSet<MachineInstr*, 8> visited;
 
   // Find the best physreg hint and the best virtreg hint.
@@ -116,9 +117,10 @@
        I = mri.reg_instr_begin(li.reg), E = mri.reg_instr_end();
        I != E; ) {
     MachineInstr *mi = &*(I++);
+    numInstr++;
     if (mi->isIdentityCopy() || mi->isImplicitDef() || mi->isDebugValue())
       continue;
-    if (!visited.insert(mi))
+    if (!visited.insert(mi).second)
       continue;
 
     float weight = 1.0f;
@@ -186,8 +188,8 @@
   // it is a preferred candidate for spilling.
   // FIXME: this gets much more complicated once we support non-trivial
   // re-materialization.
-  if (isRematerializable(li, LIS, *MF.getTarget().getInstrInfo()))
+  if (isRematerializable(li, LIS, *MF.getSubtarget().getInstrInfo()))
     totalWeight *= 0.5F;
 
-  li.weight = normalize(totalWeight, li.getSize());
+  li.weight = normalize(totalWeight, li.getSize(), numInstr);
 }

diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index add861a..56ecde0 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp

@@ -19,16 +19,15 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
-                 const TargetMachine &tm, SmallVectorImpl<CCValAssign> &locs,
-                 LLVMContext &C)
-  : CallingConv(CC), IsVarArg(isVarArg), MF(mf), TM(tm),
-    TRI(*TM.getRegisterInfo()), Locs(locs), Context(C),
-    CallOrPrologue(Unknown) {
+                 SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
+    : CallingConv(CC), IsVarArg(isVarArg), MF(mf),
+      TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C),
+      CallOrPrologue(Unknown) {
   // No stack is used.
   StackOffset = 0;
 
@@ -50,7 +49,8 @@
   if (MinAlign > (int)Align)
     Align = MinAlign;
   MF.getFrameInfo()->ensureMaxAlignment(Align);
-  TM.getTargetLowering()->HandleByVal(this, Size, Align);
+  MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, Align);
+  Size = unsigned(RoundUpToAlignment(Size, MinAlign));
   unsigned Offset = AllocateStack(Size, Align);
   addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
 }

diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index b3beac3..307dec5 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp

@@ -20,7 +20,7 @@
 
 /// initializeCodeGen - Initialize all passes linked into the CodeGen library.
 void llvm::initializeCodeGen(PassRegistry &Registry) {
-  initializeAtomicExpandLoadLinkedPass(Registry);
+  initializeAtomicExpandPass(Registry);
   initializeBasicTTIPass(Registry);
   initializeBranchFolderPassPass(Registry);
   initializeCodeGenPreparePass(Registry);
@@ -41,6 +41,7 @@
   initializeMachineBlockPlacementPass(Registry);
   initializeMachineBlockPlacementStatsPass(Registry);
   initializeMachineCopyPropagationPass(Registry);
+  initializeMachineCombinerPass(Registry);
   initializeMachineCSEPass(Registry);
   initializeMachineDominatorTreePass(Registry);
   initializeMachinePostDominatorTreePass(Registry);

diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index ccac40c..8d20848 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp

@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -63,6 +64,7 @@
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
 STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
+STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
@@ -80,15 +82,29 @@
    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
    cl::desc("Enable sinkinig and/cmp into branches."));
 
+static cl::opt<bool> DisableStoreExtract(
+    "disable-cgp-store-extract", cl::Hidden, cl::init(false),
+    cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
+
+static cl::opt<bool> StressStoreExtract(
+    "stress-cgp-store-extract", cl::Hidden, cl::init(false),
+    cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
-typedef DenseMap<Instruction *, Type *> InstrToOrigTy;
+/// A type paired with a boolean flag; used as the mapped value of
+/// InstrToOrigTy.
+// NOTE(review): IsSExt presumably records whether the associated extension is
+// a sign extension -- confirm against the users of InstrToOrigTy.
+struct TypeIsSExt {
+  Type *Ty;
+  bool IsSExt;
+
+  TypeIsSExt(Type *OrigTy, bool SExt) : Ty(OrigTy), IsSExt(SExt) {}
+};
+typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
 
   class CodeGenPrepare : public FunctionPass {
     /// TLI - Keep a pointer of a TargetLowering to consult for determining
     /// transformation profitability.
     const TargetMachine *TM;
     const TargetLowering *TLI;
+    const TargetTransformInfo *TTI;
     const TargetLibraryInfo *TLInfo;
     DominatorTree *DT;
 
@@ -118,7 +134,7 @@
   public:
     static char ID; // Pass identification, replacement for typeid
     explicit CodeGenPrepare(const TargetMachine *TM = nullptr)
-      : FunctionPass(ID), TM(TM), TLI(nullptr) {
+        : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr) {
         initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
       }
     bool runOnFunction(Function &F) override;
@@ -128,6 +144,7 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetTransformInfo>();
     }
 
   private:
@@ -144,6 +161,7 @@
     bool OptimizeExtUses(Instruction *I);
     bool OptimizeSelectInst(SelectInst *SI);
     bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI);
+    bool OptimizeExtractElementInst(Instruction *Inst);
     bool DupRetToEnableTailCallOpts(BasicBlock *BB);
     bool PlaceDbgValues(Function &F);
     bool sinkAndCmp(Function &F);
@@ -168,8 +186,10 @@
   PromotedInsts.clear();
 
   ModifiedDT = false;
-  if (TM) TLI = TM->getTargetLowering();
+  if (TM)
+    TLI = TM->getSubtargetImpl()->getTargetLowering();
   TLInfo = &getAnalysis<TargetLibraryInfo>();
+  TTI = &getAnalysis<TargetTransformInfo>();
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
@@ -662,10 +682,13 @@
     if (!ISDOpcode)
       continue;
 
-    // If the use is actually a legal node, there will not be an implicit
-    // truncate.
-    if (TLI.isOperationLegalOrCustom(ISDOpcode,
-                                     EVT::getEVT(TruncUser->getType())))
+    // If the use is actually a legal node, there will not be an
+    // implicit truncate.
+    // FIXME: always querying the result type is just an
+    // approximation; some nodes' legality is determined by the
+    // operand or other means. There's no good way to find out though.
+    if (TLI.isOperationLegalOrCustom(
+            ISDOpcode, TLI.getValueType(TruncUser->getType(), true)))
       continue;
 
     // Don't bother for PHI nodes.
@@ -978,7 +1001,7 @@
   } else {
     SmallPtrSet<BasicBlock*, 4> VisitedBBs;
     for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
-      if (!VisitedBBs.insert(*PI))
+      if (!VisitedBBs.insert(*PI).second)
         continue;
 
       BasicBlock::InstListType &InstList = (*PI)->getInstList();
@@ -1250,46 +1273,75 @@
 
   /// \brief Build a truncate instruction.
   class TruncBuilder : public TypePromotionAction {
+    Value *Val;
   public:
     /// \brief Build a truncate instruction of \p Opnd producing a \p Ty
     /// result.
     /// trunc Opnd to Ty.
     TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
       IRBuilder<> Builder(Opnd);
-      Inst = cast<Instruction>(Builder.CreateTrunc(Opnd, Ty, "promoted"));
-      DEBUG(dbgs() << "Do: TruncBuilder: " << *Inst << "\n");
+      Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
+      DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
     }
 
-    /// \brief Get the built instruction.
-    Instruction *getBuiltInstruction() { return Inst; }
+    /// \brief Get the built value.
+    Value *getBuiltValue() { return Val; }
 
     /// \brief Remove the built instruction.
     void undo() override {
-      DEBUG(dbgs() << "Undo: TruncBuilder: " << *Inst << "\n");
-      Inst->eraseFromParent();
+      DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
+      if (Instruction *IVal = dyn_cast<Instruction>(Val))
+        IVal->eraseFromParent();
     }
   };
 
   /// \brief Build a sign extension instruction.
   class SExtBuilder : public TypePromotionAction {
+    Value *Val;
   public:
     /// \brief Build a sign extension instruction of \p Opnd producing a \p Ty
     /// result.
     /// sext Opnd to Ty.
     SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
-        : TypePromotionAction(Inst) {
+        : TypePromotionAction(InsertPt) {
       IRBuilder<> Builder(InsertPt);
-      Inst = cast<Instruction>(Builder.CreateSExt(Opnd, Ty, "promoted"));
-      DEBUG(dbgs() << "Do: SExtBuilder: " << *Inst << "\n");
+      Val = Builder.CreateSExt(Opnd, Ty, "promoted");
+      DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
     }
 
-    /// \brief Get the built instruction.
-    Instruction *getBuiltInstruction() { return Inst; }
+    /// \brief Get the built value.
+    Value *getBuiltValue() { return Val; }
 
     /// \brief Remove the built instruction.
     void undo() override {
-      DEBUG(dbgs() << "Undo: SExtBuilder: " << *Inst << "\n");
-      Inst->eraseFromParent();
+      DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
+      if (Instruction *IVal = dyn_cast<Instruction>(Val))
+        IVal->eraseFromParent();
+    }
+  };
+
+  /// \brief Build a zero extension instruction.
+  class ZExtBuilder : public TypePromotionAction {
+    Value *Val;
+  public:
+    /// \brief Build a zero extension instruction of \p Opnd producing a \p Ty
+    /// result.
+    /// zext Opnd to Ty.
+    ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
+        : TypePromotionAction(InsertPt) {
+      IRBuilder<> Builder(InsertPt);
+      Val = Builder.CreateZExt(Opnd, Ty, "promoted");
+      DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
+    }
+
+    /// \brief Get the built value.
+    Value *getBuiltValue() { return Val; }
+
+    /// \brief Remove the built instruction.
+    void undo() override {
+      DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
+      if (Instruction *IVal = dyn_cast<Instruction>(Val))
+        IVal->eraseFromParent();
     }
   };
 
@@ -1418,9 +1470,11 @@
   /// Same as Value::mutateType.
   void mutateType(Instruction *Inst, Type *NewTy);
   /// Same as IRBuilder::createTrunc.
-  Instruction *createTrunc(Instruction *Opnd, Type *Ty);
+  Value *createTrunc(Instruction *Opnd, Type *Ty);
   /// Same as IRBuilder::createSExt.
-  Instruction *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
+  Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
+  /// Same as IRBuilder::createZExt.
+  Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);
   /// Same as Instruction::moveBefore.
   void moveBefore(Instruction *Inst, Instruction *Before);
   /// @}
@@ -1452,20 +1506,28 @@
   Actions.push_back(make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
 }
 
-Instruction *TypePromotionTransaction::createTrunc(Instruction *Opnd,
-                                                   Type *Ty) {
+Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
+                                             Type *Ty) {
   std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
-  Instruction *I = Ptr->getBuiltInstruction();
+  Value *Val = Ptr->getBuiltValue();
   Actions.push_back(std::move(Ptr));
-  return I;
+  return Val;
 }
 
-Instruction *TypePromotionTransaction::createSExt(Instruction *Inst,
-                                                  Value *Opnd, Type *Ty) {
+Value *TypePromotionTransaction::createSExt(Instruction *Inst,
+                                            Value *Opnd, Type *Ty) {
   std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
-  Instruction *I = Ptr->getBuiltInstruction();
+  Value *Val = Ptr->getBuiltValue();
   Actions.push_back(std::move(Ptr));
-  return I;
+  return Val;
+}
+
+Value *TypePromotionTransaction::createZExt(Instruction *Inst,
+                                            Value *Opnd, Type *Ty) {
+  std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
+  Value *Val = Ptr->getBuiltValue();
+  Actions.push_back(std::move(Ptr));
+  return Val;
 }
 
 void TypePromotionTransaction::moveBefore(Instruction *Inst,
@@ -1658,58 +1720,75 @@
 
 /// \brief Hepler class to perform type promotion.
 class TypePromotionHelper {
-  /// \brief Utility function to check whether or not a sign extension of
-  /// \p Inst with \p ConsideredSExtType can be moved through \p Inst by either
-  /// using the operands of \p Inst or promoting \p Inst.
+  /// \brief Utility function to check whether or not a sign or zero extension
+  /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
+  /// either using the operands of \p Inst or promoting \p Inst.
+  /// The type of the extension is defined by \p IsSExt.
   /// In other words, check if:
-  /// sext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredSExtType.
+  /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
   /// #1 Promotion applies:
-  /// ConsideredSExtType Inst (sext opnd1 to ConsideredSExtType, ...).
+  /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
   /// #2 Operand reuses:
-  /// sext opnd1 to ConsideredSExtType.
+  /// ext opnd1 to ConsideredExtType.
   /// \p PromotedInsts maps the instructions to their type before promotion.
-  static bool canGetThrough(const Instruction *Inst, Type *ConsideredSExtType,
-                            const InstrToOrigTy &PromotedInsts);
+  static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
+                            const InstrToOrigTy &PromotedInsts, bool IsSExt);
 
   /// \brief Utility function to determine if \p OpIdx should be promoted when
   /// promoting \p Inst.
-  static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
+  static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
     if (isa<SelectInst>(Inst) && OpIdx == 0)
       return false;
     return true;
   }
 
-  /// \brief Utility function to promote the operand of \p SExt when this
-  /// operand is a promotable trunc or sext.
+  /// \brief Utility function to promote the operand of \p Ext when this
+  /// operand is a promotable trunc or sext or zext.
   /// \p PromotedInsts maps the instructions to their type before promotion.
   /// \p CreatedInsts[out] contains how many non-free instructions have been
-  /// created to promote the operand of SExt.
+  /// created to promote the operand of Ext.
   /// Should never be called directly.
-  /// \return The promoted value which is used instead of SExt.
-  static Value *promoteOperandForTruncAndSExt(Instruction *SExt,
-                                              TypePromotionTransaction &TPT,
-                                              InstrToOrigTy &PromotedInsts,
-                                              unsigned &CreatedInsts);
+  /// \return The promoted value which is used instead of Ext.
+  static Value *promoteOperandForTruncAndAnyExt(Instruction *Ext,
+                                                TypePromotionTransaction &TPT,
+                                                InstrToOrigTy &PromotedInsts,
+                                                unsigned &CreatedInsts);
 
-  /// \brief Utility function to promote the operand of \p SExt when this
+  /// \brief Utility function to promote the operand of \p Ext when this
   /// operand is promotable and is not a supported trunc or sext.
   /// \p PromotedInsts maps the instructions to their type before promotion.
   /// \p CreatedInsts[out] contains how many non-free instructions have been
-  /// created to promote the operand of SExt.
+  /// created to promote the operand of Ext.
   /// Should never be called directly.
-  /// \return The promoted value which is used instead of SExt.
-  static Value *promoteOperandForOther(Instruction *SExt,
+  /// \return The promoted value which is used instead of Ext.
+  static Value *promoteOperandForOther(Instruction *Ext,
                                        TypePromotionTransaction &TPT,
                                        InstrToOrigTy &PromotedInsts,
-                                       unsigned &CreatedInsts);
+                                       unsigned &CreatedInsts, bool IsSExt);
+
+  /// \see promoteOperandForOther.
+  static Value *signExtendOperandForOther(Instruction *Ext,
+                                          TypePromotionTransaction &TPT,
+                                          InstrToOrigTy &PromotedInsts,
+                                          unsigned &CreatedInsts) {
+    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, true);
+  }
+
+  /// \see promoteOperandForOther.
+  static Value *zeroExtendOperandForOther(Instruction *Ext,
+                                          TypePromotionTransaction &TPT,
+                                          InstrToOrigTy &PromotedInsts,
+                                          unsigned &CreatedInsts) {
+    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, false);
+  }
 
 public:
-  /// Type for the utility function that promotes the operand of SExt.
-  typedef Value *(*Action)(Instruction *SExt, TypePromotionTransaction &TPT,
+  /// Type for the utility function that promotes the operand of Ext.
+  typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT,
                            InstrToOrigTy &PromotedInsts,
                            unsigned &CreatedInsts);
-  /// \brief Given a sign extend instruction \p SExt, return the approriate
-  /// action to promote the operand of \p SExt instead of using SExt.
+  /// \brief Given a sign/zero extend instruction \p Ext, return the appropriate
+  /// action to promote the operand of \p Ext instead of using Ext.
   /// \return NULL if no promotable action is possible with the current
   /// sign extension.
   /// \p InsertedTruncs keeps track of all the truncate instructions inserted by
@@ -1717,36 +1796,42 @@
   /// because we do not want to promote these instructions as CodeGenPrepare
   /// will reinsert them later. Thus creating an infinite loop: create/remove.
   /// \p PromotedInsts maps the instructions to their type before promotion.
-  static Action getAction(Instruction *SExt, const SetOfInstrs &InsertedTruncs,
+  static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedTruncs,
                           const TargetLowering &TLI,
                           const InstrToOrigTy &PromotedInsts);
 };
 
 bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
-                                        Type *ConsideredSExtType,
-                                        const InstrToOrigTy &PromotedInsts) {
-  // We can always get through sext.
-  if (isa<SExtInst>(Inst))
+                                        Type *ConsideredExtType,
+                                        const InstrToOrigTy &PromotedInsts,
+                                        bool IsSExt) {
+  // We can always get through zext.
+  if (isa<ZExtInst>(Inst))
+    return true;
+
+  // sext(sext) is ok too.
+  if (IsSExt && isa<SExtInst>(Inst))
     return true;
 
   // We can get through binary operator, if it is legal. In other words, the
   // binary operator must have a nuw or nsw flag.
   const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
   if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
-      (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap()))
+      ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
+       (IsSExt && BinOp->hasNoSignedWrap())))
     return true;
 
   // Check if we can do the following simplification.
-  // sext(trunc(sext)) --> sext
+  // ext(trunc(opnd)) --> ext(opnd)
   if (!isa<TruncInst>(Inst))
     return false;
 
   Value *OpndVal = Inst->getOperand(0);
-  // Check if we can use this operand in the sext.
-  // If the type is larger than the result type of the sign extension,
+  // Check if we can use this operand in the extension.
+  // If the type is larger than the result type of the extension,
   // we cannot.
   if (OpndVal->getType()->getIntegerBitWidth() >
-      ConsideredSExtType->getIntegerBitWidth())
+      ConsideredExtType->getIntegerBitWidth())
     return false;
 
   // If the operand of the truncate is not an instruction, we will not have
@@ -1757,18 +1842,19 @@
     return false;
 
   // Check if the source of the type is narrow enough.
-  // I.e., check that trunc just drops sign extended bits.
-  // #1 get the type of the operand.
+  // I.e., check that trunc just drops extended bits of the same kind as
+  // the extension.
+  // #1 get the type of the operand and check the kind of the extended bits.
   const Type *OpndType;
   InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
-  if (It != PromotedInsts.end())
-    OpndType = It->second;
-  else if (isa<SExtInst>(Opnd))
-    OpndType = cast<Instruction>(Opnd)->getOperand(0)->getType();
+  if (It != PromotedInsts.end() && It->second.IsSExt == IsSExt)
+    OpndType = It->second.Ty;
+  else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
+    OpndType = Opnd->getOperand(0)->getType();
   else
     return false;
 
-  // #2 check that the truncate just drop sign extended bits.
+  // #2 check that the truncate just drops extended bits.
   if (Inst->getType()->getIntegerBitWidth() >= OpndType->getIntegerBitWidth())
     return true;
 
@@ -1776,149 +1862,167 @@
 }
 
 TypePromotionHelper::Action TypePromotionHelper::getAction(
-    Instruction *SExt, const SetOfInstrs &InsertedTruncs,
+    Instruction *Ext, const SetOfInstrs &InsertedTruncs,
     const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
-  Instruction *SExtOpnd = dyn_cast<Instruction>(SExt->getOperand(0));
-  Type *SExtTy = SExt->getType();
-  // If the operand of the sign extension is not an instruction, we cannot
+  assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+         "Unexpected instruction type");
+  Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
+  Type *ExtTy = Ext->getType();
+  bool IsSExt = isa<SExtInst>(Ext);
+  // If the operand of the extension is not an instruction, we cannot
   // get through.
   // If it, check we can get through.
-  if (!SExtOpnd || !canGetThrough(SExtOpnd, SExtTy, PromotedInsts))
+  if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
     return nullptr;
 
   // Do not promote if the operand has been added by codegenprepare.
   // Otherwise, it means we are undoing an optimization that is likely to be
   // redone, thus causing potential infinite loop.
-  if (isa<TruncInst>(SExtOpnd) && InsertedTruncs.count(SExtOpnd))
+  if (isa<TruncInst>(ExtOpnd) && InsertedTruncs.count(ExtOpnd))
     return nullptr;
 
   // SExt or Trunc instructions.
   // Return the related handler.
-  if (isa<SExtInst>(SExtOpnd) || isa<TruncInst>(SExtOpnd))
-    return promoteOperandForTruncAndSExt;
+  if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
+      isa<ZExtInst>(ExtOpnd))
+    return promoteOperandForTruncAndAnyExt;
 
   // Regular instruction.
   // Abort early if we will have to insert non-free instructions.
-  if (!SExtOpnd->hasOneUse() &&
-      !TLI.isTruncateFree(SExtTy, SExtOpnd->getType()))
+  if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
     return nullptr;
-  return promoteOperandForOther;
+  return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
 }
 
-Value *TypePromotionHelper::promoteOperandForTruncAndSExt(
+Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
     llvm::Instruction *SExt, TypePromotionTransaction &TPT,
     InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts) {
   // By construction, the operand of SExt is an instruction. Otherwise we cannot
   // get through it and this method should not be called.
   Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
-  // Replace sext(trunc(opnd)) or sext(sext(opnd))
-  // => sext(opnd).
-  TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
+  Value *ExtVal = SExt;
+  if (isa<ZExtInst>(SExtOpnd)) {
+    // Replace s|zext(zext(opnd))
+    // => zext(opnd).
+    Value *ZExt =
+        TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
+    TPT.replaceAllUsesWith(SExt, ZExt);
+    TPT.eraseInstruction(SExt);
+    ExtVal = ZExt;
+  } else {
+    // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
+    // => z|sext(opnd).
+    TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
+  }
   CreatedInsts = 0;
 
   // Remove dead code.
   if (SExtOpnd->use_empty())
     TPT.eraseInstruction(SExtOpnd);
 
-  // Check if the sext is still needed.
-  if (SExt->getType() != SExt->getOperand(0)->getType())
-    return SExt;
+  // Check if the extension is still needed.
+  Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
+  if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType())
+    return ExtVal;
 
-  // At this point we have: sext ty opnd to ty.
-  // Reassign the uses of SExt to the opnd and remove SExt.
-  Value *NextVal = SExt->getOperand(0);
-  TPT.eraseInstruction(SExt, NextVal);
+  // At this point we have: ext ty opnd to ty.
+  // Reassign the uses of ExtInst to the opnd and remove ExtInst.
+  Value *NextVal = ExtInst->getOperand(0);
+  TPT.eraseInstruction(ExtInst, NextVal);
   return NextVal;
 }
 
-Value *
-TypePromotionHelper::promoteOperandForOther(Instruction *SExt,
-                                            TypePromotionTransaction &TPT,
-                                            InstrToOrigTy &PromotedInsts,
-                                            unsigned &CreatedInsts) {
-  // By construction, the operand of SExt is an instruction. Otherwise we cannot
+Value *TypePromotionHelper::promoteOperandForOther(
+    Instruction *Ext, TypePromotionTransaction &TPT,
+    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, bool IsSExt) {
+  // By construction, the operand of Ext is an instruction. Otherwise we cannot
   // get through it and this method should not be called.
-  Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
+  Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
   CreatedInsts = 0;
-  if (!SExtOpnd->hasOneUse()) {
-    // SExtOpnd will be promoted.
-    // All its uses, but SExt, will need to use a truncated value of the
+  if (!ExtOpnd->hasOneUse()) {
+    // ExtOpnd will be promoted.
+    // All its uses, but Ext, will need to use a truncated value of the
     // promoted version.
     // Create the truncate now.
-    Instruction *Trunc = TPT.createTrunc(SExt, SExtOpnd->getType());
-    Trunc->removeFromParent();
-    // Insert it just after the definition.
-    Trunc->insertAfter(SExtOpnd);
+    Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
+    if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
+      ITrunc->removeFromParent();
+      // Insert it just after the definition.
+      ITrunc->insertAfter(ExtOpnd);
+    }
 
-    TPT.replaceAllUsesWith(SExtOpnd, Trunc);
-    // Restore the operand of SExt (which has been replace by the previous call
+    TPT.replaceAllUsesWith(ExtOpnd, Trunc);
+    // Restore the operand of Ext (which has been replaced by the previous call
     // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
-    TPT.setOperand(SExt, 0, SExtOpnd);
+    TPT.setOperand(Ext, 0, ExtOpnd);
   }
 
   // Get through the Instruction:
   // 1. Update its type.
-  // 2. Replace the uses of SExt by Inst.
-  // 3. Sign extend each operand that needs to be sign extended.
+  // 2. Replace the uses of Ext by Inst.
+  // 3. Extend each operand that needs to be extended.
 
   // Remember the original type of the instruction before promotion.
   // This is useful to know that the high bits are sign extended bits.
-  PromotedInsts.insert(
-      std::pair<Instruction *, Type *>(SExtOpnd, SExtOpnd->getType()));
+  PromotedInsts.insert(std::pair<Instruction *, TypeIsSExt>(
+      ExtOpnd, TypeIsSExt(ExtOpnd->getType(), IsSExt)));
   // Step #1.
-  TPT.mutateType(SExtOpnd, SExt->getType());
+  TPT.mutateType(ExtOpnd, Ext->getType());
   // Step #2.
-  TPT.replaceAllUsesWith(SExt, SExtOpnd);
+  TPT.replaceAllUsesWith(Ext, ExtOpnd);
   // Step #3.
-  Instruction *SExtForOpnd = SExt;
+  Instruction *ExtForOpnd = Ext;
 
-  DEBUG(dbgs() << "Propagate SExt to operands\n");
-  for (int OpIdx = 0, EndOpIdx = SExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
+  DEBUG(dbgs() << "Propagate Ext to operands\n");
+  for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
        ++OpIdx) {
-    DEBUG(dbgs() << "Operand:\n" << *(SExtOpnd->getOperand(OpIdx)) << '\n');
-    if (SExtOpnd->getOperand(OpIdx)->getType() == SExt->getType() ||
-        !shouldSExtOperand(SExtOpnd, OpIdx)) {
+    DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
+    if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
+        !shouldExtOperand(ExtOpnd, OpIdx)) {
       DEBUG(dbgs() << "No need to propagate\n");
       continue;
     }
-    // Check if we can statically sign extend the operand.
-    Value *Opnd = SExtOpnd->getOperand(OpIdx);
+    // Check if we can statically extend the operand.
+    Value *Opnd = ExtOpnd->getOperand(OpIdx);
     if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
-      DEBUG(dbgs() << "Statically sign extend\n");
-      TPT.setOperand(
-          SExtOpnd, OpIdx,
-          ConstantInt::getSigned(SExt->getType(), Cst->getSExtValue()));
+      DEBUG(dbgs() << "Statically extend\n");
+      unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
+      APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
+                            : Cst->getValue().zext(BitWidth);
+      TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
       continue;
     }
     // UndefValue are typed, so we have to statically sign extend them.
     if (isa<UndefValue>(Opnd)) {
-      DEBUG(dbgs() << "Statically sign extend\n");
-      TPT.setOperand(SExtOpnd, OpIdx, UndefValue::get(SExt->getType()));
+      DEBUG(dbgs() << "Statically extend\n");
+      TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
       continue;
     }
 
     // Otherwise we have to explicity sign extend the operand.
-    // Check if SExt was reused to sign extend an operand.
-    if (!SExtForOpnd) {
+    // Check if Ext was reused to extend an operand.
+    if (!ExtForOpnd) {
       // If yes, create a new one.
-      DEBUG(dbgs() << "More operands to sext\n");
-      SExtForOpnd = TPT.createSExt(SExt, Opnd, SExt->getType());
+      DEBUG(dbgs() << "More operands to ext\n");
+      ExtForOpnd =
+          cast<Instruction>(IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
+                                   : TPT.createZExt(Ext, Opnd, Ext->getType()));
       ++CreatedInsts;
     }
 
-    TPT.setOperand(SExtForOpnd, 0, Opnd);
+    TPT.setOperand(ExtForOpnd, 0, Opnd);
 
     // Move the sign extension before the insertion point.
-    TPT.moveBefore(SExtForOpnd, SExtOpnd);
-    TPT.setOperand(SExtOpnd, OpIdx, SExtForOpnd);
+    TPT.moveBefore(ExtForOpnd, ExtOpnd);
+    TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
     // If more sext are required, new instructions will have to be created.
-    SExtForOpnd = nullptr;
+    ExtForOpnd = nullptr;
   }
-  if (SExtForOpnd == SExt) {
-    DEBUG(dbgs() << "Sign extension is useless now\n");
-    TPT.eraseInstruction(SExt);
+  if (ExtForOpnd == Ext) {
+    DEBUG(dbgs() << "Extension is useless now\n");
+    TPT.eraseInstruction(Ext);
   }
-  return SExtOpnd;
+  return ExtOpnd;
 }
 
 /// IsPromotionProfitable - Check whether or not promoting an instruction
@@ -1951,8 +2055,8 @@
   if (!ISDOpcode)
     return true;
   // Otherwise, check if the promoted instruction is legal or not.
-  return TLI.isOperationLegalOrCustom(ISDOpcode,
-                                      EVT::getEVT(PromotedInst->getType()));
+  return TLI.isOperationLegalOrCustom(
+      ISDOpcode, TLI.getValueType(PromotedInst->getType()));
 }
 
 /// MatchOperationAddr - Given an instruction or constant expr, see if we can
@@ -2036,7 +2140,8 @@
   case Instruction::Shl: {
     // Can only handle X*C and X << C.
     ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
-    if (!RHS) return false;
+    if (!RHS)
+      return false;
     int64_t Scale = RHS->getSExtValue();
     if (Opcode == Instruction::Shl)
       Scale = 1LL << Scale;
@@ -2129,28 +2234,32 @@
 
     return true;
   }
-  case Instruction::SExt: {
-    // Try to move this sext out of the way of the addressing mode.
-    Instruction *SExt = cast<Instruction>(AddrInst);
+  case Instruction::SExt:
+  case Instruction::ZExt: {
+    Instruction *Ext = dyn_cast<Instruction>(AddrInst);
+    if (!Ext)
+      return false;
+
+    // Try to move this ext out of the way of the addressing mode.
     // Ask for a method for doing so.
-    TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
-        SExt, InsertedTruncs, TLI, PromotedInsts);
+    TypePromotionHelper::Action TPH =
+        TypePromotionHelper::getAction(Ext, InsertedTruncs, TLI, PromotedInsts);
     if (!TPH)
       return false;
 
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
     unsigned CreatedInsts = 0;
-    Value *PromotedOperand = TPH(SExt, TPT, PromotedInsts, CreatedInsts);
+    Value *PromotedOperand = TPH(Ext, TPT, PromotedInsts, CreatedInsts);
     // SExt has been moved away.
     // Thus either it will be rematched later in the recursive calls or it is
     // gone. Anyway, we must not fold it into the addressing mode at this point.
     // E.g.,
     // op = add opnd, 1
-    // idx = sext op
+    // idx = ext op
     // addr = gep base, idx
     // is now:
-    // promotedOpnd = sext opnd           <- no match here
+    // promotedOpnd = ext opnd            <- no match here
     // op = promoted_add promotedOpnd, 1  <- match (later in recursive calls)
     // addr = gep base, op                <- match
     if (MovedAway)
@@ -2289,10 +2398,10 @@
 /// Add the ultimately found memory instructions to MemoryUses.
 static bool FindAllMemoryUses(Instruction *I,
                 SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
-                              SmallPtrSet<Instruction*, 16> &ConsideredInsts,
+                              SmallPtrSetImpl<Instruction*> &ConsideredInsts,
                               const TargetLowering &TLI) {
   // If we already considered this instruction, we're done.
-  if (!ConsideredInsts.insert(I))
+  if (!ConsideredInsts.insert(I).second)
     return false;
 
   // If this is an obviously unfoldable instruction, bail out.
@@ -2506,7 +2615,7 @@
     worklist.pop_back();
 
     // Break use-def graph loops.
-    if (!Visited.insert(V)) {
+    if (!Visited.insert(V).second) {
       Consensus = nullptr;
       break;
     }
@@ -2696,8 +2805,8 @@
       if (AddrMode.BaseOffs) {
         Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
         if (ResultIndex) {
-	  // We need to add this separately from the scale above to help with
-	  // SDAG consecutive load/store merging.
+          // We need to add this separately from the scale above to help with
+          // SDAG consecutive load/store merging.
           if (ResultPtr->getType() != I8PtrTy)
             ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
           ResultPtr = Builder.CreateGEP(ResultPtr, ResultIndex, "sunkaddr");
@@ -3105,6 +3214,367 @@
   return MadeChange;
 }
 
+namespace {
+/// \brief Helper class to promote a scalar operation to a vector one.
+/// This class is used to move downward extractelement transition.
+/// E.g.,
+/// a = vector_op <2 x i32>
+/// b = extractelement <2 x i32> a, i32 0
+/// c = scalar_op b
+/// store c
+///
+/// =>
+/// a = vector_op <2 x i32>
+/// c = vector_op a (equivalent to scalar_op on the related lane)
+/// * d = extractelement <2 x i32> c, i32 0
+/// * store d
+/// Assuming both extractelement and store can be combined, we get rid of the
+/// transition.
+class VectorPromoteHelper {
+  /// Used to perform some checks on the legality of vector operations.
+  const TargetLowering &TLI;
+
+  /// Used to estimate the cost of the promoted chain.
+  const TargetTransformInfo &TTI;
+
+  /// The transition being moved downwards.
+  Instruction *Transition;
+  /// The sequence of instructions to be promoted.
+  SmallVector<Instruction *, 4> InstsToBePromoted;
+  /// Cost of combining a store and an extract.
+  unsigned StoreExtractCombineCost;
+  /// Instruction that will be combined with the transition.
+  Instruction *CombineInst;
+
+  /// \brief The instruction that represents the current end of the transition.
+  /// Since we are faking the promotion until we reach the end of the chain
+  /// of computation, we need a way to get the current end of the transition.
+  Instruction *getEndOfTransition() const {
+    if (InstsToBePromoted.empty())
+      return Transition;
+    return InstsToBePromoted.back();
+  }
+
+  /// \brief Return the index of the original value in the transition.
+  /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
+  /// c, is at index 0.
+  unsigned getTransitionOriginalValueIdx() const {
+    assert(isa<ExtractElementInst>(Transition) &&
+           "Other kind of transitions are not supported yet");
+    return 0;
+  }
+
+  /// \brief Return the index of the index in the transition.
+  /// E.g., for "extractelement <2 x i32> c, i32 0" the index
+  /// is at index 1.
+  unsigned getTransitionIdx() const {
+    assert(isa<ExtractElementInst>(Transition) &&
+           "Other kind of transitions are not supported yet");
+    return 1;
+  }
+
+  /// \brief Get the type of the transition.
+  /// This is the type of the original value.
+  /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
+  /// transition is <2 x i32>.
+  Type *getTransitionType() const {
+    return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
+  }
+
+  /// \brief Promote \p ToBePromoted by moving \p Def downward through.
+  /// I.e., we have the following sequence:
+  /// Def = Transition <ty1> a to <ty2>
+  /// b = ToBePromoted <ty2> Def, ...
+  /// =>
+  /// b = ToBePromoted <ty1> a, ...
+  /// Def = Transition <ty1> ToBePromoted to <ty2>
+  void promoteImpl(Instruction *ToBePromoted);
+
+  /// \brief Check whether or not it is profitable to promote all the
+  /// instructions enqueued to be promoted.
+  bool isProfitableToPromote() {
+    // The extracted lane is the index operand of the transition. It is only
+    // known when that operand is a constant; otherwise -1 is passed along.
+    Value *ValIdx = Transition->getOperand(getTransitionIdx());
+    unsigned Index = isa<ConstantInt>(ValIdx)
+                         ? cast<ConstantInt>(ValIdx)->getZExtValue()
+                         : -1;
+    Type *PromotedType = getTransitionType();
+
+    StoreInst *ST = cast<StoreInst>(CombineInst);
+    unsigned AS = ST->getPointerAddressSpace();
+    unsigned Align = ST->getAlignment();
+    // Check if this store is supported.
+    if (!TLI.allowsMisalignedMemoryAccesses(
+            TLI.getValueType(ST->getValueOperand()->getType()), AS, Align)) {
+      // If not, there is no way we can combine the extract with the store.
+      return false;
+    }
+
+    // The scalar chain of computation has to pay for the transition, while
+    // the vector chain has to account for the combining cost.
+    uint64_t ScalarCost =
+        TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
+    uint64_t VectorCost = StoreExtractCombineCost;
+    for (const auto &Inst : InstsToBePromoted) {
+      // Compute the cost.
+      // By construction, all instructions being promoted are arithmetic ones.
+      // Moreover, one argument is a constant that can be viewed as a splat
+      // constant.
+      Value *Arg0 = Inst->getOperand(0);
+      bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
+                            isa<ConstantFP>(Arg0);
+      TargetTransformInfo::OperandValueKind Arg0OVK =
+          IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+                         : TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Arg1OVK =
+          !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+                          : TargetTransformInfo::OK_AnyValue;
+      ScalarCost += TTI.getArithmeticInstrCost(
+          Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
+      VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
+                                               Arg0OVK, Arg1OVK);
+    }
+    DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
+                 << ScalarCost << "\nVector: " << VectorCost << '\n');
+    return ScalarCost > VectorCost;
+  }
+
+  /// \brief Generate a constant vector with \p Val with the same
+  /// number of elements as the transition.
+  /// \p UseSplat defines whether or not \p Val should be replicated
+  /// across the whole vector.
+  /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
+  /// otherwise we generate a vector with as many undef as possible:
+  /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
+  /// used at the index of the extract.
+  Value *getConstantVector(Constant *Val, bool UseSplat) const {
+    unsigned ExtractIdx = UINT_MAX;
+    if (!UseSplat) {
+      // If we cannot determine where the constant must be, we have to
+      // use a splat constant.
+      Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
+      if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
+        ExtractIdx = CstVal->getSExtValue();
+      else
+        UseSplat = true;
+    }
+
+    unsigned End = getTransitionType()->getVectorNumElements();
+    if (UseSplat)
+      return ConstantVector::getSplat(End, Val);
+
+    SmallVector<Constant *, 4> ConstVec;
+    UndefValue *UndefVal = UndefValue::get(Val->getType());
+    for (unsigned Idx = 0; Idx != End; ++Idx) {
+      if (Idx == ExtractIdx)
+        ConstVec.push_back(Val);
+      else
+        ConstVec.push_back(UndefVal);
+    }
+    return ConstantVector::get(ConstVec);
+  }
+
+  /// \brief Check if promoting to a vector type an operand at \p OperandIdx
+  /// in \p Use can trigger undefined behavior.
+  static bool canCauseUndefinedBehavior(const Instruction *Use,
+                                        unsigned OperandIdx) {
+    // This is not safe to introduce undef when the operand is on
+    // the right hand side of a division-like instruction.
+    if (OperandIdx != 1)
+      return false;
+    switch (Use->getOpcode()) {
+    default:
+      return false;
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::SRem:
+    case Instruction::URem:
+      return true;
+    case Instruction::FDiv:
+    case Instruction::FRem:
+      return !Use->hasNoNaNs();
+    }
+    llvm_unreachable(nullptr);
+  }
+
+public:
+  VectorPromoteHelper(const TargetLowering &TLI, const TargetTransformInfo &TTI,
+                      Instruction *Transition, unsigned CombineCost)
+      : TLI(TLI), TTI(TTI), Transition(Transition),
+        StoreExtractCombineCost(CombineCost), CombineInst(nullptr) {
+    assert(Transition && "Do not know how to promote null");
+  }
+
+  /// \brief Check if \p ToBePromoted is a kind of instruction we can promote.
+  bool canPromote(const Instruction *ToBePromoted) const {
+    // We could support CastInst too.
+    return isa<BinaryOperator>(ToBePromoted);
+  }
+
+  /// \brief Check if it is profitable to promote \p ToBePromoted
+  /// by moving downward the transition through.
+  bool shouldPromote(const Instruction *ToBePromoted) const {
+    // Promote only if all the operands can be statically expanded.
+    // Indeed, we do not want to introduce any new kind of transitions.
+    for (const Use &U : ToBePromoted->operands()) {
+      const Value *Val = U.get();
+      if (Val == getEndOfTransition()) {
+        // If the use is a division and the transition is on the rhs,
+        // we cannot promote the operation, otherwise we may create a
+        // division by zero.
+        if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
+          return false;
+        continue;
+      }
+      if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
+          !isa<ConstantFP>(Val))
+        return false;
+    }
+    // Check that the resulting operation is legal.
+    int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
+    if (!ISDOpcode)
+      return false;
+    return StressStoreExtract ||
+           TLI.isOperationLegalOrCustom(
+               ISDOpcode, TLI.getValueType(getTransitionType(), true));
+  }
+
+  /// \brief Check whether or not \p Use can be combined
+  /// with the transition.
+  /// I.e., is it possible to do Use(Transition) => AnotherUse?
+  bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
+
+  /// \brief Record \p ToBePromoted as part of the chain to be promoted.
+  void enqueueForPromotion(Instruction *ToBePromoted) {
+    InstsToBePromoted.push_back(ToBePromoted);
+  }
+
+  /// \brief Set the instruction that will be combined with the transition.
+  void recordCombineInstruction(Instruction *ToBeCombined) {
+    assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
+    CombineInst = ToBeCombined;
+  }
+
+  /// \brief Promote all the instructions enqueued for promotion if it
+  /// is profitable.
+  /// \return True if the promotion happened, false otherwise.
+  bool promote() {
+    // Check if there is something to promote.
+    // Right now, if we do not have anything to combine with,
+    // we assume the promotion is not profitable.
+    if (InstsToBePromoted.empty() || !CombineInst)
+      return false;
+
+    // Check cost.
+    if (!StressStoreExtract && !isProfitableToPromote())
+      return false;
+
+    // Promote.
+    for (auto &ToBePromoted : InstsToBePromoted)
+      promoteImpl(ToBePromoted);
+    InstsToBePromoted.clear();
+    return true;
+  }
+};
+} // End of anonymous namespace.
+
+void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
+  // At this point, we know that all the operands of ToBePromoted but Def
+  // can be statically promoted.
+  // For Def, we need to use its parameter in ToBePromoted:
+  // b = ToBePromoted ty1 a
+  // Def = Transition ty1 b to ty2
+  // Move the transition down.
+  // 1. Replace all uses of the promoted operation by the transition.
+  // = ... b => = ... Def.
+  assert(ToBePromoted->getType() == Transition->getType() &&
+         "The type of the result of the transition does not match "
+         "the final type");
+  ToBePromoted->replaceAllUsesWith(Transition);
+  // 2. Update the type of the promoted operation.
+  // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
+  Type *TransitionTy = getTransitionType();
+  ToBePromoted->mutateType(TransitionTy);
+  // 3. Replace each operand of the promoted operation by its promoted form.
+  // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
+  for (Use &U : ToBePromoted->operands()) {
+    Value *Val = U.get();
+    Value *NewVal = nullptr;
+    if (Val == Transition)
+      NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
+    else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
+             isa<ConstantFP>(Val)) {
+      // Use a splat constant if it is not safe to use undef.
+      NewVal = getConstantVector(
+          cast<Constant>(Val),
+          isa<UndefValue>(Val) ||
+              canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
+    } else
+      llvm_unreachable("Did you modified shouldPromote and forgot to update "
+                       "this?");
+    ToBePromoted->setOperand(U.getOperandNo(), NewVal);
+  }
+  Transition->removeFromParent();
+  Transition->insertAfter(ToBePromoted);
+  Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
+}
+
+/// Some targets can do store(extractelement) with one instruction.
+/// Try to push the extractelement towards the stores when the target
+/// has this feature and this is profitable.
+bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
+  unsigned CombineCost = UINT_MAX;
+  if (DisableStoreExtract || !TLI ||
+      (!StressStoreExtract &&
+       !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
+                                       Inst->getOperand(1), CombineCost)))
+    return false;
+
+  // At this point we know that Inst is a vector to scalar transition.
+  // Try to move it down the def-use chain, until:
+  // - We can combine the transition with its single use
+  //   => we got rid of the transition.
+  // - We escape the current basic block
+  //   => we would need to check that we are moving it at a cheaper place and
+  //      we do not do that for now.
+  BasicBlock *Parent = Inst->getParent();
+  DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
+  VectorPromoteHelper VPH(*TLI, *TTI, Inst, CombineCost);
+  // If the transition has more than one use, assume this is not going to be
+  // beneficial.
+  while (Inst->hasOneUse()) {
+    Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
+    DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
+
+    if (ToBePromoted->getParent() != Parent) {
+      DEBUG(dbgs() << "Instruction to promote is in a different block ("
+                   << ToBePromoted->getParent()->getName()
+                   << ") than the transition (" << Parent->getName() << ").\n");
+      return false;
+    }
+
+    if (VPH.canCombine(ToBePromoted)) {
+      DEBUG(dbgs() << "Assume " << *Inst << '\n'
+                   << "will be combined with: " << *ToBePromoted << '\n');
+      VPH.recordCombineInstruction(ToBePromoted);
+      bool Changed = VPH.promote();
+      NumStoreExtractExposed += Changed;
+      return Changed;
+    }
+
+    DEBUG(dbgs() << "Try promoting.\n");
+    if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
+      return false;
+
+    DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
+
+    VPH.enqueueForPromotion(ToBePromoted);
+    Inst = ToBePromoted;
+  }
+  return false;
+}
+
 bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
@@ -3199,6 +3669,9 @@
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
     return OptimizeShuffleVectorInst(SVI);
 
+  if (isa<ExtractElementInst>(I))
+    return OptimizeExtractElementInst(I);
+
   return false;
 }
 

diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index d2231ec..3d62d48 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp

@@ -20,24 +20,20 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "post-RA-sched"
 
-CriticalAntiDepBreaker::
-CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo &RCI) :
-  AntiDepBreaker(), MF(MFi),
-  MRI(MF.getRegInfo()),
-  TII(MF.getTarget().getInstrInfo()),
-  TRI(MF.getTarget().getRegisterInfo()),
-  RegClassInfo(RCI),
-  Classes(TRI->getNumRegs(), nullptr),
-  KillIndices(TRI->getNumRegs(), 0),
-  DefIndices(TRI->getNumRegs(), 0),
-  KeepRegs(TRI->getNumRegs(), false) {}
+CriticalAntiDepBreaker::CriticalAntiDepBreaker(MachineFunction &MFi,
+                                               const RegisterClassInfo &RCI)
+    : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()),
+      TII(MF.getSubtarget().getInstrInfo()),
+      TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI),
+      Classes(TRI->getNumRegs(), nullptr), KillIndices(TRI->getNumRegs(), 0),
+      DefIndices(TRI->getNumRegs(), 0), KeepRegs(TRI->getNumRegs(), false) {}
 
 CriticalAntiDepBreaker::~CriticalAntiDepBreaker() {
 }
@@ -273,19 +269,10 @@
       // Ignore two-addr defs.
       if (MI->isRegTiedToUseOperand(i)) continue;
 
-      // FIXME: we should use a SubRegIterator that includes self (as above), so
-      // we don't have to repeat all this code for the reg itself.
-      DefIndices[Reg] = Count;
-      KillIndices[Reg] = ~0u;
-      assert(((KillIndices[Reg] == ~0u) !=
-              (DefIndices[Reg] == ~0u)) &&
-             "Kill and Def maps aren't consistent for Reg!");
-      KeepRegs.reset(Reg);
-      Classes[Reg] = nullptr;
-      RegRefs.erase(Reg);
-      // Repeat, for all subregs.
-      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
-        unsigned SubregReg = *SubRegs;
+      // For the reg itself and all subregs: update the def to current;
+      // reset the kill state, any restrictions, and references.
+      for (MCSubRegIterator SRI(Reg, TRI, true); SRI.isValid(); ++SRI) {
+        unsigned SubregReg = *SRI;
         DefIndices[SubregReg] = Count;
         KillIndices[SubregReg] = ~0u;
         KeepRegs.reset(SubregReg);
@@ -317,19 +304,9 @@
 
     RegRefs.insert(std::make_pair(Reg, &MO));
 
-    // FIXME: we should use an MCRegAliasIterator that includes self so we don't
-    // have to repeat all this code for the reg itself.
-    
     // It wasn't previously live but now it is, this is a kill.
-    if (KillIndices[Reg] == ~0u) {
-      KillIndices[Reg] = Count;
-      DefIndices[Reg] = ~0u;
-          assert(((KillIndices[Reg] == ~0u) !=
-                  (DefIndices[Reg] == ~0u)) &&
-               "Kill and Def maps aren't consistent for Reg!");
-    }
-    // Repeat, for all aliases.
-    for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+    // Repeat for all aliases.
+    for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
       unsigned AliasReg = *AI;
       if (KillIndices[AliasReg] == ~0u) {
         KillIndices[AliasReg] = Count;

diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h
index 45e4ff5..ceef74d 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.h
+++ b/lib/CodeGen/CriticalAntiDepBreaker.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H
-#define LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H
+#ifndef LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H
 
 #include "AntiDepBreaker.h"
 #include "llvm/ADT/BitVector.h"
@@ -38,32 +38,32 @@
     const TargetRegisterInfo *TRI;
     const RegisterClassInfo &RegClassInfo;
 
-    /// AllocatableSet - The set of allocatable registers.
+    /// The set of allocatable registers.
     /// We'll be ignoring anti-dependencies on non-allocatable registers,
     /// because they may not be safe to break.
     const BitVector AllocatableSet;
 
-    /// Classes - For live regs that are only used in one register class in a
+    /// For live regs that are only used in one register class in a
     /// live range, the register class. If the register is not live, the
     /// corresponding value is null. If the register is live but used in
     /// multiple register classes, the corresponding value is -1 casted to a
     /// pointer.
     std::vector<const TargetRegisterClass*> Classes;
 
-    /// RegRefs - Map registers to all their references within a live range.
+    /// Map registers to all their references within a live range.
     std::multimap<unsigned, MachineOperand *> RegRefs;
     typedef std::multimap<unsigned, MachineOperand *>::const_iterator
       RegRefIter;
 
-    /// KillIndices - The index of the most recent kill (proceeding bottom-up),
+    /// The index of the most recent kill (proceeding bottom-up),
     /// or ~0u if the register is not live.
     std::vector<unsigned> KillIndices;
 
-    /// DefIndices - The index of the most recent complete def (proceeding
+    /// The index of the most recent complete def (proceeding
     /// bottom up), or ~0u if the register is live.
     std::vector<unsigned> DefIndices;
 
-    /// KeepRegs - A set of registers which are live and cannot be changed to
+    /// A set of registers which are live and cannot be changed to
     /// break anti-dependencies.
     BitVector KeepRegs;
 
@@ -71,26 +71,23 @@
     CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo&);
     ~CriticalAntiDepBreaker();
 
-    /// Start - Initialize anti-dep breaking for a new basic block.
+    /// Initialize anti-dep breaking for a new basic block.
     void StartBlock(MachineBasicBlock *BB) override;
 
-    /// BreakAntiDependencies - Identifiy anti-dependencies along the critical
-    /// path
+    /// Identify anti-dependencies along the critical path
     /// of the ScheduleDAG and break them by renaming registers.
-    ///
     unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                                    MachineBasicBlock::iterator Begin,
                                    MachineBasicBlock::iterator End,
                                    unsigned InsertPosIndex,
                                    DbgValueVector &DbgValues) override;
 
-    /// Observe - Update liveness information to account for the current
+    /// Update liveness information to account for the current
     /// instruction, which will not be scheduled.
-    ///
     void Observe(MachineInstr *MI, unsigned Count,
                  unsigned InsertPosIndex) override;
 
-    /// Finish - Finish anti-dep breaking for a basic block.
+    /// Finish anti-dep breaking for a basic block.
     void FinishBlock() override;
 
   private:

diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index bc6e9dc..0a188c0 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp

@@ -106,16 +106,15 @@
 class DefaultVLIWScheduler : public ScheduleDAGInstrs {
 public:
   DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
-                   MachineDominatorTree &MDT, bool IsPostRA);
+                       bool IsPostRA);
   // Schedule - Actual scheduling work.
   void schedule() override;
 };
 }
 
-DefaultVLIWScheduler::DefaultVLIWScheduler(
-  MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
-  bool IsPostRA) :
-  ScheduleDAGInstrs(MF, MLI, MDT, IsPostRA) {
+DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
+                                           MachineLoopInfo &MLI, bool IsPostRA)
+    : ScheduleDAGInstrs(MF, &MLI, IsPostRA) {
   CanHandleTerminators = true;
 }
 
@@ -125,12 +124,12 @@
 }
 
 // VLIWPacketizerList Ctor
-VLIWPacketizerList::VLIWPacketizerList(
-  MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
-  bool IsPostRA) : TM(MF.getTarget()), MF(MF)  {
-  TII = TM.getInstrInfo();
-  ResourceTracker = TII->CreateTargetScheduleState(&TM, nullptr);
-  VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA);
+VLIWPacketizerList::VLIWPacketizerList(MachineFunction &MF,
+                                       MachineLoopInfo &MLI, bool IsPostRA)
+    : MF(MF) {
+  TII = MF.getSubtarget().getInstrInfo();
+  ResourceTracker = TII->CreateTargetScheduleState(MF.getSubtarget());
+  VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, IsPostRA);
 }
 
 // VLIWPacketizerList Dtor

diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index 2b144d8..48213c1 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp

@@ -19,7 +19,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "codegen-dce"
@@ -90,8 +91,8 @@
 
   bool AnyChanges = false;
   MRI = &MF.getRegInfo();
-  TRI = MF.getTarget().getRegisterInfo();
-  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
 
   // Loop over all instructions in all blocks, from bottom to top, so that it's
   // more likely that chains of dependent but ultimately dead instructions will
@@ -122,19 +123,10 @@
       if (isDead(MI)) {
         DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << *MI);
         // It is possible that some DBG_VALUE instructions refer to this
-        // instruction.  Examine each def operand for such references;
-        // if found, mark the DBG_VALUE as undef (but don't delete it).
-        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-          const MachineOperand &MO = MI->getOperand(i);
-          if (!MO.isReg() || !MO.isDef())
-            continue;
-          unsigned Reg = MO.getReg();
-          if (!TargetRegisterInfo::isVirtualRegister(Reg))
-            continue;
-          MRI->markUsesInDebugValueAsUndef(Reg);
-        }
+        // instruction.  They get marked as undef and will be deleted
+        // in the live debug variable analysis.
+        MI->eraseFromParentAndMarkDBGValuesForRemoval();
         AnyChanges = true;
-        MI->eraseFromParent();
         ++NumDeletes;
         MIE = MBB->rend();
         // MII is now pointing to the next instruction to process,

diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index a195586..75b74d9 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp

@@ -23,6 +23,7 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 using namespace llvm;
@@ -50,6 +51,11 @@
 
     bool runOnFunction(Function &Fn) override;
 
+    bool doFinalization(Module &M) override {
+      RewindFunction = nullptr;
+      return false;
+    }
+
     void getAnalysisUsage(AnalysisUsage &AU) const override { }
 
     const char *getPassName() const override {
@@ -118,7 +124,7 @@
     return false;
 
   // Find the rewind function if we didn't already.
-  const TargetLowering *TLI = TM->getTargetLowering();
+  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
   if (!RewindFunction) {
     LLVMContext &Ctx = Resumes[0]->getContext();
     FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),

diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
index c470632..995606f 100644
--- a/lib/CodeGen/EarlyIfConversion.cpp
+++ b/lib/CodeGen/EarlyIfConversion.cpp

@@ -153,8 +153,8 @@
 public:
   /// runOnMachineFunction - Initialize per-function data structures.
   void runOnMachineFunction(MachineFunction &MF) {
-    TII = MF.getTarget().getInstrInfo();
-    TRI = MF.getTarget().getRegisterInfo();
+    TII = MF.getSubtarget().getInstrInfo();
+    TRI = MF.getSubtarget().getRegisterInfo();
     MRI = &MF.getRegInfo();
     LiveRegUnits.clear();
     LiveRegUnits.setUniverse(TRI->getNumRegUnits());
@@ -245,7 +245,7 @@
       MachineInstr *DefMI = MRI->getVRegDef(Reg);
       if (!DefMI || DefMI->getParent() != Head)
         continue;
-      if (InsertAfter.insert(DefMI))
+      if (InsertAfter.insert(DefMI).second)
         DEBUG(dbgs() << "BB#" << MBB->getNumber() << " depends on " << *DefMI);
       if (DefMI->isTerminator()) {
         DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
@@ -580,7 +580,7 @@
 class EarlyIfConverter : public MachineFunctionPass {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
-  const MCSchedModel *SchedModel;
+  MCSchedModel SchedModel;
   MachineRegisterInfo *MRI;
   MachineDominatorTree *DomTree;
   MachineLoopInfo *Loops;
@@ -688,7 +688,7 @@
                               FBBTrace.getCriticalPath());
 
   // Set a somewhat arbitrary limit on the critical path extension we accept.
-  unsigned CritLimit = SchedModel->MispredictPenalty/2;
+  unsigned CritLimit = SchedModel.MispredictPenalty/2;
 
   // If-conversion only makes sense when there is unexploited ILP. Compute the
   // maximum-ILP resource length of the trace after if-conversion. Compare it
@@ -782,8 +782,8 @@
            .enableEarlyIfConversion())
     return false;
 
-  TII = MF.getTarget().getInstrInfo();
-  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   SchedModel =
     MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
   MRI = &MF.getRegInfo();

diff --git a/lib/CodeGen/ErlangGC.cpp b/lib/CodeGen/ErlangGC.cpp
index e976d7f..85b0893 100644
--- a/lib/CodeGen/ErlangGC.cpp
+++ b/lib/CodeGen/ErlangGC.cpp

@@ -21,6 +21,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -53,7 +54,7 @@
 MCSymbol *ErlangGC::InsertLabel(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MI,
                                 DebugLoc DL) const {
-  const TargetInstrInfo* TII = MBB.getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
   MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
   BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
   return Label;

diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index cf55b68..3680498 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp

@@ -29,7 +29,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "execution-fix"
@@ -713,13 +714,13 @@
 
 bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
   MF = &mf;
-  TII = MF->getTarget().getInstrInfo();
-  TRI = MF->getTarget().getRegisterInfo();
+  TII = MF->getSubtarget().getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
   LiveRegs = nullptr;
   assert(NumRegs == RC->getNumRegs() && "Bad regclass");
 
   DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: "
-               << RC->getName() << " **********\n");
+               << TRI->getRegClassName(RC) << " **********\n");
 
   // If no relevant registers are used in the function, we can skip it
   // completely.

diff --git a/lib/CodeGen/ExpandISelPseudos.cpp b/lib/CodeGen/ExpandISelPseudos.cpp
index 90b62b5..55e809e 100644
--- a/lib/CodeGen/ExpandISelPseudos.cpp
+++ b/lib/CodeGen/ExpandISelPseudos.cpp

@@ -19,7 +19,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "expand-isel-pseudos"
@@ -46,7 +46,7 @@
 
 bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
-  const TargetLowering *TLI = MF.getTarget().getTargetLowering();
+  const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
 
   // Iterate through each instruction in the function, looking for pseudos.
   for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {

diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 8969bcc..e7bf143 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp

@@ -20,8 +20,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "postrapseudos"
@@ -182,8 +183,8 @@
   DEBUG(dbgs() << "Machine Function\n"
                << "********** EXPANDING POST-RA PSEUDO INSTRS **********\n"
                << "********** Function: " << MF.getName() << '\n');
-  TRI = MF.getTarget().getRegisterInfo();
-  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
 
   bool MadeChange = false;
 

diff --git a/lib/CodeGen/ForwardControlFlowIntegrity.cpp b/lib/CodeGen/ForwardControlFlowIntegrity.cpp
new file mode 100644
index 0000000..5e7e853
--- /dev/null
+++ b/lib/CodeGen/ForwardControlFlowIntegrity.cpp

@@ -0,0 +1,374 @@
+//===-- ForwardControlFlowIntegrity.cpp: Forward-Edge CFI -----------------===//
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief A pass that instruments code with fast checks for indirect calls and
+/// hooks for a function to check violations.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "cfi"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/JumpInstrTableInfo.h"
+#include "llvm/CodeGen/ForwardControlFlowIntegrity.h"
+#include "llvm/CodeGen/JumpInstrTables.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(NumCFIIndirectCalls,
+          "Number of indirect call sites rewritten by the CFI pass");
+
+char ForwardControlFlowIntegrity::ID = 0;
+INITIALIZE_PASS_BEGIN(ForwardControlFlowIntegrity, "forward-cfi",
+                      "Control-Flow Integrity", true, true)
+INITIALIZE_PASS_DEPENDENCY(JumpInstrTableInfo);
+INITIALIZE_PASS_DEPENDENCY(JumpInstrTables);
+INITIALIZE_PASS_END(ForwardControlFlowIntegrity, "forward-cfi",
+                    "Control-Flow Integrity", true, true)
+
+// Creates the forward-edge CFI pass using the default-constructed
+// configuration of ForwardControlFlowIntegrity.
+ModulePass *llvm::createForwardControlFlowIntegrityPass() {
+  return new ForwardControlFlowIntegrity();
+}
+
+// Creates the forward-edge CFI pass with an explicit configuration. All four
+// arguments are forwarded unchanged to the ForwardControlFlowIntegrity
+// constructor.
+ModulePass *llvm::createForwardControlFlowIntegrityPass(
+    JumpTable::JumpTableType JTT, CFIntegrity CFIType, bool CFIEnforcing,
+    StringRef CFIFuncName) {
+  return new ForwardControlFlowIntegrity(JTT, CFIType, CFIEnforcing,
+                                         CFIFuncName);
+}
+
+// Returns true when the given CallSite is an indirect call. A call whose
+// callee is merely a constant cast of a function is not counted as indirect.
+static bool isIndirectCall(CallSite &CS) {
+  // A callee that resolves to a Function is a plain direct call.
+  if (CS.getCalledFunction())
+    return false;
+
+  // Anything that is not a constant expression must be an indirect call,
+  // since we already know this is a call.
+  const ConstantExpr *CE = dyn_cast<ConstantExpr>(CS.getCalledValue());
+  if (!CE)
+    return true;
+
+  // A cast of a function translates to a direct function call in the
+  // resulting assembly, so it is not treated as an indirect call here.
+  const bool IsCastOfFunction =
+      CE->isCast() && isa<Function>(CE->getOperand(0));
+  return !IsCastOfFunction;
+}
+
+static const char cfi_failure_func_name[] = "__llvm_cfi_pointer_warning";
+
+// Default configuration: a single jump table, the subtract/mask/add check,
+// non-enforcing mode, and no user-supplied failure function name.
+ForwardControlFlowIntegrity::ForwardControlFlowIntegrity()
+    : ModulePass(ID), JTType(JumpTable::Single), CFIType(CFIntegrity::Sub),
+      CFIEnforcing(false), CFIFuncName() {
+  initializeForwardControlFlowIntegrityPass(*PassRegistry::getPassRegistry());
+}
+
+// Explicit configuration: stores the jump table type, the CFI check variant,
+// the enforcing flag and the name of the failure function to call.
+ForwardControlFlowIntegrity::ForwardControlFlowIntegrity(
+    JumpTable::JumpTableType JTT, CFIntegrity CFIType, bool CFIEnforcing,
+    std::string CFIFuncName)
+    : ModulePass(ID), JTType(JTT), CFIType(CFIType),
+      CFIEnforcing(CFIEnforcing), CFIFuncName(CFIFuncName) {
+  initializeForwardControlFlowIntegrityPass(*PassRegistry::getPassRegistry());
+}
+
+// The destructor has no explicit work to do.
+ForwardControlFlowIntegrity::~ForwardControlFlowIntegrity() {}
+
+// Declares the passes this pass requires: JumpInstrTableInfo, which
+// runOnModule queries for the table layout, and JumpInstrTables.
+void ForwardControlFlowIntegrity::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<JumpInstrTableInfo>();
+  AU.addRequired<JumpInstrTables>();
+}
+
+// Appends every rewritable indirect call site in M to IndirectCalls.
+void ForwardControlFlowIntegrity::getIndirectCalls(Module &M) {
+  // The complete list is gathered up front, before any call site is touched,
+  // because the later modifications change the functions' lists of basic
+  // blocks.
+  for (Function &Fn : M)
+    for (BasicBlock &Block : Fn)
+      for (Instruction &Inst : Block) {
+        CallSite Site(&Inst);
+        if (!Site)
+          continue; // Not a call site at all.
+        if (!isIndirectCall(Site))
+          continue;
+
+        // Indirect calls that are actually just inline assembly are left
+        // alone, since our transformation will generate an invalid module in
+        // that case.
+        if (isa<InlineAsm>(Site.getCalledValue()))
+          continue;
+
+        IndirectCalls.push_back(&Inst);
+      }
+}
+
+// Rewrites every call site collected in IndirectCalls. For each one, the
+// callee's function type is transformed according to JTType and looked up in
+// CFIT; the matching table start, mask and size (or zeros when there is no
+// such table) are handed to rewriteFunctionPointer.
+void ForwardControlFlowIntegrity::updateIndirectCalls(Module &M,
+                                                      CFITables &CFIT) {
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+  for (Instruction *I : IndirectCalls) {
+    CallSite CS(I);
+    Value *CalledValue = CS.getCalledValue();
+
+    // Get the function type for this call and look it up in the tables. The
+    // results are dereferenced unconditionally, so use cast<> (which asserts
+    // on a type mismatch) instead of an unchecked dyn_cast<>.
+    PointerType *PTy = cast<PointerType>(CalledValue->getType());
+    FunctionType *FunTy = cast<FunctionType>(PTy->getElementType());
+    FunctionType *TransformedTy = JumpInstrTables::transformType(JTType, FunTy);
+    ++NumCFIIndirectCalls;
+    Constant *JumpTableStart = nullptr;
+    Constant *JumpTableMask = nullptr;
+    Constant *JumpTableSize = nullptr;
+
+    // Some call sites have function types that don't correspond to any
+    // address-taken function in the module. This happens when function pointers
+    // are passed in from external code.
+    auto it = CFIT.find(TransformedTy);
+    if (it == CFIT.end()) {
+      // In this case, make sure that the function pointer will change by
+      // setting the mask and the start to be 0 so that the transformed
+      // function is 0.
+      JumpTableStart = ConstantInt::get(Int64Ty, 0);
+      JumpTableMask = ConstantInt::get(Int64Ty, 0);
+      JumpTableSize = ConstantInt::get(Int64Ty, 0);
+    } else {
+      JumpTableStart = it->second.StartValue;
+      JumpTableMask = it->second.MaskValue;
+      JumpTableSize = it->second.Size;
+    }
+
+    rewriteFunctionPointer(M, I, CalledValue, JumpTableStart, JumpTableMask,
+                           JumpTableSize);
+  }
+}
+
+// Instruments the indirect calls of M with forward-edge CFI checks.
+//
+// Builds the per-function-type CFI tables from JumpInstrTableInfo, collects
+// the indirect call sites, adds the warning function when not enforcing, and
+// rewrites the call sites. Returns false (module untouched) when there are no
+// jump-instruction tables, true otherwise.
+bool ForwardControlFlowIntegrity::runOnModule(Module &M) {
+  JumpInstrTableInfo *JITI = &getAnalysis<JumpInstrTableInfo>();
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+  Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
+
+  // Forget call sites recorded by an earlier run of this pass object; they may
+  // refer to instructions that no longer exist.
+  IndirectCalls.clear();
+
+  // JumpInstrTableInfo stores information about the alignment of each entry.
+  // The alignment returned by JumpInstrTableInfo is alignment in bytes, not
+  // in the exponent.
+  ByteAlignment = JITI->entryByteAlignment();
+  LogByteAlignment = llvm::Log2_64(ByteAlignment);
+
+  // Set up tables for control-flow integrity based on information about the
+  // jump-instruction tables.
+  CFITables CFIT;
+  for (const auto &KV : JITI->getTables()) {
+    // A table without entries has no first function that could serve as its
+    // base, so there is nothing to describe.
+    if (KV.second.empty())
+      continue;
+
+    uint64_t Size = static_cast<uint64_t>(KV.second.size());
+    uint64_t TableSize = NextPowerOf2(Size);
+
+    int64_t MaskValue = ((TableSize << LogByteAlignment) - 1) & -ByteAlignment;
+    Constant *JumpTableMaskValue = ConstantInt::get(Int64Ty, MaskValue);
+    Constant *JumpTableSize = ConstantInt::get(Int64Ty, Size);
+
+    // The base of the table is defined to be the first jumptable function in
+    // the table.
+    Function *First = KV.second.begin()->second;
+    Constant *JumpTableStartValue = ConstantExpr::getBitCast(First, VoidPtrTy);
+
+    // Look the entry up once and fill in all of its fields.
+    auto &TableInfo = CFIT[KV.first];
+    TableInfo.StartValue = JumpTableStartValue;
+    TableInfo.MaskValue = JumpTableMaskValue;
+    TableInfo.Size = JumpTableSize;
+  }
+
+  if (CFIT.empty())
+    return false;
+
+  getIndirectCalls(M);
+
+  if (!CFIEnforcing) {
+    addWarningFunction(M);
+  }
+
+  // Update the instructions with the check and the indirect jump through our
+  // table.
+  updateIndirectCalls(M, CFIT);
+
+  return true;
+}
+
+// Makes sure the module contains the function that is called when a CFI
+// violation is detected in non-enforcing mode. With a user-supplied name the
+// function is only declared; otherwise a default function with an empty body
+// is defined.
+void ForwardControlFlowIntegrity::addWarningFunction(Module &M) {
+  PointerType *CharPtrTy = Type::getInt8PtrTy(M.getContext());
+
+  // Get the type of the Warning Function: void (i8*, i8*),
+  // where the first argument is the name of the function in which the violation
+  // occurs, and the second is the function pointer that violates CFI.
+  SmallVector<Type *, 2> WarningFunArgs;
+  WarningFunArgs.push_back(CharPtrTy);
+  WarningFunArgs.push_back(CharPtrTy);
+  FunctionType *WarningFunTy =
+      FunctionType::get(Type::getVoidTy(M.getContext()), WarningFunArgs, false);
+
+  if (!CFIFuncName.empty()) {
+    // getOrInsertFunction does not signal failure with a null result. When
+    // the name is already taken by a global of a different type, it returns a
+    // cast constant instead of a Function. insertWarning later looks the name
+    // up with getFunction and calls it with two i8* arguments, so anything
+    // that is not a Function of the expected type is rejected here.
+    Constant *FailureFun = M.getOrInsertFunction(CFIFuncName, WarningFunTy);
+    if (!isa<Function>(FailureFun))
+      report_fatal_error("Could not get or insert the function specified by"
+                         " -cfi-func-name");
+  } else {
+    // The default warning function swallows the warning and lets the call
+    // continue, since there's no generic way for it to print out this
+    // information.
+    Function *WarningFun = M.getFunction(cfi_failure_func_name);
+    if (!WarningFun) {
+      WarningFun =
+          Function::Create(WarningFunTy, GlobalValue::LinkOnceAnyLinkage,
+                           cfi_failure_func_name, &M);
+    }
+
+    // Only give the function a body when it has none yet. Appending another
+    // "entry" block to an existing definition would leave a stray block
+    // behind.
+    if (WarningFun->empty()) {
+      BasicBlock *Entry =
+          BasicBlock::Create(M.getContext(), "entry", WarningFun, nullptr);
+      ReturnInst::Create(M.getContext(), Entry);
+    }
+  }
+}
+
+// Emits, immediately before the call instruction I, the arithmetic that
+// relates FunPtr to the jump table described by JumpTableStart, JumpTableMask
+// and JumpTableSize, using the scheme selected by CFIType.
+//
+// In enforcing mode the call is redirected to the transformed pointer. In
+// non-enforcing mode the block is split at I and a conditional branch sends
+// pointers that fail the check through a block that calls the warning
+// function before continuing with the original call.
+void ForwardControlFlowIntegrity::rewriteFunctionPointer(
+    Module &M, Instruction *I, Value *FunPtr, Constant *JumpTableStart,
+    Constant *JumpTableMask, Constant *JumpTableSize) {
+  IRBuilder<> TempBuilder(I);
+
+  Type *OrigFunType = FunPtr->getType();
+
+  BasicBlock *CurBB = cast<BasicBlock>(I->getParent());
+  Function *CurF = cast<Function>(CurBB->getParent());
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+
+  Value *TI = TempBuilder.CreatePtrToInt(FunPtr, Int64Ty);
+  Value *TStartInt = TempBuilder.CreatePtrToInt(JumpTableStart, Int64Ty);
+
+  Value *NewFunPtr = nullptr;
+  Value *Check = nullptr;
+  switch (CFIType) {
+  case CFIntegrity::Sub: {
+    // This is the subtract, mask, and add version.
+    // Subtract from the base.
+    Value *Sub = TempBuilder.CreateSub(TI, TStartInt);
+
+    // Mask the difference to force this to be a table offset.
+    Value *And = TempBuilder.CreateAnd(Sub, JumpTableMask);
+
+    // Add it back to the base.
+    Value *Result = TempBuilder.CreateAdd(And, TStartInt);
+
+    // Convert it back into a function pointer that we can call.
+    NewFunPtr = TempBuilder.CreateIntToPtr(Result, OrigFunType);
+    break;
+  }
+  case CFIntegrity::Ror: {
+    // This is the subtract and rotate version.
+    // Rotate right by the alignment value. The optimizer should recognize
+    // this sequence as a rotation.
+
+    // This cast is safe, since unsigned is always a subset of uint64_t.
+    uint64_t LogByteAlignment64 = static_cast<uint64_t>(LogByteAlignment);
+
+    // Subtract from the base.
+    Value *Sub = TempBuilder.CreateSub(TI, TStartInt);
+
+    // Create the equivalent of a rotate-right instruction. A rotation by zero
+    // bits is the identity; emitting the shifts anyway would require a left
+    // shift by 64, which is not a defined shift amount for a 64-bit value.
+    Value *Rotated = Sub;
+    if (LogByteAlignment64 != 0) {
+      Constant *RightShift = ConstantInt::get(Int64Ty, LogByteAlignment64);
+      Constant *LeftShift =
+          ConstantInt::get(Int64Ty, 64 - LogByteAlignment64);
+      Value *Shr = TempBuilder.CreateLShr(Sub, RightShift);
+      Value *Shl = TempBuilder.CreateShl(Sub, LeftShift);
+      Rotated = TempBuilder.CreateOr(Shr, Shl);
+    }
+
+    // Perform unsigned comparison to check for inclusion in the table.
+    // NOTE(review): this variant leaves the pointer itself unchanged and
+    // Check is only consumed in non-enforcing mode, so in enforcing mode the
+    // call target is not altered. Confirm that this is intended.
+    Check = TempBuilder.CreateICmpULT(Rotated, JumpTableSize);
+    NewFunPtr = FunPtr;
+    break;
+  }
+  case CFIntegrity::Add: {
+    // This is the mask and add version.
+    // Mask the function pointer to turn it into an offset into the table.
+    Value *And = TempBuilder.CreateAnd(TI, JumpTableMask);
+
+    // Then add this offset to the base and get the pointer value.
+    Value *Result = TempBuilder.CreateAdd(And, TStartInt);
+
+    // Convert it back into a function pointer that we can call.
+    NewFunPtr = TempBuilder.CreateIntToPtr(Result, OrigFunType);
+    break;
+  }
+  }
+
+  if (!CFIEnforcing) {
+    // If a check hasn't been added (in the rotation version), then check to see
+    // if it's the same as the original function. This check determines whether
+    // or not we call the CFI failure function.
+    if (!Check)
+      Check = TempBuilder.CreateICmpEQ(NewFunPtr, FunPtr);
+    BasicBlock *InvalidPtrBlock =
+        BasicBlock::Create(M.getContext(), "invalid.ptr", CurF, nullptr);
+    BasicBlock *ContinuationBB = CurBB->splitBasicBlock(I);
+
+    // Remove the unconditional branch that connects the two blocks.
+    TerminatorInst *TermInst = CurBB->getTerminator();
+    TermInst->eraseFromParent();
+
+    // Add a conditional branch that depends on the Check above.
+    BranchInst::Create(ContinuationBB, InvalidPtrBlock, Check, CurBB);
+
+    // Call the warning function for this pointer, then continue.
+    Instruction *BI = BranchInst::Create(ContinuationBB, InvalidPtrBlock);
+    insertWarning(M, InvalidPtrBlock, BI, FunPtr);
+  } else {
+    // Modify the instruction to call this value.
+    CallSite CS(I);
+    CS.setCalledFunction(NewFunPtr);
+  }
+}
+
+// Inserts, right before instruction I in Block, a call to the CFI failure
+// function. The call receives the name of the enclosing function and the
+// offending function pointer, both as i8*.
+void ForwardControlFlowIntegrity::insertWarning(Module &M, BasicBlock *Block,
+                                                Instruction *I, Value *FunPtr) {
+  Function *ParentFun = cast<Function>(Block->getParent());
+
+  // Use the configured failure function when a name was given, and the
+  // default one otherwise.
+  StringRef WarningName = CFIFuncName.empty()
+                              ? StringRef(cfi_failure_func_name)
+                              : StringRef(CFIFuncName);
+  Function *WarningFun = M.getFunction(WarningName);
+
+  assert(WarningFun && "Could not find the CFI failure function");
+
+  Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
+  IRBuilder<> WarningInserter(I);
+
+  // The enclosing function's name is passed as a global string constant.
+  Value *NameGlobal = WarningInserter.CreateGlobalString(ParentFun->getName());
+  Value *NameAsVoidPtr = WarningInserter.CreateBitCast(NameGlobal, VoidPtrTy);
+  Value *FunAsVoidPtr = WarningInserter.CreateBitCast(FunPtr, VoidPtrTy);
+  WarningInserter.CreateCall2(WarningFun, NameAsVoidPtr, FunAsVoidPtr);
+}

diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp
index c3e4f3e..ed40982 100644
--- a/lib/CodeGen/GCMetadata.cpp
+++ b/lib/CodeGen/GCMetadata.cpp

@@ -73,7 +73,7 @@
       std::unique_ptr<GCStrategy> S = I->instantiate();
       S->M = M;
       S->Name = Name;
-      StrategyMap.GetOrCreateValue(Name).setValue(S.get());
+      StrategyMap[Name] = S.get();
       StrategyList.push_back(std::move(S));
       return StrategyList.back().get();
     }

diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp
index 1fdff6b..b346657 100644
--- a/lib/CodeGen/GCStrategy.cpp
+++ b/lib/CodeGen/GCStrategy.cpp

@@ -31,6 +31,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -377,7 +378,7 @@
 }
 
 void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
-  const TargetFrameLowering *TFI = TM->getFrameLowering();
+  const TargetFrameLowering *TFI = TM->getSubtargetImpl()->getFrameLowering();
   assert(TFI && "TargetRegisterInfo not available!");
 
   for (GCFunctionInfo::roots_iterator RI = FI->roots_begin();
@@ -403,7 +404,7 @@
 
   TM = &MF.getTarget();
   MMI = &getAnalysis<MachineModuleInfo>();
-  TII = TM->getInstrInfo();
+  TII = TM->getSubtargetImpl()->getInstrInfo();
 
   // Find the size of the stack frame.
   FI->setFrameSize(MF.getFrameInfo()->getStackSize());

diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index 027ee38..457d7d6 100644
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp

@@ -68,6 +68,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "global-merge"
@@ -142,7 +143,7 @@
 
 bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                           Module &M, bool isConst, unsigned AddrSpace) const {
-  const TargetLowering *TLI = TM->getTargetLowering();
+  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
   const DataLayout *DL = TLI->getDataLayout();
 
   // FIXME: Infer the maximum possible offset depending on the actual users
@@ -281,7 +282,7 @@
 
   DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
                                                         BSSGlobals;
-  const TargetLowering *TLI = TM->getTargetLowering();
+  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
   const DataLayout *DL = TLI->getDataLayout();
   unsigned MaxOffset = TLI->getMaximalGlobalOffset();
   bool Changed = false;

diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 1502d5f..e84d25d 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp

@@ -17,6 +17,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -30,7 +31,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
@@ -161,6 +161,7 @@
     const TargetLoweringBase *TLI;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
+    const MachineBlockFrequencyInfo *MBFI;
     const MachineBranchProbabilityInfo *MBPI;
     MachineRegisterInfo *MRI;
 
@@ -177,6 +178,7 @@
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<MachineBlockFrequencyInfo>();
       AU.addRequired<MachineBranchProbabilityInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -269,15 +271,16 @@
 INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false)
 
 bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
-  TLI = MF.getTarget().getTargetLowering();
-  TII = MF.getTarget().getInstrInfo();
-  TRI = MF.getTarget().getRegisterInfo();
+  TLI = MF.getSubtarget().getTargetLowering();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   MRI = &MF.getRegInfo();
 
   const TargetSubtargetInfo &ST =
     MF.getTarget().getSubtarget<TargetSubtargetInfo>();
-  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+  SchedModel.init(ST.getSchedModel(), &ST, TII);
 
   if (!TII) return false;
 
@@ -286,9 +289,8 @@
   bool BFChange = false;
   if (!PreRegAlloc) {
     // Tail merge tend to expose more if-conversion opportunities.
-    BranchFolder BF(true, false);
-    BFChange = BF.OptimizeFunction(MF, TII,
-                                   MF.getTarget().getRegisterInfo(),
+    BranchFolder BF(true, false, *MBFI, *MBPI);
+    BFChange = BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
                                    getAnalysisIfAvailable<MachineModuleInfo>());
   }
 
@@ -420,9 +422,8 @@
   BBAnalysis.clear();
 
   if (MadeChange && IfCvtBranchFold) {
-    BranchFolder BF(false, false);
-    BF.OptimizeFunction(MF, TII,
-                        MF.getTarget().getRegisterInfo(),
+    BranchFolder BF(false, false, *MBFI, *MBPI);
+    BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
                         getAnalysisIfAvailable<MachineModuleInfo>());
   }
 
@@ -940,9 +941,8 @@
 /// to determine if it can be if-converted. If predecessor is already enqueued,
 /// dequeue it!
 void IfConverter::InvalidatePreds(MachineBasicBlock *BB) {
-  for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
-         E = BB->pred_end(); PI != E; ++PI) {
-    BBInfo &PBBI = BBAnalysis[(*PI)->getNumber()];
+  for (const auto &Predecessor : BB->predecessors()) {
+    BBInfo &PBBI = BBAnalysis[Predecessor->getNumber()];
     if (PBBI.IsDone || PBBI.BB == BB)
       continue;
     PBBI.IsAnalyzed = false;
@@ -1184,6 +1184,7 @@
   bool HasEarlyExit = CvtBBI->FalseBB != nullptr;
   uint64_t CvtNext = 0, CvtFalse = 0, BBNext = 0, BBCvt = 0, SumWeight = 0;
   uint32_t WeightScale = 0;
+
   if (HasEarlyExit) {
     // Get weights before modifying CvtBBI->BB and BBI.BB.
     CvtNext = MBPI->getEdgeWeight(CvtBBI->BB, NextBBI->BB);
@@ -1192,6 +1193,7 @@
     BBCvt = MBPI->getEdgeWeight(BBI.BB, CvtBBI->BB);
     SumWeight = MBPI->getSumForBlock(CvtBBI->BB, WeightScale);
   }
+
   if (CvtBBI->BB->pred_size() > 1) {
     BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
     // Copy instructions in the true block, predicate them, and add them to

diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index f3c8d3d..6a6e15d 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp

@@ -34,7 +34,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
@@ -139,21 +138,16 @@
   ~InlineSpiller() {}
 
 public:
-  InlineSpiller(MachineFunctionPass &pass,
-                MachineFunction &mf,
-                VirtRegMap &vrm)
-    : MF(mf),
-      LIS(pass.getAnalysis<LiveIntervals>()),
-      LSS(pass.getAnalysis<LiveStacks>()),
-      AA(&pass.getAnalysis<AliasAnalysis>()),
-      MDT(pass.getAnalysis<MachineDominatorTree>()),
-      Loops(pass.getAnalysis<MachineLoopInfo>()),
-      VRM(vrm),
-      MFI(*mf.getFrameInfo()),
-      MRI(mf.getRegInfo()),
-      TII(*mf.getTarget().getInstrInfo()),
-      TRI(*mf.getTarget().getRegisterInfo()),
-      MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
+  InlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm)
+      : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
+        LSS(pass.getAnalysis<LiveStacks>()),
+        AA(&pass.getAnalysis<AliasAnalysis>()),
+        MDT(pass.getAnalysis<MachineDominatorTree>()),
+        Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+        MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
+        TII(*mf.getSubtarget().getInstrInfo()),
+        TRI(*mf.getSubtarget().getRegisterInfo()),
+        MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
 
   void spill(LiveRangeEdit &) override;
 
@@ -190,11 +184,16 @@
 }
 
 namespace llvm {
+
+Spiller::~Spiller() { }
+void Spiller::anchor() { }
+
 Spiller *createInlineSpiller(MachineFunctionPass &pass,
                              MachineFunction &mf,
                              VirtRegMap &vrm) {
   return new InlineSpiller(pass, mf, vrm);
 }
+
 }
 
 //===----------------------------------------------------------------------===//
@@ -824,7 +823,7 @@
   WorkList.push_back(std::make_pair(LI, VNI));
   do {
     std::tie(LI, VNI) = WorkList.pop_back_val();
-    if (!UsedValues.insert(VNI))
+    if (!UsedValues.insert(VNI).second)
       continue;
 
     if (VNI->isPHIDef()) {
@@ -853,6 +852,15 @@
 /// reMaterializeFor - Attempt to rematerialize before MI instead of reloading.
 bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
                                      MachineBasicBlock::iterator MI) {
+
+  // Analyze instruction
+  SmallVector<std::pair<MachineInstr *, unsigned>, 8> Ops;
+  MIBundleOperands::VirtRegInfo RI =
+    MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops);
+
+  if (!RI.Reads)
+    return false;
+
   SlotIndex UseIdx = LIS.getInstructionIndex(MI).getRegSlot(true);
   VNInfo *ParentVNI = VirtReg.getVNInfoAt(UseIdx.getBaseIndex());
 
@@ -883,9 +891,6 @@
 
   // If the instruction also writes VirtReg.reg, it had better not require the
   // same register for uses and defs.
-  SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
-  MIBundleOperands::VirtRegInfo RI =
-    MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops);
   if (RI.Tied) {
     markValueUsed(&VirtReg, ParentVNI);
     DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << *MI);
@@ -939,10 +944,15 @@
   for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) {
     unsigned Reg = RegsToSpill[i];
     LiveInterval &LI = LIS.getInterval(Reg);
-    for (MachineRegisterInfo::use_bundle_nodbg_iterator
-         RI = MRI.use_bundle_nodbg_begin(Reg), E = MRI.use_bundle_nodbg_end();
-         RI != E; ) {
-      MachineInstr *MI = &*(RI++);
+    for (MachineRegisterInfo::reg_bundle_iterator
+           RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end();
+         RegI != E; ) {
+      MachineInstr *MI = &*(RegI++);
+
+      // Debug values are not allowed to affect codegen.
+      if (MI->isDebugValue())
+        continue;
+
       anyRemat |= reMaterializeFor(LI, MI);
     }
   }
@@ -1218,12 +1228,16 @@
       // Modify DBG_VALUE now that the value is in a spill slot.
       bool IsIndirect = MI->isIndirectDebugValue();
       uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
-      const MDNode *MDPtr = MI->getOperand(2).getMetadata();
+      const MDNode *Var = MI->getDebugVariable();
+      const MDNode *Expr = MI->getDebugExpression();
       DebugLoc DL = MI->getDebugLoc();
       DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI);
       MachineBasicBlock *MBB = MI->getParent();
       BuildMI(*MBB, MBB->erase(MI), DL, TII.get(TargetOpcode::DBG_VALUE))
-          .addFrameIndex(StackSlot).addImm(Offset).addMetadata(MDPtr);
+          .addFrameIndex(StackSlot)
+          .addImm(Offset)
+          .addMetadata(Var)
+          .addMetadata(Expr);
       continue;
     }
 
@@ -1363,7 +1377,7 @@
   StackInt = nullptr;
 
   DEBUG(dbgs() << "Inline spilling "
-               << MRI.getRegClass(edit.getReg())->getName()
+               << TRI.getRegClassName(MRI.getRegClass(edit.getReg()))
                << ':' << edit.getParent()
                << "\nFrom original " << PrintReg(Original) << '\n');
   assert(edit.getParent().isSpillable() &&

diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h
index 91a1da9..1791afb 100644
--- a/lib/CodeGen/InterferenceCache.h
+++ b/lib/CodeGen/InterferenceCache.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_INTERFERENCECACHE
-#define LLVM_CODEGEN_INTERFERENCECACHE
+#ifndef LLVM_LIB_CODEGEN_INTERFERENCECACHE_H
+#define LLVM_LIB_CODEGEN_INTERFERENCECACHE_H
 
 #include "llvm/CodeGen/LiveIntervalUnion.h"
 

diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index a8b8600..2c95e9e 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp

@@ -459,9 +459,10 @@
     CI->replaceAllUsesWith(CI->getOperand(0));
     break;
 
+  case Intrinsic::assume:
   case Intrinsic::var_annotation:
-    break;   // Strip out annotate intrinsic
-    
+    break;   // Strip out these intrinsics
+ 
   case Intrinsic::memcpy: {
     Type *IntPtr = DL.getIntPtrType(Context);
     Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
@@ -527,6 +528,34 @@
     ReplaceFPIntrinsicWithCall(CI, "powf", "pow", "powl");
     break;
   }
+  case Intrinsic::sin: {
+    ReplaceFPIntrinsicWithCall(CI, "sinf", "sin", "sinl");
+    break;
+  }
+  case Intrinsic::cos: {
+    ReplaceFPIntrinsicWithCall(CI, "cosf", "cos", "cosl");
+    break;
+  }
+  case Intrinsic::floor: {
+    ReplaceFPIntrinsicWithCall(CI, "floorf", "floor", "floorl");
+    break;
+  }
+  case Intrinsic::ceil: {
+    ReplaceFPIntrinsicWithCall(CI, "ceilf", "ceil", "ceill");
+    break;
+  }
+  case Intrinsic::trunc: {
+    ReplaceFPIntrinsicWithCall(CI, "truncf", "trunc", "truncl");
+    break;
+  }
+  case Intrinsic::round: {
+    ReplaceFPIntrinsicWithCall(CI, "roundf", "round", "roundl");
+    break;
+  }
+  case Intrinsic::copysign: {
+    ReplaceFPIntrinsicWithCall(CI, "copysignf", "copysign", "copysignl");
+    break;
+  }
   case Intrinsic::flt_rounds:
      // Lower to "round to the nearest"
      if (!CI->getType()->isVoidTy())

diff --git a/lib/CodeGen/JITCodeEmitter.cpp b/lib/CodeGen/JITCodeEmitter.cpp
deleted file mode 100644
index 96a5389..0000000
--- a/lib/CodeGen/JITCodeEmitter.cpp
+++ /dev/null

@@ -1,14 +0,0 @@
-//===-- llvm/CodeGen/JITCodeEmitter.cpp - Code emission --------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/JITCodeEmitter.h"
-
-using namespace llvm;
-
-void JITCodeEmitter::anchor() { }

diff --git a/lib/CodeGen/JumpInstrTables.cpp b/lib/CodeGen/JumpInstrTables.cpp
index 61ef722..20f775c 100644
--- a/lib/CodeGen/JumpInstrTables.cpp
+++ b/lib/CodeGen/JumpInstrTables.cpp

@@ -163,7 +163,7 @@
 
 Function *JumpInstrTables::insertEntry(Module &M, Function *Target) {
   FunctionType *OrigFunTy = Target->getFunctionType();
-  FunctionType *FunTy = transformType(OrigFunTy);
+  FunctionType *FunTy = transformType(JTType, OrigFunTy);
 
   JumpMap::iterator it = Metadata.find(FunTy);
   if (Metadata.end() == it) {
@@ -191,11 +191,12 @@
 }
 
 bool JumpInstrTables::hasTable(FunctionType *FunTy) {
-  FunctionType *TransTy = transformType(FunTy);
+  FunctionType *TransTy = transformType(JTType, FunTy);
   return Metadata.end() != Metadata.find(TransTy);
 }
 
-FunctionType *JumpInstrTables::transformType(FunctionType *FunTy) {
+FunctionType *JumpInstrTables::transformType(JumpTable::JumpTableType JTT,
+                                             FunctionType *FunTy) {
   // Returning nullptr forces all types into the same table, since all types map
   // to the same type
   Type *VoidPtrTy = Type::getInt8PtrTy(FunTy->getContext());
@@ -211,7 +212,7 @@
   Type *Int32Ty = Type::getInt32Ty(FunTy->getContext());
   FunctionType *VoidFnTy = FunctionType::get(
       Type::getVoidTy(FunTy->getContext()), EmptyParams, false);
-  switch (JTType) {
+  switch (JTT) {
   case JumpTable::Single:
 
     return FunctionType::get(RetTy, EmptyParams, false);
@@ -251,16 +252,12 @@
 }
 
 bool JumpInstrTables::runOnModule(Module &M) {
-  // Make sure the module is well-formed, especially with respect to jumptable.
-  if (verifyModule(M))
-    return false;
-
   JITI = &getAnalysis<JumpInstrTableInfo>();
 
-  // Get the set of jumptable-annotated functions.
+  // Get the set of jumptable-annotated functions that have their address taken.
   DenseMap<Function *, Function *> Functions;
   for (Function &F : M) {
-    if (F.hasFnAttribute(Attribute::JumpTable)) {
+    if (F.hasFnAttribute(Attribute::JumpTable) && F.hasAddressTaken()) {
       assert(F.hasUnnamedAddr() &&
              "Attribute 'jumptable' requires 'unnamed_addr'");
       Functions[&F] = nullptr;

diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index df96b94..61face2 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp

@@ -13,8 +13,10 @@
 
 #include "llvm/Target/TargetMachine.h"
 
+#include "llvm/Analysis/JumpInstrTableInfo.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/ForwardControlFlowIntegrity.h"
 #include "llvm/CodeGen/JumpInstrTables.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
@@ -48,8 +50,8 @@
   cl::desc("Enable the \"fast\" instruction selector"));
 
 void LLVMTargetMachine::initAsmInfo() {
-  MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo(*getRegisterInfo(),
-                                                    TargetTriple);
+  MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo(
+      *getSubtargetImpl()->getRegisterInfo(), getTargetTriple());
   // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0,
   // and if the old one gets included then MCAsmInfo will be NULL and
   // we'll crash later.
@@ -110,9 +112,9 @@
 
   // Install a MachineModuleInfo class, which is an immutable pass that holds
   // all the per-module stuff we're generating, including MCContext.
-  MachineModuleInfo *MMI =
-    new MachineModuleInfo(*TM->getMCAsmInfo(), *TM->getRegisterInfo(),
-                          &TM->getTargetLowering()->getObjFileLowering());
+  MachineModuleInfo *MMI = new MachineModuleInfo(
+      *TM->getMCAsmInfo(), *TM->getSubtargetImpl()->getRegisterInfo(),
+      &TM->getSubtargetImpl()->getTargetLowering()->getObjFileLowering());
   PM.add(MMI);
 
   // Set up a MachineFunction for the rest of CodeGen to work on.
@@ -143,8 +145,13 @@
                                             AnalysisID StopAfter) {
   // Passes to handle jumptable function annotations. These can't be handled at
   // JIT time, so we don't add them directly to addPassesToGenerateCode.
-  PM.add(createJumpInstrTableInfoPass());
+  PM.add(createJumpInstrTableInfoPass(
+      getSubtargetImpl()->getInstrInfo()->getJumpInstrTableEntryBound()));
   PM.add(createJumpInstrTablesPass(Options.JTType));
+  if (Options.FCFI)
+    PM.add(createForwardControlFlowIntegrityPass(
+        Options.JTType, Options.CFIType, Options.CFIEnforcing,
+        Options.getCFIFuncName()));
 
   // Add common CodeGen passes.
   MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify,
@@ -165,10 +172,10 @@
   if (Options.MCOptions.MCSaveTempLabels)
     Context->setAllowTemporaryLabels(false);
 
-  const MCAsmInfo &MAI = *getMCAsmInfo();
-  const MCRegisterInfo &MRI = *getRegisterInfo();
-  const MCInstrInfo &MII = *getInstrInfo();
   const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
+  const MCAsmInfo &MAI = *getMCAsmInfo();
+  const MCRegisterInfo &MRI = *getSubtargetImpl()->getRegisterInfo();
+  const MCInstrInfo &MII = *getSubtargetImpl()->getInstrInfo();
   std::unique_ptr<MCStreamer> AsmStreamer;
 
   switch (FileType) {
@@ -201,9 +208,10 @@
     if (!MCE || !MAB)
       return true;
 
-    AsmStreamer.reset(getTarget().createMCObjectStreamer(
-        getTargetTriple(), *Context, *MAB, Out, MCE, STI,
-        Options.MCOptions.MCRelaxAll, Options.MCOptions.MCNoExecStack));
+    AsmStreamer.reset(
+        getTarget()
+            .createMCObjectStreamer(getTargetTriple(), *Context, *MAB, Out, MCE,
+                                    STI, Options.MCOptions.MCRelaxAll));
     break;
   }
   case CGFT_Null:
@@ -226,26 +234,6 @@
   return false;
 }
 
-/// addPassesToEmitMachineCode - Add passes to the specified pass manager to
-/// get machine code emitted.  This uses a JITCodeEmitter object to handle
-/// actually outputting the machine code and resolving things like the address
-/// of functions.  This method should return true if machine code emission is
-/// not supported.
-///
-bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
-                                                   JITCodeEmitter &JCE,
-                                                   bool DisableVerify) {
-  // Add common CodeGen passes.
-  MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, nullptr,
-                                               nullptr);
-  if (!Context)
-    return true;
-
-  addCodeEmitter(PM, JCE);
-
-  return false; // success!
-}
-
 /// addPassesToEmitMC - Add passes to the specified pass manager to get
 /// machine code emitted with the MCJIT. This method returns true if machine
 /// code is not supported. It fills the MCContext Ctx pointer which can be
@@ -265,19 +253,20 @@
 
   // Create the code emitter for the target if it exists.  If not, .o file
   // emission fails.
-  const MCRegisterInfo &MRI = *getRegisterInfo();
+  const MCRegisterInfo &MRI = *getSubtargetImpl()->getRegisterInfo();
   const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
-  MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
-                                                       STI, *Ctx);
+  MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(
+      *getSubtargetImpl()->getInstrInfo(), MRI, STI, *Ctx);
   MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
                                                      TargetCPU);
   if (!MCE || !MAB)
     return true;
 
   std::unique_ptr<MCStreamer> AsmStreamer;
-  AsmStreamer.reset(getTarget().createMCObjectStreamer(
-      getTargetTriple(), *Ctx, *MAB, Out, MCE, STI,
-      Options.MCOptions.MCRelaxAll, Options.MCOptions.MCNoExecStack));
+  AsmStreamer.reset(getTarget()
+                        .createMCObjectStreamer(getTargetTriple(), *Ctx, *MAB,
+                                                Out, MCE, STI,
+                                                Options.MCOptions.MCRelaxAll));
 
   // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
   FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer);

diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
index d12c234..b621e3b 100644
--- a/lib/CodeGen/LexicalScopes.cpp
+++ b/lib/CodeGen/LexicalScopes.cpp

@@ -137,6 +137,8 @@
 /// getOrCreateLexicalScope - Find lexical scope for the given DebugLoc. If
 /// not available then create new lexical scope.
 LexicalScope *LexicalScopes::getOrCreateLexicalScope(DebugLoc DL) {
+  if (DL.isUnknown())
+    return nullptr;
   MDNode *Scope = nullptr;
   MDNode *InlinedAt = nullptr;
   DL.getScopeAndInlinedAt(Scope, InlinedAt, MF->getFunction()->getContext());
@@ -172,9 +174,12 @@
                               std::make_tuple(Parent, DIDescriptor(Scope),
                                               nullptr, false)).first;
 
-  if (!Parent && DIDescriptor(Scope).isSubprogram() &&
-      DISubprogram(Scope).describes(MF->getFunction()))
+  if (!Parent) {
+    assert(DIDescriptor(Scope).isSubprogram());
+    assert(DISubprogram(Scope).describes(MF->getFunction()));
+    assert(!CurrentFnLexicalScope);
     CurrentFnLexicalScope = &I->second;
+  }
 
   return &I->second;
 }
@@ -285,7 +290,7 @@
 /// have machine instructions that belong to lexical scope identified by
 /// DebugLoc.
 void LexicalScopes::getMachineBasicBlocks(
-    DebugLoc DL, SmallPtrSet<const MachineBasicBlock *, 4> &MBBs) {
+    DebugLoc DL, SmallPtrSetImpl<const MachineBasicBlock *> &MBBs) {
   MBBs.clear();
   LexicalScope *Scope = getOrCreateLexicalScope(DL);
   if (!Scope)

diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 7d5646b..1624851 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp

@@ -39,6 +39,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 #include <memory>
 
@@ -109,7 +110,8 @@
 namespace {
 class LDVImpl;
 class UserValue {
-  const MDNode *variable; ///< The debug info variable we are part of.
+  const MDNode *Variable;   ///< The debug info variable we are part of.
+  const MDNode *Expression; ///< Any complex address expression.
   unsigned offset;        ///< Byte offset into variable.
   bool IsIndirect;        ///< true if this is a register-indirect+offset value.
   DebugLoc dl;            ///< The debug location for the variable. This is
@@ -139,11 +141,10 @@
 
 public:
   /// UserValue - Create a new UserValue.
-  UserValue(const MDNode *var, unsigned o, bool i, DebugLoc L,
-            LocMap::Allocator &alloc)
-    : variable(var), offset(o), IsIndirect(i), dl(L), leader(this),
-      next(nullptr), locInts(alloc)
-  {}
+  UserValue(const MDNode *var, const MDNode *expr, unsigned o, bool i,
+            DebugLoc L, LocMap::Allocator &alloc)
+      : Variable(var), Expression(expr), offset(o), IsIndirect(i), dl(L),
+        leader(this), next(nullptr), locInts(alloc) {}
 
   /// getLeader - Get the leader of this value's equivalence class.
   UserValue *getLeader() {
@@ -157,8 +158,10 @@
   UserValue *getNext() const { return next; }
 
   /// match - Does this UserValue match the parameters?
-  bool match(const MDNode *Var, unsigned Offset, bool indirect) const {
-    return Var == variable && Offset == offset && indirect == IsIndirect;
+  bool match(const MDNode *Var, const MDNode *Expr, unsigned Offset,
+             bool indirect) const {
+    return Var == Variable && Expr == Expression && Offset == offset &&
+           indirect == IsIndirect;
   }
 
   /// merge - Merge equivalence classes.
@@ -267,7 +270,7 @@
                        LiveIntervals &LIS, const TargetInstrInfo &TRI);
 
   /// findDebugLoc - Return DebugLoc used for this DBG_VALUE instruction. A
-  /// variable may have more than one corresponding DBG_VALUE instructions. 
+  /// variable may have more than one corresponding DBG_VALUE instructions.
   /// Only first one needs DebugLoc to identify variable's lexical scope
   /// in source file.
   DebugLoc findDebugLoc();
@@ -306,8 +309,8 @@
   UVMap userVarMap;
 
   /// getUserValue - Find or create a UserValue.
-  UserValue *getUserValue(const MDNode *Var, unsigned Offset,
-                          bool IsIndirect, DebugLoc DL);
+  UserValue *getUserValue(const MDNode *Var, const MDNode *Expr,
+                          unsigned Offset, bool IsIndirect, DebugLoc DL);
 
   /// lookupVirtReg - Find the EC leader for VirtReg or null.
   UserValue *lookupVirtReg(unsigned VirtReg);
@@ -344,6 +347,7 @@
            "Dbg values are not emitted in LDV");
     EmitDone = false;
     ModifiedMF = false;
+    LS.reset();
   }
 
   /// mapVirtReg - Map virtual register to an equivalence class.
@@ -360,8 +364,8 @@
 } // namespace
 
 void UserValue::print(raw_ostream &OS, const TargetMachine *TM) {
-  DIVariable DV(variable);
-  OS << "!\""; 
+  DIVariable DV(Variable);
+  OS << "!\"";
   DV.printExtendedName(OS);
   OS << "\"\t";
   if (offset)
@@ -421,19 +425,20 @@
       LDV->mapVirtReg(locations[i].getReg(), this);
 }
 
-UserValue *LDVImpl::getUserValue(const MDNode *Var, unsigned Offset,
-                                 bool IsIndirect, DebugLoc DL) {
+UserValue *LDVImpl::getUserValue(const MDNode *Var, const MDNode *Expr,
+                                 unsigned Offset, bool IsIndirect,
+                                 DebugLoc DL) {
   UserValue *&Leader = userVarMap[Var];
   if (Leader) {
     UserValue *UV = Leader->getLeader();
     Leader = UV;
     for (; UV; UV = UV->getNext())
-      if (UV->match(Var, Offset, IsIndirect))
+      if (UV->match(Var, Expr, Offset, IsIndirect))
         return UV;
   }
 
   userValues.push_back(
-      make_unique<UserValue>(Var, Offset, IsIndirect, DL, allocator));
+      make_unique<UserValue>(Var, Expr, Offset, IsIndirect, DL, allocator));
   UserValue *UV = userValues.back().get();
   Leader = UserValue::merge(Leader, UV);
   return UV;
@@ -453,7 +458,7 @@
 
 bool LDVImpl::handleDebugValue(MachineInstr *MI, SlotIndex Idx) {
   // DBG_VALUE loc, offset, variable
-  if (MI->getNumOperands() != 3 ||
+  if (MI->getNumOperands() != 4 ||
       !(MI->getOperand(1).isReg() || MI->getOperand(1).isImm()) ||
       !MI->getOperand(2).isMetadata()) {
     DEBUG(dbgs() << "Can't handle " << *MI);
@@ -463,9 +468,11 @@
   // Get or create the UserValue for (variable,offset).
   bool IsIndirect = MI->isIndirectDebugValue();
   unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
-  const MDNode *Var = MI->getOperand(2).getMetadata();
+  const MDNode *Var = MI->getDebugVariable();
+  const MDNode *Expr = MI->getDebugExpression();
   //here.
-  UserValue *UV = getUserValue(Var, Offset, IsIndirect, MI->getDebugLoc());
+  UserValue *UV =
+      getUserValue(Var, Expr, Offset, IsIndirect, MI->getDebugLoc());
   UV->addDef(Idx, MI->getOperand(0));
   return true;
 }
@@ -698,7 +705,7 @@
   MF = &mf;
   LIS = &pass.getAnalysis<LiveIntervals>();
   MDT = &pass.getAnalysis<MachineDominatorTree>();
-  TRI = mf.getTarget().getRegisterInfo();
+  TRI = mf.getSubtarget().getRegisterInfo();
   LS.initialize(mf);
   DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
                << mf.getName() << " **********\n");
@@ -710,11 +717,25 @@
   return Changed;
 }
 
+static void removeDebugValues(MachineFunction &mf) {
+  for (MachineBasicBlock &MBB : mf) {
+    for (auto MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ) {
+      if (!MBBI->isDebugValue()) {
+        ++MBBI;
+        continue;
+      }
+      MBBI = MBB.erase(MBBI);
+    }
+  }
+}
+
 bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) {
   if (!EnableLDV)
     return false;
-  if (!FunctionDIs.count(mf.getFunction()))
+  if (!FunctionDIs.count(mf.getFunction())) {
+    removeDebugValues(mf);
     return false;
+  }
   if (!pImpl)
     pImpl = new LDVImpl(this);
   return static_cast<LDVImpl*>(pImpl)->runOnMachineFunction(mf);
@@ -936,10 +957,13 @@
 
   if (Loc.isReg())
     BuildMI(*MBB, I, findDebugLoc(), TII.get(TargetOpcode::DBG_VALUE),
-            IsIndirect, Loc.getReg(), offset, variable);
+            IsIndirect, Loc.getReg(), offset, Variable, Expression);
   else
     BuildMI(*MBB, I, findDebugLoc(), TII.get(TargetOpcode::DBG_VALUE))
-      .addOperand(Loc).addImm(offset).addMetadata(variable);
+        .addOperand(Loc)
+        .addImm(offset)
+        .addMetadata(Variable)
+        .addMetadata(Expression);
 }
 
 void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
@@ -979,7 +1003,7 @@
   DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
   if (!MF)
     return;
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
     DEBUG(userValues[i]->print(dbgs(), &MF->getTarget()));
     userValues[i]->rewriteLocations(*VRM, *TRI);

diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index 7ec0d17..7e3b361 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h

@@ -18,8 +18,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
-#define LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
+#ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
+#define LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/IR/DebugInfo.h"
@@ -72,4 +72,4 @@
 
 } // namespace llvm
 
-#endif // LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
+#endif

diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index ce8ce96..ddb0032 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp

@@ -206,7 +206,7 @@
   valnos.clear();
   for (const_iterator I = begin(), E = end(); I != E; ++I) {
     VNInfo *VNI = I->valno;
-    if (!Seen.insert(VNI))
+    if (!Seen.insert(VNI).second)
       continue;
     assert(!VNI->isUnused() && "Unused valno used by live segment");
     VNI->id = (unsigned)valnos.size();

diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 1559560..1742e63 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp

@@ -34,8 +34,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 #include <cmath>
 #include <limits>
@@ -110,9 +110,8 @@
 bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
   MF = &fn;
   MRI = &MF->getRegInfo();
-  TM = &fn.getTarget();
-  TRI = TM->getRegisterInfo();
-  TII = TM->getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
+  TII = MF->getSubtarget().getInstrInfo();
   AA = &getAnalysis<AliasAnalysis>();
   Indexes = &getAnalysis<SlotIndexes>();
   DomTree = &getAnalysis<MachineDominatorTree>();
@@ -380,12 +379,13 @@
       (void)ExtVNI;
       assert(ExtVNI == VNI && "Unexpected existing value number");
       // Is this a PHIDef we haven't seen before?
-      if (!VNI->isPHIDef() || VNI->def != BlockStart || !UsedPHIs.insert(VNI))
+      if (!VNI->isPHIDef() || VNI->def != BlockStart ||
+          !UsedPHIs.insert(VNI).second)
         continue;
       // The PHI is live, make sure the predecessors are live-out.
       for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
            PE = MBB->pred_end(); PI != PE; ++PI) {
-        if (!LiveOut.insert(*PI))
+        if (!LiveOut.insert(*PI).second)
           continue;
         SlotIndex Stop = getMBBEndIdx(*PI);
         // A predecessor is not required to have a live-out value for a PHI.
@@ -402,7 +402,7 @@
     // Make sure VNI is live-out from the predecessors.
     for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
          PE = MBB->pred_end(); PI != PE; ++PI) {
-      if (!LiveOut.insert(*PI))
+      if (!LiveOut.insert(*PI).second)
         continue;
       SlotIndex Stop = getMBBEndIdx(*PI);
       assert(li->getVNInfoBefore(Stop) == VNI &&
@@ -785,7 +785,7 @@
   /// Update a single live range, assuming an instruction has been moved from
   /// OldIdx to NewIdx.
   void updateRange(LiveRange &LR, unsigned Reg) {
-    if (!Updated.insert(&LR))
+    if (!Updated.insert(&LR).second)
       return;
     DEBUG({
       dbgs() << "     ";

diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index 67ab559..345d6c4 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h

@@ -19,8 +19,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_LIVERANGECALC_H
-#define LLVM_CODEGEN_LIVERANGECALC_H
+#ifndef LLVM_LIB_CODEGEN_LIVERANGECALC_H
+#define LLVM_LIB_CODEGEN_LIVERANGECALC_H
 
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/IndexedMap.h"

diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index c27d630..a0fb712 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp

@@ -411,8 +411,11 @@
   for (unsigned I = 0, Size = size(); I < Size; ++I) {
     LiveInterval &LI = LIS.getInterval(get(I));
     if (MRI.recomputeRegClass(LI.reg, MF.getTarget()))
-      DEBUG(dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
-                   << MRI.getRegClass(LI.reg)->getName() << '\n');
+      DEBUG({
+        const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+        dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
+               << TRI->getRegClassName(MRI.getRegClass(LI.reg)) << '\n';
+      });
     VRAI.calculateSpillWeightAndHint(LI);
   }
 }

diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index de2ce22..a8cae08 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp

@@ -19,7 +19,6 @@
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
@@ -48,7 +47,7 @@
 }
 
 bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) {
-  TRI = MF.getTarget().getRegisterInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   LIS = &getAnalysis<LiveIntervals>();
   VRM = &getAnalysis<VirtRegMap>();

diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp
index b3161a4..8a6ac25 100644
--- a/lib/CodeGen/LiveStackAnalysis.cpp
+++ b/lib/CodeGen/LiveStackAnalysis.cpp

@@ -20,6 +20,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <limits>
 using namespace llvm;
 
@@ -49,7 +50,7 @@
 }
 
 bool LiveStacks::runOnMachineFunction(MachineFunction &MF) {
-  TRI = MF.getTarget().getRegisterInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   // FIXME: No analysis is being done right now. We are relying on the
   // register allocators to provide the information.
   return false;
@@ -80,7 +81,7 @@
     int Slot = I->first;
     const TargetRegisterClass *RC = getIntervalRegClass(Slot);
     if (RC)
-      OS << " [" << RC->getName() << "]\n";
+      OS << " [" << TRI->getRegClassName(RC) << "]\n";
     else
       OS << " [Unknown]\n";
   }

diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 758b216..c4bca5f 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp

@@ -37,7 +37,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -497,17 +496,135 @@
   }
 }
 
+void LiveVariables::runOnInstr(MachineInstr *MI,
+                               SmallVectorImpl<unsigned> &Defs) {
+  assert(!MI->isDebugValue());
+  // Process all of the operands of the instruction...
+  unsigned NumOperandsToProcess = MI->getNumOperands();
+
+  // Unless it is a PHI node.  In this case, ONLY process the DEF, not any
+  // of the uses.  They will be handled in other basic blocks.
+  if (MI->isPHI())
+    NumOperandsToProcess = 1;
+
+  // Clear kill and dead markers. LV will recompute them.
+  SmallVector<unsigned, 4> UseRegs;
+  SmallVector<unsigned, 4> DefRegs;
+  SmallVector<unsigned, 1> RegMasks;
+  for (unsigned i = 0; i != NumOperandsToProcess; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isRegMask()) {
+      RegMasks.push_back(i);
+      continue;
+    }
+    if (!MO.isReg() || MO.getReg() == 0)
+      continue;
+    unsigned MOReg = MO.getReg();
+    if (MO.isUse()) {
+      MO.setIsKill(false);
+      if (MO.readsReg())
+        UseRegs.push_back(MOReg);
+    } else /*MO.isDef()*/ {
+      MO.setIsDead(false);
+      DefRegs.push_back(MOReg);
+    }
+  }
+
+  MachineBasicBlock *MBB = MI->getParent();
+  // Process all uses.
+  for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) {
+    unsigned MOReg = UseRegs[i];
+    if (TargetRegisterInfo::isVirtualRegister(MOReg))
+      HandleVirtRegUse(MOReg, MBB, MI);
+    else if (!MRI->isReserved(MOReg))
+      HandlePhysRegUse(MOReg, MI);
+  }
+
+  // Process all masked registers. (Call clobbers).
+  for (unsigned i = 0, e = RegMasks.size(); i != e; ++i)
+    HandleRegMask(MI->getOperand(RegMasks[i]));
+
+  // Process all defs.
+  for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) {
+    unsigned MOReg = DefRegs[i];
+    if (TargetRegisterInfo::isVirtualRegister(MOReg))
+      HandleVirtRegDef(MOReg, MI);
+    else if (!MRI->isReserved(MOReg))
+      HandlePhysRegDef(MOReg, MI, Defs);
+  }
+  UpdatePhysRegDefs(MI, Defs);
+}
+
+void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) {
+  // Mark live-in registers as live-in.
+  SmallVector<unsigned, 4> Defs;
+  for (MachineBasicBlock::livein_iterator II = MBB->livein_begin(),
+         EE = MBB->livein_end(); II != EE; ++II) {
+    assert(TargetRegisterInfo::isPhysicalRegister(*II) &&
+           "Cannot have a live-in virtual register!");
+    HandlePhysRegDef(*II, nullptr, Defs);
+  }
+
+  // Loop over all of the instructions, processing them.
+  DistanceMap.clear();
+  unsigned Dist = 0;
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+       I != E; ++I) {
+    MachineInstr *MI = I;
+    if (MI->isDebugValue())
+      continue;
+    DistanceMap.insert(std::make_pair(MI, Dist++));
+
+    runOnInstr(MI, Defs);
+  }
+
+  // Handle any virtual assignments from PHI nodes which might be at the
+  // bottom of this basic block.  We check all of our successor blocks to see
+  // if they have PHI nodes, and if so, we simulate an assignment at the end
+  // of the current block.
+  if (!PHIVarInfo[MBB->getNumber()].empty()) {
+    SmallVectorImpl<unsigned> &VarInfoVec = PHIVarInfo[MBB->getNumber()];
+
+    for (SmallVectorImpl<unsigned>::iterator I = VarInfoVec.begin(),
+           E = VarInfoVec.end(); I != E; ++I)
+      // Mark it alive only in the block we are representing.
+      MarkVirtRegAliveInBlock(getVarInfo(*I),MRI->getVRegDef(*I)->getParent(),
+                              MBB);
+  }
+
+  // MachineCSE may CSE instructions which write to non-allocatable physical
+  // registers across MBBs. Remember if any reserved register is liveout.
+  SmallSet<unsigned, 4> LiveOuts;
+  for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock *SuccMBB = *SI;
+    if (SuccMBB->isLandingPad())
+      continue;
+    for (MachineBasicBlock::livein_iterator LI = SuccMBB->livein_begin(),
+           LE = SuccMBB->livein_end(); LI != LE; ++LI) {
+      unsigned LReg = *LI;
+      if (!TRI->isInAllocatableClass(LReg))
+        // Ignore other live-ins, e.g. those that are live into landing pads.
+        LiveOuts.insert(LReg);
+    }
+  }
+
+  // Loop over PhysRegDef / PhysRegUse, killing any registers that are
+  // available at the end of the basic block.
+  for (unsigned i = 0; i != NumRegs; ++i)
+    if ((PhysRegDef[i] || PhysRegUse[i]) && !LiveOuts.count(i))
+      HandlePhysRegDef(i, nullptr, Defs);
+}
+
 bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
   MF = &mf;
   MRI = &mf.getRegInfo();
-  TRI = MF->getTarget().getRegisterInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
 
-  unsigned NumRegs = TRI->getNumRegs();
-  PhysRegDef  = new MachineInstr*[NumRegs];
-  PhysRegUse  = new MachineInstr*[NumRegs];
-  PHIVarInfo = new SmallVector<unsigned, 4>[MF->getNumBlockIDs()];
-  std::fill(PhysRegDef,  PhysRegDef  + NumRegs, nullptr);
-  std::fill(PhysRegUse,  PhysRegUse  + NumRegs, nullptr);
+  const unsigned NumRegs = TRI->getNumRegs();
+  PhysRegDef.assign(NumRegs, nullptr);
+  PhysRegUse.assign(NumRegs, nullptr);
+  PHIVarInfo.resize(MF->getNumBlockIDs());
   PHIJoins.clear();
 
   // FIXME: LiveIntervals will be updated to remove its dependence on
@@ -525,124 +642,11 @@
   MachineBasicBlock *Entry = MF->begin();
   SmallPtrSet<MachineBasicBlock*,16> Visited;
 
-  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> >
-         DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
-       DFI != E; ++DFI) {
-    MachineBasicBlock *MBB = *DFI;
+  for (MachineBasicBlock *MBB : depth_first_ext(Entry, Visited)) {
+    runOnBlock(MBB, NumRegs);
 
-    // Mark live-in registers as live-in.
-    SmallVector<unsigned, 4> Defs;
-    for (MachineBasicBlock::livein_iterator II = MBB->livein_begin(),
-           EE = MBB->livein_end(); II != EE; ++II) {
-      assert(TargetRegisterInfo::isPhysicalRegister(*II) &&
-             "Cannot have a live-in virtual register!");
-      HandlePhysRegDef(*II, nullptr, Defs);
-    }
-
-    // Loop over all of the instructions, processing them.
-    DistanceMap.clear();
-    unsigned Dist = 0;
-    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
-         I != E; ++I) {
-      MachineInstr *MI = I;
-      if (MI->isDebugValue())
-        continue;
-      DistanceMap.insert(std::make_pair(MI, Dist++));
-
-      // Process all of the operands of the instruction...
-      unsigned NumOperandsToProcess = MI->getNumOperands();
-
-      // Unless it is a PHI node.  In this case, ONLY process the DEF, not any
-      // of the uses.  They will be handled in other basic blocks.
-      if (MI->isPHI())
-        NumOperandsToProcess = 1;
-
-      // Clear kill and dead markers. LV will recompute them.
-      SmallVector<unsigned, 4> UseRegs;
-      SmallVector<unsigned, 4> DefRegs;
-      SmallVector<unsigned, 1> RegMasks;
-      for (unsigned i = 0; i != NumOperandsToProcess; ++i) {
-        MachineOperand &MO = MI->getOperand(i);
-        if (MO.isRegMask()) {
-          RegMasks.push_back(i);
-          continue;
-        }
-        if (!MO.isReg() || MO.getReg() == 0)
-          continue;
-        unsigned MOReg = MO.getReg();
-        if (MO.isUse()) {
-          MO.setIsKill(false);
-          if (MO.readsReg())
-            UseRegs.push_back(MOReg);
-        } else /*MO.isDef()*/ {
-          MO.setIsDead(false);
-          DefRegs.push_back(MOReg);
-        }
-      }
-
-      // Process all uses.
-      for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) {
-        unsigned MOReg = UseRegs[i];
-        if (TargetRegisterInfo::isVirtualRegister(MOReg))
-          HandleVirtRegUse(MOReg, MBB, MI);
-        else if (!MRI->isReserved(MOReg))
-          HandlePhysRegUse(MOReg, MI);
-      }
-
-      // Process all masked registers. (Call clobbers).
-      for (unsigned i = 0, e = RegMasks.size(); i != e; ++i)
-        HandleRegMask(MI->getOperand(RegMasks[i]));
-
-      // Process all defs.
-      for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) {
-        unsigned MOReg = DefRegs[i];
-        if (TargetRegisterInfo::isVirtualRegister(MOReg))
-          HandleVirtRegDef(MOReg, MI);
-        else if (!MRI->isReserved(MOReg))
-          HandlePhysRegDef(MOReg, MI, Defs);
-      }
-      UpdatePhysRegDefs(MI, Defs);
-    }
-
-    // Handle any virtual assignments from PHI nodes which might be at the
-    // bottom of this basic block.  We check all of our successor blocks to see
-    // if they have PHI nodes, and if so, we simulate an assignment at the end
-    // of the current block.
-    if (!PHIVarInfo[MBB->getNumber()].empty()) {
-      SmallVectorImpl<unsigned> &VarInfoVec = PHIVarInfo[MBB->getNumber()];
-
-      for (SmallVectorImpl<unsigned>::iterator I = VarInfoVec.begin(),
-             E = VarInfoVec.end(); I != E; ++I)
-        // Mark it alive only in the block we are representing.
-        MarkVirtRegAliveInBlock(getVarInfo(*I),MRI->getVRegDef(*I)->getParent(),
-                                MBB);
-    }
-
-    // MachineCSE may CSE instructions which write to non-allocatable physical
-    // registers across MBBs. Remember if any reserved register is liveout.
-    SmallSet<unsigned, 4> LiveOuts;
-    for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
-           SE = MBB->succ_end(); SI != SE; ++SI) {
-      MachineBasicBlock *SuccMBB = *SI;
-      if (SuccMBB->isLandingPad())
-        continue;
-      for (MachineBasicBlock::livein_iterator LI = SuccMBB->livein_begin(),
-             LE = SuccMBB->livein_end(); LI != LE; ++LI) {
-        unsigned LReg = *LI;
-        if (!TRI->isInAllocatableClass(LReg))
-          // Ignore other live-ins, e.g. those that are live into landing pads.
-          LiveOuts.insert(LReg);
-      }
-    }
-
-    // Loop over PhysRegDef / PhysRegUse, killing any registers that are
-    // available at the end of the basic block.
-    for (unsigned i = 0; i != NumRegs; ++i)
-      if ((PhysRegDef[i] || PhysRegUse[i]) && !LiveOuts.count(i))
-        HandlePhysRegDef(i, nullptr, Defs);
-
-    std::fill(PhysRegDef,  PhysRegDef  + NumRegs, nullptr);
-    std::fill(PhysRegUse,  PhysRegUse  + NumRegs, nullptr);
+    PhysRegDef.assign(NumRegs, nullptr);
+    PhysRegUse.assign(NumRegs, nullptr);
   }
 
   // Convert and transfer the dead / killed information we have gathered into
@@ -664,9 +668,9 @@
     assert(Visited.count(&*i) != 0 && "unreachable basic block found");
 #endif
 
-  delete[] PhysRegDef;
-  delete[] PhysRegUse;
-  delete[] PHIVarInfo;
+  PhysRegDef.clear();
+  PhysRegUse.clear();
+  PHIVarInfo.clear();
 
   return false;
 }

diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp
index 36885e8..5c5712f 100644
--- a/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/lib/CodeGen/LocalStackSlotAllocation.cpp

@@ -36,6 +36,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -102,7 +103,7 @@
 
 bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) {
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   unsigned LocalObjectCount = MFI->getObjectIndexEnd();
 
   // If the target doesn't want/need this pass, or if there are no locals
@@ -183,7 +184,7 @@
 void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
   // Loop over all of the stack objects, assigning sequential addresses...
   MachineFrameInfo *MFI = Fn.getFrameInfo();
-  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
   bool StackGrowsDown =
     TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
   int64_t Offset = 0;
@@ -272,8 +273,8 @@
   bool UsedBaseReg = false;
 
   MachineFrameInfo *MFI = Fn.getFrameInfo();
-  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
-  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+  const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
   bool StackGrowsDown =
     TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
 

diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 08fef5f..3058b1a 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp

@@ -32,6 +32,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -54,7 +55,8 @@
     const MachineFunction *MF = getParent();
     MCContext &Ctx = MF->getContext();
     const TargetMachine &TM = MF->getTarget();
-    const char *Prefix = TM.getDataLayout()->getPrivateGlobalPrefix();
+    const char *Prefix =
+        TM.getSubtargetImpl()->getDataLayout()->getPrivateGlobalPrefix();
     CachedMCSymbol = Ctx.GetOrCreateSymbol(Twine(Prefix) + "BB" +
                                            Twine(MF->getFunctionNumber()) +
                                            "_" + Twine(getNumber()));
@@ -290,7 +292,7 @@
 
   OS << '\n';
 
-  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
   if (!livein_empty()) {
     if (Indexes) OS << '\t';
     OS << "    Live Ins:";
@@ -359,7 +361,7 @@
   bool LiveIn = isLiveIn(PhysReg);
   iterator I = SkipPHIsAndLabels(begin()), E = end();
   MachineRegisterInfo &MRI = getParent()->getRegInfo();
-  const TargetInstrInfo &TII = *getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *getParent()->getSubtarget().getInstrInfo();
 
   // Look for an existing copy.
   if (LiveIn)
@@ -390,7 +392,7 @@
 }
 
 void MachineBasicBlock::updateTerminator() {
-  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
   // A block with no successors has no concerns with fall-through edges.
   if (this->succ_empty()) return;
 
@@ -645,7 +647,7 @@
   // Analyze the branches, if any, at the end of the block.
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
   SmallVector<MachineOperand, 4> Cond;
-  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
   if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) {
     // If we couldn't analyze the branch, examine the last instruction.
     // If the block doesn't end in a known control barrier, assume fallthrough
@@ -690,7 +692,7 @@
 
   // We may need to update this's terminator, but we can't do that if
   // AnalyzeBranch fails. If this uses a jump table, we won't touch it.
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
   SmallVector<MachineOperand, 4> Cond;
   if (TII->AnalyzeBranch(*this, TBB, FBB, Cond))
@@ -795,7 +797,8 @@
   NMBB->addSuccessor(Succ);
   if (!NMBB->isLayoutSuccessor(Succ)) {
     Cond.clear();
-    MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, Succ, nullptr, Cond, dl);
+    MF->getSubtarget().getInstrInfo()->InsertBranch(*NMBB, Succ, nullptr, Cond,
+                                                    dl);
 
     if (Indexes) {
       for (instr_iterator I = NMBB->instr_begin(), E = NMBB->instr_end();
@@ -823,7 +826,7 @@
     NMBB->addLiveIn(*I);
 
   // Update LiveVariables.
-  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
   if (LV) {
     // Restore kills of virtual registers that were killed by the terminators.
     while (!KilledRegs.empty()) {
@@ -903,31 +906,8 @@
   }
 
   if (MachineDominatorTree *MDT =
-      P->getAnalysisIfAvailable<MachineDominatorTree>()) {
-    // Update dominator information.
-    MachineDomTreeNode *SucccDTNode = MDT->getNode(Succ);
-
-    bool IsNewIDom = true;
-    for (const_pred_iterator PI = Succ->pred_begin(), E = Succ->pred_end();
-         PI != E; ++PI) {
-      MachineBasicBlock *PredBB = *PI;
-      if (PredBB == NMBB)
-        continue;
-      if (!MDT->dominates(SucccDTNode, MDT->getNode(PredBB))) {
-        IsNewIDom = false;
-        break;
-      }
-    }
-
-    // We know "this" dominates the newly created basic block.
-    MachineDomTreeNode *NewDTNode = MDT->addNewBlock(NMBB, this);
-
-    // If all the other predecessors of "Succ" are dominated by "Succ" itself
-    // then the new block is the new immediate dominator of "Succ". Otherwise,
-    // the new block doesn't dominate anything.
-    if (IsNewIDom)
-      MDT->changeImmediateDominator(SucccDTNode, NewDTNode);
-  }
+      P->getAnalysisIfAvailable<MachineDominatorTree>())
+    MDT->recordSplitCriticalEdge(this, Succ, NMBB);
 
   if (MachineLoopInfo *MLI = P->getAnalysisIfAvailable<MachineLoopInfo>())
     if (MachineLoop *TIL = MLI->getLoopFor(this)) {
@@ -1086,7 +1066,7 @@
   MachineBasicBlock::succ_iterator SI = succ_begin();
   while (SI != succ_end()) {
     const MachineBasicBlock *MBB = *SI;
-    if (!SeenMBBs.insert(MBB) ||
+    if (!SeenMBBs.insert(MBB).second ||
         (MBB != DestA && MBB != DestB && !MBB->isLandingPad())) {
       // This is a superfluous edge, remove it.
       SI = removeSuccessor(SI);

diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 74af1e2..08fd200 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp

@@ -42,6 +42,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -812,7 +813,7 @@
                                    BE = L.block_end();
        BI != BE; ++BI) {
     BlockChain &Chain = *BlockToChain[*BI];
-    if (!UpdatedPreds.insert(&Chain))
+    if (!UpdatedPreds.insert(&Chain).second)
       continue;
 
     assert(Chain.LoopPredecessors == 0);
@@ -913,7 +914,7 @@
   for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
     MachineBasicBlock *BB = &*FI;
     BlockChain &Chain = *BlockToChain[BB];
-    if (!UpdatedPreds.insert(&Chain))
+    if (!UpdatedPreds.insert(&Chain).second)
       continue;
 
     assert(Chain.LoopPredecessors == 0);
@@ -1111,8 +1112,8 @@
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   MLI = &getAnalysis<MachineLoopInfo>();
-  TII = F.getTarget().getInstrInfo();
-  TLI = F.getTarget().getTargetLowering();
+  TII = F.getSubtarget().getInstrInfo();
+  TLI = F.getSubtarget().getTargetLowering();
   assert(BlockToChain.empty());
 
   buildCFGChains(F);

diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index c2ab76e..ae26967 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp

@@ -25,6 +25,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/RecyclingAllocator.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "machine-cse"
@@ -78,7 +79,8 @@
     SmallVector<MachineInstr*, 64> Exps;
     unsigned CurrVN;
 
-    bool PerformTrivialCoalescing(MachineInstr *MI, MachineBasicBlock *MBB);
+    bool PerformTrivialCopyPropagation(MachineInstr *MI,
+                                       MachineBasicBlock *MBB);
     bool isPhysDefTriviallyDead(unsigned Reg,
                                 MachineBasicBlock::const_iterator I,
                                 MachineBasicBlock::const_iterator E) const;
@@ -112,8 +114,12 @@
 INITIALIZE_PASS_END(MachineCSE, "machine-cse",
                 "Machine Common Subexpression Elimination", false, false)
 
-bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI,
-                                          MachineBasicBlock *MBB) {
+/// The source register of a COPY machine instruction can be propagated to all
+/// its users, and this propagation could increase the probability of finding
+/// common subexpressions. If the COPY has only one user, the COPY itself can
+/// be removed.
+bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
+                                               MachineBasicBlock *MBB) {
   bool Changed = false;
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     MachineOperand &MO = MI->getOperand(i);
@@ -122,10 +128,7 @@
     unsigned Reg = MO.getReg();
     if (!TargetRegisterInfo::isVirtualRegister(Reg))
       continue;
-    if (!MRI->hasOneNonDBGUse(Reg))
-      // Only coalesce single use copies. This ensure the copy will be
-      // deleted.
-      continue;
+    bool OnlyOneUse = MRI->hasOneNonDBGUse(Reg);
     MachineInstr *DefMI = MRI->getVRegDef(Reg);
     if (!DefMI->isCopy())
       continue;
@@ -153,10 +156,14 @@
       continue;
     DEBUG(dbgs() << "Coalescing: " << *DefMI);
     DEBUG(dbgs() << "***     to: " << *MI);
+    // Propagate SrcReg of copies to MI.
     MO.setReg(SrcReg);
     MRI->clearKillFlags(SrcReg);
-    DefMI->eraseFromParent();
-    ++NumCoalesces;
+    // Coalesce single use copies.
+    if (OnlyOneUse) {
+      DefMI->eraseFromParent();
+      ++NumCoalesces;
+    }
     Changed = true;
   }
 
@@ -453,13 +460,15 @@
 
     bool FoundCSE = VNT.count(MI);
     if (!FoundCSE) {
-      // Look for trivial copy coalescing opportunities.
-      if (PerformTrivialCoalescing(MI, MBB)) {
+      // Using trivial copy propagation to find more CSE opportunities.
+      if (PerformTrivialCopyPropagation(MI, MBB)) {
         Changed = true;
 
         // After coalescing MI itself may become a copy.
         if (MI->isCopyLike())
           continue;
+
+        // Try again to see if CSE is possible.
         FoundCSE = VNT.count(MI);
       }
     }
@@ -663,8 +672,8 @@
   if (skipOptnoneFunction(*MF.getFunction()))
     return false;
 
-  TII = MF.getTarget().getInstrInfo();
-  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   AA = &getAnalysis<AliasAnalysis>();
   DT = &getAnalysis<MachineDominatorTree>();

diff --git a/lib/CodeGen/MachineCodeEmitter.cpp b/lib/CodeGen/MachineCodeEmitter.cpp
deleted file mode 100644
index 81b4978..0000000
--- a/lib/CodeGen/MachineCodeEmitter.cpp
+++ /dev/null

@@ -1,14 +0,0 @@
-//===-- llvm/CodeGen/MachineCodeEmitter.cpp - Code emission -----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/MachineCodeEmitter.h"
-
-using namespace llvm;
-
-void MachineCodeEmitter::anchor() { }

diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp
new file mode 100644
index 0000000..2931258
--- /dev/null
+++ b/lib/CodeGen/MachineCombiner.cpp

@@ -0,0 +1,435 @@
+//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The machine combiner pass uses machine trace metrics to ensure the combined
+// instructions do not lengthen the critical path or the resource depth.
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "machine-combiner"
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+STATISTIC(NumInstCombined, "Number of machineinst combined");
+
+namespace {
+class MachineCombiner : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MCSchedModel SchedModel;
+  MachineRegisterInfo *MRI;
+  MachineTraceMetrics *Traces;
+  MachineTraceMetrics::Ensemble *MinInstr;
+
+  TargetSchedModel TSchedModel;
+
+  /// OptSize - True if optimizing for code size.
+  bool OptSize;
+
+public:
+  static char ID;
+  MachineCombiner() : MachineFunctionPass(ID) {
+    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  const char *getPassName() const override { return "Machine InstCombiner"; }
+
+private:
+  bool doSubstitute(unsigned NewSize, unsigned OldSize);
+  bool combineInstructions(MachineBasicBlock *);
+  MachineInstr *getOperandDef(const MachineOperand &MO);
+  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                    MachineTraceMetrics::Trace BlockTrace);
+  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
+                      MachineTraceMetrics::Trace BlockTrace);
+  bool
+  preservesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
+                           MachineTraceMetrics::Trace BlockTrace,
+                           SmallVectorImpl<MachineInstr *> &InsInstrs,
+                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg);
+  bool preservesResourceLen(MachineBasicBlock *MBB,
+                            MachineTraceMetrics::Trace BlockTrace,
+                            SmallVectorImpl<MachineInstr *> &InsInstrs,
+                            SmallVectorImpl<MachineInstr *> &DelInstrs);
+  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
+                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
+};
+}
+
+char MachineCombiner::ID = 0;
+char &llvm::MachineCombinerID = MachineCombiner::ID;
+
+INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
+                      "Machine InstCombiner", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
+                    false, false)
+
+void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addRequired<MachineTraceMetrics>();
+  AU.addPreserved<MachineTraceMetrics>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
+  MachineInstr *DefInstr = nullptr;
+  // We need a virtual register definition.
+  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
+  // PHI's have no depth etc.
+  if (DefInstr && DefInstr->isPHI())
+    DefInstr = nullptr;
+  return DefInstr;
+}
+
+/// getDepth - Computes depth of instructions in vector \p InsInstrs.
+///
+/// \param InsInstrs is a vector of machine instructions
+/// \param InstrIdxForVirtReg is a dense map of virtual register to index
+/// of defining machine instruction in \p InsInstrs
+/// \param BlockTrace is a trace of machine instructions
+///
+/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
+unsigned
+MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                          MachineTraceMetrics::Trace BlockTrace) {
+
+  SmallVector<unsigned, 16> InstrDepth;
+  assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
+
+  // For each instruction in the new sequence compute the depth based on the
+  // operands. Use the trace information when possible. For new operands which
+  // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
+  for (auto *InstrPtr : InsInstrs) { // for each new instruction
+    unsigned IDepth = 0;
+    DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
+    for (unsigned i = 0, e = InstrPtr->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = InstrPtr->getOperand(i);
+      // Check for virtual register operand.
+      if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+        continue;
+      if (!MO.isUse())
+        continue;
+      unsigned DepthOp = 0;
+      unsigned LatencyOp = 0;
+      DenseMap<unsigned, unsigned>::iterator II =
+          InstrIdxForVirtReg.find(MO.getReg());
+      if (II != InstrIdxForVirtReg.end()) {
+        // Operand is new virtual register not in trace
+        assert(II->second < InstrDepth.size() && "Bad Index");
+        MachineInstr *DefInstr = InsInstrs[II->second];
+        assert(DefInstr &&
+               "There must be a definition for a new virtual register");
+        DepthOp = InstrDepth[II->second];
+        LatencyOp = TSchedModel.computeOperandLatency(
+            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
+            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+      } else {
+        MachineInstr *DefInstr = getOperandDef(MO);
+        if (DefInstr) {
+          DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth;
+          LatencyOp = TSchedModel.computeOperandLatency(
+              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
+              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+        }
+      }
+      IDepth = std::max(IDepth, DepthOp + LatencyOp);
+    }
+    InstrDepth.push_back(IDepth);
+  }
+  unsigned NewRootIdx = InsInstrs.size() - 1;
+  return InstrDepth[NewRootIdx];
+}
+
+/// getLatency - Computes instruction latency as max of latency of defined
+/// operands
+///
+/// \param Root is a machine instruction that could be replaced by NewRoot.
+/// It is used to compute a more accurate latency information for NewRoot in
+/// case there is a dependent instruction in the same trace (\p BlockTrace)
+/// \param NewRoot is the instruction for which the latency is computed
+/// \param BlockTrace is a trace of machine instructions
+///
+/// \returns Latency of \p NewRoot
+unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
+                                     MachineTraceMetrics::Trace BlockTrace) {
+  assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
+
+  // Check each definition in NewRoot and compute the latency
+  unsigned NewRootLatency = 0;
+
+  for (unsigned i = 0, e = NewRoot->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = NewRoot->getOperand(i);
+    // Check for virtual register operand.
+    if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+      continue;
+    if (!MO.isDef())
+      continue;
+    // Find the next operand after the first; the list may end before it.
+    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
+    if (RI != MRI->reg_end())
+      RI++;
+    MachineInstr *UseMO = RI != MRI->reg_end() ? RI->getParent() : nullptr;
+    unsigned LatencyOp = 0;
+    if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) {
+      LatencyOp = TSchedModel.computeOperandLatency(
+          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
+          UseMO->findRegisterUseOperandIdx(MO.getReg()));
+    } else {
+      LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode());
+    }
+    NewRootLatency = std::max(NewRootLatency, LatencyOp);
+  }
+  return NewRootLatency;
+}
+
+/// preservesCriticalPathLen - True when the new instruction sequence does not
+/// lengthen the critical path. The DAGCombine code sequence ends in MI
+/// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. A
+/// necessary condition for the new sequence to replace the old sequence is that
+/// it cannot lengthen the critical path. This is decided by the formula
+/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack).
+/// The slack is the number of cycles Root can be delayed before the critical
+/// path becomes longer.
+bool MachineCombiner::preservesCriticalPathLen(
+    MachineBasicBlock *MBB, MachineInstr *Root,
+    MachineTraceMetrics::Trace BlockTrace,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
+
+  assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
+  // NewRoot is the last instruction in the \p InsInstrs vector
+  // Get depth and latency of NewRoot
+  unsigned NewRootIdx = InsInstrs.size() - 1;
+  MachineInstr *NewRoot = InsInstrs[NewRootIdx];
+  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
+  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
+
+  // Get depth, latency and slack of Root
+  unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
+  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
+  unsigned RootSlack = BlockTrace.getInstrSlack(Root);
+
+  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
+        dbgs() << " NewRootDepth: " << NewRootDepth
+               << " NewRootLatency: " << NewRootLatency << "\n";
+        dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency
+               << " RootSlack: " << RootSlack << "\n";
+        dbgs() << " NewRootDepth + NewRootLatency "
+               << NewRootDepth + NewRootLatency << "\n";
+        dbgs() << " RootDepth + RootLatency + RootSlack "
+               << RootDepth + RootLatency + RootSlack << "\n";);
+
+  // True when the new sequence does not lengthen the critical path.
+  return ((NewRootDepth + NewRootLatency) <=
+          (RootDepth + RootLatency + RootSlack));
+}
+
+/// Helper routine: append the scheduling class descriptor of each instruction.
+void MachineCombiner::instr2instrSC(
+    SmallVectorImpl<MachineInstr *> &Instrs,
+    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
+  for (auto *InstrPtr : Instrs) {
+    unsigned Opc = InstrPtr->getOpcode();
+    unsigned Idx = TII->get(Opc).getSchedClass();
+    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
+    InstrsSC.push_back(SC);
+  }
+}
+/// preservesResourceLen - True when the new instructions do not increase
+/// resource length
+bool MachineCombiner::preservesResourceLen(
+    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs) {
+
+  // Compute current resource length
+
+  // Wrap MBB in a one-element vector to pass to getResourceLength().
+  SmallVector <const MachineBasicBlock *, 1> MBBarr;
+  MBBarr.push_back(MBB);
+  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
+
+  // Deal with SC rather than Instructions.
+  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
+  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;
+
+  instr2instrSC(InsInstrs, InsInstrsSC);
+  instr2instrSC(DelInstrs, DelInstrsSC);
+
+  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
+  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);
+
+  // Compute new resource length
+  unsigned ResLenAfterCombine =
+      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
+
+  DEBUG(dbgs() << "RESOURCE DATA: \n";
+        dbgs() << " resource len before: " << ResLenBeforeCombine
+               << " after: " << ResLenAfterCombine << "\n";);
+
+  return ResLenAfterCombine <= ResLenBeforeCombine;
+}
+
+/// \returns true when the new instruction sequence should be generated
+/// regardless of whether it lengthens the critical path.
+bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
+  if (OptSize && (NewSize < OldSize))
+    return true;
+  if (!TSchedModel.hasInstrSchedModel())
+    return true;
+  return false;
+}
+
+/// combineInstructions - substitute a slow code sequence with a faster one by
+/// evaluating instruction combining pattern.
+/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
+/// combining based on machine trace metrics. Only combine a sequence of
+/// instructions when this neither lengthens the critical path nor increases
+/// resource pressure. When optimizing for codesize always combine when the new
+/// sequence is shorter.
+bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
+  bool Changed = false;
+  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
+
+  auto BlockIter = MBB->begin();
+
+  while (BlockIter != MBB->end()) {
+    auto &MI = *BlockIter++;
+
+    DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
+    SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Pattern;
+    // The motivating example is:
+    //
+    //     MUL  Other        MUL_op1 MUL_op2  Other
+    //      \    /               \      |    /
+    //      ADD/SUB      =>        MADD/MSUB
+    //      (=Root)                (=NewRoot)
+
+    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
+    // usually beneficial for code size it unfortunately can hurt performance
+    // when the ADD is on the critical path, but the MUL is not. With the
+    // substitution the MUL becomes part of the critical path (in form of the
+    // MADD) and can lengthen it on architectures where the MADD latency is
+    // longer than the ADD latency.
+    //
+    // For each instruction we check if it can be the root of a combiner
+    // pattern. Then for each pattern the new code sequence in form of MI is
+    // generated and evaluated. When the efficiency criteria (don't lengthen
+    // critical path, don't use more resources) is met the new sequence gets
+    // hooked up into the basic block before the old sequence is removed.
+    //
+    // The algorithm does not try to evaluate all patterns and pick the best.
+    // This is only an artificial restriction though. In practice there is
+    // mostly one pattern and hasPattern() can order patterns based on an
+    // internal cost heuristic.
+
+    if (TII->hasPattern(MI, Pattern)) {
+      for (auto P : Pattern) {
+        SmallVector<MachineInstr *, 16> InsInstrs;
+        SmallVector<MachineInstr *, 16> DelInstrs;
+        DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+        if (!MinInstr)
+          MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
+        Traces->verifyAnalysis();
+        TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
+                                        InstrIdxForVirtReg);
+        // Found pattern, but did not generate alternative sequence.
+        // This can happen e.g. when an immediate could not be materialized
+        // in a single instruction.
+        if (!InsInstrs.size())
+          continue;
+        // Substitute when we optimize for codesize and the new sequence has
+        // fewer instructions OR
+        // the new sequence neither lengthens the critical path nor increases
+        // resource pressure.
+        if (doSubstitute(InsInstrs.size(), DelInstrs.size()) ||
+            (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
+                                      InstrIdxForVirtReg) &&
+             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
+          for (auto *InstrPtr : InsInstrs)
+            MBB->insert((MachineBasicBlock::iterator) & MI,
+                        (MachineInstr *)InstrPtr);
+          for (auto *InstrPtr : DelInstrs)
+            InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
+
+          Changed = true;
+          ++NumInstCombined;
+
+          Traces->invalidate(MBB);
+          Traces->verifyAnalysis();
+          // Eagerly stop after the first pattern fired
+          break;
+        } else {
+          // Cleanup instructions of the alternative code sequence. There is no
+          // use for them.
+          for (auto *InstrPtr : InsInstrs) {
+            MachineFunction *MF = MBB->getParent();
+            MF->DeleteMachineInstr((MachineInstr *)InstrPtr);
+          }
+        }
+        InstrIdxForVirtReg.clear();
+      }
+    }
+  }
+
+  return Changed;
+}
+
+/// Set up per-function state and run the combiner over every basic block.
+bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
+  // Use the per-function subtarget rather than the TargetMachine default.
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  TII = STI.getInstrInfo();
+  TRI = STI.getRegisterInfo();
+  SchedModel = STI.getSchedModel();
+  TSchedModel.init(SchedModel, &STI, TII);
+  MRI = &MF.getRegInfo();
+  Traces = &getAnalysis<MachineTraceMetrics>();
+  MinInstr = nullptr;
+
+  OptSize = MF.getFunction()->getAttributes().hasAttribute(
+      AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
+  if (!TII->useMachineCombiner()) {
+    DEBUG(dbgs() << "  Skipping pass: Target does not support machine combiner\n");
+    return false;
+  }
+
+  bool Changed = false;
+  // Try to combine instructions.
+  for (auto &MBB : MF)
+    Changed |= combineInstructions(&MBB);
+
+  return Changed;
+}

diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index 3119a35..cbd6272 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp

@@ -25,6 +25,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "codegen-cp"
@@ -335,8 +336,8 @@
 
   bool Changed = false;
 
-  TRI = MF.getTarget().getRegisterInfo();
-  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
   MRI = &MF.getRegInfo();
 
   for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)

diff --git a/lib/CodeGen/MachineDominanceFrontier.cpp b/lib/CodeGen/MachineDominanceFrontier.cpp
new file mode 100644
index 0000000..0bee846
--- /dev/null
+++ b/lib/CodeGen/MachineDominanceFrontier.cpp

@@ -0,0 +1,54 @@
+//===- MachineDominanceFrontier.cpp ---------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/Analysis/DominanceFrontierImpl.h"
+#include "llvm/CodeGen/Passes.h"
+
+
+using namespace llvm;
+
+namespace llvm {
+template class DominanceFrontierBase<MachineBasicBlock>;
+template class ForwardDominanceFrontierBase<MachineBasicBlock>;
+}
+
+
+char MachineDominanceFrontier::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MachineDominanceFrontier, "machine-domfrontier",
+                "Machine Dominance Frontier Construction", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(MachineDominanceFrontier, "machine-domfrontier",
+                "Machine Dominance Frontier Construction", true, true)
+
+MachineDominanceFrontier::MachineDominanceFrontier()
+  : MachineFunctionPass(ID),
+    Base() {
+  initializeMachineDominanceFrontierPass(*PassRegistry::getPassRegistry());
+}
+
+char &llvm::MachineDominanceFrontierID = MachineDominanceFrontier::ID;
+
+bool MachineDominanceFrontier::runOnMachineFunction(MachineFunction &) {
+  releaseMemory();
+  Base.analyze(getAnalysis<MachineDominatorTree>().getBase());
+  return false;
+}
+
+void MachineDominanceFrontier::releaseMemory() {
+  Base.releaseMemory();
+}
+
+void MachineDominanceFrontier::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}

diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp
index 04c8ecb..df60cf3 100644
--- a/lib/CodeGen/MachineDominators.cpp
+++ b/lib/CodeGen/MachineDominators.cpp

@@ -35,6 +35,8 @@
 }
 
 bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
+  CriticalEdgesToSplit.clear();
+  NewBBs.clear();
   DT->recalculate(F);
 
   return false;

diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 6138aef..8a2b610 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp

@@ -36,6 +36,7 @@
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "codegen"
@@ -52,17 +53,19 @@
 }
 
 MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
-                                 unsigned FunctionNum, MachineModuleInfo &mmi,
-                                 GCModuleInfo* gmi)
-  : Fn(F), Target(TM), Ctx(mmi.getContext()), MMI(mmi), GMI(gmi) {
-  if (TM.getRegisterInfo())
-    RegInfo = new (Allocator) MachineRegisterInfo(TM);
+                                 unsigned FunctionNum, MachineModuleInfo &mmi)
+    : Fn(F), Target(TM), STI(TM.getSubtargetImpl()), Ctx(mmi.getContext()),
+      MMI(mmi) {
+  if (STI->getRegisterInfo())
+    RegInfo = new (Allocator) MachineRegisterInfo(this);
   else
     RegInfo = nullptr;
 
   MFInfo = nullptr;
-  FrameInfo =
-    new (Allocator) MachineFrameInfo(TM,!F->hasFnAttribute("no-realign-stack"));
+  FrameInfo = new (Allocator)
+      MachineFrameInfo(STI->getFrameLowering()->getStackAlignment(),
+                       STI->getFrameLowering()->isStackRealignable(),
+                       !F->hasFnAttribute("no-realign-stack"));
 
   if (Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                        Attribute::StackAlignment))
@@ -70,13 +73,13 @@
                                 getStackAlignment(AttributeSet::FunctionIndex));
 
   ConstantPool = new (Allocator) MachineConstantPool(TM);
-  Alignment = TM.getTargetLowering()->getMinFunctionAlignment();
+  Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
 
   // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
   if (!Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                         Attribute::OptimizeForSize))
     Alignment = std::max(Alignment,
-                         TM.getTargetLowering()->getPrefFunctionAlignment());
+                         STI->getTargetLowering()->getPrefFunctionAlignment());
 
   FunctionNumber = FunctionNum;
   JumpTableInfo = nullptr;
@@ -229,10 +232,10 @@
 MachineMemOperand *
 MachineFunction::getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f,
                                       uint64_t s, unsigned base_alignment,
-                                      const MDNode *TBAAInfo,
+                                      const AAMDNodes &AAInfo,
                                       const MDNode *Ranges) {
   return new (Allocator) MachineMemOperand(PtrInfo, f, s, base_alignment,
-                                           TBAAInfo, Ranges);
+                                           AAInfo, Ranges);
 }
 
 MachineMemOperand *
@@ -243,12 +246,12 @@
                MachineMemOperand(MachinePointerInfo(MMO->getValue(),
                                                     MMO->getOffset()+Offset),
                                  MMO->getFlags(), Size,
-                                 MMO->getBaseAlignment(), nullptr);
+                                 MMO->getBaseAlignment());
   return new (Allocator)
              MachineMemOperand(MachinePointerInfo(MMO->getPseudoValue(),
                                                   MMO->getOffset()+Offset),
                                MMO->getFlags(), Size,
-                               MMO->getBaseAlignment(), nullptr);
+                               MMO->getBaseAlignment());
 }
 
 MachineInstr::mmo_iterator
@@ -279,7 +282,7 @@
           getMachineMemOperand((*I)->getPointerInfo(),
                                (*I)->getFlags() & ~MachineMemOperand::MOStore,
                                (*I)->getSize(), (*I)->getBaseAlignment(),
-                               (*I)->getTBAAInfo());
+                               (*I)->getAAInfo());
         Result[Index] = JustLoad;
       }
       ++Index;
@@ -311,7 +314,7 @@
           getMachineMemOperand((*I)->getPointerInfo(),
                                (*I)->getFlags() & ~MachineMemOperand::MOLoad,
                                (*I)->getSize(), (*I)->getBaseAlignment(),
-                               (*I)->getTBAAInfo());
+                               (*I)->getAAInfo());
         Result[Index] = JustStore;
       }
       ++Index;
@@ -350,7 +353,7 @@
   // Print Constant Pool
   ConstantPool->print(OS);
 
-  const TargetRegisterInfo *TRI = getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = getSubtarget().getRegisterInfo();
 
   if (RegInfo && !RegInfo->livein_empty()) {
     OS << "Function Live Ins: ";
@@ -459,7 +462,7 @@
 /// normal 'L' label is returned.
 MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
                                         bool isLinkerPrivate) const {
-  const DataLayout *DL = getTarget().getDataLayout();
+  const DataLayout *DL = getSubtarget().getDataLayout();
   assert(JumpTableInfo && "No jump tables");
   assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");
 
@@ -474,7 +477,7 @@
 /// getPICBaseSymbol - Return a function-local symbol to represent the PIC
 /// base.
 MCSymbol *MachineFunction::getPICBaseSymbol() const {
-  const DataLayout *DL = getTarget().getDataLayout();
+  const DataLayout *DL = getSubtarget().getDataLayout();
   return Ctx.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
                                Twine(getFunctionNumber())+"$pb");
 }
@@ -483,15 +486,11 @@
 //  MachineFrameInfo implementation
 //===----------------------------------------------------------------------===//
 
-const TargetFrameLowering *MachineFrameInfo::getFrameLowering() const {
-  return TM.getFrameLowering();
-}
-
 /// ensureMaxAlignment - Make sure the function is at least Align bytes
 /// aligned.
 void MachineFrameInfo::ensureMaxAlignment(unsigned Align) {
-  if (!getFrameLowering()->isStackRealignable() || !RealignOption)
-    assert(Align <= getFrameLowering()->getStackAlignment() &&
+  if (!StackRealignable || !RealignOption)
+    assert(Align <= StackAlignment &&
            "For targets without stack realignment, Align is out of limit!");
   if (MaxAlignment < Align) MaxAlignment = Align;
 }
@@ -513,11 +512,10 @@
 int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
                       bool isSS, const AllocaInst *Alloca) {
   assert(Size != 0 && "Cannot allocate zero size stack objects!");
-  Alignment =
-    clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
-                          !RealignOption,
-                        Alignment, getFrameLowering()->getStackAlignment());
-  Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca));
+  Alignment = clampStackAlignment(!StackRealignable || !RealignOption,
+                                  Alignment, StackAlignment);
+  Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca,
+                                !isSS));
   int Index = (int)Objects.size() - NumFixedObjects - 1;
   assert(Index >= 0 && "Bad frame index!");
   ensureMaxAlignment(Alignment);
@@ -530,9 +528,8 @@
 ///
 int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
                                              unsigned Alignment) {
-  Alignment = clampStackAlignment(
-      !getFrameLowering()->isStackRealignable() || !RealignOption, Alignment,
-      getFrameLowering()->getStackAlignment());
+  Alignment = clampStackAlignment(!StackRealignable || !RealignOption,
+                                  Alignment, StackAlignment);
   CreateStackObject(Size, Alignment, true);
   int Index = (int)Objects.size() - NumFixedObjects - 1;
   ensureMaxAlignment(Alignment);
@@ -547,10 +544,9 @@
 int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment,
                                                 const AllocaInst *Alloca) {
   HasVarSizedObjects = true;
-  Alignment = clampStackAlignment(
-      !getFrameLowering()->isStackRealignable() || !RealignOption, Alignment,
-      getFrameLowering()->getStackAlignment());
-  Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca));
+  Alignment = clampStackAlignment(!StackRealignable || !RealignOption,
+                                  Alignment, StackAlignment);
+  Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true));
   ensureMaxAlignment(Alignment);
   return (int)Objects.size()-NumFixedObjects-1;
 }
@@ -561,20 +557,18 @@
 /// index with a negative value.
 ///
 int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
-                                        bool Immutable) {
+                                        bool Immutable, bool isAliased) {
   assert(Size != 0 && "Cannot allocate zero size fixed stack objects!");
   // The alignment of the frame index can be determined from its offset from
   // the incoming frame position.  If the frame object is at offset 32 and
   // the stack is guaranteed to be 16-byte aligned, then we know that the
   // object is 16-byte aligned.
-  unsigned StackAlign = getFrameLowering()->getStackAlignment();
-  unsigned Align = MinAlign(SPOffset, StackAlign);
-  Align = clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
-                                  !RealignOption,
-                              Align, getFrameLowering()->getStackAlignment());
+  unsigned Align = MinAlign(SPOffset, StackAlignment);
+  Align = clampStackAlignment(!StackRealignable || !RealignOption, Align,
+                              StackAlignment);
   Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
                                               /*isSS*/   false,
-                                              /*Alloca*/ nullptr));
+                                              /*Alloca*/ nullptr, isAliased));
   return -++NumFixedObjects;
 }
 
@@ -582,15 +576,14 @@
 /// on the stack.  Returns an index with a negative value.
 int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
                                                   int64_t SPOffset) {
-  unsigned StackAlign = getFrameLowering()->getStackAlignment();
-  unsigned Align = MinAlign(SPOffset, StackAlign);
-  Align = clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
-                                  !RealignOption,
-                              Align, getFrameLowering()->getStackAlignment());
+  unsigned Align = MinAlign(SPOffset, StackAlignment);
+  Align = clampStackAlignment(!StackRealignable || !RealignOption, Align,
+                              StackAlignment);
   Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset,
                                               /*Immutable*/ true,
                                               /*isSS*/ true,
-                                              /*Alloca*/ nullptr));
+                                              /*Alloca*/ nullptr,
+                                              /*isAliased*/ false));
   return -++NumFixedObjects;
 }
 
@@ -600,7 +593,7 @@
   const MachineFunction *MF = MBB->getParent();
   assert(MF && "MBB must be part of a MachineFunction");
   const TargetMachine &TM = MF->getTarget();
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
   BitVector BV(TRI->getNumRegs());
 
   // Before CSI is calculated, no registers are considered pristine. They can be
@@ -625,8 +618,8 @@
 }
 
 unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
   unsigned MaxAlign = getMaxAlignment();
   int Offset = 0;
 
@@ -676,7 +669,7 @@
 void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
   if (Objects.empty()) return;
 
-  const TargetFrameLowering *FI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering();
   int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
 
   OS << "Frame Objects:\n";
@@ -820,7 +813,7 @@
 void MachineConstantPoolValue::anchor() { }
 
 const DataLayout *MachineConstantPool::getDataLayout() const {
-  return TM.getDataLayout();
+  return TM.getSubtargetImpl()->getDataLayout();
 }
 
 Type *MachineConstantPoolEntry::getType() const {
@@ -836,6 +829,37 @@
   return Val.ConstVal->getRelocationInfo();
 }
 
+SectionKind
+MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const {
+  SectionKind Kind;
+  switch (getRelocationInfo()) {
+  default:
+    llvm_unreachable("Unknown section kind");
+  case 2:
+    Kind = SectionKind::getReadOnlyWithRel();
+    break;
+  case 1:
+    Kind = SectionKind::getReadOnlyWithRelLocal();
+    break;
+  case 0:
+    switch (DL->getTypeAllocSize(getType())) {
+    case 4:
+      Kind = SectionKind::getMergeableConst4();
+      break;
+    case 8:
+      Kind = SectionKind::getMergeableConst8();
+      break;
+    case 16:
+      Kind = SectionKind::getMergeableConst16();
+      break;
+    default:
+      Kind = SectionKind::getMergeableConst();
+      break;
+    }
+  }
+  return Kind;
+}
+
 MachineConstantPool::~MachineConstantPool() {
   for (unsigned i = 0, e = Constants.size(); i != e; ++i)
     if (Constants[i].isMachineConstantPoolEntry())

diff --git a/lib/CodeGen/MachineFunctionAnalysis.cpp b/lib/CodeGen/MachineFunctionAnalysis.cpp
index 46cd60a..f6f34ba 100644
--- a/lib/CodeGen/MachineFunctionAnalysis.cpp
+++ b/lib/CodeGen/MachineFunctionAnalysis.cpp

@@ -46,8 +46,7 @@
 bool MachineFunctionAnalysis::runOnFunction(Function &F) {
   assert(!MF && "MachineFunctionAnalysis already initialized!");
   MF = new MachineFunction(&F, TM, NextFnNum++,
-                           getAnalysis<MachineModuleInfo>(),
-                           getAnalysisIfAvailable<GCModuleInfo>());
+                           getAnalysis<MachineModuleInfo>());
   return false;
 }
 

diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 5122165..7ad0d94 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp

@@ -39,6 +39,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -105,23 +106,41 @@
   IsDef = Val;
 }
 
+// If this operand is currently a register operand, and if this is in a
+// function, deregister the operand from the register's use/def list.
+void MachineOperand::removeRegFromUses() {
+  if (!isReg() || !isOnRegUseList())
+    return;
+
+  if (MachineInstr *MI = getParent()) {
+    if (MachineBasicBlock *MBB = MI->getParent()) {
+      if (MachineFunction *MF = MBB->getParent())
+        MF->getRegInfo().removeRegOperandFromUseList(this);
+    }
+  }
+}
+
 /// ChangeToImmediate - Replace this operand with a new immediate operand of
 /// the specified value.  If an operand is known to be an immediate already,
 /// the setImm method should be used.
 void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
   assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");
-  // If this operand is currently a register operand, and if this is in a
-  // function, deregister the operand from the register's use/def list.
-  if (isReg() && isOnRegUseList())
-    if (MachineInstr *MI = getParent())
-      if (MachineBasicBlock *MBB = MI->getParent())
-        if (MachineFunction *MF = MBB->getParent())
-          MF->getRegInfo().removeRegOperandFromUseList(this);
+
+  removeRegFromUses();
 
   OpKind = MO_Immediate;
   Contents.ImmVal = ImmVal;
 }
 
+void MachineOperand::ChangeToFPImmediate(const ConstantFP *FPImm) {
+  assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");
+
+  removeRegFromUses();
+
+  OpKind = MO_FPImmediate;
+  Contents.CFP = FPImm;
+}
+
 /// ChangeToRegister - Replace this operand with a new register operand of
 /// the specified value.  If an operand is known to be an register already,
 /// the setReg method should be used.
@@ -265,7 +284,8 @@
       if (const MachineBasicBlock *MBB = MI->getParent())
         if (const MachineFunction *MF = MBB->getParent())
           TM = &MF->getTarget();
-  const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : nullptr;
+  const TargetRegisterInfo *TRI =
+      TM ? TM->getSubtargetImpl()->getRegisterInfo() : nullptr;
 
   switch (getType()) {
   case MachineOperand::MO_Register:
@@ -429,11 +449,11 @@
 
 MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, unsigned f,
                                      uint64_t s, unsigned int a,
-                                     const MDNode *TBAAInfo,
+                                     const AAMDNodes &AAInfo,
                                      const MDNode *Ranges)
   : PtrInfo(ptrinfo), Size(s),
     Flags((f & ((1 << MOMaxBits) - 1)) | ((Log2_32(a) + 1) << MOMaxBits)),
-    TBAAInfo(TBAAInfo), Ranges(Ranges) {
+    AAInfo(AAInfo), Ranges(Ranges) {
   assert((PtrInfo.V.isNull() || PtrInfo.V.is<const PseudoSourceValue*>() ||
           isa<PointerType>(PtrInfo.V.get<const Value*>()->getType())) &&
          "invalid pointer value");
@@ -514,7 +534,7 @@
     OS << "(align=" << MMO.getAlignment() << ")";
 
   // Print TBAA info.
-  if (const MDNode *TBAAInfo = MMO.getTBAAInfo()) {
+  if (const MDNode *TBAAInfo = MMO.getAAInfo().TBAA) {
     OS << "(tbaa=";
     if (TBAAInfo->getNumOperands() > 0)
       TBAAInfo->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
@@ -523,6 +543,34 @@
     OS << ")";
   }
 
+  // Print AA scope info.
+  if (const MDNode *ScopeInfo = MMO.getAAInfo().Scope) {
+    OS << "(alias.scope=";
+    if (ScopeInfo->getNumOperands() > 0)
+      for (unsigned i = 0, ie = ScopeInfo->getNumOperands(); i != ie; ++i) {
+        ScopeInfo->getOperand(i)->printAsOperand(OS, /*PrintType=*/false);
+        if (i != ie-1)
+          OS << ",";
+      }
+    else
+      OS << "<unknown>";
+    OS << ")";
+  }
+
+  // Print AA noalias scope info.
+  if (const MDNode *NoAliasInfo = MMO.getAAInfo().NoAlias) {
+    OS << "(noalias=";
+    if (NoAliasInfo->getNumOperands() > 0)
+      for (unsigned i = 0, ie = NoAliasInfo->getNumOperands(); i != ie; ++i) {
+        NoAliasInfo->getOperand(i)->printAsOperand(OS, /*PrintType=*/false);
+        if (i != ie-1)
+          OS << ",";
+      }
+    else
+      OS << "<unknown>";
+    OS << ")";
+  }
+
   // Print nontemporal info.
   if (MMO.isNonTemporal())
     OS << "(nontemporal)";
@@ -865,6 +913,27 @@
   getParent()->erase(this);
 }
 
+void MachineInstr::eraseFromParentAndMarkDBGValuesForRemoval() {
+  assert(getParent() && "Not embedded in a basic block!");
+  MachineBasicBlock *MBB = getParent();
+  MachineFunction *MF = MBB->getParent();
+  assert(MF && "Not embedded in a function!");
+
+  MachineInstr *MI = (MachineInstr *)this;
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    MRI.markUsesInDebugValueAsUndef(Reg);
+  }
+  MI->eraseFromParent();
+}
+
 void MachineInstr::eraseFromBundle() {
   assert(getParent() && "Not embedded in a basic block!");
   getParent()->erase_instr(this);
@@ -1379,7 +1448,7 @@
       // If we have an AliasAnalysis, ask it whether the memory is constant.
       if (AA && AA->pointsToConstantMemory(
                       AliasAnalysis::Location(V, (*I)->getSize(),
-                                              (*I)->getTBAAInfo())))
+                                              (*I)->getAAInfo())))
         continue;
     }
 
@@ -1489,8 +1558,8 @@
     OS << " = ";
 
   // Print the opcode name.
-  if (TM && TM->getInstrInfo())
-    OS << TM->getInstrInfo()->getName(getOpcode());
+  if (TM && TM->getSubtargetImpl()->getInstrInfo())
+    OS << TM->getSubtargetImpl()->getInstrInfo()->getName(getOpcode());
   else
     OS << "UNKNOWN";
 
@@ -1538,17 +1607,17 @@
     // call instructions much less noisy on targets where calls clobber lots
     // of registers. Don't rely on MO.isDead() because we may be called before
     // LiveVariables is run, or we may be looking at a non-allocatable reg.
-    if (MF && isCall() &&
+    if (MRI && isCall() &&
         MO.isReg() && MO.isImplicit() && MO.isDef()) {
       unsigned Reg = MO.getReg();
       if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-        const MachineRegisterInfo &MRI = MF->getRegInfo();
-        if (MRI.use_empty(Reg)) {
+        if (MRI->use_empty(Reg)) {
           bool HasAliasLive = false;
-          for (MCRegAliasIterator AI(Reg, TM->getRegisterInfo(), true);
+          for (MCRegAliasIterator AI(
+                   Reg, TM->getSubtargetImpl()->getRegisterInfo(), true);
                AI.isValid(); ++AI) {
             unsigned AliasReg = *AI;
-            if (!MRI.use_empty(AliasReg)) {
+            if (!MRI->use_empty(AliasReg)) {
               HasAliasLive = true;
               break;
             }
@@ -1573,12 +1642,16 @@
     if (isDebugValue() && MO.isMetadata()) {
       // Pretty print DBG_VALUE instructions.
       const MDNode *MD = MO.getMetadata();
-      if (const MDString *MDS = dyn_cast<MDString>(MD->getOperand(2)))
-        OS << "!\"" << MDS->getString() << '\"';
+      DIDescriptor DI(MD);
+      DIVariable DIV(MD);
+
+      if (DI.isVariable() && !DIV.getName().empty())
+        OS << "!\"" << DIV.getName() << '\"';
       else
         MO.print(OS, TM);
     } else if (TM && (isInsertSubreg() || isRegSequence()) && MO.isImm()) {
-      OS << TM->getRegisterInfo()->getSubRegIndexName(MO.getImm());
+      OS << TM->getSubtargetImpl()->getRegisterInfo()->getSubRegIndexName(
+          MO.getImm());
     } else if (i == AsmDescOp && MO.isImm()) {
       // Pretty print the inline asm operand descriptor.
       OS << '$' << AsmOpCount++;
@@ -1595,9 +1668,12 @@
 
       unsigned RCID = 0;
       if (InlineAsm::hasRegClassConstraint(Flag, RCID)) {
-        if (TM)
-          OS << ':' << TM->getRegisterInfo()->getRegClass(RCID)->getName();
-        else
+        if (TM) {
+          const TargetRegisterInfo *TRI =
+            TM->getSubtargetImpl()->getRegisterInfo();
+          OS << ':'
+             << TRI->getRegClassName(TRI->getRegClass(RCID));
+        } else
           OS << ":RC" << RCID;
       }
 
@@ -1646,7 +1722,8 @@
     if (!HaveSemi) OS << ";"; HaveSemi = true;
     for (unsigned i = 0; i != VirtRegs.size(); ++i) {
       const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]);
-      OS << " " << RC->getName() << ':' << PrintReg(VirtRegs[i]);
+      OS << " " << MRI->getTargetRegisterInfo()->getRegClassName(RC)
+         << ':' << PrintReg(VirtRegs[i]);
       for (unsigned j = i+1; j != VirtRegs.size();) {
         if (MRI->getRegClass(VirtRegs[j]) != RC) {
           ++j;
@@ -1672,6 +1749,8 @@
         OS << " ]";
       }
     }
+    if (isIndirectDebugValue())
+      OS << " indirect";
   } else if (!debugLoc.isUnknown() && MF) {
     if (!HaveSemi) OS << ";";
     OS << " dbg:";

diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp
index 962169e..0690f08 100644
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp

@@ -16,6 +16,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 namespace {
@@ -103,12 +104,12 @@
   assert(FirstMI != LastMI && "Empty bundle?");
   MIBundleBuilder Bundle(MBB, FirstMI, LastMI);
 
-  const TargetMachine &TM = MBB.getParent()->getTarget();
-  const TargetInstrInfo *TII = TM.getInstrInfo();
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
-  MachineInstrBuilder MIB = BuildMI(*MBB.getParent(), FirstMI->getDebugLoc(),
-                                    TII->get(TargetOpcode::BUNDLE));
+  MachineInstrBuilder MIB =
+      BuildMI(MF, FirstMI->getDebugLoc(), TII->get(TargetOpcode::BUNDLE));
   Bundle.prepend(MIB);
 
   SmallVector<unsigned, 32> LocalDefs;
@@ -140,7 +141,7 @@
           // Internal def is now killed.
           KilledDefSet.insert(Reg);
       } else {
-        if (ExternUseSet.insert(Reg)) {
+        if (ExternUseSet.insert(Reg).second) {
           ExternUses.push_back(Reg);
           if (MO.isUndef())
             UndefUseSet.insert(Reg);
@@ -157,7 +158,7 @@
       if (!Reg)
         continue;
 
-      if (LocalDefSet.insert(Reg)) {
+      if (LocalDefSet.insert(Reg).second) {
         LocalDefs.push_back(Reg);
         if (MO.isDead()) {
           DeadDefSet.insert(Reg);
@@ -173,7 +174,7 @@
       if (!MO.isDead()) {
         for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
           unsigned SubReg = *SubRegs;
-          if (LocalDefSet.insert(SubReg))
+          if (LocalDefSet.insert(SubReg).second)
             LocalDefs.push_back(SubReg);
         }
       }
@@ -185,7 +186,7 @@
   SmallSet<unsigned, 32> Added;
   for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
     unsigned Reg = LocalDefs[i];
-    if (Added.insert(Reg)) {
+    if (Added.insert(Reg).second) {
       // If it's not live beyond end of the bundle, mark it dead.
       bool isDead = DeadDefSet.count(Reg) || KilledDefSet.count(Reg);
       MIB.addReg(Reg, getDefRegState(true) | getDeadRegState(isDead) |

diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 94cdab5..2ab0467 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp

@@ -39,6 +39,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "machine-licm"
@@ -61,7 +62,6 @@
 
 namespace {
   class MachineLICM : public MachineFunctionPass {
-    const TargetMachine   *TM;
     const TargetInstrInfo *TII;
     const TargetLoweringBase *TLI;
     const TargetRegisterInfo *TRI;
@@ -142,9 +142,6 @@
       RegPressure.clear();
       RegLimit.clear();
       BackTrace.clear();
-      for (DenseMap<unsigned,std::vector<const MachineInstr*> >::iterator
-             CI = CSEMap.begin(), CE = CSEMap.end(); CI != CE; ++CI)
-        CI->second.clear();
       CSEMap.clear();
     }
 
@@ -324,13 +321,12 @@
     return false;
 
   Changed = FirstInLoop = false;
-  TM = &MF.getTarget();
-  TII = TM->getInstrInfo();
-  TLI = TM->getTargetLowering();
-  TRI = TM->getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TLI = MF.getSubtarget().getTargetLowering();
+  TRI = MF.getSubtarget().getRegisterInfo();
   MFI = MF.getFrameInfo();
   MRI = &MF.getRegInfo();
-  InstrItins = TM->getInstrItineraryData();
+  InstrItins = MF.getSubtarget().getInstrItineraryData();
 
   PreRegAlloc = MRI->isSSA();
 
@@ -822,7 +818,7 @@
       if (!TargetRegisterInfo::isVirtualRegister(Reg))
         continue;
 
-      bool isNew = RegSeen.insert(Reg);
+      bool isNew = RegSeen.insert(Reg).second;
       unsigned RCId, RCCost;
       getRegisterClassIDAndCost(MI, Reg, i, RCId, RCCost);
       if (MO.isDef())
@@ -854,7 +850,7 @@
     if (!TargetRegisterInfo::isVirtualRegister(Reg))
       continue;
 
-    bool isNew = RegSeen.insert(Reg);
+    bool isNew = RegSeen.insert(Reg).second;
     if (MO.isDef())
       Defs.push_back(Reg);
     else if (!isNew && isOperandKill(MO, MRI)) {
@@ -1299,15 +1295,7 @@
   for (MachineBasicBlock::iterator I = BB->begin(),E = BB->end(); I != E; ++I) {
     const MachineInstr *MI = &*I;
     unsigned Opcode = MI->getOpcode();
-    DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator
-      CI = CSEMap.find(Opcode);
-    if (CI != CSEMap.end())
-      CI->second.push_back(MI);
-    else {
-      std::vector<const MachineInstr*> CSEMIs;
-      CSEMIs.push_back(MI);
-      CSEMap.insert(std::make_pair(Opcode, CSEMIs));
-    }
+    CSEMap[Opcode].push_back(MI);
   }
 }
 
@@ -1447,11 +1435,8 @@
     // Add to the CSE map.
     if (CI != CSEMap.end())
       CI->second.push_back(MI);
-    else {
-      std::vector<const MachineInstr*> CSEMIs;
-      CSEMIs.push_back(MI);
-      CSEMap.insert(std::make_pair(Opcode, CSEMIs));
-    }
+    else
+      CSEMap[Opcode].push_back(MI);
   }
 
   ++NumHoisted;

diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 4976e35..eb3c0bf 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp

@@ -270,7 +270,6 @@
 bool MachineModuleInfo::doInitialization(Module &M) {
 
   ObjFileMMI = nullptr;
-  CompactUnwindEncoding = 0;
   CurCallSite = 0;
   CallsEHReturn = 0;
   CallsUnwindInit = 0;
@@ -312,7 +311,6 @@
   FilterEnds.clear();
   CallsEHReturn = 0;
   CallsUnwindInit = 0;
-  CompactUnwindEncoding = 0;
   VariableDbgInfos.clear();
 }
 
@@ -429,7 +427,7 @@
 ///
 void MachineModuleInfo::
 addCatchTypeInfo(MachineBasicBlock *LandingPad,
-                 ArrayRef<const GlobalVariable *> TyInfo) {
+                 ArrayRef<const GlobalValue *> TyInfo) {
   LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
   for (unsigned N = TyInfo.size(); N; --N)
     LP.TypeIds.push_back(getTypeIDFor(TyInfo[N - 1]));
@@ -439,7 +437,7 @@
 ///
 void MachineModuleInfo::
 addFilterTypeInfo(MachineBasicBlock *LandingPad,
-                  ArrayRef<const GlobalVariable *> TyInfo) {
+                  ArrayRef<const GlobalValue *> TyInfo) {
   LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
   std::vector<unsigned> IdsInFilter(TyInfo.size());
   for (unsigned I = 0, E = TyInfo.size(); I != E; ++I)
@@ -508,7 +506,7 @@
 
 /// getTypeIDFor - Return the type id for the specified typeinfo.  This is
 /// function wide.
-unsigned MachineModuleInfo::getTypeIDFor(const GlobalVariable *TI) {
+unsigned MachineModuleInfo::getTypeIDFor(const GlobalValue *TI) {
   for (unsigned i = 0, N = TypeInfos.size(); i != N; ++i)
     if (TypeInfos[i] == TI) return i + 1;
 

diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp
new file mode 100644
index 0000000..5a5035e
--- /dev/null
+++ b/lib/CodeGen/MachineRegionInfo.cpp

@@ -0,0 +1,140 @@
+
+#include "llvm/CodeGen/MachineRegionInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/RegionInfoImpl.h"
+
+#define DEBUG_TYPE "region"
+
+using namespace llvm;
+
+STATISTIC(numMachineRegions,       "The # of machine regions");
+STATISTIC(numMachineSimpleRegions, "The # of simple machine regions");
+
+namespace llvm {
+template class RegionBase<RegionTraits<MachineFunction>>;
+template class RegionNodeBase<RegionTraits<MachineFunction>>;
+template class RegionInfoBase<RegionTraits<MachineFunction>>;
+}
+
+//===----------------------------------------------------------------------===//
+// MachineRegion implementation
+//
+
+MachineRegion::MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit,
+                             MachineRegionInfo* RI,
+                             MachineDominatorTree *DT, MachineRegion *Parent) :
+  RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {
+
+}
+
+MachineRegion::~MachineRegion() { }
+
+//===----------------------------------------------------------------------===//
+// MachineRegionInfo implementation
+//
+
+MachineRegionInfo::MachineRegionInfo() :
+  RegionInfoBase<RegionTraits<MachineFunction>>() {
+
+}
+
+MachineRegionInfo::~MachineRegionInfo() {
+
+}
+
+void MachineRegionInfo::updateStatistics(MachineRegion *R) {
+  ++numMachineRegions;
+
+  // TODO: Slow. Should only be enabled if -stats is used.
+  if (R->isSimple())
+    ++numMachineSimpleRegions;
+}
+
+void MachineRegionInfo::recalculate(MachineFunction &F,
+                                    MachineDominatorTree *DT_,
+                                    MachinePostDominatorTree *PDT_,
+                                    MachineDominanceFrontier *DF_) {
+  DT = DT_;
+  PDT = PDT_;
+  DF = DF_;
+
+  MachineBasicBlock *Entry = GraphTraits<MachineFunction*>::getEntryNode(&F);
+
+  TopLevelRegion = new MachineRegion(Entry, nullptr, this, DT, nullptr);
+  updateStatistics(TopLevelRegion);
+  calculate(F);
+}
+
+//===----------------------------------------------------------------------===//
+// MachineRegionInfoPass implementation
+//
+
+MachineRegionInfoPass::MachineRegionInfoPass() : MachineFunctionPass(ID) {
+  initializeMachineRegionInfoPassPass(*PassRegistry::getPassRegistry());
+}
+
+MachineRegionInfoPass::~MachineRegionInfoPass() {
+
+}
+
+bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
+  releaseMemory();
+
+  auto DT = &getAnalysis<MachineDominatorTree>();
+  auto PDT = &getAnalysis<MachinePostDominatorTree>();
+  auto DF = &getAnalysis<MachineDominanceFrontier>();
+
+  RI.recalculate(F, DT, PDT, DF);
+  return false;
+}
+
+void MachineRegionInfoPass::releaseMemory() {
+  RI.releaseMemory();
+}
+
+void MachineRegionInfoPass::verifyAnalysis() const {
+  // Only do verification when the user wants to, otherwise this expensive
+  // check will be invoked by PMDataManager::verifyPreservedAnalysis whenever
+  // a region pass (marked PreservedAll) finishes.
+  if (MachineRegionInfo::VerifyRegionInfo)
+    RI.verifyAnalysis();
+}
+
+void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll(); // Requirements below mirror runOnMachineFunction().
+  AU.addRequiredTransitive<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineDominanceFrontier>();
+}
+
+void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const {
+  RI.print(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MachineRegionInfoPass::dump() const {
+  RI.dump();
+}
+#endif
+
+char MachineRegionInfoPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions",
+                "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(MachineRegionInfoPass, "regions",
+                "Detect single entry single exit regions", true, true)
+
+// Create methods available outside of this file, to use them in
+// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
+// the link time optimization.
+
+namespace llvm {
+  FunctionPass *createMachineRegionInfoPass() {
+    return new MachineRegionInfoPass();
+  }
+}
+

diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index f560259..e9612f3 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp

@@ -16,28 +16,22 @@
 #include "llvm/Support/raw_os_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
 // Pin the vtable to this file.
 void MachineRegisterInfo::Delegate::anchor() {}
 
-MachineRegisterInfo::MachineRegisterInfo(const TargetMachine &TM)
-  : TM(TM), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true) {
+MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF)
+  : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true) {
   VRegInfo.reserve(256);
   RegAllocHints.reserve(256);
   UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits());
   UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs());
 
   // Create the physreg use/def lists.
-  PhysRegUseDefLists =
-    new MachineOperand*[getTargetRegisterInfo()->getNumRegs()];
-  memset(PhysRegUseDefLists, 0,
-         sizeof(MachineOperand*)*getTargetRegisterInfo()->getNumRegs());
-}
-
-MachineRegisterInfo::~MachineRegisterInfo() {
-  delete [] PhysRegUseDefLists;
+  PhysRegUseDefLists.resize(getTargetRegisterInfo()->getNumRegs(), nullptr);
 }
 
 /// setRegClass - Set the register class of the specified virtual register.
@@ -67,7 +61,7 @@
 
 bool
 MachineRegisterInfo::recomputeRegClass(unsigned Reg, const TargetMachine &TM) {
-  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
   const TargetRegisterClass *OldRC = getRegClass(Reg);
   const TargetRegisterClass *NewRC =
     getTargetRegisterInfo()->getLargestLegalSuperClass(OldRC);
@@ -283,18 +277,25 @@
 /// replaceRegWith - Replace all instances of FromReg with ToReg in the
 /// machine function.  This is like llvm-level X->replaceAllUsesWith(Y),
 /// except that it also changes any definitions of the register as well.
+/// If ToReg is a physical register we apply the sub register to obtain the
+/// final/proper physical register.
 void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) {
   assert(FromReg != ToReg && "Cannot replace a reg with itself");
 
+  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+  
   // TODO: This could be more efficient by bulk changing the operands.
   for (reg_iterator I = reg_begin(FromReg), E = reg_end(); I != E; ) {
     MachineOperand &O = *I;
     ++I;
-    O.setReg(ToReg);
+    if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
+      O.substPhysReg(ToReg, *TRI);
+    } else {
+      O.setReg(ToReg);
+    }
   }
 }
 
-
 /// getVRegDef - Return the machine instr that defines the specified virtual
 /// register or null if none is found.  This assumes that the code is in SSA
 /// form, so there should only be one definition.

diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
index d9173a2..71a6eba 100644
--- a/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/lib/CodeGen/MachineSSAUpdater.cpp

@@ -24,8 +24,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
 using namespace llvm;
 
@@ -39,7 +39,7 @@
 MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF,
                                      SmallVectorImpl<MachineInstr*> *NewPHI)
   : AV(nullptr), InsertedPHIs(NewPHI) {
-  TII = MF.getTarget().getInstrInfo();
+  TII = MF.getSubtarget().getInstrInfo();
   MRI = &MF.getRegInfo();
 }
 

diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 44191f7..261942f 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp

@@ -40,6 +40,9 @@
                            cl::desc("Force top-down list scheduling"));
 cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
                             cl::desc("Force bottom-up list scheduling"));
+cl::opt<bool>
+DumpCriticalPathLength("misched-dcpl", cl::Hidden,
+                       cl::desc("Print critical path length to stderr"));
 }
 
 #ifndef NDEBUG
@@ -378,7 +381,7 @@
 
 /// Main driver for both MachineScheduler and PostMachineScheduler.
 void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   bool IsPostRA = Scheduler.isPostRA();
 
   // Visit all machine basic blocks.
@@ -451,6 +454,11 @@
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs
             << " Remaining: " << RemainingInstrs << "\n");
+      if (DumpCriticalPathLength) {
+        errs() << MF->getName();
+        errs() << ":BB# " << MBB->getNumber();
+        errs() << " " << MBB->getName() << " \n";
+      }
 
       // Schedule a region: possibly reorder instructions.
       // This invalidates 'RegionEnd' and 'I'.
@@ -2355,14 +2363,15 @@
   // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
   // are disabled, then these HazardRecs will be disabled.
   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
-  const TargetMachine &TM = DAG->MF.getTarget();
   if (!Top.HazardRec) {
     Top.HazardRec =
-      TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+        DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+            Itin, DAG);
   }
   if (!Bot.HazardRec) {
     Bot.HazardRec =
-      TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+        DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+            Itin, DAG);
   }
 }
 
@@ -2370,8 +2379,8 @@
 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
                                   MachineBasicBlock::iterator End,
                                   unsigned NumRegionInstrs) {
-  const TargetMachine &TM = Context->MF->getTarget();
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const MachineFunction &MF = *Begin->getParent()->getParent();
+  const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
 
   // Avoid setting up the register pressure tracker for small regions to save
   // compile time. As a rough heuristic, only track pressure when the number of
@@ -2391,8 +2400,8 @@
   RegionPolicy.OnlyBottomUp = true;
 
   // Allow the subtarget to override default policy.
-  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
-  ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs);
+  MF.getSubtarget().overrideSchedPolicy(RegionPolicy, Begin, End,
+                                        NumRegionInstrs);
 
   // After subtarget overrides, apply command line options.
   if (!EnableRegPressure)
@@ -2460,7 +2469,10 @@
     if ((*I)->getDepth() > Rem.CriticalPath)
       Rem.CriticalPath = (*I)->getDepth();
   }
-  DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+  DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
+  if (DumpCriticalPathLength) {
+    errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
+  }
 
   if (EnableCyclicPath) {
     Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
@@ -2482,8 +2494,8 @@
   }
   // If one candidate decreases and the other increases, go with it.
   // Invalid candidates have UnitInc==0.
-  if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
-              Reason)) {
+  if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
+                 Reason)) {
     return true;
   }
   // If the candidates are decreasing pressure, reverse priority.
@@ -2885,10 +2897,10 @@
   // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
   // or are disabled, then these HazardRecs will be disabled.
   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
-  const TargetMachine &TM = DAG->MF.getTarget();
   if (!Top.HazardRec) {
     Top.HazardRec =
-      TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+        DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+            Itin, DAG);
   }
 }
 
@@ -2902,7 +2914,10 @@
     if ((*I)->getDepth() > Rem.CriticalPath)
       Rem.CriticalPath = (*I)->getDepth();
   }
-  DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+  DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
+  if (DumpCriticalPathLength) {
+    errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n";
+  }
 }
 
 /// Apply a set of heursitics to a new candidate for PostRA scheduling.

diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 0ae495c..ba25bca 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp

@@ -17,18 +17,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "machine-sink"
@@ -38,6 +41,12 @@
            cl::desc("Split critical edges during machine sinking"),
            cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+UseBlockFreqInfo("machine-sink-bfi",
+           cl::desc("Use block frequency info to find successors to sink"),
+           cl::init(true), cl::Hidden);
+
+
 STATISTIC(NumSunk,      "Number of machine instructions sunk");
 STATISTIC(NumSplit,     "Number of critical edges split");
 STATISTIC(NumCoalesces, "Number of copies coalesced");
@@ -46,14 +55,20 @@
   class MachineSinking : public MachineFunctionPass {
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
-    MachineRegisterInfo  *MRI;  // Machine register information
-    MachineDominatorTree *DT;   // Machine dominator tree
+    MachineRegisterInfo  *MRI;     // Machine register information
+    MachineDominatorTree *DT;      // Machine dominator tree
+    MachinePostDominatorTree *PDT; // Machine post dominator tree
     MachineLoopInfo *LI;
+    const MachineBlockFrequencyInfo *MBFI;
     AliasAnalysis *AA;
 
     // Remember which edges have been considered for breaking.
     SmallSet<std::pair<MachineBasicBlock*,MachineBasicBlock*>, 8>
     CEBCandidates;
+    // Remember which edges we are about to split.
+    // This is different from CEBCandidates since those edges
+    // will be split.
+    SetVector<std::pair<MachineBasicBlock*,MachineBasicBlock*> > ToSplit;
 
   public:
     static char ID; // Pass identification
@@ -68,9 +83,13 @@
       MachineFunctionPass::getAnalysisUsage(AU);
       AU.addRequired<AliasAnalysis>();
       AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachinePostDominatorTree>();
       AU.addRequired<MachineLoopInfo>();
       AU.addPreserved<MachineDominatorTree>();
+      AU.addPreserved<MachinePostDominatorTree>();
       AU.addPreserved<MachineLoopInfo>();
+      if (UseBlockFreqInfo)
+        AU.addRequired<MachineBlockFrequencyInfo>();
     }
 
     void releaseMemory() override {
@@ -82,10 +101,22 @@
     bool isWorthBreakingCriticalEdge(MachineInstr *MI,
                                      MachineBasicBlock *From,
                                      MachineBasicBlock *To);
-    MachineBasicBlock *SplitCriticalEdge(MachineInstr *MI,
-                                         MachineBasicBlock *From,
-                                         MachineBasicBlock *To,
-                                         bool BreakPHIEdge);
+    /// \brief Postpone the splitting of the given critical
+    /// edge (\p From, \p To).
+    ///
+    /// We do not split the edges on the fly. Indeed, this invalidates
+    /// the dominance information and thus triggers a lot of updates
+    /// of that information underneath.
+    /// Instead, we postpone all the splits after each iteration of
+    /// the main loop. That way, the information is at least valid
+    /// for the lifetime of an iteration.
+    ///
+    /// \return True if the edge is marked as toSplit, false otherwise.
+    /// False can be returned if, for instance, this is not profitable.
+    bool PostponeSplitCriticalEdge(MachineInstr *MI,
+                                   MachineBasicBlock *From,
+                                   MachineBasicBlock *To,
+                                   bool BreakPHIEdge);
     bool SinkInstruction(MachineInstr *MI, bool &SawStore);
     bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB,
                                  MachineBasicBlock *DefMBB,
@@ -135,6 +166,11 @@
   DEBUG(dbgs() << "*** to: " << *MI);
   MRI->replaceRegWith(DstReg, SrcReg);
   MI->eraseFromParent();
+
+  // Conservatively, clear any kill flags, since it's possible that they are no
+  // longer correct.
+  MRI->clearKillFlags(SrcReg);
+
   ++NumCoalesces;
   return true;
 }
@@ -213,12 +249,13 @@
 
   DEBUG(dbgs() << "******** Machine Sinking ********\n");
 
-  const TargetMachine &TM = MF.getTarget();
-  TII = TM.getInstrInfo();
-  TRI = TM.getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   DT = &getAnalysis<MachineDominatorTree>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
   LI = &getAnalysis<MachineLoopInfo>();
+  MBFI = UseBlockFreqInfo ? &getAnalysis<MachineBlockFrequencyInfo>() : nullptr;
   AA = &getAnalysis<AliasAnalysis>();
 
   bool EverMadeChange = false;
@@ -228,10 +265,24 @@
 
     // Process all basic blocks.
     CEBCandidates.clear();
+    ToSplit.clear();
     for (MachineFunction::iterator I = MF.begin(), E = MF.end();
          I != E; ++I)
       MadeChange |= ProcessBlock(*I);
 
+    // If we have anything we marked as toSplit, split it now.
+    for (auto &Pair : ToSplit) {
+      auto NewSucc = Pair.first->SplitCriticalEdge(Pair.second, this);
+      if (NewSucc != nullptr) {
+        DEBUG(dbgs() << " *** Splitting critical edge:"
+              " BB#" << Pair.first->getNumber()
+              << " -- BB#" << NewSucc->getNumber()
+              << " -- BB#" << Pair.second->getNumber() << '\n');
+        MadeChange = true;
+        ++NumSplit;
+      } else
+        DEBUG(dbgs() << " *** Not legal to break critical edge\n");
+    }
     // If this iteration over the code changed anything, keep iterating.
     if (!MadeChange) break;
     EverMadeChange = true;
@@ -289,7 +340,7 @@
   // If the pass has already considered breaking this edge (during this pass
   // through the function), then let's go ahead and break it. This means
   // sinking multiple "cheap" instructions into the same block.
-  if (!CEBCandidates.insert(std::make_pair(From, To)))
+  if (!CEBCandidates.insert(std::make_pair(From, To)).second)
     return true;
 
   if (!MI->isCopy() && !TII->isAsCheapAsAMove(MI))
@@ -328,21 +379,21 @@
   return false;
 }
 
-MachineBasicBlock *MachineSinking::SplitCriticalEdge(MachineInstr *MI,
-                                                     MachineBasicBlock *FromBB,
-                                                     MachineBasicBlock *ToBB,
-                                                     bool BreakPHIEdge) {
+bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr *MI,
+                                               MachineBasicBlock *FromBB,
+                                               MachineBasicBlock *ToBB,
+                                               bool BreakPHIEdge) {
   if (!isWorthBreakingCriticalEdge(MI, FromBB, ToBB))
-    return nullptr;
+    return false;
 
   // Avoid breaking back edge. From == To means backedge for single BB loop.
   if (!SplitEdges || FromBB == ToBB)
-    return nullptr;
+    return false;
 
   // Check for backedges of more "complex" loops.
   if (LI->getLoopFor(FromBB) == LI->getLoopFor(ToBB) &&
       LI->isLoopHeader(ToBB))
-    return nullptr;
+    return false;
 
   // It's not always legal to break critical edges and sink the computation
   // to the edge.
@@ -389,11 +440,13 @@
       if (*PI == FromBB)
         continue;
       if (!DT->dominates(ToBB, *PI))
-        return nullptr;
+        return false;
     }
   }
 
-  return FromBB->SplitCriticalEdge(ToBB, this);
+  ToSplit.insert(std::make_pair(FromBB, ToBB));
+  
+  return true;
 }
 
 static bool AvoidsSinking(MachineInstr *MI, MachineRegisterInfo *MRI) {
@@ -419,23 +472,6 @@
   }
 }
 
-/// isPostDominatedBy - Return true if A is post dominated by B.
-static bool isPostDominatedBy(MachineBasicBlock *A, MachineBasicBlock *B) {
-
-  // FIXME - Use real post dominator.
-  if (A->succ_size() != 2)
-    return false;
-  MachineBasicBlock::succ_iterator I = A->succ_begin();
-  if (B == *I)
-    ++I;
-  MachineBasicBlock *OtherSuccBlock = *I;
-  if (OtherSuccBlock->succ_size() != 1 ||
-      *(OtherSuccBlock->succ_begin()) != B)
-    return false;
-
-  return true;
-}
-
 /// isProfitableToSinkTo - Return true if it is profitable to sink MI.
 bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr *MI,
                                           MachineBasicBlock *MBB,
@@ -447,8 +483,13 @@
     return false;
 
   // It is profitable if SuccToSinkTo does not post dominate current block.
-  if (!isPostDominatedBy(MBB, SuccToSinkTo))
-      return true;
+  if (!PDT->dominates(SuccToSinkTo, MBB))
+    return true;
+
+  // It is profitable to sink an instruction from a deeper loop to a shallower
+  // loop, even if the latter post-dominates the former (PR21115).
+  if (LI->getLoopDepth(MBB) > LI->getLoopDepth(SuccToSinkTo))
+    return true;
 
   // Check if only use in post dominated block is PHI instruction.
   bool NonPHIUse = false;
@@ -539,14 +580,20 @@
       }
 
       // Otherwise, we should look at all the successors and decide which one
-      // we should sink to.
-      // We give successors with smaller loop depth higher priority.
-      SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(), MBB->succ_end());
-      // Sort Successors according to their loop depth.
+      // we should sink to. If we have reliable block frequency information
+      // (frequency != 0) available, give successors with smaller frequencies
+      // higher priority, otherwise prioritize smaller loop depths.
+      SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(),
+                                               MBB->succ_end());
+      // Sort Successors according to their loop depth or block frequency info.
       std::stable_sort(
           Succs.begin(), Succs.end(),
-          [this](const MachineBasicBlock *LHS, const MachineBasicBlock *RHS) {
-            return LI->getLoopDepth(LHS) < LI->getLoopDepth(RHS);
+          [this](const MachineBasicBlock *L, const MachineBasicBlock *R) {
+            uint64_t LHSFreq = MBFI ? MBFI->getBlockFreq(L).getFrequency() : 0;
+            uint64_t RHSFreq = MBFI ? MBFI->getBlockFreq(R).getFrequency() : 0;
+            bool HasBlockFreq = LHSFreq != 0 && RHSFreq != 0;
+            return HasBlockFreq ? LHSFreq < RHSFreq
+                                : LI->getLoopDepth(L) < LI->getLoopDepth(R);
           });
       for (SmallVectorImpl<MachineBasicBlock *>::iterator SI = Succs.begin(),
              E = Succs.end(); SI != E; ++SI) {
@@ -655,21 +702,16 @@
     if (!TryBreak)
       DEBUG(dbgs() << "Sinking along critical edge.\n");
     else {
-      MachineBasicBlock *NewSucc =
-        SplitCriticalEdge(MI, ParentBlock, SuccToSinkTo, BreakPHIEdge);
-      if (!NewSucc) {
+      // Mark this edge as to be split.
+      // If the edge can actually be split, the next iteration of the main loop
+      // will sink MI in the newly created block.
+      bool Status =
+        PostponeSplitCriticalEdge(MI, ParentBlock, SuccToSinkTo, BreakPHIEdge);
+      if (!Status)
         DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
-                        "break critical edge\n");
-        return false;
-      } else {
-        DEBUG(dbgs() << " *** Splitting critical edge:"
-              " BB#" << ParentBlock->getNumber()
-              << " -- BB#" << NewSucc->getNumber()
-              << " -- BB#" << SuccToSinkTo->getNumber() << '\n');
-        SuccToSinkTo = NewSucc;
-        ++NumSplit;
-        BreakPHIEdge = false;
-      }
+              "break critical edge\n");
+      // The instruction will not be sunk this time.
+      return false;
     }
   }
 
@@ -677,20 +719,13 @@
     // BreakPHIEdge is true if all the uses are in the successor MBB being
     // sunken into and they are all PHI nodes. In this case, machine-sink must
     // break the critical edge first.
-    MachineBasicBlock *NewSucc = SplitCriticalEdge(MI, ParentBlock,
-                                                   SuccToSinkTo, BreakPHIEdge);
-    if (!NewSucc) {
+    bool Status = PostponeSplitCriticalEdge(MI, ParentBlock,
+                                            SuccToSinkTo, BreakPHIEdge);
+    if (!Status)
       DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
             "break critical edge\n");
-      return false;
-    }
-
-    DEBUG(dbgs() << " *** Splitting critical edge:"
-          " BB#" << ParentBlock->getNumber()
-          << " -- BB#" << NewSucc->getNumber()
-          << " -- BB#" << SuccToSinkTo->getNumber() << '\n');
-    SuccToSinkTo = NewSucc;
-    ++NumSplit;
+    // The instruction will not be sunk this time.
+    return false;
   }
 
   // Determine where to insert into. Skip phi nodes.

diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index 1bbf0ad..2cf87eb 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp

@@ -52,13 +52,13 @@
 
 bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
-  TII = MF->getTarget().getInstrInfo();
-  TRI = MF->getTarget().getRegisterInfo();
+  TII = MF->getSubtarget().getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
   MRI = &MF->getRegInfo();
   Loops = &getAnalysis<MachineLoopInfo>();
   const TargetSubtargetInfo &ST =
     MF->getTarget().getSubtarget<TargetSubtargetInfo>();
-  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+  SchedModel.init(ST.getSchedModel(), &ST, TII);
   BlockInfo.resize(MF->getNumBlockIDs());
   ProcResourceCycles.resize(MF->getNumBlockIDs() *
                             SchedModel.getNumProcResourceKinds());
@@ -135,8 +135,7 @@
          "getResources() must be called before getProcResourceCycles()");
   unsigned PRKinds = SchedModel.getNumProcResourceKinds();
   assert((MBBNum+1) * PRKinds <= ProcResourceCycles.size());
-  return ArrayRef<unsigned>(ProcResourceCycles.data() + MBBNum * PRKinds,
-                            PRKinds);
+  return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds);
 }
 
 
@@ -256,8 +255,7 @@
 getProcResourceDepths(unsigned MBBNum) const {
   unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
   assert((MBBNum+1) * PRKinds <= ProcResourceDepths.size());
-  return ArrayRef<unsigned>(ProcResourceDepths.data() + MBBNum * PRKinds,
-                            PRKinds);
+  return makeArrayRef(ProcResourceDepths.data() + MBBNum * PRKinds, PRKinds);
 }
 
 /// Get an array of processor resource heights for MBB. Indexed by processor
@@ -270,8 +268,7 @@
 getProcResourceHeights(unsigned MBBNum) const {
   unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
   assert((MBBNum+1) * PRKinds <= ProcResourceHeights.size());
-  return ArrayRef<unsigned>(ProcResourceHeights.data() + MBBNum * PRKinds,
-                            PRKinds);
+  return makeArrayRef(ProcResourceHeights.data() + MBBNum * PRKinds, PRKinds);
 }
 
 //===----------------------------------------------------------------------===//
@@ -452,7 +449,7 @@
     }
     // To is a new block. Mark the block as visited in case the CFG has cycles
     // that MachineLoopInfo didn't recognize as a natural loop.
-    return LB.Visited.insert(To);
+    return LB.Visited.insert(To).second;
   }
 };
 }
@@ -1169,6 +1166,7 @@
   return DepCycle;
 }
 
+/// When bottom is set include instructions in current block in estimate.
 unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const {
   // Find the limiting processor resource.
   // Numbers have been pre-scaled to be comparable.
@@ -1185,7 +1183,9 @@
   // Convert to cycle count.
   PRMax = TE.MTM.getCycles(PRMax);
 
+  /// All instructions before current block
   unsigned Instrs = TBI.InstrDepth;
+  // plus instructions in current block
   if (Bottom)
     Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount;
   if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
@@ -1194,44 +1194,72 @@
   return std::max(Instrs, PRMax);
 }
 
-
-unsigned MachineTraceMetrics::Trace::
-getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks,
-                  ArrayRef<const MCSchedClassDesc*> ExtraInstrs) const {
+unsigned MachineTraceMetrics::Trace::getResourceLength(
+    ArrayRef<const MachineBasicBlock *> Extrablocks,
+    ArrayRef<const MCSchedClassDesc *> ExtraInstrs,
+    ArrayRef<const MCSchedClassDesc *> RemoveInstrs) const {
   // Add up resources above and below the center block.
   ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());
   ArrayRef<unsigned> PRHeights = TE.getProcResourceHeights(getBlockNum());
   unsigned PRMax = 0;
+
+  // Sum the scaled cycles that Instrs consume on resource ResourceIdx.
+  auto extraCycles = [this](ArrayRef<const MCSchedClassDesc *> Instrs,
+                            unsigned ResourceIdx)
+                         ->unsigned {
+    unsigned Cycles = 0;
+    for (unsigned I = 0; I != Instrs.size(); ++I) {
+      const MCSchedClassDesc *SC = Instrs[I];
+      if (!SC->isValid())
+        continue;
+      for (TargetSchedModel::ProcResIter
+               PI = TE.MTM.SchedModel.getWriteProcResBegin(SC),
+               PE = TE.MTM.SchedModel.getWriteProcResEnd(SC);
+           PI != PE; ++PI) {
+        if (PI->ProcResourceIdx != ResourceIdx)
+          continue;
+        Cycles +=
+            (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(ResourceIdx));
+      }
+    }
+    return Cycles;
+  };
+
   for (unsigned K = 0; K != PRDepths.size(); ++K) {
     unsigned PRCycles = PRDepths[K] + PRHeights[K];
     for (unsigned I = 0; I != Extrablocks.size(); ++I)
       PRCycles += TE.MTM.getProcResourceCycles(Extrablocks[I]->getNumber())[K];
-    for (unsigned I = 0; I != ExtraInstrs.size(); ++I) {
-      const MCSchedClassDesc* SC = ExtraInstrs[I];
-      if (!SC->isValid())
-        continue;
-      for (TargetSchedModel::ProcResIter
-             PI = TE.MTM.SchedModel.getWriteProcResBegin(SC),
-             PE = TE.MTM.SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
-        if (PI->ProcResourceIdx != K)
-          continue;
-        PRCycles += (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(K));
-      }
-    }
+    PRCycles += extraCycles(ExtraInstrs, K);
+    PRCycles -= extraCycles(RemoveInstrs, K);
     PRMax = std::max(PRMax, PRCycles);
   }
   // Convert to cycle count.
   PRMax = TE.MTM.getCycles(PRMax);
 
+  // Instrs: #instructions in current trace outside current block.
   unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight;
+  // Add instruction count from the extra blocks.
   for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i)
     Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount;
+  Instrs += ExtraInstrs.size();
+  Instrs -= RemoveInstrs.size();
   if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
     Instrs /= IW;
   // Assume issue width 1 without a schedule model.
   return std::max(Instrs, PRMax);
 }
 
+bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr *DefMI,
+                                              const MachineInstr *UseMI) const {
+  if (DefMI->getParent() == UseMI->getParent())
+    return true;
+
+  const TraceBlockInfo &DepTBI = TE.BlockInfo[DefMI->getParent()->getNumber()];
+  const TraceBlockInfo &TBI = TE.BlockInfo[UseMI->getParent()->getNumber()];
+
+  return DepTBI.isUsefulDominator(TBI);
+}
+
 void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const {
   OS << getName() << " ensemble:\n";
   for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) {

diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 8515b0f..99f0583 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp

@@ -46,6 +46,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 namespace {
@@ -214,9 +215,9 @@
     void report(const char *msg, const MachineBasicBlock *MBB,
                 const LiveInterval &LI);
     void report(const char *msg, const MachineFunction *MF,
-                const LiveRange &LR);
+                const LiveRange &LR, unsigned Reg);
     void report(const char *msg, const MachineBasicBlock *MBB,
-                const LiveRange &LR);
+                const LiveRange &LR, unsigned Reg);
 
     void verifyInlineAsm(const MachineInstr *MI);
 
@@ -275,11 +276,12 @@
 bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
   raw_ostream *OutFile = nullptr;
   if (OutFileName) {
-    std::string ErrorInfo;
-    OutFile = new raw_fd_ostream(OutFileName, ErrorInfo,
+    std::error_code EC;
+    OutFile = new raw_fd_ostream(OutFileName, EC,
                                  sys::fs::F_Append | sys::fs::F_Text);
-    if (!ErrorInfo.empty()) {
-      errs() << "Error opening '" << OutFileName << "': " << ErrorInfo << '\n';
+    if (EC) {
+      errs() << "Error opening '" << OutFileName << "': " << EC.message()
+             << '\n';
       exit(1);
     }
 
@@ -292,8 +294,8 @@
 
   this->MF = &MF;
   TM = &MF.getTarget();
-  TII = TM->getInstrInfo();
-  TRI = TM->getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
 
   LiveVars = nullptr;
@@ -430,15 +432,17 @@
 }
 
 void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
-                             const LiveRange &LR) {
+                             const LiveRange &LR, unsigned Reg) {
   report(msg, MBB);
-  *OS << "- liverange:    " << LR << "\n";
+  *OS << "- liverange:   " << LR << '\n';
+  *OS << "- register:    " << PrintReg(Reg, TRI) << '\n';
 }
 
 void MachineVerifier::report(const char *msg, const MachineFunction *MF,
-                             const LiveRange &LR) {
+                             const LiveRange &LR, unsigned Reg) {
   report(msg, MF);
-  *OS << "- liverange:    " << LR << "\n";
+  *OS << "- liverange:   " << LR << '\n';
+  *OS << "- register:    " << PrintReg(Reg, TRI) << '\n';
 }
 
 void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
@@ -905,7 +909,7 @@
           if (!DRC->contains(Reg)) {
             report("Illegal physical register for instruction", MO, MONum);
             *OS << TRI->getName(Reg) << " is not a "
-                << DRC->getName() << " register.\n";
+                << TRI->getRegClassName(DRC) << " register.\n";
           }
         }
       } else {
@@ -916,13 +920,13 @@
             TRI->getSubClassWithSubReg(RC, SubIdx);
           if (!SRC) {
             report("Invalid subregister index for virtual register", MO, MONum);
-            *OS << "Register class " << RC->getName()
+            *OS << "Register class " << TRI->getRegClassName(RC)
                 << " does not support subreg index " << SubIdx << "\n";
             return;
           }
           if (RC != SRC) {
             report("Invalid register class for subregister index", MO, MONum);
-            *OS << "Register class " << RC->getName()
+            *OS << "Register class " << TRI->getRegClassName(RC)
                 << " does not fully support subreg index " << SubIdx << "\n";
             return;
           }
@@ -944,8 +948,9 @@
           }
           if (!RC->hasSuperClassEq(DRC)) {
             report("Illegal virtual register for instruction", MO, MONum);
-            *OS << "Expected a " << DRC->getName() << " register, but got a "
-                << RC->getName() << " register\n";
+            *OS << "Expected a " << TRI->getRegClassName(DRC)
+                << " register, but got a " << TRI->getRegClassName(RC)
+                << " register\n";
           }
         }
       }
@@ -1357,13 +1362,13 @@
   const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def);
 
   if (!DefVNI) {
-    report("Valno not live at def and not marked unused", MF, LR);
+    report("Valno not live at def and not marked unused", MF, LR, Reg);
     *OS << "Valno #" << VNI->id << '\n';
     return;
   }
 
   if (DefVNI != VNI) {
-    report("Live segment at def has different valno", MF, LR);
+    report("Live segment at def has different valno", MF, LR, Reg);
     *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
         << " where valno #" << DefVNI->id << " is live\n";
     return;
@@ -1371,7 +1376,7 @@
 
   const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
   if (!MBB) {
-    report("Invalid definition index", MF, LR);
+    report("Invalid definition index", MF, LR, Reg);
     *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
         << " in " << LR << '\n';
     return;
@@ -1379,7 +1384,7 @@
 
   if (VNI->isPHIDef()) {
     if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
-      report("PHIDef value is not defined at MBB start", MBB, LR);
+      report("PHIDef value is not defined at MBB start", MBB, LR, Reg);
       *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
           << ", not at the beginning of BB#" << MBB->getNumber() << '\n';
     }
@@ -1389,7 +1394,7 @@
   // Non-PHI def.
   const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
   if (!MI) {
-    report("No instruction at def index", MBB, LR);
+    report("No instruction at def index", MBB, LR, Reg);
     *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     return;
   }
@@ -1422,12 +1427,13 @@
     // DEF slots.
     if (isEarlyClobber) {
       if (!VNI->def.isEarlyClobber()) {
-        report("Early clobber def must be at an early-clobber slot", MBB, LR);
+        report("Early clobber def must be at an early-clobber slot", MBB, LR,
+               Reg);
         *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
       }
     } else if (!VNI->def.isRegister()) {
       report("Non-PHI, non-early clobber def must be at a register slot",
-             MBB, LR);
+             MBB, LR, Reg);
       *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     }
   }
@@ -1441,31 +1447,31 @@
   assert(VNI && "Live segment has no valno");
 
   if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) {
-    report("Foreign valno in live segment", MF, LR);
+    report("Foreign valno in live segment", MF, LR, Reg);
     *OS << S << " has a bad valno\n";
   }
 
   if (VNI->isUnused()) {
-    report("Live segment valno is marked unused", MF, LR);
+    report("Live segment valno is marked unused", MF, LR, Reg);
     *OS << S << '\n';
   }
 
   const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start);
   if (!MBB) {
-    report("Bad start of live segment, no basic block", MF, LR);
+    report("Bad start of live segment, no basic block", MF, LR, Reg);
     *OS << S << '\n';
     return;
   }
   SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
   if (S.start != MBBStartIdx && S.start != VNI->def) {
-    report("Live segment must begin at MBB entry or valno def", MBB, LR);
+    report("Live segment must begin at MBB entry or valno def", MBB, LR, Reg);
     *OS << S << '\n';
   }
 
   const MachineBasicBlock *EndMBB =
     LiveInts->getMBBFromIndex(S.end.getPrevSlot());
   if (!EndMBB) {
-    report("Bad end of live segment, no basic block", MF, LR);
+    report("Bad end of live segment, no basic block", MF, LR, Reg);
     *OS << S << '\n';
     return;
   }
@@ -1483,14 +1489,14 @@
   const MachineInstr *MI =
     LiveInts->getInstructionFromIndex(S.end.getPrevSlot());
   if (!MI) {
-    report("Live segment doesn't end at a valid instruction", EndMBB, LR);
+    report("Live segment doesn't end at a valid instruction", EndMBB, LR, Reg);
     *OS << S << '\n';
     return;
   }
 
   // The block slot must refer to a basic block boundary.
   if (S.end.isBlock()) {
-    report("Live segment ends at B slot of an instruction", EndMBB, LR);
+    report("Live segment ends at B slot of an instruction", EndMBB, LR, Reg);
     *OS << S << '\n';
   }
 
@@ -1498,7 +1504,8 @@
     // Segment ends on the dead slot.
     // That means there must be a dead def.
     if (!SlotIndex::isSameInstr(S.start, S.end)) {
-      report("Live segment ending at dead slot spans instructions", EndMBB, LR);
+      report("Live segment ending at dead slot spans instructions", EndMBB, LR,
+             Reg);
       *OS << S << '\n';
     }
   }
@@ -1508,7 +1515,7 @@
   if (S.end.isEarlyClobber()) {
     if (I+1 == LR.end() || (I+1)->start != S.end) {
       report("Live segment ending at early clobber slot must be "
-             "redefined by an EC def in the same instruction", EndMBB, LR);
+             "redefined by an EC def in the same instruction", EndMBB, LR, Reg);
       *OS << S << '\n';
     }
   }
@@ -1566,7 +1573,7 @@
 
       // All predecessors must have a live-out value.
       if (!PVNI) {
-        report("Register not marked live out of predecessor", *PI, LR);
+        report("Register not marked live out of predecessor", *PI, LR, Reg);
         *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
             << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before "
             << PEnd << '\n';
@@ -1575,7 +1582,7 @@
 
       // Only PHI-defs can take different predecessor values.
       if (!IsPHI && PVNI != VNI) {
-        report("Different value live out of predecessor", *PI, LR);
+        report("Different value live out of predecessor", *PI, LR, Reg);
         *OS << "Valno #" << PVNI->id << " live out of BB#"
             << (*PI)->getNumber() << '@' << PEnd
             << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber()

diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp
index 95a2934..a1042e7 100644
--- a/lib/CodeGen/OptimizePHIs.cpp
+++ b/lib/CodeGen/OptimizePHIs.cpp

@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "phi-opt"
@@ -66,7 +67,7 @@
     return false;
 
   MRI = &Fn.getRegInfo();
-  TII = Fn.getTarget().getInstrInfo();
+  TII = Fn.getSubtarget().getInstrInfo();
 
   // Find dead PHI cycles and PHI cycles that can be replaced by a single
   // value.  InstCombine does these optimizations, but DAG legalization may
@@ -91,7 +92,7 @@
   unsigned DstReg = MI->getOperand(0).getReg();
 
   // See if we already saw this register.
-  if (!PHIsInCycle.insert(MI))
+  if (!PHIsInCycle.insert(MI).second)
     return true;
 
   // Don't scan crazily complex things.
@@ -136,7 +137,7 @@
          "PHI destination is not a virtual register");
 
   // See if we already saw this register.
-  if (!PHIsInCycle.insert(MI))
+  if (!PHIsInCycle.insert(MI).second)
     return true;
 
   // Don't scan crazily complex things.

diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index c8d0819..def2e3d 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp

@@ -30,7 +30,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -150,9 +150,7 @@
     Changed |= EliminatePHINodes(MF, *I);
 
   // Remove dead IMPLICIT_DEF instructions.
-  for (SmallPtrSet<MachineInstr*, 4>::iterator I = ImpDefs.begin(),
-         E = ImpDefs.end(); I != E; ++I) {
-    MachineInstr *DefMI = *I;
+  for (MachineInstr *DefMI : ImpDefs) {
     unsigned DefReg = DefMI->getOperand(0).getReg();
     if (MRI->use_nodbg_empty(DefReg)) {
       if (LIS)
@@ -240,7 +238,7 @@
   // Insert a register to register copy at the top of the current block (but
   // after any remaining phi nodes) which copies the new incoming register
   // into the phi node destination.
-  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   if (isSourceDefinedByImplicitDef(MPhi, MRI))
     // If all sources of a PHI node are implicit_def, just emit an
     // implicit_def instead of a copy.
@@ -369,7 +367,7 @@
     // Check to make sure we haven't already emitted the copy for this block.
     // This can happen because PHI nodes may have multiple entries for the same
     // basic block.
-    if (!MBBsInsertedInto.insert(&opBlock))
+    if (!MBBsInsertedInto.insert(&opBlock).second)
       continue;  // If the copy has already been emitted, we're done.
 
     // Find a safe location to insert the copy, this may be the first terminator

diff --git a/lib/CodeGen/PHIEliminationUtils.h b/lib/CodeGen/PHIEliminationUtils.h
index 48234ae..b997d7a 100644
--- a/lib/CodeGen/PHIEliminationUtils.h
+++ b/lib/CodeGen/PHIEliminationUtils.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_PHIELIMINATIONUTILS_H
-#define LLVM_CODEGEN_PHIELIMINATIONUTILS_H
+#ifndef LLVM_LIB_CODEGEN_PHIELIMINATIONUTILS_H
+#define LLVM_LIB_CODEGEN_PHIELIMINATIONUTILS_H
 
 #include "llvm/CodeGen/MachineBasicBlock.h"
 

diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index 249b2d0..ec71d86 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp

@@ -27,6 +27,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/SymbolRewriter.h"
 
 using namespace llvm;
 
@@ -71,6 +72,8 @@
     cl::desc("Disable Codegen Prepare"));
 static cl::opt<bool> DisableCopyProp("disable-copyprop", cl::Hidden,
     cl::desc("Disable Copy Propagation pass"));
+static cl::opt<bool> DisablePartialLibcallInlining("disable-partial-libcall-inlining",
+    cl::Hidden, cl::desc("Disable Partial Libcall Inlining"));
 static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
     cl::desc("Print LLVM IR produced by the loop-reduce pass"));
 static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
@@ -97,6 +100,10 @@
 static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
     cl::desc("Run live interval analysis earlier in the pipeline"));
 
+static cl::opt<bool> UseCFLAA("use-cfl-aa-in-codegen",
+  cl::init(false), cl::Hidden,
+  cl::desc("Enable the new, experimental CFL alias analysis in CodeGen"));
+
 /// Allow standard passes to be disabled by command line options. This supports
 /// simple binary flags that either suppress the pass or do nothing.
 /// i.e. -disable-mypass=false has no effect.
@@ -374,7 +381,10 @@
   // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
   // BasicAliasAnalysis wins if they disagree. This is intended to help
   // support "obvious" type-punning idioms.
+  if (UseCFLAA)
+    addPass(createCFLAliasAnalysisPass());
   addPass(createTypeBasedAliasAnalysisPass());
+  addPass(createScopedNoAliasAAPass());
   addPass(createBasicAliasAnalysisPass());
 
   // Before running any passes, run the verifier to determine if the input
@@ -399,6 +409,9 @@
   // Prepare expensive constants for SelectionDAG.
   if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting)
     addPass(createConstantHoistingPass());
+
+  if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining)
+    addPass(createPartiallyInlineLibCallsPass());
 }
 
 /// Turn exception handling constructs into something the code generators can
@@ -416,7 +429,7 @@
     // FALLTHROUGH
   case ExceptionHandling::DwarfCFI:
   case ExceptionHandling::ARM:
-  case ExceptionHandling::WinEH:
+  case ExceptionHandling::ItaniumWinEH:
     addPass(createDwarfEHPass(TM));
     break;
   case ExceptionHandling::None:
@@ -433,6 +446,7 @@
 void TargetPassConfig::addCodeGenPrepare() {
   if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
     addPass(createCodeGenPreparePass(TM));
+  addPass(createRewriteSymbolsPass());
 }
 
 /// Add common passes that perform LLVM IR to IR transforms in preparation for
@@ -601,6 +615,9 @@
   printAndVerify("After Machine LICM, CSE and Sinking passes");
 
   addPass(&PeepholeOptimizerID);
+  // Clean up the dead code that may have been generated by peephole
+  // rewriting.
+  addPass(&DeadMachineInstructionElimID);
   printAndVerify("After codegen peephole optimization pass");
 }
 
@@ -675,6 +692,12 @@
   return createTargetRegisterAllocator(Optimized);
 }
 
+/// Return true if the default global register allocator is in use and
+/// has not been overridden on the command line with '-regalloc=...'
+bool TargetPassConfig::usingDefaultRegAlloc() const {
+  return RegAlloc.getNumOccurrences() == 0;
+}
+
 /// Add the minimum set of target-independent passes that are required for
 /// register allocation. No coalescing or scheduling.
 void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
@@ -709,6 +732,7 @@
 
   addPass(&TwoAddressInstructionPassID);
   addPass(&RegisterCoalescerID);
+  printAndVerify("After Register Coalescing");
 
   // PreRA instruction scheduling.
   if (addPass(&MachineSchedulerID))

diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 716cb1f..a296aea 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp

@@ -46,7 +46,7 @@
 //     if it loads to virtual registers and the virtual register defined has 
 //     a single use.
 //
-// - Optimize Copies and Bitcast:
+// - Optimize Copies and Bitcast (more generally, target specific copies):
 //
 //     Rewrite copies and bitcasts to avoid cross register bank copies
 //     when possible.
@@ -78,6 +78,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <utility>
 using namespace llvm;
 
 #define DEBUG_TYPE "peephole-opt"
@@ -92,7 +94,7 @@
                 cl::desc("Disable the peephole optimizer"));
 
 static cl::opt<bool>
-DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(true),
+DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false),
                   cl::desc("Disable advanced copy optimization"));
 
 STATISTIC(NumReuse,      "Number of extension results reused");
@@ -100,12 +102,13 @@
 STATISTIC(NumImmFold,    "Number of move immediate folded");
 STATISTIC(NumLoadFold,   "Number of loads folded");
 STATISTIC(NumSelects,    "Number of selects optimized");
-STATISTIC(NumCopiesBitcasts, "Number of copies/bitcasts optimized");
+STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized");
+STATISTIC(NumRewrittenCopies, "Number of copies rewritten");
 
 namespace {
   class PeepholeOptimizer : public MachineFunctionPass {
-    const TargetMachine   *TM;
     const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
     MachineRegisterInfo   *MRI;
     MachineDominatorTree  *DT;  // Machine dominator tree
 
@@ -129,9 +132,14 @@
   private:
     bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
-                          SmallPtrSet<MachineInstr*, 8> &LocalMIs);
+                          SmallPtrSetImpl<MachineInstr*> &LocalMIs);
     bool optimizeSelect(MachineInstr *MI);
+    bool optimizeCondBranch(MachineInstr *MI);
     bool optimizeCopyOrBitcast(MachineInstr *MI);
+    bool optimizeCoalescableCopy(MachineInstr *MI);
+    bool optimizeUncoalescableCopy(MachineInstr *MI,
+                                   SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+    bool findNextSource(unsigned &Reg, unsigned &SubReg);
     bool isMoveImmediate(MachineInstr *MI,
                          SmallSet<unsigned, 4> &ImmDefRegs,
                          DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
@@ -140,6 +148,25 @@
                        DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
     bool isLoadFoldable(MachineInstr *MI,
                         SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
+
+    /// \brief Check whether \p MI is understood by the register coalescer
+    /// but may require some rewriting.
+    bool isCoalescableCopy(const MachineInstr &MI) {
+      // SubregToRegs are not interesting, because they are already register
+      // coalescer friendly.
+      return MI.isCopy() || (!DisableAdvCopyOpt &&
+                             (MI.isRegSequence() || MI.isInsertSubreg() ||
+                              MI.isExtractSubreg()));
+    }
+
+    /// \brief Check whether \p MI is a copy like instruction that is
+    /// not recognized by the register coalescer.
+    bool isUncoalescableCopy(const MachineInstr &MI) {
+      return MI.isBitcast() ||
+             (!DisableAdvCopyOpt &&
+              (MI.isRegSequenceLike() || MI.isInsertSubregLike() ||
+               MI.isExtractSubregLike()));
+    }
   };
 
   /// \brief Helper class to track the possible sources of a value defined by
@@ -176,63 +203,87 @@
     /// the ValueTracker class but that would have complicated the code of
     /// the users of this class.
     bool UseAdvancedTracking;
-    /// Optional MachineRegisterInfo used to perform some complex
+    /// MachineRegisterInfo used to perform tracking.
+    const MachineRegisterInfo &MRI;
+    /// Optional TargetInstrInfo used to perform some complex
     /// tracking.
-    const MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII;
 
     /// \brief Dispatcher to the right underlying implementation of
     /// getNextSource.
-    bool getNextSourceImpl(unsigned &SrcIdx, unsigned &SrcSubReg);
+    bool getNextSourceImpl(unsigned &SrcReg, unsigned &SrcSubReg);
     /// \brief Specialized version of getNextSource for Copy instructions.
-    bool getNextSourceFromCopy(unsigned &SrcIdx, unsigned &SrcSubReg);
+    bool getNextSourceFromCopy(unsigned &SrcReg, unsigned &SrcSubReg);
     /// \brief Specialized version of getNextSource for Bitcast instructions.
-    bool getNextSourceFromBitcast(unsigned &SrcIdx, unsigned &SrcSubReg);
+    bool getNextSourceFromBitcast(unsigned &SrcReg, unsigned &SrcSubReg);
     /// \brief Specialized version of getNextSource for RegSequence
     /// instructions.
-    bool getNextSourceFromRegSequence(unsigned &SrcIdx, unsigned &SrcSubReg);
+    bool getNextSourceFromRegSequence(unsigned &SrcReg, unsigned &SrcSubReg);
     /// \brief Specialized version of getNextSource for InsertSubreg
     /// instructions.
-    bool getNextSourceFromInsertSubreg(unsigned &SrcIdx, unsigned &SrcSubReg);
+    bool getNextSourceFromInsertSubreg(unsigned &SrcReg, unsigned &SrcSubReg);
     /// \brief Specialized version of getNextSource for ExtractSubreg
     /// instructions.
-    bool getNextSourceFromExtractSubreg(unsigned &SrcIdx, unsigned &SrcSubReg);
+    bool getNextSourceFromExtractSubreg(unsigned &SrcReg, unsigned &SrcSubReg);
     /// \brief Specialized version of getNextSource for SubregToReg
     /// instructions.
-    bool getNextSourceFromSubregToReg(unsigned &SrcIdx, unsigned &SrcSubReg);
+    bool getNextSourceFromSubregToReg(unsigned &SrcReg, unsigned &SrcSubReg);
 
   public:
-    /// \brief Create a ValueTracker instance for the value defines by \p MI
-    /// at the operand index \p DefIdx.
+    /// \brief Create a ValueTracker instance for the value defined by \p Reg.
     /// \p DefSubReg represents the sub register index the value tracker will
-    /// track. It does not need to match the sub register index used in \p MI.
+    /// track. It does not need to match the sub register index used in the
+    /// definition of \p Reg.
     /// \p UseAdvancedTracking specifies whether or not the value tracker looks
     /// through complex instructions. By default (false), it handles only copy
     /// and bitcast instructions.
-    /// \p MRI useful to perform some complex checks.
-    ValueTracker(const MachineInstr &MI, unsigned DefIdx, unsigned DefSubReg,
+    /// If \p Reg is a physical register, a value tracker constructed with
+    /// this constructor will not find any alternative source.
+    /// Indeed, when \p Reg is a physical register that constructor does not
+    /// know which definition of \p Reg it should track.
+    /// Use the next constructor to track a physical register.
+    ValueTracker(unsigned Reg, unsigned DefSubReg,
+                 const MachineRegisterInfo &MRI,
                  bool UseAdvancedTracking = false,
-                 const MachineRegisterInfo *MRI = nullptr)
+                 const TargetInstrInfo *TII = nullptr)
+        : Def(nullptr), DefIdx(0), DefSubReg(DefSubReg), Reg(Reg),
+          UseAdvancedTracking(UseAdvancedTracking), MRI(MRI), TII(TII) {
+      if (!TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        Def = MRI.getVRegDef(Reg);
+        DefIdx = MRI.def_begin(Reg).getOperandNo();
+      }
+    }
+
+    /// \brief Create a ValueTracker instance for the value defined by
+    /// the pair \p MI, \p DefIdx.
+    /// Unlike the other constructor, the value tracker produced by this one
+    /// may be able to find a new source when the definition is a physical
+    /// register.
+    /// This could be useful to rewrite target specific instructions into
+    /// generic copy instructions.
+    ValueTracker(const MachineInstr &MI, unsigned DefIdx, unsigned DefSubReg,
+                 const MachineRegisterInfo &MRI,
+                 bool UseAdvancedTracking = false,
+                 const TargetInstrInfo *TII = nullptr)
         : Def(&MI), DefIdx(DefIdx), DefSubReg(DefSubReg),
-          UseAdvancedTracking(UseAdvancedTracking), MRI(MRI) {
-      assert(Def->getOperand(DefIdx).isDef() &&
-             Def->getOperand(DefIdx).isReg() &&
-             "Definition does not match machine instruction");
-      // Initially the value is in the defined register.
+          UseAdvancedTracking(UseAdvancedTracking), MRI(MRI), TII(TII) {
+      assert(DefIdx < Def->getDesc().getNumDefs() &&
+             Def->getOperand(DefIdx).isReg() && "Invalid definition");
       Reg = Def->getOperand(DefIdx).getReg();
     }
 
     /// \brief Following the use-def chain, get the next available source
     /// for the tracked value.
-    /// When the returned value is not nullptr, getReg() gives the register
+    /// When the returned value is not nullptr, \p SrcReg gives the register
     /// that contain the tracked value.
     /// \note The sub register index returned in \p SrcSubReg must be used
-    /// on that getReg() to access the actual value.
+    /// on \p SrcReg to access the actual value.
     /// \return Unless the returned value is nullptr (i.e., no source found),
-    /// \p SrcIdx gives the index of the next source in the returned
-    /// instruction and \p SrcSubReg the index to be used on that source to
-    /// get the tracked value. When nullptr is returned, no alternative source
-    /// has been found.
-    const MachineInstr *getNextSource(unsigned &SrcIdx, unsigned &SrcSubReg);
+    /// \p SrcReg gives the register of the next source used in the returned
+    /// instruction and \p SrcSubReg the sub-register index to be used on that
+    /// source to get the tracked value. When nullptr is returned, no
+    /// alternative source has been found.
+    const MachineInstr *getNextSource(unsigned &SrcReg, unsigned &SrcSubReg);
 
     /// \brief Get the last register where the initial value can be found.
     /// Initially this is the register of the definition.
@@ -261,7 +312,7 @@
 /// debug uses.
 bool PeepholeOptimizer::
 optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
-                 SmallPtrSet<MachineInstr*, 8> &LocalMIs) {
+                 SmallPtrSetImpl<MachineInstr*> &LocalMIs) {
   unsigned SrcReg, DstReg, SubIdx;
   if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx))
     return false;
@@ -277,7 +328,7 @@
   // Ensure DstReg can get a register class that actually supports
   // sub-registers. Don't change the class until we commit.
   const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
-  DstRC = TM->getRegisterInfo()->getSubClassWithSubReg(DstRC, SubIdx);
+  DstRC = TRI->getSubClassWithSubReg(DstRC, SubIdx);
   if (!DstRC)
     return false;
 
@@ -286,8 +337,8 @@
   // register.
   // If UseSrcSubIdx is Set, SubIdx also applies to SrcReg, and only uses of
   // SrcReg:SubIdx should be replaced.
-  bool UseSrcSubIdx = TM->getRegisterInfo()->
-    getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != nullptr;
+  bool UseSrcSubIdx =
+      TRI->getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != nullptr;
 
   // The source has other uses. See if we can replace the other uses with use of
   // the result of the extension.
@@ -447,6 +498,12 @@
   return true;
 }
 
+/// \brief Check if a simpler conditional branch can be
+/// generated.
+bool PeepholeOptimizer::optimizeCondBranch(MachineInstr *MI) {
+  return TII->optimizeCondBranch(MI);
+}
+
 /// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
 /// share the same register file.
 static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
@@ -477,85 +534,34 @@
   return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr;
 }
 
-/// \brief Get the index of the definition and source for \p Copy
-/// instruction.
-/// \pre Copy.isCopy() or Copy.isBitcast().
-/// \return True if the Copy instruction has only one register source
-/// and one register definition. Otherwise, \p DefIdx and \p SrcIdx
-/// are invalid.
-static bool getCopyOrBitcastDefUseIdx(const MachineInstr &Copy,
-                                      unsigned &DefIdx, unsigned &SrcIdx) {
-  assert((Copy.isCopy() || Copy.isBitcast()) && "Wrong operation type.");
-  if (Copy.isCopy()) {
-    // Copy instruction are supposed to be: Def = Src.
-     if (Copy.getDesc().getNumOperands() != 2)
-       return false;
-     DefIdx = 0;
-     SrcIdx = 1;
-     assert(Copy.getOperand(DefIdx).isDef() && "Use comes before def!");
-     return true;
-  }
-  // Bitcast case.
-  // Bitcasts with more than one def are not supported.
-  if (Copy.getDesc().getNumDefs() != 1)
-    return false;
-  // Initialize SrcIdx to an undefined operand.
-  SrcIdx = Copy.getDesc().getNumOperands();
-  for (unsigned OpIdx = 0, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; ++OpIdx) {
-    const MachineOperand &MO = Copy.getOperand(OpIdx);
-    if (!MO.isReg() || !MO.getReg())
-      continue;
-    if (MO.isDef())
-      DefIdx = OpIdx;
-    else if (SrcIdx != EndOpIdx)
-      // Multiple sources?
-      return false;
-    SrcIdx = OpIdx;
-  }
-  return true;
-}
-
-/// \brief Optimize a copy or bitcast instruction to avoid cross
-/// register bank copy. The optimization looks through a chain of
-/// copies and try to find a source that has a compatible register
-/// class.
-/// Two register classes are considered to be compatible if they share
-/// the same register bank.
-/// New copies issued by this optimization are register allocator
-/// friendly. This optimization does not remove any copy as it may
-/// overconstraint the register allocator, but replaces some when
-/// possible.
-/// \pre \p MI is a Copy (MI->isCopy() is true)
-/// \return True, when \p MI has been optimized. In that case, \p MI has
-/// been removed from its parent.
-bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) {
-  unsigned DefIdx, SrcIdx;
-  if (!MI || !getCopyOrBitcastDefUseIdx(*MI, DefIdx, SrcIdx))
+/// \brief Try to find the next source that shares the same register file
+/// for the value defined by \p Reg and \p SubReg.
+/// When true is returned, \p Reg and \p SubReg are updated with the
+/// register number and sub-register index of the new source.
+/// \return False if no alternative sources are available. True otherwise.
+bool PeepholeOptimizer::findNextSource(unsigned &Reg, unsigned &SubReg) {
+  // Do not try to find a new source for a physical register.
+  // So far we do not have any motivating example for doing that.
+  // Thus, instead of maintaining untested code, we will revisit that if
+  // that changes at some point.
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
     return false;
 
-  const MachineOperand &MODef = MI->getOperand(DefIdx);
-  assert(MODef.isReg() && "Copies must be between registers.");
-  unsigned Def = MODef.getReg();
-
-  if (TargetRegisterInfo::isPhysicalRegister(Def))
-    return false;
-
-  const TargetRegisterClass *DefRC = MRI->getRegClass(Def);
-  unsigned DefSubReg = MODef.getSubReg();
+  const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+  unsigned DefSubReg = SubReg;
 
   unsigned Src;
   unsigned SrcSubReg;
   bool ShouldRewrite = false;
-  const TargetRegisterInfo &TRI = *TM->getRegisterInfo();
 
   // Follow the chain of copies until we reach the top of the use-def chain
   // or find a more suitable source.
-  ValueTracker ValTracker(*MI, DefIdx, DefSubReg, !DisableAdvCopyOpt, MRI);
+  ValueTracker ValTracker(Reg, DefSubReg, *MRI, !DisableAdvCopyOpt, TII);
   do {
-    unsigned CopySrcIdx, CopySrcSubReg;
-    if (!ValTracker.getNextSource(CopySrcIdx, CopySrcSubReg))
+    unsigned CopySrcReg, CopySrcSubReg;
+    if (!ValTracker.getNextSource(CopySrcReg, CopySrcSubReg))
       break;
-    Src = ValTracker.getReg();
+    Src = CopySrcReg;
     SrcSubReg = CopySrcSubReg;
 
     // Do not extend the live-ranges of physical registers as they add
@@ -569,29 +575,411 @@
     const TargetRegisterClass *SrcRC = MRI->getRegClass(Src);
 
     // If this source does not incur a cross register bank copy, use it.
-    ShouldRewrite = shareSameRegisterFile(TRI, DefRC, DefSubReg, SrcRC,
+    ShouldRewrite = shareSameRegisterFile(*TRI, DefRC, DefSubReg, SrcRC,
                                           SrcSubReg);
   } while (!ShouldRewrite);
 
   // If we did not find a more suitable source, there is nothing to optimize.
-  if (!ShouldRewrite || Src == MI->getOperand(SrcIdx).getReg())
+  if (!ShouldRewrite || Src == Reg)
     return false;
 
-  // Rewrite the copy to avoid a cross register bank penalty. 
-  unsigned NewVR = TargetRegisterInfo::isPhysicalRegister(Def) ? Def :
-    MRI->createVirtualRegister(DefRC);
-  MachineInstr *NewCopy = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                                  TII->get(TargetOpcode::COPY), NewVR)
-    .addReg(Src, 0, SrcSubReg);
-  NewCopy->getOperand(0).setSubReg(DefSubReg);
+  Reg = Src;
+  SubReg = SrcSubReg;
+  return true;
+}
 
-  MRI->replaceRegWith(Def, NewVR);
-  MRI->clearKillFlags(NewVR);
-  // We extended the lifetime of Src.
-  // Clear the kill flags to account for that.
-  MRI->clearKillFlags(Src);
+namespace {
+/// \brief Helper class to rewrite the arguments of a copy-like instruction.
+class CopyRewriter {
+protected:
+  /// The copy-like instruction.
+  MachineInstr &CopyLike;
+  /// The index of the source being rewritten.
+  unsigned CurrentSrcIdx;
+
+public:
+  CopyRewriter(MachineInstr &MI) : CopyLike(MI), CurrentSrcIdx(0) {}
+
+  virtual ~CopyRewriter() {}
+
+  /// \brief Get the next rewritable source (SrcReg, SrcSubReg) and
+  /// the related value that it affects (TrackReg, TrackSubReg).
+  /// A source is considered rewritable if its register class and the
+  /// register class of the related TrackReg may not be register
+  /// coalescer friendly. In other words, given a copy-like instruction
+  /// not all the arguments may be returned as rewritable sources, since
+  /// some arguments are known to be register coalescer friendly.
+  ///
+  /// Each call of this method moves the current source to the next
+  /// rewritable source.
+  /// For instance, let CopyLike be the instruction to rewrite.
+  /// CopyLike has one definition and one source:
+  /// dst.dstSubIdx = CopyLike src.srcSubIdx.
+  ///
+  /// The first call will give the first rewritable source, i.e.,
+  /// the only source this instruction has:
+  /// (SrcReg, SrcSubReg) = (src, srcSubIdx).
+  /// This source defines the whole definition, i.e.,
+  /// (TrackReg, TrackSubReg) = (dst, dstSubIdx).
+  ///
+  /// The second and subsequent calls will return false, as there is only one
+  /// rewritable source.
+  ///
+  /// \return True if a rewritable source has been found, false otherwise.
+  /// The output arguments are valid if and only if true is returned.
+  virtual bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
+                                       unsigned &TrackReg,
+                                       unsigned &TrackSubReg) {
+    // If CurrentSrcIdx == 1, this means this function has already been
+    // called once. CopyLike has one definition and one argument, thus,
+    // there is nothing else to rewrite.
+    if (!CopyLike.isCopy() || CurrentSrcIdx == 1)
+      return false;
+    // This is the first call to getNextRewritableSource.
+    // Move the CurrentSrcIdx to remember that we made that call.
+    CurrentSrcIdx = 1;
+    // The rewritable source is the argument.
+    const MachineOperand &MOSrc = CopyLike.getOperand(1);
+    SrcReg = MOSrc.getReg();
+    SrcSubReg = MOSrc.getSubReg();
+    // What we track are the alternative sources of the definition.
+    const MachineOperand &MODef = CopyLike.getOperand(0);
+    TrackReg = MODef.getReg();
+    TrackSubReg = MODef.getSubReg();
+    return true;
+  }
+
+  /// \brief Rewrite the current source with \p NewReg and \p NewSubReg
+  /// if possible.
+  /// \return True if the rewriting was possible, false otherwise.
+  virtual bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) {
+    if (!CopyLike.isCopy() || CurrentSrcIdx != 1)
+      return false;
+    MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx);
+    MOSrc.setReg(NewReg);
+    MOSrc.setSubReg(NewSubReg);
+    return true;
+  }
+};
+
+/// \brief Specialized rewriter for INSERT_SUBREG instruction.
+class InsertSubregRewriter : public CopyRewriter {
+public:
+  InsertSubregRewriter(MachineInstr &MI) : CopyRewriter(MI) {
+    assert(MI.isInsertSubreg() && "Invalid instruction");
+  }
+
+  /// \brief See CopyRewriter::getNextRewritableSource.
+  /// Here CopyLike has the following form:
+  /// dst = INSERT_SUBREG Src1, Src2.src2SubIdx, subIdx.
+  /// Src1 has the same register class as dst, hence, there is
+  /// nothing to rewrite.
+  /// Src2.src2SubIdx, may not be register coalescer friendly.
+  /// Therefore, the first call to this method returns:
+  /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx).
+  /// (TrackReg, TrackSubReg) = (dst, subIdx).
+  ///
+  /// Subsequent calls will return false.
+  bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
+                               unsigned &TrackReg,
+                               unsigned &TrackSubReg) override {
+    // If we already get the only source we can rewrite, return false.
+    if (CurrentSrcIdx == 2)
+      return false;
+    // We are looking at v2 = INSERT_SUBREG v0, v1, sub0.
+    CurrentSrcIdx = 2;
+    const MachineOperand &MOInsertedReg = CopyLike.getOperand(2);
+    SrcReg = MOInsertedReg.getReg();
+    SrcSubReg = MOInsertedReg.getSubReg();
+    const MachineOperand &MODef = CopyLike.getOperand(0);
+
+    // We want to track something that is compatible with the
+    // partial definition.
+    TrackReg = MODef.getReg();
+    if (MODef.getSubReg())
+      // Bails if we have to compose sub-register indices.
+      return false;
+    TrackSubReg = (unsigned)CopyLike.getOperand(3).getImm();
+    return true;
+  }
+  bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override {
+    if (CurrentSrcIdx != 2)
+      return false;
+    // We are rewriting the inserted reg.
+    MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
+    MO.setReg(NewReg);
+    MO.setSubReg(NewSubReg);
+    return true;
+  }
+};
+
+/// \brief Specialized rewriter for EXTRACT_SUBREG instruction.
+class ExtractSubregRewriter : public CopyRewriter {
+  const TargetInstrInfo &TII;
+
+public:
+  ExtractSubregRewriter(MachineInstr &MI, const TargetInstrInfo &TII)
+      : CopyRewriter(MI), TII(TII) {
+    assert(MI.isExtractSubreg() && "Invalid instruction");
+  }
+
+  /// \brief See CopyRewriter::getNextRewritableSource.
+  /// Here CopyLike has the following form:
+  /// dst.dstSubIdx = EXTRACT_SUBREG Src, subIdx.
+  /// There is only one rewritable source: Src.subIdx,
+  /// which defines dst.dstSubIdx.
+  bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
+                               unsigned &TrackReg,
+                               unsigned &TrackSubReg) override {
+    // If we already get the only source we can rewrite, return false.
+    if (CurrentSrcIdx == 1)
+      return false;
+    // We are looking at v1 = EXTRACT_SUBREG v0, sub0.
+    CurrentSrcIdx = 1;
+    const MachineOperand &MOExtractedReg = CopyLike.getOperand(1);
+    SrcReg = MOExtractedReg.getReg();
+    // If we have to compose sub-register indices, bails out.
+    if (MOExtractedReg.getSubReg())
+      return false;
+
+    SrcSubReg = CopyLike.getOperand(2).getImm();
+
+    // We want to track something that is compatible with the definition.
+    const MachineOperand &MODef = CopyLike.getOperand(0);
+    TrackReg = MODef.getReg();
+    TrackSubReg = MODef.getSubReg();
+    return true;
+  }
+
+  bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override {
+    // The only source we can rewrite is the input register.
+    if (CurrentSrcIdx != 1)
+      return false;
+
+    CopyLike.getOperand(CurrentSrcIdx).setReg(NewReg);
+
+    // If we find a source that does not require to extract something,
+    // rewrite the operation with a copy.
+    if (!NewSubReg) {
+      // Move the current index to an invalid position.
+      // We do not want another call to this method to be able
+      // to do any change.
+      CurrentSrcIdx = -1;
+      // Rewrite the operation as a COPY.
+      // Get rid of the sub-register index.
+      CopyLike.RemoveOperand(2);
+      // Morph the operation into a COPY.
+      CopyLike.setDesc(TII.get(TargetOpcode::COPY));
+      return true;
+    }
+    CopyLike.getOperand(CurrentSrcIdx + 1).setImm(NewSubReg);
+    return true;
+  }
+};
+
+/// \brief Specialized rewriter for REG_SEQUENCE instruction.
+class RegSequenceRewriter : public CopyRewriter {
+public:
+  RegSequenceRewriter(MachineInstr &MI) : CopyRewriter(MI) {
+    assert(MI.isRegSequence() && "Invalid instruction");
+  }
+
+  /// \brief See CopyRewriter::getNextRewritableSource.
+  /// Here CopyLike has the following form:
+  /// dst = REG_SEQUENCE Src1.src1SubIdx, subIdx1, Src2.src2SubIdx, subIdx2.
+  /// Each call will return a different source, walking all the available
+  /// sources.
+  ///
+  /// The first call returns:
+  /// (SrcReg, SrcSubReg) = (Src1, src1SubIdx).
+  /// (TrackReg, TrackSubReg) = (dst, subIdx1).
+  ///
+  /// The second call returns:
+  /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx).
+  /// (TrackReg, TrackSubReg) = (dst, subIdx2).
+  ///
+  /// And so on, until all the sources have been traversed, then
+  /// it returns false.
+  bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
+                               unsigned &TrackReg,
+                               unsigned &TrackSubReg) override {
+    // We are looking at v0 = REG_SEQUENCE v1, sub1, v2, sub2, etc.
+    // If this is the first call, move to the first argument.
+    if (CurrentSrcIdx == 0) {
+      CurrentSrcIdx = 1;
+    } else {
+      // Otherwise, move to the next argument and check that it is valid.
+      CurrentSrcIdx += 2;
+      if (CurrentSrcIdx >= CopyLike.getNumOperands())
+        return false;
+    }
+    const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx);
+    SrcReg = MOInsertedReg.getReg();
+    // If we have to compose sub-register indices, bail out.
+    if ((SrcSubReg = MOInsertedReg.getSubReg()))
+      return false;
+
+    // We want to track something that is compatible with the related
+    // partial definition.
+    TrackSubReg = CopyLike.getOperand(CurrentSrcIdx + 1).getImm();
+
+    const MachineOperand &MODef = CopyLike.getOperand(0);
+    TrackReg = MODef.getReg();
+    // If we have to compose sub-registers, bail.
+    return MODef.getSubReg() == 0;
+  }
+
+  /// \brief See CopyRewriter::RewriteCurrentSource.
+  /// Rewrites the source selected by the last getNextRewritableSource call.
+  bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override {
+    // We cannot rewrite out of bound operands: valid operand indices are
+    // [0, getNumOperands()). Moreover, rewritable sources are at odd positions.
+    if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx >= CopyLike.getNumOperands())
+      return false;
+    MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
+    MO.setReg(NewReg);
+    MO.setSubReg(NewSubReg);
+    return true;
+  }
+};
+} // End namespace.
+
+/// \brief Get the appropriate CopyRewriter for \p MI.
+/// \return A pointer to a dynamically allocated CopyRewriter or nullptr
+/// if no rewriter works for \p MI.
+static CopyRewriter *getCopyRewriter(MachineInstr &MI,
+                                     const TargetInstrInfo &TII) {
+  switch (MI.getOpcode()) {
+  default:
+    return nullptr;
+  case TargetOpcode::COPY:
+    return new CopyRewriter(MI);
+  case TargetOpcode::INSERT_SUBREG:
+    return new InsertSubregRewriter(MI);
+  case TargetOpcode::EXTRACT_SUBREG:
+    return new ExtractSubregRewriter(MI, TII);
+  case TargetOpcode::REG_SEQUENCE:
+    return new RegSequenceRewriter(MI);
+  }
+  llvm_unreachable(nullptr);
+}
+
+/// \brief Optimize generic copy instructions to avoid cross
+/// register bank copy. The optimization looks through a chain of
+/// copies and tries to find a source that has a compatible register
+/// class.
+/// Two register classes are considered to be compatible if they share
+/// the same register bank.
+/// New copies issued by this optimization are register allocator
+/// friendly. This optimization does not remove any copy as it may
+/// overconstrain the register allocator, but replaces some operands
+/// when possible.
+/// \pre isCoalescableCopy(*MI) is true.
+/// \return True, when \p MI has been rewritten. False otherwise.
+bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr *MI) {
+  assert(MI && isCoalescableCopy(*MI) && "Invalid argument");
+  assert(MI->getDesc().getNumDefs() == 1 &&
+         "Coalescer can understand multiple defs?!");
+  const MachineOperand &MODef = MI->getOperand(0);
+  // Do not rewrite physical definitions.
+  if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg()))
+    return false;
+
+  bool Changed = false;
+  // Get the right rewriter for the current copy.
+  std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII));
+  // If none exists, bails out.
+  if (!CpyRewriter)
+    return false;
+  // Rewrite each rewritable source.
+  unsigned SrcReg, SrcSubReg, TrackReg, TrackSubReg;
+  while (CpyRewriter->getNextRewritableSource(SrcReg, SrcSubReg, TrackReg,
+                                              TrackSubReg)) {
+    unsigned NewSrc = TrackReg;
+    unsigned NewSubReg = TrackSubReg;
+    // Try to find a more suitable source.
+    // If we failed to do so, or get the actual source,
+    // move to the next source.
+    if (!findNextSource(NewSrc, NewSubReg) || SrcReg == NewSrc)
+      continue;
+    // Rewrite source.
+    if (CpyRewriter->RewriteCurrentSource(NewSrc, NewSubReg)) {
+      // We may have extended the live-range of NewSrc, account for that.
+      MRI->clearKillFlags(NewSrc);
+      Changed = true;
+    }
+  }
+  // TODO: We could have a clean-up method to tidy the instruction.
+  // E.g., v0 = INSERT_SUBREG v1, v1.sub0, sub0
+  // => v0 = COPY v1
+  // Currently we haven't seen motivating example for that and we
+  // want to avoid untested code.
+  NumRewrittenCopies += Changed == true;
+  return Changed;
+}
+
+/// \brief Optimize copy-like instructions to create
+/// register coalescer friendly instruction.
+/// The optimization tries to kill-off the \p MI by looking
+/// through a chain of copies to find a source that has a compatible
+/// register class.
+/// If such a source is found, it replaces \p MI by a generic COPY
+/// operation.
+/// \pre isUncoalescableCopy(*MI) is true.
+/// \return True, when \p MI has been optimized. In that case, \p MI has
+/// been removed from its parent.
+/// All COPY instructions created, are inserted in \p LocalMIs.
+bool PeepholeOptimizer::optimizeUncoalescableCopy(
+    MachineInstr *MI, SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
+  assert(MI && isUncoalescableCopy(*MI) && "Invalid argument");
+
+  // Check if we can rewrite all the values defined by this instruction.
+  SmallVector<
+      std::pair<TargetInstrInfo::RegSubRegPair, TargetInstrInfo::RegSubRegPair>,
+      4> RewritePairs;
+  for (const MachineOperand &MODef : MI->defs()) {
+    if (MODef.isDead())
+      // We can ignore those.
+      continue;
+
+    // If a physical register is here, this is probably for a good reason.
+    // Do not rewrite that.
+    if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg()))
+      return false;
+
+    // If we do not know how to rewrite this definition, there is no point
+    // in trying to kill this instruction.
+    TargetInstrInfo::RegSubRegPair Def(MODef.getReg(), MODef.getSubReg());
+    TargetInstrInfo::RegSubRegPair Src = Def;
+    if (!findNextSource(Src.Reg, Src.SubReg))
+      return false;
+    RewritePairs.push_back(std::make_pair(Def, Src));
+  }
+  // The change is possible for all defs, do it.
+  for (const auto &PairDefSrc : RewritePairs) {
+    const auto &Def = PairDefSrc.first;
+    const auto &Src = PairDefSrc.second;
+    // Rewrite the "copy" in a way the register coalescer understands.
+    assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) &&
+           "We do not rewrite physical registers");
+    const TargetRegisterClass *DefRC = MRI->getRegClass(Def.Reg);
+    unsigned NewVR = MRI->createVirtualRegister(DefRC);
+    MachineInstr *NewCopy = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                                    TII->get(TargetOpcode::COPY),
+                                    NewVR).addReg(Src.Reg, 0, Src.SubReg);
+    NewCopy->getOperand(0).setSubReg(Def.SubReg);
+    if (Def.SubReg)
+      NewCopy->getOperand(0).setIsUndef();
+    LocalMIs.insert(NewCopy);
+    MRI->replaceRegWith(Def.Reg, NewVR);
+    MRI->clearKillFlags(NewVR);
+    // We extended the lifetime of Src.
+    // Clear the kill flags to account for that.
+    MRI->clearKillFlags(Src.Reg);
+  }
+  // MI is now dead.
   MI->eraseFromParent();
-  ++NumCopiesBitcasts;
+  ++NumUncoalescableCopies;
   return true;
 }
 
@@ -673,8 +1061,8 @@
   if (DisablePeephole)
     return false;
 
-  TM  = &MF.getTarget();
-  TII = TM->getInstrInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   DT  = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
 
@@ -684,7 +1072,7 @@
     MachineBasicBlock *MBB = &*I;
 
     bool SeenMoveImm = false;
-    SmallPtrSet<MachineInstr*, 8> LocalMIs;
+    SmallPtrSet<MachineInstr*, 16> LocalMIs;
     SmallSet<unsigned, 4> ImmDefRegs;
     DenseMap<unsigned, MachineInstr*> ImmDefMIs;
     SmallSet<unsigned, 16> FoldAsLoadDefCandidates;
@@ -711,7 +1099,8 @@
       if (MI->mayStore() || MI->isCall())
         FoldAsLoadDefCandidates.clear();
 
-      if (((MI->isBitcast() || MI->isCopy()) && optimizeCopyOrBitcast(MI)) ||
+      if ((isUncoalescableCopy(*MI) &&
+           optimizeUncoalescableCopy(MI, LocalMIs)) ||
           (MI->isCompare() && optimizeCmpInstr(MI, MBB)) ||
           (MI->isSelect() && optimizeSelect(MI))) {
         // MI is deleted.
@@ -720,6 +1109,17 @@
         continue;
       }
 
+      if (MI->isConditionalBranch() && optimizeCondBranch(MI)) {
+        Changed = true;
+        continue;
+      }
+
+      if (isCoalescableCopy(*MI) && optimizeCoalescableCopy(MI)) {
+        // MI is just rewritten.
+        Changed = true;
+        continue;
+      }
+
       if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) {
         SeenMoveImm = true;
       } else {
@@ -781,24 +1181,25 @@
   return Changed;
 }
 
-bool ValueTracker::getNextSourceFromCopy(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromCopy(unsigned &SrcReg,
                                          unsigned &SrcSubReg) {
   assert(Def->isCopy() && "Invalid definition");
   // Copy instruction are supposed to be: Def = Src.
   // If someone breaks this assumption, bad things will happen everywhere.
-  assert(Def->getDesc().getNumOperands() == 2 && "Invalid number of operands");
+  assert(Def->getNumOperands() == 2 && "Invalid number of operands");
 
   if (Def->getOperand(DefIdx).getSubReg() != DefSubReg)
     // If we look for a different subreg, it means we want a subreg of src.
     // Bails as we do not support composing subreg yet.
     return false;
   // Otherwise, we want the whole source.
-  SrcIdx = 1;
-  SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+  const MachineOperand &Src = Def->getOperand(1);
+  SrcReg = Src.getReg();
+  SrcSubReg = Src.getSubReg();
   return true;
 }
 
-bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromBitcast(unsigned &SrcReg,
                                             unsigned &SrcSubReg) {
   assert(Def->isBitcast() && "Invalid definition");
 
@@ -814,7 +1215,7 @@
     // Bails as we do not support composing subreg yet.
     return false;
 
-  SrcIdx = Def->getDesc().getNumOperands();
+  unsigned SrcIdx = Def->getNumOperands();
   for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx;
        ++OpIdx) {
     const MachineOperand &MO = Def->getOperand(OpIdx);
@@ -826,13 +1227,16 @@
       return false;
     SrcIdx = OpIdx;
   }
-  SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+  const MachineOperand &Src = Def->getOperand(SrcIdx);
+  SrcReg = Src.getReg();
+  SrcSubReg = Src.getSubReg();
   return true;
 }
 
-bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromRegSequence(unsigned &SrcReg,
                                                 unsigned &SrcSubReg) {
-  assert(Def->isRegSequence() && "Invalid definition");
+  assert((Def->isRegSequence() || Def->isRegSequenceLike()) &&
+         "Invalid definition");
 
   if (Def->getOperand(DefIdx).getSubReg())
     // If we are composing subreg, bails out.
@@ -851,19 +1255,26 @@
     // turn that into an assertion.
     return false;
 
+  if (!TII)
+    // We could handle the REG_SEQUENCE here, but we do not want to
+    // duplicate the code from the generic TII.
+    return false;
+
+  SmallVector<TargetInstrInfo::RegSubRegPairAndIdx, 8> RegSeqInputRegs;
+  if (!TII->getRegSequenceInputs(*Def, DefIdx, RegSeqInputRegs))
+    return false;
+
   // We are looking at:
   // Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
   // Check if one of the operand defines the subreg we are interested in.
-  for (unsigned OpIdx = DefIdx + 1, EndOpIdx = Def->getNumOperands();
-       OpIdx != EndOpIdx; OpIdx += 2) {
-    const MachineOperand &MOSubIdx = Def->getOperand(OpIdx + 1);
-    assert(MOSubIdx.isImm() &&
-           "One of the subindex of the reg_sequence is not an immediate");
-    if (MOSubIdx.getImm() == DefSubReg) {
-      assert(Def->getOperand(OpIdx).isReg() &&
-             "One of the source of the reg_sequence is not a register");
-      SrcIdx = OpIdx;
-      SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+  for (auto &RegSeqInput : RegSeqInputRegs) {
+    if (RegSeqInput.SubIdx == DefSubReg) {
+      if (RegSeqInput.SubReg)
+        // Bails if we have to compose sub registers.
+        return false;
+
+      SrcReg = RegSeqInput.Reg;
+      SrcSubReg = RegSeqInput.SubReg;
       return true;
     }
   }
@@ -874,61 +1285,68 @@
   return false;
 }
 
-bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromInsertSubreg(unsigned &SrcReg,
                                                  unsigned &SrcSubReg) {
-  assert(Def->isInsertSubreg() && "Invalid definition");
+  assert((Def->isInsertSubreg() || Def->isInsertSubregLike()) &&
+         "Invalid definition");
+
   if (Def->getOperand(DefIdx).getSubReg())
     // If we are composing subreg, bails out.
     // Same remark as getNextSourceFromRegSequence.
     // I.e., this may be turned into an assert.
     return false;
 
+  if (!TII)
+    // We could handle the INSERT_SUBREG here, but we do not want to
+    // duplicate the code from the generic TII.
+    return false;
+
+  TargetInstrInfo::RegSubRegPair BaseReg;
+  TargetInstrInfo::RegSubRegPairAndIdx InsertedReg;
+  if (!TII->getInsertSubregInputs(*Def, DefIdx, BaseReg, InsertedReg))
+    return false;
+
   // We are looking at:
   // Def = INSERT_SUBREG v0, v1, sub1
   // There are two cases:
   // 1. DefSubReg == sub1, get v1.
   // 2. DefSubReg != sub1, the value may be available through v0.
 
-  // #1 Check if the inserted register matches the require sub index.
-  unsigned InsertedSubReg = Def->getOperand(3).getImm();
-  if (InsertedSubReg == DefSubReg) {
-    SrcIdx = 2;
-    SrcSubReg = Def->getOperand(SrcIdx).getSubReg();
+  // #1 Check if the inserted register matches the required sub index.
+  if (InsertedReg.SubIdx == DefSubReg) {
+    SrcReg = InsertedReg.Reg;
+    SrcSubReg = InsertedReg.SubReg;
     return true;
   }
   // #2 Otherwise, if the sub register we are looking for is not partial
   // defined by the inserted element, we can look through the main
   // register (v0).
-  // To check the overlapping we need a MRI and a TRI.
-  if (!MRI)
-    return false;
-
   const MachineOperand &MODef = Def->getOperand(DefIdx);
-  const MachineOperand &MOBase = Def->getOperand(1);
   // If the result register (Def) and the base register (v0) do not
   // have the same register class or if we have to compose
   // subregisters, bails out.
-  if (MRI->getRegClass(MODef.getReg()) != MRI->getRegClass(MOBase.getReg()) ||
-      MOBase.getSubReg())
+  if (MRI.getRegClass(MODef.getReg()) != MRI.getRegClass(BaseReg.Reg) ||
+      BaseReg.SubReg)
     return false;
 
-  // Get the TRI and check if inserted sub register overlaps with the
-  // sub register we are tracking.
-  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+  // Get the TRI and check if the inserted sub-register overlaps with the
+  // sub-register we are tracking.
+  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
   if (!TRI ||
       (TRI->getSubRegIndexLaneMask(DefSubReg) &
-       TRI->getSubRegIndexLaneMask(InsertedSubReg)) != 0)
+       TRI->getSubRegIndexLaneMask(InsertedReg.SubIdx)) != 0)
     return false;
   // At this point, the value is available in v0 via the same subreg
   // we used for Def.
-  SrcIdx = 1;
+  SrcReg = BaseReg.Reg;
   SrcSubReg = DefSubReg;
   return true;
 }
 
-bool ValueTracker::getNextSourceFromExtractSubreg(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromExtractSubreg(unsigned &SrcReg,
                                                   unsigned &SrcSubReg) {
-  assert(Def->isExtractSubreg() && "Invalid definition");
+  assert((Def->isExtractSubreg() ||
+          Def->isExtractSubregLike()) && "Invalid definition");
   // We are looking at:
   // Def = EXTRACT_SUBREG v0, sub0
 
@@ -937,17 +1355,26 @@
   if (DefSubReg)
     return false;
 
+  if (!TII)
+    // We could handle the EXTRACT_SUBREG here, but we do not want to
+    // duplicate the code from the generic TII.
+    return false;
+
+  TargetInstrInfo::RegSubRegPairAndIdx ExtractSubregInputReg;
+  if (!TII->getExtractSubregInputs(*Def, DefIdx, ExtractSubregInputReg))
+    return false;
+
   // Bails if we have to compose sub registers.
   // Likewise, if v0.subreg != 0, we would have to compose v0.subreg with sub0.
-  if (Def->getOperand(1).getSubReg())
+  if (ExtractSubregInputReg.SubReg)
     return false;
   // Otherwise, the value is available in the v0.sub0.
-  SrcIdx = 1;
-  SrcSubReg = Def->getOperand(2).getImm();
+  SrcReg = ExtractSubregInputReg.Reg;
+  SrcSubReg = ExtractSubregInputReg.SubIdx;
   return true;
 }
 
-bool ValueTracker::getNextSourceFromSubregToReg(unsigned &SrcIdx,
+bool ValueTracker::getNextSourceFromSubregToReg(unsigned &SrcReg,
                                                 unsigned &SrcSubReg) {
   assert(Def->isSubregToReg() && "Invalid definition");
   // We are looking at:
@@ -964,37 +1391,37 @@
   if (Def->getOperand(2).getSubReg())
     return false;
 
-  SrcIdx = 2;
+  SrcReg = Def->getOperand(2).getReg();
   SrcSubReg = Def->getOperand(3).getImm();
   return true;
 }
 
-bool ValueTracker::getNextSourceImpl(unsigned &SrcIdx, unsigned &SrcSubReg) {
+bool ValueTracker::getNextSourceImpl(unsigned &SrcReg, unsigned &SrcSubReg) {
   assert(Def && "This method needs a valid definition");
 
   assert(
       (DefIdx < Def->getDesc().getNumDefs() || Def->getDesc().isVariadic()) &&
       Def->getOperand(DefIdx).isDef() && "Invalid DefIdx");
   if (Def->isCopy())
-    return getNextSourceFromCopy(SrcIdx, SrcSubReg);
+    return getNextSourceFromCopy(SrcReg, SrcSubReg);
   if (Def->isBitcast())
-    return getNextSourceFromBitcast(SrcIdx, SrcSubReg);
+    return getNextSourceFromBitcast(SrcReg, SrcSubReg);
   // All the remaining cases involve "complex" instructions.
   // Bails if we did not ask for the advanced tracking.
   if (!UseAdvancedTracking)
     return false;
-  if (Def->isRegSequence())
-    return getNextSourceFromRegSequence(SrcIdx, SrcSubReg);
-  if (Def->isInsertSubreg())
-    return getNextSourceFromInsertSubreg(SrcIdx, SrcSubReg);
-  if (Def->isExtractSubreg())
-    return getNextSourceFromExtractSubreg(SrcIdx, SrcSubReg);
+  if (Def->isRegSequence() || Def->isRegSequenceLike())
+    return getNextSourceFromRegSequence(SrcReg, SrcSubReg);
+  if (Def->isInsertSubreg() || Def->isInsertSubregLike())
+    return getNextSourceFromInsertSubreg(SrcReg, SrcSubReg);
+  if (Def->isExtractSubreg() || Def->isExtractSubregLike())
+    return getNextSourceFromExtractSubreg(SrcReg, SrcSubReg);
   if (Def->isSubregToReg())
-    return getNextSourceFromSubregToReg(SrcIdx, SrcSubReg);
+    return getNextSourceFromSubregToReg(SrcReg, SrcSubReg);
   return false;
 }
 
-const MachineInstr *ValueTracker::getNextSource(unsigned &SrcIdx,
+const MachineInstr *ValueTracker::getNextSource(unsigned &SrcReg,
                                                 unsigned &SrcSubReg) {
   // If we reach a point where we cannot move up in the use-def chain,
   // there is nothing we can get.
@@ -1003,20 +1430,18 @@
 
   const MachineInstr *PrevDef = nullptr;
   // Try to find the next source.
-  if (getNextSourceImpl(SrcIdx, SrcSubReg)) {
+  if (getNextSourceImpl(SrcReg, SrcSubReg)) {
     // Update definition, definition index, and subregister for the
     // next call of getNextSource.
-    const MachineOperand &MO = Def->getOperand(SrcIdx);
-    assert(MO.isReg() && !MO.isDef() && "Source is invalid");
     // Update the current register.
-    Reg = MO.getReg();
+    Reg = SrcReg;
     // Update the return value before moving up in the use-def chain.
     PrevDef = Def;
     // If we can still move up in the use-def chain, move to the next
     // defintion.
     if (!TargetRegisterInfo::isPhysicalRegister(Reg)) {
-      Def = MRI->getVRegDef(Reg);
-      DefIdx = MRI->def_begin(Reg).getOperandNo();
+      Def = MRI.getVRegDef(Reg);
+      DefIdx = MRI.def_begin(Reg).getOperandNo();
       DefSubReg = SrcSubReg;
       return PrevDef;
     }

diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index db3933e..89e1d11 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp

@@ -41,7 +41,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
@@ -98,6 +97,11 @@
     }
 
     bool runOnMachineFunction(MachineFunction &Fn) override;
+
+    bool enablePostRAScheduler(
+        const TargetSubtargetInfo &ST, CodeGenOpt::Level OptLevel,
+        TargetSubtargetInfo::AntiDepBreakMode &Mode,
+        TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const;
   };
   char PostRAScheduler::ID = 0;
 
@@ -132,10 +136,10 @@
 
   public:
     SchedulePostRATDList(
-      MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
-      AliasAnalysis *AA, const RegisterClassInfo&,
-      TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
-      SmallVectorImpl<const TargetRegisterClass*> &CriticalPathRCs);
+        MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA,
+        const RegisterClassInfo &,
+        TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
+        SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs);
 
     ~SchedulePostRATDList();
 
@@ -188,16 +192,17 @@
                 "Post RA top-down list latency scheduler", false, false)
 
 SchedulePostRATDList::SchedulePostRATDList(
-  MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
-  AliasAnalysis *AA, const RegisterClassInfo &RCI,
-  TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
-  SmallVectorImpl<const TargetRegisterClass*> &CriticalPathRCs)
-  : ScheduleDAGInstrs(MF, MLI, MDT, /*IsPostRA=*/true), AA(AA), EndIndex(0) {
+    MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA,
+    const RegisterClassInfo &RCI,
+    TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
+    SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs)
+    : ScheduleDAGInstrs(MF, &MLI, /*IsPostRA=*/true), AA(AA), EndIndex(0) {
 
-  const TargetMachine &TM = MF.getTarget();
-  const InstrItineraryData *InstrItins = TM.getInstrItineraryData();
+  const InstrItineraryData *InstrItins =
+      MF.getSubtarget().getInstrItineraryData();
   HazardRec =
-    TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins, this);
+      MF.getSubtarget().getInstrInfo()->CreateTargetPostRAHazardRecognizer(
+          InstrItins, this);
 
   assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE ||
           MRI.tracksLiveness()) &&
@@ -245,13 +250,23 @@
 }
 #endif
 
+bool PostRAScheduler::enablePostRAScheduler(
+    const TargetSubtargetInfo &ST,
+    CodeGenOpt::Level OptLevel,
+    TargetSubtargetInfo::AntiDepBreakMode &Mode,
+    TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const {
+  Mode = ST.getAntiDepBreakMode();
+  ST.getCriticalPathRCs(CriticalPathRCs);
+  return ST.enablePostMachineScheduler() &&
+         OptLevel >= ST.getOptLevelToEnablePostRAScheduler();
+}
+
 bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
   if (skipOptnoneFunction(*Fn.getFunction()))
     return false;
 
-  TII = Fn.getTarget().getInstrInfo();
+  TII = Fn.getSubtarget().getInstrInfo();
   MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
-  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
   AliasAnalysis *AA = &getAnalysis<AliasAnalysis>();
   TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
 
@@ -267,9 +282,10 @@
   } else {
     // Check that post-RA scheduling is enabled for this target.
     // This may upgrade the AntiDepMode.
-    const TargetSubtargetInfo &ST = Fn.getTarget().getSubtarget<TargetSubtargetInfo>();
-    if (!ST.enablePostRAScheduler(PassConfig->getOptLevel(), AntiDepMode,
-                                  CriticalPathRCs))
+    const TargetSubtargetInfo &ST =
+        Fn.getTarget().getSubtarget<TargetSubtargetInfo>();
+    if (!enablePostRAScheduler(ST, PassConfig->getOptLevel(),
+                               AntiDepMode, CriticalPathRCs))
       return false;
   }
 
@@ -284,7 +300,7 @@
 
   DEBUG(dbgs() << "PostRAScheduler\n");
 
-  SchedulePostRATDList Scheduler(Fn, MLI, MDT, AA, RegClassInfo, AntiDepMode,
+  SchedulePostRATDList Scheduler(Fn, MLI, AA, RegClassInfo, AntiDepMode,
                                  CriticalPathRCs);
 
   // Loop over all of the basic blocks
@@ -543,10 +559,10 @@
       if (HT == ScheduleHazardRecognizer::NoHazard) {
         if (HazardRec->ShouldPreferAnother(CurSUnit)) {
           if (!NotPreferredSUnit) {
-	    // If this is the first non-preferred node for this cycle, then
-	    // record it and continue searching for a preferred node. If this
-	    // is not the first non-preferred node, then treat it as though
-	    // there had been a hazard.
+            // If this is the first non-preferred node for this cycle, then
+            // record it and continue searching for a preferred node. If this
+            // is not the first non-preferred node, then treat it as though
+            // there had been a hazard.
             NotPreferredSUnit = CurSUnit;
             continue;
           }

diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index 3129927..b153800 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp

@@ -16,6 +16,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -138,8 +139,8 @@
 
   bool Changed = false;
 
-  TII = MF.getTarget().getInstrInfo();
-  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   assert(MRI->isSSA() && "ProcessImplicitDefs only works on SSA form.");
   assert(WorkList.empty() && "Inconsistent worklist state");

diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index b98d210..06530b9 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp

@@ -41,6 +41,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <climits>
 
 using namespace llvm;
@@ -110,8 +111,8 @@
 ///
 bool PEI::runOnMachineFunction(MachineFunction &Fn) {
   const Function* F = Fn.getFunction();
-  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
-  const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering();
+  const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+  const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
 
   assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs");
 
@@ -185,8 +186,8 @@
 /// variables for the function's frame information and eliminate call frame
 /// pseudo instructions.
 void PEI::calculateCallsInformation(MachineFunction &Fn) {
-  const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
-  const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering();
+  const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+  const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
   MachineFrameInfo *MFI = Fn.getFrameInfo();
 
   unsigned MaxCallFrameSize = 0;
@@ -239,8 +240,8 @@
 /// calculateCalleeSavedRegisters - Scan the function for modified callee saved
 /// registers.
 void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
-  const TargetRegisterInfo *RegInfo = F.getTarget().getRegisterInfo();
-  const TargetFrameLowering *TFI = F.getTarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
+  const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
   MachineFrameInfo *MFI = F.getFrameInfo();
 
   // Get the callee saved register list...
@@ -337,9 +338,9 @@
   if (CSI.empty())
     return;
 
-  const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
-  const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering();
-  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+  const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+  const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+  const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
   MachineBasicBlock::iterator I;
 
   // Spill using target interface.
@@ -445,7 +446,7 @@
 /// abstract stack objects.
 ///
 void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
-  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
   StackProtector *SP = &getAnalysis<StackProtector>();
 
   bool StackGrowsDown =
@@ -515,7 +516,7 @@
   // Make sure the special register scavenging spill slot is closest to the
   // incoming stack pointer if a frame pointer is required and is closer
   // to the incoming rather than the final stack pointer.
-  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
   bool EarlyScavengingSlots = (TFI.hasFP(Fn) &&
                                TFI.isFPCloseToIncomingSP() &&
                                RegInfo->useFPForScavengingIndex(Fn) &&
@@ -670,7 +671,7 @@
 /// prolog and epilog code to the function.
 ///
 void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
-  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
 
   // Add prologue to the function...
   TFI.emitPrologue(Fn);
@@ -710,8 +711,7 @@
   SmallPtrSet<MachineBasicBlock*, 8> Reachable;
 
   // Iterate over the reachable blocks in DFS order.
-  for (df_ext_iterator<MachineFunction*, SmallPtrSet<MachineBasicBlock*, 8> >
-       DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable);
+  for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable);
        DFI != DFE; ++DFI) {
     int SPAdj = 0;
     // Check the exit state of the DFS stack predecessor.
@@ -738,11 +738,11 @@
 
 void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
                               int &SPAdj) {
-  const TargetMachine &TM = Fn.getTarget();
-  assert(TM.getRegisterInfo() && "TM::getRegisterInfo() must be implemented!");
-  const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
-  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
-  const TargetFrameLowering *TFI = TM.getFrameLowering();
+  assert(Fn.getSubtarget().getRegisterInfo() &&
+         "getRegisterInfo() must be implemented!");
+  const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+  const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
+  const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
   bool StackGrowsDown =
     TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
   int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
@@ -837,7 +837,8 @@
 /// FIXME: Iterating over the instruction stream is unnecessary. We can simply
 /// iterate over the vreg use list, which at this point only contains machine
 /// operands for which eliminateFrameIndex need a new scratch reg.
-void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
+void
+PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
   // Run through the instructions and find any virtual registers.
   for (MachineFunction::iterator BB = Fn.begin(),
        E = Fn.end(); BB != E; ++BB) {
@@ -888,12 +889,16 @@
           // Replace this reference to the virtual register with the
           // scratch register.
           assert (ScratchReg && "Missing scratch register!");
+          MachineRegisterInfo &MRI = Fn.getRegInfo();
           Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
+          
+          // Make sure MRI now accounts for this register as used.
+          MRI.setPhysRegUsed(ScratchReg);
 
           // Because this instruction was processed by the RS before this
           // register was allocated, make sure that the RS now records the
           // register as being used.
-          RS->setUsed(ScratchReg);
+          RS->setRegUsed(ScratchReg);
         }
       }
 

diff --git a/lib/CodeGen/PrologEpilogInserter.h b/lib/CodeGen/PrologEpilogInserter.h
index 5a6d39a..f88b8ef 100644
--- a/lib/CodeGen/PrologEpilogInserter.h
+++ b/lib/CodeGen/PrologEpilogInserter.h

@@ -16,8 +16,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_PEI_H
-#define LLVM_CODEGEN_PEI_H
+#ifndef LLVM_LIB_CODEGEN_PROLOGEPILOGINSERTER_H
+#define LLVM_LIB_CODEGEN_PROLOGEPILOGINSERTER_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SparseBitVector.h"

diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp
index 12b2c90..b1c341d 100644
--- a/lib/CodeGen/PseudoSourceValue.cpp
+++ b/lib/CodeGen/PseudoSourceValue.cpp

@@ -107,13 +107,9 @@
 }
 
 bool FixedStackPseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const {
-  // Negative frame indices are used for special things that don't
-  // appear in LLVM IR. Non-negative indices may be used for things
-  // like static allocas.
   if (!MFI)
-    return FI >= 0;
-  // Spill slots should not alias others.
-  return !MFI->isFixedObjectIndex(FI) && !MFI->isSpillSlotObjectIndex(FI);
+    return true;
+  return MFI->isAliasedObjectIndex(FI);
 }
 
 bool FixedStackPseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const {

diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp
index 894aee7..122afd1 100644
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp

@@ -21,7 +21,6 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #ifndef NDEBUG
 #include "llvm/ADT/SparseBitVector.h"
@@ -102,7 +101,7 @@
     // register if possible and populate a list of new live intervals that
     // result from splitting.
     DEBUG(dbgs() << "\nselectOrSplit "
-          << MRI->getRegClass(VirtReg->reg)->getName()
+          << TRI->getRegClassName(MRI->getRegClass(VirtReg->reg))
           << ':' << *VirtReg << " w=" << VirtReg->weight << '\n');
     typedef SmallVector<unsigned, 4> VirtRegVec;
     VirtRegVec SplitVRegs;

diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index b333c36..bbd79cd 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h

@@ -34,8 +34,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_REGALLOCBASE
-#define LLVM_CODEGEN_REGALLOCBASE
+#ifndef LLVM_LIB_CODEGEN_REGALLOCBASE_H
+#define LLVM_LIB_CODEGEN_REGALLOCBASE_H
 
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
@@ -106,4 +106,4 @@
 
 } // end namespace llvm
 
-#endif // !defined(LLVM_CODEGEN_REGALLOCBASE)
+#endif

diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index b722098..0090332 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp

@@ -33,7 +33,6 @@
 #include "llvm/PassAnalysisSupport.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <cstdlib>
 #include <queue>
@@ -157,7 +156,7 @@
 }
 
 void RABasic::releaseMemory() {
-  SpillerInstance.reset(nullptr);
+  SpillerInstance.reset();
 }
 
 

diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 97b9f76..8fc10b4 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp

@@ -33,7 +33,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -53,7 +53,6 @@
     RAFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1),
                isBulkSpilling(false) {}
   private:
-    const TargetMachine *TM;
     MachineFunction *MF;
     MachineRegisterInfo *MRI;
     const TargetRegisterInfo *TRI;
@@ -298,7 +297,8 @@
       LiveDbgValueMap[LRI->VirtReg];
     for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) {
       MachineInstr *DBG = LRIDbgValues[li];
-      const MDNode *MDPtr = DBG->getOperand(2).getMetadata();
+      const MDNode *Var = DBG->getDebugVariable();
+      const MDNode *Expr = DBG->getDebugExpression();
       bool IsIndirect = DBG->isIndirectDebugValue();
       uint64_t Offset = IsIndirect ? DBG->getOperand(1).getImm() : 0;
       DebugLoc DL;
@@ -308,10 +308,13 @@
         DL = (--EI)->getDebugLoc();
       } else
         DL = MI->getDebugLoc();
-      MachineBasicBlock *MBB = DBG->getParent();
       MachineInstr *NewDV =
           BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::DBG_VALUE))
-              .addFrameIndex(FI).addImm(Offset).addMetadata(MDPtr);
+              .addFrameIndex(FI)
+              .addImm(Offset)
+              .addMetadata(Var)
+              .addMetadata(Expr);
+      assert(NewDV->getParent() == MBB && "dangling parent pointer");
       (void)NewDV;
       DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV);
     }
@@ -545,7 +548,7 @@
   }
 
   DEBUG(dbgs() << "Allocating " << PrintReg(VirtReg) << " from "
-               << RC->getName() << "\n");
+               << TRI->getRegClassName(RC) << "\n");
 
   unsigned BestReg = 0, BestCost = spillImpossible;
   for (ArrayRef<MCPhysReg>::iterator I = AO.begin(), E = AO.end(); I != E; ++I){
@@ -705,7 +708,7 @@
       continue;
     if (MO.isEarlyClobber() || MI->isRegTiedToDefOperand(i) ||
         (MO.getSubReg() && MI->readsVirtualRegister(Reg))) {
-      if (ThroughRegs.insert(Reg))
+      if (ThroughRegs.insert(Reg).second)
         DEBUG(dbgs() << ' ' << PrintReg(Reg));
     }
   }
@@ -862,13 +865,16 @@
               // Modify DBG_VALUE now that the value is in a spill slot.
               bool IsIndirect = MI->isIndirectDebugValue();
               uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
-              const MDNode *MDPtr =
-                MI->getOperand(MI->getNumOperands()-1).getMetadata();
+              const MDNode *Var = MI->getDebugVariable();
+              const MDNode *Expr = MI->getDebugExpression();
               DebugLoc DL = MI->getDebugLoc();
               MachineBasicBlock *MBB = MI->getParent();
               MachineInstr *NewDV = BuildMI(*MBB, MBB->erase(MI), DL,
                                             TII->get(TargetOpcode::DBG_VALUE))
-                  .addFrameIndex(SS).addImm(Offset).addMetadata(MDPtr);
+                                        .addFrameIndex(SS)
+                                        .addImm(Offset)
+                                        .addMetadata(Var)
+                                        .addMetadata(Expr);
               DEBUG(dbgs() << "Modifying debug info due to spill:"
                            << "\t" << *NewDV);
               // Scan NewDV operands from the beginning.
@@ -1070,9 +1076,8 @@
                << "********** Function: " << Fn.getName() << '\n');
   MF = &Fn;
   MRI = &MF->getRegInfo();
-  TM = &Fn.getTarget();
-  TRI = TM->getRegisterInfo();
-  TII = TM->getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
+  TII = MF->getSubtarget().getInstrInfo();
   MRI->freezeReservedRegs(Fn);
   RegClassInfo.runOnMachineFunction(Fn);
   UsedInInstr.clear();
@@ -1093,9 +1098,8 @@
   }
 
   // Add the clobber lists for all the instructions we skipped earlier.
-  for (SmallPtrSet<const MCInstrDesc*, 4>::const_iterator
-       I = SkippedInstrs.begin(), E = SkippedInstrs.end(); I != E; ++I)
-    if (const uint16_t *Defs = (*I)->getImplicitDefs())
+  for (const MCInstrDesc *Desc : SkippedInstrs)
+    if (const uint16_t *Defs = Desc->getImplicitDefs())
       while (*Defs)
         MRI->setPhysRegUsed(*Defs++);
 

diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 901b993..8ef5dcd 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp

@@ -486,7 +486,7 @@
 }
 
 void RAGreedy::releaseMemory() {
-  SpillerInstance.reset(nullptr);
+  SpillerInstance.reset();
   ExtraRegInfo.clear();
   GlobalCand.clear();
 }
@@ -514,7 +514,7 @@
     // Giant live ranges fall back to the global assignment heuristic, which
     // prevents excessive spilling in pathological cases.
     bool ReverseLocal = TRI->reverseLocalAssignment();
-    bool ForceGlobal = !ReverseLocal && TRI->mayOverrideLocalAssignment() &&
+    bool ForceGlobal = !ReverseLocal &&
       (Size / SlotIndex::InstrDist) > (2 * MRI->getRegClass(Reg)->getNumRegs());
 
     if (ExtraRegInfo[Reg].Stage == RS_Assign && !ForceGlobal && !LI->empty() &&
@@ -817,7 +817,7 @@
     const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg);
     unsigned MinCost = RegClassInfo.getMinCost(RC);
     if (MinCost >= CostPerUseLimit) {
-      DEBUG(dbgs() << RC->getName() << " minimum cost = " << MinCost
+      DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = " << MinCost
                    << ", no cheaper registers to be found.\n");
       return 0;
     }
@@ -967,14 +967,12 @@
       BCS[B].Exit = SpillPlacement::PrefSpill;
 
     if (++B == GroupSize) {
-      ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B);
-      SpillPlacer->addConstraints(Array);
+      SpillPlacer->addConstraints(makeArrayRef(BCS, B));
       B = 0;
     }
   }
 
-  ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B);
-  SpillPlacer->addConstraints(Array);
+  SpillPlacer->addConstraints(makeArrayRef(BCS, B));
   SpillPlacer->addLinks(makeArrayRef(TBS, T));
 }
 
@@ -1013,7 +1011,7 @@
 
     // Compute through constraints from the interference, or assume that all
     // through blocks prefer spilling when forming compact regions.
-    ArrayRef<unsigned> NewBlocks = makeArrayRef(ActiveBlocks).slice(AddedTo);
+    auto NewBlocks = makeArrayRef(ActiveBlocks).slice(AddedTo);
     if (Cand.PhysReg)
       addThroughConstraints(Cand.Intf, NewBlocks);
     else
@@ -1791,9 +1789,11 @@
         // instructions.
         //
         // Try to guess the size of the new interval.
-        const float EstWeight = normalizeSpillWeight(blockFreq * (NewGaps + 1),
-                                 Uses[SplitBefore].distance(Uses[SplitAfter]) +
-                                 (LiveBefore + LiveAfter)*SlotIndex::InstrDist);
+        const float EstWeight = normalizeSpillWeight(
+            blockFreq * (NewGaps + 1),
+            Uses[SplitBefore].distance(Uses[SplitAfter]) +
+                (LiveBefore + LiveAfter) * SlotIndex::InstrDist,
+            1);
         // Would this split be possible to allocate?
         // Never allocate all gaps, we wouldn't be making progress.
         DEBUG(dbgs() << " w=" << EstWeight);
@@ -2319,13 +2319,13 @@
                << "********** Function: " << mf.getName() << '\n');
 
   MF = &mf;
-  const TargetMachine &TM = MF->getTarget();
-  TRI = TM.getRegisterInfo();
-  TII = TM.getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
+  TII = MF->getSubtarget().getInstrInfo();
   RCI.runOnMachineFunction(mf);
 
   EnableLocalReassign = EnableLocalReassignment ||
-    TM.getSubtargetImpl()->enableRALocalReassignment(TM.getOptLevel());
+                        MF->getSubtarget().enableRALocalReassignment(
+                            MF->getTarget().getOptLevel());
 
   if (VerifyEnabled)
     MF->verify(this, "Before greedy register allocator");

diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index b8d2325..eb7e5633 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp

@@ -49,9 +49,10 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <limits>
 #include <memory>
+#include <queue>
 #include <set>
 #include <sstream>
 #include <vector>
@@ -61,17 +62,17 @@
 #define DEBUG_TYPE "regalloc"
 
 static RegisterRegAlloc
-registerPBQPRepAlloc("pbqp", "PBQP register allocator",
+RegisterPBQPRepAlloc("pbqp", "PBQP register allocator",
                        createDefaultPBQPRegisterAllocator);
 
 static cl::opt<bool>
-pbqpCoalescing("pbqp-coalescing",
+PBQPCoalescing("pbqp-coalescing",
                 cl::desc("Attempt coalescing during PBQP register allocation."),
                 cl::init(false), cl::Hidden);
 
 #ifndef NDEBUG
 static cl::opt<bool>
-pbqpDumpGraphs("pbqp-dump-graphs",
+PBQPDumpGraphs("pbqp-dump-graphs",
                cl::desc("Dump graphs for each function/round in the compilation unit."),
                cl::init(false), cl::Hidden);
 #endif
@@ -88,8 +89,8 @@
   static char ID;
 
   /// Construct a PBQP register allocator.
-  RegAllocPBQP(std::unique_ptr<PBQPBuilder> &b, char *cPassID=nullptr)
-      : MachineFunctionPass(ID), builder(b.release()), customPassID(cPassID) {
+  RegAllocPBQP(char *cPassID = nullptr)
+      : MachineFunctionPass(ID), customPassID(cPassID) {
     initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
     initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
     initializeLiveStacksPass(*PassRegistry::getPassRegistry());
@@ -117,301 +118,320 @@
   typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap;
   typedef std::set<unsigned> RegSet;
 
-  std::unique_ptr<PBQPBuilder> builder;
-
   char *customPassID;
 
-  MachineFunction *mf;
-  const TargetMachine *tm;
-  const TargetRegisterInfo *tri;
-  const TargetInstrInfo *tii;
-  MachineRegisterInfo *mri;
-  const MachineBlockFrequencyInfo *mbfi;
-
-  std::unique_ptr<Spiller> spiller;
-  LiveIntervals *lis;
-  LiveStacks *lss;
-  VirtRegMap *vrm;
-
-  RegSet vregsToAlloc, emptyIntervalVRegs;
+  RegSet VRegsToAlloc, EmptyIntervalVRegs;
 
   /// \brief Finds the initial set of vreg intervals to allocate.
-  void findVRegIntervalsToAlloc();
+  void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
+
+  /// \brief Constructs an initial graph.
+  void initializeGraph(PBQPRAGraph &G);
 
   /// \brief Given a solved PBQP problem maps this solution back to a register
   /// assignment.
-  bool mapPBQPToRegAlloc(const PBQPRAProblem &problem,
-                         const PBQP::Solution &solution);
+  bool mapPBQPToRegAlloc(const PBQPRAGraph &G,
+                         const PBQP::Solution &Solution,
+                         VirtRegMap &VRM,
+                         Spiller &VRegSpiller);
 
   /// \brief Postprocessing before final spilling. Sets basic block "live in"
   /// variables.
-  void finalizeAlloc() const;
+  void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS,
+                     VirtRegMap &VRM) const;
 
 };
 
 char RegAllocPBQP::ID = 0;
 
+/// @brief Set spill costs for each node in the PBQP reg-alloc graph.
+class SpillCosts : public PBQPRAConstraint {
+public:
+  void apply(PBQPRAGraph &G) override {
+    LiveIntervals &LIS = G.getMetadata().LIS;
+
+    // A minimum spill cost, so that register constraints can be set
+    // without normalization in the [0.0, MinSpillCost) interval.
+    const PBQP::PBQPNum MinSpillCost = 10.0;
+
+    for (auto NId : G.nodeIds()) {
+      PBQP::PBQPNum SpillCost =
+        LIS.getInterval(G.getNodeMetadata(NId).getVReg()).weight;
+      if (SpillCost == 0.0)
+        SpillCost = std::numeric_limits<PBQP::PBQPNum>::min();
+      else
+        SpillCost += MinSpillCost;
+      PBQPRAGraph::RawVector NodeCosts(G.getNodeCosts(NId));
+      NodeCosts[PBQP::RegAlloc::getSpillOptionIdx()] = SpillCost;
+      G.setNodeCosts(NId, std::move(NodeCosts));
+    }
+  }
+};
+
+/// @brief Add interference edges between overlapping vregs.
+class Interference : public PBQPRAConstraint {
+private:
+
+private:
+
+  typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;
+  typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IMatrixKey;
+  typedef DenseMap<IMatrixKey, PBQPRAGraph::MatrixPtr> IMatrixCache;
+
+  // Holds (Interval, CurrentSegmentID, and NodeId). The first two are required
+  // for the fast interference graph construction algorithm. The last is there
+  // to save us from looking up node ids via the VRegToNode map in the graph
+  // metadata.
+  typedef std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>
+    IntervalInfo;
+
+  static SlotIndex getStartPoint(const IntervalInfo &I) {
+    return std::get<0>(I)->segments[std::get<1>(I)].start;
+  }
+
+  static SlotIndex getEndPoint(const IntervalInfo &I) {
+    return std::get<0>(I)->segments[std::get<1>(I)].end;
+  }
+
+  static PBQP::GraphBase::NodeId getNodeId(const IntervalInfo &I) {
+    return std::get<2>(I);
+  }
+
+  static bool lowestStartPoint(const IntervalInfo &I1,
+                               const IntervalInfo &I2) {
+    // Condition reversed because priority queue has the *highest* element at
+    // the front, rather than the lowest.
+    return getStartPoint(I1) > getStartPoint(I2);
+  }
+
+  static bool lowestEndPoint(const IntervalInfo &I1,
+                             const IntervalInfo &I2) {
+    SlotIndex E1 = getEndPoint(I1);
+    SlotIndex E2 = getEndPoint(I2);
+
+    if (E1 < E2)
+      return true;
+
+    if (E1 > E2)
+      return false;
+
+    // If two intervals end at the same point, we need a way to break the tie or
+    // the set will assume they're actually equal and refuse to insert a
+    // "duplicate". Just compare the vregs - fast and guaranteed unique.
+    return std::get<0>(I1)->reg < std::get<0>(I2)->reg;
+  }
+
+  static bool isAtLastSegment(const IntervalInfo &I) {
+    return std::get<1>(I) == std::get<0>(I)->size() - 1;
+  }
+
+  static IntervalInfo nextSegment(const IntervalInfo &I) {
+    return std::make_tuple(std::get<0>(I), std::get<1>(I) + 1, std::get<2>(I));
+  }
+
+public:
+
+  void apply(PBQPRAGraph &G) override {
+    // The following is loosely based on the linear scan algorithm introduced in
+    // "Linear Scan Register Allocation" by Poletto and Sarkar. This version
+    // isn't linear, because the size of the active set isn't bound by the
+    // number of registers, but rather the size of the largest clique in the
+    // graph. Still, we expect this to be better than N^2.
+    LiveIntervals &LIS = G.getMetadata().LIS;
+
+    // Interference matrices are incredibly regular - they're only a function of
+    // the allowed sets, so we cache them to avoid the overhead of constructing
+    // and uniquing them.
+    IMatrixCache C;
+
+    typedef std::set<IntervalInfo, decltype(&lowestEndPoint)> IntervalSet;
+    typedef std::priority_queue<IntervalInfo, std::vector<IntervalInfo>,
+                                decltype(&lowestStartPoint)> IntervalQueue;
+    IntervalSet Active(lowestEndPoint);
+    IntervalQueue Inactive(lowestStartPoint);
+
+    // Start by building the inactive set.
+    for (auto NId : G.nodeIds()) {
+      unsigned VReg = G.getNodeMetadata(NId).getVReg();
+      LiveInterval &LI = LIS.getInterval(VReg);
+      assert(!LI.empty() && "PBQP graph contains node for empty interval");
+      Inactive.push(std::make_tuple(&LI, 0, NId));
+    }
+
+    while (!Inactive.empty()) {
+      // Tentatively grab the "next" interval - this choice may be overridden
+      // below.
+      IntervalInfo Cur = Inactive.top();
+
+      // Retire any active intervals that end before Cur starts.
+      IntervalSet::iterator RetireItr = Active.begin();
+      while (RetireItr != Active.end() &&
+             (getEndPoint(*RetireItr) <= getStartPoint(Cur))) {
+        // If this interval has subsequent segments, add the next one to the
+        // inactive list.
+        if (!isAtLastSegment(*RetireItr))
+          Inactive.push(nextSegment(*RetireItr));
+
+        ++RetireItr;
+      }
+      Active.erase(Active.begin(), RetireItr);
+
+      // One of the newly retired segments may actually start before the
+      // Cur segment, so re-grab the front of the inactive list.
+      Cur = Inactive.top();
+      Inactive.pop();
+
+      // At this point we know that Cur overlaps all active intervals. Add the
+      // interference edges.
+      PBQP::GraphBase::NodeId NId = getNodeId(Cur);
+      for (const auto &A : Active) {
+        PBQP::GraphBase::NodeId MId = getNodeId(A);
+
+        // Check that we haven't already added this edge
+        // FIXME: findEdge is expensive in the worst case (O(max_clique(G))).
+        //        It might be better to replace this with a local bit-matrix.
+        if (G.findEdge(NId, MId) != PBQPRAGraph::invalidEdgeId())
+          continue;
+
+        // This is a new edge - add it to the graph.
+        createInterferenceEdge(G, NId, MId, C);
+      }
+
+      // Finally, add Cur to the Active set.
+      Active.insert(Cur);
+    }
+  }
+
+private:
+
+  void createInterferenceEdge(PBQPRAGraph &G, PBQPRAGraph::NodeId NId,
+                              PBQPRAGraph::NodeId MId, IMatrixCache &C) {
+
+    const TargetRegisterInfo &TRI =
+      *G.getMetadata().MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+
+    const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs();
+    const auto &MRegs = G.getNodeMetadata(MId).getAllowedRegs();
+
+    // Try looking the edge costs up in the IMatrixCache first.
+    IMatrixKey K(&NRegs, &MRegs);
+    IMatrixCache::iterator I = C.find(K);
+    if (I != C.end()) {
+      G.addEdgeBypassingCostAllocator(NId, MId, I->second);
+      return;
+    }
+
+    PBQPRAGraph::RawMatrix M(NRegs.size() + 1, MRegs.size() + 1, 0);
+    for (unsigned I = 0; I != NRegs.size(); ++I) {
+      unsigned PRegN = NRegs[I];
+      for (unsigned J = 0; J != MRegs.size(); ++J) {
+        unsigned PRegM = MRegs[J];
+        if (TRI.regsOverlap(PRegN, PRegM))
+          M[I + 1][J + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+      }
+    }
+
+    PBQPRAGraph::EdgeId EId = G.addEdge(NId, MId, std::move(M));
+    C[K] = G.getEdgeCostsPtr(EId);
+  }
+};
+
+
+class Coalescing : public PBQPRAConstraint {
+public:
+  void apply(PBQPRAGraph &G) override {
+    MachineFunction &MF = G.getMetadata().MF;
+    MachineBlockFrequencyInfo &MBFI = G.getMetadata().MBFI;
+    CoalescerPair CP(*MF.getTarget().getSubtargetImpl()->getRegisterInfo());
+
+    // Scan the machine function and add a coalescing cost whenever CoalescerPair
+    // gives the Ok.
+    for (const auto &MBB : MF) {
+      for (const auto &MI : MBB) {
+
+        // Skip not-coalescable or already coalesced copies.
+        if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg())
+          continue;
+
+        unsigned DstReg = CP.getDstReg();
+        unsigned SrcReg = CP.getSrcReg();
+
+        const float Scale = 1.0f / MBFI.getEntryFreq();
+        PBQP::PBQPNum CBenefit = MBFI.getBlockFreq(&MBB).getFrequency() * Scale;
+
+        if (CP.isPhys()) {
+          if (!MF.getRegInfo().isAllocatable(DstReg))
+            continue;
+
+          PBQPRAGraph::NodeId NId = G.getMetadata().getNodeIdForVReg(SrcReg);
+
+          const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed =
+            G.getNodeMetadata(NId).getAllowedRegs();
+
+          unsigned PRegOpt = 0;
+          while (PRegOpt < Allowed.size() && Allowed[PRegOpt] != DstReg)
+            ++PRegOpt;
+
+          if (PRegOpt < Allowed.size()) {
+            PBQPRAGraph::RawVector NewCosts(G.getNodeCosts(NId));
+            NewCosts[PRegOpt + 1] -= CBenefit;
+            G.setNodeCosts(NId, std::move(NewCosts));
+          }
+        } else {
+          PBQPRAGraph::NodeId N1Id = G.getMetadata().getNodeIdForVReg(DstReg);
+          PBQPRAGraph::NodeId N2Id = G.getMetadata().getNodeIdForVReg(SrcReg);
+          const PBQPRAGraph::NodeMetadata::AllowedRegVector *Allowed1 =
+            &G.getNodeMetadata(N1Id).getAllowedRegs();
+          const PBQPRAGraph::NodeMetadata::AllowedRegVector *Allowed2 =
+            &G.getNodeMetadata(N2Id).getAllowedRegs();
+
+          PBQPRAGraph::EdgeId EId = G.findEdge(N1Id, N2Id);
+          if (EId == G.invalidEdgeId()) {
+            PBQPRAGraph::RawMatrix Costs(Allowed1->size() + 1,
+                                         Allowed2->size() + 1, 0);
+            addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit);
+            G.addEdge(N1Id, N2Id, std::move(Costs));
+          } else {
+            if (G.getEdgeNode1Id(EId) == N2Id) {
+              std::swap(N1Id, N2Id);
+              std::swap(Allowed1, Allowed2);
+            }
+            PBQPRAGraph::RawMatrix Costs(G.getEdgeCosts(EId));
+            addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit);
+            G.setEdgeCosts(EId, std::move(Costs));
+          }
+        }
+      }
+    }
+  }
+
+private:
+
+  void addVirtRegCoalesce(
+                    PBQPRAGraph::RawMatrix &CostMat,
+                    const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1,
+                    const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed2,
+                    PBQP::PBQPNum Benefit) {
+    assert(CostMat.getRows() == Allowed1.size() + 1 && "Size mismatch.");
+    assert(CostMat.getCols() == Allowed2.size() + 1 && "Size mismatch.");
+    for (unsigned I = 0; I != Allowed1.size(); ++I) {
+      unsigned PReg1 = Allowed1[I];
+      for (unsigned J = 0; J != Allowed2.size(); ++J) {
+        unsigned PReg2 = Allowed2[J];
+        if (PReg1 == PReg2)
+          CostMat[I + 1][J + 1] -= Benefit;
+      }
+    }
+  }
+
+};
+
 } // End anonymous namespace.
 
-unsigned PBQPRAProblem::getVRegForNode(PBQPRAGraph::NodeId node) const {
-  Node2VReg::const_iterator vregItr = node2VReg.find(node);
-  assert(vregItr != node2VReg.end() && "No vreg for node.");
-  return vregItr->second;
-}
-
-PBQPRAGraph::NodeId PBQPRAProblem::getNodeForVReg(unsigned vreg) const {
-  VReg2Node::const_iterator nodeItr = vreg2Node.find(vreg);
-  assert(nodeItr != vreg2Node.end() && "No node for vreg.");
-  return nodeItr->second;
-
-}
-
-const PBQPRAProblem::AllowedSet&
-  PBQPRAProblem::getAllowedSet(unsigned vreg) const {
-  AllowedSetMap::const_iterator allowedSetItr = allowedSets.find(vreg);
-  assert(allowedSetItr != allowedSets.end() && "No pregs for vreg.");
-  const AllowedSet &allowedSet = allowedSetItr->second;
-  return allowedSet;
-}
-
-unsigned PBQPRAProblem::getPRegForOption(unsigned vreg, unsigned option) const {
-  assert(isPRegOption(vreg, option) && "Not a preg option.");
-
-  const AllowedSet& allowedSet = getAllowedSet(vreg);
-  assert(option <= allowedSet.size() && "Option outside allowed set.");
-  return allowedSet[option - 1];
-}
-
-PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis,
-                                  const MachineBlockFrequencyInfo *mbfi,
-                                  const RegSet &vregs) {
-
-  LiveIntervals *LIS = const_cast<LiveIntervals*>(lis);
-  MachineRegisterInfo *mri = &mf->getRegInfo();
-  const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo();
-
-  std::unique_ptr<PBQPRAProblem> p(new PBQPRAProblem());
-  PBQPRAGraph &g = p->getGraph();
-  RegSet pregs;
-
-  // Collect the set of preg intervals, record that they're used in the MF.
-  for (unsigned Reg = 1, e = tri->getNumRegs(); Reg != e; ++Reg) {
-    if (mri->def_empty(Reg))
-      continue;
-    pregs.insert(Reg);
-    mri->setPhysRegUsed(Reg);
-  }
-
-  // Iterate over vregs.
-  for (RegSet::const_iterator vregItr = vregs.begin(), vregEnd = vregs.end();
-       vregItr != vregEnd; ++vregItr) {
-    unsigned vreg = *vregItr;
-    const TargetRegisterClass *trc = mri->getRegClass(vreg);
-    LiveInterval *vregLI = &LIS->getInterval(vreg);
-
-    // Record any overlaps with regmask operands.
-    BitVector regMaskOverlaps;
-    LIS->checkRegMaskInterference(*vregLI, regMaskOverlaps);
-
-    // Compute an initial allowed set for the current vreg.
-    typedef std::vector<unsigned> VRAllowed;
-    VRAllowed vrAllowed;
-    ArrayRef<MCPhysReg> rawOrder = trc->getRawAllocationOrder(*mf);
-    for (unsigned i = 0; i != rawOrder.size(); ++i) {
-      unsigned preg = rawOrder[i];
-      if (mri->isReserved(preg))
-        continue;
-
-      // vregLI crosses a regmask operand that clobbers preg.
-      if (!regMaskOverlaps.empty() && !regMaskOverlaps.test(preg))
-        continue;
-
-      // vregLI overlaps fixed regunit interference.
-      bool Interference = false;
-      for (MCRegUnitIterator Units(preg, tri); Units.isValid(); ++Units) {
-        if (vregLI->overlaps(LIS->getRegUnit(*Units))) {
-          Interference = true;
-          break;
-        }
-      }
-      if (Interference)
-        continue;
-
-      // preg is usable for this virtual register.
-      vrAllowed.push_back(preg);
-    }
-
-    PBQP::Vector nodeCosts(vrAllowed.size() + 1, 0);
-
-    PBQP::PBQPNum spillCost = (vregLI->weight != 0.0) ?
-        vregLI->weight : std::numeric_limits<PBQP::PBQPNum>::min();
-
-    addSpillCosts(nodeCosts, spillCost);
-
-    // Construct the node.
-    PBQPRAGraph::NodeId nId = g.addNode(std::move(nodeCosts));
-
-    // Record the mapping and allowed set in the problem.
-    p->recordVReg(vreg, nId, vrAllowed.begin(), vrAllowed.end());
-
-  }
-
-  for (RegSet::const_iterator vr1Itr = vregs.begin(), vrEnd = vregs.end();
-         vr1Itr != vrEnd; ++vr1Itr) {
-    unsigned vr1 = *vr1Itr;
-    const LiveInterval &l1 = lis->getInterval(vr1);
-    const PBQPRAProblem::AllowedSet &vr1Allowed = p->getAllowedSet(vr1);
-
-    for (RegSet::const_iterator vr2Itr = std::next(vr1Itr); vr2Itr != vrEnd;
-         ++vr2Itr) {
-      unsigned vr2 = *vr2Itr;
-      const LiveInterval &l2 = lis->getInterval(vr2);
-      const PBQPRAProblem::AllowedSet &vr2Allowed = p->getAllowedSet(vr2);
-
-      assert(!l2.empty() && "Empty interval in vreg set?");
-      if (l1.overlaps(l2)) {
-        PBQP::Matrix edgeCosts(vr1Allowed.size()+1, vr2Allowed.size()+1, 0);
-        addInterferenceCosts(edgeCosts, vr1Allowed, vr2Allowed, tri);
-
-        g.addEdge(p->getNodeForVReg(vr1), p->getNodeForVReg(vr2),
-                  std::move(edgeCosts));
-      }
-    }
-  }
-
-  return p.release();
-}
-
-void PBQPBuilder::addSpillCosts(PBQP::Vector &costVec,
-                                PBQP::PBQPNum spillCost) {
-  costVec[0] = spillCost;
-}
-
-void PBQPBuilder::addInterferenceCosts(
-                                    PBQP::Matrix &costMat,
-                                    const PBQPRAProblem::AllowedSet &vr1Allowed,
-                                    const PBQPRAProblem::AllowedSet &vr2Allowed,
-                                    const TargetRegisterInfo *tri) {
-  assert(costMat.getRows() == vr1Allowed.size() + 1 && "Matrix height mismatch.");
-  assert(costMat.getCols() == vr2Allowed.size() + 1 && "Matrix width mismatch.");
-
-  for (unsigned i = 0; i != vr1Allowed.size(); ++i) {
-    unsigned preg1 = vr1Allowed[i];
-
-    for (unsigned j = 0; j != vr2Allowed.size(); ++j) {
-      unsigned preg2 = vr2Allowed[j];
-
-      if (tri->regsOverlap(preg1, preg2)) {
-        costMat[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
-      }
-    }
-  }
-}
-
-PBQPRAProblem *PBQPBuilderWithCoalescing::build(MachineFunction *mf,
-                                                const LiveIntervals *lis,
-                                                const MachineBlockFrequencyInfo *mbfi,
-                                                const RegSet &vregs) {
-
-  std::unique_ptr<PBQPRAProblem> p(PBQPBuilder::build(mf, lis, mbfi, vregs));
-  PBQPRAGraph &g = p->getGraph();
-
-  const TargetMachine &tm = mf->getTarget();
-  CoalescerPair cp(*tm.getRegisterInfo());
-
-  // Scan the machine function and add a coalescing cost whenever CoalescerPair
-  // gives the Ok.
-  for (const auto &mbb : *mf) {
-    for (const auto &mi : mbb) {
-      if (!cp.setRegisters(&mi)) {
-        continue; // Not coalescable.
-      }
-
-      if (cp.getSrcReg() == cp.getDstReg()) {
-        continue; // Already coalesced.
-      }
-
-      unsigned dst = cp.getDstReg(),
-               src = cp.getSrcReg();
-
-      const float copyFactor = 0.5; // Cost of copy relative to load. Current
-      // value plucked randomly out of the air.
-
-      PBQP::PBQPNum cBenefit =
-        copyFactor * LiveIntervals::getSpillWeight(false, true, mbfi, &mi);
-
-      if (cp.isPhys()) {
-        if (!mf->getRegInfo().isAllocatable(dst)) {
-          continue;
-        }
-
-        const PBQPRAProblem::AllowedSet &allowed = p->getAllowedSet(src);
-        unsigned pregOpt = 0;
-        while (pregOpt < allowed.size() && allowed[pregOpt] != dst) {
-          ++pregOpt;
-        }
-        if (pregOpt < allowed.size()) {
-          ++pregOpt; // +1 to account for spill option.
-          PBQPRAGraph::NodeId node = p->getNodeForVReg(src);
-          llvm::dbgs() << "Reading node costs for node " << node << "\n";
-          llvm::dbgs() << "Source node: " << &g.getNodeCosts(node) << "\n";
-          PBQP::Vector newCosts(g.getNodeCosts(node));
-          addPhysRegCoalesce(newCosts, pregOpt, cBenefit);
-          g.setNodeCosts(node, newCosts);
-        }
-      } else {
-        const PBQPRAProblem::AllowedSet *allowed1 = &p->getAllowedSet(dst);
-        const PBQPRAProblem::AllowedSet *allowed2 = &p->getAllowedSet(src);
-        PBQPRAGraph::NodeId node1 = p->getNodeForVReg(dst);
-        PBQPRAGraph::NodeId node2 = p->getNodeForVReg(src);
-        PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2);
-        if (edge == g.invalidEdgeId()) {
-          PBQP::Matrix costs(allowed1->size() + 1, allowed2->size() + 1, 0);
-          addVirtRegCoalesce(costs, *allowed1, *allowed2, cBenefit);
-          g.addEdge(node1, node2, costs);
-        } else {
-          if (g.getEdgeNode1Id(edge) == node2) {
-            std::swap(node1, node2);
-            std::swap(allowed1, allowed2);
-          }
-          PBQP::Matrix costs(g.getEdgeCosts(edge));
-          addVirtRegCoalesce(costs, *allowed1, *allowed2, cBenefit);
-          g.setEdgeCosts(edge, costs);
-        }
-      }
-    }
-  }
-
-  return p.release();
-}
-
-void PBQPBuilderWithCoalescing::addPhysRegCoalesce(PBQP::Vector &costVec,
-                                                   unsigned pregOption,
-                                                   PBQP::PBQPNum benefit) {
-  costVec[pregOption] += -benefit;
-}
-
-void PBQPBuilderWithCoalescing::addVirtRegCoalesce(
-                                    PBQP::Matrix &costMat,
-                                    const PBQPRAProblem::AllowedSet &vr1Allowed,
-                                    const PBQPRAProblem::AllowedSet &vr2Allowed,
-                                    PBQP::PBQPNum benefit) {
-
-  assert(costMat.getRows() == vr1Allowed.size() + 1 && "Size mismatch.");
-  assert(costMat.getCols() == vr2Allowed.size() + 1 && "Size mismatch.");
-
-  for (unsigned i = 0; i != vr1Allowed.size(); ++i) {
-    unsigned preg1 = vr1Allowed[i];
-    for (unsigned j = 0; j != vr2Allowed.size(); ++j) {
-      unsigned preg2 = vr2Allowed[j];
-
-      if (preg1 == preg2) {
-        costMat[i + 1][j + 1] += -benefit;
-      }
-    }
-  }
-}
-
+// Out-of-line destructor/anchor for PBQPRAConstraint.
+PBQPRAConstraint::~PBQPRAConstraint() {}
+void PBQPRAConstraint::anchor() {}
+void PBQPRAConstraintList::anchor() {}
 
 void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
   au.setPreservesCFG();
@@ -437,118 +457,197 @@
   MachineFunctionPass::getAnalysisUsage(au);
 }
 
-void RegAllocPBQP::findVRegIntervalsToAlloc() {
+void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,
+                                            LiveIntervals &LIS) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
 
   // Iterate over all live ranges.
-  for (unsigned i = 0, e = mri->getNumVirtRegs(); i != e; ++i) {
-    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
-    if (mri->reg_nodbg_empty(Reg))
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+    if (MRI.reg_nodbg_empty(Reg))
       continue;
-    LiveInterval *li = &lis->getInterval(Reg);
+    LiveInterval &LI = LIS.getInterval(Reg);
 
     // If this live interval is non-empty we will use pbqp to allocate it.
     // Empty intervals we allocate in a simple post-processing stage in
     // finalizeAlloc.
-    if (!li->empty()) {
-      vregsToAlloc.insert(li->reg);
+    if (!LI.empty()) {
+      VRegsToAlloc.insert(LI.reg);
     } else {
-      emptyIntervalVRegs.insert(li->reg);
+      EmptyIntervalVRegs.insert(LI.reg);
     }
   }
 }
 
-bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
-                                     const PBQP::Solution &solution) {
+static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI,
+                                   const MachineFunction &MF) {
+  const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF);
+  for (unsigned i = 0; CSR[i] != 0; ++i)
+    if (TRI.regsOverlap(reg, CSR[i]))
+      return true;
+  return false;
+}
+
+void RegAllocPBQP::initializeGraph(PBQPRAGraph &G) {
+  MachineFunction &MF = G.getMetadata().MF;
+
+  LiveIntervals &LIS = G.getMetadata().LIS;
+  const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo();
+  const TargetRegisterInfo &TRI =
+    *G.getMetadata().MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+
+  for (auto VReg : VRegsToAlloc) {
+    const TargetRegisterClass *TRC = MRI.getRegClass(VReg);
+    LiveInterval &VRegLI = LIS.getInterval(VReg);
+
+    // Record any overlaps with regmask operands.
+    BitVector RegMaskOverlaps;
+    LIS.checkRegMaskInterference(VRegLI, RegMaskOverlaps);
+
+    // Compute an initial allowed set for the current vreg.
+    std::vector<unsigned> VRegAllowed;
+    ArrayRef<MCPhysReg> RawPRegOrder = TRC->getRawAllocationOrder(MF);
+    for (unsigned I = 0; I != RawPRegOrder.size(); ++I) {
+      unsigned PReg = RawPRegOrder[I];
+      if (MRI.isReserved(PReg))
+        continue;
+
+      // VRegLI crosses a regmask operand that clobbers PReg.
+      if (!RegMaskOverlaps.empty() && !RegMaskOverlaps.test(PReg))
+        continue;
+
+      // VRegLI overlaps fixed regunit interference.
+      bool Interference = false;
+      for (MCRegUnitIterator Units(PReg, &TRI); Units.isValid(); ++Units) {
+        if (VRegLI.overlaps(LIS.getRegUnit(*Units))) {
+          Interference = true;
+          break;
+        }
+      }
+      if (Interference)
+        continue;
+
+      // PReg is usable for this virtual register.
+      VRegAllowed.push_back(PReg);
+    }
+
+    PBQPRAGraph::RawVector NodeCosts(VRegAllowed.size() + 1, 0);
+
+    // Tweak cost of callee saved registers, as using them forces spilling and
+    // restoring them. This would only happen in the prologue / epilogue though.
+    for (unsigned i = 0; i != VRegAllowed.size(); ++i)
+      if (isACalleeSavedRegister(VRegAllowed[i], TRI, MF))
+        NodeCosts[1 + i] += 1.0;
+
+    PBQPRAGraph::NodeId NId = G.addNode(std::move(NodeCosts));
+    G.getNodeMetadata(NId).setVReg(VReg);
+    G.getNodeMetadata(NId).setAllowedRegs(
+      G.getMetadata().getAllowedRegs(std::move(VRegAllowed)));
+    G.getMetadata().setNodeIdForVReg(VReg, NId);
+  }
+}
+
+bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
+                                     const PBQP::Solution &Solution,
+                                     VirtRegMap &VRM,
+                                     Spiller &VRegSpiller) {
+  MachineFunction &MF = G.getMetadata().MF;
+  LiveIntervals &LIS = G.getMetadata().LIS;
+  const TargetRegisterInfo &TRI =
+    *MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+  (void)TRI;
+
   // Set to true if we have any spills
-  bool anotherRoundNeeded = false;
+  bool AnotherRoundNeeded = false;
 
   // Clear the existing allocation.
-  vrm->clearAllVirt();
+  VRM.clearAllVirt();
 
-  const PBQPRAGraph &g = problem.getGraph();
   // Iterate over the nodes mapping the PBQP solution to a register
   // assignment.
-  for (auto NId : g.nodeIds()) {
-    unsigned vreg = problem.getVRegForNode(NId);
-    unsigned alloc = solution.getSelection(NId);
+  for (auto NId : G.nodeIds()) {
+    unsigned VReg = G.getNodeMetadata(NId).getVReg();
+    unsigned AllocOption = Solution.getSelection(NId);
 
-    if (problem.isPRegOption(vreg, alloc)) {
-      unsigned preg = problem.getPRegForOption(vreg, alloc);
-      DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> "
-            << tri->getName(preg) << "\n");
-      assert(preg != 0 && "Invalid preg selected.");
-      vrm->assignVirt2Phys(vreg, preg);
-    } else if (problem.isSpillOption(vreg, alloc)) {
-      vregsToAlloc.erase(vreg);
-      SmallVector<unsigned, 8> newSpills;
-      LiveRangeEdit LRE(&lis->getInterval(vreg), newSpills, *mf, *lis, vrm);
-      spiller->spill(LRE);
+    if (AllocOption != PBQP::RegAlloc::getSpillOptionIdx()) {
+      unsigned PReg = G.getNodeMetadata(NId).getAllowedRegs()[AllocOption - 1];
+      DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> "
+            << TRI.getName(PReg) << "\n");
+      assert(PReg != 0 && "Invalid preg selected.");
+      VRM.assignVirt2Phys(VReg, PReg);
+    } else {
+      VRegsToAlloc.erase(VReg);
+      SmallVector<unsigned, 8> NewSpills;
+      LiveRangeEdit LRE(&LIS.getInterval(VReg), NewSpills, MF, LIS, &VRM);
+      VRegSpiller.spill(LRE);
 
-      DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> SPILLED (Cost: "
+      DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> SPILLED (Cost: "
                    << LRE.getParent().weight << ", New vregs: ");
 
       // Copy any newly inserted live intervals into the list of regs to
       // allocate.
-      for (LiveRangeEdit::iterator itr = LRE.begin(), end = LRE.end();
-           itr != end; ++itr) {
-        LiveInterval &li = lis->getInterval(*itr);
-        assert(!li.empty() && "Empty spill range.");
-        DEBUG(dbgs() << PrintReg(li.reg, tri) << " ");
-        vregsToAlloc.insert(li.reg);
+      for (LiveRangeEdit::iterator I = LRE.begin(), E = LRE.end();
+           I != E; ++I) {
+        LiveInterval &LI = LIS.getInterval(*I);
+        assert(!LI.empty() && "Empty spill range.");
+        DEBUG(dbgs() << PrintReg(LI.reg, &TRI) << " ");
+        VRegsToAlloc.insert(LI.reg);
       }
 
       DEBUG(dbgs() << ")\n");
 
       // We need another round if spill intervals were added.
-      anotherRoundNeeded |= !LRE.empty();
-    } else {
-      llvm_unreachable("Unknown allocation option.");
+      AnotherRoundNeeded |= !LRE.empty();
     }
   }
 
-  return !anotherRoundNeeded;
+  return !AnotherRoundNeeded;
 }
 
+void RegAllocPBQP::finalizeAlloc(MachineFunction &MF,
+                                 LiveIntervals &LIS,
+                                 VirtRegMap &VRM) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
 
-void RegAllocPBQP::finalizeAlloc() const {
   // First allocate registers for the empty intervals.
   for (RegSet::const_iterator
-         itr = emptyIntervalVRegs.begin(), end = emptyIntervalVRegs.end();
-         itr != end; ++itr) {
-    LiveInterval *li = &lis->getInterval(*itr);
+         I = EmptyIntervalVRegs.begin(), E = EmptyIntervalVRegs.end();
+         I != E; ++I) {
+    LiveInterval &LI = LIS.getInterval(*I);
 
-    unsigned physReg = mri->getSimpleHint(li->reg);
+    unsigned PReg = MRI.getSimpleHint(LI.reg);
 
-    if (physReg == 0) {
-      const TargetRegisterClass *liRC = mri->getRegClass(li->reg);
-      physReg = liRC->getRawAllocationOrder(*mf).front();
+    if (PReg == 0) {
+      const TargetRegisterClass &RC = *MRI.getRegClass(LI.reg);
+      PReg = RC.getRawAllocationOrder(MF).front();
     }
 
-    vrm->assignVirt2Phys(li->reg, physReg);
+    VRM.assignVirt2Phys(LI.reg, PReg);
   }
 }
 
+static inline float normalizePBQPSpillWeight(float UseDefFreq, unsigned Size,
+                                         unsigned NumInstr) {
+  // All intervals have a spill weight that is mostly proportional to the number
+  // of uses, with uses in loops having a bigger weight.
+  return NumInstr * normalizeSpillWeight(UseDefFreq, Size, 1);
+}
+
 bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+  MachineBlockFrequencyInfo &MBFI =
+    getAnalysis<MachineBlockFrequencyInfo>();
 
-  mf = &MF;
-  tm = &mf->getTarget();
-  tri = tm->getRegisterInfo();
-  tii = tm->getInstrInfo();
-  mri = &mf->getRegInfo();
+  calculateSpillWeightsAndHints(LIS, MF, getAnalysis<MachineLoopInfo>(), MBFI,
+                                normalizePBQPSpillWeight);
 
-  lis = &getAnalysis<LiveIntervals>();
-  lss = &getAnalysis<LiveStacks>();
-  mbfi = &getAnalysis<MachineBlockFrequencyInfo>();
+  VirtRegMap &VRM = getAnalysis<VirtRegMap>();
 
-  calculateSpillWeightsAndHints(*lis, MF, getAnalysis<MachineLoopInfo>(),
-                                *mbfi);
+  std::unique_ptr<Spiller> VRegSpiller(createInlineSpiller(*this, MF, VRM));
 
-  vrm = &getAnalysis<VirtRegMap>();
-  spiller.reset(createInlineSpiller(*this, MF, *vrm));
+  MF.getRegInfo().freezeReservedRegs(MF);
 
-  mri->freezeReservedRegs(MF);
-
-  DEBUG(dbgs() << "PBQP Register Allocating for " << mf->getName() << "\n");
+  DEBUG(dbgs() << "PBQP Register Allocating for " << MF.getName() << "\n");
 
   // Allocator main loop:
   //
@@ -560,72 +659,72 @@
   // This process is continued till no more spills are generated.
 
   // Find the vreg intervals in need of allocation.
-  findVRegIntervalsToAlloc();
+  findVRegIntervalsToAlloc(MF, LIS);
 
 #ifndef NDEBUG
-  const Function* func = mf->getFunction();
-  std::string fqn =
-    func->getParent()->getModuleIdentifier() + "." +
-    func->getName().str();
+  const Function &F = *MF.getFunction();
+  std::string FullyQualifiedName =
+    F.getParent()->getModuleIdentifier() + "." + F.getName().str();
 #endif
 
   // If there are non-empty intervals allocate them using pbqp.
-  if (!vregsToAlloc.empty()) {
+  if (!VRegsToAlloc.empty()) {
 
-    bool pbqpAllocComplete = false;
-    unsigned round = 0;
+    const TargetSubtargetInfo &Subtarget = *MF.getTarget().getSubtargetImpl();
+    std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
+      llvm::make_unique<PBQPRAConstraintList>();
+    ConstraintsRoot->addConstraint(llvm::make_unique<SpillCosts>());
+    ConstraintsRoot->addConstraint(llvm::make_unique<Interference>());
+    if (PBQPCoalescing)
+      ConstraintsRoot->addConstraint(llvm::make_unique<Coalescing>());
+    ConstraintsRoot->addConstraint(Subtarget.getCustomPBQPConstraints());
 
-    while (!pbqpAllocComplete) {
-      DEBUG(dbgs() << "  PBQP Regalloc round " << round << ":\n");
+    bool PBQPAllocComplete = false;
+    unsigned Round = 0;
 
-      std::unique_ptr<PBQPRAProblem> problem(
-          builder->build(mf, lis, mbfi, vregsToAlloc));
+    while (!PBQPAllocComplete) {
+      DEBUG(dbgs() << "  PBQP Regalloc round " << Round << ":\n");
+
+      PBQPRAGraph G(PBQPRAGraph::GraphMetadata(MF, LIS, MBFI));
+      initializeGraph(G);
+      ConstraintsRoot->apply(G);
 
 #ifndef NDEBUG
-      if (pbqpDumpGraphs) {
-        std::ostringstream rs;
-        rs << round;
-        std::string graphFileName(fqn + "." + rs.str() + ".pbqpgraph");
-        std::string tmp;
-        raw_fd_ostream os(graphFileName.c_str(), tmp, sys::fs::F_Text);
-        DEBUG(dbgs() << "Dumping graph for round " << round << " to \""
-              << graphFileName << "\"\n");
-        problem->getGraph().dump(os);
+      if (PBQPDumpGraphs) {
+        std::ostringstream RS;
+        RS << Round;
+        std::string GraphFileName = FullyQualifiedName + "." + RS.str() +
+                                    ".pbqpgraph";
+        std::error_code EC;
+        raw_fd_ostream OS(GraphFileName, EC, sys::fs::F_Text);
+        DEBUG(dbgs() << "Dumping graph for round " << Round << " to \""
+              << GraphFileName << "\"\n");
+        G.dumpToStream(OS);
       }
 #endif
 
-      PBQP::Solution solution =
-        PBQP::RegAlloc::solve(problem->getGraph());
-
-      pbqpAllocComplete = mapPBQPToRegAlloc(*problem, solution);
-
-      ++round;
+      PBQP::Solution Solution = PBQP::RegAlloc::solve(G);
+      PBQPAllocComplete = mapPBQPToRegAlloc(G, Solution, VRM, *VRegSpiller);
+      ++Round;
     }
   }
 
   // Finalise allocation, allocate empty ranges.
-  finalizeAlloc();
-  vregsToAlloc.clear();
-  emptyIntervalVRegs.clear();
+  finalizeAlloc(MF, LIS, VRM);
+  VRegsToAlloc.clear();
+  EmptyIntervalVRegs.clear();
 
-  DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *vrm << "\n");
+  DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << VRM << "\n");
 
   return true;
 }
 
-FunctionPass *
-llvm::createPBQPRegisterAllocator(std::unique_ptr<PBQPBuilder> &builder,
-                                  char *customPassID) {
-  return new RegAllocPBQP(builder, customPassID);
+FunctionPass *llvm::createPBQPRegisterAllocator(char *customPassID) {
+  return new RegAllocPBQP(customPassID);
 }
 
 FunctionPass* llvm::createDefaultPBQPRegisterAllocator() {
-  std::unique_ptr<PBQPBuilder> Builder;
-  if (pbqpCoalescing)
-    Builder.reset(new PBQPBuilderWithCoalescing());
-  else
-    Builder.reset(new PBQPBuilder());
-  return createPBQPRegisterAllocator(Builder);
+  return createPBQPRegisterAllocator();
 }
 
 #undef DEBUG_TYPE

diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index 8b5445c..e0d1aa2 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp

@@ -20,7 +20,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
@@ -38,8 +37,8 @@
   MF = &mf;
 
   // Allocate new array the first time we see a new target.
-  if (MF->getTarget().getRegisterInfo() != TRI) {
-    TRI = MF->getTarget().getRegisterInfo();
+  if (MF->getSubtarget().getRegisterInfo() != TRI) {
+    TRI = MF->getSubtarget().getRegisterInfo();
     RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
     unsigned NumPSets = TRI->getNumRegPressureSets();
     PSetLimits.reset(new unsigned[NumPSets]);
@@ -138,7 +137,7 @@
   RCI.LastCostChange = LastCostChange;
 
   DEBUG({
-    dbgs() << "AllocationOrder(" << RC->getName() << ") = [";
+    dbgs() << "AllocationOrder(" << TRI->getRegClassName(RC) << ") = [";
     for (unsigned I = 0; I != RCI.NumRegs; ++I)
       dbgs() << ' ' << PrintReg(RCI.Order[I], TRI);
     dbgs() << (RCI.ProperSubClass ? " ] (sub-class)\n" : " ]\n");

diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 65b0528..2d2dc92 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp

@@ -94,11 +94,11 @@
     /// blocks exclusively containing copies.
     bool JoinSplitEdges;
 
-    /// WorkList - Copy instructions yet to be coalesced.
+    /// Copy instructions yet to be coalesced.
     SmallVector<MachineInstr*, 8> WorkList;
     SmallVector<MachineInstr*, 8> LocalWorkList;
 
-    /// ErasedInstrs - Set of instruction pointers that have been erased, and
+    /// Set of instruction pointers that have been erased, and
     /// that may be present in WorkList.
     SmallPtrSet<MachineInstr*, 8> ErasedInstrs;
 
@@ -114,21 +114,21 @@
     /// LiveRangeEdit callback.
     void LRE_WillEraseInstruction(MachineInstr *MI) override;
 
-    /// coalesceLocals - coalesce the LocalWorkList.
+    /// Coalesce the LocalWorkList.
     void coalesceLocals();
 
-    /// joinAllIntervals - join compatible live intervals
+    /// Join compatible live intervals
     void joinAllIntervals();
 
-    /// copyCoalesceInMBB - Coalesce copies in the specified MBB, putting
+    /// Coalesce copies in the specified MBB, putting
     /// copies that cannot yet be coalesced into WorkList.
     void copyCoalesceInMBB(MachineBasicBlock *MBB);
 
-    /// copyCoalesceWorkList - Try to coalesce all copies in CurrList. Return
+    /// Try to coalesce all copies in CurrList. Return
     /// true if any progress was made.
     bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList);
 
-    /// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+    /// Attempt to join intervals corresponding to SrcReg/DstReg,
     /// which are the src/dst of the copy instruction CopyMI.  This returns
     /// true if the copy was successfully coalesced away. If it is not
     /// currently possible to coalesce this interval, but it may be possible if
@@ -136,7 +136,7 @@
     /// 'Again'.
     bool joinCopy(MachineInstr *TheCopy, bool &Again);
 
-    /// joinIntervals - Attempt to join these two intervals.  On failure, this
+    /// Attempt to join these two intervals.  On failure, this
     /// returns false.  The output "SrcInt" will not have been modified, so we
     /// can use this information below to update aliases.
     bool joinIntervals(CoalescerPair &CP);
@@ -147,39 +147,39 @@
     /// Attempt joining with a reserved physreg.
     bool joinReservedPhysReg(CoalescerPair &CP);
 
-    /// adjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
+    /// We found a non-trivially-coalescable copy. If
     /// the source value number is defined by a copy from the destination reg
     /// see if we can merge these two destination reg valno# into a single
     /// value number, eliminating a copy.
     bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
 
-    /// hasOtherReachingDefs - Return true if there are definitions of IntB
+    /// Return true if there are definitions of IntB
     /// other than BValNo val# that can reach uses of AValno val# of IntA.
     bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
                               VNInfo *AValNo, VNInfo *BValNo);
 
-    /// removeCopyByCommutingDef - We found a non-trivially-coalescable copy.
+    /// We found a non-trivially-coalescable copy.
     /// If the source value number is defined by a commutable instruction and
     /// its other operand is coalesced to the copy dest register, see if we
     /// can transform the copy into a noop by commuting the definition.
     bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
 
-    /// reMaterializeTrivialDef - If the source of a copy is defined by a
+    /// If the source of a copy is defined by a
     /// trivial computation, replace the copy by rematerialize the definition.
     bool reMaterializeTrivialDef(CoalescerPair &CP, MachineInstr *CopyMI,
                                  bool &IsDefCopy);
 
-    /// canJoinPhys - Return true if a physreg copy should be joined.
+    /// Return true if a physreg copy should be joined.
     bool canJoinPhys(const CoalescerPair &CP);
 
-    /// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
+    /// Replace all defs and uses of SrcReg to DstReg and
     /// update the subregister number if it is not zero. If DstReg is a
     /// physical register and the existing subregister number of the def / use
     /// being updated is not zero, make sure to set it to the correct physical
     /// subregister.
     void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
 
-    /// eliminateUndefCopy - Handle copies of undef values.
+    /// Handle copies of undef values.
     bool eliminateUndefCopy(MachineInstr *CopyMI, const CoalescerPair &CP);
 
   public:
@@ -192,10 +192,10 @@
 
     void releaseMemory() override;
 
-    /// runOnMachineFunction - pass entry point
+    /// This is the pass entry point.
     bool runOnMachineFunction(MachineFunction&) override;
 
-    /// print - Implement the dump method.
+    /// Implement the dump method.
     void print(raw_ostream &O, const Module* = nullptr) const override;
   };
 } /// end anonymous namespace
@@ -407,7 +407,7 @@
   ErasedInstrs.insert(MI);
 }
 
-/// adjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA
+/// We found a non-trivially-coalescable copy with IntA
 /// being the source and IntB being the dest, thus this defines a value number
 /// in IntB.  If the source value number (in IntA) is defined by a copy from B,
 /// see if we can merge these two pieces of B into a single value number,
@@ -512,7 +512,7 @@
   return true;
 }
 
-/// hasOtherReachingDefs - Return true if there are definitions of IntB
+/// Return true if there are definitions of IntB
 /// other than BValNo val# that can reach uses of AValno val# of IntA.
 bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
                                              LiveInterval &IntB,
@@ -542,7 +542,7 @@
   return false;
 }
 
-/// removeCopyByCommutingDef - We found a non-trivially-coalescable copy with
+/// We found a non-trivially-coalescable copy with
 /// IntA being the source and IntB being the dest, thus this defines a value
 /// number in IntB.  If the source value number (in IntA) is defined by a
 /// commutable instruction and its other operand is coalesced to the copy dest
@@ -725,7 +725,7 @@
   return true;
 }
 
-/// reMaterializeTrivialDef - If the source of a copy is defined by a trivial
+/// If the source of a copy is defined by a trivial
 /// computation, replace the copy by rematerialize the definition.
 bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
                                                 MachineInstr *CopyMI,
@@ -904,7 +904,7 @@
   return true;
 }
 
-/// eliminateUndefCopy - ProcessImpicitDefs may leave some copies of <undef>
+/// ProcessImplicitDefs may leave some copies of <undef>
 /// values, it only removes local variables. When we have a copy like:
 ///
 ///   %vreg1 = COPY %vreg2<undef>
@@ -944,11 +944,10 @@
   return true;
 }
 
-/// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
-/// update the subregister number if it is not zero. If DstReg is a
-/// physical register and the existing subregister number of the def / use
-/// being updated is not zero, make sure to set it to the correct physical
-/// subregister.
+/// Replace all defs and uses of SrcReg to DstReg and update the subregister
+/// number if it is not zero. If DstReg is a physical register and the existing
+/// subregister number of the def / use being updated is not zero, make sure to
+/// set it to the correct physical subregister.
 void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
                                           unsigned DstReg,
                                           unsigned SubIdx) {
@@ -966,7 +965,7 @@
     // the UseMI operands removes them from the SrcReg use-def chain, but when
     // SrcReg is DstReg we could encounter UseMI twice if it has multiple
     // operands mentioning the virtual register.
-    if (SrcReg == DstReg && !Visited.insert(UseMI))
+    if (SrcReg == DstReg && !Visited.insert(UseMI).second)
       continue;
 
     SmallVector<unsigned,8> Ops;
@@ -1003,7 +1002,7 @@
   }
 }
 
-/// canJoinPhys - Return true if a copy involving a physreg should be joined.
+/// Return true if a copy involving a physreg should be joined.
 bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
   /// Always join simple intervals that are defined by a single copy from a
   /// reserved register. This doesn't increase register pressure, so it is
@@ -1021,7 +1020,7 @@
   return false;
 }
 
-/// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+/// Attempt to join intervals corresponding to SrcReg/DstReg,
 /// which are the src/dst of the copy instruction CopyMI.  This returns true
 /// if the copy was successfully coalesced away. If it is not currently
 /// possible to coalesce this interval, but it may be possible if other
@@ -1037,6 +1036,22 @@
     return false;
   }
 
+  if (CP.getNewRC()) {
+    auto SrcRC = MRI->getRegClass(CP.getSrcReg());
+    auto DstRC = MRI->getRegClass(CP.getDstReg());
+    unsigned SrcIdx = CP.getSrcIdx();
+    unsigned DstIdx = CP.getDstIdx();
+    if (CP.isFlipped()) {
+      std::swap(SrcIdx, DstIdx);
+      std::swap(SrcRC, DstRC);
+    }
+    if (!TRI->shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx,
+                            CP.getNewRC())) {
+      DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
+      return false;
+    }
+  }
+
   // Dead code elimination. This really should be handled by MachineDCE, but
   // sometimes dead copies slip through, and we can't generate invalid live
   // ranges.
@@ -1090,9 +1105,14 @@
       return false;
     }
   } else {
+    // When possible, let DstReg be the larger interval.
+    if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() >
+                           LIS->getInterval(CP.getDstReg()).size())
+      CP.flip();
+
     DEBUG({
-      dbgs() << "\tConsidering merging to " << CP.getNewRC()->getName()
-             << " with ";
+      dbgs() << "\tConsidering merging to "
+             << TRI->getRegClassName(CP.getNewRC()) << " with ";
       if (CP.getDstIdx() && CP.getSrcIdx())
         dbgs() << PrintReg(CP.getDstReg()) << " in "
                << TRI->getSubRegIndexName(CP.getDstIdx()) << " and "
@@ -1102,11 +1122,6 @@
         dbgs() << PrintReg(CP.getSrcReg(), TRI) << " in "
                << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n';
     });
-
-    // When possible, let DstReg be the larger interval.
-    if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() >
-                           LIS->getInterval(CP.getDstReg()).size())
-      CP.flip();
   }
 
   // Okay, attempt to join these two intervals.  On failure, this returns false.
@@ -1171,7 +1186,9 @@
   TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
 
   DEBUG({
-    dbgs() << "\tJoined. Result = ";
+    dbgs() << "\tSuccess: " << PrintReg(CP.getSrcReg(), TRI, CP.getSrcIdx())
+           << " -> " << PrintReg(CP.getDstReg(), TRI, CP.getDstIdx()) << '\n';
+    dbgs() << "\tResult = ";
     if (CP.isPhys())
       dbgs() << PrintReg(CP.getDstReg(), TRI);
     else
@@ -1423,7 +1440,7 @@
   /// Add erased instructions to ErasedInstrs.
   /// Add foreign virtual registers to ShrinkRegs if their live range ended at
   /// the erased instrs.
-  void eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+  void eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
                    SmallVectorImpl<unsigned> &ShrinkRegs);
 
   /// Get the value assignments suitable for passing to LiveInterval::join.
@@ -1936,7 +1953,7 @@
   }
 }
 
-void JoinVals::eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
                            SmallVectorImpl<unsigned> &ShrinkRegs) {
   for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
     // Get the def location before markUnused() below invalidates it.
@@ -2035,8 +2052,7 @@
   return true;
 }
 
-/// joinIntervals - Attempt to join these two intervals.  On failure, this
-/// returns false.
+/// Attempt to join these two intervals.  On failure, this returns false.
 bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
   return CP.isPhys() ? joinReservedPhysReg(CP) : joinVirtRegs(CP);
 }
@@ -2208,8 +2224,8 @@
   MF = &fn;
   MRI = &fn.getRegInfo();
   TM = &fn.getTarget();
-  TRI = TM->getRegisterInfo();
-  TII = TM->getInstrInfo();
+  TRI = TM->getSubtargetImpl()->getRegisterInfo();
+  TII = TM->getSubtargetImpl()->getInstrInfo();
   LIS = &getAnalysis<LiveIntervals>();
   AA = &getAnalysis<AliasAnalysis>();
   Loops = &getAnalysis<MachineLoopInfo>();
@@ -2250,7 +2266,7 @@
       continue;
     if (MRI->recomputeRegClass(Reg, *TM)) {
       DEBUG(dbgs() << PrintReg(Reg) << " inflated to "
-                   << MRI->getRegClass(Reg)->getName() << '\n');
+                   << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n');
       ++NumInflated;
     }
   }
@@ -2261,7 +2277,7 @@
   return true;
 }
 
-/// print - Implement the dump method.
+/// Implement the dump method.
 void RegisterCoalescer::print(raw_ostream &O, const Module* m) const {
    LIS->print(O, m);
 }

diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h
index e57ceab..04067a1 100644
--- a/lib/CodeGen/RegisterCoalescer.h
+++ b/lib/CodeGen/RegisterCoalescer.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_REGISTER_COALESCER_H
-#define LLVM_CODEGEN_REGISTER_COALESCER_H
+#ifndef LLVM_LIB_CODEGEN_REGISTERCOALESCER_H
+#define LLVM_LIB_CODEGEN_REGISTERCOALESCER_H
 
 namespace llvm {
 
@@ -22,38 +22,36 @@
   class TargetRegisterClass;
   class TargetInstrInfo;
 
-  /// CoalescerPair - A helper class for register coalescers. When deciding if
+  /// A helper class for register coalescers. When deciding if
   /// two registers can be coalesced, CoalescerPair can determine if a copy
   /// instruction would become an identity copy after coalescing.
   class CoalescerPair {
     const TargetRegisterInfo &TRI;
 
-    /// DstReg - The register that will be left after coalescing. It can be a
+    /// The register that will be left after coalescing. It can be a
     /// virtual or physical register.
     unsigned DstReg;
 
-    /// SrcReg - the virtual register that will be coalesced into dstReg.
+    /// The virtual register that will be coalesced into DstReg.
     unsigned SrcReg;
 
-    /// DstIdx - The sub-register index of the old DstReg in the new coalesced
-    /// register.
+    /// The sub-register index of the old DstReg in the new coalesced register.
     unsigned DstIdx;
 
-    /// SrcIdx - The sub-register index of the old SrcReg in the new coalesced
-    /// register.
+    /// The sub-register index of the old SrcReg in the new coalesced register.
     unsigned SrcIdx;
 
-    /// Partial - True when the original copy was a partial subregister copy.
+    /// True when the original copy was a partial subregister copy.
     bool Partial;
 
-    /// CrossClass - True when both regs are virtual, and newRC is constrained.
+    /// True when both regs are virtual and NewRC is constrained.
     bool CrossClass;
 
-    /// Flipped - True when DstReg and SrcReg are reversed from the original
+    /// True when DstReg and SrcReg are reversed from the original
     /// copy instruction.
     bool Flipped;
 
-    /// NewRC - The register class of the coalesced register, or NULL if DstReg
+    /// The register class of the coalesced register, or NULL if DstReg
     /// is a physreg. This register class may be a super-register of both
     /// SrcReg and DstReg.
     const TargetRegisterClass *NewRC;
@@ -70,49 +68,47 @@
       : TRI(tri), DstReg(PhysReg), SrcReg(VirtReg), DstIdx(0), SrcIdx(0),
         Partial(false), CrossClass(false), Flipped(false), NewRC(nullptr) {}
 
-    /// setRegisters - set registers to match the copy instruction MI. Return
+    /// Set registers to match the copy instruction MI. Return
     /// false if MI is not a coalescable copy instruction.
     bool setRegisters(const MachineInstr*);
 
-    /// flip - Swap SrcReg and DstReg. Return false if swapping is impossible
+    /// Swap SrcReg and DstReg. Return false if swapping is impossible
     /// because DstReg is a physical register, or SubIdx is set.
     bool flip();
 
-    /// isCoalescable - Return true if MI is a copy instruction that will become
+    /// Return true if MI is a copy instruction that will become
     /// an identity copy after coalescing.
     bool isCoalescable(const MachineInstr*) const;
 
-    /// isPhys - Return true if DstReg is a physical register.
+    /// Return true if DstReg is a physical register.
     bool isPhys() const { return !NewRC; }
 
-    /// isPartial - Return true if the original copy instruction did not copy
+    /// Return true if the original copy instruction did not copy
     /// the full register, but was a subreg operation.
     bool isPartial() const { return Partial; }
 
-    /// isCrossClass - Return true if DstReg is virtual and NewRC is a smaller
+    /// Return true if DstReg is virtual and NewRC is a smaller
     /// register class than DstReg's.
     bool isCrossClass() const { return CrossClass; }
 
-    /// isFlipped - Return true when getSrcReg is the register being defined by
+    /// Return true when getSrcReg is the register being defined by
     /// the original copy instruction.
     bool isFlipped() const { return Flipped; }
 
-    /// getDstReg - Return the register (virtual or physical) that will remain
+    /// Return the register (virtual or physical) that will remain
     /// after coalescing.
     unsigned getDstReg() const { return DstReg; }
 
-    /// getSrcReg - Return the virtual register that will be coalesced away.
+    /// Return the virtual register that will be coalesced away.
     unsigned getSrcReg() const { return SrcReg; }
 
-    /// getDstIdx - Return the subregister index that DstReg will be coalesced
-    /// into, or 0.
+    /// Return the subregister index that DstReg will be coalesced into, or 0.
     unsigned getDstIdx() const { return DstIdx; }
 
-    /// getSrcIdx - Return the subregister index that SrcReg will be coalesced
-    /// into, or 0.
+    /// Return the subregister index that SrcReg will be coalesced into, or 0.
     unsigned getSrcIdx() const { return SrcIdx; }
 
-    /// getNewRC - Return the register class of the coalesced register.
+    /// Return the register class of the coalesced register.
     const TargetRegisterClass *getNewRC() const { return NewRC; }
   };
 } // End llvm namespace

diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 617e459..9925efb 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp

@@ -19,7 +19,6 @@
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
@@ -185,7 +184,7 @@
   reset();
 
   MF = mf;
-  TRI = MF->getTarget().getRegisterInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
   RCI = rci;
   MRI = &MF->getRegInfo();
   MBB = mbb;

diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 72b6285..7626dd2 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp

@@ -24,24 +24,16 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "reg-scavenging"
 
-/// setUsed - Set the register and its sub-registers as being used.
-void RegScavenger::setUsed(unsigned Reg) {
-  for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
-       SubRegs.isValid(); ++SubRegs)
-    RegsAvailable.reset(*SubRegs);
-}
-
-bool RegScavenger::isAliasUsed(unsigned Reg) const {
-  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-    if (isUsed(*AI, *AI == Reg))
-      return true;
-  return false;
+/// setRegUsed - Set the register units of this register as used.
+void RegScavenger::setRegUsed(unsigned Reg) {
+  for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+    RegUnitsAvailable.reset(*RUI);
 }
 
 void RegScavenger::initRegState() {
@@ -51,8 +43,8 @@
     I->Restore = nullptr;
   }
 
-  // All registers started out unused.
-  RegsAvailable.set();
+  // All register units start out unused.
+  RegUnitsAvailable.set();
 
   if (!MBB)
     return;
@@ -60,22 +52,21 @@
   // Live-in registers are in use.
   for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
          E = MBB->livein_end(); I != E; ++I)
-    setUsed(*I);
+    setRegUsed(*I);
 
   // Pristine CSRs are also unavailable.
   BitVector PR = MBB->getParent()->getFrameInfo()->getPristineRegs(MBB);
   for (int I = PR.find_first(); I>0; I = PR.find_next(I))
-    setUsed(I);
+    setRegUsed(I);
 }
 
 void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) {
   MachineFunction &MF = *mbb->getParent();
-  const TargetMachine &TM = MF.getTarget();
-  TII = TM.getInstrInfo();
-  TRI = TM.getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
 
-  assert((NumPhysRegs == 0 || NumPhysRegs == TRI->getNumRegs()) &&
+  assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) &&
          "Target changed?");
 
   // It is not possible to use the register scavenger after late optimization
@@ -85,17 +76,11 @@
 
   // Self-initialize.
   if (!MBB) {
-    NumPhysRegs = TRI->getNumRegs();
-    RegsAvailable.resize(NumPhysRegs);
-    KillRegs.resize(NumPhysRegs);
-    DefRegs.resize(NumPhysRegs);
-
-    // Create callee-saved registers bitvector.
-    CalleeSavedRegs.resize(NumPhysRegs);
-    const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
-    if (CSRegs != nullptr)
-      for (unsigned i = 0; CSRegs[i]; ++i)
-        CalleeSavedRegs.set(CSRegs[i]);
+    NumRegUnits = TRI->getNumRegUnits();
+    RegUnitsAvailable.resize(NumRegUnits);
+    KillRegUnits.resize(NumRegUnits);
+    DefRegUnits.resize(NumRegUnits);
+    TmpRegUnits.resize(NumRegUnits);
   }
 
   MBB = mbb;
@@ -104,10 +89,9 @@
   Tracking = false;
 }
 
-void RegScavenger::addRegWithSubRegs(BitVector &BV, unsigned Reg) {
-  for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
-       SubRegs.isValid(); ++SubRegs)
-    BV.set(*SubRegs);
+void RegScavenger::addRegUnits(BitVector &BV, unsigned Reg) {
+  for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+    BV.set(*RUI);
 }
 
 void RegScavenger::determineKillsAndDefs() {
@@ -122,12 +106,25 @@
   // predicated, conservatively assume "kill" markers do not actually kill the
   // register. Similarly ignores "dead" markers.
   bool isPred = TII->isPredicated(MI);
-  KillRegs.reset();
-  DefRegs.reset();
+  KillRegUnits.reset();
+  DefRegUnits.reset();
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (MO.isRegMask())
-      (isPred ? DefRegs : KillRegs).setBitsNotInMask(MO.getRegMask());
+    if (MO.isRegMask()) {
+      
+      TmpRegUnits.clear();
+      for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd; ++RU) {
+        for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) {
+          if (MO.clobbersPhysReg(*RURI)) {
+            TmpRegUnits.set(RU);
+            break;
+          }
+        }
+      }
+      
+      // Apply the mask.
+      (isPred ? DefRegUnits : KillRegUnits) |= TmpRegUnits;
+    }
     if (!MO.isReg())
       continue;
     unsigned Reg = MO.getReg();
@@ -139,13 +136,13 @@
       if (MO.isUndef())
         continue;
       if (!isPred && MO.isKill())
-        addRegWithSubRegs(KillRegs, Reg);
+        addRegUnits(KillRegUnits, Reg);
     } else {
       assert(MO.isDef());
       if (!isPred && MO.isDead())
-        addRegWithSubRegs(KillRegs, Reg);
+        addRegUnits(KillRegUnits, Reg);
       else
-        addRegWithSubRegs(DefRegs, Reg);
+        addRegUnits(DefRegUnits, Reg);
     }
   }
 }
@@ -158,8 +155,8 @@
     determineKillsAndDefs();
 
     // Commit the changes.
-    setUsed(KillRegs);
-    setUnused(DefRegs);
+    setUsed(KillRegUnits);
+    setUnused(DefRegUnits);
   }
 
   if (MBBI == MBB->begin()) {
@@ -208,7 +205,7 @@
     if (MO.isUse()) {
       if (MO.isUndef())
         continue;
-      if (!isUsed(Reg)) {
+      if (!isRegUsed(Reg)) {
         // Check if it's partial live: e.g.
         // D0 = insert_subreg D0<undef>, S0
         // ... D0
@@ -219,15 +216,23 @@
         // insert_subreg around causes both correctness and performance issues.
         bool SubUsed = false;
         for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
-          if (isUsed(*SubRegs)) {
+          if (isRegUsed(*SubRegs)) {
             SubUsed = true;
             break;
           }
-        if (!SubUsed) {
+        bool SuperUsed = false;
+        for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) {
+          if (isRegUsed(*SR)) {
+            SuperUsed = true;
+            break;
+          }
+        }
+        if (!SubUsed && !SuperUsed) {
           MBB->getParent()->verify(nullptr, "In Register Scavenger");
           llvm_unreachable("Using an undefined register!");
         }
         (void)SubUsed;
+        (void)SuperUsed;
       }
     } else {
       assert(MO.isDef());
@@ -243,23 +248,23 @@
 #endif // NDEBUG
 
   // Commit the changes.
-  setUnused(KillRegs);
-  setUsed(DefRegs);
+  setUnused(KillRegUnits);
+  setUsed(DefRegUnits);
 }
 
-void RegScavenger::getRegsUsed(BitVector &used, bool includeReserved) {
-  used = RegsAvailable;
-  used.flip();
-  if (includeReserved)
-    used |= MRI->getReservedRegs();
-  else
-    used.reset(MRI->getReservedRegs());
+bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const {
+  if (includeReserved && isReserved(Reg))
+    return true;
+  for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+    if (!RegUnitsAvailable.test(*RUI))
+      return true;
+  return false;
 }
 
 unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
   for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
        I != E; ++I)
-    if (!isAliasUsed(*I)) {
+    if (!isRegUsed(*I)) {
       DEBUG(dbgs() << "Scavenger found unused reg: " << TRI->getName(*I) <<
             "\n");
       return *I;
@@ -273,13 +278,13 @@
   BitVector Mask(TRI->getNumRegs());
   for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
        I != E; ++I)
-    if (!isAliasUsed(*I))
+    if (!isRegUsed(*I))
       Mask.set(*I);
   return Mask;
 }
 
 /// findSurvivorReg - Return the candidate register that is unused for the
-/// longest after StargMII. UseMI is set to the instruction where the search
+/// longest after StartMII. UseMI is set to the instruction where the search
 /// stopped.
 ///
 /// No more than InstrLimit instructions are inspected.
@@ -375,9 +380,7 @@
   }
 
   // Try to find a register that's unused if there is one, as then we won't
-  // have to spill. Search explicitly rather than masking out based on
-  // RegsAvailable, as RegsAvailable does not take aliases into account.
-  // That's what getRegsAvailable() is for.
+  // have to spill.
   BitVector Available = getRegsAvailable(RC);
   Available &= Candidates;
   if (Available.any())
@@ -388,7 +391,7 @@
   unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI);
 
   // If we found an unused register there is no reason to spill it.
-  if (!isAliasUsed(SReg)) {
+  if (!isRegUsed(SReg)) {
     DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n");
     return SReg;
   }

diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 6a2a080..6f8b337 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp

@@ -21,6 +21,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <climits>
 using namespace llvm;
 
@@ -35,11 +36,9 @@
 void SchedulingPriorityQueue::anchor() { }
 
 ScheduleDAG::ScheduleDAG(MachineFunction &mf)
-  : TM(mf.getTarget()),
-    TII(TM.getInstrInfo()),
-    TRI(TM.getRegisterInfo()),
-    MF(mf), MRI(mf.getRegInfo()),
-    EntrySU(), ExitSU() {
+    : TM(mf.getTarget()), TII(TM.getSubtargetImpl()->getInstrInfo()),
+      TRI(TM.getSubtargetImpl()->getRegisterInfo()), MF(mf),
+      MRI(mf.getRegInfo()), EntrySU(), ExitSU() {
 #ifndef NDEBUG
   StressSched = StressSchedOpt;
 #endif

diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 0f8b21c..d8d8422 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp

@@ -50,12 +50,11 @@
     cl::init(true), cl::desc("Enable use of TBAA during MI GAD construction"));
 
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
-                                     const MachineLoopInfo &mli,
-                                     const MachineDominatorTree &mdt,
+                                     const MachineLoopInfo *mli,
                                      bool IsPostRAFlag,
                                      bool RemoveKillFlags,
                                      LiveIntervals *lis)
-  : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()), LIS(lis),
+  : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis),
     IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags),
     CanHandleTerminators(false), FirstDbgValue(nullptr) {
   assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
@@ -64,7 +63,7 @@
          "Virtual registers must be removed prior to PostRA scheduling");
 
   const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
-  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+  SchedModel.init(ST.getSchedModel(), &ST, TII);
 }
 
 /// getUnderlyingObjectFromInt - This is the function that does the work of
@@ -110,7 +109,7 @@
     for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), IE = Objs.end();
          I != IE; ++I) {
       V = *I;
-      if (!Visited.insert(V))
+      if (!Visited.insert(V).second)
         continue;
       if (Operator::getOpcode(V) == Instruction::IntToPtr) {
         const Value *O =
@@ -512,9 +511,18 @@
 static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
                              MachineInstr *MIa,
                              MachineInstr *MIb) {
+  const MachineFunction *MF = MIa->getParent()->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
   // Cover a trivial case - no edge is need to itself.
   if (MIa == MIb)
     return false;
+ 
+  // Let the target decide if memory accesses cannot possibly overlap.
+  if ((MIa->mayLoad() || MIa->mayStore()) &&
+      (MIb->mayLoad() || MIb->mayStore()))
+    if (TII->areMemAccessesTriviallyDisjoint(MIa, MIb, AA))
+      return false;
 
   // FIXME: Need to handle multiple memory operands to support all targets.
   if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
@@ -563,9 +571,9 @@
 
   AliasAnalysis::AliasResult AAResult = AA->alias(
       AliasAnalysis::Location(MMOa->getValue(), Overlapa,
-                              UseTBAA ? MMOa->getTBAAInfo() : nullptr),
+                              UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
       AliasAnalysis::Location(MMOb->getValue(), Overlapb,
-                              UseTBAA ? MMOb->getTBAAInfo() : nullptr));
+                              UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
 
   return (AAResult != AliasAnalysis::NoAlias);
 }
@@ -575,12 +583,12 @@
 static unsigned
 iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
                  SUnit *SUa, SUnit *SUb, SUnit *ExitSU, unsigned *Depth,
-                 SmallPtrSet<const SUnit*, 16> &Visited) {
+                 SmallPtrSetImpl<const SUnit*> &Visited) {
   if (!SUa || !SUb || SUb == ExitSU)
     return *Depth;
 
   // Remember visited nodes.
-  if (!Visited.insert(SUb))
+  if (!Visited.insert(SUb).second)
       return *Depth;
   // If there is _some_ dependency already in place, do not
   // descend any further.
@@ -656,7 +664,7 @@
                          bool isNormalMemory = false) {
   // If this is a false dependency,
   // do not add the edge, but rememeber the rejected node.
-  if (!AA || MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+  if (MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
     SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
     Dep.setLatency(TrueMemOrderLatency);
     SUb->addPred(Dep);

diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp
index f59c6cf..b2e4617 100644
--- a/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/lib/CodeGen/ScheduleDAGPrinter.cpp

@@ -21,7 +21,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <fstream>
 using namespace llvm;

diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 004c685..38833a4 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp

@@ -78,7 +78,7 @@
     DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n");
   else {
     // A nonempty itinerary must have a SchedModel.
-    IssueWidth = ItinData->SchedModel->IssueWidth;
+    IssueWidth = ItinData->SchedModel.IssueWidth;
     DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
           << ScoreboardDepth << '\n');
   }

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7c42e4d..a1291ed 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -17,7 +17,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -32,7 +34,6 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -76,6 +77,10 @@
                              "slicing"),
                     cl::init(false));
 
+  static cl::opt<bool>
+    MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
+                      cl::desc("DAG combiner may split indexing from loads"));
+
 //------------------------------ DAGCombiner ---------------------------------//
 
   class DAGCombiner {
@@ -87,56 +92,70 @@
     bool LegalTypes;
     bool ForCodeSize;
 
-    // Worklist of all of the nodes that need to be simplified.
-    //
-    // This has the semantics that when adding to the worklist,
-    // the item added must be next to be processed. It should
-    // also only appear once. The naive approach to this takes
-    // linear time.
-    //
-    // To reduce the insert/remove time to logarithmic, we use
-    // a set and a vector to maintain our worklist.
-    //
-    // The set contains the items on the worklist, but does not
-    // maintain the order they should be visited.
-    //
-    // The vector maintains the order nodes should be visited, but may
-    // contain duplicate or removed nodes. When choosing a node to
-    // visit, we pop off the order stack until we find an item that is
-    // also in the contents set. All operations are O(log N).
-    SmallPtrSet<SDNode*, 64> WorkListContents;
-    SmallVector<SDNode*, 64> WorkListOrder;
+    /// \brief Worklist of all of the nodes that need to be simplified.
+    ///
+    /// This must behave as a stack -- new nodes to process are pushed onto the
+    /// back and when processing we pop off of the back.
+    ///
+    /// The worklist will not contain duplicates but may contain null entries
+    /// due to nodes being deleted from the underlying DAG.
+    SmallVector<SDNode *, 64> Worklist;
+
+    /// \brief Mapping from an SDNode to its position on the worklist.
+    ///
+    /// This is used to find and remove nodes from the worklist (by nulling
+    /// them) when they are deleted from the underlying DAG. It relies on
+    /// stable indices of nodes within the worklist.
+    DenseMap<SDNode *, unsigned> WorklistMap;
+
+    /// \brief Set of nodes which have been combined (at least once).
+    ///
+    /// This is used to allow us to reliably add any operands of a DAG node
+    /// which have not yet been combined to the worklist.
+    SmallPtrSet<SDNode *, 64> CombinedNodes;
 
     // AA - Used for DAG load/store alias analysis.
     AliasAnalysis &AA;
 
-    /// AddUsersToWorkList - When an instruction is simplified, add all users of
-    /// the instruction to the work lists because they might get more simplified
-    /// now.
-    ///
-    void AddUsersToWorkList(SDNode *N) {
+    /// When an instruction is simplified, add all users of the instruction to
+    /// the worklist because they might be simplified further now.
+    void AddUsersToWorklist(SDNode *N) {
       for (SDNode *Node : N->uses())
-        AddToWorkList(Node);
+        AddToWorklist(Node);
     }
 
-    /// visit - call the node-specific routine that knows how to fold each
-    /// particular type of node.
+    /// Call the node-specific routine that folds each particular type of node.
     SDValue visit(SDNode *N);
 
   public:
-    /// AddToWorkList - Add to the work list making sure its instance is at the
-    /// back (next to be processed.)
-    void AddToWorkList(SDNode *N) {
-      WorkListContents.insert(N);
-      WorkListOrder.push_back(N);
+    /// Add N to the back of the worklist (next to be processed) unless it is
+    /// already present, in which case its existing position is kept.
+    void AddToWorklist(SDNode *N) {
+      // Skip handle nodes as they can't usefully be combined and confuse the
+      // zero-use deletion strategy.
+      if (N->getOpcode() == ISD::HANDLENODE)
+        return;
+
+      if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
+        Worklist.push_back(N);
     }
 
-    /// removeFromWorkList - remove all instances of N from the worklist.
-    ///
-    void removeFromWorkList(SDNode *N) {
-      WorkListContents.erase(N);
+    /// Remove N from the worklist (nulling its slot) and from CombinedNodes.
+    void removeFromWorklist(SDNode *N) {
+      CombinedNodes.erase(N);
+
+      auto It = WorklistMap.find(N);
+      if (It == WorklistMap.end())
+        return; // Not in the worklist.
+
+      // Null out the entry rather than erasing it to avoid a linear operation.
+      Worklist[It->second] = nullptr;
+      WorklistMap.erase(It);
     }
 
+    void deleteAndRecombine(SDNode *N);
+    bool recursivelyDeleteUnusedNodes(SDNode *N);
+
     SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                       bool AddTo = true);
 
@@ -154,9 +173,9 @@
 
   private:
 
-    /// SimplifyDemandedBits - Check the specified integer node value to see if
-    /// it can be simplified or if things it uses can be simplified by bit
-    /// propagation.  If so, return true.
+    /// Check the specified integer node value to see if it can be simplified or
+    /// if things it uses can be simplified by bit propagation.
+    /// If so, return true.
     bool SimplifyDemandedBits(SDValue Op) {
       unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
       APInt Demanded = APInt::getAllOnesValue(BitWidth);
@@ -167,6 +186,7 @@
 
     bool CombineToPreIndexedLoadStore(SDNode *N);
     bool CombineToPostIndexedLoadStore(SDNode *N);
+    SDValue SplitIndexingFromLoad(LoadSDNode *LD);
     bool SliceUpLoad(SDNode *N);
 
     /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
@@ -192,7 +212,7 @@
                          SDValue Trunc, SDValue ExtLoad, SDLoc DL,
                          ISD::NodeType ExtType);
 
-    /// combine - call the node-specific routine that knows how to fold each
+    /// Call the node-specific routine that knows how to fold each
     /// particular type of node. If that doesn't do anything, try the
     /// target-specific DAG combines.
     SDValue combine(SDNode *N);
@@ -256,6 +276,7 @@
     SDValue visitFMA(SDNode *N);
     SDValue visitFDIV(SDNode *N);
     SDValue visitFREM(SDNode *N);
+    SDValue visitFSQRT(SDNode *N);
     SDValue visitFCOPYSIGN(SDNode *N);
     SDValue visitSINT_TO_FP(SDNode *N);
     SDValue visitUINT_TO_FP(SDNode *N);
@@ -269,6 +290,8 @@
     SDValue visitFCEIL(SDNode *N);
     SDValue visitFTRUNC(SDNode *N);
     SDValue visitFFLOOR(SDNode *N);
+    SDValue visitFMINNUM(SDNode *N);
+    SDValue visitFMAXNUM(SDNode *N);
     SDValue visitBRCOND(SDNode *N);
     SDValue visitBR_CC(SDNode *N);
     SDValue visitLOAD(SDNode *N);
@@ -304,7 +327,12 @@
     SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
     SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
     SDValue BuildSDIV(SDNode *N);
+    SDValue BuildSDIVPow2(SDNode *N);
     SDValue BuildUDIV(SDNode *N);
+    SDValue BuildReciprocalEstimate(SDValue Op);
+    SDValue BuildRsqrtEstimate(SDValue Op);
+    SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations);
+    SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations);
     SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
                                bool DemandHighBits = true);
     SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
@@ -321,17 +349,16 @@
 
     SDValue GetDemandedBits(SDValue V, const APInt &Mask);
 
-    /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
+    /// Walk up chain skipping non-aliasing memory nodes,
     /// looking for aliasing nodes and adding them to the Aliases vector.
     void GatherAllAliases(SDNode *N, SDValue OriginalChain,
                           SmallVectorImpl<SDValue> &Aliases);
 
-    /// isAlias - Return true if there is any possibility that the two addresses
-    /// overlap.
+    /// Return true if there is any possibility that the two addresses overlap.
     bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const;
 
-    /// FindBetterChain - Walk up chain skipping non-aliasing memory nodes,
-    /// looking for a better chain (aliasing node.)
+    /// Walk up chain skipping non-aliasing memory nodes, looking for a better
+    /// chain (aliasing node.)
     SDValue FindBetterChain(SDNode *N, SDValue Chain);
 
     /// Merge consecutive store operations into a wide store.
@@ -359,13 +386,13 @@
           FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
     }
 
-    /// Run - runs the dag combiner on all nodes in the work list
+    /// Runs the DAG combiner on all nodes in the worklist.
     void Run(CombineLevel AtLevel);
 
     SelectionDAG &getDAG() const { return DAG; }
 
-    /// getShiftAmountTy - Returns a type large enough to hold any valid
-    /// shift amount - before type legalization these can be huge.
+    /// Returns a type large enough to hold any valid shift amount - before type
+    /// legalization these can be huge.
     EVT getShiftAmountTy(EVT LHSTy) {
       assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
       if (LHSTy.isVector())
@@ -374,15 +401,14 @@
                         : TLI.getPointerTy();
     }
 
-    /// isTypeLegal - This method returns true if we are running before type
-    /// legalization or if the specified VT is legal.
+    /// This method returns true if we are running before type legalization or
+    /// if the specified VT is legal.
     bool isTypeLegal(const EVT &VT) {
       if (!LegalTypes) return true;
       return TLI.isTypeLegal(VT);
     }
 
-    /// getSetCCResultType - Convenience wrapper around
-    /// TargetLowering::getSetCCResultType
+    /// Convenience wrapper around TargetLowering::getSetCCResultType
     EVT getSetCCResultType(EVT VT) const {
       return TLI.getSetCCResultType(*DAG.getContext(), VT);
     }
@@ -391,16 +417,16 @@
 
 
 namespace {
-/// WorkListRemover - This class is a DAGUpdateListener that removes any deleted
+/// This class is a DAGUpdateListener that removes any deleted
 /// nodes from the worklist.
-class WorkListRemover : public SelectionDAG::DAGUpdateListener {
+class WorklistRemover : public SelectionDAG::DAGUpdateListener {
   DAGCombiner &DC;
 public:
-  explicit WorkListRemover(DAGCombiner &dc)
+  explicit WorklistRemover(DAGCombiner &dc)
     : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
 
   void NodeDeleted(SDNode *N, SDNode *E) override {
-    DC.removeFromWorkList(N);
+    DC.removeFromWorklist(N);
   }
 };
 }
@@ -410,11 +436,11 @@
 //===----------------------------------------------------------------------===//
 
 void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
-  ((DAGCombiner*)DC)->AddToWorkList(N);
+  ((DAGCombiner*)DC)->AddToWorklist(N);
 }
 
 void TargetLowering::DAGCombinerInfo::RemoveFromWorklist(SDNode *N) {
-  ((DAGCombiner*)DC)->removeFromWorkList(N);
+  ((DAGCombiner*)DC)->removeFromWorklist(N);
 }
 
 SDValue TargetLowering::DAGCombinerInfo::
@@ -442,9 +468,24 @@
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
-/// isNegatibleForFree - Return 1 if we can compute the negated form of the
-/// specified expression for the same cost as the expression itself, or 2 if we
-/// can compute the negated form more cheaply than the expression itself.
+void DAGCombiner::deleteAndRecombine(SDNode *N) {
+  removeFromWorklist(N);
+
+  // If the operands of this node are only used by the node, they will now be
+  // dead. Make sure to re-visit them and recursively delete dead nodes.
+  for (const SDValue &Op : N->ops())
+    // For an operand generating multiple values, one of the values may
+    // become dead allowing further simplification (e.g. split index
+    // arithmetic from an indexed load).
+    if (Op->hasOneUse() || Op->getNumValues() > 1)
+      AddToWorklist(Op.getNode());
+
+  DAG.DeleteNode(N);
+}
+
+/// Return 1 if we can compute the negated form of the specified expression for
+/// the same cost as the expression itself, or 2 if we can compute the negated
+/// form more cheaply than the expression itself.
 static char isNegatibleForFree(SDValue Op, bool LegalOperations,
                                const TargetLowering &TLI,
                                const TargetOptions *Options,
@@ -507,10 +548,10 @@
   }
 }
 
-/// GetNegatedExpression - If isNegatibleForFree returns true, this function
-/// returns the newly negated expression.
+/// If isNegatibleForFree returns true, return the newly negated expression.
 static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
                                     bool LegalOperations, unsigned Depth = 0) {
+  const TargetOptions &Options = DAG.getTarget().Options;
   // fneg is removable even if it has multiple uses.
   if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0);
 
@@ -527,12 +568,11 @@
   }
   case ISD::FADD:
     // FIXME: determine better conditions for this xform.
-    assert(DAG.getTarget().Options.UnsafeFPMath);
+    assert(Options.UnsafeFPMath);
 
     // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
     if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
-                           DAG.getTargetLoweringInfo(),
-                           &DAG.getTarget().Options, Depth+1))
+                           DAG.getTargetLoweringInfo(), &Options, Depth+1))
       return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
                          GetNegatedExpression(Op.getOperand(0), DAG,
                                               LegalOperations, Depth+1),
@@ -544,7 +584,7 @@
                        Op.getOperand(0));
   case ISD::FSUB:
     // We can't turn -(A-B) into B-A when we honor signed zeros.
-    assert(DAG.getTarget().Options.UnsafeFPMath);
+    assert(Options.UnsafeFPMath);
 
     // fold (fneg (fsub 0, B)) -> B
     if (ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Op.getOperand(0)))
@@ -557,12 +597,11 @@
 
   case ISD::FMUL:
   case ISD::FDIV:
-    assert(!DAG.getTarget().Options.HonorSignDependentRoundingFPMath());
+    assert(!Options.HonorSignDependentRoundingFPMath());
 
     // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
     if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
-                           DAG.getTargetLoweringInfo(),
-                           &DAG.getTarget().Options, Depth+1))
+                           DAG.getTargetLoweringInfo(), &Options, Depth+1))
       return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
                          GetNegatedExpression(Op.getOperand(0), DAG,
                                               LegalOperations, Depth+1),
@@ -587,7 +626,7 @@
   }
 }
 
-// isSetCCEquivalent - Return true if this node is a setcc, or is a select_cc
+// Return true if this node is a setcc, or is a select_cc
 // that selects between the target values used for true and false, making it
 // equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to
 // the appropriate nodes based on the type of node we are checking. This
@@ -606,15 +645,19 @@
       !TLI.isConstFalseVal(N.getOperand(3).getNode()))
     return false;
 
+  if (TLI.getBooleanContents(N.getValueType()) ==
+      TargetLowering::UndefinedBooleanContent)
+    return false;
+
   LHS = N.getOperand(0);
   RHS = N.getOperand(1);
   CC  = N.getOperand(4);
   return true;
 }
 
-// isOneUseSetCC - Return true if this is a SetCC-equivalent operation with only
-// one use.  If this is true, it allows the users to invert the operation for
-// free when it is profitable to do so.
+/// Return true if this is a SetCC-equivalent operation with only one use.
+/// If this is true, it allows the users to invert the operation for free when
+/// it is profitable to do so.
 bool DAGCombiner::isOneUseSetCC(SDValue N) const {
   SDValue N0, N1, N2;
   if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse())
@@ -622,7 +665,7 @@
   return false;
 }
 
-/// isConstantSplatVector - Returns true if N is a BUILD_VECTOR node whose
+/// Returns true if N is a BUILD_VECTOR node whose
 /// elements are all the same constant or undefined.
 static bool isConstantSplatVector(SDNode *N, APInt& SplatValue) {
   BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(N);
@@ -643,7 +686,7 @@
   if (isa<ConstantSDNode>(N))
     return N.getNode();
   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
-  if(BV && BV->isConstant())
+  if (BV && BV->isConstant())
     return BV;
   return nullptr;
 }
@@ -669,6 +712,23 @@
   return nullptr;
 }
 
+/// \brief Returns the ConstantFPSDNode if N is a floating-point constant or a
+/// BuildVector that splats one with no undef elements; otherwise nullptr.
+static ConstantFPSDNode *isConstOrConstSplatFP(SDValue N) {
+  if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
+    return CN;
+
+  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
+    BitVector UndefElements;
+    ConstantFPSDNode *CN = BV->getConstantFPSplatNode(&UndefElements);
+
+    if (CN && UndefElements.none())
+      return CN;
+  }
+
+  return nullptr;
+}
+
 SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL,
                                     SDValue N0, SDValue N1) {
   EVT VT = N0.getValueType();
@@ -687,7 +747,7 @@
         SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
         if (!OpNode.getNode())
           return SDValue();
-        AddToWorkList(OpNode.getNode());
+        AddToWorklist(OpNode.getNode());
         return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
       }
     }
@@ -708,7 +768,7 @@
         SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N1.getOperand(0), N0);
         if (!OpNode.getNode())
           return SDValue();
-        AddToWorkList(OpNode.getNode());
+        AddToWorklist(OpNode.getNode());
         return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1));
       }
     }
@@ -730,14 +790,14 @@
           assert((!To[i].getNode() ||
                   N->getValueType(i) == To[i].getValueType()) &&
                  "Cannot combine value to value of different type!"));
-  WorkListRemover DeadNodes(*this);
+  WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesWith(N, To);
   if (AddTo) {
     // Push the new nodes and any users onto the worklist
     for (unsigned i = 0, e = NumTo; i != e; ++i) {
       if (To[i].getNode()) {
-        AddToWorkList(To[i].getNode());
-        AddUsersToWorkList(To[i].getNode());
+        AddToWorklist(To[i].getNode());
+        AddUsersToWorklist(To[i].getNode());
       }
     }
   }
@@ -745,14 +805,8 @@
   // Finally, if the node is now dead, remove it from the graph.  The node
   // may not be dead if the replacement process recursively simplified to
   // something else needing this node.
-  if (N->use_empty()) {
-    // Nodes can be reintroduced into the worklist.  Make sure we do not
-    // process a node that has been replaced.
-    removeFromWorkList(N);
-
-    // Finally, since the node is now dead, remove it from the graph.
-    DAG.DeleteNode(N);
-  }
+  if (N->use_empty())
+    deleteAndRecombine(N);
   return SDValue(N, 0);
 }
 
@@ -760,32 +814,22 @@
 CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
   // Replace all uses.  If any nodes become isomorphic to other nodes and
   // are deleted, make sure to remove them from our worklist.
-  WorkListRemover DeadNodes(*this);
+  WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
 
   // Push the new node and any (possibly new) users onto the worklist.
-  AddToWorkList(TLO.New.getNode());
-  AddUsersToWorkList(TLO.New.getNode());
+  AddToWorklist(TLO.New.getNode());
+  AddUsersToWorklist(TLO.New.getNode());
 
   // Finally, if the node is now dead, remove it from the graph.  The node
   // may not be dead if the replacement process recursively simplified to
   // something else needing this node.
-  if (TLO.Old.getNode()->use_empty()) {
-    removeFromWorkList(TLO.Old.getNode());
-
-    // If the operands of this node are only used by the node, they will now
-    // be dead.  Make sure to visit them first to delete dead nodes early.
-    for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i)
-      if (TLO.Old.getNode()->getOperand(i).getNode()->hasOneUse())
-        AddToWorkList(TLO.Old.getNode()->getOperand(i).getNode());
-
-    DAG.DeleteNode(TLO.Old.getNode());
-  }
+  if (TLO.Old.getNode()->use_empty())
+    deleteAndRecombine(TLO.Old.getNode());
 }
 
-/// SimplifyDemandedBits - Check the specified integer node value to see if
-/// it can be simplified or if things it uses can be simplified by bit
-/// propagation.  If so, return true.
+/// Check the specified integer node value to see if it can be simplified or if
+/// things it uses can be simplified by bit propagation. If so, return true.
 bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
   TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
   APInt KnownZero, KnownOne;
@@ -793,7 +837,7 @@
     return false;
 
   // Revisit the node.
-  AddToWorkList(Op.getNode());
+  AddToWorklist(Op.getNode());
 
   // Replace the old value with the new one.
   ++NodesCombined;
@@ -817,12 +861,11 @@
         dbgs() << "\nWith: ";
         Trunc.getNode()->dump(&DAG);
         dbgs() << '\n');
-  WorkListRemover DeadNodes(*this);
+  WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
   DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
-  removeFromWorkList(Load);
-  DAG.DeleteNode(Load);
-  AddToWorkList(Trunc.getNode());
+  deleteAndRecombine(Load);
+  AddToWorklist(Trunc.getNode());
 }
 
 SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
@@ -872,7 +915,7 @@
   SDValue NewOp = PromoteOperand(Op, PVT, Replace);
   if (!NewOp.getNode())
     return SDValue();
-  AddToWorkList(NewOp.getNode());
+  AddToWorklist(NewOp.getNode());
 
   if (Replace)
     ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
@@ -887,16 +930,16 @@
   SDValue NewOp = PromoteOperand(Op, PVT, Replace);
   if (!NewOp.getNode())
     return SDValue();
-  AddToWorkList(NewOp.getNode());
+  AddToWorklist(NewOp.getNode());
 
   if (Replace)
     ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
   return DAG.getZeroExtendInReg(NewOp, dl, OldVT);
 }
 
-/// PromoteIntBinOp - Promote the specified integer binary operation if the
-/// target indicates it is beneficial. e.g. On x86, it's usually better to
-/// promote i16 operations to i32 since i16 instructions are longer.
+/// Promote the specified integer binary operation if the target indicates it is
+/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
+/// i32 since i16 instructions are longer.
 SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
   if (!LegalOperations)
     return SDValue();
@@ -934,9 +977,9 @@
         return SDValue();
     }
 
-    AddToWorkList(NN0.getNode());
+    AddToWorklist(NN0.getNode());
     if (NN1.getNode())
-      AddToWorkList(NN1.getNode());
+      AddToWorklist(NN1.getNode());
 
     if (Replace0)
       ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
@@ -952,9 +995,9 @@
   return SDValue();
 }
 
-/// PromoteIntShiftOp - Promote the specified integer shift operation if the
-/// target indicates it is beneficial. e.g. On x86, it's usually better to
-/// promote i16 operations to i32 since i16 instructions are longer.
+/// Promote the specified integer shift operation if the target indicates it is
+/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
+/// i32 since i16 instructions are longer.
 SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
   if (!LegalOperations)
     return SDValue();
@@ -986,7 +1029,7 @@
     if (!N0.getNode())
       return SDValue();
 
-    AddToWorkList(N0.getNode());
+    AddToWorklist(N0.getNode());
     if (Replace)
       ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
 
@@ -1066,17 +1109,45 @@
           dbgs() << "\nTo: ";
           Result.getNode()->dump(&DAG);
           dbgs() << '\n');
-    WorkListRemover DeadNodes(*this);
+    WorklistRemover DeadNodes(*this);
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
-    removeFromWorkList(N);
-    DAG.DeleteNode(N);
-    AddToWorkList(Result.getNode());
+    deleteAndRecombine(N);
+    AddToWorklist(Result.getNode());
     return true;
   }
   return false;
 }
 
+/// \brief Recursively delete a node which has no uses and any operands for
+/// which it is the only use.
+///
+/// Returns false without changing anything if \p N still has uses; otherwise
+/// deletes the dead nodes, removes them from the worklist, re-queues operands
+/// that lost a user but are still used (for other combines), and returns true.
+bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
+  if (!N->use_empty())
+    return false;
+
+  SmallSetVector<SDNode *, 16> Nodes;
+  Nodes.insert(N);
+  do {
+    N = Nodes.pop_back_val();
+    if (!N)
+      continue;
+
+    if (N->use_empty()) {
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        Nodes.insert(N->getOperand(i).getNode());
+
+      removeFromWorklist(N);
+      DAG.DeleteNode(N);
+    } else {
+      AddToWorklist(N);
+    }
+  } while (!Nodes.empty());
+  return true;
+}
 
 //===----------------------------------------------------------------------===//
 //  Main DAG Combiner implementation
@@ -1088,44 +1159,69 @@
   LegalOperations = Level >= AfterLegalizeVectorOps;
   LegalTypes = Level >= AfterLegalizeTypes;
 
+  // Early exit if this basic block is in an optnone function.
+  AttributeSet FnAttrs =
+    DAG.getMachineFunction().getFunction()->getAttributes();
+  if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                           Attribute::OptimizeNone))
+    return;
+
   // Add all the dag nodes to the worklist.
   for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
        E = DAG.allnodes_end(); I != E; ++I)
-    AddToWorkList(I);
+    AddToWorklist(I);
 
   // Create a dummy node (which is not added to allnodes), that adds a reference
   // to the root node, preventing it from being deleted, and tracking any
   // changes of the root.
   HandleSDNode Dummy(DAG.getRoot());
 
-  // The root of the dag may dangle to deleted nodes until the dag combiner is
-  // done.  Set it to null to avoid confusion.
-  DAG.setRoot(SDValue());
-
   // while the worklist isn't empty, find a node and
   // try and combine it.
-  while (!WorkListContents.empty()) {
+  while (!WorklistMap.empty()) {
     SDNode *N;
-    // The WorkListOrder holds the SDNodes in order, but it may contain
-    // duplicates.
-    // In order to avoid a linear scan, we use a set (O(log N)) to hold what the
-    // worklist *should* contain, and check the node we want to visit is should
-    // actually be visited.
+    // The Worklist holds the SDNodes in order, but it may contain null entries.
     do {
-      N = WorkListOrder.pop_back_val();
-    } while (!WorkListContents.erase(N));
+      N = Worklist.pop_back_val();
+    } while (!N);
+
+    bool GoodWorklistEntry = WorklistMap.erase(N);
+    (void)GoodWorklistEntry;
+    assert(GoodWorklistEntry &&
+           "Found a worklist entry without a corresponding map entry!");
 
     // If N has no uses, it is dead.  Make sure to revisit all N's operands once
     // N is deleted from the DAG, since they too may now be dead or may have a
     // reduced number of uses, allowing other xforms.
-    if (N->use_empty() && N != &Dummy) {
-      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-        AddToWorkList(N->getOperand(i).getNode());
-
-      DAG.DeleteNode(N);
+    if (recursivelyDeleteUnusedNodes(N))
       continue;
+
+    WorklistRemover DeadNodes(*this);
+
+    // If this combine is running after legalizing the DAG, re-legalize any
+    // nodes pulled off the worklist.
+    if (Level == AfterLegalizeDAG) {
+      SmallSetVector<SDNode *, 16> UpdatedNodes;
+      bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
+
+      for (SDNode *LN : UpdatedNodes) {
+        AddToWorklist(LN);
+        AddUsersToWorklist(LN);
+      }
+      if (!NIsValid)
+        continue;
     }
 
+    DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
+
+    // Add any operands of the new node which have not yet been combined to the
+    // worklist as well. Because the worklist uniques things already, this
+    // won't repeatedly process the same operand.
+    CombinedNodes.insert(N);
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      if (!CombinedNodes.count(N->getOperand(i).getNode()))
+        AddToWorklist(N->getOperand(i).getNode());
+
     SDValue RV = combine(N);
 
     if (!RV.getNode())
@@ -1144,15 +1240,11 @@
            RV.getNode()->getOpcode() != ISD::DELETED_NODE &&
            "Node was deleted but visit returned new node!");
 
-    DEBUG(dbgs() << "\nReplacing.3 ";
-          N->dump(&DAG);
-          dbgs() << "\nWith: ";
-          RV.getNode()->dump(&DAG);
-          dbgs() << '\n');
+    DEBUG(dbgs() << " ... into: ";
+          RV.getNode()->dump(&DAG));
 
     // Transfer debug value.
     DAG.TransferDbgValues(SDValue(N, 0), RV);
-    WorkListRemover DeadNodes(*this);
     if (N->getNumValues() == RV.getNode()->getNumValues())
       DAG.ReplaceAllUsesWith(N, RV.getNode());
     else {
@@ -1163,26 +1255,14 @@
     }
 
     // Push the new node and any users onto the worklist
-    AddToWorkList(RV.getNode());
-    AddUsersToWorkList(RV.getNode());
-
-    // Add any uses of the old node to the worklist in case this node is the
-    // last one that uses them.  They may become dead after this node is
-    // deleted.
-    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-      AddToWorkList(N->getOperand(i).getNode());
+    AddToWorklist(RV.getNode());
+    AddUsersToWorklist(RV.getNode());
 
     // Finally, if the node is now dead, remove it from the graph.  The node
     // may not be dead if the replacement process recursively simplified to
-    // something else needing this node.
-    if (N->use_empty()) {
-      // Nodes can be reintroduced into the worklist.  Make sure we do not
-      // process a node that has been replaced.
-      removeFromWorkList(N);
-
-      // Finally, since the node is now dead, remove it from the graph.
-      DAG.DeleteNode(N);
-    }
+    // something else needing this node. This will also take care of adding any
+    // operands which have lost a user to the worklist.
+    recursivelyDeleteUnusedNodes(N);
   }
 
   // If the root changed (e.g. it was a dead load, update the root).
@@ -1244,6 +1324,7 @@
   case ISD::FMA:                return visitFMA(N);
   case ISD::FDIV:               return visitFDIV(N);
   case ISD::FREM:               return visitFREM(N);
+  case ISD::FSQRT:              return visitFSQRT(N);
   case ISD::FCOPYSIGN:          return visitFCOPYSIGN(N);
   case ISD::SINT_TO_FP:         return visitSINT_TO_FP(N);
   case ISD::UINT_TO_FP:         return visitUINT_TO_FP(N);
@@ -1255,6 +1336,8 @@
   case ISD::FNEG:               return visitFNEG(N);
   case ISD::FABS:               return visitFABS(N);
   case ISD::FFLOOR:             return visitFFLOOR(N);
+  case ISD::FMINNUM:            return visitFMINNUM(N);
+  case ISD::FMAXNUM:            return visitFMAXNUM(N);
   case ISD::FCEIL:              return visitFCEIL(N);
   case ISD::FTRUNC:             return visitFTRUNC(N);
   case ISD::BRCOND:             return visitBRCOND(N);
@@ -1347,8 +1430,8 @@
   return RV;
 }
 
-/// getInputChainForNode - Given a node, return its input chain if it has one,
-/// otherwise return a null sd operand.
+/// Given a node, return its input chain if it has one, otherwise return a null
+/// sd operand.
 static SDValue getInputChainForNode(SDNode *N) {
   if (unsigned NumOps = N->getNumOperands()) {
     if (N->getOperand(0).getValueType() == MVT::Other)
@@ -1402,7 +1485,7 @@
           // Queue up for processing.
           TFs.push_back(Op.getNode());
           // Clean up in case the token factor is removed.
-          AddToWorkList(Op.getNode());
+          AddToWorklist(Op.getNode());
           Changed = true;
           break;
         }
@@ -1410,7 +1493,7 @@
 
       default:
         // Only add if it isn't already in the list.
-        if (SeenOps.insert(Op.getNode()))
+        if (SeenOps.insert(Op.getNode()).second)
           Ops.push_back(Op);
         else
           Changed = true;
@@ -1440,44 +1523,21 @@
 
 /// MERGE_VALUES can always be eliminated.
 SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
-  WorkListRemover DeadNodes(*this);
+  WorklistRemover DeadNodes(*this);
   // Replacing results may cause a different MERGE_VALUES to suddenly
   // be CSE'd with N, and carry its uses with it. Iterate until no
   // uses remain, to ensure that the node can be safely deleted.
   // First add the users of this node to the work list so that they
   // can be tried again once they have new operands.
-  AddUsersToWorkList(N);
+  AddUsersToWorklist(N);
   do {
     for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
       DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i));
   } while (!N->use_empty());
-  removeFromWorkList(N);
-  DAG.DeleteNode(N);
+  deleteAndRecombine(N);
   return SDValue(N, 0);   // Return N so it doesn't get rechecked!
 }
 
-static
-SDValue combineShlAddConstant(SDLoc DL, SDValue N0, SDValue N1,
-                              SelectionDAG &DAG) {
-  EVT VT = N0.getValueType();
-  SDValue N00 = N0.getOperand(0);
-  SDValue N01 = N0.getOperand(1);
-  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N01);
-
-  if (N01C && N00.getOpcode() == ISD::ADD && N00.getNode()->hasOneUse() &&
-      isa<ConstantSDNode>(N00.getOperand(1))) {
-    // fold (add (shl (add x, c1), c2), ) -> (add (add (shl x, c2), c1<<c2), )
-    N0 = DAG.getNode(ISD::ADD, SDLoc(N0), VT,
-                     DAG.getNode(ISD::SHL, SDLoc(N00), VT,
-                                 N00.getOperand(0), N01),
-                     DAG.getNode(ISD::SHL, SDLoc(N01), VT,
-                                 N00.getOperand(1), N01));
-    return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
-  }
-
-  return SDValue();
-}
-
 SDValue DAGCombiner::visitADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -1594,16 +1654,6 @@
     }
   }
 
-  // fold (add (shl (add x, c1), c2), ) -> (add (add (shl x, c2), c1<<c2), )
-  if (N0.getOpcode() == ISD::SHL && N0.getNode()->hasOneUse()) {
-    SDValue Result = combineShlAddConstant(SDLoc(N), N0, N1, DAG);
-    if (Result.getNode()) return Result;
-  }
-  if (N1.getOpcode() == ISD::SHL && N1.getNode()->hasOneUse()) {
-    SDValue Result = combineShlAddConstant(SDLoc(N), N1, N0, DAG);
-    if (Result.getNode()) return Result;
-  }
-
   // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
   if (N1.getOpcode() == ISD::SHL &&
       N1.getOperand(0).getOpcode() == ISD::SUB)
@@ -1647,6 +1697,17 @@
     return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
   }
 
+  // add X, (sextinreg Y i1) -> sub X, (and Y 1)
+  if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
+    if (TN->getVT() == MVT::i1) {
+      SDLoc DL(N);
+      SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
+                                 DAG.getConstant(1, VT));
+      return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
+    }
+  }
+
   return SDValue();
 }
 
@@ -1812,6 +1873,17 @@
                                  VT);
     }
 
+  // sub X, (sextinreg Y i1) -> add X, (and Y 1)
+  if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
+    if (TN->getVT() == MVT::i1) {
+      SDLoc DL(N);
+      SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
+                                 DAG.getConstant(1, VT));
+      return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
+    }
+  }
+
   return SDValue();
 }
 
@@ -1933,7 +2005,7 @@
                      isa<ConstantSDNode>(N0.getOperand(1)))) {
     SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT,
                              N1, N0.getOperand(1));
-    AddToWorkList(C3.getNode());
+    AddToWorklist(C3.getNode());
     return DAG.getNode(ISD::MUL, SDLoc(N), VT,
                        N0.getOperand(0), C3);
   }
@@ -2016,9 +2088,14 @@
                                      (-N1C->getAPIntValue()).isPowerOf2())) {
     // If dividing by powers of two is cheap, then don't perform the following
     // fold.
-    if (TLI.isPow2DivCheap())
+    if (TLI.isPow2SDivCheap())
       return SDValue();
 
+    // Target-specific implementation of sdiv x, pow2.
+    SDValue Res = BuildSDIVPow2(N);
+    if (Res.getNode())
+      return Res;
+
     unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();
 
     // Splat the sign bit into the register
@@ -2026,7 +2103,7 @@
         DAG.getNode(ISD::SRA, SDLoc(N), VT, N0,
                     DAG.getConstant(VT.getScalarSizeInBits() - 1,
                                     getShiftAmountTy(N0.getValueType())));
-    AddToWorkList(SGN.getNode());
+    AddToWorklist(SGN.getNode());
 
     // Add (N0 < 0) ? abs2 - 1 : 0;
     SDValue SRL =
@@ -2034,8 +2111,8 @@
                     DAG.getConstant(VT.getScalarSizeInBits() - lg2,
                                     getShiftAmountTy(SGN.getValueType())));
     SDValue ADD = DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, SRL);
-    AddToWorkList(SRL.getNode());
-    AddToWorkList(ADD.getNode());    // Divide by pow2
+    AddToWorklist(SRL.getNode());
+    AddToWorklist(ADD.getNode());    // Divide by pow2
     SDValue SRA = DAG.getNode(ISD::SRA, SDLoc(N), VT, ADD,
                   DAG.getConstant(lg2, getShiftAmountTy(ADD.getValueType())));
 
@@ -2044,7 +2121,7 @@
     if (N1C->getAPIntValue().isNonNegative())
       return SRA;
 
-    AddToWorkList(SRA.getNode());
+    AddToWorklist(SRA.getNode());
     return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), SRA);
   }
 
@@ -2096,7 +2173,7 @@
                                   DAG.getConstant(SHC->getAPIntValue()
                                                                   .logBase2(),
                                                   ADDVT));
-        AddToWorkList(Add.getNode());
+        AddToWorklist(Add.getNode());
         return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, Add);
       }
     }
@@ -2138,13 +2215,13 @@
   // X%C to the equivalent of X-X/C*C.
   if (N1C && !N1C->isNullValue()) {
     SDValue Div = DAG.getNode(ISD::SDIV, SDLoc(N), VT, N0, N1);
-    AddToWorkList(Div.getNode());
+    AddToWorklist(Div.getNode());
     SDValue OptimizedDiv = combine(Div.getNode());
     if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
       SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT,
                                 OptimizedDiv, N1);
       SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul);
-      AddToWorkList(Mul.getNode());
+      AddToWorklist(Mul.getNode());
       return Sub;
     }
   }
@@ -2181,7 +2258,7 @@
           DAG.getNode(ISD::ADD, SDLoc(N), VT, N1,
                  DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()),
                                  VT));
-        AddToWorkList(Add.getNode());
+        AddToWorklist(Add.getNode());
         return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, Add);
       }
     }
@@ -2191,13 +2268,13 @@
   // X%C to the equivalent of X-X/C*C.
   if (N1C && !N1C->isNullValue()) {
     SDValue Div = DAG.getNode(ISD::UDIV, SDLoc(N), VT, N0, N1);
-    AddToWorkList(Div.getNode());
+    AddToWorklist(Div.getNode());
     SDValue OptimizedDiv = combine(Div.getNode());
     if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
       SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT,
                                 OptimizedDiv, N1);
       SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul);
-      AddToWorkList(Mul.getNode());
+      AddToWorklist(Mul.getNode());
       return Sub;
     }
   }
@@ -2286,10 +2363,9 @@
   return SDValue();
 }
 
-/// SimplifyNodeWithTwoResults - Perform optimizations common to nodes that
-/// compute two values. LoOp and HiOp give the opcodes for the two computations
-/// that are being performed. Return true if a simplification was made.
-///
+/// Perform optimizations common to nodes that compute two values. LoOp and HiOp
+/// give the opcodes for the two computations that are being performed. Return
+/// true if a simplification was made.
 SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                                 unsigned HiOp) {
   // If the high half is not needed, just compute the low half.
@@ -2297,8 +2373,7 @@
   if (!HiExists &&
       (!LegalOperations ||
        TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
-    SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0),
-                              ArrayRef<SDUse>(N->op_begin(), N->op_end()));
+    SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
     return CombineTo(N, Res, Res);
   }
 
@@ -2307,8 +2382,7 @@
   if (!LoExists &&
       (!LegalOperations ||
        TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
-    SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1),
-                              ArrayRef<SDUse>(N->op_begin(), N->op_end()));
+    SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
     return CombineTo(N, Res, Res);
   }
 
@@ -2318,9 +2392,8 @@
 
   // If the two computed results can be simplified separately, separate them.
   if (LoExists) {
-    SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0),
-                             ArrayRef<SDUse>(N->op_begin(), N->op_end()));
-    AddToWorkList(Lo.getNode());
+    SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
+    AddToWorklist(Lo.getNode());
     SDValue LoOpt = combine(Lo.getNode());
     if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
         (!LegalOperations ||
@@ -2329,9 +2402,8 @@
   }
 
   if (HiExists) {
-    SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1),
-                             ArrayRef<SDUse>(N->op_begin(), N->op_end()));
-    AddToWorkList(Hi.getNode());
+    SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
+    AddToWorklist(Hi.getNode());
     SDValue HiOpt = combine(Hi.getNode());
     if (HiOpt.getNode() && HiOpt != Hi &&
         (!LegalOperations ||
@@ -2436,8 +2508,8 @@
   return SDValue();
 }
 
-/// SimplifyBinOpWithSameOpcodeHands - If this is a binary operator with
-/// two operands of the same opcode, try to simplify it.
+/// If this is a binary operator with two operands of the same opcode, try to
+/// simplify it.
 SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
@@ -2470,7 +2542,7 @@
     SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
                                  N0.getOperand(0).getValueType(),
                                  N0.getOperand(0), N1.getOperand(0));
-    AddToWorkList(ORNode.getNode());
+    AddToWorklist(ORNode.getNode());
     return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode);
   }
 
@@ -2484,7 +2556,7 @@
     SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
                                  N0.getOperand(0).getValueType(),
                                  N0.getOperand(0), N1.getOperand(0));
-    AddToWorkList(ORNode.getNode());
+    AddToWorklist(ORNode.getNode());
     return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
                        ORNode, N0.getOperand(1));
   }
@@ -2509,7 +2581,7 @@
     if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
       SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1);
       SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op);
-      AddToWorkList(Op.getNode());
+      AddToWorklist(Op.getNode());
       return BC;
     }
   }
@@ -2556,7 +2628,7 @@
       if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
         SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                       N0->getOperand(0), N1->getOperand(0));
-        AddToWorkList(NewNode.getNode());
+        AddToWorklist(NewNode.getNode());
         return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp,
                                     &SVN0->getMask()[0]);
       }
@@ -2577,7 +2649,7 @@
       if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) {
         SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                       N0->getOperand(1), N1->getOperand(1));
-        AddToWorkList(NewNode.getNode());
+        AddToWorklist(NewNode.getNode());
         return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode,
                                     &SVN0->getMask()[0]);
       }
@@ -2603,9 +2675,17 @@
 
     // fold (and x, 0) -> 0, vector edition
     if (ISD::isBuildVectorAllZeros(N0.getNode()))
-      return N0;
+      // Do not return N0 itself, because it may contain undef elements.
+      return DAG.getConstant(
+          APInt::getNullValue(
+              N0.getValueType().getScalarType().getSizeInBits()),
+          N0.getValueType());
     if (ISD::isBuildVectorAllZeros(N1.getNode()))
-      return N1;
+      // Do not return N1 itself, because it may contain undef elements.
+      return DAG.getConstant(
+          APInt::getNullValue(
+              N1.getValueType().getScalarType().getSizeInBits()),
+          N1.getValueType());
 
     // fold (and x, -1) -> x, vector edition
     if (ISD::isBuildVectorAllOnes(N0.getNode()))
@@ -2768,21 +2848,21 @@
       if (cast<ConstantSDNode>(LR)->isNullValue() && Op1 == ISD::SETEQ) {
         SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
                                      LR.getValueType(), LL, RL);
-        AddToWorkList(ORNode.getNode());
+        AddToWorklist(ORNode.getNode());
         return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1);
       }
       // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1)
       if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETEQ) {
         SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0),
                                       LR.getValueType(), LL, RL);
-        AddToWorkList(ANDNode.getNode());
+        AddToWorklist(ANDNode.getNode());
         return DAG.getSetCC(SDLoc(N), VT, ANDNode, LR, Op1);
       }
       // fold (and (setgt X,  -1), (setgt Y,  -1)) -> (setgt (or X, Y), -1)
       if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETGT) {
         SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
                                      LR.getValueType(), LL, RL);
-        AddToWorkList(ORNode.getNode());
+        AddToWorklist(ORNode.getNode());
         return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1);
       }
     }
@@ -2795,7 +2875,7 @@
                                  cast<ConstantSDNode>(RR)->isNullValue()))) {
       SDValue ADDNode = DAG.getNode(ISD::ADD, SDLoc(N0), LL.getValueType(),
                                     LL, DAG.getConstant(1, LL.getValueType()));
-      AddToWorkList(ADDNode.getNode());
+      AddToWorklist(ADDNode.getNode());
       return DAG.getSetCC(SDLoc(N), VT, ADDNode,
                           DAG.getConstant(2, LL.getValueType()), ISD::SETUGE);
     }
@@ -2843,7 +2923,7 @@
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
-      AddToWorkList(N);
+      AddToWorklist(N);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
@@ -2863,7 +2943,7 @@
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
-      AddToWorkList(N);
+      AddToWorklist(N);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
@@ -2894,7 +2974,7 @@
             DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
                            LN0->getChain(), LN0->getBasePtr(), ExtVT,
                            LN0->getMemOperand());
-          AddToWorkList(N);
+          AddToWorklist(N);
           CombineTo(LN0, NewLoad, NewLoad.getValue(1));
           return SDValue(N, 0);   // Return N so it doesn't get rechecked!
         }
@@ -2921,7 +3001,7 @@
             Alignment = MinAlign(Alignment, PtrOff);
           }
 
-          AddToWorkList(NewPtr.getNode());
+          AddToWorklist(NewPtr.getNode());
 
           EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
           SDValue Load =
@@ -2929,8 +3009,8 @@
                            LN0->getChain(), NewPtr,
                            LN0->getPointerInfo(),
                            ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
-                           Alignment, LN0->getTBAAInfo());
-          AddToWorkList(N);
+                           LN0->isInvariant(), Alignment, LN0->getAAInfo());
+          AddToWorklist(N);
           CombineTo(LN0, Load, Load.getValue(1));
           return SDValue(N, 0);   // Return N so it doesn't get rechecked!
         }
@@ -2976,8 +3056,7 @@
   return SDValue();
 }
 
-/// MatchBSwapHWord - Match (a >> 8) | (a << 8) as (bswap a) >> 16
-///
+/// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
 SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
                                         bool DemandHighBits) {
   if (!LegalOperations)
@@ -3082,10 +3161,13 @@
   return Res;
 }
 
-/// isBSwapHWordElement - Return true if the specified node is an element
-/// that makes up a 32-bit packed halfword byteswap. i.e.
-/// ((x&0xff)<<8)|((x&0xff00)>>8)|((x&0x00ff0000)<<8)|((x&0xff000000)>>8)
-static bool isBSwapHWordElement(SDValue N, SmallVectorImpl<SDNode *> &Parts) {
+/// Return true if the specified node is an element that makes up a 32-bit
+/// packed halfword byteswap.
+/// ((x & 0x000000ff) << 8) |
+/// ((x & 0x0000ff00) >> 8) |
+/// ((x & 0x00ff0000) << 8) |
+/// ((x & 0xff000000) >> 8)
+static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
   if (!N.getNode()->hasOneUse())
     return false;
 
@@ -3152,8 +3234,11 @@
   return true;
 }
 
-/// MatchBSwapHWord - Match a 32-bit packed halfword bswap. That is
-/// ((x&0xff)<<8)|((x&0xff00)>>8)|((x&0x00ff0000)<<8)|((x&0xff000000)>>8)
+/// Match a 32-bit packed halfword bswap. That is
+/// ((x & 0x000000ff) << 8) |
+/// ((x & 0x0000ff00) >> 8) |
+/// ((x & 0x00ff0000) << 8) |
+/// ((x & 0xff000000) >> 8)
 /// => (rotl (bswap x), 16)
 SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
   if (!LegalOperations)
@@ -3165,7 +3250,6 @@
   if (!TLI.isOperationLegal(ISD::BSWAP, VT))
     return SDValue();
 
-  SmallVector<SDNode*,4> Parts(4, (SDNode*)nullptr);
   // Look for either
   // (or (or (and), (and)), (or (and), (and)))
   // (or (or (or (and), (and)), (and)), (and))
@@ -3173,6 +3257,7 @@
     return SDValue();
   SDValue N00 = N0.getOperand(0);
   SDValue N01 = N0.getOperand(1);
+  SDNode *Parts[4] = {};
 
   if (N1.getOpcode() == ISD::OR &&
       N00.getNumOperands() == 2 && N01.getNumOperands() == 2) {
@@ -3246,15 +3331,25 @@
 
     // fold (or x, -1) -> -1, vector edition
     if (ISD::isBuildVectorAllOnes(N0.getNode()))
-      return N0;
+      // Do not return N0 itself, because it may contain undef elements.
+      return DAG.getConstant(
+          APInt::getAllOnesValue(
+              N0.getValueType().getScalarType().getSizeInBits()),
+          N0.getValueType());
     if (ISD::isBuildVectorAllOnes(N1.getNode()))
-      return N1;
+      // Do not return N1 itself, because it may contain undef elements.
+      return DAG.getConstant(
+          APInt::getAllOnesValue(
+              N1.getValueType().getScalarType().getSizeInBits()),
+          N1.getValueType());
 
     // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask1)
     // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf B, A, Mask2)
     // Do this only if the resulting shuffle is legal.
     if (isa<ShuffleVectorSDNode>(N0) &&
         isa<ShuffleVectorSDNode>(N1) &&
+        // Avoid folding a node with illegal type.
+        TLI.isTypeLegal(VT) &&
         N0->getOperand(1) == N1->getOperand(1) &&
         ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode())) {
       bool CanFold = true;
@@ -3366,7 +3461,7 @@
           (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) {
         SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR),
                                      LR.getValueType(), LL, RL);
-        AddToWorkList(ORNode.getNode());
+        AddToWorklist(ORNode.getNode());
         return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1);
       }
       // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1)
@@ -3375,7 +3470,7 @@
           (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) {
         SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR),
                                       LR.getValueType(), LL, RL);
-        AddToWorkList(ANDNode.getNode());
+        AddToWorklist(ANDNode.getNode());
         return DAG.getSetCC(SDLoc(N), VT, ANDNode, LR, Op1);
       }
     }
@@ -3438,7 +3533,7 @@
   return SDValue();
 }
 
-/// MatchRotateHalf - Match "(X shl/srl V1) & V2" where V2 may not be present.
+/// Match "(X shl/srl V1) & V2" where V2 may not be present.
 static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
   if (Op.getOpcode() == ISD::AND) {
     if (isa<ConstantSDNode>(Op.getOperand(1))) {
@@ -3735,7 +3830,7 @@
     return RXOR;
 
   // fold !(x cc y) -> (x !cc y)
-  if (N1C && N1C->getAPIntValue() == 1 && isSetCCEquivalent(N0, LHS, RHS, CC)) {
+  if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) {
     bool isInt = LHS.getValueType().isInteger();
     ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                                                isInt);
@@ -3761,7 +3856,7 @@
     SDValue V = N0.getOperand(0);
     V = DAG.getNode(ISD::XOR, SDLoc(N0), V.getValueType(), V,
                     DAG.getConstant(1, V.getValueType()));
-    AddToWorkList(V.getNode());
+    AddToWorklist(V.getNode());
     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V);
   }
 
@@ -3773,7 +3868,7 @@
       unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
       LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
       RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
-      AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
+      AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
       return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
     }
   }
@@ -3785,7 +3880,7 @@
       unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
       LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
       RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
-      AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
+      AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
       return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
     }
   }
@@ -3794,7 +3889,7 @@
       N0->getOperand(1) == N1) {
     SDValue X = N0->getOperand(0);
     SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
-    AddToWorkList(NotX.getNode());
+    AddToWorklist(NotX.getNode());
     return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1);
   }
   // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2))
@@ -3828,8 +3923,8 @@
   return SDValue();
 }
 
-/// visitShiftByConstant - Handle transforms common to the three shifts, when
-/// the shift amount is a constant.
+/// Handle transforms common to the three shifts, when the shift amount is a
+/// constant.
 SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) {
   // We can't and shouldn't fold opaque constants.
   if (Amt->isOpaque())
@@ -4059,7 +4154,7 @@
           EVT CountVT = NewOp0.getOperand(1).getValueType();
           SDValue NewSHL = DAG.getNode(ISD::SHL, SDLoc(N), NewOp0.getValueType(),
                                        NewOp0, DAG.getConstant(c2, CountVT));
-          AddToWorkList(NewSHL.getNode());
+          AddToWorklist(NewSHL.getNode());
           return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
         }
       }
@@ -4101,6 +4196,18 @@
                        HiBitsMask);
   }
 
+  // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // Variant of version done on multiply, except mul by a power of 2 is turned
+  // into a shift.
+  APInt Val;
+  if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+      (isa<ConstantSDNode>(N0.getOperand(1)) ||
+       isConstantSplatVector(N0.getOperand(1).getNode(), Val))) {
+    SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
+    SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
+    return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
+  }
+
   if (N1C) {
     SDValue NewSHL = visitShiftByConstant(N, N1C);
     if (NewSHL.getNode())
@@ -4345,7 +4452,7 @@
       SDValue SmallShift = DAG.getNode(ISD::SRL, SDLoc(N0), SmallVT,
                                        N0.getOperand(0),
                           DAG.getConstant(ShiftAmt, getShiftAmountTy(SmallVT)));
-      AddToWorkList(SmallShift.getNode());
+      AddToWorklist(SmallShift.getNode());
       APInt Mask = APInt::getAllOnesValue(OpSizeInBits).lshr(ShiftAmt);
       return DAG.getNode(ISD::AND, SDLoc(N), VT,
                          DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, SmallShift),
@@ -4387,7 +4494,7 @@
       if (ShAmt) {
         Op = DAG.getNode(ISD::SRL, SDLoc(N0), VT, Op,
                   DAG.getConstant(ShAmt, getShiftAmountTy(Op.getValueType())));
-        AddToWorkList(Op.getNode());
+        AddToWorklist(Op.getNode());
       }
 
       return DAG.getNode(ISD::XOR, SDLoc(N), VT,
@@ -4439,12 +4546,12 @@
   if (N->hasOneUse()) {
     SDNode *Use = *N->use_begin();
     if (Use->getOpcode() == ISD::BRCOND)
-      AddToWorkList(Use);
+      AddToWorklist(Use);
     else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
       // Also look pass the truncate.
       Use = *Use->use_begin();
       if (Use->getOpcode() == ISD::BRCOND)
-        AddToWorkList(Use);
+        AddToWorklist(Use);
     }
   }
 
@@ -4545,7 +4652,7 @@
                          N0, DAG.getConstant(1, VT0));
     XORNode = DAG.getNode(ISD::XOR, SDLoc(N0), VT0,
                           N0, DAG.getConstant(1, VT0));
-    AddToWorkList(XORNode.getNode());
+    AddToWorklist(XORNode.getNode());
     if (VT.bitsGT(VT0))
       return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, XORNode);
     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, XORNode);
@@ -4553,13 +4660,13 @@
   // fold (select C, 0, X) -> (and (not C), X)
   if (VT == VT0 && VT == MVT::i1 && N1C && N1C->isNullValue()) {
     SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
-    AddToWorkList(NOTNode.getNode());
+    AddToWorklist(NOTNode.getNode());
     return DAG.getNode(ISD::AND, SDLoc(N), VT, NOTNode, N2);
   }
   // fold (select C, X, 1) -> (or (not C), X)
   if (VT == VT0 && VT == MVT::i1 && N2C && N2C->getAPIntValue() == 1) {
     SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
-    AddToWorkList(NOTNode.getNode());
+    AddToWorklist(NOTNode.getNode());
     return DAG.getNode(ISD::OR, SDLoc(N), VT, NOTNode, N1);
   }
   // fold (select C, X, 0) -> (and C, X)
@@ -4582,7 +4689,7 @@
   if (N0.getOpcode() == ISD::SETCC) {
     if ((!LegalOperations &&
          TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
-	TLI.isOperationLegal(ISD::SELECT_CC, VT))
+        TLI.isOperationLegal(ISD::SELECT_CC, VT))
       return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT,
                          N0.getOperand(0), N0.getOperand(1),
                          N1, N2, N0.getOperand(2));
@@ -4616,12 +4723,17 @@
   SDValue Cond = N->getOperand(0);
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
-  MVT VT = N->getSimpleValueType(0);
+  EVT VT = N->getValueType(0);
   int NumElems = VT.getVectorNumElements();
   assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
          RHS.getOpcode() == ISD::CONCAT_VECTORS &&
          Cond.getOpcode() == ISD::BUILD_VECTOR);
 
+  // CONCAT_VECTORS can take an arbitrary number of arguments. We only care
+  // about binary ones here.
+  if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
+    return SDValue();
+
   // We're sure we have an even number of elements due to the
   // concat_vectors we have as arguments to vselect.
   // Skip BV elements until we find one that's not an UNDEF
@@ -4690,8 +4802,8 @@
           ISD::SRA, DL, VT, LHS,
           DAG.getConstant(VT.getScalarType().getSizeInBits() - 1, VT));
       SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
-      AddToWorkList(Shift.getNode());
-      AddToWorkList(Add.getNode());
+      AddToWorklist(Shift.getNode());
+      AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
     }
   }
@@ -4718,8 +4830,8 @@
 
     // Add the new VSELECT nodes to the work list in case they need to be split
     // again.
-    AddToWorkList(Lo.getNode());
-    AddToWorkList(Hi.getNode());
+    AddToWorklist(Lo.getNode());
+    AddToWorklist(Hi.getNode());
 
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
   }
@@ -4761,7 +4873,7 @@
   SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
                               N0, N1, CC, SDLoc(N), false);
   if (SCC.getNode()) {
-    AddToWorkList(SCC.getNode());
+    AddToWorklist(SCC.getNode());
 
     if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) {
       if (!SCCC->isNullValue())
@@ -4958,7 +5070,7 @@
       if (NarrowLoad.getNode() != N0.getNode()) {
         CombineTo(N0.getNode(), NarrowLoad);
         // CombineTo deleted the truncate, if needed, but not what's under it.
-        AddToWorkList(oye);
+        AddToWorklist(oye);
       }
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
@@ -5134,14 +5246,10 @@
       if (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, SetCCVT)) {
         SDLoc DL(N);
         ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
-        SDValue SetCC = DAG.getSetCC(DL,
-                                     SetCCVT,
+        SDValue SetCC = DAG.getSetCC(DL, SetCCVT,
                                      N0.getOperand(0), N0.getOperand(1), CC);
-        EVT SelectVT = getSetCCResultType(VT);
-        return DAG.getSelect(DL, VT,
-                             DAG.getSExtOrTrunc(SetCC, DL, SelectVT),
+        return DAG.getSelect(DL, VT, SetCC,
                              NegOne, DAG.getConstant(0, VT));
-
       }
     }
   }
@@ -5239,7 +5347,7 @@
       if (NarrowLoad.getNode() != N0.getNode()) {
         CombineTo(N0.getNode(), NarrowLoad);
         // CombineTo deleted the truncate, if needed, but not what's under it.
-        AddToWorkList(oye);
+        AddToWorklist(oye);
       }
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
@@ -5257,7 +5365,7 @@
       if (NarrowLoad.getNode() != N0.getNode()) {
         CombineTo(N0.getNode(), NarrowLoad);
         // CombineTo deleted the truncate, if needed, but not what's under it.
-        AddToWorkList(oye);
+        AddToWorklist(oye);
       }
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
@@ -5265,10 +5373,10 @@
     SDValue Op = N0.getOperand(0);
     if (Op.getValueType().bitsLT(VT)) {
       Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
-      AddToWorkList(Op.getNode());
+      AddToWorklist(Op.getNode());
     } else if (Op.getValueType().bitsGT(VT)) {
       Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
-      AddToWorkList(Op.getNode());
+      AddToWorklist(Op.getNode());
     }
     return DAG.getZeroExtendInReg(Op, SDLoc(N),
                                   N0.getValueType().getScalarType());
@@ -5487,7 +5595,7 @@
       if (NarrowLoad.getNode() != N0.getNode()) {
         CombineTo(N0.getNode(), NarrowLoad);
         // CombineTo deleted the truncate, if needed, but not what's under it.
-        AddToWorkList(oye);
+        AddToWorklist(oye);
       }
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
@@ -5528,8 +5636,7 @@
   // scalars.
   if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
+      TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     if (!N0.hasOneUse())
@@ -5614,9 +5721,9 @@
   return SDValue();
 }
 
-/// GetDemandedBits - See if the specified operand can be simplified with the
-/// knowledge that only the bits specified by Mask are used.  If so, return the
-/// simpler operand, otherwise return a null SDValue.
+/// See if the specified operand can be simplified with the knowledge that only
+/// the bits specified by Mask are used.  If so, return the simpler operand,
+/// otherwise return a null SDValue.
 SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
   switch (V.getOpcode()) {
   default: break;
@@ -5657,11 +5764,11 @@
   return SDValue();
 }
 
-/// ReduceLoadWidth - If the result of a wider load is shifted to right of N
-/// bits and then truncated to a narrower type and where N is a multiple
-/// of number of bits of the narrower type, transform it to a narrower load
-/// from address + N / num of bits of new type. If the result is to be
-/// extended, also fold the extension to form a extending load.
+/// If the result of a wider load is shifted right by N bits and then
+/// truncated to a narrower type and where N is a multiple of number of bits of
+/// the narrower type, transform it to a narrower load from address + N / num of
+/// bits of new type. If the result is to be extended, also fold the extension
+/// to form an extending load.
 SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
   unsigned Opc = N->getOpcode();
 
@@ -5786,22 +5893,22 @@
   SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LN0),
                                PtrType, LN0->getBasePtr(),
                                DAG.getConstant(PtrOff, PtrType));
-  AddToWorkList(NewPtr.getNode());
+  AddToWorklist(NewPtr.getNode());
 
   SDValue Load;
   if (ExtType == ISD::NON_EXTLOAD)
     Load =  DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
                         LN0->getPointerInfo().getWithOffset(PtrOff),
                         LN0->isVolatile(), LN0->isNonTemporal(),
-                        LN0->isInvariant(), NewAlign, LN0->getTBAAInfo());
+                        LN0->isInvariant(), NewAlign, LN0->getAAInfo());
   else
     Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(),NewPtr,
                           LN0->getPointerInfo().getWithOffset(PtrOff),
                           ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
-                          NewAlign, LN0->getTBAAInfo());
+                          LN0->isInvariant(), NewAlign, LN0->getAAInfo());
 
   // Replace the old load's chain with the new load's chain.
-  WorkListRemover DeadNodes(*this);
+  WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
 
   // Shift the result left, if we've swallowed a left shift.
@@ -5900,7 +6007,7 @@
                                      LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
-    AddToWorkList(ExtLoad.getNode());
+    AddToWorklist(ExtLoad.getNode());
     return SDValue(N, 0);   // Return N so it doesn't get rechecked!
   }
   // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
@@ -6022,6 +6129,19 @@
     }
   }
 
+  // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
+  if (N0.getOpcode() == ISD::SELECT) {
+    EVT SrcVT = N0.getValueType();
+    if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
+        TLI.isTruncateFree(SrcVT, VT)) {
+      SDLoc SL(N0);
+      SDValue Cond = N0.getOperand(0);
+      SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
+      SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
+      return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
+    }
+  }
+
   // Fold a series of buildvector, bitcast, and truncate if possible.
   // For example fold
   //   (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
@@ -6121,7 +6241,7 @@
           continue;
         }
         SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
-        AddToWorkList(NV.getNode());
+        AddToWorklist(NV.getNode());
         Opnds.push_back(NV);
       }
       return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
@@ -6143,7 +6263,7 @@
   return Elt.getOperand(Elt.getResNo()).getNode();
 }
 
-/// CombineConsecutiveLoads - build_pair (load, load) -> load
+/// build_pair (load, load) -> load
 /// if load locations are consecutive.
 SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
   assert(N->getOpcode() == ISD::BUILD_PAIR);
@@ -6209,7 +6329,7 @@
       // Ideally this won't happen very often, because instcombine
       // and the earlier dagcombine runs (where illegal nodes are
       // permitted) should have folded most of them already.
-      DAG.DeleteNode(Res.getNode());
+      deleteAndRecombine(Res.getNode());
     }
   }
 
@@ -6238,12 +6358,8 @@
                                  LN0->getBasePtr(), LN0->getPointerInfo(),
                                  LN0->isVolatile(), LN0->isNonTemporal(),
                                  LN0->isInvariant(), OrigAlign,
-                                 LN0->getTBAAInfo());
-      AddToWorkList(N);
-      CombineTo(N0.getNode(),
-                DAG.getNode(ISD::BITCAST, SDLoc(N0),
-                            N0.getValueType(), Load),
-                Load.getValue(1));
+                                 LN0->getAAInfo());
+      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
       return Load;
     }
   }
@@ -6257,7 +6373,7 @@
       !VT.isVector() && !N0.getValueType().isVector()) {
     SDValue NewConv = DAG.getNode(ISD::BITCAST, SDLoc(N0), VT,
                                   N0.getOperand(0));
-    AddToWorkList(NewConv.getNode());
+    AddToWorklist(NewConv.getNode());
 
     APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
     if (N0.getOpcode() == ISD::FNEG)
@@ -6280,34 +6396,34 @@
     if (isTypeLegal(IntXVT)) {
       SDValue X = DAG.getNode(ISD::BITCAST, SDLoc(N0),
                               IntXVT, N0.getOperand(1));
-      AddToWorkList(X.getNode());
+      AddToWorklist(X.getNode());
 
       // If X has a different width than the result/lhs, sext it or truncate it.
       unsigned VTWidth = VT.getSizeInBits();
       if (OrigXWidth < VTWidth) {
         X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
-        AddToWorkList(X.getNode());
+        AddToWorklist(X.getNode());
       } else if (OrigXWidth > VTWidth) {
         // To get the sign bit in the right place, we have to shift it right
         // before truncating.
         X = DAG.getNode(ISD::SRL, SDLoc(X),
                         X.getValueType(), X,
                         DAG.getConstant(OrigXWidth-VTWidth, X.getValueType()));
-        AddToWorkList(X.getNode());
+        AddToWorklist(X.getNode());
         X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
-        AddToWorkList(X.getNode());
+        AddToWorklist(X.getNode());
       }
 
       APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
       X = DAG.getNode(ISD::AND, SDLoc(X), VT,
                       X, DAG.getConstant(SignBit, VT));
-      AddToWorkList(X.getNode());
+      AddToWorklist(X.getNode());
 
       SDValue Cst = DAG.getNode(ISD::BITCAST, SDLoc(N0),
                                 VT, N0.getOperand(0));
       Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
                         Cst, DAG.getConstant(~SignBit, VT));
-      AddToWorkList(Cst.getNode());
+      AddToWorklist(Cst.getNode());
 
       return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
     }
@@ -6328,9 +6444,8 @@
   return CombineConsecutiveLoads(N, VT);
 }
 
-/// ConstantFoldBITCASTofBUILD_VECTOR - We know that BV is a build_vector
-/// node with Constant, ConstantFP or Undef operands.  DstEltVT indicates the
-/// destination element value type.
+/// We know that BV is a build_vector node with Constant, ConstantFP or Undef
+/// operands. DstEltVT indicates the destination element value type.
 SDValue DAGCombiner::
 ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
   EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
@@ -6363,7 +6478,7 @@
         Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
       Ops.push_back(DAG.getNode(ISD::BITCAST, SDLoc(BV),
                                 DstEltVT, Op));
-      AddToWorkList(Ops.back().getNode());
+      AddToWorklist(Ops.back().getNode());
     }
     return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops);
   }
@@ -6466,6 +6581,7 @@
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
+  const TargetOptions &Options = DAG.getTarget().Options;
 
   // fold vector ops
   if (VT.isVector()) {
@@ -6476,193 +6592,143 @@
   // fold (fadd c1, c2) -> c1 + c2
   if (N0CFP && N1CFP)
     return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N1);
+
   // canonicalize constant to RHS
   if (N0CFP && !N1CFP)
     return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N0);
-  // fold (fadd A, 0) -> A
-  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
-      N1CFP->getValueAPF().isZero())
-    return N0;
+
   // fold (fadd A, (fneg B)) -> (fsub A, B)
   if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
-    isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
+      isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2)
     return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0,
                        GetNegatedExpression(N1, DAG, LegalOperations));
+
   // fold (fadd (fneg A), B) -> (fsub B, A)
   if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
-    isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
+      isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2)
     return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N1,
                        GetNegatedExpression(N0, DAG, LegalOperations));
 
-  // If allowed, fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
-  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
-      N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() &&
-      isa<ConstantFPSDNode>(N0.getOperand(1)))
-    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0.getOperand(0),
-                       DAG.getNode(ISD::FADD, SDLoc(N), VT,
-                                   N0.getOperand(1), N1));
+  // If 'unsafe math' is enabled, fold lots of things.
+  if (Options.UnsafeFPMath) {
+    // No FP constant should be created after legalization as Instruction
+    // Selection pass has a hard time dealing with FP constants.
+    bool AllowNewConst = (Level < AfterLegalizeDAG);
 
-  // No FP constant should be created after legalization as Instruction
-  // Selection pass has hard time in dealing with FP constant.
-  //
-  // We don't need test this condition for transformation like following, as
-  // the DAG being transformed implies it is legal to take FP constant as
-  // operand.
-  //
-  //  (fadd (fmul c, x), x) -> (fmul c+1, x)
-  //
-  bool AllowNewFpConst = (Level < AfterLegalizeDAG);
+    // fold (fadd A, 0) -> A
+    if (N1CFP && N1CFP->getValueAPF().isZero())
+      return N0;
 
-  // If allow, fold (fadd (fneg x), x) -> 0.0
-  if (AllowNewFpConst && DAG.getTarget().Options.UnsafeFPMath &&
-      N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
-    return DAG.getConstantFP(0.0, VT);
+    // fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
+    if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() &&
+        isa<ConstantFPSDNode>(N0.getOperand(1)))
+      return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0.getOperand(0),
+                         DAG.getNode(ISD::FADD, SDLoc(N), VT,
+                                     N0.getOperand(1), N1));
 
-    // If allow, fold (fadd x, (fneg x)) -> 0.0
-  if (AllowNewFpConst && DAG.getTarget().Options.UnsafeFPMath &&
-      N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
-    return DAG.getConstantFP(0.0, VT);
+    // If allowed, fold (fadd (fneg x), x) -> 0.0
+    if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
+      return DAG.getConstantFP(0.0, VT);
 
-  // In unsafe math mode, we can fold chains of FADD's of the same value
-  // into multiplications.  This transform is not safe in general because
-  // we are reducing the number of rounding steps.
-  if (DAG.getTarget().Options.UnsafeFPMath &&
-      TLI.isOperationLegalOrCustom(ISD::FMUL, VT) &&
-      !N0CFP && !N1CFP) {
-    if (N0.getOpcode() == ISD::FMUL) {
-      ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
-      ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+    // If allowed, fold (fadd x, (fneg x)) -> 0.0
+    if (AllowNewConst && N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
+      return DAG.getConstantFP(0.0, VT);
 
-      // (fadd (fmul c, x), x) -> (fmul x, c+1)
-      if (CFP00 && !CFP01 && N0.getOperand(1) == N1) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
-                                     SDValue(CFP00, 0),
-                                     DAG.getConstantFP(1.0, VT));
-        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N1, NewCFP);
+    // We can fold chains of FADD's of the same value into multiplications.
+    // This transform is not safe in general because we are reducing the number
+    // of rounding steps.
+    if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
+      if (N0.getOpcode() == ISD::FMUL) {
+        ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
+        ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+
+        // (fadd (fmul x, c), x) -> (fmul x, c+1)
+        if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
+          SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
+                                       SDValue(CFP01, 0),
+                                       DAG.getConstantFP(1.0, VT));
+          return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1, NewCFP);
+        }
+
+        // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
+        if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
+            N1.getOperand(0) == N1.getOperand(1) &&
+            N0.getOperand(0) == N1.getOperand(0)) {
+          SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
+                                       SDValue(CFP01, 0),
+                                       DAG.getConstantFP(2.0, VT));
+          return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
+                             N0.getOperand(0), NewCFP);
+        }
       }
 
-      // (fadd (fmul x, c), x) -> (fmul x, c+1)
-      if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
-                                     SDValue(CFP01, 0),
-                                     DAG.getConstantFP(1.0, VT));
-        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N1, NewCFP);
+      if (N1.getOpcode() == ISD::FMUL) {
+        ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
+        ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1));
+
+        // (fadd x, (fmul x, c)) -> (fmul x, c+1)
+        if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
+          SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
+                                       SDValue(CFP11, 0),
+                                       DAG.getConstantFP(1.0, VT));
+          return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, NewCFP);
+        }
+
+        // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
+        if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
+            N0.getOperand(0) == N0.getOperand(1) &&
+            N1.getOperand(0) == N0.getOperand(0)) {
+          SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
+                                       SDValue(CFP11, 0),
+                                       DAG.getConstantFP(2.0, VT));
+          return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1.getOperand(0), NewCFP);
+        }
       }
 
-      // (fadd (fmul c, x), (fadd x, x)) -> (fmul x, c+2)
-      if (CFP00 && !CFP01 && N1.getOpcode() == ISD::FADD &&
-          N1.getOperand(0) == N1.getOperand(1) &&
-          N0.getOperand(1) == N1.getOperand(0)) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
-                                     SDValue(CFP00, 0),
-                                     DAG.getConstantFP(2.0, VT));
-        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N0.getOperand(1), NewCFP);
+      if (N0.getOpcode() == ISD::FADD && AllowNewConst) {
+        ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
+        // (fadd (fadd x, x), x) -> (fmul x, 3.0)
+        if (!CFP && N0.getOperand(0) == N0.getOperand(1) &&
+            (N0.getOperand(0) == N1))
+          return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
+                             N1, DAG.getConstantFP(3.0, VT));
       }
 
-      // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
-      if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
-          N1.getOperand(0) == N1.getOperand(1) &&
-          N0.getOperand(0) == N1.getOperand(0)) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
-                                     SDValue(CFP01, 0),
-                                     DAG.getConstantFP(2.0, VT));
-        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N0.getOperand(0), NewCFP);
-      }
-    }
-
-    if (N1.getOpcode() == ISD::FMUL) {
-      ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
-      ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1));
-
-      // (fadd x, (fmul c, x)) -> (fmul x, c+1)
-      if (CFP10 && !CFP11 && N1.getOperand(1) == N0) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
-                                     SDValue(CFP10, 0),
-                                     DAG.getConstantFP(1.0, VT));
-        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N0, NewCFP);
+      if (N1.getOpcode() == ISD::FADD && AllowNewConst) {
+        ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
+        // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
+        if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
+            N1.getOperand(0) == N0)
+          return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
+                             N0, DAG.getConstantFP(3.0, VT));
       }
 
-      // (fadd x, (fmul x, c)) -> (fmul x, c+1)
-      if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
-                                     SDValue(CFP11, 0),
-                                     DAG.getConstantFP(1.0, VT));
-        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N0, NewCFP);
-      }
-
-
-      // (fadd (fadd x, x), (fmul c, x)) -> (fmul x, c+2)
-      if (CFP10 && !CFP11 && N0.getOpcode() == ISD::FADD &&
+      // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
+      if (AllowNewConst &&
+          N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
           N0.getOperand(0) == N0.getOperand(1) &&
-          N1.getOperand(1) == N0.getOperand(0)) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
-                                     SDValue(CFP10, 0),
-                                     DAG.getConstantFP(2.0, VT));
+          N1.getOperand(0) == N1.getOperand(1) &&
+          N0.getOperand(0) == N1.getOperand(0))
         return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N1.getOperand(1), NewCFP);
-      }
-
-      // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
-      if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
-          N0.getOperand(0) == N0.getOperand(1) &&
-          N1.getOperand(0) == N0.getOperand(0)) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
-                                     SDValue(CFP11, 0),
-                                     DAG.getConstantFP(2.0, VT));
-        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N1.getOperand(0), NewCFP);
-      }
+                           N0.getOperand(0), DAG.getConstantFP(4.0, VT));
     }
-
-    if (N0.getOpcode() == ISD::FADD && AllowNewFpConst) {
-      ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
-      // (fadd (fadd x, x), x) -> (fmul x, 3.0)
-      if (!CFP && N0.getOperand(0) == N0.getOperand(1) &&
-          (N0.getOperand(0) == N1))
-        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N1, DAG.getConstantFP(3.0, VT));
-    }
-
-    if (N1.getOpcode() == ISD::FADD && AllowNewFpConst) {
-      ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
-      // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
-      if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
-          N1.getOperand(0) == N0)
-        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                           N0, DAG.getConstantFP(3.0, VT));
-    }
-
-    // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
-    if (AllowNewFpConst &&
-        N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
-        N0.getOperand(0) == N0.getOperand(1) &&
-        N1.getOperand(0) == N1.getOperand(1) &&
-        N0.getOperand(0) == N1.getOperand(0))
-      return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                         N0.getOperand(0),
-                         DAG.getConstantFP(4.0, VT));
-  }
+  } // enable-unsafe-fp-math
 
   // FADD -> FMA combines:
-  if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
-       DAG.getTarget().Options.UnsafeFPMath) &&
-      DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) &&
+  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
+      TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
     // fold (fadd (fmul x, y), z) -> (fma x, y, z)
-    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
+    if (N0.getOpcode() == ISD::FMUL &&
+        (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                          N0.getOperand(0), N0.getOperand(1), N1);
 
     // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
     // Note: Commutes FADD operands.
-    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse())
+    if (N1.getOpcode() == ISD::FMUL &&
+        (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
       return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                          N1.getOperand(0), N1.getOperand(1), N0);
   }
@@ -6673,10 +6739,11 @@
 SDValue DAGCombiner::visitFSUB(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
-  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
+  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
+  const TargetOptions &Options = DAG.getTarget().Options;
 
   // fold vector ops
   if (VT.isVector()) {
@@ -6687,60 +6754,60 @@
   // fold (fsub c1, c2) -> c1-c2
   if (N0CFP && N1CFP)
     return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0, N1);
-  // fold (fsub A, 0) -> A
-  if (DAG.getTarget().Options.UnsafeFPMath &&
-      N1CFP && N1CFP->getValueAPF().isZero())
-    return N0;
-  // fold (fsub 0, B) -> -B
-  if (DAG.getTarget().Options.UnsafeFPMath &&
-      N0CFP && N0CFP->getValueAPF().isZero()) {
-    if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options))
-      return GetNegatedExpression(N1, DAG, LegalOperations);
-    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
-      return DAG.getNode(ISD::FNEG, dl, VT, N1);
-  }
+
   // fold (fsub A, (fneg B)) -> (fadd A, B)
-  if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options))
+  if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
     return DAG.getNode(ISD::FADD, dl, VT, N0,
                        GetNegatedExpression(N1, DAG, LegalOperations));
 
-  // If 'unsafe math' is enabled, fold
-  //    (fsub x, x) -> 0.0 &
-  //    (fsub x, (fadd x, y)) -> (fneg y) &
-  //    (fsub x, (fadd y, x)) -> (fneg y)
-  if (DAG.getTarget().Options.UnsafeFPMath) {
+  // If 'unsafe math' is enabled, fold lots of things.
+  if (Options.UnsafeFPMath) {
+    // (fsub A, 0) -> A
+    if (N1CFP && N1CFP->getValueAPF().isZero())
+      return N0;
+
+    // (fsub 0, B) -> -B
+    if (N0CFP && N0CFP->getValueAPF().isZero()) {
+      if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
+        return GetNegatedExpression(N1, DAG, LegalOperations);
+      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+        return DAG.getNode(ISD::FNEG, dl, VT, N1);
+    }
+
+    // (fsub x, x) -> 0.0
     if (N0 == N1)
       return DAG.getConstantFP(0.0f, VT);
 
+    // (fsub x, (fadd x, y)) -> (fneg y)
+    // (fsub x, (fadd y, x)) -> (fneg y)
     if (N1.getOpcode() == ISD::FADD) {
       SDValue N10 = N1->getOperand(0);
       SDValue N11 = N1->getOperand(1);
 
-      if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI,
-                                          &DAG.getTarget().Options))
+      if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI, &Options))
         return GetNegatedExpression(N11, DAG, LegalOperations);
 
-      if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI,
-                                          &DAG.getTarget().Options))
+      if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI, &Options))
         return GetNegatedExpression(N10, DAG, LegalOperations);
     }
   }
 
   // FSUB -> FMA combines:
-  if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
-       DAG.getTarget().Options.UnsafeFPMath) &&
-      DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) &&
+  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
+      TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
     // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
+    if (N0.getOpcode() == ISD::FMUL &&
+        (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
       return DAG.getNode(ISD::FMA, dl, VT,
                          N0.getOperand(0), N0.getOperand(1),
                          DAG.getNode(ISD::FNEG, dl, VT, N1));
 
     // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
     // Note: Commutes FSUB operands.
-    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse())
+    if (N1.getOpcode() == ISD::FMUL &&
+        (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
       return DAG.getNode(ISD::FMA, dl, VT,
                          DAG.getNode(ISD::FNEG, dl, VT,
                          N1.getOperand(0)),
@@ -6749,7 +6816,8 @@
     // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
     if (N0.getOpcode() == ISD::FNEG &&
         N0.getOperand(0).getOpcode() == ISD::FMUL &&
-        N0->hasOneUse() && N0.getOperand(0).hasOneUse()) {
+        ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) ||
+            TLI.enableAggressiveFMAFusion(VT))) {
       SDValue N00 = N0.getOperand(0).getOperand(0);
       SDValue N01 = N0.getOperand(0).getOperand(1);
       return DAG.getNode(ISD::FMA, dl, VT,
@@ -6764,47 +6832,82 @@
 SDValue DAGCombiner::visitFMUL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
-  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
+  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
   EVT VT = N->getValueType(0);
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const TargetOptions &Options = DAG.getTarget().Options;
 
   // fold vector ops
   if (VT.isVector()) {
+    // This just handles C1 * C2 for vectors. Other vector folds are below.
     SDValue FoldedVOp = SimplifyVBinOp(N);
-    if (FoldedVOp.getNode()) return FoldedVOp;
+    if (FoldedVOp.getNode())
+      return FoldedVOp;
+    // Canonicalize vector constant to RHS.
+    if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+        N1.getOpcode() != ISD::BUILD_VECTOR)
+      if (auto *BV0 = dyn_cast<BuildVectorSDNode>(N0))
+        if (BV0->isConstant())
+          return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
   }
 
   // fold (fmul c1, c2) -> c1*c2
   if (N0CFP && N1CFP)
     return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, N1);
+
   // canonicalize constant to RHS
   if (N0CFP && !N1CFP)
     return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1, N0);
-  // fold (fmul A, 0) -> 0
-  if (DAG.getTarget().Options.UnsafeFPMath &&
-      N1CFP && N1CFP->getValueAPF().isZero())
-    return N1;
-  // fold (fmul A, 0) -> 0, vector edition.
-  if (DAG.getTarget().Options.UnsafeFPMath &&
-      ISD::isBuildVectorAllZeros(N1.getNode()))
-    return N1;
+
   // fold (fmul A, 1.0) -> A
   if (N1CFP && N1CFP->isExactlyValue(1.0))
     return N0;
+
+  if (Options.UnsafeFPMath) {
+    // fold (fmul A, 0) -> 0
+    if (N1CFP && N1CFP->getValueAPF().isZero())
+      return N1;
+
+    // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
+    if (N0.getOpcode() == ISD::FMUL) {
+      // Fold scalars or any vector constants (not just splats).
+      // This fold is done in general by InstCombine, but extra fmul insts
+      // may have been generated during lowering.
+      SDValue N01 = N0.getOperand(1);
+      auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+      auto *BV01 = dyn_cast<BuildVectorSDNode>(N01);
+      if ((N1CFP && isConstOrConstSplatFP(N01)) ||
+          (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) {
+        SDLoc SL(N);
+        SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, N01, N1);
+        return DAG.getNode(ISD::FMUL, SL, VT, N0.getOperand(0), MulConsts);
+      }
+    }
+
+    // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c))
+    // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs
+    // during an early run of DAGCombiner can prevent folding with fmuls
+    // inserted during lowering.
+    if (N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1)) {
+      SDLoc SL(N);
+      const SDValue Two = DAG.getConstantFP(2.0, VT);
+      SDValue MulConsts = DAG.getNode(ISD::FMUL, SL, VT, Two, N1);
+      return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), MulConsts);
+    }
+  }
+
   // fold (fmul X, 2.0) -> (fadd X, X)
   if (N1CFP && N1CFP->isExactlyValue(+2.0))
     return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N0);
+
   // fold (fmul X, -1.0) -> (fneg X)
   if (N1CFP && N1CFP->isExactlyValue(-1.0))
     if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
       return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
 
   // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
-  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI,
-                                       &DAG.getTarget().Options)) {
-    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI,
-                                         &DAG.getTarget().Options)) {
+  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
+    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
       // Both can be negated for free, check to see if at least one is cheaper
       // negated.
       if (LHSNeg == 2 || RHSNeg == 2)
@@ -6814,14 +6917,6 @@
     }
   }
 
-  // If allowed, fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
-  if (DAG.getTarget().Options.UnsafeFPMath &&
-      N1CFP && N0.getOpcode() == ISD::FMUL &&
-      N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(1)))
-    return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
-                       DAG.getNode(ISD::FMUL, SDLoc(N), VT,
-                                   N0.getOperand(1), N1));
-
   return SDValue();
 }
 
@@ -6833,8 +6928,16 @@
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
+  const TargetOptions &Options = DAG.getTarget().Options;
 
-  if (DAG.getTarget().Options.UnsafeFPMath) {
+  // Constant fold FMA.
+  if (isa<ConstantFPSDNode>(N0) &&
+      isa<ConstantFPSDNode>(N1) &&
+      isa<ConstantFPSDNode>(N2)) {
+    return DAG.getNode(ISD::FMA, dl, VT, N0, N1, N2);
+  }
+
+  if (Options.UnsafeFPMath) {
     if (N0CFP && N0CFP->isZero())
       return N2;
     if (N1CFP && N1CFP->isZero())
@@ -6850,7 +6953,7 @@
     return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);
 
   // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
-  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+  if (Options.UnsafeFPMath && N1CFP &&
       N2.getOpcode() == ISD::FMUL &&
       N0 == N2.getOperand(0) &&
       N2.getOperand(1).getOpcode() == ISD::ConstantFP) {
@@ -6860,7 +6963,7 @@
 
 
   // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
-  if (DAG.getTarget().Options.UnsafeFPMath &&
+  if (Options.UnsafeFPMath &&
       N0.getOpcode() == ISD::FMUL && N1CFP &&
       N0.getOperand(1).getOpcode() == ISD::ConstantFP) {
     return DAG.getNode(ISD::FMA, dl, VT,
@@ -6878,19 +6981,19 @@
     if (N1CFP->isExactlyValue(-1.0) &&
         (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
       SDValue RHSNeg = DAG.getNode(ISD::FNEG, dl, VT, N0);
-      AddToWorkList(RHSNeg.getNode());
+      AddToWorklist(RHSNeg.getNode());
       return DAG.getNode(ISD::FADD, dl, VT, N2, RHSNeg);
     }
   }
 
   // (fma x, c, x) -> (fmul x, (c+1))
-  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && N0 == N2)
+  if (Options.UnsafeFPMath && N1CFP && N0 == N2)
     return DAG.getNode(ISD::FMUL, dl, VT, N0,
                        DAG.getNode(ISD::FADD, dl, VT,
                                    N1, DAG.getConstantFP(1.0, VT)));
 
   // (fma x, c, (fneg x)) -> (fmul x, (c-1))
-  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+  if (Options.UnsafeFPMath && N1CFP &&
       N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0)
     return DAG.getNode(ISD::FMUL, dl, VT, N0,
                        DAG.getNode(ISD::FADD, dl, VT,
@@ -6906,7 +7009,8 @@
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDLoc DL(N);
+  const TargetOptions &Options = DAG.getTarget().Options;
 
   // fold vector ops
   if (VT.isVector()) {
@@ -6918,30 +7022,79 @@
   if (N0CFP && N1CFP)
     return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1);
 
-  // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
-  if (N1CFP && DAG.getTarget().Options.UnsafeFPMath) {
-    // Compute the reciprocal 1.0 / c2.
-    APFloat N1APF = N1CFP->getValueAPF();
-    APFloat Recip(N1APF.getSemantics(), 1); // 1.0
-    APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
-    // Only do the transform if the reciprocal is a legal fp immediate that
-    // isn't too nasty (eg NaN, denormal, ...).
-    if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
-        (!LegalOperations ||
-         // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
-         // backend)... we should handle this gracefully after Legalize.
-         // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) ||
-         TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) ||
-         TLI.isFPImmLegal(Recip, VT)))
-      return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0,
-                         DAG.getConstantFP(Recip, VT));
+  if (Options.UnsafeFPMath) {
+    // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
+    if (N1CFP) {
+      // Compute the reciprocal 1.0 / c2.
+      APFloat N1APF = N1CFP->getValueAPF();
+      APFloat Recip(N1APF.getSemantics(), 1); // 1.0
+      APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
+      // Only do the transform if the reciprocal is a legal fp immediate that
+      // isn't too nasty (eg NaN, denormal, ...).
+      if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
+          (!LegalOperations ||
+           // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
+           // backend)... we should handle this gracefully after Legalize.
+           // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) ||
+           TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) ||
+           TLI.isFPImmLegal(Recip, VT)))
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0,
+                           DAG.getConstantFP(Recip, VT));
+    }
+
+    // If this FDIV is part of a reciprocal square root, it may be folded
+    // into a target-specific square root estimate instruction.
+    if (N1.getOpcode() == ISD::FSQRT) {
+      if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0))) {
+        return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
+      }
+    } else if (N1.getOpcode() == ISD::FP_EXTEND &&
+               N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+      if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) {
+        RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
+        AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
+      }
+    } else if (N1.getOpcode() == ISD::FP_ROUND &&
+               N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+      if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0))) {
+        RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
+        AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
+      }
+    } else if (N1.getOpcode() == ISD::FMUL) {
+      // Look through an FMUL. Even though this won't remove the FDIV directly,
+      // it's still worthwhile to get rid of the FSQRT if possible.
+      SDValue SqrtOp;
+      SDValue OtherOp;
+      if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+        SqrtOp = N1.getOperand(0);
+        OtherOp = N1.getOperand(1);
+      } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
+        SqrtOp = N1.getOperand(1);
+        OtherOp = N1.getOperand(0);
+      }
+      if (SqrtOp.getNode()) {
+        // We found a FSQRT, so try to make this fold:
+        // x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
+        if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0))) {
+          RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp);
+          AddToWorklist(RV.getNode());
+          return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
+        }
+      }
+    }
+
+    // Fold into a reciprocal estimate and multiply instead of a real divide.
+    if (SDValue RV = BuildReciprocalEstimate(N1)) {
+      AddToWorklist(RV.getNode());
+      return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
+    }
   }
 
   // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
-  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI,
-                                       &DAG.getTarget().Options)) {
-    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI,
-                                         &DAG.getTarget().Options)) {
+  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
+    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
       // Both can be negated for free, check to see if at least one is cheaper
       // negated.
       if (LHSNeg == 2 || RHSNeg == 2)
@@ -6968,6 +7121,31 @@
   return SDValue();
 }
 
+SDValue DAGCombiner::visitFSQRT(SDNode *N) {
+  if (DAG.getTarget().Options.UnsafeFPMath) {
+    // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
+    if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {
+      EVT VT = RV.getValueType();
+      RV = DAG.getNode(ISD::FMUL, SDLoc(N), VT, N->getOperand(0), RV);
+      AddToWorklist(RV.getNode());
+
+      // Unfortunately, RV is now NaN if the input was exactly 0.
+      // Select out this case and force the answer to 0.
+      SDValue Zero = DAG.getConstantFP(0.0, VT);
+      SDValue ZeroCmp =
+        DAG.getSetCC(SDLoc(N), TLI.getSetCCResultType(*DAG.getContext(), VT),
+                     N->getOperand(0), Zero, ISD::SETEQ);
+      AddToWorklist(ZeroCmp.getNode());
+      AddToWorklist(RV.getNode());
+
+      RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT,
+                       SDLoc(N), VT, ZeroCmp, Zero, RV);
+      return RV;
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -7162,7 +7340,7 @@
   if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
     SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
                               N0.getOperand(0), N1);
-    AddToWorkList(Tmp.getNode());
+    AddToWorklist(Tmp.getNode());
     return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
                        Tmp, N0.getOperand(1));
   }
@@ -7213,8 +7391,7 @@
 
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
+       TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
@@ -7231,54 +7408,6 @@
   return SDValue();
 }
 
-SDValue DAGCombiner::visitFNEG(SDNode *N) {
-  SDValue N0 = N->getOperand(0);
-  EVT VT = N->getValueType(0);
-
-  if (VT.isVector()) {
-    SDValue FoldedVOp = SimplifyVUnaryOp(N);
-    if (FoldedVOp.getNode()) return FoldedVOp;
-  }
-
-  if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
-                         &DAG.getTarget().Options))
-    return GetNegatedExpression(N0, DAG, LegalOperations);
-
-  // Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading
-  // constant pool values.
-  if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST &&
-      !VT.isVector() &&
-      N0.getNode()->hasOneUse() &&
-      N0.getOperand(0).getValueType().isInteger()) {
-    SDValue Int = N0.getOperand(0);
-    EVT IntVT = Int.getValueType();
-    if (IntVT.isInteger() && !IntVT.isVector()) {
-      Int = DAG.getNode(ISD::XOR, SDLoc(N0), IntVT, Int,
-              DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
-      AddToWorkList(Int.getNode());
-      return DAG.getNode(ISD::BITCAST, SDLoc(N),
-                         VT, Int);
-    }
-  }
-
-  // (fneg (fmul c, x)) -> (fmul -c, x)
-  if (N0.getOpcode() == ISD::FMUL) {
-    ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
-    if (CFP1) {
-      APFloat CVal = CFP1->getValueAPF();
-      CVal.changeSign();
-      if (Level >= AfterLegalizeDAG &&
-          (TLI.isFPImmLegal(CVal, N->getValueType(0)) ||
-           TLI.isOperationLegal(ISD::ConstantFP, N->getValueType(0))))
-        return DAG.getNode(
-            ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
-            DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)));
-    }
-  }
-
-  return SDValue();
-}
-
 SDValue DAGCombiner::visitFCEIL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
@@ -7315,9 +7444,111 @@
   return SDValue();
 }
 
+// FIXME: FNEG and FABS have a lot in common; refactor.
+SDValue DAGCombiner::visitFNEG(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVUnaryOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // Constant fold FNEG.
+  if (isa<ConstantFPSDNode>(N0))
+    return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N->getOperand(0));
+
+  if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
+                         &DAG.getTarget().Options))
+    return GetNegatedExpression(N0, DAG, LegalOperations);
+
+  // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
+  // constant pool values.
+  if (!TLI.isFNegFree(VT) &&
+      N0.getOpcode() == ISD::BITCAST &&
+      N0.getNode()->hasOneUse()) {
+    SDValue Int = N0.getOperand(0);
+    EVT IntVT = Int.getValueType();
+    if (IntVT.isInteger() && !IntVT.isVector()) {
+      APInt SignMask;
+      if (N0.getValueType().isVector()) {
+        // For a vector, get a mask such as 0x80... per scalar element
+        // and splat it.
+        SignMask = APInt::getSignBit(N0.getValueType().getScalarSizeInBits());
+        SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
+      } else {
+        // For a scalar, just generate 0x80...
+        SignMask = APInt::getSignBit(IntVT.getSizeInBits());
+      }
+      Int = DAG.getNode(ISD::XOR, SDLoc(N0), IntVT, Int,
+                        DAG.getConstant(SignMask, IntVT));
+      AddToWorklist(Int.getNode());
+      return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Int);
+    }
+  }
+
+  // (fneg (fmul c, x)) -> (fmul -c, x)
+  if (N0.getOpcode() == ISD::FMUL) {
+    ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+    if (CFP1) {
+      APFloat CVal = CFP1->getValueAPF();
+      CVal.changeSign();
+      if (Level >= AfterLegalizeDAG &&
+          (TLI.isFPImmLegal(CVal, N->getValueType(0)) ||
+           TLI.isOperationLegal(ISD::ConstantFP, N->getValueType(0))))
+        return DAG.getNode(
+            ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
+            DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+
+  if (N0CFP && N1CFP) {
+    const APFloat &C0 = N0CFP->getValueAPF();
+    const APFloat &C1 = N1CFP->getValueAPF();
+    return DAG.getConstantFP(minnum(C0, C1), N->getValueType(0));
+  }
+
+  if (N0CFP) {
+    EVT VT = N->getValueType(0);
+    // Canonicalize to constant on RHS.
+    return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+
+  if (N0CFP && N1CFP) {
+    const APFloat &C0 = N0CFP->getValueAPF();
+    const APFloat &C1 = N1CFP->getValueAPF();
+    return DAG.getConstantFP(maxnum(C0, C1), N->getValueType(0));
+  }
+
+  if (N0CFP) {
+    EVT VT = N->getValueType(0);
+    // Canonicalize to constant on RHS.
+    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0);
+  }
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFABS(SDNode *N) {
   SDValue N0 = N->getOperand(0);
-  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   EVT VT = N->getValueType(0);
 
   if (VT.isVector()) {
@@ -7326,30 +7557,40 @@
   }
 
   // fold (fabs c1) -> fabs(c1)
-  if (N0CFP)
+  if (isa<ConstantFPSDNode>(N0))
     return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
+
   // fold (fabs (fabs x)) -> (fabs x)
   if (N0.getOpcode() == ISD::FABS)
     return N->getOperand(0);
+
   // fold (fabs (fneg x)) -> (fabs x)
   // fold (fabs (fcopysign x, y)) -> (fabs x)
   if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
     return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));
 
-  // Transform fabs(bitconvert(x)) -> bitconvert(x&~sign) to avoid loading
+  // Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading
   // constant pool values.
   if (!TLI.isFAbsFree(VT) &&
-      N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse() &&
-      N0.getOperand(0).getValueType().isInteger() &&
-      !N0.getOperand(0).getValueType().isVector()) {
+      N0.getOpcode() == ISD::BITCAST &&
+      N0.getNode()->hasOneUse()) {
     SDValue Int = N0.getOperand(0);
     EVT IntVT = Int.getValueType();
     if (IntVT.isInteger() && !IntVT.isVector()) {
+      APInt SignMask;
+      if (N0.getValueType().isVector()) {
+        // For a vector, get a mask such as 0x7f... per scalar element
+        // and splat it.
+        SignMask = ~APInt::getSignBit(N0.getValueType().getScalarSizeInBits());
+        SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
+      } else {
+        // For a scalar, just generate 0x7f...
+        SignMask = ~APInt::getSignBit(IntVT.getSizeInBits());
+      }
       Int = DAG.getNode(ISD::AND, SDLoc(N0), IntVT, Int,
-             DAG.getConstant(~APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
-      AddToWorkList(Int.getNode());
-      return DAG.getNode(ISD::BITCAST, SDLoc(N),
-                         N->getValueType(0), Int);
+                        DAG.getConstant(SignMask, IntVT));
+      AddToWorklist(Int.getNode());
+      return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Int);
     }
   }
 
@@ -7429,15 +7670,12 @@
           // will convert it back to (X & C1) >> C2.
           CombineTo(N, NewBRCond, false);
           // Truncate is dead.
-          if (Trunc) {
-            removeFromWorkList(Trunc);
-            DAG.DeleteNode(Trunc);
-          }
+          if (Trunc)
+            deleteAndRecombine(Trunc);
           // Replace the uses of SRL with SETCC
-          WorkListRemover DeadNodes(*this);
+          WorklistRemover DeadNodes(*this);
           DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
-          removeFromWorkList(N1.getNode());
-          DAG.DeleteNode(N1.getNode());
+          deleteAndRecombine(N1.getNode());
           return SDValue(N, 0);   // Return N so it doesn't get rechecked!
         }
       }
@@ -7464,10 +7702,9 @@
                 dbgs() << "\nWith: ";
                 Tmp.getNode()->dump(&DAG);
                 dbgs() << '\n');
-          WorkListRemover DeadNodes(*this);
+          WorklistRemover DeadNodes(*this);
           DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
-          removeFromWorkList(TheXor);
-          DAG.DeleteNode(TheXor);
+          deleteAndRecombine(TheXor);
           return DAG.getNode(ISD::BRCOND, SDLoc(N),
                              MVT::Other, Chain, Tmp, N2);
         }
@@ -7495,10 +7732,9 @@
                                    Op0, Op1,
                                    Equal ? ISD::SETEQ : ISD::SETNE);
       // Replace the uses of XOR with SETCC
-      WorkListRemover DeadNodes(*this);
+      WorklistRemover DeadNodes(*this);
       DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
-      removeFromWorkList(N1.getNode());
-      DAG.DeleteNode(N1.getNode());
+      deleteAndRecombine(N1.getNode());
       return DAG.getNode(ISD::BRCOND, SDLoc(N),
                          MVT::Other, Chain, SetCC, N2);
     }
@@ -7523,7 +7759,7 @@
   SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()),
                                CondLHS, CondRHS, CC->get(), SDLoc(N),
                                false);
-  if (Simp.getNode()) AddToWorkList(Simp.getNode());
+  if (Simp.getNode()) AddToWorklist(Simp.getNode());
 
   // fold to a simpler setcc
   if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
@@ -7535,9 +7771,8 @@
   return SDValue();
 }
 
-/// canFoldInAddressingMode - Return true if 'Use' is a load or a store that
-/// uses N as its base pointer and that N may be folded in the load / store
-/// addressing mode.
+/// Return true if 'Use' is a load or a store that uses N as its base pointer
+/// and that N may be folded in the load / store addressing mode.
 static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
@@ -7576,12 +7811,11 @@
   return TLI.isLegalAddressingMode(AM, VT.getTypeForEVT(*DAG.getContext()));
 }
 
-/// CombineToPreIndexedLoadStore - Try turning a load / store into a
-/// pre-indexed load / store when the base pointer is an add or subtract
-/// and it has other uses besides the load / store. After the
-/// transformation, the new indexed load / store has effectively folded
-/// the add / subtract in and all of its other uses are redirected to the
-/// new load / store.
+/// Try turning a load/store into a pre-indexed load/store when the base
+/// pointer is an add or subtract and it has other uses besides the load/store.
+/// After the transformation, the new indexed load/store has effectively folded
+/// the add/subtract in and all of its other uses are redirected to the
+/// new load/store.
 bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
   if (Level < AfterLegalizeDAG)
     return false;
@@ -7733,7 +7967,7 @@
         dbgs() << "\nWith: ";
         Result.getNode()->dump(&DAG);
         dbgs() << '\n');
-  WorkListRemover DeadNodes(*this);
+  WorklistRemover DeadNodes(*this);
   if (isLoad) {
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
@@ -7742,7 +7976,7 @@
   }
 
   // Finally, since the node is now dead, remove it from the graph.
-  DAG.DeleteNode(N);
+  deleteAndRecombine(N);
 
   if (Swapped)
     std::swap(BasePtr, Offset);
@@ -7792,23 +8026,20 @@
                                  SDLoc(OtherUses[i]),
                                  OtherUses[i]->getValueType(0), NewOp1, NewOp2);
     DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
-    removeFromWorkList(OtherUses[i]);
-    DAG.DeleteNode(OtherUses[i]);
+    deleteAndRecombine(OtherUses[i]);
   }
 
   // Replace the uses of Ptr with uses of the updated base value.
   DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
-  removeFromWorkList(Ptr.getNode());
-  DAG.DeleteNode(Ptr.getNode());
+  deleteAndRecombine(Ptr.getNode());
 
   return true;
 }
 
-/// CombineToPostIndexedLoadStore - Try to combine a load / store with a
-/// add / sub of the base pointer node into a post-indexed load / store.
-/// The transformation folded the add / subtract into the new indexed
-/// load / store effectively and all of its uses are redirected to the
-/// new load / store.
+/// Try to combine a load/store with a add/sub of the base pointer node into a
+/// post-indexed load/store. The transformation folded the add/subtract into the
+/// new indexed load/store effectively and all of its uses are redirected to the
+/// new load/store.
 bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
   if (Level < AfterLegalizeDAG)
     return false;
@@ -7903,7 +8134,7 @@
               dbgs() << "\nWith: ";
               Result.getNode()->dump(&DAG);
               dbgs() << '\n');
-        WorkListRemover DeadNodes(*this);
+        WorklistRemover DeadNodes(*this);
         if (isLoad) {
           DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
           DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
@@ -7912,13 +8143,12 @@
         }
 
         // Finally, since the node is now dead, remove it from the graph.
-        DAG.DeleteNode(N);
+        deleteAndRecombine(N);
 
         // Replace the uses of Use with uses of the updated base value.
         DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
                                       Result.getValue(isLoad ? 1 : 0));
-        removeFromWorkList(Op);
-        DAG.DeleteNode(Op);
+        deleteAndRecombine(Op);
         return true;
       }
     }
@@ -7927,6 +8157,30 @@
   return false;
 }
 
+/// \brief Build the standalone ADD/SUB of base and offset for indexed \p LD.
+SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
+  ISD::MemIndexedMode Mode = LD->getAddressingMode();
+  assert(Mode != ISD::UNINDEXED);
+  SDValue Base = LD->getOperand(1);
+  SDValue Offset = LD->getOperand(2);
+  bool IsTargetConst = Offset.getOpcode() == ISD::TargetConstant;
+
+  // Backends may encode the load offset as a TargetConstant yet not accept
+  // one as an operand of a plain ADD/SUB, so rewrite it as a regular Constant.
+  // That rewrite is only possible when the constant is not opaque.
+  assert((!IsTargetConst || !cast<ConstantSDNode>(Offset)->isOpaque()) &&
+         "Cannot split out indexing using opaque target constants");
+  if (IsTargetConst) {
+    ConstantSDNode *C = cast<ConstantSDNode>(Offset);
+    Offset = DAG.getConstant(*C->getConstantIntValue(), C->getValueType(0));
+  }
+
+  // Incrementing modes add the offset; every other indexed mode subtracts it.
+  bool IsInc = Mode == ISD::PRE_INC || Mode == ISD::POST_INC;
+  unsigned Opc = IsInc ? ISD::ADD : ISD::SUB;
+  return DAG.getNode(Opc, SDLoc(LD), Base.getSimpleValueType(), Base, Offset);
+}
+
 SDValue DAGCombiner::visitLOAD(SDNode *N) {
   LoadSDNode *LD  = cast<LoadSDNode>(N);
   SDValue Chain = LD->getChain();
@@ -7950,33 +8204,46 @@
               dbgs() << "\nWith chain: ";
               Chain.getNode()->dump(&DAG);
               dbgs() << "\n");
-        WorkListRemover DeadNodes(*this);
+        WorklistRemover DeadNodes(*this);
         DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
 
-        if (N->use_empty()) {
-          removeFromWorkList(N);
-          DAG.DeleteNode(N);
-        }
+        if (N->use_empty())
+          deleteAndRecombine(N);
 
         return SDValue(N, 0);   // Return N so it doesn't get rechecked!
       }
     } else {
       // Indexed loads.
       assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
-      if (!N->hasAnyUseOfValue(0) && !N->hasAnyUseOfValue(1)) {
+
+      // If this load has an opaque TargetConstant offset, then we cannot split
+      // the indexing into an add/sub directly (that TargetConstant may not be
+      // valid for a different type of node, and we cannot convert an opaque
+      // target constant into a regular constant).
+      bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant &&
+                       cast<ConstantSDNode>(LD->getOperand(2))->isOpaque();
+
+      if (!N->hasAnyUseOfValue(0) &&
+          ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) {
         SDValue Undef = DAG.getUNDEF(N->getValueType(0));
+        SDValue Index;
+        if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) {
+          Index = SplitIndexingFromLoad(LD);
+          // Try to fold the base pointer arithmetic into subsequent loads and
+          // stores.
+          AddUsersToWorklist(N);
+        } else
+          Index = DAG.getUNDEF(N->getValueType(1));
         DEBUG(dbgs() << "\nReplacing.7 ";
               N->dump(&DAG);
               dbgs() << "\nWith: ";
               Undef.getNode()->dump(&DAG);
               dbgs() << " and 2 other values\n");
-        WorkListRemover DeadNodes(*this);
+        WorklistRemover DeadNodes(*this);
         DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
-        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1),
-                                      DAG.getUNDEF(N->getValueType(1)));
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
         DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
-        removeFromWorkList(N);
-        DAG.DeleteNode(N);
+        deleteAndRecombine(N);
         return SDValue(N, 0);   // Return N so it doesn't get rechecked!
       }
     }
@@ -8004,15 +8271,15 @@
                               LD->getValueType(0),
                               Chain, Ptr, LD->getPointerInfo(),
                               LD->getMemoryVT(),
-                              LD->isVolatile(), LD->isNonTemporal(), Align,
-                              LD->getTBAAInfo());
+                              LD->isVolatile(), LD->isNonTemporal(),
+                              LD->isInvariant(), Align, LD->getAAInfo());
         return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
       }
     }
   }
 
-  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA :
-    TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+                                                  : DAG.getSubtarget().useAA();
 #ifndef NDEBUG
   if (CombinerAAOnlyFunc.getNumOccurrences() &&
       CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
@@ -8042,7 +8309,7 @@
                                   MVT::Other, Chain, ReplLoad.getValue(1));
 
       // Make sure the new and old chains are cleaned up.
-      AddToWorkList(Token.getNode());
+      AddToWorklist(Token.getNode());
 
       // Replace uses with load result and token factor. Don't add users
       // to work list.
@@ -8342,7 +8609,7 @@
 
     // At this point, we know that we perform a cross-register-bank copy.
     // Check if it is expensive.
-    const TargetRegisterInfo *TRI = TLI.getTargetMachine().getRegisterInfo();
+    const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
     // Assume bitcasts are cheap, unless both register classes do not
     // explicitly share a common sub class.
     if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
@@ -8606,9 +8873,9 @@
   return true;
 }
 
-/// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
-/// load is having specific bytes cleared out.  If so, return the byte size
-/// being masked out and the shift amount.
+/// Check to see if V is (and load (ptr), imm), where the load is having
+/// specific bytes cleared out.  If so, return the byte size being masked out
+/// and the shift amount.
 static std::pair<unsigned, unsigned>
 CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
   std::pair<unsigned, unsigned> Result(0, 0);
@@ -8681,9 +8948,9 @@
 }
 
 
-/// ShrinkLoadReplaceStoreWithStore - Check to see if IVal is something that
-/// provides a value as specified by MaskInfo.  If so, replace the specified
-/// store with a narrower store of truncated IVal.
+/// Check to see if IVal is something that provides a value as specified by
+/// MaskInfo. If so, replace the specified store with a narrower store of
+/// truncated IVal.
 static SDNode *
 ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
                                 SDValue IVal, StoreSDNode *St,
@@ -8738,10 +9005,10 @@
 }
 
 
-/// ReduceLoadOpStoreWidth - Look for sequence of load / op / store where op is
-/// one of 'or', 'xor', and 'and' of immediates. If 'op' is only touching some
-/// of the loaded bits, try narrowing the load and store if it would end up
-/// being a win for performance or code size.
+/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
+/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
+/// narrowing the load and store if it would end up being a win for performance
+/// or code size.
 SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
   StoreSDNode *ST  = cast<StoreSDNode>(N);
   if (ST->isVolatile())
@@ -8841,7 +9108,7 @@
                                   LD->getPointerInfo().getWithOffset(PtrOff),
                                   LD->isVolatile(), LD->isNonTemporal(),
                                   LD->isInvariant(), NewAlign,
-                                  LD->getTBAAInfo());
+                                  LD->getAAInfo());
       SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
                                    DAG.getConstant(NewImm, NewVT));
       SDValue NewST = DAG.getStore(Chain, SDLoc(N),
@@ -8849,10 +9116,10 @@
                                    ST->getPointerInfo().getWithOffset(PtrOff),
                                    false, false, NewAlign);
 
-      AddToWorkList(NewPtr.getNode());
-      AddToWorkList(NewLD.getNode());
-      AddToWorkList(NewVal.getNode());
-      WorkListRemover DeadNodes(*this);
+      AddToWorklist(NewPtr.getNode());
+      AddToWorklist(NewLD.getNode());
+      AddToWorklist(NewVal.getNode());
+      WorklistRemover DeadNodes(*this);
       DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
       ++OpsNarrowed;
       return NewST;
@@ -8862,10 +9129,9 @@
   return SDValue();
 }
 
-/// TransformFPLoadStorePair - For a given floating point load / store pair,
-/// if the load value isn't used by any other operations, then consider
-/// transforming the pair to integer load / store operations if the target
-/// deems the transformation profitable.
+/// For a given floating point load / store pair, if the load value isn't used
+/// by any other operations, then consider transforming the pair to integer
+/// load / store operations if the target deems the transformation profitable.
 SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
   StoreSDNode *ST  = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
@@ -8907,9 +9173,9 @@
                                  ST->getPointerInfo(),
                                  false, false, STAlign);
 
-    AddToWorkList(NewLD.getNode());
-    AddToWorkList(NewST.getNode());
-    WorkListRemover DeadNodes(*this);
+    AddToWorklist(NewLD.getNode());
+    AddToWorklist(NewST.getNode());
+    WorklistRemover DeadNodes(*this);
     DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
     ++LdStFP2Int;
     return NewST;
@@ -9039,7 +9305,7 @@
     return false;
 
   // Only look at ends of store sequences.
-  SDValue Chain = SDValue(St, 1);
+  SDValue Chain = SDValue(St, 0);
   if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
     return false;
 
@@ -9070,7 +9336,7 @@
   StoreSDNode *Index = St;
   while (Index) {
     // If the chain has more than one use, then we can't reorder the mem ops.
-    if (Index != St && !SDValue(Index, 1)->hasOneUse())
+    if (Index != St && !SDValue(Index, 0)->hasOneUse())
       break;
 
     // Find the base pointer and offset for this memory node.
@@ -9301,8 +9567,7 @@
       // Since we know that St is redundant, just iterate.
       while (!St->use_empty())
         DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
-      removeFromWorkList(St);
-      DAG.DeleteNode(St);
+      deleteAndRecombine(St);
     }
 
     return true;
@@ -9361,6 +9626,13 @@
   if (LoadNodes.size() < 2)
     return false;
 
+  // If we have load/store pair instructions and we only have two values,
+  // don't bother.
+  unsigned RequiredAlignment;
+  if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
+      St->getAlignment() >= RequiredAlignment)
+    return false;
+
   // Scan the memory operations on the chain and find the first non-consecutive
   // load memory address. These variables hold the index in the store node
   // array.
@@ -9476,8 +9748,7 @@
       continue;
     StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
     DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
-    removeFromWorkList(St);
-    DAG.DeleteNode(St);
+    deleteAndRecombine(St);
   }
 
   return true;
@@ -9503,7 +9774,7 @@
       return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0),
                           Ptr, ST->getPointerInfo(), ST->isVolatile(),
                           ST->isNonTemporal(), OrigAlign,
-                          ST->getTBAAInfo());
+                          ST->getAAInfo());
   }
 
   // Turn 'store undef, Ptr' -> nothing.
@@ -9557,19 +9828,19 @@
           unsigned Alignment = ST->getAlignment();
           bool isVolatile = ST->isVolatile();
           bool isNonTemporal = ST->isNonTemporal();
-          const MDNode *TBAAInfo = ST->getTBAAInfo();
+          AAMDNodes AAInfo = ST->getAAInfo();
 
           SDValue St0 = DAG.getStore(Chain, SDLoc(ST), Lo,
                                      Ptr, ST->getPointerInfo(),
                                      isVolatile, isNonTemporal,
-                                     ST->getAlignment(), TBAAInfo);
+                                     ST->getAlignment(), AAInfo);
           Ptr = DAG.getNode(ISD::ADD, SDLoc(N), Ptr.getValueType(), Ptr,
                             DAG.getConstant(4, Ptr.getValueType()));
           Alignment = MinAlign(Alignment, 4U);
           SDValue St1 = DAG.getStore(Chain, SDLoc(ST), Hi,
                                      Ptr, ST->getPointerInfo().getWithOffset(4),
                                      isVolatile, isNonTemporal,
-                                     Alignment, TBAAInfo);
+                                     Alignment, AAInfo);
           return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other,
                              St0, St1);
         }
@@ -9586,7 +9857,7 @@
         return DAG.getTruncStore(Chain, SDLoc(N), Value,
                                  Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
                                  ST->isVolatile(), ST->isNonTemporal(), Align,
-                                 ST->getTBAAInfo());
+                                 ST->getAAInfo());
     }
   }
 
@@ -9596,8 +9867,8 @@
   if (NewST.getNode())
     return NewST;
 
-  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA :
-    TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+                                                  : DAG.getSubtarget().useAA();
 #ifndef NDEBUG
   if (CombinerAAOnlyFunc.getNumOccurrences() &&
       CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
@@ -9625,7 +9896,7 @@
                                   MVT::Other, Chain, ReplStore);
 
       // Make sure the new and old chains are cleaned up.
-      AddToWorkList(Token.getNode());
+      AddToWorklist(Token.getNode());
 
       // Don't add users to work list.
       return CombineTo(N, Token, false);
@@ -9647,7 +9918,7 @@
                       APInt::getLowBitsSet(
                         Value.getValueType().getScalarType().getSizeInBits(),
                         ST->getMemoryVT().getScalarType().getSizeInBits()));
-    AddToWorkList(Value.getNode());
+    AddToWorklist(Value.getNode());
     if (Shorter.getNode())
       return DAG.getTruncStore(Chain, SDLoc(N), Shorter,
                                Ptr, ST->getMemoryVT(), ST->getMemOperand());
@@ -9674,6 +9945,17 @@
     }
   }
 
+  // If this is a store followed by a store with the same value to the same
+  // location, then the store is dead/noop.
+  if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
+    if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() &&
+        ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() &&
+        ST1->isUnindexed() && !ST1->isVolatile()) {
+      // The store is dead, remove it.
+      return Chain;
+    }
+  }
+
   // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
   // truncating store.  We can do this even if this is already a truncstore.
   if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
@@ -9741,7 +10023,7 @@
       // Swap nodes.
       SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VT,
                                   InVec.getOperand(0), InVal, EltNo);
-      AddToWorkList(NewOp.getNode());
+      AddToWorklist(NewOp.getNode());
       return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
                          VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
     }
@@ -9829,32 +10111,32 @@
     ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, VecEltVT)
                                    ? ISD::ZEXTLOAD
                                    : ISD::EXTLOAD;
-    Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(),
-                          NewPtr, MPI, VecEltVT, OriginalLoad->isVolatile(),
-                          OriginalLoad->isNonTemporal(), Align,
-                          OriginalLoad->getTBAAInfo());
+    Load = DAG.getExtLoad(
+        ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(), NewPtr, MPI,
+        VecEltVT, OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(),
+        OriginalLoad->isInvariant(), Align, OriginalLoad->getAAInfo());
     Chain = Load.getValue(1);
   } else {
     Load = DAG.getLoad(
         VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI,
         OriginalLoad->isVolatile(), OriginalLoad->isNonTemporal(),
-        OriginalLoad->isInvariant(), Align, OriginalLoad->getTBAAInfo());
+        OriginalLoad->isInvariant(), Align, OriginalLoad->getAAInfo());
     Chain = Load.getValue(1);
     if (ResultVT.bitsLT(VecEltVT))
       Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
     else
       Load = DAG.getNode(ISD::BITCAST, SDLoc(EVE), ResultVT, Load);
   }
-  WorkListRemover DeadNodes(*this);
+  WorklistRemover DeadNodes(*this);
   SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
   SDValue To[] = { Load, Chain };
   DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
   // Since we're explicitly calling ReplaceAllUses, add the new node to the
   // worklist explicitly as well.
-  AddToWorkList(Load.getNode());
-  AddUsersToWorkList(Load.getNode()); // Add users too
+  AddToWorklist(Load.getNode());
+  AddUsersToWorklist(Load.getNode()); // Add users too
   // Make sure to revisit this node to clean it up; it will usually be dead.
-  AddToWorkList(EVE);
+  AddToWorklist(EVE);
   ++OpsNarrowed;
   return SDValue(EVE, 0);
 }
@@ -9952,7 +10234,8 @@
 
   // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size)
   if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() &&
-      ISD::isNormalLoad(InVec.getNode())) {
+      ISD::isNormalLoad(InVec.getNode()) &&
+      !N->getOperand(1)->hasPredecessor(InVec.getNode())) {
     SDValue Index = N->getOperand(1);
     if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec))
       return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
@@ -10135,7 +10418,7 @@
   SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
 
   // The new BUILD_VECTOR node has the potential to be further optimized.
-  AddToWorkList(BV.getNode());
+  AddToWorklist(BV.getNode());
   // Bitcast to the desired type.
   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
 }
@@ -10201,7 +10484,7 @@
       Opnds.push_back(In.getOperand(0));
   }
   SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Opnds);
-  AddToWorkList(BV.getNode());
+  AddToWorklist(BV.getNode());
 
   return DAG.getNode(Opcode, dl, VT, BV);
 }
@@ -10227,9 +10510,12 @@
   // operations.  If so, and if the EXTRACT_VECTOR_ELT vector inputs come from
   // at most two distinct vectors, turn this into a shuffle node.
 
+  // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
+  if (!isTypeLegal(VT))
+    return SDValue();
+
   // May only combine to shuffle after legalize if shuffle is legal.
-  if (LegalOperations &&
-      !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))
+  if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
     return SDValue();
 
   SDValue VecIn1, VecIn2;
@@ -10319,10 +10605,6 @@
         VecIn1.getValueType() != VT)
           return SDValue();
 
-    // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
-    if (!isTypeLegal(VT))
-      return SDValue();
-
     // Return the new VECTOR_SHUFFLE node.
     SDValue Ops[2];
     Ops[0] = VecIn1;
@@ -10513,6 +10795,92 @@
   return SDValue();
 }
 
+/// Return \p V with every CONCAT_VECTORS operand / INSERT_SUBVECTOR subvector
+/// that \p UsedElements marks as unread replaced by undef, or \p V itself when
+/// nothing changes. May clear bits of \p UsedElements (INSERT_SUBVECTOR case).
+static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements,
+                                                 SDValue V, SelectionDAG &DAG) {
+  SDLoc DL(V);
+  EVT VT = V.getValueType();
+  switch (V.getOpcode()) {
+  default:
+    return V;
+
+  case ISD::CONCAT_VECTORS: {
+    EVT OpVT = V->getOperand(0).getValueType();
+    int OpSize = OpVT.getVectorNumElements();
+    SmallBitVector OpUsedElements(OpSize, false);
+    bool FoundSimplification = false;
+    SmallVector<SDValue, 4> NewOps;
+    NewOps.reserve(V->getNumOperands());
+    for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) {
+      SDValue Op = V->getOperand(i);
+      for (int j = 0; j < OpSize; ++j)
+        if (UsedElements[i * OpSize + j])
+          OpUsedElements[j] = true;
+      bool OpUsed = OpUsedElements.any();
+      NewOps.push_back(
+          OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG)
+                 : DAG.getUNDEF(OpVT));
+      // Rebuild the concat only if some operand was actually simplified.
+      FoundSimplification |= Op != NewOps.back();
+      OpUsedElements.reset();
+    }
+    if (FoundSimplification)
+      V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps);
+    return V;
+  }
+
+  case ISD::INSERT_SUBVECTOR: {
+    SDValue BaseV = V->getOperand(0);
+    SDValue SubV = V->getOperand(1);
+    auto *IdxN = dyn_cast<ConstantSDNode>(V->getOperand(2));
+    if (!IdxN)
+      return V;
+
+    int SubSize = SubV.getValueType().getVectorNumElements();
+    int Idx = IdxN->getZExtValue();
+    SmallBitVector SubUsedElements(SubSize, false);
+    // Move the bits covered by the subvector out of the base's used set.
+    for (int i = 0; i < SubSize; ++i)
+      if (UsedElements[i + Idx]) {
+        SubUsedElements[i] = true;
+        UsedElements[i + Idx] = false;
+      }
+    bool SubVectorUsed = SubUsedElements.any();
+    // Now recurse on both the base and sub vectors.
+    SDValue SimplifiedSubV =
+        SubVectorUsed
+            ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG)
+            : DAG.getUNDEF(SubV.getValueType());
+    SDValue SimplifiedBaseV = simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG);
+    if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV)
+      V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+                      SimplifiedBaseV, SimplifiedSubV, V->getOperand(2));
+    return V;
+  }
+  }
+}
+
+/// Replace the parts of \p N0 and \p N1 that the mask of \p SVN never reads
+/// with undef. Returns the new shuffle, or an empty SDValue if neither changed.
+static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
+                                       SDValue N1, SelectionDAG &DAG) {
+  EVT VT = SVN->getValueType(0);
+  int NumElts = VT.getVectorNumElements();
+  SmallBitVector LHSUsed(NumElts, false), RHSUsed(NumElts, false);
+  // Mask values in [0, NumElts) read N0, values >= NumElts read N1, and
+  // negative values read neither operand.
+  for (int Elt : SVN->getMask())
+    if (Elt >= NumElts)
+      RHSUsed[Elt - NumElts] = true;
+    else if (Elt >= 0)
+      LHSUsed[Elt] = true;
+  SDValue NewN0 = simplifyShuffleOperandRecursively(LHSUsed, N0, DAG);
+  SDValue NewN1 = simplifyShuffleOperandRecursively(RHSUsed, N1, DAG);
+  return NewN0 != N0 || NewN1 != N1 ? DAG.getVectorShuffle(VT, SDLoc(SVN), NewN0, NewN1, SVN->getMask()) : SDValue();
+}
+
 // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat.
 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
@@ -10665,6 +11033,12 @@
     }
   }
 
+  // There are various patterns used to build up a vector from smaller vectors,
+  // subvectors, or elements. Scan chains of these and replace unused insertions
+  // or components with undef.
+  if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
+    return S;
+
   if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
       Level < AfterLegalizeVectorOps &&
       (N1.getOpcode() == ISD::UNDEF ||
@@ -10699,7 +11073,15 @@
         Idx = OtherSV->getMaskElt(Idx);
       Mask.push_back(Idx);
     }
-    
+
+    // Check if all indices in Mask are Undef. If so, propagate Undef.
+    bool isUndefMask = true;
+    for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
+      isUndefMask &= Mask[i] < 0;
+
+    if (isUndefMask)
+      return DAG.getUNDEF(VT);
+
     bool CommuteOperands = false;
     if (N0.getOperand(1).getOpcode() != ISD::UNDEF) {
       // To be valid, the combine shuffle mask should only reference elements
@@ -10738,12 +11120,12 @@
       // The combined shuffle must map each index to itself.
       IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex;
     }
-    
+
     if (IsIdentityMask) {
       if (CommuteOperands)
         // optimize shuffle(shuffle(x, y), undef) -> y.
         return OtherSV->getOperand(1);
-      
+
       // optimize shuffle(shuffle(x, undef), undef) -> x
       // optimize shuffle(shuffle(x, y), undef) -> x
       return OtherSV->getOperand(0);
@@ -10751,16 +11133,134 @@
 
     // It may still be beneficial to combine the two shuffles if the
     // resulting shuffle is legal.
+    if (TLI.isTypeLegal(VT)) {
+      if (!CommuteOperands) {
+        if (TLI.isShuffleMaskLegal(Mask, VT))
+          // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3).
+          // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3)
+          return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1,
+                                      &Mask[0]);
+      } else {
+        // Compute the commuted shuffle mask.
+        for (unsigned i = 0; i != NumElts; ++i) {
+          int idx = Mask[i];
+          if (idx < 0)
+            continue;
+          else if (idx < (int)NumElts)
+            Mask[i] = idx + NumElts;
+          else
+            Mask[i] = idx - NumElts;
+        }
+
+        if (TLI.isShuffleMaskLegal(Mask, VT))
+          //   shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(y, undef, M3)
+          return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(1), N1,
+                                      &Mask[0]);
+      }
+    }
+  }
+
+  // Canonicalize shuffles according to rules:
+  //  shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
+  //  shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
+  //  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
+  if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && N0.getOpcode() != ISD::UNDEF &&
+      N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
+      TLI.isTypeLegal(VT)) {
+    // The incoming shuffle must be of the same type as the result of the
+    // current shuffle.
+    assert(N1->getOperand(0).getValueType() == VT &&
+           "Shuffle types don't match");
+
+    SDValue SV0 = N1->getOperand(0);
+    SDValue SV1 = N1->getOperand(1);
+    bool HasSameOp0 = N0 == SV0;
+    bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF;
+    if (HasSameOp0 || IsSV1Undef || N0 == SV1)
+      // Commute the operands of this shuffle so that next rule
+      // will trigger.
+      return DAG.getCommutedVectorShuffle(*SVN);
+  }
+
+  // Try to fold according to rules:
+  //   shuffle(shuffle(A, B, M0), B, M1) -> shuffle(A, B, M2)
+  //   shuffle(shuffle(A, B, M0), A, M1) -> shuffle(A, B, M2)
+  //   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2)
+  //   shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2)
+  // Don't try to fold shuffles with illegal type.
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
+      N1.getOpcode() != ISD::UNDEF && TLI.isTypeLegal(VT)) {
+    ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
+
+    // The incoming shuffle must be of the same type as the result of the
+    // current shuffle.
+    assert(OtherSV->getOperand(0).getValueType() == VT &&
+           "Shuffle types don't match");
+
+    SDValue SV0 = OtherSV->getOperand(0);
+    SDValue SV1 = OtherSV->getOperand(1);
+    bool HasSameOp0 = N1 == SV0;
+    bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF;
+    if (!HasSameOp0 && !IsSV1Undef && N1 != SV1)
+      // Early exit.
+      return SDValue();
+
+    SmallVector<int, 4> Mask;
+    // Compute the combined shuffle mask for a shuffle with SV0 as the first
+    // operand, and SV1 as the second operand.
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx = SVN->getMaskElt(i);
+      if (Idx < 0) {
+        // Propagate Undef.
+        Mask.push_back(Idx);
+        continue;
+      }
+
+      if (Idx < (int)NumElts) {
+        Idx = OtherSV->getMaskElt(Idx);
+        if (IsSV1Undef && Idx >= (int) NumElts)
+          Idx = -1;  // Propagate Undef.
+      } else
+        Idx = HasSameOp0 ? Idx - NumElts : Idx;
+
+      Mask.push_back(Idx);
+    }
+
+    // Check if all indices in Mask are Undef. If so, propagate Undef.
+    bool isUndefMask = true;
+    for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
+      isUndefMask &= Mask[i] < 0;
+
+    if (isUndefMask)
+      return DAG.getUNDEF(VT);
+
+    // Avoid introducing shuffles with illegal mask.
     if (TLI.isShuffleMaskLegal(Mask, VT)) {
-      if (!CommuteOperands)
-        // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3).
-        // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3)
-        return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1,
-                                    &Mask[0]);
-      
-      //   shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(undef, y, M3)
-      return DAG.getVectorShuffle(VT, SDLoc(N), N1, N0->getOperand(1),
-                                  &Mask[0]);
+      if (IsSV1Undef)
+        //   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2)
+        //   shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2)
+        return DAG.getVectorShuffle(VT, SDLoc(N), SV0, N1, &Mask[0]);
+      return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]);
+    }
+
+    // Compute the commuted shuffle mask.
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int idx = Mask[i];
+      if (idx < 0)
+        continue;
+      else if (idx < (int)NumElts)
+        Mask[i] = idx + NumElts;
+      else
+        Mask[i] = idx - NumElts;
+    }
+
+    if (TLI.isShuffleMaskLegal(Mask, VT)) {
+      if (IsSV1Undef)
+        //   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(B, A, M2)
+        return DAG.getVectorShuffle(VT, SDLoc(N), N1, SV0, &Mask[0]);
+      //   shuffle(shuffle(A, B, M0), B, M1) -> shuffle(B, A, M2)
+      //   shuffle(shuffle(A, B, M0), A, M1) -> shuffle(B, A, M2)
+      return DAG.getVectorShuffle(VT, SDLoc(N), SV1, SV0, &Mask[0]);
     }
   }
 
@@ -10794,8 +11294,8 @@
   return SDValue();
 }
 
-/// XformToShuffleWithZero - Returns a vector_shuffle if it able to transform
-/// an AND to a vector_shuffle with the destination vector and a zero vector.
+/// Returns a vector_shuffle if it is able to transform an AND to a
+/// vector_shuffle with the destination vector and a zero vector.
 /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
 ///      vector_shuffle V, Zero, <0, 4, 2, 4>
 SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
@@ -10817,7 +11317,7 @@
         if (cast<ConstantSDNode>(Elt)->isAllOnesValue())
           Indices.push_back(i);
         else if (cast<ConstantSDNode>(Elt)->isNullValue())
-          Indices.push_back(NumElts);
+          Indices.push_back(NumElts+i);
         else
           return SDValue();
       }
@@ -10841,7 +11341,7 @@
   return SDValue();
 }
 
-/// SimplifyVBinOp - Visit a binary vector operation, like ADD.
+/// Visit a binary vector operation, like ADD.
 SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
   assert(N->getValueType(0).isVector() &&
          "SimplifyVBinOp only works on vectors!");
@@ -10896,7 +11396,7 @@
           FoldOp.getOpcode() != ISD::ConstantFP)
         break;
       Ops.push_back(FoldOp);
-      AddToWorkList(FoldOp.getNode());
+      AddToWorklist(FoldOp.getNode());
     }
 
     if (Ops.size() == LHS.getNumOperands())
@@ -10918,7 +11418,7 @@
       SDValue UndefVector = LHS.getOperand(1);
       SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                      LHS.getOperand(0), RHS.getOperand(0));
-      AddUsersToWorkList(N);
+      AddUsersToWorklist(N);
       return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
                                   &SVN0->getMask()[0]);
     }
@@ -10927,7 +11427,7 @@
   return SDValue();
 }
 
-/// SimplifyVUnaryOp - Visit a binary vector operation, like FABS/FNEG.
+/// Visit a unary vector operation, like FABS/FNEG.
 SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) {
   assert(N->getValueType(0).isVector() &&
          "SimplifyVUnaryOp only works on vectors!");
@@ -10950,7 +11450,7 @@
         FoldOp.getOpcode() != ISD::ConstantFP)
       break;
     Ops.push_back(FoldOp);
-    AddToWorkList(FoldOp.getNode());
+    AddToWorklist(FoldOp.getNode());
   }
 
   if (Ops.size() != N0.getNumOperands())
@@ -10977,9 +11477,9 @@
                                   N0.getValueType(),
                                   SCC.getOperand(0), SCC.getOperand(1),
                                   SCC.getOperand(4));
-      AddToWorkList(SETCC.getNode());
-      return DAG.getSelect(SDLoc(SCC), SCC.getValueType(),
-                           SCC.getOperand(2), SCC.getOperand(3), SETCC);
+      AddToWorklist(SETCC.getNode());
+      return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
+                           SCC.getOperand(2), SCC.getOperand(3));
     }
 
     return SCC;
@@ -10987,12 +11487,11 @@
   return SDValue();
 }
 
-/// SimplifySelectOps - Given a SELECT or a SELECT_CC node, where LHS and RHS
-/// are the two values being selected between, see if we can simplify the
-/// select.  Callers of this should assume that TheSelect is deleted if this
-/// returns true.  As such, they should return the appropriate thing (e.g. the
-/// node) back to the top-level of the DAG combiner loop to avoid it being
-/// looked at.
+/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
+/// being selected between, see if we can simplify the select.  Callers of this
+/// should assume that TheSelect is deleted if this returns true.  As such, they
+/// should return the appropriate thing (e.g. the node) back to the top-level of
+/// the DAG combiner loop to avoid it being looked at.
 bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
                                     SDValue RHS) {
 
@@ -11071,22 +11570,27 @@
     }
 
     SDValue Load;
+    // It is safe to replace the two loads if they have different alignments,
+    // but the new load must be the minimum (most restrictive) alignment of the
+    // inputs.
+    bool isInvariant = LLD->isInvariant() & RLD->isInvariant();
+    unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
     if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
       Load = DAG.getLoad(TheSelect->getValueType(0),
                          SDLoc(TheSelect),
-                         // FIXME: Discards pointer and TBAA info.
+                         // FIXME: Discards pointer and AA info.
                          LLD->getChain(), Addr, MachinePointerInfo(),
                          LLD->isVolatile(), LLD->isNonTemporal(),
-                         LLD->isInvariant(), LLD->getAlignment());
+                         isInvariant, Alignment);
     } else {
       Load = DAG.getExtLoad(LLD->getExtensionType() == ISD::EXTLOAD ?
                             RLD->getExtensionType() : LLD->getExtensionType(),
                             SDLoc(TheSelect),
                             TheSelect->getValueType(0),
-                            // FIXME: Discards pointer and TBAA info.
+                            // FIXME: Discards pointer and AA info.
                             LLD->getChain(), Addr, MachinePointerInfo(),
                             LLD->getMemoryVT(), LLD->isVolatile(),
-                            LLD->isNonTemporal(), LLD->getAlignment());
+                            LLD->isNonTemporal(), isInvariant, Alignment);
     }
 
     // Users of the select now use the result of the load.
@@ -11102,7 +11606,7 @@
   return false;
 }
 
-/// SimplifySelectCC - Simplify an expression of the form (N0 cond N1) ? N2 : N3
+/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
 /// where 'cond' is the comparison specified by CC.
 SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
                                       SDValue N2, SDValue N3,
@@ -11118,7 +11622,7 @@
   // Determine if the condition we're dealing with is constant
   SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
                               N0, N1, CC, DL, false);
-  if (SCC.getNode()) AddToWorkList(SCC.getNode());
+  if (SCC.getNode()) AddToWorklist(SCC.getNode());
   ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode());
 
   // fold select_cc true, x, y -> x
@@ -11186,13 +11690,13 @@
         SDValue Cond = DAG.getSetCC(DL,
                                     getSetCCResultType(N0.getValueType()),
                                     N0, N1, CC);
-        AddToWorkList(Cond.getNode());
+        AddToWorklist(Cond.getNode());
         SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(),
                                           Cond, One, Zero);
-        AddToWorkList(CstOffset.getNode());
+        AddToWorklist(CstOffset.getNode());
         CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
                             CstOffset);
-        AddToWorkList(CPIdx.getNode());
+        AddToWorklist(CPIdx.getNode());
         return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
                            MachinePointerInfo::getConstantPool(), false,
                            false, false, Alignment);
@@ -11217,11 +11721,11 @@
                                        getShiftAmountTy(N0.getValueType()));
         SDValue Shift = DAG.getNode(ISD::SRL, SDLoc(N0),
                                     XType, N0, ShCt);
-        AddToWorkList(Shift.getNode());
+        AddToWorklist(Shift.getNode());
 
         if (XType.bitsGT(AType)) {
           Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
-          AddToWorkList(Shift.getNode());
+          AddToWorklist(Shift.getNode());
         }
 
         return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
@@ -11231,11 +11735,11 @@
                                   XType, N0,
                                   DAG.getConstant(XType.getSizeInBits()-1,
                                          getShiftAmountTy(N0.getValueType())));
-      AddToWorkList(Shift.getNode());
+      AddToWorklist(Shift.getNode());
 
       if (XType.bitsGT(AType)) {
         Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
-        AddToWorkList(Shift.getNode());
+        AddToWorklist(Shift.getNode());
       }
 
       return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
@@ -11305,8 +11809,8 @@
                            N2.getValueType(), SCC);
       }
 
-      AddToWorkList(SCC.getNode());
-      AddToWorkList(Temp.getNode());
+      AddToWorklist(SCC.getNode());
+      AddToWorklist(Temp.getNode());
 
       if (N2C->getAPIntValue() == 1)
         return Temp;
@@ -11385,8 +11889,8 @@
                                          getShiftAmountTy(N0.getValueType())));
       SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0),
                                 XType, N0, Shift);
-      AddToWorkList(Shift.getNode());
-      AddToWorkList(Add.getNode());
+      AddToWorklist(Shift.getNode());
+      AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
     }
   }
@@ -11394,7 +11898,7 @@
   return SDValue();
 }
 
-/// SimplifySetCC - This is a stub for TargetLowering::SimplifySetCC.
+/// This is a stub for TargetLowering::SimplifySetCC.
 SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0,
                                    SDValue N1, ISD::CondCode Cond,
                                    SDLoc DL, bool foldBooleans) {
@@ -11403,10 +11907,10 @@
   return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
 }
 
-/// BuildSDIVSequence - Given an ISD::SDIV node expressing a divide by constant,
-/// return a DAG expression to select that will generate the same value by
-/// multiplying by a magic number.  See:
-/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+/// Given an ISD::SDIV node expressing a divide by constant, return a DAG
+/// expression that will generate the same value by multiplying by a magic
+/// number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
 SDValue DAGCombiner::BuildSDIV(SDNode *N) {
   ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
   if (!C)
@@ -11421,14 +11925,33 @@
       TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built);
 
   for (SDNode *N : Built)
-    AddToWorkList(N);
+    AddToWorklist(N);
   return S;
 }
 
-/// BuildUDIV - Given an ISD::UDIV node expressing a divide by constant,
-/// return a DAG expression to select that will generate the same value by
-/// multiplying by a magic number.  See:
-/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+/// Given an ISD::SDIV node expressing a divide by constant power of 2, return a
+/// DAG expression that will generate the same value by right shifting.
+SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
+  ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
+  if (!C)
+    return SDValue();
+
+  // Avoid division by zero.
+  if (!C->getAPIntValue())
+    return SDValue();
+
+  std::vector<SDNode *> Built;
+  SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, &Built);
+
+  for (SDNode *N : Built)
+    AddToWorklist(N);
+  return S;
+}
+
+/// Given an ISD::UDIV node expressing a divide by constant, return a DAG
+/// expression that will generate the same value by multiplying by a magic
+/// number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
 SDValue DAGCombiner::BuildUDIV(SDNode *N) {
   ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
   if (!C)
@@ -11443,13 +11966,145 @@
       TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built);
 
   for (SDNode *N : Built)
-    AddToWorkList(N);
+    AddToWorklist(N);
   return S;
 }
 
-/// FindBaseOffset - Return true if base is a frame index, which is known not
-// to alias with anything but itself.  Provides base object and offset as
-// results.
+SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op) {
+  if (Level >= AfterLegalizeDAG)
+    return SDValue();
+
+  // Expose the DAG combiner to the target combiner implementations.
+  TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
+
+  unsigned Iterations = 0;
+  if (SDValue Est = TLI.getRecipEstimate(Op, DCI, Iterations)) {
+    if (Iterations) {
+      // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+      // For the reciprocal, we need to find the zero of the function:
+      //   F(X) = 1/X - A [which has a zero at X = 1/A]
+      //     =>
+      //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
+      //     does not require additional intermediate precision]
+      EVT VT = Op.getValueType();
+      SDLoc DL(Op);
+      SDValue FPOne = DAG.getConstantFP(1.0, VT);
+
+      AddToWorklist(Est.getNode());
+
+      // Newton iterations: Est = Est + Est (1 - Arg * Est)
+      for (unsigned i = 0; i < Iterations; ++i) {
+        SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est);
+        AddToWorklist(NewEst.getNode());
+
+        NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst);
+        AddToWorklist(NewEst.getNode());
+
+        NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst);
+        AddToWorklist(NewEst.getNode());
+
+        Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst);
+        AddToWorklist(Est.getNode());
+      }
+    }
+    return Est;
+  }
+
+  return SDValue();
+}
+
+/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal sqrt, we need to find the zero of the function:
+///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+///     =>
+///   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
+/// As a result, we precompute A/2 prior to the iteration loop.
+SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est,
+                                          unsigned Iterations) {
+  EVT VT = Arg.getValueType();
+  SDLoc DL(Arg);
+  SDValue ThreeHalves = DAG.getConstantFP(1.5, VT);
+
+  // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
+  // this entire sequence requires only one FP constant.
+  SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg);
+  AddToWorklist(HalfArg.getNode());
+
+  HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg);
+  AddToWorklist(HalfArg.getNode());
+
+  // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
+  for (unsigned i = 0; i < Iterations; ++i) {
+    SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
+    AddToWorklist(NewEst.getNode());
+
+    NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst);
+    AddToWorklist(NewEst.getNode());
+
+    NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst);
+    AddToWorklist(NewEst.getNode());
+
+    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst);
+    AddToWorklist(Est.getNode());
+  }
+  return Est;
+}
+
+/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal sqrt, we need to find the zero of the function:
+///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+///     =>
+///   X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
+SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est,
+                                          unsigned Iterations) {
+  EVT VT = Arg.getValueType();
+  SDLoc DL(Arg);
+  SDValue MinusThree = DAG.getConstantFP(-3.0, VT);
+  SDValue MinusHalf = DAG.getConstantFP(-0.5, VT);
+
+  // Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est)
+  for (unsigned i = 0; i < Iterations; ++i) {
+    SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf);
+    AddToWorklist(HalfEst.getNode());
+
+    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est);
+    AddToWorklist(Est.getNode());
+
+    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg);
+    AddToWorklist(Est.getNode());
+
+    Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree);
+    AddToWorklist(Est.getNode());
+
+    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst);
+    AddToWorklist(Est.getNode());
+  }
+  return Est;
+}
+
+SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op) {
+  if (Level >= AfterLegalizeDAG)
+    return SDValue();
+
+  // Expose the DAG combiner to the target combiner implementations.
+  TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
+  unsigned Iterations = 0;
+  bool UseOneConstNR = false;
+  if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) {
+    AddToWorklist(Est.getNode());
+    if (Iterations) {
+      Est = UseOneConstNR ?
+        BuildRsqrtNROneConst(Op, Est, Iterations) :
+        BuildRsqrtNRTwoConst(Op, Est, Iterations);
+    }
+    return Est;
+  }
+
+  return SDValue();
+}
+
+/// Return true if base is a frame index, which is known not to alias with
+/// anything but itself.  Provides base object and offset as results.
 static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
                            const GlobalValue *&GV, const void *&CV) {
   // Assume it is a primitive operation.
@@ -11485,8 +12140,7 @@
   return isa<FrameIndexSDNode>(Base);
 }
 
-/// isAlias - Return true if there is any possibility that the two addresses
-/// overlap.
+/// Return true if there is any possibility that the two addresses overlap.
 bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
   // If they are the same then they must be aliases.
   if (Op0->getBasePtr() == Op1->getBasePtr()) return true;
@@ -11545,8 +12199,9 @@
       return false;
   }
 
-  bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 ? CombinerGlobalAA :
-    TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+  bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
+                   ? CombinerGlobalAA
+                   : DAG.getSubtarget().useAA();
 #ifndef NDEBUG
   if (CombinerAAOnlyFunc.getNumOccurrences() &&
       CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
@@ -11564,10 +12219,10 @@
     AliasAnalysis::AliasResult AAResult =
         AA.alias(AliasAnalysis::Location(Op0->getMemOperand()->getValue(),
                                          Overlap1,
-                                         UseTBAA ? Op0->getTBAAInfo() : nullptr),
+                                         UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
                  AliasAnalysis::Location(Op1->getMemOperand()->getValue(),
                                          Overlap2,
-                                         UseTBAA ? Op1->getTBAAInfo() : nullptr));
+                                         UseTBAA ? Op1->getAAInfo() : AAMDNodes()));
     if (AAResult == AliasAnalysis::NoAlias)
       return false;
   }
@@ -11576,7 +12231,7 @@
   return true;
 }
 
-/// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
+/// Walk up chain skipping non-aliasing memory nodes,
 /// looking for aliasing nodes and adding them to the Aliases vector.
 void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
                                    SmallVectorImpl<SDValue> &Aliases) {
@@ -11612,7 +12267,7 @@
     }
 
     // Don't bother if we've been before.
-    if (!Visited.insert(Chain.getNode()))
+    if (!Visited.insert(Chain.getNode()).second)
       continue;
 
     switch (Chain.getOpcode()) {
@@ -11687,10 +12342,9 @@
   // like register copies will interfere with trivial cases).
 
   SmallVector<const SDNode *, 16> Worklist;
-  for (SmallPtrSet<SDNode *, 16>::iterator I = Visited.begin(),
-       IE = Visited.end(); I != IE; ++I)
-    if (*I != OriginalChain.getNode())
-      Worklist.push_back(*I);
+  for (const SDNode *N : Visited)
+    if (N != OriginalChain.getNode())
+      Worklist.push_back(N);
 
   while (!Worklist.empty()) {
     const SDNode *M = Worklist.pop_back_val();
@@ -11701,7 +12355,8 @@
 
     for (SDNode::use_iterator UI = M->use_begin(),
          UIE = M->use_end(); UI != UIE; ++UI)
-      if (UI.getUse().getValueType() == MVT::Other && Visited.insert(*UI)) {
+      if (UI.getUse().getValueType() == MVT::Other &&
+          Visited.insert(*UI).second) {
         if (isa<MemIntrinsicSDNode>(*UI) || isa<MemSDNode>(*UI)) {
           // We've not visited this use, and we care about it (it could have an
           // ordering dependency with the original node).
@@ -11717,8 +12372,8 @@
   }
 }
 
-/// FindBetterChain - Walk up chain skipping non-aliasing memory nodes, looking
-/// for a better chain (aliasing node.)
+/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
+/// (aliasing node.)
 SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
   SmallVector<SDValue, 8> Aliases;  // Ops for replacing token factor.
 
@@ -11737,11 +12392,9 @@
   return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
 }
 
-// SelectionDAG::Combine - This is the entry point for the file.
-//
+/// This is the entry point for the file.
 void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA,
                            CodeGenOpt::Level OptLevel) {
-  /// run - This is the main entry point to this class.
-  ///
+  /// This is the main entry point to this class.
   DAGCombiner(*this, AA, OptLevel).Run(Level);
 }

diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 445572a..8facbc2 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp

@@ -39,6 +39,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Statistic.h"
@@ -64,19 +65,32 @@
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "isel"
 
 STATISTIC(NumFastIselSuccessIndependent, "Number of insts selected by "
-          "target-independent selector");
+                                         "target-independent selector");
 STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "
-          "target-specific selector");
+                                    "target-specific selector");
 STATISTIC(NumFastIselDead, "Number of dead insts removed on failure");
 
-/// startNewBlock - Set the current block to which generated machine
-/// instructions will be appended, and clear the local CSE map.
-///
+void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS,
+                                           unsigned AttrIdx) {
+  IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
+  IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
+  IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
+  IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
+  IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
+  IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
+  IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
+  IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
+  Alignment = CS->getParamAlignment(AttrIdx);
+}
+
+/// Set the current block to which generated machine instructions will be
+/// appended, and clear the local CSE map.
 void FastISel::startNewBlock() {
   LocalValueMap.clear();
 
@@ -89,18 +103,19 @@
   LastLocalValue = EmitStartPt;
 }
 
-bool FastISel::LowerArguments() {
+bool FastISel::lowerArguments() {
   if (!FuncInfo.CanLowerReturn)
     // Fallback to SDISel argument lowering code to deal with sret pointer
     // parameter.
     return false;
 
-  if (!FastLowerArguments())
+  if (!fastLowerArguments())
     return false;
 
   // Enter arguments into ValueMap for uses in non-entry BBs.
   for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(),
-         E = FuncInfo.Fn->arg_end(); I != E; ++I) {
+                                    E = FuncInfo.Fn->arg_end();
+       I != E; ++I) {
     DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(I);
     assert(VI != LocalValueMap.end() && "Missed an argument?");
     FuncInfo.ValueMap[I] = VI->second;
@@ -112,22 +127,30 @@
   LocalValueMap.clear();
   LastLocalValue = EmitStartPt;
   recomputeInsertPt();
+  SavedInsertPt = FuncInfo.InsertPt;
 }
 
-bool FastISel::hasTrivialKill(const Value *V) const {
+bool FastISel::hasTrivialKill(const Value *V) {
   // Don't consider constants or arguments to have trivial kills.
   const Instruction *I = dyn_cast<Instruction>(V);
   if (!I)
     return false;
 
   // No-op casts are trivially coalesced by fast-isel.
-  if (const CastInst *Cast = dyn_cast<CastInst>(I))
+  if (const auto *Cast = dyn_cast<CastInst>(I))
     if (Cast->isNoopCast(DL.getIntPtrType(Cast->getContext())) &&
         !hasTrivialKill(Cast->getOperand(0)))
       return false;
 
+  // Even though the value might have only one use in the LLVM IR, FastISel may
+  // fold the use into another instruction, so there can be more than one use
+  // at the Machine Instruction level.
+  unsigned Reg = lookUpRegForValue(V);
+  if (Reg && !MRI.use_empty(Reg))
+    return false;
+
   // GEPs with all zero indices are trivially coalesced by fast-isel.
-  if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+  if (const auto *GEP = dyn_cast<GetElementPtrInst>(I))
     if (GEP->hasAllZeroIndices() && !hasTrivialKill(GEP->getOperand(0)))
       return false;
 
@@ -160,7 +183,7 @@
 
   // Look up the value to see if we already have a register for it.
   unsigned Reg = lookUpRegForValue(V);
-  if (Reg != 0)
+  if (Reg)
     return Reg;
 
   // In bottom-up mode, just create the virtual register which will be used
@@ -181,29 +204,24 @@
   return Reg;
 }
 
-/// materializeRegForValue - Helper for getRegForValue. This function is
-/// called when the value isn't already available in a register and must
-/// be materialized with new instructions.
-unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) {
+unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
   unsigned Reg = 0;
-
-  if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+  if (const auto *CI = dyn_cast<ConstantInt>(V)) {
     if (CI->getValue().getActiveBits() <= 64)
-      Reg = FastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
-  } else if (isa<AllocaInst>(V)) {
-    Reg = TargetMaterializeAlloca(cast<AllocaInst>(V));
-  } else if (isa<ConstantPointerNull>(V)) {
+      Reg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+  } else if (isa<AllocaInst>(V))
+    Reg = fastMaterializeAlloca(cast<AllocaInst>(V));
+  else if (isa<ConstantPointerNull>(V))
     // Translate this as an integer zero so that it can be
     // local-CSE'd with actual integer zeros.
-    Reg =
-      getRegForValue(Constant::getNullValue(DL.getIntPtrType(V->getContext())));
-  } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
-    if (CF->isNullValue()) {
-      Reg = TargetMaterializeFloatZero(CF);
-    } else {
+    Reg = getRegForValue(
+        Constant::getNullValue(DL.getIntPtrType(V->getContext())));
+  else if (const auto *CF = dyn_cast<ConstantFP>(V)) {
+    if (CF->isNullValue())
+      Reg = fastMaterializeFloatZero(CF);
+    else
       // Try to emit the constant directly.
-      Reg = FastEmit_f(VT, VT, ISD::ConstantFP, CF);
-    }
+      Reg = fastEmit_f(VT, VT, ISD::ConstantFP, CF);
 
     if (!Reg) {
       // Try to emit the constant by using an integer constant with a cast.
@@ -213,22 +231,22 @@
       uint64_t x[2];
       uint32_t IntBitWidth = IntVT.getSizeInBits();
       bool isExact;
-      (void) Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
-                                  APFloat::rmTowardZero, &isExact);
+      (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
+                                 APFloat::rmTowardZero, &isExact);
       if (isExact) {
         APInt IntVal(IntBitWidth, x);
 
         unsigned IntegerReg =
-          getRegForValue(ConstantInt::get(V->getContext(), IntVal));
+            getRegForValue(ConstantInt::get(V->getContext(), IntVal));
         if (IntegerReg != 0)
-          Reg = FastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP,
-                           IntegerReg, /*Kill=*/false);
+          Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg,
+                           /*Kill=*/false);
       }
     }
-  } else if (const Operator *Op = dyn_cast<Operator>(V)) {
-    if (!SelectOperator(Op, Op->getOpcode()))
+  } else if (const auto *Op = dyn_cast<Operator>(V)) {
+    if (!selectOperator(Op, Op->getOpcode()))
       if (!isa<Instruction>(Op) ||
-          !TargetSelectInstruction(cast<Instruction>(Op)))
+          !fastSelectInstruction(cast<Instruction>(Op)))
         return 0;
     Reg = lookUpRegForValue(Op);
   } else if (isa<UndefValue>(V)) {
@@ -236,15 +254,26 @@
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::IMPLICIT_DEF), Reg);
   }
+  return Reg;
+}
 
-  // If target-independent code couldn't handle the value, give target-specific
-  // code a try.
-  if (!Reg && isa<Constant>(V))
-    Reg = TargetMaterializeConstant(cast<Constant>(V));
+/// Helper for getRegForValue. This function is called when the value isn't
+/// already available in a register and must be materialized with new
+/// instructions.
+unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) {
+  unsigned Reg = 0;
+  // Give the target-specific code a try first.
+  if (isa<Constant>(V))
+    Reg = fastMaterializeConstant(cast<Constant>(V));
+
+  // If target-specific code couldn't or didn't want to handle the value, then
+  // give target-independent code a try.
+  if (!Reg)
+    Reg = materializeConstant(V, VT);
 
   // Don't cache constant materializations in the general ValueMap.
   // To do so would require tracking what uses they dominate.
-  if (Reg != 0) {
+  if (Reg) {
     LocalValueMap[V] = Reg;
     LastLocalValue = MRI.getVRegDef(Reg);
   }
@@ -262,13 +291,7 @@
   return LocalValueMap[V];
 }
 
-/// UpdateValueMap - Update the value map to include the new mapping for this
-/// instruction, or insert an extra copy to get the result in a previous
-/// determined register.
-/// NOTE: This is only necessary because we might select a block that uses
-/// a value before we select the block that defines the value.  It might be
-/// possible to fix this by selecting blocks in reverse postorder.
-void FastISel::UpdateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) {
+void FastISel::updateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) {
   if (!isa<Instruction>(I)) {
     LocalValueMap[I] = Reg;
     return;
@@ -281,7 +304,7 @@
   else if (Reg != AssignedReg) {
     // Arrange for uses of AssignedReg to be replaced by uses of Reg.
     for (unsigned i = 0; i < NumRegs; i++)
-      FuncInfo.RegFixups[AssignedReg+i] = Reg+i;
+      FuncInfo.RegFixups[AssignedReg + i] = Reg + i;
 
     AssignedReg = Reg;
   }
@@ -299,13 +322,12 @@
   MVT PtrVT = TLI.getPointerTy();
   EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
   if (IdxVT.bitsLT(PtrVT)) {
-    IdxN = FastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND,
-                      IdxN, IdxNIsKill);
+    IdxN = fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND, IdxN,
+                      IdxNIsKill);
     IdxNIsKill = true;
-  }
-  else if (IdxVT.bitsGT(PtrVT)) {
-    IdxN = FastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::TRUNCATE,
-                      IdxN, IdxNIsKill);
+  } else if (IdxVT.bitsGT(PtrVT)) {
+    IdxN =
+        fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::TRUNCATE, IdxN, IdxNIsKill);
     IdxNIsKill = true;
   }
   return std::pair<unsigned, bool>(IdxN, IdxNIsKill);
@@ -327,7 +349,7 @@
 
 void FastISel::removeDeadCode(MachineBasicBlock::iterator I,
                               MachineBasicBlock::iterator E) {
-  assert (I && E && std::distance(I, E) > 0 && "Invalid iterator!");
+  assert(I && E && std::distance(I, E) > 0 && "Invalid iterator!");
   while (I != E) {
     MachineInstr *Dead = &*I;
     ++I;
@@ -342,7 +364,7 @@
   DebugLoc OldDL = DbgLoc;
   recomputeInsertPt();
   DbgLoc = DebugLoc();
-  SavePoint SP = { OldInsertPt, OldDL };
+  SavePoint SP = {OldInsertPt, OldDL};
   return SP;
 }
 
@@ -355,10 +377,7 @@
   DbgLoc = OldInsertPt.DL;
 }
 
-/// SelectBinaryOp - Select and emit code for a binary operator instruction,
-/// which has an opcode which directly corresponds to the given ISD opcode.
-///
-bool FastISel::SelectBinaryOp(const User *I, unsigned ISDOpcode) {
+bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) {
   EVT VT = EVT::getEVT(I->getType(), /*HandleUnknown=*/true);
   if (VT == MVT::Other || !VT.isSimple())
     // Unhandled type. Halt "fast" selection and bail.
@@ -371,9 +390,8 @@
   if (!TLI.isTypeLegal(VT)) {
     // MVT::i1 is special. Allow AND, OR, or XOR because they
     // don't require additional zeroing, which makes them easy.
-    if (VT == MVT::i1 &&
-        (ISDOpcode == ISD::AND || ISDOpcode == ISD::OR ||
-         ISDOpcode == ISD::XOR))
+    if (VT == MVT::i1 && (ISDOpcode == ISD::AND || ISDOpcode == ISD::OR ||
+                          ISDOpcode == ISD::XOR))
       VT = TLI.getTypeToTransformTo(I->getContext(), VT);
     else
       return false;
@@ -381,38 +399,36 @@
 
   // Check if the first operand is a constant, and handle it as "ri".  At -O0,
   // we don't have anything that canonicalizes operand order.
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(0)))
+  if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(0)))
     if (isa<Instruction>(I) && cast<Instruction>(I)->isCommutative()) {
       unsigned Op1 = getRegForValue(I->getOperand(1));
-      if (Op1 == 0) return false;
-
+      if (!Op1)
+        return false;
       bool Op1IsKill = hasTrivialKill(I->getOperand(1));
 
-      unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1,
-                                        Op1IsKill, CI->getZExtValue(),
-                                        VT.getSimpleVT());
-      if (ResultReg == 0) return false;
+      unsigned ResultReg =
+          fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1, Op1IsKill,
+                       CI->getZExtValue(), VT.getSimpleVT());
+      if (!ResultReg)
+        return false;
 
       // We successfully emitted code for the given LLVM Instruction.
-      UpdateValueMap(I, ResultReg);
+      updateValueMap(I, ResultReg);
       return true;
     }
 
-
   unsigned Op0 = getRegForValue(I->getOperand(0));
-  if (Op0 == 0)   // Unhandled operand. Halt "fast" selection and bail.
+  if (!Op0) // Unhandled operand. Halt "fast" selection and bail.
     return false;
-
   bool Op0IsKill = hasTrivialKill(I->getOperand(0));
 
   // Check if the second operand is a constant and handle it appropriately.
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+  if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
     uint64_t Imm = CI->getZExtValue();
 
     // Transform "sdiv exact X, 8" -> "sra X, 3".
     if (ISDOpcode == ISD::SDIV && isa<BinaryOperator>(I) &&
-        cast<BinaryOperator>(I)->isExact() &&
-        isPowerOf2_64(Imm)) {
+        cast<BinaryOperator>(I)->isExact() && isPowerOf2_64(Imm)) {
       Imm = Log2_64(Imm);
       ISDOpcode = ISD::SRA;
     }
@@ -424,54 +440,49 @@
       ISDOpcode = ISD::AND;
     }
 
-    unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0,
+    unsigned ResultReg = fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0,
                                       Op0IsKill, Imm, VT.getSimpleVT());
-    if (ResultReg == 0) return false;
+    if (!ResultReg)
+      return false;
 
     // We successfully emitted code for the given LLVM Instruction.
-    UpdateValueMap(I, ResultReg);
+    updateValueMap(I, ResultReg);
     return true;
   }
 
   // Check if the second operand is a constant float.
-  if (ConstantFP *CF = dyn_cast<ConstantFP>(I->getOperand(1))) {
-    unsigned ResultReg = FastEmit_rf(VT.getSimpleVT(), VT.getSimpleVT(),
+  if (const auto *CF = dyn_cast<ConstantFP>(I->getOperand(1))) {
+    unsigned ResultReg = fastEmit_rf(VT.getSimpleVT(), VT.getSimpleVT(),
                                      ISDOpcode, Op0, Op0IsKill, CF);
-    if (ResultReg != 0) {
+    if (ResultReg) {
       // We successfully emitted code for the given LLVM Instruction.
-      UpdateValueMap(I, ResultReg);
+      updateValueMap(I, ResultReg);
       return true;
     }
   }
 
   unsigned Op1 = getRegForValue(I->getOperand(1));
-  if (Op1 == 0)
-    // Unhandled operand. Halt "fast" selection and bail.
+  if (!Op1) // Unhandled operand. Halt "fast" selection and bail.
     return false;
-
   bool Op1IsKill = hasTrivialKill(I->getOperand(1));
 
   // Now we have both operands in registers. Emit the instruction.
-  unsigned ResultReg = FastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(),
-                                   ISDOpcode,
-                                   Op0, Op0IsKill,
-                                   Op1, Op1IsKill);
-  if (ResultReg == 0)
+  unsigned ResultReg = fastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(),
+                                   ISDOpcode, Op0, Op0IsKill, Op1, Op1IsKill);
+  if (!ResultReg)
     // Target-specific code wasn't able to find a machine opcode for
     // the given ISD opcode and type. Halt "fast" selection and bail.
     return false;
 
   // We successfully emitted code for the given LLVM Instruction.
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool FastISel::SelectGetElementPtr(const User *I) {
+bool FastISel::selectGetElementPtr(const User *I) {
   unsigned N = getRegForValue(I->getOperand(0));
-  if (N == 0)
-    // Unhandled operand. Halt "fast" selection and bail.
+  if (!N) // Unhandled operand. Halt "fast" selection and bail.
     return false;
-
   bool NIsKill = hasTrivialKill(I->getOperand(0));
 
   // Keep a running tab of the total offset to coalesce multiple N = N + Offset
@@ -481,18 +492,18 @@
   uint64_t MaxOffs = 2048;
   Type *Ty = I->getOperand(0)->getType();
   MVT VT = TLI.getPointerTy();
-  for (GetElementPtrInst::const_op_iterator OI = I->op_begin()+1,
-       E = I->op_end(); OI != E; ++OI) {
+  for (GetElementPtrInst::const_op_iterator OI = I->op_begin() + 1,
+                                            E = I->op_end();
+       OI != E; ++OI) {
     const Value *Idx = *OI;
-    if (StructType *StTy = dyn_cast<StructType>(Ty)) {
+    if (auto *StTy = dyn_cast<StructType>(Ty)) {
       unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
       if (Field) {
         // N = N + Offset
         TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
         if (TotalOffs >= MaxOffs) {
-          N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
-          if (N == 0)
-            // Unhandled operand. Halt "fast" selection and bail.
+          N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+          if (!N) // Unhandled operand. Halt "fast" selection and bail.
             return false;
           NIsKill = true;
           TotalOffs = 0;
@@ -503,15 +514,15 @@
       Ty = cast<SequentialType>(Ty)->getElementType();
 
       // If this is a constant subscript, handle it quickly.
-      if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
-        if (CI->isZero()) continue;
+      if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+        if (CI->isZero())
+          continue;
         // N = N + Offset
         TotalOffs +=
-          DL.getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
+            DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue();
         if (TotalOffs >= MaxOffs) {
-          N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
-          if (N == 0)
-            // Unhandled operand. Halt "fast" selection and bail.
+          N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+          if (!N) // Unhandled operand. Halt "fast" selection and bail.
             return false;
           NIsKill = true;
           TotalOffs = 0;
@@ -519,9 +530,8 @@
         continue;
       }
       if (TotalOffs) {
-        N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
-        if (N == 0)
-          // Unhandled operand. Halt "fast" selection and bail.
+        N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+        if (!N) // Unhandled operand. Halt "fast" selection and bail.
           return false;
         NIsKill = true;
         TotalOffs = 0;
@@ -532,43 +542,37 @@
       std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
       unsigned IdxN = Pair.first;
       bool IdxNIsKill = Pair.second;
-      if (IdxN == 0)
-        // Unhandled operand. Halt "fast" selection and bail.
+      if (!IdxN) // Unhandled operand. Halt "fast" selection and bail.
         return false;
 
       if (ElementSize != 1) {
-        IdxN = FastEmit_ri_(VT, ISD::MUL, IdxN, IdxNIsKill, ElementSize, VT);
-        if (IdxN == 0)
-          // Unhandled operand. Halt "fast" selection and bail.
+        IdxN = fastEmit_ri_(VT, ISD::MUL, IdxN, IdxNIsKill, ElementSize, VT);
+        if (!IdxN) // Unhandled operand. Halt "fast" selection and bail.
           return false;
         IdxNIsKill = true;
       }
-      N = FastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
-      if (N == 0)
-        // Unhandled operand. Halt "fast" selection and bail.
+      N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
+      if (!N) // Unhandled operand. Halt "fast" selection and bail.
         return false;
     }
   }
   if (TotalOffs) {
-    N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
-    if (N == 0)
-      // Unhandled operand. Halt "fast" selection and bail.
+    N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+    if (!N) // Unhandled operand. Halt "fast" selection and bail.
       return false;
   }
 
   // We successfully emitted code for the given LLVM Instruction.
-  UpdateValueMap(I, N);
+  updateValueMap(I, N);
   return true;
 }
 
-/// \brief Add a stackmap or patchpoint intrinsic call's live variable operands
-/// to a stackmap or patchpoint machine instruction.
 bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops,
                                    const CallInst *CI, unsigned StartIdx) {
   for (unsigned i = StartIdx, e = CI->getNumArgOperands(); i != e; ++i) {
     Value *Val = CI->getArgOperand(i);
     // Check for constants and encode them with a StackMaps::ConstantOp prefix.
-    if (auto *C = dyn_cast<ConstantInt>(Val)) {
+    if (const auto *C = dyn_cast<ConstantInt>(Val)) {
       Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp));
       Ops.push_back(MachineOperand::CreateImm(C->getSExtValue()));
     } else if (isa<ConstantPointerNull>(Val)) {
@@ -585,16 +589,15 @@
         return false;
     } else {
       unsigned Reg = getRegForValue(Val);
-      if (Reg == 0)
+      if (!Reg)
         return false;
       Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false));
     }
   }
-
   return true;
 }
 
-bool FastISel::SelectStackmap(const CallInst *I) {
+bool FastISel::selectStackmap(const CallInst *I) {
   // void @llvm.experimental.stackmap(i64 <id>, i32 <numShadowBytes>,
   //                                  [live variables...])
   assert(I->getCalledFunction()->getReturnType()->isVoidTy() &&
@@ -621,7 +624,7 @@
   assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) &&
          "Expected a constant integer.");
   const auto *NumBytes =
-    cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos));
+      cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos));
   Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue()));
 
   // Push live variables for the stack map (skipping the first two arguments
@@ -637,13 +640,13 @@
   const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC);
   for (unsigned i = 0; ScratchRegs[i]; ++i)
     Ops.push_back(MachineOperand::CreateReg(
-      ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false,
-      /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true));
+        ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false,
+        /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true));
 
   // Issue CALLSEQ_START
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
-    .addImm(0);
+      .addImm(0);
 
   // Issue STACKMAP.
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -654,7 +657,8 @@
   // Issue CALLSEQ_END
   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
-    .addImm(0).addImm(0);
+      .addImm(0)
+      .addImm(0);
 
   // Inform the Frame Information that we have a stackmap in this function.
   FuncInfo.MF->getFrameInfo()->setHasStackMap();
@@ -662,11 +666,370 @@
   return true;
 }
 
-bool FastISel::SelectCall(const User *I) {
+/// \brief Lower an argument list according to the target calling convention.
+///
+/// Helper for lowering intrinsics that follow a target calling convention or
+/// require stack pointer adjustment. Only operands [ArgIdx, ArgIdx + NumArgs)
+/// of \p CI are lowered, as a call to \p Callee; returns lowerCallTo(CLI).
+bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
+                                 unsigned NumArgs, const Value *Callee,
+                                 bool ForceRetVoidTy, CallLoweringInfo &CLI) {
+  ArgListTy Args;
+  Args.reserve(NumArgs);
+
+  // Populate the argument list.
+  // Attributes for args start at offset 1, after the return attribute.
+  ImmutableCallSite CS(CI);
+  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
+       ArgI != ArgE; ++ArgI, ++AttrI) { // AttrI must advance with ArgI.
+    Value *V = CI->getOperand(ArgI);
+
+    assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+    ArgListEntry Entry;
+    Entry.Val = V;
+    Entry.Ty = V->getType();
+    Entry.setAttributes(&CS, AttrI); // Attributes of operand ArgI.
+    Args.push_back(Entry);
+  }
+
+  Type *RetTy = ForceRetVoidTy ? Type::getVoidTy(CI->getType()->getContext())
+                               : CI->getType(); // Void overrides CI's type.
+  CLI.setCallee(CI->getCallingConv(), RetTy, Callee, std::move(Args), NumArgs);
+
+  return lowerCallTo(CLI);
+}
+
+bool FastISel::selectPatchpoint(const CallInst *I) { // Emit a PATCHPOINT for I.
+  // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
+  //                                                 i32 <numBytes>,
+  //                                                 i8* <target>,
+  //                                                 i32 <numArgs>,
+  //                                                 [Args...],
+  //                                                 [live variables...])
+  CallingConv::ID CC = I->getCallingConv();
+  bool IsAnyRegCC = CC == CallingConv::AnyReg;
+  bool HasDef = !I->getType()->isVoidTy(); // The call produces a value.
+  Value *Callee = I->getOperand(PatchPointOpers::TargetPos);
+
+  // Get the real number of arguments participating in the call <numArgs>.
+  assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos)) &&
+         "Expected a constant integer.");
+  const auto *NumArgsVal =
+      cast<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos));
+  unsigned NumArgs = NumArgsVal->getZExtValue();
+
+  // Skip the four meta args: <id>, <numBytes>, <target>, <numArgs>.
+  // This includes all meta-operands up to but not including CC.
+  unsigned NumMetaOpers = PatchPointOpers::CCPos;
+  assert(I->getNumArgOperands() >= NumMetaOpers + NumArgs &&
+         "Not enough arguments provided to the patchpoint intrinsic");
+
+  // For AnyRegCC the arguments are lowered later on manually.
+  unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
+  CallLoweringInfo CLI;
+  if (!lowerCallOperands(I, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC, CLI))
+    return false;
+
+  assert(CLI.Call && "No call instruction specified.");
+
+  SmallVector<MachineOperand, 32> Ops; // Operands for PATCHPOINT, in order.
+
+  // Add an explicit result reg if we use the anyreg calling convention.
+  if (IsAnyRegCC && HasDef) {
+    assert(CLI.NumResultRegs == 0 && "Unexpected result register.");
+    CLI.ResultReg = createResultReg(TLI.getRegClassFor(MVT::i64));
+    CLI.NumResultRegs = 1;
+    Ops.push_back(MachineOperand::CreateReg(CLI.ResultReg, /*IsDef=*/true));
+  }
+
+  // Add the <id> and <numBytes> constants.
+  assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)) &&
+         "Expected a constant integer.");
+  const auto *ID = cast<ConstantInt>(I->getOperand(PatchPointOpers::IDPos));
+  Ops.push_back(MachineOperand::CreateImm(ID->getZExtValue()));
+
+  assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) &&
+         "Expected a constant integer.");
+  const auto *NumBytes =
+      cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos));
+  Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue()));
+
+  // Assume that the callee is a constant address or null pointer.
+  // FIXME: handle function symbols in the future.
+  uint64_t CalleeAddr; // Set on every path that does not hit llvm_unreachable.
+  if (const auto *C = dyn_cast<IntToPtrInst>(Callee))
+    CalleeAddr = cast<ConstantInt>(C->getOperand(0))->getZExtValue();
+  else if (const auto *C = dyn_cast<ConstantExpr>(Callee)) {
+    if (C->getOpcode() == Instruction::IntToPtr)
+      CalleeAddr = cast<ConstantInt>(C->getOperand(0))->getZExtValue();
+    else
+      llvm_unreachable("Unsupported ConstantExpr.");
+  } else if (isa<ConstantPointerNull>(Callee))
+    CalleeAddr = 0;
+  else
+    llvm_unreachable("Unsupported callee address.");
+
+  Ops.push_back(MachineOperand::CreateImm(CalleeAddr));
+
+  // Adjust <numArgs> to account for any arguments that have been passed on
+  // the stack instead.
+  unsigned NumCallRegArgs = IsAnyRegCC ? NumArgs : CLI.OutRegs.size();
+  Ops.push_back(MachineOperand::CreateImm(NumCallRegArgs));
+
+  // Add the calling convention.
+  Ops.push_back(MachineOperand::CreateImm((unsigned)CC));
+
+  // Add the arguments we omitted previously. The register allocator should
+  // place these in any free register.
+  if (IsAnyRegCC) {
+    for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i) {
+      unsigned Reg = getRegForValue(I->getArgOperand(i));
+      if (!Reg) // No register could be obtained for this argument; bail.
+        return false;
+      Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false));
+    }
+  }
+
+  // Push the arguments from the call instruction.
+  for (auto Reg : CLI.OutRegs)
+    Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false));
+
+  // Push live variables for the stack map.
+  if (!addStackMapLiveVars(Ops, I, NumMetaOpers + NumArgs))
+    return false;
+
+  // Push the register mask info.
+  Ops.push_back(MachineOperand::CreateRegMask(TRI.getCallPreservedMask(CC)));
+
+  // Add scratch registers as implicit def and early clobber.
+  const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC);
+  for (unsigned i = 0; ScratchRegs[i]; ++i) // Stops at the first 0 entry.
+    Ops.push_back(MachineOperand::CreateReg(
+        ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false,
+        /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true));
+
+  // Add implicit defs (return values).
+  for (auto Reg : CLI.InRegs)
+    Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/true,
+                                            /*IsImpl=*/true));
+
+  // Insert the patchpoint instruction before the call generated by the target.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, CLI.Call, DbgLoc,
+                                    TII.get(TargetOpcode::PATCHPOINT));
+
+  for (auto &MO : Ops) // Attach the operands in the order collected above.
+    MIB.addOperand(MO);
+
+  MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI);
+
+  // Delete the original call instruction.
+  CLI.Call->eraseFromParent();
+
+  // Inform the Frame Information that we have a patchpoint in this function.
+  FuncInfo.MF->getFrameInfo()->setHasPatchPoint();
+
+  if (CLI.NumResultRegs) // Map I to its result register(s), if it has any.
+    updateValueMap(I, CLI.ResultReg, CLI.NumResultRegs);
+  return true;
+}
+
+/// Returns an AttributeSet, at AttributeSet::ReturnIndex, holding SExt, ZExt
+/// and/or InReg according to the RetSExt, RetZExt and IsInReg flags of \p CLI.
+static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
+  SmallVector<Attribute::AttrKind, 2> Attrs; // Up to three kinds pushed below.
+  if (CLI.RetSExt)
+    Attrs.push_back(Attribute::SExt);
+  if (CLI.RetZExt)
+    Attrs.push_back(Attribute::ZExt);
+  if (CLI.IsInReg)
+    Attrs.push_back(Attribute::InReg);
+
+  return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
+                           Attrs);
+}
+
+bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName,
+                           unsigned NumArgs) { // Call SymName with CI's args.
+  ImmutableCallSite CS(CI);
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  Type *RetTy = FTy->getReturnType(); // Return type of CI's called value.
+
+  ArgListTy Args;
+  Args.reserve(NumArgs);
+
+  // Populate the argument list from the first NumArgs operands of CI.
+  // Attributes for args start at offset 1, after the return attribute.
+  for (unsigned ArgI = 0; ArgI != NumArgs; ++ArgI) {
+    Value *V = CI->getOperand(ArgI);
+
+    assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+    ArgListEntry Entry;
+    Entry.Val = V;
+    Entry.Ty = V->getType();
+    Entry.setAttributes(&CS, ArgI + 1); // +1 skips the return attribute.
+    Args.push_back(Entry);
+  }
+
+  CallLoweringInfo CLI;
+  CLI.setCallee(RetTy, FTy, SymName, std::move(Args), CS, NumArgs);
+
+  return lowerCallTo(CLI); // Shared lowering path taking a CallLoweringInfo.
+}
+
+bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { // Fill CLI, then lower.
+  // Handle the incoming return values from the call.
+  CLI.clearIns();
+  SmallVector<EVT, 4> RetTys;
+  ComputeValueVTs(TLI, CLI.RetTy, RetTys);
+
+  SmallVector<ISD::OutputArg, 4> Outs;
+  GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, TLI);
+
+  bool CanLowerReturn = TLI.CanLowerReturn(
+      CLI.CallConv, *FuncInfo.MF, CLI.IsVarArg, Outs, CLI.RetTy->getContext());
+
+  // FIXME: sret demotion isn't supported yet - bail out.
+  if (!CanLowerReturn)
+    return false;
+
+  for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+    EVT VT = RetTys[I];
+    MVT RegisterVT = TLI.getRegisterType(CLI.RetTy->getContext(), VT);
+    unsigned NumRegs = TLI.getNumRegisters(CLI.RetTy->getContext(), VT);
+    for (unsigned i = 0; i != NumRegs; ++i) { // One InputArg per register.
+      ISD::InputArg MyFlags;
+      MyFlags.VT = RegisterVT;
+      MyFlags.ArgVT = VT;
+      MyFlags.Used = CLI.IsReturnValueUsed;
+      if (CLI.RetSExt)
+        MyFlags.Flags.setSExt();
+      if (CLI.RetZExt)
+        MyFlags.Flags.setZExt();
+      if (CLI.IsInReg)
+        MyFlags.Flags.setInReg();
+      CLI.Ins.push_back(MyFlags);
+    }
+  }
+
+  // Handle all of the outgoing arguments.
+  CLI.clearOuts();
+  for (auto &Arg : CLI.getArgs()) {
+    Type *FinalType = Arg.Ty;
+    if (Arg.IsByVal) // For byval, query the pointee type instead.
+      FinalType = cast<PointerType>(Arg.Ty)->getElementType();
+    bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
+        FinalType, CLI.CallConv, CLI.IsVarArg);
+
+    ISD::ArgFlagsTy Flags; // Mirrors the argument's attribute bits.
+    if (Arg.IsZExt)
+      Flags.setZExt();
+    if (Arg.IsSExt)
+      Flags.setSExt();
+    if (Arg.IsInReg)
+      Flags.setInReg();
+    if (Arg.IsSRet)
+      Flags.setSRet();
+    if (Arg.IsByVal)
+      Flags.setByVal();
+    if (Arg.IsInAlloca) {
+      Flags.setInAlloca();
+      // Set the byval flag for CCAssignFn callbacks that don't know about
+      // inalloca. This way we can know how many bytes we should've allocated
+      // and how many bytes a callee cleanup function will pop.  If we port
+      // inalloca to more targets, we'll have to add custom inalloca handling in
+      // the various CC lowering callbacks.
+      Flags.setByVal();
+    }
+    if (Arg.IsByVal || Arg.IsInAlloca) {
+      PointerType *Ty = cast<PointerType>(Arg.Ty);
+      Type *ElementTy = Ty->getElementType();
+      unsigned FrameSize = DL.getTypeAllocSize(ElementTy);
+      // For ByVal, alignment should come from FE. BE will guess if this info is
+      // not there, but there are cases it cannot get right.
+      unsigned FrameAlign = Arg.Alignment;
+      if (!FrameAlign) // No alignment recorded on the argument; ask the target.
+        FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+      Flags.setByValSize(FrameSize);
+      Flags.setByValAlign(FrameAlign);
+    }
+    if (Arg.IsNest)
+      Flags.setNest();
+    if (NeedsRegBlock)
+      Flags.setInConsecutiveRegs();
+    unsigned OriginalAlignment = DL.getABITypeAlignment(Arg.Ty);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    CLI.OutVals.push_back(Arg.Val); // OutVals and OutFlags stay parallel.
+    CLI.OutFlags.push_back(Flags);
+  }
+
+  if (!fastLowerCall(CLI)) // The target hook emits the actual call.
+    return false;
+
+  // Set all unused physreg defs as dead.
+  assert(CLI.Call && "No call instruction specified.");
+  CLI.Call->setPhysRegsDeadExcept(CLI.InRegs, TRI);
+
+  if (CLI.NumResultRegs && CLI.CS) // Only when a call site is attached.
+    updateValueMap(CLI.CS->getInstruction(), CLI.ResultReg, CLI.NumResultRegs);
+
+  return true;
+}
+
+bool FastISel::lowerCall(const CallInst *CI) { // Lower CI with all its args.
+  ImmutableCallSite CS(CI);
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FuncTy = cast<FunctionType>(PT->getElementType());
+  Type *RetTy = FuncTy->getReturnType();
+
+  ArgListTy Args;
+  ArgListEntry Entry; // Reused for every argument in the loop below.
+  Args.reserve(CS.arg_size());
+
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    Value *V = *i;
+
+    // Skip empty types.
+    if (V->getType()->isEmptyTy())
+      continue;
+
+    Entry.Val = V;
+    Entry.Ty = V->getType();
+
+    // Skip the first return-type Attribute to get to params. The index uses
+    Entry.setAttributes(&CS, i - CS.arg_begin() + 1); // the original position.
+    Args.push_back(Entry);
+  }
+
+  // Check if target-independent constraints permit a tail call here.
+  // Target-dependent constraints are checked within fastLowerCall.
+  bool IsTailCall = CI->isTailCall();
+  if (IsTailCall && !isInTailCallPosition(CS, TM))
+    IsTailCall = false;
+
+  CallLoweringInfo CLI;
+  CLI.setCallee(RetTy, FuncTy, CI->getCalledValue(), std::move(Args), CS)
+      .setTailCall(IsTailCall);
+
+  return lowerCallTo(CLI);
+}
+
+bool FastISel::selectCall(const User *I) {
   const CallInst *Call = cast<CallInst>(I);
 
   // Handle simple inline asms.
   if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getCalledValue())) {
+    // If the inline asm has side effects, then make sure that no local value
+    // lives across by flushing the local value map.
+    if (IA->hasSideEffects())
+      flushLocalValueMap();
+
     // Don't attempt to handle constraints.
     if (!IA->getConstraintString().empty())
       return false;
@@ -679,34 +1042,46 @@
 
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::INLINEASM))
-      .addExternalSymbol(IA->getAsmString().c_str())
-      .addImm(ExtraInfo);
+        .addExternalSymbol(IA->getAsmString().c_str())
+        .addImm(ExtraInfo);
     return true;
   }
 
   MachineModuleInfo &MMI = FuncInfo.MF->getMMI();
   ComputeUsesVAFloatArgument(*Call, &MMI);
 
-  const Function *F = Call->getCalledFunction();
-  if (!F) return false;
+  // Handle intrinsic function calls.
+  if (const auto *II = dyn_cast<IntrinsicInst>(Call))
+    return selectIntrinsicCall(II);
 
-  // Handle selected intrinsic function calls.
-  switch (F->getIntrinsicID()) {
-  default: break;
-    // At -O0 we don't care about the lifetime intrinsics.
+  // Usually, it does not make sense to initialize a value,
+  // make an unrelated function call and use the value, because
+  // it tends to be spilled on the stack. So, we move the pointer
+  // to the last local value to the beginning of the block, so that
+  // all the values which have already been materialized,
+  // appear after the call. It also makes sense to skip intrinsics
+  // since they tend to be inlined.
+  flushLocalValueMap();
+
+  return lowerCall(Call);
+}
+
+bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
+  switch (II->getIntrinsicID()) {
+  default:
+    break;
+  // At -O0 we don't care about the lifetime intrinsics.
   case Intrinsic::lifetime_start:
   case Intrinsic::lifetime_end:
-    // The donothing intrinsic does, well, nothing.
+  // The donothing intrinsic does, well, nothing.
   case Intrinsic::donothing:
     return true;
-
   case Intrinsic::dbg_declare: {
-    const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call);
+    const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
     DIVariable DIVar(DI->getVariable());
     assert((!DIVar || DIVar.isVariable()) &&
-      "Variable in DbgDeclareInst should be either null or a DIVariable.");
-    if (!DIVar ||
-        !FuncInfo.MF->getMMI().hasDebugInfo()) {
+           "Variable in DbgDeclareInst should be either null or a DIVariable.");
+    if (!DIVar || !FuncInfo.MF->getMMI().hasDebugInfo()) {
       DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
       return true;
     }
@@ -719,11 +1094,11 @@
 
     unsigned Offset = 0;
     Optional<MachineOperand> Op;
-    if (const Argument *Arg = dyn_cast<Argument>(Address))
+    if (const auto *Arg = dyn_cast<Argument>(Address))
       // Some arguments' frame index is recorded during argument lowering.
       Offset = FuncInfo.getArgumentFrameIndex(Arg);
     if (Offset)
-        Op = MachineOperand::CreateFI(Offset);
+      Op = MachineOperand::CreateFI(Offset);
     if (!Op)
       if (unsigned Reg = lookUpRegForValue(Address))
         Op = MachineOperand::CreateReg(Reg, false);
@@ -750,13 +1125,14 @@
         Op->setIsDebug(true);
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(TargetOpcode::DBG_VALUE), false, Op->getReg(), 0,
-                DI->getVariable());
+                DI->getVariable(), DI->getExpression());
       } else
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(TargetOpcode::DBG_VALUE))
             .addOperand(*Op)
             .addImm(0)
-            .addMetadata(DI->getVariable());
+            .addMetadata(DI->getVariable())
+            .addMetadata(DI->getExpression());
     } else {
       // We can't yet handle anything else here because it would require
       // generating code, thus altering codegen because of debug info.
@@ -766,33 +1142,41 @@
   }
   case Intrinsic::dbg_value: {
     // This form of DBG_VALUE is target-independent.
-    const DbgValueInst *DI = cast<DbgValueInst>(Call);
+    const DbgValueInst *DI = cast<DbgValueInst>(II);
     const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
     const Value *V = DI->getValue();
     if (!V) {
       // Currently the optimizer can produce this; insert an undef to
       // help debugging.  Probably the optimizer should not do this.
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-        .addReg(0U).addImm(DI->getOffset())
-        .addMetadata(DI->getVariable());
-    } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+          .addReg(0U)
+          .addImm(DI->getOffset())
+          .addMetadata(DI->getVariable())
+          .addMetadata(DI->getExpression());
+    } else if (const auto *CI = dyn_cast<ConstantInt>(V)) {
       if (CI->getBitWidth() > 64)
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-          .addCImm(CI).addImm(DI->getOffset())
-          .addMetadata(DI->getVariable());
+            .addCImm(CI)
+            .addImm(DI->getOffset())
+            .addMetadata(DI->getVariable())
+            .addMetadata(DI->getExpression());
       else
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-          .addImm(CI->getZExtValue()).addImm(DI->getOffset())
-          .addMetadata(DI->getVariable());
-    } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+            .addImm(CI->getZExtValue())
+            .addImm(DI->getOffset())
+            .addMetadata(DI->getVariable())
+            .addMetadata(DI->getExpression());
+    } else if (const auto *CF = dyn_cast<ConstantFP>(V)) {
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-        .addFPImm(CF).addImm(DI->getOffset())
-        .addMetadata(DI->getVariable());
+          .addFPImm(CF)
+          .addImm(DI->getOffset())
+          .addMetadata(DI->getVariable())
+          .addMetadata(DI->getExpression());
     } else if (unsigned Reg = lookUpRegForValue(V)) {
       // FIXME: This does not handle register-indirect values at offset 0.
       bool IsIndirect = DI->getOffset() != 0;
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, IsIndirect,
-              Reg, DI->getOffset(), DI->getVariable());
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, IsIndirect, Reg,
+              DI->getOffset(), DI->getVariable(), DI->getExpression());
     } else {
       // We can't yet handle anything else here because it would require
       // generating code, thus altering codegen because of debug info.
@@ -801,46 +1185,38 @@
     return true;
   }
   case Intrinsic::objectsize: {
-    ConstantInt *CI = cast<ConstantInt>(Call->getArgOperand(1));
+    ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
     unsigned long long Res = CI->isZero() ? -1ULL : 0;
-    Constant *ResCI = ConstantInt::get(Call->getType(), Res);
+    Constant *ResCI = ConstantInt::get(II->getType(), Res);
     unsigned ResultReg = getRegForValue(ResCI);
-    if (ResultReg == 0)
+    if (!ResultReg)
       return false;
-    UpdateValueMap(Call, ResultReg);
+    updateValueMap(II, ResultReg);
     return true;
   }
   case Intrinsic::expect: {
-    unsigned ResultReg = getRegForValue(Call->getArgOperand(0));
-    if (ResultReg == 0)
+    unsigned ResultReg = getRegForValue(II->getArgOperand(0));
+    if (!ResultReg)
       return false;
-    UpdateValueMap(Call, ResultReg);
+    updateValueMap(II, ResultReg);
     return true;
   }
   case Intrinsic::experimental_stackmap:
-    return SelectStackmap(Call);
+    return selectStackmap(II);
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    return selectPatchpoint(II);
   }
 
-  // Usually, it does not make sense to initialize a value,
-  // make an unrelated function call and use the value, because
-  // it tends to be spilled on the stack. So, we move the pointer
-  // to the last local value to the beginning of the block, so that
-  // all the values which have already been materialized,
-  // appear after the call. It also makes sense to skip intrinsics
-  // since they tend to be inlined.
-  if (!isa<IntrinsicInst>(Call))
-    flushLocalValueMap();
-
-  // An arbitrary call. Bail.
-  return false;
+  return fastLowerIntrinsicCall(II);
 }
 
-bool FastISel::SelectCast(const User *I, unsigned Opcode) {
+bool FastISel::selectCast(const User *I, unsigned Opcode) {
   EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
   EVT DstVT = TLI.getValueType(I->getType());
 
-  if (SrcVT == MVT::Other || !SrcVT.isSimple() ||
-      DstVT == MVT::Other || !DstVT.isSimple())
+  if (SrcVT == MVT::Other || !SrcVT.isSimple() || DstVT == MVT::Other ||
+      !DstVT.isSimple())
     // Unhandled type. Halt "fast" selection and bail.
     return false;
 
@@ -859,24 +1235,22 @@
 
   bool InputRegIsKill = hasTrivialKill(I->getOperand(0));
 
-  unsigned ResultReg = FastEmit_r(SrcVT.getSimpleVT(),
-                                  DstVT.getSimpleVT(),
-                                  Opcode,
-                                  InputReg, InputRegIsKill);
+  unsigned ResultReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(),
+                                  Opcode, InputReg, InputRegIsKill);
   if (!ResultReg)
     return false;
 
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool FastISel::SelectBitCast(const User *I) {
+bool FastISel::selectBitCast(const User *I) {
   // If the bitcast doesn't change the type, just use the operand value.
   if (I->getType() == I->getOperand(0)->getType()) {
     unsigned Reg = getRegForValue(I->getOperand(0));
-    if (Reg == 0)
+    if (!Reg)
       return false;
-    UpdateValueMap(I, Reg);
+    updateValueMap(I, Reg);
     return true;
   }
 
@@ -891,17 +1265,15 @@
   MVT SrcVT = SrcEVT.getSimpleVT();
   MVT DstVT = DstEVT.getSimpleVT();
   unsigned Op0 = getRegForValue(I->getOperand(0));
-  if (Op0 == 0)
-    // Unhandled operand. Halt "fast" selection and bail.
+  if (!Op0) // Unhandled operand. Halt "fast" selection and bail.
     return false;
-
   bool Op0IsKill = hasTrivialKill(I->getOperand(0));
 
   // First, try to perform the bitcast by inserting a reg-reg copy.
   unsigned ResultReg = 0;
   if (SrcVT == DstVT) {
-    const TargetRegisterClass* SrcClass = TLI.getRegClassFor(SrcVT);
-    const TargetRegisterClass* DstClass = TLI.getRegClassFor(DstVT);
+    const TargetRegisterClass *SrcClass = TLI.getRegClassFor(SrcVT);
+    const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
     // Don't attempt a cross-class copy. It will likely fail.
     if (SrcClass == DstClass) {
       ResultReg = createResultReg(DstClass);
@@ -912,28 +1284,27 @@
 
   // If the reg-reg copy failed, select a BITCAST opcode.
   if (!ResultReg)
-    ResultReg = FastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0, Op0IsKill);
+    ResultReg = fastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0, Op0IsKill);
 
   if (!ResultReg)
     return false;
 
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool
-FastISel::SelectInstruction(const Instruction *I) {
+bool FastISel::selectInstruction(const Instruction *I) {
   // Just before the terminator instruction, insert instructions to
   // feed PHI nodes in successor blocks.
   if (isa<TerminatorInst>(I))
-    if (!HandlePHINodesInSuccessorBlocks(I->getParent()))
+    if (!handlePHINodesInSuccessorBlocks(I->getParent()))
       return false;
 
   DbgLoc = I->getDebugLoc();
 
-  MachineBasicBlock::iterator SavedInsertPt = FuncInfo.InsertPt;
+  SavedInsertPt = FuncInfo.InsertPt;
 
-  if (const CallInst *Call = dyn_cast<CallInst>(I)) {
+  if (const auto *Call = dyn_cast<CallInst>(I)) {
     const Function *F = Call->getCalledFunction();
     LibFunc::Func Func;
 
@@ -951,40 +1322,39 @@
   }
 
   // First, try doing target-independent selection.
-  if (SelectOperator(I, I->getOpcode())) {
-    ++NumFastIselSuccessIndependent;
-    DbgLoc = DebugLoc();
-    return true;
-  }
-  // Remove dead code.  However, ignore call instructions since we've flushed
-  // the local value map and recomputed the insert point.
-  if (!isa<CallInst>(I)) {
+  if (!SkipTargetIndependentISel) {
+    if (selectOperator(I, I->getOpcode())) {
+      ++NumFastIselSuccessIndependent;
+      DbgLoc = DebugLoc();
+      return true;
+    }
+    // Remove dead code.
     recomputeInsertPt();
     if (SavedInsertPt != FuncInfo.InsertPt)
       removeDeadCode(FuncInfo.InsertPt, SavedInsertPt);
+    SavedInsertPt = FuncInfo.InsertPt;
   }
-
   // Next, try calling the target to attempt to handle the instruction.
-  SavedInsertPt = FuncInfo.InsertPt;
-  if (TargetSelectInstruction(I)) {
+  if (fastSelectInstruction(I)) {
     ++NumFastIselSuccessTarget;
     DbgLoc = DebugLoc();
     return true;
   }
-  // Check for dead code and remove as necessary.
+  // Remove dead code.
   recomputeInsertPt();
   if (SavedInsertPt != FuncInfo.InsertPt)
     removeDeadCode(FuncInfo.InsertPt, SavedInsertPt);
 
   DbgLoc = DebugLoc();
+  // Undo phi node updates, because they will be added again by SelectionDAG.
+  if (isa<TerminatorInst>(I))
+    FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
   return false;
 }
 
-/// FastEmitBranch - Emit an unconditional branch to the given block,
-/// unless it is the immediate (fall-through) successor, and update
-/// the CFG.
-void
-FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) {
+/// Emit an unconditional branch to the given block, unless it is the immediate
+/// (fall-through) successor, and update the CFG.
+void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DbgLoc) {
   if (FuncInfo.MBB->getBasicBlock()->size() > 1 &&
       FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
     // For more accurate line information if this is the only instruction
@@ -1002,54 +1372,51 @@
   FuncInfo.MBB->addSuccessor(MSucc, BranchWeight);
 }
 
-/// SelectFNeg - Emit an FNeg operation.
-///
-bool
-FastISel::SelectFNeg(const User *I) {
+/// Emit an FNeg operation.
+bool FastISel::selectFNeg(const User *I) {
   unsigned OpReg = getRegForValue(BinaryOperator::getFNegArgument(I));
-  if (OpReg == 0) return false;
-
+  if (!OpReg)
+    return false;
   bool OpRegIsKill = hasTrivialKill(I);
 
   // If the target has ISD::FNEG, use it.
   EVT VT = TLI.getValueType(I->getType());
-  unsigned ResultReg = FastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(),
-                                  ISD::FNEG, OpReg, OpRegIsKill);
-  if (ResultReg != 0) {
-    UpdateValueMap(I, ResultReg);
+  unsigned ResultReg = fastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(), ISD::FNEG,
+                                  OpReg, OpRegIsKill);
+  if (ResultReg) {
+    updateValueMap(I, ResultReg);
     return true;
   }
 
   // Bitcast the value to integer, twiddle the sign bit with xor,
   // and then bitcast it back to floating-point.
-  if (VT.getSizeInBits() > 64) return false;
+  if (VT.getSizeInBits() > 64)
+    return false;
   EVT IntVT = EVT::getIntegerVT(I->getContext(), VT.getSizeInBits());
   if (!TLI.isTypeLegal(IntVT))
     return false;
 
-  unsigned IntReg = FastEmit_r(VT.getSimpleVT(), IntVT.getSimpleVT(),
+  unsigned IntReg = fastEmit_r(VT.getSimpleVT(), IntVT.getSimpleVT(),
                                ISD::BITCAST, OpReg, OpRegIsKill);
-  if (IntReg == 0)
+  if (!IntReg)
     return false;
 
-  unsigned IntResultReg = FastEmit_ri_(IntVT.getSimpleVT(), ISD::XOR,
-                                       IntReg, /*Kill=*/true,
-                                       UINT64_C(1) << (VT.getSizeInBits()-1),
-                                       IntVT.getSimpleVT());
-  if (IntResultReg == 0)
+  unsigned IntResultReg = fastEmit_ri_(
+      IntVT.getSimpleVT(), ISD::XOR, IntReg, /*IsKill=*/true,
+      UINT64_C(1) << (VT.getSizeInBits() - 1), IntVT.getSimpleVT());
+  if (!IntResultReg)
     return false;
 
-  ResultReg = FastEmit_r(IntVT.getSimpleVT(), VT.getSimpleVT(),
-                         ISD::BITCAST, IntResultReg, /*Kill=*/true);
-  if (ResultReg == 0)
+  ResultReg = fastEmit_r(IntVT.getSimpleVT(), VT.getSimpleVT(), ISD::BITCAST,
+                         IntResultReg, /*IsKill=*/true);
+  if (!ResultReg)
     return false;
 
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool
-FastISel::SelectExtractValue(const User *U) {
+bool FastISel::selectExtractValue(const User *U) {
   const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(U);
   if (!EVI)
     return false;
@@ -1085,55 +1452,54 @@
   for (unsigned i = 0; i < VTIndex; i++)
     ResultReg += TLI.getNumRegisters(FuncInfo.Fn->getContext(), AggValueVTs[i]);
 
-  UpdateValueMap(EVI, ResultReg);
+  updateValueMap(EVI, ResultReg);
   return true;
 }
 
-bool
-FastISel::SelectOperator(const User *I, unsigned Opcode) {
+bool FastISel::selectOperator(const User *I, unsigned Opcode) {
   switch (Opcode) {
   case Instruction::Add:
-    return SelectBinaryOp(I, ISD::ADD);
+    return selectBinaryOp(I, ISD::ADD);
   case Instruction::FAdd:
-    return SelectBinaryOp(I, ISD::FADD);
+    return selectBinaryOp(I, ISD::FADD);
   case Instruction::Sub:
-    return SelectBinaryOp(I, ISD::SUB);
+    return selectBinaryOp(I, ISD::SUB);
   case Instruction::FSub:
     // FNeg is currently represented in LLVM IR as a special case of FSub.
     if (BinaryOperator::isFNeg(I))
-      return SelectFNeg(I);
-    return SelectBinaryOp(I, ISD::FSUB);
+      return selectFNeg(I);
+    return selectBinaryOp(I, ISD::FSUB);
   case Instruction::Mul:
-    return SelectBinaryOp(I, ISD::MUL);
+    return selectBinaryOp(I, ISD::MUL);
   case Instruction::FMul:
-    return SelectBinaryOp(I, ISD::FMUL);
+    return selectBinaryOp(I, ISD::FMUL);
   case Instruction::SDiv:
-    return SelectBinaryOp(I, ISD::SDIV);
+    return selectBinaryOp(I, ISD::SDIV);
   case Instruction::UDiv:
-    return SelectBinaryOp(I, ISD::UDIV);
+    return selectBinaryOp(I, ISD::UDIV);
   case Instruction::FDiv:
-    return SelectBinaryOp(I, ISD::FDIV);
+    return selectBinaryOp(I, ISD::FDIV);
   case Instruction::SRem:
-    return SelectBinaryOp(I, ISD::SREM);
+    return selectBinaryOp(I, ISD::SREM);
   case Instruction::URem:
-    return SelectBinaryOp(I, ISD::UREM);
+    return selectBinaryOp(I, ISD::UREM);
   case Instruction::FRem:
-    return SelectBinaryOp(I, ISD::FREM);
+    return selectBinaryOp(I, ISD::FREM);
   case Instruction::Shl:
-    return SelectBinaryOp(I, ISD::SHL);
+    return selectBinaryOp(I, ISD::SHL);
   case Instruction::LShr:
-    return SelectBinaryOp(I, ISD::SRL);
+    return selectBinaryOp(I, ISD::SRL);
   case Instruction::AShr:
-    return SelectBinaryOp(I, ISD::SRA);
+    return selectBinaryOp(I, ISD::SRA);
   case Instruction::And:
-    return SelectBinaryOp(I, ISD::AND);
+    return selectBinaryOp(I, ISD::AND);
   case Instruction::Or:
-    return SelectBinaryOp(I, ISD::OR);
+    return selectBinaryOp(I, ISD::OR);
   case Instruction::Xor:
-    return SelectBinaryOp(I, ISD::XOR);
+    return selectBinaryOp(I, ISD::XOR);
 
   case Instruction::GetElementPtr:
-    return SelectGetElementPtr(I);
+    return selectGetElementPtr(I);
 
   case Instruction::Br: {
     const BranchInst *BI = cast<BranchInst>(I);
@@ -1141,7 +1507,7 @@
     if (BI->isUnconditional()) {
       const BasicBlock *LLVMSucc = BI->getSuccessor(0);
       MachineBasicBlock *MSucc = FuncInfo.MBBMap[LLVMSucc];
-      FastEmitBranch(MSucc, BI->getDebugLoc());
+      fastEmitBranch(MSucc, BI->getDebugLoc());
       return true;
     }
 
@@ -1152,7 +1518,7 @@
 
   case Instruction::Unreachable:
     if (TM.Options.TrapUnreachable)
-      return FastEmit_(MVT::Other, MVT::Other, ISD::TRAP) != 0;
+      return fastEmit_(MVT::Other, MVT::Other, ISD::TRAP) != 0;
     else
       return true;
 
@@ -1165,38 +1531,39 @@
     return false;
 
   case Instruction::Call:
-    return SelectCall(I);
+    return selectCall(I);
 
   case Instruction::BitCast:
-    return SelectBitCast(I);
+    return selectBitCast(I);
 
   case Instruction::FPToSI:
-    return SelectCast(I, ISD::FP_TO_SINT);
+    return selectCast(I, ISD::FP_TO_SINT);
   case Instruction::ZExt:
-    return SelectCast(I, ISD::ZERO_EXTEND);
+    return selectCast(I, ISD::ZERO_EXTEND);
   case Instruction::SExt:
-    return SelectCast(I, ISD::SIGN_EXTEND);
+    return selectCast(I, ISD::SIGN_EXTEND);
   case Instruction::Trunc:
-    return SelectCast(I, ISD::TRUNCATE);
+    return selectCast(I, ISD::TRUNCATE);
   case Instruction::SIToFP:
-    return SelectCast(I, ISD::SINT_TO_FP);
+    return selectCast(I, ISD::SINT_TO_FP);
 
   case Instruction::IntToPtr: // Deliberate fall-through.
   case Instruction::PtrToInt: {
     EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
     EVT DstVT = TLI.getValueType(I->getType());
     if (DstVT.bitsGT(SrcVT))
-      return SelectCast(I, ISD::ZERO_EXTEND);
+      return selectCast(I, ISD::ZERO_EXTEND);
     if (DstVT.bitsLT(SrcVT))
-      return SelectCast(I, ISD::TRUNCATE);
+      return selectCast(I, ISD::TRUNCATE);
     unsigned Reg = getRegForValue(I->getOperand(0));
-    if (Reg == 0) return false;
-    UpdateValueMap(I, Reg);
+    if (!Reg)
+      return false;
+    updateValueMap(I, Reg);
     return true;
   }
 
   case Instruction::ExtractValue:
-    return SelectExtractValue(I);
+    return selectExtractValue(I);
 
   case Instruction::PHI:
     llvm_unreachable("FastISel shouldn't visit PHI nodes!");
@@ -1207,83 +1574,72 @@
   }
 }
 
-FastISel::FastISel(FunctionLoweringInfo &funcInfo,
-                   const TargetLibraryInfo *libInfo)
-  : FuncInfo(funcInfo),
-    MF(funcInfo.MF),
-    MRI(FuncInfo.MF->getRegInfo()),
-    MFI(*FuncInfo.MF->getFrameInfo()),
-    MCP(*FuncInfo.MF->getConstantPool()),
-    TM(FuncInfo.MF->getTarget()),
-    DL(*TM.getDataLayout()),
-    TII(*TM.getInstrInfo()),
-    TLI(*TM.getTargetLowering()),
-    TRI(*TM.getRegisterInfo()),
-    LibInfo(libInfo) {
-}
+FastISel::FastISel(FunctionLoweringInfo &FuncInfo,
+                   const TargetLibraryInfo *LibInfo,
+                   bool SkipTargetIndependentISel)
+    : FuncInfo(FuncInfo), MF(FuncInfo.MF), MRI(FuncInfo.MF->getRegInfo()),
+      MFI(*FuncInfo.MF->getFrameInfo()), MCP(*FuncInfo.MF->getConstantPool()),
+      TM(FuncInfo.MF->getTarget()), DL(*MF->getSubtarget().getDataLayout()),
+      TII(*MF->getSubtarget().getInstrInfo()),
+      TLI(*MF->getSubtarget().getTargetLowering()),
+      TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),
+      SkipTargetIndependentISel(SkipTargetIndependentISel) {}
 
 FastISel::~FastISel() {}
 
-bool FastISel::FastLowerArguments() {
+bool FastISel::fastLowerArguments() { return false; }
+
+bool FastISel::fastLowerCall(CallLoweringInfo & /*CLI*/) { return false; }
+
+bool FastISel::fastLowerIntrinsicCall(const IntrinsicInst * /*II*/) {
   return false;
 }
 
-unsigned FastISel::FastEmit_(MVT, MVT,
-                             unsigned) {
+unsigned FastISel::fastEmit_(MVT, MVT, unsigned) { return 0; }
+
+unsigned FastISel::fastEmit_r(MVT, MVT, unsigned, unsigned /*Op0*/,
+                              bool /*Op0IsKill*/) {
   return 0;
 }
 
-unsigned FastISel::FastEmit_r(MVT, MVT,
-                              unsigned,
-                              unsigned /*Op0*/, bool /*Op0IsKill*/) {
+unsigned FastISel::fastEmit_rr(MVT, MVT, unsigned, unsigned /*Op0*/,
+                               bool /*Op0IsKill*/, unsigned /*Op1*/,
+                               bool /*Op1IsKill*/) {
   return 0;
 }
 
-unsigned FastISel::FastEmit_rr(MVT, MVT,
-                               unsigned,
-                               unsigned /*Op0*/, bool /*Op0IsKill*/,
-                               unsigned /*Op1*/, bool /*Op1IsKill*/) {
+unsigned FastISel::fastEmit_i(MVT, MVT, unsigned, uint64_t /*Imm*/) {
   return 0;
 }
 
-unsigned FastISel::FastEmit_i(MVT, MVT, unsigned, uint64_t /*Imm*/) {
+unsigned FastISel::fastEmit_f(MVT, MVT, unsigned,
+                              const ConstantFP * /*FPImm*/) {
   return 0;
 }
 
-unsigned FastISel::FastEmit_f(MVT, MVT,
-                              unsigned, const ConstantFP * /*FPImm*/) {
+unsigned FastISel::fastEmit_ri(MVT, MVT, unsigned, unsigned /*Op0*/,
+                               bool /*Op0IsKill*/, uint64_t /*Imm*/) {
   return 0;
 }
 
-unsigned FastISel::FastEmit_ri(MVT, MVT,
-                               unsigned,
-                               unsigned /*Op0*/, bool /*Op0IsKill*/,
-                               uint64_t /*Imm*/) {
-  return 0;
-}
-
-unsigned FastISel::FastEmit_rf(MVT, MVT,
-                               unsigned,
-                               unsigned /*Op0*/, bool /*Op0IsKill*/,
+unsigned FastISel::fastEmit_rf(MVT, MVT, unsigned, unsigned /*Op0*/,
+                               bool /*Op0IsKill*/,
                                const ConstantFP * /*FPImm*/) {
   return 0;
 }
 
-unsigned FastISel::FastEmit_rri(MVT, MVT,
-                                unsigned,
-                                unsigned /*Op0*/, bool /*Op0IsKill*/,
-                                unsigned /*Op1*/, bool /*Op1IsKill*/,
-                                uint64_t /*Imm*/) {
+unsigned FastISel::fastEmit_rri(MVT, MVT, unsigned, unsigned /*Op0*/,
+                                bool /*Op0IsKill*/, unsigned /*Op1*/,
+                                bool /*Op1IsKill*/, uint64_t /*Imm*/) {
   return 0;
 }
 
-/// FastEmit_ri_ - This method is a wrapper of FastEmit_ri. It first tries
-/// to emit an instruction with an immediate operand using FastEmit_ri.
+/// This method is a wrapper of fastEmit_ri. It first tries to emit an
+/// instruction with an immediate operand using fastEmit_ri.
 /// If that fails, it materializes the immediate into a register and try
-/// FastEmit_rr instead.
-unsigned FastISel::FastEmit_ri_(MVT VT, unsigned Opcode,
-                                unsigned Op0, bool Op0IsKill,
-                                uint64_t Imm, MVT ImmType) {
+/// fastEmit_rr instead.
+unsigned FastISel::fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0,
+                                bool Op0IsKill, uint64_t Imm, MVT ImmType) {
   // If this is a multiply by a power of two, emit this as a shift left.
   if (Opcode == ISD::MUL && isPowerOf2_64(Imm)) {
     Opcode = ISD::SHL;
@@ -1301,30 +1657,29 @@
     return 0;
 
   // First check if immediate type is legal. If not, we can't use the ri form.
-  unsigned ResultReg = FastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm);
-  if (ResultReg != 0)
+  unsigned ResultReg = fastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm);
+  if (ResultReg)
     return ResultReg;
-  unsigned MaterialReg = FastEmit_i(ImmType, ImmType, ISD::Constant, Imm);
-  if (MaterialReg == 0) {
+  unsigned MaterialReg = fastEmit_i(ImmType, ImmType, ISD::Constant, Imm);
+  if (!MaterialReg) {
     // This is a bit ugly/slow, but failing here means falling out of
     // fast-isel, which would be very slow.
-    IntegerType *ITy = IntegerType::get(FuncInfo.Fn->getContext(),
-                                              VT.getSizeInBits());
+    IntegerType *ITy =
+        IntegerType::get(FuncInfo.Fn->getContext(), VT.getSizeInBits());
     MaterialReg = getRegForValue(ConstantInt::get(ITy, Imm));
-    assert (MaterialReg != 0 && "Unable to materialize imm.");
-    if (MaterialReg == 0) return 0;
+    if (!MaterialReg)
+      return 0;
   }
-  return FastEmit_rr(VT, VT, Opcode,
-                     Op0, Op0IsKill,
-                     MaterialReg, /*Kill=*/true);
+  return fastEmit_rr(VT, VT, Opcode, Op0, Op0IsKill, MaterialReg,
+                     /*IsKill=*/true);
 }
 
-unsigned FastISel::createResultReg(const TargetRegisterClass* RC) {
+unsigned FastISel::createResultReg(const TargetRegisterClass *RC) {
   return MRI.createVirtualRegister(RC);
 }
 
-unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II,
-                                            unsigned Op, unsigned OpNum) {
+unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, unsigned Op,
+                                            unsigned OpNum) {
   if (TargetRegisterInfo::isVirtualRegister(Op)) {
     const TargetRegisterClass *RegClass =
         TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF);
@@ -1340,8 +1695,8 @@
   return Op;
 }
 
-unsigned FastISel::FastEmitInst_(unsigned MachineInstOpcode,
-                                 const TargetRegisterClass* RC) {
+unsigned FastISel::fastEmitInst_(unsigned MachineInstOpcode,
+                                 const TargetRegisterClass *RC) {
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
@@ -1349,9 +1704,9 @@
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_r(unsigned MachineInstOpcode,
-                                  const TargetRegisterClass *RC,
-                                  unsigned Op0, bool Op0IsKill) {
+unsigned FastISel::fastEmitInst_r(unsigned MachineInstOpcode,
+                                  const TargetRegisterClass *RC, unsigned Op0,
+                                  bool Op0IsKill) {
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
   unsigned ResultReg = createResultReg(RC);
@@ -1359,10 +1714,10 @@
 
   if (II.getNumDefs() >= 1)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
-      .addReg(Op0, Op0IsKill * RegState::Kill);
+        .addReg(Op0, getKillRegState(Op0IsKill));
   else {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-      .addReg(Op0, Op0IsKill * RegState::Kill);
+        .addReg(Op0, getKillRegState(Op0IsKill));
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
   }
@@ -1370,10 +1725,10 @@
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
-                                   const TargetRegisterClass *RC,
-                                   unsigned Op0, bool Op0IsKill,
-                                   unsigned Op1, bool Op1IsKill) {
+unsigned FastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC, unsigned Op0,
+                                   bool Op0IsKill, unsigned Op1,
+                                   bool Op1IsKill) {
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
   unsigned ResultReg = createResultReg(RC);
@@ -1382,23 +1737,23 @@
 
   if (II.getNumDefs() >= 1)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addReg(Op1, Op1IsKill * RegState::Kill);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill));
   else {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addReg(Op1, Op1IsKill * RegState::Kill);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill));
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
   }
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
-                                   const TargetRegisterClass *RC,
-                                   unsigned Op0, bool Op0IsKill,
-                                   unsigned Op1, bool Op1IsKill,
-                                   unsigned Op2, bool Op2IsKill) {
+unsigned FastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
+                                    const TargetRegisterClass *RC, unsigned Op0,
+                                    bool Op0IsKill, unsigned Op1,
+                                    bool Op1IsKill, unsigned Op2,
+                                    bool Op2IsKill) {
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
   unsigned ResultReg = createResultReg(RC);
@@ -1408,48 +1763,23 @@
 
   if (II.getNumDefs() >= 1)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addReg(Op1, Op1IsKill * RegState::Kill)
-      .addReg(Op2, Op2IsKill * RegState::Kill);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addReg(Op2, getKillRegState(Op2IsKill));
   else {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addReg(Op1, Op1IsKill * RegState::Kill)
-      .addReg(Op2, Op2IsKill * RegState::Kill);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addReg(Op2, getKillRegState(Op2IsKill));
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
   }
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
-                                   const TargetRegisterClass *RC,
-                                   unsigned Op0, bool Op0IsKill,
-                                   uint64_t Imm) {
-  const MCInstrDesc &II = TII.get(MachineInstOpcode);
-
-  unsigned ResultReg = createResultReg(RC);
-  RC = TII.getRegClass(II, II.getNumDefs(), &TRI, *FuncInfo.MF);
-  MRI.constrainRegClass(Op0, RC);
-
-  if (II.getNumDefs() >= 1)
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addImm(Imm);
-  else {
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addImm(Imm);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-            TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
-  }
-  return ResultReg;
-}
-
-unsigned FastISel::FastEmitInst_rii(unsigned MachineInstOpcode,
-                                   const TargetRegisterClass *RC,
-                                   unsigned Op0, bool Op0IsKill,
-                                   uint64_t Imm1, uint64_t Imm2) {
+unsigned FastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC, unsigned Op0,
+                                   bool Op0IsKill, uint64_t Imm) {
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
   unsigned ResultReg = createResultReg(RC);
@@ -1457,24 +1787,22 @@
 
   if (II.getNumDefs() >= 1)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addImm(Imm1)
-      .addImm(Imm2);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(Imm);
   else {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addImm(Imm1)
-      .addImm(Imm2);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(Imm);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
   }
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_rf(unsigned MachineInstOpcode,
-                                   const TargetRegisterClass *RC,
-                                   unsigned Op0, bool Op0IsKill,
-                                   const ConstantFP *FPImm) {
+unsigned FastISel::fastEmitInst_rii(unsigned MachineInstOpcode,
+                                    const TargetRegisterClass *RC, unsigned Op0,
+                                    bool Op0IsKill, uint64_t Imm1,
+                                    uint64_t Imm2) {
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
   unsigned ResultReg = createResultReg(RC);
@@ -1482,23 +1810,46 @@
 
   if (II.getNumDefs() >= 1)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addFPImm(FPImm);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(Imm1)
+        .addImm(Imm2);
   else {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addFPImm(FPImm);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(Imm1)
+        .addImm(Imm2);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
   }
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
-                                    const TargetRegisterClass *RC,
-                                    unsigned Op0, bool Op0IsKill,
-                                    unsigned Op1, bool Op1IsKill,
-                                    uint64_t Imm) {
+unsigned FastISel::fastEmitInst_rf(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC, unsigned Op0,
+                                   bool Op0IsKill, const ConstantFP *FPImm) {
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  unsigned ResultReg = createResultReg(RC);
+  Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addFPImm(FPImm);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addFPImm(FPImm);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_rri(unsigned MachineInstOpcode,
+                                    const TargetRegisterClass *RC, unsigned Op0,
+                                    bool Op0IsKill, unsigned Op1,
+                                    bool Op1IsKill, uint64_t Imm) {
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
   unsigned ResultReg = createResultReg(RC);
@@ -1507,25 +1858,25 @@
 
   if (II.getNumDefs() >= 1)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addReg(Op1, Op1IsKill * RegState::Kill)
-      .addImm(Imm);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addImm(Imm);
   else {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addReg(Op1, Op1IsKill * RegState::Kill)
-      .addImm(Imm);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addImm(Imm);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
   }
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_rrii(unsigned MachineInstOpcode,
+unsigned FastISel::fastEmitInst_rrii(unsigned MachineInstOpcode,
                                      const TargetRegisterClass *RC,
-                                     unsigned Op0, bool Op0IsKill,
-                                     unsigned Op1, bool Op1IsKill,
-                                     uint64_t Imm1, uint64_t Imm2) {
+                                     unsigned Op0, bool Op0IsKill, unsigned Op1,
+                                     bool Op1IsKill, uint64_t Imm1,
+                                     uint64_t Imm2) {
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
   unsigned ResultReg = createResultReg(RC);
@@ -1534,28 +1885,30 @@
 
   if (II.getNumDefs() >= 1)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addReg(Op1, Op1IsKill * RegState::Kill)
-      .addImm(Imm1).addImm(Imm2);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addImm(Imm1)
+        .addImm(Imm2);
   else {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
-      .addReg(Op0, Op0IsKill * RegState::Kill)
-      .addReg(Op1, Op1IsKill * RegState::Kill)
-      .addImm(Imm1).addImm(Imm2);
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addReg(Op1, getKillRegState(Op1IsKill))
+        .addImm(Imm1)
+        .addImm(Imm2);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
   }
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode,
-                                  const TargetRegisterClass *RC,
-                                  uint64_t Imm) {
+unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode,
+                                  const TargetRegisterClass *RC, uint64_t Imm) {
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
   if (II.getNumDefs() >= 1)
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg).addImm(Imm);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+        .addImm(Imm);
   else {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addImm(Imm);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -1564,41 +1917,41 @@
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_ii(unsigned MachineInstOpcode,
-                                  const TargetRegisterClass *RC,
-                                  uint64_t Imm1, uint64_t Imm2) {
+unsigned FastISel::fastEmitInst_ii(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC, uint64_t Imm1,
+                                   uint64_t Imm2) {
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
   if (II.getNumDefs() >= 1)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
-      .addImm(Imm1).addImm(Imm2);
+        .addImm(Imm1)
+        .addImm(Imm2);
   else {
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addImm(Imm1).addImm(Imm2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addImm(Imm1)
+        .addImm(Imm2);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
   }
   return ResultReg;
 }
 
-unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT,
-                                              unsigned Op0, bool Op0IsKill,
-                                              uint32_t Idx) {
+unsigned FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0,
+                                              bool Op0IsKill, uint32_t Idx) {
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
   assert(TargetRegisterInfo::isVirtualRegister(Op0) &&
          "Cannot yet extract from physregs");
   const TargetRegisterClass *RC = MRI.getRegClass(Op0);
   MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
-          DbgLoc, TII.get(TargetOpcode::COPY), ResultReg)
-    .addReg(Op0, getKillRegState(Op0IsKill), Idx);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+          ResultReg).addReg(Op0, getKillRegState(Op0IsKill), Idx);
   return ResultReg;
 }
 
-/// FastEmitZExtFromI1 - Emit MachineInstrs to compute the value of Op
-/// with all but the least significant bit set to zero.
-unsigned FastISel::FastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) {
-  return FastEmit_ri(VT, VT, ISD::AND, Op0, Op0IsKill, 1);
+/// Emit MachineInstrs to compute the value of Op with all but the least
+/// significant bit set to zero.
+unsigned FastISel::fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) {
+  return fastEmit_ri(VT, VT, ISD::AND, Op0, Op0IsKill, 1);
 }
 
 /// HandlePHINodesInSuccessorBlocks - Handle PHI nodes in successor blocks.
@@ -1607,22 +1960,24 @@
 /// nodes as input.  We cannot just directly add them, because expansion
 /// might result in multiple MBB's for one BB.  As such, the start of the
 /// BB might correspond to a different MBB than the end.
-bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
+bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
   const TerminatorInst *TI = LLVMBB->getTerminator();
 
   SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
-  unsigned OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size();
+  FuncInfo.OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size();
 
   // Check successor nodes' PHI nodes that expect a constant to be available
   // from this block.
   for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
     const BasicBlock *SuccBB = TI->getSuccessor(succ);
-    if (!isa<PHINode>(SuccBB->begin())) continue;
+    if (!isa<PHINode>(SuccBB->begin()))
+      continue;
     MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
 
     // If this terminator has multiple identical successors (common for
     // switches), only handle each succ once.
-    if (!SuccsHandled.insert(SuccMBB)) continue;
+    if (!SuccsHandled.insert(SuccMBB).second)
+      continue;
 
     MachineBasicBlock::iterator MBBI = SuccMBB->begin();
 
@@ -1630,10 +1985,11 @@
     // nodes and Machine PHI nodes, but the incoming operands have not been
     // emitted yet.
     for (BasicBlock::const_iterator I = SuccBB->begin();
-         const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+         const auto *PN = dyn_cast<PHINode>(I); ++I) {
 
       // Ignore dead phi's.
-      if (PN->use_empty()) continue;
+      if (PN->use_empty())
+        continue;
 
       // Only handle legal types. Two interesting things to note here. First,
       // by bailing out early, we may leave behind some dead instructions,
@@ -1644,10 +2000,8 @@
       EVT VT = TLI.getValueType(PN->getType(), /*AllowUnknown=*/true);
       if (VT == MVT::Other || !TLI.isTypeLegal(VT)) {
         // Handle integer promotions, though, because they're common and easy.
-        if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
-          VT = TLI.getTypeToTransformTo(LLVMBB->getContext(), VT);
-        else {
-          FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate);
+        if (!(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) {
+          FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
           return false;
         }
       }
@@ -1657,12 +2011,12 @@
       // Set the DebugLoc for the copy. Prefer the location of the operand
       // if there is one; use the location of the PHI otherwise.
       DbgLoc = PN->getDebugLoc();
-      if (const Instruction *Inst = dyn_cast<Instruction>(PHIOp))
+      if (const auto *Inst = dyn_cast<Instruction>(PHIOp))
         DbgLoc = Inst->getDebugLoc();
 
       unsigned Reg = getRegForValue(PHIOp);
-      if (Reg == 0) {
-        FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate);
+      if (!Reg) {
+        FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
         return false;
       }
       FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg));
@@ -1675,17 +2029,17 @@
 
 bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) {
   assert(LI->hasOneUse() &&
-      "tryToFoldLoad expected a LoadInst with a single use");
+         "tryToFoldLoad expected a LoadInst with a single use");
   // We know that the load has a single use, but don't know what it is.  If it
   // isn't one of the folded instructions, then we can't succeed here.  Handle
   // this by scanning the single-use users of the load until we get to FoldInst.
-  unsigned MaxUsers = 6;  // Don't scan down huge single-use chains of instrs.
+  unsigned MaxUsers = 6; // Don't scan down huge single-use chains of instrs.
 
   const Instruction *TheUser = LI->user_back();
-  while (TheUser != FoldInst &&   // Scan up until we find FoldInst.
+  while (TheUser != FoldInst && // Scan up until we find FoldInst.
          // Stay in the right block.
          TheUser->getParent() == FoldInst->getParent() &&
-         --MaxUsers) {  // Don't scan too far.
+         --MaxUsers) { // Don't scan too far.
     // If there are multiple or no uses of this instruction, then bail out.
     if (!TheUser->hasOneUse())
       return false;
@@ -1707,7 +2061,7 @@
   // then there actually was no reference to it.  Perhaps the load is referenced
   // by a dead instruction.
   unsigned LoadReg = getRegForValue(LI);
-  if (LoadReg == 0)
+  if (!LoadReg)
     return false;
 
   // We can't fold if this vreg has no uses or more than one use.  Multiple uses
@@ -1765,19 +2119,20 @@
     Flags = MachineMemOperand::MOStore;
     Ptr = SI->getPointerOperand();
     ValTy = SI->getValueOperand()->getType();
-  } else {
+  } else
     return nullptr;
-  }
 
-  bool IsNonTemporal = I->getMetadata("nontemporal") != nullptr;
-  bool IsInvariant = I->getMetadata("invariant.load") != nullptr;
-  const MDNode *TBAAInfo = I->getMetadata(LLVMContext::MD_tbaa);
+  bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal) != nullptr;
+  bool IsInvariant = I->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
   const MDNode *Ranges = I->getMetadata(LLVMContext::MD_range);
 
-  if (Alignment == 0)  // Ensure that codegen never sees alignment 0.
+  AAMDNodes AAInfo;
+  I->getAAMetadata(AAInfo);
+
+  if (Alignment == 0) // Ensure that codegen never sees alignment 0.
     Alignment = DL.getABITypeAlignment(ValTy);
 
-  unsigned Size = TM.getDataLayout()->getTypeStoreSize(ValTy);
+  unsigned Size = DL.getTypeStoreSize(ValTy);
 
   if (IsVolatile)
     Flags |= MachineMemOperand::MOVolatile;
@@ -1787,5 +2142,45 @@
     Flags |= MachineMemOperand::MOInvariant;
 
   return FuncInfo.MF->getMachineMemOperand(MachinePointerInfo(Ptr), Flags, Size,
-                                           Alignment, TBAAInfo, Ranges);
+                                           Alignment, AAInfo, Ranges);
+}
+
+CmpInst::Predicate FastISel::optimizeCmpPredicate(const CmpInst *CI) const {
+  // If both operands are the same, then try to optimize or fold the cmp.
+  CmpInst::Predicate Predicate = CI->getPredicate();
+  if (CI->getOperand(0) != CI->getOperand(1))
+    return Predicate;
+
+  switch (Predicate) {
+  default: llvm_unreachable("Invalid predicate!");
+  case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::FCMP_OEQ:   Predicate = CmpInst::FCMP_ORD;   break;
+  case CmpInst::FCMP_OGT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::FCMP_OGE:   Predicate = CmpInst::FCMP_ORD;   break;
+  case CmpInst::FCMP_OLT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::FCMP_OLE:   Predicate = CmpInst::FCMP_ORD;   break;
+  case CmpInst::FCMP_ONE:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::FCMP_ORD:   Predicate = CmpInst::FCMP_ORD;   break;
+  case CmpInst::FCMP_UNO:   Predicate = CmpInst::FCMP_UNO;   break;
+  case CmpInst::FCMP_UEQ:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::FCMP_UGT:   Predicate = CmpInst::FCMP_UNO;   break;
+  case CmpInst::FCMP_UGE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::FCMP_ULT:   Predicate = CmpInst::FCMP_UNO;   break;
+  case CmpInst::FCMP_ULE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::FCMP_UNE:   Predicate = CmpInst::FCMP_UNO;   break;
+  case CmpInst::FCMP_TRUE:  Predicate = CmpInst::FCMP_TRUE;  break;
+
+  case CmpInst::ICMP_EQ:    Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::ICMP_NE:    Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_UGT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_UGE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::ICMP_ULT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_ULE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::ICMP_SGT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_SGE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  case CmpInst::ICMP_SLT:   Predicate = CmpInst::FCMP_FALSE; break;
+  case CmpInst::ICMP_SLE:   Predicate = CmpInst::FCMP_TRUE;  break;
+  }
+
+  return Predicate;
 }

diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index ae124e8..86b9542 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp

@@ -36,6 +36,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -55,58 +56,71 @@
   return false;
 }
 
+static ISD::NodeType getPreferredExtendForValue(const Value *V) {
+  // Among the users of the value that are compare instructions, if there are
+  // more signed predicates than unsigned predicates, we prefer to use
+  // SIGN_EXTEND.
+  //
+  // With this optimization, we are able to remove some redundant sign or
+  // zero extension instructions, and eventually more machine CSE opportunities
+  // can be exposed.
+  ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+  unsigned NumOfSigned = 0, NumOfUnsigned = 0;
+  for (const User *U : V->users()) {
+    if (const auto *CI = dyn_cast<CmpInst>(U)) {
+      NumOfSigned += CI->isSigned();
+      NumOfUnsigned += CI->isUnsigned();
+    }
+  }
+  if (NumOfSigned > NumOfUnsigned)
+    ExtendKind = ISD::SIGN_EXTEND;
+
+  return ExtendKind;
+}
+
 void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
                                SelectionDAG *DAG) {
-  const TargetLowering *TLI = TM.getTargetLowering();
-
   Fn = &fn;
   MF = &mf;
+  TLI = MF->getSubtarget().getTargetLowering();
   RegInfo = &MF->getRegInfo();
 
   // Check whether the function can return without sret-demotion.
   SmallVector<ISD::OutputArg, 4> Outs;
   GetReturnInfo(Fn->getReturnType(), Fn->getAttributes(), Outs, *TLI);
   CanLowerReturn = TLI->CanLowerReturn(Fn->getCallingConv(), *MF,
-                                       Fn->isVarArg(),
-                                       Outs, Fn->getContext());
+                                       Fn->isVarArg(), Outs, Fn->getContext());
 
   // Initialize the mapping of values to registers.  This is only set up for
   // instruction values that are used outside of the block that defines
   // them.
   Function::const_iterator BB = Fn->begin(), EB = Fn->end();
-  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
-    if (const AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
-      // Don't fold inalloca allocas or other dynamic allocas into the initial
-      // stack frame allocation, even if they are in the entry block.
-      if (!AI->isStaticAlloca())
-        continue;
-
-      if (const ConstantInt *CUI = dyn_cast<ConstantInt>(AI->getArraySize())) {
-        Type *Ty = AI->getAllocatedType();
-        uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty);
-        unsigned Align =
-          std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty),
-                   AI->getAlignment());
-
-        TySize *= CUI->getZExtValue();   // Get total allocated size.
-        if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects.
-
-        StaticAllocaMap[AI] =
-          MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI);
-      }
-    }
-
   for (; BB != EB; ++BB)
     for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
          I != E; ++I) {
-      // Look for dynamic allocas.
       if (const AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
-        if (!AI->isStaticAlloca()) {
+        // Static allocas can be folded into the initial stack frame adjustment.
+        if (AI->isStaticAlloca()) {
+          const ConstantInt *CUI = cast<ConstantInt>(AI->getArraySize());
+          Type *Ty = AI->getAllocatedType();
+          uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty);
+          unsigned Align =
+              std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty),
+                       AI->getAlignment());
+
+          TySize *= CUI->getZExtValue();   // Get total allocated size.
+          if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects.
+
+          StaticAllocaMap[AI] =
+            MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI);
+
+        } else {
           unsigned Align = std::max(
               (unsigned)TLI->getDataLayout()->getPrefTypeAlignment(
                 AI->getAllocatedType()),
               AI->getAlignment());
-          unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+          unsigned StackAlign =
+              MF->getSubtarget().getFrameLowering()->getStackAlignment();
           if (Align <= StackAlign)
             Align = 0;
           // Inform the Frame Information that we have variable-sized objects.
@@ -126,9 +140,9 @@
             if (Op.Type == InlineAsm::isClobber) {
               // Clobbers don't have SDValue operands, hence SDValue().
               TLI->ComputeConstraintToUse(Op, SDValue(), DAG);
-              std::pair<unsigned, const TargetRegisterClass*> PhysReg =
-                TLI->getRegForInlineAsmConstraint(Op.ConstraintCode,
-                                                  Op.ConstraintVT);
+              std::pair<unsigned, const TargetRegisterClass *> PhysReg =
+                  TLI->getRegForInlineAsmConstraint(Op.ConstraintCode,
+                                                   Op.ConstraintVT);
               if (PhysReg.first == SP)
                 MF->getFrameInfo()->setHasInlineAsmWithSPAdjust(true);
             }
@@ -136,6 +150,21 @@
         }
       }
 
+      // Look for calls to the @llvm.va_start intrinsic. We can omit some
+      // prologue boilerplate for variadic functions that don't examine their
+      // arguments.
+      if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
+        if (II->getIntrinsicID() == Intrinsic::vastart)
+          MF->getFrameInfo()->setHasVAStart(true);
+      }
+
+      // If we have a musttail call in a variadic function, we need to ensure we
+      // forward implicit register parameters.
+      if (const auto *CI = dyn_cast<CallInst>(I)) {
+        if (CI->isMustTailCall() && Fn->isVarArg())
+          MF->getFrameInfo()->setHasMustTailInVarArgFunc(true);
+      }
+
       // Mark values used outside their block as exported, by allocating
       // a virtual register for them.
       if (isUsedOutsideOfDefiningBlock(I))
@@ -166,13 +195,16 @@
                 StaticAllocaMap.find(AI);
               if (SI != StaticAllocaMap.end()) { // Check for VLAs.
                 int FI = SI->second;
-                MMI.setVariableDbgInfo(DI->getVariable(),
+                MMI.setVariableDbgInfo(DI->getVariable(), DI->getExpression(),
                                        FI, DI->getDebugLoc());
               }
             }
           }
         }
       }
+
+      // Decide the preferred extend type for a value.
+      PreferredExtendType[I] = getPreferredExtendForValue(I);
     }
 
   // Create an initial MachineBasicBlock for each LLVM BasicBlock in F.  This
@@ -208,7 +240,7 @@
       for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
         EVT VT = ValueVTs[vti];
         unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT);
-        const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+        const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
         for (unsigned i = 0; i != NumRegisters; ++i)
           BuildMI(MBB, DL, TII->get(TargetOpcode::PHI), PHIReg + i);
         PHIReg += NumRegisters;
@@ -241,12 +273,13 @@
   ArgDbgValues.clear();
   ByValArgFrameIndexMap.clear();
   RegFixups.clear();
+  PreferredExtendType.clear();
 }
 
 /// CreateReg - Allocate a single virtual register for the given type.
 unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
-  return RegInfo->
-    createVirtualRegister(TM.getTargetLowering()->getRegClassFor(VT));
+  return RegInfo->createVirtualRegister(
+      MF->getSubtarget().getTargetLowering()->getRegClassFor(VT));
 }
 
 /// CreateRegs - Allocate the appropriate number of virtual registers of
@@ -257,7 +290,7 @@
 /// will assign registers for each member or element.
 ///
 unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) {
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
 
   SmallVector<EVT, 4> ValueVTs;
   ComputeValueVTs(*TLI, Ty, ValueVTs);
@@ -306,8 +339,6 @@
   if (!Ty->isIntegerTy() || Ty->isVectorTy())
     return;
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-
   SmallVector<EVT, 1> ValueVTs;
   ComputeValueVTs(*TLI, Ty, ValueVTs);
   assert(ValueVTs.size() == 1 &&
@@ -452,7 +483,7 @@
 
   // Gather all the type infos for this landing pad and pass them along to
   // MachineModuleInfo.
-  std::vector<const GlobalVariable *> TyInfo;
+  std::vector<const GlobalValue *> TyInfo;
   unsigned N = I.getNumArgOperands();
 
   for (unsigned i = N - 1; i > 1; --i) {
@@ -510,14 +541,14 @@
     Value *Val = I.getClause(i - 1);
     if (I.isCatch(i - 1)) {
       MMI.addCatchTypeInfo(MBB,
-                           dyn_cast<GlobalVariable>(Val->stripPointerCasts()));
+                           dyn_cast<GlobalValue>(Val->stripPointerCasts()));
     } else {
       // Add filters in a list.
       Constant *CVal = cast<Constant>(Val);
-      SmallVector<const GlobalVariable*, 4> FilterList;
+      SmallVector<const GlobalValue*, 4> FilterList;
       for (User::op_iterator
              II = CVal->op_begin(), IE = CVal->op_end(); II != IE; ++II)
-        FilterList.push_back(cast<GlobalVariable>((*II)->stripPointerCasts()));
+        FilterList.push_back(cast<GlobalValue>((*II)->stripPointerCasts()));
 
       MMI.addFilterTypeInfo(MBB, FilterList);
     }

diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 7c124b8..a65f33e 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp

@@ -27,7 +27,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "instr-emitter"
@@ -265,12 +265,16 @@
       MIB.addReg(VRBase, RegState::Define);
     }
 
-    SDValue Op(Node, i);
-    if (IsClone)
-      VRBaseMap.erase(Op);
-    bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
-    (void)isNew; // Silence compiler warning.
-    assert(isNew && "Node emitted out of order - early");
+    // If this def corresponds to a result of the SDNode, insert the VRBase into
+    // the lookup map.
+    if (i < NumResults) {
+      SDValue Op(Node, i);
+      if (IsClone)
+        VRBaseMap.erase(Op);
+      bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+      (void)isNew; // Silence compiler warning.
+      assert(isNew && "Node emitted out of order - early");
+    }
   }
 }
 
@@ -402,10 +406,10 @@
     Type *Type = CP->getType();
     // MachineConstantPool wants an explicit alignment.
     if (Align == 0) {
-      Align = TM->getDataLayout()->getPrefTypeAlignment(Type);
+      Align = MF->getSubtarget().getDataLayout()->getPrefTypeAlignment(Type);
       if (Align == 0) {
         // Alignment of vector types.  FIXME!
-        Align = TM->getDataLayout()->getTypeAllocSize(Type);
+        Align = MF->getSubtarget().getDataLayout()->getTypeAllocSize(Type);
       }
     }
 
@@ -643,14 +647,18 @@
 InstrEmitter::EmitDbgValue(SDDbgValue *SD,
                            DenseMap<SDValue, unsigned> &VRBaseMap) {
   uint64_t Offset = SD->getOffset();
-  MDNode* MDPtr = SD->getMDPtr();
+  MDNode *Var = SD->getVariable();
+  MDNode *Expr = SD->getExpression();
   DebugLoc DL = SD->getDebugLoc();
 
   if (SD->getKind() == SDDbgValue::FRAMEIX) {
     // Stack address; this needs to be lowered in target-dependent fashion.
     // EmitTargetCodeForFrameDebugValue is responsible for allocation.
     return BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE))
-        .addFrameIndex(SD->getFrameIx()).addImm(Offset).addMetadata(MDPtr);
+        .addFrameIndex(SD->getFrameIx())
+        .addImm(Offset)
+        .addMetadata(Var)
+        .addMetadata(Expr);
   }
   // Otherwise, we're going to create an instruction here.
   const MCInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE);
@@ -696,7 +704,8 @@
     MIB.addReg(0U, RegState::Debug);
   }
 
-  MIB.addMetadata(MDPtr);
+  MIB.addMetadata(Var);
+  MIB.addMetadata(Expr);
 
   return &*MIB;
 }
@@ -859,9 +868,7 @@
     MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
 
   // Run post-isel target hook to adjust this instruction if needed.
-#ifdef NDEBUG
   if (II.hasPostISelHook())
-#endif
     TLI->AdjustInstrPostInstrSelection(MIB, Node);
 }
 
@@ -1013,11 +1020,8 @@
 /// at the given position in the given block.
 InstrEmitter::InstrEmitter(MachineBasicBlock *mbb,
                            MachineBasicBlock::iterator insertpos)
-  : MF(mbb->getParent()),
-    MRI(&MF->getRegInfo()),
-    TM(&MF->getTarget()),
-    TII(TM->getInstrInfo()),
-    TRI(TM->getRegisterInfo()),
-    TLI(TM->getTargetLowering()),
-    MBB(mbb), InsertPos(insertpos) {
-}
+    : MF(mbb->getParent()), MRI(&MF->getRegInfo()),
+      TII(MF->getSubtarget().getInstrInfo()),
+      TRI(MF->getSubtarget().getRegisterInfo()),
+      TLI(MF->getSubtarget().getTargetLowering()), MBB(mbb),
+      InsertPos(insertpos) {}

diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h
index 920dda8..7b86f7d 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef INSTREMITTER_H
-#define INSTREMITTER_H
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_INSTREMITTER_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_INSTREMITTER_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -29,7 +29,6 @@
 class InstrEmitter {
   MachineFunction *MF;
   MachineRegisterInfo *MRI;
-  const TargetMachine *TM;
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const TargetLowering *TLI;

diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c0e8c8c..5d17a5f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -33,8 +34,11 @@
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "legalizedag"
+
 //===----------------------------------------------------------------------===//
 /// SelectionDAGLegalize - This takes an arbitrary SelectionDAG as input and
 /// hacks on it until the target machine can handle it.  This involves
@@ -48,16 +52,17 @@
 /// will attempt merge setcc and brc instructions into brcc's.
 ///
 namespace {
-class SelectionDAGLegalize : public SelectionDAG::DAGUpdateListener {
+class SelectionDAGLegalize {
   const TargetMachine &TM;
   const TargetLowering &TLI;
   SelectionDAG &DAG;
 
-  /// LegalizePosition - The iterator for walking through the node list.
-  SelectionDAG::allnodes_iterator LegalizePosition;
+  /// \brief The set of nodes which have already been legalized. We hold a
+  /// reference to it in order to update as necessary on node deletion.
+  SmallPtrSetImpl<SDNode *> &LegalizedNodes;
 
-  /// LegalizedNodes - The set of nodes which have already been legalized.
-  SmallPtrSet<SDNode *, 16> LegalizedNodes;
+  /// \brief A set of all the nodes updated during legalization.
+  SmallSetVector<SDNode *, 16> *UpdatedNodes;
 
   EVT getSetCCResultType(EVT VT) const {
     return TLI.getSetCCResultType(*DAG.getContext(), VT);
@@ -66,14 +71,16 @@
   // Libcall insertion helpers.
 
 public:
-  explicit SelectionDAGLegalize(SelectionDAG &DAG);
+  SelectionDAGLegalize(SelectionDAG &DAG,
+                       SmallPtrSetImpl<SDNode *> &LegalizedNodes,
+                       SmallSetVector<SDNode *, 16> *UpdatedNodes = nullptr)
+      : TM(DAG.getTarget()), TLI(DAG.getTargetLoweringInfo()), DAG(DAG),
+        LegalizedNodes(LegalizedNodes), UpdatedNodes(UpdatedNodes) {}
 
-  void LegalizeDAG();
-
-private:
-  /// LegalizeOp - Legalizes the given operation.
+  /// \brief Legalizes the given operation.
   void LegalizeOp(SDNode *Node);
 
+private:
   SDValue OptimizeFloatStore(StoreSDNode *ST);
 
   void LegalizeLoadOps(SDNode *Node);
@@ -145,37 +152,49 @@
   void ExpandNode(SDNode *Node);
   void PromoteNode(SDNode *Node);
 
-  void ForgetNode(SDNode *N) {
-    LegalizedNodes.erase(N);
-    if (LegalizePosition == SelectionDAG::allnodes_iterator(N))
-      ++LegalizePosition;
-  }
-
 public:
-  // DAGUpdateListener implementation.
-  void NodeDeleted(SDNode *N, SDNode *E) override {
-    ForgetNode(N);
-  }
-  void NodeUpdated(SDNode *N) override {}
-
   // Node replacement helpers
   void ReplacedNode(SDNode *N) {
-    if (N->use_empty()) {
-      DAG.RemoveDeadNode(N);
-    } else {
-      ForgetNode(N);
-    }
+    LegalizedNodes.erase(N);
+    if (UpdatedNodes)
+      UpdatedNodes->insert(N);
   }
   void ReplaceNode(SDNode *Old, SDNode *New) {
+    DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+          dbgs() << "     with:      "; New->dump(&DAG));
+
+    assert(Old->getNumValues() == New->getNumValues() &&
+           "Replacing one node with another that produces a different number "
+           "of values!");
     DAG.ReplaceAllUsesWith(Old, New);
+    for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i)
+      DAG.TransferDbgValues(SDValue(Old, i), SDValue(New, i));
+    if (UpdatedNodes)
+      UpdatedNodes->insert(New);
     ReplacedNode(Old);
   }
   void ReplaceNode(SDValue Old, SDValue New) {
+    DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+          dbgs() << "     with:      "; New->dump(&DAG));
+
     DAG.ReplaceAllUsesWith(Old, New);
+    DAG.TransferDbgValues(Old, New);
+    if (UpdatedNodes)
+      UpdatedNodes->insert(New.getNode());
     ReplacedNode(Old.getNode());
   }
   void ReplaceNode(SDNode *Old, const SDValue *New) {
+    DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG));
+
     DAG.ReplaceAllUsesWith(Old, New);
+    for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) {
+      DEBUG(dbgs() << (i == 0 ? "     with:      "
+                              : "      and:      ");
+            New[i]->dump(&DAG));
+      DAG.TransferDbgValues(SDValue(Old, i), New[i]);
+      if (UpdatedNodes)
+        UpdatedNodes->insert(New[i].getNode());
+    }
     ReplacedNode(Old);
   }
 };
@@ -213,40 +232,6 @@
   return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]);
 }
 
-SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag)
-  : SelectionDAG::DAGUpdateListener(dag),
-    TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
-    DAG(dag) {
-}
-
-void SelectionDAGLegalize::LegalizeDAG() {
-  DAG.AssignTopologicalOrder();
-
-  // Visit all the nodes. We start in topological order, so that we see
-  // nodes with their original operands intact. Legalization can produce
-  // new nodes which may themselves need to be legalized. Iterate until all
-  // nodes have been legalized.
-  for (;;) {
-    bool AnyLegalized = false;
-    for (LegalizePosition = DAG.allnodes_end();
-         LegalizePosition != DAG.allnodes_begin(); ) {
-      --LegalizePosition;
-
-      SDNode *N = LegalizePosition;
-      if (LegalizedNodes.insert(N)) {
-        AnyLegalized = true;
-        LegalizeOp(N);
-      }
-    }
-    if (!AnyLegalized)
-      break;
-
-  }
-
-  // Remove dead nodes now.
-  DAG.RemoveDeadNodes();
-}
-
 /// ExpandConstantFP - Expands the ConstantFP node to an integer constant or
 /// a load from the constant pool.
 SDValue
@@ -270,7 +255,7 @@
 
   EVT OrigVT = VT;
   EVT SVT = VT;
-  while (SVT != MVT::f32) {
+  while (SVT != MVT::f32 && SVT != MVT::f16) {
     SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1);
     if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) &&
         // Only do this if the target has a native EXTLOAD instruction from
@@ -291,7 +276,7 @@
       DAG.getExtLoad(ISD::EXTLOAD, dl, OrigVT,
                      DAG.getEntryNode(),
                      CPIdx, MachinePointerInfo::getConstantPool(),
-                     VT, false, false, Alignment);
+                     VT, false, false, false, Alignment);
     return Result;
   }
   SDValue Result =
@@ -377,7 +362,7 @@
     // Load from the stack slot.
     SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
                                   MachinePointerInfo(),
-                                  MemVT, false, false, 0);
+                                  MemVT, false, false, false, 0);
 
     Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
                                        ST->getPointerInfo()
@@ -385,7 +370,7 @@
                                        MemVT, ST->isVolatile(),
                                        ST->isNonTemporal(),
                                        MinAlign(ST->getAlignment(), Offset),
-                                       ST->getTBAAInfo()));
+                                       ST->getAAInfo()));
     // The order of the stores doesn't matter - say it with a TokenFactor.
     SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
     DAGLegalize->ReplaceNode(SDValue(ST, 0), Result);
@@ -417,7 +402,7 @@
   Store2 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Hi:Lo, Ptr,
                              ST->getPointerInfo().getWithOffset(IncrementSize),
                              NewStoredVT, ST->isVolatile(), ST->isNonTemporal(),
-                             Alignment, ST->getTBAAInfo());
+                             Alignment, ST->getAAInfo());
 
   SDValue Result =
     DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
@@ -476,7 +461,7 @@
                                  LD->isVolatile(), LD->isNonTemporal(),
                                  LD->isInvariant(),
                                  MinAlign(LD->getAlignment(), Offset),
-                                 LD->getTBAAInfo());
+                                 LD->getAAInfo());
       // Follow the load with a store to the stack slot.  Remember the store.
       Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
                                     MachinePointerInfo(), false, false, 0));
@@ -494,8 +479,9 @@
                                   LD->getPointerInfo().getWithOffset(Offset),
                                   MemVT, LD->isVolatile(),
                                   LD->isNonTemporal(),
+                                  LD->isInvariant(),
                                   MinAlign(LD->getAlignment(), Offset),
-                                  LD->getTBAAInfo());
+                                  LD->getAAInfo());
     // Follow the load with a store to the stack slot.  Remember the store.
     // On big-endian machines this requires a truncating store to ensure
     // that the bits end up in the right place.
@@ -508,7 +494,8 @@
 
     // Finally, perform the original load only redirected to the stack slot.
     Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
-                          MachinePointerInfo(), LoadedVT, false, false, 0);
+                          MachinePointerInfo(), LoadedVT, false,false, false,
+                          0);
 
     // Callers expect a MERGE_VALUES node.
     ValResult = Load;
@@ -538,25 +525,27 @@
   if (TLI.isLittleEndian()) {
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), Alignment, LD->getTBAAInfo());
+                        LD->isNonTemporal(), LD->isInvariant(), Alignment,
+                        LD->getAAInfo());
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
                       DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
                         LD->getPointerInfo().getWithOffset(IncrementSize),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), MinAlign(Alignment, IncrementSize),
-                        LD->getTBAAInfo());
+                        LD->isNonTemporal(),LD->isInvariant(),
+                        MinAlign(Alignment, IncrementSize), LD->getAAInfo());
   } else {
     Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), Alignment, LD->getTBAAInfo());
+                        LD->isNonTemporal(), LD->isInvariant(), Alignment,
+                        LD->getAAInfo());
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
                       DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
                         LD->getPointerInfo().getWithOffset(IncrementSize),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), MinAlign(Alignment, IncrementSize),
-                        LD->getTBAAInfo());
+                        LD->isNonTemporal(), LD->isInvariant(),
+                        MinAlign(Alignment, IncrementSize), LD->getAAInfo());
   }
 
   // aggregate the two parts
@@ -659,7 +648,7 @@
   unsigned Alignment = ST->getAlignment();
   bool isVolatile = ST->isVolatile();
   bool isNonTemporal = ST->isNonTemporal();
-  const MDNode *TBAAInfo = ST->getTBAAInfo();
+  AAMDNodes AAInfo = ST->getAAInfo();
   SDLoc dl(ST);
   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
     if (CFP->getValueType(0) == MVT::f32 &&
@@ -668,7 +657,7 @@
                                       bitcastToAPInt().zextOrTrunc(32),
                               MVT::i32);
       return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
-                          isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                          isVolatile, isNonTemporal, Alignment, AAInfo);
     }
 
     if (CFP->getValueType(0) == MVT::f64) {
@@ -677,7 +666,7 @@
         SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
                                   zextOrTrunc(64), MVT::i64);
         return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
-                            isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                            isVolatile, isNonTemporal, Alignment, AAInfo);
       }
 
       if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) {
@@ -690,13 +679,13 @@
         if (TLI.isBigEndian()) std::swap(Lo, Hi);
 
         Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile,
-                          isNonTemporal, Alignment, TBAAInfo);
+                          isNonTemporal, Alignment, AAInfo);
         Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
                           DAG.getConstant(4, Ptr.getValueType()));
         Hi = DAG.getStore(Chain, dl, Hi, Ptr,
                           ST->getPointerInfo().getWithOffset(4),
                           isVolatile, isNonTemporal, MinAlign(Alignment, 4U),
-                          TBAAInfo);
+                          AAInfo);
 
         return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
       }
@@ -714,7 +703,7 @@
     unsigned Alignment = ST->getAlignment();
     bool isVolatile = ST->isVolatile();
     bool isNonTemporal = ST->isNonTemporal();
-    const MDNode *TBAAInfo = ST->getTBAAInfo();
+    AAMDNodes AAInfo = ST->getAAInfo();
 
     if (!ST->isTruncatingStore()) {
       if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
@@ -731,10 +720,11 @@
           // If this is an unaligned store and the target doesn't support it,
           // expand it.
           unsigned AS = ST->getAddressSpace();
-          if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT(), AS)) {
+          unsigned Align = ST->getAlignment();
+          if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) {
             Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
             unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
-            if (ST->getAlignment() < ABIAlignment)
+            if (Align < ABIAlignment)
               ExpandUnalignedStore(cast<StoreSDNode>(Node),
                                    DAG, TLI, this);
           }
@@ -754,7 +744,7 @@
           SDValue Result =
             DAG.getStore(Chain, dl, Value, Ptr,
                          ST->getPointerInfo(), isVolatile,
-                         isNonTemporal, Alignment, TBAAInfo);
+                         isNonTemporal, Alignment, AAInfo);
           ReplaceNode(SDValue(Node, 0), Result);
           break;
         }
@@ -777,7 +767,7 @@
         SDValue Result =
           DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
                             NVT, isVolatile, isNonTemporal, Alignment,
-                            TBAAInfo);
+                            AAInfo);
         ReplaceNode(SDValue(Node, 0), Result);
       } else if (StWidth & (StWidth - 1)) {
         // If not storing a power-of-2 number of bits, expand as two stores.
@@ -799,7 +789,7 @@
           Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
                                  RoundVT,
                                  isVolatile, isNonTemporal, Alignment,
-                                 TBAAInfo);
+                                 AAInfo);
 
           // Store the remaining ExtraWidth bits.
           IncrementSize = RoundWidth / 8;
@@ -811,7 +801,7 @@
           Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
                              ST->getPointerInfo().getWithOffset(IncrementSize),
                                  ExtraVT, isVolatile, isNonTemporal,
-                                 MinAlign(Alignment, IncrementSize), TBAAInfo);
+                                 MinAlign(Alignment, IncrementSize), AAInfo);
         } else {
           // Big endian - avoid unaligned stores.
           // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
@@ -821,7 +811,7 @@
                                    TLI.getShiftAmountTy(Value.getValueType())));
           Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
                                  RoundVT, isVolatile, isNonTemporal, Alignment,
-                                 TBAAInfo);
+                                 AAInfo);
 
           // Store the remaining ExtraWidth bits.
           IncrementSize = RoundWidth / 8;
@@ -830,7 +820,7 @@
           Lo = DAG.getTruncStore(Chain, dl, Value, Ptr,
                               ST->getPointerInfo().getWithOffset(IncrementSize),
                                  ExtraVT, isVolatile, isNonTemporal,
-                                 MinAlign(Alignment, IncrementSize), TBAAInfo);
+                                 MinAlign(Alignment, IncrementSize), AAInfo);
         }
 
         // The order of the stores doesn't matter.
@@ -842,12 +832,13 @@
         default: llvm_unreachable("This action is not supported yet!");
         case TargetLowering::Legal: {
           unsigned AS = ST->getAddressSpace();
+          unsigned Align = ST->getAlignment();
           // If this is an unaligned store and the target doesn't support it,
           // expand it.
-          if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT(), AS)) {
+          if (!TLI.allowsMisalignedMemoryAccesses(ST->getMemoryVT(), AS, Align)) {
             Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
             unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
-            if (ST->getAlignment() < ABIAlignment)
+            if (Align < ABIAlignment)
               ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
           }
           break;
@@ -868,7 +859,7 @@
           Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
           SDValue Result =
             DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
-                         isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                         isVolatile, isNonTemporal, Alignment, AAInfo);
           ReplaceNode(SDValue(Node, 0), Result);
           break;
         }
@@ -893,13 +884,14 @@
     default: llvm_unreachable("This action is not supported yet!");
     case TargetLowering::Legal: {
       unsigned AS = LD->getAddressSpace();
+      unsigned Align = LD->getAlignment();
       // If this is an unaligned load and the target doesn't support it,
       // expand it.
-      if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT(), AS)) {
+      if (!TLI.allowsMisalignedMemoryAccesses(LD->getMemoryVT(), AS, Align)) {
         Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
         unsigned ABIAlignment =
           TLI.getDataLayout()->getABITypeAlignment(Ty);
-        if (LD->getAlignment() < ABIAlignment){
+        if (Align < ABIAlignment){
           ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain);
         }
       }
@@ -928,6 +920,10 @@
       assert(RVal.getNode() != Node && "Load must be completely replaced");
       DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal);
       DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain);
+      if (UpdatedNodes) {
+        UpdatedNodes->insert(RVal.getNode());
+        UpdatedNodes->insert(RChain.getNode());
+      }
       ReplacedNode(Node);
     }
     return;
@@ -938,7 +934,8 @@
   unsigned Alignment = LD->getAlignment();
   bool isVolatile = LD->isVolatile();
   bool isNonTemporal = LD->isNonTemporal();
-  const MDNode *TBAAInfo = LD->getTBAAInfo();
+  bool isInvariant = LD->isInvariant();
+  AAMDNodes AAInfo = LD->getAAInfo();
 
   if (SrcWidth != SrcVT.getStoreSizeInBits() &&
       // Some targets pretend to have an i1 loading operation, and actually
@@ -965,7 +962,8 @@
     SDValue Result =
       DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
                      Chain, Ptr, LD->getPointerInfo(),
-                     NVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                     NVT, isVolatile, isNonTemporal, isInvariant, Alignment,
+                     AAInfo);
 
     Ch = Result.getValue(1); // The chain.
 
@@ -1002,7 +1000,7 @@
       Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0),
                           Chain, Ptr,
                           LD->getPointerInfo(), RoundVT, isVolatile,
-                          isNonTemporal, Alignment, TBAAInfo);
+                          isNonTemporal, isInvariant, Alignment, AAInfo);
 
       // Load the remaining ExtraWidth bits.
       IncrementSize = RoundWidth / 8;
@@ -1010,8 +1008,8 @@
                          DAG.getConstant(IncrementSize, Ptr.getValueType()));
       Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(IncrementSize),
-                          ExtraVT, isVolatile, isNonTemporal,
-                          MinAlign(Alignment, IncrementSize), TBAAInfo);
+                          ExtraVT, isVolatile, isNonTemporal, isInvariant,
+                          MinAlign(Alignment, IncrementSize), AAInfo);
 
       // Build a factor node to remember that this load is independent of
       // the other one.
@@ -1031,7 +1029,7 @@
       // Load the top RoundWidth bits.
       Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo(), RoundVT, isVolatile,
-                          isNonTemporal, Alignment, TBAAInfo);
+                          isNonTemporal, isInvariant, Alignment, AAInfo);
 
       // Load the remaining ExtraWidth bits.
       IncrementSize = RoundWidth / 8;
@@ -1040,8 +1038,8 @@
       Lo = DAG.getExtLoad(ISD::ZEXTLOAD,
                           dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(IncrementSize),
-                          ExtraVT, isVolatile, isNonTemporal,
-                          MinAlign(Alignment, IncrementSize), TBAAInfo);
+                          ExtraVT, isVolatile, isNonTemporal, isInvariant,
+                          MinAlign(Alignment, IncrementSize), AAInfo);
 
       // Build a factor node to remember that this load is independent of
       // the other one.
@@ -1080,12 +1078,13 @@
         // it, expand it.
         EVT MemVT = LD->getMemoryVT();
         unsigned AS = LD->getAddressSpace();
-        if (!TLI.allowsUnalignedMemoryAccesses(MemVT, AS)) {
+        unsigned Align = LD->getAlignment();
+        if (!TLI.allowsMisalignedMemoryAccesses(MemVT, AS, Align)) {
           Type *Ty =
             LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
           unsigned ABIAlignment =
             TLI.getDataLayout()->getABITypeAlignment(Ty);
-          if (LD->getAlignment() < ABIAlignment){
+          if (Align < ABIAlignment){
             ExpandUnalignedLoad(cast<LoadSDNode>(Node),
                                 DAG, TLI, Value, Chain);
           }
@@ -1148,6 +1147,10 @@
     assert(Value.getNode() != Node && "Load must be completely replaced");
     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value);
     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
+    if (UpdatedNodes) {
+      UpdatedNodes->insert(Value.getNode());
+      UpdatedNodes->insert(Chain.getNode());
+    }
     ReplacedNode(Node);
   }
 }
@@ -1155,6 +1158,8 @@
 /// LegalizeOp - Return a legal replacement for the given operation, with
 /// all legal operands.
 void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
+  DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));
+
   if (Node->getOpcode() == ISD::TargetConstant) // Allow illegal target nodes.
     return;
 
@@ -1186,6 +1191,7 @@
     if (Action != TargetLowering::Promote)
       Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
     break;
+  case ISD::FP_TO_FP16:
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
   case ISD::EXTRACT_VECTOR_ELT:
@@ -1334,10 +1340,7 @@
     }
 
     if (NewNode != Node) {
-      DAG.ReplaceAllUsesWith(Node, NewNode);
-      for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
-        DAG.TransferDbgValues(SDValue(Node, i), SDValue(NewNode, i));
-      ReplacedNode(Node);
+      ReplaceNode(Node, NewNode);
       Node = NewNode;
     }
     switch (Action) {
@@ -1348,19 +1351,19 @@
       // a complete mess.
       SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
       if (Res.getNode()) {
+        if (!(Res.getNode() != Node || Res.getResNo() != 0))
+          return;
+
+        if (Node->getNumValues() == 1) {
+          // We can just directly replace this node with the lowered value.
+          ReplaceNode(SDValue(Node, 0), Res);
+          return;
+        }
+
         SmallVector<SDValue, 8> ResultVals;
-        for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) {
-          if (e == 1)
-            ResultVals.push_back(Res);
-          else
-            ResultVals.push_back(Res.getValue(i));
-        }
-        if (Res.getNode() != Node || Res.getResNo() != 0) {
-          DAG.ReplaceAllUsesWith(Node, ResultVals.data());
-          for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
-            DAG.TransferDbgValues(SDValue(Node, i), ResultVals[i]);
-          ReplacedNode(Node);
-        }
+        for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+          ResultVals.push_back(Res.getValue(i));
+        ReplaceNode(Node, ResultVals.data());
         return;
       }
     }
@@ -1448,7 +1451,7 @@
   return DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
                         MachinePointerInfo(),
                         Vec.getValueType().getVectorElementType(),
-                        false, false, 0);
+                        false, false, false, 0);
 }
 
 SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
@@ -1483,7 +1486,7 @@
                                     StackPtr);
 
   // Store the subvector.
-  Ch = DAG.getStore(DAG.getEntryNode(), dl, Part, SubStackPtr,
+  Ch = DAG.getStore(Ch, dl, Part, SubStackPtr,
                     MachinePointerInfo(), false, false, 0);
 
   // Finally, load the updated vector.
@@ -1623,7 +1626,8 @@
   SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   Chain = SP.getValue(1);
   unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
-  unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+  unsigned StackAlign =
+      DAG.getSubtarget().getFrameLowering()->getStackAlignment();
   Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size);       // Value
   if (Align > StackAlign)
     Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
@@ -1797,7 +1801,7 @@
 
   assert(SlotSize < DestSize && "Unknown extension!");
   return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr,
-                        PtrInfo, SlotVT, false, false, DestAlign);
+                        PtrInfo, SlotVT, false, false, false, DestAlign);
 }
 
 SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
@@ -1877,7 +1881,8 @@
                                          ShuffleVec.data());
         else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
           return false;
-        NewIntermedVals.push_back(std::make_pair(Shuffle, FinalIndices));
+        NewIntermedVals.push_back(
+            std::make_pair(Shuffle, std::move(FinalIndices)));
       }
 
       // If we had an odd number of defined values, then append the last
@@ -2580,7 +2585,7 @@
     SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT,
                                   DAG.getEntryNode(), CPIdx,
                                   MachinePointerInfo::getConstantPool(),
-                                  MVT::f32, false, false, Alignment);
+                                  MVT::f32, false, false, false, Alignment);
     HandleSDNode Handle(Load);
     LegalizeOp(Load.getNode());
     FudgeInReg = Handle.getValue();
@@ -2782,7 +2787,7 @@
     // x = x | (x >>32); // for 64-bit input
     // return popcount(~x);
     //
-    // but see also: http://www.hackersdelight.org/HDcode/nlz.cc
+    // Ref: "Hacker's Delight" by Henry Warren
     EVT VT = Op.getValueType();
     EVT ShVT = TLI.getShiftAmountTy(VT);
     unsigned len = VT.getSizeInBits();
@@ -2801,7 +2806,7 @@
     // for now, we use: { return popcount(~x & (x - 1)); }
     // unless the target has ctlz but not ctpop, in which case we use:
     // { return 32 - nlz(~x & (x-1)); }
-    // see also http://www.hackersdelight.org/HDcode/ntz.cc
+    // Ref: "Hacker's Delight" by Henry Warren
     EVT VT = Op.getValueType();
     SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT,
                                DAG.getNOT(dl, Op, VT),
@@ -3153,65 +3158,10 @@
                                 Node->getOperand(0), Node->getValueType(0), dl);
     Results.push_back(Tmp1);
     break;
-  case ISD::FP_TO_SINT: {
-    EVT VT = Node->getOperand(0).getValueType();
-    EVT NVT = Node->getValueType(0);
-
-    // FIXME: Only f32 to i64 conversions are supported.
-    if (VT != MVT::f32 || NVT != MVT::i64)
-      break;
-
-    // Expand f32 -> i64 conversion
-    // This algorithm comes from compiler-rt's implementation of fixsfdi:
-    // https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/fixsfdi.c
-    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(),
-                                  VT.getSizeInBits());
-    SDValue ExponentMask = DAG.getConstant(0x7F800000, IntVT);
-    SDValue ExponentLoBit = DAG.getConstant(23, IntVT);
-    SDValue Bias = DAG.getConstant(127, IntVT);
-    SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()),
-                                       IntVT);
-    SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, IntVT);
-    SDValue MantissaMask = DAG.getConstant(0x007FFFFF, IntVT);
-
-    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0));
-
-    SDValue ExponentBits = DAG.getNode(ISD::SRL, dl, IntVT,
-        DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
-        DAG.getZExtOrTrunc(ExponentLoBit, dl, TLI.getShiftAmountTy(IntVT)));
-    SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
-
-    SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
-        DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
-        DAG.getZExtOrTrunc(SignLowBit, dl, TLI.getShiftAmountTy(IntVT)));
-    Sign = DAG.getSExtOrTrunc(Sign, dl, NVT);
-
-    SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
-        DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
-        DAG.getConstant(0x00800000, IntVT));
-
-    R = DAG.getZExtOrTrunc(R, dl, NVT);
-
-
-    R = DAG.getSelectCC(dl, Exponent, ExponentLoBit,
-       DAG.getNode(ISD::SHL, dl, NVT, R,
-                   DAG.getZExtOrTrunc(
-                      DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
-                      dl, TLI.getShiftAmountTy(IntVT))),
-       DAG.getNode(ISD::SRL, dl, NVT, R,
-                   DAG.getZExtOrTrunc(
-                      DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
-                      dl, TLI.getShiftAmountTy(IntVT))),
-       ISD::SETGT);
-
-    SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT,
-        DAG.getNode(ISD::XOR, dl, NVT, R, Sign),
-        Sign);
-
-    Results.push_back(DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, IntVT),
-        DAG.getConstant(0, NVT), Ret, ISD::SETLT));
+  case ISD::FP_TO_SINT:
+    if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
     break;
-  }
   case ISD::FP_TO_UINT: {
     SDValue True, False;
     EVT VT =  Node->getOperand(0).getValueType();
@@ -3450,6 +3400,16 @@
     Results.push_back(Tmp1);
     break;
   }
+  case ISD::FMINNUM:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64,
+                                      RTLIB::FMIN_F80, RTLIB::FMIN_F128,
+                                      RTLIB::FMIN_PPCF128));
+    break;
+  case ISD::FMAXNUM:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64,
+                                      RTLIB::FMAX_F80, RTLIB::FMAX_F128,
+                                      RTLIB::FMAX_PPCF128));
+    break;
   case ISD::FSQRT:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
                                       RTLIB::SQRT_F80, RTLIB::SQRT_F128,
@@ -3568,12 +3528,38 @@
                                       RTLIB::FMA_F80, RTLIB::FMA_F128,
                                       RTLIB::FMA_PPCF128));
     break;
-  case ISD::FP16_TO_FP32:
-    Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
+  case ISD::FADD:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64,
+                                      RTLIB::ADD_F80, RTLIB::ADD_F128,
+                                      RTLIB::ADD_PPCF128));
     break;
-  case ISD::FP32_TO_FP16:
-    Results.push_back(ExpandLibCall(RTLIB::FPROUND_F32_F16, Node, false));
+  case ISD::FMUL:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64,
+                                      RTLIB::MUL_F80, RTLIB::MUL_F128,
+                                      RTLIB::MUL_PPCF128));
     break;
+  case ISD::FP16_TO_FP: {
+    if (Node->getValueType(0) == MVT::f32) {
+      Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
+      break;
+    }
+
+    // We can extend to types bigger than f32 in two steps without changing the
+    // result. Since "f16 -> f32" is much more commonly available, give CodeGen
+    // the option of emitting that before resorting to a libcall.
+    SDValue Res =
+        DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0));
+    Results.push_back(
+        DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res));
+    break;
+  }
+  case ISD::FP_TO_FP16: {
+    RTLIB::Libcall LC =
+        RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16);
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16");
+    Results.push_back(ExpandLibCall(LC, Node, false));
+    break;
+  }
   case ISD::ConstantFP: {
     ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
     // Check to see if this FP immediate is already legal.
@@ -3584,12 +3570,16 @@
   }
   case ISD::FSUB: {
     EVT VT = Node->getValueType(0);
-    assert(TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
-           TLI.isOperationLegalOrCustom(ISD::FNEG, VT) &&
-           "Don't know how to expand this FP subtraction!");
-    Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1));
-    Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1);
-    Results.push_back(Tmp1);
+    if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
+        TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) {
+      Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1));
+      Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1);
+      Results.push_back(Tmp1);
+    } else {
+      Results.push_back(ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64,
+                                        RTLIB::SUB_F80, RTLIB::SUB_F128,
+                                        RTLIB::SUB_PPCF128));
+    }
     break;
   }
   case ISD::SUB: {
@@ -3844,9 +3834,11 @@
       TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
                             DAG.getIntPtrConstant(1));
       // Ret is a node with an illegal type. Because such things are not
-      // generally permitted during this phase of legalization, delete the
-      // node. The above EXTRACT_ELEMENT nodes should have been folded.
-      DAG.DeleteNode(Ret.getNode());
+      // generally permitted during this phase of legalization, make sure the
+      // node has no more uses. The above EXTRACT_ELEMENT nodes should have been
+      // folded.
+      assert(Ret->use_empty() &&
+             "Unexpected uses of illegally type from expanded lib call.");
     }
 
     if (isSigned) {
@@ -3907,7 +3899,7 @@
     EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
     SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr,
                                 MachinePointerInfo::getJumpTable(), MemVT,
-                                false, false, 0);
+                                false, false, false, 0);
     Addr = LD;
     if (TM.getRelocationModel() == Reloc::PIC_) {
       // For PIC, the sequence is:
@@ -4217,6 +4209,10 @@
     // use the new one.
     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp2);
     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
+    if (UpdatedNodes) {
+      UpdatedNodes->insert(Tmp2.getNode());
+      UpdatedNodes->insert(Chain.getNode());
+    }
     ReplacedNode(Node);
     break;
   }
@@ -4293,6 +4289,9 @@
                                   Tmp1, Tmp2, Node->getOperand(2)));
     break;
   }
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
   case ISD::FDIV:
   case ISD::FREM:
   case ISD::FPOW: {
@@ -4323,7 +4322,55 @@
 // SelectionDAG::Legalize - This is the entry point for the file.
 //
 void SelectionDAG::Legalize() {
-  /// run - This is the main entry point to this class.
-  ///
-  SelectionDAGLegalize(*this).LegalizeDAG();
+  AssignTopologicalOrder();
+
+  SmallPtrSet<SDNode *, 16> LegalizedNodes;
+  SelectionDAGLegalize Legalizer(*this, LegalizedNodes);
+
+  // Visit all the nodes. We start in topological order, so that we see
+  // nodes with their original operands intact. Legalization can produce
+  // new nodes which may themselves need to be legalized. Iterate until all
+  // nodes have been legalized.
+  for (;;) {
+    bool AnyLegalized = false;
+    for (auto NI = allnodes_end(); NI != allnodes_begin();) {
+      --NI;
+
+      SDNode *N = NI;
+      if (N->use_empty() && N != getRoot().getNode()) {
+        ++NI;
+        DeleteNode(N);
+        continue;
+      }
+
+      if (LegalizedNodes.insert(N).second) {
+        AnyLegalized = true;
+        Legalizer.LegalizeOp(N);
+
+        if (N->use_empty() && N != getRoot().getNode()) {
+          ++NI;
+          DeleteNode(N);
+        }
+      }
+    }
+    if (!AnyLegalized)
+      break;
+
+  }
+
+  // Remove dead nodes now.
+  RemoveDeadNodes();
+}
+
+bool SelectionDAG::LegalizeOp(SDNode *N,
+                              SmallSetVector<SDNode *, 16> &UpdatedNodes) {
+  SmallPtrSet<SDNode *, 16> LegalizedNodes;
+  SelectionDAGLegalize Legalizer(*this, LegalizedNodes, &UpdatedNodes);
+
+  // Directly insert the node in question, and legalize it. This will recurse
+  // as needed through operands.
+  LegalizedNodes.insert(N);
+  Legalizer.LegalizeOp(N);
+
+  return LegalizedNodes.count(N);
 }

diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 6b8fec6..4591e79 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp

@@ -68,6 +68,8 @@
     case ISD::EXTRACT_VECTOR_ELT:
       R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break;
     case ISD::FABS:        R = SoftenFloatRes_FABS(N); break;
+    case ISD::FMINNUM:     R = SoftenFloatRes_FMINNUM(N); break;
+    case ISD::FMAXNUM:     R = SoftenFloatRes_FMAXNUM(N); break;
     case ISD::FADD:        R = SoftenFloatRes_FADD(N); break;
     case ISD::FCEIL:       R = SoftenFloatRes_FCEIL(N); break;
     case ISD::FCOPYSIGN:   R = SoftenFloatRes_FCOPYSIGN(N); break;
@@ -85,7 +87,7 @@
     case ISD::FNEG:        R = SoftenFloatRes_FNEG(N); break;
     case ISD::FP_EXTEND:   R = SoftenFloatRes_FP_EXTEND(N); break;
     case ISD::FP_ROUND:    R = SoftenFloatRes_FP_ROUND(N); break;
-    case ISD::FP16_TO_FP32:R = SoftenFloatRes_FP16_TO_FP32(N); break;
+    case ISD::FP16_TO_FP:  R = SoftenFloatRes_FP16_TO_FP(N); break;
     case ISD::FPOW:        R = SoftenFloatRes_FPOW(N); break;
     case ISD::FPOWI:       R = SoftenFloatRes_FPOWI(N); break;
     case ISD::FREM:        R = SoftenFloatRes_FREM(N); break;
@@ -153,6 +155,32 @@
   return DAG.getNode(ISD::AND, SDLoc(N), NVT, Op, Mask);
 }
 
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::FMIN_F32,
+                                           RTLIB::FMIN_F64,
+                                           RTLIB::FMIN_F80,
+                                           RTLIB::FMIN_F128,
+                                           RTLIB::FMIN_PPCF128),
+                         NVT, Ops, 2, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::FMAX_F32,
+                                           RTLIB::FMAX_F64,
+                                           RTLIB::FMAX_F80,
+                                           RTLIB::FMAX_F128,
+                                           RTLIB::FMAX_PPCF128),
+                         NVT, Ops, 2, false, SDLoc(N)).first;
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
@@ -373,23 +401,48 @@
 SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue Op = N->getOperand(0);
+
+  // There's only a libcall for f16 -> f32, so proceed in two stages. Also, it's
+  // entirely possible for both f16 and f32 to be legal, so use the fully
+  // hard-float FP_EXTEND rather than FP16_TO_FP.
+  if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) {
+    Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op);
+    if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat)
+      SoftenFloatResult(Op.getNode(), 0);
+  }
+
   RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
+  if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat)
+    Op = GetSoftenedFloat(Op);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
   return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
 // nodes?
-SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP32(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) {
+  EVT MidVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32);
   SDValue Op = N->getOperand(0);
-  return TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, NVT, &Op, 1, false,
-                         SDLoc(N)).first;
+  SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, &Op, 1,
+                                  false, SDLoc(N)).first;
+  if (N->getValueType(0) == MVT::f32)
+    return Res32;
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f32, N->getValueType(0));
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
+  return TLI.makeLibCall(DAG, LC, NVT, &Res32, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue Op = N->getOperand(0);
+  if (N->getValueType(0) == MVT::f16) {
+    // Semi-soften first, to FP_TO_FP16, so that targets which support f16 as a
+    // storage-only type get a chance to select things.
+    return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, Op);
+  }
+
   RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
   return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
@@ -498,6 +551,9 @@
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  if (N->getValueType(0) == MVT::f16)
+    return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, N->getOperand(0));
+
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
   return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
                                            RTLIB::TRUNC_F32,
@@ -520,7 +576,7 @@
                        NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(),
                        L->getPointerInfo(), NVT, L->isVolatile(),
                        L->isNonTemporal(), false, L->getAlignment(),
-                       L->getTBAAInfo());
+                       L->getAAInfo());
     // Legalized the chain result - switch anything that used the old chain to
     // use the new one.
     ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -533,7 +589,7 @@
                      L->getBasePtr(), L->getOffset(), L->getPointerInfo(),
                      L->getMemoryVT(), L->isVolatile(),
                      L->isNonTemporal(), false, L->getAlignment(),
-                     L->getTBAAInfo());
+                     L->getAAInfo());
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -625,10 +681,11 @@
 
   case ISD::BITCAST:     Res = SoftenFloatOp_BITCAST(N); break;
   case ISD::BR_CC:       Res = SoftenFloatOp_BR_CC(N); break;
+  case ISD::FP_EXTEND:   Res = SoftenFloatOp_FP_EXTEND(N); break;
+  case ISD::FP_TO_FP16:  // Same as FP_ROUND for softening purposes
   case ISD::FP_ROUND:    Res = SoftenFloatOp_FP_ROUND(N); break;
   case ISD::FP_TO_SINT:  Res = SoftenFloatOp_FP_TO_SINT(N); break;
   case ISD::FP_TO_UINT:  Res = SoftenFloatOp_FP_TO_UINT(N); break;
-  case ISD::FP32_TO_FP16:Res = SoftenFloatOp_FP32_TO_FP16(N); break;
   case ISD::SELECT_CC:   Res = SoftenFloatOp_SELECT_CC(N); break;
   case ISD::SETCC:       Res = SoftenFloatOp_SETCC(N); break;
   case ISD::STORE:       Res = SoftenFloatOp_STORE(N, OpNo); break;
@@ -654,11 +711,32 @@
                      GetSoftenedFloat(N->getOperand(0)));
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) {
+  // If we get here, the result must be legal but the source illegal.
   EVT SVT = N->getOperand(0).getValueType();
   EVT RVT = N->getValueType(0);
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
 
-  RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, RVT);
+  if (SVT == MVT::f16)
+    return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), RVT, Op);
+
+  RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND libcall");
+
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
+}
+
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
+  // We actually deal with the partially-softened FP_TO_FP16 node too, which
+  // returns an i16 so doesn't meet the constraints necessary for FP_ROUND.
+  assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16);
+
+  EVT SVT = N->getOperand(0).getValueType();
+  EVT RVT = N->getValueType(0);
+  EVT FloatRVT = N->getOpcode() == ISD::FP_TO_FP16 ? MVT::f16 : RVT;
+
+  RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
 
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
@@ -704,13 +782,6 @@
   return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatOp_FP32_TO_FP16(SDNode *N) {
-  EVT RVT = N->getValueType(0);
-  RTLIB::Libcall LC = RTLIB::FPROUND_F32_F16;
-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
-}
-
 SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
   SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
   ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
@@ -813,6 +884,8 @@
 
   case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
   case ISD::FABS:       ExpandFloatRes_FABS(N, Lo, Hi); break;
+  case ISD::FMINNUM:    ExpandFloatRes_FMINNUM(N, Lo, Hi); break;
+  case ISD::FMAXNUM:    ExpandFloatRes_FMAXNUM(N, Lo, Hi); break;
   case ISD::FADD:       ExpandFloatRes_FADD(N, Lo, Hi); break;
   case ISD::FCEIL:      ExpandFloatRes_FCEIL(N, Lo, Hi); break;
   case ISD::FCOPYSIGN:  ExpandFloatRes_FCOPYSIGN(N, Lo, Hi); break;
@@ -876,6 +949,26 @@
                    ISD::SETEQ);
 }
 
+void DAGTypeLegalizer::ExpandFloatRes_FMINNUM(SDNode *N, SDValue &Lo,
+                                              SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::FMIN_F32, RTLIB::FMIN_F64,
+                                         RTLIB::FMIN_F80, RTLIB::FMIN_F128,
+                                         RTLIB::FMIN_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMAXNUM(SDNode *N, SDValue &Lo,
+                                              SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::FMAX_F32, RTLIB::FMAX_F64,
+                                         RTLIB::FMAX_F80, RTLIB::FMAX_F128,
+                                         RTLIB::FMAX_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),

diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index cffb0a1..b73bb0a 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

@@ -99,7 +99,7 @@
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:  Res = PromoteIntRes_FP_TO_XINT(N); break;
 
-  case ISD::FP32_TO_FP16:Res = PromoteIntRes_FP32_TO_FP16(N); break;
+  case ISD::FP_TO_FP16:  Res = PromoteIntRes_FP_TO_FP16(N); break;
 
   case ISD::AND:
   case ISD::OR:
@@ -225,10 +225,9 @@
       N->getOpcode(), SDLoc(N), N->getMemoryVT(), VTs, N->getChain(),
       N->getBasePtr(), Op2, Op3, N->getMemOperand(), N->getSuccessOrdering(),
       N->getFailureOrdering(), N->getSynchScope());
-  // Legalized the chain result - switch anything that used the old chain to
-  // use the new one.
-  unsigned ChainOp = N->getNumValues() - 1;
-  ReplaceValueWith(SDValue(N, ChainOp), Res.getValue(ChainOp));
+  // Update the uses of N's remaining results with the newly created Res.
+  for (unsigned i = 1, NumResults = N->getNumValues(); i < NumResults; ++i)
+    ReplaceValueWith(SDValue(N, i), Res.getValue(i));
   return Res;
 }
 
@@ -402,7 +401,7 @@
                      DAG.getValueType(N->getValueType(0).getScalarType()));
 }
 
-SDValue DAGTypeLegalizer::PromoteIntRes_FP32_TO_FP16(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDLoc dl(N);
 
@@ -827,7 +826,7 @@
   case ISD::STORE:        Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
                                                    OpNo); break;
   case ISD::TRUNCATE:     Res = PromoteIntOp_TRUNCATE(N); break;
-  case ISD::FP16_TO_FP32:
+  case ISD::FP16_TO_FP:
   case ISD::UINT_TO_FP:   Res = PromoteIntOp_UINT_TO_FP(N); break;
   case ISD::ZERO_EXTEND:  Res = PromoteIntOp_ZERO_EXTEND(N); break;
 
@@ -863,7 +862,26 @@
   switch (CCCode) {
   default: llvm_unreachable("Unknown integer comparison!");
   case ISD::SETEQ:
-  case ISD::SETNE:
+  case ISD::SETNE: {
+    SDValue OpL = GetPromotedInteger(NewLHS);
+    SDValue OpR = GetPromotedInteger(NewRHS);
+
+    // We would prefer to promote the comparison operands with sign extension
+    // if we find that each operand is actually a truncate of an AssertSext.
+    // With this optimization, we can avoid inserting a real truncate
+    // instruction, which would eventually be redundant.
+    if (OpL->getOpcode() == ISD::AssertSext &&
+        cast<VTSDNode>(OpL->getOperand(1))->getVT() == NewLHS.getValueType() &&
+        OpR->getOpcode() == ISD::AssertSext &&
+        cast<VTSDNode>(OpR->getOperand(1))->getVT() == NewRHS.getValueType()) {
+      NewLHS = OpL;
+      NewRHS = OpR;
+    } else {
+      NewLHS = ZExtPromotedInteger(NewLHS);
+      NewRHS = ZExtPromotedInteger(NewRHS);
+    }
+    break;
+  }
   case ISD::SETUGE:
   case ISD::SETUGT:
   case ISD::SETULE:
@@ -947,7 +965,7 @@
   EVT VecVT = N->getValueType(0);
   unsigned NumElts = VecVT.getVectorNumElements();
   assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) &&
-		 "Legal vector of one illegal element?");
+         "Legal vector of one illegal element?");
 
   // Promote the inserted value.  The type does not need to match the
   // vector element type.  Check that any extra bits introduced will be
@@ -1861,7 +1879,7 @@
   bool isVolatile = N->isVolatile();
   bool isNonTemporal = N->isNonTemporal();
   bool isInvariant = N->isInvariant();
-  const MDNode *TBAAInfo = N->getTBAAInfo();
+  AAMDNodes AAInfo = N->getAAInfo();
   SDLoc dl(N);
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
@@ -1870,7 +1888,8 @@
     EVT MemVT = N->getMemoryVT();
 
     Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
-                        MemVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                        MemVT, isVolatile, isNonTemporal, isInvariant,
+                        Alignment, AAInfo);
 
     // Remember the chain.
     Ch = Lo.getValue(1);
@@ -1893,7 +1912,7 @@
     // Little-endian - low bits are at low addresses.
     Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
                      isVolatile, isNonTemporal, isInvariant, Alignment,
-                     TBAAInfo);
+                     AAInfo);
 
     unsigned ExcessBits =
       N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -1905,8 +1924,8 @@
                       DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
                         N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
-                        isVolatile, isNonTemporal,
-                        MinAlign(Alignment, IncrementSize), TBAAInfo);
+                        isVolatile, isNonTemporal, isInvariant,
+                        MinAlign(Alignment, IncrementSize), AAInfo);
 
     // Build a factor node to remember that this load is independent of the
     // other one.
@@ -1924,7 +1943,8 @@
     Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
                         EVT::getIntegerVT(*DAG.getContext(),
                                           MemVT.getSizeInBits() - ExcessBits),
-                        isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                        isVolatile, isNonTemporal, isInvariant, Alignment,
+                        AAInfo);
 
     // Increment the pointer to the other half.
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
@@ -1933,8 +1953,8 @@
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr,
                         N->getPointerInfo().getWithOffset(IncrementSize),
                         EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
-                        isVolatile, isNonTemporal,
-                        MinAlign(Alignment, IncrementSize), TBAAInfo);
+                        isVolatile, isNonTemporal, isInvariant,
+                        MinAlign(Alignment, IncrementSize), AAInfo);
 
     // Build a factor node to remember that this load is independent of the
     // other one.
@@ -2711,7 +2731,7 @@
   unsigned Alignment = N->getAlignment();
   bool isVolatile = N->isVolatile();
   bool isNonTemporal = N->isNonTemporal();
-  const MDNode *TBAAInfo = N->getTBAAInfo();
+  AAMDNodes AAInfo = N->getAAInfo();
   SDLoc dl(N);
   SDValue Lo, Hi;
 
@@ -2721,7 +2741,7 @@
     GetExpandedInteger(N->getValue(), Lo, Hi);
     return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
                              N->getMemoryVT(), isVolatile, isNonTemporal,
-                             Alignment, TBAAInfo);
+                             Alignment, AAInfo);
   }
 
   if (TLI.isLittleEndian()) {
@@ -2729,7 +2749,7 @@
     GetExpandedInteger(N->getValue(), Lo, Hi);
 
     Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
-                      isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                      isVolatile, isNonTemporal, Alignment, AAInfo);
 
     unsigned ExcessBits =
       N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -2742,7 +2762,7 @@
     Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr,
                            N->getPointerInfo().getWithOffset(IncrementSize),
                            NEVT, isVolatile, isNonTemporal,
-                           MinAlign(Alignment, IncrementSize), TBAAInfo);
+                           MinAlign(Alignment, IncrementSize), AAInfo);
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
   }
 
@@ -2770,7 +2790,7 @@
 
   // Store both the high bits and maybe some of the low bits.
   Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(),
-                         HiVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                         HiVT, isVolatile, isNonTemporal, Alignment, AAInfo);
 
   // Increment the pointer to the other half.
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
@@ -2780,7 +2800,7 @@
                          N->getPointerInfo().getWithOffset(IncrementSize),
                          EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
                          isVolatile, isNonTemporal,
-                         MinAlign(Alignment, IncrementSize), TBAAInfo);
+                         MinAlign(Alignment, IncrementSize), AAInfo);
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
 }
 
@@ -2855,7 +2875,7 @@
                                    FudgePtr,
                                    MachinePointerInfo::getConstantPool(),
                                    MVT::f32,
-                                   false, false, Alignment);
+                                   false, false, false, Alignment);
     return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge);
   }
 

diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index d0ca6f8..30f412b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SELECTIONDAG_LEGALIZETYPES_H
-#define SELECTIONDAG_LEGALIZETYPES_H
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
@@ -122,8 +122,8 @@
   explicit DAGTypeLegalizer(SelectionDAG &dag)
     : TLI(dag.getTargetLoweringInfo()), DAG(dag),
     ValueTypeActions(TLI.getValueTypeActions()) {
-    assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE &&
-           "Too many value types for ValueTypeActions to hold!");
+    static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE,
+                  "Too many value types for ValueTypeActions to hold!");
   }
 
   /// run - This is the main entry point for the type legalizer.  This does a
@@ -237,7 +237,7 @@
   SDValue PromoteIntRes_CTTZ(SDNode *N);
   SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
-  SDValue PromoteIntRes_FP32_TO_FP16(SDNode *N);
+  SDValue PromoteIntRes_FP_TO_FP16(SDNode *N);
   SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
   SDValue PromoteIntRes_LOAD(LoadSDNode *N);
   SDValue PromoteIntRes_Overflow(SDNode *N);
@@ -387,6 +387,8 @@
   SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N);
   SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue SoftenFloatRes_FABS(SDNode *N);
+  SDValue SoftenFloatRes_FMINNUM(SDNode *N);
+  SDValue SoftenFloatRes_FMAXNUM(SDNode *N);
   SDValue SoftenFloatRes_FADD(SDNode *N);
   SDValue SoftenFloatRes_FCEIL(SDNode *N);
   SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N);
@@ -403,7 +405,7 @@
   SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
   SDValue SoftenFloatRes_FNEG(SDNode *N);
   SDValue SoftenFloatRes_FP_EXTEND(SDNode *N);
-  SDValue SoftenFloatRes_FP16_TO_FP32(SDNode *N);
+  SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N);
   SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
   SDValue SoftenFloatRes_FPOW(SDNode *N);
   SDValue SoftenFloatRes_FPOWI(SDNode *N);
@@ -425,10 +427,10 @@
   bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
   SDValue SoftenFloatOp_BITCAST(SDNode *N);
   SDValue SoftenFloatOp_BR_CC(SDNode *N);
+  SDValue SoftenFloatOp_FP_EXTEND(SDNode *N);
   SDValue SoftenFloatOp_FP_ROUND(SDNode *N);
   SDValue SoftenFloatOp_FP_TO_SINT(SDNode *N);
   SDValue SoftenFloatOp_FP_TO_UINT(SDNode *N);
-  SDValue SoftenFloatOp_FP32_TO_FP16(SDNode *N);
   SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
   SDValue SoftenFloatOp_SETCC(SDNode *N);
   SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
@@ -450,6 +452,8 @@
   void ExpandFloatResult(SDNode *N, unsigned ResNo);
   void ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FABS      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FMINNUM   (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FMAXNUM   (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FADD      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FCEIL     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FCOPYSIGN (SDNode *N, SDValue &Lo, SDValue &Hi);

diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 7e2f7b6..38829b6 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp

@@ -256,13 +256,13 @@
   bool isVolatile = LD->isVolatile();
   bool isNonTemporal = LD->isNonTemporal();
   bool isInvariant = LD->isInvariant();
-  const MDNode *TBAAInfo = LD->getTBAAInfo();
+  AAMDNodes AAInfo = LD->getAAInfo();
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
 
   Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
                    isVolatile, isNonTemporal, isInvariant, Alignment,
-                   TBAAInfo);
+                   AAInfo);
 
   // Increment the pointer to the other half.
   unsigned IncrementSize = NVT.getSizeInBits() / 8;
@@ -271,7 +271,7 @@
   Hi = DAG.getLoad(NVT, dl, Chain, Ptr,
                    LD->getPointerInfo().getWithOffset(IncrementSize),
                    isVolatile, isNonTemporal, isInvariant,
-                   MinAlign(Alignment, IncrementSize), TBAAInfo);
+                   MinAlign(Alignment, IncrementSize), AAInfo);
 
   // Build a factor node to remember that this load is independent of the
   // other one.
@@ -470,7 +470,7 @@
   unsigned Alignment = St->getAlignment();
   bool isVolatile = St->isVolatile();
   bool isNonTemporal = St->isNonTemporal();
-  const MDNode *TBAAInfo = St->getTBAAInfo();
+  AAMDNodes AAInfo = St->getAAInfo();
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
   unsigned IncrementSize = NVT.getSizeInBits() / 8;
@@ -482,14 +482,14 @@
     std::swap(Lo, Hi);
 
   Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
-                    isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                    isVolatile, isNonTemporal, Alignment, AAInfo);
 
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
                     DAG.getConstant(IncrementSize, Ptr.getValueType()));
   Hi = DAG.getStore(Chain, dl, Hi, Ptr,
                     St->getPointerInfo().getWithOffset(IncrementSize),
                     isVolatile, isNonTemporal,
-                    MinAlign(Alignment, IncrementSize), TBAAInfo);
+                    MinAlign(Alignment, IncrementSize), AAInfo);
 
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
 }

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 507e7ff..b5af7b7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp

@@ -199,12 +199,29 @@
   if (Op.getOpcode() == ISD::LOAD) {
     LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
     ISD::LoadExtType ExtType = LD->getExtensionType();
-    if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD) {
-      if (TLI.isLoadExtLegal(LD->getExtensionType(), LD->getMemoryVT()))
+    if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD)
+      switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getMemoryVT())) {
+      default: llvm_unreachable("This action is not supported yet!");
+      case TargetLowering::Legal:
         return TranslateLegalizeResults(Op, Result);
-      Changed = true;
-      return LegalizeOp(ExpandLoad(Op));
-    }
+      case TargetLowering::Custom:
+        if (SDValue Lowered = TLI.LowerOperation(Result, DAG)) {
+          Changed = true;
+          if (Lowered->getNumValues() != Op->getNumValues()) {
+            // This expanded to something other than the load. Assume the
+            // lowering code took care of any chain values, and just handle the
+            // returned value.
+            assert(Result.getValue(1).use_empty() &&
+                   "There are still live users of the old chain!");
+            return LegalizeOp(Lowered);
+          } else {
+            return TranslateLegalizeResults(Op, Lowered);
+          }
+        }
+      case TargetLowering::Expand:
+        Changed = true;
+        return LegalizeOp(ExpandLoad(Op));
+      }
   } else if (Op.getOpcode() == ISD::STORE) {
     StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
     EVT StVT = ST->getMemoryVT();
@@ -273,6 +290,8 @@
   case ISD::FP_TO_UINT:
   case ISD::FNEG:
   case ISD::FABS:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
   case ISD::FCOPYSIGN:
   case ISD::FSQRT:
   case ISD::FSIN:
@@ -353,9 +372,11 @@
     return PromoteFP_TO_INT(Op, Op->getOpcode() == ISD::FP_TO_SINT);
   }
 
-  // The rest of the time, vector "promotion" is basically just bitcasting and
-  // doing the operation in a different type.  For example, x86 promotes
-  // ISD::AND on v2i32 to v1i64.
+  // There are currently two cases of vector promotion:
+  // 1) Bitcasting a vector of integers to a different type to a vector of the
+  //    same overall length. For example, x86 promotes ISD::AND on v2i32 to v1i64.
+  // 2) Extending a vector of floats to a vector of the same number of larger
+  //    floats. For example, AArch64 promotes ISD::FADD on v4f16 to v4f32.
   MVT VT = Op.getSimpleValueType();
   assert(Op.getNode()->getNumValues() == 1 &&
          "Can't promote a vector with multiple results!");
@@ -365,14 +386,23 @@
 
   for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
     if (Op.getOperand(j).getValueType().isVector())
-      Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j));
+      if (Op.getOperand(j)
+              .getValueType()
+              .getVectorElementType()
+              .isFloatingPoint())
+        Operands[j] = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op.getOperand(j));
+      else
+        Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j));
     else
       Operands[j] = Op.getOperand(j);
   }
 
   Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands);
-
-  return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+  if (VT.isFloatingPoint() ||
+      (VT.isVector() && VT.getVectorElementType().isFloatingPoint()))
+    return DAG.getNode(ISD::FP_ROUND, dl, VT, Op, DAG.getIntPtrConstant(0));
+  else
+    return DAG.getNode(ISD::BITCAST, dl, VT, Op);
 }
 
 SDValue VectorLegalizer::PromoteINT_TO_FP(SDValue Op) {
@@ -480,7 +510,7 @@
                                  LD->getPointerInfo().getWithOffset(Offset),
                                  LD->isVolatile(), LD->isNonTemporal(),
                                  LD->isInvariant(), LD->getAlignment(),
-                                 LD->getTBAAInfo());
+                                 LD->getAAInfo());
       } else {
         EVT LoadVT = WideVT;
         while (RemainingBytes < LoadBytes) {
@@ -490,8 +520,8 @@
         ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
                                     LD->getPointerInfo().getWithOffset(Offset),
                                     LoadVT, LD->isVolatile(),
-                                    LD->isNonTemporal(), LD->getAlignment(),
-                                    LD->getTBAAInfo());
+                                    LD->isNonTemporal(), LD->isInvariant(),
+                                    LD->getAlignment(), LD->getAAInfo());
       }
 
       RemainingBytes -= LoadBytes;
@@ -561,8 +591,8 @@
                 Op.getNode()->getValueType(0).getScalarType(),
                 Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride),
                 SrcVT.getScalarType(),
-                LD->isVolatile(), LD->isNonTemporal(),
-                LD->getAlignment(), LD->getTBAAInfo());
+                LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(),
+                LD->getAlignment(), LD->getAAInfo());
 
       BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
                          DAG.getConstant(Stride, BasePTR.getValueType()));
@@ -593,7 +623,7 @@
   unsigned Alignment = ST->getAlignment();
   bool isVolatile = ST->isVolatile();
   bool isNonTemporal = ST->isNonTemporal();
-  const MDNode *TBAAInfo = ST->getTBAAInfo();
+  AAMDNodes AAInfo = ST->getAAInfo();
 
   unsigned NumElem = StVT.getVectorNumElements();
   // The type of the data we want to save
@@ -621,7 +651,7 @@
     // This scalar TruncStore may be illegal, but we legalize it later.
     SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR,
                ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT,
-               isVolatile, isNonTemporal, Alignment, TBAAInfo);
+               isVolatile, isNonTemporal, Alignment, AAInfo);
 
     BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
                                DAG.getConstant(Stride, BasePTR.getValueType()));

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 71240fc..27f63d2 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

@@ -106,6 +106,9 @@
   case ISD::FCOPYSIGN:
   case ISD::FDIV:
   case ISD::FMUL:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+
   case ISD::FPOW:
   case ISD::FREM:
   case ISD::FSUB:
@@ -223,7 +226,7 @@
                                N->getMemoryVT().getVectorElementType(),
                                N->isVolatile(), N->isNonTemporal(),
                                N->isInvariant(), N->getOriginalAlignment(),
-                               N->getTBAAInfo());
+                               N->getAAInfo());
 
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
@@ -234,7 +237,23 @@
 SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
   // Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
   EVT DestVT = N->getValueType(0).getVectorElementType();
-  SDValue Op = GetScalarizedVector(N->getOperand(0));
+  SDValue Op = N->getOperand(0);
+  EVT OpVT = Op.getValueType();
+  SDLoc DL(N);
+  // The result needs scalarizing, but it's not a given that the source does.
+  // This is a workaround for targets where it's impossible to scalarize the
+  // result of a conversion, because the source type is legal.
+  // For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32}
+  // are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is
+  // legal and was not scalarized.
+  // See the similar logic in ScalarizeVecRes_VSETCC
+  if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+    Op = GetScalarizedVector(Op);
+  } else {
+    EVT VT = OpVT.getVectorElementType();
+    Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+                      DAG.getConstant(0, TLI.getVectorIdxTy()));
+  }
   return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op);
 }
 
@@ -408,6 +427,10 @@
     case ISD::ZERO_EXTEND:
     case ISD::SIGN_EXTEND:
     case ISD::TRUNCATE:
+    case ISD::FP_TO_SINT:
+    case ISD::FP_TO_UINT:
+    case ISD::SINT_TO_FP:
+    case ISD::UINT_TO_FP:
       Res = ScalarizeVecOp_UnaryOp(N);
       break;
     case ISD::CONCAT_VECTORS:
@@ -451,11 +474,11 @@
                      N->getValueType(0), Elt);
 }
 
-/// ScalarizeVecOp_EXTEND - If the value to extend is a vector that needs
-/// to be scalarized, it must be <1 x ty>.  Extend the element instead.
+/// ScalarizeVecOp_UnaryOp - If the input is a vector that needs to be
+/// scalarized, it must be <1 x ty>.  Do the operation on the element instead.
 SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
   assert(N->getValueType(0).getVectorNumElements() == 1 &&
-         "Unexected vector type!");
+         "Unexpected vector type!");
   SDValue Elt = GetScalarizedVector(N->getOperand(0));
   SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N),
                            N->getValueType(0).getScalarType(), Elt);
@@ -509,12 +532,12 @@
                              N->getBasePtr(), N->getPointerInfo(),
                              N->getMemoryVT().getVectorElementType(),
                              N->isVolatile(), N->isNonTemporal(),
-                             N->getAlignment(), N->getTBAAInfo());
+                             N->getAlignment(), N->getAAInfo());
 
   return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
                       N->getBasePtr(), N->getPointerInfo(),
                       N->isVolatile(), N->isNonTemporal(),
-                      N->getOriginalAlignment(), N->getTBAAInfo());
+                      N->getOriginalAlignment(), N->getAAInfo());
 }
 
 /// ScalarizeVecOp_FP_ROUND - If the value to round is a vector that needs
@@ -627,6 +650,8 @@
   case ISD::FCOPYSIGN:
   case ISD::FSUB:
   case ISD::FMUL:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::FDIV:
@@ -868,6 +893,10 @@
     return;
   }
 
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(0), true))
+    return;
+
   // Spill the vector to the stack.
   EVT VecVT = Vec.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
@@ -923,14 +952,14 @@
   bool isVolatile = LD->isVolatile();
   bool isNonTemporal = LD->isNonTemporal();
   bool isInvariant = LD->isInvariant();
-  const MDNode *TBAAInfo = LD->getTBAAInfo();
+  AAMDNodes AAInfo = LD->getAAInfo();
 
   EVT LoMemVT, HiMemVT;
   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
   Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
                    LD->getPointerInfo(), LoMemVT, isVolatile, isNonTemporal,
-                   isInvariant, Alignment, TBAAInfo);
+                   isInvariant, Alignment, AAInfo);
 
   unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
@@ -938,7 +967,7 @@
   Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
                    LD->getPointerInfo().getWithOffset(IncrementSize),
                    HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment,
-                   TBAAInfo);
+                   AAInfo);
 
   // Build a factor node to remember that this load is independent of the
   // other one.
@@ -1349,6 +1378,10 @@
                                                   Idx.getValueType())), 0);
   }
 
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(0), true))
+    return SDValue();
+
   // Store the vector to the stack.
   EVT EltVT = VecVT.getVectorElementType();
   SDLoc dl(N);
@@ -1359,7 +1392,7 @@
   // Load back the required element.
   StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
   return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
-                        MachinePointerInfo(), EltVT, false, false, 0);
+                        MachinePointerInfo(), EltVT, false, false, false, 0);
 }
 
 SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
@@ -1374,7 +1407,7 @@
   unsigned Alignment = N->getOriginalAlignment();
   bool isVol = N->isVolatile();
   bool isNT = N->isNonTemporal();
-  const MDNode *TBAAInfo = N->getTBAAInfo();
+  AAMDNodes AAInfo = N->getAAInfo();
   SDValue Lo, Hi;
   GetSplitVector(N->getOperand(1), Lo, Hi);
 
@@ -1385,10 +1418,10 @@
 
   if (isTruncating)
     Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
-                           LoMemVT, isVol, isNT, Alignment, TBAAInfo);
+                           LoMemVT, isVol, isNT, Alignment, AAInfo);
   else
     Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
-                      isVol, isNT, Alignment, TBAAInfo);
+                      isVol, isNT, Alignment, AAInfo);
 
   // Increment the pointer to the other half.
   Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -1397,11 +1430,11 @@
   if (isTruncating)
     Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr,
                            N->getPointerInfo().getWithOffset(IncrementSize),
-                           HiMemVT, isVol, isNT, Alignment, TBAAInfo);
+                           HiMemVT, isVol, isNT, Alignment, AAInfo);
   else
     Hi = DAG.getStore(Ch, DL, Hi, Ptr,
                       N->getPointerInfo().getWithOffset(IncrementSize),
-                      isVol, isNT, Alignment, TBAAInfo);
+                      isVol, isNT, Alignment, AAInfo);
 
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
 }
@@ -1575,6 +1608,8 @@
   case ISD::OR:
   case ISD::SUB:
   case ISD::XOR:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
     Res = WidenVecRes_Binary(N);
     break;
 
@@ -2737,7 +2772,7 @@
   bool      isVolatile = LD->isVolatile();
   bool      isNonTemporal = LD->isNonTemporal();
   bool      isInvariant = LD->isInvariant();
-  const MDNode *TBAAInfo = LD->getTBAAInfo();
+  AAMDNodes AAInfo = LD->getAAInfo();
 
   int LdWidth = LdVT.getSizeInBits();
   int WidthDiff = WidenWidth - LdWidth;          // Difference
@@ -2748,7 +2783,7 @@
   int NewVTWidth = NewVT.getSizeInBits();
   SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
                              isVolatile, isNonTemporal, isInvariant, Align,
-                             TBAAInfo);
+                             AAInfo);
   LdChain.push_back(LdOp.getValue(1));
 
   // Check if we can load the element with one instruction
@@ -2793,7 +2828,7 @@
       L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
                       LD->getPointerInfo().getWithOffset(Offset), isVolatile,
                       isNonTemporal, isInvariant, MinAlign(Align, Increment),
-                      TBAAInfo);
+                      AAInfo);
       LdChain.push_back(L.getValue(1));
       if (L->getValueType(0).isVector()) {
         SmallVector<SDValue, 16> Loads;
@@ -2809,7 +2844,7 @@
       L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
                       LD->getPointerInfo().getWithOffset(Offset), isVolatile,
                       isNonTemporal, isInvariant, MinAlign(Align, Increment),
-                      TBAAInfo);
+                      AAInfo);
       LdChain.push_back(L.getValue(1));
     }
 
@@ -2889,7 +2924,8 @@
   unsigned  Align    = LD->getAlignment();
   bool      isVolatile = LD->isVolatile();
   bool      isNonTemporal = LD->isNonTemporal();
-  const MDNode *TBAAInfo = LD->getTBAAInfo();
+  bool      isInvariant = LD->isInvariant();
+  AAMDNodes AAInfo = LD->getAAInfo();
 
   EVT EltVT = WidenVT.getVectorElementType();
   EVT LdEltVT = LdVT.getVectorElementType();
@@ -2901,7 +2937,8 @@
   unsigned Increment = LdEltVT.getSizeInBits() / 8;
   Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr,
                           LD->getPointerInfo(),
-                          LdEltVT, isVolatile, isNonTemporal, Align, TBAAInfo);
+                          LdEltVT, isVolatile, isNonTemporal, isInvariant,
+                          Align, AAInfo);
   LdChain.push_back(Ops[0].getValue(1));
   unsigned i = 0, Offset = Increment;
   for (i=1; i < NumElts; ++i, Offset += Increment) {
@@ -2911,7 +2948,8 @@
                                                      BasePtr.getValueType()));
     Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
                             LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
-                            isVolatile, isNonTemporal, Align, TBAAInfo);
+                            isVolatile, isNonTemporal, isInvariant, Align,
+                            AAInfo);
     LdChain.push_back(Ops[i].getValue(1));
   }
 
@@ -2934,7 +2972,7 @@
   unsigned Align = ST->getAlignment();
   bool     isVolatile = ST->isVolatile();
   bool     isNonTemporal = ST->isNonTemporal();
-  const MDNode *TBAAInfo = ST->getTBAAInfo();
+  AAMDNodes AAInfo = ST->getAAInfo();
   SDValue  ValOp = GetWidenedVector(ST->getValue());
   SDLoc dl(ST);
 
@@ -2961,7 +2999,7 @@
         StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
                                     ST->getPointerInfo().getWithOffset(Offset),
                                        isVolatile, isNonTemporal,
-                                       MinAlign(Align, Offset), TBAAInfo));
+                                       MinAlign(Align, Offset), AAInfo));
         StWidth -= NewVTWidth;
         Offset += Increment;
         Idx += NumVTElts;
@@ -2981,7 +3019,7 @@
         StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
                                     ST->getPointerInfo().getWithOffset(Offset),
                                        isVolatile, isNonTemporal,
-                                       MinAlign(Align, Offset), TBAAInfo));
+                                       MinAlign(Align, Offset), AAInfo));
         StWidth -= NewVTWidth;
         Offset += Increment;
         BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
@@ -3003,7 +3041,7 @@
   unsigned Align = ST->getAlignment();
   bool     isVolatile = ST->isVolatile();
   bool     isNonTemporal = ST->isNonTemporal();
-  const MDNode *TBAAInfo = ST->getTBAAInfo();
+  AAMDNodes AAInfo = ST->getAAInfo();
   SDValue  ValOp = GetWidenedVector(ST->getValue());
   SDLoc dl(ST);
 
@@ -3027,7 +3065,7 @@
   StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
                                       ST->getPointerInfo(), StEltVT,
                                       isVolatile, isNonTemporal, Align,
-                                      TBAAInfo));
+                                      AAInfo));
   unsigned Offset = Increment;
   for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
     SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
@@ -3038,7 +3076,7 @@
     StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr,
                                       ST->getPointerInfo().getWithOffset(Offset),
                                         StEltVT, isVolatile, isNonTemporal,
-                                        MinAlign(Align, Offset), TBAAInfo));
+                                        MinAlign(Align, Offset), AAInfo));
   }
 }
 

diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index 624003f..db38b76 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp

@@ -27,6 +27,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -40,32 +41,29 @@
   "dfa-sched-reg-pressure-threshold", cl::Hidden, cl::ZeroOrMore, cl::init(5),
   cl::desc("Track reg pressure and switch priority to in-depth"));
 
+ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS)
+    : Picker(this), InstrItins(IS->MF->getSubtarget().getInstrItineraryData()) {
+  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+  TRI = STI.getRegisterInfo();
+  TLI = IS->TLI;
+  TII = STI.getInstrInfo();
+  ResourcesModel = TII->CreateTargetScheduleState(STI);
+  // This hard requirement could be relaxed, but for now
+  // do not let it proceed.
+  assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
 
-ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) :
-  Picker(this),
- InstrItins(IS->getTargetLowering()->getTargetMachine().getInstrItineraryData())
-{
-   TII = IS->getTargetLowering()->getTargetMachine().getInstrInfo();
-   TRI = IS->getTargetLowering()->getTargetMachine().getRegisterInfo();
-   TLI = IS->getTargetLowering();
+  unsigned NumRC = TRI->getNumRegClasses();
+  RegLimit.resize(NumRC);
+  RegPressure.resize(NumRC);
+  std::fill(RegLimit.begin(), RegLimit.end(), 0);
+  std::fill(RegPressure.begin(), RegPressure.end(), 0);
+  for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+                                             E = TRI->regclass_end();
+       I != E; ++I)
+    RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF);
 
-   const TargetMachine &tm = (*IS->MF).getTarget();
-   ResourcesModel = tm.getInstrInfo()->CreateTargetScheduleState(&tm,nullptr);
-   // This hard requirement could be relaxed, but for now
-   // do not let it procede.
-   assert (ResourcesModel && "Unimplemented CreateTargetScheduleState.");
-
-   unsigned NumRC = TRI->getNumRegClasses();
-   RegLimit.resize(NumRC);
-   RegPressure.resize(NumRC);
-   std::fill(RegLimit.begin(), RegLimit.end(), 0);
-   std::fill(RegPressure.begin(), RegPressure.end(), 0);
-   for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
-        E = TRI->regclass_end(); I != E; ++I)
-     RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF);
-
-   ParallelLiveRanges = 0;
-   HorizontalVerticalBalance = 0;
+  ParallelLiveRanges = 0;
+  HorizontalVerticalBalance = 0;
 }
 
 unsigned
@@ -319,7 +317,7 @@
 
   // If packet is now full, reset the state so in the next cycle
   // we start fresh.
-  if (Packet.size() >= InstrItins->SchedModel->IssueWidth) {
+  if (Packet.size() >= InstrItins->SchedModel.IssueWidth) {
     ResourcesModel->clearResources();
     Packet.clear();
   }

diff --git a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
index ee54292..bce69d7 100644
--- a/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
+++ b/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_SDNODEDBGVALUE_H
-#define LLVM_CODEGEN_SDNODEDBGVALUE_H
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/DebugLoc.h"
@@ -44,7 +44,8 @@
     const Value *Const;     // valid for constants
     unsigned FrameIx;       // valid for stack objects
   } u;
-  MDNode *mdPtr;
+  MDNode *Var;
+  MDNode *Expr;
   bool IsIndirect;
   uint64_t Offset;
   DebugLoc DL;
@@ -52,69 +53,72 @@
   bool Invalid;
 public:
   // Constructor for non-constants.
-  SDDbgValue(MDNode *mdP, SDNode *N, unsigned R,
-	     bool indir, uint64_t off, DebugLoc dl,
-             unsigned O) : mdPtr(mdP), IsIndirect(indir),
-			   Offset(off), DL(dl), Order(O),
-                           Invalid(false) {
+  SDDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, unsigned R, bool indir,
+             uint64_t off, DebugLoc dl, unsigned O)
+      : Var(Var), Expr(Expr), IsIndirect(indir), Offset(off), DL(dl), Order(O),
+        Invalid(false) {
     kind = SDNODE;
     u.s.Node = N;
     u.s.ResNo = R;
   }
 
   // Constructor for constants.
-  SDDbgValue(MDNode *mdP, const Value *C, uint64_t off, DebugLoc dl,
-             unsigned O) : 
-    mdPtr(mdP), IsIndirect(false), Offset(off), DL(dl), Order(O),
-    Invalid(false) {
+  SDDbgValue(MDNode *Var, MDNode *Expr, const Value *C, uint64_t off,
+             DebugLoc dl, unsigned O)
+      : Var(Var), Expr(Expr), IsIndirect(false), Offset(off), DL(dl), Order(O),
+        Invalid(false) {
     kind = CONST;
     u.Const = C;
   }
 
   // Constructor for frame indices.
-  SDDbgValue(MDNode *mdP, unsigned FI, uint64_t off, DebugLoc dl, unsigned O) : 
-    mdPtr(mdP), IsIndirect(false), Offset(off), DL(dl), Order(O),
-    Invalid(false) {
+  SDDbgValue(MDNode *Var, MDNode *Expr, unsigned FI, uint64_t off, DebugLoc dl,
+             unsigned O)
+      : Var(Var), Expr(Expr), IsIndirect(false), Offset(off), DL(dl), Order(O),
+        Invalid(false) {
     kind = FRAMEIX;
     u.FrameIx = FI;
   }
 
   // Returns the kind.
-  DbgValueKind getKind() { return kind; }
+  DbgValueKind getKind() const { return kind; }
 
-  // Returns the MDNode pointer.
-  MDNode *getMDPtr() { return mdPtr; }
+  // Returns the MDNode pointer for the variable.
+  MDNode *getVariable() const { return Var; }
+
+  // Returns the MDNode pointer for the expression.
+  MDNode *getExpression() const { return Expr; }
 
   // Returns the SDNode* for a register ref
-  SDNode *getSDNode() { assert (kind==SDNODE); return u.s.Node; }
+  SDNode *getSDNode() const { assert (kind==SDNODE); return u.s.Node; }
 
   // Returns the ResNo for a register ref
-  unsigned getResNo() { assert (kind==SDNODE); return u.s.ResNo; }
+  unsigned getResNo() const { assert (kind==SDNODE); return u.s.ResNo; }
 
   // Returns the Value* for a constant
-  const Value *getConst() { assert (kind==CONST); return u.Const; }
+  const Value *getConst() const { assert (kind==CONST); return u.Const; }
 
   // Returns the FrameIx for a stack object
-  unsigned getFrameIx() { assert (kind==FRAMEIX); return u.FrameIx; }
+  unsigned getFrameIx() const { assert (kind==FRAMEIX); return u.FrameIx; }
 
   // Returns whether this is an indirect value.
-  bool isIndirect() { return IsIndirect; }
+  bool isIndirect() const { return IsIndirect; }
 
   // Returns the offset.
-  uint64_t getOffset() { return Offset; }
+  uint64_t getOffset() const { return Offset; }
 
   // Returns the DebugLoc.
-  DebugLoc getDebugLoc() { return DL; }
+  DebugLoc getDebugLoc() const { return DL; }
 
   // Returns the SDNodeOrder.  This is the order of the preceding node in the
   // input.
-  unsigned getOrder() { return Order; }
+  unsigned getOrder() const { return Order; }
 
   // setIsInvalidated / isInvalidated - Setter / getter of the "Invalidated"
   // property. A SDDbgValue is invalid if the SDNode that produces the value is
   // deleted.
   void setIsInvalidated() { Invalid = true; }
-  bool isInvalidated() { return Invalid; }
+  bool isInvalidated() const { return Invalid; }
 };
 
 } // end llvm namespace

diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 4d8c2c7..61a3fd7 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp

@@ -221,7 +221,7 @@
   SUnit *NewSU;
   bool TryUnfold = false;
   for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
-    EVT VT = N->getValueType(i);
+    MVT VT = N->getSimpleValueType(i);
     if (VT == MVT::Glue)
       return nullptr;
     else if (VT == MVT::Other)
@@ -229,7 +229,7 @@
   }
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     const SDValue &Op = N->getOperand(i);
-    EVT VT = Op.getNode()->getValueType(Op.getResNo());
+    MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
     if (VT == MVT::Glue)
       return nullptr;
   }
@@ -431,17 +431,23 @@
 /// getPhysicalRegisterVT - Returns the ValueType of the physical register
 /// definition of the specified node.
 /// FIXME: Move to SelectionDAG?
-static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
                                  const TargetInstrInfo *TII) {
-  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
-  assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!");
-  unsigned NumRes = MCID.getNumDefs();
-  for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) {
-    if (Reg == *ImpDef)
-      break;
-    ++NumRes;
+  unsigned NumRes;
+  if (N->getOpcode() == ISD::CopyFromReg) {
+    // CopyFromReg has: "chain, Val, glue" so operand 1 gives the type.
+    NumRes = 1;
+  } else {
+    const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+    assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+    NumRes = MCID.getNumDefs();
+    for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+      if (Reg == *ImpDef)
+        break;
+      ++NumRes;
+    }
   }
-  return N->getValueType(NumRes);
+  return N->getSimpleValueType(NumRes);
 }
 
 /// CheckForLiveRegDef - Return true and update live register vector if the
@@ -454,7 +460,7 @@
   bool Added = false;
   for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
     if (LiveRegDefs[*AI] && LiveRegDefs[*AI] != SU) {
-      if (RegAdded.insert(*AI)) {
+      if (RegAdded.insert(*AI).second) {
         LRegs.push_back(*AI);
         Added = true;
       }
@@ -572,7 +578,7 @@
         assert(LRegs.size() == 1 && "Can't handle this yet!");
         unsigned Reg = LRegs[0];
         SUnit *LRDef = LiveRegDefs[Reg];
-        EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+        MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
         const TargetRegisterClass *RC =
           TRI->getMinimalPhysRegClass(Reg, VT);
         const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);

diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 13cfae7..8b54e656 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp

@@ -30,8 +30,8 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <climits>
 using namespace llvm;
 
@@ -166,12 +166,11 @@
       NeedLatency(needlatency), AvailableQueue(availqueue), CurCycle(0),
       Topo(SUnits, nullptr) {
 
-    const TargetMachine &tm = mf.getTarget();
+    const TargetSubtargetInfo &STI = mf.getSubtarget();
     if (DisableSchedCycles || !NeedLatency)
       HazardRec = new ScheduleHazardRecognizer();
     else
-      HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(
-          tm.getSubtargetImpl(), this);
+      HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this);
   }
 
   ~ScheduleDAGRRList() {
@@ -946,7 +945,7 @@
   SUnit *NewSU;
   bool TryUnfold = false;
   for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
-    EVT VT = N->getValueType(i);
+    MVT VT = N->getSimpleValueType(i);
     if (VT == MVT::Glue)
       return nullptr;
     else if (VT == MVT::Other)
@@ -954,7 +953,7 @@
   }
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     const SDValue &Op = N->getOperand(i);
-    EVT VT = Op.getNode()->getValueType(Op.getResNo());
+    MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
     if (VT == MVT::Glue)
       return nullptr;
   }
@@ -1189,17 +1188,23 @@
 /// getPhysicalRegisterVT - Returns the ValueType of the physical register
 /// definition of the specified node.
 /// FIXME: Move to SelectionDAG?
-static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
                                  const TargetInstrInfo *TII) {
-  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
-  assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!");
-  unsigned NumRes = MCID.getNumDefs();
-  for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) {
-    if (Reg == *ImpDef)
-      break;
-    ++NumRes;
+  unsigned NumRes;
+  if (N->getOpcode() == ISD::CopyFromReg) {
+    // CopyFromReg has: "chain, Val, glue" so operand 1 gives the type.
+    NumRes = 1;
+  } else {
+    const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+    assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+    NumRes = MCID.getNumDefs();
+    for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+      if (Reg == *ImpDef)
+        break;
+      ++NumRes;
+    }
   }
-  return N->getValueType(NumRes);
+  return N->getSimpleValueType(NumRes);
 }
 
 /// CheckForLiveRegDef - Return true and update live register vector if the
@@ -1218,7 +1223,7 @@
     if (LiveRegDefs[*AliasI] == SU) continue;
 
     // Add Reg to the set of interfering live regs.
-    if (RegAdded.insert(*AliasI)) {
+    if (RegAdded.insert(*AliasI).second) {
       LRegs.push_back(*AliasI);
     }
   }
@@ -1235,7 +1240,7 @@
     if (!LiveRegDefs[i]) continue;
     if (LiveRegDefs[i] == SU) continue;
     if (!MachineOperand::clobbersPhysReg(RegMask, i)) continue;
-    if (RegAdded.insert(i))
+    if (RegAdded.insert(i).second)
       LRegs.push_back(i);
   }
 }
@@ -1310,7 +1315,8 @@
         SDNode *Gen = LiveRegGens[CallResource]->getNode();
         while (SDNode *Glued = Gen->getGluedNode())
           Gen = Glued;
-        if (!IsChainDependent(Gen, Node, 0, TII) && RegAdded.insert(CallResource))
+        if (!IsChainDependent(Gen, Node, 0, TII) &&
+            RegAdded.insert(CallResource).second)
           LRegs.push_back(CallResource);
       }
     }
@@ -1373,7 +1379,7 @@
       Interferences.push_back(CurSU);
     }
     else {
-      assert(CurSU->isPending && "Intereferences are pending");
+      assert(CurSU->isPending && "Interferences are pending");
       // Update the interference with current live regs.
       LRegsPair.first->second = LRegs;
     }
@@ -1439,7 +1445,7 @@
     assert(LRegs.size() == 1 && "Can't handle this yet!");
     unsigned Reg = LRegs[0];
     SUnit *LRDef = LiveRegDefs[Reg];
-    EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+    MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
     const TargetRegisterClass *RC =
       TRI->getMinimalPhysRegClass(Reg, VT);
     const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
@@ -1930,8 +1936,8 @@
     unsigned Id = RC->getID();
     unsigned RP = RegPressure[Id];
     if (!RP) continue;
-    DEBUG(dbgs() << RC->getName() << ": " << RP << " / " << RegLimit[Id]
-          << '\n');
+    DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / "
+          << RegLimit[Id] << '\n');
   }
 #endif
 }
@@ -2754,7 +2760,7 @@
     if (!SUImpDefs && !SURegMask)
       continue;
     for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
-      EVT VT = N->getValueType(i);
+      MVT VT = N->getSimpleValueType(i);
       if (VT == MVT::Glue || VT == MVT::Other)
         continue;
       if (!N->hasAnyUseOfValue(i))
@@ -2977,9 +2983,9 @@
 llvm::ScheduleDAGSDNodes *
 llvm::createBURRListDAGScheduler(SelectionDAGISel *IS,
                                  CodeGenOpt::Level OptLevel) {
-  const TargetMachine &TM = IS->TM;
-  const TargetInstrInfo *TII = TM.getInstrInfo();
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+  const TargetInstrInfo *TII = STI.getInstrInfo();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
 
   BURegReductionPriorityQueue *PQ =
     new BURegReductionPriorityQueue(*IS->MF, false, false, TII, TRI, nullptr);
@@ -2991,9 +2997,9 @@
 llvm::ScheduleDAGSDNodes *
 llvm::createSourceListDAGScheduler(SelectionDAGISel *IS,
                                    CodeGenOpt::Level OptLevel) {
-  const TargetMachine &TM = IS->TM;
-  const TargetInstrInfo *TII = TM.getInstrInfo();
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+  const TargetInstrInfo *TII = STI.getInstrInfo();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
 
   SrcRegReductionPriorityQueue *PQ =
     new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, nullptr);
@@ -3005,10 +3011,10 @@
 llvm::ScheduleDAGSDNodes *
 llvm::createHybridListDAGScheduler(SelectionDAGISel *IS,
                                    CodeGenOpt::Level OptLevel) {
-  const TargetMachine &TM = IS->TM;
-  const TargetInstrInfo *TII = TM.getInstrInfo();
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
-  const TargetLowering *TLI = IS->getTargetLowering();
+  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+  const TargetInstrInfo *TII = STI.getInstrInfo();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+  const TargetLowering *TLI = IS->TLI;
 
   HybridBURRPriorityQueue *PQ =
     new HybridBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
@@ -3021,10 +3027,10 @@
 llvm::ScheduleDAGSDNodes *
 llvm::createILPListDAGScheduler(SelectionDAGISel *IS,
                                 CodeGenOpt::Level OptLevel) {
-  const TargetMachine &TM = IS->TM;
-  const TargetInstrInfo *TII = TM.getInstrInfo();
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
-  const TargetLowering *TLI = IS->getTargetLowering();
+  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+  const TargetInstrInfo *TII = STI.getInstrInfo();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+  const TargetLowering *TLI = IS->TLI;
 
   ILPBURRPriorityQueue *PQ =
     new ILPBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);

diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index de910b7..8b9f618 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp

@@ -29,7 +29,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
@@ -38,17 +37,17 @@
 
 STATISTIC(LoadsClustered, "Number of loads clustered together");
 
-// This allows latency based scheduler to notice high latency instructions
-// without a target itinerary. The choise if number here has more to do with
-// balancing scheduler heursitics than with the actual machine latency.
+// This allows the latency-based scheduler to notice high latency instructions
+// without a target itinerary. The choice of number here has more to do with
+// balancing scheduler heuristics than with the actual machine latency.
 static cl::opt<int> HighLatencyCycles(
   "sched-high-latency-cycles", cl::Hidden, cl::init(10),
   cl::desc("Roughly estimate the number of cycles that 'long latency'"
            "instructions take for targets with no itinerary"));
 
 ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
-  : ScheduleDAG(mf), BB(nullptr), DAG(nullptr),
-    InstrItins(mf.getTarget().getInstrItineraryData()) {}
+    : ScheduleDAG(mf), BB(nullptr), DAG(nullptr),
+      InstrItins(mf.getSubtarget().getInstrItineraryData()) {}
 
 /// Run - perform scheduling.
 ///
@@ -120,15 +119,20 @@
     return;
 
   unsigned ResNo = User->getOperand(2).getResNo();
-  if (Def->isMachineOpcode()) {
+  if (Def->getOpcode() == ISD::CopyFromReg &&
+      cast<RegisterSDNode>(Def->getOperand(1))->getReg() == Reg) {
+    PhysReg = Reg;
+  } else if (Def->isMachineOpcode()) {
     const MCInstrDesc &II = TII->get(Def->getMachineOpcode());
     if (ResNo >= II.getNumDefs() &&
-        II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) {
+        II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg)
       PhysReg = Reg;
-      const TargetRegisterClass *RC =
-        TRI->getMinimalPhysRegClass(Reg, Def->getValueType(ResNo));
-      Cost = RC->getCopyCost();
-    }
+  }
+
+  if (PhysReg != 0) {
+    const TargetRegisterClass *RC =
+        TRI->getMinimalPhysRegClass(Reg, Def->getSimpleValueType(ResNo));
+    Cost = RC->getCopyCost();
   }
 }
 
@@ -136,7 +140,7 @@
 static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG,
                                 SmallVectorImpl<EVT> &VTs,
                                 SDValue ExtraOper = SDValue()) {
-  SmallVector<SDValue, 4> Ops;
+  SmallVector<SDValue, 8> Ops;
   for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
     Ops.push_back(N->getOperand(I));
 
@@ -226,7 +230,7 @@
   for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
        I != E && UseCount < 100; ++I, ++UseCount) {
     SDNode *User = *I;
-    if (User == Node || !Visited.insert(User))
+    if (User == Node || !Visited.insert(User).second)
       continue;
     int64_t Offset1, Offset2;
     if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) ||
@@ -339,7 +343,7 @@
 
     // Add all operands to the worklist unless they've already been added.
     for (unsigned i = 0, e = NI->getNumOperands(); i != e; ++i)
-      if (Visited.insert(NI->getOperand(i).getNode()))
+      if (Visited.insert(NI->getOperand(i).getNode()).second)
         Worklist.push_back(NI->getOperand(i).getNode());
 
     if (isPassiveNode(NI))  // Leaf node, e.g. a TargetImmediate.
@@ -425,7 +429,7 @@
 }
 
 void ScheduleDAGSDNodes::AddSchedEdges() {
-  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
 
   // Check to see if the scheduler cares about latencies.
   bool UnitLatencies = forceUnitLatencies();
@@ -733,7 +737,7 @@
                   SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders,
                   SmallSet<unsigned, 8> &Seen) {
   unsigned Order = N->getIROrder();
-  if (!Order || !Seen.insert(Order)) {
+  if (!Order || !Seen.insert(Order).second) {
     // Process any valid SDDbgValues even if node does not have any order
     // assigned.
     ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, 0);

diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
index 39ebadf..2cd1f4b 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SCHEDULEDAGSDNODES_H
-#define SCHEDULEDAGSDNODES_H
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H
 
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/ScheduleDAG.h"

diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
index 4589b0c..418b58e 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp

@@ -31,6 +31,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <climits>
 using namespace llvm;
 
@@ -71,10 +72,8 @@
                   AliasAnalysis *aa,
                   SchedulingPriorityQueue *availqueue)
     : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) {
-
-    const TargetMachine &tm = mf.getTarget();
-    HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(
-        tm.getSubtargetImpl(), this);
+    const TargetSubtargetInfo &STI = mf.getSubtarget();
+    HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this);
   }
 
   ~ScheduleDAGVLIW() {

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index daff1f2..7961e66 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

@@ -46,6 +46,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 #include <cmath>
 
@@ -95,7 +96,7 @@
 /// BUILD_VECTOR where all of the elements are ~0 or undef.
 bool ISD::isBuildVectorAllOnes(const SDNode *N) {
   // Look through a bit convert.
-  if (N->getOpcode() == ISD::BITCAST)
+  while (N->getOpcode() == ISD::BITCAST)
     N = N->getOperand(0).getNode();
 
   if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
@@ -143,7 +144,7 @@
 /// BUILD_VECTOR where all of the elements are 0 or undef.
 bool ISD::isBuildVectorAllZeros(const SDNode *N) {
   // Look through a bit convert.
-  if (N->getOpcode() == ISD::BITCAST)
+  while (N->getOpcode() == ISD::BITCAST)
     N = N->getOperand(0).getNode();
 
   if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
@@ -686,6 +687,15 @@
   DeallocateNode(N);
 }
 
+void SDDbgInfo::erase(const SDNode *Node) {
+  auto Entry = DbgValMap.find(Node);
+  if (Entry == DbgValMap.end())
+    return; // Nothing is recorded for this node.
+  for (auto &DbgVal : Entry->second)
+    DbgVal->setIsInvalidated(); // Flag each value attached to Node.
+  DbgValMap.erase(Entry);
+}
+
 void SelectionDAG::DeallocateNode(SDNode *N) {
   if (N->OperandsNeedDelete)
     delete[] N->OperandList;
@@ -696,10 +706,60 @@
 
   NodeAllocator.Deallocate(AllNodes.remove(N));
 
-  // If any of the SDDbgValue nodes refer to this SDNode, invalidate them.
-  ArrayRef<SDDbgValue*> DbgVals = DbgInfo->getSDDbgValues(N);
-  for (unsigned i = 0, e = DbgVals.size(); i != e; ++i)
-    DbgVals[i]->setIsInvalidated();
+  // If any of the SDDbgValue nodes refer to this SDNode, invalidate
+  // them and forget about that node.
+  DbgInfo->erase(N);
+}
+
+#ifndef NDEBUG
+/// VerifySDNode - Sanity check the given SDNode.  Aborts if it is invalid.
+static void VerifySDNode(SDNode *N) {
+  switch (N->getOpcode()) {
+  default:
+    break;
+  case ISD::BUILD_PAIR: {
+    EVT VT = N->getValueType(0);
+    assert(N->getNumValues() == 1 && "Too many results!");
+    assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
+           "Wrong return type!");
+    assert(N->getNumOperands() == 2 && "Wrong number of operands!");
+    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
+           "Mismatched operand types!");
+    assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
+           "Wrong operand type!");
+    assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
+           "Wrong return type size");
+    break;
+  }
+  case ISD::BUILD_VECTOR: {
+    assert(N->getNumValues() == 1 && "Too many results!");
+    assert(N->getValueType(0).isVector() && "Wrong return type!");
+    assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
+           "Wrong number of operands!");
+    EVT EltVT = N->getValueType(0).getVectorElementType();
+    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
+      assert((I->getValueType() == EltVT ||
+             (EltVT.isInteger() && I->getValueType().isInteger() &&
+              EltVT.bitsLE(I->getValueType()))) &&
+            "Wrong operand type!");
+      assert(I->getValueType() == N->getOperand(0).getValueType() &&
+             "Operands must all have the same type");
+    }
+    break;
+  }
+  }
+}
+#endif // NDEBUG
+
+/// \brief Insert a newly allocated node into the DAG.
+///
+/// Handles insertion into the all nodes list and CSE map, as well as
+/// verification and other common operations when a new node is allocated.
+void SelectionDAG::InsertNode(SDNode *N) {
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifySDNode(N);
+#endif
 }
 
 /// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
@@ -839,83 +899,6 @@
   return Node;
 }
 
-#ifndef NDEBUG
-/// VerifyNodeCommon - Sanity check the given node.  Aborts if it is invalid.
-static void VerifyNodeCommon(SDNode *N) {
-  switch (N->getOpcode()) {
-  default:
-    break;
-  case ISD::BUILD_PAIR: {
-    EVT VT = N->getValueType(0);
-    assert(N->getNumValues() == 1 && "Too many results!");
-    assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
-           "Wrong return type!");
-    assert(N->getNumOperands() == 2 && "Wrong number of operands!");
-    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
-           "Mismatched operand types!");
-    assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
-           "Wrong operand type!");
-    assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
-           "Wrong return type size");
-    break;
-  }
-  case ISD::BUILD_VECTOR: {
-    assert(N->getNumValues() == 1 && "Too many results!");
-    assert(N->getValueType(0).isVector() && "Wrong return type!");
-    assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
-           "Wrong number of operands!");
-    EVT EltVT = N->getValueType(0).getVectorElementType();
-    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
-      assert((I->getValueType() == EltVT ||
-             (EltVT.isInteger() && I->getValueType().isInteger() &&
-              EltVT.bitsLE(I->getValueType()))) &&
-            "Wrong operand type!");
-      assert(I->getValueType() == N->getOperand(0).getValueType() &&
-             "Operands must all have the same type");
-    }
-    break;
-  }
-  }
-}
-
-/// VerifySDNode - Sanity check the given SDNode.  Aborts if it is invalid.
-static void VerifySDNode(SDNode *N) {
-  // The SDNode allocators cannot be used to allocate nodes with fields that are
-  // not present in an SDNode!
-  assert(!isa<MemSDNode>(N) && "Bad MemSDNode!");
-  assert(!isa<ShuffleVectorSDNode>(N) && "Bad ShuffleVectorSDNode!");
-  assert(!isa<ConstantSDNode>(N) && "Bad ConstantSDNode!");
-  assert(!isa<ConstantFPSDNode>(N) && "Bad ConstantFPSDNode!");
-  assert(!isa<GlobalAddressSDNode>(N) && "Bad GlobalAddressSDNode!");
-  assert(!isa<FrameIndexSDNode>(N) && "Bad FrameIndexSDNode!");
-  assert(!isa<JumpTableSDNode>(N) && "Bad JumpTableSDNode!");
-  assert(!isa<ConstantPoolSDNode>(N) && "Bad ConstantPoolSDNode!");
-  assert(!isa<BasicBlockSDNode>(N) && "Bad BasicBlockSDNode!");
-  assert(!isa<SrcValueSDNode>(N) && "Bad SrcValueSDNode!");
-  assert(!isa<MDNodeSDNode>(N) && "Bad MDNodeSDNode!");
-  assert(!isa<RegisterSDNode>(N) && "Bad RegisterSDNode!");
-  assert(!isa<BlockAddressSDNode>(N) && "Bad BlockAddressSDNode!");
-  assert(!isa<EHLabelSDNode>(N) && "Bad EHLabelSDNode!");
-  assert(!isa<ExternalSymbolSDNode>(N) && "Bad ExternalSymbolSDNode!");
-  assert(!isa<CondCodeSDNode>(N) && "Bad CondCodeSDNode!");
-  assert(!isa<CvtRndSatSDNode>(N) && "Bad CvtRndSatSDNode!");
-  assert(!isa<VTSDNode>(N) && "Bad VTSDNode!");
-  assert(!isa<MachineSDNode>(N) && "Bad MachineSDNode!");
-
-  VerifyNodeCommon(N);
-}
-
-/// VerifyMachineNode - Sanity check the given MachineNode.  Aborts if it is
-/// invalid.
-static void VerifyMachineNode(SDNode *N) {
-  // The MachineNode allocators cannot be used to allocate nodes with fields
-  // that are not present in a MachineNode!
-  // Currently there are no such nodes.
-
-  VerifyNodeCommon(N);
-}
-#endif // NDEBUG
-
 /// getEVTAlignment - Compute the default alignment value for the
 /// given type.
 ///
@@ -924,22 +907,23 @@
                    PointerType::get(Type::getInt8Ty(*getContext()), 0) :
                    VT.getTypeForEVT(*getContext());
 
-  return TM.getTargetLowering()->getDataLayout()->getABITypeAlignment(Ty);
+  return TLI->getDataLayout()->getABITypeAlignment(Ty);
 }
 
 // EntryNode could meaningfully have debug info if we can find it...
 SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
-  : TM(tm), TSI(*tm.getSelectionDAGInfo()), TLI(nullptr), OptLevel(OL),
-    EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
-    Root(getEntryNode()), NewNodesMustHaveLegalTypes(false),
-    UpdateListeners(nullptr) {
+    : TM(tm), TSI(nullptr), TLI(nullptr), OptLevel(OL),
+      EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
+      Root(getEntryNode()), NewNodesMustHaveLegalTypes(false),
+      UpdateListeners(nullptr) {
   AllNodes.push_back(&EntryNode);
   DbgInfo = new SDDbgInfo();
 }
 
-void SelectionDAG::init(MachineFunction &mf, const TargetLowering *tli) {
+void SelectionDAG::init(MachineFunction &mf) {
   MF = &mf;
-  TLI = tli;
+  TLI = getSubtarget().getTargetLowering();
+  TSI = getSubtarget().getSelectionDAGInfo();
   Context = &mf.getFunction()->getContext();
 }
 
@@ -1108,8 +1092,6 @@
   EVT EltVT = VT.getScalarType();
   const ConstantInt *Elt = &Val;
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-
   // In some cases the vector type is legal but the element type is illegal and
   // needs to be promoted, for example v8i8 on ARM.  In this case, promote the
   // inserted value (the type does not need to match the vector element type).
@@ -1185,7 +1167,7 @@
   if (!N) {
     N = new (NodeAllocator) ConstantSDNode(isT, isO, Elt, EltVT);
     CSEMap.InsertNode(N, IP);
-    AllNodes.push_back(N);
+    InsertNode(N);
   }
 
   SDValue Result(N, 0);
@@ -1198,7 +1180,7 @@
 }
 
 SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, bool isTarget) {
-  return getConstant(Val, TM.getTargetLowering()->getPointerTy(), isTarget);
+  return getConstant(Val, TLI->getPointerTy(), isTarget);
 }
 
 
@@ -1227,7 +1209,7 @@
   if (!N) {
     N = new (NodeAllocator) ConstantFPSDNode(isTarget, &V, EltVT);
     CSEMap.InsertNode(N, IP);
-    AllNodes.push_back(N);
+    InsertNode(N);
   }
 
   SDValue Result(N, 0);
@@ -1263,7 +1245,6 @@
                                        unsigned char TargetFlags) {
   assert((TargetFlags == 0 || isTargetGA) &&
          "Cannot set target flags on target-independent globals");
-  const TargetLowering *TLI = TM.getTargetLowering();
 
   // Truncate (with sign-extension) the offset value to the pointer size.
   unsigned BitWidth = TLI->getPointerTypeSizeInBits(GV->getType());
@@ -1290,7 +1271,7 @@
                                                       DL.getDebugLoc(), GV, VT,
                                                       Offset, TargetFlags);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1305,7 +1286,7 @@
 
   SDNode *N = new (NodeAllocator) FrameIndexSDNode(FI, VT, isTarget);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1325,7 +1306,7 @@
   SDNode *N = new (NodeAllocator) JumpTableSDNode(JTI, VT, isTarget,
                                                   TargetFlags);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1336,8 +1317,7 @@
   assert((TargetFlags == 0 || isTarget) &&
          "Cannot set target flags on target-independent globals");
   if (Alignment == 0)
-    Alignment =
-    TM.getTargetLowering()->getDataLayout()->getPrefTypeAlignment(C->getType());
+    Alignment = TLI->getDataLayout()->getPrefTypeAlignment(C->getType());
   unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, Opc, getVTList(VT), None);
@@ -1352,7 +1332,7 @@
   SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset,
                                                      Alignment, TargetFlags);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1364,8 +1344,7 @@
   assert((TargetFlags == 0 || isTarget) &&
          "Cannot set target flags on target-independent globals");
   if (Alignment == 0)
-    Alignment =
-    TM.getTargetLowering()->getDataLayout()->getPrefTypeAlignment(C->getType());
+    Alignment = TLI->getDataLayout()->getPrefTypeAlignment(C->getType());
   unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, Opc, getVTList(VT), None);
@@ -1380,7 +1359,7 @@
   SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset,
                                                      Alignment, TargetFlags);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1398,7 +1377,7 @@
   SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset,
                                                     TargetFlags);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1412,7 +1391,7 @@
 
   SDNode *N = new (NodeAllocator) BasicBlockSDNode(MBB);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1426,7 +1405,7 @@
 
   if (N) return SDValue(N, 0);
   N = new (NodeAllocator) VTSDNode(VT);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1434,7 +1413,7 @@
   SDNode *&N = ExternalSymbols[Sym];
   if (N) return SDValue(N, 0);
   N = new (NodeAllocator) ExternalSymbolSDNode(false, Sym, 0, VT);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1445,7 +1424,7 @@
                                                                TargetFlags)];
   if (N) return SDValue(N, 0);
   N = new (NodeAllocator) ExternalSymbolSDNode(true, Sym, TargetFlags, VT);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1456,7 +1435,7 @@
   if (!CondCodeNodes[Cond]) {
     CondCodeSDNode *N = new (NodeAllocator) CondCodeSDNode(Cond);
     CondCodeNodes[Cond] = N;
-    AllNodes.push_back(N);
+    InsertNode(N);
   }
 
   return SDValue(CondCodeNodes[Cond], 0);
@@ -1594,10 +1573,31 @@
                                             dl.getDebugLoc(), N1, N2,
                                             MaskAlloc);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
+SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
+  // Build the shuffle with its two inputs swapped: every non-negative mask
+  // index moves to the other input's half; negative entries pass through.
+  MVT VT = SV.getSimpleValueType(0);
+  const int NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> MaskVec;
+
+  for (int Lane = 0; Lane != NumElems; ++Lane) {
+    int Idx = SV.getMaskElt(Lane);
+    if (Idx < 0) {
+      MaskVec.push_back(Idx);
+      continue;
+    }
+    MaskVec.push_back(Idx < NumElems ? Idx + NumElems : Idx - NumElems);
+  }
+
+  SDValue Op0 = SV.getOperand(0);
+  SDValue Op1 = SV.getOperand(1);
+  return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, &MaskVec[0]);
+}
+
 SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
                                        SDValue Val, SDValue DTy,
                                        SDValue STy, SDValue Rnd, SDValue Sat,
@@ -1619,7 +1619,7 @@
                                                            dl.getDebugLoc(),
                                                            Ops, Code);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1633,7 +1633,7 @@
 
   SDNode *N = new (NodeAllocator) RegisterSDNode(RegNo, VT);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1647,7 +1647,7 @@
 
   SDNode *N = new (NodeAllocator) RegisterMaskSDNode(RegMask);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1663,7 +1663,7 @@
   SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(),
                                                 dl.getDebugLoc(), Root, Label);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1686,7 +1686,7 @@
   SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, Offset,
                                                      TargetFlags);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1704,7 +1704,7 @@
 
   SDNode *N = new (NodeAllocator) SrcValueSDNode(V);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1720,7 +1720,7 @@
 
   SDNode *N = new (NodeAllocator) MDNodeSDNode(MD);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1741,7 +1741,7 @@
                                                       dl.getDebugLoc(),
                                                       VT, Ptr, SrcAS, DestAS);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -1749,7 +1749,7 @@
 /// the target's desired shift amount type.
 SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
   EVT OpTy = Op.getValueType();
-  EVT ShTy = TM.getTargetLowering()->getShiftAmountTy(LHSTy);
+  EVT ShTy = TLI->getShiftAmountTy(LHSTy);
   if (OpTy == ShTy || OpTy.isVector()) return Op;
 
   ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ?  ISD::TRUNCATE : ISD::ZERO_EXTEND;
@@ -1762,7 +1762,6 @@
   MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
   unsigned ByteSize = VT.getStoreSize();
   Type *Ty = VT.getTypeForEVT(*getContext());
-  const TargetLowering *TLI = TM.getTargetLowering();
   unsigned StackAlign =
   std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty), minAlign);
 
@@ -1777,7 +1776,6 @@
                             VT2.getStoreSizeInBits())/8;
   Type *Ty1 = VT1.getTypeForEVT(*getContext());
   Type *Ty2 = VT2.getTypeForEVT(*getContext());
-  const TargetLowering *TLI = TM.getTargetLowering();
   const DataLayout *TD = TLI->getDataLayout();
   unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1),
                             TD->getPrefTypeAlignment(Ty2));
@@ -1796,7 +1794,6 @@
   case ISD::SETFALSE2: return getConstant(0, VT);
   case ISD::SETTRUE:
   case ISD::SETTRUE2: {
-    const TargetLowering *TLI = TM.getTargetLowering();
     TargetLowering::BooleanContent Cnt =
         TLI->getBooleanContents(N1->getValueType(0));
     return getConstant(
@@ -1885,7 +1882,7 @@
       // Ensure that the constant occurs on the RHS.
       ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
       MVT CompVT = N1.getValueType().getSimpleVT();
-      if (!TM.getTargetLowering()->isCondCodeLegal(SwappedCond, CompVT))
+      if (!TLI->isCondCodeLegal(SwappedCond, CompVT))
         return SDValue();
 
       return getSetCC(dl, VT, N2, N1, SwappedCond);
@@ -1921,7 +1918,6 @@
 /// them in the KnownZero/KnownOne bitsets.
 void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
                                     APInt &KnownOne, unsigned Depth) const {
-  const TargetLowering *TLI = TM.getTargetLowering();
   unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
 
   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
@@ -2357,7 +2353,6 @@
 /// information.  For example, immediately after an "SRA X, 2", we know that
 /// the top 3 bits are all equal to each other, so we return 3.
 unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
-  const TargetLowering *TLI = TM.getTargetLowering();
   EVT VT = Op.getValueType();
   assert(VT.isInteger() && "Invalid VT!");
   unsigned VTBits = VT.getScalarType().getSizeInBits();
@@ -2655,10 +2650,7 @@
                                          DL.getDebugLoc(), getVTList(VT));
   CSEMap.InsertNode(N, IP);
 
-  AllNodes.push_back(N);
-#ifndef NDEBUG
-  VerifySDNode(N);
-#endif
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -2753,7 +2745,7 @@
     case ISD::FP_TO_UINT: {
       integerPart x[2];
       bool ignored;
-      assert(integerPartWidth >= 64);
+      static_assert(integerPartWidth >= 64, "APFloat parts too small!");
       // FIXME need to be more flexible about rounding mode.
       APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
                             Opcode==ISD::FP_TO_SINT,
@@ -2772,6 +2764,31 @@
     }
   }
 
+  // Constant fold unary operations with a vector integer operand.
+  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Operand.getNode())) {
+    if (BV->isConstant()) {
+      switch (Opcode) {
+      default:
+        // FIXME: Entirely reasonable to perform folding of other unary
+        // operations here as the need arises.
+        break;
+      case ISD::UINT_TO_FP:
+      case ISD::SINT_TO_FP: {
+        SmallVector<SDValue, 8> Ops;
+        for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+          SDValue OpN = BV->getOperand(i);
+          // Let the above scalar folding handle the conversion of each
+          // element. Reuse Opcode so UINT_TO_FP is not folded as signed.
+          OpN = getNode(Opcode, DL, VT.getVectorElementType(), OpN);
+          Ops.push_back(OpN);
+        }
+        // Reassemble the folded scalars into a vector of the result type.
+        return getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
+      }
+      }
+    }
+  }
+
   unsigned OpOpcode = Operand.getNode()->getOpcode();
   switch (Opcode) {
   case ISD::TokenFactor:
@@ -2931,10 +2948,7 @@
                                         DL.getDebugLoc(), VTs, Operand);
   }
 
-  AllNodes.push_back(N);
-#ifndef NDEBUG
-  VerifySDNode(N);
-#endif
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -3375,6 +3389,7 @@
   }
 
   // Constant fold FP operations.
+  bool HasFPExceptions = TLI->hasFloatingPointExceptions();
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1.getNode());
   ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode());
   if (N1CFP) {
@@ -3388,28 +3403,32 @@
       switch (Opcode) {
       case ISD::FADD:
         s = V1.add(V2, APFloat::rmNearestTiesToEven);
-        if (s != APFloat::opInvalidOp)
+        if (!HasFPExceptions || s != APFloat::opInvalidOp)
           return getConstantFP(V1, VT);
         break;
       case ISD::FSUB:
         s = V1.subtract(V2, APFloat::rmNearestTiesToEven);
-        if (s!=APFloat::opInvalidOp)
+        if (!HasFPExceptions || s!=APFloat::opInvalidOp)
           return getConstantFP(V1, VT);
         break;
       case ISD::FMUL:
         s = V1.multiply(V2, APFloat::rmNearestTiesToEven);
-        if (s!=APFloat::opInvalidOp)
+        if (!HasFPExceptions || s!=APFloat::opInvalidOp)
           return getConstantFP(V1, VT);
         break;
       case ISD::FDIV:
         s = V1.divide(V2, APFloat::rmNearestTiesToEven);
-        if (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)
+        if (!HasFPExceptions || (s!=APFloat::opInvalidOp &&
+                                 s!=APFloat::opDivByZero)) {
           return getConstantFP(V1, VT);
+        }
         break;
       case ISD::FREM :
         s = V1.mod(V2, APFloat::rmNearestTiesToEven);
-        if (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)
+        if (!HasFPExceptions || (s!=APFloat::opInvalidOp &&
+                                 s!=APFloat::opDivByZero)) {
           return getConstantFP(V1, VT);
+        }
         break;
       case ISD::FCOPYSIGN:
         V1.copySign(V2);
@@ -3526,10 +3545,7 @@
     N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, nuw, nsw, exact);
   }
 
-  AllNodes.push_back(N);
-#ifndef NDEBUG
-  VerifySDNode(N);
-#endif
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -3633,10 +3649,7 @@
                                           DL.getDebugLoc(), VTs, N1, N2, N3);
   }
 
-  AllNodes.push_back(N);
-#ifndef NDEBUG
-  VerifySDNode(N);
-#endif
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -3802,7 +3815,7 @@
   if (VT == MVT::Other) {
     unsigned AS = 0;
     if (DstAlign >= TLI.getDataLayout()->getPointerPrefAlignment(AS) ||
-        TLI.allowsUnalignedMemoryAccesses(VT, AS)) {
+        TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign)) {
       VT = TLI.getPointerTy();
     } else {
       switch (DstAlign & 7) {
@@ -3862,7 +3875,7 @@
       unsigned AS = 0;
       if (NumMemOps && AllowOverlap &&
           VTSize >= 8 && NewVTSize < Size &&
-          TLI.allowsUnalignedMemoryAccesses(VT, AS, &Fast) && Fast)
+          TLI.allowsMisalignedMemoryAccesses(VT, AS, DstAlign, &Fast) && Fast)
         VTSize = Size;
       else {
         VT = NewVT;
@@ -3926,7 +3939,7 @@
 
     // Don't promote to an alignment that would require dynamic stack
     // realignment.
-    const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
     if (!TRI->needsStackRealignment(MF))
        while (NewAlign > Align &&
              TLI.getDataLayout()->exceedsNaturalStackAlignment(NewAlign))
@@ -3982,7 +3995,7 @@
       Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
                              getMemBasePlusOffset(Src, SrcOff, dl, DAG),
                              SrcPtrInfo.getWithOffset(SrcOff), VT, isVol, false,
-                             MinAlign(SrcAlign, SrcOff));
+                             false, MinAlign(SrcAlign, SrcOff));
       Store = DAG.getTruncStore(Chain, dl, Value,
                                 getMemBasePlusOffset(Dst, DstOff, dl, DAG),
                                 DstPtrInfo.getWithOffset(DstOff), VT, isVol,
@@ -4202,9 +4215,8 @@
   // Then check to see if we should lower the memcpy with target-specific
   // code. If the target chooses to do this, this is the next best.
   SDValue Result =
-    TSI.EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align,
-                                isVol, AlwaysInline,
-                                DstPtrInfo, SrcPtrInfo);
+      TSI->EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align,
+                                   isVol, AlwaysInline, DstPtrInfo, SrcPtrInfo);
   if (Result.getNode())
     return Result;
 
@@ -4223,8 +4235,6 @@
   // beyond the given memory regions. But fixing this isn't easy, and most
   // people don't care.
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-
   // Emit a library call.
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
@@ -4270,17 +4280,14 @@
 
   // Then check to see if we should lower the memmove with target-specific
   // code. If the target chooses to do this, this is the next best.
-  SDValue Result =
-    TSI.EmitTargetCodeForMemmove(*this, dl, Chain, Dst, Src, Size, Align, isVol,
-                                 DstPtrInfo, SrcPtrInfo);
+  SDValue Result = TSI->EmitTargetCodeForMemmove(
+      *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
   if (Result.getNode())
     return Result;
 
   // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
   // not be safe.  See memcpy above for more details.
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-
   // Emit a library call.
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
@@ -4325,31 +4332,22 @@
 
   // Then check to see if we should lower the memset with target-specific
   // code. If the target chooses to do this, this is the next best.
-  SDValue Result =
-    TSI.EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src, Size, Align, isVol,
-                                DstPtrInfo);
+  SDValue Result = TSI->EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src,
+                                                Size, Align, isVol, DstPtrInfo);
   if (Result.getNode())
     return Result;
 
   // Emit a library call.
-  const TargetLowering *TLI = TM.getTargetLowering();
   Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(*getContext());
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
   Entry.Node = Dst; Entry.Ty = IntPtrTy;
   Args.push_back(Entry);
-  // Extend or truncate the argument to be an i32 value for the call.
-  if (Src.getValueType().bitsGT(MVT::i32))
-    Src = getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
-  else
-    Src = getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
   Entry.Node = Src;
-  Entry.Ty = Type::getInt32Ty(*getContext());
-  Entry.isSExt = true;
+  Entry.Ty = Src.getValueType().getTypeForEVT(*getContext());
   Args.push_back(Entry);
   Entry.Node = Size;
   Entry.Ty = IntPtrTy;
-  Entry.isSExt = false;
   Args.push_back(Entry);
 
   // FIXME: pass in SDLoc
@@ -4396,7 +4394,7 @@
                                                SuccessOrdering, FailureOrdering,
                                                SynchScope);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -4541,7 +4539,7 @@
                                   ArrayRef<SDValue> Ops,
                                   EVT MemVT, MachinePointerInfo PtrInfo,
                                   unsigned Align, bool Vol,
-                                  bool ReadMem, bool WriteMem) {
+                                  bool ReadMem, bool WriteMem, unsigned Size) {
   if (Align == 0)  // Ensure that codegen never sees alignment 0
     Align = getEVTAlignment(MemVT);
 
@@ -4553,8 +4551,10 @@
     Flags |= MachineMemOperand::MOLoad;
   if (Vol)
     Flags |= MachineMemOperand::MOVolatile;
+  if (!Size)
+    Size = MemVT.getStoreSize();
   MachineMemOperand *MMO =
-    MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Align);
+    MF.getMachineMemOperand(PtrInfo, Flags, Size, Align);
 
   return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
 }
@@ -4593,7 +4593,7 @@
                                                dl.getDebugLoc(), VTList, Ops,
                                                MemVT, MMO);
   }
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -4637,7 +4637,7 @@
                       SDValue Ptr, SDValue Offset,
                       MachinePointerInfo PtrInfo, EVT MemVT,
                       bool isVolatile, bool isNonTemporal, bool isInvariant,
-                      unsigned Alignment, const MDNode *TBAAInfo,
+                      unsigned Alignment, const AAMDNodes &AAInfo,
                       const MDNode *Ranges) {
   assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");
@@ -4660,7 +4660,7 @@
   MachineFunction &MF = getMachineFunction();
   MachineMemOperand *MMO =
     MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment,
-                            TBAAInfo, Ranges);
+                            AAInfo, Ranges);
   return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
 }
 
@@ -4709,7 +4709,7 @@
                                              dl.getDebugLoc(), VTs, AM, ExtType,
                                              MemVT, MMO);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -4718,12 +4718,12 @@
                               MachinePointerInfo PtrInfo,
                               bool isVolatile, bool isNonTemporal,
                               bool isInvariant, unsigned Alignment,
-                              const MDNode *TBAAInfo,
+                              const AAMDNodes &AAInfo,
                               const MDNode *Ranges) {
   SDValue Undef = getUNDEF(Ptr.getValueType());
   return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
                  PtrInfo, VT, isVolatile, isNonTemporal, isInvariant, Alignment,
-                 TBAAInfo, Ranges);
+                 AAInfo, Ranges);
 }
 
 SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl,
@@ -4738,11 +4738,12 @@
                                  SDValue Chain, SDValue Ptr,
                                  MachinePointerInfo PtrInfo, EVT MemVT,
                                  bool isVolatile, bool isNonTemporal,
-                                 unsigned Alignment, const MDNode *TBAAInfo) {
+                                 bool isInvariant, unsigned Alignment,
+                                 const AAMDNodes &AAInfo) {
   SDValue Undef = getUNDEF(Ptr.getValueType());
   return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
-                 PtrInfo, MemVT, isVolatile, isNonTemporal, false, Alignment,
-                 TBAAInfo);
+                 PtrInfo, MemVT, isVolatile, isNonTemporal, isInvariant,
+                 Alignment, AAInfo);
 }
 
 
@@ -4769,7 +4770,7 @@
 SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val,
                                SDValue Ptr, MachinePointerInfo PtrInfo,
                                bool isVolatile, bool isNonTemporal,
-                               unsigned Alignment, const MDNode *TBAAInfo) {
+                               unsigned Alignment, const AAMDNodes &AAInfo) {
   assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");
   if (Alignment == 0)  // Ensure that codegen never sees alignment 0
@@ -4788,7 +4789,7 @@
   MachineMemOperand *MMO =
     MF.getMachineMemOperand(PtrInfo, Flags,
                             Val.getValueType().getStoreSize(), Alignment,
-                            TBAAInfo);
+                            AAInfo);
 
   return getStore(Chain, dl, Val, Ptr, MMO);
 }
@@ -4816,7 +4817,7 @@
                                               dl.getDebugLoc(), VTs,
                                               ISD::UNINDEXED, false, VT, MMO);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -4824,7 +4825,7 @@
                                     SDValue Ptr, MachinePointerInfo PtrInfo,
                                     EVT SVT,bool isVolatile, bool isNonTemporal,
                                     unsigned Alignment,
-                                    const MDNode *TBAAInfo) {
+                                    const AAMDNodes &AAInfo) {
   assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");
   if (Alignment == 0)  // Ensure that codegen never sees alignment 0
@@ -4842,7 +4843,7 @@
   MachineFunction &MF = getMachineFunction();
   MachineMemOperand *MMO =
     MF.getMachineMemOperand(PtrInfo, Flags, SVT.getStoreSize(), Alignment,
-                            TBAAInfo);
+                            AAInfo);
 
   return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
 }
@@ -4885,7 +4886,7 @@
                                               dl.getDebugLoc(), VTs,
                                               ISD::UNINDEXED, true, SVT, MMO);
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -4912,7 +4913,7 @@
                                               ST->getMemoryVT(),
                                               ST->getMemOperand());
   CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -4991,10 +4992,7 @@
                                    VTs, Ops);
   }
 
-  AllNodes.push_back(N);
-#ifndef NDEBUG
-  VerifySDNode(N);
-#endif
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
@@ -5074,15 +5072,12 @@
                                      VTList, Ops);
     }
   }
-  AllNodes.push_back(N);
-#ifndef NDEBUG
-  VerifySDNode(N);
-#endif
+  InsertNode(N);
   return SDValue(N, 0);
 }
 
 SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList) {
-  return getNode(Opcode, DL, VTList, ArrayRef<SDValue>());
+  return getNode(Opcode, DL, VTList, None);
 }
 
 SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
@@ -5464,6 +5459,10 @@
 /// node, and because it doesn't require CSE recalculation for any of
 /// the node's users.
 ///
+/// However, note that MorphNodeTo recursively deletes dead nodes from the DAG.
+/// As a consequence it isn't appropriate to use from within the DAG combiner or
+/// the legalizer which maintain worklists that would need to be updated when
+/// deleting things.
 SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
                                   SDVTList VTs, ArrayRef<SDValue> Ops) {
   unsigned NumOps = Ops.size();
@@ -5530,10 +5529,9 @@
   // new operands.
   if (!DeadNodeSet.empty()) {
     SmallVector<SDNode *, 16> DeadNodes;
-    for (SmallPtrSet<SDNode *, 16>::iterator I = DeadNodeSet.begin(),
-         E = DeadNodeSet.end(); I != E; ++I)
-      if ((*I)->use_empty())
-        DeadNodes.push_back(*I);
+    for (SDNode *N : DeadNodeSet)
+      if (N->use_empty())
+        DeadNodes.push_back(N);
     RemoveDeadNodes(DeadNodes);
   }
 
@@ -5702,10 +5700,7 @@
   if (DoCSE)
     CSEMap.InsertNode(N, IP);
 
-  AllNodes.push_back(N);
-#ifndef NDEBUG
-  VerifyMachineNode(N);
-#endif
+  InsertNode(N);
   return N;
 }
 
@@ -5751,26 +5746,24 @@
 /// getDbgValue - Creates a SDDbgValue node.
 ///
 /// SDNode
-SDDbgValue *
-SelectionDAG::getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R,
-			  bool IsIndirect, uint64_t Off,
-                          DebugLoc DL, unsigned O) {
-  return new (Allocator) SDDbgValue(MDPtr, N, R, IsIndirect, Off, DL, O);
+SDDbgValue *SelectionDAG::getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N,
+                                      unsigned R, bool IsIndirect, uint64_t Off,
+                                      DebugLoc DL, unsigned O) {
+  return new (Allocator) SDDbgValue(Var, Expr, N, R, IsIndirect, Off, DL, O);
 }
 
 /// Constant
-SDDbgValue *
-SelectionDAG::getConstantDbgValue(MDNode *MDPtr, const Value *C,
-				  uint64_t Off,
-				  DebugLoc DL, unsigned O) {
-  return new (Allocator) SDDbgValue(MDPtr, C, Off, DL, O);
+SDDbgValue *SelectionDAG::getConstantDbgValue(MDNode *Var, MDNode *Expr,
+                                              const Value *C, uint64_t Off,
+                                              DebugLoc DL, unsigned O) {
+  return new (Allocator) SDDbgValue(Var, Expr, C, Off, DL, O);
 }
 
 /// FrameIndex
-SDDbgValue *
-SelectionDAG::getFrameIndexDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off,
-				    DebugLoc DL, unsigned O) {
-  return new (Allocator) SDDbgValue(MDPtr, FI, Off, DL, O);
+SDDbgValue *SelectionDAG::getFrameIndexDbgValue(MDNode *Var, MDNode *Expr,
+                                                unsigned FI, uint64_t Off,
+                                                DebugLoc DL, unsigned O) {
+  return new (Allocator) SDDbgValue(Var, Expr, FI, Off, DL, O);
 }
 
 namespace {
@@ -6159,9 +6152,11 @@
 /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
 /// value is produced by SD.
 void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
-  DbgInfo->add(DB, SD, isParameter);
-  if (SD)
+  if (SD) {
+    assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue());
     SD->setHasDebugValue(true);
+  }
+  DbgInfo->add(DB, SD, isParameter);
 }
 
 /// TransferDbgValues - Transfer SDDbgValues.
@@ -6176,10 +6171,10 @@
        I != E; ++I) {
     SDDbgValue *Dbg = *I;
     if (Dbg->getKind() == SDDbgValue::SDNODE) {
-      SDDbgValue *Clone = getDbgValue(Dbg->getMDPtr(), ToNode, To.getResNo(),
-				      Dbg->isIndirect(),
-                                      Dbg->getOffset(), Dbg->getDebugLoc(),
-                                      Dbg->getOrder());
+      SDDbgValue *Clone =
+          getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode,
+                      To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(),
+                      Dbg->getDebugLoc(), Dbg->getOrder());
       ClonedDVs.push_back(Clone);
     }
   }
@@ -6217,7 +6212,10 @@
   assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
   assert(isNonTemporal() == MMO->isNonTemporal() &&
          "Non-temporal encoding error!");
-  assert(memvt.getStoreSize() == MMO->getSize() && "Size mismatch!");
+  // We check here that the size of the memory operand fits within the size of
+  // the MMO. This is because the MMO might indicate only a possible address
+  // range instead of specifying the affected memory addresses precisely.
+  assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
 }
 
 MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
@@ -6227,7 +6225,7 @@
   SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
                                       MMO->isNonTemporal(), MMO->isInvariant());
   assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
-  assert(memvt.getStoreSize() == MMO->getSize() && "Size mismatch!");
+  assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
 }
 
 /// Profile - Gather unique data for the node.
@@ -6371,7 +6369,7 @@
 
 bool
 SDNode::hasPredecessorHelper(const SDNode *N,
-                             SmallPtrSet<const SDNode *, 32> &Visited,
+                             SmallPtrSetImpl<const SDNode *> &Visited,
                              SmallVectorImpl<const SDNode *> &Worklist) const {
   if (Visited.empty()) {
     Worklist.push_back(this);
@@ -6387,7 +6385,7 @@
     const SDNode *M = Worklist.pop_back_val();
     for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) {
       SDNode *Op = M->getOperand(i).getNode();
-      if (Visited.insert(Op))
+      if (Visited.insert(Op).second)
         Worklist.push_back(Op);
       if (Op == N)
         return true;
@@ -6427,7 +6425,6 @@
       EVT OperandVT = Operand.getValueType();
       if (OperandVT.isVector()) {
         // A vector operand; extract a single element.
-        const TargetLowering *TLI = TM.getTargetLowering();
         EVT OperandEltVT = OperandVT.getVectorElementType();
         Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                               OperandEltVT,
@@ -6507,7 +6504,6 @@
   const GlobalValue *GV2 = nullptr;
   int64_t Offset1 = 0;
   int64_t Offset2 = 0;
-  const TargetLowering *TLI = TM.getTargetLowering();
   bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1);
   bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
   if (isGA1 && isGA2 && GV1 == GV2)
@@ -6522,7 +6518,6 @@
   // If this is a GlobalAddress + cst, return the alignment.
   const GlobalValue *GV;
   int64_t GVOffset = 0;
-  const TargetLowering *TLI = TM.getTargetLowering();
   if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
     unsigned PtrWidth = TLI->getPointerTypeSizeInBits(GV->getType());
     APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0);
@@ -6749,8 +6744,8 @@
 
 #ifndef NDEBUG
 static void checkForCyclesHelper(const SDNode *N,
-                                 SmallPtrSet<const SDNode*, 32> &Visited,
-                                 SmallPtrSet<const SDNode*, 32> &Checked,
+                                 SmallPtrSetImpl<const SDNode*> &Visited,
+                                 SmallPtrSetImpl<const SDNode*> &Checked,
                                  const llvm::SelectionDAG *DAG) {
   // If this node has already been checked, don't check it again.
   if (Checked.count(N))
@@ -6758,7 +6753,7 @@
 
   // If a node has already been visited on this depth-first walk, reject it as
   // a cycle.
-  if (!Visited.insert(N)) {
+  if (!Visited.insert(N).second) {
     errs() << "Detected cycle in SelectionDAG\n";
     dbgs() << "Offending node:\n";
     N->dumprFull(DAG); dbgs() << "\n";

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 28d8e98..8f582f1 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

@@ -58,6 +58,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -645,8 +646,10 @@
     /// specified value into the registers specified by this object.  This uses
     /// Chain/Flag as the input and updates them for the output Chain/Flag.
     /// If the Flag pointer is NULL, no flag is used.
-    void getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl,
-                       SDValue &Chain, SDValue *Flag, const Value *V) const;
+    void
+    getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl, SDValue &Chain,
+                  SDValue *Flag, const Value *V,
+                  ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const;
 
     /// AddInlineAsmOperands - Add this value to the specified inlineasm node
     /// operand list.  This adds the code marker, matching input operand index
@@ -761,9 +764,10 @@
 /// Chain/Flag as the input and updates them for the output Chain/Flag.
 /// If the Flag pointer is NULL, no flag is used.
 void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl,
-                                 SDValue &Chain, SDValue *Flag,
-                                 const Value *V) const {
+                                 SDValue &Chain, SDValue *Flag, const Value *V,
+                                 ISD::NodeType PreferredExtendType) const {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  ISD::NodeType ExtendKind = PreferredExtendType;
 
   // Get the list of the values's legal parts.
   unsigned NumRegs = Regs.size();
@@ -772,8 +776,9 @@
     EVT ValueVT = ValueVTs[Value];
     unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
     MVT RegisterVT = RegVTs[Value];
-    ISD::NodeType ExtendKind =
-      TLI.isZExtFree(Val, RegisterVT)? ISD::ZERO_EXTEND: ISD::ANY_EXTEND;
+
+    if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
+      ExtendKind = ISD::ZERO_EXTEND;
 
     getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value),
                    &Parts[Part], NumParts, RegisterVT, V, ExtendKind);
@@ -860,7 +865,7 @@
   AA = &aa;
   GFI = gfi;
   LibInfo = li;
-  DL = DAG.getTarget().getDataLayout();
+  DL = DAG.getSubtarget().getDataLayout();
   Context = DAG.getContext();
   LPadToCallSiteMap.clear();
 }
@@ -988,15 +993,16 @@
     DebugLoc dl = DDI.getdl();
     unsigned DbgSDNodeOrder = DDI.getSDNodeOrder();
     MDNode *Variable = DI->getVariable();
+    MDNode *Expr = DI->getExpression();
     uint64_t Offset = DI->getOffset();
     // A dbg.value for an alloca is always indirect.
     bool IsIndirect = isa<AllocaInst>(V) || Offset != 0;
     SDDbgValue *SDV;
     if (Val.getNode()) {
-      if (!EmitFuncArgumentDbgValue(V, Variable, Offset, IsIndirect, Val)) {
-        SDV = DAG.getDbgValue(Variable, Val.getNode(),
-                              Val.getResNo(), IsIndirect,
-			      Offset, dl, DbgSDNodeOrder);
+      if (!EmitFuncArgumentDbgValue(V, Variable, Expr, Offset, IsIndirect,
+                                    Val)) {
+        SDV = DAG.getDbgValue(Variable, Expr, Val.getNode(), Val.getResNo(),
+                              IsIndirect, Offset, dl, DbgSDNodeOrder);
         DAG.AddDbgValue(SDV, Val.getNode(), false);
       }
     } else
@@ -1018,8 +1024,8 @@
   DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V);
   if (It != FuncInfo.ValueMap.end()) {
     unsigned InReg = It->second;
-    RegsForValue RFV(*DAG.getContext(), *TM.getTargetLowering(),
-                     InReg, V->getType());
+    RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(), InReg,
+                     V->getType());
     SDValue Chain = DAG.getEntryNode();
     N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
     resolveDanglingDebugInfo(V, N);
@@ -1050,10 +1056,10 @@
 /// getValueImpl - Helper function for getValue and getNonRegisterValue.
 /// Create an SDValue for the given value.
 SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   if (const Constant *C = dyn_cast<Constant>(V)) {
-    EVT VT = TLI->getValueType(V->getType(), true);
+    EVT VT = TLI.getValueType(V->getType(), true);
 
     if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
       return DAG.getConstant(*CI, VT);
@@ -1063,7 +1069,7 @@
 
     if (isa<ConstantPointerNull>(C)) {
       unsigned AS = V->getType()->getPointerAddressSpace();
-      return DAG.getConstant(0, TLI->getPointerTy(AS));
+      return DAG.getConstant(0, TLI.getPointerTy(AS));
     }
 
     if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
@@ -1117,7 +1123,7 @@
              "Unknown struct or array constant!");
 
       SmallVector<EVT, 4> ValueVTs;
-      ComputeValueVTs(*TLI, C->getType(), ValueVTs);
+      ComputeValueVTs(TLI, C->getType(), ValueVTs);
       unsigned NumElts = ValueVTs.size();
       if (NumElts == 0)
         return SDValue(); // empty struct
@@ -1149,7 +1155,7 @@
         Ops.push_back(getValue(CV->getOperand(i)));
     } else {
       assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
-      EVT EltVT = TLI->getValueType(VecTy->getElementType());
+      EVT EltVT = TLI.getValueType(VecTy->getElementType());
 
       SDValue Op;
       if (EltVT.isFloatingPoint())
@@ -1169,13 +1175,13 @@
     DenseMap<const AllocaInst*, int>::iterator SI =
       FuncInfo.StaticAllocaMap.find(AI);
     if (SI != FuncInfo.StaticAllocaMap.end())
-      return DAG.getFrameIndex(SI->second, TLI->getPointerTy());
+      return DAG.getFrameIndex(SI->second, TLI.getPointerTy());
   }
 
   // If this is an instruction which fast-isel has deferred, select it now.
   if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
     unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
-    RegsForValue RFV(*DAG.getContext(), *TLI, InReg, Inst->getType());
+    RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType());
     SDValue Chain = DAG.getEntryNode();
     return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
   }
@@ -1184,7 +1190,7 @@
 }
 
 void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Chain = getControlRoot();
   SmallVector<ISD::OutputArg, 8> Outs;
   SmallVector<SDValue, 8> OutVals;
@@ -1197,7 +1203,7 @@
     // Leave Outs empty so that LowerReturn won't try to load return
     // registers the usual way.
     SmallVector<EVT, 1> PtrValueVTs;
-    ComputeValueVTs(*TLI, PointerType::getUnqual(F->getReturnType()),
+    ComputeValueVTs(TLI, PointerType::getUnqual(F->getReturnType()),
                     PtrValueVTs);
 
     SDValue RetPtr = DAG.getRegister(DemoteReg, PtrValueVTs[0]);
@@ -1205,7 +1211,7 @@
 
     SmallVector<EVT, 4> ValueVTs;
     SmallVector<uint64_t, 4> Offsets;
-    ComputeValueVTs(*TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets);
+    ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets);
     unsigned NumValues = ValueVTs.size();
 
     SmallVector<SDValue, 4> Chains(NumValues);
@@ -1224,7 +1230,7 @@
                         MVT::Other, Chains);
   } else if (I.getNumOperands() != 0) {
     SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(*TLI, I.getOperand(0)->getType(), ValueVTs);
+    ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs);
     unsigned NumValues = ValueVTs.size();
     if (NumValues) {
       SDValue RetOp = getValue(I.getOperand(0));
@@ -1242,10 +1248,10 @@
           ExtendKind = ISD::ZERO_EXTEND;
 
         if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
-          VT = TLI->getTypeForExtArgOrReturn(VT.getSimpleVT(), ExtendKind);
+          VT = TLI.getTypeForExtArgOrReturn(*DAG.getContext(), VT, ExtendKind);
 
-        unsigned NumParts = TLI->getNumRegisters(*DAG.getContext(), VT);
-        MVT PartVT = TLI->getRegisterType(*DAG.getContext(), VT);
+        unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), VT);
+        MVT PartVT = TLI.getRegisterType(*DAG.getContext(), VT);
         SmallVector<SDValue, 4> Parts(NumParts);
         getCopyToParts(DAG, getCurSDLoc(),
                        SDValue(RetOp.getNode(), RetOp.getResNo() + j),
@@ -1275,9 +1281,8 @@
   bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
   CallingConv::ID CallConv =
     DAG.getMachineFunction().getFunction()->getCallingConv();
-  Chain = TM.getTargetLowering()->LowerReturn(Chain, CallConv, isVarArg,
-                                              Outs, OutVals, getCurSDLoc(),
-                                              DAG);
+  Chain = DAG.getTargetLoweringInfo().LowerReturn(
+      Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG);
 
   // Verify that the target's LowerReturn behaved as expected.
   assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
@@ -1601,10 +1606,9 @@
   //     jle foo
   //
   if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
-    if (!TM.getTargetLowering()->isJumpExpensive() &&
-        BOp->hasOneUse() &&
-        (BOp->getOpcode() == Instruction::And ||
-         BOp->getOpcode() == Instruction::Or)) {
+    if (!DAG.getTargetLoweringInfo().isJumpExpensive() &&
+        BOp->hasOneUse() && (BOp->getOpcode() == Instruction::And ||
+                             BOp->getOpcode() == Instruction::Or)) {
       FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
                            BOp->getOpcode(), getEdgeWeight(BrMBB, Succ0MBB),
                            getEdgeWeight(BrMBB, Succ1MBB));
@@ -1724,7 +1728,7 @@
 void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) {
   // Emit the code for the jump table
   assert(JT.Reg != -1U && "Should lower JT Header first!");
-  EVT PTy = TM.getTargetLowering()->getPointerTy();
+  EVT PTy = DAG.getTargetLoweringInfo().getPointerTy();
   SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(),
                                      JT.Reg, PTy);
   SDValue Table = DAG.getJumpTable(JT.JTI, PTy);
@@ -1752,10 +1756,10 @@
   // can be used as an index into the jump table in a subsequent basic block.
   // This value may be smaller or larger than the target's pointer type, and
   // therefore require extension or truncating.
-  const TargetLowering *TLI = TM.getTargetLowering();
-  SwitchOp = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), TLI->getPointerTy());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SwitchOp = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), TLI.getPointerTy());
 
-  unsigned JumpTableReg = FuncInfo.CreateReg(TLI->getPointerTy());
+  unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy());
   SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurSDLoc(),
                                     JumpTableReg, SwitchOp);
   JT.Reg = JumpTableReg;
@@ -1763,12 +1767,10 @@
   // Emit the range check for the jump table, and branch to the default block
   // for the switch statement if the value being switched on exceeds the largest
   // case in the switch.
-  SDValue CMP = DAG.getSetCC(getCurSDLoc(),
-                             TLI->getSetCCResultType(*DAG.getContext(),
-                                                     Sub.getValueType()),
-                             Sub,
-                             DAG.getConstant(JTH.Last - JTH.First,VT),
-                             ISD::SETUGT);
+  SDValue CMP =
+      DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(),
+                                                         Sub.getValueType()),
+                   Sub, DAG.getConstant(JTH.Last - JTH.First, VT), ISD::SETUGT);
 
   // Set NextBlock to be the MBB immediately after the current one, if any.
   // This is used to avoid emitting unnecessary branches to the next block.
@@ -1799,8 +1801,8 @@
                                                   MachineBasicBlock *ParentBB) {
 
   // First create the loads to the guard/stack slot for the comparison.
-  const TargetLowering *TLI = TM.getTargetLowering();
-  EVT PtrTy = TLI->getPointerTy();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT PtrTy = TLI.getPointerTy();
 
   MachineFrameInfo *MFI = ParentBB->getParent()->getFrameInfo();
   int FI = MFI->getStackProtectorIndex();
@@ -1810,10 +1812,22 @@
   SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
 
   unsigned Align =
-    TLI->getDataLayout()->getPrefTypeAlignment(IRGuard->getType());
-  SDValue Guard = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(),
-                              GuardPtr, MachinePointerInfo(IRGuard, 0),
-                              true, false, false, Align);
+    TLI.getDataLayout()->getPrefTypeAlignment(IRGuard->getType());
+
+  SDValue Guard;
+
+  // If GuardReg is set and useLoadStackGuardNode returns true, retrieve the
+  // guard value from the virtual register holding the value. Otherwise, emit a
+  // volatile load to retrieve the stack guard value.
+  unsigned GuardReg = SPD.getGuardReg();
+
+  if (GuardReg && TLI.useLoadStackGuardNode())
+    Guard = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), GuardReg,
+                               PtrTy);
+  else
+    Guard = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(),
+                        GuardPtr, MachinePointerInfo(IRGuard, 0),
+                        true, false, false, Align);
 
   SDValue StackSlot = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(),
                                   StackSlotPtr,
@@ -1824,11 +1838,10 @@
   EVT VT = Guard.getValueType();
   SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, Guard, StackSlot);
 
-  SDValue Cmp = DAG.getSetCC(getCurSDLoc(),
-                             TLI->getSetCCResultType(*DAG.getContext(),
-                                                     Sub.getValueType()),
-                             Sub, DAG.getConstant(0, VT),
-                             ISD::SETNE);
+  SDValue Cmp =
+      DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(),
+                                                         Sub.getValueType()),
+                   Sub, DAG.getConstant(0, VT), ISD::SETNE);
 
   // If the sub is not 0, then we know the guard/stackslot do not equal, so
   // branch to failure MBB.
@@ -1853,10 +1866,10 @@
 /// StackProtectorDescriptor.
 void
 SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
-  const TargetLowering *TLI = TM.getTargetLowering();
-  SDValue Chain = TLI->makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL,
-                                   MVT::isVoid, nullptr, 0, false,
-                                   getCurSDLoc(), false, false).second;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Chain =
+      TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid,
+                      nullptr, 0, false, getCurSDLoc(), false, false).second;
   DAG.setRoot(Chain);
 }
 
@@ -1871,16 +1884,15 @@
                             DAG.getConstant(B.First, VT));
 
   // Check range
-  const TargetLowering *TLI = TM.getTargetLowering();
-  SDValue RangeCmp = DAG.getSetCC(getCurSDLoc(),
-                                  TLI->getSetCCResultType(*DAG.getContext(),
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue RangeCmp =
+      DAG.getSetCC(getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(),
                                                          Sub.getValueType()),
-                                  Sub, DAG.getConstant(B.Range, VT),
-                                  ISD::SETUGT);
+                   Sub, DAG.getConstant(B.Range, VT), ISD::SETUGT);
 
   // Determine the type of the test operands.
   bool UsePtrType = false;
-  if (!TLI->isTypeLegal(VT))
+  if (!TLI.isTypeLegal(VT))
     UsePtrType = true;
   else {
     for (unsigned i = 0, e = B.Cases.size(); i != e; ++i)
@@ -1892,7 +1904,7 @@
       }
   }
   if (UsePtrType) {
-    VT = TLI->getPointerTy();
+    VT = TLI.getPointerTy();
     Sub = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), VT);
   }
 
@@ -1936,22 +1948,18 @@
                                        Reg, VT);
   SDValue Cmp;
   unsigned PopCount = CountPopulation_64(B.Mask);
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (PopCount == 1) {
     // Testing for a single bit; just compare the shift count with what it
     // would need to be to shift a 1 bit in that position.
-    Cmp = DAG.getSetCC(getCurSDLoc(),
-                       TLI->getSetCCResultType(*DAG.getContext(), VT),
-                       ShiftOp,
-                       DAG.getConstant(countTrailingZeros(B.Mask), VT),
-                       ISD::SETEQ);
+    Cmp = DAG.getSetCC(
+        getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp,
+        DAG.getConstant(countTrailingZeros(B.Mask), VT), ISD::SETEQ);
   } else if (PopCount == BB.Range) {
     // There is only one zero bit in the range, test for it directly.
-    Cmp = DAG.getSetCC(getCurSDLoc(),
-                       TLI->getSetCCResultType(*DAG.getContext(), VT),
-                       ShiftOp,
-                       DAG.getConstant(CountTrailingOnes_64(B.Mask), VT),
-                       ISD::SETNE);
+    Cmp = DAG.getSetCC(
+        getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp,
+        DAG.getConstant(CountTrailingOnes_64(B.Mask), VT), ISD::SETNE);
   } else {
     // Make desired shift
     SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurSDLoc(), VT,
@@ -1961,9 +1969,8 @@
     SDValue AndOp = DAG.getNode(ISD::AND, getCurSDLoc(),
                                 VT, SwitchVal, DAG.getConstant(B.Mask, VT));
     Cmp = DAG.getSetCC(getCurSDLoc(),
-                       TLI->getSetCCResultType(*DAG.getContext(), VT),
-                       AndOp, DAG.getConstant(0, VT),
-                       ISD::SETNE);
+                       TLI.getSetCCResultType(*DAG.getContext(), VT), AndOp,
+                       DAG.getConstant(0, VT), ISD::SETNE);
   }
 
   // The branch weight from SwitchBB to B.TargetBB is B.ExtraWeight.
@@ -2001,8 +2008,17 @@
   if (isa<InlineAsm>(Callee))
     visitInlineAsm(&I);
   else if (Fn && Fn->isIntrinsic()) {
-    assert(Fn->getIntrinsicID() == Intrinsic::donothing);
-    // Ignore invokes to @llvm.donothing: jump directly to the next BB.
+    switch (Fn->getIntrinsicID()) {
+    default:
+      llvm_unreachable("Cannot invoke this intrinsic");
+    case Intrinsic::donothing:
+      // Ignore invokes to @llvm.donothing: jump directly to the next BB.
+      break;
+    case Intrinsic::experimental_patchpoint_void:
+    case Intrinsic::experimental_patchpoint_i64:
+      visitPatchpoint(&I, LandingPad);
+      break;
+    }
   } else
     LowerCallTo(&I, getValue(Callee), false, LandingPad);
 
@@ -2034,26 +2050,26 @@
 
   // If there aren't registers to copy the values into (e.g., during SjLj
   // exceptions), then don't bother to create these DAG nodes.
-  const TargetLowering *TLI = TM.getTargetLowering();
-  if (TLI->getExceptionPointerRegister() == 0 &&
-      TLI->getExceptionSelectorRegister() == 0)
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.getExceptionPointerRegister() == 0 &&
+      TLI.getExceptionSelectorRegister() == 0)
     return;
 
   SmallVector<EVT, 2> ValueVTs;
-  ComputeValueVTs(*TLI, LP.getType(), ValueVTs);
+  ComputeValueVTs(TLI, LP.getType(), ValueVTs);
   assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported");
 
   // Get the two live-in registers as SDValues. The physregs have already been
   // copied into virtual registers.
   SDValue Ops[2];
   Ops[0] = DAG.getZExtOrTrunc(
-    DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
-                       FuncInfo.ExceptionPointerVirtReg, TLI->getPointerTy()),
-    getCurSDLoc(), ValueVTs[0]);
+      DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
+                         FuncInfo.ExceptionPointerVirtReg, TLI.getPointerTy()),
+      getCurSDLoc(), ValueVTs[0]);
   Ops[1] = DAG.getZExtOrTrunc(
-    DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
-                       FuncInfo.ExceptionSelectorVirtReg, TLI->getPointerTy()),
-    getCurSDLoc(), ValueVTs[1]);
+      DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
+                         FuncInfo.ExceptionSelectorVirtReg, TLI.getPointerTy()),
+      getCurSDLoc(), ValueVTs[1]);
 
   // Merge into one.
   SDValue Res = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
@@ -2218,9 +2234,8 @@
 }
 
 static inline bool areJTsAllowed(const TargetLowering &TLI) {
-  return TLI.supportJumpTables() &&
-          (TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
-           TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
+  return TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+         TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
 }
 
 static APInt ComputeRange(const APInt &First, const APInt &Last) {
@@ -2245,8 +2260,8 @@
   for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I)
     TSize += I->size();
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-  if (!areJTsAllowed(*TLI) || TSize.ult(TLI->getMinimumJumpTableEntries()))
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!areJTsAllowed(TLI) || TSize.ult(TLI.getMinimumJumpTableEntries()))
     return false;
 
   APInt Range = ComputeRange(First, Last);
@@ -2327,7 +2342,7 @@
   }
 
   // Create a jump table index for this jump table.
-  unsigned JTEncoding = TLI->getJumpTableEncoding();
+  unsigned JTEncoding = TLI.getJumpTableEncoding();
   unsigned JTI = CurMF->getOrCreateJumpTableInfo(JTEncoding)
                        ->createJumpTableIndex(DestBBs);
 
@@ -2347,7 +2362,6 @@
 bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
                                                   CaseRecVector& WorkList,
                                                   const Value* SV,
-                                                  MachineBasicBlock* Default,
                                                   MachineBasicBlock* SwitchBB) {
   // Get the MachineFunction which holds the current MBB.  This is used when
   // inserting any additional MBBs necessary to represent the switch.
@@ -2413,8 +2427,8 @@
     RSize -= J->size();
   }
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-  if (areJTsAllowed(*TLI)) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (areJTsAllowed(TLI)) {
     // If our case is dense we *really* should handle it earlier!
     assert((FMetric > 0) && "Should handle dense range earlier!");
   } else {
@@ -2484,8 +2498,8 @@
                                                    const Value* SV,
                                                    MachineBasicBlock* Default,
                                                    MachineBasicBlock* SwitchBB) {
-  const TargetLowering *TLI = TM.getTargetLowering();
-  EVT PTy = TLI->getPointerTy();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT PTy = TLI.getPointerTy();
   unsigned IntPtrBits = PTy.getSizeInBits();
 
   Case& FrontCase = *CR.Range.first;
@@ -2496,7 +2510,7 @@
   MachineFunction *CurMF = FuncInfo.MF;
 
   // If target does not have legal shift left, do not emit bit tests at all.
-  if (!TLI->isOperationLegal(ISD::SHL, PTy))
+  if (!TLI.isOperationLegal(ISD::SHL, PTy))
     return false;
 
   size_t numCmps = 0;
@@ -2601,21 +2615,19 @@
 
   BitTestBlock BTB(lowBound, cmpRange, SV,
                    -1U, MVT::Other, (CR.CaseBB == SwitchBB),
-                   CR.CaseBB, Default, BTC);
+                   CR.CaseBB, Default, std::move(BTC));
 
   if (CR.CaseBB == SwitchBB)
     visitBitTestHeader(BTB, SwitchBB);
 
-  BitTestCases.push_back(BTB);
+  BitTestCases.push_back(std::move(BTB));
 
   return true;
 }
 
 /// Clusterify - Transform simple list of Cases into list of CaseRange's
-size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
-                                       const SwitchInst& SI) {
-  size_t numCmps = 0;
-
+void SelectionDAGBuilder::Clusterify(CaseVector& Cases,
+                                     const SwitchInst& SI) {
   BranchProbabilityInfo *BPI = FuncInfo.BPI;
   // Start with "simple" cases
   for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
@@ -2653,13 +2665,15 @@
       }
     }
 
-  for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
-    if (I->Low != I->High)
-      // A range counts double, since it requires two compares.
-      ++numCmps;
-  }
+  DEBUG({
+      size_t numCmps = 0;
+      for (auto &I : Cases)
+        // A range counts double, since it requires two compares.
+        numCmps += I.Low != I.High ? 2 : 1;
 
-  return numCmps;
+      dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+             << ". Total compares: " << numCmps << '\n';
+    });
 }
 
 void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First,
@@ -2701,10 +2715,7 @@
   // representing each one, and sort the vector so that we can efficiently
   // create a binary search tree from them.
   CaseVector Cases;
-  size_t numCmps = Clusterify(Cases, SI);
-  DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
-               << ". Total compares: " << numCmps << '\n');
-  (void)numCmps;
+  Clusterify(Cases, SI);
 
   // Get the Value to be switched on and default basic blocks, which will be
   // inserted into CaseBlock records, representing basic blocks in the binary
@@ -2738,7 +2749,7 @@
 
     // Emit binary tree. We need to pick a pivot, and push left and right ranges
     // onto the worklist. Leafs are handled via handleSmallSwitchRange() call.
-    handleBTSplitSwitchCase(CR, WorkList, SV, Default, SwitchMBB);
+    handleBTSplitSwitchCase(CR, WorkList, SV, SwitchMBB);
   }
 }
 
@@ -2749,7 +2760,7 @@
   SmallSet<BasicBlock*, 32> Done;
   for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) {
     BasicBlock *BB = I.getSuccessor(i);
-    bool Inserted = Done.insert(BB);
+    bool Inserted = Done.insert(BB).second;
     if (!Inserted)
         continue;
 
@@ -2806,7 +2817,8 @@
   SDValue Op1 = getValue(I.getOperand(0));
   SDValue Op2 = getValue(I.getOperand(1));
 
-  EVT ShiftTy = TM.getTargetLowering()->getShiftAmountTy(Op2.getValueType());
+  EVT ShiftTy =
+      DAG.getTargetLoweringInfo().getShiftAmountTy(Op2.getValueType());
 
   // Coerce the shift amount to the right type if we can.
   if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
@@ -2861,8 +2873,8 @@
   if (isa<BinaryOperator>(&I) && cast<BinaryOperator>(&I)->isExact() &&
       !isa<ConstantSDNode>(Op1) &&
       isa<ConstantSDNode>(Op2) && !cast<ConstantSDNode>(Op2)->isNullValue())
-    setValue(&I, TM.getTargetLowering()->BuildExactSDIV(Op1, Op2,
-                                                        getCurSDLoc(), DAG));
+    setValue(&I, DAG.getTargetLoweringInfo()
+                     .BuildExactSDIV(Op1, Op2, getCurSDLoc(), DAG));
   else
     setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(),
                              Op1, Op2));
@@ -2878,7 +2890,7 @@
   SDValue Op2 = getValue(I.getOperand(1));
   ISD::CondCode Opcode = getICmpCondCode(predicate);
 
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode));
 }
 
@@ -2893,13 +2905,13 @@
   ISD::CondCode Condition = getFCmpCondCode(predicate);
   if (TM.Options.NoNaNsFPMath)
     Condition = getFCmpCodeWithoutNaN(Condition);
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
 }
 
 void SelectionDAGBuilder::visitSelect(const User &I) {
   SmallVector<EVT, 4> ValueVTs;
-  ComputeValueVTs(*TM.getTargetLowering(), I.getType(), ValueVTs);
+  ComputeValueVTs(DAG.getTargetLoweringInfo(), I.getType(), ValueVTs);
   unsigned NumValues = ValueVTs.size();
   if (NumValues == 0) return;
 
@@ -2926,7 +2938,7 @@
 void SelectionDAGBuilder::visitTrunc(const User &I) {
   // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N));
 }
 
@@ -2934,7 +2946,7 @@
   // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
   // ZExt also can't be a cast to bool for same reason. So, nothing much to do
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N));
 }
 
@@ -2942,52 +2954,51 @@
   // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
   // SExt also can't be a cast to bool for same reason. So, nothing much to do
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitFPTrunc(const User &I) {
   // FPTrunc is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  const TargetLowering *TLI = TM.getTargetLowering();
-  EVT DestVT = TLI->getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurSDLoc(),
-                           DestVT, N,
-                           DAG.getTargetConstant(0, TLI->getPointerTy())));
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurSDLoc(), DestVT, N,
+                           DAG.getTargetConstant(0, TLI.getPointerTy())));
 }
 
 void SelectionDAGBuilder::visitFPExt(const User &I) {
   // FPExt is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitFPToUI(const User &I) {
   // FPToUI is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitFPToSI(const User &I) {
   // FPToSI is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitUIToFP(const User &I) {
   // UIToFP is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitSIToFP(const User &I) {
   // SIToFP is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
 }
 
@@ -2995,7 +3006,7 @@
   // What to do depends on the size of the integer and the size of the pointer.
   // We can either truncate, zero extend, or no-op, accordingly.
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
 }
 
@@ -3003,13 +3014,13 @@
   // What to do depends on the size of the integer and the size of the pointer.
   // We can either truncate, zero extend, or no-op, accordingly.
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
   setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
 }
 
 void SelectionDAGBuilder::visitBitCast(const User &I) {
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(I.getType());
 
   // BitCast assures us that source and destination are the same size so this is
   // either a BITCAST or a no-op.
@@ -3031,7 +3042,7 @@
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   const Value *SV = I.getOperand(0);
   SDValue N = getValue(SV);
-  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  EVT DestVT = TLI.getValueType(I.getType());
 
   unsigned SrcAS = SV->getType()->getPointerAddressSpace();
   unsigned DestAS = I.getType()->getPointerAddressSpace();
@@ -3049,8 +3060,7 @@
   SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)),
                                      getCurSDLoc(), TLI.getVectorIdxTy());
   setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(),
-                           TM.getTargetLowering()->getValueType(I.getType()),
-                           InVec, InVal, InIdx));
+                           TLI.getValueType(I.getType()), InVec, InVal, InIdx));
 }
 
 void SelectionDAGBuilder::visitExtractElement(const User &I) {
@@ -3059,8 +3069,7 @@
   SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)),
                                      getCurSDLoc(), TLI.getVectorIdxTy());
   setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(),
-                           TM.getTargetLowering()->getValueType(I.getType()),
-                           InVec, InIdx));
+                           TLI.getValueType(I.getType()), InVec, InIdx));
 }
 
 // Utility for visitShuffleVector - Return true if every element in Mask,
@@ -3082,8 +3091,8 @@
   ShuffleVectorInst::getShuffleMask(cast<Constant>(I.getOperand(2)), Mask);
   unsigned MaskNumElts = Mask.size();
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-  EVT VT = TLI->getValueType(I.getType());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = TLI.getValueType(I.getType());
   EVT SrcVT = Src1.getValueType();
   unsigned SrcNumElts = SrcVT.getVectorNumElements();
 
@@ -3202,9 +3211,9 @@
         if (RangeUse[Input] == 0)
           Src = DAG.getUNDEF(VT);
         else
-          Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, getCurSDLoc(), VT,
-                            Src, DAG.getConstant(StartIdx[Input],
-                                                 TLI->getVectorIdxTy()));
+          Src = DAG.getNode(
+              ISD::EXTRACT_SUBVECTOR, getCurSDLoc(), VT, Src,
+              DAG.getConstant(StartIdx[Input], TLI.getVectorIdxTy()));
       }
 
       // Calculate new mask.
@@ -3230,7 +3239,7 @@
   // replacing the shuffle with extract and build vector.
   // to insert and build vector.
   EVT EltVT = VT.getVectorElementType();
-  EVT IdxVT = TLI->getVectorIdxTy();
+  EVT IdxVT = TLI.getVectorIdxTy();
   SmallVector<SDValue,8> Ops;
   for (unsigned i = 0; i != MaskNumElts; ++i) {
     int Idx = Mask[i];
@@ -3262,16 +3271,22 @@
 
   unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
 
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SmallVector<EVT, 4> AggValueVTs;
-  ComputeValueVTs(*TLI, AggTy, AggValueVTs);
+  ComputeValueVTs(TLI, AggTy, AggValueVTs);
   SmallVector<EVT, 4> ValValueVTs;
-  ComputeValueVTs(*TLI, ValTy, ValValueVTs);
+  ComputeValueVTs(TLI, ValTy, ValValueVTs);
 
   unsigned NumAggValues = AggValueVTs.size();
   unsigned NumValValues = ValValueVTs.size();
   SmallVector<SDValue, 4> Values(NumAggValues);
 
+  // Ignore an insertvalue that produces an empty object
+  if (!NumAggValues) {
+    setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
+    return;
+  }
+
   SDValue Agg = getValue(Op0);
   unsigned i = 0;
   // Copy the beginning value(s) from the original aggregate.
@@ -3302,9 +3317,9 @@
 
   unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
 
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SmallVector<EVT, 4> ValValueVTs;
-  ComputeValueVTs(*TLI, ValTy, ValValueVTs);
+  ComputeValueVTs(TLI, ValTy, ValValueVTs);
 
   unsigned NumValValues = ValValueVTs.size();
 
@@ -3353,13 +3368,13 @@
       Ty = cast<SequentialType>(Ty)->getElementType();
 
       // If this is a constant subscript, handle it quickly.
-      const TargetLowering *TLI = TM.getTargetLowering();
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
       if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
         if (CI->isZero()) continue;
         uint64_t Offs =
             DL->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
         SDValue OffsVal;
-        EVT PTy = TLI->getPointerTy(AS);
+        EVT PTy = TLI.getPointerTy(AS);
         unsigned PtrBits = PTy.getSizeInBits();
         if (PtrBits < 64)
           OffsVal = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), PTy,
@@ -3373,8 +3388,8 @@
       }
 
       // N = N + Idx * ElementSize;
-      APInt ElementSize = APInt(TLI->getPointerSizeInBits(AS),
-                                DL->getTypeAllocSize(Ty));
+      APInt ElementSize =
+          APInt(TLI.getPointerSizeInBits(AS), DL->getTypeAllocSize(Ty));
       SDValue IdxN = getValue(Idx);
 
       // If the index is smaller or larger than intptr_t, truncate or extend
@@ -3411,15 +3426,15 @@
     return;   // getValue will auto-populate this.
 
   Type *Ty = I.getAllocatedType();
-  const TargetLowering *TLI = TM.getTargetLowering();
-  uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
   unsigned Align =
-    std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty),
-             I.getAlignment());
+      std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty),
+               I.getAlignment());
 
   SDValue AllocSize = getValue(I.getArraySize());
 
-  EVT IntPtr = TLI->getPointerTy();
+  EVT IntPtr = TLI.getPointerTy();
   if (AllocSize.getValueType() != IntPtr)
     AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurSDLoc(), IntPtr);
 
@@ -3430,7 +3445,8 @@
   // Handle alignment.  If the requested alignment is less than or equal to
   // the stack alignment, ignore it.  If the size is greater than or equal to
   // the stack alignment, we note this in the DYNAMIC_STACKALLOC node.
-  unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+  unsigned StackAlign =
+      DAG.getSubtarget().getFrameLowering()->getStackAlignment();
   if (Align <= StackAlign)
     Align = 0;
 
@@ -3464,15 +3480,18 @@
   Type *Ty = I.getType();
 
   bool isVolatile = I.isVolatile();
-  bool isNonTemporal = I.getMetadata("nontemporal") != nullptr;
-  bool isInvariant = I.getMetadata("invariant.load") != nullptr;
+  bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr;
+  bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr;
   unsigned Alignment = I.getAlignment();
-  const MDNode *TBAAInfo = I.getMetadata(LLVMContext::MD_tbaa);
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
   const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
 
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SmallVector<EVT, 4> ValueVTs;
   SmallVector<uint64_t, 4> Offsets;
-  ComputeValueVTs(*TM.getTargetLowering(), Ty, ValueVTs, &Offsets);
+  ComputeValueVTs(TLI, Ty, ValueVTs, &Offsets);
   unsigned NumValues = ValueVTs.size();
   if (NumValues == 0)
     return;
@@ -3483,7 +3502,7 @@
     // Serialize volatile loads with other side effects.
     Root = getRoot();
   else if (AA->pointsToConstantMemory(
-             AliasAnalysis::Location(SV, AA->getTypeStoreSize(Ty), TBAAInfo))) {
+             AliasAnalysis::Location(SV, AA->getTypeStoreSize(Ty), AAInfo))) {
     // Do not serialize (non-volatile) loads of constant memory with anything.
     Root = DAG.getEntryNode();
     ConstantMemory = true;
@@ -3492,9 +3511,8 @@
     Root = DAG.getRoot();
   }
 
-  const TargetLowering *TLI = TM.getTargetLowering();
   if (isVolatile)
-    Root = TLI->prepareVolatileOrAtomicLoad(Root, getCurSDLoc(), DAG);
+    Root = TLI.prepareVolatileOrAtomicLoad(Root, getCurSDLoc(), DAG);
 
   SmallVector<SDValue, 4> Values(NumValues);
   SmallVector<SDValue, 4> Chains(std::min(unsigned(MaxParallelChains),
@@ -3520,7 +3538,7 @@
                             DAG.getConstant(Offsets[i], PtrVT));
     SDValue L = DAG.getLoad(ValueVTs[i], getCurSDLoc(), Root,
                             A, MachinePointerInfo(SV, Offsets[i]), isVolatile,
-                            isNonTemporal, isInvariant, Alignment, TBAAInfo,
+                            isNonTemporal, isInvariant, Alignment, AAInfo,
                             Ranges);
 
     Values[i] = L;
@@ -3549,7 +3567,8 @@
 
   SmallVector<EVT, 4> ValueVTs;
   SmallVector<uint64_t, 4> Offsets;
-  ComputeValueVTs(*TM.getTargetLowering(), SrcV->getType(), ValueVTs, &Offsets);
+  ComputeValueVTs(DAG.getTargetLoweringInfo(), SrcV->getType(),
+                  ValueVTs, &Offsets);
   unsigned NumValues = ValueVTs.size();
   if (NumValues == 0)
     return;
@@ -3565,9 +3584,11 @@
                                           NumValues));
   EVT PtrVT = Ptr.getValueType();
   bool isVolatile = I.isVolatile();
-  bool isNonTemporal = I.getMetadata("nontemporal") != nullptr;
+  bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr;
   unsigned Alignment = I.getAlignment();
-  const MDNode *TBAAInfo = I.getMetadata(LLVMContext::MD_tbaa);
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
 
   unsigned ChainI = 0;
   for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
@@ -3583,7 +3604,7 @@
     SDValue St = DAG.getStore(Root, getCurSDLoc(),
                               SDValue(Src.getNode(), Src.getResNo() + i),
                               Add, MachinePointerInfo(PtrV, Offsets[i]),
-                              isVolatile, isNonTemporal, Alignment, TBAAInfo);
+                              isVolatile, isNonTemporal, Alignment, AAInfo);
     Chains[ChainI] = St;
   }
 
@@ -3592,30 +3613,6 @@
   DAG.setRoot(StoreNode);
 }
 
-static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order,
-                                    SynchronizationScope Scope,
-                                    bool Before, SDLoc dl,
-                                    SelectionDAG &DAG,
-                                    const TargetLowering &TLI) {
-  // Fence, if necessary
-  if (Before) {
-    if (Order == AcquireRelease || Order == SequentiallyConsistent)
-      Order = Release;
-    else if (Order == Acquire || Order == Monotonic || Order == Unordered)
-      return Chain;
-  } else {
-    if (Order == AcquireRelease)
-      Order = Acquire;
-    else if (Order == Release || Order == Monotonic || Order == Unordered)
-      return Chain;
-  }
-  SDValue Ops[3];
-  Ops[0] = Chain;
-  Ops[1] = DAG.getConstant(Order, TLI.getPointerTy());
-  Ops[2] = DAG.getConstant(Scope, TLI.getPointerTy());
-  return DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops);
-}
-
 void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
   SDLoc dl = getCurSDLoc();
   AtomicOrdering SuccessOrder = I.getSuccessOrdering();
@@ -3624,27 +3621,16 @@
 
   SDValue InChain = getRoot();
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-  if (TLI->getInsertFencesForAtomic())
-    InChain = InsertFenceForAtomic(InChain, SuccessOrder, Scope, true, dl,
-                                   DAG, *TLI);
-
   MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType();
   SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);
   SDValue L = DAG.getAtomicCmpSwap(
       ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain,
       getValue(I.getPointerOperand()), getValue(I.getCompareOperand()),
       getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()),
-      0 /* Alignment */,
-      TLI->getInsertFencesForAtomic() ? Monotonic : SuccessOrder,
-      TLI->getInsertFencesForAtomic() ? Monotonic : FailureOrder, Scope);
+      /*Alignment=*/ 0, SuccessOrder, FailureOrder, Scope);
 
   SDValue OutChain = L.getValue(2);
 
-  if (TLI->getInsertFencesForAtomic())
-    OutChain = InsertFenceForAtomic(OutChain, SuccessOrder, Scope, false, dl,
-                                    DAG, *TLI);
-
   setValue(&I, L);
   DAG.setRoot(OutChain);
 }
@@ -3671,38 +3657,28 @@
 
   SDValue InChain = getRoot();
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-  if (TLI->getInsertFencesForAtomic())
-    InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
-                                   DAG, *TLI);
-
   SDValue L =
     DAG.getAtomic(NT, dl,
                   getValue(I.getValOperand()).getSimpleValueType(),
                   InChain,
                   getValue(I.getPointerOperand()),
                   getValue(I.getValOperand()),
-                  I.getPointerOperand(), 0 /* Alignment */,
-                  TLI->getInsertFencesForAtomic() ? Monotonic : Order,
-                  Scope);
+                  I.getPointerOperand(),
+                  /* Alignment=*/ 0, Order, Scope);
 
   SDValue OutChain = L.getValue(1);
 
-  if (TLI->getInsertFencesForAtomic())
-    OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
-                                    DAG, *TLI);
-
   setValue(&I, L);
   DAG.setRoot(OutChain);
 }
 
 void SelectionDAGBuilder::visitFence(const FenceInst &I) {
   SDLoc dl = getCurSDLoc();
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Ops[3];
   Ops[0] = getRoot();
-  Ops[1] = DAG.getConstant(I.getOrdering(), TLI->getPointerTy());
-  Ops[2] = DAG.getConstant(I.getSynchScope(), TLI->getPointerTy());
+  Ops[1] = DAG.getConstant(I.getOrdering(), TLI.getPointerTy());
+  Ops[2] = DAG.getConstant(I.getSynchScope(), TLI.getPointerTy());
   DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops));
 }
 
@@ -3713,8 +3689,8 @@
 
   SDValue InChain = getRoot();
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-  EVT VT = TLI->getValueType(I.getType());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = TLI.getValueType(I.getType());
 
   if (I.getAlignment() < VT.getSizeInBits() / 8)
     report_fatal_error("Cannot generate unaligned atomic load");
@@ -3728,19 +3704,14 @@
                            I.getAlignment() ? I.getAlignment() :
                                               DAG.getEVTAlignment(VT));
 
-  InChain = TLI->prepareVolatileOrAtomicLoad(InChain, dl, DAG);
+  InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
   SDValue L =
       DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain,
                     getValue(I.getPointerOperand()), MMO,
-                    TLI->getInsertFencesForAtomic() ? Monotonic : Order,
-                    Scope);
+                    Order, Scope);
 
   SDValue OutChain = L.getValue(1);
 
-  if (TLI->getInsertFencesForAtomic())
-    OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
-                                    DAG, *TLI);
-
   setValue(&I, L);
   DAG.setRoot(OutChain);
 }
@@ -3753,28 +3724,19 @@
 
   SDValue InChain = getRoot();
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-  EVT VT = TLI->getValueType(I.getValueOperand()->getType());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = TLI.getValueType(I.getValueOperand()->getType());
 
   if (I.getAlignment() < VT.getSizeInBits() / 8)
     report_fatal_error("Cannot generate unaligned atomic store");
 
-  if (TLI->getInsertFencesForAtomic())
-    InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
-                                   DAG, *TLI);
-
   SDValue OutChain =
     DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT,
                   InChain,
                   getValue(I.getPointerOperand()),
                   getValue(I.getValueOperand()),
                   I.getPointerOperand(), I.getAlignment(),
-                  TLI->getInsertFencesForAtomic() ? Monotonic : Order,
-                  Scope);
-
-  if (TLI->getInsertFencesForAtomic())
-    OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
-                                    DAG, *TLI);
+                  Order, Scope);
 
   DAG.setRoot(OutChain);
 }
@@ -3799,13 +3761,13 @@
 
   // Info is set by getTgtMemInstrinsic
   TargetLowering::IntrinsicInfo Info;
-  const TargetLowering *TLI = TM.getTargetLowering();
-  bool IsTgtIntrinsic = TLI->getTgtMemIntrinsic(Info, I, Intrinsic);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, Intrinsic);
 
   // Add the intrinsic ID as an integer operand if it's not a target intrinsic.
   if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
       Info.opc == ISD::INTRINSIC_W_CHAIN)
-    Ops.push_back(DAG.getTargetConstant(Intrinsic, TLI->getPointerTy()));
+    Ops.push_back(DAG.getTargetConstant(Intrinsic, TLI.getPointerTy()));
 
   // Add all operands of the call to the operand list.
   for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
@@ -3814,7 +3776,7 @@
   }
 
   SmallVector<EVT, 4> ValueVTs;
-  ComputeValueVTs(*TLI, I.getType(), ValueVTs);
+  ComputeValueVTs(TLI, I.getType(), ValueVTs);
 
   if (HasChain)
     ValueVTs.push_back(MVT::Other);
@@ -3829,7 +3791,7 @@
                                      VTs, Ops, Info.memVT,
                                    MachinePointerInfo(Info.ptrVal, Info.offset),
                                      Info.align, Info.vol,
-                                     Info.readMem, Info.writeMem);
+                                     Info.readMem, Info.writeMem, Info.size);
   } else if (!HasChain) {
     Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
   } else if (!I.getType()->isVoidTy()) {
@@ -3848,7 +3810,7 @@
 
   if (!I.getType()->isVoidTy()) {
     if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
-      EVT VT = TLI->getValueType(PTy);
+      EVT VT = TLI.getValueType(PTy);
       Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
     }
 
@@ -4555,16 +4517,17 @@
 /// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function
 /// argument, create the corresponding DBG_VALUE machine instruction for it now.
 /// At the end of instruction selection, they will be inserted to the entry BB.
-bool
-SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
-                                              int64_t Offset, bool IsIndirect,
-                                              const SDValue &N) {
+bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V,
+                                                   MDNode *Variable,
+                                                   MDNode *Expr, int64_t Offset,
+                                                   bool IsIndirect,
+                                                   const SDValue &N) {
   const Argument *Arg = dyn_cast<Argument>(V);
   if (!Arg)
     return false;
 
   MachineFunction &MF = DAG.getMachineFunction();
-  const TargetInstrInfo *TII = DAG.getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
 
   // Ignore inlined function arguments here.
   DIVariable DV(Variable);
@@ -4610,14 +4573,16 @@
     return false;
 
   if (Op->isReg())
-    FuncInfo.ArgDbgValues.push_back(BuildMI(MF, getCurDebugLoc(),
-                                            TII->get(TargetOpcode::DBG_VALUE),
-                                            IsIndirect,
-                                            Op->getReg(), Offset, Variable));
+    FuncInfo.ArgDbgValues.push_back(
+        BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE),
+                IsIndirect, Op->getReg(), Offset, Variable, Expr));
   else
     FuncInfo.ArgDbgValues.push_back(
-      BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE))
-          .addOperand(*Op).addImm(Offset).addMetadata(Variable));
+        BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE))
+            .addOperand(*Op)
+            .addImm(Offset)
+            .addMetadata(Variable)
+            .addMetadata(Expr));
 
   return true;
 }
@@ -4635,7 +4600,7 @@
 /// otherwise lower it and return null.
 const char *
 SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc sdl = getCurSDLoc();
   DebugLoc dl = getCurDebugLoc();
   SDValue Res;
@@ -4649,17 +4614,17 @@
   case Intrinsic::vaend:    visitVAEnd(I); return nullptr;
   case Intrinsic::vacopy:   visitVACopy(I); return nullptr;
   case Intrinsic::returnaddress:
-    setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl, TLI->getPointerTy(),
+    setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl, TLI.getPointerTy(),
                              getValue(I.getArgOperand(0))));
     return nullptr;
   case Intrinsic::frameaddress:
-    setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, TLI->getPointerTy(),
+    setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, TLI.getPointerTy(),
                              getValue(I.getArgOperand(0))));
     return nullptr;
   case Intrinsic::read_register: {
     Value *Reg = I.getArgOperand(0);
     SDValue RegName = DAG.getMDNode(cast<MDNode>(Reg));
-    EVT VT = TM.getTargetLowering()->getValueType(I.getType());
+    EVT VT = TLI.getValueType(I.getType());
     setValue(&I, DAG.getNode(ISD::READ_REGISTER, sdl, VT, RegName));
     return nullptr;
   }
@@ -4673,9 +4638,9 @@
     return nullptr;
   }
   case Intrinsic::setjmp:
-    return &"_setjmp"[!TLI->usesUnderscoreSetJmp()];
+    return &"_setjmp"[!TLI.usesUnderscoreSetJmp()];
   case Intrinsic::longjmp:
-    return &"_longjmp"[!TLI->usesUnderscoreLongJmp()];
+    return &"_longjmp"[!TLI.usesUnderscoreLongJmp()];
   case Intrinsic::memcpy: {
     // Assert for address < 256 since we support only user defined address
     // spaces.
@@ -4736,6 +4701,7 @@
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
     MDNode *Variable = DI.getVariable();
+    MDNode *Expression = DI.getExpression();
     const Value *Address = DI.getAddress();
     DIVariable DIVar(Variable);
     assert((!DIVar || DIVar.isVariable()) &&
@@ -4771,16 +4737,16 @@
         FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
         if (FINode)
           // Byval parameter.  We have a frame index at this point.
-          SDV = DAG.getFrameIndexDbgValue(Variable, FINode->getIndex(),
-					  0, dl, SDNodeOrder);
+          SDV = DAG.getFrameIndexDbgValue(
+              Variable, Expression, FINode->getIndex(), 0, dl, SDNodeOrder);
         else {
           // Address is an argument, so try to emit its dbg value using
           // virtual register info from the FuncInfo.ValueMap.
-          EmitFuncArgumentDbgValue(Address, Variable, 0, false, N);
+          EmitFuncArgumentDbgValue(Address, Variable, Expression, 0, false, N);
           return nullptr;
         }
       } else if (AI)
-        SDV = DAG.getDbgValue(Variable, N.getNode(), N.getResNo(),
+        SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
                               true, 0, dl, SDNodeOrder);
       else {
         // Can't do anything with other non-AI cases yet.
@@ -4793,7 +4759,8 @@
     } else {
       // If Address is an argument then try to emit its dbg value using
       // virtual register info from the FuncInfo.ValueMap.
-      if (!EmitFuncArgumentDbgValue(Address, Variable, 0, false, N)) {
+      if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, 0, false,
+                                    N)) {
         // If variable is pinned by a alloca in dominating bb then
         // use StaticAllocaMap.
         if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
@@ -4801,7 +4768,7 @@
             DenseMap<const AllocaInst*, int>::iterator SI =
               FuncInfo.StaticAllocaMap.find(AI);
             if (SI != FuncInfo.StaticAllocaMap.end()) {
-              SDV = DAG.getFrameIndexDbgValue(Variable, SI->second,
+              SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second,
                                               0, dl, SDNodeOrder);
               DAG.AddDbgValue(SDV, nullptr, false);
               return nullptr;
@@ -4822,6 +4789,7 @@
       return nullptr;
 
     MDNode *Variable = DI.getVariable();
+    MDNode *Expression = DI.getExpression();
     uint64_t Offset = DI.getOffset();
     const Value *V = DI.getValue();
     if (!V)
@@ -4829,7 +4797,8 @@
 
     SDDbgValue *SDV;
     if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V)) {
-      SDV = DAG.getConstantDbgValue(Variable, V, Offset, dl, SDNodeOrder);
+      SDV = DAG.getConstantDbgValue(Variable, Expression, V, Offset, dl,
+                                    SDNodeOrder);
       DAG.AddDbgValue(SDV, nullptr, false);
     } else {
       // Do not use getValue() in here; we don't want to generate code at
@@ -4841,10 +4810,10 @@
       if (N.getNode()) {
         // A dbg.value for an alloca is always indirect.
         bool IsIndirect = isa<AllocaInst>(V) || Offset != 0;
-        if (!EmitFuncArgumentDbgValue(V, Variable, Offset, IsIndirect, N)) {
-          SDV = DAG.getDbgValue(Variable, N.getNode(),
-                                N.getResNo(), IsIndirect,
-				Offset, dl, SDNodeOrder);
+        if (!EmitFuncArgumentDbgValue(V, Variable, Expression, Offset,
+                                      IsIndirect, N)) {
+          SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
+                                IsIndirect, Offset, dl, SDNodeOrder);
           DAG.AddDbgValue(SDV, N.getNode(), false);
         }
       } else if (!V->use_empty() ) {
@@ -4878,7 +4847,7 @@
 
   case Intrinsic::eh_typeid_for: {
     // Find the type id for the given typeinfo.
-    GlobalVariable *GV = ExtractTypeInfo(I.getArgOperand(0));
+    GlobalValue *GV = ExtractTypeInfo(I.getArgOperand(0));
     unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(GV);
     Res = DAG.getConstant(TypeID, MVT::i32);
     setValue(&I, Res);
@@ -4899,15 +4868,14 @@
     return nullptr;
   case Intrinsic::eh_dwarf_cfa: {
     SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), sdl,
-                                        TLI->getPointerTy());
+                                        TLI.getPointerTy());
     SDValue Offset = DAG.getNode(ISD::ADD, sdl,
                                  CfaArg.getValueType(),
                                  DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, sdl,
                                              CfaArg.getValueType()),
                                  CfaArg);
-    SDValue FA = DAG.getNode(ISD::FRAMEADDR, sdl,
-                             TLI->getPointerTy(),
-                             DAG.getConstant(0, TLI->getPointerTy()));
+    SDValue FA = DAG.getNode(ISD::FRAMEADDR, sdl, TLI.getPointerTy(),
+                             DAG.getConstant(0, TLI.getPointerTy()));
     setValue(&I, DAG.getNode(ISD::ADD, sdl, FA.getValueType(),
                              FA, Offset));
     return nullptr;
@@ -4997,7 +4965,7 @@
     ShOps[0] = ShAmt;
     ShOps[1] = DAG.getConstant(0, MVT::i32);
     ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps);
-    EVT DestVT = TLI->getValueType(I.getType());
+    EVT DestVT = TLI.getValueType(I.getType());
     ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
     Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
                        DAG.getConstant(NewIntrinsic, MVT::i32),
@@ -5009,14 +4977,14 @@
   case Intrinsic::x86_avx_vinsertf128_ps_256:
   case Intrinsic::x86_avx_vinsertf128_si_256:
   case Intrinsic::x86_avx2_vinserti128: {
-    EVT DestVT = TLI->getValueType(I.getType());
-    EVT ElVT = TLI->getValueType(I.getArgOperand(1)->getType());
+    EVT DestVT = TLI.getValueType(I.getType());
+    EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType());
     uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue() & 1) *
                    ElVT.getVectorNumElements();
-    Res = DAG.getNode(ISD::INSERT_SUBVECTOR, sdl, DestVT,
-                      getValue(I.getArgOperand(0)),
-                      getValue(I.getArgOperand(1)),
-                      DAG.getConstant(Idx, TLI->getVectorIdxTy()));
+    Res =
+        DAG.getNode(ISD::INSERT_SUBVECTOR, sdl, DestVT,
+                    getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)),
+                    DAG.getConstant(Idx, TLI.getVectorIdxTy()));
     setValue(&I, Res);
     return nullptr;
   }
@@ -5024,12 +4992,12 @@
   case Intrinsic::x86_avx_vextractf128_ps_256:
   case Intrinsic::x86_avx_vextractf128_si_256:
   case Intrinsic::x86_avx2_vextracti128: {
-    EVT DestVT = TLI->getValueType(I.getType());
+    EVT DestVT = TLI.getValueType(I.getType());
     uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(1))->getZExtValue() & 1) *
                    DestVT.getVectorNumElements();
     Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, DestVT,
                       getValue(I.getArgOperand(0)),
-                      DAG.getConstant(Idx, TLI->getVectorIdxTy()));
+                      DAG.getConstant(Idx, TLI.getVectorIdxTy()));
     setValue(&I, Res);
     return nullptr;
   }
@@ -5055,7 +5023,7 @@
     case Intrinsic::convertus:  Code = ISD::CVT_US; break;
     case Intrinsic::convertuu:  Code = ISD::CVT_UU; break;
     }
-    EVT DestVT = TLI->getValueType(I.getType());
+    EVT DestVT = TLI.getValueType(I.getType());
     const Value *Op1 = I.getArgOperand(0);
     Res = DAG.getConvertRndSat(DestVT, sdl, getValue(Op1),
                                DAG.getValueType(DestVT),
@@ -5071,23 +5039,23 @@
                             getValue(I.getArgOperand(1)), DAG));
     return nullptr;
   case Intrinsic::log:
-    setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
+    setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
     return nullptr;
   case Intrinsic::log2:
-    setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
+    setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
     return nullptr;
   case Intrinsic::log10:
-    setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
+    setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
     return nullptr;
   case Intrinsic::exp:
-    setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
+    setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
     return nullptr;
   case Intrinsic::exp2:
-    setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
+    setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
     return nullptr;
   case Intrinsic::pow:
     setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)),
-                           getValue(I.getArgOperand(1)), DAG, *TLI));
+                           getValue(I.getArgOperand(1)), DAG, TLI));
     return nullptr;
   case Intrinsic::sqrt:
   case Intrinsic::fabs:
@@ -5119,6 +5087,18 @@
                              getValue(I.getArgOperand(0))));
     return nullptr;
   }
+  case Intrinsic::minnum:
+    setValue(&I, DAG.getNode(ISD::FMINNUM, sdl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1))));
+    return nullptr;
+  case Intrinsic::maxnum:
+    setValue(&I, DAG.getNode(ISD::FMAXNUM, sdl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1))));
+    return nullptr;
   case Intrinsic::copysign:
     setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
@@ -5133,9 +5113,9 @@
                              getValue(I.getArgOperand(2))));
     return nullptr;
   case Intrinsic::fmuladd: {
-    EVT VT = TLI->getValueType(I.getType());
+    EVT VT = TLI.getValueType(I.getType());
     if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
-        TLI->isFMAFasterThanFMulAndFAdd(VT)) {
+        TLI.isFMAFasterThanFMulAndFAdd(VT)) {
       setValue(&I, DAG.getNode(ISD::FMA, sdl,
                                getValue(I.getArgOperand(0)).getValueType(),
                                getValue(I.getArgOperand(0)),
@@ -5155,12 +5135,16 @@
     return nullptr;
   }
   case Intrinsic::convert_to_fp16:
-    setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, sdl,
-                             MVT::i16, getValue(I.getArgOperand(0))));
+    setValue(&I, DAG.getNode(ISD::BITCAST, sdl, MVT::i16,
+                             DAG.getNode(ISD::FP_ROUND, sdl, MVT::f16,
+                                         getValue(I.getArgOperand(0)),
+                                         DAG.getTargetConstant(0, MVT::i32))));
     return nullptr;
   case Intrinsic::convert_from_fp16:
-    setValue(&I, DAG.getNode(ISD::FP16_TO_FP32, sdl,
-                             MVT::f32, getValue(I.getArgOperand(0))));
+    setValue(&I,
+             DAG.getNode(ISD::FP_EXTEND, sdl, TLI.getValueType(I.getType()),
+                         DAG.getNode(ISD::BITCAST, sdl, MVT::f16,
+                                     getValue(I.getArgOperand(0)))));
     return nullptr;
   case Intrinsic::pcmarker: {
     SDValue Tmp = getValue(I.getArgOperand(0));
@@ -5205,7 +5189,7 @@
   case Intrinsic::stacksave: {
     SDValue Op = getRoot();
     Res = DAG.getNode(ISD::STACKSAVE, sdl,
-                      DAG.getVTList(TLI->getPointerTy(), MVT::Other), Op);
+                      DAG.getVTList(TLI.getPointerTy(), MVT::Other), Op);
     setValue(&I, Res);
     DAG.setRoot(Res.getValue(1));
     return nullptr;
@@ -5219,9 +5203,44 @@
     // Emit code into the DAG to store the stack guard onto the stack.
     MachineFunction &MF = DAG.getMachineFunction();
     MachineFrameInfo *MFI = MF.getFrameInfo();
-    EVT PtrTy = TLI->getPointerTy();
+    EVT PtrTy = TLI.getPointerTy();
+    SDValue Src, Chain = getRoot();
+    const Value *Ptr = cast<LoadInst>(I.getArgOperand(0))->getPointerOperand();
+    const GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr);
 
-    SDValue Src = getValue(I.getArgOperand(0));   // The guard's value.
+    // See if Ptr is a bitcast. If it is, look through it and see if we can get
+    // global variable __stack_chk_guard.
+    if (!GV)
+      if (const Operator *BC = dyn_cast<Operator>(Ptr))
+        if (BC->getOpcode() == Instruction::BitCast)
+          GV = dyn_cast<GlobalVariable>(BC->getOperand(0));
+
+    if (GV && TLI.useLoadStackGuardNode()) {
+      // Emit a LOAD_STACK_GUARD node.
+      MachineSDNode *Node = DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD,
+                                               sdl, PtrTy, Chain);
+      MachinePointerInfo MPInfo(GV);
+      MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
+      unsigned Flags = MachineMemOperand::MOLoad |
+                       MachineMemOperand::MOInvariant;
+      *MemRefs = MF.getMachineMemOperand(MPInfo, Flags,
+                                         PtrTy.getSizeInBits() / 8,
+                                         DAG.getEVTAlignment(PtrTy));
+      Node->setMemRefs(MemRefs, MemRefs + 1);
+
+      // Copy the guard value to a virtual register so that it can be
+      // retrieved in the epilogue.
+      Src = SDValue(Node, 0);
+      const TargetRegisterClass *RC =
+          TLI.getRegClassFor(Src.getSimpleValueType());
+      unsigned Reg = MF.getRegInfo().createVirtualRegister(RC);
+
+      SPDescriptor.setGuardReg(Reg);
+      Chain = DAG.getCopyToReg(Chain, sdl, Reg, Src);
+    } else {
+      Src = getValue(I.getArgOperand(0));   // The guard's value.
+    }
+
     AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
 
     int FI = FuncInfo.StaticAllocaMap[Slot];
@@ -5230,7 +5249,7 @@
     SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
 
     // Store the stack protector onto the stack.
-    Res = DAG.getStore(getRoot(), sdl, Src, FIN,
+    Res = DAG.getStore(Chain, sdl, Src, FIN,
                        MachinePointerInfo::getFixedStack(FI),
                        true, false, 0);
     setValue(&I, Res);
@@ -5259,8 +5278,9 @@
     // Drop the intrinsic, but forward the value
     setValue(&I, getValue(I.getOperand(0)));
     return nullptr;
+  case Intrinsic::assume:
   case Intrinsic::var_annotation:
-    // Discard annotate attributes
+    // Discard annotate attributes and assumptions
     return nullptr;
 
   case Intrinsic::init_trampoline: {
@@ -5281,7 +5301,7 @@
   }
   case Intrinsic::adjust_trampoline: {
     setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl,
-                             TLI->getPointerTy(),
+                             TLI.getPointerTy(),
                              getValue(I.getArgOperand(0))));
     return nullptr;
   }
@@ -5321,10 +5341,10 @@
     TargetLowering::CallLoweringInfo CLI(DAG);
     CLI.setDebugLoc(sdl).setChain(getRoot())
       .setCallee(CallingConv::C, I.getType(),
-                 DAG.getExternalSymbol(TrapFuncName.data(), TLI->getPointerTy()),
+                 DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()),
                  std::move(Args), 0);
 
-    std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI);
+    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
     DAG.setRoot(Result.second);
     return nullptr;
   }
@@ -5388,11 +5408,17 @@
       if (!LifetimeObject)
         continue;
 
-      int FI = FuncInfo.StaticAllocaMap[LifetimeObject];
+      // First check that the Alloca is static, otherwise it won't have a
+      // valid frame index.
+      auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
+      if (SI == FuncInfo.StaticAllocaMap.end())
+        return nullptr;
+
+      int FI = SI->second;
 
       SDValue Ops[2];
       Ops[0] = getRoot();
-      Ops[1] = DAG.getFrameIndex(FI, TLI->getPointerTy(), true);
+      Ops[1] = DAG.getFrameIndex(FI, TLI.getPointerTy(), true);
       unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END);
 
       Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops);
@@ -5402,7 +5428,7 @@
   }
   case Intrinsic::invariant_start:
     // Discard region information.
-    setValue(&I, DAG.getUNDEF(TLI->getPointerTy()));
+    setValue(&I, DAG.getUNDEF(TLI.getPointerTy()));
     return nullptr;
   case Intrinsic::invariant_end:
     // Discard region information.
@@ -5420,7 +5446,7 @@
     return nullptr;
   }
   case Intrinsic::clear_cache:
-    return TLI->getClearCacheBuiltinName();
+    return TLI.getClearCacheBuiltinName();
   case Intrinsic::donothing:
     // ignore
     return nullptr;
@@ -5430,42 +5456,18 @@
   }
   case Intrinsic::experimental_patchpoint_void:
   case Intrinsic::experimental_patchpoint_i64: {
-    visitPatchpoint(I);
+    visitPatchpoint(&I);
     return nullptr;
   }
   }
 }
 
-void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
-                                      bool isTailCall,
-                                      MachineBasicBlock *LandingPad) {
-  const TargetLowering *TLI = TM.getTargetLowering();
-  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
-  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
-  Type *RetTy = FTy->getReturnType();
+std::pair<SDValue, SDValue>
+SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
+                                    MachineBasicBlock *LandingPad) {
   MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
   MCSymbol *BeginLabel = nullptr;
 
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
-  Args.reserve(CS.arg_size());
-
-  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
-       i != e; ++i) {
-    const Value *V = *i;
-
-    // Skip empty types
-    if (V->getType()->isEmptyTy())
-      continue;
-
-    SDValue ArgNode = getValue(V);
-    Entry.Node = ArgNode; Entry.Ty = V->getType();
-
-    // Skip the first return-type Attribute to get to params.
-    Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
-    Args.push_back(Entry);
-  }
-
   if (LandingPad) {
     // Insert a label before the invoke call to mark the try range.  This can be
     // used to detect deletion of the invoke via the MachineModuleInfo.
@@ -5486,24 +5488,17 @@
     // this call might not return.
     (void)getRoot();
     DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getControlRoot(), BeginLabel));
+
+    CLI.setChain(getRoot());
   }
 
-  // Check if target-independent constraints permit a tail call here.
-  // Target-dependent constraints are checked within TLI->LowerCallTo.
-  if (isTailCall && !isInTailCallPosition(CS, DAG))
-    isTailCall = false;
+  const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
+  std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI);
 
-  TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
-    .setCallee(RetTy, FTy, Callee, std::move(Args), CS).setTailCall(isTailCall);
-
-  std::pair<SDValue,SDValue> Result = TLI->LowerCallTo(CLI);
-  assert((isTailCall || Result.second.getNode()) &&
+  assert((CLI.IsTailCall || Result.second.getNode()) &&
          "Non-null chain expected with non-tail call!");
   assert((Result.second.getNode() || !Result.first.getNode()) &&
          "Null value expected with tail call!");
-  if (Result.first.getNode())
-    setValue(CS.getInstruction(), Result.first);
 
   if (!Result.second.getNode()) {
     // As a special case, a null chain means that a tail call has been emitted
@@ -5526,6 +5521,50 @@
     // Inform MachineModuleInfo of range.
     MMI.addInvoke(LandingPad, BeginLabel, EndLabel);
   }
+
+  return Result;
+}
+
+void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
+                                      bool isTailCall,
+                                      MachineBasicBlock *LandingPad) {
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  Type *RetTy = FTy->getReturnType();
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Args.reserve(CS.arg_size());
+
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    const Value *V = *i;
+
+    // Arguments of empty type are left out of the lowered argument list.
+    if (V->getType()->isEmptyTy())
+      continue;
+
+    SDValue ArgNode = getValue(V);
+    Entry.Node = ArgNode; Entry.Ty = V->getType();
+
+    // Attribute slot 0 describes the return value; parameters start at 1.
+    Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
+    Args.push_back(Entry);
+  }
+
+  // Drop the tail-call request if the call site is not in tail position.
+  // Target-dependent constraints are checked later, within TLI->LowerCallTo.
+  if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget()))
+    isTailCall = false;
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
+    .setCallee(RetTy, FTy, Callee, std::move(Args), CS)
+    .setTailCall(isTailCall);
+  std::pair<SDValue,SDValue> Result = lowerInvokable(CLI, LandingPad);
+
+  if (Result.first.getNode()) // only record a value when one was produced
+    setValue(CS.getInstruction(), Result.first);
 }
 
 /// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
@@ -5591,7 +5630,7 @@
 void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
                                                   SDValue Value,
                                                   bool IsSigned) {
-  EVT VT = TM.getTargetLowering()->getValueType(I.getType(), true);
+  EVT VT = DAG.getTargetLoweringInfo().getValueType(I.getType(), true);
   if (IsSigned)
     Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
   else
@@ -5616,7 +5655,7 @@
   const Value *Size = I.getArgOperand(2);
   const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
   if (CSize && CSize->getZExtValue() == 0) {
-    EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true);
+    EVT CallVT = DAG.getTargetLoweringInfo().getValueType(I.getType(), true);
     setValue(&I, DAG.getConstant(0, CallVT));
     return true;
   }
@@ -5673,15 +5712,16 @@
     // Require that we can find a legal MVT, and only do this if the target
     // supports unaligned loads of that type.  Expanding into byte loads would
     // bloat the code.
-    const TargetLowering *TLI = TM.getTargetLowering();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     if (ActuallyDoIt && CSize->getZExtValue() > 4) {
       unsigned DstAS = LHS->getType()->getPointerAddressSpace();
       unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
       // TODO: Handle 5 byte compare as 4-byte + 1 byte.
       // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-      if (!TLI->isTypeLegal(LoadVT) ||
-          !TLI->allowsUnalignedMemoryAccesses(LoadVT, SrcAS) ||
-          !TLI->allowsUnalignedMemoryAccesses(LoadVT, DstAS))
+      // TODO: Check alignment of src and dest ptrs.
+      if (!TLI.isTypeLegal(LoadVT) ||
+          !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
+          !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
         ActuallyDoIt = false;
     }
 
@@ -5859,6 +5899,26 @@
   return true;
 }
 
+/// visitBinaryFloatCall - If the call has exactly two floating-point operands
+/// of the call's own result type and only reads memory, emit a two-operand node
+/// with the given opcode as its value and return true; else return false.
+bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
+                                               unsigned Opcode) {
+  // Reject anything but a read-only call of the form: fp T f(T, T).
+  if (I.getNumArgOperands() != 2 ||
+      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+      I.getType() != I.getArgOperand(0)->getType() ||
+      I.getType() != I.getArgOperand(1)->getType() ||
+      !I.onlyReadsMemory())
+    return false;
+
+  SDValue Tmp0 = getValue(I.getArgOperand(0));
+  SDValue Tmp1 = getValue(I.getArgOperand(1));
+  EVT VT = Tmp0.getValueType(); // operand IR types were checked to match above
+  setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1));
+  return true;
+}
+
 void SelectionDAGBuilder::visitCall(const CallInst &I) {
   // Handle inline assembly differently.
   if (isa<InlineAsm>(I.getCalledValue())) {
@@ -5915,6 +5975,18 @@
         if (visitUnaryFloatCall(I, ISD::FABS))
           return;
         break;
+      case LibFunc::fmin:
+      case LibFunc::fminf:
+      case LibFunc::fminl:
+        if (visitBinaryFloatCall(I, ISD::FMINNUM))
+          return;
+        break;
+      case LibFunc::fmax:
+      case LibFunc::fmaxf:
+      case LibFunc::fmaxl:
+        if (visitBinaryFloatCall(I, ISD::FMAXNUM))
+          return;
+        break;
       case LibFunc::sin:
       case LibFunc::sinf:
       case LibFunc::sinl:
@@ -6021,7 +6093,7 @@
     Callee = getValue(I.getCalledValue());
   else
     Callee = DAG.getExternalSymbol(RenameFn,
-                                   TM.getTargetLowering()->getPointerTy());
+                                   DAG.getTargetLoweringInfo().getPointerTy());
 
   // Check if we can potentially perform a tail call. More detailed checking is
   // be done within LowerCallTo, after more information about the call is known.
@@ -6216,9 +6288,9 @@
   /// ConstraintOperands - Information about all of the constraints.
   SDISelAsmOperandInfoVector ConstraintOperands;
 
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   TargetLowering::AsmOperandInfoVector
-    TargetConstraints = TLI->ParseConstraints(CS);
+    TargetConstraints = TLI.ParseConstraints(CS);
 
   bool hasMemory = false;
 
@@ -6243,10 +6315,10 @@
       // corresponding argument.
       assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
       if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
-        OpVT = TLI->getSimpleValueType(STy->getElementType(ResNo));
+        OpVT = TLI.getSimpleValueType(STy->getElementType(ResNo));
       } else {
         assert(ResNo == 0 && "Asm only has one result!");
-        OpVT = TLI->getSimpleValueType(CS.getType());
+        OpVT = TLI.getSimpleValueType(CS.getType());
       }
       ++ResNo;
       break;
@@ -6267,8 +6339,8 @@
         OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
       }
 
-      OpVT = OpInfo.getCallOperandValEVT(*DAG.getContext(), *TLI, DL).
-        getSimpleVT();
+      OpVT =
+          OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI, DL).getSimpleVT();
     }
 
     OpInfo.ConstraintVT = OpVT;
@@ -6279,7 +6351,7 @@
     else {
       for (unsigned j = 0, ee = OpInfo.Codes.size(); j != ee; ++j) {
         TargetLowering::ConstraintType
-          CType = TLI->getConstraintType(OpInfo.Codes[j]);
+          CType = TLI.getConstraintType(OpInfo.Codes[j]);
         if (CType == TargetLowering::C_Memory) {
           hasMemory = true;
           break;
@@ -6311,10 +6383,10 @@
 
       if (OpInfo.ConstraintVT != Input.ConstraintVT) {
         std::pair<unsigned, const TargetRegisterClass*> MatchRC =
-          TLI->getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+          TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
                                             OpInfo.ConstraintVT);
         std::pair<unsigned, const TargetRegisterClass*> InputRC =
-          TLI->getRegForInlineAsmConstraint(Input.ConstraintCode,
+          TLI.getRegForInlineAsmConstraint(Input.ConstraintCode,
                                             Input.ConstraintVT);
         if ((OpInfo.ConstraintVT.isInteger() !=
              Input.ConstraintVT.isInteger()) ||
@@ -6328,7 +6400,7 @@
     }
 
     // Compute the constraint code and ConstraintType to use.
-    TLI->ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);
+    TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);
 
     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
         OpInfo.Type == InlineAsm::isClobber)
@@ -6356,16 +6428,16 @@
       if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) ||
           isa<ConstantVector>(OpVal) || isa<ConstantDataVector>(OpVal)) {
         OpInfo.CallOperand = DAG.getConstantPool(cast<Constant>(OpVal),
-                                                 TLI->getPointerTy());
+                                                 TLI.getPointerTy());
       } else {
         // Otherwise, create a stack slot and emit a store to it before the
         // asm.
         Type *Ty = OpVal->getType();
-        uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty);
-        unsigned Align  = TLI->getDataLayout()->getPrefTypeAlignment(Ty);
+        uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
+        unsigned Align  = TLI.getDataLayout()->getPrefTypeAlignment(Ty);
         MachineFunction &MF = DAG.getMachineFunction();
         int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
-        SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI->getPointerTy());
+        SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy());
         Chain = DAG.getStore(Chain, getCurSDLoc(),
                              OpInfo.CallOperand, StackSlot,
                              MachinePointerInfo::getFixedStack(SSFI),
@@ -6383,7 +6455,7 @@
     // If this constraint is for a specific register, allocate it before
     // anything else.
     if (OpInfo.ConstraintType == TargetLowering::C_Register)
-      GetRegistersForValue(DAG, *TLI, getCurSDLoc(), OpInfo);
+      GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo);
   }
 
   // Second pass - Loop over all of the operands, assigning virtual or physregs
@@ -6394,7 +6466,7 @@
     // C_Register operands have already been allocated, Other/Memory don't need
     // to be.
     if (OpInfo.ConstraintType == TargetLowering::C_RegisterClass)
-      GetRegistersForValue(DAG, *TLI, getCurSDLoc(), OpInfo);
+      GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo);
   }
 
   // AsmNodeOperands - The operands for the ISD::INLINEASM node.
@@ -6402,7 +6474,7 @@
   AsmNodeOperands.push_back(SDValue());  // reserve space for input chain
   AsmNodeOperands.push_back(
           DAG.getTargetExternalSymbol(IA->getAsmString().c_str(),
-                                      TLI->getPointerTy()));
+                                      TLI.getPointerTy()));
 
   // If we have a !srcloc metadata node associated with it, we want to attach
   // this to the ultimately generated inline asm machineinstr.  To do this, we
@@ -6425,7 +6497,7 @@
     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
 
     // Compute the constraint code and ConstraintType to use.
-    TLI->ComputeConstraintToUse(OpInfo, SDValue());
+    TLI.ComputeConstraintToUse(OpInfo, SDValue());
 
     // Ideally, we would only check against memory constraints.  However, the
     // meaning of an other constraint can be target-specific and we can't easily
@@ -6443,7 +6515,7 @@
   }
 
   AsmNodeOperands.push_back(DAG.getTargetConstant(ExtraInfo,
-                                                  TLI->getPointerTy()));
+                                                  TLI.getPointerTy()));
 
   // Loop over all of the inputs, copying the operand values into the
   // appropriate registers and processing the output regs.
@@ -6465,7 +6537,7 @@
         // Add information to the INLINEASM node to know about this output.
         unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
         AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags,
-                                                        TLI->getPointerTy()));
+                                                        TLI.getPointerTy()));
         AsmNodeOperands.push_back(OpInfo.CallOperand);
         break;
       }
@@ -6545,7 +6617,7 @@
           MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
           for (unsigned i = 0, e = InlineAsm::getNumOperandRegisters(OpFlag);
                i != e; ++i) {
-            if (const TargetRegisterClass *RC = TLI->getRegClassFor(RegVT))
+            if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT))
               MatchedRegs.Regs.push_back(RegInfo.createVirtualRegister(RC));
             else {
               LLVMContext &Ctx = *DAG.getContext();
@@ -6572,7 +6644,7 @@
         OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag,
                                                     OpInfo.getMatchedOperand());
         AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlag,
-                                                        TLI->getPointerTy()));
+                                                        TLI.getPointerTy()));
         AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
         break;
       }
@@ -6584,7 +6656,7 @@
 
       if (OpInfo.ConstraintType == TargetLowering::C_Other) {
         std::vector<SDValue> Ops;
-        TLI->LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
+        TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
                                           Ops, DAG);
         if (Ops.empty()) {
           LLVMContext &Ctx = *DAG.getContext();
@@ -6598,20 +6670,20 @@
         unsigned ResOpType =
           InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
         AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
-                                                        TLI->getPointerTy()));
+                                                        TLI.getPointerTy()));
         AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end());
         break;
       }
 
       if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
         assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
-        assert(InOperandVal.getValueType() == TLI->getPointerTy() &&
+        assert(InOperandVal.getValueType() == TLI.getPointerTy() &&
                "Memory operands expect pointer values");
 
         // Add information to the INLINEASM node to know about this input.
         unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
         AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
-                                                        TLI->getPointerTy()));
+                                                        TLI.getPointerTy()));
         AsmNodeOperands.push_back(InOperandVal);
         break;
       }
@@ -6674,7 +6746,7 @@
 
     // FIXME: Why don't we do this for inline asms with MRVs?
     if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
-      EVT ResultType = TLI->getValueType(CS.getType());
+      EVT ResultType = TLI.getValueType(CS.getType());
 
       // If any of the results of the inline asm is a vector, it may have the
       // wrong width/num elts.  This can happen for register classes that can
@@ -6739,9 +6811,9 @@
 }
 
 void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
-  const TargetLowering *TLI = TM.getTargetLowering();
-  const DataLayout &DL = *TLI->getDataLayout();
-  SDValue V = DAG.getVAArg(TLI->getValueType(I.getType()), getCurSDLoc(),
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const DataLayout &DL = *TLI.getDataLayout();
+  SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurSDLoc(),
                            getRoot(), getValue(I.getOperand(0)),
                            DAG.getSrcValue(I.getOperand(0)),
                            DL.getABITypeAlignment(I.getType()));
@@ -6773,18 +6845,18 @@
 /// convention or require stack pointer adjustment. Only a subset of the
 /// intrinsic's operands need to participate in the calling convention.
 std::pair<SDValue, SDValue>
-SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx,
+SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx,
                                        unsigned NumArgs, SDValue Callee,
-                                       bool useVoidTy) {
+                                       bool UseVoidTy,
+                                       MachineBasicBlock *LandingPad) {
   TargetLowering::ArgListTy Args;
   Args.reserve(NumArgs);
 
   // Populate the argument list.
   // Attributes for args start at offset 1, after the return attribute.
-  ImmutableCallSite CS(&CI);
   for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
        ArgI != ArgE; ++ArgI) {
-    const Value *V = CI.getOperand(ArgI);
+    const Value *V = CS->getOperand(ArgI);
 
     assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
 
@@ -6795,14 +6867,13 @@
     Args.push_back(Entry);
   }
 
-  Type *retTy = useVoidTy ? Type::getVoidTy(*DAG.getContext()) : CI.getType();
+  Type *retTy = UseVoidTy ? Type::getVoidTy(*DAG.getContext()) : CS->getType();
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
-    .setCallee(CI.getCallingConv(), retTy, Callee, std::move(Args), NumArgs)
-    .setDiscardResult(!CI.use_empty());
+    .setCallee(CS.getCallingConv(), retTy, Callee, std::move(Args), NumArgs)
+    .setDiscardResult(CS->use_empty());
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-  return TLI->LowerCallTo(CLI);
+  return lowerInvokable(CLI, LandingPad);
 }
 
 /// \brief Add a stack map intrinsic call's live variable operands to a stackmap
@@ -6822,11 +6893,11 @@
 /// assumption made by the llvm.gcroot intrinsic). If the alloca's location were
 /// only available in a register, then the runtime would need to trap when
 /// execution reaches the StackMap in order to read the alloca's location.
-static void addStackMapLiveVars(const CallInst &CI, unsigned StartIdx,
+static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SelectionDAGBuilder &Builder) {
-  for (unsigned i = StartIdx, e = CI.getNumArgOperands(); i != e; ++i) {
-    SDValue OpVal = Builder.getValue(CI.getArgOperand(i));
+  for (unsigned i = StartIdx, e = CS.arg_size(); i != e; ++i) {
+    SDValue OpVal = Builder.getValue(CS.getArgument(i));
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpVal)) {
       Ops.push_back(
         Builder.DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
@@ -6877,7 +6948,7 @@
                   cast<ConstantSDNode>(NBytesVal)->getZExtValue(), MVT::i32));
 
   // Push live variables for the stack map.
-  addStackMapLiveVars(CI, 2, Ops, *this);
+  addStackMapLiveVars(&CI, 2, Ops, *this);
 
   // We are not pushing any register mask info here on the operands list,
   // because the stackmap doesn't clobber anything.
@@ -6904,7 +6975,8 @@
 }
 
 /// \brief Lower llvm.experimental.patchpoint directly to its target opcode.
-void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) {
+void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
+                                          MachineBasicBlock *LandingPad) {
   // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
   //                                                 i32 <numBytes>,
   //                                                 i8* <target>,
@@ -6912,32 +6984,29 @@
   //                                                 [Args...],
   //                                                 [live variables...])
 
-  CallingConv::ID CC = CI.getCallingConv();
-  bool isAnyRegCC = CC == CallingConv::AnyReg;
-  bool hasDef = !CI.getType()->isVoidTy();
-  SDValue Callee = getValue(CI.getOperand(2)); // <target>
+  CallingConv::ID CC = CS.getCallingConv();
+  bool IsAnyRegCC = CC == CallingConv::AnyReg;
+  bool HasDef = !CS->getType()->isVoidTy();
+  SDValue Callee = getValue(CS->getOperand(2)); // <target>
 
   // Get the real number of arguments participating in the call <numArgs>
-  SDValue NArgVal = getValue(CI.getArgOperand(PatchPointOpers::NArgPos));
+  SDValue NArgVal = getValue(CS.getArgument(PatchPointOpers::NArgPos));
   unsigned NumArgs = cast<ConstantSDNode>(NArgVal)->getZExtValue();
 
   // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
   // Intrinsics include all meta-operands up to but not including CC.
   unsigned NumMetaOpers = PatchPointOpers::CCPos;
-  assert(CI.getNumArgOperands() >= NumMetaOpers + NumArgs &&
+  assert(CS.arg_size() >= NumMetaOpers + NumArgs &&
          "Not enough arguments provided to the patchpoint intrinsic");
 
   // For AnyRegCC the arguments are lowered later on manually.
-  unsigned NumCallArgs = isAnyRegCC ? 0 : NumArgs;
+  unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
   std::pair<SDValue, SDValue> Result =
-    LowerCallOperands(CI, NumMetaOpers, NumCallArgs, Callee, isAnyRegCC);
+    lowerCallOperands(CS, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC,
+                      LandingPad);
 
-  // Set the root to the target-lowered call chain.
-  SDValue Chain = Result.second;
-  DAG.setRoot(Chain);
-
-  SDNode *CallEnd = Chain.getNode();
-  if (hasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
+  SDNode *CallEnd = Result.second.getNode();
+  if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
     CallEnd = CallEnd->getOperand(0).getNode();
 
   /// Get a call instruction from the call sequence chain.
@@ -6945,16 +7014,16 @@
   assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
          "Expected a callseq node.");
   SDNode *Call = CallEnd->getOperand(0).getNode();
-  bool hasGlue = Call->getGluedNode();
+  bool HasGlue = Call->getGluedNode();
 
   // Replace the target specific call node with the patchable intrinsic.
   SmallVector<SDValue, 8> Ops;
 
   // Add the <id> and <numBytes> constants.
-  SDValue IDVal = getValue(CI.getOperand(PatchPointOpers::IDPos));
+  SDValue IDVal = getValue(CS->getOperand(PatchPointOpers::IDPos));
   Ops.push_back(DAG.getTargetConstant(
                   cast<ConstantSDNode>(IDVal)->getZExtValue(), MVT::i64));
-  SDValue NBytesVal = getValue(CI.getOperand(PatchPointOpers::NBytesPos));
+  SDValue NBytesVal = getValue(CS->getOperand(PatchPointOpers::NBytesPos));
   Ops.push_back(DAG.getTargetConstant(
                   cast<ConstantSDNode>(NBytesVal)->getZExtValue(), MVT::i32));
 
@@ -6967,8 +7036,8 @@
   // Adjust <numArgs> to account for any arguments that have been passed on the
   // stack instead.
   // Call Node: Chain, Target, {Args}, RegMask, [Glue]
-  unsigned NumCallRegArgs = Call->getNumOperands() - (hasGlue ? 4 : 3);
-  NumCallRegArgs = isAnyRegCC ? NumArgs : NumCallRegArgs;
+  unsigned NumCallRegArgs = Call->getNumOperands() - (HasGlue ? 4 : 3);
+  NumCallRegArgs = IsAnyRegCC ? NumArgs : NumCallRegArgs;
   Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, MVT::i32));
 
   // Add the calling convention
@@ -6976,20 +7045,20 @@
 
   // Add the arguments we omitted previously. The register allocator should
   // place these in any free register.
-  if (isAnyRegCC)
+  if (IsAnyRegCC)
     for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i)
-      Ops.push_back(getValue(CI.getArgOperand(i)));
+      Ops.push_back(getValue(CS.getArgument(i)));
 
   // Push the arguments from the call instruction up to the register mask.
-  SDNode::op_iterator e = hasGlue ? Call->op_end()-2 : Call->op_end()-1;
+  SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
   for (SDNode::op_iterator i = Call->op_begin()+2; i != e; ++i)
     Ops.push_back(*i);
 
   // Push live variables for the stack map.
-  addStackMapLiveVars(CI, NumMetaOpers + NumArgs, Ops, *this);
+  addStackMapLiveVars(CS, NumMetaOpers + NumArgs, Ops, *this);
 
   // Push the register mask info.
-  if (hasGlue)
+  if (HasGlue)
     Ops.push_back(*(Call->op_end()-2));
   else
     Ops.push_back(*(Call->op_end()-1));
@@ -6999,15 +7068,15 @@
   Ops.push_back(*(Call->op_begin()));
 
   // Push the glue flag (last operand).
-  if (hasGlue)
+  if (HasGlue)
     Ops.push_back(*(Call->op_end()-1));
 
   SDVTList NodeTys;
-  if (isAnyRegCC && hasDef) {
+  if (IsAnyRegCC && HasDef) {
     // Create the return types based on the intrinsic definition
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     SmallVector<EVT, 3> ValueVTs;
-    ComputeValueVTs(TLI, CI.getType(), ValueVTs);
+    ComputeValueVTs(TLI, CS->getType(), ValueVTs);
     assert(ValueVTs.size() == 1 && "Expected only one return value type.");
 
     // There is always a chain and a glue type at the end
@@ -7022,18 +7091,18 @@
                                          getCurSDLoc(), NodeTys, Ops);
 
   // Update the NodeMap.
-  if (hasDef) {
-    if (isAnyRegCC)
-      setValue(&CI, SDValue(MN, 0));
+  if (HasDef) {
+    if (IsAnyRegCC)
+      setValue(CS.getInstruction(), SDValue(MN, 0));
     else
-      setValue(&CI, Result.first);
+      setValue(CS.getInstruction(), Result.first);
   }
 
   // Fixup the consumers of the intrinsic. The chain and glue may be used in the
   // call sequence. Furthermore the location of the chain and glue can change
   // when the AnyReg calling convention is used and the intrinsic returns a
   // value.
-  if (isAnyRegCC && hasDef) {
+  if (IsAnyRegCC && HasDef) {
     SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
     SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)};
     DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
@@ -7182,8 +7251,11 @@
       }
       if (Args[i].isNest)
         Flags.setNest();
-      if (NeedsRegBlock)
+      if (NeedsRegBlock) {
         Flags.setInConsecutiveRegs();
+        if (Value == NumValues - 1)
+          Flags.setInConsecutiveRegsLast();
+      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
@@ -7229,10 +7301,6 @@
         else if (j != 0)
           MyFlags.Flags.setOrigAlign(1);
 
-        // Only mark the end at the last register of the last value.
-        if (NeedsRegBlock && Value == NumValues - 1 && j == NumParts - 1)
-          MyFlags.Flags.setInConsecutiveRegsLast();
-
         CLI.Outs.push_back(MyFlags);
         CLI.OutVals.push_back(Parts[j]);
       }
@@ -7345,10 +7413,15 @@
          "Copy from a reg to the same reg!");
   assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
 
-  const TargetLowering *TLI = TM.getTargetLowering();
-  RegsForValue RFV(V->getContext(), *TLI, Reg, V->getType());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  RegsForValue RFV(V->getContext(), TLI, Reg, V->getType());
   SDValue Chain = DAG.getEntryNode();
-  RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V);
+
+  ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
+                              FuncInfo.PreferredExtendType.end())
+                                 ? ISD::ANY_EXTEND
+                                 : FuncInfo.PreferredExtendType[V];
+  RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType);
   PendingExports.push_back(Chain);
 }
 
@@ -7374,15 +7447,13 @@
 void SelectionDAGISel::LowerArguments(const Function &F) {
   SelectionDAG &DAG = SDB->DAG;
   SDLoc dl = SDB->getCurSDLoc();
-  const TargetLowering *TLI = getTargetLowering();
   const DataLayout *DL = TLI->getDataLayout();
   SmallVector<ISD::InputArg, 16> Ins;
 
   if (!FuncInfo->CanLowerReturn) {
     // Put in an sret pointer parameter before all the other parameters.
     SmallVector<EVT, 1> ValueVTs;
-    ComputeValueVTs(*getTargetLowering(),
-                    PointerType::getUnqual(F.getReturnType()), ValueVTs);
+    ComputeValueVTs(*TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
 
     // NOTE: Assuming that a pointer will never break down to more than one VT
     // or one register.
@@ -7447,8 +7518,11 @@
       }
       if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
         Flags.setNest();
-      if (NeedsRegBlock)
+      if (NeedsRegBlock) {
         Flags.setInConsecutiveRegs();
+        if (Value == NumValues - 1)
+          Flags.setInConsecutiveRegsLast();
+      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
@@ -7461,11 +7535,6 @@
         // if it isn't first piece, alignment must be 1
         else if (i > 0)
           MyFlags.Flags.setOrigAlign(1);
-
-        // Only mark the end at the last register of the last value.
-        if (NeedsRegBlock && Value == NumValues - 1 && i == NumRegs - 1)
-          MyFlags.Flags.setInConsecutiveRegsLast();
-
         Ins.push_back(MyFlags);
       }
       PartBase += VT.getStoreSize();
@@ -7474,9 +7543,8 @@
 
   // Call the target to set up the argument values.
   SmallVector<SDValue, 8> InVals;
-  SDValue NewRoot = TLI->LowerFormalArguments(DAG.getRoot(), F.getCallingConv(),
-                                              F.isVarArg(), Ins,
-                                              dl, DAG, InVals);
+  SDValue NewRoot = TLI->LowerFormalArguments(
+      DAG.getRoot(), F.getCallingConv(), F.isVarArg(), Ins, dl, DAG, InVals);
 
   // Verify that the target's LowerFormalArguments behaved as expected.
   assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
@@ -7513,8 +7581,8 @@
     MachineRegisterInfo& RegInfo = MF.getRegInfo();
     unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
     FuncInfo->DemoteRegister = SRetReg;
-    NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(),
-                                    SRetReg, ArgValue);
+    NewRoot =
+        SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue);
     DAG.setRoot(NewRoot);
 
     // i indexes lowered arguments.  Bump it past the hidden sret argument.
@@ -7629,7 +7697,8 @@
 
     // If this terminator has multiple identical successors (common for
     // switches), only handle each succ once.
-    if (!SuccsHandled.insert(SuccMBB)) continue;
+    if (!SuccsHandled.insert(SuccMBB).second)
+      continue;
 
     MachineBasicBlock::iterator MBBI = SuccMBB->begin();
 
@@ -7672,11 +7741,11 @@
       // Remember that this register needs to added to the machine PHI node as
       // the input for this MBB.
       SmallVector<EVT, 4> ValueVTs;
-      const TargetLowering *TLI = TM.getTargetLowering();
-      ComputeValueVTs(*TLI, PN->getType(), ValueVTs);
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      ComputeValueVTs(TLI, PN->getType(), ValueVTs);
       for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
         EVT VT = ValueVTs[vti];
-        unsigned NumRegisters = TLI->getNumRegisters(*DAG.getContext(), VT);
+        unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
         for (unsigned i = 0, e = NumRegisters; i != e; ++i)
           FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg+i));
         Reg += NumRegisters;

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 84679f9..f74e652 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SELECTIONDAGBUILDER_H
-#define SELECTIONDAGBUILDER_H
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
 
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/DenseMap.h"
@@ -21,6 +21,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
 #include <vector>
 
 namespace llvm {
@@ -200,7 +201,7 @@
     }
   };
 
-  size_t Clusterify(CaseVector &Cases, const SwitchInst &SI);
+  void Clusterify(CaseVector &Cases, const SwitchInst &SI);
 
   /// CaseBlock - This structure is used to communicate between
   /// SelectionDAGBuilder and SDISel for the code generation of additional basic
@@ -276,9 +277,9 @@
     BitTestBlock(APInt F, APInt R, const Value* SV,
                  unsigned Rg, MVT RgVT, bool E,
                  MachineBasicBlock* P, MachineBasicBlock* D,
-                 const BitTestInfo& C):
+                 BitTestInfo C):
       First(F), Range(R), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E),
-      Parent(P), Default(D), Cases(C) { }
+      Parent(P), Default(D), Cases(std::move(C)) { }
     APInt First;
     APInt Range;
     const Value *SValue;
@@ -397,7 +398,8 @@
   class StackProtectorDescriptor {
   public:
     StackProtectorDescriptor() : ParentMBB(nullptr), SuccessMBB(nullptr),
-                                 FailureMBB(nullptr), Guard(nullptr) { }
+                                 FailureMBB(nullptr), Guard(nullptr),
+                                 GuardReg(0) { }
     ~StackProtectorDescriptor() { }
 
     /// Returns true if all fields of the stack protector descriptor are
@@ -455,6 +457,9 @@
     MachineBasicBlock *getFailureMBB() { return FailureMBB; }
     const Value *getGuard() { return Guard; }
 
+    unsigned getGuardReg() const { return GuardReg; }
+    void setGuardReg(unsigned R) { GuardReg = R; }
+
   private:
     /// The basic block for which we are generating the stack protector.
     ///
@@ -477,6 +482,9 @@
     /// stack protector stack slot.
     const Value *Guard;
 
+    /// The virtual register holding the stack guard value.
+    unsigned GuardReg;
+
     /// Add a successor machine basic block to ParentMBB. If the successor mbb
     /// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
     /// block will be created.
@@ -626,17 +634,23 @@
   void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall,
                    MachineBasicBlock *LandingPad = nullptr);
 
-  std::pair<SDValue, SDValue> LowerCallOperands(const CallInst &CI,
-                                                unsigned ArgIdx,
-                                                unsigned NumArgs,
-                                                SDValue Callee,
-                                                bool useVoidTy = false);
+  std::pair<SDValue, SDValue> lowerCallOperands(
+          ImmutableCallSite CS,
+          unsigned ArgIdx,
+          unsigned NumArgs,
+          SDValue Callee,
+          bool UseVoidTy = false,
+          MachineBasicBlock *LandingPad = nullptr);
 
   /// UpdateSplitBlock - When an MBB was split during scheduling, update the
   /// references that need to refer to the last resulting block.
   void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
 
 private:
+  std::pair<SDValue, SDValue> lowerInvokable(
+          TargetLowering::CallLoweringInfo &CLI,
+          MachineBasicBlock *LandingPad);
+
   // Terminator instructions.
   void visitRet(const ReturnInst &I);
   void visitBr(const BranchInst &I);
@@ -658,7 +672,6 @@
   bool handleBTSplitSwitchCase(CaseRec& CR,
                                CaseRecVector& WorkList,
                                const Value* SV,
-                               MachineBasicBlock* Default,
                                MachineBasicBlock *SwitchBB);
   bool handleBitTestsSwitchCase(CaseRec& CR,
                                 CaseRecVector& WorkList,
@@ -755,6 +768,7 @@
   bool visitStrLenCall(const CallInst &I);
   bool visitStrNLenCall(const CallInst &I);
   bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode);
+  bool visitBinaryFloatCall(const CallInst &I, unsigned Opcode);
   void visitAtomicLoad(const LoadInst &I);
   void visitAtomicStore(const StoreInst &I);
 
@@ -767,7 +781,8 @@
   void visitVAEnd(const CallInst &I);
   void visitVACopy(const CallInst &I);
   void visitStackmap(const CallInst &I);
-  void visitPatchpoint(const CallInst &I);
+  void visitPatchpoint(ImmutableCallSite CS,
+                       MachineBasicBlock *LandingPad = nullptr);
 
   void visitUserOp1(const Instruction &I) {
     llvm_unreachable("UserOp1 should not exist at instruction selection time!");
@@ -784,7 +799,7 @@
   /// EmitFuncArgumentDbgValue - If V is an function argument then create
   /// corresponding DBG_VALUE machine instruction for it now. At the end of
   /// instruction selection, they will be inserted to the entry BB.
-  bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
+  bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable, MDNode *Expr,
                                 int64_t Offset, bool IsIndirect,
                                 const SDValue &N);
 };

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index b3a452f..c9f6cff 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp

@@ -27,6 +27,7 @@
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 std::string SDNode::getOperationName(const SelectionDAG *G) const {
@@ -36,7 +37,7 @@
       return "<<Unknown DAG Node>>";
     if (isMachineOpcode()) {
       if (G)
-        if (const TargetInstrInfo *TII = G->getTarget().getInstrInfo())
+        if (const TargetInstrInfo *TII = G->getSubtarget().getInstrInfo())
           if (getMachineOpcode() < TII->getNumOpcodes())
             return TII->getName(getMachineOpcode());
       return "<<Unknown Machine Node #" + utostr(getOpcode()) + ">>";
@@ -140,6 +141,8 @@
 
   // Unary operators
   case ISD::FABS:                       return "fabs";
+  case ISD::FMINNUM:                    return "fminnum";
+  case ISD::FMAXNUM:                    return "fmaxnum";
   case ISD::FNEG:                       return "fneg";
   case ISD::FSQRT:                      return "fsqrt";
   case ISD::FSIN:                       return "fsin";
@@ -236,8 +239,8 @@
   case ISD::FP_TO_UINT:                 return "fp_to_uint";
   case ISD::BITCAST:                    return "bitcast";
   case ISD::ADDRSPACECAST:              return "addrspacecast";
-  case ISD::FP16_TO_FP32:               return "fp16_to_fp32";
-  case ISD::FP32_TO_FP16:               return "fp32_to_fp16";
+  case ISD::FP16_TO_FP:                 return "fp16_to_fp";
+  case ISD::FP_TO_FP16:                 return "fp_to_fp16";
 
   case ISD::CONVERT_RNDSAT: {
     switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) {
@@ -433,7 +436,8 @@
       OS << LBB->getName() << " ";
     OS << (const void*)BBDN->getBasicBlock() << ">";
   } else if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(this)) {
-    OS << ' ' << PrintReg(R->getReg(), G ? G->getTarget().getRegisterInfo() :nullptr);
+    OS << ' ' << PrintReg(R->getReg(),
+                          G ? G->getSubtarget().getRegisterInfo() : nullptr);
   } else if (const ExternalSymbolSDNode *ES =
              dyn_cast<ExternalSymbolSDNode>(this)) {
     OS << "'" << ES->getSymbol() << "'";
@@ -565,7 +569,7 @@
 typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet;
 static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
                        const SelectionDAG *G, VisitedSDNodeSet &once) {
-  if (!once.insert(N))          // If we've been here before, return now.
+  if (!once.insert(N).second) // If we've been here before, return now.
     return;
 
   // Dump the current SDNode, but don't end the line yet.

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 57e22e2..79109b7 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp

@@ -284,8 +284,8 @@
   /// for the target.
   ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
                                              CodeGenOpt::Level OptLevel) {
-    const TargetLowering *TLI = IS->getTargetLowering();
-    const TargetSubtargetInfo &ST = IS->TM.getSubtarget<TargetSubtargetInfo>();
+    const TargetLowering *TLI = IS->TLI;
+    const TargetSubtargetInfo &ST = IS->MF->getSubtarget();
 
     if (OptLevel == CodeGenOpt::None || ST.useMachineScheduler() ||
         TLI->getSchedulingPreference() == Sched::Source)
@@ -336,7 +336,7 @@
 SelectionDAGISel::SelectionDAGISel(TargetMachine &tm,
                                    CodeGenOpt::Level OL) :
   MachineFunctionPass(ID), TM(tm),
-  FuncInfo(new FunctionLoweringInfo(TM)),
+  FuncInfo(new FunctionLoweringInfo()),
   CurDAG(new SelectionDAG(tm, OL)),
   SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
   GFI(),
@@ -411,32 +411,32 @@
          "-fast-isel-abort requires -fast-isel");
 
   const Function &Fn = *mf.getFunction();
-  const TargetInstrInfo &TII = *TM.getInstrInfo();
-  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
-  const TargetLowering *TLI = TM.getTargetLowering();
-
   MF = &mf;
-  RegInfo = &MF->getRegInfo();
-  AA = &getAnalysis<AliasAnalysis>();
-  LibInfo = &getAnalysis<TargetLibraryInfo>();
-  GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
 
-  TargetSubtargetInfo &ST =
-    const_cast<TargetSubtargetInfo&>(TM.getSubtarget<TargetSubtargetInfo>());
-  ST.resetSubtargetFeatures(MF);
-  TM.resetTargetOptions(MF);
-
+  // Reset the target options before resetting the optimization
+  // level below.
+  // FIXME: This is a horrible hack and should be processed via
+  // codegen looking at the optimization level explicitly when
+  // it wants to look at it.
+  TM.resetTargetOptions(Fn);
   // Reset OptLevel to None for optnone functions.
   CodeGenOpt::Level NewOptLevel = OptLevel;
   if (Fn.hasFnAttribute(Attribute::OptimizeNone))
     NewOptLevel = CodeGenOpt::None;
   OptLevelChanger OLC(*this, NewOptLevel);
 
+  TII = MF->getSubtarget().getInstrInfo();
+  TLI = MF->getSubtarget().getTargetLowering();
+  RegInfo = &MF->getRegInfo();
+  AA = &getAnalysis<AliasAnalysis>();
+  LibInfo = &getAnalysis<TargetLibraryInfo>();
+  GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
+
   DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
   SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), this);
 
-  CurDAG->init(*MF, TLI);
+  CurDAG->init(*MF);
   FuncInfo->set(Fn, *MF, CurDAG);
 
   if (UseMBPI && OptLevel != CodeGenOpt::None)
@@ -454,7 +454,8 @@
   // copied into vregs, emit the copies into the top of the block before
   // emitting the code for the block.
   MachineBasicBlock *EntryMBB = MF->begin();
-  RegInfo->EmitLiveInCopies(EntryMBB, TRI, TII);
+  const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+  RegInfo->EmitLiveInCopies(EntryMBB, TRI, *TII);
 
   DenseMap<unsigned, unsigned> LiveInMap;
   if (!FuncInfo->ArgDbgValues.empty())
@@ -489,15 +490,14 @@
                        "- add if needed");
       MachineInstr *Def = RegInfo->getVRegDef(LDI->second);
       MachineBasicBlock::iterator InsertPos = Def;
-      const MDNode *Variable =
-        MI->getOperand(MI->getNumOperands()-1).getMetadata();
+      const MDNode *Variable = MI->getDebugVariable();
+      const MDNode *Expr = MI->getDebugExpression();
       bool IsIndirect = MI->isIndirectDebugValue();
       unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
       // Def is never a terminator here, so it is ok to increment InsertPos.
       BuildMI(*EntryMBB, ++InsertPos, MI->getDebugLoc(),
-              TII.get(TargetOpcode::DBG_VALUE),
-              IsIndirect,
-              LDI->second, Offset, Variable);
+              TII->get(TargetOpcode::DBG_VALUE), IsIndirect, LDI->second, Offset,
+              Variable, Expr);
 
       // If this vreg is directly copied into an exported register then
       // that COPY instructions also need DBG_VALUE, if it is the only
@@ -516,11 +516,9 @@
       }
       if (CopyUseMI) {
         MachineInstr *NewMI =
-          BuildMI(*MF, CopyUseMI->getDebugLoc(),
-                  TII.get(TargetOpcode::DBG_VALUE),
-                  IsIndirect,
-                  CopyUseMI->getOperand(0).getReg(),
-                  Offset, Variable);
+            BuildMI(*MF, CopyUseMI->getDebugLoc(),
+                    TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
+                    CopyUseMI->getOperand(0).getReg(), Offset, Variable, Expr);
         MachineBasicBlock::iterator Pos = CopyUseMI;
         EntryMBB->insertAfter(Pos, NewMI);
       }
@@ -534,7 +532,7 @@
       break;
 
     for (const auto &MI : MBB) {
-      const MCInstrDesc &MCID = TM.getInstrInfo()->get(MI.getOpcode());
+      const MCInstrDesc &MCID = TII->get(MI.getOpcode());
       if ((MCID.isCall() && !MCID.isReturn()) ||
           MI.isStackAligningInlineAsm()) {
         MFI->setHasCalls(true);
@@ -617,7 +615,7 @@
     SDNode *N = Worklist.pop_back_val();
 
     // If we've already seen this node, ignore it.
-    if (!VisitedNodes.insert(N))
+    if (!VisitedNodes.insert(N).second)
       continue;
 
     // Otherwise, add all chain operands to the worklist.
@@ -901,12 +899,11 @@
   // Assign the call site to the landing pad's begin label.
   MF->getMMI().setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
 
-  const MCInstrDesc &II = TM.getInstrInfo()->get(TargetOpcode::EH_LABEL);
+  const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL);
   BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
     .addSym(Label);
 
   // Mark exception register as live in.
-  const TargetLowering *TLI = getTargetLowering();
   const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy());
   if (unsigned Reg = TLI->getExceptionPointerRegister())
     FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
@@ -1042,7 +1039,7 @@
   // Initialize the Fast-ISel state, if needed.
   FastISel *FastIS = nullptr;
   if (TM.Options.EnableFastISel)
-    FastIS = getTargetLowering()->createFastISel(*FuncInfo, LibInfo);
+    FastIS = TLI->createFastISel(*FuncInfo, LibInfo);
 
   // Iterate over all basic blocks in the function.
   ReversePostOrderTraversal<const Function*> RPOT(&Fn);
@@ -1096,7 +1093,7 @@
         ++NumEntryBlocks;
 
         // Lower any arguments needed in this block if this is the entry block.
-        if (!FastIS->LowerArguments()) {
+        if (!FastIS->lowerArguments()) {
           // Fast isel failed to lower these arguments
           ++NumFastIselFailLowerArguments;
           if (EnableFastISelAbortArgs)
@@ -1134,7 +1131,7 @@
         FastIS->recomputeInsertPt();
 
         // Try to select the instruction with FastISel.
-        if (FastIS->SelectInstruction(Inst)) {
+        if (FastIS->selectInstruction(Inst)) {
           --NumFastIselRemaining;
           ++NumFastIselSuccess;
           // If fast isel succeeded, skip over all the folded instructions, and
@@ -1729,7 +1726,7 @@
 /// This function recursively traverses up the operand chain, ignoring
 /// certain nodes.
 static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
-                          SDNode *Root, SmallPtrSet<SDNode*, 16> &Visited,
+                          SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
                           bool IgnoreChains) {
   // The NodeID's are given uniques ID's where a node ID is guaranteed to be
   // greater than all of its (recursive) operands.  If we scan to a point where
@@ -1744,7 +1741,7 @@
 
   // Don't revisit nodes if we already scanned it and didn't fail, we know we
   // won't fail if we scan it again.
-  if (!Visited.insert(Use))
+  if (!Visited.insert(Use).second)
     return false;
 
   for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
@@ -1861,8 +1858,8 @@
   SDLoc dl(Op);
   MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(0));
   const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
-  unsigned Reg = getTargetLowering()->getRegisterByName(
-                 RegStr->getString().data(), Op->getValueType(0));
+  unsigned Reg =
+      TLI->getRegisterByName(RegStr->getString().data(), Op->getValueType(0));
   SDValue New = CurDAG->getCopyFromReg(
                         CurDAG->getEntryNode(), dl, Reg, Op->getValueType(0));
   New->setNodeId(-1);
@@ -1874,8 +1871,8 @@
   SDLoc dl(Op);
   MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
   const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
-  unsigned Reg = getTargetLowering()->getRegisterByName(
-                 RegStr->getString().data(), Op->getOperand(2).getValueType());
+  unsigned Reg = TLI->getRegisterByName(RegStr->getString().data(),
+                                        Op->getOperand(2).getValueType());
   SDValue New = CurDAG->getCopyToReg(
                         CurDAG->getEntryNode(), dl, Reg, Op->getOperand(2));
   New->setNodeId(-1);
@@ -2375,7 +2372,7 @@
     Result = !::CheckOpcode(Table, Index, N.getNode());
     return Index;
   case SelectionDAGISel::OPC_CheckType:
-    Result = !::CheckType(Table, Index, N, SDISel.getTargetLowering());
+    Result = !::CheckType(Table, Index, N, SDISel.TLI);
     return Index;
   case SelectionDAGISel::OPC_CheckChild0Type:
   case SelectionDAGISel::OPC_CheckChild1Type:
@@ -2385,14 +2382,15 @@
   case SelectionDAGISel::OPC_CheckChild5Type:
   case SelectionDAGISel::OPC_CheckChild6Type:
   case SelectionDAGISel::OPC_CheckChild7Type:
-    Result = !::CheckChildType(Table, Index, N, SDISel.getTargetLowering(),
-                        Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Type);
+    Result = !::CheckChildType(Table, Index, N, SDISel.TLI,
+                               Table[Index - 1] -
+                                   SelectionDAGISel::OPC_CheckChild0Type);
     return Index;
   case SelectionDAGISel::OPC_CheckCondCode:
     Result = !::CheckCondCode(Table, Index, N);
     return Index;
   case SelectionDAGISel::OPC_CheckValueType:
-    Result = !::CheckValueType(Table, Index, N, SDISel.getTargetLowering());
+    Result = !::CheckValueType(Table, Index, N, SDISel.TLI);
     return Index;
   case SelectionDAGISel::OPC_CheckInteger:
     Result = !::CheckInteger(Table, Index, N);
@@ -2436,6 +2434,42 @@
   bool HasChainNodesMatched, HasGlueResultNodesMatched;
 };
 
+/// \brief A DAG update listener to keep the matching state
+/// (i.e. RecordedNodes and MatchScopes) up to date if the target is allowed to
+/// change the DAG while matching.  The X86 addressing mode matcher is an
+/// example of this.
+class MatchStateUpdater : public SelectionDAG::DAGUpdateListener
+{
+      SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes;
+      SmallVectorImpl<MatchScope> &MatchScopes;
+public:
+  MatchStateUpdater(SelectionDAG &DAG,
+                    SmallVectorImpl<std::pair<SDValue, SDNode*> > &RN,
+                    SmallVectorImpl<MatchScope> &MS) :
+    SelectionDAG::DAGUpdateListener(DAG),
+    RecordedNodes(RN), MatchScopes(MS) { }
+
+  void NodeDeleted(SDNode *N, SDNode *E) {
+    // Some early-returns here to avoid the search if we deleted the node or
+    // if the update comes from MorphNodeTo (MorphNodeTo is the last thing we
+    // do, so it's unnecessary to update matching state at that point).
+    // Neither of these can occur currently because we only install this
+    // update listener while matching complex patterns.
+    if (!E || E->isMachineOpcode())
+      return;
+    // Performing linear search here does not matter because we almost never
+    // run this code.  You'd have to have a CSE during complex pattern
+    // matching.
+    for (auto &I : RecordedNodes)
+      if (I.first.getNode() == N)
+        I.first.setNode(E);
+
+    for (auto &I : MatchScopes)
+      for (auto &J : I.NodeStack)
+        if (J.getNode() == N)
+          J.setNode(E);
+  }
+};
 }
 
 SDNode *SelectionDAGISel::
@@ -2449,8 +2483,6 @@
   case ISD::BasicBlock:
   case ISD::Register:
   case ISD::RegisterMask:
-  //case ISD::VALUETYPE:
-  //case ISD::CONDCODE:
   case ISD::HANDLENODE:
   case ISD::MDNODE_SDNODE:
   case ISD::TargetConstant:
@@ -2692,6 +2724,14 @@
       unsigned CPNum = MatcherTable[MatcherIndex++];
       unsigned RecNo = MatcherTable[MatcherIndex++];
       assert(RecNo < RecordedNodes.size() && "Invalid CheckComplexPat");
+
+      // If target can modify DAG during matching, keep the matching state
+      // consistent.
+      std::unique_ptr<MatchStateUpdater> MSU;
+      if (ComplexPatternFuncMutatesDAG())
+        MSU.reset(new MatchStateUpdater(*CurDAG, RecordedNodes,
+                                        MatchScopes));
+
       if (!CheckComplexPattern(NodeToMatch, RecordedNodes[RecNo].second,
                                RecordedNodes[RecNo].first, CPNum,
                                RecordedNodes))
@@ -2703,7 +2743,7 @@
       continue;
 
     case OPC_CheckType:
-      if (!::CheckType(MatcherTable, MatcherIndex, N, getTargetLowering()))
+      if (!::CheckType(MatcherTable, MatcherIndex, N, TLI))
         break;
       continue;
 
@@ -2751,7 +2791,7 @@
 
         MVT CaseVT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
         if (CaseVT == MVT::iPTR)
-          CaseVT = getTargetLowering()->getPointerTy();
+          CaseVT = TLI->getPointerTy();
 
         // If the VT matches, then we will execute this case.
         if (CurNodeVT == CaseVT)
@@ -2773,7 +2813,7 @@
     case OPC_CheckChild2Type: case OPC_CheckChild3Type:
     case OPC_CheckChild4Type: case OPC_CheckChild5Type:
     case OPC_CheckChild6Type: case OPC_CheckChild7Type:
-      if (!::CheckChildType(MatcherTable, MatcherIndex, N, getTargetLowering(),
+      if (!::CheckChildType(MatcherTable, MatcherIndex, N, TLI,
                             Opcode-OPC_CheckChild0Type))
         break;
       continue;
@@ -2781,7 +2821,7 @@
       if (!::CheckCondCode(MatcherTable, MatcherIndex, N)) break;
       continue;
     case OPC_CheckValueType:
-      if (!::CheckValueType(MatcherTable, MatcherIndex, N, getTargetLowering()))
+      if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI))
         break;
       continue;
     case OPC_CheckInteger:
@@ -2980,7 +3020,8 @@
       for (unsigned i = 0; i != NumVTs; ++i) {
         MVT::SimpleValueType VT =
           (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
-        if (VT == MVT::iPTR) VT = getTargetLowering()->getPointerTy().SimpleTy;
+        if (VT == MVT::iPTR)
+          VT = TLI->getPointerTy().SimpleTy;
         VTs.push_back(VT);
       }
 
@@ -3076,7 +3117,7 @@
       if (EmitNodeInfo & OPFL_MemRefs) {
         // Only attach load or store memory operands if the generated
         // instruction may load or store.
-        const MCInstrDesc &MCID = TM.getInstrInfo()->get(TargetOpc);
+        const MCInstrDesc &MCID = TII->get(TargetOpc);
         bool mayLoad = MCID.mayLoad();
         bool mayStore = MCID.mayStore();
 

diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 42372a2..9aef5ed 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp

@@ -31,13 +31,13 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <cctype>
 using namespace llvm;
 
-/// NOTE: The constructor takes ownership of TLOF.
-TargetLowering::TargetLowering(const TargetMachine &tm,
-                               const TargetLoweringObjectFile *tlof)
-  : TargetLoweringBase(tm, tlof) {}
+/// NOTE: The TargetMachine owns TLOF.
+TargetLowering::TargetLowering(const TargetMachine &tm)
+  : TargetLoweringBase(tm) {}
 
 const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
   return nullptr;
@@ -2177,7 +2177,8 @@
     std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr));
 
   // Figure out which register class contains this reg.
-  const TargetRegisterInfo *RI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *RI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(),
        E = RI->regclass_end(); RCI != E; ++RCI) {
     const TargetRegisterClass *RC = *RCI;
@@ -2239,14 +2240,11 @@
 
   // Do a prepass over the constraints, canonicalizing them, and building up the
   // ConstraintOperands list.
-  InlineAsm::ConstraintInfoVector
-    ConstraintInfos = IA->ParseConstraints();
-
   unsigned ArgNo = 0;   // ArgNo - The argument of the CallInst.
   unsigned ResNo = 0;   // ResNo - The result number of the next output.
 
-  for (unsigned i = 0, e = ConstraintInfos.size(); i != e; ++i) {
-    ConstraintOperands.push_back(AsmOperandInfo(ConstraintInfos[i]));
+  for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+    ConstraintOperands.emplace_back(std::move(CI));
     AsmOperandInfo &OpInfo = ConstraintOperands.back();
 
     // Update multiple alternative constraint count.
@@ -2325,7 +2323,7 @@
   }
 
   // If we have multiple alternative constraints, select the best alternative.
-  if (ConstraintInfos.size()) {
+  if (ConstraintOperands.size()) {
     if (maCount) {
       unsigned bestMAIndex = 0;
       int bestWeight = -1;
@@ -2641,11 +2639,13 @@
 
 /// \brief Given an ISD::SDIV node expressing a divide by constant,
 /// return a DAG expression to select that will generate the same value by
-/// multiplying by a magic number.  See:
-/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+/// multiplying by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
 SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor,
                                   SelectionDAG &DAG, bool IsAfterLegalization,
                                   std::vector<SDNode *> *Created) const {
+  assert(Created && "No vector to hold sdiv ops.");
+
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
@@ -2673,38 +2673,36 @@
   // If d > 0 and m < 0, add the numerator
   if (Divisor.isStrictlyPositive() && magics.m.isNegative()) {
     Q = DAG.getNode(ISD::ADD, dl, VT, Q, N->getOperand(0));
-    if (Created)
-      Created->push_back(Q.getNode());
+    Created->push_back(Q.getNode());
   }
   // If d < 0 and m > 0, subtract the numerator.
   if (Divisor.isNegative() && magics.m.isStrictlyPositive()) {
     Q = DAG.getNode(ISD::SUB, dl, VT, Q, N->getOperand(0));
-    if (Created)
-      Created->push_back(Q.getNode());
+    Created->push_back(Q.getNode());
   }
   // Shift right algebraic if shift value is nonzero
   if (magics.s > 0) {
     Q = DAG.getNode(ISD::SRA, dl, VT, Q,
                  DAG.getConstant(magics.s, getShiftAmountTy(Q.getValueType())));
-    if (Created)
-      Created->push_back(Q.getNode());
+    Created->push_back(Q.getNode());
   }
   // Extract the sign bit and add it to the quotient
   SDValue T = DAG.getNode(ISD::SRL, dl, VT, Q,
                           DAG.getConstant(VT.getScalarSizeInBits() - 1,
                                           getShiftAmountTy(Q.getValueType())));
-  if (Created)
-    Created->push_back(T.getNode());
+  Created->push_back(T.getNode());
   return DAG.getNode(ISD::ADD, dl, VT, Q, T);
 }
 
 /// \brief Given an ISD::UDIV node expressing a divide by constant,
 /// return a DAG expression to select that will generate the same value by
-/// multiplying by a magic number.  See:
-/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+/// multiplying by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
 SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor,
                                   SelectionDAG &DAG, bool IsAfterLegalization,
                                   std::vector<SDNode *> *Created) const {
+  assert(Created && "No vector to hold udiv ops.");
+
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
@@ -2725,8 +2723,7 @@
     unsigned Shift = Divisor.countTrailingZeros();
     Q = DAG.getNode(ISD::SRL, dl, VT, Q,
                     DAG.getConstant(Shift, getShiftAmountTy(Q.getValueType())));
-    if (Created)
-      Created->push_back(Q.getNode());
+    Created->push_back(Q.getNode());
 
     // Get magic number for the shifted divisor.
     magics = Divisor.lshr(Shift).magicu(Shift);
@@ -2744,8 +2741,8 @@
                             DAG.getConstant(magics.m, VT)).getNode(), 1);
   else
     return SDValue();       // No mulhu or equvialent
-  if (Created)
-    Created->push_back(Q.getNode());
+
+  Created->push_back(Q.getNode());
 
   if (magics.a == 0) {
     assert(magics.s < Divisor.getBitWidth() &&
@@ -2754,15 +2751,12 @@
                  DAG.getConstant(magics.s, getShiftAmountTy(Q.getValueType())));
   } else {
     SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
-    if (Created)
-      Created->push_back(NPQ.getNode());
+    Created->push_back(NPQ.getNode());
     NPQ = DAG.getNode(ISD::SRL, dl, VT, NPQ,
                       DAG.getConstant(1, getShiftAmountTy(NPQ.getValueType())));
-    if (Created)
-      Created->push_back(NPQ.getNode());
+    Created->push_back(NPQ.getNode());
     NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
-    if (Created)
-      Created->push_back(NPQ.getNode());
+    Created->push_back(NPQ.getNode());
     return DAG.getNode(ISD::SRL, dl, VT, NPQ,
              DAG.getConstant(magics.s-1, getShiftAmountTy(NPQ.getValueType())));
   }
@@ -2785,7 +2779,7 @@
 
 bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
                                SelectionDAG &DAG, SDValue LL, SDValue LH,
-			       SDValue RL, SDValue RH) const {
+                               SDValue RL, SDValue RH) const {
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
@@ -2818,8 +2812,8 @@
       // The inputs are both zero-extended.
       if (HasUMUL_LOHI) {
         // We can emit a umul_lohi.
-        Lo = DAG.getNode(ISD::UMUL_LOHI, dl,
-	                 DAG.getVTList(HiLoVT, HiLoVT), LL, RL);
+        Lo = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(HiLoVT, HiLoVT), LL,
+                         RL);
         Hi = SDValue(Lo.getNode(), 1);
         return true;
       }
@@ -2834,8 +2828,8 @@
       // The input values are both sign-extended.
       if (HasSMUL_LOHI) {
         // We can emit a smul_lohi.
-        Lo = DAG.getNode(ISD::SMUL_LOHI, dl,
-	                 DAG.getVTList(HiLoVT, HiLoVT), LL, RL);
+        Lo = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(HiLoVT, HiLoVT), LL,
+                         RL);
         Hi = SDValue(Lo.getNode(), 1);
         return true;
       }
@@ -2885,3 +2879,65 @@
   }
   return false;
 }
+
+bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
+                               SelectionDAG &DAG) const {
+  EVT VT = Node->getOperand(0).getValueType(); // source (floating-point) type
+  EVT NVT = Node->getValueType(0);             // result (integer) type
+  SDLoc dl(SDValue(Node, 0));
+  // Emits integer-only nodes for FP_TO_SINT into Result; false if unsupported.
+  // FIXME: Only f32 to i64 conversions are supported.
+  if (VT != MVT::f32 || NVT != MVT::i64)
+    return false; // Result is left untouched on this path.
+
+  // Expand f32 -> i64 conversion; NaN/Inf/out-of-range get no special casing.
+  // This algorithm comes from compiler-rt's implementation of fixsfdi:
+  // https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/fixsfdi.c
+  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(),
+                                VT.getSizeInBits());
+  SDValue ExponentMask = DAG.getConstant(0x7F800000, IntVT); // bits 30..23
+  SDValue ExponentLoBit = DAG.getConstant(23, IntVT); // lowest exponent bit
+  SDValue Bias = DAG.getConstant(127, IntVT); // f32 exponent bias
+  SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()),
+                                     IntVT);
+  SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, IntVT);
+  SDValue MantissaMask = DAG.getConstant(0x007FFFFF, IntVT); // bits 22..0
+
+  SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0));
+  // Exponent = ((Bits & ExponentMask) >> 23) - 127, i.e. the unbiased exponent.
+  SDValue ExponentBits = DAG.getNode(ISD::SRL, dl, IntVT,
+      DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
+      DAG.getZExtOrTrunc(ExponentLoBit, dl, getShiftAmountTy(IntVT)));
+  SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
+  // Sign: 0 or all-ones (SRA of the isolated sign bit), sign-extended to NVT.
+  SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
+      DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
+      DAG.getZExtOrTrunc(SignLowBit, dl, getShiftAmountTy(IntVT)));
+  Sign = DAG.getSExtOrTrunc(Sign, dl, NVT);
+  // R: the 23 mantissa bits with the implicit leading one ORed in at bit 23.
+  SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
+      DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
+      DAG.getConstant(0x00800000, IntVT));
+
+  R = DAG.getZExtOrTrunc(R, dl, NVT);
+
+  // Scale by 2^(Exponent-23): shift left if Exponent > 23, otherwise right.
+  R = DAG.getSelectCC(dl, Exponent, ExponentLoBit,
+     DAG.getNode(ISD::SHL, dl, NVT, R,
+                 DAG.getZExtOrTrunc(
+                    DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
+                    dl, getShiftAmountTy(IntVT))),
+     DAG.getNode(ISD::SRL, dl, NVT, R,
+                 DAG.getZExtOrTrunc(
+                    DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
+                    dl, getShiftAmountTy(IntVT))),
+     ISD::SETGT);
+  // (R ^ Sign) - Sign negates R when Sign is all-ones, else leaves it as is.
+  SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT,
+      DAG.getNode(ISD::XOR, dl, NVT, R, Sign),
+      Sign);
+  // A negative unbiased exponent means |x| < 1, so the result is 0.
+  Result = DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, IntVT),
+      DAG.getConstant(0, NVT), Ret, ISD::SETLT);
+  return true;
+}

diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp
index f7c64da..0be00f0 100644
--- a/lib/CodeGen/ShadowStackGC.cpp
+++ b/lib/CodeGen/ShadowStackGC.cpp

@@ -144,7 +144,7 @@
         LLVMContext &C = F.getContext();
         BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
         Type *ExnTy = StructType::get(Type::getInt8PtrTy(C),
-                                      Type::getInt32Ty(C), NULL);
+                                      Type::getInt32Ty(C), nullptr);
         Constant *PersFn =
           F.getParent()->
           getOrInsertFunction("__gcc_personality_v0",

diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index d2f3955..7fd8107 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp

@@ -31,6 +31,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -98,7 +99,7 @@
                                       VoidPtrTy, // __personality
                                       VoidPtrTy, // __lsda
                                       ArrayType::get(VoidPtrTy, 5), // __jbuf
-                                      NULL);
+                                      nullptr);
   RegisterFn = M.getOrInsertFunction(
       "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
       PointerType::getUnqual(FunctionContextTy), (Type *)nullptr);
@@ -138,8 +139,8 @@
 /// MarkBlocksLiveIn - Insert BB and all of its predecessors into LiveBBs until
 /// we reach blocks we've already seen.
 static void MarkBlocksLiveIn(BasicBlock *BB,
-                             SmallPtrSet<BasicBlock *, 64> &LiveBBs) {
-  if (!LiveBBs.insert(BB))
+                             SmallPtrSetImpl<BasicBlock *> &LiveBBs) {
+  if (!LiveBBs.insert(BB).second)
     return; // already been here.
 
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
@@ -190,7 +191,7 @@
   // Create an alloca for the incoming jump buffer ptr and the new jump buffer
   // that needs to be restored on all exits from the function. This is an alloca
   // because the value needs to be added to the global context list.
-  const TargetLowering *TLI = TM->getTargetLowering();
+  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
   unsigned Align =
       TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
   FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context",
@@ -249,34 +250,16 @@
        ++AI) {
     Type *Ty = AI->getType();
 
-    // Aggregate types can't be cast, but are legal argument types, so we have
-    // to handle them differently. We use an extract/insert pair as a
-    // lightweight method to achieve the same goal.
-    if (isa<StructType>(Ty) || isa<ArrayType>(Ty)) {
-      Instruction *EI = ExtractValueInst::Create(AI, 0, "", AfterAllocaInsPt);
-      Instruction *NI = InsertValueInst::Create(AI, EI, 0);
-      NI->insertAfter(EI);
-      AI->replaceAllUsesWith(NI);
+    // Use 'select i1 true, %arg, undef' to simulate a 'no-op' instruction.
+    Value *TrueValue = ConstantInt::getTrue(F.getContext());
+    Value *UndefValue = UndefValue::get(Ty);
+    Instruction *SI = SelectInst::Create(TrueValue, AI, UndefValue,
+                                         AI->getName() + ".tmp",
+                                         AfterAllocaInsPt);
+    AI->replaceAllUsesWith(SI);
 
-      // Set the operand of the instructions back to the AllocaInst.
-      EI->setOperand(0, AI);
-      NI->setOperand(0, AI);
-    } else {
-      // This is always a no-op cast because we're casting AI to AI->getType()
-      // so src and destination types are identical. BitCast is the only
-      // possibility.
-      CastInst *NC = new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
-                                     AfterAllocaInsPt);
-      AI->replaceAllUsesWith(NC);
-
-      // Set the operand of the cast instruction back to the AllocaInst.
-      // Normally it's forbidden to replace a CastInst's operand because it
-      // could cause the opcode to reflect an illegal conversion. However, we're
-      // replacing it here with the same value it was constructed with.  We do
-      // this because the above replaceAllUsesWith() clobbered the operand, but
-      // we want this one to remain.
-      NC->setOperand(0, AI);
-    }
+    // Reset the operand, because it was clobbered by the RAUW above.
+    SI->setOperand(1, AI);
   }
 }
 
@@ -368,10 +351,8 @@
       continue;
 
     // Demote the PHIs to the stack.
-    for (SmallPtrSet<PHINode *, 8>::iterator I = PHIsToDemote.begin(),
-                                             E = PHIsToDemote.end();
-         I != E; ++I)
-      DemotePHIToStack(*I);
+    for (PHINode *PN : PHIsToDemote)
+      DemotePHIToStack(PN);
 
     // Move the landingpad instruction back to the top of the landing pad block.
     LPI->moveBefore(UnwindBlock->begin());

diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp
index 24e94d1..97a5424 100644
--- a/lib/CodeGen/SpillPlacement.cpp
+++ b/lib/CodeGen/SpillPlacement.cpp

@@ -37,6 +37,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/ManagedStatic.h"
 
 using namespace llvm;
 
@@ -60,27 +61,6 @@
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-namespace {
-static BlockFrequency Threshold;
-}
-
-/// Decision threshold. A node gets the output value 0 if the weighted sum of
-/// its inputs falls in the open interval (-Threshold;Threshold).
-static BlockFrequency getThreshold() { return Threshold; }
-
-/// \brief Set the threshold for a given entry frequency.
-///
-/// Set the threshold relative to \c Entry.  Since the threshold is used as a
-/// bound on the open interval (-Threshold;Threshold), 1 is the minimum
-/// threshold.
-static void setThreshold(const BlockFrequency &Entry) {
-  // Apparently 2 is a good threshold when Entry==2^14, but we need to scale
-  // it.  Divide by 2^13, rounding as appropriate.
-  uint64_t Freq = Entry.getFrequency();
-  uint64_t Scaled = (Freq >> 13) + bool(Freq & (1 << 12));
-  Threshold = std::max(UINT64_C(1), Scaled);
-}
-
 /// Node - Each edge bundle corresponds to a Hopfield node.
 ///
 /// The node contains precomputed frequency data that only depends on the CFG,
@@ -126,9 +106,9 @@
 
   /// clear - Reset per-query data, but preserve frequencies that only depend on
   /// the CFG.
-  void clear() {
+  void clear(const BlockFrequency &Threshold) {
     BiasN = BiasP = Value = 0;
-    SumLinkWeights = getThreshold();
+    SumLinkWeights = Threshold;
     Links.clear();
   }
 
@@ -166,7 +146,7 @@
 
   /// update - Recompute Value from Bias and Links. Return true when node
   /// preference changes.
-  bool update(const Node nodes[]) {
+  bool update(const Node nodes[], const BlockFrequency &Threshold) {
     // Compute the weighted sum of inputs.
     BlockFrequency SumN = BiasN;
     BlockFrequency SumP = BiasP;
@@ -186,9 +166,9 @@
     //  2. It helps tame rounding errors when the links nominally sum to 0.
     //
     bool Before = preferReg();
-    if (SumN >= SumP + getThreshold())
+    if (SumN >= SumP + Threshold)
       Value = -1;
-    else if (SumP >= SumN + getThreshold())
+    else if (SumP >= SumN + Threshold)
       Value = 1;
     else
       Value = 0;
@@ -227,7 +207,7 @@
   if (ActiveNodes->test(n))
     return;
   ActiveNodes->set(n);
-  nodes[n].clear();
+  nodes[n].clear(Threshold);
 
   // Very large bundles usually come from big switches, indirect branches,
   // landing pads, or loops with many 'continue' statements. It is difficult to
@@ -244,6 +224,18 @@
   }
 }
 
+/// \brief Set the threshold for a given entry frequency.
+///
+/// Set the threshold relative to \c Entry.  Since the threshold is used as a
+/// bound on the open interval (-Threshold;Threshold), 1 is the minimum
+/// threshold.
+void SpillPlacement::setThreshold(const BlockFrequency &Entry) {
+  // Apparently 2 is a good threshold when Entry==2^14, but we need to scale
+  // it.  Divide by 2^13, rounding up when bit 12 (the top dropped bit) is set.
+  uint64_t Freq = Entry.getFrequency();
+  uint64_t Scaled = (Freq >> 13) + bool(Freq & (1 << 12));
+  Threshold = std::max(UINT64_C(1), Scaled); // clamp: never below 1
+}
 
 /// addConstraints - Compute node biases and weights from a set of constraints.
 /// Set a bit in NodeMask for each active node.
@@ -310,7 +302,7 @@
   Linked.clear();
   RecentPositive.clear();
   for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) {
-    nodes[n].update(nodes);
+    nodes[n].update(nodes, Threshold);
     // A node that must spill, or a node without any links is not going to
     // change its value ever again, so exclude it from iterations.
     if (nodes[n].mustSpill())
@@ -330,7 +322,7 @@
   // First update the recently positive nodes. They have likely received new
   // negative bias that will turn them off.
   while (!RecentPositive.empty())
-    nodes[RecentPositive.pop_back_val()].update(nodes);
+    nodes[RecentPositive.pop_back_val()].update(nodes, Threshold);
 
   if (Linked.empty())
     return;
@@ -349,7 +341,7 @@
            iteration == 0 ? Linked.rbegin() : std::next(Linked.rbegin()),
            E = Linked.rend(); I != E; ++I) {
       unsigned n = *I;
-      if (nodes[n].update(nodes)) {
+      if (nodes[n].update(nodes, Threshold)) {
         Changed = true;
         if (nodes[n].preferReg())
           RecentPositive.push_back(n);
@@ -363,7 +355,7 @@
     for (SmallVectorImpl<unsigned>::const_iterator I =
            std::next(Linked.begin()), E = Linked.end(); I != E; ++I) {
       unsigned n = *I;
-      if (nodes[n].update(nodes)) {
+      if (nodes[n].update(nodes, Threshold)) {
         Changed = true;
         if (nodes[n].preferReg())
           RecentPositive.push_back(n);

diff --git a/lib/CodeGen/SpillPlacement.h b/lib/CodeGen/SpillPlacement.h
index 43fc7f5..622361e 100644
--- a/lib/CodeGen/SpillPlacement.h
+++ b/lib/CodeGen/SpillPlacement.h

@@ -24,8 +24,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_SPILLPLACEMENT_H
-#define LLVM_CODEGEN_SPILLPLACEMENT_H
+#ifndef LLVM_LIB_CODEGEN_SPILLPLACEMENT_H
+#define LLVM_LIB_CODEGEN_SPILLPLACEMENT_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
@@ -40,7 +40,7 @@
 class MachineLoopInfo;
 class MachineBlockFrequencyInfo;
 
-class SpillPlacement  : public MachineFunctionPass {
+class SpillPlacement : public MachineFunctionPass {
   struct Node;
   const MachineFunction *MF;
   const EdgeBundles *bundles;
@@ -60,7 +60,11 @@
   SmallVector<unsigned, 8> RecentPositive;
 
   // Block frequencies are computed once. Indexed by block number.
-  SmallVector<BlockFrequency, 4> BlockFrequencies;
+  SmallVector<BlockFrequency, 8> BlockFrequencies;
+
+  /// Decision threshold. A node gets the output value 0 if the weighted sum of
+  /// its inputs falls in the open interval (-Threshold;Threshold).
+  BlockFrequency Threshold;
 
 public:
   static char ID; // Pass identification, replacement for typeid.
@@ -152,6 +156,7 @@
   void releaseMemory() override;
 
   void activate(unsigned);
+  void setThreshold(const BlockFrequency &Entry);
 };
 
 } // end namespace llvm

diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
deleted file mode 100644
index 0649448..0000000
--- a/lib/CodeGen/Spiller.cpp
+++ /dev/null

@@ -1,184 +0,0 @@
-//===-- llvm/CodeGen/Spiller.cpp -  Spiller -------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Spiller.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/LiveRangeEdit.h"
-#include "llvm/CodeGen/LiveStackAnalysis.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/VirtRegMap.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "spiller"
-
-namespace {
-  enum SpillerName { trivial, inline_ };
-}
-
-static cl::opt<SpillerName>
-spillerOpt("spiller",
-           cl::desc("Spiller to use: (default: standard)"),
-           cl::Prefix,
-           cl::values(clEnumVal(trivial,   "trivial spiller"),
-                      clEnumValN(inline_,  "inline", "inline spiller"),
-                      clEnumValEnd),
-           cl::init(trivial));
-
-// Spiller virtual destructor implementation.
-Spiller::~Spiller() {}
-
-namespace {
-
-/// Utility class for spillers.
-class SpillerBase : public Spiller {
-protected:
-  MachineFunctionPass *pass;
-  MachineFunction *mf;
-  VirtRegMap *vrm;
-  LiveIntervals *lis;
-  MachineFrameInfo *mfi;
-  MachineRegisterInfo *mri;
-  const TargetInstrInfo *tii;
-  const TargetRegisterInfo *tri;
-
-  /// Construct a spiller base.
-  SpillerBase(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm)
-    : pass(&pass), mf(&mf), vrm(&vrm)
-  {
-    lis = &pass.getAnalysis<LiveIntervals>();
-    mfi = mf.getFrameInfo();
-    mri = &mf.getRegInfo();
-    tii = mf.getTarget().getInstrInfo();
-    tri = mf.getTarget().getRegisterInfo();
-  }
-
-  /// Add spill ranges for every use/def of the live interval, inserting loads
-  /// immediately before each use, and stores after each def. No folding or
-  /// remat is attempted.
-  void trivialSpillEverywhere(LiveRangeEdit& LRE) {
-    LiveInterval* li = &LRE.getParent();
-
-    DEBUG(dbgs() << "Spilling everywhere " << *li << "\n");
-
-    assert(li->weight != llvm::huge_valf &&
-           "Attempting to spill already spilled value.");
-
-    assert(!TargetRegisterInfo::isStackSlot(li->reg) &&
-           "Trying to spill a stack slot.");
-
-    DEBUG(dbgs() << "Trivial spill everywhere of reg" << li->reg << "\n");
-
-    const TargetRegisterClass *trc = mri->getRegClass(li->reg);
-    unsigned ss = vrm->assignVirt2StackSlot(li->reg);
-
-    // Iterate over reg uses/defs.
-    for (MachineRegisterInfo::reg_instr_iterator
-         regItr = mri->reg_instr_begin(li->reg);
-         regItr != mri->reg_instr_end();) {
-
-      // Grab the use/def instr.
-      MachineInstr *mi = &*regItr;
-
-      DEBUG(dbgs() << "  Processing " << *mi);
-
-      // Step regItr to the next use/def instr.
-      ++regItr;
-
-      // Collect uses & defs for this instr.
-      SmallVector<unsigned, 2> indices;
-      bool hasUse = false;
-      bool hasDef = false;
-      for (unsigned i = 0; i != mi->getNumOperands(); ++i) {
-        MachineOperand &op = mi->getOperand(i);
-        if (!op.isReg() || op.getReg() != li->reg)
-          continue;
-        hasUse |= mi->getOperand(i).isUse();
-        hasDef |= mi->getOperand(i).isDef();
-        indices.push_back(i);
-      }
-
-      // Create a new virtual register for the load and/or store.
-      unsigned NewVReg = LRE.create();
-
-      // Update the reg operands & kill flags.
-      for (unsigned i = 0; i < indices.size(); ++i) {
-        unsigned mopIdx = indices[i];
-        MachineOperand &mop = mi->getOperand(mopIdx);
-        mop.setReg(NewVReg);
-        if (mop.isUse() && !mi->isRegTiedToDefOperand(mopIdx)) {
-          mop.setIsKill(true);
-        }
-      }
-      assert(hasUse || hasDef);
-
-      // Insert reload if necessary.
-      MachineBasicBlock::iterator miItr(mi);
-      if (hasUse) {
-        MachineInstrSpan MIS(miItr);
-
-        tii->loadRegFromStackSlot(*mi->getParent(), miItr, NewVReg, ss, trc,
-                                  tri);
-        lis->InsertMachineInstrRangeInMaps(MIS.begin(), miItr);
-      }
-
-      // Insert store if necessary.
-      if (hasDef) {
-        MachineInstrSpan MIS(miItr);
-
-        tii->storeRegToStackSlot(*mi->getParent(), std::next(miItr), NewVReg,
-                                 true, ss, trc, tri);
-        lis->InsertMachineInstrRangeInMaps(std::next(miItr), MIS.end());
-      }
-    }
-  }
-};
-
-} // end anonymous namespace
-
-namespace {
-
-/// Spills any live range using the spill-everywhere method with no attempt at
-/// folding.
-class TrivialSpiller : public SpillerBase {
-public:
-
-  TrivialSpiller(MachineFunctionPass &pass, MachineFunction &mf,
-                 VirtRegMap &vrm)
-    : SpillerBase(pass, mf, vrm) {}
-
-  void spill(LiveRangeEdit &LRE) override {
-    // Ignore spillIs - we don't use it.
-    trivialSpillEverywhere(LRE);
-  }
-};
-
-} // end anonymous namespace
-
-void Spiller::anchor() { }
-
-llvm::Spiller* llvm::createSpiller(MachineFunctionPass &pass,
-                                   MachineFunction &mf,
-                                   VirtRegMap &vrm) {
-  switch (spillerOpt) {
-  case trivial: return new TrivialSpiller(pass, mf, vrm);
-  case inline_: return createInlineSpiller(pass, mf, vrm);
-  }
-  llvm_unreachable("Invalid spiller optimization");
-}

diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h
index b7d5bea..08f99ec 100644
--- a/lib/CodeGen/Spiller.h
+++ b/lib/CodeGen/Spiller.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_SPILLER_H
-#define LLVM_CODEGEN_SPILLER_H
+#ifndef LLVM_LIB_CODEGEN_SPILLER_H
+#define LLVM_LIB_CODEGEN_SPILLER_H
 
 namespace llvm {
 
@@ -31,11 +31,6 @@
 
   };
 
-  /// Create and return a spiller object, as specified on the command line.
-  Spiller* createSpiller(MachineFunctionPass &pass,
-                         MachineFunction &mf,
-                         VirtRegMap &vrm);
-
   /// Create and return a spiller that will insert spill code directly instead
   /// of deferring though VirtRegMap.
   Spiller *createInlineSpiller(MachineFunctionPass &pass,

diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 7d4f568..ea7b914 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp

@@ -40,16 +40,11 @@
 //                                 Split Analysis
 //===----------------------------------------------------------------------===//
 
-SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm,
-                             const LiveIntervals &lis,
+SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
                              const MachineLoopInfo &mli)
-  : MF(vrm.getMachineFunction()),
-    VRM(vrm),
-    LIS(lis),
-    Loops(mli),
-    TII(*MF.getTarget().getInstrInfo()),
-    CurLI(nullptr),
-    LastSplitPoint(MF.getNumBlockIDs()) {}
+    : MF(vrm.getMachineFunction()), VRM(vrm), LIS(lis), Loops(mli),
+      TII(*MF.getSubtarget().getInstrInfo()), CurLI(nullptr),
+      LastSplitPoint(MF.getNumBlockIDs()) {}
 
 void SplitAnalysis::clear() {
   UseSlots.clear();
@@ -321,22 +316,14 @@
 //===----------------------------------------------------------------------===//
 
 /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
-SplitEditor::SplitEditor(SplitAnalysis &sa,
-                         LiveIntervals &lis,
-                         VirtRegMap &vrm,
+SplitEditor::SplitEditor(SplitAnalysis &sa, LiveIntervals &lis, VirtRegMap &vrm,
                          MachineDominatorTree &mdt,
                          MachineBlockFrequencyInfo &mbfi)
-  : SA(sa), LIS(lis), VRM(vrm),
-    MRI(vrm.getMachineFunction().getRegInfo()),
-    MDT(mdt),
-    TII(*vrm.getMachineFunction().getTarget().getInstrInfo()),
-    TRI(*vrm.getMachineFunction().getTarget().getRegisterInfo()),
-    MBFI(mbfi),
-    Edit(nullptr),
-    OpenIdx(0),
-    SpillMode(SM_Partition),
-    RegAssign(Allocator)
-{}
+    : SA(sa), LIS(lis), VRM(vrm), MRI(vrm.getMachineFunction().getRegInfo()),
+      MDT(mdt), TII(*vrm.getMachineFunction().getSubtarget().getInstrInfo()),
+      TRI(*vrm.getMachineFunction().getSubtarget().getRegisterInfo()),
+      MBFI(mbfi), Edit(nullptr), OpenIdx(0), SpillMode(SM_Partition),
+      RegAssign(Allocator) {}
 
 void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
   Edit = &LRE;

diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index 7048ee3..2e60c14 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_SPLITKIT_H
-#define LLVM_CODEGEN_SPLITKIT_H
+#ifndef LLVM_LIB_CODEGEN_SPLITKIT_H
+#define LLVM_LIB_CODEGEN_SPLITKIT_H
 
 #include "LiveRangeCalc.h"
 #include "llvm/ADT/ArrayRef.h"

diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index 370430c..dcf1b44 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp

@@ -228,7 +228,7 @@
 unsigned StackColoring::collectMarkers(unsigned NumSlot) {
   unsigned MarkersFound = 0;
   // Scan the function to find all lifetime markers.
-  // NOTE: We use the a reverse-post-order iteration to ensure that we obtain a
+  // NOTE: We use a reverse-post-order iteration to ensure that we obtain a
   // deterministic numbering, and because we'll need a post-order iteration
   // later for solving the liveness dataflow problem.
   for (MachineBasicBlock *MBB : depth_first(MF)) {

diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp
index 3ba502f..c2ee87a 100644
--- a/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp

@@ -21,7 +21,7 @@
 #include "llvm/CodeGen/StackMapLivenessAnalysis.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -67,7 +67,7 @@
   DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: "
                << _MF.getName() << " **********\n");
   MF = &_MF;
-  TRI = MF->getTarget().getRegisterInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
   ++NumStackMapFuncVisited;
 
   // Skip this function if there are no patchpoints to process.

diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
index 1473fc1..d3791c3 100644
--- a/lib/CodeGen/StackMaps.cpp
+++ b/lib/CodeGen/StackMaps.cpp

@@ -24,6 +24,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <iterator>
 
 using namespace llvm;
@@ -83,7 +84,8 @@
     switch (MOI->getImm()) {
     default: llvm_unreachable("Unrecognized operand type.");
     case StackMaps::DirectMemRefOp: {
-      unsigned Size = AP.TM.getDataLayout()->getPointerSizeInBits();
+      unsigned Size =
+          AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSizeInBits();
       assert((Size % 8) == 0 && "Need pointer size in bytes.");
       Size /= 8;
       unsigned Reg = (++MOI)->getReg();
@@ -122,7 +124,8 @@
     assert(TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) &&
            "Virtreg operands should have been rewritten before now.");
     const TargetRegisterClass *RC =
-      AP.TM.getRegisterInfo()->getMinimalPhysRegClass(MOI->getReg());
+        AP.TM.getSubtargetImpl()->getRegisterInfo()->getMinimalPhysRegClass(
+            MOI->getReg());
     assert(!MOI->getSubReg() && "Physical subreg still around.");
     Locs.push_back(
       Location(Location::Register, RC->getSize(), MOI->getReg(), 0));
@@ -158,7 +161,7 @@
 StackMaps::LiveOutVec
 StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
   assert(Mask && "No register mask specified");
-  const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = AP.TM.getSubtargetImpl()->getRegisterInfo();
   LiveOutVec LiveOuts;
 
   // Create a LiveOutReg for each bit that is set in the register mask.
@@ -217,9 +220,18 @@
        I != E; ++I) {
     // Constants are encoded as sign-extended integers.
     // -1 is directly encoded as .long 0xFFFFFFFF with no constant pool.
-    if (I->LocType == Location::Constant &&
-        ((I->Offset + (int64_t(1)<<31)) >> 32) != 0) {
+    if (I->LocType == Location::Constant && !isInt<32>(I->Offset)) {
       I->LocType = Location::ConstantIndex;
+      // ConstPool is intentionally a MapVector of 'uint64_t's (as
+      // opposed to 'int64_t's).  We should never be in a situation
+      // where we have to insert either the tombstone or the empty
+      // keys into a map, and for a DenseMap<uint64_t, T> these are
+      // (uint64_t)0 and (uint64_t)-1.  They can be and are
+      // represented using 32 bit integers.
+
+      assert((uint64_t)I->Offset != DenseMapInfo<uint64_t>::getEmptyKey() &&
+             (uint64_t)I->Offset != DenseMapInfo<uint64_t>::getTombstoneKey() &&
+             "empty and tombstone keys should fit in 32 bits!");
       auto Result = ConstPool.insert(std::make_pair(I->Offset, I->Offset));
       I->Offset = Result.first - ConstPool.begin();
     }
@@ -232,12 +244,16 @@
     MCSymbolRefExpr::Create(AP.CurrentFnSym, OutContext),
     OutContext);
 
-  CSInfos.push_back(CallsiteInfo(CSOffsetExpr, ID, Locations, LiveOuts));
+  CSInfos.emplace_back(CSOffsetExpr, ID, std::move(Locations),
+                       std::move(LiveOuts));
 
   // Record the stack size of the current function.
   const MachineFrameInfo *MFI = AP.MF->getFrameInfo();
+  const TargetRegisterInfo *RegInfo = AP.MF->getSubtarget().getRegisterInfo();
+  const bool DynamicFrameSize = MFI->hasVarSizedObjects() ||
+    RegInfo->needsStackRealignment(*(AP.MF));
   FnStackSize[AP.CurrentFnSym] =
-    MFI->hasVarSizedObjects() ? UINT64_MAX : MFI->getStackSize();
+    DynamicFrameSize ? UINT64_MAX : MFI->getStackSize();
 }
 
 void StackMaps::recordStackMap(const MachineInstr &MI) {
@@ -485,7 +501,7 @@
 
   MCContext &OutContext = AP.OutStreamer.getContext();
   MCStreamer &OS = AP.OutStreamer;
-  const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = AP.TM.getSubtargetImpl()->getRegisterInfo();
 
   // Create the section.
   const MCSection *StackMapSection =

diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index accfe7b..45f97ac 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp

@@ -33,6 +33,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <cstdlib>
 using namespace llvm;
 
@@ -85,7 +86,7 @@
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  TLI = TM->getTargetLowering();
+  TLI = TM->getSubtargetImpl()->getTargetLowering();
 
   Attribute Attr = Fn.getAttributes().getAttribute(
       AttributeSet::FunctionIndex, "stack-protector-buffer-size");
@@ -168,7 +169,7 @@
     } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
       // Keep track of what PHI nodes we have already visited to ensure
       // they are only visited once.
-      if (VisitedPHIs.insert(PN))
+      if (VisitedPHIs.insert(PN).second)
         if (HasAddressTaken(PN))
           return true;
     } else if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
@@ -479,12 +480,12 @@
   if (Trip.getOS() == llvm::Triple::OpenBSD) {
     Constant *StackChkFail = M->getOrInsertFunction(
         "__stack_smash_handler", Type::getVoidTy(Context),
-        Type::getInt8PtrTy(Context), NULL);
+        Type::getInt8PtrTy(Context), nullptr);
 
     B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
   } else {
     Constant *StackChkFail = M->getOrInsertFunction(
-        "__stack_chk_fail", Type::getVoidTy(Context), NULL);
+        "__stack_chk_fail", Type::getVoidTy(Context), nullptr);
     B.CreateCall(StackChkFail);
   }
   B.CreateUnreachable();

diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 791168f..cc72e5e 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp

@@ -28,7 +28,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <vector>
 using namespace llvm;
 
@@ -422,7 +422,7 @@
     });
 
   MFI = MF.getFrameInfo();
-  TII = MF.getTarget().getInstrInfo();
+  TII = MF.getSubtarget().getInstrInfo();
   LS = &getAnalysis<LiveStacks>();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
 

diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 723a629..4377236 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp

@@ -31,6 +31,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "tailduplication"
@@ -135,8 +136,8 @@
   if (skipOptnoneFunction(*MF.getFunction()))
     return false;
 
-  TII = MF.getTarget().getInstrInfo();
-  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   MMI = getAnalysisIfAvailable<MachineModuleInfo>();
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
@@ -798,11 +799,9 @@
       RS->enterBasicBlock(PredBB);
       if (!PredBB->empty())
         RS->forward(std::prev(PredBB->end()));
-      BitVector RegsLiveAtExit(TRI->getNumRegs());
-      RS->getRegsUsed(RegsLiveAtExit, false);
       for (MachineBasicBlock::livein_iterator I = TailBB->livein_begin(),
              E = TailBB->livein_end(); I != E; ++I) {
-        if (!RegsLiveAtExit[*I])
+        if (!RS->isRegUsed(*I, false))
           // If a register is previously livein to the tail but it's not live
           // at the end of predecessor BB, then it should be added to its
           // livein list.

diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 883e9d1..1557d10 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp

@@ -14,8 +14,8 @@
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <cstdlib>
 using namespace llvm;
 
@@ -26,7 +26,7 @@
 /// the stack frame of the specified index. This is the default implementation
 /// which is overridden for some targets.
 int TargetFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
-                                         int FI) const {
+                                             int FI) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   return MFI->getObjectOffset(FI) + MFI->getStackSize() -
     getOffsetOfLocalArea() + MFI->getOffsetAdjustment();
@@ -34,7 +34,7 @@
 
 int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                              int FI, unsigned &FrameReg) const {
-  const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
 
   // By default, assume all frame indices are referenced via whatever
   // getFrameRegister() says. The target can override this if it's doing

diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index 83966bd0..ab45f89 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp

@@ -290,13 +290,15 @@
     Offset = 0;
     return true;
   }
-  unsigned BitSize = TM->getRegisterInfo()->getSubRegIdxSize(SubIdx);
+  unsigned BitSize =
+      TM->getSubtargetImpl()->getRegisterInfo()->getSubRegIdxSize(SubIdx);
   // Convert bit size to byte size to be consistent with
   // MCRegisterClass::getSize().
   if (BitSize % 8)
     return false;
 
-  int BitOffset = TM->getRegisterInfo()->getSubRegIdxOffset(SubIdx);
+  int BitOffset =
+      TM->getSubtargetImpl()->getRegisterInfo()->getSubRegIdxOffset(SubIdx);
   if (BitOffset < 0 || BitOffset % 8)
     return false;
 
@@ -305,7 +307,7 @@
 
   assert(RC->getSize() >= (Offset + Size) && "bad subregister range");
 
-  if (!TM->getDataLayout()->isLittleEndian()) {
+  if (!TM->getSubtargetImpl()->getDataLayout()->isLittleEndian()) {
     Offset = RC->getSize() - (Offset + Size);
   }
   return true;
@@ -370,6 +372,10 @@
   return nullptr;
 }
 
+/// Default implementation of the MachO no-op hook: it unconditionally reaches
+/// llvm_unreachable("Not a MachO target") and never writes to \p NopInst.
+// NOTE(review): presumably targets that support MachO override this hook --
+// confirm against the declaration in TargetInstrInfo.h.
+void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  llvm_unreachable("Not a MachO target");
+}
+
 bool TargetInstrInfo::
 canFoldMemoryOperand(const MachineInstr *MI,
                      const SmallVectorImpl<unsigned> &Ops) const {
@@ -498,7 +504,7 @@
 
   const MachineOperand &MO = MI->getOperand(1-Ops[0]);
   MachineBasicBlock::iterator Pos = MI;
-  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
   if (Flags == MachineMemOperand::MOStore)
     storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
@@ -562,8 +568,6 @@
                                          AliasAnalysis *AA) const {
   const MachineFunction &MF = *MI->getParent()->getParent();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  const TargetMachine &TM = MF.getTarget();
-  const TargetInstrInfo &TII = *TM.getInstrInfo();
 
   // Remat clients assume operand 0 is the defined register.
   if (!MI->getNumOperands() || !MI->getOperand(0).isReg())
@@ -582,7 +586,7 @@
   // redundant with subsequent checks, but it's target-independent,
   // simple, and a common case.
   int FrameIdx = 0;
-  if (TII.isLoadFromStackSlot(MI, FrameIdx) &&
+  if (isLoadFromStackSlot(MI, FrameIdx) &&
       MF.getFrameInfo()->isImmutableObjectIndex(FrameIdx))
     return true;
 
@@ -655,8 +659,8 @@
   // saves compile time, because it doesn't require every single
   // stack slot reference to depend on the instruction that does the
   // modification.
-  const TargetLowering &TLI = *MF.getTarget().getTargetLowering();
-  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   if (MI->modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI))
     return true;
 
@@ -746,14 +750,14 @@
 }
 
 /// Return the default expected latency for a def based on its opcode.
-unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel *SchedModel,
+unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel &SchedModel,
                                             const MachineInstr *DefMI) const {
   if (DefMI->isTransient())
     return 0;
   if (DefMI->mayLoad())
-    return SchedModel->LoadLatency;
+    return SchedModel.LoadLatency;
   if (isHighLatencyDef(DefMI->getOpcode()))
-    return SchedModel->HighLatency;
+    return SchedModel.HighLatency;
   return 1;
 }
 
@@ -852,3 +856,77 @@
                           defaultDefLatency(ItinData->SchedModel, DefMI));
   return InstrLatency;
 }
+
+/// Collect the inputs of a REG_SEQUENCE or REG_SEQUENCE-like instruction.
+///
+/// For a plain REG_SEQUENCE, one RegSubRegPairAndIdx is appended to
+/// \p InputRegs for every (register, sub-register index) operand pair that
+/// follows the definition, and true is returned. For any other instruction
+/// the call is forwarded to getRegSequenceLikeInputs() and its result is
+/// returned unchanged.
+///
+/// \p MI must satisfy isRegSequence() or isRegSequenceLike() (asserted).
+/// \p DefIdx must be 0 for a plain REG_SEQUENCE (asserted).
+bool TargetInstrInfo::getRegSequenceInputs(
+    const MachineInstr &MI, unsigned DefIdx,
+    SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
+  assert((MI.isRegSequence() ||
+          MI.isRegSequenceLike()) && "Instruction do not have the proper type");
+
+  // Anything that is not a plain REG_SEQUENCE is handled by the "like" hook.
+  if (!MI.isRegSequence())
+    return getRegSequenceLikeInputs(MI, DefIdx, InputRegs);
+
+  // We are looking at:
+  // Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
+  assert(DefIdx == 0 && "REG_SEQUENCE only has one def");
+  // Operand 0 is the def; the remaining operands are consumed two at a time
+  // (register, then its immediate sub-register index), hence the stride of 2.
+  for (unsigned OpIdx = 1, EndOpIdx = MI.getNumOperands(); OpIdx != EndOpIdx;
+       OpIdx += 2) {
+    const MachineOperand &MOReg = MI.getOperand(OpIdx);
+    const MachineOperand &MOSubIdx = MI.getOperand(OpIdx + 1);
+    assert(MOSubIdx.isImm() &&
+           "One of the subindex of the reg_sequence is not an immediate");
+    // Record Reg:SubReg, SubIdx.
+    InputRegs.push_back(RegSubRegPairAndIdx(MOReg.getReg(), MOReg.getSubReg(),
+                                            (unsigned)MOSubIdx.getImm()));
+  }
+  return true;
+}
+
+/// Describe the input of an EXTRACT_SUBREG or EXTRACT_SUBREG-like
+/// instruction.
+///
+/// For a plain EXTRACT_SUBREG, \p InputReg is filled from operand 1 (register
+/// and its sub-register) and operand 2 (the immediate sub-register index),
+/// and true is returned. For any other instruction the call is forwarded to
+/// getExtractSubregLikeInputs() and its result is returned unchanged.
+///
+/// \p MI must satisfy isExtractSubreg() or isExtractSubregLike() (asserted).
+/// \p DefIdx must be 0 for a plain EXTRACT_SUBREG (asserted).
+bool TargetInstrInfo::getExtractSubregInputs(
+    const MachineInstr &MI, unsigned DefIdx,
+    RegSubRegPairAndIdx &InputReg) const {
+  assert((MI.isExtractSubreg() ||
+      MI.isExtractSubregLike()) && "Instruction do not have the proper type");
+
+  // Anything that is not a plain EXTRACT_SUBREG is handled by the "like" hook.
+  if (!MI.isExtractSubreg())
+    return getExtractSubregLikeInputs(MI, DefIdx, InputReg);
+
+  // We are looking at:
+  // Def = EXTRACT_SUBREG v0.sub1, sub0.
+  assert(DefIdx == 0 && "EXTRACT_SUBREG only has one def");
+  const MachineOperand &MOReg = MI.getOperand(1);
+  const MachineOperand &MOSubIdx = MI.getOperand(2);
+  assert(MOSubIdx.isImm() &&
+         "The subindex of the extract_subreg is not an immediate");
+
+  InputReg.Reg = MOReg.getReg();
+  InputReg.SubReg = MOReg.getSubReg();
+  InputReg.SubIdx = (unsigned)MOSubIdx.getImm();
+  return true;
+}
+
+/// Describe the inputs of an INSERT_SUBREG or INSERT_SUBREG-like instruction.
+///
+/// For a plain INSERT_SUBREG, \p BaseReg is filled from operand 1 (register
+/// and its sub-register), \p InsertedReg from operand 2 (register and its
+/// sub-register) plus operand 3 (the immediate sub-register index), and true
+/// is returned. For any other instruction the call is forwarded to
+/// getInsertSubregLikeInputs() and its result is returned unchanged.
+///
+/// \p MI must satisfy isInsertSubreg() or isInsertSubregLike() (asserted).
+/// \p DefIdx must be 0 for a plain INSERT_SUBREG (asserted).
+bool TargetInstrInfo::getInsertSubregInputs(
+    const MachineInstr &MI, unsigned DefIdx,
+    RegSubRegPair &BaseReg, RegSubRegPairAndIdx &InsertedReg) const {
+  assert((MI.isInsertSubreg() ||
+      MI.isInsertSubregLike()) && "Instruction do not have the proper type");
+
+  // Anything that is not a plain INSERT_SUBREG is handled by the "like" hook.
+  if (!MI.isInsertSubreg())
+    return getInsertSubregLikeInputs(MI, DefIdx, BaseReg, InsertedReg);
+
+  // We are looking at:
+  // Def = INSERT_SUBREG v0, v1, sub0.
+  assert(DefIdx == 0 && "INSERT_SUBREG only has one def");
+  const MachineOperand &MOBaseReg = MI.getOperand(1);
+  const MachineOperand &MOInsertedReg = MI.getOperand(2);
+  const MachineOperand &MOSubIdx = MI.getOperand(3);
+  assert(MOSubIdx.isImm() &&
+         "The subindex of the insert_subreg is not an immediate");
+  BaseReg.Reg = MOBaseReg.getReg();
+  BaseReg.SubReg = MOBaseReg.getSubReg();
+
+  InsertedReg.Reg = MOInsertedReg.getReg();
+  InsertedReg.SubReg = MOInsertedReg.getSubReg();
+  InsertedReg.SubIdx = (unsigned)MOSubIdx.getImm();
+  return true;
+}

diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index c574fd4..e833fd3 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp

@@ -34,6 +34,7 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <cctype>
 using namespace llvm;
 
@@ -205,6 +206,16 @@
   Names[RTLIB::FLOOR_F80] = "floorl";
   Names[RTLIB::FLOOR_F128] = "floorl";
   Names[RTLIB::FLOOR_PPCF128] = "floorl";
+  Names[RTLIB::FMIN_F32] = "fminf";
+  Names[RTLIB::FMIN_F64] = "fmin";
+  Names[RTLIB::FMIN_F80] = "fminl";
+  Names[RTLIB::FMIN_F128] = "fminl";
+  Names[RTLIB::FMIN_PPCF128] = "fminl";
+  Names[RTLIB::FMAX_F32] = "fmaxf";
+  Names[RTLIB::FMAX_F64] = "fmax";
+  Names[RTLIB::FMAX_F80] = "fmaxl";
+  Names[RTLIB::FMAX_F128] = "fmaxl";
+  Names[RTLIB::FMAX_PPCF128] = "fmaxl";
   Names[RTLIB::ROUND_F32] = "roundf";
   Names[RTLIB::ROUND_F64] = "round";
   Names[RTLIB::ROUND_F80] = "roundl";
@@ -220,6 +231,10 @@
   Names[RTLIB::FPEXT_F32_F64] = "__extendsfdf2";
   Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee";
   Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee";
+  Names[RTLIB::FPROUND_F64_F16] = "__truncdfhf2";
+  Names[RTLIB::FPROUND_F80_F16] = "__truncxfhf2";
+  Names[RTLIB::FPROUND_F128_F16] = "__trunctfhf2";
+  Names[RTLIB::FPROUND_PPCF128_F16] = "__trunctfhf2";
   Names[RTLIB::FPROUND_F64_F32] = "__truncdfsf2";
   Names[RTLIB::FPROUND_F80_F32] = "__truncxfsf2";
   Names[RTLIB::FPROUND_F128_F32] = "__trunctfsf2";
@@ -418,7 +433,10 @@
 /// getFPEXT - Return the FPEXT_*_* value for the given types, or
 /// UNKNOWN_LIBCALL if there is none.
 RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
-  if (OpVT == MVT::f32) {
+  if (OpVT == MVT::f16) {
+    if (RetVT == MVT::f32)
+      return FPEXT_F16_F32;
+  } else if (OpVT == MVT::f32) {
     if (RetVT == MVT::f64)
       return FPEXT_F32_F64;
     if (RetVT == MVT::f128)
@@ -434,7 +452,18 @@
 /// getFPROUND - Return the FPROUND_*_* value for the given types, or
 /// UNKNOWN_LIBCALL if there is none.
 RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) {
-  if (RetVT == MVT::f32) {
+  if (RetVT == MVT::f16) {
+    if (OpVT == MVT::f32)
+      return FPROUND_F32_F16;
+    if (OpVT == MVT::f64)
+      return FPROUND_F64_F16;
+    if (OpVT == MVT::f80)
+      return FPROUND_F80_F16;
+    if (OpVT == MVT::f128)
+      return FPROUND_F128_F16;
+    if (OpVT == MVT::ppcf128)
+      return FPROUND_PPCF128_F16;
+  } else if (RetVT == MVT::f32) {
     if (OpVT == MVT::f64)
       return FPROUND_F64_F32;
     if (OpVT == MVT::f80)
@@ -665,10 +694,9 @@
   CCs[RTLIB::O_F128] = ISD::SETEQ;
 }
 
-/// NOTE: The constructor takes ownership of TLOF.
-TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm,
-                                       const TargetLoweringObjectFile *tlof)
-  : TM(tm), DL(TM.getDataLayout()), TLOF(*tlof) {
+/// NOTE: The TargetMachine owns TLOF.
+TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
+    : TM(tm), DL(TM.getSubtargetImpl()->getDataLayout()) {
   initActions();
 
   // Perform these initializations only once.
@@ -682,10 +710,11 @@
   HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
   IntDivIsCheap = false;
-  Pow2DivIsCheap = false;
+  Pow2SDivIsCheap = false;
   JumpIsExpensive = false;
   PredictableSelectIsExpensive = false;
   MaskAndBranchFoldingIsLegal = false;
+  HasFloatingPointExceptions = true;
   StackPointerRegisterToSaveRestore = 0;
   ExceptionPointerRegister = 0;
   ExceptionSelectorRegister = 0;
@@ -700,7 +729,6 @@
   PrefLoopAlignment = 0;
   MinStackArgumentAlignment = 1;
   InsertFencesForAtomic = false;
-  SupportJumpTables = true;
   MinimumJumpTableEntries = 4;
 
   InitLibcallNames(LibcallRoutineNames, Triple(TM.getTargetTriple()));
@@ -708,10 +736,6 @@
   InitLibcallCallingConvs(LibcallCallingConvs);
 }
 
-TargetLoweringBase::~TargetLoweringBase() {
-  delete &TLOF;
-}
-
 void TargetLoweringBase::initActions() {
   // All operations default to being supported.
   memset(OpActions, 0, sizeof(OpActions));
@@ -738,6 +762,8 @@
     // These operations default to expand.
     setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FMINNUM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FMAXNUM, (MVT::SimpleValueType)VT, Expand);
 
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, (MVT::SimpleValueType)VT, Expand);
@@ -774,6 +800,8 @@
   setOperationAction(ISD::FEXP ,  MVT::f16, Expand);
   setOperationAction(ISD::FEXP2,  MVT::f16, Expand);
   setOperationAction(ISD::FFLOOR, MVT::f16, Expand);
+  setOperationAction(ISD::FMINNUM, MVT::f16, Expand);
+  setOperationAction(ISD::FMAXNUM, MVT::f16, Expand);
   setOperationAction(ISD::FNEARBYINT, MVT::f16, Expand);
   setOperationAction(ISD::FCEIL,  MVT::f16, Expand);
   setOperationAction(ISD::FRINT,  MVT::f16, Expand);
@@ -785,6 +813,8 @@
   setOperationAction(ISD::FEXP ,  MVT::f32, Expand);
   setOperationAction(ISD::FEXP2,  MVT::f32, Expand);
   setOperationAction(ISD::FFLOOR, MVT::f32, Expand);
+  setOperationAction(ISD::FMINNUM, MVT::f32, Expand);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Expand);
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Expand);
   setOperationAction(ISD::FCEIL,  MVT::f32, Expand);
   setOperationAction(ISD::FRINT,  MVT::f32, Expand);
@@ -796,6 +826,8 @@
   setOperationAction(ISD::FEXP ,  MVT::f64, Expand);
   setOperationAction(ISD::FEXP2,  MVT::f64, Expand);
   setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
+  setOperationAction(ISD::FMINNUM, MVT::f64, Expand);
+  setOperationAction(ISD::FMAXNUM, MVT::f64, Expand);
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
   setOperationAction(ISD::FCEIL,  MVT::f64, Expand);
   setOperationAction(ISD::FRINT,  MVT::f64, Expand);
@@ -807,6 +839,8 @@
   setOperationAction(ISD::FEXP ,  MVT::f128, Expand);
   setOperationAction(ISD::FEXP2,  MVT::f128, Expand);
   setOperationAction(ISD::FFLOOR, MVT::f128, Expand);
+  setOperationAction(ISD::FMINNUM, MVT::f128, Expand);
+  setOperationAction(ISD::FMAXNUM, MVT::f128, Expand);
   setOperationAction(ISD::FNEARBYINT, MVT::f128, Expand);
   setOperationAction(ISD::FCEIL,  MVT::f128, Expand);
   setOperationAction(ISD::FRINT,  MVT::f128, Expand);
@@ -958,11 +992,10 @@
     // Add a new memory operand for this FI.
     const MachineFrameInfo &MFI = *MF.getFrameInfo();
     assert(MFI.getObjectOffset(FI) != -1);
-    MachineMemOperand *MMO =
-      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
-                              MachineMemOperand::MOLoad,
-                              TM.getDataLayout()->getPointerSize(),
-                              MFI.getObjectAlignment(FI));
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad,
+        TM.getSubtargetImpl()->getDataLayout()->getPointerSize(),
+        MFI.getObjectAlignment(FI));
     MIB->addMemOperand(MF, MMO);
 
     // Replace the instruction and update the operand index.
@@ -978,7 +1011,8 @@
 /// of the register class for the specified type and its associated "cost".
 std::pair<const TargetRegisterClass*, uint8_t>
 TargetLoweringBase::findRepresentativeClass(MVT VT) const {
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy];
   if (!RC)
     return std::make_pair(RC, 0);
@@ -1005,8 +1039,8 @@
 /// computeRegisterProperties - Once all of the register classes are added,
 /// this allows us to compute derived properties we expose.
 void TargetLoweringBase::computeRegisterProperties() {
-  assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE &&
-         "Too many value types for ValueTypeActions to hold!");
+  static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE,
+                "Too many value types for ValueTypeActions to hold!");
 
   // Everything defaults to needing one register.
   for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
@@ -1089,6 +1123,13 @@
     }
   }
 
+  if (!isTypeLegal(MVT::f16)) {
+    NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::i16];
+    RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::i16];
+    TransformToType[MVT::f16] = MVT::i16;
+    ValueTypeActions.setTypeAction(MVT::f16, TypeSoftenFloat);
+  }
+
   // Loop over all of the vector value types to see which need transformations.
   for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE;
        i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
@@ -1154,8 +1195,12 @@
         TransformToType[i] = MVT::Other;
         if (PreferredAction == TypeScalarizeVector)
           ValueTypeActions.setTypeAction(VT, TypeScalarizeVector);
-        else
+        else if (PreferredAction == TypeSplitVector)
           ValueTypeActions.setTypeAction(VT, TypeSplitVector);
+        else
+          // Set type action according to the number of elements.
+          ValueTypeActions.setTypeAction(VT, NElts == 1 ? TypeScalarizeVector
+                                                        : TypeSplitVector);
       } else {
         TransformToType[i] = NVT;
         ValueTypeActions.setTypeAction(VT, TypeWidenVector);

diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 03f4a51..efd15e1 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp

@@ -37,6 +37,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 using namespace dwarf;
 
@@ -72,9 +73,10 @@
                                                     Flags,
                                                     SectionKind::getDataRel(),
                                                     0, Label->getName());
-  unsigned Size = TM.getDataLayout()->getPointerSize();
+  unsigned Size = TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
   Streamer.SwitchSection(Sec);
-  Streamer.EmitValueToAlignment(TM.getDataLayout()->getPointerABIAlignment());
+  Streamer.EmitValueToAlignment(
+      TM.getSubtargetImpl()->getDataLayout()->getPointerABIAlignment());
   Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject);
   const MCExpr *E = MCConstantExpr::Create(Size, getContext());
   Streamer.EmitELFSize(Label, E);
@@ -287,7 +289,8 @@
     // FIXME: this is getting the alignment of the character, not the
     // alignment of the global!
     unsigned Align =
-      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV));
+        TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment(
+            cast<GlobalVariable>(GV));
 
     const char *SizeSpec = ".rodata.str1.";
     if (Kind.isMergeable2ByteCString())
@@ -338,8 +341,9 @@
 /// getSectionForConstant - Given a mergeable constant with the
 /// specified size and relocation information, return a section that it
 /// should be placed in.
-const MCSection *TargetLoweringObjectFileELF::
-getSectionForConstant(SectionKind Kind) const {
+const MCSection *
+TargetLoweringObjectFileELF::getSectionForConstant(SectionKind Kind,
+                                                   const Constant *C) const {
   if (Kind.isMergeableConst4() && MergeableConst4Section)
     return MergeableConst4Section;
   if (Kind.isMergeableConst8() && MergeableConst8Section)
@@ -354,44 +358,59 @@
   return DataRelROSection;
 }
 
-const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection(
-    unsigned Priority, const MCSymbol *KeySym) const {
-  // The default scheme is .ctor / .dtor, so we have to invert the priority
-  // numbering.
-  if (Priority == 65535)
-    return StaticCtorSection;
+static const MCSectionELF *getStaticStructorSection(MCContext &Ctx,
+                                                    bool UseInitArray,
+                                                    bool IsCtor,
+                                                    unsigned Priority,
+                                                    const MCSymbol *KeySym) {
+  std::string Name;
+  unsigned Type;
+  unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE;
+  SectionKind Kind = SectionKind::getDataRel();
+  StringRef COMDAT = KeySym ? KeySym->getName() : "";
+
+  if (KeySym)
+    Flags |= ELF::SHF_GROUP;
 
   if (UseInitArray) {
-    std::string Name = std::string(".init_array.") + utostr(Priority);
-    return getContext().getELFSection(Name, ELF::SHT_INIT_ARRAY,
-                                      ELF::SHF_ALLOC | ELF::SHF_WRITE,
-                                      SectionKind::getDataRel());
+    if (IsCtor) {
+      Type = ELF::SHT_INIT_ARRAY;
+      Name = ".init_array";
+    } else {
+      Type = ELF::SHT_FINI_ARRAY;
+      Name = ".fini_array";
+    }
+    if (Priority != 65535) {
+      Name += '.';
+      Name += utostr(Priority);
+    }
   } else {
-    std::string Name = std::string(".ctors.") + utostr(65535 - Priority);
-    return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
-                                      ELF::SHF_ALLOC |ELF::SHF_WRITE,
-                                      SectionKind::getDataRel());
+    // The default scheme is .ctors / .dtors, so we have to invert the priority
+    // numbering.
+    if (IsCtor)
+      Name = ".ctors";
+    else
+      Name = ".dtors";
+    if (Priority != 65535) {
+      Name += '.';
+      Name += utostr(65535 - Priority);
+    }
+    Type = ELF::SHT_PROGBITS;
   }
+
+  return Ctx.getELFSection(Name, Type, Flags, Kind, 0, COMDAT);
+}
+
+const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection(
+    unsigned Priority, const MCSymbol *KeySym) const {
+  return getStaticStructorSection(getContext(), UseInitArray, true, Priority,
+                                  KeySym);
 }
 
 const MCSection *TargetLoweringObjectFileELF::getStaticDtorSection(
     unsigned Priority, const MCSymbol *KeySym) const {
-  // The default scheme is .ctor / .dtor, so we have to invert the priority
-  // numbering.
-  if (Priority == 65535)
-    return StaticDtorSection;
-
-  if (UseInitArray) {
-    std::string Name = std::string(".fini_array.") + utostr(Priority);
-    return getContext().getELFSection(Name, ELF::SHT_FINI_ARRAY,
-                                      ELF::SHF_ALLOC | ELF::SHF_WRITE,
-                                      SectionKind::getDataRel());
-  } else {
-    std::string Name = std::string(".dtors.") + utostr(65535 - Priority);
-    return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
-                                      ELF::SHF_ALLOC |ELF::SHF_WRITE,
-                                      SectionKind::getDataRel());
-  }
+  return getStaticStructorSection(getContext(), UseInitArray, false, Priority,
+                                  KeySym);
 }
 
 void
@@ -565,10 +584,29 @@
     if (SMO.getKind().isMergeable1ByteCString())
       return false;
 
+    if (SMO.getSegmentName() == "__TEXT" &&
+        SMO.getSectionName() == "__objc_classname" &&
+        SMO.getType() == MachO::S_CSTRING_LITERALS)
+      return false;
+
+    if (SMO.getSegmentName() == "__TEXT" &&
+        SMO.getSectionName() == "__objc_methname" &&
+        SMO.getType() == MachO::S_CSTRING_LITERALS)
+      return false;
+
+    if (SMO.getSegmentName() == "__TEXT" &&
+        SMO.getSectionName() == "__objc_methtype" &&
+        SMO.getType() == MachO::S_CSTRING_LITERALS)
+      return false;
+
     if (SMO.getSegmentName() == "__DATA" &&
         SMO.getSectionName() == "__cfstring")
       return false;
 
+    // no_dead_strip sections are not atomized in practice.
+    if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP))
+      return false;
+
     switch (SMO.getType()) {
     default:
       return true;
@@ -610,17 +648,21 @@
 
   // FIXME: Alignment check should be handled by section classifier.
   if (Kind.isMergeable1ByteCString() &&
-      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
+      TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment(
+          cast<GlobalVariable>(GV)) < 32)
     return CStringSection;
 
   // Do not put 16-bit arrays in the UString section if they have an
   // externally visible label, this runs into issues with certain linker
   // versions.
   if (Kind.isMergeable2ByteCString() && !GV->hasExternalLinkage() &&
-      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
+      TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment(
+          cast<GlobalVariable>(GV)) < 32)
     return UStringSection;
 
-  if (Kind.isMergeableConst()) {
+  // With MachO only variables whose corresponding symbol starts with 'l' or
+  // 'L' can be merged, so we only try merging GVs with private linkage.
+  if (GV->hasPrivateLinkage() && Kind.isMergeableConst()) {
     if (Kind.isMergeableConst4())
       return FourByteConstantSection;
     if (Kind.isMergeableConst8())
@@ -654,7 +696,8 @@
 }
 
 const MCSection *
-TargetLoweringObjectFileMachO::getSectionForConstant(SectionKind Kind) const {
+TargetLoweringObjectFileMachO::getSectionForConstant(SectionKind Kind,
+                                                     const Constant *C) const {
   // If this constant requires a relocation, we have to put it in the data
   // segment, not in the text segment.
   if (Kind.isDataRel() || Kind.isReadOnlyWithRel())
@@ -737,7 +780,7 @@
       COFF::IMAGE_SCN_MEM_EXECUTE |
       COFF::IMAGE_SCN_MEM_READ |
       COFF::IMAGE_SCN_CNT_CODE;
-  else if (K.isBSS ())
+  else if (K.isBSS())
     Flags |=
       COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
       COFF::IMAGE_SCN_MEM_READ |
@@ -747,7 +790,7 @@
       COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
       COFF::IMAGE_SCN_MEM_READ |
       COFF::IMAGE_SCN_MEM_WRITE;
-  else if (K.isReadOnly())
+  else if (K.isReadOnly() || K.isReadOnlyWithRel())
     Flags |=
       COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
       COFF::IMAGE_SCN_MEM_READ;
@@ -772,7 +815,7 @@
 
   if (ComdatGV->getComdat() != C)
     report_fatal_error("Associative COMDAT symbol '" + ComdatGVName +
-                       "' is not a key for it's COMDAT.");
+                       "' is not a key for its COMDAT.");
 
   return ComdatGV;
 }
@@ -841,9 +884,9 @@
     return ".bss";
   if (Kind.isThreadLocal())
     return ".tls$";
-  if (Kind.isWriteable())
-    return ".data";
-  return ".rdata";
+  if (Kind.isReadOnly() || Kind.isReadOnlyWithRel())
+    return ".rdata";
+  return ".data";
 }
 
 
@@ -891,7 +934,7 @@
   if (Kind.isThreadLocal())
     return TLSDataSection;
 
-  if (Kind.isReadOnly())
+  if (Kind.isReadOnly() || Kind.isReadOnlyWithRel())
     return ReadOnlySection;
 
   // Note: we claim that common symbols are put in BSSSection, but they are
@@ -958,29 +1001,14 @@
   }
 }
 
-static const MCSection *getAssociativeCOFFSection(MCContext &Ctx,
-                                                  const MCSection *Sec,
-                                                  const MCSymbol *KeySym) {
-  // Return the normal section if we don't have to be associative.
-  if (!KeySym)
-    return Sec;
-
-  // Make an associative section with the same name and kind as the normal
-  // section.
-  const MCSectionCOFF *SecCOFF = cast<MCSectionCOFF>(Sec);
-  unsigned Characteristics =
-      SecCOFF->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT;
-  return Ctx.getCOFFSection(SecCOFF->getSectionName(), Characteristics,
-                            SecCOFF->getKind(), KeySym->getName(),
-                            COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE);
-}
-
 const MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection(
     unsigned Priority, const MCSymbol *KeySym) const {
-  return getAssociativeCOFFSection(getContext(), StaticCtorSection, KeySym);
+  return getContext().getAssociativeCOFFSection(
+      cast<MCSectionCOFF>(StaticCtorSection), KeySym);
 }
 
 const MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
     unsigned Priority, const MCSymbol *KeySym) const {
-  return getAssociativeCOFFSection(getContext(), StaticDtorSection, KeySym);
+  return getContext().getAssociativeCOFFSection(
+      cast<MCSectionCOFF>(StaticDtorSection), KeySym);
 }

diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp
index 3ca2017..618d903 100644
--- a/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/lib/CodeGen/TargetOptionsImpl.cpp

@@ -51,3 +51,10 @@
 StringRef TargetOptions::getTrapFunctionName() const {
   return TrapFuncName;
 }
+
+/// getCFIFuncName - If this returns a non-empty string, then it is the name of
+/// the function that gets called on CFI violations in CFI non-enforcing mode
+/// (!TargetOptions::CFIEnforcing).
+StringRef TargetOptions::getCFIFuncName() const {
+  return CFIFuncName;
+}

diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index a3a4fb3..61a66b6 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp

@@ -16,6 +16,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -108,7 +109,7 @@
 /// register of the given type, picking the most sub register class of
 /// the right type that contains this physreg.
 const TargetRegisterClass *
-TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const {
+TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const {
   assert(isPhysicalRegister(reg) && "reg must be a physical register");
 
   // Pick the most sub register class of the right type that contains
@@ -293,3 +294,11 @@
   // All clear, tell the register allocator to prefer this register.
   Hints.push_back(Phys);
 }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// Print \p Reg, qualified by \p SubRegIndex, on its own line to dbgs().
+void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex,
+                                 const TargetRegisterInfo *TRI) {
+  dbgs() << PrintReg(Reg, TRI, SubRegIndex) << "\n";
+}
+#endif

diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index b0f2ca6..ef2dab1 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp

@@ -16,7 +16,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
@@ -157,7 +156,7 @@
   const MachineInstr *UseMI, unsigned UseOperIdx) const {
 
   if (!hasInstrSchedModel() && !hasInstrItineraries())
-    return TII->defaultDefLatency(&SchedModel, DefMI);
+    return TII->defaultDefLatency(SchedModel, DefMI);
 
   if (hasInstrItineraries()) {
     int OperLatency = 0;
@@ -181,7 +180,7 @@
     // applicable to the InstrItins model. InstrSchedModel should model all
     // special cases without TII hooks.
     InstrLatency = std::max(InstrLatency,
-                            TII->defaultDefLatency(&SchedModel, DefMI));
+                            TII->defaultDefLatency(SchedModel, DefMI));
     return InstrLatency;
   }
   // hasInstrSchedModel()
@@ -222,7 +221,29 @@
   // FIXME: Automatically giving all implicit defs defaultDefLatency is
   // undesirable. We should only do it for defs that are known to the MC
   // desc like flags. Truly implicit defs should get 1 cycle latency.
-  return DefMI->isTransient() ? 0 : TII->defaultDefLatency(&SchedModel, DefMI);
+  return DefMI->isTransient() ? 0 : TII->defaultDefLatency(SchedModel, DefMI);
+}
+
+/// Compute the latency of \p Opcode using only its scheduling class.
+unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const {
+  assert(hasInstrSchedModel() && "Only call this function with a SchedModel");
+
+  const MCSchedClassDesc *SCDesc =
+      SchedModel.getSchedClassDesc(TII->get(Opcode).getSchedClass());
+  unsigned Latency = 0;
+  if (!SCDesc->isValid() || SCDesc->isVariant()) {
+    // Latency is still zero here, so this fires whenever asserts are enabled.
+    assert(Latency && "No MI sched latency");
+    return 0;
+  }
+
+  // Take the largest write latency over all definitions of the class.
+  const unsigned NumDefs = SCDesc->NumWriteLatencyEntries;
+  for (unsigned Idx = 0; Idx != NumDefs; ++Idx) {
+    const MCWriteLatencyEntry *Entry = STI->getWriteLatencyEntry(SCDesc, Idx);
+    Latency = std::max(Latency, capLatency(Entry->Cycles));
+  }
+  return Latency;
 }
 
 unsigned
@@ -248,7 +269,7 @@
       return Latency;
     }
   }
-  return TII->defaultDefLatency(&SchedModel, MI);
+  return TII->defaultDefLatency(SchedModel, MI);
 }
 
 unsigned TargetSchedModel::
@@ -268,7 +289,7 @@
   // for predicated defs.
   unsigned Reg = DefMI->getOperand(DefOperIdx).getReg();
   const MachineFunction &MF = *DefMI->getParent()->getParent();
-  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(DepMI))
     return computeInstrLatency(DefMI);
 

diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index f42d47b..e218a83 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp

@@ -48,6 +48,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "twoaddrinstr"
@@ -544,10 +545,21 @@
   if (ToRegA) {
     unsigned FromRegB = getMappedReg(regB, SrcRegMap);
     unsigned FromRegC = getMappedReg(regC, SrcRegMap);
-    bool BComp = !FromRegB || regsAreCompatible(FromRegB, ToRegA, TRI);
-    bool CComp = !FromRegC || regsAreCompatible(FromRegC, ToRegA, TRI);
-    if (BComp != CComp)
-      return !BComp && CComp;
+    bool CompB = FromRegB && regsAreCompatible(FromRegB, ToRegA, TRI);
+    bool CompC = FromRegC && regsAreCompatible(FromRegC, ToRegA, TRI);
+
+    // Commute if any of the following are true:
+    // -RegB is not tied to a register and RegC is compatible with RegA.
+    // -RegB is tied to the wrong physical register, but RegC is compatible.
+    // -RegB is tied to the wrong physical register, and RegC isn't tied.
+    if ((!FromRegB && CompC) || (FromRegB && !CompB && (!FromRegC || CompC)))
+      return true;
+    // Don't commute if any of the following are true:
+    // -RegC is not tied to a register and RegB is compatible with RegA.
+    // -RegC is tied to the wrong physical register, but RegB is compatible.
+    // -RegC is tied to the wrong physical register, and RegB isn't tied.
+    if ((!FromRegC && CompB) || (FromRegC && !CompC && (!FromRegB || CompB)))
+      return false;
   }
 
   // If there is a use of regC between its last def (could be livein) and this
@@ -666,7 +678,7 @@
   unsigned Reg = DstReg;
   while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII,IsCopy,
                                                       NewReg, IsDstPhys)) {
-    if (IsCopy && !Processed.insert(UseMI))
+    if (IsCopy && !Processed.insert(UseMI).second)
       break;
 
     DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI);
@@ -1503,9 +1515,9 @@
   MF = &Func;
   const TargetMachine &TM = MF->getTarget();
   MRI = &MF->getRegInfo();
-  TII = TM.getInstrInfo();
-  TRI = TM.getRegisterInfo();
-  InstrItins = TM.getInstrItineraryData();
+  TII = TM.getSubtargetImpl()->getInstrInfo();
+  TRI = TM.getSubtargetImpl()->getRegisterInfo();
+  InstrItins = TM.getSubtargetImpl()->getInstrItineraryData();
   LV = getAnalysisIfAvailable<LiveVariables>();
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   AA = &getAnalysis<AliasAnalysis>();

diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index 2e22082..7824f92 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp

@@ -64,9 +64,8 @@
   SmallPtrSet<BasicBlock*, 8> Reachable;
 
   // Mark all reachable blocks.
-  for (df_ext_iterator<Function*, SmallPtrSet<BasicBlock*, 8> > I =
-       df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable); I != E; ++I)
-    /* Mark all reachable blocks */;
+  for (BasicBlock *BB : depth_first_ext(&F, Reachable))
+    (void)BB/* Mark all reachable blocks */;
 
   // Loop over all dead blocks, remembering them and deleting all instructions
   // in them.
@@ -125,10 +124,8 @@
   MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
 
   // Mark all reachable blocks.
-  for (df_ext_iterator<MachineFunction*, SmallPtrSet<MachineBasicBlock*, 8> >
-       I = df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable);
-       I != E; ++I)
-    /* Mark all reachable blocks */;
+  for (MachineBasicBlock *BB : depth_first_ext(&F, Reachable))
+    (void)BB/* Mark all reachable blocks */;
 
   // Loop over all dead blocks, remembering them and deleting all instructions
   // in them.

diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 704736f..0d17d43 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp

@@ -36,6 +36,7 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -54,8 +55,8 @@
 
 bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
   MRI = &mf.getRegInfo();
-  TII = mf.getTarget().getInstrInfo();
-  TRI = mf.getTarget().getRegisterInfo();
+  TII = mf.getSubtarget().getInstrInfo();
+  TRI = mf.getSubtarget().getRegisterInfo();
   MF = &mf;
 
   Virt2PhysMap.clear();
@@ -123,7 +124,7 @@
     if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) {
       OS << '[' << PrintReg(Reg, TRI) << " -> "
          << PrintReg(Virt2PhysMap[Reg], TRI) << "] "
-         << MRI->getRegClass(Reg)->getName() << "\n";
+         << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n";
     }
   }
 
@@ -131,7 +132,7 @@
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) {
       OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg]
-         << "] " << MRI->getRegClass(Reg)->getName() << "\n";
+         << "] " << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n";
     }
   }
   OS << '\n';
@@ -205,8 +206,8 @@
 bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
   MF = &fn;
   TM = &MF->getTarget();
-  TRI = TM->getRegisterInfo();
-  TII = TM->getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
+  TII = MF->getSubtarget().getInstrInfo();
   MRI = &MF->getRegInfo();
   Indexes = &getAnalysis<SlotIndexes>();
   LIS = &getAnalysis<LiveIntervals>();

diff --git a/lib/DebugInfo/Android.mk b/lib/DebugInfo/Android.mk
index 12dfb3b..e777e9c 100644
--- a/lib/DebugInfo/Android.mk
+++ b/lib/DebugInfo/Android.mk

@@ -3,6 +3,7 @@
 debuginfo_SRC_FILES := \
   DIContext.cpp \
   DWARFAbbreviationDeclaration.cpp \
+  DWARFAcceleratorTable.cpp \
   DWARFCompileUnit.cpp \
   DWARFContext.cpp \
   DWARFDebugAbbrev.cpp \

diff --git a/lib/DebugInfo/CMakeLists.txt b/lib/DebugInfo/CMakeLists.txt
index 61a3fb0..81fc84d 100644
--- a/lib/DebugInfo/CMakeLists.txt
+++ b/lib/DebugInfo/CMakeLists.txt

@@ -1,6 +1,7 @@
 add_llvm_library(LLVMDebugInfo
   DIContext.cpp
   DWARFAbbreviationDeclaration.cpp
+  DWARFAcceleratorTable.cpp
   DWARFCompileUnit.cpp
   DWARFContext.cpp
   DWARFDebugAbbrev.cpp

diff --git a/lib/DebugInfo/DIContext.cpp b/lib/DebugInfo/DIContext.cpp
index 49a4409..01aecf8 100644
--- a/lib/DebugInfo/DIContext.cpp
+++ b/lib/DebugInfo/DIContext.cpp

@@ -13,6 +13,6 @@
 
 DIContext::~DIContext() {}
 
-DIContext *DIContext::getDWARFContext(object::ObjectFile *Obj) {
+DIContext *DIContext::getDWARFContext(const object::ObjectFile &Obj) {
   return new DWARFContextInMemory(Obj);
 }

diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.h b/lib/DebugInfo/DWARFAbbreviationDeclaration.h
index b86b9ec..bb05c30 100644
--- a/lib/DebugInfo/DWARFAbbreviationDeclaration.h
+++ b/lib/DebugInfo/DWARFAbbreviationDeclaration.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
-#define LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
+#define LLVM_LIB_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/DataExtractor.h"

diff --git a/lib/DebugInfo/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARFAcceleratorTable.cpp
new file mode 100644
index 0000000..703274d
--- /dev/null
+++ b/lib/DebugInfo/DWARFAcceleratorTable.cpp

@@ -0,0 +1,133 @@
+//===--- DWARFAcceleratorTable.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFAcceleratorTable.h"
+
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// Parse the header and atom list. \returns false if the section is too small.
+bool DWARFAcceleratorTable::extract() {
+  uint32_t Offset = 0;
+
+  // Check that we can at least read the header.
+  if (!AccelSection.isValidOffset(offsetof(Header, HeaderDataLength)+4))
+    return false;
+
+  Hdr.Magic = AccelSection.getU32(&Offset);
+  Hdr.Version = AccelSection.getU16(&Offset);
+  Hdr.HashFunction = AccelSection.getU16(&Offset);
+  Hdr.NumBuckets = AccelSection.getU32(&Offset);
+  Hdr.NumHashes = AccelSection.getU32(&Offset);
+  Hdr.HeaderDataLength = AccelSection.getU32(&Offset);
+
+  // Check that we can read all the hashes and offsets from the section (see
+  // SourceLevelDebugging.rst). Sum in 64 bits so large counts cannot wrap.
+  uint64_t End = sizeof(Hdr) + uint64_t(Hdr.HeaderDataLength) +
+                 uint64_t(Hdr.NumBuckets) * 4 + uint64_t(Hdr.NumHashes) * 8;
+  if (End >= AccelSection.getData().size())
+    return false;
+
+  HdrData.DIEOffsetBase = AccelSection.getU32(&Offset);
+  uint32_t NumAtoms = AccelSection.getU32(&Offset);
+  for (unsigned i = 0; i < NumAtoms; ++i) {
+    uint16_t AtomType = AccelSection.getU16(&Offset);
+    uint16_t AtomForm = AccelSection.getU16(&Offset);
+    HdrData.Atoms.push_back(std::make_pair(AtomType, AtomForm));
+  }
+  return true;
+}
+
+/// Print the header, the atom descriptions and every bucket's hash chain.
+void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
+  // Dump the header.
+  OS << "Magic = " << format("0x%08x", Hdr.Magic) << '\n'
+     << "Version = " << format("0x%04x", Hdr.Version) << '\n'
+     << "Hash function = " << format("0x%08x", Hdr.HashFunction) << '\n'
+     << "Bucket count = " << Hdr.NumBuckets << '\n'
+     << "Hashes count = " << Hdr.NumHashes << '\n'
+     << "HeaderData length = " << Hdr.HeaderDataLength << '\n'
+     << "DIE offset base = " << HdrData.DIEOffsetBase << '\n'
+     << "Number of atoms = " << HdrData.Atoms.size() << '\n';
+
+  unsigned AtomIdx = 0;
+  SmallVector<DWARFFormValue, 3> AtomForms;
+  for (const auto &Atom: HdrData.Atoms) {
+    OS << format("Atom[%d] Type: ", AtomIdx++);
+    if (const char *TypeString = dwarf::AtomTypeString(Atom.first))
+      OS << TypeString;
+    else
+      OS << format("DW_ATOM_Unknown_0x%x", Atom.first);
+    OS << " Form: ";
+    if (const char *FormString = dwarf::FormEncodingString(Atom.second))
+      OS << FormString;
+    else
+      OS << format("DW_FORM_Unknown_0x%x", Atom.second);
+    OS << '\n';
+    AtomForms.push_back(DWARFFormValue(Atom.second));
+  }
+
+  // Now go through the actual tables and dump them.
+  uint32_t Offset = sizeof(Hdr) + Hdr.HeaderDataLength;
+  unsigned HashesBase = Offset + Hdr.NumBuckets * 4;
+  unsigned OffsetsBase = HashesBase + Hdr.NumHashes * 4;
+
+  for (unsigned Bucket = 0; Bucket < Hdr.NumBuckets; ++Bucket) {
+    unsigned Index = AccelSection.getU32(&Offset);
+    OS << format("Bucket[%d]\n", Bucket);
+    if (Index == UINT32_MAX) {
+      OS << "  EMPTY\n";
+      continue;
+    }
+
+    for (unsigned HashIdx = Index; HashIdx < Hdr.NumHashes; ++HashIdx) {
+      unsigned HashOffset = HashesBase + HashIdx*4;
+      unsigned OffsetsOffset = OffsetsBase + HashIdx*4;
+      uint32_t Hash = AccelSection.getU32(&HashOffset);
+      if (Hash % Hdr.NumBuckets != Bucket)
+        break;
+
+      unsigned DataOffset = AccelSection.getU32(&OffsetsOffset);
+      OS << format("  Hash = 0x%08x Offset = 0x%08x\n", Hash, DataOffset);
+      if (!AccelSection.isValidOffset(DataOffset)) {
+        OS << "    Invalid section offset\n";
+        continue;
+      }
+      while (AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) {
+        unsigned StringOffset = AccelSection.getU32(&DataOffset);
+        RelocAddrMap::const_iterator Reloc = Relocs.find(DataOffset-4);
+        if (Reloc != Relocs.end())
+          StringOffset += Reloc->second.second;
+        if (!StringOffset)
+          break;
+        uint32_t StrCursor = StringOffset; // getCStr() advances this copy.
+        const char *Str = StringSection.getCStr(&StrCursor);
+        OS << format("    Name: %08x \"%s\"\n", StringOffset, Str ? Str : "");
+        unsigned NumData = AccelSection.getU32(&DataOffset);
+        for (unsigned Data = 0; Data < NumData; ++Data) {
+          OS << format("    Data[%d] => ", Data);
+          unsigned FormIdx = 0;
+          for (auto &Atom : AtomForms) {
+            OS << format("{Atom[%d]: ", FormIdx++);
+            if (Atom.extractValue(AccelSection, &DataOffset, nullptr))
+              Atom.dump(OS, nullptr);
+            else
+              OS << "Error extracting the value";
+            OS << "} ";
+          }
+          OS << '\n';
+        }
+      }
+    }
+  }
+}
+}

diff --git a/lib/DebugInfo/DWARFAcceleratorTable.h b/lib/DebugInfo/DWARFAcceleratorTable.h
new file mode 100644
index 0000000..7dc9591
--- /dev/null
+++ b/lib/DebugInfo/DWARFAcceleratorTable.h

@@ -0,0 +1,71 @@
+//===--- DWARFAcceleratorTable.h --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFACCELERATORTABLE_H
+#define LLVM_LIB_DEBUGINFO_DWARFACCELERATORTABLE_H
+
+#include "DWARFRelocMap.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/DWARFFormValue.h"
+#include "llvm/Support/DataExtractor.h"
+
+#include <cstdint>
+#include <utility>
+
+namespace llvm {
+
+class raw_ostream;
+
+/// Reader and printer for one Apple accelerator table section; the on-disk
+/// layout is described in SourceLevelDebugging.rst.
+class DWARFAcceleratorTable {
+
+  /// Fixed-size header at the start of the section. sizeof(Header) is used as
+  /// its on-disk size, so the fields must stay in file order without padding.
+  struct Header {
+    uint32_t Magic;
+    uint16_t Version;
+    uint16_t HashFunction;
+    uint32_t NumBuckets;
+    uint32_t NumHashes;
+    uint32_t HeaderDataLength;
+  };
+
+  /// Data following the header: the DIE offset base and the (type, form)
+  /// pair describing each atom.
+  struct HeaderData {
+    typedef uint16_t AtomType;
+    typedef uint16_t Form;
+    uint32_t DIEOffsetBase;
+    SmallVector<std::pair<AtomType, Form>, 3> Atoms;
+  };
+
+  struct Header Hdr;
+  struct HeaderData HdrData;
+  DataExtractor AccelSection;
+  DataExtractor StringSection;
+  // Held by reference: the relocation map must outlive this table.
+  const RelocAddrMap& Relocs;
+public:
+  DWARFAcceleratorTable(DataExtractor AccelSection, DataExtractor StringSection,
+                        const RelocAddrMap &Relocs)
+    : AccelSection(AccelSection), StringSection(StringSection), Relocs(Relocs) {}
+
+  /// Read the header and the atom list from the section.
+  /// \returns false if the section is too small to hold the table.
+  bool extract();
+
+  /// Print the header, the atoms and the hash buckets to \p OS.
+  void dump(raw_ostream &OS) const;
+};
+
+} // end namespace llvm
+
+#endif

diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
index 2ed188e..b3190b18 100644
--- a/lib/DebugInfo/DWARFCompileUnit.h
+++ b/lib/DebugInfo/DWARFCompileUnit.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
-#define LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFCOMPILEUNIT_H
+#define LLVM_LIB_DEBUGINFO_DWARFCOMPILEUNIT_H
 
 #include "DWARFUnit.h"
 
@@ -16,10 +16,11 @@
 
 class DWARFCompileUnit : public DWARFUnit {
 public:
-  DWARFCompileUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef RS,
-                   StringRef SS, StringRef SOS, StringRef AOS,
-                   const RelocAddrMap *M, bool LE)
-      : DWARFUnit(DA, IS, RS, SS, SOS, AOS, M, LE) {}
+  DWARFCompileUnit(DWARFContext &Context, const DWARFSection &Section,
+                   const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                   StringRef SOS, StringRef AOS, bool LE,
+                   const DWARFUnitSectionBase &UnitSection)
+      : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LE, UnitSection) {}
   void dump(raw_ostream &OS);
   // VTable anchor.
   ~DWARFCompileUnit() override;

diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
index 3961905..9a2c7cc 100644
--- a/lib/DebugInfo/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARFContext.cpp

@@ -9,6 +9,7 @@
 
 #include "DWARFContext.h"
 #include "DWARFDebugArangeSet.h"
+#include "DWARFAcceleratorTable.h"
 
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -59,6 +60,18 @@
   }
 }
 
+/// Print the \p Name banner, then the accelerator table if it parses.
+static void dumpAccelSection(raw_ostream &OS, StringRef Name,
+                             const DWARFSection &Section,
+                             StringRef StringSection, bool LittleEndian) {
+  OS << "\n." << Name << " contents:\n";
+  DataExtractor AccelData(Section.Data, LittleEndian, 0);
+  DataExtractor StrData(StringSection, LittleEndian, 0);
+  DWARFAcceleratorTable Accel(AccelData, StrData, Section.Relocs);
+  if (Accel.extract())
+    Accel.dump(OS);
+}
+
 void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
   if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
     OS << ".debug_abbrev contents:\n";
@@ -86,15 +99,17 @@
 
   if ((DumpType == DIDT_All || DumpType == DIDT_Types) && getNumTypeUnits()) {
     OS << "\n.debug_types contents:\n";
-    for (const auto &TU : type_units())
-      TU->dump(OS);
+    for (const auto &TUS : type_unit_sections())
+      for (const auto &TU : TUS)
+        TU->dump(OS);
   }
 
   if ((DumpType == DIDT_All || DumpType == DIDT_TypesDwo) &&
       getNumDWOTypeUnits()) {
     OS << "\n.debug_types.dwo contents:\n";
-    for (const auto &DWOTU : dwo_type_units())
-      DWOTU->dump(OS);
+    for (const auto &DWOTUS : dwo_type_unit_sections())
+      for (const auto &DWOTU : DWOTUS)
+        DWOTU->dump(OS);
   }
 
   if (DumpType == DIDT_All || DumpType == DIDT_Loc) {
@@ -216,6 +231,22 @@
       OS << format("%8.8x\n", strOffsetExt.getU32(&offset));
     }
   }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AppleNames)
+    dumpAccelSection(OS, "apple_names", getAppleNamesSection(),
+                     getStringSection(), isLittleEndian());
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AppleTypes)
+    dumpAccelSection(OS, "apple_types", getAppleTypesSection(),
+                     getStringSection(), isLittleEndian());
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AppleNamespaces)
+    dumpAccelSection(OS, "apple_namespaces", getAppleNamespacesSection(),
+                     getStringSection(), isLittleEndian());
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AppleObjC)
+    dumpAccelSection(OS, "apple_objc", getAppleObjCSection(),
+                     getStringSection(), isLittleEndian());
 }
 
 const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() {
@@ -291,7 +322,7 @@
 }
 
 const DWARFLineTable *
-DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
+DWARFContext::getLineTableForUnit(DWARFUnit *cu) {
   if (!Line)
     Line.reset(new DWARFDebugLine(&getLineSection().Relocs));
 
@@ -312,110 +343,34 @@
 }
 
 void DWARFContext::parseCompileUnits() {
-  if (!CUs.empty())
-    return;
-  uint32_t offset = 0;
-  const DataExtractor &DIData = DataExtractor(getInfoSection().Data,
-                                              isLittleEndian(), 0);
-  while (DIData.isValidOffset(offset)) {
-    std::unique_ptr<DWARFCompileUnit> CU(new DWARFCompileUnit(
-        getDebugAbbrev(), getInfoSection().Data, getRangeSection(),
-        getStringSection(), StringRef(), getAddrSection(),
-        &getInfoSection().Relocs, isLittleEndian()));
-    if (!CU->extract(DIData, &offset)) {
-      break;
-    }
-    CUs.push_back(std::move(CU));
-    offset = CUs.back()->getNextUnitOffset();
-  }
+  CUs.parse(*this, getInfoSection());
 }
 
 void DWARFContext::parseTypeUnits() {
   if (!TUs.empty())
     return;
   for (const auto &I : getTypesSections()) {
-    uint32_t offset = 0;
-    const DataExtractor &DIData =
-        DataExtractor(I.second.Data, isLittleEndian(), 0);
-    while (DIData.isValidOffset(offset)) {
-      std::unique_ptr<DWARFTypeUnit> TU(
-          new DWARFTypeUnit(getDebugAbbrev(), I.second.Data, getRangeSection(),
-                            getStringSection(), StringRef(), getAddrSection(),
-                            &I.second.Relocs, isLittleEndian()));
-      if (!TU->extract(DIData, &offset))
-        break;
-      TUs.push_back(std::move(TU));
-      offset = TUs.back()->getNextUnitOffset();
-    }
+    TUs.push_back(DWARFUnitSection<DWARFTypeUnit>());
+    TUs.back().parse(*this, I.second);
   }
 }
 
 void DWARFContext::parseDWOCompileUnits() {
-  if (!DWOCUs.empty())
-    return;
-  uint32_t offset = 0;
-  const DataExtractor &DIData =
-      DataExtractor(getInfoDWOSection().Data, isLittleEndian(), 0);
-  while (DIData.isValidOffset(offset)) {
-    std::unique_ptr<DWARFCompileUnit> DWOCU(new DWARFCompileUnit(
-        getDebugAbbrevDWO(), getInfoDWOSection().Data, getRangeDWOSection(),
-        getStringDWOSection(), getStringOffsetDWOSection(), getAddrSection(),
-        &getInfoDWOSection().Relocs, isLittleEndian()));
-    if (!DWOCU->extract(DIData, &offset)) {
-      break;
-    }
-    DWOCUs.push_back(std::move(DWOCU));
-    offset = DWOCUs.back()->getNextUnitOffset();
-  }
+  DWOCUs.parseDWO(*this, getInfoDWOSection());
 }
 
 void DWARFContext::parseDWOTypeUnits() {
   if (!DWOTUs.empty())
     return;
   for (const auto &I : getTypesDWOSections()) {
-    uint32_t offset = 0;
-    const DataExtractor &DIData =
-        DataExtractor(I.second.Data, isLittleEndian(), 0);
-    while (DIData.isValidOffset(offset)) {
-      std::unique_ptr<DWARFTypeUnit> TU(new DWARFTypeUnit(
-          getDebugAbbrevDWO(), I.second.Data, getRangeDWOSection(),
-          getStringDWOSection(), getStringOffsetDWOSection(), getAddrSection(),
-          &I.second.Relocs, isLittleEndian()));
-      if (!TU->extract(DIData, &offset))
-        break;
-      DWOTUs.push_back(std::move(TU));
-      offset = DWOTUs.back()->getNextUnitOffset();
-    }
+    DWOTUs.push_back(DWARFUnitSection<DWARFTypeUnit>());
+    DWOTUs.back().parseDWO(*this, I.second);
   }
 }
 
-namespace {
-  struct OffsetComparator {
-
-    bool operator()(const std::unique_ptr<DWARFCompileUnit> &LHS,
-                    const std::unique_ptr<DWARFCompileUnit> &RHS) const {
-      return LHS->getOffset() < RHS->getOffset();
-    }
-    bool operator()(const std::unique_ptr<DWARFCompileUnit> &LHS,
-                    uint32_t RHS) const {
-      return LHS->getOffset() < RHS;
-    }
-    bool operator()(uint32_t LHS,
-                    const std::unique_ptr<DWARFCompileUnit> &RHS) const {
-      return LHS < RHS->getOffset();
-    }
-  };
-}
-
 DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
   parseCompileUnits();
-
-  std::unique_ptr<DWARFCompileUnit> *CU =
-      std::lower_bound(CUs.begin(), CUs.end(), Offset, OffsetComparator());
-  if (CU != CUs.end()) {
-    return CU->get();
-  }
-  return nullptr;
+  return CUs.getUnitForOffset(Offset);
 }
 
 DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
@@ -425,47 +380,6 @@
   return getCompileUnitForOffset(CUOffset);
 }
 
-static bool getFileNameForCompileUnit(DWARFCompileUnit *CU,
-                                      const DWARFLineTable *LineTable,
-                                      uint64_t FileIndex, FileLineInfoKind Kind,
-                                      std::string &FileName) {
-  if (!CU || !LineTable || Kind == FileLineInfoKind::None ||
-      !LineTable->getFileNameByIndex(FileIndex, Kind, FileName))
-    return false;
-  if (Kind == FileLineInfoKind::AbsoluteFilePath &&
-      sys::path::is_relative(FileName)) {
-    // We may still need to append compilation directory of compile unit.
-    SmallString<16> AbsolutePath;
-    if (const char *CompilationDir = CU->getCompilationDir()) {
-      sys::path::append(AbsolutePath, CompilationDir);
-    }
-    sys::path::append(AbsolutePath, FileName);
-    FileName = AbsolutePath.str();
-  }
-  return true;
-}
-
-static bool getFileLineInfoForCompileUnit(DWARFCompileUnit *CU,
-                                          const DWARFLineTable *LineTable,
-                                          uint64_t Address,
-                                          FileLineInfoKind Kind,
-                                          DILineInfo &Result) {
-  if (!CU || !LineTable)
-    return false;
-  // Get the index of row we're looking for in the line table.
-  uint32_t RowIndex = LineTable->lookupAddress(Address);
-  if (RowIndex == -1U)
-    return false;
-  // Take file number and line/column from the row.
-  const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
-  if (!getFileNameForCompileUnit(CU, LineTable, Row.File, Kind,
-                                 Result.FileName))
-    return false;
-  Result.Line = Row.Line;
-  Result.Column = Row.Column;
-  return true;
-}
-
 static bool getFunctionNameForAddress(DWARFCompileUnit *CU, uint64_t Address,
                                       FunctionNameKind Kind,
                                       std::string &FunctionName) {
@@ -496,8 +410,9 @@
     return Result;
   getFunctionNameForAddress(CU, Address, Spec.FNKind, Result.FunctionName);
   if (Spec.FLIKind != FileLineInfoKind::None) {
-    const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU);
-    getFileLineInfoForCompileUnit(CU, LineTable, Address, Spec.FLIKind, Result);
+    if (const DWARFLineTable *LineTable = getLineTableForUnit(CU))
+      LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
+                                           Spec.FLIKind, Result);
   }
   return Result;
 }
@@ -522,7 +437,7 @@
     return Lines;
   }
 
-  const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU);
+  const DWARFLineTable *LineTable = getLineTableForUnit(CU);
 
   // Get the index of row we're looking for in the line table.
   std::vector<uint32_t> RowVector;
@@ -533,8 +448,8 @@
     // Take file number and line/column from the row.
     const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
     DILineInfo Result;
-    getFileNameForCompileUnit(CU, LineTable, Row.File, Spec.FLIKind,
-                              Result.FileName);
+    LineTable->getFileNameByIndex(Row.File, CU->getCompilationDir(),
+                                  Spec.FLIKind, Result.FileName);
     Result.FunctionName = FunctionName;
     Result.Line = Row.Line;
     Result.Column = Row.Column;
@@ -561,11 +476,11 @@
     // try to at least get file/line info from symbol table.
     if (Spec.FLIKind != FileLineInfoKind::None) {
       DILineInfo Frame;
-      LineTable = getLineTableForCompileUnit(CU);
-      if (getFileLineInfoForCompileUnit(CU, LineTable, Address, Spec.FLIKind,
-                                        Frame)) {
+      LineTable = getLineTableForUnit(CU);
+      if (LineTable &&
+          LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
+                                               Spec.FLIKind, Frame))
         InliningInfo.addFrame(Frame);
-      }
     }
     return InliningInfo;
   }
@@ -582,15 +497,17 @@
       if (i == 0) {
         // For the topmost frame, initialize the line table of this
         // compile unit and fetch file/line info from it.
-        LineTable = getLineTableForCompileUnit(CU);
+        LineTable = getLineTableForUnit(CU);
         // For the topmost routine, get file/line info from line table.
-        getFileLineInfoForCompileUnit(CU, LineTable, Address, Spec.FLIKind,
-                                      Frame);
+        if (LineTable)
+          LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
+                                               Spec.FLIKind, Frame);
       } else {
         // Otherwise, use call file, call line and call column from
         // previous DIE in inlined chain.
-        getFileNameForCompileUnit(CU, LineTable, CallFile, Spec.FLIKind,
-                                  Frame.FileName);
+        if (LineTable)
+          LineTable->getFileNameByIndex(CallFile, CU->getCompilationDir(),
+                                        Spec.FLIKind, Frame.FileName);
         Frame.Line = CallLine;
         Frame.Column = CallColumn;
       }
@@ -621,12 +538,19 @@
   return true;
 }
 
-DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj)
-    : IsLittleEndian(Obj->isLittleEndian()),
-      AddressSize(Obj->getBytesInAddress()) {
-  for (const SectionRef &Section : Obj->sections()) {
+DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj)
+    : IsLittleEndian(Obj.isLittleEndian()),
+      AddressSize(Obj.getBytesInAddress()) {
+  for (const SectionRef &Section : Obj.sections()) {
     StringRef name;
     Section.getName(name);
+    // Skip BSS and Virtual sections, they aren't interesting.
+    bool IsBSS = Section.isBSS();
+    if (IsBSS)
+      continue;
+    bool IsVirtual = Section.isVirtual();
+    if (IsVirtual)
+      continue;
     StringRef data;
     Section.getContents(data);
 
@@ -670,6 +594,11 @@
             .Case("debug_str.dwo", &StringDWOSection)
             .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
             .Case("debug_addr", &AddrSection)
+            .Case("apple_names", &AppleNamesSection.Data)
+            .Case("apple_types", &AppleTypesSection.Data)
+            .Case("apple_namespaces", &AppleNamespacesSection.Data)
+            .Case("apple_namespac", &AppleNamespacesSection.Data)
+            .Case("apple_objc", &AppleObjCSection.Data)
             // Any more debug info sections go here.
             .Default(nullptr);
     if (SectionData) {
@@ -687,7 +616,7 @@
     }
 
     section_iterator RelocatedSection = Section.getRelocatedSection();
-    if (RelocatedSection == Obj->section_end())
+    if (RelocatedSection == Obj.section_end())
       continue;
 
     StringRef RelSecName;
@@ -702,6 +631,11 @@
         .Case("debug_loc", &LocSection.Relocs)
         .Case("debug_info.dwo", &InfoDWOSection.Relocs)
         .Case("debug_line", &LineSection.Relocs)
+        .Case("apple_names", &AppleNamesSection.Relocs)
+        .Case("apple_types", &AppleTypesSection.Relocs)
+        .Case("apple_namespaces", &AppleNamespacesSection.Relocs)
+        .Case("apple_namespac", &AppleNamespacesSection.Relocs)
+        .Case("apple_objc", &AppleObjCSection.Relocs)
         .Default(nullptr);
     if (!Map) {
       // Find debug_types relocs by section rather than name as there are
@@ -715,23 +649,19 @@
     }
 
     if (Section.relocation_begin() != Section.relocation_end()) {
-      uint64_t SectionSize;
-      RelocatedSection->getSize(SectionSize);
+      uint64_t SectionSize = RelocatedSection->getSize();
       for (const RelocationRef &Reloc : Section.relocations()) {
         uint64_t Address;
         Reloc.getOffset(Address);
         uint64_t Type;
         Reloc.getType(Type);
         uint64_t SymAddr = 0;
-        // ELF relocations may need the symbol address
-        if (Obj->isELF()) {
-          object::symbol_iterator Sym = Reloc.getSymbol();
+        object::symbol_iterator Sym = Reloc.getSymbol();
+        if (Sym != Obj.symbol_end())
           Sym->getAddress(SymAddr);
-        }
 
-        object::RelocVisitor V(Obj->getFileFormatName());
-        // The section address is always 0 for debug sections.
-        object::RelocToApply R(V.visit(Type, Reloc, 0, SymAddr));
+        object::RelocVisitor V(Obj);
+        object::RelocToApply R(V.visit(Type, Reloc, SymAddr));
         if (V.error()) {
           SmallString<32> Name;
           std::error_code ec(Reloc.getTypeName(Name));

diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
index 6d1ae92..dd3fcc7 100644
--- a/lib/DebugInfo/DWARFContext.h
+++ b/lib/DebugInfo/DWARFContext.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===/
 
-#ifndef LLVM_DEBUGINFO_DWARFCONTEXT_H
-#define LLVM_DEBUGINFO_DWARFCONTEXT_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFCONTEXT_H
+#define LLVM_LIB_DEBUGINFO_DWARFCONTEXT_H
 
 #include "DWARFCompileUnit.h"
 #include "DWARFDebugAranges.h"
@@ -16,10 +16,12 @@
 #include "DWARFDebugLine.h"
 #include "DWARFDebugLoc.h"
 #include "DWARFDebugRangeList.h"
+#include "DWARFSection.h"
 #include "DWARFTypeUnit.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/DebugInfo/DIContext.h"
+#include <vector>
 
 namespace llvm {
 
@@ -28,19 +30,17 @@
 /// information parsing. The actual data is supplied through pure virtual
 /// methods that a concrete implementation provides.
 class DWARFContext : public DIContext {
-  typedef SmallVector<std::unique_ptr<DWARFCompileUnit>, 1> CUVector;
-  typedef SmallVector<std::unique_ptr<DWARFTypeUnit>, 1> TUVector;
 
-  CUVector CUs;
-  TUVector TUs;
+  DWARFUnitSection<DWARFCompileUnit> CUs;
+  std::vector<DWARFUnitSection<DWARFTypeUnit>> TUs;
   std::unique_ptr<DWARFDebugAbbrev> Abbrev;
   std::unique_ptr<DWARFDebugLoc> Loc;
   std::unique_ptr<DWARFDebugAranges> Aranges;
   std::unique_ptr<DWARFDebugLine> Line;
   std::unique_ptr<DWARFDebugFrame> DebugFrame;
 
-  CUVector DWOCUs;
-  TUVector DWOTUs;
+  DWARFUnitSection<DWARFCompileUnit> DWOCUs;
+  std::vector<DWARFUnitSection<DWARFTypeUnit>> DWOTUs;
   std::unique_ptr<DWARFDebugAbbrev> AbbrevDWO;
   std::unique_ptr<DWARFDebugLocDWO> LocDWO;
 
@@ -64,11 +64,6 @@
   void parseDWOTypeUnits();
 
 public:
-  struct Section {
-    StringRef Data;
-    RelocAddrMap Relocs;
-  };
-
   DWARFContext() : DIContext(CK_DWARF) {}
 
   static bool classof(const DIContext *DICtx) {
@@ -77,8 +72,9 @@
 
   void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All) override;
 
-  typedef iterator_range<CUVector::iterator> cu_iterator_range;
-  typedef iterator_range<TUVector::iterator> tu_iterator_range;
+  typedef DWARFUnitSection<DWARFCompileUnit>::iterator_range cu_iterator_range;
+  typedef DWARFUnitSection<DWARFTypeUnit>::iterator_range tu_iterator_range;
+  typedef iterator_range<std::vector<DWARFUnitSection<DWARFTypeUnit>>::iterator> tu_section_iterator_range;
 
   /// Get compile units in this context.
   cu_iterator_range compile_units() {
@@ -87,9 +83,9 @@
   }
 
   /// Get type units in this context.
-  tu_iterator_range type_units() {
+  tu_section_iterator_range type_unit_sections() {
     parseTypeUnits();
-    return tu_iterator_range(TUs.begin(), TUs.end());
+    return tu_section_iterator_range(TUs.begin(), TUs.end());
   }
 
   /// Get compile units in the DWO context.
@@ -99,9 +95,9 @@
   }
 
   /// Get type units in the DWO context.
-  tu_iterator_range dwo_type_units() {
+  tu_section_iterator_range dwo_type_unit_sections() {
     parseDWOTypeUnits();
-    return tu_iterator_range(DWOTUs.begin(), DWOTUs.end());
+    return tu_section_iterator_range(DWOTUs.begin(), DWOTUs.end());
   }
 
   /// Get the number of compile units in this context.
@@ -159,8 +155,7 @@
   const DWARFDebugFrame *getDebugFrame();
 
   /// Get a pointer to a parsed line table corresponding to a compile unit.
-  const DWARFDebugLine::LineTable *
-  getLineTableForCompileUnit(DWARFCompileUnit *cu);
+  const DWARFDebugLine::LineTable *getLineTableForUnit(DWARFUnit *cu);
 
   DILineInfo getLineInfoForAddress(uint64_t Address,
       DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override;
@@ -171,17 +166,15 @@
 
   virtual bool isLittleEndian() const = 0;
   virtual uint8_t getAddressSize() const = 0;
-  virtual const Section &getInfoSection() = 0;
-  typedef MapVector<object::SectionRef, Section,
-                    std::map<object::SectionRef, unsigned> > TypeSectionMap;
+  virtual const DWARFSection &getInfoSection() = 0;
+  typedef MapVector<object::SectionRef, DWARFSection,
+                    std::map<object::SectionRef, unsigned>> TypeSectionMap;
   virtual const TypeSectionMap &getTypesSections() = 0;
   virtual StringRef getAbbrevSection() = 0;
-  virtual const Section &getLocSection() = 0;
-  virtual const Section &getLocDWOSection() = 0;
+  virtual const DWARFSection &getLocSection() = 0;
   virtual StringRef getARangeSection() = 0;
   virtual StringRef getDebugFrameSection() = 0;
-  virtual const Section &getLineSection() = 0;
-  virtual const Section &getLineDWOSection() = 0;
+  virtual const DWARFSection &getLineSection() = 0;
   virtual StringRef getStringSection() = 0;
   virtual StringRef getRangeSection() = 0;
   virtual StringRef getPubNamesSection() = 0;
@@ -190,13 +183,19 @@
   virtual StringRef getGnuPubTypesSection() = 0;
 
   // Sections for DWARF5 split dwarf proposal.
-  virtual const Section &getInfoDWOSection() = 0;
+  virtual const DWARFSection &getInfoDWOSection() = 0;
   virtual const TypeSectionMap &getTypesDWOSections() = 0;
   virtual StringRef getAbbrevDWOSection() = 0;
+  virtual const DWARFSection &getLineDWOSection() = 0;
+  virtual const DWARFSection &getLocDWOSection() = 0;
   virtual StringRef getStringDWOSection() = 0;
   virtual StringRef getStringOffsetDWOSection() = 0;
   virtual StringRef getRangeDWOSection() = 0;
   virtual StringRef getAddrSection() = 0;
+  virtual const DWARFSection& getAppleNamesSection() = 0;
+  virtual const DWARFSection& getAppleTypesSection() = 0;
+  virtual const DWARFSection& getAppleNamespacesSection() = 0;
+  virtual const DWARFSection& getAppleObjCSection() = 0;
 
   static bool isSupportedVersion(unsigned version) {
     return version == 2 || version == 3 || version == 4;
@@ -217,15 +216,13 @@
   virtual void anchor();
   bool IsLittleEndian;
   uint8_t AddressSize;
-  Section InfoSection;
+  DWARFSection InfoSection;
   TypeSectionMap TypesSections;
   StringRef AbbrevSection;
-  Section LocSection;
-  Section LocDWOSection;
+  DWARFSection LocSection;
   StringRef ARangeSection;
   StringRef DebugFrameSection;
-  Section LineSection;
-  Section LineDWOSection;
+  DWARFSection LineSection;
   StringRef StringSection;
   StringRef RangeSection;
   StringRef PubNamesSection;
@@ -234,42 +231,52 @@
   StringRef GnuPubTypesSection;
 
   // Sections for DWARF5 split dwarf proposal.
-  Section InfoDWOSection;
+  DWARFSection InfoDWOSection;
   TypeSectionMap TypesDWOSections;
   StringRef AbbrevDWOSection;
+  DWARFSection LineDWOSection;
+  DWARFSection LocDWOSection;
   StringRef StringDWOSection;
   StringRef StringOffsetDWOSection;
   StringRef RangeDWOSection;
   StringRef AddrSection;
+  DWARFSection AppleNamesSection;
+  DWARFSection AppleTypesSection;
+  DWARFSection AppleNamespacesSection;
+  DWARFSection AppleObjCSection;
 
   SmallVector<SmallString<32>, 4> UncompressedSections;
 
 public:
-  DWARFContextInMemory(object::ObjectFile *);
+  DWARFContextInMemory(const object::ObjectFile &Obj);
   bool isLittleEndian() const override { return IsLittleEndian; }
   uint8_t getAddressSize() const override { return AddressSize; }
-  const Section &getInfoSection() override { return InfoSection; }
+  const DWARFSection &getInfoSection() override { return InfoSection; }
   const TypeSectionMap &getTypesSections() override { return TypesSections; }
   StringRef getAbbrevSection() override { return AbbrevSection; }
-  const Section &getLocSection() override { return LocSection; }
-  const Section &getLocDWOSection() override { return LocDWOSection; }
+  const DWARFSection &getLocSection() override { return LocSection; }
   StringRef getARangeSection() override { return ARangeSection; }
   StringRef getDebugFrameSection() override { return DebugFrameSection; }
-  const Section &getLineSection() override { return LineSection; }
-  const Section &getLineDWOSection() override { return LineDWOSection; }
+  const DWARFSection &getLineSection() override { return LineSection; }
   StringRef getStringSection() override { return StringSection; }
   StringRef getRangeSection() override { return RangeSection; }
   StringRef getPubNamesSection() override { return PubNamesSection; }
   StringRef getPubTypesSection() override { return PubTypesSection; }
   StringRef getGnuPubNamesSection() override { return GnuPubNamesSection; }
   StringRef getGnuPubTypesSection() override { return GnuPubTypesSection; }
+  const DWARFSection& getAppleNamesSection() override { return AppleNamesSection; }
+  const DWARFSection& getAppleTypesSection() override { return AppleTypesSection; }
+  const DWARFSection& getAppleNamespacesSection() override { return AppleNamespacesSection; }
+  const DWARFSection& getAppleObjCSection() override { return AppleObjCSection; }
 
   // Sections for DWARF5 split dwarf proposal.
-  const Section &getInfoDWOSection() override { return InfoDWOSection; }
+  const DWARFSection &getInfoDWOSection() override { return InfoDWOSection; }
   const TypeSectionMap &getTypesDWOSections() override {
     return TypesDWOSections;
   }
   StringRef getAbbrevDWOSection() override { return AbbrevDWOSection; }
+  const DWARFSection &getLineDWOSection() override { return LineDWOSection; }
+  const DWARFSection &getLocDWOSection() override { return LocDWOSection; }
   StringRef getStringDWOSection() override { return StringDWOSection; }
   StringRef getStringOffsetDWOSection() override {
     return StringOffsetDWOSection;

diff --git a/lib/DebugInfo/DWARFDebugAbbrev.cpp b/lib/DebugInfo/DWARFDebugAbbrev.cpp
index 8426bf9..c1a088e 100644
--- a/lib/DebugInfo/DWARFDebugAbbrev.cpp
+++ b/lib/DebugInfo/DWARFDebugAbbrev.cpp

@@ -30,7 +30,6 @@
   DWARFAbbreviationDeclaration AbbrDecl;
   uint32_t PrevAbbrCode = 0;
   while (AbbrDecl.extract(Data, OffsetPtr)) {
-    Decls.push_back(AbbrDecl);
     if (FirstAbbrCode == 0) {
       FirstAbbrCode = AbbrDecl.getCode();
     } else {
@@ -40,6 +39,7 @@
       }
     }
     PrevAbbrCode = AbbrDecl.getCode();
+    Decls.push_back(std::move(AbbrDecl));
   }
   return BeginOffset != *OffsetPtr;
 }
@@ -82,7 +82,7 @@
     uint32_t CUAbbrOffset = Offset;
     if (!AbbrDecls.extract(Data, &Offset))
       break;
-    AbbrDeclSets[CUAbbrOffset] = AbbrDecls;
+    AbbrDeclSets[CUAbbrOffset] = std::move(AbbrDecls);
   }
 }
 

diff --git a/lib/DebugInfo/DWARFDebugAbbrev.h b/lib/DebugInfo/DWARFDebugAbbrev.h
index 3a9adba..4b3b814 100644
--- a/lib/DebugInfo/DWARFDebugAbbrev.h
+++ b/lib/DebugInfo/DWARFDebugAbbrev.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFDEBUGABBREV_H
-#define LLVM_DEBUGINFO_DWARFDEBUGABBREV_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGABBREV_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGABBREV_H
 
 #include "DWARFAbbreviationDeclaration.h"
 #include <list>

diff --git a/lib/DebugInfo/DWARFDebugArangeSet.h b/lib/DebugInfo/DWARFDebugArangeSet.h
index d6c2d8b..837a8e6 100644
--- a/lib/DebugInfo/DWARFDebugArangeSet.h
+++ b/lib/DebugInfo/DWARFDebugArangeSet.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFDEBUGARANGESET_H
-#define LLVM_DEBUGINFO_DWARFDEBUGARANGESET_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGARANGESET_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGARANGESET_H
 
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/DataExtractor.h"

diff --git a/lib/DebugInfo/DWARFDebugAranges.h b/lib/DebugInfo/DWARFDebugAranges.h
index a9f37fe..791f010 100644
--- a/lib/DebugInfo/DWARFDebugAranges.h
+++ b/lib/DebugInfo/DWARFDebugAranges.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFDEBUGARANGES_H
-#define LLVM_DEBUGINFO_DWARFDEBUGARANGES_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGARANGES_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGARANGES_H
 
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/DataExtractor.h"

diff --git a/lib/DebugInfo/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARFDebugFrame.cpp
index a33548e..dfa7e82 100644
--- a/lib/DebugInfo/DWARFDebugFrame.cpp
+++ b/lib/DebugInfo/DWARFDebugFrame.cpp

@@ -202,7 +202,8 @@
       SmallString<8> Augmentation, uint64_t CodeAlignmentFactor,
       int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister)
       : FrameEntry(FK_CIE, Offset, Length), Version(Version),
-        Augmentation(Augmentation), CodeAlignmentFactor(CodeAlignmentFactor),
+        Augmentation(std::move(Augmentation)),
+        CodeAlignmentFactor(CodeAlignmentFactor),
         DataAlignmentFactor(DataAlignmentFactor),
         ReturnAddressRegister(ReturnAddressRegister) {}
 

diff --git a/lib/DebugInfo/DWARFDebugFrame.h b/lib/DebugInfo/DWARFDebugFrame.h
index bd4ef45..be925cb 100644
--- a/lib/DebugInfo/DWARFDebugFrame.h
+++ b/lib/DebugInfo/DWARFDebugFrame.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFDEBUGFRAME_H
-#define LLVM_DEBUGINFO_DWARFDEBUGFRAME_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGFRAME_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGFRAME_H
 
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/raw_ostream.h"

diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index 2e7a54a..583e700 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp

@@ -12,15 +12,26 @@
 #include "DWARFContext.h"
 #include "DWARFDebugAbbrev.h"
 #include "llvm/DebugInfo/DWARFFormValue.h"
+#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 using namespace dwarf;
-typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind;
 
-void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS, const DWARFUnit *u,
+// Look up the unit containing the DIE at *Offset and extract that DIE with
+// extractFast. Returns the containing unit on success, nullptr otherwise.
+static const DWARFUnit *findUnitAndExtractFast(DWARFDebugInfoEntryMinimal &DIE,
+                                               const DWARFUnit *Unit,
+                                               uint32_t *Offset) {
+  const DWARFUnit *RefUnit = Unit->getUnitSection().getUnitForOffset(*Offset);
+  if (!RefUnit || !DIE.extractFast(RefUnit, Offset))
+    return nullptr;
+  return RefUnit;
+}
+
+void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS, DWARFUnit *u,
                                       unsigned recurseDepth,
                                       unsigned indent) const {
   DataExtractor debug_info_data = u->getDebugInfoExtractor();
@@ -62,12 +73,42 @@
   }
 }
 
+// Print the bits set in Val by name as " (a, b)"; prints " ()" for zero.
+static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
+  OS << " (";
+  while (Val) { // Val != 0 keeps the shift count below 64 (no UB).
+    uint64_t Bit = 1ULL << countTrailingZeros(Val);
+    if (const char *PropName = ApplePropertyString(Bit))
+      OS << PropName;
+    else
+      OS << format("DW_APPLE_PROPERTY_0x%" PRIx64, Bit);
+    if ((Val ^= Bit) != 0) // Clear the handled bit; separate if more remain.
+      OS << ", ";
+  }
+  OS << ")";
+}
+
+// Print every range in Ranges on its own line, indented by Indent columns, as
+// "[0x<low> - 0x<high>)"; bounds are zero-padded to 2*AddressSize digits.
+static void dumpRanges(raw_ostream &OS, const DWARFAddressRangesVector& Ranges,
+                       unsigned AddressSize, unsigned Indent) {
+  const unsigned HexDigits = AddressSize * 2;
+  for (const auto &Entry : Ranges) {
+    OS << '\n';
+    OS.indent(Indent);
+    OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")",
+                 HexDigits, Entry.first,
+                 HexDigits, Entry.second);
+  }
+}
+
 void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
-                                               const DWARFUnit *u,
+                                               DWARFUnit *u,
                                                uint32_t *offset_ptr,
                                                uint16_t attr, uint16_t form,
                                                unsigned indent) const {
-  OS << "            ";
+  const char BaseIndent[] = "            ";
+  OS << BaseIndent;
   OS.indent(indent+2);
   const char *attrString = AttributeString(attr);
   if (attrString)
@@ -86,7 +127,48 @@
     return;
 
   OS << "\t(";
-  formValue.dump(OS, u);
+  
+  const char *Name = nullptr;
+  std::string File;
+  if (attr == DW_AT_decl_file || attr == DW_AT_call_file) {
+    if (const auto *LT = u->getContext().getLineTableForUnit(u))
+      if (LT->getFileNameByIndex(
+             formValue.getAsUnsignedConstant().getValue(),
+             u->getCompilationDir(),
+             DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, File)) {
+        File = '"' + File + '"';
+        Name = File.c_str();
+      }
+  } else if (Optional<uint64_t> Val = formValue.getAsUnsignedConstant())
+    Name = AttributeValueString(attr, *Val);
+
+  if (Name) {
+    OS << Name;
+  } else if (attr == DW_AT_decl_line || attr == DW_AT_call_line) {
+    OS << *formValue.getAsUnsignedConstant();
+  } else {
+    formValue.dump(OS, u);
+  }
+
+  // We have dumped the attribute raw value. For some attributes
+  // having both the raw value and the pretty-printed value is
+  // interesting. These attributes are handled below.
+  if ((attr == DW_AT_specification || attr == DW_AT_abstract_origin) &&
+      // The signature references aren't handled.
+      formValue.getForm() != DW_FORM_ref_sig8) {
+    uint32_t Ref = formValue.getAsReference(u).getValue();
+    DWARFDebugInfoEntryMinimal DIE;
+    if (const DWARFUnit *RefU = findUnitAndExtractFast(DIE, u, &Ref))
+      if (const char *Ref = DIE.getName(RefU, DINameKind::LinkageName))
+        OS << " \"" << Ref << '\"';
+  } else if (attr == DW_AT_APPLE_property_attribute) {
+    if (Optional<uint64_t> OptVal = formValue.getAsUnsignedConstant())
+      dumpApplePropertyAttribute(OS, *OptVal);
+  } else if (attr == DW_AT_ranges) {
+    dumpRanges(OS, getAddressRanges(u), u->getAddressByteSize(),
+               sizeof(BaseIndent)+indent+4);
+  }
+
   OS << ")\n";
 }
 
@@ -284,11 +366,19 @@
 
 const char *
 DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U,
-                                              FunctionNameKind Kind) const {
-  if (!isSubroutineDIE() || Kind == FunctionNameKind::None)
+                                              DINameKind Kind) const {
+  if (!isSubroutineDIE())
+    return nullptr;
+  return getName(U, Kind);
+}
+
+const char *
+DWARFDebugInfoEntryMinimal::getName(const DWARFUnit *U,
+                                    DINameKind Kind) const {
+  if (Kind == DINameKind::None)
     return nullptr;
   // Try to get mangled name only if it was asked for.
-  if (Kind == FunctionNameKind::LinkageName) {
+  if (Kind == DINameKind::LinkageName) {
     if (const char *name =
             getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, nullptr))
       return name;
@@ -303,8 +393,8 @@
       getAttributeValueAsReference(U, DW_AT_specification, -1U);
   if (spec_ref != -1U) {
     DWARFDebugInfoEntryMinimal spec_die;
-    if (spec_die.extractFast(U, &spec_ref)) {
-      if (const char *name = spec_die.getSubroutineName(U, Kind))
+    if (const DWARFUnit *RefU = findUnitAndExtractFast(spec_die, U, &spec_ref)) {
+      if (const char *name = spec_die.getName(RefU, Kind))
         return name;
     }
   }
@@ -313,8 +403,9 @@
       getAttributeValueAsReference(U, DW_AT_abstract_origin, -1U);
   if (abs_origin_ref != -1U) {
     DWARFDebugInfoEntryMinimal abs_origin_die;
-    if (abs_origin_die.extractFast(U, &abs_origin_ref)) {
-      if (const char *name = abs_origin_die.getSubroutineName(U, Kind))
+    if (const DWARFUnit *RefU = findUnitAndExtractFast(abs_origin_die, U,
+                                                       &abs_origin_ref)) {
+      if (const char *name = abs_origin_die.getName(RefU, Kind))
         return name;
     }
   }

diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
index cc58eb6..7e7efb9 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.h
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
-#define LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGINFOENTRY_H
 
 #include "DWARFAbbreviationDeclaration.h"
 #include "DWARFDebugRangeList.h"
@@ -38,9 +38,9 @@
   DWARFDebugInfoEntryMinimal()
     : Offset(0), SiblingIdx(0), AbbrevDecl(nullptr) {}
 
-  void dump(raw_ostream &OS, const DWARFUnit *u, unsigned recurseDepth,
+  void dump(raw_ostream &OS, DWARFUnit *u, unsigned recurseDepth,
             unsigned indent = 0) const;
-  void dumpAttribute(raw_ostream &OS, const DWARFUnit *u, uint32_t *offset_ptr,
+  void dumpAttribute(raw_ostream &OS, DWARFUnit *u, uint32_t *offset_ptr,
                      uint16_t attr, uint16_t form, unsigned indent = 0) const;
 
   /// Extracts a debug info entry, which is a child of a given unit,
@@ -125,9 +125,12 @@
   /// returns its mangled name (or short name, if mangled is missing).
   /// This name may be fetched from specification or abstract origin
   /// for this subprogram. Returns null if no name is found.
-  const char *
-  getSubroutineName(const DWARFUnit *U,
-                    DILineInfoSpecifier::FunctionNameKind Kind) const;
+  const char *getSubroutineName(const DWARFUnit *U, DINameKind Kind) const;
+
+  /// Return the DIE name resolving DW_AT_specification or
+  /// DW_AT_abstract_origin references if necessary.
+  /// Returns null if no name is found.
+  const char *getName(const DWARFUnit *U, DINameKind Kind) const;
 
   /// Retrieves values of DW_AT_call_file, DW_AT_call_line and
   /// DW_AT_call_column from DIE (or zeroes if they are missing).

diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
index ce87635..a6ee461 100644
--- a/lib/DebugInfo/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARFDebugLine.cpp

@@ -644,6 +644,7 @@
 
 bool
 DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
+                                              const char *CompDir,
                                               FileLineInfoKind Kind,
                                               std::string &Result) const {
   if (FileIndex == 0 || FileIndex > Prologue.FileNames.size() ||
@@ -656,15 +657,42 @@
     Result = FileName;
     return true;
   }
+
   SmallString<16> FilePath;
   uint64_t IncludeDirIndex = Entry.DirIdx;
+  const char *IncludeDir = "";
   // Be defensive about the contents of Entry.
   if (IncludeDirIndex > 0 &&
-      IncludeDirIndex <= Prologue.IncludeDirectories.size()) {
-    const char *IncludeDir = Prologue.IncludeDirectories[IncludeDirIndex - 1];
-    sys::path::append(FilePath, IncludeDir);
-  }
-  sys::path::append(FilePath, FileName);
+      IncludeDirIndex <= Prologue.IncludeDirectories.size())
+    IncludeDir = Prologue.IncludeDirectories[IncludeDirIndex - 1];
+
+  // We may still need to append compilation directory of compile unit.
+  // We know that FileName is not absolute, the only way to have an
+  // absolute path at this point would be if IncludeDir is absolute.
+  if (CompDir && Kind == FileLineInfoKind::AbsoluteFilePath &&
+      sys::path::is_relative(IncludeDir))
+    sys::path::append(FilePath, CompDir);
+
+  // sys::path::append skips empty strings.
+  sys::path::append(FilePath, IncludeDir, FileName);
   Result = FilePath.str();
   return true;
 }
+
+bool
+DWARFDebugLine::LineTable::getFileLineInfoForAddress(uint64_t Address,
+                                                     const char *CompDir,
+                                                     FileLineInfoKind Kind,
+                                                     DILineInfo &Result) const {
+  // Locate the row for Address; a lookupAddress result of -1U is a failure.
+  const uint32_t Idx = lookupAddress(Address);
+  if (Idx == -1U)
+    return false;
+  // Resolve the row's file name first; Line/Column are set only on success.
+  const auto &Entry = Rows[Idx];
+  if (!getFileNameByIndex(Entry.File, CompDir, Kind, Result.FileName))
+    return false;
+  Result.Line = Entry.Line;
+  Result.Column = Entry.Column;
+  return true;
+}

diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h
index c7b7ec2..7a6f1bd 100644
--- a/lib/DebugInfo/DWARFDebugLine.h
+++ b/lib/DebugInfo/DWARFDebugLine.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFDEBUGLINE_H
-#define LLVM_DEBUGINFO_DWARFDEBUGLINE_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGLINE_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGLINE_H
 
 #include "DWARFRelocMap.h"
 #include "llvm/DebugInfo/DIContext.h"
@@ -179,10 +179,16 @@
 
     // Extracts filename by its index in filename table in prologue.
     // Returns true on success.
-    bool getFileNameByIndex(uint64_t FileIndex,
+    bool getFileNameByIndex(uint64_t FileIndex, const char *CompDir,
                             DILineInfoSpecifier::FileLineInfoKind Kind,
                             std::string &Result) const;
 
+    // Fills the Result argument with the file and line information
+    // corresponding to Address. Returns true on success.
+    bool getFileLineInfoForAddress(uint64_t Address, const char *CompDir, 
+                                   DILineInfoSpecifier::FileLineInfoKind Kind,
+                                   DILineInfo &Result) const;
+
     void dump(raw_ostream &OS) const;
     void clear();
 

diff --git a/lib/DebugInfo/DWARFDebugLoc.h b/lib/DebugInfo/DWARFDebugLoc.h
index 663acbb4..50110b3 100644
--- a/lib/DebugInfo/DWARFDebugLoc.h
+++ b/lib/DebugInfo/DWARFDebugLoc.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFDEBUGLOC_H
-#define LLVM_DEBUGINFO_DWARFDEBUGLOC_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGLOC_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGLOC_H
 
 #include "DWARFRelocMap.h"
 #include "llvm/ADT/SmallVector.h"

diff --git a/lib/DebugInfo/DWARFDebugRangeList.h b/lib/DebugInfo/DWARFDebugRangeList.h
index 587b550..4ee3bda 100644
--- a/lib/DebugInfo/DWARFDebugRangeList.h
+++ b/lib/DebugInfo/DWARFDebugRangeList.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
-#define LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGRANGELIST_H
+#define LLVM_LIB_DEBUGINFO_DWARFDEBUGRANGELIST_H
 
 #include "llvm/Support/DataExtractor.h"
 #include <vector>

diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
index 8d0f966..69b9771 100644
--- a/lib/DebugInfo/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARFFormValue.cpp

@@ -139,6 +139,8 @@
     switch (Form) {
     case DW_FORM_addr:
     case DW_FORM_ref_addr: {
+      if (!cu)
+        return false;
       uint16_t AddrSize =
           (Form == DW_FORM_addr)
               ? cu->getAddressByteSize()
@@ -179,8 +181,10 @@
       break;
     case DW_FORM_data4:
     case DW_FORM_ref4: {
-      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr);
       Value.uval = data.getU32(offset_ptr);
+      if (!cu)
+        break;
+      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr-4);
       if (AI != cu->getRelocMap()->end())
         Value.uval += AI->second.second;
       break;
@@ -193,13 +197,12 @@
       Value.sval = data.getSLEB128(offset_ptr);
       break;
     case DW_FORM_strp: {
-      RelocAddrMap::const_iterator AI
-        = cu->getRelocMap()->find(*offset_ptr);
-      if (AI != cu->getRelocMap()->end()) {
-        const std::pair<uint8_t, int64_t> &R = AI->second;
-        Value.uval = data.getU32(offset_ptr) + R.second;
-      } else
-        Value.uval = data.getU32(offset_ptr);
+      Value.uval = data.getU32(offset_ptr);
+      if (!cu)
+        break;
+      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr-4);
+      if (AI != cu->getRelocMap()->end())
+        Value.uval += AI->second.second;
       break;
     }
     case DW_FORM_udata:
@@ -215,13 +218,12 @@
       break;
     case DW_FORM_sec_offset: {
       // FIXME: This is 64-bit for DWARF64.
-      RelocAddrMap::const_iterator AI
-        = cu->getRelocMap()->find(*offset_ptr);
-      if (AI != cu->getRelocMap()->end()) {
-        const std::pair<uint8_t, int64_t> &R = AI->second;
-        Value.uval = data.getU32(offset_ptr) + R.second;
-      } else
-        Value.uval = data.getU32(offset_ptr);
+      Value.uval = data.getU32(offset_ptr);
+      if (!cu)
+        break;
+      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr-4);
+      if (AI != cu->getRelocMap()->end())
+        Value.uval +=  AI->second.second;
       break;
     }
     case DW_FORM_flag_present:
@@ -360,8 +362,6 @@
 
 void
 DWARFFormValue::dump(raw_ostream &OS, const DWARFUnit *cu) const {
-  DataExtractor debug_str_data(cu->getStringSection(), true, 0);
-  DataExtractor debug_str_offset_data(cu->getStringOffsetSection(), true, 0);
   uint64_t uvalue = Value.uval;
   bool cu_relative_offset = false;
 
@@ -543,7 +543,15 @@
 }
 
 Optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const {
-  if (!isFormClass(FC_Constant) || Form == DW_FORM_sdata)
+  if ((!isFormClass(FC_Constant) && !isFormClass(FC_Flag))
+      || Form == DW_FORM_sdata)
     return None;
   return Value.uval;
 }
+
+Optional<ArrayRef<uint8_t>> DWARFFormValue::getAsBlock() const {
+  if (!isFormClass(FC_Block) && !isFormClass(FC_Exprloc))
+    return None;
+  return ArrayRef<uint8_t>(Value.data, Value.uval);
+}
+

diff --git a/lib/DebugInfo/DWARFRelocMap.h b/lib/DebugInfo/DWARFRelocMap.h
index 6929e36..d7fe303 100644
--- a/lib/DebugInfo/DWARFRelocMap.h
+++ b/lib/DebugInfo/DWARFRelocMap.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFRELOCMAP_H
-#define LLVM_DEBUGINFO_DWARFRELOCMAP_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFRELOCMAP_H
+#define LLVM_LIB_DEBUGINFO_DWARFRELOCMAP_H
 
 #include "llvm/ADT/DenseMap.h"
 
@@ -18,5 +18,5 @@
 
 } // namespace llvm
 
-#endif // LLVM_DEBUGINFO_DWARFRELOCMAP_H
+#endif
 

diff --git a/lib/DebugInfo/DWARFSection.h b/lib/DebugInfo/DWARFSection.h
new file mode 100644
index 0000000..3aaf0ff
--- /dev/null
+++ b/lib/DebugInfo/DWARFSection.h

@@ -0,0 +1,24 @@
+//===-- DWARFSection.h ------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_DEBUGINFO_DWARFSECTION_H
+#define LLVM_LIB_DEBUGINFO_DWARFSECTION_H
+
+#include "DWARFRelocMap.h"
+
+namespace llvm {
+
+struct DWARFSection {
+  StringRef Data;
+  RelocAddrMap Relocs;
+};
+
+}
+
+#endif

diff --git a/lib/DebugInfo/DWARFTypeUnit.h b/lib/DebugInfo/DWARFTypeUnit.h
index cf773b8..7471b5a 100644
--- a/lib/DebugInfo/DWARFTypeUnit.h
+++ b/lib/DebugInfo/DWARFTypeUnit.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFTYPEUNIT_H
-#define LLVM_DEBUGINFO_DWARFTYPEUNIT_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFTYPEUNIT_H
+#define LLVM_LIB_DEBUGINFO_DWARFTYPEUNIT_H
 
 #include "DWARFUnit.h"
 
@@ -19,10 +19,11 @@
   uint64_t TypeHash;
   uint32_t TypeOffset;
 public:
-  DWARFTypeUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef RS,
-                StringRef SS, StringRef SOS, StringRef AOS,
-                const RelocAddrMap *M, bool LE)
-      : DWARFUnit(DA, IS, RS, SS, SOS, AOS, M, LE) {}
+  DWARFTypeUnit(DWARFContext &Context, const DWARFSection &Section,
+                const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                StringRef SOS, StringRef AOS, bool LE,
+                const DWARFUnitSectionBase &UnitSection)
+      : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LE, UnitSection) {}
   uint32_t getHeaderSize() const override {
     return DWARFUnit::getHeaderSize() + 12;
   }

diff --git a/lib/DebugInfo/DWARFUnit.cpp b/lib/DebugInfo/DWARFUnit.cpp
index 39d0a0f..82c4529 100644
--- a/lib/DebugInfo/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARFUnit.cpp

@@ -17,12 +17,26 @@
 using namespace llvm;
 using namespace dwarf;
 
-DWARFUnit::DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef RS,
-                     StringRef SS, StringRef SOS, StringRef AOS,
-                     const RelocAddrMap *M, bool LE)
-    : Abbrev(DA), InfoSection(IS), RangeSection(RS), StringSection(SS),
-      StringOffsetSection(SOS), AddrOffsetSection(AOS), RelocMap(M),
-      isLittleEndian(LE) {
+void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
+  parseImpl(C, Section, C.getDebugAbbrev(), C.getRangeSection(),
+            C.getStringSection(), StringRef(), C.getAddrSection(),
+            C.isLittleEndian());
+}
+
+void DWARFUnitSectionBase::parseDWO(DWARFContext &C,
+                                    const DWARFSection &DWOSection) {
+  parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), C.getRangeDWOSection(),
+            C.getStringDWOSection(), C.getStringOffsetDWOSection(),
+            C.getAddrSection(), C.isLittleEndian());
+}
+
+DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
+                     const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                     StringRef SOS, StringRef AOS, bool LE,
+                     const DWARFUnitSectionBase &UnitSection)
+    : Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS),
+      StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
+      isLittleEndian(LE), UnitSection(UnitSection) {
   clear();
 }
 
@@ -235,10 +249,14 @@
   return DieArray.size();
 }
 
-DWARFUnit::DWOHolder::DWOHolder(object::ObjectFile *DWOFile)
-    : DWOFile(DWOFile),
-      DWOContext(cast<DWARFContext>(DIContext::getDWARFContext(DWOFile))),
-      DWOU(nullptr) {
+DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath)
+    : DWOFile(), DWOContext(), DWOU(nullptr) {
+  auto Obj = object::ObjectFile::createObjectFile(DWOPath);
+  if (!Obj)
+    return;
+  DWOFile = std::move(Obj.get());
+  DWOContext.reset(
+      cast<DWARFContext>(DIContext::getDWARFContext(*DWOFile.getBinary())));
   if (DWOContext->getNumDWOCompileUnits() > 0)
     DWOU = DWOContext->getDWOCompileUnitAtIndex(0);
 }
@@ -260,12 +278,7 @@
     sys::path::append(AbsolutePath, CompilationDir);
   }
   sys::path::append(AbsolutePath, DWOFileName);
-  ErrorOr<object::ObjectFile *> DWOFile =
-      object::ObjectFile::createObjectFile(AbsolutePath);
-  if (!DWOFile)
-    return false;
-  // Reset DWOHolder.
-  DWO.reset(new DWOHolder(DWOFile.get()));
+  DWO = llvm::make_unique<DWOHolder>(AbsolutePath);
   DWARFUnit *DWOCU = DWO->getUnit();
   // Verify that compile unit in .dwo file is valid.
   if (!DWOCU || DWOCU->getDWOId() != getDWOId()) {

diff --git a/lib/DebugInfo/DWARFUnit.h b/lib/DebugInfo/DWARFUnit.h
index 471da36..786f00f 100644
--- a/lib/DebugInfo/DWARFUnit.h
+++ b/lib/DebugInfo/DWARFUnit.h

@@ -7,13 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_DWARFUNIT_H
-#define LLVM_DEBUGINFO_DWARFUNIT_H
+#ifndef LLVM_LIB_DEBUGINFO_DWARFUNIT_H
+#define LLVM_LIB_DEBUGINFO_DWARFUNIT_H
 
 #include "DWARFDebugAbbrev.h"
 #include "DWARFDebugInfoEntry.h"
 #include "DWARFDebugRangeList.h"
 #include "DWARFRelocMap.h"
+#include "DWARFSection.h"
 #include <vector>
 
 namespace llvm {
@@ -22,21 +23,96 @@
 class ObjectFile;
 }
 
+class DWARFContext;
 class DWARFDebugAbbrev;
+class DWARFUnit;
 class StringRef;
 class raw_ostream;
 
+/// Base class for all DWARFUnitSection classes. This provides the
+/// functionality common to all unit types.
+class DWARFUnitSectionBase {
+public:
+  /// Returns the Unit that contains the given section offset in the
+  /// same section this Unit originated from.
+  virtual DWARFUnit *getUnitForOffset(uint32_t Offset) const = 0;
+
+  void parse(DWARFContext &C, const DWARFSection &Section);
+  void parseDWO(DWARFContext &C, const DWARFSection &DWOSection);
+
+protected:
+  virtual void parseImpl(DWARFContext &Context, const DWARFSection &Section,
+                         const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                         StringRef SOS, StringRef AOS, bool isLittleEndian) = 0;
+
+  ~DWARFUnitSectionBase() {}
+};
+
+/// Concrete instance of DWARFUnitSection, specialized for one Unit type.
+template<typename UnitType>
+class DWARFUnitSection final : public SmallVector<std::unique_ptr<UnitType>, 1>,
+                               public DWARFUnitSectionBase {
+
+  struct UnitOffsetComparator {
+    bool operator()(uint32_t LHS,
+                    const std::unique_ptr<UnitType> &RHS) const {
+      return LHS < RHS->getNextUnitOffset();
+    }
+  };
+
+  bool Parsed;
+
+public:
+  DWARFUnitSection() : Parsed(false) {}
+  DWARFUnitSection(DWARFUnitSection &&DUS) :
+    SmallVector<std::unique_ptr<UnitType>, 1>(std::move(DUS)), Parsed(DUS.Parsed) {}
+
+  typedef llvm::SmallVectorImpl<std::unique_ptr<UnitType>> UnitVector;
+  typedef typename UnitVector::iterator iterator;
+  typedef llvm::iterator_range<typename UnitVector::iterator> iterator_range;
+
+  UnitType *getUnitForOffset(uint32_t Offset) const override {
+    auto *CU = std::upper_bound(this->begin(), this->end(), Offset,
+                                UnitOffsetComparator());
+    if (CU != this->end())
+      return CU->get();
+    return nullptr;
+  }
+
+private:
+  void parseImpl(DWARFContext &Context, const DWARFSection &Section,
+                 const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+                 StringRef SOS, StringRef AOS, bool LE) override {
+    if (Parsed)
+      return;
+    DataExtractor Data(Section.Data, LE, 0);
+    uint32_t Offset = 0;
+    while (Data.isValidOffset(Offset)) {
+      auto U = llvm::make_unique<UnitType>(Context, Section, DA, RS, SS, SOS,
+                                           AOS, LE, *this);
+      if (!U->extract(Data, &Offset))
+        break;
+      this->push_back(std::move(U));
+      Offset = this->back()->getNextUnitOffset();
+    }
+    Parsed = true;
+  }
+};
+
 class DWARFUnit {
+  DWARFContext &Context;
+  // Section containing this DWARFUnit.
+  const DWARFSection &InfoSection;
+
   const DWARFDebugAbbrev *Abbrev;
-  StringRef InfoSection;
   StringRef RangeSection;
   uint32_t RangeSectionBase;
   StringRef StringSection;
   StringRef StringOffsetSection;
   StringRef AddrOffsetSection;
   uint32_t AddrOffsetSectionBase;
-  const RelocAddrMap *RelocMap;
   bool isLittleEndian;
+  const DWARFUnitSectionBase &UnitSection;
 
   uint32_t Offset;
   uint32_t Length;
@@ -48,11 +124,11 @@
   std::vector<DWARFDebugInfoEntryMinimal> DieArray;
 
   class DWOHolder {
-    std::unique_ptr<object::ObjectFile> DWOFile;
+    object::OwningBinary<object::ObjectFile> DWOFile;
     std::unique_ptr<DWARFContext> DWOContext;
     DWARFUnit *DWOU;
   public:
-    DWOHolder(object::ObjectFile *DWOFile);
+    DWOHolder(StringRef DWOPath);
     DWARFUnit *getUnit() const { return DWOU; }
   };
   std::unique_ptr<DWOHolder> DWO;
@@ -63,12 +139,15 @@
   virtual uint32_t getHeaderSize() const { return 11; }
 
 public:
-  DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef RS,
-            StringRef SS, StringRef SOS, StringRef AOS, const RelocAddrMap *M,
-            bool LE);
+  DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
+            const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
+            StringRef SOS, StringRef AOS, bool LE,
+            const DWARFUnitSectionBase &UnitSection);
 
   virtual ~DWARFUnit();
 
+  DWARFContext& getContext() const { return Context; }
+
   StringRef getStringSection() const { return StringSection; }
   StringRef getStringOffsetSection() const { return StringOffsetSection; }
   void setAddrOffsetSection(StringRef AOS, uint32_t Base) {
@@ -85,13 +164,13 @@
   bool getStringOffsetSectionItem(uint32_t Index, uint32_t &Result) const;
 
   DataExtractor getDebugInfoExtractor() const {
-    return DataExtractor(InfoSection, isLittleEndian, AddrSize);
+    return DataExtractor(InfoSection.Data, isLittleEndian, AddrSize);
   }
   DataExtractor getStringExtractor() const {
     return DataExtractor(StringSection, false, 0);
   }
 
-  const RelocAddrMap *getRelocMap() const { return RelocMap; }
+  const RelocAddrMap *getRelocMap() const { return &InfoSection.Relocs; }
 
   bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
 
@@ -131,6 +210,9 @@
   /// chain is valid as long as parsed compile unit DIEs are not cleared.
   DWARFDebugInfoEntryInlinedChain getInlinedChainForAddress(uint64_t Address);
 
+  /// getUnitSection - Return the DWARFUnitSection containing this unit.
+  const DWARFUnitSectionBase &getUnitSection() const { return UnitSection; }
+
 private:
   /// Size in bytes of the .debug_info data associated with this compile unit.
   size_t getDebugInfoSize() const { return Length + 4 - getHeaderSize(); }

diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt
index 3102c7b..fae5bb9 100644
--- a/lib/ExecutionEngine/CMakeLists.txt
+++ b/lib/ExecutionEngine/CMakeLists.txt

@@ -3,12 +3,12 @@
 add_llvm_library(LLVMExecutionEngine
   ExecutionEngine.cpp
   ExecutionEngineBindings.cpp
+  JITEventListener.cpp
   RTDyldMemoryManager.cpp
   TargetSelect.cpp
   )
 
 add_subdirectory(Interpreter)
-add_subdirectory(JIT)
 add_subdirectory(MCJIT)
 add_subdirectory(RuntimeDyld)
 

diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index b0e985d..5a6d656 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp

@@ -16,7 +16,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
 #include "llvm/ExecutionEngine/ObjectCache.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -24,6 +24,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/Object/Archive.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/DynamicLibrary.h"
@@ -47,22 +48,13 @@
 void ObjectBuffer::anchor() {}
 void ObjectBufferStream::anchor() {}
 
-ExecutionEngine *(*ExecutionEngine::JITCtor)(
-  Module *M,
-  std::string *ErrorStr,
-  JITMemoryManager *JMM,
-  bool GVsWithCode,
-  TargetMachine *TM) = nullptr;
 ExecutionEngine *(*ExecutionEngine::MCJITCtor)(
-  Module *M,
-  std::string *ErrorStr,
-  RTDyldMemoryManager *MCJMM,
-  bool GVsWithCode,
-  TargetMachine *TM) = nullptr;
-ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M,
+    std::unique_ptr<Module> M, std::string *ErrorStr,
+    RTDyldMemoryManager *MCJMM, std::unique_ptr<TargetMachine> TM) = nullptr;
+ExecutionEngine *(*ExecutionEngine::InterpCtor)(std::unique_ptr<Module> M,
                                                 std::string *ErrorStr) =nullptr;
 
-ExecutionEngine::ExecutionEngine(Module *M)
+ExecutionEngine::ExecutionEngine(std::unique_ptr<Module> M)
   : EEState(*this),
     LazyFunctionCreator(nullptr) {
   CompilingLazily         = false;
@@ -77,14 +69,12 @@
   VerifyModules = false;
 #endif
 
-  Modules.push_back(M);
   assert(M && "Module is null?");
+  Modules.push_back(std::move(M));
 }
 
 ExecutionEngine::~ExecutionEngine() {
   clearAllGlobalMappings();
-  for (unsigned i = 0, e = Modules.size(); i != e; ++i)
-    delete Modules[i];
 }
 
 namespace {
@@ -101,8 +91,8 @@
     Type *ElTy = GV->getType()->getElementType();
     size_t GVSize = (size_t)TD.getTypeAllocSize(ElTy);
     void *RawMemory = ::operator new(
-      DataLayout::RoundUpAlignment(sizeof(GVMemoryBlock),
-                                   TD.getPreferredAlignment(GV))
+      RoundUpToAlignment(sizeof(GVMemoryBlock),
+                         TD.getPreferredAlignment(GV))
       + GVSize);
     new(RawMemory) GVMemoryBlock(GV);
     return static_cast<char*>(RawMemory) + sizeof(GVMemoryBlock);
@@ -126,11 +116,20 @@
   llvm_unreachable("ExecutionEngine subclass doesn't implement addObjectFile.");
 }
 
+void
+ExecutionEngine::addObjectFile(object::OwningBinary<object::ObjectFile> O) {
+  llvm_unreachable("ExecutionEngine subclass doesn't implement addObjectFile.");
+}
+
+void ExecutionEngine::addArchive(object::OwningBinary<object::Archive> A) {
+  llvm_unreachable("ExecutionEngine subclass doesn't implement addArchive.");
+}
+
 bool ExecutionEngine::removeModule(Module *M) {
-  for(SmallVectorImpl<Module *>::iterator I = Modules.begin(),
-        E = Modules.end(); I != E; ++I) {
-    Module *Found = *I;
+  for (auto I = Modules.begin(), E = Modules.end(); I != E; ++I) {
+    Module *Found = I->get();
     if (Found == M) {
+      I->release();
       Modules.erase(I);
       clearGlobalMappingsFromModule(M);
       return true;
@@ -254,19 +253,9 @@
 
 namespace {
 class ArgvArray {
-  char *Array;
-  std::vector<char*> Values;
+  std::unique_ptr<char[]> Array;
+  std::vector<std::unique_ptr<char[]>> Values;
 public:
-  ArgvArray() : Array(nullptr) {}
-  ~ArgvArray() { clear(); }
-  void clear() {
-    delete[] Array;
-    Array = nullptr;
-    for (size_t I = 0, E = Values.size(); I != E; ++I) {
-      delete[] Values[I];
-    }
-    Values.clear();
-  }
   /// Turn a vector of strings into a nice argv style array of pointers to null
   /// terminated strings.
   void *reset(LLVMContext &C, ExecutionEngine *EE,
@@ -275,38 +264,39 @@
 }  // anonymous namespace
 void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE,
                        const std::vector<std::string> &InputArgv) {
-  clear();  // Free the old contents.
+  Values.clear();  // Free the old contents.
+  Values.reserve(InputArgv.size());
   unsigned PtrSize = EE->getDataLayout()->getPointerSize();
-  Array = new char[(InputArgv.size()+1)*PtrSize];
+  Array = make_unique<char[]>((InputArgv.size()+1)*PtrSize);
 
-  DEBUG(dbgs() << "JIT: ARGV = " << (void*)Array << "\n");
+  DEBUG(dbgs() << "JIT: ARGV = " << (void*)Array.get() << "\n");
   Type *SBytePtr = Type::getInt8PtrTy(C);
 
   for (unsigned i = 0; i != InputArgv.size(); ++i) {
     unsigned Size = InputArgv[i].size()+1;
-    char *Dest = new char[Size];
-    Values.push_back(Dest);
-    DEBUG(dbgs() << "JIT: ARGV[" << i << "] = " << (void*)Dest << "\n");
+    auto Dest = make_unique<char[]>(Size);
+    DEBUG(dbgs() << "JIT: ARGV[" << i << "] = " << (void*)Dest.get() << "\n");
 
-    std::copy(InputArgv[i].begin(), InputArgv[i].end(), Dest);
+    std::copy(InputArgv[i].begin(), InputArgv[i].end(), Dest.get());
     Dest[Size-1] = 0;
 
     // Endian safe: Array[i] = (PointerTy)Dest;
-    EE->StoreValueToMemory(PTOGV(Dest), (GenericValue*)(Array+i*PtrSize),
-                           SBytePtr);
+    EE->StoreValueToMemory(PTOGV(Dest.get()),
+                           (GenericValue*)(&Array[i*PtrSize]), SBytePtr);
+    Values.push_back(std::move(Dest));
   }
 
   // Null terminate it
   EE->StoreValueToMemory(PTOGV(nullptr),
-                         (GenericValue*)(Array+InputArgv.size()*PtrSize),
+                         (GenericValue*)(&Array[InputArgv.size()*PtrSize]),
                          SBytePtr);
-  return Array;
+  return Array.get();
 }
 
-void ExecutionEngine::runStaticConstructorsDestructors(Module *module,
+void ExecutionEngine::runStaticConstructorsDestructors(Module &module,
                                                        bool isDtors) {
   const char *Name = isDtors ? "llvm.global_dtors" : "llvm.global_ctors";
-  GlobalVariable *GV = module->getNamedGlobal(Name);
+  GlobalVariable *GV = module.getNamedGlobal(Name);
 
   // If this global has internal linkage, or if it has a use, then it must be
   // an old-style (llvmgcc3) static ctor with __main linked in and in use.  If
@@ -344,8 +334,8 @@
 
 void ExecutionEngine::runStaticConstructorsDestructors(bool isDtors) {
   // Execute global ctors/dtors for each module in the program.
-  for (unsigned i = 0, e = Modules.size(); i != e; ++i)
-    runStaticConstructorsDestructors(Modules[i], isDtors);
+  for (std::unique_ptr<Module> &M : Modules)
+    runStaticConstructorsDestructors(*M, isDtors);
 }
 
 #ifndef NDEBUG
@@ -406,68 +396,14 @@
   return runFunction(Fn, GVArgs).IntVal.getZExtValue();
 }
 
-ExecutionEngine *ExecutionEngine::create(Module *M,
-                                         bool ForceInterpreter,
-                                         std::string *ErrorStr,
-                                         CodeGenOpt::Level OptLevel,
-                                         bool GVsWithCode) {
-
-  EngineBuilder EB =
-      EngineBuilder(M)
-          .setEngineKind(ForceInterpreter ? EngineKind::Interpreter
-                                          : EngineKind::Either)
-          .setErrorStr(ErrorStr)
-          .setOptLevel(OptLevel)
-          .setAllocateGVsWithCode(GVsWithCode);
-
-  return EB.create();
-}
-
-/// createJIT - This is the factory method for creating a JIT for the current
-/// machine, it does not fall back to the interpreter.  This takes ownership
-/// of the module.
-ExecutionEngine *ExecutionEngine::createJIT(Module *M,
-                                            std::string *ErrorStr,
-                                            JITMemoryManager *JMM,
-                                            CodeGenOpt::Level OL,
-                                            bool GVsWithCode,
-                                            Reloc::Model RM,
-                                            CodeModel::Model CMM) {
-  if (!ExecutionEngine::JITCtor) {
-    if (ErrorStr)
-      *ErrorStr = "JIT has not been linked in.";
-    return nullptr;
-  }
-
-  // Use the defaults for extra parameters.  Users can use EngineBuilder to
-  // set them.
-  EngineBuilder EB(M);
-  EB.setEngineKind(EngineKind::JIT);
-  EB.setErrorStr(ErrorStr);
-  EB.setRelocationModel(RM);
-  EB.setCodeModel(CMM);
-  EB.setAllocateGVsWithCode(GVsWithCode);
-  EB.setOptLevel(OL);
-  EB.setJITMemoryManager(JMM);
-
-  // TODO: permit custom TargetOptions here
-  TargetMachine *TM = EB.selectTarget();
-  if (!TM || (ErrorStr && ErrorStr->length() > 0)) return nullptr;
-
-  return ExecutionEngine::JITCtor(M, ErrorStr, JMM, GVsWithCode, TM);
-}
-
 void EngineBuilder::InitEngine() {
   WhichEngine = EngineKind::Either;
   ErrorStr = nullptr;
   OptLevel = CodeGenOpt::Default;
   MCJMM = nullptr;
-  JMM = nullptr;
   Options = TargetOptions();
-  AllocateGVsWithCode = false;
   RelocModel = Reloc::Default;
   CMModel = CodeModel::JITDefault;
-  UseMCJIT = false;
 
 // IR module verification is enabled by default in debug builds, and disabled
 // by default in release builds.
@@ -485,13 +421,11 @@
   // to the function tells DynamicLibrary to load the program, not a library.
   if (sys::DynamicLibrary::LoadLibraryPermanently(nullptr, ErrorStr))
     return nullptr;
-
-  assert(!(JMM && MCJMM));
   
   // If the user specified a memory manager but didn't specify which engine to
   // create, we assume they only want the JIT, and we fail if they only want
   // the interpreter.
-  if (JMM || MCJMM) {
+  if (MCJMM) {
     if (WhichEngine & EngineKind::JIT)
       WhichEngine = EngineKind::JIT;
     else {
@@ -500,14 +434,6 @@
       return nullptr;
     }
   }
-  
-  if (MCJMM && ! UseMCJIT) {
-    if (ErrorStr)
-      *ErrorStr =
-        "Cannot create a legacy JIT with a runtime dyld memory "
-        "manager.";
-    return nullptr;
-  }
 
   // Unless the interpreter was explicitly selected or the JIT is not linked,
   // try making a JIT.
@@ -520,13 +446,9 @@
     }
 
     ExecutionEngine *EE = nullptr;
-    if (UseMCJIT && ExecutionEngine::MCJITCtor)
-      EE = ExecutionEngine::MCJITCtor(M, ErrorStr, MCJMM ? MCJMM : JMM,
-                                      AllocateGVsWithCode, TheTM.release());
-    else if (ExecutionEngine::JITCtor)
-      EE = ExecutionEngine::JITCtor(M, ErrorStr, JMM,
-                                    AllocateGVsWithCode, TheTM.release());
-
+    if (ExecutionEngine::MCJITCtor)
+      EE = ExecutionEngine::MCJITCtor(std::move(M), ErrorStr, MCJMM,
+                                      std::move(TheTM));
     if (EE) {
       EE->setVerifyModules(VerifyModules);
       return EE;
@@ -537,14 +459,13 @@
   // an interpreter instead.
   if (WhichEngine & EngineKind::Interpreter) {
     if (ExecutionEngine::InterpCtor)
-      return ExecutionEngine::InterpCtor(M, ErrorStr);
+      return ExecutionEngine::InterpCtor(std::move(M), ErrorStr);
     if (ErrorStr)
       *ErrorStr = "Interpreter has not been linked in.";
     return nullptr;
   }
 
-  if ((WhichEngine & EngineKind::JIT) && !ExecutionEngine::JITCtor &&
-      !ExecutionEngine::MCJITCtor) {
+  if ((WhichEngine & EngineKind::JIT) && !ExecutionEngine::MCJITCtor) {
     if (ErrorStr)
       *ErrorStr = "JIT has not been linked in.";
   }
@@ -890,9 +811,6 @@
       Result = PTOGV(getPointerToFunctionOrStub(const_cast<Function*>(F)));
     else if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
       Result = PTOGV(getOrEmitGlobalVariable(const_cast<GlobalVariable*>(GV)));
-    else if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
-      Result = PTOGV(getPointerToBasicBlock(const_cast<BasicBlock*>(
-                                                        BA->getBasicBlock())));
     else
       llvm_unreachable("Unknown constant pointer type!");
     break;

diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 6ff1e7a..58271df 100644
--- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp

@@ -27,14 +27,6 @@
 // Wrapping the C bindings types.
 DEFINE_SIMPLE_CONVERSION_FUNCTIONS(GenericValue, LLVMGenericValueRef)
 
-inline TargetLibraryInfo *unwrap(LLVMTargetLibraryInfoRef P) {
-  return reinterpret_cast<TargetLibraryInfo*>(P);
-}
-
-inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfo *P) {
-  TargetLibraryInfo *X = const_cast<TargetLibraryInfo*>(P);
-  return reinterpret_cast<LLVMTargetLibraryInfoRef>(X);
-}
 
 inline LLVMTargetMachineRef wrap(const TargetMachine *P) {
   return
@@ -110,7 +102,7 @@
                                             LLVMModuleRef M,
                                             char **OutError) {
   std::string Error;
-  EngineBuilder builder(unwrap(M));
+  EngineBuilder builder(std::unique_ptr<Module>(unwrap(M)));
   builder.setEngineKind(EngineKind::Either)
          .setErrorStr(&Error);
   if (ExecutionEngine *EE = builder.create()){
@@ -125,7 +117,7 @@
                                         LLVMModuleRef M,
                                         char **OutError) {
   std::string Error;
-  EngineBuilder builder(unwrap(M));
+  EngineBuilder builder(std::unique_ptr<Module>(unwrap(M)));
   builder.setEngineKind(EngineKind::Interpreter)
          .setErrorStr(&Error);
   if (ExecutionEngine *Interp = builder.create()) {
@@ -141,7 +133,7 @@
                                         unsigned OptLevel,
                                         char **OutError) {
   std::string Error;
-  EngineBuilder builder(unwrap(M));
+  EngineBuilder builder(std::unique_ptr<Module>(unwrap(M)));
   builder.setEngineKind(EngineKind::JIT)
          .setErrorStr(&Error)
          .setOptLevel((CodeGenOpt::Level)OptLevel);
@@ -189,10 +181,9 @@
   targetOptions.EnableFastISel = options.EnableFastISel;
 
   std::string Error;
-  EngineBuilder builder(unwrap(M));
+  EngineBuilder builder(std::unique_ptr<Module>(unwrap(M)));
   builder.setEngineKind(EngineKind::JIT)
          .setErrorStr(&Error)
-         .setUseMCJIT(true)
          .setOptLevel((CodeGenOpt::Level)options.OptLevel)
          .setCodeModel(unwrap(options.CodeModel))
          .setTargetOptions(targetOptions);
@@ -275,11 +266,10 @@
 }
 
 void LLVMFreeMachineCodeForFunction(LLVMExecutionEngineRef EE, LLVMValueRef F) {
-  unwrap(EE)->freeMachineCodeForFunction(unwrap<Function>(F));
 }
 
 void LLVMAddModule(LLVMExecutionEngineRef EE, LLVMModuleRef M){
-  unwrap(EE)->addModule(unwrap(M));
+  unwrap(EE)->addModule(std::unique_ptr<Module>(unwrap(M)));
 }
 
 void LLVMAddModuleProvider(LLVMExecutionEngineRef EE, LLVMModuleProviderRef MP){
@@ -314,7 +304,7 @@
 
 void *LLVMRecompileAndRelinkFunction(LLVMExecutionEngineRef EE,
                                      LLVMValueRef Fn) {
-  return unwrap(EE)->recompileAndRelinkFunction(unwrap<Function>(Fn));
+  return nullptr;
 }
 
 LLVMTargetDataRef LLVMGetExecutionEngineTargetData(LLVMExecutionEngineRef EE) {

diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index 4e22a8b..b23ca88 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp

@@ -57,29 +57,11 @@
   ~IntelJITEventListener() {
   }
 
-  virtual void NotifyFunctionEmitted(const Function &F,
-                                     void *FnStart, size_t FnSize,
-                                     const EmittedFunctionDetails &Details);
-
-  virtual void NotifyFreeingMachineCode(void *OldPtr);
-
   virtual void NotifyObjectEmitted(const ObjectImage &Obj);
 
   virtual void NotifyFreeingObject(const ObjectImage &Obj);
 };
 
-static LineNumberInfo LineStartToIntelJITFormat(
-    uintptr_t StartAddress,
-    uintptr_t Address,
-    DebugLoc Loc) {
-  LineNumberInfo Result;
-
-  Result.Offset = Address - StartAddress;
-  Result.LineNumber = Loc.getLine();
-
-  return Result;
-}
-
 static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress,
                                                  uintptr_t Address,
                                                  DILineInfo Line) {
@@ -113,84 +95,10 @@
   return Result;
 }
 
-// Adds the just-emitted function to the symbol table.
-void IntelJITEventListener::NotifyFunctionEmitted(
-    const Function &F, void *FnStart, size_t FnSize,
-    const EmittedFunctionDetails &Details) {
-  iJIT_Method_Load FunctionMessage = FunctionDescToIntelJITFormat(*Wrapper,
-                                      F.getName().data(),
-                                      reinterpret_cast<uint64_t>(FnStart),
-                                      FnSize);
-
-  std::vector<LineNumberInfo> LineInfo;
-
-  if (!Details.LineStarts.empty()) {
-    // Now convert the line number information from the address/DebugLoc
-    // format in Details to the offset/lineno in Intel JIT API format.
-
-    LineInfo.reserve(Details.LineStarts.size() + 1);
-
-    DebugLoc FirstLoc = Details.LineStarts[0].Loc;
-    assert(!FirstLoc.isUnknown()
-           && "LineStarts should not contain unknown DebugLocs");
-
-    MDNode *FirstLocScope = FirstLoc.getScope(F.getContext());
-    DISubprogram FunctionDI = getDISubprogram(FirstLocScope);
-    if (FunctionDI.Verify()) {
-      FunctionMessage.source_file_name = const_cast<char*>(
-                                          Filenames.getFullPath(FirstLocScope));
-
-      LineNumberInfo FirstLine;
-      FirstLine.Offset = 0;
-      FirstLine.LineNumber = FunctionDI.getLineNumber();
-      LineInfo.push_back(FirstLine);
-    }
-
-    for (std::vector<EmittedFunctionDetails::LineStart>::const_iterator I =
-          Details.LineStarts.begin(), E = Details.LineStarts.end();
-          I != E; ++I) {
-      // This implementation ignores the DebugLoc filename because the Intel
-      // JIT API does not support multiple source files associated with a single
-      // JIT function
-      LineInfo.push_back(LineStartToIntelJITFormat(
-                          reinterpret_cast<uintptr_t>(FnStart),
-                          I->Address,
-                          I->Loc));
-
-      // If we have no file name yet for the function, use the filename from
-      // the first instruction that has one
-      if (FunctionMessage.source_file_name == 0) {
-        MDNode *scope = I->Loc.getScope(
-          Details.MF->getFunction()->getContext());
-        FunctionMessage.source_file_name = const_cast<char*>(
-                                                  Filenames.getFullPath(scope));
-      }
-    }
-
-    FunctionMessage.line_number_size = LineInfo.size();
-    FunctionMessage.line_number_table = &*LineInfo.begin();
-  } else {
-    FunctionMessage.line_number_size = 0;
-    FunctionMessage.line_number_table = 0;
-  }
-
-  Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
-                            &FunctionMessage);
-  MethodIDs[FnStart] = FunctionMessage.method_id;
-}
-
-void IntelJITEventListener::NotifyFreeingMachineCode(void *FnStart) {
-  MethodIDMap::iterator I = MethodIDs.find(FnStart);
-  if (I != MethodIDs.end()) {
-    Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_UNLOAD_START, &I->second);
-    MethodIDs.erase(I);
-  }
-}
-
 void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
   // Get the address of the object image for use as a unique identifier
   const void* ObjData = Obj.getData().data();
-  DIContext* Context = DIContext::getDWARFContext(Obj.getObjectFile());
+  DIContext* Context = DIContext::getDWARFContext(*Obj.getObjectFile());
   MethodAddressVector Functions;
 
   // Use symbol info to iterate functions in the object.

diff --git a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt
index 9c06fda..e36493e 100644
--- a/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt
+++ b/lib/ExecutionEngine/IntelJITEvents/LLVMBuild.txt

@@ -21,3 +21,4 @@
 type = OptionalLibrary
 name = IntelJITEvents
 parent = ExecutionEngine
+required_libraries = Core DebugInfo Support

diff --git a/lib/ExecutionEngine/Interpreter/CMakeLists.txt b/lib/ExecutionEngine/Interpreter/CMakeLists.txt
index 74df8f0..1aac3ac 100644
--- a/lib/ExecutionEngine/Interpreter/CMakeLists.txt
+++ b/lib/ExecutionEngine/Interpreter/CMakeLists.txt

@@ -13,7 +13,7 @@
   )
 
 if( LLVM_ENABLE_FFI )
-  target_link_libraries( LLVMInterpreter ${FFI_LIBRARY_PATH} )
+  target_link_libraries( LLVMInterpreter ${cmake_2_8_12_PRIVATE} ${FFI_LIBRARY_PATH} )
 endif()
 
 add_dependencies(LLVMInterpreter intrinsics_gen)

diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index 671bbee..b022101 100644
--- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp

@@ -28,6 +28,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
+#include "llvm/Support/UniqueLock.h"
 #include <cmath>
 #include <csignal>
 #include <cstdio>
@@ -51,7 +52,7 @@
 typedef GenericValue (*ExFunc)(FunctionType *,
                                const std::vector<GenericValue> &);
 static ManagedStatic<std::map<const Function *, ExFunc> > ExportedFunctions;
-static std::map<std::string, ExFunc> FuncNames;
+static ManagedStatic<std::map<std::string, ExFunc> > FuncNames;
 
 #ifdef USE_LIBFFI
 typedef void (*RawFunc)();
@@ -97,9 +98,9 @@
   ExtName += "_" + F->getName().str();
 
   sys::ScopedLock Writer(*FunctionsLock);
-  ExFunc FnPtr = FuncNames[ExtName];
+  ExFunc FnPtr = (*FuncNames)[ExtName];
   if (!FnPtr)
-    FnPtr = FuncNames["lle_X_" + F->getName().str()];
+    FnPtr = (*FuncNames)["lle_X_" + F->getName().str()];
   if (!FnPtr)  // Try calling a generic function... if it exists...
     FnPtr = (ExFunc)(intptr_t)
       sys::DynamicLibrary::SearchForAddressOfSymbol("lle_X_" +
@@ -248,14 +249,14 @@
                                      const std::vector<GenericValue> &ArgVals) {
   TheInterpreter = this;
 
-  FunctionsLock->acquire();
+  unique_lock<sys::Mutex> Guard(*FunctionsLock);
 
   // Do a lookup to see if the function is in our cache... this should just be a
   // deferred annotation!
   std::map<const Function *, ExFunc>::iterator FI = ExportedFunctions->find(F);
   if (ExFunc Fn = (FI == ExportedFunctions->end()) ? lookupFunction(F)
                                                    : FI->second) {
-    FunctionsLock->release();
+    Guard.unlock();
     return Fn(F->getFunctionType(), ArgVals);
   }
 
@@ -273,7 +274,7 @@
     RawFn = RF->second;
   }
 
-  FunctionsLock->release();
+  Guard.unlock();
 
   GenericValue Result;
   if (RawFn != 0 && ffiInvoke(RawFn, F, ArgVals, getDataLayout(), Result))
@@ -497,15 +498,15 @@
 
 void Interpreter::initializeExternalFunctions() {
   sys::ScopedLock Writer(*FunctionsLock);
-  FuncNames["lle_X_atexit"]       = lle_X_atexit;
-  FuncNames["lle_X_exit"]         = lle_X_exit;
-  FuncNames["lle_X_abort"]        = lle_X_abort;
+  (*FuncNames)["lle_X_atexit"]       = lle_X_atexit;
+  (*FuncNames)["lle_X_exit"]         = lle_X_exit;
+  (*FuncNames)["lle_X_abort"]        = lle_X_abort;
 
-  FuncNames["lle_X_printf"]       = lle_X_printf;
-  FuncNames["lle_X_sprintf"]      = lle_X_sprintf;
-  FuncNames["lle_X_sscanf"]       = lle_X_sscanf;
-  FuncNames["lle_X_scanf"]        = lle_X_scanf;
-  FuncNames["lle_X_fprintf"]      = lle_X_fprintf;
-  FuncNames["lle_X_memset"]       = lle_X_memset;
-  FuncNames["lle_X_memcpy"]       = lle_X_memcpy;
+  (*FuncNames)["lle_X_printf"]       = lle_X_printf;
+  (*FuncNames)["lle_X_sprintf"]      = lle_X_sprintf;
+  (*FuncNames)["lle_X_sscanf"]       = lle_X_sscanf;
+  (*FuncNames)["lle_X_scanf"]        = lle_X_scanf;
+  (*FuncNames)["lle_X_fprintf"]      = lle_X_fprintf;
+  (*FuncNames)["lle_X_memset"]       = lle_X_memset;
+  (*FuncNames)["lle_X_memcpy"]       = lle_X_memcpy;
 }

diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.cpp b/lib/ExecutionEngine/Interpreter/Interpreter.cpp
index 814efcc..8562981 100644
--- a/lib/ExecutionEngine/Interpreter/Interpreter.cpp
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.cpp

@@ -30,9 +30,10 @@
 
 extern "C" void LLVMLinkInInterpreter() { }
 
-/// create - Create a new interpreter object.  This can never fail.
+/// Create a new interpreter object.
 ///
-ExecutionEngine *Interpreter::create(Module *M, std::string* ErrStr) {
+ExecutionEngine *Interpreter::create(std::unique_ptr<Module> M,
+                                     std::string *ErrStr) {
   // Tell this Module to materialize everything and release the GVMaterializer.
   if (std::error_code EC = M->materializeAllPermanently()) {
     if (ErrStr)
@@ -41,15 +42,15 @@
     return nullptr;
   }
 
-  return new Interpreter(M);
+  return new Interpreter(std::move(M));
 }
 
 //===----------------------------------------------------------------------===//
 // Interpreter ctor - Initialize stuff
 //
-Interpreter::Interpreter(Module *M)
-  : ExecutionEngine(M), TD(M) {
-      
+Interpreter::Interpreter(std::unique_ptr<Module> M)
+  : ExecutionEngine(std::move(M)), TD(Modules.back().get()) {
+
   memset(&ExitValue.Untyped, 0, sizeof(ExitValue.Untyped));
   setDataLayout(&TD);
   // Initialize the "backend"

diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.h b/lib/ExecutionEngine/Interpreter/Interpreter.h
index 2145cde..2be9c59 100644
--- a/lib/ExecutionEngine/Interpreter/Interpreter.h
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLI_INTERPRETER_H
-#define LLI_INTERPRETER_H
+#ifndef LLVM_LIB_EXECUTIONENGINE_INTERPRETER_INTERPRETER_H
+#define LLVM_LIB_EXECUTIONENGINE_INTERPRETER_INTERPRETER_H
 
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
@@ -37,29 +37,24 @@
 // stack, which causes the dtor to be run, which frees all the alloca'd memory.
 //
 class AllocaHolder {
-  friend class AllocaHolderHandle;
-  std::vector<void*> Allocations;
-  unsigned RefCnt;
+  std::vector<void *> Allocations;
+
 public:
-  AllocaHolder() : RefCnt(0) {}
-  void add(void *mem) { Allocations.push_back(mem); }
-  ~AllocaHolder() {
-    for (unsigned i = 0; i < Allocations.size(); ++i)
-      free(Allocations[i]);
+  AllocaHolder() {}
+
+  // Make this type move-only. Define explicit move special members for MSVC.
+  AllocaHolder(AllocaHolder &&RHS) : Allocations(std::move(RHS.Allocations)) {}
+  AllocaHolder &operator=(AllocaHolder &&RHS) {
+    Allocations = std::move(RHS.Allocations);
+    return *this;
   }
-};
 
-// AllocaHolderHandle gives AllocaHolder value semantics so we can stick it into
-// a vector...
-//
-class AllocaHolderHandle {
-  AllocaHolder *H;
-public:
-  AllocaHolderHandle() : H(new AllocaHolder()) { H->RefCnt++; }
-  AllocaHolderHandle(const AllocaHolderHandle &AH) : H(AH.H) { H->RefCnt++; }
-  ~AllocaHolderHandle() { if (--H->RefCnt == 0) delete H; }
+  ~AllocaHolder() {
+    for (void *Allocation : Allocations)
+      free(Allocation);
+  }
 
-  void add(void *mem) { H->add(mem); }
+  void add(void *Mem) { Allocations.push_back(Mem); }
 };
 
 typedef std::vector<GenericValue> ValuePlaneTy;
@@ -71,11 +66,29 @@
   Function             *CurFunction;// The currently executing function
   BasicBlock           *CurBB;      // The currently executing BB
   BasicBlock::iterator  CurInst;    // The next instruction to execute
-  std::map<Value *, GenericValue> Values; // LLVM values used in this invocation
-  std::vector<GenericValue>  VarArgs; // Values passed through an ellipsis
   CallSite             Caller;     // Holds the call that called subframes.
                                    // NULL if main func or debugger invoked fn
-  AllocaHolderHandle    Allocas;    // Track memory allocated by alloca
+  std::map<Value *, GenericValue> Values; // LLVM values used in this invocation
+  std::vector<GenericValue>  VarArgs; // Values passed through an ellipsis
+  AllocaHolder Allocas;            // Track memory allocated by alloca
+
+  ExecutionContext() : CurFunction(nullptr), CurBB(nullptr), CurInst(nullptr) {}
+
+  ExecutionContext(ExecutionContext &&O)
+      : CurFunction(O.CurFunction), CurBB(O.CurBB), CurInst(O.CurInst),
+        Caller(O.Caller), Values(std::move(O.Values)),
+        VarArgs(std::move(O.VarArgs)), Allocas(std::move(O.Allocas)) {}
+
+  ExecutionContext &operator=(ExecutionContext &&O) {
+    CurFunction = O.CurFunction;
+    CurBB = O.CurBB;
+    CurInst = O.CurInst;
+    Caller = O.Caller;
+    Values = std::move(O.Values);
+    VarArgs = std::move(O.VarArgs);
+    Allocas = std::move(O.Allocas);
+    return *this;
+  }
 };
 
 // Interpreter - This class represents the entirety of the interpreter.
@@ -94,7 +107,7 @@
   std::vector<Function*> AtExitHandlers;
 
 public:
-  explicit Interpreter(Module *M);
+  explicit Interpreter(std::unique_ptr<Module> M);
   ~Interpreter();
 
   /// runAtExitHandlers - Run any functions registered by the program's calls to
@@ -105,33 +118,23 @@
   static void Register() {
     InterpCtor = create;
   }
-  
-  /// create - Create an interpreter ExecutionEngine. This can never fail.
+
+  /// Create an interpreter ExecutionEngine.
   ///
-  static ExecutionEngine *create(Module *M, std::string *ErrorStr = nullptr);
+  static ExecutionEngine *create(std::unique_ptr<Module> M,
+                                 std::string *ErrorStr = nullptr);
 
   /// run - Start execution with the specified function and arguments.
   ///
   GenericValue runFunction(Function *F,
                            const std::vector<GenericValue> &ArgValues) override;
 
-  void *getPointerToNamedFunction(const std::string &Name,
+  void *getPointerToNamedFunction(StringRef Name,
                                   bool AbortOnFailure = true) override {
     // FIXME: not implemented.
     return nullptr;
   }
 
-  /// recompileAndRelinkFunction - For the interpreter, functions are always
-  /// up-to-date.
-  ///
-  void *recompileAndRelinkFunction(Function *F) override {
-    return getPointerToFunction(F);
-  }
-
-  /// freeMachineCodeForFunction - The interpreter does not generate any code.
-  ///
-  void freeMachineCodeForFunction(Function *F) override { }
-
   // Methods used to execute code:
   // Place a call on the stack
   void callFunction(Function *F, const std::vector<GenericValue> &ArgVals);
@@ -213,7 +216,6 @@
   void SwitchToNewBasicBlock(BasicBlock *Dest, ExecutionContext &SF);
 
   void *getPointerToFunction(Function *F) override { return (void*)F; }
-  void *getPointerToBasicBlock(BasicBlock *BB) override { return (void*)BB; }
 
   void initializeExecutionEngine() { }
   void initializeExternalFunctions();

diff --git a/lib/ExecutionEngine/JIT/Android.mk b/lib/ExecutionEngine/JIT/Android.mk
deleted file mode 100644
index 0466ba0..0000000
--- a/lib/ExecutionEngine/JIT/Android.mk
+++ /dev/null

@@ -1,17 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-
-# For the host
-# =====================================================
-include $(CLEAR_VARS)
-
-LOCAL_SRC_FILES :=	\
-	JIT.cpp	\
-	JITEmitter.cpp	\
-	JITMemoryManager.cpp
-
-LOCAL_MODULE:= libLLVMJIT
-
-LOCAL_MODULE_TAGS := optional
-
-include $(LLVM_HOST_BUILD_MK)
-include $(BUILD_HOST_STATIC_LIBRARY)

diff --git a/lib/ExecutionEngine/JIT/CMakeLists.txt b/lib/ExecutionEngine/JIT/CMakeLists.txt
deleted file mode 100644
index e16baed..0000000
--- a/lib/ExecutionEngine/JIT/CMakeLists.txt
+++ /dev/null

@@ -1,8 +0,0 @@
-# TODO: Support other architectures. See Makefile.
-add_definitions(-DENABLE_X86_JIT)
-
-add_llvm_library(LLVMJIT
-  JIT.cpp
-  JITEmitter.cpp
-  JITMemoryManager.cpp
-  )

diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
deleted file mode 100644
index 83ec978..0000000
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ /dev/null

@@ -1,695 +0,0 @@
-//===-- JIT.cpp - LLVM Just in Time Compiler ------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tool implements a just-in-time compiler for LLVM, allowing direct
-// execution of LLVM bitcode in an efficient manner.
-//
-//===----------------------------------------------------------------------===//
-
-#include "JIT.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/CodeGen/MachineCodeInfo.h"
-#include "llvm/Config/config.h"
-#include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MutexGuard.h"
-#include "llvm/Target/TargetJITInfo.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-#ifdef __APPLE__
-// Apple gcc defaults to -fuse-cxa-atexit (i.e. calls __cxa_atexit instead
-// of atexit). It passes the address of linker generated symbol __dso_handle
-// to the function.
-// This configuration change happened at version 5330.
-# include <AvailabilityMacros.h>
-# if defined(MAC_OS_X_VERSION_10_4) && \
-     ((MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_4) || \
-      (MAC_OS_X_VERSION_MIN_REQUIRED == MAC_OS_X_VERSION_10_4 && \
-       __APPLE_CC__ >= 5330))
-#  ifndef HAVE___DSO_HANDLE
-#   define HAVE___DSO_HANDLE 1
-#  endif
-# endif
-#endif
-
-#if HAVE___DSO_HANDLE
-extern void *__dso_handle __attribute__ ((__visibility__ ("hidden")));
-#endif
-
-namespace {
-
-static struct RegisterJIT {
-  RegisterJIT() { JIT::Register(); }
-} JITRegistrator;
-
-}
-
-extern "C" void LLVMLinkInJIT() {
-}
-
-/// createJIT - This is the factory method for creating a JIT for the current
-/// machine, it does not fall back to the interpreter.  This takes ownership
-/// of the module.
-ExecutionEngine *JIT::createJIT(Module *M,
-                                std::string *ErrorStr,
-                                JITMemoryManager *JMM,
-                                bool GVsWithCode,
-                                TargetMachine *TM) {
-  // Try to register the program as a source of symbols to resolve against.
-  //
-  // FIXME: Don't do this here.
-  sys::DynamicLibrary::LoadLibraryPermanently(nullptr, nullptr);
-
-  // If the target supports JIT code generation, create the JIT.
-  if (TargetJITInfo *TJ = TM->getJITInfo()) {
-    return new JIT(M, *TM, *TJ, JMM, GVsWithCode);
-  } else {
-    if (ErrorStr)
-      *ErrorStr = "target does not support JIT code generation";
-    return nullptr;
-  }
-}
-
-namespace {
-/// This class supports the global getPointerToNamedFunction(), which allows
-/// bugpoint or gdb users to search for a function by name without any context.
-class JitPool {
-  SmallPtrSet<JIT*, 1> JITs;  // Optimize for process containing just 1 JIT.
-  mutable sys::Mutex Lock;
-public:
-  void Add(JIT *jit) {
-    MutexGuard guard(Lock);
-    JITs.insert(jit);
-  }
-  void Remove(JIT *jit) {
-    MutexGuard guard(Lock);
-    JITs.erase(jit);
-  }
-  void *getPointerToNamedFunction(const char *Name) const {
-    MutexGuard guard(Lock);
-    assert(JITs.size() != 0 && "No Jit registered");
-    //search function in every instance of JIT
-    for (SmallPtrSet<JIT*, 1>::const_iterator Jit = JITs.begin(),
-           end = JITs.end();
-         Jit != end; ++Jit) {
-      if (Function *F = (*Jit)->FindFunctionNamed(Name))
-        return (*Jit)->getPointerToFunction(F);
-    }
-    // The function is not available : fallback on the first created (will
-    // search in symbol of the current program/library)
-    return (*JITs.begin())->getPointerToNamedFunction(Name);
-  }
-};
-ManagedStatic<JitPool> AllJits;
-}
-extern "C" {
-  // getPointerToNamedFunction - This function is used as a global wrapper to
-  // JIT::getPointerToNamedFunction for the purpose of resolving symbols when
-  // bugpoint is debugging the JIT. In that scenario, we are loading an .so and
-  // need to resolve function(s) that are being mis-codegenerated, so we need to
-  // resolve their addresses at runtime, and this is the way to do it.
-  void *getPointerToNamedFunction(const char *Name) {
-    return AllJits->getPointerToNamedFunction(Name);
-  }
-}
-
-JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
-         JITMemoryManager *jmm, bool GVsWithCode)
-  : ExecutionEngine(M), TM(tm), TJI(tji),
-    JMM(jmm ? jmm : JITMemoryManager::CreateDefaultMemManager()),
-    AllocateGVsWithCode(GVsWithCode), isAlreadyCodeGenerating(false) {
-  setDataLayout(TM.getDataLayout());
-
-  jitstate = new JITState(M);
-
-  // Initialize JCE
-  JCE = createEmitter(*this, JMM, TM);
-
-  // Register in global list of all JITs.
-  AllJits->Add(this);
-
-  // Add target data
-  MutexGuard locked(lock);
-  FunctionPassManager &PM = jitstate->getPM();
-  M->setDataLayout(TM.getDataLayout());
-  PM.add(new DataLayoutPass(M));
-
-  // Turn the machine code intermediate representation into bytes in memory that
-  // may be executed.
-  if (TM.addPassesToEmitMachineCode(PM, *JCE, !getVerifyModules())) {
-    report_fatal_error("Target does not support machine code emission!");
-  }
-
-  // Initialize passes.
-  PM.doInitialization();
-}
-
-JIT::~JIT() {
-  // Cleanup.
-  AllJits->Remove(this);
-  delete jitstate;
-  delete JCE;
-  // JMM is a ownership of JCE, so we no need delete JMM here.
-  delete &TM;
-}
-
-/// addModule - Add a new Module to the JIT.  If we previously removed the last
-/// Module, we need re-initialize jitstate with a valid Module.
-void JIT::addModule(Module *M) {
-  MutexGuard locked(lock);
-
-  if (Modules.empty()) {
-    assert(!jitstate && "jitstate should be NULL if Modules vector is empty!");
-
-    jitstate = new JITState(M);
-
-    FunctionPassManager &PM = jitstate->getPM();
-    M->setDataLayout(TM.getDataLayout());
-    PM.add(new DataLayoutPass(M));
-
-    // Turn the machine code intermediate representation into bytes in memory
-    // that may be executed.
-    if (TM.addPassesToEmitMachineCode(PM, *JCE, !getVerifyModules())) {
-      report_fatal_error("Target does not support machine code emission!");
-    }
-
-    // Initialize passes.
-    PM.doInitialization();
-  }
-
-  ExecutionEngine::addModule(M);
-}
-
-/// removeModule - If we are removing the last Module, invalidate the jitstate
-/// since the PassManager it contains references a released Module.
-bool JIT::removeModule(Module *M) {
-  bool result = ExecutionEngine::removeModule(M);
-
-  MutexGuard locked(lock);
-
-  if (jitstate && jitstate->getModule() == M) {
-    delete jitstate;
-    jitstate = nullptr;
-  }
-
-  if (!jitstate && !Modules.empty()) {
-    jitstate = new JITState(Modules[0]);
-
-    FunctionPassManager &PM = jitstate->getPM();
-    M->setDataLayout(TM.getDataLayout());
-    PM.add(new DataLayoutPass(M));
-
-    // Turn the machine code intermediate representation into bytes in memory
-    // that may be executed.
-    if (TM.addPassesToEmitMachineCode(PM, *JCE, !getVerifyModules())) {
-      report_fatal_error("Target does not support machine code emission!");
-    }
-
-    // Initialize passes.
-    PM.doInitialization();
-  }
-  return result;
-}
-
-/// run - Start execution with the specified function and arguments.
-///
-GenericValue JIT::runFunction(Function *F,
-                              const std::vector<GenericValue> &ArgValues) {
-  assert(F && "Function *F was null at entry to run()");
-
-  void *FPtr = getPointerToFunction(F);
-  assert(FPtr && "Pointer to fn's code was null after getPointerToFunction");
-  FunctionType *FTy = F->getFunctionType();
-  Type *RetTy = FTy->getReturnType();
-
-  assert((FTy->getNumParams() == ArgValues.size() ||
-          (FTy->isVarArg() && FTy->getNumParams() <= ArgValues.size())) &&
-         "Wrong number of arguments passed into function!");
-  assert(FTy->getNumParams() == ArgValues.size() &&
-         "This doesn't support passing arguments through varargs (yet)!");
-
-  // Handle some common cases first.  These cases correspond to common `main'
-  // prototypes.
-  if (RetTy->isIntegerTy(32) || RetTy->isVoidTy()) {
-    switch (ArgValues.size()) {
-    case 3:
-      if (FTy->getParamType(0)->isIntegerTy(32) &&
-          FTy->getParamType(1)->isPointerTy() &&
-          FTy->getParamType(2)->isPointerTy()) {
-        int (*PF)(int, char **, const char **) =
-          (int(*)(int, char **, const char **))(intptr_t)FPtr;
-
-        // Call the function.
-        GenericValue rv;
-        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
-                                 (char **)GVTOP(ArgValues[1]),
-                                 (const char **)GVTOP(ArgValues[2])));
-        return rv;
-      }
-      break;
-    case 2:
-      if (FTy->getParamType(0)->isIntegerTy(32) &&
-          FTy->getParamType(1)->isPointerTy()) {
-        int (*PF)(int, char **) = (int(*)(int, char **))(intptr_t)FPtr;
-
-        // Call the function.
-        GenericValue rv;
-        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
-                                 (char **)GVTOP(ArgValues[1])));
-        return rv;
-      }
-      break;
-    case 1:
-      if (FTy->getParamType(0)->isIntegerTy(32)) {
-        GenericValue rv;
-        int (*PF)(int) = (int(*)(int))(intptr_t)FPtr;
-        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue()));
-        return rv;
-      }
-      if (FTy->getParamType(0)->isPointerTy()) {
-        GenericValue rv;
-        int (*PF)(char *) = (int(*)(char *))(intptr_t)FPtr;
-        rv.IntVal = APInt(32, PF((char*)GVTOP(ArgValues[0])));
-        return rv;
-      }
-      break;
-    }
-  }
-
-  // Handle cases where no arguments are passed first.
-  if (ArgValues.empty()) {
-    GenericValue rv;
-    switch (RetTy->getTypeID()) {
-    default: llvm_unreachable("Unknown return type for function call!");
-    case Type::IntegerTyID: {
-      unsigned BitWidth = cast<IntegerType>(RetTy)->getBitWidth();
-      if (BitWidth == 1)
-        rv.IntVal = APInt(BitWidth, ((bool(*)())(intptr_t)FPtr)());
-      else if (BitWidth <= 8)
-        rv.IntVal = APInt(BitWidth, ((char(*)())(intptr_t)FPtr)());
-      else if (BitWidth <= 16)
-        rv.IntVal = APInt(BitWidth, ((short(*)())(intptr_t)FPtr)());
-      else if (BitWidth <= 32)
-        rv.IntVal = APInt(BitWidth, ((int(*)())(intptr_t)FPtr)());
-      else if (BitWidth <= 64)
-        rv.IntVal = APInt(BitWidth, ((int64_t(*)())(intptr_t)FPtr)());
-      else
-        llvm_unreachable("Integer types > 64 bits not supported");
-      return rv;
-    }
-    case Type::VoidTyID:
-      rv.IntVal = APInt(32, ((int(*)())(intptr_t)FPtr)());
-      return rv;
-    case Type::FloatTyID:
-      rv.FloatVal = ((float(*)())(intptr_t)FPtr)();
-      return rv;
-    case Type::DoubleTyID:
-      rv.DoubleVal = ((double(*)())(intptr_t)FPtr)();
-      return rv;
-    case Type::X86_FP80TyID:
-    case Type::FP128TyID:
-    case Type::PPC_FP128TyID:
-      llvm_unreachable("long double not supported yet");
-    case Type::PointerTyID:
-      return PTOGV(((void*(*)())(intptr_t)FPtr)());
-    }
-  }
-
-  // Okay, this is not one of our quick and easy cases.  Because we don't have a
-  // full FFI, we have to codegen a nullary stub function that just calls the
-  // function we are interested in, passing in constants for all of the
-  // arguments.  Make this function and return.
-
-  // First, create the function.
-  FunctionType *STy=FunctionType::get(RetTy, false);
-  Function *Stub = Function::Create(STy, Function::InternalLinkage, "",
-                                    F->getParent());
-
-  // Insert a basic block.
-  BasicBlock *StubBB = BasicBlock::Create(F->getContext(), "", Stub);
-
-  // Convert all of the GenericValue arguments over to constants.  Note that we
-  // currently don't support varargs.
-  SmallVector<Value*, 8> Args;
-  for (unsigned i = 0, e = ArgValues.size(); i != e; ++i) {
-    Constant *C = nullptr;
-    Type *ArgTy = FTy->getParamType(i);
-    const GenericValue &AV = ArgValues[i];
-    switch (ArgTy->getTypeID()) {
-    default: llvm_unreachable("Unknown argument type for function call!");
-    case Type::IntegerTyID:
-        C = ConstantInt::get(F->getContext(), AV.IntVal);
-        break;
-    case Type::FloatTyID:
-        C = ConstantFP::get(F->getContext(), APFloat(AV.FloatVal));
-        break;
-    case Type::DoubleTyID:
-        C = ConstantFP::get(F->getContext(), APFloat(AV.DoubleVal));
-        break;
-    case Type::PPC_FP128TyID:
-    case Type::X86_FP80TyID:
-    case Type::FP128TyID:
-        C = ConstantFP::get(F->getContext(), APFloat(ArgTy->getFltSemantics(),
-                                                     AV.IntVal));
-        break;
-    case Type::PointerTyID:
-      void *ArgPtr = GVTOP(AV);
-      if (sizeof(void*) == 4)
-        C = ConstantInt::get(Type::getInt32Ty(F->getContext()),
-                             (int)(intptr_t)ArgPtr);
-      else
-        C = ConstantInt::get(Type::getInt64Ty(F->getContext()),
-                             (intptr_t)ArgPtr);
-      // Cast the integer to pointer
-      C = ConstantExpr::getIntToPtr(C, ArgTy);
-      break;
-    }
-    Args.push_back(C);
-  }
-
-  CallInst *TheCall = CallInst::Create(F, Args, "", StubBB);
-  TheCall->setCallingConv(F->getCallingConv());
-  TheCall->setTailCall();
-  if (!TheCall->getType()->isVoidTy())
-    // Return result of the call.
-    ReturnInst::Create(F->getContext(), TheCall, StubBB);
-  else
-    ReturnInst::Create(F->getContext(), StubBB);           // Just return void.
-
-  // Finally, call our nullary stub function.
-  GenericValue Result = runFunction(Stub, std::vector<GenericValue>());
-  // Erase it, since no other function can have a reference to it.
-  Stub->eraseFromParent();
-  // And return the result.
-  return Result;
-}
-
-void JIT::RegisterJITEventListener(JITEventListener *L) {
-  if (!L)
-    return;
-  MutexGuard locked(lock);
-  EventListeners.push_back(L);
-}
-void JIT::UnregisterJITEventListener(JITEventListener *L) {
-  if (!L)
-    return;
-  MutexGuard locked(lock);
-  std::vector<JITEventListener*>::reverse_iterator I=
-      std::find(EventListeners.rbegin(), EventListeners.rend(), L);
-  if (I != EventListeners.rend()) {
-    std::swap(*I, EventListeners.back());
-    EventListeners.pop_back();
-  }
-}
-void JIT::NotifyFunctionEmitted(
-    const Function &F,
-    void *Code, size_t Size,
-    const JITEvent_EmittedFunctionDetails &Details) {
-  MutexGuard locked(lock);
-  for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
-    EventListeners[I]->NotifyFunctionEmitted(F, Code, Size, Details);
-  }
-}
-
-void JIT::NotifyFreeingMachineCode(void *OldPtr) {
-  MutexGuard locked(lock);
-  for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
-    EventListeners[I]->NotifyFreeingMachineCode(OldPtr);
-  }
-}
-
-/// runJITOnFunction - Run the FunctionPassManager full of
-/// just-in-time compilation passes on F, hopefully filling in
-/// GlobalAddress[F] with the address of F's machine code.
-///
-void JIT::runJITOnFunction(Function *F, MachineCodeInfo *MCI) {
-  MutexGuard locked(lock);
-
-  class MCIListener : public JITEventListener {
-    MachineCodeInfo *const MCI;
-   public:
-    MCIListener(MachineCodeInfo *mci) : MCI(mci) {}
-    void NotifyFunctionEmitted(const Function &, void *Code, size_t Size,
-                               const EmittedFunctionDetails &) override {
-      MCI->setAddress(Code);
-      MCI->setSize(Size);
-    }
-  };
-  MCIListener MCIL(MCI);
-  if (MCI)
-    RegisterJITEventListener(&MCIL);
-
-  runJITOnFunctionUnlocked(F);
-
-  if (MCI)
-    UnregisterJITEventListener(&MCIL);
-}
-
-void JIT::runJITOnFunctionUnlocked(Function *F) {
-  assert(!isAlreadyCodeGenerating && "Error: Recursive compilation detected!");
-
-  jitTheFunctionUnlocked(F);
-
-  // If the function referred to another function that had not yet been
-  // read from bitcode, and we are jitting non-lazily, emit it now.
-  while (!jitstate->getPendingFunctions().empty()) {
-    Function *PF = jitstate->getPendingFunctions().back();
-    jitstate->getPendingFunctions().pop_back();
-
-    assert(!PF->hasAvailableExternallyLinkage() &&
-           "Externally-defined function should not be in pending list.");
-
-    jitTheFunctionUnlocked(PF);
-
-    // Now that the function has been jitted, ask the JITEmitter to rewrite
-    // the stub with real address of the function.
-    updateFunctionStubUnlocked(PF);
-  }
-}
-
-void JIT::jitTheFunctionUnlocked(Function *F) {
-  isAlreadyCodeGenerating = true;
-  jitstate->getPM().run(*F);
-  isAlreadyCodeGenerating = false;
-
-  // clear basic block addresses after this function is done
-  getBasicBlockAddressMap().clear();
-}
-
-/// getPointerToFunction - This method is used to get the address of the
-/// specified function, compiling it if necessary.
-///
-void *JIT::getPointerToFunction(Function *F) {
-
-  if (void *Addr = getPointerToGlobalIfAvailable(F))
-    return Addr;   // Check if function already code gen'd
-
-  MutexGuard locked(lock);
-
-  // Now that this thread owns the lock, make sure we read in the function if it
-  // exists in this Module.
-  std::string ErrorMsg;
-  if (F->Materialize(&ErrorMsg)) {
-    report_fatal_error("Error reading function '" + F->getName()+
-                      "' from bitcode file: " + ErrorMsg);
-  }
-
-  // ... and check if another thread has already code gen'd the function.
-  if (void *Addr = getPointerToGlobalIfAvailable(F))
-    return Addr;
-
-  if (F->isDeclaration() || F->hasAvailableExternallyLinkage()) {
-    bool AbortOnFailure = !F->hasExternalWeakLinkage();
-    void *Addr = getPointerToNamedFunction(F->getName(), AbortOnFailure);
-    addGlobalMapping(F, Addr);
-    return Addr;
-  }
-
-  runJITOnFunctionUnlocked(F);
-
-  void *Addr = getPointerToGlobalIfAvailable(F);
-  assert(Addr && "Code generation didn't add function to GlobalAddress table!");
-  return Addr;
-}
-
-void JIT::addPointerToBasicBlock(const BasicBlock *BB, void *Addr) {
-  MutexGuard locked(lock);
-
-  BasicBlockAddressMapTy::iterator I =
-    getBasicBlockAddressMap().find(BB);
-  if (I == getBasicBlockAddressMap().end()) {
-    getBasicBlockAddressMap()[BB] = Addr;
-  } else {
-    // ignore repeats: some BBs can be split into few MBBs?
-  }
-}
-
-void JIT::clearPointerToBasicBlock(const BasicBlock *BB) {
-  MutexGuard locked(lock);
-  getBasicBlockAddressMap().erase(BB);
-}
-
-void *JIT::getPointerToBasicBlock(BasicBlock *BB) {
-  // make sure it's function is compiled by JIT
-  (void)getPointerToFunction(BB->getParent());
-
-  // resolve basic block address
-  MutexGuard locked(lock);
-
-  BasicBlockAddressMapTy::iterator I =
-    getBasicBlockAddressMap().find(BB);
-  if (I != getBasicBlockAddressMap().end()) {
-    return I->second;
-  } else {
-    llvm_unreachable("JIT does not have BB address for address-of-label, was"
-                     " it eliminated by optimizer?");
-  }
-}
-
-void *JIT::getPointerToNamedFunction(const std::string &Name,
-                                     bool AbortOnFailure){
-  if (!isSymbolSearchingDisabled()) {
-    void *ptr = JMM->getPointerToNamedFunction(Name, false);
-    if (ptr)
-      return ptr;
-  }
-
-  /// If a LazyFunctionCreator is installed, use it to get/create the function.
-  if (LazyFunctionCreator)
-    if (void *RP = LazyFunctionCreator(Name))
-      return RP;
-
-  if (AbortOnFailure) {
-    report_fatal_error("Program used external function '"+Name+
-                      "' which could not be resolved!");
-  }
-  return nullptr;
-}
-
-
-/// getOrEmitGlobalVariable - Return the address of the specified global
-/// variable, possibly emitting it to memory if needed.  This is used by the
-/// Emitter.
-void *JIT::getOrEmitGlobalVariable(const GlobalVariable *GV) {
-  MutexGuard locked(lock);
-
-  void *Ptr = getPointerToGlobalIfAvailable(GV);
-  if (Ptr) return Ptr;
-
-  // If the global is external, just remember the address.
-  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) {
-#if HAVE___DSO_HANDLE
-    if (GV->getName() == "__dso_handle")
-      return (void*)&__dso_handle;
-#endif
-    Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(GV->getName());
-    if (!Ptr) {
-      report_fatal_error("Could not resolve external global address: "
-                        +GV->getName());
-    }
-    addGlobalMapping(GV, Ptr);
-  } else {
-    // If the global hasn't been emitted to memory yet, allocate space and
-    // emit it into memory.
-    Ptr = getMemoryForGV(GV);
-    addGlobalMapping(GV, Ptr);
-    EmitGlobalVariable(GV);  // Initialize the variable.
-  }
-  return Ptr;
-}
-
-/// recompileAndRelinkFunction - This method is used to force a function
-/// which has already been compiled, to be compiled again, possibly
-/// after it has been modified. Then the entry to the old copy is overwritten
-/// with a branch to the new copy. If there was no old copy, this acts
-/// just like JIT::getPointerToFunction().
-///
-void *JIT::recompileAndRelinkFunction(Function *F) {
-  void *OldAddr = getPointerToGlobalIfAvailable(F);
-
-  // If it's not already compiled there is no reason to patch it up.
-  if (!OldAddr) return getPointerToFunction(F);
-
-  // Delete the old function mapping.
-  addGlobalMapping(F, nullptr);
-
-  // Recodegen the function
-  runJITOnFunction(F);
-
-  // Update state, forward the old function to the new function.
-  void *Addr = getPointerToGlobalIfAvailable(F);
-  assert(Addr && "Code generation didn't add function to GlobalAddress table!");
-  TJI.replaceMachineCodeForFunction(OldAddr, Addr);
-  return Addr;
-}
-
-/// getMemoryForGV - This method abstracts memory allocation of global
-/// variable so that the JIT can allocate thread local variables depending
-/// on the target.
-///
-char* JIT::getMemoryForGV(const GlobalVariable* GV) {
-  char *Ptr;
-
-  // GlobalVariable's which are not "constant" will cause trouble in a server
-  // situation. It's returned in the same block of memory as code which may
-  // not be writable.
-  if (isGVCompilationDisabled() && !GV->isConstant()) {
-    report_fatal_error("Compilation of non-internal GlobalValue is disabled!");
-  }
-
-  // Some applications require globals and code to live together, so they may
-  // be allocated into the same buffer, but in general globals are allocated
-  // through the memory manager which puts them near the code but not in the
-  // same buffer.
-  Type *GlobalType = GV->getType()->getElementType();
-  size_t S = getDataLayout()->getTypeAllocSize(GlobalType);
-  size_t A = getDataLayout()->getPreferredAlignment(GV);
-  if (GV->isThreadLocal()) {
-    MutexGuard locked(lock);
-    Ptr = TJI.allocateThreadLocalMemory(S);
-  } else if (TJI.allocateSeparateGVMemory()) {
-    if (A <= 8) {
-      Ptr = (char*)malloc(S);
-    } else {
-      // Allocate S+A bytes of memory, then use an aligned pointer within that
-      // space.
-      Ptr = (char*)malloc(S+A);
-      unsigned MisAligned = ((intptr_t)Ptr & (A-1));
-      Ptr = Ptr + (MisAligned ? (A-MisAligned) : 0);
-    }
-  } else if (AllocateGVsWithCode) {
-    Ptr = (char*)JCE->allocateSpace(S, A);
-  } else {
-    Ptr = (char*)JCE->allocateGlobal(S, A);
-  }
-  return Ptr;
-}
-
-void JIT::addPendingFunction(Function *F) {
-  MutexGuard locked(lock);
-  jitstate->getPendingFunctions().push_back(F);
-}
-
-
-JITEventListener::~JITEventListener() {}

diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h
deleted file mode 100644
index 69a7c36..0000000
--- a/lib/ExecutionEngine/JIT/JIT.h
+++ /dev/null

@@ -1,229 +0,0 @@
-//===-- JIT.h - Class definition for the JIT --------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the top-level JIT data structure.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef JIT_H
-#define JIT_H
-
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/PassManager.h"
-
-namespace llvm {
-
-class Function;
-struct JITEvent_EmittedFunctionDetails;
-class MachineCodeEmitter;
-class MachineCodeInfo;
-class TargetJITInfo;
-class TargetMachine;
-
-class JITState {
-private:
-  FunctionPassManager PM;  // Passes to compile a function
-  Module *M;               // Module used to create the PM
-
-  /// PendingFunctions - Functions which have not been code generated yet, but
-  /// were called from a function being code generated.
-  std::vector<AssertingVH<Function> > PendingFunctions;
-
-public:
-  explicit JITState(Module *M) : PM(M), M(M) {}
-
-  FunctionPassManager &getPM() {
-    return PM;
-  }
-
-  Module *getModule() const { return M; }
-  std::vector<AssertingVH<Function> > &getPendingFunctions() {
-    return PendingFunctions;
-  }
-};
-
-
-class JIT : public ExecutionEngine {
-  /// types
-  typedef ValueMap<const BasicBlock *, void *>
-      BasicBlockAddressMapTy;
-  /// data
-  TargetMachine &TM;       // The current target we are compiling to
-  TargetJITInfo &TJI;      // The JITInfo for the target we are compiling to
-  JITCodeEmitter *JCE;     // JCE object
-  JITMemoryManager *JMM;
-  std::vector<JITEventListener*> EventListeners;
-
-  /// AllocateGVsWithCode - Some applications require that global variables and
-  /// code be allocated into the same region of memory, in which case this flag
-  /// should be set to true.  Doing so breaks freeMachineCodeForFunction.
-  bool AllocateGVsWithCode;
-
-  /// True while the JIT is generating code.  Used to assert against recursive
-  /// entry.
-  bool isAlreadyCodeGenerating;
-
-  JITState *jitstate;
-
-  /// BasicBlockAddressMap - A mapping between LLVM basic blocks and their
-  /// actualized version, only filled for basic blocks that have their address
-  /// taken.
-  BasicBlockAddressMapTy BasicBlockAddressMap;
-
-
-  JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
-      JITMemoryManager *JMM, bool AllocateGVsWithCode);
-public:
-  ~JIT();
-
-  static void Register() {
-    JITCtor = createJIT;
-  }
-
-  /// getJITInfo - Return the target JIT information structure.
-  ///
-  TargetJITInfo &getJITInfo() const { return TJI; }
-
-  /// create - Create an return a new JIT compiler if there is one available
-  /// for the current target.  Otherwise, return null.
-  ///
-  static ExecutionEngine *create(Module *M,
-                                 std::string *Err,
-                                 JITMemoryManager *JMM,
-                                 CodeGenOpt::Level OptLevel =
-                                   CodeGenOpt::Default,
-                                 bool GVsWithCode = true,
-                                 Reloc::Model RM = Reloc::Default,
-                                 CodeModel::Model CMM = CodeModel::JITDefault) {
-    return ExecutionEngine::createJIT(M, Err, JMM, OptLevel, GVsWithCode,
-                                      RM, CMM);
-  }
-
-  void addModule(Module *M) override;
-
-  /// removeModule - Remove a Module from the list of modules.  Returns true if
-  /// M is found.
-  bool removeModule(Module *M) override;
-
-  /// runFunction - Start execution with the specified function and arguments.
-  ///
-  GenericValue runFunction(Function *F,
-                           const std::vector<GenericValue> &ArgValues) override;
-
-  /// getPointerToNamedFunction - This method returns the address of the
-  /// specified function by using the MemoryManager. As such it is only
-  /// useful for resolving library symbols, not code generated symbols.
-  ///
-  /// If AbortOnFailure is false and no function with the given name is
-  /// found, this function silently returns a null pointer. Otherwise,
-  /// it prints a message to stderr and aborts.
-  ///
-  void *getPointerToNamedFunction(const std::string &Name,
-                                  bool AbortOnFailure = true) override;
-
-  // CompilationCallback - Invoked the first time that a call site is found,
-  // which causes lazy compilation of the target function.
-  //
-  static void CompilationCallback();
-
-  /// getPointerToFunction - This returns the address of the specified function,
-  /// compiling it if necessary.
-  ///
-  void *getPointerToFunction(Function *F) override;
-
-  /// addPointerToBasicBlock - Adds address of the specific basic block.
-  void addPointerToBasicBlock(const BasicBlock *BB, void *Addr);
-
-  /// clearPointerToBasicBlock - Removes address of specific basic block.
-  void clearPointerToBasicBlock(const BasicBlock *BB);
-
-  /// getPointerToBasicBlock - This returns the address of the specified basic
-  /// block, assuming function is compiled.
-  void *getPointerToBasicBlock(BasicBlock *BB) override;
-
-  /// getOrEmitGlobalVariable - Return the address of the specified global
-  /// variable, possibly emitting it to memory if needed.  This is used by the
-  /// Emitter.
-  void *getOrEmitGlobalVariable(const GlobalVariable *GV) override;
-
-  /// getPointerToFunctionOrStub - If the specified function has been
-  /// code-gen'd, return a pointer to the function.  If not, compile it, or use
-  /// a stub to implement lazy compilation if available.
-  ///
-  void *getPointerToFunctionOrStub(Function *F) override;
-
-  /// recompileAndRelinkFunction - This method is used to force a function
-  /// which has already been compiled, to be compiled again, possibly
-  /// after it has been modified. Then the entry to the old copy is overwritten
-  /// with a branch to the new copy. If there was no old copy, this acts
-  /// just like JIT::getPointerToFunction().
-  ///
-  void *recompileAndRelinkFunction(Function *F) override;
-
-  /// freeMachineCodeForFunction - deallocate memory used to code-generate this
-  /// Function.
-  ///
-  void freeMachineCodeForFunction(Function *F) override;
-
-  /// addPendingFunction - while jitting non-lazily, a called but non-codegen'd
-  /// function was encountered.  Add it to a pending list to be processed after
-  /// the current function.
-  ///
-  void addPendingFunction(Function *F);
-
-  /// getCodeEmitter - Return the code emitter this JIT is emitting into.
-  ///
-  JITCodeEmitter *getCodeEmitter() const { return JCE; }
-
-  static ExecutionEngine *createJIT(Module *M,
-                                    std::string *ErrorStr,
-                                    JITMemoryManager *JMM,
-                                    bool GVsWithCode,
-                                    TargetMachine *TM);
-
-  // Run the JIT on F and return information about the generated code
-  void runJITOnFunction(Function *F, MachineCodeInfo *MCI = nullptr) override;
-
-  void RegisterJITEventListener(JITEventListener *L) override;
-  void UnregisterJITEventListener(JITEventListener *L) override;
-
-  TargetMachine *getTargetMachine() override { return &TM; }
-
-  /// These functions correspond to the methods on JITEventListener.  They
-  /// iterate over the registered listeners and call the corresponding method on
-  /// each.
-  void NotifyFunctionEmitted(
-      const Function &F, void *Code, size_t Size,
-      const JITEvent_EmittedFunctionDetails &Details);
-  void NotifyFreeingMachineCode(void *OldPtr);
-
-  BasicBlockAddressMapTy &
-  getBasicBlockAddressMap() {
-    return BasicBlockAddressMap;
-  }
-
-
-private:
-  static JITCodeEmitter *createEmitter(JIT &J, JITMemoryManager *JMM,
-                                       TargetMachine &tm);
-  void runJITOnFunctionUnlocked(Function *F);
-  void updateFunctionStubUnlocked(Function *F);
-  void jitTheFunctionUnlocked(Function *F);
-
-protected:
-
-  /// getMemoryforGV - Allocate memory for a global variable.
-  char* getMemoryForGV(const GlobalVariable* GV) override;
-
-};
-
-} // End llvm namespace
-
-#endif

diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
deleted file mode 100644
index 50b8c10..0000000
--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
+++ /dev/null

@@ -1,1256 +0,0 @@
-//===-- JITEmitter.cpp - Write machine code to executable memory ----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a MachineCodeEmitter object that is used by the JIT to
-// write machine code to memory and remember where relocatable values are.
-//
-//===----------------------------------------------------------------------===//
-
-#include "JIT.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/CodeGen/MachineCodeInfo.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRelocation.h"
-#include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Disassembler.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Memory.h"
-#include "llvm/Support/MutexGuard.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetJITInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include <algorithm>
-#ifndef NDEBUG
-#include <iomanip>
-#endif
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-STATISTIC(NumBytes, "Number of bytes of machine code compiled");
-STATISTIC(NumRelos, "Number of relocations applied");
-STATISTIC(NumRetries, "Number of retries with more memory");
-
-
-// A declaration may stop being a declaration once it's fully read from bitcode.
-// This function returns true if F is fully read and is still a declaration.
-static bool isNonGhostDeclaration(const Function *F) {
-  return F->isDeclaration() && !F->isMaterializable();
-}
-
-//===----------------------------------------------------------------------===//
-// JIT lazy compilation code.
-//
-namespace {
-  class JITEmitter;
-  class JITResolverState;
-
-  template<typename ValueTy>
-  struct NoRAUWValueMapConfig : public ValueMapConfig<ValueTy> {
-    typedef JITResolverState *ExtraData;
-    static void onRAUW(JITResolverState *, Value *Old, Value *New) {
-      llvm_unreachable("The JIT doesn't know how to handle a"
-                       " RAUW on a value it has emitted.");
-    }
-  };
-
-  struct CallSiteValueMapConfig : public NoRAUWValueMapConfig<Function*> {
-    typedef JITResolverState *ExtraData;
-    static void onDelete(JITResolverState *JRS, Function *F);
-  };
-
-  class JITResolverState {
-  public:
-    typedef ValueMap<Function*, void*, NoRAUWValueMapConfig<Function*> >
-      FunctionToLazyStubMapTy;
-    typedef std::map<void*, AssertingVH<Function> > CallSiteToFunctionMapTy;
-    typedef ValueMap<Function *, SmallPtrSet<void*, 1>,
-                     CallSiteValueMapConfig> FunctionToCallSitesMapTy;
-    typedef std::map<AssertingVH<GlobalValue>, void*> GlobalToIndirectSymMapTy;
-  private:
-    /// FunctionToLazyStubMap - Keep track of the lazy stub created for a
-    /// particular function so that we can reuse them if necessary.
-    FunctionToLazyStubMapTy FunctionToLazyStubMap;
-
-    /// CallSiteToFunctionMap - Keep track of the function that each lazy call
-    /// site corresponds to, and vice versa.
-    CallSiteToFunctionMapTy CallSiteToFunctionMap;
-    FunctionToCallSitesMapTy FunctionToCallSitesMap;
-
-    /// GlobalToIndirectSymMap - Keep track of the indirect symbol created for a
-    /// particular GlobalVariable so that we can reuse them if necessary.
-    GlobalToIndirectSymMapTy GlobalToIndirectSymMap;
-
-#ifndef NDEBUG
-    /// Instance of the JIT this ResolverState serves.
-    JIT *TheJIT;
-#endif
-
-  public:
-    JITResolverState(JIT *jit) : FunctionToLazyStubMap(this),
-                                 FunctionToCallSitesMap(this) {
-#ifndef NDEBUG
-      TheJIT = jit;
-#endif
-    }
-
-    FunctionToLazyStubMapTy& getFunctionToLazyStubMap() {
-      return FunctionToLazyStubMap;
-    }
-
-    GlobalToIndirectSymMapTy& getGlobalToIndirectSymMap() {
-      return GlobalToIndirectSymMap;
-    }
-
-    std::pair<void *, Function *> LookupFunctionFromCallSite(
-        void *CallSite) const {
-      // The address given to us for the stub may not be exactly right, it
-      // might be a little bit after the stub.  As such, use upper_bound to
-      // find it.
-      CallSiteToFunctionMapTy::const_iterator I =
-        CallSiteToFunctionMap.upper_bound(CallSite);
-      assert(I != CallSiteToFunctionMap.begin() &&
-             "This is not a known call site!");
-      --I;
-      return *I;
-    }
-
-    void AddCallSite(void *CallSite, Function *F) {
-      bool Inserted = CallSiteToFunctionMap.insert(
-          std::make_pair(CallSite, F)).second;
-      (void)Inserted;
-      assert(Inserted && "Pair was already in CallSiteToFunctionMap");
-      FunctionToCallSitesMap[F].insert(CallSite);
-    }
-
-    void EraseAllCallSitesForPrelocked(Function *F);
-
-    // Erases _all_ call sites regardless of their function.  This is used to
-    // unregister the stub addresses from the StubToResolverMap in
-    // ~JITResolver().
-    void EraseAllCallSitesPrelocked();
-  };
-
-  /// JITResolver - Keep track of, and resolve, call sites for functions that
-  /// have not yet been compiled.
-  class JITResolver {
-    typedef JITResolverState::FunctionToLazyStubMapTy FunctionToLazyStubMapTy;
-    typedef JITResolverState::CallSiteToFunctionMapTy CallSiteToFunctionMapTy;
-    typedef JITResolverState::GlobalToIndirectSymMapTy GlobalToIndirectSymMapTy;
-
-    /// LazyResolverFn - The target lazy resolver function that we actually
-    /// rewrite instructions to use.
-    TargetJITInfo::LazyResolverFn LazyResolverFn;
-
-    JITResolverState state;
-
-    /// ExternalFnToStubMap - This is the equivalent of FunctionToLazyStubMap
-    /// for external functions.  TODO: Of course, external functions don't need
-    /// a lazy stub.  It's actually here to make it more likely that far calls
-    /// succeed, but no single stub can guarantee that.  I'll remove this in a
-    /// subsequent checkin when I actually fix far calls.
-    std::map<void*, void*> ExternalFnToStubMap;
-
-    /// revGOTMap - map addresses to indexes in the GOT
-    std::map<void*, unsigned> revGOTMap;
-    unsigned nextGOTIndex;
-
-    JITEmitter &JE;
-
-    /// Instance of JIT corresponding to this Resolver.
-    JIT *TheJIT;
-
-  public:
-    explicit JITResolver(JIT &jit, JITEmitter &je)
-      : state(&jit), nextGOTIndex(0), JE(je), TheJIT(&jit) {
-      LazyResolverFn = jit.getJITInfo().getLazyResolverFunction(JITCompilerFn);
-    }
-
-    ~JITResolver();
-
-    /// getLazyFunctionStubIfAvailable - This returns a pointer to a function's
-    /// lazy-compilation stub if it has already been created.
-    void *getLazyFunctionStubIfAvailable(Function *F);
-
-    /// getLazyFunctionStub - This returns a pointer to a function's
-    /// lazy-compilation stub, creating one on demand as needed.
-    void *getLazyFunctionStub(Function *F);
-
-    /// getExternalFunctionStub - Return a stub for the function at the
-    /// specified address, created lazily on demand.
-    void *getExternalFunctionStub(void *FnAddr);
-
-    /// getGlobalValueIndirectSym - Return an indirect symbol containing the
-    /// specified GV address.
-    void *getGlobalValueIndirectSym(GlobalValue *V, void *GVAddress);
-
-    /// getGOTIndexForAddress - Return a new or existing index in the GOT for
-    /// an address.  This function only manages slots, it does not manage the
-    /// contents of the slots or the memory associated with the GOT.
-    unsigned getGOTIndexForAddr(void *addr);
-
-    /// JITCompilerFn - This function is called to resolve a stub to a compiled
-    /// address.  If the LLVM Function corresponding to the stub has not yet
-    /// been compiled, this function compiles it first.
-    static void *JITCompilerFn(void *Stub);
-  };
-
-  class StubToResolverMapTy {
-    /// Map a stub address to a specific instance of a JITResolver so that
-    /// lazily-compiled functions can find the right resolver to use.
-    ///
-    /// Guarded by Lock.
-    std::map<void*, JITResolver*> Map;
-
-    /// Guards Map from concurrent accesses.
-    mutable sys::Mutex Lock;
-
-  public:
-    /// Registers a Stub to be resolved by Resolver.
-    void RegisterStubResolver(void *Stub, JITResolver *Resolver) {
-      MutexGuard guard(Lock);
-      Map.insert(std::make_pair(Stub, Resolver));
-    }
-    /// Unregisters the Stub when it's invalidated.
-    void UnregisterStubResolver(void *Stub) {
-      MutexGuard guard(Lock);
-      Map.erase(Stub);
-    }
-    /// Returns the JITResolver instance that owns the Stub.
-    JITResolver *getResolverFromStub(void *Stub) const {
-      MutexGuard guard(Lock);
-      // The address given to us for the stub may not be exactly right, it might
-      // be a little bit after the stub.  As such, use upper_bound to find it.
-      // This is the same trick as in LookupFunctionFromCallSite from
-      // JITResolverState.
-      std::map<void*, JITResolver*>::const_iterator I = Map.upper_bound(Stub);
-      assert(I != Map.begin() && "This is not a known stub!");
-      --I;
-      return I->second;
-    }
-    /// True if any stubs refer to the given resolver. Only used in an assert().
-    /// O(N)
-    bool ResolverHasStubs(JITResolver* Resolver) const {
-      MutexGuard guard(Lock);
-      for (std::map<void*, JITResolver*>::const_iterator I = Map.begin(),
-             E = Map.end(); I != E; ++I) {
-        if (I->second == Resolver)
-          return true;
-      }
-      return false;
-    }
-  };
-  /// This needs to be static so that a lazy call stub can access it with no
-  /// context except the address of the stub.
-  ManagedStatic<StubToResolverMapTy> StubToResolverMap;
-
-  /// JITEmitter - The JIT implementation of the MachineCodeEmitter, which is
-  /// used to output functions to memory for execution.
-  class JITEmitter : public JITCodeEmitter {
-    JITMemoryManager *MemMgr;
-
-    // When outputting a function stub in the context of some other function, we
-    // save BufferBegin/BufferEnd/CurBufferPtr here.
-    uint8_t *SavedBufferBegin, *SavedBufferEnd, *SavedCurBufferPtr;
-
-    // When reattempting to JIT a function after running out of space, we store
-    // the estimated size of the function we're trying to JIT here, so we can
-    // ask the memory manager for at least this much space.  When we
-    // successfully emit the function, we reset this back to zero.
-    uintptr_t SizeEstimate;
-
-    /// Relocations - These are the relocations that the function needs, as
-    /// emitted.
-    std::vector<MachineRelocation> Relocations;
-
-    /// MBBLocations - This vector is a mapping from MBB ID's to their address.
-    /// It is filled in by the StartMachineBasicBlock callback and queried by
-    /// the getMachineBasicBlockAddress callback.
-    std::vector<uintptr_t> MBBLocations;
-
-    /// ConstantPool - The constant pool for the current function.
-    ///
-    MachineConstantPool *ConstantPool;
-
-    /// ConstantPoolBase - A pointer to the first entry in the constant pool.
-    ///
-    void *ConstantPoolBase;
-
-    /// ConstPoolAddresses - Addresses of individual constant pool entries.
-    ///
-    SmallVector<uintptr_t, 8> ConstPoolAddresses;
-
-    /// JumpTable - The jump tables for the current function.
-    ///
-    MachineJumpTableInfo *JumpTable;
-
-    /// JumpTableBase - A pointer to the first entry in the jump table.
-    ///
-    void *JumpTableBase;
-
-    /// Resolver - This contains info about the currently resolved functions.
-    JITResolver Resolver;
-
-    /// LabelLocations - This vector is a mapping from Label ID's to their
-    /// address.
-    DenseMap<MCSymbol*, uintptr_t> LabelLocations;
-
-    /// MMI - Machine module info for exception informations
-    MachineModuleInfo* MMI;
-
-    // CurFn - The llvm function being emitted.  Only valid during
-    // finishFunction().
-    const Function *CurFn;
-
-    /// Information about emitted code, which is passed to the
-    /// JITEventListeners.  This is reset in startFunction and used in
-    /// finishFunction.
-    JITEvent_EmittedFunctionDetails EmissionDetails;
-
-    struct EmittedCode {
-      void *FunctionBody;  // Beginning of the function's allocation.
-      void *Code;  // The address the function's code actually starts at.
-      void *ExceptionTable;
-      EmittedCode() : FunctionBody(nullptr), Code(nullptr),
-                      ExceptionTable(nullptr) {}
-    };
-    struct EmittedFunctionConfig : public ValueMapConfig<const Function*> {
-      typedef JITEmitter *ExtraData;
-      static void onDelete(JITEmitter *, const Function*);
-      static void onRAUW(JITEmitter *, const Function*, const Function*);
-    };
-    ValueMap<const Function *, EmittedCode,
-             EmittedFunctionConfig> EmittedFunctions;
-
-    DebugLoc PrevDL;
-
-    /// Instance of the JIT
-    JIT *TheJIT;
-
-  public:
-    JITEmitter(JIT &jit, JITMemoryManager *JMM, TargetMachine &TM)
-      : SizeEstimate(0), Resolver(jit, *this), MMI(nullptr), CurFn(nullptr),
-        EmittedFunctions(this), TheJIT(&jit) {
-      MemMgr = JMM ? JMM : JITMemoryManager::CreateDefaultMemManager();
-      if (jit.getJITInfo().needsGOT()) {
-        MemMgr->AllocateGOT();
-        DEBUG(dbgs() << "JIT is managing a GOT\n");
-      }
-
-    }
-    ~JITEmitter() {
-      delete MemMgr;
-    }
-
-    JITResolver &getJITResolver() { return Resolver; }
-
-    void startFunction(MachineFunction &F) override;
-    bool finishFunction(MachineFunction &F) override;
-
-    void emitConstantPool(MachineConstantPool *MCP);
-    void initJumpTableInfo(MachineJumpTableInfo *MJTI);
-    void emitJumpTableInfo(MachineJumpTableInfo *MJTI);
-
-    void startGVStub(const GlobalValue* GV,
-                     unsigned StubSize, unsigned Alignment = 1);
-    void startGVStub(void *Buffer, unsigned StubSize);
-    void finishGVStub();
-    void *allocIndirectGV(const GlobalValue *GV, const uint8_t *Buffer,
-                          size_t Size, unsigned Alignment) override;
-
-    /// allocateSpace - Reserves space in the current block if any, or
-    /// allocate a new one of the given size.
-    void *allocateSpace(uintptr_t Size, unsigned Alignment) override;
-
-    /// allocateGlobal - Allocate memory for a global.  Unlike allocateSpace,
-    /// this method does not allocate memory in the current output buffer,
-    /// because a global may live longer than the current function.
-    void *allocateGlobal(uintptr_t Size, unsigned Alignment) override;
-
-    void addRelocation(const MachineRelocation &MR) override {
-      Relocations.push_back(MR);
-    }
-
-    void StartMachineBasicBlock(MachineBasicBlock *MBB) override {
-      if (MBBLocations.size() <= (unsigned)MBB->getNumber())
-        MBBLocations.resize((MBB->getNumber()+1)*2);
-      MBBLocations[MBB->getNumber()] = getCurrentPCValue();
-      if (MBB->hasAddressTaken())
-        TheJIT->addPointerToBasicBlock(MBB->getBasicBlock(),
-                                       (void*)getCurrentPCValue());
-      DEBUG(dbgs() << "JIT: Emitting BB" << MBB->getNumber() << " at ["
-                   << (void*) getCurrentPCValue() << "]\n");
-    }
-
-    uintptr_t getConstantPoolEntryAddress(unsigned Entry) const override;
-    uintptr_t getJumpTableEntryAddress(unsigned Entry) const override;
-
-    uintptr_t
-    getMachineBasicBlockAddress(MachineBasicBlock *MBB) const override {
-      assert(MBBLocations.size() > (unsigned)MBB->getNumber() &&
-             MBBLocations[MBB->getNumber()] && "MBB not emitted!");
-      return MBBLocations[MBB->getNumber()];
-    }
-
-    /// retryWithMoreMemory - Log a retry and deallocate all memory for the
-    /// given function.  Increase the minimum allocation size so that we get
-    /// more memory next time.
-    void retryWithMoreMemory(MachineFunction &F);
-
-    /// deallocateMemForFunction - Deallocate all memory for the specified
-    /// function body.
-    void deallocateMemForFunction(const Function *F);
-
-    void processDebugLoc(DebugLoc DL, bool BeforePrintingInsn) override;
-
-    void emitLabel(MCSymbol *Label) override {
-      LabelLocations[Label] = getCurrentPCValue();
-    }
-
-    DenseMap<MCSymbol*, uintptr_t> *getLabelLocations() override {
-      return &LabelLocations;
-    }
-
-    uintptr_t getLabelAddress(MCSymbol *Label) const override {
-      assert(LabelLocations.count(Label) && "Label not emitted!");
-      return LabelLocations.find(Label)->second;
-    }
-
-    void setModuleInfo(MachineModuleInfo* Info) override {
-      MMI = Info;
-    }
-
-  private:
-    void *getPointerToGlobal(GlobalValue *GV, void *Reference,
-                             bool MayNeedFarStub);
-    void *getPointerToGVIndirectSym(GlobalValue *V, void *Reference);
-  };
-}
-
-void CallSiteValueMapConfig::onDelete(JITResolverState *JRS, Function *F) {
-  JRS->EraseAllCallSitesForPrelocked(F);
-}
-
-void JITResolverState::EraseAllCallSitesForPrelocked(Function *F) {
-  FunctionToCallSitesMapTy::iterator F2C = FunctionToCallSitesMap.find(F);
-  if (F2C == FunctionToCallSitesMap.end())
-    return;
-  StubToResolverMapTy &S2RMap = *StubToResolverMap;
-  for (SmallPtrSet<void*, 1>::const_iterator I = F2C->second.begin(),
-         E = F2C->second.end(); I != E; ++I) {
-    S2RMap.UnregisterStubResolver(*I);
-    bool Erased = CallSiteToFunctionMap.erase(*I);
-    (void)Erased;
-    assert(Erased && "Missing call site->function mapping");
-  }
-  FunctionToCallSitesMap.erase(F2C);
-}
-
-void JITResolverState::EraseAllCallSitesPrelocked() {
-  StubToResolverMapTy &S2RMap = *StubToResolverMap;
-  for (CallSiteToFunctionMapTy::const_iterator
-         I = CallSiteToFunctionMap.begin(),
-         E = CallSiteToFunctionMap.end(); I != E; ++I) {
-    S2RMap.UnregisterStubResolver(I->first);
-  }
-  CallSiteToFunctionMap.clear();
-  FunctionToCallSitesMap.clear();
-}
-
-JITResolver::~JITResolver() {
-  // No need to lock because we're in the destructor, and state isn't shared.
-  state.EraseAllCallSitesPrelocked();
-  assert(!StubToResolverMap->ResolverHasStubs(this) &&
-         "Resolver destroyed with stubs still alive.");
-}
-
-/// getLazyFunctionStubIfAvailable - This returns a pointer to a function stub
-/// if it has already been created.
-void *JITResolver::getLazyFunctionStubIfAvailable(Function *F) {
-  MutexGuard locked(TheJIT->lock);
-
-  // If we already have a stub for this function, recycle it.
-  return state.getFunctionToLazyStubMap().lookup(F);
-}
-
-/// getFunctionStub - This returns a pointer to a function stub, creating
-/// one on demand as needed.
-void *JITResolver::getLazyFunctionStub(Function *F) {
-  MutexGuard locked(TheJIT->lock);
-
-  // If we already have a lazy stub for this function, recycle it.
-  void *&Stub = state.getFunctionToLazyStubMap()[F];
-  if (Stub) return Stub;
-
-  // Call the lazy resolver function if we are JIT'ing lazily.  Otherwise we
-  // must resolve the symbol now.
-  void *Actual = TheJIT->isCompilingLazily()
-    ? (void *)(intptr_t)LazyResolverFn : (void *)nullptr;
-
-  // If this is an external declaration, attempt to resolve the address now
-  // to place in the stub.
-  if (isNonGhostDeclaration(F) || F->hasAvailableExternallyLinkage()) {
-    Actual = TheJIT->getPointerToFunction(F);
-
-    // If we resolved the symbol to a null address (eg. a weak external)
-    // don't emit a stub. Return a null pointer to the application.
-    if (!Actual) return nullptr;
-  }
-
-  TargetJITInfo::StubLayout SL = TheJIT->getJITInfo().getStubLayout();
-  JE.startGVStub(F, SL.Size, SL.Alignment);
-  // Codegen a new stub, calling the lazy resolver or the actual address of the
-  // external function, if it was resolved.
-  Stub = TheJIT->getJITInfo().emitFunctionStub(F, Actual, JE);
-  JE.finishGVStub();
-
-  if (Actual != (void*)(intptr_t)LazyResolverFn) {
-    // If we are getting the stub for an external function, we really want the
-    // address of the stub in the GlobalAddressMap for the JIT, not the address
-    // of the external function.
-    TheJIT->updateGlobalMapping(F, Stub);
-  }
-
-  DEBUG(dbgs() << "JIT: Lazy stub emitted at [" << Stub << "] for function '"
-        << F->getName() << "'\n");
-
-  if (TheJIT->isCompilingLazily()) {
-    // Register this JITResolver as the one corresponding to this call site so
-    // JITCompilerFn will be able to find it.
-    StubToResolverMap->RegisterStubResolver(Stub, this);
-
-    // Finally, keep track of the stub-to-Function mapping so that the
-    // JITCompilerFn knows which function to compile!
-    state.AddCallSite(Stub, F);
-  } else if (!Actual) {
-    // If we are JIT'ing non-lazily but need to call a function that does not
-    // exist yet, add it to the JIT's work list so that we can fill in the
-    // stub address later.
-    assert(!isNonGhostDeclaration(F) && !F->hasAvailableExternallyLinkage() &&
-           "'Actual' should have been set above.");
-    TheJIT->addPendingFunction(F);
-  }
-
-  return Stub;
-}
-
-/// getGlobalValueIndirectSym - Return a lazy pointer containing the specified
-/// GV address.
-void *JITResolver::getGlobalValueIndirectSym(GlobalValue *GV, void *GVAddress) {
-  MutexGuard locked(TheJIT->lock);
-
-  // If we already have a stub for this global variable, recycle it.
-  void *&IndirectSym = state.getGlobalToIndirectSymMap()[GV];
-  if (IndirectSym) return IndirectSym;
-
-  // Otherwise, codegen a new indirect symbol.
-  IndirectSym = TheJIT->getJITInfo().emitGlobalValueIndirectSym(GV, GVAddress,
-                                                                JE);
-
-  DEBUG(dbgs() << "JIT: Indirect symbol emitted at [" << IndirectSym
-        << "] for GV '" << GV->getName() << "'\n");
-
-  return IndirectSym;
-}
-
-/// getExternalFunctionStub - Return a stub for the function at the
-/// specified address, created lazily on demand.
-void *JITResolver::getExternalFunctionStub(void *FnAddr) {
-  // If we already have a stub for this function, recycle it.
-  void *&Stub = ExternalFnToStubMap[FnAddr];
-  if (Stub) return Stub;
-
-  TargetJITInfo::StubLayout SL = TheJIT->getJITInfo().getStubLayout();
-  JE.startGVStub(nullptr, SL.Size, SL.Alignment);
-  Stub = TheJIT->getJITInfo().emitFunctionStub(nullptr, FnAddr, JE);
-  JE.finishGVStub();
-
-  DEBUG(dbgs() << "JIT: Stub emitted at [" << Stub
-               << "] for external function at '" << FnAddr << "'\n");
-  return Stub;
-}
-
-unsigned JITResolver::getGOTIndexForAddr(void* addr) {
-  unsigned idx = revGOTMap[addr];
-  if (!idx) {
-    idx = ++nextGOTIndex;
-    revGOTMap[addr] = idx;
-    DEBUG(dbgs() << "JIT: Adding GOT entry " << idx << " for addr ["
-                 << addr << "]\n");
-  }
-  return idx;
-}
-
-/// JITCompilerFn - This function is called when a lazy compilation stub has
-/// been entered.  It looks up which function this stub corresponds to, compiles
-/// it if necessary, then returns the resultant function pointer.
-void *JITResolver::JITCompilerFn(void *Stub) {
-  JITResolver *JR = StubToResolverMap->getResolverFromStub(Stub);
-  assert(JR && "Unable to find the corresponding JITResolver to the call site");
-
-  Function* F = nullptr;
-  void* ActualPtr = nullptr;
-
-  {
-    // Only lock for getting the Function. The call getPointerToFunction made
-    // in this function might trigger function materializing, which requires
-    // JIT lock to be unlocked.
-    MutexGuard locked(JR->TheJIT->lock);
-
-    // The address given to us for the stub may not be exactly right, it might
-    // be a little bit after the stub.  As such, use upper_bound to find it.
-    std::pair<void*, Function*> I =
-      JR->state.LookupFunctionFromCallSite(Stub);
-    F = I.second;
-    ActualPtr = I.first;
-  }
-
-  // If we have already code generated the function, just return the address.
-  void *Result = JR->TheJIT->getPointerToGlobalIfAvailable(F);
-
-  if (!Result) {
-    // Otherwise we don't have it, do lazy compilation now.
-
-    // If lazy compilation is disabled, emit a useful error message and abort.
-    if (!JR->TheJIT->isCompilingLazily()) {
-      report_fatal_error("LLVM JIT requested to do lazy compilation of"
-                         " function '"
-                        + F->getName() + "' when lazy compiles are disabled!");
-    }
-
-    DEBUG(dbgs() << "JIT: Lazily resolving function '" << F->getName()
-          << "' In stub ptr = " << Stub << " actual ptr = "
-          << ActualPtr << "\n");
-    (void)ActualPtr;
-
-    Result = JR->TheJIT->getPointerToFunction(F);
-  }
-
-  // Reacquire the lock to update the GOT map.
-  MutexGuard locked(JR->TheJIT->lock);
-
-  // We might like to remove the call site from the CallSiteToFunction map, but
-  // we can't do that! Multiple threads could be stuck, waiting to acquire the
-  // lock above. As soon as the 1st function finishes compiling the function,
-  // the next one will be released, and needs to be able to find the function it
-  // needs to call.
-
-  // FIXME: We could rewrite all references to this stub if we knew them.
-
-  // What we will do is set the compiled function address to map to the
-  // same GOT entry as the stub so that later clients may update the GOT
-  // if they see it still using the stub address.
-  // Note: this is done so the Resolver doesn't have to manage GOT memory
-  // Do this without allocating map space if the target isn't using a GOT
-  if(JR->revGOTMap.find(Stub) != JR->revGOTMap.end())
-    JR->revGOTMap[Result] = JR->revGOTMap[Stub];
-
-  return Result;
-}
-
-//===----------------------------------------------------------------------===//
-// JITEmitter code.
-//
-
-static GlobalObject *getSimpleAliasee(Constant *C) {
-  C = C->stripPointerCasts();
-  return dyn_cast<GlobalObject>(C);
-}
-
-void *JITEmitter::getPointerToGlobal(GlobalValue *V, void *Reference,
-                                     bool MayNeedFarStub) {
-  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
-    return TheJIT->getOrEmitGlobalVariable(GV);
-
-  if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
-    // We can only handle simple cases.
-    if (GlobalValue *GV = getSimpleAliasee(GA->getAliasee()))
-      return TheJIT->getPointerToGlobal(GV);
-    return nullptr;
-  }
-
-  // If we have already compiled the function, return a pointer to its body.
-  Function *F = cast<Function>(V);
-
-  void *FnStub = Resolver.getLazyFunctionStubIfAvailable(F);
-  if (FnStub) {
-    // Return the function stub if it's already created.  We do this first so
-    // that we're returning the same address for the function as any previous
-    // call.  TODO: Yes, this is wrong. The lazy stub isn't guaranteed to be
-    // close enough to call.
-    return FnStub;
-  }
-
-  // If we know the target can handle arbitrary-distance calls, try to
-  // return a direct pointer.
-  if (!MayNeedFarStub) {
-    // If we have code, go ahead and return that.
-    void *ResultPtr = TheJIT->getPointerToGlobalIfAvailable(F);
-    if (ResultPtr) return ResultPtr;
-
-    // If this is an external function pointer, we can force the JIT to
-    // 'compile' it, which really just adds it to the map.
-    if (isNonGhostDeclaration(F) || F->hasAvailableExternallyLinkage())
-      return TheJIT->getPointerToFunction(F);
-  }
-
-  // Otherwise, we may need a to emit a stub, and, conservatively, we always do
-  // so.  Note that it's possible to return null from getLazyFunctionStub in the
-  // case of a weak extern that fails to resolve.
-  return Resolver.getLazyFunctionStub(F);
-}
-
-void *JITEmitter::getPointerToGVIndirectSym(GlobalValue *V, void *Reference) {
-  // Make sure GV is emitted first, and create a stub containing the fully
-  // resolved address.
-  void *GVAddress = getPointerToGlobal(V, Reference, false);
-  void *StubAddr = Resolver.getGlobalValueIndirectSym(V, GVAddress);
-  return StubAddr;
-}
-
-void JITEmitter::processDebugLoc(DebugLoc DL, bool BeforePrintingInsn) {
-  if (DL.isUnknown()) return;
-  if (!BeforePrintingInsn) return;
-
-  const LLVMContext &Context = EmissionDetails.MF->getFunction()->getContext();
-
-  if (DL.getScope(Context) != nullptr && PrevDL != DL) {
-    JITEvent_EmittedFunctionDetails::LineStart NextLine;
-    NextLine.Address = getCurrentPCValue();
-    NextLine.Loc = DL;
-    EmissionDetails.LineStarts.push_back(NextLine);
-  }
-
-  PrevDL = DL;
-}
-
-static unsigned GetConstantPoolSizeInBytes(MachineConstantPool *MCP,
-                                           const DataLayout *TD) {
-  const std::vector<MachineConstantPoolEntry> &Constants = MCP->getConstants();
-  if (Constants.empty()) return 0;
-
-  unsigned Size = 0;
-  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
-    MachineConstantPoolEntry CPE = Constants[i];
-    unsigned AlignMask = CPE.getAlignment() - 1;
-    Size = (Size + AlignMask) & ~AlignMask;
-    Type *Ty = CPE.getType();
-    Size += TD->getTypeAllocSize(Ty);
-  }
-  return Size;
-}
-
-void JITEmitter::startFunction(MachineFunction &F) {
-  DEBUG(dbgs() << "JIT: Starting CodeGen of Function "
-        << F.getName() << "\n");
-
-  uintptr_t ActualSize = 0;
-  // Set the memory writable, if it's not already
-  MemMgr->setMemoryWritable();
-
-  if (SizeEstimate > 0) {
-    // SizeEstimate will be non-zero on reallocation attempts.
-    ActualSize = SizeEstimate;
-  }
-
-  BufferBegin = CurBufferPtr = MemMgr->startFunctionBody(F.getFunction(),
-                                                         ActualSize);
-  BufferEnd = BufferBegin+ActualSize;
-  EmittedFunctions[F.getFunction()].FunctionBody = BufferBegin;
-
-  // Ensure the constant pool/jump table info is at least 4-byte aligned.
-  emitAlignment(16);
-
-  emitConstantPool(F.getConstantPool());
-  if (MachineJumpTableInfo *MJTI = F.getJumpTableInfo())
-    initJumpTableInfo(MJTI);
-
-  // About to start emitting the machine code for the function.
-  emitAlignment(std::max(F.getFunction()->getAlignment(), 8U));
-  TheJIT->updateGlobalMapping(F.getFunction(), CurBufferPtr);
-  EmittedFunctions[F.getFunction()].Code = CurBufferPtr;
-
-  MBBLocations.clear();
-
-  EmissionDetails.MF = &F;
-  EmissionDetails.LineStarts.clear();
-}
-
-bool JITEmitter::finishFunction(MachineFunction &F) {
-  if (CurBufferPtr == BufferEnd) {
-    // We must call endFunctionBody before retrying, because
-    // deallocateMemForFunction requires it.
-    MemMgr->endFunctionBody(F.getFunction(), BufferBegin, CurBufferPtr);
-    retryWithMoreMemory(F);
-    return true;
-  }
-
-  if (MachineJumpTableInfo *MJTI = F.getJumpTableInfo())
-    emitJumpTableInfo(MJTI);
-
-  // FnStart is the start of the text, not the start of the constant pool and
-  // other per-function data.
-  uint8_t *FnStart =
-    (uint8_t *)TheJIT->getPointerToGlobalIfAvailable(F.getFunction());
-
-  // FnEnd is the end of the function's machine code.
-  uint8_t *FnEnd = CurBufferPtr;
-
-  if (!Relocations.empty()) {
-    CurFn = F.getFunction();
-    NumRelos += Relocations.size();
-
-    // Resolve the relocations to concrete pointers.
-    for (unsigned i = 0, e = Relocations.size(); i != e; ++i) {
-      MachineRelocation &MR = Relocations[i];
-      void *ResultPtr = nullptr;
-      if (!MR.letTargetResolve()) {
-        if (MR.isExternalSymbol()) {
-          ResultPtr = TheJIT->getPointerToNamedFunction(MR.getExternalSymbol(),
-                                                        false);
-          DEBUG(dbgs() << "JIT: Map \'" << MR.getExternalSymbol() << "\' to ["
-                       << ResultPtr << "]\n");
-
-          // If the target REALLY wants a stub for this function, emit it now.
-          if (MR.mayNeedFarStub()) {
-            ResultPtr = Resolver.getExternalFunctionStub(ResultPtr);
-          }
-        } else if (MR.isGlobalValue()) {
-          ResultPtr = getPointerToGlobal(MR.getGlobalValue(),
-                                         BufferBegin+MR.getMachineCodeOffset(),
-                                         MR.mayNeedFarStub());
-        } else if (MR.isIndirectSymbol()) {
-          ResultPtr = getPointerToGVIndirectSym(
-              MR.getGlobalValue(), BufferBegin+MR.getMachineCodeOffset());
-        } else if (MR.isBasicBlock()) {
-          ResultPtr = (void*)getMachineBasicBlockAddress(MR.getBasicBlock());
-        } else if (MR.isConstantPoolIndex()) {
-          ResultPtr =
-            (void*)getConstantPoolEntryAddress(MR.getConstantPoolIndex());
-        } else {
-          assert(MR.isJumpTableIndex());
-          ResultPtr=(void*)getJumpTableEntryAddress(MR.getJumpTableIndex());
-        }
-
-        MR.setResultPointer(ResultPtr);
-      }
-
-      // if we are managing the GOT and the relocation wants an index,
-      // give it one
-      if (MR.isGOTRelative() && MemMgr->isManagingGOT()) {
-        unsigned idx = Resolver.getGOTIndexForAddr(ResultPtr);
-        MR.setGOTIndex(idx);
-        if (((void**)MemMgr->getGOTBase())[idx] != ResultPtr) {
-          DEBUG(dbgs() << "JIT: GOT was out of date for " << ResultPtr
-                       << " pointing at " << ((void**)MemMgr->getGOTBase())[idx]
-                       << "\n");
-          ((void**)MemMgr->getGOTBase())[idx] = ResultPtr;
-        }
-      }
-    }
-
-    CurFn = nullptr;
-    TheJIT->getJITInfo().relocate(BufferBegin, &Relocations[0],
-                                  Relocations.size(), MemMgr->getGOTBase());
-  }
-
-  // Update the GOT entry for F to point to the new code.
-  if (MemMgr->isManagingGOT()) {
-    unsigned idx = Resolver.getGOTIndexForAddr((void*)BufferBegin);
-    if (((void**)MemMgr->getGOTBase())[idx] != (void*)BufferBegin) {
-      DEBUG(dbgs() << "JIT: GOT was out of date for " << (void*)BufferBegin
-                   << " pointing at " << ((void**)MemMgr->getGOTBase())[idx]
-                   << "\n");
-      ((void**)MemMgr->getGOTBase())[idx] = (void*)BufferBegin;
-    }
-  }
-
-  // CurBufferPtr may have moved beyond FnEnd, due to memory allocation for
-  // global variables that were referenced in the relocations.
-  MemMgr->endFunctionBody(F.getFunction(), BufferBegin, CurBufferPtr);
-
-  if (CurBufferPtr == BufferEnd) {
-    retryWithMoreMemory(F);
-    return true;
-  } else {
-    // Now that we've succeeded in emitting the function, reset the
-    // SizeEstimate back down to zero.
-    SizeEstimate = 0;
-  }
-
-  BufferBegin = CurBufferPtr = nullptr;
-  NumBytes += FnEnd-FnStart;
-
-  // Invalidate the icache if necessary.
-  sys::Memory::InvalidateInstructionCache(FnStart, FnEnd-FnStart);
-
-  TheJIT->NotifyFunctionEmitted(*F.getFunction(), FnStart, FnEnd-FnStart,
-                                EmissionDetails);
-
-  // Reset the previous debug location.
-  PrevDL = DebugLoc();
-
-  DEBUG(dbgs() << "JIT: Finished CodeGen of [" << (void*)FnStart
-        << "] Function: " << F.getName()
-        << ": " << (FnEnd-FnStart) << " bytes of text, "
-        << Relocations.size() << " relocations\n");
-
-  Relocations.clear();
-  ConstPoolAddresses.clear();
-
-  // Mark code region readable and executable if it's not so already.
-  MemMgr->setMemoryExecutable();
-
-  DEBUG({
-      if (sys::hasDisassembler()) {
-        dbgs() << "JIT: Disassembled code:\n";
-        dbgs() << sys::disassembleBuffer(FnStart, FnEnd-FnStart,
-                                         (uintptr_t)FnStart);
-      } else {
-        dbgs() << "JIT: Binary code:\n";
-        uint8_t* q = FnStart;
-        for (int i = 0; q < FnEnd; q += 4, ++i) {
-          if (i == 4)
-            i = 0;
-          if (i == 0)
-            dbgs() << "JIT: " << (long)(q - FnStart) << ": ";
-          bool Done = false;
-          for (int j = 3; j >= 0; --j) {
-            if (q + j >= FnEnd)
-              Done = true;
-            else
-              dbgs() << (unsigned short)q[j];
-          }
-          if (Done)
-            break;
-          dbgs() << ' ';
-          if (i == 3)
-            dbgs() << '\n';
-        }
-        dbgs()<< '\n';
-      }
-    });
-
-  if (MMI)
-    MMI->EndFunction();
-
-  return false;
-}
-
-void JITEmitter::retryWithMoreMemory(MachineFunction &F) {
-  DEBUG(dbgs() << "JIT: Ran out of space for native code.  Reattempting.\n");
-  Relocations.clear();  // Clear the old relocations or we'll reapply them.
-  ConstPoolAddresses.clear();
-  ++NumRetries;
-  deallocateMemForFunction(F.getFunction());
-  // Try again with at least twice as much free space.
-  SizeEstimate = (uintptr_t)(2 * (BufferEnd - BufferBegin));
-
-  for (MachineFunction::iterator MBB = F.begin(), E = F.end(); MBB != E; ++MBB){
-    if (MBB->hasAddressTaken())
-      TheJIT->clearPointerToBasicBlock(MBB->getBasicBlock());
-  }
-}
-
-/// deallocateMemForFunction - Deallocate all memory for the specified
-/// function body.  Also drop any references the function has to stubs.
-/// May be called while the Function is being destroyed inside ~Value().
-void JITEmitter::deallocateMemForFunction(const Function *F) {
-  ValueMap<const Function *, EmittedCode, EmittedFunctionConfig>::iterator
-    Emitted = EmittedFunctions.find(F);
-  if (Emitted != EmittedFunctions.end()) {
-    MemMgr->deallocateFunctionBody(Emitted->second.FunctionBody);
-    TheJIT->NotifyFreeingMachineCode(Emitted->second.Code);
-
-    EmittedFunctions.erase(Emitted);
-  }
-}
-
-
-void *JITEmitter::allocateSpace(uintptr_t Size, unsigned Alignment) {
-  if (BufferBegin)
-    return JITCodeEmitter::allocateSpace(Size, Alignment);
-
-  // create a new memory block if there is no active one.
-  // care must be taken so that BufferBegin is invalidated when a
-  // block is trimmed
-  BufferBegin = CurBufferPtr = MemMgr->allocateSpace(Size, Alignment);
-  BufferEnd = BufferBegin+Size;
-  return CurBufferPtr;
-}
-
-void *JITEmitter::allocateGlobal(uintptr_t Size, unsigned Alignment) {
-  // Delegate this call through the memory manager.
-  return MemMgr->allocateGlobal(Size, Alignment);
-}
-
-void JITEmitter::emitConstantPool(MachineConstantPool *MCP) {
-  if (TheJIT->getJITInfo().hasCustomConstantPool())
-    return;
-
-  const std::vector<MachineConstantPoolEntry> &Constants = MCP->getConstants();
-  if (Constants.empty()) return;
-
-  unsigned Size = GetConstantPoolSizeInBytes(MCP, TheJIT->getDataLayout());
-  unsigned Align = MCP->getConstantPoolAlignment();
-  ConstantPoolBase = allocateSpace(Size, Align);
-  ConstantPool = MCP;
-
-  if (!ConstantPoolBase) return;  // Buffer overflow.
-
-  DEBUG(dbgs() << "JIT: Emitted constant pool at [" << ConstantPoolBase
-               << "] (size: " << Size << ", alignment: " << Align << ")\n");
-
-  // Initialize the memory for all of the constant pool entries.
-  unsigned Offset = 0;
-  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
-    MachineConstantPoolEntry CPE = Constants[i];
-    unsigned AlignMask = CPE.getAlignment() - 1;
-    Offset = (Offset + AlignMask) & ~AlignMask;
-
-    uintptr_t CAddr = (uintptr_t)ConstantPoolBase + Offset;
-    ConstPoolAddresses.push_back(CAddr);
-    if (CPE.isMachineConstantPoolEntry()) {
-      // FIXME: add support to lower machine constant pool values into bytes!
-      report_fatal_error("Initialize memory with machine specific constant pool"
-                        "entry has not been implemented!");
-    }
-    TheJIT->InitializeMemory(CPE.Val.ConstVal, (void*)CAddr);
-    DEBUG(dbgs() << "JIT:   CP" << i << " at [0x";
-          dbgs().write_hex(CAddr) << "]\n");
-
-    Type *Ty = CPE.Val.ConstVal->getType();
-    Offset += TheJIT->getDataLayout()->getTypeAllocSize(Ty);
-  }
-}
-
-void JITEmitter::initJumpTableInfo(MachineJumpTableInfo *MJTI) {
-  if (TheJIT->getJITInfo().hasCustomJumpTables())
-    return;
-  if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline)
-    return;
-
-  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
-  if (JT.empty()) return;
-
-  unsigned NumEntries = 0;
-  for (unsigned i = 0, e = JT.size(); i != e; ++i)
-    NumEntries += JT[i].MBBs.size();
-
-  unsigned EntrySize = MJTI->getEntrySize(*TheJIT->getDataLayout());
-
-  // Just allocate space for all the jump tables now.  We will fix up the actual
-  // MBB entries in the tables after we emit the code for each block, since then
-  // we will know the final locations of the MBBs in memory.
-  JumpTable = MJTI;
-  JumpTableBase = allocateSpace(NumEntries * EntrySize,
-                             MJTI->getEntryAlignment(*TheJIT->getDataLayout()));
-}
-
-void JITEmitter::emitJumpTableInfo(MachineJumpTableInfo *MJTI) {
-  if (TheJIT->getJITInfo().hasCustomJumpTables())
-    return;
-
-  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
-  if (JT.empty() || !JumpTableBase) return;
-
-
-  switch (MJTI->getEntryKind()) {
-  case MachineJumpTableInfo::EK_Inline:
-    return;
-  case MachineJumpTableInfo::EK_BlockAddress: {
-    // EK_BlockAddress - Each entry is a plain address of block, e.g.:
-    //     .word LBB123
-    assert(MJTI->getEntrySize(*TheJIT->getDataLayout()) == sizeof(void*) &&
-           "Cross JIT'ing?");
-
-    // For each jump table, map each target in the jump table to the address of
-    // an emitted MachineBasicBlock.
-    intptr_t *SlotPtr = (intptr_t*)JumpTableBase;
-
-    for (unsigned i = 0, e = JT.size(); i != e; ++i) {
-      const std::vector<MachineBasicBlock*> &MBBs = JT[i].MBBs;
-      // Store the address of the basic block for this jump table slot in the
-      // memory we allocated for the jump table in 'initJumpTableInfo'
-      for (unsigned mi = 0, me = MBBs.size(); mi != me; ++mi)
-        *SlotPtr++ = getMachineBasicBlockAddress(MBBs[mi]);
-    }
-    break;
-  }
-
-  case MachineJumpTableInfo::EK_Custom32:
-  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
-  case MachineJumpTableInfo::EK_LabelDifference32: {
-    assert(MJTI->getEntrySize(*TheJIT->getDataLayout()) == 4&&"Cross JIT'ing?");
-    // For each jump table, place the offset from the beginning of the table
-    // to the target address.
-    int *SlotPtr = (int*)JumpTableBase;
-
-    for (unsigned i = 0, e = JT.size(); i != e; ++i) {
-      const std::vector<MachineBasicBlock*> &MBBs = JT[i].MBBs;
-      // Store the offset of the basic block for this jump table slot in the
-      // memory we allocated for the jump table in 'initJumpTableInfo'
-      uintptr_t Base = (uintptr_t)SlotPtr;
-      for (unsigned mi = 0, me = MBBs.size(); mi != me; ++mi) {
-        uintptr_t MBBAddr = getMachineBasicBlockAddress(MBBs[mi]);
-        /// FIXME: USe EntryKind instead of magic "getPICJumpTableEntry" hook.
-        *SlotPtr++ = TheJIT->getJITInfo().getPICJumpTableEntry(MBBAddr, Base);
-      }
-    }
-    break;
-  }
-  case MachineJumpTableInfo::EK_GPRel64BlockAddress:
-    llvm_unreachable(
-           "JT Info emission not implemented for GPRel64BlockAddress yet.");
-  }
-}
-
-void JITEmitter::startGVStub(const GlobalValue* GV,
-                             unsigned StubSize, unsigned Alignment) {
-  SavedBufferBegin = BufferBegin;
-  SavedBufferEnd = BufferEnd;
-  SavedCurBufferPtr = CurBufferPtr;
-
-  BufferBegin = CurBufferPtr = MemMgr->allocateStub(GV, StubSize, Alignment);
-  BufferEnd = BufferBegin+StubSize+1;
-}
-
-void JITEmitter::startGVStub(void *Buffer, unsigned StubSize) {
-  SavedBufferBegin = BufferBegin;
-  SavedBufferEnd = BufferEnd;
-  SavedCurBufferPtr = CurBufferPtr;
-
-  BufferBegin = CurBufferPtr = (uint8_t *)Buffer;
-  BufferEnd = BufferBegin+StubSize+1;
-}
-
-void JITEmitter::finishGVStub() {
-  assert(CurBufferPtr != BufferEnd && "Stub overflowed allocated space.");
-  NumBytes += getCurrentPCOffset();
-  BufferBegin = SavedBufferBegin;
-  BufferEnd = SavedBufferEnd;
-  CurBufferPtr = SavedCurBufferPtr;
-}
-
-void *JITEmitter::allocIndirectGV(const GlobalValue *GV,
-                                  const uint8_t *Buffer, size_t Size,
-                                  unsigned Alignment) {
-  uint8_t *IndGV = MemMgr->allocateStub(GV, Size, Alignment);
-  memcpy(IndGV, Buffer, Size);
-  return IndGV;
-}
-
-// getConstantPoolEntryAddress - Return the address of the 'ConstantNum' entry
-// in the constant pool that was last emitted with the 'emitConstantPool'
-// method.
-//
-uintptr_t JITEmitter::getConstantPoolEntryAddress(unsigned ConstantNum) const {
-  assert(ConstantNum < ConstantPool->getConstants().size() &&
-         "Invalid ConstantPoolIndex!");
-  return ConstPoolAddresses[ConstantNum];
-}
-
-// getJumpTableEntryAddress - Return the address of the JumpTable with index
-// 'Index' in the jumpp table that was last initialized with 'initJumpTableInfo'
-//
-uintptr_t JITEmitter::getJumpTableEntryAddress(unsigned Index) const {
-  const std::vector<MachineJumpTableEntry> &JT = JumpTable->getJumpTables();
-  assert(Index < JT.size() && "Invalid jump table index!");
-
-  unsigned EntrySize = JumpTable->getEntrySize(*TheJIT->getDataLayout());
-
-  unsigned Offset = 0;
-  for (unsigned i = 0; i < Index; ++i)
-    Offset += JT[i].MBBs.size();
-
-   Offset *= EntrySize;
-
-  return (uintptr_t)((char *)JumpTableBase + Offset);
-}
-
-void JITEmitter::EmittedFunctionConfig::onDelete(
-  JITEmitter *Emitter, const Function *F) {
-  Emitter->deallocateMemForFunction(F);
-}
-void JITEmitter::EmittedFunctionConfig::onRAUW(
-  JITEmitter *, const Function*, const Function*) {
-  llvm_unreachable("The JIT doesn't know how to handle a"
-                   " RAUW on a value it has emitted.");
-}
-
-
-//===----------------------------------------------------------------------===//
-//  Public interface to this file
-//===----------------------------------------------------------------------===//
-
-JITCodeEmitter *JIT::createEmitter(JIT &jit, JITMemoryManager *JMM,
-                                   TargetMachine &tm) {
-  return new JITEmitter(jit, JMM, tm);
-}
-
-// getPointerToFunctionOrStub - If the specified function has been
-// code-gen'd, return a pointer to the function.  If not, compile it, or use
-// a stub to implement lazy compilation if available.
-//
-void *JIT::getPointerToFunctionOrStub(Function *F) {
-  // If we have already code generated the function, just return the address.
-  if (void *Addr = getPointerToGlobalIfAvailable(F))
-    return Addr;
-
-  // Get a stub if the target supports it.
-  JITEmitter *JE = static_cast<JITEmitter*>(getCodeEmitter());
-  return JE->getJITResolver().getLazyFunctionStub(F);
-}
-
-void JIT::updateFunctionStubUnlocked(Function *F) {
-  // Get the empty stub we generated earlier.
-  JITEmitter *JE = static_cast<JITEmitter*>(getCodeEmitter());
-  void *Stub = JE->getJITResolver().getLazyFunctionStub(F);
-  void *Addr = getPointerToGlobalIfAvailable(F);
-  assert(Addr != Stub && "Function must have non-stub address to be updated.");
-
-  // Tell the target jit info to rewrite the stub at the specified address,
-  // rather than creating a new one.
-  TargetJITInfo::StubLayout layout = getJITInfo().getStubLayout();
-  JE->startGVStub(Stub, layout.Size);
-  getJITInfo().emitFunctionStub(F, Addr, *getCodeEmitter());
-  JE->finishGVStub();
-}
-
-/// freeMachineCodeForFunction - release machine code memory for given Function.
-///
-void JIT::freeMachineCodeForFunction(Function *F) {
-  // Delete translation for this from the ExecutionEngine, so it will get
-  // retranslated next time it is used.
-  updateGlobalMapping(F, nullptr);
-
-  // Free the actual memory for the function body and related stuff.
-  static_cast<JITEmitter*>(JCE)->deallocateMemForFunction(F);
-}

diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
deleted file mode 100644
index 584b93f..0000000
--- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
+++ /dev/null

@@ -1,904 +0,0 @@
-//===-- JITMemoryManager.cpp - Memory Allocator for JIT'd code ------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the DefaultJITMemoryManager class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Config/config.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Memory.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <climits>
-#include <cstring>
-#include <vector>
-
-#if defined(__linux__)
-#if defined(HAVE_SYS_STAT_H)
-#include <sys/stat.h>
-#endif
-#include <fcntl.h>
-#include <unistd.h>
-#endif
-
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-STATISTIC(NumSlabs, "Number of slabs of memory allocated by the JIT");
-
-JITMemoryManager::~JITMemoryManager() {}
-
-//===----------------------------------------------------------------------===//
-// Memory Block Implementation.
-//===----------------------------------------------------------------------===//
-
-namespace {
-  /// MemoryRangeHeader - For a range of memory, this is the header that we put
-  /// on the block of memory.  It is carefully crafted to be one word of memory.
-  /// Allocated blocks have just this header, free'd blocks have FreeRangeHeader
-  /// which starts with this.
-  struct FreeRangeHeader;
-  struct MemoryRangeHeader {
-    /// ThisAllocated - This is true if this block is currently allocated.  If
-    /// not, this can be converted to a FreeRangeHeader.
-    unsigned ThisAllocated : 1;
-
-    /// PrevAllocated - Keep track of whether the block immediately before us is
-    /// allocated.  If not, the word immediately before this header is the size
-    /// of the previous block.
-    unsigned PrevAllocated : 1;
-
-    /// BlockSize - This is the size in bytes of this memory block,
-    /// including this header.
-    uintptr_t BlockSize : (sizeof(intptr_t)*CHAR_BIT - 2);
-
-
-    /// getBlockAfter - Return the memory block immediately after this one.
-    ///
-    MemoryRangeHeader &getBlockAfter() const {
-      return *reinterpret_cast<MemoryRangeHeader *>(
-                reinterpret_cast<char*>(
-                  const_cast<MemoryRangeHeader *>(this))+BlockSize);
-    }
-
-    /// getFreeBlockBefore - If the block before this one is free, return it,
-    /// otherwise return null.
-    FreeRangeHeader *getFreeBlockBefore() const {
-      if (PrevAllocated) return nullptr;
-      intptr_t PrevSize = reinterpret_cast<intptr_t *>(
-                            const_cast<MemoryRangeHeader *>(this))[-1];
-      return reinterpret_cast<FreeRangeHeader *>(
-               reinterpret_cast<char*>(
-                 const_cast<MemoryRangeHeader *>(this))-PrevSize);
-    }
-
-    /// FreeBlock - Turn an allocated block into a free block, adjusting
-    /// bits in the object headers, and adding an end of region memory block.
-    FreeRangeHeader *FreeBlock(FreeRangeHeader *FreeList);
-
-    /// TrimAllocationToSize - If this allocated block is significantly larger
-    /// than NewSize, split it into two pieces (where the former is NewSize
-    /// bytes, including the header), and add the new block to the free list.
-    FreeRangeHeader *TrimAllocationToSize(FreeRangeHeader *FreeList,
-                                          uint64_t NewSize);
-  };
-
-  /// FreeRangeHeader - For a memory block that isn't already allocated, this
-  /// keeps track of the current block and has a pointer to the next free block.
-  /// Free blocks are kept on a circularly linked list.
-  struct FreeRangeHeader : public MemoryRangeHeader {
-    FreeRangeHeader *Prev;
-    FreeRangeHeader *Next;
-
-    /// getMinBlockSize - Get the minimum size for a memory block.  Blocks
-    /// smaller than this size cannot be created.
-    static unsigned getMinBlockSize() {
-      return sizeof(FreeRangeHeader)+sizeof(intptr_t);
-    }
-
-    /// SetEndOfBlockSizeMarker - The word at the end of every free block is
-    /// known to be the size of the free block.  Set it for this block.
-    void SetEndOfBlockSizeMarker() {
-      void *EndOfBlock = (char*)this + BlockSize;
-      ((intptr_t *)EndOfBlock)[-1] = BlockSize;
-    }
-
-    FreeRangeHeader *RemoveFromFreeList() {
-      assert(Next->Prev == this && Prev->Next == this && "Freelist broken!");
-      Next->Prev = Prev;
-      return Prev->Next = Next;
-    }
-
-    void AddToFreeList(FreeRangeHeader *FreeList) {
-      Next = FreeList;
-      Prev = FreeList->Prev;
-      Prev->Next = this;
-      Next->Prev = this;
-    }
-
-    /// GrowBlock - The block after this block just got deallocated.  Merge it
-    /// into the current block.
-    void GrowBlock(uintptr_t NewSize);
-
-    /// AllocateBlock - Mark this entire block allocated, updating freelists
-    /// etc.  This returns a pointer to the circular free-list.
-    FreeRangeHeader *AllocateBlock();
-  };
-}
-
-
-/// AllocateBlock - Mark this entire block allocated, updating freelists
-/// etc.  This returns a pointer to the circular free-list.
-FreeRangeHeader *FreeRangeHeader::AllocateBlock() {
-  assert(!ThisAllocated && !getBlockAfter().PrevAllocated &&
-         "Cannot allocate an allocated block!");
-  // Mark this block allocated.
-  ThisAllocated = 1;
-  getBlockAfter().PrevAllocated = 1;
-
-  // Remove it from the free list.
-  return RemoveFromFreeList();
-}
-
-/// FreeBlock - Turn an allocated block into a free block, adjusting
-/// bits in the object headers, and adding an end of region memory block.
-/// If possible, coalesce this block with neighboring blocks.  Return the
-/// FreeRangeHeader to allocate from.
-FreeRangeHeader *MemoryRangeHeader::FreeBlock(FreeRangeHeader *FreeList) {
-  MemoryRangeHeader *FollowingBlock = &getBlockAfter();
-  assert(ThisAllocated && "This block is already free!");
-  assert(FollowingBlock->PrevAllocated && "Flags out of sync!");
-
-  FreeRangeHeader *FreeListToReturn = FreeList;
-
-  // If the block after this one is free, merge it into this block.
-  if (!FollowingBlock->ThisAllocated) {
-    FreeRangeHeader &FollowingFreeBlock = *(FreeRangeHeader *)FollowingBlock;
-    // "FreeList" always needs to be a valid free block.  If we're about to
-    // coalesce with it, update our notion of what the free list is.
-    if (&FollowingFreeBlock == FreeList) {
-      FreeList = FollowingFreeBlock.Next;
-      FreeListToReturn = nullptr;
-      assert(&FollowingFreeBlock != FreeList && "No tombstone block?");
-    }
-    FollowingFreeBlock.RemoveFromFreeList();
-
-    // Include the following block into this one.
-    BlockSize += FollowingFreeBlock.BlockSize;
-    FollowingBlock = &FollowingFreeBlock.getBlockAfter();
-
-    // Tell the block after the block we are coalescing that this block is
-    // allocated.
-    FollowingBlock->PrevAllocated = 1;
-  }
-
-  assert(FollowingBlock->ThisAllocated && "Missed coalescing?");
-
-  if (FreeRangeHeader *PrevFreeBlock = getFreeBlockBefore()) {
-    PrevFreeBlock->GrowBlock(PrevFreeBlock->BlockSize + BlockSize);
-    return FreeListToReturn ? FreeListToReturn : PrevFreeBlock;
-  }
-
-  // Otherwise, mark this block free.
-  FreeRangeHeader &FreeBlock = *(FreeRangeHeader*)this;
-  FollowingBlock->PrevAllocated = 0;
-  FreeBlock.ThisAllocated = 0;
-
-  // Link this into the linked list of free blocks.
-  FreeBlock.AddToFreeList(FreeList);
-
-  // Add a marker at the end of the block, indicating the size of this free
-  // block.
-  FreeBlock.SetEndOfBlockSizeMarker();
-  return FreeListToReturn ? FreeListToReturn : &FreeBlock;
-}
-
-/// GrowBlock - The block after this block just got deallocated.  Merge it
-/// into the current block.
-void FreeRangeHeader::GrowBlock(uintptr_t NewSize) {
-  assert(NewSize > BlockSize && "Not growing block?");
-  BlockSize = NewSize;
-  SetEndOfBlockSizeMarker();
-  getBlockAfter().PrevAllocated = 0;
-}
-
-/// TrimAllocationToSize - If this allocated block is significantly larger
-/// than NewSize, split it into two pieces (where the former is NewSize
-/// bytes, including the header), and add the new block to the free list.
-FreeRangeHeader *MemoryRangeHeader::
-TrimAllocationToSize(FreeRangeHeader *FreeList, uint64_t NewSize) {
-  assert(ThisAllocated && getBlockAfter().PrevAllocated &&
-         "Cannot deallocate part of an allocated block!");
-
-  // Don't allow blocks to be trimmed below minimum required size
-  NewSize = std::max<uint64_t>(FreeRangeHeader::getMinBlockSize(), NewSize);
-
-  // Round up size for alignment of header.
-  unsigned HeaderAlign = __alignof(FreeRangeHeader);
-  NewSize = (NewSize+ (HeaderAlign-1)) & ~(HeaderAlign-1);
-
-  // Size is now the size of the block we will remove from the start of the
-  // current block.
-  assert(NewSize <= BlockSize &&
-         "Allocating more space from this block than exists!");
-
-  // If splitting this block will cause the remainder to be too small, do not
-  // split the block.
-  if (BlockSize <= NewSize+FreeRangeHeader::getMinBlockSize())
-    return FreeList;
-
-  // Otherwise, we splice the required number of bytes out of this block, form
-  // a new block immediately after it, then mark this block allocated.
-  MemoryRangeHeader &FormerNextBlock = getBlockAfter();
-
-  // Change the size of this block.
-  BlockSize = NewSize;
-
-  // Get the new block we just sliced out and turn it into a free block.
-  FreeRangeHeader &NewNextBlock = (FreeRangeHeader &)getBlockAfter();
-  NewNextBlock.BlockSize = (char*)&FormerNextBlock - (char*)&NewNextBlock;
-  NewNextBlock.ThisAllocated = 0;
-  NewNextBlock.PrevAllocated = 1;
-  NewNextBlock.SetEndOfBlockSizeMarker();
-  FormerNextBlock.PrevAllocated = 0;
-  NewNextBlock.AddToFreeList(FreeList);
-  return &NewNextBlock;
-}
-
-//===----------------------------------------------------------------------===//
-// Memory Block Implementation.
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-  class DefaultJITMemoryManager;
-
-  class JITAllocator {
-    DefaultJITMemoryManager &JMM;
-  public:
-    JITAllocator(DefaultJITMemoryManager &jmm) : JMM(jmm) { }
-    void *Allocate(size_t Size, size_t /*Alignment*/);
-    void Deallocate(void *Slab, size_t Size);
-  };
-
-  /// DefaultJITMemoryManager - Manage memory for the JIT code generation.
-  /// This splits a large block of MAP_NORESERVE'd memory into two
-  /// sections, one for function stubs, one for the functions themselves.  We
-  /// have to do this because we may need to emit a function stub while in the
-  /// middle of emitting a function, and we don't know how large the function we
-  /// are emitting is.
-  class DefaultJITMemoryManager : public JITMemoryManager {
-  public:
-    /// DefaultCodeSlabSize - When we have to go map more memory, we allocate at
-    /// least this much unless more is requested. Currently, in 512k slabs.
-    static const size_t DefaultCodeSlabSize = 512 * 1024;
-
-    /// DefaultSlabSize - Allocate globals and stubs into slabs of 64K (probably
-    /// 16 pages) unless we get an allocation above SizeThreshold.
-    static const size_t DefaultSlabSize = 64 * 1024;
-
-    /// DefaultSizeThreshold - For any allocation larger than 16K (probably
-    /// 4 pages), we should allocate a separate slab to avoid wasted space at
-    /// the end of a normal slab.
-    static const size_t DefaultSizeThreshold = 16 * 1024;
-
-  private:
-    // Whether to poison freed memory.
-    bool PoisonMemory;
-
-    /// LastSlab - This points to the last slab allocated and is used as the
-    /// NearBlock parameter to AllocateRWX so that we can attempt to lay out all
-    /// stubs, data, and code contiguously in memory.  In general, however, this
-    /// is not possible because the NearBlock parameter is ignored on Windows
-    /// platforms and even on Unix it works on a best-effort pasis.
-    sys::MemoryBlock LastSlab;
-
-    // Memory slabs allocated by the JIT.  We refer to them as slabs so we don't
-    // confuse them with the blocks of memory described above.
-    std::vector<sys::MemoryBlock> CodeSlabs;
-    BumpPtrAllocatorImpl<JITAllocator, DefaultSlabSize,
-                         DefaultSizeThreshold> StubAllocator;
-    BumpPtrAllocatorImpl<JITAllocator, DefaultSlabSize,
-                         DefaultSizeThreshold> DataAllocator;
-
-    // Circular list of free blocks.
-    FreeRangeHeader *FreeMemoryList;
-
-    // When emitting code into a memory block, this is the block.
-    MemoryRangeHeader *CurBlock;
-
-    uint8_t *GOTBase;     // Target Specific reserved memory
-  public:
-    DefaultJITMemoryManager();
-    ~DefaultJITMemoryManager();
-
-    /// allocateNewSlab - Allocates a new MemoryBlock and remembers it as the
-    /// last slab it allocated, so that subsequent allocations follow it.
-    sys::MemoryBlock allocateNewSlab(size_t size);
-
-    /// getPointerToNamedFunction - This method returns the address of the
-    /// specified function by using the dlsym function call.
-    void *getPointerToNamedFunction(const std::string &Name,
-                                    bool AbortOnFailure = true) override;
-
-    void AllocateGOT() override;
-
-    // Testing methods.
-    bool CheckInvariants(std::string &ErrorStr) override;
-    size_t GetDefaultCodeSlabSize() override { return DefaultCodeSlabSize; }
-    size_t GetDefaultDataSlabSize() override { return DefaultSlabSize; }
-    size_t GetDefaultStubSlabSize() override { return DefaultSlabSize; }
-    unsigned GetNumCodeSlabs() override { return CodeSlabs.size(); }
-    unsigned GetNumDataSlabs() override { return DataAllocator.GetNumSlabs(); }
-    unsigned GetNumStubSlabs() override { return StubAllocator.GetNumSlabs(); }
-
-    /// startFunctionBody - When a function starts, allocate a block of free
-    /// executable memory, returning a pointer to it and its actual size.
-    uint8_t *startFunctionBody(const Function *F,
-                               uintptr_t &ActualSize) override {
-
-      FreeRangeHeader* candidateBlock = FreeMemoryList;
-      FreeRangeHeader* head = FreeMemoryList;
-      FreeRangeHeader* iter = head->Next;
-
-      uintptr_t largest = candidateBlock->BlockSize;
-
-      // Search for the largest free block
-      while (iter != head) {
-        if (iter->BlockSize > largest) {
-          largest = iter->BlockSize;
-          candidateBlock = iter;
-        }
-        iter = iter->Next;
-      }
-
-      largest = largest - sizeof(MemoryRangeHeader);
-
-      // If this block isn't big enough for the allocation desired, allocate
-      // another block of memory and add it to the free list.
-      if (largest < ActualSize ||
-          largest <= FreeRangeHeader::getMinBlockSize()) {
-        DEBUG(dbgs() << "JIT: Allocating another slab of memory for function.");
-        candidateBlock = allocateNewCodeSlab((size_t)ActualSize);
-      }
-
-      // Select this candidate block for allocation
-      CurBlock = candidateBlock;
-
-      // Allocate the entire memory block.
-      FreeMemoryList = candidateBlock->AllocateBlock();
-      ActualSize = CurBlock->BlockSize - sizeof(MemoryRangeHeader);
-      return (uint8_t *)(CurBlock + 1);
-    }
-
-    /// allocateNewCodeSlab - Helper method to allocate a new slab of code
-    /// memory from the OS and add it to the free list.  Returns the new
-    /// FreeRangeHeader at the base of the slab.
-    FreeRangeHeader *allocateNewCodeSlab(size_t MinSize) {
-      // If the user needs at least MinSize free memory, then we account for
-      // two MemoryRangeHeaders: the one in the user's block, and the one at the
-      // end of the slab.
-      size_t PaddedMin = MinSize + 2 * sizeof(MemoryRangeHeader);
-      size_t SlabSize = std::max(DefaultCodeSlabSize, PaddedMin);
-      sys::MemoryBlock B = allocateNewSlab(SlabSize);
-      CodeSlabs.push_back(B);
-      char *MemBase = (char*)(B.base());
-
-      // Put a tiny allocated block at the end of the memory chunk, so when
-      // FreeBlock calls getBlockAfter it doesn't fall off the end.
-      MemoryRangeHeader *EndBlock =
-          (MemoryRangeHeader*)(MemBase + B.size()) - 1;
-      EndBlock->ThisAllocated = 1;
-      EndBlock->PrevAllocated = 0;
-      EndBlock->BlockSize = sizeof(MemoryRangeHeader);
-
-      // Start out with a vast new block of free memory.
-      FreeRangeHeader *NewBlock = (FreeRangeHeader*)MemBase;
-      NewBlock->ThisAllocated = 0;
-      // Make sure getFreeBlockBefore doesn't look into unmapped memory.
-      NewBlock->PrevAllocated = 1;
-      NewBlock->BlockSize = (uintptr_t)EndBlock - (uintptr_t)NewBlock;
-      NewBlock->SetEndOfBlockSizeMarker();
-      NewBlock->AddToFreeList(FreeMemoryList);
-
-      assert(NewBlock->BlockSize - sizeof(MemoryRangeHeader) >= MinSize &&
-             "The block was too small!");
-      return NewBlock;
-    }
-
-    /// endFunctionBody - The function F is now allocated, and takes the memory
-    /// in the range [FunctionStart,FunctionEnd).
-    void endFunctionBody(const Function *F, uint8_t *FunctionStart,
-                         uint8_t *FunctionEnd) override {
-      assert(FunctionEnd > FunctionStart);
-      assert(FunctionStart == (uint8_t *)(CurBlock+1) &&
-             "Mismatched function start/end!");
-
-      uintptr_t BlockSize = FunctionEnd - (uint8_t *)CurBlock;
-
-      // Release the memory at the end of this block that isn't needed.
-      FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize);
-    }
-
-    /// allocateSpace - Allocate a memory block of the given size.  This method
-    /// cannot be called between calls to startFunctionBody and endFunctionBody.
-    uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) override {
-      CurBlock = FreeMemoryList;
-      FreeMemoryList = FreeMemoryList->AllocateBlock();
-
-      uint8_t *result = (uint8_t *)(CurBlock + 1);
-
-      if (Alignment == 0) Alignment = 1;
-      result = (uint8_t*)(((intptr_t)result+Alignment-1) &
-               ~(intptr_t)(Alignment-1));
-
-      uintptr_t BlockSize = result + Size - (uint8_t *)CurBlock;
-      FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize);
-
-      return result;
-    }
-
-    /// allocateStub - Allocate memory for a function stub.
-    uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
-                          unsigned Alignment) override {
-      return (uint8_t*)StubAllocator.Allocate(StubSize, Alignment);
-    }
-
-    /// allocateGlobal - Allocate memory for a global.
-    uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment) override {
-      return (uint8_t*)DataAllocator.Allocate(Size, Alignment);
-    }
-
-    /// allocateCodeSection - Allocate memory for a code section.
-    uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID,
-                                 StringRef SectionName) override {
-      // Grow the required block size to account for the block header
-      Size += sizeof(*CurBlock);
-
-      // Alignment handling.
-      if (!Alignment)
-        Alignment = 16;
-      Size += Alignment - 1;
-
-      FreeRangeHeader* candidateBlock = FreeMemoryList;
-      FreeRangeHeader* head = FreeMemoryList;
-      FreeRangeHeader* iter = head->Next;
-
-      uintptr_t largest = candidateBlock->BlockSize;
-
-      // Search for the largest free block.
-      while (iter != head) {
-        if (iter->BlockSize > largest) {
-          largest = iter->BlockSize;
-          candidateBlock = iter;
-        }
-        iter = iter->Next;
-      }
-
-      largest = largest - sizeof(MemoryRangeHeader);
-
-      // If this block isn't big enough for the allocation desired, allocate
-      // another block of memory and add it to the free list.
-      if (largest < Size || largest <= FreeRangeHeader::getMinBlockSize()) {
-        DEBUG(dbgs() << "JIT: Allocating another slab of memory for function.");
-        candidateBlock = allocateNewCodeSlab((size_t)Size);
-      }
-
-      // Select this candidate block for allocation
-      CurBlock = candidateBlock;
-
-      // Allocate the entire memory block.
-      FreeMemoryList = candidateBlock->AllocateBlock();
-      // Release the memory at the end of this block that isn't needed.
-      FreeMemoryList = CurBlock->TrimAllocationToSize(FreeMemoryList, Size);
-      uintptr_t unalignedAddr = (uintptr_t)CurBlock + sizeof(*CurBlock);
-      return (uint8_t*)RoundUpToAlignment((uint64_t)unalignedAddr, Alignment);
-    }
-
-    /// allocateDataSection - Allocate memory for a data section.
-    uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID, StringRef SectionName,
-                                 bool IsReadOnly) override {
-      return (uint8_t*)DataAllocator.Allocate(Size, Alignment);
-    }
-
-    bool finalizeMemory(std::string *ErrMsg) override {
-      return false;
-    }
-
-    uint8_t *getGOTBase() const override {
-      return GOTBase;
-    }
-
-    void deallocateBlock(void *Block) {
-      // Find the block that is allocated for this function.
-      MemoryRangeHeader *MemRange = static_cast<MemoryRangeHeader*>(Block) - 1;
-      assert(MemRange->ThisAllocated && "Block isn't allocated!");
-
-      // Fill the buffer with garbage!
-      if (PoisonMemory) {
-        memset(MemRange+1, 0xCD, MemRange->BlockSize-sizeof(*MemRange));
-      }
-
-      // Free the memory.
-      FreeMemoryList = MemRange->FreeBlock(FreeMemoryList);
-    }
-
-    /// deallocateFunctionBody - Deallocate all memory for the specified
-    /// function body.
-    void deallocateFunctionBody(void *Body) override {
-      if (Body) deallocateBlock(Body);
-    }
-
-    /// setMemoryWritable - When code generation is in progress,
-    /// the code pages may need permissions changed.
-    void setMemoryWritable() override {
-      for (unsigned i = 0, e = CodeSlabs.size(); i != e; ++i)
-        sys::Memory::setWritable(CodeSlabs[i]);
-    }
-    /// setMemoryExecutable - When code generation is done and we're ready to
-    /// start execution, the code pages may need permissions changed.
-    void setMemoryExecutable() override {
-      for (unsigned i = 0, e = CodeSlabs.size(); i != e; ++i)
-        sys::Memory::setExecutable(CodeSlabs[i]);
-    }
-
-    /// setPoisonMemory - Controls whether we write garbage over freed memory.
-    ///
-    void setPoisonMemory(bool poison) override {
-      PoisonMemory = poison;
-    }
-  };
-}
-
-void *JITAllocator::Allocate(size_t Size, size_t /*Alignment*/) {
-  sys::MemoryBlock B = JMM.allocateNewSlab(Size);
-  return B.base();
-}
-
-void JITAllocator::Deallocate(void *Slab, size_t Size) {
-  sys::MemoryBlock B(Slab, Size);
-  sys::Memory::ReleaseRWX(B);
-}
-
-DefaultJITMemoryManager::DefaultJITMemoryManager()
-    :
-#ifdef NDEBUG
-      PoisonMemory(false),
-#else
-      PoisonMemory(true),
-#endif
-      LastSlab(nullptr, 0), StubAllocator(*this), DataAllocator(*this) {
-
-  // Allocate space for code.
-  sys::MemoryBlock MemBlock = allocateNewSlab(DefaultCodeSlabSize);
-  CodeSlabs.push_back(MemBlock);
-  uint8_t *MemBase = (uint8_t*)MemBlock.base();
-
-  // We set up the memory chunk with 4 mem regions, like this:
-  //  [ START
-  //    [ Free      #0 ] -> Large space to allocate functions from.
-  //    [ Allocated #1 ] -> Tiny space to separate regions.
-  //    [ Free      #2 ] -> Tiny space so there is always at least 1 free block.
-  //    [ Allocated #3 ] -> Tiny space to prevent looking past end of block.
-  //  END ]
-  //
-  // The last three blocks are never deallocated or touched.
-
-  // Add MemoryRangeHeader to the end of the memory region, indicating that
-  // the space after the block of memory is allocated.  This is block #3.
-  MemoryRangeHeader *Mem3 = (MemoryRangeHeader*)(MemBase+MemBlock.size())-1;
-  Mem3->ThisAllocated = 1;
-  Mem3->PrevAllocated = 0;
-  Mem3->BlockSize     = sizeof(MemoryRangeHeader);
-
-  /// Add a tiny free region so that the free list always has one entry.
-  FreeRangeHeader *Mem2 =
-    (FreeRangeHeader *)(((char*)Mem3)-FreeRangeHeader::getMinBlockSize());
-  Mem2->ThisAllocated = 0;
-  Mem2->PrevAllocated = 1;
-  Mem2->BlockSize     = FreeRangeHeader::getMinBlockSize();
-  Mem2->SetEndOfBlockSizeMarker();
-  Mem2->Prev = Mem2;   // Mem2 *is* the free list for now.
-  Mem2->Next = Mem2;
-
-  /// Add a tiny allocated region so that Mem2 is never coalesced away.
-  MemoryRangeHeader *Mem1 = (MemoryRangeHeader*)Mem2-1;
-  Mem1->ThisAllocated = 1;
-  Mem1->PrevAllocated = 0;
-  Mem1->BlockSize     = sizeof(MemoryRangeHeader);
-
-  // Add a FreeRangeHeader to the start of the function body region, indicating
-  // that the space is free.  Mark the previous block allocated so we never look
-  // at it.
-  FreeRangeHeader *Mem0 = (FreeRangeHeader*)MemBase;
-  Mem0->ThisAllocated = 0;
-  Mem0->PrevAllocated = 1;
-  Mem0->BlockSize = (char*)Mem1-(char*)Mem0;
-  Mem0->SetEndOfBlockSizeMarker();
-  Mem0->AddToFreeList(Mem2);
-
-  // Start out with the freelist pointing to Mem0.
-  FreeMemoryList = Mem0;
-
-  GOTBase = nullptr;
-}
-
-void DefaultJITMemoryManager::AllocateGOT() {
-  assert(!GOTBase && "Cannot allocate the got multiple times");
-  GOTBase = new uint8_t[sizeof(void*) * 8192];
-  HasGOT = true;
-}
-
-DefaultJITMemoryManager::~DefaultJITMemoryManager() {
-  for (unsigned i = 0, e = CodeSlabs.size(); i != e; ++i)
-    sys::Memory::ReleaseRWX(CodeSlabs[i]);
-
-  delete[] GOTBase;
-}
-
-sys::MemoryBlock DefaultJITMemoryManager::allocateNewSlab(size_t size) {
-  // Allocate a new block close to the last one.
-  std::string ErrMsg;
-  sys::MemoryBlock *LastSlabPtr = LastSlab.base() ? &LastSlab : nullptr;
-  sys::MemoryBlock B = sys::Memory::AllocateRWX(size, LastSlabPtr, &ErrMsg);
-  if (!B.base()) {
-    report_fatal_error("Allocation failed when allocating new memory in the"
-                       " JIT\n" + Twine(ErrMsg));
-  }
-  LastSlab = B;
-  ++NumSlabs;
-  // Initialize the slab to garbage when debugging.
-  if (PoisonMemory) {
-    memset(B.base(), 0xCD, B.size());
-  }
-  return B;
-}
-
-/// CheckInvariants - For testing only.  Return "" if all internal invariants
-/// are preserved, and a helpful error message otherwise.  For free and
-/// allocated blocks, make sure that adding BlockSize gives a valid block.
-/// For free blocks, make sure they're in the free list and that their end of
-/// block size marker is correct.  This function should return an error before
-/// accessing bad memory.  This function is defined here instead of in
-/// JITMemoryManagerTest.cpp so that we don't have to expose all of the
-/// implementation details of DefaultJITMemoryManager.
-bool DefaultJITMemoryManager::CheckInvariants(std::string &ErrorStr) {
-  raw_string_ostream Err(ErrorStr);
-
-  // Construct a the set of FreeRangeHeader pointers so we can query it
-  // efficiently.
-  llvm::SmallPtrSet<MemoryRangeHeader*, 16> FreeHdrSet;
-  FreeRangeHeader* FreeHead = FreeMemoryList;
-  FreeRangeHeader* FreeRange = FreeHead;
-
-  do {
-    // Check that the free range pointer is in the blocks we've allocated.
-    bool Found = false;
-    for (std::vector<sys::MemoryBlock>::iterator I = CodeSlabs.begin(),
-         E = CodeSlabs.end(); I != E && !Found; ++I) {
-      char *Start = (char*)I->base();
-      char *End = Start + I->size();
-      Found = (Start <= (char*)FreeRange && (char*)FreeRange < End);
-    }
-    if (!Found) {
-      Err << "Corrupt free list; points to " << FreeRange;
-      return false;
-    }
-
-    if (FreeRange->Next->Prev != FreeRange) {
-      Err << "Next and Prev pointers do not match.";
-      return false;
-    }
-
-    // Otherwise, add it to the set.
-    FreeHdrSet.insert(FreeRange);
-    FreeRange = FreeRange->Next;
-  } while (FreeRange != FreeHead);
-
-  // Go over each block, and look at each MemoryRangeHeader.
-  for (std::vector<sys::MemoryBlock>::iterator I = CodeSlabs.begin(),
-       E = CodeSlabs.end(); I != E; ++I) {
-    char *Start = (char*)I->base();
-    char *End = Start + I->size();
-
-    // Check each memory range.
-    for (MemoryRangeHeader *Hdr = (MemoryRangeHeader*)Start, *LastHdr = nullptr;
-         Start <= (char*)Hdr && (char*)Hdr < End;
-         Hdr = &Hdr->getBlockAfter()) {
-      if (Hdr->ThisAllocated == 0) {
-        // Check that this range is in the free list.
-        if (!FreeHdrSet.count(Hdr)) {
-          Err << "Found free header at " << Hdr << " that is not in free list.";
-          return false;
-        }
-
-        // Now make sure the size marker at the end of the block is correct.
-        uintptr_t *Marker = ((uintptr_t*)&Hdr->getBlockAfter()) - 1;
-        if (!(Start <= (char*)Marker && (char*)Marker < End)) {
-          Err << "Block size in header points out of current MemoryBlock.";
-          return false;
-        }
-        if (Hdr->BlockSize != *Marker) {
-          Err << "End of block size marker (" << *Marker << ") "
-              << "and BlockSize (" << Hdr->BlockSize << ") don't match.";
-          return false;
-        }
-      }
-
-      if (LastHdr && LastHdr->ThisAllocated != Hdr->PrevAllocated) {
-        Err << "Hdr->PrevAllocated (" << Hdr->PrevAllocated << ") != "
-            << "LastHdr->ThisAllocated (" << LastHdr->ThisAllocated << ")";
-        return false;
-      } else if (!LastHdr && !Hdr->PrevAllocated) {
-        Err << "The first header should have PrevAllocated true.";
-        return false;
-      }
-
-      // Remember the last header.
-      LastHdr = Hdr;
-    }
-  }
-
-  // All invariants are preserved.
-  return true;
-}
-
-//===----------------------------------------------------------------------===//
-// getPointerToNamedFunction() implementation.
-//===----------------------------------------------------------------------===//
-
-// AtExitHandlers - List of functions to call when the program exits,
-// registered with the atexit() library function.
-static std::vector<void (*)()> AtExitHandlers;
-
-/// runAtExitHandlers - Run any functions registered by the program's
-/// calls to atexit(3), which we intercept and store in
-/// AtExitHandlers.
-///
-static void runAtExitHandlers() {
-  while (!AtExitHandlers.empty()) {
-    void (*Fn)() = AtExitHandlers.back();
-    AtExitHandlers.pop_back();
-    Fn();
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Function stubs that are invoked instead of certain library calls
-//
-// Force the following functions to be linked in to anything that uses the
-// JIT. This is a hack designed to work around the all-too-clever Glibc
-// strategy of making these functions work differently when inlined vs. when
-// not inlined, and hiding their real definitions in a separate archive file
-// that the dynamic linker can't see. For more info, search for
-// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
-#if defined(__linux__) && defined(__GLIBC__)
-/* stat functions are redirecting to __xstat with a version number.  On x86-64
- * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat'
- * available as an exported symbol, so we have to add it explicitly.
- */
-namespace {
-class StatSymbols {
-public:
-  StatSymbols() {
-    sys::DynamicLibrary::AddSymbol("stat", (void*)(intptr_t)stat);
-    sys::DynamicLibrary::AddSymbol("fstat", (void*)(intptr_t)fstat);
-    sys::DynamicLibrary::AddSymbol("lstat", (void*)(intptr_t)lstat);
-    sys::DynamicLibrary::AddSymbol("stat64", (void*)(intptr_t)stat64);
-    sys::DynamicLibrary::AddSymbol("\x1stat64", (void*)(intptr_t)stat64);
-    sys::DynamicLibrary::AddSymbol("\x1open64", (void*)(intptr_t)open64);
-    sys::DynamicLibrary::AddSymbol("\x1lseek64", (void*)(intptr_t)lseek64);
-    sys::DynamicLibrary::AddSymbol("fstat64", (void*)(intptr_t)fstat64);
-    sys::DynamicLibrary::AddSymbol("lstat64", (void*)(intptr_t)lstat64);
-    sys::DynamicLibrary::AddSymbol("atexit", (void*)(intptr_t)atexit);
-    sys::DynamicLibrary::AddSymbol("mknod", (void*)(intptr_t)mknod);
-  }
-};
-}
-static StatSymbols initStatSymbols;
-#endif // __linux__
-
-// jit_exit - Used to intercept the "exit" library call.
-static void jit_exit(int Status) {
-  runAtExitHandlers();   // Run atexit handlers...
-  exit(Status);
-}
-
-// jit_atexit - Used to intercept the "atexit" library call.
-static int jit_atexit(void (*Fn)()) {
-  AtExitHandlers.push_back(Fn);    // Take note of atexit handler...
-  return 0;  // Always successful
-}
-
-static int jit_noop() {
-  return 0;
-}
-
-//===----------------------------------------------------------------------===//
-//
-/// getPointerToNamedFunction - This method returns the address of the specified
-/// function by using the dynamic loader interface.  As such it is only useful
-/// for resolving library symbols, not code generated symbols.
-///
-void *DefaultJITMemoryManager::getPointerToNamedFunction(const std::string &Name,
-                                                         bool AbortOnFailure) {
-  // Check to see if this is one of the functions we want to intercept.  Note,
-  // we cast to intptr_t here to silence a -pedantic warning that complains
-  // about casting a function pointer to a normal pointer.
-  if (Name == "exit") return (void*)(intptr_t)&jit_exit;
-  if (Name == "atexit") return (void*)(intptr_t)&jit_atexit;
-
-  // We should not invoke parent's ctors/dtors from generated main()!
-  // On Mingw and Cygwin, the symbol __main is resolved to
-  // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
-  // (and register wrong callee's dtors with atexit(3)).
-  // We expect ExecutionEngine::runStaticConstructorsDestructors()
-  // is called before ExecutionEngine::runFunctionAsMain() is called.
-  if (Name == "__main") return (void*)(intptr_t)&jit_noop;
-
-  const char *NameStr = Name.c_str();
-  // If this is an asm specifier, skip the sentinal.
-  if (NameStr[0] == 1) ++NameStr;
-
-  // If it's an external function, look it up in the process image...
-  void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
-  if (Ptr) return Ptr;
-
-  // If it wasn't found and if it starts with an underscore ('_') character,
-  // try again without the underscore.
-  if (NameStr[0] == '_') {
-    Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
-    if (Ptr) return Ptr;
-  }
-
-  // Darwin/PPC adds $LDBLStub suffixes to various symbols like printf.  These
-  // are references to hidden visibility symbols that dlsym cannot resolve.
-  // If we have one of these, strip off $LDBLStub and try again.
-#if defined(__APPLE__) && defined(__ppc__)
-  if (Name.size() > 9 && Name[Name.size()-9] == '$' &&
-      memcmp(&Name[Name.size()-8], "LDBLStub", 8) == 0) {
-    // First try turning $LDBLStub into $LDBL128. If that fails, strip it off.
-    // This mirrors logic in libSystemStubs.a.
-    std::string Prefix = std::string(Name.begin(), Name.end()-9);
-    if (void *Ptr = getPointerToNamedFunction(Prefix+"$LDBL128", false))
-      return Ptr;
-    if (void *Ptr = getPointerToNamedFunction(Prefix, false))
-      return Ptr;
-  }
-#endif
-
-  if (AbortOnFailure) {
-    report_fatal_error("Program used external function '"+Name+
-                      "' which could not be resolved!");
-  }
-  return nullptr;
-}
-
-
-
-JITMemoryManager *JITMemoryManager::CreateDefaultMemManager() {
-  return new DefaultJITMemoryManager();
-}
-
-const size_t DefaultJITMemoryManager::DefaultCodeSlabSize;
-const size_t DefaultJITMemoryManager::DefaultSlabSize;
-const size_t DefaultJITMemoryManager::DefaultSizeThreshold;

diff --git a/lib/ExecutionEngine/JIT/LLVMBuild.txt b/lib/ExecutionEngine/JIT/LLVMBuild.txt
deleted file mode 100644
index dd22f1b..0000000
--- a/lib/ExecutionEngine/JIT/LLVMBuild.txt
+++ /dev/null

@@ -1,22 +0,0 @@
-;===- ./lib/ExecutionEngine/JIT/LLVMBuild.txt ------------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = JIT
-parent = ExecutionEngine
-required_libraries = CodeGen Core ExecutionEngine Support

diff --git a/lib/ExecutionEngine/JIT/Makefile b/lib/ExecutionEngine/JIT/Makefile
deleted file mode 100644
index aafa3d9..0000000
--- a/lib/ExecutionEngine/JIT/Makefile
+++ /dev/null

@@ -1,38 +0,0 @@
-##===- lib/ExecutionEngine/JIT/Makefile --------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMJIT
-
-# Get the $(ARCH) setting
-include $(LEVEL)/Makefile.config
-
-# Enable the X86 JIT if compiling on X86
-ifeq ($(ARCH), x86)
-  ENABLE_X86_JIT = 1
-endif
-
-# This flag can also be used on the command line to force inclusion
-# of the X86 JIT on non-X86 hosts
-ifdef ENABLE_X86_JIT
-  CPPFLAGS += -DENABLE_X86_JIT
-endif
-
-# Enable the Sparc JIT if compiling on Sparc
-ifeq ($(ARCH), Sparc)
-  ENABLE_SPARC_JIT = 1
-endif
-
-# This flag can also be used on the command line to force inclusion
-# of the Sparc JIT on non-Sparc hosts
-ifdef ENABLE_SPARC_JIT
-  CPPFLAGS += -DENABLE_SPARC_JIT
-endif
-
-include $(LEVEL)/Makefile.common

diff --git a/lib/ExecutionEngine/JITEventListener.cpp b/lib/ExecutionEngine/JITEventListener.cpp
new file mode 100644
index 0000000..2a6a007
--- /dev/null
+++ b/lib/ExecutionEngine/JITEventListener.cpp

@@ -0,0 +1,15 @@
+//===-- JITEventListener.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/JITEventListener.h"
+
+using namespace llvm;
+
+// Out-of-line definition of the virtual destructor as this is the key function.
+JITEventListener::~JITEventListener() {}

diff --git a/lib/ExecutionEngine/LLVMBuild.txt b/lib/ExecutionEngine/LLVMBuild.txt
index 6dc75af..ecae078 100644
--- a/lib/ExecutionEngine/LLVMBuild.txt
+++ b/lib/ExecutionEngine/LLVMBuild.txt

@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = Interpreter JIT MCJIT RuntimeDyld IntelJITEvents OProfileJIT
+subdirectories = Interpreter MCJIT RuntimeDyld IntelJITEvents OProfileJIT
 
 [component_0]
 type = Library

diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index e9ba96a..da5f037 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp

@@ -10,7 +10,6 @@
 #include "MCJIT.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
 #include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/ExecutionEngine/ObjectBuffer.h"
 #include "llvm/ExecutionEngine/ObjectImage.h"
@@ -28,6 +27,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/MutexGuard.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -42,31 +42,23 @@
 extern "C" void LLVMLinkInMCJIT() {
 }
 
-ExecutionEngine *MCJIT::createJIT(Module *M,
+ExecutionEngine *MCJIT::createJIT(std::unique_ptr<Module> M,
                                   std::string *ErrorStr,
                                   RTDyldMemoryManager *MemMgr,
-                                  bool GVsWithCode,
-                                  TargetMachine *TM) {
+                                  std::unique_ptr<TargetMachine> TM) {
   // Try to register the program as a source of symbols to resolve against.
   //
   // FIXME: Don't do this here.
   sys::DynamicLibrary::LoadLibraryPermanently(nullptr, nullptr);
 
-  return new MCJIT(M, TM, MemMgr ? MemMgr : new SectionMemoryManager(),
-                   GVsWithCode);
+  return new MCJIT(std::move(M), std::move(TM),
+                   MemMgr ? MemMgr : new SectionMemoryManager());
 }
 
-MCJIT::MCJIT(Module *m, TargetMachine *tm, RTDyldMemoryManager *MM,
-             bool AllocateGVsWithCode)
-  : ExecutionEngine(m), TM(tm), Ctx(nullptr), MemMgr(this, MM), Dyld(&MemMgr),
-    ObjCache(nullptr) {
-
-  OwnedModules.addModule(m);
-  setDataLayout(TM->getDataLayout());
-}
-
-MCJIT::~MCJIT() {
-  MutexGuard locked(lock);
+MCJIT::MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm,
+             RTDyldMemoryManager *MM)
+    : ExecutionEngine(std::move(M)), TM(std::move(tm)), Ctx(nullptr),
+      MemMgr(this, MM), Dyld(&MemMgr), ObjCache(nullptr) {
   // FIXME: We are managing our modules, so we do not want the base class
   // ExecutionEngine to manage them as well. To avoid double destruction
   // of the first (and only) module added in ExecutionEngine constructor
@@ -77,33 +69,28 @@
   // If so, additional functions: addModule, removeModule, FindFunctionNamed,
   // runStaticConstructorsDestructors could be moved back to EE as well.
   //
+  std::unique_ptr<Module> First = std::move(Modules[0]);
   Modules.clear();
-  Dyld.deregisterEHFrames();
 
-  LoadedObjectList::iterator it, end;
-  for (it = LoadedObjects.begin(), end = LoadedObjects.end(); it != end; ++it) {
-    ObjectImage *Obj = *it;
-    if (Obj) {
-      NotifyFreeingObject(*Obj);
-      delete Obj;
-    }
-  }
-  LoadedObjects.clear();
-
-
-  SmallVector<object::Archive *, 2>::iterator ArIt, ArEnd;
-  for (ArIt = Archives.begin(), ArEnd = Archives.end(); ArIt != ArEnd; ++ArIt) {
-    object::Archive *A = *ArIt;
-    delete A;
-  }
-  Archives.clear();
-
-  delete TM;
+  OwnedModules.addModule(std::move(First));
+  setDataLayout(TM->getSubtargetImpl()->getDataLayout());
 }
 
-void MCJIT::addModule(Module *M) {
+MCJIT::~MCJIT() {
   MutexGuard locked(lock);
-  OwnedModules.addModule(M);
+
+  Dyld.deregisterEHFrames();
+
+  for (auto &Obj : LoadedObjects)
+    if (Obj)
+      NotifyFreeingObject(*Obj);
+
+  Archives.clear();
+}
+
+void MCJIT::addModule(std::unique_ptr<Module> M) {
+  MutexGuard locked(lock);
+  OwnedModules.addModule(std::move(M));
 }
 
 bool MCJIT::removeModule(Module *M) {
@@ -111,29 +98,34 @@
   return OwnedModules.removeModule(M);
 }
 
-
-
 void MCJIT::addObjectFile(std::unique_ptr<object::ObjectFile> Obj) {
-  ObjectImage *LoadedObject = Dyld.loadObject(std::move(Obj));
+  std::unique_ptr<ObjectImage> LoadedObject = Dyld.loadObject(std::move(Obj));
   if (!LoadedObject || Dyld.hasError())
     report_fatal_error(Dyld.getErrorString());
 
-  LoadedObjects.push_back(LoadedObject);
-
   NotifyObjectEmitted(*LoadedObject);
+
+  LoadedObjects.push_back(std::move(LoadedObject));
 }
 
-void MCJIT::addArchive(object::Archive *A) {
-  Archives.push_back(A);
+void MCJIT::addObjectFile(object::OwningBinary<object::ObjectFile> Obj) {
+  std::unique_ptr<object::ObjectFile> ObjFile;
+  std::unique_ptr<MemoryBuffer> MemBuf;
+  std::tie(ObjFile, MemBuf) = Obj.takeBinary();
+  addObjectFile(std::move(ObjFile));
+  Buffers.push_back(std::move(MemBuf));
 }
 
+void MCJIT::addArchive(object::OwningBinary<object::Archive> A) {
+  Archives.push_back(std::move(A));
+}
 
 void MCJIT::setObjectCache(ObjectCache* NewCache) {
   MutexGuard locked(lock);
   ObjCache = NewCache;
 }
 
-ObjectBufferStream* MCJIT::emitObject(Module *M) {
+std::unique_ptr<ObjectBufferStream> MCJIT::emitObject(Module *M) {
   MutexGuard locked(lock);
 
   // This must be a module which has already been added but not loaded to this
@@ -142,8 +134,8 @@
 
   PassManager PM;
 
-  M->setDataLayout(TM->getDataLayout());
-  PM.add(new DataLayoutPass(M));
+  M->setDataLayout(TM->getSubtargetImpl()->getDataLayout());
+  PM.add(new DataLayoutPass());
 
   // The RuntimeDyld will take ownership of this shortly
   std::unique_ptr<ObjectBufferStream> CompiledObject(new ObjectBufferStream());
@@ -165,11 +157,11 @@
   if (ObjCache) {
     // MemoryBuffer is a thin wrapper around the actual memory, so it's OK
     // to create a temporary object here and delete it after the call.
-    std::unique_ptr<MemoryBuffer> MB(CompiledObject->getMemBuffer());
-    ObjCache->notifyObjectCompiled(M, MB.get());
+    MemoryBufferRef MB = CompiledObject->getMemBuffer();
+    ObjCache->notifyObjectCompiled(M, MB);
   }
 
-  return CompiledObject.release();
+  return CompiledObject;
 }
 
 void MCJIT::generateCodeForModule(Module *M) {
@@ -187,21 +179,22 @@
   std::unique_ptr<ObjectBuffer> ObjectToLoad;
   // Try to load the pre-compiled object from cache if possible
   if (ObjCache) {
-    std::unique_ptr<MemoryBuffer> PreCompiledObject(ObjCache->getObject(M));
-    if (PreCompiledObject.get())
-      ObjectToLoad.reset(new ObjectBuffer(PreCompiledObject.release()));
+    if (std::unique_ptr<MemoryBuffer> PreCompiledObject =
+            ObjCache->getObject(M))
+      ObjectToLoad =
+          llvm::make_unique<ObjectBuffer>(std::move(PreCompiledObject));
   }
 
   // If the cache did not contain a suitable object, compile the object
   if (!ObjectToLoad) {
-    ObjectToLoad.reset(emitObject(M));
-    assert(ObjectToLoad.get() && "Compilation did not produce an object.");
+    ObjectToLoad = emitObject(M);
+    assert(ObjectToLoad && "Compilation did not produce an object.");
   }
 
   // Load the object into the dynamic linker.
   // MCJIT now owns the ObjectImage pointer (via its LoadedObjects list).
-  ObjectImage *LoadedObject = Dyld.loadObject(ObjectToLoad.release());
-  LoadedObjects.push_back(LoadedObject);
+  std::unique_ptr<ObjectImage> LoadedObject =
+      Dyld.loadObject(std::move(ObjectToLoad));
   if (!LoadedObject)
     report_fatal_error(Dyld.getErrorString());
 
@@ -210,6 +203,8 @@
 
   NotifyObjectEmitted(*LoadedObject);
 
+  LoadedObjects.push_back(std::move(LoadedObject));
+
   OwnedModules.markModuleAsLoaded(M);
 }
 
@@ -232,12 +227,14 @@
 void MCJIT::finalizeObject() {
   MutexGuard locked(lock);
 
-  for (ModulePtrSet::iterator I = OwnedModules.begin_added(),
-                              E = OwnedModules.end_added();
-       I != E; ++I) {
-    Module *M = *I;
+  // generateCodeForModule() is going to move modules out of the 'added' list,
+  // so we need to copy that list out before iterating over it:
+  SmallVector<Module*, 16> ModsToAdd;
+  for (auto M : OwnedModules.added())
+    ModsToAdd.push_back(M);
+
+  for (auto M : ModsToAdd)
     generateCodeForModule(M);
-  }
 
   finalizeLoadedModules();
 }
@@ -255,12 +252,8 @@
   finalizeLoadedModules();
 }
 
-void *MCJIT::getPointerToBasicBlock(BasicBlock *BB) {
-  report_fatal_error("not yet implemented");
-}
-
 uint64_t MCJIT::getExistingSymbolAddress(const std::string &Name) {
-  Mangler Mang(TM->getDataLayout());
+  Mangler Mang(TM->getSubtargetImpl()->getDataLayout());
   SmallString<128> FullName;
   Mang.getNameWithPrefix(FullName, Name);
   return Dyld.getSymbolLoadAddress(FullName);
@@ -299,9 +292,8 @@
   if (Addr)
     return Addr;
 
-  SmallVector<object::Archive*, 2>::iterator I, E;
-  for (I = Archives.begin(), E = Archives.end(); I != E; ++I) {
-    object::Archive *A = *I;
+  for (object::OwningBinary<object::Archive> &OB : Archives) {
+    object::Archive *A = OB.getBinary();
     // Look for our symbols in each Archive
     object::Archive::child_iterator ChildIt = A->findSym(Name);
     if (ChildIt != A->child_end()) {
@@ -310,7 +302,7 @@
           ChildIt->getAsBinary();
       if (ChildBinOrErr.getError())
         continue;
-      std::unique_ptr<object::Binary> ChildBin = std::move(ChildBinOrErr.get());
+      std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get();
       if (ChildBin->isObject()) {
         std::unique_ptr<object::ObjectFile> OF(
             static_cast<object::ObjectFile *>(ChildBin.release()));
@@ -326,13 +318,19 @@
 
   // If it hasn't already been generated, see if it's in one of our modules.
   Module *M = findModuleForSymbol(Name, CheckFunctionsOnly);
-  if (!M)
-    return 0;
+  if (M) {
+    generateCodeForModule(M);
 
-  generateCodeForModule(M);
+    // Check the RuntimeDyld table again, it should be there now.
+    return getExistingSymbolAddress(Name);
+  }
 
-  // Check the RuntimeDyld table again, it should be there now.
-  return getExistingSymbolAddress(Name);
+  // If a LazyFunctionCreator is installed, use it to get/create the function.
+  // FIXME: Should we instead have a LazySymbolCreator callback?
+  if (LazyFunctionCreator)
+    Addr = (uint64_t)LazyFunctionCreator(Name);
+
+  return Addr;
 }
 
 uint64_t MCJIT::getGlobalValueAddress(const std::string &Name) {
@@ -355,10 +353,14 @@
 void *MCJIT::getPointerToFunction(Function *F) {
   MutexGuard locked(lock);
 
+  Mangler Mang(TM->getSubtargetImpl()->getDataLayout());
+  SmallString<128> Name;
+  TM->getNameWithPrefix(Name, F, Mang);
+
   if (F->isDeclaration() || F->hasAvailableExternallyLinkage()) {
     bool AbortOnFailure = !F->hasExternalWeakLinkage();
-    void *Addr = getPointerToNamedFunction(F->getName(), AbortOnFailure);
-    addGlobalMapping(F, Addr);
+    void *Addr = getPointerToNamedFunction(Name, AbortOnFailure);
+    updateGlobalMapping(F, Addr);
     return Addr;
   }
 
@@ -368,32 +370,25 @@
   // Make sure the relevant module has been compiled and loaded.
   if (HasBeenAddedButNotLoaded)
     generateCodeForModule(M);
-  else if (!OwnedModules.hasModuleBeenLoaded(M))
+  else if (!OwnedModules.hasModuleBeenLoaded(M)) {
     // If this function doesn't belong to one of our modules, we're done.
+    // FIXME: Asking for the pointer to a function that hasn't been registered,
+    //        and isn't a declaration (which is handled above) should probably
+    //        be an assertion.
     return nullptr;
+  }
 
   // FIXME: Should the Dyld be retaining module information? Probably not.
   //
   // This is the accessor for the target address, so make sure to check the
   // load address of the symbol, not the local address.
-  Mangler Mang(TM->getDataLayout());
-  SmallString<128> Name;
-  TM->getNameWithPrefix(Name, F, Mang);
   return (void*)Dyld.getSymbolLoadAddress(Name);
 }
 
-void *MCJIT::recompileAndRelinkFunction(Function *F) {
-  report_fatal_error("not yet implemented");
-}
-
-void MCJIT::freeMachineCodeForFunction(Function *F) {
-  report_fatal_error("not yet implemented");
-}
-
 void MCJIT::runStaticConstructorsDestructorsInModulePtrSet(
     bool isDtors, ModulePtrSet::iterator I, ModulePtrSet::iterator E) {
   for (; I != E; ++I) {
-    ExecutionEngine::runStaticConstructorsDestructors(*I, isDtors);
+    ExecutionEngine::runStaticConstructorsDestructors(**I, isDtors);
   }
 }
 
@@ -529,8 +524,7 @@
   llvm_unreachable("Full-featured argument passing not supported yet!");
 }
 
-void *MCJIT::getPointerToNamedFunction(const std::string &Name,
-                                       bool AbortOnFailure) {
+void *MCJIT::getPointerToNamedFunction(StringRef Name, bool AbortOnFailure) {
   if (!isSymbolSearchingDisabled()) {
     void *ptr = MemMgr.getPointerToNamedFunction(Name, false);
     if (ptr)
@@ -559,8 +553,7 @@
   if (!L)
     return;
   MutexGuard locked(lock);
-  SmallVector<JITEventListener*, 2>::reverse_iterator I=
-      std::find(EventListeners.rbegin(), EventListeners.rend(), L);
+  auto I = std::find(EventListeners.rbegin(), EventListeners.rend(), L);
   if (I != EventListeners.rend()) {
     std::swap(*I, EventListeners.back());
     EventListeners.pop_back();
@@ -575,9 +568,8 @@
 }
 void MCJIT::NotifyFreeingObject(const ObjectImage& Obj) {
   MutexGuard locked(lock);
-  for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
-    EventListeners[I]->NotifyFreeingObject(Obj);
-  }
+  for (JITEventListener *L : EventListeners)
+    L->NotifyFreeingObject(Obj);
 }
 
 uint64_t LinkingMemoryManager::getSymbolAddress(const std::string &Name) {
@@ -588,5 +580,7 @@
     Result = ParentEngine->getSymbolAddress(Name.substr(1), false);
   if (Result)
     return Result;
+  if (ParentEngine->isSymbolSearchingDisabled())
+    return 0;
   return ClientMM->getSymbolAddress(Name);
 }

diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index 100e9a2..bc943b9 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_EXECUTIONENGINE_MCJIT_H
-#define LLVM_LIB_EXECUTIONENGINE_MCJIT_H
+#ifndef LLVM_LIB_EXECUTIONENGINE_MCJIT_MCJIT_H
+#define LLVM_LIB_EXECUTIONENGINE_MCJIT_MCJIT_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -101,8 +101,8 @@
 // called.
 
 class MCJIT : public ExecutionEngine {
-  MCJIT(Module *M, TargetMachine *tm, RTDyldMemoryManager *MemMgr,
-        bool AllocateGVsWithCode);
+  MCJIT(std::unique_ptr<Module> M, std::unique_ptr<TargetMachine> tm,
+        RTDyldMemoryManager *MemMgr);
 
   typedef llvm::SmallPtrSet<Module *, 4> ModulePtrSet;
 
@@ -118,6 +118,9 @@
 
     ModulePtrSet::iterator begin_added() { return AddedModules.begin(); }
     ModulePtrSet::iterator end_added() { return AddedModules.end(); }
+    iterator_range<ModulePtrSet::iterator> added() {
+      return iterator_range<ModulePtrSet::iterator>(begin_added(), end_added());
+    }
 
     ModulePtrSet::iterator begin_loaded() { return LoadedModules.begin(); }
     ModulePtrSet::iterator end_loaded() { return LoadedModules.end(); }
@@ -125,8 +128,8 @@
     ModulePtrSet::iterator begin_finalized() { return FinalizedModules.begin(); }
     ModulePtrSet::iterator end_finalized() { return FinalizedModules.end(); }
 
-    void addModule(Module *M) {
-      AddedModules.insert(M);
+    void addModule(std::unique_ptr<Module> M) {
+      AddedModules.insert(M.release());
     }
 
     bool removeModule(Module *M) {
@@ -208,18 +211,18 @@
     }
   };
 
-  TargetMachine *TM;
+  std::unique_ptr<TargetMachine> TM;
   MCContext *Ctx;
   LinkingMemoryManager MemMgr;
   RuntimeDyld Dyld;
-  SmallVector<JITEventListener*, 2> EventListeners;
+  std::vector<JITEventListener*> EventListeners;
 
   OwningModuleContainer OwnedModules;
 
-  SmallVector<object::Archive*, 2> Archives;
+  SmallVector<object::OwningBinary<object::Archive>, 2> Archives;
+  SmallVector<std::unique_ptr<MemoryBuffer>, 2> Buffers;
 
-  typedef SmallVector<ObjectImage *, 2> LoadedObjectList;
-  LoadedObjectList  LoadedObjects;
+  SmallVector<std::unique_ptr<ObjectImage>, 2> LoadedObjects;
 
   // An optional ObjectCache to be notified of compiled objects and used to
   // perform lookup of pre-compiled code to avoid re-compilation.
@@ -238,9 +241,10 @@
 
   /// @name ExecutionEngine interface implementation
   /// @{
-  void addModule(Module *M) override;
+  void addModule(std::unique_ptr<Module> M) override;
   void addObjectFile(std::unique_ptr<object::ObjectFile> O) override;
-  void addArchive(object::Archive *O) override;
+  void addObjectFile(object::OwningBinary<object::ObjectFile> O) override;
+  void addArchive(object::OwningBinary<object::Archive> O) override;
   bool removeModule(Module *M) override;
 
   /// FindFunctionNamed - Search all of the active modules to find the one that
@@ -276,14 +280,8 @@
   /// \param isDtors - Run the destructors instead of constructors.
   void runStaticConstructorsDestructors(bool isDtors) override;
 
-  void *getPointerToBasicBlock(BasicBlock *BB) override;
-
   void *getPointerToFunction(Function *F) override;
 
-  void *recompileAndRelinkFunction(Function *F) override;
-
-  void freeMachineCodeForFunction(Function *F) override;
-
   GenericValue runFunction(Function *F,
                            const std::vector<GenericValue> &ArgValues) override;
 
@@ -295,7 +293,7 @@
   /// found, this function silently returns a null pointer. Otherwise,
   /// it prints a message to stderr and aborts.
   ///
-  void *getPointerToNamedFunction(const std::string &Name,
+  void *getPointerToNamedFunction(StringRef Name,
                                   bool AbortOnFailure = true) override;
 
   /// mapSectionAddress - map a section to its target address space value.
@@ -315,7 +313,7 @@
   uint64_t getGlobalValueAddress(const std::string &Name) override;
   uint64_t getFunctionAddress(const std::string &Name) override;
 
-  TargetMachine *getTargetMachine() override { return TM; }
+  TargetMachine *getTargetMachine() override { return TM.get(); }
 
   /// @}
   /// @name (Private) Registration Interfaces
@@ -325,11 +323,10 @@
     MCJITCtor = createJIT;
   }
 
-  static ExecutionEngine *createJIT(Module *M,
+  static ExecutionEngine *createJIT(std::unique_ptr<Module> M,
                                     std::string *ErrorStr,
                                     RTDyldMemoryManager *MemMgr,
-                                    bool GVsWithCode,
-                                    TargetMachine *TM);
+                                    std::unique_ptr<TargetMachine> TM);
 
   // @}
 
@@ -344,7 +341,7 @@
   /// this function call is expected to be the contained module.  The module
   /// is passed as a parameter here to prepare for multiple module support in
   /// the future.
-  ObjectBufferStream* emitObject(Module *M);
+  std::unique_ptr<ObjectBufferStream> emitObject(Module *M);
 
   void NotifyObjectEmitted(const ObjectImage& Obj);
   void NotifyFreeingObject(const ObjectImage& Obj);

diff --git a/lib/ExecutionEngine/Makefile b/lib/ExecutionEngine/Makefile
index c26e0ad..cf71432 100644
--- a/lib/ExecutionEngine/Makefile
+++ b/lib/ExecutionEngine/Makefile

@@ -11,7 +11,7 @@
 
 include $(LEVEL)/Makefile.config
 
-PARALLEL_DIRS = Interpreter JIT MCJIT RuntimeDyld
+PARALLEL_DIRS = Interpreter MCJIT RuntimeDyld
 
 ifeq ($(USE_INTEL_JITEVENTS), 1)
 PARALLEL_DIRS += IntelJITEvents

diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index fd37a13..5a8ccb6 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp

@@ -49,12 +49,6 @@
 
   ~OProfileJITEventListener();
 
-  virtual void NotifyFunctionEmitted(const Function &F,
-                                void *FnStart, size_t FnSize,
-                                const JITEvent_EmittedFunctionDetails &Details);
-
-  virtual void NotifyFreeingMachineCode(void *OldPtr);
-
   virtual void NotifyObjectEmitted(const ObjectImage &Obj);
 
   virtual void NotifyFreeingObject(const ObjectImage &Obj);
@@ -81,90 +75,6 @@
   }
 }
 
-static debug_line_info LineStartToOProfileFormat(
-    const MachineFunction &MF, FilenameCache &Filenames,
-    uintptr_t Address, DebugLoc Loc) {
-  debug_line_info Result;
-  Result.vma = Address;
-  Result.lineno = Loc.getLine();
-  Result.filename = Filenames.getFilename(
-    Loc.getScope(MF.getFunction()->getContext()));
-  DEBUG(dbgs() << "Mapping " << reinterpret_cast<void*>(Result.vma) << " to "
-               << Result.filename << ":" << Result.lineno << "\n");
-  return Result;
-}
-
-// Adds the just-emitted function to the symbol table.
-void OProfileJITEventListener::NotifyFunctionEmitted(
-    const Function &F, void *FnStart, size_t FnSize,
-    const JITEvent_EmittedFunctionDetails &Details) {
-  assert(F.hasName() && FnStart != 0 && "Bad symbol to add");
-  if (Wrapper.op_write_native_code(F.getName().data(),
-                           reinterpret_cast<uint64_t>(FnStart),
-                           FnStart, FnSize) == -1) {
-    DEBUG(dbgs() << "Failed to tell OProfile about native function "
-          << F.getName() << " at ["
-          << FnStart << "-" << ((char*)FnStart + FnSize) << "]\n");
-    return;
-  }
-
-  if (!Details.LineStarts.empty()) {
-    // Now we convert the line number information from the address/DebugLoc
-    // format in Details to the address/filename/lineno format that OProfile
-    // expects.  Note that OProfile 0.9.4 has a bug that causes it to ignore
-    // line numbers for addresses above 4G.
-    FilenameCache Filenames;
-    std::vector<debug_line_info> LineInfo;
-    LineInfo.reserve(1 + Details.LineStarts.size());
-
-    DebugLoc FirstLoc = Details.LineStarts[0].Loc;
-    assert(!FirstLoc.isUnknown()
-           && "LineStarts should not contain unknown DebugLocs");
-    MDNode *FirstLocScope = FirstLoc.getScope(F.getContext());
-    DISubprogram FunctionDI = getDISubprogram(FirstLocScope);
-    if (FunctionDI.Verify()) {
-      // If we have debug info for the function itself, use that as the line
-      // number of the first several instructions.  Otherwise, after filling
-      // LineInfo, we'll adjust the address of the first line number to point at
-      // the start of the function.
-      debug_line_info line_info;
-      line_info.vma = reinterpret_cast<uintptr_t>(FnStart);
-      line_info.lineno = FunctionDI.getLineNumber();
-      line_info.filename = Filenames.getFilename(FirstLocScope);
-      LineInfo.push_back(line_info);
-    }
-
-    for (std::vector<EmittedFunctionDetails::LineStart>::const_iterator
-           I = Details.LineStarts.begin(), E = Details.LineStarts.end();
-         I != E; ++I) {
-      LineInfo.push_back(LineStartToOProfileFormat(
-                           *Details.MF, Filenames, I->Address, I->Loc));
-    }
-
-    // In case the function didn't have line info of its own, adjust the first
-    // line info's address to include the start of the function.
-    LineInfo[0].vma = reinterpret_cast<uintptr_t>(FnStart);
-
-    if (Wrapper.op_write_debug_line_info(FnStart, LineInfo.size(),
-                                      &*LineInfo.begin()) == -1) {
-      DEBUG(dbgs()
-            << "Failed to tell OProfile about line numbers for native function "
-            << F.getName() << " at ["
-            << FnStart << "-" << ((char*)FnStart + FnSize) << "]\n");
-    }
-  }
-}
-
-// Removes the being-deleted function from the symbol table.
-void OProfileJITEventListener::NotifyFreeingMachineCode(void *FnStart) {
-  assert(FnStart && "Invalid function pointer");
-  if (Wrapper.op_unload_native_code(reinterpret_cast<uint64_t>(FnStart)) == -1) {
-    DEBUG(dbgs()
-          << "Failed to tell OProfile about unload of native function at "
-          << FnStart << "\n");
-  }
-}
-
 void OProfileJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
   if (!Wrapper.isAgentAvailable()) {
     return;

diff --git a/lib/ExecutionEngine/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RTDyldMemoryManager.cpp
index 1646937..51b2d0f 100644
--- a/lib/ExecutionEngine/RTDyldMemoryManager.cpp
+++ b/lib/ExecutionEngine/RTDyldMemoryManager.cpp

@@ -210,7 +210,8 @@
 #undef ARM_MATH_DECL
 #endif
 
-uint64_t RTDyldMemoryManager::getSymbolAddress(const std::string &Name) {
+uint64_t
+RTDyldMemoryManager::getSymbolAddressInProcess(const std::string &Name) {
   // This implementation assumes that the host program is the target.
   // Clients generating code for a remote target should implement their own
   // memory manager.
@@ -253,19 +254,19 @@
   // is called before ExecutionEngine::runFunctionAsMain() is called.
   if (Name == "__main") return (uint64_t)&jit_noop;
 
-  const char *NameStr = Name.c_str();
-  void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
-  if (Ptr)
-    return (uint64_t)Ptr;
+  // Try to demangle Name before looking it up in the process, otherwise symbol
+  // '_<Name>' (if present) will shadow '<Name>', and there will be no way to
+  // refer to the latter.
 
-  // If it wasn't found and if it starts with an underscore ('_') character,
-  // try again without the underscore.
-  if (NameStr[0] == '_') {
-    Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
-    if (Ptr)
+  const char *NameStr = Name.c_str();
+
+  if (NameStr[0] == '_')
+    if (void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr + 1))
       return (uint64_t)Ptr;
-  }
-  return 0;
+
+  // If Name did not require demangling, or we failed to find the demangled
+  // name, try again without demangling.
+  return (uint64_t)sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
 }
 
 void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,

diff --git a/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp b/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp
index 8546571..dfa3a20 100644
--- a/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp

@@ -101,7 +101,7 @@
 
 /// Lock used to serialize all jit registration events, since they
 /// modify global variables.
-llvm::sys::Mutex JITDebugLock;
+ManagedStatic<sys::Mutex> JITDebugLock;
 
 /// Do the registration.
 void NotifyDebugger(jit_code_entry* JITCodeEntry) {
@@ -121,7 +121,7 @@
 
 GDBJITRegistrar::~GDBJITRegistrar() {
   // Free all registered object files.
-  llvm::MutexGuard locked(JITDebugLock);
+  llvm::MutexGuard locked(*JITDebugLock);
   for (RegisteredObjectBufferMap::iterator I = ObjectBufferMap.begin(), E = ObjectBufferMap.end();
        I != E; ++I) {
     // Call the private method that doesn't update the map so our iterator
@@ -137,7 +137,7 @@
   size_t      Size = Object.getBufferSize();
 
   assert(Buffer && "Attempt to register a null object with a debugger.");
-  llvm::MutexGuard locked(JITDebugLock);
+  llvm::MutexGuard locked(*JITDebugLock);
   assert(ObjectBufferMap.find(Buffer) == ObjectBufferMap.end() &&
          "Second attempt to perform debug registration.");
   jit_code_entry* JITCodeEntry = new jit_code_entry();
@@ -156,7 +156,7 @@
 
 bool GDBJITRegistrar::deregisterObject(const ObjectBuffer& Object) {
   const char *Buffer = Object.getBufferStart();
-  llvm::MutexGuard locked(JITDebugLock);
+  llvm::MutexGuard locked(*JITDebugLock);
   RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(Buffer);
 
   if (I != ObjectBufferMap.end()) {

diff --git a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
index 6a514ea..636011f 100644
--- a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
+++ b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
-#define LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_JITREGISTRAR_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_JITREGISTRAR_H
 
 #include "llvm/ExecutionEngine/ObjectBuffer.h"
 
@@ -41,4 +41,4 @@
 
 } // end namespace llvm
 
-#endif // LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
+#endif

diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
index c3a2182..9bbf6a0d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
+++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
-#define LLVM_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
 
 #include "llvm/ExecutionEngine/ObjectBuffer.h"
 #include "llvm/ExecutionEngine/ObjectImage.h"
@@ -36,20 +36,17 @@
 
   // This form of the constructor allows subclasses to use
   // format-specific subclasses of ObjectFile directly
-  ObjectImageCommon(ObjectBuffer *Input, std::unique_ptr<object::ObjectFile> Obj)
-  : ObjectImage(Input), // saves Input as Buffer and takes ownership
-    ObjFile(std::move(Obj))
-  {
-  }
+  ObjectImageCommon(std::unique_ptr<ObjectBuffer> Input,
+                    std::unique_ptr<object::ObjectFile> Obj)
+      : ObjectImage(std::move(Input)), ObjFile(std::move(Obj)) {}
 
 public:
-  ObjectImageCommon(ObjectBuffer* Input)
-  : ObjectImage(Input) // saves Input as Buffer and takes ownership
-  {
+  ObjectImageCommon(std::unique_ptr<ObjectBuffer> Input)
+      : ObjectImage(std::move(Input)) {
     // FIXME: error checking? createObjectFile returns an ErrorOr<ObjectFile*>
     // and should probably be checked for failure.
-    std::unique_ptr<MemoryBuffer> Buf(Buffer->getMemBuffer());
-    ObjFile.reset(object::ObjectFile::createObjectFile(Buf).get());
+    MemoryBufferRef Buf = Buffer->getMemBuffer();
+    ObjFile = std::move(object::ObjectFile::createObjectFile(Buf).get());
   }
   ObjectImageCommon(std::unique_ptr<object::ObjectFile> Input)
   : ObjectImage(nullptr), ObjFile(std::move(Input))  {}
@@ -86,4 +83,4 @@
 
 } // end namespace llvm
 
-#endif // LLVM_RUNTIMEDYLD_OBJECT_IMAGE_H
+#endif

diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 9dfd167..c7c67f6 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp

@@ -14,6 +14,7 @@
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
 #include "JITRegistrar.h"
 #include "ObjectImageCommon.h"
+#include "RuntimeDyldCheckerImpl.h"
 #include "RuntimeDyldELF.h"
 #include "RuntimeDyldImpl.h"
 #include "RuntimeDyldMachO.h"
@@ -40,6 +41,44 @@
 
 void RuntimeDyldImpl::deregisterEHFrames() {}
 
+#ifndef NDEBUG
+// Debug-only helper: hex-dumps S.Size bytes from S.Address to dbgs(), 16 per
+// row, labelling each row with its load address; State goes in the banner.
+static void dumpSectionMemory(const SectionEntry &S, StringRef State) {
+  dbgs() << "----- Contents of section " << S.Name << " " << State << " -----";
+  if (S.Address == nullptr) {
+    dbgs() << "\n          <section not emitted>\n";
+    return;
+  }
+
+  // Deliberately 64-bit: with 'unsigned', ~(ColsPerRow - 1) zero-extends to
+  // 0x00000000fffffff0 and masks off the upper half of the row address.
+  const uint64_t ColsPerRow = 16;
+  uint8_t *DataAddr = S.Address;
+  uint64_t LoadAddr = S.LoadAddress;
+  unsigned StartPadding = LoadAddr & (ColsPerRow - 1);
+  unsigned BytesRemaining = S.Size;
+
+  // A section that starts mid-row gets blank columns before its first byte.
+  if (StartPadding) {
+    dbgs() << "\n"
+           << format("0x%016" PRIx64, LoadAddr & ~(ColsPerRow - 1)) << ":";
+    while (StartPadding--)
+      dbgs() << "   ";
+  }
+  while (BytesRemaining > 0) {
+    // Begin a new labelled row at each ColsPerRow-aligned load address.
+    if ((LoadAddr & (ColsPerRow - 1)) == 0)
+      dbgs() << "\n" << format("0x%016" PRIx64, LoadAddr) << ":";
+    dbgs() << " " << format("%02x", *DataAddr);
+    ++DataAddr;
+    ++LoadAddr;
+    --BytesRemaining;
+  }
+  dbgs() << "\n";
+}
+#endif
+
 // Resolve the relocations for all symbols we currently know about.
 void RuntimeDyldImpl::resolveRelocations() {
   MutexGuard locked(lock);
@@ -55,8 +94,10 @@
     // entry provides the section to which the relocation will be applied.
     uint64_t Addr = Sections[i].LoadAddress;
     DEBUG(dbgs() << "Resolving relocations Section #" << i << "\t"
-                 << format("%p", (uint8_t *)Addr) << "\n");
+                 << format("0x%x", Addr) << "\n");
+    DEBUG(dumpSectionMemory(Sections[i], "before relocations"));
     resolveRelocationList(Relocations[i], Addr);
+    DEBUG(dumpSectionMemory(Sections[i], "after relocations"));
     Relocations.erase(i);
   }
 }
@@ -88,23 +129,20 @@
   if (std::error_code EC = Sym.getSection(SecI))
     return EC;
 
- if (SecI == Obj->section_end()) {
-   Result = UnknownAddressOrSize;
-   return object_error::success;
- }
+  if (SecI == Obj->section_end()) {
+    Result = UnknownAddressOrSize;
+    return object_error::success;
+  }
 
-  uint64_t SectionAddress;
-  if (std::error_code EC = SecI->getAddress(SectionAddress))
-    return EC;
-
+  uint64_t SectionAddress = SecI->getAddress();
   Result = Address - SectionAddress;
   return object_error::success;
 }
 
-ObjectImage *RuntimeDyldImpl::loadObject(ObjectImage *InputObject) {
+std::unique_ptr<ObjectImage>
+RuntimeDyldImpl::loadObject(std::unique_ptr<ObjectImage> Obj) {
   MutexGuard locked(lock);
 
-  std::unique_ptr<ObjectImage> Obj(InputObject);
   if (!Obj)
     return nullptr;
 
@@ -158,14 +196,13 @@
           SymType == object::SymbolRef::ST_Unknown) {
         uint64_t SectOffset;
         StringRef SectionData;
-        bool IsCode;
         section_iterator SI = Obj->end_sections();
         Check(getOffset(*I, SectOffset));
         Check(I->getSection(SI));
         if (SI == Obj->end_sections())
           continue;
         Check(SI->getContents(SectionData));
-        Check(SI->isText(IsCode));
+        bool IsCode = SI->isText();
         unsigned SectionID =
             findOrEmitSection(*Obj, *SI, IsCode, LocalSections);
         LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset);
@@ -195,8 +232,7 @@
     if (I == E && !ProcessAllSections)
       continue;
 
-    bool IsCode = false;
-    Check(RelocatedSection->isText(IsCode));
+    bool IsCode = RelocatedSection->isText();
     SectionID =
         findOrEmitSection(*Obj, *RelocatedSection, IsCode, LocalSections);
     DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n");
@@ -204,12 +240,17 @@
     for (; I != E;)
       I = processRelocationRef(SectionID, I, *Obj, LocalSections, LocalSymbols,
                                Stubs);
+
+    // If there is an attached checker, notify it about the stubs for this
+    // section so that they can be verified.
+    if (Checker)
+      Checker->registerStubMap(Obj->getImageName(), SectionID, Stubs);
   }
 
   // Give the subclasses a chance to tie-up any loose ends.
   finalizeLoad(*Obj, LocalSections);
 
-  return Obj.release();
+  return Obj;
 }
 
 // A helper method for computeTotalAllocSize.
@@ -245,20 +286,15 @@
        SI != SE; ++SI) {
     const SectionRef &Section = *SI;
 
-    bool IsRequired;
-    Check(Section.isRequiredForExecution(IsRequired));
+    bool IsRequired = Section.isRequiredForExecution();
 
     // Consider only the sections that are required to be loaded for execution
     if (IsRequired) {
-      uint64_t DataSize = 0;
-      uint64_t Alignment64 = 0;
-      bool IsCode = false;
-      bool IsReadOnly = false;
       StringRef Name;
-      Check(Section.getSize(DataSize));
-      Check(Section.getAlignment(Alignment64));
-      Check(Section.isText(IsCode));
-      Check(Section.isReadOnlyData(IsReadOnly));
+      uint64_t DataSize = Section.getSize();
+      uint64_t Alignment64 = Section.getAlignment();
+      bool IsCode = Section.isText();
+      bool IsReadOnly = Section.isReadOnlyData();
       Check(Section.getName(Name));
       unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
 
@@ -340,10 +376,8 @@
   }
 
   // Get section data size and alignment
-  uint64_t Alignment64;
-  uint64_t DataSize;
-  Check(Section.getSize(DataSize));
-  Check(Section.getAlignment(Alignment64));
+  uint64_t DataSize = Section.getSize();
+  uint64_t Alignment64 = Section.getAlignment();
 
   // Add stubbuf size alignment
   unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
@@ -354,6 +388,36 @@
   return StubBufSize;
 }
 
+// Read Size bytes at Src as an unsigned integer in the target's byte order
+// (per IsTargetLittleEndian). Bytes are read singly, so Src may be unaligned.
+uint64_t RuntimeDyldImpl::readBytesUnaligned(uint8_t *Src,
+                                             unsigned Size) const {
+  uint64_t Value = 0;
+  // Fold bytes most-significant first. Indexing, rather than walking down
+  // from Src + Size - 1, never forms a pointer outside [Src, Src + Size).
+  for (unsigned I = 0; I != Size; ++I) {
+    const unsigned Pos = IsTargetLittleEndian ? Size - 1 - I : I;
+    Value = (Value << 8) | Src[Pos];
+  }
+  return Value;
+}
+
+// Store Value into the Size bytes at Dst using the target's byte order (per
+// IsTargetLittleEndian). Bytes are written singly, so Dst may be unaligned.
+void RuntimeDyldImpl::writeBytesUnaligned(uint64_t Value, uint8_t *Dst,
+                                          unsigned Size) const {
+  // Emit bytes least-significant first: they land at ascending addresses on
+  // a little-endian target and at descending addresses otherwise.
+  // Indexing, rather than walking down from Dst + Size - 1, never forms a
+  // pointer outside [Dst, Dst + Size), including when Size is 0.
+  for (unsigned I = 0; I != Size; ++I) {
+    const unsigned Pos = IsTargetLittleEndian ? I : Size - 1 - I;
+    Dst[Pos] = Value & 0xFF;
+    Value >>= 8;
+  }
+  // Bits of Value above the low Size bytes are discarded.
+}
+
 void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
                                         const CommonSymbolMap &CommonSymbols,
                                         uint64_t TotalSize,
@@ -365,7 +429,7 @@
   if (!Addr)
     report_fatal_error("Unable to allocate memory for common symbols!");
   uint64_t Offset = 0;
-  Sections.push_back(SectionEntry(StringRef(), Addr, TotalSize, 0));
+  Sections.push_back(SectionEntry("<common symbols>", Addr, TotalSize, 0));
   memset(Addr, 0, TotalSize);
 
   DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID << " new addr: "
@@ -397,24 +461,18 @@
                                       const SectionRef &Section, bool IsCode) {
 
   StringRef data;
-  uint64_t Alignment64;
   Check(Section.getContents(data));
-  Check(Section.getAlignment(Alignment64));
+  uint64_t Alignment64 = Section.getAlignment();
 
   unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
-  bool IsRequired;
-  bool IsVirtual;
-  bool IsZeroInit;
-  bool IsReadOnly;
-  uint64_t DataSize;
   unsigned PaddingSize = 0;
   unsigned StubBufSize = 0;
   StringRef Name;
-  Check(Section.isRequiredForExecution(IsRequired));
-  Check(Section.isVirtual(IsVirtual));
-  Check(Section.isZeroInit(IsZeroInit));
-  Check(Section.isReadOnlyData(IsReadOnly));
-  Check(Section.getSize(DataSize));
+  bool IsRequired = Section.isRequiredForExecution();
+  bool IsVirtual = Section.isVirtual();
+  bool IsZeroInit = Section.isZeroInit();
+  bool IsReadOnly = Section.isReadOnlyData();
+  uint64_t DataSize = Section.getSize();
   Check(Section.getName(Name));
 
   StubBufSize = computeSectionStubBufSize(Obj, Section);
@@ -477,6 +535,10 @@
   }
 
   Sections.push_back(SectionEntry(Name, Addr, DataSize, (uintptr_t)pData));
+
+  if (Checker)
+    Checker->registerSection(Obj.getImageName(), SectionID);
+
   return SectionID;
 }
 
@@ -517,34 +579,26 @@
   }
 }
 
-uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
-  if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be ||
-      Arch == Triple::arm64 || Arch == Triple::arm64_be) {
+uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr,
+                                             unsigned AbiVariant) {
+  if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be) {
     // This stub has to be able to access the full address space,
     // since symbol lookup won't necessarily find a handy, in-range,
     // PLT stub for functions which could be anywhere.
-    uint32_t *StubAddr = (uint32_t *)Addr;
-
     // Stub can use ip0 (== x16) to calculate address
-    *StubAddr = 0xd2e00010; // movz ip0, #:abs_g3:<addr>
-    StubAddr++;
-    *StubAddr = 0xf2c00010; // movk ip0, #:abs_g2_nc:<addr>
-    StubAddr++;
-    *StubAddr = 0xf2a00010; // movk ip0, #:abs_g1_nc:<addr>
-    StubAddr++;
-    *StubAddr = 0xf2800010; // movk ip0, #:abs_g0_nc:<addr>
-    StubAddr++;
-    *StubAddr = 0xd61f0200; // br ip0
+    writeBytesUnaligned(0xd2e00010, Addr,    4); // movz ip0, #:abs_g3:<addr>
+    writeBytesUnaligned(0xf2c00010, Addr+4,  4); // movk ip0, #:abs_g2_nc:<addr>
+    writeBytesUnaligned(0xf2a00010, Addr+8,  4); // movk ip0, #:abs_g1_nc:<addr>
+    writeBytesUnaligned(0xf2800010, Addr+12, 4); // movk ip0, #:abs_g0_nc:<addr>
+    writeBytesUnaligned(0xd61f0200, Addr+16, 4); // br ip0
 
     return Addr;
   } else if (Arch == Triple::arm || Arch == Triple::armeb) {
     // TODO: There is only ARM far stub now. We should add the Thumb stub,
     // and stubs for branches Thumb - ARM and ARM - Thumb.
-    uint32_t *StubAddr = (uint32_t *)Addr;
-    *StubAddr = 0xe51ff004; // ldr pc,<label>
-    return (uint8_t *)++StubAddr;
+    writeBytesUnaligned(0xe51ff004, Addr, 4); // ldr pc,<label>
+    return Addr + 4;
   } else if (Arch == Triple::mipsel || Arch == Triple::mips) {
-    uint32_t *StubAddr = (uint32_t *)Addr;
     // 0:   3c190000        lui     t9,%hi(addr).
     // 4:   27390000        addiu   t9,t9,%lo(addr).
     // 8:   03200008        jr      t9.
@@ -552,31 +606,37 @@
     const unsigned LuiT9Instr = 0x3c190000, AdduiT9Instr = 0x27390000;
     const unsigned JrT9Instr = 0x03200008, NopInstr = 0x0;
 
-    *StubAddr = LuiT9Instr;
-    StubAddr++;
-    *StubAddr = AdduiT9Instr;
-    StubAddr++;
-    *StubAddr = JrT9Instr;
-    StubAddr++;
-    *StubAddr = NopInstr;
+    writeBytesUnaligned(LuiT9Instr, Addr, 4);
+    writeBytesUnaligned(AdduiT9Instr, Addr+4, 4);
+    writeBytesUnaligned(JrT9Instr, Addr+8, 4);
+    writeBytesUnaligned(NopInstr, Addr+12, 4);
     return Addr;
   } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
-    // PowerPC64 stub: the address points to a function descriptor
-    // instead of the function itself. Load the function address
-    // on r11 and sets it to control register. Also loads the function
-    // TOC in r2 and environment pointer to r11.
+    // Depending on which version of the ELF ABI is in use, we need to
+    // generate one of two variants of the stub.  They both start with
+    // the same sequence to load the target address into r12.
     writeInt32BE(Addr,    0x3D800000); // lis   r12, highest(addr)
     writeInt32BE(Addr+4,  0x618C0000); // ori   r12, higher(addr)
     writeInt32BE(Addr+8,  0x798C07C6); // sldi  r12, r12, 32
     writeInt32BE(Addr+12, 0x658C0000); // oris  r12, r12, h(addr)
     writeInt32BE(Addr+16, 0x618C0000); // ori   r12, r12, l(addr)
-    writeInt32BE(Addr+20, 0xF8410028); // std   r2,  40(r1)
-    writeInt32BE(Addr+24, 0xE96C0000); // ld    r11, 0(r12)
-    writeInt32BE(Addr+28, 0xE84C0008); // ld    r2,  0(r12)
-    writeInt32BE(Addr+32, 0x7D6903A6); // mtctr r11
-    writeInt32BE(Addr+36, 0xE96C0010); // ld    r11, 16(r2)
-    writeInt32BE(Addr+40, 0x4E800420); // bctr
-
+    if (AbiVariant == 2) {
+      // PowerPC64 stub ELFv2 ABI: The address points to the function itself.
+      // The address is already in r12 as required by the ABI.  Branch to it.
+      writeInt32BE(Addr+20, 0xF8410018); // std   r2,  24(r1)
+      writeInt32BE(Addr+24, 0x7D8903A6); // mtctr r12
+      writeInt32BE(Addr+28, 0x4E800420); // bctr
+    } else {
+      // PowerPC64 stub ELFv1 ABI: The address points to a function descriptor.
+      // Load the function address into r11 and move it to the count register.
+      // Also load the function TOC into r2 and environment pointer into r11.
+      writeInt32BE(Addr+20, 0xF8410028); // std   r2,  40(r1)
+      writeInt32BE(Addr+24, 0xE96C0000); // ld    r11, 0(r12)
+      writeInt32BE(Addr+28, 0xE84C0008); // ld    r2,  8(r12)
+      writeInt32BE(Addr+32, 0x7D6903A6); // mtctr r11
+      writeInt32BE(Addr+36, 0xE96C0010); // ld    r11, 16(r12)
+      writeInt32BE(Addr+40, 0x4E800420); // bctr
+    }
     return Addr;
   } else if (Arch == Triple::systemz) {
     writeInt16BE(Addr,    0xC418);     // lgrl %r1,.+8
@@ -609,6 +669,10 @@
   // Addr is a uint64_t because we can't assume the pointer width
   // of the target is the same as that of the host. Just use a generic
   // "big enough" type.
+  DEBUG(dbgs() << "Reassigning address for section "
+               << SectionID << " (" << Sections[SectionID].Name << "): "
+               << format("0x%016" PRIx64, Sections[SectionID].LoadAddress) << " -> "
+               << format("0x%016" PRIx64, Addr) << "\n");
   Sections[SectionID].LoadAddress = Addr;
 }
 
@@ -685,25 +749,31 @@
   Dyld = nullptr;
   MM = mm;
   ProcessAllSections = false;
+  Checker = nullptr;
 }
 
-RuntimeDyld::~RuntimeDyld() { delete Dyld; }
+RuntimeDyld::~RuntimeDyld() {}
 
 static std::unique_ptr<RuntimeDyldELF>
-createRuntimeDyldELF(RTDyldMemoryManager *MM, bool ProcessAllSections) {
+createRuntimeDyldELF(RTDyldMemoryManager *MM, bool ProcessAllSections,
+                     RuntimeDyldCheckerImpl *Checker) {
   std::unique_ptr<RuntimeDyldELF> Dyld(new RuntimeDyldELF(MM));
   Dyld->setProcessAllSections(ProcessAllSections);
+  Dyld->setRuntimeDyldChecker(Checker);
   return Dyld;
 }
 
 static std::unique_ptr<RuntimeDyldMachO>
-createRuntimeDyldMachO(RTDyldMemoryManager *MM, bool ProcessAllSections) {
-  std::unique_ptr<RuntimeDyldMachO> Dyld(new RuntimeDyldMachO(MM));
+createRuntimeDyldMachO(Triple::ArchType Arch, RTDyldMemoryManager *MM,
+                       bool ProcessAllSections, RuntimeDyldCheckerImpl *Checker) {
+  std::unique_ptr<RuntimeDyldMachO> Dyld(RuntimeDyldMachO::create(Arch, MM));
   Dyld->setProcessAllSections(ProcessAllSections);
+  Dyld->setRuntimeDyldChecker(Checker);
   return Dyld;
 }
 
-ObjectImage *RuntimeDyld::loadObject(std::unique_ptr<ObjectFile> InputObject) {
+std::unique_ptr<ObjectImage>
+RuntimeDyld::loadObject(std::unique_ptr<ObjectFile> InputObject) {
   std::unique_ptr<ObjectImage> InputImage;
 
   ObjectFile &Obj = *InputObject;
@@ -711,33 +781,37 @@
   if (InputObject->isELF()) {
     InputImage.reset(RuntimeDyldELF::createObjectImageFromFile(std::move(InputObject)));
     if (!Dyld)
-      Dyld = createRuntimeDyldELF(MM, ProcessAllSections).release();
+      Dyld = createRuntimeDyldELF(MM, ProcessAllSections, Checker);
   } else if (InputObject->isMachO()) {
     InputImage.reset(RuntimeDyldMachO::createObjectImageFromFile(std::move(InputObject)));
     if (!Dyld)
-      Dyld = createRuntimeDyldMachO(MM, ProcessAllSections).release();
+      Dyld = createRuntimeDyldMachO(
+          static_cast<Triple::ArchType>(InputImage->getArch()), MM,
+          ProcessAllSections, Checker);
   } else
     report_fatal_error("Incompatible object format!");
 
   if (!Dyld->isCompatibleFile(&Obj))
     report_fatal_error("Incompatible object format!");
 
-  Dyld->loadObject(InputImage.get());
-  return InputImage.release();
+  return Dyld->loadObject(std::move(InputImage));
 }
 
-ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) {
+std::unique_ptr<ObjectImage>
+RuntimeDyld::loadObject(std::unique_ptr<ObjectBuffer> InputBuffer) {
   std::unique_ptr<ObjectImage> InputImage;
   sys::fs::file_magic Type = sys::fs::identify_magic(InputBuffer->getBuffer());
+  auto *InputBufferPtr = InputBuffer.get();
 
   switch (Type) {
+  case sys::fs::file_magic::elf:
   case sys::fs::file_magic::elf_relocatable:
   case sys::fs::file_magic::elf_executable:
   case sys::fs::file_magic::elf_shared_object:
   case sys::fs::file_magic::elf_core:
-    InputImage.reset(RuntimeDyldELF::createObjectImage(InputBuffer));
+    InputImage = RuntimeDyldELF::createObjectImage(std::move(InputBuffer));
     if (!Dyld)
-      Dyld = createRuntimeDyldELF(MM, ProcessAllSections).release();
+      Dyld = createRuntimeDyldELF(MM, ProcessAllSections, Checker);
     break;
   case sys::fs::file_magic::macho_object:
   case sys::fs::file_magic::macho_executable:
@@ -749,9 +823,11 @@
   case sys::fs::file_magic::macho_bundle:
   case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
   case sys::fs::file_magic::macho_dsym_companion:
-    InputImage.reset(RuntimeDyldMachO::createObjectImage(InputBuffer));
+    InputImage = RuntimeDyldMachO::createObjectImage(std::move(InputBuffer));
     if (!Dyld)
-      Dyld = createRuntimeDyldMachO(MM, ProcessAllSections).release();
+      Dyld = createRuntimeDyldMachO(
+          static_cast<Triple::ArchType>(InputImage->getArch()), MM,
+          ProcessAllSections, Checker);
     break;
   case sys::fs::file_magic::unknown:
   case sys::fs::file_magic::bitcode:
@@ -764,20 +840,19 @@
     report_fatal_error("Incompatible object format!");
   }
 
-  if (!Dyld->isCompatibleFormat(InputBuffer))
+  if (!Dyld->isCompatibleFormat(InputBufferPtr))
     report_fatal_error("Incompatible object format!");
 
-  Dyld->loadObject(InputImage.get());
-  return InputImage.release();
+  return Dyld->loadObject(std::move(InputImage));
 }
 
-void *RuntimeDyld::getSymbolAddress(StringRef Name) {
+void *RuntimeDyld::getSymbolAddress(StringRef Name) const {
   if (!Dyld)
     return nullptr;
   return Dyld->getSymbolAddress(Name);
 }
 
-uint64_t RuntimeDyld::getSymbolLoadAddress(StringRef Name) {
+uint64_t RuntimeDyld::getSymbolLoadAddress(StringRef Name) const {
   if (!Dyld)
     return 0;
   return Dyld->getSymbolLoadAddress(Name);

diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index 190bbbf..8818349 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp

@@ -7,11 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/Support/StringRefMemoryObject.h"
+#include "llvm/Support/Path.h"
+#include "RuntimeDyldCheckerImpl.h"
 #include "RuntimeDyldImpl.h"
 #include <cctype>
 #include <memory>
@@ -22,579 +24,696 @@
 
 namespace llvm {
 
-  // Helper class that implements the language evaluated by RuntimeDyldChecker.
-  class RuntimeDyldCheckerExprEval {
-  public:
+// Helper class that implements the language evaluated by RuntimeDyldChecker.
+class RuntimeDyldCheckerExprEval {
+public:
+  RuntimeDyldCheckerExprEval(const RuntimeDyldCheckerImpl &Checker,
+                             raw_ostream &ErrStream)
+      : Checker(Checker) {}
 
-    RuntimeDyldCheckerExprEval(const RuntimeDyldChecker &Checker,
-                               llvm::raw_ostream &ErrStream)
-      : Checker(Checker), ErrStream(ErrStream) {}
+  bool evaluate(StringRef Expr) const {
+    // Expect equality expression of the form 'LHS = RHS'.
+    Expr = Expr.trim();
+    size_t EQIdx = Expr.find('=');
 
-    bool evaluate(StringRef Expr) const {
-      // Expect equality expression of the form 'LHS = RHS'.
-      Expr = Expr.trim();
-      size_t EQIdx = Expr.find('=');
+    ParseContext OutsideLoad(false);
 
-      // Evaluate LHS.
-      StringRef LHSExpr = Expr.substr(0, EQIdx).rtrim();
-      StringRef RemainingExpr;
-      EvalResult LHSResult;
-      std::tie(LHSResult, RemainingExpr) =
-        evalComplexExpr(evalSimpleExpr(LHSExpr));
-      if (LHSResult.hasError())
-        return handleError(Expr, LHSResult);
-      if (RemainingExpr != "")
-        return handleError(Expr, unexpectedToken(RemainingExpr, LHSExpr, ""));
+    // Evaluate LHS.
+    StringRef LHSExpr = Expr.substr(0, EQIdx).rtrim();
+    StringRef RemainingExpr;
+    EvalResult LHSResult;
+    std::tie(LHSResult, RemainingExpr) =
+        evalComplexExpr(evalSimpleExpr(LHSExpr, OutsideLoad), OutsideLoad);
+    if (LHSResult.hasError())
+      return handleError(Expr, LHSResult);
+    if (RemainingExpr != "")
+      return handleError(Expr, unexpectedToken(RemainingExpr, LHSExpr, ""));
 
-      // Evaluate RHS.
-      StringRef RHSExpr = Expr.substr(EQIdx + 1).ltrim();
-      EvalResult RHSResult;
-      std::tie(RHSResult, RemainingExpr) =
-        evalComplexExpr(evalSimpleExpr(RHSExpr));
-      if (RHSResult.hasError())
-        return handleError(Expr, RHSResult);
-      if (RemainingExpr != "")
-        return handleError(Expr, unexpectedToken(RemainingExpr, RHSExpr, ""));
+    // Evaluate RHS.
+    StringRef RHSExpr = Expr.substr(EQIdx + 1).ltrim();
+    EvalResult RHSResult;
+    std::tie(RHSResult, RemainingExpr) =
+        evalComplexExpr(evalSimpleExpr(RHSExpr, OutsideLoad), OutsideLoad);
+    if (RHSResult.hasError())
+      return handleError(Expr, RHSResult);
+    if (RemainingExpr != "")
+      return handleError(Expr, unexpectedToken(RemainingExpr, RHSExpr, ""));
 
-      if (LHSResult.getValue() != RHSResult.getValue()) {
-        ErrStream << "Expression '" << Expr << "' is false: "
-                  << format("0x%lx", LHSResult.getValue()) << " != "
-                  << format("0x%lx", RHSResult.getValue()) << "\n";
-        return false;
-      }
-      return true;
-    }
-
-  private:
-    const RuntimeDyldChecker &Checker;
-    llvm::raw_ostream &ErrStream;
-
-    enum class BinOpToken : unsigned { Invalid, Add, Sub, BitwiseAnd,
-                                       BitwiseOr, ShiftLeft, ShiftRight };
-
-    class EvalResult {
-    public:
-      EvalResult()
-        : Value(0), ErrorMsg("") {}
-      EvalResult(uint64_t Value)
-        : Value(Value), ErrorMsg("") {}
-      EvalResult(std::string ErrorMsg)
-        : Value(0), ErrorMsg(ErrorMsg) {}
-      uint64_t getValue() const { return Value; }
-      bool hasError() const { return ErrorMsg != ""; }
-      const std::string& getErrorMsg() const { return ErrorMsg; }
-    private:
-      uint64_t Value;
-      std::string ErrorMsg;
-    };
-
-    StringRef getTokenForError(StringRef Expr) const {
-      if (Expr.empty())
-        return "";
-
-      StringRef Token, Remaining;
-      if (isalpha(Expr[0]))
-        std::tie(Token, Remaining) = parseSymbol(Expr);
-      else if (isdigit(Expr[0]))
-        std::tie(Token, Remaining) = parseNumberString(Expr);
-      else {
-        unsigned TokLen = 1;
-        if (Expr.startswith("<<") || Expr.startswith(">>"))
-          TokLen = 2;
-        Token = Expr.substr(0, TokLen);
-      }
-      return Token;
-    }
-
-    EvalResult unexpectedToken(StringRef TokenStart,
-                               StringRef SubExpr,
-                               StringRef ErrText) const {
-      std::string ErrorMsg("Encountered unexpected token '");
-      ErrorMsg += getTokenForError(TokenStart);
-      if (SubExpr != "") {
-        ErrorMsg += "' while parsing subexpression '";
-        ErrorMsg += SubExpr;
-      }
-      ErrorMsg += "'";
-      if (ErrText != "") {
-        ErrorMsg += " ";
-        ErrorMsg += ErrText;
-      }
-      return EvalResult(std::move(ErrorMsg));
-    }
-
-    bool handleError(StringRef Expr, const EvalResult &R) const {
-      assert(R.hasError() && "Not an error result.");
-      ErrStream << "Error evaluating expression '" << Expr << "': "
-                << R.getErrorMsg() << "\n";
+    if (LHSResult.getValue() != RHSResult.getValue()) {
+      Checker.ErrStream << "Expression '" << Expr << "' is false: "
+                        << format("0x%" PRIx64, LHSResult.getValue())
+                        << " != " << format("0x%" PRIx64, RHSResult.getValue())
+                        << "\n";
       return false;
     }
+    return true;
+  }
 
-    std::pair<BinOpToken, StringRef> parseBinOpToken(StringRef Expr) const {
-      if (Expr.empty())
-        return std::make_pair(BinOpToken::Invalid, "");
+private:
+  // RuntimeDyldCheckerExprEval requires some context when parsing exprs. In
+  // particular, it needs to know whether a symbol is being evaluated in the
+  // context of a load, in which case we want the linker's local address for
+  // the symbol, or outside of a load, in which case we want the symbol's
+  // address in the remote target.
 
-      // Handle the two 2-character tokens.
-      if (Expr.startswith("<<"))
-        return std::make_pair(BinOpToken::ShiftLeft,
-                              Expr.substr(2).ltrim());
-      if (Expr.startswith(">>"))
-        return std::make_pair(BinOpToken::ShiftRight,
-                              Expr.substr(2).ltrim());
-
-      // Handle one-character tokens.
-      BinOpToken Op;
-      switch (Expr[0]) {
-        default: return std::make_pair(BinOpToken::Invalid, Expr);
-        case '+': Op = BinOpToken::Add; break;
-        case '-': Op = BinOpToken::Sub; break;
-        case '&': Op = BinOpToken::BitwiseAnd; break;
-        case '|': Op = BinOpToken::BitwiseOr; break;
-      }
-
-      return std::make_pair(Op, Expr.substr(1).ltrim());
-    }
-
-    EvalResult computeBinOpResult(BinOpToken Op, const EvalResult &LHSResult,
-                                  const EvalResult &RHSResult) const {
-      switch (Op) {
-      default: llvm_unreachable("Tried to evaluate unrecognized operation.");
-      case BinOpToken::Add:
-        return EvalResult(LHSResult.getValue() + RHSResult.getValue());
-      case BinOpToken::Sub:
-        return EvalResult(LHSResult.getValue() - RHSResult.getValue());
-      case BinOpToken::BitwiseAnd:
-        return EvalResult(LHSResult.getValue() & RHSResult.getValue());
-      case BinOpToken::BitwiseOr:
-        return EvalResult(LHSResult.getValue() | RHSResult.getValue());
-      case BinOpToken::ShiftLeft:
-        return EvalResult(LHSResult.getValue() << RHSResult.getValue());
-      case BinOpToken::ShiftRight:
-        return EvalResult(LHSResult.getValue() >> RHSResult.getValue());
-      }
-    }
-
-    // Parse a symbol and return a (string, string) pair representing the symbol
-    // name and expression remaining to be parsed.
-    std::pair<StringRef, StringRef> parseSymbol(StringRef Expr) const {
-      size_t FirstNonSymbol =
-        Expr.find_first_not_of("0123456789"
-                               "abcdefghijklmnopqrstuvwxyz"
-                               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-                               ":_");
-      return std::make_pair(Expr.substr(0, FirstNonSymbol),
-                            Expr.substr(FirstNonSymbol).ltrim());
-    }
-
-    // Evaluate a call to decode_operand. Decode the instruction operand at the
-    // given symbol and get the value of the requested operand.
-    // Returns an error if the instruction cannot be decoded, or the requested
-    // operand is not an immediate.
-    // On success, retuns a pair containing the value of the operand, plus
-    // the expression remaining to be evaluated.
-    std::pair<EvalResult, StringRef> evalDecodeOperand(StringRef Expr) const {
-      if (!Expr.startswith("("))
-        return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), "");
-      StringRef RemainingExpr = Expr.substr(1).ltrim();
-      StringRef Symbol;
-      std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr);
-
-      if (!Checker.checkSymbolIsValidForLoad(Symbol))
-        return std::make_pair(EvalResult(("Cannot decode unknown symbol '" +
-                                          Symbol + "'").str()),
-                              "");
-
-      if (!RemainingExpr.startswith(","))
-        return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
-                                              "expected ','"),
-                              "");
-      RemainingExpr = RemainingExpr.substr(1).ltrim();
-
-      EvalResult OpIdxExpr;
-      std::tie(OpIdxExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
-      if (OpIdxExpr.hasError())
-        return std::make_pair(OpIdxExpr, "");
-
-      if (!RemainingExpr.startswith(")"))
-        return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
-                                              "expected ')'"),
-                              "");
-      RemainingExpr = RemainingExpr.substr(1).ltrim();
-
-      MCInst Inst;
-      uint64_t Size;
-      if (!decodeInst(Symbol, Inst, Size))
-        return std::make_pair(EvalResult(("Couldn't decode instruction at '" +
-                                          Symbol + "'").str()),
-                              "");
-
-      unsigned OpIdx = OpIdxExpr.getValue();
-      if (OpIdx >= Inst.getNumOperands()) {
-        std::string ErrMsg;
-        raw_string_ostream ErrMsgStream(ErrMsg);
-        ErrMsgStream << "Invalid operand index '" << format("%i", OpIdx)
-                     << " for instruction '" << Symbol
-                     << ". Instruction has only "
-                     << format("%i", Inst.getNumOperands()) << " operands.";
-        return std::make_pair(EvalResult(ErrMsgStream.str()), "");
-      }
-
-      const MCOperand &Op = Inst.getOperand(OpIdx);
-      if (!Op.isImm()) {
-        std::string ErrMsg;
-        raw_string_ostream ErrMsgStream(ErrMsg);
-        ErrMsgStream << "Operand '" << format("%i", OpIdx)
-                     << "' of instruction '" << Symbol
-                     << "' is not an immediate.\nInstruction is:\n  ";
-        Inst.dump_pretty(ErrMsgStream,
-                         Checker.Disassembler->getContext().getAsmInfo(),
-                         Checker.InstPrinter);
-
-        return std::make_pair(EvalResult(ErrMsgStream.str()), "");
-      }
-
-      return std::make_pair(EvalResult(Op.getImm()), RemainingExpr);
-    }
-
-    // Evaluate a call to next_pc. Decode the instruction at the given
-    // symbol and return the following program counter..
-    // Returns an error if the instruction cannot be decoded.
-    // On success, returns a pair containing the next PC, plus the length of the
-    // expression remaining to be evaluated.
-    std::pair<EvalResult, StringRef> evalNextPC(StringRef Expr) const {
-      if (!Expr.startswith("("))
-        return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), "");
-      StringRef RemainingExpr = Expr.substr(1).ltrim();
-      StringRef Symbol;
-      std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr);
-
-      if (!Checker.checkSymbolIsValidForLoad(Symbol))
-        return std::make_pair(EvalResult(("Cannot decode unknown symbol '"
-                                          + Symbol + "'").str()),
-                              "");
-
-      if (!RemainingExpr.startswith(")"))
-        return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
-                                              "expected ')'"),
-                              "");
-      RemainingExpr = RemainingExpr.substr(1).ltrim();
-
-      MCInst Inst;
-      uint64_t Size;
-      if (!decodeInst(Symbol, Inst, Size))
-        return std::make_pair(EvalResult(("Couldn't decode instruction at '" +
-                                          Symbol + "'").str()),
-                              "");
-      uint64_t NextPC = Checker.getSymbolAddress(Symbol) + Size;
-
-      return std::make_pair(EvalResult(NextPC), RemainingExpr);
-    }
-
-    // Evaluate an identiefer expr, which may be a symbol, or a call to
-    // one of the builtin functions: get_insn_opcode or get_insn_length.
-    // Return the result, plus the expression remaining to be parsed.
-    std::pair<EvalResult, StringRef> evalIdentifierExpr(StringRef Expr) const {
-      StringRef Symbol;
-      StringRef RemainingExpr;
-      std::tie(Symbol, RemainingExpr) = parseSymbol(Expr);
-
-      // Check for builtin function calls.
-      if (Symbol == "decode_operand")
-        return evalDecodeOperand(RemainingExpr);
-      else if (Symbol == "next_pc")
-        return evalNextPC(RemainingExpr);
-
-      // Looks like a plain symbol reference.
-      return std::make_pair(EvalResult(Checker.getSymbolAddress(Symbol)),
-                            RemainingExpr);
-    }
-
-    // Parse a number (hexadecimal or decimal) and return a (string, string)
-    // pair representing the number and the expression remaining to be parsed.
-    std::pair<StringRef, StringRef> parseNumberString(StringRef Expr) const {
-      size_t FirstNonDigit = StringRef::npos;
-      if (Expr.startswith("0x")) {
-        FirstNonDigit = Expr.find_first_not_of("0123456789abcdefABCDEF", 2);
-        if (FirstNonDigit == StringRef::npos)
-          FirstNonDigit = Expr.size();
-      } else {
-        FirstNonDigit = Expr.find_first_not_of("0123456789");
-        if (FirstNonDigit == StringRef::npos)
-          FirstNonDigit = Expr.size();
-      }
-      return std::make_pair(Expr.substr(0, FirstNonDigit),
-                            Expr.substr(FirstNonDigit));
-    }
-
-    // Evaluate a constant numeric expression (hexidecimal or decimal) and
-    // return a pair containing the result, and the expression remaining to be
-    // evaluated.
-    std::pair<EvalResult, StringRef> evalNumberExpr(StringRef Expr) const {
-      StringRef ValueStr;
-      StringRef RemainingExpr;
-      std::tie(ValueStr, RemainingExpr) = parseNumberString(Expr);
-
-      if (ValueStr.empty() || !isdigit(ValueStr[0]))
-        return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
-                                              "expected number"),
-                              "");
-      uint64_t Value;
-      ValueStr.getAsInteger(0, Value);
-      return std::make_pair(EvalResult(Value), RemainingExpr);
-    }
-
-    // Evaluate an expression of the form "(<expr>)" and return a pair
-    // containing the result of evaluating <expr>, plus the expression
-    // remaining to be parsed.
-    std::pair<EvalResult, StringRef> evalParensExpr(StringRef Expr) const {
-      assert(Expr.startswith("(") && "Not a parenthesized expression");
-      EvalResult SubExprResult;
-      StringRef RemainingExpr;
-      std::tie(SubExprResult, RemainingExpr) =
-        evalComplexExpr(evalSimpleExpr(Expr.substr(1).ltrim()));
-      if (SubExprResult.hasError())
-        return std::make_pair(SubExprResult, "");
-      if (!RemainingExpr.startswith(")"))
-        return std::make_pair(unexpectedToken(RemainingExpr, Expr,
-                                              "expected ')'"),
-                              "");
-      RemainingExpr = RemainingExpr.substr(1).ltrim();
-      return std::make_pair(SubExprResult, RemainingExpr);
-    }
-
-    // Evaluate an expression in one of the following forms:
-    //   *{<number>}<symbol>
-    //   *{<number>}(<symbol> + <number>)
-    //   *{<number>}(<symbol> - <number>)
-    // Return a pair containing the result, plus the expression remaining to be
-    // parsed.
-    std::pair<EvalResult, StringRef> evalLoadExpr(StringRef Expr) const {
-      assert(Expr.startswith("*") && "Not a load expression");
-      StringRef RemainingExpr = Expr.substr(1).ltrim();
-      // Parse read size.
-      if (!RemainingExpr.startswith("{"))
-        return std::make_pair(EvalResult("Expected '{' following '*'."), "");
-      RemainingExpr = RemainingExpr.substr(1).ltrim();
-      EvalResult ReadSizeExpr;
-      std::tie(ReadSizeExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
-      if (ReadSizeExpr.hasError())
-        return std::make_pair(ReadSizeExpr, RemainingExpr);
-      uint64_t ReadSize = ReadSizeExpr.getValue();
-      if (ReadSize < 1 || ReadSize > 8)
-        return std::make_pair(EvalResult("Invalid size for dereference."), "");
-      if (!RemainingExpr.startswith("}"))
-        return std::make_pair(EvalResult("Missing '}' for dereference."), "");
-      RemainingExpr = RemainingExpr.substr(1).ltrim();
-
-      // Check for '(symbol +/- constant)' form.
-      bool SymbolPlusConstant = false;
-      if (RemainingExpr.startswith("(")) {
-        SymbolPlusConstant = true;
-        RemainingExpr = RemainingExpr.substr(1).ltrim();
-      }
-
-      // Read symbol.
-      StringRef Symbol;
-      std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr);
-
-      if (!Checker.checkSymbolIsValidForLoad(Symbol))
-        return std::make_pair(EvalResult(("Cannot dereference unknown symbol '"
-                                          + Symbol + "'").str()),
-                              "");
-
-      // Set up defaut offset.
-      int64_t Offset = 0;
-
-      // Handle "+/- constant)" portion if necessary.
-      if (SymbolPlusConstant) {
-        char OpChar = RemainingExpr[0];
-        if (OpChar != '+' && OpChar != '-')
-          return std::make_pair(EvalResult("Invalid operator in load address."),
-                                "");
-        RemainingExpr = RemainingExpr.substr(1).ltrim();
-
-        EvalResult OffsetExpr;
-        std::tie(OffsetExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
-
-        Offset = (OpChar == '+') ?
-                   OffsetExpr.getValue() : -1 * OffsetExpr.getValue();
-
-        if (!RemainingExpr.startswith(")"))
-          return std::make_pair(EvalResult("Missing ')' in load address."),
-                                "");
-
-        RemainingExpr = RemainingExpr.substr(1).ltrim();
-      }
-
-      return std::make_pair(
-               EvalResult(Checker.readMemoryAtSymbol(Symbol, Offset, ReadSize)),
-               RemainingExpr);
-    }
-
-    // Evaluate a "simple" expression. This is any expression that _isn't_ an
-    // un-parenthesized binary expression.
-    //
-    // "Simple" expressions can be optionally bit-sliced. See evalSlicedExpr.
-    //
-    // Returns a pair containing the result of the evaluation, plus the
-    // expression remaining to be parsed.
-    std::pair<EvalResult, StringRef> evalSimpleExpr(StringRef Expr) const {
-      EvalResult SubExprResult;
-      StringRef RemainingExpr;
-
-      if (Expr.empty())
-        return std::make_pair(EvalResult("Unexpected end of expression"), "");
-
-      if (Expr[0] == '(')
-        std::tie(SubExprResult, RemainingExpr) = evalParensExpr(Expr);
-      else if (Expr[0] == '*')
-        std::tie(SubExprResult, RemainingExpr) = evalLoadExpr(Expr);
-      else if (isalpha(Expr[0]))
-        std::tie(SubExprResult, RemainingExpr) = evalIdentifierExpr(Expr);
-      else if (isdigit(Expr[0]))
-        std::tie(SubExprResult, RemainingExpr) = evalNumberExpr(Expr);
-
-      if (SubExprResult.hasError())
-        return std::make_pair(SubExprResult, RemainingExpr);
-
-      // Evaluate bit-slice if present.
-      if (RemainingExpr.startswith("["))
-        std::tie(SubExprResult, RemainingExpr) =
-          evalSliceExpr(std::make_pair(SubExprResult, RemainingExpr));
-
-      return std::make_pair(SubExprResult, RemainingExpr);
-    }
-
-    // Evaluate a bit-slice of an expression.
-    // A bit-slice has the form "<expr>[high:low]". The result of evaluating a
-    // slice is the bits between high and low (inclusive) in the original
-    // expression, right shifted so that the "low" bit is in position 0 in the
-    // result.
-    // Returns a pair containing the result of the slice operation, plus the
-    // expression remaining to be parsed.
-    std::pair<EvalResult, StringRef> evalSliceExpr(
-                                    std::pair<EvalResult, StringRef> Ctx) const{
-      EvalResult SubExprResult;
-      StringRef RemainingExpr;
-      std::tie(SubExprResult, RemainingExpr) = Ctx;
-
-      assert(RemainingExpr.startswith("[") && "Not a slice expr.");
-      RemainingExpr = RemainingExpr.substr(1).ltrim();
-
-      EvalResult HighBitExpr;
-      std::tie(HighBitExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
-
-      if (HighBitExpr.hasError())
-        return std::make_pair(HighBitExpr, RemainingExpr);
-
-      if (!RemainingExpr.startswith(":"))
-        return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
-                                              "expected ':'"),
-                              "");
-      RemainingExpr = RemainingExpr.substr(1).ltrim();
-
-      EvalResult LowBitExpr;
-      std::tie(LowBitExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
-
-      if (LowBitExpr.hasError())
-        return std::make_pair(LowBitExpr, RemainingExpr);
-
-      if (!RemainingExpr.startswith("]"))
-        return std::make_pair(unexpectedToken(RemainingExpr, RemainingExpr,
-                                              "expected ']'"),
-                              "");
-      RemainingExpr = RemainingExpr.substr(1).ltrim();
-
-      unsigned HighBit = HighBitExpr.getValue();
-      unsigned LowBit = LowBitExpr.getValue();
-      uint64_t Mask = ((uint64_t)1 << (HighBit - LowBit + 1)) - 1;
-      uint64_t SlicedValue = (SubExprResult.getValue() >> LowBit) & Mask;
-      return std::make_pair(EvalResult(SlicedValue), RemainingExpr);
-    }
-
-    // Evaluate a "complex" expression.
-    // Takes an already evaluated subexpression and checks for the presence of a
-    // binary operator, computing the result of the binary operation if one is
-    // found. Used to make arithmetic expressions left-associative.
-    // Returns a pair containing the ultimate result of evaluating the
-    // expression, plus the expression remaining to be evaluated.
-    std::pair<EvalResult, StringRef> evalComplexExpr(
-                                   std::pair<EvalResult, StringRef> Ctx) const {
-      EvalResult LHSResult;
-      StringRef RemainingExpr;
-      std::tie(LHSResult, RemainingExpr) = Ctx;
-
-      // If there was an error, or there's nothing left to evaluate, return the
-      // result.
-      if (LHSResult.hasError() || RemainingExpr == "")
-        return std::make_pair(LHSResult, RemainingExpr);
-
-      // Otherwise check if this is a binary expressioan.
-      BinOpToken BinOp;
-      std::tie(BinOp, RemainingExpr) = parseBinOpToken(RemainingExpr);
-
-      // If this isn't a recognized expression just return.
-      if (BinOp == BinOpToken::Invalid)
-        return std::make_pair(LHSResult, RemainingExpr);
-
-      // This is a recognized bin-op. Evaluate the RHS, then evaluate the binop.
-      EvalResult RHSResult;
-      std::tie(RHSResult, RemainingExpr) = evalSimpleExpr(RemainingExpr);
-
-      // If there was an error evaluating the RHS, return it.
-      if (RHSResult.hasError())
-        return std::make_pair(RHSResult, RemainingExpr);
-
-      // This is a binary expression - evaluate and try to continue as a
-      // complex expr.
-      EvalResult ThisResult(computeBinOpResult(BinOp, LHSResult, RHSResult));
-
-      return evalComplexExpr(std::make_pair(ThisResult, RemainingExpr));
-    }
-
-    bool decodeInst(StringRef Symbol, MCInst &Inst, uint64_t &Size) const {
-      MCDisassembler *Dis = Checker.Disassembler;
-      StringRef SectionMem = Checker.getSubsectionStartingAt(Symbol);
-      StringRefMemoryObject SectionBytes(SectionMem, 0);
-
-      MCDisassembler::DecodeStatus S =
-        Dis->getInstruction(Inst, Size, SectionBytes, 0, nulls(), nulls());
-
-      return (S == MCDisassembler::Success);
-    }
-
+  struct ParseContext {
+    bool IsInsideLoad;
+    ParseContext(bool IsInsideLoad) : IsInsideLoad(IsInsideLoad) {}
   };
 
+  const RuntimeDyldCheckerImpl &Checker;
+
+  enum class BinOpToken : unsigned {
+    Invalid,
+    Add,
+    Sub,
+    BitwiseAnd,
+    BitwiseOr,
+    ShiftLeft,
+    ShiftRight
+  };
+
+  class EvalResult {
+  public:
+    EvalResult() : Value(0), ErrorMsg("") {}
+    EvalResult(uint64_t Value) : Value(Value), ErrorMsg("") {}
+    EvalResult(std::string ErrorMsg) : Value(0), ErrorMsg(ErrorMsg) {}
+    uint64_t getValue() const { return Value; }
+    bool hasError() const { return ErrorMsg != ""; }
+    const std::string &getErrorMsg() const { return ErrorMsg; }
+
+  private:
+    uint64_t Value;
+    std::string ErrorMsg;
+  };
+
+  StringRef getTokenForError(StringRef Expr) const {
+    if (Expr.empty())
+      return "";
+
+    StringRef Token, Remaining;
+    if (isalpha(Expr[0]))
+      std::tie(Token, Remaining) = parseSymbol(Expr);
+    else if (isdigit(Expr[0]))
+      std::tie(Token, Remaining) = parseNumberString(Expr);
+    else {
+      unsigned TokLen = 1;
+      if (Expr.startswith("<<") || Expr.startswith(">>"))
+        TokLen = 2;
+      Token = Expr.substr(0, TokLen);
+    }
+    return Token;
+  }
+
+  EvalResult unexpectedToken(StringRef TokenStart, StringRef SubExpr,
+                             StringRef ErrText) const {
+    std::string ErrorMsg("Encountered unexpected token '");
+    ErrorMsg += getTokenForError(TokenStart);
+    if (SubExpr != "") {
+      ErrorMsg += "' while parsing subexpression '";
+      ErrorMsg += SubExpr;
+    }
+    ErrorMsg += "'";
+    if (ErrText != "") {
+      ErrorMsg += " ";
+      ErrorMsg += ErrText;
+    }
+    return EvalResult(std::move(ErrorMsg));
+  }
+
+  bool handleError(StringRef Expr, const EvalResult &R) const {
+    assert(R.hasError() && "Not an error result.");
+    Checker.ErrStream << "Error evaluating expression '" << Expr
+                      << "': " << R.getErrorMsg() << "\n";
+    return false;
+  }
+
+  std::pair<BinOpToken, StringRef> parseBinOpToken(StringRef Expr) const {
+    if (Expr.empty())
+      return std::make_pair(BinOpToken::Invalid, "");
+
+    // Handle the two 2-character tokens.
+    if (Expr.startswith("<<"))
+      return std::make_pair(BinOpToken::ShiftLeft, Expr.substr(2).ltrim());
+    if (Expr.startswith(">>"))
+      return std::make_pair(BinOpToken::ShiftRight, Expr.substr(2).ltrim());
+
+    // Handle one-character tokens.
+    BinOpToken Op;
+    switch (Expr[0]) {
+    default:
+      return std::make_pair(BinOpToken::Invalid, Expr);
+    case '+':
+      Op = BinOpToken::Add;
+      break;
+    case '-':
+      Op = BinOpToken::Sub;
+      break;
+    case '&':
+      Op = BinOpToken::BitwiseAnd;
+      break;
+    case '|':
+      Op = BinOpToken::BitwiseOr;
+      break;
+    }
+
+    return std::make_pair(Op, Expr.substr(1).ltrim());
+  }
+
+  EvalResult computeBinOpResult(BinOpToken Op, const EvalResult &LHSResult,
+                                const EvalResult &RHSResult) const {
+    switch (Op) {
+    default:
+      llvm_unreachable("Tried to evaluate unrecognized operation.");
+    case BinOpToken::Add:
+      return EvalResult(LHSResult.getValue() + RHSResult.getValue());
+    case BinOpToken::Sub:
+      return EvalResult(LHSResult.getValue() - RHSResult.getValue());
+    case BinOpToken::BitwiseAnd:
+      return EvalResult(LHSResult.getValue() & RHSResult.getValue());
+    case BinOpToken::BitwiseOr:
+      return EvalResult(LHSResult.getValue() | RHSResult.getValue());
+    case BinOpToken::ShiftLeft:
+      return EvalResult(LHSResult.getValue() << RHSResult.getValue());
+    case BinOpToken::ShiftRight:
+      return EvalResult(LHSResult.getValue() >> RHSResult.getValue());
+    }
+  }
+
+  // Parse a symbol and return a (string, string) pair representing the symbol
+  // name and expression remaining to be parsed.
+  std::pair<StringRef, StringRef> parseSymbol(StringRef Expr) const {
+    size_t FirstNonSymbol = Expr.find_first_not_of("0123456789"
+                                                   "abcdefghijklmnopqrstuvwxyz"
+                                                   "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                                   ":_.$");
+    return std::make_pair(Expr.substr(0, FirstNonSymbol),
+                          Expr.substr(FirstNonSymbol).ltrim());
+  }
+
+  // Evaluate a call to decode_operand. Decode the instruction operand at the
+  // given symbol and get the value of the requested operand.
+  // Returns an error if the instruction cannot be decoded, or the requested
+  // operand is not an immediate.
+  // On success, returns a pair containing the value of the operand, plus
+  // the expression remaining to be evaluated.
+  std::pair<EvalResult, StringRef> evalDecodeOperand(StringRef Expr) const {
+    if (!Expr.startswith("("))
+      return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), "");
+    StringRef RemainingExpr = Expr.substr(1).ltrim();
+    StringRef Symbol;
+    std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr);
+
+    if (!Checker.isSymbolValid(Symbol))
+      return std::make_pair(
+          EvalResult(("Cannot decode unknown symbol '" + Symbol + "'").str()),
+          "");
+
+    if (!RemainingExpr.startswith(","))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, RemainingExpr, "expected ','"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    EvalResult OpIdxExpr;
+    std::tie(OpIdxExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
+    if (OpIdxExpr.hasError())
+      return std::make_pair(OpIdxExpr, "");
+
+    if (!RemainingExpr.startswith(")"))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, RemainingExpr, "expected ')'"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    MCInst Inst;
+    uint64_t Size;
+    if (!decodeInst(Symbol, Inst, Size))
+      return std::make_pair(
+          EvalResult(("Couldn't decode instruction at '" + Symbol + "'").str()),
+          "");
+
+    unsigned OpIdx = OpIdxExpr.getValue();
+    if (OpIdx >= Inst.getNumOperands()) {
+      std::string ErrMsg;
+      raw_string_ostream ErrMsgStream(ErrMsg);
+      ErrMsgStream << "Invalid operand index '" << format("%i", OpIdx)
+                   << "' for instruction '" << Symbol
+                   << "'. Instruction has only "
+                   << format("%i", Inst.getNumOperands())
+                   << " operands.\nInstruction is:\n  ";
+      Inst.dump_pretty(ErrMsgStream,
+                       Checker.Disassembler->getContext().getAsmInfo(),
+                       Checker.InstPrinter);
+      return std::make_pair(EvalResult(ErrMsgStream.str()), "");
+    }
+
+    const MCOperand &Op = Inst.getOperand(OpIdx);
+    if (!Op.isImm()) {
+      std::string ErrMsg;
+      raw_string_ostream ErrMsgStream(ErrMsg);
+      ErrMsgStream << "Operand '" << format("%i", OpIdx) << "' of instruction '"
+                   << Symbol << "' is not an immediate.\nInstruction is:\n  ";
+      Inst.dump_pretty(ErrMsgStream,
+                       Checker.Disassembler->getContext().getAsmInfo(),
+                       Checker.InstPrinter);
+
+      return std::make_pair(EvalResult(ErrMsgStream.str()), "");
+    }
+
+    return std::make_pair(EvalResult(Op.getImm()), RemainingExpr);
+  }
+
+  // Evaluate a call to next_pc.
+  // Decode the instruction at the given symbol and return the following program
+  // counter.
+  // Returns an error if the instruction cannot be decoded.
+  // On success, returns a pair containing the next PC, plus the
+  // expression remaining to be evaluated.
+  std::pair<EvalResult, StringRef> evalNextPC(StringRef Expr,
+                                              ParseContext PCtx) const {
+    if (!Expr.startswith("("))
+      return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), "");
+    StringRef RemainingExpr = Expr.substr(1).ltrim();
+    StringRef Symbol;
+    std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr);
+
+    if (!Checker.isSymbolValid(Symbol))
+      return std::make_pair(
+          EvalResult(("Cannot decode unknown symbol '" + Symbol + "'").str()),
+          "");
+
+    if (!RemainingExpr.startswith(")"))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, RemainingExpr, "expected ')'"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    MCInst Inst;
+    uint64_t InstSize;
+    if (!decodeInst(Symbol, Inst, InstSize))
+      return std::make_pair(
+          EvalResult(("Couldn't decode instruction at '" + Symbol + "'").str()),
+          "");
+
+    uint64_t SymbolAddr = PCtx.IsInsideLoad
+                              ? Checker.getSymbolLinkerAddr(Symbol)
+                              : Checker.getSymbolRemoteAddr(Symbol);
+    uint64_t NextPC = SymbolAddr + InstSize;
+
+    return std::make_pair(EvalResult(NextPC), RemainingExpr);
+  }
+
+  // Evaluate a call to stub_addr.
+  // Look up and return the address of the stub for the given
+  // (<file name>, <section name>, <symbol name>) tuple.
+  // On success, returns a pair containing the stub address, plus the expression
+  // remaining to be evaluated.
+  std::pair<EvalResult, StringRef> evalStubAddr(StringRef Expr,
+                                                ParseContext PCtx) const {
+    if (!Expr.startswith("("))
+      return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), "");
+    StringRef RemainingExpr = Expr.substr(1).ltrim();
+
+    // Handle file-name specially, as it may contain characters that aren't
+    // legal for symbols.
+    StringRef FileName;
+    size_t ComaIdx = RemainingExpr.find(',');
+    FileName = RemainingExpr.substr(0, ComaIdx).rtrim();
+    RemainingExpr = RemainingExpr.substr(ComaIdx).ltrim();
+
+    if (!RemainingExpr.startswith(","))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, Expr, "expected ','"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    StringRef SectionName;
+    std::tie(SectionName, RemainingExpr) = parseSymbol(RemainingExpr);
+
+    if (!RemainingExpr.startswith(","))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, Expr, "expected ','"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    StringRef Symbol;
+    std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr);
+
+    if (!RemainingExpr.startswith(")"))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, Expr, "expected ')'"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    uint64_t StubAddr;
+    std::string ErrorMsg = "";
+    std::tie(StubAddr, ErrorMsg) = Checker.getStubAddrFor(
+        FileName, SectionName, Symbol, PCtx.IsInsideLoad);
+
+    if (ErrorMsg != "")
+      return std::make_pair(EvalResult(ErrorMsg), "");
+
+    return std::make_pair(EvalResult(StubAddr), RemainingExpr);
+  }
+
+  std::pair<EvalResult, StringRef> evalSectionAddr(StringRef Expr,
+                                                   ParseContext PCtx) const {
+    if (!Expr.startswith("("))
+      return std::make_pair(unexpectedToken(Expr, Expr, "expected '('"), "");
+    StringRef RemainingExpr = Expr.substr(1).ltrim();
+
+    // Handle file-name specially, as it may contain characters that aren't
+    // legal for symbols.
+    StringRef FileName;
+    size_t ComaIdx = RemainingExpr.find(',');
+    FileName = RemainingExpr.substr(0, ComaIdx).rtrim();
+    RemainingExpr = RemainingExpr.substr(ComaIdx).ltrim();
+
+    if (!RemainingExpr.startswith(","))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, Expr, "expected ','"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    StringRef SectionName;
+    std::tie(SectionName, RemainingExpr) = parseSymbol(RemainingExpr);
+
+    if (!RemainingExpr.startswith(")"))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, Expr, "expected ')'"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    uint64_t StubAddr;
+    std::string ErrorMsg = "";
+    std::tie(StubAddr, ErrorMsg) = Checker.getSectionAddr(
+        FileName, SectionName, PCtx.IsInsideLoad);
+
+    if (ErrorMsg != "")
+      return std::make_pair(EvalResult(ErrorMsg), "");
+
+    return std::make_pair(EvalResult(StubAddr), RemainingExpr);
+  }
+
+  // Evaluate an identifier expr, which may be a symbol, or a call to one of
+  // the builtin functions: decode_operand, next_pc, stub_addr or section_addr.
+  // Return the result, plus the expression remaining to be parsed.
+  std::pair<EvalResult, StringRef> evalIdentifierExpr(StringRef Expr,
+                                                      ParseContext PCtx) const {
+    StringRef Symbol;
+    StringRef RemainingExpr;
+    std::tie(Symbol, RemainingExpr) = parseSymbol(Expr);
+
+    // Check for builtin function calls.
+    if (Symbol == "decode_operand")
+      return evalDecodeOperand(RemainingExpr);
+    else if (Symbol == "next_pc")
+      return evalNextPC(RemainingExpr, PCtx);
+    else if (Symbol == "stub_addr")
+      return evalStubAddr(RemainingExpr, PCtx);
+    else if (Symbol == "section_addr")
+      return evalSectionAddr(RemainingExpr, PCtx);
+
+    if (!Checker.isSymbolValid(Symbol)) {
+      std::string ErrMsg("No known address for symbol '");
+      ErrMsg += Symbol;
+      ErrMsg += "'";
+      if (Symbol.startswith("L"))
+        ErrMsg += " (this appears to be an assembler local label - "
+                  " perhaps drop the 'L'?)";
+
+      return std::make_pair(EvalResult(ErrMsg), "");
+    }
+
+    // The value for the symbol depends on the context we're evaluating in:
+    // Inside a load this is the address in the linker's memory, outside a
+  // load it's the address in the target process's memory.
+    uint64_t Value = PCtx.IsInsideLoad ? Checker.getSymbolLinkerAddr(Symbol)
+                                       : Checker.getSymbolRemoteAddr(Symbol);
+
+    // Looks like a plain symbol reference.
+    return std::make_pair(EvalResult(Value), RemainingExpr);
+  }
+
+  // Split a leading number (hexadecimal when prefixed with "0x", decimal
+  // otherwise) off the front of Expr. Returns a (string, string) pair holding
+  // the text of the number and the expression remaining to be parsed.
+  std::pair<StringRef, StringRef> parseNumberString(StringRef Expr) const {
+    // For hexadecimal literals the scan starts after the "0x" prefix.
+    const bool IsHex = Expr.startswith("0x");
+    size_t NumberEnd =
+        IsHex ? Expr.find_first_not_of("0123456789abcdefABCDEF", 2)
+              : Expr.find_first_not_of("0123456789");
+
+    // No terminating character found: the number runs to the end of Expr.
+    if (NumberEnd == StringRef::npos)
+      NumberEnd = Expr.size();
+
+    return std::make_pair(Expr.substr(0, NumberEnd), Expr.substr(NumberEnd));
+  }
+
+  // Evaluate a constant numeric expression (hexadecimal or decimal) and
+  // return a pair containing the result, and the expression remaining to be
+  // evaluated.
+  //
+  // An error result (paired with an empty remaining expression) is returned
+  // when Expr does not start with a digit, or when the digits that were
+  // scanned do not form a number that fits in 64 bits (for example a bare
+  // "0x" prefix, or a literal that overflows uint64_t).
+  std::pair<EvalResult, StringRef> evalNumberExpr(StringRef Expr) const {
+    StringRef ValueStr;
+    StringRef RemainingExpr;
+    std::tie(ValueStr, RemainingExpr) = parseNumberString(Expr);
+
+    if (ValueStr.empty() || !isdigit(ValueStr[0]))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, RemainingExpr, "expected number"), "");
+
+    // getAsInteger returns true on failure and leaves its output untouched,
+    // so the status must be checked before Value is used.
+    uint64_t Value = 0;
+    if (ValueStr.getAsInteger(0, Value)) {
+      std::string ErrMsg("Invalid numeric literal '");
+      ErrMsg += ValueStr;
+      ErrMsg += "'";
+      return std::make_pair(EvalResult(ErrMsg), "");
+    }
+    return std::make_pair(EvalResult(Value), RemainingExpr);
+  }
+
+  // Evaluate a parenthesized expression, "(<expr>)". Returns a pair holding
+  // the value of <expr> and the expression that follows the closing ')'.
+  // On error the remaining expression in the returned pair is empty.
+  std::pair<EvalResult, StringRef> evalParensExpr(StringRef Expr,
+                                                  ParseContext PCtx) const {
+    assert(Expr.startswith("(") && "Not a parenthesized expression");
+
+    // Drop the '(' and evaluate everything up to the matching ')'.
+    StringRef Inner = Expr.substr(1).ltrim();
+    std::pair<EvalResult, StringRef> Parsed =
+        evalComplexExpr(evalSimpleExpr(Inner, PCtx), PCtx);
+
+    if (Parsed.first.hasError())
+      return std::make_pair(Parsed.first, "");
+
+    if (!Parsed.second.startswith(")"))
+      return std::make_pair(
+          unexpectedToken(Parsed.second, Expr, "expected ')'"), "");
+
+    // Consume the ')' and any whitespace after it.
+    return std::make_pair(Parsed.first, Parsed.second.substr(1).ltrim());
+  }
+
+  // Evaluate a load (dereference) expression of the form:
+  //   *{<number>}<expr>
+  // <number> is the size of the read and must be between 1 and 8; <expr> is
+  // evaluated in a load context (ParseContext constructed with 'true') and
+  // its value is the address handed to Checker.readMemoryAtAddr.
+  // Return a pair containing the result, plus the expression remaining to be
+  // parsed.
+  std::pair<EvalResult, StringRef> evalLoadExpr(StringRef Expr) const {
+    assert(Expr.startswith("*") && "Not a load expression");
+    StringRef RemainingExpr = Expr.substr(1).ltrim();
+
+    // Parse read size.
+    if (!RemainingExpr.startswith("{"))
+      return std::make_pair(EvalResult("Expected '{' following '*'."), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+    EvalResult ReadSizeExpr;
+    std::tie(ReadSizeExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
+    if (ReadSizeExpr.hasError())
+      return std::make_pair(ReadSizeExpr, RemainingExpr);
+    uint64_t ReadSize = ReadSizeExpr.getValue();
+    // The loaded value is returned in a uint64_t, so at most 8 can be read.
+    if (ReadSize < 1 || ReadSize > 8)
+      return std::make_pair(EvalResult("Invalid size for dereference."), "");
+    if (!RemainingExpr.startswith("}"))
+      return std::make_pair(EvalResult("Missing '}' for dereference."), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    // Evaluate the expression representing the load address.
+    ParseContext LoadCtx(true);
+    EvalResult LoadAddrExprResult;
+    std::tie(LoadAddrExprResult, RemainingExpr) =
+        evalComplexExpr(evalSimpleExpr(RemainingExpr, LoadCtx), LoadCtx);
+
+    if (LoadAddrExprResult.hasError())
+      return std::make_pair(LoadAddrExprResult, "");
+
+    uint64_t LoadAddr = LoadAddrExprResult.getValue();
+
+    // Perform the read and wrap the loaded value as the result.
+    return std::make_pair(
+        EvalResult(Checker.readMemoryAtAddr(LoadAddr, ReadSize)),
+        RemainingExpr);
+  }
+
+  // Evaluate a "simple" expression: a parenthesized expression, a load, an
+  // identifier (symbol or builtin call) or a number -- i.e. any expression
+  // that _isn't_ an un-parenthesized binary expression.
+  //
+  // A "simple" expression may be followed by a bit-slice, which is then
+  // applied to its value. See evalSliceExpr.
+  //
+  // Returns a pair containing the result of the evaluation, plus the
+  // expression remaining to be parsed.
+  std::pair<EvalResult, StringRef> evalSimpleExpr(StringRef Expr,
+                                                  ParseContext PCtx) const {
+    if (Expr.empty())
+      return std::make_pair(EvalResult("Unexpected end of expression"), "");
+
+    // The first character decides which kind of sub-expression this is.
+    const char Lead = Expr[0];
+    std::pair<EvalResult, StringRef> Parsed;
+    if (Lead == '(')
+      Parsed = evalParensExpr(Expr, PCtx);
+    else if (Lead == '*')
+      Parsed = evalLoadExpr(Expr);
+    else if (isalpha(Lead) || Lead == '_')
+      Parsed = evalIdentifierExpr(Expr, PCtx);
+    else if (isdigit(Lead))
+      Parsed = evalNumberExpr(Expr);
+    else
+      return std::make_pair(
+          unexpectedToken(Expr, Expr,
+                          "expected '(', '*', identifier, or number"), "");
+
+    // Errors from the sub-expression are passed through unchanged.
+    if (Parsed.first.hasError())
+      return Parsed;
+
+    // Apply a trailing bit-slice if one is present.
+    if (Parsed.second.startswith("["))
+      Parsed = evalSliceExpr(Parsed);
+
+    return Parsed;
+  }
+
+  // Evaluate a bit-slice of an expression.
+  // A bit-slice has the form "<expr>[high:low]". The result of evaluating a
+  // slice is the bits between high and low (inclusive) in the original
+  // expression, right shifted so that the "low" bit is in position 0 in the
+  // result.
+  // Ctx holds the already evaluated <expr> and the remaining expression,
+  // which must start with '['.
+  // Returns a pair containing the result of the slice operation, plus the
+  // expression remaining to be parsed. An error result is returned if the
+  // slice is malformed or if the bit range is not 63 >= high >= low.
+  std::pair<EvalResult, StringRef>
+  evalSliceExpr(std::pair<EvalResult, StringRef> Ctx) const {
+    EvalResult SubExprResult;
+    StringRef RemainingExpr;
+    std::tie(SubExprResult, RemainingExpr) = Ctx;
+
+    assert(RemainingExpr.startswith("[") && "Not a slice expr.");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    EvalResult HighBitExpr;
+    std::tie(HighBitExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
+
+    if (HighBitExpr.hasError())
+      return std::make_pair(HighBitExpr, RemainingExpr);
+
+    if (!RemainingExpr.startswith(":"))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, RemainingExpr, "expected ':'"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    EvalResult LowBitExpr;
+    std::tie(LowBitExpr, RemainingExpr) = evalNumberExpr(RemainingExpr);
+
+    if (LowBitExpr.hasError())
+      return std::make_pair(LowBitExpr, RemainingExpr);
+
+    if (!RemainingExpr.startswith("]"))
+      return std::make_pair(
+          unexpectedToken(RemainingExpr, RemainingExpr, "expected ']'"), "");
+    RemainingExpr = RemainingExpr.substr(1).ltrim();
+
+    // Keep the full 64-bit values so that out-of-range bit indices are
+    // rejected rather than silently truncated.
+    uint64_t HighBit = HighBitExpr.getValue();
+    uint64_t LowBit = LowBitExpr.getValue();
+    if (HighBit > 63 || LowBit > HighBit) {
+      std::string ErrMsg(
+          "Invalid bit-slice range: expected 63 >= high >= low.");
+      return std::make_pair(EvalResult(ErrMsg), "");
+    }
+
+    // Width is in [1, 64]. Shifting a 64-bit value by 64 is undefined, so the
+    // full-width mask is produced directly.
+    unsigned Width = static_cast<unsigned>(HighBit - LowBit) + 1;
+    uint64_t Mask =
+        (Width == 64) ? ~(uint64_t)0 : (((uint64_t)1 << Width) - 1);
+    uint64_t SlicedValue = (SubExprResult.getValue() >> LowBit) & Mask;
+    return std::make_pair(EvalResult(SlicedValue), RemainingExpr);
+  }
+
+  // Evaluate a "complex" expression.
+  // Starts from an already evaluated subexpression and, for as long as a
+  // binary operator follows, evaluates the next simple expression and folds
+  // it into the running result. Folding from the left as operators are
+  // encountered makes arithmetic expressions left-associative.
+  // Returns a pair containing the ultimate result of evaluating the
+  // expression, plus the expression remaining to be evaluated.
+  std::pair<EvalResult, StringRef>
+  evalComplexExpr(std::pair<EvalResult, StringRef> LHSAndRemaining,
+                  ParseContext PCtx) const {
+    EvalResult Accum = LHSAndRemaining.first;
+    StringRef Rest = LHSAndRemaining.second;
+
+    for (;;) {
+      // Stop on error, or when there is nothing left to evaluate.
+      if (Accum.hasError() || Rest == "")
+        return std::make_pair(Accum, Rest);
+
+      // Otherwise check whether a binary operator follows.
+      BinOpToken Op;
+      std::tie(Op, Rest) = parseBinOpToken(Rest);
+
+      // Not a recognized operator: the expression ends here.
+      if (Op == BinOpToken::Invalid)
+        return std::make_pair(Accum, Rest);
+
+      // Recognized operator: evaluate its right-hand side.
+      EvalResult RHS;
+      std::tie(RHS, Rest) = evalSimpleExpr(Rest, PCtx);
+
+      // An error in the right-hand side becomes the overall result.
+      if (RHS.hasError())
+        return std::make_pair(RHS, Rest);
+
+      // Fold the operation into the running result and keep going.
+      Accum = EvalResult(computeBinOpResult(Op, Accum, RHS));
+    }
+  }
+
+  // Disassemble the instruction found at the start of the memory returned by
+  // Checker.getSubsectionStartingAt(Symbol). Inst and Size are filled in by
+  // the disassembler. Returns true only if decoding reported Success.
+  bool decodeInst(StringRef Symbol, MCInst &Inst, uint64_t &Size) const {
+    StringRef Bytes = Checker.getSubsectionStartingAt(Symbol);
+    const uint8_t *Begin = reinterpret_cast<const uint8_t *>(Bytes.data());
+    ArrayRef<uint8_t> SectionBytes(Begin, Bytes.size());
+
+    // Decode at address 0, discarding any disassembler output.
+    return Checker.Disassembler->getInstruction(Inst, Size, SectionBytes, 0,
+                                                nulls(), nulls()) ==
+           MCDisassembler::Success;
+  }
+};
 }
 
-bool RuntimeDyldChecker::check(StringRef CheckExpr) const {
+// Construct a checker implementation over RTDyld. The disassembler, the
+// instruction printer and the error stream are stored for later use, and the
+// new object registers itself as RTDyld's Checker.
+RuntimeDyldCheckerImpl::RuntimeDyldCheckerImpl(RuntimeDyld &RTDyld,
+                                               MCDisassembler *Disassembler,
+                                               MCInstPrinter *InstPrinter,
+                                               raw_ostream &ErrStream)
+    : RTDyld(RTDyld), Disassembler(Disassembler), InstPrinter(InstPrinter),
+      ErrStream(ErrStream) {
+  RTDyld.Checker = this;
+}
+
+// Evaluate a single check expression. The expression is trimmed and handed
+// to a RuntimeDyldCheckerExprEval constructed over this checker and
+// ErrStream; the evaluator's verdict is returned. The expression and the
+// outcome are also written to the debug stream.
+bool RuntimeDyldCheckerImpl::check(StringRef CheckExpr) const {
   CheckExpr = CheckExpr.trim();
-  DEBUG(llvm::dbgs() << "RuntimeDyldChecker: Checking '" << CheckExpr
-                     << "'...\n");
+  DEBUG(dbgs() << "RuntimeDyldChecker: Checking '" << CheckExpr << "'...\n");
   RuntimeDyldCheckerExprEval P(*this, ErrStream);
   bool Result = P.evaluate(CheckExpr);
   (void)Result;
-  DEBUG(llvm::dbgs() << "RuntimeDyldChecker: '" << CheckExpr << "' "
-                     << (Result ? "passed" : "FAILED") << ".\n");
+  DEBUG(dbgs() << "RuntimeDyldChecker: '" << CheckExpr << "' "
+               << (Result ? "passed" : "FAILED") << ".\n");
   return Result;
 }
 
-bool RuntimeDyldChecker::checkAllRulesInBuffer(StringRef RulePrefix,
-                                               MemoryBuffer* MemBuf) const {
+bool RuntimeDyldCheckerImpl::checkAllRulesInBuffer(StringRef RulePrefix,
+                                                   MemoryBuffer *MemBuf) const {
   bool DidAllTestsPass = true;
   unsigned NumRules = 0;
 
   const char *LineStart = MemBuf->getBufferStart();
 
   // Eat whitespace.
-  while (LineStart != MemBuf->getBufferEnd() &&
-         std::isspace(*LineStart))
+  while (LineStart != MemBuf->getBufferEnd() && std::isspace(*LineStart))
     ++LineStart;
 
   while (LineStart != MemBuf->getBufferEnd() && *LineStart != '\0') {
     const char *LineEnd = LineStart;
-    while (LineEnd != MemBuf->getBufferEnd() &&
-           *LineEnd != '\r' && *LineEnd != '\n')
+    while (LineEnd != MemBuf->getBufferEnd() && *LineEnd != '\r' &&
+           *LineEnd != '\n')
       ++LineEnd;
 
     StringRef Line(LineStart, LineEnd - LineStart);
@@ -605,37 +724,210 @@
 
     // Eat whitespace.
     LineStart = LineEnd;
-    while (LineStart != MemBuf->getBufferEnd() &&
-           std::isspace(*LineStart))
+    while (LineStart != MemBuf->getBufferEnd() && std::isspace(*LineStart))
       ++LineStart;
   }
   return DidAllTestsPass && (NumRules != 0);
 }
 
-bool RuntimeDyldChecker::checkSymbolIsValidForLoad(StringRef Symbol) const {
-  return RTDyld.getSymbolAddress(Symbol) != nullptr;
+// Returns true if the linker reports a non-null address for Symbol.
+bool RuntimeDyldCheckerImpl::isSymbolValid(StringRef Symbol) const {
+  return getRTDyld().getSymbolAddress(Symbol) != nullptr;
 }
 
-uint64_t RuntimeDyldChecker::getSymbolAddress(StringRef Symbol) const {
-  return RTDyld.getAnySymbolRemoteAddress(Symbol);
+// Returns the pointer the linker reports for Symbol (getSymbolAddress),
+// widened to a 64-bit integer.
+uint64_t RuntimeDyldCheckerImpl::getSymbolLinkerAddr(StringRef Symbol) const {
+  return static_cast<uint64_t>(
+      reinterpret_cast<uintptr_t>(getRTDyld().getSymbolAddress(Symbol)));
 }
 
-uint64_t RuntimeDyldChecker::readMemoryAtSymbol(StringRef Symbol,
-                                                int64_t Offset,
-                                                unsigned Size) const {
-  uint8_t *Src = RTDyld.getSymbolAddress(Symbol);
-  uint64_t Result = 0;
-  memcpy(&Result, Src + Offset, Size);
-  return Result;
+// Returns the linker's getAnySymbolRemoteAddress value for Symbol.
+uint64_t RuntimeDyldCheckerImpl::getSymbolRemoteAddr(StringRef Symbol) const {
+  return getRTDyld().getAnySymbolRemoteAddress(Symbol);
 }
 
-StringRef RuntimeDyldChecker::getSubsectionStartingAt(StringRef Name) const {
+// Read Size bytes starting at SrcAddr, which is treated as a pointer into
+// this process's memory, and return them as an integer. The read itself is
+// delegated to the linker's readBytesUnaligned.
+uint64_t RuntimeDyldCheckerImpl::readMemoryAtAddr(uint64_t SrcAddr,
+                                                  unsigned Size) const {
+  // Guard against a 64-bit address that does not fit in a host pointer.
+  uintptr_t PtrSizedAddr = static_cast<uintptr_t>(SrcAddr);
+  assert(PtrSizedAddr == SrcAddr && "Linker memory pointer out-of-range.");
+  uint8_t *Src = reinterpret_cast<uint8_t*>(PtrSizedAddr);
+  return getRTDyld().readBytesUnaligned(Src, Size);
+}
+
+
+// Look up the SectionAddressInfo registered for SectionName within FileName.
+// Returns (info, "") on success. On failure returns (nullptr, message),
+// where the message says whether the file or the section was missing and,
+// for a missing file, lists the files that are registered.
+std::pair<const RuntimeDyldCheckerImpl::SectionAddressInfo*, std::string>
+RuntimeDyldCheckerImpl::findSectionAddrInfo(StringRef FileName,
+                                            StringRef SectionName) const {
+  typedef std::pair<const SectionAddressInfo *, std::string> LookupResult;
+
+  auto FileItr = Stubs.find(FileName);
+  if (FileItr != Stubs.end()) {
+    // The file is known; now look for the section inside it.
+    const SectionMap &FileSections = FileItr->second;
+    auto SectionItr = FileSections.find(SectionName);
+    if (SectionItr != FileSections.end())
+      return LookupResult(&SectionItr->second, std::string(""));
+
+    return LookupResult(nullptr,
+                        ("Section '" + SectionName + "' not found in file '" +
+                         FileName + "'\n").str());
+  }
+
+  // Unknown file: build a message naming the files that are available.
+  std::string ErrorMsg = "File '";
+  ErrorMsg += FileName;
+  ErrorMsg += "' not found. ";
+  if (Stubs.empty()) {
+    ErrorMsg += "No stubs registered.";
+  } else {
+    ErrorMsg += "Available files are:";
+    for (const auto &KnownFile : Stubs) {
+      ErrorMsg += " '";
+      ErrorMsg += KnownFile.first;
+      ErrorMsg += "'";
+    }
+  }
+  ErrorMsg += "\n";
+  return LookupResult(nullptr, ErrorMsg);
+}
+
+// Return the address of section SectionName of file FileName. When
+// IsInsideLoad is true this is the section's Address pointer converted to an
+// integer; otherwise it is the section's LoadAddress. On lookup failure the
+// result is (0, error message); on success the message is empty.
+std::pair<uint64_t, std::string> RuntimeDyldCheckerImpl::getSectionAddr(
+    StringRef FileName, StringRef SectionName, bool IsInsideLoad) const {
+
+  std::pair<const SectionAddressInfo *, std::string> Lookup =
+      findSectionAddrInfo(FileName, SectionName);
+  if (Lookup.second != "")
+    return std::make_pair(static_cast<uint64_t>(0), Lookup.second);
+
+  unsigned SectionID = Lookup.first->SectionID;
+  const SectionEntry &Section = getRTDyld().Sections[SectionID];
+
+  uint64_t Addr =
+      IsInsideLoad
+          ? static_cast<uint64_t>(reinterpret_cast<uintptr_t>(Section.Address))
+          : Section.LoadAddress;
+
+  return std::make_pair(Addr, std::string(""));
+}
+
+// Return the address of the stub registered for SymbolName in section
+// SectionName of file FileName: the section base plus the recorded stub
+// offset. The base is the section's Address pointer when IsInsideLoad is
+// true and its LoadAddress otherwise. On failure the result is
+// (0, error message); on success the message is empty.
+std::pair<uint64_t, std::string> RuntimeDyldCheckerImpl::getStubAddrFor(
+    StringRef FileName, StringRef SectionName, StringRef SymbolName,
+    bool IsInsideLoad) const {
+
+  std::pair<const SectionAddressInfo *, std::string> Lookup =
+      findSectionAddrInfo(FileName, SectionName);
+  if (Lookup.second != "")
+    return std::make_pair(static_cast<uint64_t>(0), Lookup.second);
+
+  unsigned SectionID = Lookup.first->SectionID;
+  const StubOffsetsMap &Offsets = Lookup.first->StubOffsets;
+
+  auto OffsetItr = Offsets.find(SymbolName);
+  if (OffsetItr == Offsets.end())
+    return std::make_pair(static_cast<uint64_t>(0),
+                          ("Stub for symbol '" + SymbolName + "' not found. "
+                           "If '" + SymbolName + "' is an internal symbol this "
+                           "may indicate that the stub target offset is being "
+                           "computed incorrectly.\n").str());
+
+  const SectionEntry &Section = getRTDyld().Sections[SectionID];
+  uint64_t SectionBase =
+      IsInsideLoad
+          ? static_cast<uint64_t>(reinterpret_cast<uintptr_t>(Section.Address))
+          : Section.LoadAddress;
+  uint64_t Addr = SectionBase + OffsetItr->second;
+
+  return std::make_pair(Addr, std::string(""));
+}
+
+// Return the memory of the section containing symbol Name, starting at the
+// symbol's offset and running to the end of the section. Returns an empty
+// StringRef if Name is not in the linker's global symbol table.
+StringRef
+RuntimeDyldCheckerImpl::getSubsectionStartingAt(StringRef Name) const {
   RuntimeDyldImpl::SymbolTableMap::const_iterator pos =
-    RTDyld.GlobalSymbolTable.find(Name);
-  if (pos == RTDyld.GlobalSymbolTable.end())
+      getRTDyld().GlobalSymbolTable.find(Name);
+  if (pos == getRTDyld().GlobalSymbolTable.end())
     return StringRef();
   RuntimeDyldImpl::SymbolLoc Loc = pos->second;
-  uint8_t *SectionAddr = RTDyld.getSectionAddress(Loc.first);
-  return StringRef(reinterpret_cast<const char*>(SectionAddr) + Loc.second,
-                   RTDyld.Sections[Loc.first].Size - Loc.second);
+  uint8_t *SectionAddr = getRTDyld().getSectionAddress(Loc.first);
+  return StringRef(reinterpret_cast<const char *>(SectionAddr) + Loc.second,
+                   getRTDyld().Sections[Loc.first].Size - Loc.second);
+}
+
+// Record SectionID under (file name, section name) in Stubs. The file name
+// is the filename component of FilePath and the section name is taken from
+// the linker's section table.
+void RuntimeDyldCheckerImpl::registerSection(
+    StringRef FilePath, unsigned SectionID) {
+  StringRef FileName = sys::path::filename(FilePath);
+  StringRef SectionName = getRTDyld().Sections[SectionID].Name;
+
+  // operator[] creates the file and section entries on first use.
+  SectionAddressInfo &Info = Stubs[FileName][SectionName];
+  Info.SectionID = SectionID;
+}
+
+// Record SectionID and the stub offsets from RTDyldStubs under
+// (file name, section name) in Stubs. Each stub is keyed by symbol name:
+// either the name carried by the stub-map key or, for a (section, offset)
+// key, the name found by a reverse lookup in the linker's global symbol
+// table. Stubs for which no name can be found are not recorded.
+void RuntimeDyldCheckerImpl::registerStubMap(
+    StringRef FilePath, unsigned SectionID,
+    const RuntimeDyldImpl::StubMap &RTDyldStubs) {
+  StringRef FileName = sys::path::filename(FilePath);
+  StringRef SectionName = getRTDyld().Sections[SectionID].Name;
+
+  // References into a std::map stay valid across later insertions, so the
+  // entry can be looked up once and reused for every stub.
+  SectionAddressInfo &Info = Stubs[FileName][SectionName];
+  Info.SectionID = SectionID;
+
+  for (auto &Stub : RTDyldStubs) {
+    std::string StubSymbol = "";
+
+    if (Stub.first.SymbolName) {
+      StubSymbol = Stub.first.SymbolName;
+    } else {
+      // If this is a (Section, Offset) pair, do a reverse lookup in the
+      // global symbol table to find the name.
+      for (auto &Global : getRTDyld().GlobalSymbolTable) {
+        const bool SameSection = Global.second.first == Stub.first.SectionID;
+        if (SameSection &&
+            Global.second.second == static_cast<uint64_t>(Stub.first.Offset)) {
+          StubSymbol = Global.first();
+          break;
+        }
+      }
+    }
+
+    if (StubSymbol != "")
+      Info.StubOffsets[StubSymbol] = Stub.second;
+  }
+}
+
+// Construct the public checker by creating the RuntimeDyldCheckerImpl that
+// it forwards to; all four arguments are passed through to the Impl.
+RuntimeDyldChecker::RuntimeDyldChecker(RuntimeDyld &RTDyld,
+                                       MCDisassembler *Disassembler,
+                                       MCInstPrinter *InstPrinter,
+                                       raw_ostream &ErrStream)
+    : Impl(make_unique<RuntimeDyldCheckerImpl>(RTDyld, Disassembler,
+                                               InstPrinter, ErrStream)) {}
+
+// Empty destructor, defined out of line.
+// NOTE(review): presumably placed here so that RuntimeDyldCheckerImpl is a
+// complete type where Impl is destroyed -- confirm against the header.
+RuntimeDyldChecker::~RuntimeDyldChecker() {}
+
+// Returns the RuntimeDyld instance held by the implementation object.
+RuntimeDyld& RuntimeDyldChecker::getRTDyld() {
+  return Impl->RTDyld;
+}
+
+// Const overload: returns the RuntimeDyld instance held by the
+// implementation object as a const reference.
+const RuntimeDyld& RuntimeDyldChecker::getRTDyld() const {
+  return Impl->RTDyld;
+}
+
+// Evaluate a single check expression; forwards to the implementation.
+bool RuntimeDyldChecker::check(StringRef CheckExpr) const {
+  return Impl->check(CheckExpr);
+}
+
+// Forwards RulePrefix and MemBuf to the implementation's
+// checkAllRulesInBuffer and returns its result.
+bool RuntimeDyldChecker::checkAllRulesInBuffer(StringRef RulePrefix,
+                                               MemoryBuffer *MemBuf) const {
+  return Impl->checkAllRulesInBuffer(RulePrefix, MemBuf);
+}
+
+// Forwards to the implementation's getSectionAddr. LinkerAddress is passed
+// as that function's IsInsideLoad argument. Returns the (address, error
+// message) pair produced by the implementation.
+std::pair<uint64_t, std::string>
+RuntimeDyldChecker::getSectionAddr(StringRef FileName, StringRef SectionName,
+                                   bool LinkerAddress) {
+  return Impl->getSectionAddr(FileName, SectionName, LinkerAddress);
 }

diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h
new file mode 100644
index 0000000..de20c1e
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h

@@ -0,0 +1,76 @@
+//===-- RuntimeDyldCheckerImpl.h -- RuntimeDyld test framework --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDCHECKERIMPL_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDCHECKERIMPL_H
+
+#include "RuntimeDyldImpl.h"
+#include <set>
+
+namespace llvm {
+
+// Implementation object behind RuntimeDyldChecker. It holds the linker, the
+// disassembler, the instruction printer and the error stream, plus a table
+// of registered sections and stubs keyed by file name and section name.
+class RuntimeDyldCheckerImpl {
+  // These classes use the private query and registration members below.
+  friend class RuntimeDyldChecker;
+  friend class RuntimeDyldImpl;
+  friend class RuntimeDyldCheckerExprEval;
+
+public:
+  RuntimeDyldCheckerImpl(RuntimeDyld &RTDyld, MCDisassembler *Disassembler,
+                         MCInstPrinter *InstPrinter,
+                         llvm::raw_ostream &ErrStream);
+
+  // Evaluate one check expression / every rule found in MemBuf.
+  bool check(StringRef CheckExpr) const;
+  bool checkAllRulesInBuffer(StringRef RulePrefix, MemoryBuffer *MemBuf) const;
+
+private:
+
+  // StubMap typedefs.
+  // StubOffsetsMap: symbol name -> stub offset.
+  typedef std::map<std::string, uint64_t> StubOffsetsMap;
+  // Per-section record: the section's ID and the stubs registered for it.
+  struct SectionAddressInfo {
+    uint64_t SectionID;
+    StubOffsetsMap StubOffsets;
+  };
+  // SectionMap: section name -> SectionAddressInfo.
+  typedef std::map<std::string, SectionAddressInfo> SectionMap;
+  // StubMap: file name -> SectionMap.
+  typedef std::map<std::string, SectionMap> StubMap;
+
+  // Access the RuntimeDyldImpl behind the public RuntimeDyld object.
+  RuntimeDyldImpl &getRTDyld() const { return *RTDyld.Dyld; }
+
+  // Symbol and memory queries.
+  bool isSymbolValid(StringRef Symbol) const;
+  uint64_t getSymbolLinkerAddr(StringRef Symbol) const;
+  uint64_t getSymbolRemoteAddr(StringRef Symbol) const;
+  uint64_t readMemoryAtAddr(uint64_t Addr, unsigned Size) const;
+
+  // Look up the record for (FileName, SectionName). The string in the
+  // returned pair is an error message, empty on success.
+  std::pair<const SectionAddressInfo*, std::string> findSectionAddrInfo(
+                                                   StringRef FileName,
+                                                   StringRef SectionName) const;
+
+  // Section and stub address queries. The string in the returned pair is an
+  // error message, empty on success.
+  std::pair<uint64_t, std::string> getSectionAddr(StringRef FileName,
+                                                  StringRef SectionName,
+                                                  bool IsInsideLoad) const;
+
+  std::pair<uint64_t, std::string> getStubAddrFor(StringRef FileName,
+                                                  StringRef SectionName,
+                                                  StringRef Symbol,
+                                                  bool IsInsideLoad) const;
+  StringRef getSubsectionStartingAt(StringRef Name) const;
+
+  // Registration of sections and their stubs into Stubs.
+  void registerSection(StringRef FilePath, unsigned SectionID);
+  void registerStubMap(StringRef FilePath, unsigned SectionID,
+                       const RuntimeDyldImpl::StubMap &RTDyldStubs);
+
+  RuntimeDyld &RTDyld;
+  MCDisassembler *Disassembler;
+  MCInstPrinter *InstPrinter;
+  llvm::raw_ostream &ErrStream;
+
+  // Registered sections and stubs, keyed by file name then section name.
+  StubMap Stubs;
+};
+}
+
+#endif

diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 80e489c..d95cffe 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp

@@ -23,6 +23,7 @@
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/MemoryBuffer.h"
 
 using namespace llvm;
@@ -55,9 +56,9 @@
 
 public:
   DyldELFObject(std::unique_ptr<ObjectFile> UnderlyingFile,
-                std::unique_ptr<MemoryBuffer> Wrapper, std::error_code &ec);
+                MemoryBufferRef Wrapper, std::error_code &ec);
 
-  DyldELFObject(std::unique_ptr<MemoryBuffer> Wrapper, std::error_code &ec);
+  DyldELFObject(MemoryBufferRef Wrapper, std::error_code &ec);
 
   void updateSectionAddress(const SectionRef &Sec, uint64_t Addr);
   void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr);
@@ -76,8 +77,10 @@
   bool Registered;
 
 public:
-  ELFObjectImage(ObjectBuffer *Input, std::unique_ptr<DyldELFObject<ELFT>> Obj)
-      : ObjectImageCommon(Input, std::move(Obj)), Registered(false) {}
+  ELFObjectImage(std::unique_ptr<ObjectBuffer> Input,
+                 std::unique_ptr<DyldELFObject<ELFT>> Obj)
+      : ObjectImageCommon(std::move(Input), std::move(Obj)), Registered(false) {
+  }
 
   virtual ~ELFObjectImage() {
     if (Registered)
@@ -109,17 +112,15 @@
 // actual memory.  Ultimately, the Binary parent class will take ownership of
 // this MemoryBuffer object but not the underlying memory.
 template <class ELFT>
-DyldELFObject<ELFT>::DyldELFObject(std::unique_ptr<MemoryBuffer> Wrapper,
-                                   std::error_code &EC)
-    : ELFObjectFile<ELFT>(std::move(Wrapper), EC) {
+DyldELFObject<ELFT>::DyldELFObject(MemoryBufferRef Wrapper, std::error_code &EC)
+    : ELFObjectFile<ELFT>(Wrapper, EC) {
   this->isDyldELFObject = true;
 }
 
 template <class ELFT>
 DyldELFObject<ELFT>::DyldELFObject(std::unique_ptr<ObjectFile> UnderlyingFile,
-                                   std::unique_ptr<MemoryBuffer> Wrapper,
-                                   std::error_code &EC)
-    : ELFObjectFile<ELFT>(std::move(Wrapper), EC),
+                                   MemoryBufferRef Wrapper, std::error_code &EC)
+    : ELFObjectFile<ELFT>(Wrapper, EC),
       UnderlyingFile(std::move(UnderlyingFile)) {
   this->isDyldELFObject = true;
 }
@@ -185,36 +186,36 @@
     return nullptr;
 
   std::error_code ec;
-  std::unique_ptr<MemoryBuffer> Buffer(
-      MemoryBuffer::getMemBuffer(ObjFile->getData(), "", false));
+  MemoryBufferRef Buffer = ObjFile->getMemoryBufferRef();
 
   if (ObjFile->getBytesInAddress() == 4 && ObjFile->isLittleEndian()) {
     auto Obj =
         llvm::make_unique<DyldELFObject<ELFType<support::little, 2, false>>>(
-            std::move(ObjFile), std::move(Buffer), ec);
+            std::move(ObjFile), Buffer, ec);
     return new ELFObjectImage<ELFType<support::little, 2, false>>(
         nullptr, std::move(Obj));
   } else if (ObjFile->getBytesInAddress() == 4 && !ObjFile->isLittleEndian()) {
     auto Obj =
         llvm::make_unique<DyldELFObject<ELFType<support::big, 2, false>>>(
-            std::move(ObjFile), std::move(Buffer), ec);
+            std::move(ObjFile), Buffer, ec);
     return new ELFObjectImage<ELFType<support::big, 2, false>>(nullptr, std::move(Obj));
   } else if (ObjFile->getBytesInAddress() == 8 && !ObjFile->isLittleEndian()) {
     auto Obj = llvm::make_unique<DyldELFObject<ELFType<support::big, 2, true>>>(
-        std::move(ObjFile), std::move(Buffer), ec);
+        std::move(ObjFile), Buffer, ec);
     return new ELFObjectImage<ELFType<support::big, 2, true>>(nullptr,
                                                               std::move(Obj));
   } else if (ObjFile->getBytesInAddress() == 8 && ObjFile->isLittleEndian()) {
     auto Obj =
         llvm::make_unique<DyldELFObject<ELFType<support::little, 2, true>>>(
-            std::move(ObjFile), std::move(Buffer), ec);
+            std::move(ObjFile), Buffer, ec);
     return new ELFObjectImage<ELFType<support::little, 2, true>>(
         nullptr, std::move(Obj));
   } else
     llvm_unreachable("Unexpected ELF format");
 }
 
-ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) {
+std::unique_ptr<ObjectImage>
+RuntimeDyldELF::createObjectImage(std::unique_ptr<ObjectBuffer> Buffer) {
   if (Buffer->getBufferSize() < ELF::EI_NIDENT)
     llvm_unreachable("Unexpected ELF object size");
   std::pair<unsigned char, unsigned char> Ident =
@@ -222,34 +223,36 @@
                      (uint8_t)Buffer->getBufferStart()[ELF::EI_DATA]);
   std::error_code ec;
 
-  std::unique_ptr<MemoryBuffer> Buf(Buffer->getMemBuffer());
+  MemoryBufferRef Buf = Buffer->getMemBuffer();
 
   if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) {
     auto Obj =
         llvm::make_unique<DyldELFObject<ELFType<support::little, 4, false>>>(
-            std::move(Buf), ec);
-    return new ELFObjectImage<ELFType<support::little, 4, false>>(
-        Buffer, std::move(Obj));
-  } else if (Ident.first == ELF::ELFCLASS32 &&
-             Ident.second == ELF::ELFDATA2MSB) {
+            Buf, ec);
+    return llvm::make_unique<
+        ELFObjectImage<ELFType<support::little, 4, false>>>(std::move(Buffer),
+                                                            std::move(Obj));
+  }
+  if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) {
     auto Obj =
-        llvm::make_unique<DyldELFObject<ELFType<support::big, 4, false>>>(
-            std::move(Buf), ec);
-    return new ELFObjectImage<ELFType<support::big, 4, false>>(Buffer,
-                                                               std::move(Obj));
-  } else if (Ident.first == ELF::ELFCLASS64 &&
-             Ident.second == ELF::ELFDATA2MSB) {
+        llvm::make_unique<DyldELFObject<ELFType<support::big, 4, false>>>(Buf,
+                                                                          ec);
+    return llvm::make_unique<ELFObjectImage<ELFType<support::big, 4, false>>>(
+        std::move(Buffer), std::move(Obj));
+  }
+  if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) {
     auto Obj = llvm::make_unique<DyldELFObject<ELFType<support::big, 8, true>>>(
-        std::move(Buf), ec);
-    return new ELFObjectImage<ELFType<support::big, 8, true>>(Buffer, std::move(Obj));
-  } else if (Ident.first == ELF::ELFCLASS64 &&
-             Ident.second == ELF::ELFDATA2LSB) {
-    auto Obj =
-        llvm::make_unique<DyldELFObject<ELFType<support::little, 8, true>>>(
-            std::move(Buf), ec);
-    return new ELFObjectImage<ELFType<support::little, 8, true>>(Buffer, std::move(Obj));
-  } else
-    llvm_unreachable("Unexpected ELF format");
+        Buf, ec);
+    return llvm::make_unique<ELFObjectImage<ELFType<support::big, 8, true>>>(
+        std::move(Buffer), std::move(Obj));
+  }
+  assert(Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB &&
+         "Unexpected ELF format");
+  auto Obj =
+      llvm::make_unique<DyldELFObject<ELFType<support::little, 8, true>>>(Buf,
+                                                                          ec);
+  return llvm::make_unique<ELFObjectImage<ELFType<support::little, 8, true>>>(
+      std::move(Buffer), std::move(Obj));
 }
 
 RuntimeDyldELF::~RuntimeDyldELF() {}
@@ -263,10 +266,9 @@
     llvm_unreachable("Relocation type not implemented yet!");
     break;
   case ELF::R_X86_64_64: {
-    uint64_t *Target = reinterpret_cast<uint64_t *>(Section.Address + Offset);
-    *Target = Value + Addend;
+    support::ulittle64_t::ref(Section.Address + Offset) = Value + Addend;
     DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend)) << " at "
-                 << format("%p\n", Target));
+                 << format("%p\n", Section.Address + Offset));
     break;
   }
   case ELF::R_X86_64_32:
@@ -276,17 +278,15 @@
            (Type == ELF::R_X86_64_32S &&
             ((int64_t)Value <= INT32_MAX && (int64_t)Value >= INT32_MIN)));
     uint32_t TruncatedAddr = (Value & 0xFFFFFFFF);
-    uint32_t *Target = reinterpret_cast<uint32_t *>(Section.Address + Offset);
-    *Target = TruncatedAddr;
+    support::ulittle32_t::ref(Section.Address + Offset) = TruncatedAddr;
     DEBUG(dbgs() << "Writing " << format("%p", TruncatedAddr) << " at "
-                 << format("%p\n", Target));
+                 << format("%p\n", Section.Address + Offset));
     break;
   }
   case ELF::R_X86_64_GOTPCREL: {
     // findGOTEntry returns the 'G + GOT' part of the relocation calculation
     // based on the load/target address of the GOT (not the current/local addr).
     uint64_t GOTAddr = findGOTEntry(Value, SymOffset);
-    uint32_t *Target = reinterpret_cast<uint32_t *>(Section.Address + Offset);
     uint64_t FinalAddress = Section.LoadAddress + Offset;
     // The processRelocationRef method combines the symbol offset and the addend
     // and in most cases that's what we want.  For this relocation type, we need
@@ -294,30 +294,29 @@
     int64_t RealOffset = GOTAddr + Addend - SymOffset - FinalAddress;
     assert(RealOffset <= INT32_MAX && RealOffset >= INT32_MIN);
     int32_t TruncOffset = (RealOffset & 0xFFFFFFFF);
-    *Target = TruncOffset;
+    support::ulittle32_t::ref(Section.Address + Offset) = TruncOffset;
     break;
   }
   case ELF::R_X86_64_PC32: {
     // Get the placeholder value from the generated object since
     // a previous relocation attempt may have overwritten the loaded version
-    uint32_t *Placeholder =
-        reinterpret_cast<uint32_t *>(Section.ObjAddress + Offset);
-    uint32_t *Target = reinterpret_cast<uint32_t *>(Section.Address + Offset);
+    support::ulittle32_t::ref Placeholder(
+        (void *)(Section.ObjAddress + Offset));
     uint64_t FinalAddress = Section.LoadAddress + Offset;
-    int64_t RealOffset = *Placeholder + Value + Addend - FinalAddress;
+    int64_t RealOffset = Placeholder + Value + Addend - FinalAddress;
     assert(RealOffset <= INT32_MAX && RealOffset >= INT32_MIN);
     int32_t TruncOffset = (RealOffset & 0xFFFFFFFF);
-    *Target = TruncOffset;
+    support::ulittle32_t::ref(Section.Address + Offset) = TruncOffset;
     break;
   }
   case ELF::R_X86_64_PC64: {
     // Get the placeholder value from the generated object since
     // a previous relocation attempt may have overwritten the loaded version
-    uint64_t *Placeholder =
-        reinterpret_cast<uint64_t *>(Section.ObjAddress + Offset);
-    uint64_t *Target = reinterpret_cast<uint64_t *>(Section.Address + Offset);
+    support::ulittle64_t::ref Placeholder(
+        (void *)(Section.ObjAddress + Offset));
     uint64_t FinalAddress = Section.LoadAddress + Offset;
-    *Target = *Placeholder + Value + Addend - FinalAddress;
+    support::ulittle64_t::ref(Section.Address + Offset) =
+        Placeholder + Value + Addend - FinalAddress;
     break;
   }
   }
@@ -330,21 +329,20 @@
   case ELF::R_386_32: {
     // Get the placeholder value from the generated object since
     // a previous relocation attempt may have overwritten the loaded version
-    uint32_t *Placeholder =
-        reinterpret_cast<uint32_t *>(Section.ObjAddress + Offset);
-    uint32_t *Target = reinterpret_cast<uint32_t *>(Section.Address + Offset);
-    *Target = *Placeholder + Value + Addend;
+    support::ulittle32_t::ref Placeholder(
+        (void *)(Section.ObjAddress + Offset));
+    support::ulittle32_t::ref(Section.Address + Offset) =
+        Placeholder + Value + Addend;
     break;
   }
   case ELF::R_386_PC32: {
     // Get the placeholder value from the generated object since
     // a previous relocation attempt may have overwritten the loaded version
-    uint32_t *Placeholder =
-        reinterpret_cast<uint32_t *>(Section.ObjAddress + Offset);
-    uint32_t *Target = reinterpret_cast<uint32_t *>(Section.Address + Offset);
+    support::ulittle32_t::ref Placeholder(
+        (void *)(Section.ObjAddress + Offset));
     uint32_t FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF);
-    uint32_t RealOffset = *Placeholder + Value + Addend - FinalAddress;
-    *Target = RealOffset;
+    uint32_t RealOffset = Placeholder + Value + Addend - FinalAddress;
+    support::ulittle32_t::ref(Section.Address + Offset) = RealOffset;
     break;
   }
   default:
@@ -704,8 +702,7 @@
 
       section_iterator tsi(Obj.end_sections());
       check(TargetSymbol->getSection(tsi));
-      bool IsCode = false;
-      tsi->isText(IsCode);
+      bool IsCode = tsi->isText();
       Rel.SectionID = findOrEmitSection(Obj, (*tsi), IsCode, LocalSections);
       Rel.Addend = (intptr_t)Addend;
       return;
@@ -911,8 +908,6 @@
     break;
   case Triple::aarch64:
   case Triple::aarch64_be:
-  case Triple::arm64:
-  case Triple::arm64_be:
     resolveAArch64Relocation(Section, Offset, Value, Type, Addend);
     break;
   case Triple::arm: // Fall through.
@@ -987,9 +982,7 @@
         if (si == Obj.end_sections())
           llvm_unreachable("Symbol section not found, bad object file format!");
         DEBUG(dbgs() << "\t\tThis is section symbol\n");
-        // Default to 'true' in case isText fails (though it never does).
-        bool isCode = true;
-        si->isText(isCode);
+        bool isCode = si->isText();
         Value.SectionID = findOrEmitSection(Obj, (*si), isCode, ObjSectionToID);
         Value.Addend = Addend;
         break;
@@ -1018,8 +1011,7 @@
 
   DEBUG(dbgs() << "\t\tSectionID: " << SectionID << " Offset: " << Offset
                << "\n");
-  if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be ||
-       Arch == Triple::arm64 || Arch == Triple::arm64_be) &&
+  if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be) &&
       (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26)) {
     // This is an AArch64 branch relocation, need to use a stub function.
     DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation.");
@@ -1141,6 +1133,10 @@
     }
   } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
     if (RelType == ELF::R_PPC64_REL24) {
+      // Determine ABI variant in use for this object.
+      unsigned AbiVariant;
+      Obj.getObjectFile()->getPlatformFlags(AbiVariant);
+      AbiVariant &= ELF::EF_PPC64_ABI;
       // A PPC branch relocation will need a stub function if the target is
       // an external symbol (Symbol::ST_Unknown) or if the target address
       // is not within the signed 24-bits branch address.
@@ -1148,10 +1144,18 @@
       uint8_t *Target = Section.Address + Offset;
       bool RangeOverflow = false;
       if (SymType != SymbolRef::ST_Unknown) {
-        // A function call may points to the .opd entry, so the final symbol
-        // value
-        // in calculated based in the relocation values in .opd section.
-        findOPDEntrySection(Obj, ObjSectionToID, Value);
+        if (AbiVariant != 2) {
+          // In the ELFv1 ABI, a function call may point to the .opd entry,
+          // so the final symbol value is calculated based on the relocation
+          // values in the .opd section.
+          findOPDEntrySection(Obj, ObjSectionToID, Value);
+        } else {
+          // In the ELFv2 ABI, a function symbol may provide a local entry
+          // point, which must be used for direct calls.
+          uint8_t SymOther;
+          Symbol->getOther(SymOther);
+          Value.Addend += ELF::decodePPC64LocalEntryOffset(SymOther);
+        }
         uint8_t *RelocTarget = Sections[Value.SectionID].Address + Value.Addend;
         int32_t delta = static_cast<int32_t>(Target - RelocTarget);
         // If it is within 24-bits branch range, just set the branch target
@@ -1179,7 +1183,8 @@
           DEBUG(dbgs() << " Create a new stub function\n");
           Stubs[Value] = Section.StubOffset;
           uint8_t *StubTargetAddr =
-              createStubFunction(Section.Address + Section.StubOffset);
+              createStubFunction(Section.Address + Section.StubOffset,
+                                 AbiVariant);
           RelocationEntry RE(SectionID, StubTargetAddr - Section.Address,
                              ELF::R_PPC64_ADDR64, Value.Addend);
 
@@ -1217,9 +1222,13 @@
                             RelType, 0);
           Section.StubOffset += getMaxStubSize();
         }
-        if (SymType == SymbolRef::ST_Unknown)
+        if (SymType == SymbolRef::ST_Unknown) {
           // Restore the TOC for external calls
-          writeInt32BE(Target + 4, 0xE8410028); // ld r2,40(r1)
+          if (AbiVariant == 2)
+            writeInt32BE(Target + 4, 0xE8410018); // ld r2,24(r1)
+          else
+            writeInt32BE(Target + 4, 0xE8410028); // ld r2,40(r1)
+        }
       }
     } else if (RelType == ELF::R_PPC64_TOC16 ||
                RelType == ELF::R_PPC64_TOC16_DS ||
@@ -1306,7 +1315,7 @@
       Stubs[Value] = StubOffset;
       createStubFunction((uint8_t *)StubAddress);
       RelocationEntry RE(SectionID, StubOffset + 8, ELF::R_390_64,
-                         Value.Addend - Addend);
+                         Value.Offset);
       if (Value.SymbolName)
         addRelocationForSymbol(RE, Value.SymbolName);
       else
@@ -1414,8 +1423,6 @@
   case Triple::x86_64:
   case Triple::aarch64:
   case Triple::aarch64_be:
-  case Triple::arm64:
-  case Triple::arm64_be:
   case Triple::ppc64:
   case Triple::ppc64le:
   case Triple::systemz:

diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 59fdfbe..4aeab81 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_RUNTIME_DYLD_ELF_H
-#define LLVM_RUNTIME_DYLD_ELF_H
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDELF_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDELF_H
 
 #include "RuntimeDyldImpl.h"
 #include "llvm/ADT/DenseMap.h"
@@ -58,8 +58,7 @@
                                 uint64_t Value, uint32_t Type, int64_t Addend);
 
   unsigned getMaxStubSize() override {
-    if (Arch == Triple::aarch64 || Arch == Triple::arm64 ||
-        Arch == Triple::aarch64_be || Arch == Triple::arm64_be)
+    if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be)
       return 20; // movz; movk; movk; movk; br
     if (Arch == Triple::arm || Arch == Triple::thumb)
       return 8; // 32-bit instruction and 32-bit address
@@ -120,7 +119,8 @@
                     ObjSectionToIDMap &SectionMap) override;
   virtual ~RuntimeDyldELF();
 
-  static ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
+  static std::unique_ptr<ObjectImage>
+  createObjectImage(std::unique_ptr<ObjectBuffer> InputBuffer);
   static ObjectImage *createObjectImageFromFile(std::unique_ptr<object::ObjectFile> Obj);
 };
 

diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 0336cba..69ea3b4 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_RUNTIME_DYLD_IMPL_H
-#define LLVM_RUNTIME_DYLD_IMPL_H
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDIMPL_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDIMPL_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
@@ -70,7 +70,7 @@
   SectionEntry(StringRef name, uint8_t *address, size_t size,
                uintptr_t objAddress)
       : Name(name), Address(address), Size(size),
-        LoadAddress((uintptr_t)address), StubOffset(size),
+        LoadAddress(reinterpret_cast<uintptr_t>(address)), StubOffset(size),
         ObjAddress(objAddress) {}
 };
 
@@ -159,7 +159,7 @@
 };
 
 class RuntimeDyldImpl {
-  friend class RuntimeDyldChecker;
+  friend class RuntimeDyldCheckerImpl;
 private:
 
   uint64_t getAnySymbolRemoteAddress(StringRef Symbol) {
@@ -172,6 +172,9 @@
   // The MemoryManager to load objects into.
   RTDyldMemoryManager *MemMgr;
 
+  // Attached RuntimeDyldChecker instance. Null if no instance attached.
+  RuntimeDyldCheckerImpl *Checker;
+
   // A list of all sections emitted by the dynamic linker.  These sections are
   // referenced in the code by means of their index in this list - SectionID.
   typedef SmallVector<SectionEntry, 64> SectionList;
@@ -211,6 +214,7 @@
   // modules.  This map is indexed by symbol name.
   StringMap<RelocationList> ExternalSymbolRelocations;
 
+
   typedef std::map<RelocationValueRef, uintptr_t> StubMap;
 
   Triple::ArchType Arch;
@@ -245,11 +249,11 @@
     return true;
   }
 
-  uint64_t getSectionLoadAddress(unsigned SectionID) {
+  uint64_t getSectionLoadAddress(unsigned SectionID) const {
     return Sections[SectionID].LoadAddress;
   }
 
-  uint8_t *getSectionAddress(unsigned SectionID) {
+  uint8_t *getSectionAddress(unsigned SectionID) const {
     return (uint8_t *)Sections[SectionID].Address;
   }
 
@@ -282,6 +286,13 @@
     *(Addr + 7) = Value & 0xFF;
   }
 
+  /// Endian-aware read. Read the least significant Size bytes from Src.
+  uint64_t readBytesUnaligned(uint8_t *Src, unsigned Size) const;
+
+  /// Endian-aware write. Write the least significant Size bytes from Value to
+  /// Dst.
+  void writeBytesUnaligned(uint64_t Value, uint8_t *Dst, unsigned Size) const;
+
   /// \brief Given the common symbols discovered in the object file, emit a
   /// new section for them and update the symbol mappings in the object and
   /// symbol table.
@@ -312,7 +323,7 @@
 
   /// \brief Emits long jump instruction to Addr.
   /// \return Pointer to the memory area for emitting target address.
-  uint8_t *createStubFunction(uint8_t *Addr);
+  uint8_t *createStubFunction(uint8_t *Addr, unsigned AbiVariant = 0);
 
   /// \brief Resolves relocations from Relocs list with address from Value.
   void resolveRelocationList(const RelocationList &Relocs, uint64_t Value);
@@ -349,7 +360,7 @@
 
 public:
   RuntimeDyldImpl(RTDyldMemoryManager *mm)
-      : MemMgr(mm), ProcessAllSections(false), HasError(false) {
+    : MemMgr(mm), Checker(nullptr), ProcessAllSections(false), HasError(false) {
   }
 
   virtual ~RuntimeDyldImpl();
@@ -358,9 +369,14 @@
     this->ProcessAllSections = ProcessAllSections;
   }
 
-  ObjectImage *loadObject(ObjectImage *InputObject);
+  void setRuntimeDyldChecker(RuntimeDyldCheckerImpl *Checker) {
+    this->Checker = Checker;
+  }
 
-  uint8_t* getSymbolAddress(StringRef Name) {
+  std::unique_ptr<ObjectImage>
+  loadObject(std::unique_ptr<ObjectImage> InputObject);
+
+  uint8_t* getSymbolAddress(StringRef Name) const {
     // FIXME: Just look up as a function for now. Overly simple of course.
     // Work in progress.
     SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
@@ -370,7 +386,7 @@
     return getSectionAddress(Loc.first) + Loc.second;
   }
 
-  uint64_t getSymbolLoadAddress(StringRef Name) {
+  uint64_t getSymbolLoadAddress(StringRef Name) const {
     // FIXME: Just look up as a function for now. Overly simple of course.
     // Work in progress.
     SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);

diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 4eb516c..d3d6f5d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp

@@ -14,8 +14,12 @@
 #include "RuntimeDyldMachO.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
-#include "ObjectImageCommon.h"
-#include "JITRegistrar.h"
+
+#include "Targets/RuntimeDyldMachOARM.h"
+#include "Targets/RuntimeDyldMachOAArch64.h"
+#include "Targets/RuntimeDyldMachOI386.h"
+#include "Targets/RuntimeDyldMachOX86_64.h"
+
 using namespace llvm;
 using namespace llvm::object;
 
@@ -23,479 +27,109 @@
 
 namespace llvm {
 
-class MachOObjectImage : public ObjectImageCommon {
-private:
-  typedef SmallVector<uint64_t, 1> SectionAddrList;
-  SectionAddrList OldSectionAddrList;
+int64_t RuntimeDyldMachO::memcpyAddend(const RelocationEntry &RE) const {
+  unsigned NumBytes = 1 << RE.Size;
+  uint8_t *Src = Sections[RE.SectionID].Address + RE.Offset;
 
-protected:
-  bool is64;
-  bool Registered;
+  return static_cast<int64_t>(readBytesUnaligned(Src, NumBytes));
+}
 
-private:
-  void initOldAddress() {
-    MachOObjectFile *objf = static_cast<MachOObjectFile *>(ObjFile.get());
-    // Unfortunately we need to do this, since there's information encoded
-    // in the original addr of the section that we could not otherwise
-    // recover. The reason for this is that symbols do not actually store
-    // their file offset, but only their vmaddr. This means that in order
-    // to locate the symbol correctly in the object file, we need to know
-    // where the original start of the section was (including any padding,
-    // etc).
-    for (section_iterator i = objf->section_begin(), e = objf->section_end();
-         i != e; ++i) {
-      uint64_t Addr;
-      i->getAddress(Addr);
-      OldSectionAddrList[i->getRawDataRefImpl().d.a] = Addr;
-    }
-  }
+RelocationValueRef RuntimeDyldMachO::getRelocationValueRef(
+    ObjectImage &ObjImg, const relocation_iterator &RI,
+    const RelocationEntry &RE, ObjSectionToIDMap &ObjSectionToID,
+    const SymbolTableMap &Symbols) {
 
-public:
-  MachOObjectImage(ObjectBuffer *Input, bool is64)
-      : ObjectImageCommon(Input),
-        OldSectionAddrList(ObjFile->section_end()->getRawDataRefImpl().d.a, 0),
-        is64(is64), Registered(false) {
-    initOldAddress();
-  }
+  const MachOObjectFile &Obj =
+      static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+  MachO::any_relocation_info RelInfo =
+      Obj.getRelocation(RI->getRawDataRefImpl());
+  RelocationValueRef Value;
 
-  MachOObjectImage(std::unique_ptr<object::ObjectFile> Input, bool is64)
-      : ObjectImageCommon(std::move(Input)),
-        OldSectionAddrList(ObjFile->section_end()->getRawDataRefImpl().d.a, 0),
-        is64(is64), Registered(false) {
-    initOldAddress();
-  }
-
-  virtual ~MachOObjectImage() {
-    if (Registered)
-      deregisterWithDebugger();
-  }
-
-  // Subclasses can override these methods to update the image with loaded
-  // addresses for sections and common symbols
-  virtual void updateSectionAddress(const SectionRef &Sec, uint64_t Addr) {
-    MachOObjectFile *objf = static_cast<MachOObjectFile *>(ObjFile.get());
-    char *data =
-        const_cast<char *>(objf->getSectionPointer(Sec.getRawDataRefImpl()));
-
-    uint64_t oldAddr = OldSectionAddrList[Sec.getRawDataRefImpl().d.a];
-
-    if (is64) {
-      ((MachO::section_64 *)data)->addr = Addr;
+  bool IsExternal = Obj.getPlainRelocationExternal(RelInfo);
+  if (IsExternal) {
+    symbol_iterator Symbol = RI->getSymbol();
+    StringRef TargetName;
+    Symbol->getName(TargetName);
+    SymbolTableMap::const_iterator SI = Symbols.find(TargetName.data());
+    if (SI != Symbols.end()) {
+      Value.SectionID = SI->second.first;
+      Value.Offset = SI->second.second + RE.Addend;
     } else {
-      ((MachO::section *)data)->addr = Addr;
-    }
-
-    for (symbol_iterator i = objf->symbol_begin(), e = objf->symbol_end();
-         i != e; ++i) {
-      section_iterator symSec(objf->section_end());
-      (*i).getSection(symSec);
-      if (*symSec == Sec) {
-        uint64_t symAddr;
-        (*i).getAddress(symAddr);
-        updateSymbolAddress(*i, symAddr + Addr - oldAddr);
+      SI = GlobalSymbolTable.find(TargetName.data());
+      if (SI != GlobalSymbolTable.end()) {
+        Value.SectionID = SI->second.first;
+        Value.Offset = SI->second.second + RE.Addend;
+      } else {
+        Value.SymbolName = TargetName.data();
+        Value.Offset = RE.Addend;
       }
     }
+  } else {
+    SectionRef Sec = Obj.getRelocationSection(RelInfo);
+    bool IsCode = Sec.isText();
+    Value.SectionID = findOrEmitSection(ObjImg, Sec, IsCode, ObjSectionToID);
+    uint64_t Addr = Sec.getAddress();
+    Value.Offset = RE.Addend - Addr;
   }
 
-  uint64_t getOldSectionAddr(const SectionRef &Sec) const {
-    return OldSectionAddrList[Sec.getRawDataRefImpl().d.a];
-  }
-
-  virtual void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr) {
-    char *data = const_cast<char *>(
-        reinterpret_cast<const char *>(Sym.getRawDataRefImpl().p));
-    if (is64)
-      ((MachO::nlist_64 *)data)->n_value = Addr;
-    else
-      ((MachO::nlist *)data)->n_value = Addr;
-  }
-
-  virtual void registerWithDebugger() {
-    JITRegistrar::getGDBRegistrar().registerObject(*Buffer);
-    Registered = true;
-  }
-
-  virtual void deregisterWithDebugger() {
-    JITRegistrar::getGDBRegistrar().deregisterObject(*Buffer);
-  }
-};
-
-ObjectImage *RuntimeDyldMachO::createObjectImage(ObjectBuffer *Buffer) {
-  uint32_t magic = *((const uint32_t *)Buffer->getBufferStart());
-  bool is64 = (magic == MachO::MH_MAGIC_64);
-  assert((magic == MachO::MH_MAGIC_64 || magic == MachO::MH_MAGIC) &&
-         "Unrecognized Macho Magic");
-  return new MachOObjectImage(Buffer, is64);
+  return Value;
 }
 
-ObjectImage *RuntimeDyldMachO::createObjectImageFromFile(
-    std::unique_ptr<object::ObjectFile> ObjFile) {
-  if (!ObjFile)
-    return nullptr;
+void RuntimeDyldMachO::makeValueAddendPCRel(RelocationValueRef &Value,
+                                            ObjectImage &ObjImg,
+                                            const relocation_iterator &RI,
+                                            unsigned OffsetToNextPC) {
+  const MachOObjectFile &Obj =
+      static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+  MachO::any_relocation_info RelInfo =
+      Obj.getRelocation(RI->getRawDataRefImpl());
 
-  MemoryBuffer *Buffer =
-      MemoryBuffer::getMemBuffer(ObjFile->getData(), "", false);
-
-  uint32_t magic = *((const uint32_t *)Buffer->getBufferStart());
-  bool is64 = (magic == MachO::MH_MAGIC_64);
-  assert((magic == MachO::MH_MAGIC_64 || magic == MachO::MH_MAGIC) &&
-         "Unrecognized Macho Magic");
-  return new MachOObjectImage(std::move(ObjFile), is64);
-}
-
-static unsigned char *processFDE(unsigned char *P, intptr_t DeltaForText,
-                                 intptr_t DeltaForEH) {
-  DEBUG(dbgs() << "Processing FDE: Delta for text: " << DeltaForText
-               << ", Delta for EH: " << DeltaForEH << "\n");
-  uint32_t Length = *((uint32_t *)P);
-  P += 4;
-  unsigned char *Ret = P + Length;
-  uint32_t Offset = *((uint32_t *)P);
-  if (Offset == 0) // is a CIE
-    return Ret;
-
-  P += 4;
-  intptr_t FDELocation = *((intptr_t *)P);
-  intptr_t NewLocation = FDELocation - DeltaForText;
-  *((intptr_t *)P) = NewLocation;
-  P += sizeof(intptr_t);
-
-  // Skip the FDE address range
-  P += sizeof(intptr_t);
-
-  uint8_t Augmentationsize = *P;
-  P += 1;
-  if (Augmentationsize != 0) {
-    intptr_t LSDA = *((intptr_t *)P);
-    intptr_t NewLSDA = LSDA - DeltaForEH;
-    *((intptr_t *)P) = NewLSDA;
-  }
-
-  return Ret;
-}
-
-static intptr_t computeDelta(SectionEntry *A, SectionEntry *B) {
-  intptr_t ObjDistance = A->ObjAddress - B->ObjAddress;
-  intptr_t MemDistance = A->LoadAddress - B->LoadAddress;
-  return ObjDistance - MemDistance;
-}
-
-void RuntimeDyldMachO::registerEHFrames() {
-
-  if (!MemMgr)
-    return;
-  for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) {
-    EHFrameRelatedSections &SectionInfo = UnregisteredEHFrameSections[i];
-    if (SectionInfo.EHFrameSID == RTDYLD_INVALID_SECTION_ID ||
-        SectionInfo.TextSID == RTDYLD_INVALID_SECTION_ID)
-      continue;
-    SectionEntry *Text = &Sections[SectionInfo.TextSID];
-    SectionEntry *EHFrame = &Sections[SectionInfo.EHFrameSID];
-    SectionEntry *ExceptTab = nullptr;
-    if (SectionInfo.ExceptTabSID != RTDYLD_INVALID_SECTION_ID)
-      ExceptTab = &Sections[SectionInfo.ExceptTabSID];
-
-    intptr_t DeltaForText = computeDelta(Text, EHFrame);
-    intptr_t DeltaForEH = 0;
-    if (ExceptTab)
-      DeltaForEH = computeDelta(ExceptTab, EHFrame);
-
-    unsigned char *P = EHFrame->Address;
-    unsigned char *End = P + EHFrame->Size;
-    do {
-      P = processFDE(P, DeltaForText, DeltaForEH);
-    } while (P != End);
-
-    MemMgr->registerEHFrames(EHFrame->Address, EHFrame->LoadAddress,
-                             EHFrame->Size);
-  }
-  UnregisteredEHFrameSections.clear();
-}
-
-void RuntimeDyldMachO::finalizeLoad(ObjectImage &ObjImg,
-                                    ObjSectionToIDMap &SectionMap) {
-  unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID;
-  unsigned TextSID = RTDYLD_INVALID_SECTION_ID;
-  unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID;
-  ObjSectionToIDMap::iterator i, e;
-  for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) {
-    const SectionRef &Section = i->first;
-    StringRef Name;
-    Section.getName(Name);
-    if (Name == "__eh_frame")
-      EHFrameSID = i->second;
-    else if (Name == "__text")
-      TextSID = i->second;
-    else if (Name == "__gcc_except_tab")
-      ExceptTabSID = i->second;
-    else if (Name == "__jump_table")
-      populateJumpTable(cast<MachOObjectFile>(*ObjImg.getObjectFile()),
-                        Section, i->second);
-    else if (Name == "__pointers")
-      populatePointersSection(cast<MachOObjectFile>(*ObjImg.getObjectFile()),
-                              Section, i->second);
-  }
-  UnregisteredEHFrameSections.push_back(
-      EHFrameRelatedSections(EHFrameSID, TextSID, ExceptTabSID));
-}
-
-// The target location for the relocation is described by RE.SectionID and
-// RE.Offset.  RE.SectionID can be used to find the SectionEntry.  Each
-// SectionEntry has three members describing its location.
-// SectionEntry::Address is the address at which the section has been loaded
-// into memory in the current (host) process.  SectionEntry::LoadAddress is the
-// address that the section will have in the target process.
-// SectionEntry::ObjAddress is the address of the bits for this section in the
-// original emitted object image (also in the current address space).
-//
-// Relocations will be applied as if the section were loaded at
-// SectionEntry::LoadAddress, but they will be applied at an address based
-// on SectionEntry::Address.  SectionEntry::ObjAddress will be used to refer to
-// Target memory contents if they are required for value calculations.
-//
-// The Value parameter here is the load address of the symbol for the
-// relocation to be applied.  For relocations which refer to symbols in the
-// current object Value will be the LoadAddress of the section in which
-// the symbol resides (RE.Addend provides additional information about the
-// symbol location).  For external symbols, Value will be the address of the
-// symbol in the target address space.
-void RuntimeDyldMachO::resolveRelocation(const RelocationEntry &RE,
-                                         uint64_t Value) {
-  DEBUG (
-    const SectionEntry &Section = Sections[RE.SectionID];
-    uint8_t* LocalAddress = Section.Address + RE.Offset;
-    uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
-
-    dbgs() << "resolveRelocation Section: " << RE.SectionID
-           << " LocalAddress: " << format("%p", LocalAddress)
-           << " FinalAddress: " << format("%p", FinalAddress)
-           << " Value: " << format("%p", Value)
-           << " Addend: " << RE.Addend
-           << " isPCRel: " << RE.IsPCRel
-           << " MachoType: " << RE.RelType
-           << " Size: " << (1 << RE.Size) << "\n";
-  );
-
-  // This just dispatches to the proper target specific routine.
-  switch (Arch) {
-  default:
-    llvm_unreachable("Unsupported CPU type!");
-  case Triple::x86_64:
-    resolveX86_64Relocation(RE, Value);
-    break;
-  case Triple::x86:
-    resolveI386Relocation(RE, Value);
-    break;
-  case Triple::arm: // Fall through.
-  case Triple::thumb:
-    resolveARMRelocation(RE, Value);
-    break;
-  case Triple::aarch64:
-  case Triple::arm64:
-    resolveAArch64Relocation(RE, Value);
-    break;
+  bool IsPCRel = Obj.getAnyRelocationPCRel(RelInfo);
+  if (IsPCRel) {
+    uint64_t RelocAddr = 0;
+    RI->getAddress(RelocAddr);
+    Value.Offset += RelocAddr + OffsetToNextPC;
   }
 }
 
-bool RuntimeDyldMachO::resolveI386Relocation(const RelocationEntry &RE,
-                                             uint64_t Value) {
+void RuntimeDyldMachO::dumpRelocationToResolve(const RelocationEntry &RE,
+                                               uint64_t Value) const {
   const SectionEntry &Section = Sections[RE.SectionID];
-  uint8_t* LocalAddress = Section.Address + RE.Offset;
+  uint8_t *LocalAddress = Section.Address + RE.Offset;
+  uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
 
-  if (RE.IsPCRel) {
-    uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
-    Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation.
-  }
-
-  switch (RE.RelType) {
-    default:
-      llvm_unreachable("Invalid relocation type!");
-    case MachO::GENERIC_RELOC_VANILLA:
-      return applyRelocationValue(LocalAddress, Value + RE.Addend,
-                                  1 << RE.Size);
-    case MachO::GENERIC_RELOC_SECTDIFF:
-    case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: {
-      uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress;
-      uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress;
-      assert((Value == SectionABase || Value == SectionBBase) &&
-             "Unexpected SECTDIFF relocation value.");
-      Value = SectionABase - SectionBBase + RE.Addend;
-      return applyRelocationValue(LocalAddress, Value, 1 << RE.Size);
-    }
-    case MachO::GENERIC_RELOC_PB_LA_PTR:
-      return Error("Relocation type not implemented yet!");
-  }
+  dbgs() << "resolveRelocation Section: " << RE.SectionID
+         << " LocalAddress: " << format("%p", LocalAddress)
+         << " FinalAddress: " << format("0x%016" PRIx64, FinalAddress)
+         << " Value: " << format("0x%016" PRIx64, Value) << " Addend: " << RE.Addend
+         << " isPCRel: " << RE.IsPCRel << " MachoType: " << RE.RelType
+         << " Size: " << (1 << RE.Size) << "\n";
 }
 
-bool RuntimeDyldMachO::resolveX86_64Relocation(const RelocationEntry &RE,
-                                               uint64_t Value) {
-  const SectionEntry &Section = Sections[RE.SectionID];
-  uint8_t* LocalAddress = Section.Address + RE.Offset;
+section_iterator
+RuntimeDyldMachO::getSectionByAddress(const MachOObjectFile &Obj,
+                                      uint64_t Addr) {
+  section_iterator SI = Obj.section_begin();
+  section_iterator SE = Obj.section_end();
 
-  // If the relocation is PC-relative, the value to be encoded is the
-  // pointer difference.
-  if (RE.IsPCRel) {
-    // FIXME: It seems this value needs to be adjusted by 4 for an effective PC
-    // address. Is that expected? Only for branches, perhaps?
-    uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
-    Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation.
+  for (; SI != SE; ++SI) {
+    uint64_t SAddr = SI->getAddress();
+    uint64_t SSize = SI->getSize();
+    if ((Addr >= SAddr) && (Addr < SAddr + SSize))
+      return SI;
   }
 
-  switch (RE.RelType) {
-  default:
-    llvm_unreachable("Invalid relocation type!");
-  case MachO::X86_64_RELOC_SIGNED_1:
-  case MachO::X86_64_RELOC_SIGNED_2:
-  case MachO::X86_64_RELOC_SIGNED_4:
-  case MachO::X86_64_RELOC_SIGNED:
-  case MachO::X86_64_RELOC_UNSIGNED:
-  case MachO::X86_64_RELOC_BRANCH:
-    return applyRelocationValue(LocalAddress, Value + RE.Addend, 1 << RE.Size);
-  case MachO::X86_64_RELOC_GOT_LOAD:
-  case MachO::X86_64_RELOC_GOT:
-  case MachO::X86_64_RELOC_SUBTRACTOR:
-  case MachO::X86_64_RELOC_TLV:
-    return Error("Relocation type not implemented yet!");
-  }
+  return SE;
 }
 
-bool RuntimeDyldMachO::resolveARMRelocation(const RelocationEntry &RE,
-                                            uint64_t Value) {
-  const SectionEntry &Section = Sections[RE.SectionID];
-  uint8_t* LocalAddress = Section.Address + RE.Offset;
 
-  // If the relocation is PC-relative, the value to be encoded is the
-  // pointer difference.
-  if (RE.IsPCRel) {
-    uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
-    Value -= FinalAddress;
-    // ARM PCRel relocations have an effective-PC offset of two instructions
-    // (four bytes in Thumb mode, 8 bytes in ARM mode).
-    // FIXME: For now, assume ARM mode.
-    Value -= 8;
-  }
-
-  switch (RE.RelType) {
-  default:
-    llvm_unreachable("Invalid relocation type!");
-  case MachO::ARM_RELOC_VANILLA:
-    return applyRelocationValue(LocalAddress, Value, 1 << RE.Size);
-  case MachO::ARM_RELOC_BR24: {
-    // Mask the value into the target address. We know instructions are
-    // 32-bit aligned, so we can do it all at once.
-    uint32_t *p = (uint32_t *)LocalAddress;
-    // The low two bits of the value are not encoded.
-    Value >>= 2;
-    // Mask the value to 24 bits.
-    uint64_t FinalValue = Value & 0xffffff;
-    // Check for overflow.
-    if (Value != FinalValue)
-      return Error("ARM BR24 relocation out of range.");
-    // FIXME: If the destination is a Thumb function (and the instruction
-    // is a non-predicated BL instruction), we need to change it to a BLX
-    // instruction instead.
-
-    // Insert the value into the instruction.
-    *p = (*p & ~0xffffff) | FinalValue;
-    break;
-  }
-  case MachO::ARM_THUMB_RELOC_BR22:
-  case MachO::ARM_THUMB_32BIT_BRANCH:
-  case MachO::ARM_RELOC_HALF:
-  case MachO::ARM_RELOC_HALF_SECTDIFF:
-  case MachO::ARM_RELOC_PAIR:
-  case MachO::ARM_RELOC_SECTDIFF:
-  case MachO::ARM_RELOC_LOCAL_SECTDIFF:
-  case MachO::ARM_RELOC_PB_LA_PTR:
-    return Error("Relocation type not implemented yet!");
-  }
-  return false;
-}
-
-bool RuntimeDyldMachO::resolveAArch64Relocation(const RelocationEntry &RE,
-                                                uint64_t Value) {
-  const SectionEntry &Section = Sections[RE.SectionID];
-  uint8_t* LocalAddress = Section.Address + RE.Offset;
-
-  // If the relocation is PC-relative, the value to be encoded is the
-  // pointer difference.
-  if (RE.IsPCRel) {
-    uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
-    Value -= FinalAddress;
-  }
-
-  switch (RE.RelType) {
-  default:
-    llvm_unreachable("Invalid relocation type!");
-  case MachO::ARM64_RELOC_UNSIGNED:
-    return applyRelocationValue(LocalAddress, Value, 1 << RE.Size);
-  case MachO::ARM64_RELOC_BRANCH26: {
-    // Mask the value into the target address. We know instructions are
-    // 32-bit aligned, so we can do it all at once.
-    uint32_t *p = (uint32_t *)LocalAddress;
-    // The low two bits of the value are not encoded.
-    Value >>= 2;
-    // Mask the value to 26 bits.
-    uint64_t FinalValue = Value & 0x3ffffff;
-    // Check for overflow.
-    if (FinalValue != Value)
-      return Error("ARM64 BRANCH26 relocation out of range.");
-    // Insert the value into the instruction.
-    *p = (*p & ~0x3ffffff) | FinalValue;
-    break;
-  }
-  case MachO::ARM64_RELOC_SUBTRACTOR:
-  case MachO::ARM64_RELOC_PAGE21:
-  case MachO::ARM64_RELOC_PAGEOFF12:
-  case MachO::ARM64_RELOC_GOT_LOAD_PAGE21:
-  case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12:
-  case MachO::ARM64_RELOC_POINTER_TO_GOT:
-  case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21:
-  case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12:
-  case MachO::ARM64_RELOC_ADDEND:
-    return Error("Relocation type not implemented yet!");
-  }
-  return false;
-}
-
-void RuntimeDyldMachO::populateJumpTable(MachOObjectFile &Obj,
-                                         const SectionRef &JTSection,
-                                         unsigned JTSectionID) {
+// Populate an indirect symbol pointer table section (such as __pointers).
+void RuntimeDyldMachO::populateIndirectSymbolPointersSection(
+                                                    MachOObjectFile &Obj,
+                                                    const SectionRef &PTSection,
+                                                    unsigned PTSectionID) {
   assert(!Obj.is64Bit() &&
-         "__jump_table section not supported in 64-bit MachO.");
-
-  MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand();
-  MachO::section Sec32 = Obj.getSection(JTSection.getRawDataRefImpl());
-  uint32_t JTSectionSize = Sec32.size;
-  unsigned FirstIndirectSymbol = Sec32.reserved1;
-  unsigned JTEntrySize = Sec32.reserved2;
-  unsigned NumJTEntries = JTSectionSize / JTEntrySize;
-  uint8_t* JTSectionAddr = getSectionAddress(JTSectionID);
-  unsigned JTEntryOffset = 0;
-
-  assert((JTSectionSize % JTEntrySize) == 0 &&
-         "Jump-table section does not contain a whole number of stubs?");
-
-  for (unsigned i = 0; i < NumJTEntries; ++i) {
-    unsigned SymbolIndex =
-      Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i);
-    symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex);
-    StringRef IndirectSymbolName;
-    SI->getName(IndirectSymbolName);
-    uint8_t* JTEntryAddr = JTSectionAddr + JTEntryOffset;
-    createStubFunction(JTEntryAddr);
-    RelocationEntry RE(JTSectionID, JTEntryOffset + 1,
-                       MachO::GENERIC_RELOC_VANILLA, 0, true, 2);
-    addRelocationForSymbol(RE, IndirectSymbolName);
-    JTEntryOffset += JTEntrySize;
-  }
-}
-
-void RuntimeDyldMachO::populatePointersSection(MachOObjectFile &Obj,
-                                               const SectionRef &PTSection,
-                                               unsigned PTSectionID) {
-  assert(!Obj.is64Bit() &&
-         "__pointers section not supported in 64-bit MachO.");
+         "Pointer table section not supported in 64-bit MachO.");
 
   MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand();
   MachO::section Sec32 = Obj.getSection(PTSection.getRawDataRefImpl());
@@ -508,9 +142,11 @@
   assert((PTSectionSize % PTEntrySize) == 0 &&
          "Pointers section does not contain a whole number of stubs?");
 
-  DEBUG(dbgs() << "Populating __pointers, Section ID " << PTSectionID
-               << ", " << NumPTEntries << " entries, "
-               << PTEntrySize << " bytes each:\n");
+  DEBUG(dbgs() << "Populating pointer table section "
+               << Sections[PTSectionID].Name
+               << ", Section ID " << PTSectionID << ", "
+               << NumPTEntries << " entries, " << PTEntrySize
+               << " bytes each:\n");
 
   for (unsigned i = 0; i < NumPTEntries; ++i) {
     unsigned SymbolIndex =
@@ -527,282 +163,6 @@
   }
 }
 
-
-section_iterator getSectionByAddress(const MachOObjectFile &Obj,
-                                     uint64_t Addr) {
-  section_iterator SI = Obj.section_begin();
-  section_iterator SE = Obj.section_end();
-
-  for (; SI != SE; ++SI) {
-    uint64_t SAddr, SSize;
-    SI->getAddress(SAddr);
-    SI->getSize(SSize);
-    if ((Addr >= SAddr) && (Addr < SAddr + SSize))
-      return SI;
-  }
-
-  return SE;
-}
-
-relocation_iterator RuntimeDyldMachO::processSECTDIFFRelocation(
-                                            unsigned SectionID,
-                                            relocation_iterator RelI,
-                                            ObjectImage &Obj,
-                                            ObjSectionToIDMap &ObjSectionToID) {
-  const MachOObjectFile *MachO =
-    static_cast<const MachOObjectFile*>(Obj.getObjectFile());
-  MachO::any_relocation_info RE =
-    MachO->getRelocation(RelI->getRawDataRefImpl());
-
-  SectionEntry &Section = Sections[SectionID];
-  uint32_t RelocType = MachO->getAnyRelocationType(RE);
-  bool IsPCRel = MachO->getAnyRelocationPCRel(RE);
-  unsigned Size = MachO->getAnyRelocationLength(RE);
-  uint64_t Offset;
-  RelI->getOffset(Offset);
-  uint8_t *LocalAddress = Section.Address + Offset;
-  unsigned NumBytes = 1 << Size;
-  int64_t Addend = 0;
-  memcpy(&Addend, LocalAddress, NumBytes);
-
-  ++RelI;
-  MachO::any_relocation_info RE2 =
-    MachO->getRelocation(RelI->getRawDataRefImpl());
-
-  uint32_t AddrA = MachO->getScatteredRelocationValue(RE);
-  section_iterator SAI = getSectionByAddress(*MachO, AddrA);
-  assert(SAI != MachO->section_end() && "Can't find section for address A");
-  uint64_t SectionABase;
-  SAI->getAddress(SectionABase);
-  uint64_t SectionAOffset = AddrA - SectionABase;
-  SectionRef SectionA = *SAI;
-  bool IsCode;
-  SectionA.isText(IsCode);
-  uint32_t SectionAID = findOrEmitSection(Obj, SectionA, IsCode,
-                                          ObjSectionToID);
-
-  uint32_t AddrB = MachO->getScatteredRelocationValue(RE2);
-  section_iterator SBI = getSectionByAddress(*MachO, AddrB);
-  assert(SBI != MachO->section_end() && "Can't find section for address B");
-  uint64_t SectionBBase;
-  SBI->getAddress(SectionBBase);
-  uint64_t SectionBOffset = AddrB - SectionBBase;
-  SectionRef SectionB = *SBI;
-  uint32_t SectionBID = findOrEmitSection(Obj, SectionB, IsCode,
-                                          ObjSectionToID);
-
-  if (Addend != AddrA - AddrB)
-    Error("Unexpected SECTDIFF relocation addend.");
-
-  DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA << ", AddrB: " << AddrB
-               << ", Addend: " << Addend << ", SectionA ID: "
-               << SectionAID << ", SectionAOffset: " << SectionAOffset
-               << ", SectionB ID: " << SectionBID << ", SectionBOffset: "
-               << SectionBOffset << "\n");
-  RelocationEntry R(SectionID, Offset, RelocType, 0,
-                    SectionAID, SectionAOffset, SectionBID, SectionBOffset,
-                    IsPCRel, Size);
-
-  addRelocationForSection(R, SectionAID);
-  addRelocationForSection(R, SectionBID);
-
-  return ++RelI;
-}
-
-relocation_iterator RuntimeDyldMachO::processI386ScatteredVANILLA(
-                                            unsigned SectionID,
-                                            relocation_iterator RelI,
-                                            ObjectImage &Obj,
-                                            ObjSectionToIDMap &ObjSectionToID) {
-  const MachOObjectFile *MachO =
-    static_cast<const MachOObjectFile*>(Obj.getObjectFile());
-  MachO::any_relocation_info RE =
-    MachO->getRelocation(RelI->getRawDataRefImpl());
-
-  SectionEntry &Section = Sections[SectionID];
-  uint32_t RelocType = MachO->getAnyRelocationType(RE);
-  bool IsPCRel = MachO->getAnyRelocationPCRel(RE);
-  unsigned Size = MachO->getAnyRelocationLength(RE);
-  uint64_t Offset;
-  RelI->getOffset(Offset);
-  uint8_t *LocalAddress = Section.Address + Offset;
-  unsigned NumBytes = 1 << Size;
-  int64_t Addend = 0;
-  memcpy(&Addend, LocalAddress, NumBytes);
-
-  unsigned SymbolBaseAddr = MachO->getScatteredRelocationValue(RE);
-  section_iterator TargetSI = getSectionByAddress(*MachO, SymbolBaseAddr);
-  assert(TargetSI != MachO->section_end() && "Can't find section for symbol");
-  uint64_t SectionBaseAddr;
-  TargetSI->getAddress(SectionBaseAddr);
-  SectionRef TargetSection = *TargetSI;
-  bool IsCode;
-  TargetSection.isText(IsCode);
-  uint32_t TargetSectionID = findOrEmitSection(Obj, TargetSection, IsCode,
-                                               ObjSectionToID);
-
-  Addend -= SectionBaseAddr;
-  RelocationEntry R(SectionID, Offset, RelocType, Addend,
-                    IsPCRel, Size);
-
-  addRelocationForSection(R, TargetSectionID);
-
-  return ++RelI;
-}
-
-relocation_iterator RuntimeDyldMachO::processRelocationRef(
-    unsigned SectionID, relocation_iterator RelI, ObjectImage &Obj,
-    ObjSectionToIDMap &ObjSectionToID, const SymbolTableMap &Symbols,
-    StubMap &Stubs) {
-  const ObjectFile *OF = Obj.getObjectFile();
-  const MachOObjectImage &MachOObj = *static_cast<MachOObjectImage *>(&Obj);
-  const MachOObjectFile *MachO = static_cast<const MachOObjectFile *>(OF);
-  MachO::any_relocation_info RE =
-      MachO->getRelocation(RelI->getRawDataRefImpl());
-
-  uint32_t RelType = MachO->getAnyRelocationType(RE);
-
-  // FIXME: Properly handle scattered relocations.
-  //        Special case the couple of scattered relocations that we know how
-  //        to handle: SECTDIFF relocations, and scattered VANILLA relocations
-  //        on I386.
-  //        For all other scattered relocations, just bail out and hope for the
-  //        best, since the offsets computed by scattered relocations have often
-  //        been optimisticaly filled in by the compiler. This will fail
-  //        horribly where the relocations *do* need to be applied, but that was
-  //        already the case.
-  if (MachO->isRelocationScattered(RE)) {
-    if (RelType == MachO::GENERIC_RELOC_SECTDIFF ||
-        RelType == MachO::GENERIC_RELOC_LOCAL_SECTDIFF)
-      return processSECTDIFFRelocation(SectionID, RelI, Obj, ObjSectionToID);
-    else if (Arch == Triple::x86 && RelType == MachO::GENERIC_RELOC_VANILLA)
-      return processI386ScatteredVANILLA(SectionID, RelI, Obj, ObjSectionToID);
-    else
-      return ++RelI;
-  }
-
-  RelocationValueRef Value;
-  SectionEntry &Section = Sections[SectionID];
-
-  bool IsExtern = MachO->getPlainRelocationExternal(RE);
-  bool IsPCRel = MachO->getAnyRelocationPCRel(RE);
-  unsigned Size = MachO->getAnyRelocationLength(RE);
-  uint64_t Offset;
-  RelI->getOffset(Offset);
-  uint8_t *LocalAddress = Section.Address + Offset;
-  unsigned NumBytes = 1 << Size;
-  uint64_t Addend = 0;
-  memcpy(&Addend, LocalAddress, NumBytes);
-
-  if (IsExtern) {
-    // Obtain the symbol name which is referenced in the relocation
-    symbol_iterator Symbol = RelI->getSymbol();
-    StringRef TargetName;
-    Symbol->getName(TargetName);
-    // First search for the symbol in the local symbol table
-    SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data());
-    if (lsi != Symbols.end()) {
-      Value.SectionID = lsi->second.first;
-      Value.Addend = lsi->second.second + Addend;
-    } else {
-      // Search for the symbol in the global symbol table
-      SymbolTableMap::const_iterator gsi =
-          GlobalSymbolTable.find(TargetName.data());
-      if (gsi != GlobalSymbolTable.end()) {
-        Value.SectionID = gsi->second.first;
-        Value.Addend = gsi->second.second + Addend;
-      } else {
-        Value.SymbolName = TargetName.data();
-        Value.Addend = Addend;
-      }
-    }
-
-    // Addends for external, PC-rel relocations on i386 point back to the zero
-    // offset. Calculate the final offset from the relocation target instead.
-    // This allows us to use the same logic for both external and internal
-    // relocations in resolveI386RelocationRef.
-    if (Arch == Triple::x86 && IsPCRel) {
-      uint64_t RelocAddr = 0;
-      RelI->getAddress(RelocAddr);
-      Value.Addend += RelocAddr + 4;
-    }
-
-  } else {
-    SectionRef Sec = MachO->getRelocationSection(RE);
-    bool IsCode = false;
-    Sec.isText(IsCode);
-    Value.SectionID = findOrEmitSection(Obj, Sec, IsCode, ObjSectionToID);
-    uint64_t Addr = MachOObj.getOldSectionAddr(Sec);
-    DEBUG(dbgs() << "\nAddr: " << Addr << "\nAddend: " << Addend);
-    Value.Addend = Addend - Addr;
-    if (IsPCRel)
-      Value.Addend += Offset + NumBytes;
-  }
-
-  if (Arch == Triple::x86_64 && (RelType == MachO::X86_64_RELOC_GOT ||
-                                 RelType == MachO::X86_64_RELOC_GOT_LOAD)) {
-    assert(IsPCRel);
-    assert(Size == 2);
-
-    // FIXME: Teach the generic code above not to prematurely conflate
-    //        relocation addends and symbol offsets.
-    Value.Addend -= Addend;
-    StubMap::const_iterator i = Stubs.find(Value);
-    uint8_t *Addr;
-    if (i != Stubs.end()) {
-      Addr = Section.Address + i->second;
-    } else {
-      Stubs[Value] = Section.StubOffset;
-      uint8_t *GOTEntry = Section.Address + Section.StubOffset;
-      RelocationEntry GOTRE(SectionID, Section.StubOffset,
-                            MachO::X86_64_RELOC_UNSIGNED, Value.Addend, false,
-                            3);
-      if (Value.SymbolName)
-        addRelocationForSymbol(GOTRE, Value.SymbolName);
-      else
-        addRelocationForSection(GOTRE, Value.SectionID);
-      Section.StubOffset += 8;
-      Addr = GOTEntry;
-    }
-    RelocationEntry TargetRE(SectionID, Offset,
-                             MachO::X86_64_RELOC_UNSIGNED, Addend, true,
-                             2);
-    resolveRelocation(TargetRE, (uint64_t)Addr);
-  } else if (Arch == Triple::arm && (RelType & 0xf) == MachO::ARM_RELOC_BR24) {
-    // This is an ARM branch relocation, need to use a stub function.
-
-    //  Look up for existing stub.
-    StubMap::const_iterator i = Stubs.find(Value);
-    uint8_t *Addr;
-    if (i != Stubs.end()) {
-      Addr = Section.Address + i->second;
-    } else {
-      // Create a new stub function.
-      Stubs[Value] = Section.StubOffset;
-      uint8_t *StubTargetAddr =
-          createStubFunction(Section.Address + Section.StubOffset);
-      RelocationEntry StubRE(SectionID, StubTargetAddr - Section.Address,
-                             MachO::GENERIC_RELOC_VANILLA, Value.Addend);
-      if (Value.SymbolName)
-        addRelocationForSymbol(StubRE, Value.SymbolName);
-      else
-        addRelocationForSection(StubRE, Value.SectionID);
-      Addr = Section.Address + Section.StubOffset;
-      Section.StubOffset += getMaxStubSize();
-    }
-    RelocationEntry TargetRE(Value.SectionID, Offset, RelType, 0, IsPCRel,
-                             Size);
-    resolveRelocation(TargetRE, (uint64_t)Addr);
-  } else {
-    RelocationEntry RE(SectionID, Offset, RelType, Value.Addend, IsPCRel, Size);
-    if (Value.SymbolName)
-      addRelocationForSymbol(RE, Value.SymbolName);
-    else
-      addRelocationForSection(RE, Value.SectionID);
-  }
-  return ++RelI;
-}
-
 bool
 RuntimeDyldMachO::isCompatibleFormat(const ObjectBuffer *InputBuffer) const {
   if (InputBuffer->getBufferSize() < 4)
@@ -823,4 +183,117 @@
   return Obj->isMachO();
 }
 
+template <typename Impl>
+void RuntimeDyldMachOCRTPBase<Impl>::finalizeLoad(ObjectImage &ObjImg,
+                                                  ObjSectionToIDMap &SectionMap) {
+  unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID;
+  unsigned TextSID = RTDYLD_INVALID_SECTION_ID;
+  unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID;
+  ObjSectionToIDMap::iterator i, e;
+
+  for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) {
+    const SectionRef &Section = i->first;
+    StringRef Name;
+    Section.getName(Name);
+    if (Name == "__eh_frame")
+      EHFrameSID = i->second;
+    else if (Name == "__text")
+      TextSID = i->second;
+    else if (Name == "__gcc_except_tab")
+      ExceptTabSID = i->second;
+    else
+      impl().finalizeSection(ObjImg, i->second, Section);
+  }
+  UnregisteredEHFrameSections.push_back(
+    EHFrameRelatedSections(EHFrameSID, TextSID, ExceptTabSID));
+}
+
+template <typename Impl>
+unsigned char *RuntimeDyldMachOCRTPBase<Impl>::processFDE(unsigned char *P,
+                                                          int64_t DeltaForText,
+                                                          int64_t DeltaForEH) {
+  typedef typename Impl::TargetPtrT TargetPtrT;
+
+  DEBUG(dbgs() << "Processing FDE: Delta for text: " << DeltaForText
+               << ", Delta for EH: " << DeltaForEH << "\n");
+  uint32_t Length = readBytesUnaligned(P, 4);
+  P += 4;
+  unsigned char *Ret = P + Length;
+  uint32_t Offset = readBytesUnaligned(P, 4);
+  if (Offset == 0) // is a CIE
+    return Ret;
+
+  P += 4;
+  TargetPtrT FDELocation = readBytesUnaligned(P, sizeof(TargetPtrT));
+  TargetPtrT NewLocation = FDELocation - DeltaForText;
+  writeBytesUnaligned(NewLocation, P, sizeof(TargetPtrT));
+
+  P += sizeof(TargetPtrT);
+
+  // Skip the FDE address range
+  P += sizeof(TargetPtrT);
+
+  uint8_t Augmentationsize = *P;
+  P += 1;
+  if (Augmentationsize != 0) {
+    TargetPtrT LSDA = readBytesUnaligned(P, sizeof(TargetPtrT));
+    TargetPtrT NewLSDA = LSDA - DeltaForEH;
+    writeBytesUnaligned(NewLSDA, P, sizeof(TargetPtrT));
+  }
+
+  return Ret;
+}
+
+static int64_t computeDelta(SectionEntry *A, SectionEntry *B) {
+  int64_t ObjDistance = A->ObjAddress - B->ObjAddress;
+  int64_t MemDistance = A->LoadAddress - B->LoadAddress;
+  return ObjDistance - MemDistance;
+}
+
+template <typename Impl>
+void RuntimeDyldMachOCRTPBase<Impl>::registerEHFrames() {
+
+  if (!MemMgr)
+    return;
+  for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) {
+    EHFrameRelatedSections &SectionInfo = UnregisteredEHFrameSections[i];
+    if (SectionInfo.EHFrameSID == RTDYLD_INVALID_SECTION_ID ||
+        SectionInfo.TextSID == RTDYLD_INVALID_SECTION_ID)
+      continue;
+    SectionEntry *Text = &Sections[SectionInfo.TextSID];
+    SectionEntry *EHFrame = &Sections[SectionInfo.EHFrameSID];
+    SectionEntry *ExceptTab = nullptr;
+    if (SectionInfo.ExceptTabSID != RTDYLD_INVALID_SECTION_ID)
+      ExceptTab = &Sections[SectionInfo.ExceptTabSID];
+
+    int64_t DeltaForText = computeDelta(Text, EHFrame);
+    int64_t DeltaForEH = 0;
+    if (ExceptTab)
+      DeltaForEH = computeDelta(ExceptTab, EHFrame);
+
+    unsigned char *P = EHFrame->Address;
+    unsigned char *End = P + EHFrame->Size;
+    do {
+      P = processFDE(P, DeltaForText, DeltaForEH);
+    } while (P != End);
+
+    MemMgr->registerEHFrames(EHFrame->Address, EHFrame->LoadAddress,
+                             EHFrame->Size);
+  }
+  UnregisteredEHFrameSections.clear();
+}
+
+std::unique_ptr<RuntimeDyldMachO>
+llvm::RuntimeDyldMachO::create(Triple::ArchType Arch, RTDyldMemoryManager *MM) {
+  switch (Arch) {
+  default:
+    llvm_unreachable("Unsupported target for RuntimeDyldMachO.");
+    break;
+  case Triple::arm: return make_unique<RuntimeDyldMachOARM>(MM);
+  case Triple::aarch64: return make_unique<RuntimeDyldMachOAArch64>(MM);
+  case Triple::x86: return make_unique<RuntimeDyldMachOI386>(MM);
+  case Triple::x86_64: return make_unique<RuntimeDyldMachOX86_64>(MM);
+  }
+}
+
 } // end namespace llvm

diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index 35f0720..7583474 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h

@@ -11,74 +11,33 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_RUNTIME_DYLD_MACHO_H
-#define LLVM_RUNTIME_DYLD_MACHO_H
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDMACHO_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_RUNTIMEDYLDMACHO_H
 
 #include "ObjectImageCommon.h"
 #include "RuntimeDyldImpl.h"
-#include "llvm/ADT/IndexedMap.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Support/Format.h"
 
+#define DEBUG_TYPE "dyld"
+
 using namespace llvm;
 using namespace llvm::object;
 
 namespace llvm {
 class RuntimeDyldMachO : public RuntimeDyldImpl {
-private:
-
-  /// Write the least significant 'Size' bytes in 'Value' out at the address
-  /// pointed to by Addr.
-  bool applyRelocationValue(uint8_t *Addr, uint64_t Value, unsigned Size) {
-    for (unsigned i = 0; i < Size; ++i) {
-      *Addr++ = (uint8_t)Value;
-      Value >>= 8;
-    }
-
-    return false;
-  }
-
-  bool resolveI386Relocation(const RelocationEntry &RE, uint64_t Value);
-  bool resolveX86_64Relocation(const RelocationEntry &RE, uint64_t Value);
-  bool resolveARMRelocation(const RelocationEntry &RE, uint64_t Value);
-  bool resolveAArch64Relocation(const RelocationEntry &RE, uint64_t Value);
-
-  // Populate stubs in __jump_table section.
-  void populateJumpTable(MachOObjectFile &Obj, const SectionRef &JTSection,
-                         unsigned JTSectionID);
-
-  // Populate __pointers section.
-  void populatePointersSection(MachOObjectFile &Obj, const SectionRef &PTSection,
-                               unsigned PTSectionID);
-
-  unsigned getMaxStubSize() override {
-    if (Arch == Triple::arm || Arch == Triple::thumb)
-      return 8; // 32-bit instruction and 32-bit address
-    else if (Arch == Triple::x86_64)
-      return 8; // GOT entry
-    else
-      return 0;
-  }
-
-  unsigned getStubAlignment() override { return 1; }
-
-  relocation_iterator processSECTDIFFRelocation(
-                                             unsigned SectionID,
-                                             relocation_iterator RelI,
-                                             ObjectImage &ObjImg,
-                                             ObjSectionToIDMap &ObjSectionToID);
-
-  relocation_iterator processI386ScatteredVANILLA(
-					     unsigned SectionID,
-					     relocation_iterator RelI,
-					     ObjectImage &ObjImg,
-					     ObjSectionToIDMap &ObjSectionToID);
+protected:
+  struct SectionOffsetPair {
+    unsigned SectionID;
+    uint64_t Offset;
+  };
 
   struct EHFrameRelatedSections {
     EHFrameRelatedSections()
         : EHFrameSID(RTDYLD_INVALID_SECTION_ID),
           TextSID(RTDYLD_INVALID_SECTION_ID),
           ExceptTabSID(RTDYLD_INVALID_SECTION_ID) {}
+
     EHFrameRelatedSections(SID EH, SID T, SID Ex)
         : EHFrameSID(EH), TextSID(T), ExceptTabSID(Ex) {}
     SID EHFrameSID;
@@ -91,25 +50,116 @@
   // EH frame sections with the memory manager.
   SmallVector<EHFrameRelatedSections, 2> UnregisteredEHFrameSections;
 
-public:
   RuntimeDyldMachO(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
 
-  void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override;
-  relocation_iterator
-  processRelocationRef(unsigned SectionID, relocation_iterator RelI,
-                       ObjectImage &Obj, ObjSectionToIDMap &ObjSectionToID,
-                       const SymbolTableMap &Symbols, StubMap &Stubs) override;
+  /// This convenience method uses memcpy to extract a contiguous addend (the
+  /// addend size and offset are taken from the corresponding fields of the RE).
+  int64_t memcpyAddend(const RelocationEntry &RE) const;
+
+  /// Given a relocation_iterator for a non-scattered relocation, construct a
+  /// RelocationEntry and fill in the common fields. The 'Addend' field is *not*
+  /// filled in, since immediate encodings are highly target/opcode specific.
+  /// For targets/opcodes with simple, contiguous immediates (e.g. X86) the
+  /// memcpyAddend method can be used to read the immediate.
+  RelocationEntry getRelocationEntry(unsigned SectionID, ObjectImage &ObjImg,
+                                     const relocation_iterator &RI) const {
+    const MachOObjectFile &Obj =
+      static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+    MachO::any_relocation_info RelInfo =
+      Obj.getRelocation(RI->getRawDataRefImpl());
+
+    bool IsPCRel = Obj.getAnyRelocationPCRel(RelInfo);
+    unsigned Size = Obj.getAnyRelocationLength(RelInfo);
+    uint64_t Offset;
+    RI->getOffset(Offset);
+    MachO::RelocationInfoType RelType =
+      static_cast<MachO::RelocationInfoType>(Obj.getAnyRelocationType(RelInfo));
+
+    return RelocationEntry(SectionID, Offset, RelType, 0, IsPCRel, Size);
+  }
+
+  /// Construct a RelocationValueRef representing the relocation target.
+  /// For Symbols in known sections, this will return a RelocationValueRef
+  /// representing a (SectionID, Offset) pair.
+  /// For Symbols whose section is not known, this will return a
+  /// (SymbolName, Offset) pair, where the Offset is taken from the instruction
+  /// immediate (held in RE.Addend).
+  /// In both cases the Addend field is *NOT* fixed up to be PC-relative. That
+  /// should be done by the caller where appropriate by calling makePCRel on
+  /// the RelocationValueRef.
+  RelocationValueRef getRelocationValueRef(ObjectImage &ObjImg,
+                                           const relocation_iterator &RI,
+                                           const RelocationEntry &RE,
+                                           ObjSectionToIDMap &ObjSectionToID,
+                                           const SymbolTableMap &Symbols);
+
+  /// Make the RelocationValueRef addend PC-relative.
+  void makeValueAddendPCRel(RelocationValueRef &Value, ObjectImage &ObjImg,
+                            const relocation_iterator &RI,
+                            unsigned OffsetToNextPC);
+
+  /// Dump information about the relocation entry (RE) and resolved value.
+  void dumpRelocationToResolve(const RelocationEntry &RE, uint64_t Value) const;
+
+  // Return a section iterator for the section containing the given address.
+  static section_iterator getSectionByAddress(const MachOObjectFile &Obj,
+                                              uint64_t Addr);
+
+
+  // Populate an indirect symbol pointer table section (such as __pointers).
+  void populateIndirectSymbolPointersSection(MachOObjectFile &Obj,
+                                             const SectionRef &PTSection,
+                                             unsigned PTSectionID);
+
+public:
+  /// Create an ObjectImage from the given ObjectBuffer.
+  static std::unique_ptr<ObjectImage>
+  createObjectImage(std::unique_ptr<ObjectBuffer> InputBuffer) {
+    return llvm::make_unique<ObjectImageCommon>(std::move(InputBuffer));
+  }
+
+  /// Create an ObjectImage from the given ObjectFile.
+  static ObjectImage *
+  createObjectImageFromFile(std::unique_ptr<object::ObjectFile> InputObject) {
+    return new ObjectImageCommon(std::move(InputObject));
+  }
+
+  /// Create a RuntimeDyldMachO instance for the given target architecture.
+  static std::unique_ptr<RuntimeDyldMachO> create(Triple::ArchType Arch,
+                                                  RTDyldMemoryManager *mm);
+
+  SectionEntry &getSection(unsigned SectionID) { return Sections[SectionID]; }
+
   bool isCompatibleFormat(const ObjectBuffer *Buffer) const override;
   bool isCompatibleFile(const object::ObjectFile *Obj) const override;
-  void registerEHFrames() override;
+};
+
+/// RuntimeDyldMachOCRTPBase - Templated base class for generic MachO linker
+/// algorithms and data structures.
+///
+/// Concrete, target specific sub-classes can be accessed via the impl()
+/// methods. (i.e. the RuntimeDyldMachO hierarchy uses the Curiously
+/// Recurring Template Idiom). Concrete subclasses for each target
+/// can be found in ./Targets.
+template <typename Impl>
+class RuntimeDyldMachOCRTPBase : public RuntimeDyldMachO {
+private:
+  Impl &impl() { return static_cast<Impl &>(*this); }
+  const Impl &impl() const { return static_cast<const Impl &>(*this); }
+
+  unsigned char *processFDE(unsigned char *P, int64_t DeltaForText,
+                            int64_t DeltaForEH);
+
+public:
+  RuntimeDyldMachOCRTPBase(RTDyldMemoryManager *mm) : RuntimeDyldMachO(mm) {}
+
   void finalizeLoad(ObjectImage &ObjImg,
                     ObjSectionToIDMap &SectionMap) override;
-
-  static ObjectImage *createObjectImage(ObjectBuffer *Buffer);
-  static ObjectImage *
-  createObjectImageFromFile(std::unique_ptr<object::ObjectFile> InputObject);
+  void registerEHFrames() override;
 };
 
 } // end namespace llvm
 
+#undef DEBUG_TYPE
+
 #endif

diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h
new file mode 100644
index 0000000..f5cf9ac
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOAArch64.h

@@ -0,0 +1,405 @@
+//===-- RuntimeDyldMachOAArch64.h -- MachO/AArch64 specific code. -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOAARCH64_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOAARCH64_H
+
+#include "../RuntimeDyldMachO.h"
+#include "llvm/Support/Endian.h"
+
+#define DEBUG_TYPE "dyld"
+
+namespace llvm {
+
+class RuntimeDyldMachOAArch64
+    : public RuntimeDyldMachOCRTPBase<RuntimeDyldMachOAArch64> {
+public:
+
+  typedef uint64_t TargetPtrT;
+
+  RuntimeDyldMachOAArch64(RTDyldMemoryManager *MM)
+      : RuntimeDyldMachOCRTPBase(MM) {}
+
+  unsigned getMaxStubSize() override { return 8; }
+
+  unsigned getStubAlignment() override { return 8; }
+
+  /// Extract the addend encoded in the instruction / memory location.
+  int64_t decodeAddend(const RelocationEntry &RE) const {
+    const SectionEntry &Section = Sections[RE.SectionID];
+    uint8_t *LocalAddress = Section.Address + RE.Offset;
+    unsigned NumBytes = 1 << RE.Size;
+    int64_t Addend = 0;
+    // Verify that the relocation has the correct size and alignment.
+    switch (RE.RelType) {
+    default:
+      llvm_unreachable("Unsupported relocation type!");
+    case MachO::ARM64_RELOC_UNSIGNED:
+      assert((NumBytes == 4 || NumBytes == 8) && "Invalid relocation size.");
+      break;
+    case MachO::ARM64_RELOC_BRANCH26:
+    case MachO::ARM64_RELOC_PAGE21:
+    case MachO::ARM64_RELOC_PAGEOFF12:
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGE21:
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12:
+      assert(NumBytes == 4 && "Invalid relocation size.");
+      assert((((uintptr_t)LocalAddress & 0x3) == 0) &&
+             "Instruction address is not aligned to 4 bytes.");
+      break;
+    }
+
+    switch (RE.RelType) {
+    default:
+      llvm_unreachable("Unsupported relocation type!");
+    case MachO::ARM64_RELOC_UNSIGNED:
+      // This could be an unaligned memory location.
+      if (NumBytes == 4)
+        Addend = *reinterpret_cast<support::ulittle32_t *>(LocalAddress);
+      else
+        Addend = *reinterpret_cast<support::ulittle64_t *>(LocalAddress);
+      break;
+    case MachO::ARM64_RELOC_BRANCH26: {
+      // Verify that the relocation points to the expected branch instruction.
+      auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
+      assert((*p & 0xFC000000) == 0x14000000 && "Expected branch instruction.");
+
+      // Get the 26 bit addend encoded in the branch instruction and sign-extend
+      // to 64 bit. The lower 2 bits are always zeros and are therefore implicit
+      // (<< 2).
+      Addend = (*p & 0x03FFFFFF) << 2;
+      Addend = SignExtend64(Addend, 28);
+      break;
+    }
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGE21:
+    case MachO::ARM64_RELOC_PAGE21: {
+      // Verify that the relocation points to the expected adrp instruction.
+      auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
+      assert((*p & 0x9F000000) == 0x90000000 && "Expected adrp instruction.");
+
+      // Combine immlo (bits 30:29) and immhi (bits 23:5) into the 21 bit page
+      // delta, widen it to 64 bit so the implicit page shift (<< 12) cannot
+      // drop the sign bit, then shift the whole value and sign-extend it.
+      uint64_t Imm21 = ((*p & 0x60000000) >> 29) | ((*p & 0x00FFFFE0) >> 3);
+      Addend = SignExtend64(Imm21 << 12, 33);
+      break;
+    }
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12: {
+      // Verify that the relocation points to one of the expected load / store
+      // instructions.
+      auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
+      (void)p;
+      assert((*p & 0x3B000000) == 0x39000000 &&
+             "Only expected load / store instructions.");
+    } // fall-through
+    case MachO::ARM64_RELOC_PAGEOFF12: {
+      // Verify that the relocation points to one of the expected load / store
+      // or add / sub instructions.
+      auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
+      assert((((*p & 0x3B000000) == 0x39000000) ||
+              ((*p & 0x11C00000) == 0x11000000)   ) &&
+             "Expected load / store  or add/sub instruction.");
+
+      // Get the 12 bit addend encoded in the instruction.
+      Addend = (*p & 0x003FFC00) >> 10;
+
+      // Check which instruction we are decoding to obtain the implicit shift
+      // factor of the instruction.
+      int ImplicitShift = 0;
+      if ((*p & 0x3B000000) == 0x39000000) { // << load / store
+        // For load / store instructions the size is encoded in bits 31:30.
+        ImplicitShift = ((*p >> 30) & 0x3);
+        if (ImplicitShift == 0) {
+          // Check if this is a vector op to get the correct shift value.
+          if ((*p & 0x04800000) == 0x04800000)
+            ImplicitShift = 4;
+        }
+      }
+      // Compensate for implicit shift.
+      Addend <<= ImplicitShift;
+      break;
+    }
+    }
+    return Addend;
+  }
+
+  /// Encode the addend into the instruction / memory location.
+  void encodeAddend(uint8_t *LocalAddress, unsigned NumBytes,
+                    MachO::RelocationInfoType RelType, int64_t Addend) const {
+    // Verify that the relocation has the correct size and alignment.
+    switch (RelType) {
+    default:
+      llvm_unreachable("Unsupported relocation type!");
+    case MachO::ARM64_RELOC_UNSIGNED:
+      assert((NumBytes == 4 || NumBytes == 8) && "Invalid relocation size.");
+      break;
+    case MachO::ARM64_RELOC_BRANCH26:
+    case MachO::ARM64_RELOC_PAGE21:
+    case MachO::ARM64_RELOC_PAGEOFF12:
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGE21:
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12:
+      assert(NumBytes == 4 && "Invalid relocation size.");
+      assert((((uintptr_t)LocalAddress & 0x3) == 0) &&
+             "Instruction address is not aligned to 4 bytes.");
+      break;
+    }
+
+    switch (RelType) {
+    default:
+      llvm_unreachable("Unsupported relocation type!");
+    case MachO::ARM64_RELOC_UNSIGNED:
+      // This could be an unaligned memory location.
+      if (NumBytes == 4)
+        *reinterpret_cast<support::ulittle32_t *>(LocalAddress) = Addend;
+      else
+        *reinterpret_cast<support::ulittle64_t *>(LocalAddress) = Addend;
+      break;
+    case MachO::ARM64_RELOC_BRANCH26: {
+      auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
+      // Verify that the relocation points to the expected branch instruction.
+      assert((*p & 0xFC000000) == 0x14000000 && "Expected branch instruction.");
+
+      // Verify addend value.
+      assert((Addend & 0x3) == 0 && "Branch target is not aligned");
+      assert(isInt<28>(Addend) && "Branch target is out of range.");
+
+      // Encode the addend as 26 bit immediate in the branch instruction.
+      *p = (*p & 0xFC000000) | ((uint32_t)(Addend >> 2) & 0x03FFFFFF);
+      break;
+    }
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGE21:
+    case MachO::ARM64_RELOC_PAGE21: {
+      // Verify that the relocation points to the expected adrp instruction.
+      auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
+      assert((*p & 0x9F000000) == 0x90000000 && "Expected adrp instruction.");
+
+      // Check that the addend fits into 21 bits (+ 12 lower bits).
+      assert((Addend & 0xFFF) == 0 && "ADRP target is not page aligned.");
+      assert(isInt<33>(Addend) && "Invalid page reloc value.");
+
+      // Encode the addend into the instruction.
+      uint32_t ImmLoValue = (uint32_t)(Addend << 17) & 0x60000000;
+      uint32_t ImmHiValue = (uint32_t)(Addend >> 9) & 0x00FFFFE0;
+      *p = (*p & 0x9F00001F) | ImmHiValue | ImmLoValue;
+      break;
+    }
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12: {
+      // Verify that the relocation points to one of the expected load / store
+      // instructions.
+      auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
+      assert((*p & 0x3B000000) == 0x39000000 &&
+             "Only expected load / store instructions.");
+      (void)p;
+    } // fall-through
+    case MachO::ARM64_RELOC_PAGEOFF12: {
+      // Verify that the relocation points to one of the expected load / store
+      // or add / sub instructions.
+      auto *p = reinterpret_cast<support::aligned_ulittle32_t *>(LocalAddress);
+      assert((((*p & 0x3B000000) == 0x39000000) ||
+              ((*p & 0x11C00000) == 0x11000000)   ) &&
+             "Expected load / store  or add/sub instruction.");
+
+      // Check which instruction we are decoding to obtain the implicit shift
+      // factor of the instruction and verify alignment.
+      int ImplicitShift = 0;
+      if ((*p & 0x3B000000) == 0x39000000) { // << load / store
+        // For load / store instructions the size is encoded in bits 31:30.
+        ImplicitShift = ((*p >> 30) & 0x3);
+        switch (ImplicitShift) {
+        case 0:
+          // Check if this is a vector op to get the correct shift value.
+          if ((*p & 0x04800000) == 0x04800000) {
+            ImplicitShift = 4;
+            assert(((Addend & 0xF) == 0) &&
+                   "128-bit LDR/STR not 16-byte aligned.");
+          }
+          break;
+        case 1:
+          assert(((Addend & 0x1) == 0) && "16-bit LDR/STR not 2-byte aligned.");
+          break;
+        case 2:
+          assert(((Addend & 0x3) == 0) && "32-bit LDR/STR not 4-byte aligned.");
+          break;
+        case 3:
+          assert(((Addend & 0x7) == 0) && "64-bit LDR/STR not 8-byte aligned.");
+          break;
+        }
+      }
+      // Compensate for implicit shift.
+      Addend >>= ImplicitShift;
+      assert(isUInt<12>(Addend) && "Addend cannot be encoded.");
+
+      // Encode the addend into the instruction.
+      *p = (*p & 0xFFC003FF) | ((uint32_t)(Addend << 10) & 0x003FFC00);
+      break;
+    }
+    }
+  }
+
+  relocation_iterator
+  processRelocationRef(unsigned SectionID, relocation_iterator RelI,
+                       ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID,
+                       const SymbolTableMap &Symbols, StubMap &Stubs) override {
+    const MachOObjectFile &Obj =
+        static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+    MachO::any_relocation_info RelInfo =
+        Obj.getRelocation(RelI->getRawDataRefImpl());
+
+    assert(!Obj.isRelocationScattered(RelInfo) && "Unexpected scattered reloc");
+
+    // ARM64 has an ARM64_RELOC_ADDEND relocation type that carries an explicit
+    // addend for the following relocation. If found: (1) store the associated
+    // addend, (2) consume the next relocation, and (3) use the stored addend to
+    // override the addend.
+    int64_t ExplicitAddend = 0;
+    if (Obj.getAnyRelocationType(RelInfo) == MachO::ARM64_RELOC_ADDEND) {
+      assert(!Obj.getPlainRelocationExternal(RelInfo));
+      assert(!Obj.getAnyRelocationPCRel(RelInfo));
+      assert(Obj.getAnyRelocationLength(RelInfo) == 2);
+      int64_t RawAddend = Obj.getPlainRelocationSymbolNum(RelInfo);
+      // Sign-extend the 24-bit to 64-bit.
+      ExplicitAddend = SignExtend64(RawAddend, 24);
+      ++RelI;
+      RelInfo = Obj.getRelocation(RelI->getRawDataRefImpl());
+    }
+
+    RelocationEntry RE(getRelocationEntry(SectionID, ObjImg, RelI));
+    RE.Addend = decodeAddend(RE);
+    RelocationValueRef Value(
+        getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols));
+
+    assert((ExplicitAddend == 0 || RE.Addend == 0) && "Relocation has "
+      "ARM64_RELOC_ADDEND and embedded addend in the instruction.");
+    if (ExplicitAddend) {
+      RE.Addend = ExplicitAddend;
+      Value.Offset = ExplicitAddend;
+    }
+
+    bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
+    if (!IsExtern && RE.IsPCRel)
+      makeValueAddendPCRel(Value, ObjImg, RelI, 1 << RE.Size);
+
+    RE.Addend = Value.Offset;
+
+    if (RE.RelType == MachO::ARM64_RELOC_GOT_LOAD_PAGE21 ||
+        RE.RelType == MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12)
+      processGOTRelocation(RE, Value, Stubs);
+    else {
+      if (Value.SymbolName)
+        addRelocationForSymbol(RE, Value.SymbolName);
+      else
+        addRelocationForSection(RE, Value.SectionID);
+    }
+
+    return ++RelI;
+  }
+
+  void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override {
+    DEBUG(dumpRelocationToResolve(RE, Value));
+
+    const SectionEntry &Section = Sections[RE.SectionID];
+    uint8_t *LocalAddress = Section.Address + RE.Offset;
+    MachO::RelocationInfoType RelType =
+      static_cast<MachO::RelocationInfoType>(RE.RelType);
+
+    switch (RelType) {
+    default:
+      llvm_unreachable("Invalid relocation type!");
+    case MachO::ARM64_RELOC_UNSIGNED: {
+      assert(!RE.IsPCRel && "PCRel and ARM64_RELOC_UNSIGNED not supported");
+      // encodeAddend stores the value through an unaligned little-endian
+      // accessor, so no alignment guarantee is needed for the target address.
+      if (RE.Size < 2)
+        llvm_unreachable("Invalid size for ARM64_RELOC_UNSIGNED");
+
+      encodeAddend(LocalAddress, 1 << RE.Size, RelType, Value + RE.Addend);
+      break;
+    }
+    case MachO::ARM64_RELOC_BRANCH26: {
+      assert(RE.IsPCRel && "not PCRel and ARM64_RELOC_BRANCH26 not supported");
+      // Compute the PC-relative delta; encodeAddend asserts it is in range.
+      uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
+      int64_t PCRelVal = Value - FinalAddress + RE.Addend;
+      encodeAddend(LocalAddress, /*Size=*/4, RelType, PCRelVal);
+      break;
+    }
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGE21:
+    case MachO::ARM64_RELOC_PAGE21: {
+      assert(RE.IsPCRel && "not PCRel and ARM64_RELOC_PAGE21 not supported");
+      // Adjust for PC-relative relocation and offset.
+      uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
+      int64_t PCRelVal =
+        ((Value + RE.Addend) & (-4096)) - (FinalAddress & (-4096));
+      encodeAddend(LocalAddress, /*Size=*/4, RelType, PCRelVal);
+      break;
+    }
+    case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12:
+    case MachO::ARM64_RELOC_PAGEOFF12: {
+      assert(!RE.IsPCRel && "PCRel and ARM64_RELOC_PAGEOFF12 not supported");
+      // Add the offset from the symbol.
+      Value += RE.Addend;
+      // Mask out the page address and only use the lower 12 bits.
+      Value &= 0xFFF;
+      encodeAddend(LocalAddress, /*Size=*/4, RelType, Value);
+      break;
+    }
+    case MachO::ARM64_RELOC_SUBTRACTOR:
+    case MachO::ARM64_RELOC_POINTER_TO_GOT:
+    case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21:
+    case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12:
+      llvm_unreachable("Relocation type not yet implemented!");
+    case MachO::ARM64_RELOC_ADDEND:
+      llvm_unreachable("ARM64_RELOC_ADDEND should have been handled by "
+                       "processRelocationRef!");
+    }
+  }
+
+  void finalizeSection(ObjectImage &ObjImg, unsigned SectionID,
+                       const SectionRef &Section) {}
+
+private:
+  void processGOTRelocation(const RelocationEntry &RE,
+                            RelocationValueRef &Value, StubMap &Stubs) {
+    assert(RE.Size == 2);
+    SectionEntry &Section = Sections[RE.SectionID];
+    StubMap::const_iterator i = Stubs.find(Value);
+    int64_t Offset;
+    if (i != Stubs.end())
+      Offset = static_cast<int64_t>(i->second);
+    else {
+      // FIXME: There must be a better way to do this than to check and fix the
+      // alignment every time!!!
+      uintptr_t BaseAddress = uintptr_t(Section.Address);
+      uintptr_t StubAlignment = getStubAlignment();
+      uintptr_t StubAddress =
+          (BaseAddress + Section.StubOffset + StubAlignment - 1) &
+          -StubAlignment;
+      unsigned StubOffset = StubAddress - BaseAddress;
+      Stubs[Value] = StubOffset;
+      assert(((StubAddress % getStubAlignment()) == 0) &&
+             "GOT entry not aligned");
+      RelocationEntry GOTRE(RE.SectionID, StubOffset,
+                            MachO::ARM64_RELOC_UNSIGNED, Value.Offset,
+                            /*IsPCRel=*/false, /*Size=*/3);
+      if (Value.SymbolName)
+        addRelocationForSymbol(GOTRE, Value.SymbolName);
+      else
+        addRelocationForSection(GOTRE, Value.SectionID);
+      Section.StubOffset = StubOffset + getMaxStubSize();
+      Offset = static_cast<int64_t>(StubOffset);
+    }
+    RelocationEntry TargetRE(RE.SectionID, RE.Offset, RE.RelType, Offset,
+                             RE.IsPCRel, RE.Size);
+    addRelocationForSection(TargetRE, RE.SectionID);
+  }
+};
+}
+
+#undef DEBUG_TYPE
+
+#endif

diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
new file mode 100644
index 0000000..9766751
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h

@@ -0,0 +1,277 @@
+//===----- RuntimeDyldMachOARM.h ---- MachO/ARM specific code. ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOARM_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOARM_H
+
+#include "../RuntimeDyldMachO.h"
+
+#define DEBUG_TYPE "dyld"
+
+namespace llvm {
+
+class RuntimeDyldMachOARM
+    : public RuntimeDyldMachOCRTPBase<RuntimeDyldMachOARM> {
+private:
+  typedef RuntimeDyldMachOCRTPBase<RuntimeDyldMachOARM> ParentT;
+
+public:
+
+  typedef uint32_t TargetPtrT;
+
+  RuntimeDyldMachOARM(RTDyldMemoryManager *MM) : RuntimeDyldMachOCRTPBase(MM) {}
+
+  unsigned getMaxStubSize() override { return 8; }
+
+  unsigned getStubAlignment() override { return 4; }
+
+  int64_t decodeAddend(const RelocationEntry &RE) const {
+    const SectionEntry &Section = Sections[RE.SectionID];
+    uint8_t *LocalAddress = Section.Address + RE.Offset;
+
+    switch (RE.RelType) {
+      default:
+        return memcpyAddend(RE);
+      case MachO::ARM_RELOC_BR24: {
+        uint32_t Temp = readBytesUnaligned(LocalAddress, 4);
+        Temp &= 0x00ffffff; // Mask out the opcode.
+        // Now we've got the shifted immediate, shift by 2, sign extend and ret.
+        return SignExtend32<26>(Temp << 2);
+      }
+    }
+  }
+
+  relocation_iterator
+  processRelocationRef(unsigned SectionID, relocation_iterator RelI,
+                       ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID,
+                       const SymbolTableMap &Symbols, StubMap &Stubs) override {
+    const MachOObjectFile &Obj =
+        static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+    MachO::any_relocation_info RelInfo =
+        Obj.getRelocation(RelI->getRawDataRefImpl());
+    uint32_t RelType = Obj.getAnyRelocationType(RelInfo);
+
+    if (Obj.isRelocationScattered(RelInfo)) {
+      if (RelType == MachO::ARM_RELOC_HALF_SECTDIFF)
+        return processHALFSECTDIFFRelocation(SectionID, RelI, ObjImg,
+                                             ObjSectionToID);
+      else
+        return ++++RelI;
+    }
+
+    RelocationEntry RE(getRelocationEntry(SectionID, ObjImg, RelI));
+    RE.Addend = decodeAddend(RE);
+    RelocationValueRef Value(
+        getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols));
+
+    if (RE.IsPCRel)
+      makeValueAddendPCRel(Value, ObjImg, RelI, 8);
+
+    if ((RE.RelType & 0xf) == MachO::ARM_RELOC_BR24)
+      processBranchRelocation(RE, Value, Stubs);
+    else {
+      RE.Addend = Value.Offset;
+      if (Value.SymbolName)
+        addRelocationForSymbol(RE, Value.SymbolName);
+      else
+        addRelocationForSection(RE, Value.SectionID);
+    }
+
+    return ++RelI;
+  }
+
+  void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override {
+    DEBUG(dumpRelocationToResolve(RE, Value));
+    const SectionEntry &Section = Sections[RE.SectionID];
+    uint8_t *LocalAddress = Section.Address + RE.Offset;
+
+    // If the relocation is PC-relative, the value to be encoded is the
+    // pointer difference.
+    if (RE.IsPCRel) {
+      uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
+      Value -= FinalAddress;
+      // ARM PCRel relocations have an effective-PC offset of two instructions
+      // (four bytes in Thumb mode, 8 bytes in ARM mode).
+      // FIXME: For now, assume ARM mode.
+      Value -= 8;
+    }
+
+    switch (RE.RelType) {
+    default:
+      llvm_unreachable("Invalid relocation type!");
+    case MachO::ARM_RELOC_VANILLA:
+      writeBytesUnaligned(Value + RE.Addend, LocalAddress, 1 << RE.Size);
+      break;
+    case MachO::ARM_RELOC_BR24: {
+      // Mask the value into the target address. We know instructions are
+      // 32-bit aligned, so we can do it all at once.
+      Value += RE.Addend;
+      // The low two bits of the value are not encoded.
+      Value >>= 2;
+      // Mask the value to 24 bits.
+      uint64_t FinalValue = Value & 0xffffff;
+      // FIXME: If the destination is a Thumb function (and the instruction
+      // is a non-predicated BL instruction), we need to change it to a BLX
+      // instruction instead.
+
+      // Insert the value into the instruction.
+      uint32_t Temp = readBytesUnaligned(LocalAddress, 4);
+      writeBytesUnaligned((Temp & ~0xffffff) | FinalValue, LocalAddress, 4);
+
+      break;
+    }
+    case MachO::ARM_RELOC_HALF_SECTDIFF: {
+      uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress;
+      uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress;
+      assert((Value == SectionABase || Value == SectionBBase) &&
+             "Unexpected HALFSECTDIFF relocation value.");
+      Value = SectionABase - SectionBBase + RE.Addend;
+      if (RE.Size & 0x1) // :upper16:
+        Value = (Value >> 16);
+      Value &= 0xffff;
+
+      uint32_t Insn = readBytesUnaligned(LocalAddress, 4);
+      Insn = (Insn & 0xfff0f000) | ((Value & 0xf000) << 4) | (Value & 0x0fff);
+      writeBytesUnaligned(Insn, LocalAddress, 4);
+      break;
+    }
+
+    case MachO::ARM_THUMB_RELOC_BR22:
+    case MachO::ARM_THUMB_32BIT_BRANCH:
+    case MachO::ARM_RELOC_HALF:
+    case MachO::ARM_RELOC_PAIR:
+    case MachO::ARM_RELOC_SECTDIFF:
+    case MachO::ARM_RELOC_LOCAL_SECTDIFF:
+    case MachO::ARM_RELOC_PB_LA_PTR:
+      Error("Relocation type not implemented yet!");
+      return;
+    }
+  }
+
+  void finalizeSection(ObjectImage &ObjImg, unsigned SectionID,
+                       const SectionRef &Section) {
+    StringRef Name;
+    Section.getName(Name);
+
+    if (Name == "__nl_symbol_ptr")
+      populateIndirectSymbolPointersSection(
+                                 cast<MachOObjectFile>(*ObjImg.getObjectFile()),
+                                 Section, SectionID);
+  }
+
+private:
+
+  void processBranchRelocation(const RelocationEntry &RE,
+                               const RelocationValueRef &Value,
+                               StubMap &Stubs) {
+    // This is an ARM branch relocation, need to use a stub function.
+    // Look up for existing stub.
+    SectionEntry &Section = Sections[RE.SectionID];
+    RuntimeDyldMachO::StubMap::const_iterator i = Stubs.find(Value);
+    uint8_t *Addr;
+    if (i != Stubs.end()) {
+      Addr = Section.Address + i->second;
+    } else {
+      // Create a new stub function.
+      Stubs[Value] = Section.StubOffset;
+      uint8_t *StubTargetAddr =
+          createStubFunction(Section.Address + Section.StubOffset);
+      RelocationEntry StubRE(RE.SectionID, StubTargetAddr - Section.Address,
+                             MachO::GENERIC_RELOC_VANILLA, Value.Offset, false,
+                             2);
+      if (Value.SymbolName)
+        addRelocationForSymbol(StubRE, Value.SymbolName);
+      else
+        addRelocationForSection(StubRE, Value.SectionID);
+      Addr = Section.Address + Section.StubOffset;
+      Section.StubOffset += getMaxStubSize();
+    }
+    RelocationEntry TargetRE(RE.SectionID, RE.Offset, RE.RelType, 0,
+                             RE.IsPCRel, RE.Size);
+    resolveRelocation(TargetRE, (uint64_t)Addr);
+  }
+
+  relocation_iterator
+  processHALFSECTDIFFRelocation(unsigned SectionID, relocation_iterator RelI,
+                                ObjectImage &Obj,
+                                ObjSectionToIDMap &ObjSectionToID) {
+    const MachOObjectFile *MachO =
+        static_cast<const MachOObjectFile *>(Obj.getObjectFile());
+    MachO::any_relocation_info RE =
+        MachO->getRelocation(RelI->getRawDataRefImpl());
+
+
+    // For a half-diff relocation the length bits actually record whether this
+    // is a movw/movt, and whether this is arm or thumb.
+    // Bit 0 indicates movw (b0 == 0) or movt (b0 == 1).
+    // Bit 1 indicates arm (b1 == 0) or thumb (b1 == 1).
+    unsigned HalfDiffKindBits = MachO->getAnyRelocationLength(RE);
+    if (HalfDiffKindBits & 0x2)
+      llvm_unreachable("Thumb not yet supported.");
+
+    SectionEntry &Section = Sections[SectionID];
+    uint32_t RelocType = MachO->getAnyRelocationType(RE);
+    bool IsPCRel = MachO->getAnyRelocationPCRel(RE);
+    uint64_t Offset;
+    RelI->getOffset(Offset);
+    uint8_t *LocalAddress = Section.Address + Offset;
+    int64_t Immediate = readBytesUnaligned(LocalAddress, 4); // Copy the whole instruction out.
+    Immediate = ((Immediate >> 4) & 0xf000) | (Immediate & 0xfff);
+
+    ++RelI;
+    MachO::any_relocation_info RE2 =
+        MachO->getRelocation(RelI->getRawDataRefImpl());
+    uint32_t AddrA = MachO->getScatteredRelocationValue(RE);
+    section_iterator SAI = getSectionByAddress(*MachO, AddrA);
+    assert(SAI != MachO->section_end() && "Can't find section for address A");
+    uint64_t SectionABase = SAI->getAddress();
+    uint64_t SectionAOffset = AddrA - SectionABase;
+    SectionRef SectionA = *SAI;
+    bool IsCode = SectionA.isText();
+    uint32_t SectionAID =
+        findOrEmitSection(Obj, SectionA, IsCode, ObjSectionToID);
+
+    uint32_t AddrB = MachO->getScatteredRelocationValue(RE2);
+    section_iterator SBI = getSectionByAddress(*MachO, AddrB);
+    assert(SBI != MachO->section_end() && "Can't find section for address B");
+    uint64_t SectionBBase = SBI->getAddress();
+    uint64_t SectionBOffset = AddrB - SectionBBase;
+    SectionRef SectionB = *SBI;
+    uint32_t SectionBID =
+        findOrEmitSection(Obj, SectionB, IsCode, ObjSectionToID);
+
+    uint32_t OtherHalf = MachO->getAnyRelocationAddress(RE2) & 0xffff;
+    unsigned Shift = (HalfDiffKindBits & 0x1) ? 16 : 0;
+    uint32_t FullImmVal = (Immediate << Shift) | (OtherHalf << (16 - Shift));
+    int64_t Addend = FullImmVal - (AddrA - AddrB);
+
+    // addend = Encoded - Expected
+    //        = Encoded - (AddrA - AddrB)
+
+    DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA << ", AddrB: " << AddrB
+                 << ", Addend: " << Addend << ", SectionA ID: " << SectionAID
+                 << ", SectionAOffset: " << SectionAOffset
+                 << ", SectionB ID: " << SectionBID
+                 << ", SectionBOffset: " << SectionBOffset << "\n");
+    RelocationEntry R(SectionID, Offset, RelocType, Addend, SectionAID,
+                      SectionAOffset, SectionBID, SectionBOffset, IsPCRel,
+                      HalfDiffKindBits);
+
+    addRelocationForSection(R, SectionAID);
+    addRelocationForSection(R, SectionBID);
+
+    return ++RelI;
+  }
+
+};
+}
+
+#undef DEBUG_TYPE
+
+#endif

diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h
new file mode 100644
index 0000000..258b847
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOI386.h

@@ -0,0 +1,261 @@
+//===---- RuntimeDyldMachOI386.h ---- MachO/I386 specific code. ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOI386_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOI386_H
+
+#include "../RuntimeDyldMachO.h"
+
+#define DEBUG_TYPE "dyld"
+
+namespace llvm {
+
+class RuntimeDyldMachOI386
+    : public RuntimeDyldMachOCRTPBase<RuntimeDyldMachOI386> {
+public:
+
+  typedef uint32_t TargetPtrT;
+
+  RuntimeDyldMachOI386(RTDyldMemoryManager *MM)
+      : RuntimeDyldMachOCRTPBase(MM) {}
+
+  unsigned getMaxStubSize() override { return 0; } // always 0 for this class
+
+  unsigned getStubAlignment() override { return 1; } // always 1 for this class
+
+  relocation_iterator // Registers the relocation at RelI; returns where to resume.
+  processRelocationRef(unsigned SectionID, relocation_iterator RelI,
+                       ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID,
+                       const SymbolTableMap &Symbols, StubMap &Stubs) override {
+    const MachOObjectFile &Obj =
+        static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+    MachO::any_relocation_info RelInfo =
+        Obj.getRelocation(RelI->getRawDataRefImpl());
+    uint32_t RelType = Obj.getAnyRelocationType(RelInfo);
+    // Scattered relocations are delegated to helpers that advance RelI themselves.
+    if (Obj.isRelocationScattered(RelInfo)) {
+      if (RelType == MachO::GENERIC_RELOC_SECTDIFF ||
+          RelType == MachO::GENERIC_RELOC_LOCAL_SECTDIFF)
+        return processSECTDIFFRelocation(SectionID, RelI, ObjImg,
+                                         ObjSectionToID);
+      else if (RelType == MachO::GENERIC_RELOC_VANILLA)
+        return processI386ScatteredVANILLA(SectionID, RelI, ObjImg,
+                                           ObjSectionToID);
+      llvm_unreachable("Unhandled scattered relocation."); // any other type
+    }
+    // Non-scattered path: build the entry, then look up what it refers to.
+    RelocationEntry RE(getRelocationEntry(SectionID, ObjImg, RelI));
+    RE.Addend = memcpyAddend(RE);
+    RelocationValueRef Value(
+        getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols));
+    // NOTE(review): the disabled snippet below was extern-only; the live check is not.
+    // Addends for external, PC-rel relocations on i386 point back to the zero
+    // offset. Calculate the final offset from the relocation target instead.
+    // This allows us to use the same logic for both external and internal
+    // relocations in resolveI386RelocationRef.
+    // bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
+    // if (IsExtern && RE.IsPCRel) {
+    //   uint64_t RelocAddr = 0;
+    //   RelI->getAddress(RelocAddr);
+    //   Value.Addend += RelocAddr + 4;
+    // }
+    if (RE.IsPCRel)
+      makeValueAddendPCRel(Value, ObjImg, RelI, 1 << RE.Size);
+    // The addend stored in the entry is replaced by the resolved value's offset.
+    RE.Addend = Value.Offset;
+    // Key the entry by symbol name when there is one, otherwise by section ID.
+    if (Value.SymbolName)
+      addRelocationForSymbol(RE, Value.SymbolName);
+    else
+      addRelocationForSection(RE, Value.SectionID);
+
+    return ++RelI;
+  }
+
+  void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override {
+    DEBUG(dumpRelocationToResolve(RE, Value));
+    // Bytes are written at Section.Address; LoadAddress is used for address math.
+    const SectionEntry &Section = Sections[RE.SectionID];
+    uint8_t *LocalAddress = Section.Address + RE.Offset;
+    // PC-relative entries are rebased on the load address of the site plus 4.
+    if (RE.IsPCRel) {
+      uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
+      Value -= FinalAddress + 4; // see MachOX86_64::resolveRelocation.
+    }
+    // The handled cases below each write 1 << RE.Size bytes at LocalAddress.
+    switch (RE.RelType) {
+    default:
+      llvm_unreachable("Invalid relocation type!");
+    case MachO::GENERIC_RELOC_VANILLA:
+      writeBytesUnaligned(Value + RE.Addend, LocalAddress, 1 << RE.Size);
+      break;
+    case MachO::GENERIC_RELOC_SECTDIFF:
+    case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: {
+      uint64_t SectionABase = Sections[RE.Sections.SectionA].LoadAddress;
+      uint64_t SectionBBase = Sections[RE.Sections.SectionB].LoadAddress;
+      assert((Value == SectionABase || Value == SectionBBase) &&
+             "Unexpected SECTDIFF relocation value.");
+      Value = SectionABase - SectionBBase + RE.Addend; // incoming Value is dropped
+      writeBytesUnaligned(Value, LocalAddress, 1 << RE.Size);
+      break;
+    }
+    case MachO::GENERIC_RELOC_PB_LA_PTR:
+      Error("Relocation type not implemented yet!"); // last case; nothing written
+    }
+  }
+
+  void finalizeSection(ObjectImage &ObjImg, unsigned SectionID,
+                       const SectionRef &Section) { // dispatches on section name
+    StringRef Name;
+    Section.getName(Name); // NOTE(review): the call's result is not checked
+    // Only "__jump_table" and "__pointers" get work; other names are ignored.
+    if (Name == "__jump_table")
+      populateJumpTable(cast<MachOObjectFile>(*ObjImg.getObjectFile()), Section,
+                        SectionID);
+    else if (Name == "__pointers")
+      populateIndirectSymbolPointersSection(
+                                 cast<MachOObjectFile>(*ObjImg.getObjectFile()),
+                                 Section, SectionID);
+  }
+
+private:
+  relocation_iterator // Handles a SECTDIFF pair: the entry at RelI and the next.
+  processSECTDIFFRelocation(unsigned SectionID, relocation_iterator RelI,
+                            ObjectImage &Obj,
+                            ObjSectionToIDMap &ObjSectionToID) {
+    const MachOObjectFile *MachO =
+        static_cast<const MachOObjectFile *>(Obj.getObjectFile());
+    MachO::any_relocation_info RE =
+        MachO->getRelocation(RelI->getRawDataRefImpl());
+    // Read the value currently stored at the relocation site (1 << Size bytes).
+    SectionEntry &Section = Sections[SectionID];
+    uint32_t RelocType = MachO->getAnyRelocationType(RE);
+    bool IsPCRel = MachO->getAnyRelocationPCRel(RE);
+    unsigned Size = MachO->getAnyRelocationLength(RE);
+    uint64_t Offset;
+    RelI->getOffset(Offset);
+    uint8_t *LocalAddress = Section.Address + Offset;
+    unsigned NumBytes = 1 << Size;
+    uint64_t Addend = readBytesUnaligned(LocalAddress, NumBytes);
+    // The following relocation entry supplies address B.
+    ++RelI;
+    MachO::any_relocation_info RE2 =
+        MachO->getRelocation(RelI->getRawDataRefImpl());
+    // Locate the section containing address A and the offset of A inside it.
+    uint32_t AddrA = MachO->getScatteredRelocationValue(RE);
+    section_iterator SAI = getSectionByAddress(*MachO, AddrA);
+    assert(SAI != MachO->section_end() && "Can't find section for address A");
+    uint64_t SectionABase = SAI->getAddress();
+    uint64_t SectionAOffset = AddrA - SectionABase;
+    SectionRef SectionA = *SAI;
+    bool IsCode = SectionA.isText();
+    uint32_t SectionAID =
+        findOrEmitSection(Obj, SectionA, IsCode, ObjSectionToID);
+    // Same lookup for address B.
+    uint32_t AddrB = MachO->getScatteredRelocationValue(RE2);
+    section_iterator SBI = getSectionByAddress(*MachO, AddrB);
+    assert(SBI != MachO->section_end() && "Can't find section for address B");
+    uint64_t SectionBBase = SBI->getAddress();
+    uint64_t SectionBOffset = AddrB - SectionBBase;
+    SectionRef SectionB = *SBI;
+    uint32_t SectionBID =
+        findOrEmitSection(Obj, SectionB, IsCode, ObjSectionToID); // IsCode is A's
+    // Only a stored value equal to A - B is accepted; the entry gets addend 0.
+    if (Addend != AddrA - AddrB)
+      Error("Unexpected SECTDIFF relocation addend.");
+
+    DEBUG(dbgs() << "Found SECTDIFF: AddrA: " << AddrA << ", AddrB: " << AddrB
+                 << ", Addend: " << Addend << ", SectionA ID: " << SectionAID
+                 << ", SectionAOffset: " << SectionAOffset
+                 << ", SectionB ID: " << SectionBID
+                 << ", SectionBOffset: " << SectionBOffset << "\n");
+    RelocationEntry R(SectionID, Offset, RelocType, 0, SectionAID,
+                      SectionAOffset, SectionBID, SectionBOffset, IsPCRel,
+                      Size);
+    // The same entry is registered against both sections.
+    addRelocationForSection(R, SectionAID);
+    addRelocationForSection(R, SectionBID);
+
+    return ++RelI; // steps past the second entry of the pair
+  }
+
+  relocation_iterator processI386ScatteredVANILLA( // one scattered VANILLA entry
+      unsigned SectionID, relocation_iterator RelI, ObjectImage &Obj,
+      RuntimeDyldMachO::ObjSectionToIDMap &ObjSectionToID) {
+    const MachOObjectFile *MachO =
+        static_cast<const MachOObjectFile *>(Obj.getObjectFile());
+    MachO::any_relocation_info RE =
+        MachO->getRelocation(RelI->getRawDataRefImpl());
+    // Read the value currently stored at the relocation site (1 << Size bytes).
+    SectionEntry &Section = Sections[SectionID];
+    uint32_t RelocType = MachO->getAnyRelocationType(RE);
+    bool IsPCRel = MachO->getAnyRelocationPCRel(RE);
+    unsigned Size = MachO->getAnyRelocationLength(RE);
+    uint64_t Offset;
+    RelI->getOffset(Offset);
+    uint8_t *LocalAddress = Section.Address + Offset;
+    unsigned NumBytes = 1 << Size;
+    int64_t Addend = readBytesUnaligned(LocalAddress, NumBytes);
+    // Find the section that contains the scattered relocation's value.
+    unsigned SymbolBaseAddr = MachO->getScatteredRelocationValue(RE);
+    section_iterator TargetSI = getSectionByAddress(*MachO, SymbolBaseAddr);
+    assert(TargetSI != MachO->section_end() && "Can't find section for symbol");
+    uint64_t SectionBaseAddr = TargetSI->getAddress();
+    SectionRef TargetSection = *TargetSI;
+    bool IsCode = TargetSection.isText();
+    uint32_t TargetSectionID =
+        findOrEmitSection(Obj, TargetSection, IsCode, ObjSectionToID);
+    // Make the addend relative to the start of the target section.
+    Addend -= SectionBaseAddr;
+    RelocationEntry R(SectionID, Offset, RelocType, Addend, IsPCRel, Size);
+
+    addRelocationForSection(R, TargetSectionID);
+
+    return ++RelI;
+  }
+
+  // Fill each __jump_table entry with a stub and a relocation to its symbol.
+  void populateJumpTable(MachOObjectFile &Obj, const SectionRef &JTSection,
+                         unsigned JTSectionID) {
+    assert(!Obj.is64Bit() &&
+           "__jump_table section not supported in 64-bit MachO.");
+    // Entry size and first indirect-symbol index come from reserved2/reserved1.
+    MachO::dysymtab_command DySymTabCmd = Obj.getDysymtabLoadCommand();
+    MachO::section Sec32 = Obj.getSection(JTSection.getRawDataRefImpl());
+    uint32_t JTSectionSize = Sec32.size;
+    unsigned FirstIndirectSymbol = Sec32.reserved1;
+    unsigned JTEntrySize = Sec32.reserved2;
+    unsigned NumJTEntries = JTSectionSize / JTEntrySize;
+    uint8_t *JTSectionAddr = getSectionAddress(JTSectionID);
+    unsigned JTEntryOffset = 0;
+    // NOTE(review): the division above runs before this check; zero is unchecked.
+    assert((JTSectionSize % JTEntrySize) == 0 &&
+           "Jump-table section does not contain a whole number of stubs?");
+    // Entry i is bound to indirect symbol FirstIndirectSymbol + i.
+    for (unsigned i = 0; i < NumJTEntries; ++i) {
+      unsigned SymbolIndex =
+          Obj.getIndirectSymbolTableEntry(DySymTabCmd, FirstIndirectSymbol + i);
+      symbol_iterator SI = Obj.getSymbolByIndex(SymbolIndex);
+      StringRef IndirectSymbolName;
+      SI->getName(IndirectSymbolName);
+      uint8_t *JTEntryAddr = JTSectionAddr + JTEntryOffset;
+      createStubFunction(JTEntryAddr);
+      RelocationEntry RE(JTSectionID, JTEntryOffset + 1, // one byte into the entry
+                         MachO::GENERIC_RELOC_VANILLA, 0, true, 2); // PC-rel, Size 2
+      addRelocationForSymbol(RE, IndirectSymbolName);
+      JTEntryOffset += JTEntrySize;
+    }
+  }
+
+};
+}
+
+#undef DEBUG_TYPE
+
+#endif

diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h
new file mode 100644
index 0000000..84d9e80
--- /dev/null
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOX86_64.h

@@ -0,0 +1,136 @@
+//===-- RuntimeDyldMachOX86_64.h ---- MachO/X86_64 specific code. -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOX86_64_H
+#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDMACHOX86_64_H
+
+#include "../RuntimeDyldMachO.h"
+
+#define DEBUG_TYPE "dyld"
+
+namespace llvm {
+
+class RuntimeDyldMachOX86_64
+    : public RuntimeDyldMachOCRTPBase<RuntimeDyldMachOX86_64> {
+public:
+
+  typedef uint64_t TargetPtrT;
+
+  RuntimeDyldMachOX86_64(RTDyldMemoryManager *MM)
+      : RuntimeDyldMachOCRTPBase(MM) {}
+
+  unsigned getMaxStubSize() override { return 8; } // matches the 8-byte GOT entry
+
+  unsigned getStubAlignment() override { return 1; } // always 1 for this class
+
+  relocation_iterator // Registers the relocation at RelI; returns the next one.
+  processRelocationRef(unsigned SectionID, relocation_iterator RelI,
+                       ObjectImage &ObjImg, ObjSectionToIDMap &ObjSectionToID,
+                       const SymbolTableMap &Symbols, StubMap &Stubs) override {
+    const MachOObjectFile &Obj =
+        static_cast<const MachOObjectFile &>(*ObjImg.getObjectFile());
+    MachO::any_relocation_info RelInfo =
+        Obj.getRelocation(RelI->getRawDataRefImpl());
+    // Scattered entries are rejected by assertion only.
+    assert(!Obj.isRelocationScattered(RelInfo) &&
+           "Scattered relocations not supported on X86_64");
+    // Build the entry, then look up what it refers to.
+    RelocationEntry RE(getRelocationEntry(SectionID, ObjImg, RelI));
+    RE.Addend = memcpyAddend(RE);
+    RelocationValueRef Value(
+        getRelocationValueRef(ObjImg, RelI, RE, ObjSectionToID, Symbols));
+    // The PC-relative addend adjustment applies to non-external entries only.
+    bool IsExtern = Obj.getPlainRelocationExternal(RelInfo);
+    if (!IsExtern && RE.IsPCRel)
+      makeValueAddendPCRel(Value, ObjImg, RelI, 1 << RE.Size);
+    // GOT and GOT_LOAD go through a GOT entry; everything else is queued as-is.
+    if (RE.RelType == MachO::X86_64_RELOC_GOT ||
+        RE.RelType == MachO::X86_64_RELOC_GOT_LOAD)
+      processGOTRelocation(RE, Value, Stubs);
+    else {
+      RE.Addend = Value.Offset;
+      if (Value.SymbolName)
+        addRelocationForSymbol(RE, Value.SymbolName);
+      else
+        addRelocationForSection(RE, Value.SectionID);
+    }
+
+    return ++RelI;
+  }
+
+  void resolveRelocation(const RelocationEntry &RE, uint64_t Value) override {
+    DEBUG(dumpRelocationToResolve(RE, Value));
+    const SectionEntry &Section = Sections[RE.SectionID];
+    uint8_t *LocalAddress = Section.Address + RE.Offset; // where bytes are written
+
+    // If the relocation is PC-relative, the value to be encoded is the
+    // pointer difference.
+    if (RE.IsPCRel) {
+      // FIXME: It seems this value needs to be adjusted by 4 for an effective
+      // PC address. Is that expected? Only for branches, perhaps?
+      uint64_t FinalAddress = Section.LoadAddress + RE.Offset;
+      Value -= FinalAddress + 4;
+    }
+    // The first group writes 1 << RE.Size bytes; the second group only errors.
+    switch (RE.RelType) {
+    default:
+      llvm_unreachable("Invalid relocation type!");
+    case MachO::X86_64_RELOC_SIGNED_1:
+    case MachO::X86_64_RELOC_SIGNED_2:
+    case MachO::X86_64_RELOC_SIGNED_4:
+    case MachO::X86_64_RELOC_SIGNED:
+    case MachO::X86_64_RELOC_UNSIGNED:
+    case MachO::X86_64_RELOC_BRANCH:
+      writeBytesUnaligned(Value + RE.Addend, LocalAddress, 1 << RE.Size);
+      break;
+    case MachO::X86_64_RELOC_GOT_LOAD:
+    case MachO::X86_64_RELOC_GOT:
+    case MachO::X86_64_RELOC_SUBTRACTOR:
+    case MachO::X86_64_RELOC_TLV:
+      Error("Relocation type not implemented yet!"); // last case; nothing written
+    }
+  }
+
+  void finalizeSection(ObjectImage &ObjImg, unsigned SectionID,
+                       const SectionRef &Section) {} // intentionally does nothing
+
+private:
+  void processGOTRelocation(const RelocationEntry &RE, // reuse or create GOT entry
+                            RelocationValueRef &Value, StubMap &Stubs) {
+    SectionEntry &Section = Sections[RE.SectionID];
+    assert(RE.IsPCRel);
+    assert(RE.Size == 2);
+    Value.Offset -= RE.Addend;
+    RuntimeDyldMachO::StubMap::const_iterator i = Stubs.find(Value);
+    uint8_t *Addr;
+    if (i != Stubs.end()) {
+      Addr = Section.Address + i->second; // entry already exists for this value
+    } else {
+      Stubs[Value] = Section.StubOffset; // new entry goes at the current StubOffset
+      uint8_t *GOTEntry = Section.Address + Section.StubOffset;
+      RelocationEntry GOTRE(RE.SectionID, Section.StubOffset,
+                            MachO::X86_64_RELOC_UNSIGNED, Value.Offset, false,
+                            3); // Size 3: non-PC-relative, 1 << 3 = 8 bytes
+      if (Value.SymbolName)
+        addRelocationForSymbol(GOTRE, Value.SymbolName);
+      else
+        addRelocationForSection(GOTRE, Value.SectionID);
+      Section.StubOffset += 8;
+      Addr = GOTEntry;
+    }
+    RelocationEntry TargetRE(RE.SectionID, RE.Offset, // resolved right away below
+                             MachO::X86_64_RELOC_UNSIGNED, RE.Addend, true, 2);
+    resolveRelocation(TargetRE, (uint64_t)Addr);
+  }
+};
+}
+
+#undef DEBUG_TYPE
+
+#endif

diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index b10d51f..e6679cf 100644
--- a/lib/ExecutionEngine/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp

@@ -30,7 +30,7 @@
 
   // MCJIT can generate code for remote targets, but the old JIT and Interpreter
   // must use the host architecture.
-  if (UseMCJIT && WhichEngine != EngineKind::Interpreter && M)
+  if (WhichEngine != EngineKind::Interpreter && M)
     TT.setTriple(M->getTargetTriple());
 
   return selectTarget(TT, MArch, MCPU, MAttrs);
@@ -89,8 +89,7 @@
   }
 
   // FIXME: non-iOS ARM FastISel is broken with MCJIT.
-  if (UseMCJIT &&
-      TheTriple.getArch() == Triple::arm &&
+  if (TheTriple.getArch() == Triple::arm &&
       !TheTriple.isiOS() &&
       OptLevel == CodeGenOpt::None) {
     OptLevel = CodeGenOpt::Less;

diff --git a/lib/IR/Android.mk b/lib/IR/Android.mk
index c51b241..a3632cf 100644
--- a/lib/IR/Android.mk
+++ b/lib/IR/Android.mk

@@ -41,6 +41,7 @@
   Type.cpp \
   TypeFinder.cpp \
   Use.cpp \
+  UseListOrder.cpp \
   User.cpp \
   Value.cpp \
   ValueSymbolTable.cpp \

diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index a7499bc..1961a20 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp

@@ -49,6 +49,213 @@
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
+namespace {
+struct OrderMap { // Value -> (ID, flag); the flag is set once predicted.
+  DenseMap<const Value *, std::pair<unsigned, bool>> IDs;
+
+  unsigned size() const { return IDs.size(); }
+  std::pair<unsigned, bool> &operator[](const Value *V) { return IDs[V]; }
+  std::pair<unsigned, bool> lookup(const Value *V) const {
+    return IDs.lookup(V);
+  }
+  void index(const Value *V) { // assigns the next ID, counting from 1
+    // Explicitly sequence get-size and insert-value operations to avoid UB.
+    unsigned ID = IDs.size() + 1;
+    IDs[V].first = ID;
+  }
+};
+}
+
+static void orderValue(const Value *V, OrderMap &OM) { // operands first, then V
+  if (OM.lookup(V).first)
+    return; // already has a non-zero ID
+  // Non-global constants recurse into operands, skipping blocks and globals.
+  if (const Constant *C = dyn_cast<Constant>(V))
+    if (C->getNumOperands() && !isa<GlobalValue>(C))
+      for (const Value *Op : C->operands())
+        if (!isa<BasicBlock>(Op) && !isa<GlobalValue>(Op))
+          orderValue(Op, OM);
+
+  // Note: we cannot cache this lookup above, since inserting into the map
+  // changes the map's size, and thus affects the other IDs.
+  OM.index(V);
+}
+
+static OrderMap orderModule(const Module *M) { // assigns an ID to module values
+  // This needs to match the order used by ValueEnumerator::ValueEnumerator()
+  // and ValueEnumerator::incorporateFunction().
+  OrderMap OM;
+  // Globals: a non-GlobalValue initializer is ordered before its variable.
+  for (const GlobalVariable &G : M->globals()) {
+    if (G.hasInitializer())
+      if (!isa<GlobalValue>(G.getInitializer()))
+        orderValue(G.getInitializer(), OM);
+    orderValue(&G, OM);
+  }
+  for (const GlobalAlias &A : M->aliases()) { // aliasee first, then the alias
+    if (!isa<GlobalValue>(A.getAliasee()))
+      orderValue(A.getAliasee(), OM);
+    orderValue(&A, OM);
+  }
+  for (const Function &F : *M) { // prefix data, the function, then its body
+    if (F.hasPrefixData())
+      if (!isa<GlobalValue>(F.getPrefixData()))
+        orderValue(F.getPrefixData(), OM);
+    orderValue(&F, OM);
+
+    if (F.isDeclaration())
+      continue;
+    // Body order: arguments, then each block followed by its instructions.
+    for (const Argument &A : F.args())
+      orderValue(&A, OM);
+    for (const BasicBlock &BB : F) {
+      orderValue(&BB, OM);
+      for (const Instruction &I : BB) {
+        for (const Value *Op : I.operands()) // constant/asm operands come first
+          if ((isa<Constant>(*Op) && !isa<GlobalValue>(*Op)) ||
+              isa<InlineAsm>(*Op))
+            orderValue(Op, OM);
+        orderValue(&I, OM);
+      }
+    }
+  }
+  return OM;
+}
+
+static void predictValueUseListOrderImpl(const Value *V, const Function *F,
+                                         unsigned ID, const OrderMap &OM,
+                                         UseListOrderStack &Stack) {
+  // Predict use-list order for this one.
+  typedef std::pair<const Use *, unsigned> Entry; // (use, current position)
+  SmallVector<Entry, 64> List;
+  for (const Use &U : V->uses())
+    // Check if this user will be serialized.
+    if (OM.lookup(U.getUser()).first)
+      List.push_back(std::make_pair(&U, List.size()));
+
+  if (List.size() < 2)
+    // We may have lost some users.
+    return;
+  // Globals, functions and blocks keep plain order; a BlockAddress uses its block's ID.
+  bool GetsReversed =
+      !isa<GlobalVariable>(V) && !isa<Function>(V) && !isa<BasicBlock>(V);
+  if (auto *BA = dyn_cast<BlockAddress>(V))
+    ID = OM.lookup(BA->getBasicBlock()).first;
+  std::sort(List.begin(), List.end(), [&](const Entry &L, const Entry &R) {
+    const Use *LU = L.first;
+    const Use *RU = R.first;
+    if (LU == RU)
+      return false;
+    // Uses are compared by the IDs of their users.
+    auto LID = OM.lookup(LU->getUser()).first;
+    auto RID = OM.lookup(RU->getUser()).first;
+
+    // If ID is 4, then expect: 7 6 5 1 2 3.
+    if (LID < RID) {
+      if (GetsReversed)
+        if (RID <= ID)
+          return true;
+      return false;
+    }
+    if (RID < LID) {
+      if (GetsReversed)
+        if (LID <= ID)
+          return false;
+      return true;
+    }
+
+    // LID and RID are equal, so we have different operands of the same user.
+    // Assume operands are added in order for all instructions.
+    if (GetsReversed)
+      if (LID <= ID)
+        return LU->getOperandNo() < RU->getOperandNo();
+    return LU->getOperandNo() > RU->getOperandNo();
+  });
+  // If the sort left the original positions ascending, no shuffle is recorded.
+  if (std::is_sorted(
+          List.begin(), List.end(),
+          [](const Entry &L, const Entry &R) { return L.second < R.second; }))
+    // Order is already correct.
+    return;
+
+  // Store the shuffle.
+  Stack.emplace_back(V, F, List.size());
+  assert(List.size() == Stack.back().Shuffle.size() && "Wrong size");
+  for (size_t I = 0, E = List.size(); I != E; ++I)
+    Stack.back().Shuffle[I] = List[I].second; // original position of sorted use I
+}
+
+static void predictValueUseListOrder(const Value *V, const Function *F,
+                                     OrderMap &OM, UseListOrderStack &Stack) {
+  auto &IDPair = OM[V]; // (ID, already-predicted flag)
+  assert(IDPair.first && "Unmapped value");
+  if (IDPair.second)
+    // Already predicted.
+    return;
+
+  // Do the actual prediction.
+  IDPair.second = true;
+  if (!V->use_empty() && std::next(V->use_begin()) != V->use_end()) // >= 2 uses
+    predictValueUseListOrderImpl(V, F, IDPair.first, OM, Stack);
+
+  // Recursive descent into constants.
+  if (const Constant *C = dyn_cast<Constant>(V))
+    if (C->getNumOperands()) // Visit GlobalValues.
+      for (const Value *Op : C->operands())
+        if (isa<Constant>(Op)) // Visit GlobalValues.
+          predictValueUseListOrder(Op, F, OM, Stack);
+}
+
+static UseListOrderStack predictUseListOrder(const Module *M) {
+  OrderMap OM = orderModule(M); // IDs for every value that gets ordered
+
+  // Use-list orders need to be serialized after all the users have been added
+  // to a value, or else the shuffles will be incomplete.  Store them per
+  // function in a stack.
+  //
+  // Aside from function order, the order of values doesn't matter much here.
+  UseListOrderStack Stack;
+
+  // We want to visit the functions backward now so we can list function-local
+  // constants in the last Function they're used in.  Module-level constants
+  // have already been visited above.
+  for (auto I = M->rbegin(), E = M->rend(); I != E; ++I) {
+    const Function &F = *I;
+    if (F.isDeclaration())
+      continue;
+    for (const BasicBlock &BB : F)
+      predictValueUseListOrder(&BB, &F, OM, Stack);
+    for (const Argument &A : F.args())
+      predictValueUseListOrder(&A, &F, OM, Stack);
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB) // NOTE(review): shadows the outer iterator I
+        for (const Value *Op : I.operands())
+          if (isa<Constant>(*Op) || isa<InlineAsm>(*Op)) // Visit GlobalValues.
+            predictValueUseListOrder(Op, &F, OM, Stack);
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB) // NOTE(review): shadows the outer iterator I
+        predictValueUseListOrder(&I, &F, OM, Stack);
+  }
+
+  // Visit globals last.
+  for (const GlobalVariable &G : M->globals())
+    predictValueUseListOrder(&G, nullptr, OM, Stack);
+  for (const Function &F : *M)
+    predictValueUseListOrder(&F, nullptr, OM, Stack);
+  for (const GlobalAlias &A : M->aliases())
+    predictValueUseListOrder(&A, nullptr, OM, Stack);
+  for (const GlobalVariable &G : M->globals())
+    if (G.hasInitializer())
+      predictValueUseListOrder(G.getInitializer(), nullptr, OM, Stack);
+  for (const GlobalAlias &A : M->aliases())
+    predictValueUseListOrder(A.getAliasee(), nullptr, OM, Stack);
+  for (const Function &F : *M)
+    if (F.hasPrefixData())
+      predictValueUseListOrder(F.getPrefixData(), nullptr, OM, Stack);
+
+  return Stack; // module-level entries (F == nullptr) were pushed last
+}
+
 static const Module *getModuleFromVal(const Value *V) {
   if (const Argument *MA = dyn_cast<Argument>(V))
     return MA->getParent() ? MA->getParent()->getParent() : nullptr;
@@ -78,6 +285,7 @@
   case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
   case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
   case CallingConv::X86_ThisCall:  Out << "x86_thiscallcc"; break;
+  case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
   case CallingConv::Intel_OCL_BI:  Out << "intel_ocl_bicc"; break;
   case CallingConv::ARM_APCS:      Out << "arm_apcscc"; break;
   case CallingConv::ARM_AAPCS:     Out << "arm_aapcscc"; break;
@@ -347,6 +555,8 @@
     FunctionProcessed = false;
   }
 
+  const Function *getFunction() const { return TheFunction; }
+
   /// After calling incorporateFunction, use this method to remove the
   /// most recently incorporated function from the SlotTracker. This
   /// will reset the state of the machine back to just the module contents.
@@ -508,7 +718,7 @@
 
   ST_DEBUG("Inserting Instructions:\n");
 
-  SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst;
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDForInst;
 
   // Add all of the basic blocks and instructions with no names.
   for (Function::const_iterator BB = TheFunction->begin(),
@@ -1279,6 +1489,9 @@
 void AssemblyWriter::printModule(const Module *M) {
   Machine.initialize();
 
+  if (shouldPreserveAssemblyUseListOrder())
+    UseListOrders = predictUseListOrder(M);
+
   if (!M->getModuleIdentifier().empty() &&
       // Don't print the ID if it will start a new line (which would
       // require a comment char before it).
@@ -1339,9 +1552,13 @@
        I != E; ++I)
     printAlias(I);
 
+  // Output global use-lists.
+  printUseLists(nullptr);
+
   // Output all of the functions.
   for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
     printFunction(I);
+  assert(UseListOrders.empty() && "All use-lists should have been consumed");
 
   // Output all attribute groups.
   if (!Machine.as_empty()) {
@@ -1509,6 +1726,7 @@
     PrintLLVMName(Out, GA);
     Out << " = ";
   }
+  PrintLinkage(GA->getLinkage(), Out);
   PrintVisibility(GA->getVisibility(), Out);
   PrintDLLStorageClass(GA->getDLLStorageClass(), Out);
   PrintThreadLocalModel(GA->getThreadLocalMode(), Out);
@@ -1517,8 +1735,6 @@
 
   Out << "alias ";
 
-  PrintLinkage(GA->getLinkage(), Out);
-
   const Constant *Aliasee = GA->getAliasee();
 
   if (!Aliasee) {
@@ -1693,6 +1909,9 @@
     for (Function::const_iterator I = F->begin(), E = F->end(); I != E; ++I)
       printBasicBlock(I);
 
+    // Output the function's use-lists.
+    printUseLists(F);
+
     Out << "}\n";
   }
 
@@ -1956,6 +2175,14 @@
         Out << ", ";
       writeParamOperand(CI->getArgOperand(op), PAL, op + 1);
     }
+
+    // Emit an ellipsis if this is a musttail call in a vararg function.  This
+    // is only to aid readability, musttail calls forward varargs by default.
+    if (CI->isMustTailCall() && CI->getParent() &&
+        CI->getParent()->getParent() &&
+        CI->getParent()->getParent()->isVarArg())
+      Out << ", ...";
+
     Out << ')';
     if (PAL.hasAttributes(AttributeSet::FunctionIndex))
       Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
@@ -2088,7 +2315,7 @@
   }
 
   // Print Metadata info.
-  SmallVector<std::pair<unsigned, MDNode*>, 4> InstMD;
+  SmallVector<std::pair<unsigned, MDNode *>, 4> InstMD;
   I.getAllMetadata(InstMD);
   if (!InstMD.empty()) {
     SmallVector<StringRef, 8> MDNames;
@@ -2114,7 +2341,7 @@
     return;
 
   Value *Op = Node->getOperand(0);
-  if (!Op || !isa<ConstantInt>(Op) || cast<ConstantInt>(Op)->getBitWidth() < 32)
+  if (!Op || !isa<MDString>(Op))
     return;
 
   DIDescriptor Desc(Node);
@@ -2170,6 +2397,45 @@
 
 } // namespace llvm
 
+void AssemblyWriter::printUseListOrder(const UseListOrder &Order) {
+  bool IsInFunction = Machine.getFunction(); // true while a function is set
+  if (IsInFunction)
+    Out << "  ";
+  // Outside a function, a basic block is written as "uselistorder_bb fn, bb".
+  Out << "uselistorder";
+  if (const BasicBlock *BB =
+          IsInFunction ? nullptr : dyn_cast<BasicBlock>(Order.V)) {
+    Out << "_bb ";
+    writeOperand(BB->getParent(), false);
+    Out << ", ";
+    writeOperand(BB, false);
+  } else {
+    Out << " ";
+    writeOperand(Order.V, true);
+  }
+  Out << ", { ";
+  // The shuffle indexes are printed comma-separated inside the braces.
+  assert(Order.Shuffle.size() >= 2 && "Shuffle too small");
+  Out << Order.Shuffle[0];
+  for (unsigned I = 1, E = Order.Shuffle.size(); I != E; ++I)
+    Out << ", " << Order.Shuffle[I];
+  Out << " }\n";
+}
+
+void AssemblyWriter::printUseLists(const Function *F) { // F is null at module level
+  auto hasMore =
+      [&]() { return !UseListOrders.empty() && UseListOrders.back().F == F; };
+  if (!hasMore())
+    // Nothing to do.
+    return;
+  // Print and pop stack entries for as long as the top one belongs to F.
+  Out << "\n; uselistorder directives\n";
+  while (hasMore()) {
+    printUseListOrder(UseListOrders.back());
+    UseListOrders.pop_back();
+  }
+}
+
 //===----------------------------------------------------------------------===//
 //                       External Interface declarations
 //===----------------------------------------------------------------------===//
@@ -2291,7 +2557,7 @@
 void Value::dump() const { print(dbgs()); dbgs() << '\n'; }
 
 // Type::dump - allow easy printing of Types from the debugger.
-void Type::dump() const { print(dbgs()); }
+void Type::dump() const { print(dbgs()); dbgs() << '\n'; }
 
 // Module::dump() - Allow printing of Modules from the debugger.
 void Module::dump() const { print(dbgs(), nullptr); }

diff --git a/lib/IR/AsmWriter.h b/lib/IR/AsmWriter.h
index aef9c8a..60da5ad 100644
--- a/lib/IR/AsmWriter.h
+++ b/lib/IR/AsmWriter.h

@@ -12,14 +12,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_IR_ASSEMBLYWRITER_H
-#define LLVM_IR_ASSEMBLYWRITER_H
+#ifndef LLVM_LIB_IR_ASMWRITER_H
+#define LLVM_LIB_IR_ASMWRITER_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/UseListOrder.h"
 #include "llvm/Support/FormattedStream.h"
 
 namespace llvm {
@@ -73,6 +74,7 @@
   TypePrinting TypePrinter;
   AssemblyAnnotationWriter *AnnotationWriter;
   SetVector<const Comdat *> Comdats;
+  UseListOrderStack UseListOrders;
 
 public:
   /// Construct an AssemblyWriter with an external SlotTracker
@@ -111,6 +113,9 @@
   void printInstructionLine(const Instruction &I);
   void printInstruction(const Instruction &I);
 
+  void printUseListOrder(const UseListOrder &Order);
+  void printUseLists(const Function *F);
+
 private:
   void init();
 
@@ -121,4 +126,4 @@
 
 } // namespace llvm
 
-#endif //LLVM_IR_ASMWRITER_H
+#endif

diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index 9f3fd3e..0448dc1 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h

@@ -13,8 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ATTRIBUTESIMPL_H
-#define LLVM_ATTRIBUTESIMPL_H
+#ifndef LLVM_LIB_IR_ATTRIBUTEIMPL_H
+#define LLVM_LIB_IR_ATTRIBUTEIMPL_H
 
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/IR/Attributes.h"
@@ -39,7 +39,7 @@
 protected:
   enum AttrEntryKind {
     EnumAttrEntry,
-    AlignAttrEntry,
+    IntAttrEntry,
     StringAttrEntry
   };
 
@@ -49,7 +49,7 @@
   virtual ~AttributeImpl();
 
   bool isEnumAttribute() const { return KindID == EnumAttrEntry; }
-  bool isAlignAttribute() const { return KindID == AlignAttrEntry; }
+  bool isIntAttribute() const { return KindID == IntAttrEntry; }
   bool isStringAttribute() const { return KindID == StringAttrEntry; }
 
   bool hasAttribute(Attribute::AttrKind A) const;
@@ -67,7 +67,7 @@
   void Profile(FoldingSetNodeID &ID) const {
     if (isEnumAttribute())
       Profile(ID, getKindAsEnum(), 0);
-    else if (isAlignAttribute())
+    else if (isIntAttribute())
       Profile(ID, getKindAsEnum(), getValueAsInt());
     else
       Profile(ID, getKindAsString(), getValueAsString());
@@ -108,19 +108,20 @@
   Attribute::AttrKind getEnumKind() const { return Kind; }
 };
 
-class AlignAttributeImpl : public EnumAttributeImpl {
+class IntAttributeImpl : public EnumAttributeImpl {
   void anchor() override;
-  unsigned Align;
+  uint64_t Val;
 
 public:
-  AlignAttributeImpl(Attribute::AttrKind Kind, unsigned Align)
-      : EnumAttributeImpl(AlignAttrEntry, Kind), Align(Align) {
+  IntAttributeImpl(Attribute::AttrKind Kind, uint64_t Val)
+      : EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) {
     assert(
-        (Kind == Attribute::Alignment || Kind == Attribute::StackAlignment) &&
-        "Wrong kind for alignment attribute!");
+        (Kind == Attribute::Alignment || Kind == Attribute::StackAlignment ||
+         Kind == Attribute::Dereferenceable) &&
+        "Wrong kind for int attribute!");
   }
 
-  unsigned getAlignment() const { return Align; }
+  uint64_t getValue() const { return Val; }
 };
 
 class StringAttributeImpl : public AttributeImpl {
@@ -164,6 +165,7 @@
 
   unsigned getAlignment() const;
   unsigned getStackAlignment() const;
+  uint64_t getDereferenceableBytes() const;
   std::string getAsString(bool InAttrGrp) const;
 
   typedef const Attribute *iterator;

diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 48a2ce8..04545ea 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp

@@ -47,7 +47,7 @@
     if (!Val)
       PA = new EnumAttributeImpl(Kind);
     else
-      PA = new AlignAttributeImpl(Kind, Val);
+      PA = new IntAttributeImpl(Kind, Val);
     pImpl->AttrsSet.InsertNode(PA, InsertPoint);
   }
 
@@ -88,6 +88,12 @@
   return get(Context, StackAlignment, Align);
 }
 
+Attribute Attribute::getWithDereferenceableBytes(LLVMContext &Context,
+                                                uint64_t Bytes) {
+  assert(Bytes && "Bytes must be non-zero.");
+  return get(Context, Dereferenceable, Bytes);
+}
+
 //===----------------------------------------------------------------------===//
 // Attribute Accessor Methods
 //===----------------------------------------------------------------------===//
@@ -96,8 +102,8 @@
   return pImpl && pImpl->isEnumAttribute();
 }
 
-bool Attribute::isAlignAttribute() const {
-  return pImpl && pImpl->isAlignAttribute();
+bool Attribute::isIntAttribute() const {
+  return pImpl && pImpl->isIntAttribute();
 }
 
 bool Attribute::isStringAttribute() const {
@@ -106,15 +112,15 @@
 
 Attribute::AttrKind Attribute::getKindAsEnum() const {
   if (!pImpl) return None;
-  assert((isEnumAttribute() || isAlignAttribute()) &&
+  assert((isEnumAttribute() || isIntAttribute()) &&
          "Invalid attribute type to get the kind as an enum!");
   return pImpl ? pImpl->getKindAsEnum() : None;
 }
 
 uint64_t Attribute::getValueAsInt() const {
   if (!pImpl) return 0;
-  assert(isAlignAttribute() &&
-         "Expected the attribute to be an alignment attribute!");
+  assert(isIntAttribute() &&
+         "Expected the attribute to be an integer attribute!");
   return pImpl ? pImpl->getValueAsInt() : 0;
 }
 
@@ -156,6 +162,14 @@
   return pImpl->getValueAsInt();
 }
 
+/// This returns the number of dereferenceable bytes.
+uint64_t Attribute::getDereferenceableBytes() const {
+  assert(hasAttribute(Attribute::Dereferenceable) &&
+         "Trying to get dereferenceable bytes from "
+         "non-dereferenceable attribute!");
+  return pImpl->getValueAsInt();
+}
+
 std::string Attribute::getAsString(bool InAttrGrp) const {
   if (!pImpl) return "";
 
@@ -263,6 +277,20 @@
     return Result;
   }
 
+  if (hasAttribute(Attribute::Dereferenceable)) {
+    std::string Result;
+    Result += "dereferenceable";
+    if (InAttrGrp) {
+      Result += "=";
+      Result += utostr(getValueAsInt());
+    } else {
+      Result += "(";
+      Result += utostr(getValueAsInt());
+      Result += ")";
+    }
+    return Result;
+  }
+
   // Convert target-dependent attributes to strings of the form:
   //
   //   "kind"
@@ -296,7 +324,7 @@
 // Pin the vtables to this file.
 AttributeImpl::~AttributeImpl() {}
 void EnumAttributeImpl::anchor() {}
-void AlignAttributeImpl::anchor() {}
+void IntAttributeImpl::anchor() {}
 void StringAttributeImpl::anchor() {}
 
 bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
@@ -310,13 +338,13 @@
 }
 
 Attribute::AttrKind AttributeImpl::getKindAsEnum() const {
-  assert(isEnumAttribute() || isAlignAttribute());
+  assert(isEnumAttribute() || isIntAttribute());
   return static_cast<const EnumAttributeImpl *>(this)->getEnumKind();
 }
 
 uint64_t AttributeImpl::getValueAsInt() const {
-  assert(isAlignAttribute());
-  return static_cast<const AlignAttributeImpl *>(this)->getAlignment();
+  assert(isIntAttribute());
+  return static_cast<const IntAttributeImpl *>(this)->getValue();
 }
 
 StringRef AttributeImpl::getKindAsString() const {
@@ -334,18 +362,18 @@
   // relative to their enum value) and then strings.
   if (isEnumAttribute()) {
     if (AI.isEnumAttribute()) return getKindAsEnum() < AI.getKindAsEnum();
-    if (AI.isAlignAttribute()) return true;
+    if (AI.isIntAttribute()) return true;
     if (AI.isStringAttribute()) return true;
   }
 
-  if (isAlignAttribute()) {
+  if (isIntAttribute()) {
     if (AI.isEnumAttribute()) return false;
-    if (AI.isAlignAttribute()) return getValueAsInt() < AI.getValueAsInt();
+    if (AI.isIntAttribute()) return getValueAsInt() < AI.getValueAsInt();
     if (AI.isStringAttribute()) return true;
   }
 
   if (AI.isEnumAttribute()) return false;
-  if (AI.isAlignAttribute()) return false;
+  if (AI.isIntAttribute()) return false;
   if (getKindAsString() == AI.getKindAsString())
     return getValueAsString() < AI.getValueAsString();
   return getKindAsString() < AI.getKindAsString();
@@ -398,6 +426,8 @@
   case Attribute::InAlloca:        return 1ULL << 43;
   case Attribute::NonNull:         return 1ULL << 44;
   case Attribute::JumpTable:       return 1ULL << 45;
+  case Attribute::Dereferenceable:
+    llvm_unreachable("dereferenceable attribute not supported in raw format");
   }
   llvm_unreachable("Unsupported attribute type");
 }
@@ -482,6 +512,13 @@
   return 0;
 }
 
+uint64_t AttributeSetNode::getDereferenceableBytes() const {
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    if (I->hasAttribute(Attribute::Dereferenceable))
+      return I->getDereferenceableBytes();
+  return 0;
+}
+
 std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
   std::string Str;
   for (iterator I = begin(), E = end(); I != E; ++I) {
@@ -515,6 +552,8 @@
         Mask |= (Log2_32(ASN->getAlignment()) + 1) << 16;
       else if (Kind == Attribute::StackAlignment)
         Mask |= (Log2_32(ASN->getStackAlignment()) + 1) << 26;
+      else if (Kind == Attribute::Dereferenceable)
+        llvm_unreachable("dereferenceable not supported in bit mask");
       else
         Mask |= AttributeImpl::getAttrMask(Kind);
     }
@@ -620,6 +659,10 @@
     else if (Kind == Attribute::StackAlignment)
       Attrs.push_back(std::make_pair(Index, Attribute::
                               getWithStackAlignment(C, B.getStackAlignment())));
+    else if (Kind == Attribute::Dereferenceable)
+      Attrs.push_back(std::make_pair(Index,
+                                     Attribute::getWithDereferenceableBytes(C,
+                                       B.getDereferenceableBytes())));
     else
       Attrs.push_back(std::make_pair(Index, Attribute::get(C, Kind)));
   }
@@ -877,6 +920,11 @@
   return ASN ? ASN->getStackAlignment() : 0;
 }
 
+uint64_t AttributeSet::getDereferenceableBytes(unsigned Index) const {
+  AttributeSetNode *ASN = getAttributes(Index);
+  return ASN ? ASN->getDereferenceableBytes() : 0;
+}
+
 std::string AttributeSet::getAsString(unsigned Index,
                                       bool InAttrGrp) const {
   AttributeSetNode *ASN = getAttributes(Index);
@@ -956,7 +1004,7 @@
 //===----------------------------------------------------------------------===//
 
 AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index)
-  : Attrs(0), Alignment(0), StackAlignment(0) {
+  : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0) {
   AttributeSetImpl *pImpl = AS.pImpl;
   if (!pImpl) return;
 
@@ -973,13 +1021,14 @@
 
 void AttrBuilder::clear() {
   Attrs.reset();
-  Alignment = StackAlignment = 0;
+  Alignment = StackAlignment = DerefBytes = 0;
 }
 
 AttrBuilder &AttrBuilder::addAttribute(Attribute::AttrKind Val) {
   assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!");
   assert(Val != Attribute::Alignment && Val != Attribute::StackAlignment &&
-         "Adding alignment attribute without adding alignment value!");
+         Val != Attribute::Dereferenceable &&
+         "Adding integer attribute without adding a value!");
   Attrs[Val] = true;
   return *this;
 }
@@ -997,6 +1046,8 @@
     Alignment = Attr.getAlignment();
   else if (Kind == Attribute::StackAlignment)
     StackAlignment = Attr.getStackAlignment();
+  else if (Kind == Attribute::Dereferenceable)
+    DerefBytes = Attr.getDereferenceableBytes();
   return *this;
 }
 
@@ -1013,6 +1064,8 @@
     Alignment = 0;
   else if (Val == Attribute::StackAlignment)
     StackAlignment = 0;
+  else if (Val == Attribute::Dereferenceable)
+    DerefBytes = 0;
 
   return *this;
 }
@@ -1029,7 +1082,7 @@
 
   for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
     Attribute Attr = *I;
-    if (Attr.isEnumAttribute() || Attr.isAlignAttribute()) {
+    if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
       Attribute::AttrKind Kind = I->getKindAsEnum();
       Attrs[Kind] = false;
 
@@ -1037,6 +1090,8 @@
         Alignment = 0;
       else if (Kind == Attribute::StackAlignment)
         StackAlignment = 0;
+      else if (Kind == Attribute::Dereferenceable)
+        DerefBytes = 0;
     } else {
       assert(Attr.isStringAttribute() && "Invalid attribute type!");
       std::map<std::string, std::string>::iterator
@@ -1079,6 +1134,14 @@
   return *this;
 }
 
+AttrBuilder &AttrBuilder::addDereferenceableAttr(uint64_t Bytes) {
+  if (Bytes == 0) return *this;
+
+  Attrs[Attribute::Dereferenceable] = true;
+  DerefBytes = Bytes;
+  return *this;
+}
+
 AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
   // FIXME: What if both have alignments, but they don't match?!
   if (!Alignment)
@@ -1087,6 +1150,9 @@
   if (!StackAlignment)
     StackAlignment = B.StackAlignment;
 
+  if (!DerefBytes)
+    DerefBytes = B.DerefBytes;
+
   Attrs |= B.Attrs;
 
   for (td_const_iterator I = B.TargetDepAttrs.begin(),
@@ -1117,7 +1183,7 @@
   for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot);
        I != E; ++I) {
     Attribute Attr = *I;
-    if (Attr.isEnumAttribute() || Attr.isAlignAttribute()) {
+    if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
       if (Attrs[I->getKindAsEnum()])
         return true;
     } else {
@@ -1142,7 +1208,8 @@
     if (B.TargetDepAttrs.find(I->first) == B.TargetDepAttrs.end())
       return false;
 
-  return Alignment == B.Alignment && StackAlignment == B.StackAlignment;
+  return Alignment == B.Alignment && StackAlignment == B.StackAlignment &&
+         DerefBytes == B.DerefBytes;
 }
 
 AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) {
@@ -1151,6 +1218,8 @@
 
   for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds;
        I = Attribute::AttrKind(I + 1)) {
+    if (I == Attribute::Dereferenceable)
+      continue;
     if (uint64_t A = (Val & AttributeImpl::getAttrMask(I))) {
       Attrs[I] = true;
  
@@ -1184,6 +1253,7 @@
       .addAttribute(Attribute::NoAlias)
       .addAttribute(Attribute::NoCapture)
       .addAttribute(Attribute::NonNull)
+      .addDereferenceableAttr(1) // the int here is ignored
       .addAttribute(Attribute::ReadNone)
       .addAttribute(Attribute::ReadOnly)
       .addAttribute(Attribute::StructRet)

diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 6554b3c..c24dfea 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp

@@ -17,6 +17,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
@@ -43,6 +44,22 @@
   return true;
 }
 
+// Upgrade the declarations of intrinsic functions whose 8-bit immediate mask
+// arguments have changed their type from i32 to i8.
+static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID,
+                                             Function *&NewFn) {
+  // Check that the last argument is an i32.
+  Type *LastArgType = F->getFunctionType()->getParamType(
+     F->getFunctionType()->getNumParams() - 1);
+  if (!LastArgType->isIntegerTy(32))
+    return false;
+
+  // Move this function aside and map down.
+  F->setName(F->getName() + ".old");
+  NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+  return true;
+}
+
 static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
   assert(F && "Illegal to upgrade a non-existent Function.");
 
@@ -90,6 +107,20 @@
     }
     break;
   }
+  case 'd': {
+    if (Name.startswith("dbg.declare") && F->arg_size() == 2) {
+      F->setName(Name + ".old");
+      NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::dbg_declare);
+      return true;
+    }
+    if (Name.startswith("dbg.value") && F->arg_size() == 3) {
+      F->setName(Name + ".old");
+      NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::dbg_value);
+      return true;
+    }
+    break;
+  }
+
   case 'o':
     // We only need to change the name to match the mangling including the
     // address space.
@@ -130,6 +161,51 @@
       if (Name == "x86.sse41.ptestnzc")
         return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
     }
+    // Several blend and other instructions with masks used the wrong number of
+    // bits.
+    if (Name == "x86.sse41.pblendw")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_pblendw,
+                                              NewFn);
+    if (Name == "x86.sse41.blendpd")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_blendpd,
+                                              NewFn);
+    if (Name == "x86.sse41.blendps")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_blendps,
+                                              NewFn);
+    if (Name == "x86.sse41.insertps")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_insertps,
+                                              NewFn);
+    if (Name == "x86.sse41.dppd")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dppd,
+                                              NewFn);
+    if (Name == "x86.sse41.dpps")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dpps,
+                                              NewFn);
+    if (Name == "x86.sse41.mpsadbw")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,
+                                              NewFn);
+    if (Name == "x86.avx.blend.pd.256")
+      return UpgradeX86IntrinsicsWith8BitMask(
+          F, Intrinsic::x86_avx_blend_pd_256, NewFn);
+    if (Name == "x86.avx.blend.ps.256")
+      return UpgradeX86IntrinsicsWith8BitMask(
+          F, Intrinsic::x86_avx_blend_ps_256, NewFn);
+    if (Name == "x86.avx.dp.ps.256")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,
+                                              NewFn);
+    if (Name == "x86.avx2.pblendw")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_pblendw,
+                                              NewFn);
+    if (Name == "x86.avx2.pblendd.128")
+      return UpgradeX86IntrinsicsWith8BitMask(
+          F, Intrinsic::x86_avx2_pblendd_128, NewFn);
+    if (Name == "x86.avx2.pblendd.256")
+      return UpgradeX86IntrinsicsWith8BitMask(
+          F, Intrinsic::x86_avx2_pblendd_256, NewFn);
+    if (Name == "x86.avx2.mpsadbw")
+      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
+                                              NewFn);
+
     // frcz.ss/sd may need to have an argument dropped
     if (Name.startswith("x86.xop.vfrcz.ss") && F->arg_size() == 2) {
       F->setName(Name + ".old");
@@ -173,66 +249,27 @@
   return Upgraded;
 }
 
-static bool UpgradeGlobalStructors(GlobalVariable *GV) {
-  ArrayType *ATy = dyn_cast<ArrayType>(GV->getType()->getElementType());
-  StructType *OldTy =
-      ATy ? dyn_cast<StructType>(ATy->getElementType()) : nullptr;
-
-  // Only upgrade an array of a two field struct with the appropriate field
-  // types.
-  if (!OldTy || OldTy->getNumElements() != 2)
-    return false;
-
-  // Get the upgraded 3 element type.
-  PointerType *VoidPtrTy = Type::getInt8Ty(GV->getContext())->getPointerTo();
-  Type *Tys[3] = {
-    OldTy->getElementType(0),
-    OldTy->getElementType(1),
-    VoidPtrTy
-  };
-  StructType *NewTy =
-      StructType::get(GV->getContext(), Tys, /*isPacked=*/false);
-
-  // Build new constants with a null third field filled in.
-  Constant *OldInitC = GV->getInitializer();
-  ConstantArray *OldInit = dyn_cast<ConstantArray>(OldInitC);
-  if (!OldInit && !isa<ConstantAggregateZero>(OldInitC))
-    return false;
-  std::vector<Constant *> Initializers;
-  if (OldInit) {
-    for (Use &U : OldInit->operands()) {
-      ConstantStruct *Init = cast<ConstantStruct>(&U);
-      Constant *NewInit =
-        ConstantStruct::get(NewTy, Init->getOperand(0), Init->getOperand(1),
-                            Constant::getNullValue(VoidPtrTy), nullptr);
-      Initializers.push_back(NewInit);
-    }
-  }
-  assert(Initializers.size() == ATy->getNumElements());
-
-  // Replace the old GV with a new one.
-  ATy = ArrayType::get(NewTy, Initializers.size());
-  Constant *NewInit = ConstantArray::get(ATy, Initializers);
-  GlobalVariable *NewGV = new GlobalVariable(
-      *GV->getParent(), ATy, GV->isConstant(), GV->getLinkage(), NewInit, "",
-      GV, GV->getThreadLocalMode(), GV->getType()->getAddressSpace(),
-      GV->isExternallyInitialized());
-  NewGV->copyAttributesFrom(GV);
-  NewGV->takeName(GV);
-  assert(GV->use_empty() && "program cannot use initializer list");
-  GV->eraseFromParent();
-  return true;
-}
-
 bool llvm::UpgradeGlobalVariable(GlobalVariable *GV) {
-  if (GV->getName() == "llvm.global_ctors" ||
-      GV->getName() == "llvm.global_dtors")
-    return UpgradeGlobalStructors(GV);
-
   // Nothing to do yet.
   return false;
 }
 
+static MDNode *getNodeField(const MDNode *DbgNode, unsigned Elt) {
+  if (!DbgNode || Elt >= DbgNode->getNumOperands())
+    return nullptr;
+  return dyn_cast_or_null<MDNode>(DbgNode->getOperand(Elt));
+}
+
+static DIExpression getExpression(Value *VarOperand, Function *F) {
+  // Old-style DIVariables have an optional expression as the 8th element.
+  DIExpression Expr(getNodeField(cast<MDNode>(VarOperand), 8));
+  if (!Expr) {
+    DIBuilder DIB(*F->getParent());
+    Expr = DIB.createExpression();
+  }
+  return Expr;
+}
+
 // UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
 // upgraded intrinsic. All argument and return casting must be provided in
 // order to seamlessly integrate with existing context.
@@ -396,12 +433,32 @@
   }
 
   std::string Name = CI->getName().str();
-  CI->setName(Name + ".old");
+  if (!Name.empty())
+    CI->setName(Name + ".old");
 
   switch (NewFn->getIntrinsicID()) {
   default:
     llvm_unreachable("Unknown function for CallInst upgrade.");
 
+  // Upgrade debug intrinsics to use an additional DIExpression argument.
+  case Intrinsic::dbg_declare: {
+    auto NewCI =
+        Builder.CreateCall3(NewFn, CI->getArgOperand(0), CI->getArgOperand(1),
+                            getExpression(CI->getArgOperand(1), F), Name);
+    NewCI->setDebugLoc(CI->getDebugLoc());
+    CI->replaceAllUsesWith(NewCI);
+    CI->eraseFromParent();
+    return;
+  }
+  case Intrinsic::dbg_value: {
+    auto NewCI = Builder.CreateCall4(
+        NewFn, CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2),
+        getExpression(CI->getArgOperand(2), F), Name);
+    NewCI->setDebugLoc(CI->getDebugLoc());
+    CI->replaceAllUsesWith(NewCI);
+    CI->eraseFromParent();
+    return;
+  }
   case Intrinsic::ctlz:
   case Intrinsic::cttz:
     assert(CI->getNumArgOperands() == 1 &&
@@ -419,14 +476,6 @@
     CI->eraseFromParent();
     return;
 
-  case Intrinsic::arm_neon_vclz: {
-    // Change name from llvm.arm.neon.vclz.* to llvm.ctlz.*
-    CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0),
-                                               Builder.getFalse(),
-                                               "llvm.ctlz." + Name.substr(14)));
-    CI->eraseFromParent();
-    return;
-  }
   case Intrinsic::ctpop: {
     CI->replaceAllUsesWith(Builder.CreateCall(NewFn, CI->getArgOperand(0)));
     CI->eraseFromParent();
@@ -468,6 +517,34 @@
     CI->eraseFromParent();
     return;
   }
+
+  case Intrinsic::x86_sse41_pblendw:
+  case Intrinsic::x86_sse41_blendpd:
+  case Intrinsic::x86_sse41_blendps:
+  case Intrinsic::x86_sse41_insertps:
+  case Intrinsic::x86_sse41_dppd:
+  case Intrinsic::x86_sse41_dpps:
+  case Intrinsic::x86_sse41_mpsadbw:
+  case Intrinsic::x86_avx_blend_pd_256:
+  case Intrinsic::x86_avx_blend_ps_256:
+  case Intrinsic::x86_avx_dp_ps_256:
+  case Intrinsic::x86_avx2_pblendw:
+  case Intrinsic::x86_avx2_pblendd_128:
+  case Intrinsic::x86_avx2_pblendd_256:
+  case Intrinsic::x86_avx2_mpsadbw: {
+    // Need to truncate the last argument from i32 to i8 -- this argument models
+    // an inherently 8-bit immediate operand to these x86 instructions.
+    SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
+                                 CI->arg_operands().end());
+
+    // Replace the last argument with a trunc.
+    Args.back() = Builder.CreateTrunc(Args.back(), Type::getInt8Ty(C), "trunc");
+
+    CallInst *NewCall = Builder.CreateCall(NewFn, Args);
+    CI->replaceAllUsesWith(NewCall);
+    CI->eraseFromParent();
+    return;
+  }
   }
 }
 
@@ -580,7 +657,9 @@
 
 void llvm::UpgradeMDStringConstant(std::string &String) {
   const std::string OldPrefix = "llvm.vectorizer.";
-  if (String.find(OldPrefix) == 0) {
-        String.replace(0, OldPrefix.size(), "llvm.loop.vectorize.");
+  if (String == "llvm.vectorizer.unroll") {
+    String = "llvm.loop.interleave.count";
+  } else if (String.find(OldPrefix) == 0) {
+    String.replace(0, OldPrefix.size(), "llvm.loop.vectorize.");
   }
 }

diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index ba07433..5ed9bed 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp

@@ -50,17 +50,24 @@
   // Make sure that we get added to a function
   LeakDetector::addGarbageObject(this);
 
-  if (InsertBefore) {
-    assert(NewParent &&
+  if (NewParent)
+    insertInto(NewParent, InsertBefore);
+  else
+    assert(!InsertBefore &&
            "Cannot insert block before another block with no function!");
-    NewParent->getBasicBlockList().insert(InsertBefore, this);
-  } else if (NewParent) {
-    NewParent->getBasicBlockList().push_back(this);
-  }
 
   setName(Name);
 }
 
+void BasicBlock::insertInto(Function *NewParent, BasicBlock *InsertBefore) {
+  assert(NewParent && "Expected a parent");
+  assert(!Parent && "Already has a parent");
+
+  if (InsertBefore)
+    NewParent->getBasicBlockList().insert(InsertBefore, this);
+  else
+    NewParent->getBasicBlockList().push_back(this);
+}
 
 BasicBlock::~BasicBlock() {
   // If the address of the block is taken and it is being deleted (e.g. because
@@ -131,6 +138,37 @@
   return dyn_cast<TerminatorInst>(&InstList.back());
 }
 
+CallInst *BasicBlock::getTerminatingMustTailCall() {
+  if (InstList.empty())
+    return nullptr;
+  ReturnInst *RI = dyn_cast<ReturnInst>(&InstList.back());
+  if (!RI || RI == &InstList.front())
+    return nullptr;
+
+  Instruction *Prev = RI->getPrevNode();
+  if (!Prev)
+    return nullptr;
+
+  if (Value *RV = RI->getReturnValue()) {
+    if (RV != Prev)
+      return nullptr;
+
+    // Look through the optional bitcast.
+    if (auto *BI = dyn_cast<BitCastInst>(Prev)) {
+      RV = BI->getOperand(0);
+      Prev = BI->getPrevNode();
+      if (!Prev || RV != Prev)
+        return nullptr;
+    }
+  }
+
+  if (auto *CI = dyn_cast<CallInst>(Prev)) {
+    if (CI->isMustTailCall())
+      return CI;
+  }
+  return nullptr;
+}
+
 Instruction* BasicBlock::getFirstNonPHI() {
   BasicBlock::iterator i = begin();
   // All valid basic blocks should have a terminator,

diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt
index 38a80b1..b3889e6 100644
--- a/lib/IR/CMakeLists.txt
+++ b/lib/IR/CMakeLists.txt

@@ -39,6 +39,7 @@
   Type.cpp
   TypeFinder.cpp
   Use.cpp
+  UseListOrder.cpp
   User.cpp
   Value.cpp
   ValueSymbolTable.cpp

diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 395ac39..cdfb41f 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp

@@ -593,8 +593,13 @@
       bool ignored;
       uint64_t x[2]; 
       uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
-      (void) V.convertToInteger(x, DestBitWidth, opc==Instruction::FPToSI,
-                                APFloat::rmTowardZero, &ignored);
+      if (APFloat::opInvalidOp ==
+          V.convertToInteger(x, DestBitWidth, opc==Instruction::FPToSI,
+                             APFloat::rmTowardZero, &ignored)) {
+        // Undefined behavior invoked - the destination type can't represent
+        // the input constant.
+        return UndefValue::get(DestTy);
+      }
       APInt Val(DestBitWidth, x);
       return ConstantInt::get(FPC->getContext(), Val);
     }
@@ -653,9 +658,13 @@
       APInt api = CI->getValue();
       APFloat apf(DestTy->getFltSemantics(),
                   APInt::getNullValue(DestTy->getPrimitiveSizeInBits()));
-      (void)apf.convertFromAPInt(api, 
-                                 opc==Instruction::SIToFP,
-                                 APFloat::rmNearestTiesToEven);
+      if (APFloat::opOverflow &
+          apf.convertFromAPInt(api, opc==Instruction::SIToFP,
+                              APFloat::rmNearestTiesToEven)) {
+        // Undefined behavior invoked - the destination type can't represent
+        // the input constant.
+        return UndefValue::get(DestTy);
+      }
       return ConstantFP::get(V->getContext(), apf);
     }
     return nullptr;
@@ -674,6 +683,9 @@
     }
     return nullptr;
   case Instruction::Trunc: {
+    if (V->getType()->isVectorTy())
+      return nullptr;
+
     uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
     if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
       return ConstantInt::get(V->getContext(),
@@ -2144,9 +2156,10 @@
 
   // If all indices are known integers and normalized, we can do a simple
   // check for the "inbounds" property.
-  if (!Unknown && !inBounds &&
-      isa<GlobalVariable>(C) && isInBoundsIndices(Idxs))
-    return ConstantExpr::getInBoundsGetElementPtr(C, Idxs);
+  if (!Unknown && !inBounds)
+    if (auto *GV = dyn_cast<GlobalVariable>(C))
+      if (!GV->hasExternalWeakLinkage() && isInBoundsIndices(Idxs))
+        return ConstantExpr::getInBoundsGetElementPtr(C, Idxs);
 
   return nullptr;
 }

diff --git a/lib/IR/ConstantFold.h b/lib/IR/ConstantFold.h
index e12f27a..a516abe 100644
--- a/lib/IR/ConstantFold.h
+++ b/lib/IR/ConstantFold.h

@@ -16,8 +16,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CONSTANTFOLDING_H
-#define CONSTANTFOLDING_H
+#ifndef LLVM_LIB_IR_CONSTANTFOLD_H
+#define LLVM_LIB_IR_CONSTANTFOLD_H
 
 #include "llvm/ADT/ArrayRef.h"
 

diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index b815936..e0cb835 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp

@@ -107,6 +107,28 @@
   return false;
 }
 
+bool Constant::isOneValue() const {
+  // Check for 1 integers
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
+    return CI->isOne();
+
+  // Check for FP which are bitcasted from 1 integers
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
+    return CFP->getValueAPF().bitcastToAPInt() == 1;
+
+  // Check for constant vectors which are splats of 1 values.
+  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+    if (Constant *Splat = CV->getSplatValue())
+      return Splat->isOneValue();
+
+  // Check for constant data vectors which are splats of 1 values.
+  if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
+    if (Constant *Splat = CV->getSplatValue())
+      return Splat->isOneValue();
+
+  return false;
+}
+
 bool Constant::isMinSignedValue() const {
   // Check for INT_MIN integers
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
@@ -129,6 +151,29 @@
   return false;
 }
 
+bool Constant::isNotMinSignedValue() const {
+  // Check that an integer constant is not INT_MIN
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
+    return !CI->isMinValue(/*isSigned=*/true);
+
+  // Check that an FP constant's bit pattern is not that of an INT_MIN integer
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
+    return !CFP->getValueAPF().bitcastToAPInt().isMinSignedValue();
+
+  // Check constant vectors that are splats of a non-INT_MIN value.
+  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+    if (Constant *Splat = CV->getSplatValue())
+      return Splat->isNotMinSignedValue();
+
+  // Check constant data vectors that are splats of a non-INT_MIN value.
+  if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
+    if (Constant *Splat = CV->getSplatValue())
+      return Splat->isNotMinSignedValue();
+
+  // It *may* contain INT_MIN, we can't tell.
+  return false;
+}
+
 // Constructor to create a '0' constant of arbitrary type...
 Constant *Constant::getNullValue(Type *Ty) {
   switch (Ty->getTypeID()) {
@@ -261,7 +306,7 @@
 }
 
 static bool canTrapImpl(const Constant *C,
-                        SmallPtrSet<const ConstantExpr *, 4> &NonTrappingOps) {
+                        SmallPtrSetImpl<const ConstantExpr *> &NonTrappingOps) {
   assert(C->getType()->isFirstClassType() && "Cannot evaluate aggregate vals!");
   // The only thing that could possibly trap are constant exprs.
   const ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
@@ -271,7 +316,7 @@
   // ConstantExpr traps if any operands can trap.
   for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) {
     if (ConstantExpr *Op = dyn_cast<ConstantExpr>(CE->getOperand(i))) {
-      if (NonTrappingOps.insert(Op) && canTrapImpl(Op, NonTrappingOps))
+      if (NonTrappingOps.insert(Op).second && canTrapImpl(Op, NonTrappingOps))
         return true;
     }
   }
@@ -318,7 +363,7 @@
       const Constant *ConstOp = dyn_cast<Constant>(Op);
       if (!ConstOp)
         continue;
-      if (Visited.insert(ConstOp))
+      if (Visited.insert(ConstOp).second)
         WorkList.push_back(ConstOp);
     }
   }
@@ -781,6 +826,11 @@
 }
 
 Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) {
+  if (Constant *C = getImpl(Ty, V))
+    return C;
+  return Ty->getContext().pImpl->ArrayConstants.getOrCreate(Ty, V);
+}
+Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) {
   // Empty arrays are canonicalized to ConstantAggregateZero.
   if (V.empty())
     return ConstantAggregateZero::get(Ty);
@@ -789,7 +839,6 @@
     assert(V[i]->getType() == Ty->getElementType() &&
            "Wrong type in array element initializer");
   }
-  LLVMContextImpl *pImpl = Ty->getContext().pImpl;
 
   // If this is an all-zero array, return a ConstantAggregateZero object.  If
   // all undef, return an UndefValue, if "all simple", then return a
@@ -871,7 +920,7 @@
   }
 
   // Otherwise, we really do want to create a ConstantArray.
-  return pImpl->ArrayConstants.getOrCreate(Ty, V);
+  return nullptr;
 }
 
 /// getTypeForElements - Return an anonymous struct type to use for a constant
@@ -959,9 +1008,14 @@
 
 // ConstantVector accessors.
 Constant *ConstantVector::get(ArrayRef<Constant*> V) {
+  if (Constant *C = getImpl(V))
+    return C;
+  VectorType *Ty = VectorType::get(V.front()->getType(), V.size());
+  return Ty->getContext().pImpl->VectorConstants.getOrCreate(Ty, V);
+}
+Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
   assert(!V.empty() && "Vectors can't be empty");
   VectorType *T = VectorType::get(V.front()->getType(), V.size());
-  LLVMContextImpl *pImpl = T->getContext().pImpl;
 
   // If this is an all-undef or all-zero vector, return a
   // ConstantAggregateZero or UndefValue.
@@ -1053,7 +1107,7 @@
 
   // Otherwise, the element type isn't compatible with ConstantDataVector, or
   // the operand list constants a ConstantExpr or something else strange.
-  return pImpl->VectorConstants.getOrCreate(T, V);
+  return nullptr;
 }
 
 Constant *ConstantVector::getSplat(unsigned NumElts, Constant *V) {
@@ -1141,8 +1195,8 @@
 /// getWithOperands - This returns the current constant expression with the
 /// operands replaced with the specified values.  The specified array must
 /// have the same number of operands as our current one.
-Constant *ConstantExpr::
-getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const {
+Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty,
+                                        bool OnlyIfReduced) const {
   assert(Ops.size() == getNumOperands() && "Operand count mismatch!");
   bool AnyChange = Ty != getType();
   for (unsigned i = 0; i != Ops.size(); ++i)
@@ -1151,6 +1205,7 @@
   if (!AnyChange)  // No operands changed, return self.
     return const_cast<ConstantExpr*>(this);
 
+  Type *OnlyIfReducedTy = OnlyIfReduced ? Ty : nullptr;
   switch (getOpcode()) {
   case Instruction::Trunc:
   case Instruction::ZExt:
@@ -1165,28 +1220,34 @@
   case Instruction::IntToPtr:
   case Instruction::BitCast:
   case Instruction::AddrSpaceCast:
-    return ConstantExpr::getCast(getOpcode(), Ops[0], Ty);
+    return ConstantExpr::getCast(getOpcode(), Ops[0], Ty, OnlyIfReduced);
   case Instruction::Select:
-    return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
+    return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2], OnlyIfReducedTy);
   case Instruction::InsertElement:
-    return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]);
+    return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2],
+                                          OnlyIfReducedTy);
   case Instruction::ExtractElement:
-    return ConstantExpr::getExtractElement(Ops[0], Ops[1]);
+    return ConstantExpr::getExtractElement(Ops[0], Ops[1], OnlyIfReducedTy);
   case Instruction::InsertValue:
-    return ConstantExpr::getInsertValue(Ops[0], Ops[1], getIndices());
+    return ConstantExpr::getInsertValue(Ops[0], Ops[1], getIndices(),
+                                        OnlyIfReducedTy);
   case Instruction::ExtractValue:
-    return ConstantExpr::getExtractValue(Ops[0], getIndices());
+    return ConstantExpr::getExtractValue(Ops[0], getIndices(), OnlyIfReducedTy);
   case Instruction::ShuffleVector:
-    return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
+    return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2],
+                                          OnlyIfReducedTy);
   case Instruction::GetElementPtr:
     return ConstantExpr::getGetElementPtr(Ops[0], Ops.slice(1),
-                                      cast<GEPOperator>(this)->isInBounds());
+                                          cast<GEPOperator>(this)->isInBounds(),
+                                          OnlyIfReducedTy);
   case Instruction::ICmp:
   case Instruction::FCmp:
-    return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1]);
+    return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1],
+                                    OnlyIfReducedTy);
   default:
     assert(getNumOperands() == 2 && "Must be binary operator?");
-    return ConstantExpr::get(getOpcode(), Ops[0], Ops[1], SubclassOptionalData);
+    return ConstantExpr::get(getOpcode(), Ops[0], Ops[1], SubclassOptionalData,
+                             OnlyIfReducedTy);
   }
 }
 
@@ -1447,27 +1508,21 @@
   // and return early.
   BlockAddress *&NewBA =
     getContext().pImpl->BlockAddresses[std::make_pair(NewF, NewBB)];
-  if (!NewBA) {
-    getBasicBlock()->AdjustBlockAddressRefCount(-1);
-
-    // Remove the old entry, this can't cause the map to rehash (just a
-    // tombstone will get added).
-    getContext().pImpl->BlockAddresses.erase(std::make_pair(getFunction(),
-                                                            getBasicBlock()));
-    NewBA = this;
-    setOperand(0, NewF);
-    setOperand(1, NewBB);
-    getBasicBlock()->AdjustBlockAddressRefCount(1);
+  if (NewBA) {
+    replaceUsesOfWithOnConstantImpl(NewBA);
     return;
   }
 
-  // Otherwise, I do need to replace this with an existing value.
-  assert(NewBA != this && "I didn't contain From!");
+  getBasicBlock()->AdjustBlockAddressRefCount(-1);
 
-  // Everyone using this now uses the replacement.
-  replaceAllUsesWith(NewBA);
-
-  destroyConstant();
+  // Remove the old entry, this can't cause the map to rehash (just a
+  // tombstone will get added).
+  getContext().pImpl->BlockAddresses.erase(std::make_pair(getFunction(),
+                                                          getBasicBlock()));
+  NewBA = this;
+  setOperand(0, NewF);
+  setOperand(1, NewBB);
+  getBasicBlock()->AdjustBlockAddressRefCount(1);
 }
 
 //---- ConstantExpr::get() implementations.
@@ -1475,22 +1530,26 @@
 
 /// This is a utility function to handle folding of casts and lookup of the
 /// cast in the ExprConstants map. It is used by the various get* methods below.
-static inline Constant *getFoldedCast(
-  Instruction::CastOps opc, Constant *C, Type *Ty) {
+static Constant *getFoldedCast(Instruction::CastOps opc, Constant *C, Type *Ty,
+                               bool OnlyIfReduced = false) {
   assert(Ty->isFirstClassType() && "Cannot cast to an aggregate type!");
   // Fold a few common cases
   if (Constant *FC = ConstantFoldCastInstruction(opc, C, Ty))
     return FC;
 
+  if (OnlyIfReduced)
+    return nullptr;
+
   LLVMContextImpl *pImpl = Ty->getContext().pImpl;
 
   // Look up the constant in the table first to ensure uniqueness.
-  ExprMapKeyType Key(opc, C);
+  ConstantExprKeyType Key(opc, C);
 
   return pImpl->ExprConstants.getOrCreate(Ty, Key);
 }
 
-Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) {
+Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty,
+                                bool OnlyIfReduced) {
   Instruction::CastOps opc = Instruction::CastOps(oc);
   assert(Instruction::isCast(opc) && "opcode out of range");
   assert(C && Ty && "Null arguments to getCast");
@@ -1499,19 +1558,32 @@
   switch (opc) {
   default:
     llvm_unreachable("Invalid cast opcode");
-  case Instruction::Trunc:    return getTrunc(C, Ty);
-  case Instruction::ZExt:     return getZExt(C, Ty);
-  case Instruction::SExt:     return getSExt(C, Ty);
-  case Instruction::FPTrunc:  return getFPTrunc(C, Ty);
-  case Instruction::FPExt:    return getFPExtend(C, Ty);
-  case Instruction::UIToFP:   return getUIToFP(C, Ty);
-  case Instruction::SIToFP:   return getSIToFP(C, Ty);
-  case Instruction::FPToUI:   return getFPToUI(C, Ty);
-  case Instruction::FPToSI:   return getFPToSI(C, Ty);
-  case Instruction::PtrToInt: return getPtrToInt(C, Ty);
-  case Instruction::IntToPtr: return getIntToPtr(C, Ty);
-  case Instruction::BitCast:  return getBitCast(C, Ty);
-  case Instruction::AddrSpaceCast:  return getAddrSpaceCast(C, Ty);
+  case Instruction::Trunc:
+    return getTrunc(C, Ty, OnlyIfReduced);
+  case Instruction::ZExt:
+    return getZExt(C, Ty, OnlyIfReduced);
+  case Instruction::SExt:
+    return getSExt(C, Ty, OnlyIfReduced);
+  case Instruction::FPTrunc:
+    return getFPTrunc(C, Ty, OnlyIfReduced);
+  case Instruction::FPExt:
+    return getFPExtend(C, Ty, OnlyIfReduced);
+  case Instruction::UIToFP:
+    return getUIToFP(C, Ty, OnlyIfReduced);
+  case Instruction::SIToFP:
+    return getSIToFP(C, Ty, OnlyIfReduced);
+  case Instruction::FPToUI:
+    return getFPToUI(C, Ty, OnlyIfReduced);
+  case Instruction::FPToSI:
+    return getFPToSI(C, Ty, OnlyIfReduced);
+  case Instruction::PtrToInt:
+    return getPtrToInt(C, Ty, OnlyIfReduced);
+  case Instruction::IntToPtr:
+    return getIntToPtr(C, Ty, OnlyIfReduced);
+  case Instruction::BitCast:
+    return getBitCast(C, Ty, OnlyIfReduced);
+  case Instruction::AddrSpaceCast:
+    return getAddrSpaceCast(C, Ty, OnlyIfReduced);
   }
 }
 
@@ -1584,7 +1656,7 @@
   return getCast(opcode, C, Ty);
 }
 
-Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty) {
+Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) {
 #ifndef NDEBUG
   bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
   bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1595,10 +1667,10 @@
   assert(C->getType()->getScalarSizeInBits() > Ty->getScalarSizeInBits()&&
          "SrcTy must be larger than DestTy for Trunc!");
 
-  return getFoldedCast(Instruction::Trunc, C, Ty);
+  return getFoldedCast(Instruction::Trunc, C, Ty, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getSExt(Constant *C, Type *Ty) {
+Constant *ConstantExpr::getSExt(Constant *C, Type *Ty, bool OnlyIfReduced) {
 #ifndef NDEBUG
   bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
   bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1609,10 +1681,10 @@
   assert(C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&&
          "SrcTy must be smaller than DestTy for SExt!");
 
-  return getFoldedCast(Instruction::SExt, C, Ty);
+  return getFoldedCast(Instruction::SExt, C, Ty, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getZExt(Constant *C, Type *Ty) {
+Constant *ConstantExpr::getZExt(Constant *C, Type *Ty, bool OnlyIfReduced) {
 #ifndef NDEBUG
   bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
   bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1623,10 +1695,10 @@
   assert(C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&&
          "SrcTy must be smaller than DestTy for ZExt!");
 
-  return getFoldedCast(Instruction::ZExt, C, Ty);
+  return getFoldedCast(Instruction::ZExt, C, Ty, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getFPTrunc(Constant *C, Type *Ty) {
+Constant *ConstantExpr::getFPTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) {
 #ifndef NDEBUG
   bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
   bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1635,10 +1707,10 @@
   assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
          C->getType()->getScalarSizeInBits() > Ty->getScalarSizeInBits()&&
          "This is an illegal floating point truncation!");
-  return getFoldedCast(Instruction::FPTrunc, C, Ty);
+  return getFoldedCast(Instruction::FPTrunc, C, Ty, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getFPExtend(Constant *C, Type *Ty) {
+Constant *ConstantExpr::getFPExtend(Constant *C, Type *Ty, bool OnlyIfReduced) {
 #ifndef NDEBUG
   bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
   bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1647,10 +1719,10 @@
   assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
          C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&&
          "This is an illegal floating point extension!");
-  return getFoldedCast(Instruction::FPExt, C, Ty);
+  return getFoldedCast(Instruction::FPExt, C, Ty, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getUIToFP(Constant *C, Type *Ty) {
+Constant *ConstantExpr::getUIToFP(Constant *C, Type *Ty, bool OnlyIfReduced) {
 #ifndef NDEBUG
   bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
   bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1658,10 +1730,10 @@
   assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
   assert(C->getType()->isIntOrIntVectorTy() && Ty->isFPOrFPVectorTy() &&
          "This is an illegal uint to floating point cast!");
-  return getFoldedCast(Instruction::UIToFP, C, Ty);
+  return getFoldedCast(Instruction::UIToFP, C, Ty, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getSIToFP(Constant *C, Type *Ty) {
+Constant *ConstantExpr::getSIToFP(Constant *C, Type *Ty, bool OnlyIfReduced) {
 #ifndef NDEBUG
   bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
   bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1669,10 +1741,10 @@
   assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
   assert(C->getType()->isIntOrIntVectorTy() && Ty->isFPOrFPVectorTy() &&
          "This is an illegal sint to floating point cast!");
-  return getFoldedCast(Instruction::SIToFP, C, Ty);
+  return getFoldedCast(Instruction::SIToFP, C, Ty, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getFPToUI(Constant *C, Type *Ty) {
+Constant *ConstantExpr::getFPToUI(Constant *C, Type *Ty, bool OnlyIfReduced) {
 #ifndef NDEBUG
   bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
   bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1680,10 +1752,10 @@
   assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
   assert(C->getType()->isFPOrFPVectorTy() && Ty->isIntOrIntVectorTy() &&
          "This is an illegal floating point to uint cast!");
-  return getFoldedCast(Instruction::FPToUI, C, Ty);
+  return getFoldedCast(Instruction::FPToUI, C, Ty, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getFPToSI(Constant *C, Type *Ty) {
+Constant *ConstantExpr::getFPToSI(Constant *C, Type *Ty, bool OnlyIfReduced) {
 #ifndef NDEBUG
   bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
   bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1691,10 +1763,11 @@
   assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
   assert(C->getType()->isFPOrFPVectorTy() && Ty->isIntOrIntVectorTy() &&
          "This is an illegal floating point to sint cast!");
-  return getFoldedCast(Instruction::FPToSI, C, Ty);
+  return getFoldedCast(Instruction::FPToSI, C, Ty, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy) {
+Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy,
+                                    bool OnlyIfReduced) {
   assert(C->getType()->getScalarType()->isPointerTy() &&
          "PtrToInt source must be pointer or pointer vector");
   assert(DstTy->getScalarType()->isIntegerTy() && 
@@ -1703,10 +1776,11 @@
   if (isa<VectorType>(C->getType()))
     assert(C->getType()->getVectorNumElements()==DstTy->getVectorNumElements()&&
            "Invalid cast between a different number of vector elements");
-  return getFoldedCast(Instruction::PtrToInt, C, DstTy);
+  return getFoldedCast(Instruction::PtrToInt, C, DstTy, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy) {
+Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy,
+                                    bool OnlyIfReduced) {
   assert(C->getType()->getScalarType()->isIntegerTy() &&
          "IntToPtr source must be integer or integer vector");
   assert(DstTy->getScalarType()->isPointerTy() &&
@@ -1715,10 +1789,11 @@
   if (isa<VectorType>(C->getType()))
     assert(C->getType()->getVectorNumElements()==DstTy->getVectorNumElements()&&
            "Invalid cast between a different number of vector elements");
-  return getFoldedCast(Instruction::IntToPtr, C, DstTy);
+  return getFoldedCast(Instruction::IntToPtr, C, DstTy, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy) {
+Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy,
+                                   bool OnlyIfReduced) {
   assert(CastInst::castIsValid(Instruction::BitCast, C, DstTy) &&
          "Invalid constantexpr bitcast!");
 
@@ -1726,10 +1801,11 @@
   // speedily.
   if (C->getType() == DstTy) return C;
 
-  return getFoldedCast(Instruction::BitCast, C, DstTy);
+  return getFoldedCast(Instruction::BitCast, C, DstTy, OnlyIfReduced);
 }
 
-Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy) {
+Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy,
+                                         bool OnlyIfReduced) {
   assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) &&
          "Invalid constantexpr addrspacecast!");
 
@@ -1746,11 +1822,11 @@
     }
     C = getBitCast(C, MidTy);
   }
-  return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy);
+  return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced);
 }
 
 Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
-                            unsigned Flags) {
+                            unsigned Flags, Type *OnlyIfReducedTy) {
   // Check the operands for consistency first.
   assert(Opcode >= Instruction::BinaryOpsBegin &&
          Opcode <  Instruction::BinaryOpsEnd   &&
@@ -1819,8 +1895,11 @@
   if (Constant *FC = ConstantFoldBinaryInstruction(Opcode, C1, C2))
     return FC;          // Fold a few common cases.
 
+  if (OnlyIfReducedTy == C1->getType())
+    return nullptr;
+
   Constant *ArgVec[] = { C1, C2 };
-  ExprMapKeyType Key(Opcode, ArgVec, 0, Flags);
+  ConstantExprKeyType Key(Opcode, ArgVec, 0, Flags);
 
   LLVMContextImpl *pImpl = C1->getContext().pImpl;
   return pImpl->ExprConstants.getOrCreate(C1->getType(), Key);
@@ -1840,7 +1919,7 @@
   // alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1
   // Note that a non-inbounds gep is used, as null isn't within any object.
   Type *AligningTy = 
-    StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, NULL);
+    StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, nullptr);
   Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0));
   Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0);
   Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
@@ -1868,8 +1947,8 @@
                      Type::getInt64Ty(Ty->getContext()));
 }
 
-Constant *ConstantExpr::getCompare(unsigned short Predicate, 
-                                   Constant *C1, Constant *C2) {
+Constant *ConstantExpr::getCompare(unsigned short Predicate, Constant *C1,
+                                   Constant *C2, bool OnlyIfReduced) {
   assert(C1->getType() == C2->getType() && "Op types should be identical!");
 
   switch (Predicate) {
@@ -1880,31 +1959,35 @@
   case CmpInst::FCMP_UEQ:   case CmpInst::FCMP_UGT: case CmpInst::FCMP_UGE:
   case CmpInst::FCMP_ULT:   case CmpInst::FCMP_ULE: case CmpInst::FCMP_UNE:
   case CmpInst::FCMP_TRUE:
-    return getFCmp(Predicate, C1, C2);
+    return getFCmp(Predicate, C1, C2, OnlyIfReduced);
 
   case CmpInst::ICMP_EQ:  case CmpInst::ICMP_NE:  case CmpInst::ICMP_UGT:
   case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE:
   case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: case CmpInst::ICMP_SLT:
   case CmpInst::ICMP_SLE:
-    return getICmp(Predicate, C1, C2);
+    return getICmp(Predicate, C1, C2, OnlyIfReduced);
   }
 }
 
-Constant *ConstantExpr::getSelect(Constant *C, Constant *V1, Constant *V2) {
+Constant *ConstantExpr::getSelect(Constant *C, Constant *V1, Constant *V2,
+                                  Type *OnlyIfReducedTy) {
   assert(!SelectInst::areInvalidOperands(C, V1, V2)&&"Invalid select operands");
 
   if (Constant *SC = ConstantFoldSelectInstruction(C, V1, V2))
     return SC;        // Fold common cases
 
+  if (OnlyIfReducedTy == V1->getType())
+    return nullptr;
+
   Constant *ArgVec[] = { C, V1, V2 };
-  ExprMapKeyType Key(Instruction::Select, ArgVec);
+  ConstantExprKeyType Key(Instruction::Select, ArgVec);
 
   LLVMContextImpl *pImpl = C->getContext().pImpl;
   return pImpl->ExprConstants.getOrCreate(V1->getType(), Key);
 }
 
 Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef<Value *> Idxs,
-                                         bool InBounds) {
+                                         bool InBounds, Type *OnlyIfReducedTy) {
   assert(C->getType()->isPtrOrPtrVectorTy() &&
          "Non-pointer type for constant GetElementPtr expression");
 
@@ -1919,6 +2002,9 @@
   if (VectorType *VecTy = dyn_cast<VectorType>(C->getType()))
     ReqTy = VectorType::get(ReqTy, VecTy->getNumElements());
 
+  if (OnlyIfReducedTy == ReqTy)
+    return nullptr;
+
   // Look up the constant in the table first to ensure uniqueness
   std::vector<Constant*> ArgVec;
   ArgVec.reserve(1 + Idxs.size());
@@ -1932,15 +2018,15 @@
            "getelementptr index type missmatch");
     ArgVec.push_back(cast<Constant>(Idxs[i]));
   }
-  const ExprMapKeyType Key(Instruction::GetElementPtr, ArgVec, 0,
-                           InBounds ? GEPOperator::IsInBounds : 0);
+  const ConstantExprKeyType Key(Instruction::GetElementPtr, ArgVec, 0,
+                                InBounds ? GEPOperator::IsInBounds : 0);
 
   LLVMContextImpl *pImpl = C->getContext().pImpl;
   return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
 }
 
-Constant *
-ConstantExpr::getICmp(unsigned short pred, Constant *LHS, Constant *RHS) {
+Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS,
+                                Constant *RHS, bool OnlyIfReduced) {
   assert(LHS->getType() == RHS->getType());
   assert(pred >= ICmpInst::FIRST_ICMP_PREDICATE && 
          pred <= ICmpInst::LAST_ICMP_PREDICATE && "Invalid ICmp Predicate");
@@ -1948,10 +2034,13 @@
   if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
     return FC;          // Fold a few common cases...
 
+  if (OnlyIfReduced)
+    return nullptr;
+
   // Look up the constant in the table first to ensure uniqueness
   Constant *ArgVec[] = { LHS, RHS };
   // Get the key type with both the opcode and predicate
-  const ExprMapKeyType Key(Instruction::ICmp, ArgVec, pred);
+  const ConstantExprKeyType Key(Instruction::ICmp, ArgVec, pred);
 
   Type *ResultTy = Type::getInt1Ty(LHS->getContext());
   if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
@@ -1961,18 +2050,21 @@
   return pImpl->ExprConstants.getOrCreate(ResultTy, Key);
 }
 
-Constant *
-ConstantExpr::getFCmp(unsigned short pred, Constant *LHS, Constant *RHS) {
+Constant *ConstantExpr::getFCmp(unsigned short pred, Constant *LHS,
+                                Constant *RHS, bool OnlyIfReduced) {
   assert(LHS->getType() == RHS->getType());
   assert(pred <= FCmpInst::LAST_FCMP_PREDICATE && "Invalid FCmp Predicate");
 
   if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
     return FC;          // Fold a few common cases...
 
+  if (OnlyIfReduced)
+    return nullptr;
+
   // Look up the constant in the table first to ensure uniqueness
   Constant *ArgVec[] = { LHS, RHS };
   // Get the key type with both the opcode and predicate
-  const ExprMapKeyType Key(Instruction::FCmp, ArgVec, pred);
+  const ConstantExprKeyType Key(Instruction::FCmp, ArgVec, pred);
 
   Type *ResultTy = Type::getInt1Ty(LHS->getContext());
   if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
@@ -1982,7 +2074,8 @@
   return pImpl->ExprConstants.getOrCreate(ResultTy, Key);
 }
 
-Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx) {
+Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx,
+                                          Type *OnlyIfReducedTy) {
   assert(Val->getType()->isVectorTy() &&
          "Tried to create extractelement operation on non-vector type!");
   assert(Idx->getType()->isIntegerTy() &&
@@ -1991,17 +2084,20 @@
   if (Constant *FC = ConstantFoldExtractElementInstruction(Val, Idx))
     return FC;          // Fold a few common cases.
 
+  Type *ReqTy = Val->getType()->getVectorElementType();
+  if (OnlyIfReducedTy == ReqTy)
+    return nullptr;
+
   // Look up the constant in the table first to ensure uniqueness
   Constant *ArgVec[] = { Val, Idx };
-  const ExprMapKeyType Key(Instruction::ExtractElement, ArgVec);
+  const ConstantExprKeyType Key(Instruction::ExtractElement, ArgVec);
 
   LLVMContextImpl *pImpl = Val->getContext().pImpl;
-  Type *ReqTy = Val->getType()->getVectorElementType();
   return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
 }
 
-Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt, 
-                                         Constant *Idx) {
+Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt,
+                                         Constant *Idx, Type *OnlyIfReducedTy) {
   assert(Val->getType()->isVectorTy() &&
          "Tried to create insertelement operation on non-vector type!");
   assert(Elt->getType() == Val->getType()->getVectorElementType() &&
@@ -2011,16 +2107,20 @@
 
   if (Constant *FC = ConstantFoldInsertElementInstruction(Val, Elt, Idx))
     return FC;          // Fold a few common cases.
+
+  if (OnlyIfReducedTy == Val->getType())
+    return nullptr;
+
   // Look up the constant in the table first to ensure uniqueness
   Constant *ArgVec[] = { Val, Elt, Idx };
-  const ExprMapKeyType Key(Instruction::InsertElement, ArgVec);
+  const ConstantExprKeyType Key(Instruction::InsertElement, ArgVec);
 
   LLVMContextImpl *pImpl = Val->getContext().pImpl;
   return pImpl->ExprConstants.getOrCreate(Val->getType(), Key);
 }
 
-Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2, 
-                                         Constant *Mask) {
+Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2,
+                                         Constant *Mask, Type *OnlyIfReducedTy) {
   assert(ShuffleVectorInst::isValidOperands(V1, V2, Mask) &&
          "Invalid shuffle vector constant expr operands!");
 
@@ -2031,16 +2131,20 @@
   Type *EltTy = V1->getType()->getVectorElementType();
   Type *ShufTy = VectorType::get(EltTy, NElts);
 
+  if (OnlyIfReducedTy == ShufTy)
+    return nullptr;
+
   // Look up the constant in the table first to ensure uniqueness
   Constant *ArgVec[] = { V1, V2, Mask };
-  const ExprMapKeyType Key(Instruction::ShuffleVector, ArgVec);
+  const ConstantExprKeyType Key(Instruction::ShuffleVector, ArgVec);
 
   LLVMContextImpl *pImpl = ShufTy->getContext().pImpl;
   return pImpl->ExprConstants.getOrCreate(ShufTy, Key);
 }
 
 Constant *ConstantExpr::getInsertValue(Constant *Agg, Constant *Val,
-                                       ArrayRef<unsigned> Idxs) {
+                                       ArrayRef<unsigned> Idxs,
+                                       Type *OnlyIfReducedTy) {
   assert(Agg->getType()->isFirstClassType() &&
          "Non-first-class type for constant insertvalue expression");
 
@@ -2052,15 +2156,18 @@
   if (Constant *FC = ConstantFoldInsertValueInstruction(Agg, Val, Idxs))
     return FC;
 
+  if (OnlyIfReducedTy == ReqTy)
+    return nullptr;
+
   Constant *ArgVec[] = { Agg, Val };
-  const ExprMapKeyType Key(Instruction::InsertValue, ArgVec, 0, 0, Idxs);
+  const ConstantExprKeyType Key(Instruction::InsertValue, ArgVec, 0, 0, Idxs);
 
   LLVMContextImpl *pImpl = Agg->getContext().pImpl;
   return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
 }
 
-Constant *ConstantExpr::getExtractValue(Constant *Agg,
-                                        ArrayRef<unsigned> Idxs) {
+Constant *ConstantExpr::getExtractValue(Constant *Agg, ArrayRef<unsigned> Idxs,
+                                        Type *OnlyIfReducedTy) {
   assert(Agg->getType()->isFirstClassType() &&
          "Tried to create extractelement operation on non-first-class type!");
 
@@ -2073,8 +2180,11 @@
   if (Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs))
     return FC;
 
+  if (OnlyIfReducedTy == ReqTy)
+    return nullptr;
+
   Constant *ArgVec[] = { Agg };
-  const ExprMapKeyType Key(Instruction::ExtractValue, ArgVec, 0, 0, Idxs);
+  const ConstantExprKeyType Key(Instruction::ExtractValue, ArgVec, 0, 0, Idxs);
 
   LLVMContextImpl *pImpl = Agg->getContext().pImpl;
   return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
@@ -2326,14 +2436,16 @@
     return ConstantAggregateZero::get(Ty);
 
   // Do a lookup to see if we have already formed one of these.
-  StringMap<ConstantDataSequential*>::MapEntryTy &Slot =
-    Ty->getContext().pImpl->CDSConstants.GetOrCreateValue(Elements);
+  auto &Slot =
+      *Ty->getContext()
+           .pImpl->CDSConstants.insert(std::make_pair(Elements, nullptr))
+           .first;
 
   // The bucket can point to a linked list of different CDS's that have the same
   // body but different types.  For example, 0,0,0,1 could be a 4 element array
   // of i8, or a 1-element array of i32.  They'll both end up in the same
   /// StringMap bucket, linked up by their Next pointers.  Walk the list.
-  ConstantDataSequential **Entry = &Slot.getValue();
+  ConstantDataSequential **Entry = &Slot.second;
   for (ConstantDataSequential *Node = *Entry; Node;
        Entry = &Node->Next, Node = *Entry)
     if (Node->getType() == Ty)
@@ -2342,10 +2454,10 @@
   // Okay, we didn't get a hit.  Create a node of the right class, link it in,
   // and return it.
   if (isa<ArrayType>(Ty))
-    return *Entry = new ConstantDataArray(Ty, Slot.getKeyData());
+    return *Entry = new ConstantDataArray(Ty, Slot.first().data());
 
   assert(isa<VectorType>(Ty));
-  return *Entry = new ConstantDataVector(Ty, Slot.getKeyData());
+  return *Entry = new ConstantDataVector(Ty, Slot.first().data());
 }
 
 void ConstantDataSequential::destroyConstant() {
@@ -2431,7 +2543,7 @@
                                        StringRef Str, bool AddNull) {
   if (!AddNull) {
     const uint8_t *Data = reinterpret_cast<const uint8_t *>(Str.data());
-    return get(Context, ArrayRef<uint8_t>(const_cast<uint8_t *>(Data),
+    return get(Context, makeArrayRef(const_cast<uint8_t *>(Data),
                Str.size()));
   }
 
@@ -2602,7 +2714,7 @@
 }
 
 /// getSplatValue - If this is a splat constant, meaning that all of the
-/// elements have the same value, return that value. Otherwise return NULL.
+/// elements have the same value, return that value. Otherwise return nullptr.
 Constant *ConstantDataVector::getSplatValue() const {
   const char *Base = getRawDataValues().data();
 
@@ -2630,16 +2742,23 @@
 /// work, but would be really slow because it would have to unique each updated
 /// array instance.
 ///
+void Constant::replaceUsesOfWithOnConstantImpl(Constant *Replacement) {
+  // I do need to replace this with an existing value.
+  assert(Replacement != this && "I didn't contain From!");
+
+  // Everyone using this now uses the replacement.
+  replaceAllUsesWith(Replacement);
+
+  // Delete the old constant!
+  destroyConstant();
+}
+
 void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To,
                                                 Use *U) {
   assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
   Constant *ToC = cast<Constant>(To);
 
-  LLVMContextImpl *pImpl = getType()->getContext().pImpl;
-
   SmallVector<Constant*, 8> Values;
-  LLVMContextImpl::ArrayConstantsTy::LookupKey Lookup;
-  Lookup.first = cast<ArrayType>(getType());
   Values.reserve(getNumOperands());  // Build replacement array.
 
   // Fill values with the modified operands of the constant array.  Also,
@@ -2658,51 +2777,25 @@
     AllSame &= Val == ToC;
   }
 
-  Constant *Replacement = nullptr;
   if (AllSame && ToC->isNullValue()) {
-    Replacement = ConstantAggregateZero::get(getType());
-  } else if (AllSame && isa<UndefValue>(ToC)) {
-    Replacement = UndefValue::get(getType());
-  } else {
-    // Check to see if we have this array type already.
-    Lookup.second = makeArrayRef(Values);
-    LLVMContextImpl::ArrayConstantsTy::MapTy::iterator I =
-      pImpl->ArrayConstants.find(Lookup);
-
-    if (I != pImpl->ArrayConstants.map_end()) {
-      Replacement = I->first;
-    } else {
-      // Okay, the new shape doesn't exist in the system yet.  Instead of
-      // creating a new constant array, inserting it, replaceallusesof'ing the
-      // old with the new, then deleting the old... just update the current one
-      // in place!
-      pImpl->ArrayConstants.remove(this);
-
-      // Update to the new value.  Optimize for the case when we have a single
-      // operand that we're changing, but handle bulk updates efficiently.
-      if (NumUpdated == 1) {
-        unsigned OperandToUpdate = U - OperandList;
-        assert(getOperand(OperandToUpdate) == From &&
-               "ReplaceAllUsesWith broken!");
-        setOperand(OperandToUpdate, ToC);
-      } else {
-        for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
-          if (getOperand(i) == From)
-            setOperand(i, ToC);
-      }
-      pImpl->ArrayConstants.insert(this);
-      return;
-    }
+    replaceUsesOfWithOnConstantImpl(ConstantAggregateZero::get(getType()));
+    return;
+  }
+  if (AllSame && isa<UndefValue>(ToC)) {
+    replaceUsesOfWithOnConstantImpl(UndefValue::get(getType()));
+    return;
   }
 
-  // Otherwise, I do need to replace this with an existing value.
-  assert(Replacement != this && "I didn't contain From!");
+  // Check for any other type of constant-folding.
+  if (Constant *C = getImpl(getType(), Values)) {
+    replaceUsesOfWithOnConstantImpl(C);
+    return;
+  }
 
-  // Everyone using this now uses the replacement.
-  replaceAllUsesWith(Replacement);
-
-  // Delete the old constant!
-  destroyConstant();
+  // Otherwise update operands in place; a non-null result replaces this.
+  if (Constant *C = getContext().pImpl->ArrayConstants.replaceOperandsInPlace(
+          Values, this, From, ToC, NumUpdated, U - OperandList))
+    replaceUsesOfWithOnConstantImpl(C);
 }
 
 void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To,
@@ -2714,8 +2807,6 @@
   assert(getOperand(OperandToUpdate) == From && "ReplaceAllUsesWith broken!");
 
   SmallVector<Constant*, 8> Values;
-  LLVMContextImpl::StructConstantsTy::LookupKey Lookup;
-  Lookup.first = cast<StructType>(getType());
   Values.reserve(getNumOperands());  // Build replacement struct.
 
   // Fill values with the modified operands of the constant struct.  Also,
@@ -2742,64 +2833,47 @@
   }
   Values[OperandToUpdate] = ToC;
 
-  LLVMContextImpl *pImpl = getContext().pImpl;
-
-  Constant *Replacement = nullptr;
   if (isAllZeros) {
-    Replacement = ConstantAggregateZero::get(getType());
-  } else if (isAllUndef) {
-    Replacement = UndefValue::get(getType());
-  } else {
-    // Check to see if we have this struct type already.
-    Lookup.second = makeArrayRef(Values);
-    LLVMContextImpl::StructConstantsTy::MapTy::iterator I =
-      pImpl->StructConstants.find(Lookup);
-
-    if (I != pImpl->StructConstants.map_end()) {
-      Replacement = I->first;
-    } else {
-      // Okay, the new shape doesn't exist in the system yet.  Instead of
-      // creating a new constant struct, inserting it, replaceallusesof'ing the
-      // old with the new, then deleting the old... just update the current one
-      // in place!
-      pImpl->StructConstants.remove(this);
-
-      // Update to the new value.
-      setOperand(OperandToUpdate, ToC);
-      pImpl->StructConstants.insert(this);
-      return;
-    }
+    replaceUsesOfWithOnConstantImpl(ConstantAggregateZero::get(getType()));
+    return;
+  }
+  if (isAllUndef) {
+    replaceUsesOfWithOnConstantImpl(UndefValue::get(getType()));
+    return;
   }
 
-  assert(Replacement != this && "I didn't contain From!");
-
-  // Everyone using this now uses the replacement.
-  replaceAllUsesWith(Replacement);
-
-  // Delete the old constant!
-  destroyConstant();
+  // Otherwise update operands in place; a non-null result replaces this.
+  if (Constant *C = getContext().pImpl->StructConstants.replaceOperandsInPlace(
+          Values, this, From, ToC))
+    replaceUsesOfWithOnConstantImpl(C);
 }
 
 void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To,
                                                  Use *U) {
   assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
+  Constant *ToC = cast<Constant>(To);
 
   SmallVector<Constant*, 8> Values;
   Values.reserve(getNumOperands());  // Build replacement array...
+  unsigned NumUpdated = 0; // Number of operands that matched From.
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     Constant *Val = getOperand(i);
-    if (Val == From) Val = cast<Constant>(To);
+    if (Val == From) {
+      ++NumUpdated;
+      Val = ToC;
+    }
     Values.push_back(Val);
   }
 
-  Constant *Replacement = get(Values);
-  assert(Replacement != this && "I didn't contain From!");
+  if (Constant *C = getImpl(Values)) {
+    replaceUsesOfWithOnConstantImpl(C);
+    return;
+  }
 
-  // Everyone using this now uses the replacement.
-  replaceAllUsesWith(Replacement);
-
-  // Delete the old constant!
-  destroyConstant();
+  // Otherwise update operands in place; a non-null result replaces this.
+  if (Constant *C = getContext().pImpl->VectorConstants.replaceOperandsInPlace(
+          Values, this, From, ToC, NumUpdated, U - OperandList))
+    replaceUsesOfWithOnConstantImpl(C);
 }
 
 void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV,
@@ -2808,19 +2882,26 @@
   Constant *To = cast<Constant>(ToV);
 
   SmallVector<Constant*, 8> NewOps;
+  unsigned NumUpdated = 0;
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     Constant *Op = getOperand(i);
-    NewOps.push_back(Op == From ? To : Op);
+    if (Op == From) {
+      ++NumUpdated;
+      Op = To;
+    }
+    NewOps.push_back(Op);
+  }
+  assert(NumUpdated && "I didn't contain From!");
+
+  if (Constant *C = getWithOperands(NewOps, getType(), true)) {
+    replaceUsesOfWithOnConstantImpl(C);
+    return;
   }
 
-  Constant *Replacement = getWithOperands(NewOps);
-  assert(Replacement != this && "I didn't contain From!");
-
-  // Everyone using this now uses the replacement.
-  replaceAllUsesWith(Replacement);
-
-  // Delete the old constant!
-  destroyConstant();
+  // Otherwise update operands in place; a non-null result replaces this.
+  if (Constant *C = getContext().pImpl->ExprConstants.replaceOperandsInPlace(
+          NewOps, this, From, To, NumUpdated, U - OperandList))
+    replaceUsesOfWithOnConstantImpl(C);
 }
 
 Instruction *ConstantExpr::getAsInstruction() {

diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index f06509f..571dec2 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CONSTANTSCONTEXT_H
-#define LLVM_CONSTANTSCONTEXT_H
+#ifndef LLVM_LIB_IR_CONSTANTSCONTEXT_H
+#define LLVM_LIB_IR_CONSTANTSCONTEXT_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Hashing.h"
@@ -29,8 +29,6 @@
 #define DEBUG_TYPE "ir"
 
 namespace llvm {
-template<class ValType>
-struct ConstantTraits;
 
 /// UnaryConstantExpr - This class is private to Constants.cpp, and is used
 /// behind the scenes to implement unary constant exprs.
@@ -169,11 +167,10 @@
   void *operator new(size_t s) {
     return User::operator new(s, 1);
   }
-  ExtractValueConstantExpr(Constant *Agg,
-                           const SmallVector<unsigned, 4> &IdxList,
+  ExtractValueConstantExpr(Constant *Agg, ArrayRef<unsigned> IdxList,
                            Type *DestTy)
-    : ConstantExpr(DestTy, Instruction::ExtractValue, &Op<0>(), 1),
-      Indices(IdxList) {
+      : ConstantExpr(DestTy, Instruction::ExtractValue, &Op<0>(), 1),
+        Indices(IdxList.begin(), IdxList.end()) { // Copy out of the ArrayRef.
     Op<0>() = Agg;
   }
 
@@ -196,10 +193,9 @@
     return User::operator new(s, 2);
   }
   InsertValueConstantExpr(Constant *Agg, Constant *Val,
-                          const SmallVector<unsigned, 4> &IdxList,
-                          Type *DestTy)
-    : ConstantExpr(DestTy, Instruction::InsertValue, &Op<0>(), 2),
-      Indices(IdxList) {
+                          ArrayRef<unsigned> IdxList, Type *DestTy)
+      : ConstantExpr(DestTy, Instruction::InsertValue, &Op<0>(), 2),
+        Indices(IdxList.begin(), IdxList.end()) { // Copy out of the ArrayRef.
     Op<0>() = Agg;
     Op<1>() = Val;
   }
@@ -316,379 +312,241 @@
 };
 DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value)
 
-struct ExprMapKeyType {
-  ExprMapKeyType(unsigned opc,
-      ArrayRef<Constant*> ops,
-      unsigned short flags = 0,
-      unsigned short optionalflags = 0,
-      ArrayRef<unsigned> inds = None)
-        : opcode(opc), subclassoptionaldata(optionalflags), subclassdata(flags),
-        operands(ops.begin(), ops.end()), indices(inds.begin(), inds.end()) {}
-  uint8_t opcode;
-  uint8_t subclassoptionaldata;
-  uint16_t subclassdata;
-  std::vector<Constant*> operands;
-  SmallVector<unsigned, 4> indices;
-  bool operator==(const ExprMapKeyType& that) const {
-    return this->opcode == that.opcode &&
-           this->subclassdata == that.subclassdata &&
-           this->subclassoptionaldata == that.subclassoptionaldata &&
-           this->operands == that.operands &&
-           this->indices == that.indices;
-  }
-  bool operator<(const ExprMapKeyType & that) const {
-    return std::tie(opcode, operands, subclassdata, subclassoptionaldata,
-                    indices) <
-           std::tie(that.opcode, that.operands, that.subclassdata,
-                    that.subclassoptionaldata, that.indices);
+template <class ConstantClass> struct ConstantAggrKeyType;
+struct InlineAsmKeyType;
+struct ConstantExprKeyType;
+
+template <class ConstantClass> struct ConstantInfo;
+template <> struct ConstantInfo<ConstantExpr> {
+  typedef ConstantExprKeyType ValType; // Key type used for uniquing lookups.
+  typedef Type TypeClass;              // Type class handed to the key's create().
+};
+template <> struct ConstantInfo<InlineAsm> {
+  typedef InlineAsmKeyType ValType;
+  typedef PointerType TypeClass;
+};
+template <> struct ConstantInfo<ConstantArray> {
+  typedef ConstantAggrKeyType<ConstantArray> ValType;
+  typedef ArrayType TypeClass;
+};
+template <> struct ConstantInfo<ConstantStruct> {
+  typedef ConstantAggrKeyType<ConstantStruct> ValType;
+  typedef StructType TypeClass;
+};
+template <> struct ConstantInfo<ConstantVector> {
+  typedef ConstantAggrKeyType<ConstantVector> ValType;
+  typedef VectorType TypeClass;
+};
+
+template <class ConstantClass> struct ConstantAggrKeyType {
+  ArrayRef<Constant *> Operands; // Non-owning view of the operand list.
+  ConstantAggrKeyType(ArrayRef<Constant *> Operands) : Operands(Operands) {}
+  ConstantAggrKeyType(ArrayRef<Constant *> Operands, const ConstantClass *)
+      : Operands(Operands) {} // The ConstantClass argument is unused.
+  ConstantAggrKeyType(const ConstantClass *C,
+                      SmallVectorImpl<Constant *> &Storage) {
+    assert(Storage.empty() && "Expected empty storage");
+    for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I)
+      Storage.push_back(C->getOperand(I));
+    Operands = Storage; // Operands now aliases the caller-provided Storage.
   }
 
-  bool operator!=(const ExprMapKeyType& that) const {
-    return !(*this == that);
+  bool operator==(const ConstantAggrKeyType &X) const {
+    return Operands == X.Operands;
+  }
+  bool operator==(const ConstantClass *C) const {
+    if (Operands.size() != C->getNumOperands())
+      return false;
+    for (unsigned I = 0, E = Operands.size(); I != E; ++I)
+      if (Operands[I] != C->getOperand(I))
+        return false;
+    return true;
+  }
+  unsigned getHash() const {
+    return hash_combine_range(Operands.begin(), Operands.end());
+  }
+
+  typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
+  ConstantClass *create(TypeClass *Ty) const {
+    return new (Operands.size()) ConstantClass(Ty, Operands);
   }
 };
 
 struct InlineAsmKeyType {
-  InlineAsmKeyType(StringRef AsmString,
-                   StringRef Constraints, bool hasSideEffects,
-                   bool isAlignStack, InlineAsm::AsmDialect asmDialect)
-    : asm_string(AsmString), constraints(Constraints),
-      has_side_effects(hasSideEffects), is_align_stack(isAlignStack),
-      asm_dialect(asmDialect) {}
-  std::string asm_string;
-  std::string constraints;
-  bool has_side_effects;
-  bool is_align_stack;
-  InlineAsm::AsmDialect asm_dialect;
-  bool operator==(const InlineAsmKeyType& that) const {
-    return this->asm_string == that.asm_string &&
-           this->constraints == that.constraints &&
-           this->has_side_effects == that.has_side_effects &&
-           this->is_align_stack == that.is_align_stack &&
-           this->asm_dialect == that.asm_dialect;
+  StringRef AsmString;   // Non-owning reference to the asm text.
+  StringRef Constraints; // Non-owning reference to the constraint text.
+  bool HasSideEffects;
+  bool IsAlignStack;
+  InlineAsm::AsmDialect AsmDialect;
+
+  InlineAsmKeyType(StringRef AsmString, StringRef Constraints,
+                   bool HasSideEffects, bool IsAlignStack,
+                   InlineAsm::AsmDialect AsmDialect)
+      : AsmString(AsmString), Constraints(Constraints),
+        HasSideEffects(HasSideEffects), IsAlignStack(IsAlignStack),
+        AsmDialect(AsmDialect) {}
+  InlineAsmKeyType(const InlineAsm *Asm, SmallVectorImpl<Constant *> &)
+      : AsmString(Asm->getAsmString()), Constraints(Asm->getConstraintString()),
+        HasSideEffects(Asm->hasSideEffects()),
+        IsAlignStack(Asm->isAlignStack()), AsmDialect(Asm->getDialect()) {}
+
+  bool operator==(const InlineAsmKeyType &X) const {
+    return HasSideEffects == X.HasSideEffects &&
+           IsAlignStack == X.IsAlignStack && AsmDialect == X.AsmDialect &&
+           AsmString == X.AsmString && Constraints == X.Constraints;
   }
-  bool operator<(const InlineAsmKeyType& that) const {
-    return std::tie(asm_string, constraints, has_side_effects, is_align_stack,
-                    asm_dialect) <
-           std::tie(that.asm_string, that.constraints, that.has_side_effects,
-                    that.is_align_stack, that.asm_dialect);
+  bool operator==(const InlineAsm *Asm) const {
+    return HasSideEffects == Asm->hasSideEffects() &&
+           IsAlignStack == Asm->isAlignStack() &&
+           AsmDialect == Asm->getDialect() &&
+           AsmString == Asm->getAsmString() &&
+           Constraints == Asm->getConstraintString();
+  }
+  unsigned getHash() const {
+    return hash_combine(AsmString, Constraints, HasSideEffects, IsAlignStack,
+                        AsmDialect);
   }
 
-  bool operator!=(const InlineAsmKeyType& that) const {
-    return !(*this == that);
+  typedef ConstantInfo<InlineAsm>::TypeClass TypeClass;
+  InlineAsm *create(TypeClass *Ty) const {
+    return new InlineAsm(Ty, AsmString, Constraints, HasSideEffects,
+                         IsAlignStack, AsmDialect);
   }
 };
 
-// The number of operands for each ConstantCreator::create method is
-// determined by the ConstantTraits template.
-// ConstantCreator - A class that is used to create constants by
-// ConstantUniqueMap*.  This class should be partially specialized if there is
-// something strange that needs to be done to interface to the ctor for the
-// constant.
-//
-template<typename T, typename Alloc>
-struct ConstantTraits< std::vector<T, Alloc> > {
-  static unsigned uses(const std::vector<T, Alloc>& v) {
-    return v.size();
-  }
-};
+struct ConstantExprKeyType {
+  uint8_t Opcode;
+  uint8_t SubclassOptionalData;
+  uint16_t SubclassData; // Predicate for ICmp/FCmp keys; see create().
+  ArrayRef<Constant *> Ops; // Non-owning view of the operands.
+  ArrayRef<unsigned> Indexes; // Non-owning view; defaults to empty.
 
-template<>
-struct ConstantTraits<Constant *> {
-  static unsigned uses(Constant * const & v) {
-    return 1;
+  ConstantExprKeyType(unsigned Opcode, ArrayRef<Constant *> Ops,
+                      unsigned short SubclassData = 0,
+                      unsigned short SubclassOptionalData = 0,
+                      ArrayRef<unsigned> Indexes = None)
+      : Opcode(Opcode), SubclassOptionalData(SubclassOptionalData),
+        SubclassData(SubclassData), Ops(Ops), Indexes(Indexes) {}
+  ConstantExprKeyType(ArrayRef<Constant *> Operands, const ConstantExpr *CE)
+      : Opcode(CE->getOpcode()),
+        SubclassOptionalData(CE->getRawSubclassOptionalData()),
+        SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands),
+        Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()) {}
+  ConstantExprKeyType(const ConstantExpr *CE,
+                      SmallVectorImpl<Constant *> &Storage)
+      : Opcode(CE->getOpcode()),
+        SubclassOptionalData(CE->getRawSubclassOptionalData()),
+        SubclassData(CE->isCompare() ? CE->getPredicate() : 0),
+        Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()) {
+    assert(Storage.empty() && "Expected empty storage");
+    for (unsigned I = 0, E = CE->getNumOperands(); I != E; ++I)
+      Storage.push_back(CE->getOperand(I));
+    Ops = Storage; // Ops now aliases the caller-provided Storage.
   }
-};
 
-template<class ConstantClass, class TypeClass, class ValType>
-struct ConstantCreator {
-  static ConstantClass *create(TypeClass *Ty, const ValType &V) {
-    return new(ConstantTraits<ValType>::uses(V)) ConstantClass(Ty, V);
+  bool operator==(const ConstantExprKeyType &X) const {
+    return Opcode == X.Opcode && SubclassData == X.SubclassData &&
+           SubclassOptionalData == X.SubclassOptionalData && Ops == X.Ops &&
+           Indexes == X.Indexes;
   }
-};
 
-template<class ConstantClass, class TypeClass>
-struct ConstantArrayCreator {
-  static ConstantClass *create(TypeClass *Ty, ArrayRef<Constant*> V) {
-    return new(V.size()) ConstantClass(Ty, V);
+  bool operator==(const ConstantExpr *CE) const {
+    if (Opcode != CE->getOpcode())
+      return false;
+    if (SubclassOptionalData != CE->getRawSubclassOptionalData())
+      return false;
+    if (Ops.size() != CE->getNumOperands())
+      return false;
+    if (SubclassData != (CE->isCompare() ? CE->getPredicate() : 0))
+      return false;
+    for (unsigned I = 0, E = Ops.size(); I != E; ++I)
+      if (Ops[I] != CE->getOperand(I))
+        return false;
+    if (Indexes != (CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()))
+      return false;
+    return true;
   }
-};
 
-template<class ConstantClass>
-struct ConstantKeyData {
-  typedef void ValType;
-  static ValType getValType(ConstantClass *C) {
-    llvm_unreachable("Unknown Constant type!");
+  unsigned getHash() const {
+    return hash_combine(Opcode, SubclassOptionalData, SubclassData,
+                        hash_combine_range(Ops.begin(), Ops.end()),
+                        hash_combine_range(Indexes.begin(), Indexes.end()));
   }
-};
 
-template<>
-struct ConstantCreator<ConstantExpr, Type, ExprMapKeyType> {
-  static ConstantExpr *create(Type *Ty, const ExprMapKeyType &V,
-      unsigned short pred = 0) {
-    if (Instruction::isCast(V.opcode))
-      return new UnaryConstantExpr(V.opcode, V.operands[0], Ty);
-    if ((V.opcode >= Instruction::BinaryOpsBegin &&
-         V.opcode < Instruction::BinaryOpsEnd))
-      return new BinaryConstantExpr(V.opcode, V.operands[0], V.operands[1],
-                                    V.subclassoptionaldata);
-    if (V.opcode == Instruction::Select)
-      return new SelectConstantExpr(V.operands[0], V.operands[1], 
-                                    V.operands[2]);
-    if (V.opcode == Instruction::ExtractElement)
-      return new ExtractElementConstantExpr(V.operands[0], V.operands[1]);
-    if (V.opcode == Instruction::InsertElement)
-      return new InsertElementConstantExpr(V.operands[0], V.operands[1],
-                                           V.operands[2]);
-    if (V.opcode == Instruction::ShuffleVector)
-      return new ShuffleVectorConstantExpr(V.operands[0], V.operands[1],
-                                           V.operands[2]);
-    if (V.opcode == Instruction::InsertValue)
-      return new InsertValueConstantExpr(V.operands[0], V.operands[1],
-                                         V.indices, Ty);
-    if (V.opcode == Instruction::ExtractValue)
-      return new ExtractValueConstantExpr(V.operands[0], V.indices, Ty);
-    if (V.opcode == Instruction::GetElementPtr) {
-      std::vector<Constant*> IdxList(V.operands.begin()+1, V.operands.end());
-      return GetElementPtrConstantExpr::Create(V.operands[0], IdxList, Ty,
-                                               V.subclassoptionaldata);
+  typedef ConstantInfo<ConstantExpr>::TypeClass TypeClass;
+  ConstantExpr *create(TypeClass *Ty) const {
+    switch (Opcode) {
+    default: // Casts and binary ops are matched by predicate/range below.
+      if (Instruction::isCast(Opcode))
+        return new UnaryConstantExpr(Opcode, Ops[0], Ty);
+      if ((Opcode >= Instruction::BinaryOpsBegin &&
+           Opcode < Instruction::BinaryOpsEnd))
+        return new BinaryConstantExpr(Opcode, Ops[0], Ops[1],
+                                      SubclassOptionalData);
+      llvm_unreachable("Invalid ConstantExpr!");
+    case Instruction::Select:
+      return new SelectConstantExpr(Ops[0], Ops[1], Ops[2]);
+    case Instruction::ExtractElement:
+      return new ExtractElementConstantExpr(Ops[0], Ops[1]);
+    case Instruction::InsertElement:
+      return new InsertElementConstantExpr(Ops[0], Ops[1], Ops[2]);
+    case Instruction::ShuffleVector:
+      return new ShuffleVectorConstantExpr(Ops[0], Ops[1], Ops[2]);
+    case Instruction::InsertValue:
+      return new InsertValueConstantExpr(Ops[0], Ops[1], Indexes, Ty);
+    case Instruction::ExtractValue:
+      return new ExtractValueConstantExpr(Ops[0], Indexes, Ty);
+    case Instruction::GetElementPtr:
+      return GetElementPtrConstantExpr::Create(Ops[0], Ops.slice(1), Ty,
+                                               SubclassOptionalData);
+    case Instruction::ICmp:
+      return new CompareConstantExpr(Ty, Instruction::ICmp, SubclassData,
+                                     Ops[0], Ops[1]);
+    case Instruction::FCmp:
+      return new CompareConstantExpr(Ty, Instruction::FCmp, SubclassData,
+                                     Ops[0], Ops[1]);
     }
-
-    // The compare instructions are weird. We have to encode the predicate
-    // value and it is combined with the instruction opcode by multiplying
-    // the opcode by one hundred. We must decode this to get the predicate.
-    if (V.opcode == Instruction::ICmp)
-      return new CompareConstantExpr(Ty, Instruction::ICmp, V.subclassdata,
-                                     V.operands[0], V.operands[1]);
-    if (V.opcode == Instruction::FCmp) 
-      return new CompareConstantExpr(Ty, Instruction::FCmp, V.subclassdata,
-                                     V.operands[0], V.operands[1]);
-    llvm_unreachable("Invalid ConstantExpr!");
   }
 };
 
-template<>
-struct ConstantKeyData<ConstantExpr> {
-  typedef ExprMapKeyType ValType;
-  static ValType getValType(ConstantExpr *CE) {
-    std::vector<Constant*> Operands;
-    Operands.reserve(CE->getNumOperands());
-    for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i)
-      Operands.push_back(cast<Constant>(CE->getOperand(i)));
-    return ExprMapKeyType(CE->getOpcode(), Operands,
-        CE->isCompare() ? CE->getPredicate() : 0,
-        CE->getRawSubclassOptionalData(),
-        CE->hasIndices() ?
-          CE->getIndices() : ArrayRef<unsigned>());
-  }
-};
-
-template<>
-struct ConstantCreator<InlineAsm, PointerType, InlineAsmKeyType> {
-  static InlineAsm *create(PointerType *Ty, const InlineAsmKeyType &Key) {
-    return new InlineAsm(Ty, Key.asm_string, Key.constraints,
-                         Key.has_side_effects, Key.is_align_stack,
-                         Key.asm_dialect);
-  }
-};
-
-template<>
-struct ConstantKeyData<InlineAsm> {
-  typedef InlineAsmKeyType ValType;
-  static ValType getValType(InlineAsm *Asm) {
-    return InlineAsmKeyType(Asm->getAsmString(), Asm->getConstraintString(),
-                            Asm->hasSideEffects(), Asm->isAlignStack(),
-                            Asm->getDialect());
-  }
-};
-
-template<class ValType, class ValRefType, class TypeClass, class ConstantClass,
-         bool HasLargeKey = false /*true for arrays and structs*/ >
-class ConstantUniqueMap {
+template <class ConstantClass> class ConstantUniqueMap {
 public:
-  typedef std::pair<TypeClass*, ValType> MapKey;
-  typedef std::map<MapKey, ConstantClass *> MapTy;
-  typedef std::map<ConstantClass *, typename MapTy::iterator> InverseMapTy;
-private:
-  /// Map - This is the main map from the element descriptor to the Constants.
-  /// This is the primary way we avoid creating two of the same shape
-  /// constant.
-  MapTy Map;
-    
-  /// InverseMap - If "HasLargeKey" is true, this contains an inverse mapping
-  /// from the constants to their element in Map.  This is important for
-  /// removal of constants from the array, which would otherwise have to scan
-  /// through the map with very large keys.
-  InverseMapTy InverseMap;
+  typedef typename ConstantInfo<ConstantClass>::ValType ValType;
+  typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
+  typedef std::pair<TypeClass *, ValType> LookupKey;
 
-public:
-  typename MapTy::iterator map_begin() { return Map.begin(); }
-  typename MapTy::iterator map_end() { return Map.end(); }
-
-  void freeConstants() {
-    for (typename MapTy::iterator I=Map.begin(), E=Map.end();
-         I != E; ++I) {
-      // Asserts that use_empty().
-      delete I->second;
-    }
-  }
-    
-  /// InsertOrGetItem - Return an iterator for the specified element.
-  /// If the element exists in the map, the returned iterator points to the
-  /// entry and Exists=true.  If not, the iterator points to the newly
-  /// inserted entry and returns Exists=false.  Newly inserted entries have
-  /// I->second == 0, and should be filled in.
-  typename MapTy::iterator InsertOrGetItem(std::pair<MapKey, ConstantClass *>
-                                 &InsertVal,
-                                 bool &Exists) {
-    std::pair<typename MapTy::iterator, bool> IP = Map.insert(InsertVal);
-    Exists = !IP.second;
-    return IP.first;
-  }
-    
-private:
-  typename MapTy::iterator FindExistingElement(ConstantClass *CP) {
-    if (HasLargeKey) {
-      typename InverseMapTy::iterator IMI = InverseMap.find(CP);
-      assert(IMI != InverseMap.end() && IMI->second != Map.end() &&
-             IMI->second->second == CP &&
-             "InverseMap corrupt!");
-      return IMI->second;
-    }
-      
-    typename MapTy::iterator I =
-      Map.find(MapKey(static_cast<TypeClass*>(CP->getType()),
-                      ConstantKeyData<ConstantClass>::getValType(CP)));
-    if (I == Map.end() || I->second != CP) {
-      // FIXME: This should not use a linear scan.  If this gets to be a
-      // performance problem, someone should look at this.
-      for (I = Map.begin(); I != Map.end() && I->second != CP; ++I)
-        /* empty */;
-    }
-    return I;
-  }
-
-  ConstantClass *Create(TypeClass *Ty, ValRefType V,
-                        typename MapTy::iterator I) {
-    ConstantClass* Result =
-      ConstantCreator<ConstantClass,TypeClass,ValType>::create(Ty, V);
-
-    assert(Result->getType() == Ty && "Type specified is not correct!");
-    I = Map.insert(I, std::make_pair(MapKey(Ty, V), Result));
-
-    if (HasLargeKey)  // Remember the reverse mapping if needed.
-      InverseMap.insert(std::make_pair(Result, I));
-
-    return Result;
-  }
-public:
-    
-  /// getOrCreate - Return the specified constant from the map, creating it if
-  /// necessary.
-  ConstantClass *getOrCreate(TypeClass *Ty, ValRefType V) {
-    MapKey Lookup(Ty, V);
-    ConstantClass* Result = nullptr;
-    
-    typename MapTy::iterator I = Map.find(Lookup);
-    // Is it in the map?  
-    if (I != Map.end())
-      Result = I->second;
-        
-    if (!Result) {
-      // If no preexisting value, create one now...
-      Result = Create(Ty, V, I);
-    }
-        
-    return Result;
-  }
-
-  void remove(ConstantClass *CP) {
-    typename MapTy::iterator I = FindExistingElement(CP);
-    assert(I != Map.end() && "Constant not found in constant table!");
-    assert(I->second == CP && "Didn't find correct element?");
-
-    if (HasLargeKey)  // Remember the reverse mapping if needed.
-      InverseMap.erase(CP);
-
-    Map.erase(I);
-  }
-
-  /// MoveConstantToNewSlot - If we are about to change C to be the element
-  /// specified by I, update our internal data structures to reflect this
-  /// fact.
-  void MoveConstantToNewSlot(ConstantClass *C, typename MapTy::iterator I) {
-    // First, remove the old location of the specified constant in the map.
-    typename MapTy::iterator OldI = FindExistingElement(C);
-    assert(OldI != Map.end() && "Constant not found in constant table!");
-    assert(OldI->second == C && "Didn't find correct element?");
-      
-     // Remove the old entry from the map.
-    Map.erase(OldI);
-    
-    // Update the inverse map so that we know that this constant is now
-    // located at descriptor I.
-    if (HasLargeKey) {
-      assert(I->second == C && "Bad inversemap entry!");
-      InverseMap[C] = I;
-    }
-  }
-
-  void dump() const {
-    DEBUG(dbgs() << "Constant.cpp: ConstantUniqueMap\n");
-  }
-};
-
-// Unique map for aggregate constants
-template<class TypeClass, class ConstantClass>
-class ConstantAggrUniqueMap {
-public:
-  typedef ArrayRef<Constant*> Operands;
-  typedef std::pair<TypeClass*, Operands> LookupKey;
 private:
   struct MapInfo {
-    typedef DenseMapInfo<ConstantClass*> ConstantClassInfo;
-    typedef DenseMapInfo<Constant*> ConstantInfo;
-    typedef DenseMapInfo<TypeClass*> TypeClassInfo;
-    static inline ConstantClass* getEmptyKey() {
+    typedef DenseMapInfo<ConstantClass *> ConstantClassInfo;
+    static inline ConstantClass *getEmptyKey() {
       return ConstantClassInfo::getEmptyKey();
     }
-    static inline ConstantClass* getTombstoneKey() {
+    static inline ConstantClass *getTombstoneKey() {
       return ConstantClassInfo::getTombstoneKey();
     }
     static unsigned getHashValue(const ConstantClass *CP) {
-      SmallVector<Constant*, 8> CPOperands;
-      CPOperands.reserve(CP->getNumOperands());
-      for (unsigned I = 0, E = CP->getNumOperands(); I < E; ++I)
-        CPOperands.push_back(CP->getOperand(I));
-      return getHashValue(LookupKey(CP->getType(), CPOperands));
+      SmallVector<Constant *, 8> Storage;
+      return getHashValue(LookupKey(CP->getType(), ValType(CP, Storage)));
     }
     static bool isEqual(const ConstantClass *LHS, const ConstantClass *RHS) {
       return LHS == RHS;
     }
     static unsigned getHashValue(const LookupKey &Val) {
-      return hash_combine(Val.first, hash_combine_range(Val.second.begin(),
-                                                        Val.second.end()));
+      return hash_combine(Val.first, Val.second.getHash());
     }
     static bool isEqual(const LookupKey &LHS, const ConstantClass *RHS) {
       if (RHS == getEmptyKey() || RHS == getTombstoneKey())
         return false;
-      if (LHS.first != RHS->getType()
-          || LHS.second.size() != RHS->getNumOperands())
+      if (LHS.first != RHS->getType())
         return false;
-      for (unsigned I = 0, E = RHS->getNumOperands(); I < E; ++I) {
-        if (LHS.second[I] != RHS->getOperand(I))
-          return false;
-      }
-      return true;
+      return LHS.second == RHS;
     }
   };
+
 public:
   typedef DenseMap<ConstantClass *, char, MapInfo> MapTy;
 
 private:
-  /// Map - This is the main map from the element descriptor to the Constants.
-  /// This is the primary way we avoid creating two of the same shape
-  /// constant.
   MapTy Map;
 
 public:
@@ -696,44 +554,33 @@
   typename MapTy::iterator map_end() { return Map.end(); }
 
   void freeConstants() {
-    for (typename MapTy::iterator I=Map.begin(), E=Map.end();
-         I != E; ++I) {
+    for (auto &I : Map)
       // Asserts that use_empty().
-      delete I->first;
-    }
+      delete I.first;
   }
 
 private:
-  typename MapTy::iterator findExistingElement(ConstantClass *CP) {
-    return Map.find(CP);
-  }
-
-  ConstantClass *Create(TypeClass *Ty, Operands V, typename MapTy::iterator I) {
-    ConstantClass* Result =
-      ConstantArrayCreator<ConstantClass,TypeClass>::create(Ty, V);
+  ConstantClass *create(TypeClass *Ty, ValType V) {
+    ConstantClass *Result = V.create(Ty);
 
     assert(Result->getType() == Ty && "Type specified is not correct!");
-    Map[Result] = '\0';
+    insert(Result);
 
     return Result;
   }
+
 public:
-
-  /// getOrCreate - Return the specified constant from the map, creating it if
-  /// necessary.
-  ConstantClass *getOrCreate(TypeClass *Ty, Operands V) {
+  /// Return the specified constant from the map, creating it if necessary.
+  ConstantClass *getOrCreate(TypeClass *Ty, ValType V) {
     LookupKey Lookup(Ty, V);
-    ConstantClass* Result = nullptr;
+    ConstantClass *Result = nullptr;
 
-    typename MapTy::iterator I = Map.find_as(Lookup);
-    // Is it in the map?
-    if (I != Map.end())
+    auto I = find(Lookup);
+    if (I == Map.end())
+      Result = create(Ty, V);
+    else
       Result = I->first;
-
-    if (!Result) {
-      // If no preexisting value, create one now...
-      Result = Create(Ty, V, I);
-    }
+    assert(Result && "Unexpected nullptr");
 
     return Result;
   }
@@ -744,23 +591,44 @@
   }
 
   /// Insert the constant into its proper slot.
-  void insert(ConstantClass *CP) {
-    Map[CP] = '\0';
-  }
+  void insert(ConstantClass *CP) { Map[CP] = '\0'; }
 
   /// Remove this constant from the map
   void remove(ConstantClass *CP) {
-    typename MapTy::iterator I = findExistingElement(CP);
+    typename MapTy::iterator I = Map.find(CP);
     assert(I != Map.end() && "Constant not found in constant table!");
     assert(I->first == CP && "Didn't find correct element?");
     Map.erase(I);
   }
 
-  void dump() const {
-    DEBUG(dbgs() << "Constant.cpp: ConstantUniqueMap\n");
+  ConstantClass *replaceOperandsInPlace(ArrayRef<Constant *> Operands,
+                                        ConstantClass *CP, Value *From,
+                                        Constant *To, unsigned NumUpdated = 0,
+                                        unsigned OperandNo = ~0u) {
+    LookupKey Lookup(CP->getType(), ValType(Operands, CP));
+    auto I = find(Lookup);
+    if (I != Map.end())
+      return I->first;
+
+    // Update to the new value.  Optimize for the case when we have a single
+    // operand that we're changing, but handle bulk updates efficiently.
+    remove(CP);
+    if (NumUpdated == 1) {
+      assert(OperandNo < CP->getNumOperands() && "Invalid index");
+      assert(CP->getOperand(OperandNo) != To && "I didn't contain From!");
+      CP->setOperand(OperandNo, To);
+    } else {
+      for (unsigned I = 0, E = CP->getNumOperands(); I != E; ++I)
+        if (CP->getOperand(I) == From)
+          CP->setOperand(I, To);
+    }
+    insert(CP);
+    return nullptr;
   }
+
+  void dump() const { DEBUG(dbgs() << "Constant.cpp: ConstantUniqueMap\n"); }
 };
 
-}
+} // end namespace llvm
 
 #endif

diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 87099a6..3576137 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp

@@ -183,20 +183,22 @@
 
 LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
                                char **ErrorMessage) {
-  std::string error;
-  raw_fd_ostream dest(Filename, error, sys::fs::F_Text);
-  if (!error.empty()) {
-    *ErrorMessage = strdup(error.c_str());
+  std::error_code EC;
+  raw_fd_ostream dest(Filename, EC, sys::fs::F_Text);
+  if (EC) {
+    *ErrorMessage = strdup(EC.message().c_str());
     return true;
   }
 
   unwrap(M)->print(dest, nullptr);
 
-  if (!error.empty()) {
-    *ErrorMessage = strdup(error.c_str());
+  dest.close();
+
+  if (dest.has_error()) {
+    *ErrorMessage = strdup("Error printing to file");
     return true;
   }
-  dest.flush();
+
   return false;
 }
 
@@ -558,8 +560,8 @@
 }
 
 void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef MD) {
-  unwrap<Instruction>(Inst)->setMetadata(KindID,
-                                         MD ? unwrap<MDNode>(MD) : nullptr);
+  unwrap<Instruction>(Inst)
+      ->setMetadata(KindID, MD ? unwrap<MDNode>(MD) : nullptr);
 }
 
 /*--.. Conversion functions ................................................--*/
@@ -603,6 +605,11 @@
   return wrap(cast<User>(V)->getOperand(Index));
 }
 
+LLVMUseRef LLVMGetOperandUse(LLVMValueRef Val, unsigned Index) {
+  Value *V = unwrap(Val);
+  return wrap(&cast<User>(V)->getOperandUse(Index));
+}
+
 void LLVMSetOperand(LLVMValueRef Val, unsigned Index, LLVMValueRef Op) {
   unwrap<User>(Val)->setOperand(Index, unwrap(Op));
 }
@@ -767,6 +774,27 @@
   return unwrap<ConstantInt>(ConstantVal)->getSExtValue();
 }
 
+double LLVMConstRealGetDouble(LLVMValueRef ConstantVal, LLVMBool *LosesInfo) {
+  ConstantFP *cFP = unwrap<ConstantFP>(ConstantVal) ;
+  Type *Ty = cFP->getType();
+
+  if (Ty->isFloatTy()) {
+    *LosesInfo = false;
+    return cFP->getValueAPF().convertToFloat();
+  }
+
+  if (Ty->isDoubleTy()) {
+    *LosesInfo = false;
+    return cFP->getValueAPF().convertToDouble();
+  }
+
+  bool APFLosesInfo;
+  APFloat APF = cFP->getValueAPF();
+  APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &APFLosesInfo);
+  *LosesInfo = APFLosesInfo;
+  return APF.convertToDouble();
+}
+
 /*--.. Operations on composite constants ...................................--*/
 
 LLVMValueRef LLVMConstStringInContext(LLVMContextRef C, const char *Str,
@@ -790,11 +818,27 @@
   return LLVMConstStringInContext(LLVMGetGlobalContext(), Str, Length,
                                   DontNullTerminate);
 }
+
+LLVMValueRef LLVMGetElementAsConstant(LLVMValueRef c, unsigned idx) {
+  return wrap(static_cast<ConstantDataSequential*>(unwrap(c))->getElementAsConstant(idx));
+}
+
+LLVMBool LLVMIsConstantString(LLVMValueRef c) {
+  return static_cast<ConstantDataSequential*>(unwrap(c))->isString();
+}
+
+const char *LLVMGetAsString(LLVMValueRef c, size_t* Length) {
+  StringRef str = static_cast<ConstantDataSequential*>(unwrap(c))->getAsString();
+  *Length = str.size();
+  return str.data();
+}
+
 LLVMValueRef LLVMConstArray(LLVMTypeRef ElementTy,
                             LLVMValueRef *ConstantVals, unsigned Length) {
   ArrayRef<Constant*> V(unwrap<Constant>(ConstantVals, Length), Length);
   return wrap(ConstantArray::get(ArrayType::get(unwrap(ElementTy), Length), V));
 }
+
 LLVMValueRef LLVMConstStruct(LLVMValueRef *ConstantVals, unsigned Count,
                              LLVMBool Packed) {
   return LLVMConstStructInContext(LLVMGetGlobalContext(), ConstantVals, Count,
@@ -1859,12 +1903,27 @@
   return (LLVMIntPredicate)0;
 }
 
+LLVMRealPredicate LLVMGetFCmpPredicate(LLVMValueRef Inst) {
+  if (FCmpInst *I = dyn_cast<FCmpInst>(unwrap(Inst)))
+    return (LLVMRealPredicate)I->getPredicate();
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(unwrap(Inst)))
+    if (CE->getOpcode() == Instruction::FCmp)
+      return (LLVMRealPredicate)CE->getPredicate();
+  return (LLVMRealPredicate)0;
+}
+
 LLVMOpcode LLVMGetInstructionOpcode(LLVMValueRef Inst) {
   if (Instruction *C = dyn_cast<Instruction>(unwrap(Inst)))
     return map_to_llvmopcode(C->getOpcode());
   return (LLVMOpcode)0;
 }
 
+LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst) {
+  if (Instruction *C = dyn_cast<Instruction>(unwrap(Inst)))
+    return wrap(C->clone());
+  return nullptr;
+}
+
 /*--.. Call and invoke instructions ........................................--*/
 
 unsigned LLVMGetInstructionCallConv(LLVMValueRef Instr) {
@@ -1926,6 +1985,34 @@
   unwrap<CallInst>(Call)->setTailCall(isTailCall);
 }
 
+/*--.. Operations on terminators ...........................................--*/
+
+unsigned LLVMGetNumSuccessors(LLVMValueRef Term) {
+  return unwrap<TerminatorInst>(Term)->getNumSuccessors();
+}
+
+LLVMBasicBlockRef LLVMGetSuccessor(LLVMValueRef Term, unsigned i) {
+  return wrap(unwrap<TerminatorInst>(Term)->getSuccessor(i));
+}
+
+void LLVMSetSuccessor(LLVMValueRef Term, unsigned i, LLVMBasicBlockRef block) {
+  return unwrap<TerminatorInst>(Term)->setSuccessor(i,unwrap(block));
+}
+
+/*--.. Operations on branch instructions (only) ............................--*/
+
+LLVMBool LLVMIsConditional(LLVMValueRef Branch) {
+  return unwrap<BranchInst>(Branch)->isConditional();
+}
+
+LLVMValueRef LLVMGetCondition(LLVMValueRef Branch) {
+  return wrap(unwrap<BranchInst>(Branch)->getCondition());
+}
+
+void LLVMSetCondition(LLVMValueRef Branch, LLVMValueRef Cond) {
+  return unwrap<BranchInst>(Branch)->setCondition(unwrap(Cond));
+}
+
 /*--.. Operations on switch instructions (only) ............................--*/
 
 LLVMBasicBlockRef LLVMGetSwitchDefaultDest(LLVMValueRef Switch) {
@@ -2313,7 +2400,7 @@
     case LLVMAtomicOrderingSequentiallyConsistent:
       return SequentiallyConsistent;
   }
-  
+
   llvm_unreachable("Invalid LLVMAtomicOrdering value!");
 }
 
@@ -2632,10 +2719,9 @@
     const char *BufferName,
     LLVMBool RequiresNullTerminator) {
 
-  return wrap(MemoryBuffer::getMemBuffer(
-      StringRef(InputData, InputDataLength),
-      StringRef(BufferName),
-      RequiresNullTerminator));
+  return wrap(MemoryBuffer::getMemBuffer(StringRef(InputData, InputDataLength),
+                                         StringRef(BufferName),
+                                         RequiresNullTerminator).release());
 }
 
 LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRangeCopy(
@@ -2643,9 +2729,9 @@
     size_t InputDataLength,
     const char *BufferName) {
 
-  return wrap(MemoryBuffer::getMemBufferCopy(
-      StringRef(InputData, InputDataLength),
-      StringRef(BufferName)));
+  return wrap(
+      MemoryBuffer::getMemBufferCopy(StringRef(InputData, InputDataLength),
+                                     StringRef(BufferName)).release());
 }
 
 const char *LLVMGetBufferStart(LLVMMemoryBufferRef MemBuf) {

diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index 218787c..4fe2be6 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp

@@ -23,10 +23,29 @@
 using namespace llvm;
 using namespace llvm::dwarf;
 
-static Constant *GetTagConstant(LLVMContext &VMContext, unsigned Tag) {
-  assert((Tag & LLVMDebugVersionMask) == 0 &&
-         "Tag too large for debug encoding!");
-  return ConstantInt::get(Type::getInt32Ty(VMContext), Tag | LLVMDebugVersion);
+namespace {
+class HeaderBuilder {
+  SmallVector<char, 256> Chars;
+
+public:
+  explicit HeaderBuilder(Twine T) { T.toVector(Chars); }
+  HeaderBuilder(const HeaderBuilder &X) : Chars(X.Chars) {}
+  HeaderBuilder(HeaderBuilder &&X) : Chars(std::move(X.Chars)) {}
+
+  template <class Twineable> HeaderBuilder &concat(Twineable &&X) {
+    Chars.push_back(0);
+    Twine(X).toVector(Chars);
+    return *this;
+  }
+
+  MDString *get(LLVMContext &Context) const {
+    return MDString::get(Context, StringRef(Chars.begin(), Chars.size()));
+  }
+
+  static HeaderBuilder get(unsigned Tag) {
+    return HeaderBuilder("0x" + Twine::utohexstr(Tag));
+  }
+};
 }
 
 DIBuilder::DIBuilder(Module &m)
@@ -34,7 +53,6 @@
       TempRetainTypes(nullptr), TempSubprograms(nullptr), TempGVs(nullptr),
       DeclareFn(nullptr), ValueFn(nullptr) {}
 
-/// finalize - Construct any deferred debug info descriptors.
 void DIBuilder::finalize() {
   DIArray Enums = getOrCreateArray(AllEnumTypes);
   DIType(TempEnumTypes).replaceAllUsesWith(Enums);
@@ -46,7 +64,7 @@
   // TrackingVHs back into Values.
   SmallPtrSet<Value *, 16> RetainSet;
   for (unsigned I = 0, E = AllRetainTypes.size(); I < E; I++)
-    if (RetainSet.insert(AllRetainTypes[I]))
+    if (RetainSet.insert(AllRetainTypes[I]).second)
       RetainValues.push_back(AllRetainTypes[I]);
   DIArray RetainTypes = getOrCreateArray(RetainValues);
   DIType(TempRetainTypes).replaceAllUsesWith(RetainTypes);
@@ -55,13 +73,10 @@
   DIType(TempSubprograms).replaceAllUsesWith(SPs);
   for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
     DISubprogram SP(SPs.getElement(i));
-    SmallVector<Value *, 4> Variables;
-    if (NamedMDNode *NMD = getFnSpecificMDNode(M, SP)) {
-      for (unsigned ii = 0, ee = NMD->getNumOperands(); ii != ee; ++ii)
-        Variables.push_back(NMD->getOperand(ii));
-      NMD->eraseFromParent();
-    }
     if (MDNode *Temp = SP.getVariablesNodes()) {
+      SmallVector<Value *, 4> Variables;
+      for (Value *V : PreservedVariables.lookup(SP))
+        Variables.push_back(V);
       DIArray AV = getOrCreateArray(Variables);
       DIType(Temp).replaceAllUsesWith(AV);
     }
@@ -77,8 +92,7 @@
   DIType(TempImportedModules).replaceAllUsesWith(IMs);
 }
 
-/// getNonCompileUnitScope - If N is compile unit return NULL otherwise return
-/// N.
+/// If N is a compile unit, return NULL; otherwise return N.
 static MDNode *getNonCompileUnitScope(MDNode *N) {
   if (DIDescriptor(N).isCompileUnit())
     return nullptr;
@@ -95,8 +109,6 @@
   return MDNode::get(VMContext, Pair);
 }
 
-/// createCompileUnit - A CompileUnit provides an anchor for all debugging
-/// information generated during this instance of compilation.
 DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
                                            StringRef Directory,
                                            StringRef Producer, bool isOptimized,
@@ -110,7 +122,7 @@
          "Invalid Language tag");
   assert(!Filename.empty() &&
          "Unable to create compile unit without filename");
-  Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
+  Value *TElts[] = {HeaderBuilder::get(DW_TAG_base_type).get(VMContext)};
   TempEnumTypes = MDNode::getTemporary(VMContext, TElts);
 
   TempRetainTypes = MDNode::getTemporary(VMContext, TElts);
@@ -121,22 +133,18 @@
 
   TempImportedModules = MDNode::getTemporary(VMContext, TElts);
 
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_compile_unit),
-    createFilePathPair(VMContext, Filename, Directory),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Lang),
-    MDString::get(VMContext, Producer),
-    ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
-    MDString::get(VMContext, Flags),
-    ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeVer),
-    TempEnumTypes,
-    TempRetainTypes,
-    TempSubprograms,
-    TempGVs,
-    TempImportedModules,
-    MDString::get(VMContext, SplitName),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Kind)
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_compile_unit)
+                       .concat(Lang)
+                       .concat(Producer)
+                       .concat(isOptimized)
+                       .concat(Flags)
+                       .concat(RunTimeVer)
+                       .concat(SplitName)
+                       .concat(Kind)
+                       .get(VMContext),
+                   createFilePathPair(VMContext, Filename, Directory),
+                   TempEnumTypes, TempRetainTypes, TempSubprograms, TempGVs,
+                   TempImportedModules};
 
   MDNode *CUNode = MDNode::get(VMContext, Elts);
 
@@ -158,24 +166,9 @@
                      Value *NS, unsigned Line, StringRef Name,
                      SmallVectorImpl<TrackingVH<MDNode>> &AllImportedModules) {
   const MDNode *R;
-  if (Name.empty()) {
-    Value *Elts[] = {
-      GetTagConstant(C, Tag),
-      Context,
-      NS,
-      ConstantInt::get(Type::getInt32Ty(C), Line),
-    };
-    R = MDNode::get(C, Elts);
-  } else {
-    Value *Elts[] = {
-      GetTagConstant(C, Tag),
-      Context,
-      NS,
-      ConstantInt::get(Type::getInt32Ty(C), Line),
-      MDString::get(C, Name)
-    };
-    R = MDNode::get(C, Elts);
-  }
+  Value *Elts[] = {HeaderBuilder::get(Tag).concat(Line).concat(Name).get(C),
+                   Context, NS};
+  R = MDNode::get(C, Elts);
   DIImportedEntity M(R);
   assert(M.Verify() && "Imported module should be valid");
   AllImportedModules.push_back(TrackingVH<MDNode>(M));
@@ -197,10 +190,13 @@
 }
 
 DIImportedEntity DIBuilder::createImportedDeclaration(DIScope Context,
-                                                      DIScope Decl,
+                                                      DIDescriptor Decl,
                                                       unsigned Line, StringRef Name) {
+  // Make sure to use the unique-identifier-based metadata reference for
+  // types that have one.
+  Value *V = Decl.isType() ? static_cast<Value*>(DIType(Decl).getRef()) : Decl;
   return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration,
-                                Context, Decl.getRef(), Line, Name,
+                                Context, V, Line, Name,
                                 AllImportedModules);
 }
 
@@ -211,54 +207,44 @@
                                 Context, Imp, Line, Name, AllImportedModules);
 }
 
-/// createFile - Create a file descriptor to hold debugging information
-/// for a file.
 DIFile DIBuilder::createFile(StringRef Filename, StringRef Directory) {
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_file_type),
-    createFilePathPair(VMContext, Filename, Directory)
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_file_type).get(VMContext),
+                   createFilePathPair(VMContext, Filename, Directory)};
   return DIFile(MDNode::get(VMContext, Elts));
 }
 
-/// createEnumerator - Create a single enumerator value.
 DIEnumerator DIBuilder::createEnumerator(StringRef Name, int64_t Val) {
   assert(!Name.empty() && "Unable to create enumerator without name");
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_enumerator),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt64Ty(VMContext), Val)
-  };
+      HeaderBuilder::get(dwarf::DW_TAG_enumerator).concat(Name).concat(Val).get(
+          VMContext)};
   return DIEnumerator(MDNode::get(VMContext, Elts));
 }
 
-/// \brief Create a DWARF unspecified type.
 DIBasicType DIBuilder::createUnspecifiedType(StringRef Name) {
   assert(!Name.empty() && "Unable to create type without name");
   // Unspecified types are encoded in DIBasicType format. Line number, filename,
   // size, alignment, offset and flags are always empty here.
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_type),
-    nullptr, // Filename
-    nullptr, // Unused
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags;
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0)  // Encoding
+      HeaderBuilder::get(dwarf::DW_TAG_unspecified_type)
+          .concat(Name)
+          .concat(0)
+          .concat(0)
+          .concat(0)
+          .concat(0)
+          .concat(0)
+          .concat(0)
+          .get(VMContext),
+      nullptr, // Filename
+      nullptr  // Unused
   };
   return DIBasicType(MDNode::get(VMContext, Elts));
 }
 
-/// \brief Create C++11 nullptr type.
 DIBasicType DIBuilder::createNullPtrType() {
   return createUnspecifiedType("decltype(nullptr)");
 }
 
-/// createBasicType - Create debugging information entry for a basic
-/// type, e.g 'char'.
 DIBasicType
 DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
                            uint64_t AlignInBits, unsigned Encoding) {
@@ -266,160 +252,139 @@
   // Basic types are encoded in DIBasicType format. Line number, filename,
   // offset and flags are always empty here.
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_base_type),
-    nullptr, // File/directory name
-    nullptr, // Unused
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags;
-    ConstantInt::get(Type::getInt32Ty(VMContext), Encoding)
+      HeaderBuilder::get(dwarf::DW_TAG_base_type)
+          .concat(Name)
+          .concat(0) // Line
+          .concat(SizeInBits)
+          .concat(AlignInBits)
+          .concat(0) // Offset
+          .concat(0) // Flags
+          .concat(Encoding)
+          .get(VMContext),
+      nullptr, // Filename
+      nullptr  // Unused
   };
   return DIBasicType(MDNode::get(VMContext, Elts));
 }
 
-/// createQualifiedType - Create debugging information entry for a qualified
-/// type, e.g. 'const int'.
 DIDerivedType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) {
   // Qualified types are encoded in DIDerivedType format.
-  Value *Elts[] = {
-    GetTagConstant(VMContext, Tag),
-    nullptr, // Filename
-    nullptr, // Unused
-    MDString::get(VMContext, StringRef()), // Empty name.
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    FromTy.getRef()
-  };
+  Value *Elts[] = {HeaderBuilder::get(Tag)
+                       .concat(StringRef()) // Name
+                       .concat(0)           // Line
+                       .concat(0)           // Size
+                       .concat(0)           // Align
+                       .concat(0)           // Offset
+                       .concat(0)           // Flags
+                       .get(VMContext),
+                   nullptr, // Filename
+                   nullptr, // Unused
+                   FromTy.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-/// createPointerType - Create debugging information entry for a pointer.
 DIDerivedType
 DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
                              uint64_t AlignInBits, StringRef Name) {
   // Pointer types are encoded in DIDerivedType format.
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_pointer_type),
-    nullptr, // Filename
-    nullptr, // Unused
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    PointeeTy.getRef()
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_pointer_type)
+                       .concat(Name)
+                       .concat(0) // Line
+                       .concat(SizeInBits)
+                       .concat(AlignInBits)
+                       .concat(0) // Offset
+                       .concat(0) // Flags
+                       .get(VMContext),
+                   nullptr, // Filename
+                   nullptr, // Unused
+                   PointeeTy.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 DIDerivedType DIBuilder::createMemberPointerType(DIType PointeeTy,
                                                  DIType Base) {
   // Pointer types are encoded in DIDerivedType format.
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_ptr_to_member_type),
-    nullptr, // Filename
-    nullptr, // Unused
-    nullptr,
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    PointeeTy.getRef(),
-    Base.getRef()
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_ptr_to_member_type)
+                       .concat(StringRef())
+                       .concat(0) // Line
+                       .concat(0) // Size
+                       .concat(0) // Align
+                       .concat(0) // Offset
+                       .concat(0) // Flags
+                       .get(VMContext),
+                   nullptr, // Filename
+                   nullptr, // Unused
+                   PointeeTy.getRef(), Base.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-/// createReferenceType - Create debugging information entry for a reference
-/// type.
 DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) {
   assert(RTy.isType() && "Unable to create reference type");
   // References are encoded in DIDerivedType format.
-  Value *Elts[] = {
-    GetTagConstant(VMContext, Tag),
-    nullptr, // Filename
-    nullptr, // TheCU,
-    nullptr, // Name
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    RTy.getRef()
-  };
+  Value *Elts[] = {HeaderBuilder::get(Tag)
+                       .concat(StringRef()) // Name
+                       .concat(0)           // Line
+                       .concat(0)           // Size
+                       .concat(0)           // Align
+                       .concat(0)           // Offset
+                       .concat(0)           // Flags
+                       .get(VMContext),
+                   nullptr, // Filename
+                   nullptr, // TheCU,
+                   RTy.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-/// createTypedef - Create debugging information entry for a typedef.
 DIDerivedType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
                                        unsigned LineNo, DIDescriptor Context) {
   // typedefs are encoded in DIDerivedType format.
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_typedef),
-    File.getFileNode(),
-    DIScope(getNonCompileUnitScope(Context)).getRef(),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    Ty.getRef()
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_typedef)
+                       .concat(Name)
+                       .concat(LineNo)
+                       .concat(0) // Size
+                       .concat(0) // Align
+                       .concat(0) // Offset
+                       .concat(0) // Flags
+                       .get(VMContext),
+                   File.getFileNode(),
+                   DIScope(getNonCompileUnitScope(Context)).getRef(),
+                   Ty.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-/// createFriend - Create debugging information entry for a 'friend'.
 DIDerivedType DIBuilder::createFriend(DIType Ty, DIType FriendTy) {
   // typedefs are encoded in DIDerivedType format.
   assert(Ty.isType() && "Invalid type!");
   assert(FriendTy.isType() && "Invalid friend type!");
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_friend),
-    nullptr,
-    Ty.getRef(),
-    nullptr, // Name
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    FriendTy.getRef()
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_friend)
+                       .concat(StringRef()) // Name
+                       .concat(0)           // Line
+                       .concat(0)           // Size
+                       .concat(0)           // Align
+                       .concat(0)           // Offset
+                       .concat(0)           // Flags
+                       .get(VMContext),
+                   nullptr, Ty.getRef(), FriendTy.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-/// createInheritance - Create debugging information entry to establish
-/// inheritance relationship between two types.
 DIDerivedType DIBuilder::createInheritance(DIType Ty, DIType BaseTy,
                                            uint64_t BaseOffset,
                                            unsigned Flags) {
   assert(Ty.isType() && "Unable to create inheritance");
   // TAG_inheritance is encoded in DIDerivedType format.
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_inheritance),
-    nullptr,
-    Ty.getRef(),
-    nullptr, // Name
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
-    ConstantInt::get(Type::getInt64Ty(VMContext), BaseOffset),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    BaseTy.getRef()
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_inheritance)
+                       .concat(StringRef()) // Name
+                       .concat(0)           // Line
+                       .concat(0)           // Size
+                       .concat(0)           // Align
+                       .concat(BaseOffset)
+                       .concat(Flags)
+                       .get(VMContext),
+                   nullptr, Ty.getRef(), BaseTy.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-/// createMemberType - Create debugging information entry for a member.
 DIDerivedType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name,
                                           DIFile File, unsigned LineNumber,
                                           uint64_t SizeInBits,
@@ -427,76 +392,41 @@
                                           uint64_t OffsetInBits, unsigned Flags,
                                           DIType Ty) {
   // TAG_member is encoded in DIDerivedType format.
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_member),
-    File.getFileNode(),
-    DIScope(getNonCompileUnitScope(Scope)).getRef(),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    Ty.getRef()
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_member)
+                       .concat(Name)
+                       .concat(LineNumber)
+                       .concat(SizeInBits)
+                       .concat(AlignInBits)
+                       .concat(OffsetInBits)
+                       .concat(Flags)
+                       .get(VMContext),
+                   File.getFileNode(),
+                   DIScope(getNonCompileUnitScope(Scope)).getRef(),
+                   Ty.getRef()};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-/// createStaticMemberType - Create debugging information entry for a
-/// C++ static data member.
-DIDerivedType
-DIBuilder::createStaticMemberType(DIDescriptor Scope, StringRef Name,
-                                  DIFile File, unsigned LineNumber,
-                                  DIType Ty, unsigned Flags,
-                                  llvm::Value *Val) {
+DIDerivedType DIBuilder::createStaticMemberType(DIDescriptor Scope,
+                                                StringRef Name, DIFile File,
+                                                unsigned LineNumber, DIType Ty,
+                                                unsigned Flags,
+                                                llvm::Constant *Val) {
   // TAG_member is encoded in DIDerivedType format.
   Flags |= DIDescriptor::FlagStaticMember;
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_member),
-    File.getFileNode(),
-    DIScope(getNonCompileUnitScope(Scope)).getRef(),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    Ty.getRef(),
-    Val
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_member)
+                       .concat(Name)
+                       .concat(LineNumber)
+                       .concat(0) // Size
+                       .concat(0) // Align
+                       .concat(0) // Offset
+                       .concat(Flags)
+                       .get(VMContext),
+                   File.getFileNode(),
+                   DIScope(getNonCompileUnitScope(Scope)).getRef(), Ty.getRef(),
+                   Val};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-/// createObjCIVar - Create debugging information entry for Objective-C
-/// instance variable.
-DIDerivedType
-DIBuilder::createObjCIVar(StringRef Name, DIFile File, unsigned LineNumber,
-                          uint64_t SizeInBits, uint64_t AlignInBits,
-                          uint64_t OffsetInBits, unsigned Flags, DIType Ty,
-                          StringRef PropertyName, StringRef GetterName,
-                          StringRef SetterName, unsigned PropertyAttributes) {
-  // TAG_member is encoded in DIDerivedType format.
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_member),
-    File.getFileNode(),
-    getNonCompileUnitScope(File),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    Ty,
-    MDString::get(VMContext, PropertyName),
-    MDString::get(VMContext, GetterName),
-    MDString::get(VMContext, SetterName),
-    ConstantInt::get(Type::getInt32Ty(VMContext), PropertyAttributes)
-  };
-  return DIDerivedType(MDNode::get(VMContext, Elts));
-}
-
-/// createObjCIVar - Create debugging information entry for Objective-C
-/// instance variable.
 DIDerivedType DIBuilder::createObjCIVar(StringRef Name, DIFile File,
                                         unsigned LineNumber,
                                         uint64_t SizeInBits,
@@ -504,88 +434,66 @@
                                         uint64_t OffsetInBits, unsigned Flags,
                                         DIType Ty, MDNode *PropertyNode) {
   // TAG_member is encoded in DIDerivedType format.
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_member),
-    File.getFileNode(),
-    getNonCompileUnitScope(File),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    Ty,
-    PropertyNode
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_member)
+                       .concat(Name)
+                       .concat(LineNumber)
+                       .concat(SizeInBits)
+                       .concat(AlignInBits)
+                       .concat(OffsetInBits)
+                       .concat(Flags)
+                       .get(VMContext),
+                   File.getFileNode(), getNonCompileUnitScope(File), Ty,
+                   PropertyNode};
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
-/// createObjCProperty - Create debugging information entry for Objective-C
-/// property.
 DIObjCProperty
 DIBuilder::createObjCProperty(StringRef Name, DIFile File, unsigned LineNumber,
                               StringRef GetterName, StringRef SetterName,
                               unsigned PropertyAttributes, DIType Ty) {
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_property),
-    MDString::get(VMContext, Name),
-    File,
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    MDString::get(VMContext, GetterName),
-    MDString::get(VMContext, SetterName),
-    ConstantInt::get(Type::getInt32Ty(VMContext), PropertyAttributes),
-    Ty
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_APPLE_property)
+                       .concat(Name)
+                       .concat(LineNumber)
+                       .concat(GetterName)
+                       .concat(SetterName)
+                       .concat(PropertyAttributes)
+                       .get(VMContext),
+                   File, Ty};
   return DIObjCProperty(MDNode::get(VMContext, Elts));
 }
 
-/// createTemplateTypeParameter - Create debugging information for template
-/// type parameter.
 DITemplateTypeParameter
 DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
                                        DIType Ty, MDNode *File, unsigned LineNo,
                                        unsigned ColumnNo) {
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_template_type_parameter),
-    DIScope(getNonCompileUnitScope(Context)).getRef(),
-    MDString::get(VMContext, Name),
-    Ty.getRef(),
-    File,
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
-    ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_template_type_parameter)
+                       .concat(Name)
+                       .concat(LineNo)
+                       .concat(ColumnNo)
+                       .get(VMContext),
+                   DIScope(getNonCompileUnitScope(Context)).getRef(),
+                   Ty.getRef(), File};
   return DITemplateTypeParameter(MDNode::get(VMContext, Elts));
 }
 
-DITemplateValueParameter
-DIBuilder::createTemplateValueParameter(unsigned Tag, DIDescriptor Context,
-                                        StringRef Name, DIType Ty,
-                                        Value *Val, MDNode *File,
-                                        unsigned LineNo,
-                                        unsigned ColumnNo) {
+static DITemplateValueParameter createTemplateValueParameterHelper(
+    LLVMContext &VMContext, unsigned Tag, DIDescriptor Context, StringRef Name,
+    DIType Ty, Value *Val, MDNode *File, unsigned LineNo, unsigned ColumnNo) {
   Value *Elts[] = {
-    GetTagConstant(VMContext, Tag),
-    DIScope(getNonCompileUnitScope(Context)).getRef(),
-    MDString::get(VMContext, Name),
-    Ty.getRef(),
-    Val,
-    File,
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
-    ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
-  };
+      HeaderBuilder::get(Tag).concat(Name).concat(LineNo).concat(ColumnNo).get(
+          VMContext),
+      DIScope(getNonCompileUnitScope(Context)).getRef(), Ty.getRef(), Val,
+      File};
   return DITemplateValueParameter(MDNode::get(VMContext, Elts));
 }
 
-/// createTemplateValueParameter - Create debugging information for template
-/// value parameter.
 DITemplateValueParameter
 DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
-                                        DIType Ty, Value *Val,
-                                        MDNode *File, unsigned LineNo,
-                                        unsigned ColumnNo) {
-  return createTemplateValueParameter(dwarf::DW_TAG_template_value_parameter,
-                                      Context, Name, Ty, Val, File, LineNo,
-                                      ColumnNo);
+                                        DIType Ty, Constant *Val, MDNode *File,
+                                        unsigned LineNo, unsigned ColumnNo) {
+  return createTemplateValueParameterHelper(
+      VMContext, dwarf::DW_TAG_template_value_parameter, Context, Name, Ty, Val,
+      File, LineNo, ColumnNo);
 }
 
 DITemplateValueParameter
@@ -593,8 +501,8 @@
                                            DIType Ty, StringRef Val,
                                            MDNode *File, unsigned LineNo,
                                            unsigned ColumnNo) {
-  return createTemplateValueParameter(
-      dwarf::DW_TAG_GNU_template_template_param, Context, Name, Ty,
+  return createTemplateValueParameterHelper(
+      VMContext, dwarf::DW_TAG_GNU_template_template_param, Context, Name, Ty,
       MDString::get(VMContext, Val), File, LineNo, ColumnNo);
 }
 
@@ -603,12 +511,11 @@
                                        DIType Ty, DIArray Val,
                                        MDNode *File, unsigned LineNo,
                                        unsigned ColumnNo) {
-  return createTemplateValueParameter(dwarf::DW_TAG_GNU_template_parameter_pack,
-                                      Context, Name, Ty, Val, File, LineNo,
-                                      ColumnNo);
+  return createTemplateValueParameterHelper(
+      VMContext, dwarf::DW_TAG_GNU_template_parameter_pack, Context, Name, Ty,
+      Val, File, LineNo, ColumnNo);
 }
 
-/// createClassType - Create debugging information entry for a class.
 DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
                                            DIFile File, unsigned LineNumber,
                                            uint64_t SizeInBits,
@@ -623,23 +530,19 @@
          "createClassType should be called with a valid Context");
   // TAG_class_type is encoded in DICompositeType format.
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_class_type),
-    File.getFileNode(),
-    DIScope(getNonCompileUnitScope(Context)).getRef(),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    DerivedFrom.getRef(),
-    Elements,
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    VTableHolder.getRef(),
-    TemplateParams,
-    UniqueIdentifier.empty() ? nullptr
-                             : MDString::get(VMContext, UniqueIdentifier)
-  };
+      HeaderBuilder::get(dwarf::DW_TAG_class_type)
+          .concat(Name)
+          .concat(LineNumber)
+          .concat(SizeInBits)
+          .concat(AlignInBits)
+          .concat(OffsetInBits)
+          .concat(Flags)
+          .concat(0)
+          .get(VMContext),
+      File.getFileNode(), DIScope(getNonCompileUnitScope(Context)).getRef(),
+      DerivedFrom.getRef(), Elements, VTableHolder.getRef(), TemplateParams,
+      UniqueIdentifier.empty() ? nullptr
+                               : MDString::get(VMContext, UniqueIdentifier)};
   DICompositeType R(MDNode::get(VMContext, Elts));
   assert(R.isCompositeType() &&
          "createClassType should return a DICompositeType");
@@ -648,7 +551,6 @@
   return R;
 }
 
-/// createStructType - Create debugging information entry for a struct.
 DICompositeType DIBuilder::createStructType(DIDescriptor Context,
                                             StringRef Name, DIFile File,
                                             unsigned LineNumber,
@@ -661,23 +563,19 @@
                                             StringRef UniqueIdentifier) {
  // TAG_structure_type is encoded in DICompositeType format.
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_structure_type),
-    File.getFileNode(),
-    DIScope(getNonCompileUnitScope(Context)).getRef(),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    DerivedFrom.getRef(),
-    Elements,
-    ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
-    VTableHolder.getRef(),
-    nullptr,
-    UniqueIdentifier.empty() ? nullptr
-                             : MDString::get(VMContext, UniqueIdentifier)
-  };
+      HeaderBuilder::get(dwarf::DW_TAG_structure_type)
+          .concat(Name)
+          .concat(LineNumber)
+          .concat(SizeInBits)
+          .concat(AlignInBits)
+          .concat(0)
+          .concat(Flags)
+          .concat(RunTimeLang)
+          .get(VMContext),
+      File.getFileNode(), DIScope(getNonCompileUnitScope(Context)).getRef(),
+      DerivedFrom.getRef(), Elements, VTableHolder.getRef(), nullptr,
+      UniqueIdentifier.empty() ? nullptr
+                               : MDString::get(VMContext, UniqueIdentifier)};
   DICompositeType R(MDNode::get(VMContext, Elts));
   assert(R.isCompositeType() &&
          "createStructType should return a DICompositeType");
@@ -686,7 +584,6 @@
   return R;
 }
 
-/// createUnionType - Create debugging information entry for an union.
 DICompositeType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
                                            DIFile File, unsigned LineNumber,
                                            uint64_t SizeInBits,
@@ -696,79 +593,64 @@
                                            StringRef UniqueIdentifier) {
   // TAG_union_type is encoded in DICompositeType format.
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_union_type),
-    File.getFileNode(),
-    DIScope(getNonCompileUnitScope(Scope)).getRef(),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    nullptr,
-    Elements,
-    ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
-    nullptr,
-    nullptr,
-    UniqueIdentifier.empty() ? nullptr
-                             : MDString::get(VMContext, UniqueIdentifier)
-  };
+      HeaderBuilder::get(dwarf::DW_TAG_union_type)
+          .concat(Name)
+          .concat(LineNumber)
+          .concat(SizeInBits)
+          .concat(AlignInBits)
+          .concat(0) // Offset
+          .concat(Flags)
+          .concat(RunTimeLang)
+          .get(VMContext),
+      File.getFileNode(), DIScope(getNonCompileUnitScope(Scope)).getRef(),
+      nullptr, Elements, nullptr, nullptr,
+      UniqueIdentifier.empty() ? nullptr
+                               : MDString::get(VMContext, UniqueIdentifier)};
   DICompositeType R(MDNode::get(VMContext, Elts));
   if (!UniqueIdentifier.empty())
     retainType(R);
   return R;
 }
 
-/// createSubroutineType - Create subroutine type.
-DICompositeType DIBuilder::createSubroutineType(DIFile File,
-                                                DIArray ParameterTypes,
-                                                unsigned Flags) {
+DISubroutineType DIBuilder::createSubroutineType(DIFile File,
+                                                 DITypeArray ParameterTypes,
+                                                 unsigned Flags) {
   // TAG_subroutine_type is encoded in DICompositeType format.
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type),
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    nullptr,
-    MDString::get(VMContext, ""),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags), // Flags
-    nullptr,
-    ParameterTypes,
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    nullptr,
-    nullptr,
-    nullptr  // Type Identifer
+      HeaderBuilder::get(dwarf::DW_TAG_subroutine_type)
+          .concat(StringRef())
+          .concat(0)     // Line
+          .concat(0)     // Size
+          .concat(0)     // Align
+          .concat(0)     // Offset
+          .concat(Flags) // Flags
+          .concat(0)
+          .get(VMContext),
+      nullptr, nullptr, nullptr, ParameterTypes, nullptr, nullptr,
+      nullptr // Type Identifier
   };
-  return DICompositeType(MDNode::get(VMContext, Elts));
+  return DISubroutineType(MDNode::get(VMContext, Elts));
 }
 
-/// createEnumerationType - Create debugging information entry for an
-/// enumeration.
 DICompositeType DIBuilder::createEnumerationType(
     DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
     uint64_t SizeInBits, uint64_t AlignInBits, DIArray Elements,
     DIType UnderlyingType, StringRef UniqueIdentifier) {
   // TAG_enumeration_type is encoded in DICompositeType format.
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type),
-    File.getFileNode(),
-    DIScope(getNonCompileUnitScope(Scope)).getRef(),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    UnderlyingType.getRef(),
-    Elements,
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    nullptr,
-    nullptr,
-    UniqueIdentifier.empty() ? nullptr
-                             : MDString::get(VMContext, UniqueIdentifier)
-  };
+      HeaderBuilder::get(dwarf::DW_TAG_enumeration_type)
+          .concat(Name)
+          .concat(LineNumber)
+          .concat(SizeInBits)
+          .concat(AlignInBits)
+          .concat(0) // Offset
+          .concat(0) // Flags
+          .concat(0)
+          .get(VMContext),
+      File.getFileNode(), DIScope(getNonCompileUnitScope(Scope)).getRef(),
+      UnderlyingType.getRef(), Elements, nullptr, nullptr,
+      UniqueIdentifier.empty() ? nullptr
+                               : MDString::get(VMContext, UniqueIdentifier)};
   DICompositeType CTy(MDNode::get(VMContext, Elts));
   AllEnumTypes.push_back(CTy);
   if (!UniqueIdentifier.empty())
@@ -776,114 +658,96 @@
   return CTy;
 }
 
-/// createArrayType - Create debugging information entry for an array.
 DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
                                            DIType Ty, DIArray Subscripts) {
   // TAG_array_type is encoded in DICompositeType format.
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_array_type),
-    nullptr, // Filename/Directory,
-    nullptr, // Unused
-    MDString::get(VMContext, ""),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), Size),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    Ty.getRef(),
-    Subscripts,
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    nullptr,
-    nullptr,
-    nullptr  // Type Identifer
+      HeaderBuilder::get(dwarf::DW_TAG_array_type)
+          .concat(StringRef())
+          .concat(0) // Line
+          .concat(Size)
+          .concat(AlignInBits)
+          .concat(0) // Offset
+          .concat(0) // Flags
+          .concat(0)
+          .get(VMContext),
+      nullptr, // Filename/Directory,
+      nullptr, // Unused
+      Ty.getRef(), Subscripts, nullptr, nullptr,
+      nullptr // Type Identifier
   };
   return DICompositeType(MDNode::get(VMContext, Elts));
 }
 
-/// createVectorType - Create debugging information entry for a vector.
 DICompositeType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
                                             DIType Ty, DIArray Subscripts) {
   // A vector is an array type with the FlagVector flag applied.
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_array_type),
-    nullptr, // Filename/Directory,
-    nullptr, // Unused
-    MDString::get(VMContext, ""),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), Size),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), DIType::FlagVector),
-    Ty.getRef(),
-    Subscripts,
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    nullptr,
-    nullptr,
-    nullptr  // Type Identifer
+      HeaderBuilder::get(dwarf::DW_TAG_array_type)
+          .concat("")
+          .concat(0) // Line
+          .concat(Size)
+          .concat(AlignInBits)
+          .concat(0) // Offset
+          .concat(DIType::FlagVector)
+          .concat(0)
+          .get(VMContext),
+      nullptr, // Filename/Directory,
+      nullptr, // Unused
+      Ty.getRef(), Subscripts, nullptr, nullptr,
+      nullptr // Type Identifier
   };
   return DICompositeType(MDNode::get(VMContext, Elts));
 }
 
-/// createArtificialType - Create a new DIType with "artificial" flag set.
+static HeaderBuilder setTypeFlagsInHeader(StringRef Header,
+                                          unsigned FlagsToSet) {
+  DIHeaderFieldIterator I(Header);
+  std::advance(I, 6);
+
+  unsigned Flags;
+  if (I->getAsInteger(0, Flags))
+    Flags = 0;
+  Flags |= FlagsToSet;
+
+  return HeaderBuilder(Twine(I.getPrefix())).concat(Flags).concat(
+      I.getSuffix());
+}
+
+static DIType createTypeWithFlags(LLVMContext &Context, DIType Ty,
+                                  unsigned FlagsToSet) {
+  SmallVector<Value *, 9> Elts;
+  MDNode *N = Ty;
+  assert(N && "Unexpected input DIType!");
+  // Update header field.
+  Elts.push_back(setTypeFlagsInHeader(Ty.getHeader(), FlagsToSet).get(Context));
+  for (unsigned I = 1, E = N->getNumOperands(); I != E; ++I)
+    Elts.push_back(N->getOperand(I));
+
+  return DIType(MDNode::get(Context, Elts));
+}
+
 DIType DIBuilder::createArtificialType(DIType Ty) {
   if (Ty.isArtificial())
     return Ty;
-
-  SmallVector<Value *, 9> Elts;
-  MDNode *N = Ty;
-  assert (N && "Unexpected input DIType!");
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-    Elts.push_back(N->getOperand(i));
-
-  unsigned CurFlags = Ty.getFlags();
-  CurFlags = CurFlags | DIType::FlagArtificial;
-
-  // Flags are stored at this slot.
-  // FIXME: Add an enum for this magic value.
-  Elts[8] =  ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
-
-  return DIType(MDNode::get(VMContext, Elts));
+  return createTypeWithFlags(VMContext, Ty, DIType::FlagArtificial);
 }
 
-/// createObjectPointerType - Create a new type with both the object pointer
-/// and artificial flags set.
 DIType DIBuilder::createObjectPointerType(DIType Ty) {
   if (Ty.isObjectPointer())
     return Ty;
-
-  SmallVector<Value *, 9> Elts;
-  MDNode *N = Ty;
-  assert (N && "Unexpected input DIType!");
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
-    Elts.push_back(N->getOperand(i));
-
-  unsigned CurFlags = Ty.getFlags();
-  CurFlags = CurFlags | (DIType::FlagObjectPointer | DIType::FlagArtificial);
-
-  // Flags are stored at this slot.
-  // FIXME: Add an enum for this magic value.
-  Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
-
-  return DIType(MDNode::get(VMContext, Elts));
+  unsigned Flags = DIType::FlagObjectPointer | DIType::FlagArtificial;
+  return createTypeWithFlags(VMContext, Ty, Flags);
 }
 
-/// retainType - Retain DIType in a module even if it is not referenced
-/// through debug info anchors.
 void DIBuilder::retainType(DIType T) {
   AllRetainTypes.push_back(TrackingVH<MDNode>(T));
 }
 
-/// createUnspecifiedParameter - Create unspeicified type descriptor
-/// for the subroutine type.
-DIDescriptor DIBuilder::createUnspecifiedParameter() {
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_parameters)
-  };
-  return DIDescriptor(MDNode::get(VMContext, Elts));
+DIBasicType DIBuilder::createUnspecifiedParameter() {
+  return DIBasicType();
 }
 
-/// createForwardDecl - Create a temporary forward-declared type that
-/// can be RAUW'd if the full type is seen.
 DICompositeType
 DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope,
                              DIFile F, unsigned Line, unsigned RuntimeLang,
@@ -891,23 +755,20 @@
                              StringRef UniqueIdentifier) {
   // Create a temporary MDNode.
   Value *Elts[] = {
-    GetTagConstant(VMContext, Tag),
-    F.getFileNode(),
-    DIScope(getNonCompileUnitScope(Scope)).getRef(),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Line),
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), DIDescriptor::FlagFwdDecl),
-    nullptr,
-    DIArray(),
-    ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang),
-    nullptr,
-    nullptr, //TemplateParams
-    UniqueIdentifier.empty() ? nullptr
-                             : MDString::get(VMContext, UniqueIdentifier)
-  };
+      HeaderBuilder::get(Tag)
+          .concat(Name)
+          .concat(Line)
+          .concat(SizeInBits)
+          .concat(AlignInBits)
+          .concat(0) // Offset
+          .concat(DIDescriptor::FlagFwdDecl)
+          .concat(RuntimeLang)
+          .get(VMContext),
+      F.getFileNode(), DIScope(getNonCompileUnitScope(Scope)).getRef(), nullptr,
+      DIArray(), nullptr,
+      nullptr, // TemplateParams
+      UniqueIdentifier.empty() ? nullptr
+                               : MDString::get(VMContext, UniqueIdentifier)};
   MDNode *Node = MDNode::get(VMContext, Elts);
   DICompositeType RetTy(Node);
   assert(RetTy.isCompositeType() &&
@@ -917,123 +778,102 @@
   return RetTy;
 }
 
-/// createForwardDecl - Create a temporary forward-declared type that
-/// can be RAUW'd if the full type is seen.
 DICompositeType DIBuilder::createReplaceableForwardDecl(
     unsigned Tag, StringRef Name, DIDescriptor Scope, DIFile F, unsigned Line,
     unsigned RuntimeLang, uint64_t SizeInBits, uint64_t AlignInBits,
     StringRef UniqueIdentifier) {
   // Create a temporary MDNode.
   Value *Elts[] = {
-    GetTagConstant(VMContext, Tag),
-    F.getFileNode(),
-    DIScope(getNonCompileUnitScope(Scope)).getRef(),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Line),
-    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
-    ConstantInt::get(Type::getInt32Ty(VMContext), DIDescriptor::FlagFwdDecl),
-    nullptr,
-    DIArray(),
-    ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang),
-    nullptr,
-    nullptr, //TemplateParams
-    UniqueIdentifier.empty() ? nullptr
-                             : MDString::get(VMContext, UniqueIdentifier)
-  };
+      HeaderBuilder::get(Tag)
+          .concat(Name)
+          .concat(Line)
+          .concat(SizeInBits)
+          .concat(AlignInBits)
+          .concat(0) // Offset
+          .concat(DIDescriptor::FlagFwdDecl)
+          .concat(RuntimeLang)
+          .get(VMContext),
+      F.getFileNode(), DIScope(getNonCompileUnitScope(Scope)).getRef(), nullptr,
+      DIArray(), nullptr,
+      nullptr, // TemplateParams
+      UniqueIdentifier.empty() ? nullptr
+                               : MDString::get(VMContext, UniqueIdentifier)};
   MDNode *Node = MDNode::getTemporary(VMContext, Elts);
   DICompositeType RetTy(Node);
   assert(RetTy.isCompositeType() &&
-         "createForwardDecl result should be a DIType");
+         "createReplaceableForwardDecl result should be a DIType");
   if (!UniqueIdentifier.empty())
     retainType(RetTy);
   return RetTy;
 }
 
-/// getOrCreateArray - Get a DIArray, create one if required.
 DIArray DIBuilder::getOrCreateArray(ArrayRef<Value *> Elements) {
   return DIArray(MDNode::get(VMContext, Elements));
 }
 
-/// getOrCreateSubrange - Create a descriptor for a value range.  This
-/// implicitly uniques the values returned.
+DITypeArray DIBuilder::getOrCreateTypeArray(ArrayRef<Value *> Elements) {
+  SmallVector<llvm::Value *, 16> Elts; 
+  for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
+    if (Elements[i] && isa<MDNode>(Elements[i]))
+      Elts.push_back(DIType(cast<MDNode>(Elements[i])).getRef());
+    else
+      Elts.push_back(Elements[i]);
+  }
+  return DITypeArray(MDNode::get(VMContext, Elts));
+}
+
 DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Count) {
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_subrange_type),
-    ConstantInt::get(Type::getInt64Ty(VMContext), Lo),
-    ConstantInt::get(Type::getInt64Ty(VMContext), Count)
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_subrange_type)
+                       .concat(Lo)
+                       .concat(Count)
+                       .get(VMContext)};
 
   return DISubrange(MDNode::get(VMContext, Elts));
 }
 
-/// \brief Create a new descriptor for the specified global.
-DIGlobalVariable DIBuilder::createGlobalVariable(StringRef Name,
-                                                 StringRef LinkageName,
-                                                 DIFile F, unsigned LineNumber,
-                                                 DITypeRef Ty, bool isLocalToUnit,
-                                                 Value *Val) {
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_variable),
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    nullptr, // TheCU,
-    MDString::get(VMContext, Name),
-    MDString::get(VMContext, Name),
-    MDString::get(VMContext, LinkageName),
-    F,
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    Ty,
-    ConstantInt::get(Type::getInt32Ty(VMContext), isLocalToUnit),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/
-    Val,
-    DIDescriptor()
-  };
-  MDNode *Node = MDNode::get(VMContext, Elts);
-  AllGVs.push_back(Node);
-  return DIGlobalVariable(Node);
+static DIGlobalVariable createGlobalVariableHelper(
+    LLVMContext &VMContext, DIDescriptor Context, StringRef Name,
+    StringRef LinkageName, DIFile F, unsigned LineNumber, DITypeRef Ty,
+    bool isLocalToUnit, Constant *Val, MDNode *Decl, bool isDefinition,
+    std::function<MDNode *(ArrayRef<Value *>)> CreateFunc) {
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_variable)
+                       .concat(Name)
+                       .concat(Name)
+                       .concat(LinkageName)
+                       .concat(LineNumber)
+                       .concat(isLocalToUnit)
+                       .concat(isDefinition)
+                       .get(VMContext),
+                   DIScope(getNonCompileUnitScope(Context)).getRef(), F, Ty, Val,
+                   DIDescriptor(Decl)};
+
+  return DIGlobalVariable(CreateFunc(Elts));
 }
 
-/// \brief Create a new descriptor for the specified global.
-DIGlobalVariable DIBuilder::createGlobalVariable(StringRef Name, DIFile F,
-                                                 unsigned LineNumber,
-                                                 DITypeRef Ty,
-                                                 bool isLocalToUnit,
-                                                 Value *Val) {
-  return createGlobalVariable(Name, Name, F, LineNumber, Ty, isLocalToUnit,
-                              Val);
+DIGlobalVariable DIBuilder::createGlobalVariable(
+    DIDescriptor Context, StringRef Name, StringRef LinkageName, DIFile F,
+    unsigned LineNumber, DITypeRef Ty, bool isLocalToUnit, Constant *Val,
+    MDNode *Decl) {
+  return createGlobalVariableHelper(VMContext, Context, Name, LinkageName, F,
+                                    LineNumber, Ty, isLocalToUnit, Val, Decl, true,
+                                    [&] (ArrayRef<Value *> Elts) -> MDNode * {
+                                      MDNode *Node = MDNode::get(VMContext, Elts);
+                                      AllGVs.push_back(Node);
+                                      return Node;
+                                    });
 }
 
-/// createStaticVariable - Create a new descriptor for the specified static
-/// variable.
-DIGlobalVariable DIBuilder::createStaticVariable(DIDescriptor Context,
-                                                 StringRef Name,
-                                                 StringRef LinkageName,
-                                                 DIFile F, unsigned LineNumber,
-                                                 DITypeRef Ty,
-                                                 bool isLocalToUnit,
-                                                 Value *Val, MDNode *Decl) {
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_variable),
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    getNonCompileUnitScope(Context),
-    MDString::get(VMContext, Name),
-    MDString::get(VMContext, Name),
-    MDString::get(VMContext, LinkageName),
-    F,
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    Ty,
-    ConstantInt::get(Type::getInt32Ty(VMContext), isLocalToUnit),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/
-    Val,
-    DIDescriptor(Decl)
-  };
-  MDNode *Node = MDNode::get(VMContext, Elts);
-  AllGVs.push_back(Node);
-  return DIGlobalVariable(Node);
+DIGlobalVariable DIBuilder::createTempGlobalVariableFwdDecl(
+    DIDescriptor Context, StringRef Name, StringRef LinkageName, DIFile F,
+    unsigned LineNumber, DITypeRef Ty, bool isLocalToUnit, Constant *Val,
+    MDNode *Decl) {
+  return createGlobalVariableHelper(VMContext, Context, Name, LinkageName, F,
+                                    LineNumber, Ty, isLocalToUnit, Val, Decl, false,
+                                    [&] (ArrayRef<Value *> Elts) {
+                                      return MDNode::getTemporary(VMContext, Elts);
+                                    });
 }
 
-/// createVariable - Create a new descriptor for the specified variable.
 DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
                                           StringRef Name, DIFile File,
                                           unsigned LineNo, DITypeRef Ty,
@@ -1042,24 +882,20 @@
   DIDescriptor Context(getNonCompileUnitScope(Scope));
   assert((!Context || Context.isScope()) &&
          "createLocalVariable should be called with a valid Context");
-  Value *Elts[] = {
-    GetTagConstant(VMContext, Tag),
-    getNonCompileUnitScope(Scope),
-    MDString::get(VMContext, Name),
-    File,
-    ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24))),
-    Ty,
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    Constant::getNullValue(Type::getInt32Ty(VMContext))
-  };
+  Value *Elts[] = {HeaderBuilder::get(Tag)
+                       .concat(Name)
+                       .concat(LineNo | (ArgNo << 24))
+                       .concat(Flags)
+                       .get(VMContext),
+                   getNonCompileUnitScope(Scope), File, Ty};
   MDNode *Node = MDNode::get(VMContext, Elts);
   if (AlwaysPreserve) {
     // The optimizer may remove local variable. If there is an interest
     // to preserve variable info in such situation then stash it in a
     // named mdnode.
     DISubprogram Fn(getDISubprogram(Scope));
-    NamedMDNode *FnLocals = getOrInsertFnSpecificMDNode(M, Fn);
-    FnLocals->addOperand(Node);
+    assert(Fn && "Missing subprogram for local variable");
+    PreservedVariables[Fn].push_back(Node);
   }
   DIVariable RetVar(Node);
   assert(RetVar.isVariable() &&
@@ -1067,33 +903,20 @@
   return RetVar;
 }
 
-/// createComplexVariable - Create a new descriptor for the specified variable
-/// which has a complex address expression for its address.
-DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
-                                            StringRef Name, DIFile F,
-                                            unsigned LineNo,
-                                            DITypeRef Ty,
-                                            ArrayRef<Value *> Addr,
-                                            unsigned ArgNo) {
-  assert(Addr.size() > 0 && "complex address is empty");
-  Value *Elts[] = {
-    GetTagConstant(VMContext, Tag),
-    getNonCompileUnitScope(Scope),
-    MDString::get(VMContext, Name),
-    F,
-    ConstantInt::get(Type::getInt32Ty(VMContext),
-                     (LineNo | (ArgNo << 24))),
-    Ty,
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    MDNode::get(VMContext, Addr)
-  };
-  return DIVariable(MDNode::get(VMContext, Elts));
+DIExpression DIBuilder::createExpression(ArrayRef<int64_t> Addr) {
+  auto Header = HeaderBuilder::get(DW_TAG_expression);
+  for (int64_t I : Addr)
+    Header.concat(I);
+  Value *Elts[] = {Header.get(VMContext)};
+  return DIExpression(MDNode::get(VMContext, Elts));
 }
 
-/// createFunction - Create a new descriptor for the specified function.
-/// FIXME: this is added for dragonegg. Once we update dragonegg
-/// to call resolve function, this will be removed.
+DIExpression DIBuilder::createPieceExpression(unsigned OffsetInBytes,
+                                              unsigned SizeInBytes) {
+  int64_t Addr[] = {dwarf::DW_OP_piece, OffsetInBytes, SizeInBytes};
+  return createExpression(Addr);
+}
+
 DISubprogram DIBuilder::createFunction(DIScopeRef Context, StringRef Name,
                                        StringRef LinkageName, DIFile File,
                                        unsigned LineNo, DICompositeType Ty,
@@ -1109,7 +932,39 @@
                         Flags, isOptimized, Fn, TParams, Decl);
 }
 
-/// createFunction - Create a new descriptor for the specified function.
+static DISubprogram
+createFunctionHelper(LLVMContext &VMContext, DIDescriptor Context, StringRef Name,
+                     StringRef LinkageName, DIFile File, unsigned LineNo,
+                     DICompositeType Ty, bool isLocalToUnit, bool isDefinition,
+                     unsigned ScopeLine, unsigned Flags, bool isOptimized,
+                     Function *Fn, MDNode *TParams, MDNode *Decl, MDNode *Vars,
+                     std::function<MDNode *(ArrayRef<Value *>)> CreateFunc) {
+  assert(Ty.getTag() == dwarf::DW_TAG_subroutine_type &&
+         "function types should be subroutines");
+  Value *Elts[] = {
+      HeaderBuilder::get(dwarf::DW_TAG_subprogram)
+          .concat(Name)
+          .concat(Name)
+          .concat(LinkageName)
+          .concat(LineNo)
+          .concat(isLocalToUnit)
+          .concat(isDefinition)
+          .concat(0)
+          .concat(0)
+          .concat(Flags)
+          .concat(isOptimized)
+          .concat(ScopeLine)
+          .get(VMContext),
+      File.getFileNode(), DIScope(getNonCompileUnitScope(Context)).getRef(), Ty,
+      nullptr, Fn, TParams, Decl, Vars};
+
+  DISubprogram S(CreateFunc(Elts));
+  assert(S.isSubprogram() &&
+         "createFunction should return a valid DISubprogram");
+  return S;
+}
+
+
 DISubprogram DIBuilder::createFunction(DIDescriptor Context, StringRef Name,
                                        StringRef LinkageName, DIFile File,
                                        unsigned LineNo, DICompositeType Ty,
@@ -1117,43 +972,36 @@
                                        unsigned ScopeLine, unsigned Flags,
                                        bool isOptimized, Function *Fn,
                                        MDNode *TParams, MDNode *Decl) {
-  assert(Ty.getTag() == dwarf::DW_TAG_subroutine_type &&
-         "function types should be subroutines");
-  Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
-    File.getFileNode(),
-    DIScope(getNonCompileUnitScope(Context)).getRef(),
-    MDString::get(VMContext, Name),
-    MDString::get(VMContext, Name),
-    MDString::get(VMContext, LinkageName),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
-    Ty,
-    ConstantInt::get(Type::getInt1Ty(VMContext), isLocalToUnit),
-    ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    nullptr,
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
-    Fn,
-    TParams,
-    Decl,
-    MDNode::getTemporary(VMContext, TElts),
-    ConstantInt::get(Type::getInt32Ty(VMContext), ScopeLine)
-  };
-  MDNode *Node = MDNode::get(VMContext, Elts);
-
-  // Create a named metadata so that we do not lose this mdnode.
-  if (isDefinition)
-    AllSubprograms.push_back(Node);
-  DISubprogram S(Node);
-  assert(S.isSubprogram() &&
-         "createFunction should return a valid DISubprogram");
-  return S;
+  return createFunctionHelper(VMContext, Context, Name, LinkageName, File,
+                              LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine,
+                              Flags, isOptimized, Fn, TParams, Decl,
+                              MDNode::getTemporary(VMContext, None),
+                              [&] (ArrayRef<Value *> Elts) -> MDNode *{
+                                MDNode *Node = MDNode::get(VMContext, Elts);
+                                // Create a named metadata so that we
+                                // do not lose this mdnode.
+                                if (isDefinition)
+                                  AllSubprograms.push_back(Node);
+                                return Node;
+                              });
 }
 
-/// createMethod - Create a new descriptor for the specified C++ method.
+DISubprogram
+DIBuilder::createTempFunctionFwdDecl(DIDescriptor Context, StringRef Name,
+                                     StringRef LinkageName, DIFile File,
+                                     unsigned LineNo, DICompositeType Ty,
+                                     bool isLocalToUnit, bool isDefinition,
+                                     unsigned ScopeLine, unsigned Flags,
+                                     bool isOptimized, Function *Fn,
+                                     MDNode *TParams, MDNode *Decl) {
+  return createFunctionHelper(VMContext, Context, Name, LinkageName, File,
+                              LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine,
+                              Flags, isOptimized, Fn, TParams, Decl, nullptr,
+                              [&] (ArrayRef<Value *> Elts) {
+                                return MDNode::getTemporary(VMContext, Elts);
+                              });
+}
+
 DISubprogram DIBuilder::createMethod(DIDescriptor Context, StringRef Name,
                                      StringRef LinkageName, DIFile F,
                                      unsigned LineNo, DICompositeType Ty,
@@ -1167,29 +1015,22 @@
   assert(getNonCompileUnitScope(Context) &&
          "Methods should have both a Context and a context that isn't "
          "the compile unit.");
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
-    F.getFileNode(),
-    DIScope(Context).getRef(),
-    MDString::get(VMContext, Name),
-    MDString::get(VMContext, Name),
-    MDString::get(VMContext, LinkageName),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
-    Ty,
-    ConstantInt::get(Type::getInt1Ty(VMContext), isLocalToUnit),
-    ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition),
-    ConstantInt::get(Type::getInt32Ty(VMContext), VK),
-    ConstantInt::get(Type::getInt32Ty(VMContext), VIndex),
-    VTableHolder.getRef(),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
-    Fn,
-    TParam,
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    nullptr,
-    // FIXME: Do we want to use different scope/lines?
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo)
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_subprogram)
+                       .concat(Name)
+                       .concat(Name)
+                       .concat(LinkageName)
+                       .concat(LineNo)
+                       .concat(isLocalToUnit)
+                       .concat(isDefinition)
+                       .concat(VK)
+                       .concat(VIndex)
+                       .concat(Flags)
+                       .concat(isOptimized)
+                       .concat(LineNo)
+                       // FIXME: Do we want to use different scope/lines?
+                       .get(VMContext),
+                   F.getFileNode(), DIScope(Context).getRef(), Ty,
+                   VTableHolder.getRef(), Fn, TParam, nullptr, nullptr};
   MDNode *Node = MDNode::get(VMContext, Elts);
   if (isDefinition)
     AllSubprograms.push_back(Node);
@@ -1198,32 +1039,26 @@
   return S;
 }
 
-/// createNameSpace - This creates new descriptor for a namespace
-/// with the specified parent scope.
 DINameSpace DIBuilder::createNameSpace(DIDescriptor Scope, StringRef Name,
                                        DIFile File, unsigned LineNo) {
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_namespace),
-    File.getFileNode(),
-    getNonCompileUnitScope(Scope),
-    MDString::get(VMContext, Name),
-    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo)
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_namespace)
+                       .concat(Name)
+                       .concat(LineNo)
+                       .get(VMContext),
+                   File.getFileNode(), getNonCompileUnitScope(Scope)};
   DINameSpace R(MDNode::get(VMContext, Elts));
   assert(R.Verify() &&
          "createNameSpace should return a verifiable DINameSpace");
   return R;
 }
 
-/// createLexicalBlockFile - This creates a new MDNode that encapsulates
-/// an existing scope with a new filename.
 DILexicalBlockFile DIBuilder::createLexicalBlockFile(DIDescriptor Scope,
-                                                     DIFile File) {
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_lexical_block),
-    File.getFileNode(),
-    Scope
-  };
+                                                     DIFile File,
+                                                     unsigned Discriminator) {
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_lexical_block)
+                       .concat(Discriminator)
+                       .get(VMContext),
+                   File.getFileNode(), Scope};
   DILexicalBlockFile R(MDNode::get(VMContext, Elts));
   assert(
       R.Verify() &&
@@ -1232,8 +1067,7 @@
 }
 
 DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File,
-                                             unsigned Line, unsigned Col,
-                                             unsigned Discriminator) {
+                                             unsigned Line, unsigned Col) {
   // FIXME: This isn't thread safe nor the right way to defeat MDNode uniquing.
   // I believe the right way is to have a self-referential element in the node.
   // Also: why do we bother with line/column - they're not used and the
@@ -1243,23 +1077,20 @@
 
   // Defeat MDNode uniquing for lexical blocks by using unique id.
   static unsigned int unique_id = 0;
-  Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_lexical_block),
-    File.getFileNode(),
-    getNonCompileUnitScope(Scope),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Line),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Col),
-    ConstantInt::get(Type::getInt32Ty(VMContext), Discriminator),
-    ConstantInt::get(Type::getInt32Ty(VMContext), unique_id++)
-  };
+  Value *Elts[] = {HeaderBuilder::get(dwarf::DW_TAG_lexical_block)
+                       .concat(Line)
+                       .concat(Col)
+                       .concat(unique_id++)
+                       .get(VMContext),
+                   File.getFileNode(), getNonCompileUnitScope(Scope)};
   DILexicalBlock R(MDNode::get(VMContext, Elts));
   assert(R.Verify() &&
          "createLexicalBlock should return a verifiable DILexicalBlock");
   return R;
 }
 
-/// insertDeclare - Insert a new llvm.dbg.declare intrinsic call.
 Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
+                                      DIExpression Expr,
                                       Instruction *InsertBefore) {
   assert(Storage && "no storage passed to dbg.declare");
   assert(VarInfo.isVariable() &&
@@ -1267,12 +1098,12 @@
   if (!DeclareFn)
     DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 
-  Value *Args[] = { MDNode::get(Storage->getContext(), Storage), VarInfo };
+  Value *Args[] = {MDNode::get(Storage->getContext(), Storage), VarInfo, Expr};
   return CallInst::Create(DeclareFn, Args, "", InsertBefore);
 }
 
-/// insertDeclare - Insert a new llvm.dbg.declare intrinsic call.
 Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
+                                      DIExpression Expr,
                                       BasicBlock *InsertAtEnd) {
   assert(Storage && "no storage passed to dbg.declare");
   assert(VarInfo.isVariable() &&
@@ -1280,7 +1111,7 @@
   if (!DeclareFn)
     DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 
-  Value *Args[] = { MDNode::get(Storage->getContext(), Storage), VarInfo };
+  Value *Args[] = {MDNode::get(Storage->getContext(), Storage), VarInfo, Expr};
 
   // If this block already has a terminator then insert this intrinsic
   // before the terminator.
@@ -1290,9 +1121,9 @@
     return CallInst::Create(DeclareFn, Args, "", InsertAtEnd);
 }
 
-/// insertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
 Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
                                                 DIVariable VarInfo,
+                                                DIExpression Expr,
                                                 Instruction *InsertBefore) {
   assert(V && "no value passed to dbg.value");
   assert(VarInfo.isVariable() &&
@@ -1300,15 +1131,15 @@
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
 
-  Value *Args[] = { MDNode::get(V->getContext(), V),
-                    ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
-                    VarInfo };
+  Value *Args[] = {MDNode::get(V->getContext(), V),
+                   ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
+                   VarInfo, Expr};
   return CallInst::Create(ValueFn, Args, "", InsertBefore);
 }
 
-/// insertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
 Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
                                                 DIVariable VarInfo,
+                                                DIExpression Expr,
                                                 BasicBlock *InsertAtEnd) {
   assert(V && "no value passed to dbg.value");
   assert(VarInfo.isVariable() &&
@@ -1316,8 +1147,8 @@
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
 
-  Value *Args[] = { MDNode::get(V->getContext(), V),
-                    ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
-                    VarInfo };
+  Value *Args[] = {MDNode::get(V->getContext(), V),
+                   ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
+                   VarInfo, Expr};
   return CallInst::Create(ValueFn, Args, "", InsertAtEnd);
 }

diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp
index dea05fb..8a057f5 100644
--- a/lib/IR/DataLayout.cpp
+++ b/lib/IR/DataLayout.cpp

@@ -55,7 +55,7 @@
 
     // Add padding if necessary to align the data element properly.
     if ((StructSize & (TyAlign-1)) != 0)
-      StructSize = DataLayout::RoundUpAlignment(StructSize, TyAlign);
+      StructSize = RoundUpToAlignment(StructSize, TyAlign);
 
     // Keep track of maximum alignment constraint.
     StructAlignment = std::max(TyAlign, StructAlignment);
@@ -70,7 +70,7 @@
   // Add padding to the end of the struct so that it could be put in an array
   // and all array elements would be aligned correctly.
   if ((StructSize & (StructAlignment-1)) != 0)
-    StructSize = DataLayout::RoundUpAlignment(StructSize, StructAlignment);
+    StructSize = RoundUpToAlignment(StructSize, StructAlignment);
 }
 
 
@@ -179,7 +179,7 @@
   clear();
 
   LayoutMap = nullptr;
-  LittleEndian = false;
+  BigEndian = false;
   StackNaturalAlign = 0;
   ManglingMode = MM_None;
 
@@ -239,10 +239,10 @@
       // FIXME: remove this on LLVM 4.0.
       break;
     case 'E':
-      LittleEndian = false;
+      BigEndian = true;
       break;
     case 'e':
-      LittleEndian = true;
+      BigEndian = false;
       break;
     case 'p': {
       // Address space.
@@ -345,6 +345,10 @@
 }
 
 DataLayout::DataLayout(const Module *M) : LayoutMap(nullptr) {
+  init(M);
+}
+
+void DataLayout::init(const Module *M) {
   const DataLayout *Other = M->getDataLayout();
   if (Other)
     *this = *Other;
@@ -353,7 +357,7 @@
 }
 
 bool DataLayout::operator==(const DataLayout &Other) const {
-  bool Ret = LittleEndian == Other.LittleEndian &&
+  bool Ret = BigEndian == Other.BigEndian &&
              StackNaturalAlign == Other.StackNaturalAlign &&
              ManglingMode == Other.ManglingMode &&
              LegalIntWidths == Other.LegalIntWidths &&
@@ -522,7 +526,7 @@
   std::string Result;
   raw_string_ostream OS(Result);
 
-  OS << (LittleEndian ? "e" : "E");
+  OS << (BigEndian ? "E" : "e");
 
   switch (ManglingMode) {
   case MM_None:
@@ -637,7 +641,7 @@
             ? getPointerABIAlignment(0)
             : getPointerPrefAlignment(0));
   case Type::PointerTyID: {
-    unsigned AS = dyn_cast<PointerType>(Ty)->getAddressSpace();
+    unsigned AS = cast<PointerType>(Ty)->getAddressSpace();
     return (abi_or_pref
             ? getPointerABIAlignment(AS)
             : getPointerPrefAlignment(AS));
@@ -796,17 +800,17 @@
 }
 
 DataLayoutPass::DataLayoutPass() : ImmutablePass(ID), DL("") {
-  report_fatal_error("Bad DataLayoutPass ctor used. Tool did not specify a "
-                     "DataLayout to use?");
+  initializeDataLayoutPassPass(*PassRegistry::getPassRegistry());
 }
 
 DataLayoutPass::~DataLayoutPass() {}
 
-DataLayoutPass::DataLayoutPass(const DataLayout &DL)
-    : ImmutablePass(ID), DL(DL) {
-  initializeDataLayoutPassPass(*PassRegistry::getPassRegistry());
+bool DataLayoutPass::doInitialization(Module &M) {
+  DL.init(&M);
+  return false;
 }
 
-DataLayoutPass::DataLayoutPass(const Module *M) : ImmutablePass(ID), DL(M) {
-  initializeDataLayoutPassPass(*PassRegistry::getPassRegistry());
+bool DataLayoutPass::doFinalization(Module &M) {
+  DL.reset("");
+  return false;
 }

diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index 5e39b24..bb5161d 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp

@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -46,10 +47,9 @@
           DILexicalBlockFile(DbgNode).Verify() ||
           DISubrange(DbgNode).Verify() || DIEnumerator(DbgNode).Verify() ||
           DIObjCProperty(DbgNode).Verify() ||
-          DIUnspecifiedParameter(DbgNode).Verify() ||
           DITemplateTypeParameter(DbgNode).Verify() ||
           DITemplateValueParameter(DbgNode).Verify() ||
-          DIImportedEntity(DbgNode).Verify());
+          DIImportedEntity(DbgNode).Verify() || DIExpression(DbgNode).Verify());
 }
 
 static Value *getField(const MDNode *DbgNode, unsigned Elt) {
@@ -138,25 +138,53 @@
   }
 }
 
-uint64_t DIVariable::getAddrElement(unsigned Idx) const {
-  DIDescriptor ComplexExpr = getDescriptorField(8);
-  if (Idx < ComplexExpr->getNumOperands())
-    if (auto *CI = dyn_cast_or_null<ConstantInt>(ComplexExpr->getOperand(Idx)))
-      return CI->getZExtValue();
-
-  assert(false && "non-existing complex address element requested");
-  return 0;
+static unsigned DIVariableInlinedAtIndex = 4;
+MDNode *DIVariable::getInlinedAt() const {
+  return getNodeField(DbgNode, DIVariableInlinedAtIndex);
 }
 
-/// getInlinedAt - If this variable is inlined then return inline location.
-MDNode *DIVariable::getInlinedAt() const { return getNodeField(DbgNode, 7); }
+/// \brief Return the size reported by the variable's type.
+unsigned DIVariable::getSizeInBits(const DITypeIdentifierMap &Map) {
+  DIType Ty = getType().resolve(Map);
+  // Follow derived types until we reach a type that
+  // reports back a size.
+  while (Ty.isDerivedType() && !Ty.getSizeInBits()) {
+    DIDerivedType DT(&*Ty);
+    Ty = DT.getTypeDerivedFrom().resolve(Map);
+  }
+  assert(Ty.getSizeInBits() && "type with size 0");
+  return Ty.getSizeInBits();
+}
+
+uint64_t DIExpression::getElement(unsigned Idx) const {
+  unsigned I = Idx + 1;
+  assert(I < getNumHeaderFields() &&
+         "non-existing complex address element requested");
+  return getHeaderFieldAs<int64_t>(I);
+}
+
+bool DIExpression::isVariablePiece() const {
+  return getNumElements() && getElement(0) == dwarf::DW_OP_piece;
+}
+
+uint64_t DIExpression::getPieceOffset() const {
+  assert(isVariablePiece());
+  return getElement(1);
+}
+
+uint64_t DIExpression::getPieceSize() const {
+  assert(isVariablePiece());
+  return getElement(2);
+}
 
 //===----------------------------------------------------------------------===//
 // Predicates
 //===----------------------------------------------------------------------===//
 
-/// isBasicType - Return true if the specified tag is legal for
-/// DIBasicType.
+bool DIDescriptor::isSubroutineType() const {
+  return isCompositeType() && getTag() == dwarf::DW_TAG_subroutine_type;
+}
+
 bool DIDescriptor::isBasicType() const {
   if (!DbgNode)
     return false;
@@ -169,7 +197,6 @@
   }
 }
 
-/// isDerivedType - Return true if the specified tag is legal for DIDerivedType.
 bool DIDescriptor::isDerivedType() const {
   if (!DbgNode)
     return false;
@@ -192,8 +219,6 @@
   }
 }
 
-/// isCompositeType - Return true if the specified tag is legal for
-/// DICompositeType.
 bool DIDescriptor::isCompositeType() const {
   if (!DbgNode)
     return false;
@@ -210,7 +235,6 @@
   }
 }
 
-/// isVariable - Return true if the specified tag is legal for DIVariable.
 bool DIDescriptor::isVariable() const {
   if (!DbgNode)
     return false;
@@ -223,32 +247,19 @@
   }
 }
 
-/// isType - Return true if the specified tag is legal for DIType.
 bool DIDescriptor::isType() const {
   return isBasicType() || isCompositeType() || isDerivedType();
 }
 
-/// isSubprogram - Return true if the specified tag is legal for
-/// DISubprogram.
 bool DIDescriptor::isSubprogram() const {
   return DbgNode && getTag() == dwarf::DW_TAG_subprogram;
 }
 
-/// isGlobalVariable - Return true if the specified tag is legal for
-/// DIGlobalVariable.
 bool DIDescriptor::isGlobalVariable() const {
   return DbgNode && (getTag() == dwarf::DW_TAG_variable ||
                      getTag() == dwarf::DW_TAG_constant);
 }
 
-/// isUnspecifiedParmeter - Return true if the specified tag is
-/// DW_TAG_unspecified_parameters.
-bool DIDescriptor::isUnspecifiedParameter() const {
-  return DbgNode && getTag() == dwarf::DW_TAG_unspecified_parameters;
-}
-
-/// isScope - Return true if the specified tag is one of the scope
-/// related tag.
 bool DIDescriptor::isScope() const {
   if (!DbgNode)
     return false;
@@ -265,83 +276,67 @@
   return isType();
 }
 
-/// isTemplateTypeParameter - Return true if the specified tag is
-/// DW_TAG_template_type_parameter.
 bool DIDescriptor::isTemplateTypeParameter() const {
   return DbgNode && getTag() == dwarf::DW_TAG_template_type_parameter;
 }
 
-/// isTemplateValueParameter - Return true if the specified tag is
-/// DW_TAG_template_value_parameter.
 bool DIDescriptor::isTemplateValueParameter() const {
   return DbgNode && (getTag() == dwarf::DW_TAG_template_value_parameter ||
                      getTag() == dwarf::DW_TAG_GNU_template_template_param ||
                      getTag() == dwarf::DW_TAG_GNU_template_parameter_pack);
 }
 
-/// isCompileUnit - Return true if the specified tag is DW_TAG_compile_unit.
 bool DIDescriptor::isCompileUnit() const {
   return DbgNode && getTag() == dwarf::DW_TAG_compile_unit;
 }
 
-/// isFile - Return true if the specified tag is DW_TAG_file_type.
 bool DIDescriptor::isFile() const {
   return DbgNode && getTag() == dwarf::DW_TAG_file_type;
 }
 
-/// isNameSpace - Return true if the specified tag is DW_TAG_namespace.
 bool DIDescriptor::isNameSpace() const {
   return DbgNode && getTag() == dwarf::DW_TAG_namespace;
 }
 
-/// isLexicalBlockFile - Return true if the specified descriptor is a
-/// lexical block with an extra file.
 bool DIDescriptor::isLexicalBlockFile() const {
   return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
-         (DbgNode->getNumOperands() == 3);
+         DbgNode->getNumOperands() == 3 && getNumHeaderFields() == 2;
 }
 
-/// isLexicalBlock - Return true if the specified tag is DW_TAG_lexical_block.
 bool DIDescriptor::isLexicalBlock() const {
+  // FIXME: There are always exactly 4 header fields in DILexicalBlock, but
+  // something relies on this returning true for DILexicalBlockFile.
   return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
-         (DbgNode->getNumOperands() > 3);
+         DbgNode->getNumOperands() == 3 &&
+         (getNumHeaderFields() == 2 || getNumHeaderFields() == 4);
 }
 
-/// isSubrange - Return true if the specified tag is DW_TAG_subrange_type.
 bool DIDescriptor::isSubrange() const {
   return DbgNode && getTag() == dwarf::DW_TAG_subrange_type;
 }
 
-/// isEnumerator - Return true if the specified tag is DW_TAG_enumerator.
 bool DIDescriptor::isEnumerator() const {
   return DbgNode && getTag() == dwarf::DW_TAG_enumerator;
 }
 
-/// isObjCProperty - Return true if the specified tag is DW_TAG_APPLE_property.
 bool DIDescriptor::isObjCProperty() const {
   return DbgNode && getTag() == dwarf::DW_TAG_APPLE_property;
 }
 
-/// \brief Return true if the specified tag is DW_TAG_imported_module or
-/// DW_TAG_imported_declaration.
 bool DIDescriptor::isImportedEntity() const {
   return DbgNode && (getTag() == dwarf::DW_TAG_imported_module ||
                      getTag() == dwarf::DW_TAG_imported_declaration);
 }
 
+bool DIDescriptor::isExpression() const {
+  return DbgNode && (getTag() == dwarf::DW_TAG_expression);
+}
+
 //===----------------------------------------------------------------------===//
 // Simple Descriptor Constructors and other Methods
 //===----------------------------------------------------------------------===//
 
-unsigned DIArray::getNumElements() const {
-  if (!DbgNode)
-    return 0;
-  return DbgNode->getNumOperands();
-}
-
-/// replaceAllUsesWith - Replace all uses of the MDNode used by this
-/// type with the one in the passed descriptor.
-void DIType::replaceAllUsesWith(LLVMContext &VMContext, DIDescriptor D) {
+void DIDescriptor::replaceAllUsesWith(LLVMContext &VMContext, DIDescriptor D) {
 
   assert(DbgNode && "Trying to replace an unverified type!");
 
@@ -362,12 +357,10 @@
   const Value *V = cast_or_null<Value>(DN);
   Node->replaceAllUsesWith(const_cast<Value *>(V));
   MDNode::deleteTemporary(Node);
-  DbgNode = D;
+  DbgNode = DN;
 }
 
-/// replaceAllUsesWith - Replace all uses of the MDNode used by this
-/// type with the one in D.
-void DIType::replaceAllUsesWith(MDNode *D) {
+void DIDescriptor::replaceAllUsesWith(MDNode *D) {
 
   assert(DbgNode && "Trying to replace an unverified type!");
   assert(DbgNode != D && "This replacement should always happen");
@@ -378,7 +371,6 @@
   MDNode::deleteTemporary(Node);
 }
 
-/// Verify - Verify that a compile unit is well formed.
 bool DICompileUnit::Verify() const {
   if (!isCompileUnit())
     return false;
@@ -388,19 +380,19 @@
   if (getFilename().empty())
     return false;
 
-  return DbgNode->getNumOperands() == 14;
+  return DbgNode->getNumOperands() == 7 && getNumHeaderFields() == 8;
 }
 
-/// Verify - Verify that an ObjC property is well formed.
 bool DIObjCProperty::Verify() const {
   if (!isObjCProperty())
     return false;
 
   // Don't worry about the rest of the strings for now.
-  return DbgNode->getNumOperands() == 8;
+  return DbgNode->getNumOperands() == 3 && getNumHeaderFields() == 6;
 }
 
-/// Check if a field at position Elt of a MDNode is a MDNode.
+/// \brief Check if a field at position Elt of a MDNode is a MDNode.
+///
 /// We currently allow an empty string and an integer.
 /// But we don't allow a non-empty string in a MDNode field.
 static bool fieldIsMDNode(const MDNode *DbgNode, unsigned Elt) {
@@ -412,41 +404,42 @@
   return true;
 }
 
-/// Check if a field at position Elt of a MDNode is a MDString.
+/// \brief Check if a field at position Elt of a MDNode is a MDString.
 static bool fieldIsMDString(const MDNode *DbgNode, unsigned Elt) {
   Value *Fld = getField(DbgNode, Elt);
   return !Fld || isa<MDString>(Fld);
 }
 
-/// Check if a value can be a reference to a type.
-static bool isTypeRef(const Value *Val) {
-  return !Val ||
-         (isa<MDString>(Val) && !cast<MDString>(Val)->getString().empty()) ||
-         (isa<MDNode>(Val) && DIType(cast<MDNode>(Val)).isType());
+/// \brief Check if a value can be a reference to a type.
+static bool isTypeRef(const Metadata *MD) {
+  if (!MD)
+    return true;
+  if (auto *S = dyn_cast<MDString>(MD))
+    return !S->getString().empty();
+  if (auto *N = dyn_cast<MDNode>(MD))
+    return DIType(N).isType();
+  return false;
 }
 
-/// Check if a field at position Elt of a MDNode can be a reference to a type.
+/// \brief Check if referenced field might be a type.
 static bool fieldIsTypeRef(const MDNode *DbgNode, unsigned Elt) {
-  Value *Fld = getField(DbgNode, Elt);
-  return isTypeRef(Fld);
+  return isTypeRef(dyn_cast_or_null<Metadata>(getField(DbgNode, Elt)));
 }
 
-/// Check if a value can be a ScopeRef.
-static bool isScopeRef(const Value *Val) {
-  return !Val ||
-    (isa<MDString>(Val) && !cast<MDString>(Val)->getString().empty()) ||
-    // Not checking for Val->isScope() here, because it would work
-    // only for lexical scopes and not all subclasses of DIScope.
-    isa<MDNode>(Val);
+/// \brief Check if a value can be a ScopeRef.
+static bool isScopeRef(const Metadata *MD) {
+  if (!MD)
+    return true;
+  if (auto *S = dyn_cast<MDString>(MD))
+    return !S->getString().empty();
+  return isa<MDNode>(MD);
 }
 
-/// Check if a field at position Elt of a MDNode can be a ScopeRef.
+/// \brief Check if a field at position Elt of a MDNode can be a ScopeRef.
 static bool fieldIsScopeRef(const MDNode *DbgNode, unsigned Elt) {
-  Value *Fld = getField(DbgNode, Elt);
-  return isScopeRef(Fld);
+  return isScopeRef(dyn_cast_or_null<Metadata>(getField(DbgNode, Elt)));
 }
 
-/// Verify - Verify that a type descriptor is well formed.
 bool DIType::Verify() const {
   if (!isType())
     return false;
@@ -467,6 +460,7 @@
       Tag != dwarf::DW_TAG_inheritance && Tag != dwarf::DW_TAG_friend &&
       getFilename().empty())
     return false;
+
   // DIType is abstract, it should be a BasicType, a DerivedType or
   // a CompositeType.
   if (isBasicType())
@@ -479,89 +473,113 @@
     return false;
 }
 
-/// Verify - Verify that a basic type descriptor is well formed.
 bool DIBasicType::Verify() const {
-  return isBasicType() && DbgNode->getNumOperands() == 10;
+  return isBasicType() && DbgNode->getNumOperands() == 3 &&
+         getNumHeaderFields() == 8;
 }
 
-/// Verify - Verify that a derived type descriptor is well formed.
 bool DIDerivedType::Verify() const {
-  // Make sure DerivedFrom @ field 9 is TypeRef.
-  if (!fieldIsTypeRef(DbgNode, 9))
+  // Make sure DerivedFrom @ field 3 is TypeRef.
+  if (!fieldIsTypeRef(DbgNode, 3))
     return false;
   if (getTag() == dwarf::DW_TAG_ptr_to_member_type)
-    // Make sure ClassType @ field 10 is a TypeRef.
-    if (!fieldIsTypeRef(DbgNode, 10))
+    // Make sure ClassType @ field 4 is a TypeRef.
+    if (!fieldIsTypeRef(DbgNode, 4))
       return false;
 
-  return isDerivedType() && DbgNode->getNumOperands() >= 10 &&
-         DbgNode->getNumOperands() <= 14;
+  return isDerivedType() && DbgNode->getNumOperands() >= 4 &&
+         DbgNode->getNumOperands() <= 8 && getNumHeaderFields() >= 7 &&
+         getNumHeaderFields() <= 8;
 }
 
-/// Verify - Verify that a composite type descriptor is well formed.
 bool DICompositeType::Verify() const {
   if (!isCompositeType())
     return false;
 
-  // Make sure DerivedFrom @ field 9 and ContainingType @ field 12 are TypeRef.
-  if (!fieldIsTypeRef(DbgNode, 9))
+  // Make sure DerivedFrom @ field 3 and ContainingType @ field 5 are TypeRef.
+  if (!fieldIsTypeRef(DbgNode, 3))
     return false;
-  if (!fieldIsTypeRef(DbgNode, 12))
+  if (!fieldIsTypeRef(DbgNode, 5))
     return false;
 
-  // Make sure the type identifier at field 14 is MDString, it can be null.
-  if (!fieldIsMDString(DbgNode, 14))
+  // Make sure the type identifier at field 7 is MDString, it can be null.
+  if (!fieldIsMDString(DbgNode, 7))
     return false;
 
   // A subroutine type can't be both & and &&.
   if (isLValueReference() && isRValueReference())
     return false;
 
-  return DbgNode->getNumOperands() == 15;
+  return DbgNode->getNumOperands() == 8 && getNumHeaderFields() == 8;
 }
 
-/// Verify - Verify that a subprogram descriptor is well formed.
 bool DISubprogram::Verify() const {
   if (!isSubprogram())
     return false;
 
-  // Make sure context @ field 2 is a ScopeRef and type @ field 7 is a MDNode.
+  // Make sure context @ field 2 is a ScopeRef and type @ field 3 is a MDNode.
   if (!fieldIsScopeRef(DbgNode, 2))
     return false;
-  if (!fieldIsMDNode(DbgNode, 7))
+  if (!fieldIsMDNode(DbgNode, 3))
     return false;
-  // Containing type @ field 12.
-  if (!fieldIsTypeRef(DbgNode, 12))
+  // Containing type @ field 4.
+  if (!fieldIsTypeRef(DbgNode, 4))
     return false;
 
   // A subprogram can't be both & and &&.
   if (isLValueReference() && isRValueReference())
     return false;
 
-  return DbgNode->getNumOperands() == 20;
+  // With an llvm::Function* attached, every instruction's scope chain must
+  // lead to a subprogram describing it; a chain that dead-ends fails Verify.
+  if (auto *F = getFunction()) {
+    LLVMContext &Ctxt = F->getContext();
+    for (auto &BB : *F) {
+      for (auto &I : BB) {
+        DebugLoc DL = I.getDebugLoc();
+        if (DL.isUnknown())
+          continue;
+
+        MDNode *Scope = nullptr;
+        MDNode *IA = nullptr;
+        // Walk the inlined-at chain out to the outermost location.
+        while (DL.getScopeAndInlinedAt(Scope, IA, Ctxt), IA)
+          DL = DebugLoc::getFromDILocation(IA);
+        DL.getScopeAndInlinedAt(Scope, IA, Ctxt);
+        assert(!IA);
+        while (Scope && !DIDescriptor(Scope).isSubprogram()) {
+          DILexicalBlockFile D(Scope);
+          Scope = D.isLexicalBlockFile()
+                      ? D.getScope()
+                      : DebugLoc::getFromDILexicalBlock(Scope).getScope(Ctxt);
+        }
+        if (!Scope || !DISubprogram(Scope).describes(F))
+          return false;
+      }
+    }
+  }
+  return DbgNode->getNumOperands() == 9 && getNumHeaderFields() == 12;
 }
 
-/// Verify - Verify that a global variable descriptor is well formed.
 bool DIGlobalVariable::Verify() const {
   if (!isGlobalVariable())
     return false;
 
   if (getDisplayName().empty())
     return false;
-  // Make sure context @ field 2 is an MDNode.
-  if (!fieldIsMDNode(DbgNode, 2))
+  // Make sure context @ field 1 is a ScopeRef.
+  if (!fieldIsScopeRef(DbgNode, 1))
     return false;
-  // Make sure that type @ field 8 is a DITypeRef.
-  if (!fieldIsTypeRef(DbgNode, 8))
+  // Make sure that type @ field 3 is a DITypeRef.
+  if (!fieldIsTypeRef(DbgNode, 3))
     return false;
-  // Make sure StaticDataMemberDeclaration @ field 12 is MDNode.
-  if (!fieldIsMDNode(DbgNode, 12))
+  // Make sure StaticDataMemberDeclaration @ field 5 is MDNode.
+  if (!fieldIsMDNode(DbgNode, 5))
     return false;
 
-  return DbgNode->getNumOperands() == 13;
+  return DbgNode->getNumOperands() == 6 && getNumHeaderFields() == 7;
 }
 
-/// Verify - Verify that a variable descriptor is well formed.
 bool DIVariable::Verify() const {
   if (!isVariable())
     return false;
@@ -569,19 +587,31 @@
   // Make sure context @ field 1 is an MDNode.
   if (!fieldIsMDNode(DbgNode, 1))
     return false;
-  // Make sure that type @ field 5 is a DITypeRef.
-  if (!fieldIsTypeRef(DbgNode, 5))
+  // Make sure that type @ field 3 is a DITypeRef.
+  if (!fieldIsTypeRef(DbgNode, 3))
     return false;
 
-  // Variable without a complex expression.
-  if (DbgNode->getNumOperands() == 8)
+  // Check the number of header fields, which is common between complex and
+  // simple variables.
+  if (getNumHeaderFields() != 4)
+    return false;
+
+  // Variable without an inline location.
+  if (DbgNode->getNumOperands() == 4)
     return true;
 
-  // Make sure the complex expression is an MDNode.
-  return (DbgNode->getNumOperands() == 9 && fieldIsMDNode(DbgNode, 8));
+  // Variable with an inline location.
+  return getInlinedAt() != nullptr && DbgNode->getNumOperands() == 5;
 }
 
-/// Verify - Verify that a location descriptor is well formed.
+bool DIExpression::Verify() const {
+  // Empty DIExpressions may be represented as a nullptr.
+  if (!DbgNode)
+    return true;
+
+  return isExpression() && DbgNode->getNumOperands() == 1;
+}
+
 bool DILocation::Verify() const {
   if (!DbgNode)
     return false;
@@ -589,69 +619,59 @@
   return DbgNode->getNumOperands() == 4;
 }
 
-/// Verify - Verify that a namespace descriptor is well formed.
 bool DINameSpace::Verify() const {
   if (!isNameSpace())
     return false;
-  return DbgNode->getNumOperands() == 5;
+  return DbgNode->getNumOperands() == 3 && getNumHeaderFields() == 3;
 }
 
-/// \brief Retrieve the MDNode for the directory/file pair.
 MDNode *DIFile::getFileNode() const { return getNodeField(DbgNode, 1); }
 
-/// \brief Verify that the file descriptor is well formed.
 bool DIFile::Verify() const {
   return isFile() && DbgNode->getNumOperands() == 2;
 }
 
-/// \brief Verify that the enumerator descriptor is well formed.
 bool DIEnumerator::Verify() const {
-  return isEnumerator() && DbgNode->getNumOperands() == 3;
+  return isEnumerator() && DbgNode->getNumOperands() == 1 &&
+         getNumHeaderFields() == 3;
 }
 
-/// \brief Verify that the subrange descriptor is well formed.
 bool DISubrange::Verify() const {
-  return isSubrange() && DbgNode->getNumOperands() == 3;
+  return isSubrange() && DbgNode->getNumOperands() == 1 &&
+         getNumHeaderFields() == 3;
 }
 
-/// \brief Verify that the lexical block descriptor is well formed.
 bool DILexicalBlock::Verify() const {
-  return isLexicalBlock() && DbgNode->getNumOperands() == 7;
+  return isLexicalBlock() && DbgNode->getNumOperands() == 3 &&
+         getNumHeaderFields() == 4;
 }
 
-/// \brief Verify that the file-scoped lexical block descriptor is well formed.
 bool DILexicalBlockFile::Verify() const {
-  return isLexicalBlockFile() && DbgNode->getNumOperands() == 3;
+  return isLexicalBlockFile() && DbgNode->getNumOperands() == 3 &&
+         getNumHeaderFields() == 2;
 }
 
-/// \brief Verify that an unspecified parameter descriptor is well formed.
-bool DIUnspecifiedParameter::Verify() const {
-  return isUnspecifiedParameter() && DbgNode->getNumOperands() == 1;
-}
-
-/// \brief Verify that the template type parameter descriptor is well formed.
 bool DITemplateTypeParameter::Verify() const {
-  return isTemplateTypeParameter() && DbgNode->getNumOperands() == 7;
+  return isTemplateTypeParameter() && DbgNode->getNumOperands() == 4 &&
+         getNumHeaderFields() == 4;
 }
 
-/// \brief Verify that the template value parameter descriptor is well formed.
 bool DITemplateValueParameter::Verify() const {
-  return isTemplateValueParameter() && DbgNode->getNumOperands() == 8;
+  return isTemplateValueParameter() && DbgNode->getNumOperands() == 5 &&
+         getNumHeaderFields() == 4;
 }
 
-/// \brief Verify that the imported module descriptor is well formed.
 bool DIImportedEntity::Verify() const {
-  return isImportedEntity() &&
-         (DbgNode->getNumOperands() == 4 || DbgNode->getNumOperands() == 5);
+  return isImportedEntity() && DbgNode->getNumOperands() == 3 &&
+         getNumHeaderFields() == 3;
 }
 
-/// getObjCProperty - Return property node, if this ivar is associated with one.
 MDNode *DIDerivedType::getObjCProperty() const {
-  return getNodeField(DbgNode, 10);
+  return getNodeField(DbgNode, 4);
 }
 
 MDString *DICompositeType::getIdentifier() const {
-  return cast_or_null<MDString>(getField(DbgNode, 14));
+  return cast_or_null<MDString>(getField(DbgNode, 7));
 }
 
 #ifndef NDEBUG
@@ -669,27 +689,21 @@
 }
 #endif
 
-/// \brief Set the array of member DITypes.
-void DICompositeType::setTypeArray(DIArray Elements, DIArray TParams) {
-  assert((!TParams || DbgNode->getNumOperands() == 15) &&
-         "If you're setting the template parameters this should include a slot "
-         "for that!");
+void DICompositeType::setArraysHelper(MDNode *Elements, MDNode *TParams) {
   TrackingVH<MDNode> N(*this);
   if (Elements) {
 #ifndef NDEBUG
     // Check that the new list of members contains all the old members as well.
-    if (const MDNode *El = cast_or_null<MDNode>(N->getOperand(10)))
+    if (const MDNode *El = cast_or_null<MDNode>(N->getOperand(4)))
       VerifySubsetOf(El, Elements);
 #endif
-    N->replaceOperandWith(10, Elements);
+    N->replaceOperandWith(4, Elements);
   }
   if (TParams)
-    N->replaceOperandWith(13, TParams);
+    N->replaceOperandWith(6, TParams);
   DbgNode = N;
 }
 
-/// Generate a reference to this DIType. Uses the type identifier instead
-/// of the actual MDNode if possible, to help type uniquing.
 DIScopeRef DIScope::getRef() const {
   if (!isCompositeType())
     return DIScopeRef(*this);
@@ -699,15 +713,12 @@
   return DIScopeRef(DTy.getIdentifier());
 }
 
-/// \brief Set the containing type.
 void DICompositeType::setContainingType(DICompositeType ContainingType) {
   TrackingVH<MDNode> N(*this);
-  N->replaceOperandWith(12, ContainingType.getRef());
+  N->replaceOperandWith(5, ContainingType.getRef());
   DbgNode = N;
 }
 
-/// isInlinedFnArgument - Return true if this variable provides debugging
-/// information for an inlined function arguments.
 bool DIVariable::isInlinedFnArgument(const Function *CurFn) {
   assert(CurFn && "Invalid function");
   if (!getContext().isSubprogram())
@@ -717,8 +728,6 @@
   return !DISubprogram(getContext()).describes(CurFn);
 }
 
-/// describes - Return true if this subprogram provides debugging
-/// information for the function F.
 bool DISubprogram::describes(const Function *F) {
   assert(F && "Invalid function");
   if (F == getFunction())
@@ -731,27 +740,18 @@
   return false;
 }
 
-unsigned DISubprogram::isOptimized() const {
-  assert(DbgNode && "Invalid subprogram descriptor!");
-  if (DbgNode->getNumOperands() == 15)
-    return getUnsignedField(14);
-  return 0;
-}
-
 MDNode *DISubprogram::getVariablesNodes() const {
-  return getNodeField(DbgNode, 18);
+  return getNodeField(DbgNode, 8);
 }
 
 DIArray DISubprogram::getVariables() const {
-  return DIArray(getNodeField(DbgNode, 18));
+  return DIArray(getNodeField(DbgNode, 8));
 }
 
 Value *DITemplateValueParameter::getValue() const {
-  return getField(DbgNode, 4);
+  return getField(DbgNode, 3);
 }
 
-// If the current node has a parent scope then return that,
-// else return an empty scope.
 DIScopeRef DIScope::getContext() const {
 
   if (isType())
@@ -773,7 +773,6 @@
   return DIScopeRef(nullptr);
 }
 
-// If the scope node has a name, return that, else return an empty string.
 StringRef DIScope::getName() const {
   if (isType())
     return DIType(DbgNode).getName();
@@ -800,44 +799,58 @@
 }
 
 DIArray DICompileUnit::getEnumTypes() const {
-  if (!DbgNode || DbgNode->getNumOperands() < 13)
+  if (!DbgNode || DbgNode->getNumOperands() < 7)
     return DIArray();
 
-  return DIArray(getNodeField(DbgNode, 7));
+  return DIArray(getNodeField(DbgNode, 2));
 }
 
 DIArray DICompileUnit::getRetainedTypes() const {
-  if (!DbgNode || DbgNode->getNumOperands() < 13)
+  if (!DbgNode || DbgNode->getNumOperands() < 7)
     return DIArray();
 
-  return DIArray(getNodeField(DbgNode, 8));
+  return DIArray(getNodeField(DbgNode, 3));
 }
 
 DIArray DICompileUnit::getSubprograms() const {
-  if (!DbgNode || DbgNode->getNumOperands() < 13)
+  if (!DbgNode || DbgNode->getNumOperands() < 7)
     return DIArray();
 
-  return DIArray(getNodeField(DbgNode, 9));
+  return DIArray(getNodeField(DbgNode, 4));
 }
 
 DIArray DICompileUnit::getGlobalVariables() const {
-  if (!DbgNode || DbgNode->getNumOperands() < 13)
+  if (!DbgNode || DbgNode->getNumOperands() < 7)
     return DIArray();
 
-  return DIArray(getNodeField(DbgNode, 10));
+  return DIArray(getNodeField(DbgNode, 5));
 }
 
 DIArray DICompileUnit::getImportedEntities() const {
-  if (!DbgNode || DbgNode->getNumOperands() < 13)
+  if (!DbgNode || DbgNode->getNumOperands() < 7)
     return DIArray();
 
-  return DIArray(getNodeField(DbgNode, 11));
+  return DIArray(getNodeField(DbgNode, 6));
 }
 
-/// copyWithNewScope - Return a copy of this location, replacing the
-/// current scope with the given one.
+void DICompileUnit::replaceSubprograms(DIArray Subprograms) {
+  assert(Verify() && "Expected compile unit");
+  if (Subprograms == getSubprograms())
+    return;
+
+  const_cast<MDNode *>(DbgNode)->replaceOperandWith(4, Subprograms);
+}
+
+void DICompileUnit::replaceGlobalVariables(DIArray GlobalVariables) {
+  assert(Verify() && "Expected compile unit");
+  if (GlobalVariables == getGlobalVariables())
+    return;
+
+  const_cast<MDNode *>(DbgNode)->replaceOperandWith(5, GlobalVariables);
+}
+
 DILocation DILocation::copyWithNewScope(LLVMContext &Ctx,
-                                        DILexicalBlock NewScope) {
+                                        DILexicalBlockFile NewScope) {
   SmallVector<Value *, 10> Elts;
   assert(Verify());
   for (unsigned I = 0; I < DbgNode->getNumOperands(); ++I) {
@@ -850,78 +863,43 @@
   return DILocation(NewDIL);
 }
 
-/// computeNewDiscriminator - Generate a new discriminator value for this
-/// file and line location.
 unsigned DILocation::computeNewDiscriminator(LLVMContext &Ctx) {
   std::pair<const char *, unsigned> Key(getFilename().data(), getLineNumber());
   return ++Ctx.pImpl->DiscriminatorTable[Key];
 }
 
-/// fixupSubprogramName - Replace contains special characters used
-/// in a typical Objective-C names with '.' in a given string.
-static void fixupSubprogramName(DISubprogram Fn, SmallVectorImpl<char> &Out) {
-  StringRef FName =
-      Fn.getFunction() ? Fn.getFunction()->getName() : Fn.getName();
-  FName = Function::getRealLinkageName(FName);
-
-  StringRef Prefix("llvm.dbg.lv.");
-  Out.reserve(FName.size() + Prefix.size());
-  Out.append(Prefix.begin(), Prefix.end());
-
-  bool isObjCLike = false;
-  for (size_t i = 0, e = FName.size(); i < e; ++i) {
-    char C = FName[i];
-    if (C == '[')
-      isObjCLike = true;
-
-    if (isObjCLike && (C == '[' || C == ']' || C == ' ' || C == ':' ||
-                       C == '+' || C == '(' || C == ')'))
-      Out.push_back('.');
-    else
-      Out.push_back(C);
-  }
-}
-
-/// getFnSpecificMDNode - Return a NameMDNode, if available, that is
-/// suitable to hold function specific information.
-NamedMDNode *llvm::getFnSpecificMDNode(const Module &M, DISubprogram Fn) {
-  SmallString<32> Name;
-  fixupSubprogramName(Fn, Name);
-  return M.getNamedMetadata(Name.str());
-}
-
-/// getOrInsertFnSpecificMDNode - Return a NameMDNode that is suitable
-/// to hold function specific information.
-NamedMDNode *llvm::getOrInsertFnSpecificMDNode(Module &M, DISubprogram Fn) {
-  SmallString<32> Name;
-  fixupSubprogramName(Fn, Name);
-  return M.getOrInsertNamedMetadata(Name.str());
-}
-
-/// createInlinedVariable - Create a new inlined variable based on current
-/// variable.
-/// @param DV            Current Variable.
-/// @param InlinedScope  Location at current variable is inlined.
 DIVariable llvm::createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
                                        LLVMContext &VMContext) {
-  SmallVector<Value *, 16> Elts;
-  // Insert inlined scope as 7th element.
-  for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
-    i == 7 ? Elts.push_back(InlinedScope) : Elts.push_back(DV->getOperand(i));
-  return DIVariable(MDNode::get(VMContext, Elts));
+  assert(DIVariable(DV).Verify() && "Expected a DIVariable");
+  if (!InlinedScope)
+    return cleanseInlinedVariable(DV, VMContext);
+
+  // Insert inlined scope.
+  SmallVector<Value *, 8> Elts;
+  for (unsigned I = 0, E = DIVariableInlinedAtIndex; I != E; ++I)
+    Elts.push_back(DV->getOperand(I));
+  Elts.push_back(InlinedScope);
+
+  DIVariable Inlined(MDNode::get(VMContext, Elts));
+  assert(Inlined.Verify() && "Expected to create a DIVariable");
+  return Inlined;
 }
 
-/// cleanseInlinedVariable - Remove inlined scope from the variable.
 DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) {
-  SmallVector<Value *, 16> Elts;
-  // Insert inlined scope as 7th element.
-  for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
-    i == 7 ? Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)))
-           : Elts.push_back(DV->getOperand(i));
-  return DIVariable(MDNode::get(VMContext, Elts));
+  assert(DIVariable(DV).Verify() && "Expected a DIVariable");
+  if (!DIVariable(DV).getInlinedAt())
+    return DIVariable(DV);
+
+  // Remove inlined scope.
+  SmallVector<Value *, 8> Elts;
+  for (unsigned I = 0, E = DIVariableInlinedAtIndex; I != E; ++I)
+    Elts.push_back(DV->getOperand(I));
+
+  DIVariable Cleansed(MDNode::get(VMContext, Elts));
+  assert(Cleansed.Verify() && "Expected to create a DIVariable");
+  return Cleansed;
 }
 
-/// getDISubprogram - Find subprogram that is enclosing this scope.
 DISubprogram llvm::getDISubprogram(const MDNode *Scope) {
   DIDescriptor D(Scope);
   if (D.isSubprogram())
@@ -936,7 +914,23 @@
   return DISubprogram();
 }
 
-/// getDICompositeType - Find underlying composite type.
+DISubprogram llvm::getDISubprogram(const Function *F) {
+  // Use the first instruction with a debug location; it should lead back to F.
+  for (auto &BB : *F) {
+    auto Inst = std::find_if(BB.begin(), BB.end(), [](const Instruction &I) {
+      return !I.getDebugLoc().isUnknown();
+    });
+    if (Inst == BB.end())
+      continue;
+    DebugLoc DLoc = Inst->getDebugLoc();
+    const MDNode *Scope = DLoc.getScopeNode(F->getContext());
+    DISubprogram Subprogram = getDISubprogram(Scope);
+    return Subprogram.describes(F) ? Subprogram : DISubprogram();
+  }
+
+  return DISubprogram();
+}
+
 DICompositeType llvm::getDICompositeType(DIType T) {
   if (T.isCompositeType())
     return DICompositeType(T);
@@ -953,7 +947,6 @@
   return DICompositeType();
 }
 
-/// Update DITypeIdentifierMap by going through retained types of each CU.
 DITypeIdentifierMap
 llvm::generateDITypeIdentifierMap(const NamedMDNode *CU_Nodes) {
   DITypeIdentifierMap Map;
@@ -1002,7 +995,6 @@
     }
 }
 
-/// processModule - Process entire module and collect debug info.
 void DebugInfoFinder::processModule(const Module &M) {
   InitializeTypeMap(M);
   if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
@@ -1013,7 +1005,7 @@
       for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) {
         DIGlobalVariable DIG(GVs.getElement(i));
         if (addGlobalVariable(DIG)) {
-          processScope(DIG.getContext());
+          processScope(DIG.getContext().resolve(TypeIdentifierMap));
           processType(DIG.getType().resolve(TypeIdentifierMap));
         }
       }
@@ -1041,7 +1033,6 @@
   }
 }
 
-/// processLocation - Process DILocation.
 void DebugInfoFinder::processLocation(const Module &M, DILocation Loc) {
   if (!Loc)
     return;
@@ -1050,7 +1041,6 @@
   processLocation(M, Loc.getOrigLocation());
 }
 
-/// processType - Process DIType.
 void DebugInfoFinder::processType(DIType DT) {
   if (!addType(DT))
     return;
@@ -1058,7 +1048,13 @@
   if (DT.isCompositeType()) {
     DICompositeType DCT(DT);
     processType(DCT.getTypeDerivedFrom().resolve(TypeIdentifierMap));
-    DIArray DA = DCT.getTypeArray();
+    if (DT.isSubroutineType()) {
+      DITypeArray DTA = DISubroutineType(DT).getTypeArray();
+      for (unsigned i = 0, e = DTA.getNumElements(); i != e; ++i)
+        processType(DTA.getElement(i).resolve(TypeIdentifierMap));
+      return;
+    }
+    DIArray DA = DCT.getElements();
     for (unsigned i = 0, e = DA.getNumElements(); i != e; ++i) {
       DIDescriptor D = DA.getElement(i);
       if (D.isType())
@@ -1100,7 +1096,6 @@
   }
 }
 
-/// processSubprogram - Process DISubprogram.
 void DebugInfoFinder::processSubprogram(DISubprogram SP) {
   if (!addSubprogram(SP))
     return;
@@ -1121,7 +1116,6 @@
   }
 }
 
-/// processDeclare - Process DbgDeclareInst.
 void DebugInfoFinder::processDeclare(const Module &M,
                                      const DbgDeclareInst *DDI) {
   MDNode *N = dyn_cast<MDNode>(DDI->getVariable());
@@ -1133,7 +1127,7 @@
   if (!DV.isVariable())
     return;
 
-  if (!NodesSeen.insert(DV))
+  if (!NodesSeen.insert(DV).second)
     return;
   processScope(DIVariable(N).getContext());
   processType(DIVariable(N).getType().resolve(TypeIdentifierMap));
@@ -1149,53 +1143,49 @@
   if (!DV.isVariable())
     return;
 
-  if (!NodesSeen.insert(DV))
+  if (!NodesSeen.insert(DV).second)
     return;
   processScope(DIVariable(N).getContext());
   processType(DIVariable(N).getType().resolve(TypeIdentifierMap));
 }
 
-/// addType - Add type into Tys.
 bool DebugInfoFinder::addType(DIType DT) {
   if (!DT)
     return false;
 
-  if (!NodesSeen.insert(DT))
+  if (!NodesSeen.insert(DT).second)
     return false;
 
   TYs.push_back(DT);
   return true;
 }
 
-/// addCompileUnit - Add compile unit into CUs.
 bool DebugInfoFinder::addCompileUnit(DICompileUnit CU) {
   if (!CU)
     return false;
-  if (!NodesSeen.insert(CU))
+  if (!NodesSeen.insert(CU).second)
     return false;
 
   CUs.push_back(CU);
   return true;
 }
 
-/// addGlobalVariable - Add global variable into GVs.
 bool DebugInfoFinder::addGlobalVariable(DIGlobalVariable DIG) {
   if (!DIG)
     return false;
 
-  if (!NodesSeen.insert(DIG))
+  if (!NodesSeen.insert(DIG).second)
     return false;
 
   GVs.push_back(DIG);
   return true;
 }
 
-// addSubprogram - Add subprgoram into SPs.
 bool DebugInfoFinder::addSubprogram(DISubprogram SP) {
   if (!SP)
     return false;
 
-  if (!NodesSeen.insert(SP))
+  if (!NodesSeen.insert(SP).second)
     return false;
 
   SPs.push_back(SP);
@@ -1209,7 +1199,7 @@
   // as null for now.
   if (Scope->getNumOperands() == 0)
     return false;
-  if (!NodesSeen.insert(Scope))
+  if (!NodesSeen.insert(Scope).second)
     return false;
   Scopes.push_back(Scope);
   return true;
@@ -1219,13 +1209,11 @@
 // DIDescriptor: dump routines for all descriptors.
 //===----------------------------------------------------------------------===//
 
-/// dump - Print descriptor to dbgs() with a newline.
 void DIDescriptor::dump() const {
   print(dbgs());
   dbgs() << '\n';
 }
 
-/// print - Print descriptor.
 void DIDescriptor::print(raw_ostream &OS) const {
   if (!DbgNode)
     return;
@@ -1259,6 +1247,8 @@
     DINameSpace(DbgNode).printInternal(OS);
   } else if (this->isScope()) {
     DIScope(DbgNode).printInternal(OS);
+  } else if (this->isExpression()) {
+    DIExpression(DbgNode).printInternal(OS);
   }
 }
 
@@ -1311,6 +1301,8 @@
     OS << " [private]";
   else if (isProtected())
     OS << " [protected]";
+  else if (isPublic())
+    OS << " [public]";
 
   if (isArtificial())
     OS << " [artificial]";
@@ -1341,7 +1333,7 @@
 
 void DICompositeType::printInternal(raw_ostream &OS) const {
   DIType::printInternal(OS);
-  DIArray A = getTypeArray();
+  DIArray A = getElements();
   OS << " [" << A.getNumElements() << " elements]";
 }
 
@@ -1370,6 +1362,8 @@
     OS << " [private]";
   else if (isProtected())
     OS << " [protected]";
+  else if (isPublic())
+    OS << " [public]";
 
   if (isLValueReference())
     OS << " [reference]";
@@ -1406,6 +1400,30 @@
   OS << " [line " << getLineNumber() << ']';
 }
 
+void DIExpression::printInternal(raw_ostream &OS) const {
+  for (unsigned I = 0; I < getNumElements(); ++I) {
+    uint64_t OpCode = getElement(I);
+    OS << " [" << OperationEncodingString(OpCode);
+    switch (OpCode) {
+    case DW_OP_plus: {
+      OS << " " << getElement(++I);
+      break;
+    }
+    case DW_OP_piece: {
+      unsigned Offset = getElement(++I);
+      unsigned Size = getElement(++I);
+      OS << " offset=" << Offset << ", size=" << Size;
+      break;
+    }
+    default:
+      // Else bail out early. This may be a line table entry.
+      OS << "Unknown]";
+      return;
+    }
+    OS << "]";
+  }
+}
+
 void DIObjCProperty::printInternal(raw_ostream &OS) const {
   StringRef Name = getObjCPropertyName();
   if (!Name.empty())
@@ -1449,30 +1467,22 @@
   }
 }
 
-/// Specialize constructor to make sure it has the correct type.
-template <> DIRef<DIScope>::DIRef(const Value *V) : Val(V) {
+template <> DIRef<DIScope>::DIRef(const Metadata *V) : Val(V) {
   assert(isScopeRef(V) && "DIScopeRef should be a MDString or MDNode");
 }
-template <> DIRef<DIType>::DIRef(const Value *V) : Val(V) {
+template <> DIRef<DIType>::DIRef(const Metadata *V) : Val(V) {
   assert(isTypeRef(V) && "DITypeRef should be a MDString or MDNode");
 }
 
-/// Specialize getFieldAs to handle fields that are references to DIScopes.
 template <>
 DIScopeRef DIDescriptor::getFieldAs<DIScopeRef>(unsigned Elt) const {
-  return DIScopeRef(getField(DbgNode, Elt));
+  return DIScopeRef(cast_or_null<Metadata>(getField(DbgNode, Elt)));
 }
-/// Specialize getFieldAs to handle fields that are references to DITypes.
 template <> DITypeRef DIDescriptor::getFieldAs<DITypeRef>(unsigned Elt) const {
-  return DITypeRef(getField(DbgNode, Elt));
+  return DITypeRef(cast_or_null<Metadata>(getField(DbgNode, Elt)));
 }
 
-/// Strip debug info in the module if it exists.
-/// To do this, we remove all calls to the debugger intrinsics and any named
-/// metadata for debugging. We also remove debug locations for instructions.
-/// Return true if module is modified.
 bool llvm::StripDebugInfo(Module &M) {
-
   bool Changed = false;
 
   // Remove all of the calls to the debugger intrinsics, and remove them from
@@ -1519,7 +1529,6 @@
   return Changed;
 }
 
-/// Return Debug Info Metadata Version by checking module flags.
 unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
   Value *Val = M.getModuleFlag("Debug Info Version");
   if (!Val)

diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp
index e8bdcce..718da85 100644
--- a/lib/IR/DebugLoc.cpp
+++ b/lib/IR/DebugLoc.cpp

@@ -79,14 +79,8 @@
 DebugLoc DebugLoc::getFnDebugLoc(const LLVMContext &Ctx) const {
   const MDNode *Scope = getScopeNode(Ctx);
   DISubprogram SP = getDISubprogram(Scope);
-  if (SP.isSubprogram()) {
-    // Check for number of operands since the compatibility is
-    // cheap here.  FIXME: Name the magic constant.
-    if (SP->getNumOperands() > 19)
-      return DebugLoc::get(SP.getScopeLineNumber(), 0, SP);
-    else
-      return DebugLoc::get(SP.getLineNumber(), 0, SP);
-  }
+  if (SP.isSubprogram())
+    return DebugLoc::get(SP.getScopeLineNumber(), 0, SP);
 
   return DebugLoc();
 }

diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index 2727063..37cce2b 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp

@@ -127,20 +127,20 @@
   DP << getMsg();
 }
 
-bool DiagnosticInfoOptimizationRemarkBase::isLocationAvailable() const {
+bool DiagnosticInfoOptimizationBase::isLocationAvailable() const {
   return getDebugLoc().isUnknown() == false;
 }
 
-void DiagnosticInfoOptimizationRemarkBase::getLocation(StringRef *Filename,
-                                                       unsigned *Line,
-                                                       unsigned *Column) const {
+void DiagnosticInfoOptimizationBase::getLocation(StringRef *Filename,
+                                                 unsigned *Line,
+                                                 unsigned *Column) const {
   DILocation DIL(getDebugLoc().getAsMDNode(getFunction().getContext()));
   *Filename = DIL.getFilename();
   *Line = DIL.getLineNumber();
   *Column = DIL.getColumnNumber();
 }
 
-const std::string DiagnosticInfoOptimizationRemarkBase::getLocationStr() const {
+const std::string DiagnosticInfoOptimizationBase::getLocationStr() const {
   StringRef Filename("<unknown>");
   unsigned Line = 0;
   unsigned Column = 0;
@@ -149,7 +149,7 @@
   return Twine(Filename + ":" + Twine(Line) + ":" + Twine(Column)).str();
 }
 
-void DiagnosticInfoOptimizationRemarkBase::print(DiagnosticPrinter &DP) const {
+void DiagnosticInfoOptimizationBase::print(DiagnosticPrinter &DP) const {
   DP << getLocationStr() << ": " << getMsg();
 }
 
@@ -189,3 +189,20 @@
   Ctx.diagnose(
       DiagnosticInfoOptimizationRemarkAnalysis(PassName, Fn, DLoc, Msg));
 }
+
+bool DiagnosticInfoOptimizationFailure::isEnabled() const {
+  // Only print warnings.
+  return getSeverity() == DS_Warning;
+}
+
+void llvm::emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn,
+                                    const DebugLoc &DLoc, const Twine &Msg) {
+  Ctx.diagnose(DiagnosticInfoOptimizationFailure(
+      Fn, DLoc, Twine("loop not vectorized: " + Msg)));
+}
+
+void llvm::emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn,
+                                     const DebugLoc &DLoc, const Twine &Msg) {
+  Ctx.diagnose(DiagnosticInfoOptimizationFailure(
+      Fn, DLoc, Twine("loop not interleaved: " + Msg)));
+}

diff --git a/lib/IR/DiagnosticPrinter.cpp b/lib/IR/DiagnosticPrinter.cpp
index 5e16026..f25fc20 100644
--- a/lib/IR/DiagnosticPrinter.cpp
+++ b/lib/IR/DiagnosticPrinter.cpp

@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the a diagnostic printer relying on raw_ostream.
+// This file defines a diagnostic printer relying on raw_ostream.
 //
 //===----------------------------------------------------------------------===//
 

diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 1443571..32b2ec5 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp

@@ -77,11 +77,17 @@
 }
 
 /// hasNonNullAttr - Return true if this argument has the nonnull attribute on
-/// it in its containing function.
+/// it in its containing function. Also returns true if at least one byte is
+/// known to be dereferenceable and the pointer is in addrspace(0).
 bool Argument::hasNonNullAttr() const {
   if (!getType()->isPointerTy()) return false;
-  return getParent()->getAttributes().
-    hasAttribute(getArgNo()+1, Attribute::NonNull);
+  if (getParent()->getAttributes().
+        hasAttribute(getArgNo()+1, Attribute::NonNull))
+    return true;
+  else if (getDereferenceableBytes() > 0 &&
+           getType()->getPointerAddressSpace() == 0)
+    return true;
+  return false;
 }
 
 /// hasByValAttr - Return true if this argument has the byval attribute on it
@@ -113,6 +119,12 @@
 
 }
 
+uint64_t Argument::getDereferenceableBytes() const {
+  assert(getType()->isPointerTy() &&
+         "Only pointers have dereferenceable bytes");
+  return getParent()->getDereferenceableBytes(getArgNo()+1);
+}
+
 /// hasNestAttr - Return true if this argument has the nest attribute on
 /// it in its containing function.
 bool Argument::hasNestAttr() const {
@@ -154,6 +166,20 @@
     hasAttribute(getArgNo()+1, Attribute::Returned);
 }
 
+/// hasZExtAttr - Return true if this argument has the zext attribute on it in
+/// its containing function.
+bool Argument::hasZExtAttr() const {
+  return getParent()->getAttributes().
+    hasAttribute(getArgNo()+1, Attribute::ZExt);
+}
+
+/// hasSExtAttr Return true if this argument has the sext attribute on it in its
+/// containing function.
+bool Argument::hasSExtAttr() const {
+  return getParent()->getAttributes().
+    hasAttribute(getArgNo()+1, Attribute::SExt);
+}
+
 /// Return true if this argument has the readonly or readnone attribute on it
 /// in its containing function.
 bool Argument::onlyReadsMemory() const {
@@ -187,6 +213,12 @@
 // Helper Methods in Function
 //===----------------------------------------------------------------------===//
 
+bool Function::isMaterializable() const {
+  return getGlobalObjectSubClassData();
+}
+
+void Function::setIsMaterializable(bool V) { setGlobalObjectSubClassData(V); }
+
 LLVMContext &Function::getContext() const {
   return getType()->getContext();
 }
@@ -215,12 +247,13 @@
 // Function Implementation
 //===----------------------------------------------------------------------===//
 
-Function::Function(FunctionType *Ty, LinkageTypes Linkage,
-                   const Twine &name, Module *ParentModule)
-  : GlobalObject(PointerType::getUnqual(Ty),
-                Value::FunctionVal, nullptr, 0, Linkage, name) {
+Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name,
+                   Module *ParentModule)
+    : GlobalObject(PointerType::getUnqual(Ty), Value::FunctionVal, nullptr, 0,
+                   Linkage, name) {
   assert(FunctionType::isValidReturnType(getReturnType()) &&
          "invalid return type");
+  setIsMaterializable(false);
   SymTab = new ValueSymbolTable();
 
   // If the function has arguments, mark them as lazily built.
@@ -292,6 +325,8 @@
 // delete.
 //
 void Function::dropAllReferences() {
+  setIsMaterializable(false);
+
   for (iterator I = begin(), E = end(); I != E; ++I)
     I->dropAllReferences();
 
@@ -420,6 +455,33 @@
   return 0;
 }
 
+/// Returns a stable mangling for the type specified for use in the name
+/// mangling scheme used by 'any' types in intrinsic signatures.
+static std::string getMangledTypeStr(Type* Ty) {
+  std::string Result;
+  if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
+    Result += "p" + llvm::utostr(PTyp->getAddressSpace()) +
+      getMangledTypeStr(PTyp->getElementType());
+  } else if (ArrayType* ATyp = dyn_cast<ArrayType>(Ty)) {
+    Result += "a" + llvm::utostr(ATyp->getNumElements()) +
+      getMangledTypeStr(ATyp->getElementType());
+  } else if (StructType* STyp = dyn_cast<StructType>(Ty)) {
+    if (!STyp->isLiteral())
+      Result += STyp->getName();
+    else
+      llvm_unreachable("TODO: implement literal types");
+  } else if (FunctionType* FT = dyn_cast<FunctionType>(Ty)) {
+    Result += "f_" + getMangledTypeStr(FT->getReturnType());
+    for (size_t i = 0; i < FT->getNumParams(); i++)
+      Result += getMangledTypeStr(FT->getParamType(i));
+    if (FT->isVarArg())
+      Result += "vararg";
+    Result += "f"; //ensure distinguishable
+  } else if (Ty)
+    Result += EVT::getEVT(Ty).getEVTString();
+  return Result;
+}
+
 std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
   assert(id < num_intrinsics && "Invalid intrinsic ID!");
   static const char * const Table[] = {
@@ -432,12 +494,7 @@
     return Table[id];
   std::string Result(Table[id]);
   for (unsigned i = 0; i < Tys.size(); ++i) {
-    if (PointerType* PTyp = dyn_cast<PointerType>(Tys[i])) {
-      Result += ".p" + llvm::utostr(PTyp->getAddressSpace()) +
-                EVT::getEVT(PTyp->getElementType()).getEVTString();
-    }
-    else if (Tys[i])
-      Result += "." + EVT::getEVT(Tys[i]).getEVTString();
+    Result += "." + getMangledTypeStr(Tys[i]);
   }
   return Result;
 }
@@ -467,19 +524,20 @@
   IIT_ARG  = 15,
 
   // Values from 16+ are only encodable with the inefficient encoding.
-  IIT_MMX  = 16,
-  IIT_METADATA = 17,
-  IIT_EMPTYSTRUCT = 18,
-  IIT_STRUCT2 = 19,
-  IIT_STRUCT3 = 20,
-  IIT_STRUCT4 = 21,
-  IIT_STRUCT5 = 22,
-  IIT_EXTEND_ARG = 23,
-  IIT_TRUNC_ARG = 24,
-  IIT_ANYPTR = 25,
-  IIT_V1   = 26,
-  IIT_VARARG = 27,
-  IIT_HALF_VEC_ARG = 28
+  IIT_V64  = 16,
+  IIT_MMX  = 17,
+  IIT_METADATA = 18,
+  IIT_EMPTYSTRUCT = 19,
+  IIT_STRUCT2 = 20,
+  IIT_STRUCT3 = 21,
+  IIT_STRUCT4 = 22,
+  IIT_STRUCT5 = 23,
+  IIT_EXTEND_ARG = 24,
+  IIT_TRUNC_ARG = 25,
+  IIT_ANYPTR = 26,
+  IIT_V1   = 27,
+  IIT_VARARG = 28,
+  IIT_HALF_VEC_ARG = 29
 };
 
 
@@ -550,6 +608,10 @@
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 32));
     DecodeIITType(NextElt, Infos, OutputTable);
     return;
+  case IIT_V64:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 64));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
   case IIT_PTR:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0));
     DecodeIITType(NextElt, Infos, OutputTable);
@@ -666,7 +728,7 @@
     assert(D.Struct_NumElements <= 5 && "Can't handle this yet");
     for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i)
       Elts[i] = DecodeFixedType(Infos, Tys, Context);
-    return StructType::get(Context, ArrayRef<Type*>(Elts,D.Struct_NumElements));
+    return StructType::get(Context, makeArrayRef(Elts,D.Struct_NumElements));
   }
 
   case IITDescriptor::Argument:
@@ -708,6 +770,12 @@
   while (!TableRef.empty())
     ArgTys.push_back(DecodeFixedType(TableRef, Tys, Context));
 
+  // DecodeFixedType returns Void for IITDescriptor::Void and IITDescriptor::VarArg
+  // If we see void type as the type of the last argument, it is vararg intrinsic
+  if (!ArgTys.empty() && ArgTys.back()->isVoidTy()) {
+    ArgTys.pop_back();
+    return FunctionType::get(ResultTy, ArgTys, true);
+  }
   return FunctionType::get(ResultTy, ArgTys, false);
 }
 

diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp
index 1667401..245c500 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/IR/GCOV.cpp

@@ -298,7 +298,8 @@
 
 /// dump - Dump GCOVFunction content to dbgs() for debugging purposes.
 void GCOVFunction::dump() const {
-  dbgs() <<  "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
+  dbgs() << "===== " << Name << " (" << Ident << ") @ " << Filename << ":"
+         << LineNumber << "\n";
   for (const auto &Block : Blocks)
     Block->dump();
 }
@@ -517,11 +518,11 @@
   if (Options.NoOutput)
     return llvm::make_unique<raw_null_ostream>();
 
-  std::string ErrorInfo;
-  auto OS = llvm::make_unique<raw_fd_ostream>(CoveragePath.str().c_str(),
-                                              ErrorInfo, sys::fs::F_Text);
-  if (!ErrorInfo.empty()) {
-    errs() << ErrorInfo << "\n";
+  std::error_code EC;
+  auto OS = llvm::make_unique<raw_fd_ostream>(CoveragePath.str(), EC,
+                                              sys::fs::F_Text);
+  if (EC) {
+    errs() << EC.message() << "\n";
     return llvm::make_unique<raw_null_ostream>();
   }
   return std::move(OS);

diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 244e3e4..e181d62 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp

@@ -29,13 +29,15 @@
 //===----------------------------------------------------------------------===//
 
 bool GlobalValue::isMaterializable() const {
-  return getParent() && getParent()->isMaterializable(this);
+  if (const Function *F = dyn_cast<Function>(this))
+    return F->isMaterializable();
+  return false;
 }
 bool GlobalValue::isDematerializable() const {
   return getParent() && getParent()->isDematerializable(this);
 }
-bool GlobalValue::Materialize(std::string *ErrInfo) {
-  return getParent()->Materialize(this, ErrInfo);
+std::error_code GlobalValue::materialize() {
+  return getParent()->materialize(this);
 }
 void GlobalValue::Dematerialize() {
   getParent()->Dematerialize(this);
@@ -77,10 +79,24 @@
   assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
   assert(Align <= MaximumAlignment &&
          "Alignment is greater than MaximumAlignment!");
-  setGlobalValueSubClassData(Log2_32(Align) + 1);
+  unsigned AlignmentData = Log2_32(Align) + 1;
+  unsigned OldData = getGlobalValueSubClassData();
+  setGlobalValueSubClassData((OldData & ~AlignmentMask) | AlignmentData);
   assert(getAlignment() == Align && "Alignment representation error!");
 }
 
+unsigned GlobalObject::getGlobalObjectSubClassData() const {
+  unsigned ValueData = getGlobalValueSubClassData();
+  return ValueData >> AlignmentBits;
+}
+
+void GlobalObject::setGlobalObjectSubClassData(unsigned Val) {
+  unsigned OldData = getGlobalValueSubClassData();
+  setGlobalValueSubClassData((OldData & AlignmentMask) |
+                             (Val << AlignmentBits));
+  assert(getGlobalObjectSubClassData() == Val && "representation error");
+}
+
 void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
   const auto *GV = cast<GlobalObject>(Src);
   GlobalValue::copyAttributesFrom(GV);
@@ -117,7 +133,7 @@
 
   // Functions are definitions if they have a body.
   if (const Function *F = dyn_cast<Function>(this))
-    return F->empty();
+    return F->empty() && !F->isMaterializable();
 
   // Aliases are always definitions.
   assert(isa<GlobalAlias>(this));
@@ -230,6 +246,7 @@
   GlobalObject::copyAttributesFrom(Src);
   const GlobalVariable *SrcVar = cast<GlobalVariable>(Src);
   setThreadLocalMode(SrcVar->getThreadLocalMode());
+  setExternallyInitialized(SrcVar->isExternallyInitialized());
 }
 
 

diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp
index 435e54f..a4c5d97 100644
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp

@@ -62,7 +62,8 @@
 
 CallInst *IRBuilderBase::
 CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
-             bool isVolatile, MDNode *TBAATag) {
+             bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag,
+             MDNode *NoAliasTag) {
   Ptr = getCastedInt8PtrValue(Ptr);
   Value *Ops[] = { Ptr, Val, Size, getInt32(Align), getInt1(isVolatile) };
   Type *Tys[] = { Ptr->getType(), Size->getType() };
@@ -74,13 +75,20 @@
   // Set the TBAA info if present.
   if (TBAATag)
     CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
-  
+
+  if (ScopeTag)
+    CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+ 
+  if (NoAliasTag)
+    CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+ 
   return CI;
 }
 
 CallInst *IRBuilderBase::
 CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
-             bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag) {
+             bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag,
+             MDNode *ScopeTag, MDNode *NoAliasTag) {
   Dst = getCastedInt8PtrValue(Dst);
   Src = getCastedInt8PtrValue(Src);
 
@@ -98,13 +106,20 @@
   // Set the TBAA Struct info if present.
   if (TBAAStructTag)
     CI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag);
-  
+ 
+  if (ScopeTag)
+    CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+ 
+  if (NoAliasTag)
+    CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+ 
   return CI;  
 }
 
 CallInst *IRBuilderBase::
 CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
-              bool isVolatile, MDNode *TBAATag) {
+              bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag,
+              MDNode *NoAliasTag) {
   Dst = getCastedInt8PtrValue(Dst);
   Src = getCastedInt8PtrValue(Src);
   
@@ -118,7 +133,13 @@
   // Set the TBAA info if present.
   if (TBAATag)
     CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
-  
+ 
+  if (ScopeTag)
+    CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+ 
+  if (NoAliasTag)
+    CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+ 
   return CI;  
 }
 
@@ -151,3 +172,14 @@
   Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end);
   return createCallHelper(TheFn, Ops, this);
 }
+
+CallInst *IRBuilderBase::CreateAssumption(Value *Cond) {
+  assert(Cond->getType() == getInt1Ty() &&
+         "an assumption condition must be of type i1");
+
+  Value *Ops[] = { Cond };
+  Module *M = BB->getParent()->getParent();
+  Value *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume);
+  return createCallHelper(FnAssume, Ops, this);
+}
+

diff --git a/lib/IR/InlineAsm.cpp b/lib/IR/InlineAsm.cpp
index a3e1da3b1..16d874f 100644
--- a/lib/IR/InlineAsm.cpp
+++ b/lib/IR/InlineAsm.cpp

@@ -91,6 +91,10 @@
   if (*I == '~') {
     Type = isClobber;
     ++I;
+
+    // '{' must immediately follow '~'.
+    if (I != E && *I != '{')
+      return true;
   } else if (*I == '=') {
     ++I;
     Type = isOutput;

diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 86421c4..3ee66f5 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp

@@ -143,6 +143,11 @@
   cast<FPMathOperator>(this)->setFastMathFlags(FMF);
 }
 
+void Instruction::copyFastMathFlags(FastMathFlags FMF) {
+  assert(isa<FPMathOperator>(this) && "copying fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->copyFastMathFlags(FMF);
+}
+
 /// Determine whether the unsafe-algebra flag is set.
 bool Instruction::hasUnsafeAlgebra() const {
   assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
@@ -175,7 +180,7 @@
 
 /// Convenience function for getting all the fast-math flags, which must be an
 /// operator which supports these flags. See LangRef.html for the meaning of
-/// these flats.
+/// these flags.
 FastMathFlags Instruction::getFastMathFlags() const {
   assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
   return cast<FPMathOperator>(this)->getFastMathFlags();
@@ -183,7 +188,7 @@
 
 /// Copy I's fast-math flags
 void Instruction::copyFastMathFlags(const Instruction *I) {
-  setFastMathFlags(I->getFastMathFlags());
+  copyFastMathFlags(I->getFastMathFlags());
 }
 
 
@@ -438,6 +443,21 @@
   }
 }
 
+bool Instruction::isAtomic() const {
+  switch (getOpcode()) {
+  default:
+    return false;
+  case Instruction::AtomicCmpXchg:
+  case Instruction::AtomicRMW:
+  case Instruction::Fence:
+    return true;
+  case Instruction::Load:
+    return cast<LoadInst>(this)->getOrdering() != NotAtomic;
+  case Instruction::Store:
+    return cast<StoreInst>(this)->getOrdering() != NotAtomic;
+  }
+}
+
 bool Instruction::mayThrow() const {
   if (const CallInst *CI = dyn_cast<CallInst>(this))
     return !CI->doesNotThrow();
@@ -528,7 +548,7 @@
 
   // Otherwise, enumerate and copy over metadata from the old instruction to the
   // new one.
-  SmallVector<std::pair<unsigned, MDNode*>, 4> TheMDs;
+  SmallVector<std::pair<unsigned, MDNode *>, 4> TheMDs;
   getAllMetadataOtherThanDebugLoc(TheMDs);
   for (const auto &MD : TheMDs)
     New->setMetadata(MD.first, MD.second);

diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index a5ceacb..57a4f0b 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp

@@ -364,8 +364,9 @@
 
 /// IsConstantOne - Return true only if val is constant int 1
 static bool IsConstantOne(Value *val) {
-  assert(val && "IsConstantOne does not work with NULL val");
-  return isa<ConstantInt>(val) && cast<ConstantInt>(val)->isOne();
+  assert(val && "IsConstantOne does not work with nullptr val");
+  const ConstantInt *CVal = dyn_cast<ConstantInt>(val);
+  return CVal && CVal->isOne();
 }
 
 static Instruction *createMalloc(Instruction *InsertBefore,
@@ -418,7 +419,7 @@
   Value *MallocFunc = MallocF;
   if (!MallocFunc)
     // prototype malloc as "void *malloc(size_t)"
-    MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy, NULL);
+    MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy, nullptr);
   PointerType *AllocPtrType = PointerType::getUnqual(AllocTy);
   CallInst *MCall = nullptr;
   Instruction *Result = nullptr;
@@ -491,7 +492,7 @@
   Type *VoidTy = Type::getVoidTy(M->getContext());
   Type *IntPtrTy = Type::getInt8PtrTy(M->getContext());
   // prototype free as "void free(void*)"
-  Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy, NULL);
+  Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy, nullptr);
   CallInst* Result = nullptr;
   Value *PtrCast = Source;
   if (InsertBefore) {
@@ -2030,6 +2031,39 @@
   return cast<PossiblyExactOperator>(this)->isExact();
 }
 
+void BinaryOperator::copyIRFlags(const Value *V) {
+  // Copy the wrapping flags.
+  if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
+    setHasNoSignedWrap(OB->hasNoSignedWrap());
+    setHasNoUnsignedWrap(OB->hasNoUnsignedWrap());
+  }
+
+  // Copy the exact flag.
+  if (auto *PE = dyn_cast<PossiblyExactOperator>(V))
+    setIsExact(PE->isExact());
+  
+  // Copy the fast-math flags.
+  if (auto *FP = dyn_cast<FPMathOperator>(V))
+    copyFastMathFlags(FP->getFastMathFlags());
+}
+
+void BinaryOperator::andIRFlags(const Value *V) {
+  if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
+    setHasNoSignedWrap(hasNoSignedWrap() & OB->hasNoSignedWrap());
+    setHasNoUnsignedWrap(hasNoUnsignedWrap() & OB->hasNoUnsignedWrap());
+  }
+  
+  if (auto *PE = dyn_cast<PossiblyExactOperator>(V))
+    setIsExact(isExact() & PE->isExact());
+  
+  if (auto *FP = dyn_cast<FPMathOperator>(V)) {
+    FastMathFlags FM = getFastMathFlags();
+    FM &= FP->getFastMathFlags();
+    copyFastMathFlags(FM);
+  }
+}
+
+
 //===----------------------------------------------------------------------===//
 //                             FPMathOperator Class
 //===----------------------------------------------------------------------===//
@@ -2039,7 +2073,7 @@
 /// default precision.
 float FPMathOperator::getFPAccuracy() const {
   const MDNode *MD =
-    cast<Instruction>(this)->getMetadata(LLVMContext::MD_fpmath);
+      cast<Instruction>(this)->getMetadata(LLVMContext::MD_fpmath);
   if (!MD)
     return 0.0;
   ConstantFP *Accuracy = cast<ConstantFP>(MD->getOperand(0));
@@ -2478,11 +2512,7 @@
   if (Ty->isIntOrIntVectorTy())
     return Create(Instruction::PtrToInt, S, Ty, Name, InsertAtEnd);
 
-  Type *STy = S->getType();
-  if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace())
-    return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertAtEnd);
-
-  return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+  return CreatePointerBitCastOrAddrSpaceCast(S, Ty, Name, InsertAtEnd);
 }
 
 /// @brief Create a BitCast or a PtrToInt cast instruction
@@ -2500,14 +2530,36 @@
   if (Ty->isIntOrIntVectorTy())
     return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore);
 
-  Type *STy = S->getType();
-  if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+  return CreatePointerBitCastOrAddrSpaceCast(S, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreatePointerBitCastOrAddrSpaceCast(
+  Value *S, Type *Ty,
+  const Twine &Name,
+  BasicBlock *InsertAtEnd) {
+  assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
+  assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast");
+
+  if (S->getType()->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+    return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertAtEnd);
+
+  return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+}
+
+CastInst *CastInst::CreatePointerBitCastOrAddrSpaceCast(
+  Value *S, Type *Ty,
+  const Twine &Name,
+  Instruction *InsertBefore) {
+  assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
+  assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast");
+
+  if (S->getType()->getPointerAddressSpace() != Ty->getPointerAddressSpace())
     return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertBefore);
 
   return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
 }
 
-CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, 
+CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty,
                                       bool isSigned, const Twine &Name,
                                       Instruction *InsertBefore) {
   assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() &&

diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp
index de825f0..c62bc09 100644
--- a/lib/IR/LLVMContext.cpp
+++ b/lib/IR/LLVMContext.cpp

@@ -66,6 +66,33 @@
   unsigned InvariantLdId = getMDKindID("invariant.load");
   assert(InvariantLdId == MD_invariant_load && "invariant.load kind id drifted");
   (void)InvariantLdId;
+
+  // Create the 'alias.scope' metadata kind.
+  unsigned AliasScopeID = getMDKindID("alias.scope");
+  assert(AliasScopeID == MD_alias_scope && "alias.scope kind id drifted");
+  (void)AliasScopeID;
+
+  // Create the 'noalias' metadata kind.
+  unsigned NoAliasID = getMDKindID("noalias");
+  assert(NoAliasID == MD_noalias && "noalias kind id drifted");
+  (void)NoAliasID;
+
+  // Create the 'nontemporal' metadata kind.
+  unsigned NonTemporalID = getMDKindID("nontemporal");
+  assert(NonTemporalID == MD_nontemporal && "nontemporal kind id drifted");
+  (void)NonTemporalID;
+
+  // Create the 'llvm.mem.parallel_loop_access' metadata kind.
+  unsigned MemParallelLoopAccessID = getMDKindID("llvm.mem.parallel_loop_access");
+  assert(MemParallelLoopAccessID == MD_mem_parallel_loop_access &&
+         "mem_parallel_loop_access kind id drifted");
+  (void)MemParallelLoopAccessID;
+
+
+  // Create the 'nonnull' metadata kind.
+  unsigned NonNullID = getMDKindID("nonnull");
+  assert(NonNullID == MD_nonnull && "nonnull kind id drifted");
+  (void)NonNullID;
 }
 LLVMContext::~LLVMContext() { delete pImpl; }
 
@@ -102,9 +129,11 @@
 }
 
 void LLVMContext::setDiagnosticHandler(DiagnosticHandlerTy DiagnosticHandler,
-                                       void *DiagnosticContext) {
+                                       void *DiagnosticContext,
+                                       bool RespectFilters) {
   pImpl->DiagnosticHandler = DiagnosticHandler;
   pImpl->DiagnosticContext = DiagnosticContext;
+  pImpl->RespectDiagnosticFilters = RespectFilters;
 }
 
 LLVMContext::DiagnosticHandlerTy LLVMContext::getDiagnosticHandler() const {
@@ -135,13 +164,7 @@
   diagnose(DiagnosticInfoInlineAsm(*I, ErrorStr));
 }
 
-void LLVMContext::diagnose(const DiagnosticInfo &DI) {
-  // If there is a report handler, use it.
-  if (pImpl->DiagnosticHandler) {
-    pImpl->DiagnosticHandler(DI, pImpl->DiagnosticContext);
-    return;
-  }
-
+static bool isDiagnosticEnabled(const DiagnosticInfo &DI) {
   // Optimization remarks are selective. They need to check whether the regexp
   // pattern, passed via one of the -pass-remarks* flags, matches the name of
   // the pass that is emitting the diagnostic. If there is no match, ignore the
@@ -149,19 +172,32 @@
   switch (DI.getKind()) {
   case llvm::DK_OptimizationRemark:
     if (!cast<DiagnosticInfoOptimizationRemark>(DI).isEnabled())
-      return;
+      return false;
     break;
   case llvm::DK_OptimizationRemarkMissed:
     if (!cast<DiagnosticInfoOptimizationRemarkMissed>(DI).isEnabled())
-      return;
+      return false;
     break;
   case llvm::DK_OptimizationRemarkAnalysis:
     if (!cast<DiagnosticInfoOptimizationRemarkAnalysis>(DI).isEnabled())
-      return;
+      return false;
     break;
   default:
     break;
   }
+  return true;
+}
+
+void LLVMContext::diagnose(const DiagnosticInfo &DI) {
+  // If there is a report handler, use it.
+  if (pImpl->DiagnosticHandler) {
+    if (!pImpl->RespectDiagnosticFilters || isDiagnosticEnabled(DI))
+      pImpl->DiagnosticHandler(DI, pImpl->DiagnosticContext);
+    return;
+  }
+
+  if (!isDiagnosticEnabled(DI))
+    return;
 
   // Otherwise, print the message with a prefix based on the severity.
   std::string MsgStorage;
@@ -217,9 +253,10 @@
   assert(isValidName(Name) && "Invalid MDNode name");
 
   // If this is new, assign it its ID.
-  return
-    pImpl->CustomMDKindNames.GetOrCreateValue(
-      Name, pImpl->CustomMDKindNames.size()).second;
+  return pImpl->CustomMDKindNames.insert(std::make_pair(
+                                             Name,
+                                             pImpl->CustomMDKindNames.size()))
+      .first->second;
 }
 
 /// getHandlerNames - Populate client supplied smallvector using custome

diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp
index 4c2791f..3fd0bb3 100644
--- a/lib/IR/LLVMContextImpl.cpp
+++ b/lib/IR/LLVMContextImpl.cpp

@@ -40,6 +40,7 @@
   InlineAsmDiagContext = nullptr;
   DiagnosticHandler = nullptr;
   DiagnosticContext = nullptr;
+  RespectDiagnosticFilters = false;
   YieldCallback = nullptr;
   YieldOpaqueHandle = nullptr;
   NamedStructTypesUniqueID = 0;
@@ -75,7 +76,7 @@
   // Free the constants.  This is important to do here to ensure that they are
   // freed before the LeakDetector is torn down.
   std::for_each(ExprConstants.map_begin(), ExprConstants.map_end(),
-                DropReferences());
+                DropFirst());
   std::for_each(ArrayConstants.map_begin(), ArrayConstants.map_end(),
                 DropFirst());
   std::for_each(StructConstants.map_begin(), StructConstants.map_end(),
@@ -121,20 +122,19 @@
 
   // Destroy MDNodes.  ~MDNode can move and remove nodes between the MDNodeSet
   // and the NonUniquedMDNodes sets, so copy the values out first.
-  SmallVector<MDNode*, 8> MDNodes;
+  SmallVector<GenericMDNode *, 8> MDNodes;
   MDNodes.reserve(MDNodeSet.size() + NonUniquedMDNodes.size());
-  for (FoldingSetIterator<MDNode> I = MDNodeSet.begin(), E = MDNodeSet.end();
-       I != E; ++I)
-    MDNodes.push_back(&*I);
+  MDNodes.append(MDNodeSet.begin(), MDNodeSet.end());
   MDNodes.append(NonUniquedMDNodes.begin(), NonUniquedMDNodes.end());
-  for (SmallVectorImpl<MDNode *>::iterator I = MDNodes.begin(),
-         E = MDNodes.end(); I != E; ++I)
-    (*I)->destroy();
+  for (GenericMDNode *I : MDNodes)
+    I->dropAllReferences();
+  for (GenericMDNode *I : MDNodes)
+    delete I;
   assert(MDNodeSet.empty() && NonUniquedMDNodes.empty() &&
          "Destroying all MDNodes didn't empty the Context's sets.");
 
   // Destroy MDStrings.
-  DeleteContainerSeconds(MDStringCache);
+  MDStringCache.clear();
 }
 
 // ConstantsContext anchors

diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index 808c239..e743ec3 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LLVMCONTEXT_IMPL_H
-#define LLVM_LLVMCONTEXT_IMPL_H
+#ifndef LLVM_LIB_IR_LLVMCONTEXTIMPL_H
+#define LLVM_LIB_IR_LLVMCONTEXTIMPL_H
 
 #include "AttributeImpl.h"
 #include "ConstantsContext.h"
@@ -22,6 +22,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -150,7 +151,7 @@
       ReturnType(R), Params(P), isVarArg(V) {}
     KeyTy(const FunctionType* FT) :
       ReturnType(FT->getReturnType()),
-      Params(ArrayRef<Type*>(FT->param_begin(), FT->param_end())),
+      Params(makeArrayRef(FT->param_begin(), FT->param_end())),
       isVarArg(FT->isVarArg()) {}
     bool operator==(const KeyTy& that) const {
       if (ReturnType != that.ReturnType)
@@ -190,23 +191,52 @@
   }
 };
 
-// Provide a FoldingSetTrait::Equals specialization for MDNode that can use a
-// shortcut to avoid comparing all operands.
-template<> struct FoldingSetTrait<MDNode> : DefaultFoldingSetTrait<MDNode> {
-  static bool Equals(const MDNode &X, const FoldingSetNodeID &ID,
-                     unsigned IDHash, FoldingSetNodeID &TempID) {
-    assert(!X.isNotUniqued() && "Non-uniqued MDNode in FoldingSet?");
-    // First, check if the cached hashes match.  If they don't we can skip the
-    // expensive operand walk.
-    if (X.Hash != IDHash)
-      return false;
+/// \brief DenseMapInfo for GenericMDNode.
+///
+/// Note that we don't need the is-function-local bit, since that's implicit in
+/// the operands.
+struct GenericMDNodeInfo {
+  struct KeyTy {
+    ArrayRef<Value *> Ops;
+    unsigned Hash;
 
-    // If they match we have to compare the operands.
-    X.Profile(TempID);
-    return TempID == ID;
+    KeyTy(ArrayRef<Value *> Ops)
+        : Ops(Ops), Hash(hash_combine_range(Ops.begin(), Ops.end())) {}
+
+    KeyTy(GenericMDNode *N, SmallVectorImpl<Value *> &Storage) {
+      Storage.resize(N->getNumOperands());
+      for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
+        Storage[I] = N->getOperand(I);
+      Ops = Storage;
+      Hash = hash_combine_range(Ops.begin(), Ops.end());
+    }
+
+    bool operator==(const GenericMDNode *RHS) const {
+      if (RHS == getEmptyKey() || RHS == getTombstoneKey())
+        return false;
+      if (Hash != RHS->getHash() || Ops.size() != RHS->getNumOperands())
+        return false;
+      for (unsigned I = 0, E = Ops.size(); I != E; ++I)
+        if (Ops[I] != RHS->getOperand(I))
+          return false;
+      return true;
+    }
+  };
+  static inline GenericMDNode *getEmptyKey() {
+    return DenseMapInfo<GenericMDNode *>::getEmptyKey();
   }
-  static unsigned ComputeHash(const MDNode &X, FoldingSetNodeID &) {
-    return X.Hash; // Return cached hash.
+  static inline GenericMDNode *getTombstoneKey() {
+    return DenseMapInfo<GenericMDNode *>::getTombstoneKey();
+  }
+  static unsigned getHashValue(const KeyTy &Key) { return Key.Hash; }
+  static unsigned getHashValue(const GenericMDNode *U) {
+    return U->getHash();
+  }
+  static bool isEqual(const KeyTy &LHS, const GenericMDNode *RHS) {
+    return LHS == RHS;
+  }
+  static bool isEqual(const GenericMDNode *LHS, const GenericMDNode *RHS) {
+    return LHS == RHS;
   }
 };
 
@@ -244,6 +274,7 @@
 
   LLVMContext::DiagnosticHandlerTy DiagnosticHandler;
   void *DiagnosticContext;
+  bool RespectDiagnosticFilters;
 
   LLVMContext::YieldCallbackTy YieldCallback;
   void *YieldOpaqueHandle;
@@ -260,25 +291,25 @@
   FoldingSet<AttributeSetImpl> AttrsLists;
   FoldingSet<AttributeSetNode> AttrsSetNodes;
 
-  StringMap<Value*> MDStringCache;
+  StringMap<MDString> MDStringCache;
 
-  FoldingSet<MDNode> MDNodeSet;
+  DenseSet<GenericMDNode *, GenericMDNodeInfo> MDNodeSet;
 
   // MDNodes may be uniqued or not uniqued.  When they're not uniqued, they
   // aren't in the MDNodeSet, but they're still shared between objects, so no
   // one object can destroy them.  This set allows us to at least destroy them
   // on Context destruction.
-  SmallPtrSet<MDNode*, 1> NonUniquedMDNodes;
-  
+  SmallPtrSet<GenericMDNode *, 1> NonUniquedMDNodes;
+
   DenseMap<Type*, ConstantAggregateZero*> CAZConstants;
 
-  typedef ConstantAggrUniqueMap<ArrayType, ConstantArray> ArrayConstantsTy;
+  typedef ConstantUniqueMap<ConstantArray> ArrayConstantsTy;
   ArrayConstantsTy ArrayConstants;
   
-  typedef ConstantAggrUniqueMap<StructType, ConstantStruct> StructConstantsTy;
+  typedef ConstantUniqueMap<ConstantStruct> StructConstantsTy;
   StructConstantsTy StructConstants;
   
-  typedef ConstantAggrUniqueMap<VectorType, ConstantVector> VectorConstantsTy;
+  typedef ConstantUniqueMap<ConstantVector> VectorConstantsTy;
   VectorConstantsTy VectorConstants;
   
   DenseMap<PointerType*, ConstantPointerNull*> CPNConstants;
@@ -289,12 +320,10 @@
 
   DenseMap<std::pair<const Function *, const BasicBlock *>, BlockAddress *>
     BlockAddresses;
-  ConstantUniqueMap<ExprMapKeyType, const ExprMapKeyType&, Type, ConstantExpr>
-    ExprConstants;
+  ConstantUniqueMap<ConstantExpr> ExprConstants;
 
-  ConstantUniqueMap<InlineAsmKeyType, const InlineAsmKeyType&, PointerType,
-                    InlineAsm> InlineAsms;
-  
+  ConstantUniqueMap<InlineAsm> InlineAsms;
+
   ConstantInt *TheTrueVal;
   ConstantInt *TheFalseVal;
   

diff --git a/lib/IR/LeaksContext.h b/lib/IR/LeaksContext.h
index 52ac170..3e485ab 100644
--- a/lib/IR/LeaksContext.h
+++ b/lib/IR/LeaksContext.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_IR_LEAKSCONTEXT_H
-#define LLVM_IR_LEAKSCONTEXT_H
+#ifndef LLVM_LIB_IR_LEAKSCONTEXT_H
+#define LLVM_LIB_IR_LEAKSCONTEXT_H
 
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/Value.h"
@@ -95,4 +95,4 @@
 
 }
 
-#endif // LLVM_IR_LEAKSCONTEXT_H
+#endif

diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
index d3f3482..28fa74c 100644
--- a/lib/IR/LegacyPassManager.cpp
+++ b/lib/IR/LegacyPassManager.cpp

@@ -573,9 +573,8 @@
     return;
 
   SmallPtrSet<Pass *, 8> &LU = DMI->second;
-  for (SmallPtrSet<Pass *, 8>::iterator I = LU.begin(),
-         E = LU.end(); I != E; ++I) {
-    LastUses.push_back(*I);
+  for (Pass *LUP : LU) {
+    LastUses.push_back(LUP);
   }
 
 }
@@ -1404,11 +1403,8 @@
 /// so, return true.
 ///
 bool FunctionPassManager::run(Function &F) {
-  if (F.isMaterializable()) {
-    std::string errstr;
-    if (F.Materialize(&errstr))
-      report_fatal_error("Error reading bitcode file: " + Twine(errstr));
-  }
+  if (std::error_code EC = F.materialize())
+    report_fatal_error("Error reading bitcode file: " + EC.message());
   return FPM->run(F);
 }
 
@@ -1684,7 +1680,7 @@
   if (!FoundPass) {
     FoundPass = RequiredPass;
     // This should be guaranteed to add RequiredPass to the passmanager given
-    // that we checked for an avaiable analysis above.
+    // that we checked for an available analysis above.
     FPP->add(RequiredPass);
   }
   // Register P as the last user of FoundPass or RequiredPass.

diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp
index 65cdf38..3ec613c 100644
--- a/lib/IR/MDBuilder.cpp
+++ b/lib/IR/MDBuilder.cpp

@@ -60,10 +60,17 @@
   return MDNode::get(Context, Range);
 }
 
-MDNode *MDBuilder::createAnonymousTBAARoot() {
+MDNode *MDBuilder::createAnonymousAARoot(StringRef Name, MDNode *Extra) {
   // To ensure uniqueness the root node is self-referential.
-  MDNode *Dummy = MDNode::getTemporary(Context, ArrayRef<Value *>());
-  MDNode *Root = MDNode::get(Context, Dummy);
+  MDNode *Dummy = MDNode::getTemporary(Context, None);
+
+  SmallVector<Value *, 3> Args(1, Dummy);
+  if (Extra)
+    Args.push_back(Extra);
+  if (!Name.empty())
+    Args.push_back(createString(Name));
+  MDNode *Root = MDNode::get(Context, Args);
+
   // At this point we have
   //   !0 = metadata !{}            <- dummy
   //   !1 = metadata !{metadata !0} <- root
@@ -93,6 +100,15 @@
   }
 }
 
+MDNode *MDBuilder::createAliasScopeDomain(StringRef Name) {
+  return MDNode::get(Context, createString(Name));
+}
+
+MDNode *MDBuilder::createAliasScope(StringRef Name, MDNode *Domain) {
+  Value *Ops[2] = { createString(Name), Domain };
+  return MDNode::get(Context, Ops);
+}
+
 /// \brief Return metadata for a tbaa.struct node with the given
 /// struct field descriptions.
 MDNode *MDBuilder::createTBAAStructNode(ArrayRef<TBAAStructField> Fields) {

diff --git a/lib/IR/Mangler.cpp b/lib/IR/Mangler.cpp
index 27d973b..5eeb797 100644
--- a/lib/IR/Mangler.cpp
+++ b/lib/IR/Mangler.cpp

@@ -22,23 +22,25 @@
 
 static void getNameWithPrefixx(raw_ostream &OS, const Twine &GVName,
                               Mangler::ManglerPrefixTy PrefixTy,
-                              const DataLayout &DL, bool UseAt) {
+                              const DataLayout &DL, char Prefix) {
   SmallString<256> TmpData;
   StringRef Name = GVName.toStringRef(TmpData);
   assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
 
+  // No need to do anything special if the global has the special "do not
+  // mangle" flag in the name.
+  if (Name[0] == '\1') {
+    OS << Name.substr(1);
+    return;
+  }
+
   if (PrefixTy == Mangler::Private)
     OS << DL.getPrivateGlobalPrefix();
   else if (PrefixTy == Mangler::LinkerPrivate)
     OS << DL.getLinkerPrivateGlobalPrefix();
 
-  if (UseAt) {
-    OS << '@';
-  } else {
-    char Prefix = DL.getGlobalPrefix();
-    if (Prefix != '\0')
-      OS << Prefix;
-  }
+  if (Prefix != '\0')
+    OS << Prefix;
 
   // If this is a simple string that doesn't need escaping, just append it.
   OS << Name;
@@ -46,7 +48,8 @@
 
 void Mangler::getNameWithPrefix(raw_ostream &OS, const Twine &GVName,
                                 ManglerPrefixTy PrefixTy) const {
-  return getNameWithPrefixx(OS, GVName, PrefixTy, *DL, false);
+  char Prefix = DL->getGlobalPrefix();
+  return getNameWithPrefixx(OS, GVName, PrefixTy, *DL, Prefix);
 }
 
 void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
@@ -56,11 +59,21 @@
   return getNameWithPrefix(OS, GVName, PrefixTy);
 }
 
-/// AddFastCallStdCallSuffix - Microsoft fastcall and stdcall functions require
-/// a suffix on their name indicating the number of words of arguments they
-/// take.
-static void AddFastCallStdCallSuffix(raw_ostream &OS, const Function *F,
-                                     const DataLayout &TD) {
+static bool hasByteCountSuffix(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::X86_FastCall:
+  case CallingConv::X86_StdCall:
+  case CallingConv::X86_VectorCall:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Microsoft fastcall, stdcall, and vectorcall functions require a suffix on
+/// their name indicating the number of bytes of arguments they take.
+static void addByteCountSuffix(raw_ostream &OS, const Function *F,
+                               const DataLayout &TD) {
   // Calculate arguments size total.
   unsigned ArgWords = 0;
   for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
@@ -69,8 +82,9 @@
     // 'Dereference' type in case of byval or inalloca parameter attribute.
     if (AI->hasByValOrInAllocaAttr())
       Ty = cast<PointerType>(Ty)->getElementType();
-    // Size should be aligned to DWORD boundary
-    ArgWords += ((TD.getTypeAllocSize(Ty) + 3)/4)*4;
+    // Size should be aligned to pointer size.
+    unsigned PtrSize = TD.getPointerSize();
+    ArgWords += RoundUpToAlignment(TD.getTypeAllocSize(Ty), PtrSize);
   }
 
   OS << '@' << ArgWords;
@@ -99,41 +113,41 @@
   }
 
   StringRef Name = GV->getName();
+  char Prefix = DL->getGlobalPrefix();
 
-  // No need to do anything special if the global has the special "do not
-  // mangle" flag in the name.
-  if (Name[0] == '\1') {
-    OS << Name.substr(1);
-    return;
+  // Mangle functions with Microsoft calling conventions specially.  Only do
+  // this mangling for x86_64 vectorcall and 32-bit x86.
+  const Function *MSFunc = dyn_cast<Function>(GV);
+  if (Name.startswith("\01"))
+    MSFunc = nullptr; // Don't mangle when \01 is present.
+  CallingConv::ID CC =
+      MSFunc ? MSFunc->getCallingConv() : (unsigned)CallingConv::C;
+  if (!DL->hasMicrosoftFastStdCallMangling() &&
+      CC != CallingConv::X86_VectorCall)
+    MSFunc = nullptr;
+  if (MSFunc) {
+    if (CC == CallingConv::X86_FastCall)
+      Prefix = '@'; // fastcall functions have an @ prefix instead of _.
+    else if (CC == CallingConv::X86_VectorCall)
+      Prefix = '\0'; // vectorcall functions have no prefix.
   }
 
-  bool UseAt = false;
-  const Function *MSFunc = nullptr;
-  CallingConv::ID CC;
-  if (DL->hasMicrosoftFastStdCallMangling()) {
-    if ((MSFunc = dyn_cast<Function>(GV))) {
-      CC = MSFunc->getCallingConv();
-      // fastcall functions need to start with @ instead of _.
-      if (CC == CallingConv::X86_FastCall)
-        UseAt = true;
-    }
-  }
-
-  getNameWithPrefixx(OS, Name, PrefixTy, *DL, UseAt);
+  getNameWithPrefixx(OS, Name, PrefixTy, *DL, Prefix);
 
   if (!MSFunc)
     return;
 
-  // If we are supposed to add a microsoft-style suffix for stdcall/fastcall,
-  // add it.
-  // fastcall and stdcall functions usually need @42 at the end to specify
-  // the argument info.
+  // If we are supposed to add a microsoft-style suffix for stdcall, fastcall,
+  // or vectorcall, add it.  These functions have a suffix of @N where N is the
+  // cumulative byte size of all of the parameters to the function in decimal.
+  if (CC == CallingConv::X86_VectorCall)
+    OS << '@'; // vectorcall functions use a double @ suffix.
   FunctionType *FT = MSFunc->getFunctionType();
-  if ((CC == CallingConv::X86_FastCall || CC == CallingConv::X86_StdCall) &&
+  if (hasByteCountSuffix(CC) &&
       // "Pure" variadic functions do not receive @0 suffix.
       (!FT->isVarArg() || FT->getNumParams() == 0 ||
        (FT->getNumParams() == 1 && MSFunc->hasStructRetAttr())))
-    AddFastCallStdCallSuffix(OS, MSFunc, *DL);
+    addByteCountSuffix(OS, MSFunc, *DL);
 }
 
 void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,

diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index 59137e4..27ba9f7 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp

@@ -25,25 +25,34 @@
 #include "llvm/IR/LeakDetector.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ValueHandle.h"
+
 using namespace llvm;
 
+Metadata::Metadata(LLVMContext &Context, unsigned ID)
+    : Value(Type::getMetadataTy(Context), ID) {}
+
 //===----------------------------------------------------------------------===//
 // MDString implementation.
 //
 
 void MDString::anchor() { }
 
-MDString::MDString(LLVMContext &C)
-  : Value(Type::getMetadataTy(C), Value::MDStringVal) {}
-
 MDString *MDString::get(LLVMContext &Context, StringRef Str) {
-  LLVMContextImpl *pImpl = Context.pImpl;
-  StringMapEntry<Value*> &Entry =
-    pImpl->MDStringCache.GetOrCreateValue(Str);
-  Value *&S = Entry.getValue();
-  if (!S) S = new MDString(Context);
-  S->setValueName(&Entry);
-  return cast<MDString>(S);
+  auto &Store = Context.pImpl->MDStringCache;
+  auto I = Store.find(Str);
+  if (I != Store.end())
+    return &I->second;
+
+  auto *Entry =
+      StringMapEntry<MDString>::Create(Str, Store.getAllocator(), Context);
+  bool WasInserted = Store.insert(Entry);
+  (void)WasInserted;
+  assert(WasInserted && "Expected entry to be inserted");
+  return &Entry->second;
+}
+
+StringRef MDString::getString() const {
+  return StringMapEntry<MDString>::GetStringMapEntryFromValue(*this).first();
 }
 
 //===----------------------------------------------------------------------===//
@@ -57,26 +66,25 @@
     MDNodeOperand *Cur = this;
 
     while (Cur->getValPtrInt() != 1)
-      --Cur;
+      ++Cur;
 
     assert(Cur->getValPtrInt() == 1 &&
-           "Couldn't find the beginning of the operand list!");
-    return reinterpret_cast<MDNode*>(Cur) - 1;
+           "Couldn't find the end of the operand list!");
+    return reinterpret_cast<MDNode *>(Cur + 1);
   }
 
 public:
-  MDNodeOperand(Value *V) : CallbackVH(V) {}
+  MDNodeOperand() {}
   virtual ~MDNodeOperand();
 
   void set(Value *V) {
-    unsigned IsFirst = this->getValPtrInt();
+    unsigned IsLast = this->getValPtrInt();
     this->setValPtr(V);
-    this->setAsFirstOperand(IsFirst);
+    this->setAsLastOperand(IsLast);
   }
 
-  /// setAsFirstOperand - Accessor method to mark the operand as the first in
-  /// the list.
-  void setAsFirstOperand(unsigned V) { this->setValPtrInt(V); }
+  /// \brief Accessor method to mark the operand as the last in the list.
+  void setAsLastOperand(unsigned I) { this->setValPtrInt(I); }
 
   void deleted() override;
   void allUsesReplacedWith(Value *NV) override;
@@ -98,12 +106,11 @@
 // MDNode implementation.
 //
 
-/// getOperandPtr - Helper function to get the MDNodeOperand's coallocated on
-/// the end of the MDNode.
+/// \brief Get the MDNodeOperands coallocated in front of the MDNode.
 static MDNodeOperand *getOperandPtr(MDNode *N, unsigned Op) {
   // Use <= instead of < to permit a one-past-the-end address.
   assert(Op <= N->getNumOperands() && "Invalid operand number");
-  return reinterpret_cast<MDNodeOperand*>(N + 1) + Op;
+  return reinterpret_cast<MDNodeOperand *>(N) - N->getNumOperands() + Op;
 }
 
 void MDNode::replaceOperandWith(unsigned i, Value *Val) {
@@ -111,40 +118,54 @@
   replaceOperand(Op, Val);
 }
 
-MDNode::MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal)
-: Value(Type::getMetadataTy(C), Value::MDNodeVal) {
+void *MDNode::operator new(size_t Size, unsigned NumOps) {
+  void *Ptr = ::operator new(Size + NumOps * sizeof(MDNodeOperand));
+  MDNodeOperand *Op = static_cast<MDNodeOperand *>(Ptr);
+  if (NumOps) {
+    MDNodeOperand *Last = Op + NumOps;
+    for (; Op != Last; ++Op)
+      new (Op) MDNodeOperand();
+    (Op - 1)->setAsLastOperand(1);
+  }
+  return Op;
+}
+
+void MDNode::operator delete(void *Mem) {
+  MDNode *N = static_cast<MDNode *>(Mem);
+  MDNodeOperand *Op = static_cast<MDNodeOperand *>(Mem);
+  for (unsigned I = 0, E = N->NumOperands; I != E; ++I)
+    (--Op)->~MDNodeOperand();
+  ::operator delete(Op);
+}
+
+MDNode::MDNode(LLVMContext &C, unsigned ID, ArrayRef<Value *> Vals,
+               bool isFunctionLocal)
+    : Metadata(C, ID) {
   NumOperands = Vals.size();
 
   if (isFunctionLocal)
     setValueSubclassData(getSubclassDataFromValue() | FunctionLocalBit);
 
-  // Initialize the operand list, which is co-allocated on the end of the node.
+  // Initialize the operand list.
   unsigned i = 0;
-  for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op+NumOperands;
-       Op != E; ++Op, ++i) {
-    new (Op) MDNodeOperand(Vals[i]);
-
-    // Mark the first MDNodeOperand as being the first in the list of operands.
-    if (i == 0)
-      Op->setAsFirstOperand(1);
-  }
+  for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op + NumOperands;
+       Op != E; ++Op, ++i)
+    Op->set(Vals[i]);
 }
 
-/// ~MDNode - Destroy MDNode.
-MDNode::~MDNode() {
-  assert((getSubclassDataFromValue() & DestroyFlag) != 0 &&
-         "Not being destroyed through destroy()?");
+GenericMDNode::~GenericMDNode() {
   LLVMContextImpl *pImpl = getType()->getContext().pImpl;
   if (isNotUniqued()) {
     pImpl->NonUniquedMDNodes.erase(this);
   } else {
-    pImpl->MDNodeSet.RemoveNode(this);
+    pImpl->MDNodeSet.erase(this);
   }
+}
 
-  // Destroy the operands.
-  for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op+NumOperands;
+void GenericMDNode::dropAllReferences() {
+  for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op + NumOperands;
        Op != E; ++Op)
-    Op->~MDNodeOperand();
+    Op->set(nullptr);
 }
 
 static const Function *getFunctionForValue(Value *V) {
@@ -201,16 +222,7 @@
 #endif
 }
 
-// destroy - Delete this node.  Only when there are no uses.
-void MDNode::destroy() {
-  setValueSubclassData(getSubclassDataFromValue() | DestroyFlag);
-  // Placement delete, then free the memory.
-  this->~MDNode();
-  free(this);
-}
-
-/// isFunctionLocalValue - Return true if this is a value that would require a
-/// function-local MDNode.
+/// \brief Check if the Value would require a function-local MDNode.
 static bool isFunctionLocalValue(Value *V) {
   return isa<Instruction>(V) || isa<Argument>(V) || isa<BasicBlock>(V) ||
          (isa<MDNode>(V) && cast<MDNode>(V)->isFunctionLocal());
@@ -218,21 +230,14 @@
 
 MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals,
                           FunctionLocalness FL, bool Insert) {
-  LLVMContextImpl *pImpl = Context.pImpl;
+  auto &Store = Context.pImpl->MDNodeSet;
 
-  // Add all the operand pointers. Note that we don't have to add the
-  // isFunctionLocal bit because that's implied by the operands.
-  // Note that if the operands are later nulled out, the node will be
-  // removed from the uniquing map.
-  FoldingSetNodeID ID;
-  for (Value *V : Vals)
-    ID.AddPointer(V);
-
-  void *InsertPoint;
-  MDNode *N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint);
-
-  if (N || !Insert)
-    return N;
+  GenericMDNodeInfo::KeyTy Key(Vals);
+  auto I = Store.find_as(Key);
+  if (I != Store.end())
+    return *I;
+  if (!Insert)
+    return nullptr;
 
   bool isFunctionLocal = false;
   switch (FL) {
@@ -254,15 +259,11 @@
   }
 
   // Coallocate space for the node and Operands together, then placement new.
-  void *Ptr = malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand));
-  N = new (Ptr) MDNode(Context, Vals, isFunctionLocal);
+  GenericMDNode *N =
+      new (Vals.size()) GenericMDNode(Context, Vals, isFunctionLocal);
 
-  // Cache the operand hash.
-  N->Hash = ID.ComputeHash();
-
-  // InsertPoint will have been set by the FindNodeOrInsertPos call.
-  pImpl->MDNodeSet.InsertNode(N, InsertPoint);
-
+  N->Hash = Key.Hash;
+  Store.insert(N);
   return N;
 }
 
@@ -281,48 +282,33 @@
 }
 
 MDNode *MDNode::getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals) {
-  MDNode *N =
-    (MDNode *)malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand));
-  N = new (N) MDNode(Context, Vals, FL_No);
-  N->setValueSubclassData(N->getSubclassDataFromValue() |
-                          NotUniquedBit);
+  MDNode *N = new (Vals.size()) MDNodeFwdDecl(Context, Vals, FL_No);
+  N->setValueSubclassData(N->getSubclassDataFromValue() | NotUniquedBit);
   LeakDetector::addGarbageObject(N);
   return N;
 }
 
 void MDNode::deleteTemporary(MDNode *N) {
   assert(N->use_empty() && "Temporary MDNode has uses!");
-  assert(!N->getContext().pImpl->MDNodeSet.RemoveNode(N) &&
-         "Deleting a non-temporary uniqued node!");
-  assert(!N->getContext().pImpl->NonUniquedMDNodes.erase(N) &&
-         "Deleting a non-temporary non-uniqued node!");
+  assert(isa<MDNodeFwdDecl>(N) && "Expected forward declaration");
   assert((N->getSubclassDataFromValue() & NotUniquedBit) &&
          "Temporary MDNode does not have NotUniquedBit set!");
-  assert((N->getSubclassDataFromValue() & DestroyFlag) == 0 &&
-         "Temporary MDNode has DestroyFlag set!");
   LeakDetector::removeGarbageObject(N);
-  N->destroy();
+  delete cast<MDNodeFwdDecl>(N);
 }
 
-/// getOperand - Return specified operand.
+/// \brief Return specified operand.
 Value *MDNode::getOperand(unsigned i) const {
   assert(i < getNumOperands() && "Invalid operand number");
   return *getOperandPtr(const_cast<MDNode*>(this), i);
 }
 
-void MDNode::Profile(FoldingSetNodeID &ID) const {
-  // Add all the operand pointers. Note that we don't have to add the
-  // isFunctionLocal bit because that's implied by the operands.
-  // Note that if the operands are later nulled out, the node will be
-  // removed from the uniquing map.
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
-    ID.AddPointer(getOperand(i));
-}
-
 void MDNode::setIsNotUniqued() {
   setValueSubclassData(getSubclassDataFromValue() | NotUniquedBit);
   LLVMContextImpl *pImpl = getType()->getContext().pImpl;
-  pImpl->NonUniquedMDNodes.insert(this);
+  auto *G = cast<GenericMDNode>(this);
+  G->Hash = 0;
+  pImpl->NonUniquedMDNodes.insert(G);
 }
 
 // Replace value from this node's operand list.
@@ -350,44 +336,45 @@
   if (From == To)
     return;
 
-  // Update the operand.
-  Op->set(To);
-
   // If this node is already not being uniqued (because one of the operands
   // already went to null), then there is nothing else to do here.
-  if (isNotUniqued()) return;
+  if (isNotUniqued()) {
+    Op->set(To);
+    return;
+  }
 
-  LLVMContextImpl *pImpl = getType()->getContext().pImpl;
+  auto &Store = getContext().pImpl->MDNodeSet;
+  auto *N = cast<GenericMDNode>(this);
 
-  // Remove "this" from the context map.  FoldingSet doesn't have to reprofile
-  // this node to remove it, so we don't care what state the operands are in.
-  pImpl->MDNodeSet.RemoveNode(this);
+  // Remove "this" from the context map.
+  Store.erase(N);
+
+  // Update the operand.
+  Op->set(To);
 
   // If we are dropping an argument to null, we choose to not unique the MDNode
   // anymore.  This commonly occurs during destruction, and uniquing these
   // brings little reuse.  Also, this means we don't need to include
-  // isFunctionLocal bits in FoldingSetNodeIDs for MDNodes.
+  // isFunctionLocal bits in the hash for MDNodes.
   if (!To) {
     setIsNotUniqued();
     return;
   }
 
-  // Now that the node is out of the folding set, get ready to reinsert it.
-  // First, check to see if another node with the same operands already exists
-  // in the set.  If so, then this node is redundant.
-  FoldingSetNodeID ID;
-  Profile(ID);
-  void *InsertPoint;
-  if (MDNode *N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint)) {
-    replaceAllUsesWith(N);
-    destroy();
+  // Now that the node is out of the table, get ready to reinsert it.  First,
+  // check to see if another node with the same operands already exists in the
+  // set.  If so, then this node is redundant.
+  SmallVector<Value *, 8> Vals;
+  GenericMDNodeInfo::KeyTy Key(N, Vals);
+  auto I = Store.find_as(Key);
+  if (I != Store.end()) {
+    N->replaceAllUsesWith(*I);
+    delete N;
     return;
   }
 
-  // Cache the operand hash.
-  Hash = ID.ComputeHash();
-  // InsertPoint will have been set by the FindNodeOrInsertPos call.
-  pImpl->MDNodeSet.InsertNode(this, InsertPoint);
+  N->Hash = Key.Hash;
+  Store.insert(N);
 
   // If this MDValue was previously function-local but no longer is, clear
   // its function-local flag.
@@ -406,6 +393,41 @@
   }
 }
 
+MDNode *MDNode::concatenate(MDNode *A, MDNode *B) {
+  if (!A)
+    return B;
+  if (!B)
+    return A;
+
+  SmallVector<Value *, 4> Vals(A->getNumOperands() +
+                               B->getNumOperands());
+
+  unsigned j = 0;
+  for (unsigned i = 0, ie = A->getNumOperands(); i != ie; ++i)
+    Vals[j++] = A->getOperand(i);
+  for (unsigned i = 0, ie = B->getNumOperands(); i != ie; ++i)
+    Vals[j++] = B->getOperand(i);
+
+  return MDNode::get(A->getContext(), Vals);
+}
+
+MDNode *MDNode::intersect(MDNode *A, MDNode *B) {
+  if (!A || !B)
+    return nullptr;
+
+  SmallVector<Value *, 4> Vals;
+  for (unsigned i = 0, ie = A->getNumOperands(); i != ie; ++i) {
+    Value *V = A->getOperand(i);
+    for (unsigned j = 0, je = B->getNumOperands(); j != je; ++j)
+      if (V == B->getOperand(j)) {
+        Vals.push_back(V);
+        break;
+      }
+  }
+
+  return MDNode::get(A->getContext(), Vals);
+}
+
 MDNode *MDNode::getMostGenericFPMath(MDNode *A, MDNode *B) {
   if (!A || !B)
     return nullptr;
@@ -524,49 +546,41 @@
 //
 
 static SmallVector<TrackingVH<MDNode>, 4> &getNMDOps(void *Operands) {
-  return *(SmallVector<TrackingVH<MDNode>, 4>*)Operands;
+  return *(SmallVector<TrackingVH<MDNode>, 4> *)Operands;
 }
 
 NamedMDNode::NamedMDNode(const Twine &N)
-  : Name(N.str()), Parent(nullptr),
-    Operands(new SmallVector<TrackingVH<MDNode>, 4>()) {
-}
+    : Name(N.str()), Parent(nullptr),
+      Operands(new SmallVector<TrackingVH<MDNode>, 4>()) {}
 
 NamedMDNode::~NamedMDNode() {
   dropAllReferences();
   delete &getNMDOps(Operands);
 }
 
-/// getNumOperands - Return number of NamedMDNode operands.
 unsigned NamedMDNode::getNumOperands() const {
   return (unsigned)getNMDOps(Operands).size();
 }
 
-/// getOperand - Return specified operand.
 MDNode *NamedMDNode::getOperand(unsigned i) const {
   assert(i < getNumOperands() && "Invalid Operand number!");
-  return dyn_cast<MDNode>(&*getNMDOps(Operands)[i]);
+  return &*getNMDOps(Operands)[i];
 }
 
-/// addOperand - Add metadata Operand.
 void NamedMDNode::addOperand(MDNode *M) {
   assert(!M->isFunctionLocal() &&
          "NamedMDNode operands must not be function-local!");
   getNMDOps(Operands).push_back(TrackingVH<MDNode>(M));
 }
 
-/// eraseFromParent - Drop all references and remove the node from parent
-/// module.
 void NamedMDNode::eraseFromParent() {
   getParent()->eraseNamedMetadata(this);
 }
 
-/// dropAllReferences - Remove all uses and clear node vector.
 void NamedMDNode::dropAllReferences() {
   getNMDOps(Operands).clear();
 }
 
-/// getName - Return a constant reference to this named metadata's name.
 StringRef NamedMDNode::getName() const {
   return StringRef(Name);
 }
@@ -576,7 +590,8 @@
 //
 
 void Instruction::setMetadata(StringRef Kind, MDNode *Node) {
-  if (!Node && !hasMetadata()) return;
+  if (!Node && !hasMetadata())
+    return;
   setMetadata(getContext().getMDKindID(Kind), Node);
 }
 
@@ -632,7 +647,8 @@
 /// node.  This updates/replaces metadata if already present, or removes it if
 /// Node is null.
 void Instruction::setMetadata(unsigned KindID, MDNode *Node) {
-  if (!Node && !hasMetadata()) return;
+  if (!Node && !hasMetadata())
+    return;
 
   // Handle 'dbg' as a special case since it is not stored in the hash table.
   if (KindID == LLVMContext::MD_dbg) {
@@ -687,6 +703,12 @@
   // Otherwise, removing an entry that doesn't exist on the instruction.
 }
 
+void Instruction::setAAMetadata(const AAMDNodes &N) {
+  setMetadata(LLVMContext::MD_tbaa, N.TBAA);
+  setMetadata(LLVMContext::MD_alias_scope, N.Scope);
+  setMetadata(LLVMContext::MD_noalias, N.NoAlias);
+}
+
 MDNode *Instruction::getMetadataImpl(unsigned KindID) const {
   // Handle 'dbg' as a special case since it is not stored in the hash table.
   if (KindID == LLVMContext::MD_dbg)
@@ -703,8 +725,8 @@
   return nullptr;
 }
 
-void Instruction::getAllMetadataImpl(SmallVectorImpl<std::pair<unsigned,
-                                       MDNode*> > &Result) const {
+void Instruction::getAllMetadataImpl(
+    SmallVectorImpl<std::pair<unsigned, MDNode *>> &Result) const {
   Result.clear();
   
   // Handle 'dbg' as a special case since it is not stored in the hash table.
@@ -728,9 +750,8 @@
     array_pod_sort(Result.begin(), Result.end());
 }
 
-void Instruction::
-getAllMetadataOtherThanDebugLocImpl(SmallVectorImpl<std::pair<unsigned,
-                                    MDNode*> > &Result) const {
+void Instruction::getAllMetadataOtherThanDebugLocImpl(
+    SmallVectorImpl<std::pair<unsigned, MDNode *>> &Result) const {
   Result.clear();
   assert(hasMetadataHashEntry() &&
          getContext().pImpl->MetadataStore.count(this) &&

diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index f1b1f9a..14e534b 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp

@@ -259,6 +259,17 @@
   NamedMDList.erase(NMD);
 }
 
+bool Module::isValidModFlagBehavior(Value *V, ModFlagBehavior &MFB) {
+  ConstantInt *Behavior = dyn_cast<ConstantInt>(V);
+  if (!Behavior)
+    return false; // Not an integer constant.
+  uint64_t Val = Behavior->getLimitedValue();
+  if (Val < ModFlagBehaviorFirstVal || Val > ModFlagBehaviorLastVal)
+    return false; // Outside the known behavior range; MFB is left untouched.
+  MFB = static_cast<ModFlagBehavior>(Val);
+  return true;
+}
+
 /// getModuleFlagsMetadata - Returns the module flags in the provided vector.
 void Module::
 getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const {
@@ -266,15 +277,15 @@
   if (!ModFlags) return;
 
   for (const MDNode *Flag : ModFlags->operands()) {
-    if (Flag->getNumOperands() >= 3 && isa<ConstantInt>(Flag->getOperand(0)) &&
+    ModFlagBehavior MFB;
+    if (Flag->getNumOperands() >= 3 &&
+        isValidModFlagBehavior(Flag->getOperand(0), MFB) &&
         isa<MDString>(Flag->getOperand(1))) {
       // Check the operands of the MDNode before accessing the operands.
       // The verifier will actually catch these failures.
-      ConstantInt *Behavior = cast<ConstantInt>(Flag->getOperand(0));
       MDString *Key = cast<MDString>(Flag->getOperand(1));
       Value *Val = Flag->getOperand(2);
-      Flags.push_back(ModuleFlagEntry(ModFlagBehavior(Behavior->getZExtValue()),
-                                      Key, Val));
+      Flags.push_back(ModuleFlagEntry(MFB, Key, Val));
     }
   }
 }
@@ -378,28 +389,17 @@
   Materializer.reset(GVM);
 }
 
-bool Module::isMaterializable(const GlobalValue *GV) const {
-  if (Materializer)
-    return Materializer->isMaterializable(GV);
-  return false;
-}
-
 bool Module::isDematerializable(const GlobalValue *GV) const {
   if (Materializer)
     return Materializer->isDematerializable(GV);
   return false;
 }
 
-bool Module::Materialize(GlobalValue *GV, std::string *ErrInfo) {
+std::error_code Module::materialize(GlobalValue *GV) {
   if (!Materializer)
-    return false;
+    return std::error_code();
 
-  std::error_code EC = Materializer->Materialize(GV);
-  if (!EC)
-    return false;
-  if (ErrInfo)
-    *ErrInfo = EC.message();
-  return true;
+  return Materializer->materialize(GV);
 }
 
 void Module::Dematerialize(GlobalValue *GV) {
@@ -413,13 +413,10 @@
   return Materializer->MaterializeModule(this);
 }
 
-std::error_code Module::materializeAllPermanently(bool ReleaseBuffer) {
+std::error_code Module::materializeAllPermanently() {
   if (std::error_code EC = materializeAll())
     return EC;
 
-  if (ReleaseBuffer)
-    Materializer->releaseBuffer();
-
   Materializer.reset();
   return std::error_code();
 }
@@ -455,9 +452,20 @@
 }
 
 Comdat *Module::getOrInsertComdat(StringRef Name) {
-  Comdat C;
-  StringMapEntry<Comdat> &Entry =
-      ComdatSymTab.GetOrCreateValue(Name, std::move(C));
+  auto &Entry = *ComdatSymTab.insert(std::make_pair(Name, Comdat())).first;
   Entry.second.Name = &Entry;
   return &Entry.second;
 }
+
+PICLevel::Level Module::getPICLevel() const {
+  Value *Val = getModuleFlag("PIC Level");
+  // Without a "PIC Level" module flag value, fall back to the default level.
+  if (!Val)
+    return PICLevel::Default;
+
+  return static_cast<PICLevel::Level>(cast<ConstantInt>(Val)->getZExtValue());
+}
+
+void Module::setPICLevel(PICLevel::Level PL) {
+  addModuleFlag(ModFlagBehavior::Error, "PIC Level", PL);
+}

diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp
index 0defb6a..2e2a7cb 100644
--- a/lib/IR/PassManager.cpp
+++ b/lib/IR/PassManager.cpp

@@ -53,7 +53,7 @@
   // If we don't have a cached result for this module, look up the pass and run
   // it to produce a result, which we then add to the cache.
   if (Inserted)
-    RI->second = std::move(lookupPass(PassID).run(M, this));
+    RI->second = lookupPass(PassID).run(M, this);
 
   return *RI->second;
 }

diff --git a/lib/IR/PassRegistry.cpp b/lib/IR/PassRegistry.cpp
index 91940a9..b879fef 100644
--- a/lib/IR/PassRegistry.cpp
+++ b/lib/IR/PassRegistry.cpp

@@ -36,8 +36,7 @@
 // Accessors
 //
 
-PassRegistry::~PassRegistry() {
-}
+PassRegistry::~PassRegistry() {}
 
 const PassInfo *PassRegistry::getPassInfo(const void *TI) const {
   sys::SmartScopedReader<true> Guard(Lock);
@@ -58,77 +57,62 @@
 void PassRegistry::registerPass(const PassInfo &PI, bool ShouldFree) {
   sys::SmartScopedWriter<true> Guard(Lock);
   bool Inserted =
-    PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second;
+      PassInfoMap.insert(std::make_pair(PI.getTypeInfo(), &PI)).second;
   assert(Inserted && "Pass registered multiple times!");
   (void)Inserted;
   PassInfoStringMap[PI.getPassArgument()] = &PI;
-  
-  // Notify any listeners.
-  for (std::vector<PassRegistrationListener*>::iterator
-       I = Listeners.begin(), E = Listeners.end(); I != E; ++I)
-    (*I)->passRegistered(&PI);
-  
-  if (ShouldFree) ToFree.push_back(std::unique_ptr<const PassInfo>(&PI));
-}
 
-void PassRegistry::unregisterPass(const PassInfo &PI) {
-  sys::SmartScopedWriter<true> Guard(Lock);
-  MapType::iterator I = PassInfoMap.find(PI.getTypeInfo());
-  assert(I != PassInfoMap.end() && "Pass registered but not in map!");
-  
-  // Remove pass from the map.
-  PassInfoMap.erase(I);
-  PassInfoStringMap.erase(PI.getPassArgument());
+  // Notify any listeners.
+  for (auto *Listener : Listeners)
+    Listener->passRegistered(&PI);
+
+  if (ShouldFree)
+    ToFree.push_back(std::unique_ptr<const PassInfo>(&PI));
 }
 
 void PassRegistry::enumerateWith(PassRegistrationListener *L) {
   sys::SmartScopedReader<true> Guard(Lock);
-  for (auto I = PassInfoMap.begin(), E = PassInfoMap.end(); I != E; ++I)
-    L->passEnumerate(I->second);
+  for (auto PassInfoPair : PassInfoMap)
+    L->passEnumerate(PassInfoPair.second);
 }
 
-
 /// Analysis Group Mechanisms.
-void PassRegistry::registerAnalysisGroup(const void *InterfaceID, 
+void PassRegistry::registerAnalysisGroup(const void *InterfaceID,
                                          const void *PassID,
-                                         PassInfo& Registeree,
-                                         bool isDefault,
+                                         PassInfo &Registeree, bool isDefault,
                                          bool ShouldFree) {
-  PassInfo *InterfaceInfo =  const_cast<PassInfo*>(getPassInfo(InterfaceID));
+  PassInfo *InterfaceInfo = const_cast<PassInfo *>(getPassInfo(InterfaceID));
   if (!InterfaceInfo) {
     // First reference to Interface, register it now.
     registerPass(Registeree);
     InterfaceInfo = &Registeree;
   }
-  assert(Registeree.isAnalysisGroup() && 
+  assert(Registeree.isAnalysisGroup() &&
          "Trying to join an analysis group that is a normal pass!");
 
   if (PassID) {
-    PassInfo *ImplementationInfo = const_cast<PassInfo*>(getPassInfo(PassID));
+    PassInfo *ImplementationInfo = const_cast<PassInfo *>(getPassInfo(PassID));
     assert(ImplementationInfo &&
            "Must register pass before adding to AnalysisGroup!");
 
     sys::SmartScopedWriter<true> Guard(Lock);
-    
+
     // Make sure we keep track of the fact that the implementation implements
     // the interface.
     ImplementationInfo->addInterfaceImplemented(InterfaceInfo);
 
-    AnalysisGroupInfo &AGI = AnalysisGroupInfoMap[InterfaceInfo];
-    assert(AGI.Implementations.count(ImplementationInfo) == 0 &&
-           "Cannot add a pass to the same analysis group more than once!");
-    AGI.Implementations.insert(ImplementationInfo);
     if (isDefault) {
       assert(InterfaceInfo->getNormalCtor() == nullptr &&
              "Default implementation for analysis group already specified!");
-      assert(ImplementationInfo->getNormalCtor() &&
-           "Cannot specify pass as default if it does not have a default ctor");
+      assert(
+          ImplementationInfo->getNormalCtor() &&
+          "Cannot specify pass as default if it does not have a default ctor");
       InterfaceInfo->setNormalCtor(ImplementationInfo->getNormalCtor());
       InterfaceInfo->setTargetMachineCtor(
           ImplementationInfo->getTargetMachineCtor());
     }
   }
-  
+
   if (ShouldFree)
     ToFree.push_back(std::unique_ptr<const PassInfo>(&Registeree));
 }
@@ -140,7 +124,7 @@
 
 void PassRegistry::removeRegistrationListener(PassRegistrationListener *L) {
   sys::SmartScopedWriter<true> Guard(Lock);
-  
+
   auto I = std::find(Listeners.begin(), Listeners.end(), L);
   Listeners.erase(I);
 }

diff --git a/lib/IR/SymbolTableListTraitsImpl.h b/lib/IR/SymbolTableListTraitsImpl.h
index 8302597..a18f982 100644
--- a/lib/IR/SymbolTableListTraitsImpl.h
+++ b/lib/IR/SymbolTableListTraitsImpl.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SYMBOLTABLELISTTRAITS_IMPL_H
-#define LLVM_SYMBOLTABLELISTTRAITS_IMPL_H
+#ifndef LLVM_LIB_IR_SYMBOLTABLELISTTRAITSIMPL_H
+#define LLVM_LIB_IR_SYMBOLTABLELISTTRAITSIMPL_H
 
 #include "llvm/IR/SymbolTableListTraits.h"
 #include "llvm/IR/ValueSymbolTable.h"

diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 1efde47..0458b5f 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp

@@ -89,9 +89,13 @@
 
   // At this point we have only various mismatches of the first class types
   // remaining and ptr->ptr. Just select the lossless conversions. Everything
-  // else is not lossless.
-  if (this->isPointerTy())
-    return Ty->isPointerTy();
+  // else is not lossless. Conservatively assume we can't losslessly convert
+  // between pointers with different address spaces.
+  if (const PointerType *PTy = dyn_cast<PointerType>(this)) {
+    if (const PointerType *OtherPTy = dyn_cast<PointerType>(Ty))
+      return PTy->getAddressSpace() == OtherPTy->getAddressSpace();
+    return false;
+  }
   return false;  // Other types have no identity values
 }
 
@@ -155,7 +159,7 @@
 /// isSizedDerivedType - Derived types like structures and arrays are sized
 /// iff all of the members of the type are sized as well.  Since asking for
 /// their size is relatively uncommon, move this operation out of line.
-bool Type::isSizedDerivedType(SmallPtrSet<const Type*, 4> *Visited) const {
+bool Type::isSizedDerivedType(SmallPtrSetImpl<const Type*> *Visited) const {
   if (const ArrayType *ATy = dyn_cast<ArrayType>(this))
     return ATy->getElementType()->isSized(Visited);
 
@@ -454,10 +458,11 @@
   }
   
   // Look up the entry for the name.
-  EntryTy *Entry = &getContext().pImpl->NamedStructTypes.GetOrCreateValue(Name);
-  
+  auto IterBool =
+      getContext().pImpl->NamedStructTypes.insert(std::make_pair(Name, this));
+
   // While we have a name collision, try a random rename.
-  if (Entry->getValue()) {
+  if (!IterBool.second) {
     SmallString<64> TempStr(Name);
     TempStr.push_back('.');
     raw_svector_ostream TmpStream(TempStr);
@@ -467,19 +472,16 @@
       TempStr.resize(NameSize + 1);
       TmpStream.resync();
       TmpStream << getContext().pImpl->NamedStructTypesUniqueID++;
-      
-      Entry = &getContext().pImpl->
-                 NamedStructTypes.GetOrCreateValue(TmpStream.str());
-    } while (Entry->getValue());
-  }
 
-  // Okay, we found an entry that isn't used.  It's us!
-  Entry->setValue(this);
+      IterBool = getContext().pImpl->NamedStructTypes.insert(
+          std::make_pair(TmpStream.str(), this));
+    } while (!IterBool.second);
+  }
 
   // Delete the old string data.
   if (SymbolTableEntry)
     ((EntryTy *)SymbolTableEntry)->Destroy(SymbolTable.getAllocator());
-  SymbolTableEntry = Entry;
+  SymbolTableEntry = &*IterBool.first;
 }
 
 //===----------------------------------------------------------------------===//
@@ -506,7 +508,9 @@
     StructFields.push_back(type);
     type = va_arg(ap, llvm::Type*);
   }
-  return llvm::StructType::get(Ctx, StructFields);
+  auto *Ret = llvm::StructType::get(Ctx, StructFields);
+  va_end(ap);
+  return Ret;
 }
 
 StructType *StructType::create(LLVMContext &Context, ArrayRef<Type*> Elements,
@@ -547,16 +551,18 @@
     StructFields.push_back(type);
     type = va_arg(ap, llvm::Type*);
   }
-  return llvm::StructType::create(Ctx, StructFields, Name);
+  auto *Ret = llvm::StructType::create(Ctx, StructFields, Name);
+  va_end(ap);
+  return Ret;
 }
 
-bool StructType::isSized(SmallPtrSet<const Type*, 4> *Visited) const {
+bool StructType::isSized(SmallPtrSetImpl<const Type*> *Visited) const {
   if ((getSubclassData() & SCDB_IsSized) != 0)
     return true;
   if (isOpaque())
     return false;
 
-  if (Visited && !Visited->insert(this))
+  if (Visited && !Visited->insert(this).second)
     return false;
 
   // Okay, our struct is sized if all of the elements are, but if one of the
@@ -591,6 +597,7 @@
     type = va_arg(ap, llvm::Type*);
   }
   setBody(StructFields);
+  va_end(ap);
 }
 
 bool StructType::isValidElementType(Type *ElemTy) {

diff --git a/lib/IR/TypeFinder.cpp b/lib/IR/TypeFinder.cpp
index 689b903..6796075 100644
--- a/lib/IR/TypeFinder.cpp
+++ b/lib/IR/TypeFinder.cpp

@@ -40,7 +40,7 @@
   }
 
   // Get types from functions.
-  SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst;
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDForInst;
   for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
     incorporateType(FI->getType());
 

diff --git a/lib/IR/Use.cpp b/lib/IR/Use.cpp
index 047861c..cae845d 100644
--- a/lib/IR/Use.cpp
+++ b/lib/IR/Use.cpp

@@ -52,7 +52,7 @@
 // Sets up the waymarking algorithm's tags for a series of Uses. See the
 // algorithm details here:
 //
-//   http://www.llvm.org/docs/ProgrammersManual.html#UserLayout
+//   http://www.llvm.org/docs/ProgrammersManual.html#the-waymarking-algorithm
 //
 Use *Use::initTags(Use *const Start, Use *Stop) {
   ptrdiff_t Done = 0;

diff --git a/lib/IR/UseListOrder.cpp b/lib/IR/UseListOrder.cpp
new file mode 100644
index 0000000..d064e67
--- /dev/null
+++ b/lib/IR/UseListOrder.cpp

@@ -0,0 +1,43 @@
+//===- UseListOrder.cpp - Implement Use List Order ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement structures and command-line options for preserving use-list order.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/UseListOrder.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::opt<bool> PreserveBitcodeUseListOrder(
+    "preserve-bc-use-list-order",
+    cl::desc("Experimental support to preserve bitcode use-list order."),
+    cl::init(false), cl::Hidden);
+
+static cl::opt<bool> PreserveAssemblyUseListOrder(
+    "preserve-ll-use-list-order",
+    cl::desc("Experimental support to preserve assembly use-list order."),
+    cl::init(false), cl::Hidden);
+
+bool llvm::shouldPreserveBitcodeUseListOrder() {
+  return PreserveBitcodeUseListOrder;
+}
+
+bool llvm::shouldPreserveAssemblyUseListOrder() {
+  return PreserveAssemblyUseListOrder;
+}
+
+void llvm::setPreserveBitcodeUseListOrder(bool ShouldPreserve) {
+  PreserveBitcodeUseListOrder = ShouldPreserve;
+}
+
+void llvm::setPreserveAssemblyUseListOrder(bool ShouldPreserve) {
+  PreserveAssemblyUseListOrder = ShouldPreserve;
+}

diff --git a/lib/IR/User.cpp b/lib/IR/User.cpp
index 9406828..ee83eac 100644
--- a/lib/IR/User.cpp
+++ b/lib/IR/User.cpp

@@ -20,9 +20,6 @@
 
 void User::anchor() {}
 
-// replaceUsesOfWith - Replaces all references to the "From" definition with
-// references to the "To" definition.
-//
 void User::replaceUsesOfWith(Value *From, Value *To) {
   if (From == To) return;   // Duh what?
 

diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 35c241a..4e0c11f1 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp

@@ -15,6 +15,7 @@
 #include "LLVMContextImpl.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -44,7 +45,8 @@
 
 Value::Value(Type *ty, unsigned scid)
     : VTy(checkType(ty)), UseList(nullptr), Name(nullptr), SubclassID(scid),
-      HasValueHandle(0), SubclassOptionalData(0), SubclassData(0) {
+      HasValueHandle(0), SubclassOptionalData(0), SubclassData(0),
+      NumOperands(0) {
   // FIXME: Why isn't this in the subclass gunk??
   // Note, we cannot call isa<CallInst> before the CallInst has been
   // constructed.
@@ -87,8 +89,6 @@
   LeakDetector::removeGarbageObject(this);
 }
 
-/// hasNUses - Return true if this Value has exactly N users.
-///
 bool Value::hasNUses(unsigned N) const {
   const_use_iterator UI = use_begin(), E = use_end();
 
@@ -97,9 +97,6 @@
   return UI == E;
 }
 
-/// hasNUsesOrMore - Return true if this value has N users or more.  This is
-/// logically equivalent to getNumUses() >= N.
-///
 bool Value::hasNUsesOrMore(unsigned N) const {
   const_use_iterator UI = use_begin(), E = use_end();
 
@@ -109,8 +106,6 @@
   return true;
 }
 
-/// isUsedInBasicBlock - Return true if this value is used in the specified
-/// basic block.
 bool Value::isUsedInBasicBlock(const BasicBlock *BB) const {
   // This can be computed either by scanning the instructions in BB, or by
   // scanning the use list of this Value. Both lists can be very long, but
@@ -132,10 +127,6 @@
   return false;
 }
 
-
-/// getNumUses - This method computes the number of uses of this Value.  This
-/// is a linear time operation.  Use hasOneUse or hasNUses to check for specific
-/// values.
 unsigned Value::getNumUses() const {
   return (unsigned)std::distance(use_begin(), use_end());
 }
@@ -235,9 +226,6 @@
   Name = ST->createValueName(NameRef, this);
 }
 
-
-/// takeName - transfer the name from V to this value, setting V's name to
-/// empty.  It is an error to call V->takeName(V).
 void Value::takeName(Value *V) {
   assert(SubclassID != MDStringVal && "Cannot take the name of an MDString!");
 
@@ -302,9 +290,9 @@
 }
 
 #ifndef NDEBUG
-static bool contains(SmallPtrSet<ConstantExpr *, 4> &Cache, ConstantExpr *Expr,
+static bool contains(SmallPtrSetImpl<ConstantExpr *> &Cache, ConstantExpr *Expr,
                      Constant *C) {
-  if (!Cache.insert(Expr))
+  if (!Cache.insert(Expr).second)
     return false;
 
   for (auto &O : Expr->operands()) {
@@ -413,7 +401,7 @@
       return V;
     }
     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
-  } while (Visited.insert(V));
+  } while (Visited.insert(V).second);
 
   return V;
 }
@@ -454,7 +442,8 @@
         return V;
       Offset = GEPOffset;
       V = GEP->getPointerOperand();
-    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+    } else if (Operator::getOpcode(V) == Instruction::BitCast ||
+               Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
       V = cast<Operator>(V)->getOperand(0);
     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
       V = GA->getAliasee();
@@ -462,7 +451,7 @@
       return V;
     }
     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
-  } while (Visited.insert(V));
+  } while (Visited.insert(V).second);
 
   return V;
 }
@@ -471,10 +460,12 @@
   return stripPointerCastsAndOffsets<PSK_InBounds>(this);
 }
 
-/// isDereferenceablePointer - Test if this value is always a pointer to
-/// allocated and suitably aligned memory for a simple load or store.
+/// \brief Check if Value is always a dereferenceable pointer.
+///
+/// Test if V is always a pointer to allocated and suitably aligned memory for
+/// a simple load or store.
 static bool isDereferenceablePointer(const Value *V, const DataLayout *DL,
-                                     SmallPtrSet<const Value *, 32> &Visited) {
+                                     SmallPtrSetImpl<const Value *> &Visited) {
   // Note that it is not safe to speculate into a malloc'd region because
   // malloc may return null.
 
@@ -504,14 +495,34 @@
   if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
     return !GV->hasExternalWeakLinkage();
 
-  // byval arguments are ok.
-  if (const Argument *A = dyn_cast<Argument>(V))
-    return A->hasByValAttr();
+  // byval arguments are okay. Arguments specifically marked as
+  // dereferenceable are okay too.
+  if (const Argument *A = dyn_cast<Argument>(V)) {
+    if (A->hasByValAttr())
+      return true;
+    else if (uint64_t Bytes = A->getDereferenceableBytes()) {
+      Type *Ty = V->getType()->getPointerElementType();
+      if (Ty->isSized() && DL && DL->getTypeStoreSize(Ty) <= Bytes)
+        return true;
+    }
+
+    return false;
+  }
+
+  // Return values from call sites specifically marked as dereferenceable are
+  // also okay.
+  if (ImmutableCallSite CS = V) {
+    if (uint64_t Bytes = CS.getDereferenceableBytes(0)) {
+      Type *Ty = V->getType()->getPointerElementType();
+      if (Ty->isSized() && DL && DL->getTypeStoreSize(Ty) <= Bytes)
+        return true;
+    }
+  }
 
   // For GEPs, determine if the indexing lands within the allocated object.
   if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
     // Conservatively require that the base pointer be fully dereferenceable.
-    if (!Visited.insert(GEP->getOperand(0)))
+    if (!Visited.insert(GEP->getOperand(0)).second)
       return false;
     if (!isDereferenceablePointer(GEP->getOperand(0), DL, Visited))
       return false;
@@ -543,21 +554,39 @@
     return true;
   }
 
+  if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V))
+    return isDereferenceablePointer(ASC->getOperand(0), DL, Visited);
+
   // If we don't know, assume the worst.
   return false;
 }
 
-/// isDereferenceablePointer - Test if this value is always a pointer to
-/// allocated and suitably aligned memory for a simple load or store.
 bool Value::isDereferenceablePointer(const DataLayout *DL) const {
+  // When dereferenceability information is provided by a dereferenceable
+  // attribute, we know exactly how many bytes are dereferenceable. If we can
+  // determine the exact offset to the attributed variable, we can use that
+  // information here.
+  Type *Ty = getType()->getPointerElementType();
+  if (Ty->isSized() && DL) {
+    APInt Offset(DL->getTypeStoreSizeInBits(getType()), 0);
+    const Value *BV = stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
+
+    APInt DerefBytes(Offset.getBitWidth(), 0);
+    if (const Argument *A = dyn_cast<Argument>(BV))
+      DerefBytes = A->getDereferenceableBytes();
+    else if (ImmutableCallSite CS = BV)
+      DerefBytes = CS.getDereferenceableBytes(0);
+
+    if (DerefBytes.getBoolValue() && Offset.isNonNegative()) {
+      if (DerefBytes.uge(Offset + DL->getTypeStoreSize(Ty)))
+        return true;
+    }
+  }
+
   SmallPtrSet<const Value *, 32> Visited;
   return ::isDereferenceablePointer(this, DL, Visited);
 }
 
-/// DoPHITranslation - If this value is a PHI node with CurBB as its parent,
-/// return the value in the PHI node corresponding to PredBB.  If not, return
-/// ourself.  This is useful if you want to know the value something has in a
-/// predecessor block.
 Value *Value::DoPHITranslation(const BasicBlock *CurBB,
                                const BasicBlock *PredBB) {
   PHINode *PN = dyn_cast<PHINode>(this);
@@ -568,12 +597,29 @@
 
 LLVMContext &Value::getContext() const { return VTy->getContext(); }
 
+void Value::reverseUseList() {
+  if (!UseList || !UseList->Next)
+    // No need to reverse 0 or 1 uses.
+    return;
+  // Head is the front of the already-reversed part; Current walks the rest.
+  Use *Head = UseList;
+  Use *Current = UseList->Next;
+  Head->Next = nullptr; // The old first use ends up last.
+  while (Current) {
+    Use *Next = Current->Next; // Saved before Current->Next is overwritten.
+    Current->Next = Head;
+    Head->setPrev(&Current->Next);
+    Head = Current;
+    Current = Next;
+  }
+  UseList = Head;
+  Head->setPrev(&UseList);
+}
+
 //===----------------------------------------------------------------------===//
 //                             ValueHandleBase Class
 //===----------------------------------------------------------------------===//
 
-/// AddToExistingUseList - Add this ValueHandle to the use list for VP, where
-/// List is known to point into the existing use list.
 void ValueHandleBase::AddToExistingUseList(ValueHandleBase **List) {
   assert(List && "Handle list is null?");
 
@@ -597,7 +643,6 @@
     Next->setPrevPtr(&Next);
 }
 
-/// AddToUseList - Add this ValueHandle to the use list for VP.
 void ValueHandleBase::AddToUseList() {
   assert(VP.getPointer() && "Null pointer doesn't have a use list!");
 
@@ -641,7 +686,6 @@
   }
 }
 
-/// RemoveFromUseList - Remove this ValueHandle from its current use list.
 void ValueHandleBase::RemoveFromUseList() {
   assert(VP.getPointer() && VP.getPointer()->HasValueHandle &&
          "Pointer doesn't have a use list!");
@@ -729,6 +773,8 @@
 void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
   assert(Old->HasValueHandle &&"Should only be called if ValueHandles present");
   assert(Old != New && "Changing value into itself!");
+  assert(Old->getType() == New->getType() &&
+         "replaceAllUses of value with new value of different type!");
 
   // Get the linked list base, which is guaranteed to exist since the
   // HasValueHandle flag is set.

diff --git a/lib/IR/ValueSymbolTable.cpp b/lib/IR/ValueSymbolTable.cpp
index e9e979a..2b23f6d 100644
--- a/lib/IR/ValueSymbolTable.cpp
+++ b/lib/IR/ValueSymbolTable.cpp

@@ -56,11 +56,10 @@
     raw_svector_ostream(UniqueName) << ++LastUnique;
 
     // Try insert the vmap entry with this suffix.
-    ValueName &NewName = vmap.GetOrCreateValue(UniqueName);
-    if (!NewName.getValue()) {
+    auto IterBool = vmap.insert(std::make_pair(UniqueName, V));
+    if (IterBool.second) {
       // Newly inserted name.  Success!
-      NewName.setValue(V);
-      V->Name = &NewName;
+      V->Name = &*IterBool.first;
      //DEBUG(dbgs() << " Inserted value: " << UniqueName << ": " << *V << "\n");
       return;
     }
@@ -78,12 +77,11 @@
 /// auto-renames the name and returns that instead.
 ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) {
   // In the common case, the name is not already in the symbol table.
-  ValueName &Entry = vmap.GetOrCreateValue(Name);
-  if (!Entry.getValue()) {
-    Entry.setValue(V);
+  auto IterBool = vmap.insert(std::make_pair(Name, V));
+  if (IterBool.second) {
     //DEBUG(dbgs() << " Inserted value: " << Entry.getKeyData() << ": "
     //           << *V << "\n");
-    return &Entry;
+    return &*IterBool.first;
   }
   
   // Otherwise, there is a naming conflict.  Rename this value.
@@ -95,12 +93,11 @@
     raw_svector_ostream(UniqueName) << ++LastUnique;
     
     // Try insert the vmap entry with this suffix.
-    ValueName &NewName = vmap.GetOrCreateValue(UniqueName);
-    if (!NewName.getValue()) {
-      // Newly inserted name.  Success!
-      NewName.setValue(V);
-     //DEBUG(dbgs() << " Inserted value: " << UniqueName << ": " << *V << "\n");
-      return &NewName;
+    auto IterBool = vmap.insert(std::make_pair(UniqueName, V));
+    if (IterBool.second) {
+      // DEBUG(dbgs() << " Inserted value: " << UniqueName << ": " << *V <<
+      //       "\n");
+      return &*IterBool.first;
     }
   }
 }

diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 314bad3..9698dbd 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp

@@ -257,7 +257,7 @@
   void visitGlobalVariable(const GlobalVariable &GV);
   void visitGlobalAlias(const GlobalAlias &GA);
   void visitAliaseeSubExpr(const GlobalAlias &A, const Constant &C);
-  void visitAliaseeSubExpr(SmallPtrSet<const GlobalAlias *, 4> &Visited,
+  void visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias *> &Visited,
                            const GlobalAlias &A, const Constant &C);
   void visitNamedMDNode(const NamedMDNode &NMD);
   void visitMDNode(MDNode &MD, Function *F);
@@ -269,6 +269,8 @@
                        SmallVectorImpl<const MDNode *> &Requirements);
   void visitFunction(const Function &F);
   void visitBasicBlock(BasicBlock &BB);
+  void visitRangeMetadata(Instruction& I, MDNode* Range, Type* Ty);
+
 
   // InstVisitor overrides...
   using InstVisitor<Verifier>::visit;
@@ -375,11 +377,13 @@
 
 
 void Verifier::visitGlobalValue(const GlobalValue &GV) {
-  Assert1(!GV.isDeclaration() || GV.isMaterializable() ||
-              GV.hasExternalLinkage() || GV.hasExternalWeakLinkage(),
+  Assert1(!GV.isDeclaration() || GV.hasExternalLinkage() ||
+              GV.hasExternalWeakLinkage(),
           "Global is external, but doesn't have external or weak linkage!",
           &GV);
 
+  Assert1(GV.getAlignment() <= Value::MaximumAlignment,
+          "huge alignment values are unsupported", &GV);
   Assert1(!GV.hasAppendingLinkage() || isa<GlobalVariable>(GV),
           "Only global variables can have appending linkage!", &GV);
 
@@ -476,7 +480,7 @@
 
   while (!WorkStack.empty()) {
     const Value *V = WorkStack.pop_back_val();
-    if (!Visited.insert(V))
+    if (!Visited.insert(V).second)
       continue;
 
     if (const User *U = dyn_cast<User>(V)) {
@@ -500,13 +504,13 @@
   visitAliaseeSubExpr(Visited, GA, C);
 }
 
-void Verifier::visitAliaseeSubExpr(SmallPtrSet<const GlobalAlias *, 4> &Visited,
+void Verifier::visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias*> &Visited,
                                    const GlobalAlias &GA, const Constant &C) {
   if (const auto *GV = dyn_cast<GlobalValue>(&C)) {
     Assert1(!GV->isDeclaration(), "Alias must point to a definition", &GA);
 
     if (const auto *GA2 = dyn_cast<GlobalAlias>(GV)) {
-      Assert1(Visited.insert(GA2), "Aliases cannot form a cycle", &GA);
+      Assert1(Visited.insert(GA2).second, "Aliases cannot form a cycle", &GA);
 
       Assert1(!GA2->mayBeOverridden(), "Alias cannot point to a weak alias",
               &GA);
@@ -564,7 +568,7 @@
 void Verifier::visitMDNode(MDNode &MD, Function *F) {
   // Only visit each node once.  Metadata can be mutually recursive, so this
   // avoids infinite recursion here, as well as being an optimization.
-  if (!MDNodes.insert(&MD))
+  if (!MDNodes.insert(&MD).second)
     return;
 
   for (unsigned i = 0, e = MD.getNumOperands(); i != e; ++i) {
@@ -605,11 +609,10 @@
     Assert1(GV,
             "comdat selection kind requires a global value with the same name",
             &C);
-  // The Module is invalid if the GlobalValue has local linkage.  Allowing
-  // otherwise opens us up to seeing the underling global value get renamed if
-  // collisions occur.
+  // The Module is invalid if the GlobalValue has private linkage.  Entities
+  // with private linkage don't have entries in the symbol table.
   if (GV)
-    Assert1(!GV->hasLocalLinkage(), "comdat global value has local linkage",
+    Assert1(!GV->hasPrivateLinkage(), "comdat global value has private linkage",
             GV);
 }
 
@@ -672,24 +675,23 @@
   // constant int), the flag ID (an MDString), and the value.
   Assert1(Op->getNumOperands() == 3,
           "incorrect number of operands in module flag", Op);
-  ConstantInt *Behavior = dyn_cast<ConstantInt>(Op->getOperand(0));
+  Module::ModFlagBehavior MFB;
+  if (!Module::isValidModFlagBehavior(Op->getOperand(0), MFB)) {
+    Assert1(
+        dyn_cast<ConstantInt>(Op->getOperand(0)),
+        "invalid behavior operand in module flag (expected constant integer)",
+        Op->getOperand(0));
+    Assert1(false,
+            "invalid behavior operand in module flag (unexpected constant)",
+            Op->getOperand(0));
+  }
   MDString *ID = dyn_cast<MDString>(Op->getOperand(1));
-  Assert1(Behavior,
-          "invalid behavior operand in module flag (expected constant integer)",
-          Op->getOperand(0));
-  unsigned BehaviorValue = Behavior->getZExtValue();
   Assert1(ID,
           "invalid ID operand in module flag (expected metadata string)",
           Op->getOperand(1));
 
   // Sanity check the values for behaviors with additional requirements.
-  switch (BehaviorValue) {
-  default:
-    Assert1(false,
-            "invalid behavior operand in module flag (unexpected constant)",
-            Op->getOperand(0));
-    break;
-
+  switch (MFB) {
   case Module::Error:
   case Module::Warning:
   case Module::Override:
@@ -725,7 +727,7 @@
   }
 
   // Unless this is a "requires" flag, check the ID is unique.
-  if (BehaviorValue != Module::Require) {
+  if (MFB != Module::Require) {
     bool Inserted = SeenIDs.insert(std::make_pair(ID, Op)).second;
     Assert1(Inserted,
             "module flag identifiers must be unique (or of 'require' type)",
@@ -1054,20 +1056,19 @@
           "Attribute 'builtin' can only be applied to a callsite.", &F);
 
   // Check that this function meets the restrictions on this calling convention.
+  // Sometimes varargs is used for perfectly forwarding thunks, so some of these
+  // restrictions can be lifted.
   switch (F.getCallingConv()) {
   default:
-    break;
   case CallingConv::C:
     break;
   case CallingConv::Fast:
   case CallingConv::Cold:
-  case CallingConv::X86_FastCall:
-  case CallingConv::X86_ThisCall:
   case CallingConv::Intel_OCL_BI:
   case CallingConv::PTX_Kernel:
   case CallingConv::PTX_Device:
-    Assert1(!F.isVarArg(),
-            "Varargs functions must have C calling conventions!", &F);
+    Assert1(!F.isVarArg(), "Calling convention does not support varargs or "
+                           "perfect forwarding!", &F);
     break;
   }
 
@@ -1175,6 +1176,12 @@
       }
     }
   }
+
+  // Check that all instructions have their parent pointers set up correctly.
+  for (auto &I : BB)
+  {
+    Assert(I.getParent() == &BB, "Instruction has bogus parent pointer!");
+  }
 }
 
 void Verifier::visitTerminatorInst(TerminatorInst &I) {
@@ -1217,7 +1224,7 @@
   for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
     Assert1(i.getCaseValue()->getType() == SwitchTy,
             "Switch constants must all be same type as switch value!", &SI);
-    Assert2(Constants.insert(i.getCaseValue()),
+    Assert2(Constants.insert(i.getCaseValue()).second,
             "Duplicate integer as switch case", &SI, i.getCaseValue());
   }
 
@@ -1886,12 +1893,63 @@
   return A.getUpper() == B.getLower() || A.getLower() == B.getUpper();
 }
 
+void Verifier::visitRangeMetadata(Instruction& I,
+                                  MDNode* Range, Type* Ty) {
+  assert(Range &&
+         Range == I.getMetadata(LLVMContext::MD_range) &&
+         "precondition violation");
+
+  unsigned NumOperands = Range->getNumOperands();
+  Assert1(NumOperands % 2 == 0, "Unfinished range!", Range);
+  unsigned NumRanges = NumOperands / 2;
+  Assert1(NumRanges >= 1, "It should have at least one range!", Range);
+  
+  ConstantRange LastRange(1); // Dummy initial value
+  for (unsigned i = 0; i < NumRanges; ++i) {
+    ConstantInt *Low = dyn_cast<ConstantInt>(Range->getOperand(2*i));
+    Assert1(Low, "The lower limit must be an integer!", Low);
+    ConstantInt *High = dyn_cast<ConstantInt>(Range->getOperand(2*i + 1));
+    Assert1(High, "The upper limit must be an integer!", High);
+    Assert1(High->getType() == Low->getType() &&
+            High->getType() == Ty, "Range types must match instruction type!",
+            &I);
+    
+    APInt HighV = High->getValue();
+    APInt LowV = Low->getValue();
+    ConstantRange CurRange(LowV, HighV);
+    Assert1(!CurRange.isEmptySet() && !CurRange.isFullSet(),
+            "Range must not be empty!", Range);
+    if (i != 0) {
+      Assert1(CurRange.intersectWith(LastRange).isEmptySet(),
+              "Intervals are overlapping", Range);
+      Assert1(LowV.sgt(LastRange.getLower()), "Intervals are not in order",
+              Range);
+      Assert1(!isContiguous(CurRange, LastRange), "Intervals are contiguous",
+              Range);
+    }
+    LastRange = ConstantRange(LowV, HighV);
+  }
+  if (NumRanges > 2) {
+    APInt FirstLow =
+      dyn_cast<ConstantInt>(Range->getOperand(0))->getValue();
+    APInt FirstHigh =
+      dyn_cast<ConstantInt>(Range->getOperand(1))->getValue();
+    ConstantRange FirstRange(FirstLow, FirstHigh);
+    Assert1(FirstRange.intersectWith(LastRange).isEmptySet(),
+            "Intervals are overlapping", Range);
+    Assert1(!isContiguous(FirstRange, LastRange), "Intervals are contiguous",
+            Range);
+  }
+}
+
 void Verifier::visitLoadInst(LoadInst &LI) {
   PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType());
   Assert1(PTy, "Load operand must be a pointer.", &LI);
   Type *ElTy = PTy->getElementType();
   Assert2(ElTy == LI.getType(),
           "Load result type does not match pointer operand type!", &LI, ElTy);
+  Assert1(LI.getAlignment() <= Value::MaximumAlignment,
+          "huge alignment values are unsupported", &LI);
   if (LI.isAtomic()) {
     Assert1(LI.getOrdering() != Release && LI.getOrdering() != AcquireRelease,
             "Load cannot have Release ordering", &LI);
@@ -1911,52 +1969,6 @@
             "Non-atomic load cannot have SynchronizationScope specified", &LI);
   }
 
-  if (MDNode *Range = LI.getMetadata(LLVMContext::MD_range)) {
-    unsigned NumOperands = Range->getNumOperands();
-    Assert1(NumOperands % 2 == 0, "Unfinished range!", Range);
-    unsigned NumRanges = NumOperands / 2;
-    Assert1(NumRanges >= 1, "It should have at least one range!", Range);
-
-    ConstantRange LastRange(1); // Dummy initial value
-    for (unsigned i = 0; i < NumRanges; ++i) {
-      ConstantInt *Low = dyn_cast<ConstantInt>(Range->getOperand(2*i));
-      Assert1(Low, "The lower limit must be an integer!", Low);
-      ConstantInt *High = dyn_cast<ConstantInt>(Range->getOperand(2*i + 1));
-      Assert1(High, "The upper limit must be an integer!", High);
-      Assert1(High->getType() == Low->getType() &&
-              High->getType() == ElTy, "Range types must match load type!",
-              &LI);
-
-      APInt HighV = High->getValue();
-      APInt LowV = Low->getValue();
-      ConstantRange CurRange(LowV, HighV);
-      Assert1(!CurRange.isEmptySet() && !CurRange.isFullSet(),
-              "Range must not be empty!", Range);
-      if (i != 0) {
-        Assert1(CurRange.intersectWith(LastRange).isEmptySet(),
-                "Intervals are overlapping", Range);
-        Assert1(LowV.sgt(LastRange.getLower()), "Intervals are not in order",
-                Range);
-        Assert1(!isContiguous(CurRange, LastRange), "Intervals are contiguous",
-                Range);
-      }
-      LastRange = ConstantRange(LowV, HighV);
-    }
-    if (NumRanges > 2) {
-      APInt FirstLow =
-        dyn_cast<ConstantInt>(Range->getOperand(0))->getValue();
-      APInt FirstHigh =
-        dyn_cast<ConstantInt>(Range->getOperand(1))->getValue();
-      ConstantRange FirstRange(FirstLow, FirstHigh);
-      Assert1(FirstRange.intersectWith(LastRange).isEmptySet(),
-              "Intervals are overlapping", Range);
-      Assert1(!isContiguous(FirstRange, LastRange), "Intervals are contiguous",
-              Range);
-    }
-
-
-  }
-
   visitInstruction(LI);
 }
 
@@ -1967,6 +1979,8 @@
   Assert2(ElTy == SI.getOperand(0)->getType(),
           "Stored value type does not match pointer operand type!",
           &SI, ElTy);
+  Assert1(SI.getAlignment() <= Value::MaximumAlignment,
+          "huge alignment values are unsupported", &SI);
   if (SI.isAtomic()) {
     Assert1(SI.getOrdering() != Acquire && SI.getOrdering() != AcquireRelease,
             "Store cannot have Acquire ordering", &SI);
@@ -1998,6 +2012,8 @@
           &AI);
   Assert1(AI.getArraySize()->getType()->isIntegerTy(),
           "Alloca array size must have integer type", &AI);
+  Assert1(AI.getAlignment() <= Value::MaximumAlignment,
+          "huge alignment values are unsupported", &AI);
 
   visitInstruction(AI);
 }
@@ -2207,11 +2223,15 @@
     if (Function *F = dyn_cast<Function>(I.getOperand(i))) {
       // Check to make sure that the "address of" an intrinsic function is never
       // taken.
-      Assert1(!F->isIntrinsic() || i == (isa<CallInst>(I) ? e-1 : 0),
+      Assert1(!F->isIntrinsic() || i == (isa<CallInst>(I) ? e-1 :
+                                         isa<InvokeInst>(I) ? e-3 : 0),
               "Cannot take the address of an intrinsic!", &I);
       Assert1(!F->isIntrinsic() || isa<CallInst>(I) ||
-              F->getIntrinsicID() == Intrinsic::donothing,
-              "Cannot invoke an intrinsinc other than donothing", &I);
+              F->getIntrinsicID() == Intrinsic::donothing ||
+              F->getIntrinsicID() == Intrinsic::experimental_patchpoint_void ||
+              F->getIntrinsicID() == Intrinsic::experimental_patchpoint_i64,
+              "Cannot invoke an intrinsinc other than"
+              " donothing or patchpoint", &I);
       Assert1(F->getParent() == M, "Referencing function in another module!",
               &I);
     } else if (BasicBlock *OpBB = dyn_cast<BasicBlock>(I.getOperand(i))) {
@@ -2239,7 +2259,7 @@
 
         while (!Stack.empty()) {
           const ConstantExpr *V = Stack.pop_back_val();
-          if (!Visited.insert(V))
+          if (!Visited.insert(V).second)
             continue;
 
           VerifyConstantExprBitcastType(V);
@@ -2267,9 +2287,19 @@
     }
   }
 
-  MDNode *MD = I.getMetadata(LLVMContext::MD_range);
-  Assert1(!MD || isa<LoadInst>(I) || isa<CallInst>(I) || isa<InvokeInst>(I),
-          "Ranges are only for loads, calls and invokes!", &I);
+  if (MDNode *Range = I.getMetadata(LLVMContext::MD_range)) {
+    Assert1(isa<LoadInst>(I) || isa<CallInst>(I) || isa<InvokeInst>(I),
+            "Ranges are only for loads, calls and invokes!", &I);
+    visitRangeMetadata(I, Range, I.getType());
+  }
+
+  if (I.getMetadata(LLVMContext::MD_nonnull)) {
+    Assert1(I.getType()->isPointerTy(),
+            "nonnull applies only to pointer types", &I);
+    Assert1(isa<LoadInst>(I),
+            "nonnull applies only to load instructions, use attributes"
+            " for calls or invokes", &I);
+  }
 
   InstsInThisBlock.insert(&I);
 }
@@ -2608,7 +2638,7 @@
 
   bool Broken = false;
   for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
-    if (!I->isDeclaration())
+    if (!I->isDeclaration() && !I->isMaterializable())
       Broken |= !V.verify(*I);
 
   // Note that this function's return value is inverted from what you would

diff --git a/lib/IRReader/IRReader.cpp b/lib/IRReader/IRReader.cpp
index f8d2f5a..7bc6f07 100644
--- a/lib/IRReader/IRReader.cpp
+++ b/lib/IRReader/IRReader.cpp

@@ -29,28 +29,27 @@
 static const char *const TimeIRParsingGroupName = "LLVM IR Parsing";
 static const char *const TimeIRParsingName = "Parse IR";
 
-static Module *getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err,
-                               LLVMContext &Context) {
+static std::unique_ptr<Module>
+getLazyIRModule(std::unique_ptr<MemoryBuffer> Buffer, SMDiagnostic &Err,
+                LLVMContext &Context) {
   if (isBitcode((const unsigned char *)Buffer->getBufferStart(),
                 (const unsigned char *)Buffer->getBufferEnd())) {
-    std::string ErrMsg;
-    ErrorOr<Module *> ModuleOrErr = getLazyBitcodeModule(Buffer, Context);
+    ErrorOr<Module *> ModuleOrErr =
+        getLazyBitcodeModule(std::move(Buffer), Context);
     if (std::error_code EC = ModuleOrErr.getError()) {
       Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error,
                          EC.message());
-      // getLazyBitcodeModule does not take ownership of the Buffer in the
-      // case of an error.
-      delete Buffer;
       return nullptr;
     }
-    return ModuleOrErr.get();
+    return std::unique_ptr<Module>(ModuleOrErr.get());
   }
 
-  return ParseAssembly(Buffer, nullptr, Err, Context);
+  return parseAssembly(Buffer->getMemBufferRef(), Err, Context);
 }
 
-Module *llvm::getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err,
-                                  LLVMContext &Context) {
+std::unique_ptr<Module> llvm::getLazyIRFileModule(StringRef Filename,
+                                                  SMDiagnostic &Err,
+                                                  LLVMContext &Context) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFileOrSTDIN(Filename);
   if (std::error_code EC = FileOrErr.getError()) {
@@ -59,33 +58,29 @@
     return nullptr;
   }
 
-  return getLazyIRModule(FileOrErr.get().release(), Err, Context);
+  return getLazyIRModule(std::move(FileOrErr.get()), Err, Context);
 }
 
-Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err,
-                      LLVMContext &Context) {
+std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
+                                      LLVMContext &Context) {
   NamedRegionTimer T(TimeIRParsingName, TimeIRParsingGroupName,
                      TimePassesIsEnabled);
-  if (isBitcode((const unsigned char *)Buffer->getBufferStart(),
-                (const unsigned char *)Buffer->getBufferEnd())) {
+  if (isBitcode((const unsigned char *)Buffer.getBufferStart(),
+                (const unsigned char *)Buffer.getBufferEnd())) {
     ErrorOr<Module *> ModuleOrErr = parseBitcodeFile(Buffer, Context);
-    Module *M = nullptr;
-    if (std::error_code EC = ModuleOrErr.getError())
-      Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error,
+    if (std::error_code EC = ModuleOrErr.getError()) {
+      Err = SMDiagnostic(Buffer.getBufferIdentifier(), SourceMgr::DK_Error,
                          EC.message());
-    else
-      M = ModuleOrErr.get();
-    // parseBitcodeFile does not take ownership of the Buffer.
-    return M;
+      return nullptr;
+    }
+    return std::unique_ptr<Module>(ModuleOrErr.get());
   }
 
-  return ParseAssembly(MemoryBuffer::getMemBuffer(
-                           Buffer->getBuffer(), Buffer->getBufferIdentifier()),
-                       nullptr, Err, Context);
+  return parseAssembly(Buffer, Err, Context);
 }
 
-Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err,
-                          LLVMContext &Context) {
+std::unique_ptr<Module> llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err,
+                                          LLVMContext &Context) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFileOrSTDIN(Filename);
   if (std::error_code EC = FileOrErr.getError()) {
@@ -94,7 +89,7 @@
     return nullptr;
   }
 
-  return ParseIR(FileOrErr.get().get(), Err, Context);
+  return parseIR(FileOrErr.get()->getMemBufferRef(), Err, Context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -107,7 +102,8 @@
   SMDiagnostic Diag;
 
   std::unique_ptr<MemoryBuffer> MB(unwrap(MemBuf));
-  *OutM = wrap(ParseIR(MB.get(), Diag, *unwrap(ContextRef)));
+  *OutM =
+      wrap(parseIR(MB->getMemBufferRef(), Diag, *unwrap(ContextRef)).release());
 
   if(!*OutM) {
     if (OutMessage) {

diff --git a/lib/LTO/LLVMBuild.txt b/lib/LTO/LLVMBuild.txt
index 29ed92c..b9178e9 100644
--- a/lib/LTO/LLVMBuild.txt
+++ b/lib/LTO/LLVMBuild.txt

@@ -19,4 +19,4 @@
 type = Library
 name = LTO
 parent = Libraries
-required_libraries = BitReader BitWriter Core IPA IPO InstCombine Linker MC MCParser ObjCARC Object Scalar Support Target TransformUtils
+required_libraries = BitReader BitWriter Core IPA IPO InstCombine Linker MC ObjCARC Object Scalar Support Target TransformUtils CodeGen

diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 335197a..c663d43 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp

@@ -48,6 +48,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/ObjCARC.h"
@@ -63,18 +64,30 @@
 }
 
 LTOCodeGenerator::LTOCodeGenerator()
-    : Context(getGlobalContext()), IRLinker(new Module("ld-temp.o", Context)),
-      TargetMach(nullptr), EmitDwarfDebugInfo(false),
-      ScopeRestrictionsDone(false), CodeModel(LTO_CODEGEN_PIC_MODEL_DEFAULT),
-      NativeObjectFile(nullptr), DiagHandler(nullptr), DiagContext(nullptr) {
+    : Context(getGlobalContext()), IRLinker(new Module("ld-temp.o", Context)) {
+  initialize();
+}
+
+LTOCodeGenerator::LTOCodeGenerator(std::unique_ptr<LLVMContext> Context)
+    : OwnedContext(std::move(Context)), Context(*OwnedContext),
+      IRLinker(new Module("ld-temp.o", *OwnedContext)) {
+  initialize();
+}
+
+void LTOCodeGenerator::initialize() {
+  TargetMach = nullptr;
+  EmitDwarfDebugInfo = false;
+  ScopeRestrictionsDone = false;
+  CodeModel = LTO_CODEGEN_PIC_MODEL_DEFAULT;
+  DiagHandler = nullptr;
+  DiagContext = nullptr;
+
   initializeLTOPasses();
 }
 
 LTOCodeGenerator::~LTOCodeGenerator() {
   delete TargetMach;
-  delete NativeObjectFile;
   TargetMach = nullptr;
-  NativeObjectFile = nullptr;
 
   IRLinker.deleteModule();
 
@@ -107,14 +120,18 @@
   initializeFunctionAttrsPass(R);
   initializeGlobalsModRefPass(R);
   initializeLICMPass(R);
+  initializeMergedLoadStoreMotionPass(R);
   initializeGVNPass(R);
   initializeMemCpyOptPass(R);
   initializeDCEPass(R);
   initializeCFGSimplifyPassPass(R);
 }
 
-bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) {
-  bool ret = IRLinker.linkInModule(&mod->getModule(), &errMsg);
+bool LTOCodeGenerator::addModule(LTOModule *mod) {
+  assert(&mod->getModule().getContext() == &Context &&
+         "Expected module in same context");
+
+  bool ret = IRLinker.linkInModule(&mod->getModule());
 
   const std::vector<const char*> &undefs = mod->getAsmUndefinedRefs();
   for (int i = 0, e = undefs.size(); i != e; ++i)
@@ -161,9 +178,9 @@
   applyScopeRestrictions();
 
   // create output file
-  std::string ErrInfo;
-  tool_output_file Out(path, ErrInfo, sys::fs::F_None);
-  if (!ErrInfo.empty()) {
+  std::error_code EC;
+  tool_output_file Out(path, EC, sys::fs::F_None);
+  if (EC) {
     errMsg = "could not open bitcode file for writing: ";
     errMsg += path;
     return false;
@@ -188,6 +205,7 @@
                                        bool disableOpt,
                                        bool disableInline,
                                        bool disableGVNLoadPRE,
+                                       bool disableVectorization,
                                        std::string& errMsg) {
   // make unique temp .o file to put generated object file
   SmallString<128> Filename;
@@ -202,8 +220,9 @@
   // generate object file
   tool_output_file objFile(Filename.c_str(), FD);
 
-  bool genResult = generateObjectFile(objFile.os(), disableOpt, disableInline,
-                                      disableGVNLoadPRE, errMsg);
+  bool genResult =
+      generateObjectFile(objFile.os(), disableOpt, disableInline,
+                         disableGVNLoadPRE, disableVectorization, errMsg);
   objFile.os().close();
   if (objFile.os().has_error()) {
     objFile.os().clear_error();
@@ -226,15 +245,13 @@
                                       bool disableOpt,
                                       bool disableInline,
                                       bool disableGVNLoadPRE,
+                                      bool disableVectorization,
                                       std::string& errMsg) {
   const char *name;
   if (!compile_to_file(&name, disableOpt, disableInline, disableGVNLoadPRE,
-                       errMsg))
+                       disableVectorization, errMsg))
     return nullptr;
 
-  // remove old buffer if compile() called twice
-  delete NativeObjectFile;
-
   // read .o file into memory buffer
   ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
       MemoryBuffer::getFile(name, -1, false);
@@ -243,7 +260,7 @@
     sys::fs::remove(NativeObjectPath);
     return nullptr;
   }
-  NativeObjectFile = BufferOrErr.get().release();
+  NativeObjectFile = std::move(*BufferOrErr);
 
   // remove temp files
   sys::fs::remove(NativeObjectPath);
@@ -298,8 +315,7 @@
       MCpu = "core2";
     else if (Triple.getArch() == llvm::Triple::x86)
       MCpu = "yonah";
-    else if (Triple.getArch() == llvm::Triple::arm64 ||
-             Triple.getArch() == llvm::Triple::aarch64)
+    else if (Triple.getArch() == llvm::Triple::aarch64)
       MCpu = "cyclone";
   }
 
@@ -311,9 +327,9 @@
 
 void LTOCodeGenerator::
 applyRestriction(GlobalValue &GV,
-                 const ArrayRef<StringRef> &Libcalls,
+                 ArrayRef<StringRef> Libcalls,
                  std::vector<const char*> &MustPreserveList,
-                 SmallPtrSet<GlobalValue*, 8> &AsmUsed,
+                 SmallPtrSetImpl<GlobalValue*> &AsmUsed,
                  Mangler &Mangler) {
   // There are no restrictions to apply to declarations.
   if (GV.isDeclaration())
@@ -342,7 +358,7 @@
 }
 
 static void findUsedValues(GlobalVariable *LLVMUsed,
-                           SmallPtrSet<GlobalValue*, 8> &UsedValues) {
+                           SmallPtrSetImpl<GlobalValue*> &UsedValues) {
   if (!LLVMUsed) return;
 
   ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
@@ -390,12 +406,13 @@
   passes.add(createDebugInfoVerifierPass());
 
   // mark which symbols can not be internalized
-  Mangler Mangler(TargetMach->getDataLayout());
+  Mangler Mangler(TargetMach->getSubtargetImpl()->getDataLayout());
   std::vector<const char*> MustPreserveList;
   SmallPtrSet<GlobalValue*, 8> AsmUsed;
   std::vector<StringRef> Libcalls;
   TargetLibraryInfo TLI(Triple(TargetMach->getTargetTriple()));
-  accumulateAndSortLibcalls(Libcalls, TLI, TargetMach->getTargetLowering());
+  accumulateAndSortLibcalls(
+      Libcalls, TLI, TargetMach->getSubtargetImpl()->getTargetLowering());
 
   for (Module::iterator f = mergedModule->begin(),
          e = mergedModule->end(); f != e; ++f)
@@ -444,6 +461,7 @@
                                           bool DisableOpt,
                                           bool DisableInline,
                                           bool DisableGVNLoadPRE,
+                                          bool DisableVectorization,
                                           std::string &errMsg) {
   if (!this->determineTarget(errMsg))
     return false;
@@ -456,35 +474,27 @@
   // Instantiate the pass manager to organize the passes.
   PassManager passes;
 
-  // Start off with a verification pass.
-  passes.add(createVerifierPass());
-  passes.add(createDebugInfoVerifierPass());
-
   // Add an appropriate DataLayout instance for this module...
-  mergedModule->setDataLayout(TargetMach->getDataLayout());
-  passes.add(new DataLayoutPass(mergedModule));
+  mergedModule->setDataLayout(TargetMach->getSubtargetImpl()->getDataLayout());
 
-  // Add appropriate TargetLibraryInfo for this module.
-  passes.add(new TargetLibraryInfo(Triple(TargetMach->getTargetTriple())));
+  Triple TargetTriple(TargetMach->getTargetTriple());
+  PassManagerBuilder PMB;
+  PMB.DisableGVNLoadPRE = DisableGVNLoadPRE;
+  PMB.LoopVectorize = !DisableVectorization;
+  PMB.SLPVectorize = !DisableVectorization;
+  if (!DisableInline)
+    PMB.Inliner = createFunctionInliningPass();
+  PMB.LibraryInfo = new TargetLibraryInfo(TargetTriple);
+  if (DisableOpt)
+    PMB.OptLevel = 0;
+  PMB.VerifyInput = true;
+  PMB.VerifyOutput = true;
 
-  TargetMach->addAnalysisPasses(passes);
-
-  // Enabling internalize here would use its AllButMain variant. It
-  // keeps only main if it exists and does nothing for libraries. Instead
-  // we create the pass ourselves with the symbol list provided by the linker.
-  if (!DisableOpt)
-    PassManagerBuilder().populateLTOPassManager(passes,
-                                              /*Internalize=*/false,
-                                              !DisableInline,
-                                              DisableGVNLoadPRE);
-
-  // Make sure everything is still good.
-  passes.add(createVerifierPass());
-  passes.add(createDebugInfoVerifierPass());
+  PMB.populateLTOPassManager(passes, TargetMach);
 
   PassManager codeGenPasses;
 
-  codeGenPasses.add(new DataLayoutPass(mergedModule));
+  codeGenPasses.add(new DataLayoutPass());
 
   formatted_raw_ostream Out(out);
 
@@ -571,5 +581,6 @@
     return Context.setDiagnosticHandler(nullptr, nullptr);
   // Register the LTOCodeGenerator stub in the LLVMContext to forward the
   // diagnostic to the external DiagHandler.
-  Context.setDiagnosticHandler(LTOCodeGenerator::DiagnosticHandler, this);
+  Context.setDiagnosticHandler(LTOCodeGenerator::DiagnosticHandler, this,
+                               /* RespectFilters */ true);
 }

diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
index 844c0f2..4108ef2 100644
--- a/lib/LTO/LTOModule.cpp
+++ b/lib/LTO/LTOModule.cpp

@@ -15,6 +15,7 @@
 #include "llvm/LTO/LTOModule.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -28,6 +29,8 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Host.h"
@@ -39,32 +42,51 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
 #include <system_error>
 using namespace llvm;
+using namespace llvm::object;
 
 LTOModule::LTOModule(std::unique_ptr<object::IRObjectFile> Obj,
                      llvm::TargetMachine *TM)
     : IRFile(std::move(Obj)), _target(TM) {}
 
+LTOModule::LTOModule(std::unique_ptr<object::IRObjectFile> Obj,
+                     llvm::TargetMachine *TM,
+                     std::unique_ptr<LLVMContext> Context)
+    : OwnedContext(std::move(Context)), IRFile(std::move(Obj)), _target(TM) {}
+
+LTOModule::~LTOModule() {}
+
 /// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM
 /// bitcode.
-bool LTOModule::isBitcodeFile(const void *mem, size_t length) {
-  return sys::fs::identify_magic(StringRef((const char *)mem, length)) ==
-         sys::fs::file_magic::bitcode;
+bool LTOModule::isBitcodeFile(const void *Mem, size_t Length) {
+  ErrorOr<MemoryBufferRef> BCData = IRObjectFile::findBitcodeInMemBuffer(
+      MemoryBufferRef(StringRef((const char *)Mem, Length), "<mem>"));
+  return bool(BCData);
 }
 
-bool LTOModule::isBitcodeFile(const char *path) {
-  sys::fs::file_magic type;
-  if (sys::fs::identify_magic(path, type))
+bool LTOModule::isBitcodeFile(const char *Path) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
+      MemoryBuffer::getFile(Path);
+  if (!BufferOrErr)
     return false;
-  return type == sys::fs::file_magic::bitcode;
+
+  ErrorOr<MemoryBufferRef> BCData = IRObjectFile::findBitcodeInMemBuffer(
+      BufferOrErr.get()->getMemBufferRef());
+  return bool(BCData);
 }
 
-bool LTOModule::isBitcodeForTarget(MemoryBuffer *buffer,
-                                   StringRef triplePrefix) {
-  std::string Triple = getBitcodeTargetTriple(buffer, getGlobalContext());
-  return StringRef(Triple).startswith(triplePrefix);
+bool LTOModule::isBitcodeForTarget(MemoryBuffer *Buffer,
+                                   StringRef TriplePrefix) {
+  ErrorOr<MemoryBufferRef> BCOrErr =
+      IRObjectFile::findBitcodeInMemBuffer(Buffer->getMemBufferRef());
+  if (!BCOrErr)
+    return false;
+  LLVMContext Context;
+  std::string Triple = getBitcodeTargetTriple(*BCOrErr, Context);
+  return StringRef(Triple).startswith(TriplePrefix);
 }
 
 LTOModule *LTOModule::createFromFile(const char *path, TargetOptions options,
@@ -75,7 +97,9 @@
     errMsg = EC.message();
     return nullptr;
   }
-  return makeLTOModule(std::move(BufferOrErr.get()), options, errMsg);
+  std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
+  return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg,
+                       &getGlobalContext());
 }
 
 LTOModule *LTOModule::createFromOpenFile(int fd, const char *path, size_t size,
@@ -94,23 +118,50 @@
     errMsg = EC.message();
     return nullptr;
   }
-  return makeLTOModule(std::move(BufferOrErr.get()), options, errMsg);
+  std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
+  return makeLTOModule(Buffer->getMemBufferRef(), options, errMsg,
+                       &getGlobalContext());
 }
 
 LTOModule *LTOModule::createFromBuffer(const void *mem, size_t length,
                                        TargetOptions options,
                                        std::string &errMsg, StringRef path) {
-  std::unique_ptr<MemoryBuffer> buffer(makeBuffer(mem, length, path));
-  if (!buffer)
-    return nullptr;
-  return makeLTOModule(std::move(buffer), options, errMsg);
+  return createInContext(mem, length, options, errMsg, path,
+                         &getGlobalContext());
 }
 
-LTOModule *LTOModule::makeLTOModule(std::unique_ptr<MemoryBuffer> Buffer,
-                                    TargetOptions options,
-                                    std::string &errMsg) {
-  ErrorOr<Module *> MOrErr =
-      getLazyBitcodeModule(Buffer.get(), getGlobalContext());
+LTOModule *LTOModule::createInLocalContext(const void *mem, size_t length,
+                                           TargetOptions options,
+                                           std::string &errMsg,
+                                           StringRef path) {
+  return createInContext(mem, length, options, errMsg, path, nullptr);
+}
+
+LTOModule *LTOModule::createInContext(const void *mem, size_t length,
+                                      TargetOptions options,
+                                      std::string &errMsg, StringRef path,
+                                      LLVMContext *Context) {
+  StringRef Data((const char *)mem, length);
+  MemoryBufferRef Buffer(Data, path);
+  return makeLTOModule(Buffer, options, errMsg, Context);
+}
+
+LTOModule *LTOModule::makeLTOModule(MemoryBufferRef Buffer,
+                                    TargetOptions options, std::string &errMsg,
+                                    LLVMContext *Context) {
+  std::unique_ptr<LLVMContext> OwnedContext;
+  if (!Context) {
+    OwnedContext = llvm::make_unique<LLVMContext>();
+    Context = OwnedContext.get();
+  }
+
+  ErrorOr<MemoryBufferRef> MBOrErr =
+      IRObjectFile::findBitcodeInMemBuffer(Buffer);
+  if (std::error_code EC = MBOrErr.getError()) {
+    errMsg = EC.message();
+    return nullptr;
+  }
+  ErrorOr<Module *> MOrErr = parseBitcodeFile(*MBOrErr, *Context);
   if (std::error_code EC = MOrErr.getError()) {
     errMsg = EC.message();
     return nullptr;
@@ -138,20 +189,22 @@
       CPU = "core2";
     else if (Triple.getArch() == llvm::Triple::x86)
       CPU = "yonah";
-    else if (Triple.getArch() == llvm::Triple::arm64 ||
-             Triple.getArch() == llvm::Triple::aarch64)
+    else if (Triple.getArch() == llvm::Triple::aarch64)
       CPU = "cyclone";
   }
 
   TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
                                                      options);
-  M->materializeAllPermanently(true);
-  M->setDataLayout(target->getDataLayout());
+  M->setDataLayout(target->getSubtargetImpl()->getDataLayout());
 
   std::unique_ptr<object::IRObjectFile> IRObj(
-      new object::IRObjectFile(std::move(Buffer), std::move(M)));
+      new object::IRObjectFile(Buffer, std::move(M)));
 
-  LTOModule *Ret = new LTOModule(std::move(IRObj), target);
+  LTOModule *Ret;
+  if (OwnedContext)
+    Ret = new LTOModule(std::move(IRObj), target, std::move(OwnedContext));
+  else
+    Ret = new LTOModule(std::move(IRObj), target);
 
   if (Ret->parseSymbols(errMsg)) {
     delete Ret;
@@ -164,8 +217,8 @@
 }
 
 /// Create a MemoryBuffer from a memory range with an optional name.
-MemoryBuffer *LTOModule::makeBuffer(const void *mem, size_t length,
-                                    StringRef name) {
+std::unique_ptr<MemoryBuffer>
+LTOModule::makeBuffer(const void *mem, size_t length, StringRef name) {
   const char *startPtr = (const char*)mem;
   return MemoryBuffer::getMemBuffer(StringRef(startPtr, length), name, false);
 }
@@ -196,27 +249,24 @@
   // second slot in __OBJC,__class is pointer to superclass name
   std::string superclassName;
   if (objcClassNameFromExpression(c->getOperand(1), superclassName)) {
-    NameAndAttributes info;
-    StringMap<NameAndAttributes>::value_type &entry =
-      _undefines.GetOrCreateValue(superclassName);
-    if (!entry.getValue().name) {
-      const char *symbolName = entry.getKey().data();
-      info.name = symbolName;
+    auto IterBool =
+        _undefines.insert(std::make_pair(superclassName, NameAndAttributes()));
+    if (IterBool.second) {
+      NameAndAttributes &info = IterBool.first->second;
+      info.name = IterBool.first->first().data();
       info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
       info.isFunction = false;
       info.symbol = clgv;
-      entry.setValue(info);
     }
   }
 
   // third slot in __OBJC,__class is pointer to class name
   std::string className;
   if (objcClassNameFromExpression(c->getOperand(2), className)) {
-    StringSet::value_type &entry = _defines.GetOrCreateValue(className);
-    entry.setValue(1);
+    auto Iter = _defines.insert(className).first;
 
     NameAndAttributes info;
-    info.name = entry.getKey().data();
+    info.name = Iter->first().data();
     info.attributes = LTO_SYMBOL_PERMISSIONS_DATA |
       LTO_SYMBOL_DEFINITION_REGULAR | LTO_SYMBOL_SCOPE_DEFAULT;
     info.isFunction = false;
@@ -235,19 +285,17 @@
   if (!objcClassNameFromExpression(c->getOperand(1), targetclassName))
     return;
 
-  NameAndAttributes info;
-  StringMap<NameAndAttributes>::value_type &entry =
-    _undefines.GetOrCreateValue(targetclassName);
+  auto IterBool =
+      _undefines.insert(std::make_pair(targetclassName, NameAndAttributes()));
 
-  if (entry.getValue().name)
+  if (!IterBool.second)
     return;
 
-  const char *symbolName = entry.getKey().data();
-  info.name = symbolName;
+  NameAndAttributes &info = IterBool.first->second;
+  info.name = IterBool.first->first().data();
   info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
   info.isFunction = false;
   info.symbol = clgv;
-  entry.setValue(info);
 }
 
 /// addObjCClassRef - Parse i386/ppc ObjC class list data structure.
@@ -256,18 +304,17 @@
   if (!objcClassNameFromExpression(clgv->getInitializer(), targetclassName))
     return;
 
-  NameAndAttributes info;
-  StringMap<NameAndAttributes>::value_type &entry =
-    _undefines.GetOrCreateValue(targetclassName);
-  if (entry.getValue().name)
+  auto IterBool =
+      _undefines.insert(std::make_pair(targetclassName, NameAndAttributes()));
+
+  if (!IterBool.second)
     return;
 
-  const char *symbolName = entry.getKey().data();
-  info.name = symbolName;
+  NameAndAttributes &info = IterBool.first->second;
+  info.name = IterBool.first->first().data();
   info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
   info.isFunction = false;
   info.symbol = clgv;
-  entry.setValue(info);
 }
 
 void LTOModule::addDefinedDataSymbol(const object::BasicSymbolRef &Sym) {
@@ -348,30 +395,6 @@
   addDefinedSymbol(Name, F, true);
 }
 
-static bool canBeHidden(const GlobalValue *GV) {
-  // FIXME: this is duplicated with another static function in AsmPrinter.cpp
-  GlobalValue::LinkageTypes L = GV->getLinkage();
-
-  if (L != GlobalValue::LinkOnceODRLinkage)
-    return false;
-
-  if (GV->hasUnnamedAddr())
-    return true;
-
-  // If it is a non constant variable, it needs to be uniqued across shared
-  // objects.
-  if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
-    if (!Var->isConstant())
-      return false;
-  }
-
-  GlobalStatus GS;
-  if (GlobalStatus::analyzeGlobal(GV, GS))
-    return false;
-
-  return !GS.IsCompared;
-}
-
 void LTOModule::addDefinedSymbol(const char *Name, const GlobalValue *def,
                                  bool isFunction) {
   // set alignment part log2() can have rounding errors
@@ -405,17 +428,16 @@
     attr |= LTO_SYMBOL_SCOPE_HIDDEN;
   else if (def->hasProtectedVisibility())
     attr |= LTO_SYMBOL_SCOPE_PROTECTED;
-  else if (canBeHidden(def))
+  else if (canBeOmittedFromSymbolTable(def))
     attr |= LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN;
   else
     attr |= LTO_SYMBOL_SCOPE_DEFAULT;
 
-  StringSet::value_type &entry = _defines.GetOrCreateValue(Name);
-  entry.setValue(1);
+  auto Iter = _defines.insert(Name).first;
 
   // fill information structure
   NameAndAttributes info;
-  StringRef NameRef = entry.getKey();
+  StringRef NameRef = Iter->first();
   info.name = NameRef.data();
   assert(info.name[NameRef.size()] == '\0');
   info.attributes = attr;
@@ -430,15 +452,13 @@
 /// defined list.
 void LTOModule::addAsmGlobalSymbol(const char *name,
                                    lto_symbol_attributes scope) {
-  StringSet::value_type &entry = _defines.GetOrCreateValue(name);
+  auto IterBool = _defines.insert(name);
 
   // only add new define if not already defined
-  if (entry.getValue())
+  if (!IterBool.second)
     return;
 
-  entry.setValue(1);
-
-  NameAndAttributes &info = _undefines[entry.getKey().data()];
+  NameAndAttributes &info = _undefines[IterBool.first->first().data()];
 
   if (info.symbol == nullptr) {
     // FIXME: This is trying to take care of module ASM like this:
@@ -450,7 +470,7 @@
     // much.
 
     // fill information structure
-    info.name = entry.getKey().data();
+    info.name = IterBool.first->first().data();
     info.attributes =
       LTO_SYMBOL_PERMISSIONS_DATA | LTO_SYMBOL_DEFINITION_REGULAR | scope;
     info.isFunction = false;
@@ -473,24 +493,21 @@
 /// addAsmGlobalSymbolUndef - Add a global symbol from module-level ASM to the
 /// undefined list.
 void LTOModule::addAsmGlobalSymbolUndef(const char *name) {
-  StringMap<NameAndAttributes>::value_type &entry =
-    _undefines.GetOrCreateValue(name);
+  auto IterBool = _undefines.insert(std::make_pair(name, NameAndAttributes()));
 
-  _asm_undefines.push_back(entry.getKey().data());
+  _asm_undefines.push_back(IterBool.first->first().data());
 
   // we already have the symbol
-  if (entry.getValue().name)
+  if (!IterBool.second)
     return;
 
   uint32_t attr = LTO_SYMBOL_DEFINITION_UNDEFINED;
   attr |= LTO_SYMBOL_SCOPE_DEFAULT;
-  NameAndAttributes info;
-  info.name = entry.getKey().data();
+  NameAndAttributes &info = IterBool.first->second;
+  info.name = IterBool.first->first().data();
   info.attributes = attr;
   info.isFunction = false;
   info.symbol = nullptr;
-
-  entry.setValue(info);
 }
 
 /// Add a symbol which isn't defined just yet to a list to be resolved later.
@@ -502,16 +519,15 @@
     Sym.printName(OS);
   }
 
-  StringMap<NameAndAttributes>::value_type &entry =
-    _undefines.GetOrCreateValue(name);
+  auto IterBool = _undefines.insert(std::make_pair(name, NameAndAttributes()));
 
   // we already have the symbol
-  if (entry.getValue().name)
+  if (!IterBool.second)
     return;
 
-  NameAndAttributes info;
+  NameAndAttributes &info = IterBool.first->second;
 
-  info.name = entry.getKey().data();
+  info.name = IterBool.first->first().data();
 
   const GlobalValue *decl = IRFile->getSymbolGV(Sym.getRawDataRefImpl());
 
@@ -522,8 +538,6 @@
 
   info.isFunction = isFunc;
   info.symbol = decl;
-
-  entry.setValue(info);
 }
 
 /// parseSymbols - Parse the symbols from the module and model-level ASM and add
@@ -596,10 +610,15 @@
       MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i));
       for (unsigned ii = 0, ie = MDOptions->getNumOperands(); ii != ie; ++ii) {
         MDString *MDOption = cast<MDString>(MDOptions->getOperand(ii));
-        StringRef Op = _linkeropt_strings.
-            GetOrCreateValue(MDOption->getString()).getKey();
-        StringRef DepLibName = _target->getTargetLowering()->
-            getObjFileLowering().getDepLibFromLinkerOpt(Op);
+        // FIXME: Make StringSet::insert match Self-Associative Container
+        // requirements, returning <iter,bool> rather than bool, and use that
+        // here.
+        StringRef Op =
+            _linkeropt_strings.insert(MDOption->getString()).first->first();
+        StringRef DepLibName = _target->getSubtargetImpl()
+                                   ->getTargetLowering()
+                                   ->getObjFileLowering()
+                                   .getDepLibFromLinkerOpt(Op);
         if (!DepLibName.empty())
           _deplibs.push_back(DepLibName.data());
         else if (!Op.empty())

diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 5bb2862..8321bcf 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp

@@ -17,6 +17,9 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/TypeFinder.h"
 #include "llvm/Support/CommandLine.h"
@@ -33,61 +36,58 @@
 //===----------------------------------------------------------------------===//
 
 namespace {
-  typedef SmallPtrSet<StructType*, 32> TypeSet;
+typedef SmallPtrSet<StructType *, 32> TypeSet;
 
 class TypeMapTy : public ValueMapTypeRemapper {
-  /// MappedTypes - This is a mapping from a source type to a destination type
-  /// to use.
+  /// This is a mapping from a source type to a destination type to use.
   DenseMap<Type*, Type*> MappedTypes;
 
-  /// SpeculativeTypes - When checking to see if two subgraphs are isomorphic,
-  /// we speculatively add types to MappedTypes, but keep track of them here in
-  /// case we need to roll back.
+  /// When checking to see if two subgraphs are isomorphic, we speculatively
+  /// add types to MappedTypes, but keep track of them here in case we need to
+  /// roll back.
   SmallVector<Type*, 16> SpeculativeTypes;
 
-  /// SrcDefinitionsToResolve - This is a list of non-opaque structs in the
-  /// source module that are mapped to an opaque struct in the destination
-  /// module.
+  /// This is a list of non-opaque structs in the source module that are mapped
+  /// to an opaque struct in the destination module.
   SmallVector<StructType*, 16> SrcDefinitionsToResolve;
 
-  /// DstResolvedOpaqueTypes - This is the set of opaque types in the
-  /// destination modules who are getting a body from the source module.
+  /// This is the set of opaque types in the destination modules that are
+  /// getting a body from the source module.
   SmallPtrSet<StructType*, 16> DstResolvedOpaqueTypes;
 
 public:
   TypeMapTy(TypeSet &Set) : DstStructTypesSet(Set) {}
 
   TypeSet &DstStructTypesSet;
-  /// addTypeMapping - Indicate that the specified type in the destination
-  /// module is conceptually equivalent to the specified type in the source
-  /// module.
+  /// Indicate that the specified type in the destination module is conceptually
+  /// equivalent to the specified type in the source module.
   void addTypeMapping(Type *DstTy, Type *SrcTy);
 
   /// linkDefinedTypeBodies - Produce a body for an opaque type in the dest
   /// module from a type definition in the source module.
   void linkDefinedTypeBodies();
 
-  /// get - Return the mapped type to use for the specified input type from the
+  /// Return the mapped type to use for the specified input type from the
   /// source module.
   Type *get(Type *SrcTy);
 
   FunctionType *get(FunctionType *T) {return cast<FunctionType>(get((Type*)T));}
 
-  /// dump - Dump out the type map for debugging purposes.
+  /// Dump out the type map for debugging purposes.
   void dump() const {
     for (DenseMap<Type*, Type*>::const_iterator
            I = MappedTypes.begin(), E = MappedTypes.end(); I != E; ++I) {
       dbgs() << "TypeMap: ";
-      I->first->dump();
+      I->first->print(dbgs());
       dbgs() << " => ";
-      I->second->dump();
+      I->second->print(dbgs());
       dbgs() << '\n';
     }
   }
 
 private:
   Type *getImpl(Type *T);
-  /// remapType - Implement the ValueMapTypeRemapper interface.
+  /// Implement the ValueMapTypeRemapper interface.
   Type *remapType(Type *SrcTy) override {
     return get(SrcTy);
   }
@@ -116,8 +116,8 @@
   SpeculativeTypes.clear();
 }
 
-/// areTypesIsomorphic - Recursively walk this pair of types, returning true
-/// if they are isomorphic, false if they are not.
+/// Recursively walk this pair of types, returning true if they are isomorphic,
+/// false if they are not.
 bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
   // Two types with differing kinds are clearly not isomorphic.
   if (DstTy->getTypeID() != SrcTy->getTypeID()) return false;
@@ -152,7 +152,7 @@
     // same opaque type then we fail.
     if (cast<StructType>(DstTy)->isOpaque()) {
       // We can only map one source type onto the opaque destination type.
-      if (!DstResolvedOpaqueTypes.insert(cast<StructType>(DstTy)))
+      if (!DstResolvedOpaqueTypes.insert(cast<StructType>(DstTy)).second)
         return false;
       SrcDefinitionsToResolve.push_back(SSTy);
       Entry = DstTy;
@@ -201,8 +201,8 @@
   return true;
 }
 
-/// linkDefinedTypeBodies - Produce a body for an opaque type in the dest
-/// module from a type definition in the source module.
+/// Produce a body for an opaque type in the dest module from a type definition
+/// in the source module.
 void TypeMapTy::linkDefinedTypeBodies() {
   SmallVector<Type*, 16> Elements;
   SmallString<16> TmpName;
@@ -242,8 +242,6 @@
   DstResolvedOpaqueTypes.clear();
 }
 
-/// get - Return the mapped type to use for the specified input type from the
-/// source module.
 Type *TypeMapTy::get(Type *Ty) {
   Type *Result = getImpl(Ty);
 
@@ -253,7 +251,7 @@
   return Result;
 }
 
-/// getImpl - This is the recursive version of get().
+/// This is the recursive version of get().
 Type *TypeMapTy::getImpl(Type *Ty) {
   // If we already have an entry for this type, return it.
   Type **Entry = &MappedTypes[Ty];
@@ -359,9 +357,9 @@
 namespace {
   class ModuleLinker;
 
-  /// ValueMaterializerTy - Creates prototypes for functions that are lazily
-  /// linked on the fly. This speeds up linking for modules with many
-  /// lazily linked functions of which few get used.
+  /// Creates prototypes for functions that are lazily linked on the fly. This
+  /// speeds up linking for modules with many lazily linked functions of which
+  /// few get used.
   class ValueMaterializerTy : public ValueMaterializer {
     TypeMapTy &TypeMap;
     Module *DstM;
@@ -376,57 +374,73 @@
     Value *materializeValueFor(Value *V) override;
   };
 
-  /// ModuleLinker - This is an implementation class for the LinkModules
-  /// function, which is the entrypoint for this file.
+  namespace {
+  class LinkDiagnosticInfo : public DiagnosticInfo {
+    const Twine &Msg;
+
+  public:
+    LinkDiagnosticInfo(DiagnosticSeverity Severity, const Twine &Msg);
+    void print(DiagnosticPrinter &DP) const override;
+  };
+  LinkDiagnosticInfo::LinkDiagnosticInfo(DiagnosticSeverity Severity,
+                                         const Twine &Msg)
+      : DiagnosticInfo(DK_Linker, Severity), Msg(Msg) {}
+  void LinkDiagnosticInfo::print(DiagnosticPrinter &DP) const { DP << Msg; }
+  }
+
+  /// This is an implementation class for the LinkModules function, which is the
+  /// entrypoint for this file.
   class ModuleLinker {
     Module *DstM, *SrcM;
 
     TypeMapTy TypeMap;
     ValueMaterializerTy ValMaterializer;
 
-    /// ValueMap - Mapping of values from what they used to be in Src, to what
-    /// they are now in DstM.  ValueToValueMapTy is a ValueMap, which involves
-    /// some overhead due to the use of Value handles which the Linker doesn't
-    /// actually need, but this allows us to reuse the ValueMapper code.
+    /// Mapping of values from what they used to be in Src, to what they are now
+    /// in DstM.  ValueToValueMapTy is a ValueMap, which involves some overhead
+    /// due to the use of Value handles which the Linker doesn't actually need,
+    /// but this allows us to reuse the ValueMapper code.
     ValueToValueMapTy ValueMap;
 
     struct AppendingVarInfo {
-      GlobalVariable *NewGV;  // New aggregate global in dest module.
-      Constant *DstInit;      // Old initializer from dest module.
-      Constant *SrcInit;      // Old initializer from src module.
+      GlobalVariable *NewGV;   // New aggregate global in dest module.
+      const Constant *DstInit; // Old initializer from dest module.
+      const Constant *SrcInit; // Old initializer from src module.
     };
 
     std::vector<AppendingVarInfo> AppendingVars;
 
-    unsigned Mode; // Mode to treat source module.
-
     // Set of items not to link in from source.
     SmallPtrSet<const Value*, 16> DoNotLinkFromSource;
 
     // Vector of functions to lazily link in.
     std::vector<Function*> LazilyLinkFunctions;
 
-    bool SuppressWarnings;
+    Linker::DiagnosticHandlerFunction DiagnosticHandler;
 
   public:
-    std::string ErrorMsg;
-
-    ModuleLinker(Module *dstM, TypeSet &Set, Module *srcM, unsigned mode,
-                 bool SuppressWarnings=false)
+    ModuleLinker(Module *dstM, TypeSet &Set, Module *srcM,
+                 Linker::DiagnosticHandlerFunction DiagnosticHandler)
         : DstM(dstM), SrcM(srcM), TypeMap(Set),
-          ValMaterializer(TypeMap, DstM, LazilyLinkFunctions), Mode(mode),
-          SuppressWarnings(SuppressWarnings) {}
+          ValMaterializer(TypeMap, DstM, LazilyLinkFunctions),
+          DiagnosticHandler(DiagnosticHandler) {}
 
     bool run();
 
   private:
-    /// emitError - Helper method for setting a message and returning an error
-    /// code.
+    bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest,
+                              const GlobalValue &Src);
+
+    /// Helper method for reporting an error diagnostic; always returns true.
     bool emitError(const Twine &Message) {
-      ErrorMsg = Message.str();
+      DiagnosticHandler(LinkDiagnosticInfo(DS_Error, Message));
       return true;
     }
 
+    void emitWarning(const Twine &Message) {
+      DiagnosticHandler(LinkDiagnosticInfo(DS_Warning, Message));
+    }
+
     bool getComdatLeader(Module *M, StringRef ComdatName,
                          const GlobalVariable *&GVar);
     bool computeResultingSelectionKind(StringRef ComdatName,
@@ -439,16 +453,9 @@
     bool getComdatResult(const Comdat *SrcC, Comdat::SelectionKind &SK,
                          bool &LinkFromSrc);
 
-    /// getLinkageResult - This analyzes the two global values and determines
-    /// what the result will look like in the destination module.
-    bool getLinkageResult(GlobalValue *Dest, const GlobalValue *Src,
-                          GlobalValue::LinkageTypes &LT,
-                          GlobalValue::VisibilityTypes &Vis,
-                          bool &LinkFromSrc);
-
-    /// getLinkedToGlobal - Given a global in the source module, return the
-    /// global in the destination module that is being linked to, if any.
-    GlobalValue *getLinkedToGlobal(GlobalValue *SrcGV) {
+    /// Given a global in the source module, return the global in the
+    /// destination module that is being linked to, if any.
+    GlobalValue *getLinkedToGlobal(const GlobalValue *SrcGV) {
       // If the source has no name it can't link.  If it has local linkage,
       // there is no name match-up going on.
       if (!SrcGV->hasName() || SrcGV->hasLocalLinkage())
@@ -469,10 +476,20 @@
 
     void computeTypeMapping();
 
-    bool linkAppendingVarProto(GlobalVariable *DstGV, GlobalVariable *SrcGV);
-    bool linkGlobalProto(GlobalVariable *SrcGV);
-    bool linkFunctionProto(Function *SrcF);
-    bool linkAliasProto(GlobalAlias *SrcA);
+    void upgradeMismatchedGlobalArray(StringRef Name);
+    void upgradeMismatchedGlobals();
+
+    bool linkAppendingVarProto(GlobalVariable *DstGV,
+                               const GlobalVariable *SrcGV);
+
+    bool linkGlobalValueProto(GlobalValue *GV);
+    GlobalValue *linkGlobalVariableProto(const GlobalVariable *SGVar,
+                                         GlobalValue *DGV, bool LinkFromSrc);
+    GlobalValue *linkFunctionProto(const Function *SF, GlobalValue *DGV,
+                                   bool LinkFromSrc);
+    GlobalValue *linkGlobalAliasProto(const GlobalAlias *SGA, GlobalValue *DGV,
+                                      bool LinkFromSrc);
+
     bool linkModuleFlagsMetadata();
 
     void linkAppendingVarInit(const AppendingVarInfo &AVI);
@@ -483,9 +500,9 @@
   };
 }
 
-/// forceRenaming - The LLVM SymbolTable class autorenames globals that conflict
-/// in the symbol table.  This is good for all clients except for us.  Go
-/// through the trouble to force this back.
+/// The LLVM SymbolTable class autorenames globals that conflict in the symbol
+/// table. This is good for all clients except for us. Go through the trouble
+/// to force this back.
 static void forceRenaming(GlobalValue *GV, StringRef Name) {
   // If the global doesn't force its name or if it already has the right name,
   // there is nothing for us to do.
@@ -504,8 +521,8 @@
   }
 }
 
-/// copyGVAttributes - copy additional attributes (those not needed to construct
-/// a GlobalValue) from the SrcGV to the DestGV.
+/// Copy additional attributes (those not needed to construct a GlobalValue)
+/// from the SrcGV to the DestGV.
 static void copyGVAttributes(GlobalValue *DestGV, const GlobalValue *SrcGV) {
   // Use the maximum alignment, rather than just copying the alignment of SrcGV.
   auto *DestGO = dyn_cast<GlobalObject>(DestGV);
@@ -543,6 +560,11 @@
                                   SF->getLinkage(), SF->getName(), DstM);
   copyGVAttributes(DF, SF);
 
+  if (Comdat *SC = SF->getComdat()) {
+    Comdat *DC = DstM->getOrInsertComdat(SC->getName());
+    DF->setComdat(DC);
+  }
+
   LazilyLinkFunctions.push_back(SF);
   return DF;
 }
@@ -644,99 +666,106 @@
 bool ModuleLinker::getComdatResult(const Comdat *SrcC,
                                    Comdat::SelectionKind &Result,
                                    bool &LinkFromSrc) {
+  Comdat::SelectionKind SSK = SrcC->getSelectionKind();
   StringRef ComdatName = SrcC->getName();
   Module::ComdatSymTabType &ComdatSymTab = DstM->getComdatSymbolTable();
   Module::ComdatSymTabType::iterator DstCI = ComdatSymTab.find(ComdatName);
-  if (DstCI != ComdatSymTab.end()) {
-    const Comdat *DstC = &DstCI->second;
-    Comdat::SelectionKind SSK = SrcC->getSelectionKind();
-    Comdat::SelectionKind DSK = DstC->getSelectionKind();
-    if (computeResultingSelectionKind(ComdatName, SSK, DSK, Result, LinkFromSrc))
-      return true;
+
+  if (DstCI == ComdatSymTab.end()) {
+    // Use the comdat if it is only available in one of the modules.
+    LinkFromSrc = true;
+    Result = SSK;
+    return false;
   }
-  return false;
+
+  const Comdat *DstC = &DstCI->second;
+  Comdat::SelectionKind DSK = DstC->getSelectionKind();
+  return computeResultingSelectionKind(ComdatName, SSK, DSK, Result,
+                                       LinkFromSrc);
 }
 
-/// getLinkageResult - This analyzes the two global values and determines what
-/// the result will look like in the destination module.  In particular, it
-/// computes the resultant linkage type and visibility, computes whether the
-/// global in the source should be copied over to the destination (replacing
-/// the existing one), and computes whether this linkage is an error or not.
-bool ModuleLinker::getLinkageResult(GlobalValue *Dest, const GlobalValue *Src,
-                                    GlobalValue::LinkageTypes &LT,
-                                    GlobalValue::VisibilityTypes &Vis,
-                                    bool &LinkFromSrc) {
-  assert(Dest && "Must have two globals being queried");
-  assert(!Src->hasLocalLinkage() &&
-         "If Src has internal linkage, Dest shouldn't be set!");
+bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
+                                        const GlobalValue &Dest,
+                                        const GlobalValue &Src) {
+  // We always have to add Src if it has appending linkage.
+  if (Src.hasAppendingLinkage()) {
+    LinkFromSrc = true;
+    return false;
+  }
 
-  bool SrcIsDeclaration = Src->isDeclaration() && !Src->isMaterializable();
-  bool DestIsDeclaration = Dest->isDeclaration();
+  bool SrcIsDeclaration = Src.isDeclarationForLinker();
+  bool DestIsDeclaration = Dest.isDeclarationForLinker();
 
   if (SrcIsDeclaration) {
     // If Src is external or if both Src & Dest are external..  Just link the
     // external globals, we aren't adding anything.
-    if (Src->hasDLLImportStorageClass()) {
+    if (Src.hasDLLImportStorageClass()) {
       // If one of GVs is marked as DLLImport, result should be dllimport'ed.
-      if (DestIsDeclaration) {
-        LinkFromSrc = true;
-        LT = Src->getLinkage();
-      }
-    } else if (Dest->hasExternalWeakLinkage()) {
-      // If the Dest is weak, use the source linkage.
-      LinkFromSrc = true;
-      LT = Src->getLinkage();
-    } else {
-      LinkFromSrc = false;
-      LT = Dest->getLinkage();
+      LinkFromSrc = DestIsDeclaration;
+      return false;
     }
-  } else if (DestIsDeclaration && !Dest->hasDLLImportStorageClass()) {
-    // If Dest is external but Src is not:
-    LinkFromSrc = true;
-    LT = Src->getLinkage();
-  } else if (Src->isWeakForLinker()) {
-    // At this point we know that Dest has LinkOnce, External*, Weak, Common,
-    // or DLL* linkage.
-    if (Dest->hasExternalWeakLinkage() ||
-        Dest->hasAvailableExternallyLinkage() ||
-        (Dest->hasLinkOnceLinkage() &&
-         (Src->hasWeakLinkage() || Src->hasCommonLinkage()))) {
-      LinkFromSrc = true;
-      LT = Src->getLinkage();
-    } else {
-      LinkFromSrc = false;
-      LT = Dest->getLinkage();
-    }
-  } else if (Dest->isWeakForLinker()) {
-    // At this point we know that Src has External* or DLL* linkage.
-    if (Src->hasExternalWeakLinkage()) {
-      LinkFromSrc = false;
-      LT = Dest->getLinkage();
-    } else {
-      LinkFromSrc = true;
-      LT = GlobalValue::ExternalLinkage;
-    }
-  } else {
-    assert((Dest->hasExternalLinkage()  || Dest->hasExternalWeakLinkage()) &&
-           (Src->hasExternalLinkage()   || Src->hasExternalWeakLinkage()) &&
-           "Unexpected linkage type!");
-    return emitError("Linking globals named '" + Src->getName() +
-                 "': symbol multiply defined!");
+    // If the Dest is weak, use the source linkage.
+    LinkFromSrc = Dest.hasExternalWeakLinkage();
+    return false;
   }
 
-  // Compute the visibility. We follow the rules in the System V Application
-  // Binary Interface.
-  assert(!GlobalValue::isLocalLinkage(LT) &&
-         "Symbols with local linkage should not be merged");
-  Vis = isLessConstraining(Src->getVisibility(), Dest->getVisibility()) ?
-    Dest->getVisibility() : Src->getVisibility();
-  return false;
+  if (DestIsDeclaration) {
+    // If Dest is external but Src is not:
+    LinkFromSrc = true;
+    return false;
+  }
+
+  if (Src.hasCommonLinkage()) {
+    if (Dest.hasLinkOnceLinkage() || Dest.hasWeakLinkage()) {
+      LinkFromSrc = true;
+      return false;
+    }
+
+    if (!Dest.hasCommonLinkage()) {
+      LinkFromSrc = false;
+      return false;
+    }
+
+    // FIXME: Make datalayout mandatory and just use getDataLayout().
+    DataLayout DL(Dest.getParent());
+
+    uint64_t DestSize = DL.getTypeAllocSize(Dest.getType()->getElementType());
+    uint64_t SrcSize = DL.getTypeAllocSize(Src.getType()->getElementType());
+    LinkFromSrc = SrcSize > DestSize;
+    return false;
+  }
+
+  if (Src.isWeakForLinker()) {
+    assert(!Dest.hasExternalWeakLinkage());
+    assert(!Dest.hasAvailableExternallyLinkage());
+
+    if (Dest.hasLinkOnceLinkage() && Src.hasWeakLinkage()) {
+      LinkFromSrc = true;
+      return false;
+    }
+
+    LinkFromSrc = false;
+    return false;
+  }
+
+  if (Dest.isWeakForLinker()) {
+    assert(Src.hasExternalLinkage());
+    LinkFromSrc = true;
+    return false;
+  }
+
+  assert(!Src.hasExternalWeakLinkage());
+  assert(!Dest.hasExternalWeakLinkage());
+  assert(Dest.hasExternalLinkage() && Src.hasExternalLinkage() &&
+         "Unexpected linkage type!");
+  return emitError("Linking globals named '" + Src.getName() +
+                   "': symbol multiply defined!");
 }
 
-/// computeTypeMapping - Loop over all of the linked values to compute type
-/// mappings.  For example, if we link "extern Foo *x" and "Foo *x = NULL", then
-/// we have two struct types 'Foo' but one got renamed when the module was
-/// loaded into the same LLVMContext.
+/// Loop over all of the linked values to compute type mappings.  For example,
+/// if we link "extern Foo *x" and "Foo *x = NULL", then we have two struct
+/// types 'Foo' but one got renamed when the module was loaded into the same
+/// LLVMContext.
 void ModuleLinker::computeTypeMapping() {
   // Incorporate globals.
   for (Module::global_iterator I = SrcM->global_begin(),
@@ -811,10 +840,87 @@
   TypeMap.linkDefinedTypeBodies();
 }
 
-/// linkAppendingVarProto - If there were any appending global variables, link
-/// them together now.  Return true on error.
+static void upgradeGlobalArray(GlobalVariable *GV) { // Widen 2-field entries.
+  ArrayType *ATy = cast<ArrayType>(GV->getType()->getElementType());
+  StructType *OldTy = cast<StructType>(ATy->getElementType());
+  assert(OldTy->getNumElements() == 2 && "Expected to upgrade from 2 elements");
+
+  // Upgraded element type: the two old fields plus a trailing i8* field.
+  PointerType *VoidPtrTy = Type::getInt8Ty(GV->getContext())->getPointerTo();
+  Type *Tys[3] = {OldTy->getElementType(0), OldTy->getElementType(1),
+                  VoidPtrTy};
+  StructType *NewTy = StructType::get(GV->getContext(), Tys, false);
+
+  // Build new constants with a null third field filled in.
+  Constant *OldInitC = GV->getInitializer();
+  ConstantArray *OldInit = dyn_cast<ConstantArray>(OldInitC);
+  if (!OldInit && !isa<ConstantAggregateZero>(OldInitC))
+    // Neither a ConstantArray nor a zero aggregate: leave GV untouched.
+    return;
+  std::vector<Constant *> Initializers;
+  if (OldInit && OldInit->getNumOperands()) {
+    Value *Null = Constant::getNullValue(VoidPtrTy);
+    for (Use &U : OldInit->operands()) {
+      ConstantStruct *Init = cast<ConstantStruct>(U.get());
+      Initializers.push_back(ConstantStruct::get(
+          NewTy, Init->getOperand(0), Init->getOperand(1), Null, nullptr));
+    }
+  }
+  assert(Initializers.size() == ATy->getNumElements() &&
+         "Failed to copy all array elements");
+
+  // Insert NewGV before GV, move GV's attributes and name to it, erase GV.
+  ATy = ArrayType::get(NewTy, Initializers.size());
+  Constant *NewInit = ConstantArray::get(ATy, Initializers);
+  GlobalVariable *NewGV = new GlobalVariable(
+      *GV->getParent(), ATy, GV->isConstant(), GV->getLinkage(), NewInit, "",
+      GV, GV->getThreadLocalMode(), GV->getType()->getAddressSpace(),
+      GV->isExternallyInitialized());
+  NewGV->copyAttributesFrom(GV);
+  NewGV->takeName(GV);
+  assert(GV->use_empty() && "program cannot use initializer list");
+  GV->eraseFromParent();
+}
+
+void ModuleLinker::upgradeMismatchedGlobalArray(StringRef Name) {
+  // Look up Name in both modules; bail out unless both have such a variable.
+  auto *DstGV = dyn_cast_or_null<GlobalVariable>(DstM->getNamedValue(Name));
+  if (!DstGV)
+    return;
+  auto *SrcGV = dyn_cast_or_null<GlobalVariable>(SrcM->getNamedValue(Name));
+  if (!SrcGV)
+    return;
+
+  // Compare the dest type with the source type as mapped by TypeMap.
+  auto *DstTy = cast<ArrayType>(DstGV->getType()->getElementType());
+  auto *SrcTy =
+      cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType()));
+  if (DstTy == SrcTy)
+    return;
+
+  // Grab the element types.  We can only upgrade an array of a two-field
+  // struct.  Only bother if the other one has three fields.
+  auto *DstEltTy = cast<StructType>(DstTy->getElementType());
+  auto *SrcEltTy = cast<StructType>(SrcTy->getElementType());
+  if (DstEltTy->getNumElements() == 2 && SrcEltTy->getNumElements() == 3) {
+    upgradeGlobalArray(DstGV);
+    return;
+  }
+  if (DstEltTy->getNumElements() == 3 && SrcEltTy->getNumElements() == 2)
+    upgradeGlobalArray(SrcGV);
+
+  // Any other mismatch is left as-is; we can't upgrade it here.
+}
+
+void ModuleLinker::upgradeMismatchedGlobals() { // Reconcile ctor/dtor layouts.
+  upgradeMismatchedGlobalArray("llvm.global_ctors");
+  upgradeMismatchedGlobalArray("llvm.global_dtors");
+}
+
+/// If there were any appending global variables, link them together now.
+/// Return true on error.
 bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
-                                         GlobalVariable *SrcGV) {
+                                         const GlobalVariable *SrcGV) {
 
   if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage())
     return emitError("Linking globals named '" + SrcGV->getName() +
@@ -879,252 +985,167 @@
   return false;
 }
 
-/// linkGlobalProto - Loop through the global variables in the src module and
-/// merge them into the dest module.
-bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
+bool ModuleLinker::linkGlobalValueProto(GlobalValue *SGV) { // True on error.
   GlobalValue *DGV = getLinkedToGlobal(SGV);
-  llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+
+  // Handle the ultra special appending linkage case first.
+  if (DGV && DGV->hasAppendingLinkage())
+    return linkAppendingVarProto(cast<GlobalVariable>(DGV),
+                                 cast<GlobalVariable>(SGV));
+
+  bool LinkFromSrc = true; // Stays true when there is no comdat and no DGV.
+  Comdat *C = nullptr;
+  GlobalValue::VisibilityTypes Visibility = SGV->getVisibility();
   bool HasUnnamedAddr = SGV->hasUnnamedAddr();
 
-  bool LinkFromSrc = false;
-  Comdat *DC = nullptr;
   if (const Comdat *SC = SGV->getComdat()) {
     Comdat::SelectionKind SK;
     std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
-    DC = DstM->getOrInsertComdat(SC->getName());
-    DC->setSelectionKind(SK);
+    C = DstM->getOrInsertComdat(SC->getName());
+    C->setSelectionKind(SK);
+  } else if (DGV) {
+    if (shouldLinkFromSource(LinkFromSrc, *DGV, *SGV))
+      return true; // Propagate the failure from shouldLinkFromSource.
+  }
+
+  if (!LinkFromSrc) {
+    // Track the source global so that we don't attempt to copy it over when
+    // processing global initializers.
+    DoNotLinkFromSource.insert(SGV);
+
+    if (DGV)
+      // Make sure to remember this mapping.
+      ValueMap[SGV] =
+          ConstantExpr::getBitCast(DGV, TypeMap.get(SGV->getType()));
   }
 
   if (DGV) {
-    if (!DC) {
-      // Concatenation of appending linkage variables is magic and handled later.
-      if (DGV->hasAppendingLinkage() || SGV->hasAppendingLinkage())
-        return linkAppendingVarProto(cast<GlobalVariable>(DGV), SGV);
+    Visibility = isLessConstraining(Visibility, DGV->getVisibility())
+                     ? DGV->getVisibility()
+                     : Visibility; // Keep the more constraining of the two.
+    HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
+  }
 
-      // Determine whether linkage of these two globals follows the source
-      // module's definition or the destination module's definition.
-      GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
-      GlobalValue::VisibilityTypes NV;
-      if (getLinkageResult(DGV, SGV, NewLinkage, NV, LinkFromSrc))
-        return true;
-      NewVisibility = NV;
-      HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
+  if (!LinkFromSrc && !DGV)
+    return false; // Source not chosen and nothing in Dst to update.
 
-      // If we're not linking from the source, then keep the definition that we
-      // have.
-      if (!LinkFromSrc) {
-        // Special case for const propagation.
-        if (GlobalVariable *DGVar = dyn_cast<GlobalVariable>(DGV))
-          if (DGVar->isDeclaration() && SGV->isConstant() &&
-              !DGVar->isConstant())
-            DGVar->setConstant(true);
+  GlobalValue *NewGV;
+  if (auto *SGVar = dyn_cast<GlobalVariable>(SGV)) {
+    NewGV = linkGlobalVariableProto(SGVar, DGV, LinkFromSrc);
+    if (!NewGV)
+      return true;
+  } else if (auto *SF = dyn_cast<Function>(SGV)) {
+    NewGV = linkFunctionProto(SF, DGV, LinkFromSrc);
+  } else {
+    NewGV = linkGlobalAliasProto(cast<GlobalAlias>(SGV), DGV, LinkFromSrc);
+  }
 
-        // Set calculated linkage, visibility and unnamed_addr.
-        DGV->setLinkage(NewLinkage);
-        DGV->setVisibility(*NewVisibility);
-        DGV->setUnnamedAddr(HasUnnamedAddr);
-      }
+  if (NewGV) { // Null when a function is left for lazy linking.
+    if (NewGV != DGV)
+      copyGVAttributes(NewGV, SGV);
+
+    NewGV->setUnnamedAddr(HasUnnamedAddr);
+    NewGV->setVisibility(Visibility);
+
+    if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
+      if (C)
+        NewGO->setComdat(C);
     }
 
-    if (!LinkFromSrc) {
-      // Make sure to remember this mapping.
-      ValueMap[SGV] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGV->getType()));
-
-      // Track the source global so that we don't attempt to copy it over when
-      // processing global initializers.
-      DoNotLinkFromSource.insert(SGV);
-
-      return false;
+    // A new global replaces DGV (if any): redirect its uses, record the map.
+    if (NewGV != DGV) {
+      if (DGV) {
+        DGV->replaceAllUsesWith(
+            ConstantExpr::getBitCast(NewGV, DGV->getType()));
+        DGV->eraseFromParent();
+      }
+      ValueMap[SGV] = NewGV;
     }
   }
 
-  // If the Comdat this variable was inside of wasn't selected, skip it.
-  if (DC && !DGV && !LinkFromSrc) {
-    DoNotLinkFromSource.insert(SGV);
-    return false;
+  return false;
+}
+
+/// Link one source variable: when not linking from source, adjust and return
+/// DGV; otherwise create and return its counterpart in the dest module.
+GlobalValue *ModuleLinker::linkGlobalVariableProto(const GlobalVariable *SGVar,
+                                                   GlobalValue *DGV,
+                                                   bool LinkFromSrc) {
+  unsigned Alignment = 0; // Stays 0 unless both globals are common.
+  bool ClearConstant = false; // Set if DGV exists and a side is non-constant.
+
+  if (DGV) {
+    if (DGV->hasCommonLinkage() && SGVar->hasCommonLinkage())
+      Alignment = std::max(SGVar->getAlignment(), DGV->getAlignment());
+
+    auto *DGVar = dyn_cast<GlobalVariable>(DGV);
+    if (!SGVar->isConstant() || (DGVar && !DGVar->isConstant()))
+      ClearConstant = true;
+  }
+
+  if (!LinkFromSrc) {
+    if (auto *NewGVar = dyn_cast<GlobalVariable>(DGV)) {
+      if (Alignment)
+        NewGVar->setAlignment(Alignment);
+      if (NewGVar->isDeclaration() && ClearConstant)
+        NewGVar->setConstant(false);
+    }
+    return DGV; // Keep the destination's global.
   }
 
   // No linking to be performed or linking from the source: simply create an
   // identical version of the symbol over in the dest module... the
   // initializer will be filled in later by LinkGlobalInits.
-  GlobalVariable *NewDGV =
-    new GlobalVariable(*DstM, TypeMap.get(SGV->getType()->getElementType()),
-                       SGV->isConstant(), SGV->getLinkage(), /*init*/nullptr,
-                       SGV->getName(), /*insertbefore*/nullptr,
-                       SGV->getThreadLocalMode(),
-                       SGV->getType()->getAddressSpace());
-  // Propagate alignment, visibility and section info.
-  copyGVAttributes(NewDGV, SGV);
-  if (NewVisibility)
-    NewDGV->setVisibility(*NewVisibility);
-  NewDGV->setUnnamedAddr(HasUnnamedAddr);
+  GlobalVariable *NewDGV = new GlobalVariable(
+      *DstM, TypeMap.get(SGVar->getType()->getElementType()),
+      SGVar->isConstant(), SGVar->getLinkage(), /*init*/ nullptr,
+      SGVar->getName(), /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
+      SGVar->getType()->getAddressSpace());
 
-  if (DC)
-    NewDGV->setComdat(DC);
+  if (Alignment)
+    NewDGV->setAlignment(Alignment);
 
-  if (DGV) {
-    DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDGV, DGV->getType()));
-    DGV->eraseFromParent();
-  }
-
-  // Make sure to remember this mapping.
-  ValueMap[SGV] = NewDGV;
-  return false;
+  return NewDGV;
 }
 
-/// linkFunctionProto - Link the function in the source module into the
-/// destination module if needed, setting up mapping information.
-bool ModuleLinker::linkFunctionProto(Function *SF) {
-  GlobalValue *DGV = getLinkedToGlobal(SF);
-  llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
-  bool HasUnnamedAddr = SF->hasUnnamedAddr();
-
-  bool LinkFromSrc = false;
-  Comdat *DC = nullptr;
-  if (const Comdat *SC = SF->getComdat()) {
-    Comdat::SelectionKind SK;
-    std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
-    DC = DstM->getOrInsertComdat(SC->getName());
-    DC->setSelectionKind(SK);
-  }
-
-  if (DGV) {
-    if (!DC) {
-      GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
-      GlobalValue::VisibilityTypes NV;
-      if (getLinkageResult(DGV, SF, NewLinkage, NV, LinkFromSrc))
-        return true;
-      NewVisibility = NV;
-      HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
-
-      if (!LinkFromSrc) {
-        // Set calculated linkage
-        DGV->setLinkage(NewLinkage);
-        DGV->setVisibility(*NewVisibility);
-        DGV->setUnnamedAddr(HasUnnamedAddr);
-      }
-    }
-
-    if (!LinkFromSrc) {
-      // Make sure to remember this mapping.
-      ValueMap[SF] = ConstantExpr::getBitCast(DGV, TypeMap.get(SF->getType()));
-
-      // Track the function from the source module so we don't attempt to remap
-      // it.
-      DoNotLinkFromSource.insert(SF);
-
-      return false;
-    }
-  }
+/// Return the dest-module prototype for SF: DGV when not linking from source,
+/// null when SF is left for lazy linking, otherwise a newly created function.
+GlobalValue *ModuleLinker::linkFunctionProto(const Function *SF,
+                                             GlobalValue *DGV,
+                                             bool LinkFromSrc) {
+  if (!LinkFromSrc)
+    return DGV; // Keep whatever the destination already has.
 
   // If the function is to be lazily linked, don't create it just yet.
   // The ValueMaterializerTy will deal with creating it if it's used.
   if (!DGV && (SF->hasLocalLinkage() || SF->hasLinkOnceLinkage() ||
                SF->hasAvailableExternallyLinkage())) {
     DoNotLinkFromSource.insert(SF);
-    return false;
-  }
-
-  // If the Comdat this function was inside of wasn't selected, skip it.
-  if (DC && !DGV && !LinkFromSrc) {
-    DoNotLinkFromSource.insert(SF);
-    return false;
+    return nullptr; // Nothing created yet.
   }
 
   // If there is no linkage to be performed or we are linking from the source,
   // bring SF over.
-  Function *NewDF = Function::Create(TypeMap.get(SF->getFunctionType()),
-                                     SF->getLinkage(), SF->getName(), DstM);
-  copyGVAttributes(NewDF, SF);
-  if (NewVisibility)
-    NewDF->setVisibility(*NewVisibility);
-  NewDF->setUnnamedAddr(HasUnnamedAddr);
-
-  if (DC)
-    NewDF->setComdat(DC);
-
-  if (DGV) {
-    // Any uses of DF need to change to NewDF, with cast.
-    DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDF, DGV->getType()));
-    DGV->eraseFromParent();
-  }
-
-  ValueMap[SF] = NewDF;
-  return false;
+  return Function::Create(TypeMap.get(SF->getFunctionType()), SF->getLinkage(),
+                          SF->getName(), DstM);
 }
 
-/// LinkAliasProto - Set up prototypes for any aliases that come over from the
-/// source module.
-bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) {
-  GlobalValue *DGV = getLinkedToGlobal(SGA);
-  llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
-  bool HasUnnamedAddr = SGA->hasUnnamedAddr();
-
-  bool LinkFromSrc = false;
-  Comdat *DC = nullptr;
-  if (const Comdat *SC = SGA->getComdat()) {
-    Comdat::SelectionKind SK;
-    std::tie(SK, LinkFromSrc) = ComdatsChosen[SC];
-    DC = DstM->getOrInsertComdat(SC->getName());
-    DC->setSelectionKind(SK);
-  }
-
-  if (DGV) {
-    if (!DC) {
-      GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
-      GlobalValue::VisibilityTypes NV;
-      if (getLinkageResult(DGV, SGA, NewLinkage, NV, LinkFromSrc))
-        return true;
-      NewVisibility = NV;
-      HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
-
-      if (!LinkFromSrc) {
-        // Set calculated linkage.
-        DGV->setLinkage(NewLinkage);
-        DGV->setVisibility(*NewVisibility);
-        DGV->setUnnamedAddr(HasUnnamedAddr);
-      }
-    }
-
-    if (!LinkFromSrc) {
-      // Make sure to remember this mapping.
-      ValueMap[SGA] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGA->getType()));
-
-      // Track the alias from the source module so we don't attempt to remap it.
-      DoNotLinkFromSource.insert(SGA);
-
-      return false;
-    }
-  }
-
-  // If the Comdat this alias was inside of wasn't selected, skip it.
-  if (DC && !DGV && !LinkFromSrc) {
-    DoNotLinkFromSource.insert(SGA);
-    return false;
-  }
+/// Return DGV when not linking from source, else a new dest alias for SGA.
+GlobalValue *ModuleLinker::linkGlobalAliasProto(const GlobalAlias *SGA,
+                                                GlobalValue *DGV,
+                                                bool LinkFromSrc) {
+  if (!LinkFromSrc)
+    return DGV; // Keep whatever the destination already has.
 
   // If there is no linkage to be performed or we're linking from the source,
   // bring over SGA.
   auto *PTy = cast<PointerType>(TypeMap.get(SGA->getType()));
-  auto *NewDA =
-      GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
-                          SGA->getLinkage(), SGA->getName(), DstM);
-  copyGVAttributes(NewDA, SGA);
-  if (NewVisibility)
-    NewDA->setVisibility(*NewVisibility);
-  NewDA->setUnnamedAddr(HasUnnamedAddr);
-
-  if (DGV) {
-    // Any uses of DGV need to change to NewDA, with cast.
-    DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDA, DGV->getType()));
-    DGV->eraseFromParent();
-  }
-
-  ValueMap[SGA] = NewDA;
-  return false;
+  return GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
+                             SGA->getLinkage(), SGA->getName(), DstM);
 }
 
-static void getArrayElements(Constant *C, SmallVectorImpl<Constant*> &Dest) {
+static void getArrayElements(const Constant *C,
+                             SmallVectorImpl<Constant *> &Dest) {
   unsigned NumElements = cast<ArrayType>(C->getType())->getNumElements();
 
   for (unsigned i = 0; i != NumElements; ++i)
@@ -1133,18 +1154,38 @@
 
 void ModuleLinker::linkAppendingVarInit(const AppendingVarInfo &AVI) {
   // Merge the initializer.
-  SmallVector<Constant*, 16> Elements;
-  getArrayElements(AVI.DstInit, Elements);
+  SmallVector<Constant *, 16> DstElements;
+  getArrayElements(AVI.DstInit, DstElements);
 
-  Constant *SrcInit = MapValue(AVI.SrcInit, ValueMap, RF_None, &TypeMap, &ValMaterializer);
-  getArrayElements(SrcInit, Elements);
+  SmallVector<Constant *, 16> SrcElements;
+  getArrayElements(AVI.SrcInit, SrcElements); // Unmapped; mapped one by one.
 
   ArrayType *NewType = cast<ArrayType>(AVI.NewGV->getType()->getElementType());
-  AVI.NewGV->setInitializer(ConstantArray::get(NewType, Elements));
+
+  StringRef Name = AVI.NewGV->getName();
+  bool IsNewStructor = // True for 3-field llvm.global_ctors/dtors arrays.
+      (Name == "llvm.global_ctors" || Name == "llvm.global_dtors") &&
+      cast<StructType>(NewType->getElementType())->getNumElements() == 3;
+
+  for (auto *V : SrcElements) {
+    if (IsNewStructor) {
+      Constant *Key = V->getAggregateElement(2); // Third field of the entry.
+      if (DoNotLinkFromSource.count(Key))
+        continue; // Key is not being linked from source; drop the entry.
+    }
+    DstElements.push_back(
+        MapValue(V, ValueMap, RF_None, &TypeMap, &ValMaterializer));
+  }
+  if (IsNewStructor) { // Entries may have been dropped: resize the array type.
+    NewType = ArrayType::get(NewType->getElementType(), DstElements.size());
+    AVI.NewGV->mutateType(PointerType::get(NewType, 0));
+  }
+
+  AVI.NewGV->setInitializer(ConstantArray::get(NewType, DstElements));
 }
 
-/// linkGlobalInits - Update the initializers in the Dest module now that all
-/// globals that may be referenced are in Dest.
+/// Update the initializers in the Dest module now that all globals that may be
+/// referenced are in Dest.
 void ModuleLinker::linkGlobalInits() {
   // Loop over all of the globals in the src module, mapping them over as we go
   for (Module::const_global_iterator I = SrcM->global_begin(),
@@ -1161,9 +1202,9 @@
   }
 }
 
-/// linkFunctionBody - Copy the source function over into the dest function and
-/// fix up references to values.  At this point we know that Dest is an external
-/// function, and that Src is not.
+/// Copy the source function over into the dest function and fix up references
+/// to values. At this point we know that Dest is an external function, and
+/// that Src is not.
 void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) {
   assert(Src && Dst && Dst->isDeclaration() && !Src->isDeclaration());
 
@@ -1177,25 +1218,17 @@
     ValueMap[I] = DI;
   }
 
-  if (Mode == Linker::DestroySource) {
-    // Splice the body of the source function into the dest function.
-    Dst->getBasicBlockList().splice(Dst->end(), Src->getBasicBlockList());
+  // Splice the body of the source function into the dest function.
+  Dst->getBasicBlockList().splice(Dst->end(), Src->getBasicBlockList());
 
-    // At this point, all of the instructions and values of the function are now
-    // copied over.  The only problem is that they are still referencing values in
-    // the Source function as operands.  Loop through all of the operands of the
-    // functions and patch them up to point to the local versions.
-    for (Function::iterator BB = Dst->begin(), BE = Dst->end(); BB != BE; ++BB)
-      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
-        RemapInstruction(I, ValueMap, RF_IgnoreMissingEntries,
-                         &TypeMap, &ValMaterializer);
-
-  } else {
-    // Clone the body of the function into the dest function.
-    SmallVector<ReturnInst*, 8> Returns; // Ignore returns.
-    CloneFunctionInto(Dst, Src, ValueMap, false, Returns, "", nullptr,
-                      &TypeMap, &ValMaterializer);
-  }
+  // At this point, all of the instructions and values of the function are now
+  // copied over.  The only problem is that they are still referencing values in
+  // the Source function as operands.  Loop through all of the operands of the
+  // functions and patch them up to point to the local versions.
+  for (Function::iterator BB = Dst->begin(), BE = Dst->end(); BB != BE; ++BB)
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+      RemapInstruction(I, ValueMap, RF_IgnoreMissingEntries, &TypeMap,
+                       &ValMaterializer);
 
   // There is no need to map the arguments anymore.
   for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end();
@@ -1204,7 +1237,7 @@
 
 }
 
-/// linkAliasBodies - Insert all of the aliases in Src into the Dest module.
+/// Insert all of the aliases in Src into the Dest module.
 void ModuleLinker::linkAliasBodies() {
   for (Module::alias_iterator I = SrcM->alias_begin(), E = SrcM->alias_end();
        I != E; ++I) {
@@ -1219,8 +1252,7 @@
   }
 }
 
-/// linkNamedMDNodes - Insert all of the named MDNodes in Src into the Dest
-/// module.
+/// Insert all of the named MDNodes in Src into the Dest module.
 void ModuleLinker::linkNamedMDNodes() {
   const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
   for (Module::const_named_metadata_iterator I = SrcM->named_metadata_begin(),
@@ -1235,8 +1267,7 @@
   }
 }
 
-/// linkModuleFlagsMetadata - Merge the linker flags in Src into the Dest
-/// module.
+/// Merge the linker flags in Src into the Dest module.
 bool ModuleLinker::linkModuleFlagsMetadata() {
   // If the source module has no module flags, we are done.
   const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
@@ -1336,10 +1367,8 @@
     case Module::Warning: {
       // Emit a warning if the values differ.
       if (SrcOp->getOperand(2) != DstOp->getOperand(2)) {
-        if (!SuppressWarnings) {
-          errs() << "WARNING: linking module flags '" << ID->getString()
-                 << "': IDs have conflicting values";
-        }
+        emitWarning("linking module flags '" + ID->getString() +
+                    "': IDs have conflicting values");
       }
       continue;
     }
@@ -1406,23 +1435,19 @@
 
   if (SrcM->getDataLayout() && DstM->getDataLayout() &&
       *SrcM->getDataLayout() != *DstM->getDataLayout()) {
-    if (!SuppressWarnings) {
-      errs() << "WARNING: Linking two modules of different data layouts: '"
-             << SrcM->getModuleIdentifier() << "' is '"
-             << SrcM->getDataLayoutStr() << "' whereas '"
-             << DstM->getModuleIdentifier() << "' is '"
-             << DstM->getDataLayoutStr() << "'\n";
-    }
+    emitWarning("Linking two modules of different data layouts: '" +
+                SrcM->getModuleIdentifier() + "' is '" +
+                SrcM->getDataLayoutStr() + "' whereas '" +
+                DstM->getModuleIdentifier() + "' is '" +
+                DstM->getDataLayoutStr() + "'\n");
   }
   if (!SrcM->getTargetTriple().empty() &&
       DstM->getTargetTriple() != SrcM->getTargetTriple()) {
-    if (!SuppressWarnings) {
-      errs() << "WARNING: Linking two modules of different target triples: "
-             << SrcM->getModuleIdentifier() << "' is '"
-             << SrcM->getTargetTriple() << "' whereas '"
-             << DstM->getModuleIdentifier() << "' is '"
-             << DstM->getTargetTriple() << "'\n";
-    }
+    emitWarning("Linking two modules of different target triples: " +
+                SrcM->getModuleIdentifier() + "' is '" +
+                SrcM->getTargetTriple() + "' whereas '" +
+                DstM->getModuleIdentifier() + "' is '" +
+                DstM->getTargetTriple() + "'\n");
   }
 
   // Append the module inline asm string.
@@ -1438,7 +1463,7 @@
   computeTypeMapping();
 
   ComdatsChosen.clear();
-  for (const StringMapEntry<llvm::Comdat> &SMEC : SrcM->getComdatSymbolTable()) {
+  for (const auto &SMEC : SrcM->getComdatSymbolTable()) {
     const Comdat &C = SMEC.getValue();
     if (ComdatsChosen.count(&C))
       continue;
@@ -1449,11 +1474,14 @@
     ComdatsChosen[&C] = std::make_pair(SK, LinkFromSrc);
   }
 
+  // Upgrade mismatched global arrays.
+  upgradeMismatchedGlobals();
+
   // Insert all of the globals in src into the DstM module... without linking
   // initializers (which could refer to functions not yet mapped over).
   for (Module::global_iterator I = SrcM->global_begin(),
        E = SrcM->global_end(); I != E; ++I)
-    if (linkGlobalProto(I))
+    if (linkGlobalValueProto(I))
       return true;
 
   // Link the functions together between the two modules, without doing function
@@ -1462,13 +1490,13 @@
   // all of the global values that may be referenced are available in our
   // ValueMap.
   for (Module::iterator I = SrcM->begin(), E = SrcM->end(); I != E; ++I)
-    if (linkFunctionProto(I))
+    if (linkGlobalValueProto(I))
       return true;
 
   // If there were any aliases, link them now.
   for (Module::alias_iterator I = SrcM->alias_begin(),
        E = SrcM->alias_end(); I != E; ++I)
-    if (linkAliasProto(I))
+    if (linkGlobalValueProto(I))
       return true;
 
   for (unsigned i = 0, e = AppendingVars.size(); i != e; ++i)
@@ -1487,13 +1515,13 @@
           SF->getPrefixData(), ValueMap, RF_None, &TypeMap, &ValMaterializer));
     }
 
-    // Skip if no body (function is external) or materialize.
-    if (SF->isDeclaration()) {
-      if (!SF->isMaterializable())
-        continue;
-      if (SF->Materialize(&ErrorMsg))
-        return true;
-    }
+    // Materialize if needed.
+    if (std::error_code EC = SF->materialize())
+      return emitError(EC.message());
+
+    // Skip if no body (function is external).
+    if (SF->isDeclaration())
+      continue;
 
     linkFunctionBody(DF, SF);
     SF->Dematerialize();
@@ -1536,13 +1564,13 @@
                                    &ValMaterializer));
       }
 
-      // Materialize if necessary.
-      if (SF->isDeclaration()) {
-        if (!SF->isMaterializable())
-          continue;
-        if (SF->Materialize(&ErrorMsg))
-          return true;
-      }
+      // Materialize if needed.
+      if (std::error_code EC = SF->materialize())
+        return emitError(EC.message());
+
+      // Skip if no body (function is external).
+      if (SF->isDeclaration())
+        continue;
 
       // Erase from vector *before* the function body is linked - linkFunctionBody could
       // invalidate I.
@@ -1566,13 +1594,25 @@
   return false;
 }
 
-Linker::Linker(Module *M, bool SuppressWarnings)
-    : Composite(M), SuppressWarnings(SuppressWarnings) {
+void Linker::init(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
+  this->Composite = M; // Module that later linkInModule calls link into.
+  this->DiagnosticHandler = DiagnosticHandler; // Handed to each ModuleLinker.
+
   TypeFinder StructTypes;
   StructTypes.run(*M, true);
   IdentifiedStructTypes.insert(StructTypes.begin(), StructTypes.end());
 }
 
+Linker::Linker(Module *M, DiagnosticHandlerFunction DiagnosticHandler) {
+  init(M, DiagnosticHandler); // Caller-supplied handler.
+}
+
+Linker::Linker(Module *M) {
+  init(M, [this](const DiagnosticInfo &DI) { // Default handler.
+    Composite->getContext().diagnose(DI); // Forward to the module's context.
+  }); // NOTE(review): lambda captures this; confirm Linker is never copied.
+}
+
 Linker::~Linker() {
 }
 
@@ -1581,30 +1621,30 @@
   Composite = nullptr;
 }
 
-bool Linker::linkInModule(Module *Src, unsigned Mode, std::string *ErrorMsg) {
-  ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src, Mode,
-                         SuppressWarnings);
-  if (TheLinker.run()) {
-    if (ErrorMsg)
-      *ErrorMsg = TheLinker.ErrorMsg;
-    return true;
-  }
-  return false;
+bool Linker::linkInModule(Module *Src) { // Link Src into Composite.
+  ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src,
+                         DiagnosticHandler);
+  return TheLinker.run(); // True on error.
 }
 
 //===----------------------------------------------------------------------===//
 // LinkModules entrypoint.
 //===----------------------------------------------------------------------===//
 
-/// LinkModules - This function links two modules together, with the resulting
-/// Dest module modified to be the composite of the two input modules.  If an
-/// error occurs, true is returned and ErrorMsg (if not null) is set to indicate
-/// the problem.  Upon failure, the Dest module could be in a modified state,
-/// and shouldn't be relied on to be consistent.
-bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode,
-                         std::string *ErrorMsg) {
+/// This function links two modules together, with the resulting Dest module
+/// modified to be the composite of the two input modules. If an error occurs,
+/// true is returned and the problem is reported through DiagnosticHandler.
+/// Upon failure, the Dest module could be in a modified state, and shouldn't be
+/// relied on to be consistent.
+bool Linker::LinkModules(Module *Dest, Module *Src,
+                         DiagnosticHandlerFunction DiagnosticHandler) {
+  Linker L(Dest, DiagnosticHandler);
+  return L.linkInModule(Src);
+}
+
+bool Linker::LinkModules(Module *Dest, Module *Src) { // Default handler.
   Linker L(Dest);
-  return L.linkInModule(Src, Mode, ErrorMsg);
+  return L.linkInModule(Src);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1613,10 +1653,15 @@
 
 LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src,
                          LLVMLinkerMode Mode, char **OutMessages) {
-  std::string Messages;
-  LLVMBool Result = Linker::LinkModules(unwrap(Dest), unwrap(Src),
-                                        Mode, OutMessages? &Messages : nullptr);
-  if (OutMessages)
-    *OutMessages = strdup(Messages.c_str());
+  Module *D = unwrap(Dest); // NOTE: Mode is not consulted by this body.
+  std::string Message;
+  raw_string_ostream Stream(Message); // May buffer before writing to Message.
+  DiagnosticPrinterRawOStream DP(Stream);
+
+  LLVMBool Result = Linker::LinkModules( // Print every diagnostic into Stream.
+      D, unwrap(Src), [&](const DiagnosticInfo &DI) { DI.print(DP); });
+
+  if (OutMessages && Result) // Text is only handed out when linking failed.
+    *OutMessages = strdup(Stream.str().c_str()); // str() flushes the buffer.
   return Result;
 }

diff --git a/lib/MC/Android.mk b/lib/MC/Android.mk
index fd587c4..c7c5c1f 100644
--- a/lib/MC/Android.mk
+++ b/lib/MC/Android.mk

@@ -13,13 +13,11 @@
   MCCodeEmitter.cpp \
   MCCodeGenInfo.cpp \
   MCContext.cpp \
-  MCDisassembler.cpp \
   MCDwarf.cpp \
   MCELF.cpp \
   MCELFObjectTargetWriter.cpp \
   MCELFStreamer.cpp \
   MCExpr.cpp \
-  MCExternalSymbolizer.cpp \
   MCInst.cpp \
   MCInstPrinter.cpp \
   MCInstrAnalysis.cpp \
@@ -32,7 +30,6 @@
   MCObjectStreamer.cpp \
   MCObjectWriter.cpp \
   MCRegisterInfo.cpp \
-  MCRelocationInfo.cpp \
   MCSection.cpp \
   MCSectionCOFF.cpp \
   MCSectionELF.cpp \
@@ -44,6 +41,7 @@
   MCTargetOptions.cpp \
   MCValue.cpp \
   MCWin64EH.cpp \
+  MCWinEH.cpp \
   MachObjectWriter.cpp \
   StringTableBuilder.cpp \
   SubtargetFeature.cpp \

diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 330519e..7181bdc 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt

@@ -11,13 +11,11 @@
   MCCodeEmitter.cpp
   MCCodeGenInfo.cpp
   MCContext.cpp
-  MCDisassembler.cpp
   MCDwarf.cpp
   MCELF.cpp
   MCELFObjectTargetWriter.cpp
   MCELFStreamer.cpp
   MCExpr.cpp
-  MCExternalSymbolizer.cpp
   MCInst.cpp
   MCInstPrinter.cpp
   MCInstrAnalysis.cpp
@@ -30,7 +28,6 @@
   MCObjectStreamer.cpp
   MCObjectWriter.cpp
   MCRegisterInfo.cpp
-  MCRelocationInfo.cpp
   MCSection.cpp
   MCSectionCOFF.cpp
   MCSectionELF.cpp
@@ -42,6 +39,7 @@
   MCTargetOptions.cpp
   MCValue.cpp
   MCWin64EH.cpp
+  MCWinEH.cpp
   MachObjectWriter.cpp
   StringTableBuilder.cpp
   SubtargetFeature.cpp
@@ -50,6 +48,5 @@
   YAML.cpp
   )
 
-add_subdirectory(MCAnalysis)
 add_subdirectory(MCParser)
 add_subdirectory(MCDisassembler)

diff --git a/lib/MC/ConstantPools.cpp b/lib/MC/ConstantPools.cpp
index f979dad..c4cea60 100644
--- a/lib/MC/ConstantPools.cpp
+++ b/lib/MC/ConstantPools.cpp

@@ -24,21 +24,22 @@
 void ConstantPool::emitEntries(MCStreamer &Streamer) {
   if (Entries.empty())
     return;
-  Streamer.EmitCodeAlignment(4); // align to 4-byte address
   Streamer.EmitDataRegion(MCDR_DataRegion);
   for (EntryVecTy::const_iterator I = Entries.begin(), E = Entries.end();
        I != E; ++I) {
-    Streamer.EmitLabel(I->first);
-    Streamer.EmitValue(I->second, 4);
+    Streamer.EmitCodeAlignment(I->Size); // align naturally
+    Streamer.EmitLabel(I->Label);
+    Streamer.EmitValue(I->Value, I->Size);
   }
   Streamer.EmitDataRegion(MCDR_DataRegionEnd);
   Entries.clear();
 }
 
-const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context) {
+const MCExpr *ConstantPool::addEntry(const MCExpr *Value, MCContext &Context,
+                                     unsigned Size) {
   MCSymbol *CPEntryLabel = Context.CreateTempSymbol();
 
-  Entries.push_back(std::make_pair(CPEntryLabel, Value));
+  Entries.push_back(ConstantPoolEntry(CPEntryLabel, Value, Size));
   return MCSymbolRefExpr::Create(CPEntryLabel, Context);
 }
 
@@ -89,7 +90,9 @@
 }
 
 const MCExpr *AssemblerConstantPools::addEntry(MCStreamer &Streamer,
-                                               const MCExpr *Expr) {
+                                               const MCExpr *Expr,
+                                               unsigned Size) {
   const MCSection *Section = Streamer.getCurrentSection().first;
-  return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext());
+  return getOrCreateConstantPool(Section).addEntry(Expr, Streamer.getContext(),
+                                                   Size);
 }

diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 7fb9fae..e4442e1 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp

@@ -81,23 +81,13 @@
 
 struct ELFRelocationEntry {
   uint64_t Offset; // Where is the relocation.
-  bool UseSymbol;  // Relocate with a symbol, not the section.
-  union {
-    const MCSymbol *Symbol;       // The symbol to relocate with.
-    const MCSectionData *Section; // The section to relocate with.
-  };
+  const MCSymbol *Symbol;       // The symbol to relocate with.
   unsigned Type;   // The type of the relocation.
   uint64_t Addend; // The addend to use.
 
   ELFRelocationEntry(uint64_t Offset, const MCSymbol *Symbol, unsigned Type,
                      uint64_t Addend)
-      : Offset(Offset), UseSymbol(true), Symbol(Symbol), Type(Type),
-        Addend(Addend) {}
-
-  ELFRelocationEntry(uint64_t Offset, const MCSectionData *Section,
-                     unsigned Type, uint64_t Addend)
-      : Offset(Offset), UseSymbol(false), Section(Section), Type(Type),
-        Addend(Addend) {}
+      : Offset(Offset), Symbol(Symbol), Type(Type), Addend(Addend) {}
 };
 
 class ELFObjectWriter : public MCObjectWriter {
@@ -137,6 +127,14 @@
 
       // Support lexicographic sorting.
       bool operator<(const ELFSymbolData &RHS) const {
+        unsigned LHSType = MCELF::GetType(*SymbolData);
+        unsigned RHSType = MCELF::GetType(*RHS.SymbolData);
+        if (LHSType == ELF::STT_SECTION && RHSType != ELF::STT_SECTION)
+          return false;
+        if (LHSType != ELF::STT_SECTION && RHSType == ELF::STT_SECTION)
+          return true;
+        if (LHSType == ELF::STT_SECTION && RHSType == ELF::STT_SECTION)
+          return SectionIndex < RHS.SectionIndex;
         return Name < RHS.Name;
       }
     };
@@ -246,7 +244,7 @@
     /// \param NumRegularSections - Number of non-relocation sections.
     void computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout,
                             const SectionIndexMapTy &SectionIndexMap,
-                            RevGroupMapTy RevGroupMap,
+                            const RevGroupMapTy &RevGroupMap,
                             unsigned NumRegularSections);
 
     void ComputeIndexMap(MCAssembler &Asm,
@@ -651,22 +649,6 @@
     WriteSymbol(Writer, MSD, Layout);
   }
 
-  // Write out a symbol table entry for each regular section.
-  for (MCAssembler::const_iterator i = Asm.begin(), e = Asm.end(); i != e;
-       ++i) {
-    const MCSectionELF &Section =
-      static_cast<const MCSectionELF&>(i->getSection());
-    if (Section.getType() == ELF::SHT_RELA ||
-        Section.getType() == ELF::SHT_REL ||
-        Section.getType() == ELF::SHT_STRTAB ||
-        Section.getType() == ELF::SHT_SYMTAB ||
-        Section.getType() == ELF::SHT_SYMTAB_SHNDX)
-      continue;
-    Writer.writeSymbol(0, ELF::STT_SECTION, 0, 0, ELF::STV_DEFAULT,
-                       SectionIndexMap.lookup(&Section), false);
-    LastLocalSymbolIndex++;
-  }
-
   for (unsigned i = 0, e = ExternalSymbolData.size(); i != e; ++i) {
     ELFSymbolData &MSD = ExternalSymbolData[i];
     MCSymbolData &Data = *MSD.SymbolData;
@@ -770,8 +752,9 @@
   }
 
   // Most TLS relocations use a got, so they need the symbol. Even those that
-  // are just an offset (@tpoff), require a symbol in some linkers (gold,
-  // but not bfd ld).
+  // are just an offset (@tpoff), require a symbol in gold versions before
+  // 5efeedf61e4fe720fd3e9a08e6c91c10abb66d42 (2014-09-26) which fixed
+  // http://sourceware.org/PR16773.
   if (Flags & ELF::SHF_TLS)
     return true;
 
@@ -782,7 +765,7 @@
   if (Asm.isThumbFunc(&Sym))
     return true;
 
-  if (TargetObjectWriter->needsRelocateWithSymbol(Type))
+  if (TargetObjectWriter->needsRelocateWithSymbol(*SD, Type))
     return true;
   return false;
 }
@@ -881,8 +864,11 @@
   if (!RelocateWithSymbol) {
     const MCSection *SecA =
         (SymA && !SymA->isUndefined()) ? &SymA->getSection() : nullptr;
-    const MCSectionData *SecAD = SecA ? &Asm.getSectionData(*SecA) : nullptr;
-    ELFRelocationEntry Rec(FixupOffset, SecAD, Type, Addend);
+    auto *ELFSec = cast_or_null<MCSectionELF>(SecA);
+    MCSymbol *SectionSymbol =
+        ELFSec ? Asm.getContext().getOrCreateSectionSymbol(*ELFSec)
+               : nullptr;
+    ELFRelocationEntry Rec(FixupOffset, SectionSymbol, Type, Addend);
     Relocations[FixupSection].push_back(Rec);
     return;
   }
@@ -991,7 +977,7 @@
 void
 ELFObjectWriter::computeSymbolTable(MCAssembler &Asm, const MCAsmLayout &Layout,
                                     const SectionIndexMapTy &SectionIndexMap,
-                                    RevGroupMapTy RevGroupMap,
+                                    const RevGroupMapTy &RevGroupMap,
                                     unsigned NumRegularSections) {
   // FIXME: Is this the correct place to do this?
   // FIXME: Why is an undefined reference to _GLOBAL_OFFSET_TABLE_ needed?
@@ -1037,7 +1023,7 @@
       MSD.SectionIndex = ELF::SHN_COMMON;
     } else if (BaseSymbol->isUndefined()) {
       if (isSignature && !Used)
-        MSD.SectionIndex = SectionIndexMap.lookup(RevGroupMap[&Symbol]);
+        MSD.SectionIndex = SectionIndexMap.lookup(RevGroupMap.lookup(&Symbol));
       else
         MSD.SectionIndex = ELF::SHN_UNDEF;
       if (!Used && WeakrefUsed)
@@ -1060,7 +1046,10 @@
       Buf += Name.substr(Pos + Skip);
       Name = Buf;
     }
-    MSD.Name = StrTabBuilder.add(Name);
+
+    // Sections have their own string table
+    if (MCELF::GetType(SD) != ELF::STT_SECTION)
+      MSD.Name = StrTabBuilder.add(Name);
 
     if (MSD.SectionIndex == ELF::SHN_UNDEF)
       UndefinedSymbolData.push_back(MSD);
@@ -1073,14 +1062,16 @@
   for (auto i = Asm.file_names_begin(), e = Asm.file_names_end(); i != e; ++i)
     StrTabBuilder.add(*i);
 
-  StrTabBuilder.finalize();
+  StrTabBuilder.finalize(StringTableBuilder::ELF);
 
   for (auto i = Asm.file_names_begin(), e = Asm.file_names_end(); i != e; ++i)
     FileSymbolData.push_back(StrTabBuilder.getOffset(*i));
 
-  for (ELFSymbolData& MSD : LocalSymbolData)
-    MSD.StringIndex = StrTabBuilder.getOffset(MSD.Name);
-  for (ELFSymbolData& MSD : ExternalSymbolData)
+  for (ELFSymbolData &MSD : LocalSymbolData)
+    MSD.StringIndex = MCELF::GetType(*MSD.SymbolData) == ELF::STT_SECTION
+                          ? 0
+                          : StrTabBuilder.getOffset(MSD.Name);
+  for (ELFSymbolData &MSD : ExternalSymbolData)
     MSD.StringIndex = StrTabBuilder.getOffset(MSD.Name);
   for (ELFSymbolData& MSD : UndefinedSymbolData)
     MSD.StringIndex = StrTabBuilder.getOffset(MSD.Name);
@@ -1096,8 +1087,6 @@
   for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i)
     LocalSymbolData[i].SymbolData->setIndex(Index++);
 
-  Index += NumRegularSections;
-
   for (unsigned i = 0, e = ExternalSymbolData.size(); i != e; ++i)
     ExternalSymbolData[i].SymbolData->setIndex(Index++);
   for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i)
@@ -1353,18 +1342,8 @@
 
   for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
     const ELFRelocationEntry &Entry = Relocs[e - i - 1];
-
-    unsigned Index;
-    if (Entry.UseSymbol) {
-      Index = getSymbolIndexInSymbolTable(Asm, Entry.Symbol);
-    } else {
-      const MCSectionData *Sec = Entry.Section;
-      if (Sec)
-        Index = Sec->getOrdinal() + FileSymbolData.size() +
-                LocalSymbolData.size() + 1;
-      else
-        Index = 0;
-    }
+    unsigned Index =
+        Entry.Symbol ? getSymbolIndexInSymbolTable(Asm, Entry.Symbol) : 0;
 
     if (is64Bit()) {
       write(*F, Entry.Offset);
@@ -1446,7 +1425,7 @@
       static_cast<const MCSectionELF&>(it->getSection());
     ShStrTabBuilder.add(Section.getSectionName());
   }
-  ShStrTabBuilder.finalize();
+  ShStrTabBuilder.finalize(StringTableBuilder::ELF);
   F->getContents().append(ShStrTabBuilder.data().begin(),
                           ShStrTabBuilder.data().end());
 }
@@ -1457,14 +1436,7 @@
                                             RevGroupMapTy &RevGroupMap,
                                             SectionIndexMapTy &SectionIndexMap,
                                             const RelMapTy &RelMap) {
-  // Create the .note.GNU-stack section if needed.
   MCContext &Ctx = Asm.getContext();
-  if (Asm.getNoExecStack()) {
-    const MCSectionELF *GnuStackSection =
-      Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0,
-                        SectionKind::getReadOnly());
-    Asm.getOrCreateSectionData(*GnuStackSection);
-  }
 
   // Build the groups
   for (MCAssembler::const_iterator it = Asm.begin(), ie = Asm.end();

diff --git a/lib/MC/LLVMBuild.txt b/lib/MC/LLVMBuild.txt
index 3fcb50b..f06be45 100644
--- a/lib/MC/LLVMBuild.txt
+++ b/lib/MC/LLVMBuild.txt

@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = MCAnalysis MCDisassembler MCParser
+subdirectories = MCDisassembler MCParser
 
 [component_0]
 type = Library

diff --git a/lib/MC/MCAnalysis/Android.mk b/lib/MC/MCAnalysis/Android.mk
deleted file mode 100644
index 27f848a..0000000
--- a/lib/MC/MCAnalysis/Android.mk
+++ /dev/null

@@ -1,37 +0,0 @@
-LOCAL_PATH:= $(call my-dir)
-
-mc_analysis_SRC_FILES := \
-  MCAtom.cpp \
-  MCFunction.cpp \
-  MCModule.cpp \
-  MCModuleYAML.cpp \
-  MCObjectDisassembler.cpp \
-  MCObjectSymbolizer.cpp
-
-# For the host
-# =====================================================
-include $(CLEAR_VARS)
-
-LOCAL_SRC_FILES := $(mc_analysis_SRC_FILES)
-
-LOCAL_MODULE:= libLLVMMCAnalysis
-
-LOCAL_MODULE_TAGS := optional
-
-include $(LLVM_HOST_BUILD_MK)
-include $(BUILD_HOST_STATIC_LIBRARY)
-
-# For the device
-# =====================================================
-include $(CLEAR_VARS)
-ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
-
-LOCAL_SRC_FILES := $(mc_analysis_SRC_FILES)
-
-LOCAL_MODULE:= libLLVMMCAnalysis
-
-LOCAL_MODULE_TAGS := optional
-
-include $(LLVM_DEVICE_BUILD_MK)
-include $(BUILD_STATIC_LIBRARY)
-endif

diff --git a/lib/MC/MCAnalysis/CMakeLists.txt b/lib/MC/MCAnalysis/CMakeLists.txt
deleted file mode 100644
index 81eae2d..0000000
--- a/lib/MC/MCAnalysis/CMakeLists.txt
+++ /dev/null

@@ -1,8 +0,0 @@
-add_llvm_library(LLVMMCAnalysis
- MCAtom.cpp
- MCFunction.cpp
- MCModule.cpp
- MCModuleYAML.cpp
- MCObjectDisassembler.cpp
- MCObjectSymbolizer.cpp
-)

diff --git a/lib/MC/MCAnalysis/LLVMBuild.txt b/lib/MC/MCAnalysis/LLVMBuild.txt
deleted file mode 100644
index 1b58fec..0000000
--- a/lib/MC/MCAnalysis/LLVMBuild.txt
+++ /dev/null

@@ -1,5 +0,0 @@
-[component_0]
-type = Library
-name = MCAnalysis
-parent = Libraries
-required_libraries = MC Object Support

diff --git a/lib/MC/MCAnalysis/MCAtom.cpp b/lib/MC/MCAnalysis/MCAtom.cpp
deleted file mode 100644
index 82056ee..0000000
--- a/lib/MC/MCAnalysis/MCAtom.cpp
+++ /dev/null

@@ -1,114 +0,0 @@
-//===- lib/MC/MCAtom.cpp - MCAtom implementation --------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCAnalysis/MCAtom.h"
-#include "llvm/MC/MCAnalysis/MCModule.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <iterator>
-
-using namespace llvm;
-
-// Pin the vtable to this file.
-void MCAtom::anchor() {}
-
-void MCAtom::remap(uint64_t NewBegin, uint64_t NewEnd) {
-  Parent->remap(this, NewBegin, NewEnd);
-}
-
-void MCAtom::remapForTruncate(uint64_t TruncPt) {
-  assert((TruncPt >= Begin && TruncPt < End) &&
-         "Truncation point not contained in atom!");
-  remap(Begin, TruncPt);
-}
-
-void MCAtom::remapForSplit(uint64_t SplitPt,
-                           uint64_t &LBegin, uint64_t &LEnd,
-                           uint64_t &RBegin, uint64_t &REnd) {
-  assert((SplitPt > Begin && SplitPt <= End) &&
-         "Splitting at point not contained in atom!");
-
-  // Compute the new begin/end points.
-  LBegin = Begin;
-  LEnd = SplitPt - 1;
-  RBegin = SplitPt;
-  REnd = End;
-
-  // Remap this atom to become the lower of the two new ones.
-  remap(LBegin, LEnd);
-}
-
-// MCDataAtom
-
-void MCDataAtom::addData(const MCData &D) {
-  Data.push_back(D);
-  if (Data.size() > End + 1 - Begin)
-    remap(Begin, End + 1);
-}
-
-void MCDataAtom::truncate(uint64_t TruncPt) {
-  remapForTruncate(TruncPt);
-
-  Data.resize(TruncPt - Begin + 1);
-}
-
-MCDataAtom *MCDataAtom::split(uint64_t SplitPt) {
-  uint64_t LBegin, LEnd, RBegin, REnd;
-  remapForSplit(SplitPt, LBegin, LEnd, RBegin, REnd);
-
-  MCDataAtom *RightAtom = Parent->createDataAtom(RBegin, REnd);
-  RightAtom->setName(getName());
-
-  std::vector<MCData>::iterator I = Data.begin() + (RBegin - LBegin);
-  assert(I != Data.end() && "Split point not found in range!");
-
-  std::copy(I, Data.end(), std::back_inserter(RightAtom->Data));
-  Data.erase(I, Data.end());
-  return RightAtom;
-}
-
-// MCTextAtom
-
-void MCTextAtom::addInst(const MCInst &I, uint64_t Size) {
-  if (NextInstAddress + Size - 1 > End)
-    remap(Begin, NextInstAddress + Size - 1);
-  Insts.push_back(MCDecodedInst(I, NextInstAddress, Size));
-  NextInstAddress += Size;
-}
-
-void MCTextAtom::truncate(uint64_t TruncPt) {
-  remapForTruncate(TruncPt);
-
-  InstListTy::iterator I = Insts.begin();
-  while (I != Insts.end() && I->Address <= TruncPt) ++I;
-
-  assert(I != Insts.end() && "Truncation point not found in disassembly!");
-  assert(I->Address == TruncPt + 1 &&
-         "Truncation point does not fall on instruction boundary");
-
-  Insts.erase(I, Insts.end());
-}
-
-MCTextAtom *MCTextAtom::split(uint64_t SplitPt) {
-  uint64_t LBegin, LEnd, RBegin, REnd;
-  remapForSplit(SplitPt, LBegin, LEnd, RBegin, REnd);
-
-  MCTextAtom *RightAtom = Parent->createTextAtom(RBegin, REnd);
-  RightAtom->setName(getName());
-
-  InstListTy::iterator I = Insts.begin();
-  while (I != Insts.end() && I->Address < SplitPt) ++I;
-  assert(I != Insts.end() && "Split point not found in disassembly!");
-  assert(I->Address == SplitPt &&
-         "Split point does not fall on instruction boundary!");
-
-  std::copy(I, Insts.end(), std::back_inserter(RightAtom->Insts));
-  Insts.erase(I, Insts.end());
-  Parent->splitBasicBlocksForAtom(this, RightAtom);
-  return RightAtom;
-}

diff --git a/lib/MC/MCAnalysis/MCFunction.cpp b/lib/MC/MCAnalysis/MCFunction.cpp
deleted file mode 100644
index 4e09d1a..0000000
--- a/lib/MC/MCAnalysis/MCFunction.cpp
+++ /dev/null

@@ -1,76 +0,0 @@
-//===-- lib/MC/MCFunction.cpp -----------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCAnalysis/MCFunction.h"
-#include "llvm/MC/MCAnalysis/MCAtom.h"
-#include "llvm/MC/MCAnalysis/MCModule.h"
-#include <algorithm>
-
-using namespace llvm;
-
-// MCFunction
-
-MCFunction::MCFunction(StringRef Name, MCModule *Parent)
-  : Name(Name), ParentModule(Parent)
-{}
-
-MCBasicBlock &MCFunction::createBlock(const MCTextAtom &TA) {
-  std::unique_ptr<MCBasicBlock> MCBB(new MCBasicBlock(TA, this));
-  Blocks.push_back(std::move(MCBB));
-  return *Blocks.back();
-}
-
-MCBasicBlock *MCFunction::find(uint64_t StartAddr) {
-  for (const_iterator I = begin(), E = end(); I != E; ++I)
-    if ((*I)->getInsts()->getBeginAddr() == StartAddr)
-      return I->get();
-  return nullptr;
-}
-
-const MCBasicBlock *MCFunction::find(uint64_t StartAddr) const {
-  return const_cast<MCFunction *>(this)->find(StartAddr);
-}
-
-// MCBasicBlock
-
-MCBasicBlock::MCBasicBlock(const MCTextAtom &Insts, MCFunction *Parent)
-  : Insts(&Insts), Parent(Parent) {
-  getParent()->getParent()->trackBBForAtom(&Insts, this);
-}
-
-void MCBasicBlock::addSuccessor(const MCBasicBlock *MCBB) {
-  if (!isSuccessor(MCBB))
-    Successors.push_back(MCBB);
-}
-
-bool MCBasicBlock::isSuccessor(const MCBasicBlock *MCBB) const {
-  return std::find(Successors.begin(), Successors.end(),
-                   MCBB) != Successors.end();
-}
-
-void MCBasicBlock::addPredecessor(const MCBasicBlock *MCBB) {
-  if (!isPredecessor(MCBB))
-    Predecessors.push_back(MCBB);
-}
-
-bool MCBasicBlock::isPredecessor(const MCBasicBlock *MCBB) const {
-  return std::find(Predecessors.begin(), Predecessors.end(),
-                   MCBB) != Predecessors.end();
-}
-
-void MCBasicBlock::splitBasicBlock(MCBasicBlock *SplitBB) {
-  assert(Insts->getEndAddr() + 1 == SplitBB->Insts->getBeginAddr() &&
-         "Splitting unrelated basic blocks!");
-  SplitBB->addPredecessor(this);
-  assert(SplitBB->Successors.empty() &&
-         "Split basic block shouldn't already have successors!");
-  SplitBB->Successors = Successors;
-  Successors.clear();
-  addSuccessor(SplitBB);
-}

diff --git a/lib/MC/MCAnalysis/MCModule.cpp b/lib/MC/MCAnalysis/MCModule.cpp
deleted file mode 100644
index 7512299..0000000
--- a/lib/MC/MCAnalysis/MCModule.cpp
+++ /dev/null

@@ -1,142 +0,0 @@
-//===- lib/MC/MCModule.cpp - MCModule implementation ----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCAnalysis/MCModule.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/MC/MCAnalysis/MCAtom.h"
-#include "llvm/MC/MCAnalysis/MCFunction.h"
-#include <algorithm>
-
-using namespace llvm;
-
-static bool AtomComp(const MCAtom *L, uint64_t Addr) {
-  return L->getEndAddr() < Addr;
-}
-
-static bool AtomCompInv(uint64_t Addr, const MCAtom *R) {
-  return Addr < R->getEndAddr();
-}
-
-void MCModule::map(MCAtom *NewAtom) {
-  uint64_t Begin = NewAtom->Begin;
-
-  assert(Begin <= NewAtom->End && "Creating MCAtom with endpoints reversed?");
-
-  // Check for atoms already covering this range.
-  AtomListTy::iterator I = std::lower_bound(atom_begin(), atom_end(),
-                                            Begin, AtomComp);
-  assert((I == atom_end() || (*I)->getBeginAddr() > NewAtom->End)
-         && "Offset range already occupied!");
-
-  // Insert the new atom to the list.
-  Atoms.insert(I, NewAtom);
-}
-
-MCTextAtom *MCModule::createTextAtom(uint64_t Begin, uint64_t End) {
-  MCTextAtom *NewAtom = new MCTextAtom(this, Begin, End);
-  map(NewAtom);
-  return NewAtom;
-}
-
-MCDataAtom *MCModule::createDataAtom(uint64_t Begin, uint64_t End) {
-  MCDataAtom *NewAtom = new MCDataAtom(this, Begin, End);
-  map(NewAtom);
-  return NewAtom;
-}
-
-// remap - Update the interval mapping for an atom.
-void MCModule::remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd) {
-  // Find and erase the old mapping.
-  AtomListTy::iterator I = std::lower_bound(atom_begin(), atom_end(),
-                                            Atom->Begin, AtomComp);
-  assert(I != atom_end() && "Atom offset not found in module!");
-  assert(*I == Atom && "Previous atom mapping was invalid!");
-  Atoms.erase(I);
-
-  // FIXME: special case NewBegin == Atom->Begin
-
-  // Insert the new mapping.
-  AtomListTy::iterator NewI = std::lower_bound(atom_begin(), atom_end(),
-                                               NewBegin, AtomComp);
-  assert((NewI == atom_end() || (*NewI)->getBeginAddr() > Atom->End)
-         && "Offset range already occupied!");
-  Atoms.insert(NewI, Atom);
-
-  // Update the atom internal bounds.
-  Atom->Begin = NewBegin;
-  Atom->End = NewEnd;
-}
-
-const MCAtom *MCModule::findAtomContaining(uint64_t Addr) const {
-  AtomListTy::const_iterator I = std::lower_bound(atom_begin(), atom_end(),
-                                                  Addr, AtomComp);
-  if (I != atom_end() && (*I)->getBeginAddr() <= Addr)
-    return *I;
-  return nullptr;
-}
-
-MCAtom *MCModule::findAtomContaining(uint64_t Addr) {
-  return const_cast<MCAtom*>(
-    const_cast<const MCModule *>(this)->findAtomContaining(Addr));
-}
-
-const MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) const {
-  AtomListTy::const_iterator I = std::upper_bound(atom_begin(), atom_end(),
-                                                  Addr, AtomCompInv);
-  if (I != atom_end())
-    return *I;
-  return nullptr;
-}
-
-MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) {
-  return const_cast<MCAtom*>(
-    const_cast<const MCModule *>(this)->findFirstAtomAfter(Addr));
-}
-
-MCFunction *MCModule::createFunction(StringRef Name) {
-  std::unique_ptr<MCFunction> MCF(new MCFunction(Name, this));
-  Functions.push_back(std::move(MCF));
-  return Functions.back().get();
-}
-
-static bool CompBBToAtom(MCBasicBlock *BB, const MCTextAtom *Atom) {
-  return BB->getInsts() < Atom;
-}
-
-void MCModule::splitBasicBlocksForAtom(const MCTextAtom *TA,
-                                       const MCTextAtom *NewTA) {
-  BBsByAtomTy::iterator
-    I = std::lower_bound(BBsByAtom.begin(), BBsByAtom.end(),
-                         TA, CompBBToAtom);
-  for (; I != BBsByAtom.end() && (*I)->getInsts() == TA; ++I) {
-    MCBasicBlock *BB = *I;
-    MCBasicBlock *NewBB = &BB->getParent()->createBlock(*NewTA);
-    BB->splitBasicBlock(NewBB);
-  }
-}
-
-void MCModule::trackBBForAtom(const MCTextAtom *Atom, MCBasicBlock *BB) {
-  assert(Atom == BB->getInsts() && "Text atom doesn't back the basic block!");
-  BBsByAtomTy::iterator I = std::lower_bound(BBsByAtom.begin(),
-                                             BBsByAtom.end(),
-                                             Atom, CompBBToAtom);
-  for (; I != BBsByAtom.end() && (*I)->getInsts() == Atom; ++I)
-    if (*I == BB)
-      return;
-  BBsByAtom.insert(I, BB);
-}
-
-MCModule::MCModule() : Entrypoint(0) { }
-
-MCModule::~MCModule() {
-  for (AtomListTy::iterator AI = atom_begin(),
-                            AE = atom_end();
-                            AI != AE; ++AI)
-    delete *AI;
-}

diff --git a/lib/MC/MCAnalysis/MCModuleYAML.cpp b/lib/MC/MCAnalysis/MCModuleYAML.cpp
deleted file mode 100644
index 876b06d..0000000
--- a/lib/MC/MCAnalysis/MCModuleYAML.cpp
+++ /dev/null

@@ -1,464 +0,0 @@
-//===- MCModuleYAML.cpp - MCModule YAMLIO implementation ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines classes for handling the YAML representation of MCModule.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCAnalysis/MCModuleYAML.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/MC/MCAnalysis/MCAtom.h"
-#include "llvm/MC/MCAnalysis/MCFunction.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/YAML.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/YAMLTraits.h"
-#include <vector>
-
-namespace llvm {
-
-namespace {
-
-// This class is used to map opcode and register names to enum values.
-//
-// There are at least 3 obvious ways to do this:
-// 1- Generate an MII/MRI method using a tablegen StringMatcher
-// 2- Write an MII/MRI method using std::lower_bound and the assumption that
-//    the enums are sorted (starting at a fixed value).
-// 3- Do the matching manually as is done here.
-//
-// Why 3?
-// 1- A StringMatcher function for thousands of entries would incur
-//    a non-negligible binary size overhead.
-// 2- The lower_bound comparators would be somewhat involved and aren't
-//    obviously reusable (see LessRecordRegister in llvm/TableGen/Record.h)
-// 3- This isn't actually something useful outside tests (but the same argument
-//    can be made against having {MII,MRI}::getName).
-//
-// If this becomes useful outside this specific situation, feel free to do
-// the Right Thing (tm) and move the functionality to MII/MRI.
-//
-class InstrRegInfoHolder {
-  typedef StringMap<unsigned, BumpPtrAllocator> EnumValByNameTy;
-  EnumValByNameTy InstEnumValueByName;
-  EnumValByNameTy RegEnumValueByName;
-
-public:
-  const MCInstrInfo &MII;
-  const MCRegisterInfo &MRI;
-  InstrRegInfoHolder(const MCInstrInfo &MII, const MCRegisterInfo &MRI)
-      : InstEnumValueByName(NextPowerOf2(MII.getNumOpcodes())),
-        RegEnumValueByName(NextPowerOf2(MRI.getNumRegs())), MII(MII), MRI(MRI) {
-    for (int i = 0, e = MII.getNumOpcodes(); i != e; ++i)
-      InstEnumValueByName[MII.getName(i)] = i;
-    for (int i = 0, e = MRI.getNumRegs(); i != e; ++i)
-      RegEnumValueByName[MRI.getName(i)] = i;
-  }
-
-  bool matchRegister(StringRef Name, unsigned &Reg) {
-    EnumValByNameTy::const_iterator It = RegEnumValueByName.find(Name);
-    if (It == RegEnumValueByName.end())
-      return false;
-    Reg = It->getValue();
-    return true;
-  }
-  bool matchOpcode(StringRef Name, unsigned &Opc) {
-    EnumValByNameTy::const_iterator It = InstEnumValueByName.find(Name);
-    if (It == InstEnumValueByName.end())
-      return false;
-    Opc = It->getValue();
-    return true;
-  }
-};
-
-} // end unnamed namespace
-
-namespace MCModuleYAML {
-
-LLVM_YAML_STRONG_TYPEDEF(unsigned, OpcodeEnum)
-
-struct Operand {
-  MCOperand MCOp;
-};
-
-struct Inst {
-  OpcodeEnum Opcode;
-  std::vector<Operand> Operands;
-  uint64_t Size;
-};
-
-struct Atom {
-  MCAtom::AtomKind Type;
-  yaml::Hex64 StartAddress;
-  uint64_t Size;
-
-  std::vector<Inst> Insts;
-  yaml::BinaryRef Data;
-};
-
-struct BasicBlock {
-  yaml::Hex64 Address;
-  std::vector<yaml::Hex64> Preds;
-  std::vector<yaml::Hex64> Succs;
-};
-
-struct Function {
-  StringRef Name;
-  std::vector<BasicBlock> BasicBlocks;
-};
-
-struct Module {
-  std::vector<Atom> Atoms;
-  std::vector<Function> Functions;
-};
-
-} // end namespace MCModuleYAML
-} // end namespace llvm
-
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::Hex64)
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::MCModuleYAML::Operand)
-LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Inst)
-LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Atom)
-LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::BasicBlock)
-LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Function)
-
-namespace llvm {
-
-namespace yaml {
-
-template <> struct ScalarEnumerationTraits<MCAtom::AtomKind> {
-  static void enumeration(IO &IO, MCAtom::AtomKind &Kind);
-};
-
-template <> struct MappingTraits<MCModuleYAML::Atom> {
-  static void mapping(IO &IO, MCModuleYAML::Atom &A);
-};
-
-template <> struct MappingTraits<MCModuleYAML::Inst> {
-  static void mapping(IO &IO, MCModuleYAML::Inst &I);
-};
-
-template <> struct MappingTraits<MCModuleYAML::BasicBlock> {
-  static void mapping(IO &IO, MCModuleYAML::BasicBlock &BB);
-};
-
-template <> struct MappingTraits<MCModuleYAML::Function> {
-  static void mapping(IO &IO, MCModuleYAML::Function &Fn);
-};
-
-template <> struct MappingTraits<MCModuleYAML::Module> {
-  static void mapping(IO &IO, MCModuleYAML::Module &M);
-};
-
-template <> struct ScalarTraits<MCModuleYAML::Operand> {
-  static void output(const MCModuleYAML::Operand &, void *,
-                     llvm::raw_ostream &);
-  static StringRef input(StringRef, void *, MCModuleYAML::Operand &);
-  static bool mustQuote(StringRef) { return false; }
-};
-
-template <> struct ScalarTraits<MCModuleYAML::OpcodeEnum> {
-  static void output(const MCModuleYAML::OpcodeEnum &, void *,
-                     llvm::raw_ostream &);
-  static StringRef input(StringRef, void *, MCModuleYAML::OpcodeEnum &);
-  static bool mustQuote(StringRef) { return false; }
-};
-
-void ScalarEnumerationTraits<MCAtom::AtomKind>::enumeration(
-    IO &IO, MCAtom::AtomKind &Value) {
-  IO.enumCase(Value, "Text", MCAtom::TextAtom);
-  IO.enumCase(Value, "Data", MCAtom::DataAtom);
-}
-
-void MappingTraits<MCModuleYAML::Atom>::mapping(IO &IO, MCModuleYAML::Atom &A) {
-  IO.mapRequired("StartAddress", A.StartAddress);
-  IO.mapRequired("Size", A.Size);
-  IO.mapRequired("Type", A.Type);
-  if (A.Type == MCAtom::TextAtom)
-    IO.mapRequired("Content", A.Insts);
-  else if (A.Type == MCAtom::DataAtom)
-    IO.mapRequired("Content", A.Data);
-}
-
-void MappingTraits<MCModuleYAML::Inst>::mapping(IO &IO, MCModuleYAML::Inst &I) {
-  IO.mapRequired("Inst", I.Opcode);
-  IO.mapRequired("Size", I.Size);
-  IO.mapRequired("Ops", I.Operands);
-}
-
-void
-MappingTraits<MCModuleYAML::BasicBlock>::mapping(IO &IO,
-                                                 MCModuleYAML::BasicBlock &BB) {
-  IO.mapRequired("Address", BB.Address);
-  IO.mapRequired("Preds", BB.Preds);
-  IO.mapRequired("Succs", BB.Succs);
-}
-
-void MappingTraits<MCModuleYAML::Function>::mapping(IO &IO,
-                                                    MCModuleYAML::Function &F) {
-  IO.mapRequired("Name", F.Name);
-  IO.mapRequired("BasicBlocks", F.BasicBlocks);
-}
-
-void MappingTraits<MCModuleYAML::Module>::mapping(IO &IO,
-                                                  MCModuleYAML::Module &M) {
-  IO.mapRequired("Atoms", M.Atoms);
-  IO.mapOptional("Functions", M.Functions);
-}
-
-void
-ScalarTraits<MCModuleYAML::Operand>::output(const MCModuleYAML::Operand &Val,
-                                            void *Ctx, raw_ostream &Out) {
-  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
-
-  // FIXME: Doesn't support FPImm and expr/inst, but do these make sense?
-  if (Val.MCOp.isImm())
-    Out << "I" << Val.MCOp.getImm();
-  else if (Val.MCOp.isReg())
-    Out << "R" << IRI->MRI.getName(Val.MCOp.getReg());
-  else
-    llvm_unreachable("Trying to output invalid MCOperand!");
-}
-
-StringRef
-ScalarTraits<MCModuleYAML::Operand>::input(StringRef Scalar, void *Ctx,
-                                           MCModuleYAML::Operand &Val) {
-  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
-  char Type = 0;
-  if (Scalar.size() >= 1)
-    Type = Scalar.front();
-  if (Type != 'R' && Type != 'I')
-    return "Operand must start with 'R' (register) or 'I' (immediate).";
-  if (Type == 'R') {
-    unsigned Reg;
-    if (!IRI->matchRegister(Scalar.substr(1), Reg))
-      return "Invalid register name.";
-    Val.MCOp = MCOperand::CreateReg(Reg);
-  } else if (Type == 'I') {
-    int64_t RIVal;
-    if (Scalar.substr(1).getAsInteger(10, RIVal))
-      return "Invalid immediate value.";
-    Val.MCOp = MCOperand::CreateImm(RIVal);
-  } else {
-    Val.MCOp = MCOperand();
-  }
-  return StringRef();
-}
-
-void ScalarTraits<MCModuleYAML::OpcodeEnum>::output(
-    const MCModuleYAML::OpcodeEnum &Val, void *Ctx, raw_ostream &Out) {
-  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
-  Out << IRI->MII.getName(Val);
-}
-
-StringRef
-ScalarTraits<MCModuleYAML::OpcodeEnum>::input(StringRef Scalar, void *Ctx,
-                                              MCModuleYAML::OpcodeEnum &Val) {
-  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
-  unsigned Opc;
-  if (!IRI->matchOpcode(Scalar, Opc))
-    return "Invalid instruction opcode.";
-  Val = Opc;
-  return "";
-}
-
-} // end namespace yaml
-
-namespace {
-
-class MCModule2YAML {
-  const MCModule &MCM;
-  MCModuleYAML::Module YAMLModule;
-  void dumpAtom(const MCAtom *MCA);
-  void dumpFunction(const MCFunction &MCF);
-  void dumpBasicBlock(const MCBasicBlock *MCBB);
-
-public:
-  MCModule2YAML(const MCModule &MCM);
-  MCModuleYAML::Module &getYAMLModule();
-};
-
-class YAML2MCModule {
-  MCModule &MCM;
-
-public:
-  YAML2MCModule(MCModule &MCM);
-  StringRef parse(const MCModuleYAML::Module &YAMLModule);
-};
-
-} // end unnamed namespace
-
-MCModule2YAML::MCModule2YAML(const MCModule &MCM) : MCM(MCM), YAMLModule() {
-  for (MCModule::const_atom_iterator AI = MCM.atom_begin(), AE = MCM.atom_end();
-       AI != AE; ++AI)
-    dumpAtom(*AI);
-  for (MCModule::const_func_iterator FI = MCM.func_begin(), FE = MCM.func_end();
-       FI != FE; ++FI)
-    dumpFunction(**FI);
-}
-
-void MCModule2YAML::dumpAtom(const MCAtom *MCA) {
-  YAMLModule.Atoms.resize(YAMLModule.Atoms.size() + 1);
-  MCModuleYAML::Atom &A = YAMLModule.Atoms.back();
-  A.Type = MCA->getKind();
-  A.StartAddress = MCA->getBeginAddr();
-  A.Size = MCA->getEndAddr() - MCA->getBeginAddr() + 1;
-  if (const MCTextAtom *TA = dyn_cast<MCTextAtom>(MCA)) {
-    const size_t InstCount = TA->size();
-    A.Insts.resize(InstCount);
-    for (size_t i = 0; i != InstCount; ++i) {
-      const MCDecodedInst &MCDI = TA->at(i);
-      A.Insts[i].Opcode = MCDI.Inst.getOpcode();
-      A.Insts[i].Size = MCDI.Size;
-      const unsigned OpCount = MCDI.Inst.getNumOperands();
-      A.Insts[i].Operands.resize(OpCount);
-      for (unsigned oi = 0; oi != OpCount; ++oi)
-        A.Insts[i].Operands[oi].MCOp = MCDI.Inst.getOperand(oi);
-    }
-  } else if (const MCDataAtom *DA = dyn_cast<MCDataAtom>(MCA)) {
-    A.Data = DA->getData();
-  } else {
-    llvm_unreachable("Unknown atom type.");
-  }
-}
-
-void MCModule2YAML::dumpFunction(const MCFunction &MCF) {
-  YAMLModule.Functions.resize(YAMLModule.Functions.size() + 1);
-  MCModuleYAML::Function &F = YAMLModule.Functions.back();
-  F.Name = MCF.getName();
-  for (MCFunction::const_iterator BBI = MCF.begin(), BBE = MCF.end();
-       BBI != BBE; ++BBI) {
-    const MCBasicBlock &MCBB = **BBI;
-    F.BasicBlocks.resize(F.BasicBlocks.size() + 1);
-    MCModuleYAML::BasicBlock &BB = F.BasicBlocks.back();
-    BB.Address = MCBB.getInsts()->getBeginAddr();
-    for (MCBasicBlock::pred_const_iterator PI = MCBB.pred_begin(),
-                                           PE = MCBB.pred_end();
-         PI != PE; ++PI)
-      BB.Preds.push_back((*PI)->getInsts()->getBeginAddr());
-    for (MCBasicBlock::succ_const_iterator SI = MCBB.succ_begin(),
-                                           SE = MCBB.succ_end();
-         SI != SE; ++SI)
-      BB.Succs.push_back((*SI)->getInsts()->getBeginAddr());
-  }
-}
-
-MCModuleYAML::Module &MCModule2YAML::getYAMLModule() { return YAMLModule; }
-
-YAML2MCModule::YAML2MCModule(MCModule &MCM) : MCM(MCM) {}
-
-StringRef YAML2MCModule::parse(const MCModuleYAML::Module &YAMLModule) {
-  typedef std::vector<MCModuleYAML::Atom>::const_iterator AtomIt;
-  typedef std::vector<MCModuleYAML::Inst>::const_iterator InstIt;
-  typedef std::vector<MCModuleYAML::Operand>::const_iterator OpIt;
-
-  typedef DenseMap<uint64_t, MCTextAtom *> AddrToTextAtomTy;
-  AddrToTextAtomTy TAByAddr;
-
-  for (AtomIt AI = YAMLModule.Atoms.begin(), AE = YAMLModule.Atoms.end();
-       AI != AE; ++AI) {
-    uint64_t StartAddress = AI->StartAddress;
-    if (AI->Size == 0)
-      return "Atoms can't be empty!";
-    uint64_t EndAddress = StartAddress + AI->Size - 1;
-    switch (AI->Type) {
-    case MCAtom::TextAtom: {
-      MCTextAtom *TA = MCM.createTextAtom(StartAddress, EndAddress);
-      TAByAddr[StartAddress] = TA;
-      for (InstIt II = AI->Insts.begin(), IE = AI->Insts.end(); II != IE;
-           ++II) {
-        MCInst MI;
-        MI.setOpcode(II->Opcode);
-        for (OpIt OI = II->Operands.begin(), OE = II->Operands.end(); OI != OE;
-             ++OI)
-          MI.addOperand(OI->MCOp);
-        TA->addInst(MI, II->Size);
-      }
-      break;
-    }
-    case MCAtom::DataAtom: {
-      MCDataAtom *DA = MCM.createDataAtom(StartAddress, EndAddress);
-      SmallVector<char, 64> Data;
-      raw_svector_ostream OS(Data);
-      AI->Data.writeAsBinary(OS);
-      OS.flush();
-      for (size_t i = 0, e = Data.size(); i != e; ++i)
-        DA->addData((uint8_t)Data[i]);
-      break;
-    }
-    }
-  }
-
-  typedef std::vector<MCModuleYAML::Function>::const_iterator FuncIt;
-  typedef std::vector<MCModuleYAML::BasicBlock>::const_iterator BBIt;
-  typedef std::vector<yaml::Hex64>::const_iterator AddrIt;
-  for (FuncIt FI = YAMLModule.Functions.begin(),
-              FE = YAMLModule.Functions.end();
-       FI != FE; ++FI) {
-    MCFunction *MCFN = MCM.createFunction(FI->Name);
-    for (BBIt BBI = FI->BasicBlocks.begin(), BBE = FI->BasicBlocks.end();
-         BBI != BBE; ++BBI) {
-      AddrToTextAtomTy::const_iterator It = TAByAddr.find(BBI->Address);
-      if (It == TAByAddr.end())
-        return "Basic block start address doesn't match any text atom!";
-      MCFN->createBlock(*It->second);
-    }
-    for (BBIt BBI = FI->BasicBlocks.begin(), BBE = FI->BasicBlocks.end();
-         BBI != BBE; ++BBI) {
-      MCBasicBlock *MCBB = MCFN->find(BBI->Address);
-      if (!MCBB)
-        return "Couldn't find matching basic block in function.";
-      for (AddrIt PI = BBI->Preds.begin(), PE = BBI->Preds.end(); PI != PE;
-           ++PI) {
-        MCBasicBlock *Pred = MCFN->find(*PI);
-        if (!Pred)
-          return "Couldn't find predecessor basic block.";
-        MCBB->addPredecessor(Pred);
-      }
-      for (AddrIt SI = BBI->Succs.begin(), SE = BBI->Succs.end(); SI != SE;
-           ++SI) {
-        MCBasicBlock *Succ = MCFN->find(*SI);
-        if (!Succ)
-          return "Couldn't find predecessor basic block.";
-        MCBB->addSuccessor(Succ);
-      }
-    }
-  }
-  return "";
-}
-
-StringRef mcmodule2yaml(raw_ostream &OS, const MCModule &MCM,
-                        const MCInstrInfo &MII, const MCRegisterInfo &MRI) {
-  MCModule2YAML Dumper(MCM);
-  InstrRegInfoHolder IRI(MII, MRI);
-  yaml::Output YOut(OS, (void *)&IRI);
-  YOut << Dumper.getYAMLModule();
-  return "";
-}
-
-StringRef yaml2mcmodule(std::unique_ptr<MCModule> &MCM, StringRef YamlContent,
-                        const MCInstrInfo &MII, const MCRegisterInfo &MRI) {
-  MCM.reset(new MCModule);
-  YAML2MCModule Parser(*MCM);
-  MCModuleYAML::Module YAMLModule;
-  InstrRegInfoHolder IRI(MII, MRI);
-  yaml::Input YIn(YamlContent, (void *)&IRI);
-  YIn >> YAMLModule;
-  if (std::error_code ec = YIn.error())
-    return ec.message();
-  StringRef err = Parser.parse(YAMLModule);
-  if (!err.empty())
-    return err;
-  return "";
-}
-
-} // end namespace llvm

diff --git a/lib/MC/MCAnalysis/MCObjectDisassembler.cpp b/lib/MC/MCAnalysis/MCObjectDisassembler.cpp
deleted file mode 100644
index 0f789ff..0000000
--- a/lib/MC/MCAnalysis/MCObjectDisassembler.cpp
+++ /dev/null

@@ -1,574 +0,0 @@
-//===- lib/MC/MCObjectDisassembler.cpp ------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCObjectDisassembler.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAnalysis/MCAtom.h"
-#include "llvm/MC/MCAnalysis/MCFunction.h"
-#include "llvm/MC/MCAnalysis/MCModule.h"
-#include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCInstrAnalysis.h"
-#include "llvm/MC/MCObjectSymbolizer.h"
-#include "llvm/Object/MachO.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MachO.h"
-#include "llvm/Support/MemoryObject.h"
-#include "llvm/Support/StringRefMemoryObject.h"
-#include "llvm/Support/raw_ostream.h"
-#include <map>
-
-using namespace llvm;
-using namespace object;
-
-#define DEBUG_TYPE "mc"
-
-MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj,
-                                           const MCDisassembler &Dis,
-                                           const MCInstrAnalysis &MIA)
-    : Obj(Obj), Dis(Dis), MIA(MIA), MOS(nullptr) {}
-
-uint64_t MCObjectDisassembler::getEntrypoint() {
-  for (const SymbolRef &Symbol : Obj.symbols()) {
-    StringRef Name;
-    Symbol.getName(Name);
-    if (Name == "main" || Name == "_main") {
-      uint64_t Entrypoint;
-      Symbol.getAddress(Entrypoint);
-      return getEffectiveLoadAddr(Entrypoint);
-    }
-  }
-  return 0;
-}
-
-ArrayRef<uint64_t> MCObjectDisassembler::getStaticInitFunctions() {
-  return ArrayRef<uint64_t>();
-}
-
-ArrayRef<uint64_t> MCObjectDisassembler::getStaticExitFunctions() {
-  return ArrayRef<uint64_t>();
-}
-
-MemoryObject *MCObjectDisassembler::getRegionFor(uint64_t Addr) {
-  // FIXME: Keep track of object sections.
-  return FallbackRegion.get();
-}
-
-uint64_t MCObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
-  return Addr;
-}
-
-uint64_t MCObjectDisassembler::getOriginalLoadAddr(uint64_t Addr) {
-  return Addr;
-}
-
-MCModule *MCObjectDisassembler::buildEmptyModule() {
-  MCModule *Module = new MCModule;
-  Module->Entrypoint = getEntrypoint();
-  return Module;
-}
-
-MCModule *MCObjectDisassembler::buildModule(bool withCFG) {
-  MCModule *Module = buildEmptyModule();
-
-  buildSectionAtoms(Module);
-  if (withCFG)
-    buildCFG(Module);
-  return Module;
-}
-
-void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
-  for (const SectionRef &Section : Obj.sections()) {
-    bool isText;
-    Section.isText(isText);
-    bool isData;
-    Section.isData(isData);
-    if (!isData && !isText)
-      continue;
-
-    uint64_t StartAddr;
-    Section.getAddress(StartAddr);
-    uint64_t SecSize;
-    Section.getSize(SecSize);
-    if (StartAddr == UnknownAddressOrSize || SecSize == UnknownAddressOrSize)
-      continue;
-    StartAddr = getEffectiveLoadAddr(StartAddr);
-
-    StringRef Contents;
-    Section.getContents(Contents);
-    StringRefMemoryObject memoryObject(Contents, StartAddr);
-
-    // We don't care about things like non-file-backed sections yet.
-    if (Contents.size() != SecSize || !SecSize)
-      continue;
-    uint64_t EndAddr = StartAddr + SecSize - 1;
-
-    StringRef SecName;
-    Section.getName(SecName);
-
-    if (isText) {
-      MCTextAtom *Text = nullptr;
-      MCDataAtom *InvalidData = nullptr;
-
-      uint64_t InstSize;
-      for (uint64_t Index = 0; Index < SecSize; Index += InstSize) {
-        const uint64_t CurAddr = StartAddr + Index;
-        MCInst Inst;
-        if (Dis.getInstruction(Inst, InstSize, memoryObject, CurAddr, nulls(),
-                               nulls())) {
-          if (!Text) {
-            Text = Module->createTextAtom(CurAddr, CurAddr);
-            Text->setName(SecName);
-          }
-          Text->addInst(Inst, InstSize);
-          InvalidData = nullptr;
-        } else {
-          assert(InstSize && "getInstruction() consumed no bytes");
-          if (!InvalidData) {
-            Text = nullptr;
-            InvalidData = Module->createDataAtom(CurAddr, CurAddr+InstSize - 1);
-          }
-          for (uint64_t I = 0; I < InstSize; ++I)
-            InvalidData->addData(Contents[Index+I]);
-        }
-      }
-    } else {
-      MCDataAtom *Data = Module->createDataAtom(StartAddr, EndAddr);
-      Data->setName(SecName);
-      for (uint64_t Index = 0; Index < SecSize; ++Index)
-        Data->addData(Contents[Index]);
-    }
-  }
-}
-
-namespace {
-  struct BBInfo;
-  typedef SmallPtrSet<BBInfo*, 2> BBInfoSetTy;
-
-  struct BBInfo {
-    MCTextAtom *Atom;
-    MCBasicBlock *BB;
-    BBInfoSetTy Succs;
-    BBInfoSetTy Preds;
-    MCObjectDisassembler::AddressSetTy SuccAddrs;
-
-    BBInfo() : Atom(nullptr), BB(nullptr) {}
-
-    void addSucc(BBInfo &Succ) {
-      Succs.insert(&Succ);
-      Succ.Preds.insert(this);
-    }
-  };
-}
-
-static void RemoveDupsFromAddressVector(MCObjectDisassembler::AddressSetTy &V) {
-  std::sort(V.begin(), V.end());
-  V.erase(std::unique(V.begin(), V.end()), V.end());
-}
-
-void MCObjectDisassembler::buildCFG(MCModule *Module) {
-  typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
-  BBInfoByAddrTy BBInfos;
-  AddressSetTy Splits;
-  AddressSetTy Calls;
-
-  for (const SymbolRef &Symbol : Obj.symbols()) {
-    SymbolRef::Type SymType;
-    Symbol.getType(SymType);
-    if (SymType == SymbolRef::ST_Function) {
-      uint64_t SymAddr;
-      Symbol.getAddress(SymAddr);
-      SymAddr = getEffectiveLoadAddr(SymAddr);
-      Calls.push_back(SymAddr);
-      Splits.push_back(SymAddr);
-    }
-  }
-
-  assert(Module->func_begin() == Module->func_end()
-         && "Module already has a CFG!");
-
-  // First, determine the basic block boundaries and call targets.
-  for (MCModule::atom_iterator AI = Module->atom_begin(),
-                               AE = Module->atom_end();
-       AI != AE; ++AI) {
-    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
-    if (!TA) continue;
-    Calls.push_back(TA->getBeginAddr());
-    BBInfos[TA->getBeginAddr()].Atom = TA;
-    for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
-         II != IE; ++II) {
-      if (MIA.isTerminator(II->Inst))
-        Splits.push_back(II->Address + II->Size);
-      uint64_t Target;
-      if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) {
-        if (MIA.isCall(II->Inst))
-          Calls.push_back(Target);
-        Splits.push_back(Target);
-      }
-    }
-  }
-
-  RemoveDupsFromAddressVector(Splits);
-  RemoveDupsFromAddressVector(Calls);
-
-  // Split text atoms into basic block atoms.
-  for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end();
-       SI != SE; ++SI) {
-    MCAtom *A = Module->findAtomContaining(*SI);
-    if (!A) continue;
-    MCTextAtom *TA = cast<MCTextAtom>(A);
-    if (TA->getBeginAddr() == *SI)
-      continue;
-    MCTextAtom *NewAtom = TA->split(*SI);
-    BBInfos[NewAtom->getBeginAddr()].Atom = NewAtom;
-    StringRef BBName = TA->getName();
-    BBName = BBName.substr(0, BBName.find_last_of(':'));
-    NewAtom->setName((BBName + ":" + utohexstr(*SI)).str());
-  }
-
-  // Compute succs/preds.
-  for (MCModule::atom_iterator AI = Module->atom_begin(),
-                               AE = Module->atom_end();
-                               AI != AE; ++AI) {
-    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
-    if (!TA) continue;
-    BBInfo &CurBB = BBInfos[TA->getBeginAddr()];
-    const MCDecodedInst &LI = TA->back();
-    if (MIA.isBranch(LI.Inst)) {
-      uint64_t Target;
-      if (MIA.evaluateBranch(LI.Inst, LI.Address, LI.Size, Target))
-        CurBB.addSucc(BBInfos[Target]);
-      if (MIA.isConditionalBranch(LI.Inst))
-        CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
-    } else if (!MIA.isTerminator(LI.Inst))
-      CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
-  }
-
-
-  // Create functions and basic blocks.
-  for (AddressSetTy::const_iterator CI = Calls.begin(), CE = Calls.end();
-       CI != CE; ++CI) {
-    BBInfo &BBI = BBInfos[*CI];
-    if (!BBI.Atom) continue;
-
-    MCFunction &MCFN = *Module->createFunction(BBI.Atom->getName());
-
-    // Create MCBBs.
-    SmallSetVector<BBInfo*, 16> Worklist;
-    Worklist.insert(&BBI);
-    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
-      BBInfo *BBI = Worklist[wi];
-      if (!BBI->Atom)
-        continue;
-      BBI->BB = &MCFN.createBlock(*BBI->Atom);
-      // Add all predecessors and successors to the worklist.
-      for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
-                                 SI != SE; ++SI)
-        Worklist.insert(*SI);
-      for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
-                                 PI != PE; ++PI)
-        Worklist.insert(*PI);
-    }
-
-    // Set preds/succs.
-    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
-      BBInfo *BBI = Worklist[wi];
-      MCBasicBlock *MCBB = BBI->BB;
-      if (!MCBB)
-        continue;
-      for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
-           SI != SE; ++SI)
-        if ((*SI)->BB)
-          MCBB->addSuccessor((*SI)->BB);
-      for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
-           PI != PE; ++PI)
-        if ((*PI)->BB)
-          MCBB->addPredecessor((*PI)->BB);
-    }
-  }
-}
-
-// Basic idea of the disassembly + discovery:
-//
-// start with the wanted address, insert it in the worklist
-// while worklist not empty, take next address in the worklist:
-// - check if atom exists there
-//   - if middle of atom:
-//     - split basic blocks referencing the atom
-//     - look for an already encountered BBInfo (using a map<atom, bbinfo>)
-//       - if there is, split it (new one, fallthrough, move succs, etc..)
-//   - if start of atom: nothing else to do
-//   - if no atom: create new atom and new bbinfo
-// - look at the last instruction in the atom, add succs to worklist
-// for all elements in the worklist:
-// - create basic block, update preds/succs, etc..
-//
-MCBasicBlock *MCObjectDisassembler::getBBAt(MCModule *Module, MCFunction *MCFN,
-                                            uint64_t BBBeginAddr,
-                                            AddressSetTy &CallTargets,
-                                            AddressSetTy &TailCallTargets) {
-  typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
-  typedef SmallSetVector<uint64_t, 16> AddrWorklistTy;
-  BBInfoByAddrTy BBInfos;
-  AddrWorklistTy Worklist;
-
-  Worklist.insert(BBBeginAddr);
-  for (size_t wi = 0; wi < Worklist.size(); ++wi) {
-    const uint64_t BeginAddr = Worklist[wi];
-    BBInfo *BBI = &BBInfos[BeginAddr];
-
-    MCTextAtom *&TA = BBI->Atom;
-    assert(!TA && "Discovered basic block already has an associated atom!");
-
-    // Look for an atom at BeginAddr.
-    if (MCAtom *A = Module->findAtomContaining(BeginAddr)) {
-      // FIXME: We don't care about mixed atoms, see above.
-      TA = cast<MCTextAtom>(A);
-
-      // The found atom doesn't begin at BeginAddr, we have to split it.
-      if (TA->getBeginAddr() != BeginAddr) {
-        // FIXME: Handle overlapping atoms: middle-starting instructions, etc..
-        MCTextAtom *NewTA = TA->split(BeginAddr);
-
-        // Look for an already encountered basic block that needs splitting
-        BBInfoByAddrTy::iterator It = BBInfos.find(TA->getBeginAddr());
-        if (It != BBInfos.end() && It->second.Atom) {
-          BBI->SuccAddrs = It->second.SuccAddrs;
-          It->second.SuccAddrs.clear();
-          It->second.SuccAddrs.push_back(BeginAddr);
-        }
-        TA = NewTA;
-      }
-      BBI->Atom = TA;
-    } else {
-      // If we didn't find an atom, then we have to disassemble to create one!
-
-      MemoryObject *Region = getRegionFor(BeginAddr);
-      if (!Region)
-        llvm_unreachable(("Couldn't find suitable region for disassembly at " +
-                          utostr(BeginAddr)).c_str());
-
-      uint64_t InstSize;
-      uint64_t EndAddr = Region->getBase() + Region->getExtent();
-
-      // We want to stop before the next atom and have a fallthrough to it.
-      if (MCTextAtom *NextAtom =
-              cast_or_null<MCTextAtom>(Module->findFirstAtomAfter(BeginAddr)))
-        EndAddr = std::min(EndAddr, NextAtom->getBeginAddr());
-
-      for (uint64_t Addr = BeginAddr; Addr < EndAddr; Addr += InstSize) {
-        MCInst Inst;
-        if (Dis.getInstruction(Inst, InstSize, *Region, Addr, nulls(),
-                               nulls())) {
-          if (!TA)
-            TA = Module->createTextAtom(Addr, Addr);
-          TA->addInst(Inst, InstSize);
-        } else {
-          // We don't care about splitting mixed atoms either.
-          llvm_unreachable("Couldn't disassemble instruction in atom.");
-        }
-
-        uint64_t BranchTarget;
-        if (MIA.evaluateBranch(Inst, Addr, InstSize, BranchTarget)) {
-          if (MIA.isCall(Inst))
-            CallTargets.push_back(BranchTarget);
-        }
-
-        if (MIA.isTerminator(Inst))
-          break;
-      }
-      BBI->Atom = TA;
-    }
-
-    assert(TA && "Couldn't disassemble atom, none was created!");
-    assert(TA->begin() != TA->end() && "Empty atom!");
-
-    MemoryObject *Region = getRegionFor(TA->getBeginAddr());
-    assert(Region && "Couldn't find region for already disassembled code!");
-    uint64_t EndRegion = Region->getBase() + Region->getExtent();
-
-    // Now we have a basic block atom, add successors.
-    // Add the fallthrough block.
-    if ((MIA.isConditionalBranch(TA->back().Inst) ||
-         !MIA.isTerminator(TA->back().Inst)) &&
-        (TA->getEndAddr() + 1 < EndRegion)) {
-      BBI->SuccAddrs.push_back(TA->getEndAddr() + 1);
-      Worklist.insert(TA->getEndAddr() + 1);
-    }
-
-    // If the terminator is a branch, add the target block.
-    if (MIA.isBranch(TA->back().Inst)) {
-      uint64_t BranchTarget;
-      if (MIA.evaluateBranch(TA->back().Inst, TA->back().Address,
-                             TA->back().Size, BranchTarget)) {
-        StringRef ExtFnName;
-        if (MOS)
-          ExtFnName =
-              MOS->findExternalFunctionAt(getOriginalLoadAddr(BranchTarget));
-        if (!ExtFnName.empty()) {
-          TailCallTargets.push_back(BranchTarget);
-          CallTargets.push_back(BranchTarget);
-        } else {
-          BBI->SuccAddrs.push_back(BranchTarget);
-          Worklist.insert(BranchTarget);
-        }
-      }
-    }
-  }
-
-  for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
-    const uint64_t BeginAddr = Worklist[wi];
-    BBInfo *BBI = &BBInfos[BeginAddr];
-
-    assert(BBI->Atom && "Found a basic block without an associated atom!");
-
-    // Look for a basic block at BeginAddr.
-    BBI->BB = MCFN->find(BeginAddr);
-    if (BBI->BB) {
-      // FIXME: check that the succs/preds are the same
-      continue;
-    }
-    // If there was none, we have to create one from the atom.
-    BBI->BB = &MCFN->createBlock(*BBI->Atom);
-  }
-
-  for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
-    const uint64_t BeginAddr = Worklist[wi];
-    BBInfo *BBI = &BBInfos[BeginAddr];
-    MCBasicBlock *BB = BBI->BB;
-
-    RemoveDupsFromAddressVector(BBI->SuccAddrs);
-    for (AddressSetTy::const_iterator SI = BBI->SuccAddrs.begin(),
-         SE = BBI->SuccAddrs.end();
-         SE != SE; ++SI) {
-      MCBasicBlock *Succ = BBInfos[*SI].BB;
-      BB->addSuccessor(Succ);
-      Succ->addPredecessor(BB);
-    }
-  }
-
-  assert(BBInfos[Worklist[0]].BB &&
-         "No basic block created at requested address?");
-
-  return BBInfos[Worklist[0]].BB;
-}
-
-MCFunction *
-MCObjectDisassembler::createFunction(MCModule *Module, uint64_t BeginAddr,
-                                     AddressSetTy &CallTargets,
-                                     AddressSetTy &TailCallTargets) {
-  // First, check if this is an external function.
-  StringRef ExtFnName;
-  if (MOS)
-    ExtFnName = MOS->findExternalFunctionAt(getOriginalLoadAddr(BeginAddr));
-  if (!ExtFnName.empty())
-    return Module->createFunction(ExtFnName);
-
-  // If it's not, look for an existing function.
-  for (MCModule::func_iterator FI = Module->func_begin(),
-                               FE = Module->func_end();
-       FI != FE; ++FI) {
-    if ((*FI)->empty())
-      continue;
-    // FIXME: MCModule should provide a findFunctionByAddr()
-    if ((*FI)->getEntryBlock()->getInsts()->getBeginAddr() == BeginAddr)
-      return FI->get();
-  }
-
-  // Finally, just create a new one.
-  MCFunction *MCFN = Module->createFunction("");
-  getBBAt(Module, MCFN, BeginAddr, CallTargets, TailCallTargets);
-  return MCFN;
-}
-
-// MachO MCObjectDisassembler implementation.
-
-MCMachOObjectDisassembler::MCMachOObjectDisassembler(
-    const MachOObjectFile &MOOF, const MCDisassembler &Dis,
-    const MCInstrAnalysis &MIA, uint64_t VMAddrSlide,
-    uint64_t HeaderLoadAddress)
-    : MCObjectDisassembler(MOOF, Dis, MIA), MOOF(MOOF),
-      VMAddrSlide(VMAddrSlide), HeaderLoadAddress(HeaderLoadAddress) {
-
-  for (const SectionRef &Section : MOOF.sections()) {
-    StringRef Name;
-    Section.getName(Name);
-    // FIXME: We should use the S_ section type instead of the name.
-    if (Name == "__mod_init_func") {
-      DEBUG(dbgs() << "Found __mod_init_func section!\n");
-      Section.getContents(ModInitContents);
-    } else if (Name == "__mod_exit_func") {
-      DEBUG(dbgs() << "Found __mod_exit_func section!\n");
-      Section.getContents(ModExitContents);
-    }
-  }
-}
-
-// FIXME: Only do the translations for addresses actually inside the object.
-uint64_t MCMachOObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
-  return Addr + VMAddrSlide;
-}
-
-uint64_t
-MCMachOObjectDisassembler::getOriginalLoadAddr(uint64_t EffectiveAddr) {
-  return EffectiveAddr - VMAddrSlide;
-}
-
-uint64_t MCMachOObjectDisassembler::getEntrypoint() {
-  uint64_t EntryFileOffset = 0;
-
-  // Look for LC_MAIN.
-  {
-    uint32_t LoadCommandCount = MOOF.getHeader().ncmds;
-    MachOObjectFile::LoadCommandInfo Load = MOOF.getFirstLoadCommandInfo();
-    for (unsigned I = 0;; ++I) {
-      if (Load.C.cmd == MachO::LC_MAIN) {
-        EntryFileOffset =
-            ((const MachO::entry_point_command *)Load.Ptr)->entryoff;
-        break;
-      }
-
-      if (I == LoadCommandCount - 1)
-        break;
-      else
-        Load = MOOF.getNextLoadCommandInfo(Load);
-    }
-  }
-
-  // If we didn't find anything, default to the common implementation.
-  // FIXME: Maybe we could also look at LC_UNIXTHREAD and friends?
-  if (EntryFileOffset)
-    return MCObjectDisassembler::getEntrypoint();
-
-  return EntryFileOffset + HeaderLoadAddress;
-}
-
-ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticInitFunctions() {
-  // FIXME: We only handle 64bit mach-o
-  assert(MOOF.is64Bit());
-
-  size_t EntrySize = 8;
-  size_t EntryCount = ModInitContents.size() / EntrySize;
-  return ArrayRef<uint64_t>(
-      reinterpret_cast<const uint64_t *>(ModInitContents.data()), EntryCount);
-}
-
-ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticExitFunctions() {
-  // FIXME: We only handle 64bit mach-o
-  assert(MOOF.is64Bit());
-
-  size_t EntrySize = 8;
-  size_t EntryCount = ModExitContents.size() / EntrySize;
-  return ArrayRef<uint64_t>(
-      reinterpret_cast<const uint64_t *>(ModExitContents.data()), EntryCount);
-}

diff --git a/lib/MC/MCAnalysis/MCObjectSymbolizer.cpp b/lib/MC/MCAnalysis/MCObjectSymbolizer.cpp
deleted file mode 100644
index b149596..0000000
--- a/lib/MC/MCAnalysis/MCObjectSymbolizer.cpp
+++ /dev/null

@@ -1,268 +0,0 @@
-//===-- lib/MC/MCObjectSymbolizer.cpp -------------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCObjectSymbolizer.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCRelocationInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Object/ELFObjectFile.h"
-#include "llvm/Object/MachO.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-
-using namespace llvm;
-using namespace object;
-
-//===- MCMachObjectSymbolizer ---------------------------------------------===//
-
-namespace {
-class MCMachObjectSymbolizer : public MCObjectSymbolizer {
-  const MachOObjectFile *MOOF;
-  // __TEXT;__stubs support.
-  uint64_t StubsStart;
-  uint64_t StubsCount;
-  uint64_t StubSize;
-  uint64_t StubsIndSymIndex;
-
-public:
-  MCMachObjectSymbolizer(MCContext &Ctx,
-                         std::unique_ptr<MCRelocationInfo> RelInfo,
-                         const MachOObjectFile *MOOF);
-
-  StringRef findExternalFunctionAt(uint64_t Addr) override;
-
-  void tryAddingPcLoadReferenceComment(raw_ostream &cStream, int64_t Value,
-                                       uint64_t Address) override;
-};
-} // End unnamed namespace
-
-MCMachObjectSymbolizer::MCMachObjectSymbolizer(
-    MCContext &Ctx, std::unique_ptr<MCRelocationInfo> RelInfo,
-    const MachOObjectFile *MOOF)
-  : MCObjectSymbolizer(Ctx, std::move(RelInfo), MOOF), MOOF(MOOF),
-    StubsStart(0), StubsCount(0), StubSize(0), StubsIndSymIndex(0) {
-
-  for (const SectionRef &Section : MOOF->sections()) {
-    StringRef Name;
-    Section.getName(Name);
-    if (Name == "__stubs") {
-      SectionRef StubsSec = Section;
-      if (MOOF->is64Bit()) {
-        MachO::section_64 S = MOOF->getSection64(StubsSec.getRawDataRefImpl());
-        StubsIndSymIndex = S.reserved1;
-        StubSize = S.reserved2;
-      } else {
-        MachO::section S = MOOF->getSection(StubsSec.getRawDataRefImpl());
-        StubsIndSymIndex = S.reserved1;
-        StubSize = S.reserved2;
-      }
-      assert(StubSize && "Mach-O stub entry size can't be zero!");
-      StubsSec.getAddress(StubsStart);
-      StubsSec.getSize(StubsCount);
-      StubsCount /= StubSize;
-    }
-  }
-}
-
-StringRef MCMachObjectSymbolizer::findExternalFunctionAt(uint64_t Addr) {
-  // FIXME: also, this can all be done at the very beginning, by iterating over
-  // all stubs and creating the calls to outside functions. Is it worth it
-  // though?
-  if (!StubSize)
-    return StringRef();
-  uint64_t StubIdx = (Addr - StubsStart) / StubSize;
-  if (StubIdx >= StubsCount)
-    return StringRef();
-
-  uint32_t SymtabIdx =
-    MOOF->getIndirectSymbolTableEntry(MOOF->getDysymtabLoadCommand(), StubIdx);
-
-  StringRef SymName;
-  symbol_iterator SI = MOOF->symbol_begin();
-  for (uint32_t i = 0; i != SymtabIdx; ++i)
-    ++SI;
-  SI->getName(SymName);
-  assert(SI != MOOF->symbol_end() && "Stub wasn't found in the symbol table!");
-  assert(SymName.front() == '_' && "Mach-O symbol doesn't start with '_'!");
-  return SymName.substr(1);
-}
-
-void MCMachObjectSymbolizer::
-tryAddingPcLoadReferenceComment(raw_ostream &cStream, int64_t Value,
-                                uint64_t Address) {
-  if (const RelocationRef *R = findRelocationAt(Address)) {
-    const MCExpr *RelExpr = RelInfo->createExprForRelocation(*R);
-    if (!RelExpr || RelExpr->EvaluateAsAbsolute(Value) == false)
-      return;
-  }
-  uint64_t Addr = Value;
-  if (const SectionRef *S = findSectionContaining(Addr)) {
-    StringRef Name; S->getName(Name);
-    uint64_t SAddr; S->getAddress(SAddr);
-    if (Name == "__cstring") {
-      StringRef Contents;
-      S->getContents(Contents);
-      Contents = Contents.substr(Addr - SAddr);
-      cStream << " ## literal pool for: "
-              << Contents.substr(0, Contents.find_first_of(0));
-    }
-  }
-}
-
-//===- MCObjectSymbolizer -------------------------------------------------===//
-
-MCObjectSymbolizer::MCObjectSymbolizer(
-  MCContext &Ctx, std::unique_ptr<MCRelocationInfo> RelInfo,
-  const ObjectFile *Obj)
-  : MCSymbolizer(Ctx, std::move(RelInfo)), Obj(Obj), SortedSections(),
-    AddrToReloc() {}
-
-bool MCObjectSymbolizer::
-tryAddingSymbolicOperand(MCInst &MI, raw_ostream &cStream,
-                         int64_t Value, uint64_t Address, bool IsBranch,
-                         uint64_t Offset, uint64_t InstSize) {
-  if (IsBranch) {
-    StringRef ExtFnName = findExternalFunctionAt((uint64_t)Value);
-    if (!ExtFnName.empty()) {
-      MCSymbol *Sym = Ctx.GetOrCreateSymbol(ExtFnName);
-      const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
-      MI.addOperand(MCOperand::CreateExpr(Expr));
-      return true;
-    }
-  }
-
-  if (const RelocationRef *R = findRelocationAt(Address + Offset)) {
-    if (const MCExpr *RelExpr = RelInfo->createExprForRelocation(*R)) {
-      MI.addOperand(MCOperand::CreateExpr(RelExpr));
-      return true;
-    }
-    // Only try to create a symbol+offset expression if there is no relocation.
-    return false;
-  }
-
-  // Interpret Value as a branch target.
-  if (IsBranch == false)
-    return false;
-  uint64_t UValue = Value;
-  // FIXME: map instead of looping each time?
-  for (const SymbolRef &Symbol : Obj->symbols()) {
-    uint64_t SymAddr;
-    Symbol.getAddress(SymAddr);
-    uint64_t SymSize;
-    Symbol.getSize(SymSize);
-    StringRef SymName;
-    Symbol.getName(SymName);
-    SymbolRef::Type SymType;
-    Symbol.getType(SymType);
-    if (SymAddr == UnknownAddressOrSize || SymSize == UnknownAddressOrSize ||
-        SymName.empty() || SymType != SymbolRef::ST_Function)
-      continue;
-
-    if ( SymAddr == UValue ||
-        (SymAddr <= UValue && SymAddr + SymSize > UValue)) {
-      MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
-      const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
-      if (SymAddr != UValue) {
-        const MCExpr *Off = MCConstantExpr::Create(UValue - SymAddr, Ctx);
-        Expr = MCBinaryExpr::CreateAdd(Expr, Off, Ctx);
-      }
-      MI.addOperand(MCOperand::CreateExpr(Expr));
-      return true;
-    }
-  }
-  return false;
-}
-
-void MCObjectSymbolizer::
-tryAddingPcLoadReferenceComment(raw_ostream &cStream,
-                                int64_t Value, uint64_t Address) {
-}
-
-StringRef MCObjectSymbolizer::findExternalFunctionAt(uint64_t Addr) {
-  return StringRef();
-}
-
-MCObjectSymbolizer *MCObjectSymbolizer::createObjectSymbolizer(
-    MCContext &Ctx, std::unique_ptr<MCRelocationInfo> RelInfo,
-    const ObjectFile *Obj) {
-  if (const MachOObjectFile *MOOF = dyn_cast<MachOObjectFile>(Obj))
-    return new MCMachObjectSymbolizer(Ctx, std::move(RelInfo), MOOF);
-  return new MCObjectSymbolizer(Ctx, std::move(RelInfo), Obj);
-}
-
-// SortedSections implementation.
-
-static bool SectionStartsBefore(const SectionRef &S, uint64_t Addr) {
-  uint64_t SAddr; S.getAddress(SAddr);
-  return SAddr < Addr;
-}
-
-const SectionRef *MCObjectSymbolizer::findSectionContaining(uint64_t Addr) {
-  if (SortedSections.empty())
-    buildSectionList();
-
-  SortedSectionList::iterator
-    EndIt = SortedSections.end(),
-    It = std::lower_bound(SortedSections.begin(), EndIt,
-                          Addr, SectionStartsBefore);
-  if (It == EndIt)
-    return nullptr;
-  uint64_t SAddr; It->getAddress(SAddr);
-  uint64_t SSize; It->getSize(SSize);
-  if (Addr >= SAddr + SSize)
-    return nullptr;
-  return &*It;
-}
-
-const RelocationRef *MCObjectSymbolizer::findRelocationAt(uint64_t Addr) {
-  if (AddrToReloc.empty())
-    buildRelocationByAddrMap();
-
-  AddrToRelocMap::const_iterator RI = AddrToReloc.find(Addr);
-  if (RI == AddrToReloc.end())
-    return nullptr;
-  return &RI->second;
-}
-
-void MCObjectSymbolizer::buildSectionList() {
-  for (const SectionRef &Section : Obj->sections()) {
-    bool RequiredForExec;
-    Section.isRequiredForExecution(RequiredForExec);
-    if (RequiredForExec == false)
-      continue;
-    uint64_t SAddr;
-    Section.getAddress(SAddr);
-    uint64_t SSize;
-    Section.getSize(SSize);
-    SortedSectionList::iterator It =
-        std::lower_bound(SortedSections.begin(), SortedSections.end(), SAddr,
-                         SectionStartsBefore);
-    if (It != SortedSections.end()) {
-      uint64_t FoundSAddr; It->getAddress(FoundSAddr);
-      if (FoundSAddr < SAddr + SSize)
-        llvm_unreachable("Inserting overlapping sections");
-    }
-    SortedSections.insert(It, Section);
-  }
-}
-
-void MCObjectSymbolizer::buildRelocationByAddrMap() {
-  for (const SectionRef &Section : Obj->sections()) {
-    for (const RelocationRef &Reloc : Section.relocations()) {
-      uint64_t Address;
-      Reloc.getAddress(Address);
-      // At a specific address, only keep the first relocation.
-      if (AddrToReloc.find(Address) == AddrToReloc.end())
-        AddrToReloc[Address] = Reloc;
-    }
-  }
-}

diff --git a/lib/MC/MCAnalysis/Makefile b/lib/MC/MCAnalysis/Makefile
deleted file mode 100644
index add2dbd..0000000
--- a/lib/MC/MCAnalysis/Makefile
+++ /dev/null

@@ -1,14 +0,0 @@
-##===- lib/MC/MCAnalysys/Makefile --------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMMCAnalysis
-BUILD_ARCHIVE := 1
-
-include $(LEVEL)/Makefile.common

diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index f8081ef..2fb558f 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp

@@ -32,7 +32,6 @@
   HasMachoZeroFillDirective = false;
   HasMachoTBSSDirective = false;
   HasStaticCtorDtorReferenceInStaticMode = false;
-  LinkerRequiresNonEmptyDwarfLines = false;
   MaxInstLength = 4;
   MinInstAlignment = 1;
   DollarIsPC = false;
@@ -64,7 +63,7 @@
   GPRel64Directive = nullptr;
   GPRel32Directive = nullptr;
   GlobalDirective = "\t.globl\t";
-  HasSetDirective = true;
+  SetDirectiveSuppressesReloc = false;
   HasAggressiveSymbolFolding = true;
   COMMDirectiveAlignmentIsInBytes = true;
   LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
@@ -79,10 +78,9 @@
   HiddenVisibilityAttr = MCSA_Hidden;
   HiddenDeclarationVisibilityAttr = MCSA_Hidden;
   ProtectedVisibilityAttr = MCSA_Protected;
-  HasLEB128 = false;
   SupportsDebugInformation = false;
   ExceptionsType = ExceptionHandling::None;
-  WinEHEncodingType = WinEH::EncodingType::ET_Invalid;
+  WinEHEncodingType = WinEH::EncodingType::Invalid;
   DwarfUsesRelocationsAcrossSections = true;
   DwarfFDESymbolsUseAbsDiff = false;
   DwarfRegNumForCFI = false;

diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 9945637..bb3f0d3 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp

@@ -32,7 +32,6 @@
   ProtectedVisibilityAttr = MCSA_Invalid;
 
   // Set up DWARF directives
-  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
   SupportsDebugInformation = true;
   NeedsDwarfSectionOffsetDirective = true;
 

diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index eaf28dd..66a138b 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp

@@ -42,9 +42,6 @@
   HasMachoTBSSDirective = true; // Uses .tbss
   HasStaticCtorDtorReferenceInStaticMode = true;
 
-  // FIXME: Darwin 10 and newer don't need this.
-  LinkerRequiresNonEmptyDwarfLines = true;
-
   // FIXME: Change this once MC is the system assembler.
   HasAggressiveSymbolFolding = false;
 
@@ -60,4 +57,5 @@
   DwarfUsesRelocationsAcrossSections = false;
 
   UseIntegratedAssembler = true;
+  SetDirectiveSuppressesReloc = true;
 }

diff --git a/lib/MC/MCAsmInfoELF.cpp b/lib/MC/MCAsmInfoELF.cpp
index ccb3dc3..9f70d8d 100644
--- a/lib/MC/MCAsmInfoELF.cpp
+++ b/lib/MC/MCAsmInfoELF.cpp

@@ -13,10 +13,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
 using namespace llvm;
 
 void MCAsmInfoELF::anchor() { }
 
+const MCSection *
+MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const {
+  return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
+                           0, SectionKind::getMetadata());
+}
+
 MCAsmInfoELF::MCAsmInfoELF() {
   HasIdentDirective = true;
   WeakRefDirective = "\t.weak\t";

diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 6973bbb..f60c7fc 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp

@@ -700,7 +700,6 @@
     EmitULEB128IntValue(IntValue);
     return;
   }
-  assert(MAI->hasLEB128() && "Cannot print a .uleb");
   OS << ".uleb128 " << *Value;
   EmitEOL();
 }
@@ -711,7 +710,6 @@
     EmitSLEB128IntValue(IntValue);
     return;
   }
-  assert(MAI->hasLEB128() && "Cannot print a .sleb");
   OS << ".sleb128 " << *Value;
   EmitEOL();
 }
@@ -1089,19 +1087,6 @@
   EmitEOL();
 }
 
-static const MCSection *getWin64EHTableSection(StringRef suffix,
-                                               MCContext &context) {
-  // FIXME: This doesn't belong in MCObjectFileInfo. However,
-  /// this duplicate code in MCWin64EH.cpp.
-  if (suffix == "")
-    return context.getObjectFileInfo()->getXDataSection();
-  return context.getCOFFSection((".xdata"+suffix).str(),
-                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
-                                COFF::IMAGE_SCN_MEM_READ |
-                                COFF::IMAGE_SCN_MEM_WRITE,
-                                SectionKind::getDataRel());
-}
-
 void MCAsmStreamer::EmitWinEHHandlerData() {
   MCStreamer::EmitWinEHHandlerData();
 
@@ -1109,11 +1094,10 @@
   // cause the section switch to be visible in the emitted assembly.
   // We only do this so the section switch that terminates the handler
   // data block is visible.
-  MCWin64EHUnwindInfo *CurFrame = getCurrentW64UnwindInfo();
-  StringRef suffix=MCWin64EHUnwindEmitter::GetSectionSuffix(CurFrame->Function);
-  const MCSection *xdataSect = getWin64EHTableSection(suffix, getContext());
-  if (xdataSect)
-    SwitchSectionNoChange(xdataSect);
+  WinEH::FrameInfo *CurFrame = getCurrentWinFrameInfo();
+  if (const MCSection *XData = WinEH::UnwindEmitter::getXDataSection(
+          CurFrame->Function, getContext()))
+    SwitchSectionNoChange(XData);
 
   OS << "\t.seh_handlerdata";
   EmitEOL();

diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index a8aad71..85d0c13 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp

@@ -141,7 +141,7 @@
 
   // If SD is a variable, evaluate it.
   MCValue Target;
-  if (!S.getVariableValue()->EvaluateAsValue(Target, &Layout))
+  if (!S.getVariableValue()->EvaluateAsValue(Target, &Layout, nullptr))
     report_fatal_error("unable to evaluate offset for variable '" +
                        S.getName() + "'");
 
@@ -187,7 +187,7 @@
 
   const MCExpr *Expr = Symbol.getVariableValue();
   MCValue Value;
-  if (!Expr->EvaluateAsValue(Value, this))
+  if (!Expr->EvaluateAsValue(Value, this, nullptr))
     llvm_unreachable("Invalid Expression");
 
   const MCSymbolRefExpr *RefB = Value.getSymB();
@@ -291,7 +291,9 @@
   : Section(&_Section),
     Ordinal(~UINT32_C(0)),
     Alignment(1),
-    BundleLockState(NotBundleLocked), BundleGroupBeforeFirstInst(false),
+    BundleLockState(NotBundleLocked),
+    BundleLockNestingDepth(0),
+    BundleGroupBeforeFirstInst(false),
     HasInstructions(false)
 {
   if (A)
@@ -328,17 +330,33 @@
   return IP;
 }
 
+void MCSectionData::setBundleLockState(BundleLockStateType NewState) {
+  if (NewState == NotBundleLocked) {
+    if (BundleLockNestingDepth == 0) {
+      report_fatal_error("Mismatched bundle_lock/unlock directives");
+    }
+    if (--BundleLockNestingDepth == 0) {
+      BundleLockState = NotBundleLocked;
+    }
+    return;
+  }
+
+  // If any of the directives is an align_to_end directive, the whole nested
+  // group is align_to_end. So don't downgrade from align_to_end to just locked.
+  if (BundleLockState != BundleLockedAlignToEnd) {
+    BundleLockState = NewState;
+  }
+  ++BundleLockNestingDepth;
+}
+
 /* *** */
 
 MCSymbolData::MCSymbolData() : Symbol(nullptr) {}
 
 MCSymbolData::MCSymbolData(const MCSymbol &_Symbol, MCFragment *_Fragment,
                            uint64_t _Offset, MCAssembler *A)
-  : Symbol(&_Symbol), Fragment(_Fragment), Offset(_Offset),
-    IsExternal(false), IsPrivateExtern(false),
-    CommonSize(0), SymbolSize(nullptr), CommonAlign(0),
-    Flags(0), Index(0)
-{
+    : Symbol(&_Symbol), Fragment(_Fragment), Offset(_Offset),
+      SymbolSize(nullptr), CommonAlign(-1U), Flags(0), Index(0) {
   if (A)
     A->getSymbolList().push_back(this);
 }
@@ -348,9 +366,9 @@
 MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_,
                          MCCodeEmitter &Emitter_, MCObjectWriter &Writer_,
                          raw_ostream &OS_)
-  : Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(Writer_),
-    OS(OS_), BundleAlignSize(0), RelaxAll(false), NoExecStack(false),
-    SubsectionsViaSymbols(false), ELFHeaderEFlags(0) {
+    : Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(Writer_),
+      OS(OS_), BundleAlignSize(0), RelaxAll(false),
+      SubsectionsViaSymbols(false), ELFHeaderEFlags(0) {
   VersionMinInfo.Major = 0; // Major version == 0 for "none specified"
 }
 
@@ -364,11 +382,15 @@
   SymbolMap.clear();
   IndirectSymbols.clear();
   DataRegions.clear();
+  LinkerOptions.clear();
+  FileNames.clear();
   ThumbFuncs.clear();
+  BundleAlignSize = 0;
   RelaxAll = false;
-  NoExecStack = false;
   SubsectionsViaSymbols = false;
   ELFHeaderEFlags = 0;
+  LOHContainer.reset();
+  VersionMinInfo.Major = 0;
 
   // reset objects owned by us
   getBackend().reset();
@@ -438,11 +460,12 @@
 // a relocatable expr.
 // FIXME: Should this be the behavior of EvaluateAsRelocatable itself?
 static bool evaluate(const MCExpr &Expr, const MCAsmLayout &Layout,
-                     MCValue &Target) {
-  if (Expr.EvaluateAsValue(Target, &Layout))
+                     const MCFixup &Fixup, MCValue &Target) {
+  if (Expr.EvaluateAsValue(Target, &Layout, &Fixup)) {
     if (Target.isAbsolute())
       return true;
-  return Expr.EvaluateAsRelocatable(Target, &Layout);
+  }
+  return Expr.EvaluateAsRelocatable(Target, &Layout, &Fixup);
 }
 
 bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
@@ -454,7 +477,7 @@
   // probably merge the two into a single callback that tries to evaluate a
   // fixup and records a relocation if one is needed.
   const MCExpr *Expr = Fixup.getValue();
-  if (!evaluate(*Expr, Layout, Target))
+  if (!evaluate(*Expr, Layout, Fixup, Target))
     getContext().FatalError(Fixup.getLoc(), "expected relocatable expression");
 
   bool IsPCRel = Backend.getFixupKindInfo(
@@ -993,11 +1016,8 @@
 }
 
 bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) {
-  int64_t Value = 0;
   uint64_t OldSize = LF.getContents().size();
-  bool IsAbs = LF.getValue().EvaluateAsAbsolute(Value, Layout);
-  (void)IsAbs;
-  assert(IsAbs);
+  int64_t Value = LF.getValue().evaluateKnownAbsolute(Layout);
   SmallString<8> &Data = LF.getContents();
   Data.clear();
   raw_svector_ostream OSE(Data);
@@ -1012,11 +1032,8 @@
 bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout,
                                      MCDwarfLineAddrFragment &DF) {
   MCContext &Context = Layout.getAssembler().getContext();
-  int64_t AddrDelta = 0;
   uint64_t OldSize = DF.getContents().size();
-  bool IsAbs = DF.getAddrDelta().EvaluateAsAbsolute(AddrDelta, Layout);
-  (void)IsAbs;
-  assert(IsAbs);
+  int64_t AddrDelta = DF.getAddrDelta().evaluateKnownAbsolute(Layout);
   int64_t LineDelta;
   LineDelta = DF.getLineDelta();
   SmallString<8> &Data = DF.getContents();
@@ -1030,11 +1047,8 @@
 bool MCAssembler::relaxDwarfCallFrameFragment(MCAsmLayout &Layout,
                                               MCDwarfCallFrameFragment &DF) {
   MCContext &Context = Layout.getAssembler().getContext();
-  int64_t AddrDelta = 0;
   uint64_t OldSize = DF.getContents().size();
-  bool IsAbs = DF.getAddrDelta().EvaluateAsAbsolute(AddrDelta, Layout);
-  (void)IsAbs;
-  assert(IsAbs);
+  int64_t AddrDelta = DF.getAddrDelta().evaluateKnownAbsolute(Layout);
   SmallString<8> &Data = DF.getContents();
   Data.clear();
   raw_svector_ostream OSE(Data);
@@ -1247,8 +1261,10 @@
   raw_ostream &OS = llvm::errs();
 
   OS << "<MCSymbolData Symbol:" << getSymbol()
-     << " Fragment:" << getFragment() << " Offset:" << getOffset()
-     << " Flags:" << getFlags() << " Index:" << getIndex();
+     << " Fragment:" << getFragment();
+  if (!isCommon())
+    OS << " Offset:" << getOffset();
+  OS << " Flags:" << getFlags() << " Index:" << getIndex();
   if (isCommon())
     OS << " (common, size:" << getCommonSize()
        << " align: " << getCommonAlignment() << ")";

diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 960a071..8630b25 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp

@@ -73,7 +73,10 @@
   Symbols.clear();
   Allocator.Reset();
   Instances.clear();
+  CompilationDir.clear();
+  MainFileName.clear();
   MCDwarfLineTablesCUMap.clear();
+  SectionStartEndSyms.clear();
   MCGenDwarfLabelEntries.clear();
   DwarfDebugFlags = StringRef();
   DwarfCompileUnitID = 0;
@@ -97,16 +100,33 @@
 MCSymbol *MCContext::GetOrCreateSymbol(StringRef Name) {
   assert(!Name.empty() && "Normal symbols cannot be unnamed!");
 
-  // Do the lookup and get the entire StringMapEntry.  We want access to the
-  // key if we are creating the entry.
-  StringMapEntry<MCSymbol*> &Entry = Symbols.GetOrCreateValue(Name);
-  MCSymbol *Sym = Entry.getValue();
+  MCSymbol *&Sym = Symbols[Name];
 
+  if (!Sym)
+    Sym = CreateSymbol(Name);
+
+  return Sym;
+}
+
+MCSymbol *MCContext::getOrCreateSectionSymbol(const MCSectionELF &Section) {
+  MCSymbol *&Sym = SectionSymbols[&Section];
   if (Sym)
     return Sym;
 
-  Sym = CreateSymbol(Name);
-  Entry.setValue(Sym);
+  StringRef Name = Section.getSectionName();
+
+  MCSymbol *&OldSym = Symbols[Name];
+  if (OldSym && OldSym->isUndefined()) {
+    Sym = OldSym;
+    return OldSym;
+  }
+
+  auto NameIter = UsedNames.insert(std::make_pair(Name, true)).first;
+  Sym = new (*this) MCSymbol(NameIter->getKey(), /*isTemporary*/ false);
+
+  if (!OldSym)
+    OldSym = Sym;
+
   return Sym;
 }
 
@@ -116,21 +136,21 @@
   if (AllowTemporaryLabels)
     isTemporary = Name.startswith(MAI->getPrivateGlobalPrefix());
 
-  StringMapEntry<bool> *NameEntry = &UsedNames.GetOrCreateValue(Name);
-  if (NameEntry->getValue()) {
+  auto NameEntry = UsedNames.insert(std::make_pair(Name, true));
+  if (!NameEntry.second) {
     assert(isTemporary && "Cannot rename non-temporary symbols");
     SmallString<128> NewName = Name;
     do {
       NewName.resize(Name.size());
       raw_svector_ostream(NewName) << NextUniqueID++;
-      NameEntry = &UsedNames.GetOrCreateValue(NewName);
-    } while (NameEntry->getValue());
+      NameEntry = UsedNames.insert(std::make_pair(NewName, true));
+    } while (!NameEntry.second);
   }
-  NameEntry->setValue(true);
 
   // Ok, the entry doesn't already exist.  Have the MCSymbol object itself refer
   // to the copy of the string that is embedded in the UsedNames entry.
-  MCSymbol *Result = new (*this) MCSymbol(NameEntry->getKey(), isTemporary);
+  MCSymbol *Result =
+      new (*this) MCSymbol(NameEntry.first->getKey(), isTemporary);
 
   return Result;
 }
@@ -291,7 +311,7 @@
   if (!IterBool.second)
     return Iter->second;
 
-  const MCSymbol *COMDATSymbol = nullptr;
+  MCSymbol *COMDATSymbol = nullptr;
   if (!COMDATSymName.empty())
     COMDATSymbol = GetOrCreateSymbol(COMDATSymName);
 
@@ -317,6 +337,22 @@
   return Iter->second;
 }
 
+const MCSectionCOFF *
+MCContext::getAssociativeCOFFSection(const MCSectionCOFF *Sec,
+                                     const MCSymbol *KeySym) {
+  // Return the normal section if we don't have to be associative.
+  if (!KeySym)
+    return Sec;
+
+  // Make an associative section with the same name and kind as the normal
+  // section.
+  unsigned Characteristics =
+      Sec->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT;
+  return getCOFFSection(Sec->getSectionName(), Characteristics, Sec->getKind(),
+                        KeySym->getName(),
+                        COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE);
+}
+
 //===----------------------------------------------------------------------===//
 // Dwarf Management
 //===----------------------------------------------------------------------===//

diff --git a/lib/MC/MCDisassembler.cpp b/lib/MC/MCDisassembler.cpp
deleted file mode 100644
index 77d9ce1..0000000
--- a/lib/MC/MCDisassembler.cpp
+++ /dev/null

@@ -1,39 +0,0 @@
-//===-- lib/MC/MCDisassembler.cpp - Disassembler interface ------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCExternalSymbolizer.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-MCDisassembler::~MCDisassembler() {
-}
-
-bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
-                                              uint64_t Address, bool IsBranch,
-                                              uint64_t Offset,
-                                              uint64_t InstSize) const {
-  raw_ostream &cStream = CommentStream ? *CommentStream : nulls();
-  if (Symbolizer)
-    return Symbolizer->tryAddingSymbolicOperand(Inst, cStream, Value, Address,
-                                                IsBranch, Offset, InstSize);
-  return false;
-}
-
-void MCDisassembler::tryAddingPcLoadReferenceComment(int64_t Value,
-                                                     uint64_t Address) const {
-  raw_ostream &cStream = CommentStream ? *CommentStream : nulls();
-  if (Symbolizer)
-    Symbolizer->tryAddingPcLoadReferenceComment(cStream, Value, Address);
-}
-
-void MCDisassembler::setSymbolizer(std::unique_ptr<MCSymbolizer> Symzer) {
-  Symbolizer = std::move(Symzer);
-}

diff --git a/lib/MC/MCDisassembler/Android.mk b/lib/MC/MCDisassembler/Android.mk
index 7f73df3..87455e2 100644
--- a/lib/MC/MCDisassembler/Android.mk
+++ b/lib/MC/MCDisassembler/Android.mk

@@ -1,15 +1,37 @@
 LOCAL_PATH:= $(call my-dir)
 
+mc_disassembler_SRC_FILES := \
+  Disassembler.cpp \
+  MCDisassembler.cpp \
+  MCExternalSymbolizer.cpp \
+  MCRelocationInfo.cpp
+
+
 # For the host
 # =====================================================
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES :=	\
-	Disassembler.cpp
+LOCAL_SRC_FILES := $(mc_disassembler_SRC_FILES)
 
 LOCAL_MODULE:= libLLVMMCDisassembler
 
 LOCAL_MODULE_TAGS := optional
 
+
 include $(LLVM_HOST_BUILD_MK)
 include $(BUILD_HOST_STATIC_LIBRARY)
+
+# For the device
+# =====================================================
+include $(CLEAR_VARS)
+ifneq (true,$(DISABLE_LLVM_DEVICE_BUILDS))
+
+LOCAL_SRC_FILES := $(mc_disassembler_SRC_FILES)
+
+LOCAL_MODULE:= libLLVMMCDisassembler
+
+LOCAL_MODULE_TAGS := optional
+
+include $(LLVM_DEVICE_BUILD_MK)
+include $(BUILD_STATIC_LIBRARY)
+endif

diff --git a/lib/MC/MCDisassembler/CMakeLists.txt b/lib/MC/MCDisassembler/CMakeLists.txt
index 5195b9e..f266f8f 100644
--- a/lib/MC/MCDisassembler/CMakeLists.txt
+++ b/lib/MC/MCDisassembler/CMakeLists.txt

@@ -1,3 +1,6 @@
 add_llvm_library(LLVMMCDisassembler
   Disassembler.cpp
+  MCRelocationInfo.cpp
+  MCExternalSymbolizer.cpp
+  MCDisassembler.cpp
   )

diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index 0530c26..d0d7f30 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp

@@ -21,7 +21,6 @@
 #include "llvm/MC/MCSymbolizer.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
@@ -33,10 +32,11 @@
 // functions can all be passed as NULL.  If successful, this returns a
 // disassembler context.  If not, it returns NULL.
 //
-LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
-                                         void *DisInfo, int TagType,
-                                         LLVMOpInfoCallback GetOpInfo,
-                                         LLVMSymbolLookupCallback SymbolLookUp){
+LLVMDisasmContextRef
+LLVMCreateDisasmCPUFeatures(const char *Triple, const char *CPU,
+                            const char *Features, void *DisInfo, int TagType,
+                            LLVMOpInfoCallback GetOpInfo,
+                            LLVMSymbolLookupCallback SymbolLookUp) {
   // Get the target.
   std::string Error;
   const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
@@ -56,11 +56,8 @@
   if (!MII)
     return nullptr;
 
-  // Package up features to be passed to target/subtarget
-  std::string FeaturesStr;
-
   const MCSubtargetInfo *STI = TheTarget->createMCSubtargetInfo(Triple, CPU,
-                                                                FeaturesStr);
+                                                                Features);
   if (!STI)
     return nullptr;
 
@@ -101,11 +98,19 @@
   return DC;
 }
 
+LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
+                                         void *DisInfo, int TagType,
+                                         LLVMOpInfoCallback GetOpInfo,
+                                         LLVMSymbolLookupCallback SymbolLookUp){
+  return LLVMCreateDisasmCPUFeatures(Triple, CPU, "", DisInfo, TagType,
+                                     GetOpInfo, SymbolLookUp);
+}
+
 LLVMDisasmContextRef LLVMCreateDisasm(const char *Triple, void *DisInfo,
                                       int TagType, LLVMOpInfoCallback GetOpInfo,
                                       LLVMSymbolLookupCallback SymbolLookUp) {
-  return LLVMCreateDisasmCPU(Triple, "", DisInfo, TagType, GetOpInfo,
-                             SymbolLookUp);
+  return LLVMCreateDisasmCPUFeatures(Triple, "", "", DisInfo, TagType,
+                                     GetOpInfo, SymbolLookUp);
 }
 
 //
@@ -116,30 +121,6 @@
   delete DC;
 }
 
-namespace {
-//
-// The memory object created by LLVMDisasmInstruction().
-//
-class DisasmMemoryObject : public MemoryObject {
-  uint8_t *Bytes;
-  uint64_t Size;
-  uint64_t BasePC;
-public:
-  DisasmMemoryObject(uint8_t *bytes, uint64_t size, uint64_t basePC) :
-                     Bytes(bytes), Size(size), BasePC(basePC) {}
-
-  uint64_t getBase() const override { return BasePC; }
-  uint64_t getExtent() const override { return Size; }
-
-  int readByte(uint64_t Addr, uint8_t *Byte) const override {
-    if (Addr - BasePC >= Size)
-      return -1;
-    *Byte = Bytes[Addr - BasePC];
-    return 0;
-  }
-};
-} // end anonymous namespace
-
 /// \brief Emits the comments that are stored in \p DC comment stream.
 /// Each comment in the comment stream must end with a newline.
 static void emitComments(LLVMDisasmContext *DC,
@@ -202,19 +183,19 @@
 static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
   // Try to compute scheduling information.
   const MCSubtargetInfo *STI = DC->getSubtargetInfo();
-  const MCSchedModel *SCModel = STI->getSchedModel();
+  const MCSchedModel SCModel = STI->getSchedModel();
   const int NoInformationAvailable = -1;
 
   // Check if we have a scheduling model for instructions.
-  if (!SCModel || !SCModel->hasInstrSchedModel())
-    // Try to fall back to the itinerary model if we do not have a
-    // scheduling model.
+  if (!SCModel.hasInstrSchedModel())
+    // Try to fall back to the itinerary model if the scheduling model doesn't
+    // have a scheduling table.  Note the default does not have a table.
     return getItineraryLatency(DC, Inst);
 
   // Get the scheduling class of the requested instruction.
   const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
   unsigned SCClass = Desc.getSchedClass();
-  const MCSchedClassDesc *SCDesc = SCModel->getSchedClassDesc(SCClass);
+  const MCSchedClassDesc *SCDesc = SCModel.getSchedClassDesc(SCClass);
   // Resolving the variant SchedClass requires an MI to pass to
   // SubTargetInfo::resolveSchedClass.
   if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant())
@@ -263,7 +244,7 @@
                              size_t OutStringSize){
   LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
   // Wrap the pointer to the Bytes, BytesSize and PC in a MemoryObject.
-  DisasmMemoryObject MemoryObject(Bytes, BytesSize, PC);
+  ArrayRef<uint8_t> Data(Bytes, BytesSize);
 
   uint64_t Size;
   MCInst Inst;
@@ -272,7 +253,7 @@
   MCDisassembler::DecodeStatus S;
   SmallVector<char, 64> InsnStr;
   raw_svector_ostream Annotations(InsnStr);
-  S = DisAsm->getInstruction(Inst, Size, MemoryObject, PC,
+  S = DisAsm->getInstruction(Inst, Size, Data, PC,
                              /*REMOVE*/ nulls(), Annotations);
   switch (S) {
   case MCDisassembler::Fail:

diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h
index d1d40cd..46d0c4c 100644
--- a/lib/MC/MCDisassembler/Disassembler.h
+++ b/lib/MC/MCDisassembler/Disassembler.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_MC_DISASSEMBLER_H
-#define LLVM_MC_DISASSEMBLER_H
+#ifndef LLVM_LIB_MC_MCDISASSEMBLER_DISASSEMBLER_H
+#define LLVM_LIB_MC_MCDISASSEMBLER_DISASSEMBLER_H
 
 #include "llvm-c/Disassembler.h"
 #include "llvm/ADT/SmallString.h"

diff --git a/lib/MC/MCDisassembler/MCDisassembler.cpp b/lib/MC/MCDisassembler/MCDisassembler.cpp
new file mode 100644
index 0000000..1084e5e
--- /dev/null
+++ b/lib/MC/MCDisassembler/MCDisassembler.cpp

@@ -0,0 +1,39 @@
+//===-- MCDisassembler.cpp - Disassembler interface -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExternalSymbolizer.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCDisassembler::~MCDisassembler() { // Defined out of line; empty body.
+}
+
+bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
+                                              uint64_t Address, bool IsBranch,
+                                              uint64_t Offset,
+                                              uint64_t InstSize) const {
+  raw_ostream &cStream = CommentStream ? *CommentStream : nulls();
+  if (Symbolizer) // Delegate to the symbolizer when one is set.
+    return Symbolizer->tryAddingSymbolicOperand(Inst, cStream, Value, Address,
+                                                IsBranch, Offset, InstSize);
+  return false; // No symbolizer is set, so nothing was added to Inst.
+}
+
+void MCDisassembler::tryAddingPcLoadReferenceComment(int64_t Value,
+                                                     uint64_t Address) const {
+  raw_ostream &cStream = CommentStream ? *CommentStream : nulls();
+  if (Symbolizer) // Does nothing when no symbolizer is set.
+    Symbolizer->tryAddingPcLoadReferenceComment(cStream, Value, Address);
+}
+
+void MCDisassembler::setSymbolizer(std::unique_ptr<MCSymbolizer> Symzer) {
+  Symbolizer = std::move(Symzer); // Ownership moves from the caller.
+}

diff --git a/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp b/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp
new file mode 100644
index 0000000..0145623
--- /dev/null
+++ b/lib/MC/MCDisassembler/MCExternalSymbolizer.cpp

@@ -0,0 +1,198 @@
+//===-- MCExternalSymbolizer.cpp - External symbolizer --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCExternalSymbolizer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+
+using namespace llvm;
+
+// Try to add a symbolic operand to MI in place of the immediate Value, which
+// the caller has already adjusted for the PC. IsBranch says whether the
+// instruction is a branch.
+//
+// If a GetOpInfo() callback is set it is asked for symbolic information about
+// the operand at Address; a non-zero result is turned into an MCExpr. When it
+// is unset or returns zero, SymbolLookUp() (if set) is used to guess whether
+// Value is a symbol address -- always for branches, and for other operands
+// unless InstSize is 1. A found name becomes a symbol reference; a branch with
+// no name still gets an MCExpr holding Value so it prints as a hex address.
+//
+// Returns true if an operand was added to MI and false otherwise.
+bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI,
+                                                    raw_ostream &cStream,
+                                                    int64_t Value,
+                                                    uint64_t Address,
+                                                    bool IsBranch,
+                                                    uint64_t Offset,
+                                                    uint64_t InstSize) {
+  struct LLVMOpInfo1 SymbolicOp;
+  std::memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+  SymbolicOp.Value = Value;
+
+  if (!GetOpInfo ||
+      !GetOpInfo(DisInfo, Address, Offset, InstSize, 1, &SymbolicOp)) {
+    // Clear SymbolicOp.Value from above and also all other fields.
+    std::memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+
+    // At this point, GetOpInfo() did not find any relocation information about
+    // this operand and we are left to use the SymbolLookUp() call back to guess
+    // if the Value is the address of a symbol.  In the case this is a branch
+    // that always makes sense to guess.  But in the case of an immediate it is
+    // a bit more questionable if it is an address of a symbol or some other
+    // reference.  So if the immediate Value comes from a width of 1 byte,
+    // InstSize, we will not guess it is an address of a symbol.  Because in
+    // object files assembled starting at address 0 this usually leads to
+    // incorrect symbolication.
+    if (!SymbolLookUp || (InstSize == 1 && !IsBranch))
+      return false;
+
+    uint64_t ReferenceType;
+    if (IsBranch)
+       ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+    else
+       ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    const char *ReferenceName; // NOTE(review): only SymbolLookUp() sets this.
+    const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
+                                    &ReferenceName);
+    if (Name) {
+      SymbolicOp.AddSymbol.Name = Name;
+      SymbolicOp.AddSymbol.Present = true;
+      // If Name is a C++ symbol name put the human readable name in a comment.
+      if(ReferenceType == LLVMDisassembler_ReferenceType_DeMangled_Name)
+        cStream << ReferenceName;
+    }
+    // For branches always create an MCExpr so it gets printed as hex address.
+    else if (IsBranch) {
+      SymbolicOp.Value = Value;
+    }
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+      cStream << "symbol stub for: " << ReferenceName;
+    else if(ReferenceType == LLVMDisassembler_ReferenceType_Out_Objc_Message)
+      cStream << "Objc message: " << ReferenceName;
+    if (!Name && !IsBranch)
+      return false;
+  }
+
+  const MCExpr *Add = nullptr; // Term that is added, if any.
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+      Add = MCSymbolRefExpr::Create(Sym, Ctx);
+    } else { // NOTE(review): does (int) truncate Value? Same for Sub.
+      Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Sub = nullptr; // Term that is subtracted, if any.
+  if (SymbolicOp.SubtractSymbol.Present) {
+      if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::Create(Sym, Ctx);
+    } else {
+      Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Off = nullptr; // Constant offset; left null when Value is 0.
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx);
+
+  const MCExpr *Expr; // Combined as Add - Sub + Off from the terms present.
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx);
+    else
+      LHS = MCUnaryExpr::CreateMinus(Sub, Ctx);
+    if (Off)
+      Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off)
+      Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::Create(0, Ctx);
+  }
+
+  Expr = RelInfo->createExprForCAPIVariantKind(Expr, SymbolicOp.VariantKind);
+  if (!Expr) // RelInfo produced no expression for this variant kind.
+    return false;
+
+  MI.addOperand(MCOperand::CreateExpr(Expr));
+  return true;
+}
+
+// Try to add a comment describing what is referenced by a load instruction
+// whose base register is the PC. Such loads often read a literal pool near
+// the instruction, so the instruction's Address and immediate Value are
+// passed to the SymbolLookUp() callback as a possible literal pool entry.
+// Depending on the reference type the callback reports, the comment names
+// the symbol whose address is in the entry, quotes (escaped) the C string
+// literal the entry points at, or labels the Objective-C item referenced:
+// a cfstring, message, message ref, selector ref or class ref. For any
+// other reference type, or when no SymbolLookUp() callback is set, nothing
+// is written to cStream.
+void MCExternalSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+                                                           int64_t Value,
+                                                           uint64_t Address) {
+  if (SymbolLookUp) { // Without the callback nothing is written.
+    uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
+    const char *ReferenceName; // NOTE(review): only SymbolLookUp() sets this.
+    (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+      cStream << "literal pool symbol address: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) {
+      cStream << "literal pool for: \"";
+      cStream.write_escaped(ReferenceName);
+      cStream << "\"";
+    }
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+      cStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Message)
+      cStream << "Objc message: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+      cStream << "Objc message ref: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+      cStream << "Objc selector ref: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+      cStream << "Objc class ref: " << ReferenceName;
+  }
+}
+
+namespace llvm {
+MCSymbolizer *createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+                                 LLVMSymbolLookupCallback SymbolLookUp,
+                                 void *DisInfo,
+                                 MCContext *Ctx,
+                                 MCRelocationInfo *RelInfo) {
+  assert(Ctx && "No MCContext given for symbolic disassembly");
+  // RelInfo is handed over wrapped in a unique_ptr; TT is not used here.
+  return new MCExternalSymbolizer(*Ctx,
+                                  std::unique_ptr<MCRelocationInfo>(RelInfo),
+                                  GetOpInfo, SymbolLookUp, DisInfo);
+}
+} // end namespace llvm

diff --git a/lib/MC/MCDisassembler/MCRelocationInfo.cpp b/lib/MC/MCDisassembler/MCRelocationInfo.cpp
new file mode 100644
index 0000000..ff0c27f
--- /dev/null
+++ b/lib/MC/MCDisassembler/MCRelocationInfo.cpp

@@ -0,0 +1,39 @@
+//==-- MCRelocationInfo.cpp ------------------------------------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm-c/Disassembler.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+MCRelocationInfo::MCRelocationInfo(MCContext &Ctx)
+  : Ctx(Ctx) { // Only records the context; no other setup is done.
+}
+
+MCRelocationInfo::~MCRelocationInfo() { // Defined out of line; empty body.
+}
+
+const MCExpr *
+MCRelocationInfo::createExprForRelocation(object::RelocationRef Rel) {
+  return nullptr; // This implementation never produces an expression.
+}
+
+const MCExpr *
+MCRelocationInfo::createExprForCAPIVariantKind(const MCExpr *SubExpr,
+                                               unsigned VariantKind) {
+  if (VariantKind != LLVMDisassembler_VariantKind_None)
+    return nullptr; // Any variant kind other than None is rejected.
+  return SubExpr; // Kind None: the expression is returned unchanged.
+}
+
+MCRelocationInfo *llvm::createMCRelocationInfo(StringRef TT, MCContext &Ctx) {
+  return new MCRelocationInfo(Ctx); // TT is not used here.
+}

diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 0a3fab8..5effb01 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp

@@ -247,6 +247,22 @@
   return Emit(MCOS, StandardOpcodeLengths);
 }
 
+static const MCExpr *forceExpAbs(MCStreamer &OS, const MCExpr* Expr) {
+  MCContext &Context = OS.getContext();
+  assert(!isa<MCSymbolRefExpr>(Expr)); // Expr must not be a symbol ref.
+  if (Context.getAsmInfo()->hasAggressiveSymbolFolding())
+    return Expr; // Expr is returned unchanged in this case.
+  // Otherwise assign Expr to a fresh temporary symbol and refer to that.
+  MCSymbol *ABS = Context.CreateTempSymbol();
+  OS.EmitAssignment(ABS, Expr);
+  return MCSymbolRefExpr::Create(ABS, Context);
+}
+
+static void emitAbsValue(MCStreamer &OS, const MCExpr *Value, unsigned Size) {
+  const MCExpr *ABS = forceExpAbs(OS, Value); // May emit an assignment first.
+  OS.EmitValue(ABS, Size);
+}
+
 std::pair<MCSymbol *, MCSymbol *>
 MCDwarfLineTableHeader::Emit(MCStreamer *MCOS,
                              ArrayRef<char> StandardOpcodeLengths) const {
@@ -265,8 +281,8 @@
 
   // The first 4 bytes is the total length of the information for this
   // compilation unit (not including these 4 bytes for the length).
-  MCOS->EmitAbsValue(MakeStartMinusEndExpr(*MCOS, *LineStartSym, *LineEndSym,4),
-                     4);
+  emitAbsValue(*MCOS,
+               MakeStartMinusEndExpr(*MCOS, *LineStartSym, *LineEndSym, 4), 4);
 
   // Next 2 bytes is the Version, which is Dwarf 2.
   MCOS->EmitIntValue(2, 2);
@@ -278,8 +294,9 @@
   // section to the end of the prologue.  Not including the 4 bytes for the
   // total length, the 2 bytes for the version, and these 4 bytes for the
   // length of the prologue.
-  MCOS->EmitAbsValue(MakeStartMinusEndExpr(*MCOS, *LineStartSym, *ProEndSym,
-                                           (4 + 2 + 4)), 4);
+  emitAbsValue(
+      *MCOS,
+      MakeStartMinusEndExpr(*MCOS, *LineStartSym, *ProEndSym, (4 + 2 + 4)), 4);
 
   // Parameters of the state machine, are next.
   MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1);
@@ -327,18 +344,6 @@
   for (const auto &LineSec : MCLineSections.getMCLineEntries())
     EmitDwarfLineTable(MCOS, LineSec.first, LineSec.second);
 
-  if (MCOS->getContext().getAsmInfo()->getLinkerRequiresNonEmptyDwarfLines() &&
-      MCLineSections.getMCLineEntries().empty()) {
-    // The darwin9 linker has a bug (see PR8715). For for 32-bit architectures
-    // it requires:
-    // total_length >= prologue_length + 10
-    // We are 4 bytes short, since we have total_length = 51 and
-    // prologue_length = 45
-
-    // The regular end_sequence should be sufficient.
-    MCDwarfLineAddr::Emit(MCOS, INT64_MAX, 0);
-  }
-
   // This is the end of the section, so set the value of the symbol at the end
   // of this section (that was used in a previous expression).
   MCOS->EmitLabel(LineEndSym);
@@ -363,10 +368,10 @@
     FileNumber = SourceIdMap.size() + 1;
     assert((MCDwarfFiles.empty() || FileNumber == MCDwarfFiles.size()) &&
            "Don't mix autonumbered and explicit numbered line table usage");
-    StringMapEntry<unsigned> &Ent = SourceIdMap.GetOrCreateValue(
-        (Directory + Twine('\0') + FileName).str(), FileNumber);
-    if (Ent.getValue() != FileNumber)
-      return Ent.getValue();
+    auto IterBool = SourceIdMap.insert(
+        std::make_pair((Directory + Twine('\0') + FileName).str(), FileNumber));
+    if (!IterBool.second)
+      return IterBool.first->second;
   }
   // Make space for this FileNumber in the MCDwarfFiles vector if needed.
   MCDwarfFiles.resize(FileNumber + 1);
@@ -519,7 +524,8 @@
   MCOS->EmitULEB128IntValue(dwarf::DW_TAG_compile_unit);
   MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1);
   EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4);
-  if (MCOS->getContext().getGenDwarfSectionSyms().size() > 1) {
+  if (MCOS->getContext().getGenDwarfSectionSyms().size() > 1 &&
+      MCOS->getContext().getDwarfVersion() >= 3) {
     EmitAbbrev(MCOS, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4);
   } else {
     EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
@@ -596,7 +602,8 @@
   // The 4 byte offset to the compile unit in the .debug_info from the start
   // of the .debug_info.
   if (InfoSectionSymbol)
-    MCOS->EmitSymbolValue(InfoSectionSymbol, 4);
+    MCOS->EmitSymbolValue(InfoSectionSymbol, 4,
+                          asmInfo->needsDwarfSectionOffsetDirective());
   else
     MCOS->EmitIntValue(0, 4);
   // The 1 byte size of an address.
@@ -620,7 +627,7 @@
     const MCExpr *Size = MakeStartMinusEndExpr(*MCOS,
       *StartSymbol, *EndSymbol, 0);
     MCOS->EmitValue(Addr, AddrSize);
-    MCOS->EmitAbsValue(Size, AddrSize);
+    emitAbsValue(*MCOS, Size, AddrSize);
   }
 
   // And finally the pair of terminating zeros.
@@ -650,18 +657,19 @@
   // The 4 byte total length of the information for this compilation unit, not
   // including these 4 bytes.
   const MCExpr *Length = MakeStartMinusEndExpr(*MCOS, *InfoStart, *InfoEnd, 4);
-  MCOS->EmitAbsValue(Length, 4);
+  emitAbsValue(*MCOS, Length, 4);
 
   // The 2 byte DWARF version.
   MCOS->EmitIntValue(context.getDwarfVersion(), 2);
 
+  const MCAsmInfo &AsmInfo = *context.getAsmInfo();
   // The 4 byte offset to the debug abbrevs from the start of the .debug_abbrev,
   // it is at the start of that section so this is zero.
-  if (AbbrevSectionSymbol) {
-    MCOS->EmitSymbolValue(AbbrevSectionSymbol, 4);
-  } else {
+  if (AbbrevSectionSymbol == nullptr)
     MCOS->EmitIntValue(0, 4);
-  }
+  else
+    MCOS->EmitSymbolValue(AbbrevSectionSymbol, 4,
+                          AsmInfo.needsDwarfSectionOffsetDirective());
 
   const MCAsmInfo *asmInfo = context.getAsmInfo();
   int AddrSize = asmInfo->getPointerSize();
@@ -675,11 +683,11 @@
 
   // DW_AT_stmt_list, a 4 byte offset from the start of the .debug_line section,
   // which is at the start of that section so this is zero.
-  if (LineSectionSymbol) {
-    MCOS->EmitSymbolValue(LineSectionSymbol, 4);
-  } else {
+  if (LineSectionSymbol)
+    MCOS->EmitSymbolValue(LineSectionSymbol, 4,
+                          AsmInfo.needsDwarfSectionOffsetDirective());
+  else
     MCOS->EmitIntValue(0, 4);
-  }
 
   if (RangesSectionSymbol) {
     // There are multiple sections containing code, so we must use the
@@ -740,14 +748,10 @@
 
   // AT_producer, the version of the assembler tool.
   StringRef DwarfDebugProducer = context.getDwarfDebugProducer();
-  if (!DwarfDebugProducer.empty()){
+  if (!DwarfDebugProducer.empty())
     MCOS->EmitBytes(DwarfDebugProducer);
-  }
-  else {
-    MCOS->EmitBytes(StringRef("llvm-mc (based on LLVM "));
-    MCOS->EmitBytes(StringRef(PACKAGE_VERSION));
-    MCOS->EmitBytes(StringRef(")"));
-  }
+  else
+    MCOS->EmitBytes(StringRef("llvm-mc (based on LLVM " PACKAGE_VERSION ")"));
   MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
 
   // AT_language, a 4 byte value.  We use DW_LANG_Mips_Assembler as the dwarf2
@@ -824,7 +828,7 @@
     const MCExpr *SectionSize = MakeStartMinusEndExpr(*MCOS,
       *StartSymbol, *EndSymbol, 0);
     MCOS->EmitIntValue(0, AddrSize);
-    MCOS->EmitAbsValue(SectionSize, AddrSize);
+    emitAbsValue(*MCOS, SectionSize, AddrSize);
   }
 
   // Emit end of list entry
@@ -858,10 +862,11 @@
   if (MCOS->getContext().getGenDwarfSectionSyms().empty())
     return;
 
-  // We only need to use the .debug_ranges section if we have multiple
-  // code sections.
+  // We only use the .debug_ranges section if we have multiple code sections,
+  // and we are emitting a DWARF version which supports it.
   const bool UseRangesSection =
-      MCOS->getContext().getGenDwarfSectionSyms().size() > 1;
+      MCOS->getContext().getGenDwarfSectionSyms().size() > 1 &&
+      MCOS->getContext().getDwarfVersion() >= 3;
   CreateDwarfSectionSymbols |= UseRangesSection;
 
   MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
@@ -974,18 +979,16 @@
   }
 }
 
-static void EmitFDESymbol(MCStreamer &streamer, const MCSymbol &symbol,
-                       unsigned symbolEncoding, bool isEH,
-                       const char *comment = nullptr) {
+static void emitFDESymbol(MCObjectStreamer &streamer, const MCSymbol &symbol,
+                       unsigned symbolEncoding, bool isEH) {
   MCContext &context = streamer.getContext();
   const MCAsmInfo *asmInfo = context.getAsmInfo();
   const MCExpr *v = asmInfo->getExprForFDESymbol(&symbol,
                                                  symbolEncoding,
                                                  streamer);
   unsigned size = getSizeForEncoding(streamer, symbolEncoding);
-  if (streamer.isVerboseAsm() && comment) streamer.AddComment(comment);
   if (asmInfo->doDwarfFDESymbolsUseAbsDiff() && isEH)
-    streamer.EmitAbsValue(v, size);
+    emitAbsValue(streamer, v, size);
   else
     streamer.EmitValue(v, size);
 }
@@ -1004,17 +1007,16 @@
 namespace {
   class FrameEmitterImpl {
     int CFAOffset;
-    int CIENum;
     bool IsEH;
     const MCSymbol *SectionStart;
   public:
     FrameEmitterImpl(bool isEH)
-        : CFAOffset(0), CIENum(0), IsEH(isEH), SectionStart(nullptr) {}
+        : CFAOffset(0), IsEH(isEH), SectionStart(nullptr) {}
 
     void setSectionStart(const MCSymbol *Label) { SectionStart = Label; }
 
-    /// EmitCompactUnwind - Emit the unwind information in a compact way.
-    void EmitCompactUnwind(MCStreamer &streamer,
+    /// Emit the unwind information in a compact way.
+    void EmitCompactUnwind(MCObjectStreamer &streamer,
                            const MCDwarfFrameInfo &frame);
 
     const MCSymbol &EmitCIE(MCObjectStreamer &streamer,
@@ -1036,65 +1038,18 @@
 
 } // end anonymous namespace
 
-static void EmitEncodingByte(MCStreamer &Streamer, unsigned Encoding,
-                             StringRef Prefix) {
-  if (Streamer.isVerboseAsm()) {
-    const char *EncStr;
-    switch (Encoding) {
-    default: EncStr = "<unknown encoding>"; break;
-    case dwarf::DW_EH_PE_absptr: EncStr = "absptr"; break;
-    case dwarf::DW_EH_PE_omit:   EncStr = "omit"; break;
-    case dwarf::DW_EH_PE_pcrel:  EncStr = "pcrel"; break;
-    case dwarf::DW_EH_PE_udata4: EncStr = "udata4"; break;
-    case dwarf::DW_EH_PE_udata8: EncStr = "udata8"; break;
-    case dwarf::DW_EH_PE_sdata4: EncStr = "sdata4"; break;
-    case dwarf::DW_EH_PE_sdata8: EncStr = "sdata8"; break;
-    case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata4:
-      EncStr = "pcrel udata4";
-      break;
-    case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4:
-      EncStr = "pcrel sdata4";
-      break;
-    case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8:
-      EncStr = "pcrel udata8";
-      break;
-    case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8:
-      EncStr = "screl sdata8";
-      break;
-    case dwarf::DW_EH_PE_indirect |dwarf::DW_EH_PE_pcrel|dwarf::DW_EH_PE_udata4:
-      EncStr = "indirect pcrel udata4";
-      break;
-    case dwarf::DW_EH_PE_indirect |dwarf::DW_EH_PE_pcrel|dwarf::DW_EH_PE_sdata4:
-      EncStr = "indirect pcrel sdata4";
-      break;
-    case dwarf::DW_EH_PE_indirect |dwarf::DW_EH_PE_pcrel|dwarf::DW_EH_PE_udata8:
-      EncStr = "indirect pcrel udata8";
-      break;
-    case dwarf::DW_EH_PE_indirect |dwarf::DW_EH_PE_pcrel|dwarf::DW_EH_PE_sdata8:
-      EncStr = "indirect pcrel sdata8";
-      break;
-    }
-
-    Streamer.AddComment(Twine(Prefix) + " = " + EncStr);
-  }
-
+static void emitEncodingByte(MCObjectStreamer &Streamer, unsigned Encoding) {
   Streamer.EmitIntValue(Encoding, 1);
 }
 
 void FrameEmitterImpl::EmitCFIInstruction(MCObjectStreamer &Streamer,
                                           const MCCFIInstruction &Instr) {
   int dataAlignmentFactor = getDataAlignmentFactor(Streamer);
-  bool VerboseAsm = Streamer.isVerboseAsm();
 
   switch (Instr.getOperation()) {
   case MCCFIInstruction::OpRegister: {
     unsigned Reg1 = Instr.getRegister();
     unsigned Reg2 = Instr.getRegister2();
-    if (VerboseAsm) {
-      Streamer.AddComment("DW_CFA_register");
-      Streamer.AddComment(Twine("Reg1 ") + Twine(Reg1));
-      Streamer.AddComment(Twine("Reg2 ") + Twine(Reg2));
-    }
     Streamer.EmitIntValue(dwarf::DW_CFA_register, 1);
     Streamer.EmitULEB128IntValue(Reg1);
     Streamer.EmitULEB128IntValue(Reg2);
@@ -1106,10 +1061,6 @@
   }
   case MCCFIInstruction::OpUndefined: {
     unsigned Reg = Instr.getRegister();
-    if (VerboseAsm) {
-      Streamer.AddComment("DW_CFA_undefined");
-      Streamer.AddComment(Twine("Reg ") + Twine(Reg));
-    }
     Streamer.EmitIntValue(dwarf::DW_CFA_undefined, 1);
     Streamer.EmitULEB128IntValue(Reg);
     return;
@@ -1119,8 +1070,6 @@
     const bool IsRelative =
       Instr.getOperation() == MCCFIInstruction::OpAdjustCfaOffset;
 
-    if (VerboseAsm)
-      Streamer.AddComment("DW_CFA_def_cfa_offset");
     Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_offset, 1);
 
     if (IsRelative)
@@ -1128,37 +1077,21 @@
     else
       CFAOffset = -Instr.getOffset();
 
-    if (VerboseAsm)
-      Streamer.AddComment(Twine("Offset " + Twine(CFAOffset)));
     Streamer.EmitULEB128IntValue(CFAOffset);
 
     return;
   }
   case MCCFIInstruction::OpDefCfa: {
-    if (VerboseAsm)
-      Streamer.AddComment("DW_CFA_def_cfa");
     Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa, 1);
-
-    if (VerboseAsm)
-      Streamer.AddComment(Twine("Reg ") + Twine(Instr.getRegister()));
     Streamer.EmitULEB128IntValue(Instr.getRegister());
-
     CFAOffset = -Instr.getOffset();
-
-    if (VerboseAsm)
-      Streamer.AddComment(Twine("Offset " + Twine(CFAOffset)));
     Streamer.EmitULEB128IntValue(CFAOffset);
 
     return;
   }
 
   case MCCFIInstruction::OpDefCfaRegister: {
-    if (VerboseAsm)
-      Streamer.AddComment("DW_CFA_def_cfa_register");
     Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_register, 1);
-
-    if (VerboseAsm)
-      Streamer.AddComment(Twine("Reg ") + Twine(Instr.getRegister()));
     Streamer.EmitULEB128IntValue(Instr.getRegister());
 
     return;
@@ -1176,63 +1109,44 @@
     Offset = Offset / dataAlignmentFactor;
 
     if (Offset < 0) {
-      if (VerboseAsm) Streamer.AddComment("DW_CFA_offset_extended_sf");
       Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended_sf, 1);
-      if (VerboseAsm) Streamer.AddComment(Twine("Reg ") + Twine(Reg));
       Streamer.EmitULEB128IntValue(Reg);
-      if (VerboseAsm) Streamer.AddComment(Twine("Offset ") + Twine(Offset));
       Streamer.EmitSLEB128IntValue(Offset);
     } else if (Reg < 64) {
-      if (VerboseAsm) Streamer.AddComment(Twine("DW_CFA_offset + Reg(") +
-                                          Twine(Reg) + ")");
       Streamer.EmitIntValue(dwarf::DW_CFA_offset + Reg, 1);
-      if (VerboseAsm) Streamer.AddComment(Twine("Offset ") + Twine(Offset));
       Streamer.EmitULEB128IntValue(Offset);
     } else {
-      if (VerboseAsm) Streamer.AddComment("DW_CFA_offset_extended");
       Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended, 1);
-      if (VerboseAsm) Streamer.AddComment(Twine("Reg ") + Twine(Reg));
       Streamer.EmitULEB128IntValue(Reg);
-      if (VerboseAsm) Streamer.AddComment(Twine("Offset ") + Twine(Offset));
       Streamer.EmitULEB128IntValue(Offset);
     }
     return;
   }
   case MCCFIInstruction::OpRememberState:
-    if (VerboseAsm) Streamer.AddComment("DW_CFA_remember_state");
     Streamer.EmitIntValue(dwarf::DW_CFA_remember_state, 1);
     return;
   case MCCFIInstruction::OpRestoreState:
-    if (VerboseAsm) Streamer.AddComment("DW_CFA_restore_state");
     Streamer.EmitIntValue(dwarf::DW_CFA_restore_state, 1);
     return;
   case MCCFIInstruction::OpSameValue: {
     unsigned Reg = Instr.getRegister();
-    if (VerboseAsm) Streamer.AddComment("DW_CFA_same_value");
     Streamer.EmitIntValue(dwarf::DW_CFA_same_value, 1);
-    if (VerboseAsm) Streamer.AddComment(Twine("Reg ") + Twine(Reg));
     Streamer.EmitULEB128IntValue(Reg);
     return;
   }
   case MCCFIInstruction::OpRestore: {
     unsigned Reg = Instr.getRegister();
-    if (VerboseAsm) {
-      Streamer.AddComment("DW_CFA_restore");
-      Streamer.AddComment(Twine("Reg ") + Twine(Reg));
-    }
     Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1);
     return;
   }
   case MCCFIInstruction::OpEscape:
-    if (VerboseAsm) Streamer.AddComment("Escape bytes");
     Streamer.EmitBytes(Instr.getValues());
     return;
   }
   llvm_unreachable("Unhandled case in switch");
 }
 
-/// EmitFrameMoves - Emit frame instructions to describe the layout of the
-/// frame.
+/// Emit frame instructions to describe the layout of the frame.
 void FrameEmitterImpl::EmitCFIInstructions(MCObjectStreamer &streamer,
                                            ArrayRef<MCCFIInstruction> Instrs,
                                            MCSymbol *BaseLabel) {
@@ -1246,7 +1160,6 @@
     if (BaseLabel && Label) {
       MCSymbol *ThisSym = Label;
       if (ThisSym != BaseLabel) {
-        if (streamer.isVerboseAsm()) streamer.AddComment("DW_CFA_advance_loc4");
         streamer.EmitDwarfAdvanceFrameAddr(BaseLabel, ThisSym);
         BaseLabel = ThisSym;
       }
@@ -1256,12 +1169,11 @@
   }
 }
 
-/// EmitCompactUnwind - Emit the unwind information in a compact way.
-void FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
+/// Emit the unwind information in a compact way.
+void FrameEmitterImpl::EmitCompactUnwind(MCObjectStreamer &Streamer,
                                          const MCDwarfFrameInfo &Frame) {
   MCContext &Context = Streamer.getContext();
   const MCObjectFileInfo *MOFI = Context.getObjectFileInfo();
-  bool VerboseAsm = Streamer.isVerboseAsm();
 
   // range-start range-length  compact-unwind-enc personality-func   lsda
   //  _foo       LfooEnd-_foo  0x00000023          0                 0
@@ -1296,24 +1208,19 @@
   // Range Start
   unsigned FDEEncoding = MOFI->getFDEEncoding();
   unsigned Size = getSizeForEncoding(Streamer, FDEEncoding);
-  if (VerboseAsm) Streamer.AddComment("Range Start");
   Streamer.EmitSymbolValue(Frame.Begin, Size);
 
   // Range Length
   const MCExpr *Range = MakeStartMinusEndExpr(Streamer, *Frame.Begin,
                                               *Frame.End, 0);
-  if (VerboseAsm) Streamer.AddComment("Range Length");
-  Streamer.EmitAbsValue(Range, 4);
+  emitAbsValue(Streamer, Range, 4);
 
   // Compact Encoding
   Size = getSizeForEncoding(Streamer, dwarf::DW_EH_PE_udata4);
-  if (VerboseAsm) Streamer.AddComment("Compact Unwind Encoding: 0x" +
-                                      Twine::utohexstr(Encoding));
   Streamer.EmitIntValue(Encoding, Size);
 
   // Personality Function
   Size = getSizeForEncoding(Streamer, dwarf::DW_EH_PE_absptr);
-  if (VerboseAsm) Streamer.AddComment("Personality Function");
   if (!DwarfEHFrameOnly && Frame.Personality)
     Streamer.EmitSymbolValue(Frame.Personality, Size);
   else
@@ -1321,7 +1228,6 @@
 
   // LSDA
   Size = getSizeForEncoding(Streamer, Frame.LsdaEncoding);
-  if (VerboseAsm) Streamer.AddComment("LSDA");
   if (!DwarfEHFrameOnly && Frame.Lsda)
     Streamer.EmitSymbolValue(Frame.Lsda, Size);
   else
@@ -1338,27 +1244,22 @@
   MCContext &context = streamer.getContext();
   const MCRegisterInfo *MRI = context.getRegisterInfo();
   const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
-  bool verboseAsm = streamer.isVerboseAsm();
 
   MCSymbol *sectionStart = context.CreateTempSymbol();
   streamer.EmitLabel(sectionStart);
-  CIENum++;
 
   MCSymbol *sectionEnd = context.CreateTempSymbol();
 
   // Length
   const MCExpr *Length = MakeStartMinusEndExpr(streamer, *sectionStart,
                                                *sectionEnd, 4);
-  if (verboseAsm) streamer.AddComment("CIE Length");
-  streamer.EmitAbsValue(Length, 4);
+  emitAbsValue(streamer, Length, 4);
 
   // CIE ID
   unsigned CIE_ID = IsEH ? 0 : -1;
-  if (verboseAsm) streamer.AddComment("CIE ID Tag");
   streamer.EmitIntValue(CIE_ID, 4);
 
   // Version
-  if (verboseAsm) streamer.AddComment("DW_CIE_VERSION");
   // For DWARF2, we use CIE version 1
   // For DWARF3+, we use CIE version 3
   uint8_t CIEVersion = context.getDwarfVersion() <= 2 ? 1 : 3;
@@ -1367,7 +1268,6 @@
   // Augmentation String
   SmallString<8> Augmentation;
   if (IsEH) {
-    if (verboseAsm) streamer.AddComment("CIE Augmentation");
     Augmentation += "z";
     if (personality)
       Augmentation += "P";
@@ -1381,15 +1281,12 @@
   streamer.EmitIntValue(0, 1);
 
   // Code Alignment Factor
-  if (verboseAsm) streamer.AddComment("CIE Code Alignment Factor");
   streamer.EmitULEB128IntValue(context.getAsmInfo()->getMinInstAlignment());
 
   // Data Alignment Factor
-  if (verboseAsm) streamer.AddComment("CIE Data Alignment Factor");
   streamer.EmitSLEB128IntValue(getDataAlignmentFactor(streamer));
 
   // Return Address Register
-  if (verboseAsm) streamer.AddComment("CIE Return Address Column");
   if (CIEVersion == 1) {
     assert(MRI->getRARegister() <= 255 &&
            "DWARF 2 encodes return_address_register in one byte");
@@ -1414,24 +1311,21 @@
     // Encoding of the FDE pointers
     augmentationLength += 1;
 
-    if (verboseAsm) streamer.AddComment("Augmentation Size");
     streamer.EmitULEB128IntValue(augmentationLength);
 
     // Augmentation Data (optional)
     if (personality) {
       // Personality Encoding
-      EmitEncodingByte(streamer, personalityEncoding,
-                       "Personality Encoding");
+      emitEncodingByte(streamer, personalityEncoding);
       // Personality
-      if (verboseAsm) streamer.AddComment("Personality");
       EmitPersonality(streamer, *personality, personalityEncoding);
     }
 
     if (lsda)
-      EmitEncodingByte(streamer, lsdaEncoding, "LSDA Encoding");
+      emitEncodingByte(streamer, lsdaEncoding);
 
     // Encoding of the FDE pointers
-    EmitEncodingByte(streamer, MOFI->getFDEEncoding(), "FDE Encoding");
+    emitEncodingByte(streamer, MOFI->getFDEEncoding());
   }
 
   // Initial Instructions
@@ -1457,12 +1351,10 @@
   MCSymbol *fdeStart = context.CreateTempSymbol();
   MCSymbol *fdeEnd = context.CreateTempSymbol();
   const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
-  bool verboseAsm = streamer.isVerboseAsm();
 
   // Length
   const MCExpr *Length = MakeStartMinusEndExpr(streamer, *fdeStart, *fdeEnd, 0);
-  if (verboseAsm) streamer.AddComment("FDE Length");
-  streamer.EmitAbsValue(Length, 4);
+  emitAbsValue(streamer, Length, 4);
 
   streamer.EmitLabel(fdeStart);
 
@@ -1471,12 +1363,11 @@
   if (IsEH) {
     const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart,
                                                  0);
-    if (verboseAsm) streamer.AddComment("FDE CIE Offset");
-    streamer.EmitAbsValue(offset, 4);
+    emitAbsValue(streamer, offset, 4);
   } else if (!asmInfo->doesDwarfUseRelocationsAcrossSections()) {
     const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart,
                                                  cieStart, 0);
-    streamer.EmitAbsValue(offset, 4);
+    emitAbsValue(streamer, offset, 4);
   } else {
     streamer.EmitSymbolValue(&cieStart, 4);
   }
@@ -1485,13 +1376,12 @@
   unsigned PCEncoding =
       IsEH ? MOFI->getFDEEncoding() : (unsigned)dwarf::DW_EH_PE_absptr;
   unsigned PCSize = getSizeForEncoding(streamer, PCEncoding);
-  EmitFDESymbol(streamer, *frame.Begin, PCEncoding, IsEH, "FDE initial location");
+  emitFDESymbol(streamer, *frame.Begin, PCEncoding, IsEH);
 
   // PC Range
   const MCExpr *Range = MakeStartMinusEndExpr(streamer, *frame.Begin,
                                               *frame.End, 0);
-  if (verboseAsm) streamer.AddComment("FDE address range");
-  streamer.EmitAbsValue(Range, PCSize);
+  emitAbsValue(streamer, Range, PCSize);
 
   if (IsEH) {
     // Augmentation Data Length
@@ -1500,13 +1390,11 @@
     if (frame.Lsda)
       augmentationLength += getSizeForEncoding(streamer, frame.LsdaEncoding);
 
-    if (verboseAsm) streamer.AddComment("Augmentation size");
     streamer.EmitULEB128IntValue(augmentationLength);
 
     // Augmentation Data
     if (frame.Lsda)
-      EmitFDESymbol(streamer, *frame.Lsda, frame.LsdaEncoding, true,
-                    "Language Specific Data Area");
+      emitFDESymbol(streamer, *frame.Lsda, frame.LsdaEncoding, true);
   }
 
   // Call Frame Instructions
@@ -1574,7 +1462,7 @@
   MCContext &Context = Streamer.getContext();
   const MCObjectFileInfo *MOFI = Context.getObjectFileInfo();
   FrameEmitterImpl Emitter(IsEH);
-  ArrayRef<MCDwarfFrameInfo> FrameArray = Streamer.getFrameInfos();
+  ArrayRef<MCDwarfFrameInfo> FrameArray = Streamer.getDwarfFrameInfos();
 
   // Emit the compact unwind info if available.
   bool NeedsEHFrameSection = !MOFI->getSupportsCompactUnwindWithoutEHFrame();

diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp
index 0a9cd31..386c209 100644
--- a/lib/MC/MCELF.cpp
+++ b/lib/MC/MCELF.cpp

@@ -61,7 +61,7 @@
   SD.setFlags(OtherFlags | (Visibility << ELF_STV_Shift));
 }
 
-unsigned MCELF::GetVisibility(MCSymbolData &SD) {
+unsigned MCELF::GetVisibility(const MCSymbolData &SD) {
   unsigned Visibility =
     (SD.getFlags() & (0x3 << ELF_STV_Shift)) >> ELF_STV_Shift;
   assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL ||
@@ -76,7 +76,7 @@
   SD.setFlags(OtherFlags | (Other << ELF_STO_Shift));
 }
 
-unsigned MCELF::getOther(MCSymbolData &SD) {
+unsigned MCELF::getOther(const MCSymbolData &SD) {
   unsigned Other =
     (SD.getFlags() & (0x3f << ELF_STO_Shift)) >> ELF_STO_Shift;
   return Other;

diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp
index 4012c44..84176dc 100644
--- a/lib/MC/MCELFObjectTargetWriter.cpp
+++ b/lib/MC/MCELFObjectTargetWriter.cpp

@@ -24,6 +24,7 @@
     IsN64(IsN64_){
 }
 
-bool MCELFObjectTargetWriter::needsRelocateWithSymbol(unsigned Type) const {
+bool MCELFObjectTargetWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+                                                      unsigned Type) const {
   return false;
 }

diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 7c70540..bdc4a84 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp

@@ -15,6 +15,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
@@ -38,19 +39,23 @@
 MCELFStreamer::~MCELFStreamer() {
 }
 
-void MCELFStreamer::InitSections() {
+void MCELFStreamer::InitSections(bool NoExecStack) {
   // This emulates the same behavior of GNU as. This makes it easier
   // to compare the output as the major sections are in the same order.
-  SwitchSection(getContext().getObjectFileInfo()->getTextSection());
+  MCContext &Ctx = getContext();
+  SwitchSection(Ctx.getObjectFileInfo()->getTextSection());
   EmitCodeAlignment(4);
 
-  SwitchSection(getContext().getObjectFileInfo()->getDataSection());
+  SwitchSection(Ctx.getObjectFileInfo()->getDataSection());
   EmitCodeAlignment(4);
 
-  SwitchSection(getContext().getObjectFileInfo()->getBSSSection());
+  SwitchSection(Ctx.getObjectFileInfo()->getBSSSection());
   EmitCodeAlignment(4);
 
-  SwitchSection(getContext().getObjectFileInfo()->getTextSection());
+  SwitchSection(Ctx.getObjectFileInfo()->getTextSection());
+
+  if (NoExecStack)
+    SwitchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
 }
 
 void MCELFStreamer::EmitLabel(MCSymbol *Symbol) {
@@ -87,10 +92,19 @@
   MCSectionData *CurSection = getCurrentSectionData();
   if (CurSection && CurSection->isBundleLocked())
     report_fatal_error("Unterminated .bundle_lock when changing a section");
-  const MCSymbol *Grp = static_cast<const MCSectionELF *>(Section)->getGroup();
+
+  MCAssembler &Asm = getAssembler();
+  auto *SectionELF = static_cast<const MCSectionELF *>(Section);
+  const MCSymbol *Grp = SectionELF->getGroup();
   if (Grp)
-    getAssembler().getOrCreateSymbolData(*Grp);
+    Asm.getOrCreateSymbolData(*Grp);
+
   this->MCObjectStreamer::ChangeSection(Section, Subsection);
+  MCSymbol *SectionSymbol = getContext().getOrCreateSectionSymbol(*SectionELF);
+  if (SectionSymbol->isUndefined()) {
+    EmitLabel(SectionSymbol);
+    MCELF::SetType(Asm.getSymbolData(*SectionSymbol), ELF::STT_SECTION);
+  }
 }
 
 void MCELFStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
@@ -448,11 +462,13 @@
     } else {
       DF = new MCDataFragment();
       insert(DF);
-      if (SD->getBundleLockState() == MCSectionData::BundleLockedAlignToEnd) {
-        // If this is a new fragment created for a bundle-locked group, and the
-        // group was marked as "align_to_end", set a flag in the fragment.
-        DF->setAlignToBundleEnd(true);
-      }
+    }
+    if (SD->getBundleLockState() == MCSectionData::BundleLockedAlignToEnd) {
+      // If this fragment is for a group marked "align_to_end", set a flag
+      // in the fragment. This can happen after the fragment has already been
+      // created if there are nested bundle_lock groups and an inner one
+      // is the one marked align_to_end.
+      DF->setAlignToBundleEnd(true);
     }
 
     // We're now emitting an instruction in a bundle group, so this flag has
@@ -474,10 +490,11 @@
 void MCELFStreamer::EmitBundleAlignMode(unsigned AlignPow2) {
   assert(AlignPow2 <= 30 && "Invalid bundle alignment");
   MCAssembler &Assembler = getAssembler();
-  if (Assembler.getBundleAlignSize() == 0 && AlignPow2 > 0)
-    Assembler.setBundleAlignSize(1 << AlignPow2);
+  if (AlignPow2 > 0 && (Assembler.getBundleAlignSize() == 0 ||
+                        Assembler.getBundleAlignSize() == 1U << AlignPow2))
+    Assembler.setBundleAlignSize(1U << AlignPow2);
   else
-    report_fatal_error(".bundle_align_mode should be only set once per file");
+    report_fatal_error(".bundle_align_mode cannot be changed once set");
 }
 
 void MCELFStreamer::EmitBundleLock(bool AlignToEnd) {
@@ -487,12 +504,12 @@
   //
   if (!getAssembler().isBundlingEnabled())
     report_fatal_error(".bundle_lock forbidden when bundling is disabled");
-  else if (SD->isBundleLocked())
-    report_fatal_error("Nesting of .bundle_lock is forbidden");
+
+  if (!SD->isBundleLocked())
+    SD->setBundleGroupBeforeFirstInst(true);
 
   SD->setBundleLockState(AlignToEnd ? MCSectionData::BundleLockedAlignToEnd :
                                       MCSectionData::BundleLocked);
-  SD->setBundleGroupBeforeFirstInst(true);
 }
 
 void MCELFStreamer::EmitBundleUnlock() {
@@ -543,12 +560,10 @@
 
 MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
                                     raw_ostream &OS, MCCodeEmitter *CE,
-                                    bool RelaxAll, bool NoExecStack) {
+                                    bool RelaxAll) {
   MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
   if (RelaxAll)
     S->getAssembler().setRelaxAll(true);
-  if (NoExecStack)
-    S->getAssembler().setNoExecStack(true);
   return S;
 }
 

diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index f724716..6e648b2 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp

@@ -49,12 +49,8 @@
     else
       OS << Sym;
 
-    if (SRE.getKind() != MCSymbolRefExpr::VK_None) {
-      if (SRE.getMCAsmInfo().useParensForSymbolVariant())
-        OS << '(' << MCSymbolRefExpr::getVariantKindName(SRE.getKind()) << ')';
-      else
-        OS << '@' << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
-    }
+    if (SRE.getKind() != MCSymbolRefExpr::VK_None)
+      SRE.printVariantKind(OS);
 
     return;
   }
@@ -150,6 +146,15 @@
 
 /* *** */
 
+MCSymbolRefExpr::MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind,
+                                 const MCAsmInfo *MAI)
+    : MCExpr(MCExpr::SymbolRef), Kind(Kind),
+      UseParensForSymbolVariant(MAI->useParensForSymbolVariant()),
+      HasSubsectionsViaSymbols(MAI->hasSubsectionsViaSymbols()),
+      Symbol(Symbol) {
+  assert(Symbol);
+}
+
 const MCSymbolRefExpr *MCSymbolRefExpr::Create(const MCSymbol *Sym,
                                                VariantKind Kind,
                                                MCContext &Ctx) {
@@ -247,6 +252,7 @@
   case VK_PPC_GOT_TLSLD_HI: return "got@tlsld@h";
   case VK_PPC_GOT_TLSLD_HA: return "got@tlsld@ha";
   case VK_PPC_TLSLD: return "tlsld";
+  case VK_PPC_LOCAL: return "local";
   case VK_Mips_GPREL: return "GPREL";
   case VK_Mips_GOT_CALL: return "GOT_CALL";
   case VK_Mips_GOT16: return "GOT16";
@@ -273,7 +279,7 @@
   case VK_Mips_CALL_LO16: return "CALL_LO16";
   case VK_Mips_PCREL_HI16: return "PCREL_HI16";
   case VK_Mips_PCREL_LO16: return "PCREL_LO16";
-  case VK_COFF_IMGREL32: return "IMGREL32";
+  case VK_COFF_IMGREL32: return "IMGREL";
   }
   llvm_unreachable("Invalid variant kind");
 }
@@ -442,6 +448,13 @@
     .Default(VK_Invalid);
 }
 
+void MCSymbolRefExpr::printVariantKind(raw_ostream &OS) const {
+  if (UseParensForSymbolVariant)
+    OS << '(' << MCSymbolRefExpr::getVariantKindName(getKind()) << ')';
+  else
+    OS << '@' << MCSymbolRefExpr::getVariantKindName(getKind());
+}
+
 /* *** */
 
 void MCTargetExpr::anchor() {}
@@ -467,9 +480,27 @@
   return EvaluateAsAbsolute(Res, &Asm, nullptr, nullptr);
 }
 
+int64_t MCExpr::evaluateKnownAbsolute(const MCAsmLayout &Layout) const {
+  int64_t Res;
+  bool Abs =
+      evaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, nullptr, true);
+  (void)Abs;
+  assert(Abs && "Not actually absolute");
+  return Res;
+}
+
 bool MCExpr::EvaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
                                 const MCAsmLayout *Layout,
                                 const SectionAddrMap *Addrs) const {
+  // FIXME: The use of InSet = Addrs is a hack. Setting InSet causes us to
+  // absolutize differences across sections and that is what the MachO writer
+  // uses Addrs for.
+  return evaluateAsAbsolute(Res, Asm, Layout, Addrs, Addrs);
+}
+
+bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
+                                const MCAsmLayout *Layout,
+                                const SectionAddrMap *Addrs, bool InSet) const {
   MCValue Value;
 
   // Fast path constants.
@@ -478,12 +509,8 @@
     return true;
   }
 
-  // FIXME: The use if InSet = Addrs is a hack. Setting InSet causes us
-  // absolutize differences across sections and that is what the MachO writer
-  // uses Addrs for.
-  bool IsRelocatable =
-      EvaluateAsRelocatableImpl(Value, Asm, Layout, Addrs, /*InSet*/ Addrs,
-                                /*ForceVarExpansion*/ false);
+  bool IsRelocatable = EvaluateAsRelocatableImpl(
+      Value, Asm, Layout, nullptr, Addrs, InSet, /*ForceVarExpansion*/ false);
 
   // Record the current value.
   Res = Value.getConstant();
@@ -632,27 +659,31 @@
 }
 
 bool MCExpr::EvaluateAsRelocatable(MCValue &Res,
-                                   const MCAsmLayout *Layout) const {
+                                   const MCAsmLayout *Layout,
+                                   const MCFixup *Fixup) const {
   MCAssembler *Assembler = Layout ? &Layout->getAssembler() : nullptr;
-  return EvaluateAsRelocatableImpl(Res, Assembler, Layout, nullptr, false,
-                                   /*ForceVarExpansion*/ false);
+  return EvaluateAsRelocatableImpl(Res, Assembler, Layout, Fixup, nullptr,
+                                   false, /*ForceVarExpansion*/ false);
 }
 
-bool MCExpr::EvaluateAsValue(MCValue &Res, const MCAsmLayout *Layout) const {
+bool MCExpr::EvaluateAsValue(MCValue &Res, const MCAsmLayout *Layout,
+                             const MCFixup *Fixup) const {
   MCAssembler *Assembler = Layout ? &Layout->getAssembler() : nullptr;
-  return EvaluateAsRelocatableImpl(Res, Assembler, Layout, nullptr, false,
-                                   /*ForceVarExpansion*/ true);
+  return EvaluateAsRelocatableImpl(Res, Assembler, Layout, Fixup, nullptr,
+                                   false, /*ForceVarExpansion*/ true);
 }
 
 bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
                                        const MCAsmLayout *Layout,
+                                       const MCFixup *Fixup,
                                        const SectionAddrMap *Addrs, bool InSet,
                                        bool ForceVarExpansion) const {
   ++stats::MCExprEvaluate;
 
   switch (getKind()) {
   case Target:
-    return cast<MCTargetExpr>(this)->EvaluateAsRelocatableImpl(Res, Layout);
+    return cast<MCTargetExpr>(this)->EvaluateAsRelocatableImpl(Res, Layout,
+                                                               Fixup);
 
   case Constant:
     Res = MCValue::get(cast<MCConstantExpr>(this)->getValue());
@@ -661,16 +692,15 @@
   case SymbolRef: {
     const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(this);
     const MCSymbol &Sym = SRE->getSymbol();
-    const MCAsmInfo &MCAsmInfo = SRE->getMCAsmInfo();
 
     // Evaluate recursively if this is a variable.
     if (Sym.isVariable() && SRE->getKind() == MCSymbolRefExpr::VK_None) {
       if (Sym.getVariableValue()->EvaluateAsRelocatableImpl(
-              Res, Asm, Layout, Addrs, true, ForceVarExpansion)) {
+              Res, Asm, Layout, Fixup, Addrs, true, ForceVarExpansion)) {
         const MCSymbolRefExpr *A = Res.getSymA();
         const MCSymbolRefExpr *B = Res.getSymB();
 
-        if (MCAsmInfo.hasSubsectionsViaSymbols()) {
+        if (SRE->hasSubsectionsViaSymbols()) {
           // FIXME: This is small hack. Given
           // a = b + 4
           // .long a
@@ -697,8 +727,9 @@
     const MCUnaryExpr *AUE = cast<MCUnaryExpr>(this);
     MCValue Value;
 
-    if (!AUE->getSubExpr()->EvaluateAsRelocatableImpl(Value, Asm, Layout, Addrs,
-                                                      InSet, ForceVarExpansion))
+    if (!AUE->getSubExpr()->EvaluateAsRelocatableImpl(Value, Asm, Layout,
+                                                      Fixup, Addrs, InSet,
+                                                      ForceVarExpansion))
       return false;
 
     switch (AUE->getOpcode()) {
@@ -731,10 +762,12 @@
     const MCBinaryExpr *ABE = cast<MCBinaryExpr>(this);
     MCValue LHSValue, RHSValue;
 
-    if (!ABE->getLHS()->EvaluateAsRelocatableImpl(LHSValue, Asm, Layout, Addrs,
-                                                  InSet, ForceVarExpansion) ||
-        !ABE->getRHS()->EvaluateAsRelocatableImpl(RHSValue, Asm, Layout, Addrs,
-                                                  InSet, ForceVarExpansion))
+    if (!ABE->getLHS()->EvaluateAsRelocatableImpl(LHSValue, Asm, Layout,
+                                                  Fixup, Addrs, InSet,
+                                                  ForceVarExpansion) ||
+        !ABE->getRHS()->EvaluateAsRelocatableImpl(RHSValue, Asm, Layout,
+                                                  Fixup, Addrs, InSet,
+                                                  ForceVarExpansion))
       return false;
 
     // We only support a few operations on non-constant expressions, handle

diff --git a/lib/MC/MCExternalSymbolizer.cpp b/lib/MC/MCExternalSymbolizer.cpp
deleted file mode 100644
index 7c3073a..0000000
--- a/lib/MC/MCExternalSymbolizer.cpp
+++ /dev/null

@@ -1,198 +0,0 @@
-//===-- lib/MC/MCExternalSymbolizer.cpp - External symbolizer ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCExternalSymbolizer.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstring>
-
-using namespace llvm;
-
-// This function tries to add a symbolic operand in place of the immediate
-// Value in the MCInst. The immediate Value has had any PC adjustment made by
-// the caller. If the instruction is a branch instruction then IsBranch is true,
-// else false. If the getOpInfo() function was set as part of the
-// setupForSymbolicDisassembly() call then that function is called to get any
-// symbolic information at the Address for this instruction. If that returns
-// non-zero then the symbolic information it returns is used to create an MCExpr
-// and that is added as an operand to the MCInst. If getOpInfo() returns zero
-// and IsBranch is true then a symbol look up for Value is done and if a symbol
-// is found an MCExpr is created with that, else an MCExpr with Value is
-// created. This function returns true if it adds an operand to the MCInst and
-// false otherwise.
-bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI,
-                                                    raw_ostream &cStream,
-                                                    int64_t Value,
-                                                    uint64_t Address,
-                                                    bool IsBranch,
-                                                    uint64_t Offset,
-                                                    uint64_t InstSize) {
-  struct LLVMOpInfo1 SymbolicOp;
-  std::memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
-  SymbolicOp.Value = Value;
-
-  if (!GetOpInfo ||
-      !GetOpInfo(DisInfo, Address, Offset, InstSize, 1, &SymbolicOp)) {
-    // Clear SymbolicOp.Value from above and also all other fields.
-    std::memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
-
-    // At this point, GetOpInfo() did not find any relocation information about
-    // this operand and we are left to use the SymbolLookUp() call back to guess
-    // if the Value is the address of a symbol.  In the case this is a branch
-    // that always makes sense to guess.  But in the case of an immediate it is
-    // a bit more questionable if it is an address of a symbol or some other
-    // reference.  So if the immediate Value comes from a width of 1 byte,
-    // InstSize, we will not guess it is an address of a symbol.  Because in
-    // object files assembled starting at address 0 this usually leads to
-    // incorrect symbolication.
-    if (!SymbolLookUp || (InstSize == 1 && !IsBranch))
-      return false;
-
-    uint64_t ReferenceType;
-    if (IsBranch)
-       ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
-    else
-       ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
-    const char *ReferenceName;
-    const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
-                                    &ReferenceName);
-    if (Name) {
-      SymbolicOp.AddSymbol.Name = Name;
-      SymbolicOp.AddSymbol.Present = true;
-      // If Name is a C++ symbol name put the human readable name in a comment.
-      if(ReferenceType == LLVMDisassembler_ReferenceType_DeMangled_Name)
-        cStream << ReferenceName;
-    }
-    // For branches always create an MCExpr so it gets printed as hex address.
-    else if (IsBranch) {
-      SymbolicOp.Value = Value;
-    }
-    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
-      cStream << "symbol stub for: " << ReferenceName;
-    else if(ReferenceType == LLVMDisassembler_ReferenceType_Out_Objc_Message)
-      cStream << "Objc message: " << ReferenceName;
-    if (!Name && !IsBranch)
-      return false;
-  }
-
-  const MCExpr *Add = nullptr;
-  if (SymbolicOp.AddSymbol.Present) {
-    if (SymbolicOp.AddSymbol.Name) {
-      StringRef Name(SymbolicOp.AddSymbol.Name);
-      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
-      Add = MCSymbolRefExpr::Create(Sym, Ctx);
-    } else {
-      Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, Ctx);
-    }
-  }
-
-  const MCExpr *Sub = nullptr;
-  if (SymbolicOp.SubtractSymbol.Present) {
-      if (SymbolicOp.SubtractSymbol.Name) {
-      StringRef Name(SymbolicOp.SubtractSymbol.Name);
-      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
-      Sub = MCSymbolRefExpr::Create(Sym, Ctx);
-    } else {
-      Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, Ctx);
-    }
-  }
-
-  const MCExpr *Off = nullptr;
-  if (SymbolicOp.Value != 0)
-    Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx);
-
-  const MCExpr *Expr;
-  if (Sub) {
-    const MCExpr *LHS;
-    if (Add)
-      LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx);
-    else
-      LHS = MCUnaryExpr::CreateMinus(Sub, Ctx);
-    if (Off)
-      Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx);
-    else
-      Expr = LHS;
-  } else if (Add) {
-    if (Off)
-      Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx);
-    else
-      Expr = Add;
-  } else {
-    if (Off)
-      Expr = Off;
-    else
-      Expr = MCConstantExpr::Create(0, Ctx);
-  }
-
-  Expr = RelInfo->createExprForCAPIVariantKind(Expr, SymbolicOp.VariantKind);
-  if (!Expr)
-    return false;
-
-  MI.addOperand(MCOperand::CreateExpr(Expr));
-  return true;
-}
-
-// This function tries to add a comment as to what is being referenced by a load
-// instruction with the base register that is the Pc.  These can often be values
-// in a literal pool near the Address of the instruction. The Address of the
-// instruction and its immediate Value are used as a possible literal pool entry.
-// The SymbolLookUp call back will return the name of a symbol referenced by the
-// literal pool's entry if the referenced address is that of a symbol. Or it
-// will return a pointer to a literal 'C' string if the referenced address of
-// the literal pool's entry is an address into a section with C string literals.
-// Or if the reference is to an Objective-C data structure it will return a
-// specific reference type for it and a string.
-void MCExternalSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
-                                                           int64_t Value,
-                                                           uint64_t Address) {
-  if (SymbolLookUp) {
-    uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
-    const char *ReferenceName;
-    (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
-    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
-      cStream << "literal pool symbol address: " << ReferenceName;
-    else if(ReferenceType ==
-            LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) {
-      cStream << "literal pool for: \"";
-      cStream.write_escaped(ReferenceName);
-      cStream << "\"";
-    }
-    else if(ReferenceType ==
-            LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
-      cStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
-    else if(ReferenceType ==
-            LLVMDisassembler_ReferenceType_Out_Objc_Message)
-      cStream << "Objc message: " << ReferenceName;
-    else if(ReferenceType ==
-            LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
-      cStream << "Objc message ref: " << ReferenceName;
-    else if(ReferenceType ==
-            LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
-      cStream << "Objc selector ref: " << ReferenceName;
-    else if(ReferenceType ==
-            LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
-      cStream << "Objc class ref: " << ReferenceName;
-  }
-}
-
-namespace llvm {
-MCSymbolizer *createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
-                                 LLVMSymbolLookupCallback SymbolLookUp,
-                                 void *DisInfo,
-                                 MCContext *Ctx,
-                                 MCRelocationInfo *RelInfo) {
-  assert(Ctx && "No MCContext given for symbolic disassembly");
-
-  return new MCExternalSymbolizer(*Ctx,
-                                  std::unique_ptr<MCRelocationInfo>(RelInfo),
-                                  GetOpInfo, SymbolLookUp, DisInfo);
-}
-}

diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 9e8bc94..a147c3d 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp

@@ -55,6 +55,12 @@
       : MCObjectStreamer(Context, MAB, OS, Emitter),
         LabelSections(label) {}
 
+  /// state management
+  void reset() override {
+    HasSectionLabel.clear();
+    MCObjectStreamer::reset();
+  }
+
   /// @name MCStreamer Interface
   /// @{
 
@@ -90,8 +96,8 @@
                              unsigned ByteAlignment) override;
   void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = nullptr,
                     uint64_t Size = 0, unsigned ByteAlignment = 0) override;
-  virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
-                      uint64_t Size, unsigned ByteAlignment = 0) override;
+  void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol, uint64_t Size,
+                      unsigned ByteAlignment = 0) override;
 
   void EmitFileDirective(StringRef Filename) override {
     // FIXME: Just ignore the .file; it isn't important enough to fail the

diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index d543402..fc56728 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp

@@ -29,7 +29,6 @@
       return true;
     }
 
-    void EmitCOFFSecRel32(MCSymbol const *Symbol) override {}
     void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                           unsigned ByteAlignment) override {}
     void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = nullptr,

diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index d490ef3..1b88462 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp

@@ -24,7 +24,7 @@
     return false;
 
   // aarch64 always has it.
-  if (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64)
+  if (T.getArch() == Triple::aarch64)
     return true;
 
   // Use it on newer version of OS X.
@@ -43,8 +43,7 @@
   // MachO
   SupportsWeakOmittedEHFrame = false;
 
-  if (T.isOSDarwin() &&
-      (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64))
+  if (T.isOSDarwin() && T.getArch() == Triple::aarch64)
     SupportsCompactUnwindWithoutEHFrame = true;
 
   PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel
@@ -178,7 +177,7 @@
 
     if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86)
       CompactUnwindDwarfEHFrameOnly = 0x04000000;
-    else if (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64)
+    else if (T.getArch() == Triple::aarch64)
       CompactUnwindDwarfEHFrameOnly = 0x03000000;
   }
 
@@ -287,6 +286,7 @@
     if (Ctx->getAsmInfo()->getExceptionHandlingType() == ExceptionHandling::ARM)
       break;
     // Fallthrough if not using EHABI
+  case Triple::ppc:
   case Triple::x86:
     PersonalityEncoding = (RelocM == Reloc::PIC_)
      ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
@@ -321,8 +321,6 @@
     break;
   case Triple::aarch64:
   case Triple::aarch64_be:
-  case Triple::arm64:
-  case Triple::arm64_be:
     // The small model guarantees static code/data size < 4GB, but not where it
     // will be in memory. Most of these could end up >2GB away so even a signed
     // pc-relative 32-bit address is insufficient, theoretically.
@@ -340,6 +338,8 @@
     break;
   case Triple::mips:
   case Triple::mipsel:
+  case Triple::mips64:
+  case Triple::mips64el:
     // MIPS uses indirect pointer to refer personality functions, so that the
     // eh_frame section can be read-only.  DW.ref.personality will be generated
     // for relocation.
@@ -563,6 +563,9 @@
   DwarfInfoDWOSection =
     Ctx->getELFSection(".debug_info.dwo", ELF::SHT_PROGBITS, 0,
                        SectionKind::getMetadata());
+  DwarfTypesDWOSection =
+    Ctx->getELFSection(".debug_types.dwo", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
   DwarfAbbrevDWOSection =
     Ctx->getELFSection(".debug_abbrev.dwo", ELF::SHT_PROGBITS, 0,
                        SectionKind::getMetadata());
@@ -582,15 +585,19 @@
   DwarfAddrSection =
     Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0,
                        SectionKind::getMetadata());
+
+  StackMapSection =
+    Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC,
+                       SectionKind::getMetadata());
+
 }
 
 
 void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
   bool IsWoA = T.getArch() == Triple::arm || T.getArch() == Triple::thumb;
 
-  // The object file format cannot represent common symbols with explicit
-  // alignments.
-  CommDirectiveSupportsAlignment = false;
+  CommDirectiveSupportsAlignment = true;
 
   // COFF
   BSSSection =
@@ -738,6 +745,10 @@
                           COFF::IMAGE_SCN_MEM_DISCARDABLE |
                           COFF::IMAGE_SCN_MEM_READ,
                           SectionKind::getMetadata());
+  DwarfTypesDWOSection =
+      Ctx->getCOFFSection(".debug_types.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                                                  COFF::IMAGE_SCN_MEM_READ,
+                          SectionKind::getMetadata());
   DwarfAbbrevDWOSection =
       Ctx->getCOFFSection(".debug_abbrev.dwo",
                           COFF::IMAGE_SCN_MEM_DISCARDABLE |
@@ -770,6 +781,27 @@
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
 
+  DwarfAccelNamesSection =
+      Ctx->getCOFFSection(".apple_names",
+                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                          COFF::IMAGE_SCN_MEM_READ,
+                          SectionKind::getMetadata());
+  DwarfAccelNamespaceSection =
+      Ctx->getCOFFSection(".apple_namespaces",
+                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                          COFF::IMAGE_SCN_MEM_READ,
+                          SectionKind::getMetadata());
+  DwarfAccelTypesSection =
+      Ctx->getCOFFSection(".apple_types",
+                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                          COFF::IMAGE_SCN_MEM_READ,
+                          SectionKind::getMetadata());
+  DwarfAccelObjCSection =
+      Ctx->getCOFFSection(".apple_objc",
+                          COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                          COFF::IMAGE_SCN_MEM_READ,
+                          SectionKind::getMetadata());
+
   DrectveSection =
     Ctx->getCOFFSection(".drectve",
                         COFF::IMAGE_SCN_LNK_INFO |
@@ -827,7 +859,7 @@
   // cellspu-apple-darwin. Perhaps we should fix in Triple?
   if ((Arch == Triple::x86 || Arch == Triple::x86_64 ||
        Arch == Triple::arm || Arch == Triple::thumb ||
-       Arch == Triple::arm64 || Arch == Triple::aarch64 ||
+       Arch == Triple::aarch64 ||
        Arch == Triple::ppc || Arch == Triple::ppc64 ||
        Arch == Triple::UnknownArch) &&
       (TT.isOSDarwin() || TT.isOSBinFormatMachO())) {
@@ -849,13 +881,6 @@
                             SectionKind::getMetadata(), 0, utostr(Hash));
 }
 
-const MCSection *
-MCObjectFileInfo::getDwarfTypesDWOSection(uint64_t Hash) const {
-  return Ctx->getELFSection(".debug_types.dwo", ELF::SHT_PROGBITS,
-                            ELF::SHF_GROUP, SectionKind::getMetadata(), 0,
-                            utostr(Hash));
-}
-
 void MCObjectFileInfo::InitEHFrameSection() {
   if (Env == IsMachO)
     EHFrameSection =

diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index a721b59..21e6867 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp

@@ -42,6 +42,21 @@
   delete Assembler;
 }
 
+void MCObjectStreamer::flushPendingLabels(MCFragment *F) {
+  if (PendingLabels.size()) {
+    if (!F) {
+      F = new MCDataFragment();
+      CurSectionData->getFragmentList().insert(CurInsertionPoint, F);
+      F->setParent(CurSectionData);
+    }
+    for (MCSymbolData *SD : PendingLabels) {
+      SD->setFragment(F);
+      SD->setOffset(0);
+    }
+    PendingLabels.clear();
+  }
+}
+
 void MCObjectStreamer::reset() {
   if (Assembler)
     Assembler->reset();
@@ -49,6 +64,7 @@
   CurInsertionPoint = MCSectionData::iterator();
   EmitEHFrame = true;
   EmitDebugFrame = false;
+  PendingLabels.clear();
   MCStreamer::reset();
 }
 
@@ -72,7 +88,7 @@
   return nullptr;
 }
 
-MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() const {
+MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() {
   MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
   // When bundling is enabled, we don't want to add data to a fragment that
   // already has instructions (see MCELFStreamer::EmitInstToData for details)
@@ -127,15 +143,17 @@
   MCStreamer::EmitLabel(Symbol);
 
   MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
-
-  // FIXME: This is wasteful, we don't necessarily need to create a data
-  // fragment. Instead, we should mark the symbol as pointing into the data
-  // fragment if it exists, otherwise we should just queue the label and set its
-  // fragment pointer when we emit the next fragment.
-  MCDataFragment *F = getOrCreateDataFragment();
   assert(!SD.getFragment() && "Unexpected fragment on symbol data!");
-  SD.setFragment(F);
-  SD.setOffset(F->getContents().size());
+
+  // If there is a current fragment, mark the symbol as pointing into it.
+  // Otherwise queue the label and set its fragment pointer when we emit the
+  // next fragment.
+  if (auto *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment())) {
+    SD.setFragment(F);
+    SD.setOffset(F->getContents().size());
+  } else {
+    PendingLabels.push_back(&SD);
+  }
 }
 
 void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
@@ -144,7 +162,6 @@
     EmitULEB128IntValue(IntValue);
     return;
   }
-  Value = ForceExpAbs(Value);
   insert(new MCLEBFragment(*Value, false));
 }
 
@@ -154,7 +171,6 @@
     EmitSLEB128IntValue(IntValue);
     return;
   }
-  Value = ForceExpAbs(Value);
   insert(new MCLEBFragment(*Value, true));
 }
 
@@ -166,6 +182,7 @@
 void MCObjectStreamer::ChangeSection(const MCSection *Section,
                                      const MCExpr *Subsection) {
   assert(Section && "Cannot switch to a null section!");
+  flushPendingLabels(nullptr);
 
   CurSectionData = &getAssembler().getOrCreateSectionData(*Section);
 
@@ -266,33 +283,54 @@
                                           Isa, Discriminator, FileName);
 }
 
+static const MCExpr *buildSymbolDiff(MCObjectStreamer &OS, const MCSymbol *A,
+                                     const MCSymbol *B) {
+  MCContext &Context = OS.getContext();
+  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+  const MCExpr *ARef = MCSymbolRefExpr::Create(A, Variant, Context);
+  const MCExpr *BRef = MCSymbolRefExpr::Create(B, Variant, Context);
+  const MCExpr *AddrDelta =
+      MCBinaryExpr::Create(MCBinaryExpr::Sub, ARef, BRef, Context);
+  return AddrDelta;
+}
+
+static void emitDwarfSetLineAddr(MCObjectStreamer &OS, int64_t LineDelta,
+                                 const MCSymbol *Label, int PointerSize) {
+  // emit the sequence to set the address
+  OS.EmitIntValue(dwarf::DW_LNS_extended_op, 1);
+  OS.EmitULEB128IntValue(PointerSize + 1);
+  OS.EmitIntValue(dwarf::DW_LNE_set_address, 1);
+  OS.EmitSymbolValue(Label, PointerSize);
+
+  // emit the sequence for the LineDelta (from 1) and a zero address delta.
+  MCDwarfLineAddr::Emit(&OS, LineDelta, 0);
+}
+
 void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
                                                 const MCSymbol *LastLabel,
                                                 const MCSymbol *Label,
                                                 unsigned PointerSize) {
   if (!LastLabel) {
-    EmitDwarfSetLineAddr(LineDelta, Label, PointerSize);
+    emitDwarfSetLineAddr(*this, LineDelta, Label, PointerSize);
     return;
   }
-  const MCExpr *AddrDelta = BuildSymbolDiff(getContext(), Label, LastLabel);
+  const MCExpr *AddrDelta = buildSymbolDiff(*this, Label, LastLabel);
   int64_t Res;
   if (AddrDelta->EvaluateAsAbsolute(Res, getAssembler())) {
     MCDwarfLineAddr::Emit(this, LineDelta, Res);
     return;
   }
-  AddrDelta = ForceExpAbs(AddrDelta);
   insert(new MCDwarfLineAddrFragment(LineDelta, *AddrDelta));
 }
 
 void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
                                                  const MCSymbol *Label) {
-  const MCExpr *AddrDelta = BuildSymbolDiff(getContext(), Label, LastLabel);
+  const MCExpr *AddrDelta = buildSymbolDiff(*this, Label, LastLabel);
   int64_t Res;
   if (AddrDelta->EvaluateAsAbsolute(Res, getAssembler())) {
     MCDwarfFrameEmitter::EmitAdvanceLoc(*this, Res);
     return;
   }
-  AddrDelta = ForceExpAbs(AddrDelta);
   insert(new MCDwarfCallFrameFragment(*AddrDelta));
 }
 
@@ -379,5 +417,6 @@
   // Dump out the dwarf file & directory tables and line tables.
   MCDwarfLineTable::Emit(this);
 
+  flushPendingLabels(nullptr);
   getAssembler().Finish();
 }

diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 145ad4a..5c8ec66 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp

@@ -417,7 +417,7 @@
 StringRef AsmLexer::LexUntilEndOfStatement() {
   TokStart = CurPtr;
 
-  while (!isAtStartOfComment(*CurPtr) &&    // Start of line comment.
+  while (!isAtStartOfComment(CurPtr) &&     // Start of line comment.
          !isAtStatementSeparator(CurPtr) && // End of statement marker.
          *CurPtr != '\n' && *CurPtr != '\r' &&
          (*CurPtr != 0 || CurPtr != CurBuf.end())) {
@@ -458,9 +458,17 @@
   return Token;
 }
 
-bool AsmLexer::isAtStartOfComment(char Char) {
-  // FIXME: This won't work for multi-character comment indicators like "//".
-  return Char == *MAI.getCommentString();
+bool AsmLexer::isAtStartOfComment(const char *Ptr) {
+  const char *CommentString = MAI.getCommentString();
+
+  if (CommentString[1] == '\0')
+    return CommentString[0] == Ptr[0];
+
+  // FIXME: special case for the bogus "##" comment string in X86MCAsmInfoDarwin
+  if (CommentString[1] == '#')
+    return CommentString[0] == Ptr[0];
+
+  return strncmp(Ptr, CommentString, strlen(CommentString)) == 0;
 }
 
 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
@@ -473,7 +481,7 @@
   // This always consumes at least one character.
   int CurChar = getNextChar();
 
-  if (isAtStartOfComment(CurChar)) {
+  if (isAtStartOfComment(TokStart)) {
     // If this comment starts with a '#', then return the Hash token and let
     // the assembler parser see if it can be parsed as a cpp line filename
     // comment. We do this only if we are at the start of a line.

diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 62ab4a5..de7d961 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp

@@ -45,10 +45,6 @@
 #include <vector>
 using namespace llvm;
 
-static cl::opt<bool>
-FatalAssemblerWarnings("fatal-assembler-warnings",
-                       cl::desc("Consider warnings as error"));
-
 MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {}
 
 namespace {
@@ -73,19 +69,13 @@
   MCAsmMacroParameters Parameters;
 
 public:
-  MCAsmMacro(StringRef N, StringRef B, ArrayRef<MCAsmMacroParameter> P) :
-    Name(N), Body(B), Parameters(P) {}
+  MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
+      : Name(N), Body(B), Parameters(std::move(P)) {}
 };
 
 /// \brief Helper class for storing information about an active macro
 /// instantiation.
 struct MacroInstantiation {
-  /// The macro being instantiated.
-  const MCAsmMacro *TheMacro;
-
-  /// The macro instantiation with substitutions.
-  MemoryBuffer *Instantiation;
-
   /// The location of the instantiation.
   SMLoc InstantiationLoc;
 
@@ -95,9 +85,11 @@
   /// The location where parsing should resume upon instantiation completion.
   SMLoc ExitLoc;
 
+  /// The depth of TheCondStack at the start of the instantiation.
+  size_t CondStackDepth;
+
 public:
-  MacroInstantiation(const MCAsmMacro *M, SMLoc IL, int EB, SMLoc EL,
-                     MemoryBuffer *I);
+  MacroInstantiation(SMLoc IL, int EB, SMLoc EL, size_t CondStackDepth);
 };
 
 struct ParseStatementInfo {
@@ -129,7 +121,7 @@
   SourceMgr &SrcMgr;
   SourceMgr::DiagHandlerTy SavedDiagHandler;
   void *SavedDiagContext;
-  MCAsmParserExtension *PlatformParser;
+  std::unique_ptr<MCAsmParserExtension> PlatformParser;
 
   /// This is the current buffer index we're lexing from as managed by the
   /// SourceMgr object.
@@ -144,7 +136,7 @@
   StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
 
   /// \brief Map of currently defined macros.
-  StringMap<MCAsmMacro*> MacroMap;
+  StringMap<MCAsmMacro> MacroMap;
 
   /// \brief Stack of active macro instantiations.
   std::vector<MacroInstantiation*> ActiveMacros;
@@ -246,7 +238,8 @@
 
 private:
 
-  bool parseStatement(ParseStatementInfo &Info);
+  bool parseStatement(ParseStatementInfo &Info,
+                      MCAsmParserSemaCallback *SI);
   void eatToEndOfLine();
   bool parseCppHashLineFilenameComment(const SMLoc &L);
 
@@ -269,7 +262,7 @@
   const MCAsmMacro* lookupMacro(StringRef Name);
 
   /// \brief Define a new macro with the given name and information.
-  void defineMacro(StringRef Name, const MCAsmMacro& Macro);
+  void defineMacro(StringRef Name, MCAsmMacro Macro);
 
   /// \brief Undefine a macro. If no such macro was defined, it's a no-op.
   void undefineMacro(StringRef Name);
@@ -355,9 +348,10 @@
     DK_CFI_REMEMBER_STATE, DK_CFI_RESTORE_STATE, DK_CFI_SAME_VALUE,
     DK_CFI_RESTORE, DK_CFI_ESCAPE, DK_CFI_SIGNAL_FRAME, DK_CFI_UNDEFINED,
     DK_CFI_REGISTER, DK_CFI_WINDOW_SAVE,
-    DK_MACROS_ON, DK_MACROS_OFF, DK_MACRO, DK_ENDM, DK_ENDMACRO, DK_PURGEM,
+    DK_MACROS_ON, DK_MACROS_OFF,
+    DK_MACRO, DK_EXITM, DK_ENDM, DK_ENDMACRO, DK_PURGEM,
     DK_SLEB128, DK_ULEB128,
-    DK_ERR, DK_ERROR,
+    DK_ERR, DK_ERROR, DK_WARNING,
     DK_END
   };
 
@@ -407,6 +401,7 @@
 
   // macro directives
   bool parseDirectivePurgeMacro(SMLoc DirectiveLoc);
+  bool parseDirectiveExitMacro(StringRef Directive);
   bool parseDirectiveEndMacro(StringRef Directive);
   bool parseDirectiveMacro(SMLoc DirectiveLoc);
   bool parseDirectiveMacrosOnOff(StringRef Directive);
@@ -474,6 +469,9 @@
   // ".err" or ".error"
   bool parseDirectiveError(SMLoc DirectiveLoc, bool WithMessage);
 
+  // ".warning"
+  bool parseDirectiveWarning(SMLoc DirectiveLoc);
+
   void initializeDirectiveKindMap();
 };
 }
@@ -504,34 +502,24 @@
   // Initialize the platform / file format parser.
   switch (_Ctx.getObjectFileInfo()->getObjectFileType()) {
   case MCObjectFileInfo::IsCOFF:
-      PlatformParser = createCOFFAsmParser();
-      PlatformParser->Initialize(*this);
-      break;
+    PlatformParser.reset(createCOFFAsmParser());
+    break;
   case MCObjectFileInfo::IsMachO:
-      PlatformParser = createDarwinAsmParser();
-      PlatformParser->Initialize(*this);
-      IsDarwin = true;
-      break;
+    PlatformParser.reset(createDarwinAsmParser());
+    IsDarwin = true;
+    break;
   case MCObjectFileInfo::IsELF:
-      PlatformParser = createELFAsmParser();
-      PlatformParser->Initialize(*this);
-      break;
+    PlatformParser.reset(createELFAsmParser());
+    break;
   }
 
+  PlatformParser->Initialize(*this);
   initializeDirectiveKindMap();
 }
 
 AsmParser::~AsmParser() {
   assert((HadError || ActiveMacros.empty()) &&
          "Unexpected active macro instantiation!");
-
-  // Destroy any macros.
-  for (StringMap<MCAsmMacro *>::iterator it = MacroMap.begin(),
-                                         ie = MacroMap.end();
-       it != ie; ++it)
-    delete it->getValue();
-
-  delete PlatformParser;
 }
 
 void AsmParser::printMacroInstantiations() {
@@ -550,7 +538,7 @@
 }
 
 bool AsmParser::Warning(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
-  if (FatalAssemblerWarnings)
+  if (getTargetParser().getTargetOptions().MCFatalWarnings)
     return Error(L, Msg, Ranges);
   printMessage(L, SourceMgr::DK_Warning, Msg, Ranges);
   printMacroInstantiations();
@@ -619,7 +607,7 @@
 bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
   // Create the initial section, if requested.
   if (!NoInitialTextSection)
-    Out.InitSections();
+    Out.InitSections(false);
 
   // Prime the lexer.
   Lex();
@@ -643,7 +631,7 @@
   // While we have input, parse each statement.
   while (Lexer.isNot(AsmToken::Eof)) {
     ParseStatementInfo Info;
-    if (!parseStatement(Info))
+    if (!parseStatement(Info, nullptr))
       continue;
 
     // We had an error, validate that one was emitted and recover by skipping to
@@ -702,7 +690,7 @@
 void AsmParser::checkForValidSection() {
   if (!ParsingInlineAsm && !getStreamer().getCurrentSection().first) {
     TokError("expected section directive before assembly directive");
-    Out.InitSections();
+    Out.InitSections(false);
   }
 }
 
@@ -1188,7 +1176,8 @@
 ///   ::= EndOfStatement
 ///   ::= Label* Directive ...Operands... EndOfStatement
 ///   ::= Label* Identifier OperandList* EndOfStatement
-bool AsmParser::parseStatement(ParseStatementInfo &Info) {
+bool AsmParser::parseStatement(ParseStatementInfo &Info,
+                               MCAsmParserSemaCallback *SI) {
   if (Lexer.is(AsmToken::EndOfStatement)) {
     Out.AddBlankLine();
     Lex();
@@ -1298,9 +1287,16 @@
     // FIXME: This doesn't diagnose assignment to a symbol which has been
     // implicitly marked as external.
     MCSymbol *Sym;
-    if (LocalLabelVal == -1)
+    if (LocalLabelVal == -1) {
+      if (ParsingInlineAsm && SI) {
+        StringRef RewrittenLabel = SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true);
+        assert(RewrittenLabel.size() && "We should have an internal name here.");
+        Info.AsmRewrites->push_back(AsmRewrite(AOK_Label, IDLoc,
+                                               IDVal.size(), RewrittenLabel));
+        IDVal = RewrittenLabel;
+      }
       Sym = getContext().GetOrCreateSymbol(IDVal);
-    else
+    } else
       Sym = Ctx.CreateDirectionalLocalSymbol(LocalLabelVal);
     if (!Sym->isUndefined() || Sym->isVariable())
       return Error(IDLoc, "invalid symbol redefinition");
@@ -1542,6 +1538,8 @@
       return parseDirectiveMacrosOnOff(IDVal);
     case DK_MACRO:
       return parseDirectiveMacro(IDLoc);
+    case DK_EXITM:
+      return parseDirectiveExitMacro(IDVal);
     case DK_ENDM:
     case DK_ENDMACRO:
       return parseDirectiveEndMacro(IDVal);
@@ -1553,6 +1551,8 @@
       return parseDirectiveError(IDLoc, false);
     case DK_ERROR:
       return parseDirectiveError(IDLoc, true);
+    case DK_WARNING:
+      return parseDirectiveWarning(IDLoc);
     }
 
     return Error(IDLoc, "unknown directive");
@@ -1630,7 +1630,7 @@
 
   // If parsing succeeded, match the instruction.
   if (!HadError) {
-    unsigned ErrorInfo;
+    uint64_t ErrorInfo;
     getTargetParser().MatchAndEmitInstruction(IDLoc, Info.Opcode,
                                               Info.ParsedOperands, Out,
                                               ErrorInfo, ParsingInlineAsm);
@@ -1856,10 +1856,10 @@
   return false;
 }
 
-MacroInstantiation::MacroInstantiation(const MCAsmMacro *M, SMLoc IL, int EB,
-                                       SMLoc EL, MemoryBuffer *I)
-    : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitBuffer(EB),
-      ExitLoc(EL) {}
+MacroInstantiation::MacroInstantiation(SMLoc IL, int EB, SMLoc EL,
+                                       size_t CondStackDepth)
+    : InstantiationLoc(IL), ExitBuffer(EB), ExitLoc(EL),
+      CondStackDepth(CondStackDepth) {}
 
 static bool isOperator(AsmToken::TokenKind kind) {
   switch (kind) {
@@ -2078,21 +2078,15 @@
 }
 
 const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) {
-  StringMap<MCAsmMacro *>::iterator I = MacroMap.find(Name);
-  return (I == MacroMap.end()) ? nullptr : I->getValue();
+  StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
+  return (I == MacroMap.end()) ? nullptr : &I->getValue();
 }
 
-void AsmParser::defineMacro(StringRef Name, const MCAsmMacro &Macro) {
-  MacroMap[Name] = new MCAsmMacro(Macro);
+void AsmParser::defineMacro(StringRef Name, MCAsmMacro Macro) {
+  MacroMap.insert(std::make_pair(Name, std::move(Macro)));
 }
 
-void AsmParser::undefineMacro(StringRef Name) {
-  StringMap<MCAsmMacro *>::iterator I = MacroMap.find(Name);
-  if (I != MacroMap.end()) {
-    delete I->getValue();
-    MacroMap.erase(I);
-  }
-}
+void AsmParser::undefineMacro(StringRef Name) { MacroMap.erase(Name); }
 
 bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
   // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate
@@ -2117,17 +2111,17 @@
   // instantiation.
   OS << ".endmacro\n";
 
-  MemoryBuffer *Instantiation =
+  std::unique_ptr<MemoryBuffer> Instantiation =
       MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
 
   // Create the macro instantiation object and add to the current macro
   // instantiation stack.
   MacroInstantiation *MI = new MacroInstantiation(
-      M, NameLoc, CurBuffer, getTok().getLoc(), Instantiation);
+      NameLoc, CurBuffer, getTok().getLoc(), TheCondStack.size());
   ActiveMacros.push_back(MI);
 
   // Jump to the macro instantiation and prime the lexer.
-  CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc());
+  CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Instantiation), SMLoc());
   Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
   Lex();
 
@@ -2600,12 +2594,14 @@
   if (!isUInt<32>(FillExpr) && FillSize > 4)
     Warning(ExprLoc, "'.fill' directive pattern has been truncated to 32-bits");
 
-  int64_t NonZeroFillSize = FillSize > 4 ? 4 : FillSize;
-  FillExpr &= ~0ULL >> (64 - NonZeroFillSize * 8);
-
-  for (uint64_t i = 0, e = NumValues; i != e; ++i) {
-    getStreamer().EmitIntValue(FillExpr, NonZeroFillSize);
-    getStreamer().EmitIntValue(0, FillSize - NonZeroFillSize);
+  if (NumValues > 0) {
+    int64_t NonZeroFillSize = FillSize > 4 ? 4 : FillSize;
+    FillExpr &= ~0ULL >> (64 - NonZeroFillSize * 8);
+    for (uint64_t i = 0, e = NumValues; i != e; ++i) {
+      getStreamer().EmitIntValue(FillExpr, NonZeroFillSize);
+      if (NonZeroFillSize < FillSize)
+        getStreamer().EmitIntValue(0, FillSize - NonZeroFillSize);
+    }
   }
 
   return false;
@@ -3292,7 +3288,7 @@
 
       if (Qualifier == "req")
         Parameter.Required = true;
-      else if (Qualifier == "vararg" && !IsDarwin)
+      else if (Qualifier == "vararg")
         Parameter.Vararg = true;
       else
         return Error(QualLoc, Qualifier + " is not a valid parameter qualifier "
@@ -3313,7 +3309,7 @@
                 "'" + Parameter.Name + "' in macro '" + Name + "'");
     }
 
-    Parameters.push_back(Parameter);
+    Parameters.push_back(std::move(Parameter));
 
     if (getLexer().is(AsmToken::Comma))
       Lex();
@@ -3365,7 +3361,7 @@
   const char *BodyEnd = EndToken.getLoc().getPointer();
   StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
   checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
-  defineMacro(Name, MCAsmMacro(Name, Body, Parameters));
+  defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
   return false;
 }
 
@@ -3471,6 +3467,26 @@
                           "found in body which will have no effect");
 }
 
+/// parseDirectiveExitMacro
+/// ::= .exitm
+bool AsmParser::parseDirectiveExitMacro(StringRef Directive) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '" + Directive + "' directive");
+
+  if (!isInsideMacroInstantiation())
+    return TokError("unexpected '" + Directive + "' in file, "
+                                                 "no current macro definition");
+
+  // Exit all conditionals that are active in the current macro.
+  while (TheCondStack.size() != ActiveMacros.back()->CondStackDepth) {
+    TheCondState = TheCondStack.back();
+    TheCondStack.pop_back();
+  }
+
+  handleMacroExit();
+  return false;
+}
+
 /// parseDirectiveEndMacro
 /// ::= .endm
 /// ::= .endmacro
@@ -4073,6 +4089,32 @@
   return true;
 }
 
+/// parseDirectiveWarning
+///   ::= .warning [string]
+bool AsmParser::parseDirectiveWarning(SMLoc L) {
+  if (!TheCondStack.empty()) {
+    if (TheCondStack.back().Ignore) {
+      eatToEndOfStatement();
+      return false;
+    }
+  }
+
+  StringRef Message = ".warning directive invoked in source file";
+  if (Lexer.isNot(AsmToken::EndOfStatement)) {
+    if (Lexer.isNot(AsmToken::String)) {
+      TokError(".warning argument must be a string");
+      eatToEndOfStatement();
+      return true;
+    }
+
+    Message = getTok().getStringContents();
+    Lex();
+  }
+
+  Warning(L, Message);
+  return false;
+}
+
 /// parseDirectiveEndIf
 /// ::= .endif
 bool AsmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
@@ -4200,11 +4242,13 @@
   DirectiveKindMap[".macros_on"] = DK_MACROS_ON;
   DirectiveKindMap[".macros_off"] = DK_MACROS_OFF;
   DirectiveKindMap[".macro"] = DK_MACRO;
+  DirectiveKindMap[".exitm"] = DK_EXITM;
   DirectiveKindMap[".endm"] = DK_ENDM;
   DirectiveKindMap[".endmacro"] = DK_ENDMACRO;
   DirectiveKindMap[".purgem"] = DK_PURGEM;
   DirectiveKindMap[".err"] = DK_ERR;
   DirectiveKindMap[".error"] = DK_ERROR;
+  DirectiveKindMap[".warning"] = DK_WARNING;
 }
 
 MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
@@ -4246,7 +4290,8 @@
   StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
 
   // We Are Anonymous.
-  MacroLikeBodies.push_back(MCAsmMacro(StringRef(), Body, None));
+  MacroLikeBodies.push_back(
+      MCAsmMacro(StringRef(), Body, MCAsmMacroParameters()));
   return &MacroLikeBodies.back();
 }
 
@@ -4254,17 +4299,17 @@
                                          raw_svector_ostream &OS) {
   OS << ".endr\n";
 
-  MemoryBuffer *Instantiation =
+  std::unique_ptr<MemoryBuffer> Instantiation =
       MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
 
   // Create the macro instantiation object and add to the current macro
   // instantiation stack.
   MacroInstantiation *MI = new MacroInstantiation(
-      M, DirectiveLoc, CurBuffer, getTok().getLoc(), Instantiation);
+      DirectiveLoc, CurBuffer, getTok().getLoc(), TheCondStack.size());
   ActiveMacros.push_back(MI);
 
   // Jump to the macro instantiation and prime the lexer.
-  CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc());
+  CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Instantiation), SMLoc());
   Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
   Lex();
 }
@@ -4490,7 +4535,7 @@
   unsigned OutputIdx = 0;
   while (getLexer().isNot(AsmToken::Eof)) {
     ParseStatementInfo Info(&AsmStrRewrites);
-    if (parseStatement(Info))
+    if (parseStatement(Info, &SI))
       return true;
 
     if (Info.ParseError)
@@ -4510,7 +4555,8 @@
         continue;
 
       // Register operand.
-      if (Operand.isReg() && !Operand.needAddressOf()) {
+      if (Operand.isReg() && !Operand.needAddressOf() &&
+          !getTargetParser().OmitRegisterFromClobberLists(Operand.getReg())) {
         unsigned NumDefs = Desc.getNumDefs();
         // Clobber.
         if (NumDefs && Operand.getMCOperandNum() < NumDefs)
@@ -4615,6 +4661,9 @@
     case AOK_ImmPrefix:
       OS << "$$";
       break;
+    case AOK_Label:
+      OS << Ctx.getAsmInfo()->getPrivateGlobalPrefix() << AR.Label;
+      break;
     case AOK_Input:
       OS << '$' << InputIdx++;
       break;

diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index 5ecf9e5..6f82e6e 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp

@@ -364,6 +364,10 @@
 
     Flags |= COFF::IMAGE_SCN_LNK_COMDAT;
 
+    if (!getLexer().is(AsmToken::Identifier))
+      return TokError("expected comdat type such as 'discard' or 'largest' "
+                      "after protection bits");
+
     if (parseCOMDATType(Type))
       return true;
 

diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index b2a6785..3ea745e 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp

@@ -638,13 +638,13 @@
   // Open the secure log file if we haven't already.
   raw_ostream *OS = getContext().getSecureLog();
   if (!OS) {
-    std::string Err;
-    OS = new raw_fd_ostream(SecureLogFile, Err,
+    std::error_code EC;
+    OS = new raw_fd_ostream(SecureLogFile, EC,
                             sys::fs::F_Append | sys::fs::F_Text);
-    if (!Err.empty()) {
+    if (EC) {
        delete OS;
        return Error(IDLoc, Twine("can't open secure log file: ") +
-                    SecureLogFile + " (" + Err + ")");
+                               SecureLogFile + " (" + EC.message() + ")");
     }
     getContext().setSecureLog(OS);
   }

diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 98b2b3b..e302004 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp

@@ -555,7 +555,7 @@
         std::make_pair(ELFSection, std::make_pair(nullptr, nullptr)));
     if (InsertResult.second) {
       if (getContext().getDwarfVersion() <= 2)
-        Error(loc, "DWARF2 only supports one section per compilation unit");
+        Warning(loc, "DWARF2 only supports one section per compilation unit");
 
       MCSymbol *SectionStartSymbol = getContext().CreateTempSymbol();
       getStreamer().EmitLabel(SectionStartSymbol);

diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp
index 530814b..795cc85 100644
--- a/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/lib/MC/MCParser/MCAsmLexer.cpp

@@ -30,3 +30,7 @@
 SMLoc AsmToken::getEndLoc() const {
   return SMLoc::getFromPointer(Str.data() + Str.size());
 }
+
+SMRange AsmToken::getLocRange() const {
+  return SMRange(getLoc(), getEndLoc());
+}

diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index e417aa9..290dcb2 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp

@@ -29,7 +29,7 @@
   TargetParser->Initialize(*this);
 }
 
-const AsmToken &MCAsmParser::getTok() {
+const AsmToken &MCAsmParser::getTok() const {
   return getLexer().getTok();
 }
 

diff --git a/lib/MC/MCRelocationInfo.cpp b/lib/MC/MCRelocationInfo.cpp
deleted file mode 100644
index a00c009..0000000
--- a/lib/MC/MCRelocationInfo.cpp
+++ /dev/null

@@ -1,39 +0,0 @@
-//==-- lib/MC/MCRelocationInfo.cpp -------------------------------*- C++ -*-==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCRelocationInfo.h"
-#include "llvm-c/Disassembler.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/TargetRegistry.h"
-
-using namespace llvm;
-
-MCRelocationInfo::MCRelocationInfo(MCContext &Ctx)
-  : Ctx(Ctx) {
-}
-
-MCRelocationInfo::~MCRelocationInfo() {
-}
-
-const MCExpr *
-MCRelocationInfo::createExprForRelocation(object::RelocationRef Rel) {
-  return nullptr;
-}
-
-const MCExpr *
-MCRelocationInfo::createExprForCAPIVariantKind(const MCExpr *SubExpr,
-                                               unsigned VariantKind) {
-  if (VariantKind != LLVMDisassembler_VariantKind_None)
-    return nullptr;
-  return SubExpr;
-}
-
-MCRelocationInfo *llvm::createMCRelocationInfo(StringRef TT, MCContext &Ctx) {
-  return new MCRelocationInfo(Ctx);
-}

diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp
index fc2bd36..e95845f0 100644
--- a/lib/MC/MCSectionCOFF.cpp
+++ b/lib/MC/MCSectionCOFF.cpp

@@ -47,18 +47,22 @@
   }
 
   OS << "\t.section\t" << getSectionName() << ",\"";
-  if (getKind().isText())
+  if (getCharacteristics() & COFF::IMAGE_SCN_MEM_EXECUTE)
     OS << 'x';
-  else if (getKind().isBSS())
-    OS << 'b';
-  if (getKind().isWriteable())
+  if (getCharacteristics() & COFF::IMAGE_SCN_MEM_WRITE)
     OS << 'w';
-  else
+  else if (getCharacteristics() & COFF::IMAGE_SCN_MEM_READ)
     OS << 'r';
-  if (getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE)
-    OS << 'n';
+  else
+    OS << 'y';
   if (getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
     OS << 'd';
+  if (getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
+    OS << 'b';
+  if (getCharacteristics() & COFF::IMAGE_SCN_LNK_REMOVE)
+    OS << 'n';
+  if (getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED)
+    OS << 's';
   OS << '"';
 
   if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {

diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp
index 09eb3e7..a29bb97 100644
--- a/lib/MC/MCSectionELF.cpp
+++ b/lib/MC/MCSectionELF.cpp

@@ -19,8 +19,8 @@
 
 MCSectionELF::~MCSectionELF() {} // anchor.
 
-// ShouldOmitSectionDirective - Decides whether a '.section' directive
-// should be printed before the section name
+// Decides whether a '.section' directive
+// should be printed before the section name.
 bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
                                               const MCAsmInfo &MAI) const {
 

diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index bdcdb97..f11ee66 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp

@@ -17,6 +17,7 @@
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCWin64EH.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/raw_ostream.h"
@@ -37,47 +38,26 @@
 void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {}
 
 MCStreamer::MCStreamer(MCContext &Ctx)
-    : Context(Ctx), CurrentW64UnwindInfo(nullptr) {
+    : Context(Ctx), CurrentWinFrameInfo(nullptr) {
   SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>());
 }
 
 MCStreamer::~MCStreamer() {
-  for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i)
-    delete W64UnwindInfos[i];
+  for (unsigned i = 0; i < getNumWinFrameInfos(); ++i)
+    delete WinFrameInfos[i];
 }
 
 void MCStreamer::reset() {
-  for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i)
-    delete W64UnwindInfos[i];
-  W64UnwindInfos.clear();
-  CurrentW64UnwindInfo = nullptr;
+  DwarfFrameInfos.clear();
+  for (unsigned i = 0; i < getNumWinFrameInfos(); ++i)
+    delete WinFrameInfos[i];
+  WinFrameInfos.clear();
+  CurrentWinFrameInfo = nullptr;
+  SymbolOrdering.clear();
   SectionStack.clear();
   SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>());
 }
 
-const MCExpr *MCStreamer::BuildSymbolDiff(MCContext &Context,
-                                          const MCSymbol *A,
-                                          const MCSymbol *B) {
-  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
-  const MCExpr *ARef =
-    MCSymbolRefExpr::Create(A, Variant, Context);
-  const MCExpr *BRef =
-    MCSymbolRefExpr::Create(B, Variant, Context);
-  const MCExpr *AddrDelta =
-    MCBinaryExpr::Create(MCBinaryExpr::Sub, ARef, BRef, Context);
-  return AddrDelta;
-}
-
-const MCExpr *MCStreamer::ForceExpAbs(const MCExpr* Expr) {
-  assert(!isa<MCSymbolRefExpr>(Expr));
-  if (Context.getAsmInfo()->hasAggressiveSymbolFolding())
-    return Expr;
-
-  MCSymbol *ABS = Context.CreateTempSymbol();
-  EmitAssignment(ABS, Expr);
-  return MCSymbolRefExpr::Create(ABS, Context);
-}
-
 raw_ostream &MCStreamer::GetCommentOS() {
   // By default, discard comments.
   return nulls();
@@ -86,28 +66,15 @@
 void MCStreamer::emitRawComment(const Twine &T, bool TabPrefix) {}
 
 void MCStreamer::generateCompactUnwindEncodings(MCAsmBackend *MAB) {
-  for (std::vector<MCDwarfFrameInfo>::iterator I = FrameInfos.begin(),
-         E = FrameInfos.end(); I != E; ++I)
-    I->CompactUnwindEncoding =
-      (MAB ? MAB->generateCompactUnwindEncoding(I->Instructions) : 0);
-}
-
-void MCStreamer::EmitDwarfSetLineAddr(int64_t LineDelta,
-                                      const MCSymbol *Label, int PointerSize) {
-  // emit the sequence to set the address
-  EmitIntValue(dwarf::DW_LNS_extended_op, 1);
-  EmitULEB128IntValue(PointerSize + 1);
-  EmitIntValue(dwarf::DW_LNE_set_address, 1);
-  EmitSymbolValue(Label, PointerSize);
-
-  // emit the sequence for the LineDelta (from 1) and a zero address delta.
-  MCDwarfLineAddr::Emit(this, LineDelta, 0);
+  for (auto &FI : DwarfFrameInfos)
+    FI.CompactUnwindEncoding =
+        (MAB ? MAB->generateCompactUnwindEncoding(FI.Instructions) : 0);
 }
 
 /// EmitIntValue - Special case of EmitValue that avoids the client having to
 /// pass in a MCExpr for constant integers.
 void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
-  assert(Size <= 8 && "Invalid size");
+  assert(1 <= Size && Size <= 8 && "Invalid size");
   assert((isUIntN(8 * Size, Value) || isIntN(8 * Size, Value)) &&
          "Invalid size");
   char buf[8];
@@ -137,19 +104,20 @@
   EmitBytes(OSE.str());
 }
 
-void MCStreamer::EmitAbsValue(const MCExpr *Value, unsigned Size) {
-  const MCExpr *ABS = ForceExpAbs(Value);
-  EmitValue(ABS, Size);
-}
-
-
 void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size,
                            const SMLoc &Loc) {
   EmitValueImpl(Value, Size, Loc);
 }
 
-void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size) {
-  EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size);
+void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
+                                 bool IsSectionRelative) {
+  assert((!IsSectionRelative || Size == 4) &&
+         "SectionRelative value requires 4-bytes");
+
+  if (!IsSectionRelative)
+    EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size);
+  else
+    EmitCOFFSecRel32(Sym);
 }
 
 void MCStreamer::EmitGPRel64Value(const MCExpr *Value) {
@@ -198,14 +166,14 @@
   return Table.getLabel();
 }
 
-MCDwarfFrameInfo *MCStreamer::getCurrentFrameInfo() {
-  if (FrameInfos.empty())
+MCDwarfFrameInfo *MCStreamer::getCurrentDwarfFrameInfo() {
+  if (DwarfFrameInfos.empty())
     return nullptr;
-  return &FrameInfos.back();
+  return &DwarfFrameInfos.back();
 }
 
-void MCStreamer::EnsureValidFrame() {
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+void MCStreamer::EnsureValidDwarfFrame() {
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   if (!CurFrame || CurFrame->End)
     report_fatal_error("No open frame");
 }
@@ -214,7 +182,7 @@
                                      MCSymbol *EHSymbol) {
 }
 
-void MCStreamer::InitSections() {
+void MCStreamer::InitSections(bool NoExecStack) {
   SwitchSection(getContext().getObjectFileInfo()->getTextSection());
 }
 
@@ -239,18 +207,12 @@
     TS->emitLabel(Symbol);
 }
 
-void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) {
-  EnsureValidFrame();
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
-  CurFrame->CompactUnwindEncoding = CompactUnwindEncoding;
-}
-
 void MCStreamer::EmitCFISections(bool EH, bool Debug) {
   assert(EH || Debug);
 }
 
 void MCStreamer::EmitCFIStartProc(bool IsSimple) {
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   if (CurFrame && !CurFrame->End)
     report_fatal_error("Starting a frame before finishing the previous one!");
 
@@ -258,15 +220,25 @@
   Frame.IsSimple = IsSimple;
   EmitCFIStartProcImpl(Frame);
 
-  FrameInfos.push_back(Frame);
+  const MCAsmInfo* MAI = Context.getAsmInfo();
+  if (MAI) {
+    for (const MCCFIInstruction& Inst : MAI->getInitialFrameState()) {
+      if (Inst.getOperation() == MCCFIInstruction::OpDefCfa ||
+          Inst.getOperation() == MCCFIInstruction::OpDefCfaRegister) {
+        Frame.CurrentCfaRegister = Inst.getRegister();
+      }
+    }
+  }
+
+  DwarfFrameInfos.push_back(Frame);
 }
 
 void MCStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
 }
 
 void MCStreamer::EmitCFIEndProc() {
-  EnsureValidFrame();
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  EnsureValidDwarfFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   EmitCFIEndProcImpl(*CurFrame);
 }
 
@@ -277,7 +249,7 @@
 }
 
 MCSymbol *MCStreamer::EmitCFICommon() {
-  EnsureValidFrame();
+  EnsureValidDwarfFrame();
   MCSymbol *Label = getContext().CreateTempSymbol();
   EmitLabel(Label);
   return Label;
@@ -287,15 +259,16 @@
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createDefCfa(Label, Register, Offset);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
+  CurFrame->CurrentCfaRegister = static_cast<unsigned>(Register);
 }
 
 void MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createDefCfaOffset(Label, Offset);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
@@ -303,7 +276,7 @@
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createAdjustCfaOffset(Label, Adjustment);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
@@ -311,15 +284,16 @@
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createDefCfaRegister(Label, Register);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
+  CurFrame->CurrentCfaRegister = static_cast<unsigned>(Register);
 }
 
 void MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createOffset(Label, Register, Offset);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
@@ -327,21 +301,21 @@
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createRelOffset(Label, Register, Offset);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
 void MCStreamer::EmitCFIPersonality(const MCSymbol *Sym,
                                     unsigned Encoding) {
-  EnsureValidFrame();
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  EnsureValidDwarfFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Personality = Sym;
   CurFrame->PersonalityEncoding = Encoding;
 }
 
 void MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
-  EnsureValidFrame();
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  EnsureValidDwarfFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Lsda = Sym;
   CurFrame->LsdaEncoding = Encoding;
 }
@@ -349,7 +323,7 @@
 void MCStreamer::EmitCFIRememberState() {
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction = MCCFIInstruction::createRememberState(Label);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
@@ -357,7 +331,7 @@
   // FIXME: Error if there is no matching cfi_remember_state.
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction = MCCFIInstruction::createRestoreState(Label);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
@@ -365,7 +339,7 @@
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createSameValue(Label, Register);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
@@ -373,20 +347,20 @@
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createRestore(Label, Register);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
 void MCStreamer::EmitCFIEscape(StringRef Values) {
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction = MCCFIInstruction::createEscape(Label, Values);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
 void MCStreamer::EmitCFISignalFrame() {
-  EnsureValidFrame();
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  EnsureValidDwarfFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->IsSignalFrame = true;
 }
 
@@ -394,7 +368,7 @@
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createUndefined(Label, Register);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
@@ -402,7 +376,7 @@
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createRegister(Label, Register1, Register2);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
@@ -410,164 +384,167 @@
   MCSymbol *Label = EmitCFICommon();
   MCCFIInstruction Instruction =
     MCCFIInstruction::createWindowSave(Label);
-  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
   CurFrame->Instructions.push_back(Instruction);
 }
 
-void MCStreamer::setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame) {
-  W64UnwindInfos.push_back(Frame);
-  CurrentW64UnwindInfo = W64UnwindInfos.back();
-}
-
-void MCStreamer::EnsureValidW64UnwindInfo() {
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  if (!CurFrame || CurFrame->End)
+void MCStreamer::EnsureValidWinFrameInfo() {
+  if (!CurrentWinFrameInfo || CurrentWinFrameInfo->End)
     report_fatal_error("No open Win64 EH frame function!");
 }
 
 void MCStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol) {
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  if (CurFrame && !CurFrame->End)
+  if (CurrentWinFrameInfo && !CurrentWinFrameInfo->End)
     report_fatal_error("Starting a function before ending the previous one!");
-  MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo;
-  Frame->Begin = getContext().CreateTempSymbol();
-  Frame->Function = Symbol;
-  EmitLabel(Frame->Begin);
-  setCurrentW64UnwindInfo(Frame);
+
+  MCSymbol *StartProc = getContext().CreateTempSymbol();
+  EmitLabel(StartProc);
+
+  WinFrameInfos.push_back(new WinEH::FrameInfo(Symbol, StartProc));
+  CurrentWinFrameInfo = WinFrameInfos.back();
 }
 
 void MCStreamer::EmitWinCFIEndProc() {
-  EnsureValidW64UnwindInfo();
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  if (CurFrame->ChainedParent)
+  EnsureValidWinFrameInfo();
+  if (CurrentWinFrameInfo->ChainedParent)
     report_fatal_error("Not all chained regions terminated!");
-  CurFrame->End = getContext().CreateTempSymbol();
-  EmitLabel(CurFrame->End);
+
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  EmitLabel(Label);
+  CurrentWinFrameInfo->End = Label;
 }
 
 void MCStreamer::EmitWinCFIStartChained() {
-  EnsureValidW64UnwindInfo();
-  MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo;
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  Frame->Begin = getContext().CreateTempSymbol();
-  Frame->Function = CurFrame->Function;
-  Frame->ChainedParent = CurFrame;
-  EmitLabel(Frame->Begin);
-  setCurrentW64UnwindInfo(Frame);
+  EnsureValidWinFrameInfo();
+
+  MCSymbol *StartProc = getContext().CreateTempSymbol();
+  EmitLabel(StartProc);
+
+  WinFrameInfos.push_back(new WinEH::FrameInfo(CurrentWinFrameInfo->Function,
+                                               StartProc, CurrentWinFrameInfo));
+  CurrentWinFrameInfo = WinFrameInfos.back();
 }
 
 void MCStreamer::EmitWinCFIEndChained() {
-  EnsureValidW64UnwindInfo();
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  if (!CurFrame->ChainedParent)
+  EnsureValidWinFrameInfo();
+  if (!CurrentWinFrameInfo->ChainedParent)
     report_fatal_error("End of a chained region outside a chained region!");
-  CurFrame->End = getContext().CreateTempSymbol();
-  EmitLabel(CurFrame->End);
-  CurrentW64UnwindInfo = CurFrame->ChainedParent;
+
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  EmitLabel(Label);
+
+  CurrentWinFrameInfo->End = Label;
+  CurrentWinFrameInfo =
+      const_cast<WinEH::FrameInfo *>(CurrentWinFrameInfo->ChainedParent);
 }
 
 void MCStreamer::EmitWinEHHandler(const MCSymbol *Sym, bool Unwind,
                                   bool Except) {
-  EnsureValidW64UnwindInfo();
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  if (CurFrame->ChainedParent)
+  EnsureValidWinFrameInfo();
+  if (CurrentWinFrameInfo->ChainedParent)
     report_fatal_error("Chained unwind areas can't have handlers!");
-  CurFrame->ExceptionHandler = Sym;
+  CurrentWinFrameInfo->ExceptionHandler = Sym;
   if (!Except && !Unwind)
     report_fatal_error("Don't know what kind of handler this is!");
   if (Unwind)
-    CurFrame->HandlesUnwind = true;
+    CurrentWinFrameInfo->HandlesUnwind = true;
   if (Except)
-    CurFrame->HandlesExceptions = true;
+    CurrentWinFrameInfo->HandlesExceptions = true;
 }
 
 void MCStreamer::EmitWinEHHandlerData() {
-  EnsureValidW64UnwindInfo();
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  if (CurFrame->ChainedParent)
+  EnsureValidWinFrameInfo();
+  if (CurrentWinFrameInfo->ChainedParent)
     report_fatal_error("Chained unwind areas can't have handlers!");
 }
 
 void MCStreamer::EmitWinCFIPushReg(unsigned Register) {
-  EnsureValidW64UnwindInfo();
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  EnsureValidWinFrameInfo();
+
   MCSymbol *Label = getContext().CreateTempSymbol();
-  MCWin64EHInstruction Inst(Win64EH::UOP_PushNonVol, Label, Register);
   EmitLabel(Label);
-  CurFrame->Instructions.push_back(Inst);
+
+  WinEH::Instruction Inst = Win64EH::Instruction::PushNonVol(Label, Register);
+  CurrentWinFrameInfo->Instructions.push_back(Inst);
 }
 
 void MCStreamer::EmitWinCFISetFrame(unsigned Register, unsigned Offset) {
-  EnsureValidW64UnwindInfo();
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  if (CurFrame->LastFrameInst >= 0)
+  EnsureValidWinFrameInfo();
+  if (CurrentWinFrameInfo->LastFrameInst >= 0)
     report_fatal_error("Frame register and offset already specified!");
   if (Offset & 0x0F)
     report_fatal_error("Misaligned frame pointer offset!");
   if (Offset > 240)
     report_fatal_error("Frame offset must be less than or equal to 240!");
+
   MCSymbol *Label = getContext().CreateTempSymbol();
-  MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, Label, Register, Offset);
   EmitLabel(Label);
-  CurFrame->LastFrameInst = CurFrame->Instructions.size();
-  CurFrame->Instructions.push_back(Inst);
+
+  WinEH::Instruction Inst =
+      Win64EH::Instruction::SetFPReg(Label, Register, Offset);
+  CurrentWinFrameInfo->LastFrameInst = CurrentWinFrameInfo->Instructions.size();
+  CurrentWinFrameInfo->Instructions.push_back(Inst);
 }
 
 void MCStreamer::EmitWinCFIAllocStack(unsigned Size) {
-  EnsureValidW64UnwindInfo();
+  EnsureValidWinFrameInfo();
   if (Size == 0)
     report_fatal_error("Allocation size must be non-zero!");
   if (Size & 7)
     report_fatal_error("Misaligned stack allocation!");
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+
   MCSymbol *Label = getContext().CreateTempSymbol();
-  MCWin64EHInstruction Inst(Label, Size);
   EmitLabel(Label);
-  CurFrame->Instructions.push_back(Inst);
+
+  WinEH::Instruction Inst = Win64EH::Instruction::Alloc(Label, Size);
+  CurrentWinFrameInfo->Instructions.push_back(Inst);
 }
 
 void MCStreamer::EmitWinCFISaveReg(unsigned Register, unsigned Offset) {
-  EnsureValidW64UnwindInfo();
+  EnsureValidWinFrameInfo();
   if (Offset & 7)
     report_fatal_error("Misaligned saved register offset!");
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+
   MCSymbol *Label = getContext().CreateTempSymbol();
-  MCWin64EHInstruction Inst(
-     Offset > 512*1024-8 ? Win64EH::UOP_SaveNonVolBig : Win64EH::UOP_SaveNonVol,
-                            Label, Register, Offset);
   EmitLabel(Label);
-  CurFrame->Instructions.push_back(Inst);
+
+  WinEH::Instruction Inst =
+      Win64EH::Instruction::SaveNonVol(Label, Register, Offset);
+  CurrentWinFrameInfo->Instructions.push_back(Inst);
 }
 
 void MCStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset) {
-  EnsureValidW64UnwindInfo();
+  EnsureValidWinFrameInfo();
   if (Offset & 0x0F)
     report_fatal_error("Misaligned saved vector register offset!");
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+
   MCSymbol *Label = getContext().CreateTempSymbol();
-  MCWin64EHInstruction Inst(
-    Offset > 512*1024-16 ? Win64EH::UOP_SaveXMM128Big : Win64EH::UOP_SaveXMM128,
-                            Label, Register, Offset);
   EmitLabel(Label);
-  CurFrame->Instructions.push_back(Inst);
+
+  WinEH::Instruction Inst =
+      Win64EH::Instruction::SaveXMM(Label, Register, Offset);
+  CurrentWinFrameInfo->Instructions.push_back(Inst);
 }
 
 void MCStreamer::EmitWinCFIPushFrame(bool Code) {
-  EnsureValidW64UnwindInfo();
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  if (CurFrame->Instructions.size() > 0)
+  EnsureValidWinFrameInfo();
+  if (CurrentWinFrameInfo->Instructions.size() > 0)
     report_fatal_error("If present, PushMachFrame must be the first UOP");
+
   MCSymbol *Label = getContext().CreateTempSymbol();
-  MCWin64EHInstruction Inst(Win64EH::UOP_PushMachFrame, Label, Code);
   EmitLabel(Label);
-  CurFrame->Instructions.push_back(Inst);
+
+  WinEH::Instruction Inst = Win64EH::Instruction::PushMachFrame(Label, Code);
+  CurrentWinFrameInfo->Instructions.push_back(Inst);
 }
 
 void MCStreamer::EmitWinCFIEndProlog() {
-  EnsureValidW64UnwindInfo();
-  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
-  CurFrame->PrologEnd = getContext().CreateTempSymbol();
-  EmitLabel(CurFrame->PrologEnd);
+  EnsureValidWinFrameInfo();
+
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  EmitLabel(Label);
+
+  CurrentWinFrameInfo->PrologEnd = Label;
 }
 
 void MCStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) {
@@ -590,15 +567,11 @@
   EmitRawTextImpl(T.toStringRef(Str));
 }
 
-void MCStreamer::EmitW64Tables() {
-  if (!getNumW64UnwindInfos())
-    return;
-
-  MCWin64EHUnwindEmitter::Emit(*this);
+void MCStreamer::EmitWindowsUnwindTables() {
 }
 
 void MCStreamer::Finish() {
-  if (!FrameInfos.empty() && !FrameInfos.back().End)
+  if (!DwarfFrameInfos.empty() && !DwarfFrameInfos.back().End)
     report_fatal_error("Unfinished frame!");
 
   MCTargetStreamer *TS = getTargetStreamer();

diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index 4424c91..b8e42bd 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp

@@ -17,8 +17,6 @@
 
 using namespace llvm;
 
-MCSchedModel MCSchedModel::DefaultSchedModel; // For unknown processors.
-
 /// InitMCProcessorInfo - Set or change the CPU (optionally supplemented
 /// with feature string). Recompute feature bits and scheduling model.
 void
@@ -33,7 +31,7 @@
   if (!CPU.empty())
     CPUSchedModel = getSchedModelForCPU(CPU);
   else
-    CPUSchedModel = &MCSchedModel::DefaultSchedModel;
+    CPUSchedModel = MCSchedModel::GetDefaultSchedModel();
 }
 
 void
@@ -78,7 +76,7 @@
 }
 
 
-const MCSchedModel *
+MCSchedModel
 MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
   assert(ProcSchedModels && "Processor machine model not available!");
 
@@ -97,15 +95,15 @@
     errs() << "'" << CPU
            << "' is not a recognized processor for this target"
            << " (ignoring processor)\n";
-    return &MCSchedModel::DefaultSchedModel;
+    return MCSchedModel::GetDefaultSchedModel();
   }
   assert(Found->Value && "Missing processor SchedModel value");
-  return (const MCSchedModel *)Found->Value;
+  return *(const MCSchedModel *)Found->Value;
 }
 
 InstrItineraryData
 MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
-  const MCSchedModel *SchedModel = getSchedModelForCPU(CPU);
+  const MCSchedModel SchedModel = getSchedModelForCPU(CPU);
   return InstrItineraryData(SchedModel, Stages, OperandCycles, ForwardingPaths);
 }
 

diff --git a/lib/MC/MCTargetOptions.cpp b/lib/MC/MCTargetOptions.cpp
index efd724a..3093ba2 100644
--- a/lib/MC/MCTargetOptions.cpp
+++ b/lib/MC/MCTargetOptions.cpp

@@ -13,8 +13,8 @@
 
 MCTargetOptions::MCTargetOptions()
     : SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false),
-      MCSaveTempLabels(false), MCUseDwarfDirectory(false),
-      ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false),
-      DwarfVersion(0) {}
+      MCFatalWarnings(false), MCSaveTempLabels(false),
+      MCUseDwarfDirectory(false), ShowMCEncoding(false), ShowMCInst(false),
+      AsmVerbose(false), DwarfVersion(0) {}
 
 } // end namespace llvm

diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index bb651647..dfadb3c 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp

@@ -15,15 +15,16 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Win64EH.h"
 
 namespace llvm {
 
 // NOTE: All relocations generated here are 4-byte image-relative.
 
-static uint8_t CountOfUnwindCodes(std::vector<MCWin64EHInstruction> &Insns) {
+static uint8_t CountOfUnwindCodes(std::vector<WinEH::Instruction> &Insns) {
   uint8_t Count = 0;
   for (const auto &I : Insns) {
-    switch (I.getOperation()) {
+    switch (static_cast<Win64EH::UnwindOpcodes>(I.Operation)) {
     case Win64EH::UOP_PushNonVol:
     case Win64EH::UOP_AllocSmall:
     case Win64EH::UOP_SetFPReg:
@@ -39,86 +40,83 @@
       Count += 3;
       break;
     case Win64EH::UOP_AllocLarge:
-      Count += (I.getSize() > 512 * 1024 - 8) ? 3 : 2;
+      Count += (I.Offset > 512 * 1024 - 8) ? 3 : 2;
       break;
     }
   }
   return Count;
 }
 
-static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs,
-                              MCSymbol *rhs) {
-  MCContext &context = streamer.getContext();
-  const MCExpr *diff = MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(
-                                                                  lhs, context),
-                                               MCSymbolRefExpr::Create(
-                                                                  rhs, context),
-                                               context);
-  streamer.EmitAbsValue(diff, 1);
-
+static void EmitAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS,
+                              const MCSymbol *RHS) {
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Diff =
+      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(LHS, Context),
+                              MCSymbolRefExpr::Create(RHS, Context), Context);
+  Streamer.EmitValue(Diff, 1);
 }
 
-static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
-                           MCWin64EHInstruction &inst) {
+static void EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin,
+                           WinEH::Instruction &inst) {
   uint8_t b2;
   uint16_t w;
-  b2 = (inst.getOperation() & 0x0F);
-  switch (inst.getOperation()) {
+  b2 = (inst.Operation & 0x0F);
+  switch (static_cast<Win64EH::UnwindOpcodes>(inst.Operation)) {
   case Win64EH::UOP_PushNonVol:
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
-    b2 |= (inst.getRegister() & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.Label, begin);
+    b2 |= (inst.Register & 0x0F) << 4;
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_AllocLarge:
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
-    if (inst.getSize() > 512*1024-8) {
+    EmitAbsDifference(streamer, inst.Label, begin);
+    if (inst.Offset > 512 * 1024 - 8) {
       b2 |= 0x10;
       streamer.EmitIntValue(b2, 1);
-      w = inst.getSize() & 0xFFF8;
+      w = inst.Offset & 0xFFF8;
       streamer.EmitIntValue(w, 2);
-      w = inst.getSize() >> 16;
+      w = inst.Offset >> 16;
     } else {
       streamer.EmitIntValue(b2, 1);
-      w = inst.getSize() >> 3;
+      w = inst.Offset >> 3;
     }
     streamer.EmitIntValue(w, 2);
     break;
   case Win64EH::UOP_AllocSmall:
-    b2 |= (((inst.getSize()-8) >> 3) & 0x0F) << 4;
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    b2 |= (((inst.Offset - 8) >> 3) & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_SetFPReg:
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_SaveNonVol:
   case Win64EH::UOP_SaveXMM128:
-    b2 |= (inst.getRegister() & 0x0F) << 4;
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    b2 |= (inst.Register & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
-    w = inst.getOffset() >> 3;
-    if (inst.getOperation() == Win64EH::UOP_SaveXMM128)
+    w = inst.Offset >> 3;
+    if (inst.Operation == Win64EH::UOP_SaveXMM128)
       w >>= 1;
     streamer.EmitIntValue(w, 2);
     break;
   case Win64EH::UOP_SaveNonVolBig:
   case Win64EH::UOP_SaveXMM128Big:
-    b2 |= (inst.getRegister() & 0x0F) << 4;
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    b2 |= (inst.Register & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
-    if (inst.getOperation() == Win64EH::UOP_SaveXMM128Big)
-      w = inst.getOffset() & 0xFFF0;
+    if (inst.Operation == Win64EH::UOP_SaveXMM128Big)
+      w = inst.Offset & 0xFFF0;
     else
-      w = inst.getOffset() & 0xFFF8;
+      w = inst.Offset & 0xFFF8;
     streamer.EmitIntValue(w, 2);
-    w = inst.getOffset() >> 16;
+    w = inst.Offset >> 16;
     streamer.EmitIntValue(w, 2);
     break;
   case Win64EH::UOP_PushMachFrame:
-    if (inst.isPushCodeFrame())
+    if (inst.Offset == 1)
       b2 |= 0x10;
-    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    EmitAbsDifference(streamer, inst.Label, begin);
     streamer.EmitIntValue(b2, 1);
     break;
   }
@@ -138,7 +136,7 @@
 }
 
 static void EmitRuntimeFunction(MCStreamer &streamer,
-                                const MCWin64EHUnwindInfo *info) {
+                                const WinEH::FrameInfo *info) {
   MCContext &context = streamer.getContext();
 
   streamer.EmitValueToAlignment(4);
@@ -149,14 +147,17 @@
                                              context), 4);
 }
 
-static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
+static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
   // If this UNWIND_INFO already has a symbol, it's already been emitted.
-  if (info->Symbol) return;
+  if (info->Symbol)
+    return;
 
   MCContext &context = streamer.getContext();
+  MCSymbol *Label = context.CreateTempSymbol();
+
   streamer.EmitValueToAlignment(4);
-  info->Symbol = context.CreateTempSymbol();
-  streamer.EmitLabel(info->Symbol);
+  streamer.EmitLabel(Label);
+  info->Symbol = Label;
 
   // Upper 3 bits are the version number (currently 1).
   uint8_t flags = 0x01;
@@ -180,17 +181,16 @@
 
   uint8_t frame = 0;
   if (info->LastFrameInst >= 0) {
-    MCWin64EHInstruction &frameInst = info->Instructions[info->LastFrameInst];
-    assert(frameInst.getOperation() == Win64EH::UOP_SetFPReg);
-    frame = (frameInst.getRegister() & 0x0F) |
-            (frameInst.getOffset() & 0xF0);
+    WinEH::Instruction &frameInst = info->Instructions[info->LastFrameInst];
+    assert(frameInst.Operation == Win64EH::UOP_SetFPReg);
+    frame = (frameInst.Register & 0x0F) | (frameInst.Offset & 0xF0);
   }
   streamer.EmitIntValue(frame, 1);
 
   // Emit unwind instructions (in reverse order).
   uint8_t numInst = info->Instructions.size();
   for (uint8_t c = 0; c < numInst; ++c) {
-    MCWin64EHInstruction inst = info->Instructions.back();
+    WinEH::Instruction inst = info->Instructions.back();
     info->Instructions.pop_back();
     EmitUnwindCode(streamer, info->Begin, inst);
   }
@@ -218,77 +218,38 @@
   }
 }
 
-StringRef MCWin64EHUnwindEmitter::GetSectionSuffix(const MCSymbol *func) {
-  if (!func || !func->isInSection()) return "";
-  const MCSection *section = &func->getSection();
-  const MCSectionCOFF *COFFSection;
-  if ((COFFSection = dyn_cast<MCSectionCOFF>(section))) {
-    StringRef name = COFFSection->getSectionName();
-    size_t dollar = name.find('$');
-    size_t dot = name.find('.', 1);
-    if (dollar == StringRef::npos && dot == StringRef::npos)
-      return "";
-    if (dot == StringRef::npos)
-      return name.substr(dollar);
-    if (dollar == StringRef::npos || dot < dollar)
-      return name.substr(dot);
-    return name.substr(dollar);
-  }
-  return "";
-}
-
-static const MCSection *getWin64EHTableSection(StringRef suffix,
-                                               MCContext &context) {
-  if (suffix == "")
-    return context.getObjectFileInfo()->getXDataSection();
-
-  return context.getCOFFSection((".xdata"+suffix).str(),
-                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
-                                COFF::IMAGE_SCN_MEM_READ,
-                                SectionKind::getDataRel());
-}
-
-static const MCSection *getWin64EHFuncTableSection(StringRef suffix,
-                                                   MCContext &context) {
-  if (suffix == "")
-    return context.getObjectFileInfo()->getPDataSection();
-  return context.getCOFFSection((".pdata"+suffix).str(),
-                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
-                                COFF::IMAGE_SCN_MEM_READ,
-                                SectionKind::getDataRel());
-}
-
-void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer,
-                                            MCWin64EHUnwindInfo *info) {
-  // Switch sections (the static function above is meant to be called from
-  // here and from Emit().
-  MCContext &context = streamer.getContext();
-  const MCSection *xdataSect =
-    getWin64EHTableSection(GetSectionSuffix(info->Function), context);
-  streamer.SwitchSection(xdataSect);
-
-  llvm::EmitUnwindInfo(streamer, info);
-}
-
-void MCWin64EHUnwindEmitter::Emit(MCStreamer &Streamer) {
+namespace Win64EH {
+void UnwindEmitter::Emit(MCStreamer &Streamer) const {
   MCContext &Context = Streamer.getContext();
 
   // Emit the unwind info structs first.
-  for (const auto &CFI : Streamer.getW64UnwindInfos()) {
+  for (const auto &CFI : Streamer.getWinFrameInfos()) {
     const MCSection *XData =
-        getWin64EHTableSection(GetSectionSuffix(CFI->Function), Context);
+        getXDataSection(CFI->Function, Context);
     Streamer.SwitchSection(XData);
     EmitUnwindInfo(Streamer, CFI);
   }
 
   // Now emit RUNTIME_FUNCTION entries.
-  for (const auto &CFI : Streamer.getW64UnwindInfos()) {
+  for (const auto &CFI : Streamer.getWinFrameInfos()) {
     const MCSection *PData =
-        getWin64EHFuncTableSection(GetSectionSuffix(CFI->Function), Context);
+        getPDataSection(CFI->Function, Context);
     Streamer.SwitchSection(PData);
     EmitRuntimeFunction(Streamer, CFI);
   }
 }
 
+void UnwindEmitter::EmitUnwindInfo(MCStreamer &Streamer,
+                                   WinEH::FrameInfo *info) const {
+  // Switch sections (the static function above is meant to be called from
+  // here and from Emit()).
+  MCContext &context = Streamer.getContext();
+  const MCSection *xdataSect =
+    getXDataSection(info->Function, context);
+  Streamer.SwitchSection(xdataSect);
+
+  llvm::EmitUnwindInfo(Streamer, info);
+}
+}
 } // End of namespace llvm
 

diff --git a/lib/MC/MCWinEH.cpp b/lib/MC/MCWinEH.cpp
new file mode 100644
index 0000000..f0c354f
--- /dev/null
+++ b/lib/MC/MCWinEH.cpp

@@ -0,0 +1,84 @@
+//===- lib/MC/MCWinEH.cpp - Windows EH implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCWinEH.h"
+#include "llvm/Support/COFF.h"
+
+namespace llvm {
+namespace WinEH {
+static StringRef getSectionSuffix(const MCSymbol *Function) {
+  if (!Function || !Function->isInSection())
+    return "";
+
+  const MCSection *FunctionSection = &Function->getSection();
+  if (const auto Section = dyn_cast<MCSectionCOFF>(FunctionSection)) {
+    StringRef Name = Section->getSectionName();
+    size_t Dollar = Name.find('$');
+    size_t Dot = Name.find('.', 1);
+
+    if (Dollar == StringRef::npos && Dot == StringRef::npos)
+      return "";
+    if (Dot == StringRef::npos)
+      return Name.substr(Dollar);
+    if (Dollar == StringRef::npos || Dot < Dollar)
+      return Name.substr(Dot);
+
+    return Name.substr(Dollar);
+  }
+
+  return "";
+}
+
+static const MCSection *getUnwindInfoSection(
+    StringRef SecName, const MCSectionCOFF *UnwindSec, const MCSymbol *Function,
+    MCContext &Context) {
+  // If Function is in a COMDAT, get or create an unwind info section in that
+  // COMDAT group.
+  if (Function && Function->isInSection()) {
+    const MCSectionCOFF *FunctionSection =
+        cast<MCSectionCOFF>(&Function->getSection());
+    if (FunctionSection->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
+      return Context.getAssociativeCOFFSection(
+          UnwindSec, FunctionSection->getCOMDATSymbol());
+    }
+  }
+
+  // If Function is in a section other than .text, create a new SecName section
+  // with the matching suffix. Otherwise use the plain UnwindSec section.
+  StringRef Suffix = getSectionSuffix(Function);
+  if (Suffix.empty())
+    return UnwindSec;
+  return Context.getCOFFSection((SecName + Suffix).str(),
+                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_MEM_READ,
+                                SectionKind::getDataRel());
+}
+
+const MCSection *UnwindEmitter::getPDataSection(const MCSymbol *Function,
+                                                MCContext &Context) {
+  const MCSectionCOFF *PData =
+      cast<MCSectionCOFF>(Context.getObjectFileInfo()->getPDataSection());
+  return getUnwindInfoSection(".pdata", PData, Function, Context);
+}
+
+const MCSection *UnwindEmitter::getXDataSection(const MCSymbol *Function,
+                                                MCContext &Context) {
+  const MCSectionCOFF *XData =
+      cast<MCSectionCOFF>(Context.getObjectFileInfo()->getXDataSection());
+  return getUnwindInfoSection(".xdata", XData, Function, Context);
+}
+
+}
+}
+

diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index 5214398..577c4b7 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp

@@ -41,7 +41,7 @@
 bool MachObjectWriter::
 doesSymbolRequireExternRelocation(const MCSymbolData *SD) {
   // Undefined symbols are always extern.
-  if (SD->Symbol->isUndefined())
+  if (SD->getSymbol().isUndefined())
     return true;
 
   // References to weak definitions require external relocation entries; the
@@ -84,7 +84,7 @@
 
 
     MCValue Target;
-    if (!S.getVariableValue()->EvaluateAsRelocatable(Target, &Layout))
+    if (!S.getVariableValue()->EvaluateAsRelocatable(Target, &Layout, nullptr))
       report_fatal_error("unable to evaluate offset for variable '" +
                          S.getName() + "'");
 
@@ -525,15 +525,10 @@
 }
 
 /// ComputeSymbolTable - Compute the symbol table data
-///
-/// \param StringTable [out] - The string table data.
-/// \param StringIndexMap [out] - Map from symbol names to offsets in the
-/// string table.
-void MachObjectWriter::
-ComputeSymbolTable(MCAssembler &Asm, SmallString<256> &StringTable,
-                   std::vector<MachSymbolData> &LocalSymbolData,
-                   std::vector<MachSymbolData> &ExternalSymbolData,
-                   std::vector<MachSymbolData> &UndefinedSymbolData) {
+void MachObjectWriter::ComputeSymbolTable(
+    MCAssembler &Asm, std::vector<MachSymbolData> &LocalSymbolData,
+    std::vector<MachSymbolData> &ExternalSymbolData,
+    std::vector<MachSymbolData> &UndefinedSymbolData) {
   // Build section lookup table.
   DenseMap<const MCSection*, uint8_t> SectionIndexMap;
   unsigned Index = 1;
@@ -542,37 +537,34 @@
     SectionIndexMap[&it->getSection()] = Index;
   assert(Index <= 256 && "Too many sections!");
 
-  // Index 0 is always the empty string.
-  StringMap<uint64_t> StringIndexMap;
-  StringTable += '\x00';
+  // Build the string table.
+  for (MCSymbolData &SD : Asm.symbols()) {
+    const MCSymbol &Symbol = SD.getSymbol();
+    if (!Asm.isSymbolLinkerVisible(Symbol))
+      continue;
 
-  // Build the symbol arrays and the string table, but only for non-local
-  // symbols.
+    StringTable.add(Symbol.getName());
+  }
+  StringTable.finalize(StringTableBuilder::MachO);
+
+  // Build the symbol arrays but only for non-local symbols.
   //
-  // The particular order that we collect the symbols and create the string
-  // table, then sort the symbols is chosen to match 'as'. Even though it
-  // doesn't matter for correctness, this is important for letting us diff .o
-  // files.
+  // The particular order that we collect and then sort the symbols is chosen to
+  // match 'as'. Even though it doesn't matter for correctness, this is
+  // important for letting us diff .o files.
   for (MCSymbolData &SD : Asm.symbols()) {
     const MCSymbol &Symbol = SD.getSymbol();
 
     // Ignore non-linker visible symbols.
-    if (!Asm.isSymbolLinkerVisible(SD.getSymbol()))
+    if (!Asm.isSymbolLinkerVisible(Symbol))
       continue;
 
     if (!SD.isExternal() && !Symbol.isUndefined())
       continue;
 
-    uint64_t &Entry = StringIndexMap[Symbol.getName()];
-    if (!Entry) {
-      Entry = StringTable.size();
-      StringTable += Symbol.getName();
-      StringTable += '\x00';
-    }
-
     MachSymbolData MSD;
     MSD.SymbolData = &SD;
-    MSD.StringIndex = Entry;
+    MSD.StringIndex = StringTable.getOffset(Symbol.getName());
 
     if (Symbol.isUndefined()) {
       MSD.SectionIndex = 0;
@@ -592,22 +584,15 @@
     const MCSymbol &Symbol = SD.getSymbol();
 
     // Ignore non-linker visible symbols.
-    if (!Asm.isSymbolLinkerVisible(SD.getSymbol()))
+    if (!Asm.isSymbolLinkerVisible(Symbol))
       continue;
 
     if (SD.isExternal() || Symbol.isUndefined())
       continue;
 
-    uint64_t &Entry = StringIndexMap[Symbol.getName()];
-    if (!Entry) {
-      Entry = StringTable.size();
-      StringTable += Symbol.getName();
-      StringTable += '\x00';
-    }
-
     MachSymbolData MSD;
     MSD.SymbolData = &SD;
-    MSD.StringIndex = Entry;
+    MSD.StringIndex = StringTable.getOffset(Symbol.getName());
 
     if (Symbol.isAbsolute()) {
       MSD.SectionIndex = 0;
@@ -631,10 +616,6 @@
     ExternalSymbolData[i].SymbolData->setIndex(Index++);
   for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i)
     UndefinedSymbolData[i].SymbolData->setIndex(Index++);
-
-  // The string table is padded to a multiple of 4.
-  while (StringTable.size() % 4)
-    StringTable += '\x00';
 }
 
 void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm,
@@ -664,7 +645,7 @@
     // and neither symbol is external, mark the variable as absolute.
     const MCExpr *Expr = SD.getSymbol().getVariableValue();
     MCValue Value;
-    if (Expr->EvaluateAsRelocatable(Value, &Layout)) {
+    if (Expr->EvaluateAsRelocatable(Value, &Layout, nullptr)) {
       if (Value.getSymA() && Value.getSymB())
         const_cast<MCSymbol*>(&SD.getSymbol())->setAbsolute();
     }
@@ -683,7 +664,7 @@
   markAbsoluteVariableSymbols(Asm, Layout);
 
   // Compute symbol table information and bind symbol indices.
-  ComputeSymbolTable(Asm, StringTable, LocalSymbolData, ExternalSymbolData,
+  ComputeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
                      UndefinedSymbolData);
 }
 
@@ -745,6 +726,10 @@
       return false;
   }
 
+  // If they are not in the same section, we can't compute the diff.
+  if (&SecA != &SecB)
+    return false;
+
   const MCFragment *FA = Asm.getSymbolData(SA).getFragment();
 
   // Bail if the symbol has no fragment.
@@ -752,12 +737,7 @@
     return false;
 
   A_Base = FA->getAtom();
-  if (!A_Base)
-    return false;
-
   B_Base = FB.getAtom();
-  if (!B_Base)
-    return false;
 
   // If the atoms are the same, they are guaranteed to have the same address.
   if (A_Base == B_Base)
@@ -922,7 +902,7 @@
                                               sizeof(MachO::nlist_64) :
                                               sizeof(MachO::nlist));
     WriteSymtabLoadCommand(SymbolTableOffset, NumSymTabSymbols,
-                           StringTableOffset, StringTable.size());
+                           StringTableOffset, StringTable.data().size());
 
     WriteDysymtabLoadCommand(FirstLocalSymbol, NumLocalSymbols,
                              FirstExternalSymbol, NumExternalSymbols,
@@ -1028,7 +1008,7 @@
       WriteNlist(UndefinedSymbolData[i], Layout);
 
     // Write the string table.
-    OS << StringTable.str();
+    OS << StringTable.data();
   }
 }
 

diff --git a/lib/MC/Makefile b/lib/MC/Makefile
index a10f17e..bf8b7c0 100644
--- a/lib/MC/Makefile
+++ b/lib/MC/Makefile

@@ -10,7 +10,7 @@
 LEVEL = ../..
 LIBRARYNAME = LLVMMC
 BUILD_ARCHIVE := 1
-PARALLEL_DIRS := MCAnalysis MCParser MCDisassembler
+PARALLEL_DIRS := MCParser MCDisassembler
 
 include $(LEVEL)/Makefile.common
 

diff --git a/lib/MC/StringTableBuilder.cpp b/lib/MC/StringTableBuilder.cpp
index db58ece..9de9363 100644
--- a/lib/MC/StringTableBuilder.cpp
+++ b/lib/MC/StringTableBuilder.cpp

@@ -9,6 +9,8 @@
 
 #include "llvm/MC/StringTableBuilder.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Endian.h"
 
 using namespace llvm;
 
@@ -25,19 +27,32 @@
   return sizeA > sizeB;
 }
 
-void StringTableBuilder::finalize() {
+void StringTableBuilder::finalize(Kind kind) {
   SmallVector<StringRef, 8> Strings;
+  Strings.reserve(StringIndexMap.size());
+
   for (auto i = StringIndexMap.begin(), e = StringIndexMap.end(); i != e; ++i)
     Strings.push_back(i->getKey());
 
   std::sort(Strings.begin(), Strings.end(), compareBySuffix);
 
-  // FIXME: Starting with a null byte is ELF specific. Generalize this so we
-  // can use the class with other object formats.
-  StringTable += '\x00';
+  switch (kind) {
+  case ELF:
+  case MachO:
+    // Start the table with a NUL byte.
+    StringTable += '\x00';
+    break;
+  case WinCOFF:
+    // Make room to write the table size later.
+    StringTable.append(4, '\x00');
+    break;
+  }
 
   StringRef Previous;
   for (StringRef s : Strings) {
+    if (kind == WinCOFF)
+      assert(s.size() > COFF::NameSize && "Short string in COFF string table!");
+
     if (Previous.endswith(s)) {
       StringIndexMap[s] = StringTable.size() - 1 - s.size();
       continue;
@@ -48,4 +63,26 @@
     StringTable += '\x00';
     Previous = s;
   }
+
+  switch (kind) {
+  case ELF:
+    break;
+  case MachO:
+    // Pad to multiple of 4.
+    while (StringTable.size() % 4)
+      StringTable += '\x00';
+    break;
+  case WinCOFF:
+    // Write the table size in the first word.
+    assert(StringTable.size() <= std::numeric_limits<uint32_t>::max());
+    uint32_t size = static_cast<uint32_t>(StringTable.size());
+    support::endian::write<uint32_t, support::little, support::unaligned>(
+        StringTable.data(), size);
+    break;
+  }
+}
+
+void StringTableBuilder::clear() {
+  StringTable.clear();
+  StringIndexMap.clear();
 }

diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index 27525c7..587be54 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp

@@ -27,7 +27,7 @@
 
 /// hasFlag - Determine if a feature has a flag; '+' or '-'
 ///
-static inline bool hasFlag(const StringRef Feature) {
+static inline bool hasFlag(StringRef Feature) {
   assert(!Feature.empty() && "Empty string");
   // Get first character
   char Ch = Feature[0];
@@ -37,13 +37,13 @@
 
 /// StripFlag - Return string stripped of flag.
 ///
-static inline std::string StripFlag(const StringRef Feature) {
+static inline std::string StripFlag(StringRef Feature) {
   return hasFlag(Feature) ? Feature.substr(1) : Feature;
 }
 
 /// isEnabled - Return true if enable flag; '+'.
 ///
-static inline bool isEnabled(const StringRef Feature) {
+static inline bool isEnabled(StringRef Feature) {
   assert(!Feature.empty() && "Empty string");
   // Get first character
   char Ch = Feature[0];
@@ -53,8 +53,8 @@
 
 /// Split - Splits a string of comma separated items in to a vector of strings.
 ///
-static void Split(std::vector<std::string> &V, const StringRef S) {
-  SmallVector<StringRef, 2> Tmp;
+static void Split(std::vector<std::string> &V, StringRef S) {
+  SmallVector<StringRef, 3> Tmp;
   S.split(Tmp, ",", -1, false /* KeepEmpty */);
   V.assign(Tmp.begin(), Tmp.end());
 }
@@ -81,7 +81,7 @@
 }
 
 /// Adding features.
-void SubtargetFeatures::AddFeature(const StringRef String) {
+void SubtargetFeatures::AddFeature(StringRef String) {
   // Don't add empty features or features we already have.
   if (!String.empty())
     // Convert to lowercase, prepend flag if we don't already have a flag.
@@ -136,7 +136,7 @@
 //                    SubtargetFeatures Implementation
 //===----------------------------------------------------------------------===//
 
-SubtargetFeatures::SubtargetFeatures(const StringRef Initial) {
+SubtargetFeatures::SubtargetFeatures(StringRef Initial) {
   // Break up string into separate features
   Split(Features, Initial);
 }
@@ -181,7 +181,7 @@
 /// ToggleFeature - Toggle a feature and returns the newly updated feature
 /// bits.
 uint64_t
-SubtargetFeatures::ToggleFeature(uint64_t Bits, const StringRef Feature,
+SubtargetFeatures::ToggleFeature(uint64_t Bits, StringRef Feature,
                                  ArrayRef<SubtargetFeatureKV> FeatureTable) {
 
   // Find feature in table.
@@ -213,7 +213,7 @@
 /// getFeatureBits - Get feature bits a CPU.
 ///
 uint64_t
-SubtargetFeatures::getFeatureBits(const StringRef CPU,
+SubtargetFeatures::getFeatureBits(StringRef CPU,
                                   ArrayRef<SubtargetFeatureKV> CPUTable,
                                   ArrayRef<SubtargetFeatureKV> FeatureTable) {
 

diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index a462c0d..1046e04 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp

@@ -26,8 +26,10 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TimeValue.h"
 #include <cstdio>
@@ -71,7 +73,6 @@
   MCSymbolData const *MCData;
 
   COFFSymbol(StringRef name);
-  size_t size() const;
   void set_name_offset(uint32_t Offset);
 
   bool should_keep() const;
@@ -102,20 +103,6 @@
   static size_t size();
 };
 
-// This class holds the COFF string table.
-class StringTable {
-  typedef StringMap<size_t> map;
-  map Map;
-
-  void update_length();
-public:
-  std::vector<char> Data;
-
-  StringTable();
-  size_t size() const;
-  size_t insert(StringRef String);
-};
-
 class WinCOFFObjectWriter : public MCObjectWriter {
 public:
 
@@ -131,13 +118,26 @@
   COFF::header Header;
   sections     Sections;
   symbols      Symbols;
-  StringTable  Strings;
+  StringTableBuilder Strings;
 
   // Maps used during object file creation.
   section_map SectionMap;
   symbol_map  SymbolMap;
 
+  bool UseBigObj;
+
   WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW, raw_ostream &OS);
+  
+  void reset() override {
+    memset(&Header, 0, sizeof(Header));
+    Header.Machine = TargetObjectWriter->getMachine();
+    Sections.clear();
+    Symbols.clear();
+    Strings.clear();
+    SectionMap.clear();
+    SymbolMap.clear();
+    MCObjectWriter::reset();
+  }
 
   COFFSymbol *createSymbol(StringRef Name);
   COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol * Symbol);
@@ -150,10 +150,10 @@
   void DefineSymbol(MCSymbolData const &SymbolData, MCAssembler &Assembler,
                     const MCAsmLayout &Layout);
 
-  void MakeSymbolReal(COFFSymbol &S, size_t Index);
-  void MakeSectionReal(COFFSection &S, size_t Number);
+  void SetSymbolName(COFFSymbol &S);
+  void SetSectionName(COFFSection &S);
 
-  bool ExportSymbol(MCSymbolData const &SymbolData, MCAssembler &Asm);
+  bool ExportSymbol(const MCSymbol &Symbol, MCAssembler &Asm);
 
   bool IsPhysicalSection(COFFSection *S);
 
@@ -170,6 +170,11 @@
   void ExecutePostLayoutBinding(MCAssembler &Asm,
                                 const MCAsmLayout &Layout) override;
 
+  bool IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+                                              const MCSymbolData &DataA,
+                                              const MCFragment &FB, bool InSet,
+                                              bool IsPCRel) const override;
+
   void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
                         const MCFragment *Fragment, const MCFixup &Fixup,
                         MCValue Target, bool &IsPCRel,
@@ -179,12 +184,9 @@
 };
 }
 
-static inline void write_uint32_le(void *Data, uint32_t const &Value) {
-  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
-  Ptr[0] = (Value & 0x000000FF) >>  0;
-  Ptr[1] = (Value & 0x0000FF00) >>  8;
-  Ptr[2] = (Value & 0x00FF0000) >> 16;
-  Ptr[3] = (Value & 0xFF000000) >> 24;
+static inline void write_uint32_le(void *Data, uint32_t Value) {
+  support::endian::write<uint32_t, support::little, support::unaligned>(Data,
+                                                                        Value);
 }
 
 //------------------------------------------------------------------------------
@@ -199,10 +201,6 @@
   memset(&Data, 0, sizeof(Data));
 }
 
-size_t COFFSymbol::size() const {
-  return COFF::SymbolSize + (Data.NumberOfAuxSymbols * COFF::SymbolSize);
-}
-
 // In the case that the name does not fit within 8 bytes, the offset
 // into the string table is stored in the last 4 bytes instead, leaving
 // the first 4 bytes as 0.
@@ -254,55 +252,11 @@
 }
 
 //------------------------------------------------------------------------------
-// StringTable class implementation
-
-/// Write the length of the string table into Data.
-/// The length of the string table includes uint32 length header.
-void StringTable::update_length() {
-  write_uint32_le(&Data.front(), Data.size());
-}
-
-StringTable::StringTable() {
-  // The string table data begins with the length of the entire string table
-  // including the length header. Allocate space for this header.
-  Data.resize(4);
-  update_length();
-}
-
-size_t StringTable::size() const {
-  return Data.size();
-}
-
-/// Add String to the table iff it is not already there.
-/// @returns the index into the string table where the string is now located.
-size_t StringTable::insert(StringRef String) {
-  map::iterator i = Map.find(String);
-
-  if (i != Map.end())
-    return i->second;
-
-  size_t Offset = Data.size();
-
-  // Insert string data into string table.
-  Data.insert(Data.end(), String.begin(), String.end());
-  Data.push_back('\0');
-
-  // Put a reference to it in the map.
-  Map[String] = Offset;
-
-  // Update the internal length field.
-  update_length();
-
-  return Offset;
-}
-
-//------------------------------------------------------------------------------
 // WinCOFFObjectWriter class implementation
 
 WinCOFFObjectWriter::WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW,
                                          raw_ostream &OS)
-  : MCObjectWriter(OS, true)
-  , TargetObjectWriter(MOTW) {
+    : MCObjectWriter(OS, true), TargetObjectWriter(MOTW) {
   memset(&Header, 0, sizeof(Header));
 
   Header.Machine = TargetObjectWriter->getMachine();
@@ -456,19 +410,22 @@
 
     // If no storage class was specified in the streamer, define it here.
     if (coff_symbol->Data.StorageClass == 0) {
-      bool external = ResSymData.isExternal() || !ResSymData.Fragment;
+      bool IsExternal =
+          ResSymData.isExternal() ||
+          (!ResSymData.getFragment() && !ResSymData.getSymbol().isVariable());
 
-      coff_symbol->Data.StorageClass =
-       external ? COFF::IMAGE_SYM_CLASS_EXTERNAL : COFF::IMAGE_SYM_CLASS_STATIC;
+      coff_symbol->Data.StorageClass = IsExternal
+                                           ? COFF::IMAGE_SYM_CLASS_EXTERNAL
+                                           : COFF::IMAGE_SYM_CLASS_STATIC;
     }
 
     if (!Base) {
       coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
     } else {
       const MCSymbolData &BaseData = Assembler.getSymbolData(*Base);
-      if (BaseData.Fragment) {
+      if (BaseData.getFragment()) {
         COFFSection *Sec =
-            SectionMap[&BaseData.Fragment->getParent()->getSection()];
+            SectionMap[&BaseData.getFragment()->getParent()->getSection()];
 
         if (coff_symbol->Section && coff_symbol->Section != Sec)
           report_fatal_error("conflicting sections for symbol");
@@ -508,11 +465,9 @@
   }
 }
 
-/// making a section real involves assigned it a number and putting
-/// name into the string table if needed
-void WinCOFFObjectWriter::MakeSectionReal(COFFSection &S, size_t Number) {
+void WinCOFFObjectWriter::SetSectionName(COFFSection &S) {
   if (S.Name.size() > COFF::NameSize) {
-    uint64_t StringTableEntry = Strings.insert(S.Name.c_str());
+    uint64_t StringTableEntry = Strings.getOffset(S.Name);
 
     if (StringTableEntry <= Max6DecimalOffset) {
       std::sprintf(S.Header.Name, "/%d", unsigned(StringTableEntry));
@@ -530,32 +485,33 @@
     }
   } else
     std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size());
-
-  S.Number = Number;
-  S.Symbol->Data.SectionNumber = S.Number;
-  S.Symbol->Aux[0].Aux.SectionDefinition.Number = S.Number;
 }
 
-void WinCOFFObjectWriter::MakeSymbolReal(COFFSymbol &S, size_t Index) {
-  if (S.Name.size() > COFF::NameSize) {
-    size_t StringTableEntry = Strings.insert(S.Name.c_str());
-
-    S.set_name_offset(StringTableEntry);
-  } else
+void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) {
+  if (S.Name.size() > COFF::NameSize)
+    S.set_name_offset(Strings.getOffset(S.Name));
+  else
     std::memcpy(S.Data.Name, S.Name.c_str(), S.Name.size());
-  S.Index = Index;
 }
 
-bool WinCOFFObjectWriter::ExportSymbol(MCSymbolData const &SymbolData,
+bool WinCOFFObjectWriter::ExportSymbol(const MCSymbol &Symbol,
                                        MCAssembler &Asm) {
   // This doesn't seem to be right. Strings referred to from the .data section
   // need symbols so they can be linked to code in the .text section right?
 
-  // return Asm.isSymbolLinkerVisible(SymbolData.getSymbol());
+  // return Asm.isSymbolLinkerVisible(Symbol);
+
+  // Non-temporary labels should always be visible to the linker.
+  if (!Symbol.isTemporary())
+    return true;
+
+  // Absolute temporary labels are never visible.
+  if (!Symbol.isInSection())
+    return false;
 
   // For now, all non-variable symbols are exported,
   // the linker will sort the rest out for us.
-  return SymbolData.isExternal() || !SymbolData.getSymbol().isVariable();
+  return !Symbol.isVariable();
 }
 
 bool WinCOFFObjectWriter::IsPhysicalSection(COFFSection *S) {
@@ -567,19 +523,39 @@
 // entity writing methods
 
 void WinCOFFObjectWriter::WriteFileHeader(const COFF::header &Header) {
-  WriteLE16(Header.Machine);
-  WriteLE16(Header.NumberOfSections);
-  WriteLE32(Header.TimeDateStamp);
-  WriteLE32(Header.PointerToSymbolTable);
-  WriteLE32(Header.NumberOfSymbols);
-  WriteLE16(Header.SizeOfOptionalHeader);
-  WriteLE16(Header.Characteristics);
+  if (UseBigObj) {
+    WriteLE16(COFF::IMAGE_FILE_MACHINE_UNKNOWN);
+    WriteLE16(0xFFFF);
+    WriteLE16(COFF::BigObjHeader::MinBigObjectVersion);
+    WriteLE16(Header.Machine);
+    WriteLE32(Header.TimeDateStamp);
+    for (uint8_t MagicChar : COFF::BigObjMagic)
+      Write8(MagicChar);
+    WriteLE32(0);
+    WriteLE32(0);
+    WriteLE32(0);
+    WriteLE32(0);
+    WriteLE32(Header.NumberOfSections);
+    WriteLE32(Header.PointerToSymbolTable);
+    WriteLE32(Header.NumberOfSymbols);
+  } else {
+    WriteLE16(Header.Machine);
+    WriteLE16(static_cast<int16_t>(Header.NumberOfSections));
+    WriteLE32(Header.TimeDateStamp);
+    WriteLE32(Header.PointerToSymbolTable);
+    WriteLE32(Header.NumberOfSymbols);
+    WriteLE16(Header.SizeOfOptionalHeader);
+    WriteLE16(Header.Characteristics);
+  }
 }
 
 void WinCOFFObjectWriter::WriteSymbol(const COFFSymbol &S) {
   WriteBytes(StringRef(S.Data.Name, COFF::NameSize));
   WriteLE32(S.Data.Value);
-  WriteLE16(S.Data.SectionNumber);
+  if (UseBigObj)
+    WriteLE32(S.Data.SectionNumber);
+  else
+    WriteLE16(static_cast<int16_t>(S.Data.SectionNumber));
   WriteLE16(S.Data.Type);
   Write8(S.Data.StorageClass);
   Write8(S.Data.NumberOfAuxSymbols);
@@ -597,6 +573,8 @@
       WriteLE32(i->Aux.FunctionDefinition.PointerToLinenumber);
       WriteLE32(i->Aux.FunctionDefinition.PointerToNextFunction);
       WriteZeros(sizeof(i->Aux.FunctionDefinition.unused));
+      if (UseBigObj)
+        WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
       break;
     case ATbfAndefSymbol:
       WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused1));
@@ -604,24 +582,32 @@
       WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused2));
       WriteLE32(i->Aux.bfAndefSymbol.PointerToNextFunction);
       WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused3));
+      if (UseBigObj)
+        WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
       break;
     case ATWeakExternal:
       WriteLE32(i->Aux.WeakExternal.TagIndex);
       WriteLE32(i->Aux.WeakExternal.Characteristics);
       WriteZeros(sizeof(i->Aux.WeakExternal.unused));
+      if (UseBigObj)
+        WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
       break;
     case ATFile:
-      WriteBytes(StringRef(reinterpret_cast<const char *>(i->Aux.File.FileName),
-                 sizeof(i->Aux.File.FileName)));
+      WriteBytes(
+          StringRef(reinterpret_cast<const char *>(&i->Aux),
+                    UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size));
       break;
     case ATSectionDefinition:
       WriteLE32(i->Aux.SectionDefinition.Length);
       WriteLE16(i->Aux.SectionDefinition.NumberOfRelocations);
       WriteLE16(i->Aux.SectionDefinition.NumberOfLinenumbers);
       WriteLE32(i->Aux.SectionDefinition.CheckSum);
-      WriteLE16(i->Aux.SectionDefinition.Number);
+      WriteLE16(static_cast<int16_t>(i->Aux.SectionDefinition.Number));
       Write8(i->Aux.SectionDefinition.Selection);
       WriteZeros(sizeof(i->Aux.SectionDefinition.unused));
+      WriteLE16(static_cast<int16_t>(i->Aux.SectionDefinition.Number >> 16));
+      if (UseBigObj)
+        WriteZeros(COFF::Symbol32Size - COFF::Symbol16Size);
       break;
     }
   }
@@ -654,45 +640,27 @@
                                                    const MCAsmLayout &Layout) {
   // "Define" each section & symbol. This creates section & symbol
   // entries in the staging area.
-
-  static_assert(sizeof(((COFF::AuxiliaryFile *)nullptr)->FileName) == COFF::SymbolSize,
-                "size mismatch for COFF::AuxiliaryFile::FileName");
-  for (auto FI = Asm.file_names_begin(), FE = Asm.file_names_end();
-       FI != FE; ++FI) {
-    // round up to calculate the number of auxiliary symbols required
-    unsigned Count = (FI->size() + COFF::SymbolSize - 1) / COFF::SymbolSize;
-
-    COFFSymbol *file = createSymbol(".file");
-    file->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
-    file->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
-    file->Aux.resize(Count);
-
-    unsigned Offset = 0;
-    unsigned Length = FI->size();
-    for (auto & Aux : file->Aux) {
-      Aux.AuxType = ATFile;
-
-      if (Length > COFF::SymbolSize) {
-        memcpy(Aux.Aux.File.FileName, FI->c_str() + Offset, COFF::SymbolSize);
-        Length = Length - COFF::SymbolSize;
-      } else {
-        memcpy(Aux.Aux.File.FileName, FI->c_str() + Offset, Length);
-        memset(&Aux.Aux.File.FileName[Length], 0, COFF::SymbolSize - Length);
-        Length = 0;
-      }
-
-      Offset = Offset + COFF::SymbolSize;
-    }
-  }
-
   for (const auto & Section : Asm)
     DefineSection(Section);
 
   for (MCSymbolData &SD : Asm.symbols())
-    if (ExportSymbol(SD, Asm))
+    if (ExportSymbol(SD.getSymbol(), Asm))
       DefineSymbol(SD, Asm, Layout);
 }
 
+bool WinCOFFObjectWriter::IsSymbolRefDifferenceFullyResolvedImpl(
+    const MCAssembler &Asm, const MCSymbolData &DataA, const MCFragment &FB,
+    bool InSet, bool IsPCRel) const {
+  // MS LINK expects to be able to replace all references to a function with a
+  // thunk to implement their /INCREMENTAL feature.  Make sure we don't optimize
+  // away any relocations to functions.
+  if ((((DataA.getFlags() & COFF::SF_TypeMask) >> COFF::SF_TypeShift) >>
+       COFF::SCT_COMPLEX_TYPE_SHIFT) == COFF::IMAGE_SYM_DTYPE_FUNCTION)
+    return false;
+  return MCObjectWriter::IsSymbolRefDifferenceFullyResolvedImpl(Asm, DataA, FB,
+                                                                InSet, IsPCRel);
+}
+
 void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
                                            const MCAsmLayout &Layout,
                                            const MCFragment *Fragment,
@@ -744,7 +712,7 @@
     // Offset of the symbol in the section
     int64_t a = Layout.getSymbolOffset(&B_SD);
 
-    // Ofeset of the relocation in the section
+    // Offset of the relocation in the section
     int64_t b = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
 
     FixedValue = b - a;
@@ -765,8 +733,8 @@
   // Turn relocations for temporary symbols into section relocations.
   if (coff_symbol->MCData->getSymbol().isTemporary() || CrossSection) {
     Reloc.Symb = coff_symbol->Section->Symbol;
-    FixedValue += Layout.getFragmentOffset(coff_symbol->MCData->Fragment)
-                + coff_symbol->MCData->getOffset();
+    FixedValue += Layout.getFragmentOffset(coff_symbol->MCData->getFragment()) +
+                  coff_symbol->MCData->getOffset();
   } else
     Reloc.Symb = coff_symbol;
 
@@ -828,26 +796,67 @@
 
 void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
                                       const MCAsmLayout &Layout) {
-  // Assign symbol and section indexes and offsets.
-  Header.NumberOfSections = 0;
+  size_t SectionsSize = Sections.size();
+  if (SectionsSize > static_cast<size_t>(INT32_MAX))
+    report_fatal_error(
+        "PE COFF object files can't have more than 2147483647 sections");
 
-  DenseMap<COFFSection *, uint16_t> SectionIndices;
-  for (auto & Section : Sections) {
-    size_t Number = ++Header.NumberOfSections;
+  // Assign symbol and section indexes and offsets.
+  int32_t NumberOfSections = static_cast<int32_t>(SectionsSize);
+
+  UseBigObj = NumberOfSections > COFF::MaxNumberOfSections16;
+
+  DenseMap<COFFSection *, int32_t> SectionIndices(
+      NextPowerOf2(NumberOfSections));
+
+  // Assign section numbers.
+  size_t Number = 1;
+  for (const auto &Section : Sections) {
     SectionIndices[Section.get()] = Number;
-    MakeSectionReal(*Section, Number);
+    Section->Number = Number;
+    Section->Symbol->Data.SectionNumber = Number;
+    Section->Symbol->Aux[0].Aux.SectionDefinition.Number = Number;
+    ++Number;
   }
 
+  Header.NumberOfSections = NumberOfSections;
   Header.NumberOfSymbols = 0;
 
-  for (auto & Symbol : Symbols) {
+  for (auto FI = Asm.file_names_begin(), FE = Asm.file_names_end();
+       FI != FE; ++FI) {
+    // round up to calculate the number of auxiliary symbols required
+    unsigned SymbolSize = UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size;
+    unsigned Count = (FI->size() + SymbolSize - 1) / SymbolSize;
+
+    COFFSymbol *file = createSymbol(".file");
+    file->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
+    file->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
+    file->Aux.resize(Count);
+
+    unsigned Offset = 0;
+    unsigned Length = FI->size();
+    for (auto & Aux : file->Aux) {
+      Aux.AuxType = ATFile;
+
+      if (Length > SymbolSize) {
+        memcpy(&Aux.Aux, FI->c_str() + Offset, SymbolSize);
+        Length = Length - SymbolSize;
+      } else {
+        memcpy(&Aux.Aux, FI->c_str() + Offset, Length);
+        memset((char *)&Aux.Aux + Length, 0, SymbolSize - Length);
+        break;
+      }
+
+      Offset += SymbolSize;
+    }
+  }
+
+  for (auto &Symbol : Symbols) {
     // Update section number & offset for symbols that have them.
     if (Symbol->Section)
       Symbol->Data.SectionNumber = Symbol->Section->Number;
-
     if (Symbol->should_keep()) {
-      MakeSymbolReal(*Symbol, Header.NumberOfSymbols++);
-
+      Symbol->Index = Header.NumberOfSymbols++;
       // Update auxiliary symbol info.
       Symbol->Data.NumberOfAuxSymbols = Symbol->Aux.size();
       Header.NumberOfSymbols += Symbol->Data.NumberOfAuxSymbols;
@@ -855,6 +864,22 @@
       Symbol->Index = -1;
   }
 
+  // Build string table.
+  for (const auto &S : Sections)
+    if (S->Name.size() > COFF::NameSize)
+      Strings.add(S->Name);
+  for (const auto &S : Symbols)
+    if (S->should_keep() && S->Name.size() > COFF::NameSize)
+      Strings.add(S->Name);
+  Strings.finalize(StringTableBuilder::WinCOFF);
+
+  // Set names.
+  for (const auto &S : Sections)
+    SetSectionName(*S);
+  for (auto &S : Symbols)
+    if (S->should_keep())
+      SetSymbolName(*S);
+
   // Fixup weak external references.
   for (auto & Symbol : Symbols) {
     if (Symbol->Other) {
@@ -897,7 +922,10 @@
 
   unsigned offset = 0;
 
-  offset += COFF::HeaderSize;
+  if (UseBigObj)
+    offset += COFF::Header32Size;
+  else
+    offset += COFF::Header16Size;
   offset += COFF::SectionSize * Header.NumberOfSections;
 
   for (const auto & Section : Asm) {
@@ -918,7 +946,7 @@
       bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
 
       if (RelocationsOverflow) {
-        // Signal overflow by setting NumberOfSections to max value. Actual
+        // Signal overflow by setting NumberOfRelocations to max value. Actual
         // size is found in reloc #0. Microsoft tools understand this.
         Sec->Header.NumberOfRelocations = 0xffff;
       } else {
@@ -1014,7 +1042,7 @@
     if (Symbol->Index != -1)
       WriteSymbol(*Symbol);
 
-  OS.write((char const *)&Strings.Data.front(), Strings.Data.size());
+  OS.write(Strings.data().data(), Strings.data().size());
 }
 
 MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) :

diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index d391a3f..6a8054d 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp

@@ -25,11 +25,11 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/MC/MCWin64EH.h"
 #include "llvm/MC/MCWinCOFFStreamer.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -61,7 +61,7 @@
   DF->getContents().append(Code.begin(), Code.end());
 }
 
-void MCWinCOFFStreamer::InitSections() {
+void MCWinCOFFStreamer::InitSections(bool NoExecStack) {
   // FIXME: this is identical to the ELF one.
   // This emulates the same behavior of GNU as. This makes it easier
   // to compare the output as the major sections are in the same order.
@@ -133,7 +133,7 @@
   if (!CurSymbol)
     FatalError("storage class specified outside of symbol definition");
 
-  if (StorageClass & ~0xff)
+  if (StorageClass & ~COFF::SSC_Invalid)
     FatalError(Twine("storage class value '") + itostr(StorageClass) +
                "' out of range");
 
@@ -163,7 +163,7 @@
   const MCSymbolRefExpr *SRE = MCSymbolRefExpr::Create(Symbol, getContext());
   MCFixup Fixup = MCFixup::Create(DF->getContents().size(), SRE, FK_SecRel_2);
   DF->getFixups().push_back(Fixup);
-  DF->getContents().resize(DF->getContents().size() + 4, 0);
+  DF->getContents().resize(DF->getContents().size() + 2, 0);
 }
 
 void MCWinCOFFStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) {
@@ -184,14 +184,35 @@
           Symbol->getSection().getVariant() == MCSection::SV_COFF) &&
          "Got non-COFF section in the COFF backend!");
 
-  if (ByteAlignment > 32)
-    report_fatal_error("alignment is limited to 32-bytes");
+  const Triple &T = getContext().getObjectFileInfo()->getTargetTriple();
+  if (T.isKnownWindowsMSVCEnvironment()) {
+    if (ByteAlignment > 32)
+      report_fatal_error("alignment is limited to 32-bytes");
+
+    // Round size up to alignment so that we will honor the alignment request.
+    Size = std::max(Size, static_cast<uint64_t>(ByteAlignment));
+  }
 
   AssignSection(Symbol, nullptr);
 
   MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
   SD.setExternal(true);
   SD.setCommon(Size, ByteAlignment);
+
+  if (!T.isKnownWindowsMSVCEnvironment() && ByteAlignment > 1) {
+    SmallString<128> Directive;
+    raw_svector_ostream OS(Directive);
+    const MCObjectFileInfo *MFI = getContext().getObjectFileInfo();
+
+    OS << " -aligncomm:\"" << Symbol->getName() << "\","
+       << Log2_32_Ceil(ByteAlignment);
+    OS.flush();
+
+    PushSection();
+    SwitchSection(MFI->getDrectveSection());
+    EmitBytes(Directive);
+    PopSection();
+  }
 }
 
 void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,

diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
index 6d09bdb..d169dbe 100644
--- a/lib/Object/Archive.cpp
+++ b/lib/Object/Archive.cpp

@@ -109,7 +109,7 @@
   const char *NextLoc = Data.data() + SpaceToSkip;
 
   // Check to see if this is past the end of the archive.
-  if (NextLoc >= Parent->Data->getBufferEnd())
+  if (NextLoc >= Parent->Data.getBufferEnd())
     return Child(Parent, nullptr);
 
   return Child(Parent, NextLoc);
@@ -159,46 +159,36 @@
   return name;
 }
 
-ErrorOr<std::unique_ptr<MemoryBuffer>>
-Archive::Child::getMemoryBuffer(bool FullPath) const {
+ErrorOr<MemoryBufferRef> Archive::Child::getMemoryBufferRef() const {
   ErrorOr<StringRef> NameOrErr = getName();
   if (std::error_code EC = NameOrErr.getError())
     return EC;
   StringRef Name = NameOrErr.get();
-  SmallString<128> Path;
-  std::unique_ptr<MemoryBuffer> Ret(MemoryBuffer::getMemBuffer(
-      getBuffer(),
-      FullPath
-          ? (Twine(Parent->getFileName()) + "(" + Name + ")").toStringRef(Path)
-          : Name,
-      false));
-  return std::move(Ret);
+  return MemoryBufferRef(getBuffer(), Name);
 }
 
 ErrorOr<std::unique_ptr<Binary>>
 Archive::Child::getAsBinary(LLVMContext *Context) const {
-  std::unique_ptr<Binary> ret;
-  ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr = getMemoryBuffer();
+  ErrorOr<MemoryBufferRef> BuffOrErr = getMemoryBufferRef();
   if (std::error_code EC = BuffOrErr.getError())
     return EC;
 
-  std::unique_ptr<MemoryBuffer> Buff(BuffOrErr.get().release());
-  return createBinary(Buff, Context);
+  return createBinary(BuffOrErr.get(), Context);
 }
 
-ErrorOr<Archive *> Archive::create(std::unique_ptr<MemoryBuffer> Source) {
+ErrorOr<std::unique_ptr<Archive>> Archive::create(MemoryBufferRef Source) {
   std::error_code EC;
-  std::unique_ptr<Archive> Ret(new Archive(std::move(Source), EC));
+  std::unique_ptr<Archive> Ret(new Archive(Source, EC));
   if (EC)
     return EC;
-  return Ret.release();
+  return std::move(Ret);
 }
 
-Archive::Archive(std::unique_ptr<MemoryBuffer> Source, std::error_code &ec)
-    : Binary(Binary::ID_Archive, std::move(Source)), SymbolTable(child_end()) {
+Archive::Archive(MemoryBufferRef Source, std::error_code &ec)
+    : Binary(Binary::ID_Archive, Source), SymbolTable(child_end()) {
   // Check for sufficient magic.
-  if (Data->getBufferSize() < 8 ||
-      StringRef(Data->getBufferStart(), 8) != Magic) {
+  if (Data.getBufferSize() < 8 ||
+      StringRef(Data.getBufferStart(), 8) != Magic) {
     ec = object_error::invalid_file_type;
     return;
   }
@@ -250,7 +240,7 @@
     if (ec)
       return;
     Name = NameOrErr.get();
-    if (Name == "__.SYMDEF SORTED") {
+    if (Name == "__.SYMDEF SORTED" || Name == "__.SYMDEF") {
       SymbolTable = i;
       ++i;
     }
@@ -312,13 +302,13 @@
 }
 
 Archive::child_iterator Archive::child_begin(bool SkipInternal) const {
-  if (Data->getBufferSize() == 8) // empty archive.
+  if (Data.getBufferSize() == 8) // empty archive.
     return child_end();
 
   if (SkipInternal)
     return FirstRegular;
 
-  const char *Loc = Data->getBufferStart() + strlen(Magic);
+  const char *Loc = Data.getBufferStart() + strlen(Magic);
   Child c(this, Loc);
   return c;
 }

diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index 9f6a685..c56eeb1 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp

@@ -27,24 +27,23 @@
 
 Binary::~Binary() {}
 
-Binary::Binary(unsigned int Type, std::unique_ptr<MemoryBuffer> Source)
-    : TypeID(Type), Data(std::move(Source)) {}
+Binary::Binary(unsigned int Type, MemoryBufferRef Source)
+    : TypeID(Type), Data(Source) {}
 
-StringRef Binary::getData() const {
-  return Data->getBuffer();
-}
+StringRef Binary::getData() const { return Data.getBuffer(); }
 
-StringRef Binary::getFileName() const {
-  return Data->getBufferIdentifier();
-}
+StringRef Binary::getFileName() const { return Data.getBufferIdentifier(); }
 
-ErrorOr<Binary *> object::createBinary(std::unique_ptr<MemoryBuffer> &Buffer,
-                                       LLVMContext *Context) {
-  sys::fs::file_magic Type = sys::fs::identify_magic(Buffer->getBuffer());
+MemoryBufferRef Binary::getMemoryBufferRef() const { return Data; }
+
+ErrorOr<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
+                                                      LLVMContext *Context) {
+  sys::fs::file_magic Type = sys::fs::identify_magic(Buffer.getBuffer());
 
   switch (Type) {
     case sys::fs::file_magic::archive:
-      return Archive::create(std::move(Buffer));
+      return Archive::create(Buffer);
+    case sys::fs::file_magic::elf:
     case sys::fs::file_magic::elf_relocatable:
     case sys::fs::file_magic::elf_executable:
     case sys::fs::file_magic::elf_shared_object:
@@ -65,7 +64,7 @@
     case sys::fs::file_magic::bitcode:
       return ObjectFile::createSymbolicFile(Buffer, Type, Context);
     case sys::fs::file_magic::macho_universal_binary:
-      return MachOUniversalBinary::create(std::move(Buffer));
+      return MachOUniversalBinary::create(Buffer);
     case sys::fs::file_magic::unknown:
     case sys::fs::file_magic::windows_resource:
       // Unrecognized object file format.
@@ -74,10 +73,18 @@
   llvm_unreachable("Unexpected Binary File Type");
 }
 
-ErrorOr<Binary *> object::createBinary(StringRef Path) {
+ErrorOr<OwningBinary<Binary>> object::createBinary(StringRef Path) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFileOrSTDIN(Path);
   if (std::error_code EC = FileOrErr.getError())
     return EC;
-  return createBinary(FileOrErr.get());
+  std::unique_ptr<MemoryBuffer> &Buffer = FileOrErr.get();
+
+  ErrorOr<std::unique_ptr<Binary>> BinOrErr =
+      createBinary(Buffer->getMemBufferRef());
+  if (std::error_code EC = BinOrErr.getError())
+    return EC;
+  std::unique_ptr<Binary> &Bin = BinOrErr.get();
+
+  return OwningBinary<Binary>(std::move(Bin), std::move(Buffer));
 }

diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index 46ef87d..d5ff7d6 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp

@@ -25,14 +25,13 @@
 using namespace llvm;
 using namespace object;
 
-using support::ulittle8_t;
 using support::ulittle16_t;
 using support::ulittle32_t;
+using support::ulittle64_t;
 using support::little16_t;
 
 // Returns false and sets EC if Size is greater than the buffer size.
-static bool checkSize(const MemoryBuffer &M, std::error_code &EC,
-                      uint64_t Size) {
+static bool checkSize(MemoryBufferRef M, std::error_code &EC, uint64_t Size) {
   if (M.getBufferSize() < Size) {
     EC = object_error::unexpected_eof;
     return false;
@@ -40,17 +39,25 @@
   return true;
 }
 
+static std::error_code checkOffset(MemoryBufferRef M, uintptr_t Addr,
+                                   const uint64_t Size) {
+  if (Addr + Size < Addr || Addr + Size < Size ||
+      Addr + Size > uintptr_t(M.getBufferEnd()) ||
+      Addr < uintptr_t(M.getBufferStart())) {
+    return object_error::unexpected_eof;
+  }
+  return object_error::success;
+}
+
 // Sets Obj unless any bytes in [addr, addr + size) fall outside of m.
 // Returns unexpected_eof if error.
 template <typename T>
-static std::error_code getObject(const T *&Obj, const MemoryBuffer &M,
-                                 const uint8_t *Ptr,
-                                 const size_t Size = sizeof(T)) {
+static std::error_code getObject(const T *&Obj, MemoryBufferRef M,
+                                 const void *Ptr,
+                                 const uint64_t Size = sizeof(T)) {
   uintptr_t Addr = uintptr_t(Ptr);
-  if (Addr + Size < Addr || Addr + Size < Size ||
-      Addr + Size > uintptr_t(M.getBufferEnd())) {
-    return object_error::unexpected_eof;
-  }
+  if (std::error_code EC = checkOffset(M, Addr, Size))
+    return EC;
   Obj = reinterpret_cast<const T *>(Addr);
   return object_error::success;
 }
@@ -89,20 +96,19 @@
   return false;
 }
 
-const coff_symbol *COFFObjectFile::toSymb(DataRefImpl Ref) const {
-  const coff_symbol *Addr = reinterpret_cast<const coff_symbol*>(Ref.p);
+template <typename coff_symbol_type>
+const coff_symbol_type *COFFObjectFile::toSymb(DataRefImpl Ref) const {
+  const coff_symbol_type *Addr =
+      reinterpret_cast<const coff_symbol_type *>(Ref.p);
 
-# ifndef NDEBUG
+  assert(!checkOffset(Data, uintptr_t(Addr), sizeof(*Addr)));
+#ifndef NDEBUG
   // Verify that the symbol points to a valid entry in the symbol table.
   uintptr_t Offset = uintptr_t(Addr) - uintptr_t(base());
-  if (Offset < COFFHeader->PointerToSymbolTable
-      || Offset >= COFFHeader->PointerToSymbolTable
-         + (COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
-    report_fatal_error("Symbol was outside of symbol table.");
 
-  assert((Offset - COFFHeader->PointerToSymbolTable) % sizeof(coff_symbol)
-         == 0 && "Symbol did not point to the beginning of a symbol");
-# endif
+  assert((Offset - getPointerToSymbolTable()) % sizeof(coff_symbol_type) == 0 &&
+         "Symbol did not point to the beginning of a symbol");
+#endif
 
   return Addr;
 }
@@ -112,8 +118,7 @@
 
 # ifndef NDEBUG
   // Verify that the section points to a valid entry in the section table.
-  if (Addr < SectionTable
-      || Addr >= (SectionTable + COFFHeader->NumberOfSections))
+  if (Addr < SectionTable || Addr >= (SectionTable + getNumberOfSections()))
     report_fatal_error("Section was outside of section table.");
 
   uintptr_t Offset = uintptr_t(Addr) - uintptr_t(SectionTable);
@@ -125,112 +130,180 @@
 }
 
 void COFFObjectFile::moveSymbolNext(DataRefImpl &Ref) const {
-  const coff_symbol *Symb = toSymb(Ref);
-  Symb += 1 + Symb->NumberOfAuxSymbols;
-  Ref.p = reinterpret_cast<uintptr_t>(Symb);
+  auto End = reinterpret_cast<uintptr_t>(StringTable);
+  if (SymbolTable16) {
+    const coff_symbol16 *Symb = toSymb<coff_symbol16>(Ref);
+    Symb += 1 + Symb->NumberOfAuxSymbols;
+    Ref.p = std::min(reinterpret_cast<uintptr_t>(Symb), End);
+  } else if (SymbolTable32) {
+    const coff_symbol32 *Symb = toSymb<coff_symbol32>(Ref);
+    Symb += 1 + Symb->NumberOfAuxSymbols;
+    Ref.p = std::min(reinterpret_cast<uintptr_t>(Symb), End);
+  } else {
+    llvm_unreachable("no symbol table pointer!");
+  }
 }
 
 std::error_code COFFObjectFile::getSymbolName(DataRefImpl Ref,
                                               StringRef &Result) const {
-  const coff_symbol *Symb = toSymb(Ref);
+  COFFSymbolRef Symb = getCOFFSymbol(Ref);
   return getSymbolName(Symb, Result);
 }
 
 std::error_code COFFObjectFile::getSymbolAddress(DataRefImpl Ref,
                                                  uint64_t &Result) const {
-  const coff_symbol *Symb = toSymb(Ref);
-  const coff_section *Section = nullptr;
-  if (std::error_code EC = getSection(Symb->SectionNumber, Section))
-    return EC;
+  COFFSymbolRef Symb = getCOFFSymbol(Ref);
 
-  if (Symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
+  if (Symb.isAnyUndefined()) {
     Result = UnknownAddressOrSize;
-  else if (Section)
-    Result = Section->VirtualAddress + Symb->Value;
-  else
-    Result = Symb->Value;
+    return object_error::success;
+  }
+  if (Symb.isCommon()) {
+    Result = UnknownAddressOrSize;
+    return object_error::success;
+  }
+  int32_t SectionNumber = Symb.getSectionNumber();
+  if (!COFF::isReservedSectionNumber(SectionNumber)) {
+    const coff_section *Section = nullptr;
+    if (std::error_code EC = getSection(SectionNumber, Section))
+      return EC;
+
+    Result = Section->VirtualAddress + Symb.getValue();
+    return object_error::success;
+  }
+
+  Result = Symb.getValue();
   return object_error::success;
 }
 
 std::error_code COFFObjectFile::getSymbolType(DataRefImpl Ref,
                                               SymbolRef::Type &Result) const {
-  const coff_symbol *Symb = toSymb(Ref);
+  COFFSymbolRef Symb = getCOFFSymbol(Ref);
+  int32_t SectionNumber = Symb.getSectionNumber();
   Result = SymbolRef::ST_Other;
-  if (Symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL &&
-      Symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED) {
+
+  if (Symb.isAnyUndefined()) {
     Result = SymbolRef::ST_Unknown;
-  } else if (Symb->isFunctionDefinition()) {
+  } else if (Symb.isFunctionDefinition()) {
     Result = SymbolRef::ST_Function;
-  } else {
-    uint32_t Characteristics = 0;
-    if (!COFF::isReservedSectionNumber(Symb->SectionNumber)) {
-      const coff_section *Section = nullptr;
-      if (std::error_code EC = getSection(Symb->SectionNumber, Section))
-        return EC;
-      Characteristics = Section->Characteristics;
-    }
-    if (Characteristics & COFF::IMAGE_SCN_MEM_READ &&
-        ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
+  } else if (Symb.isCommon()) {
+    Result = SymbolRef::ST_Data;
+  } else if (Symb.isFileRecord()) {
+    Result = SymbolRef::ST_File;
+  } else if (SectionNumber == COFF::IMAGE_SYM_DEBUG) {
+    Result = SymbolRef::ST_Debug;
+  } else if (!COFF::isReservedSectionNumber(SectionNumber)) {
+    const coff_section *Section = nullptr;
+    if (std::error_code EC = getSection(SectionNumber, Section))
+      return EC;
+    uint32_t Characteristics = Section->Characteristics;
+    if (Characteristics & COFF::IMAGE_SCN_CNT_CODE)
+      Result = SymbolRef::ST_Function;
+    else if (Characteristics & (COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA))
       Result = SymbolRef::ST_Data;
   }
   return object_error::success;
 }
 
 uint32_t COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const {
-  const coff_symbol *Symb = toSymb(Ref);
+  COFFSymbolRef Symb = getCOFFSymbol(Ref);
   uint32_t Result = SymbolRef::SF_None;
 
-  // TODO: Correctly set SF_FormatSpecific, SF_Common
-
-  if (Symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED) {
-    if (Symb->Value == 0)
-      Result |= SymbolRef::SF_Undefined;
-    else
-      Result |= SymbolRef::SF_Common;
-  }
-
-
-  // TODO: This are certainly too restrictive.
-  if (Symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL)
+  if (Symb.isExternal() || Symb.isWeakExternal())
     Result |= SymbolRef::SF_Global;
 
-  if (Symb->StorageClass == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL)
+  if (Symb.isWeakExternal())
     Result |= SymbolRef::SF_Weak;
 
-  if (Symb->SectionNumber == COFF::IMAGE_SYM_ABSOLUTE)
+  if (Symb.getSectionNumber() == COFF::IMAGE_SYM_ABSOLUTE)
     Result |= SymbolRef::SF_Absolute;
 
+  if (Symb.isFileRecord())
+    Result |= SymbolRef::SF_FormatSpecific;
+
+  if (Symb.isSectionDefinition())
+    Result |= SymbolRef::SF_FormatSpecific;
+
+  if (Symb.isCommon())
+    Result |= SymbolRef::SF_Common;
+
+  if (Symb.isAnyUndefined())
+    Result |= SymbolRef::SF_Undefined;
+
   return Result;
 }
 
 std::error_code COFFObjectFile::getSymbolSize(DataRefImpl Ref,
                                               uint64_t &Result) const {
-  // FIXME: Return the correct size. This requires looking at all the symbols
-  //        in the same section as this symbol, and looking for either the next
-  //        symbol, or the end of the section.
-  const coff_symbol *Symb = toSymb(Ref);
-  const coff_section *Section = nullptr;
-  if (std::error_code EC = getSection(Symb->SectionNumber, Section))
-    return EC;
+  COFFSymbolRef Symb = getCOFFSymbol(Ref);
 
-  if (Symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
+  if (Symb.isAnyUndefined()) {
     Result = UnknownAddressOrSize;
-  else if (Section)
-    Result = Section->SizeOfRawData - Symb->Value;
-  else
+    return object_error::success;
+  }
+  if (Symb.isCommon()) {
+    Result = Symb.getValue();
+    return object_error::success;
+  }
+
+  // Let's attempt to get the size of the symbol by looking at the address of
+  // the symbol after the symbol in question.
+  uint64_t SymbAddr;
+  if (std::error_code EC = getSymbolAddress(Ref, SymbAddr))
+    return EC;
+  int32_t SectionNumber = Symb.getSectionNumber();
+  if (COFF::isReservedSectionNumber(SectionNumber)) {
+    // Absolute and debug symbols aren't sorted in any interesting way.
     Result = 0;
+    return object_error::success;
+  }
+  const section_iterator SecEnd = section_end();
+  uint64_t AfterAddr = UnknownAddressOrSize;
+  for (const symbol_iterator &SymbI : symbols()) {
+    section_iterator SecI = SecEnd;
+    if (std::error_code EC = SymbI->getSection(SecI))
+      return EC;
+    // Check the symbol's section, skip it if it's in the wrong section.
+    // First, make sure it is in any section.
+    if (SecI == SecEnd)
+      continue;
+    // Second, make sure it is in the same section as the symbol in question.
+    if (!sectionContainsSymbol(SecI->getRawDataRefImpl(), Ref))
+      continue;
+    uint64_t Addr;
+    if (std::error_code EC = SymbI->getAddress(Addr))
+      return EC;
+    // We want to compare our symbol in question with the closest possible
+    // symbol that comes after.
+    if (AfterAddr > Addr && Addr > SymbAddr)
+      AfterAddr = Addr;
+  }
+  if (AfterAddr == UnknownAddressOrSize) {
+    // No symbol comes after this one, assume that everything after our symbol
+    // is part of it.
+    const coff_section *Section = nullptr;
+    if (std::error_code EC = getSection(SectionNumber, Section))
+      return EC;
+    Result = Section->SizeOfRawData - Symb.getValue();
+  } else {
+    // Take the difference between our symbol and the symbol that comes after
+    // our symbol.
+    Result = AfterAddr - SymbAddr;
+  }
+
   return object_error::success;
 }
 
 std::error_code
 COFFObjectFile::getSymbolSection(DataRefImpl Ref,
                                  section_iterator &Result) const {
-  const coff_symbol *Symb = toSymb(Ref);
-  if (COFF::isReservedSectionNumber(Symb->SectionNumber)) {
+  COFFSymbolRef Symb = getCOFFSymbol(Ref);
+  if (COFF::isReservedSectionNumber(Symb.getSectionNumber())) {
     Result = section_end();
   } else {
     const coff_section *Sec = nullptr;
-    if (std::error_code EC = getSection(Symb->SectionNumber, Sec))
+    if (std::error_code EC = getSection(Symb.getSectionNumber(), Sec))
       return EC;
     DataRefImpl Ref;
     Ref.p = reinterpret_cast<uintptr_t>(Sec);
@@ -251,18 +324,13 @@
   return getSectionName(Sec, Result);
 }
 
-std::error_code COFFObjectFile::getSectionAddress(DataRefImpl Ref,
-                                                  uint64_t &Result) const {
+uint64_t COFFObjectFile::getSectionAddress(DataRefImpl Ref) const {
   const coff_section *Sec = toSec(Ref);
-  Result = Sec->VirtualAddress;
-  return object_error::success;
+  return Sec->VirtualAddress;
 }
 
-std::error_code COFFObjectFile::getSectionSize(DataRefImpl Ref,
-                                               uint64_t &Result) const {
-  const coff_section *Sec = toSec(Ref);
-  Result = Sec->SizeOfRawData;
-  return object_error::success;
+uint64_t COFFObjectFile::getSectionSize(DataRefImpl Ref) const {
+  return getSectionSize(toSec(Ref));
 }
 
 std::error_code COFFObjectFile::getSectionContents(DataRefImpl Ref,
@@ -274,146 +342,144 @@
   return EC;
 }
 
-std::error_code COFFObjectFile::getSectionAlignment(DataRefImpl Ref,
-                                                    uint64_t &Res) const {
+uint64_t COFFObjectFile::getSectionAlignment(DataRefImpl Ref) const {
   const coff_section *Sec = toSec(Ref);
-  if (!Sec)
-    return object_error::parse_failed;
-  Res = uint64_t(1) << (((Sec->Characteristics & 0x00F00000) >> 20) - 1);
-  return object_error::success;
+  return uint64_t(1) << (((Sec->Characteristics & 0x00F00000) >> 20) - 1);
 }
 
-std::error_code COFFObjectFile::isSectionText(DataRefImpl Ref,
-                                              bool &Result) const {
+bool COFFObjectFile::isSectionText(DataRefImpl Ref) const {
   const coff_section *Sec = toSec(Ref);
-  Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_CODE;
-  return object_error::success;
+  return Sec->Characteristics & COFF::IMAGE_SCN_CNT_CODE;
 }
 
-std::error_code COFFObjectFile::isSectionData(DataRefImpl Ref,
-                                              bool &Result) const {
+bool COFFObjectFile::isSectionData(DataRefImpl Ref) const {
   const coff_section *Sec = toSec(Ref);
-  Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA;
-  return object_error::success;
+  return Sec->Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA;
 }
 
-std::error_code COFFObjectFile::isSectionBSS(DataRefImpl Ref,
-                                             bool &Result) const {
+bool COFFObjectFile::isSectionBSS(DataRefImpl Ref) const {
   const coff_section *Sec = toSec(Ref);
-  Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
-  return object_error::success;
+  return Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
 }
 
-std::error_code
-COFFObjectFile::isSectionRequiredForExecution(DataRefImpl Ref,
-                                              bool &Result) const {
-  // FIXME: Unimplemented
-  Result = true;
-  return object_error::success;
-}
-
-std::error_code COFFObjectFile::isSectionVirtual(DataRefImpl Ref,
-                                                 bool &Result) const {
+bool COFFObjectFile::isSectionRequiredForExecution(DataRefImpl Ref) const {
+  // Sections marked 'Info', 'Remove', or 'Discardable' aren't required for
+  // execution.
   const coff_section *Sec = toSec(Ref);
-  Result = Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
-  return object_error::success;
+  return !(Sec->Characteristics &
+           (COFF::IMAGE_SCN_LNK_INFO | COFF::IMAGE_SCN_LNK_REMOVE |
+            COFF::IMAGE_SCN_MEM_DISCARDABLE));
 }
 
-std::error_code COFFObjectFile::isSectionZeroInit(DataRefImpl Ref,
-                                                  bool &Result) const {
-  // FIXME: Unimplemented.
-  Result = false;
-  return object_error::success;
+bool COFFObjectFile::isSectionVirtual(DataRefImpl Ref) const {
+  const coff_section *Sec = toSec(Ref);
+  return Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
 }
 
-std::error_code COFFObjectFile::isSectionReadOnlyData(DataRefImpl Ref,
-                                                      bool &Result) const {
-  // FIXME: Unimplemented.
-  Result = false;
-  return object_error::success;
+bool COFFObjectFile::isSectionZeroInit(DataRefImpl Ref) const {
+  const coff_section *Sec = toSec(Ref);
+  return Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
 }
 
-std::error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl SecRef,
-                                                      DataRefImpl SymbRef,
-                                                      bool &Result) const {
+bool COFFObjectFile::isSectionReadOnlyData(DataRefImpl Ref) const {
+  const coff_section *Sec = toSec(Ref);
+  // Check if it's any sort of data section.
+  if (!(Sec->Characteristics & (COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
+                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)))
+    return false;
+  // If it's writable or executable or contains code, it isn't read-only data.
+  if (Sec->Characteristics &
+      (COFF::IMAGE_SCN_CNT_CODE | COFF::IMAGE_SCN_MEM_EXECUTE |
+       COFF::IMAGE_SCN_MEM_WRITE))
+    return false;
+  return true;
+}
+
+bool COFFObjectFile::sectionContainsSymbol(DataRefImpl SecRef,
+                                           DataRefImpl SymbRef) const {
   const coff_section *Sec = toSec(SecRef);
-  const coff_symbol *Symb = toSymb(SymbRef);
-  const coff_section *SymbSec = nullptr;
-  if (std::error_code EC = getSection(Symb->SectionNumber, SymbSec))
-    return EC;
-  if (SymbSec == Sec)
-    Result = true;
-  else
-    Result = false;
-  return object_error::success;
-}
-
-relocation_iterator COFFObjectFile::section_rel_begin(DataRefImpl Ref) const {
-  const coff_section *Sec = toSec(Ref);
-  DataRefImpl Ret;
-  if (Sec->NumberOfRelocations == 0) {
-    Ret.p = 0;
-  } else {
-    auto begin = reinterpret_cast<const coff_relocation*>(
-        base() + Sec->PointerToRelocations);
-    if (Sec->hasExtendedRelocations()) {
-      // Skip the first relocation entry repurposed to store the number of
-      // relocations.
-      begin++;
-    }
-    Ret.p = reinterpret_cast<uintptr_t>(begin);
-  }
-  return relocation_iterator(RelocationRef(Ret, this));
+  COFFSymbolRef Symb = getCOFFSymbol(SymbRef);
+  int32_t SecNumber = (Sec - SectionTable) + 1;
+  return SecNumber == Symb.getSectionNumber();
 }
 
 static uint32_t getNumberOfRelocations(const coff_section *Sec,
-                                       const uint8_t *base) {
+                                       MemoryBufferRef M, const uint8_t *base) {
   // The field for the number of relocations in COFF section table is only
   // 16-bit wide. If a section has more than 65535 relocations, 0xFFFF is set to
   // NumberOfRelocations field, and the actual relocation count is stored in the
   // VirtualAddress field in the first relocation entry.
   if (Sec->hasExtendedRelocations()) {
-    auto *FirstReloc = reinterpret_cast<const coff_relocation*>(
-        base + Sec->PointerToRelocations);
+    const coff_relocation *FirstReloc;
+    if (getObject(FirstReloc, M, reinterpret_cast<const coff_relocation*>(
+        base + Sec->PointerToRelocations)))
+      return 0;
     return FirstReloc->VirtualAddress;
   }
   return Sec->NumberOfRelocations;
 }
 
+static const coff_relocation *
+getFirstReloc(const coff_section *Sec, MemoryBufferRef M, const uint8_t *Base) {
+  uint64_t NumRelocs = getNumberOfRelocations(Sec, M, Base);
+  if (!NumRelocs)
+    return nullptr;
+  auto begin = reinterpret_cast<const coff_relocation *>(
+      Base + Sec->PointerToRelocations);
+  if (Sec->hasExtendedRelocations()) {
+    // Skip the first relocation entry repurposed to store the number of
+    // relocations.
+    begin++;
+  }
+  if (checkOffset(M, uintptr_t(begin), sizeof(coff_relocation) * NumRelocs))
+    return nullptr;
+  return begin;
+}
+
+relocation_iterator COFFObjectFile::section_rel_begin(DataRefImpl Ref) const {
+  const coff_section *Sec = toSec(Ref);
+  const coff_relocation *begin = getFirstReloc(Sec, Data, base());
+  DataRefImpl Ret;
+  Ret.p = reinterpret_cast<uintptr_t>(begin);
+  return relocation_iterator(RelocationRef(Ret, this));
+}
+
 relocation_iterator COFFObjectFile::section_rel_end(DataRefImpl Ref) const {
   const coff_section *Sec = toSec(Ref);
+  const coff_relocation *I = getFirstReloc(Sec, Data, base());
+  if (I)
+    I += getNumberOfRelocations(Sec, Data, base());
   DataRefImpl Ret;
-  if (Sec->NumberOfRelocations == 0) {
-    Ret.p = 0;
-  } else {
-    auto begin = reinterpret_cast<const coff_relocation*>(
-        base() + Sec->PointerToRelocations);
-    uint32_t NumReloc = getNumberOfRelocations(Sec, base());
-    Ret.p = reinterpret_cast<uintptr_t>(begin + NumReloc);
-  }
+  Ret.p = reinterpret_cast<uintptr_t>(I);
   return relocation_iterator(RelocationRef(Ret, this));
 }
 
 // Initialize the pointer to the symbol table.
 std::error_code COFFObjectFile::initSymbolTablePtr() {
-  if (std::error_code EC = getObject(
-          SymbolTable, *Data, base() + COFFHeader->PointerToSymbolTable,
-          COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
-    return EC;
+  if (COFFHeader)
+    if (std::error_code EC = getObject(
+            SymbolTable16, Data, base() + getPointerToSymbolTable(),
+            (uint64_t)getNumberOfSymbols() * getSymbolTableEntrySize()))
+      return EC;
+
+  if (COFFBigObjHeader)
+    if (std::error_code EC = getObject(
+            SymbolTable32, Data, base() + getPointerToSymbolTable(),
+            (uint64_t)getNumberOfSymbols() * getSymbolTableEntrySize()))
+      return EC;
 
   // Find string table. The first four byte of the string table contains the
   // total size of the string table, including the size field itself. If the
   // string table is empty, the value of the first four byte would be 4.
-  const uint8_t *StringTableAddr =
-      base() + COFFHeader->PointerToSymbolTable +
-      COFFHeader->NumberOfSymbols * sizeof(coff_symbol);
+  uint32_t StringTableOffset = getPointerToSymbolTable() +
+                               getNumberOfSymbols() * getSymbolTableEntrySize();
+  const uint8_t *StringTableAddr = base() + StringTableOffset;
   const ulittle32_t *StringTableSizePtr;
-  if (std::error_code EC =
-          getObject(StringTableSizePtr, *Data, StringTableAddr))
+  if (std::error_code EC = getObject(StringTableSizePtr, Data, StringTableAddr))
     return EC;
   StringTableSize = *StringTableSizePtr;
   if (std::error_code EC =
-          getObject(StringTable, *Data, StringTableAddr, StringTableSize))
+          getObject(StringTable, Data, StringTableAddr, StringTableSize))
     return EC;
 
   // Treat table sizes < 4 as empty because contrary to the PECOFF spec, some
@@ -477,8 +543,9 @@
     return object_error::success;
 
   uint32_t ImportTableRva = DataEntry->RelativeVirtualAddress;
+  // -1 because the last entry is the null entry.
   NumberOfImportDirectory = DataEntry->Size /
-      sizeof(import_directory_table_entry);
+      sizeof(import_directory_table_entry) - 1;
 
   // Find the section that contains the RVA. This is needed because the RVA is
   // the import table's memory address which is different from its file offset.
@@ -490,6 +557,26 @@
   return object_error::success;
 }
 
+// Initializes DelayImportDirectory and NumberOfDelayImportDirectory.
+std::error_code COFFObjectFile::initDelayImportTablePtr() {
+  const data_directory *DataEntry;
+  if (getDataDirectory(COFF::DELAY_IMPORT_DESCRIPTOR, DataEntry))
+    return object_error::success;
+  if (DataEntry->RelativeVirtualAddress == 0)
+    return object_error::success;
+
+  uint32_t RVA = DataEntry->RelativeVirtualAddress;
+  NumberOfDelayImportDirectory = DataEntry->Size /
+      sizeof(delay_import_directory_table_entry) - 1;
+
+  uintptr_t IntPtr = 0;
+  if (std::error_code EC = getRvaPtr(RVA, IntPtr))
+    return EC;
+  DelayImportDirectory = reinterpret_cast<
+      const delay_import_directory_table_entry *>(IntPtr);
+  return object_error::success;
+}
+
 // Find the export table.
 std::error_code COFFObjectFile::initExportTablePtr() {
   // First, we get the RVA of the export table. If the file lacks a pointer to
@@ -511,15 +598,34 @@
   return object_error::success;
 }
 
-COFFObjectFile::COFFObjectFile(std::unique_ptr<MemoryBuffer> Object,
-                               std::error_code &EC)
-    : ObjectFile(Binary::ID_COFF, std::move(Object)), COFFHeader(nullptr),
-      PE32Header(nullptr), PE32PlusHeader(nullptr), DataDirectory(nullptr),
-      SectionTable(nullptr), SymbolTable(nullptr), StringTable(nullptr),
-      StringTableSize(0), ImportDirectory(nullptr), NumberOfImportDirectory(0),
-      ExportDirectory(nullptr) {
+std::error_code COFFObjectFile::initBaseRelocPtr() {
+  const data_directory *DataEntry;
+  if (getDataDirectory(COFF::BASE_RELOCATION_TABLE, DataEntry))
+    return object_error::success;
+  if (DataEntry->RelativeVirtualAddress == 0)
+    return object_error::success;
+
+  uintptr_t IntPtr = 0;
+  if (std::error_code EC = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr))
+    return EC;
+  BaseRelocHeader = reinterpret_cast<const coff_base_reloc_block_header *>(
+      IntPtr);
+  BaseRelocEnd = reinterpret_cast<coff_base_reloc_block_header *>(
+      IntPtr + DataEntry->Size);
+  return object_error::success;
+}
+
+COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
+    : ObjectFile(Binary::ID_COFF, Object), COFFHeader(nullptr),
+      COFFBigObjHeader(nullptr), PE32Header(nullptr), PE32PlusHeader(nullptr),
+      DataDirectory(nullptr), SectionTable(nullptr), SymbolTable16(nullptr),
+      SymbolTable32(nullptr), StringTable(nullptr), StringTableSize(0),
+      ImportDirectory(nullptr), NumberOfImportDirectory(0),
+      DelayImportDirectory(nullptr), NumberOfDelayImportDirectory(0),
+      ExportDirectory(nullptr), BaseRelocHeader(nullptr),
+      BaseRelocEnd(nullptr) {
   // Check that we at least have enough room for a header.
-  if (!checkSize(*Data, EC, sizeof(coff_file_header)))
+  if (!checkSize(Data, EC, sizeof(coff_file_header)))
     return;
 
   // The current location in the file where we are looking at.
@@ -530,37 +636,66 @@
   bool HasPEHeader = false;
 
   // Check if this is a PE/COFF file.
-  if (base()[0] == 0x4d && base()[1] == 0x5a) {
+  if (checkSize(Data, EC, sizeof(dos_header) + sizeof(COFF::PEMagic))) {
     // PE/COFF, seek through MS-DOS compatibility stub and 4-byte
     // PE signature to find 'normal' COFF header.
-    if (!checkSize(*Data, EC, 0x3c + 8))
-      return;
-    CurPtr = *reinterpret_cast<const ulittle16_t *>(base() + 0x3c);
-    // Check the PE magic bytes. ("PE\0\0")
-    if (std::memcmp(base() + CurPtr, "PE\0\0", 4) != 0) {
-      EC = object_error::parse_failed;
-      return;
+    const auto *DH = reinterpret_cast<const dos_header *>(base());
+    if (DH->Magic[0] == 'M' && DH->Magic[1] == 'Z') {
+      CurPtr = DH->AddressOfNewExeHeader;
+      // Check the PE magic bytes. ("PE\0\0")
+      if (memcmp(base() + CurPtr, COFF::PEMagic, sizeof(COFF::PEMagic)) != 0) {
+        EC = object_error::parse_failed;
+        return;
+      }
+      CurPtr += sizeof(COFF::PEMagic); // Skip the PE magic bytes.
+      HasPEHeader = true;
     }
-    CurPtr += 4; // Skip the PE magic bytes.
-    HasPEHeader = true;
   }
 
-  if ((EC = getObject(COFFHeader, *Data, base() + CurPtr)))
+  if ((EC = getObject(COFFHeader, Data, base() + CurPtr)))
     return;
-  CurPtr += sizeof(coff_file_header);
+
+  // It might be a bigobj file, let's check.  Note that COFF bigobj and COFF
+  // import libraries share a common prefix but bigobj is more restrictive.
+  if (!HasPEHeader && COFFHeader->Machine == COFF::IMAGE_FILE_MACHINE_UNKNOWN &&
+      COFFHeader->NumberOfSections == uint16_t(0xffff) &&
+      checkSize(Data, EC, sizeof(coff_bigobj_file_header))) {
+    if ((EC = getObject(COFFBigObjHeader, Data, base() + CurPtr)))
+      return;
+
+    // Verify that we are dealing with bigobj.
+    if (COFFBigObjHeader->Version >= COFF::BigObjHeader::MinBigObjectVersion &&
+        std::memcmp(COFFBigObjHeader->UUID, COFF::BigObjMagic,
+                    sizeof(COFF::BigObjMagic)) == 0) {
+      COFFHeader = nullptr;
+      CurPtr += sizeof(coff_bigobj_file_header);
+    } else {
+      // It's not a bigobj.
+      COFFBigObjHeader = nullptr;
+    }
+  }
+  if (COFFHeader) {
+    // The prior checkSize call may have failed.  This isn't a hard error
+    // because we were just trying to sniff out bigobj.
+    EC = object_error::success;
+    CurPtr += sizeof(coff_file_header);
+
+    if (COFFHeader->isImportLibrary())
+      return;
+  }
 
   if (HasPEHeader) {
     const pe32_header *Header;
-    if ((EC = getObject(Header, *Data, base() + CurPtr)))
+    if ((EC = getObject(Header, Data, base() + CurPtr)))
       return;
 
     const uint8_t *DataDirAddr;
     uint64_t DataDirSize;
-    if (Header->Magic == 0x10b) {
+    if (Header->Magic == COFF::PE32Header::PE32) {
       PE32Header = Header;
       DataDirAddr = base() + CurPtr + sizeof(pe32_header);
       DataDirSize = sizeof(data_directory) * PE32Header->NumberOfRvaAndSize;
-    } else if (Header->Magic == 0x20b) {
+    } else if (Header->Magic == COFF::PE32Header::PE32_PLUS) {
       PE32PlusHeader = reinterpret_cast<const pe32plus_header *>(Header);
       DataDirAddr = base() + CurPtr + sizeof(pe32plus_header);
       DataDirSize = sizeof(data_directory) * PE32PlusHeader->NumberOfRvaAndSize;
@@ -569,37 +704,47 @@
       EC = object_error::parse_failed;
       return;
     }
-    if ((EC = getObject(DataDirectory, *Data, DataDirAddr, DataDirSize)))
+    if ((EC = getObject(DataDirectory, Data, DataDirAddr, DataDirSize)))
       return;
     CurPtr += COFFHeader->SizeOfOptionalHeader;
   }
 
-  if (COFFHeader->isImportLibrary())
-    return;
-
-  if ((EC = getObject(SectionTable, *Data, base() + CurPtr,
-                      COFFHeader->NumberOfSections * sizeof(coff_section))))
+  if ((EC = getObject(SectionTable, Data, base() + CurPtr,
+                      (uint64_t)getNumberOfSections() * sizeof(coff_section))))
     return;
 
   // Initialize the pointer to the symbol table.
-  if (COFFHeader->PointerToSymbolTable != 0)
+  if (getPointerToSymbolTable() != 0) {
     if ((EC = initSymbolTablePtr()))
       return;
+  } else {
+    // We had better not have any symbols if we don't have a symbol table.
+    if (getNumberOfSymbols() != 0) {
+      EC = object_error::parse_failed;
+      return;
+    }
+  }
 
   // Initialize the pointer to the beginning of the import table.
   if ((EC = initImportTablePtr()))
     return;
+  if ((EC = initDelayImportTablePtr()))
+    return;
 
   // Initialize the pointer to the export table.
   if ((EC = initExportTablePtr()))
     return;
 
+  // Initialize the pointer to the base relocation table.
+  if ((EC = initBaseRelocPtr()))
+    return;
+
   EC = object_error::success;
 }
 
 basic_symbol_iterator COFFObjectFile::symbol_begin_impl() const {
   DataRefImpl Ret;
-  Ret.p = reinterpret_cast<uintptr_t>(SymbolTable);
+  Ret.p = getSymbolTable();
   return basic_symbol_iterator(SymbolRef(Ret, this));
 }
 
@@ -610,21 +755,6 @@
   return basic_symbol_iterator(SymbolRef(Ret, this));
 }
 
-library_iterator COFFObjectFile::needed_library_begin() const {
-  // TODO: implement
-  report_fatal_error("Libraries needed unimplemented in COFFObjectFile");
-}
-
-library_iterator COFFObjectFile::needed_library_end() const {
-  // TODO: implement
-  report_fatal_error("Libraries needed unimplemented in COFFObjectFile");
-}
-
-StringRef COFFObjectFile::getLoadName() const {
-  // COFF does not have this field.
-  return "";
-}
-
 import_directory_iterator COFFObjectFile::import_directory_begin() const {
   return import_directory_iterator(
       ImportDirectoryEntryRef(ImportDirectory, 0, this));
@@ -635,6 +765,19 @@
       ImportDirectoryEntryRef(ImportDirectory, NumberOfImportDirectory, this));
 }
 
+delay_import_directory_iterator
+COFFObjectFile::delay_import_directory_begin() const {
+  return delay_import_directory_iterator(
+      DelayImportDirectoryEntryRef(DelayImportDirectory, 0, this));
+}
+
+delay_import_directory_iterator
+COFFObjectFile::delay_import_directory_end() const {
+  return delay_import_directory_iterator(
+      DelayImportDirectoryEntryRef(
+          DelayImportDirectory, NumberOfDelayImportDirectory, this));
+}
+
 export_directory_iterator COFFObjectFile::export_directory_begin() const {
   return export_directory_iterator(
       ExportDirectoryEntryRef(ExportDirectory, 0, this));
@@ -656,18 +799,26 @@
 
 section_iterator COFFObjectFile::section_end() const {
   DataRefImpl Ret;
-  int NumSections = COFFHeader->isImportLibrary()
-      ? 0 : COFFHeader->NumberOfSections;
+  int NumSections =
+      COFFHeader && COFFHeader->isImportLibrary() ? 0 : getNumberOfSections();
   Ret.p = reinterpret_cast<uintptr_t>(SectionTable + NumSections);
   return section_iterator(SectionRef(Ret, this));
 }
 
+base_reloc_iterator COFFObjectFile::base_reloc_begin() const {
+  return base_reloc_iterator(BaseRelocRef(BaseRelocHeader, this));
+}
+
+base_reloc_iterator COFFObjectFile::base_reloc_end() const {
+  return base_reloc_iterator(BaseRelocRef(BaseRelocEnd, this));
+}
+
 uint8_t COFFObjectFile::getBytesInAddress() const {
   return getArch() == Triple::x86_64 ? 8 : 4;
 }
 
 StringRef COFFObjectFile::getFileFormatName() const {
-  switch(COFFHeader->Machine) {
+  switch(getMachine()) {
   case COFF::IMAGE_FILE_MACHINE_I386:
     return "COFF-i386";
   case COFF::IMAGE_FILE_MACHINE_AMD64:
@@ -680,7 +831,7 @@
 }
 
 unsigned COFFObjectFile::getArch() const {
-  switch(COFFHeader->Machine) {
+  switch (getMachine()) {
   case COFF::IMAGE_FILE_MACHINE_I386:
     return Triple::x86;
   case COFF::IMAGE_FILE_MACHINE_AMD64:
@@ -692,16 +843,24 @@
   }
 }
 
-// This method is kept here because lld uses this. As soon as we make
-// lld to use getCOFFHeader, this method will be removed.
-std::error_code COFFObjectFile::getHeader(const coff_file_header *&Res) const {
-  return getCOFFHeader(Res);
+iterator_range<import_directory_iterator>
+COFFObjectFile::import_directories() const {
+  return make_range(import_directory_begin(), import_directory_end());
 }
 
-std::error_code
-COFFObjectFile::getCOFFHeader(const coff_file_header *&Res) const {
-  Res = COFFHeader;
-  return object_error::success;
+iterator_range<delay_import_directory_iterator>
+COFFObjectFile::delay_import_directories() const {
+  return make_range(delay_import_directory_begin(),
+                    delay_import_directory_end());
+}
+
+iterator_range<export_directory_iterator>
+COFFObjectFile::export_directories() const {
+  return make_range(export_directory_begin(), export_directory_end());
+}
+
+iterator_range<base_reloc_iterator> COFFObjectFile::base_relocs() const {
+  return make_range(base_reloc_begin(), base_reloc_end());
 }
 
 std::error_code COFFObjectFile::getPE32Header(const pe32_header *&Res) const {
@@ -719,28 +878,32 @@
 COFFObjectFile::getDataDirectory(uint32_t Index,
                                  const data_directory *&Res) const {
   // Error if if there's no data directory or the index is out of range.
-  if (!DataDirectory)
+  if (!DataDirectory) {
+    Res = nullptr;
     return object_error::parse_failed;
+  }
   assert(PE32Header || PE32PlusHeader);
   uint32_t NumEnt = PE32Header ? PE32Header->NumberOfRvaAndSize
                                : PE32PlusHeader->NumberOfRvaAndSize;
-  if (Index > NumEnt)
+  if (Index >= NumEnt) {
+    Res = nullptr;
     return object_error::parse_failed;
+  }
   Res = &DataDirectory[Index];
   return object_error::success;
 }
 
 std::error_code COFFObjectFile::getSection(int32_t Index,
                                            const coff_section *&Result) const {
-  // Check for special index values.
+  Result = nullptr;
   if (COFF::isReservedSectionNumber(Index))
-    Result = nullptr;
-  else if (Index > 0 && Index <= COFFHeader->NumberOfSections)
+    return object_error::success;
+  if (static_cast<uint32_t>(Index) <= getNumberOfSections()) {
     // We already verified the section table data, so no need to check again.
     Result = SectionTable + (Index - 1);
-  else
-    return object_error::parse_failed;
-  return object_error::success;
+    return object_error::success;
+  }
+  return object_error::parse_failed;
 }
 
 std::error_code COFFObjectFile::getString(uint32_t Offset,
@@ -754,71 +917,62 @@
   return object_error::success;
 }
 
-std::error_code COFFObjectFile::getSymbol(uint32_t Index,
-                                          const coff_symbol *&Result) const {
-  if (Index < COFFHeader->NumberOfSymbols)
-    Result = SymbolTable + Index;
-  else
-    return object_error::parse_failed;
-  return object_error::success;
-}
-
-std::error_code COFFObjectFile::getSymbolName(const coff_symbol *Symbol,
+std::error_code COFFObjectFile::getSymbolName(COFFSymbolRef Symbol,
                                               StringRef &Res) const {
   // Check for string table entry. First 4 bytes are 0.
-  if (Symbol->Name.Offset.Zeroes == 0) {
-    uint32_t Offset = Symbol->Name.Offset.Offset;
+  if (Symbol.getStringTableOffset().Zeroes == 0) {
+    uint32_t Offset = Symbol.getStringTableOffset().Offset;
     if (std::error_code EC = getString(Offset, Res))
       return EC;
     return object_error::success;
   }
 
-  if (Symbol->Name.ShortName[7] == 0)
+  if (Symbol.getShortName()[COFF::NameSize - 1] == 0)
     // Null terminated, let ::strlen figure out the length.
-    Res = StringRef(Symbol->Name.ShortName);
+    Res = StringRef(Symbol.getShortName());
   else
     // Not null terminated, use all 8 bytes.
-    Res = StringRef(Symbol->Name.ShortName, 8);
+    Res = StringRef(Symbol.getShortName(), COFF::NameSize);
   return object_error::success;
 }
 
-ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData(
-                                  const coff_symbol *Symbol) const {
+ArrayRef<uint8_t>
+COFFObjectFile::getSymbolAuxData(COFFSymbolRef Symbol) const {
   const uint8_t *Aux = nullptr;
 
-  if (Symbol->NumberOfAuxSymbols > 0) {
-  // AUX data comes immediately after the symbol in COFF
-    Aux = reinterpret_cast<const uint8_t *>(Symbol + 1);
+  size_t SymbolSize = getSymbolTableEntrySize();
+  if (Symbol.getNumberOfAuxSymbols() > 0) {
+    // AUX data comes immediately after the symbol in COFF
+    Aux = reinterpret_cast<const uint8_t *>(Symbol.getRawPtr()) + SymbolSize;
 # ifndef NDEBUG
     // Verify that the Aux symbol points to a valid entry in the symbol table.
     uintptr_t Offset = uintptr_t(Aux) - uintptr_t(base());
-    if (Offset < COFFHeader->PointerToSymbolTable
-        || Offset >= COFFHeader->PointerToSymbolTable
-           + (COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
+    if (Offset < getPointerToSymbolTable() ||
+        Offset >=
+            getPointerToSymbolTable() + (getNumberOfSymbols() * SymbolSize))
       report_fatal_error("Aux Symbol data was outside of symbol table.");
 
-    assert((Offset - COFFHeader->PointerToSymbolTable) % sizeof(coff_symbol)
-         == 0 && "Aux Symbol data did not point to the beginning of a symbol");
+    assert((Offset - getPointerToSymbolTable()) % SymbolSize == 0 &&
+           "Aux Symbol data did not point to the beginning of a symbol");
 # endif
   }
-  return ArrayRef<uint8_t>(Aux,
-                           Symbol->NumberOfAuxSymbols * sizeof(coff_symbol));
+  return makeArrayRef(Aux, Symbol.getNumberOfAuxSymbols() * SymbolSize);
 }
 
 std::error_code COFFObjectFile::getSectionName(const coff_section *Sec,
                                                StringRef &Res) const {
   StringRef Name;
-  if (Sec->Name[7] == 0)
+  if (Sec->Name[COFF::NameSize - 1] == 0)
     // Null terminated, let ::strlen figure out the length.
     Name = Sec->Name;
   else
     // Not null terminated, use all 8 bytes.
-    Name = StringRef(Sec->Name, 8);
+    Name = StringRef(Sec->Name, COFF::NameSize);
 
   // Check for string table entry. First byte is '/'.
-  if (Name[0] == '/') {
+  if (Name.startswith("/")) {
     uint32_t Offset;
-    if (Name[1] == '/') {
+    if (Name.startswith("//")) {
       if (decodeBase64StringEntry(Name.substr(2), Offset))
         return object_error::parse_failed;
     } else {
@@ -833,18 +987,41 @@
   return object_error::success;
 }
 
+uint64_t COFFObjectFile::getSectionSize(const coff_section *Sec) const {
+  // SizeOfRawData and VirtualSize change what they represent depending on
+  // whether or not we have an executable image.
+  //
+  // For object files, SizeOfRawData contains the size of section's data;
+  // VirtualSize is always zero.
+  //
+  // For executables, SizeOfRawData *must* be a multiple of FileAlignment; the
+  // actual section size is in VirtualSize.  It is possible for VirtualSize to
+  // be greater than SizeOfRawData; the contents past that point should be
+  // considered to be zero.
+  uint32_t SectionSize;
+  if (Sec->VirtualSize)
+    SectionSize = std::min(Sec->VirtualSize, Sec->SizeOfRawData);
+  else
+    SectionSize = Sec->SizeOfRawData;
+
+  return SectionSize;
+}
+
 std::error_code
 COFFObjectFile::getSectionContents(const coff_section *Sec,
                                    ArrayRef<uint8_t> &Res) const {
+  // PointerToRawData and SizeOfRawData won't make sense for BSS sections,
+  // don't do anything interesting for them.
+  assert((Sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) == 0 &&
+         "BSS sections don't have contents!");
   // The only thing that we need to verify is that the contents is contained
   // within the file bounds. We don't need to make sure it doesn't cover other
   // data, as there's nothing that says that is not allowed.
   uintptr_t ConStart = uintptr_t(base()) + Sec->PointerToRawData;
-  uintptr_t ConEnd = ConStart + Sec->SizeOfRawData;
-  if (ConEnd > uintptr_t(Data->getBufferEnd()))
+  uint32_t SectionSize = getSectionSize(Sec);
+  if (checkOffset(Data, ConStart, SectionSize))
     return object_error::parse_failed;
-  Res = ArrayRef<uint8_t>(reinterpret_cast<const unsigned char*>(ConStart),
-                          Sec->SizeOfRawData);
+  Res = makeArrayRef(reinterpret_cast<const uint8_t *>(ConStart), SectionSize);
   return object_error::success;
 }
 
@@ -864,14 +1041,26 @@
 
 std::error_code COFFObjectFile::getRelocationOffset(DataRefImpl Rel,
                                                     uint64_t &Res) const {
-  Res = toRel(Rel)->VirtualAddress;
+  const coff_relocation *R = toRel(Rel);
+  const support::ulittle32_t *VirtualAddressPtr;
+  if (std::error_code EC =
+          getObject(VirtualAddressPtr, Data, &R->VirtualAddress))
+    return EC;
+  Res = *VirtualAddressPtr;
   return object_error::success;
 }
 
 symbol_iterator COFFObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
-  const coff_relocation* R = toRel(Rel);
+  const coff_relocation *R = toRel(Rel);
   DataRefImpl Ref;
-  Ref.p = reinterpret_cast<uintptr_t>(SymbolTable + R->SymbolTableIndex);
+  if (R->SymbolTableIndex >= getNumberOfSymbols())
+    return symbol_end();
+  if (SymbolTable16)
+    Ref.p = reinterpret_cast<uintptr_t>(SymbolTable16 + R->SymbolTableIndex);
+  else if (SymbolTable32)
+    Ref.p = reinterpret_cast<uintptr_t>(SymbolTable32 + R->SymbolTableIndex);
+  else
+    return symbol_end();
   return symbol_iterator(SymbolRef(Ref, this));
 }
 
@@ -887,9 +1076,16 @@
   return toSec(Section.getRawDataRefImpl());
 }
 
-const coff_symbol *
-COFFObjectFile::getCOFFSymbol(const SymbolRef &Symbol) const {
-  return toSymb(Symbol.getRawDataRefImpl());
+COFFSymbolRef COFFObjectFile::getCOFFSymbol(const DataRefImpl &Ref) const {
+  if (SymbolTable16)
+    return toSymb<coff_symbol16>(Ref);
+  if (SymbolTable32)
+    return toSymb<coff_symbol32>(Ref);
+  llvm_unreachable("no symbol table pointer!");
+}
+
+COFFSymbolRef COFFObjectFile::getCOFFSymbol(const SymbolRef &Symbol) const {
+  return getCOFFSymbol(Symbol.getRawDataRefImpl());
 }
 
 const coff_relocation *
@@ -907,7 +1103,7 @@
                                       SmallVectorImpl<char> &Result) const {
   const coff_relocation *Reloc = toRel(Rel);
   StringRef Res;
-  switch (COFFHeader->Machine) {
+  switch (getMachine()) {
   case COFF::IMAGE_FILE_MACHINE_AMD64:
     switch (Reloc->Type) {
     LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ABSOLUTE);
@@ -982,11 +1178,11 @@
 COFFObjectFile::getRelocationValueString(DataRefImpl Rel,
                                          SmallVectorImpl<char> &Result) const {
   const coff_relocation *Reloc = toRel(Rel);
-  const coff_symbol *Symb = nullptr;
-  if (std::error_code EC = getSymbol(Reloc->SymbolTableIndex, Symb))
-    return EC;
   DataRefImpl Sym;
-  Sym.p = reinterpret_cast<uintptr_t>(Symb);
+  ErrorOr<COFFSymbolRef> Symb = getSymbol(Reloc->SymbolTableIndex);
+  if (std::error_code EC = Symb.getError())
+    return EC;
+  Sym.p = reinterpret_cast<uintptr_t>(Symb->getRawPtr());
   StringRef SymName;
   if (std::error_code EC = getSymbolName(Sym, SymName))
     return EC;
@@ -994,14 +1190,8 @@
   return object_error::success;
 }
 
-std::error_code COFFObjectFile::getLibraryNext(DataRefImpl LibData,
-                                               LibraryRef &Result) const {
-  report_fatal_error("getLibraryNext not implemented in COFFObjectFile");
-}
-
-std::error_code COFFObjectFile::getLibraryPath(DataRefImpl LibData,
-                                               StringRef &Result) const {
-  report_fatal_error("getLibraryPath not implemented in COFFObjectFile");
+bool COFFObjectFile::isRelocatableObject() const {
+  return !DataDirectory;
 }
 
 bool ImportDirectoryEntryRef::
@@ -1015,29 +1205,148 @@
 
 std::error_code ImportDirectoryEntryRef::getImportTableEntry(
     const import_directory_table_entry *&Result) const {
-  Result = ImportTable;
+  Result = ImportTable + Index;
   return object_error::success;
 }
 
+static imported_symbol_iterator
+makeImportedSymbolIterator(const COFFObjectFile *Object,
+                           uintptr_t Ptr, int Index) {
+  if (Object->getBytesInAddress() == 4) {
+    auto *P = reinterpret_cast<const import_lookup_table_entry32 *>(Ptr);
+    return imported_symbol_iterator(ImportedSymbolRef(P, Index, Object));
+  }
+  auto *P = reinterpret_cast<const import_lookup_table_entry64 *>(Ptr);
+  return imported_symbol_iterator(ImportedSymbolRef(P, Index, Object));
+}
+
+static imported_symbol_iterator
+importedSymbolBegin(uint32_t RVA, const COFFObjectFile *Object) {
+  uintptr_t IntPtr = 0;
+  Object->getRvaPtr(RVA, IntPtr);
+  return makeImportedSymbolIterator(Object, IntPtr, 0);
+}
+
+static imported_symbol_iterator
+importedSymbolEnd(uint32_t RVA, const COFFObjectFile *Object) {
+  uintptr_t IntPtr = 0;
+  Object->getRvaPtr(RVA, IntPtr);
+  // Forward the pointer to the last entry which is null.
+  int Index = 0;
+  if (Object->getBytesInAddress() == 4) {
+    auto *Entry = reinterpret_cast<ulittle32_t *>(IntPtr);
+    while (*Entry++)
+      ++Index;
+  } else {
+    auto *Entry = reinterpret_cast<ulittle64_t *>(IntPtr);
+    while (*Entry++)
+      ++Index;
+  }
+  return makeImportedSymbolIterator(Object, IntPtr, Index);
+}
+
+imported_symbol_iterator
+ImportDirectoryEntryRef::imported_symbol_begin() const {
+  return importedSymbolBegin(ImportTable[Index].ImportLookupTableRVA,
+                             OwningObject);
+}
+
+imported_symbol_iterator
+ImportDirectoryEntryRef::imported_symbol_end() const {
+  return importedSymbolEnd(ImportTable[Index].ImportLookupTableRVA,
+                           OwningObject);
+}
+
+iterator_range<imported_symbol_iterator>
+ImportDirectoryEntryRef::imported_symbols() const {
+  return make_range(imported_symbol_begin(), imported_symbol_end());
+}
+
 std::error_code ImportDirectoryEntryRef::getName(StringRef &Result) const {
   uintptr_t IntPtr = 0;
   if (std::error_code EC =
-          OwningObject->getRvaPtr(ImportTable->NameRVA, IntPtr))
+          OwningObject->getRvaPtr(ImportTable[Index].NameRVA, IntPtr))
     return EC;
   Result = StringRef(reinterpret_cast<const char *>(IntPtr));
   return object_error::success;
 }
 
+std::error_code
+ImportDirectoryEntryRef::getImportLookupTableRVA(uint32_t  &Result) const {
+  Result = ImportTable[Index].ImportLookupTableRVA;
+  return object_error::success;
+}
+
+std::error_code
+ImportDirectoryEntryRef::getImportAddressTableRVA(uint32_t &Result) const {
+  Result = ImportTable[Index].ImportAddressTableRVA;
+  return object_error::success;
+}
+
 std::error_code ImportDirectoryEntryRef::getImportLookupEntry(
     const import_lookup_table_entry32 *&Result) const {
   uintptr_t IntPtr = 0;
-  if (std::error_code EC =
-          OwningObject->getRvaPtr(ImportTable->ImportLookupTableRVA, IntPtr))
+  uint32_t RVA = ImportTable[Index].ImportLookupTableRVA;
+  if (std::error_code EC = OwningObject->getRvaPtr(RVA, IntPtr))
     return EC;
   Result = reinterpret_cast<const import_lookup_table_entry32 *>(IntPtr);
   return object_error::success;
 }
 
+bool DelayImportDirectoryEntryRef::
+operator==(const DelayImportDirectoryEntryRef &Other) const {
+  return Table == Other.Table && Index == Other.Index;
+}
+
+void DelayImportDirectoryEntryRef::moveNext() {
+  ++Index;
+}
+
+imported_symbol_iterator
+DelayImportDirectoryEntryRef::imported_symbol_begin() const {
+  return importedSymbolBegin(Table[Index].DelayImportNameTable,
+                             OwningObject);
+}
+
+imported_symbol_iterator
+DelayImportDirectoryEntryRef::imported_symbol_end() const {
+  return importedSymbolEnd(Table[Index].DelayImportNameTable,
+                           OwningObject);
+}
+
+iterator_range<imported_symbol_iterator>
+DelayImportDirectoryEntryRef::imported_symbols() const {
+  return make_range(imported_symbol_begin(), imported_symbol_end());
+}
+
+std::error_code DelayImportDirectoryEntryRef::getName(StringRef &Result) const {
+  uintptr_t IntPtr = 0;
+  if (std::error_code EC = OwningObject->getRvaPtr(Table[Index].Name, IntPtr))
+    return EC;
+  Result = StringRef(reinterpret_cast<const char *>(IntPtr));
+  return object_error::success;
+}
+
+std::error_code DelayImportDirectoryEntryRef::
+getDelayImportTable(const delay_import_directory_table_entry *&Result) const {
+  Result = Table;
+  return object_error::success;
+}
+
+std::error_code DelayImportDirectoryEntryRef::
+getImportAddress(int AddrIndex, uint64_t &Result) const {
+  uint32_t RVA = Table[Index].DelayImportAddressTable +
+      AddrIndex * (OwningObject->is64() ? 8 : 4);
+  uintptr_t IntPtr = 0;
+  if (std::error_code EC = OwningObject->getRvaPtr(RVA, IntPtr))
+    return EC;
+  if (OwningObject->is64())
+    Result = *reinterpret_cast<const ulittle64_t *>(IntPtr);
+  else
+    Result = *reinterpret_cast<const ulittle32_t *>(IntPtr);
+  return object_error::success;
+}
+
 bool ExportDirectoryEntryRef::
 operator==(const ExportDirectoryEntryRef &Other) const {
   return ExportTable == Other.ExportTable && Index == Other.Index;
@@ -1112,12 +1421,98 @@
   return object_error::success;
 }
 
-ErrorOr<ObjectFile *>
-ObjectFile::createCOFFObjectFile(std::unique_ptr<MemoryBuffer> Object) {
+bool ImportedSymbolRef::
+operator==(const ImportedSymbolRef &Other) const {
+  return Entry32 == Other.Entry32 && Entry64 == Other.Entry64
+      && Index == Other.Index;
+}
+
+void ImportedSymbolRef::moveNext() {
+  ++Index;
+}
+
+std::error_code
+ImportedSymbolRef::getSymbolName(StringRef &Result) const {
+  uint32_t RVA;
+  if (Entry32) {
+    // If a symbol is imported only by ordinal, it has no name.
+    if (Entry32[Index].isOrdinal())
+      return object_error::success;
+    RVA = Entry32[Index].getHintNameRVA();
+  } else {
+    if (Entry64[Index].isOrdinal())
+      return object_error::success;
+    RVA = Entry64[Index].getHintNameRVA();
+  }
+  uintptr_t IntPtr = 0;
+  if (std::error_code EC = OwningObject->getRvaPtr(RVA, IntPtr))
+    return EC;
+  // +2 because the first two bytes are the hint.
+  Result = StringRef(reinterpret_cast<const char *>(IntPtr + 2));
+  return object_error::success;
+}
+
+std::error_code ImportedSymbolRef::getOrdinal(uint16_t &Result) const {
+  uint32_t RVA;
+  if (Entry32) {
+    if (Entry32[Index].isOrdinal()) {
+      Result = Entry32[Index].getOrdinal();
+      return object_error::success;
+    }
+    RVA = Entry32[Index].getHintNameRVA();
+  } else {
+    if (Entry64[Index].isOrdinal()) {
+      Result = Entry64[Index].getOrdinal();
+      return object_error::success;
+    }
+    RVA = Entry64[Index].getHintNameRVA();
+  }
+  uintptr_t IntPtr = 0;
+  if (std::error_code EC = OwningObject->getRvaPtr(RVA, IntPtr))
+    return EC;
+  Result = *reinterpret_cast<const ulittle16_t *>(IntPtr);
+  return object_error::success;
+}
+
+ErrorOr<std::unique_ptr<COFFObjectFile>>
+ObjectFile::createCOFFObjectFile(MemoryBufferRef Object) {
   std::error_code EC;
-  std::unique_ptr<COFFObjectFile> Ret(
-      new COFFObjectFile(std::move(Object), EC));
+  std::unique_ptr<COFFObjectFile> Ret(new COFFObjectFile(Object, EC));
   if (EC)
     return EC;
-  return Ret.release();
+  return std::move(Ret);
+}
+
+bool BaseRelocRef::operator==(const BaseRelocRef &Other) const {
+  return Header == Other.Header && Index == Other.Index;
+}
+
+void BaseRelocRef::moveNext() {
+  // Header->BlockSize is the size of the current block, including the
+  // size of the header itself.
+  uint32_t Size = sizeof(*Header) +
+      sizeof(coff_base_reloc_block_entry) * (Index + 1);
+  if (Size == Header->BlockSize) {
+    // .reloc contains a list of base relocation blocks. Each block
+    // consists of the header followed by entries. The header contains
+    // how many entries will follow. When we reach the end of the
+    // current block, proceed to the next block.
+    Header = reinterpret_cast<const coff_base_reloc_block_header *>(
+        reinterpret_cast<const uint8_t *>(Header) + Size);
+    Index = 0;
+  } else {
+    ++Index;
+  }
+}
+
+std::error_code BaseRelocRef::getType(uint8_t &Type) const {
+  auto *Entry = reinterpret_cast<const coff_base_reloc_block_entry *>(Header + 1);
+  Type = Entry[Index].getType();
+  return object_error::success;
+}
+
+std::error_code BaseRelocRef::getRVA(uint32_t &Result) const {
+  auto *Entry = reinterpret_cast<const coff_base_reloc_block_entry *>(Header + 1);
+  Result = Header->PageRVA + Entry[Index].getOffset();
+  return object_error::success;
 }

diff --git a/lib/Object/COFFYAML.cpp b/lib/Object/COFFYAML.cpp
index 49c5dda..9a24b53 100644
--- a/lib/Object/COFFYAML.cpp
+++ b/lib/Object/COFFYAML.cpp

@@ -168,6 +168,24 @@
   ECase(IMAGE_REL_AMD64_PAIR);
   ECase(IMAGE_REL_AMD64_SSPAN32);
 }
+
+void ScalarEnumerationTraits<COFF::WindowsSubsystem>::enumeration(
+    IO &IO, COFF::WindowsSubsystem &Value) {
+    ECase(IMAGE_SUBSYSTEM_UNKNOWN);
+    ECase(IMAGE_SUBSYSTEM_NATIVE);
+    ECase(IMAGE_SUBSYSTEM_WINDOWS_GUI);
+    ECase(IMAGE_SUBSYSTEM_WINDOWS_CUI);
+    ECase(IMAGE_SUBSYSTEM_OS2_CUI);
+    ECase(IMAGE_SUBSYSTEM_POSIX_CUI);
+    ECase(IMAGE_SUBSYSTEM_NATIVE_WINDOWS);
+    ECase(IMAGE_SUBSYSTEM_WINDOWS_CE_GUI);
+    ECase(IMAGE_SUBSYSTEM_EFI_APPLICATION);
+    ECase(IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER);
+    ECase(IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER);
+    ECase(IMAGE_SUBSYSTEM_EFI_ROM);
+    ECase(IMAGE_SUBSYSTEM_XBOX);
+    ECase(IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION);
+}
 #undef ECase
 
 #define BCase(X) IO.bitSetCase(Value, #X, COFF::X);
@@ -214,6 +232,21 @@
   BCase(IMAGE_SCN_MEM_READ);
   BCase(IMAGE_SCN_MEM_WRITE);
 }
+
+void ScalarBitSetTraits<COFF::DLLCharacteristics>::bitset(
+    IO &IO, COFF::DLLCharacteristics &Value) {
+  BCase(IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA);
+  BCase(IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE);
+  BCase(IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY);
+  BCase(IMAGE_DLL_CHARACTERISTICS_NX_COMPAT);
+  BCase(IMAGE_DLL_CHARACTERISTICS_NO_ISOLATION);
+  BCase(IMAGE_DLL_CHARACTERISTICS_NO_SEH);
+  BCase(IMAGE_DLL_CHARACTERISTICS_NO_BIND);
+  BCase(IMAGE_DLL_CHARACTERISTICS_APPCONTAINER);
+  BCase(IMAGE_DLL_CHARACTERISTICS_WDM_DRIVER);
+  BCase(IMAGE_DLL_CHARACTERISTICS_GUARD_CF);
+  BCase(IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE);
+}
 #undef BCase
 
 namespace {
@@ -285,6 +318,23 @@
   RelocType Type;
 };
 
+struct NWindowsSubsystem {
+  NWindowsSubsystem(IO &) : Subsystem(COFF::WindowsSubsystem(0)) {}
+  NWindowsSubsystem(IO &, uint16_t C) : Subsystem(COFF::WindowsSubsystem(C)) {}
+  uint16_t denormalize(IO &) { return Subsystem; }
+
+  COFF::WindowsSubsystem Subsystem;
+};
+
+struct NDLLCharacteristics {
+  NDLLCharacteristics(IO &) : Characteristics(COFF::DLLCharacteristics(0)) {}
+  NDLLCharacteristics(IO &, uint16_t C)
+      : Characteristics(COFF::DLLCharacteristics(C)) {}
+  uint16_t denormalize(IO &) { return Characteristics; }
+
+  COFF::DLLCharacteristics Characteristics;
+};
+
 }
 
 void MappingTraits<COFFYAML::Relocation>::mapping(IO &IO,
@@ -306,6 +356,59 @@
   }
 }
 
+void MappingTraits<COFF::DataDirectory>::mapping(IO &IO,
+                                                 COFF::DataDirectory &DD) {
+  IO.mapRequired("RelativeVirtualAddress", DD.RelativeVirtualAddress);
+  IO.mapRequired("Size", DD.Size);
+}
+
+void MappingTraits<COFFYAML::PEHeader>::mapping(IO &IO,
+                                                COFFYAML::PEHeader &PH) {
+  MappingNormalization<NWindowsSubsystem, uint16_t> NWS(IO,
+                                                        PH.Header.Subsystem);
+  MappingNormalization<NDLLCharacteristics, uint16_t> NDC(
+      IO, PH.Header.DLLCharacteristics);
+
+  IO.mapRequired("AddressOfEntryPoint", PH.Header.AddressOfEntryPoint);
+  IO.mapRequired("ImageBase", PH.Header.ImageBase);
+  IO.mapRequired("SectionAlignment", PH.Header.SectionAlignment);
+  IO.mapRequired("FileAlignment", PH.Header.FileAlignment);
+  IO.mapRequired("MajorOperatingSystemVersion",
+                 PH.Header.MajorOperatingSystemVersion);
+  IO.mapRequired("MinorOperatingSystemVersion",
+                 PH.Header.MinorOperatingSystemVersion);
+  IO.mapRequired("MajorImageVersion", PH.Header.MajorImageVersion);
+  IO.mapRequired("MinorImageVersion", PH.Header.MinorImageVersion);
+  IO.mapRequired("MajorSubsystemVersion", PH.Header.MajorSubsystemVersion);
+  IO.mapRequired("MinorSubsystemVersion", PH.Header.MinorSubsystemVersion);
+  IO.mapRequired("Subsystem", NWS->Subsystem);
+  IO.mapRequired("DLLCharacteristics", NDC->Characteristics);
+  IO.mapRequired("SizeOfStackReserve", PH.Header.SizeOfStackReserve);
+  IO.mapRequired("SizeOfStackCommit", PH.Header.SizeOfStackCommit);
+  IO.mapRequired("SizeOfHeapReserve", PH.Header.SizeOfHeapReserve);
+  IO.mapRequired("SizeOfHeapCommit", PH.Header.SizeOfHeapCommit);
+
+  IO.mapOptional("ExportTable", PH.DataDirectories[COFF::EXPORT_TABLE]);
+  IO.mapOptional("ImportTable", PH.DataDirectories[COFF::IMPORT_TABLE]);
+  IO.mapOptional("ResourceTable", PH.DataDirectories[COFF::RESOURCE_TABLE]);
+  IO.mapOptional("ExceptionTable", PH.DataDirectories[COFF::EXCEPTION_TABLE]);
+  IO.mapOptional("CertificateTable", PH.DataDirectories[COFF::CERTIFICATE_TABLE]);
+  IO.mapOptional("BaseRelocationTable",
+                 PH.DataDirectories[COFF::BASE_RELOCATION_TABLE]);
+  IO.mapOptional("Debug", PH.DataDirectories[COFF::DEBUG]);
+  IO.mapOptional("Architecture", PH.DataDirectories[COFF::ARCHITECTURE]);
+  IO.mapOptional("GlobalPtr", PH.DataDirectories[COFF::GLOBAL_PTR]);
+  IO.mapOptional("TlsTable", PH.DataDirectories[COFF::TLS_TABLE]);
+  IO.mapOptional("LoadConfigTable",
+                 PH.DataDirectories[COFF::LOAD_CONFIG_TABLE]);
+  IO.mapOptional("BoundImport", PH.DataDirectories[COFF::BOUND_IMPORT]);
+  IO.mapOptional("IAT", PH.DataDirectories[COFF::IAT]);
+  IO.mapOptional("DelayImportDescriptor",
+                 PH.DataDirectories[COFF::DELAY_IMPORT_DESCRIPTOR]);
+  IO.mapOptional("ClrRuntimeHeader",
+                 PH.DataDirectories[COFF::CLR_RUNTIME_HEADER]);
+}
+
 void MappingTraits<COFF::header>::mapping(IO &IO, COFF::header &H) {
   MappingNormalization<NMachine, uint16_t> NM(IO, H.Machine);
   MappingNormalization<NHeaderCharacteristics, uint16_t> NC(IO,
@@ -380,12 +483,15 @@
       IO, Sec.Header.Characteristics);
   IO.mapRequired("Name", Sec.Name);
   IO.mapRequired("Characteristics", NC->Characteristics);
+  IO.mapOptional("VirtualAddress", Sec.Header.VirtualAddress, 0U);
+  IO.mapOptional("VirtualSize", Sec.Header.VirtualSize, 0U);
   IO.mapOptional("Alignment", Sec.Alignment);
   IO.mapRequired("SectionData", Sec.SectionData);
   IO.mapOptional("Relocations", Sec.Relocations);
 }
 
 void MappingTraits<COFFYAML::Object>::mapping(IO &IO, COFFYAML::Object &Obj) {
+  IO.mapOptional("OptionalHeader", Obj.OptionalHeader);
   IO.mapRequired("header", Obj.Header);
   IO.mapRequired("sections", Obj.Sections);
   IO.mapRequired("symbols", Obj.Symbols);

diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index df4dd5e..11099bd 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp

@@ -223,6 +223,8 @@
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST32_ABS_LO12_NC);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST64_ABS_LO12_NC);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST128_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_GOTREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_GOTREL32);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_GOT_PAGE);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD64_GOT_LO12_NC);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G2);
@@ -266,6 +268,15 @@
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_LD64_LO12_NC);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADD_LO12_NC);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_JUMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLS_DTPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLS_DTPMOD64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLS_TPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_IRELATIVE);
     default:
       break;
     }
@@ -519,6 +530,7 @@
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_LO);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HI);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_PLTREL24);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL32);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLS);
       LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPMOD32);

diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp
index 4f0f60b..8ccb253 100644
--- a/lib/Object/ELFObjectFile.cpp
+++ b/lib/Object/ELFObjectFile.cpp

@@ -17,61 +17,56 @@
 namespace llvm {
 using namespace object;
 
-ErrorOr<ObjectFile *>
-ObjectFile::createELFObjectFile(std::unique_ptr<MemoryBuffer> &Obj) {
+ELFObjectFileBase::ELFObjectFileBase(unsigned int Type, MemoryBufferRef Source)
+    : ObjectFile(Type, Source) {}
+
+ErrorOr<std::unique_ptr<ObjectFile>>
+ObjectFile::createELFObjectFile(MemoryBufferRef Obj) {
   std::pair<unsigned char, unsigned char> Ident =
-      getElfArchType(Obj->getBuffer());
+      getElfArchType(Obj.getBuffer());
   std::size_t MaxAlignment =
-    1ULL << countTrailingZeros(uintptr_t(Obj->getBufferStart()));
+      1ULL << countTrailingZeros(uintptr_t(Obj.getBufferStart()));
 
   std::error_code EC;
   std::unique_ptr<ObjectFile> R;
   if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB)
 #if !LLVM_IS_UNALIGNED_ACCESS_FAST
     if (MaxAlignment >= 4)
-      R.reset(new ELFObjectFile<ELFType<support::little, 4, false>>(
-          std::move(Obj), EC));
+      R.reset(new ELFObjectFile<ELFType<support::little, 4, false>>(Obj, EC));
     else
 #endif
     if (MaxAlignment >= 2)
-      R.reset(new ELFObjectFile<ELFType<support::little, 2, false>>(
-          std::move(Obj), EC));
+      R.reset(new ELFObjectFile<ELFType<support::little, 2, false>>(Obj, EC));
     else
       return object_error::parse_failed;
   else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB)
 #if !LLVM_IS_UNALIGNED_ACCESS_FAST
     if (MaxAlignment >= 4)
-      R.reset(new ELFObjectFile<ELFType<support::big, 4, false>>(std::move(Obj),
-                                                                 EC));
+      R.reset(new ELFObjectFile<ELFType<support::big, 4, false>>(Obj, EC));
     else
 #endif
     if (MaxAlignment >= 2)
-      R.reset(new ELFObjectFile<ELFType<support::big, 2, false>>(std::move(Obj),
-                                                                 EC));
+      R.reset(new ELFObjectFile<ELFType<support::big, 2, false>>(Obj, EC));
     else
       return object_error::parse_failed;
   else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB)
 #if !LLVM_IS_UNALIGNED_ACCESS_FAST
     if (MaxAlignment >= 8)
-      R.reset(new ELFObjectFile<ELFType<support::big, 8, true>>(std::move(Obj),
-                                                                EC));
+      R.reset(new ELFObjectFile<ELFType<support::big, 8, true>>(Obj, EC));
     else
 #endif
     if (MaxAlignment >= 2)
-      R.reset(new ELFObjectFile<ELFType<support::big, 2, true>>(std::move(Obj),
-                                                                EC));
+      R.reset(new ELFObjectFile<ELFType<support::big, 2, true>>(Obj, EC));
     else
       return object_error::parse_failed;
   else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) {
 #if !LLVM_IS_UNALIGNED_ACCESS_FAST
     if (MaxAlignment >= 8)
-      R.reset(new ELFObjectFile<ELFType<support::little, 8, true>>(
-          std::move(Obj), EC));
+      R.reset(new ELFObjectFile<ELFType<support::little, 8, true>>(Obj, EC));
     else
 #endif
     if (MaxAlignment >= 2)
-      R.reset(new ELFObjectFile<ELFType<support::little, 2, true>>(
-          std::move(Obj), EC));
+      R.reset(new ELFObjectFile<ELFType<support::little, 2, true>>(Obj, EC));
     else
       return object_error::parse_failed;
   }
@@ -80,7 +75,7 @@
 
   if (EC)
     return EC;
-  return R.release();
+  return std::move(R);
 }
 
 } // end namespace llvm

diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp
index dc3d467..f513c11 100644
--- a/lib/Object/ELFYAML.cpp
+++ b/lib/Object/ELFYAML.cpp

@@ -264,6 +264,7 @@
     BCase(EF_MIPS_CPIC)
     BCase(EF_MIPS_ABI2)
     BCase(EF_MIPS_32BITMODE)
+    BCase(EF_MIPS_NAN2008)
     BCase(EF_MIPS_ABI_O32)
     BCase(EF_MIPS_MICROMIPS)
     BCase(EF_MIPS_ARCH_ASE_M16)
@@ -298,6 +299,8 @@
 
 void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
     IO &IO, ELFYAML::ELF_SHT &Value) {
+  const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
+  assert(Object && "The IO context is not initialized");
 #define ECase(X) IO.enumCase(Value, #X, ELF::X);
   ECase(SHT_NULL)
   ECase(SHT_PROGBITS)
@@ -325,15 +328,29 @@
   ECase(SHT_GNU_versym)
   ECase(SHT_HIOS)
   ECase(SHT_LOPROC)
-  ECase(SHT_ARM_EXIDX)
-  ECase(SHT_ARM_PREEMPTMAP)
-  ECase(SHT_ARM_ATTRIBUTES)
-  ECase(SHT_ARM_DEBUGOVERLAY)
-  ECase(SHT_ARM_OVERLAYSECTION)
-  ECase(SHT_HEX_ORDERED)
-  ECase(SHT_X86_64_UNWIND)
-  ECase(SHT_MIPS_REGINFO)
-  ECase(SHT_MIPS_OPTIONS)
+  switch (Object->Header.Machine) {
+  case ELF::EM_ARM:
+    ECase(SHT_ARM_EXIDX)
+    ECase(SHT_ARM_PREEMPTMAP)
+    ECase(SHT_ARM_ATTRIBUTES)
+    ECase(SHT_ARM_DEBUGOVERLAY)
+    ECase(SHT_ARM_OVERLAYSECTION)
+    break;
+  case ELF::EM_HEXAGON:
+    ECase(SHT_HEX_ORDERED)
+    break;
+  case ELF::EM_X86_64:
+    ECase(SHT_X86_64_UNWIND)
+    break;
+  case ELF::EM_MIPS:
+    ECase(SHT_MIPS_REGINFO)
+    ECase(SHT_MIPS_OPTIONS)
+    ECase(SHT_MIPS_ABIFLAGS)
+    break;
+  default:
+    // Nothing to do.
+    break;
+  }
 #undef ECase
 }
 
@@ -378,6 +395,25 @@
 #undef ECase
 }
 
+void ScalarBitSetTraits<ELFYAML::ELF_STO>::bitset(IO &IO,
+                                                  ELFYAML::ELF_STO &Value) {
+  const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
+  assert(Object && "The IO context is not initialized");
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
+  switch (Object->Header.Machine) {
+  case ELF::EM_MIPS:
+    BCase(STO_MIPS_OPTIONAL)
+    BCase(STO_MIPS_PLT)
+    BCase(STO_MIPS_PIC)
+    BCase(STO_MIPS_MICROMIPS)
+    break;
+  default:
+    break; // Nothing to do
+  }
+#undef BCase
+#undef BCaseMask
+}
+
 void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration(
     IO &IO, ELFYAML::ELF_REL &Value) {
   const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
@@ -636,6 +672,92 @@
     ECase(R_386_IRELATIVE)
     ECase(R_386_NUM)
     break;
+  case ELF::EM_AARCH64:
+    ECase(R_AARCH64_NONE)
+    ECase(R_AARCH64_ABS64)
+    ECase(R_AARCH64_ABS32)
+    ECase(R_AARCH64_ABS16)
+    ECase(R_AARCH64_PREL64)
+    ECase(R_AARCH64_PREL32)
+    ECase(R_AARCH64_PREL16)
+    ECase(R_AARCH64_MOVW_UABS_G0)
+    ECase(R_AARCH64_MOVW_UABS_G0_NC)
+    ECase(R_AARCH64_MOVW_UABS_G1)
+    ECase(R_AARCH64_MOVW_UABS_G1_NC)
+    ECase(R_AARCH64_MOVW_UABS_G2)
+    ECase(R_AARCH64_MOVW_UABS_G2_NC)
+    ECase(R_AARCH64_MOVW_UABS_G3)
+    ECase(R_AARCH64_MOVW_SABS_G0)
+    ECase(R_AARCH64_MOVW_SABS_G1)
+    ECase(R_AARCH64_MOVW_SABS_G2)
+    ECase(R_AARCH64_LD_PREL_LO19)
+    ECase(R_AARCH64_ADR_PREL_LO21)
+    ECase(R_AARCH64_ADR_PREL_PG_HI21)
+    ECase(R_AARCH64_ADD_ABS_LO12_NC)
+    ECase(R_AARCH64_LDST8_ABS_LO12_NC)
+    ECase(R_AARCH64_TSTBR14)
+    ECase(R_AARCH64_CONDBR19)
+    ECase(R_AARCH64_JUMP26)
+    ECase(R_AARCH64_CALL26)
+    ECase(R_AARCH64_LDST16_ABS_LO12_NC)
+    ECase(R_AARCH64_LDST32_ABS_LO12_NC)
+    ECase(R_AARCH64_LDST64_ABS_LO12_NC)
+    ECase(R_AARCH64_LDST128_ABS_LO12_NC)
+    ECase(R_AARCH64_GOTREL64)
+    ECase(R_AARCH64_GOTREL32)
+    ECase(R_AARCH64_ADR_GOT_PAGE)
+    ECase(R_AARCH64_LD64_GOT_LO12_NC)
+    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G2)
+    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G1)
+    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC)
+    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G0)
+    ECase(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC)
+    ECase(R_AARCH64_TLSLD_ADD_DTPREL_HI12)
+    ECase(R_AARCH64_TLSLD_ADD_DTPREL_LO12)
+    ECase(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC)
+    ECase(R_AARCH64_TLSLD_LDST8_DTPREL_LO12)
+    ECase(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC)
+    ECase(R_AARCH64_TLSLD_LDST16_DTPREL_LO12)
+    ECase(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC)
+    ECase(R_AARCH64_TLSLD_LDST32_DTPREL_LO12)
+    ECase(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC)
+    ECase(R_AARCH64_TLSLD_LDST64_DTPREL_LO12)
+    ECase(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC)
+    ECase(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1)
+    ECase(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC)
+    ECase(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21)
+    ECase(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC)
+    ECase(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19)
+    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G2)
+    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G1)
+    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC)
+    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G0)
+    ECase(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC)
+    ECase(R_AARCH64_TLSLE_ADD_TPREL_HI12)
+    ECase(R_AARCH64_TLSLE_ADD_TPREL_LO12)
+    ECase(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC)
+    ECase(R_AARCH64_TLSLE_LDST8_TPREL_LO12)
+    ECase(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC)
+    ECase(R_AARCH64_TLSLE_LDST16_TPREL_LO12)
+    ECase(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC)
+    ECase(R_AARCH64_TLSLE_LDST32_TPREL_LO12)
+    ECase(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC)
+    ECase(R_AARCH64_TLSLE_LDST64_TPREL_LO12)
+    ECase(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC)
+    ECase(R_AARCH64_TLSDESC_ADR_PAGE)
+    ECase(R_AARCH64_TLSDESC_LD64_LO12_NC)
+    ECase(R_AARCH64_TLSDESC_ADD_LO12_NC)
+    ECase(R_AARCH64_TLSDESC_CALL)
+    ECase(R_AARCH64_COPY)
+    ECase(R_AARCH64_GLOB_DAT)
+    ECase(R_AARCH64_JUMP_SLOT)
+    ECase(R_AARCH64_RELATIVE)
+    ECase(R_AARCH64_TLS_DTPREL64)
+    ECase(R_AARCH64_TLS_DTPMOD64)
+    ECase(R_AARCH64_TLS_TPREL64)
+    ECase(R_AARCH64_TLSDESC)
+    ECase(R_AARCH64_IRELATIVE)
+    break;
   default:
     llvm_unreachable("Unsupported architecture");
   }
@@ -653,13 +775,30 @@
   IO.mapOptional("Entry", FileHdr.Entry, Hex64(0));
 }
 
+namespace {
+struct NormalizedOther {
+  NormalizedOther(IO &)
+      : Visibility(ELFYAML::ELF_STV(0)), Other(ELFYAML::ELF_STO(0)) {}
+  NormalizedOther(IO &, uint8_t Original)
+      : Visibility(Original & 0x3), Other(Original & ~0x3) {}
+
+  uint8_t denormalize(IO &) { return Visibility | Other; }
+
+  ELFYAML::ELF_STV Visibility;
+  ELFYAML::ELF_STO Other;
+};
+}
+
 void MappingTraits<ELFYAML::Symbol>::mapping(IO &IO, ELFYAML::Symbol &Symbol) {
   IO.mapOptional("Name", Symbol.Name, StringRef());
   IO.mapOptional("Type", Symbol.Type, ELFYAML::ELF_STT(0));
   IO.mapOptional("Section", Symbol.Section, StringRef());
   IO.mapOptional("Value", Symbol.Value, Hex64(0));
   IO.mapOptional("Size", Symbol.Size, Hex64(0));
-  IO.mapOptional("Visibility", Symbol.Visibility, ELFYAML::ELF_STV(0));
+
+  MappingNormalization<NormalizedOther, uint8_t> Keys(IO, Symbol.Other);
+  IO.mapOptional("Visibility", Keys->Visibility, ELFYAML::ELF_STV(0));
+  IO.mapOptional("Other", Keys->Other, ELFYAML::ELF_STO(0));
 }
 
 void MappingTraits<ELFYAML::LocalGlobalWeakSymbols>::mapping(

diff --git a/lib/Object/Error.cpp b/lib/Object/Error.cpp
index 9d25269..d2daab7 100644
--- a/lib/Object/Error.cpp
+++ b/lib/Object/Error.cpp

@@ -13,6 +13,7 @@
 
 #include "llvm/Object/Error.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
 
 using namespace llvm;
 using namespace object;
@@ -25,7 +26,7 @@
 };
 }
 
-const char *_object_error_category::name() const {
+const char *_object_error_category::name() const LLVM_NOEXCEPT {
   return "llvm.object";
 }
 
@@ -41,12 +42,15 @@
     return "Invalid data was encountered while parsing the file";
   case object_error::unexpected_eof:
     return "The end of the file was unexpectedly encountered";
+  case object_error::bitcode_section_not_found:
+    return "Bitcode section not found in object file";
   }
   llvm_unreachable("An enumerator of object_error does not have a message "
                    "defined.");
 }
 
+static ManagedStatic<_object_error_category> error_category;
+
 const std::error_category &object::object_category() {
-  static _object_error_category o;
-  return o;
+  return *error_category;
 }

diff --git a/lib/Object/IRObjectFile.cpp b/lib/Object/IRObjectFile.cpp
index 5323d92..7256a2f 100644
--- a/lib/Object/IRObjectFile.cpp
+++ b/lib/Object/IRObjectFile.cpp

@@ -25,6 +25,7 @@
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -32,9 +33,8 @@
 using namespace llvm;
 using namespace object;
 
-IRObjectFile::IRObjectFile(std::unique_ptr<MemoryBuffer> Object,
-                           std::unique_ptr<Module> Mod)
-    : SymbolicFile(Binary::ID_IR, std::move(Object)), M(std::move(Mod)) {
+IRObjectFile::IRObjectFile(MemoryBufferRef Object, std::unique_ptr<Module> Mod)
+    : SymbolicFile(Binary::ID_IR, Object), M(std::move(Mod)) {
   // If we have a DataLayout, setup a mangler.
   const DataLayout *DL = M->getDataLayout();
   if (!DL)
@@ -76,7 +76,7 @@
 
   std::unique_ptr<MemoryBuffer> Buffer(MemoryBuffer::getMemBuffer(InlineAsm));
   SourceMgr SrcMgr;
-  SrcMgr.AddNewSourceBuffer(Buffer.release(), SMLoc());
+  SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
   std::unique_ptr<MCAsmParser> Parser(
       createMCAsmParser(SrcMgr, MCCtx, *Streamer, *MAI));
 
@@ -114,9 +114,6 @@
 }
 
 IRObjectFile::~IRObjectFile() {
-  GVMaterializer *GVM =  M->getMaterializer();
-  if (GVM)
-    GVM->releaseBuffer();
  }
 
 static const GlobalValue *getGV(DataRefImpl &Symb) {
@@ -207,16 +204,6 @@
   return object_error::success;
 }
 
-static bool isDeclaration(const GlobalValue &V) {
-  if (V.hasAvailableExternallyLinkage())
-    return true;
-
-  if (V.isMaterializable())
-    return false;
-
-  return V.isDeclaration();
-}
-
 uint32_t IRObjectFile::getSymbolFlags(DataRefImpl Symb) const {
   const GlobalValue *GV = getGV(Symb);
 
@@ -227,7 +214,7 @@
   }
 
   uint32_t Res = BasicSymbolRef::SF_None;
-  if (isDeclaration(*GV))
+  if (GV->isDeclarationForLinker())
     Res |= BasicSymbolRef::SF_Undefined;
   if (GV->hasPrivateLinkage())
     Res |= BasicSymbolRef::SF_FormatSpecific;
@@ -268,12 +255,55 @@
   return basic_symbol_iterator(BasicSymbolRef(Ret, this));
 }
 
-ErrorOr<IRObjectFile *> llvm::object::IRObjectFile::createIRObjectFile(
-    std::unique_ptr<MemoryBuffer> Object, LLVMContext &Context) {
-  ErrorOr<Module *> MOrErr = getLazyBitcodeModule(Object.get(), Context);
+/// Scan the sections of \p Obj in order and return a reference to the contents
+/// of the first one named ".llvmbc", labelled with the object's file name.
+///
+/// An error reported while querying a section name (including the name of a
+/// section visited before the match) or the matching section's contents is
+/// returned as-is.  object_error::bitcode_section_not_found is returned when
+/// no section carries that name.
+ErrorOr<MemoryBufferRef>
+IRObjectFile::findBitcodeInObject(const ObjectFile &Obj) {
+  for (const SectionRef &Section : Obj.sections()) {
+    StringRef Name;
+    std::error_code NameEC = Section.getName(Name);
+    if (NameEC)
+      return NameEC;
+    if (Name != ".llvmbc")
+      continue;
+
+    // First match wins: wrap its bytes in a MemoryBufferRef.
+    StringRef Contents;
+    std::error_code ContentsEC = Section.getContents(Contents);
+    if (ContentsEC)
+      return ContentsEC;
+    return MemoryBufferRef(Contents, Obj.getFileName());
+  }
+
+  return object_error::bitcode_section_not_found;
+}
+
+/// Locate the bitcode inside \p Object based on its file magic.
+///
+/// A buffer identified as bitcode is returned unchanged.  ELF relocatable,
+/// Mach-O object and COFF object buffers are opened as an ObjectFile and
+/// searched with findBitcodeInObject(); a failure to open the object is
+/// returned as-is.  Every other kind of buffer yields
+/// object_error::invalid_file_type.
+ErrorOr<MemoryBufferRef>
+IRObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Object) {
+  sys::fs::file_magic Magic = sys::fs::identify_magic(Object.getBuffer());
+
+  // Classify first; only native object files fall out of the switch.
+  switch (Magic) {
+  case sys::fs::file_magic::bitcode:
+    return Object;
+  case sys::fs::file_magic::elf_relocatable:
+  case sys::fs::file_magic::macho_object:
+  case sys::fs::file_magic::coff_object:
+    break;
+  default:
+    return object_error::invalid_file_type;
+  }
+
+  ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr =
+      ObjectFile::createObjectFile(Object, Magic);
+  if (std::error_code EC = ObjOrErr.getError())
+    return EC;
+  return findBitcodeInObject(**ObjOrErr);
+}
+
+/// Build an IRObjectFile for the bitcode found in \p Object.
+///
+/// The bitcode is located with findBitcodeInMemBuffer() and handed to
+/// getLazyBitcodeModule(); an error from either step is returned as-is.  The
+/// resulting IRObjectFile is constructed from \p Object itself (not from the
+/// located bitcode reference) together with the loaded module.
+ErrorOr<std::unique_ptr<IRObjectFile>>
+llvm::object::IRObjectFile::createIRObjectFile(MemoryBufferRef Object,
+                                               LLVMContext &Context) {
+  ErrorOr<MemoryBufferRef> BitcodeOrErr = findBitcodeInMemBuffer(Object);
+  if (std::error_code BitcodeEC = BitcodeOrErr.getError())
+    return BitcodeEC;
+
+  // Wrap the located bytes in a MemoryBuffer for the bitcode reader.
+  // NOTE(review): the 'false' argument is presumably RequiresNullTerminator;
+  // confirm against MemoryBuffer::getMemBuffer.
+  std::unique_ptr<MemoryBuffer> BitcodeBuffer(
+      MemoryBuffer::getMemBuffer(*BitcodeOrErr, false));
+
+  ErrorOr<Module *> MOrErr =
+      getLazyBitcodeModule(std::move(BitcodeBuffer), Context);
   if (std::error_code EC = MOrErr.getError())
     return EC;
 
   std::unique_ptr<Module> M(MOrErr.get());
-  return new IRObjectFile(std::move(Object), std::move(M));
+  return llvm::make_unique<IRObjectFile>(Object, std::move(M));
 }

diff --git a/lib/Object/LLVMBuild.txt b/lib/Object/LLVMBuild.txt
index 8acacba..bae578c 100644
--- a/lib/Object/LLVMBuild.txt
+++ b/lib/Object/LLVMBuild.txt

@@ -19,4 +19,4 @@
 type = Library
 name = Object
 parent = Libraries
-required_libraries = BitReader Core Support MC MCParser
+required_libraries = BitReader Core MC MCParser Support

diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index 4919114..bbef639 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp

@@ -14,10 +14,14 @@
 
 #include "llvm/Object/MachO.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Host.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cctype>
@@ -27,220 +31,19 @@
 using namespace llvm;
 using namespace object;
 
-namespace llvm {
-
-namespace object {
-
-struct nlist_base {
-  uint32_t n_strx;
-  uint8_t n_type;
-  uint8_t n_sect;
-  uint16_t n_desc;
-};
-
-struct section_base {
-  char sectname[16];
-  char segname[16];
-};
-
-template<typename T>
-static void SwapStruct(T &Value);
-
-template<>
-void SwapStruct(MachO::any_relocation_info &H) {
-  sys::swapByteOrder(H.r_word0);
-  sys::swapByteOrder(H.r_word1);
-}
-
-template<>
-void SwapStruct(MachO::load_command &L) {
-  sys::swapByteOrder(L.cmd);
-  sys::swapByteOrder(L.cmdsize);
-}
-
-template<>
-void SwapStruct(nlist_base &S) {
-  sys::swapByteOrder(S.n_strx);
-  sys::swapByteOrder(S.n_desc);
-}
-
-template<>
-void SwapStruct(MachO::section &S) {
-  sys::swapByteOrder(S.addr);
-  sys::swapByteOrder(S.size);
-  sys::swapByteOrder(S.offset);
-  sys::swapByteOrder(S.align);
-  sys::swapByteOrder(S.reloff);
-  sys::swapByteOrder(S.nreloc);
-  sys::swapByteOrder(S.flags);
-  sys::swapByteOrder(S.reserved1);
-  sys::swapByteOrder(S.reserved2);
-}
-
-template<>
-void SwapStruct(MachO::section_64 &S) {
-  sys::swapByteOrder(S.addr);
-  sys::swapByteOrder(S.size);
-  sys::swapByteOrder(S.offset);
-  sys::swapByteOrder(S.align);
-  sys::swapByteOrder(S.reloff);
-  sys::swapByteOrder(S.nreloc);
-  sys::swapByteOrder(S.flags);
-  sys::swapByteOrder(S.reserved1);
-  sys::swapByteOrder(S.reserved2);
-  sys::swapByteOrder(S.reserved3);
-}
-
-template<>
-void SwapStruct(MachO::nlist &S) {
-  sys::swapByteOrder(S.n_strx);
-  sys::swapByteOrder(S.n_desc);
-  sys::swapByteOrder(S.n_value);
-}
-
-template<>
-void SwapStruct(MachO::nlist_64 &S) {
-  sys::swapByteOrder(S.n_strx);
-  sys::swapByteOrder(S.n_desc);
-  sys::swapByteOrder(S.n_value);
-}
-
-template<>
-void SwapStruct(MachO::mach_header &H) {
-  sys::swapByteOrder(H.magic);
-  sys::swapByteOrder(H.cputype);
-  sys::swapByteOrder(H.cpusubtype);
-  sys::swapByteOrder(H.filetype);
-  sys::swapByteOrder(H.ncmds);
-  sys::swapByteOrder(H.sizeofcmds);
-  sys::swapByteOrder(H.flags);
-}
-
-template<>
-void SwapStruct(MachO::mach_header_64 &H) {
-  sys::swapByteOrder(H.magic);
-  sys::swapByteOrder(H.cputype);
-  sys::swapByteOrder(H.cpusubtype);
-  sys::swapByteOrder(H.filetype);
-  sys::swapByteOrder(H.ncmds);
-  sys::swapByteOrder(H.sizeofcmds);
-  sys::swapByteOrder(H.flags);
-  sys::swapByteOrder(H.reserved);
-}
-
-template<>
-void SwapStruct(MachO::symtab_command &C) {
-  sys::swapByteOrder(C.cmd);
-  sys::swapByteOrder(C.cmdsize);
-  sys::swapByteOrder(C.symoff);
-  sys::swapByteOrder(C.nsyms);
-  sys::swapByteOrder(C.stroff);
-  sys::swapByteOrder(C.strsize);
-}
-
-template<>
-void SwapStruct(MachO::dysymtab_command &C) {
-  sys::swapByteOrder(C.cmd);
-  sys::swapByteOrder(C.cmdsize);
-  sys::swapByteOrder(C.ilocalsym);
-  sys::swapByteOrder(C.nlocalsym);
-  sys::swapByteOrder(C.iextdefsym);
-  sys::swapByteOrder(C.nextdefsym);
-  sys::swapByteOrder(C.iundefsym);
-  sys::swapByteOrder(C.nundefsym);
-  sys::swapByteOrder(C.tocoff);
-  sys::swapByteOrder(C.ntoc);
-  sys::swapByteOrder(C.modtaboff);
-  sys::swapByteOrder(C.nmodtab);
-  sys::swapByteOrder(C.extrefsymoff);
-  sys::swapByteOrder(C.nextrefsyms);
-  sys::swapByteOrder(C.indirectsymoff);
-  sys::swapByteOrder(C.nindirectsyms);
-  sys::swapByteOrder(C.extreloff);
-  sys::swapByteOrder(C.nextrel);
-  sys::swapByteOrder(C.locreloff);
-  sys::swapByteOrder(C.nlocrel);
-}
-
-template<>
-void SwapStruct(MachO::linkedit_data_command &C) {
-  sys::swapByteOrder(C.cmd);
-  sys::swapByteOrder(C.cmdsize);
-  sys::swapByteOrder(C.dataoff);
-  sys::swapByteOrder(C.datasize);
-}
-
-template<>
-void SwapStruct(MachO::segment_command &C) {
-  sys::swapByteOrder(C.cmd);
-  sys::swapByteOrder(C.cmdsize);
-  sys::swapByteOrder(C.vmaddr);
-  sys::swapByteOrder(C.vmsize);
-  sys::swapByteOrder(C.fileoff);
-  sys::swapByteOrder(C.filesize);
-  sys::swapByteOrder(C.maxprot);
-  sys::swapByteOrder(C.initprot);
-  sys::swapByteOrder(C.nsects);
-  sys::swapByteOrder(C.flags);
-}
-
-template<>
-void SwapStruct(MachO::segment_command_64 &C) {
-  sys::swapByteOrder(C.cmd);
-  sys::swapByteOrder(C.cmdsize);
-  sys::swapByteOrder(C.vmaddr);
-  sys::swapByteOrder(C.vmsize);
-  sys::swapByteOrder(C.fileoff);
-  sys::swapByteOrder(C.filesize);
-  sys::swapByteOrder(C.maxprot);
-  sys::swapByteOrder(C.initprot);
-  sys::swapByteOrder(C.nsects);
-  sys::swapByteOrder(C.flags);
-}
-
-template<>
-void SwapStruct(uint32_t &C) {
-  sys::swapByteOrder(C);
-}
-
-template<>
-void SwapStruct(MachO::linker_options_command &C) {
-  sys::swapByteOrder(C.cmd);
-  sys::swapByteOrder(C.cmdsize);
-  sys::swapByteOrder(C.count);
-}
-
-template<>
-void SwapStruct(MachO::version_min_command&C) {
-  sys::swapByteOrder(C.cmd);
-  sys::swapByteOrder(C.cmdsize);
-  sys::swapByteOrder(C.version);
-  sys::swapByteOrder(C.reserved);
-}
-
-template<>
-void SwapStruct(MachO::dylib_command&C) {
-  sys::swapByteOrder(C.cmd);
-  sys::swapByteOrder(C.cmdsize);
-  sys::swapByteOrder(C.dylib.name);
-  sys::swapByteOrder(C.dylib.timestamp);
-  sys::swapByteOrder(C.dylib.current_version);
-  sys::swapByteOrder(C.dylib.compatibility_version);
-}
-
-template<>
-void SwapStruct(MachO::data_in_code_entry &C) {
-  sys::swapByteOrder(C.offset);
-  sys::swapByteOrder(C.length);
-  sys::swapByteOrder(C.kind);
+namespace {
+  // Two fixed-size name fields: a 16-byte section name followed by a 16-byte
+  // segment name.  NOTE(review): presumably the common leading fields of
+  // MachO::section and MachO::section_64 -- confirm against
+  // llvm/Support/MachO.h.
+  struct section_base {
+    char sectname[16];
+    char segname[16];
+  };
 }
 
+// Copy a T out of the raw bytes at P and, when the object's byte order differs
+// from the host's, byte-swap it with MachO::swapStruct before returning it.
+// No bounds check is performed here: the caller must guarantee that sizeof(T)
+// bytes are readable at P.
 template<typename T>
-T getStruct(const MachOObjectFile *O, const char *P) {
+static T getStruct(const MachOObjectFile *O, const char *P) {
   T Cmd;
   memcpy(&Cmd, P, sizeof(T));
   if (O->isLittleEndian() != sys::IsLittleEndianHost)
-    SwapStruct(Cmd);
+    MachO::swapStruct(Cmd);
   return Cmd;
 }
 
@@ -255,6 +58,17 @@
   return S.nsects;
 }
 
+// Return true if the segment load command L names the "__PAGEZERO" segment.
+//
+// The segment name is read from the 64-bit or 32-bit segment command
+// according to O->is64Bit().  segname is a fixed-size character array and
+// nothing here guarantees it is NUL-terminated, so the comparison is bounded
+// by the size of the field instead of building a StringRef from it (which
+// would run strlen past the end of the array for an unterminated name).
+static bool isPageZeroSegment(const MachOObjectFile *O,
+                              const MachOObjectFile::LoadCommandInfo &L) {
+  static const char PageZero[] = "__PAGEZERO";
+  if (O->is64Bit()) {
+    MachO::segment_command_64 S = O->getSegment64LoadCommand(L);
+    return strncmp(S.segname, PageZero, sizeof(S.segname)) == 0;
+  }
+  MachO::segment_command S = O->getSegmentLoadCommand(L);
+  return strncmp(S.segname, PageZero, sizeof(S.segname)) == 0;
+}
+
+
 static const char *
 getSectionPtr(const MachOObjectFile *O, MachOObjectFile::LoadCommandInfo L,
               unsigned Sec) {
@@ -274,10 +88,10 @@
   return O->getData().substr(Offset, 1).data();
 }
 
-static nlist_base
+// Read the MachO::nlist_base fields of the symbol table entry whose address
+// is stored in DRI.p, byte-swapped for the host by getStruct() when needed.
+static MachO::nlist_base
 getSymbolTableEntryBase(const MachOObjectFile *O, DataRefImpl DRI) {
   const char *P = reinterpret_cast<const char *>(DRI.p);
-  return getStruct<nlist_base>(O, P);
+  return getStruct<MachO::nlist_base>(O, P);
 }
 
 static StringRef parseSegmentOrSectionName(const char *P) {
@@ -330,11 +144,9 @@
     // to find a section beginning instead.
     for (const SectionRef &Section : O->sections()) {
       std::error_code ec;
-      uint64_t Addr;
-      StringRef Name;
 
-      if ((ec = Section.getAddress(Addr)))
-        report_fatal_error(ec.message());
+      StringRef Name;
+      uint64_t Addr = Section.getAddress();
       if (Addr != Val)
         continue;
       if ((ec = Section.getName(Name)))
@@ -407,11 +219,6 @@
   return RE.r_word1 & 0xf;
 }
 
-static unsigned
-getScatteredRelocationType(const MachO::any_relocation_info &RE) {
-  return (RE.r_word0 >> 24) & 0xf;
-}
-
 static uint32_t getSectionFlags(const MachOObjectFile *O,
                                 DataRefImpl Sec) {
   if (O->is64Bit()) {
@@ -422,12 +229,12 @@
   return Sect.flags;
 }
 
-MachOObjectFile::MachOObjectFile(std::unique_ptr<MemoryBuffer> Object,
-                                 bool IsLittleEndian, bool Is64bits,
-                                 std::error_code &EC)
-    : ObjectFile(getMachOType(IsLittleEndian, Is64bits), std::move(Object)),
+MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
+                                 bool Is64bits, std::error_code &EC)
+    : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object),
       SymtabLoadCmd(nullptr), DysymtabLoadCmd(nullptr),
-      DataInCodeLoadCmd(nullptr) {
+      DataInCodeLoadCmd(nullptr), DyldInfoLoadCmd(nullptr),
+      UuidLoadCmd(nullptr), HasPageZeroSegment(false) {
   uint32_t LoadCommandCount = this->getHeader().ncmds;
   MachO::LoadCommandType SegmentLoadType = is64Bit() ?
     MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT;
@@ -435,20 +242,49 @@
   MachOObjectFile::LoadCommandInfo Load = getFirstLoadCommandInfo();
   for (unsigned I = 0; ; ++I) {
     if (Load.C.cmd == MachO::LC_SYMTAB) {
-      assert(!SymtabLoadCmd && "Multiple symbol tables");
+      // Multiple symbol tables
+      if (SymtabLoadCmd) {
+        EC = object_error::parse_failed;
+        return;
+      }
       SymtabLoadCmd = Load.Ptr;
     } else if (Load.C.cmd == MachO::LC_DYSYMTAB) {
-      assert(!DysymtabLoadCmd && "Multiple dynamic symbol tables");
+      // Multiple dynamic symbol tables
+      if (DysymtabLoadCmd) {
+        EC = object_error::parse_failed;
+        return;
+      }
       DysymtabLoadCmd = Load.Ptr;
     } else if (Load.C.cmd == MachO::LC_DATA_IN_CODE) {
-      assert(!DataInCodeLoadCmd && "Multiple data in code tables");
+      // Multiple data in code tables
+      if (DataInCodeLoadCmd) {
+        EC = object_error::parse_failed;
+        return;
+      }
       DataInCodeLoadCmd = Load.Ptr;
+    } else if (Load.C.cmd == MachO::LC_DYLD_INFO || 
+               Load.C.cmd == MachO::LC_DYLD_INFO_ONLY) {
+      // Multiple dyldinfo load commands
+      if (DyldInfoLoadCmd) {
+        EC = object_error::parse_failed;
+        return;
+      }
+      DyldInfoLoadCmd = Load.Ptr;
+    } else if (Load.C.cmd == MachO::LC_UUID) {
+      // Multiple UUID load commands
+      if (UuidLoadCmd) {
+        EC = object_error::parse_failed;
+        return;
+      }
+      UuidLoadCmd = Load.Ptr;
     } else if (Load.C.cmd == SegmentLoadType) {
       uint32_t NumSections = getSegmentLoadCommandNumSections(this, Load);
       for (unsigned J = 0; J < NumSections; ++J) {
         const char *Sec = getSectionPtr(this, Load, J);
         Sections.push_back(Sec);
       }
+      if (isPageZeroSegment(this, Load))
+        HasPageZeroSegment = true;
     } else if (Load.C.cmd == MachO::LC_LOAD_DYLIB ||
                Load.C.cmd == MachO::LC_LOAD_WEAK_DYLIB ||
                Load.C.cmd == MachO::LC_LAZY_LOAD_DYLIB ||
@@ -474,7 +310,7 @@
 std::error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
                                                StringRef &Res) const {
   StringRef StringTable = getStringTableData();
-  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+  MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb);
   const char *Start = &StringTable.data()[Entry.n_strx];
   Res = StringRef(Start);
   return object_error::success;
@@ -528,7 +364,7 @@
                                                     uint32_t &Result) const {
   uint32_t flags = getSymbolFlags(DRI);
   if (flags & SymbolRef::SF_Common) {
-    nlist_base Entry = getSymbolTableEntryBase(this, DRI);
+    MachO::nlist_base Entry = getSymbolTableEntryBase(this, DRI);
     Result = 1 << MachO::GET_COMM_ALIGN(Entry.n_desc);
   } else {
     Result = 0;
@@ -542,7 +378,7 @@
   uint64_t EndOffset = 0;
   uint8_t SectionIndex;
 
-  nlist_base Entry = getSymbolTableEntryBase(this, DRI);
+  MachO::nlist_base Entry = getSymbolTableEntryBase(this, DRI);
   uint64_t Value;
   getSymbolAddress(DRI, Value);
   if (Value == UnknownAddressOrSize) {
@@ -574,11 +410,10 @@
         EndOffset = Value;
   }
   if (!EndOffset) {
-    uint64_t Size;
     DataRefImpl Sec;
     Sec.d.a = SectionIndex-1;
-    getSectionSize(Sec, Size);
-    getSectionAddress(Sec, EndOffset);
+    uint64_t Size = getSectionSize(Sec);
+    EndOffset = getSectionAddress(Sec);
     EndOffset += Size;
   }
   Result = EndOffset - BeginOffset;
@@ -587,7 +422,7 @@
 
 std::error_code MachOObjectFile::getSymbolType(DataRefImpl Symb,
                                                SymbolRef::Type &Res) const {
-  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+  MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb);
   uint8_t n_type = Entry.n_type;
 
   Res = SymbolRef::ST_Other;
@@ -610,7 +445,7 @@
 }
 
 uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const {
-  nlist_base Entry = getSymbolTableEntryBase(this, DRI);
+  MachO::nlist_base Entry = getSymbolTableEntryBase(this, DRI);
 
   uint8_t MachOType = Entry.n_type;
   uint16_t MachOFlags = Entry.n_desc;
@@ -639,6 +474,9 @@
   if (MachOFlags & (MachO::N_WEAK_REF | MachO::N_WEAK_DEF))
     Result |= SymbolRef::SF_Weak;
 
+  if (MachOFlags & (MachO::N_ARM_THUMB_DEF))
+    Result |= SymbolRef::SF_Thumb;
+
   if ((MachOType & MachO::N_TYPE) == MachO::N_ABS)
     Result |= SymbolRef::SF_Absolute;
 
@@ -647,7 +485,7 @@
 
 std::error_code MachOObjectFile::getSymbolSection(DataRefImpl Symb,
                                                   section_iterator &Res) const {
-  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+  MachO::nlist_base Entry = getSymbolTableEntryBase(this, Symb);
   uint8_t index = Entry.n_sect;
 
   if (index == 0) {
@@ -672,29 +510,16 @@
   return object_error::success;
 }
 
-std::error_code MachOObjectFile::getSectionAddress(DataRefImpl Sec,
-                                                   uint64_t &Res) const {
-  if (is64Bit()) {
-    MachO::section_64 Sect = getSection64(Sec);
-    Res = Sect.addr;
-  } else {
-    MachO::section Sect = getSection(Sec);
-    Res = Sect.addr;
-  }
-  return object_error::success;
+// Return the addr field of the section header selected by Sec, read from the
+// 64-bit or the 32-bit section layout according to is64Bit().
+uint64_t MachOObjectFile::getSectionAddress(DataRefImpl Sec) const {
+  return is64Bit() ? uint64_t(getSection64(Sec).addr)
+                   : uint64_t(getSection(Sec).addr);
 }
 
-std::error_code MachOObjectFile::getSectionSize(DataRefImpl Sec,
-                                                uint64_t &Res) const {
-  if (is64Bit()) {
-    MachO::section_64 Sect = getSection64(Sec);
-    Res = Sect.size;
-  } else {
-    MachO::section Sect = getSection(Sec);
-    Res = Sect.size;
-  }
-
-  return object_error::success;
+// Return the size field of the section header selected by Sec, read from the
+// 64-bit or the 32-bit section layout according to is64Bit().
+uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const {
+  return is64Bit() ? uint64_t(getSection64(Sec).size)
+                   : uint64_t(getSection(Sec).size);
 }
 
 std::error_code MachOObjectFile::getSectionContents(DataRefImpl Sec,
@@ -716,8 +541,7 @@
   return object_error::success;
 }
 
-std::error_code MachOObjectFile::getSectionAlignment(DataRefImpl Sec,
-                                                     uint64_t &Res) const {
+uint64_t MachOObjectFile::getSectionAlignment(DataRefImpl Sec) const {
   uint32_t Align;
   if (is64Bit()) {
     MachO::section_64 Sect = getSection64(Sec);
@@ -727,92 +551,70 @@
     Align = Sect.align;
   }
 
-  Res = uint64_t(1) << Align;
-  return object_error::success;
+  return uint64_t(1) << Align;
 }
 
-std::error_code MachOObjectFile::isSectionText(DataRefImpl Sec,
-                                               bool &Res) const {
+// A section counts as text when its flags carry S_ATTR_PURE_INSTRUCTIONS.
+bool MachOObjectFile::isSectionText(DataRefImpl Sec) const {
   uint32_t Flags = getSectionFlags(this, Sec);
-  Res = Flags & MachO::S_ATTR_PURE_INSTRUCTIONS;
-  return object_error::success;
+  return (Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) != 0;
 }
 
-std::error_code MachOObjectFile::isSectionData(DataRefImpl Sec,
-                                               bool &Result) const {
+// A section counts as data when it is not marked S_ATTR_PURE_INSTRUCTIONS and
+// its type is neither S_ZEROFILL nor S_GB_ZEROFILL.
+bool MachOObjectFile::isSectionData(DataRefImpl Sec) const {
   uint32_t Flags = getSectionFlags(this, Sec);
   unsigned SectionType = Flags & MachO::SECTION_TYPE;
-  Result = !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) &&
-           !(SectionType == MachO::S_ZEROFILL ||
-             SectionType == MachO::S_GB_ZEROFILL);
-  return object_error::success;
+  return !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) &&
+         SectionType != MachO::S_ZEROFILL &&
+         SectionType != MachO::S_GB_ZEROFILL;
 }
 
-std::error_code MachOObjectFile::isSectionBSS(DataRefImpl Sec,
-                                              bool &Result) const {
+// A section counts as BSS when its type is S_ZEROFILL or S_GB_ZEROFILL and it
+// is not marked S_ATTR_PURE_INSTRUCTIONS.
+bool MachOObjectFile::isSectionBSS(DataRefImpl Sec) const {
   uint32_t Flags = getSectionFlags(this, Sec);
   unsigned SectionType = Flags & MachO::SECTION_TYPE;
-  Result = !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS) &&
-           (SectionType == MachO::S_ZEROFILL ||
-            SectionType == MachO::S_GB_ZEROFILL);
-  return object_error::success;
+  const bool IsZeroFill = SectionType == MachO::S_ZEROFILL ||
+                          SectionType == MachO::S_GB_ZEROFILL;
+  return IsZeroFill && !(Flags & MachO::S_ATTR_PURE_INSTRUCTIONS);
 }
 
-std::error_code
-MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sec,
-                                               bool &Result) const {
+// Placeholder: the section argument is not inspected and every section is
+// reported as required for execution.
+bool MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sect) const {
   // FIXME: Unimplemented.
-  Result = true;
-  return object_error::success;
+  return true;
 }
 
-std::error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec,
-                                                  bool &Result) const {
+// Placeholder: the section argument is not inspected and no section is ever
+// reported as virtual.
+bool MachOObjectFile::isSectionVirtual(DataRefImpl Sec) const {
   // FIXME: Unimplemented.
-  Result = false;
-  return object_error::success;
+  return false;
 }
 
-std::error_code MachOObjectFile::isSectionZeroInit(DataRefImpl Sec,
-                                                   bool &Res) const {
+// A section is zero-initialised when its type is S_ZEROFILL or S_GB_ZEROFILL.
+bool MachOObjectFile::isSectionZeroInit(DataRefImpl Sec) const {
   uint32_t Flags = getSectionFlags(this, Sec);
   unsigned SectionType = Flags & MachO::SECTION_TYPE;
-  Res = SectionType == MachO::S_ZEROFILL ||
-    SectionType == MachO::S_GB_ZEROFILL;
-  return object_error::success;
+  switch (SectionType) {
+  case MachO::S_ZEROFILL:
+  case MachO::S_GB_ZEROFILL:
+    return true;
+  default:
+    return false;
+  }
 }
 
-std::error_code MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec,
-                                                       bool &Result) const {
+// Placeholder: the section argument is not inspected and no section is ever
+// reported as read-only data.
+bool MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec) const {
   // Consider using the code from isSectionText to look for __const sections.
   // Alternately, emit S_ATTR_PURE_INSTRUCTIONS and/or S_ATTR_SOME_INSTRUCTIONS
   // to use section attributes to distinguish code from data.
 
   // FIXME: Unimplemented.
-  Result = false;
-  return object_error::success;
+  return false;
 }
 
-std::error_code MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec,
-                                                       DataRefImpl Symb,
-                                                       bool &Result) const {
+// Return true when the address of Symb lies in the half-open range
+// [section address, section address + section size) of Sec.  Symbols whose
+// type is ST_Unknown are never considered contained.  The error codes
+// returned by getSymbolType() and getSymbolAddress() are not checked.
+bool MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec,
+                                            DataRefImpl Symb) const {
   SymbolRef::Type ST;
   this->getSymbolType(Symb, ST);
-  if (ST == SymbolRef::ST_Unknown) {
-    Result = false;
-    return object_error::success;
-  }
+  const bool IsUnknown = (ST == SymbolRef::ST_Unknown);
+  if (IsUnknown)
+    return false;
 
-  uint64_t SectBegin, SectEnd;
-  getSectionAddress(Sec, SectBegin);
-  getSectionSize(Sec, SectEnd);
+  const uint64_t SectBegin = getSectionAddress(Sec);
+  // Holds the section size here; the next statement turns it into the
+  // one-past-the-end address.
+  uint64_t SectEnd = getSectionSize(Sec);
   SectEnd += SectBegin;
 
   uint64_t SymAddr;
   getSymbolAddress(Symb, SymAddr);
-  Result = (SymAddr >= SectBegin) && (SymAddr < SectEnd);
-
-  return object_error::success;
+  return SectBegin <= SymAddr && SymAddr < SectEnd;
 }
 
 relocation_iterator MachOObjectFile::section_rel_begin(DataRefImpl Sec) const {
@@ -850,8 +652,7 @@
 
   DataRefImpl Sec;
   Sec.d.a = Rel.d.a;
-  uint64_t SecAddress;
-  getSectionAddress(Sec, SecAddress);
+  uint64_t SecAddress = getSectionAddress(Sec);
   Res = SecAddress + Offset;
   return object_error::success;
 }
@@ -956,7 +757,6 @@
         res = Table[RType];
       break;
     }
-    case Triple::arm64:
     case Triple::aarch64: {
       static const char *const Table[] = {
         "ARM64_RELOC_UNSIGNED",           "ARM64_RELOC_SUBTRACTOR",
@@ -1210,16 +1010,6 @@
   return object_error::success;
 }
 
-std::error_code MachOObjectFile::getLibraryNext(DataRefImpl LibData,
-                                                LibraryRef &Res) const {
-  report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
-}
-
-std::error_code MachOObjectFile::getLibraryPath(DataRefImpl LibData,
-                                                StringRef &Res) const {
-  report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
-}
-
 //
 // guessLibraryShortName() is passed a name of a dynamic library and returns a
 // guess on what the short name is.  Then name is returned as a substring of the
@@ -1368,31 +1158,26 @@
 // It is passed the index (0 - based) of the library as translated from
 // GET_LIBRARY_ORDINAL (1 - based).
 std::error_code MachOObjectFile::getLibraryShortNameByIndex(unsigned Index,
-                                                            StringRef &Res) {
+                                                         StringRef &Res) const {
   if (Index >= Libraries.size())
     return object_error::parse_failed;
 
-  MachO::dylib_command D =
-    getStruct<MachO::dylib_command>(this, Libraries[Index]);
-  if (D.dylib.name >= D.cmdsize)
-    return object_error::parse_failed;
-
   // If the cache of LibrariesShortNames is not built up do that first for
   // all the Libraries.
   if (LibrariesShortNames.size() == 0) {
     for (unsigned i = 0; i < Libraries.size(); i++) {
       MachO::dylib_command D =
         getStruct<MachO::dylib_command>(this, Libraries[i]);
-      if (D.dylib.name >= D.cmdsize) {
-        LibrariesShortNames.push_back(StringRef());
-        continue;
-      }
+      if (D.dylib.name >= D.cmdsize)
+        return object_error::parse_failed;
       const char *P = (const char *)(Libraries[i]) + D.dylib.name;
       StringRef Name = StringRef(P);
+      if (D.dylib.name+Name.size() >= D.cmdsize)
+        return object_error::parse_failed;
       StringRef Suffix;
       bool isFramework;
       StringRef shortName = guessLibraryShortName(Name, isFramework, Suffix);
-      if (shortName == StringRef())
+      if (shortName.empty())
         LibrariesShortNames.push_back(Name);
       else
         LibrariesShortNames.push_back(shortName);
@@ -1447,16 +1232,6 @@
   return section_iterator(SectionRef(DRI, this));
 }
 
-library_iterator MachOObjectFile::needed_library_begin() const {
-  // TODO: implement
-  report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
-}
-
-library_iterator MachOObjectFile::needed_library_end() const {
-  // TODO: implement
-  report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
-}
-
 uint8_t MachOObjectFile::getBytesInAddress() const {
   return is64Bit() ? 8 : 4;
 }
@@ -1472,17 +1247,10 @@
     case llvm::MachO::CPU_TYPE_POWERPC:
       return "Mach-O 32-bit ppc";
     default:
-      assert((CPUType & llvm::MachO::CPU_ARCH_ABI64) == 0 &&
-             "64-bit object file when we're not 64-bit?");
       return "Mach-O 32-bit unknown";
     }
   }
 
-  // Make sure the cpu type has the correct mask.
-  assert((CPUType & llvm::MachO::CPU_ARCH_ABI64)
-         == llvm::MachO::CPU_ARCH_ABI64 &&
-         "32-bit object file when we're 64-bit?");
-
   switch (CPUType) {
   case llvm::MachO::CPU_TYPE_X86_64:
     return "Mach-O 64-bit x86-64";
@@ -1504,7 +1272,7 @@
   case llvm::MachO::CPU_TYPE_ARM:
     return Triple::arm;
   case llvm::MachO::CPU_TYPE_ARM64:
-    return Triple::arm64;
+    return Triple::aarch64;
   case llvm::MachO::CPU_TYPE_POWERPC:
     return Triple::ppc;
   case llvm::MachO::CPU_TYPE_POWERPC64:
@@ -1514,7 +1282,11 @@
   }
 }
 
-Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType) {
+Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType,
+                                const char **McpuDefault) {
+  if (McpuDefault)
+    *McpuDefault = nullptr;
+
   switch (CPUType) {
   case MachO::CPU_TYPE_I386:
     switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
@@ -1538,15 +1310,25 @@
       return Triple("armv4t-apple-darwin");
     case MachO::CPU_SUBTYPE_ARM_V5TEJ:
       return Triple("armv5e-apple-darwin");
+    case MachO::CPU_SUBTYPE_ARM_XSCALE:
+      return Triple("xscale-apple-darwin");
     case MachO::CPU_SUBTYPE_ARM_V6:
       return Triple("armv6-apple-darwin");
     case MachO::CPU_SUBTYPE_ARM_V6M:
+      if (McpuDefault)
+        *McpuDefault = "cortex-m0";
       return Triple("armv6m-apple-darwin");
+    case MachO::CPU_SUBTYPE_ARM_V7:
+      return Triple("armv7-apple-darwin");
     case MachO::CPU_SUBTYPE_ARM_V7EM:
+      if (McpuDefault)
+        *McpuDefault = "cortex-m4";
       return Triple("armv7em-apple-darwin");
     case MachO::CPU_SUBTYPE_ARM_V7K:
       return Triple("armv7k-apple-darwin");
     case MachO::CPU_SUBTYPE_ARM_V7M:
+      if (McpuDefault)
+        *McpuDefault = "cortex-m3";
       return Triple("armv7m-apple-darwin");
     case MachO::CPU_SUBTYPE_ARM_V7S:
       return Triple("armv7s-apple-darwin");
@@ -1579,50 +1361,102 @@
   }
 }
 
+// Map a Mach-O cputype/cpusubtype pair to the Triple used for its Thumb code.
+//
+// A default-constructed Triple is returned for any CPU type other than
+// CPU_TYPE_ARM and for ARM subtypes not listed below.  When McpuDefault is
+// non-null it is first reset to nullptr and then, for the v6m, v7em and v7m
+// subtypes, set to the matching Cortex-M CPU name.
+Triple MachOObjectFile::getThumbArch(uint32_t CPUType, uint32_t CPUSubType,
+                                     const char **McpuDefault) {
+  if (McpuDefault)
+    *McpuDefault = nullptr;
+
+  // Only ARM objects have a Thumb triple.
+  if (CPUType != MachO::CPU_TYPE_ARM)
+    return Triple();
+
+  const char *TripleName = nullptr;
+  const char *DefaultCPU = nullptr;
+  switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
+  case MachO::CPU_SUBTYPE_ARM_V4T:
+    TripleName = "thumbv4t-apple-darwin";
+    break;
+  case MachO::CPU_SUBTYPE_ARM_V5TEJ:
+    TripleName = "thumbv5e-apple-darwin";
+    break;
+  case MachO::CPU_SUBTYPE_ARM_XSCALE:
+    // Same name as the non-Thumb lookup uses for this subtype.
+    TripleName = "xscale-apple-darwin";
+    break;
+  case MachO::CPU_SUBTYPE_ARM_V6:
+    TripleName = "thumbv6-apple-darwin";
+    break;
+  case MachO::CPU_SUBTYPE_ARM_V6M:
+    TripleName = "thumbv6m-apple-darwin";
+    DefaultCPU = "cortex-m0";
+    break;
+  case MachO::CPU_SUBTYPE_ARM_V7:
+    TripleName = "thumbv7-apple-darwin";
+    break;
+  case MachO::CPU_SUBTYPE_ARM_V7EM:
+    TripleName = "thumbv7em-apple-darwin";
+    DefaultCPU = "cortex-m4";
+    break;
+  case MachO::CPU_SUBTYPE_ARM_V7K:
+    TripleName = "thumbv7k-apple-darwin";
+    break;
+  case MachO::CPU_SUBTYPE_ARM_V7M:
+    TripleName = "thumbv7m-apple-darwin";
+    DefaultCPU = "cortex-m3";
+    break;
+  case MachO::CPU_SUBTYPE_ARM_V7S:
+    TripleName = "thumbv7s-apple-darwin";
+    break;
+  default:
+    return Triple();
+  }
+
+  // DefaultCPU is still nullptr for subtypes without a default, which matches
+  // the reset performed on entry.
+  if (McpuDefault)
+    *McpuDefault = DefaultCPU;
+  return Triple(TripleName);
+}
+
+// Return the Triple for CPUType/CPUSubType and, when ThumbTriple is non-null,
+// store the matching Thumb Triple through it.
+//
+// McpuDefault, when non-null, is written by both lookups in turn, so on return
+// it holds whatever getThumbArch() chose.  ThumbTriple may be null, in which
+// case only the returned Triple and McpuDefault are produced.
+Triple MachOObjectFile::getArch(uint32_t CPUType, uint32_t CPUSubType,
+                                const char **McpuDefault,
+                                Triple *ThumbTriple) {
+  Triple T = MachOObjectFile::getArch(CPUType, CPUSubType, McpuDefault);
+  // Always run the Thumb lookup so McpuDefault is updated the same way whether
+  // or not the caller asked for the Thumb triple.
+  Triple Thumb =
+      MachOObjectFile::getThumbArch(CPUType, CPUSubType, McpuDefault);
+  if (ThumbTriple)
+    *ThumbTriple = Thumb;
+  return T;
+}
+
 Triple MachOObjectFile::getHostArch() {
   return Triple(sys::getDefaultTargetTriple());
 }
 
-Triple MachOObjectFile::getArch(StringRef ArchFlag) {
-  if (ArchFlag == "i386")
-    return Triple("i386-apple-darwin");
-  else if (ArchFlag == "x86_64")
-    return Triple("x86_64-apple-darwin");
-  else if (ArchFlag == "x86_64h")
-    return Triple("x86_64h-apple-darwin");
-  else if (ArchFlag == "armv4t" || ArchFlag == "arm")
-    return Triple("armv4t-apple-darwin");
-  else if (ArchFlag == "armv5e")
-    return Triple("armv5e-apple-darwin");
-  else if (ArchFlag == "armv6")
-    return Triple("armv6-apple-darwin");
-  else if (ArchFlag == "armv6m")
-    return Triple("armv6m-apple-darwin");
-  else if (ArchFlag == "armv7em")
-    return Triple("armv7em-apple-darwin");
-  else if (ArchFlag == "armv7k")
-    return Triple("armv7k-apple-darwin");
-  else if (ArchFlag == "armv7k")
-    return Triple("armv7m-apple-darwin");
-  else if (ArchFlag == "armv7s")
-    return Triple("armv7s-apple-darwin");
-  else if (ArchFlag == "arm64")
-    return Triple("arm64-apple-darwin");
-  else if (ArchFlag == "ppc")
-    return Triple("ppc-apple-darwin");
-  else if (ArchFlag == "ppc64")
-    return Triple("ppc64-apple-darwin");
-  else
-    return Triple();
+// Return true when ArchFlag is exactly one of the architecture names listed
+// below; any other string yields false.
+bool MachOObjectFile::isValidArch(StringRef ArchFlag) {
+  return StringSwitch<bool>(ArchFlag)
+      .Cases("i386", "x86_64", "x86_64h", true)
+      .Cases("arm", "armv4t", "armv5e", true)
+      .Cases("armv6", "armv6m", true)
+      .Cases("armv7em", "armv7k", "armv7m", "armv7s", true)
+      .Cases("arm64", "ppc", "ppc64", true)
+      .Default(false);
 }
 
 unsigned MachOObjectFile::getArch() const {
   return getArch(getCPUType(this));
 }
 
-StringRef MachOObjectFile::getLoadName() const {
-  // TODO: Implement
-  report_fatal_error("get_load_name() unimplemented in MachOObjectFile");
+// Return the Triple for this file's cputype/cpusubtype header fields and, when
+// ThumbTriple is non-null, store the matching Thumb Triple through it.
+//
+// The fields come from the 64-bit or the 32-bit Mach-O header according to
+// is64Bit().  McpuDefault, when non-null, is written by both lookups in turn,
+// so on return it holds whatever getThumbArch() chose.  ThumbTriple may be
+// null.
+Triple MachOObjectFile::getArch(const char **McpuDefault,
+                                Triple *ThumbTriple) const {
+  uint32_t CPUType;
+  uint32_t CPUSubType;
+  if (is64Bit()) {
+    MachO::mach_header_64 H_64 = getHeader64();
+    CPUType = H_64.cputype;
+    CPUSubType = H_64.cpusubtype;
+  } else {
+    MachO::mach_header H = getHeader();
+    CPUType = H.cputype;
+    CPUSubType = H.cpusubtype;
+  }
+
+  Triple T = MachOObjectFile::getArch(CPUType, CPUSubType, McpuDefault);
+  // Always run the Thumb lookup so McpuDefault is updated the same way whether
+  // or not the caller asked for the Thumb triple.
+  Triple Thumb =
+      MachOObjectFile::getThumbArch(CPUType, CPUSubType, McpuDefault);
+  if (ThumbTriple)
+    *ThumbTriple = Thumb;
+  return T;
 }
 
 relocation_iterator MachOObjectFile::section_rel_begin(unsigned Index) const {
@@ -1658,6 +1492,620 @@
   return dice_iterator(DiceRef(DRI, this));
 }
 
+// Creates an export-trie entry over the bytes in T. The entry starts out
+// neither malformed nor done; it is positioned by moveToFirst() or
+// moveToEnd().
+ExportEntry::ExportEntry(ArrayRef<uint8_t> T) 
+  : Trie(T), Malformed(false), Done(false) { }
+
+// Positions the entry on the first export: pushes the node at trie offset 0
+// and then descends with pushDownUntilBottom().
+void ExportEntry::moveToFirst() {
+  pushNode(0);
+  pushDownUntilBottom();
+}
+
+// Puts the entry into the end state: empties the node stack and sets Done.
+void ExportEntry::moveToEnd() {
+  Stack.clear();
+  Done = true;
+}
+
+// Two entries compare equal when both are done, or when neither is done and
+// they have the same stack depth, the same accumulated name, and the same
+// node start pointer at every stack level.
+bool ExportEntry::operator==(const ExportEntry &Other) const {
+  // As soon as either side is done, only the Done flags decide the result.
+  if (Done || Other.Done)
+    return Done == Other.Done;
+  const size_t Depth = Stack.size();
+  if (Depth != Other.Stack.size())
+    return false;
+  if (!CumulativeString.str().equals(Other.CumulativeString.str()))
+    return false;
+  for (size_t Idx = 0; Idx != Depth; ++Idx)
+    if (Stack[Idx].Start != Other.Stack[Idx].Start)
+      return false;
+  return true;
+}
+
+// Decodes a ULEB128 value starting at Ptr and advances Ptr past it. If the
+// advanced pointer lies beyond the end of the trie, Ptr is clamped to
+// Trie.end() and the entry is flagged Malformed; the decoded value is
+// returned either way.
+// NOTE(review): the bounds check runs after decodeULEB128() has consumed the
+// bytes -- confirm whether the decoder itself can read past Trie.end().
+uint64_t ExportEntry::readULEB128(const uint8_t *&Ptr) {
+  unsigned Count;
+  uint64_t Result = decodeULEB128(Ptr, &Count);
+  Ptr += Count;
+  if (Ptr > Trie.end()) {
+    Ptr = Trie.end();
+    Malformed = true;
+  }
+  return Result;
+}
+
+// Returns the symbol name accumulated so far (the contents of
+// CumulativeString).
+StringRef ExportEntry::name() const {
+  return CumulativeString.str();
+}
+
+// Returns the Flags value of the node on top of the stack. Reads
+// Stack.back(); note that the stack is empty after moveToEnd().
+uint64_t ExportEntry::flags() const {
+  return Stack.back().Flags;
+}
+
+// Returns the Address value of the node on top of the stack. Reads
+// Stack.back(); note that the stack is empty after moveToEnd().
+uint64_t ExportEntry::address() const {
+  return Stack.back().Address;
+}
+
+// Returns the Other value of the node on top of the stack. pushNode() stores
+// the dylib ordinal here for re-exports and an extra ULEB128 value for
+// stub-and-resolver exports.
+uint64_t ExportEntry::other() const {
+  return Stack.back().Other;
+}
+
+// Returns the ImportName of the top-of-stack node as a StringRef, or an empty
+// StringRef when that node has no import name.
+StringRef ExportEntry::otherName() const {
+  if (const char *ImportName = Stack.back().ImportName)
+    return StringRef(ImportName);
+  return StringRef();
+}
+
+// Returns the distance in bytes from the start of the trie to the start of
+// the node on top of the stack.
+uint32_t ExportEntry::nodeOffset() const {
+  return Stack.back().Start - Trie.begin();
+}
+
+// Creates the state for a node beginning at Ptr. Start and Current both point
+// at Ptr; every other field is zero, null or false until pushNode() fills it
+// in.
+ExportEntry::NodeState::NodeState(const uint8_t *Ptr) 
+  : Start(Ptr), Current(Ptr), Flags(0), Address(0), Other(0), 
+    ImportName(nullptr), ChildCount(0), NextChildIndex(0),  
+    ParentStringLength(0), IsExportNode(false) {
+}
+
+// Parses the trie node located `offset` bytes from the start of the trie and
+// pushes its state onto the stack. A node begins with a ULEB128 export-info
+// size. When that size is non-zero the node is an export node: its flags are
+// decoded, followed either by a dylib ordinal and import name (re-export) or
+// by an address and, for stub-and-resolver exports, one more ULEB128 value.
+// The byte right after the export info holds the child count.
+//
+// If `offset`, or the position of the child-count byte, lies outside the
+// trie, the entry is flagged Malformed and an inert node (no children, not an
+// export node) is pushed instead, so callers that read Stack.back() always
+// find a node and no byte outside the trie is dereferenced here.
+void ExportEntry::pushNode(uint64_t offset) {
+  const bool OffsetInRange = offset < Trie.size();
+  NodeState State(OffsetInRange ? Trie.begin() + offset : Trie.end());
+  State.NextChildIndex = 0;
+  State.ParentStringLength = CumulativeString.size();
+  if (!OffsetInRange) {
+    Malformed = true;
+    Stack.push_back(State);
+    return;
+  }
+  uint64_t ExportInfoSize = readULEB128(State.Current);
+  // readULEB128() clamps State.Current to Trie.end(), so this cannot wrap.
+  uint64_t Remaining = Trie.end() - State.Current;
+  if (ExportInfoSize >= Remaining) {
+    // The child-count byte would lie outside the trie.
+    Malformed = true;
+    Stack.push_back(State);
+    return;
+  }
+  State.IsExportNode = (ExportInfoSize != 0);
+  const uint8_t *Children = State.Current + ExportInfoSize;
+  if (State.IsExportNode) {
+    State.Flags = readULEB128(State.Current);
+    if (State.Flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) {
+      State.Address = 0;
+      State.Other = readULEB128(State.Current); // dylib ordinal
+      State.ImportName = reinterpret_cast<const char *>(State.Current);
+    } else {
+      State.Address = readULEB128(State.Current);
+      if (State.Flags & MachO::EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER)
+        State.Other = readULEB128(State.Current);
+    }
+  }
+  State.ChildCount = *Children;
+  State.Current = Children + 1;
+  Stack.push_back(State);
+}
+
+// Descends from the node on top of the stack, always following that node's
+// next unvisited child, until the top node has no unvisited children. Each
+// edge is stored as a NUL-terminated label, which is appended to
+// CumulativeString, followed by the ULEB128 offset of the child node.
+//
+// The trie is treated as malformed, and the entry moved to the end state,
+// when an edge label is not terminated inside the trie or when the node
+// finally reached is not an export node.
+void ExportEntry::pushDownUntilBottom() {
+  while (Stack.back().NextChildIndex < Stack.back().ChildCount) {
+    NodeState &Top = Stack.back();
+    CumulativeString.resize(Top.ParentStringLength);
+    for (; Top.Current < Trie.end() && *Top.Current != 0; Top.Current++) {
+      char C = *Top.Current;
+      CumulativeString.push_back(C);
+    }
+    if (Top.Current >= Trie.end()) {
+      // The label ran off the end of the trie without a terminating NUL.
+      Malformed = true;
+      moveToEnd();
+      return;
+    }
+    Top.Current += 1;
+    uint64_t childNodeIndex = readULEB128(Top.Current);
+    Top.NextChildIndex += 1;
+    // Top must not be used after this call: pushNode() grows the stack.
+    pushNode(childNodeIndex);
+  }
+  if (!Stack.back().IsExportNode) {
+    Malformed = true;
+    moveToEnd();
+  }
+}
+
+// The export trie has to be walked in a way that fits the C++ iterator model.
+// This is done with a non-recursive depth-first traversal: the iterator holds
+// a stack of parent nodes plus a string that accumulates every edge label
+// along the chain of parents down to the current node.
+//
+// There is one "export" node for each exported symbol. Because one symbol may
+// be a prefix of another (e.g. _dup and _dup2), an export node may have child
+// nodes too.
+//
+// moveNext() pops the current node and then looks at each ancestor in turn.
+// If the ancestor still has unvisited children, it descends through the next
+// one until it reaches a node with no children (which must be an export node,
+// otherwise the trie is malformed). If all of the ancestor's children have
+// been visited and the ancestor is itself an export node, the ancestor becomes
+// the current entry. Otherwise the ancestor is popped as well. When the stack
+// runs empty the traversal is done.
+void ExportEntry::moveNext() {
+  // The current position must be an export node; anything else is an error.
+  if (Stack.empty() || !Stack.back().IsExportNode) {
+    Malformed = true;
+    moveToEnd();
+    return;
+  }
+
+  Stack.pop_back();
+  while (!Stack.empty()) {
+    NodeState &Top = Stack.back();
+    if (Top.NextChildIndex < Top.ChildCount) {
+      // Descend to the next export node below this ancestor.
+      pushDownUntilBottom();
+      return;
+    }
+    if (Top.IsExportNode) {
+      // Every child has been visited and this node is itself an export, so it
+      // is reported now, with the name cut back to this node's own name.
+      CumulativeString.resize(Top.ParentStringLength);
+      return;
+    }
+    Stack.pop_back();
+  }
+  Done = true;
+}
+
+// Returns the range of export entries encoded in Trie. The begin iterator is
+// positioned on the first export and the end iterator is in the end state.
+// An empty trie (for example when the file has no dyld info load command)
+// yields an empty range.
+iterator_range<export_iterator>
+MachOObjectFile::exports(ArrayRef<uint8_t> Trie) {
+  ExportEntry Start(Trie);
+  // An empty trie has no root node to parse; moveToFirst() would read a node
+  // at Trie.begin(), so begin must simply equal end in that case.
+  if (Trie.empty())
+    Start.moveToEnd();
+  else
+    Start.moveToFirst();
+
+  ExportEntry Finish(Trie);
+  Finish.moveToEnd();
+
+  return iterator_range<export_iterator>(export_iterator(Start),
+                                         export_iterator(Finish));
+}
+
+// Returns the export entries of this file, taken from the trie returned by
+// getDyldInfoExportsTrie().
+iterator_range<export_iterator> MachOObjectFile::exports() const {
+  return exports(getDyldInfoExportsTrie());
+}
+
+
+// Creates a rebase entry over the opcode bytes in Bytes, with the read
+// position at the first byte. PointerSize is 8 when is64Bit is true and 4
+// otherwise; all other state starts at zero/false.
+MachORebaseEntry::MachORebaseEntry(ArrayRef<uint8_t> Bytes, bool is64Bit)
+    : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0),
+      RemainingLoopCount(0), AdvanceAmount(0), RebaseType(0),
+      PointerSize(is64Bit ? 8 : 4), Malformed(false), Done(false) {}
+
+// Rewinds the read position to the start of the opcodes and advances to the
+// first rebase location with moveNext().
+void MachORebaseEntry::moveToFirst() {
+  Ptr = Opcodes.begin();
+  moveNext();
+}
+
+// Puts the entry into the end state: read position at the end of the opcodes,
+// no pending loop iterations, Done set. These are the three fields that
+// operator== compares.
+void MachORebaseEntry::moveToEnd() {
+  Ptr = Opcodes.end();
+  RemainingLoopCount = 0;
+  Done = true;
+}
+
+// Advances to the next rebase location. While a multi-location opcode is
+// still in progress (RemainingLoopCount != 0) this only steps SegmentOffset
+// by AdvanceAmount. Otherwise opcodes are decoded until one of the DO_REBASE
+// opcodes produces a location.
+//
+// The entry is moved to the end state when REBASE_OPCODE_DONE is seen, when
+// the opcode bytes run out, or when the stream turns out to be malformed
+// (unknown opcode, or a ULEB128 operand running past the end of the opcodes
+// while decoding a non-emitting opcode). Malformed stays set in that case.
+void MachORebaseEntry::moveNext() {
+  // If in the middle of some loop, move to next rebasing in loop.
+  SegmentOffset += AdvanceAmount;
+  if (RemainingLoopCount) {
+    --RemainingLoopCount;
+    return;
+  }
+  if (Ptr == Opcodes.end()) {
+    Done = true;
+    return;
+  }
+  bool More = true;
+  while (More && !Malformed) {
+    // The stream may end on a non-emitting opcode without a trailing
+    // REBASE_OPCODE_DONE; stop instead of reading past the opcode bytes.
+    if (Ptr == Opcodes.end()) {
+      moveToEnd();
+      return;
+    }
+    // Parse next opcode and set up next loop.
+    uint8_t Byte = *Ptr++;
+    uint8_t ImmValue = Byte & MachO::REBASE_IMMEDIATE_MASK;
+    uint8_t Opcode = Byte & MachO::REBASE_OPCODE_MASK;
+    switch (Opcode) {
+    case MachO::REBASE_OPCODE_DONE:
+      More = false;
+      Done = true;
+      moveToEnd();
+      DEBUG_WITH_TYPE("mach-o-rebase", llvm::dbgs() << "REBASE_OPCODE_DONE\n");
+      break;
+    case MachO::REBASE_OPCODE_SET_TYPE_IMM:
+      RebaseType = ImmValue;
+      DEBUG_WITH_TYPE(
+          "mach-o-rebase",
+          llvm::dbgs() << "REBASE_OPCODE_SET_TYPE_IMM: "
+                       << "RebaseType=" << (int) RebaseType << "\n");
+      break;
+    case MachO::REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
+      SegmentIndex = ImmValue;
+      SegmentOffset = readULEB128();
+      DEBUG_WITH_TYPE(
+          "mach-o-rebase",
+          llvm::dbgs() << "REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
+                       << "SegmentIndex=" << SegmentIndex << ", "
+                       << format("SegmentOffset=0x%06X", SegmentOffset)
+                       << "\n");
+      break;
+    case MachO::REBASE_OPCODE_ADD_ADDR_ULEB:
+      SegmentOffset += readULEB128();
+      DEBUG_WITH_TYPE("mach-o-rebase",
+                      llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_ULEB: "
+                                   << format("SegmentOffset=0x%06X",
+                                             SegmentOffset) << "\n");
+      break;
+    case MachO::REBASE_OPCODE_ADD_ADDR_IMM_SCALED:
+      SegmentOffset += ImmValue * PointerSize;
+      DEBUG_WITH_TYPE("mach-o-rebase",
+                      llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_IMM_SCALED: "
+                                   << format("SegmentOffset=0x%06X",
+                                             SegmentOffset) << "\n");
+      break;
+    case MachO::REBASE_OPCODE_DO_REBASE_IMM_TIMES:
+      AdvanceAmount = PointerSize;
+      RemainingLoopCount = ImmValue - 1;
+      DEBUG_WITH_TYPE(
+          "mach-o-rebase",
+          llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_IMM_TIMES: "
+                       << format("SegmentOffset=0x%06X", SegmentOffset)
+                       << ", AdvanceAmount=" << AdvanceAmount
+                       << ", RemainingLoopCount=" << RemainingLoopCount
+                       << "\n");
+      return;
+    case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES:
+      AdvanceAmount = PointerSize;
+      RemainingLoopCount = readULEB128() - 1;
+      DEBUG_WITH_TYPE(
+          "mach-o-rebase",
+          llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES: "
+                       << format("SegmentOffset=0x%06X", SegmentOffset)
+                       << ", AdvanceAmount=" << AdvanceAmount
+                       << ", RemainingLoopCount=" << RemainingLoopCount
+                       << "\n");
+      return;
+    case MachO::REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB:
+      AdvanceAmount = readULEB128() + PointerSize;
+      RemainingLoopCount = 0;
+      DEBUG_WITH_TYPE(
+          "mach-o-rebase",
+          llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB: "
+                       << format("SegmentOffset=0x%06X", SegmentOffset)
+                       << ", AdvanceAmount=" << AdvanceAmount
+                       << ", RemainingLoopCount=" << RemainingLoopCount
+                       << "\n");
+      return;
+    case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB:
+      RemainingLoopCount = readULEB128() - 1;
+      AdvanceAmount = readULEB128() + PointerSize;
+      DEBUG_WITH_TYPE(
+          "mach-o-rebase",
+          llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB: "
+                       << format("SegmentOffset=0x%06X", SegmentOffset)
+                       << ", AdvanceAmount=" << AdvanceAmount
+                       << ", RemainingLoopCount=" << RemainingLoopCount
+                       << "\n");
+      return;
+    default:
+      Malformed = true;
+    }
+  }
+  // A malformed stream must still end the iteration: without this the entry
+  // would never compare equal to the end entry built by moveToEnd().
+  if (Malformed)
+    moveToEnd();
+}
+
+// Decodes a ULEB128 value at the current read position and advances past it.
+// If the new position lies beyond the end of the opcodes, it is clamped to
+// Opcodes.end() and the entry is flagged Malformed; the decoded value is
+// returned either way.
+// NOTE(review): the bounds check runs after decodeULEB128() has consumed the
+// bytes -- confirm whether the decoder itself can read past Opcodes.end().
+uint64_t MachORebaseEntry::readULEB128() {
+  unsigned Count;
+  uint64_t Result = decodeULEB128(Ptr, &Count);
+  Ptr += Count;
+  if (Ptr > Opcodes.end()) {
+    Ptr = Opcodes.end();
+    Malformed = true;
+  }
+  return Result;
+}
+
+// Returns the segment index last set by a SET_SEGMENT_AND_OFFSET opcode.
+uint32_t MachORebaseEntry::segmentIndex() const { return SegmentIndex; }
+
+// Returns the current offset within the segment, as maintained by moveNext().
+uint64_t MachORebaseEntry::segmentOffset() const { return SegmentOffset; }
+
+// Returns a human-readable name for the current RebaseType, or "unknown" for
+// any value not listed in the switch.
+StringRef MachORebaseEntry::typeName() const {
+  switch (RebaseType) {
+  case MachO::REBASE_TYPE_POINTER:
+    return "pointer";
+  case MachO::REBASE_TYPE_TEXT_ABSOLUTE32:
+    return "text abs32";
+  case MachO::REBASE_TYPE_TEXT_PCREL32:
+    return "text rel32";
+  }
+  return "unknown";
+}
+
+// Entries are equal when they are at the same read position, have the same
+// number of pending loop iterations and agree on Done. Both entries must
+// refer to the same opcode bytes (asserted).
+bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const {
+  assert(Opcodes == Other.Opcodes && "compare iterators of different files");
+  if (Ptr != Other.Ptr)
+    return false;
+  if (RemainingLoopCount != Other.RemainingLoopCount)
+    return false;
+  return Done == Other.Done;
+}
+
+// Returns the range of rebase entries encoded in Opcodes. is64 selects the
+// pointer size used by the entries (see the MachORebaseEntry constructor).
+iterator_range<rebase_iterator>
+MachOObjectFile::rebaseTable(ArrayRef<uint8_t> Opcodes, bool is64) {
+  // Begin: positioned on the first rebase location.
+  MachORebaseEntry First(Opcodes, is64);
+  First.moveToFirst();
+
+  // End: an entry in the end state over the same opcodes.
+  MachORebaseEntry Last(Opcodes, is64);
+  Last.moveToEnd();
+
+  return iterator_range<rebase_iterator>(rebase_iterator(First),
+                                         rebase_iterator(Last));
+}
+
+// Returns the rebase entries of this file, decoded from
+// getDyldInfoRebaseOpcodes() with the pointer size chosen by is64Bit().
+iterator_range<rebase_iterator> MachOObjectFile::rebaseTable() const {
+  return rebaseTable(getDyldInfoRebaseOpcodes(), is64Bit());
+}
+
+
+// Creates a bind entry over the opcode bytes in Bytes, with the read position
+// at the first byte. PointerSize is 8 when is64Bit is true and 4 otherwise;
+// BK records which kind of bind table is being decoded. All other state
+// starts at zero/false.
+MachOBindEntry::MachOBindEntry(ArrayRef<uint8_t> Bytes, bool is64Bit,
+                               Kind BK)
+    : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0),
+      Ordinal(0), Flags(0), Addend(0), RemainingLoopCount(0), AdvanceAmount(0),
+      BindType(0), PointerSize(is64Bit ? 8 : 4),
+      TableKind(BK), Malformed(false), Done(false) {}
+
+// Rewinds the read position to the start of the opcodes and advances to the
+// first binding with moveNext().
+void MachOBindEntry::moveToFirst() {
+  Ptr = Opcodes.begin();
+  moveNext();
+}
+
+// Puts the entry into the end state: read position at the end of the opcodes,
+// no pending loop iterations, Done set. These are the three fields that
+// operator== compares.
+void MachOBindEntry::moveToEnd() {
+  Ptr = Opcodes.end();
+  RemainingLoopCount = 0;
+  Done = true;
+}
+
+// Advances to the next binding. While a multi-location opcode is still in
+// progress (RemainingLoopCount != 0) this only steps SegmentOffset by
+// AdvanceAmount. Otherwise opcodes are decoded, updating the ordinal, symbol
+// name, flags, type, addend and segment position, until one of the DO_BIND
+// opcodes produces a binding. In a weak bind table a symbol flagged
+// BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION is also reported as an entry.
+//
+// The entry is moved to the end state when the terminating BIND_OPCODE_DONE
+// is seen, when the opcode bytes run out, or when the stream turns out to be
+// malformed while decoding a non-emitting opcode (unknown opcode,
+// unterminated symbol name, a LEB128 operand running past the end, or an
+// opcode that is set to Malformed for lazy tables). Malformed stays set in
+// that case.
+void MachOBindEntry::moveNext() {
+  // If in the middle of some loop, move to next binding in loop.
+  SegmentOffset += AdvanceAmount;
+  if (RemainingLoopCount) {
+    --RemainingLoopCount;
+    return;
+  }
+  if (Ptr == Opcodes.end()) {
+    Done = true;
+    return;
+  }
+  bool More = true;
+  while (More && !Malformed) {
+    // The stream may end on a non-emitting opcode without a trailing
+    // BIND_OPCODE_DONE; stop instead of reading past the opcode bytes.
+    if (Ptr == Opcodes.end()) {
+      moveToEnd();
+      return;
+    }
+    // Parse next opcode and set up next loop.
+    uint8_t Byte = *Ptr++;
+    uint8_t ImmValue = Byte & MachO::BIND_IMMEDIATE_MASK;
+    uint8_t Opcode = Byte & MachO::BIND_OPCODE_MASK;
+    int8_t SignExtended;
+    const uint8_t *SymStart;
+    switch (Opcode) {
+    case MachO::BIND_OPCODE_DONE:
+      if (TableKind == Kind::Lazy) {
+        // Lazy bindings have a DONE opcode between entries.  Need to ignore
+        // it to advance to next entry.  But need not if this is last entry.
+        bool NotLastEntry = false;
+        for (const uint8_t *P = Ptr; P < Opcodes.end(); ++P) {
+          if (*P) {
+            NotLastEntry = true;
+            break; // One non-zero byte is enough to decide.
+          }
+        }
+        if (NotLastEntry)
+          break;
+      }
+      More = false;
+      Done = true;
+      moveToEnd();
+      DEBUG_WITH_TYPE("mach-o-bind", llvm::dbgs() << "BIND_OPCODE_DONE\n");
+      break;
+    case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_IMM:
+      Ordinal = ImmValue;
+      DEBUG_WITH_TYPE(
+          "mach-o-bind",
+          llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_IMM: "
+                       << "Ordinal=" << Ordinal << "\n");
+      break;
+    case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB:
+      Ordinal = readULEB128();
+      DEBUG_WITH_TYPE(
+          "mach-o-bind",
+          llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB: "
+                       << "Ordinal=" << Ordinal << "\n");
+      break;
+    case MachO::BIND_OPCODE_SET_DYLIB_SPECIAL_IMM:
+      if (ImmValue) {
+        // OR-ing in the opcode mask sets the high bits, so the int8_t holds
+        // the immediate sign-extended to a negative value.
+        SignExtended = MachO::BIND_OPCODE_MASK | ImmValue;
+        Ordinal = SignExtended;
+      } else
+        Ordinal = 0;
+      DEBUG_WITH_TYPE(
+          "mach-o-bind",
+          llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_SPECIAL_IMM: "
+                       << "Ordinal=" << Ordinal << "\n");
+      break;
+    case MachO::BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM:
+      Flags = ImmValue;
+      SymStart = Ptr;
+      while (Ptr != Opcodes.end() && *Ptr) {
+        ++Ptr;
+      }
+      if (Ptr == Opcodes.end()) {
+        // The symbol name is not NUL-terminated inside the opcode bytes.
+        Malformed = true;
+        break;
+      }
+      SymbolName = StringRef(reinterpret_cast<const char*>(SymStart),
+                             Ptr-SymStart);
+      ++Ptr;
+      DEBUG_WITH_TYPE(
+          "mach-o-bind",
+          llvm::dbgs() << "BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM: "
+                       << "SymbolName=" << SymbolName << "\n");
+      if (TableKind == Kind::Weak) {
+        if (ImmValue & MachO::BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION)
+          return;
+      }
+      break;
+    case MachO::BIND_OPCODE_SET_TYPE_IMM:
+      BindType = ImmValue;
+      DEBUG_WITH_TYPE(
+          "mach-o-bind",
+          llvm::dbgs() << "BIND_OPCODE_SET_TYPE_IMM: "
+                       << "BindType=" << (int)BindType << "\n");
+      break;
+    case MachO::BIND_OPCODE_SET_ADDEND_SLEB:
+      Addend = readSLEB128();
+      if (TableKind == Kind::Lazy)
+        Malformed = true;
+      DEBUG_WITH_TYPE(
+          "mach-o-bind",
+          llvm::dbgs() << "BIND_OPCODE_SET_ADDEND_SLEB: "
+                       << "Addend=" << Addend << "\n");
+      break;
+    case MachO::BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
+      SegmentIndex = ImmValue;
+      SegmentOffset = readULEB128();
+      DEBUG_WITH_TYPE(
+          "mach-o-bind",
+          llvm::dbgs() << "BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
+                       << "SegmentIndex=" << SegmentIndex << ", "
+                       << format("SegmentOffset=0x%06X", SegmentOffset)
+                       << "\n");
+      break;
+    case MachO::BIND_OPCODE_ADD_ADDR_ULEB:
+      SegmentOffset += readULEB128();
+      DEBUG_WITH_TYPE("mach-o-bind",
+                      llvm::dbgs() << "BIND_OPCODE_ADD_ADDR_ULEB: "
+                                   << format("SegmentOffset=0x%06X",
+                                             SegmentOffset) << "\n");
+      break;
+    case MachO::BIND_OPCODE_DO_BIND:
+      AdvanceAmount = PointerSize;
+      RemainingLoopCount = 0;
+      DEBUG_WITH_TYPE("mach-o-bind",
+                      llvm::dbgs() << "BIND_OPCODE_DO_BIND: "
+                                   << format("SegmentOffset=0x%06X",
+                                             SegmentOffset) << "\n");
+      return;
+    case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB:
+      AdvanceAmount = readULEB128() + PointerSize;
+      RemainingLoopCount = 0;
+      if (TableKind == Kind::Lazy)
+        Malformed = true;
+      DEBUG_WITH_TYPE(
+          "mach-o-bind",
+          llvm::dbgs() << "BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB: "
+                       << format("SegmentOffset=0x%06X", SegmentOffset)
+                       << ", AdvanceAmount=" << AdvanceAmount
+                       << ", RemainingLoopCount=" << RemainingLoopCount
+                       << "\n");
+      return;
+    case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED:
+      AdvanceAmount = ImmValue * PointerSize + PointerSize;
+      RemainingLoopCount = 0;
+      if (TableKind == Kind::Lazy)
+        Malformed = true;
+      DEBUG_WITH_TYPE("mach-o-bind",
+                      llvm::dbgs()
+                      << "BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED: "
+                      << format("SegmentOffset=0x%06X",
+                                             SegmentOffset) << "\n");
+      return;
+    case MachO::BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB:
+      RemainingLoopCount = readULEB128() - 1;
+      AdvanceAmount = readULEB128() + PointerSize;
+      if (TableKind == Kind::Lazy)
+        Malformed = true;
+      DEBUG_WITH_TYPE(
+          "mach-o-bind",
+          llvm::dbgs() << "BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB: "
+                       << format("SegmentOffset=0x%06X", SegmentOffset)
+                       << ", AdvanceAmount=" << AdvanceAmount
+                       << ", RemainingLoopCount=" << RemainingLoopCount
+                       << "\n");
+      return;
+    default:
+      Malformed = true;
+    }
+  }
+  // A malformed stream must still end the iteration: without this the entry
+  // would never compare equal to the end entry built by moveToEnd().
+  if (Malformed)
+    moveToEnd();
+}
+
+// Decodes a ULEB128 value at the current read position and advances past it.
+// If the new position lies beyond the end of the opcodes, it is clamped to
+// Opcodes.end() and the entry is flagged Malformed; the decoded value is
+// returned either way.
+// NOTE(review): the bounds check runs after decodeULEB128() has consumed the
+// bytes -- confirm whether the decoder itself can read past Opcodes.end().
+uint64_t MachOBindEntry::readULEB128() {
+  unsigned Count;
+  uint64_t Result = decodeULEB128(Ptr, &Count);
+  Ptr += Count;
+  if (Ptr > Opcodes.end()) {
+    Ptr = Opcodes.end();
+    Malformed = true;
+  }
+  return Result;
+}
+
+// Signed counterpart of readULEB128(): decodes an SLEB128 value at the
+// current read position and advances past it, clamping the position to
+// Opcodes.end() and flagging Malformed if it would go beyond the end.
+int64_t MachOBindEntry::readSLEB128() {
+  unsigned Count;
+  int64_t Result = decodeSLEB128(Ptr, &Count);
+  Ptr += Count;
+  if (Ptr > Opcodes.end()) {
+    Ptr = Opcodes.end();
+    Malformed = true;
+  }
+  return Result;
+}
+
+
+// Returns the segment index last set by a SET_SEGMENT_AND_OFFSET opcode.
+uint32_t MachOBindEntry::segmentIndex() const { return SegmentIndex; }
+
+// Returns the current offset within the segment, as maintained by moveNext().
+uint64_t MachOBindEntry::segmentOffset() const { return SegmentOffset; }
+
+// Returns a human-readable name for the current BindType, or "unknown" for
+// any value not listed in the switch.
+StringRef MachOBindEntry::typeName() const {
+  switch (BindType) {
+  case MachO::BIND_TYPE_POINTER:
+    return "pointer";
+  case MachO::BIND_TYPE_TEXT_ABSOLUTE32:
+    return "text abs32";
+  case MachO::BIND_TYPE_TEXT_PCREL32:
+    return "text rel32";
+  }
+  return "unknown";
+}
+
+// Returns the symbol name last set by a SET_SYMBOL_TRAILING_FLAGS_IMM opcode.
+// The StringRef points into the opcode bytes.
+StringRef MachOBindEntry::symbolName() const { return SymbolName; }
+
+// Returns the addend last set by a SET_ADDEND_SLEB opcode (0 if none was).
+int64_t MachOBindEntry::addend() const { return Addend; }
+
+// Returns the symbol flags: the immediate of the last
+// SET_SYMBOL_TRAILING_FLAGS_IMM opcode.
+uint32_t MachOBindEntry::flags() const { return Flags; }
+
+// Returns the dylib ordinal last set by one of the SET_DYLIB_* opcodes; the
+// SPECIAL_IMM form produces zero or a negative value.
+int MachOBindEntry::ordinal() const { return Ordinal; }
+
+// Entries are equal when they are at the same read position, have the same
+// number of pending loop iterations and agree on Done. Both entries must
+// refer to the same opcode bytes (asserted).
+bool MachOBindEntry::operator==(const MachOBindEntry &Other) const {
+  assert(Opcodes == Other.Opcodes && "compare iterators of different files");
+  if (Ptr != Other.Ptr)
+    return false;
+  if (RemainingLoopCount != Other.RemainingLoopCount)
+    return false;
+  return Done == Other.Done;
+}
+
+// Returns the range of bind entries encoded in Opcodes. is64 selects the
+// pointer size and BKind the table kind used by the entries (see the
+// MachOBindEntry constructor).
+iterator_range<bind_iterator>
+MachOObjectFile::bindTable(ArrayRef<uint8_t> Opcodes, bool is64,
+                           MachOBindEntry::Kind BKind) {
+  // Begin: positioned on the first binding.
+  MachOBindEntry First(Opcodes, is64, BKind);
+  First.moveToFirst();
+
+  // End: an entry in the end state over the same opcodes.
+  MachOBindEntry Last(Opcodes, is64, BKind);
+  Last.moveToEnd();
+
+  return iterator_range<bind_iterator>(bind_iterator(First),
+                                       bind_iterator(Last));
+}
+
+// Returns the regular bind entries of this file, decoded from
+// getDyldInfoBindOpcodes().
+iterator_range<bind_iterator> MachOObjectFile::bindTable() const {
+  return bindTable(getDyldInfoBindOpcodes(), is64Bit(),
+                   MachOBindEntry::Kind::Regular);
+}
+
+// Returns the lazy bind entries of this file, decoded from
+// getDyldInfoLazyBindOpcodes().
+iterator_range<bind_iterator> MachOObjectFile::lazyBindTable() const {
+  return bindTable(getDyldInfoLazyBindOpcodes(), is64Bit(),
+                   MachOBindEntry::Kind::Lazy);
+}
+
+// Returns the weak bind entries of this file, decoded from
+// getDyldInfoWeakBindOpcodes().
+iterator_range<bind_iterator> MachOObjectFile::weakBindTable() const {
+  return bindTable(getDyldInfoWeakBindOpcodes(), is64Bit(),
+                   MachOBindEntry::Kind::Weak);
+}
+
 StringRef
 MachOObjectFile::getSectionFinalSegmentName(DataRefImpl Sec) const {
   ArrayRef<char> Raw = getSectionRawFinalSegmentName(Sec);
@@ -1668,14 +2116,14 @@
 MachOObjectFile::getSectionRawName(DataRefImpl Sec) const {
   const section_base *Base =
     reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
-  return ArrayRef<char>(Base->sectname);
+  return makeArrayRef(Base->sectname);
 }
 
 ArrayRef<char>
 MachOObjectFile::getSectionRawFinalSegmentName(DataRefImpl Sec) const {
   const section_base *Base =
     reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
-  return ArrayRef<char>(Base->segname);
+  return makeArrayRef(Base->segname);
 }
 
 bool
@@ -1710,6 +2158,11 @@
   return RE.r_word1;
 }
 
+// Extracts the relocation type of a scattered relocation: the four bits of
+// r_word0 starting at bit 24.
+uint32_t MachOObjectFile::getScatteredRelocationType(
+    const MachO::any_relocation_info &RE) const {
+  return (RE.r_word0 >> 24) & 0xf;
+}
+
 unsigned MachOObjectFile::getAnyRelocationAddress(
     const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE))
@@ -1831,6 +2284,31 @@
   return getStruct<MachO::dylib_command>(this, L.Ptr);
 }
 
+// Reads the load command at L.Ptr as a MachO::dyld_info_command via
+// getStruct().
+MachO::dyld_info_command
+MachOObjectFile::getDyldInfoLoadCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::dyld_info_command>(this, L.Ptr);
+}
+
+// Reads the load command at L.Ptr as a MachO::dylinker_command via
+// getStruct().
+MachO::dylinker_command
+MachOObjectFile::getDylinkerCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::dylinker_command>(this, L.Ptr);
+}
+
+// Reads the load command at L.Ptr as a MachO::uuid_command via getStruct().
+MachO::uuid_command
+MachOObjectFile::getUuidCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::uuid_command>(this, L.Ptr);
+}
+
+// Reads the load command at L.Ptr as a MachO::source_version_command via
+// getStruct().
+MachO::source_version_command
+MachOObjectFile::getSourceVersionCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::source_version_command>(this, L.Ptr);
+}
+
+// Reads the load command at L.Ptr as a MachO::entry_point_command via
+// getStruct().
+MachO::entry_point_command
+MachOObjectFile::getEntryPointCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::entry_point_command>(this, L.Ptr);
+}
+
 
 MachO::any_relocation_info
 MachOObjectFile::getRelocation(DataRefImpl Rel) const {
@@ -1880,11 +2358,47 @@
 }
 
 MachO::symtab_command MachOObjectFile::getSymtabLoadCommand() const {
-  return getStruct<MachO::symtab_command>(this, SymtabLoadCmd);
+  if (SymtabLoadCmd)
+    return getStruct<MachO::symtab_command>(this, SymtabLoadCmd);
+
+  // If there is no SymtabLoadCmd return a load command with zero'ed fields.
+  MachO::symtab_command Cmd;
+  Cmd.cmd = MachO::LC_SYMTAB;
+  Cmd.cmdsize = sizeof(MachO::symtab_command);
+  Cmd.symoff = 0;
+  Cmd.nsyms = 0;
+  Cmd.stroff = 0;
+  Cmd.strsize = 0;
+  return Cmd;
 }
 
 MachO::dysymtab_command MachOObjectFile::getDysymtabLoadCommand() const {
-  return getStruct<MachO::dysymtab_command>(this, DysymtabLoadCmd);
+  if (DysymtabLoadCmd)
+    return getStruct<MachO::dysymtab_command>(this, DysymtabLoadCmd);
+
+  // If there is no DysymtabLoadCmd return a load command with zero'ed fields.
+  MachO::dysymtab_command Cmd;
+  Cmd.cmd = MachO::LC_DYSYMTAB;
+  Cmd.cmdsize = sizeof(MachO::dysymtab_command);
+  Cmd.ilocalsym = 0;
+  Cmd.nlocalsym = 0;
+  Cmd.iextdefsym = 0;
+  Cmd.nextdefsym = 0;
+  Cmd.iundefsym = 0;
+  Cmd.nundefsym = 0;
+  Cmd.tocoff = 0;
+  Cmd.ntoc = 0;
+  Cmd.modtaboff = 0;
+  Cmd.nmodtab = 0;
+  Cmd.extrefsymoff = 0;
+  Cmd.nextrefsyms = 0;
+  Cmd.indirectsymoff = 0;
+  Cmd.nindirectsyms = 0;
+  Cmd.extreloff = 0;
+  Cmd.nextrel = 0;
+  Cmd.locreloff = 0;
+  Cmd.nlocrel = 0;
+  return Cmd;
 }
 
 MachO::linkedit_data_command
@@ -1901,6 +2415,69 @@
   return Cmd;
 }
 
+// Returns the bytes of the rebase opcode stream described by the dyld info
+// load command (rebase_off / rebase_size), or an empty ArrayRef when the
+// file has no such load command.
+// NOTE(review): the offset and size are used exactly as read from the file;
+// whether getPtr() validates them is not visible here -- confirm.
+ArrayRef<uint8_t> MachOObjectFile::getDyldInfoRebaseOpcodes() const {
+  if (DyldInfoLoadCmd) {
+    MachO::dyld_info_command Info =
+        getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd);
+    const uint8_t *Begin =
+        reinterpret_cast<const uint8_t *>(getPtr(this, Info.rebase_off));
+    return ArrayRef<uint8_t>(Begin, Info.rebase_size);
+  }
+  return ArrayRef<uint8_t>();
+}
+
+// Returns the bytes of the bind opcode stream described by the dyld info
+// load command (bind_off / bind_size), or an empty ArrayRef when the file
+// has no such load command.
+// NOTE(review): the offset and size are used exactly as read from the file;
+// whether getPtr() validates them is not visible here -- confirm.
+ArrayRef<uint8_t> MachOObjectFile::getDyldInfoBindOpcodes() const {
+  if (DyldInfoLoadCmd) {
+    MachO::dyld_info_command Info =
+        getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd);
+    const uint8_t *Begin =
+        reinterpret_cast<const uint8_t *>(getPtr(this, Info.bind_off));
+    return ArrayRef<uint8_t>(Begin, Info.bind_size);
+  }
+  return ArrayRef<uint8_t>();
+}
+
+// Returns the bytes of the weak bind opcode stream described by the dyld
+// info load command (weak_bind_off / weak_bind_size), or an empty ArrayRef
+// when the file has no such load command.
+// NOTE(review): the offset and size are used exactly as read from the file;
+// whether getPtr() validates them is not visible here -- confirm.
+ArrayRef<uint8_t> MachOObjectFile::getDyldInfoWeakBindOpcodes() const {
+  if (DyldInfoLoadCmd) {
+    MachO::dyld_info_command Info =
+        getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd);
+    const uint8_t *Begin =
+        reinterpret_cast<const uint8_t *>(getPtr(this, Info.weak_bind_off));
+    return ArrayRef<uint8_t>(Begin, Info.weak_bind_size);
+  }
+  return ArrayRef<uint8_t>();
+}
+
+// Returns the bytes of the lazy bind opcode stream described by the dyld
+// info load command (lazy_bind_off / lazy_bind_size), or an empty ArrayRef
+// when the file has no such load command.
+// NOTE(review): the offset and size are used exactly as read from the file;
+// whether getPtr() validates them is not visible here -- confirm.
+ArrayRef<uint8_t> MachOObjectFile::getDyldInfoLazyBindOpcodes() const {
+  if (DyldInfoLoadCmd) {
+    MachO::dyld_info_command Info =
+        getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd);
+    const uint8_t *Begin =
+        reinterpret_cast<const uint8_t *>(getPtr(this, Info.lazy_bind_off));
+    return ArrayRef<uint8_t>(Begin, Info.lazy_bind_size);
+  }
+  return ArrayRef<uint8_t>();
+}
+
+// Returns the bytes of the export trie described by the dyld info load
+// command (export_off / export_size), or an empty ArrayRef when the file has
+// no such load command.
+// NOTE(review): the offset and size are used exactly as read from the file;
+// whether getPtr() validates them is not visible here -- confirm.
+ArrayRef<uint8_t> MachOObjectFile::getDyldInfoExportsTrie() const {
+  if (DyldInfoLoadCmd) {
+    MachO::dyld_info_command Info =
+        getStruct<MachO::dyld_info_command>(this, DyldInfoLoadCmd);
+    const uint8_t *Begin =
+        reinterpret_cast<const uint8_t *>(getPtr(this, Info.export_off));
+    return ArrayRef<uint8_t>(Begin, Info.export_size);
+  }
+  return ArrayRef<uint8_t>();
+}
+
+// Returns the 16 uuid bytes stored in the uuid load command, or an empty
+// ArrayRef when the file has no such load command. The result points
+// directly into the load command.
+ArrayRef<uint8_t> MachOObjectFile::getUuid() const {
+  if (UuidLoadCmd) {
+    // Handing out a pointer is fine: a uuid needs no endian swapping.
+    const char *UuidBytes = UuidLoadCmd + offsetof(MachO::uuid_command, uuid);
+    return ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(UuidBytes), 16);
+  }
+  return ArrayRef<uint8_t>();
+}
+
 StringRef MachOObjectFile::getStringTableData() const {
   MachO::symtab_command S = getSymtabLoadCommand();
   return getData().substr(S.stroff, S.strsize);
@@ -1908,7 +2485,7 @@
 
 bool MachOObjectFile::is64Bit() const {
   return getType() == getMachOType(false, true) ||
-         getType() == getMachOType(true, true);
+    getType() == getMachOType(true, true);
 }
 
 void MachOObjectFile::ReadULEB128s(uint64_t Index,
@@ -1923,30 +2500,28 @@
   }
 }
 
-const char *MachOObjectFile::getSectionPointer(DataRefImpl Rel) const {
-  return Sections[Rel.d.a];
+bool MachOObjectFile::isRelocatableObject() const {
+  return getHeader().filetype == MachO::MH_OBJECT;
 }
 
-ErrorOr<ObjectFile *>
-ObjectFile::createMachOObjectFile(std::unique_ptr<MemoryBuffer> &Buffer) {
-  StringRef Magic = Buffer->getBuffer().slice(0, 4);
+ErrorOr<std::unique_ptr<MachOObjectFile>>
+ObjectFile::createMachOObjectFile(MemoryBufferRef Buffer) {
+  StringRef Magic = Buffer.getBuffer().slice(0, 4);
   std::error_code EC;
   std::unique_ptr<MachOObjectFile> Ret;
   if (Magic == "\xFE\xED\xFA\xCE")
-    Ret.reset(new MachOObjectFile(std::move(Buffer), false, false, EC));
+    Ret.reset(new MachOObjectFile(Buffer, false, false, EC));
   else if (Magic == "\xCE\xFA\xED\xFE")
-    Ret.reset(new MachOObjectFile(std::move(Buffer), true, false, EC));
+    Ret.reset(new MachOObjectFile(Buffer, true, false, EC));
   else if (Magic == "\xFE\xED\xFA\xCF")
-    Ret.reset(new MachOObjectFile(std::move(Buffer), false, true, EC));
+    Ret.reset(new MachOObjectFile(Buffer, false, true, EC));
   else if (Magic == "\xCF\xFA\xED\xFE")
-    Ret.reset(new MachOObjectFile(std::move(Buffer), true, true, EC));
+    Ret.reset(new MachOObjectFile(Buffer, true, true, EC));
   else
     return object_error::parse_failed;
 
   if (EC)
     return EC;
-  return Ret.release();
+  return std::move(Ret);
 }
 
-} // end namespace object
-} // end namespace llvm

diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp
index 4ba5d96..77aeb63 100644
--- a/lib/Object/MachOUniversal.cpp
+++ b/lib/Object/MachOUniversal.cpp

@@ -67,14 +67,13 @@
   }
 }
 
-ErrorOr<std::unique_ptr<ObjectFile>>
+ErrorOr<std::unique_ptr<MachOObjectFile>>
 MachOUniversalBinary::ObjectForArch::getAsObjectFile() const {
   if (Parent) {
     StringRef ParentData = Parent->getData();
     StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
-    std::string ObjectName = Parent->getFileName().str();
-    std::unique_ptr<MemoryBuffer> ObjBuffer(
-        MemoryBuffer::getMemBuffer(ObjectData, ObjectName, false));
+    StringRef ObjectName = Parent->getFileName();
+    MemoryBufferRef ObjBuffer(ObjectData, ObjectName);
     return ObjectFile::createMachOObjectFile(ObjBuffer);
   }
   return object_error::parse_failed;
@@ -85,13 +84,12 @@
   if (Parent) {
     StringRef ParentData = Parent->getData();
     StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
-    std::string ObjectName = Parent->getFileName().str();
-    std::unique_ptr<MemoryBuffer> ObjBuffer(
-        MemoryBuffer::getMemBuffer(ObjectData, ObjectName, false));
-    ErrorOr<Archive *> Obj = Archive::create(std::move(ObjBuffer));
+    StringRef ObjectName = Parent->getFileName();
+    MemoryBufferRef ObjBuffer(ObjectData, ObjectName);
+    ErrorOr<std::unique_ptr<Archive>> Obj = Archive::create(ObjBuffer);
     if (std::error_code EC = Obj.getError())
       return EC;
-    Result.reset(Obj.get());
+    Result = std::move(Obj.get());
     return object_error::success;
   }
   return object_error::parse_failed;
@@ -99,21 +97,20 @@
 
 void MachOUniversalBinary::anchor() { }
 
-ErrorOr<MachOUniversalBinary *>
-MachOUniversalBinary::create(std::unique_ptr<MemoryBuffer> Source) {
+ErrorOr<std::unique_ptr<MachOUniversalBinary>>
+MachOUniversalBinary::create(MemoryBufferRef Source) {
   std::error_code EC;
   std::unique_ptr<MachOUniversalBinary> Ret(
-      new MachOUniversalBinary(std::move(Source), EC));
+      new MachOUniversalBinary(Source, EC));
   if (EC)
     return EC;
-  return Ret.release();
+  return std::move(Ret);
 }
 
-MachOUniversalBinary::MachOUniversalBinary(std::unique_ptr<MemoryBuffer> Source,
+MachOUniversalBinary::MachOUniversalBinary(MemoryBufferRef Source,
                                            std::error_code &ec)
-    : Binary(Binary::ID_MachOUniversalBinary, std::move(Source)),
-      NumberOfObjects(0) {
-  if (Data->getBufferSize() < sizeof(MachO::fat_header)) {
+    : Binary(Binary::ID_MachOUniversalBinary, Source), NumberOfObjects(0) {
+  if (Data.getBufferSize() < sizeof(MachO::fat_header)) {
     ec = object_error::invalid_file_type;
     return;
   }
@@ -142,7 +139,7 @@
   }
 }
 
-ErrorOr<std::unique_ptr<ObjectFile>>
+ErrorOr<std::unique_ptr<MachOObjectFile>>
 MachOUniversalBinary::getObjectForArch(Triple::ArchType Arch) const {
   MachO::CPUType CTM;
   if (!getCTMForArch(Arch, CTM))

diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp
index 567d87f..84a5df0 100644
--- a/lib/Object/Object.cpp
+++ b/lib/Object/Object.cpp

@@ -19,12 +19,13 @@
 using namespace llvm;
 using namespace object;
 
-inline ObjectFile *unwrap(LLVMObjectFileRef OF) {
-  return reinterpret_cast<ObjectFile*>(OF);
+inline OwningBinary<ObjectFile> *unwrap(LLVMObjectFileRef OF) {
+  return reinterpret_cast<OwningBinary<ObjectFile> *>(OF);
 }
 
-inline LLVMObjectFileRef wrap(const ObjectFile *OF) {
-  return reinterpret_cast<LLVMObjectFileRef>(const_cast<ObjectFile*>(OF));
+inline LLVMObjectFileRef wrap(const OwningBinary<ObjectFile> *OF) {
+  return reinterpret_cast<LLVMObjectFileRef>(
+      const_cast<OwningBinary<ObjectFile> *>(OF));
 }
 
 inline section_iterator *unwrap(LLVMSectionIteratorRef SI) {
@@ -60,10 +61,14 @@
 // ObjectFile creation
 LLVMObjectFileRef LLVMCreateObjectFile(LLVMMemoryBufferRef MemBuf) {
   std::unique_ptr<MemoryBuffer> Buf(unwrap(MemBuf));
-  ErrorOr<ObjectFile *> ObjOrErr(ObjectFile::createObjectFile(Buf));
-  Buf.release();
-  ObjectFile *Obj = ObjOrErr ? ObjOrErr.get() : nullptr;
-  return wrap(Obj);
+  ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr(
+      ObjectFile::createObjectFile(Buf->getMemBufferRef()));
+  std::unique_ptr<ObjectFile> Obj;
+  if (!ObjOrErr)
+    return nullptr;
+
+  auto *Ret = new OwningBinary<ObjectFile>(std::move(ObjOrErr.get()), std::move(Buf));
+  return wrap(Ret);
 }
 
 void LLVMDisposeObjectFile(LLVMObjectFileRef ObjectFile) {
@@ -71,8 +76,9 @@
 }
 
 // ObjectFile Section iterators
-LLVMSectionIteratorRef LLVMGetSections(LLVMObjectFileRef ObjectFile) {
-  section_iterator SI = unwrap(ObjectFile)->section_begin();
+LLVMSectionIteratorRef LLVMGetSections(LLVMObjectFileRef OF) {
+  OwningBinary<ObjectFile> *OB = unwrap(OF);
+  section_iterator SI = OB->getBinary()->section_begin();
   return wrap(new section_iterator(SI));
 }
 
@@ -80,9 +86,10 @@
   delete unwrap(SI);
 }
 
-LLVMBool LLVMIsSectionIteratorAtEnd(LLVMObjectFileRef ObjectFile,
-                                LLVMSectionIteratorRef SI) {
-  return (*unwrap(SI) == unwrap(ObjectFile)->section_end()) ? 1 : 0;
+LLVMBool LLVMIsSectionIteratorAtEnd(LLVMObjectFileRef OF,
+                                    LLVMSectionIteratorRef SI) {
+  OwningBinary<ObjectFile> *OB = unwrap(OF);
+  return (*unwrap(SI) == OB->getBinary()->section_end()) ? 1 : 0;
 }
 
 void LLVMMoveToNextSection(LLVMSectionIteratorRef SI) {
@@ -96,8 +103,9 @@
 }
 
 // ObjectFile Symbol iterators
-LLVMSymbolIteratorRef LLVMGetSymbols(LLVMObjectFileRef ObjectFile) {
-  symbol_iterator SI = unwrap(ObjectFile)->symbol_begin();
+LLVMSymbolIteratorRef LLVMGetSymbols(LLVMObjectFileRef OF) {
+  OwningBinary<ObjectFile> *OB = unwrap(OF);
+  symbol_iterator SI = OB->getBinary()->symbol_begin();
   return wrap(new symbol_iterator(SI));
 }
 
@@ -105,9 +113,10 @@
   delete unwrap(SI);
 }
 
-LLVMBool LLVMIsSymbolIteratorAtEnd(LLVMObjectFileRef ObjectFile,
-                                LLVMSymbolIteratorRef SI) {
-  return (*unwrap(SI) == unwrap(ObjectFile)->symbol_end()) ? 1 : 0;
+LLVMBool LLVMIsSymbolIteratorAtEnd(LLVMObjectFileRef OF,
+                                   LLVMSymbolIteratorRef SI) {
+  OwningBinary<ObjectFile> *OB = unwrap(OF);
+  return (*unwrap(SI) == OB->getBinary()->symbol_end()) ? 1 : 0;
 }
 
 void LLVMMoveToNextSymbol(LLVMSymbolIteratorRef SI) {
@@ -123,10 +132,7 @@
 }
 
 uint64_t LLVMGetSectionSize(LLVMSectionIteratorRef SI) {
-  uint64_t ret;
-  if (std::error_code ec = (*unwrap(SI))->getSize(ret))
-    report_fatal_error(ec.message());
-  return ret;
+  return (*unwrap(SI))->getSize();
 }
 
 const char *LLVMGetSectionContents(LLVMSectionIteratorRef SI) {
@@ -137,18 +143,12 @@
 }
 
 uint64_t LLVMGetSectionAddress(LLVMSectionIteratorRef SI) {
-  uint64_t ret;
-  if (std::error_code ec = (*unwrap(SI))->getAddress(ret))
-    report_fatal_error(ec.message());
-  return ret;
+  return (*unwrap(SI))->getAddress();
 }
 
 LLVMBool LLVMGetSectionContainsSymbol(LLVMSectionIteratorRef SI,
                                  LLVMSymbolIteratorRef Sym) {
-  bool ret;
-  if (std::error_code ec = (*unwrap(SI))->containsSymbol(**unwrap(Sym), ret))
-    report_fatal_error(ec.message());
-  return ret;
+  return (*unwrap(SI))->containsSymbol(**unwrap(Sym));
 }
 
 // Section Relocation iterators

diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index f5488c6..fd78271 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp

@@ -11,6 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/MachO.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
@@ -23,8 +25,8 @@
 
 void ObjectFile::anchor() { }
 
-ObjectFile::ObjectFile(unsigned int Type, std::unique_ptr<MemoryBuffer> Source)
-    : SymbolicFile(Type, std::move(Source)) {}
+ObjectFile::ObjectFile(unsigned int Type, MemoryBufferRef Source)
+    : SymbolicFile(Type, Source) {}
 
 std::error_code ObjectFile::printSymbolName(raw_ostream &OS,
                                             DataRefImpl Symb) const {
@@ -45,11 +47,11 @@
   return section_iterator(SectionRef(Sec, this));
 }
 
-ErrorOr<ObjectFile *>
-ObjectFile::createObjectFile(std::unique_ptr<MemoryBuffer> &Object,
-                             sys::fs::file_magic Type) {
+ErrorOr<std::unique_ptr<ObjectFile>>
+ObjectFile::createObjectFile(MemoryBufferRef Object, sys::fs::file_magic Type) {
+  StringRef Data = Object.getBuffer();
   if (Type == sys::fs::file_magic::unknown)
-    Type = sys::fs::identify_magic(Object->getBuffer());
+    Type = sys::fs::identify_magic(Data);
 
   switch (Type) {
   case sys::fs::file_magic::unknown:
@@ -58,6 +60,7 @@
   case sys::fs::file_magic::macho_universal_binary:
   case sys::fs::file_magic::windows_resource:
     return object_error::invalid_file_type;
+  case sys::fs::file_magic::elf:
   case sys::fs::file_magic::elf_relocatable:
   case sys::fs::file_magic::elf_executable:
   case sys::fs::file_magic::elf_shared_object:
@@ -77,15 +80,24 @@
   case sys::fs::file_magic::coff_object:
   case sys::fs::file_magic::coff_import_library:
   case sys::fs::file_magic::pecoff_executable:
-    return createCOFFObjectFile(std::move(Object));
+    return createCOFFObjectFile(Object);
   }
   llvm_unreachable("Unexpected Object File Type");
 }
 
-ErrorOr<ObjectFile *> ObjectFile::createObjectFile(StringRef ObjectPath) {
+ErrorOr<OwningBinary<ObjectFile>>
+ObjectFile::createObjectFile(StringRef ObjectPath) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFile(ObjectPath);
   if (std::error_code EC = FileOrErr.getError())
     return EC;
-  return createObjectFile(FileOrErr.get());
+  std::unique_ptr<MemoryBuffer> Buffer = std::move(FileOrErr.get());
+
+  ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr =
+      createObjectFile(Buffer->getMemBufferRef());
+  if (std::error_code EC = ObjOrErr.getError())
+    return EC;
+  std::unique_ptr<ObjectFile> Obj = std::move(ObjOrErr.get());
+
+  return OwningBinary<ObjectFile>(std::move(Obj), std::move(Buffer));
 }

diff --git a/lib/Object/RecordStreamer.h b/lib/Object/RecordStreamer.h
index 10e70ef..7dacbdf 100644
--- a/lib/Object/RecordStreamer.h
+++ b/lib/Object/RecordStreamer.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_OBJECT_RECORD_STREAMER
-#define LLVM_OBJECT_RECORD_STREAMER
+#ifndef LLVM_LIB_OBJECT_RECORDSTREAMER_H
+#define LLVM_LIB_OBJECT_RECORDSTREAMER_H
 
 #include "llvm/MC/MCStreamer.h"
 

diff --git a/lib/Object/SymbolicFile.cpp b/lib/Object/SymbolicFile.cpp
index 30cf1a0..ffd3dbc 100644
--- a/lib/Object/SymbolicFile.cpp
+++ b/lib/Object/SymbolicFile.cpp

@@ -19,34 +19,31 @@
 using namespace llvm;
 using namespace object;
 
-SymbolicFile::SymbolicFile(unsigned int Type,
-                           std::unique_ptr<MemoryBuffer> Source)
-    : Binary(Type, std::move(Source)) {}
+SymbolicFile::SymbolicFile(unsigned int Type, MemoryBufferRef Source)
+    : Binary(Type, Source) {}
 
 SymbolicFile::~SymbolicFile() {}
 
-ErrorOr<SymbolicFile *>
-SymbolicFile::createSymbolicFile(std::unique_ptr<MemoryBuffer> &Object,
-                                 sys::fs::file_magic Type,
-                                 LLVMContext *Context) {
+ErrorOr<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
+    MemoryBufferRef Object, sys::fs::file_magic Type, LLVMContext *Context) {
+  StringRef Data = Object.getBuffer();
   if (Type == sys::fs::file_magic::unknown)
-    Type = sys::fs::identify_magic(Object->getBuffer());
+    Type = sys::fs::identify_magic(Data);
 
   switch (Type) {
   case sys::fs::file_magic::bitcode:
     if (Context)
-      return IRObjectFile::createIRObjectFile(std::move(Object), *Context);
+      return IRObjectFile::createIRObjectFile(Object, *Context);
   // Fallthrough
   case sys::fs::file_magic::unknown:
   case sys::fs::file_magic::archive:
   case sys::fs::file_magic::macho_universal_binary:
   case sys::fs::file_magic::windows_resource:
     return object_error::invalid_file_type;
-  case sys::fs::file_magic::elf_relocatable:
+  case sys::fs::file_magic::elf:
   case sys::fs::file_magic::elf_executable:
   case sys::fs::file_magic::elf_shared_object:
   case sys::fs::file_magic::elf_core:
-  case sys::fs::file_magic::macho_object:
   case sys::fs::file_magic::macho_executable:
   case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
   case sys::fs::file_magic::macho_core:
@@ -56,10 +53,26 @@
   case sys::fs::file_magic::macho_bundle:
   case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
   case sys::fs::file_magic::macho_dsym_companion:
-  case sys::fs::file_magic::coff_object:
   case sys::fs::file_magic::coff_import_library:
   case sys::fs::file_magic::pecoff_executable:
     return ObjectFile::createObjectFile(Object, Type);
+  case sys::fs::file_magic::elf_relocatable:
+  case sys::fs::file_magic::macho_object:
+  case sys::fs::file_magic::coff_object: {
+    ErrorOr<std::unique_ptr<ObjectFile>> Obj =
+        ObjectFile::createObjectFile(Object, Type);
+    if (!Obj || !Context)
+      return std::move(Obj);
+
+    ErrorOr<MemoryBufferRef> BCData =
+        IRObjectFile::findBitcodeInObject(*Obj->get());
+    if (!BCData)
+      return std::move(Obj);
+
+    return IRObjectFile::createIRObjectFile(
+        MemoryBufferRef(BCData->getBuffer(), Object.getBufferIdentifier()),
+        *Context);
+  }
   }
   llvm_unreachable("Unexpected Binary File Type");
 }

diff --git a/lib/Option/ArgList.cpp b/lib/Option/ArgList.cpp
index 5848bb1..041e552 100644
--- a/lib/Option/ArgList.cpp
+++ b/lib/Option/ArgList.cpp

@@ -54,6 +54,15 @@
   return nullptr;
 }
 
+Arg *ArgList::getLastArgNoClaim(OptSpecifier Id0, OptSpecifier Id1) const {
+  // FIXME: Make search efficient?
+  for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
+    if ((*it)->getOption().matches(Id0) ||
+        (*it)->getOption().matches(Id1))
+      return *it;
+  return nullptr;
+}
+
 Arg *ArgList::getLastArg(OptSpecifier Id) const {
   Arg *Res = nullptr;
   for (const_iterator it = begin(), ie = end(); it != ie; ++it) {

diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp
index 6842f4d..dca02c1 100644
--- a/lib/Option/OptTable.cpp
+++ b/lib/Option/OptTable.cpp

@@ -264,6 +264,11 @@
   MissingArgIndex = MissingArgCount = 0;
   unsigned Index = 0, End = ArgEnd - ArgBegin;
   while (Index < End) {
+    // Ignore nullptrs; they are response-file EOL markers.
+    if (Args->getArgString(Index) == nullptr) {
+      ++Index;
+      continue;
+    }
     // Ignore empty arguments (other things may still take them as arguments).
     StringRef Str = Args->getArgString(Index);
     if (Str == "") {
@@ -300,7 +305,18 @@
     llvm_unreachable("Invalid option with help text.");
 
   case Option::MultiArgClass:
-    llvm_unreachable("Cannot print metavar for this kind of option.");
+    if (const char *MetaVarName = Opts.getOptionMetaVar(Id)) {
+      // For MultiArgs, metavar is full list of all argument names.
+      Name += ' ';
+      Name += MetaVarName;
+    }
+    else {
+      // For MultiArgs<N>, if metavar not supplied, print <value> N times.
+      for (unsigned i=0, e=O.getNumArgs(); i< e; ++i) {
+        Name += " <value>";
+      }
+    }
+    break;
 
   case Option::FlagClass:
     break;

diff --git a/lib/Option/Option.cpp b/lib/Option/Option.cpp
index 10662a3..cdc63c3 100644
--- a/lib/Option/Option.cpp
+++ b/lib/Option/Option.cpp

@@ -169,7 +169,8 @@
       return nullptr;
 
     Index += 2;
-    if (Index > Args.getNumInputArgStrings())
+    if (Index > Args.getNumInputArgStrings() ||
+        Args.getArgString(Index - 1) == nullptr)
       return nullptr;
 
     return new Arg(UnaliasedOption, Spelling,
@@ -200,7 +201,8 @@
 
     // Otherwise it must be separate.
     Index += 2;
-    if (Index > Args.getNumInputArgStrings())
+    if (Index > Args.getNumInputArgStrings() ||
+        Args.getArgString(Index - 1) == nullptr)
       return nullptr;
 
     return new Arg(UnaliasedOption, Spelling,
@@ -209,7 +211,8 @@
   case JoinedAndSeparateClass:
     // Always matches.
     Index += 2;
-    if (Index > Args.getNumInputArgStrings())
+    if (Index > Args.getNumInputArgStrings() ||
+        Args.getArgString(Index - 1) == nullptr)
       return nullptr;
 
     return new Arg(UnaliasedOption, Spelling, Index - 2,
@@ -221,7 +224,8 @@
     if (ArgSize != strlen(Args.getArgString(Index)))
       return nullptr;
     Arg *A = new Arg(UnaliasedOption, Spelling, Index++);
-    while (Index < Args.getNumInputArgStrings())
+    while (Index < Args.getNumInputArgStrings() &&
+           Args.getArgString(Index) != nullptr)
       A->getValues().push_back(Args.getArgString(Index++));
     return A;
   }

diff --git a/lib/ProfileData/Android.mk b/lib/ProfileData/Android.mk
index f4b3fa9..1e1d5f2 100644
--- a/lib/ProfileData/Android.mk
+++ b/lib/ProfileData/Android.mk

@@ -1,9 +1,15 @@
 LOCAL_PATH:= $(call my-dir)
 
 profiledata_SRC_FILES := \
-  InstrProf.cpp          \
-  InstrProfReader.cpp    \
-  InstrProfWriter.cpp    \
+  CoverageMapping.cpp \
+  CoverageMappingReader.cpp \
+  CoverageMappingWriter.cpp \
+  InstrProf.cpp \
+  InstrProfReader.cpp \
+  InstrProfWriter.cpp \
+  SampleProf.cpp \
+  SampleProfReader.cpp \
+  SampleProfWriter.cpp
 
 # For the host
 # =====================================================

diff --git a/lib/ProfileData/CMakeLists.txt b/lib/ProfileData/CMakeLists.txt
index aefb16c..b9d472d 100644
--- a/lib/ProfileData/CMakeLists.txt
+++ b/lib/ProfileData/CMakeLists.txt

@@ -2,4 +2,10 @@
   InstrProf.cpp
   InstrProfReader.cpp
   InstrProfWriter.cpp
+  CoverageMapping.cpp
+  CoverageMappingWriter.cpp
+  CoverageMappingReader.cpp
+  SampleProf.cpp
+  SampleProfReader.cpp
+  SampleProfWriter.cpp
   )

diff --git a/lib/ProfileData/CoverageMapping.cpp b/lib/ProfileData/CoverageMapping.cpp
new file mode 100644
index 0000000..0ccebc2
--- /dev/null
+++ b/lib/ProfileData/CoverageMapping.cpp

@@ -0,0 +1,475 @@
+//=-- CoverageMapping.cpp - Code coverage mapping support ---------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for clang's and llvm's instrumentation based
+// code coverage.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ProfileData/CoverageMapping.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ProfileData/CoverageMappingReader.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+using namespace coverage;
+
+#define DEBUG_TYPE "coverage-mapping"
+
+Counter CounterExpressionBuilder::get(const CounterExpression &E) {
+  auto It = ExpressionIndices.find(E);
+  if (It != ExpressionIndices.end())
+    return Counter::getExpression(It->second);
+  unsigned I = Expressions.size();
+  Expressions.push_back(E);
+  ExpressionIndices[E] = I;
+  return Counter::getExpression(I);
+}
+
+void CounterExpressionBuilder::extractTerms(
+    Counter C, int Sign, SmallVectorImpl<std::pair<unsigned, int>> &Terms) {
+  switch (C.getKind()) {
+  case Counter::Zero:
+    break;
+  case Counter::CounterValueReference:
+    Terms.push_back(std::make_pair(C.getCounterID(), Sign));
+    break;
+  case Counter::Expression:
+    const auto &E = Expressions[C.getExpressionID()];
+    extractTerms(E.LHS, Sign, Terms);
+    extractTerms(E.RHS, E.Kind == CounterExpression::Subtract ? -Sign : Sign,
+                 Terms);
+    break;
+  }
+}
+
+Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) {
+  // Gather constant terms.
+  llvm::SmallVector<std::pair<unsigned, int>, 32> Terms;
+  extractTerms(ExpressionTree, +1, Terms);
+
+  // If there are no terms, this is just a zero. The algorithm below assumes at
+  // least one term.
+  if (Terms.size() == 0)
+    return Counter::getZero();
+
+  // Group the terms by counter ID.
+  std::sort(Terms.begin(), Terms.end(),
+            [](const std::pair<unsigned, int> &LHS,
+               const std::pair<unsigned, int> &RHS) {
+    return LHS.first < RHS.first;
+  });
+
+  // Combine terms by counter ID to eliminate counters that sum to zero.
+  auto Prev = Terms.begin();
+  for (auto I = Prev + 1, E = Terms.end(); I != E; ++I) {
+    if (I->first == Prev->first) {
+      Prev->second += I->second;
+      continue;
+    }
+    ++Prev;
+    *Prev = *I;
+  }
+  Terms.erase(++Prev, Terms.end());
+
+  Counter C;
+  // Create additions. We do this before subtractions to avoid constructs like
+  // ((0 - X) + Y), as opposed to (Y - X).
+  for (auto Term : Terms) {
+    if (Term.second <= 0)
+      continue;
+    for (int I = 0; I < Term.second; ++I)
+      if (C.isZero())
+        C = Counter::getCounter(Term.first);
+      else
+        C = get(CounterExpression(CounterExpression::Add, C,
+                                  Counter::getCounter(Term.first)));
+  }
+
+  // Create subtractions.
+  for (auto Term : Terms) {
+    if (Term.second >= 0)
+      continue;
+    for (int I = 0; I < -Term.second; ++I)
+      C = get(CounterExpression(CounterExpression::Subtract, C,
+                                Counter::getCounter(Term.first)));
+  }
+  return C;
+}
+
+Counter CounterExpressionBuilder::add(Counter LHS, Counter RHS) {
+  return simplify(get(CounterExpression(CounterExpression::Add, LHS, RHS)));
+}
+
+Counter CounterExpressionBuilder::subtract(Counter LHS, Counter RHS) {
+  return simplify(
+      get(CounterExpression(CounterExpression::Subtract, LHS, RHS)));
+}
+
+void CounterMappingContext::dump(const Counter &C,
+                                 llvm::raw_ostream &OS) const {
+  switch (C.getKind()) {
+  case Counter::Zero:
+    OS << '0';
+    return;
+  case Counter::CounterValueReference:
+    OS << '#' << C.getCounterID();
+    break;
+  case Counter::Expression: {
+    if (C.getExpressionID() >= Expressions.size())
+      return;
+    const auto &E = Expressions[C.getExpressionID()];
+    OS << '(';
+    dump(E.LHS, OS);
+    OS << (E.Kind == CounterExpression::Subtract ? " - " : " + ");
+    dump(E.RHS, OS);
+    OS << ')';
+    break;
+  }
+  }
+  if (CounterValues.empty())
+    return;
+  ErrorOr<int64_t> Value = evaluate(C);
+  if (!Value)
+    return;
+  OS << '[' << *Value << ']';
+}
+
+ErrorOr<int64_t> CounterMappingContext::evaluate(const Counter &C) const {
+  switch (C.getKind()) {
+  case Counter::Zero:
+    return 0;
+  case Counter::CounterValueReference:
+    if (C.getCounterID() >= CounterValues.size())
+      return std::make_error_code(std::errc::argument_out_of_domain);
+    return CounterValues[C.getCounterID()];
+  case Counter::Expression: {
+    if (C.getExpressionID() >= Expressions.size())
+      return std::make_error_code(std::errc::argument_out_of_domain);
+    const auto &E = Expressions[C.getExpressionID()];
+    ErrorOr<int64_t> LHS = evaluate(E.LHS);
+    if (!LHS)
+      return LHS;
+    ErrorOr<int64_t> RHS = evaluate(E.RHS);
+    if (!RHS)
+      return RHS;
+    return E.Kind == CounterExpression::Subtract ? *LHS - *RHS : *LHS + *RHS;
+  }
+  }
+  llvm_unreachable("Unhandled CounterKind");
+}
+
+void FunctionRecordIterator::skipOtherFiles() {
+  while (Current != Records.end() && !Filename.empty() &&
+         Filename != Current->Filenames[0])
+    ++Current;
+  if (Current == Records.end())
+    *this = FunctionRecordIterator();
+}
+
+ErrorOr<std::unique_ptr<CoverageMapping>>
+CoverageMapping::load(ObjectFileCoverageMappingReader &CoverageReader,
+                      IndexedInstrProfReader &ProfileReader) {
+  auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
+
+  std::vector<uint64_t> Counts;
+  for (const auto &Record : CoverageReader) {
+    Counts.clear();
+    if (std::error_code EC = ProfileReader.getFunctionCounts(
+            Record.FunctionName, Record.FunctionHash, Counts)) {
+      if (EC != instrprof_error::hash_mismatch &&
+          EC != instrprof_error::unknown_function)
+        return EC;
+      Coverage->MismatchedFunctionCount++;
+      continue;
+    }
+
+    assert(Counts.size() != 0 && "Function's counts are empty");
+    FunctionRecord Function(Record.FunctionName, Record.Filenames,
+                            Counts.front());
+    CounterMappingContext Ctx(Record.Expressions, Counts);
+    for (const auto &Region : Record.MappingRegions) {
+      ErrorOr<int64_t> ExecutionCount = Ctx.evaluate(Region.Count);
+      if (!ExecutionCount)
+        break;
+      Function.CountedRegions.push_back(CountedRegion(Region, *ExecutionCount));
+    }
+    if (Function.CountedRegions.size() != Record.MappingRegions.size()) {
+      Coverage->MismatchedFunctionCount++;
+      continue;
+    }
+
+    Coverage->Functions.push_back(std::move(Function));
+  }
+
+  return std::move(Coverage);
+}
+
+ErrorOr<std::unique_ptr<CoverageMapping>>
+CoverageMapping::load(StringRef ObjectFilename, StringRef ProfileFilename) {
+  auto CounterMappingBuff = MemoryBuffer::getFileOrSTDIN(ObjectFilename);
+  if (auto EC = CounterMappingBuff.getError())
+    return EC;
+  ObjectFileCoverageMappingReader CoverageReader(CounterMappingBuff.get());
+  if (auto EC = CoverageReader.readHeader())
+    return EC;
+  std::unique_ptr<IndexedInstrProfReader> ProfileReader;
+  if (auto EC = IndexedInstrProfReader::create(ProfileFilename, ProfileReader))
+    return EC;
+  return load(CoverageReader, *ProfileReader);
+}
+
+namespace {
+/// \brief Distributes functions into instantiation sets.
+///
+/// An instantiation set is a collection of functions that have the same source
+/// code, i.e., template function specializations.
+class FunctionInstantiationSetCollector {
+  typedef DenseMap<std::pair<unsigned, unsigned>,
+                   std::vector<const FunctionRecord *>> MapT;
+  MapT InstantiatedFunctions;
+
+public:
+  void insert(const FunctionRecord &Function, unsigned FileID) {
+    auto I = Function.CountedRegions.begin(), E = Function.CountedRegions.end();
+    while (I != E && I->FileID != FileID)
+      ++I;
+    assert(I != E && "function does not cover the given file");
+    auto &Functions = InstantiatedFunctions[I->startLoc()];
+    Functions.push_back(&Function);
+  }
+
+  MapT::iterator begin() { return InstantiatedFunctions.begin(); }
+
+  MapT::iterator end() { return InstantiatedFunctions.end(); }
+};
+
+class SegmentBuilder {
+  std::vector<CoverageSegment> Segments;
+  SmallVector<const CountedRegion *, 8> ActiveRegions;
+
+  /// Start a segment with no count specified.
+  void startSegment(unsigned Line, unsigned Col) {
+    DEBUG(dbgs() << "Top level segment at " << Line << ":" << Col << "\n");
+    Segments.emplace_back(Line, Col, /*IsRegionEntry=*/false);
+  }
+
+  /// Start a segment with the given Region's count.
+  void startSegment(unsigned Line, unsigned Col, bool IsRegionEntry,
+                    const CountedRegion &Region) {
+    if (Segments.empty())
+      Segments.emplace_back(Line, Col, IsRegionEntry);
+    CoverageSegment S = Segments.back();
+    // Avoid creating empty regions.
+    if (S.Line != Line || S.Col != Col) {
+      Segments.emplace_back(Line, Col, IsRegionEntry);
+      S = Segments.back();
+    }
+    DEBUG(dbgs() << "Segment at " << Line << ":" << Col);
+    // Set this region's count.
+    if (Region.Kind != coverage::CounterMappingRegion::SkippedRegion) {
+      DEBUG(dbgs() << " with count " << Region.ExecutionCount);
+      Segments.back().setCount(Region.ExecutionCount);
+    }
+    DEBUG(dbgs() << "\n");
+  }
+
+  /// Start a segment for the given region.
+  void startSegment(const CountedRegion &Region) {
+    startSegment(Region.LineStart, Region.ColumnStart, true, Region);
+  }
+
+  /// Pop the top region off of the active stack, starting a new segment with
+  /// the containing Region's count.
+  void popRegion() {
+    const CountedRegion *Active = ActiveRegions.back();
+    unsigned Line = Active->LineEnd, Col = Active->ColumnEnd;
+    ActiveRegions.pop_back();
+    if (ActiveRegions.empty())
+      startSegment(Line, Col);
+    else
+      startSegment(Line, Col, false, *ActiveRegions.back());
+  }
+
+public:
+  /// Build a list of CoverageSegments from a sorted list of Regions.
+  std::vector<CoverageSegment> buildSegments(ArrayRef<CountedRegion> Regions) {
+    for (const auto &Region : Regions) {
+      // Pop any regions that end before this one starts.
+      while (!ActiveRegions.empty() &&
+             ActiveRegions.back()->endLoc() <= Region.startLoc())
+        popRegion();
+      if (Segments.size() && Segments.back().Line == Region.LineStart &&
+          Segments.back().Col == Region.ColumnStart) {
+        if (Region.Kind != coverage::CounterMappingRegion::SkippedRegion)
+          Segments.back().addCount(Region.ExecutionCount);
+      } else {
+        // Add this region to the stack.
+        ActiveRegions.push_back(&Region);
+        startSegment(Region);
+      }
+    }
+    // Pop any regions that are left in the stack.
+    while (!ActiveRegions.empty())
+      popRegion();
+    return Segments;
+  }
+};
+}
+
+std::vector<StringRef> CoverageMapping::getUniqueSourceFiles() const {
+  std::vector<StringRef> Filenames;
+  for (const auto &Function : getCoveredFunctions())
+    for (const auto &Filename : Function.Filenames)
+      Filenames.push_back(Filename);
+  std::sort(Filenames.begin(), Filenames.end());
+  auto Last = std::unique(Filenames.begin(), Filenames.end());
+  Filenames.erase(Last, Filenames.end());
+  return Filenames;
+}
+
+static Optional<unsigned> findMainViewFileID(StringRef SourceFile,
+                                             const FunctionRecord &Function) {
+  llvm::SmallVector<bool, 8> IsExpandedFile(Function.Filenames.size(), false);
+  llvm::SmallVector<bool, 8> FilenameEquivalence(Function.Filenames.size(),
+                                                 false);
+  for (unsigned I = 0, E = Function.Filenames.size(); I < E; ++I)
+    if (SourceFile == Function.Filenames[I])
+      FilenameEquivalence[I] = true;
+  for (const auto &CR : Function.CountedRegions)
+    if (CR.Kind == CounterMappingRegion::ExpansionRegion &&
+        FilenameEquivalence[CR.FileID])
+      IsExpandedFile[CR.ExpandedFileID] = true;
+  for (unsigned I = 0, E = Function.Filenames.size(); I < E; ++I)
+    if (FilenameEquivalence[I] && !IsExpandedFile[I])
+      return I;
+  return None;
+}
+
+static Optional<unsigned> findMainViewFileID(const FunctionRecord &Function) {
+  llvm::SmallVector<bool, 8> IsExpandedFile(Function.Filenames.size(), false);
+  for (const auto &CR : Function.CountedRegions)
+    if (CR.Kind == CounterMappingRegion::ExpansionRegion)
+      IsExpandedFile[CR.ExpandedFileID] = true;
+  for (unsigned I = 0, E = Function.Filenames.size(); I < E; ++I)
+    if (!IsExpandedFile[I])
+      return I;
+  return None;
+}
+
+static SmallSet<unsigned, 8> gatherFileIDs(StringRef SourceFile,
+                                           const FunctionRecord &Function) {
+  SmallSet<unsigned, 8> IDs;
+  for (unsigned I = 0, E = Function.Filenames.size(); I < E; ++I)
+    if (SourceFile == Function.Filenames[I])
+      IDs.insert(I);
+  return IDs;
+}
+
+/// Sort a nested sequence of regions from a single file.
+template <class It> static void sortNestedRegions(It First, It Last) {
+  std::sort(First, Last,
+            [](const CountedRegion &LHS, const CountedRegion &RHS) {
+    if (LHS.startLoc() == RHS.startLoc())
+      // When LHS completely contains RHS, we sort LHS first.
+      return RHS.endLoc() < LHS.endLoc();
+    return LHS.startLoc() < RHS.startLoc();
+  });
+}
+
+static bool isExpansion(const CountedRegion &R, unsigned FileID) {
+  return R.Kind == CounterMappingRegion::ExpansionRegion && R.FileID == FileID;
+}
+
+CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) {
+  CoverageData FileCoverage(Filename);
+  std::vector<coverage::CountedRegion> Regions;
+
+  for (const auto &Function : Functions) {
+    auto MainFileID = findMainViewFileID(Filename, Function);
+    if (!MainFileID)
+      continue;
+    auto FileIDs = gatherFileIDs(Filename, Function);
+    for (const auto &CR : Function.CountedRegions)
+      if (FileIDs.count(CR.FileID)) {
+        Regions.push_back(CR);
+        if (isExpansion(CR, *MainFileID))
+          FileCoverage.Expansions.emplace_back(CR, Function);
+      }
+  }
+
+  sortNestedRegions(Regions.begin(), Regions.end());
+  FileCoverage.Segments = SegmentBuilder().buildSegments(Regions);
+
+  return FileCoverage;
+}
+
+std::vector<const FunctionRecord *>
+CoverageMapping::getInstantiations(StringRef Filename) {
+  FunctionInstantiationSetCollector InstantiationSetCollector;
+  for (const auto &Function : Functions) {
+    auto MainFileID = findMainViewFileID(Filename, Function);
+    if (!MainFileID)
+      continue;
+    InstantiationSetCollector.insert(Function, *MainFileID);
+  }
+
+  std::vector<const FunctionRecord *> Result;
+  for (const auto &InstantiationSet : InstantiationSetCollector) {
+    if (InstantiationSet.second.size() < 2)
+      continue;
+    for (auto Function : InstantiationSet.second)
+      Result.push_back(Function);
+  }
+  return Result;
+}
+
+CoverageData
+CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) {
+  auto MainFileID = findMainViewFileID(Function);
+  if (!MainFileID)
+    return CoverageData();
+
+  CoverageData FunctionCoverage(Function.Filenames[*MainFileID]);
+  std::vector<coverage::CountedRegion> Regions;
+  for (const auto &CR : Function.CountedRegions)
+    if (CR.FileID == *MainFileID) {
+      Regions.push_back(CR);
+      if (isExpansion(CR, *MainFileID))
+        FunctionCoverage.Expansions.emplace_back(CR, Function);
+    }
+
+  sortNestedRegions(Regions.begin(), Regions.end());
+  FunctionCoverage.Segments = SegmentBuilder().buildSegments(Regions);
+
+  return FunctionCoverage;
+}
+
+CoverageData
+CoverageMapping::getCoverageForExpansion(const ExpansionRecord &Expansion) {
+  CoverageData ExpansionCoverage(
+      Expansion.Function.Filenames[Expansion.FileID]);
+  std::vector<coverage::CountedRegion> Regions;
+  for (const auto &CR : Expansion.Function.CountedRegions)
+    if (CR.FileID == Expansion.FileID) {
+      Regions.push_back(CR);
+      if (isExpansion(CR, Expansion.FileID))
+        ExpansionCoverage.Expansions.emplace_back(CR, Expansion.Function);
+    }
+
+  sortNestedRegions(Regions.begin(), Regions.end());
+  ExpansionCoverage.Segments = SegmentBuilder().buildSegments(Regions);
+
+  return ExpansionCoverage;
+}

diff --git a/lib/ProfileData/CoverageMappingReader.cpp b/lib/ProfileData/CoverageMappingReader.cpp
new file mode 100644
index 0000000..6476d28
--- /dev/null
+++ b/lib/ProfileData/CoverageMappingReader.cpp

@@ -0,0 +1,553 @@
+//=-- CoverageMappingReader.cpp - Code coverage mapping reader ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading coverage mapping data for
+// instrumentation based coverage.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ProfileData/CoverageMappingReader.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+using namespace coverage;
+using namespace object;
+
+#define DEBUG_TYPE "coverage-mapping"
+
+/// \brief Read the next record into this iterator; once reading reports an
+/// error the iterator is replaced by a default-constructed one.
+void CoverageMappingIterator::increment() {
+  // An error code is reported both when all the records were read and when
+  // reading the next record failed.
+  std::error_code EC = Reader->readNextRecord(Record);
+  if (!EC)
+    return;
+  *this = CoverageMappingIterator();
+}
+
+/// \brief Read one ULEB128-encoded integer from the front of Data.
+///
+/// On success \p Result holds the decoded value and the consumed bytes are
+/// dropped from Data. Reports instrprof_error::truncated when Data is empty
+/// and instrprof_error::malformed when the encoding does not terminate inside
+/// Data or is too long to fit in 64 bits.
+std::error_code RawCoverageReader::readULEB128(uint64_t &Result) {
+  if (Data.size() < 1)
+    return error(instrprof_error::truncated);
+  // Decode byte by byte instead of calling decodeULEB128(): that helper has
+  // no end bound, so a value whose terminating byte is missing would be read
+  // past the end of Data before the length could be checked.
+  uint64_t Value = 0;
+  unsigned Shift = 0;
+  size_t N = 0;
+  uint8_t Byte;
+  do {
+    // Stop at the end of the buffer, and before shifting by 64 or more bits.
+    if (N == Data.size() || Shift >= 64)
+      return error(instrprof_error::malformed);
+    Byte = static_cast<uint8_t>(Data[N++]);
+    Value += uint64_t(Byte & 0x7f) << Shift;
+    Shift += 7;
+  } while (Byte >= 128);
+  Result = Value;
+  Data = Data.substr(N);
+  return success();
+}
+
+/// \brief Read a ULEB128 integer that must be strictly smaller than
+/// \p MaxPlus1; larger values are reported as instrprof_error::malformed.
+std::error_code RawCoverageReader::readIntMax(uint64_t &Result,
+                                              uint64_t MaxPlus1) {
+  std::error_code Err = readULEB128(Result);
+  if (Err)
+    return Err;
+  if (Result < MaxPlus1)
+    return success();
+  return error(instrprof_error::malformed);
+}
+
+/// \brief Read a ULEB128 integer that is used as a size or element count.
+///
+/// A value larger than the number of bytes left in Data is reported as
+/// instrprof_error::malformed.
+std::error_code RawCoverageReader::readSize(uint64_t &Result) {
+  std::error_code Err = readULEB128(Result);
+  if (Err)
+    return Err;
+  // Sanity check the number against the remaining data.
+  if (Result <= Data.size())
+    return success();
+  return error(instrprof_error::malformed);
+}
+
+/// \brief Read a length-prefixed string: a ULEB128 length followed by that
+/// many bytes. \p Result refers to the bytes inside Data, which is advanced
+/// past them.
+std::error_code RawCoverageReader::readString(StringRef &Result) {
+  uint64_t Length;
+  std::error_code Err = readSize(Length);
+  if (Err)
+    return Err;
+  // readSize() has already checked that Length does not exceed Data.size().
+  Result = Data.slice(0, Length);
+  Data = Data.slice(Length, StringRef::npos);
+  return success();
+}
+
+/// \brief Read the filenames array: a count followed by that many
+/// length-prefixed strings, each of which is appended to Filenames.
+std::error_code RawCoverageFilenamesReader::read() {
+  uint64_t NumFilenames;
+  std::error_code Err = readSize(NumFilenames);
+  if (Err)
+    return Err;
+  for (uint64_t Remaining = NumFilenames; Remaining != 0; --Remaining) {
+    StringRef Filename;
+    Err = readString(Filename);
+    if (Err)
+      return Err;
+    Filenames.push_back(Filename);
+  }
+  return success();
+}
+
+/// \brief Decode the encoded counter \p Value into \p C.
+///
+/// The bits selected by Counter::EncodingTagMask hold the tag; the bits above
+/// Counter::EncodingTagBits hold the counter or expression id. An unknown tag
+/// or an expression id that is not smaller than Expressions.size() is
+/// reported as instrprof_error::malformed.
+std::error_code RawCoverageMappingReader::decodeCounter(unsigned Value,
+                                                        Counter &C) {
+  auto Tag = Value & Counter::EncodingTagMask;
+  switch (Tag) {
+  case Counter::Zero:
+    C = Counter::getZero();
+    return success();
+  case Counter::CounterValueReference:
+    C = Counter::getCounter(Value >> Counter::EncodingTagBits);
+    return success();
+  default:
+    break;
+  }
+  // Every other tag is treated as an expression: once Counter::Expression is
+  // subtracted, the tag has to be one of the CounterExpression kinds.
+  Tag -= Counter::Expression;
+  switch (Tag) {
+  case CounterExpression::Subtract:
+  case CounterExpression::Add: {
+    auto ID = Value >> Counter::EncodingTagBits;
+    if (ID >= Expressions.size())
+      return error(instrprof_error::malformed);
+    // The kind of the expression is carried by the tag of the counter that
+    // references it, so it is stored on the expression entry here.
+    Expressions[ID].Kind = CounterExpression::ExprKind(Tag);
+    C = Counter::getExpression(ID);
+    break;
+  }
+  default:
+    return error(instrprof_error::malformed);
+  }
+  return success();
+}
+
+/// \brief Read one encoded counter from Data and decode it into \p C.
+///
+/// The encoded value has to be smaller than the largest unsigned value.
+std::error_code RawCoverageMappingReader::readCounter(Counter &C) {
+  uint64_t EncodedCounter;
+  std::error_code Err =
+      readIntMax(EncodedCounter, std::numeric_limits<unsigned>::max());
+  if (!Err)
+    Err = decodeCounter(EncodedCounter, C);
+  if (Err)
+    return Err;
+  return success();
+}
+
+/// \brief The bit that directly follows the counter tag bits. The region
+/// reader tests it on encoded values whose tag is Counter::Zero; when it is
+/// set, the region is an expansion region.
+static const unsigned EncodingExpansionRegionBit = 1
+                                                   << Counter::EncodingTagBits;
+
+/// \brief Read the sub-array of regions for the given inferred file id.
+///
+/// The sub-array starts with the number of regions. Each region consists of
+/// a combined counter/region-kind value followed by the source range
+/// (line start delta, column start with the has-code-before bit, number of
+/// lines, column end). Every decoded region is appended to \p MappingRegions.
+///
+/// \param MappingRegions the vector that receives the decoded regions.
+/// \param InferredFileID the file id that is assigned to every region read.
+/// \param NumFileIDs the number of file ids that are defined for this
+/// function.
+/// \returns an error code; malformed values are reported as
+/// instrprof_error::malformed.
+std::error_code RawCoverageMappingReader::readMappingRegionsSubArray(
+    std::vector<CounterMappingRegion> &MappingRegions, unsigned InferredFileID,
+    size_t NumFileIDs) {
+  uint64_t NumRegions;
+  if (auto Err = readSize(NumRegions))
+    return Err;
+  // Line starts are delta-encoded relative to the previous region of this
+  // sub-array.
+  unsigned LineStart = 0;
+  for (size_t I = 0; I < NumRegions; ++I) {
+    Counter C;
+    CounterMappingRegion::RegionKind Kind = CounterMappingRegion::CodeRegion;
+
+    // Read the combined counter + region kind.
+    uint64_t EncodedCounterAndRegion;
+    if (auto Err = readIntMax(EncodedCounterAndRegion,
+                              std::numeric_limits<unsigned>::max()))
+      return Err;
+    unsigned Tag = EncodedCounterAndRegion & Counter::EncodingTagMask;
+    uint64_t ExpandedFileID = 0;
+    if (Tag != Counter::Zero) {
+      // A non-zero tag means that the value is an ordinary encoded counter.
+      if (auto Err = decodeCounter(EncodedCounterAndRegion, C))
+        return Err;
+    } else {
+      // With a zero tag the remaining bits describe the region kind.
+      // Is it an expansion region?
+      if (EncodedCounterAndRegion & EncodingExpansionRegionBit) {
+        Kind = CounterMappingRegion::ExpansionRegion;
+        ExpandedFileID = EncodedCounterAndRegion >>
+                         Counter::EncodingCounterTagAndExpansionRegionTagBits;
+        if (ExpandedFileID >= NumFileIDs)
+          return error(instrprof_error::malformed);
+      } else {
+        switch (EncodedCounterAndRegion >>
+                Counter::EncodingCounterTagAndExpansionRegionTagBits) {
+        case CounterMappingRegion::CodeRegion:
+          // Don't do anything when we have a code region with a zero counter.
+          break;
+        case CounterMappingRegion::SkippedRegion:
+          Kind = CounterMappingRegion::SkippedRegion;
+          break;
+        default:
+          return error(instrprof_error::malformed);
+        }
+      }
+    }
+
+    // Read the source range.
+    uint64_t LineStartDelta, CodeBeforeColumnStart, NumLines, ColumnEnd;
+    if (auto Err =
+            readIntMax(LineStartDelta, std::numeric_limits<unsigned>::max()))
+      return Err;
+    if (auto Err = readULEB128(CodeBeforeColumnStart))
+      return Err;
+    // The lowest bit is the has-code-before flag; the column start is stored
+    // in the bits above it.
+    bool HasCodeBefore = CodeBeforeColumnStart & 1;
+    uint64_t ColumnStart = CodeBeforeColumnStart >>
+                           CounterMappingRegion::EncodingHasCodeBeforeBits;
+    if (ColumnStart > std::numeric_limits<unsigned>::max())
+      return error(instrprof_error::malformed);
+    if (auto Err = readIntMax(NumLines, std::numeric_limits<unsigned>::max()))
+      return Err;
+    if (auto Err = readIntMax(ColumnEnd, std::numeric_limits<unsigned>::max()))
+      return Err;
+    LineStart += LineStartDelta;
+    // Adjust the column locations for the empty regions that are supposed to
+    // cover whole lines. Those regions should be encoded with the
+    // column range (1 -> std::numeric_limits<unsigned>::max()), but because
+    // the encoded std::numeric_limits<unsigned>::max() is several bytes long,
+    // we set the column range to (0 -> 0) to ensure that the column start and
+    // column end take up one byte each.
+    // The std::numeric_limits<unsigned>::max() is used to represent a column
+    // position at the end of the line without knowing the length of that line.
+    if (ColumnStart == 0 && ColumnEnd == 0) {
+      ColumnStart = 1;
+      ColumnEnd = std::numeric_limits<unsigned>::max();
+    }
+
+    DEBUG({
+      dbgs() << "Counter in file " << InferredFileID << " " << LineStart << ":"
+             << ColumnStart << " -> " << (LineStart + NumLines) << ":"
+             << ColumnEnd << ", ";
+      if (Kind == CounterMappingRegion::ExpansionRegion)
+        dbgs() << "Expands to file " << ExpandedFileID;
+      else
+        CounterMappingContext(Expressions).dump(C, dbgs());
+      dbgs() << "\n";
+    });
+
+    // The end line is stored as a line count relative to the start line.
+    MappingRegions.push_back(CounterMappingRegion(
+        C, InferredFileID, LineStart, ColumnStart, LineStart + NumLines,
+        ColumnEnd, HasCodeBefore, Kind));
+    MappingRegions.back().ExpandedFileID = ExpandedFileID;
+  }
+  return success();
+}
+
+/// \brief Read the coverage mapping of one function into \p Record.
+///
+/// The data is read in this order: the virtual file mapping, the counter
+/// expressions and one sub-array of mapping regions per file id. Afterwards
+/// the counters of the expansion regions are filled in and the function name,
+/// filenames, expressions and mapping regions are stored in \p Record.
+std::error_code RawCoverageMappingReader::read(CoverageMappingRecord &Record) {
+
+  // Read the virtual file mapping.
+  llvm::SmallVector<unsigned, 8> VirtualFileMapping;
+  uint64_t NumFileMappings;
+  if (auto Err = readSize(NumFileMappings))
+    return Err;
+  for (size_t I = 0; I < NumFileMappings; ++I) {
+    // Each entry is an index into TranslationUnitFilenames; readIntMax
+    // rejects indices that are out of range.
+    uint64_t FilenameIndex;
+    if (auto Err = readIntMax(FilenameIndex, TranslationUnitFilenames.size()))
+      return Err;
+    VirtualFileMapping.push_back(FilenameIndex);
+  }
+
+  // Construct the files using unique filenames and virtual file mapping.
+  for (auto I : VirtualFileMapping) {
+    Filenames.push_back(TranslationUnitFilenames[I]);
+  }
+
+  // Read the expressions.
+  uint64_t NumExpressions;
+  if (auto Err = readSize(NumExpressions))
+    return Err;
+  // Create an array of dummy expressions that get the proper counters
+  // when the expressions are read, and the proper kinds when the counters
+  // are decoded.
+  Expressions.resize(
+      NumExpressions,
+      CounterExpression(CounterExpression::Subtract, Counter(), Counter()));
+  for (size_t I = 0; I < NumExpressions; ++I) {
+    if (auto Err = readCounter(Expressions[I].LHS))
+      return Err;
+    if (auto Err = readCounter(Expressions[I].RHS))
+      return Err;
+  }
+
+  // Read the mapping regions sub-arrays.
+  for (unsigned InferredFileID = 0, S = VirtualFileMapping.size();
+       InferredFileID < S; ++InferredFileID) {
+    if (auto Err = readMappingRegionsSubArray(MappingRegions, InferredFileID,
+                                              VirtualFileMapping.size()))
+      return Err;
+  }
+
+  // Set the counters for the expansion regions.
+  // i.e. Counter of expansion region = counter of the first region
+  // from the expanded file.
+  // Perform multiple passes to correctly propagate the counters through
+  // all the nested expansion regions.
+  SmallVector<CounterMappingRegion *, 8> FileIDExpansionRegionMapping;
+  FileIDExpansionRegionMapping.resize(VirtualFileMapping.size(), nullptr);
+  for (unsigned Pass = 1, S = VirtualFileMapping.size(); Pass < S; ++Pass) {
+    // Remember, for every expanded file id, the expansion region that
+    // refers to it.
+    for (auto &R : MappingRegions) {
+      if (R.Kind != CounterMappingRegion::ExpansionRegion)
+        continue;
+      // NOTE(review): this assumes that at most one expansion region refers
+      // to a given file id and that the slot was cleared by the previous
+      // pass; input that violates this trips the assertion - confirm.
+      assert(!FileIDExpansionRegionMapping[R.ExpandedFileID]);
+      FileIDExpansionRegionMapping[R.ExpandedFileID] = &R;
+    }
+    // Copy the counter of the first region found in each expanded file to
+    // the expansion region, then clear the slot so that later regions of the
+    // same file are ignored.
+    for (auto &R : MappingRegions) {
+      if (FileIDExpansionRegionMapping[R.FileID]) {
+        FileIDExpansionRegionMapping[R.FileID]->Count = R.Count;
+        FileIDExpansionRegionMapping[R.FileID] = nullptr;
+      }
+    }
+  }
+
+  Record.FunctionName = FunctionName;
+  Record.Filenames = Filenames;
+  Record.Expressions = Expressions;
+  Record.MappingRegions = MappingRegions;
+  return success();
+}
+
+/// \brief Create a reader for the object file named \p FileName.
+///
+/// When the object file cannot be created, the error code is passed to
+/// error() and Object is left untouched.
+ObjectFileCoverageMappingReader::ObjectFileCoverageMappingReader(
+    StringRef FileName)
+    : CurrentRecord(0) {
+  auto File = llvm::object::ObjectFile::createObjectFile(FileName);
+  if (!File) {
+    error(File.getError());
+    return;
+  }
+  Object = std::move(File.get());
+}
+
+namespace {
+/// \brief The coverage mapping data for a single function.
+/// It points to the function's name.
+template <typename IntPtrT> struct CoverageMappingFunctionRecord {
+  IntPtrT FunctionNamePtr;
+  uint32_t FunctionNameSize;
+  uint32_t CoverageMappingSize;
+  uint64_t FunctionHash;
+};
+
+/// \brief The coverage mapping data for a single translation unit.
+/// It points to the array of function coverage mapping records and the encoded
+/// filenames array.
+template <typename IntPtrT> struct CoverageMappingTURecord {
+  uint32_t FunctionRecordsSize;
+  uint32_t FilenamesSize;
+  uint32_t CoverageMappingsSize;
+  uint32_t Version;
+};
+
+/// \brief A helper structure to access the data from a section
+/// in an object file.
+struct SectionData {
+  /// The contents of the section.
+  StringRef Data;
+  /// The address of the section; pointers passed to get() are relative to it.
+  uint64_t Address;
+
+  /// \brief Load the contents and the address of \p Section.
+  std::error_code load(SectionRef &Section) {
+    if (auto Err = Section.getContents(Data))
+      return Err;
+    Address = Section.getAddress();
+    return instrprof_error::success;
+  }
+
+  /// \brief Get the \p Size bytes that are located at the address \p Pointer.
+  ///
+  /// Returns instrprof_error::malformed when the requested range does not
+  /// lie completely inside the section.
+  std::error_code get(uint64_t Pointer, size_t Size, StringRef &Result) {
+    if (Pointer < Address)
+      return instrprof_error::malformed;
+    auto Offset = Pointer - Address;
+    // Check the offset and the size separately: the sum Offset + Size can
+    // wrap around and would then accept a range outside of the section.
+    if (Offset > Data.size() || Size > Data.size() - Offset)
+      return instrprof_error::malformed;
+    Result = Data.substr(Offset, Size);
+    return instrprof_error::success;
+  }
+};
+}
+
+/// \brief Read all the translation unit records from the coverage mapping
+/// section contents \p Data.
+///
+/// Every translation unit record consists of a header, an array of function
+/// records, the encoded filenames and the encoded coverage mappings. The
+/// filenames are appended to \p Filenames and one entry per distinct function
+/// name pointer is appended to \p Records.
+///
+/// \tparam T the integer type that holds a function name pointer.
+/// \param ProfileNames the section that the function name pointers refer to.
+/// \returns instrprof_error::malformed when the data is shorter than its
+/// headers claim, instrprof_error::unsupported_version for an unknown
+/// version, or the error of a nested reader.
+template <typename T>
+std::error_code readCoverageMappingData(
+    SectionData &ProfileNames, StringRef Data,
+    std::vector<ObjectFileCoverageMappingReader::ProfileMappingRecord> &Records,
+    std::vector<StringRef> &Filenames) {
+  llvm::DenseSet<T> UniqueFunctionMappingData;
+
+  // Read the records in the coverage data section.
+  while (!Data.empty()) {
+    if (Data.size() < sizeof(CoverageMappingTURecord<T>))
+      return instrprof_error::malformed;
+    auto TU = reinterpret_cast<const CoverageMappingTURecord<T> *>(Data.data());
+    Data = Data.substr(sizeof(CoverageMappingTURecord<T>));
+    switch (TU->Version) {
+    case CoverageMappingVersion1:
+      break;
+    default:
+      return instrprof_error::unsupported_version;
+    }
+    auto Version = CoverageMappingVersion(TU->Version);
+
+    // Get the function records.
+    // Divide instead of multiplying: the product of the record size and the
+    // record count can wrap around when size_t is 32 bits wide, which would
+    // let a too short section pass the check.
+    if (Data.size() / sizeof(CoverageMappingFunctionRecord<T>) <
+        TU->FunctionRecordsSize)
+      return instrprof_error::malformed;
+    auto FunctionRecords =
+        reinterpret_cast<const CoverageMappingFunctionRecord<T> *>(Data.data());
+    Data = Data.substr(sizeof(CoverageMappingFunctionRecord<T>) *
+                       TU->FunctionRecordsSize);
+
+    // Get the filenames.
+    if (Data.size() < TU->FilenamesSize)
+      return instrprof_error::malformed;
+    auto RawFilenames = Data.substr(0, TU->FilenamesSize);
+    Data = Data.substr(TU->FilenamesSize);
+    // The filenames of this translation unit are appended to Filenames;
+    // remember where they start so that the records can refer to them.
+    size_t FilenamesBegin = Filenames.size();
+    RawCoverageFilenamesReader Reader(RawFilenames, Filenames);
+    if (auto Err = Reader.read())
+      return Err;
+
+    // Get the coverage mappings.
+    if (Data.size() < TU->CoverageMappingsSize)
+      return instrprof_error::malformed;
+    auto CoverageMappings = Data.substr(0, TU->CoverageMappingsSize);
+    Data = Data.substr(TU->CoverageMappingsSize);
+
+    for (unsigned I = 0; I < TU->FunctionRecordsSize; ++I) {
+      auto &MappingRecord = FunctionRecords[I];
+
+      // Get the coverage mapping.
+      // The mappings are stored back to back in the order of the function
+      // records, so the mapping is consumed even if the record is skipped.
+      if (CoverageMappings.size() < MappingRecord.CoverageMappingSize)
+        return instrprof_error::malformed;
+      auto Mapping =
+          CoverageMappings.substr(0, MappingRecord.CoverageMappingSize);
+      CoverageMappings =
+          CoverageMappings.substr(MappingRecord.CoverageMappingSize);
+
+      // Ignore this record if we already have a record that points to the same
+      // function name.
+      // This is useful to ignore the redundant records for the functions
+      // with ODR linkage.
+      if (!UniqueFunctionMappingData.insert(MappingRecord.FunctionNamePtr)
+               .second)
+        continue;
+      StringRef FunctionName;
+      if (auto Err =
+              ProfileNames.get(MappingRecord.FunctionNamePtr,
+                               MappingRecord.FunctionNameSize, FunctionName))
+        return Err;
+      Records.push_back(ObjectFileCoverageMappingReader::ProfileMappingRecord(
+          Version, FunctionName, MappingRecord.FunctionHash, Mapping,
+          FilenamesBegin, Filenames.size() - FilenamesBegin));
+    }
+  }
+
+  return instrprof_error::success;
+}
+
+static const char *TestingFormatMagic = "llvmcovmtestdata";
+
+/// \brief Split a buffer in the testing format into its parts.
+///
+/// After the magic the buffer holds two ULEB128 values, the size and the
+/// address of the profile names, followed by the profile names and the
+/// coverage mapping data. The caller has to make sure that \p Data starts
+/// with TestingFormatMagic.
+///
+/// \returns instrprof_error::truncated when a number is missing and
+/// instrprof_error::malformed when a number or the names do not fit in
+/// \p Data.
+static std::error_code decodeTestingFormat(StringRef Data,
+                                           SectionData &ProfileNames,
+                                           StringRef &CoverageMapping) {
+  // Decode one ULEB128 value from the front of Data. This is done byte by
+  // byte because decodeULEB128() has no end bound and would read past the
+  // end of Data when the terminating byte of a value is missing.
+  auto ReadULEB128 = [&Data](uint64_t &Value) -> std::error_code {
+    if (Data.size() < 1)
+      return instrprof_error::truncated;
+    Value = 0;
+    unsigned Shift = 0;
+    size_t N = 0;
+    uint8_t Byte;
+    do {
+      // Stop at the end of the buffer, and before shifting by 64 or more
+      // bits.
+      if (N == Data.size() || Shift >= 64)
+        return instrprof_error::malformed;
+      Byte = static_cast<uint8_t>(Data[N++]);
+      Value += uint64_t(Byte & 0x7f) << Shift;
+      Shift += 7;
+    } while (Byte >= 128);
+    Data = Data.substr(N);
+    return instrprof_error::success;
+  };
+
+  Data = Data.substr(StringRef(TestingFormatMagic).size());
+  uint64_t ProfileNamesSize;
+  if (auto Err = ReadULEB128(ProfileNamesSize))
+    return Err;
+  if (auto Err = ReadULEB128(ProfileNames.Address))
+    return Err;
+  if (Data.size() < ProfileNamesSize)
+    return instrprof_error::malformed;
+  ProfileNames.Data = Data.substr(0, ProfileNamesSize);
+  CoverageMapping = Data.substr(ProfileNamesSize);
+  return instrprof_error::success;
+}
+
+/// \brief Create a reader from an in-memory buffer.
+///
+/// A buffer that starts with TestingFormatMagic is decoded right away as the
+/// testing format; any other buffer is opened as an object file of the given
+/// \p Type. In both cases a failure is passed to error(). Unless decoding the
+/// testing format header fails, the buffer is moved out of \p ObjectBuffer.
+ObjectFileCoverageMappingReader::ObjectFileCoverageMappingReader(
+    std::unique_ptr<MemoryBuffer> &ObjectBuffer, sys::fs::file_magic Type)
+    : CurrentRecord(0) {
+  if (ObjectBuffer->getBuffer().startswith(TestingFormatMagic)) {
+    // This is a special format used for testing.
+    SectionData ProfileNames;
+    StringRef CoverageMapping;
+    if (auto Err = decodeTestingFormat(ObjectBuffer->getBuffer(), ProfileNames,
+                                       CoverageMapping)) {
+      error(Err);
+      return;
+    }
+    // The testing format always uses 64 bit function name pointers.
+    error(readCoverageMappingData<uint64_t>(ProfileNames, CoverageMapping,
+                                            MappingRecords, Filenames));
+    // There is no object file in this case; Object only takes over the
+    // buffer. NOTE(review): presumably this keeps the data that the records
+    // refer to alive - confirm.
+    Object = OwningBinary<ObjectFile>(std::unique_ptr<ObjectFile>(),
+                                      std::move(ObjectBuffer));
+    return;
+  }
+
+  auto File = object::ObjectFile::createObjectFile(
+      ObjectBuffer->getMemBufferRef(), Type);
+  if (!File)
+    error(File.getError());
+  else
+    Object = OwningBinary<ObjectFile>(std::move(File.get()),
+                                      std::move(ObjectBuffer));
+}
+
+/// \brief Locate the profile names and the coverage mapping sections of the
+/// object file and read all the mapping records from them.
+///
+/// Without an object file the current error code is returned. An object file
+/// whose addresses are neither 4 nor 8 bytes wide is reported as
+/// instrprof_error::malformed; an object file that does not contain exactly
+/// one section of each kind is reported as instrprof_error::bad_header.
+std::error_code ObjectFileCoverageMappingReader::readHeader() {
+  const ObjectFile *OF = Object.getBinary();
+  if (!OF)
+    return getError();
+  auto BytesInAddress = OF->getBytesInAddress();
+  if (BytesInAddress != 4 && BytesInAddress != 8)
+    return error(instrprof_error::malformed);
+
+  // Look for the sections that we are interested in.
+  // Count each kind of section separately: a combined count would also reach
+  // two when one name occurs twice and the other one is missing, and a
+  // default-constructed SectionRef would be read below.
+  unsigned NumProfileNames = 0, NumCoverageMappings = 0;
+  SectionRef ProfileNames, CoverageMapping;
+  for (const auto &Section : OF->sections()) {
+    StringRef Name;
+    if (auto Err = Section.getName(Name))
+      return Err;
+    if (Name == "__llvm_prf_names") {
+      ProfileNames = Section;
+      ++NumProfileNames;
+    } else if (Name == "__llvm_covmap") {
+      CoverageMapping = Section;
+      ++NumCoverageMappings;
+    }
+  }
+  if (NumProfileNames != 1 || NumCoverageMappings != 1)
+    return error(instrprof_error::bad_header);
+
+  // Get the contents of the given sections.
+  StringRef Data;
+  if (auto Err = CoverageMapping.getContents(Data))
+    return Err;
+  SectionData ProfileNamesData;
+  if (auto Err = ProfileNamesData.load(ProfileNames))
+    return Err;
+
+  // Load the data from the found sections.
+  // The width of the function name pointers follows the address width.
+  std::error_code Err;
+  if (BytesInAddress == 4)
+    Err = readCoverageMappingData<uint32_t>(ProfileNamesData, Data,
+                                            MappingRecords, Filenames);
+  else
+    Err = readCoverageMappingData<uint64_t>(ProfileNamesData, Data,
+                                            MappingRecords, Filenames);
+  if (Err)
+    return error(Err);
+
+  return success();
+}
+
+/// \brief Decode the next mapping record into \p Record.
+///
+/// Reports instrprof_error::eof once every record has been read. The record
+/// index only advances when decoding succeeds.
+std::error_code
+ObjectFileCoverageMappingReader::readNextRecord(CoverageMappingRecord &Record) {
+  if (CurrentRecord >= MappingRecords.size())
+    return error(instrprof_error::eof);
+
+  // Reset the containers that the raw reader fills for each record.
+  MappingRegions.clear();
+  Expressions.clear();
+  FunctionsFilenames.clear();
+
+  auto &Current = MappingRecords[CurrentRecord];
+  // The filenames of the record's translation unit are a slice of Filenames.
+  auto RecordFilenames = makeArrayRef(
+      Filenames.data() + Current.FilenamesBegin, Current.FilenamesSize);
+  RawCoverageMappingReader Reader(Current.FunctionName,
+                                  Current.CoverageMapping, RecordFilenames,
+                                  FunctionsFilenames, Expressions,
+                                  MappingRegions);
+  std::error_code Err = Reader.read(Record);
+  if (Err)
+    return Err;
+  ++CurrentRecord;
+  Record.FunctionHash = Current.FunctionHash;
+  return success();
+}

diff --git a/lib/ProfileData/CoverageMappingWriter.cpp b/lib/ProfileData/CoverageMappingWriter.cpp
new file mode 100644
index 0000000..6969c2a
--- /dev/null
+++ b/lib/ProfileData/CoverageMappingWriter.cpp

@@ -0,0 +1,187 @@
+//=-- CoverageMappingWriter.cpp - Code coverage mapping writer -------------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing coverage mapping data for
+// instrumentation based coverage.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ProfileData/CoverageMappingWriter.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+using namespace coverage;
+
+/// \brief Write the filenames to \p OS: the ULEB128-encoded number of
+/// filenames, then for each filename its ULEB128-encoded length followed by
+/// its characters.
+void CoverageFilenamesSectionWriter::write(raw_ostream &OS) {
+  const auto NumFilenames = Filenames.size();
+  encodeULEB128(NumFilenames, OS);
+  for (const auto &Name : Filenames) {
+    const auto Length = Name.size();
+    encodeULEB128(Length, OS);
+    OS << Name;
+  }
+}
+
+namespace {
+/// \brief Gather only the expressions that are used by the mapping
+/// regions in this function.
+///
+/// The constructor first marks every expression that is reachable from a
+/// region's counter and then copies the marked expressions into a new list,
+/// recording for every old expression id its index in that list.
+class CounterExpressionsMinimizer {
+  // All the expressions of the function, indexed by the old expression id.
+  ArrayRef<CounterExpression> Expressions;
+  // The expressions that are reachable from the mapping regions.
+  llvm::SmallVector<CounterExpression, 16> UsedExpressions;
+  // Indexed by the old expression id. Holds 0 or 1 while marking and the
+  // index into UsedExpressions after gathering.
+  std::vector<unsigned> AdjustedExpressionIDs;
+
+public:
+  /// \brief Mark the expression referenced by \p C, and recursively the
+  /// expressions referenced by its operands, as used.
+  void mark(Counter C) {
+    if (!C.isExpression())
+      return;
+    unsigned ID = C.getExpressionID();
+    AdjustedExpressionIDs[ID] = 1;
+    mark(Expressions[ID].LHS);
+    mark(Expressions[ID].RHS);
+  }
+
+  /// \brief Append the marked expression referenced by \p C to
+  /// UsedExpressions, store its new id, and continue with its operands.
+  void gatherUsed(Counter C) {
+    // NOTE(review): a stored new id of 0 looks like "not marked" here, while
+    // any other stored id still passes this check, so an expression that is
+    // reached more than once appears to be appended again - confirm that
+    // this is intended.
+    if (!C.isExpression() || !AdjustedExpressionIDs[C.getExpressionID()])
+      return;
+    AdjustedExpressionIDs[C.getExpressionID()] = UsedExpressions.size();
+    const auto &E = Expressions[C.getExpressionID()];
+    UsedExpressions.push_back(E);
+    gatherUsed(E.LHS);
+    gatherUsed(E.RHS);
+  }
+
+  /// \brief Compute the used expressions for the given mapping regions.
+  CounterExpressionsMinimizer(ArrayRef<CounterExpression> Expressions,
+                              ArrayRef<CounterMappingRegion> MappingRegions)
+      : Expressions(Expressions) {
+    AdjustedExpressionIDs.resize(Expressions.size(), 0);
+    for (const auto &I : MappingRegions)
+      mark(I.Count);
+    for (const auto &I : MappingRegions)
+      gatherUsed(I.Count);
+  }
+
+  /// \brief Return the gathered expressions. They still hold the old
+  /// expression ids in their operands; use adjust() to translate them.
+  ArrayRef<CounterExpression> getExpressions() const { return UsedExpressions; }
+
+  /// \brief Adjust the given counter to correctly transition from the old
+  /// expression ids to the new expression ids.
+  Counter adjust(Counter C) const {
+    if (C.isExpression())
+      C = Counter::getExpression(AdjustedExpressionIDs[C.getExpressionID()]);
+    return C;
+  }
+};
+}
+
+/// \brief Pack a counter into a single unsigned value.
+///
+/// Layout of the encoded value:
+/// Low 2 bits - Tag:
+///   Counter::Zero(0) - a counter of kind Counter::Zero
+///   Counter::CounterValueReference(1) - a counter of kind
+///     Counter::CounterValueReference
+///   Counter::Expression(2) + CounterExpression::Subtract(0) -
+///     a counter of kind Counter::Expression whose expression has the kind
+///     CounterExpression::Subtract
+///   Counter::Expression(2) + CounterExpression::Add(1) -
+///     a counter of kind Counter::Expression whose expression has the kind
+///     CounterExpression::Add
+/// Remaining bits - Counter/Expression ID.
+///
+/// \param Expressions the expressions that an expression counter indexes.
+/// \param C the counter to encode.
+static unsigned encodeCounter(ArrayRef<CounterExpression> Expressions,
+                              Counter C) {
+  unsigned Tag = unsigned(C.getKind());
+  // For an expression counter the expression's kind is folded into the tag.
+  if (C.isExpression()) {
+    const auto &Expr = Expressions[C.getExpressionID()];
+    Tag += Expr.Kind;
+  }
+  const unsigned ID = C.getCounterID();
+  // The id has to fit in the bits that are left above the tag.
+  assert(ID <=
+         (std::numeric_limits<unsigned>::max() >> Counter::EncodingTagBits));
+  const unsigned ShiftedID = ID << Counter::EncodingTagBits;
+  return Tag | ShiftedID;
+}
+
+/// \brief Encode the counter \p C and write it to \p OS as a ULEB128 value.
+static void writeCounter(ArrayRef<CounterExpression> Expressions, Counter C,
+                         raw_ostream &OS) {
+  const unsigned Encoded = encodeCounter(Expressions, C);
+  encodeULEB128(Encoded, OS);
+}
+
+/// \brief Write the coverage mapping of one function to \p OS.
+///
+/// The output consists of the virtual file mapping, the expressions that are
+/// used by the mapping regions and one sub-array of mapping regions per file
+/// id. All numbers are ULEB128-encoded. The mapping regions are sorted in
+/// place before they are written.
+void CoverageMappingWriter::write(raw_ostream &OS) {
+  // Sort the regions in an ascending order by the file id and the starting
+  // location.
+  std::sort(MappingRegions.begin(), MappingRegions.end());
+
+  // Write out the fileid -> filename mapping.
+  encodeULEB128(VirtualFileMapping.size(), OS);
+  for (const auto &FileID : VirtualFileMapping)
+    encodeULEB128(FileID, OS);
+
+  // Write out the expressions.
+  // Only the expressions that are reachable from a mapping region are
+  // written; their operands are translated to the new expression ids.
+  CounterExpressionsMinimizer Minimizer(Expressions, MappingRegions);
+  auto MinExpressions = Minimizer.getExpressions();
+  encodeULEB128(MinExpressions.size(), OS);
+  for (const auto &E : MinExpressions) {
+    writeCounter(MinExpressions, Minimizer.adjust(E.LHS), OS);
+    writeCounter(MinExpressions, Minimizer.adjust(E.RHS), OS);
+  }
+
+  // Write out the mapping regions.
+  // Split the regions into subarrays where each region in a
+  // subarray has a fileID which is the index of that subarray.
+  unsigned PrevLineStart = 0;
+  // Starts at ~0U so that adding one yields the first file id, 0.
+  unsigned CurrentFileID = ~0U;
+  for (auto I = MappingRegions.begin(), E = MappingRegions.end(); I != E; ++I) {
+    if (I->FileID != CurrentFileID) {
+      // Ensure that all file ids have at least one mapping region.
+      assert(I->FileID == (CurrentFileID + 1));
+      // Find the number of regions with this file id.
+      unsigned RegionCount = 1;
+      for (auto J = I + 1; J != E && I->FileID == J->FileID; ++J)
+        ++RegionCount;
+      // Start a new region sub-array.
+      encodeULEB128(RegionCount, OS);
+
+      CurrentFileID = I->FileID;
+      // The line start deltas restart from zero in every sub-array.
+      PrevLineStart = 0;
+    }
+    Counter Count = Minimizer.adjust(I->Count);
+    switch (I->Kind) {
+    case CounterMappingRegion::CodeRegion:
+      writeCounter(MinExpressions, Count, OS);
+      break;
+    case CounterMappingRegion::ExpansionRegion: {
+      assert(Count.isZero());
+      assert(I->ExpandedFileID <=
+             (std::numeric_limits<unsigned>::max() >>
+              Counter::EncodingCounterTagAndExpansionRegionTagBits));
+      // Mark an expansion region with a set bit that follows the counter tag,
+      // and pack the expanded file id into the remaining bits.
+      unsigned EncodedTagExpandedFileID =
+          (1 << Counter::EncodingTagBits) |
+          (I->ExpandedFileID
+           << Counter::EncodingCounterTagAndExpansionRegionTagBits);
+      encodeULEB128(EncodedTagExpandedFileID, OS);
+      break;
+    }
+    case CounterMappingRegion::SkippedRegion:
+      // A skipped region is written as its region kind placed above the
+      // counter tag and expansion bits, which are all left zero.
+      assert(Count.isZero());
+      encodeULEB128(unsigned(I->Kind)
+                        << Counter::EncodingCounterTagAndExpansionRegionTagBits,
+                    OS);
+      break;
+    }
+    // The source range: the line start as a delta to the previous region,
+    // the column start combined with the has-code-before bit, the number of
+    // lines and the column end.
+    assert(I->LineStart >= PrevLineStart);
+    encodeULEB128(I->LineStart - PrevLineStart, OS);
+    uint64_t CodeBeforeColumnStart =
+        uint64_t(I->HasCodeBefore) |
+        (uint64_t(I->ColumnStart)
+         << CounterMappingRegion::EncodingHasCodeBeforeBits);
+    encodeULEB128(CodeBeforeColumnStart, OS);
+    assert(I->LineEnd >= I->LineStart);
+    encodeULEB128(I->LineEnd - I->LineStart, OS);
+    encodeULEB128(I->ColumnEnd, OS);
+    PrevLineStart = I->LineStart;
+  }
+  // Ensure that all file ids have at least one mapping region.
+  assert(CurrentFileID == (VirtualFileMapping.size() - 1));
+}

diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp
index 0121222..900dff9 100644
--- a/lib/ProfileData/InstrProf.cpp
+++ b/lib/ProfileData/InstrProf.cpp

@@ -14,6 +14,7 @@
 
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
 
 using namespace llvm;
 
@@ -55,7 +56,8 @@
 };
 }
 
+static ManagedStatic<InstrProfErrorCategoryType> ErrorCategory;
+
 const std::error_category &llvm::instrprof_category() {
-  static InstrProfErrorCategoryType C;
-  return C;
+  return *ErrorCategory;
 }

diff --git a/lib/ProfileData/InstrProfIndexed.h b/lib/ProfileData/InstrProfIndexed.h
index 7761704..c2bc46c 100644
--- a/lib/ProfileData/InstrProfIndexed.h
+++ b/lib/ProfileData/InstrProfIndexed.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PROFILEDATA_INSTRPROF_INDEXED_H_
-#define LLVM_PROFILEDATA_INSTRPROF_INDEXED_H_
+#ifndef LLVM_LIB_PROFILEDATA_INSTRPROFINDEXED_H
+#define LLVM_LIB_PROFILEDATA_INSTRPROFINDEXED_H
 
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MD5.h"
@@ -46,10 +46,10 @@
 }
 
 const uint64_t Magic = 0x8169666f72706cff; // "\xfflprofi\x81"
-const uint64_t Version = 1;
+const uint64_t Version = 2;
 const HashT HashType = HashT::MD5;
 }
 
 } // end namespace llvm
 
-#endif // LLVM_PROFILEDATA_INSTRPROF_INDEXED_H_
+#endif

diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index 0b36728..0160a64 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp

@@ -21,32 +21,34 @@
 
 using namespace llvm;
 
-static std::error_code
-setupMemoryBuffer(std::string Path, std::unique_ptr<MemoryBuffer> &Buffer) {
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+setupMemoryBuffer(std::string Path) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
       MemoryBuffer::getFileOrSTDIN(Path);
   if (std::error_code EC = BufferOrErr.getError())
     return EC;
-  Buffer = std::move(BufferOrErr.get());
+  auto Buffer = std::move(BufferOrErr.get());
 
   // Sanity check the file.
   if (Buffer->getBufferSize() > std::numeric_limits<unsigned>::max())
     return instrprof_error::too_large;
-  return instrprof_error::success;
+  return std::move(Buffer);
 }
 
 static std::error_code initializeReader(InstrProfReader &Reader) {
   return Reader.readHeader();
 }
 
-std::error_code
-InstrProfReader::create(std::string Path,
-                        std::unique_ptr<InstrProfReader> &Result) {
+ErrorOr<std::unique_ptr<InstrProfReader>>
+InstrProfReader::create(std::string Path) {
   // Set up the buffer to read.
-  std::unique_ptr<MemoryBuffer> Buffer;
-  if (std::error_code EC = setupMemoryBuffer(Path, Buffer))
+  auto BufferOrError = setupMemoryBuffer(Path);
+  if (std::error_code EC = BufferOrError.getError())
     return EC;
 
+  auto Buffer = std::move(BufferOrError.get());
+  std::unique_ptr<InstrProfReader> Result;
+
   // Create the reader.
   if (IndexedInstrProfReader::hasFormat(*Buffer))
     Result.reset(new IndexedInstrProfReader(std::move(Buffer)));
@@ -58,16 +60,20 @@
     Result.reset(new TextInstrProfReader(std::move(Buffer)));
 
   // Initialize the reader and return the result.
-  return initializeReader(*Result);
+  if (std::error_code EC = initializeReader(*Result))
+    return EC;
+
+  return std::move(Result);
 }
 
 std::error_code IndexedInstrProfReader::create(
     std::string Path, std::unique_ptr<IndexedInstrProfReader> &Result) {
   // Set up the buffer to read.
-  std::unique_ptr<MemoryBuffer> Buffer;
-  if (std::error_code EC = setupMemoryBuffer(Path, Buffer))
+  auto BufferOrError = setupMemoryBuffer(Path);
+  if (std::error_code EC = BufferOrError.getError())
     return EC;
 
+  auto Buffer = std::move(BufferOrError.get());
   // Create the reader.
   if (!IndexedInstrProfReader::hasFormat(*Buffer))
     return instrprof_error::bad_magic;
@@ -83,8 +89,8 @@
 }
 
 std::error_code TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
-  // Skip empty lines.
-  while (!Line.is_at_end() && Line->empty())
+  // Skip empty lines and comments.
+  while (!Line.is_at_end() && (Line->empty() || Line->startswith("#")))
     ++Line;
   // If we hit EOF while looking for a name, we're done.
   if (Line.is_at_end())
@@ -190,6 +196,9 @@
   // garbage at the end of the file.
   if (CurrentPos + sizeof(RawHeader) > End)
     return instrprof_error::malformed;
+  // The writer ensures each profile is padded to start at an aligned address.
+  if (reinterpret_cast<size_t>(CurrentPos) % alignOf<uint64_t>())
+    return instrprof_error::malformed;
   // The magic should have the same byte order as in the previous header.
   uint64_t Magic = *reinterpret_cast<const uint64_t *>(CurrentPos);
   if (Magic != swap(getRawMagic<IntPtrT>()))
@@ -307,8 +316,8 @@
     return error(instrprof_error::bad_magic);
 
   // Read the version.
-  uint64_t Version = endian::readNext<uint64_t, little, unaligned>(Cur);
-  if (Version != IndexedInstrProf::Version)
+  FormatVersion = endian::readNext<uint64_t, little, unaligned>(Cur);
+  if (FormatVersion > IndexedInstrProf::Version)
     return error(instrprof_error::unsupported_version);
 
   // Read the maximal function count.
@@ -331,18 +340,31 @@
 }
 
 std::error_code IndexedInstrProfReader::getFunctionCounts(
-    StringRef FuncName, uint64_t &FuncHash, std::vector<uint64_t> &Counts) {
-  const auto &Iter = Index->find(FuncName);
+    StringRef FuncName, uint64_t FuncHash, std::vector<uint64_t> &Counts) {
+  auto Iter = Index->find(FuncName);
   if (Iter == Index->end())
     return error(instrprof_error::unknown_function);
 
-  // Found it. Make sure it's valid before giving back a result.
-  const InstrProfRecord &Record = *Iter;
-  if (Record.Name.empty())
-    return error(instrprof_error::malformed);
-  FuncHash = Record.Hash;
-  Counts = Record.Counts;
-  return success();
+  // Found it. Look for counters with the right hash.
+  ArrayRef<uint64_t> Data = (*Iter).Data;
+  uint64_t NumCounts;
+  for (uint64_t I = 0, E = Data.size(); I != E; I += NumCounts) {
+    // The function hash comes first.
+    uint64_t FoundHash = Data[I++];
+    // In v1, we have at least one count. Later, we have the number of counts.
+    if (I == E)
+      return error(instrprof_error::malformed);
+    NumCounts = FormatVersion == 1 ? E - I : Data[I++];
+    // More counts than remaining data is bogus; E - I cannot wrap like I + N.
+    if (NumCounts > E - I)
+      return error(instrprof_error::malformed);
+    // Check for a match and fill the vector if there is one.
+    if (FoundHash == FuncHash) {
+      Counts = Data.slice(I, NumCounts);
+      return success();
+    }
+  }
+  return error(instrprof_error::hash_mismatch);
 }
 
 std::error_code
@@ -351,10 +373,30 @@
   if (RecordIterator == Index->data_end())
     return error(instrprof_error::eof);
 
-  // Read the next one.
-  Record = *RecordIterator;
-  ++RecordIterator;
-  if (Record.Name.empty())
+  // Record the current function name.
+  Record.Name = (*RecordIterator).Name;
+
+  ArrayRef<uint64_t> Data = (*RecordIterator).Data;
+  // Valid data starts with a hash and either a count or the number of counts.
+  if (CurrentOffset + 1 > Data.size())
     return error(instrprof_error::malformed);
+  // First we have a function hash.
+  Record.Hash = Data[CurrentOffset++];
+  // In version 1 we knew the number of counters implicitly, but in newer
+  // versions we store the number of counters next.
+  uint64_t NumCounts =
+      FormatVersion == 1 ? Data.size() - CurrentOffset : Data[CurrentOffset++];
+  if (CurrentOffset + NumCounts > Data.size())
+    return error(instrprof_error::malformed);
+  // And finally the counts themselves.
+  Record.Counts = Data.slice(CurrentOffset, NumCounts);
+
+  // If we've exhausted this function's data, increment the record.
+  CurrentOffset += NumCounts;
+  if (CurrentOffset == Data.size()) {
+    ++RecordIterator;
+    CurrentOffset = 0;
+  }
+
   return success();
 }

diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp
index e55c299..ad1b876 100644
--- a/lib/ProfileData/InstrProfWriter.cpp
+++ b/lib/ProfileData/InstrProfWriter.cpp

@@ -45,7 +45,9 @@
     offset_type N = K.size();
     LE.write<offset_type>(N);
 
-    offset_type M = (1 + V->Counts.size()) * sizeof(uint64_t);
+    offset_type M = 0;
+    for (const auto &Counts : *V)
+      M += (2 + Counts.second.size()) * sizeof(uint64_t);
     LE.write<offset_type>(M);
 
     return std::make_pair(N, M);
@@ -59,9 +61,13 @@
                        offset_type) {
     using namespace llvm::support;
     endian::Writer<little> LE(Out);
-    LE.write<uint64_t>(V->Hash);
-    for (uint64_t I : V->Counts)
-      LE.write<uint64_t>(I);
+
+    for (const auto &Counts : *V) {
+      LE.write<uint64_t>(Counts.first);
+      LE.write<uint64_t>(Counts.second.size());
+      for (uint64_t I : Counts.second)
+        LE.write<uint64_t>(I);
+    }
   }
 };
 }
@@ -70,41 +76,43 @@
 InstrProfWriter::addFunctionCounts(StringRef FunctionName,
                                    uint64_t FunctionHash,
                                    ArrayRef<uint64_t> Counters) {
-  auto Where = FunctionData.find(FunctionName);
-  if (Where == FunctionData.end()) {
-    // If this is the first time we've seen this function, just add it.
-    auto &Data = FunctionData[FunctionName];
-    Data.Hash = FunctionHash;
-    Data.Counts = Counters;
+  auto &CounterData = FunctionData[FunctionName];
+
+  auto Where = CounterData.find(FunctionHash);
+  if (Where == CounterData.end()) {
+    // We've never seen a function with this name and hash, add it.
+    CounterData[FunctionHash] = Counters;
+    // We keep track of the max function count as we go for simplicity.
+    if (Counters[0] > MaxFunctionCount)
+      MaxFunctionCount = Counters[0];
     return instrprof_error::success;
   }
 
-  auto &Data = Where->getValue();
-  // We can only add to existing functions if they match, so we check the hash
-  // and number of counters.
-  if (Data.Hash != FunctionHash)
-    return instrprof_error::hash_mismatch;
-  if (Data.Counts.size() != Counters.size())
+  // We're updating a function we've seen before.
+  auto &FoundCounters = Where->second;
+  // If the number of counters doesn't match we either have bad data or a hash
+  // collision.
+  if (FoundCounters.size() != Counters.size())
     return instrprof_error::count_mismatch;
-  // These match, add up the counters.
+
   for (size_t I = 0, E = Counters.size(); I < E; ++I) {
-    if (Data.Counts[I] + Counters[I] < Data.Counts[I])
+    if (FoundCounters[I] + Counters[I] < FoundCounters[I])
       return instrprof_error::counter_overflow;
-    Data.Counts[I] += Counters[I];
+    FoundCounters[I] += Counters[I];
   }
+  // We keep track of the max function count as we go for simplicity.
+  if (FoundCounters[0] > MaxFunctionCount)
+    MaxFunctionCount = FoundCounters[0];
+
   return instrprof_error::success;
 }
 
 void InstrProfWriter::write(raw_fd_ostream &OS) {
   OnDiskChainedHashTableGenerator<InstrProfRecordTrait> Generator;
-  uint64_t MaxFunctionCount = 0;
 
   // Populate the hash table generator.
-  for (const auto &I : FunctionData) {
+  for (const auto &I : FunctionData)
     Generator.insert(I.getKey(), &I.getValue());
-    if (I.getValue().Counts[0] > MaxFunctionCount)
-      MaxFunctionCount = I.getValue().Counts[0];
-  }
 
   using namespace llvm::support;
   endian::Writer<little> LE(OS);

diff --git a/lib/ProfileData/LLVMBuild.txt b/lib/ProfileData/LLVMBuild.txt
index 0a8cbe3..a7f471f 100644
--- a/lib/ProfileData/LLVMBuild.txt
+++ b/lib/ProfileData/LLVMBuild.txt

@@ -19,4 +19,4 @@
 type = Library
 name = ProfileData
 parent = Libraries
-required_libraries = Support
+required_libraries = Core Support Object

diff --git a/lib/ProfileData/SampleProf.cpp b/lib/ProfileData/SampleProf.cpp
new file mode 100644
index 0000000..920c48a
--- /dev/null
+++ b/lib/ProfileData/SampleProf.cpp

@@ -0,0 +1,51 @@
+//=-- SampleProf.cpp - Sample profiling format support --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains common definitions used in the reading and writing of
+// sample profile data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+
+namespace {
+/// Error category that turns sampleprof_error values into readable text.
+class SampleProfErrorCategoryType : public std::error_category {
+  /// Name under which this category identifies itself.
+  const char *name() const LLVM_NOEXCEPT override { return "llvm.sampleprof"; }
+
+  /// Describe the error whose raw integer value is \p IE.
+  std::string message(int IE) const override {
+    switch (static_cast<sampleprof_error>(IE)) {
+    case sampleprof_error::success:
+      return "Success";
+    case sampleprof_error::bad_magic:
+      return "Invalid file format (bad magic)";
+    case sampleprof_error::unsupported_version:
+      return "Unsupported format version";
+    case sampleprof_error::too_large:
+      return "Too much profile data";
+    case sampleprof_error::truncated:
+      return "Truncated profile data";
+    case sampleprof_error::malformed:
+      return "Malformed profile data";
+    case sampleprof_error::unrecognized_format:
+      return "Unrecognized profile encoding format";
+    }
+    // Reached only for a value that is not one of the cases above.
+    llvm_unreachable("A value of sampleprof_error has no message.");
+  }
+};
+} // end anonymous namespace
+
+// Single file-local instance of the category object, held in a ManagedStatic.
+static ManagedStatic<SampleProfErrorCategoryType> ErrorCategory;
+
+/// \brief Return the error category used for sampleprof_error values.
+///
+/// Every call yields a reference to the same ErrorCategory object.
+const std::error_category &llvm::sampleprof_category() {
+  return *ErrorCategory;
+}

diff --git a/lib/ProfileData/SampleProfReader.cpp b/lib/ProfileData/SampleProfReader.cpp
new file mode 100644
index 0000000..b39bfd6
--- /dev/null
+++ b/lib/ProfileData/SampleProfReader.cpp

@@ -0,0 +1,399 @@
+//===- SampleProfReader.cpp - Read LLVM sample profile data ---------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the class that reads LLVM sample profiles. It
+// supports two file formats: text and binary. The textual representation
+// is useful for debugging and testing purposes. The binary representation
+// is more compact, resulting in smaller file sizes. However, they can
+// both be used interchangeably.
+//
+// NOTE: If you are making changes to the file format, please remember
+//       to document them in the Clang documentation at
+//       tools/clang/docs/UsersManual.rst.
+//
+// Text format
+// -----------
+//
+// Sample profiles are written as ASCII text. The file is divided into
+// sections, which correspond to each of the functions executed at runtime.
+// Each section has the following format
+//
+//     function1:total_samples:total_head_samples
+//     offset1[.discriminator]: number_of_samples [fn1:num fn2:num ... ]
+//     offset2[.discriminator]: number_of_samples [fn3:num fn4:num ... ]
+//     ...
+//     offsetN[.discriminator]: number_of_samples [fn5:num fn6:num ... ]
+//
+// The file may contain blank lines between sections and within a
+// section. However, the spacing within a single line is fixed. Additional
+// spaces will result in an error while reading the file.
+//
+// Function names must be mangled in order for the profile loader to
+// match them in the current translation unit. The two numbers in the
+// function header specify how many total samples were accumulated in the
+// function (first number), and the total number of samples accumulated
+// in the prologue of the function (second number). This head sample
+// count provides an indicator of how frequently the function is invoked.
+//
+// Each sampled line may contain several items. Some are optional (marked
+// below):
+//
+// a. Source line offset. This number represents the line number
+//    in the function where the sample was collected. The line number is
+//    always relative to the line where symbol of the function is
+//    defined. So, if the function has its header at line 280, the offset
+//    13 is at line 293 in the file.
+//
+//    Note that this offset should never be a negative number. This could
+//    happen in cases like macros. The debug machinery will register the
+//    line number at the point of macro expansion. So, if the macro was
+//    expanded in a line before the start of the function, the profile
+//    converter should emit a 0 as the offset (this means that the optimizers
+//    will not be able to associate a meaningful weight to the instructions
+//    in the macro).
+//
+// b. [OPTIONAL] Discriminator. This is used if the sampled program
+//    was compiled with DWARF discriminator support
+//    (http://wiki.dwarfstd.org/index.php?title=Path_Discriminators).
+//    DWARF discriminators are unsigned integer values that allow the
+//    compiler to distinguish between multiple execution paths on the
+//    same source line location.
+//
+//    For example, consider the line of code ``if (cond) foo(); else bar();``.
+//    If the predicate ``cond`` is true 80% of the time, then the edge
+//    into function ``foo`` should be considered to be taken most of the
+//    time. But both calls to ``foo`` and ``bar`` are at the same source
+//    line, so a sample count at that line is not sufficient. The
+//    compiler needs to know which part of that line is taken more
+//    frequently.
+//
+//    This is what discriminators provide. In this case, the calls to
+//    ``foo`` and ``bar`` will be at the same line, but will have
+//    different discriminator values. This allows the compiler to correctly
+//    set edge weights into ``foo`` and ``bar``.
+//
+// c. Number of samples. This is an integer quantity representing the
+//    number of samples collected by the profiler at this source
+//    location.
+//
+// d. [OPTIONAL] Potential call targets and samples. If present, this
+//    line contains a call instruction. This models both direct and
+//    indirect calls. Each called target is listed together with the
+//    number of samples. For example,
+//
+//      130: 7  foo:3  bar:2  baz:7
+//
+//    The above means that at relative line offset 130 there is a call
+//    instruction that calls one of ``foo()``, ``bar()`` and ``baz()``,
+//    with ``baz()`` being the relatively more frequently called target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+
+using namespace llvm::sampleprof;
+using namespace llvm;
+
+/// \brief Print the samples collected for a function on stream \p OS.
+///
+/// A summary line (total samples, head samples, number of sampled lines) is
+/// followed by one line per sampled location and a trailing blank line.
+///
+/// \param OS Stream to emit the output to.
+void FunctionSamples::print(raw_ostream &OS) {
+  OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size()
+     << " sampled lines\n";
+
+  for (const auto &Entry : BodySamples) {
+    const LineLocation Where = Entry.first;
+    const SampleRecord &Rec = Entry.second;
+
+    OS << "\tline offset: " << Where.LineOffset;
+    OS << ", discriminator: " << Where.Discriminator;
+    OS << ", number of samples: " << Rec.getSamples();
+
+    // Call targets are only listed for locations that recorded calls.
+    if (Rec.hasCalls()) {
+      OS << ", calls:";
+      for (const auto &Target : Rec.getCallTargets())
+        OS << " " << Target.first() << ":" << Target.second;
+    }
+
+    OS << "\n";
+  }
+
+  OS << "\n";
+}
+
+/// \brief Dump the function profile for \p FName.
+///
+/// Writes a "Function: <name>: " prefix and then the samples stored in
+/// Profiles under \p FName.
+///
+/// \param FName Name of the function to print.
+/// \param OS Stream to emit the output to.
+void SampleProfileReader::dumpFunctionProfile(StringRef FName,
+                                              raw_ostream &OS) {
+  OS << "Function: " << FName << ": ";
+
+  // Looked up with operator[], exactly as the samples are stored by read().
+  FunctionSamples &Samples = Profiles[FName];
+  Samples.print(OS);
+}
+
+/// \brief Dump all the function profiles found on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+void SampleProfileReader::dump(raw_ostream &OS) {
+  for (const auto &Entry : Profiles) {
+    // Only the key is needed; dumpFunctionProfile looks the samples up itself.
+    StringRef FName = Entry.getKey();
+    dumpFunctionProfile(FName, OS);
+  }
+}
+
+/// \brief Load samples from a text file.
+///
+/// See the documentation at the top of the file for an explanation of
+/// the expected format.
+///
+/// \returns sampleprof_error::success if the whole buffer was parsed, or
+/// sampleprof_error::malformed (after calling reportParseError) when a line
+/// does not match the expected syntax or a number does not fit in an unsigned.
+std::error_code SampleProfileReaderText::read() {
+  line_iterator LineIt(*Buffer, /*SkipBlanks=*/true, '#');
+
+  // Read the profile of each function. Since each function may be
+  // mentioned more than once, and we are collecting flat profiles,
+  // accumulate samples as we parse them.
+  Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$");
+  Regex LineSampleRE("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$");
+  Regex CallSampleRE(" +([^0-9 ][^ ]*):([0-9]+)");
+  while (!LineIt.is_at_eof()) {
+    // Read the header of each function.
+    //
+    // Note that for function identifiers we are actually expecting
+    // mangled names, but we may not always get them. This happens when
+    // the compiler decides not to emit the function (e.g., it was inlined
+    // and removed). In this case, the binary will not have the linkage
+    // name for the function, so the profiler will emit the function's
+    // unmangled name, which may contain characters like ':' and '>' in its
+    // name (member functions, templates, etc).
+    //
+    // The only requirement we place on the identifier, then, is that it
+    // should not begin with a number.
+    SmallVector<StringRef, 4> Matches;
+    if (!HeadRE.match(*LineIt, &Matches)) {
+      reportParseError(LineIt.line_number(),
+                       "Expected 'mangled_name:NUM:NUM', found " + *LineIt);
+      return sampleprof_error::malformed;
+    }
+    assert(Matches.size() == 4);
+    StringRef FName = Matches[1];
+    unsigned NumSamples = 0, NumHeadSamples = 0;
+    // getAsInteger returns true on failure. The regex only guarantees digits,
+    // so the remaining failure mode is a value too large for an unsigned.
+    if (Matches[2].getAsInteger(10, NumSamples) ||
+        Matches[3].getAsInteger(10, NumHeadSamples)) {
+      reportParseError(LineIt.line_number(),
+                       "Sample count out of range in " + *LineIt);
+      return sampleprof_error::malformed;
+    }
+    // NOTE(review): this assignment replaces any profile already stored under
+    // FName, which looks at odds with the "accumulate" comment above; confirm
+    // the intended behavior for functions that appear more than once.
+    Profiles[FName] = FunctionSamples();
+    FunctionSamples &FProfile = Profiles[FName];
+    FProfile.addTotalSamples(NumSamples);
+    FProfile.addHeadSamples(NumHeadSamples);
+    ++LineIt;
+
+    // Now read the body. The body of the function ends when we reach
+    // EOF or when we see the start of the next function. The character is
+    // converted to unsigned char first because passing a negative char to
+    // isdigit is undefined behavior.
+    while (!LineIt.is_at_eof() &&
+           isdigit(static_cast<unsigned char>((*LineIt)[0]))) {
+      if (!LineSampleRE.match(*LineIt, &Matches)) {
+        reportParseError(
+            LineIt.line_number(),
+            "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt);
+        return sampleprof_error::malformed;
+      }
+      assert(Matches.size() == 5);
+      unsigned LineOffset = 0, NumSamples = 0, Discriminator = 0;
+      // The discriminator group is optional; it stays 0 when absent.
+      if (Matches[1].getAsInteger(10, LineOffset) ||
+          (Matches[2] != "" && Matches[2].getAsInteger(10, Discriminator)) ||
+          Matches[3].getAsInteger(10, NumSamples)) {
+        reportParseError(LineIt.line_number(),
+                         "Number out of range in " + *LineIt);
+        return sampleprof_error::malformed;
+      }
+
+      // If there are function calls in this line, generate a call sample
+      // entry for each call.
+      std::string CallsLine(Matches[4]);
+      while (CallsLine != "") {
+        SmallVector<StringRef, 3> CallSample;
+        if (!CallSampleRE.match(CallsLine, &CallSample)) {
+          reportParseError(LineIt.line_number(),
+                           "Expected 'mangled_name:NUM', found " + CallsLine);
+          return sampleprof_error::malformed;
+        }
+        StringRef CalledFunction = CallSample[1];
+        unsigned CalledFunctionSamples = 0;
+        if (CallSample[2].getAsInteger(10, CalledFunctionSamples)) {
+          reportParseError(LineIt.line_number(),
+                           "Call sample count out of range in " + CallsLine);
+          return sampleprof_error::malformed;
+        }
+        FProfile.addCalledTargetSamples(LineOffset, Discriminator,
+                                        CalledFunction, CalledFunctionSamples);
+        // Drop the entry just consumed so the loop moves on to the next one.
+        CallsLine = CallSampleRE.sub("", CallsLine);
+      }
+
+      FProfile.addBodySamples(LineOffset, Discriminator, NumSamples);
+      ++LineIt;
+    }
+  }
+
+  return sampleprof_error::success;
+}
+
+/// \brief Decode one ULEB128 value at Data and return it as a \p T.
+///
+/// On success Data is advanced past the encoded bytes. A value larger than
+/// T's maximum yields sampleprof_error::malformed; an encoding that extends
+/// past End yields sampleprof_error::truncated. Errors are reported through
+/// reportParseError and leave Data untouched.
+template <typename T> ErrorOr<T> SampleProfileReaderBinary::readNumber() {
+  unsigned BytesConsumed = 0;
+  const uint64_t Decoded = decodeULEB128(Data, &BytesConsumed);
+
+  // The range check takes precedence over the truncation check.
+  std::error_code Status = sampleprof_error::success;
+  if (Decoded > std::numeric_limits<T>::max())
+    Status = sampleprof_error::malformed;
+  else if (Data + BytesConsumed > End)
+    Status = sampleprof_error::truncated;
+
+  if (Status) {
+    reportParseError(0, Status.message());
+    return Status;
+  }
+
+  Data += BytesConsumed;
+  return static_cast<T>(Decoded);
+}
+
+/// \brief Read the NUL-terminated string that starts at Data.
+///
+/// On success Data is advanced past the string and its terminator. If the
+/// string plus terminator would extend past End, sampleprof_error::truncated
+/// is reported and returned, and Data is left untouched.
+ErrorOr<StringRef> SampleProfileReaderBinary::readString() {
+  StringRef Str(reinterpret_cast<const char *>(Data));
+
+  // One extra byte accounts for the terminator.
+  const uint8_t *Next = Data + Str.size() + 1;
+  if (Next > End) {
+    std::error_code EC = sampleprof_error::truncated;
+    reportParseError(0, EC.message());
+    return EC;
+  }
+
+  Data = Next;
+  return Str;
+}
+
+/// \brief Load samples from the binary buffer.
+///
+/// Reads function records until at_eof(). Each record is a name, the total
+/// and head sample counts, and a counted list of body records; each body
+/// record carries a line offset, a discriminator, a sample count and a
+/// counted list of (callee name, samples) pairs.
+///
+/// \returns sampleprof_error::success, or the first error produced by
+/// readString/readNumber.
+std::error_code SampleProfileReaderBinary::read() {
+  while (!at_eof()) {
+    // Function name; it keys the profile map.
+    auto FName(readString());
+    if (std::error_code EC = FName.getError())
+      return EC;
+
+    Profiles[*FName] = FunctionSamples();
+    FunctionSamples &FProfile = Profiles[*FName];
+
+    // Function-level totals.
+    auto TotalSamples = readNumber<unsigned>();
+    if (std::error_code EC = TotalSamples.getError())
+      return EC;
+    FProfile.addTotalSamples(*TotalSamples);
+
+    auto HeadSamples = readNumber<unsigned>();
+    if (std::error_code EC = HeadSamples.getError())
+      return EC;
+    FProfile.addHeadSamples(*HeadSamples);
+
+    // Read the samples in the body.
+    auto NumRecords = readNumber<unsigned>();
+    if (std::error_code EC = NumRecords.getError())
+      return EC;
+
+    for (unsigned RecIdx = 0; RecIdx < *NumRecords; ++RecIdx) {
+      auto LineOffset = readNumber<uint64_t>();
+      if (std::error_code EC = LineOffset.getError())
+        return EC;
+
+      auto Discriminator = readNumber<uint64_t>();
+      if (std::error_code EC = Discriminator.getError())
+        return EC;
+
+      auto NumSamples = readNumber<uint64_t>();
+      if (std::error_code EC = NumSamples.getError())
+        return EC;
+
+      auto NumCalls = readNumber<unsigned>();
+      if (std::error_code EC = NumCalls.getError())
+        return EC;
+
+      // Call targets recorded at this location.
+      for (unsigned CallIdx = 0; CallIdx < *NumCalls; ++CallIdx) {
+        auto CalledFunction(readString());
+        if (std::error_code EC = CalledFunction.getError())
+          return EC;
+
+        auto CalledFunctionSamples = readNumber<uint64_t>();
+        if (std::error_code EC = CalledFunctionSamples.getError())
+          return EC;
+
+        FProfile.addCalledTargetSamples(*LineOffset, *Discriminator,
+                                        *CalledFunction,
+                                        *CalledFunctionSamples);
+      }
+
+      FProfile.addBodySamples(*LineOffset, *Discriminator, *NumSamples);
+    }
+  }
+
+  return sampleprof_error::success;
+}
+
+/// \brief Validate the header of the binary profile.
+///
+/// Points Data/End at the bounds of Buffer, then checks the magic number and
+/// the version in that order.
+///
+/// \returns sampleprof_error::success, sampleprof_error::bad_magic,
+/// sampleprof_error::unsupported_version, or an error from readNumber.
+std::error_code SampleProfileReaderBinary::readHeader() {
+  Data = reinterpret_cast<const uint8_t *>(Buffer->getBufferStart());
+  End = Data + Buffer->getBufferSize();
+
+  // The magic identifier comes first.
+  auto Magic = readNumber<uint64_t>();
+  if (std::error_code EC = Magic.getError())
+    return EC;
+  if (*Magic != SPMagic())
+    return sampleprof_error::bad_magic;
+
+  // The format version follows.
+  auto Version = readNumber<uint64_t>();
+  if (std::error_code EC = Version.getError())
+    return EC;
+  if (*Version != SPVersion())
+    return sampleprof_error::unsupported_version;
+
+  return sampleprof_error::success;
+}
+
+/// \brief Return true if \p Buffer starts with the ULEB128-encoded
+/// sample profile magic number.
+bool SampleProfileReaderBinary::hasFormat(const MemoryBuffer &Buffer) {
+  const char *Start = Buffer.getBufferStart();
+  return decodeULEB128(reinterpret_cast<const uint8_t *>(Start)) == SPMagic();
+}
+
+/// \brief Prepare a memory buffer for the contents of \p Filename.
+///
+/// \returns the buffer on success; otherwise the error from
+/// MemoryBuffer::getFileOrSTDIN, or sampleprof_error::too_large when the
+/// buffer size does not fit in an unsigned.
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+setupMemoryBuffer(std::string Filename) {
+  auto FileOrErr = MemoryBuffer::getFileOrSTDIN(Filename);
+  if (std::error_code EC = FileOrErr.getError())
+    return EC;
+
+  auto Contents = std::move(FileOrErr.get());
+
+  // Sanity check the file.
+  const bool FitsInUnsigned =
+      Contents->getBufferSize() <= std::numeric_limits<unsigned>::max();
+  if (!FitsInUnsigned)
+    return sampleprof_error::too_large;
+
+  return std::move(Contents);
+}
+
+/// \brief Create a sample profile reader based on the format of the input file.
+///
+/// The binary reader is used when SampleProfileReaderBinary::hasFormat accepts
+/// the buffer; otherwise the text reader is used. The chosen reader's header
+/// is validated before it is returned.
+///
+/// \param Filename The file to open.
+///
+/// \param C The LLVM context to use to emit diagnostics.
+///
+/// \returns the reader on success, or the error from opening the file or
+/// from readHeader().
+ErrorOr<std::unique_ptr<SampleProfileReader>>
+SampleProfileReader::create(StringRef Filename, LLVMContext &C) {
+  auto FileOrErr = setupMemoryBuffer(Filename);
+  if (std::error_code EC = FileOrErr.getError())
+    return EC;
+  auto Contents = std::move(FileOrErr.get());
+
+  // Probe the format first; ownership of the buffer moves into the reader.
+  const bool IsBinary = SampleProfileReaderBinary::hasFormat(*Contents);
+  std::unique_ptr<SampleProfileReader> Result;
+  if (IsBinary)
+    Result.reset(new SampleProfileReaderBinary(std::move(Contents), C));
+  else
+    Result.reset(new SampleProfileReaderText(std::move(Contents), C));
+
+  if (std::error_code EC = Result->readHeader())
+    return EC;
+
+  return std::move(Result);
+}

diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp
new file mode 100644
index 0000000..8525045
--- /dev/null
+++ b/lib/ProfileData/SampleProfWriter.cpp

@@ -0,0 +1,126 @@
+//===- SampleProfWriter.cpp - Write LLVM sample profile data --------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the class that writes LLVM sample profiles. It
+// supports two file formats: text and binary. The textual representation
+// is useful for debugging and testing purposes. The binary representation
+// is more compact, resulting in smaller file sizes. However, they can
+// both be used interchangeably.
+//
+// See lib/ProfileData/SampleProfReader.cpp for documentation on each of the
+// supported formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ProfileData/SampleProfWriter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/Regex.h"
+
+using namespace llvm::sampleprof;
+using namespace llvm;
+
+/// \brief Write samples to a text file.
+///
+/// Emits a "name:total:head" header line followed by one line per body
+/// sample. Nothing is written when \p S is empty.
+///
+/// \returns true in all cases.
+bool SampleProfileWriterText::write(StringRef FName, const FunctionSamples &S) {
+  if (S.empty())
+    return true;
+
+  OS << FName << ":" << S.getTotalSamples() << ":" << S.getHeadSamples()
+     << "\n";
+
+  for (const auto &Entry : S.getBodySamples()) {
+    const LineLocation Where = Entry.first;
+    const SampleRecord &Rec = Entry.second;
+
+    // The ".discriminator" suffix is only printed when it is non-zero.
+    OS << Where.LineOffset;
+    if (Where.Discriminator != 0)
+      OS << "." << Where.Discriminator;
+    OS << ": ";
+
+    OS << Rec.getSamples();
+
+    for (const auto &Target : Rec.getCallTargets())
+      OS << " " << Target.first() << ":" << Target.second;
+    OS << "\n";
+  }
+
+  return true;
+}
+
+/// \brief Open \p F for binary output and emit the file header.
+///
+/// The header (magic number, then version, both ULEB128-encoded) is written
+/// only when the base class constructor left \p EC clear.
+SampleProfileWriterBinary::SampleProfileWriterBinary(StringRef F,
+                                                     std::error_code &EC)
+    : SampleProfileWriter(F, EC, sys::fs::F_None) {
+  if (!EC) {
+    // Write the file header.
+    encodeULEB128(SPMagic(), OS);
+    encodeULEB128(SPVersion(), OS);
+  }
+}
+
+/// \brief Write samples to a binary file.
+///
+/// Strings are written followed by a zero byte; all numbers are
+/// ULEB128-encoded. Nothing is written when \p S is empty.
+///
+/// \returns true in all cases.
+bool SampleProfileWriterBinary::write(StringRef FName,
+                                      const FunctionSamples &S) {
+  if (S.empty())
+    return true;
+
+  // Function name and its terminator.
+  OS << FName;
+  encodeULEB128(0, OS);
+
+  // Function-level totals, then the number of body records.
+  encodeULEB128(S.getTotalSamples(), OS);
+  encodeULEB128(S.getHeadSamples(), OS);
+  encodeULEB128(S.getBodySamples().size(), OS);
+
+  for (const auto &Entry : S.getBodySamples()) {
+    const LineLocation Where = Entry.first;
+    const SampleRecord &Rec = Entry.second;
+
+    encodeULEB128(Where.LineOffset, OS);
+    encodeULEB128(Where.Discriminator, OS);
+    encodeULEB128(Rec.getSamples(), OS);
+    encodeULEB128(Rec.getCallTargets().size(), OS);
+
+    for (const auto &Target : Rec.getCallTargets()) {
+      const std::string Name = Target.first();
+      const unsigned Samples = Target.second;
+
+      // Callee name and its terminator, then the callee's sample count.
+      OS << Name;
+      encodeULEB128(0, OS);
+      encodeULEB128(Samples, OS);
+    }
+  }
+
+  return true;
+}
+
+/// \brief Create a sample profile writer based on the specified format.
+///
+/// \param Filename The file to create.
+///
+/// \param Format Encoding format for the profile file.
+///
+/// \returns the writer on success; sampleprof_error::unrecognized_format when
+/// \p Format is neither SPF_Binary nor SPF_Text; otherwise the error set by
+/// the writer's constructor.
+ErrorOr<std::unique_ptr<SampleProfileWriter>>
+SampleProfileWriter::create(StringRef Filename, SampleProfileFormat Format) {
+  // Reject unsupported formats before constructing anything.
+  if (Format != SPF_Binary && Format != SPF_Text)
+    return sampleprof_error::unrecognized_format;
+
+  std::error_code Status;
+  std::unique_ptr<SampleProfileWriter> Result;
+  if (Format == SPF_Binary)
+    Result.reset(new SampleProfileWriterBinary(Filename, Status));
+  else
+    Result.reset(new SampleProfileWriterText(Filename, Status));
+
+  // The constructors report failure through Status.
+  if (Status)
+    return Status;
+
+  return std::move(Result);
+}

diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 7989e30..295b16c 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp

@@ -35,8 +35,7 @@
 
 /* Assumed in hexadecimal significand parsing, and conversion to
    hexadecimal strings.  */
-#define COMPILE_TIME_ASSERT(cond) extern int CTAssert[(cond) ? 1 : -1]
-COMPILE_TIME_ASSERT(integerPartWidth % 4 == 0);
+static_assert(integerPartWidth % 4 == 0, "Part width must be divisible by 4!");
 
 namespace llvm {
 
@@ -212,15 +211,15 @@
 {
   StringRef::iterator p = begin;
   *dot = end;
-  while (*p == '0' && p != end)
+  while (p != end && *p == '0')
     p++;
 
-  if (*p == '.') {
+  if (p != end && *p == '.') {
     *dot = p++;
 
     assert(end - begin != 1 && "Significand has no digits");
 
-    while (*p == '0' && p != end)
+    while (p != end && *p == '0')
       p++;
   }
 
@@ -927,7 +926,10 @@
   assert(semantics == rhs.semantics);
 
   precision = semantics->precision;
-  newPartsCount = partCountForBits(precision * 2);
+
+  // Allocate space for twice as many bits as the original significand, plus one
+  // extra bit for the addition to overflow into.
+  newPartsCount = partCountForBits(precision * 2 + 1);
 
   if (newPartsCount > 4)
     fullSignificand = new integerPart[newPartsCount];
@@ -949,13 +951,14 @@
   //   *this = a23 . a22 ... a0 * 2^e1
   //     rhs = b23 . b22 ... b0 * 2^e2
   // the result of multiplication is:
-  //   *this = c47 c46 . c45 ... c0 * 2^(e1+e2)
-  // Note that there are two significant bits at the left-hand side of the 
-  // radix point. Move the radix point toward left by one bit, and adjust
-  // exponent accordingly.
-  exponent += 1;
+  //   *this = c48 c47 c46 . c45 ... c0 * 2^(e1+e2)
+  // Note that there are three significant bits at the left-hand side of the 
+  // radix point: two for the multiplication, and an overflow bit for the
+  // addition (that will always be zero at this point). Move the radix point
+  // toward left by two bits, and adjust exponent accordingly.
+  exponent += 2;
 
-  if (addend) {
+  if (addend && addend->isNonZero()) {
     // The intermediate result of the multiplication has "2 * precision" 
     // signicant bit; adjust the addend to be consistent with mul result.
     //
@@ -965,13 +968,13 @@
     opStatus status;
     unsigned int extendedPrecision;
 
-    /* Normalize our MSB.  */
-    extendedPrecision = 2 * precision;
-    if (omsb != extendedPrecision) {
+    // Normalize our MSB to one below the top bit to allow for overflow.
+    extendedPrecision = 2 * precision + 1;
+    if (omsb != extendedPrecision - 1) {
       assert(extendedPrecision > omsb);
       APInt::tcShiftLeft(fullSignificand, newPartsCount,
-                         extendedPrecision - omsb);
-      exponent -= extendedPrecision - omsb;
+                         (extendedPrecision - 1) - omsb);
+      exponent -= (extendedPrecision - 1) - omsb;
     }
 
     /* Create new semantics.  */
@@ -988,6 +991,14 @@
     status = extendedAddend.convert(extendedSemantics, rmTowardZero, &ignored);
     assert(status == opOK);
     (void)status;
+
+    // Shift the significand of the addend right by one bit. This guarantees
+    // that the high bit of the significand is zero (same as fullSignificand),
+    // so the addition will overflow (if it does overflow at all) into the top bit.
+    lost_fraction = extendedAddend.shiftSignificandRight(1);
+    assert(lost_fraction == lfExactlyZero &&
+           "Lost precision while shifting addend for fused-multiply-add.");
+
     lost_fraction = addOrSubtractSignificand(extendedAddend, false);
 
     /* Restore our state.  */
@@ -1003,7 +1014,7 @@
   // having "precision" significant-bits. First, move the radix point from 
   // poision "2*precision - 1" to "precision - 1". The exponent need to be
   // adjusted by "2*precision - 1" - "precision - 1" = "precision".
-  exponent -= precision;
+  exponent -= precision + 1;
 
   // In case MSB resides at the left-hand side of radix point, shift the
   // mantissa right by some amount to make sure the MSB reside right before
@@ -1801,7 +1812,7 @@
      extended-precision calculation.  */
   if (isFiniteNonZero() &&
       multiplicand.isFiniteNonZero() &&
-      addend.isFiniteNonZero()) {
+      addend.isFinite()) {
     lostFraction lost_fraction;
 
     lost_fraction = multiplySignificand(multiplicand, &addend);
@@ -3377,7 +3388,9 @@
   // internal consistency.
   const unsigned NumUnusedHighBits =
     PartCount*integerPartWidth - semantics->precision;
-  significand[PartCount - 1] = ~integerPart(0) >> NumUnusedHighBits;
+  significand[PartCount - 1] = (NumUnusedHighBits < integerPartWidth)
+                                   ? (~integerPart(0) >> NumUnusedHighBits)
+                                   : 0;
 }
 
 /// Make this number the smallest magnitude denormal number in the given
@@ -3904,3 +3917,20 @@
   exponent = semantics->minExponent-1;
   APInt::tcSet(significandParts(), 0, partCount());  
 }
+
+/// Scale X by 2^Exp by adjusting only its exponent field. Results outside the
+/// exponent range saturate to infinity or zero carrying the sign of X.
+APFloat llvm::scalbn(APFloat X, int Exp) {
+  // Infinity, zero and NaN are returned as-is.
+  if (X.isInfinity() || X.isZero() || X.isNaN())
+    return X;
+
+  const fltSemantics &Sem = X.getSemantics();
+  if (Exp > (Sem.maxExponent - X.exponent))
+    return APFloat::getInf(Sem, X.isNegative());
+  if (Exp < (Sem.minExponent - X.exponent))
+    return APFloat::getZero(Sem, X.isNegative());
+
+  X.exponent += Exp;
+  return X;
+}

diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index fa929eb..c20eeb2 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp

@@ -454,8 +454,10 @@
   for (unsigned i = 0; i < numWords; ++i)
     val[i] = pVal[i] ^ RHS.pVal[i];
 
+  APInt Result(val, getBitWidth());
   // 0^0==1 so clear the high bits in case they got set.
-  return APInt(val, getBitWidth()).clearUnusedBits();
+  Result.clearUnusedBits();
+  return Result;
 }
 
 APInt APInt::operator*(const APInt& RHS) const {
@@ -473,7 +475,8 @@
     return APInt(BitWidth, VAL + RHS.VAL);
   APInt Result(BitWidth, 0);
   add(Result.pVal, this->pVal, RHS.pVal, getNumWords());
-  return Result.clearUnusedBits();
+  Result.clearUnusedBits();
+  return Result;
 }
 
 APInt APInt::operator-(const APInt& RHS) const {
@@ -482,7 +485,8 @@
     return APInt(BitWidth, VAL - RHS.VAL);
   APInt Result(BitWidth, 0);
   sub(Result.pVal, this->pVal, RHS.pVal, getNumWords());
-  return Result.clearUnusedBits();
+  Result.clearUnusedBits();
+  return Result;
 }
 
 bool APInt::EqualSlowCase(const APInt& RHS) const {
@@ -1114,7 +1118,9 @@
   uint64_t fillValue = (isNegative() ? -1ULL : 0);
   for (unsigned i = breakWord+1; i < getNumWords(); ++i)
     val[i] = fillValue;
-  return APInt(val, BitWidth).clearUnusedBits();
+  APInt Result(val, BitWidth);
+  Result.clearUnusedBits();
+  return Result;
 }
 
 /// Logical right-shift this APInt by shiftAmt.
@@ -1151,7 +1157,9 @@
   // If we are shifting less than a word, compute the shift with a simple carry
   if (shiftAmt < APINT_BITS_PER_WORD) {
     lshrNear(val, pVal, getNumWords(), shiftAmt);
-    return APInt(val, BitWidth).clearUnusedBits();
+    APInt Result(val, BitWidth);
+    Result.clearUnusedBits();
+    return Result;
   }
 
   // Compute some values needed by the remaining shift algorithms
@@ -1164,7 +1172,9 @@
       val[i] = pVal[i+offset];
     for (unsigned i = getNumWords()-offset; i < getNumWords(); i++)
       val[i] = 0;
-    return APInt(val,BitWidth).clearUnusedBits();
+    APInt Result(val, BitWidth);
+    Result.clearUnusedBits();
+    return Result;
   }
 
   // Shift the low order words
@@ -1178,7 +1188,9 @@
   // Remaining words are 0
   for (unsigned i = breakWord+1; i < getNumWords(); ++i)
     val[i] = 0;
-  return APInt(val, BitWidth).clearUnusedBits();
+  APInt Result(val, BitWidth);
+  Result.clearUnusedBits();
+  return Result;
 }
 
 /// Left-shift this APInt by shiftAmt.
@@ -1211,7 +1223,9 @@
       val[i] = pVal[i] << shiftAmt | carry;
       carry = pVal[i] >> (APINT_BITS_PER_WORD - shiftAmt);
     }
-    return APInt(val, BitWidth).clearUnusedBits();
+    APInt Result(val, BitWidth);
+    Result.clearUnusedBits();
+    return Result;
   }
 
   // Compute some values needed by the remaining shift algorithms
@@ -1224,7 +1238,9 @@
       val[i] = 0;
     for (unsigned i = offset; i < getNumWords(); i++)
       val[i] = pVal[i-offset];
-    return APInt(val,BitWidth).clearUnusedBits();
+    APInt Result(val, BitWidth);
+    Result.clearUnusedBits();
+    return Result;
   }
 
   // Copy whole words from this to Result.
@@ -1235,7 +1251,9 @@
   val[offset] = pVal[0] << wordShift;
   for (i = 0; i < offset; ++i)
     val[i] = 0;
-  return APInt(val, BitWidth).clearUnusedBits();
+  APInt Result(val, BitWidth);
+  Result.clearUnusedBits();
+  return Result;
 }
 
 APInt APInt::rotl(const APInt &rotateAmt) const {
@@ -1303,7 +1321,7 @@
 
   // Okay, all the short cuts are exhausted. We must compute it. The following
   // is a classical Babylonian method for computing the square root. This code
-  // was adapted to APINt from a wikipedia article on such computations.
+  // was adapted to APInt from a wikipedia article on such computations.
   // See http://www.wikipedia.org/ and go to the page named
   // Calculate_an_integer_square_root.
   unsigned nbits = BitWidth, i = 4;
@@ -2046,19 +2064,29 @@
   return Res;
 }
 
-APInt APInt::sshl_ov(unsigned ShAmt, bool &Overflow) const {
-  Overflow = ShAmt >= getBitWidth();
+APInt APInt::sshl_ov(const APInt &ShAmt, bool &Overflow) const {
+  Overflow = ShAmt.uge(getBitWidth());
   if (Overflow)
-    ShAmt = getBitWidth()-1;
+    return APInt(BitWidth, 0);
 
   if (isNonNegative()) // Don't allow sign change.
-    Overflow = ShAmt >= countLeadingZeros();
+    Overflow = ShAmt.uge(countLeadingZeros());
   else
-    Overflow = ShAmt >= countLeadingOnes();
+    Overflow = ShAmt.uge(countLeadingOnes());
   
   return *this << ShAmt;
 }
 
+/// Unsigned left shift with overflow detection. Overflow is set when ShAmt is
+/// at least the bit width (result is 0) or exceeds the leading-zero count.
+APInt APInt::ushl_ov(const APInt &ShAmt, bool &Overflow) const {
+  if (ShAmt.uge(getBitWidth())) {
+    Overflow = true;
+    return APInt(BitWidth, 0);
+  }
+  Overflow = ShAmt.ugt(countLeadingZeros());
+  return *this << ShAmt;
+}
 
 
 
@@ -2270,8 +2298,7 @@
 
 // Assumed by lowHalf, highHalf, partMSB and partLSB.  A fairly safe
 // and unrestricting assumption.
-#define COMPILE_TIME_ASSERT(cond) extern int CTAssert[(cond) ? 1 : -1]
-COMPILE_TIME_ASSERT(integerPartWidth % 2 == 0);
+static_assert(integerPartWidth % 2 == 0, "Part width must be divisible by 2!");
 
 /* Some handy functions local to this file.  */
 namespace {

diff --git a/lib/Support/Android.mk b/lib/Support/Android.mk
index 7968697..34448a7 100644
--- a/lib/Support/Android.mk
+++ b/lib/Support/Android.mk

@@ -22,7 +22,6 @@
   DataExtractor.cpp \
   Debug.cpp \
   DeltaAlgorithm.cpp \
-  Disassembler.cpp \
   Dwarf.cpp \
   DynamicLibrary.cpp \
   Errno.cpp \
@@ -44,10 +43,12 @@
   LockFileManager.cpp \
   MD5.cpp \
   ManagedStatic.cpp \
+  MathExtras.cpp \
   Memory.cpp \
   MemoryBuffer.cpp \
   MemoryObject.cpp \
   Mutex.cpp \
+  Options.cpp \
   Path.cpp \
   PluginLoader.cpp \
   PrettyStackTrace.cpp \
@@ -61,15 +62,14 @@
   Signals.cpp \
   SmallPtrSet.cpp \
   SmallVector.cpp \
+  StreamingMemoryObject.cpp \
   SourceMgr.cpp \
   SpecialCaseList.cpp \
   Statistic.cpp \
-  StreamableMemoryObject.cpp \
   StringExtras.cpp \
   StringMap.cpp \
   StringPool.cpp \
   StringRef.cpp \
-  StringRefMemoryObject.cpp \
   SystemUtils.cpp \
   TargetRegistry.cpp \
   Threading.cpp \

diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 9ecd559..fa62591 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt

@@ -1,3 +1,32 @@
+# Collect the system libraries for LLVMSupport; the list stays empty on MSVC.
+set(system_libs)
+if( NOT MSVC )
+  if( MINGW )
+    list(APPEND system_libs imagehlp psapi shell32)
+  elseif( CMAKE_HOST_UNIX )
+    if( HAVE_LIBRT )
+      list(APPEND system_libs rt)
+    endif()
+    if( HAVE_LIBDL )
+      list(APPEND system_libs ${CMAKE_DL_LIBS})
+    endif()
+    # terminfo is linked only when it is both requested and detected.
+    if( LLVM_ENABLE_TERMINFO AND HAVE_TERMINFO )
+      list(APPEND system_libs ${TERMINFO_LIBS})
+    endif()
+    if( LLVM_ENABLE_THREADS AND HAVE_LIBATOMIC )
+      list(APPEND system_libs atomic)
+    endif()
+    if( LLVM_ENABLE_THREADS AND HAVE_LIBPTHREAD )
+      list(APPEND system_libs pthread)
+    endif()
+    if( LLVM_ENABLE_ZLIB AND HAVE_LIBZ )
+      list(APPEND system_libs z)
+    endif()
+    list(APPEND system_libs m)
+  endif()
+endif()
+
 add_llvm_library(LLVMSupport
   APFloat.cpp
   APInt.cpp
@@ -36,9 +65,11 @@
   Locale.cpp
   LockFileManager.cpp
   ManagedStatic.cpp
+  MathExtras.cpp
   MemoryBuffer.cpp
   MemoryObject.cpp
   MD5.cpp
+  Options.cpp
   PluginLoader.cpp
   PrettyStackTrace.cpp
   RandomNumberGenerator.cpp
@@ -49,12 +80,11 @@
   SourceMgr.cpp
   SpecialCaseList.cpp
   Statistic.cpp
-  StreamableMemoryObject.cpp
+  StreamingMemoryObject.cpp
   StringExtras.cpp
   StringMap.cpp
   StringPool.cpp
   StringRef.cpp
-  StringRefMemoryObject.cpp
   SystemUtils.cpp
   Timer.cpp
   ToolOutputFile.cpp
@@ -73,11 +103,9 @@
 
 # System
   Atomic.cpp
-  Disassembler.cpp
   DynamicLibrary.cpp
   Errno.cpp
   Host.cpp
-  IncludeFile.cpp
   Memory.cpp
   Mutex.cpp
   Path.cpp
@@ -117,38 +145,8 @@
   Windows/ThreadLocal.inc
   Windows/TimeValue.inc
   Windows/Watchdog.inc
+
+  LINK_LIBS ${system_libs}
   )
-set(system_libs)
-if( NOT MSVC )
-  if( MINGW )
-    set(system_libs ${system_libs} imagehlp psapi shell32)
-  elseif( CMAKE_HOST_UNIX )
-    if( HAVE_LIBRT )
-      set(system_libs ${system_libs} rt)
-    endif()
-    if( HAVE_LIBDL )
-      set(system_libs ${system_libs} ${CMAKE_DL_LIBS})
-    endif()
-    if(LLVM_ENABLE_TERMINFO)
-      if(HAVE_TERMINFO)
-        set(system_libs ${system_libs} ${TERMINFO_LIBS})
-      endif()
-    endif()
-    if( LLVM_ENABLE_THREADS AND HAVE_LIBPTHREAD )
-      set(system_libs ${system_libs} pthread)
-    endif()
-    if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ )
-      set(system_libs ${system_libs} z)
-    endif()
-  endif( MINGW )
-endif( NOT MSVC )
-
-
-if(POLICY CMP0022 AND BUILD_SHARED_LIBS)
-  # FIXME: Should this be really PUBLIC?
-  target_link_libraries(LLVMSupport PUBLIC ${system_libs})
-else()
-  target_link_libraries(LLVMSupport ${cmake_2_8_12_INTERFACE} ${system_libs})
-endif()
 
 set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${system_libs}")

diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 87348f7..985c877 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp

@@ -17,6 +17,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/CommandLine.h"
+#include "llvm-c/Support.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
@@ -113,9 +114,15 @@
 }
 
 void Option::removeArgument() {
-  assert(NextRegistered && "argument never registered");
-  assert(RegisteredOptionList == this && "argument is not the last registered");
-  RegisteredOptionList = NextRegistered;
+  // Unlink this option from the singly linked registration list. Walking a
+  // pointer to the incoming link treats the head and interior nodes alike.
+  Option **Link = &RegisteredOptionList;
+  while (*Link && *Link != this)
+    Link = &(*Link)->NextRegistered;
+  // Falling off the end means this option was never registered.
+  assert(*Link && "argument never registered");
+  if (*Link)
+    *Link = NextRegistered;
   MarkOptionsChanged();
 }
 
@@ -158,7 +165,7 @@
     // Handle named options.
     for (size_t i = 0, e = OptionNames.size(); i != e; ++i) {
       // Add argument to the argument map!
-      if (OptionsMap.GetOrCreateValue(OptionNames[i], O).second != O) {
+      if (!OptionsMap.insert(std::make_pair(OptionNames[i], O)).second) {
         errs() << ProgramName << ": CommandLine Error: Option '"
                << OptionNames[i] << "' registered more than once!\n";
         HadErrors = true;
@@ -474,13 +481,18 @@
 }
 
 void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver,
-                                SmallVectorImpl<const char *> &NewArgv) {
+                                SmallVectorImpl<const char *> &NewArgv,
+                                bool MarkEOLs) {
   SmallString<128> Token;
   for (size_t I = 0, E = Src.size(); I != E; ++I) {
     // Consume runs of whitespace.
     if (Token.empty()) {
-      while (I != E && isWhitespace(Src[I]))
+      while (I != E && isWhitespace(Src[I])) {
+        // Mark the end of lines in response files
+        if (MarkEOLs && Src[I] == '\n')
+          NewArgv.push_back(nullptr);
         ++I;
+      }
       if (I == E) break;
     }
 
@@ -521,6 +533,9 @@
   // Append the last token after hitting EOF with no whitespace.
   if (!Token.empty())
     NewArgv.push_back(Saver.SaveString(Token.c_str()));
+  // Mark the end of response files
+  if (MarkEOLs)
+    NewArgv.push_back(nullptr);
 }
 
 /// Backslashes are interpreted in a rather complicated way in the Windows-style
@@ -562,7 +577,8 @@
 }
 
 void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
-                                    SmallVectorImpl<const char *> &NewArgv) {
+                                    SmallVectorImpl<const char *> &NewArgv,
+                                    bool MarkEOLs) {
   SmallString<128> Token;
 
   // This is a small state machine to consume characters until it reaches the
@@ -572,8 +588,12 @@
     // INIT state indicates that the current input index is at the start of
     // the string or between tokens.
     if (State == INIT) {
-      if (isWhitespace(Src[I]))
+      if (isWhitespace(Src[I])) {
+        // Mark the end of lines in response files
+        if (MarkEOLs && Src[I] == '\n')
+          NewArgv.push_back(nullptr);
         continue;
+      }
       if (Src[I] == '"') {
         State = QUOTED;
         continue;
@@ -596,6 +616,9 @@
         NewArgv.push_back(Saver.SaveString(Token.c_str()));
         Token.clear();
         State = INIT;
+        // Mark the end of lines in response files
+        if (MarkEOLs && Src[I] == '\n')
+          NewArgv.push_back(nullptr);
         continue;
       }
       if (Src[I] == '"') {
@@ -626,20 +649,24 @@
   // Append the last token after hitting EOF with no whitespace.
   if (!Token.empty())
     NewArgv.push_back(Saver.SaveString(Token.c_str()));
+  // Mark the end of response files
+  if (MarkEOLs)
+    NewArgv.push_back(nullptr);
 }
 
 static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
                                TokenizerCallback Tokenizer,
-                               SmallVectorImpl<const char *> &NewArgv) {
+                               SmallVectorImpl<const char *> &NewArgv,
+                               bool MarkEOLs = false) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> MemBufOrErr =
       MemoryBuffer::getFile(FName);
   if (!MemBufOrErr)
     return false;
-  std::unique_ptr<MemoryBuffer> MemBuf = std::move(MemBufOrErr.get());
-  StringRef Str(MemBuf->getBufferStart(), MemBuf->getBufferSize());
+  MemoryBuffer &MemBuf = *MemBufOrErr.get();
+  StringRef Str(MemBuf.getBufferStart(), MemBuf.getBufferSize());
 
   // If we have a UTF-16 byte order mark, convert to UTF-8 for parsing.
-  ArrayRef<char> BufRef(MemBuf->getBufferStart(), MemBuf->getBufferEnd());
+  ArrayRef<char> BufRef(MemBuf.getBufferStart(), MemBuf.getBufferEnd());
   std::string UTF8Buf;
   if (hasUTF16ByteOrderMark(BufRef)) {
     if (!convertUTF16ToUTF8String(BufRef, UTF8Buf))
@@ -648,7 +675,7 @@
   }
 
   // Tokenize the contents into NewArgv.
-  Tokenizer(Str, Saver, NewArgv);
+  Tokenizer(Str, Saver, NewArgv, MarkEOLs);
 
   return true;
 }
@@ -656,13 +683,19 @@
 /// \brief Expand response files on a command line recursively using the given
 /// StringSaver and tokenization strategy.
 bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
-                             SmallVectorImpl<const char *> &Argv) {
+                             SmallVectorImpl<const char *> &Argv,
+                             bool MarkEOLs) {
   unsigned RspFiles = 0;
   bool AllExpanded = true;
 
   // Don't cache Argv.size() because it can change.
   for (unsigned I = 0; I != Argv.size(); ) {
     const char *Arg = Argv[I];
+    // Check if it is an EOL marker
+    if (Arg == nullptr) {
+      ++I;
+      continue;
+    }
     if (Arg[0] != '@') {
       ++I;
       continue;
@@ -678,7 +711,8 @@
     // FIXME: If a nested response file uses a relative path, is it relative to
     // the cwd of the process or the response file?
     SmallVector<const char *, 0> ExpandedArgv;
-    if (!ExpandResponseFile(Arg + 1, Saver, Tokenizer, ExpandedArgv)) {
+    if (!ExpandResponseFile(Arg + 1, Saver, Tokenizer, ExpandedArgv,
+                            MarkEOLs)) {
       // We couldn't read this file, so we leave it in the argument stream and
       // move on.
       AllExpanded = false;
@@ -1018,13 +1052,12 @@
   }
 
   // Loop over args and make sure all required args are specified!
-  for (StringMap<Option*>::iterator I = Opts.begin(),
-         E = Opts.end(); I != E; ++I) {
-    switch (I->second->getNumOccurrencesFlag()) {
+  for (const auto &Opt : Opts) {
+    switch (Opt.second->getNumOccurrencesFlag()) {
     case Required:
     case OneOrMore:
-      if (I->second->getNumOccurrences() == 0) {
-        I->second->error("must be specified at least once!");
+      if (Opt.second->getNumOccurrences() == 0) {
+        Opt.second->error("must be specified at least once!");
         ErrorParsing = true;
       }
       // Fall through
@@ -1422,7 +1455,7 @@
       continue;
 
     // If we've already seen this option, don't add it to the list again.
-    if (!OptionSet.insert(I->second))
+    if (!OptionSet.insert(I->second).second)
       continue;
 
     Opts.push_back(std::pair<const char *, Option*>(I->getKey().data(),
@@ -1807,3 +1840,8 @@
   GetOptionInfo(PositionalOpts, SinkOpts, Map);
   return;
 }
+
+void LLVMParseCommandLineOptions(int argc, const char *const *argv,
+                                 const char *Overview) {
+  llvm::cl::ParseCommandLineOptions(argc, argv, Overview);
+}

diff --git a/lib/Support/DataStream.cpp b/lib/Support/DataStream.cpp
index 32653de..dbf6465 100644
--- a/lib/Support/DataStream.cpp
+++ b/lib/Support/DataStream.cpp

@@ -32,12 +32,12 @@
 #define DEBUG_TYPE "Data-stream"
 
 // Interface goals:
-// * StreamableMemoryObject doesn't care about complexities like using
+// * StreamingMemoryObject doesn't care about complexities like using
 //   threads/async callbacks to actually overlap download+compile
 // * Don't want to duplicate Data in memory
 // * Don't need to know total Data len in advance
 // Non-goals:
-// StreamableMemoryObject already has random access so this interface only does
+// StreamingMemoryObject already has random access so this interface only does
 // in-order streaming (no arbitrary seeking, else we'd have to buffer all the
 // Data here in addition to MemoryObject).  This also means that if we want
 // to be able to to free Data, BitstreamBytes/BitcodeReader will implement it

diff --git a/lib/Support/Debug.cpp b/lib/Support/Debug.cpp
index ad4d4ef..8246542 100644
--- a/lib/Support/Debug.cpp
+++ b/lib/Support/Debug.cpp

@@ -27,6 +27,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/circular_raw_ostream.h"
+#include "llvm/Support/ManagedStatic.h"
 
 using namespace llvm;
 
@@ -50,14 +51,14 @@
                 cl::Hidden,
                 cl::init(0));
 
-static std::string CurrentDebugType;
+static ManagedStatic<std::string> CurrentDebugType;
 
 namespace {
 
 struct DebugOnlyOpt {
   void operator=(const std::string &Val) const {
     DebugFlag |= !Val.empty();
-    CurrentDebugType = Val;
+    *CurrentDebugType = Val;
   }
 };
 
@@ -86,7 +87,7 @@
 // with the -debug-only=X option.
 //
 bool llvm::isCurrentDebugType(const char *DebugType) {
-  return CurrentDebugType.empty() || DebugType == CurrentDebugType;
+  return CurrentDebugType->empty() || DebugType == *CurrentDebugType;
 }
 
 /// setCurrentDebugType - Set the current debug type, as if the -debug-only=X
@@ -94,7 +95,7 @@
 /// debug output to be produced.
 ///
 void llvm::setCurrentDebugType(const char *Type) {
-  CurrentDebugType = Type;
+  *CurrentDebugType = Type;
 }
 
 /// dbgs - Return a circular-buffered debug stream.

diff --git a/lib/Support/Disassembler.cpp b/lib/Support/Disassembler.cpp
deleted file mode 100644
index 27df3a9..0000000
--- a/lib/Support/Disassembler.cpp
+++ /dev/null

@@ -1,74 +0,0 @@
-//===- lib/Support/Disassembler.cpp -----------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the necessary glue to call external disassembler
-// libraries.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/Disassembler.h"
-#include "llvm/Config/config.h"
-#include <cassert>
-#include <iomanip>
-#include <sstream>
-#include <string>
-
-#if USE_UDIS86
-#include <udis86.h>
-#endif
-
-using namespace llvm;
-
-bool llvm::sys::hasDisassembler()
-{
-#if defined (__i386__) || defined (__amd64__) || defined (__x86_64__)
-  // We have option to enable udis86 library.
-# if USE_UDIS86
-  return true;
-#else
-  return false;
-#endif
-#else
-  return false;
-#endif
-}
-
-std::string llvm::sys::disassembleBuffer(uint8_t* start, size_t length,
-                                         uint64_t pc) {
-#if (defined (__i386__) || defined (__amd64__) || defined (__x86_64__)) \
-  && USE_UDIS86
-  std::stringstream res;
-
-  unsigned bits;
-# if defined(__i386__)
-  bits = 32;
-# else
-  bits = 64;
-# endif
-
-  ud_t ud_obj;
-
-  ud_init(&ud_obj);
-  ud_set_input_buffer(&ud_obj, start, length);
-  ud_set_mode(&ud_obj, bits);
-  ud_set_pc(&ud_obj, pc);
-  ud_set_syntax(&ud_obj, UD_SYN_ATT);
-
-  res << std::setbase(16)
-      << std::setw(bits/4);
-
-  while (ud_disassemble(&ud_obj)) {
-    res << ud_insn_off(&ud_obj) << ":\t" << ud_insn_asm(&ud_obj) << "\n";
-  }
-
-  return res.str();
-#else
-  return "No disassembler available. See configure help for options.\n";
-#endif
-}

diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index c9efa61..4b6337e 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp

@@ -17,8 +17,6 @@
 using namespace llvm;
 using namespace dwarf;
 
-/// TagString - Return the string for the specified tag.
-///
 const char *llvm::dwarf::TagString(unsigned Tag) {
   switch (Tag) {
   case DW_TAG_array_type:                return "DW_TAG_array_type";
@@ -82,6 +80,7 @@
   case DW_TAG_hi_user:                   return "DW_TAG_hi_user";
   case DW_TAG_auto_variable:             return "DW_TAG_auto_variable";
   case DW_TAG_arg_variable:              return "DW_TAG_arg_variable";
+  case DW_TAG_expression:                return "DW_TAG_expression";
   case DW_TAG_rvalue_reference_type:     return "DW_TAG_rvalue_reference_type";
   case DW_TAG_template_alias:            return "DW_TAG_template_alias";
   case DW_TAG_coarray_type:              return "DW_TAG_coarray_type";
@@ -103,8 +102,6 @@
   return nullptr;
 }
 
-/// ChildrenString - Return the string for the specified children flag.
-///
 const char *llvm::dwarf::ChildrenString(unsigned Children) {
   switch (Children) {
   case DW_CHILDREN_no:                   return "DW_CHILDREN_no";
@@ -113,8 +110,6 @@
   return nullptr;
 }
 
-/// AttributeString - Return the string for the specified attribute.
-///
 const char *llvm::dwarf::AttributeString(unsigned Attribute) {
   switch (Attribute) {
   case DW_AT_sibling:                    return "DW_AT_sibling";
@@ -274,8 +269,6 @@
   return nullptr;
 }
 
-/// FormEncodingString - Return the string for the specified form encoding.
-///
 const char *llvm::dwarf::FormEncodingString(unsigned Encoding) {
   switch (Encoding) {
   case DW_FORM_addr:                     return "DW_FORM_addr";
@@ -311,8 +304,6 @@
   return nullptr;
 }
 
-/// OperationEncodingString - Return the string for the specified operation
-/// encoding.
 const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) {
   switch (Encoding) {
   case DW_OP_addr:                       return "DW_OP_addr";
@@ -480,8 +471,6 @@
   return nullptr;
 }
 
-/// AttributeEncodingString - Return the string for the specified attribute
-/// encoding.
 const char *llvm::dwarf::AttributeEncodingString(unsigned Encoding) {
   switch (Encoding) {
   case DW_ATE_address:                   return "DW_ATE_address";
@@ -506,8 +495,6 @@
   return nullptr;
 }
 
-/// DecimalSignString - Return the string for the specified decimal sign
-/// attribute.
 const char *llvm::dwarf::DecimalSignString(unsigned Sign) {
   switch (Sign) {
   case DW_DS_unsigned:                   return "DW_DS_unsigned";
@@ -519,8 +506,6 @@
   return nullptr;
 }
 
-/// EndianityString - Return the string for the specified endianity.
-///
 const char *llvm::dwarf::EndianityString(unsigned Endian) {
   switch (Endian) {
   case DW_END_default:                   return "DW_END_default";
@@ -532,8 +517,6 @@
   return nullptr;
 }
 
-/// AccessibilityString - Return the string for the specified accessibility.
-///
 const char *llvm::dwarf::AccessibilityString(unsigned Access) {
   switch (Access) {
   // Accessibility codes
@@ -544,8 +527,6 @@
   return nullptr;
 }
 
-/// VisibilityString - Return the string for the specified visibility.
-///
 const char *llvm::dwarf::VisibilityString(unsigned Visibility) {
   switch (Visibility) {
   case DW_VIS_local:                     return "DW_VIS_local";
@@ -555,8 +536,6 @@
   return nullptr;
 }
 
-/// VirtualityString - Return the string for the specified virtuality.
-///
 const char *llvm::dwarf::VirtualityString(unsigned Virtuality) {
   switch (Virtuality) {
   case DW_VIRTUALITY_none:               return "DW_VIRTUALITY_none";
@@ -566,8 +545,6 @@
   return nullptr;
 }
 
-/// LanguageString - Return the string for the specified language.
-///
 const char *llvm::dwarf::LanguageString(unsigned Language) {
   switch (Language) {
   case DW_LANG_C89:                      return "DW_LANG_C89";
@@ -598,13 +575,12 @@
   case DW_LANG_C_plus_plus_11:           return "DW_LANG_C_plus_plus_11";
   case DW_LANG_OCaml:                    return "DW_LANG_OCaml";
   case DW_LANG_lo_user:                  return "DW_LANG_lo_user";
+  case DW_LANG_Mips_Assembler:           return "DW_LANG_Mips_Assembler";
   case DW_LANG_hi_user:                  return "DW_LANG_hi_user";
   }
   return nullptr;
 }
 
-/// CaseString - Return the string for the specified identifier case.
-///
 const char *llvm::dwarf::CaseString(unsigned Case) {
   switch (Case) {
   case DW_ID_case_sensitive:             return "DW_ID_case_sensitive";
@@ -615,8 +591,6 @@
   return nullptr;
 }
 
-/// ConventionString - Return the string for the specified calling convention.
-///
 const char *llvm::dwarf::ConventionString(unsigned Convention) {
    switch (Convention) {
    case DW_CC_normal:                     return "DW_CC_normal";
@@ -628,8 +602,6 @@
   return nullptr;
 }
 
-/// InlineCodeString - Return the string for the specified inline code.
-///
 const char *llvm::dwarf::InlineCodeString(unsigned Code) {
   switch (Code) {
   case DW_INL_not_inlined:               return "DW_INL_not_inlined";
@@ -640,8 +612,6 @@
   return nullptr;
 }
 
-/// ArrayOrderString - Return the string for the specified array order.
-///
 const char *llvm::dwarf::ArrayOrderString(unsigned Order) {
   switch (Order) {
   case DW_ORD_row_major:                 return "DW_ORD_row_major";
@@ -650,8 +620,6 @@
   return nullptr;
 }
 
-/// DiscriminantString - Return the string for the specified discriminant
-/// descriptor.
 const char *llvm::dwarf::DiscriminantString(unsigned Discriminant) {
   switch (Discriminant) {
   case DW_DSC_label:                     return "DW_DSC_label";
@@ -660,8 +628,6 @@
   return nullptr;
 }
 
-/// LNStandardString - Return the string for the specified line number standard.
-///
 const char *llvm::dwarf::LNStandardString(unsigned Standard) {
   switch (Standard) {
   case DW_LNS_copy:                      return "DW_LNS_copy";
@@ -680,8 +646,6 @@
   return nullptr;
 }
 
-/// LNExtendedString - Return the string for the specified line number extended
-/// opcode encodings.
 const char *llvm::dwarf::LNExtendedString(unsigned Encoding) {
   switch (Encoding) {
   // Line Number Extended Opcode Encodings
@@ -695,8 +659,6 @@
   return nullptr;
 }
 
-/// MacinfoString - Return the string for the specified macinfo type encodings.
-///
 const char *llvm::dwarf::MacinfoString(unsigned Encoding) {
   switch (Encoding) {
   // Macinfo Type Encodings
@@ -709,8 +671,6 @@
   return nullptr;
 }
 
-/// CallFrameString - Return the string for the specified call frame instruction
-/// encodings.
 const char *llvm::dwarf::CallFrameString(unsigned Encoding) {
   switch (Encoding) {
   case DW_CFA_nop:                       return "DW_CFA_nop";
@@ -748,6 +708,36 @@
   return nullptr;
 }
 
+const char *llvm::dwarf::ApplePropertyString(unsigned Prop) {
+  switch (Prop) {
+  case DW_APPLE_PROPERTY_readonly:
+    return "DW_APPLE_PROPERTY_readonly";
+  case DW_APPLE_PROPERTY_getter:
+    return "DW_APPLE_PROPERTY_getter";
+  case DW_APPLE_PROPERTY_assign:
+    return "DW_APPLE_PROPERTY_assign";
+  case DW_APPLE_PROPERTY_readwrite:
+    return "DW_APPLE_PROPERTY_readwrite";
+  case DW_APPLE_PROPERTY_retain:
+    return "DW_APPLE_PROPERTY_retain";
+  case DW_APPLE_PROPERTY_copy:
+    return "DW_APPLE_PROPERTY_copy";
+  case DW_APPLE_PROPERTY_nonatomic:
+    return "DW_APPLE_PROPERTY_nonatomic";
+  case DW_APPLE_PROPERTY_setter:
+    return "DW_APPLE_PROPERTY_setter";
+  case DW_APPLE_PROPERTY_atomic:
+    return "DW_APPLE_PROPERTY_atomic";
+  case DW_APPLE_PROPERTY_weak:
+    return "DW_APPLE_PROPERTY_weak";
+  case DW_APPLE_PROPERTY_strong:
+    return "DW_APPLE_PROPERTY_strong";
+  case DW_APPLE_PROPERTY_unsafe_unretained:
+    return "DW_APPLE_PROPERTY_unsafe_unretained";
+  }
+  return nullptr;
+}
+
 const char *llvm::dwarf::AtomTypeString(unsigned AT) {
   switch (AT) {
   case dwarf::DW_ATOM_null:
@@ -795,3 +785,34 @@
   }
   llvm_unreachable("Unknown GDBIndexEntryLinkage value");
 }
+
+const char *llvm::dwarf::AttributeValueString(uint16_t Attr, unsigned Val) {
+  switch (Attr) {
+  case DW_AT_accessibility:
+    return AccessibilityString(Val);
+  case DW_AT_virtuality:
+    return VirtualityString(Val);
+  case DW_AT_language:
+    return LanguageString(Val);
+  case DW_AT_encoding:
+    return AttributeEncodingString(Val);
+  case DW_AT_decimal_sign:
+    return DecimalSignString(Val);
+  case DW_AT_endianity:
+    return EndianityString(Val);
+  case DW_AT_visibility:
+    return VisibilityString(Val);
+  case DW_AT_identifier_case:
+    return CaseString(Val);
+  case DW_AT_calling_convention:
+    return ConventionString(Val);
+  case DW_AT_inline:
+    return InlineCodeString(Val);
+  case DW_AT_ordering:
+    return ArrayOrderString(Val);
+  case DW_AT_discr_value:
+    return DiscriminantString(Val);
+  }
+
+  return nullptr;
+}

diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp
index c36007f..8e65066 100644
--- a/lib/Support/ErrorHandling.cpp
+++ b/lib/Support/ErrorHandling.cpp

@@ -20,6 +20,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Signals.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/MutexGuard.h"
 #include "llvm/Support/Threading.h"
@@ -41,18 +42,18 @@
 static fatal_error_handler_t ErrorHandler = nullptr;
 static void *ErrorHandlerUserData = nullptr;
 
-static sys::Mutex ErrorHandlerMutex;
+static ManagedStatic<sys::Mutex> ErrorHandlerMutex;
 
 void llvm::install_fatal_error_handler(fatal_error_handler_t handler,
                                        void *user_data) {
-  llvm::MutexGuard Lock(ErrorHandlerMutex);
+  llvm::MutexGuard Lock(*ErrorHandlerMutex);
   assert(!ErrorHandler && "Error handler already registered!\n");
   ErrorHandler = handler;
   ErrorHandlerUserData = user_data;
 }
 
 void llvm::remove_fatal_error_handler() {
-  llvm::MutexGuard Lock(ErrorHandlerMutex);
+  llvm::MutexGuard Lock(*ErrorHandlerMutex);
   ErrorHandler = nullptr;
   ErrorHandlerUserData = nullptr;
 }
@@ -75,7 +76,7 @@
   {
     // Only acquire the mutex while reading the handler, so as not to invoke a
     // user-supplied callback under a lock.
-    llvm::MutexGuard Lock(ErrorHandlerMutex);
+    llvm::MutexGuard Lock(*ErrorHandlerMutex);
     handler = ErrorHandler;
     handlerData = ErrorHandlerUserData;
   }

diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp
index 2e740ca..c62655d 100644
--- a/lib/Support/FileOutputBuffer.cpp
+++ b/lib/Support/FileOutputBuffer.cpp

@@ -14,18 +14,16 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <system_error>
 
 using llvm::sys::fs::mapped_file_region;
 
 namespace llvm {
-FileOutputBuffer::FileOutputBuffer(mapped_file_region * R,
+FileOutputBuffer::FileOutputBuffer(std::unique_ptr<mapped_file_region> R,
                                    StringRef Path, StringRef TmpPath)
-  : Region(R)
-  , FinalPath(Path)
-  , TempPath(TmpPath) {
-}
+    : Region(std::move(R)), FinalPath(Path), TempPath(TmpPath) {}
 
 FileOutputBuffer::~FileOutputBuffer() {
   sys::fs::remove(Twine(TempPath));
@@ -73,21 +71,20 @@
   if (EC)
     return EC;
 
-  std::unique_ptr<mapped_file_region> MappedFile(new mapped_file_region(
-      FD, true, mapped_file_region::readwrite, Size, 0, EC));
+  auto MappedFile = llvm::make_unique<mapped_file_region>(
+      FD, true, mapped_file_region::readwrite, Size, 0, EC);
   if (EC)
     return EC;
 
-  Result.reset(new FileOutputBuffer(MappedFile.get(), FilePath, TempFilePath));
-  if (Result)
-    MappedFile.release();
+  Result.reset(
+      new FileOutputBuffer(std::move(MappedFile), FilePath, TempFilePath));
 
   return std::error_code();
 }
 
 std::error_code FileOutputBuffer::commit(int64_t NewSmallerSize) {
   // Unmap buffer, letting OS flush dirty pages to file on disk.
-  Region.reset(nullptr);
+  Region.reset();
 
   // If requested, resize file as part of commit.
   if ( NewSmallerSize != -1 ) {

diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp
index 8a23491..5316f04 100644
--- a/lib/Support/FileUtilities.cpp
+++ b/lib/Support/FileUtilities.cpp

@@ -182,7 +182,7 @@
       *Error = EC.message();
     return 2;
   }
-  std::unique_ptr<MemoryBuffer> F1 = std::move(F1OrErr.get());
+  MemoryBuffer &F1 = *F1OrErr.get();
 
   ErrorOr<std::unique_ptr<MemoryBuffer>> F2OrErr = MemoryBuffer::getFile(NameB);
   if (std::error_code EC = F2OrErr.getError()) {
@@ -190,17 +190,17 @@
       *Error = EC.message();
     return 2;
   }
-  std::unique_ptr<MemoryBuffer> F2 = std::move(F2OrErr.get());
+  MemoryBuffer &F2 = *F2OrErr.get();
 
   // Okay, now that we opened the files, scan them for the first difference.
-  const char *File1Start = F1->getBufferStart();
-  const char *File2Start = F2->getBufferStart();
-  const char *File1End = F1->getBufferEnd();
-  const char *File2End = F2->getBufferEnd();
+  const char *File1Start = F1.getBufferStart();
+  const char *File2Start = F2.getBufferStart();
+  const char *File1End = F1.getBufferEnd();
+  const char *File2End = F2.getBufferEnd();
   const char *F1P = File1Start;
   const char *F2P = File2Start;
-  uint64_t A_size = F1->getBufferSize();
-  uint64_t B_size = F2->getBufferSize();
+  uint64_t A_size = F1.getBufferSize();
+  uint64_t B_size = F2.getBufferSize();
 
   // Are the buffers identical?  Common case: Handle this efficiently.
   if (A_size == B_size &&

diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp
index e68ee43..054df52 100644
--- a/lib/Support/GraphWriter.cpp
+++ b/lib/Support/GraphWriter.cpp

@@ -105,9 +105,10 @@
     SmallVector<StringRef, 8> parts;
     Names.split(parts, "|");
     for (auto Name : parts) {
-      ProgramPath = sys::FindProgramByName(Name);
-      if (!ProgramPath.empty())
+      if (ErrorOr<std::string> P = sys::findProgramByName(Name)) {
+        ProgramPath = *P;
         return true;
+      }
       Log << "  Tried '" << Name << "'\n";
     }
     return false;

diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index e2dd6d5..8782e2e 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp

@@ -759,13 +759,13 @@
 #endif
 
     if (LLVMFeatureStr != "")
-      Features.GetOrCreateValue(LLVMFeatureStr).setValue(true);
+      Features[LLVMFeatureStr] = true;
   }
 
 #if defined(__aarch64__)
   // If we have all crypto bits we can add the feature
   if (crypto == (CAP_AES | CAP_PMULL | CAP_SHA1 | CAP_SHA2))
-    Features.GetOrCreateValue("crypto").setValue(true);
+    Features["crypto"] = true;
 #endif
 
   return true;

diff --git a/lib/Support/IncludeFile.cpp b/lib/Support/IncludeFile.cpp
deleted file mode 100644
index e67acb3..0000000
--- a/lib/Support/IncludeFile.cpp
+++ /dev/null

@@ -1,20 +0,0 @@
-//===- lib/Support/IncludeFile.cpp - Ensure Linking Of Implementation -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the IncludeFile constructor.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/IncludeFile.h"
-
-using namespace llvm;
-
-// This constructor is used to ensure linking of other modules. See the
-// llvm/Support/IncludeFile.h header for details.
-IncludeFile::IncludeFile(const void*) {}

diff --git a/lib/Support/LineIterator.cpp b/lib/Support/LineIterator.cpp
index 947a8fb..5baa1a3 100644
--- a/lib/Support/LineIterator.cpp
+++ b/lib/Support/LineIterator.cpp

@@ -12,16 +12,39 @@
 
 using namespace llvm;
 
-line_iterator::line_iterator(const MemoryBuffer &Buffer, char CommentMarker)
+static bool isAtLineEnd(const char *P) {
+  if (*P == '\n')
+    return true;
+  if (*P == '\r' && *(P + 1) == '\n')
+    return true;
+  return false;
+}
+
+static bool skipIfAtLineEnd(const char *&P) {
+  if (*P == '\n') {
+    ++P;
+    return true;
+  }
+  if (*P == '\r' && *(P + 1) == '\n') {
+    P += 2;
+    return true;
+  }
+  return false;
+}
+
+line_iterator::line_iterator(const MemoryBuffer &Buffer, bool SkipBlanks,
+                             char CommentMarker)
     : Buffer(Buffer.getBufferSize() ? &Buffer : nullptr),
-      CommentMarker(CommentMarker), LineNumber(1),
+      CommentMarker(CommentMarker), SkipBlanks(SkipBlanks), LineNumber(1),
       CurrentLine(Buffer.getBufferSize() ? Buffer.getBufferStart() : nullptr,
                   0) {
   // Ensure that if we are constructed on a non-empty memory buffer that it is
   // a null terminated buffer.
   if (Buffer.getBufferSize()) {
     assert(Buffer.getBufferEnd()[0] == '\0');
-    advance();
+    // Make sure we don't skip a leading newline if we're keeping blanks
+    if (SkipBlanks || !isAtLineEnd(Buffer.getBufferStart()))
+      advance();
   }
 }
 
@@ -29,25 +52,27 @@
   assert(Buffer && "Cannot advance past the end!");
 
   const char *Pos = CurrentLine.end();
-  assert(Pos == Buffer->getBufferStart() || *Pos == '\n' || *Pos == '\0');
+  assert(Pos == Buffer->getBufferStart() || isAtLineEnd(Pos) || *Pos == '\0');
 
-  if (CommentMarker == '\0') {
+  if (skipIfAtLineEnd(Pos))
+    ++LineNumber;
+  if (!SkipBlanks && isAtLineEnd(Pos)) {
+    // Nothing to do for a blank line.
+  } else if (CommentMarker == '\0') {
     // If we're not stripping comments, this is simpler.
-    size_t Blanks = 0;
-    while (Pos[Blanks] == '\n')
-      ++Blanks;
-    Pos += Blanks;
-    LineNumber += Blanks;
+    while (skipIfAtLineEnd(Pos))
+      ++LineNumber;
   } else {
     // Skip comments and count line numbers, which is a bit more complex.
     for (;;) {
+      if (isAtLineEnd(Pos) && !SkipBlanks)
+        break;
       if (*Pos == CommentMarker)
         do {
           ++Pos;
-        } while (*Pos != '\0' && *Pos != '\n');
-      if (*Pos != '\n')
+        } while (*Pos != '\0' && !isAtLineEnd(Pos));
+      if (!skipIfAtLineEnd(Pos))
         break;
-      ++Pos;
       ++LineNumber;
     }
   }
@@ -61,9 +86,9 @@
 
   // Measure the line.
   size_t Length = 0;
-  do {
+  while (Pos[Length] != '\0' && !isAtLineEnd(&Pos[Length])) {
     ++Length;
-  } while (Pos[Length] != '\0' && Pos[Length] != '\n');
+  }
 
   CurrentLine = StringRef(Pos, Length);
 }

diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp
index 3f224e0..5b82c36 100644
--- a/lib/Support/LockFileManager.cpp
+++ b/lib/Support/LockFileManager.cpp

@@ -39,11 +39,11 @@
     sys::fs::remove(LockFileName);
     return None;
   }
-  std::unique_ptr<MemoryBuffer> MB = std::move(MBOrErr.get());
+  MemoryBuffer &MB = *MBOrErr.get();
 
   StringRef Hostname;
   StringRef PIDStr;
-  std::tie(Hostname, PIDStr) = getToken(MB->getBuffer(), " ");
+  std::tie(Hostname, PIDStr) = getToken(MB.getBuffer(), " ");
   PIDStr = PIDStr.substr(PIDStr.find_first_not_of(" "));
   int PID;
   if (!PIDStr.getAsInteger(10, PID)) {
@@ -204,8 +204,8 @@
     // If the lock file is still expected to be there, check whether it still
     // is.
     if (!LockFileGone) {
-      bool Exists;
-      if (!sys::fs::exists(LockFileName.str(), Exists) && !Exists) {
+      if (sys::fs::access(LockFileName.c_str(), sys::fs::AccessMode::Exist) ==
+          errc::no_such_file_or_directory) {
         LockFileGone = true;
         LockFileJustDisappeared = true;
       }

diff --git a/lib/Support/MD5.cpp b/lib/Support/MD5.cpp
index 514466c..ceab580 100644
--- a/lib/Support/MD5.cpp
+++ b/lib/Support/MD5.cpp

@@ -208,11 +208,11 @@
     memcpy(&buffer[used], Ptr, free);
     Ptr = Ptr + free;
     Size -= free;
-    body(ArrayRef<uint8_t>(buffer, 64));
+    body(makeArrayRef(buffer, 64));
   }
 
   if (Size >= 64) {
-    Ptr = body(ArrayRef<uint8_t>(Ptr, Size & ~(unsigned long) 0x3f));
+    Ptr = body(makeArrayRef(Ptr, Size & ~(unsigned long) 0x3f));
     Size &= 0x3f;
   }
 
@@ -229,7 +229,7 @@
 
 /// \brief Finish the hash and place the resulting hash into \p result.
 /// \param result is assumed to be a minimum of 16-bytes in size.
-void MD5::final(MD5Result &result) {
+void MD5::final(MD5Result &Result) {
   unsigned long used, free;
 
   used = lo & 0x3f;
@@ -240,7 +240,7 @@
 
   if (free < 8) {
     memset(&buffer[used], 0, free);
-    body(ArrayRef<uint8_t>(buffer, 64));
+    body(makeArrayRef(buffer, 64));
     used = 0;
     free = 64;
   }
@@ -257,30 +257,30 @@
   buffer[62] = hi >> 16;
   buffer[63] = hi >> 24;
 
-  body(ArrayRef<uint8_t>(buffer, 64));
+  body(makeArrayRef(buffer, 64));
 
-  result[0] = a;
-  result[1] = a >> 8;
-  result[2] = a >> 16;
-  result[3] = a >> 24;
-  result[4] = b;
-  result[5] = b >> 8;
-  result[6] = b >> 16;
-  result[7] = b >> 24;
-  result[8] = c;
-  result[9] = c >> 8;
-  result[10] = c >> 16;
-  result[11] = c >> 24;
-  result[12] = d;
-  result[13] = d >> 8;
-  result[14] = d >> 16;
-  result[15] = d >> 24;
+  Result[0] = a;
+  Result[1] = a >> 8;
+  Result[2] = a >> 16;
+  Result[3] = a >> 24;
+  Result[4] = b;
+  Result[5] = b >> 8;
+  Result[6] = b >> 16;
+  Result[7] = b >> 24;
+  Result[8] = c;
+  Result[9] = c >> 8;
+  Result[10] = c >> 16;
+  Result[11] = c >> 24;
+  Result[12] = d;
+  Result[13] = d >> 8;
+  Result[14] = d >> 16;
+  Result[15] = d >> 24;
 }
 
-void MD5::stringifyResult(MD5Result &result, SmallString<32> &Str) {
+void MD5::stringifyResult(MD5Result &Result, SmallString<32> &Str) {
   raw_svector_ostream Res(Str);
   for (int i = 0; i < 16; ++i)
-    Res << format("%.2x", result[i]);
+    Res << format("%.2x", Result[i]);
 }
 
 }

diff --git a/lib/Support/MathExtras.cpp b/lib/Support/MathExtras.cpp
new file mode 100644
index 0000000..ba09245
--- /dev/null
+++ b/lib/Support/MathExtras.cpp

@@ -0,0 +1,32 @@
+//===-- MathExtras.cpp - Implement the MathExtras header --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MathExtras.h header
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MathExtras.h"
+
+#ifdef _MSC_VER
+#include <limits>
+#else
+#include <math.h>
+#endif
+
+namespace llvm {
+
+#if defined(_MSC_VER)
+  // Visual Studio defines the HUGE_VAL class of macros using purposeful
+  // constant arithmetic overflow, which it then warns on when encountered.
+  const float huge_valf = std::numeric_limits<float>::infinity();
+#else
+  const float huge_valf = HUGE_VALF;
+#endif
+
+}

diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 5f4b7da..7eb0752 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp

@@ -64,14 +64,17 @@
 
 namespace {
 struct NamedBufferAlloc {
-  StringRef Name;
-  NamedBufferAlloc(StringRef Name) : Name(Name) {}
+  const Twine &Name;
+  NamedBufferAlloc(const Twine &Name) : Name(Name) {}
 };
 }
 
 void *operator new(size_t N, const NamedBufferAlloc &Alloc) {
-  char *Mem = static_cast<char *>(operator new(N + Alloc.Name.size() + 1));
-  CopyStringRef(Mem + N, Alloc.Name);
+  SmallString<256> NameBuf;
+  StringRef NameRef = Alloc.Name.toStringRef(NameBuf);
+
+  char *Mem = static_cast<char *>(operator new(N + NameRef.size() + 1));
+  CopyStringRef(Mem + N, NameRef);
   return Mem;
 }
 
@@ -94,71 +97,86 @@
 };
 }
 
-/// getMemBuffer - Open the specified memory range as a MemoryBuffer.  Note
-/// that InputData must be a null terminated if RequiresNullTerminator is true!
-MemoryBuffer *MemoryBuffer::getMemBuffer(StringRef InputData,
-                                         StringRef BufferName,
-                                         bool RequiresNullTerminator) {
-  return new (NamedBufferAlloc(BufferName))
+static ErrorOr<std::unique_ptr<MemoryBuffer>>
+getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize, 
+           uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize);
+
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName,
+                           bool RequiresNullTerminator) {
+  auto *Ret = new (NamedBufferAlloc(BufferName))
       MemoryBufferMem(InputData, RequiresNullTerminator);
+  return std::unique_ptr<MemoryBuffer>(Ret);
 }
 
-/// getMemBufferCopy - Open the specified memory range as a MemoryBuffer,
-/// copying the contents and taking ownership of it.  This has no requirements
-/// on EndPtr[0].
-MemoryBuffer *MemoryBuffer::getMemBufferCopy(StringRef InputData,
-                                             StringRef BufferName) {
-  MemoryBuffer *Buf = getNewUninitMemBuffer(InputData.size(), BufferName);
-  if (!Buf) return nullptr;
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getMemBuffer(MemoryBufferRef Ref, bool RequiresNullTerminator) {
+  return std::unique_ptr<MemoryBuffer>(getMemBuffer(
+      Ref.getBuffer(), Ref.getBufferIdentifier(), RequiresNullTerminator));
+}
+
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) {
+  std::unique_ptr<MemoryBuffer> Buf =
+      getNewUninitMemBuffer(InputData.size(), BufferName);
+  if (!Buf)
+    return nullptr;
   memcpy(const_cast<char*>(Buf->getBufferStart()), InputData.data(),
          InputData.size());
   return Buf;
 }
 
-/// getNewUninitMemBuffer - Allocate a new MemoryBuffer of the specified size
-/// that is not initialized.  Note that the caller should initialize the
-/// memory allocated by this method.  The memory is owned by the MemoryBuffer
-/// object.
-MemoryBuffer *MemoryBuffer::getNewUninitMemBuffer(size_t Size,
-                                                  StringRef BufferName) {
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName) {
   // Allocate space for the MemoryBuffer, the data and the name. It is important
   // that MemoryBuffer and data are aligned so PointerIntPair works with them.
   // TODO: Is 16-byte alignment enough?  We copy small object files with large
   // alignment expectations into this buffer.
+  SmallString<256> NameBuf;
+  StringRef NameRef = BufferName.toStringRef(NameBuf);
   size_t AlignedStringLen =
-      RoundUpToAlignment(sizeof(MemoryBufferMem) + BufferName.size() + 1, 16);
+      RoundUpToAlignment(sizeof(MemoryBufferMem) + NameRef.size() + 1, 16);
   size_t RealLen = AlignedStringLen + Size + 1;
   char *Mem = static_cast<char*>(operator new(RealLen, std::nothrow));
-  if (!Mem) return nullptr;
+  if (!Mem)
+    return nullptr;
 
   // The name is stored after the class itself.
-  CopyStringRef(Mem + sizeof(MemoryBufferMem), BufferName);
+  CopyStringRef(Mem + sizeof(MemoryBufferMem), NameRef);
 
   // The buffer begins after the name and must be aligned.
   char *Buf = Mem + AlignedStringLen;
   Buf[Size] = 0; // Null terminate buffer.
 
-  return new (Mem) MemoryBufferMem(StringRef(Buf, Size), true);
+  auto *Ret = new (Mem) MemoryBufferMem(StringRef(Buf, Size), true);
+  return std::unique_ptr<MemoryBuffer>(Ret);
 }
 
-/// getNewMemBuffer - Allocate a new MemoryBuffer of the specified size that
-/// is completely initialized to zeros.  Note that the caller should
-/// initialize the memory allocated by this method.  The memory is owned by
-/// the MemoryBuffer object.
-MemoryBuffer *MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) {
-  MemoryBuffer *SB = getNewUninitMemBuffer(Size, BufferName);
-  if (!SB) return nullptr;
+std::unique_ptr<MemoryBuffer>
+MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) {
+  std::unique_ptr<MemoryBuffer> SB = getNewUninitMemBuffer(Size, BufferName);
+  if (!SB)
+    return nullptr;
   memset(const_cast<char*>(SB->getBufferStart()), 0, Size);
   return SB;
 }
 
 ErrorOr<std::unique_ptr<MemoryBuffer>>
-MemoryBuffer::getFileOrSTDIN(StringRef Filename, int64_t FileSize) {
-  if (Filename == "-")
+MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize) {
+  SmallString<256> NameBuf;
+  StringRef NameRef = Filename.toStringRef(NameBuf);
+
+  if (NameRef == "-")
     return getSTDIN();
   return getFile(Filename, FileSize);
 }
 
+ErrorOr<std::unique_ptr<MemoryBuffer>>
+MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize, 
+                           uint64_t Offset) {
+  return getFileAux(FilePath, -1, MapSize, Offset, false, false);
+}
+
 
 //===----------------------------------------------------------------------===//
 // MemoryBuffer::getFile implementation.
@@ -206,7 +224,7 @@
 }
 
 static ErrorOr<std::unique_ptr<MemoryBuffer>>
-getMemoryBufferForStream(int FD, StringRef BufferName) {
+getMemoryBufferForStream(int FD, const Twine &BufferName) {
   const ssize_t ChunkSize = 4096*4;
   SmallString<ChunkSize> Buffer;
   ssize_t ReadBytes;
@@ -221,40 +239,32 @@
     Buffer.set_size(Buffer.size() + ReadBytes);
   } while (ReadBytes != 0);
 
-  std::unique_ptr<MemoryBuffer> Ret(
-      MemoryBuffer::getMemBufferCopy(Buffer, BufferName));
-  return std::move(Ret);
+  return MemoryBuffer::getMemBufferCopy(Buffer, BufferName);
 }
 
-static ErrorOr<std::unique_ptr<MemoryBuffer>>
-getFileAux(const char *Filename, int64_t FileSize, bool RequiresNullTerminator,
-           bool IsVolatileSize);
 
 ErrorOr<std::unique_ptr<MemoryBuffer>>
-MemoryBuffer::getFile(Twine Filename, int64_t FileSize,
+MemoryBuffer::getFile(const Twine &Filename, int64_t FileSize,
                       bool RequiresNullTerminator, bool IsVolatileSize) {
-  // Ensure the path is null terminated.
-  SmallString<256> PathBuf;
-  StringRef NullTerminatedName = Filename.toNullTerminatedStringRef(PathBuf);
-  return getFileAux(NullTerminatedName.data(), FileSize, RequiresNullTerminator,
-                    IsVolatileSize);
+  return getFileAux(Filename, FileSize, FileSize, 0,
+                    RequiresNullTerminator, IsVolatileSize);
 }
 
 static ErrorOr<std::unique_ptr<MemoryBuffer>>
-getOpenFileImpl(int FD, const char *Filename, uint64_t FileSize,
+getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
                 uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
                 bool IsVolatileSize);
 
 static ErrorOr<std::unique_ptr<MemoryBuffer>>
-getFileAux(const char *Filename, int64_t FileSize, bool RequiresNullTerminator,
-           bool IsVolatileSize) {
+getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
+           uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize) {
   int FD;
   std::error_code EC = sys::fs::openFileForRead(Filename, FD);
   if (EC)
     return EC;
 
   ErrorOr<std::unique_ptr<MemoryBuffer>> Ret =
-      getOpenFileImpl(FD, Filename, FileSize, FileSize, 0,
+      getOpenFileImpl(FD, Filename, FileSize, MapSize, Offset,
                       RequiresNullTerminator, IsVolatileSize);
   close(FD);
   return Ret;
@@ -305,11 +315,19 @@
   if ((FileSize & (PageSize -1)) == 0)
     return false;
 
+#if defined(__CYGWIN__)
+  // Don't try to map files that are exactly a multiple of the physical page size
+  // if we need a null terminator.
+  // FIXME: We should reorganize again getPageSize() on Win32.
+  if ((FileSize & (4096 - 1)) == 0)
+    return false;
+#endif
+
   return true;
 }
 
 static ErrorOr<std::unique_ptr<MemoryBuffer>>
-getOpenFileImpl(int FD, const char *Filename, uint64_t FileSize,
+getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
                 uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
                 bool IsVolatileSize) {
   static int PageSize = sys::process::get_self()->page_size();
@@ -347,15 +365,15 @@
       return std::move(Result);
   }
 
-  MemoryBuffer *Buf = MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename);
+  std::unique_ptr<MemoryBuffer> Buf =
+      MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename);
   if (!Buf) {
     // Failed to create a buffer. The only way it can fail is if
     // new(std::nothrow) returns 0.
     return make_error_code(errc::not_enough_memory);
   }
 
-  std::unique_ptr<MemoryBuffer> SB(Buf);
-  char *BufPtr = const_cast<char*>(SB->getBufferStart());
+  char *BufPtr = const_cast<char *>(Buf->getBufferStart());
 
   size_t BytesLeft = MapSize;
 #ifndef HAVE_PREAD
@@ -383,21 +401,22 @@
     BufPtr += NumRead;
   }
 
-  return std::move(SB);
+  return std::move(Buf);
 }
 
 ErrorOr<std::unique_ptr<MemoryBuffer>>
-MemoryBuffer::getOpenFile(int FD, const char *Filename, uint64_t FileSize,
+MemoryBuffer::getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,
                           bool RequiresNullTerminator, bool IsVolatileSize) {
   return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0,
                          RequiresNullTerminator, IsVolatileSize);
 }
 
 ErrorOr<std::unique_ptr<MemoryBuffer>>
-MemoryBuffer::getOpenFileSlice(int FD, const char *Filename, uint64_t MapSize,
-                               int64_t Offset, bool IsVolatileSize) {
+MemoryBuffer::getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,
+                               int64_t Offset) {
+  assert(MapSize != uint64_t(-1));
   return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false,
-                         IsVolatileSize);
+                         /*IsVolatileSize*/ false);
 }
 
 ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getSTDIN() {
@@ -409,3 +428,9 @@
 
   return getMemoryBufferForStream(0, "<stdin>");
 }
+
+MemoryBufferRef MemoryBuffer::getMemBufferRef() const {
+  StringRef Data = getBuffer();
+  StringRef Identifier = getBufferIdentifier();
+  return MemoryBufferRef(Data, Identifier);
+}

diff --git a/lib/Support/MemoryObject.cpp b/lib/Support/MemoryObject.cpp
index 02b5b50..d796acf 100644
--- a/lib/Support/MemoryObject.cpp
+++ b/lib/Support/MemoryObject.cpp

@@ -12,22 +12,3 @@
   
 MemoryObject::~MemoryObject() {
 }
-
-int MemoryObject::readBytes(uint64_t address,
-                            uint64_t size,
-                            uint8_t* buf) const {
-  uint64_t current = address;
-  uint64_t limit = getBase() + getExtent();
-
-  if (current + size > limit)
-    return -1;
-
-  while (current - address < size) {
-    if (readByte(current, &buf[(current - address)]))
-      return -1;
-    
-    current++;
-  }
-  
-  return 0;
-}

diff --git a/lib/Support/Options.cpp b/lib/Support/Options.cpp
new file mode 100644
index 0000000..7125845
--- /dev/null
+++ b/lib/Support/Options.cpp

@@ -0,0 +1,33 @@
+//===- llvm/Support/Options.cpp - Debug options support ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the helper objects for defining debug options using the
+// new API built on cl::opt, but not requiring the use of static globals.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Options.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+
+OptionRegistry::~OptionRegistry() {
+  for (auto IT = Options.begin(); IT != Options.end(); ++IT)
+    delete IT->second;
+}
+
+void OptionRegistry::addOption(void *Key, cl::Option *O) {
+  assert(Options.find(Key) == Options.end() &&
+         "Argument with this key already registerd");
+  Options.insert(std::make_pair(Key, O));
+}
+
+static ManagedStatic<OptionRegistry> OR;
+
+OptionRegistry &OptionRegistry::instance() { return *OR; }

diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index d5a0ec5..a7a9919 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp

@@ -11,9 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Process.h"
@@ -164,9 +165,6 @@
   FS_Name
 };
 
-// Implemented in Unix/Path.inc and Windows/Path.inc.
-static std::error_code TempDir(SmallVectorImpl<char> &result);
-
 static std::error_code createUniqueEntity(const Twine &Model, int &ResultFD,
                                           SmallVectorImpl<char> &ResultPath,
                                           bool MakeAbsolute, unsigned Mode,
@@ -178,8 +176,7 @@
     // Make model absolute by prepending a temp directory if it's not already.
     if (!sys::path::is_absolute(Twine(ModelStorage))) {
       SmallString<128> TDir;
-      if (std::error_code EC = TempDir(TDir))
-        return EC;
+      sys::path::system_temp_directory(true, TDir);
       sys::path::append(TDir, Twine(ModelStorage));
       ModelStorage.swap(TDir);
     }
@@ -214,13 +211,13 @@
   }
 
   case FS_Name: {
-    bool Exists;
-    std::error_code EC = sys::fs::exists(ResultPath.begin(), Exists);
+    std::error_code EC =
+        sys::fs::access(ResultPath.begin(), sys::fs::AccessMode::Exist);
+    if (EC == errc::no_such_file_or_directory)
+      return std::error_code();
     if (EC)
       return EC;
-    if (Exists)
-      goto retry_random_path;
-    return std::error_code();
+    goto retry_random_path;
   }
 
   case FS_Dir: {
@@ -308,7 +305,30 @@
   return *this;
 }
 
-const_iterator &const_iterator::operator--() {
+bool const_iterator::operator==(const const_iterator &RHS) const {
+  return Path.begin() == RHS.Path.begin() && Position == RHS.Position;
+}
+
+ptrdiff_t const_iterator::operator-(const const_iterator &RHS) const {
+  return Position - RHS.Position;
+}
+
+reverse_iterator rbegin(StringRef Path) {
+  reverse_iterator I;
+  I.Path = Path;
+  I.Position = Path.size();
+  return ++I;
+}
+
+reverse_iterator rend(StringRef Path) {
+  reverse_iterator I;
+  I.Path = Path;
+  I.Component = Path.substr(0, 0);
+  I.Position = 0;
+  return I;
+}
+
+reverse_iterator &reverse_iterator::operator++() {
   // If we're at the end and the previous char was a '/', return '.' unless
   // we are the root path.
   size_t root_dir_pos = root_dir_start(Path);
@@ -335,20 +355,12 @@
   return *this;
 }
 
-bool const_iterator::operator==(const const_iterator &RHS) const {
-  return Path.begin() == RHS.Path.begin() &&
+bool reverse_iterator::operator==(const reverse_iterator &RHS) const {
+  return Path.begin() == RHS.Path.begin() && Component == RHS.Component &&
          Position == RHS.Position;
 }
 
-bool const_iterator::operator!=(const const_iterator &RHS) const {
-  return !(*this == RHS);
-}
-
-ptrdiff_t const_iterator::operator-(const const_iterator &RHS) const {
-  return Position - RHS.Position;
-}
-
-const StringRef root_path(StringRef path) {
+StringRef root_path(StringRef path) {
   const_iterator b = begin(path),
                  pos = b,
                  e = end(path);
@@ -380,7 +392,7 @@
   return StringRef();
 }
 
-const StringRef root_name(StringRef path) {
+StringRef root_name(StringRef path) {
   const_iterator b = begin(path),
                  e = end(path);
   if (b != e) {
@@ -402,7 +414,7 @@
   return StringRef();
 }
 
-const StringRef root_directory(StringRef path) {
+StringRef root_directory(StringRef path) {
   const_iterator b = begin(path),
                  pos = b,
                  e = end(path);
@@ -431,7 +443,7 @@
   return StringRef();
 }
 
-const StringRef relative_path(StringRef path) {
+StringRef relative_path(StringRef path) {
   StringRef root = root_path(path);
   return path.substr(root.size());
 }
@@ -483,7 +495,7 @@
     path::append(path, *begin);
 }
 
-const StringRef parent_path(StringRef path) {
+StringRef parent_path(StringRef path) {
   size_t end_pos = parent_path_end(path);
   if (end_pos == StringRef::npos)
     return StringRef();
@@ -525,17 +537,27 @@
   native(result);
 }
 
-void native(SmallVectorImpl<char> &path) {
+void native(SmallVectorImpl<char> &Path) {
 #ifdef LLVM_ON_WIN32
-  std::replace(path.begin(), path.end(), '/', '\\');
+  std::replace(Path.begin(), Path.end(), '/', '\\');
+#else
+  for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
+    if (*PI == '\\') {
+      auto PN = PI + 1;
+      if (PN < PE && *PN == '\\')
+        ++PI; // increment once, the for loop will move over the escaped slash
+      else
+        *PI = '/';
+    }
+  }
 #endif
 }
 
-const StringRef filename(StringRef path) {
-  return *(--end(path));
+StringRef filename(StringRef path) {
+  return *rbegin(path);
 }
 
-const StringRef stem(StringRef path) {
+StringRef stem(StringRef path) {
   StringRef fname = filename(path);
   size_t pos = fname.find_last_of('.');
   if (pos == StringRef::npos)
@@ -548,7 +570,7 @@
       return fname.substr(0, pos);
 }
 
-const StringRef extension(StringRef path) {
+StringRef extension(StringRef path) {
   StringRef fname = filename(path);
   size_t pos = fname.find_last_of('.');
   if (pos == StringRef::npos)
@@ -573,62 +595,10 @@
 
 static const char preferred_separator_string[] = { preferred_separator, '\0' };
 
-const StringRef get_separator() {
+StringRef get_separator() {
   return preferred_separator_string;
 }
 
-void system_temp_directory(bool erasedOnReboot, SmallVectorImpl<char> &result) {
-  result.clear();
-
-#if defined(_CS_DARWIN_USER_TEMP_DIR) && defined(_CS_DARWIN_USER_CACHE_DIR)
-  // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR.
-  // macros defined in <unistd.h> on darwin >= 9
-  int ConfName = erasedOnReboot? _CS_DARWIN_USER_TEMP_DIR
-                               : _CS_DARWIN_USER_CACHE_DIR;
-  size_t ConfLen = confstr(ConfName, nullptr, 0);
-  if (ConfLen > 0) {
-    do {
-      result.resize(ConfLen);
-      ConfLen = confstr(ConfName, result.data(), result.size());
-    } while (ConfLen > 0 && ConfLen != result.size());
-
-    if (ConfLen > 0) {
-      assert(result.back() == 0);
-      result.pop_back();
-      return;
-    }
-
-    result.clear();
-  }
-#endif
-
-  // Check whether the temporary directory is specified by an environment
-  // variable.
-  const char *EnvironmentVariable;
-#ifdef LLVM_ON_WIN32
-  EnvironmentVariable = "TEMP";
-#else
-  EnvironmentVariable = "TMPDIR";
-#endif
-  if (char *RequestedDir = getenv(EnvironmentVariable)) {
-    result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
-    return;
-  }
-
-  // Fall back to a system default.
-  const char *DefaultResult;
-#ifdef LLVM_ON_WIN32
-  (void)erasedOnReboot;
-  DefaultResult = "C:\\TEMP";
-#else
-  if (erasedOnReboot)
-    DefaultResult = "/tmp";
-  else
-    DefaultResult = "/var/tmp";
-#endif
-  result.append(DefaultResult, DefaultResult + strlen(DefaultResult));
-}
-
 bool has_root_name(const Twine &path) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
@@ -932,10 +902,23 @@
     return file_magic::unknown;
   switch ((unsigned char)Magic[0]) {
     case 0x00: {
-      // COFF short import library file
+      // COFF bigobj or short import library file
       if (Magic[1] == (char)0x00 && Magic[2] == (char)0xff &&
-          Magic[3] == (char)0xff)
-        return file_magic::coff_import_library;
+          Magic[3] == (char)0xff) {
+        size_t MinSize = offsetof(COFF::BigObjHeader, UUID) + sizeof(COFF::BigObjMagic);
+        if (Magic.size() < MinSize)
+          return file_magic::coff_import_library;
+
+        int BigObjVersion = *reinterpret_cast<const support::ulittle16_t*>(
+            Magic.data() + offsetof(COFF::BigObjHeader, Version));
+        if (BigObjVersion < COFF::BigObjHeader::MinBigObjectVersion)
+          return file_magic::coff_import_library;
+
+        const char *Start = Magic.data() + offsetof(COFF::BigObjHeader, UUID);
+        if (memcmp(Start, COFF::BigObjMagic, sizeof(COFF::BigObjMagic)) != 0)
+          return file_magic::coff_import_library;
+        return file_magic::coff_object;
+      }
       // Windows resource file
       const char Expected[] = { 0, 0, 0, 0, '\x20', 0, 0, 0, '\xff' };
       if (Magic.size() >= sizeof(Expected) &&
@@ -975,6 +958,9 @@
             case 3: return file_magic::elf_shared_object;
             case 4: return file_magic::elf_core;
           }
+        else
+          // It's still some type of ELF file.
+          return file_magic::elf;
       }
       break;
 
@@ -1016,7 +1002,7 @@
         case 6: return file_magic::macho_dynamically_linked_shared_lib;
         case 7: return file_magic::macho_dynamic_linker;
         case 8: return file_magic::macho_bundle;
-        case 9: return file_magic::macho_dynamic_linker;
+        case 9: return file_magic::macho_dynamically_linked_shared_lib_stub;
         case 10: return file_magic::macho_dsym_companion;
       }
       break;
@@ -1037,12 +1023,13 @@
         return file_magic::coff_object;
       break;
 
-    case 0x4d: // Possible MS-DOS stub on Windows PE file
-      if (Magic[1] == 0x5a) {
+    case 'M': // Possible MS-DOS stub on Windows PE file
+      if (Magic[1] == 'Z') {
         uint32_t off =
           *reinterpret_cast<const support::ulittle32_t*>(Magic.data() + 0x3c);
         // PE/COFF file, either EXE or DLL.
-        if (off < Magic.size() && memcmp(Magic.data() + off, "PE\0\0",4) == 0)
+        if (off < Magic.size() &&
+            memcmp(Magic.data()+off, COFF::PEMagic, sizeof(COFF::PEMagic)) == 0)
           return file_magic::pecoff_executable;
       }
       break;

diff --git a/lib/Support/ScaledNumber.cpp b/lib/Support/ScaledNumber.cpp
index 3fe027b..fc6d4e7 100644
--- a/lib/Support/ScaledNumber.cpp
+++ b/lib/Support/ScaledNumber.cpp

@@ -220,6 +220,9 @@
   } else if (E > -64) {
     Above0 = D >> -E;
     Below0 = D << (64 + E);
+  } else if (E == -64) {
+    // Special case: shift by 64 bits is undefined behavior.
+    Below0 = D;
   } else if (E > -120) {
     Below0 = D >> (-E - 64);
     Extra = D << (128 + E);

diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp
index a80e095..c87ee7d 100644
--- a/lib/Support/SmallPtrSet.cpp
+++ b/lib/Support/SmallPtrSet.cpp

@@ -34,18 +34,19 @@
   memset(CurArray, -1, CurArraySize*sizeof(void*));
 }
 
-bool SmallPtrSetImplBase::insert_imp(const void * Ptr) {
+std::pair<const void *const *, bool>
+SmallPtrSetImplBase::insert_imp(const void *Ptr) {
   if (isSmall()) {
     // Check to see if it is already in the set.
     for (const void **APtr = SmallArray, **E = SmallArray+NumElements;
          APtr != E; ++APtr)
       if (*APtr == Ptr)
-        return false;
-    
+        return std::make_pair(APtr, false);
+
     // Nope, there isn't.  If we stay small, just 'pushback' now.
-    if (NumElements < CurArraySize-1) {
+    if (NumElements < CurArraySize) {
       SmallArray[NumElements++] = Ptr;
-      return true;
+      return std::make_pair(SmallArray + (NumElements - 1), true);
     }
     // Otherwise, hit the big set case, which will call grow.
   }
@@ -61,14 +62,15 @@
   
   // Okay, we know we have space.  Find a hash bucket.
   const void **Bucket = const_cast<const void**>(FindBucketFor(Ptr));
-  if (*Bucket == Ptr) return false; // Already inserted, good.
-  
+  if (*Bucket == Ptr)
+    return std::make_pair(Bucket, false); // Already inserted, good.
+
   // Otherwise, insert it!
   if (*Bucket == getTombstoneMarker())
     --NumTombstones;
   *Bucket = Ptr;
   ++NumElements;  // Track density.
-  return true;
+  return std::make_pair(Bucket, true);
 }
 
 bool SmallPtrSetImplBase::erase_imp(const void * Ptr) {
@@ -200,13 +202,12 @@
   if (that.isSmall()) {
     CurArray = SmallArray;
     memcpy(CurArray, that.CurArray, sizeof(void *) * CurArraySize);
-    return;
+  } else {
+    // Otherwise, we steal the large memory allocation and no copy is needed.
+    CurArray = that.CurArray;
+    that.CurArray = that.SmallArray;
   }
 
-  // Otherwise, we steal the large memory allocation and no copy is needed.
-  CurArray = that.CurArray;
-  that.CurArray = that.SmallArray;
-
   // Make the "that" object small and empty.
   that.CurArraySize = SmallSize;
   assert(that.CurArray == that.SmallArray);

diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index 003cb56..b50a66b 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp

@@ -42,11 +42,6 @@
   // Delete the line # cache if allocated.
   if (LineNoCacheTy *Cache = getCache(LineNoCache))
     delete Cache;
-
-  while (!Buffers.empty()) {
-    delete Buffers.back().Buffer;
-    Buffers.pop_back();
-  }
 }
 
 unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
@@ -54,20 +49,20 @@
                                    std::string &IncludedFile) {
   IncludedFile = Filename;
   ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr =
-      MemoryBuffer::getFile(IncludedFile.c_str());
+    MemoryBuffer::getFile(IncludedFile);
 
   // If the file didn't exist directly, see if it's in an include path.
   for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBufOrErr;
        ++i) {
     IncludedFile =
         IncludeDirectories[i] + sys::path::get_separator().data() + Filename;
-    NewBufOrErr = MemoryBuffer::getFile(IncludedFile.c_str());
+    NewBufOrErr = MemoryBuffer::getFile(IncludedFile);
   }
 
   if (!NewBufOrErr)
     return 0;
 
-  return AddNewSourceBuffer(NewBufOrErr.get().release(), IncludeLoc);
+  return AddNewSourceBuffer(std::move(*NewBufOrErr), IncludeLoc);
 }
 
 unsigned SourceMgr::FindBufferContainingLoc(SMLoc Loc) const {

diff --git a/lib/Support/SpecialCaseList.cpp b/lib/Support/SpecialCaseList.cpp
index 21e43c5..785cc60 100644
--- a/lib/Support/SpecialCaseList.cpp
+++ b/lib/Support/SpecialCaseList.cpp

@@ -48,10 +48,10 @@
 
 SpecialCaseList::SpecialCaseList() : Entries() {}
 
-SpecialCaseList *SpecialCaseList::create(
-    const StringRef Path, std::string &Error) {
+std::unique_ptr<SpecialCaseList> SpecialCaseList::create(StringRef Path,
+                                                         std::string &Error) {
   if (Path.empty())
-    return new SpecialCaseList();
+    return std::unique_ptr<SpecialCaseList>(new SpecialCaseList());
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFile(Path);
   if (std::error_code EC = FileOrErr.getError()) {
@@ -61,17 +61,17 @@
   return create(FileOrErr.get().get(), Error);
 }
 
-SpecialCaseList *SpecialCaseList::create(
-    const MemoryBuffer *MB, std::string &Error) {
+std::unique_ptr<SpecialCaseList> SpecialCaseList::create(const MemoryBuffer *MB,
+                                                         std::string &Error) {
   std::unique_ptr<SpecialCaseList> SCL(new SpecialCaseList());
   if (!SCL->parse(MB, Error))
     return nullptr;
-  return SCL.release();
+  return SCL;
 }
 
-SpecialCaseList *SpecialCaseList::createOrDie(const StringRef Path) {
+std::unique_ptr<SpecialCaseList> SpecialCaseList::createOrDie(StringRef Path) {
   std::string Error;
-  if (SpecialCaseList *SCL = create(Path, Error))
+  if (auto SCL = create(Path, Error))
     return SCL;
   report_fatal_error(Error);
 }
@@ -103,18 +103,6 @@
     std::string Regexp = SplitRegexp.first;
     StringRef Category = SplitRegexp.second;
 
-    // Backwards compatibility.
-    if (Prefix == "global-init") {
-      Prefix = "global";
-      Category = "init";
-    } else if (Prefix == "global-init-type") {
-      Prefix = "type";
-      Category = "init";
-    } else if (Prefix == "global-init-src") {
-      Prefix = "src";
-      Category = "init";
-    }
-
     // See if we can store Regexp in Strings.
     if (Regex::isLiteralERE(Regexp)) {
       Entries[Prefix][Category].Strings.insert(Regexp);
@@ -157,8 +145,8 @@
 
 SpecialCaseList::~SpecialCaseList() {}
 
-bool SpecialCaseList::inSection(const StringRef Section, const StringRef Query,
-                                const StringRef Category) const {
+bool SpecialCaseList::inSection(StringRef Section, StringRef Query,
+                                StringRef Category) const {
   StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section);
   if (I == Entries.end()) return false;
   StringMap<Entry>::const_iterator II = I->second.find(Category);

diff --git a/lib/Support/StreamableMemoryObject.cpp b/lib/Support/StreamableMemoryObject.cpp
deleted file mode 100644
index 5cb0680..0000000
--- a/lib/Support/StreamableMemoryObject.cpp
+++ /dev/null

@@ -1,140 +0,0 @@
-//===- StreamableMemoryObject.cpp - Streamable data interface -------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/StreamableMemoryObject.h"
-#include "llvm/Support/Compiler.h"
-#include <cassert>
-#include <cstddef>
-#include <cstring>
-
-
-using namespace llvm;
-
-namespace {
-
-class RawMemoryObject : public StreamableMemoryObject {
-public:
-  RawMemoryObject(const unsigned char *Start, const unsigned char *End) :
-    FirstChar(Start), LastChar(End) {
-    assert(LastChar >= FirstChar && "Invalid start/end range");
-  }
-
-  uint64_t getBase() const override { return 0; }
-  uint64_t getExtent() const override {
-    return LastChar - FirstChar;
-  }
-  int readByte(uint64_t address, uint8_t* ptr) const override;
-  int readBytes(uint64_t address, uint64_t size,
-                uint8_t *buf) const override;
-  const uint8_t *getPointer(uint64_t address, uint64_t size) const override;
-  bool isValidAddress(uint64_t address) const override {
-    return validAddress(address);
-  }
-  bool isObjectEnd(uint64_t address) const override {
-    return objectEnd(address);
-  }
-
-private:
-  const uint8_t* const FirstChar;
-  const uint8_t* const LastChar;
-
-  // These are implemented as inline functions here to avoid multiple virtual
-  // calls per public function
-  bool validAddress(uint64_t address) const {
-    return static_cast<std::ptrdiff_t>(address) < LastChar - FirstChar;
-  }
-  bool objectEnd(uint64_t address) const {
-    return static_cast<std::ptrdiff_t>(address) == LastChar - FirstChar;
-  }
-
-  RawMemoryObject(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
-  void operator=(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
-};
-
-int RawMemoryObject::readByte(uint64_t address, uint8_t* ptr) const {
-  if (!validAddress(address)) return -1;
-  *ptr = *((uint8_t *)(uintptr_t)(address + FirstChar));
-  return 0;
-}
-
-int RawMemoryObject::readBytes(uint64_t address,
-                               uint64_t size,
-                               uint8_t *buf) const {
-  if (!validAddress(address) || !validAddress(address + size - 1)) return -1;
-  memcpy(buf, (uint8_t *)(uintptr_t)(address + FirstChar), size);
-  return size;
-}
-
-const uint8_t *RawMemoryObject::getPointer(uint64_t address,
-                                           uint64_t size) const {
-  return FirstChar + address;
-}
-} // anonymous namespace
-
-namespace llvm {
-// If the bitcode has a header, then its size is known, and we don't have to
-// block until we actually want to read it.
-bool StreamingMemoryObject::isValidAddress(uint64_t address) const {
-  if (ObjectSize && address < ObjectSize) return true;
-    return fetchToPos(address);
-}
-
-bool StreamingMemoryObject::isObjectEnd(uint64_t address) const {
-  if (ObjectSize) return address == ObjectSize;
-  fetchToPos(address);
-  return address == ObjectSize && address != 0;
-}
-
-uint64_t StreamingMemoryObject::getExtent() const {
-  if (ObjectSize) return ObjectSize;
-  size_t pos = BytesRead + kChunkSize;
-  // keep fetching until we run out of bytes
-  while (fetchToPos(pos)) pos += kChunkSize;
-  return ObjectSize;
-}
-
-int StreamingMemoryObject::readByte(uint64_t address, uint8_t* ptr) const {
-  if (!fetchToPos(address)) return -1;
-  *ptr = Bytes[address + BytesSkipped];
-  return 0;
-}
-
-int StreamingMemoryObject::readBytes(uint64_t address,
-                                     uint64_t size,
-                                     uint8_t *buf) const {
-  if (!fetchToPos(address + size - 1)) return -1;
-  memcpy(buf, &Bytes[address + BytesSkipped], size);
-  return 0;
-}
-
-bool StreamingMemoryObject::dropLeadingBytes(size_t s) {
-  if (BytesRead < s) return true;
-  BytesSkipped = s;
-  BytesRead -= s;
-  return false;
-}
-
-void StreamingMemoryObject::setKnownObjectSize(size_t size) {
-  ObjectSize = size;
-  Bytes.reserve(size);
-}
-
-StreamableMemoryObject *getNonStreamedMemoryObject(
-    const unsigned char *Start, const unsigned char *End) {
-  return new RawMemoryObject(Start, End);
-}
-
-StreamableMemoryObject::~StreamableMemoryObject() { }
-
-StreamingMemoryObject::StreamingMemoryObject(DataStreamer *streamer) :
-  Bytes(kChunkSize), Streamer(streamer), BytesRead(0), BytesSkipped(0),
-  ObjectSize(0), EOFReached(false) {
-  BytesRead = streamer->GetBytes(&Bytes[0], kChunkSize);
-}
-}

diff --git a/lib/Support/StreamingMemoryObject.cpp b/lib/Support/StreamingMemoryObject.cpp
new file mode 100644
index 0000000..68beeef
--- /dev/null
+++ b/lib/Support/StreamingMemoryObject.cpp

@@ -0,0 +1,127 @@
+//===- StreamingMemoryObject.cpp - Streamable data interface -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/StreamingMemoryObject.h"
+#include "llvm/Support/Compiler.h"
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+
+
+using namespace llvm;
+
+namespace {
+
+class RawMemoryObject : public MemoryObject {
+public:
+  RawMemoryObject(const unsigned char *Start, const unsigned char *End) :
+    FirstChar(Start), LastChar(End) {
+    assert(LastChar >= FirstChar && "Invalid start/end range");
+  }
+
+  uint64_t getExtent() const override {
+    return LastChar - FirstChar;
+  }
+  uint64_t readBytes(uint8_t *Buf, uint64_t Size,
+                     uint64_t Address) const override;
+  const uint8_t *getPointer(uint64_t address, uint64_t size) const override;
+  bool isValidAddress(uint64_t address) const override {
+    return validAddress(address);
+  }
+
+private:
+  const uint8_t* const FirstChar;
+  const uint8_t* const LastChar;
+
+  // These are implemented as inline functions here to avoid multiple virtual
+  // calls per public function
+  bool validAddress(uint64_t address) const {
+    return static_cast<std::ptrdiff_t>(address) < LastChar - FirstChar;
+  }
+
+  RawMemoryObject(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
+  void operator=(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
+};
+
+uint64_t RawMemoryObject::readBytes(uint8_t *Buf, uint64_t Size,
+                                    uint64_t Address) const {
+  uint64_t BufferSize = LastChar - FirstChar;
+  if (Address >= BufferSize)
+    return 0;
+
+  uint64_t End = Address + Size;
+  if (End > BufferSize)
+    End = BufferSize;
+
+  assert(static_cast<int64_t>(End - Address) >= 0);
+  Size = End - Address;
+  memcpy(Buf, Address + FirstChar, Size);
+  return Size;
+}
+
+const uint8_t *RawMemoryObject::getPointer(uint64_t address,
+                                           uint64_t size) const {
+  return FirstChar + address;
+}
+} // anonymous namespace
+
+namespace llvm {
+// If the bitcode has a header, then its size is known, and we don't have to
+// block until we actually want to read it.
+bool StreamingMemoryObject::isValidAddress(uint64_t address) const {
+  if (ObjectSize && address < ObjectSize) return true;
+    return fetchToPos(address);
+}
+
+uint64_t StreamingMemoryObject::getExtent() const {
+  if (ObjectSize) return ObjectSize;
+  size_t pos = BytesRead + kChunkSize;
+  // keep fetching until we run out of bytes
+  while (fetchToPos(pos)) pos += kChunkSize;
+  return ObjectSize;
+}
+
+uint64_t StreamingMemoryObject::readBytes(uint8_t *Buf, uint64_t Size,
+                                          uint64_t Address) const {
+  fetchToPos(Address + Size - 1);
+  if (Address >= BytesRead)
+    return 0;
+
+  uint64_t End = Address + Size;
+  if (End > BytesRead)
+    End = BytesRead;
+  assert(static_cast<int64_t>(End - Address) >= 0);
+  Size = End - Address;
+  memcpy(Buf, &Bytes[Address + BytesSkipped], Size);
+  return Size;
+}
+
+bool StreamingMemoryObject::dropLeadingBytes(size_t s) {
+  if (BytesRead < s) return true;
+  BytesSkipped = s;
+  BytesRead -= s;
+  return false;
+}
+
+void StreamingMemoryObject::setKnownObjectSize(size_t size) {
+  ObjectSize = size;
+  Bytes.reserve(size);
+}
+
+MemoryObject *getNonStreamedMemoryObject(const unsigned char *Start,
+                                         const unsigned char *End) {
+  return new RawMemoryObject(Start, End);
+}
+
+StreamingMemoryObject::StreamingMemoryObject(DataStreamer *streamer) :
+  Bytes(kChunkSize), Streamer(streamer), BytesRead(0), BytesSkipped(0),
+  ObjectSize(0), EOFReached(false) {
+  BytesRead = streamer->GetBytes(&Bytes[0], kChunkSize);
+}
+}

diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index cde8258..ddece08 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp

@@ -50,7 +50,7 @@
 
 /// compare_lower - Compare strings, ignoring case.
 int StringRef::compare_lower(StringRef RHS) const {
-  if (int Res = ascii_strncasecmp(Data, RHS.Data, min(Length, RHS.Length)))
+  if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
     return Res;
   if (Length == RHS.Length)
     return 0;
@@ -71,7 +71,7 @@
 
 /// compare_numeric - Compare strings, handle embedded numbers.
 int StringRef::compare_numeric(StringRef RHS) const {
-  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
+  for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
     // Check for sequences of digits.
     if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
       // The longer sequence of numbers is considered larger.
@@ -105,8 +105,8 @@
                                   bool AllowReplacements,
                                   unsigned MaxEditDistance) const {
   return llvm::ComputeEditDistance(
-      llvm::ArrayRef<char>(data(), size()),
-      llvm::ArrayRef<char>(Other.data(), Other.size()),
+      makeArrayRef(data(), size()),
+      makeArrayRef(Other.data(), Other.size()),
       AllowReplacements, MaxEditDistance);
 }
 
@@ -146,7 +146,7 @@
 
   // For short haystacks or unsupported needles fall back to the naive algorithm
   if (Length < 16 || N > 255 || N == 0) {
-    for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
+    for (size_t e = Length - N + 1, i = std::min(From, e); i != e; ++i)
       if (substr(i, N).equals(Str))
         return i;
     return npos;
@@ -201,7 +201,7 @@
   for (size_type i = 0; i != Chars.size(); ++i)
     CharBits.set((unsigned char)Chars[i]);
 
-  for (size_type i = min(From, Length), e = Length; i != e; ++i)
+  for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
     if (CharBits.test((unsigned char)Data[i]))
       return i;
   return npos;
@@ -210,7 +210,7 @@
 /// find_first_not_of - Find the first character in the string that is not
 /// \arg C or npos if not found.
 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
-  for (size_type i = min(From, Length), e = Length; i != e; ++i)
+  for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
     if (Data[i] != C)
       return i;
   return npos;
@@ -226,7 +226,7 @@
   for (size_type i = 0; i != Chars.size(); ++i)
     CharBits.set((unsigned char)Chars[i]);
 
-  for (size_type i = min(From, Length), e = Length; i != e; ++i)
+  for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
     if (!CharBits.test((unsigned char)Data[i]))
       return i;
   return npos;
@@ -242,7 +242,7 @@
   for (size_type i = 0; i != Chars.size(); ++i)
     CharBits.set((unsigned char)Chars[i]);
 
-  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
+  for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
     if (CharBits.test((unsigned char)Data[i]))
       return i;
   return npos;
@@ -251,7 +251,7 @@
 /// find_last_not_of - Find the last character in the string that is not
 /// \arg C, or npos if not found.
 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
-  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
+  for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
     if (Data[i] != C)
       return i;
   return npos;
@@ -267,7 +267,7 @@
   for (size_type i = 0, e = Chars.size(); i != e; ++i)
     CharBits.set((unsigned char)Chars[i]);
 
-  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
+  for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
     if (!CharBits.test((unsigned char)Data[i]))
       return i;
   return npos;

diff --git a/lib/Support/StringRefMemoryObject.cpp b/lib/Support/StringRefMemoryObject.cpp
deleted file mode 100644
index e035ed1..0000000
--- a/lib/Support/StringRefMemoryObject.cpp
+++ /dev/null

@@ -1,29 +0,0 @@
-//===- lib/Support/StringRefMemoryObject.cpp --------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/StringRefMemoryObject.h"
-
-using namespace llvm;
-
-int StringRefMemoryObject::readByte(uint64_t Addr, uint8_t *Byte) const {
-  if (Addr >= Base + getExtent() || Addr < Base)
-    return -1;
-  *Byte = Bytes[Addr - Base];
-  return 0;
-}
-
-int StringRefMemoryObject::readBytes(uint64_t Addr,
-                                     uint64_t Size,
-                                     uint8_t *Buf) const {
-  uint64_t Offset = Addr - Base;
-  if (Addr >= Base + getExtent() || Offset + Size > getExtent() || Addr < Base)
-    return -1;
-  memcpy(Buf, Bytes.data() + Offset, Size);
-  return 0;
-}

diff --git a/lib/Support/TimeValue.cpp b/lib/Support/TimeValue.cpp
index 4a70797..136b93e 100644
--- a/lib/Support/TimeValue.cpp
+++ b/lib/Support/TimeValue.cpp

@@ -22,12 +22,6 @@
 const TimeValue::SecondsType
   TimeValue::Win32ZeroTimeSeconds = -12591158400ULL;
 
-const TimeValue TimeValue::MinTime       = TimeValue ( INT64_MIN,0 );
-const TimeValue TimeValue::MaxTime       = TimeValue ( INT64_MAX,0 );
-const TimeValue TimeValue::ZeroTime      = TimeValue ( 0,0 );
-const TimeValue TimeValue::PosixZeroTime = TimeValue ( PosixZeroTimeSeconds,0 );
-const TimeValue TimeValue::Win32ZeroTime = TimeValue ( Win32ZeroTimeSeconds,0 );
-
 void
 TimeValue::normalize( void ) {
   if ( nanos_ >= NANOSECONDS_PER_SECOND ) {

diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp
index 210bda7..e1a531a 100644
--- a/lib/Support/Timer.cpp
+++ b/lib/Support/Timer.cpp

@@ -66,10 +66,10 @@
   // each time -stats or -time-passes wants to print output to it. To
   // compensate for this, the test-suite Makefiles have code to delete the
   // info output file before running commands which write to it.
-  std::string Error;
-  raw_ostream *Result = new raw_fd_ostream(
-      OutputFilename.c_str(), Error, sys::fs::F_Append | sys::fs::F_Text);
-  if (Error.empty())
+  std::error_code EC;
+  raw_ostream *Result = new raw_fd_ostream(OutputFilename, EC,
+                                           sys::fs::F_Append | sys::fs::F_Text);
+  if (!EC)
     return Result;
   
   errs() << "Error opening info-output-file '"

diff --git a/lib/Support/ToolOutputFile.cpp b/lib/Support/ToolOutputFile.cpp
index b5fb20f..8ae977d 100644
--- a/lib/Support/ToolOutputFile.cpp
+++ b/lib/Support/ToolOutputFile.cpp

@@ -16,8 +16,8 @@
 #include "llvm/Support/Signals.h"
 using namespace llvm;
 
-tool_output_file::CleanupInstaller::CleanupInstaller(const char *filename)
-  : Filename(filename), Keep(false) {
+tool_output_file::CleanupInstaller::CleanupInstaller(StringRef Filename)
+    : Filename(Filename), Keep(false) {
   // Arrange for the file to be deleted if the process is killed.
   if (Filename != "-")
     sys::RemoveFileOnSignal(Filename);
@@ -34,14 +34,13 @@
     sys::DontRemoveFileOnSignal(Filename);
 }
 
-tool_output_file::tool_output_file(const char *filename, std::string &ErrorInfo,
+tool_output_file::tool_output_file(StringRef Filename, std::error_code &EC,
                                    sys::fs::OpenFlags Flags)
-    : Installer(filename), OS(filename, ErrorInfo, Flags) {
+    : Installer(Filename), OS(Filename, EC, Flags) {
   // If open fails, no cleanup is needed.
-  if (!ErrorInfo.empty())
+  if (EC)
     Installer.Keep = true;
 }
 
-tool_output_file::tool_output_file(const char *Filename, int FD)
-    : Installer(Filename), OS(FD, true) {
-}
+tool_output_file::tool_output_file(StringRef Filename, int FD)
+    : Installer(Filename), OS(FD, true) {}

diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index b74ee13..4a4773e 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp

@@ -23,8 +23,6 @@
   case aarch64_be:  return "aarch64_be";
   case arm:         return "arm";
   case armeb:       return "armeb";
-  case arm64:       return "arm64";
-  case arm64_be:    return "arm64_be";
   case hexagon:     return "hexagon";
   case mips:        return "mips";
   case mipsel:      return "mipsel";
@@ -47,7 +45,11 @@
   case nvptx:       return "nvptx";
   case nvptx64:     return "nvptx64";
   case le32:        return "le32";
+  case le64:        return "le64";
   case amdil:       return "amdil";
+  case amdil64:     return "amdil64";
+  case hsail:       return "hsail";
+  case hsail64:     return "hsail64";
   case spir:        return "spir";
   case spir64:      return "spir64";
   case kalimba:     return "kalimba";
@@ -69,9 +71,6 @@
   case thumb:
   case thumbeb:     return "arm";
 
-  case arm64:       
-  case arm64_be:    return "arm64";
-
   case ppc64:
   case ppc64le:
   case ppc:         return "ppc";
@@ -99,8 +98,15 @@
   case nvptx64:     return "nvptx";
 
   case le32:        return "le32";
-  case amdil:       return "amdil";
-  case spir:        return "spir";
+  case le64:        return "le64";
+
+  case amdil:
+  case amdil64:     return "amdil";
+
+  case hsail:
+  case hsail64:     return "hsail";
+
+  case spir:
   case spir64:      return "spir";
   case kalimba:     return "kalimba";
   }
@@ -118,6 +124,7 @@
   case Freescale: return "fsl";
   case IBM: return "ibm";
   case ImaginationTechnologies: return "img";
+  case MipsTechnologies: return "mti";
   case NVIDIA: return "nvidia";
   case CSR: return "csr";
   }
@@ -129,8 +136,6 @@
   switch (Kind) {
   case UnknownOS: return "unknown";
 
-  case AuroraUX: return "auroraux";
-  case Cygwin: return "cygwin";
   case Darwin: return "darwin";
   case DragonFly: return "dragonfly";
   case FreeBSD: return "freebsd";
@@ -139,7 +144,6 @@
   case Linux: return "linux";
   case Lv2: return "lv2";
   case MacOSX: return "macosx";
-  case MinGW32: return "mingw32";
   case NetBSD: return "netbsd";
   case OpenBSD: return "openbsd";
   case Solaris: return "solaris";
@@ -181,10 +185,9 @@
   return StringSwitch<Triple::ArchType>(Name)
     .Case("aarch64", aarch64)
     .Case("aarch64_be", aarch64_be)
+    .Case("arm64", aarch64) // "arm64" is an alias for "aarch64"
     .Case("arm", arm)
     .Case("armeb", armeb)
-    .Case("arm64", arm64)
-    .Case("arm64_be", arm64_be)
     .Case("mips", mips)
     .Case("mipsel", mipsel)
     .Case("mips64", mips64)
@@ -208,40 +211,59 @@
     .Case("nvptx", nvptx)
     .Case("nvptx64", nvptx64)
     .Case("le32", le32)
+    .Case("le64", le64)
     .Case("amdil", amdil)
+    .Case("amdil64", amdil64)
+    .Case("hsail", hsail)
+    .Case("hsail64", hsail64)
     .Case("spir", spir)
     .Case("spir64", spir64)
     .Case("kalimba", kalimba)
     .Default(UnknownArch);
 }
 
-// Returns architecture name that is understood by the target assembler.
-const char *Triple::getArchNameForAssembler() {
-  if (!isOSDarwin() && getVendor() != Triple::Apple)
-    return nullptr;
+// Resolves an ARM-family architecture name (arm*, thumb*, arm64, aarch64*) to
+// its ArchType. Returns UnknownArch when the name is neither a bare family
+// name nor a known prefix followed by a recognised version suffix.
+static Triple::ArchType parseARMArch(StringRef ArchName) {
+  // Bare family names carry no version suffix that would need validating.
+  Triple::ArchType Bare = StringSwitch<Triple::ArchType>(ArchName)
+    .Case("arm", Triple::arm)
+    .Case("armeb", Triple::armeb)
+    .Case("thumb", Triple::thumb)
+    .Case("thumbeb", Triple::thumbeb)
+    .Cases("arm64", "aarch64", Triple::aarch64)
+    .Case("aarch64_be", Triple::aarch64_be)
+    .Default(Triple::UnknownArch);
+  if (Bare != Triple::UnknownArch)
+    return Bare;
 
-  return StringSwitch<const char*>(getArchName())
-    .Case("i386", "i386")
-    .Case("x86_64", "x86_64")
-    .Case("powerpc", "ppc")
-    .Case("powerpc64", "ppc64")
-    .Case("powerpc64le", "ppc64le")
-    .Case("arm", "arm")
-    .Cases("armv4t", "thumbv4t", "armv4t")
-    .Cases("armv5", "armv5e", "thumbv5", "thumbv5e", "armv5")
-    .Cases("armv6", "thumbv6", "armv6")
-    .Cases("armv7", "thumbv7", "armv7")
-    .Case("armeb", "armeb")
-    .Case("arm64", "arm64")
-    .Case("arm64_be", "arm64")
-    .Case("r600", "r600")
-    .Case("nvptx", "nvptx")
-    .Case("nvptx64", "nvptx64")
-    .Case("le32", "le32")
-    .Case("amdil", "amdil")
-    .Case("spir", "spir")
-    .Case("spir64", "spir64")
-    .Default(nullptr);
+  // Choose the family from the prefix and isolate the "vN..." text after it.
+  // Version stays empty when no known prefix matches, which falls through to
+  // the UnknownArch default below.
+  Triple::ArchType Family = Triple::UnknownArch;
+  StringRef Version;
+  if (ArchName.startswith("armv")) {
+    Family = Triple::arm;
+    Version = ArchName.substr(3);
+  } else if (ArchName.startswith("armebv")) {
+    Family = Triple::armeb;
+    Version = ArchName.substr(5);
+  } else if (ArchName.startswith("thumbv")) {
+    Family = Triple::thumb;
+    Version = ArchName.substr(5);
+  } else if (ArchName.startswith("thumbebv")) {
+    Family = Triple::thumbeb;
+    Version = ArchName.substr(7);
+  }
+
+  // The v2 and v3 suffixes are rejected for names starting with "thumb".
+  const bool IsThumb = ArchName.startswith("thumb");
+  const Triple::ArchType PreV4 = IsThumb ? Triple::UnknownArch : Family;
+
+  return StringSwitch<Triple::ArchType>(Version)
+    .Cases("v2", "v2a", PreV4)
+    .Cases("v3", "v3m", PreV4)
+    .Cases("v4", "v4t", Family)
+    .Cases("v5", "v5e", "v5t", "v5te", "v5tej", Family)
+    .Cases("v6", "v6j", "v6k", "v6m", Family)
+    .Cases("v6t2", "v6z", "v6zk", Family)
+    .Cases("v7", "v7a", "v7em", "v7l", Family)
+    .Cases("v7m", "v7r", "v7s", Family)
+    .Cases("v8", "v8a", Family)
+    .Default(Triple::UnknownArch);
 }
 
 static Triple::ArchType parseArch(StringRef ArchName) {
@@ -253,20 +275,10 @@
     .Case("powerpc", Triple::ppc)
     .Cases("powerpc64", "ppu", Triple::ppc64)
     .Case("powerpc64le", Triple::ppc64le)
-    .Case("aarch64", Triple::aarch64)
-    .Case("aarch64_be", Triple::aarch64_be)
-    .Cases("arm", "xscale", Triple::arm)
-    // FIXME: It would be good to replace these with explicit names for all the
-    // various suffixes supported.
-    .StartsWith("armv", Triple::arm)
-    .Case("armeb", Triple::armeb)
-    .StartsWith("armebv", Triple::armeb)
-    .Case("thumb", Triple::thumb)
-    .StartsWith("thumbv", Triple::thumb)
-    .Case("thumbeb", Triple::thumbeb)
-    .StartsWith("thumbebv", Triple::thumbeb)
-    .Case("arm64", Triple::arm64)
-    .Case("arm64_be", Triple::arm64_be)
+    .Case("xscale", Triple::arm)
+    .StartsWith("arm", parseARMArch(ArchName))
+    .StartsWith("thumb", parseARMArch(ArchName))
+    .StartsWith("aarch64", parseARMArch(ArchName))
     .Case("msp430", Triple::msp430)
     .Cases("mips", "mipseb", "mipsallegrex", Triple::mips)
     .Cases("mipsel", "mipsallegrexel", Triple::mipsel)
@@ -282,10 +294,14 @@
     .Case("nvptx", Triple::nvptx)
     .Case("nvptx64", Triple::nvptx64)
     .Case("le32", Triple::le32)
+    .Case("le64", Triple::le64)
     .Case("amdil", Triple::amdil)
+    .Case("amdil64", Triple::amdil64)
+    .Case("hsail", Triple::hsail)
+    .Case("hsail64", Triple::hsail64)
     .Case("spir", Triple::spir)
     .Case("spir64", Triple::spir64)
-    .Case("kalimba", Triple::kalimba)
+    .StartsWith("kalimba", Triple::kalimba)
     .Default(Triple::UnknownArch);
 }
 
@@ -299,6 +315,7 @@
     .Case("fsl", Triple::Freescale)
     .Case("ibm", Triple::IBM)
     .Case("img", Triple::ImaginationTechnologies)
+    .Case("mti", Triple::MipsTechnologies)
     .Case("nvidia", Triple::NVIDIA)
     .Case("csr", Triple::CSR)
     .Default(Triple::UnknownVendor);
@@ -306,8 +323,6 @@
 
 static Triple::OSType parseOS(StringRef OSName) {
   return StringSwitch<Triple::OSType>(OSName)
-    .StartsWith("auroraux", Triple::AuroraUX)
-    .StartsWith("cygwin", Triple::Cygwin)
     .StartsWith("darwin", Triple::Darwin)
     .StartsWith("dragonfly", Triple::DragonFly)
     .StartsWith("freebsd", Triple::FreeBSD)
@@ -316,7 +331,6 @@
     .StartsWith("linux", Triple::Linux)
     .StartsWith("lv2", Triple::Lv2)
     .StartsWith("macosx", Triple::MacOSX)
-    .StartsWith("mingw32", Triple::MinGW32)
     .StartsWith("netbsd", Triple::NetBSD)
     .StartsWith("openbsd", Triple::OpenBSD)
     .StartsWith("solaris", Triple::Solaris)
@@ -358,6 +372,31 @@
     .Default(Triple::UnknownObjectFormat);
 }
 
+// Derives the sub-architecture from the trailing version marker of an
+// architecture name. No listed suffix is itself a suffix of another, so the
+// order of the entries does not affect the result. Names without a listed
+// suffix yield NoSubArch.
+static Triple::SubArchType parseSubArch(StringRef SubArchName) {
+  return StringSwitch<Triple::SubArchType>(SubArchName)
+    // ARM v4
+    .EndsWith("v4t", Triple::ARMSubArch_v4t)
+    // ARM v5
+    .EndsWith("v5", Triple::ARMSubArch_v5)
+    .EndsWith("v5e", Triple::ARMSubArch_v5)
+    .EndsWith("v5t", Triple::ARMSubArch_v5)
+    .EndsWith("v5te", Triple::ARMSubArch_v5te)
+    // ARM v6
+    .EndsWith("v6", Triple::ARMSubArch_v6)
+    .EndsWith("v6m", Triple::ARMSubArch_v6m)
+    .EndsWith("v6t2", Triple::ARMSubArch_v6t2)
+    // ARM v7
+    .EndsWith("v7", Triple::ARMSubArch_v7)
+    .EndsWith("v7a", Triple::ARMSubArch_v7)
+    .EndsWith("v7l", Triple::ARMSubArch_v7)
+    .EndsWith("v7r", Triple::ARMSubArch_v7)
+    .EndsWith("v7em", Triple::ARMSubArch_v7em)
+    .EndsWith("v7m", Triple::ARMSubArch_v7m)
+    .EndsWith("v7s", Triple::ARMSubArch_v7s)
+    // ARM v8
+    .EndsWith("v8", Triple::ARMSubArch_v8)
+    .EndsWith("v8a", Triple::ARMSubArch_v8)
+    // Kalimba
+    .EndsWith("kalimba3", Triple::KalimbaSubArch_v3)
+    .EndsWith("kalimba4", Triple::KalimbaSubArch_v4)
+    .EndsWith("kalimba5", Triple::KalimbaSubArch_v5)
+    .Default(Triple::NoSubArch);
+}
+
 static const char *getObjectFormatTypeName(Triple::ObjectFormatType Kind) {
   switch (Kind) {
   case Triple::UnknownObjectFormat: return "";
@@ -383,6 +422,7 @@
 Triple::Triple(const Twine &Str)
     : Data(Str.str()),
       Arch(parseArch(getArchName())),
+      SubArch(parseSubArch(getArchName())),
       Vendor(parseVendor(getVendorName())),
       OS(parseOS(getOSName())),
       Environment(parseEnvironment(getEnvironmentName())),
@@ -400,6 +440,7 @@
 Triple::Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr)
     : Data((ArchStr + Twine('-') + VendorStr + Twine('-') + OSStr).str()),
       Arch(parseArch(ArchStr.str())),
+      SubArch(parseSubArch(ArchStr.str())),
       Vendor(parseVendor(VendorStr.str())),
       OS(parseOS(OSStr.str())),
       Environment(), ObjectFormat(Triple::UnknownObjectFormat) {
@@ -416,6 +457,7 @@
     : Data((ArchStr + Twine('-') + VendorStr + Twine('-') + OSStr + Twine('-') +
             EnvironmentStr).str()),
       Arch(parseArch(ArchStr.str())),
+      SubArch(parseSubArch(ArchStr.str())),
       Vendor(parseVendor(VendorStr.str())),
       OS(parseOS(OSStr.str())),
       Environment(parseEnvironment(EnvironmentStr.str())),
@@ -425,6 +467,9 @@
 }
 
 std::string Triple::normalize(StringRef Str) {
+  bool IsMinGW32 = false;
+  bool IsCygwin = false;
+
   // Parse into components.
   SmallVector<StringRef, 4> Components;
   Str.split(Components, "-");
@@ -441,8 +486,11 @@
   if (Components.size() > 1)
     Vendor = parseVendor(Components[1]);
   OSType OS = UnknownOS;
-  if (Components.size() > 2)
+  if (Components.size() > 2) {
     OS = parseOS(Components[2]);
+    IsCygwin = Components[2].startswith("cygwin");
+    IsMinGW32 = Components[2].startswith("mingw");
+  }
   EnvironmentType Environment = UnknownEnvironment;
   if (Components.size() > 3)
     Environment = parseEnvironment(Components[3]);
@@ -485,7 +533,9 @@
         break;
       case 2:
         OS = parseOS(Comp);
-        Valid = OS != UnknownOS;
+        IsCygwin = Comp.startswith("cygwin");
+        IsMinGW32 = Comp.startswith("mingw");
+        Valid = OS != UnknownOS || IsCygwin || IsMinGW32;
         break;
       case 3:
         Environment = parseEnvironment(Comp);
@@ -565,16 +615,16 @@
       else
         Components[3] = getObjectFormatTypeName(ObjectFormat);
     }
-  } else if (OS == Triple::MinGW32) {
+  } else if (IsMinGW32) {
     Components.resize(4);
     Components[2] = "windows";
     Components[3] = "gnu";
-  } else if (OS == Triple::Cygwin) {
+  } else if (IsCygwin) {
     Components.resize(4);
     Components[2] = "windows";
     Components[3] = "cygnus";
   }
-  if (OS == Triple::MinGW32 || OS == Triple::Cygwin ||
+  if (IsMinGW32 || IsCygwin ||
       (OS == Triple::Win32 && Environment != UnknownEnvironment)) {
     if (ObjectFormat != UnknownObjectFormat && ObjectFormat != Triple::COFF) {
       Components.resize(5);
@@ -716,7 +766,7 @@
     getOSVersion(Major, Minor, Micro);
     // Default to 5.0 (or 7.0 for arm64).
     if (Major == 0)
-      Major = (getArch() == arm64) ? 7 : 5;
+      Major = (getArch() == aarch64) ? 7 : 5;
     break;
   }
 }
@@ -789,7 +839,6 @@
   case llvm::Triple::msp430:
     return 16;
 
-  case llvm::Triple::amdil:
   case llvm::Triple::arm:
   case llvm::Triple::armeb:
   case llvm::Triple::hexagon:
@@ -805,14 +854,15 @@
   case llvm::Triple::thumbeb:
   case llvm::Triple::x86:
   case llvm::Triple::xcore:
+  case llvm::Triple::amdil:
+  case llvm::Triple::hsail:
   case llvm::Triple::spir:
   case llvm::Triple::kalimba:
     return 32;
 
-  case llvm::Triple::arm64:
-  case llvm::Triple::arm64_be:
   case llvm::Triple::aarch64:
   case llvm::Triple::aarch64_be:
+  case llvm::Triple::le64:
   case llvm::Triple::mips64:
   case llvm::Triple::mips64el:
   case llvm::Triple::nvptx64:
@@ -821,6 +871,8 @@
   case llvm::Triple::sparcv9:
   case llvm::Triple::systemz:
   case llvm::Triple::x86_64:
+  case llvm::Triple::amdil64:
+  case llvm::Triple::hsail64:
   case llvm::Triple::spir64:
     return 64;
   }
@@ -845,8 +897,6 @@
   case Triple::UnknownArch:
   case Triple::aarch64:
   case Triple::aarch64_be:
-  case Triple::arm64:
-  case Triple::arm64_be:
   case Triple::msp430:
   case Triple::systemz:
   case Triple::ppc64le:
@@ -854,6 +904,7 @@
     break;
 
   case Triple::amdil:
+  case Triple::hsail:
   case Triple::spir:
   case Triple::arm:
   case Triple::armeb:
@@ -874,12 +925,15 @@
     // Already 32-bit.
     break;
 
+  case Triple::le64:      T.setArch(Triple::le32);    break;
   case Triple::mips64:    T.setArch(Triple::mips);    break;
   case Triple::mips64el:  T.setArch(Triple::mipsel);  break;
   case Triple::nvptx64:   T.setArch(Triple::nvptx);   break;
   case Triple::ppc64:     T.setArch(Triple::ppc);     break;
   case Triple::sparcv9:   T.setArch(Triple::sparc);   break;
   case Triple::x86_64:    T.setArch(Triple::x86);     break;
+  case Triple::amdil64:   T.setArch(Triple::amdil);   break;
+  case Triple::hsail64:   T.setArch(Triple::hsail);   break;
   case Triple::spir64:    T.setArch(Triple::spir);    break;
   }
   return T;
@@ -889,12 +943,10 @@
   Triple T(*this);
   switch (getArch()) {
   case Triple::UnknownArch:
-  case Triple::amdil:
   case Triple::arm:
   case Triple::armeb:
   case Triple::hexagon:
   case Triple::kalimba:
-  case Triple::le32:
   case Triple::msp430:
   case Triple::r600:
   case Triple::tce:
@@ -906,6 +958,9 @@
 
   case Triple::aarch64:
   case Triple::aarch64_be:
+  case Triple::le64:
+  case Triple::amdil64:
+  case Triple::hsail64:
   case Triple::spir64:
   case Triple::mips64:
   case Triple::mips64el:
@@ -915,18 +970,102 @@
   case Triple::sparcv9:
   case Triple::systemz:
   case Triple::x86_64:
-  case Triple::arm64:
-  case Triple::arm64_be:
     // Already 64-bit.
     break;
 
+  case Triple::le32:    T.setArch(Triple::le64);      break;
   case Triple::mips:    T.setArch(Triple::mips64);    break;
   case Triple::mipsel:  T.setArch(Triple::mips64el);  break;
   case Triple::nvptx:   T.setArch(Triple::nvptx64);   break;
   case Triple::ppc:     T.setArch(Triple::ppc64);     break;
   case Triple::sparc:   T.setArch(Triple::sparcv9);   break;
   case Triple::x86:     T.setArch(Triple::x86_64);    break;
+  case Triple::amdil:   T.setArch(Triple::amdil64);   break;
+  case Triple::hsail:   T.setArch(Triple::hsail64);   break;
   case Triple::spir:    T.setArch(Triple::spir64);    break;
   }
   return T;
 }
+
+// FIXME: tblgen this.
+// Chooses a default CPU name for the ARM architecture string MArch, using
+// this triple's own architecture name when MArch is empty. The OS is
+// consulted first, then the version suffix of the name, and finally an
+// OS/environment dependent fallback. Never returns null.
+const char *Triple::getARMCPUForArch(StringRef MArch) const {
+  if (MArch.empty())
+    MArch = getArchName();
+
+  // Some operating systems pin the CPU before the version suffix is examined.
+  const OSType TargetOS = getOS();
+  if (TargetOS == Triple::Win32)
+    return "cortex-a9"; // FIXME: this is invalid for WindowsCE
+  if ((TargetOS == Triple::FreeBSD || TargetOS == Triple::NetBSD) &&
+      MArch == "armv6")
+    return "arm1176jzf-s";
+
+  // Strip a leading "arm"/"thumb" and an optional big-endian "eb" marker so
+  // that only the version text remains.
+  bool HasARMPrefix = true;
+  StringRef Version;
+  if (MArch.startswith("thumb"))
+    Version = MArch.substr(5);
+  else if (MArch.startswith("arm"))
+    Version = MArch.substr(3);
+  else
+    HasARMPrefix = false;
+  if (HasARMPrefix && Version.startswith("eb"))
+    Version = Version.substr(2);
+
+  const char *CPU = nullptr;
+  if (HasARMPrefix) {
+    CPU = llvm::StringSwitch<const char *>(Version)
+      .Cases("v2", "v2a", "arm2")
+      .Case("v3", "arm6")
+      .Case("v3m", "arm7m")
+      .Case("v4", "strongarm")
+      .Case("v4t", "arm7tdmi")
+      .Cases("v5", "v5t", "arm10tdmi")
+      .Cases("v5e", "v5te", "arm1022e")
+      .Case("v5tej", "arm926ej-s")
+      .Cases("v6", "v6k", "arm1136jf-s")
+      .Case("v6j", "arm1136j-s")
+      .Cases("v6z", "v6zk", "arm1176jzf-s")
+      .Case("v6t2", "arm1156t2-s")
+      .Cases("v6m", "v6-m", "cortex-m0")
+      .Cases("v7", "v7a", "v7-a", "v7l", "v7-l", "cortex-a8")
+      .Cases("v7s", "v7-s", "swift")
+      .Cases("v7r", "v7-r", "cortex-r4")
+      .Cases("v7m", "v7-m", "cortex-m3")
+      .Cases("v7em", "v7e-m", "cortex-m4")
+      .Cases("v8", "v8a", "v8-a", "cortex-a53")
+      .Default(nullptr);
+  } else {
+    // Names without an arm/thumb prefix are matched as a whole.
+    CPU = llvm::StringSwitch<const char *>(MArch)
+      .Case("ep9312", "ep9312")
+      .Case("iwmmxt", "iwmmxt")
+      .Case("xscale", "xscale")
+      .Default(nullptr);
+  }
+  if (CPU)
+    return CPU;
+
+  // If all else failed, return the most base CPU with thumb interworking
+  // supported by LLVM.
+  // FIXME: Should warn once that we're falling back.
+  const EnvironmentType Env = getEnvironment();
+  const bool IsHardFloatEABI =
+      Env == Triple::EABIHF || Env == Triple::GNUEABIHF;
+  if (TargetOS == Triple::NetBSD) {
+    const bool IsAnyEABI =
+        IsHardFloatEABI || Env == Triple::EABI || Env == Triple::GNUEABI;
+    return IsAnyEABI ? "arm926ej-s" : "strongarm";
+  }
+  return IsHardFloatEABI ? "arm1176jzf-s" : "arm7tdmi";
+}

diff --git a/lib/Support/Unix/Host.inc b/lib/Support/Unix/Host.inc
index c5d36ff..fcb3638 100644
--- a/lib/Support/Unix/Host.inc
+++ b/lib/Support/Unix/Host.inc

@@ -1,4 +1,4 @@
- //===- llvm/Support/Unix/Host.inc -------------------------------*- C++ -*-===//
+//===- llvm/Support/Unix/Host.inc -------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -22,7 +22,6 @@
 #include <sys/utsname.h>
 #include <cctype>
 #include <string>
-#include <cstdlib> // ::getenv
 
 using namespace llvm;
 
@@ -39,7 +38,8 @@
   StringRef TargetTripleString(LLVM_DEFAULT_TARGET_TRIPLE);
   std::pair<StringRef, StringRef> ArchSplit = TargetTripleString.split('-');
 
-  // Normalize the arch, since the target triple may not actually match the target.
+  // Normalize the arch, since the target triple may not actually match the
+  // target.
   std::string Arch = ArchSplit.first;
 
   std::string Triple(Arch);

diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 623547a..634d404 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc

@@ -87,22 +87,6 @@
   };
 }
 
-static std::error_code TempDir(SmallVectorImpl<char> &result) {
-  // FIXME: Don't use TMPDIR if program is SUID or SGID enabled.
-  const char *dir = nullptr;
-  (dir = std::getenv("TMPDIR")) || (dir = std::getenv("TMP")) ||
-      (dir = std::getenv("TEMP")) || (dir = std::getenv("TEMPDIR")) ||
-#ifdef P_tmpdir
-      (dir = P_tmpdir) ||
-#endif
-      (dir = "/tmp");
-
-  result.clear();
-  StringRef d(dir);
-  result.append(d.begin(), d.end());
-  return std::error_code();
-}
-
 namespace llvm {
 namespace sys  {
 namespace fs {
@@ -272,19 +256,6 @@
   return std::error_code();
 }
 
-std::error_code normalize_separators(SmallVectorImpl<char> &Path) {
-  for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
-    if (*PI == '\\') {
-      auto PN = PI + 1;
-      if (PN < PE && *PN == '\\')
-        ++PI; // increment once, the for loop will move over the escaped slash
-      else
-        *PI = '/';
-    }
-  }
-  return std::error_code();
-}
-
 // Note that we are using symbolic link because hard links are not supported by
 // all filesystems (SMB doesn't).
 std::error_code create_link(const Twine &to, const Twine &from) {
@@ -350,40 +321,37 @@
   return std::error_code();
 }
 
-std::error_code exists(const Twine &path, bool &result) {
-  SmallString<128> path_storage;
-  StringRef p = path.toNullTerminatedStringRef(path_storage);
+// Translates a portable AccessMode value into the mode argument expected by
+// ::access().
+static int convertAccessMode(AccessMode Mode) {
+  switch (Mode) {
+  case AccessMode::Exist:   return F_OK;
+  case AccessMode::Write:   return W_OK;
+  case AccessMode::Execute: return R_OK | X_OK; // scripts also need R_OK.
+  }
+  llvm_unreachable("invalid enum");
+}
 
-  if (::access(p.begin(), F_OK) == -1) {
-    if (errno != ENOENT)
-      return std::error_code(errno, std::generic_category());
-    result = false;
-  } else
-    result = true;
+// Checks whether Path can be accessed in the requested Mode. Returns a
+// default (success) error_code when it can, the errno reported by ::access()
+// when it cannot, and permission_denied when Execute is requested for
+// something that is not a regular file.
+std::error_code access(const Twine &Path, AccessMode Mode) {
+  SmallString<128> Storage;
+  const char *CPath = Path.toNullTerminatedStringRef(Storage).begin();
+
+  if (::access(CPath, convertAccessMode(Mode)) == -1)
+    return std::error_code(errno, std::generic_category());
+
+  // Don't say that directories are executable: only regular files qualify.
+  if (Mode == AccessMode::Execute) {
+    struct stat Info;
+    if (stat(CPath, &Info) != 0 || !S_ISREG(Info.st_mode))
+      return errc::permission_denied;
+  }
 
   return std::error_code();
 }
 
-bool can_write(const Twine &Path) {
-  SmallString<128> PathStorage;
-  StringRef P = Path.toNullTerminatedStringRef(PathStorage);
-  return 0 == access(P.begin(), W_OK);
-}
-
-bool can_execute(const Twine &Path) {
-  SmallString<128> PathStorage;
-  StringRef P = Path.toNullTerminatedStringRef(PathStorage);
-
-  if (0 != access(P.begin(), R_OK | X_OK))
-    return false;
-  struct stat buf;
-  if (0 != stat(P.begin(), &buf))
-    return false;
-  if (!S_ISREG(buf.st_mode))
-    return false;
-  return true;
-}
-
 bool equivalent(file_status A, file_status B) {
   assert(status_known(A) && status_known(B));
   return A.fs_st_dev == B.fs_st_dev &&
@@ -678,6 +646,66 @@
   return false;
 }
 
+// Returns the value of the first variable that is set among TMPDIR, TMP,
+// TEMP and TEMPDIR (checked in that order), or nullptr when none is set.
+static const char *getEnvTempDir() {
+  const char *Candidates[] = {"TMPDIR", "TMP", "TEMP", "TEMPDIR"};
+  const char *Found = nullptr;
+  for (const char *Name : Candidates) {
+    Found = std::getenv(Name);
+    if (Found)
+      break;
+  }
+  return Found;
+}
+
+// Returns the built-in temporary directory: P_tmpdir when the platform
+// defines it to a non-null value, otherwise "/tmp" for data that may be
+// erased on reboot and "/var/tmp" for data that should persist.
+static const char *getDefaultTempDir(bool ErasedOnReboot) {
+#ifdef P_tmpdir
+  if ((bool)P_tmpdir)
+    return P_tmpdir;
+#endif
+
+  return ErasedOnReboot ? "/tmp" : "/var/tmp";
+}
+
+// Stores the path of the system temporary directory in Result, replacing any
+// previous contents.
+//
+// ErasedOnReboot selects between a directory for short-lived data (true) and
+// one for data that should persist (false). The environment variables read by
+// getEnvTempDir() are honoured only in the ErasedOnReboot case. Where both
+// _CS_DARWIN_USER_*_DIR confstr names are defined they are tried next, and
+// getDefaultTempDir() is the final fallback.
+void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) {
+  Result.clear();
+
+  if (ErasedOnReboot) {
+    // There is no env variable for the cache directory.
+    if (const char *RequestedDir = getEnvTempDir()) {
+      Result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+      return;
+    }
+  }
+
+#if defined(_CS_DARWIN_USER_TEMP_DIR) && defined(_CS_DARWIN_USER_CACHE_DIR)
+  // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR.
+  // macros defined in <unistd.h> on darwin >= 9
+  int ConfName = ErasedOnReboot? _CS_DARWIN_USER_TEMP_DIR
+                               : _CS_DARWIN_USER_CACHE_DIR;
+  // A null buffer of size 0 asks confstr only for the required length.
+  size_t ConfLen = confstr(ConfName, nullptr, 0);
+  if (ConfLen > 0) {
+    // Resize and re-query until the reported length equals the buffer size,
+    // i.e. the whole value (including its terminator) fitted.
+    do {
+      Result.resize(ConfLen);
+      ConfLen = confstr(ConfName, Result.data(), Result.size());
+    } while (ConfLen > 0 && ConfLen != Result.size());
+
+    if (ConfLen > 0) {
+      // Drop the trailing NUL so Result holds just the path characters.
+      assert(Result.back() == 0);
+      Result.pop_back();
+      return;
+    }
+
+    // confstr failed on a later call; discard the partial buffer and fall
+    // through to the default directory.
+    Result.clear();
+  }
+#endif
+
+  const char *RequestedDir = getDefaultTempDir(ErasedOnReboot);
+  Result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+}
+
 } // end namespace path
 
 } // end namespace sys

diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index d2c5dbc..a429bb3 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc

@@ -14,15 +14,25 @@
 #include "Unix.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/MutexGuard.h"
 #include "llvm/Support/TimeValue.h"
+#if HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
 #ifdef HAVE_SYS_TIME_H
 #include <sys/time.h>
 #endif
 #ifdef HAVE_SYS_RESOURCE_H
 #include <sys/resource.h>
 #endif
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_SIGNAL_H
+#include <signal.h>
+#endif
 // DragonFlyBSD, OpenBSD, and Bitrig have deprecated <malloc.h> for
 // <stdlib.h> instead. Unix.h includes this for us already.
 #if defined(HAVE_MALLOC_H) && !defined(__DragonFly__) && \
@@ -198,6 +208,97 @@
   return std::error_code();
 }
 
+namespace {
+// Scope guard for a file descriptor held by reference: on destruction it
+// closes the descriptor the referenced int holds at that moment, unless that
+// value is negative or keepOpen() was called. Not copyable.
+class FDCloser {
+  int &FD;
+  bool KeepOpen;
+
+  FDCloser(const FDCloser &) LLVM_DELETED_FUNCTION;
+  void operator=(const FDCloser &) LLVM_DELETED_FUNCTION;
+
+public:
+  FDCloser(int &Descriptor) : FD(Descriptor), KeepOpen(false) {}
+
+  // Disarms the guard so the destructor leaves the descriptor open.
+  void keepOpen() { KeepOpen = true; }
+
+  ~FDCloser() {
+    if (KeepOpen || FD < 0)
+      return;
+    ::close(FD);
+  }
+};
+}
+
+// Makes sure stdin, stdout and stderr all refer to open descriptors. Each one
+// that fstat reports as closed (EBADF) is redirected to /dev/null, which is
+// opened at most once. Returns the errno of the first unexpected fstat, open
+// or dup2 failure, or a default (success) error_code.
+std::error_code Process::FixupStandardFileDescriptors() {
+  int NullFD = -1;
+  // Closes NullFD on every exit path unless it ended up being a standard FD.
+  FDCloser FDC(NullFD);
+  const int StandardFDs[] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO};
+  for (int StandardFD : StandardFDs) {
+    struct stat st;
+    // Track the outcome explicitly: a successful call does not reset errno,
+    // so after an EINTR retry errno alone cannot distinguish "fstat finally
+    // succeeded" from "descriptor is closed".
+    bool IsClosed = false;
+    while (fstat(StandardFD, &st) < 0) {
+      assert(errno && "expected errno to be set if fstat failed!");
+      // fstat should return EBADF if the file descriptor is closed.
+      if (errno == EBADF) {
+        IsClosed = true;
+        break;
+      }
+      // retry fstat if we got EINTR, otherwise bubble up the failure.
+      if (errno != EINTR)
+        return std::error_code(errno, std::generic_category());
+    }
+    // if fstat succeeds, move on to the next FD.
+    if (!IsClosed)
+      continue;
+
+    if (NullFD < 0) {
+      while ((NullFD = open("/dev/null", O_RDWR)) < 0) {
+        if (errno == EINTR)
+          continue;
+        return std::error_code(errno, std::generic_category());
+      }
+    }
+
+    // open() may have returned the closed standard descriptor itself; in that
+    // case it is already in place and must not be closed by the guard.
+    if (NullFD == StandardFD)
+      FDC.keepOpen();
+    else if (dup2(NullFD, StandardFD) < 0)
+      return std::error_code(errno, std::generic_category());
+  }
+  return std::error_code();
+}
+
+// Closes FD while every signal is blocked for the caller, then restores the
+// previous signal mask. With LLVM_ENABLE_THREADS the mask is changed through
+// pthread_sigmask, otherwise through sigprocmask.
+//
+// Returns the error from close() if it failed; otherwise the error (if any)
+// from restoring the mask; otherwise a default (success) error_code. A failure
+// to build or install the full mask is returned before close() is attempted.
+std::error_code Process::SafelyCloseFileDescriptor(int FD) {
+  // Create a signal set filled with *all* signals.
+  sigset_t FullSet;
+  if (sigfillset(&FullSet) < 0)
+    return std::error_code(errno, std::generic_category());
+  // Atomically swap our current signal mask with a full mask.
+  sigset_t SavedSet;
+#if LLVM_ENABLE_THREADS
+  // pthread_sigmask returns the error number directly instead of using errno.
+  if (int EC = pthread_sigmask(SIG_SETMASK, &FullSet, &SavedSet))
+    return std::error_code(EC, std::generic_category());
+#else
+  if (sigprocmask(SIG_SETMASK, &FullSet, &SavedSet) < 0)
+    return std::error_code(errno, std::generic_category());
+#endif
+  // Attempt to close the file descriptor.
+  // We need to save the error, if one occurs, because our subsequent call to
+  // pthread_sigmask might tamper with errno.
+  int ErrnoFromClose = 0;
+  if (::close(FD) < 0)
+    ErrnoFromClose = errno;
+  // Restore the signal mask back to what we saved earlier.
+  int EC = 0;
+#if LLVM_ENABLE_THREADS
+  EC = pthread_sigmask(SIG_SETMASK, &SavedSet, nullptr);
+#else
+  if (sigprocmask(SIG_SETMASK, &SavedSet, nullptr) < 0)
+    EC = errno;
+#endif
+  // The error code from close takes precedence over the one from
+  // pthread_sigmask.
+  if (ErrnoFromClose)
+    return std::error_code(ErrnoFromClose, std::generic_category());
+  // EC is 0 when the mask was restored cleanly, which yields success.
+  return std::error_code(EC, std::generic_category());
+}
+
 bool Process::StandardInIsUserInput() {
   return FileDescriptorIsDisplayed(STDIN_FILENO);
 }
@@ -263,11 +364,14 @@
 extern "C" int tigetnum(char *capname);
 #endif
 
+#ifdef HAVE_TERMINFO
+static ManagedStatic<sys::Mutex> TermColorMutex;
+#endif
+
 static bool terminalHasColors(int fd) {
 #ifdef HAVE_TERMINFO
   // First, acquire a global lock because these C routines are thread hostile.
-  static sys::Mutex M;
-  MutexGuard G(M);
+  MutexGuard G(*TermColorMutex);
 
   int errret = 0;
   if (setupterm((char *)nullptr, fd, &errret) != 0)

diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc
index 06a33cd..0670ad3 100644
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc

@@ -17,8 +17,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "Unix.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
 #include <llvm/Config/config.h>
 #if HAVE_SYS_STAT_H
 #include <sys/stat.h>
@@ -53,50 +55,31 @@
 
 ProcessInfo::ProcessInfo() : Pid(0), ReturnCode(0) {}
 
-// This function just uses the PATH environment variable to find the program.
-std::string
-sys::FindProgramByName(const std::string& progName) {
-
-  // Check some degenerate cases
-  if (progName.length() == 0) // no program
-    return "";
-  std::string temp = progName;
+// Locates the executable Name.
+//
+// A Name containing a slash is returned verbatim. Otherwise each non-empty
+// directory in Paths is searched in order; when Paths is empty the
+// colon-separated directories of the PATH environment variable are used
+// instead. Returns the first candidate that can_execute() accepts, or
+// no_such_file_or_directory when there is none (including when PATH is unset
+// or empty). Name must not be empty.
+ErrorOr<std::string> sys::findProgramByName(StringRef Name,
+                                            ArrayRef<StringRef> Paths) {
+  assert(!Name.empty() && "Must have a name!");
   // Use the given path verbatim if it contains any slashes; this matches
   // the behavior of sh(1) and friends.
-  if (progName.find('/') != std::string::npos)
-    return temp;
+  if (Name.find('/') != StringRef::npos)
+    return std::string(Name);
 
-  // At this point, the file name is valid and does not contain slashes. Search
-  // for it through the directories specified in the PATH environment variable.
+  // Fall back to $PATH when the caller supplied no directories. getenv()
+  // returns null when PATH is unset, and PATH may also be empty; in both
+  // cases the search list simply stays empty and the loop below finds
+  // nothing, instead of dereferencing null or recursing without end.
+  SmallVector<StringRef, 16> EnvironmentPaths;
+  if (Paths.empty())
+    if (const char *PathEnv = std::getenv("PATH")) {
+      SplitString(PathEnv, EnvironmentPaths, ":");
+      Paths = EnvironmentPaths;
+    }
 
-  // Get the path. If its empty, we can't do anything to find it.
-  const char *PathStr = getenv("PATH");
-  if (!PathStr)
-    return "";
-
-  // Now we have a colon separated list of directories to search; try them.
-  size_t PathLen = strlen(PathStr);
-  while (PathLen) {
-    // Find the first colon...
-    const char *Colon = std::find(PathStr, PathStr+PathLen, ':');
+  for (auto Path : Paths) {
+    if (Path.empty())
+      continue;
 
     // Check to see if this first directory contains the executable...
-    SmallString<128> FilePath(PathStr,Colon);
-    sys::path::append(FilePath, progName);
-    if (sys::fs::can_execute(Twine(FilePath)))
-      return FilePath.str();                    // Found the executable!
-
-    // Nope it wasn't in this directory, check the next path in the list!
-    PathLen -= Colon-PathStr;
-    PathStr = Colon;
-
-    // Advance past duplicate colons
-    while (*PathStr == ':') {
-      PathStr++;
-      PathLen--;
-    }
+    SmallString<128> FilePath(Path);
+    sys::path::append(FilePath, Name);
+    if (sys::fs::can_execute(FilePath.c_str()))
+      return std::string(FilePath.str()); // Found the executable!
   }
-  return "";
+  return std::errc::no_such_file_or_directory;
 }
 
 static bool RedirectIO(const StringRef *Path, int FD, std::string* ErrMsg) {
@@ -334,7 +317,6 @@
   pid_t ChildPid = PI.Pid;
   if (WaitUntilTerminates) {
     SecondsToWait = 0;
-    ChildPid = -1; // mimic a wait() using waitpid()
   } else if (SecondsToWait) {
     // Install a timeout handler.  The handler itself does nothing, but the
     // simple fact of having a handler at all causes the wait below to return
@@ -440,6 +422,23 @@
     return std::error_code();
 }
 
+// Writes Contents to the file FileName, opened in text mode. Encoding is
+// accepted for interface parity with other platforms and is ignored here.
+// Returns the error from opening the file, io_error when writing the data
+// fails, or a default (success) error_code.
+std::error_code
+llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
+                                 WindowsEncodingMethod Encoding /*unused*/) {
+  std::error_code EC;
+  llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text);
+
+  if (EC)
+    return EC;
+
+  OS << Contents;
+  // The stream is buffered and has_error() does not flush, so push the data
+  // out first; otherwise a failed write would go unnoticed here.
+  OS.flush();
+
+  if (OS.has_error()) {
+    // Clear the flag before returning: a raw_fd_ostream destroyed with a
+    // pending error reports a fatal error instead of letting us return one.
+    OS.clear_error();
+    return std::make_error_code(std::errc::io_error);
+  }
+
+  return EC;
+}
+
 bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
   static long ArgMax = sysconf(_SC_ARG_MAX);
 
@@ -448,13 +447,13 @@
     return true;
 
   // Conservatively account for space required by environment variables.
-  ArgMax /= 2;
+  long HalfArgMax = ArgMax / 2;
 
   size_t ArgLength = 0;
   for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end();
        I != E; ++I) {
     ArgLength += strlen(*I) + 1;
-    if (ArgLength > size_t(ArgMax)) {
+    if (ArgLength > size_t(HalfArgMax)) {
       return false;
     }
   }

diff --git a/lib/Support/Unix/RWMutex.inc b/lib/Support/Unix/RWMutex.inc
index edcbd52..85a1043 100644
--- a/lib/Support/Unix/RWMutex.inc
+++ b/lib/Support/Unix/RWMutex.inc

@@ -26,26 +26,26 @@
 // will therefore deadlock if a thread tries to acquire a read lock
 // multiple times.
 
-RWMutexImpl::RWMutexImpl() : data_(new Mutex(false)) { }
+// data_ owns a heap-allocated MutexImpl that backs both the reader and the
+// writer side. NOTE(review): the 'false' argument presumably requests a
+// non-recursive mutex - confirm against MutexImpl's constructor.
+RWMutexImpl::RWMutexImpl() : data_(new MutexImpl(false)) { }
 
+// Destroys the MutexImpl allocated by the constructor.
 RWMutexImpl::~RWMutexImpl() {
-  delete static_cast<Mutex *>(data_);
+  delete static_cast<MutexImpl *>(data_);
 }
 
+// Reader lock: forwards to acquire() on the single underlying mutex.
 bool RWMutexImpl::reader_acquire() {
-  return static_cast<Mutex *>(data_)->acquire();
+  return static_cast<MutexImpl *>(data_)->acquire();
 }
 
+// Reader unlock: forwards to release() on the underlying mutex.
 bool RWMutexImpl::reader_release() {
-  return static_cast<Mutex *>(data_)->release();
+  return static_cast<MutexImpl *>(data_)->release();
 }
 
+// Writer lock: forwards to the same acquire() call that readers use.
 bool RWMutexImpl::writer_acquire() {
-  return static_cast<Mutex *>(data_)->acquire();
+  return static_cast<MutexImpl *>(data_)->acquire();
 }
 
+// Writer unlock: forwards to release() on the underlying mutex.
 bool RWMutexImpl::writer_release() {
-  return static_cast<Mutex *>(data_)->release();
+  return static_cast<MutexImpl *>(data_)->release();
 }
 
 }

diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 1841fea..e8f4643 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc

@@ -14,7 +14,14 @@
 
 #include "Unix.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Mutex.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/UniqueLock.h"
+#include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <string>
 #include <vector>
@@ -36,18 +43,22 @@
 #if HAVE_MACH_MACH_H
 #include <mach/mach.h>
 #endif
+#if HAVE_LINK_H
+#include <link.h>
+#endif
 
 using namespace llvm;
 
 static RETSIGTYPE SignalHandler(int Sig);  // defined below.
 
-static SmartMutex<true> SignalsMutex;
+static ManagedStatic<SmartMutex<true> > SignalsMutex;
 
 /// InterruptFunction - The function to call if ctrl-c is pressed.
 static void (*InterruptFunction)() = nullptr;
 
-static std::vector<std::string> FilesToRemove;
-static std::vector<std::pair<void(*)(void*), void*> > CallBacksToRun;
+static ManagedStatic<std::vector<std::string>> FilesToRemove;
+static ManagedStatic<std::vector<std::pair<void (*)(void *), void *>>>
+    CallBacksToRun;
 
 // IntSigs - Signals that represent requested termination. There's no bug
 // or failure, or if there is, it's not our direct responsibility. For whatever
@@ -55,7 +66,6 @@
 static const int IntSigs[] = {
   SIGHUP, SIGINT, SIGPIPE, SIGTERM, SIGUSR1, SIGUSR2
 };
-static const int *const IntSigsEnd = std::end(IntSigs);
 
 // KillSigs - Signals that represent that we have a bug, and our prompt
 // termination has been ordered.
@@ -74,7 +84,6 @@
   , SIGEMT
 #endif
 };
-static const int *const KillSigsEnd = std::end(KillSigs);
 
 static unsigned NumRegisteredSignals = 0;
 static struct {
@@ -105,8 +114,8 @@
   // If the handlers are already registered, we're done.
   if (NumRegisteredSignals != 0) return;
 
-  std::for_each(IntSigs, IntSigsEnd, RegisterHandler);
-  std::for_each(KillSigs, KillSigsEnd, RegisterHandler);
+  for (auto S : IntSigs) RegisterHandler(S);
+  for (auto S : KillSigs) RegisterHandler(S);
 }
 
 static void UnregisterHandlers() {
@@ -125,11 +134,12 @@
 static void RemoveFilesToRemove() {
   // We avoid iterators in case of debug iterators that allocate or release
   // memory.
-  for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) {
+  std::vector<std::string>& FilesToRemoveRef = *FilesToRemove;
+  for (unsigned i = 0, e = FilesToRemoveRef.size(); i != e; ++i) {
     // We rely on a std::string implementation for which repeated calls to
     // 'c_str()' don't allocate memory. We pre-call 'c_str()' on all of these
     // strings to try to ensure this is safe.
-    const char *path = FilesToRemove[i].c_str();
+    const char *path = FilesToRemoveRef[i].c_str();
 
     // Get the status so we can determine if it's a file or directory. If we
     // can't stat the file, ignore it.
@@ -162,28 +172,31 @@
   sigfillset(&SigMask);
   sigprocmask(SIG_UNBLOCK, &SigMask, nullptr);
 
-  SignalsMutex.acquire();
-  RemoveFilesToRemove();
+  {
+    unique_lock<SmartMutex<true>> Guard(*SignalsMutex);
+    RemoveFilesToRemove();
 
-  if (std::find(IntSigs, IntSigsEnd, Sig) != IntSigsEnd) {
-    if (InterruptFunction) {
-      void (*IF)() = InterruptFunction;
-      SignalsMutex.release();
-      InterruptFunction = nullptr;
-      IF();        // run the interrupt function.
+    if (std::find(std::begin(IntSigs), std::end(IntSigs), Sig)
+        != std::end(IntSigs)) {
+      if (InterruptFunction) {
+        void (*IF)() = InterruptFunction;
+        Guard.unlock();
+        InterruptFunction = nullptr;
+        IF();        // run the interrupt function.
+        return;
+      }
+
+      Guard.unlock();
+      raise(Sig);   // Execute the default handler.
       return;
-    }
-
-    SignalsMutex.release();
-    raise(Sig);   // Execute the default handler.
-    return;
+   }
   }
 
-  SignalsMutex.release();
-
   // Otherwise if it is a fault (like SEGV) run any handler.
-  for (unsigned i = 0, e = CallBacksToRun.size(); i != e; ++i)
-    CallBacksToRun[i].first(CallBacksToRun[i].second);
+  std::vector<std::pair<void (*)(void *), void *>>& CallBacksToRunRef =
+      *CallBacksToRun;
+  for (unsigned i = 0, e = CallBacksToRun->size(); i != e; ++i)
+    CallBacksToRunRef[i].first(CallBacksToRunRef[i].second);
 
 #ifdef __s390__
   // On S/390, certain signals are delivered with PSW Address pointing to
@@ -196,37 +209,39 @@
 }
 
 void llvm::sys::RunInterruptHandlers() {
-  SignalsMutex.acquire();
+  sys::SmartScopedLock<true> Guard(*SignalsMutex);
   RemoveFilesToRemove();
-  SignalsMutex.release();
 }
 
 void llvm::sys::SetInterruptFunction(void (*IF)()) {
-  SignalsMutex.acquire();
-  InterruptFunction = IF;
-  SignalsMutex.release();
+  {
+    sys::SmartScopedLock<true> Guard(*SignalsMutex);
+    InterruptFunction = IF;
+  }
   RegisterHandlers();
 }
 
 // RemoveFileOnSignal - The public API
 bool llvm::sys::RemoveFileOnSignal(StringRef Filename,
                                    std::string* ErrMsg) {
-  SignalsMutex.acquire();
-  std::string *OldPtr = FilesToRemove.empty() ? nullptr : &FilesToRemove[0];
-  FilesToRemove.push_back(Filename);
+  {
+    sys::SmartScopedLock<true> Guard(*SignalsMutex);
+    std::vector<std::string>& FilesToRemoveRef = *FilesToRemove;
+    std::string *OldPtr =
+        FilesToRemoveRef.empty() ? nullptr : &FilesToRemoveRef[0];
+    FilesToRemoveRef.push_back(Filename);
 
-  // We want to call 'c_str()' on every std::string in this vector so that if
-  // the underlying implementation requires a re-allocation, it happens here
-  // rather than inside of the signal handler. If we see the vector grow, we
-  // have to call it on every entry. If it remains in place, we only need to
-  // call it on the latest one.
-  if (OldPtr == &FilesToRemove[0])
-    FilesToRemove.back().c_str();
-  else
-    for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i)
-      FilesToRemove[i].c_str();
-
-  SignalsMutex.release();
+    // We want to call 'c_str()' on every std::string in this vector so that if
+    // the underlying implementation requires a re-allocation, it happens here
+    // rather than inside of the signal handler. If we see the vector grow, we
+    // have to call it on every entry. If it remains in place, we only need to
+    // call it on the latest one.
+    if (OldPtr == &FilesToRemoveRef[0])
+      FilesToRemoveRef.back().c_str();
+    else
+      for (unsigned i = 0, e = FilesToRemoveRef.size(); i != e; ++i)
+        FilesToRemoveRef[i].c_str();
+  }
 
   RegisterHandlers();
   return false;
@@ -234,31 +249,166 @@
 
 // DontRemoveFileOnSignal - The public API
 void llvm::sys::DontRemoveFileOnSignal(StringRef Filename) {
-  SignalsMutex.acquire();
+  sys::SmartScopedLock<true> Guard(*SignalsMutex);
   std::vector<std::string>::reverse_iterator RI =
-    std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename);
-  std::vector<std::string>::iterator I = FilesToRemove.end();
-  if (RI != FilesToRemove.rend())
-    I = FilesToRemove.erase(RI.base()-1);
+    std::find(FilesToRemove->rbegin(), FilesToRemove->rend(), Filename);
+  std::vector<std::string>::iterator I = FilesToRemove->end();
+  if (RI != FilesToRemove->rend())
+    I = FilesToRemove->erase(RI.base()-1);
 
   // We need to call c_str() on every element which would have been moved by
   // the erase. These elements, in a C++98 implementation where c_str()
   // requires a reallocation on the first call may have had the call to c_str()
   // made on insertion become invalid by being copied down an element.
-  for (std::vector<std::string>::iterator E = FilesToRemove.end(); I != E; ++I)
+  for (std::vector<std::string>::iterator E = FilesToRemove->end(); I != E; ++I)
     I->c_str();
-
-  SignalsMutex.release();
 }
 
 /// AddSignalHandler - Add a function to be called when a signal is delivered
 /// to the process.  The handler can have a cookie passed to it to identify
 /// what instance of the handler it is.
 void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
-  CallBacksToRun.push_back(std::make_pair(FnPtr, Cookie));
+  CallBacksToRun->push_back(std::make_pair(FnPtr, Cookie));
   RegisterHandlers();
 }
 
+#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
+
+#if HAVE_LINK_H && (defined(__linux__) || defined(__FreeBSD__) ||              \
+                    defined(__FreeBSD_kernel__) || defined(__NetBSD__))
+struct DlIteratePhdrData {
+  void **StackTrace;
+  int depth;
+  bool first;
+  const char **modules;
+  intptr_t *offsets;
+  const char *main_exec_name;
+};
+
+static int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *arg) {
+  DlIteratePhdrData *data = (DlIteratePhdrData*)arg;
+  const char *name = data->first ? data->main_exec_name : info->dlpi_name;
+  data->first = false;
+  for (int i = 0; i < info->dlpi_phnum; i++) {
+    const auto *phdr = &info->dlpi_phdr[i];
+    if (phdr->p_type != PT_LOAD)
+      continue;
+    intptr_t beg = info->dlpi_addr + phdr->p_vaddr;
+    intptr_t end = beg + phdr->p_memsz;
+    for (int j = 0; j < data->depth; j++) {
+      if (data->modules[j])
+        continue;
+      intptr_t addr = (intptr_t)data->StackTrace[j];
+      if (beg <= addr && addr < end) {
+        data->modules[j] = name;
+        data->offsets[j] = addr - info->dlpi_addr;
+      }
+    }
+  }
+  return 0;
+}
+
+static bool findModulesAndOffsets(void **StackTrace, int Depth,
+                                  const char **Modules, intptr_t *Offsets,
+                                  const char *MainExecutableName) {
+  DlIteratePhdrData data = {StackTrace, Depth,   true,
+                            Modules,    Offsets, MainExecutableName};
+  dl_iterate_phdr(dl_iterate_phdr_cb, &data);
+  return true;
+}
+#else
+static bool findModulesAndOffsets(void **StackTrace, int Depth,
+                                  const char **Modules, intptr_t *Offsets,
+                                  const char *MainExecutableName) {
+  return false;
+}
+#endif
+
+static bool printSymbolizedStackTrace(void **StackTrace, int Depth, FILE *FD) {
+  // FIXME: Subtract necessary number from StackTrace entries to turn return addresses
+  // into actual instruction addresses.
+  // Use llvm-symbolizer tool to symbolize the stack traces.
+  ErrorOr<std::string> LLVMSymbolizerPathOrErr =
+      sys::findProgramByName("llvm-symbolizer");
+  if (!LLVMSymbolizerPathOrErr)
+    return false;
+  const std::string &LLVMSymbolizerPath = *LLVMSymbolizerPathOrErr;
+  // We don't know argv0 or the address of main() at this point, but try
+  // to guess it anyway (it's possible on some platforms).
+  std::string MainExecutableName = sys::fs::getMainExecutable(nullptr, nullptr);
+  if (MainExecutableName.empty() ||
+      MainExecutableName.find("llvm-symbolizer") != std::string::npos)
+    return false;
+
+  std::vector<const char *> Modules(Depth, nullptr);
+  std::vector<intptr_t> Offsets(Depth, 0);
+  if (!findModulesAndOffsets(StackTrace, Depth, Modules.data(), Offsets.data(),
+                             MainExecutableName.c_str()))
+    return false;
+  int InputFD;
+  SmallString<32> InputFile, OutputFile;
+  sys::fs::createTemporaryFile("symbolizer-input", "", InputFD, InputFile);
+  sys::fs::createTemporaryFile("symbolizer-output", "", OutputFile);
+  FileRemover InputRemover(InputFile.c_str());
+  FileRemover OutputRemover(OutputFile.c_str());
+
+  {
+    raw_fd_ostream Input(InputFD, true);
+    for (int i = 0; i < Depth; i++) {
+      if (Modules[i])
+        Input << Modules[i] << " " << (void*)Offsets[i] << "\n";
+    }
+  }
+
+  StringRef InputFileStr(InputFile);
+  StringRef OutputFileStr(OutputFile);
+  StringRef StderrFileStr;
+  const StringRef *Redirects[] = {&InputFileStr, &OutputFileStr,
+                                  &StderrFileStr};
+  const char *Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining",
+                        "--demangle", nullptr};
+  int RunResult =
+      sys::ExecuteAndWait(LLVMSymbolizerPath, Args, nullptr, Redirects);
+  if (RunResult != 0)
+    return false;
+
+  auto OutputBuf = MemoryBuffer::getFile(OutputFile.c_str());
+  if (!OutputBuf)
+    return false;
+  StringRef Output = OutputBuf.get()->getBuffer();
+  SmallVector<StringRef, 32> Lines;
+  Output.split(Lines, "\n");
+  auto CurLine = Lines.begin();
+  int frame_no = 0;
+  for (int i = 0; i < Depth; i++) {
+    if (!Modules[i]) {
+      fprintf(FD, "#%d %p\n", frame_no++, StackTrace[i]);
+      continue;
+    }
+    // Read pairs of lines (function name and file/line info) until we
+    // encounter an empty line.
+    for (;;) {
+      if (CurLine == Lines.end())
+        return false;
+      StringRef FunctionName = *CurLine++;
+      if (FunctionName.empty())
+        break;
+      fprintf(FD, "#%d %p ", frame_no++, StackTrace[i]);
+      if (!FunctionName.startswith("??"))
+        fprintf(FD, "%s ", FunctionName.str().c_str());
+      if (CurLine == Lines.end())
+        return false;
+      StringRef FileLineInfo = *CurLine++;
+      if (!FileLineInfo.startswith("??"))
+        fprintf(FD, "%s", FileLineInfo.str().c_str());
+      else
+        fprintf(FD, "(%s+%p)", Modules[i], (void *)Offsets[i]);
+      fprintf(FD, "\n");
+    }
+  }
+  return true;
+}
+#endif // defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
 
 // PrintStackTrace - In the case of a program crash or fault, print out a stack
 // trace so that the user has an indication of why and where we died.
@@ -271,6 +421,8 @@
   // Use backtrace() to output a backtrace on Linux systems with glibc.
   int depth = backtrace(StackTrace,
                         static_cast<int>(array_lengthof(StackTrace)));
+  if (printSymbolizedStackTrace(StackTrace, depth, FD))
+    return;
 #if HAVE_DLFCN_H && __GNUG__
   int width = 0;
   for (int i = 0; i < depth; ++i) {

diff --git a/lib/Support/Unix/TimeValue.inc b/lib/Support/Unix/TimeValue.inc
index 7d4acf7..042e0da 100644
--- a/lib/Support/Unix/TimeValue.inc
+++ b/lib/Support/Unix/TimeValue.inc

@@ -41,7 +41,7 @@
     // errors concern the timezone parameter which we're passing in as 0.
     // In the unlikely case it does happen, just return MinTime, no error
     // message needed.
-    return MinTime;
+    return MinTime();
   }
 
   return TimeValue(

diff --git a/lib/Support/Unix/Unix.h b/lib/Support/Unix/Unix.h
index ba688e3..e16a226 100644
--- a/lib/Support/Unix/Unix.h
+++ b/lib/Support/Unix/Unix.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SYSTEM_UNIX_UNIX_H
-#define LLVM_SYSTEM_UNIX_UNIX_H
+#ifndef LLVM_LIB_SUPPORT_UNIX_UNIX_H
+#define LLVM_LIB_SUPPORT_UNIX_UNIX_H
 
 //===----------------------------------------------------------------------===//
 //=== WARNING: Implementation here must contain only generic UNIX code that

diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 5ed0b70..79d5f79 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc

@@ -41,32 +41,11 @@
 
 static DenseSet<HMODULE> *OpenedHandles;
 
-extern "C" {
-
-  static BOOL CALLBACK ELM_Callback(WIN32_ELMCB_PCSTR ModuleName,
-                                    ULONG_PTR ModuleBase,
-                                    ULONG ModuleSize,
-                                    PVOID UserContext)
-  {
-    // Ignore VC++ runtimes prior to 7.1.  Somehow some of them get loaded
-    // into the process.
-    if (stricmp(ModuleName, "msvci70") != 0 &&
-        stricmp(ModuleName, "msvcirt") != 0 &&
-        stricmp(ModuleName, "msvcp50") != 0 &&
-        stricmp(ModuleName, "msvcp60") != 0 &&
-        stricmp(ModuleName, "msvcp70") != 0 &&
-        stricmp(ModuleName, "msvcr70") != 0 &&
-#ifndef __MINGW32__
-        // Mingw32 uses msvcrt.dll by default. Don't ignore it.
-        // Otherwise the user should be aware what they are doing.
-        stricmp(ModuleName, "msvcrt") != 0 &&
-#endif
-        stricmp(ModuleName, "msvcrt20") != 0 &&
-        stricmp(ModuleName, "msvcrt40") != 0) {
-      OpenedHandles->insert((HMODULE)ModuleBase);
-    }
-    return TRUE;
-  }
+static BOOL CALLBACK
+ELM_Callback(WIN32_ELMCB_PCSTR ModuleName, ULONG_PTR ModuleBase,
+             ULONG ModuleSize, PVOID UserContext) {
+  OpenedHandles->insert((HMODULE)ModuleBase);
+  return TRUE;
 }
 
 DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
@@ -115,10 +94,24 @@
   extern "C" { extern void *SYM; }
 #define EXPLICIT_SYMBOL2(SYMFROM, SYMTO) EXPLICIT_SYMBOL(SYMTO)
 
+#ifdef _M_IX86
+// Win32 on x86 implements certain single-precision math functions as macros.
+// These functions are not exported by the DLL, but will still be needed
+// for symbol-resolution by the JIT loader. Therefore, this Support library
+// provides helper functions with the same implementation.
+
+#define INLINE_DEF_SYMBOL1(TYP, SYM)                                           \
+  extern "C" TYP inline_##SYM(TYP _X) { return SYM(_X); }
+#define INLINE_DEF_SYMBOL2(TYP, SYM)                                           \
+  extern "C" TYP inline_##SYM(TYP _X, TYP _Y) { return SYM(_X, _Y); }
+#endif
+
 #include "explicit_symbols.inc"
 
 #undef EXPLICIT_SYMBOL
 #undef EXPLICIT_SYMBOL2
+#undef INLINE_DEF_SYMBOL1
+#undef INLINE_DEF_SYMBOL2
 
 void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
   SmartScopedLock<true> Lock(*SymbolsMutex);
@@ -142,22 +135,32 @@
     }
   }
 
-  #define EXPLICIT_SYMBOL(SYM)                    \
-    if (!strcmp(symbolName, #SYM)) return (void*)&SYM;
-  #define EXPLICIT_SYMBOL2(SYMFROM, SYMTO)        \
-    if (!strcmp(symbolName, #SYMFROM)) return (void*)&SYMTO;
+#define EXPLICIT_SYMBOL(SYM)                                                   \
+  if (!strcmp(symbolName, #SYM))                                               \
+    return (void *)&SYM;
+#define EXPLICIT_SYMBOL2(SYMFROM, SYMTO)                                       \
+  if (!strcmp(symbolName, #SYMFROM))                                           \
+    return (void *)&SYMTO;
+
+#ifdef _M_IX86
+#define INLINE_DEF_SYMBOL1(TYP, SYM)                                           \
+  if (!strcmp(symbolName, #SYM))                                               \
+    return (void *)&inline_##SYM;
+#define INLINE_DEF_SYMBOL2(TYP, SYM) INLINE_DEF_SYMBOL1(TYP, SYM)
+#endif
 
   {
-    #include "explicit_symbols.inc"
+#include "explicit_symbols.inc"
   }
 
-  #undef EXPLICIT_SYMBOL
-  #undef EXPLICIT_SYMBOL2
+#undef EXPLICIT_SYMBOL
+#undef EXPLICIT_SYMBOL2
+#undef INLINE_DEF_SYMBOL1
+#undef INLINE_DEF_SYMBOL2
 
   return 0;
 }
 
-
 void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
   if (!isValid())
     return NULL;
@@ -166,5 +169,4 @@
   return (void *)(intptr_t)GetProcAddress((HMODULE)Data, symbolName);
 }
 
-
 }

diff --git a/lib/Support/Windows/Host.inc b/lib/Support/Windows/Host.inc
index 0c02bf9..fe89fe0a 100644
--- a/lib/Support/Windows/Host.inc
+++ b/lib/Support/Windows/Host.inc

@@ -1,4 +1,4 @@
-//===- llvm/Support/Win32/Host.inc -------------------------------*- C++ -*-===//
+//===- llvm/Support/Win32/Host.inc ------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //

diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index 7a1bc04..365031c 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc

@@ -49,23 +49,6 @@
   return mapWindowsError(E);
 }
 
-static std::error_code TempDir(SmallVectorImpl<char> &Result) {
-  SmallVector<wchar_t, 64> Res;
-retry_temp_dir:
-  DWORD Len = ::GetTempPathW(Res.capacity(), Res.begin());
-
-  if (Len == 0)
-    return windows_error(::GetLastError());
-
-  if (Len > Res.capacity()) {
-    Res.reserve(Len);
-    goto retry_temp_dir;
-  }
-
-  Res.set_size(Len);
-  return UTF16ToUTF8(Res.begin(), Res.size(), Result);
-}
-
 static bool is_separator(const wchar_t value) {
   switch (value) {
   case L'\\':
@@ -76,6 +59,59 @@
   }
 }
 
+// Convert a UTF-8 path to UTF-16.  Also, if the absolute equivalent of the
+// path is longer than CreateDirectory can tolerate, make it absolute and
+// prefixed by '\\?\'.
+static std::error_code widenPath(const Twine &Path8,
+                                 SmallVectorImpl<wchar_t> &Path16) {
+  const size_t MaxDirLen = MAX_PATH - 12; // Must leave room for 8.3 filename.
+
+  // Several operations would convert Path8 to SmallString; more efficient to
+  // do it once up front.
+  SmallString<128> Path8Str;
+  Path8.toVector(Path8Str);
+
+  // If we made this path absolute, how much longer would it get?
+  size_t CurPathLen;
+  if (llvm::sys::path::is_absolute(Twine(Path8Str)))
+    CurPathLen = 0; // No contribution from current_path needed.
+  else {
+    CurPathLen = ::GetCurrentDirectoryW(0, NULL);
+    if (CurPathLen == 0)
+      return windows_error(::GetLastError());
+  }
+
+  // Would the absolute path be longer than our limit?
+  if ((Path8Str.size() + CurPathLen) >= MaxDirLen &&
+      !Path8Str.startswith("\\\\?\\")) {
+    SmallString<2*MAX_PATH> FullPath("\\\\?\\");
+    if (CurPathLen) {
+      SmallString<80> CurPath;
+      if (std::error_code EC = llvm::sys::fs::current_path(CurPath))
+        return EC;
+      FullPath.append(CurPath);
+    }
+    // Traverse the requested path, canonicalizing . and .. as we go (because
+    // the \\?\ prefix is documented to treat them as real components).
+    // The iterators don't report separators and append() always attaches
+    // preferred_separator so we don't need to call native() on the result.
+    for (llvm::sys::path::const_iterator I = llvm::sys::path::begin(Path8Str),
+                                         E = llvm::sys::path::end(Path8Str);
+                                         I != E; ++I) {
+      if (I->size() == 1 && *I == ".")
+        continue;
+      if (I->size() == 2 && *I == "..")
+        llvm::sys::path::remove_filename(FullPath);
+      else
+        llvm::sys::path::append(FullPath, *I);
+    }
+    return UTF8ToUTF16(FullPath, Path16);
+  }
+
+  // Just use the caller's original path.
+  return UTF8ToUTF16(Path8Str, Path16);
+}
+
 namespace llvm {
 namespace sys  {
 namespace fs {
@@ -147,11 +183,9 @@
 }
 
 std::error_code create_directory(const Twine &path, bool IgnoreExisting) {
-  SmallString<128> path_storage;
   SmallVector<wchar_t, 128> path_utf16;
 
-  if (std::error_code ec =
-          UTF8ToUTF16(path.toStringRef(path_storage), path_utf16))
+  if (std::error_code ec = widenPath(path, path_utf16))
     return ec;
 
   if (!::CreateDirectoryW(path_utf16.begin(), NULL)) {
@@ -163,25 +197,14 @@
   return std::error_code();
 }
 
-std::error_code normalize_separators(SmallVectorImpl<char> &Path) {
-  (void) Path;
-  return std::error_code();
-}
-
 // We can't use symbolic links for windows.
 std::error_code create_link(const Twine &to, const Twine &from) {
-  // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toStringRef(from_storage);
-  StringRef t = to.toStringRef(to_storage);
-
   // Convert to utf-16.
   SmallVector<wchar_t, 128> wide_from;
   SmallVector<wchar_t, 128> wide_to;
-  if (std::error_code ec = UTF8ToUTF16(f, wide_from))
+  if (std::error_code ec = widenPath(from, wide_from))
     return ec;
-  if (std::error_code ec = UTF8ToUTF16(t, wide_to))
+  if (std::error_code ec = widenPath(to, wide_to))
     return ec;
 
   if (!::CreateHardLinkW(wide_from.begin(), wide_to.begin(), NULL))
@@ -191,7 +214,6 @@
 }
 
 std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
-  SmallString<128> path_storage;
   SmallVector<wchar_t, 128> path_utf16;
 
   file_status ST;
@@ -201,8 +223,7 @@
     return std::error_code();
   }
 
-  if (std::error_code ec =
-          UTF8ToUTF16(path.toStringRef(path_storage), path_utf16))
+  if (std::error_code ec = widenPath(path, path_utf16))
     return ec;
 
   if (ST.type() == file_type::directory_file) {
@@ -222,18 +243,12 @@
 }
 
 std::error_code rename(const Twine &from, const Twine &to) {
-  // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toStringRef(from_storage);
-  StringRef t = to.toStringRef(to_storage);
-
   // Convert to utf-16.
   SmallVector<wchar_t, 128> wide_from;
   SmallVector<wchar_t, 128> wide_to;
-  if (std::error_code ec = UTF8ToUTF16(f, wide_from))
+  if (std::error_code ec = widenPath(from, wide_from))
     return ec;
-  if (std::error_code ec = UTF8ToUTF16(t, wide_to))
+  if (std::error_code ec = widenPath(to, wide_to))
     return ec;
 
   std::error_code ec = std::error_code();
@@ -254,11 +269,9 @@
 }
 
 std::error_code resize_file(const Twine &path, uint64_t size) {
-  SmallString<128> path_storage;
   SmallVector<wchar_t, 128> path_utf16;
 
-  if (std::error_code ec =
-          UTF8ToUTF16(path.toStringRef(path_storage), path_utf16))
+  if (std::error_code ec = widenPath(path, path_utf16))
     return ec;
 
   int fd = ::_wopen(path_utf16.begin(), O_BINARY | _O_RDWR, S_IWRITE);
@@ -273,51 +286,29 @@
   return std::error_code(error, std::generic_category());
 }
 
-std::error_code exists(const Twine &path, bool &result) {
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
+std::error_code access(const Twine &Path, AccessMode Mode) {
+  SmallVector<wchar_t, 128> PathUtf16;
 
-  if (std::error_code ec =
-          UTF8ToUTF16(path.toStringRef(path_storage), path_utf16))
-    return ec;
+  if (std::error_code EC = widenPath(Path, PathUtf16))
+    return EC;
 
-  DWORD attributes = ::GetFileAttributesW(path_utf16.begin());
+  DWORD Attributes = ::GetFileAttributesW(PathUtf16.begin());
 
-  if (attributes == INVALID_FILE_ATTRIBUTES) {
+  if (Attributes == INVALID_FILE_ATTRIBUTES) {
     // See if the file didn't actually exist.
     DWORD LastError = ::GetLastError();
     if (LastError != ERROR_FILE_NOT_FOUND &&
         LastError != ERROR_PATH_NOT_FOUND)
       return windows_error(LastError);
-    result = false;
-  } else
-    result = true;
+    return errc::no_such_file_or_directory;
+  }
+
+  if (Mode == AccessMode::Write && (Attributes & FILE_ATTRIBUTE_READONLY))
+    return errc::permission_denied;
+
   return std::error_code();
 }
 
-bool can_write(const Twine &Path) {
-  // FIXME: take security attributes into account.
-  SmallString<128> PathStorage;
-  SmallVector<wchar_t, 128> PathUtf16;
-
-  if (UTF8ToUTF16(Path.toStringRef(PathStorage), PathUtf16))
-    return false;
-
-  DWORD Attr = ::GetFileAttributesW(PathUtf16.begin());
-  return (Attr != INVALID_FILE_ATTRIBUTES) && !(Attr & FILE_ATTRIBUTE_READONLY);
-}
-
-bool can_execute(const Twine &Path) {
-  SmallString<128> PathStorage;
-  SmallVector<wchar_t, 128> PathUtf16;
-
-  if (UTF8ToUTF16(Path.toStringRef(PathStorage), PathUtf16))
-    return false;
-
-  DWORD Attr = ::GetFileAttributesW(PathUtf16.begin());
-  return Attr != INVALID_FILE_ATTRIBUTES;
-}
-
 bool equivalent(file_status A, file_status B) {
   assert(status_known(A) && status_known(B));
   return A.FileIndexHigh      == B.FileIndexHigh &&
@@ -424,7 +415,7 @@
     return std::error_code();
   }
 
-  if (std::error_code ec = UTF8ToUTF16(path8, path_utf16))
+  if (std::error_code ec = widenPath(path8, path_utf16))
     return ec;
 
   DWORD attr = ::GetFileAttributesW(path_utf16.begin());
@@ -567,11 +558,10 @@
   , FileDescriptor()
   , FileHandle(INVALID_HANDLE_VALUE)
   , FileMappingHandle() {
-  SmallString<128> path_storage;
   SmallVector<wchar_t, 128> path_utf16;
 
   // Convert path to UTF-16.
-  if ((ec = UTF8ToUTF16(path.toStringRef(path_storage), path_utf16)))
+  if ((ec = widenPath(path, path_utf16)))
     return;
 
   // Get file handle for creating a file mapping.
@@ -677,7 +667,7 @@
                                                 StringRef path){
   SmallVector<wchar_t, 128> path_utf16;
 
-  if (std::error_code ec = UTF8ToUTF16(path, path_utf16))
+  if (std::error_code ec = widenPath(path, path_utf16))
     return ec;
 
   // Convert path to the format that Windows is happy with.
@@ -760,11 +750,9 @@
 }
 
 std::error_code openFileForRead(const Twine &Name, int &ResultFD) {
-  SmallString<128> PathStorage;
   SmallVector<wchar_t, 128> PathUTF16;
 
-  if (std::error_code EC =
-          UTF8ToUTF16(Name.toStringRef(PathStorage), PathUTF16))
+  if (std::error_code EC = widenPath(Name, PathUTF16))
     return EC;
 
   HANDLE H = ::CreateFileW(PathUTF16.begin(), GENERIC_READ,
@@ -799,11 +787,9 @@
   assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) &&
          "Cannot specify both 'excl' and 'append' file creation flags!");
 
-  SmallString<128> PathStorage;
   SmallVector<wchar_t, 128> PathUTF16;
 
-  if (std::error_code EC =
-          UTF8ToUTF16(Name.toStringRef(PathStorage), PathUTF16))
+  if (std::error_code EC = widenPath(Name, PathUTF16))
     return EC;
 
   DWORD CreationDisposition;
@@ -867,6 +853,51 @@
   return true;
 }
 
+static bool getTempDirEnvVar(const char *Var, SmallVectorImpl<char> &Res) {
+  SmallVector<wchar_t, 128> NameUTF16;
+  if (windows::UTF8ToUTF16(Var, NameUTF16))
+    return false;
+
+  SmallVector<wchar_t, 1024> Buf;
+  size_t Size = 1024;
+  do {
+    Buf.reserve(Size);
+    Size =
+        GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.capacity());
+    if (Size == 0)
+      return false;
+
+    // Try again with larger buffer.
+  } while (Size > Buf.capacity());
+  Buf.set_size(Size);
+
+  if (windows::UTF16ToUTF8(Buf.data(), Size, Res))
+    return false;
+  return true;
+}
+
+static bool getTempDirEnvVar(SmallVectorImpl<char> &Res) {
+  const char *EnvironmentVariables[] = {"TMP", "TEMP", "USERPROFILE"};
+  for (const char *Env : EnvironmentVariables) {
+    if (getTempDirEnvVar(Env, Res))
+      return true;
+  }
+  return false;
+}
+
+void system_temp_directory(bool ErasedOnReboot, SmallVectorImpl<char> &Result) {
+  (void)ErasedOnReboot;
+  Result.clear();
+
+  // Check whether the temporary directory is specified by an environment
+  // variable.
+  if (getTempDirEnvVar(Result))
+    return;
+
+  // Fall back to a system default.
+  const char *DefaultResult = "C:\\TEMP";
+  Result.append(DefaultResult, DefaultResult + strlen(DefaultResult));
+}
 } // end namespace path
 
 namespace windows {
@@ -896,11 +927,13 @@
   return std::error_code();
 }
 
-std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
-                            llvm::SmallVectorImpl<char> &utf8) {
+static
+std::error_code UTF16ToCodePage(unsigned codepage, const wchar_t *utf16,
+                                size_t utf16_len,
+                                llvm::SmallVectorImpl<char> &utf8) {
   if (utf16_len) {
     // Get length.
-    int len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.begin(),
+    int len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.begin(),
                                     0, NULL, NULL);
 
     if (len == 0)
@@ -910,7 +943,7 @@
     utf8.set_size(len);
 
     // Now do the actual conversion.
-    len = ::WideCharToMultiByte(CP_UTF8, 0, utf16, utf16_len, utf8.data(),
+    len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.data(),
                                 utf8.size(), NULL, NULL);
 
     if (len == 0)
@@ -923,6 +956,16 @@
 
   return std::error_code();
 }
+
+std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+                            llvm::SmallVectorImpl<char> &utf8) {
+  return UTF16ToCodePage(CP_UTF8, utf16, utf16_len, utf8);
+}
+
+std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
+                             llvm::SmallVectorImpl<char> &utf8) {
+  return UTF16ToCodePage(CP_ACP, utf16, utf16_len, utf8);
+}
 } // end namespace windows
 } // end namespace sys
 } // end namespace llvm

diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index 81aee0e..3819e63 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc

@@ -183,36 +183,103 @@
   return mapWindowsError(E);
 }
 
+static void AllocateAndPush(const SmallVectorImpl<char> &S, // Bytes to copy; S.size() of them are used.
+                            SmallVectorImpl<const char *> &Vector, // Receives a pointer to the terminated copy.
+                            SpecificBumpPtrAllocator<char> &Allocator) { // Provides the storage for the copy.
+  char *Buffer = Allocator.Allocate(S.size() + 1); // One extra byte for the terminator.
+  ::memcpy(Buffer, S.data(), S.size());
+  Buffer[S.size()] = '\0'; // Only S.size() bytes were copied, so terminate explicitly.
+  Vector.push_back(Buffer);
+}
+
+/// Convert Arg from UTF-16 to UTF-8 and push it onto Args.
+static std::error_code
+ConvertAndPushArg(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
+                  SpecificBumpPtrAllocator<char> &Allocator) {
+  SmallVector<char, MAX_PATH> ArgString;
+  if (std::error_code ec = windows::UTF16ToUTF8(Arg, wcslen(Arg), ArgString))
+    return ec;
+  AllocateAndPush(ArgString, Args, Allocator);
+  return std::error_code();
+}
+
+/// \brief Perform wildcard expansion of Arg, or just push it into Args if it
+/// doesn't have wildcards or doesn't match any files.
+static std::error_code
+WildcardExpand(const wchar_t *Arg, SmallVectorImpl<const char *> &Args,
+               SpecificBumpPtrAllocator<char> &Allocator) {
+  if (!wcspbrk(Arg, L"*?")) {
+    // Arg does not contain any wildcard characters. This is the common case.
+    return ConvertAndPushArg(Arg, Args, Allocator);
+  }
+
+  if (wcscmp(Arg, L"/?") == 0 || wcscmp(Arg, L"-?") == 0) {
+    // Don't wildcard expand /? or -?. Always treat them as options.
+    return ConvertAndPushArg(Arg, Args, Allocator);
+  }
+
+  // Extract any directory part of the argument (as UTF-8); it is prepended to every match below.
+  SmallVector<char, MAX_PATH> Dir;
+  if (std::error_code ec = windows::UTF16ToUTF8(Arg, wcslen(Arg), Dir))
+    return ec;
+  sys::path::remove_filename(Dir);
+  const int DirSize = Dir.size(); // Length of the directory prefix, restored after each appended match.
+
+  // Search for matching files. Each result's cFileName is appended to Dir before it is pushed.
+  WIN32_FIND_DATAW FileData;
+  HANDLE FindHandle = FindFirstFileW(Arg, &FileData);
+  if (FindHandle == INVALID_HANDLE_VALUE) { // No search results available: pass Arg through unexpanded.
+    return ConvertAndPushArg(Arg, Args, Allocator);
+  }
+
+  std::error_code ec;
+  do {
+    SmallVector<char, MAX_PATH> FileName;
+    ec = windows::UTF16ToUTF8(FileData.cFileName, wcslen(FileData.cFileName),
+                              FileName);
+    if (ec)
+      break; // Entries pushed before the failure remain in Args.
+
+    // Push the filename onto Dir, and remove it afterwards.
+    llvm::sys::path::append(Dir, StringRef(FileName.data(), FileName.size()));
+    AllocateAndPush(Dir, Args, Allocator);
+    Dir.resize(DirSize);
+  } while (FindNextFileW(FindHandle, &FileData));
+
+  FindClose(FindHandle); // Reached on both the normal and the error exit of the loop.
+  return ec; // Default-constructed on success, otherwise the conversion error that ended the loop.
+}
+
 std::error_code
 Process::GetArgumentVector(SmallVectorImpl<const char *> &Args,
                            ArrayRef<const char *>,
                            SpecificBumpPtrAllocator<char> &ArgAllocator) {
-  int NewArgCount;
-  std::error_code ec;
-
-  wchar_t **UnicodeCommandLine = CommandLineToArgvW(GetCommandLineW(),
-                                                    &NewArgCount);
+  int ArgCount;
+  wchar_t **UnicodeCommandLine =
+      CommandLineToArgvW(GetCommandLineW(), &ArgCount);
   if (!UnicodeCommandLine)
     return windows_error(::GetLastError());
 
-  Args.reserve(NewArgCount);
+  Args.reserve(ArgCount);
+  std::error_code ec;
 
-  for (int i = 0; i < NewArgCount; ++i) {
-    SmallVector<char, MAX_PATH> NewArgString;
-    ec = windows::UTF16ToUTF8(UnicodeCommandLine[i],
-                              wcslen(UnicodeCommandLine[i]),
-                              NewArgString);
+  for (int i = 0; i < ArgCount; ++i) {
+    ec = WildcardExpand(UnicodeCommandLine[i], Args, ArgAllocator);
     if (ec)
       break;
-
-    char *Buffer = ArgAllocator.Allocate(NewArgString.size() + 1);
-    ::memcpy(Buffer, NewArgString.data(), NewArgString.size() + 1);
-    Args.push_back(Buffer);
   }
-  LocalFree(UnicodeCommandLine);
-  if (ec)
-    return ec;
 
+  LocalFree(UnicodeCommandLine);
+  return ec;
+}
+
+std::error_code Process::FixupStandardFileDescriptors() {
+  return std::error_code();
+}
+
+std::error_code Process::SafelyCloseFileDescriptor(int FD) {
+  if (::close(FD) < 0)
+    return std::error_code(errno, std::generic_category());
   return std::error_code();
 }
 

diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index b2f71ae..72c2a58 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc

@@ -12,7 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "WindowsSupport.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/WindowsError.h"
 #include <cstdio>
 #include <fcntl.h>
 #include <io.h>
@@ -28,43 +32,67 @@
 
 ProcessInfo::ProcessInfo() : ProcessHandle(0), Pid(0), ReturnCode(0) {}
 
-// This function just uses the PATH environment variable to find the program.
-std::string sys::FindProgramByName(const std::string &progName) {
-  // Check some degenerate cases
-  if (progName.length() == 0) // no program
-    return "";
-  std::string temp = progName;
-  // Return paths with slashes verbatim.
-  if (progName.find('\\') != std::string::npos ||
-      progName.find('/') != std::string::npos)
-    return temp;
+ErrorOr<std::string> sys::findProgramByName(StringRef Name,
+                                            ArrayRef<StringRef> Paths) { // Empty Paths: let SearchPathW use its default search.
+  assert(!Name.empty() && "Must have a name!");
 
-  // At this point, the file name is valid and does not contain slashes.
-  // Let Windows search for it.
-  SmallVector<wchar_t, MAX_PATH> progNameUnicode;
-  if (windows::UTF8ToUTF16(progName, progNameUnicode))
-    return "";
+  if (Name.find_first_of("/\\") != StringRef::npos) // Names with a path separator are returned verbatim, unsearched.
+    return std::string(Name);
 
-  SmallVector<wchar_t, MAX_PATH> buffer;
-  DWORD len = MAX_PATH;
-  do {
-    buffer.reserve(len);
-    len = ::SearchPathW(NULL, progNameUnicode.data(), L".exe",
-                        buffer.capacity(), buffer.data(), NULL);
+  const wchar_t *Path = nullptr; // Stays null unless the caller supplied Paths.
+  std::wstring PathStorage; // Owns the ';'-joined UTF-16 form of Paths that Path points into.
+  if (!Paths.empty()) {
+    PathStorage.reserve(Paths.size() * MAX_PATH);
+    for (unsigned i = 0; i < Paths.size(); ++i) {
+      if (i)
+        PathStorage.push_back(L';');
+      StringRef P = Paths[i];
+      SmallVector<wchar_t, MAX_PATH> TmpPath;
+      if (std::error_code EC = windows::UTF8ToUTF16(P, TmpPath))
+        return EC;
+      PathStorage.append(TmpPath.begin(), TmpPath.end());
+    }
+    Path = PathStorage.c_str();
+  }
 
-    // See if it wasn't found.
-    if (len == 0)
-      return "";
+  SmallVector<wchar_t, MAX_PATH> U16Name;
+  if (std::error_code EC = windows::UTF8ToUTF16(Name, U16Name))
+    return EC;
 
-    // Buffer was too small; grow and retry.
-  } while (len > buffer.capacity());
+  SmallVector<StringRef, 12> PathExts; // Extensions to try, in order; PATHEXT may be unset (getenv returns null).
+  PathExts.push_back(""); // First try the name exactly as given.
+  PathExts.push_back(".exe"); // FIXME: This must be in %PATHEXT%.
+  if (const char *Env = std::getenv("PATHEXT")) SplitString(Env, PathExts, ";");
 
-  buffer.set_size(len);
-  SmallVector<char, MAX_PATH> result;
-  if (windows::UTF16ToUTF8(buffer.begin(), buffer.size(), result))
-    return "";
+  SmallVector<wchar_t, MAX_PATH> U16Result;
+  DWORD Len = MAX_PATH;
+  for (StringRef Ext : PathExts) {
+    SmallVector<wchar_t, MAX_PATH> U16Ext;
+    if (std::error_code EC = windows::UTF8ToUTF16(Ext, U16Ext))
+      return EC;
 
-  return std::string(result.data(), result.size());
+    do {
+      U16Result.reserve(Len);
+      Len = ::SearchPathW(Path, c_str(U16Name),
+                          U16Ext.empty() ? nullptr : c_str(U16Ext),
+                          U16Result.capacity(), U16Result.data(), nullptr);
+    } while (Len > U16Result.capacity()); // Buffer too small: retry with the length SearchPathW asked for.
+
+    if (Len != 0)
+      break; // Found it.
+  }
+
+  if (Len == 0) // No extension produced a match; report the last Windows error.
+    return mapWindowsError(::GetLastError());
+
+  U16Result.set_size(Len);
+
+  SmallVector<char, MAX_PATH> U8Result;
+  if (std::error_code EC =
+          windows::UTF16ToUTF8(U16Result.data(), U16Result.size(), U8Result))
+    return EC;
+
+  return std::string(U8Result.begin(), U8Result.end());
 }
 
 static HANDLE RedirectIO(const StringRef *path, int fd, std::string* ErrMsg) {
@@ -166,19 +194,7 @@
 
 }
 
-static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
-                    const char **envp, const StringRef **redirects,
-                    unsigned memoryLimit, std::string *ErrMsg) {
-  if (!sys::fs::can_execute(Program)) {
-    if (ErrMsg)
-      *ErrMsg = "program not executable";
-    return false;
-  }
-
-  // Windows wants a command line, not an array of args, to pass to the new
-  // process.  We have to concatenate them all, while quoting the args that
-  // have embedded spaces (or are empty).
-
+static std::unique_ptr<char[]> flattenArgs(const char **args) {
   // First, determine the length of the command line.
   unsigned len = 0;
   for (unsigned i = 0; args[i]; i++) {
@@ -216,6 +232,22 @@
   }
 
   *p = 0;
+  return command;
+}
+
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
+                    const char **envp, const StringRef **redirects,
+                    unsigned memoryLimit, std::string *ErrMsg) {
+  if (!sys::fs::can_execute(Program)) {
+    if (ErrMsg)
+      *ErrMsg = "program not executable";
+    return false;
+  }
+
+  // Windows wants a command line, not an array of args, to pass to the new
+  // process.  We have to concatenate them all, while quoting the args that
+  // have embedded spaces (or are empty).
+  std::unique_ptr<char[]> command = flattenArgs(args);
 
   // The pointer to the environment block for the new process.
   std::vector<wchar_t> EnvBlock;
@@ -422,20 +454,64 @@
   return WaitResult;
 }
 
-  std::error_code sys::ChangeStdinToBinary(){
-  int result = _setmode( _fileno(stdin), _O_BINARY );
+std::error_code sys::ChangeStdinToBinary() {
+  int result = _setmode(_fileno(stdin), _O_BINARY);
   if (result == -1)
     return std::error_code(errno, std::generic_category());
   return std::error_code();
 }
 
-  std::error_code sys::ChangeStdoutToBinary(){
-  int result = _setmode( _fileno(stdout), _O_BINARY );
+std::error_code sys::ChangeStdoutToBinary() {
+  int result = _setmode(_fileno(stdout), _O_BINARY);
   if (result == -1)
     return std::error_code(errno, std::generic_category());
   return std::error_code();
 }
 
+std::error_code
+llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents, // Contents is converted from UTF-8 below.
+                                 WindowsEncodingMethod Encoding) { // Selects the byte encoding written to FileName.
+  std::error_code EC;
+  llvm::raw_fd_ostream OS(FileName, EC, llvm::sys::fs::OpenFlags::F_Text);
+  if (EC)
+    return EC;
+
+  if (Encoding == WEM_UTF8) {
+    OS << Contents; // Written through unchanged.
+  } else if (Encoding == WEM_CurrentCodePage) {
+    SmallVector<wchar_t, 1> ArgsUTF16;
+    SmallVector<char, 1> ArgsCurCP;
+
+    if ((EC = windows::UTF8ToUTF16(Contents, ArgsUTF16))) // Step 1: UTF-8 -> UTF-16.
+      return EC;
+
+    if ((EC = windows::UTF16ToCurCP( // Step 2: UTF-16 -> current code page.
+             ArgsUTF16.data(), ArgsUTF16.size(), ArgsCurCP)))
+      return EC;
+
+    OS.write(ArgsCurCP.data(), ArgsCurCP.size());
+  } else if (Encoding == WEM_UTF16) {
+    SmallVector<wchar_t, 1> ArgsUTF16;
+
+    if ((EC = windows::UTF8ToUTF16(Contents, ArgsUTF16)))
+      return EC;
+
+    // Emit the native-order byte order mark first so readers can detect the endianness.
+    char BOM[2];
+    uint16_t src = UNI_UTF16_BYTE_ORDER_MARK_NATIVE;
+    memcpy(BOM, &src, 2);
+    OS.write(BOM, 2);
+    OS.write((char *)ArgsUTF16.data(), ArgsUTF16.size() << 1); // size() << 1: byte count, assuming 2-byte wchar_t (Windows).
+  } else {
+    llvm_unreachable("Unknown encoding");
+  }
+
+  if (OS.has_error()) // NOTE(review): checked without an explicit flush; confirm buffered writes cannot fail later.
+    return std::make_error_code(std::errc::io_error);
+
+  return EC; // No error was recorded if control reaches here.
+}
+
 bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
   // The documented max length of the command line passed to CreateProcess.
   static const size_t MaxCommandStringLength = 32768;

diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc
index 00d0e93..2d1d25f 100644
--- a/lib/Support/Windows/RWMutex.inc
+++ b/lib/Support/Windows/RWMutex.inc

@@ -84,12 +84,10 @@
 }
 
 RWMutexImpl::~RWMutexImpl() {
-  if (sHasSRW) {
-    // Nothing to do in the case of slim reader/writers
-  } else {
+  if (!sHasSRW)
     DeleteCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
-    free(data_);
-  }
+  // Nothing to do in the case of slim reader/writers except free the memory.
+  free(data_);
 }
 
 bool RWMutexImpl::reader_acquire() {

diff --git a/lib/Support/Windows/ThreadLocal.inc b/lib/Support/Windows/ThreadLocal.inc
index 3914cf7..14ce619 100644
--- a/lib/Support/Windows/ThreadLocal.inc
+++ b/lib/Support/Windows/ThreadLocal.inc

@@ -23,7 +23,7 @@
 using namespace sys;
 
 ThreadLocalImpl::ThreadLocalImpl() : data() {
-  typedef int SIZE_TOO_BIG[sizeof(DWORD) <= sizeof(data) ? 1 : -1];
+  static_assert(sizeof(DWORD) <= sizeof(data), "size too big");
   DWORD* tls = reinterpret_cast<DWORD*>(&data);
   *tls = TlsAlloc();
   assert(*tls != TLS_OUT_OF_INDEXES);

diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index f68835b..6d9c5fb 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h

@@ -166,6 +166,9 @@
 std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
 std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
                             SmallVectorImpl<char> &utf8);
+/// Convert from UTF16 to the current code page used in the system
+std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
+                             SmallVectorImpl<char> &utf8);
 } // end namespace windows
 } // end namespace sys
 } // end namespace llvm.

diff --git a/lib/Support/Windows/explicit_symbols.inc b/lib/Support/Windows/explicit_symbols.inc
index 379645d..cd56b13 100644
--- a/lib/Support/Windows/explicit_symbols.inc
+++ b/lib/Support/Windows/explicit_symbols.inc

@@ -63,4 +63,34 @@
 /* msvcrt */
 #if defined(_MSC_VER)
   EXPLICIT_SYMBOL2(alloca, _alloca_probe)
+
+#ifdef _M_IX86
+#define INLINE_DEF_FLOAT_SYMBOL(SYM, ARGC) INLINE_DEF_SYMBOL##ARGC(float, SYM)
+  INLINE_DEF_FLOAT_SYMBOL(acosf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(asinf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(atanf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(atan2f, 2)
+  INLINE_DEF_FLOAT_SYMBOL(ceilf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(cosf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(coshf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(expf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(floorf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(fmodf, 2)
+  INLINE_DEF_FLOAT_SYMBOL(logf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(powf, 2)
+  INLINE_DEF_FLOAT_SYMBOL(sinf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(sinhf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(sqrtf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(tanf, 1)
+  INLINE_DEF_FLOAT_SYMBOL(tanhf, 1)
+
+  // These were added in VS 2013.
+#if (1800 <= _MSC_VER && _MSC_VER < 1900)
+  INLINE_DEF_FLOAT_SYMBOL(copysignf, 2)
+  INLINE_DEF_FLOAT_SYMBOL(fminf, 2)
+  INLINE_DEF_FLOAT_SYMBOL(fmaxf, 2)
+#endif
+#undef INLINE_DEF_FLOAT_SYMBOL
+#endif
+
 #endif

diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp
index 3be02ee..4688ff1 100644
--- a/lib/Support/YAMLParser.cpp
+++ b/lib/Support/YAMLParser.cpp

@@ -259,8 +259,8 @@
 /// @brief Scans YAML tokens from a MemoryBuffer.
 class Scanner {
 public:
-  Scanner(const StringRef Input, SourceMgr &SM);
-  Scanner(MemoryBuffer *Buffer, SourceMgr &SM_);
+  Scanner(StringRef Input, SourceMgr &SM);
+  Scanner(MemoryBufferRef Buffer, SourceMgr &SM_);
 
   /// @brief Parse the next token and return it without popping it.
   Token &peekNext();
@@ -294,6 +294,8 @@
   }
 
 private:
+  void init(MemoryBufferRef Buffer);
+
   StringRef currentInput() {
     return StringRef(Current, End - Current);
   }
@@ -469,7 +471,7 @@
   SourceMgr &SM;
 
   /// @brief The original input.
-  MemoryBuffer *InputBuffer;
+  MemoryBufferRef InputBuffer;
 
   /// @brief The current position of the scanner.
   StringRef::iterator Current;
@@ -699,34 +701,28 @@
   return EscapedInput;
 }
 
-Scanner::Scanner(StringRef Input, SourceMgr &sm)
-  : SM(sm)
-  , Indent(-1)
-  , Column(0)
-  , Line(0)
-  , FlowLevel(0)
-  , IsStartOfStream(true)
-  , IsSimpleKeyAllowed(true)
-  , Failed(false) {
-  InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML");
-  SM.AddNewSourceBuffer(InputBuffer, SMLoc());
-  Current = InputBuffer->getBufferStart();
-  End = InputBuffer->getBufferEnd();
+Scanner::Scanner(StringRef Input, SourceMgr &sm) : SM(sm) {
+  init(MemoryBufferRef(Input, "YAML"));
 }
 
-Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_)
-  : SM(SM_)
-  , InputBuffer(Buffer)
-  , Current(InputBuffer->getBufferStart())
-  , End(InputBuffer->getBufferEnd())
-  , Indent(-1)
-  , Column(0)
-  , Line(0)
-  , FlowLevel(0)
-  , IsStartOfStream(true)
-  , IsSimpleKeyAllowed(true)
-  , Failed(false) {
-    SM.AddNewSourceBuffer(InputBuffer, SMLoc());
+Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_) : SM(SM_) {
+  init(Buffer);
+}
+
+void Scanner::init(MemoryBufferRef Buffer) {
+  InputBuffer = Buffer;
+  Current = InputBuffer.getBufferStart();
+  End = InputBuffer.getBufferEnd();
+  Indent = -1;
+  Column = 0;
+  Line = 0;
+  FlowLevel = 0;
+  IsStartOfStream = true;
+  IsSimpleKeyAllowed = true;
+  Failed = false;
+  std::unique_ptr<MemoryBuffer> InputBufferOwner =
+      MemoryBuffer::getMemBuffer(Buffer);
+  SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
 }
 
 Token &Scanner::peekNext() {
@@ -1524,7 +1520,7 @@
 Stream::Stream(StringRef Input, SourceMgr &SM)
     : scanner(new Scanner(Input, SM)), CurrentDoc() {}
 
-Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM)
+Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM)
     : scanner(new Scanner(InputBuffer, SM)), CurrentDoc() {}
 
 Stream::~Stream() {}

diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index 5212624..81edca2 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp

@@ -63,6 +63,8 @@
 void Input::HNode::anchor() {}
 void Input::EmptyHNode::anchor() {}
 void Input::ScalarHNode::anchor() {}
+void Input::MapHNode::anchor() {}
+void Input::SequenceHNode::anchor() {}
 
 bool Input::outputting() {
   return false;
@@ -82,7 +84,7 @@
       ++DocIterator;
       return setCurrentDocument();
     }
-    TopNode.reset(this->createHNodes(N));
+    TopNode = this->createHNodes(N);
     CurrentNode = TopNode.get();
     return true;
   }
@@ -133,7 +135,7 @@
     return false;
   }
   MN->ValidKeys.push_back(Key);
-  HNode *Value = MN->Mapping[Key];
+  HNode *Value = MN->Mapping[Key].get();
   if (!Value) {
     if (Required)
       setError(CurrentNode, Twine("missing required key '") + Key + "'");
@@ -159,7 +161,7 @@
     return;
   for (const auto &NN : MN->Mapping) {
     if (!MN->isValidKey(NN.first())) {
-      setError(NN.second, Twine("unknown key '") + NN.first() + "'");
+      setError(NN.second.get(), Twine("unknown key '") + NN.first() + "'");
       break;
     }
   }
@@ -180,7 +182,7 @@
     return false;
   if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
     SaveInfo = CurrentNode;
-    CurrentNode = SQ->Entries[Index];
+    CurrentNode = SQ->Entries[Index].get();
     return true;
   }
   return false;
@@ -202,7 +204,7 @@
     return false;
   if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
     SaveInfo = CurrentNode;
-    CurrentNode = SQ->Entries[index];
+    CurrentNode = SQ->Entries[index].get();
     return true;
   }
   return false;
@@ -253,8 +255,8 @@
     return false;
   if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
     unsigned Index = 0;
-    for (HNode *N : SQ->Entries) {
-      if (ScalarHNode *SN = dyn_cast<ScalarHNode>(N)) {
+    for (auto &N : SQ->Entries) {
+      if (ScalarHNode *SN = dyn_cast<ScalarHNode>(N.get())) {
         if (SN->value().equals(Str)) {
           BitValuesUsed[Index] = true;
           return true;
@@ -277,7 +279,7 @@
     assert(BitValuesUsed.size() == SQ->Entries.size());
     for (unsigned i = 0; i < SQ->Entries.size(); ++i) {
       if (!BitValuesUsed[i]) {
-        setError(SQ->Entries[i], "unknown bit value");
+        setError(SQ->Entries[i].get(), "unknown bit value");
         return;
       }
     }
@@ -302,7 +304,7 @@
   EC = make_error_code(errc::invalid_argument);
 }
 
-Input::HNode *Input::createHNodes(Node *N) {
+std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
   SmallString<128> StringStorage;
   if (ScalarNode *SN = dyn_cast<ScalarNode>(N)) {
     StringRef KeyStr = SN->getValue(StringStorage);
@@ -313,20 +315,25 @@
       memcpy(Buf, &StringStorage[0], Len);
       KeyStr = StringRef(Buf, Len);
     }
-    return new ScalarHNode(N, KeyStr);
+    return llvm::make_unique<ScalarHNode>(N, KeyStr);
   } else if (SequenceNode *SQ = dyn_cast<SequenceNode>(N)) {
-    SequenceHNode *SQHNode = new SequenceHNode(N);
+    auto SQHNode = llvm::make_unique<SequenceHNode>(N);
     for (Node &SN : *SQ) {
-      HNode *Entry = this->createHNodes(&SN);
+      auto Entry = this->createHNodes(&SN);
       if (EC)
         break;
-      SQHNode->Entries.push_back(Entry);
+      SQHNode->Entries.push_back(std::move(Entry));
     }
-    return SQHNode;
+    return std::move(SQHNode);
   } else if (MappingNode *Map = dyn_cast<MappingNode>(N)) {
-    MapHNode *mapHNode = new MapHNode(N);
+    auto mapHNode = llvm::make_unique<MapHNode>(N);
     for (KeyValueNode &KVN : *Map) {
-      ScalarNode *KeyScalar = dyn_cast<ScalarNode>(KVN.getKey());
+      Node *KeyNode = KVN.getKey();
+      ScalarNode *KeyScalar = dyn_cast<ScalarNode>(KeyNode);
+      if (!KeyScalar) {
+        setError(KeyNode, "Map key must be a scalar");
+        break;
+      }
       StringStorage.clear();
       StringRef KeyStr = KeyScalar->getValue(StringStorage);
       if (!StringStorage.empty()) {
@@ -336,14 +343,14 @@
         memcpy(Buf, &StringStorage[0], Len);
         KeyStr = StringRef(Buf, Len);
       }
-      HNode *ValueHNode = this->createHNodes(KVN.getValue());
+      auto ValueHNode = this->createHNodes(KVN.getValue());
       if (EC)
         break;
-      mapHNode->Mapping[KeyStr] = ValueHNode;
+      mapHNode->Mapping[KeyStr] = std::move(ValueHNode);
     }
-    return mapHNode;
+    return std::move(mapHNode);
   } else if (isa<NullNode>(N)) {
-    return new EmptyHNode(N);
+    return llvm::make_unique<EmptyHNode>(N);
   } else {
     setError(N, "unknown node kind");
     return nullptr;
@@ -366,18 +373,6 @@
   return false;
 }
 
-Input::MapHNode::~MapHNode() {
-  for (auto &N : Mapping)
-    delete N.second;
-}
-
-Input::SequenceHNode::~SequenceHNode() {
-  for (HNode *N : Entries)
-    delete N;
-}
-
-
-
 //===----------------------------------------------------------------------===//
 //  Output
 //===----------------------------------------------------------------------===//

diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index f7c213a..bbbbe4a 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp

@@ -20,6 +20,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include <cctype>
@@ -394,6 +395,62 @@
   }
 }
 
+raw_ostream &raw_ostream::operator<<(const FormattedString &FS) { // Writes FS.Str padded with spaces toward FS.Width.
+  unsigned Len = FS.Str.size(); // Length of the text itself, before any padding.
+  int PadAmount = FS.Width - Len; // Padding is only emitted when this is positive (see the checks below).
+  if (FS.RightJustify && (PadAmount > 0))
+    this->indent(PadAmount); // Right-justified: spaces go before the text.
+  this->operator<<(FS.Str);
+  if (!FS.RightJustify && (PadAmount > 0))
+    this->indent(PadAmount); // Left-justified: spaces go after the text.
+  return *this;
+}
+
+raw_ostream &raw_ostream::operator<<(const FormattedNumber &FN) { // Hex: "0x" + zero-padded digits; decimal: space-padded.
+  if (FN.Hex) {
+    unsigned Nibbles = (64 - countLeadingZeros(FN.HexValue | 1)+3)/4; // "| 1" keeps at least one digit when the value is 0.
+    unsigned Width = (FN.Width > Nibbles+2) ? FN.Width : Nibbles+2; // +2 for the "0x" prefix.
+    assert(Width <= 18 && "hex field wider than NumberBuffer");
+    char NumberBuffer[20] = "0x0000000000000000"; // Pre-filled, so leading zero padding is already in place.
+    char *EndPtr = NumberBuffer+Width;
+    char *CurPtr = EndPtr;
+    const char A = FN.Upper ? 'A' : 'a';
+    unsigned long long N = FN.HexValue;
+    while (N) { // Overwrite the pre-filled zeros right-to-left with the actual digits.
+      uintptr_t x = N % 16;
+      *--CurPtr = (x < 10 ? '0' + x : A + x - 10);
+      N /= 16;
+    }
+
+    return write(NumberBuffer, Width);
+  } else {
+    // Zero is a special case.
+    if (FN.DecValue == 0) {
+      this->indent(FN.Width > 1 ? FN.Width - 1 : 0); // Guard: Width-1 would wrap to a huge count when Width is 0.
+      return *this << '0';
+    }
+    char NumberBuffer[32];
+    char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+    char *CurPtr = EndPtr;
+    bool Neg = (FN.DecValue < 0);
+    uint64_t N = Neg ? -static_cast<uint64_t>(FN.DecValue) : FN.DecValue; // Negate in unsigned arithmetic to get the magnitude.
+    while (N) { // Digits are produced least-significant first, filling the buffer from its end.
+      *--CurPtr = '0' + char(N % 10);
+      N /= 10;
+    }
+    int Len = EndPtr - CurPtr;
+    int Pad = FN.Width - Len;
+    if (Neg) // The '-' sign takes one column of the field.
+      --Pad;
+    if (Pad > 0)
+      this->indent(Pad);
+    if (Neg)
+      *this << '-';
+    return write(CurPtr, Len);
+  }
+}
+
+
 /// indent - Insert 'NumSpaces' spaces.
 raw_ostream &raw_ostream::indent(unsigned NumSpaces) {
   static const char Spaces[] = "                                "
@@ -426,20 +483,14 @@
 //  raw_fd_ostream
 //===----------------------------------------------------------------------===//
 
-/// raw_fd_ostream - Open the specified file for writing. If an error
-/// occurs, information about the error is put into ErrorInfo, and the
-/// stream should be immediately destroyed; the string will be empty
-/// if no error occurred.
-raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo,
+raw_fd_ostream::raw_fd_ostream(StringRef Filename, std::error_code &EC,
                                sys::fs::OpenFlags Flags)
     : Error(false), UseAtomicWrites(false), pos(0) {
-  assert(Filename && "Filename is null");
-  ErrorInfo.clear();
-
+  EC = std::error_code();
   // Handle "-" as stdout. Note that when we do this, we consider ourself
   // the owner of stdout. This means that we can do things like close the
   // file descriptor when we're done and set the "binary" flag globally.
-  if (Filename[0] == '-' && Filename[1] == 0) {
+  if (Filename == "-") {
     FD = STDOUT_FILENO;
     // If user requested binary then put stdout into binary mode if
     // possible.
@@ -450,11 +501,9 @@
     return;
   }
 
-  std::error_code EC = sys::fs::openFileForWrite(Filename, FD, Flags);
+  EC = sys::fs::openFileForWrite(Filename, FD, Flags);
 
   if (EC) {
-    ErrorInfo = "Error opening output file '" + std::string(Filename) + "': " +
-                EC.message();
     ShouldClose = false;
     return;
   }
@@ -487,12 +536,8 @@
 raw_fd_ostream::~raw_fd_ostream() {
   if (FD >= 0) {
     flush();
-    if (ShouldClose)
-      while (::close(FD) != 0)
-        if (errno != EINTR) {
-          error_detected();
-          break;
-        }
+    if (ShouldClose && sys::Process::SafelyCloseFileDescriptor(FD))
+      error_detected();
   }
 
 #ifdef __MINGW32__
@@ -566,11 +611,8 @@
   assert(ShouldClose);
   ShouldClose = false;
   flush();
-  while (::close(FD) != 0)
-    if (errno != EINTR) {
-      error_detected();
-      break;
-    }
+  if (sys::Process::SafelyCloseFileDescriptor(FD))
+    error_detected();
   FD = -1;
 }
 
@@ -660,7 +702,7 @@
 /// Use it like: outs() << "foo" << "bar";
 raw_ostream &llvm::outs() {
   // Set buffer settings to model stdout behavior.
-  // Delete the file descriptor when the program exists, forcing error
+  // Delete the file descriptor when the program exits, forcing error
   // detection. If you don't want this behavior, don't use outs().
   static raw_fd_ostream S(STDOUT_FILENO, true);
   return S;
@@ -729,24 +771,17 @@
 }
 
 void raw_svector_ostream::write_impl(const char *Ptr, size_t Size) {
-  // If we're writing bytes from the end of the buffer into the smallvector, we
-  // don't need to copy the bytes, just commit the bytes because they are
-  // already in the right place.
   if (Ptr == OS.end()) {
-    assert(OS.size() + Size <= OS.capacity() && "Invalid write_impl() call!");
-    OS.set_size(OS.size() + Size);
+    // Grow the buffer to include the scratch area without copying.
+    size_t NewSize = OS.size() + Size;
+    assert(NewSize <= OS.capacity() && "Invalid write_impl() call!");
+    OS.set_size(NewSize);
   } else {
-    assert(GetNumBytesInBuffer() == 0 &&
-           "Should be writing from buffer if some bytes in it");
-    // Otherwise, do copy the bytes.
-    OS.append(Ptr, Ptr+Size);
+    assert(!GetNumBytesInBuffer());
+    OS.append(Ptr, Ptr + Size);
   }
 
-  // Grow the vector if necessary.
-  if (OS.capacity() - OS.size() < 64)
-    OS.reserve(OS.capacity() * 2);
-
-  // Update the buffer position.
+  OS.reserve(OS.size() + 64);
   SetBuffer(OS.end(), OS.capacity() - OS.size());
 }
 

diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp
index e317fbf..2578cc2 100644
--- a/lib/TableGen/Main.cpp
+++ b/lib/TableGen/Main.cpp

@@ -56,11 +56,11 @@
     errs() << argv0 << ": the option -d must be used together with -o\n";
     return 1;
   }
-  std::string Error;
-  tool_output_file DepOut(DependFilename.c_str(), Error, sys::fs::F_Text);
-  if (!Error.empty()) {
-    errs() << argv0 << ": error opening " << DependFilename
-      << ":" << Error << "\n";
+  std::error_code EC;
+  tool_output_file DepOut(DependFilename, EC, sys::fs::F_Text);
+  if (EC) {
+    errs() << argv0 << ": error opening " << DependFilename << ":"
+           << EC.message() << "\n";
     return 1;
   }
   DepOut.os() << OutputFilename << ":";
@@ -88,10 +88,9 @@
            << "': " << EC.message() << "\n";
     return 1;
   }
-  MemoryBuffer *F = FileOrErr.get().release();
 
   // Tell SrcMgr about this buffer, which is what TGParser will pick up.
-  SrcMgr.AddNewSourceBuffer(F, SMLoc());
+  SrcMgr.AddNewSourceBuffer(std::move(*FileOrErr), SMLoc());
 
   // Record the location of the include directory so that the lexer can find
   // it later.
@@ -102,11 +101,11 @@
   if (Parser.ParseFile())
     return 1;
 
-  std::string Error;
-  tool_output_file Out(OutputFilename.c_str(), Error, sys::fs::F_Text);
-  if (!Error.empty()) {
-    errs() << argv0 << ": error opening " << OutputFilename
-      << ":" << Error << "\n";
+  std::error_code EC;
+  tool_output_file Out(OutputFilename, EC, sys::fs::F_Text);
+  if (EC) {
+    errs() << argv0 << ": error opening " << OutputFilename << ":"
+           << EC.message() << "\n";
     return 1;
   }
   if (!DependFilename.empty()) {

diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index f7843dc..34e3ab4 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp

@@ -114,8 +114,21 @@
 
 Init *BitRecTy::convertValue(TypedInit *VI) {
   RecTy *Ty = VI->getType();
-  if (isa<BitRecTy>(Ty) || isa<BitsRecTy>(Ty) || isa<IntRecTy>(Ty))
+  if (isa<BitRecTy>(Ty))
     return VI;  // Accept variable if it is already of bit type!
+  if (auto *BitsTy = dyn_cast<BitsRecTy>(Ty))
+    // Accept only bits<1> expression.
+    return BitsTy->getNumBits() == 1 ? VI : nullptr;
+  // Ternary !if can be converted to bit, but only if both sides are
+  // convertible to a bit.
+  if (TernOpInit *TOI = dyn_cast<TernOpInit>(VI)) {
+    if (TOI->getOpcode() != TernOpInit::TernaryOp::IF)
+      return nullptr;
+    if (!TOI->getMHS()->convertInitializerTo(BitRecTy::get()) ||
+        !TOI->getRHS()->convertInitializerTo(BitRecTy::get()))
+      return nullptr;
+    return TOI;
+  }
   return nullptr;
 }
 
@@ -952,17 +965,21 @@
     break;
   }
   case ADD:
+  case AND:
   case SHL:
   case SRA:
   case SRL: {
-    IntInit *LHSi = dyn_cast<IntInit>(LHS);
-    IntInit *RHSi = dyn_cast<IntInit>(RHS);
+    IntInit *LHSi =
+      dyn_cast_or_null<IntInit>(LHS->convertInitializerTo(IntRecTy::get()));
+    IntInit *RHSi =
+      dyn_cast_or_null<IntInit>(RHS->convertInitializerTo(IntRecTy::get()));
     if (LHSi && RHSi) {
       int64_t LHSv = LHSi->getValue(), RHSv = RHSi->getValue();
       int64_t Result;
       switch (getOpcode()) {
       default: llvm_unreachable("Bad opcode!");
       case ADD: Result = LHSv +  RHSv; break;
+      case AND: Result = LHSv &  RHSv; break;
       case SHL: Result = LHSv << RHSv; break;
       case SRA: Result = LHSv >> RHSv; break;
       case SRL: Result = (uint64_t)LHSv >> (uint64_t)RHSv; break;
@@ -989,6 +1006,7 @@
   switch (Opc) {
   case CONCAT: Result = "!con"; break;
   case ADD: Result = "!add"; break;
+  case AND: Result = "!and"; break;
   case SHL: Result = "!shl"; break;
   case SRA: Result = "!sra"; break;
   case SRL: Result = "!srl"; break;
@@ -1690,13 +1708,6 @@
 }
 
 void Record::setName(Init *NewName) {
-  if (TrackedRecords.getDef(Name->getAsUnquotedString()) == this) {
-    TrackedRecords.removeDef(Name->getAsUnquotedString());
-    TrackedRecords.addDef(this);
-  } else if (TrackedRecords.getClass(Name->getAsUnquotedString()) == this) {
-    TrackedRecords.removeClass(Name->getAsUnquotedString());
-    TrackedRecords.addClass(this);
-  }  // Otherwise this isn't yet registered.
   Name = NewName;
   checkName();
   // DO NOT resolve record values to the name at this point because
@@ -1996,16 +2007,14 @@
 
 raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) {
   OS << "------------- Classes -----------------\n";
-  const std::map<std::string, Record*> &Classes = RK.getClasses();
-  for (std::map<std::string, Record*>::const_iterator I = Classes.begin(),
-         E = Classes.end(); I != E; ++I)
-    OS << "class " << *I->second;
+  const auto &Classes = RK.getClasses();
+  for (const auto &C : Classes)
+    OS << "class " << *C.second;
 
   OS << "------------- Defs -----------------\n";
-  const std::map<std::string, Record*> &Defs = RK.getDefs();
-  for (std::map<std::string, Record*>::const_iterator I = Defs.begin(),
-         E = Defs.end(); I != E; ++I)
-    OS << "def " << *I->second;
+  const auto &Defs = RK.getDefs();
+  for (const auto &D : Defs)
+    OS << "def " << *D.second;
   return OS;
 }
 
@@ -2020,10 +2029,9 @@
     PrintFatalError("ERROR: Couldn't find the `" + ClassName + "' class!\n");
 
   std::vector<Record*> Defs;
-  for (std::map<std::string, Record*>::const_iterator I = getDefs().begin(),
-         E = getDefs().end(); I != E; ++I)
-    if (I->second->isSubClassOf(Class))
-      Defs.push_back(I->second);
+  for (const auto &D : getDefs())
+    if (D.second->isSubClassOf(Class))
+      Defs.push_back(D.second.get());
 
   return Defs;
 }

diff --git a/lib/TableGen/TGLexer.cpp b/lib/TableGen/TGLexer.cpp
index fc1d3ca..63b8584 100644
--- a/lib/TableGen/TGLexer.cpp
+++ b/lib/TableGen/TGLexer.cpp

@@ -411,7 +411,7 @@
       if (CurPtr == NumStart)
         return ReturnError(CurPtr-2, "Invalid binary number");
       CurIntVal = strtoll(NumStart, nullptr, 2);
-      return tgtok::IntVal;
+      return tgtok::BinaryIntVal;
     }
   }
 
@@ -471,6 +471,7 @@
     .Case("tail", tgtok::XTail)
     .Case("con", tgtok::XConcat)
     .Case("add", tgtok::XADD)
+    .Case("and", tgtok::XAND)
     .Case("shl", tgtok::XSHL)
     .Case("sra", tgtok::XSRA)
     .Case("srl", tgtok::XSRL)

diff --git a/lib/TableGen/TGLexer.h b/lib/TableGen/TGLexer.h
index a2c95ca..1f750fc 100644
--- a/lib/TableGen/TGLexer.h
+++ b/lib/TableGen/TGLexer.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TGLEXER_H
-#define TGLEXER_H
+#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
+#define LLVM_LIB_TABLEGEN_TGLEXER_H
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataTypes.h"
@@ -47,11 +47,15 @@
     MultiClass, String,
     
     // !keywords.
-    XConcat, XADD, XSRA, XSRL, XSHL, XListConcat, XStrConcat, XCast, XSubst,
-    XForEach, XHead, XTail, XEmpty, XIf, XEq,
+    XConcat, XADD, XAND, XSRA, XSRL, XSHL, XListConcat, XStrConcat, XCast,
+    XSubst, XForEach, XHead, XTail, XEmpty, XIf, XEq,
 
     // Integer value.
     IntVal,
+
+    // Binary constant.  Note that these are sized according to the number of
+    // bits given.
+    BinaryIntVal,
     
     // String valued tokens.
     Id, StrVal, VarName, CodeFragment
@@ -105,6 +109,16 @@
     assert(CurCode == tgtok::IntVal && "This token isn't an integer");
     return CurIntVal;
   }
+  /// Return the value of the current binary literal together with its width
+  /// in bits.  The width is the token length minus two (presumably the
+  /// two-character "0b" prefix), so leading zeros count towards the size.
+  std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
+    assert(CurCode == tgtok::BinaryIntVal &&
+           "This token isn't a binary integer");
+    // Make the pointer-difference-to-unsigned narrowing explicit.
+    return std::make_pair(CurIntVal,
+                          static_cast<unsigned>((CurPtr - TokStart) - 2));
+  }
 
   SMLoc getLoc() const;
   

diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 0550692..4d4bbe9 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp

@@ -135,11 +135,18 @@
     V = BitsInit::get(NewBits);
   }
 
-  if (RV->setValue(V))
+  if (RV->setValue(V)) {
+    std::string InitType = "";
+    if (BitsInit *BI = dyn_cast<BitsInit>(V)) {
+      InitType = (Twine("' of type bit initializer with length ") +
+                  Twine(BI->getNumBits())).str();
+    }
     return Error(Loc, "Value '" + ValName->getAsUnquotedString() + "' of type '"
                  + RV->getType()->getAsString() +
                  "' is incompatible with initializer '" + V->getAsString()
+                 + InitType
                  + "'");
+  }
   return false;
 }
 
@@ -225,14 +232,14 @@
        i != iend;
        ++i) {
     // Clone the def and add it to the current multiclass
-    Record *NewDef = new Record(**i);
+    auto NewDef = make_unique<Record>(**i);
 
     // Add all of the values in the superclass into the current def.
     for (unsigned i = 0, e = MCVals.size(); i != e; ++i)
-      if (AddValue(NewDef, SubMultiClass.RefRange.Start, MCVals[i]))
+      if (AddValue(NewDef.get(), SubMultiClass.RefRange.Start, MCVals[i]))
         return true;
 
-    CurMC->DefPrototypes.push_back(NewDef);
+    CurMC->DefPrototypes.push_back(NewDef.release());
   }
 
   const std::vector<Init *> &SMCTArgs = SMC->Rec.getTemplateArgs();
@@ -341,6 +348,7 @@
     TypedInit *IVal = dyn_cast<TypedInit>(IterVals[i].IterValue);
     if (!IVal) {
       Error(Loc, "foreach iterator value is untyped");
+      delete IterRec;
       return true;
     }
 
@@ -349,6 +357,7 @@
     if (SetValue(IterRec, Loc, IterVar->getName(),
                  std::vector<unsigned>(), IVal)) {
       Error(Loc, "when instantiating this def");
+      delete IterRec;
       return true;
     }
 
@@ -365,6 +374,7 @@
       IterRec->setName(GetNewAnonymousName());
     else {
       Error(Loc, "def already exists: " + IterRec->getNameInitAsString());
+      delete IterRec;
       return true;
     }
   }
@@ -904,6 +914,7 @@
 
   case tgtok::XConcat:
   case tgtok::XADD:
+  case tgtok::XAND:
   case tgtok::XSRA:
   case tgtok::XSRL:
   case tgtok::XSHL:
@@ -921,6 +932,7 @@
     default: llvm_unreachable("Unhandled code!");
     case tgtok::XConcat: Code = BinOpInit::CONCAT;Type = DagRecTy::get(); break;
     case tgtok::XADD:    Code = BinOpInit::ADD;   Type = IntRecTy::get(); break;
+    case tgtok::XAND:    Code = BinOpInit::AND;   Type = IntRecTy::get(); break;
     case tgtok::XSRA:    Code = BinOpInit::SRA;   Type = IntRecTy::get(); break;
     case tgtok::XSRL:    Code = BinOpInit::SRL;   Type = IntRecTy::get(); break;
     case tgtok::XSHL:    Code = BinOpInit::SHL;   Type = IntRecTy::get(); break;
@@ -1173,6 +1185,15 @@
     Lex.Lex();  // Skip '#'.
     return ParseSimpleValue(CurRec, ItemType, Mode);
   case tgtok::IntVal: R = IntInit::get(Lex.getCurIntVal()); Lex.Lex(); break;
+  case tgtok::BinaryIntVal: {
+    auto BinaryVal = Lex.getCurBinaryIntVal();
+    SmallVector<Init*, 16> Bits(BinaryVal.second);
+    for (unsigned i = 0, e = BinaryVal.second; i != e; ++i)
+      Bits[i] = BitInit::get(BinaryVal.first & (1LL << i));
+    R = BitsInit::get(Bits);
+    Lex.Lex();
+    break;
+  }
   case tgtok::StrVal: {
     std::string Val = Lex.getCurStrVal();
     Lex.Lex();
@@ -1233,12 +1254,17 @@
     SCRef.Rec = Class;
     SCRef.TemplateArgs = ValueList;
     // Add info about the subclass to NewRec.
-    if (AddSubClass(NewRec, SCRef))
+    if (AddSubClass(NewRec, SCRef)) {
+      delete NewRec;
       return nullptr;
+    }
     if (!CurMultiClass) {
       NewRec->resolveReferences();
       Records.addDef(NewRec);
     } else {
+      // Mark this record to be resolved once the multiclass template
+      // arguments are known, and before any use of it.
+      NewRec->setResolveFirst(true);
       // Otherwise, we're inside a multiclass, add it to the multiclass.
       CurMultiClass->DefPrototypes.push_back(NewRec);
 
@@ -1284,17 +1310,40 @@
     }
     Lex.Lex();  // eat the '}'
 
-    SmallVector<Init *, 16> NewBits(Vals.size());
+    SmallVector<Init *, 16> NewBits;
 
+    // As we parse { a, b, ... }, 'a' is the highest bit, but we parse it
+    // first.  We'll first read everything into a vector, then we can reverse
+    // it to get the bits in the correct order for the BitsInit value.
     for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+      // FIXME: The following two loops would not be duplicated
+      //        if the API was a little more orthogonal.
+
+      // bits<n> values are allowed to initialize n bits.
+      if (BitsInit *BI = dyn_cast<BitsInit>(Vals[i])) {
+        for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i)
+          NewBits.push_back(BI->getBit((e - i) - 1));
+        continue;
+      }
+      // bits<n> can also come from variable initializers.
+      if (VarInit *VI = dyn_cast<VarInit>(Vals[i])) {
+        if (BitsRecTy *BitsRec = dyn_cast<BitsRecTy>(VI->getType())) {
+          for (unsigned i = 0, e = BitsRec->getNumBits(); i != e; ++i)
+            NewBits.push_back(VI->getBit((e - i) - 1));
+          continue;
+        }
+        // Fall through and try to convert this to a single bit.
+      }
+      // All other values must be convertible to just a single bit.
       Init *Bit = Vals[i]->convertInitializerTo(BitRecTy::get());
       if (!Bit) {
         Error(BraceLoc, "Element #" + utostr(i) + " (" + Vals[i]->getAsString()+
               ") is not convertable to a bit");
         return nullptr;
       }
-      NewBits[Vals.size()-i-1] = Bit;
+      NewBits.push_back(Bit);
     }
+    std::reverse(NewBits.begin(), NewBits.end());
     return BitsInit::get(NewBits);
   }
   case tgtok::l_square: {          // Value ::= '[' ValueList ']'
@@ -1439,6 +1488,7 @@
   case tgtok::XCast:  // Value ::= !unop '(' Value ')'
   case tgtok::XConcat:
   case tgtok::XADD:
+  case tgtok::XAND:
   case tgtok::XSRA:
   case tgtok::XSRL:
   case tgtok::XSHL:
@@ -1727,7 +1777,10 @@
     Init *Val = ParseValue(CurRec, Type);
     if (!Val ||
         SetValue(CurRec, ValLoc, DeclName, std::vector<unsigned>(), Val))
-      return nullptr;
+      // Return the name, even if an error is thrown.  This is so that we can
+      // continue to make some progress, even without the value having been
+      // initialized.
+      return DeclName;
   }
 
   return DeclName;
@@ -1984,6 +2037,7 @@
 
   // Parse ObjectName and make a record for it.
   Record *CurRec;
+  bool CurRecOwnershipTransferred = false;
   Init *Name = ParseObjectName(CurMultiClass);
   if (Name)
     CurRec = new Record(Name, DefLoc, Records);
@@ -1998,9 +2052,11 @@
     if (Records.getDef(CurRec->getNameInitAsString())) {
       Error(DefLoc, "def '" + CurRec->getNameInitAsString()
             + "' already defined");
+      delete CurRec;
       return true;
     }
     Records.addDef(CurRec);
+    CurRecOwnershipTransferred = true;
 
     if (ParseObjectBody(CurRec))
       return true;
@@ -2010,8 +2066,10 @@
     // before this object, instantiated prior to defs derived from this object,
     // and this available for indirect name resolution when defs derived from
     // this object are instantiated.
-    if (ParseObjectBody(CurRec))
+    if (ParseObjectBody(CurRec)) {
+      delete CurRec;
       return true;
+    }
 
     // Otherwise, a def inside a multiclass, add it to the multiclass.
     for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size(); i != e; ++i)
@@ -2019,11 +2077,15 @@
           == CurRec->getNameInit()) {
         Error(DefLoc, "def '" + CurRec->getNameInitAsString() +
               "' already defined in this multiclass!");
+        delete CurRec;
         return true;
       }
     CurMultiClass->DefPrototypes.push_back(CurRec);
-  } else if (ParseObjectBody(CurRec))
+    CurRecOwnershipTransferred = true;
+  } else if (ParseObjectBody(CurRec)) {
+    delete CurRec;
     return true;
+  }
 
   if (!CurMultiClass)  // Def's in multiclasses aren't really defs.
     // See Record::setName().  This resolve step will see any new name
@@ -2049,9 +2111,13 @@
   if (ProcessForeachDefs(CurRec, DefLoc)) {
     Error(DefLoc,
           "Could not process loops for def" + CurRec->getNameInitAsString());
+    if (!CurRecOwnershipTransferred)
+      delete CurRec;
     return true;
   }
 
+  if (!CurRecOwnershipTransferred)
+    delete CurRec;
   return false;
 }
 
@@ -2196,7 +2262,7 @@
   // Add this entry to the let stack.
   std::vector<LetRecord> LetInfo = ParseLetList();
   if (LetInfo.empty()) return true;
-  LetStack.push_back(LetInfo);
+  LetStack.push_back(std::move(LetInfo));
 
   if (Lex.getCode() != tgtok::In)
     return TokError("expected 'in' at end of top-level 'let'");
@@ -2365,6 +2431,7 @@
     Error(DefmPrefixRange.Start, "Could not resolve "
           + CurRec->getNameInitAsString() + ":NAME to '"
           + DefmPrefix->getAsUnquotedString() + "'");
+    delete CurRec;
     return nullptr;
   }
 
@@ -2396,6 +2463,7 @@
       Error(DefmPrefixRange.Start, "def '" + CurRec->getNameInitAsString() +
             "' already defined, instantiating defm with subdef '" + 
             DefProto->getNameInitAsString() + "'");
+      delete CurRec;
       return nullptr;
     }
 
@@ -2535,6 +2603,12 @@
       if (ResolveMulticlassDef(*MC, CurRec, DefProto, DefmLoc))
         return Error(SubClassLoc, "could not instantiate def");
 
+      // Defs that can be used by other definitions should be fully resolved
+      // before any use.
+      if (DefProto->isResolveFirst() && !CurMultiClass) {
+        CurRec->resolveReferences();
+        CurRec->setResolveFirst(false);
+      }
       NewRecDefs.push_back(CurRec);
     }
 

diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h
index 9f4b7e9..79994cb 100644
--- a/lib/TableGen/TGParser.h
+++ b/lib/TableGen/TGParser.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TGPARSER_H
-#define TGPARSER_H
+#ifndef LLVM_LIB_TABLEGEN_TGPARSER_H
+#define LLVM_LIB_TABLEGEN_TGPARSER_H
 
 #include "TGLexer.h"
 #include "llvm/ADT/Twine.h"

diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 7b52e55..e96d18b 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h

@@ -12,13 +12,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TARGET_AArch64_H
-#define TARGET_AArch64_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64_H
 
-#include "Utils/AArch64BaseInfo.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/Target/TargetMachine.h"
+#include "Utils/AArch64BaseInfo.h"
 #include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
 
@@ -36,6 +36,7 @@
 FunctionPass *createAArch64ExpandPseudoPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
 ModulePass *createAArch64PromoteConstantPass();
+FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
 FunctionPass *createAArch64A57FPLoadBalancing();
 FunctionPass *createAArch64A53Fix835769();

diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 7742fea..2503764 100644
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp

@@ -102,7 +102,9 @@
 /// A "color", which is either even or odd. Yes, these aren't really colors
 /// but the algorithm is conceptually doing two-color graph coloring.
 enum class Color { Even, Odd };
+#ifndef NDEBUG
 static const char *ColorNames[2] = { "Even", "Odd" };
+#endif
 
 class Chain;
 
@@ -350,7 +352,7 @@
   for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
     std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end());
     if (Cs.empty()) continue;
-    V.push_back(Cs);
+    V.push_back(std::move(Cs));
   }
 
   // Now we have a set of sets, order them by start address so
@@ -375,7 +377,7 @@
   int Parity = 0;
 
   for (auto &I : V)
-    Changed |= colorChainSet(I, MBB, Parity);
+    Changed |= colorChainSet(std::move(I), MBB, Parity);
 
   return Changed;
 }

diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index ab2c4b7..287989f 100644
--- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp

@@ -19,7 +19,7 @@
 // a = add nsw i64 f, 3
 // e = getelementptr ..., i64 a
 //
-// This is legal to do so if the computations are markers with either nsw or nuw
+// This is legal to do if the computations are marked with either nsw or nuw
 // markers.
 // Moreover, the current heuristic is simple: it does not create new sext
 // operations, i.e., it gives up when a sext would have forked (e.g., if
@@ -223,7 +223,7 @@
 }
 
 // Input:
-// - SExtInsts contains all the sext instructions that are use direclty in
+// - SExtInsts contains all the sext instructions that are used directly in
 //   GetElementPtrInst, i.e., access to memory.
 // Algorithm:
 // - For each sext operation in SExtInsts:
@@ -353,7 +353,7 @@
 
     // If the use is already of the right type, connect its uses to its argument
     // and delete it.
-    // This can happen for an Instruction which all uses are sign extended.
+    // This can happen for an Instruction all uses of which are sign extended.
     if (!ToRemove.count(SExt) &&
         SExt->getType() == SExt->getOperand(0)->getType()) {
       DEBUG(dbgs() << "Sign extension is useless, attach its use to "

diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 734fb21..5afe0f4 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp

@@ -36,9 +36,10 @@
 #include "AArch64.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -166,6 +167,12 @@
     return AArch64::ADDv1i64;
   case AArch64::SUBXrr:
     return AArch64::SUBv1i64;
+  case AArch64::ANDXrr:
+    return AArch64::ANDv8i8;
+  case AArch64::EORXrr:
+    return AArch64::EORv8i8;
+  case AArch64::ORRXrr:
+    return AArch64::ORRv8i8;
   }
   // No AdvSIMD equivalent, so just return the original opcode.
   return Opc;
@@ -371,7 +378,8 @@
 
   const TargetMachine &TM = mf.getTarget();
   MRI = &mf.getRegInfo();
-  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+  TII = static_cast<const AArch64InstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
 
   // Just check things on a one-block-at-a-time basis.
   for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)

diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index cd94e24..8bee4f5 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp

@@ -13,8 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
-#include "AArch64MachineFunctionInfo.h"
 #include "AArch64MCInstLower.h"
+#include "AArch64MachineFunctionInfo.h"
 #include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
 #include "InstPrinter/AArch64InstPrinter.h"
@@ -23,8 +23,8 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
@@ -54,7 +54,7 @@
   AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
       : AsmPrinter(TM, Streamer),
         Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
-        MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr),
+        MCInstLowering(OutContext, *this), SM(*this), AArch64FI(nullptr),
         LOHLabelCounter(0) {}
 
   const char *getPassName() const override {
@@ -145,7 +145,7 @@
     MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
     if (!Stubs.empty()) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getDataLayout();
+      const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
 
       for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
         OutStreamer.EmitLabel(Stubs[i].first);
@@ -252,8 +252,8 @@
                                            const TargetRegisterClass *RC,
                                            bool isVector, raw_ostream &O) {
   assert(MO.isReg() && "Should only get here with a register!");
-  const AArch64RegisterInfo *RI =
-      static_cast<const AArch64RegisterInfo *>(TM.getRegisterInfo());
+  const AArch64RegisterInfo *RI = static_cast<const AArch64RegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
   unsigned Reg = MO.getReg();
   unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
   assert(RI->regsOverlap(RegToPrint, Reg));
@@ -518,7 +518,5 @@
 extern "C" void LLVMInitializeAArch64AsmPrinter() {
   RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
   RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
-
-  RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64leTarget);
-  RegisterAsmPrinter<AArch64AsmPrinter> W(TheARM64beTarget);
+  RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64Target);
 }

diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index 484e7e8..e2b6367 100644
--- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp

@@ -12,15 +12,16 @@
 #include "AArch64.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-branch-relax"
@@ -136,7 +137,7 @@
   if (NextBB == MBB->getParent()->end())
     return false;
 
-  for (MachineBasicBlock *S : MBB->successors()) 
+  for (MachineBasicBlock *S : MBB->successors())
     if (S == NextBB)
       return true;
 
@@ -475,7 +476,9 @@
 
   DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n");
 
-  TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo();
+  TII = (const AArch64InstrInfo *)MF->getTarget()
+            .getSubtargetImpl()
+            ->getInstrInfo();
 
   // Renumber all of the machine basic blocks in the function, guaranteeing that
   // the numbers agree with the position of the block in the function.

diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 8e8bd3d..9e707e4 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td

@@ -16,7 +16,7 @@
   CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
 /// CCIfBigEndian - Match only if we're in big endian mode.
 class CCIfBigEndian<CCAction A> :
-  CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>;
+  CCIf<"State.getMachineFunction().getSubtarget().getDataLayout()->isBigEndian()", A>;
 
 //===----------------------------------------------------------------------===//
 // ARM AAPCS64 Calling Convention
@@ -54,22 +54,24 @@
 
   CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
                                           [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
-  CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>,
+  CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
   CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
-  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
            CCAssignToStack<16, 16>>
 ]>;
 
@@ -88,14 +90,16 @@
                                           [X0, X1, X2, X3, X4, X5, X6, X7]>>,
   CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
                                           [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
       CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                               [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
       CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
 ]>;
 
@@ -129,23 +133,26 @@
 
   CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
                                           [W0, W1, W2, W3, W4, W5, W6, W7]>>,
+  CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+                                          [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
   CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                           [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
   CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
-  CCIf<"ValVT == MVT::i16", CCAssignToStack<2, 2>>,
+  CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
-  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+  CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
            CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToStack<16, 16>>
 ]>;
 
 def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
@@ -154,13 +161,15 @@
 
   // Handle all scalar types as either i64 or f64.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
-  CCIfType<[f32],          CCPromoteToType<f64>>,
+  CCIfType<[f16, f32],     CCPromoteToType<f64>>,
 
   // Everything is on the stack.
   // i128 is split to two i64s, and its stack alignment is 16 bytes.
   CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
-  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],   CCAssignToStack<16, 16>>
+  CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+           CCAssignToStack<8, 8>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+           CCAssignToStack<16, 16>>
 ]>;
 
 // The WebKit_JS calling convention only passes the first argument (the callee)

diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 4d23dc5..aab8e38 100644
--- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp

@@ -94,7 +94,7 @@
     MachineFunction *MF = I->getParent()->getParent();
     const AArch64TargetMachine *TM =
         static_cast<const AArch64TargetMachine *>(&MF->getTarget());
-    const AArch64InstrInfo *TII = TM->getInstrInfo();
+    const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
 
     // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
     // code sequence assumes the address will be.
@@ -114,7 +114,7 @@
     MachineFunction *MF = I->getParent()->getParent();
     const AArch64TargetMachine *TM =
         static_cast<const AArch64TargetMachine *>(&MF->getTarget());
-    const AArch64InstrInfo *TII = TM->getInstrInfo();
+    const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
 
     // Create a virtual register for the TLS base address.
     MachineRegisterInfo &RegInfo = MF->getRegInfo();

diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 6b1f096..87b545b 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp

@@ -101,25 +101,26 @@
 #include "AArch64.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-collect-loh"
@@ -194,12 +195,14 @@
 /// Map a basic block to a set of instructions per register.
 /// This is used to represent the exposed uses of a basic block
 /// per register.
-typedef MapVector<const MachineBasicBlock *, SetOfMachineInstr *>
+typedef MapVector<const MachineBasicBlock *,
+                  std::unique_ptr<SetOfMachineInstr[]>>
 BlockToSetOfInstrsPerColor;
 /// Map a basic block to an instruction per register.
 /// This is used to represent the live-out definitions of a basic block
 /// per register.
-typedef MapVector<const MachineBasicBlock *, const MachineInstr **>
+typedef MapVector<const MachineBasicBlock *,
+                  std::unique_ptr<const MachineInstr *[]>>
 BlockToInstrPerColor;
 /// Map an instruction to a set of instructions. Used to represent the
 /// mapping def to reachable uses or use to definitions.
@@ -236,9 +239,9 @@
   SetOfMachineInstr *result;
   BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB);
   if (it != sets.end())
-    result = it->second;
+    result = it->second.get();
   else
-    result = sets[&MBB] = new SetOfMachineInstr[nbRegs];
+    result = (sets[&MBB] = make_unique<SetOfMachineInstr[]>(nbRegs)).get();
 
   return result[reg];
 }
@@ -283,14 +286,14 @@
                             const MapRegToId &RegToId,
                             const MachineInstr *DummyOp, bool ADRPMode) {
   const TargetMachine &TM = MF.getTarget();
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
 
   unsigned NbReg = RegToId.size();
 
   for (MachineBasicBlock &MBB : MF) {
-    const MachineInstr **&BBGen = Gen[&MBB];
-    BBGen = new const MachineInstr *[NbReg];
-    memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg);
+    auto &BBGen = Gen[&MBB];
+    BBGen = make_unique<const MachineInstr *[]>(NbReg);
+    std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr);
 
     BitVector &BBKillSet = Kill[&MBB];
     BBKillSet.resize(NbReg);
@@ -421,22 +424,6 @@
   } while (HasChanged);
 }
 
-/// Release all memory dynamically allocated during the reaching
-/// definition algorithm.
-static void finitReachingDef(BlockToSetOfInstrsPerColor &In,
-                             BlockToSetOfInstrsPerColor &Out,
-                             BlockToInstrPerColor &Gen,
-                             BlockToSetOfInstrsPerColor &ReachableUses) {
-  for (auto &IT : Out)
-    delete[] IT.second;
-  for (auto &IT : In)
-    delete[] IT.second;
-  for (auto &IT : ReachableUses)
-    delete[] IT.second;
-  for (auto &IT : Gen)
-    delete[] IT.second;
-}
-
 /// Reaching definition algorithm.
 /// \param MF function on which the algorithm will operate.
 /// \param[out] ColorOpToReachedUses will contain the result of the reaching
@@ -473,9 +460,6 @@
   if (!DummyOp)
     reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill,
                          ReachableUses, RegToId.size());
-
-  // finit.
-  finitReachingDef(In, Out, Gen, ReachableUses);
 }
 
 #ifndef NDEBUG
@@ -1043,7 +1027,7 @@
 
 bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
   const TargetMachine &TM = MF.getTarget();
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
   const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
 
   MapRegToId RegToId;
@@ -1059,8 +1043,8 @@
 
   MachineInstr *DummyOp = nullptr;
   if (BasicBlockScopeOnly) {
-    const AArch64InstrInfo *TII =
-        static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+    const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>(
+        TM.getSubtargetImpl()->getInstrInfo());
     // For local analysis, create a dummy operation to record uses that are not
     // local.
     DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());

diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
new file mode 100644
index 0000000..0fbd3c6
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp

@@ -0,0 +1,422 @@
+//=- AArch64ConditionOptimizer.cpp - Remove useless comparisons for AArch64 -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to make consecutive compares of values use same operands to
+// allow CSE pass to remove duplicated instructions.  For this it analyzes
+// branches and adjusts comparisons with immediate values by converting:
+//  * GE -> GT
+//  * GT -> GE
+//  * LT -> LE
+//  * LE -> LT
+// and adjusting immediate values appropriately.  It basically corrects two
+// immediate values towards each other to make them equal.
+//
+// Consider the following example in C:
+//
+//   if ((a < 5 && ...) || (a > 5 && ...)) {
+//        ~~~~~             ~~~~~
+//          ^                 ^
+//          x                 y
+//
+// Here both "x" and "y" expressions compare "a" with "5".  When "x" evaluates
+// to "false", "y" can just check flags set by the first comparison.  As a
+// result of the canonicalization employed by
+// SelectionDAGBuilder::visitSwitchCase, DAGCombine, and other target-specific
+// code, assembly ends up in the form that is not CSE friendly:
+//
+//     ...
+//     cmp      w8, #4
+//     b.gt     .LBB0_3
+//     ...
+//   .LBB0_3:
+//     cmp      w8, #6
+//     b.lt     .LBB0_6
+//     ...
+//
+// Same assembly after the pass:
+//
+//     ...
+//     cmp      w8, #5
+//     b.ge     .LBB0_3
+//     ...
+//   .LBB0_3:
+//     cmp      w8, #5     // <-- CSE pass removes this instruction
+//     b.le     .LBB0_6
+//     ...
+//
+// Currently only SUBS and ADDS followed by b.?? are supported.
+//
+// TODO: maybe handle TBNZ/TBZ the same way as CMP when used instead for "a < 0"
+// TODO: handle other conditional instructions (e.g. CSET)
+// TODO: allow second branching to be anything if it doesn't require adjusting
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cstdlib>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-condopt"
+
+STATISTIC(NumConditionsAdjusted, "Number of conditions adjusted");
+
+namespace {
+class AArch64ConditionOptimizer : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  MachineDominatorTree *DomTree;
+
+public:
+  // Result of adjustCmp(): the new immediate, the compare instruction opcode
+  // and the adjusted branch condition code, stored in this order.
+  typedef std::tuple<int, int, AArch64CC::CondCode> CmpInfo;
+
+  static char ID;
+  AArch64ConditionOptimizer() : MachineFunctionPass(ID) {}
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  MachineInstr *findSuitableCompare(MachineBasicBlock *MBB);
+  CmpInfo adjustCmp(MachineInstr *CmpMI, AArch64CC::CondCode Cmp);
+  void modifyCmp(MachineInstr *CmpMI, const CmpInfo &Info);
+  bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To,
+                int ToImm);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  const char *getPassName() const override {
+    return "AArch64 Condition Optimizer";
+  }
+};
+} // end anonymous namespace
+
+char AArch64ConditionOptimizer::ID = 0;
+
+namespace llvm {
+void initializeAArch64ConditionOptimizerPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt",
+                      "AArch64 CondOpt Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt",
+                    "AArch64 CondOpt Pass", false, false)
+
+FunctionPass *llvm::createAArch64ConditionOptimizerPass() {
+  return new AArch64ConditionOptimizer();
+}
+
+void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<LiveIntervals>();
+  AU.addPreserved<LiveIntervals>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+// Finds the compare feeding the block's Bcc terminator.  Returns nullptr when
+// there is none or it is unsupported; shifted or symbolic immediates are
+// rejected because the callers treat operand 2 as a plain integer value.
+MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+  if (I == MBB->end() || I->getOpcode() != AArch64::Bcc)
+    return nullptr;
+
+  // Now find the instruction controlling the terminator.
+  for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+    --I;
+    assert(!I->isTerminator() && "Spurious terminator");
+    switch (I->getOpcode()) {
+    // cmp is an alias for subs with a dead destination register.
+    case AArch64::SUBSWri:
+    case AArch64::SUBSXri:
+    // cmn is an alias for adds with a dead destination register.
+    case AArch64::ADDSWri:
+    case AArch64::ADDSXri:
+      if (!I->getOperand(2).isImm() || I->getOperand(3).getImm() != 0) {
+        DEBUG(dbgs() << "Immediate of cmp is not plain, " << *I << '\n');
+        return nullptr;
+      }
+      if (I->getOperand(0).isDead())
+        return I;
+      DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+      return nullptr;
+
+    // Prevent false positive case like:
+    // cmp      w19, #0
+    // cinc     w0, w19, gt
+    // ...
+    // fcmp     d8, #0.0
+    // b.gt     .LBB0_5
+    case AArch64::FCMPDri:
+    case AArch64::FCMPSri:
+    case AArch64::FCMPESri:
+    case AArch64::FCMPEDri:
+
+    case AArch64::SUBSWrr:
+    case AArch64::SUBSXrr:
+    case AArch64::ADDSWrr:
+    case AArch64::ADDSXrr:
+    case AArch64::FCMPSrr:
+    case AArch64::FCMPDrr:
+    case AArch64::FCMPESrr:
+    case AArch64::FCMPEDrr:
+      // Skip comparison instructions without immediate operands.
+      return nullptr;
+    }
+  }
+  DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+  return nullptr;
+}
+
+// Swaps ADDS <-> SUBS (immediate forms), keeping the W/X register width.
+static int getComplementOpc(int Opc) {
+  switch (Opc) {
+  case AArch64::ADDSWri: return AArch64::SUBSWri;
+  case AArch64::ADDSXri: return AArch64::SUBSXri;
+  case AArch64::SUBSWri: return AArch64::ADDSWri;
+  case AArch64::SUBSXri: return AArch64::ADDSXri;
+  default:
+    llvm_unreachable("Unexpected opcode");
+  }
+}
+
+// Swaps a condition code with its counterpart: GT <-> GE and LT <-> LE.
+static AArch64CC::CondCode getAdjustedCmp(AArch64CC::CondCode Cmp) {
+  switch (Cmp) {
+  case AArch64CC::GT: return AArch64CC::GE;
+  case AArch64CC::GE: return AArch64CC::GT;
+  case AArch64CC::LT: return AArch64CC::LE;
+  case AArch64CC::LE: return AArch64CC::LT;
+  default:
+    llvm_unreachable("Unexpected condition code");
+  }
+}
+
+// Converts a comparison between its exclusive and inclusive forms (GT <-> GE,
+// LT <-> LE) and returns the immediate, opcode and condition code to use.
+AArch64ConditionOptimizer::CmpInfo AArch64ConditionOptimizer::adjustCmp(
+    MachineInstr *CmpMI, AArch64CC::CondCode Cmp) {
+  int Opc = CmpMI->getOpcode();
+
+  // CMN (compare with negative immediate) is an alias to ADDS (as
+  // "operand - negative" == "operand + positive")
+  bool Negative = (Opc == AArch64::ADDSWri || Opc == AArch64::ADDSXri);
+
+  int Correction = (Cmp == AArch64CC::GT || Cmp == AArch64CC::LE) ? 1 : -1;
+  // Negate Correction value for comparison with negative immediate (CMN).
+  if (Negative) {
+    Correction = -Correction;
+  }
+
+  const int OldImm = (int)CmpMI->getOperand(2).getImm();
+  const int NewImm = std::abs(OldImm + Correction);
+
+  // A zero immediate that has to decrease would turn negative, so keep its
+  // magnitude (std::abs above) and flip ADDS <-> SUBS instead; this covers
+  // the "cmp #0" -> "cmn #1" and "cmn #0" -> "cmp #1" transitions.
+  if (OldImm == 0 && Correction == -1) {
+    Opc = getComplementOpc(Opc);
+  }
+
+  return CmpInfo(NewImm, Opc, getAdjustedCmp(Cmp));
+}
+
+// Applies a CmpInfo from adjustCmp() to CmpMI and to its block's branch.
+void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
+    const CmpInfo &Info) {
+  int Imm;
+  int Opc;
+  AArch64CC::CondCode Cmp;
+  std::tie(Imm, Opc, Cmp) = Info;
+
+  MachineBasicBlock *const MBB = CmpMI->getParent();
+
+  // Rebuild the compare with the new opcode and immediate, then drop the old.
+  BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc))
+      .addOperand(CmpMI->getOperand(0))
+      .addOperand(CmpMI->getOperand(1))
+      .addImm(Imm)
+      .addOperand(CmpMI->getOperand(3));
+  CmpMI->eraseFromParent();
+
+  // The fact that this comparison was picked ensures that it's related to the
+  // first terminator instruction.
+  MachineInstr *BrMI = MBB->getFirstTerminator();
+
+  // Rebuild the branch with the new condition, reusing its old operand 1.
+  BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc))
+      .addImm(Cmp)
+      .addOperand(BrMI->getOperand(1));
+  BrMI->eraseFromParent();
+
+  MBB->updateTerminator();
+
+  ++NumConditionsAdjusted;
+}
+
+// Parses the condition produced by AnalyzeBranch into the CondCode that
+// corresponds to TBB.  Returns true and sets CC on success; returns false
+// (leaving CC untouched) when the first operand is the -1 marker.
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+  // Anything but the -1 marker is a plain b.cc condition code.
+  if (Cond[0].getImm() != -1) {
+    assert(Cond.size() == 1 && "Unknown Cond array format");
+    CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+    return true;
+  }
+  return false;
+}
+
+// Rewrites CmpMI (and its branch) when the adjusted form would have the same
+// opcode and immediate as To, which lets CSE drop one of the two compares.
+// Returns true if CmpMI was rewritten and false if it was left untouched.
+bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
+                                         AArch64CC::CondCode Cmp,
+                                         MachineInstr *To, int ToImm) {
+  const CmpInfo Adjusted = adjustCmp(CmpMI, Cmp);
+  const bool SameImm = std::get<0>(Adjusted) == ToImm;
+  if (!SameImm || std::get<1>(Adjusted) != To->getOpcode())
+    return false;
+  modifyCmp(CmpMI, Adjusted);
+  return true;
+}
+
+// Walks the dominator tree and, for each block and its taken successor that
+// both end in cmp-imm + b.cc, adjusts the compares so that CSE can merge them.
+bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** AArch64 Condition Optimizer **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+  TII = MF.getTarget().getSubtargetImpl()->getInstrInfo();
+  DomTree = &getAnalysis<MachineDominatorTree>();
+
+  bool Changed = false;
+
+  // Visit blocks in dominator tree pre-order.  modifyCmp() only swaps
+  // instructions inside existing blocks, so neither the CFG nor the dominator
+  // tree changes while the depth-first iterator is walking it.
+  for (MachineDomTreeNode *I : depth_first(DomTree)) {
+    MachineBasicBlock *HBB = I->getBlock();
+
+    SmallVector<MachineOperand, 4> HeadCond;
+    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+    if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) {
+      continue;
+    }
+
+    // Equivalence check is to skip loops.
+    if (!TBB || TBB == HBB) {
+      continue;
+    }
+
+    SmallVector<MachineOperand, 4> TrueCond;
+    MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr;
+    if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) {
+      continue;
+    }
+
+    MachineInstr *HeadCmpMI = findSuitableCompare(HBB);
+    if (!HeadCmpMI) {
+      continue;
+    }
+
+    MachineInstr *TrueCmpMI = findSuitableCompare(TBB);
+    if (!TrueCmpMI) {
+      continue;
+    }
+
+    AArch64CC::CondCode HeadCmp;
+    if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) {
+      continue;
+    }
+
+    AArch64CC::CondCode TrueCmp;
+    if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) {
+      continue;
+    }
+
+    const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm();
+    const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm();
+
+    DEBUG(dbgs() << "Head branch:\n");
+    DEBUG(dbgs() << "\tcondition: "
+          << AArch64CC::getCondCodeName(HeadCmp) << '\n');
+    DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n');
+
+    DEBUG(dbgs() << "True branch:\n");
+    DEBUG(dbgs() << "\tcondition: "
+          << AArch64CC::getCondCodeName(TrueCmp) << '\n');
+    DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n');
+
+    if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) ||
+         (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) &&
+        std::abs(TrueImm - HeadImm) == 2) {
+      // This branch transforms machine instructions that correspond to
+      //
+      // 1) (a > {TrueImm} && ...) || (a < {HeadImm} && ...)
+      // 2) (a < {TrueImm} && ...) || (a > {HeadImm} && ...)
+      //
+      // into
+      //
+      // 1) (a >= {NewImm} && ...) || (a <= {NewImm} && ...)
+      // 2) (a <= {NewImm} && ...) || (a >= {NewImm} && ...)
+
+      CmpInfo HeadCmpInfo = adjustCmp(HeadCmpMI, HeadCmp);
+      CmpInfo TrueCmpInfo = adjustCmp(TrueCmpMI, TrueCmp);
+      if (std::get<0>(HeadCmpInfo) == std::get<0>(TrueCmpInfo) &&
+          std::get<1>(HeadCmpInfo) == std::get<1>(TrueCmpInfo)) {
+        modifyCmp(HeadCmpMI, HeadCmpInfo);
+        modifyCmp(TrueCmpMI, TrueCmpInfo);
+        Changed = true;
+      }
+    } else if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::GT) ||
+                (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::LT)) &&
+               std::abs(TrueImm - HeadImm) == 1) {
+      // This branch transforms machine instructions that correspond to
+      //
+      // 1) (a > {TrueImm} && ...) || (a > {HeadImm} && ...)
+      // 2) (a < {TrueImm} && ...) || (a < {HeadImm} && ...)
+      //
+      // into
+      //
+      // 1) (a <= {NewImm} && ...) || (a >  {NewImm} && ...)
+      // 2) (a <  {NewImm} && ...) || (a >= {NewImm} && ...)
+
+      // GT -> GE transformation increases immediate value, so picking the
+      // smaller one; LT -> LE decreases immediate value so invert the choice.
+      bool adjustHeadCond = (HeadImm < TrueImm);
+      if (HeadCmp == AArch64CC::LT) {
+        adjustHeadCond = !adjustHeadCond;
+      }
+
+      if (adjustHeadCond) {
+        Changed |= adjustTo(HeadCmpMI, HeadCmp, TrueCmpMI, TrueImm);
+      } else {
+        Changed |= adjustTo(TrueCmpMI, TrueCmp, HeadCmpMI, HeadImm);
+      }
+    }
+    // Other transformation cases almost never occur due to generation of < or >
+    // comparisons instead of <= and >=.
+  }
+
+  return Changed;
+}

diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 452cdec..54f53dc 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp

@@ -191,8 +191,8 @@
   /// runOnMachineFunction - Initialize per-function data structures.
   void runOnMachineFunction(MachineFunction &MF) {
     this->MF = &MF;
-    TII = MF.getTarget().getInstrInfo();
-    TRI = MF.getTarget().getRegisterInfo();
+    TII = MF.getSubtarget().getInstrInfo();
+    TRI = MF.getSubtarget().getRegisterInfo();
     MRI = &MF.getRegInfo();
   }
 
@@ -723,7 +723,7 @@
 class AArch64ConditionalCompares : public MachineFunctionPass {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
-  const MCSchedModel *SchedModel;
+  MCSchedModel SchedModel;
   // Does the proceeded function has Oz attribute.
   bool MinSize;
   MachineRegisterInfo *MRI;
@@ -845,7 +845,7 @@
   // the cost of a misprediction.
   //
   // Set a limit on the delay we will accept.
-  unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4;
+  unsigned DelayLimit = SchedModel.MispredictPenalty * 3 / 4;
 
   // Instruction depths can be computed for all trace instructions above CmpBB.
   unsigned HeadDepth =
@@ -891,8 +891,8 @@
 bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
   DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
                << "********** Function: " << MF.getName() << '\n');
-  TII = MF.getTarget().getInstrInfo();
-  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   SchedModel =
       MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
   MRI = &MF.getRegInfo();

diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index a2d853c..74fc167 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp

@@ -14,11 +14,12 @@
 #include "AArch64.h"
 #include "AArch64RegisterInfo.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-dead-defs"
@@ -36,11 +37,11 @@
   static char ID; // Pass identification, replacement for typeid.
   explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
 
-  virtual bool runOnMachineFunction(MachineFunction &F) override;
+  bool runOnMachineFunction(MachineFunction &F) override;
 
   const char *getPassName() const override { return "Dead register definitions"; }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -119,7 +120,7 @@
 // Scan the function for instructions that have a dead definition of a
 // register. Replace that register with the zero register when possible.
 bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
-  TRI = MF.getTarget().getRegisterInfo();
+  TRI = MF.getSubtarget().getRegisterInfo();
   bool Changed = false;
   DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
 

diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index a76fd76..c850680 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp

@@ -16,6 +16,7 @@
 
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Support/MathExtras.h"
@@ -634,19 +635,6 @@
     return true;
   }
 
-  case AArch64::FCVTSHpseudo: {
-    MachineOperand Src = MI.getOperand(1);
-    Src.setImplicit();
-    unsigned SrcH =
-        TII->getRegisterInfo().getSubReg(Src.getReg(), AArch64::hsub);
-    auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::FCVTSHr))
-                   .addOperand(MI.getOperand(0))
-                   .addReg(SrcH, RegState::Undef)
-                   .addOperand(Src);
-    transferImpOps(MI, MIB, MIB);
-    MI.eraseFromParent();
-    return true;
-  }
   case AArch64::LOADgot: {
     // Expand into ADRP + LDR.
     unsigned DstReg = MI.getOperand(0).getReg();
@@ -735,7 +723,7 @@
 }
 
 bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   bool Modified = false;
   for (auto &MBB : MF)

diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 2164d77..612cb00 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp

@@ -14,9 +14,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
-#include "AArch64TargetMachine.h"
 #include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -39,8 +40,7 @@
 
 namespace {
 
-class AArch64FastISel : public FastISel {
-
+class AArch64FastISel final : public FastISel {
   class Address {
   public:
     typedef enum {
@@ -50,16 +50,23 @@
 
   private:
     BaseKind Kind;
+    AArch64_AM::ShiftExtendType ExtType;
     union {
       unsigned Reg;
       int FI;
     } Base;
+    unsigned OffsetReg;
+    unsigned Shift;
     int64_t Offset;
+    const GlobalValue *GV;
 
   public:
-    Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; }
+    Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend),
+      OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; }
     void setKind(BaseKind K) { Kind = K; }
     BaseKind getKind() const { return Kind; }
+    void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; }
+    AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; }
     bool isRegBase() const { return Kind == RegBase; }
     bool isFIBase() const { return Kind == FrameIndexBase; }
     void setReg(unsigned Reg) {
@@ -70,6 +77,12 @@
       assert(isRegBase() && "Invalid base register access!");
       return Base.Reg;
     }
+    void setOffsetReg(unsigned Reg) {
+      OffsetReg = Reg;
+    }
+    unsigned getOffsetReg() const {
+      return OffsetReg;
+    }
     void setFI(unsigned FI) {
       assert(isFIBase() && "Invalid base frame index  access!");
       Base.FI = FI;
@@ -80,8 +93,11 @@
     }
     void setOffset(int64_t O) { Offset = O; }
     int64_t getOffset() { return Offset; }
+    void setShift(unsigned S) { Shift = S; }
+    unsigned getShift() { return Shift; }
 
-    bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); }
+    void setGlobalValue(const GlobalValue *G) { GV = G; }
+    const GlobalValue *getGlobalValue() { return GV; }
   };
 
   /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
@@ -89,74 +105,152 @@
   const AArch64Subtarget *Subtarget;
   LLVMContext *Context;
 
+  bool fastLowerArguments() override;
+  bool fastLowerCall(CallLoweringInfo &CLI) override;
+  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
 private:
   // Selection routines.
-  bool SelectLoad(const Instruction *I);
-  bool SelectStore(const Instruction *I);
-  bool SelectBranch(const Instruction *I);
-  bool SelectIndirectBr(const Instruction *I);
-  bool SelectCmp(const Instruction *I);
-  bool SelectSelect(const Instruction *I);
-  bool SelectFPExt(const Instruction *I);
-  bool SelectFPTrunc(const Instruction *I);
-  bool SelectFPToInt(const Instruction *I, bool Signed);
-  bool SelectIntToFP(const Instruction *I, bool Signed);
-  bool SelectRem(const Instruction *I, unsigned ISDOpcode);
-  bool SelectCall(const Instruction *I, const char *IntrMemName);
-  bool SelectIntrinsicCall(const IntrinsicInst &I);
-  bool SelectRet(const Instruction *I);
-  bool SelectTrunc(const Instruction *I);
-  bool SelectIntExt(const Instruction *I);
-  bool SelectMul(const Instruction *I);
+  bool selectAddSub(const Instruction *I);
+  bool selectLogicalOp(const Instruction *I);
+  bool selectLoad(const Instruction *I);
+  bool selectStore(const Instruction *I);
+  bool selectBranch(const Instruction *I);
+  bool selectIndirectBr(const Instruction *I);
+  bool selectCmp(const Instruction *I);
+  bool selectSelect(const Instruction *I);
+  bool selectFPExt(const Instruction *I);
+  bool selectFPTrunc(const Instruction *I);
+  bool selectFPToInt(const Instruction *I, bool Signed);
+  bool selectIntToFP(const Instruction *I, bool Signed);
+  bool selectRem(const Instruction *I, unsigned ISDOpcode);
+  bool selectRet(const Instruction *I);
+  bool selectTrunc(const Instruction *I);
+  bool selectIntExt(const Instruction *I);
+  bool selectMul(const Instruction *I);
+  bool selectShift(const Instruction *I);
+  bool selectBitCast(const Instruction *I);
+  bool selectFRem(const Instruction *I);
+  bool selectSDiv(const Instruction *I);
+  bool selectGetElementPtr(const Instruction *I);
 
   // Utility helper routines.
   bool isTypeLegal(Type *Ty, MVT &VT);
-  bool isLoadStoreTypeLegal(Type *Ty, MVT &VT);
-  bool ComputeAddress(const Value *Obj, Address &Addr);
-  bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor,
-                       bool UseUnscaled);
-  void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
-                            unsigned Flags, bool UseUnscaled);
-  bool IsMemCpySmall(uint64_t Len, unsigned Alignment);
-  bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+  bool isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed = false);
+  bool isValueAvailable(const Value *V) const;
+  bool computeAddress(const Value *Obj, Address &Addr, Type *Ty = nullptr);
+  bool computeCallAddress(const Value *V, Address &Addr);
+  bool simplifyAddress(Address &Addr, MVT VT);
+  void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
+                            unsigned Flags, unsigned ScaleFactor,
+                            MachineMemOperand *MMO);
+  bool isMemCpySmall(uint64_t Len, unsigned Alignment);
+  bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
                           unsigned Alignment);
-  // Emit functions.
-  bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt);
-  bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
-                bool UseUnscaled = false);
-  bool EmitStore(MVT VT, unsigned SrcReg, Address Addr,
-                 bool UseUnscaled = false);
-  unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
-  unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+  bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I,
+                         const Value *Cond);
+  bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT);
+  bool optimizeSelect(const SelectInst *SI);
+  std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx);
 
-  unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT);
-  unsigned AArch64MaterializeGV(const GlobalValue *GV);
+  // Emit helper routines.
+  unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+                      const Value *RHS, bool SetFlags = false,
+                      bool WantResult = true,  bool IsZExt = false);
+  unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                         bool SetFlags = false, bool WantResult = true);
+  unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, uint64_t Imm, bool SetFlags = false,
+                         bool WantResult = true);
+  unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                         AArch64_AM::ShiftExtendType ShiftType,
+                         uint64_t ShiftImm, bool SetFlags = false,
+                         bool WantResult = true);
+  unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                         bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                          AArch64_AM::ShiftExtendType ExtType,
+                          uint64_t ShiftImm, bool SetFlags = false,
+                         bool WantResult = true);
+
+  // Emit functions.
+  bool emitCompareAndBranch(const BranchInst *BI);
+  bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt);
+  bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt);
+  bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+  bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS);
+  unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true,
+                    MachineMemOperand *MMO = nullptr);
+  bool emitStore(MVT VT, unsigned SrcReg, Address Addr,
+                 MachineMemOperand *MMO = nullptr);
+  unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+  unsigned emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+  unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+                   bool SetFlags = false, bool WantResult = true,
+                   bool IsZExt = false);
+  unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm);
+  unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+                   bool SetFlags = false, bool WantResult = true,
+                   bool IsZExt = false);
+  unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                       unsigned RHSReg, bool RHSIsKill, bool WantResult = true);
+  unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                       unsigned RHSReg, bool RHSIsKill,
+                       AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm,
+                       bool WantResult = true);
+  unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS,
+                         const Value *RHS);
+  unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+                            bool LHSIsKill, uint64_t Imm);
+  unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+                            bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+                            uint64_t ShiftImm);
+  unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+  unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                      unsigned Op1, bool Op1IsKill);
+  unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                        unsigned Op1, bool Op1IsKill);
+  unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                        unsigned Op1, bool Op1IsKill);
+  unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                      unsigned Op1Reg, bool Op1IsKill);
+  unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+                      uint64_t Imm, bool IsZExt = true);
+  unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                      unsigned Op1Reg, bool Op1IsKill);
+  unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+                      uint64_t Imm, bool IsZExt = true);
+  unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                      unsigned Op1Reg, bool Op1IsKill);
+  unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+                      uint64_t Imm, bool IsZExt = false);
+
+  unsigned materializeInt(const ConstantInt *CI, MVT VT);
+  unsigned materializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned materializeGV(const GlobalValue *GV);
 
   // Call handling routines.
 private:
   CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
-  bool ProcessCallArgs(SmallVectorImpl<Value *> &Args,
-                       SmallVectorImpl<unsigned> &ArgRegs,
-                       SmallVectorImpl<MVT> &ArgVTs,
-                       SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
-                       SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
+  bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
                        unsigned &NumBytes);
-  bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
-                  const Instruction *I, CallingConv::ID CC, unsigned &NumBytes);
+  bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
 
 public:
   // Backend specific FastISel code.
-  unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
-  unsigned TargetMaterializeConstant(const Constant *C) override;
+  unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+  unsigned fastMaterializeConstant(const Constant *C) override;
+  unsigned fastMaterializeFloatZero(const ConstantFP* CF) override;
 
-  explicit AArch64FastISel(FunctionLoweringInfo &funcInfo,
-                         const TargetLibraryInfo *libInfo)
-      : FastISel(funcInfo, libInfo) {
+  explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo,
+                         const TargetLibraryInfo *LibInfo)
+      : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
     Subtarget = &TM.getSubtarget<AArch64Subtarget>();
-    Context = &funcInfo.Fn->getContext();
+    Context = &FuncInfo.Fn->getContext();
   }
 
-  bool TargetSelectInstruction(const Instruction *I) override;
+  bool fastSelectInstruction(const Instruction *I) override;
 
 #include "AArch64GenFastISel.inc"
 };
@@ -165,13 +259,52 @@
 
 #include "AArch64GenCallingConv.inc"
 
+/// \brief Check if the sign-/zero-extend will be a noop.
+static bool isIntExtFree(const Instruction *I) {
+  assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+         "Unexpected integer extend instruction.");
+  assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() &&
+         "Unexpected value type.");
+  bool IsZExt = isa<ZExtInst>(I);
+
+  if (const auto *LI = dyn_cast<LoadInst>(I->getOperand(0)))
+    if (LI->hasOneUse())
+      return true;
+
+  if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0)))
+    if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr()))
+      return true;
+
+  return false;
+}
+
+/// \brief Determine the implicit scale factor that is applied by a memory
+/// operation for a given value type.
+static unsigned getImplicitScaleFactor(MVT VT) {
+  switch (VT.SimpleTy) {
+  default:
+    return 0;    // invalid
+  case MVT::i1:  // fall-through
+  case MVT::i8:
+    return 1;
+  case MVT::i16:
+    return 2;
+  case MVT::i32: // fall-through
+  case MVT::f32:
+    return 4;
+  case MVT::i64: // fall-through
+  case MVT::f64:
+    return 8;
+  }
+}
+
 CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
   if (CC == CallingConv::WebKit_JS)
     return CC_AArch64_WebKit_JS;
   return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
 }
 
-unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) {
   assert(TLI.getValueType(AI->getType(), true) == MVT::i64 &&
          "Alloca should always return a pointer.");
 
@@ -183,7 +316,7 @@
       FuncInfo.StaticAllocaMap.find(AI);
 
   if (SI != FuncInfo.StaticAllocaMap.end()) {
-    unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+    unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
             ResultReg)
         .addFrameIndex(SI->second)
@@ -195,29 +328,42 @@
   return 0;
 }
 
-unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) {
+unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) {
+  if (VT > MVT::i64)
+    return 0;
+
+  if (!CI->isZero())
+    return fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+
+  // Create a copy from the zero register to materialize a "0" value.
+  const TargetRegisterClass *RC = (VT == MVT::i64) ? &AArch64::GPR64RegClass
+                                                   : &AArch64::GPR32RegClass;
+  unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+          ResultReg).addReg(ZeroReg, getKillRegState(true));
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
+  // Positive zero (+0.0) has to be materialized with a fmov from the zero
+  // register, because the immediate version of fmov cannot encode zero.
+  if (CFP->isNullValue())
+    return fastMaterializeFloatZero(CFP);
+
   if (VT != MVT::f32 && VT != MVT::f64)
     return 0;
 
   const APFloat Val = CFP->getValueAPF();
-  bool is64bit = (VT == MVT::f64);
-
+  bool Is64Bit = (VT == MVT::f64);
   // This checks to see if we can use FMOV instructions to materialize
   // a constant, otherwise we have to materialize via the constant pool.
   if (TLI.isFPImmLegal(Val, VT)) {
-    int Imm;
-    unsigned Opc;
-    if (is64bit) {
-      Imm = AArch64_AM::getFP64Imm(Val);
-      Opc = AArch64::FMOVDi;
-    } else {
-      Imm = AArch64_AM::getFP32Imm(Val);
-      Opc = AArch64::FMOVSi;
-    }
-    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
-        .addImm(Imm);
-    return ResultReg;
+    int Imm =
+        Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val);
+    assert((Imm != -1) && "Cannot encode floating-point constant.");
+    unsigned Opc = Is64Bit ? AArch64::FMOVDi : AArch64::FMOVSi;
+    return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
   }
 
   // Materialize via constant pool.  MachineConstantPool wants an explicit
@@ -226,20 +372,20 @@
   if (Align == 0)
     Align = DL.getTypeAllocSize(CFP->getType());
 
-  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
   unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
-          ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE);
+          ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE);
 
-  unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui;
+  unsigned Opc = Is64Bit ? AArch64::LDRDui : AArch64::LDRSui;
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
       .addReg(ADRPReg)
-      .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+      .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
   return ResultReg;
 }
 
-unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) {
+unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
   // We can't handle thread-local variables quickly yet.
   if (GV->isThreadLocal())
     return 0;
@@ -262,30 +408,34 @@
     // ADRP + LDRX
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
             ADRPReg)
-        .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+      .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
 
     ResultReg = createResultReg(&AArch64::GPR64RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
             ResultReg)
-        .addReg(ADRPReg)
-        .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
-                          AArch64II::MO_NC);
+      .addReg(ADRPReg)
+      .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+                        AArch64II::MO_NC);
+  } else if (OpFlags & AArch64II::MO_CONSTPOOL) {
+    // We can't handle addresses loaded from a constant pool quickly yet.
+    return 0;
   } else {
     // ADRP + ADDX
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
-            ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+            ADRPReg)
+      .addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
 
     ResultReg = createResultReg(&AArch64::GPR64spRegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
             ResultReg)
-        .addReg(ADRPReg)
-        .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
-        .addImm(0);
+      .addReg(ADRPReg)
+      .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
+      .addImm(0);
   }
   return ResultReg;
 }
 
-unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) {
+unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) {
   EVT CEVT = TLI.getValueType(C->getType(), true);
 
   // Only handle simple types.
@@ -293,17 +443,48 @@
     return 0;
   MVT VT = CEVT.getSimpleVT();
 
-  // FIXME: Handle ConstantInt.
-  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
-    return AArch64MaterializeFP(CFP, VT);
+  if (const auto *CI = dyn_cast<ConstantInt>(C))
+    return materializeInt(CI, VT);
+  else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return materializeFP(CFP, VT);
   else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
-    return AArch64MaterializeGV(GV);
+    return materializeGV(GV);
 
   return 0;
 }
 
+unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) {
+  assert(CFP->isNullValue() &&
+         "Floating-point constant is not a positive zero.");
+  MVT VT;
+  if (!isTypeLegal(CFP->getType(), VT))
+    return 0;
+
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return 0;
+
+  bool Is64Bit = (VT == MVT::f64);
+  unsigned ZReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+  unsigned Opc = Is64Bit ? AArch64::FMOVXDr : AArch64::FMOVWSr;
+  return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true);
+}
+
+/// \brief Check if the multiply is by a power-of-2 constant.
+static bool isMulPowOf2(const Value *I) {
+  if (const auto *MI = dyn_cast<MulOperator>(I)) {
+    if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(0)))
+      if (C->getValue().isPowerOf2())
+        return true;
+    if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(1)))
+      if (C->getValue().isPowerOf2())
+        return true;
+  }
+  return false;
+}
+
 // Computes the address to get to an object.
-bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
+{
   const User *U = nullptr;
   unsigned Opcode = Instruction::UserOp1;
   if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
@@ -330,18 +511,18 @@
     break;
   case Instruction::BitCast: {
     // Look through bitcasts.
-    return ComputeAddress(U->getOperand(0), Addr);
+    return computeAddress(U->getOperand(0), Addr, Ty);
   }
   case Instruction::IntToPtr: {
     // Look past no-op inttoptrs.
     if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
-      return ComputeAddress(U->getOperand(0), Addr);
+      return computeAddress(U->getOperand(0), Addr, Ty);
     break;
   }
   case Instruction::PtrToInt: {
     // Look past no-op ptrtoints.
     if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
-      return ComputeAddress(U->getOperand(0), Addr);
+      return computeAddress(U->getOperand(0), Addr, Ty);
     break;
   }
   case Instruction::GetElementPtr: {
@@ -383,7 +564,7 @@
 
     // Try to grab the base operand now.
     Addr.setOffset(TmpOffset);
-    if (ComputeAddress(U->getOperand(0), Addr))
+    if (computeAddress(U->getOperand(0), Addr, Ty))
       return true;
 
     // We failed, restore everything and try the other options.
@@ -403,14 +584,301 @@
     }
     break;
   }
+  case Instruction::Add: {
+    // Adds of constants are common and easy enough.
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    if (isa<ConstantInt>(LHS))
+      std::swap(LHS, RHS);
+
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+      Addr.setOffset(Addr.getOffset() + CI->getSExtValue());
+      return computeAddress(LHS, Addr, Ty);
+    }
+
+    Address Backup = Addr;
+    if (computeAddress(LHS, Addr, Ty) && computeAddress(RHS, Addr, Ty))
+      return true;
+    Addr = Backup;
+
+    break;
+  }
+  case Instruction::Sub: {
+    // Subs of constants are common and easy enough.
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+      Addr.setOffset(Addr.getOffset() - CI->getSExtValue());
+      return computeAddress(LHS, Addr, Ty);
+    }
+    break;
+  }
+  case Instruction::Shl: {
+    if (Addr.getOffsetReg())
+      break;
+
+    const auto *CI = dyn_cast<ConstantInt>(U->getOperand(1));
+    if (!CI)
+      break;
+
+    unsigned Val = CI->getZExtValue();
+    if (Val < 1 || Val > 3)
+      break;
+
+    uint64_t NumBytes = 0;
+    if (Ty && Ty->isSized()) {
+      uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+      NumBytes = NumBits / 8;
+      if (!isPowerOf2_64(NumBits))
+        NumBytes = 0;
+    }
+
+    if (NumBytes != (1ULL << Val))
+      break;
+
+    Addr.setShift(Val);
+    Addr.setExtendType(AArch64_AM::LSL);
+
+    const Value *Src = U->getOperand(0);
+    if (const auto *I = dyn_cast<Instruction>(Src))
+      if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
+        Src = I;
+
+    // Fold the zext or sext when it won't become a noop.
+    if (const auto *ZE = dyn_cast<ZExtInst>(Src)) {
+      if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+          Addr.setExtendType(AArch64_AM::UXTW);
+          Src = ZE->getOperand(0);
+      }
+    } else if (const auto *SE = dyn_cast<SExtInst>(Src)) {
+      if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+        Addr.setExtendType(AArch64_AM::SXTW);
+        Src = SE->getOperand(0);
+      }
+    }
+
+    if (const auto *AI = dyn_cast<BinaryOperator>(Src))
+      if (AI->getOpcode() == Instruction::And) {
+        const Value *LHS = AI->getOperand(0);
+        const Value *RHS = AI->getOperand(1);
+
+        if (const auto *C = dyn_cast<ConstantInt>(LHS))
+          if (C->getValue() == 0xffffffff)
+            std::swap(LHS, RHS);
+
+        if (const auto *C = dyn_cast<ConstantInt>(RHS))
+          if (C->getValue() == 0xffffffff) {
+            Addr.setExtendType(AArch64_AM::UXTW);
+            unsigned Reg = getRegForValue(LHS);
+            if (!Reg)
+              return false;
+            bool RegIsKill = hasTrivialKill(LHS);
+            Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+                                             AArch64::sub_32);
+            Addr.setOffsetReg(Reg);
+            return true;
+          }
+      }
+
+    unsigned Reg = getRegForValue(Src);
+    if (!Reg)
+      return false;
+    Addr.setOffsetReg(Reg);
+    return true;
+  }
+  case Instruction::Mul: {
+    if (Addr.getOffsetReg())
+      break;
+
+    if (!isMulPowOf2(U))
+      break;
+
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    // Canonicalize power-of-2 value to the RHS.
+    if (const auto *C = dyn_cast<ConstantInt>(LHS))
+      if (C->getValue().isPowerOf2())
+        std::swap(LHS, RHS);
+
+    assert(isa<ConstantInt>(RHS) && "Expected an ConstantInt.");
+    const auto *C = cast<ConstantInt>(RHS);
+    unsigned Val = C->getValue().logBase2();
+    if (Val < 1 || Val > 3)
+      break;
+
+    uint64_t NumBytes = 0;
+    if (Ty && Ty->isSized()) {
+      uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+      NumBytes = NumBits / 8;
+      if (!isPowerOf2_64(NumBits))
+        NumBytes = 0;
+    }
+
+    if (NumBytes != (1ULL << Val))
+      break;
+
+    Addr.setShift(Val);
+    Addr.setExtendType(AArch64_AM::LSL);
+
+    const Value *Src = LHS;
+    if (const auto *I = dyn_cast<Instruction>(Src))
+      if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
+        Src = I;
+
+
+    // Fold the zext or sext when it won't become a noop.
+    if (const auto *ZE = dyn_cast<ZExtInst>(Src)) {
+      if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+        Addr.setExtendType(AArch64_AM::UXTW);
+        Src = ZE->getOperand(0);
+      }
+    } else if (const auto *SE = dyn_cast<SExtInst>(Src)) {
+      if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+        Addr.setExtendType(AArch64_AM::SXTW);
+        Src = SE->getOperand(0);
+      }
+    }
+
+    unsigned Reg = getRegForValue(Src);
+    if (!Reg)
+      return false;
+    Addr.setOffsetReg(Reg);
+    return true;
+  }
+  case Instruction::And: {
+    if (Addr.getOffsetReg())
+      break;
+
+    if (DL.getTypeSizeInBits(Ty) != 8)
+      break;
+
+    const Value *LHS = U->getOperand(0);
+    const Value *RHS = U->getOperand(1);
+
+    if (const auto *C = dyn_cast<ConstantInt>(LHS))
+      if (C->getValue() == 0xffffffff)
+        std::swap(LHS, RHS);
+
+    if (const auto *C = dyn_cast<ConstantInt>(RHS))
+      if (C->getValue() == 0xffffffff) {
+        Addr.setShift(0);
+        Addr.setExtendType(AArch64_AM::LSL);
+        Addr.setExtendType(AArch64_AM::UXTW);
+
+        unsigned Reg = getRegForValue(LHS);
+        if (!Reg)
+          return false;
+        bool RegIsKill = hasTrivialKill(LHS);
+        Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+                                         AArch64::sub_32);
+        Addr.setOffsetReg(Reg);
+        return true;
+      }
+    break;
+  }
+  case Instruction::SExt:
+  case Instruction::ZExt: {
+    if (!Addr.getReg() || Addr.getOffsetReg())
+      break;
+
+    const Value *Src = nullptr;
+    // Fold the zext or sext when it won't become a noop.
+    if (const auto *ZE = dyn_cast<ZExtInst>(U)) {
+      if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+        Addr.setExtendType(AArch64_AM::UXTW);
+        Src = ZE->getOperand(0);
+      }
+    } else if (const auto *SE = dyn_cast<SExtInst>(U)) {
+      if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+        Addr.setExtendType(AArch64_AM::SXTW);
+        Src = SE->getOperand(0);
+      }
+    }
+
+    if (!Src)
+      break;
+
+    Addr.setShift(0);
+    unsigned Reg = getRegForValue(Src);
+    if (!Reg)
+      return false;
+    Addr.setOffsetReg(Reg);
+    return true;
+  }
+  } // end switch
+
+  if (Addr.isRegBase() && !Addr.getReg()) {
+    unsigned Reg = getRegForValue(Obj);
+    if (!Reg)
+      return false;
+    Addr.setReg(Reg);
+    return true;
   }
 
-  // Try to get this in a register if nothing else has worked.
-  if (!Addr.isValid())
-    Addr.setReg(getRegForValue(Obj));
-  return Addr.isValid();
+  if (!Addr.getOffsetReg()) {
+    unsigned Reg = getRegForValue(Obj);
+    if (!Reg)
+      return false;
+    Addr.setOffsetReg(Reg);
+    return true;
+  }
+
+  return false;
 }
 
+bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  bool InMBB = true;
+
+  if (const auto *I = dyn_cast<Instruction>(V)) {
+    Opcode = I->getOpcode();
+    U = I;
+    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+  } else if (const auto *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts if its operand is in the same BB.
+    if (InMBB)
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(U->getType()) == TLI.getPointerTy())
+      return computeCallAddress(U->getOperand(0), Addr);
+    break;
+  }
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    Addr.setGlobalValue(GV);
+    return true;
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!Addr.getGlobalValue()) {
+    Addr.setReg(getRegForValue(V));
+    return Addr.getReg() != 0;
+  }
+
+  return false;
+}
+
+
 bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
   EVT evt = TLI.getValueType(Ty, true);
 
@@ -428,62 +896,122 @@
   return TLI.isTypeLegal(VT);
 }
 
-bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
+/// \brief Determine if the value type is supported by FastISel.
+///
+/// FastISel for AArch64 can handle more value types than are legal. This adds
+/// simple value types such as i1, i8, and i16.
+bool AArch64FastISel::isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed) {
+  if (Ty->isVectorTy() && !IsVectorAllowed)
+    return false;
+
   if (isTypeLegal(Ty, VT))
     return true;
 
   // If this is a type than can be sign or zero-extended to a basic operation
-  // go ahead and accept it now. For stores, this reflects truncation.
+  // go ahead and accept it now.
   if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
     return true;
 
   return false;
 }
 
-bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT,
-                                      int64_t ScaleFactor, bool UseUnscaled) {
-  bool needsLowering = false;
-  int64_t Offset = Addr.getOffset();
-  switch (VT.SimpleTy) {
-  default:
-    return false;
-  case MVT::i1:
-  case MVT::i8:
-  case MVT::i16:
-  case MVT::i32:
-  case MVT::i64:
-  case MVT::f32:
-  case MVT::f64:
-    if (!UseUnscaled)
-      // Using scaled, 12-bit, unsigned immediate offsets.
-      needsLowering = ((Offset & 0xfff) != Offset);
-    else
-      // Using unscaled, 9-bit, signed immediate offsets.
-      needsLowering = (Offset > 256 || Offset < -256);
-    break;
-  }
+bool AArch64FastISel::isValueAvailable(const Value *V) const {
+  if (!isa<Instruction>(V))
+    return true;
 
-  //If this is a stack pointer and the offset needs to be simplified then put
+  const auto *I = cast<Instruction>(V);
+  if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
+    return true;
+
+  return false;
+}
+
+bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
+  unsigned ScaleFactor = getImplicitScaleFactor(VT);
+  if (!ScaleFactor)
+    return false;
+
+  bool ImmediateOffsetNeedsLowering = false;
+  bool RegisterOffsetNeedsLowering = false;
+  int64_t Offset = Addr.getOffset();
+  if (((Offset < 0) || (Offset & (ScaleFactor - 1))) && !isInt<9>(Offset))
+    ImmediateOffsetNeedsLowering = true;
+  else if (Offset > 0 && !(Offset & (ScaleFactor - 1)) &&
+           !isUInt<12>(Offset / ScaleFactor))
+    ImmediateOffsetNeedsLowering = true;
+
+  // Cannot encode an offset register and an immediate offset in the same
+  // instruction. Fold the immediate offset into the load/store instruction and
+  // emit an additional add to take care of the offset register.
+  if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg())
+    RegisterOffsetNeedsLowering = true;
+
+  // Cannot encode zero register as base.
+  if (Addr.isRegBase() && Addr.getOffsetReg() && !Addr.getReg())
+    RegisterOffsetNeedsLowering = true;
+
+  // If this is a stack pointer and the offset needs to be simplified then put
   // the alloca address into a register, set the base type back to register and
   // continue. This should almost never happen.
-  if (needsLowering && Addr.getKind() == Address::FrameIndexBase) {
-    unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+  if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase())
+  {
+    unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
             ResultReg)
-        .addFrameIndex(Addr.getFI())
-        .addImm(0)
-        .addImm(0);
+      .addFrameIndex(Addr.getFI())
+      .addImm(0)
+      .addImm(0);
     Addr.setKind(Address::RegBase);
     Addr.setReg(ResultReg);
   }
 
+  if (RegisterOffsetNeedsLowering) {
+    unsigned ResultReg = 0;
+    if (Addr.getReg()) {
+      if (Addr.getExtendType() == AArch64_AM::SXTW ||
+          Addr.getExtendType() == AArch64_AM::UXTW   )
+        ResultReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+                                  /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+                                  /*TODO:IsKill=*/false, Addr.getExtendType(),
+                                  Addr.getShift());
+      else
+        ResultReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+                                  /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+                                  /*TODO:IsKill=*/false, AArch64_AM::LSL,
+                                  Addr.getShift());
+    } else {
+      if (Addr.getExtendType() == AArch64_AM::UXTW)
+        ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
+                               /*Op0IsKill=*/false, Addr.getShift(),
+                               /*IsZExt=*/true);
+      else if (Addr.getExtendType() == AArch64_AM::SXTW)
+        ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
+                               /*Op0IsKill=*/false, Addr.getShift(),
+                               /*IsZExt=*/false);
+      else
+        ResultReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(),
+                               /*Op0IsKill=*/false, Addr.getShift());
+    }
+    if (!ResultReg)
+      return false;
+
+    Addr.setReg(ResultReg);
+    Addr.setOffsetReg(0);
+    Addr.setShift(0);
+    Addr.setExtendType(AArch64_AM::InvalidShiftExtend);
+  }
+
   // Since the offset is too large for the load/store instruction get the
   // reg+offset into a register.
-  if (needsLowering) {
-    uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor;
-    unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false,
-                                      UnscaledOffset, MVT::i64);
-    if (ResultReg == 0)
+  if (ImmediateOffsetNeedsLowering) {
+    unsigned ResultReg;
+    if (Addr.getReg())
+      // Try to fold the immediate into the add instruction.
+      ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset);
+    else
+      ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset);
+
+    if (!ResultReg)
       return false;
     Addr.setReg(ResultReg);
     Addr.setOffset(0);
@@ -491,222 +1019,1021 @@
   return true;
 }
 
-void AArch64FastISel::AddLoadStoreOperands(Address &Addr,
+void AArch64FastISel::addLoadStoreOperands(Address &Addr,
                                            const MachineInstrBuilder &MIB,
-                                           unsigned Flags, bool UseUnscaled) {
-  int64_t Offset = Addr.getOffset();
+                                           unsigned Flags,
+                                           unsigned ScaleFactor,
+                                           MachineMemOperand *MMO) {
+  int64_t Offset = Addr.getOffset() / ScaleFactor; // In ScaleFactor units.
   // Frame base works a bit differently. Handle it separately.
-  if (Addr.getKind() == Address::FrameIndexBase) {
+  if (Addr.isFIBase()) {
     int FI = Addr.getFI();
     // FIXME: We shouldn't be using getObjectSize/getObjectAlignment.  The size
     // and alignment should be based on the VT.
-    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(FI, Offset), Flags,
-        MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+    MMO = FuncInfo.MF->getMachineMemOperand( // Replaces any MMO passed in.
+      MachinePointerInfo::getFixedStack(FI, Offset), Flags,
+      MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
     // Now add the rest of the operands.
-    MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+    MIB.addFrameIndex(FI).addImm(Offset);
   } else {
-    // Now add the rest of the operands.
-    MIB.addReg(Addr.getReg());
-    MIB.addImm(Offset);
+    assert(Addr.isRegBase() && "Unexpected address kind.");
+    const MCInstrDesc &II = MIB->getDesc();
+    unsigned Idx = (Flags & MachineMemOperand::MOStore) ? 1 : 0;
+    Addr.setReg( // Base register operand sits at index NumDefs + Idx.
+      constrainOperandRegClass(II, Addr.getReg(), II.getNumDefs()+Idx));
+    Addr.setOffsetReg( // Offset register operand follows the base register.
+      constrainOperandRegClass(II, Addr.getOffsetReg(), II.getNumDefs()+Idx+1));
+    if (Addr.getOffsetReg()) {
+      assert(Addr.getOffset() == 0 && "Unexpected offset");
+      bool IsSigned = Addr.getExtendType() == AArch64_AM::SXTW ||
+                      Addr.getExtendType() == AArch64_AM::SXTX;
+      MIB.addReg(Addr.getReg());
+      MIB.addReg(Addr.getOffsetReg());
+      MIB.addImm(IsSigned); // 1 if the offset register is sign-extended.
+      MIB.addImm(Addr.getShift() != 0); // 1 if the offset register is shifted.
+    } else // No offset register: base register plus immediate offset.
+      MIB.addReg(Addr.getReg()).addImm(Offset);
+  }
+
+  if (MMO) // Attach the memory operand, if there is one.
+    MIB.addMemOperand(MMO);
+}
+
+unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+                                     const Value *RHS, bool SetFlags,
+                                     bool WantResult,  bool IsZExt) {
+  AArch64_AM::ShiftExtendType ExtendType = AArch64_AM::InvalidShiftExtend;
+  bool NeedExtend = false; // Set for i1/i8/i16, which are widened below.
+  switch (RetVT.SimpleTy) {
+  default:
+    return 0;
+  case MVT::i1: // No in-instruction extend type is chosen for i1.
+    NeedExtend = true;
+    break;
+  case MVT::i8:
+    NeedExtend = true;
+    ExtendType = IsZExt ? AArch64_AM::UXTB : AArch64_AM::SXTB;
+    break;
+  case MVT::i16:
+    NeedExtend = true;
+    ExtendType = IsZExt ? AArch64_AM::UXTH : AArch64_AM::SXTH;
+    break;
+  case MVT::i32:  // fall-through
+  case MVT::i64:
+    break;
+  }
+  MVT SrcVT = RetVT; // Operand type before widening.
+  RetVT.SimpleTy = std::max(RetVT.SimpleTy, MVT::i32); // At least i32.
+
+  // Canonicalize immediates to the RHS first (adds only, as they commute).
+  if (UseAdd && isa<Constant>(LHS) && !isa<Constant>(RHS))
+    std::swap(LHS, RHS);
+
+  // Canonicalize mul by power of 2 to the RHS.
+  if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS))
+    if (isMulPowOf2(LHS))
+      std::swap(LHS, RHS);
+
+  // Canonicalize shift immediate to the RHS.
+  if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS))
+    if (const auto *SI = dyn_cast<BinaryOperator>(LHS))
+      if (isa<ConstantInt>(SI->getOperand(1)))
+        if (SI->getOpcode() == Instruction::Shl  ||
+            SI->getOpcode() == Instruction::LShr ||
+            SI->getOpcode() == Instruction::AShr   )
+          std::swap(LHS, RHS);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  if (!LHSReg)
+    return 0;
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  if (NeedExtend) // Widen a narrow LHS up front.
+    LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt);
+
+  unsigned ResultReg = 0;
+  if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
+    uint64_t Imm = IsZExt ? C->getZExtValue() : C->getSExtValue();
+    if (C->isNegative()) // Flip add/sub and use the negated immediate.
+      ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, LHSIsKill, -Imm,
+                                SetFlags, WantResult);
+    else
+      ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags,
+                                WantResult);
+  } else if (const auto *C = dyn_cast<Constant>(RHS))
+    if (C->isNullValue()) // Other null constants act as immediate 0.
+      ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags,
+                                WantResult);
+
+  if (ResultReg) // An immediate form was emitted.
+    return ResultReg;
+
+  // Only extend the RHS within the instruction if there is a valid extend type.
+  if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() &&
+      isValueAvailable(RHS)) {
+    if (const auto *SI = dyn_cast<BinaryOperator>(RHS))
+      if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1)))
+        if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) {
+          unsigned RHSReg = getRegForValue(SI->getOperand(0));
+          if (!RHSReg)
+            return 0;
+          bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+          return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+                               RHSIsKill, ExtendType, C->getZExtValue(),
+                               SetFlags, WantResult);
+        }
+    unsigned RHSReg = getRegForValue(RHS);
+    if (!RHSReg)
+      return 0;
+    bool RHSIsKill = hasTrivialKill(RHS);
+    return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                         ExtendType, 0, SetFlags, WantResult);
+  }
+
+  // Check if the mul can be folded into the instruction.
+  if (RHS->hasOneUse() && isValueAvailable(RHS))
+    if (isMulPowOf2(RHS)) {
+      const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+      const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+      if (const auto *C = dyn_cast<ConstantInt>(MulLHS))
+        if (C->getValue().isPowerOf2())
+          std::swap(MulLHS, MulRHS);
+
+      assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+      uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+      unsigned RHSReg = getRegForValue(MulLHS);
+      if (!RHSReg)
+        return 0;
+      bool RHSIsKill = hasTrivialKill(MulLHS);
+      return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                           AArch64_AM::LSL, ShiftVal, SetFlags, WantResult);
+    }
+
+  // Check if the shift can be folded into the instruction.
+  if (RHS->hasOneUse() && isValueAvailable(RHS))
+    if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) {
+      if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+        AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend;
+        switch (SI->getOpcode()) {
+        default: break;
+        case Instruction::Shl:  ShiftType = AArch64_AM::LSL; break;
+        case Instruction::LShr: ShiftType = AArch64_AM::LSR; break;
+        case Instruction::AShr: ShiftType = AArch64_AM::ASR; break;
+        }
+        uint64_t ShiftVal = C->getZExtValue(); // Not range-checked here.
+        if (ShiftType != AArch64_AM::InvalidShiftExtend) {
+          unsigned RHSReg = getRegForValue(SI->getOperand(0));
+          if (!RHSReg)
+            return 0;
+          bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+          return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+                               RHSIsKill, ShiftType, ShiftVal, SetFlags,
+                               WantResult);
+        }
+      }
+    }
+
+  unsigned RHSReg = getRegForValue(RHS); // Fallback: plain reg-reg form.
+  if (!RHSReg)
+    return 0;
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  if (NeedExtend)
+    RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt);
+
+  return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                       SetFlags, WantResult);
+}
+
+unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                        bool LHSIsKill, unsigned RHSReg,
+                                        bool RHSIsKill, bool SetFlags,
+                                        bool WantResult) {
+  // Emit a register-register ADD/SUB for i32/i64, optionally setting flags.
+  // Yields the zero register if no result is wanted, or 0 on unsupported type.
+  assert(LHSReg && RHSReg && "Invalid register number.");
+
+  const bool Is64Bit = RetVT == MVT::i64;
+  if (!Is64Bit && RetVT != MVT::i32)
+    return 0;
+
+  // Indexed as [SetFlags][UseAdd][Is64Bit].
+  static const unsigned OpcTable[2][2][2] = {
+    { { AArch64::SUBWrr,  AArch64::SUBXrr  },
+      { AArch64::ADDWrr,  AArch64::ADDXrr  }  },
+    { { AArch64::SUBSWrr, AArch64::SUBSXrr },
+      { AArch64::ADDSWrr, AArch64::ADDSXrr }  }
+  };
+  const MCInstrDesc &Desc = TII.get(OpcTable[SetFlags][UseAdd][Is64Bit]);
+  // Without a wanted result the destination is the zero register.
+  unsigned DstReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+  if (WantResult)
+    DstReg = createResultReg(Is64Bit ? &AArch64::GPR64RegClass
+                                     : &AArch64::GPR32RegClass);
+
+  LHSReg = constrainOperandRegClass(Desc, LHSReg, Desc.getNumDefs());
+  RHSReg = constrainOperandRegClass(Desc, RHSReg, Desc.getNumDefs() + 1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc, DstReg)
+      .addReg(LHSReg, getKillRegState(LHSIsKill))
+      .addReg(RHSReg, getKillRegState(RHSIsKill));
+  return DstReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                        bool LHSIsKill, uint64_t Imm,
+                                        bool SetFlags, bool WantResult) {
+  // Emit an ADD/SUB with an immediate operand for i32/i64. The immediate must
+  // be a 12-bit value, optionally shifted left by 12; otherwise 0 is returned.
+  assert(LHSReg && "Invalid register number.");
+
+  const bool Is64Bit = RetVT == MVT::i64;
+  if (!Is64Bit && RetVT != MVT::i32)
+    return 0;
+
+  // Pick the LSL amount (0 or 12) under which the immediate is encodable.
+  unsigned ShiftImm = 0;
+  if (!isUInt<12>(Imm)) {
+    if ((Imm & 0xfff000) != Imm)
+      return 0;
+    ShiftImm = 12;
+    Imm >>= 12;
+  }
+
+  // Indexed as [SetFlags][UseAdd][Is64Bit].
+  static const unsigned OpcTable[2][2][2] = {
+    { { AArch64::SUBWri,  AArch64::SUBXri  },
+      { AArch64::ADDWri,  AArch64::ADDXri  }  },
+    { { AArch64::SUBSWri, AArch64::SUBSXri },
+      { AArch64::ADDSWri, AArch64::ADDSXri }  }
+  };
+  const MCInstrDesc &Desc = TII.get(OpcTable[SetFlags][UseAdd][Is64Bit]);
+
+  unsigned DstReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+  if (WantResult && SetFlags)
+    DstReg = createResultReg(Is64Bit ? &AArch64::GPR64RegClass
+                                     : &AArch64::GPR32RegClass);
+  else if (WantResult)
+    DstReg = createResultReg(Is64Bit ? &AArch64::GPR64spRegClass
+                                     : &AArch64::GPR32spRegClass);
+
+  LHSReg = constrainOperandRegClass(Desc, LHSReg, Desc.getNumDefs());
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc, DstReg)
+      .addReg(LHSReg, getKillRegState(LHSIsKill))
+      .addImm(Imm)
+      .addImm(getShifterImm(AArch64_AM::LSL, ShiftImm));
+  return DstReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                        bool LHSIsKill, unsigned RHSReg,
+                                        bool RHSIsKill,
+                                        AArch64_AM::ShiftExtendType ShiftType,
+                                        uint64_t ShiftImm, bool SetFlags,
+                                        bool WantResult) {
+  // Emit a shifted-register ADD/SUB for i32/i64; returns 0 if not encodable.
+  assert(LHSReg && RHSReg && "Invalid register number.");
+
+  if (RetVT != MVT::i32 && RetVT != MVT::i64)
+    return 0;
+  // Don't deal with undefined shifts: an amount >= the operand width is
+  // undefined in IR and has no valid shifted-register encoding.
+  if (ShiftImm >= RetVT.getSizeInBits())
+    return 0;
+
+  static const unsigned OpcTable[2][2][2] = {
+    { { AArch64::SUBWrs,  AArch64::SUBXrs  },
+      { AArch64::ADDWrs,  AArch64::ADDXrs  }  },
+    { { AArch64::SUBSWrs, AArch64::SUBSXrs },
+      { AArch64::ADDSWrs, AArch64::ADDSXrs }  }
+  };
+  bool Is64Bit = RetVT == MVT::i64;
+  const MCInstrDesc &II = TII.get(OpcTable[SetFlags][UseAdd][Is64Bit]);
+  unsigned ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+  if (WantResult)
+    ResultReg = createResultReg(Is64Bit ? &AArch64::GPR64RegClass
+                                        : &AArch64::GPR32RegClass);
+  LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+  RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+      .addReg(LHSReg, getKillRegState(LHSIsKill))
+      .addReg(RHSReg, getKillRegState(RHSIsKill))
+      .addImm(getShifterImm(ShiftType, ShiftImm));
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+                                        bool LHSIsKill, unsigned RHSReg,
+                                        bool RHSIsKill,
+                                        AArch64_AM::ShiftExtendType ExtType,
+                                        uint64_t ShiftImm, bool SetFlags,
+                                        bool WantResult) {
+  // Emit an ADD/SUB whose second operand is an extended (and optionally
+  // shifted) register. Only i32 and i64 are handled; otherwise returns 0.
+  assert(LHSReg && RHSReg && "Invalid register number.");
+
+  const bool Is64Bit = RetVT == MVT::i64;
+  if (!Is64Bit && RetVT != MVT::i32)
+    return 0;
+
+  // Indexed as [SetFlags][UseAdd][Is64Bit].
+  static const unsigned OpcTable[2][2][2] = {
+    { { AArch64::SUBWrx,  AArch64::SUBXrx  },
+      { AArch64::ADDWrx,  AArch64::ADDXrx  }  },
+    { { AArch64::SUBSWrx, AArch64::SUBSXrx },
+      { AArch64::ADDSWrx, AArch64::ADDSXrx }  }
+  };
+  const MCInstrDesc &Desc = TII.get(OpcTable[SetFlags][UseAdd][Is64Bit]);
+  // The result goes to the zero register unless the caller wants it.
+  unsigned DstReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+  if (WantResult && SetFlags)
+    DstReg = createResultReg(Is64Bit ? &AArch64::GPR64RegClass
+                                     : &AArch64::GPR32RegClass);
+  else if (WantResult)
+    DstReg = createResultReg(Is64Bit ? &AArch64::GPR64spRegClass
+                                     : &AArch64::GPR32spRegClass);
+
+  LHSReg = constrainOperandRegClass(Desc, LHSReg, Desc.getNumDefs());
+  RHSReg = constrainOperandRegClass(Desc, RHSReg, Desc.getNumDefs() + 1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, Desc, DstReg)
+      .addReg(LHSReg, getKillRegState(LHSIsKill))
+      .addReg(RHSReg, getKillRegState(RHSIsKill))
+      .addImm(getArithExtendImm(ExtType, ShiftImm));
+  return DstReg;
+}
+
+bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) {
+  // Dispatch on the (simple) value type of LHS: integer types go to emitICmp,
+  // f32/f64 go to emitFCmp, and anything else is rejected.
+  EVT LHSEVT = TLI.getValueType(LHS->getType(), true);
+  if (!LHSEVT.isSimple())
+    return false;
+  MVT VT = LHSEVT.getSimpleVT();
+  switch (VT.SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return emitFCmp(VT, LHS, RHS);
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+    return emitICmp(VT, LHS, RHS, IsZExt);
+  default:
+    return false;
   }
 }
 
-bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
-                               bool UseUnscaled) {
+bool AArch64FastISel::emitICmp(MVT RetVT, const Value *LHS, const Value *RHS,
+                               bool IsZExt) {
+  const bool SetFlags = true, WantResult = false; // Compare: discarded SUBS.
+  return emitSub(RetVT, LHS, RHS, SetFlags, WantResult, IsZExt) != 0;
+}
+
+bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                                  uint64_t Imm) {
+  const bool IsAdd = false, Flags = true, Want = false; // Discarded SUBS.
+  return emitAddSub_ri(IsAdd, RetVT, LHSReg, LHSIsKill, Imm, Flags, Want) != 0;
+}
+
+bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) {
+  // Emit a floating-point compare of LHS and RHS for f32/f64; any other type
+  // is rejected. A positive-zero constant on the RHS is not materialized but
+  // handled by the compare-with-immediate opcode.
+  const bool IsDouble = RetVT == MVT::f64;
+  if (!IsDouble && RetVT != MVT::f32)
+    return false;
+
+  unsigned LHSReg = getRegForValue(LHS);
+  if (!LHSReg)
+    return false;
+  unsigned LHSKill = getKillRegState(hasTrivialKill(LHS));
+
+  // Compare against +0.0 using the immediate form.
+  const auto *CFP = dyn_cast<ConstantFP>(RHS);
+  if (CFP && CFP->isZero() && !CFP->isNegative()) {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(IsDouble ? AArch64::FCMPDri : AArch64::FCMPSri))
+        .addReg(LHSReg, LHSKill);
+    return true;
+  }
+
+  // General case: compare two registers.
+  unsigned RHSReg = getRegForValue(RHS);
+  if (!RHSReg)
+    return false;
+  unsigned RHSKill = getKillRegState(hasTrivialKill(RHS));
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+          TII.get(IsDouble ? AArch64::FCMPDrr : AArch64::FCMPSrr))
+      .addReg(LHSReg, LHSKill)
+      .addReg(RHSReg, RHSKill);
+  return true;
+}
+
+unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+                                  bool SetFlags, bool WantResult, bool IsZExt) {
+  const bool UseAdd = true; // Select the ADD flavour of the shared emitter.
+  return emitAddSub(UseAdd, RetVT, LHS, RHS, SetFlags, WantResult, IsZExt);
+}
+
+/// \brief This method is a wrapper to simplify add emission.
+///
+/// First try to emit an add with an immediate operand using emitAddSub_ri (a
+/// negative immediate becomes a subtract of its magnitude). If that fails,
+/// materialize the immediate into a register and use emitAddSub_rr instead.
+unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill,
+                                      int64_t Imm) {
+  // Negate in unsigned arithmetic: -Imm on int64_t overflows for INT64_MIN.
+  const bool IsNeg = Imm < 0;
+  uint64_t Magnitude = IsNeg ? 0 - static_cast<uint64_t>(Imm) : uint64_t(Imm);
+  unsigned ResultReg =
+      emitAddSub_ri(/*UseAdd=*/!IsNeg, VT, Op0, Op0IsKill, Magnitude);
+  if (ResultReg)
+    return ResultReg;
+
+  unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm);
+  if (!CReg)
+    return 0;
+
+  // CReg is a fresh temporary created just above, so mark it killed.
+  return emitAddSub_rr(/*UseAdd=*/true, VT, Op0, Op0IsKill, CReg,
+                       /*RHSIsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+                                  bool SetFlags, bool WantResult, bool IsZExt) {
+  const bool UseAdd = false; // Select the SUB flavour of the shared emitter.
+  return emitAddSub(UseAdd, RetVT, LHS, RHS, SetFlags, WantResult, IsZExt);
+}
+
+unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg,
+                                      bool LHSIsKill, unsigned RHSReg,
+                                      bool RHSIsKill, bool WantResult) {
+  const bool UseAdd = false, SetFlags = true; // SUBS: subtract, set flags.
+  return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                       SetFlags, WantResult);
+}
+
+unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg,
+                                      bool LHSIsKill, unsigned RHSReg,
+                                      bool RHSIsKill,
+                                      AArch64_AM::ShiftExtendType ShiftType,
+                                      uint64_t ShiftImm, bool WantResult) {
+  const bool UseAdd = false, SetFlags = true; // SUBS with a shifted RHS.
+  return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                       ShiftType, ShiftImm, SetFlags, WantResult);
+}
+
+unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
+                                        const Value *LHS, const Value *RHS) {
+  // Canonicalize immediates to the RHS first.
+  if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS))
+    std::swap(LHS, RHS);
+
+  // Canonicalize mul by power-of-2 to the RHS.
+  if (LHS->hasOneUse() && isValueAvailable(LHS))
+    if (isMulPowOf2(LHS))
+      std::swap(LHS, RHS);
+
+  // Canonicalize shift immediate to the RHS.
+  if (LHS->hasOneUse() && isValueAvailable(LHS))
+    if (const auto *SI = dyn_cast<ShlOperator>(LHS))
+      if (isa<ConstantInt>(SI->getOperand(1)))
+        std::swap(LHS, RHS);
+
+  unsigned LHSReg = getRegForValue(LHS);
+  if (!LHSReg)
+    return 0;
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  unsigned ResultReg = 0;
+  if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
+    uint64_t Imm = C->getZExtValue();
+    ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm);
+  }
+  if (ResultReg) // The immediate form was emitted.
+    return ResultReg;
+
+  // Check if the mul can be folded into the instruction.
+  if (RHS->hasOneUse() && isValueAvailable(RHS))
+    if (isMulPowOf2(RHS)) {
+      const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+      const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+      if (const auto *C = dyn_cast<ConstantInt>(MulLHS))
+        if (C->getValue().isPowerOf2())
+          std::swap(MulLHS, MulRHS);
+
+      assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+      uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+
+      unsigned RHSReg = getRegForValue(MulLHS);
+      if (!RHSReg)
+        return 0;
+      bool RHSIsKill = hasTrivialKill(MulLHS);
+      return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+                              RHSIsKill, ShiftVal);
+    }
+
+  // Check if the shift can be folded into the instruction.
+  if (RHS->hasOneUse() && isValueAvailable(RHS))
+    if (const auto *SI = dyn_cast<ShlOperator>(RHS))
+      if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+        uint64_t ShiftVal = C->getZExtValue(); // Not range-checked here.
+        unsigned RHSReg = getRegForValue(SI->getOperand(0));
+        if (!RHSReg)
+          return 0;
+        bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+        return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+                                RHSIsKill, ShiftVal);
+      }
+
+  unsigned RHSReg = getRegForValue(RHS); // Fallback: plain reg-reg form.
+  if (!RHSReg)
+    return 0;
+  bool RHSIsKill = hasTrivialKill(RHS);
+
+  MVT VT = std::max(MVT::i32, RetVT.SimpleTy); // Operate on at least i32.
+  ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+  if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { // Re-mask narrow results.
+    uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  }
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT,
+                                           unsigned LHSReg, bool LHSIsKill,
+                                           uint64_t Imm) {
+  // Emit AND/ORR/EOR with a logical immediate. Returns 0 for unsupported
+  // types or when Imm is not a valid logical immediate for the register size.
+  assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
+         "ISD nodes are not consecutive!");
+  // Rows: AND, OR, XOR (relative to ISD::AND). Columns: 32-bit, 64-bit.
+  static const unsigned OpcTable[3][2] = {
+    { AArch64::ANDWri, AArch64::ANDXri },
+    { AArch64::ORRWri, AArch64::ORRXri },
+    { AArch64::EORWri, AArch64::EORXri }
+  };
+
+  bool Is64Bit;
+  switch (RetVT.SimpleTy) {
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    Is64Bit = false;
+    break;
+  case MVT::i64:
+    Is64Bit = true;
+    break;
+  default:
+    return 0;
+  }
+  const unsigned RegSize = Is64Bit ? 64 : 32;
+  if (!AArch64_AM::isLogicalImmediate(Imm, RegSize))
+    return 0;
+
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass;
+  unsigned Opc = OpcTable[ISDOpc - ISD::AND][Is64Bit];
+  uint64_t Encoded = AArch64_AM::encodeLogicalImmediate(Imm, RegSize);
+  unsigned ResultReg = fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill, Encoded);
+
+  // Re-mask i8/i16 results of OR/XOR to the narrow width.
+  if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) {
+    uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  }
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
+                                           unsigned LHSReg, bool LHSIsKill,
+                                           unsigned RHSReg, bool RHSIsKill,
+                                           uint64_t ShiftImm) {
+  // Emit AND/ORR/EOR whose second operand is RHSReg shifted left by ShiftImm.
+  // Returns 0 for unsupported types or shift amounts.
+  assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
+         "ISD nodes are not consecutive!");
+  // Rows: AND, OR, XOR (relative to ISD::AND). Columns: 32-bit, 64-bit.
+  static const unsigned OpcTable[3][2] = {
+    { AArch64::ANDWrs, AArch64::ANDXrs },
+    { AArch64::ORRWrs, AArch64::ORRXrs },
+    { AArch64::EORWrs, AArch64::EORXrs }
+  };
+  const TargetRegisterClass *RC;
+  unsigned Opc;
+  switch (RetVT.SimpleTy) {
+  default:
+    return 0;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    Opc = OpcTable[ISDOpc - ISD::AND][0];
+    RC = &AArch64::GPR32RegClass;
+    break;
+  case MVT::i64:
+    Opc = OpcTable[ISDOpc - ISD::AND][1];
+    RC = &AArch64::GPR64RegClass;
+    break;
+  }
+  // Don't deal with undefined shifts: an amount >= the value's bit width is
+  // undefined in IR and may not be encodable in the shifted-register form.
+  if (ShiftImm >= RetVT.getSizeInBits())
+    return 0;
+
+  unsigned ResultReg =
+      fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+                       AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm));
+  if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+    uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  }
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+                                     uint64_t Imm) {
+  const unsigned ISDOpc = ISD::AND; // The AND case of emitLogicalOp_ri.
+  return emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm);
+}
+
+unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr,
+                                   bool WantZExt, MachineMemOperand *MMO) {
+  // Simplify this down to something we can handle.
+  if (!simplifyAddress(Addr, VT))
+    return 0;
+
+  unsigned ScaleFactor = getImplicitScaleFactor(VT);
+  if (!ScaleFactor)
+    llvm_unreachable("Unexpected value type.");
+
   // Negative offsets require unscaled, 9-bit, signed immediate offsets.
   // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
-  if (!UseUnscaled && Addr.getOffset() < 0)
-    UseUnscaled = true;
+  bool UseScaled = true;
+  if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) {
+    UseScaled = false;
+    ScaleFactor = 1; // Unscaled form: the offset is not divided.
+  }
+
+  static const unsigned GPOpcTable[2][8][4] = {
+    // Sign-extend (WantZExt == 0); rows alternate 32-/64-bit result.
+    { { AArch64::LDURSBWi,  AArch64::LDURSHWi,  AArch64::LDURWi,
+        AArch64::LDURXi  },
+      { AArch64::LDURSBXi,  AArch64::LDURSHXi,  AArch64::LDURSWi,
+        AArch64::LDURXi  },
+      { AArch64::LDRSBWui,  AArch64::LDRSHWui,  AArch64::LDRWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRSBXui,  AArch64::LDRSHXui,  AArch64::LDRSWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX,
+        AArch64::LDRXroX },
+      { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX,
+        AArch64::LDRXroX },
+      { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW,
+        AArch64::LDRXroW },
+      { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW,
+        AArch64::LDRXroW }
+    },
+    // Zero-extend (WantZExt == 1); same row layout as above.
+    { { AArch64::LDURBBi,   AArch64::LDURHHi,   AArch64::LDURWi,
+        AArch64::LDURXi  },
+      { AArch64::LDURBBi,   AArch64::LDURHHi,   AArch64::LDURWi,
+        AArch64::LDURXi  },
+      { AArch64::LDRBBui,   AArch64::LDRHHui,   AArch64::LDRWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRBBui,   AArch64::LDRHHui,   AArch64::LDRWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRBBroX,  AArch64::LDRHHroX,  AArch64::LDRWroX,
+        AArch64::LDRXroX },
+      { AArch64::LDRBBroX,  AArch64::LDRHHroX,  AArch64::LDRWroX,
+        AArch64::LDRXroX },
+      { AArch64::LDRBBroW,  AArch64::LDRHHroW,  AArch64::LDRWroW,
+        AArch64::LDRXroW },
+      { AArch64::LDRBBroW,  AArch64::LDRHHroW,  AArch64::LDRWroW,
+        AArch64::LDRXroW }
+    }
+  };
+
+  static const unsigned FPOpcTable[4][2] = {
+    { AArch64::LDURSi,  AArch64::LDURDi  },
+    { AArch64::LDRSui,  AArch64::LDRDui  },
+    { AArch64::LDRSroX, AArch64::LDRDroX },
+    { AArch64::LDRSroW, AArch64::LDRDroW }
+  };
 
   unsigned Opc;
   const TargetRegisterClass *RC;
-  bool VTIsi1 = false;
-  int64_t ScaleFactor = 0;
+  bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() &&
+                      Addr.getOffsetReg();
+  unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0; // Addressing-form row.
+  if (Addr.getExtendType() == AArch64_AM::UXTW ||
+      Addr.getExtendType() == AArch64_AM::SXTW)
+    Idx++; // Extended 32-bit offset register: move on to the roW rows.
+
+  bool IsRet64Bit = RetVT == MVT::i64;
   switch (VT.SimpleTy) {
   default:
-    return false;
-  case MVT::i1:
-    VTIsi1 = true;
-  // Intentional fall-through.
+    llvm_unreachable("Unexpected value type.");
+  case MVT::i1: // Intentional fall-through.
   case MVT::i8:
-    Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui;
-    RC = &AArch64::GPR32RegClass;
-    ScaleFactor = 1;
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0];
+    RC = (IsRet64Bit && !WantZExt) ?
+             &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
     break;
   case MVT::i16:
-    Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui;
-    RC = &AArch64::GPR32RegClass;
-    ScaleFactor = 2;
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1];
+    RC = (IsRet64Bit && !WantZExt) ?
+             &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
     break;
   case MVT::i32:
-    Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui;
-    RC = &AArch64::GPR32RegClass;
-    ScaleFactor = 4;
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2];
+    RC = (IsRet64Bit && !WantZExt) ?
+             &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
     break;
   case MVT::i64:
-    Opc = UseUnscaled ? AArch64::LDURXi : AArch64::LDRXui;
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3];
     RC = &AArch64::GPR64RegClass;
-    ScaleFactor = 8;
     break;
   case MVT::f32:
-    Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui;
-    RC = TLI.getRegClassFor(VT);
-    ScaleFactor = 4;
+    Opc = FPOpcTable[Idx][0];
+    RC = &AArch64::FPR32RegClass;
     break;
   case MVT::f64:
-    Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui;
-    RC = TLI.getRegClassFor(VT);
-    ScaleFactor = 8;
+    Opc = FPOpcTable[Idx][1];
+    RC = &AArch64::FPR64RegClass;
     break;
   }
-  // Scale the offset.
-  if (!UseUnscaled) {
-    int64_t Offset = Addr.getOffset();
-    if (Offset & (ScaleFactor - 1))
-      // Retry using an unscaled, 9-bit, signed immediate offset.
-      return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true);
-
-    Addr.setOffset(Offset / ScaleFactor);
-  }
-
-  // Simplify this down to something we can handle.
-  if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
-    return false;
 
   // Create the base instruction, then add the operands.
-  ResultReg = createResultReg(RC);
+  unsigned ResultReg = createResultReg(RC);
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                     TII.get(Opc), ResultReg);
-  AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled);
+  addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO);
 
   // Loading an i1 requires special handling.
-  if (VTIsi1) {
-    MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass);
-    unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
-            ANDReg)
-        .addReg(ResultReg)
-        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+  if (VT == MVT::i1) {
+    unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1);
+    assert(ANDReg && "Unexpected AND instruction emission failure.");
     ResultReg = ANDReg;
   }
+
+  // For zero-extending loads to 64 bits we emit a 32-bit load and then wrap
+  // the 32-bit register in a 64-bit one via SUBREG_TO_REG.
+  if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) {
+    unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), Reg64)
+        .addImm(0)
+        .addReg(ResultReg, getKillRegState(true))
+        .addImm(AArch64::sub_32);
+    ResultReg = Reg64;
+  }
+  return ResultReg;
+}
+
+bool AArch64FastISel::selectAddSub(const Instruction *I) {
+  // Select a scalar add/sub; vector types are deferred to selectOperator.
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+    return false;
+  if (VT.isVector())
+    return selectOperator(I, I->getOpcode());
+
+  unsigned ResultReg = 0;
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+    ResultReg = emitAdd(VT, I->getOperand(0), I->getOperand(1));
+    break;
+  case Instruction::Sub:
+    ResultReg = emitSub(VT, I->getOperand(0), I->getOperand(1));
+    break;
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  }
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool AArch64FastISel::SelectLoad(const Instruction *I) {
+bool AArch64FastISel::selectLogicalOp(const Instruction *I) {
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+    return false;
+
+  if (VT.isVector())
+    return selectOperator(I, I->getOpcode());
+
+  unsigned ResultReg;
+  switch (I->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  case Instruction::And:
+    ResultReg = emitLogicalOp(ISD::AND, VT, I->getOperand(0), I->getOperand(1));
+    break;
+  case Instruction::Or:
+    ResultReg = emitLogicalOp(ISD::OR, VT, I->getOperand(0), I->getOperand(1));
+    break;
+  case Instruction::Xor:
+    ResultReg = emitLogicalOp(ISD::XOR, VT, I->getOperand(0), I->getOperand(1));
+    break;
+  }
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectLoad(const Instruction *I) {
   MVT VT;
   // Verify we have a legal type before going any further.  Currently, we handle
   // simple types that will directly fit in a register (i32/f32/i64/f64) or
   // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
-  if (!isLoadStoreTypeLegal(I->getType(), VT) || cast<LoadInst>(I)->isAtomic())
+  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true) ||
+      cast<LoadInst>(I)->isAtomic())
     return false;
 
   // See if we can handle this address.
   Address Addr;
-  if (!ComputeAddress(I->getOperand(0), Addr))
+  if (!computeAddress(I->getOperand(0), Addr, I->getType()))
     return false;
 
-  unsigned ResultReg;
-  if (!EmitLoad(VT, ResultReg, Addr))
+  // Fold the following sign-/zero-extend into the load instruction.
+  bool WantZExt = true;
+  MVT RetVT = VT;
+  const Value *IntExtVal = nullptr;
+  if (I->hasOneUse()) {
+    if (const auto *ZE = dyn_cast<ZExtInst>(I->use_begin()->getUser())) {
+      if (isTypeSupported(ZE->getType(), RetVT))
+        IntExtVal = ZE;
+      else
+        RetVT = VT;
+    } else if (const auto *SE = dyn_cast<SExtInst>(I->use_begin()->getUser())) {
+      if (isTypeSupported(SE->getType(), RetVT))
+        IntExtVal = SE;
+      else
+        RetVT = VT;
+      WantZExt = false;
+    }
+  }
+
+  unsigned ResultReg =
+      emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I));
+  if (!ResultReg)
     return false;
 
-  UpdateValueMap(I, ResultReg);
+  // There are a few different cases we have to handle, because the load or the
+  // sign-/zero-extend might not be selected by FastISel if we fall back to
+  // SelectionDAG. There is also an ordering issue when both instructions are in
+  // different basic blocks.
+  // 1.) The load instruction is selected by FastISel, but the integer extend
+  //     is not. This usually happens when the integer extend is in a different
+  //     basic block and SelectionDAG took over for that basic block.
+  // 2.) The load instruction is selected before the integer extend. This only
+  //     happens when the integer extend is in a different basic block.
+  // 3.) The load instruction is selected by SelectionDAG and the integer extend
+  //     by FastISel. This happens if there are instructions between the load
+  //     and the integer extend that couldn't be selected by FastISel.
+  if (IntExtVal) {
+    // The integer extend hasn't been emitted yet. FastISel or SelectionDAG
+    // could select it. Emit a copy to subreg if necessary. FastISel will remove
+    // it when it selects the integer extend.
+    unsigned Reg = lookUpRegForValue(IntExtVal);
+    if (!Reg) {
+      if (RetVT == MVT::i64 && VT <= MVT::i32) {
+        if (WantZExt) {
+          // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG).
+          std::prev(FuncInfo.InsertPt)->eraseFromParent();
+          ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg();
+        } else
+          ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg,
+                                                 /*IsKill=*/true,
+                                                 AArch64::sub_32);
+      }
+      updateValueMap(I, ResultReg);
+      return true;
+    }
+
+    // The integer extend has already been emitted - delete all the instructions
+    // that have been emitted by the integer extend lowering code and use the
+    // result from the load instruction directly.
+    while (Reg) {
+      auto *MI = MRI.getUniqueVRegDef(Reg);
+      if (!MI)
+        break;
+      Reg = 0;
+      for (auto &Opnd : MI->uses()) {
+        if (Opnd.isReg()) {
+          Reg = Opnd.getReg();
+          break;
+        }
+      }
+      MI->eraseFromParent();
+    }
+    updateValueMap(IntExtVal, ResultReg);
+    return true;
+  }
+
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
-                                bool UseUnscaled) {
+bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr,
+                                MachineMemOperand *MMO) {
+  // Simplify this down to something we can handle.
+  if (!simplifyAddress(Addr, VT))
+    return false;
+
+  unsigned ScaleFactor = getImplicitScaleFactor(VT);
+  if (!ScaleFactor)
+    llvm_unreachable("Unexpected value type.");
+
   // Negative offsets require unscaled, 9-bit, signed immediate offsets.
   // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
-  if (!UseUnscaled && Addr.getOffset() < 0)
-    UseUnscaled = true;
-
-  unsigned StrOpc;
-  bool VTIsi1 = false;
-  int64_t ScaleFactor = 0;
-  // Using scaled, 12-bit, unsigned immediate offsets.
-  switch (VT.SimpleTy) {
-  default:
-    return false;
-  case MVT::i1:
-    VTIsi1 = true;
-  case MVT::i8:
-    StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui;
+  bool UseScaled = true;
+  if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) {
+    UseScaled = false;
     ScaleFactor = 1;
-    break;
-  case MVT::i16:
-    StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui;
-    ScaleFactor = 2;
-    break;
-  case MVT::i32:
-    StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui;
-    ScaleFactor = 4;
-    break;
-  case MVT::i64:
-    StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui;
-    ScaleFactor = 8;
-    break;
-  case MVT::f32:
-    StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui;
-    ScaleFactor = 4;
-    break;
-  case MVT::f64:
-    StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui;
-    ScaleFactor = 8;
-    break;
-  }
-  // Scale the offset.
-  if (!UseUnscaled) {
-    int64_t Offset = Addr.getOffset();
-    if (Offset & (ScaleFactor - 1))
-      // Retry using an unscaled, 9-bit, signed immediate offset.
-      return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true);
-
-    Addr.setOffset(Offset / ScaleFactor);
   }
 
-  // Simplify this down to something we can handle.
-  if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
-    return false;
+  static const unsigned OpcTable[4][6] = {
+    { AArch64::STURBBi,  AArch64::STURHHi,  AArch64::STURWi,  AArch64::STURXi,
+      AArch64::STURSi,   AArch64::STURDi },
+    { AArch64::STRBBui,  AArch64::STRHHui,  AArch64::STRWui,  AArch64::STRXui,
+      AArch64::STRSui,   AArch64::STRDui },
+    { AArch64::STRBBroX, AArch64::STRHHroX, AArch64::STRWroX, AArch64::STRXroX,
+      AArch64::STRSroX,  AArch64::STRDroX },
+    { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW, AArch64::STRXroW,
+      AArch64::STRSroW,  AArch64::STRDroW }
+  };
+
+  unsigned Opc;
+  bool VTIsi1 = false;
+  bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() &&
+                      Addr.getOffsetReg();
+  unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0;
+  if (Addr.getExtendType() == AArch64_AM::UXTW ||
+      Addr.getExtendType() == AArch64_AM::SXTW)
+    Idx++;
+
+  switch (VT.SimpleTy) {
+  default: llvm_unreachable("Unexpected value type.");
+  case MVT::i1:  VTIsi1 = true;
+  case MVT::i8:  Opc = OpcTable[Idx][0]; break;
+  case MVT::i16: Opc = OpcTable[Idx][1]; break;
+  case MVT::i32: Opc = OpcTable[Idx][2]; break;
+  case MVT::i64: Opc = OpcTable[Idx][3]; break;
+  case MVT::f32: Opc = OpcTable[Idx][4]; break;
+  case MVT::f64: Opc = OpcTable[Idx][5]; break;
+  }
 
   // Storing an i1 requires special handling.
-  if (VTIsi1) {
-    MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
-    unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
-            ANDReg)
-        .addReg(SrcReg)
-        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+  if (VTIsi1 && SrcReg != AArch64::WZR) {
+    unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+    assert(ANDReg && "Unexpected AND instruction emission failure.");
     SrcReg = ANDReg;
   }
   // Create the base instruction, then add the operands.
-  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                                    TII.get(StrOpc)).addReg(SrcReg);
-  AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled);
+  const MCInstrDesc &II = TII.get(Opc);
+  SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs());
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(SrcReg);
+  addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, ScaleFactor, MMO);
+
   return true;
 }
 
-bool AArch64FastISel::SelectStore(const Instruction *I) {
+bool AArch64FastISel::selectStore(const Instruction *I) {
   MVT VT;
-  Value *Op0 = I->getOperand(0);
+  const Value *Op0 = I->getOperand(0);
   // Verify we have a legal type before going any further.  Currently, we handle
   // simple types that will directly fit in a register (i32/f32/i64/f64) or
   // those that can be sign or zero-extended to a basic operation (i1/i8/i16).
-  if (!isLoadStoreTypeLegal(Op0->getType(), VT) ||
+  if (!isTypeSupported(Op0->getType(), VT, /*IsVectorAllowed=*/true) ||
       cast<StoreInst>(I)->isAtomic())
     return false;
 
-  // Get the value to be stored into a register.
-  unsigned SrcReg = getRegForValue(Op0);
-  if (SrcReg == 0)
+  // Get the value to be stored into a register. Use the zero register directly
+  // when possible to avoid an unnecessary copy and a wasted register.
+  unsigned SrcReg = 0;
+  if (const auto *CI = dyn_cast<ConstantInt>(Op0)) {
+    if (CI->isZero())
+      SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+  } else if (const auto *CF = dyn_cast<ConstantFP>(Op0)) {
+    if (CF->isZero() && !CF->isNegative()) {
+      VT = MVT::getIntegerVT(VT.getSizeInBits());
+      SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+    }
+  }
+
+  if (!SrcReg)
+    SrcReg = getRegForValue(Op0);
+
+  if (!SrcReg)
     return false;
 
   // See if we can handle this address.
   Address Addr;
-  if (!ComputeAddress(I->getOperand(1), Addr))
+  if (!computeAddress(I->getOperand(1), Addr, I->getOperand(0)->getType()))
     return false;
 
-  if (!EmitStore(VT, SrcReg, Addr))
+  if (!emitStore(VT, SrcReg, Addr, createMachineMemOperandFor(I)))
     return false;
   return true;
 }
@@ -757,58 +2084,234 @@
   }
 }
 
-bool AArch64FastISel::SelectBranch(const Instruction *I) {
-  const BranchInst *BI = cast<BranchInst>(I);
+/// \brief Try to emit a combined compare-and-branch instruction.
+bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
+  assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction");
+  const CmpInst *CI = cast<CmpInst>(BI->getCondition());
+  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+  const Value *LHS = CI->getOperand(0);
+  const Value *RHS = CI->getOperand(1);
+
+  MVT VT;
+  if (!isTypeSupported(LHS->getType(), VT))
+    return false;
+
+  unsigned BW = VT.getSizeInBits();
+  if (BW > 64)
+    return false;
+
   MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
   MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
 
+  // Try to take advantage of fallthrough opportunities.
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    Predicate = CmpInst::getInversePredicate(Predicate);
+  }
+
+  int TestBit = -1;
+  bool IsCmpNE;
+  if ((Predicate == CmpInst::ICMP_EQ) || (Predicate == CmpInst::ICMP_NE)) {
+    if (const auto *C = dyn_cast<Constant>(LHS))
+      if (C->isNullValue())
+        std::swap(LHS, RHS);
+
+    if (!isa<Constant>(RHS))
+      return false;
+
+    if (!cast<Constant>(RHS)->isNullValue())
+      return false;
+
+    if (const auto *AI = dyn_cast<BinaryOperator>(LHS))
+      if (AI->getOpcode() == Instruction::And && isValueAvailable(AI)) {
+        const Value *AndLHS = AI->getOperand(0);
+        const Value *AndRHS = AI->getOperand(1);
+
+        if (const auto *C = dyn_cast<ConstantInt>(AndLHS))
+          if (C->getValue().isPowerOf2())
+            std::swap(AndLHS, AndRHS);
+
+        if (const auto *C = dyn_cast<ConstantInt>(AndRHS))
+          if (C->getValue().isPowerOf2()) {
+            TestBit = C->getValue().logBase2();
+            LHS = AndLHS;
+          }
+      }
+
+    if (VT == MVT::i1)
+      TestBit = 0;
+
+    IsCmpNE = Predicate == CmpInst::ICMP_NE;
+  } else if (Predicate == CmpInst::ICMP_SLT) {
+    if (!isa<Constant>(RHS))
+      return false;
+
+    if (!cast<Constant>(RHS)->isNullValue())
+      return false;
+
+    TestBit = BW - 1;
+    IsCmpNE = true;
+  } else if (Predicate == CmpInst::ICMP_SGT) {
+    if (!isa<ConstantInt>(RHS))
+      return false;
+
+    if (cast<ConstantInt>(RHS)->getValue() != -1)
+      return false;
+
+    TestBit = BW - 1;
+    IsCmpNE = false;
+  } else
+    return false;
+
+  static const unsigned OpcTable[2][2][2] = {
+    { {AArch64::CBZW,  AArch64::CBZX },
+      {AArch64::CBNZW, AArch64::CBNZX} },
+    { {AArch64::TBZW,  AArch64::TBZX },
+      {AArch64::TBNZW, AArch64::TBNZX} }
+  };
+
+  bool IsBitTest = TestBit != -1;
+  bool Is64Bit = BW == 64;
+  if (TestBit < 32 && TestBit >= 0)
+    Is64Bit = false;
+
+  unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit];
+  const MCInstrDesc &II = TII.get(Opc);
+
+  unsigned SrcReg = getRegForValue(LHS);
+  if (!SrcReg)
+    return false;
+  bool SrcIsKill = hasTrivialKill(LHS);
+
+  if (BW == 64 && !Is64Bit)
+    SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
+                                        AArch64::sub_32);
+
+  if ((BW < 32) && !IsBitTest)
+    SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true);
+
+  // Emit the combined compare and branch instruction.
+  SrcReg = constrainOperandRegClass(II, SrcReg,  II.getNumDefs());
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+          .addReg(SrcReg, getKillRegState(SrcIsKill));
+  if (IsBitTest)
+    MIB.addImm(TestBit);
+  MIB.addMBB(TBB);
+
+  // Obtain the branch weight and add the TrueBB to the successor list.
+  uint32_t BranchWeight = 0;
+  if (FuncInfo.BPI)
+    BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                               TBB->getBasicBlock());
+  FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+  fastEmitBranch(FBB, DbgLoc);
+
+  return true;
+}
+
+bool AArch64FastISel::selectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  if (BI->isUnconditional()) {
+    MachineBasicBlock *MSucc = FuncInfo.MBBMap[BI->getSuccessor(0)];
+    fastEmitBranch(MSucc, BI->getDebugLoc());
+    return true;
+  }
+
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  AArch64CC::CondCode CC = AArch64CC::NE;
   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
-    if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
-      // We may not handle every CC for now.
-      AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
-      if (CC == AArch64CC::AL)
-        return false;
+    if (CI->hasOneUse() && isValueAvailable(CI)) {
+      // Try to optimize or fold the cmp.
+      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+      switch (Predicate) {
+      default:
+        break;
+      case CmpInst::FCMP_FALSE:
+        fastEmitBranch(FBB, DbgLoc);
+        return true;
+      case CmpInst::FCMP_TRUE:
+        fastEmitBranch(TBB, DbgLoc);
+        return true;
+      }
+
+      // Try to emit a combined compare-and-branch first.
+      if (emitCompareAndBranch(BI))
+        return true;
+
+      // Try to take advantage of fallthrough opportunities.
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
 
       // Emit the cmp.
-      if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+      if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
         return false;
 
+      // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch
+      // instruction.
+      CC = getCompareCC(Predicate);
+      AArch64CC::CondCode ExtraCC = AArch64CC::AL;
+      switch (Predicate) {
+      default:
+        break;
+      case CmpInst::FCMP_UEQ:
+        ExtraCC = AArch64CC::EQ;
+        CC = AArch64CC::VS;
+        break;
+      case CmpInst::FCMP_ONE:
+        ExtraCC = AArch64CC::MI;
+        CC = AArch64CC::GT;
+        break;
+      }
+      assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+
+      // Emit the extra branch for FCMP_UEQ and FCMP_ONE.
+      if (ExtraCC != AArch64CC::AL) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+            .addImm(ExtraCC)
+            .addMBB(TBB);
+      }
+
       // Emit the branch.
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
           .addImm(CC)
           .addMBB(TBB);
-      FuncInfo.MBB->addSuccessor(TBB);
 
-      FastEmitBranch(FBB, DbgLoc);
+      // Obtain the branch weight and add the TrueBB to the successor list.
+      uint32_t BranchWeight = 0;
+      if (FuncInfo.BPI)
+        BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                  TBB->getBasicBlock());
+      FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+      fastEmitBranch(FBB, DbgLoc);
       return true;
     }
   } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
     MVT SrcVT;
-    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
-        (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) {
+    if (TI->hasOneUse() && isValueAvailable(TI) &&
+        isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) {
       unsigned CondReg = getRegForValue(TI->getOperand(0));
-      if (CondReg == 0)
+      if (!CondReg)
         return false;
+      bool CondIsKill = hasTrivialKill(TI->getOperand(0));
 
       // Issue an extract_subreg to get the lower 32-bits.
-      if (SrcVT == MVT::i64)
-        CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true,
+      if (SrcVT == MVT::i64) {
+        CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill,
                                              AArch64::sub_32);
+        CondIsKill = true;
+      }
 
-      MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
-      unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(AArch64::ANDWri), ANDReg)
-          .addReg(CondReg)
-          .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(AArch64::SUBSWri))
-          .addReg(ANDReg)
-          .addReg(ANDReg)
-          .addImm(0)
-          .addImm(0);
+      unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1);
+      assert(ANDReg && "Unexpected AND instruction emission failure.");
+      emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0);
 
-      unsigned CC = AArch64CC::NE;
       if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
         std::swap(TBB, FBB);
         CC = AArch64CC::EQ;
@@ -816,23 +2319,57 @@
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
           .addImm(CC)
           .addMBB(TBB);
-      FuncInfo.MBB->addSuccessor(TBB);
-      FastEmitBranch(FBB, DbgLoc);
+
+      // Obtain the branch weight and add the TrueBB to the successor list.
+      uint32_t BranchWeight = 0;
+      if (FuncInfo.BPI)
+        BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                  TBB->getBasicBlock());
+      FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+      fastEmitBranch(FBB, DbgLoc);
       return true;
     }
-  } else if (const ConstantInt *CI =
-                 dyn_cast<ConstantInt>(BI->getCondition())) {
+  } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
     uint64_t Imm = CI->getZExtValue();
     MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
         .addMBB(Target);
-    FuncInfo.MBB->addSuccessor(Target);
+
+    // Obtain the branch weight and add the target to the successor list.
+    uint32_t BranchWeight = 0;
+    if (FuncInfo.BPI)
+      BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                 Target->getBasicBlock());
+    FuncInfo.MBB->addSuccessor(Target, BranchWeight);
+    return true;
+  } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
+    // Fake request the condition; otherwise the intrinsic might be completely
+    // optimized away.
+    unsigned CondReg = getRegForValue(BI->getCondition());
+    if (!CondReg)
+      return false;
+
+    // Emit the branch.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+      .addImm(CC)
+      .addMBB(TBB);
+
+    // Obtain the branch weight and add the TrueBB to the successor list.
+    uint32_t BranchWeight = 0;
+    if (FuncInfo.BPI)
+      BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                                 TBB->getBasicBlock());
+    FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+    fastEmitBranch(FBB, DbgLoc);
     return true;
   }
 
   unsigned CondReg = getRegForValue(BI->getCondition());
   if (CondReg == 0)
     return false;
+  bool CondRegIsKill = hasTrivialKill(BI->getCondition());
 
   // We've been divorced from our compare!  Our block was split, and
   // now our compare lives in a predecessor block.  We musn't
@@ -841,13 +2378,8 @@
   // Regardless, the compare has been done in the predecessor block,
   // and it left a value for us in a virtual register.  Ergo, we test
   // the one-bit value left in the virtual register.
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri),
-          AArch64::WZR)
-      .addReg(CondReg)
-      .addImm(0)
-      .addImm(0);
+  emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0);
 
-  unsigned CC = AArch64CC::NE;
   if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
     std::swap(TBB, FBB);
     CC = AArch64CC::EQ;
@@ -856,20 +2388,28 @@
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
       .addImm(CC)
       .addMBB(TBB);
-  FuncInfo.MBB->addSuccessor(TBB);
-  FastEmitBranch(FBB, DbgLoc);
+
+  // Obtain the branch weight and add the TrueBB to the successor list.
+  uint32_t BranchWeight = 0;
+  if (FuncInfo.BPI)
+    BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+                                               TBB->getBasicBlock());
+  FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+  fastEmitBranch(FBB, DbgLoc);
   return true;
 }
 
-bool AArch64FastISel::SelectIndirectBr(const Instruction *I) {
+bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
   const IndirectBrInst *BI = cast<IndirectBrInst>(I);
   unsigned AddrReg = getRegForValue(BI->getOperand(0));
   if (AddrReg == 0)
     return false;
 
   // Emit the indirect branch.
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR))
-      .addReg(AddrReg);
+  const MCInstrDesc &II = TII.get(AArch64::BR);
+  AddrReg = constrainOperandRegClass(II, AddrReg,  II.getNumDefs());
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg);
 
   // Make sure the CFG is up-to-date.
   for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i)
@@ -878,211 +2418,271 @@
   return true;
 }
 
-bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
-  Type *Ty = Src1Value->getType();
-  EVT SrcEVT = TLI.getValueType(Ty, true);
-  if (!SrcEVT.isSimple())
-    return false;
-  MVT SrcVT = SrcEVT.getSimpleVT();
+bool AArch64FastISel::selectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
 
-  // Check to see if the 2nd operand is a constant that we can encode directly
-  // in the compare.
-  uint64_t Imm;
-  bool UseImm = false;
-  bool isNegativeImm = false;
-  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
-    if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
-        SrcVT == MVT::i8 || SrcVT == MVT::i1) {
-      const APInt &CIVal = ConstInt->getValue();
-
-      Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue();
-      if (CIVal.isNegative()) {
-        isNegativeImm = true;
-        Imm = -Imm;
-      }
-      // FIXME: We can handle more immediates using shifts.
-      UseImm = ((Imm & 0xfff) == Imm);
-    }
-  } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
-    if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
-      if (ConstFP->isZero() && !ConstFP->isNegative())
-        UseImm = true;
+  // Try to optimize or fold the cmp.
+  CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+  unsigned ResultReg = 0;
+  switch (Predicate) {
+  default:
+    break;
+  case CmpInst::FCMP_FALSE:
+    ResultReg = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(AArch64::WZR, getKillRegState(true));
+    break;
+  case CmpInst::FCMP_TRUE:
+    ResultReg = fastEmit_i(MVT::i32, MVT::i32, ISD::Constant, 1);
+    break;
   }
 
-  unsigned ZReg;
-  unsigned CmpOpc;
-  bool isICmp = true;
-  bool needsExt = false;
-  switch (SrcVT.SimpleTy) {
+  if (ResultReg) {
+    updateValueMap(I, ResultReg);
+    return true;
+  }
+
+  // Emit the cmp.
+  if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+    return false;
+
+  ResultReg = createResultReg(&AArch64::GPR32RegClass);
+
+  // FCMP_UEQ and FCMP_ONE cannot be checked with a single instruction. These
+  // condition codes are inverted, because they are used by CSINC.
+  static unsigned CondCodeTable[2][2] = {
+    { AArch64CC::NE, AArch64CC::VC },
+    { AArch64CC::PL, AArch64CC::LE }
+  };
+  unsigned *CondCodes = nullptr;
+  switch (Predicate) {
+  default:
+    break;
+  case CmpInst::FCMP_UEQ:
+    CondCodes = &CondCodeTable[0][0];
+    break;
+  case CmpInst::FCMP_ONE:
+    CondCodes = &CondCodeTable[1][0];
+    break;
+  }
+
+  if (CondCodes) {
+    unsigned TmpReg1 = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+            TmpReg1)
+        .addReg(AArch64::WZR, getKillRegState(true))
+        .addReg(AArch64::WZR, getKillRegState(true))
+        .addImm(CondCodes[0]);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+            ResultReg)
+        .addReg(TmpReg1, getKillRegState(true))
+        .addReg(AArch64::WZR, getKillRegState(true))
+        .addImm(CondCodes[1]);
+
+    updateValueMap(I, ResultReg);
+    return true;
+  }
+
+  // Now set a register based on the comparison.
+  AArch64CC::CondCode CC = getCompareCC(Predicate);
+  assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+  AArch64CC::CondCode invertedCC = getInvertedCondCode(CC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+          ResultReg)
+      .addReg(AArch64::WZR, getKillRegState(true))
+      .addReg(AArch64::WZR, getKillRegState(true))
+      .addImm(invertedCC);
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false'
+/// value.
+bool AArch64FastISel::optimizeSelect(const SelectInst *SI) {
+  if (!SI->getType()->isIntegerTy(1))
+    return false;
+
+  const Value *Src1Val, *Src2Val;
+  unsigned Opc = 0;
+  bool NeedExtraOp = false;
+  if (auto *CI = dyn_cast<ConstantInt>(SI->getTrueValue())) {
+    if (CI->isOne()) {
+      Src1Val = SI->getCondition();
+      Src2Val = SI->getFalseValue();
+      Opc = AArch64::ORRWrr;
+    } else {
+      assert(CI->isZero());
+      Src1Val = SI->getFalseValue();
+      Src2Val = SI->getCondition();
+      Opc = AArch64::BICWrr;
+    }
+  } else if (auto *CI = dyn_cast<ConstantInt>(SI->getFalseValue())) {
+    if (CI->isOne()) {
+      Src1Val = SI->getCondition();
+      Src2Val = SI->getTrueValue();
+      Opc = AArch64::ORRWrr;
+      NeedExtraOp = true;
+    } else {
+      assert(CI->isZero());
+      Src1Val = SI->getCondition();
+      Src2Val = SI->getTrueValue();
+      Opc = AArch64::ANDWrr;
+    }
+  }
+
+  if (!Opc)
+    return false;
+
+  unsigned Src1Reg = getRegForValue(Src1Val);
+  if (!Src1Reg)
+    return false;
+  bool Src1IsKill = hasTrivialKill(Src1Val);
+
+  unsigned Src2Reg = getRegForValue(Src2Val);
+  if (!Src2Reg)
+    return false;
+  bool Src2IsKill = hasTrivialKill(Src2Val);
+
+  if (NeedExtraOp) {
+    Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1);
+    Src1IsKill = true;
+  }
+  unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32spRegClass, Src1Reg,
+                                       Src1IsKill, Src2Reg, Src2IsKill);
+  updateValueMap(SI, ResultReg);
+  return true;
+}
+
+bool AArch64FastISel::selectSelect(const Instruction *I) {
+  assert(isa<SelectInst>(I) && "Expected a select instruction.");
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT))
+    return false;
+
+  unsigned Opc;
+  const TargetRegisterClass *RC;
+  switch (VT.SimpleTy) {
   default:
     return false;
   case MVT::i1:
   case MVT::i8:
   case MVT::i16:
-    needsExt = true;
-  // Intentional fall-through.
   case MVT::i32:
-    ZReg = AArch64::WZR;
-    if (UseImm)
-      CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri;
-    else
-      CmpOpc = AArch64::SUBSWrr;
+    Opc = AArch64::CSELWr;
+    RC = &AArch64::GPR32RegClass;
     break;
   case MVT::i64:
-    ZReg = AArch64::XZR;
-    if (UseImm)
-      CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri;
-    else
-      CmpOpc = AArch64::SUBSXrr;
+    Opc = AArch64::CSELXr;
+    RC = &AArch64::GPR64RegClass;
     break;
   case MVT::f32:
-    isICmp = false;
-    CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr;
+    Opc = AArch64::FCSELSrrr;
+    RC = &AArch64::FPR32RegClass;
     break;
   case MVT::f64:
-    isICmp = false;
-    CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr;
+    Opc = AArch64::FCSELDrrr;
+    RC = &AArch64::FPR64RegClass;
     break;
   }
 
-  unsigned SrcReg1 = getRegForValue(Src1Value);
-  if (SrcReg1 == 0)
-    return false;
-
-  unsigned SrcReg2;
-  if (!UseImm) {
-    SrcReg2 = getRegForValue(Src2Value);
-    if (SrcReg2 == 0)
-      return false;
-  }
-
-  // We have i1, i8, or i16, we need to either zero extend or sign extend.
-  if (needsExt) {
-    SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
-    if (SrcReg1 == 0)
-      return false;
-    if (!UseImm) {
-      SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
-      if (SrcReg2 == 0)
-        return false;
-    }
-  }
-
-  if (isICmp) {
-    if (UseImm)
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
-          .addReg(ZReg)
-          .addReg(SrcReg1)
-          .addImm(Imm)
-          .addImm(0);
-    else
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
-          .addReg(ZReg)
-          .addReg(SrcReg1)
-          .addReg(SrcReg2);
-  } else {
-    if (UseImm)
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
-          .addReg(SrcReg1);
-    else
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
-          .addReg(SrcReg1)
-          .addReg(SrcReg2);
-  }
-  return true;
-}
-
-bool AArch64FastISel::SelectCmp(const Instruction *I) {
-  const CmpInst *CI = cast<CmpInst>(I);
-
-  // We may not handle every CC for now.
-  AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
-  if (CC == AArch64CC::AL)
-    return false;
-
-  // Emit the cmp.
-  if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
-    return false;
-
-  // Now set a register based on the comparison.
-  AArch64CC::CondCode invertedCC = getInvertedCondCode(CC);
-  unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
-          ResultReg)
-      .addReg(AArch64::WZR)
-      .addReg(AArch64::WZR)
-      .addImm(invertedCC);
-
-  UpdateValueMap(I, ResultReg);
-  return true;
-}
-
-bool AArch64FastISel::SelectSelect(const Instruction *I) {
   const SelectInst *SI = cast<SelectInst>(I);
+  const Value *Cond = SI->getCondition();
+  AArch64CC::CondCode CC = AArch64CC::NE;
+  AArch64CC::CondCode ExtraCC = AArch64CC::AL;
 
-  EVT DestEVT = TLI.getValueType(SI->getType(), true);
-  if (!DestEVT.isSimple())
-    return false;
+  if (optimizeSelect(SI))
+    return true;
 
-  MVT DestVT = DestEVT.getSimpleVT();
-  if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 &&
-      DestVT != MVT::f64)
-    return false;
+  // Try to pick up the flags, so we don't have to emit another compare.
+  if (foldXALUIntrinsic(CC, I, Cond)) {
+    // Fake request the condition to force emission of the XALU intrinsic.
+    unsigned CondReg = getRegForValue(Cond);
+    if (!CondReg)
+      return false;
+  } else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() &&
+             isValueAvailable(Cond)) {
+    const auto *Cmp = cast<CmpInst>(Cond);
+    // Try to optimize or fold the cmp.
+    CmpInst::Predicate Predicate = optimizeCmpPredicate(Cmp);
+    const Value *FoldSelect = nullptr;
+    switch (Predicate) {
+    default:
+      break;
+    case CmpInst::FCMP_FALSE:
+      FoldSelect = SI->getFalseValue();
+      break;
+    case CmpInst::FCMP_TRUE:
+      FoldSelect = SI->getTrueValue();
+      break;
+    }
 
-  unsigned CondReg = getRegForValue(SI->getCondition());
-  if (CondReg == 0)
-    return false;
-  unsigned TrueReg = getRegForValue(SI->getTrueValue());
-  if (TrueReg == 0)
-    return false;
-  unsigned FalseReg = getRegForValue(SI->getFalseValue());
-  if (FalseReg == 0)
-    return false;
+    if (FoldSelect) {
+      unsigned SrcReg = getRegForValue(FoldSelect);
+      if (!SrcReg)
+        return false;
+      unsigned UseReg = lookUpRegForValue(SI);
+      if (UseReg)
+        MRI.clearKillFlags(UseReg);
 
+      updateValueMap(I, SrcReg);
+      return true;
+    }
 
-  MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
-  unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
-          ANDReg)
-      .addReg(CondReg)
-      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+    // Emit the cmp.
+    if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned()))
+      return false;
 
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri))
-      .addReg(ANDReg)
-      .addReg(ANDReg)
-      .addImm(0)
-      .addImm(0);
+    // FCMP_UEQ and FCMP_ONE cannot be checked with a single select instruction.
+    CC = getCompareCC(Predicate);
+    switch (Predicate) {
+    default:
+      break;
+    case CmpInst::FCMP_UEQ:
+      ExtraCC = AArch64CC::EQ;
+      CC = AArch64CC::VS;
+      break;
+    case CmpInst::FCMP_ONE:
+      ExtraCC = AArch64CC::MI;
+      CC = AArch64CC::GT;
+      break;
+    }
+    assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+  } else {
+    unsigned CondReg = getRegForValue(Cond);
+    if (!CondReg)
+      return false;
+    bool CondIsKill = hasTrivialKill(Cond);
 
-  unsigned SelectOpc;
-  switch (DestVT.SimpleTy) {
-  default:
-    return false;
-  case MVT::i32:
-    SelectOpc = AArch64::CSELWr;
-    break;
-  case MVT::i64:
-    SelectOpc = AArch64::CSELXr;
-    break;
-  case MVT::f32:
-    SelectOpc = AArch64::FCSELSrrr;
-    break;
-  case MVT::f64:
-    SelectOpc = AArch64::FCSELDrrr;
-    break;
+    // Emit a TST instruction (ANDS wzr, reg, #imm).
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDSWri),
+            AArch64::WZR)
+        .addReg(CondReg, getKillRegState(CondIsKill))
+        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
   }
 
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc),
-          ResultReg)
-      .addReg(TrueReg)
-      .addReg(FalseReg)
-      .addImm(AArch64CC::NE);
+  unsigned Src1Reg = getRegForValue(SI->getTrueValue());
+  bool Src1IsKill = hasTrivialKill(SI->getTrueValue());
 
-  UpdateValueMap(I, ResultReg);
+  unsigned Src2Reg = getRegForValue(SI->getFalseValue());
+  bool Src2IsKill = hasTrivialKill(SI->getFalseValue());
+
+  if (!Src1Reg || !Src2Reg)
+    return false;
+
+  if (ExtraCC != AArch64CC::AL) {
+    Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
+                               Src2IsKill, ExtraCC);
+    Src2IsKill = true;
+  }
+  unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
+                                        Src2IsKill, CC);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool AArch64FastISel::SelectFPExt(const Instruction *I) {
+bool AArch64FastISel::selectFPExt(const Instruction *I) {
   Value *V = I->getOperand(0);
   if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy())
     return false;
@@ -1094,11 +2694,11 @@
   unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr),
           ResultReg).addReg(Op);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool AArch64FastISel::SelectFPTrunc(const Instruction *I) {
+bool AArch64FastISel::selectFPTrunc(const Instruction *I) {
   Value *V = I->getOperand(0);
   if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy())
     return false;
@@ -1110,12 +2710,12 @@
   unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr),
           ResultReg).addReg(Op);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
 // FPToUI and FPToSI
-bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
+bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
   MVT DestVT;
   if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
     return false;
@@ -1144,11 +2744,11 @@
       DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
       .addReg(SrcReg);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
+bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
   MVT DestVT;
   if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
     return false;
@@ -1156,22 +2756,21 @@
           "Unexpected value type.");
 
   unsigned SrcReg = getRegForValue(I->getOperand(0));
-  if (SrcReg == 0)
+  if (!SrcReg)
     return false;
+  bool SrcIsKill = hasTrivialKill(I->getOperand(0));
 
   EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
 
   // Handle sign-extension.
   if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) {
     SrcReg =
-        EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
-    if (SrcReg == 0)
+        emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
+    if (!SrcReg)
       return false;
+    SrcIsKill = true;
   }
 
-  MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass
-                                                  : &AArch64::GPR32RegClass);
-
   unsigned Opc;
   if (SrcVT == MVT::i64) {
     if (Signed)
@@ -1185,21 +2784,128 @@
       Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri;
   }
 
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
-      .addReg(SrcReg);
-  UpdateValueMap(I, ResultReg);
+  unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg,
+                                      SrcIsKill);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool AArch64FastISel::ProcessCallArgs(
-    SmallVectorImpl<Value *> &Args, SmallVectorImpl<unsigned> &ArgRegs,
-    SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
-    SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
-    unsigned &NumBytes) {
+bool AArch64FastISel::fastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC != CallingConv::C)
+    return false;
+
+  // Only handle simple cases of up to 8 GPRs and 8 FPRs each.
+  unsigned GPRCnt = 0;
+  unsigned FPRCnt = 0;
+  unsigned Idx = 0;
+  for (auto const &Arg : F->args()) {
+    // The first argument is at index 1.
+    ++Idx;
+    if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+      return false;
+
+    Type *ArgTy = Arg.getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(ArgTy);
+    if (!ArgVT.isSimple())
+      return false;
+
+    MVT VT = ArgVT.getSimpleVT().SimpleTy;
+    if (VT.isFloatingPoint() && !Subtarget->hasFPARMv8())
+      return false;
+
+    if (VT.isVector() &&
+        (!Subtarget->hasNEON() || !Subtarget->isLittleEndian()))
+      return false;
+
+    if (VT >= MVT::i1 && VT <= MVT::i64)
+      ++GPRCnt;
+    else if ((VT >= MVT::f16 && VT <= MVT::f64) || VT.is64BitVector() ||
+             VT.is128BitVector())
+      ++FPRCnt;
+    else
+      return false;
+
+    if (GPRCnt > 8 || FPRCnt > 8)
+      return false;
+  }
+
+  static const MCPhysReg Registers[6][8] = {
+    { AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4,
+      AArch64::W5, AArch64::W6, AArch64::W7 },
+    { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4,
+      AArch64::X5, AArch64::X6, AArch64::X7 },
+    { AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4,
+      AArch64::H5, AArch64::H6, AArch64::H7 },
+    { AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4,
+      AArch64::S5, AArch64::S6, AArch64::S7 },
+    { AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4,
+      AArch64::D5, AArch64::D6, AArch64::D7 },
+    { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
+      AArch64::Q5, AArch64::Q6, AArch64::Q7 }
+  };
+
+  unsigned GPRIdx = 0;
+  unsigned FPRIdx = 0;
+  for (auto const &Arg : F->args()) {
+    MVT VT = TLI.getSimpleValueType(Arg.getType());
+    unsigned SrcReg;
+    const TargetRegisterClass *RC;
+    if (VT >= MVT::i1 && VT <= MVT::i32) {
+      SrcReg = Registers[0][GPRIdx++];
+      RC = &AArch64::GPR32RegClass;
+      VT = MVT::i32;
+    } else if (VT == MVT::i64) {
+      SrcReg = Registers[1][GPRIdx++];
+      RC = &AArch64::GPR64RegClass;
+    } else if (VT == MVT::f16) {
+      SrcReg = Registers[2][FPRIdx++];
+      RC = &AArch64::FPR16RegClass;
+    } else if (VT ==  MVT::f32) {
+      SrcReg = Registers[3][FPRIdx++];
+      RC = &AArch64::FPR32RegClass;
+    } else if ((VT == MVT::f64) || VT.is64BitVector()) {
+      SrcReg = Registers[4][FPRIdx++];
+      RC = &AArch64::FPR64RegClass;
+    } else if (VT.is128BitVector()) {
+      SrcReg = Registers[5][FPRIdx++];
+      RC = &AArch64::FPR128RegClass;
+    } else
+      llvm_unreachable("Unexpected value type.");
+
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(DstReg, getKillRegState(true));
+    updateValueMap(&Arg, ResultReg);
+  }
+  return true;
+}
+
+bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
+                                      SmallVectorImpl<MVT> &OutVTs,
+                                      unsigned &NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
-  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
+  CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
 
   // Get a count of how many bytes are to be pushed on the stack.
   NumBytes = CCInfo.getNextStackOffset();
@@ -1207,13 +2913,17 @@
   // Issue CALLSEQ_START
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
-      .addImm(NumBytes);
+    .addImm(NumBytes);
 
   // Process the args.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    unsigned Arg = ArgRegs[VA.getValNo()];
-    MVT ArgVT = ArgVTs[VA.getValNo()];
+    const Value *ArgVal = CLI.OutVals[VA.getValNo()];
+    MVT ArgVT = OutVTs[VA.getValNo()];
+
+    unsigned ArgReg = getRegForValue(ArgVal);
+    if (!ArgReg)
+      return false;
 
     // Handle arg promotion: SExt, ZExt, AExt.
     switch (VA.getLocInfo()) {
@@ -1222,8 +2932,8 @@
     case CCValAssign::SExt: {
       MVT DestVT = VA.getLocVT();
       MVT SrcVT = ArgVT;
-      Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false);
-      if (Arg == 0)
+      ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false);
+      if (!ArgReg)
         return false;
       break;
     }
@@ -1232,8 +2942,8 @@
     case CCValAssign::ZExt: {
       MVT DestVT = VA.getLocVT();
       MVT SrcVT = ArgVT;
-      Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true);
-      if (Arg == 0)
+      ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true);
+      if (!ArgReg)
         return false;
       break;
     }
@@ -1244,14 +2954,18 @@
     // Now copy/store arg to correct locations.
     if (VA.isRegLoc() && !VA.needsCustom()) {
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
-      RegArgs.push_back(VA.getLocReg());
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+      CLI.OutRegs.push_back(VA.getLocReg());
     } else if (VA.needsCustom()) {
       // FIXME: Handle custom args.
       return false;
     } else {
       assert(VA.isMemLoc() && "Assuming store on stack.");
 
+      // Don't emit stores for undef values.
+      if (isa<UndefValue>(ArgVal))
+        continue;
+
       // Need to store on the stack.
       unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8;
 
@@ -1264,26 +2978,31 @@
       Addr.setReg(AArch64::SP);
       Addr.setOffset(VA.getLocMemOffset() + BEAlign);
 
-      if (!EmitStore(ArgVT, Arg, Addr))
+      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getStack(Addr.getOffset()),
+        MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+
+      if (!emitStore(ArgVT, ArgReg, Addr, MMO))
         return false;
     }
   }
   return true;
 }
 
-bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
-                                 const Instruction *I, CallingConv::ID CC,
-                                 unsigned &NumBytes) {
+bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
+                                 unsigned NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+
   // Issue CALLSEQ_END
   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
-      .addImm(NumBytes)
-      .addImm(0);
+    .addImm(NumBytes).addImm(0);
 
   // Now the return value.
   if (RetVT != MVT::isVoid) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
     CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
 
     // Only handle a single return value.
@@ -1294,147 +3013,147 @@
     MVT CopyVT = RVLocs[0].getValVT();
     unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-            TII.get(TargetOpcode::COPY),
-            ResultReg).addReg(RVLocs[0].getLocReg());
-    UsedRegs.push_back(RVLocs[0].getLocReg());
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(RVLocs[0].getLocReg());
+    CLI.InRegs.push_back(RVLocs[0].getLocReg());
 
-    // Finally update the result.
-    UpdateValueMap(I, ResultReg);
+    CLI.ResultReg = ResultReg;
+    CLI.NumResultRegs = 1;
   }
 
   return true;
 }
 
-bool AArch64FastISel::SelectCall(const Instruction *I,
-                                 const char *IntrMemName = nullptr) {
-  const CallInst *CI = cast<CallInst>(I);
-  const Value *Callee = CI->getCalledValue();
+bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  CallingConv::ID CC  = CLI.CallConv;
+  bool IsTailCall     = CLI.IsTailCall;
+  bool IsVarArg       = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+  const char *SymName = CLI.SymName;
 
-  // Don't handle inline asm or intrinsics.
-  if (isa<InlineAsm>(Callee))
+  if (!Callee && !SymName)
     return false;
 
-  // Only handle global variable Callees.
-  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
-  if (!GV)
+  // Allow SelectionDAG isel to handle tail calls.
+  if (IsTailCall)
     return false;
 
-  // Check the calling convention.
-  ImmutableCallSite CS(CI);
-  CallingConv::ID CC = CS.getCallingConv();
+  CodeModel::Model CM = TM.getCodeModel();
+  // Only support the small and large code models.
+  if (CM != CodeModel::Small && CM != CodeModel::Large)
+    return false;
+
+  // FIXME: Add large code model support for ELF.
+  if (CM == CodeModel::Large && !Subtarget->isTargetMachO())
+    return false;
 
   // Let SDISel handle vararg functions.
-  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
-  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
-  if (FTy->isVarArg())
+  if (IsVarArg)
     return false;
 
-  // Handle *simple* calls for now.
+  // FIXME: Only handle *simple* calls for now.
   MVT RetVT;
-  Type *RetTy = I->getType();
-  if (RetTy->isVoidTy())
+  if (CLI.RetTy->isVoidTy())
     RetVT = MVT::isVoid;
-  else if (!isTypeLegal(RetTy, RetVT))
+  else if (!isTypeLegal(CLI.RetTy, RetVT))
     return false;
 
+  for (auto Flag : CLI.OutFlags)
+    if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal())
+      return false;
+
   // Set up the argument vectors.
-  SmallVector<Value *, 8> Args;
-  SmallVector<unsigned, 8> ArgRegs;
-  SmallVector<MVT, 8> ArgVTs;
-  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
-  Args.reserve(CS.arg_size());
-  ArgRegs.reserve(CS.arg_size());
-  ArgVTs.reserve(CS.arg_size());
-  ArgFlags.reserve(CS.arg_size());
+  SmallVector<MVT, 16> OutVTs;
+  OutVTs.reserve(CLI.OutVals.size());
 
-  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
-       i != e; ++i) {
-    // If we're lowering a memory intrinsic instead of a regular call, skip the
-    // last two arguments, which shouldn't be passed to the underlying function.
-    if (IntrMemName && e - i <= 2)
-      break;
-
-    unsigned Arg = getRegForValue(*i);
-    if (Arg == 0)
-      return false;
-
-    ISD::ArgFlagsTy Flags;
-    unsigned AttrInd = i - CS.arg_begin() + 1;
-    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
-      Flags.setSExt();
-    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
-      Flags.setZExt();
-
-    // FIXME: Only handle *easy* calls for now.
-    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
-        CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
-        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
-        CS.paramHasAttr(AttrInd, Attribute::ByVal))
-      return false;
-
-    MVT ArgVT;
-    Type *ArgTy = (*i)->getType();
-    if (!isTypeLegal(ArgTy, ArgVT) &&
-        !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16))
+  for (auto *Val : CLI.OutVals) {
+    MVT VT;
+    if (!isTypeLegal(Val->getType(), VT) &&
+        !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16))
       return false;
 
     // We don't handle vector parameters yet.
-    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+    if (VT.isVector() || VT.getSizeInBits() > 64)
       return false;
 
-    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
-    Flags.setOrigAlign(OriginalAlignment);
-
-    Args.push_back(*i);
-    ArgRegs.push_back(Arg);
-    ArgVTs.push_back(ArgVT);
-    ArgFlags.push_back(Flags);
+    OutVTs.push_back(VT);
   }
 
+  Address Addr;
+  if (Callee && !computeCallAddress(Callee, Addr))
+    return false;
+
   // Handle the arguments now that we've gotten them.
-  SmallVector<unsigned, 4> RegArgs;
   unsigned NumBytes;
-  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+  if (!processCallArgs(CLI, OutVTs, NumBytes))
     return false;
 
   // Issue the call.
   MachineInstrBuilder MIB;
-  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL));
-  if (!IntrMemName)
-    MIB.addGlobalAddress(GV, 0, 0);
-  else
-    MIB.addExternalSymbol(IntrMemName, 0);
+  if (CM == CodeModel::Small) {
+    const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
+    if (SymName)
+      MIB.addExternalSymbol(SymName, 0);
+    else if (Addr.getGlobalValue())
+      MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0);
+    else if (Addr.getReg()) {
+      unsigned Reg = constrainOperandRegClass(II, Addr.getReg(), 0);
+      MIB.addReg(Reg);
+    } else
+      return false;
+  } else {
+    unsigned CallReg = 0;
+    if (SymName) {
+      unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+              ADRPReg)
+        .addExternalSymbol(SymName, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+      CallReg = createResultReg(&AArch64::GPR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
+              CallReg)
+        .addReg(ADRPReg)
+        .addExternalSymbol(SymName, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+                           AArch64II::MO_NC);
+    } else if (Addr.getGlobalValue())
+      CallReg = materializeGV(Addr.getGlobalValue());
+    else if (Addr.getReg())
+      CallReg = Addr.getReg();
+
+    if (!CallReg)
+      return false;
+
+    const MCInstrDesc &II = TII.get(AArch64::BLR);
+    CallReg = constrainOperandRegClass(II, CallReg, 0);
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg);
+  }
 
   // Add implicit physical register uses to the call.
-  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
-    MIB.addReg(RegArgs[i], RegState::Implicit);
+  for (auto Reg : CLI.OutRegs)
+    MIB.addReg(Reg, RegState::Implicit);
 
   // Add a register mask with the call-preserved registers.
   // Proper defs for return values will be added by setPhysRegsDeadExcept().
-  MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  CLI.Call = MIB;
 
   // Finish off the call including any return values.
-  SmallVector<unsigned, 4> UsedRegs;
-  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes))
-    return false;
-
-  // Set all unused physreg defs as dead.
-  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
-
-  return true;
+  return finishCall(CLI, RetVT, NumBytes);
 }
 
-bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) {
+bool AArch64FastISel::isMemCpySmall(uint64_t Len, unsigned Alignment) {
   if (Alignment)
     return Len / Alignment <= 4;
   else
     return Len < 32;
 }
 
-bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
+bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src,
                                          uint64_t Len, unsigned Alignment) {
   // Make sure we don't bloat code by inlining very large memcpy's.
-  if (!IsMemCpySmall(Len, Alignment))
+  if (!isMemCpySmall(Len, Alignment))
     return false;
 
   int64_t UnscaledOffset = 0;
@@ -1464,14 +3183,11 @@
       }
     }
 
-    bool RV;
-    unsigned ResultReg;
-    RV = EmitLoad(VT, ResultReg, Src);
-    if (!RV)
+    unsigned ResultReg = emitLoad(VT, VT, Src);
+    if (!ResultReg)
       return false;
 
-    RV = EmitStore(VT, ResultReg, Dest);
-    if (!RV)
+    if (!emitStore(VT, ResultReg, Dest))
       return false;
 
     int64_t Size = VT.getSizeInBits() / 8;
@@ -1486,73 +3202,430 @@
   return true;
 }
 
-bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
-  // FIXME: Handle more intrinsics.
-  switch (I.getIntrinsicID()) {
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
+                                        const Instruction *I,
+                                        const Value *Cond) {
+  if (!isa<ExtractValueInst>(Cond))
+    return false;
+
+  const auto *EV = cast<ExtractValueInst>(Cond);
+  if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+    return false;
+
+  const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+  MVT RetVT;
+  const Function *Callee = II->getCalledFunction();
+  Type *RetTy =
+  cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+  if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  if (RetVT != MVT::i32 && RetVT != MVT::i64)
+    return false;
+
+  const Value *LHS = II->getArgOperand(0);
+  const Value *RHS = II->getArgOperand(1);
+
+  // Canonicalize immediate to the RHS.
+  if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+      isCommutativeIntrinsic(II))
+    std::swap(LHS, RHS);
+
+  // Simplify multiplies.
+  unsigned IID = II->getIntrinsicID();
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::smul_with_overflow:
+    if (const auto *C = dyn_cast<ConstantInt>(RHS))
+      if (C->getValue() == 2)
+        IID = Intrinsic::sadd_with_overflow;
+    break;
+  case Intrinsic::umul_with_overflow:
+    if (const auto *C = dyn_cast<ConstantInt>(RHS))
+      if (C->getValue() == 2)
+        IID = Intrinsic::uadd_with_overflow;
+    break;
+  }
+
+  AArch64CC::CondCode TmpCC;
+  switch (IID) {
   default:
     return false;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+    TmpCC = AArch64CC::VS;
+    break;
+  case Intrinsic::uadd_with_overflow:
+    TmpCC = AArch64CC::HS;
+    break;
+  case Intrinsic::usub_with_overflow:
+    TmpCC = AArch64CC::LO;
+    break;
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    TmpCC = AArch64CC::NE;
+    break;
+  }
+
+  // Check if both instructions are in the same basic block.
+  if (!isValueAvailable(II))
+    return false;
+
+  // Make sure nothing is in the way
+  BasicBlock::const_iterator Start = I;
+  BasicBlock::const_iterator End = II;
+  for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+    // We only expect extractvalue instructions between the intrinsic and the
+    // instruction to be selected.
+    if (!isa<ExtractValueInst>(Itr))
+      return false;
+
+    // Check that the extractvalue operand comes from the intrinsic.
+    const auto *EVI = cast<ExtractValueInst>(Itr);
+    if (EVI->getAggregateOperand() != II)
+      return false;
+  }
+
+  CC = TmpCC;
+  return true;
+}
+
+bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+  // FIXME: Handle more intrinsics.
+  switch (II->getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::frameaddress: {
+    MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+    MFI->setFrameAddressIsTaken(true);
+
+    const AArch64RegisterInfo *RegInfo =
+        static_cast<const AArch64RegisterInfo *>(
+            TM.getSubtargetImpl()->getRegisterInfo());
+    unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+    unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr);
+    // Recursively load frame address
+    // ldr x0, [fp]
+    // ldr x0, [x0]
+    // ldr x0, [x0]
+    // ...
+    unsigned DestReg;
+    unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+    while (Depth--) {
+      DestReg = fastEmitInst_ri(AArch64::LDRXui, &AArch64::GPR64RegClass,
+                                SrcReg, /*IsKill=*/true, 0);
+      assert(DestReg && "Unexpected LDR instruction emission failure.");
+      SrcReg = DestReg;
+    }
+
+    updateValueMap(II, SrcReg);
+    return true;
+  }
   case Intrinsic::memcpy:
   case Intrinsic::memmove: {
-    const MemTransferInst &MTI = cast<MemTransferInst>(I);
+    const auto *MTI = cast<MemTransferInst>(II);
     // Don't handle volatile.
-    if (MTI.isVolatile())
+    if (MTI->isVolatile())
       return false;
 
     // Disable inlining for memmove before calls to ComputeAddress.  Otherwise,
     // we would emit dead code because we don't currently handle memmoves.
-    bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
-    if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+    bool IsMemCpy = (II->getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI->getLength()) && IsMemCpy) {
       // Small memcpy's are common enough that we want to do them without a call
       // if possible.
-      uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
-      unsigned Alignment = MTI.getAlignment();
-      if (IsMemCpySmall(Len, Alignment)) {
+      uint64_t Len = cast<ConstantInt>(MTI->getLength())->getZExtValue();
+      unsigned Alignment = MTI->getAlignment();
+      if (isMemCpySmall(Len, Alignment)) {
         Address Dest, Src;
-        if (!ComputeAddress(MTI.getRawDest(), Dest) ||
-            !ComputeAddress(MTI.getRawSource(), Src))
+        if (!computeAddress(MTI->getRawDest(), Dest) ||
+            !computeAddress(MTI->getRawSource(), Src))
           return false;
-        if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+        if (tryEmitSmallMemCpy(Dest, Src, Len, Alignment))
           return true;
       }
     }
 
-    if (!MTI.getLength()->getType()->isIntegerTy(64))
+    if (!MTI->getLength()->getType()->isIntegerTy(64))
       return false;
 
-    if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+    if (MTI->getSourceAddressSpace() > 255 || MTI->getDestAddressSpace() > 255)
       // Fast instruction selection doesn't support the special
       // address spaces.
       return false;
 
-    const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
-    return SelectCall(&I, IntrMemName);
+    const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
+    return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
   }
   case Intrinsic::memset: {
-    const MemSetInst &MSI = cast<MemSetInst>(I);
+    const MemSetInst *MSI = cast<MemSetInst>(II);
     // Don't handle volatile.
-    if (MSI.isVolatile())
+    if (MSI->isVolatile())
       return false;
 
-    if (!MSI.getLength()->getType()->isIntegerTy(64))
+    if (!MSI->getLength()->getType()->isIntegerTy(64))
       return false;
 
-    if (MSI.getDestAddressSpace() > 255)
+    if (MSI->getDestAddressSpace() > 255)
       // Fast instruction selection doesn't support the special
       // address spaces.
       return false;
 
-    return SelectCall(&I, "memset");
+    return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+  }
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::pow: {
+    MVT RetVT;
+    if (!isTypeLegal(II->getType(), RetVT))
+      return false;
+
+    if (RetVT != MVT::f32 && RetVT != MVT::f64)
+      return false;
+
+    static const RTLIB::Libcall LibCallTable[3][2] = {
+      { RTLIB::SIN_F32, RTLIB::SIN_F64 },
+      { RTLIB::COS_F32, RTLIB::COS_F64 },
+      { RTLIB::POW_F32, RTLIB::POW_F64 }
+    };
+    RTLIB::Libcall LC;
+    bool Is64Bit = RetVT == MVT::f64;
+    switch (II->getIntrinsicID()) {
+    default:
+      llvm_unreachable("Unexpected intrinsic.");
+    case Intrinsic::sin:
+      LC = LibCallTable[0][Is64Bit];
+      break;
+    case Intrinsic::cos:
+      LC = LibCallTable[1][Is64Bit];
+      break;
+    case Intrinsic::pow:
+      LC = LibCallTable[2][Is64Bit];
+      break;
+    }
+
+    ArgListTy Args;
+    Args.reserve(II->getNumArgOperands());
+
+    // Populate the argument list.
+    for (auto &Arg : II->arg_operands()) {
+      ArgListEntry Entry;
+      Entry.Val = Arg;
+      Entry.Ty = Arg->getType();
+      Args.push_back(Entry);
+    }
+
+    CallLoweringInfo CLI;
+    CLI.setCallee(TLI.getLibcallCallingConv(LC), II->getType(),
+                  TLI.getLibcallName(LC), std::move(Args));
+    if (!lowerCallTo(CLI))
+      return false;
+    updateValueMap(II, CLI.ResultReg);
+    return true;
+  }
+  case Intrinsic::fabs: {
+    MVT VT;
+    if (!isTypeLegal(II->getType(), VT))
+      return false;
+
+    unsigned Opc;
+    switch (VT.SimpleTy) {
+    default:
+      return false;
+    case MVT::f32:
+      Opc = AArch64::FABSSr;
+      break;
+    case MVT::f64:
+      Opc = AArch64::FABSDr;
+      break;
+    }
+    unsigned SrcReg = getRegForValue(II->getOperand(0));
+    if (!SrcReg)
+      return false;
+    bool SrcRegIsKill = hasTrivialKill(II->getOperand(0));
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+      .addReg(SrcReg, getKillRegState(SrcRegIsKill));
+    updateValueMap(II, ResultReg);
+    return true;
   }
   case Intrinsic::trap: {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
         .addImm(1);
     return true;
   }
+  case Intrinsic::sqrt: {
+    Type *RetTy = II->getCalledFunction()->getReturnType();
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    unsigned Op0Reg = getRegForValue(II->getOperand(0));
+    if (!Op0Reg)
+      return false;
+    bool Op0IsKill = hasTrivialKill(II->getOperand(0));
+
+    unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill);
+    if (!ResultReg)
+      return false;
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow: {
+    // This implements the basic lowering of the xalu with overflow intrinsics.
+    const Function *Callee = II->getCalledFunction();
+    auto *Ty = cast<StructType>(Callee->getReturnType());
+    Type *RetTy = Ty->getTypeAtIndex(0U);
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    if (VT != MVT::i32 && VT != MVT::i64)
+      return false;
+
+    const Value *LHS = II->getArgOperand(0);
+    const Value *RHS = II->getArgOperand(1);
+    // Canonicalize immediate to the RHS.
+    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+        isCommutativeIntrinsic(II))
+      std::swap(LHS, RHS);
+
+    // Simplify multiplies.
+    unsigned IID = II->getIntrinsicID();
+    switch (IID) {
+    default:
+      break;
+    case Intrinsic::smul_with_overflow:
+      if (const auto *C = dyn_cast<ConstantInt>(RHS))
+        if (C->getValue() == 2) {
+          IID = Intrinsic::sadd_with_overflow;
+          RHS = LHS;
+        }
+      break;
+    case Intrinsic::umul_with_overflow:
+      if (const auto *C = dyn_cast<ConstantInt>(RHS))
+        if (C->getValue() == 2) {
+          IID = Intrinsic::uadd_with_overflow;
+          RHS = LHS;
+        }
+      break;
+    }
+
+    unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0;
+    AArch64CC::CondCode CC = AArch64CC::Invalid;
+    switch (IID) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::sadd_with_overflow:
+      ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::VS;
+      break;
+    case Intrinsic::uadd_with_overflow:
+      ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::HS;
+      break;
+    case Intrinsic::ssub_with_overflow:
+      ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::VS;
+      break;
+    case Intrinsic::usub_with_overflow:
+      ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+      CC = AArch64CC::LO;
+      break;
+    case Intrinsic::smul_with_overflow: {
+      CC = AArch64CC::NE;
+      unsigned LHSReg = getRegForValue(LHS);
+      if (!LHSReg)
+        return false;
+      bool LHSIsKill = hasTrivialKill(LHS);
+
+      unsigned RHSReg = getRegForValue(RHS);
+      if (!RHSReg)
+        return false;
+      bool RHSIsKill = hasTrivialKill(RHS);
+
+      if (VT == MVT::i32) {
+        MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+        unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg,
+                                       /*IsKill=*/false, 32);
+        MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+                                            AArch64::sub_32);
+        ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true,
+                                              AArch64::sub_32);
+        emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+                    AArch64_AM::ASR, 31, /*WantResult=*/false);
+      } else {
+        assert(VT == MVT::i64 && "Unexpected value type.");
+        MulReg = emitMul_rr(VT, LHSReg, false, RHSReg, false); // LHS/RHS reused
+        unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill,
+                                        RHSReg, RHSIsKill);
+        emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+                    AArch64_AM::ASR, 63, /*WantResult=*/false);
+      }
+      break;
+    }
+    case Intrinsic::umul_with_overflow: {
+      CC = AArch64CC::NE;
+      unsigned LHSReg = getRegForValue(LHS);
+      if (!LHSReg)
+        return false;
+      bool LHSIsKill = hasTrivialKill(LHS);
+
+      unsigned RHSReg = getRegForValue(RHS);
+      if (!RHSReg)
+        return false;
+      bool RHSIsKill = hasTrivialKill(RHS);
+
+      if (VT == MVT::i32) {
+        MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+        emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg,
+                    /*IsKill=*/false, AArch64_AM::LSR, 32,
+                    /*WantResult=*/false);
+        MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+                                            AArch64::sub_32);
+      } else {
+        assert(VT == MVT::i64 && "Unexpected value type.");
+        MulReg = emitMul_rr(VT, LHSReg, false, RHSReg, false); // LHS/RHS reused
+        unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill,
+                                        RHSReg, RHSIsKill);
+        emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg,
+                    /*IsKill=*/false, /*WantResult=*/false);
+      }
+      break;
+    }
+    }
+
+    if (MulReg) {
+      ResultReg1 = createResultReg(TLI.getRegClassFor(VT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg1).addReg(MulReg);
+    }
+    if (!ResultReg1) return false; // Result emission failed.
+    ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass,
+                                  AArch64::WZR, /*IsKill=*/true, AArch64::WZR,
+                                  /*IsKill=*/true, getInvertedCondCode(CC));
+    (void)ResultReg2;
+    assert((ResultReg1 + 1) == ResultReg2 &&
+           "Nonconsecutive result registers.");
+    updateValueMap(II, ResultReg1, 2);
+    return true;
+  }
   }
   return false;
 }
 
-bool AArch64FastISel::SelectRet(const Instruction *I) {
+bool AArch64FastISel::selectRet(const Instruction *I) {
   const ReturnInst *Ret = cast<ReturnInst>(I);
   const Function &F = *I->getParent()->getParent();
 
@@ -1572,8 +3645,7 @@
 
     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
-    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
-                   I->getContext());
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
     CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
                                                      : RetCC_AArch64_AAPCS;
     CCInfo.AnalyzeReturn(Outs, RetCC);
@@ -1586,11 +3658,14 @@
     const Value *RV = Ret->getOperand(0);
 
     // Don't bother handling odd stuff for now.
-    if (VA.getLocInfo() != CCValAssign::Full)
+    if ((VA.getLocInfo() != CCValAssign::Full) &&
+        (VA.getLocInfo() != CCValAssign::BCvt))
       return false;
+
     // Only handle register returns for now.
     if (!VA.isRegLoc())
       return false;
+
     unsigned Reg = getRegForValue(RV);
     if (Reg == 0)
       return false;
@@ -1606,12 +3681,14 @@
       return false;
 
     // Vectors (of > 1 lane) in big endian need tricky handling.
-    if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1)
+    if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 &&
+        !Subtarget->isLittleEndian())
       return false;
 
     MVT RVVT = RVEVT.getSimpleVT();
     if (RVVT == MVT::f128)
       return false;
+
     MVT DestVT = VA.getValVT();
     // Special handling for extended integers.
     if (RVVT != DestVT) {
@@ -1621,8 +3698,8 @@
       if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
         return false;
 
-      bool isZExt = Outs[0].Flags.isZExt();
-      SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt);
+      bool IsZExt = Outs[0].Flags.isZExt();
+      SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt);
       if (SrcReg == 0)
         return false;
     }
@@ -1642,7 +3719,7 @@
   return true;
 }
 
-bool AArch64FastISel::SelectTrunc(const Instruction *I) {
+bool AArch64FastISel::selectTrunc(const Instruction *I) {
   Type *DestTy = I->getType();
   Value *Op = I->getOperand(0);
   Type *SrcTy = Op->getType();
@@ -1667,10 +3744,14 @@
   unsigned SrcReg = getRegForValue(Op);
   if (!SrcReg)
     return false;
+  bool SrcIsKill = hasTrivialKill(Op);
 
   // If we're truncating from i64 to a smaller non-legal type then generate an
-  // AND.  Otherwise, we know the high bits are undefined and a truncate doesn't
-  // generate any code.
+  // AND. Otherwise, we know the high bits are undefined and a truncate only
+  // generates a COPY. We cannot mark the source register also as result
+  // register, because this can incorrectly transfer the kill flag onto the
+  // source register.
+  unsigned ResultReg;
   if (SrcVT == MVT::i64) {
     uint64_t Mask = 0;
     switch (DestVT.SimpleTy) {
@@ -1688,23 +3769,23 @@
       break;
     }
     // Issue an extract_subreg to get the lower 32-bits.
-    unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true,
+    unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
                                                 AArch64::sub_32);
-    MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass);
     // Create the AND instruction which performs the actual truncation.
-    unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
-            ANDReg)
-        .addReg(Reg32)
-        .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32));
-    SrcReg = ANDReg;
+    ResultReg = emitAnd_ri(MVT::i32, Reg32, /*IsKill=*/true, Mask);
+    assert(ResultReg && "Unexpected AND instruction emission failure.");
+  } else {
+    ResultReg = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SrcReg, getKillRegState(SrcIsKill));
   }
 
-  UpdateValueMap(I, SrcReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
+unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) {
   assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
           DestVT == MVT::i64) &&
          "Unexpected value type.");
@@ -1712,14 +3793,9 @@
   if (DestVT == MVT::i8 || DestVT == MVT::i16)
     DestVT = MVT::i32;
 
-  if (isZExt) {
-    MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
-    unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
-            ResultReg)
-        .addReg(SrcReg)
-        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
-
+  if (IsZExt) {
+    unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+    assert(ResultReg && "Unexpected AND instruction emission failure.");
     if (DestVT == MVT::i64) {
       // We're ZExt i1 to i64.  The ANDWri Wd, Ws, #1 implicitly clears the
       // upper 32 bits.  Emit a SUBREG_TO_REG to extend from Wd to Xd.
@@ -1737,18 +3813,389 @@
       // FIXME: We're SExt i1 to i64.
       return 0;
     }
-    unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri),
-            ResultReg)
-        .addReg(SrcReg)
-        .addImm(0)
-        .addImm(0);
-    return ResultReg;
+    return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg,
+                            /*TODO:IsKill=*/false, 0, 0);
   }
 }
 
-unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
-                                     bool isZExt) {
+unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  unsigned Opc, ZReg;
+  switch (RetVT.SimpleTy) {
+  default: return 0; // Unsupported type.
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    RetVT = MVT::i32; // i8/i16 use the 32-bit multiply.
+    Opc = AArch64::MADDWrrr; ZReg = AArch64::WZR; break;
+  case MVT::i64:
+    Opc = AArch64::MADDXrrr; ZReg = AArch64::XZR; break;
+  }
+  // The multiply is emitted as MADD with the zero register as third operand.
+  const TargetRegisterClass *RC =
+      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  return fastEmitInst_rrr(Opc, RC, Op0, Op0IsKill, Op1, Op1IsKill,
+                          ZReg, /*IsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                                        unsigned Op1, bool Op1IsKill) {
+  if (RetVT != MVT::i64)
+    return 0;
+
+  return fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass,
+                          Op0, Op0IsKill, Op1, Op1IsKill,
+                          AArch64::XZR, /*IsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+                                        unsigned Op1, bool Op1IsKill) {
+  if (RetVT != MVT::i64)
+    return 0;
+
+  return fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass,
+                          Op0, Op0IsKill, Op1, Op1IsKill,
+                          AArch64::XZR, /*IsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                                     unsigned Op1Reg, bool Op1IsKill) {
+  unsigned Opc = 0;
+  bool NeedTrunc = false;
+  uint64_t Mask = 0;
+  switch (RetVT.SimpleTy) {
+  default: return 0;
+  case MVT::i8:  Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xff;   break;
+  case MVT::i16: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xffff; break;
+  case MVT::i32: Opc = AArch64::LSLVWr;                                  break;
+  case MVT::i64: Opc = AArch64::LSLVXr;                                  break;
+  }
+
+  const TargetRegisterClass *RC =
+      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  if (NeedTrunc) {
+    Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+    Op1IsKill = true;
+  }
+  unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+                                       Op1IsKill);
+  if (NeedTrunc)
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+                                     bool Op0IsKill, uint64_t Shift,
+                                     bool IsZExt) {
+  assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+         "Unexpected source/return type pair.");
+  assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+          SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+         "Unexpected source value type.");
+  assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+          RetVT == MVT::i64) && "Unexpected return value type.");
+
+  bool Is64Bit = (RetVT == MVT::i64);
+  unsigned RegSize = Is64Bit ? 64 : 32;
+  unsigned DstBits = RetVT.getSizeInBits();
+  unsigned SrcBits = SrcVT.getSizeInBits();
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+  // Just emit a copy for "zero" shifts.
+  if (Shift == 0) {
+    if (RetVT == SrcVT) {
+      unsigned ResultReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+          .addReg(Op0, getKillRegState(Op0IsKill));
+      return ResultReg;
+    } else
+      return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+  }
+
+  // Don't deal with undefined shifts.
+  if (Shift >= DstBits)
+    return 0;
+
+  // For immediate shifts we can fold the zero-/sign-extension into the shift.
+  // {S|U}BFM Wd, Wn, #r, #s
+  // Wd<32+s-r,32-r> = Wn<s:0> when r > s
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = shl i16 %1, 4
+  // Wd<32+7-28,32-28> = Wn<7:0> <- clamp s to 7
+  // 0b1111_1111_1111_1111__1111_1010_1010_0000 sext
+  // 0b0000_0000_0000_0000__0000_0101_0101_0000 sext | zext
+  // 0b0000_0000_0000_0000__0000_1010_1010_0000 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = shl i16 %1, 8
+  // Wd<32+7-24,32-24> = Wn<7:0>
+  // 0b1111_1111_1111_1111__1010_1010_0000_0000 sext
+  // 0b0000_0000_0000_0000__0101_0101_0000_0000 sext | zext
+  // 0b0000_0000_0000_0000__1010_1010_0000_0000 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = shl i16 %1, 12
+  // Wd<32+3-20,32-20> = Wn<3:0>
+  // 0b1111_1111_1111_1111__1010_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0101_0000_0000_0000 sext | zext
+  // 0b0000_0000_0000_0000__1010_0000_0000_0000 zext
+
+  unsigned ImmR = RegSize - Shift;
+  // Limit the width to the length of the source type.
+  unsigned ImmS = std::min<unsigned>(SrcBits - 1, DstBits - 1 - Shift);
+  static const unsigned OpcTable[2][2] = {
+    {AArch64::SBFMWri, AArch64::SBFMXri},
+    {AArch64::UBFMWri, AArch64::UBFMXri}
+  };
+  unsigned Opc = OpcTable[IsZExt][Is64Bit];
+  if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+    unsigned TmpReg = MRI.createVirtualRegister(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+        .addImm(0)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(AArch64::sub_32);
+    Op0 = TmpReg;
+    Op0IsKill = true;
+  }
+  return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
+unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                                     unsigned Op1Reg, bool Op1IsKill) {
+  unsigned Opc = 0;
+  bool NeedTrunc = false;
+  uint64_t Mask = 0;
+  switch (RetVT.SimpleTy) {
+  default: return 0;
+  case MVT::i8:  Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xff;   break;
+  case MVT::i16: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xffff; break;
+  case MVT::i32: Opc = AArch64::LSRVWr; break;
+  case MVT::i64: Opc = AArch64::LSRVXr; break;
+  }
+
+  const TargetRegisterClass *RC =
+      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  if (NeedTrunc) {
+    Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, Mask);
+    Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+    Op0IsKill = Op1IsKill = true;
+  }
+  unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+                                       Op1IsKill);
+  if (NeedTrunc)
+    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+  return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+                                     bool Op0IsKill, uint64_t Shift,
+                                     bool IsZExt) {
+  assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+         "Unexpected source/return type pair.");
+  assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+          SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+         "Unexpected source value type.");
+  assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+          RetVT == MVT::i64) && "Unexpected return value type.");
+
+  bool Is64Bit = (RetVT == MVT::i64);
+  unsigned RegSize = Is64Bit ? 64 : 32;
+  unsigned DstBits = RetVT.getSizeInBits();
+  unsigned SrcBits = SrcVT.getSizeInBits();
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+  // Just emit a copy for "zero" shifts.
+  if (Shift == 0) {
+    if (RetVT == SrcVT) {
+      unsigned ResultReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+      .addReg(Op0, getKillRegState(Op0IsKill));
+      return ResultReg;
+    } else
+      return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+  }
+
+  // Don't deal with undefined shifts.
+  if (Shift >= DstBits)
+    return 0;
+
+  // For immediate shifts we can fold the zero-/sign-extension into the shift.
+  // {S|U}BFM Wd, Wn, #r, #s
+  // Wd<s-r:0> = Wn<s:r> when r <= s
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = lshr i16 %1, 4
+  // Wd<7-4:0> = Wn<7:4>
+  // 0b0000_0000_0000_0000__0000_1111_1111_1010 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+  // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = lshr i16 %1, 8
+  // Wd<7-7,0> = Wn<7:7>
+  // 0b0000_0000_0000_0000__0000_0000_1111_1111 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = lshr i16 %1, 12
+  // Wd<7-7,0> = Wn<7:7> <- clamp r to 7
+  // 0b0000_0000_0000_0000__0000_0000_0000_1111 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+  if (Shift >= SrcBits && IsZExt)
+    return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+  // It is not possible to fold a sign-extend into the LShr instruction. In this
+  // case emit a sign-extend.
+  if (!IsZExt) {
+    Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+    if (!Op0)
+      return 0;
+    Op0IsKill = true;
+    SrcVT = RetVT;
+    SrcBits = SrcVT.getSizeInBits();
+    IsZExt = true;
+  }
+
+  unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+  unsigned ImmS = SrcBits - 1;
+  static const unsigned OpcTable[2][2] = {
+    {AArch64::SBFMWri, AArch64::SBFMXri},
+    {AArch64::UBFMWri, AArch64::UBFMXri}
+  };
+  unsigned Opc = OpcTable[IsZExt][Is64Bit];
+  if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+    unsigned TmpReg = MRI.createVirtualRegister(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+        .addImm(0)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(AArch64::sub_32);
+    Op0 = TmpReg;
+    Op0IsKill = true;
+  }
+  return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
+/// Emit a register-by-register arithmetic shift right producing \p RetVT.
+///
+/// i32 and i64 map directly onto ASRVWr / ASRVXr. For i8 and i16 the shifted
+/// value is first sign-extended to i32, the shift amount is AND-ed with the
+/// type's mask (0xff / 0xffff), and the 32-bit result is AND-ed with the same
+/// mask afterwards. Any other type yields 0.
+unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+                                     unsigned Op1Reg, bool Op1IsKill) {
+  // A non-zero mask identifies the types narrower than a W register.
+  uint64_t NarrowMask = 0;
+  switch (RetVT.SimpleTy) {
+  case MVT::i8:
+    NarrowMask = 0xff;
+    break;
+  case MVT::i16:
+    NarrowMask = 0xffff;
+    break;
+  case MVT::i32:
+  case MVT::i64:
+    break;
+  default:
+    return 0;
+  }
+
+  const bool Is64Bit = (RetVT == MVT::i64);
+  const bool IsNarrow = (NarrowMask != 0);
+  const unsigned Opc = Is64Bit ? AArch64::ASRVXr : AArch64::ASRVWr;
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+  if (IsNarrow) {
+    Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false);
+    Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, NarrowMask);
+    // Both operands were just re-materialised; the shift is their last use.
+    Op0IsKill = true;
+    Op1IsKill = true;
+  }
+
+  unsigned ShiftReg =
+      fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+  if (!IsNarrow)
+    return ShiftReg;
+  return emitAnd_ri(MVT::i32, ShiftReg, /*IsKill=*/true, NarrowMask);
+}
+
+/// Emit an arithmetic shift right of \p Op0 by the immediate \p Shift.
+///
+/// \p SrcVT is the type of the value held in \p Op0 and \p RetVT the type of
+/// the result; when they differ, the zero-/sign-extension (selected by
+/// \p IsZExt) is folded into the emitted {S|U}BFM instruction.
+///
+/// \returns the result register, or 0 when \p Shift is not smaller than the
+/// bit width of \p RetVT.
+unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+                                     bool Op0IsKill, uint64_t Shift,
+                                     bool IsZExt) {
+  assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+         "Unexpected source/return type pair.");
+  assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+          SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+         "Unexpected source value type.");
+  assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+          RetVT == MVT::i64) && "Unexpected return value type.");
+
+  bool Is64Bit = (RetVT == MVT::i64);
+  unsigned RegSize = Is64Bit ? 64 : 32;
+  unsigned DstBits = RetVT.getSizeInBits();
+  unsigned SrcBits = SrcVT.getSizeInBits();
+  const TargetRegisterClass *RC =
+      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+  // Just emit a copy for "zero" shifts.
+  if (Shift == 0) {
+    if (RetVT == SrcVT) {
+      unsigned ResultReg = createResultReg(RC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+      .addReg(Op0, getKillRegState(Op0IsKill));
+      return ResultReg;
+    } else
+      // Types differ: a shift by zero degenerates to the plain extension.
+      return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+  }
+
+  // Don't deal with undefined shifts.
+  if (Shift >= DstBits)
+    return 0;
+
+  // For immediate shifts we can fold the zero-/sign-extension into the shift.
+  // {S|U}BFM Wd, Wn, #r, #s
+  // Wd<s-r:0> = Wn<s:r> when r <= s
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = ashr i16 %1, 4
+  // Wd<7-4:0> = Wn<7:4>
+  // 0b1111_1111_1111_1111__1111_1111_1111_1010 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+  // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = ashr i16 %1, 8
+  // Wd<7-7:0> = Wn<7:7>
+  // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+  // %2 = ashr i16 %1, 12
+  // Wd<7-7:0> = Wn<7:7> <- clamp r to 7
+  // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+  // A zero-extended source shifted by at least its own width leaves only the
+  // zero padding, so the result is the constant 0.
+  if (Shift >= SrcBits && IsZExt)
+    return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+  // ImmR is the (clamped) shift amount, ImmS the index of the source's top bit.
+  unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+  unsigned ImmS = SrcBits - 1;
+  // Indexed as [IsZExt][Is64Bit].
+  static const unsigned OpcTable[2][2] = {
+    {AArch64::SBFMWri, AArch64::SBFMXri},
+    {AArch64::UBFMWri, AArch64::UBFMXri}
+  };
+  unsigned Opc = OpcTable[IsZExt][Is64Bit];
+  // A source of at most 32 bits feeding a 64-bit result is first wrapped into
+  // a GPR64 virtual register (RC is GPR64 here) via SUBREG_TO_REG.
+  if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+    unsigned TmpReg = MRI.createVirtualRegister(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+        .addImm(0)
+        .addReg(Op0, getKillRegState(Op0IsKill))
+        .addImm(AArch64::sub_32);
+    Op0 = TmpReg;
+    Op0IsKill = true;
+  }
+  return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
+unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                     bool IsZExt) {
   assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
 
   // FastISel does not have plumbing to deal with extensions where the SrcVT or
@@ -1768,24 +4215,24 @@
   default:
     return 0;
   case MVT::i1:
-    return Emiti1Ext(SrcReg, DestVT, isZExt);
+    return emiti1Ext(SrcReg, DestVT, IsZExt);
   case MVT::i8:
     if (DestVT == MVT::i64)
-      Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+      Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
     else
-      Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+      Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
     Imm = 7;
     break;
   case MVT::i16:
     if (DestVT == MVT::i64)
-      Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+      Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
     else
-      Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+      Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
     Imm = 15;
     break;
   case MVT::i32:
     assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
-    Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+    Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
     Imm = 31;
     break;
   }
@@ -1803,45 +4250,167 @@
     SrcReg = Src64;
   }
 
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
-      .addReg(SrcReg)
-      .addImm(0)
-      .addImm(Imm);
-
-  return ResultReg;
+  const TargetRegisterClass *RC =
+      (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm);
 }
 
-bool AArch64FastISel::SelectIntExt(const Instruction *I) {
-  // On ARM, in general, integer casts don't involve legal types; this code
-  // handles promotable integers.  The high bits for a type smaller than
-  // the register size are assumed to be undefined.
-  Type *DestTy = I->getType();
-  Value *Src = I->getOperand(0);
-  Type *SrcTy = Src->getType();
+/// Return true if the opcode of \p LI is one of the load opcodes that
+/// optimizeIntExtLoad() accepts as already zero-extending.
+static bool isZExtLoad(const MachineInstr *LI) {
+  switch (LI->getOpcode()) {
+  // ...Wi / ...BBi / ...HHi (LDUR) forms.
+  case AArch64::LDURBBi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURWi:
+  // ...ui forms.
+  case AArch64::LDRBBui:
+  case AArch64::LDRHHui:
+  case AArch64::LDRWui:
+  // ...roW forms.
+  case AArch64::LDRBBroW:
+  case AArch64::LDRHHroW:
+  case AArch64::LDRWroW:
+  // ...roX forms.
+  case AArch64::LDRBBroX:
+  case AArch64::LDRHHroX:
+  case AArch64::LDRWroX:
+    return true;
+  default:
+    return false;
+  }
+}
 
-  bool isZExt = isa<ZExtInst>(I);
-  unsigned SrcReg = getRegForValue(Src);
-  if (!SrcReg)
+/// Return true if the opcode of \p LI is one of the load opcodes that
+/// optimizeIntExtLoad() accepts as already sign-extending.
+static bool isSExtLoad(const MachineInstr *LI) {
+  switch (LI->getOpcode()) {
+  // LDURS* forms.
+  case AArch64::LDURSBWi:
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSHWi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSWi:
+  // ...ui forms.
+  case AArch64::LDRSBWui:
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHWui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  // ...roW forms.
+  case AArch64::LDRSBWroW:
+  case AArch64::LDRSBXroW:
+  case AArch64::LDRSHWroW:
+  case AArch64::LDRSHXroW:
+  case AArch64::LDRSWroW:
+  // ...roX forms.
+  case AArch64::LDRSBWroX:
+  case AArch64::LDRSBXroX:
+  case AArch64::LDRSHWroX:
+  case AArch64::LDRSHXroX:
+  case AArch64::LDRSWroX:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Try to satisfy the zext/sext instruction \p I by reusing the load that
+/// feeds it instead of emitting a separate extend.
+///
+/// Succeeds only when operand 0 of \p I is a single-use load that already has
+/// a register, that register has a unique defining machine instruction, and
+/// the underlying load opcode matches the kind of extension requested
+/// (isZExtLoad() for zext, isSExtLoad() for sext).
+///
+/// \returns true if \p I was mapped to a register; false if the caller has to
+/// select the extend some other way.
+bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
+                                         MVT SrcVT) {
+  const auto *LI = dyn_cast<LoadInst>(I->getOperand(0));
+  if (!LI || !LI->hasOneUse())
     return false;
 
-  EVT SrcEVT = TLI.getValueType(SrcTy, true);
-  EVT DestEVT = TLI.getValueType(DestTy, true);
-  if (!SrcEVT.isSimple())
-    return false;
-  if (!DestEVT.isSimple())
+  // Check if the load instruction has already been selected.
+  unsigned Reg = lookUpRegForValue(LI);
+  if (!Reg)
     return false;
 
-  MVT SrcVT = SrcEVT.getSimpleVT();
-  MVT DestVT = DestEVT.getSimpleVT();
-  unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
-  if (ResultReg == 0)
+  MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+  if (!MI)
     return false;
-  UpdateValueMap(I, ResultReg);
+
+  // Check if the correct load instruction has been emitted - SelectionDAG might
+  // have emitted a zero-extending load, but we need a sign-extending load.
+  bool IsZExt = isa<ZExtInst>(I);
+  const auto *LoadMI = MI;
+  // If Reg is defined by a sub_32 COPY, look through it to the instruction
+  // defining the COPY's source register.
+  if (LoadMI->getOpcode() == TargetOpcode::COPY &&
+      LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) {
+    unsigned LoadReg = MI->getOperand(1).getReg();
+    LoadMI = MRI.getUniqueVRegDef(LoadReg);
+    assert(LoadMI && "Expected valid instruction");
+  }
+  if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI)))
+    return false;
+
+  // Nothing to be done.
+  if (RetVT != MVT::i64 || SrcVT > MVT::i32) {
+    updateValueMap(I, Reg);
+    return true;
+  }
+
+  // From here on the result is i64 and the source is at most i32.
+  if (IsZExt) {
+    // Wrap the 32-bit load result into a fresh GPR64 register.
+    unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), Reg64)
+        .addImm(0)
+        .addReg(Reg, getKillRegState(true))
+        .addImm(AArch64::sub_32);
+    Reg = Reg64;
+  } else {
+    // MI is asserted to be a sub_32 COPY: use the COPY's source register
+    // directly and delete the COPY.
+    assert((MI->getOpcode() == TargetOpcode::COPY &&
+            MI->getOperand(1).getSubReg() == AArch64::sub_32) &&
+           "Expected copy instruction");
+    Reg = MI->getOperand(1).getReg();
+    MI->eraseFromParent();
+  }
+  updateValueMap(I, Reg);
   return true;
 }
 
-bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
+/// Select a zext/sext instruction.
+///
+/// Tries, in order: reusing an extending load via optimizeIntExtLoad(),
+/// treating the extend as a no-op when the source is a function argument that
+/// carries the matching zext/sext attribute, and finally emitting an explicit
+/// extend with emitIntExt().
+bool AArch64FastISel::selectIntExt(const Instruction *I) {
+  assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+         "Unexpected integer extend instruction.");
+  const Value *Src = I->getOperand(0);
+
+  MVT RetVT, SrcVT;
+  if (!isTypeSupported(I->getType(), RetVT) ||
+      !isTypeSupported(Src->getType(), SrcVT))
+    return false;
+
+  // Try to optimize already sign-/zero-extended values from load instructions.
+  if (optimizeIntExtLoad(I, RetVT, SrcVT))
+    return true;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+  const bool SrcIsKill = hasTrivialKill(Src);
+
+  // Try to optimize already sign-/zero-extended values from function arguments.
+  const bool IsZExt = isa<ZExtInst>(I);
+  const auto *Arg = dyn_cast<Argument>(Src);
+  const bool AlreadyExtended =
+      Arg && (IsZExt ? Arg->hasZExtAttr() : Arg->hasSExtAttr());
+
+  if (!AlreadyExtended) {
+    unsigned ExtReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt);
+    if (!ExtReg)
+      return false;
+
+    updateValueMap(I, ExtReg);
+    return true;
+  }
+
+  // The argument is already extended; a 64-bit result only needs the source
+  // register wrapped into a GPR64 register.
+  if (RetVT == MVT::i64 && SrcVT != MVT::i64) {
+    unsigned WideReg = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), WideReg)
+        .addImm(0)
+        .addReg(SrcReg, getKillRegState(SrcIsKill))
+        .addImm(AArch64::sub_32);
+    SrcReg = WideReg;
+  }
+
+  // Conservatively clear all kill flags from all uses, because we are
+  // replacing a sign-/zero-extend instruction at IR level with a nop at MI
+  // level. The result of the instruction at IR level might have been
+  // trivially dead, which is no longer true.
+  if (unsigned UseReg = lookUpRegForValue(I))
+    MRI.clearKillFlags(UseReg);
+
+  updateValueMap(I, SrcReg);
+  return true;
+}
+
+bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) {
   EVT DestEVT = TLI.getValueType(I->getType(), true);
   if (!DestEVT.isSimple())
     return false;
@@ -1851,144 +4420,529 @@
     return false;
 
   unsigned DivOpc;
-  bool is64bit = (DestVT == MVT::i64);
+  bool Is64bit = (DestVT == MVT::i64);
   switch (ISDOpcode) {
   default:
     return false;
   case ISD::SREM:
-    DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
+    DivOpc = Is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
     break;
   case ISD::UREM:
-    DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
+    DivOpc = Is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
     break;
   }
-  unsigned MSubOpc = is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
+  unsigned MSubOpc = Is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
   unsigned Src0Reg = getRegForValue(I->getOperand(0));
   if (!Src0Reg)
     return false;
+  bool Src0IsKill = hasTrivialKill(I->getOperand(0));
 
   unsigned Src1Reg = getRegForValue(I->getOperand(1));
   if (!Src1Reg)
     return false;
+  bool Src1IsKill = hasTrivialKill(I->getOperand(1));
 
-  unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg)
-      .addReg(Src0Reg)
-      .addReg(Src1Reg);
+  const TargetRegisterClass *RC =
+      (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false,
+                                     Src1Reg, /*IsKill=*/false);
+  assert(QuotReg && "Unexpected DIV instruction emission failure.");
   // The remainder is computed as numerator - (quotient * denominator) using the
   // MSUB instruction.
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg)
-      .addReg(QuotReg)
-      .addReg(Src1Reg)
-      .addReg(Src0Reg);
-  UpdateValueMap(I, ResultReg);
+  unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true,
+                                        Src1Reg, Src1IsKill, Src0Reg,
+                                        Src0IsKill);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool AArch64FastISel::SelectMul(const Instruction *I) {
-  EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true);
-  if (!SrcEVT.isSimple())
+/// Select an integer multiply.
+///
+/// Vector multiplies are forwarded to selectBinaryOp(). A scalar multiply by a
+/// constant power of two is turned into a left shift, folding a non-free
+/// zext/sext of the other operand into the shift when possible. Everything
+/// else (or a failed shift emission) goes through emitMul_rr().
+bool AArch64FastISel::selectMul(const Instruction *I) {
+  MVT VT;
+  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
     return false;
-  MVT SrcVT = SrcEVT.getSimpleVT();
 
-  // Must be simple value type.  Don't handle vectors.
-  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
-      SrcVT != MVT::i8)
+  if (VT.isVector())
+    return selectBinaryOp(I, ISD::MUL);
+
+  const Value *Src0 = I->getOperand(0);
+  const Value *Src1 = I->getOperand(1);
+  // Canonicalize a power-of-two constant to the right-hand side.
+  if (const auto *C = dyn_cast<ConstantInt>(Src0))
+    if (C->getValue().isPowerOf2())
+      std::swap(Src0, Src1);
+
+  // Try to simplify to a shift instruction.
+  if (const auto *C = dyn_cast<ConstantInt>(Src1))
+    if (C->getValue().isPowerOf2()) {
+      uint64_t ShiftVal = C->getValue().logBase2();
+      MVT SrcVT = VT;
+      bool IsZExt = true;
+      // TmpVT (rather than a second "VT") keeps the outer VT, which is passed
+      // to emitLSL_ri() below, from being shadowed.
+      if (const auto *ZExt = dyn_cast<ZExtInst>(Src0)) {
+        if (!isIntExtFree(ZExt)) {
+          MVT TmpVT;
+          if (isValueAvailable(ZExt) &&
+              isTypeSupported(ZExt->getSrcTy(), TmpVT)) {
+            SrcVT = TmpVT;
+            IsZExt = true;
+            Src0 = ZExt->getOperand(0);
+          }
+        }
+      } else if (const auto *SExt = dyn_cast<SExtInst>(Src0)) {
+        if (!isIntExtFree(SExt)) {
+          MVT TmpVT;
+          if (isValueAvailable(SExt) &&
+              isTypeSupported(SExt->getSrcTy(), TmpVT)) {
+            SrcVT = TmpVT;
+            IsZExt = false;
+            Src0 = SExt->getOperand(0);
+          }
+        }
+      }
+
+      unsigned Src0Reg = getRegForValue(Src0);
+      if (!Src0Reg)
+        return false;
+      bool Src0IsKill = hasTrivialKill(Src0);
+
+      unsigned ResultReg =
+          emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt);
+
+      // On failure fall through to the generic multiply below.
+      if (ResultReg) {
+        updateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+
+  // Generic path: uses the instruction's original operands, not the possibly
+  // swapped/unwrapped Src0/Src1 above.
+  unsigned Src0Reg = getRegForValue(I->getOperand(0));
+  if (!Src0Reg)
+    return false;
+  bool Src0IsKill = hasTrivialKill(I->getOperand(0));
+
+  unsigned Src1Reg = getRegForValue(I->getOperand(1));
+  if (!Src1Reg)
+    return false;
+  bool Src1IsKill = hasTrivialKill(I->getOperand(1));
+
+  unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill);
+
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// Select shl / lshr / ashr.
+///
+/// Vector shifts are forwarded to selectOperator(). Scalar shifts by a
+/// constant use the emit*_ri helpers and may fold a non-free zext/sext of the
+/// shifted value into the shift; all other scalar shifts use the emit*_rr
+/// helpers.
+bool AArch64FastISel::selectShift(const Instruction *I) {
+  MVT RetVT;
+  if (!isTypeSupported(I->getType(), RetVT, /*IsVectorAllowed=*/true))
+    return false;
+
+  if (RetVT.isVector())
+    return selectOperator(I, I->getOpcode());
+
+  const Value *LHS = I->getOperand(0);
+  const Value *RHS = I->getOperand(1);
+  const auto *ShiftC = dyn_cast<ConstantInt>(RHS);
+
+  // Variable shift amount: both operands live in registers.
+  if (!ShiftC) {
+    unsigned LHSReg = getRegForValue(LHS);
+    if (!LHSReg)
+      return false;
+    bool LHSIsKill = hasTrivialKill(LHS);
+
+    unsigned RHSReg = getRegForValue(RHS);
+    if (!RHSReg)
+      return false;
+    bool RHSIsKill = hasTrivialKill(RHS);
+
+    unsigned ShiftReg = 0;
+    switch (I->getOpcode()) {
+    case Instruction::Shl:
+      ShiftReg = emitLSL_rr(RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+      break;
+    case Instruction::AShr:
+      ShiftReg = emitASR_rr(RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+      break;
+    case Instruction::LShr:
+      ShiftReg = emitLSR_rr(RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+      break;
+    default:
+      llvm_unreachable("Unexpected instruction.");
+    }
+
+    if (!ShiftReg)
+      return false;
+
+    updateValueMap(I, ShiftReg);
+    return true;
+  }
+
+  // Constant shift amount: try to fold an extend of the shifted value.
+  uint64_t ShiftVal = ShiftC->getZExtValue();
+  MVT SrcVT = RetVT;
+  bool IsZExt = I->getOpcode() != Instruction::AShr;
+  if (const auto *ZExt = dyn_cast<ZExtInst>(LHS)) {
+    MVT TmpVT;
+    if (!isIntExtFree(ZExt) && isValueAvailable(ZExt) &&
+        isTypeSupported(ZExt->getSrcTy(), TmpVT)) {
+      SrcVT = TmpVT;
+      IsZExt = true;
+      LHS = ZExt->getOperand(0);
+    }
+  } else if (const auto *SExt = dyn_cast<SExtInst>(LHS)) {
+    MVT TmpVT;
+    if (!isIntExtFree(SExt) && isValueAvailable(SExt) &&
+        isTypeSupported(SExt->getSrcTy(), TmpVT)) {
+      SrcVT = TmpVT;
+      IsZExt = false;
+      LHS = SExt->getOperand(0);
+    }
+  }
+
+  unsigned LHSReg = getRegForValue(LHS);
+  if (!LHSReg)
+    return false;
+  bool LHSIsKill = hasTrivialKill(LHS);
+
+  unsigned ShiftReg = 0;
+  switch (I->getOpcode()) {
+  case Instruction::Shl:
+    ShiftReg = emitLSL_ri(RetVT, SrcVT, LHSReg, LHSIsKill, ShiftVal, IsZExt);
+    break;
+  case Instruction::AShr:
+    ShiftReg = emitASR_ri(RetVT, SrcVT, LHSReg, LHSIsKill, ShiftVal, IsZExt);
+    break;
+  case Instruction::LShr:
+    ShiftReg = emitLSR_ri(RetVT, SrcVT, LHSReg, LHSIsKill, ShiftVal, IsZExt);
+    break;
+  default:
+    llvm_unreachable("Unexpected instruction.");
+  }
+  if (!ShiftReg)
+    return false;
+
+  updateValueMap(I, ShiftReg);
+  return true;
+}
+
+/// Select a bitcast between an integer and a floating-point type of the same
+/// width (i32 <-> f32, i64 <-> f64) as a single FMOV.
+///
+/// \returns false for any other type pair, for types that are not legal, or
+/// when the operand or result register cannot be obtained.
+bool AArch64FastISel::selectBitCast(const Instruction *I) {
+  MVT RetVT, SrcVT;
+
+  if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT))
+    return false;
+  if (!isTypeLegal(I->getType(), RetVT))
     return false;
 
   unsigned Opc;
-  unsigned ZReg;
-  switch (SrcVT.SimpleTy) {
-  default:
+  // Pick the FMOV variant for the (SrcVT -> RetVT) pair.
+  if (RetVT == MVT::f32 && SrcVT == MVT::i32)
+    Opc = AArch64::FMOVWSr;
+  else if (RetVT == MVT::f64 && SrcVT == MVT::i64)
+    Opc = AArch64::FMOVXDr;
+  else if (RetVT == MVT::i32 && SrcVT == MVT::f32)
+    Opc = AArch64::FMOVSWr;
+  else if (RetVT == MVT::i64 && SrcVT == MVT::f64)
+    Opc = AArch64::FMOVDXr;
+  else
     return false;
-  case MVT::i8:
-  case MVT::i16:
-  case MVT::i32:
-    ZReg = AArch64::WZR;
-    Opc = AArch64::MADDWrrr;
-    SrcVT = MVT::i32;
-    break;
-  case MVT::i64:
-    ZReg = AArch64::XZR;
-    Opc = AArch64::MADDXrrr;
-    break;
+
+  // The opcode selection above only lets these four result types through.
+  const TargetRegisterClass *RC = nullptr;
+  switch (RetVT.SimpleTy) {
+  default: llvm_unreachable("Unexpected value type.");
+  case MVT::i32: RC = &AArch64::GPR32RegClass; break;
+  case MVT::i64: RC = &AArch64::GPR64RegClass; break;
+  case MVT::f32: RC = &AArch64::FPR32RegClass; break;
+  case MVT::f64: RC = &AArch64::FPR64RegClass; break;
   }
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (!Op0Reg)
+    return false;
+  bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+  unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg, Op0IsKill);
 
-  unsigned Src0Reg = getRegForValue(I->getOperand(0));
-  if (!Src0Reg)
+  if (!ResultReg)
     return false;
 
-  unsigned Src1Reg = getRegForValue(I->getOperand(1));
-  if (!Src1Reg)
-    return false;
-
-  // Create the base instruction, then add the operands.
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
-      .addReg(Src0Reg)
-      .addReg(Src1Reg)
-      .addReg(ZReg);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) {
+/// Select an frem on f32 or f64 by lowering it to a call to the
+/// RTLIB::REM_F32 / RTLIB::REM_F64 library routine. Other types are rejected.
+bool AArch64FastISel::selectFRem(const Instruction *I) {
+  MVT RetVT;
+  if (!isTypeLegal(I->getType(), RetVT))
+    return false;
+
+  RTLIB::Libcall LC;
+  if (RetVT == MVT::f32)
+    LC = RTLIB::REM_F32;
+  else if (RetVT == MVT::f64)
+    LC = RTLIB::REM_F64;
+  else
+    return false;
+
+  // Pass every operand of the instruction, in order, as a call argument.
+  ArgListTy Args;
+  Args.reserve(I->getNumOperands());
+  for (auto &Operand : I->operands()) {
+    ArgListEntry Entry;
+    Entry.Val = Operand;
+    Entry.Ty = Operand->getType();
+    Args.push_back(Entry);
+  }
+
+  CallLoweringInfo CLI;
+  CLI.setCallee(TLI.getLibcallCallingConv(LC), I->getType(),
+                TLI.getLibcallName(LC), std::move(Args));
+  if (lowerCallTo(CLI)) {
+    updateValueMap(I, CLI.ResultReg);
+    return true;
+  }
+  return false;
+}
+
+/// Select a signed division.
+///
+/// Only i32/i64 divisions by a non-zero constant that is a power of two, or
+/// the negation of one, are lowered here; every other case is forwarded to
+/// selectBinaryOp().
+///
+/// The division becomes an arithmetic shift right by log2(|C|). Unless the
+/// instruction is marked exact, the dividend is first biased by (|C| - 1) when
+/// it is negative, so that the shift rounds toward zero. For a negative
+/// divisor the shifted value is additionally negated; this applies to the
+/// exact case as well, since the shift alone only divides by |C|.
+bool AArch64FastISel::selectSDiv(const Instruction *I) {
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  if (!isa<ConstantInt>(I->getOperand(1)))
+    return selectBinaryOp(I, ISD::SDIV);
+
+  const APInt &C = cast<ConstantInt>(I->getOperand(1))->getValue();
+  if ((VT != MVT::i32 && VT != MVT::i64) || !C ||
+      !(C.isPowerOf2() || (-C).isPowerOf2()))
+    return selectBinaryOp(I, ISD::SDIV);
+
+  unsigned Lg2 = C.countTrailingZeros();
+  unsigned Src0Reg = getRegForValue(I->getOperand(0));
+  if (!Src0Reg)
+    return false;
+  bool Src0IsKill = hasTrivialKill(I->getOperand(0));
+
+  // The value that is finally shifted: the dividend itself for an exact
+  // division, otherwise the rounding-corrected dividend computed below.
+  unsigned DividendReg = Src0Reg;
+  bool DividendIsKill = Src0IsKill;
+
+  if (!cast<BinaryOperator>(I)->isExact()) {
+    int64_t Pow2MinusOne = (1ULL << Lg2) - 1;
+    unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne);
+    if (!AddReg)
+      return false;
+
+    // (Src0 < 0) ? Src0 + (Pow2 - 1) : Src0;
+    if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0))
+      return false;
+
+    unsigned SelectOpc;
+    const TargetRegisterClass *RC;
+    if (VT == MVT::i64) {
+      SelectOpc = AArch64::CSELXr;
+      RC = &AArch64::GPR64RegClass;
+    } else {
+      SelectOpc = AArch64::CSELWr;
+      RC = &AArch64::GPR32RegClass;
+    }
+    unsigned SelectReg =
+        fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg,
+                         Src0IsKill, AArch64CC::LT);
+    if (!SelectReg)
+      return false;
+
+    DividendReg = SelectReg;
+    DividendIsKill = true;
+  }
+
+  // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also
+  // negate the result, which is done as (zero - (Dividend ashr Lg2)).
+  unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+  unsigned ResultReg;
+  if (C.isNegative())
+    ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true,
+                              DividendReg, DividendIsKill, AArch64_AM::ASR,
+                              Lg2);
+  else
+    ResultReg = emitASR_ri(VT, VT, DividendReg, DividendIsKill, Lg2);
+
+  if (!ResultReg)
+    return false;
+
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+/// Return the register holding GEP index \p Idx, sign-extended to pointer
+/// width if it is narrower, together with its kill flag. The register is 0 if
+/// the index could not be materialized.
+///
+/// This largely duplicates the generic FastISel getRegForGEPIndex code; the
+/// AArch64 copy exists because the generic one would fail during the
+/// sign-extend emission.
+std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
+  std::pair<unsigned, bool> RegAndKill(getRegForValue(Idx), false);
+  // Unhandled operand. Halt "fast" selection and bail.
+  if (RegAndKill.first == 0)
+    return RegAndKill;
+
+  RegAndKill.second = hasTrivialKill(Idx);
+
+  // If the index is smaller or larger than intptr_t, truncate or extend it.
+  const MVT PtrVT = TLI.getPointerTy();
+  const EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
+  if (IdxVT.bitsLT(PtrVT)) {
+    RegAndKill.first = emitIntExt(IdxVT.getSimpleVT(), RegAndKill.first, PtrVT,
+                                  /*IsZExt=*/false);
+    RegAndKill.second = true;
+  } else if (IdxVT.bitsGT(PtrVT)) {
+    llvm_unreachable("AArch64 FastISel doesn't support types larger than i64");
+  }
+  return RegAndKill;
+}
+
+/// This is mostly a copy of the existing FastISel GEP code, but we have to
+/// duplicate it for AArch64, because otherwise we would bail out even for
+/// simple cases. This is because the standard fastEmit functions don't cover
+/// MUL at all and ADD is lowered very inefficiently.
+///
+/// Walks the GEP indices, accumulating constant contributions in a single
+/// running offset and emitting add/mul instructions only for variable
+/// indices. \returns false as soon as any register or instruction cannot be
+/// produced.
+bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
+  unsigned N = getRegForValue(I->getOperand(0));
+  if (!N)
+    return false;
+  bool NIsKill = hasTrivialKill(I->getOperand(0));
+
+  // Keep a running tab of the total offset to coalesce multiple N = N + Offset
+  // into a single N = N + TotalOffset.
+  uint64_t TotalOffs = 0;
+  Type *Ty = I->getOperand(0)->getType();
+  MVT VT = TLI.getPointerTy();
+  for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) {
+    const Value *Idx = *OI;
+    if (auto *StTy = dyn_cast<StructType>(Ty)) {
+      // Struct member: the index is a constant field number.
+      unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+      // N = N + Offset
+      if (Field)
+        TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
+      Ty = StTy->getElementType(Field);
+    } else {
+      Ty = cast<SequentialType>(Ty)->getElementType();
+      // If this is a constant subscript, handle it quickly.
+      if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+        if (CI->isZero())
+          continue;
+        // N = N + Offset
+        // The index is read as signed while TotalOffs is unsigned, so a
+        // negative index contributes via unsigned wrap-around.
+        TotalOffs +=
+            DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue();
+        continue;
+      }
+      // Variable index: flush the pending constant offset first.
+      if (TotalOffs) {
+        N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+        if (!N)
+          return false;
+        NIsKill = true;
+        TotalOffs = 0;
+      }
+
+      // N = N + Idx * ElementSize;
+      uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+      std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
+      unsigned IdxN = Pair.first;
+      bool IdxNIsKill = Pair.second;
+      if (!IdxN)
+        return false;
+
+      if (ElementSize != 1) {
+        unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize);
+        if (!C)
+          return false;
+        // The trailing "true" is the kill flag for the constant register C.
+        IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true);
+        if (!IdxN)
+          return false;
+        IdxNIsKill = true;
+      }
+      N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
+      if (!N)
+        return false;
+    }
+  }
+  // Apply whatever constant offset is still pending after the last index.
+  if (TotalOffs) {
+    N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+    if (!N)
+      return false;
+  }
+  updateValueMap(I, N);
+  return true;
+}
+
+/// Dispatch on the IR opcode of \p I to the matching select* routine.
+///
+/// For SRem/URem, BitCast, FPToSI, Trunc and SIToFP a generic selector
+/// (selectBinaryOp, FastISel::selectBitCast or selectCast) is tried first and
+/// the AArch64-specific routine is used only if that fails. Opcodes without a
+/// case here are handed to selectOperator().
+bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
   switch (I->getOpcode()) {
   default:
     break;
-  case Instruction::Load:
-    return SelectLoad(I);
-  case Instruction::Store:
-    return SelectStore(I);
-  case Instruction::Br:
-    return SelectBranch(I);
-  case Instruction::IndirectBr:
-    return SelectIndirectBr(I);
-  case Instruction::FCmp:
-  case Instruction::ICmp:
-    return SelectCmp(I);
-  case Instruction::Select:
-    return SelectSelect(I);
-  case Instruction::FPExt:
-    return SelectFPExt(I);
-  case Instruction::FPTrunc:
-    return SelectFPTrunc(I);
-  case Instruction::FPToSI:
-    return SelectFPToInt(I, /*Signed=*/true);
-  case Instruction::FPToUI:
-    return SelectFPToInt(I, /*Signed=*/false);
-  case Instruction::SIToFP:
-    return SelectIntToFP(I, /*Signed=*/true);
-  case Instruction::UIToFP:
-    return SelectIntToFP(I, /*Signed=*/false);
+  case Instruction::Add:
+  case Instruction::Sub:
+    return selectAddSub(I);
+  case Instruction::Mul:
+    return selectMul(I);
+  case Instruction::SDiv:
+    return selectSDiv(I);
   case Instruction::SRem:
-    return SelectRem(I, ISD::SREM);
+    if (!selectBinaryOp(I, ISD::SREM))
+      return selectRem(I, ISD::SREM);
+    return true;
   case Instruction::URem:
-    return SelectRem(I, ISD::UREM);
-  case Instruction::Call:
-    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
-      return SelectIntrinsicCall(*II);
-    return SelectCall(I);
-  case Instruction::Ret:
-    return SelectRet(I);
-  case Instruction::Trunc:
-    return SelectTrunc(I);
+    if (!selectBinaryOp(I, ISD::UREM))
+      return selectRem(I, ISD::UREM);
+    return true;
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return selectShift(I);
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return selectLogicalOp(I);
+  case Instruction::Br:
+    return selectBranch(I);
+  case Instruction::IndirectBr:
+    return selectIndirectBr(I);
+  case Instruction::BitCast:
+    if (!FastISel::selectBitCast(I))
+      return selectBitCast(I);
+    return true;
+  case Instruction::FPToSI:
+    if (!selectCast(I, ISD::FP_TO_SINT))
+      return selectFPToInt(I, /*Signed=*/true);
+    return true;
+  case Instruction::FPToUI:
+    return selectFPToInt(I, /*Signed=*/false);
   case Instruction::ZExt:
   case Instruction::SExt:
-    return SelectIntExt(I);
-  case Instruction::Mul:
-    // FIXME: This really should be handled by the target-independent selector.
-    return SelectMul(I);
+    return selectIntExt(I);
+  case Instruction::Trunc:
+    if (!selectCast(I, ISD::TRUNCATE))
+      return selectTrunc(I);
+    return true;
+  case Instruction::FPExt:
+    return selectFPExt(I);
+  case Instruction::FPTrunc:
+    return selectFPTrunc(I);
+  case Instruction::SIToFP:
+    if (!selectCast(I, ISD::SINT_TO_FP))
+      return selectIntToFP(I, /*Signed=*/true);
+    return true;
+  case Instruction::UIToFP:
+    return selectIntToFP(I, /*Signed=*/false);
+  case Instruction::Load:
+    return selectLoad(I);
+  case Instruction::Store:
+    return selectStore(I);
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+    return selectCmp(I);
+  case Instruction::Select:
+    return selectSelect(I);
+  case Instruction::Ret:
+    return selectRet(I);
+  case Instruction::FRem:
+    return selectFRem(I);
+  case Instruction::GetElementPtr:
+    return selectGetElementPtr(I);
   }
-  return false;
+
+  // fall-back to target-independent instruction selection.
+  return selectOperator(I, I->getOpcode());
+  // The statement below is never executed (it follows the return); it only
+  // references CC_AArch64_DarwinPCS_VarArg so the symbol counts as used.
   // Silence warnings.
   (void)&CC_AArch64_DarwinPCS_VarArg;
 }
 
 namespace llvm {
+/// Allocate a new AArch64FastISel for \p FuncInfo / \p LibInfo with `new` and
+/// return it as a FastISel pointer.
-llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo,
-                                        const TargetLibraryInfo *libInfo) {
-  return new AArch64FastISel(funcInfo, libInfo);
+llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                        const TargetLibraryInfo *LibInfo) {
+  return new AArch64FastISel(FuncInfo, LibInfo);
 }
 }

diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 9c33717..a7779d6 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp

@@ -17,16 +17,16 @@
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -86,13 +86,14 @@
   const MachineFrameInfo *MFI = MF.getFrameInfo();
 
 #ifndef NDEBUG
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
   assert(!RegInfo->needsStackRealignment(MF) &&
          "No stack realignment on AArch64!");
 #endif
 
   return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
-          MFI->isFrameAddressTaken());
+          MFI->isFrameAddressTaken() || MFI->hasStackMap() ||
+          MFI->hasPatchPoint());
 }
 
 /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
@@ -109,13 +110,13 @@
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator I) const {
   const AArch64InstrInfo *TII =
-      static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
   DebugLoc DL = I->getDebugLoc();
   int Opc = I->getOpcode();
   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
 
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   if (!TFI->hasReservedCallFrame(MF)) {
     unsigned Align = getStackAlignment();
 
@@ -131,7 +132,7 @@
       // FIXME: in-function stack adjustment for calls is limited to 24-bits
       // because there's no guaranteed temporary register available.
       //
-      // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable.
+      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
       // 1) For offset <= 12-bit, we use LSL #0
       // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
       // LSL #0, and the other uses LSL #12.
@@ -158,7 +159,7 @@
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
-  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
   // Add callee saved registers to move list.
@@ -166,7 +167,7 @@
   if (CSI.empty())
     return;
 
-  const DataLayout *TD = MF.getTarget().getDataLayout();
+  const DataLayout *TD = MF.getSubtarget().getDataLayout();
   bool HasFP = hasFP(MF);
 
   // Calculate amount of bytes used for return address storing.
@@ -205,8 +206,8 @@
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *Fn = MF.getFunction();
   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getTarget().getRegisterInfo());
-  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+      MF.getSubtarget().getRegisterInfo());
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
@@ -300,7 +301,7 @@
     TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false);
 
   if (needsFrameMoves) {
-    const DataLayout *TD = MF.getTarget().getDataLayout();
+    const DataLayout *TD = MF.getSubtarget().getDataLayout();
     const int StackGrowth = -TD->getPointerSize(0);
     unsigned FramePtr = RegInfo->getFrameRegister(MF);
 
@@ -435,9 +436,9 @@
   assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const AArch64InstrInfo *TII =
-      static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getTarget().getRegisterInfo());
+      MF.getSubtarget().getRegisterInfo());
   DebugLoc DL = MBBI->getDebugLoc();
   unsigned RetOpcode = MBBI->getOpcode();
 
@@ -548,7 +549,7 @@
                                                      bool PreferFP) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getTarget().getRegisterInfo());
+      MF.getSubtarget().getRegisterInfo());
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   int FPOffset = MFI->getObjectOffset(FI) + 16;
   int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
@@ -617,7 +618,7 @@
     const std::vector<CalleeSavedInfo> &CSI,
     const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   unsigned Count = CSI.size();
   DebugLoc DL;
   assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
@@ -693,7 +694,7 @@
     const std::vector<CalleeSavedInfo> &CSI,
     const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   unsigned Count = CSI.size();
   DebugLoc DL;
   assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
@@ -761,7 +762,7 @@
 void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
     MachineFunction &MF, RegScavenger *RS) const {
   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
-      MF.getTarget().getRegisterInfo());
+      MF.getSubtarget().getRegisterInfo());
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   MachineRegisterInfo *MRI = &MF.getRegInfo();
   SmallVector<unsigned, 4> UnspilledCSGPRs;

diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 7686e6f..df3875f 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64_FRAMELOWERING_H
-#define AArch64_FRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
 
 #include "llvm/Target/TargetFrameLowering.h"
 

diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3f49fab..87a6d80 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

@@ -303,7 +303,7 @@
 
 /// \brief Determine wether it is worth to fold V into an extended register.
 bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
-  // it hurts if the a value is used at least twice, unless we are optimizing
+  // it hurts if the value is used at least twice, unless we are optimizing
   // for code size.
   if (ForCodeSize || V.hasOneUse())
     return true;
@@ -777,6 +777,21 @@
   return false;
 }
 
+// Check if the given immediate is preferred by ADD. If an immediate can be
+// encoded in an ADD, or it can be encoded in an "ADD LSL #12" and can not be
+// encoded by one MOVZ, return true.
+static bool isPreferredADD(int64_t ImmOff) {
+  // Constant in [0x0, 0xfff] can be encoded in ADD.
+  if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
+    return true;
+  // Check if it can be encoded in an "ADD LSL #12".
+  if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
+    // A single MOVZ is faster than an "ADD LSL #12", so ignore such constants.
+    return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
+           (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
+  return false;
+}
+
 bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
                                             SDValue &Base, SDValue &Offset,
                                             SDValue &SignExtend,
@@ -786,11 +801,6 @@
   SDValue LHS = N.getOperand(0);
   SDValue RHS = N.getOperand(1);
 
-  // We don't want to match immediate adds here, because they are better lowered
-  // to the register-immediate addressing modes.
-  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
-    return false;
-
   // Check if this particular node is reused in any non-memory related
   // operation.  If yes, do not try to fold this node into the address
   // computation, since the computation will be kept.
@@ -800,6 +810,36 @@
       return false;
   }
 
+  // Watch out if RHS is a wide immediate: it cannot be selected into the
+  // [BaseReg+Imm] addressing mode, and it may not be encodable in an ADD/SUB
+  // either. Instead it will use the [BaseReg + 0] addressing mode and generate
+  // instructions like:
+  //     MOV  X0, WideImmediate
+  //     ADD  X1, BaseReg, X0
+  //     LDR  X2, [X1, 0]
+  // In such situations, using the [BaseReg, XReg] addressing mode can save one
+  // ADD/SUB:
+  //     MOV  X0, WideImmediate
+  //     LDR  X2, [BaseReg, X0]
+  if (isa<ConstantSDNode>(RHS)) {
+    int64_t ImmOff = (int64_t)dyn_cast<ConstantSDNode>(RHS)->getZExtValue();
+    unsigned Scale = Log2_32(Size);
+    // Skip immediates that can be selected by the load/store addressing mode.
+    // Also skip immediates that can be encoded by a single ADD (SUB is also
+    // checked by using -ImmOff).
+    if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
+        isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
+      return false;
+
+    SDLoc DL(N.getNode());
+    SDValue Ops[] = { RHS };
+    SDNode *MOVI =
+        CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
+    SDValue MOVIV = SDValue(MOVI, 0);
+    // This ADD of two X registers will be selected into [Reg+Reg] mode.
+    N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
+  }
+
   // Remember if it is worth folding N when it produces extended register.
   bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
 
@@ -1381,20 +1421,21 @@
   return true;
 }
 
-static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                     unsigned &LSB, unsigned &MSB) {
-  // We are looking for the following pattern which basically extracts a single
-  // bit from the source value and places it in the LSB of the destination
-  // value, all other bits of the destination value or set to zero:
+static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
+                                          SDValue &Opd0, unsigned &LSB,
+                                          unsigned &MSB) {
+  // We are looking for the following pattern which basically extracts several
+  // contiguous bits from the source value and places them at the LSB of the
+  // destination value; all other bits of the destination value are set to zero:
   //
   // Value2 = AND Value, MaskImm
   // SRL Value2, ShiftImm
   //
-  // with MaskImm >> ShiftImm == 1.
+  // with MaskImm >> ShiftImm being a low-bit mask giving the bit width.
   //
   // This gets selected into a single UBFM:
   //
-  // UBFM Value, ShiftImm, ShiftImm
+  // UBFM Value, ShiftImm, BitWide + ShiftImm - 1
   //
 
   if (N->getOpcode() != ISD::SRL)
@@ -1410,15 +1451,16 @@
   if (!isIntImmediate(N->getOperand(1), Srl_imm))
     return false;
 
-  // Check whether we really have a one bit extract here.
-  if (And_mask >> Srl_imm == 0x1) {
+  // Check whether we really have a multi-bit extract here.
+  unsigned BitWide = 64 - CountLeadingOnes_64(~(And_mask >> Srl_imm));
+  if (BitWide && isMask_64(And_mask >> Srl_imm)) {
     if (N->getValueType(0) == MVT::i32)
       Opc = AArch64::UBFMWri;
     else
       Opc = AArch64::UBFMXri;
 
-    LSB = MSB = Srl_imm;
-
+    LSB = Srl_imm;
+    MSB = BitWide + Srl_imm - 1;
     return true;
   }
 
@@ -1439,8 +1481,8 @@
   assert((VT == MVT::i32 || VT == MVT::i64) &&
          "Type checking must have been done before calling this function");
 
-  // Check for AND + SRL doing a one bit extract.
-  if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
+  // Check for AND + SRL doing a multi-bit extract.
+  if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
     return true;
 
   // we're looking for a shift of a shift
@@ -2116,7 +2158,9 @@
     case 32:
       SubReg = AArch64::ssub;
       break;
-    case 16: // FALLTHROUGH
+    case 16:
+      SubReg = AArch64::hsub;
+      break;
     case 8:
       llvm_unreachable("unexpected zext-requiring extract element!");
     }
@@ -2204,9 +2248,9 @@
         return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
       else if (VT == MVT::v16i8)
         return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
@@ -2222,9 +2266,9 @@
         return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
       else if (VT == MVT::v16i8)
         return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
@@ -2240,9 +2284,9 @@
         return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
       else if (VT == MVT::v16i8)
         return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
@@ -2258,9 +2302,9 @@
         return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
       else if (VT == MVT::v16i8)
         return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
@@ -2276,9 +2320,9 @@
         return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
       else if (VT == MVT::v16i8)
         return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
@@ -2294,9 +2338,9 @@
         return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
       else if (VT == MVT::v16i8)
         return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16  || VT == MVT::v8f16)
         return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
@@ -2312,9 +2356,9 @@
         return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
       else if (VT == MVT::v16i8)
         return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
@@ -2330,9 +2374,9 @@
         return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
       else if (VT == MVT::v16i8)
         return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
@@ -2348,9 +2392,9 @@
         return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
       else if (VT == MVT::v16i8)
         return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
@@ -2364,7 +2408,8 @@
     case Intrinsic::aarch64_neon_ld2lane:
       if (VT == MVT::v16i8 || VT == MVT::v8i8)
         return SelectLoadLane(Node, 2, AArch64::LD2i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
         return SelectLoadLane(Node, 2, AArch64::LD2i16);
       else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                VT == MVT::v2f32)
@@ -2376,7 +2421,8 @@
     case Intrinsic::aarch64_neon_ld3lane:
       if (VT == MVT::v16i8 || VT == MVT::v8i8)
         return SelectLoadLane(Node, 3, AArch64::LD3i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
         return SelectLoadLane(Node, 3, AArch64::LD3i16);
       else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                VT == MVT::v2f32)
@@ -2388,7 +2434,8 @@
     case Intrinsic::aarch64_neon_ld4lane:
       if (VT == MVT::v16i8 || VT == MVT::v8i8)
         return SelectLoadLane(Node, 4, AArch64::LD4i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
         return SelectLoadLane(Node, 4, AArch64::LD4i16);
       else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                VT == MVT::v2f32)
@@ -2448,9 +2495,9 @@
         return SelectStore(Node, 2, AArch64::ST1Twov8b);
       else if (VT == MVT::v16i8)
         return SelectStore(Node, 2, AArch64::ST1Twov16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectStore(Node, 2, AArch64::ST1Twov4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectStore(Node, 2, AArch64::ST1Twov8h);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectStore(Node, 2, AArch64::ST1Twov2s);
@@ -2467,9 +2514,9 @@
         return SelectStore(Node, 3, AArch64::ST1Threev8b);
       else if (VT == MVT::v16i8)
         return SelectStore(Node, 3, AArch64::ST1Threev16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectStore(Node, 3, AArch64::ST1Threev4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectStore(Node, 3, AArch64::ST1Threev8h);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectStore(Node, 3, AArch64::ST1Threev2s);
@@ -2486,9 +2533,9 @@
         return SelectStore(Node, 4, AArch64::ST1Fourv8b);
       else if (VT == MVT::v16i8)
         return SelectStore(Node, 4, AArch64::ST1Fourv16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectStore(Node, 4, AArch64::ST1Fourv4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectStore(Node, 4, AArch64::ST1Fourv8h);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectStore(Node, 4, AArch64::ST1Fourv2s);
@@ -2505,9 +2552,9 @@
         return SelectStore(Node, 2, AArch64::ST2Twov8b);
       else if (VT == MVT::v16i8)
         return SelectStore(Node, 2, AArch64::ST2Twov16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectStore(Node, 2, AArch64::ST2Twov4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectStore(Node, 2, AArch64::ST2Twov8h);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectStore(Node, 2, AArch64::ST2Twov2s);
@@ -2524,9 +2571,9 @@
         return SelectStore(Node, 3, AArch64::ST3Threev8b);
       else if (VT == MVT::v16i8)
         return SelectStore(Node, 3, AArch64::ST3Threev16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectStore(Node, 3, AArch64::ST3Threev4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectStore(Node, 3, AArch64::ST3Threev8h);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectStore(Node, 3, AArch64::ST3Threev2s);
@@ -2543,9 +2590,9 @@
         return SelectStore(Node, 4, AArch64::ST4Fourv8b);
       else if (VT == MVT::v16i8)
         return SelectStore(Node, 4, AArch64::ST4Fourv16b);
-      else if (VT == MVT::v4i16)
+      else if (VT == MVT::v4i16 || VT == MVT::v4f16)
         return SelectStore(Node, 4, AArch64::ST4Fourv4h);
-      else if (VT == MVT::v8i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
         return SelectStore(Node, 4, AArch64::ST4Fourv8h);
       else if (VT == MVT::v2i32 || VT == MVT::v2f32)
         return SelectStore(Node, 4, AArch64::ST4Fourv2s);
@@ -2560,7 +2607,8 @@
     case Intrinsic::aarch64_neon_st2lane: {
       if (VT == MVT::v16i8 || VT == MVT::v8i8)
         return SelectStoreLane(Node, 2, AArch64::ST2i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
         return SelectStoreLane(Node, 2, AArch64::ST2i16);
       else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                VT == MVT::v2f32)
@@ -2573,7 +2621,8 @@
     case Intrinsic::aarch64_neon_st3lane: {
       if (VT == MVT::v16i8 || VT == MVT::v8i8)
         return SelectStoreLane(Node, 3, AArch64::ST3i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
         return SelectStoreLane(Node, 3, AArch64::ST3i16);
       else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                VT == MVT::v2f32)
@@ -2586,7 +2635,8 @@
     case Intrinsic::aarch64_neon_st4lane: {
       if (VT == MVT::v16i8 || VT == MVT::v8i8)
         return SelectStoreLane(Node, 4, AArch64::ST4i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+               VT == MVT::v8f16)
         return SelectStoreLane(Node, 4, AArch64::ST4i16);
       else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
                VT == MVT::v2f32)
@@ -2603,9 +2653,9 @@
       return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
@@ -2622,9 +2672,9 @@
       return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
@@ -2641,9 +2691,9 @@
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
@@ -2660,9 +2710,9 @@
       return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
@@ -2679,9 +2729,9 @@
       return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
@@ -2698,9 +2748,9 @@
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
@@ -2717,9 +2767,9 @@
       return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
@@ -2736,9 +2786,9 @@
       return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
@@ -2755,9 +2805,9 @@
       return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
@@ -2774,9 +2824,9 @@
       return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
     else if (VT == MVT::v16i8)
       return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
@@ -2791,7 +2841,8 @@
   case AArch64ISD::LD1LANEpost: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2804,7 +2855,8 @@
   case AArch64ISD::LD2LANEpost: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2817,7 +2869,8 @@
   case AArch64ISD::LD3LANEpost: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2830,7 +2883,8 @@
   case AArch64ISD::LD4LANEpost: {
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2846,9 +2900,9 @@
       return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
@@ -2866,9 +2920,9 @@
       return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
@@ -2886,9 +2940,9 @@
       return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
@@ -2906,9 +2960,9 @@
       return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
@@ -2926,9 +2980,9 @@
       return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
@@ -2946,9 +3000,9 @@
       return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
     else if (VT == MVT::v16i8)
       return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
-    else if (VT == MVT::v4i16)
+    else if (VT == MVT::v4i16 || VT == MVT::v4f16)
       return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
-    else if (VT == MVT::v8i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v8f16)
       return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
     else if (VT == MVT::v2i32 || VT == MVT::v2f32)
       return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
@@ -2964,7 +3018,8 @@
     VT = Node->getOperand(1).getValueType();
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2978,7 +3033,8 @@
     VT = Node->getOperand(1).getValueType();
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)
@@ -2992,7 +3048,8 @@
     VT = Node->getOperand(1).getValueType();
     if (VT == MVT::v16i8 || VT == MVT::v8i8)
       return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
-    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+             VT == MVT::v8f16)
       return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
     else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
              VT == MVT::v2f32)

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7c33423..7c94d83 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp

@@ -12,9 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ISelLowering.h"
+#include "AArch64MachineFunctionInfo.h"
 #include "AArch64PerfectShuffle.h"
 #include "AArch64Subtarget.h"
-#include "AArch64MachineFunctionInfo.h"
 #include "AArch64TargetMachine.h"
 #include "AArch64TargetObjectFile.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
@@ -38,10 +38,12 @@
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
 
+namespace {
 enum AlignMode {
   StrictAlign,
   NoStrictAlign
 };
+}
 
 static cl::opt<AlignMode>
 Align(cl::desc("Load/store alignment support"),
@@ -64,18 +66,9 @@
                          cl::desc("Allow AArch64 SLI/SRI formation"),
                          cl::init(false));
 
-//===----------------------------------------------------------------------===//
-// AArch64 Lowering public interface.
-//===----------------------------------------------------------------------===//
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
-  if (TT.isOSBinFormatMachO())
-    return new AArch64_MachoTargetObjectFile();
 
-  return new AArch64_ELFTargetObjectFile();
-}
-
-AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
-    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
+    : TargetLowering(TM) {
   Subtarget = &TM.getSubtarget<AArch64Subtarget>();
 
   // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
@@ -106,6 +99,7 @@
     addDRTypeForNEON(MVT::v2i32);
     addDRTypeForNEON(MVT::v1i64);
     addDRTypeForNEON(MVT::v1f64);
+    addDRTypeForNEON(MVT::v4f16);
 
     addQRTypeForNEON(MVT::v4f32);
     addQRTypeForNEON(MVT::v2f64);
@@ -113,6 +107,7 @@
     addQRTypeForNEON(MVT::v8i16);
     addQRTypeForNEON(MVT::v4i32);
     addQRTypeForNEON(MVT::v2i64);
+    addQRTypeForNEON(MVT::v8f16);
   }
 
   // Compute derived properties from the register classes
@@ -278,6 +273,94 @@
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 
+  // f16 is storage-only, so we promote operations to f32 if we know this is
+  // valid, and ignore them otherwise. The operations not mentioned here will
+  // fail to select, but this is not a major problem as no source language
+  // should be emitting native f16 operations yet.
+  setOperationAction(ISD::FADD, MVT::f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::f16, Promote);
+
+  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
+  // known to be safe.
+  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
+  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+
+  // Expand all other v4f16 operations.
+  // FIXME: We could generate better code by promoting some operations to
+  // a pair of v4f32s
+  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
+
+
+  // v8f16 is also a storage-only type, so expand it.
+  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
   // AArch64 has implementations of a lot of rounding-like FP operations.
   static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
   for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
@@ -305,6 +388,7 @@
 
   // AArch64 does not have floating-point extending loads, i1 sign-extending
   // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
@@ -316,6 +400,10 @@
   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+
+  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+  setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+
   // Indexed loads and stores are supported.
   for (unsigned im = (unsigned)ISD::PRE_INC;
        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
@@ -434,6 +522,11 @@
 
     // AArch64 doesn't have MUL.2d:
     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    // Custom handling for some quad-vector types to detect MULL.
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
     // Likewise, narrowing and extending vector loads/stores aren't handled
@@ -479,13 +572,13 @@
 }
 
 void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
-  if (VT == MVT::v2f32) {
+  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
     setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
     AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
 
     setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
     AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
-  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
     setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
     AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
 
@@ -726,6 +819,7 @@
   case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
   case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
   case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
+  case AArch64ISD::NVCAST:            return "AArch64ISD::NVCAST";
   case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
   case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
   case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
@@ -755,6 +849,8 @@
   case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
   case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
   case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
+  case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
+  case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
   }
 }
 
@@ -773,7 +869,8 @@
   // EndBB:
   //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
 
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineFunction *MF = MBB->getParent();
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
   DebugLoc DL = MI->getDebugLoc();
@@ -1019,6 +1116,8 @@
 
 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+  SDValue Cmp;
+  AArch64CC::CondCode AArch64CC;
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     EVT VT = RHS.getValueType();
     uint64_t C = RHSC->getZExtValue();
@@ -1050,9 +1149,9 @@
         break;
       case ISD::SETLE:
       case ISD::SETGT:
-        if ((VT == MVT::i32 && C != 0x7fffffff &&
+        if ((VT == MVT::i32 && C != INT32_MAX &&
              isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
+            (VT == MVT::i64 && C != INT64_MAX &&
              isLegalArithImmed(C + 1ULL))) {
           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1061,9 +1160,9 @@
         break;
       case ISD::SETULE:
       case ISD::SETUGT:
-        if ((VT == MVT::i32 && C != 0xffffffff &&
+        if ((VT == MVT::i32 && C != UINT32_MAX &&
              isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
+            (VT == MVT::i64 && C != UINT64_MAX &&
              isLegalArithImmed(C + 1ULL))) {
           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1073,9 +1172,45 @@
       }
     }
   }
-
-  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
-  AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+  // For the i8 operand, the largest immediate is 255, so this can be easily
+  // encoded in the compare instruction. For the i16 operand, however, the
+  // largest immediate cannot be encoded in the compare.
+  // Therefore, use a sign extending load and cmn to avoid materializing the -1
+  // constant. For example,
+  // movz w1, #65535
+  // ldrh w0, [x0, #0]
+  // cmp w0, w1
+  // becomes:
+  // ldrsh w0, [x0, #0]
+  // cmn w0, #1
+  // Fundamentally, we rely on the property that (zext LHS) == (zext RHS)
+  // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
+  // both the LHS and RHS are truly zero extended and to make sure the
+  // transformation is profitable.
+  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+    if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
+        isa<LoadSDNode>(LHS)) {
+      if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+          cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+          LHS.getNode()->hasNUsesOfValue(1, 0)) {
+        int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+        if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+          SDValue SExt =
+              DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+                          DAG.getValueType(MVT::i16));
+          Cmp = emitComparison(SExt,
+                               DAG.getConstant(ValueofRHS, RHS.getValueType()),
+                               CC, dl, DAG);
+          AArch64CC = changeIntCCToAArch64CC(CC);
+          AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+          return Cmp;
+        }
+      }
+    }
+  }
+  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC = changeIntCCToAArch64CC(CC);
   AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
   return Cmp;
 }
@@ -1332,8 +1467,7 @@
   SDLoc DL(Op);
   unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
   unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-  // The data thing is not used.
-  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
 
   bool IsStream = !Locality;
   // When the locality number is set
@@ -1348,6 +1482,7 @@
 
   // built the mask value encoding the expected behavior.
   unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
+                   (!IsData << 3) |     // IsDataCache bit
                    (Locality << 1) |    // Cache level bits
                    (unsigned)IsStream;  // Stream bit
   return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
@@ -1399,7 +1534,10 @@
 
   if (VT.getSizeInBits() > InVT.getSizeInBits()) {
     SDLoc dl(Op);
-    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                         VT.getVectorNumElements());
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
     return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
   }
 
@@ -1504,7 +1642,7 @@
       (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
 
-  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
     .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
@@ -1513,12 +1651,221 @@
   return CallResult.first;
 }
 
+static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getValueType() != MVT::f16)
+    return SDValue();
+
+  assert(Op.getOperand(0).getValueType() == MVT::i16);
+  SDLoc DL(Op);
+
+  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
+  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
+  return SDValue(
+      DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
+                         DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
+      0);
+}
+
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+  if (OrigVT.getSizeInBits() >= 64)
+    return OrigVT;
+
+  assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+     return MVT::v2i32;
+  case MVT::v4i8:
+    return  MVT::v4i16;
+  }
+}
+
+static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
+                                                 const EVT &OrigTy,
+                                                 const EVT &ExtTy,
+                                                 unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for S/UMULL.
+  EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  EVT VT = N->getValueType(0);
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        if (!isIntN(HalfSize, C->getSExtValue()))
+          return false;
+      } else {
+        if (!isUIntN(HalfSize, C->getZExtValue()))
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
+                                             N->getOperand(0)->getValueType(0),
+                                             N->getValueType(0),
+                                             N->getOpcode());
+
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+                     MVT::getVectorVT(TruncVT, NumElts), Ops);
+}
+
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // Multiplications are only custom-lowered for 128-bit vectors so that
+  // S/UMULL can be detected.  Otherwise v2i64 multiplications are not legal.
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
+  SDNode *N0 = Op.getOperand(0).getNode();
+  SDNode *N1 = Op.getOperand(1).getNode();
+  unsigned NewOpc = 0;
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
+    NewOpc = AArch64ISD::SMULL;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = AArch64ISD::UMULL;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = AArch64ISD::SMULL;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this.  It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
+
+  // Legalize to a S/UMULL instruction
+  SDLoc DL(Op);
+  SDValue Op0;
+  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
+  if (!isMLA) {
+    Op0 = skipExtensionForVectorMULL(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
+  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
+  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default:
     llvm_unreachable("unimplemented operand");
     return SDValue();
+  case ISD::BITCAST:
+    return LowerBITCAST(Op, DAG);
   case ISD::GlobalAddress:
     return LowerGlobalAddress(Op, DAG);
   case ISD::GlobalTLSAddress:
@@ -1610,6 +1957,8 @@
     return LowerFP_TO_INT(Op, DAG);
   case ISD::FSINCOS:
     return LowerFSINCOS(Op, DAG);
+  case ISD::MUL:
+    return LowerMUL(Op, DAG);
   }
 }
 
@@ -1624,8 +1973,7 @@
 
 #include "AArch64GenCallingConv.inc"
 
-/// Selects the correct CCAssignFn for a the given CallingConvention
-/// value.
+/// Selects the correct CCAssignFn for a given CallingConvention value.
 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                      bool IsVarArg) const {
   switch (CC) {
@@ -1650,8 +1998,8 @@
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   // At this point, Ins[].VT may already be promoted to i32. To correctly
   // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
@@ -1715,6 +2063,8 @@
         RC = &AArch64::GPR32RegClass;
       else if (RegVT == MVT::i64)
         RC = &AArch64::GPR64RegClass;
+      else if (RegVT == MVT::f16)
+        RC = &AArch64::FPR16RegClass;
       else if (RegVT == MVT::f32)
         RC = &AArch64::FPR32RegClass;
       else if (RegVT == MVT::f64 || RegVT.is64BitVector())
@@ -1753,7 +2103,7 @@
     } else { // VA.isRegLoc()
       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
       unsigned ArgOffset = VA.getLocMemOffset();
-      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
 
       uint32_t BEAlign = 0;
       if (ArgSize < 8 && !Subtarget->isLittleEndian())
@@ -1788,7 +2138,7 @@
 
       ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
                                 MachinePointerInfo::getFixedStack(FI),
-                                MemVT, false, false, false, nullptr);
+                                MemVT, false, false, false, 0);
 
       InVals.push_back(ArgValue);
     }
@@ -1920,8 +2270,8 @@
                           : RetCC_AArch64_AAPCS;
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeCallResult(Ins, RetCC);
 
   // Copy all of the result registers out of their specified physreg.
@@ -1990,6 +2340,19 @@
     return false;
   }
 
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on AArch64 when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    if (GV->hasExternalWeakLinkage())
+      return false;
+  }
+
   // Now we search for cases where we can use a tail call without changing the
   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
   // concept.
@@ -2007,8 +2370,8 @@
     // FIXME: for now we take the most conservative of these in both cases:
     // disallow all variadic memory operands.
     SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                   getTargetMachine(), ArgLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                   *DAG.getContext());
 
     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2020,13 +2383,13 @@
   // results are returned in the same way as what the caller expects.
   if (!CCMatch) {
     SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs1, *DAG.getContext());
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+                    *DAG.getContext());
     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
 
     SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs2, *DAG.getContext());
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+                    *DAG.getContext());
     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
 
     if (RVLocs1.size() != RVLocs2.size())
@@ -2051,8 +2414,8 @@
     return true;
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
 
@@ -2149,8 +2512,8 @@
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   if (IsVarArg) {
     // Handle fixed and variable vector arguments differently.
@@ -2295,7 +2658,7 @@
       // common case. It should also work for fundamental types too.
       uint32_t BEAlign = 0;
       unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
-                                        : VA.getLocVT().getSizeInBits();
+                                        : VA.getValVT().getSizeInBits();
       OpSize = (OpSize + 7) / 8;
       if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
         if (OpSize < 8)
@@ -2329,8 +2692,8 @@
             DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
         SDValue Cpy = DAG.getMemcpy(
             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
-            /*isVolatile = */ false,
-            /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
+            /*isVol = */ false,
+            /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());
 
         MemOpChains.push_back(Cpy);
       } else {
@@ -2419,7 +2782,8 @@
 
   // Add a register mask operand representing the call-preserved registers.
   const uint32_t *Mask;
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const AArch64RegisterInfo *ARI =
       static_cast<const AArch64RegisterInfo *>(TRI);
   if (IsThisReturn) {
@@ -2473,7 +2837,7 @@
                           ? RetCC_AArch64_WebKit_JS
                           : RetCC_AArch64_AAPCS;
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC);
 }
 
@@ -2487,8 +2851,8 @@
                           ? RetCC_AArch64_WebKit_JS
                           : RetCC_AArch64_AAPCS;
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC);
 
   // Copy the result values into the output registers.
@@ -2539,7 +2903,8 @@
                                                   SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy();
   SDLoc DL(Op);
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GN->getGlobal();
   unsigned char OpFlags =
       Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
 
@@ -2554,6 +2919,25 @@
     return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
   }
 
+  if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
+    assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+           "use of MO_CONSTPOOL only supported on small model");
+    SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+    SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
+    SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+    SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
+                                     MachinePointerInfo::getConstantPool(),
+                                     /*isVolatile=*/ false,
+                                     /*isNonTemporal=*/ true,
+                                     /*isInvariant=*/ true, 8);
+    if (GN->getOffset() != 0)
+      return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
+                         DAG.getConstant(GN->getOffset(), PtrVT));
+    return GlobalAddr;
+  }
+
   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
     const unsigned char MO_NC = AArch64II::MO_NC;
     return DAG.getNode(
@@ -2630,7 +3014,8 @@
   // TLS calls preserve all registers except those that absolutely must be
   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
   // silly).
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const AArch64RegisterInfo *ARI =
       static_cast<const AArch64RegisterInfo *>(TRI);
   const uint32_t *Mask = ARI->getTLSCallPreservedMask();
@@ -2680,7 +3065,8 @@
   // TLS calls preserve all registers except those that absolutely must be
   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
   // silly).
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const AArch64RegisterInfo *ARI =
       static_cast<const AArch64RegisterInfo *>(TRI);
   const uint32_t *Mask = ARI->getTLSCallPreservedMask();
@@ -2895,11 +3281,6 @@
             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
           SDValue Test = LHS.getOperand(0);
           uint64_t Mask = LHS.getConstantOperandVal(1);
-
-          // TBZ only operates on i64's, but the ext should be free.
-          if (Test.getValueType() == MVT::i32)
-            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
           return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                              DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
         }
@@ -2915,18 +3296,29 @@
             isPowerOf2_64(LHS.getConstantOperandVal(1))) {
           SDValue Test = LHS.getOperand(0);
           uint64_t Mask = LHS.getConstantOperandVal(1);
-
-          // TBNZ only operates on i64's, but the ext should be free.
-          if (Test.getValueType() == MVT::i32)
-            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
           return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                              DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
         }
 
         return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
+      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
+        // Don't combine AND since emitComparison converts the AND to an ANDS
+        // (a.k.a. TST) and the test in the test bit and branch instruction
+        // becomes redundant.  This would also increase register pressure.
+        uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
+                           DAG.getConstant(Mask, MVT::i64), Dest);
       }
     }
+    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
+        LHS.getOpcode() != ISD::AND) {
+      // Don't combine AND since emitComparison converts the AND to an ANDS
+      // (a.k.a. TST) and the test in the test bit and branch instruction
+      // becomes redundant.  This would also increase register pressure.
+      uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
+                         DAG.getConstant(Mask, MVT::i64), Dest);
+    }
 
     SDValue CCVal;
     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
@@ -3041,6 +3433,9 @@
           AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
     return SDValue();
 
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
   // While there is no integer popcount instruction, it can
   // be more efficiently lowered to the following sequence that uses
   // AdvSIMD registers/instructions as long as the copies to/from
@@ -3992,8 +4387,10 @@
       return;
     case 'J': {
       uint64_t NVal = -C->getSExtValue();
-      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
+      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
+        CVal = C->getSExtValue();
         break;
+      }
       return;
     }
     // The K and L constraints apply *only* to logical immediates, including
@@ -4117,10 +4514,30 @@
   EVT VT = Op.getValueType();
   unsigned NumElts = VT.getVectorNumElements();
 
-  SmallVector<SDValue, 2> SourceVecs;
-  SmallVector<unsigned, 2> MinElts;
-  SmallVector<unsigned, 2> MaxElts;
+  struct ShuffleSourceInfo {
+    SDValue Vec;
+    unsigned MinElt;
+    unsigned MaxElt;
 
+    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+    // be compatible with the shuffle we intend to construct. As a result
+    // ShuffleVec will be some sliding window into the original Vec.
+    SDValue ShuffleVec;
+
+    // Code should guarantee that element i in Vec starts at element
+    // "WindowBase + i * WindowScale" in ShuffleVec.
+    int WindowBase;
+    int WindowScale;
+
+    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+    ShuffleSourceInfo(SDValue Vec)
+        : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+          WindowScale(1) {}
+  };
+
+  // First gather all vectors used as an immediate source for this BUILD_VECTOR
+  // node.
+  SmallVector<ShuffleSourceInfo, 2> Sources;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
     if (V.getOpcode() == ISD::UNDEF)
@@ -4131,133 +4548,153 @@
       return SDValue();
     }
 
-    // Record this extraction against the appropriate vector if possible...
+    // Add this element source to the list if it's not already there.
     SDValue SourceVec = V.getOperand(0);
-    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
-    bool FoundSource = false;
-    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
-      if (SourceVecs[j] == SourceVec) {
-        if (MinElts[j] > EltNo)
-          MinElts[j] = EltNo;
-        if (MaxElts[j] < EltNo)
-          MaxElts[j] = EltNo;
-        FoundSource = true;
-        break;
-      }
-    }
+    auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
+    if (Source == Sources.end())
+      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
 
-    // Or record a new source if not...
-    if (!FoundSource) {
-      SourceVecs.push_back(SourceVec);
-      MinElts.push_back(EltNo);
-      MaxElts.push_back(EltNo);
-    }
+    // Update the minimum and maximum lane number seen.
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    Source->MinElt = std::min(Source->MinElt, EltNo);
+    Source->MaxElt = std::max(Source->MaxElt, EltNo);
   }
 
   // Currently only do something sane when at most two source vectors
-  // involved.
-  if (SourceVecs.size() > 2)
+  // are involved.
+  if (Sources.size() > 2)
     return SDValue();
 
-  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
-  int VEXTOffsets[2] = { 0, 0 };
-  int OffsetMultipliers[2] = { 1, 1 };
-
-  // This loop extracts the usage patterns of the source vectors
-  // and prepares appropriate SDValues for a shuffle if possible.
-  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
-    unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
-    SDValue CurSource = SourceVecs[i];
-    if (SourceVecs[i].getValueType().getVectorElementType() !=
-        VT.getVectorElementType()) {
-      // It may hit this case if SourceVecs[i] is AssertSext/AssertZext.
-      // Then bitcast it to the vector which holds asserted element type,
-      // and record the multiplier of element width between SourceVecs and
-      // Build_vector which is needed to extract the correct lanes later.
-      EVT CastVT =
-          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
-                           SourceVecs[i].getValueSizeInBits() /
-                               VT.getVectorElementType().getSizeInBits());
-
-      CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
-      OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
-      NumSrcElts *= OffsetMultipliers[i];
-      MaxElts[i] *= OffsetMultipliers[i];
-      MinElts[i] *= OffsetMultipliers[i];
+  // Find the smallest element size among the result and the (at most two)
+  // sources, and use it as the element size to build the shuffle_vector.
+  EVT SmallestEltTy = VT.getVectorElementType();
+  for (auto &Source : Sources) {
+    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
+    if (SrcEltTy.bitsLT(SmallestEltTy)) {
+      SmallestEltTy = SrcEltTy;
     }
+  }
+  unsigned ResMultiplier =
+      VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
+  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
 
-    if (CurSource.getValueType() == VT) {
-      // No VEXT necessary
-      ShuffleSrcs[i] = CurSource;
-      VEXTOffsets[i] = 0;
+  // If the source vector is too wide or too narrow, we may nevertheless be able
+  // to construct a compatible shuffle either by concatenating it with UNDEF or
+  // extracting a suitable range of elements.
+  for (auto &Src : Sources) {
+    EVT SrcVT = Src.ShuffleVec.getValueType();
+
+    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
       continue;
-    } else if (NumSrcElts < NumElts) {
+
+    // This stage of the search produces a source with the same element type as
+    // the original, but with a total width matching the BUILD_VECTOR output.
+    EVT EltVT = SrcVT.getVectorElementType();
+    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
+
+    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+      assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
       // We can pad out the smaller vector for free, so if it's part of a
       // shuffle...
-      ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
-                                   DAG.getUNDEF(CurSource.getValueType()));
+      Src.ShuffleVec =
+          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
       continue;
     }
 
-    // Since only 64-bit and 128-bit vectors are legal on ARM and
-    // we've eliminated the other cases...
-    assert(NumSrcElts == 2 * NumElts &&
-           "unexpected vector sizes in ReconstructShuffle");
+    assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
 
-    if (MaxElts[i] - MinElts[i] >= NumElts) {
+    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
       // Span too large for a VEXT to cope
       return SDValue();
     }
 
-    if (MinElts[i] >= NumElts) {
+    if (Src.MinElt >= NumSrcElts) {
       // The extraction can just take the second half
-      VEXTOffsets[i] = NumElts;
-      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                   DAG.getIntPtrConstant(NumElts));
-    } else if (MaxElts[i] < NumElts) {
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getIntPtrConstant(NumSrcElts));
+      Src.WindowBase = -NumSrcElts;
+    } else if (Src.MaxElt < NumSrcElts) {
       // The extraction can just take the first half
-      VEXTOffsets[i] = 0;
-      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                   DAG.getIntPtrConstant(0));
+      Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
+                                   Src.ShuffleVec, DAG.getIntPtrConstant(0));
     } else {
       // An actual VEXT is needed
-      VEXTOffsets[i] = MinElts[i];
-      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                     DAG.getIntPtrConstant(0));
-      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
-                                     DAG.getIntPtrConstant(NumElts));
-      unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
-      ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
-                                   DAG.getConstant(Imm, MVT::i32));
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
+                                     Src.ShuffleVec, DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc2 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getIntPtrConstant(NumSrcElts));
+      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
+
+      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
+                                   VEXTSrc2, DAG.getConstant(Imm, MVT::i32));
+      Src.WindowBase = -Src.MinElt;
     }
   }
 
-  SmallVector<int, 8> Mask;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue Entry = Op.getOperand(i);
-    if (Entry.getOpcode() == ISD::UNDEF) {
-      Mask.push_back(-1);
+  // Another possible incompatibility occurs from the vector element types. We
+  // can fix this by bitcasting the source vectors to the same type we intend
+  // for the shuffle.
+  for (auto &Src : Sources) {
+    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+    if (SrcEltTy == SmallestEltTy)
       continue;
-    }
+    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+    Src.WindowBase *= Src.WindowScale;
+  }
 
-    SDValue ExtractVec = Entry.getOperand(0);
-    int ExtractElt =
-        cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
-    if (ExtractVec == SourceVecs[0]) {
-      Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
-    } else {
-      Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
-                     VEXTOffsets[1]);
-    }
+  // Final sanity check before we try to actually produce a shuffle.
+  DEBUG(
+    for (auto Src : Sources)
+      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+  );
+
+  // The stars all align; our next step is to produce the mask for the shuffle.
+  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+  int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
+  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+    SDValue Entry = Op.getOperand(i);
+    if (Entry.getOpcode() == ISD::UNDEF)
+      continue;
+
+    auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
+    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
+    // segment.
+    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
+                               VT.getVectorElementType().getSizeInBits());
+    int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+    // This source is expected to fill ResMultiplier lanes of the final shuffle,
+    // starting at the appropriate offset.
+    int *LaneMask = &Mask[i * ResMultiplier];
+
+    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+    ExtractBase += NumElts * (Src - Sources.begin());
+    for (int j = 0; j < LanesDefined; ++j)
+      LaneMask[j] = ExtractBase + j;
   }
 
   // Final check before we try to produce nonsense...
-  if (isShuffleMaskLegal(Mask, VT))
-    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
-                                &Mask[0]);
+  if (!isShuffleMaskLegal(Mask, ShuffleVT))
+    return SDValue();
 
-  return SDValue();
+  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+  for (unsigned i = 0; i < Sources.size(); ++i)
+    ShuffleOps[i] = Sources[i].ShuffleVec;
+
+  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+                                         ShuffleOps[1], &Mask[0]);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
 }
 
 // check if an EXT instruction can handle the shuffle mask when the
@@ -4586,7 +5023,8 @@
         VT.getVectorElementType() == MVT::f32)
       return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
     // vrev <4 x i16> -> REV32
-    if (VT.getVectorElementType() == MVT::i16)
+    if (VT.getVectorElementType() == MVT::i16 ||
+        VT.getVectorElementType() == MVT::f16)
       return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
     // vrev <4 x i8> -> REV16
     assert(VT.getVectorElementType() == MVT::i8);
@@ -4706,7 +5144,7 @@
 static unsigned getDUPLANEOp(EVT EltType) {
   if (EltType == MVT::i8)
     return AArch64ISD::DUPLANE8;
-  if (EltType == MVT::i16)
+  if (EltType == MVT::i16 || EltType == MVT::f16)
     return AArch64ISD::DUPLANE16;
   if (EltType == MVT::i32 || EltType == MVT::f32)
     return AArch64ISD::DUPLANE32;
@@ -4836,7 +5274,8 @@
     SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
 
     EVT ScalarVT = VT.getVectorElementType();
-    if (ScalarVT.getSizeInBits() < 32)
+
+    if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
       ScalarVT = MVT::i32;
 
     return DAG.getNode(
@@ -4924,7 +5363,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -4933,7 +5372,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -4942,7 +5381,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -4951,7 +5390,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -4960,7 +5399,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -4969,7 +5408,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
     }
 
@@ -5124,7 +5563,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5133,7 +5572,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5142,7 +5581,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5151,7 +5590,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5160,7 +5599,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5169,7 +5608,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
     }
 
@@ -5242,13 +5681,13 @@
         if (VT.getSizeInBits() == 128) {
           SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
                                     DAG.getConstant(CnstVal, MVT::i32));
-          return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+          return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
         }
 
         // Support the V64 version via subregister insertion.
         SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
                                   DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
@@ -5257,7 +5696,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5266,7 +5705,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5275,7 +5714,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5284,7 +5723,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5293,7 +5732,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5302,7 +5741,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5311,7 +5750,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5320,7 +5759,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
@@ -5328,7 +5767,7 @@
         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
         SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       // The few faces of FMOV...
@@ -5337,7 +5776,7 @@
         MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
         SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
@@ -5345,7 +5784,7 @@
         CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
         SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
                                   DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       // The many faces of MVNI...
@@ -5356,7 +5795,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5365,7 +5804,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5374,7 +5813,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5383,7 +5822,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5392,7 +5831,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5401,7 +5840,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5410,7 +5849,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
 
       if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5419,7 +5858,7 @@
         SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                   DAG.getConstant(CnstVal, MVT::i32),
                                   DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
       }
     }
 
@@ -5586,19 +6025,21 @@
                                                       SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
 
-  // Check for non-constant lane.
-  if (!isa<ConstantSDNode>(Op.getOperand(2)))
+  // Check for non-constant or out of range lane.
+  EVT VT = Op.getOperand(0).getValueType();
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
     return SDValue();
 
-  EVT VT = Op.getOperand(0).getValueType();
 
   // Insertion/extraction are legal for V128 types.
   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
     return Op;
 
   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
     return SDValue();
 
   // For V64 types, we perform insertion by expanding the value
@@ -5618,19 +6059,21 @@
                                                SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
 
-  // Check for non-constant lane.
-  if (!isa<ConstantSDNode>(Op.getOperand(1)))
+  // Check for non-constant or out of range lane.
+  EVT VT = Op.getOperand(0).getValueType();
+  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
     return SDValue();
 
-  EVT VT = Op.getOperand(0).getValueType();
 
   // Insertion/extraction are legal for V128 types.
   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
     return Op;
 
   if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
     return SDValue();
 
   // For V64 types, we perform extraction by expanding the value
@@ -6164,7 +6607,7 @@
       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                        Attribute::NoImplicitFloat) &&
       (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
     return MVT::f128;
 
   return Size >= 8 ? MVT::i64 : MVT::i32;
@@ -6359,6 +6802,48 @@
   return performIntegerAbsCombine(N, DAG);
 }
 
+SDValue
+AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+                                     SelectionDAG &DAG,
+                                     std::vector<SDNode *> *Created) const {
+  // Fold (sdiv X, +/-pow2) on i32/i64; otherwise return an empty SDValue.
+  EVT VT = N->getValueType(0);
+  if ((VT != MVT::i32 && VT != MVT::i64) ||
+      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue N0 = N->getOperand(0);
+  unsigned Lg2 = Divisor.countTrailingZeros();
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT);
+
+  // Bias negative dividends: add (N0 < 0) ? Pow2 - 1 : 0 via compare + CSEL.
+  SDValue CCVal;
+  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
+  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
+  // Report the newly built nodes to the caller when it asked for them.
+  if (Created) {
+    Created->push_back(Cmp.getNode());
+    Created->push_back(Add.getNode());
+    Created->push_back(CSel.getNode());
+  }
+
+  // Divide by pow2 with an arithmetic shift right by Lg2.
+  SDValue SRA =
+      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64));
+
+  // If we're dividing by a positive value, we're done.  Otherwise, we must
+  // negate the result.
+  if (Divisor.isNonNegative())
+    return SRA;
+  // Negative divisor: SRA becomes an operand of the final (0 - SRA).
+  if (Created)
+    Created->push_back(SRA.getNode());
+  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA);
+}
+
 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
@@ -6417,10 +6902,63 @@
   return SDValue();
 }
 
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Vector comparisons produce 0 or -1 in every lane, so a unary operation
+  // applied to a constant-masked comparison can be folded into the constant.
+  //
+  // The rewrite performed here is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Give up unless N is a vector operation whose operand is an AND of a
+  // SETCC, and the result is exactly as wide as that AND.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+  SDValue And = N->getOperand(0);
+  if (And->getOpcode() != ISD::AND)
+    return SDValue();
+  SDValue Cmp = And->getOperand(0);
+  if (Cmp->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != And->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // The other AND operand has to be a constant build vector. Non-constant
+  // splats could be transformed as well, but that would not eliminate any
+  // operation; it would only add a scalar step before moving to the vector
+  // unit.
+  BuildVectorSDNode *Mask = dyn_cast<BuildVectorSDNode>(And->getOperand(1));
+  if (!Mask || !Mask->isConstant())
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT MaskVT = Mask->getValueType(0);
+
+  // Apply the unary operation to the constant itself, giving a constant of
+  // the result type.
+  SDValue Folded = DAG.getNode(N->getOpcode(), DL, VT, SDValue(Mask, 0));
+
+  // The AND is done in the integer vector type, so bitcast the new constant
+  // into it and bitcast the masked result back out.
+  SDValue FoldedBits = DAG.getNode(ISD::BITCAST, DL, MaskVT, Folded);
+  SDValue Masked = DAG.getNode(ISD::AND, DL, MaskVT, Cmp, FoldedBits);
+
+  return DAG.getNode(ISD::BITCAST, DL, VT, Masked);
+}
+
 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+  // First try to optimize away the conversion when it's conditionally from
+  // a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
   EVT VT = N->getValueType(0);
   if (VT != MVT::f32 && VT != MVT::f64)
     return SDValue();
+
   // Only optimize when the source and destination types have the same width.
   if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
     return SDValue();
@@ -7190,11 +7728,11 @@
   // If the vector type isn't a simple VT, it's beyond the scope of what
   // we're  worried about here. Let legalization do its thing and hope for
   // the best.
-  if (!ResVT.isSimple())
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src->getValueType(0);
+  if (!ResVT.isSimple() || !SrcVT.isSimple())
     return SDValue();
 
-  SDValue Src = N->getOperand(0);
-  MVT SrcVT = Src->getValueType(0).getSimpleVT();
   // If the source VT is a 64-bit vector, we can play games and get the
   // better results we want.
   if (SrcVT.getSizeInBits() != 64)
@@ -7428,7 +7966,7 @@
     Ops.push_back(Inc);
 
     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+    SDVTList SDTys = DAG.getVTList(Tys);
     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
                                            MemVT,
@@ -7558,7 +8096,7 @@
       Tys[n] = VecTy;
     Tys[n++] = MVT::i64;  // Type of write back register
     Tys[n] = MVT::Other;  // Type of the chain
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
 
     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
@@ -7579,10 +8117,272 @@
   return SDValue();
 }
 
+// Returns true if V is known to fit in 'width' (8 or 16) bits, and reports
+// through ExtType how it was extended (left NON_EXTLOAD for constants).
+static
+bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+  ExtType = ISD::NON_EXTLOAD;
+  switch(V.getNode()->getOpcode()) {
+  default:
+    return false;
+  case ISD::LOAD: {
+    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
+    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
+       || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
+      ExtType = LoadNode->getExtensionType();
+      return true;
+    }
+    return false;
+  }
+  case ISD::AssertSext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::SEXTLOAD;
+      return true;
+    }
+    return false;
+  }
+  case ISD::AssertZext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::ZEXTLOAD;
+      return true;
+    }
+    return false;
+  }
+  case ISD::Constant:
+  case ISD::TargetConstant: {
+    // Range-check both bounds; std::abs(INT64_MIN) would be undefined.
+    int64_t C = cast<ConstantSDNode>(V.getNode())->getSExtValue();
+    int64_t Limit = 1LL << (width - 1);
+    return C > -Limit && C < Limit;
+  }
+  }
+
+  return true;
+}
+
+// This function does a whole lot of voodoo to determine if the tests are
+// equivalent without and with a mask. Essentially what happens is that given a
+// DAG resembling:
+//
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//  |    Input    | | AddConstant | | CompConstant| |     CC      |
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//           |           |           |               |
+//           V           V           |    +----------+
+//          +-------------+  +----+  |    |
+//          |     ADD     |  |0xff|  |    |
+//          +-------------+  +----+  |    |
+//                  |           |    |    |
+//                  V           V    |    |
+//                 +-------------+   |    |
+//                 |     AND     |   |    |
+//                 +-------------+   |    |
+//                      |            |    |
+//                      +-----+      |    |
+//                            |      |    |
+//                            V      V    V
+//                           +-------------+
+//                           |     CMP     |
+//                           +-------------+
+//
+// The AND node may be safely removed for some combinations of inputs. In
+// particular we need to take into account the extension type of the Input,
+// the exact values of AddConstant, CompConstant, and CC, along with the nominal
+// width of the input (this can work for any input width; the above graph is
+// specific to 8 bits).
+//
+// The specific equations were worked out by generating output tables for each
+// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
+// problem was simplified by working with 4 bit inputs, which means we only
+// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
+// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
+// patterns present in both extensions (0,7). For every distinct set of
+// AddConstant and CompConstant bit patterns, the masked and unmasked versions
+// are equivalent if the function below returns true for all 16 distinct bit
+// patterns of the current extension type of Input (w0).
+//
+//   sub      w8, w0, w1
+//   and      w10, w8, #0x0f
+//   cmp      w8, w2
+//   cset     w9, AArch64CC
+//   cmp      w10, w2
+//   cset     w11, AArch64CC
+//   cmp      w9, w11
+//   cset     w0, eq
+//   ret
+//
+// Since the above function shows when the outputs are equivalent, it defines
+// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
+// would be expensive to run during compiles. The equations below were written
+// in a test harness that confirmed they gave equivalent outputs to the above
+// function for all inputs, so they can be used to determine if the removal is
+// legal instead.
+//
+// isEquivalentMaskless() is the code for testing if the AND can be removed
+// factored out of the DAG recognition as the DAG can take several forms.
+
+static
+bool isEquivalentMaskless(unsigned CC, unsigned width,
+                          ISD::LoadExtType ExtType, signed AddConstant,
+                          signed CompConstant) {
+  // By being careful about our equations and only writing them in terms of
+  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
+  // make them generally applicable to all bit widths.
+  signed MaxUInt = (1 << width); // NB: 2^width, not 2^width - 1
+
+  // For the purposes of these comparisons sign extending the type is
+  // equivalent to zero extending the add and displacing it by half the integer
+  // width. Provided we are careful and make sure our equations are valid over
+  // the whole range we can just adjust the input and avoid writing equations
+  // for sign extended inputs.
+  if (ExtType == ISD::SEXTLOAD)
+    AddConstant -= (1 << (width-1));
+
+  switch(CC) {
+  case AArch64CC::LE:
+  case AArch64CC::GT: {
+    if ((AddConstant == 0) ||
+        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
+        (AddConstant >= 0 && CompConstant < 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LT:
+  case AArch64CC::GE: {
+    if ((AddConstant == 0) ||
+        (AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::HI:
+  case AArch64CC::LS: {
+    if ((AddConstant >= 0 && CompConstant < 0) ||
+       (AddConstant <= 0 && CompConstant >= -1 &&
+        CompConstant < AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::PL:
+  case AArch64CC::MI: {
+    if ((AddConstant == 0) ||
+        (AddConstant > 0 && CompConstant <= 0) ||
+        (AddConstant < 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LO:
+  case AArch64CC::HS: {
+    if ((AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant >= 0 &&
+         CompConstant <= AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::EQ:
+  case AArch64CC::NE: {
+    if ((AddConstant > 0 && CompConstant < 0) ||
+        (AddConstant < 0 && CompConstant >= 0 &&
+         CompConstant < AddConstant + MaxUInt) ||
+        (AddConstant >= 0 && CompConstant >= 0 &&
+         CompConstant >= AddConstant) ||
+        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
+
+      return true;
+  } break;
+  case AArch64CC::VS:
+  case AArch64CC::VC:
+  case AArch64CC::AL:
+  case AArch64CC::NV:
+    return true;
+  case AArch64CC::Invalid:
+    break;
+  }
+  // No rule matched (or CC was Invalid): report the mask as not removable.
+  return false;
+}
+
+// Removes an 8- or 16-bit AND mask sitting between an ADD and the SUBS that
+// feeds N's condition, when isEquivalentMaskless() shows it is redundant.
+static SDValue performCONDCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  SelectionDAG &DAG, unsigned CCIndex,
+                                  unsigned CmpIndex) {
+  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
+  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
+  unsigned CondOpcode = SubsNode->getOpcode();
+
+  if (CondOpcode != AArch64ISD::SUBS)
+    return SDValue();
+
+  // There is a SUBS feeding this condition. Is it fed by a mask we can
+  // use?
+  SDNode *AndNode = SubsNode->getOperand(0).getNode();
+  unsigned MaskBits = 0;
+
+  if (AndNode->getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
+    // Keep all 64 bits: a narrower type would let a wider mask such as
+    // 0x1000000FF be mistaken for 0xFF.
+    uint64_t CNV = CN->getZExtValue();
+    if (CNV == 255)
+      MaskBits = 8;
+    else if (CNV == 65535)
+      MaskBits = 16;
+  }
+
+  if (!MaskBits)
+    return SDValue();
+
+  SDValue AddValue = AndNode->getOperand(0);
+
+  if (AddValue.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The basic dag structure is correct, grab the inputs and validate them.
+  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
+  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
+  SDValue SubsInputValue = SubsNode->getOperand(1);
+
+  // The mask is present and the provenance of all the values is a smaller type,
+  // let's see if the mask is superfluous.
+  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
+      !isa<ConstantSDNode>(SubsInputValue.getNode()))
+    return SDValue();
+
+  ISD::LoadExtType ExtType;
+  // AddInputValue1 is checked last, so ExtType ends up describing it.
+  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+    return SDValue();
+
+  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
+                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+    return SDValue();
+
+  // The AND is not necessary: rebuild the SUBS on the ADD result and
+  // redirect every user of the old SUBS to the new node.
+  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
+                               SubsNode->getValueType(1));
+  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+
+  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
+  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
+
+  return SDValue(N, 0);
+}
+
 // Optimize compare with zero and branch.
 static SDValue performBRCONDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
+  SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
+  if (NV.getNode())
+    N = NV.getNode();
   SDValue Chain = N->getOperand(0);
   SDValue Dest = N->getOperand(1);
   SDValue CCVal = N->getOperand(2);
@@ -7671,21 +8471,23 @@
   SDValue N0 = N->getOperand(0);
   EVT ResVT = N->getValueType(0);
 
-  if (!N->getOperand(1).getValueType().isVector())
-    return SDValue();
-
   if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
     return SDValue();
 
-  SDLoc DL(N0);
-
+  // If NumMaskElts == 0, the comparison is larger than select result. The
+  // largest real NEON comparison is 64-bits per lane, which means the result is
+  // at most 32-bits and an illegal vector. Just bail out for now.
   EVT SrcVT = N0.getOperand(0).getValueType();
-  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
-                           ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
+  if (!ResVT.isVector() || NumMaskElts == 0)
+    return SDValue();
+
+  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
 
   // First perform a vector comparison, where lane 0 is the one we're interested
   // in.
+  SDLoc DL(N0);
   SDValue LHS =
       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
   SDValue RHS =
@@ -7695,8 +8497,8 @@
   // Now duplicate the comparison mask we want across all other lanes.
   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
   SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
-  Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
-                     Mask);
+  Mask = DAG.getNode(ISD::BITCAST, DL,
+                     ResVT.changeVectorElementTypeToInteger(), Mask);
 
   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
 }
@@ -7737,6 +8539,8 @@
     return performSTORECombine(N, DCI, DAG, Subtarget);
   case AArch64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
+  case AArch64ISD::CSEL:
+    return performCONDCombine(N, DCI, DAG, 2, 3);
   case AArch64ISD::DUP:
     return performPostLD1Combine(N, DCI, false);
   case ISD::INSERT_VECTOR_ELT:
@@ -7890,11 +8694,32 @@
   return true;
 }
 
+static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                  SelectionDAG &DAG) {
+  // Only a bitcast that produces i16 is handled; anything else adds no result.
+  if (N->getValueType(0) != MVT::i16)
+    return;
+  SDLoc DL(N);
+  SDValue Src = N->getOperand(0);
+  assert(Src.getValueType() == MVT::f16 &&
+         "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
+  // Place the f16 in the hsub subregister of an f32, bitcast that to i32,
+  // then truncate to the requested i16.
+  SDNode *Widened = DAG.getMachineNode(
+      TargetOpcode::INSERT_SUBREG, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Src,
+      DAG.getTargetConstant(AArch64::hsub, MVT::i32));
+  SDValue AsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SDValue(Widened, 0));
+  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, AsInt));
+}
+
 void AArch64TargetLowering::ReplaceNodeResults(
     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Don't know how to custom expand this");
+  case ISD::BITCAST:
+    ReplaceBITCASTResults(N, Results, DAG);
+    return;
   case ISD::FP_TO_UINT:
   case ISD::FP_TO_SINT:
     assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
@@ -7903,17 +8728,8 @@
   }
 }
 
-bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
-  // Loads and stores less than 128-bits are already atomic; ones above that
-  // are doomed anyway, so defer to the default libcall and blame the OS when
-  // things go wrong:
-  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
-    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
-  else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
-    return LI->getType()->getPrimitiveSizeInBits() == 128;
-
-  // For the real atomic operations, we have ldxr/stxr up to 128 bits.
-  return Inst->getType()->getPrimitiveSizeInBits() <= 128;
+bool AArch64TargetLowering::useLoadStackGuardNode() const {
+  return true;
 }
 
 TargetLoweringBase::LegalizeTypeAction
@@ -7928,12 +8744,37 @@
   return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 
+// Stores narrower than 128 bits are already atomic and anything wider is
+// doomed anyway (left to the default libcall, with the OS to blame when that
+// goes wrong), so exactly the 128-bit stores are expanded in IR.
+bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  Type *StoredTy = SI->getValueOperand()->getType();
+  return StoredTy->getPrimitiveSizeInBits() == 128;
+}
+
+// Loads narrower than 128 bits are already atomic and anything wider is
+// doomed anyway (left to the default libcall, with the OS to blame when that
+// goes wrong), so exactly the 128-bit loads are expanded in IR.
+bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  Type *LoadedTy = LI->getType();
+  return LoadedTy->getPrimitiveSizeInBits() == 128;
+}
+
+// Read-modify-write operations of up to 128 bits can use ldxr/stxr.
+bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  Type *ValTy = AI->getType();
+  return ValTy->getPrimitiveSizeInBits() <= 128;
+}
+
+bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
+  return true;
+}
+
 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                              AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
-  bool IsAcquire =
-      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsAcquire = isAtLeastAcquire(Ord);
 
   // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
   // intrinsic must return {i64, i64} and we have to recombine them into a
@@ -7968,8 +8809,7 @@
                                                    Value *Val, Value *Addr,
                                                    AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  bool IsRelease =
-      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsRelease = isAtLeastRelease(Ord);
 
   // Since the intrinsics must have legal type, the i128 intrinsics take two
   // parameters: "i64, i64". We must marshal Val into the appropriate form

diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index cb0b9ef..2f5708d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H
-#define LLVM_TARGET_AArch64_ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
 
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -162,6 +162,16 @@
   SITOF,
   UITOF,
 
+  /// Natural vector cast. ISD::BITCAST is not natural in the big-endian
+  /// world w.r.t. vectors, which causes additional REV instructions to be
+  /// generated to compensate for the byte-swapping. But sometimes we do
+  /// need to re-interpret the data in SIMD vector registers in big-endian
+  /// mode without emitting such REV instructions.
+  NVCAST,
+
+  SMULL,
+  UMULL,
+
   // NEON Load/Store with post-increment base updates
   LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LD3post,
@@ -197,10 +207,9 @@
   bool RequireStrictAlign;
 
 public:
-  explicit AArch64TargetLowering(TargetMachine &TM);
+  explicit AArch64TargetLowering(const TargetMachine &TM);
 
-  /// Selects the correct CCAssignFn for a the given CallingConvention
-  /// value.
+  /// Selects the correct CCAssignFn for a given CallingConvention value.
   CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
 
   /// computeKnownBitsForTargetNode - Determine which of the bits specified in
@@ -212,10 +221,11 @@
 
   MVT getScalarShiftAmountTy(EVT LHSTy) const override;
 
-  /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+  /// allowsMisalignedMemoryAccesses - Returns true if the target allows
   /// unaligned memory accesses. of the specified type.
-  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
-                                     bool *Fast = nullptr) const override {
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+                                      unsigned Align = 1,
+                                      bool *Fast = nullptr) const override {
     if (RequireStrictAlign)
       return false;
     // FIXME: True for Cyclone, but not necessary others.
@@ -317,13 +327,17 @@
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                          Type *Ty) const override;
 
+  bool hasLoadLinkedStoreConditional() const override;
   Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                         AtomicOrdering Ord) const override;
   Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                               Value *Addr, AtomicOrdering Ord) const override;
 
-  bool shouldExpandAtomicInIR(Instruction *Inst) const override;
+  bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+  bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+  bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
 
+  bool useLoadStackGuardNode() const override;
   TargetLoweringBase::LegalizeTypeAction
   getPreferredVectorAction(EVT VT) const override;
 
@@ -424,6 +438,9 @@
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+                        std::vector<SDNode *> *Created) const override;
+
   ConstraintType
   getConstraintType(const std::string &Constraint) const override;
   unsigned getRegisterByName(const char* RegName, EVT VT) const override;
@@ -464,4 +481,4 @@
 
 } // end namespace llvm
 
-#endif // LLVM_TARGET_AArch64_ISELLOWERING_H
+#endif

diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index 3b9e3c6..4923a11 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td

@@ -29,8 +29,7 @@
 class acquiring_load<PatFrag base>
   : PatFrag<(ops node:$ptr), (base node:$ptr), [{
   AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  assert(Ordering != AcquireRelease && "unexpected load ordering");
-  return Ordering == Acquire || Ordering == SequentiallyConsistent;
+  return isAtLeastAcquire(Ordering);
 }]>;
 
 // An atomic load operation that does not need either acquire or release
@@ -38,7 +37,7 @@
 class relaxed_load<PatFrag base>
   : PatFrag<(ops node:$ptr), (base node:$ptr), [{
   AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  return Ordering == Monotonic || Ordering == Unordered;
+  return !isAtLeastAcquire(Ordering);
 }]>;
 
 // 8-bit loads
@@ -114,14 +113,14 @@
   : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
   AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
   assert(Ordering != AcquireRelease && "unexpected store ordering");
-  return Ordering == Release || Ordering == SequentiallyConsistent;
+  return isAtLeastRelease(Ordering);
 }]>;
 
 // An atomic store operation that doesn't actually need to be atomic on AArch64.
 class relaxed_store<PatFrag base>
   : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
   AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  return Ordering == Monotonic || Ordering == Unordered;
+  return !isAtLeastRelease(Ordering);
 }]>;
 
 // 8-bit stores

diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 4876c7d..2b0f5d2 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td

@@ -539,6 +539,11 @@
   let ParserMatchClass = Imm0_7Operand;
 }
 
+// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
+def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 16;
+}]>;
+
 // An arithmetic shifter operand:
 //  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
 //  {5-0} - imm6
@@ -776,15 +781,17 @@
 
 // Base encoding for system instruction operands.
 let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands>
-    : I<oops, iops, asm, operands, "", []> {
+class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands,
+                  list<dag> pattern = []>
+    : I<oops, iops, asm, operands, "", pattern> {
   let Inst{31-22} = 0b1101010100;
   let Inst{21}    = L;
 }
 
 // System instructions which do not have an Rt register.
-class SimpleSystemI<bit L, dag iops, string asm, string operands>
-    : BaseSystemI<L, (outs), iops, asm, operands> {
+class SimpleSystemI<bit L, dag iops, string asm, string operands,
+                    list<dag> pattern = []>
+    : BaseSystemI<L, (outs), iops, asm, operands, pattern> {
   let Inst{4-0} = 0b11111;
 }
 
@@ -797,13 +804,17 @@
 }
 
 // Hint instructions that take both a CRm and a 3-bit immediate.
-class HintI<string mnemonic>
-    : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">,
-      Sched<[WriteHint]> {
-  bits <7> imm;
-  let Inst{20-12} = 0b000110010;
-  let Inst{11-5} = imm;
-}
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity
+let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in
+  class HintI<string mnemonic>
+      : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "",
+                      [(int_aarch64_hint imm0_127:$imm)]>,
+        Sched<[WriteHint]> {
+    bits <7> imm;
+    let Inst{20-12} = 0b000110010;
+    let Inst{11-5} = imm;
+  }
 
 // System instructions taking a single literal operand which encodes into
 // CRm. op2 differentiates the opcodes.
@@ -815,8 +826,9 @@
   let PrintMethod = "printBarrierOption";
   let ParserMatchClass = BarrierAsmOperand;
 }
-class CRmSystemI<Operand crmtype, bits<3> opc, string asm>
-    : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">,
+class CRmSystemI<Operand crmtype, bits<3> opc, string asm,
+                 list<dag> pattern = []>
+    : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>,
       Sched<[WriteBarrier]> {
   bits<4> CRm;
   let Inst{20-12} = 0b000110011;
@@ -831,7 +843,7 @@
   let ParserMethod = "tryParseSysReg";
   let DiagnosticType = "MRS";
 }
-// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
+// concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate.
 def mrs_sysreg_op : Operand<i32> {
   let ParserMatchClass = MRSSystemRegisterOperand;
   let DecoderMethod = "DecodeMRSSystemRegister";
@@ -851,9 +863,8 @@
 
 class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
                        "mrs", "\t$Rt, $systemreg"> {
-  bits<15> systemreg;
-  let Inst{20} = 1;
-  let Inst{19-5} = systemreg;
+  bits<16> systemreg;
+  let Inst{20-5} = systemreg;
 }
 
 // FIXME: Some of these def NZCV, others don't. Best way to model that?
@@ -861,9 +872,8 @@
 // would do it, but feels like overkill at this point.
 class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
                        "msr", "\t$systemreg, $Rt"> {
-  bits<15> systemreg;
-  let Inst{20} = 1;
-  let Inst{19-5} = systemreg;
+  bits<16> systemreg;
+  let Inst{20-5} = systemreg;
 }
 
 def SystemPStateFieldOperand : AsmOperandClass {
@@ -1339,14 +1349,15 @@
 }
 
 multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+  // MADD/MSUB generation is decided by MachineCombiner.cpp
   def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
-      [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
+      [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
       Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
     let Inst{31} = 0;
   }
 
   def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
-      [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
+      [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
       Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
     let Inst{31} = 1;
   }
@@ -2985,7 +2996,7 @@
     : BaseLoadStorePreIdx<sz, V, opc,
                      (outs GPR64sp:$wback, regtype:$Rt),
                      (ins GPR64sp:$Rn, simm9:$offset), asm,
-                     "$Rn = $wback", []>,
+                     "$Rn = $wback,@earlyclobber $wback", []>,
       Sched<[WriteLD, WriteAdr]>;
 
 let mayStore = 1, mayLoad = 0 in
@@ -2994,7 +3005,7 @@
     : BaseLoadStorePreIdx<sz, V, opc,
                       (outs GPR64sp:$wback),
                       (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
-                      asm, "$Rn = $wback",
+                      asm, "$Rn = $wback,@earlyclobber $wback",
       [(set GPR64sp:$wback,
             (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
       Sched<[WriteAdr, WriteST]>;
@@ -3004,7 +3015,6 @@
 // Load/store post-indexed
 //---
 
-// (pre-index) load/stores.
 class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
                           string asm, string cstr, list<dag> pat>
     : I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
@@ -3032,7 +3042,7 @@
     : BaseLoadStorePostIdx<sz, V, opc,
                       (outs GPR64sp:$wback, regtype:$Rt),
                       (ins GPR64sp:$Rn, simm9:$offset),
-                      asm, "$Rn = $wback", []>,
+                      asm, "$Rn = $wback,@earlyclobber $wback", []>,
       Sched<[WriteLD, WriteI]>;
 
 let mayStore = 1, mayLoad = 0 in
@@ -3041,7 +3051,7 @@
     : BaseLoadStorePostIdx<sz, V, opc,
                       (outs GPR64sp:$wback),
                       (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
-                       asm, "$Rn = $wback",
+                       asm, "$Rn = $wback,@earlyclobber $wback",
       [(set GPR64sp:$wback,
             (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
     Sched<[WriteAdr, WriteST, ReadAdrBase]>;
@@ -3105,7 +3115,7 @@
 // (pre-indexed)
 class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
                               string asm>
-    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback", []> {
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> {
   bits<5> Rt;
   bits<5> Rt2;
   bits<5> Rn;
@@ -3146,7 +3156,7 @@
 
 class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
                               string asm>
-    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback", []> {
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> {
   bits<5> Rt;
   bits<5> Rt2;
   bits<5> Rn;
@@ -5250,6 +5260,10 @@
   def v2i64  : BaseSIMDZipVector<0b111, opc, V128,
       asm, ".2d", OpNode, v2i64>;
 
+  def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)),
+        (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
+  def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
   def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
         (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
   def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),

diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index b702275..2dbb31c 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp

@@ -14,6 +14,7 @@
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "AArch64MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
@@ -260,8 +261,9 @@
     BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
   } else {
     // Folded compare-and-branch
+    // Note that we use addOperand instead of addReg to keep the flags.
     const MachineInstrBuilder MIB =
-        BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg());
+        BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]);
     if (Cond.size() > 3)
       MIB.addImm(Cond[3].getImm());
     MIB.addMBB(TBB);
@@ -606,6 +608,42 @@
   }
 }
 
+bool
+AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+                                                  MachineInstr *MIb,
+                                                  AliasAnalysis *AA) const {
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned BaseRegA = 0, BaseRegB = 0;
+  int OffsetA = 0, OffsetB = 0;
+  int WidthA = 0, WidthB = 0;
+
+  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+         "MIa must be a store or a load");
+  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+         "MIb must be a store or a load");
+
+  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
+      MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+    return false;
+
+  // Retrieve the base register, offset from the base register and width. Width
+  // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).  If
+  // base registers are identical, and the offset of a lower memory access +
+  // the width doesn't overlap the offset of a higher memory access,
+  // then the memory accesses are different.
+  if (getLdStBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
+      getLdStBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
+    if (BaseRegA == BaseRegB) {
+      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+      if (LowOffset + LowWidth <= HighOffset)
+        return true;
+    }
+  }
+  return false;
+}
+
 /// analyzeCompare - For a comparison instruction, return the source registers
 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
 /// Return true if the comparison instruction can be analyzed.
@@ -640,7 +678,8 @@
     SrcReg = MI->getOperand(1).getReg();
     SrcReg2 = 0;
     CmpMask = ~0;
-    CmpValue = MI->getOperand(2).getImm();
+    // FIXME: Normalize CmpValue to 0 or 1 (it is only compared with zero).
+    CmpValue = (MI->getOperand(2).getImm() != 0);
     return true;
   case AArch64::ANDSWri:
   case AArch64::ANDSXri:
@@ -649,9 +688,14 @@
     SrcReg = MI->getOperand(1).getReg();
     SrcReg2 = 0;
     CmpMask = ~0;
-    CmpValue = AArch64_AM::decodeLogicalImmediate(
-        MI->getOperand(2).getImm(),
-        MI->getOpcode() == AArch64::ANDSWri ? 32 : 64);
+    // FIXME: The return type of decodeLogicalImmediate is uint64_t, while
+    // the type of CmpValue is int. Converting uint64_t to int loses the
+    // high 32 bits, which in fact caused a bug in spec2006-483.xalancbmk.
+    // CmpValue is only compared with zero in optimizeCompareInstr, so it is
+    // normalized to 0 or 1 here.
+    CmpValue = (AArch64_AM::decodeLogicalImmediate(
+                    MI->getOperand(2).getImm(),
+                    MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0);
     return true;
   }
 
@@ -664,8 +708,8 @@
   MachineFunction *MF = MBB->getParent();
   assert(MF && "Can't get MachineFunction here");
   const TargetMachine *TM = &MF->getTarget();
-  const TargetInstrInfo *TII = TM->getInstrInfo();
-  const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+  const TargetInstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+  const TargetRegisterInfo *TRI = TM->getSubtargetImpl()->getRegisterInfo();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
 
   for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
@@ -697,6 +741,87 @@
   return true;
 }
 
+/// \brief Return the opcode that does not set flags when possible - otherwise
+/// return the original opcode. The caller is responsible for doing the actual
+/// substitution and legality checking.
+static unsigned convertFlagSettingOpcode(const MachineInstr *MI) {
+  // Don't convert all compare instructions, because for some the zero register
+  // encoding becomes the sp register.
+  bool MIDefinesZeroReg = false;
+  if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR))
+    MIDefinesZeroReg = true;
+
+  switch (MI->getOpcode()) {
+  default:
+    return MI->getOpcode();
+  case AArch64::ADDSWrr:
+    return AArch64::ADDWrr;
+  case AArch64::ADDSWri:
+    return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
+  case AArch64::ADDSWrs:
+    return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
+  case AArch64::ADDSWrx:
+    return AArch64::ADDWrx;
+  case AArch64::ADDSXrr:
+    return AArch64::ADDXrr;
+  case AArch64::ADDSXri:
+    return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
+  case AArch64::ADDSXrs:
+    return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
+  case AArch64::ADDSXrx:
+    return AArch64::ADDXrx;
+  case AArch64::SUBSWrr:
+    return AArch64::SUBWrr;
+  case AArch64::SUBSWri:
+    return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
+  case AArch64::SUBSWrs:
+    return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
+  case AArch64::SUBSWrx:
+    return AArch64::SUBWrx;
+  case AArch64::SUBSXrr:
+    return AArch64::SUBXrr;
+  case AArch64::SUBSXri:
+    return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
+  case AArch64::SUBSXrs:
+    return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
+  case AArch64::SUBSXrx:
+    return AArch64::SUBXrx;
+  }
+}
+
+/// True when the condition code could be modified on the instruction
+/// trace starting at \p From and ending at \p To.
+static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To,
+                                  const bool CheckOnlyCCWrites,
+                                  const TargetRegisterInfo *TRI) {
+  // We iterate backward starting at \p To until we hit \p From.
+  MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin();
+
+  // Early exit if To is at the beginning of the BB.
+  if (I == B)
+    return true;
+
+  // Check whether the definition of SrcReg is in the same basic block as
+  // Compare. If not, assume the condition code gets modified on some path.
+  if (To->getParent() != From->getParent())
+    return true;
+
+  // Check that NZCV isn't set on the trace.
+  for (--I; I != E; --I) {
+    const MachineInstr &Instr = *I;
+
+    if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
+        (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI)))
+      // This instruction modifies or uses NZCV after the one we want to
+      // change.
+      return true;
+    if (I == B)
+      // We currently don't allow the instruction trace to cross basic
+      // block boundaries
+      return true;
+  }
+  return false;
+}
 /// optimizeCompareInstr - Convert the instruction supplying the argument to the
 /// comparison into one that sets the zero bit in the flags register.
 bool AArch64InstrInfo::optimizeCompareInstr(
@@ -706,28 +831,15 @@
   // Replace SUBSWrr with SUBWrr if NZCV is not used.
   int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
   if (Cmp_NZCV != -1) {
-    unsigned NewOpc;
-    switch (CmpInstr->getOpcode()) {
-    default:
-      return false;
-    case AArch64::ADDSWrr:      NewOpc = AArch64::ADDWrr; break;
-    case AArch64::ADDSWri:      NewOpc = AArch64::ADDWri; break;
-    case AArch64::ADDSWrs:      NewOpc = AArch64::ADDWrs; break;
-    case AArch64::ADDSWrx:      NewOpc = AArch64::ADDWrx; break;
-    case AArch64::ADDSXrr:      NewOpc = AArch64::ADDXrr; break;
-    case AArch64::ADDSXri:      NewOpc = AArch64::ADDXri; break;
-    case AArch64::ADDSXrs:      NewOpc = AArch64::ADDXrs; break;
-    case AArch64::ADDSXrx:      NewOpc = AArch64::ADDXrx; break;
-    case AArch64::SUBSWrr:      NewOpc = AArch64::SUBWrr; break;
-    case AArch64::SUBSWri:      NewOpc = AArch64::SUBWri; break;
-    case AArch64::SUBSWrs:      NewOpc = AArch64::SUBWrs; break;
-    case AArch64::SUBSWrx:      NewOpc = AArch64::SUBWrx; break;
-    case AArch64::SUBSXrr:      NewOpc = AArch64::SUBXrr; break;
-    case AArch64::SUBSXri:      NewOpc = AArch64::SUBXri; break;
-    case AArch64::SUBSXrs:      NewOpc = AArch64::SUBXrs; break;
-    case AArch64::SUBSXrx:      NewOpc = AArch64::SUBXrx; break;
+    if (CmpInstr->definesRegister(AArch64::WZR) ||
+        CmpInstr->definesRegister(AArch64::XZR)) {
+      CmpInstr->eraseFromParent();
+      return true;
     }
-
+    unsigned Opc = CmpInstr->getOpcode();
+    unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
+    if (NewOpc == Opc)
+      return false;
     const MCInstrDesc &MCID = get(NewOpc);
     CmpInstr->setDesc(MCID);
     CmpInstr->RemoveOperand(Cmp_NZCV);
@@ -738,6 +850,9 @@
   }
 
   // Continue only if we have a "ri" where immediate is zero.
+  // FIXME: CmpValue has already been converted to 0 or 1 in the
+  // analyzeCompare function.
+  assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
   if (CmpValue != 0 || SrcReg2 != 0)
     return false;
 
@@ -750,36 +865,10 @@
   if (!MI)
     return false;
 
-  // We iterate backward, starting from the instruction before CmpInstr and
-  // stop when reaching the definition of the source register or done with the
-  // basic block, to check whether NZCV is used or modified in between.
-  MachineBasicBlock::iterator I = CmpInstr, E = MI,
-                              B = CmpInstr->getParent()->begin();
-
-  // Early exit if CmpInstr is at the beginning of the BB.
-  if (I == B)
-    return false;
-
-  // Check whether the definition of SrcReg is in the same basic block as
-  // Compare. If not, we can't optimize away the Compare.
-  if (MI->getParent() != CmpInstr->getParent())
-    return false;
-
-  // Check that NZCV isn't set between the comparison instruction and the one we
-  // want to change.
+  bool CheckOnlyCCWrites = false;
   const TargetRegisterInfo *TRI = &getRegisterInfo();
-  for (--I; I != E; --I) {
-    const MachineInstr &Instr = *I;
-
-    if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
-        Instr.readsRegister(AArch64::NZCV, TRI))
-      // This instruction modifies or uses NZCV after the one we want to
-      // change. We can't do this transformation.
-      return false;
-    if (I == B)
-      // The 'and' is below the comparison instruction.
-      return false;
-  }
+  if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI))
+    return false;
 
   unsigned NewOpc = MI->getOpcode();
   switch (MI->getOpcode()) {
@@ -893,6 +982,56 @@
   return true;
 }
 
+bool
+AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
+    return false;
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Reg = MI->getOperand(0).getReg();
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+  const TargetMachine &TM = MBB.getParent()->getTarget();
+  unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
+  const unsigned char MO_NC = AArch64II::MO_NC;
+
+  if ((OpFlags & AArch64II::MO_GOT) != 0) {
+    BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
+        .addGlobalAddress(GV, 0, AArch64II::MO_GOT);
+    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+        .addReg(Reg, RegState::Kill).addImm(0)
+        .addMemOperand(*MI->memoperands_begin());
+  } else if (TM.getCodeModel() == CodeModel::Large) {
+    BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
+        .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
+    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
+    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
+    BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
+    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+        .addReg(Reg, RegState::Kill).addImm(0)
+        .addMemOperand(*MI->memoperands_begin());
+  } else {
+    BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
+        .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
+    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
+    BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+        .addReg(Reg, RegState::Kill)
+        .addGlobalAddress(GV, 0, LoFlags)
+        .addMemOperand(*MI->memoperands_begin());
+  }
+
+  MBB.erase(MI);
+
+  return true;
+}
+
 /// Return true if this is this instruction has a non-zero immediate
 bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
   switch (MI->getOpcode()) {
@@ -1008,12 +1147,14 @@
              MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
       return true;
     }
+    break;
   case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
     if (MI->getOperand(2).getImm() == 0) {
       assert(MI->getDesc().getNumOperands() == 4 &&
              MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
       return true;
     }
+    break;
   }
   return false;
 }
@@ -1036,6 +1177,7 @@
              "invalid ORRv16i8 operands");
       return true;
     }
+    break;
   }
   return false;
 }
@@ -1197,6 +1339,102 @@
   };
 }
 
+bool AArch64InstrInfo::getLdStBaseRegImmOfsWidth(
+    MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width,
+    const TargetRegisterInfo *TRI) const {
+  // Handle only loads/stores with base register followed by immediate offset.
+  if (LdSt->getNumOperands() != 3)
+    return false;
+  if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
+    return false;
+
+  // Offset is calculated as the immediate operand multiplied by the scaling factor.
+  // Unscaled instructions have scaling factor set to 1.
+  int Scale = 0;
+  switch (LdSt->getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDURQi:
+  case AArch64::STURQi:
+    Width = 16;
+    Scale = 1;
+    break;
+  case AArch64::LDURXi:
+  case AArch64::LDURDi:
+  case AArch64::STURXi:
+  case AArch64::STURDi:
+    Width = 8;
+    Scale = 1;
+    break;
+  case AArch64::LDURWi:
+  case AArch64::LDURSi:
+  case AArch64::LDURSWi:
+  case AArch64::STURWi:
+  case AArch64::STURSi:
+    Width = 4;
+    Scale = 1;
+    break;
+  case AArch64::LDURHi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSHWi:
+  case AArch64::STURHi:
+  case AArch64::STURHHi:
+    Width = 2;
+    Scale = 1;
+    break;
+  case AArch64::LDURBi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSBWi:
+  case AArch64::STURBi:
+  case AArch64::STURBBi:
+    Width = 1;
+    Scale = 1;
+    break;
+  case AArch64::LDRXui:
+  case AArch64::STRXui:
+    Scale = Width = 8;
+    break;
+  case AArch64::LDRWui:
+  case AArch64::STRWui:
+    Scale = Width = 4;
+    break;
+  case AArch64::LDRBui:
+  case AArch64::STRBui:
+    Scale = Width = 1;
+    break;
+  case AArch64::LDRHui:
+  case AArch64::STRHui:
+    Scale = Width = 2;
+    break;
+  case AArch64::LDRSui:
+  case AArch64::STRSui:
+    Scale = Width = 4;
+    break;
+  case AArch64::LDRDui:
+  case AArch64::STRDui:
+    Scale = Width = 8;
+    break;
+  case AArch64::LDRQui:
+  case AArch64::STRQui:
+    Scale = Width = 16;
+    break;
+  case AArch64::LDRBBui:
+  case AArch64::STRBBui:
+    Scale = Width = 1;
+    break;
+  case AArch64::LDRHHui:
+  case AArch64::STRHHui:
+    Scale = Width = 2;
+    break;
+  };
+
+  BaseReg = LdSt->getOperand(1).getReg();
+  Offset = LdSt->getOperand(2).getImm() * Scale;
+  return true;
+}
+
 /// Detect opportunities for ldp/stp formation.
 ///
 /// Only called for LdSt for which getLdStBaseRegImmOfs returns true.
@@ -1239,16 +1477,15 @@
   }
 }
 
-MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
-                                                         int FrameIx,
-                                                         uint64_t Offset,
-                                                         const MDNode *MDPtr,
-                                                         DebugLoc DL) const {
+MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
+    MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
+    const MDNode *Expr, DebugLoc DL) const {
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
                                 .addFrameIndex(FrameIx)
                                 .addImm(0)
                                 .addImm(Offset)
-                                .addMetadata(MDPtr);
+                                .addMetadata(Var)
+                                .addMetadata(Expr);
   return &*MIB;
 }
 
@@ -2132,3 +2369,592 @@
   NopInst.setOpcode(AArch64::HINT);
   NopInst.addOperand(MCOperand::CreateImm(0));
 }
+/// useMachineCombiner - Tell the caller whether this target supports the
+/// MachineCombiner. AArch64 always answers yes.
+bool AArch64InstrInfo::useMachineCombiner() const {
+  const bool SupportsCombiner = true;
+  return SupportsCombiner;
+}
+//
+// Returns true when Opc is one of the flag-setting ADDS/SUBS opcodes below.
+static bool isCombineInstrSettingFlag(unsigned Opc) {
+  switch (Opc) {
+  // Register-register forms.
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSXrr:
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSXrr:
+  // Register-immediate forms.
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXri:
+  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXri:
+    return true;
+  }
+  return false;
+}
+//
+// Returns true when Opc is a 32-bit ADD/SUB opcode (plain or flag-setting)
+// that can be combined with a MUL.
+static bool isCombineInstrCandidate32(unsigned Opc) {
+  switch (Opc) {
+  // Additions.
+  case AArch64::ADDWrr:
+  case AArch64::ADDWri:
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  // Subtractions.
+  case AArch64::SUBWrr:
+  case AArch64::SUBSWrr:
+  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+  case AArch64::SUBWri:
+  case AArch64::SUBSWri:
+    return true;
+  }
+  return false;
+}
+//
+// Returns true when Opc is a 64-bit ADD/SUB opcode (plain or flag-setting)
+// that can be combined with a MUL.
+static bool isCombineInstrCandidate64(unsigned Opc) {
+  switch (Opc) {
+  // Additions.
+  case AArch64::ADDXrr:
+  case AArch64::ADDXri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  // Subtractions.
+  case AArch64::SUBXrr:
+  case AArch64::SUBSXrr:
+  // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+  case AArch64::SUBXri:
+  case AArch64::SUBSXri:
+    return true;
+  }
+  return false;
+}
+//
+// Returns true when Opc is a 32-bit or a 64-bit opcode that can be combined
+// with a MUL.
+static bool isCombineInstrCandidate(unsigned Opc) {
+  if (isCombineInstrCandidate32(Opc))
+    return true;
+  return isCombineInstrCandidate64(Opc);
+}
+
+/// Check whether operand \p MO is the result of a multiply that can be folded
+/// into the instruction using it.
+///
+/// This holds when \p MO is a virtual register whose unique definition is an
+/// instruction with opcode \p MulOpc inside \p MBB, when operand 3 of that
+/// definition is \p ZeroReg, and when the defined register has exactly one
+/// non-debug use.
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+                              unsigned MulOpc, unsigned ZeroReg) {
+  // We need a virtual register definition.
+  if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    return false;
+
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineInstr *MI = MRI.getUniqueVRegDef(MO.getReg());
+  if (!MI)
+    return false;
+  // And it needs to be in the trace (otherwise, it won't have a depth).
+  if (MI->getParent() != &MBB)
+    return false;
+  if ((unsigned)MI->getOpcode() != MulOpc)
+    return false;
+
+  assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+         MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+         MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
+
+  // The third input reg must be zero, and the product must only be used by
+  // the instruction we combine with.
+  const bool ThirdInputIsZero = MI->getOperand(3).getReg() == ZeroReg;
+  return ThirdInputIsZero && MRI.hasOneNonDBGUse(MI->getOperand(0).getReg());
+}
+
+/// hasPattern - Collect the combiner patterns that apply to the instruction
+/// chain ending in \p Root and report whether any was found.
+///
+/// Every matching pattern is appended to \p Pattern. Patterns are appended in
+/// priority order, since the pattern evaluator stops checking as soon as it
+/// finds a faster sequence.
+
+bool AArch64InstrInfo::hasPattern(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
+  unsigned Opc = Root.getOpcode();
+  if (!isCombineInstrCandidate(Opc))
+    return false;
+
+  if (isCombineInstrSettingFlag(Opc)) {
+    // When NZCV is live bail out.
+    if (Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+      return false;
+    // When opcode can't change bail out.
+    // CHECKME: do we miss any cases for opcode conversion?
+    const unsigned NewOpc = convertFlagSettingOpcode(&Root);
+    if (NewOpc == Opc)
+      return false;
+    Opc = NewOpc;
+  }
+
+  MachineBasicBlock &MBB = *Root.getParent();
+  bool Found = false;
+
+  // Record pattern P when operand OpIdx of Root is a combinable multiply.
+  auto TryOperand = [&](unsigned OpIdx, unsigned MulOpc, unsigned ZeroReg,
+                        MachineCombinerPattern::MC_PATTERN P) {
+    if (!canCombineWithMUL(MBB, Root.getOperand(OpIdx), MulOpc, ZeroReg))
+      return;
+    Pattern.push_back(P);
+    Found = true;
+  };
+
+  switch (Opc) {
+  default:
+    break;
+  case AArch64::ADDWrr:
+    assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+           "ADDWrr does not have register operands");
+    TryOperand(1, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MC_MULADDW_OP1);
+    TryOperand(2, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MC_MULADDW_OP2);
+    break;
+  case AArch64::ADDXrr:
+    TryOperand(1, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MC_MULADDX_OP1);
+    TryOperand(2, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MC_MULADDX_OP2);
+    break;
+  case AArch64::SUBWrr:
+    TryOperand(1, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MC_MULSUBW_OP1);
+    TryOperand(2, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MC_MULSUBW_OP2);
+    break;
+  case AArch64::SUBXrr:
+    TryOperand(1, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MC_MULSUBX_OP1);
+    TryOperand(2, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MC_MULSUBX_OP2);
+    break;
+  // For the immediate forms only operand 1 can be a register.
+  case AArch64::ADDWri:
+    TryOperand(1, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MC_MULADDWI_OP1);
+    break;
+  case AArch64::ADDXri:
+    TryOperand(1, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MC_MULADDXI_OP1);
+    break;
+  case AArch64::SUBWri:
+    TryOperand(1, AArch64::MADDWrrr, AArch64::WZR,
+               MachineCombinerPattern::MC_MULSUBWI_OP1);
+    break;
+  case AArch64::SUBXri:
+    TryOperand(1, AArch64::MADDXrrr, AArch64::XZR,
+               MachineCombinerPattern::MC_MULSUBXI_OP1);
+    break;
+  }
+  return Found;
+}
+
+/// genMadd - Build the multiply-accumulate instruction that replaces a
+/// mul + add pair.
+/// Example:
+///  MUL I=A,B,0
+///  ADD R,I,C
+///  ==> MADD R,A,B,C
+/// \param Root is the ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions; the generated
+/// madd instruction is appended to it
+/// \param IdxMulOpd is the index of the operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc is the opcode of the madd instruction
+/// \param RC is the register class that every virtual register used by the
+/// new instruction is constrained to
+/// \returns the MUL instruction feeding \p Root
+static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
+                             const TargetInstrInfo *TII, MachineInstr &Root,
+                             SmallVectorImpl<MachineInstr *> &InsInstrs,
+                             unsigned IdxMulOpd, unsigned MaddOpc,
+                             const TargetRegisterClass *RC) {
+  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+  const unsigned ResultReg = Root.getOperand(0).getReg();
+
+  // The two factors come from the multiply ...
+  const MachineOperand &Factor0 = MUL->getOperand(1);
+  const MachineOperand &Factor1 = MUL->getOperand(2);
+  // ... and the addend is the source operand of Root that is not the product.
+  const MachineOperand &Addend = Root.getOperand(IdxMulOpd == 1 ? 2 : 1);
+
+  const unsigned SrcReg0 = Factor0.getReg();
+  const unsigned SrcReg1 = Factor1.getReg();
+  const unsigned SrcReg2 = Addend.getReg();
+  const unsigned Src0Flags = getKillRegState(Factor0.isKill());
+  const unsigned Src1Flags = getKillRegState(Factor1.isKill());
+  const unsigned Src2Flags = getKillRegState(Addend.isKill());
+
+  // Physical registers are left alone; virtual ones must fit RC.
+  const unsigned UsedRegs[] = {ResultReg, SrcReg0, SrcReg1, SrcReg2};
+  for (unsigned Reg : UsedRegs)
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      MRI.constrainRegClass(Reg, RC);
+
+  MachineInstrBuilder MIB =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+          .addReg(SrcReg0, Src0Flags)
+          .addReg(SrcReg1, Src1Flags)
+          .addReg(SrcReg2, Src2Flags);
+  // Insert the MADD
+  InsInstrs.push_back(MIB);
+  return MUL;
+}
+
+/// genMaddR - Build the multiply-accumulate instruction that replaces a
+/// mul + add pair when the addend lives in an extra virtual register.
+/// Example - an ADD intermediate needs to be stored in a register:
+///   MUL I=A,B,0
+///   ADD R,I,Imm
+///   ==> ORR  V, ZR, Imm
+///   ==> MADD R,A,B,V
+/// \param Root is the ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions; the generated
+/// madd instruction is appended to it
+/// \param IdxMulOpd is the index of the operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc is the opcode of the madd instruction
+/// \param VR is a virtual register that holds the value of an ADD operand
+/// (V in the example above).
+/// \param RC is the register class that every virtual register used by the
+/// new instruction is constrained to
+/// \returns the MUL instruction feeding \p Root
+static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
+                              const TargetInstrInfo *TII, MachineInstr &Root,
+                              SmallVectorImpl<MachineInstr *> &InsInstrs,
+                              unsigned IdxMulOpd, unsigned MaddOpc,
+                              unsigned VR, const TargetRegisterClass *RC) {
+  assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+  MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+  const unsigned ResultReg = Root.getOperand(0).getReg();
+
+  // Both factors are taken from the multiply; the addend is VR.
+  const MachineOperand &Factor0 = MUL->getOperand(1);
+  const MachineOperand &Factor1 = MUL->getOperand(2);
+  const unsigned SrcReg0 = Factor0.getReg();
+  const unsigned SrcReg1 = Factor1.getReg();
+  const unsigned Src0Flags = getKillRegState(Factor0.isKill());
+  const unsigned Src1Flags = getKillRegState(Factor1.isKill());
+
+  // Physical registers are left alone; virtual ones must fit RC.
+  const unsigned UsedRegs[] = {ResultReg, SrcReg0, SrcReg1, VR};
+  for (unsigned Reg : UsedRegs)
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      MRI.constrainRegClass(Reg, RC);
+
+  MachineInstrBuilder MIB =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
+          .addReg(SrcReg0, Src0Flags)
+          .addReg(SrcReg1, Src1Flags)
+          .addReg(VR);
+  // Insert the MADD
+  InsInstrs.push_back(MIB);
+  return MUL;
+}
+
+/// genAlternativeCodeSequence - when hasPattern() finds a pattern
+/// this function generates the instructions that could replace the
+/// original code sequence.
+///
+/// \param Root is the ADD/SUB instruction that ends the sequence
+/// \param Pattern is the pattern to generate code for
+/// \param [out] InsInstrs receives the replacement instructions
+/// \param [out] DelInstrs receives the MUL and \p Root, i.e. the instructions
+/// the replacement makes redundant
+/// \param [out] InstrIdxForVirtReg receives an entry for each virtual register
+/// created here; the value 0 recorded for it is the position of its defining
+/// instruction in \p InsInstrs (assuming \p InsInstrs was empty on entry)
+///
+/// When no replacement can be generated (an unhandled pattern, or an
+/// immediate that cannot be encoded as a logical immediate) \p InsInstrs and
+/// \p DelInstrs are left unmodified.
+void AArch64InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  // MUL stays null until a replacement sequence has actually been generated;
+  // it must never be recorded for deletion while still unset.
+  MachineInstr *MUL = nullptr;
+  const TargetRegisterClass *RC = nullptr;
+  unsigned Opc = 0;
+  switch (Pattern) {
+  default:
+    // signal error.
+    break;
+  case MachineCombinerPattern::MC_MULADDW_OP1:
+  case MachineCombinerPattern::MC_MULADDX_OP1:
+    // MUL I=A,B,0
+    // ADD R,I,C
+    // ==> MADD R,A,B,C
+    // --- Create(MADD);
+    if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) {
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+    break;
+  case MachineCombinerPattern::MC_MULADDW_OP2:
+  case MachineCombinerPattern::MC_MULADDX_OP2:
+    // MUL I=A,B,0
+    // ADD R,C,I
+    // ==> MADD R,A,B,C
+    // --- Create(MADD);
+    if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) {
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::MC_MULADDWI_OP1:
+  case MachineCombinerPattern::MC_MULADDXI_OP1: {
+    // MUL I=A,B,0
+    // ADD R,I,Imm
+    // ==> ORR  V, ZR, Imm
+    // ==> MADD R,A,B,V
+    // --- Create(MADD);
+    const TargetRegisterClass *OrrRC;
+    unsigned BitSize, OrrOpc, ZeroReg;
+    if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) {
+      OrrOpc = AArch64::ORRWri;
+      OrrRC = &AArch64::GPR32spRegClass;
+      BitSize = 32;
+      ZeroReg = AArch64::WZR;
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      OrrOpc = AArch64::ORRXri;
+      OrrRC = &AArch64::GPR64spRegClass;
+      BitSize = 64;
+      ZeroReg = AArch64::XZR;
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+    uint64_t Imm = Root.getOperand(2).getImm();
+
+    if (Root.getOperand(3).isImm()) {
+      unsigned Val = Root.getOperand(3).getImm();
+      Imm = Imm << Val;
+    }
+    // Keep only the low BitSize bits of the immediate.
+    uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+    uint64_t Encoding;
+    // The combine is only done when the immediate fits a single ORR.
+    if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+      MachineInstrBuilder MIB1 =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+              .addReg(ZeroReg)
+              .addImm(Encoding);
+      InsInstrs.push_back(MIB1);
+      InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+      MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+    }
+    break;
+  }
+  case MachineCombinerPattern::MC_MULSUBW_OP1:
+  case MachineCombinerPattern::MC_MULSUBX_OP1: {
+    // MUL I=A,B,0
+    // SUB R,I, C
+    // ==> SUB  V, 0, C
+    // ==> MADD R,A,B,V // = -C + A*B
+    // --- Create(MADD);
+    const TargetRegisterClass *SubRC;
+    unsigned SubOpc, ZeroReg;
+    if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) {
+      SubOpc = AArch64::SUBWrr;
+      SubRC = &AArch64::GPR32spRegClass;
+      ZeroReg = AArch64::WZR;
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      SubOpc = AArch64::SUBXrr;
+      SubRC = &AArch64::GPR64spRegClass;
+      ZeroReg = AArch64::XZR;
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    unsigned NewVR = MRI.createVirtualRegister(SubRC);
+    // SUB NewVR, 0, C
+    MachineInstrBuilder MIB1 =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
+            .addReg(ZeroReg)
+            .addOperand(Root.getOperand(2));
+    InsInstrs.push_back(MIB1);
+    InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+    MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+    break;
+  }
+  case MachineCombinerPattern::MC_MULSUBW_OP2:
+  case MachineCombinerPattern::MC_MULSUBX_OP2:
+    // MUL I=A,B,0
+    // SUB R,C,I
+    // ==> MSUB R,A,B,C (computes C - A*B)
+    // --- Create(MSUB);
+    if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) {
+      Opc = AArch64::MSUBWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      Opc = AArch64::MSUBXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+    break;
+  case MachineCombinerPattern::MC_MULSUBWI_OP1:
+  case MachineCombinerPattern::MC_MULSUBXI_OP1: {
+    // MUL I=A,B,0
+    // SUB R,I, Imm
+    // ==> ORR  V, ZR, -Imm
+    // ==> MADD R,A,B,V // = -Imm + A*B
+    // --- Create(MADD);
+    const TargetRegisterClass *OrrRC;
+    unsigned BitSize, OrrOpc, ZeroReg;
+    if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) {
+      OrrOpc = AArch64::ORRWri;
+      OrrRC = &AArch64::GPR32spRegClass;
+      BitSize = 32;
+      ZeroReg = AArch64::WZR;
+      Opc = AArch64::MADDWrrr;
+      RC = &AArch64::GPR32RegClass;
+    } else {
+      OrrOpc = AArch64::ORRXri;
+      OrrRC = &AArch64::GPR64spRegClass;
+      BitSize = 64;
+      ZeroReg = AArch64::XZR;
+      Opc = AArch64::MADDXrrr;
+      RC = &AArch64::GPR64RegClass;
+    }
+    unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+    // Do all arithmetic in unsigned 64-bit: negating and shifting a 32-bit
+    // 'int' here would shift by the full type width (BitSize == 32) and
+    // left-shift a negative value, both of which are undefined behaviour.
+    uint64_t Imm = Root.getOperand(2).getImm();
+    if (Root.getOperand(3).isImm()) {
+      unsigned Val = Root.getOperand(3).getImm();
+      Imm = Imm << Val;
+    }
+    // Two's complement negation, truncated to the low BitSize bits.
+    uint64_t UImm = (0 - Imm) << (64 - BitSize) >> (64 - BitSize);
+    uint64_t Encoding;
+    // The combine is only done when the immediate fits a single ORR.
+    if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+      MachineInstrBuilder MIB1 =
+          BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+              .addReg(ZeroReg)
+              .addImm(Encoding);
+      InsInstrs.push_back(MIB1);
+      InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+      MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+    }
+    break;
+  }
+  } // end switch (Pattern)
+
+  // No alternative sequence was generated: there is nothing to delete, and
+  // in particular no valid MUL pointer to record.
+  if (!MUL)
+    return;
+
+  // Record MUL and ADD/SUB for deletion
+  DelInstrs.push_back(MUL);
+  DelInstrs.push_back(&Root);
+
+  return;
+}
+
+/// \brief Replace a csinc-branch sequence by a simple conditional branch
+///
+/// Examples:
+/// 1.
+///   csinc  w9, wzr, wzr, <condition code>
+///   tbnz   w9, #0, 0x44
+/// to
+///   b.<inverted condition code>
+///
+/// 2.
+///   csinc w9, wzr, wzr, <condition code>
+///   tbz   w9, #0, 0x44
+/// to
+///   b.<condition code>
+///
+/// \param  MI Conditional Branch
+/// \return True when the simple conditional branch is generated
+///
+bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const {
+  const unsigned BranchOpc = MI->getOpcode();
+
+  // Classify the branch: which operand holds the target block, and whether
+  // it is a test-bit branch.
+  unsigned TargetBBInMI = 0;
+  bool IsTestAndBranch = false;
+  switch (BranchOpc) {
+  default:
+    llvm_unreachable("Unknown branch instruction?");
+  case AArch64::Bcc:
+    return false;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    TargetBBInMI = 1;
+    break;
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    TargetBBInMI = 2;
+    IsTestAndBranch = true;
+    break;
+  }
+  // The "branch if non-zero" forms need the inverted condition code.
+  const bool IsNegativeBranch =
+      BranchOpc == AArch64::CBNZW || BranchOpc == AArch64::CBNZX ||
+      BranchOpc == AArch64::TBNZW || BranchOpc == AArch64::TBNZX;
+
+  // So we increment a zero register and test for bits other
+  // than bit 0? Conservatively bail out in case the verifier
+  // missed this case.
+  if (IsTestAndBranch && MI->getOperand(1).getImm())
+    return false;
+
+  // Find Definition.
+  assert(MI->getParent() && "Incomplete machine instruciton\n");
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const unsigned VReg = MI->getOperand(0).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(VReg))
+    return false;
+
+  MachineInstr *DefMI = MRI.getVRegDef(VReg);
+
+  // Look for CSINC with both sources being the matching zero register.
+  unsigned ZeroReg;
+  const unsigned DefOpc = DefMI->getOpcode();
+  if (DefOpc == AArch64::CSINCWr)
+    ZeroReg = AArch64::WZR;
+  else if (DefOpc == AArch64::CSINCXr)
+    ZeroReg = AArch64::XZR;
+  else
+    return false;
+  if (DefMI->getOperand(1).getReg() != ZeroReg ||
+      DefMI->getOperand(2).getReg() != ZeroReg)
+    return false;
+
+  if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+    return false;
+
+  AArch64CC::CondCode CC =
+      (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
+  // Convert only when the condition code is not modified between
+  // the CSINC and the branch. The CC may be used by other
+  // instructions in between.
+  if (modifiesConditionCode(DefMI, MI, /*CheckOnlyCCWrites=*/true,
+                            &getRegisterInfo()))
+    return false;
+
+  if (IsNegativeBranch)
+    CC = AArch64CC::getInvertedCondCode(CC);
+
+  // Emit the Bcc in front of the old branch, then drop the old branch.
+  MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB();
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*MBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
+  MI->eraseFromParent();
+  return true;
+}

diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index b27565e..30bf650 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h

@@ -11,12 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AArch64INSTRINFO_H
-#define LLVM_TARGET_AArch64INSTRINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
 
 #include "AArch64.h"
 #include "AArch64RegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
 
 #define GET_INSTRINFO_HEADER
 #include "AArch64GenInstrInfo.inc"
@@ -51,6 +52,10 @@
   bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
                              unsigned &DstReg, unsigned &SubIdx) const override;
 
+  bool
+  areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb,
+                                  AliasAnalysis *AA = nullptr) const override;
+
   unsigned isLoadFromStackSlot(const MachineInstr *MI,
                                int &FrameIndex) const override;
   unsigned isStoreToStackSlot(const MachineInstr *MI,
@@ -89,6 +94,10 @@
                             unsigned &Offset,
                             const TargetRegisterInfo *TRI) const override;
 
+  /// Extract the base register, offset and access width of \p LdSt.
+  /// For the LDR*ui/STR*ui opcodes handled by the definition, \p BaseReg is
+  /// the register of operand 1, \p Width is the per-opcode access size, and
+  /// \p Offset is the immediate of operand 2 multiplied by that size.
+  /// NOTE(review): the handling of other opcodes is not visible from this
+  /// hunk - confirm against the full definition.
+  bool getLdStBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg,
+                                 int &Offset, int &Width,
+                                 const TargetRegisterInfo *TRI) const;
+
   bool enableClusterLoads() const override { return true; }
 
   bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
@@ -98,8 +107,8 @@
                               MachineInstr *Second) const override;
 
   MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
-                                         uint64_t Offset, const MDNode *MDPtr,
-                                         DebugLoc DL) const;
+                                         uint64_t Offset, const MDNode *Var,
+                                         const MDNode *Expr, DebugLoc DL) const;
   void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         DebugLoc DL, unsigned DestReg, unsigned SrcReg,
                         bool KillSrc, unsigned Opcode,
@@ -119,6 +128,7 @@
                             int FrameIndex, const TargetRegisterClass *RC,
                             const TargetRegisterInfo *TRI) const override;
 
+  using TargetInstrInfo::foldMemoryOperandImpl;
   MachineInstr *
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                         const SmallVectorImpl<unsigned> &Ops,
@@ -155,7 +165,27 @@
   bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
                             unsigned SrcReg2, int CmpMask, int CmpValue,
                             const MachineRegisterInfo *MRI) const override;
+  /// optimizeCondBranch - replace a compare/test-and-branch \p MI whose input
+  /// is a CSINC of the zero register by a plain conditional branch. Returns
+  /// true when the branch was replaced.
+  bool optimizeCondBranch(MachineInstr *MI) const override;
+  /// hasPattern - return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in <Root>. All potential patterns are
+  /// listed in the <Pattern> array.
+  bool hasPattern(MachineInstr &Root,
+                  SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern)
+      const override;
 
+  /// genAlternativeCodeSequence - when hasPattern() finds a pattern
+  /// this function generates the instructions that could replace the
+  /// original code sequence
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+  /// useMachineCombiner - AArch64 supports MachineCombiner
+  bool useMachineCombiner() const override;
+
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 private:
   void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
                              MachineBasicBlock *TBB,

diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 1211fba..252ed40 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td

@@ -24,6 +24,7 @@
                                  AssemblerPredicate<"FeatureCRC", "crc">;
 def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
+def IsCyclone        : Predicate<"Subtarget->isCyclone()">;
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.
@@ -236,6 +237,12 @@
 def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
                                  SDT_AArch64WrapperLarge>;
 
+def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;
+
+def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                    SDTCisSameAs<1, 2>]>;
+def AArch64smull    : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
+def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
 
 //===----------------------------------------------------------------------===//
 
@@ -331,13 +338,23 @@
 def : InstAlias<"sev",  (HINT 0b100)>;
 def : InstAlias<"sevl", (HINT 0b101)>;
 
-  // As far as LLVM is concerned this writes to the system's exclusive monitors.
+// As far as LLVM is concerned this writes to the system's exclusive monitors.
 let mayLoad = 1, mayStore = 1 in
 def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
 
-def DMB   : CRmSystemI<barrier_op, 0b101, "dmb">;
-def DSB   : CRmSystemI<barrier_op, 0b100, "dsb">;
-def ISB   : CRmSystemI<barrier_op, 0b110, "isb">;
+// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
+// model patterns with sufficiently fine granularity.
+let mayLoad = ?, mayStore = ? in {
+def DMB   : CRmSystemI<barrier_op, 0b101, "dmb",
+                       [(int_aarch64_dmb (i32 imm32_0_15:$CRm))]>;
+
+def DSB   : CRmSystemI<barrier_op, 0b100, "dsb",
+                       [(int_aarch64_dsb (i32 imm32_0_15:$CRm))]>;
+
+def ISB   : CRmSystemI<barrier_op, 0b110, "isb",
+                       [(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
+}
+
 def : InstAlias<"clrex", (CLREX 0xf)>;
 def : InstAlias<"isb", (ISB 0xf)>;
 
@@ -1163,6 +1180,9 @@
 defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
 defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
 
+defm : ScalToVecROLoadPat<ro16, load,       i32, v4f16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, load,       i32, v8f16, LDRHroW, LDRHroX, hsub>;
+
 defm : ScalToVecROLoadPat<ro32, load,       i32, v2i32, LDRSroW, LDRSroX, ssub>;
 defm : ScalToVecROLoadPat<ro32, load,       i32, v4i32, LDRSroW, LDRSroX, ssub>;
 
@@ -1203,6 +1223,7 @@
   defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
   defm : VecROLoadPat<ro64, v8i8,  LDRDroW, LDRDroX>;
   defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
 }
 
 defm : VecROLoadPat<ro64, v1i64,  LDRDroW, LDRDroX>;
@@ -1216,6 +1237,7 @@
   defm : VecROLoadPat<ro128, v4i32,  LDRQroW, LDRQroX>;
   defm : VecROLoadPat<ro128, v4f32,  LDRQroW, LDRQroX>;
   defm : VecROLoadPat<ro128, v8i16,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v8f16,  LDRQroW, LDRQroX>;
   defm : VecROLoadPat<ro128, v16i8,  LDRQroW, LDRQroX>;
 }
 } // AddedComplexity = 10
@@ -1345,6 +1367,8 @@
             (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
   def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
             (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
 }
 def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
@@ -1366,6 +1390,8 @@
             (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
   def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
             (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
 }
 def : Pat<(f128  (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
           (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
@@ -1502,6 +1528,8 @@
             (LDURDi GPR64sp:$Rn, simm9:$offset)>;
   def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
             (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
 }
 def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
           (LDURDi GPR64sp:$Rn, simm9:$offset)>;
@@ -1522,6 +1550,8 @@
             (LDURQi GPR64sp:$Rn, simm9:$offset)>;
   def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
             (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
 }
 
 //  anyext -> zext
@@ -1818,6 +1848,7 @@
   defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
   defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
   defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
 }
 
 defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
@@ -1832,6 +1863,7 @@
   defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
   defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
   defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
 }
 } // AddedComplexity = 10
 
@@ -1882,6 +1914,9 @@
   def : Pat<(store (v2i32 FPR64:$Rt),
                    (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
             (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v4f16 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
 }
 def : Pat<(store (v1f64 FPR64:$Rt),
                  (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
@@ -1911,6 +1946,9 @@
   def : Pat<(store (v2i64 FPR128:$Rt),
                    (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
             (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v8f16 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
 }
 def : Pat<(store (f128  FPR128:$Rt),
                  (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
@@ -1973,6 +2011,9 @@
   def : Pat<(store (v2i32 FPR64:$Rt),
                    (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
             (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4f16 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
 }
 def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
           (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
@@ -2003,6 +2044,9 @@
   def : Pat<(store (v2f64 FPR128:$Rt),
                    (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
             (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8f16 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
 }
 
 // unscaled i64 truncating stores
@@ -2079,6 +2123,8 @@
           (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
 def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
           (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
 
 def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
           (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -2092,6 +2138,8 @@
           (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
 def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
           (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
 
 //---
 // (immediate post-indexed)
@@ -2129,6 +2177,8 @@
           (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
 def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
           (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
 
 def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
           (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -2142,6 +2192,8 @@
           (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
 def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
           (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
 
 //===----------------------------------------------------------------------===//
 // Load/store exclusive instructions.
@@ -2234,89 +2286,6 @@
 
 defm FCVT : FPConversion<"fcvt">;
 
-def : Pat<(f32_to_f16 FPR32:$Rn),
-          (i32 (COPY_TO_REGCLASS
-                   (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)),
-                   GPR32))>;
-
-def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn),
-                          [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>;
-
-// When converting from f16 coming directly from a load, make sure we
-// load into the FPR16 registers rather than going through the GPRs.
-//   f16->f32
-def : Pat<(f32 (f16_to_f32 (i32
-                (zextloadi16 (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
-                                    ro_Wextend16:$extend))))),
-          (FCVTSHr (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend))>;
-def : Pat<(f32 (f16_to_f32 (i32
-                (zextloadi16 (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
-                                    ro_Xextend16:$extend))))),
-          (FCVTSHr (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend))>;
-def : Pat <(f32 (f16_to_f32 (i32
-                  (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-           (FCVTSHr (LDRHui GPR64sp:$Rn, uimm12s2:$offset))>;
-def : Pat <(f32 (f16_to_f32 (i32
-                  (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
-           (FCVTSHr (LDURHi GPR64sp:$Rn, simm9:$offset))>;
-
-//   f16->f64
-def : Pat<(f64 (fextend (f32 (f16_to_f32 (i32
-                (zextloadi16 (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
-                                    ro_Wextend16:$extend))))))),
-          (FCVTDHr (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend))>;
-def : Pat<(f64 (fextend (f32 (f16_to_f32 (i32
-                (zextloadi16 (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
-                                    ro_Xextend16:$extend))))))),
-          (FCVTDHr (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend))>;
-def : Pat <(f64 (fextend (f32 (f16_to_f32 (i32
-                  (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))))),
-           (FCVTDHr (LDRHui GPR64sp:$Rn, uimm12s2:$offset))>;
-def : Pat <(f64 (fextend (f32 (f16_to_f32 (i32
-                  (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))))),
-           (FCVTDHr (LDURHi GPR64sp:$Rn, simm9:$offset))>;
-
-// When converting to f16 going directly to a store, make sure we use the
-// appropriate direct conversion instructions and store via the FPR16
-// registers rather than going through the GPRs.
-let AddedComplexity = 10 in {
-// f32->f16
-def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))),
-                          (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
-                                         ro_Wextend16:$extend)),
-           (STRHroW (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, GPR32:$Rm,
-                                         ro_Wextend16:$extend)>;
-def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))),
-                          (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
-                                         ro_Xextend16:$extend)),
-           (STRHroX (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, GPR64:$Rm,
-                                         ro_Xextend16:$extend)>;
-def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))),
-              (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
-           (STRHui (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, uimm12s2:$offset)>;
-def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 FPR32:$Rt))),
-              (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
-           (STURHi (FCVTHSr FPR32:$Rt), GPR64sp:$Rn, simm9:$offset)>;
-// f64->f16
-def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))),
-                          (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
-                                         ro_Wextend16:$extend)),
-           (STRHroW (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, GPR32:$Rm,
-                                         ro_Wextend16:$extend)>;
-def : Pat< (truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))),
-                          (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
-                                         ro_Xextend16:$extend)),
-           (STRHroX (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, GPR64:$Rm,
-                                         ro_Xextend16:$extend)>;
-def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))),
-              (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
-           (STRHui (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, uimm12s2:$offset)>;
-def : Pat <(truncstorei16 (assertzext (i32 (f32_to_f16 (f32 (fround FPR64:$Rt))))),
-              (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
-           (STURHi (FCVTHDr FPR64:$Rt), GPR64sp:$Rn, simm9:$offset)>;
-}
-
-
 //===----------------------------------------------------------------------===//
 // Floating point single operand instructions.
 //===----------------------------------------------------------------------===//
@@ -2457,6 +2426,28 @@
 //===----------------------------------------------------------------------===//
 
 defm ABS    : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
+def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
+               (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
+          (ABSv8i8 V64:$src)>;
+def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))),
+               (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))),
+          (ABSv4i16 V64:$src)>;
+def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))),
+               (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))),
+          (ABSv2i32 V64:$src)>;
+def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))),
+               (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))),
+          (ABSv16i8 V128:$src)>;
+def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))),
+               (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))),
+          (ABSv8i16 V128:$src)>;
+def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))),
+               (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))),
+          (ABSv4i32 V128:$src)>;
+def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))),
+               (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))),
+          (ABSv2i64 V128:$src)>;
+
 defm CLS    : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
 defm CLZ    : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
 defm CMEQ   : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
@@ -2485,6 +2476,11 @@
                                                     (i64 2))))),
           (FCVTLv4i32 V128:$Rn)>;
 
+def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
+                                                    (i64 4))))),
+          (FCVTLv8i16 V128:$Rn)>;
+
 defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
 defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
 defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
@@ -2496,6 +2492,7 @@
                           (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
           (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
 def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
 def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
           (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
 defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
@@ -2578,6 +2575,10 @@
 defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
 defm XTN    : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
 
+def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
+def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
+def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
+def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
 def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
 def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
 
@@ -3174,6 +3175,46 @@
 defm USUBW   : SIMDWideThreeVectorBHS<   1, 0b0011, "usubw",
                  BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
 
+// Additional patterns for SMULL and UMULL
+multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
+  Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+  def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (INST8B V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (INST4H V64:$Rn, V64:$Rm)>;
+  def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (INST2S V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
+  SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
+defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
+  UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+
+// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
+multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
+  Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+  def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+            (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+            (INST2S  V128:$Rd, V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+  SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+  UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+  SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+  TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+  UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
+
 // Patterns for 64-bit pmull
 def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
           (PMULLv1i64 V64:$Rn, V64:$Rm)>;
@@ -3256,6 +3297,10 @@
           (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
 def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
           (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
 
 // We use EXT to handle extract_subvector to copy the upper 64-bits of a
 // 128-bit vector.
@@ -3267,6 +3312,8 @@
           (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
 def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
           (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
 def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
           (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
 def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
@@ -3379,6 +3426,19 @@
           (v2f64 (DUPv2i64lane
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
             (i64 0)))>;
+def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
+          (v4f16 (DUPv4i16lane
+            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+            (i64 0)))>;
+def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
+          (v8f16 (DUPv8i16lane
+            (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+            (i64 0)))>;
+
+def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+          (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+          (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
 
 def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
           (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
@@ -3500,6 +3560,23 @@
 def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
           (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
 
+def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
+            (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi16lane
+              (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+              VectorIndexS:$imm,
+              (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+              (i64 0)),
+            dsub)>;
+
+def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
+            (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
+          (INSvi16lane
+            V128:$Rn, VectorIndexH:$imm,
+            (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+            (i64 0))>;
+
 def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
             (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
           (EXTRACT_SUBREG
@@ -3580,6 +3657,7 @@
                 dsub)>;
 }
 
+defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
 defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
 defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
 defm : Neon_INS_elt_pattern<v16i8, v8i8,  i32, INSvi8lane>;
@@ -3595,6 +3673,8 @@
           (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
 def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
           (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
+          (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
 def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
           (f64 (EXTRACT_SUBREG
             (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
@@ -3605,6 +3685,11 @@
             (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
                          V128:$Rn, VectorIndexS:$idx),
             ssub))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
+          (f16 (EXTRACT_SUBREG
+            (INSvi16lane (v8f16 (IMPLICIT_DEF)), 0,
+                         V128:$Rn, VectorIndexH:$idx),
+            hsub))>;
 
 // All concat_vectors operations are canonicalised to act on i64 vectors for
 // AArch64. In the general case we need an instruction, which had just as well be
@@ -3619,6 +3704,7 @@
 def : ConcatPat<v4i32, v2i32>;
 def : ConcatPat<v4f32, v2f32>;
 def : ConcatPat<v8i16, v4i16>;
+def : ConcatPat<v8f16, v4f16>;
 def : ConcatPat<v16i8, v8i8>;
 
 // If the high lanes are undef, though, we can just ignore them:
@@ -4459,7 +4545,7 @@
                                     0),
                                   dsub)),
                                0),
-                             ssub)))>, Requires<[NotForCodeSize]>;
+                             ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
 
 def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
                           (LDRBroW  GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -4512,8 +4598,8 @@
                                      0),
                                    dsub)),
                                0),
-                             dsub)))>, Requires<[NotForCodeSize]>;
+                             dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
 
 def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
                            (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
 def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
@@ -4636,6 +4722,10 @@
           (LD1Rv2d GPR64sp:$Rn)>;
 def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
           (LD1Rv1d GPR64sp:$Rn)>;
+def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+          (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+          (LD1Rv8h GPR64sp:$Rn)>;
 
 class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
                     ValueType VTy, ValueType STy, Instruction LD1>
@@ -4649,6 +4739,7 @@
 def : Ld1Lane128Pat<load,       VectorIndexS, v4f32, f32, LD1i32>;
 def : Ld1Lane128Pat<load,       VectorIndexD, v2i64, i64, LD1i64>;
 def : Ld1Lane128Pat<load,       VectorIndexD, v2f64, f64, LD1i64>;
+def : Ld1Lane128Pat<load,       VectorIndexH, v8f16, f16, LD1i16>;
 
 class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
                    ValueType VTy, ValueType STy, Instruction LD1>
@@ -4663,6 +4754,7 @@
 def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
 def : Ld1Lane64Pat<load,       VectorIndexS, v2i32, i32, LD1i32>;
 def : Ld1Lane64Pat<load,       VectorIndexS, v2f32, f32, LD1i32>;
+def : Ld1Lane64Pat<load,       VectorIndexH, v4f16, f16, LD1i16>;
 
 
 defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
@@ -4690,6 +4782,7 @@
 def : St1Lane128Pat<store,         VectorIndexS, v4f32, f32, ST1i32>;
 def : St1Lane128Pat<store,         VectorIndexD, v2i64, i64, ST1i64>;
 def : St1Lane128Pat<store,         VectorIndexD, v2f64, f64, ST1i64>;
+def : St1Lane128Pat<store,         VectorIndexH, v8f16, f16, ST1i16>;
 
 let AddedComplexity = 15 in
 class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
@@ -4704,6 +4797,7 @@
 def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
 def : St1Lane64Pat<store,         VectorIndexS, v2i32, i32, ST1i32>;
 def : St1Lane64Pat<store,         VectorIndexS, v2f32, f32, ST1i32>;
+def : St1Lane64Pat<store,         VectorIndexH, v4f16, f16, ST1i16>;
 
 multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
                              ValueType VTy, ValueType STy, Instruction ST1,
@@ -4728,6 +4822,7 @@
 defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
 defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
 defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
+defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;
 
 multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
                              ValueType VTy, ValueType STy, Instruction ST1,
@@ -4751,6 +4846,7 @@
 defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
 defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
 defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
+defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
 
 let mayStore = 1, neverHasSideEffects = 1 in {
 defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   GPR64pi2>;
@@ -4929,10 +5025,77 @@
 //   b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
 //
 
+// Natural vector casts (64 bit)
+def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+// Natural vector casts (128 bit)
+def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
 let Predicates = [IsLE] in {
 def : Pat<(v8i8  (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
 
 def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
@@ -4941,6 +5104,8 @@
           (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
 def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
           (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
 def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
           (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
 def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
@@ -4953,6 +5118,8 @@
                  (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
 def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
                  (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
+                 (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
 def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
                  (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
 
@@ -4962,6 +5129,8 @@
           (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
 def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
           (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+          (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
 def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
           (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
 }
@@ -4990,6 +5159,7 @@
 def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
 def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
@@ -4999,6 +5169,8 @@
                              (v1i64 (REV64v4i16 FPR64:$src))>;
 def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))),
                              (v1i64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
+                             (v1i64 (REV64v4i16 FPR64:$src))>;
 def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
                              (v1i64 (REV64v2i32 FPR64:$src))>;
 }
@@ -5011,6 +5183,7 @@
 def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))), (v2i32 FPR64:$src)>;
 def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
 def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
@@ -5023,6 +5196,8 @@
                              (v2i32 (REV64v2i32 FPR64:$src))>;
 def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
                              (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
+                             (v2i32 (REV32v4i16 FPR64:$src))>;
 }
 def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
 
@@ -5031,6 +5206,7 @@
 def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
 }
@@ -5043,6 +5219,8 @@
                              (v4i16 (REV16v8i8 FPR64:$src))>;
 def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))),
                              (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))),
+                             (v4i16 FPR64:$src)>;
 def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
                              (v4i16 (REV32v4i16 FPR64:$src))>;
 def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
@@ -5050,12 +5228,41 @@
 }
 
 let Predicates = [IsLE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v8i8  FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (f64   FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+}
+// Big-endian: when element sizes differ, the 16-bit lanes are reversed within
+// the wider element (REV16 on bytes for v8i8); v4i16 has equal lanes: no REV.
+let Predicates = [IsBE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
+                             (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
+                             (v4f16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))),
+                             (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v8i8  FPR64:$src))),
+                             (v4f16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (f64   FPR64:$src))),
+                             (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
+                             (v4f16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
+                             (v4f16 (REV64v4i16 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
 def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))), (v8i8  FPR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v2i32 FPR64:$src))), (v8i8  FPR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v4i16 FPR64:$src))), (v8i8  FPR64:$src)>;
 def : Pat<(v8i8  (bitconvert (f64   FPR64:$src))), (v8i8  FPR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v2f32 FPR64:$src))), (v8i8  FPR64:$src)>;
 def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4f16 FPR64:$src))), (v8i8  FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))),
@@ -5070,6 +5277,8 @@
                              (v8i8 (REV32v8i8 FPR64:$src))>;
 def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))),
                              (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v4f16 FPR64:$src))),
+                             (v8i8 (REV16v8i8 FPR64:$src))>;
 }
 
 let Predicates = [IsLE] in {
@@ -5077,6 +5286,7 @@
 def : Pat<(f64   (bitconvert (v4i16 FPR64:$src))), (f64   FPR64:$src)>;
 def : Pat<(f64   (bitconvert (v2f32 FPR64:$src))), (f64   FPR64:$src)>;
 def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v4f16 FPR64:$src))), (f64   FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(f64   (bitconvert (v2i32 FPR64:$src))),
@@ -5087,6 +5297,8 @@
                              (f64 (REV64v2i32 FPR64:$src))>;
 def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))),
                              (f64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v4f16 FPR64:$src))),
+                             (f64 (REV64v4i16 FPR64:$src))>;
 }
 def : Pat<(f64   (bitconvert (v1i64 FPR64:$src))), (f64   FPR64:$src)>;
 def : Pat<(f64   (bitconvert (v1f64 FPR64:$src))), (f64   FPR64:$src)>;
@@ -5096,6 +5308,7 @@
 def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
 def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))), (v1f64 FPR64:$src)>;
 def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
@@ -5106,6 +5319,8 @@
                              (v1f64 (REV64v8i8 FPR64:$src))>;
 def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
                              (v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
+                             (v1f64 (REV64v4i16 FPR64:$src))>;
 }
 def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
 def : Pat<(v1f64 (bitconvert (f64   FPR64:$src))), (v1f64 FPR64:$src)>;
@@ -5116,6 +5331,7 @@
 def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))), (v2f32 FPR64:$src)>;
 def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
 def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
@@ -5128,6 +5344,8 @@
                              (v2f32 (REV64v2i32 FPR64:$src))>;
 def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))),
                              (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), // BE: 16- to 32-bit lanes
+                             (v2f32 (REV32v4i16 FPR64:$src))>;
 }
 def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
 
@@ -5137,6 +5355,7 @@
 def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
 def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
 def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
 def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
@@ -5148,6 +5367,9 @@
 def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
                             (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
                                             (REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                            (REV64v8i16 FPR128:$src), (i32 8)))>;
 def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
                             (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
 def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
@@ -5162,6 +5384,7 @@
 def : Pat<(v2f64 (bitconvert (f128  FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
 def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
 }
@@ -5173,6 +5396,8 @@
                              (v2f64 (REV64v4i32 FPR128:$src))>;
 def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
                              (v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
+                             (v2f64 (REV64v8i16 FPR128:$src))>;
 def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
                              (v2f64 (REV64v16i8 FPR128:$src))>;
 def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
@@ -5183,6 +5408,7 @@
 let Predicates = [IsLE] in {
 def : Pat<(v4f32 (bitconvert (f128  FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -5193,6 +5419,8 @@
                                     (REV64v4i32 FPR128:$src), (i32 8)))>;
 def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
                              (v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
+                             (v4f32 (REV32v8i16 FPR128:$src))>;
 def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
                              (v4f32 (REV32v16i8 FPR128:$src))>;
 def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
@@ -5208,6 +5436,7 @@
 def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v2i64 (bitconvert (f128  FPR128:$src))),
@@ -5221,6 +5450,8 @@
                              (v2i64 (REV64v16i8 FPR128:$src))>;
 def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
                              (v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
+                             (v2i64 (REV64v8i16 FPR128:$src))>;
 }
 def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
 
@@ -5230,6 +5461,7 @@
 def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v4i32 (bitconvert (f128  FPR128:$src))),
@@ -5244,6 +5476,8 @@
                              (v4i32 (REV32v16i8 FPR128:$src))>;
 def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
                              (v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
+                             (v4i32 (REV32v8i16 FPR128:$src))>;
 }
 def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
 
@@ -5254,6 +5488,7 @@
 def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v8i16 (bitconvert (f128  FPR128:$src))),
@@ -5270,6 +5505,36 @@
                              (v8i16 (REV64v8i16 FPR128:$src))>;
 def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
                              (v8i16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))),
+                             (v8i16 FPR128:$src)>; // same 16-bit lanes: no REV
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v8f16 (bitconvert (f128  FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+} // IsLE: every bitconvert to v8f16 reuses the source register unchanged
+let Predicates = [IsBE] in { // IsBE: REV at the wider of the two lane sizes
+def : Pat<(v8f16 (bitconvert (f128  FPR128:$src))),
+                             (v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                              (REV64v8i16 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
+                             (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
+                             (v8f16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))),
+                             (v8f16 FPR128:$src)>; // same 16-bit lanes: no REV
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
+                             (v8f16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
+                             (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
+                             (v8f16 (REV32v8i16 FPR128:$src))>;
 }
 
 let Predicates = [IsLE] in {
@@ -5279,6 +5544,7 @@
 def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
 }
 let Predicates = [IsBE] in {
 def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))),
@@ -5295,6 +5561,8 @@
                              (v16i8 (REV64v16i8 FPR128:$src))>;
 def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
                              (v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
+                             (v16i8 (REV16v16i8 FPR128:$src))>;
 }
 
 def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
@@ -5318,6 +5586,8 @@
           (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
 def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
 def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
 

diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 3df9c4f..8157981 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

@@ -13,20 +13,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-ldst-opt"
@@ -108,7 +109,7 @@
   int getMemSize(MachineInstr *MemMI);
 };
 char AArch64LoadStoreOpt::ID = 0;
-}
+} // namespace
 
 static bool isUnscaledLdst(unsigned Opc) {
   switch (Opc) {
@@ -931,8 +932,9 @@
 
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   const TargetMachine &TM = Fn.getTarget();
-  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
-  TRI = TM.getRegisterInfo();
+  TII = static_cast<const AArch64InstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
+  TRI = TM.getSubtargetImpl()->getRegisterInfo();
 
   bool Modified = false;
   for (auto &MBB : Fn)

diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 75a17b9..e57b0f4 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp

@@ -25,8 +25,7 @@
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
-AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang,
-                                       AsmPrinter &printer)
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer)
     : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
 
 MCSymbol *

diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h
index ba50ba9..1e29b80 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.h
+++ b/lib/Target/AArch64/AArch64MCInstLower.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64_MCINSTLOWER_H
-#define AArch64_MCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
 
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/Compiler.h"
@@ -33,7 +33,7 @@
   Triple TargetTriple;
 
 public:
-  AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer);
+  AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer);
 
   bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
   void Lower(const MachineInstr *MI, MCInst &OutMI) const;

diff --git a/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/lib/Target/AArch64/AArch64MachineCombinerPattern.h
new file mode 100644
index 0000000..4164b33
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MachineCombinerPattern.h

@@ -0,0 +1,42 @@
+//===- AArch64MachineCombinerPattern.h                                    -===//
+//===- AArch64 instruction pattern supported by combiner                  -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the instruction patterns supported by the machine combiner.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
+
+namespace llvm {
+
+/// Enumeration of the instruction patterns supported by the machine combiner.
+/// Enumerators have explicit, consecutive int values starting at MC_NONE = 0.
+/// NOTE(review): names read like MUL+ADD/SUB fusions (W/X, OP1/OP2) - confirm.
+namespace MachineCombinerPattern {
+enum MC_PATTERN : int {
+  MC_NONE = 0,
+  MC_MULADDW_OP1 = 1,
+  MC_MULADDW_OP2 = 2,
+  MC_MULSUBW_OP1 = 3,
+  MC_MULSUBW_OP2 = 4,
+  MC_MULADDWI_OP1 = 5,
+  MC_MULSUBWI_OP1 = 6,
+  MC_MULADDX_OP1 = 7,
+  MC_MULADDX_OP2 = 8,
+  MC_MULSUBX_OP1 = 9,
+  MC_MULSUBX_OP2 = 10,
+  MC_MULADDXI_OP1 = 11,
+  MC_MULSUBXI_OP1 = 12
+};
+} // end namespace MachineCombinerPattern
+} // end namespace llvm
+
+#endif

diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 7c257ba..536a8d0 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64MACHINEFUNCTIONINFO_H
-#define AArch64MACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
 
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -160,4 +160,4 @@
 };
 } // End llvm namespace
 
-#endif // AArch64MACHINEFUNCTIONINFO_H
+#endif

diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
new file mode 100644
index 0000000..f942c4e
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp

@@ -0,0 +1,383 @@
+//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains the AArch64 / Cortex-A57 specific register allocation
+// constraints for use by the PBQP register allocator.
+//
+// It is essentially a transcription of what is contained in
+// AArch64A57FPLoadBalancing, which tries to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply-accumulates.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64-pbqp"
+
+#include "AArch64.h"
+#include "AArch64PBQPRegAlloc.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocPBQP.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+#ifndef NDEBUG // isFPReg is only referenced from the assert()s further down
+bool isFPReg(unsigned reg) { // is reg in FPR32, FPR64 or FPR128?
+  return AArch64::FPR32RegClass.contains(reg) ||
+         AArch64::FPR64RegClass.contains(reg) ||
+         AArch64::FPR128RegClass.contains(reg);
+}
+#endif
+
+bool isOdd(unsigned reg) { // odd-numbered S/D/Q register? Others: unreachable
+  switch (reg) {
+  default:
+    llvm_unreachable("Register is not from the expected class !");
+  case AArch64::S1:
+  case AArch64::S3:
+  case AArch64::S5:
+  case AArch64::S7:
+  case AArch64::S9:
+  case AArch64::S11:
+  case AArch64::S13:
+  case AArch64::S15:
+  case AArch64::S17:
+  case AArch64::S19:
+  case AArch64::S21:
+  case AArch64::S23:
+  case AArch64::S25:
+  case AArch64::S27:
+  case AArch64::S29:
+  case AArch64::S31:
+  case AArch64::D1:
+  case AArch64::D3:
+  case AArch64::D5:
+  case AArch64::D7:
+  case AArch64::D9:
+  case AArch64::D11:
+  case AArch64::D13:
+  case AArch64::D15:
+  case AArch64::D17:
+  case AArch64::D19:
+  case AArch64::D21:
+  case AArch64::D23:
+  case AArch64::D25:
+  case AArch64::D27:
+  case AArch64::D29:
+  case AArch64::D31:
+  case AArch64::Q1:
+  case AArch64::Q3:
+  case AArch64::Q5:
+  case AArch64::Q7:
+  case AArch64::Q9:
+  case AArch64::Q11:
+  case AArch64::Q13:
+  case AArch64::Q15:
+  case AArch64::Q17:
+  case AArch64::Q19:
+  case AArch64::Q21:
+  case AArch64::Q23:
+  case AArch64::Q25:
+  case AArch64::Q27:
+  case AArch64::Q29:
+  case AArch64::Q31:
+    return true; // S/D/Q register with an odd number
+  case AArch64::S0:
+  case AArch64::S2:
+  case AArch64::S4:
+  case AArch64::S6:
+  case AArch64::S8:
+  case AArch64::S10:
+  case AArch64::S12:
+  case AArch64::S14:
+  case AArch64::S16:
+  case AArch64::S18:
+  case AArch64::S20:
+  case AArch64::S22:
+  case AArch64::S24:
+  case AArch64::S26:
+  case AArch64::S28:
+  case AArch64::S30:
+  case AArch64::D0:
+  case AArch64::D2:
+  case AArch64::D4:
+  case AArch64::D6:
+  case AArch64::D8:
+  case AArch64::D10:
+  case AArch64::D12:
+  case AArch64::D14:
+  case AArch64::D16:
+  case AArch64::D18:
+  case AArch64::D20:
+  case AArch64::D22:
+  case AArch64::D24:
+  case AArch64::D26:
+  case AArch64::D28:
+  case AArch64::D30:
+  case AArch64::Q0:
+  case AArch64::Q2:
+  case AArch64::Q4:
+  case AArch64::Q6:
+  case AArch64::Q8:
+  case AArch64::Q10:
+  case AArch64::Q12:
+  case AArch64::Q14:
+  case AArch64::Q16:
+  case AArch64::Q18:
+  case AArch64::Q20:
+  case AArch64::Q22:
+  case AArch64::Q24:
+  case AArch64::Q26:
+  case AArch64::Q28:
+  case AArch64::Q30:
+    return false; // S/D/Q register with an even number
+
+  }
+}
+
+bool haveSameParity(unsigned reg1, unsigned reg2) { // both odd or both even?
+  assert(isFPReg(reg1) && "Expecting an FP register for reg1");
+  assert(isFPReg(reg2) && "Expecting an FP register for reg2");
+
+  return isOdd(reg1) == isOdd(reg2);
+}
+
+} // end anonymous namespace
+
+bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
+                                                 unsigned Ra) {
+  if (Rd == Ra) // identical registers: no constraint is added
+    return false;
+
+  LiveIntervals &LIs = G.getMetadata().LIS;
+  // Only virtual registers are looked up as graph nodes below; skip physregs.
+  if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) {
+    DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
+          << '\n');
+    DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
+          << '\n');
+    return false;
+  }
+
+  PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd);
+  PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(Ra);
+
+  const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+    &G.getNodeMetadata(node1).getAllowedRegs();
+  const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRaAllowed =
+    &G.getNodeMetadata(node2).getAllowedRegs();
+
+  PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2);
+
+  // The edge does not exist. Create one with the appropriate interference
+  // costs.
+  if (edge == G.invalidEdgeId()) {
+    const LiveInterval &ld = LIs.getInterval(Rd);
+    const LiveInterval &la = LIs.getInterval(Ra);
+    bool livesOverlap = ld.overlaps(la);
+    // Row 0 and column 0 keep the initial 0; registers start at index 1.
+    PBQPRAGraph::RawMatrix costs(vRdAllowed->size() + 1,
+                                 vRaAllowed->size() + 1, 0);
+    for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+      unsigned pRd = (*vRdAllowed)[i];
+      for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+        unsigned pRa = (*vRaAllowed)[j];
+        if (livesOverlap && TRI->regsOverlap(pRd, pRa))
+          costs[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+        else
+          costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0;
+      }
+    }
+    G.addEdge(node1, node2, std::move(costs));
+    return true;
+  }
+  // An edge exists: line the register lists up with the edge's own node order.
+  if (G.getEdgeNode1Id(edge) == node2) {
+    std::swap(node1, node2);
+    std::swap(vRdAllowed, vRaAllowed);
+  }
+
+  // Per row: lift different-parity costs to at least the same-parity maximum.
+  PBQPRAGraph::RawMatrix costs(G.getEdgeCosts(edge));
+  for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+    unsigned pRd = (*vRdAllowed)[i];
+    // NOTE(review): if PBQPNum is a float type, min() is > 0, not the lowest.
+    // Get the maximum cost (excluding unallocatable reg) for same parity
+    // registers
+    PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+    for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+      unsigned pRa = (*vRaAllowed)[j];
+      if (haveSameParity(pRd, pRa))
+        if (costs[i + 1][j + 1] !=
+                std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+            costs[i + 1][j + 1] > sameParityMax)
+          sameParityMax = costs[i + 1][j + 1];
+    }
+
+    // Ensure all registers with a different parity have a higher cost
+    // than sameParityMax
+    for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+      unsigned pRa = (*vRaAllowed)[j];
+      if (!haveSameParity(pRd, pRa))
+        if (sameParityMax > costs[i + 1][j + 1])
+          costs[i + 1][j + 1] = sameParityMax + 1.0;
+    }
+  }
+  G.setEdgeCosts(edge, std::move(costs));
+
+  return true;
+}
+
+void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
+                                                 unsigned Ra) {
+  LiveIntervals &LIs = G.getMetadata().LIS;
+  // Chain management: when the accumulator Ra already heads a chain, that chain
+  // moves to Rd; otherwise a new chain is started at Rd.
+  if (Chains.count(Ra)) {
+    if (Rd != Ra) {
+      DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to "
+                   << PrintReg(Rd, TRI) << '\n';);
+      Chains.remove(Ra);
+      Chains.insert(Rd);
+    }
+  } else {
+    DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI)
+                 << '\n';);
+    Chains.insert(Rd);
+  }
+
+  const PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd);
+
+  const LiveInterval &ld = LIs.getInterval(Rd);
+  for (auto r : Chains) {
+    // Skip self
+    if (r == Rd)
+      continue;
+
+    const LiveInterval &lr = LIs.getInterval(r);
+    if (ld.overlaps(lr)) {
+      const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+        &G.getNodeMetadata(node1).getAllowedRegs();
+
+      PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(r);
+      const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRrAllowed =
+        &G.getNodeMetadata(node2).getAllowedRegs();
+
+      PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2);
+      assert(edge != G.invalidEdgeId() &&
+             "PBQP error ! The edge should exist !");
+
+      DEBUG(dbgs() << "Refining constraint !\n";);
+
+      // Match the register lists to the edge's node order. Swap the lists only:
+      // node1 must keep naming Rd's node for the remaining chains in this loop.
+      if (G.getEdgeNode1Id(edge) == node2)
+        std::swap(vRdAllowed, vRrAllowed);
+
+      // Enforce that cost is higher with all other Chains of the same parity
+      PBQP::Matrix costs(G.getEdgeCosts(edge));
+      for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+        unsigned pRd = (*vRdAllowed)[i];
+
+        // Get the maximum cost (excluding unallocatable reg) for all other
+        // parity registers
+        PBQP::PBQPNum diffParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+        for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+          unsigned pRa = (*vRrAllowed)[j];
+          if (!haveSameParity(pRd, pRa))
+            if (costs[i + 1][j + 1] !=
+                    std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+                costs[i + 1][j + 1] > diffParityMax)
+              diffParityMax = costs[i + 1][j + 1];
+        }
+
+        // Ensure all registers with same parity have a higher cost
+        // than diffParityMax
+        for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+          unsigned pRa = (*vRrAllowed)[j];
+          if (haveSameParity(pRd, pRa))
+            if (diffParityMax > costs[i + 1][j + 1])
+              costs[i + 1][j + 1] = diffParityMax + 1.0;
+        }
+      }
+      G.setEdgeCosts(edge, std::move(costs));
+    }
+  }
+}
+
+// Returns whether reg's live interval reports expiredAt() MI's slot index.
+static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg,
+                                const MachineInstr &MI) {
+  const LiveInterval &LI = LIs.getInterval(reg); // by reference, no copy
+  return LI.expiredAt(LIs.getInstructionIndex(&MI));
+}
+
+void A57ChainingConstraint::apply(PBQPRAGraph &G) {
+  const MachineFunction &MF = G.getMetadata().MF;
+  LiveIntervals &LIs = G.getMetadata().LIS;
+
+  TRI = MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+  DEBUG(MF.dump());
+  // Scan each block for FP multiply-accumulates and constrain their registers.
+  for (const auto &MBB: MF) {
+    Chains.clear(); // FIXME: really needed ? Could not work at MF level ?
+
+    for (const auto &MI: MBB) {
+      // Forget Chains which have expired. Collect first and erase afterwards:
+      // removing from Chains inside the range-for would invalidate its iterators.
+      SmallVector<unsigned, 8> toDel;
+      for (auto r : Chains) {
+        if (regJustKilledBefore(LIs, r, MI)) {
+          DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at ";
+                MI.print(dbgs()););
+          toDel.push_back(r);
+        }
+      }
+
+      while (!toDel.empty()) {
+        Chains.remove(toDel.back());
+        toDel.pop_back();
+      }
+
+      switch (MI.getOpcode()) {
+      case AArch64::FMSUBSrrr:
+      case AArch64::FMADDSrrr:
+      case AArch64::FNMSUBSrrr:
+      case AArch64::FNMADDSrrr:
+      case AArch64::FMSUBDrrr:
+      case AArch64::FMADDDrrr:
+      case AArch64::FNMSUBDrrr:
+      case AArch64::FNMADDDrrr: {
+        unsigned Rd = MI.getOperand(0).getReg();
+        unsigned Ra = MI.getOperand(3).getReg();
+        // Inter-chain costs are refined only if an intra-chain one was added.
+        if (addIntraChainConstraint(G, Rd, Ra))
+          addInterChainConstraint(G, Rd, Ra);
+        break;
+      }
+
+      case AArch64::FMLAv2f32:
+      case AArch64::FMLSv2f32: {
+        unsigned Rd = MI.getOperand(0).getReg();
+        addInterChainConstraint(G, Rd, Rd); // Rd is passed as both Rd and Ra
+        break;
+      }
+
+      default:
+        break;
+      }
+    }
+  }
+}

diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/lib/Target/AArch64/AArch64PBQPRegAlloc.h
new file mode 100644
index 0000000..4f656f9
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.h

@@ -0,0 +1,38 @@
+//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
+
+namespace llvm {
+
+/// Add the accumulator chaining constraint to a PBQP graph
+class A57ChainingConstraint : public PBQPRAConstraint {
+public:
+  // Add A57 specific constraints to the PBQP graph.
+  void apply(PBQPRAGraph &G) override;
+
+private:
+  SmallSetVector<unsigned, 32> Chains; // registers heading the tracked chains
+  const TargetRegisterInfo *TRI; // no constructor sets it; apply() assigns it
+
+  // Add the accumulator chaining constraint, inside the chain, i.e. so that
+  // parity(Rd) == parity(Ra).
+  // \return true if a constraint was added
+  bool addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+
+  // Add constraints between existing chains
+  void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H

diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h
index b22fa24..9e9eec4 100644
--- a/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/lib/Target/AArch64/AArch64PerfectShuffle.h

@@ -12,6 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+
 // 31 entries have cost 0
 // 242 entries have cost 1
 // 1447 entries have cost 2
@@ -6584,3 +6587,5 @@
   835584U, // <u,u,u,u>: Cost 0 copy LHS
   0
 };
+
+#endif

diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 4723cc4..16c33b7 100644
--- a/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp

@@ -21,18 +21,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -569,7 +569,7 @@
         // global. Do not promote constant expressions either, as they may
         // require some code expansion.
         if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
-            AlreadyChecked.insert(Cst))
+            AlreadyChecked.insert(Cst).second)
           LocalChange |= promoteConstant(Cst);
       }
     }

diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 01b9587..d734d43 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp

@@ -76,7 +76,7 @@
 
 BitVector
 AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   // FIXME: avoid re-calculating this every time.
   BitVector Reserved(getNumRegs());
@@ -105,7 +105,7 @@
 
 bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
                                       unsigned Reg) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   switch (Reg) {
   default:
@@ -169,7 +169,7 @@
 
 unsigned
 AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
 }
@@ -236,7 +236,7 @@
   // Note that the incoming offset is based on the SP value at function entry,
   // so it'll be negative.
   MachineFunction &MF = *MI->getParent()->getParent();
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   MachineFrameInfo *MFI = MF.getFrameInfo();
 
   // Estimate an offset from the frame pointer.
@@ -326,7 +326,7 @@
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
   const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>(
-      MF.getTarget().getFrameLowering());
+      MF.getSubtarget().getFrameLowering());
 
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
   unsigned FrameReg;
@@ -364,7 +364,7 @@
 
 unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                                   MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   switch (RC->getID()) {
   default:

diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 76af1ed..51a5034 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AArch64REGISTERINFO_H
-#define LLVM_TARGET_AArch64REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
 
 #define GET_REGINFO_HEADER
 #include "AArch64GenRegisterInfo.inc"
@@ -98,4 +98,4 @@
 
 } // end namespace llvm
 
-#endif // LLVM_TARGET_AArch64REGISTERINFO_H
+#endif

diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index a30e4ad..d5ff3f1 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td

@@ -390,13 +390,14 @@
 }
 def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
 def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
-                                    v1i64],
+                                    v1i64, v4f16],
                                     64, (sequence "D%u", 0, 31)>;
 // We don't (yet) have an f128 legal type, so don't use that here. We
 // normalize 128-bit vectors to v2f64 for arg passing and such, so use
 // that here.
 def FPR128 : RegisterClass<"AArch64",
-                           [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
+                           [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
+                            v8f16],
                            128, (sequence "Q%u", 0, 31)>;
 
 // The lower 16 vector registers.  Some instructions can only take registers

diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index 8209f96..3ec4157 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td

@@ -12,11 +12,24 @@
 //
 //===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
+// The Cortex-A57 is a traditional superscalar microprocessor with a
+// conservative 3-wide in-order stage for decode and dispatch. Combined with the
+// much wider out-of-order issue stage, this produced a need to carefully
+// schedule micro-ops so that all three decoded each cycle are successfully
+// issued as the reservation station(s) simply don't stay occupied for long.
+// Therefore, IssueWidth is set to the narrower of the two at three, while still
+// modeling the machine as out-of-order.
+
 def CortexA57Model : SchedMachineModel {
-  let IssueWidth        =   8; // 3-way decode and 8-way issue
+  let IssueWidth        =   3; // 3-way decode and dispatch
   let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
   let LoadLatency       =   4; // Optimistic load latency
   let MispredictPenalty =  14; // Fetch + Decode/Rename/Dispatch + Branch
+
+  // Enable partial & runtime unrolling. The magic number is chosen based on
+  // experiments and benchmarking data.
+  let LoopMicroOpBufferSize = 16;
 }
 
 //===----------------------------------------------------------------------===//
@@ -24,18 +37,17 @@
 // Cortex A-57 has 8 pipelines that each has its own 8-entry queue where
 // micro-ops wait for their operands and then issue out-of-order.
 
-def A57UnitB : ProcResource<1> { let BufferSize = 8; }  // Type B micro-ops
-def A57UnitI : ProcResource<2> { let BufferSize = 8; }  // Type I micro-ops
-def A57UnitM : ProcResource<1> { let BufferSize = 8; }  // Type M micro-ops
-def A57UnitL : ProcResource<1> { let BufferSize = 8; }  // Type L micro-ops
-def A57UnitS : ProcResource<1> { let BufferSize = 8; }  // Type S micro-ops
-def A57UnitX : ProcResource<1> { let BufferSize = 8; }  // Type X micro-ops
-def A57UnitW : ProcResource<1> { let BufferSize = 8; }  // Type W micro-ops
+def A57UnitB : ProcResource<1>;  // Type B micro-ops
+def A57UnitI : ProcResource<2>;  // Type I micro-ops
+def A57UnitM : ProcResource<1>;  // Type M micro-ops
+def A57UnitL : ProcResource<1>;  // Type L micro-ops
+def A57UnitS : ProcResource<1>;  // Type S micro-ops
+def A57UnitX : ProcResource<1>;  // Type X micro-ops
+def A57UnitW : ProcResource<1>;  // Type W micro-ops
 let SchedModel = CortexA57Model in {
   def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>;    // Type V micro-ops
 }
 
-
 let SchedModel = CortexA57Model in {
 
 //===----------------------------------------------------------------------===//
@@ -71,7 +83,7 @@
 def : SchedAlias<WriteF,     A57Write_3cyc_1V>;
 def : SchedAlias<WriteFCmp,  A57Write_3cyc_1V>;
 def : SchedAlias<WriteFCvt,  A57Write_5cyc_1V>;
-def : SchedAlias<WriteFCopy, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>;
 def : SchedAlias<WriteFImm,  A57Write_3cyc_1V>;
 def : SchedAlias<WriteFMul,  A57Write_5cyc_1V>;
 def : SchedAlias<WriteFDiv,  A57Write_18cyc_1X>;
@@ -85,13 +97,12 @@
 
 def : WriteRes<WriteLDHi,    []> { let Latency = 4; }
 
-// Forwarding logic is not [yet] explicitly modeled beyond what is captured
-// in the latencies of the A57 Generic SchedWriteRes's.
+// Forwarding logic is only modeled for multiply and accumulate
 def : ReadAdvance<ReadI,       0>;
 def : ReadAdvance<ReadISReg,   0>;
 def : ReadAdvance<ReadIEReg,   0>;
 def : ReadAdvance<ReadIM,      0>;
-def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadIMA,     2, [WriteIM32, WriteIM64]>;
 def : ReadAdvance<ReadID,      0>;
 def : ReadAdvance<ReadExtrHi,  0>;
 def : ReadAdvance<ReadAdrBase, 0>;
@@ -134,7 +145,13 @@
 // Cryptography Extensions
 // -----------------------------------------------------------------------------
 
-def : InstRW<[A57Write_3cyc_1W], (instregex "CRC32")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^CRC32")>;
 
 
 // Vector Load
@@ -301,4 +318,330 @@
 def : InstRW<[A57Write_8cyc_8S],                (instregex "ST4Fourv(2d)$")>;
 def : InstRW<[A57Write_8cyc_8S, WriteAdr],      (instregex "ST4Fourv(2d)_POST$")>;
 
+// Vector - Integer
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+//   D form - v8i8, v4i16, v2i32
+//   Q form - v16i8, v8i16, v4i32
+//   D form - v1i8, v1i16, v1i32, v1i64
+//   Q form - v16i8, v8i16, v4i32, v2i64
+//   D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
+//   Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>;
+
+// ASIMD arith, reduce, 4H/4S ([SU]ADDLV/ADDV on v4i16 and v4i32 sources)
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v4i16|v4i32)v$")>;
+// ASIMD arith, reduce, 8B/8H ([SU]ADDLV/ADDV on v8i8 and v8i16 sources)
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU]?ADDL?V(v8i8|v8i16)v$")>;
+// ASIMD arith, reduce, 16B
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU]?ADDL?Vv16i8v$")>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+
+// ASIMD multiply, D-form
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate, D-form
+def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+def A57WriteIVMA   : SchedWriteRes<[A57UnitW]> { let Latency = 5;  }
+def A57ReadIVMA4   : SchedReadAdvance<4, [A57WriteIVMA]>;
+def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>;
+
+// ASIMD multiply long
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+def A57WriteIVA    : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
+def A57ReadIVA3    : SchedReadAdvance<3, [A57WriteIVA]>;
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>;
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[A57Write_4cyc_1X], (instregex "^SQSHLU")>;
+
+
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[A57Write_4cyc_2X], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, complex, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Vector - Floating Point
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+//   D form - v2f32
+//   Q form - v4f32, v2f64
+//   D form - 32, 64
+//   D form - v1i32, v1i64
+//   D form - v2i32
+//   Q form - v4i32, v2i64
+
+// ASIMD FP arith, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FABD|FADD|FSUB)(v2f32|32|64|v2i32p)")>;
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FABD|FADD|FSUB)(v4f32|v2f64|v2i64p)")>;
+
+// ASIMD FP arith, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FADDP(v2f32|32|64|v2i32)")>;
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^FADDP(v4f32|v2f64|v2i64)")>;
+
+// ASIMD FP compare, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v2f32|32|64|v1i32|v2i32|v1i64)")>;
+// ASIMD FP compare, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP convert, long and narrow
+def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[A57Write_18cyc_1X], (instregex "FDIVv2f32")>;
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[A57Write_36cyc_2X], (instregex "FDIVv4f32")>;
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[A57Write_64cyc_2X], (instregex "FDIVv2f64")>;
+
+// Note: These were simply duplicated from ASIMD FDIV because of missing documentation
+// ASIMD FP square root, D-form, F32
+def : InstRW<[A57Write_18cyc_1X], (instregex "FSQRTv2f32")>;
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[A57Write_36cyc_2X], (instregex "FSQRTv4f32")>;
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[A57Write_64cyc_2X], (instregex "FSQRTv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?(v2f32)")>;
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FMAX|FMIN)(NM)?(v4f32|v2f64)")>;
+// ASIMD FP max/min, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?P(v2f32|v2i32)")>;
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i64)")>;
+// ASIMD FP max/min, reduce
+def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>;
+
+// ASIMD FP multiply, D-form, FZ
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD FP multiply, Q-form, FZ
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
+def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10;  }
+def A57ReadFPVMA5  : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>;
+def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+
+// Vector - Miscellaneous
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+//   D form - v8i8, v4i16, v2i32
+//   Q form - v16i8, v8i16, v4i32
+//   D form - v1i8, v1i16, v1i32, v1i64
+//   Q form - v16i8, v8i16, v4i32, v2i64
+
+// ASIMD bitwise insert, Q-form
+def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>;
+
+// ASIMD duplicate, gen reg, D-form and Q-form
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>;
+
+// ASIMD move, saturating
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]QXTU?N")>;
+
+// ASIMD reciprocal estimate, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f64|v4f32|v4i32)")>;
+
+// ASIMD reciprocal step, D-form, FZ
+def : InstRW<[A57Write_9cyc_1V], (instregex "^F(RECP|RSQRT)S(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD reciprocal step, Q-form, FZ
+def : InstRW<[A57Write_9cyc_2V], (instregex "^F(RECP|RSQRT)S(v2f64|v4f32|v4i32)")>;
+
+// ASIMD table lookup, D-form
+def : InstRW<[A57Write_3cyc_1V], (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[A57Write_9cyc_3V], (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[A57Write_12cyc_4V], (instregex "^TB[LX]v8i8Four")>;
+// ASIMD table lookup, Q-form
+def : InstRW<[A57Write_6cyc_3V], (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[A57Write_9cyc_5V], (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[A57Write_12cyc_7V], (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[A57Write_15cyc_9V], (instregex "^TB[LX]v16i8Four")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[A57Write_6cyc_1I_1L], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^INSv")>;
+
+// ASIMD unzip/zip, Q-form
+def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Remainder
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>;
+
+def A57WriteFPMA  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
+def A57ReadFPMA5  : SchedReadAdvance<5, [A57WriteFPMA]>;
+def A57ReadFPM    : SchedReadAdvance<0>;
+def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[SU]CVTF")>;
+
+def : InstRW<[A57Write_32cyc_1X], (instrs FDIVDrr)>;
+def : InstRW<[A57Write_18cyc_1X], (instrs FDIVSrr)>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(MAX|MIN).+rr")>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT.+r")>;
+
+def : InstRW<[A57Write_32cyc_1X], (instrs FSQRTDr)>;
+def : InstRW<[A57Write_18cyc_1X], (instrs FSQRTSr)>;
+
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpre)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpost)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpre)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpost)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRBpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRBui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRHpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRHui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRQpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURBi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURDi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURHi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURQi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURSi)>;
+
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPDi)>;
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STNPQi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPXi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPDi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpre)>;
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STPQi)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_1I_4S], (instrs STPQpost)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_2I_4S], (instrs STPQpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPXi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRBpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRDpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRDpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroX)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STRQpre)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroW)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroX)>;
+def : InstRW<[A57Write_2cyc_1I_2S], (instrs STRQui)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STURQi)>;
+
 } // SchedModel = CortexA57Model

diff --git a/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
index a8f421b..6f30108 100644
--- a/lib/Target/AArch64/AArch64SchedA57WriteRes.td
+++ b/lib/Target/AArch64/AArch64SchedA57WriteRes.td

@@ -28,14 +28,18 @@
 def A57Write_5cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 5;  }
 def A57Write_5cyc_1W  : SchedWriteRes<[A57UnitW]> { let Latency = 5;  }
 def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
-def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; }
-def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; }
+def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18;
+                                                    let ResourceCycles = [18]; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
+                                                    let ResourceCycles = [19]; }
 def A57Write_1cyc_1B  : SchedWriteRes<[A57UnitB]> { let Latency = 1;  }
 def A57Write_1cyc_1I  : SchedWriteRes<[A57UnitI]> { let Latency = 1;  }
 def A57Write_1cyc_1S  : SchedWriteRes<[A57UnitS]> { let Latency = 1;  }
 def A57Write_2cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 2;  }
-def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; }
-def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; }
+def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32;
+                                                    let ResourceCycles = [32]; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35;
+                                                    let ResourceCycles = [35]; }
 def A57Write_3cyc_1M  : SchedWriteRes<[A57UnitM]> { let Latency = 3;  }
 def A57Write_3cyc_1V  : SchedWriteRes<[A57UnitV]> { let Latency = 3;  }
 def A57Write_3cyc_1W  : SchedWriteRes<[A57UnitW]> { let Latency = 3;  }
@@ -53,6 +57,7 @@
 def A57Write_64cyc_2X    : SchedWriteRes<[A57UnitX, A57UnitX]> {
   let Latency     = 64;
   let NumMicroOps = 2;
+  let ResourceCycles = [32, 32];
 }
 def A57Write_6cyc_1I_1L  : SchedWriteRes<[A57UnitI,
                                           A57UnitL]> {
@@ -137,6 +142,7 @@
 def A57Write_36cyc_2X    : SchedWriteRes<[A57UnitX, A57UnitX]> {
   let Latency     = 36;
   let NumMicroOps = 2;
+  let ResourceCycles = [18, 18];
 }
 def A57Write_3cyc_1I_1M  : SchedWriteRes<[A57UnitI,
                                           A57UnitM]> {
@@ -153,6 +159,10 @@
   let Latency     = 3;
   let NumMicroOps = 2;
 }
+def A57Write_3cyc_2V     : SchedWriteRes<[A57UnitV, A57UnitV]> {
+  let Latency     = 3;
+  let NumMicroOps = 2;
+}
 def A57Write_4cyc_1I_1L  : SchedWriteRes<[A57UnitI,
                                           A57UnitL]> {
   let Latency     = 4;
@@ -295,6 +305,11 @@
   let Latency     = 9;
   let NumMicroOps = 4;
 }
+def A57Write_12cyc_4V      : SchedWriteRes<[A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 12;
+  let NumMicroOps = 4;
+}
 
 
 //===----------------------------------------------------------------------===//
@@ -334,6 +349,11 @@
   let Latency     = 9;
   let NumMicroOps = 5;
 }
+def A57Write_9cyc_5V       : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+                                            A57UnitV, A57UnitV]> {
+  let Latency     = 9;
+  let NumMicroOps = 5;
+}
 
 
 //===----------------------------------------------------------------------===//
@@ -399,7 +419,7 @@
   let Latency     = 4;
   let NumMicroOps = 7;
 }
-def A57Write_6cyc_1I_6S  : SchedWriteRes<[A57UnitI,
+def A57Write_6cyc_1I_6S     : SchedWriteRes<[A57UnitI,
                                           A57UnitS, A57UnitS, A57UnitS,
                                           A57UnitS, A57UnitS, A57UnitS]> {
   let Latency     = 6;
@@ -412,6 +432,12 @@
   let Latency     = 9;
   let NumMicroOps = 7;
 }
+def A57Write_12cyc_7V       : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV]> {
+  let Latency     = 12;
+  let NumMicroOps = 7;
+}
 
 
 //===----------------------------------------------------------------------===//
@@ -443,11 +469,11 @@
 //===----------------------------------------------------------------------===//
 // Define Generic 9 micro-op types
 
-def A57Write_8cyc_1I_8S  : SchedWriteRes<[A57UnitI,
-                                          A57UnitS, A57UnitS,
-                                          A57UnitS, A57UnitS,
-                                          A57UnitS, A57UnitS,
-                                          A57UnitS, A57UnitS]> {
+def A57Write_8cyc_1I_8S     : SchedWriteRes<[A57UnitI,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS,
+                                            A57UnitS, A57UnitS]> {
   let Latency     = 8;
   let NumMicroOps = 9;
 }
@@ -459,6 +485,12 @@
   let Latency     = 11;
   let NumMicroOps = 9;
 }
+def A57Write_15cyc_9V       : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV, A57UnitV,
+                                             A57UnitV, A57UnitV, A57UnitV]> {
+  let Latency     = 15;
+  let NumMicroOps = 9;
+}
 
 
 //===----------------------------------------------------------------------===//

diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 1bf64fc..0cfd582 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp

@@ -36,8 +36,7 @@
   // instead of memset.
   if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
     const AArch64TargetLowering &TLI =
-        *static_cast<const AArch64TargetLowering *>(
-            DAG.getTarget().getTargetLowering());
+        *DAG.getTarget().getSubtarget<AArch64Subtarget>().getTargetLowering();
 
     EVT IntPtr = TLI.getPointerTy();
     Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());

diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 1180eea..11932d2 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64SELECTIONDAGINFO_H
-#define AArch64SELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 

diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 45f8ddb..0c36e8f 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp

@@ -39,7 +39,7 @@
   static char ID;
   AArch64StorePairSuppress() : MachineFunctionPass(ID) {}
 
-  virtual const char *getPassName() const override {
+  const char *getPassName() const override {
     return "AArch64 Store Pair Suppression";
   }
 
@@ -50,7 +50,7 @@
 
   bool isNarrowFPStore(const MachineInstr &MI);
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<MachineTraceMetrics>();
     AU.addPreserved<MachineTraceMetrics>();
@@ -85,8 +85,7 @@
 
   // If a subtarget does not define resources for STPQi, bail here.
   if (SCDesc->isValid() && !SCDesc->isVariant()) {
-    unsigned ResLenWithSTP = BBTrace.getResourceLength(
-        ArrayRef<const MachineBasicBlock *>(), SCDesc);
+    unsigned ResLenWithSTP = BBTrace.getResourceLength(None, SCDesc);
     if (ResLenWithSTP > ResLength) {
       DEBUG(dbgs() << "  Suppress STP in BB: " << BB->getNumber()
                    << " resources " << ResLength << " -> " << ResLenWithSTP
@@ -118,12 +117,13 @@
 
 bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
   MF = &mf;
-  TII = static_cast<const AArch64InstrInfo *>(MF->getTarget().getInstrInfo());
-  TRI = MF->getTarget().getRegisterInfo();
+  TII =
+      static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo());
+  TRI = MF->getSubtarget().getRegisterInfo();
   MRI = &MF->getRegInfo();
   const TargetSubtargetInfo &ST =
       MF->getTarget().getSubtarget<TargetSubtargetInfo>();
-  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+  SchedModel.init(ST.getSchedModel(), &ST, TII);
 
   Traces = &getAnalysis<MachineTraceMetrics>();
   MinInstr = nullptr;

diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index bb0b72c..47b5d54 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp

@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64InstrInfo.h"
+#include "AArch64PBQPRegAlloc.h"
 #include "AArch64Subtarget.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineScheduler.h"
@@ -43,8 +44,8 @@
 
 AArch64Subtarget::AArch64Subtarget(const std::string &TT,
                                    const std::string &CPU,
-                                   const std::string &FS, TargetMachine &TM,
-                                   bool LittleEndian)
+                                   const std::string &FS,
+                                   const TargetMachine &TM, bool LittleEndian)
     : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
       HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false),
       HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU),
@@ -64,13 +65,7 @@
 unsigned char
 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                         const TargetMachine &TM) const {
-
-  // Determine whether this is a reference to a definition or a declaration.
-  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
-  // load from stub.
-  bool isDecl = GV->hasAvailableExternallyLinkage();
-  if (GV->isDeclaration() && !GV->isMaterializable())
-    isDecl = true;
+  bool isDecl = GV->isDeclarationForLinker();
 
   // MachO large model always goes via a GOT, simply to get a single 8-byte
   // absolute relocation on all global addresses.
@@ -78,10 +73,15 @@
     return AArch64II::MO_GOT;
 
   // The small code mode's direct accesses use ADRP, which cannot necessarily
-  // produce the value 0 (if the code is above 4GB). Therefore they must use the
-  // GOT.
-  if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl)
-    return AArch64II::MO_GOT;
+  // produce the value 0 (if the code is above 4GB).
+  if (TM.getCodeModel() == CodeModel::Small &&
+      GV->isWeakForLinker() && isDecl) {
+    // In PIC mode use the GOT, but in absolute mode use a constant pool load.
+    if (TM.getRelocationModel() == Reloc::Static)
+        return AArch64II::MO_CONSTPOOL;
+    else
+        return AArch64II::MO_GOT;
+  }
 
   // If symbol visibility is hidden, the extra load is not needed if
   // the symbol is definitely defined in the current translation unit.
@@ -128,3 +128,11 @@
 bool AArch64Subtarget::enableEarlyIfConversion() const {
   return EnableEarlyIfConvert;
 }
+
+std::unique_ptr<PBQPRAConstraint>
+AArch64Subtarget::getCustomPBQPConstraints() const {
+  if (!isCortexA57())
+    return nullptr;
+
+  return llvm::make_unique<A57ChainingConstraint>();
+}

diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 10c646d..e2740f1 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h

@@ -11,12 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64SUBTARGET_H
-#define AArch64SUBTARGET_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
 
-#include "AArch64InstrInfo.h"
 #include "AArch64FrameLowering.h"
 #include "AArch64ISelLowering.h"
+#include "AArch64InstrInfo.h"
 #include "AArch64RegisterInfo.h"
 #include "AArch64SelectionDAGInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -69,18 +69,27 @@
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   AArch64Subtarget(const std::string &TT, const std::string &CPU,
-		   const std::string &FS, TargetMachine &TM, bool LittleEndian);
+                   const std::string &FS, const TargetMachine &TM,
+                   bool LittleEndian);
 
-  const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
-  const AArch64FrameLowering *getFrameLowering() const {
+  const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const AArch64FrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
-  const AArch64TargetLowering *getTargetLowering() const {
+  const AArch64TargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }
-  const AArch64InstrInfo *getInstrInfo() const { return &InstrInfo; }
-  const DataLayout *getDataLayout() const { return &DL; }
+  const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const AArch64RegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
   bool enableMachineScheduler() const override { return true; }
+  bool enablePostMachineScheduler() const override {
+    return isCortexA53() || isCortexA57();
+  }
 
   bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
 
@@ -94,15 +103,20 @@
   bool isLittleEndian() const { return DL.isLittleEndian(); }
 
   bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetIOS() const { return TargetTriple.isiOS(); }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
 
+  bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
-
   bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
 
   bool isCyclone() const { return CPUString == "cyclone"; }
   bool isCortexA57() const { return CPUString == "cortex-a57"; }
   bool isCortexA53() const { return CPUString == "cortex-a53"; }
 
+  bool useAA() const override { return isCortexA53(); }
+
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
   unsigned getMaxInlineSizeThreshold() const { return 64; }
@@ -128,7 +142,9 @@
                            unsigned NumRegionInstrs) const override;
 
   bool enableEarlyIfConversion() const override;
+
+  std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
 };
 } // End llvm namespace
 
-#endif // AArch64SUBTARGET_H
+#endif

diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 722e5e7..beed8e0 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp

@@ -12,8 +12,11 @@
 
 #include "AArch64.h"
 #include "AArch64TargetMachine.h"
-#include "llvm/PassManager.h"
+#include "AArch64TargetObjectFile.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/IR/Function.h"
+#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetOptions.h"
@@ -24,6 +27,10 @@
 EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"),
            cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableMCR("aarch64-mcr",
+                               cl::desc("Enable the machine combiner pass"),
+                               cl::init(true), cl::Hidden);
+
 static cl::opt<bool>
 EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"),
                      cl::init(true), cl::Hidden);
@@ -64,19 +71,36 @@
                         cl::desc("Run early if-conversion"),
                         cl::init(true));
 
+static cl::opt<bool>
+EnableCondOpt("aarch64-condopt",
+              cl::desc("Enable the condition optimizer pass"),
+              cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
 EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden,
                 cl::desc("Work around Cortex-A53 erratum 835769"),
                 cl::init(false));
 
+static cl::opt<bool>
+EnableGEPOpt("aarch64-gep-opt", cl::Hidden,
+             cl::desc("Enable optimizations on complex GEPs"),
+             cl::init(true));
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
   RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget);
+  RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64Target);
+}
 
-  RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64leTarget);
-  RegisterTargetMachine<AArch64beTargetMachine> W(TheARM64beTarget);
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO())
+    return make_unique<AArch64_MachoTargetObjectFile>();
+
+  return make_unique<AArch64_ELFTargetObjectFile>();
 }
 
 /// TargetMachine ctor - Create an AArch64 architecture model.
@@ -88,10 +112,39 @@
                                            CodeGenOpt::Level OL,
                                            bool LittleEndian)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-      Subtarget(TT, CPU, FS, *this, LittleEndian) {
+      TLOF(createTLOF(Triple(getTargetTriple()))),
+      Subtarget(TT, CPU, FS, *this, LittleEndian), isLittle(LittleEndian) {
   initAsmInfo();
 }
 
+AArch64TargetMachine::~AArch64TargetMachine() {}
+
+const AArch64Subtarget *
+AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
+  AttributeSet FnAttrs = F.getAttributes();
+  Attribute CPUAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+  Attribute FSAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  auto &I = SubtargetMap[CPU + FS];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, isLittle);
+  }
+  return I.get();
+}
+
 void AArch64leTargetMachine::anchor() { }
 
 AArch64leTargetMachine::
@@ -115,7 +168,10 @@
 class AArch64PassConfig : public TargetPassConfig {
 public:
   AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
-      : TargetPassConfig(TM, PM) {}
+      : TargetPassConfig(TM, PM) {
+    if (TM->getOptLevel() != CodeGenOpt::None)
+      substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+  }
 
   AArch64TargetMachine &getAArch64TargetMachine() const {
     return getTM<AArch64TargetMachine>();
@@ -147,7 +203,7 @@
 void AArch64PassConfig::addIRPasses() {
   // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
   // ourselves.
-  addPass(createAtomicExpandLoadLinkedPass(TM));
+  addPass(createAtomicExpandPass(TM));
 
   // Cmpxchg instructions are often used with a subsequent comparison to
   // determine whether it succeeded. We can exploit existing control-flow in
@@ -156,6 +212,19 @@
     addPass(createCFGSimplificationPass());
 
   TargetPassConfig::addIRPasses();
+
+  if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
+    // Call SeparateConstOffsetFromGEP pass to extract constants within indices
+    // and lower a GEP with multiple indices to either arithmetic operations or
+    // multiple GEPs with a single index.
+    addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+    // Call EarlyCSE pass to find and remove subexpressions in the lowered
+    // result.
+    addPass(createEarlyCSEPass());
+    // Do loop invariant code motion in case part of the lowered result is
+    // invariant.
+    addPass(createLICMPass());
+  }
 }
 
 // Pass Pipeline Configuration
@@ -185,8 +254,12 @@
 }
 
 bool AArch64PassConfig::addILPOpts() {
+  if (EnableCondOpt)
+    addPass(createAArch64ConditionOptimizerPass());
   if (EnableCCMP)
     addPass(createAArch64ConditionalCompares());
+  if (EnableMCR)
+    addPass(&MachineCombinerID);
   if (EnableEarlyIfConversion)
     addPass(&EarlyIfConverterID);
   if (EnableStPairSuppress)
@@ -196,8 +269,12 @@
 
 bool AArch64PassConfig::addPreRegAlloc() {
   // Use AdvSIMD scalar instructions whenever profitable.
-  if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) {
     addPass(createAArch64AdvSIMDScalar());
+    // The AdvSIMD pass may produce copies that can be rewritten to
+    // be register coalescer friendly.
+    addPass(&PeepholeOptimizerID);
+  }
   return true;
 }
 
@@ -206,7 +283,9 @@
   if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
     addPass(createAArch64DeadRegisterDefinitions());
   if (TM->getOptLevel() != CodeGenOpt::None &&
-      TM->getSubtarget<AArch64Subtarget>().isCortexA57())
+      (TM->getSubtarget<AArch64Subtarget>().isCortexA53() ||
+       TM->getSubtarget<AArch64Subtarget>().isCortexA57()) &&
+      usingDefaultRegAlloc())
     // Improve performance for some FP/SIMD code for A57.
     addPass(createAArch64A57FPLoadBalancing());
   return true;

diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 852cb3f..75c65c5 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64TARGETMACHINE_H
-#define AArch64TARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
 
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
@@ -23,7 +23,9 @@
 
 class AArch64TargetMachine : public LLVMTargetMachine {
 protected:
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   AArch64Subtarget Subtarget;
+  mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap;
 
 public:
   AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU,
@@ -31,33 +33,25 @@
                        Reloc::Model RM, CodeModel::Model CM,
                        CodeGenOpt::Level OL, bool IsLittleEndian);
 
+  ~AArch64TargetMachine() override;
+
   const AArch64Subtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
-  const AArch64TargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
-  const AArch64FrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  const AArch64InstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const AArch64RegisterInfo *getRegisterInfo() const override {
-    return &getInstrInfo()->getRegisterInfo();
-  }
-  const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
+  const AArch64Subtarget *getSubtargetImpl(const Function &F) const override;
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
   /// \brief Register AArch64 analysis passes with a pass manager.
   void addAnalysisPasses(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile* getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+private:
+  bool isLittle;
 };
 
 // AArch64leTargetMachine - AArch64 little endian target machine.

diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index de63cb4..2e595f9 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
-#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"

diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1dac14b..b1a2914 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

@@ -51,7 +51,7 @@
 
   AArch64TTI(const AArch64TargetMachine *TM)
       : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
-        TLI(TM->getTargetLowering()) {
+        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
     initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
   }
 
@@ -104,7 +104,7 @@
     return 64;
   }
 
-  unsigned getMaximumUnrollFactor() const override { return 2; }
+  unsigned getMaxInterleaveFactor() const override;
 
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
       override;
@@ -112,10 +112,11 @@
   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
       override;
 
-  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                  OperandValueKind Opd1Info = OK_AnyValue,
-                                  OperandValueKind Opd2Info = OK_AnyValue) const
-      override;
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
+      OperandValueKind Opd2Info = OK_AnyValue,
+      OperandValueProperties Opd1PropInfo = OP_None,
+      OperandValueProperties Opd2PropInfo = OP_None) const override;
 
   unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
 
@@ -124,6 +125,13 @@
 
   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                            unsigned AddressSpace) const override;
+
+  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override;
+
+  void getUnrollingPreferences(const Function *F, Loop *L,
+                               UnrollingPreferences &UP) const override;
+
+
   /// @}
 };
 
@@ -400,18 +408,42 @@
   return 2;
 }
 
-unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                          OperandValueKind Opd1Info,
-                                          OperandValueKind Opd2Info) const {
+unsigned AArch64TTI::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
+    OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
+    OperandValueProperties Opd2PropInfo) const {
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 
+  if (ISD == ISD::SDIV &&
+      Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+    // On AArch64, scalar signed division by power-of-two constants is
+    // normally expanded to the sequence ADD + CMP + SELECT + SRA.
+    // The OperandValue properties may not be the same as those of the previous
+    // operation; conservatively assume OP_None.
+    unsigned Cost =
+      getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+                             TargetTransformInfo::OP_None,
+                             TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    return Cost;
+  }
+
   switch (ISD) {
   default:
-    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
-                                                       Opd2Info);
+    return TargetTransformInfo::getArithmeticInstrCost(
+        Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
   case ISD::ADD:
   case ISD::MUL:
   case ISD::XOR:
@@ -498,3 +530,27 @@
 
   return LT.first;
 }
+
+unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const {
+  unsigned Cost = 0;
+  for (auto *I : Tys) {
+    if (!I->isVectorTy())
+      continue;
+    if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
+      Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
+        getMemoryOpCost(Instruction::Load, I, 128, 0);
+  }
+  return Cost;
+}
+
+unsigned AArch64TTI::getMaxInterleaveFactor() const {
+  if (ST->isCortexA57())
+    return 4;
+  return 2;
+}
+
+void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L,
+                                         UnrollingPreferences &UP) const {
+  // Disable partial & runtime unrolling on -Os.
+  UP.PartialOptSizeThreshold = 0;
+}

diff --git a/lib/Target/AArch64/Android.mk b/lib/Target/AArch64/Android.mk
index d7b3317..f3acd3a 100644
--- a/lib/Target/AArch64/Android.mk
+++ b/lib/Target/AArch64/Android.mk

@@ -24,6 +24,7 @@
   AArch64CleanupLocalDynamicTLSPass.cpp \
   AArch64CollectLOH.cpp \
   AArch64ConditionalCompares.cpp \
+  AArch64ConditionOptimizer.cpp \
   AArch64DeadRegisterDefinitionsPass.cpp \
   AArch64ExpandPseudoInsts.cpp \
   AArch64FastISel.cpp \
@@ -33,6 +34,7 @@
   AArch64ISelLowering.cpp \
   AArch64LoadStoreOptimizer.cpp \
   AArch64MCInstLower.cpp \
+  AArch64PBQPRegAlloc.cpp \
   AArch64PromoteConstant.cpp \
   AArch64RegisterInfo.cpp \
   AArch64SelectionDAGInfo.cpp \

diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index c42d11e..98e0ea8 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp

@@ -10,26 +10,28 @@
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64MCExpr.h"
 #include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/MC/MCParser/MCAsmParser.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
 #include <cstdio>
 using namespace llvm;
 
@@ -41,7 +43,6 @@
 private:
   StringRef Mnemonic; ///< Instruction mnemonic.
   MCSubtargetInfo &STI;
-  MCAsmParser &Parser;
 
   // Map of register aliases registers via the .req directive.
   StringMap<std::pair<bool, unsigned> > RegisterReqs;
@@ -51,10 +52,7 @@
     return static_cast<AArch64TargetStreamer &>(TS);
   }
 
-  MCAsmParser &getParser() const { return Parser; }
-  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
-  SMLoc getLoc() const { return Parser.getTok().getLoc(); }
+  SMLoc getLoc() const { return getParser().getTok().getLoc(); }
 
   bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
   AArch64CC::CondCode parseCondCodeString(StringRef Cond);
@@ -68,11 +66,13 @@
   bool parseOperand(OperandVector &Operands, bool isCondCode,
                     bool invertCondCode);
 
-  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
-  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+  void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); }
   bool showMatchError(SMLoc Loc, unsigned ErrCode);
 
   bool parseDirectiveWord(unsigned Size, SMLoc L);
+  bool parseDirectiveInst(SMLoc L);
+
   bool parseDirectiveTLSDescCall(SMLoc L);
 
   bool parseDirectiveLOH(StringRef LOH, SMLoc L);
@@ -84,7 +84,7 @@
   bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
-                               unsigned &ErrorInfo,
+                               uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
 /// @name Auto-generated Match Functions
 /// {
@@ -116,10 +116,11 @@
   AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
                  const MCInstrInfo &MII,
                  const MCTargetOptions &Options)
-      : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+      : MCTargetAsmParser(), STI(_STI) {
     MCAsmParserExtension::Initialize(_Parser);
-    if (Parser.getStreamer().getTargetStreamer() == nullptr)
-      new AArch64TargetStreamer(Parser.getStreamer());
+    MCStreamer &S = getParser().getStreamer();
+    if (S.getTargetStreamer() == nullptr)
+      new AArch64TargetStreamer(S);
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -1253,134 +1254,116 @@
 
   void addSImm9Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4));
   }
 
   void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8));
   }
 
   void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16));
   }
 
   void addImm0_7Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm1_8Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm0_15Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm1_16Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     assert(MCE && "Invalid constant immediate operand!");
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm0_31Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm1_31Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm1_32Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm0_63Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm1_63Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm1_64Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm0_127Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm0_255Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addImm32_63Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
   void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid logical immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     uint64_t encoding =
         AArch64_AM::encodeLogicalImmediate(MCE->getValue() & 0xFFFFFFFF, 32);
     Inst.addOperand(MCOperand::CreateImm(encoding));
@@ -1388,8 +1371,7 @@
 
   void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid logical immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
     Inst.addOperand(MCOperand::CreateImm(encoding));
   }
@@ -1412,8 +1394,7 @@
 
   void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid immediate operand!");
+    const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
     uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue());
     Inst.addOperand(MCOperand::CreateImm(encoding));
   }
@@ -1894,6 +1875,7 @@
 /// Identifier when called, and if it is a register name the token is eaten and
 /// the register is added to the operand list.
 int AArch64AsmParser::tryParseRegister() {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
 
@@ -1918,6 +1900,7 @@
 /// tryMatchVectorRegister - Try to parse a vector register name with optional
 /// kind specifier. If it is a register specifier, eat the token and return it.
 int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
+  MCAsmParser &Parser = getParser();
   if (Parser.getTok().isNot(AsmToken::Identifier)) {
     TokError("vector register expected");
     return -1;
@@ -1950,6 +1933,7 @@
 /// tryParseSysCROperand - Try to parse a system instruction CR operand name.
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
 
   if (Parser.getTok().isNot(AsmToken::Identifier)) {
@@ -1979,6 +1963,7 @@
 /// tryParsePrefetch - Try to parse a prefetch operand.
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
   const AsmToken &Tok = Parser.getTok();
   // Either an identifier for named values or a 5-bit immediate.
@@ -2026,6 +2011,7 @@
 /// instruction.
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
   const MCExpr *Expr;
 
@@ -2076,6 +2062,7 @@
 /// instruction.
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
   const MCExpr *Expr;
 
@@ -2095,6 +2082,7 @@
 /// tryParseFPImm - A floating point immediate expression operand.
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
 
   bool Hash = false;
@@ -2157,6 +2145,7 @@
 /// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
 
   if (Parser.getTok().is(AsmToken::Hash))
@@ -2248,6 +2237,7 @@
 /// parseCondCode - Parse a Condition Code operand.
 bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
                                      bool invertCondCode) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
   const AsmToken &Tok = Parser.getTok();
   assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
@@ -2273,6 +2263,7 @@
 /// them if present.
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   std::string LowerID = Tok.getString().lower();
   AArch64_AM::ShiftExtendType ShOp =
@@ -2318,10 +2309,11 @@
   if (Hash)
     Parser.Lex(); // Eat the '#'.
 
-  // Make sure we do actually have a number
-  if (!Parser.getTok().is(AsmToken::Integer)) {
-    Error(Parser.getTok().getLoc(),
-          "expected integer shift amount");
+  // Make sure we do actually have a number or a parenthesized expression.
+  SMLoc E = Parser.getTok().getLoc();
+  if (!Parser.getTok().is(AsmToken::Integer) &&
+      !Parser.getTok().is(AsmToken::LParen)) {
+    Error(E, "expected integer shift amount");
     return MatchOperand_ParseFail;
   }
 
@@ -2331,11 +2323,11 @@
 
   const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
   if (!MCE) {
-    TokError("expected #imm after shift specifier");
+    Error(E, "expected constant '#imm' after shift specifier");
     return MatchOperand_ParseFail;
   }
 
-  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
   Operands.push_back(AArch64Operand::CreateShiftExtend(
       ShOp, MCE->getValue(), true, S, E, getContext()));
   return MatchOperand_Success;
@@ -2352,6 +2344,7 @@
   Operands.push_back(
       AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
 
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   StringRef Op = Tok.getString();
   SMLoc S = Tok.getLoc();
@@ -2590,6 +2583,7 @@
 
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
 
   // Can be either a #imm style literal or an option name
@@ -2643,6 +2637,7 @@
 
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
 
   if (Tok.isNot(AsmToken::Identifier))
@@ -2657,6 +2652,7 @@
 
 /// tryParseVectorRegister - Parse a vector register operand.
 bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   if (Parser.getTok().isNot(AsmToken::Identifier))
     return true;
 
@@ -2705,6 +2701,7 @@
 
 /// parseRegister - Parse a non-vector register operand.
 bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
   // Try for a vector register.
   if (!tryParseVectorRegister(Operands))
@@ -2747,6 +2744,7 @@
 }
 
 bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
+  MCAsmParser &Parser = getParser();
   bool HasELFModifier = false;
   AArch64MCExpr::VariantKind RefKind;
 
@@ -2825,6 +2823,7 @@
 
 /// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
 bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
   SMLoc S = getLoc();
   Parser.Lex(); // Eat left bracket token.
@@ -2923,6 +2922,7 @@
 
 AArch64AsmParser::OperandMatchResultTy
 AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   if (!Tok.is(AsmToken::Identifier))
     return MatchOperand_NoMatch;
@@ -2968,6 +2968,7 @@
 /// operand regardless of the mnemonic.
 bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
                                   bool invertCondCode) {
+  MCAsmParser &Parser = getParser();
   // Check if the current operand has a custom associated parser, if so, try to
   // custom parse the operand, or fallback to the general approach.
   OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -3087,13 +3088,18 @@
     if (getParser().parseExpression(SubExprVal))
       return true;
 
+    if (Operands.size() < 2 ||
+        !static_cast<AArch64Operand &>(*Operands[1]).isReg())
+      return true;
+
+    bool IsXReg =
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Operands[1]->getReg());
+
     MCContext& Ctx = getContext();
     E = SMLoc::getFromPointer(Loc.getPointer() - 1);
     // If the op is an imm and can be fit into a mov, then replace ldr with mov.
-    if (isa<MCConstantExpr>(SubExprVal) && Operands.size() >= 2 &&
-        static_cast<AArch64Operand &>(*Operands[1]).isReg()) {
-      bool IsXReg =  AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
-            Operands[1]->getReg());
+    if (isa<MCConstantExpr>(SubExprVal)) {
       uint64_t Imm = (cast<MCConstantExpr>(SubExprVal))->getValue();
       uint32_t ShiftAmt = 0, MaxShiftAmt = IsXReg ? 48 : 16;
       while(Imm > 0xFFFF && countTrailingZeros(Imm) >= 16) {
@@ -3109,9 +3115,14 @@
                      ShiftAmt, true, S, E, Ctx));
         return false;
       }
+      APInt Simm = APInt(64, Imm << ShiftAmt);
+      // check if the immediate is an unsigned or signed 32-bit int for W regs
+      if (!IsXReg && !(Simm.isIntN(32) || Simm.isSignedIntN(32)))
+        return Error(Loc, "Immediate too large for register");
     }
     // If it is a label or an imm that cannot fit in a movz, put it into CP.
-    const MCExpr *CPLoc = getTargetStreamer().addConstantPoolEntry(SubExprVal);
+    const MCExpr *CPLoc =
+        getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4);
     Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx));
     return false;
   }
@@ -3123,6 +3134,7 @@
 bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
                                         StringRef Name, SMLoc NameLoc,
                                         OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   Name = StringSwitch<StringRef>(Name.lower())
              .Case("beq", "b.eq")
              .Case("bne", "b.ne")
@@ -3571,12 +3583,12 @@
   }
 }
 
-static const char *getSubtargetFeatureName(unsigned Val);
+static const char *getSubtargetFeatureName(uint64_t Val);
 
 bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                                OperandVector &Operands,
                                                MCStreamer &Out,
-                                               unsigned &ErrorInfo,
+                                               uint64_t &ErrorInfo,
                                                bool MatchingInlineAsm) {
   assert(!Operands.empty() && "Unexpect empty operand list!");
   AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
@@ -3826,7 +3838,7 @@
     // Special case the error message for the very common case where only
     // a single subtarget feature is missing (neon, e.g.).
     std::string Msg = "instruction requires:";
-    unsigned Mask = 1;
+    uint64_t Mask = 1;
     for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
       if (ErrorInfo & Mask) {
         Msg += " ";
@@ -3840,7 +3852,7 @@
     return showMatchError(IDLoc, MatchResult);
   case Match_InvalidOperand: {
     SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0U) {
+    if (ErrorInfo != ~0ULL) {
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction");
 
@@ -3920,6 +3932,11 @@
 
 /// ParseDirective parses the arm specific directives
 bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
+  const MCObjectFileInfo::Environment Format =
+    getContext().getObjectFileInfo()->getObjectFileType();
+  bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+  bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
+
   StringRef IDVal = DirectiveID.getIdentifier();
   SMLoc Loc = DirectiveID.getLoc();
   if (IDVal == ".hword")
@@ -3935,12 +3952,18 @@
   if (IDVal == ".unreq")
     return parseDirectiveUnreq(DirectiveID.getLoc());
 
+  if (!IsMachO && !IsCOFF) {
+    if (IDVal == ".inst")
+      return parseDirectiveInst(Loc);
+  }
+
   return parseDirectiveLOH(IDVal, Loc);
 }
 
 /// parseDirectiveWord
 ///  ::= .word [ expression (, expression)* ]
 bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
       const MCExpr *Value;
@@ -3963,6 +3986,47 @@
   return false;
 }
 
+/// parseDirectiveInst
+///  ::= .inst opcode [, ...]
+bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
+  MCAsmParser &Parser = getParser();
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Parser.eatToEndOfStatement();
+    Error(Loc, "expected expression following directive");
+    return false;
+  }
+
+  for (;;) {
+    const MCExpr *Expr;
+
+    if (getParser().parseExpression(Expr)) {
+      Error(Loc, "expected expression");
+      return false;
+    }
+
+    const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
+    if (!Value) {
+      Error(Loc, "expected constant expression");
+      return false;
+    }
+
+    getTargetStreamer().emitInst(Value->getValue());
+
+    if (getLexer().is(AsmToken::EndOfStatement))
+      break;
+
+    if (getLexer().isNot(AsmToken::Comma)) {
+      Error(Loc, "unexpected token in directive");
+      return false;
+    }
+
+    Parser.Lex(); // Eat comma.
+  }
+
+  Parser.Lex();
+  return false;
+}
+
 // parseDirectiveTLSDescCall:
 //   ::= .tlsdesccall symbol
 bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
@@ -3994,10 +4058,9 @@
     // We successfully get a numeric value for the identifier.
     // Check if it is valid.
     int64_t Id = getParser().getTok().getIntVal();
-    Kind = (MCLOHType)Id;
-    // Check that Id does not overflow MCLOHType.
-    if (!isValidMCLOHType(Kind) || Id != Kind)
+    if (Id <= -1U && !isValidMCLOHType(Id))
       return TokError("invalid numeric identifier in directive");
+    Kind = (MCLOHType)Id;
   } else {
     StringRef Name = getTok().getIdentifier();
     // We successfully parse an identifier.
@@ -4045,6 +4108,7 @@
 /// parseDirectiveReq
 ///  ::= name .req registername
 bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+  MCAsmParser &Parser = getParser();
   Parser.Lex(); // Eat the '.req' token.
   SMLoc SRegLoc = getLoc();
   unsigned RegNum = tryParseRegister();
@@ -4076,7 +4140,7 @@
   Parser.Lex(); // Consume the EndOfStatement
 
   auto pair = std::make_pair(IsVector, RegNum);
-  if (RegisterReqs.GetOrCreateValue(Name, pair).getValue() != pair)
+  if (!RegisterReqs.insert(std::make_pair(Name, pair)).second)
     Warning(L, "ignoring redefinition of register alias '" + Name + "'");
 
   return true;
@@ -4085,6 +4149,7 @@
 /// parseDirectiveUneq
 ///  ::= .unreq registername
 bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (Parser.getTok().isNot(AsmToken::Identifier)) {
     Error(Parser.getTok().getLoc(), "unexpected input in .unreq directive.");
     Parser.eatToEndOfStatement();
@@ -4149,9 +4214,7 @@
 extern "C" void LLVMInitializeAArch64AsmParser() {
   RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
   RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
-
-  RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget);
-  RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget);
+  RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64Target);
 }
 
 #define GET_REGISTER_MATCHER

diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index c2f0488..f26327f 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt

@@ -2,7 +2,7 @@
 
 tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering)
 tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
@@ -28,12 +28,14 @@
   AArch64FastISel.cpp
   AArch64A53Fix835769.cpp
   AArch64FrameLowering.cpp
+  AArch64ConditionOptimizer.cpp
   AArch64ISelDAGToDAG.cpp
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
   AArch64LoadStoreOptimizer.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
+  AArch64PBQPRegAlloc.cpp
   AArch64RegisterInfo.cpp
   AArch64SelectionDAGInfo.cpp
   AArch64StorePairSuppress.cpp

diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 6de27d6..878e29c 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp

@@ -15,12 +15,11 @@
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/MemoryObject.h"
-#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
 
@@ -200,26 +199,24 @@
 }
 
 DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                               const MemoryObject &Region,
-                                               uint64_t Address,
-                                               raw_ostream &os,
-                                               raw_ostream &cs) const {
-  CommentStream = &cs;
-
-  uint8_t bytes[4];
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &OS,
+                                                 raw_ostream &CS) const {
+  CommentStream = &CS;
 
   Size = 0;
   // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1)
+  if (Bytes.size() < 4)
     return Fail;
   Size = 4;
 
   // Encoded as a small-endian 32-bit word in the stream.
-  uint32_t insn =
-      (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0);
+  uint32_t Insn =
+      (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
 
   // Calling the auto-generated decoder function.
-  return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
+  return decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI);
 }
 
 static MCSymbolizer *
@@ -243,13 +240,9 @@
   TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget,
                                        createAArch64ExternalSymbolizer);
 
-  TargetRegistry::RegisterMCDisassembler(TheARM64leTarget,
+  TargetRegistry::RegisterMCDisassembler(TheARM64Target,
                                          createAArch64Disassembler);
-  TargetRegistry::RegisterMCDisassembler(TheARM64beTarget,
-                                         createAArch64Disassembler);
-  TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget,
-                                       createAArch64ExternalSymbolizer);
-  TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget,
+  TargetRegistry::RegisterMCSymbolizer(TheARM64Target,
                                        createAArch64ExternalSymbolizer);
 }
 
@@ -592,7 +585,7 @@
                                                uint64_t Addr,
                                                const void *Decoder) {
   // scale{5} is asserted as 1 in tblgen.
-  Imm |= 0x20;  
+  Imm |= 0x20;
   Inst.addOperand(MCOperand::CreateImm(64 - Imm));
   return Success;
 }
@@ -614,7 +607,7 @@
   if (ImmVal & (1 << (19 - 1)))
     ImmVal |= ~((1LL << 19) - 1);
 
-  if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr,
+  if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal *  4, Addr,
                                      Inst.getOpcode() != AArch64::LDRXl, 0, 4))
     Inst.addOperand(MCOperand::CreateImm(ImmVal));
   return Success;
@@ -630,35 +623,19 @@
 static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
                                             uint64_t Address,
                                             const void *Decoder) {
-  const AArch64Disassembler *Dis =
-      static_cast<const AArch64Disassembler *>(Decoder);
-  const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
-
-  Imm |= 0x8000;
   Inst.addOperand(MCOperand::CreateImm(Imm));
 
-  bool ValidNamed;
-  (void)AArch64SysReg::MRSMapper(STI.getFeatureBits())
-      .toString(Imm, ValidNamed);
-
-  return ValidNamed ? Success : Fail;
+  // Every system register in the encoding space is valid with the syntax
+  // S<op0>_<op1>_<Cn>_<Cm>_<op2>, so decoding system registers always succeeds.
+  return Success;
 }
 
 static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
                                             uint64_t Address,
                                             const void *Decoder) {
-  const AArch64Disassembler *Dis =
-      static_cast<const AArch64Disassembler *>(Decoder);
-  const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
-
-  Imm |= 0x8000;
   Inst.addOperand(MCOperand::CreateImm(Imm));
 
-  bool ValidNamed;
-  (void)AArch64SysReg::MSRMapper(STI.getFeatureBits())
-      .toString(Imm, ValidNamed);
-
-  return ValidNamed ? Success : Fail;
+  return Success;
 }
 
 static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
@@ -1510,7 +1487,7 @@
   if (imm & (1 << (26 - 1)))
     imm |= ~((1LL << 26) - 1);
 
-  if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4))
+  if (!Dis->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 4))
     Inst.addOperand(MCOperand::CreateImm(imm));
 
   return Success;
@@ -1530,7 +1507,7 @@
 
   bool ValidNamed;
   (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed);
-  
+
   return ValidNamed ? Success : Fail;
 }
 
@@ -1552,7 +1529,7 @@
   else
     DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
   Inst.addOperand(MCOperand::CreateImm(bit));
-  if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4))
+  if (!Dis->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 4))
     Inst.addOperand(MCOperand::CreateImm(dst));
 
   return Success;

diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
index 68d4867..7fb57ad 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h

@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64DISASSEMBLER_H
-#define AArch64DISASSEMBLER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
 
 #include "llvm/MC/MCDisassembler.h"
 
@@ -28,11 +28,10 @@
 
   ~AArch64Disassembler() {}
 
-  /// getInstruction - See MCDisassembler.
   MCDisassembler::DecodeStatus
-  getInstruction(MCInst &instr, uint64_t &size, const MemoryObject &region,
-                 uint64_t address, raw_ostream &vStream,
-                 raw_ostream &cStream) const override;
+  getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+                 uint64_t Address, raw_ostream &VStream,
+                 raw_ostream &CStream) const override;
 };
 
 } // namespace llvm

diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
index 171d31c..12b8450 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64EXTERNALSYMBOLIZER_H
-#define AArch64EXTERNALSYMBOLIZER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
 
 #include "llvm/MC/MCExternalSymbolizer.h"
 

diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
index a4224f4..62827e8 100644
--- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt
+++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = AArch64Disassembler
 parent = AArch64
-required_libraries = AArch64Info AArch64Utils MC Support
+required_libraries = AArch64Info AArch64Utils MC MCDisassembler Support
 add_to_library_groups = AArch64

diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 8a21f06..46a1d79 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp

@@ -16,8 +16,8 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
@@ -1223,7 +1223,7 @@
   // If the label has already been resolved to an immediate offset (say, when
   // we're running the disassembler), just print the immediate.
   if (Op.isImm()) {
-    O << "#" << (Op.getImm() << 2);
+    O << "#" << (Op.getImm() * 4);
     return;
   }
 
@@ -1247,7 +1247,7 @@
   // If the label has already been resolved to an immediate offset (say, when
   // we're running the disassembler), just print the immediate.
   if (Op.isImm()) {
-    O << "#" << (Op.getImm() << 12);
+    O << "#" << (Op.getImm() * (1 << 12));
     return;
   }
 
@@ -1276,24 +1276,20 @@
                                                 raw_ostream &O) {
   unsigned Val = MI->getOperand(OpNo).getImm();
 
-  bool Valid;
   auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures());
-  std::string Name = Mapper.toString(Val, Valid);
+  std::string Name = Mapper.toString(Val);
 
-  if (Valid)
-    O << StringRef(Name).upper();
+  O << StringRef(Name).upper();
 }
 
 void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
                                                 raw_ostream &O) {
   unsigned Val = MI->getOperand(OpNo).getImm();
 
-  bool Valid;
   auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures());
-  std::string Name = Mapper.toString(Val, Valid);
+  std::string Name = Mapper.toString(Val);
 
-  if (Valid)
-    O << StringRef(Name).upper();
+  O << StringRef(Name).upper();
 }
 
 void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,

diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index fe7666e..5f51621 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64INSTPRINTER_H
-#define AArch64INSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
+#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
 
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/ADT/StringRef.h"
@@ -127,8 +127,9 @@
 
   void printInstruction(const MCInst *MI, raw_ostream &O) override;
   bool printAliasInstr(const MCInst *MI, raw_ostream &O) override;
-  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
-                                       unsigned PrintMethodIdx, raw_ostream &O);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx,
+                               raw_ostream &O) override;
   StringRef getRegName(unsigned RegNo) const override {
     return getRegisterName(RegNo);
   }

diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 642c183..573fa10 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt

@@ -31,5 +31,5 @@
 type = Library
 name = AArch64CodeGen
 parent = AArch64
-required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
+required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
 add_to_library_groups = AArch64

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 8b1e44e..1dc506a 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
-#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
 
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
@@ -210,67 +210,62 @@
 /// as the immediate operand of a logical instruction for the given register
 /// size.  If so, return true with "encoding" set to the encoded value in
 /// the form N:immr:imms.
-static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize,
-                                           uint64_t &encoding) {
-  if (imm == 0ULL || imm == ~0ULL ||
-      (regSize != 64 && (imm >> regSize != 0 || imm == ~0U)))
+static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize,
+                                           uint64_t &Encoding) {
+  if (Imm == 0ULL || Imm == ~0ULL ||
+      (RegSize != 64 && (Imm >> RegSize != 0 || Imm == ~0U)))
     return false;
 
-  unsigned size = 2;
-  uint64_t eltVal = imm;
-
   // First, determine the element size.
-  while (size < regSize) {
-    unsigned numElts = regSize / size;
-    unsigned mask = (1ULL << size) - 1;
-    uint64_t lowestEltVal = imm & mask;
+  unsigned Size = RegSize;
 
-    bool allMatched = true;
-    for (unsigned i = 1; i < numElts; ++i) {
-     uint64_t currEltVal = (imm >> (i*size)) & mask;
-      if (currEltVal != lowestEltVal) {
-        allMatched = false;
-        break;
-      }
-    }
+  do {
+    Size /= 2;
+    uint64_t Mask = (1ULL << Size) - 1;
 
-    if (allMatched) {
-      eltVal = lowestEltVal;
+    if ((Imm & Mask) != ((Imm >> Size) & Mask)) {
+      Size *= 2;
       break;
     }
-
-    size *= 2;
-  }
+  } while (Size > 2);
 
   // Second, determine the rotation to make the element be: 0^m 1^n.
-  for (unsigned i = 0; i < size; ++i) {
-    eltVal = ror(eltVal, size);
-    uint32_t clz = countLeadingZeros(eltVal) - (64 - size);
-    uint32_t cto = CountTrailingOnes_64(eltVal);
+  uint32_t CTO, I;
+  uint64_t Mask = ((uint64_t)-1LL) >> (64 - Size);
+  Imm &= Mask;
 
-    if (clz + cto == size) {
-      // Encode in immr the number of RORs it would take to get *from* this
-      // element value to our target value, where i+1 is the number of RORs
-      // to go the opposite direction.
-      unsigned immr = size - (i + 1);
+  if (isShiftedMask_64(Imm)) {
+    I = countTrailingZeros(Imm);
+    CTO = CountTrailingOnes_64(Imm >> I);
+  } else {
+    Imm |= ~Mask;
+    if (!isShiftedMask_64(~Imm))
+      return false;
 
-      // If size has a 1 in the n'th bit, create a value that has zeroes in
-      // bits [0, n] and ones above that.
-      uint64_t nimms = ~(size-1) << 1;
-
-      // Or the CTO value into the low bits, which must be below the Nth bit
-      // bit mentioned above.
-      nimms |= (cto-1);
-
-      // Extract the seventh bit and toggle it to create the N field.
-      unsigned N = ((nimms >> 6) & 1) ^ 1;
-
-      encoding = (N << 12) | (immr << 6) | (nimms & 0x3f);
-      return true;
-    }
+    unsigned CLO = CountLeadingOnes_64(Imm);
+    I = 64 - CLO;
+    CTO = CLO + CountTrailingOnes_64(Imm) - (64 - Size);
   }
 
-  return false;
+  // Encode in Immr the number of RORs it would take to get *from* 0^m 1^n
+  // to our target value, where I is the number of RORs to go the opposite
+  // direction.
+  assert(Size > I && "I should be smaller than element Size");
+  unsigned Immr = (Size - I) & (Size - 1);
+
+  // If Size has a 1 in the n'th bit, create a value that has zeroes in
+  // bits [0, n] and ones above that.
+  uint64_t NImms = ~(Size-1) << 1;
+
+  // Or the CTO value into the low bits, which must be below the Nth bit
+  // mentioned above.
+  NImms |= (CTO-1);
+
+  // Extract the seventh bit and toggle it to create the N field.
+  unsigned N = ((NImms >> 6) & 1) ^ 1;
+
+  Encoding = (N << 12) | (Immr << 6) | (NImms & 0x3f);
+  return true;
 }
 
 /// isLogicalImmediate - Return true if the immediate is valid for a logical

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index a917616..0bc2f77 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp

@@ -14,9 +14,10 @@
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MachO.h"
 using namespace llvm;
@@ -534,8 +535,8 @@
   // store fixups in .eh_frame section in big endian order
   if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) {
     const MCSection *Sec = Fixup.getValue()->FindAssociatedSection();
-    const MCSectionELF *SecELF = static_cast<const MCSectionELF *>(Sec);
-    if (SecELF->getSectionName() == ".eh_frame")
+    const MCSectionELF *SecELF = dyn_cast_or_null<const MCSectionELF>(Sec);
+    if (SecELF && SecELF->getSectionName() == ".eh_frame")
       Value = ByteSwap_32(unsigned(Value));
   }
   AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel);
@@ -551,7 +552,8 @@
     return new DarwinAArch64AsmBackend(T, MRI);
 
   assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
-  return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true);
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+  return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true);
 }
 
 MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
@@ -561,6 +563,7 @@
 
   assert(TheTriple.isOSBinFormatELF() &&
          "Big endian is only supported for ELF targets!");
-  return new ELFAArch64AsmBackend(T, TheTriple.getOS(),
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+  return new ELFAArch64AsmBackend(T, OSABI,
                                   /*IsLittleEndian=*/false);
 }

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index a79406d..60e9c19 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp

@@ -15,8 +15,10 @@
 
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
@@ -34,12 +36,42 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
 namespace {
 
+class AArch64ELFStreamer;
+
+class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
+  formatted_raw_ostream &OS;
+
+  void emitInst(uint32_t Inst) override;
+
+public:
+  AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+};
+
+AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S,
+                                                   formatted_raw_ostream &OS)
+  : AArch64TargetStreamer(S), OS(OS) {}
+
+void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) {
+  OS << "\t.inst\t0x" << utohexstr(Inst) << "\n";
+}
+
+class AArch64TargetELFStreamer : public AArch64TargetStreamer {
+private:
+  AArch64ELFStreamer &getStreamer();
+
+  void emitInst(uint32_t Inst) override;
+
+public:
+  AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
+};
+
 /// Extend the generic ELFStreamer class so that it can emit mapping symbols at
 /// the appropriate points in the object files. These symbols are defined in the
 /// AArch64 ELF ABI:
@@ -55,6 +87,8 @@
 /// by MachO. Beware!
 class AArch64ELFStreamer : public MCELFStreamer {
 public:
+  friend class AArch64TargetELFStreamer;
+
   AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
                    MCCodeEmitter *Emitter)
       : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
@@ -82,6 +116,18 @@
     MCELFStreamer::EmitInstruction(Inst, STI);
   }
 
+  void emitInst(uint32_t Inst) {
+    char Buffer[4];
+    const bool LittleEndian = getContext().getAsmInfo()->isLittleEndian();
+
+    EmitA64MappingSymbol();
+    for (unsigned II = 0; II != 4; ++II) {
+      const unsigned I = LittleEndian ? (4 - II - 1) : II;
+      Buffer[4 - II - 1] = uint8_t(Inst >> I * CHAR_BIT);
+    }
+    MCELFStreamer::EmitBytes(StringRef(Buffer, 4));
+  }
+
   /// This is one of the functions used to emit data into an ELF section, so the
   /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
   /// if necessary.
@@ -144,17 +190,35 @@
 
   /// @}
 };
+} // end anonymous namespace
+
+AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
+  return static_cast<AArch64ELFStreamer &>(Streamer);
+}
+
+void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
+  getStreamer().emitInst(Inst);
 }
 
 namespace llvm {
+MCStreamer *
+createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                           bool isVerboseAsm, bool useDwarfDirectory,
+                           MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                           MCAsmBackend *TAB, bool ShowInst) {
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
+  new AArch64TargetAsmStreamer(*S, OS);
+  return S;
+}
+
 MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
                                         raw_ostream &OS, MCCodeEmitter *Emitter,
-                                        bool RelaxAll, bool NoExecStack) {
+                                        bool RelaxAll) {
   AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
+  new AArch64TargetELFStreamer(*S);
   if (RelaxAll)
     S->getAssembler().setRelaxAll(true);
-  if (NoExecStack)
-    S->getAssembler().setNoExecStack(true);
   return S;
 }
 }

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
index bc6973b..71b05cc 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64_ELF_STREAMER_H
-#define LLVM_AARCH64_ELF_STREAMER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
 
 #include "llvm/MC/MCELFStreamer.h"
 
@@ -20,7 +20,7 @@
 
 MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
                                         raw_ostream &OS, MCCodeEmitter *Emitter,
-                                        bool RelaxAll, bool NoExecStack);
+                                        bool RelaxAll);
 }
 
-#endif // AArch64_ELF_STREAMER_H
+#endif

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index bf405fb..0f5b765 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AArch64FIXUPKINDS_H
-#define LLVM_AArch64FIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
 
 #include "llvm/MC/MCFixup.h"
 

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 1763b40..70b9329 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp

@@ -13,8 +13,8 @@
 
 #include "AArch64MCAsmInfo.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/CommandLine.h"
 using namespace llvm;
@@ -66,7 +66,7 @@
 
 AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
   Triple T(TT);
-  if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be)
+  if (T.getArch() == Triple::aarch64_be)
     IsLittleEndian = false;
 
   // We prefer NEON instructions to be printed in the short form.
@@ -89,7 +89,6 @@
 
   WeakRefDirective = "\t.weak\t";
 
-  HasLEB128 = true;
   SupportsDebugInformation = true;
 
   // Exceptions handling

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 42a031d..5d03c21 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64TARGETASMINFO_H
-#define AArch64TARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoDarwin.h"
 

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index f051357..c306b11 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp

@@ -15,13 +15,13 @@
 #include "MCTargetDesc/AArch64FixupKinds.h"
 #include "MCTargetDesc/AArch64MCExpr.h"
 #include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 42a6787..e396df8 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp

@@ -90,8 +90,9 @@
 }
 
 bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
-                                            const MCAsmLayout *Layout) const {
-  if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
+                                            const MCAsmLayout *Layout,
+					    const MCFixup *Fixup) const {
+  if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout, Fixup))
     return false;
 
   Res =

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 5422f9d..db48ac9 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AArch64MCEXPR_H
-#define LLVM_AArch64MCEXPR_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
 
 #include "llvm/MC/MCExpr.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -152,7 +152,8 @@
   const MCSection *FindAssociatedSection() const override;
 
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const override;
+                                 const MCAsmLayout *Layout,
+				 const MCFixup *Fixup) const override;
 
   void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
 

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index ae698c5..0f7a6b8 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp

@@ -126,15 +126,14 @@
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCContext &Ctx, MCAsmBackend &TAB,
                                     raw_ostream &OS, MCCodeEmitter *Emitter,
-                                    const MCSubtargetInfo &STI, bool RelaxAll,
-                                    bool NoExecStack) {
+                                    const MCSubtargetInfo &STI, bool RelaxAll) {
   Triple TheTriple(TT);
 
   if (TheTriple.isOSDarwin())
     return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
                                /*LabelSections*/ true);
 
-  return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
+  return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
 }
 
 // Force static initialization.
@@ -142,17 +141,14 @@
   // Register the MC asm info.
   RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo);
   RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo);
-  RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo);
-  RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo);
+  RegisterMCAsmInfoFn Z(TheARM64Target, createAArch64MCAsmInfo);
 
   // Register the MC codegen info.
   TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget,
                                         createAArch64MCCodeGenInfo);
   TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget,
                                         createAArch64MCCodeGenInfo);
-  TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget,
-                                        createAArch64MCCodeGenInfo);
-  TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget,
+  TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target,
                                         createAArch64MCCodeGenInfo);
 
   // Register the MC instruction info.
@@ -160,9 +156,7 @@
                                       createAArch64MCInstrInfo);
   TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget,
                                       createAArch64MCInstrInfo);
-  TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget,
-                                      createAArch64MCInstrInfo);
-  TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget,
+  TargetRegistry::RegisterMCInstrInfo(TheARM64Target,
                                       createAArch64MCInstrInfo);
 
   // Register the MC register info.
@@ -170,9 +164,7 @@
                                     createAArch64MCRegisterInfo);
   TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget,
                                     createAArch64MCRegisterInfo);
-  TargetRegistry::RegisterMCRegInfo(TheARM64leTarget,
-                                    createAArch64MCRegisterInfo);
-  TargetRegistry::RegisterMCRegInfo(TheARM64beTarget,
+  TargetRegistry::RegisterMCRegInfo(TheARM64Target,
                                     createAArch64MCRegisterInfo);
 
   // Register the MC subtarget info.
@@ -180,9 +172,7 @@
                                           createAArch64MCSubtargetInfo);
   TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget,
                                           createAArch64MCSubtargetInfo);
-  TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget,
-                                          createAArch64MCSubtargetInfo);
-  TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget,
+  TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target,
                                           createAArch64MCSubtargetInfo);
 
   // Register the asm backend.
@@ -190,19 +180,15 @@
                                        createAArch64leAsmBackend);
   TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget,
                                        createAArch64beAsmBackend);
-  TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget,
+  TargetRegistry::RegisterMCAsmBackend(TheARM64Target,
                                        createAArch64leAsmBackend);
-  TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget,
-                                       createAArch64beAsmBackend);
 
   // Register the MC Code Emitter
   TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget,
                                         createAArch64MCCodeEmitter);
   TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget,
                                         createAArch64MCCodeEmitter);
-  TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget,
-                                        createAArch64MCCodeEmitter);
-  TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget,
+  TargetRegistry::RegisterMCCodeEmitter(TheARM64Target,
                                         createAArch64MCCodeEmitter);
 
   // Register the object streamer.
@@ -210,16 +196,21 @@
                                            createMCStreamer);
   TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget,
                                            createMCStreamer);
-  TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer);
-  TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer);
+
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(TheAArch64leTarget,
+                                      createAArch64MCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheAArch64beTarget,
+                                      createAArch64MCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheARM64Target,
+                                      createAArch64MCAsmStreamer);
 
   // Register the MCInstPrinter.
   TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget,
                                         createAArch64MCInstPrinter);
   TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget,
                                         createAArch64MCInstPrinter);
-  TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget,
-                                        createAArch64MCInstPrinter);
-  TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget,
+  TargetRegistry::RegisterMCInstPrinter(TheARM64Target,
                                         createAArch64MCInstPrinter);
 }

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index d886ea2..1553115 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h

@@ -11,19 +11,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64MCTARGETDESC_H
-#define AArch64MCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
 
 #include "llvm/Support/DataTypes.h"
 #include <string>
 
 namespace llvm {
+class formatted_raw_ostream;
 class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
 class MCInstrInfo;
+class MCInstPrinter;
 class MCRegisterInfo;
 class MCObjectWriter;
+class MCStreamer;
 class MCSubtargetInfo;
 class StringRef;
 class Target;
@@ -31,8 +34,7 @@
 
 extern Target TheAArch64leTarget;
 extern Target TheAArch64beTarget;
-extern Target TheARM64leTarget;
-extern Target TheARM64beTarget;
+extern Target TheARM64Target;
 
 MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
                                         const MCRegisterInfo &MRI,
@@ -51,6 +53,11 @@
 MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
                                             uint32_t CPUSubtype);
 
+MCStreamer *
+createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                           bool isVerboseAsm, bool useDwarfDirectory,
+                           MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                           MCAsmBackend *TAB, bool ShowInst);
 } // End llvm namespace
 
 // Defines symbolic names for AArch64 registers.  This defines a mapping from

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index ba95366..e12a24b 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp

@@ -9,15 +9,15 @@
 
 #include "MCTargetDesc/AArch64FixupKinds.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/MC/MCAssembler.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MachO.h"
 using namespace llvm;
@@ -288,7 +288,8 @@
       // FIXME: Will the Target we already have ever have any data in it
       // we need to preserve and merge with the new Target? How about
       // the FixedValue?
-      if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout))
+      if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout,
+                                                             &Fixup))
         Asm.getContext().FatalError(Fixup.getLoc(),
                                     "unable to resolve variable '" +
                                         Symbol->getName() + "'");

diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index f9aeb35..e3112fa 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp

@@ -28,8 +28,9 @@
 
 // The constant pool handling is shared by all AArch64TargetStreamer
 // implementations.
-const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr) {
-  return ConstantPools->addEntry(Streamer, Expr);
+const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr,
+                                                          unsigned Size) {
+  return ConstantPools->addEntry(Streamer, Expr, Size);
 }
 
 void AArch64TargetStreamer::emitCurrentConstantPool() {
@@ -38,3 +39,5 @@
 
 // finish() - write out any non-empty assembler constant pools.
 void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+
+void AArch64TargetStreamer::emitInst(uint32_t Inst) {}

diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index 3a382c1..f42ecb1 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp

@@ -14,18 +14,19 @@
 namespace llvm {
 Target TheAArch64leTarget;
 Target TheAArch64beTarget;
-Target TheARM64leTarget;
-Target TheARM64beTarget;
+Target TheARM64Target;
 } // end namespace llvm
 
 extern "C" void LLVMInitializeAArch64TargetInfo() {
-  RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64leTarget, "arm64",
-                                                   "AArch64 (little endian)");
-  RegisterTarget<Triple::arm64_be, /*HasJIT=*/true> Y(TheARM64beTarget, "arm64_be",
-                                                      "AArch64 (big endian)");
+  // Now register the "arm64" name for use with "-march". We don't want it to
+  // take possession of the Triple::aarch64 tag though.
+  TargetRegistry::RegisterTarget(TheARM64Target, "arm64",
+                                 "ARM64 (little endian)",
+                                 [](Triple::ArchType) { return false; }, true);
 
   RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z(
       TheAArch64leTarget, "aarch64", "AArch64 (little endian)");
   RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W(
       TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)");
+
 }

diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 3c24bb3..bc6c7a9 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp

@@ -791,22 +791,22 @@
     }
   }
 
-  // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name, where the bits
-  // are: 11 xxx 1x11 xxxx xxx
-  Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$");
+  // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name
+  Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$");
 
-  SmallVector<StringRef, 4> Ops;
+  SmallVector<StringRef, 5> Ops;
   if (!GenericRegPattern.match(NameLower, &Ops)) {
     Valid = false;
     return -1;
   }
 
-  uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
+  uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
   uint32_t Bits;
-  Ops[1].getAsInteger(10, Op1);
-  Ops[2].getAsInteger(10, CRn);
-  Ops[3].getAsInteger(10, CRm);
-  Ops[4].getAsInteger(10, Op2);
+  Ops[1].getAsInteger(10, Op0);
+  Ops[2].getAsInteger(10, Op1);
+  Ops[3].getAsInteger(10, CRn);
+  Ops[4].getAsInteger(10, CRm);
+  Ops[5].getAsInteger(10, Op2);
   Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
 
   Valid = true;
@@ -814,11 +814,10 @@
 }
 
 std::string
-AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+AArch64SysReg::SysRegMapper::toString(uint32_t Bits) const {
   // First search the registers shared by all
   for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
     if (SysRegPairs[i].Value == Bits) {
-      Valid = true;
       return SysRegPairs[i].Name;
     }
   }
@@ -827,7 +826,6 @@
   if (FeatureBits & AArch64::ProcCyclone) {
     for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) {
       if (CycloneSysRegPairs[i].Value == Bits) {
-        Valid = true;
         return CycloneSysRegPairs[i].Name;
       }
     }
@@ -837,28 +835,18 @@
   // write-only).
   for (unsigned i = 0; i < NumInstPairs; ++i) {
     if (InstPairs[i].Value == Bits) {
-      Valid = true;
       return InstPairs[i].Name;
     }
   }
 
+  assert(Bits < 0x10000);
   uint32_t Op0 = (Bits >> 14) & 0x3;
   uint32_t Op1 = (Bits >> 11) & 0x7;
   uint32_t CRn = (Bits >> 7) & 0xf;
   uint32_t CRm = (Bits >> 3) & 0xf;
   uint32_t Op2 = Bits & 0x7;
 
-  // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic
-  // name.
-  if (Op0 != 3 || (CRn != 11 && CRn != 15)) {
-      Valid = false;
-      return "";
-  }
-
-  assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg");
-
-  Valid = true;
-  return "s3_" + utostr(Op1) + "_c" + utostr(CRn)
+  return "s" + utostr(Op0)+ "_" + utostr(Op1) + "_c" + utostr(CRn)
                + "_c" + utostr(CRm) + "_" + utostr(Op2);
 }
 

diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 9d2ce21..c60b09a 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AArch64BASEINFO_H
-#define AArch64BASEINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
+#define LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
 
 // FIXME: Is it easiest to fix this layering violation by moving the .inc
 // #includes from AArch64MCTargetDesc.h to here?
@@ -1143,7 +1143,7 @@
 
     SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { }
     uint32_t fromString(StringRef Name, bool &Valid) const;
-    std::string toString(uint32_t Bits, bool &Valid) const;
+    std::string toString(uint32_t Bits) const;
   };
 
   struct MSRMapper : SysRegMapper {
@@ -1271,7 +1271,12 @@
     /// thread-local symbol. On Darwin, only one type of thread-local access
     /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
     /// referee will affect interpretation.
-    MO_TLS = 0x20
+    MO_TLS = 0x20,
+
+    /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
+    /// the address of a constant pool entry for the symbol, rather than the
+    /// address of the symbol itself.
+    MO_CONSTPOOL = 0x40
   };
 } // end namespace AArch64II
 

diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index 92eaf9e..387f1f6 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp

@@ -34,6 +34,8 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <map>
 #include <set>
 
 using namespace llvm;
@@ -676,8 +678,8 @@
 }
 
 bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
-  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
-  TRI = Fn.getTarget().getRegisterInfo();
+  TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo());
+  TRI = Fn.getSubtarget().getRegisterInfo();
   MRI = &Fn.getRegInfo();
   bool Modified = false;
 

diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 55df29c..02db53a 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TARGET_ARM_H
-#define TARGET_ARM_H
+#ifndef LLVM_LIB_TARGET_ARM_ARM_H
+#define LLVM_LIB_TARGET_ARM_ARM_H
 
 #include "llvm/Support/CodeGen.h"
 
@@ -23,7 +23,6 @@
 class ARMBaseTargetMachine;
 class FunctionPass;
 class ImmutablePass;
-class JITCodeEmitter;
 class MachineInstr;
 class MCInst;
 class TargetLowering;
@@ -31,10 +30,6 @@
 
 FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
                                CodeGenOpt::Level OptLevel);
-
-FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
-                                          JITCodeEmitter &JCE);
-
 FunctionPass *createA15SDOptimizerPass();
 FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
 FunctionPass *createARMExpandPseudoPass();

diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 25385a6..80b976b 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td

@@ -228,6 +228,15 @@
                                     FeatureAvoidPartialCPSR,
                                     FeatureTrustZone, FeatureVirtualization]>;
 
+def ProcA17     : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17",
+                                   "Cortex-A17 ARM processors",
+                                   [FeatureVMLxForwarding,
+                                    FeatureT2XtPk, FeatureVFP4,
+                                    FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureAvoidPartialCPSR,
+                                    FeatureVirtualization,
+                                    FeatureTrustZone]>;
+
 def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
                                    "Cortex-A53 ARM processors",
                                    [FeatureHWDiv, FeatureHWDivARM,
@@ -351,12 +360,8 @@
                                      FeatureAClass]>;
 def : ProcessorModel<"cortex-a9",   CortexA9Model,
                                     [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
-                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureDSPThumb2, FeatureHasRAS, FeatureMP,
                                      FeatureAClass]>;
-def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
-                                    [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
-                                     FeatureDSPThumb2, FeatureMP,
-                                     FeatureHasRAS, FeatureAClass]>;
 
 // FIXME: A12 has currently the same Schedule model as A9
 def : ProcessorModel<"cortex-a12", CortexA9Model,
@@ -370,6 +375,12 @@
                                      FeatureDSPThumb2, FeatureHasRAS,
                                      FeatureAClass]>;
 
+// FIXME: A17 has currently the same Schedule model as A9
+def : ProcessorModel<"cortex-a17",  CortexA9Model,
+                                    [ProcA17, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureMP,
+                                     FeatureHasRAS, FeatureAClass]>;
+
 // FIXME: krait has currently the same Schedule model as A9
 def : ProcessorModel<"krait",       CortexA9Model,
                                     [ProcKrait, HasV7Ops,
@@ -396,6 +407,12 @@
                                      FeatureT2XtPk, FeatureVFP4,
                                      FeatureVFPOnlySP, FeatureD16,
                                      FeatureMClass]>;
+def : ProcNoItin<"cortex-m7",       [HasV7Ops,
+                                     FeatureThumb2, FeatureNoARM, FeatureDB,
+                                     FeatureHWDiv, FeatureDSPThumb2,
+                                     FeatureT2XtPk, FeatureFPARMv8,
+                                     FeatureD16, FeatureMClass]>;
+
 
 // Swift uArch Processors.
 def : ProcessorModel<"swift",       SwiftModel,

diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 28d2610..695fd4d 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp

@@ -76,7 +76,8 @@
 }
 
 void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
-  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
+  uint64_t Size =
+      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(CV->getType());
   assert(Size && "C++ constructor pointer had zero size!");
 
   const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
@@ -136,7 +137,7 @@
     assert(!MO.getSubReg() && "Subregs should be eliminated!");
     if(ARM::GPRPairRegClass.contains(Reg)) {
       const MachineFunction &MF = *MI->getParent()->getParent();
-      const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
       Reg = TRI->getSubReg(Reg, ARM::gsub_0);
     }
     O << ARMInstPrinter::getRegisterName(Reg);
@@ -182,7 +183,7 @@
 
 MCSymbol *ARMAsmPrinter::
 GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   SmallString<60> Name;
   raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
     << getFunctionNumber() << '_' << uid << '_' << uid2;
@@ -191,7 +192,7 @@
 
 
 MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel() const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   SmallString<60> Name;
   raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "SJLJEH"
     << getFunctionNumber();
@@ -229,7 +230,7 @@
     case 'y': // Print a VFP single precision register as indexed double.
       if (MI->getOperand(OpNum).isReg()) {
         unsigned Reg = MI->getOperand(OpNum).getReg();
-        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
         // Find the 'd' register that has this 's' register as a sub-register,
         // and determine the lane number.
         for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) {
@@ -261,7 +262,7 @@
       // inline asm statement.
       O << "{";
       if (ARM::GPRPairRegClass.contains(RegBegin)) {
-        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
         unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
         O << ARMInstPrinter::getRegisterName(Reg0) << ", ";
         RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1);
@@ -317,7 +318,7 @@
         const MachineOperand &MO = MI->getOperand(OpNum);
         if (!MO.isReg())
           return true;
-        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
         unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ?
             ARM::gsub_0 : ARM::gsub_1);
         O << ARMInstPrinter::getRegisterName(Reg);
@@ -343,7 +344,7 @@
       unsigned Reg = MI->getOperand(OpNum).getReg();
       if (!ARM::QPRRegClass.contains(Reg))
         return true;
-      const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+      const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
       unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ?
                                        ARM::dsub_0 : ARM::dsub_1);
       O << ARMInstPrinter::getRegisterName(SubReg);
@@ -358,7 +359,7 @@
       if (!MO.isReg())
         return true;
       const MachineFunction &MF = *MI->getParent()->getParent();
-      const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
       unsigned Reg = MO.getReg();
       if(!ARM::GPRPairRegClass.contains(Reg))
         return false;
@@ -478,6 +479,9 @@
   // Emit ARM Build Attributes
   if (Subtarget->isTargetELF())
     emitAttributes();
+
+  if (!M.getModuleInlineAsm().empty() && Subtarget->isThumb())
+    OutStreamer.EmitAssemblerFlag(MCAF_Code16);
 }
 
 static void
@@ -558,7 +562,7 @@
     MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
     if (!Stubs.empty()) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getDataLayout();
+      const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
 
       for (auto &stub: Stubs) {
         OutStreamer.EmitLabel(stub.first);
@@ -663,7 +667,9 @@
                         ARMBuildAttrs::AllowNeonARMv8);
   } else {
     if (Subtarget->hasFPARMv8())
-      ATS.emitFPU(ARM::FP_ARMV8);
+      // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
+      // FPU, but there are two different names for it depending on the CPU.
+      ATS.emitFPU(Subtarget->hasD16() ? ARM::FPV5_D16 : ARM::FP_ARMV8);
     else if (Subtarget->hasVFP4())
       ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV4_D16 : ARM::VFPV4);
     else if (Subtarget->hasVFP3())
@@ -700,6 +706,13 @@
     ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
                       ARMBuildAttrs::AllowIEE754);
 
+  if (Subtarget->allowsUnalignedMem())
+    ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
+                      ARMBuildAttrs::Allowed);
+  else
+    ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
+                      ARMBuildAttrs::Not_Allowed);
+
   // FIXME: add more flags to ARMBuildAttributes.h
   // 8-bytes alignment stuff.
   ATS.emitAttribute(ARMBuildAttrs::ABI_align_needed, 1);
@@ -757,6 +770,17 @@
     }
   }
 
+  // TODO: We currently only support either reserving the register, or treating
+  // it as another callee-saved register, but not as SB or a TLS pointer. It
+  // would instead be nicer to push this from the frontend as metadata, as we do
+  // for the wchar and enum size tags.
+  if (Subtarget->isR9Reserved())
+      ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
+                        ARMBuildAttrs::R9Reserved);
+  else
+      ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
+                        ARMBuildAttrs::R9IsGPR);
+
   if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization())
       ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
                         ARMBuildAttrs::AllowTZVirtualization);
@@ -834,8 +858,9 @@
 
 void ARMAsmPrinter::
 EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
-  const DataLayout *DL = TM.getDataLayout();
-  int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType());
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  int Size =
+      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(MCPV->getType());
 
   ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
 
@@ -1013,7 +1038,7 @@
   MCTargetStreamer &TS = *OutStreamer.getTargetStreamer();
   ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
   const MachineFunction &MF = *MI->getParent()->getParent();
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
   const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
 
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
@@ -1151,7 +1176,7 @@
 #include "ARMGenMCPseudoLowering.inc"
 
 void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
 
   // If we just ended a constant pool, mark it as such.
   if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
@@ -1567,6 +1592,9 @@
     EmitJumpTable(MI);
     return;
   }
+  case ARM::SPACE:
+    OutStreamer.EmitZeros(MI->getOperand(1).getImm());
+    return;
   case ARM::TRAP: {
     // Non-Darwin binutils don't yet support the "trap" mnemonic.
     // FIXME: Remove this special case when they do.

diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 7c103c6..5ff20ce 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMASMPRINTER_H
-#define ARMASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H
+#define LLVM_LIB_TARGET_ARM_ARMASMPRINTER_H
 
 #include "ARMSubtarget.h"
 #include "llvm/CodeGen/AsmPrinter.h"

diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 0288db9..7a315c4 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp

@@ -108,7 +108,7 @@
                                                const ScheduleDAG *DAG) const {
   if (usePreRAHazardRecognizer()) {
     const InstrItineraryData *II =
-        &static_cast<const ARMSubtarget *>(STI)->getInstrItineraryData();
+        static_cast<const ARMSubtarget *>(STI)->getInstrItineraryData();
     return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched");
   }
   return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG);
@@ -518,6 +518,42 @@
   return Found;
 }
 
+static bool isCPSRDefined(const MachineInstr *MI) {
+  for (const auto &MO : MI->operands())
+    if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef())
+      return true;
+  return false;
+}
+
+static bool isEligibleForITBlock(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return true;
+  case ARM::tADC:   // ADC (register) T1
+  case ARM::tADDi3: // ADD (immediate) T1
+  case ARM::tADDi8: // ADD (immediate) T2
+  case ARM::tADDrr: // ADD (register) T1
+  case ARM::tAND:   // AND (register) T1
+  case ARM::tASRri: // ASR (immediate) T1
+  case ARM::tASRrr: // ASR (register) T1
+  case ARM::tBIC:   // BIC (register) T1
+  case ARM::tEOR:   // EOR (register) T1
+  case ARM::tLSLri: // LSL (immediate) T1
+  case ARM::tLSLrr: // LSL (register) T1
+  case ARM::tLSRri: // LSR (immediate) T1
+  case ARM::tLSRrr: // LSR (register) T1
+  case ARM::tMUL:   // MUL T1
+  case ARM::tMVN:   // MVN (register) T1
+  case ARM::tORR:   // ORR (register) T1
+  case ARM::tROR:   // ROR (register) T1
+  case ARM::tRSB:   // RSB (immediate) T1
+  case ARM::tSBC:   // SBC (register) T1
+  case ARM::tSUBi3: // SUB (immediate) T1
+  case ARM::tSUBi8: // SUB (immediate) T2
+  case ARM::tSUBrr: // SUB (register) T1
+    return !isCPSRDefined(MI);
+  }
+}
+
 /// isPredicable - Return true if the specified instruction can be predicated.
 /// By default, this returns true for every instruction with a
 /// PredicateOperand.
@@ -525,6 +561,9 @@
   if (!MI->isPredicable())
     return false;
 
+  if (!isEligibleForITBlock(MI))
+    return false;
+
   ARMFunctionInfo *AFI =
     MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
 
@@ -555,16 +594,6 @@
 }
 }
 
-/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing.
-LLVM_ATTRIBUTE_NOINLINE
-static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
-                                unsigned JTI);
-static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
-                                unsigned JTI) {
-  assert(JTI < JT.size());
-  return JT[JTI].MBBs.size();
-}
-
 /// GetInstSize - Return the size of the specified MachineInstr.
 ///
 unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
@@ -637,7 +666,7 @@
     // bytes, we can use 16-bit entries instead. Then there won't be an
     // alignment issue.
     unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4;
-    unsigned NumEntries = getNumJTEntries(JT, JTI);
+    unsigned NumEntries = JT[JTI].MBBs.size();
     if (Opc == ARM::t2TBB_JT && (NumEntries & 1))
       // Make sure the instruction that follows TBB is 2-byte aligned.
       // FIXME: Constant island pass should insert an "ALIGN" instruction
@@ -645,6 +674,8 @@
       ++NumEntries;
     return NumEntries * EntrySize + InstSize;
   }
+  case ARM::SPACE:
+    return MI->getOperand(1).getImm();
   }
 }
 
@@ -659,6 +690,49 @@
   return Size;
 }
 
+void ARMBaseInstrInfo::copyFromCPSR(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned DestReg, bool KillSrc,
+                                    const ARMSubtarget &Subtarget) const {
+  unsigned Opc = Subtarget.isThumb()
+                     ? (Subtarget.isMClass() ? ARM::t2MRS_M : ARM::t2MRS_AR)
+                     : ARM::MRS;
+
+  MachineInstrBuilder MIB =
+      BuildMI(MBB, I, I->getDebugLoc(), get(Opc), DestReg);
+
+  // There is only 1 A/R class MRS instruction, and it always refers to
+  // APSR. However, there are lots of other possibilities on M-class cores.
+  if (Subtarget.isMClass())
+    MIB.addImm(0x800);
+
+  AddDefaultPred(MIB);
+
+  MIB.addReg(ARM::CPSR, RegState::Implicit | getKillRegState(KillSrc));
+}
+
+void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned SrcReg, bool KillSrc,
+                                  const ARMSubtarget &Subtarget) const {
+  unsigned Opc = Subtarget.isThumb()
+                     ? (Subtarget.isMClass() ? ARM::t2MSR_M : ARM::t2MSR_AR)
+                     : ARM::MSR;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, I->getDebugLoc(), get(Opc));
+
+  if (Subtarget.isMClass())
+    MIB.addImm(0x800);
+  else
+    MIB.addImm(8);
+
+  MIB.addReg(SrcReg, getKillRegState(KillSrc));
+
+  AddDefaultPred(MIB);
+
+  MIB.addReg(ARM::CPSR, RegState::Implicit | RegState::Define);
+}
+
 void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I, DebugLoc DL,
                                    unsigned DestReg, unsigned SrcReg,
@@ -682,7 +756,7 @@
     Opc = ARM::VMOVRS;
   else if (SPRDest && GPRSrc)
     Opc = ARM::VMOVSR;
-  else if (ARM::DPRRegClass.contains(DestReg, SrcReg))
+  else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && !Subtarget.isFPOnlySP())
     Opc = ARM::VMOVD;
   else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
     Opc = ARM::VORRq;
@@ -742,6 +816,16 @@
     BeginIdx = ARM::dsub_0;
     SubRegs = 4;
     Spacing = 2;
+  } else if (ARM::DPRRegClass.contains(DestReg, SrcReg) && Subtarget.isFPOnlySP()) {
+    Opc = ARM::VMOVS;
+    BeginIdx = ARM::ssub_0;
+    SubRegs = 2;
+  } else if (SrcReg == ARM::CPSR) {
+    copyFromCPSR(MBB, I, DestReg, KillSrc, Subtarget);
+    return;
+  } else if (DestReg == ARM::CPSR) {
+    copyToCPSR(MBB, I, SrcReg, KillSrc, Subtarget);
+    return;
   }
 
   assert(Opc && "Impossible reg-to-reg copy");
@@ -1174,12 +1258,26 @@
   return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
 }
 
-bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{
+bool
+ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  Reloc::Model RM = MF.getTarget().getRelocationModel();
+
+  if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
+    assert(getSubtarget().getTargetTriple().getObjectFormat() ==
+           Triple::MachO &&
+           "LOAD_STACK_GUARD currently supported only for MachO.");
+    expandLoadStackGuard(MI, RM);
+    MI->getParent()->erase(MI);
+    return true;
+  }
+
   // This hook gets to expand COPY instructions before they become
   // copyPhysReg() calls.  Look for VMOVS instructions that can legally be
   // widened to VMOVD.  We prefer the VMOVD when possible because it may be
   // changed into a VORR that can go down the NEON pipeline.
-  if (!WidenVMOVS || !MI->isCopy() || Subtarget.isCortexA15())
+  if (!WidenVMOVS || !MI->isCopy() || Subtarget.isCortexA15() ||
+      Subtarget.isFPOnlySP())
     return false;
 
   // Look for a copy between even S-registers.  That is where we keep floats
@@ -2832,7 +2930,7 @@
 // FIXME: The current MachineInstr design does not support relying on machine
 // mem operands to determine the width of a memory access. Instead, we expect
 // the target to provide this information based on the instruction opcode and
-// operands. However, using MachineMemOperand is a the best solution now for
+// operands. However, using MachineMemOperand is the best solution now for
 // two reasons:
 //
 // 1) getNumMicroOps tries to infer LDM memory width from the total number of MI
@@ -3933,6 +4031,38 @@
   return true;
 }
 
+// LoadStackGuard has so far only been implemented for MachO. A different code
+// sequence is needed for other targets.
+void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
+                                                unsigned LoadImmOpc,
+                                                unsigned LoadOpc,
+                                                Reloc::Model RM) const {
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Reg = MI->getOperand(0).getReg();
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+  MachineInstrBuilder MIB;
+
+  BuildMI(MBB, MI, DL, get(LoadImmOpc), Reg)
+      .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
+
+  if (Subtarget.GVIsIndirectSymbol(GV, RM)) {
+    MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
+    MIB.addReg(Reg, RegState::Kill).addImm(0);
+    unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+    MachineMemOperand *MMO = MBB.getParent()->
+        getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 4, 4);
+    MIB.addMemOperand(MMO);
+    AddDefaultPred(MIB);
+  }
+
+  MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
+  MIB.addReg(Reg, RegState::Kill).addImm(0);
+  MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  AddDefaultPred(MIB);
+}
+
 bool
 ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
                                      unsigned &AddSubOpc,
@@ -4361,29 +4491,6 @@
   MI->addRegisterKilled(DReg, TRI, true);
 }
 
-void ARMBaseInstrInfo::getUnconditionalBranch(
-    MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
-  if (Subtarget.isThumb())
-    Branch.setOpcode(ARM::tB);
-  else if (Subtarget.isThumb2())
-    Branch.setOpcode(ARM::t2B);
-  else
-    Branch.setOpcode(ARM::Bcc);
-
-  Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
-  Branch.addOperand(MCOperand::CreateImm(ARMCC::AL));
-  Branch.addOperand(MCOperand::CreateReg(0));
-}
-
-void ARMBaseInstrInfo::getTrap(MCInst &MI) const {
-  if (Subtarget.isThumb())
-    MI.setOpcode(ARM::tTRAP);
-  else if (Subtarget.useNaClTrap())
-    MI.setOpcode(ARM::TRAPNaCl);
-  else
-    MI.setOpcode(ARM::TRAP);
-}
-
 bool ARMBaseInstrInfo::hasNOP() const {
   return (Subtarget.getFeatureBits() & ARM::HasV6T2Ops) != 0;
 }
@@ -4401,3 +4508,72 @@
 
   return false;
 }
+
+bool ARMBaseInstrInfo::getRegSequenceLikeInputs(
+    const MachineInstr &MI, unsigned DefIdx,
+    SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
+  assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index");
+  assert(MI.isRegSequenceLike() && "Invalid kind of instruction");
+
+  switch (MI.getOpcode()) {
+  case ARM::VMOVDRR:
+    // dX = VMOVDRR rY, rZ
+    // is the same as:
+    // dX = REG_SEQUENCE rY, ssub_0, rZ, ssub_1
+    // Populate the InputRegs accordingly.
+    // rY
+    const MachineOperand *MOReg = &MI.getOperand(1);
+    InputRegs.push_back(
+        RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_0));
+    // rZ
+    MOReg = &MI.getOperand(2);
+    InputRegs.push_back(
+        RegSubRegPairAndIdx(MOReg->getReg(), MOReg->getSubReg(), ARM::ssub_1));
+    return true;
+  }
+  llvm_unreachable("Target dependent opcode missing");
+}
+
+bool ARMBaseInstrInfo::getExtractSubregLikeInputs(
+    const MachineInstr &MI, unsigned DefIdx,
+    RegSubRegPairAndIdx &InputReg) const {
+  assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index");
+  assert(MI.isExtractSubregLike() && "Invalid kind of instruction");
+
+  switch (MI.getOpcode()) {
+  case ARM::VMOVRRD:
+    // rX, rY = VMOVRRD dZ
+    // is the same as:
+    // rX = EXTRACT_SUBREG dZ, ssub_0
+    // rY = EXTRACT_SUBREG dZ, ssub_1
+    const MachineOperand &MOReg = MI.getOperand(2);
+    InputReg.Reg = MOReg.getReg();
+    InputReg.SubReg = MOReg.getSubReg();
+    InputReg.SubIdx = DefIdx == 0 ? ARM::ssub_0 : ARM::ssub_1;
+    return true;
+  }
+  llvm_unreachable("Target dependent opcode missing");
+}
+
+bool ARMBaseInstrInfo::getInsertSubregLikeInputs(
+    const MachineInstr &MI, unsigned DefIdx, RegSubRegPair &BaseReg,
+    RegSubRegPairAndIdx &InsertedReg) const {
+  assert(DefIdx < MI.getDesc().getNumDefs() && "Invalid definition index");
+  assert(MI.isInsertSubregLike() && "Invalid kind of instruction");
+
+  switch (MI.getOpcode()) {
+  case ARM::VSETLNi32:
+    // dX = VSETLNi32 dY, rZ, imm
+    const MachineOperand &MOBaseReg = MI.getOperand(1);
+    const MachineOperand &MOInsertedReg = MI.getOperand(2);
+    const MachineOperand &MOIndex = MI.getOperand(3);
+    BaseReg.Reg = MOBaseReg.getReg();
+    BaseReg.SubReg = MOBaseReg.getSubReg();
+
+    InsertedReg.Reg = MOInsertedReg.getReg();
+    InsertedReg.SubReg = MOInsertedReg.getSubReg();
+    InsertedReg.SubIdx = MOIndex.getImm() == 0 ? ARM::ssub_0 : ARM::ssub_1;
+    return true;
+  }
+  llvm_unreachable("Target dependent opcode missing");
+}

diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index b8d6758..0ae291b 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h

@@ -11,13 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMBASEINSTRUCTIONINFO_H
-#define ARMBASEINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
 
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Target/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
@@ -34,6 +35,57 @@
   // Can be only subclassed.
   explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
 
+  void expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
+                                unsigned LoadImmOpc, unsigned LoadOpc,
+                                Reloc::Model RM) const;
+
+  /// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
+  /// and \p DefIdx.
+  /// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of
+  /// the list is modeled as <Reg:SubReg, SubIdx>.
+  /// E.g., REG_SEQUENCE vreg1:sub1, sub0, vreg2, sub1 would produce
+  /// two elements:
+  /// - vreg1:sub1, sub0
+  /// - vreg2<:0>, sub1
+  ///
+  /// \returns true if it is possible to build such an input sequence
+  /// with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isRegSequenceLike().
+  bool getRegSequenceLikeInputs(
+      const MachineInstr &MI, unsigned DefIdx,
+      SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const override;
+
+  /// Build the equivalent inputs of an EXTRACT_SUBREG for the given \p MI
+  /// and \p DefIdx.
+  /// \p [out] InputReg of the equivalent EXTRACT_SUBREG.
+  /// E.g., EXTRACT_SUBREG vreg1:sub1, sub0, sub1 would produce:
+  /// - vreg1:sub1, sub0
+  ///
+  /// \returns true if it is possible to build such an input sequence
+  /// with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isExtractSubregLike().
+  bool getExtractSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
+                                  RegSubRegPairAndIdx &InputReg) const override;
+
+  /// Build the equivalent inputs of an INSERT_SUBREG for the given \p MI
+  /// and \p DefIdx.
+  /// \p [out] BaseReg and \p [out] InsertedReg contain
+  /// the equivalent inputs of INSERT_SUBREG.
+  /// E.g., INSERT_SUBREG vreg0:sub0, vreg1:sub1, sub3 would produce:
+  /// - BaseReg: vreg0:sub0
+  /// - InsertedReg: vreg1:sub1, sub3
+  ///
+  /// \returns true if it is possible to build such an input sequence
+  /// with the pair \p MI, \p DefIdx. False otherwise.
+  ///
+  /// \pre MI.isInsertSubregLike().
+  bool
+  getInsertSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
+                            RegSubRegPair &BaseReg,
+                            RegSubRegPairAndIdx &InsertedReg) const override;
+
 public:
   // Return whether the target has an explicit NOP encoding.
   bool hasNOP() const;
@@ -104,6 +156,13 @@
   unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
                                     int &FrameIndex) const override;
 
+  void copyToCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                  unsigned SrcReg, bool KillSrc,
+                  const ARMSubtarget &Subtarget) const;
+  void copyFromCPSR(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned DestReg, bool KillSrc,
+                    const ARMSubtarget &Subtarget) const;
+
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                    DebugLoc DL, unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
@@ -230,12 +289,6 @@
   void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
                                  const TargetRegisterInfo *TRI) const override;
 
-  void
-  getUnconditionalBranch(MCInst &Branch,
-                         const MCSymbolRefExpr *BranchTarget) const override;
-
-  void getTrap(MCInst &MI) const override;
-
   /// Get the number of addresses by LDM or VLDM or zero for unknown.
   unsigned getNumLDMAddresses(const MachineInstr *MI) const;
 
@@ -286,6 +339,9 @@
   bool verifyInstruction(const MachineInstr *MI,
                          StringRef &ErrInfo) const override;
 
+  virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI,
+                                    Reloc::Model RM) const = 0;
+
 private:
   /// Modeling special VFP / NEON fp MLA / MLS hazards.
 

diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index cdd91c7..6dc0493 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp

@@ -38,6 +38,8 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+#define DEBUG_TYPE "arm-register-info"
+
 #define GET_REGINFO_TARGET_DESC
 #include "ARMGenRegisterInfo.inc"
 
@@ -121,7 +123,7 @@
 
 BitVector ARMBaseRegisterInfo::
 getReservedRegs(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   // FIXME: avoid re-calculating this every time.
   BitVector Reserved(getNumRegs());
@@ -180,14 +182,14 @@
 const TargetRegisterClass *
 ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
   if (RC == &ARM::CCRRegClass)
-    return nullptr;  // Can't copy CCR registers.
+    return &ARM::rGPRRegClass;  // CCR can't be copied directly; go via rGPR.
   return RC;
 }
 
 unsigned
 ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                          MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   switch (RC->getID()) {
   default:
@@ -309,7 +311,7 @@
 bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   // When outgoing call frames are so large that we adjust the stack pointer
   // around the call, we can no longer use the stack pointer to reach the
@@ -354,7 +356,10 @@
     return false;
   // We may also need a base pointer if there are dynamic allocas or stack
   // pointer adjustments around calls.
-  if (MF.getTarget().getFrameLowering()->hasReservedCallFrame(MF))
+  if (MF.getTarget()
+          .getSubtargetImpl()
+          ->getFrameLowering()
+          ->hasReservedCallFrame(MF))
     return true;
   // A base pointer is required and allowed.  Check that it isn't too late to
   // reserve it.
@@ -365,7 +370,10 @@
 needsStackRealignment(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *F = MF.getFunction();
-  unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned StackAlign = MF.getTarget()
+                            .getSubtargetImpl()
+                            ->getFrameLowering()
+                            ->getStackAlignment();
   bool requiresRealignment =
     ((MFI->getMaxAlignment() > StackAlign) ||
      F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
@@ -385,7 +393,7 @@
 
 unsigned
 ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   if (TFI->hasFP(MF))
     return FramePtr;
@@ -402,7 +410,7 @@
                   ARMCC::CondCodes Pred,
                   unsigned PredReg, unsigned MIFlags) const {
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineConstantPool *ConstantPool = MF.getConstantPool();
   const Constant *C =
         ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val);
@@ -415,11 +423,6 @@
     .setMIFlags(MIFlags);
 }
 
-bool ARMBaseRegisterInfo::mayOverrideLocalAssignment() const {
-  // The native linux build hits a downstream codegen bug when this is enabled.
-  return STI.isTargetDarwin();
-}
-
 bool ARMBaseRegisterInfo::
 requiresRegisterScavenging(const MachineFunction &MF) const {
   return true;
@@ -529,7 +532,7 @@
   // Note that the incoming offset is based on the SP value at function entry,
   // so it'll be negative.
   MachineFunction &MF = *MI->getParent()->getParent();
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
@@ -582,7 +585,7 @@
                              int64_t Offset) const {
   ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
   unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri :
-    (AFI->isThumb1OnlyFunction() ? ARM::tADDrSPi : ARM::t2ADDri);
+    (AFI->isThumb1OnlyFunction() ? ARM::tADDframe : ARM::t2ADDri);
 
   MachineBasicBlock::iterator Ins = MBB->begin();
   DebugLoc DL;                  // Defaults to "unknown"
@@ -591,15 +594,15 @@
 
   const MachineFunction &MF = *MBB->getParent();
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const MCInstrDesc &MCID = TII.get(ADDriOpc);
   MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
 
-  MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg)
-    .addFrameIndex(FrameIdx).addImm(Offset));
+  MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+    .addFrameIndex(FrameIdx).addImm(Offset);
 
   if (!AFI->isThumb1OnlyFunction())
-    AddDefaultCC(MIB);
+    AddDefaultCC(AddDefaultPred(MIB));
 }
 
 void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
@@ -607,7 +610,7 @@
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
   const ARMBaseInstrInfo &TII =
-    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   int Off = Offset; // ARM doesn't need the general 64-bit offsets
   unsigned i = 0;
@@ -706,9 +709,9 @@
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
   const ARMBaseInstrInfo &TII =
-    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
-  const ARMFrameLowering *TFI =
-    static_cast<const ARMFrameLowering*>(MF.getTarget().getFrameLowering());
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const ARMFrameLowering *TFI = static_cast<const ARMFrameLowering *>(
+      MF.getSubtarget().getFrameLowering());
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   assert(!AFI->isThumb1OnlyFunction() &&
          "This eliminateFrameIndex does not support Thumb1!");
@@ -775,3 +778,60 @@
     MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false,true);
   }
 }
+
+bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SubReg,
+                                  const TargetRegisterClass *DstRC,
+                                  unsigned DstSubReg,
+                                  const TargetRegisterClass *NewRC) const {
+  auto MBB = MI->getParent();
+  auto MF = MBB->getParent();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  // If not copying into a sub-register this should be ok because we shouldn't
+  // need to split the reg.
+  if (!DstSubReg)
+    return true;
+  // Small registers don't frequently cause a problem, so we can coalesce them.
+  if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32)
+    return true;
+
+  auto NewRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(NewRC);
+  auto SrcRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(SrcRC);
+  auto DstRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(DstRC);
+  // If either the source or destination class weighs more than the new class,
+  // the coalescing is probably profitable.
+  if (SrcRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+  if (DstRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+
+  // If the register allocator isn't constrained, we can always allow
+  // coalescing; unfortunately, we don't know yet if we will be constrained.
+  // The goal of this heuristic is to restrict how many expensive registers
+  // we allow to coalesce in a given basic block.
+  auto AFI = MF->getInfo<ARMFunctionInfo>();
+  auto It = AFI->getCoalescedWeight(MBB);
+
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
+    << It->second << "\n");
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
+    << NewRCWeight.RegWeight << "\n");
+
+  // This number is the largest round number that meets the criteria:
+  //  (1) addresses PR18825
+  //  (2) generates better code in some test cases (like vldm-shed-a9.ll)
+  //  (3) Doesn't regress any test cases (in-tree, test-suite, and SPEC)
+  // In practice the SizeMultiplier will only factor in for straight line code
+  // that uses a lot of NEON vectors, which isn't terribly common.
+  unsigned SizeMultiplier = MBB->size()/100;
+  SizeMultiplier = SizeMultiplier ? SizeMultiplier : 1;
+  if (It->second < NewRCWeight.WeightLimit * SizeMultiplier) {
+    It->second += NewRCWeight.RegWeight;
+    return true;
+  }
+  return false;
+}

diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 91df565..e9bc412 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMBASEREGISTERINFO_H
-#define ARMBASEREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
 
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -174,8 +174,6 @@
                                  unsigned MIFlags = MachineInstr::NoFlags)const;
 
   /// Code Generation virtual methods...
-  bool mayOverrideLocalAssignment() const override;
-
   bool requiresRegisterScavenging(const MachineFunction &MF) const override;
 
   bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
@@ -187,6 +185,14 @@
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
                            RegScavenger *RS = nullptr) const override;
+
+  /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true
+  bool shouldCoalesce(MachineInstr *MI,
+                      const TargetRegisterClass *SrcRC,
+                      unsigned SubReg,
+                      const TargetRegisterClass *DstRC,
+                      unsigned DstSubReg,
+                      const TargetRegisterClass *NewRC) const override;
 };
 
 } // end namespace llvm

diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index dc41c1c..bd07236 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMCALLINGCONV_H
-#define ARMCALLINGCONV_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H
+#define LLVM_LIB_TARGET_ARM_ARMCALLINGCONV_H
 
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
@@ -177,8 +177,9 @@
                                    CCValAssign::LocInfo &LocInfo,
                                    ISD::ArgFlagsTy &ArgFlags, CCState &State) {
   SmallVectorImpl<CCValAssign> &PendingHAMembers = State.getPendingLocs();
+
   // AAPCS HFAs must have 1-4 elements, all of the same type
-  assert(PendingHAMembers.size() < 8);
+  assert(PendingHAMembers.size() < 4);
   if (PendingHAMembers.size() > 0)
     assert(PendingHAMembers[0].getLocVT() == LocVT);
 
@@ -188,7 +189,7 @@
       CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
 
   if (ArgFlags.isInConsecutiveRegsLast()) {
-    assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 8 &&
+    assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 4 &&
            "Homogeneous aggregates must have between 1 and 4 members");
 
     // Try to allocate a contiguous block of registers, each of the correct
@@ -196,7 +197,6 @@
     const uint16_t *RegList;
     unsigned NumRegs;
     switch (LocVT.SimpleTy) {
-    case MVT::i32:
     case MVT::f32:
       RegList = SRegList;
       NumRegs = 16;
@@ -235,20 +235,11 @@
       State.AllocateReg(SRegList[regNo]);
 
     unsigned Size = LocVT.getSizeInBits() / 8;
-    unsigned Align = Size;
-
-    if (LocVT.SimpleTy == MVT::v2f64 || LocVT.SimpleTy == MVT::i32) {
-      // Vectors are always aligned to 8 bytes. If we've seen an i32 here
-      // it's because it's been split from a larger type, also with align 8.
-      Align = 8;
-    }
+    unsigned Align = std::min(Size, 8U);
 
     for (auto It : PendingHAMembers) {
       It.convertToMem(State.AllocateStack(Size, Align));
       State.addLoc(It);
-
-      // Only the first member needs to be aligned.
-      Align = 1;
     }
 
     // All pending members have now been allocated

diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
deleted file mode 100644
index 5fb6ebf..0000000
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ /dev/null

@@ -1,1909 +0,0 @@
-//===-- ARM/ARMCodeEmitter.cpp - Convert ARM code to machine code ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the pass that transforms the ARM machine instructions into
-// relocatable machine code.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM.h"
-#include "ARMBaseInstrInfo.h"
-#include "ARMConstantPoolValue.h"
-#include "ARMMachineFunctionInfo.h"
-#include "ARMRelocations.h"
-#include "ARMSubtarget.h"
-#include "ARMTargetMachine.h"
-#include "MCTargetDesc/ARMAddressingModes.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#ifndef NDEBUG
-#include <iomanip>
-#endif
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-STATISTIC(NumEmitted, "Number of machine instructions emitted");
-
-namespace {
-
-  class ARMCodeEmitter : public MachineFunctionPass {
-    ARMJITInfo                *JTI;
-    const ARMBaseInstrInfo    *II;
-    const DataLayout          *TD;
-    const ARMSubtarget        *Subtarget;
-    TargetMachine             &TM;
-    JITCodeEmitter            &MCE;
-    MachineModuleInfo *MMI;
-    const std::vector<MachineConstantPoolEntry> *MCPEs;
-    const std::vector<MachineJumpTableEntry> *MJTEs;
-    bool IsPIC;
-    bool IsThumb;
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineModuleInfo>();
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-
-    static char ID;
-  public:
-    ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
-      : MachineFunctionPass(ID), JTI(nullptr),
-        II((const ARMBaseInstrInfo *)tm.getInstrInfo()),
-        TD(tm.getDataLayout()), TM(tm),
-        MCE(mce), MCPEs(nullptr), MJTEs(nullptr),
-        IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {}
-
-    /// getBinaryCodeForInstr - This function, generated by the
-    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
-    /// machine instructions.
-    uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
-
-    bool runOnMachineFunction(MachineFunction &MF) override;
-
-    const char *getPassName() const override {
-      return "ARM Machine Code Emitter";
-    }
-
-    void emitInstruction(const MachineInstr &MI);
-
-  private:
-
-    void emitWordLE(unsigned Binary);
-    void emitDWordLE(uint64_t Binary);
-    void emitConstPoolInstruction(const MachineInstr &MI);
-    void emitMOVi32immInstruction(const MachineInstr &MI);
-    void emitMOVi2piecesInstruction(const MachineInstr &MI);
-    void emitLEApcrelJTInstruction(const MachineInstr &MI);
-    void emitPseudoMoveInstruction(const MachineInstr &MI);
-    void addPCLabel(unsigned LabelID);
-    void emitPseudoInstruction(const MachineInstr &MI);
-    unsigned getMachineSoRegOpValue(const MachineInstr &MI,
-                                    const MCInstrDesc &MCID,
-                                    const MachineOperand &MO,
-                                    unsigned OpIdx);
-
-    unsigned getMachineSoImmOpValue(unsigned SoImm);
-    unsigned getAddrModeSBit(const MachineInstr &MI,
-                             const MCInstrDesc &MCID) const;
-
-    void emitDataProcessingInstruction(const MachineInstr &MI,
-                                       unsigned ImplicitRd = 0,
-                                       unsigned ImplicitRn = 0);
-
-    void emitLoadStoreInstruction(const MachineInstr &MI,
-                                  unsigned ImplicitRd = 0,
-                                  unsigned ImplicitRn = 0);
-
-    void emitMiscLoadStoreInstruction(const MachineInstr &MI,
-                                      unsigned ImplicitRn = 0);
-
-    void emitLoadStoreMultipleInstruction(const MachineInstr &MI);
-
-    void emitMulFrmInstruction(const MachineInstr &MI);
-
-    void emitExtendInstruction(const MachineInstr &MI);
-
-    void emitMiscArithInstruction(const MachineInstr &MI);
-
-    void emitSaturateInstruction(const MachineInstr &MI);
-
-    void emitBranchInstruction(const MachineInstr &MI);
-
-    void emitInlineJumpTable(unsigned JTIndex);
-
-    void emitMiscBranchInstruction(const MachineInstr &MI);
-
-    void emitVFPArithInstruction(const MachineInstr &MI);
-
-    void emitVFPConversionInstruction(const MachineInstr &MI);
-
-    void emitVFPLoadStoreInstruction(const MachineInstr &MI);
-
-    void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI);
-
-    void emitNEONLaneInstruction(const MachineInstr &MI);
-    void emitNEONDupInstruction(const MachineInstr &MI);
-    void emitNEON1RegModImmInstruction(const MachineInstr &MI);
-    void emitNEON2RegInstruction(const MachineInstr &MI);
-    void emitNEON3RegInstruction(const MachineInstr &MI);
-
-    /// getMachineOpValue - Return binary encoding of operand. If the machine
-    /// operand requires relocation, record the relocation and return zero.
-    unsigned getMachineOpValue(const MachineInstr &MI,
-                               const MachineOperand &MO) const;
-    unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) const {
-      return getMachineOpValue(MI, MI.getOperand(OpIdx));
-    }
-
-    // FIXME: The legacy JIT ARMCodeEmitter doesn't rely on the the
-    //  TableGen'erated getBinaryCodeForInstr() function to encode any
-    //  operand values, instead querying getMachineOpValue() directly for
-    //  each operand it needs to encode. Thus, any of the new encoder
-    //  helper functions can simply return 0 as the values the return
-    //  are already handled elsewhere. They are placeholders to allow this
-    //  encoder to continue to function until the MC encoder is sufficiently
-    //  far along that this one can be eliminated entirely.
-    unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val)
-      const { return 0; }
-    unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val)
-      const { return 0; }
-    unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val)
-      const { return 0; }
-    unsigned NEONThumb2V8PostEncoder(const MachineInstr &MI,unsigned Val)
-      const { return 0; }
-    unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val)
-      const { return 0; }
-    unsigned getAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getThumbAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getThumbBLTargetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getThumbBLXTargetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getThumbBRTargetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getThumbBCCTargetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getThumbCBTargetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getUnconditionalBranchTargetOpValue(const MachineInstr &MI,
-      unsigned Op) const { return 0; }
-    unsigned getARMBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getARMBLTargetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getARMBLXTargetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getCCOutOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getSOImmOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getT2SOImmOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getSORegRegOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getSORegImmOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getThumbAddrModeRegRegOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getT2AddrModeImm8OpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getT2Imm8s4OpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getT2AddrModeImm8s4OpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getT2AddrModeImm0_1020s4OpValue(const MachineInstr &MI,unsigned Op)
-      const { return 0; }
-    unsigned getT2AddrModeImm8OffsetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getT2AddrModeSORegOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getT2SORegOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getT2AdrLabelOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getAddrMode6OneLane32AddressOpValue(const MachineInstr &MI,
-                                                 unsigned Op)
-      const { return 0; }
-    unsigned getAddrMode6DupAddressOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getAddrMode6OffsetOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getBitfieldInvertedMaskOpValue(const MachineInstr &MI,
-                                            unsigned Op) const { return 0; }
-    uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx)
-      const { return 0; }
-
-    unsigned getAddrModeImm12OpValue(const MachineInstr &MI, unsigned Op)
-      const {
-      // {17-13} = reg
-      // {12}    = (U)nsigned (add == '1', sub == '0')
-      // {11-0}  = imm12
-      const MachineOperand &MO  = MI.getOperand(Op);
-      const MachineOperand &MO1 = MI.getOperand(Op + 1);
-      if (!MO.isReg()) {
-        emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
-        return 0;
-      }
-      unsigned Reg = II->getRegisterInfo().getEncodingValue(MO.getReg());
-      int32_t Imm12 = MO1.getImm();
-      uint32_t Binary;
-      Binary = Imm12 & 0xfff;
-      if (Imm12 >= 0)
-        Binary |= (1 << 12);
-      Binary |= (Reg << 13);
-      return Binary;
-    }
-
-    unsigned getHiLo16ImmOpValue(const MachineInstr &MI, unsigned Op) const {
-      return 0;
-    }
-
-    uint32_t getAddrMode2OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
-      const { return 0;}
-    uint32_t getPostIdxRegOpValue(const MachineInstr &MI, unsigned OpIdx)
-      const { return 0;}
-    uint32_t getAddrMode3OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
-      const { return 0;}
-    uint32_t getAddrMode3OpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    uint32_t getAddrModeThumbSPOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    uint32_t getAddrModeISOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    uint32_t getAddrModePCOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    uint32_t getAddrMode5OpValue(const MachineInstr &MI, unsigned Op) const {
-      // {17-13} = reg
-      // {12}    = (U)nsigned (add == '1', sub == '0')
-      // {11-0}  = imm12
-      const MachineOperand &MO  = MI.getOperand(Op);
-      const MachineOperand &MO1 = MI.getOperand(Op + 1);
-      if (!MO.isReg()) {
-        emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
-        return 0;
-      }
-      unsigned Reg = II->getRegisterInfo().getEncodingValue(MO.getReg());
-      int32_t Imm12 = MO1.getImm();
-
-      // Special value for #-0
-      if (Imm12 == INT32_MIN)
-        Imm12 = 0;
-
-      // Immediate is always encoded as positive. The 'U' bit controls add vs
-      // sub.
-      bool isAdd = true;
-      if (Imm12 < 0) {
-        Imm12 = -Imm12;
-        isAdd = false;
-      }
-
-      uint32_t Binary = Imm12 & 0xfff;
-      if (isAdd)
-        Binary |= (1 << 12);
-      Binary |= (Reg << 13);
-      return Binary;
-    }
-    unsigned getNEONVcvtImm32OpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-
-    unsigned getRegisterListOpValue(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-
-    unsigned getShiftRight8Imm(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getShiftRight16Imm(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getShiftRight32Imm(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-    unsigned getShiftRight64Imm(const MachineInstr &MI, unsigned Op)
-      const { return 0; }
-
-    /// getMovi32Value - Return binary encoding of operand for movw/movt. If the
-    /// machine operand requires relocation, record the relocation and return
-    /// zero.
-    unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO,
-                            unsigned Reloc);
-
-    /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
-    ///
-    unsigned getShiftOp(unsigned Imm) const ;
-
-    /// Routines that handle operands which add machine relocations which are
-    /// fixed up by the relocation stage.
-    void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
-                           bool MayNeedFarStub,  bool Indirect,
-                           intptr_t ACPV = 0) const;
-    void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
-    void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
-    void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
-    void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc,
-                               intptr_t JTBase = 0) const;
-    unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) const;
-    unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) const;
-    unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) const;
-    unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) const;
-    unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) const;
-    unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) const;
-  };
-}
-
-char ARMCodeEmitter::ID = 0;
-
-/// createARMJITCodeEmitterPass - Return a pass that emits the collected ARM
-/// code to the specified MCE object.
-FunctionPass *llvm::createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
-                                                JITCodeEmitter &JCE) {
-  return new ARMCodeEmitter(TM, JCE);
-}
-
-bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
-  TargetMachine &Target = const_cast<TargetMachine&>(MF.getTarget());
-
-  assert((Target.getRelocationModel() != Reloc::Default ||
-          Target.getRelocationModel() != Reloc::Static) &&
-         "JIT relocation model must be set to static or default!");
-
-  JTI = static_cast<ARMJITInfo*>(Target.getJITInfo());
-  II = static_cast<const ARMBaseInstrInfo*>(Target.getInstrInfo());
-  TD = Target.getDataLayout();
-
-  Subtarget = &TM.getSubtarget<ARMSubtarget>();
-  MCPEs = &MF.getConstantPool()->getConstants();
-  MJTEs = nullptr;
-  if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
-  IsPIC = TM.getRelocationModel() == Reloc::PIC_;
-  IsThumb = MF.getInfo<ARMFunctionInfo>()->isThumbFunction();
-  JTI->Initialize(MF, IsPIC);
-  MMI = &getAnalysis<MachineModuleInfo>();
-  MCE.setModuleInfo(MMI);
-
-  do {
-    DEBUG(errs() << "JITTing function '"
-          << MF.getName() << "'\n");
-    MCE.startFunction(MF);
-    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
-         MBB != E; ++MBB) {
-      MCE.StartMachineBasicBlock(MBB);
-      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
-           I != E; ++I)
-        emitInstruction(*I);
-    }
-  } while (MCE.finishFunction(MF));
-
-  return false;
-}
-
-/// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
-///
-unsigned ARMCodeEmitter::getShiftOp(unsigned Imm) const {
-  switch (ARM_AM::getAM2ShiftOpc(Imm)) {
-  default: llvm_unreachable("Unknown shift opc!");
-  case ARM_AM::asr: return 2;
-  case ARM_AM::lsl: return 0;
-  case ARM_AM::lsr: return 1;
-  case ARM_AM::ror:
-  case ARM_AM::rrx: return 3;
-  }
-}
-
-/// getMovi32Value - Return binary encoding of operand for movw/movt. If the
-/// machine operand requires relocation, record the relocation and return zero.
-unsigned ARMCodeEmitter::getMovi32Value(const MachineInstr &MI,
-                                        const MachineOperand &MO,
-                                        unsigned Reloc) {
-  assert(((Reloc == ARM::reloc_arm_movt) || (Reloc == ARM::reloc_arm_movw))
-      && "Relocation to this function should be for movt or movw");
-
-  if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
-  else if (MO.isGlobal())
-    emitGlobalAddress(MO.getGlobal(), Reloc, true, false);
-  else if (MO.isSymbol())
-    emitExternalSymbolAddress(MO.getSymbolName(), Reloc);
-  else if (MO.isMBB())
-    emitMachineBasicBlock(MO.getMBB(), Reloc);
-  else {
-#ifndef NDEBUG
-    errs() << MO;
-#endif
-    llvm_unreachable("Unsupported operand type for movw/movt");
-  }
-  return 0;
-}
-
-/// getMachineOpValue - Return binary encoding of operand. If the machine
-/// operand requires relocation, record the relocation and return zero.
-unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI,
-                                           const MachineOperand &MO) const {
-  if (MO.isReg())
-    return II->getRegisterInfo().getEncodingValue(MO.getReg());
-  else if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
-  else if (MO.isGlobal())
-    emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false);
-  else if (MO.isSymbol())
-    emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch);
-  else if (MO.isCPI()) {
-    const MCInstrDesc &MCID = MI.getDesc();
-    // For VFP load, the immediate offset is multiplied by 4.
-    unsigned Reloc =  ((MCID.TSFlags & ARMII::FormMask) == ARMII::VFPLdStFrm)
-      ? ARM::reloc_arm_vfp_cp_entry : ARM::reloc_arm_cp_entry;
-    emitConstPoolAddress(MO.getIndex(), Reloc);
-  } else if (MO.isJTI())
-    emitJumpTableAddress(MO.getIndex(), ARM::reloc_arm_relative);
-  else if (MO.isMBB())
-    emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch);
-  else
-    llvm_unreachable("Unable to encode MachineOperand!");
-  return 0;
-}
-
-/// emitGlobalAddress - Emit the specified address to the code stream.
-///
-void ARMCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
-                                       bool MayNeedFarStub, bool Indirect,
-                                       intptr_t ACPV) const {
-  MachineRelocation MR = Indirect
-    ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
-                                           const_cast<GlobalValue *>(GV),
-                                           ACPV, MayNeedFarStub)
-    : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
-                               const_cast<GlobalValue *>(GV), ACPV,
-                               MayNeedFarStub);
-  MCE.addRelocation(MR);
-}
-
-/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
-/// be emitted to the current location in the function, and allow it to be PC
-/// relative.
-void ARMCodeEmitter::
-emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
-                                                 Reloc, ES));
-}
-
-/// emitConstPoolAddress - Arrange for the address of an constant pool
-/// to be emitted to the current location in the function, and allow it to be PC
-/// relative.
-void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
-  // Tell JIT emitter we'll resolve the address.
-  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
-                                                    Reloc, CPI, 0, true));
-}
-
-/// emitJumpTableAddress - Arrange for the address of a jump table to
-/// be emitted to the current location in the function, and allow it to be PC
-/// relative.
-void ARMCodeEmitter::
-emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
-                                                    Reloc, JTIndex, 0, true));
-}
-
-/// emitMachineBasicBlock - Emit the specified address basic block.
-void ARMCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
-                                           unsigned Reloc,
-                                           intptr_t JTBase) const {
-  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
-                                             Reloc, BB, JTBase));
-}
-
-void ARMCodeEmitter::emitWordLE(unsigned Binary) {
-  DEBUG(errs() << "  0x";
-        errs().write_hex(Binary) << "\n");
-  MCE.emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitDWordLE(uint64_t Binary) {
-  DEBUG(errs() << "  0x";
-        errs().write_hex(Binary) << "\n");
-  MCE.emitDWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) {
-  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI);
-
-  MCE.processDebugLoc(MI.getDebugLoc(), true);
-
-  ++NumEmitted;  // Keep track of the # of mi's emitted
-  switch (MI.getDesc().TSFlags & ARMII::FormMask) {
-  default: {
-    llvm_unreachable("Unhandled instruction encoding format!");
-  }
-  case ARMII::MiscFrm:
-    if (MI.getOpcode() == ARM::LEApcrelJT) {
-      // Materialize jumptable address.
-      emitLEApcrelJTInstruction(MI);
-      break;
-    }
-    llvm_unreachable("Unhandled instruction encoding!");
-  case ARMII::Pseudo:
-    emitPseudoInstruction(MI);
-    break;
-  case ARMII::DPFrm:
-  case ARMII::DPSoRegFrm:
-    emitDataProcessingInstruction(MI);
-    break;
-  case ARMII::LdFrm:
-  case ARMII::StFrm:
-    emitLoadStoreInstruction(MI);
-    break;
-  case ARMII::LdMiscFrm:
-  case ARMII::StMiscFrm:
-    emitMiscLoadStoreInstruction(MI);
-    break;
-  case ARMII::LdStMulFrm:
-    emitLoadStoreMultipleInstruction(MI);
-    break;
-  case ARMII::MulFrm:
-    emitMulFrmInstruction(MI);
-    break;
-  case ARMII::ExtFrm:
-    emitExtendInstruction(MI);
-    break;
-  case ARMII::ArithMiscFrm:
-    emitMiscArithInstruction(MI);
-    break;
-  case ARMII::SatFrm:
-    emitSaturateInstruction(MI);
-    break;
-  case ARMII::BrFrm:
-    emitBranchInstruction(MI);
-    break;
-  case ARMII::BrMiscFrm:
-    emitMiscBranchInstruction(MI);
-    break;
-  // VFP instructions.
-  case ARMII::VFPUnaryFrm:
-  case ARMII::VFPBinaryFrm:
-    emitVFPArithInstruction(MI);
-    break;
-  case ARMII::VFPConv1Frm:
-  case ARMII::VFPConv2Frm:
-  case ARMII::VFPConv3Frm:
-  case ARMII::VFPConv4Frm:
-  case ARMII::VFPConv5Frm:
-    emitVFPConversionInstruction(MI);
-    break;
-  case ARMII::VFPLdStFrm:
-    emitVFPLoadStoreInstruction(MI);
-    break;
-  case ARMII::VFPLdStMulFrm:
-    emitVFPLoadStoreMultipleInstruction(MI);
-    break;
-
-  // NEON instructions.
-  case ARMII::NGetLnFrm:
-  case ARMII::NSetLnFrm:
-    emitNEONLaneInstruction(MI);
-    break;
-  case ARMII::NDupFrm:
-    emitNEONDupInstruction(MI);
-    break;
-  case ARMII::N1RegModImmFrm:
-    emitNEON1RegModImmInstruction(MI);
-    break;
-  case ARMII::N2RegFrm:
-    emitNEON2RegInstruction(MI);
-    break;
-  case ARMII::N3RegFrm:
-    emitNEON3RegInstruction(MI);
-    break;
-  }
-  MCE.processDebugLoc(MI.getDebugLoc(), false);
-}
-
-void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) {
-  unsigned CPI = MI.getOperand(0).getImm();       // CP instruction index.
-  unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index.
-  const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex];
-
-  // Remember the CONSTPOOL_ENTRY address for later relocation.
-  JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue());
-
-  // Emit constpool island entry. In most cases, the actual values will be
-  // resolved and relocated after code emission.
-  if (MCPE.isMachineConstantPoolEntry()) {
-    ARMConstantPoolValue *ACPV =
-      static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
-
-    DEBUG(errs() << "  ** ARM constant pool #" << CPI << " @ "
-          << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n');
-
-    assert(ACPV->isGlobalValue() && "unsupported constant pool value");
-    const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
-    if (GV) {
-      Reloc::Model RelocM = TM.getRelocationModel();
-      emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
-                        isa<Function>(GV),
-                        Subtarget->GVIsIndirectSymbol(GV, RelocM),
-                        (intptr_t)ACPV);
-    } else  {
-      const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
-      emitExternalSymbolAddress(Sym, ARM::reloc_arm_absolute);
-    }
-    emitWordLE(0);
-  } else {
-    const Constant *CV = MCPE.Val.ConstVal;
-
-    DEBUG({
-        errs() << "  ** Constant pool #" << CPI << " @ "
-               << (void*)MCE.getCurrentPCValue() << " ";
-        if (const Function *F = dyn_cast<Function>(CV))
-          errs() << F->getName();
-        else
-          errs() << *CV;
-        errs() << '\n';
-      });
-
-    if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
-      emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false);
-      emitWordLE(0);
-    } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
-      uint32_t Val = uint32_t(*CI->getValue().getRawData());
-      emitWordLE(Val);
-    } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
-      if (CFP->getType()->isFloatTy())
-        emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
-      else if (CFP->getType()->isDoubleTy())
-        emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
-      else {
-        llvm_unreachable("Unable to handle this constantpool entry!");
-      }
-    } else {
-      llvm_unreachable("Unable to handle this constantpool entry!");
-    }
-  }
-}
-
-void ARMCodeEmitter::emitMOVi32immInstruction(const MachineInstr &MI) {
-  const MachineOperand &MO0 = MI.getOperand(0);
-  const MachineOperand &MO1 = MI.getOperand(1);
-
-  // Emit the 'movw' instruction.
-  unsigned Binary = 0x30 << 20;  // mov: Insts{27-20} = 0b00110000
-
-  unsigned Lo16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movw) & 0xFFFF;
-
-  // Set the conditional execution predicate.
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Encode Rd.
-  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
-
-  // Encode imm16 as imm4:imm12
-  Binary |= Lo16 & 0xFFF; // Insts{11-0} = imm12
-  Binary |= ((Lo16 >> 12) & 0xF) << 16; // Insts{19-16} = imm4
-  emitWordLE(Binary);
-
-  unsigned Hi16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movt) >> 16;
-  // Emit the 'movt' instruction.
-  Binary = 0x34 << 20; // movt: Insts{27-20} = 0b00110100
-
-  // Set the conditional execution predicate.
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Encode Rd.
-  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
-
-  // Encode imm16 as imm4:imm1, same as movw above.
-  Binary |= Hi16 & 0xFFF;
-  Binary |= ((Hi16 >> 12) & 0xF) << 16;
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitMOVi2piecesInstruction(const MachineInstr &MI) {
-  const MachineOperand &MO0 = MI.getOperand(0);
-  const MachineOperand &MO1 = MI.getOperand(1);
-  assert(MO1.isImm() && ARM_AM::isSOImmTwoPartVal(MO1.getImm()) &&
-                                                  "Not a valid so_imm value!");
-  unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm());
-  unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm());
-
-  // Emit the 'mov' instruction.
-  unsigned Binary = 0xd << 21;  // mov: Insts{24-21} = 0b1101
-
-  // Set the conditional execution predicate.
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Encode Rd.
-  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
-
-  // Encode so_imm.
-  // Set bit I(25) to identify this is the immediate form of <shifter_op>
-  Binary |= 1 << ARMII::I_BitShift;
-  Binary |= getMachineSoImmOpValue(V1);
-  emitWordLE(Binary);
-
-  // Now the 'orr' instruction.
-  Binary = 0xc << 21;  // orr: Insts{24-21} = 0b1100
-
-  // Set the conditional execution predicate.
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Encode Rd.
-  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
-
-  // Encode Rn.
-  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRnShift;
-
-  // Encode so_imm.
-  // Set bit I(25) to identify this is the immediate form of <shifter_op>
-  Binary |= 1 << ARMII::I_BitShift;
-  Binary |= getMachineSoImmOpValue(V2);
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) {
-  // It's basically add r, pc, (LJTI - $+8)
-
-  const MCInstrDesc &MCID = MI.getDesc();
-
-  // Emit the 'add' instruction.
-  unsigned Binary = 0x4 << 21;  // add: Insts{24-21} = 0b0100
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Encode S bit if MI modifies CPSR.
-  Binary |= getAddrModeSBit(MI, MCID);
-
-  // Encode Rd.
-  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
-
-  // Encode Rn which is PC.
-  Binary |= II->getRegisterInfo().getEncodingValue(ARM::PC) << ARMII::RegRnShift;
-
-  // Encode the displacement.
-  Binary |= 1 << ARMII::I_BitShift;
-  emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base);
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitPseudoMoveInstruction(const MachineInstr &MI) {
-  unsigned Opcode = MI.getDesc().Opcode;
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Encode S bit if MI modifies CPSR.
-  if (Opcode == ARM::MOVsrl_flag || Opcode == ARM::MOVsra_flag)
-    Binary |= 1 << ARMII::S_BitShift;
-
-  // Encode register def if there is one.
-  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
-
-  // Encode the shift operation.
-  switch (Opcode) {
-  default: break;
-  case ARM::RRX:
-    // rrx
-    Binary |= 0x6 << 4;
-    break;
-  case ARM::MOVsrl_flag:
-    // lsr #1
-    Binary |= (0x2 << 4) | (1 << 7);
-    break;
-  case ARM::MOVsra_flag:
-    // asr #1
-    Binary |= (0x4 << 4) | (1 << 7);
-    break;
-  }
-
-  // Encode register Rm.
-  Binary |= getMachineOpValue(MI, 1);
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::addPCLabel(unsigned LabelID) {
-  DEBUG(errs() << "  ** LPC" << LabelID << " @ "
-        << (void*)MCE.getCurrentPCValue() << '\n');
-  JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue());
-}
-
-void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
-  unsigned Opcode = MI.getDesc().Opcode;
-  switch (Opcode) {
-  default:
-    llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction");
-  case ARM::BX_CALL:
-  case ARM::BMOVPCRX_CALL: {
-    // First emit mov lr, pc
-    unsigned Binary = 0x01a0e00f;
-    Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-    emitWordLE(Binary);
-
-    // and then emit the branch.
-    emitMiscBranchInstruction(MI);
-    break;
-  }
-  case TargetOpcode::INLINEASM: {
-    // We allow inline assembler nodes with empty bodies - they can
-    // implicitly define registers, which is ok for JIT.
-    if (MI.getOperand(0).getSymbolName()[0]) {
-      report_fatal_error("JIT does not support inline asm!");
-    }
-    break;
-  }
-  case TargetOpcode::CFI_INSTRUCTION:
-    break;
-  case TargetOpcode::EH_LABEL:
-    MCE.emitLabel(MI.getOperand(0).getMCSymbol());
-    break;
-  case TargetOpcode::IMPLICIT_DEF:
-  case TargetOpcode::KILL:
-    // Do nothing.
-    break;
-  case ARM::CONSTPOOL_ENTRY:
-    emitConstPoolInstruction(MI);
-    break;
-  case ARM::PICADD: {
-    // Remember of the address of the PC label for relocation later.
-    addPCLabel(MI.getOperand(2).getImm());
-    // PICADD is just an add instruction that implicitly read pc.
-    emitDataProcessingInstruction(MI, 0, ARM::PC);
-    break;
-  }
-  case ARM::PICLDR:
-  case ARM::PICLDRB:
-  case ARM::PICSTR:
-  case ARM::PICSTRB: {
-    // Remember of the address of the PC label for relocation later.
-    addPCLabel(MI.getOperand(2).getImm());
-    // These are just load / store instructions that implicitly read pc.
-    emitLoadStoreInstruction(MI, 0, ARM::PC);
-    break;
-  }
-  case ARM::PICLDRH:
-  case ARM::PICLDRSH:
-  case ARM::PICLDRSB:
-  case ARM::PICSTRH: {
-    // Remember of the address of the PC label for relocation later.
-    addPCLabel(MI.getOperand(2).getImm());
-    // These are just load / store instructions that implicitly read pc.
-    emitMiscLoadStoreInstruction(MI, ARM::PC);
-    break;
-  }
-
-  case ARM::MOVi32imm:
-    // Two instructions to materialize a constant.
-    if (Subtarget->hasV6T2Ops())
-      emitMOVi32immInstruction(MI);
-    else
-      emitMOVi2piecesInstruction(MI);
-    break;
-
-  case ARM::LEApcrelJT:
-    // Materialize jumptable address.
-    emitLEApcrelJTInstruction(MI);
-    break;
-  case ARM::RRX:
-  case ARM::MOVsrl_flag:
-  case ARM::MOVsra_flag:
-    emitPseudoMoveInstruction(MI);
-    break;
-  }
-}
-
-unsigned ARMCodeEmitter::getMachineSoRegOpValue(const MachineInstr &MI,
-                                                const MCInstrDesc &MCID,
-                                                const MachineOperand &MO,
-                                                unsigned OpIdx) {
-  unsigned Binary = getMachineOpValue(MI, MO);
-
-  const MachineOperand &MO1 = MI.getOperand(OpIdx + 1);
-  const MachineOperand &MO2 = MI.getOperand(OpIdx + 2);
-  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
-
-  // Encode the shift opcode.
-  unsigned SBits = 0;
-  unsigned Rs = MO1.getReg();
-  if (Rs) {
-    // Set shift operand (bit[7:4]).
-    // LSL - 0001
-    // LSR - 0011
-    // ASR - 0101
-    // ROR - 0111
-    // RRX - 0110 and bit[11:8] clear.
-    switch (SOpc) {
-    default: llvm_unreachable("Unknown shift opc!");
-    case ARM_AM::lsl: SBits = 0x1; break;
-    case ARM_AM::lsr: SBits = 0x3; break;
-    case ARM_AM::asr: SBits = 0x5; break;
-    case ARM_AM::ror: SBits = 0x7; break;
-    case ARM_AM::rrx: SBits = 0x6; break;
-    }
-  } else {
-    // Set shift operand (bit[6:4]).
-    // LSL - 000
-    // LSR - 010
-    // ASR - 100
-    // ROR - 110
-    switch (SOpc) {
-    default: llvm_unreachable("Unknown shift opc!");
-    case ARM_AM::lsl: SBits = 0x0; break;
-    case ARM_AM::lsr: SBits = 0x2; break;
-    case ARM_AM::asr: SBits = 0x4; break;
-    case ARM_AM::ror: SBits = 0x6; break;
-    }
-  }
-  Binary |= SBits << 4;
-  if (SOpc == ARM_AM::rrx)
-    return Binary;
-
-  // Encode the shift operation Rs or shift_imm (except rrx).
-  if (Rs) {
-    // Encode Rs bit[11:8].
-    assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
-    return Binary | (II->getRegisterInfo().getEncodingValue(Rs) << ARMII::RegRsShift);
-  }
-
-  // Encode shift_imm bit[11:7].
-  return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7;
-}
-
-unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) {
-  int SoImmVal = ARM_AM::getSOImmVal(SoImm);
-  assert(SoImmVal != -1 && "Not a valid so_imm value!");
-
-  // Encode rotate_imm.
-  unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
-    << ARMII::SoRotImmShift;
-
-  // Encode immed_8.
-  Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
-  return Binary;
-}
-
-unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI,
-                                         const MCInstrDesc &MCID) const {
-  for (unsigned i = MI.getNumOperands(), e = MCID.getNumOperands(); i >= e;--i){
-    const MachineOperand &MO = MI.getOperand(i-1);
-    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)
-      return 1 << ARMII::S_BitShift;
-  }
-  return 0;
-}
-
-void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
-                                                   unsigned ImplicitRd,
-                                                   unsigned ImplicitRn) {
-  const MCInstrDesc &MCID = MI.getDesc();
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Encode S bit if MI modifies CPSR.
-  Binary |= getAddrModeSBit(MI, MCID);
-
-  // Encode register def if there is one.
-  unsigned NumDefs = MCID.getNumDefs();
-  unsigned OpIdx = 0;
-  if (NumDefs)
-    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
-  else if (ImplicitRd)
-    // Special handling for implicit use (e.g. PC).
-    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRd) << ARMII::RegRdShift);
-
-  if (MCID.Opcode == ARM::MOVi16) {
-      // Get immediate from MI.
-      unsigned Lo16 = getMovi32Value(MI, MI.getOperand(OpIdx),
-                      ARM::reloc_arm_movw);
-      // Encode imm which is the same as in emitMOVi32immInstruction().
-      Binary |= Lo16 & 0xFFF;
-      Binary |= ((Lo16 >> 12) & 0xF) << 16;
-      emitWordLE(Binary);
-      return;
-  } else if(MCID.Opcode == ARM::MOVTi16) {
-      unsigned Hi16 = (getMovi32Value(MI, MI.getOperand(OpIdx),
-                       ARM::reloc_arm_movt) >> 16);
-      Binary |= Hi16 & 0xFFF;
-      Binary |= ((Hi16 >> 12) & 0xF) << 16;
-      emitWordLE(Binary);
-      return;
-  } else if ((MCID.Opcode == ARM::BFC) || (MCID.Opcode == ARM::BFI)) {
-      uint32_t v = ~MI.getOperand(2).getImm();
-      int32_t lsb = countTrailingZeros(v);
-      int32_t msb = (32 - countLeadingZeros(v)) - 1;
-      // Instr{20-16} = msb, Instr{11-7} = lsb
-      Binary |= (msb & 0x1F) << 16;
-      Binary |= (lsb & 0x1F) << 7;
-      emitWordLE(Binary);
-      return;
-  } else if ((MCID.Opcode == ARM::UBFX) || (MCID.Opcode == ARM::SBFX)) {
-      // Encode Rn in Instr{0-3}
-      Binary |= getMachineOpValue(MI, OpIdx++);
-
-      uint32_t lsb = MI.getOperand(OpIdx++).getImm();
-      uint32_t widthm1 = MI.getOperand(OpIdx++).getImm() - 1;
-
-      // Instr{20-16} = widthm1, Instr{11-7} = lsb
-      Binary |= (widthm1 & 0x1F) << 16;
-      Binary |= (lsb & 0x1F) << 7;
-      emitWordLE(Binary);
-      return;
-  }
-
-  // If this is a two-address operand, skip it. e.g. MOVCCr operand 1.
-  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
-    ++OpIdx;
-
-  // Encode first non-shifter register operand if there is one.
-  bool isUnary = MCID.TSFlags & ARMII::UnaryDP;
-  if (!isUnary) {
-    if (ImplicitRn)
-      // Special handling for implicit use (e.g. PC).
-      Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
-    else {
-      Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift;
-      ++OpIdx;
-    }
-  }
-
-  // Encode shifter operand.
-  const MachineOperand &MO = MI.getOperand(OpIdx);
-  if ((MCID.TSFlags & ARMII::FormMask) == ARMII::DPSoRegFrm) {
-    // Encode SoReg.
-    emitWordLE(Binary | getMachineSoRegOpValue(MI, MCID, MO, OpIdx));
-    return;
-  }
-
-  if (MO.isReg()) {
-    // Encode register Rm.
-    emitWordLE(Binary | II->getRegisterInfo().getEncodingValue(MO.getReg()));
-    return;
-  }
-
-  // Encode so_imm.
-  Binary |= getMachineSoImmOpValue((unsigned)MO.getImm());
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI,
-                                              unsigned ImplicitRd,
-                                              unsigned ImplicitRn) {
-  const MCInstrDesc &MCID = MI.getDesc();
-  unsigned Form = MCID.TSFlags & ARMII::FormMask;
-  bool IsPrePost = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // If this is an LDRi12, STRi12 or LDRcp, nothing more needs be done.
-  if (MI.getOpcode() == ARM::LDRi12 || MI.getOpcode() == ARM::LDRcp ||
-      MI.getOpcode() == ARM::STRi12) {
-    emitWordLE(Binary);
-    return;
-  }
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  unsigned OpIdx = 0;
-
-  // Operand 0 of a pre- and post-indexed store is the address base
-  // writeback. Skip it.
-  bool Skipped = false;
-  if (IsPrePost && Form == ARMII::StFrm) {
-    ++OpIdx;
-    Skipped = true;
-  }
-
-  // Set first operand
-  if (ImplicitRd)
-    // Special handling for implicit use (e.g. PC).
-    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRd) << ARMII::RegRdShift);
-  else
-    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
-
-  // Set second operand
-  if (ImplicitRn)
-    // Special handling for implicit use (e.g. PC).
-    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
-  else
-    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
-
-  // If this is a two-address operand, skip it. e.g. LDR_PRE.
-  if (!Skipped && MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
-    ++OpIdx;
-
-  const MachineOperand &MO2 = MI.getOperand(OpIdx);
-  unsigned AM2Opc = (ImplicitRn == ARM::PC)
-    ? 0 : MI.getOperand(OpIdx+1).getImm();
-
-  // Set bit U(23) according to sign of immed value (positive or negative).
-  Binary |= ((ARM_AM::getAM2Op(AM2Opc) == ARM_AM::add ? 1 : 0) <<
-             ARMII::U_BitShift);
-  if (!MO2.getReg()) { // is immediate
-    if (ARM_AM::getAM2Offset(AM2Opc))
-      // Set the value of offset_12 field
-      Binary |= ARM_AM::getAM2Offset(AM2Opc);
-    emitWordLE(Binary);
-    return;
-  }
-
-  // Set bit I(25), because this is not in immediate encoding.
-  Binary |= 1 << ARMII::I_BitShift;
-  assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
-  // Set bit[3:0] to the corresponding Rm register
-  Binary |= II->getRegisterInfo().getEncodingValue(MO2.getReg());
-
-  // If this instr is in scaled register offset/index instruction, set
-  // shift_immed(bit[11:7]) and shift(bit[6:5]) fields.
-  if (unsigned ShImm = ARM_AM::getAM2Offset(AM2Opc)) {
-    Binary |= getShiftOp(AM2Opc) << ARMII::ShiftImmShift;  // shift
-    Binary |= ShImm              << ARMII::ShiftShift;     // shift_immed
-  }
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI,
-                                                  unsigned ImplicitRn) {
-  const MCInstrDesc &MCID = MI.getDesc();
-  unsigned Form = MCID.TSFlags & ARMII::FormMask;
-  bool IsPrePost = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  unsigned OpIdx = 0;
-
-  // Operand 0 of a pre- and post-indexed store is the address base
-  // writeback. Skip it.
-  bool Skipped = false;
-  if (IsPrePost && Form == ARMII::StMiscFrm) {
-    ++OpIdx;
-    Skipped = true;
-  }
-
-  // Set first operand
-  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
-
-  // Skip LDRD and STRD's second operand.
-  if (MCID.Opcode == ARM::LDRD || MCID.Opcode == ARM::STRD)
-    ++OpIdx;
-
-  // Set second operand
-  if (ImplicitRn)
-    // Special handling for implicit use (e.g. PC).
-    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
-  else
-    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
-
-  // If this is a two-address operand, skip it. e.g. LDRH_POST.
-  if (!Skipped && MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
-    ++OpIdx;
-
-  const MachineOperand &MO2 = MI.getOperand(OpIdx);
-  unsigned AM3Opc = (ImplicitRn == ARM::PC)
-    ? 0 : MI.getOperand(OpIdx+1).getImm();
-
-  // Set bit U(23) according to sign of immed value (positive or negative)
-  Binary |= ((ARM_AM::getAM3Op(AM3Opc) == ARM_AM::add ? 1 : 0) <<
-             ARMII::U_BitShift);
-
-  // If this instr is in register offset/index encoding, set bit[3:0]
-  // to the corresponding Rm register.
-  if (MO2.getReg()) {
-    Binary |= II->getRegisterInfo().getEncodingValue(MO2.getReg());
-    emitWordLE(Binary);
-    return;
-  }
-
-  // This instr is in immediate offset/index encoding, set bit 22 to 1.
-  Binary |= 1 << ARMII::AM3_I_BitShift;
-  if (unsigned ImmOffs = ARM_AM::getAM3Offset(AM3Opc)) {
-    // Set operands
-    Binary |= (ImmOffs >> 4) << ARMII::ImmHiShift;  // immedH
-    Binary |= (ImmOffs & 0xF);                      // immedL
-  }
-
-  emitWordLE(Binary);
-}
-
-static unsigned getAddrModeUPBits(unsigned Mode) {
-  unsigned Binary = 0;
-
-  // Set addressing mode by modifying bits U(23) and P(24)
-  // IA - Increment after  - bit U = 1 and bit P = 0
-  // IB - Increment before - bit U = 1 and bit P = 1
-  // DA - Decrement after  - bit U = 0 and bit P = 0
-  // DB - Decrement before - bit U = 0 and bit P = 1
-  switch (Mode) {
-  default: llvm_unreachable("Unknown addressing sub-mode!");
-  case ARM_AM::da:                                     break;
-  case ARM_AM::db: Binary |= 0x1 << ARMII::P_BitShift; break;
-  case ARM_AM::ia: Binary |= 0x1 << ARMII::U_BitShift; break;
-  case ARM_AM::ib: Binary |= 0x3 << ARMII::U_BitShift; break;
-  }
-
-  return Binary;
-}
-
-void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-  bool IsUpdating = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Skip operand 0 of an instruction with base register update.
-  unsigned OpIdx = 0;
-  if (IsUpdating)
-    ++OpIdx;
-
-  // Set base address operand
-  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
-
-  // Set addressing mode by modifying bits U(23) and P(24)
-  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode());
-  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode));
-
-  // Set bit W(21)
-  if (IsUpdating)
-    Binary |= 0x1 << ARMII::W_BitShift;
-
-  // Set registers
-  for (unsigned i = OpIdx+2, e = MI.getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI.getOperand(i);
-    if (!MO.isReg() || MO.isImplicit())
-      break;
-    unsigned RegNum = II->getRegisterInfo().getEncodingValue(MO.getReg());
-    assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
-           RegNum < 16);
-    Binary |= 0x1 << RegNum;
-  }
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitMulFrmInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Encode S bit if MI modifies CPSR.
-  Binary |= getAddrModeSBit(MI, MCID);
-
-  // 32x32->64bit operations have two destination registers. The number
-  // of register definitions will tell us if that's what we're dealing with.
-  unsigned OpIdx = 0;
-  if (MCID.getNumDefs() == 2)
-    Binary |= getMachineOpValue (MI, OpIdx++) << ARMII::RegRdLoShift;
-
-  // Encode Rd
-  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdHiShift;
-
-  // Encode Rm
-  Binary |= getMachineOpValue(MI, OpIdx++);
-
-  // Encode Rs
-  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRsShift;
-
-  // Many multiple instructions (e.g. MLA) have three src operands. Encode
-  // it as Rn (for multiply, that's in the same offset as RdLo.
-  if (MCID.getNumOperands() > OpIdx &&
-      !MCID.OpInfo[OpIdx].isPredicate() &&
-      !MCID.OpInfo[OpIdx].isOptionalDef())
-    Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRdLoShift;
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitExtendInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  unsigned OpIdx = 0;
-
-  // Encode Rd
-  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
-
-  const MachineOperand &MO1 = MI.getOperand(OpIdx++);
-  const MachineOperand &MO2 = MI.getOperand(OpIdx);
-  if (MO2.isReg()) {
-    // Two register operand form.
-    // Encode Rn.
-    Binary |= getMachineOpValue(MI, MO1) << ARMII::RegRnShift;
-
-    // Encode Rm.
-    Binary |= getMachineOpValue(MI, MO2);
-    ++OpIdx;
-  } else {
-    Binary |= getMachineOpValue(MI, MO1);
-  }
-
-  // Encode rot imm (0, 8, 16, or 24) if it has a rotate immediate operand.
-  if (MI.getOperand(OpIdx).isImm() &&
-      !MCID.OpInfo[OpIdx].isPredicate() &&
-      !MCID.OpInfo[OpIdx].isOptionalDef())
-    Binary |= (getMachineOpValue(MI, OpIdx) / 8) << ARMII::ExtRotImmShift;
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // PKH instructions are finished at this point
-  if (MCID.Opcode == ARM::PKHBT || MCID.Opcode == ARM::PKHTB) {
-    emitWordLE(Binary);
-    return;
-  }
-
-  unsigned OpIdx = 0;
-
-  // Encode Rd
-  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
-
-  const MachineOperand &MO = MI.getOperand(OpIdx++);
-  if (OpIdx == MCID.getNumOperands() ||
-      MCID.OpInfo[OpIdx].isPredicate() ||
-      MCID.OpInfo[OpIdx].isOptionalDef()) {
-    // Encode Rm and it's done.
-    Binary |= getMachineOpValue(MI, MO);
-    emitWordLE(Binary);
-    return;
-  }
-
-  // Encode Rn.
-  Binary |= getMachineOpValue(MI, MO) << ARMII::RegRnShift;
-
-  // Encode Rm.
-  Binary |= getMachineOpValue(MI, OpIdx++);
-
-  // Encode shift_imm.
-  unsigned ShiftAmt = MI.getOperand(OpIdx).getImm();
-  if (MCID.Opcode == ARM::PKHTB) {
-    assert(ShiftAmt != 0 && "PKHTB shift_imm is 0!");
-    if (ShiftAmt == 32)
-      ShiftAmt = 0;
-  }
-  assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
-  Binary |= ShiftAmt << ARMII::ShiftShift;
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitSaturateInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-
-  // Part of binary is determined by TableGen.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Encode Rd
-  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
-
-  // Encode saturate bit position.
-  unsigned Pos = MI.getOperand(1).getImm();
-  if (MCID.Opcode == ARM::SSAT || MCID.Opcode == ARM::SSAT16)
-    Pos -= 1;
-  assert((Pos < 16 || (Pos < 32 &&
-                       MCID.Opcode != ARM::SSAT16 &&
-                       MCID.Opcode != ARM::USAT16)) &&
-         "saturate bit position out of range");
-  Binary |= Pos << 16;
-
-  // Encode Rm
-  Binary |= getMachineOpValue(MI, 2);
-
-  // Encode shift_imm.
-  if (MCID.getNumOperands() == 4) {
-    unsigned ShiftOp = MI.getOperand(3).getImm();
-    ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp);
-    if (Opc == ARM_AM::asr)
-      Binary |= (1 << 6);
-    unsigned ShiftAmt = MI.getOperand(3).getImm();
-    if (ShiftAmt == 32 && Opc == ARM_AM::asr)
-      ShiftAmt = 0;
-    assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
-    Binary |= ShiftAmt << ARMII::ShiftShift;
-  }
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitBranchInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-
-  if (MCID.Opcode == ARM::TPsoft) {
-    llvm_unreachable("ARM::TPsoft FIXME"); // FIXME
-  }
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Set signed_immed_24 field
-  Binary |= getMachineOpValue(MI, 0);
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitInlineJumpTable(unsigned JTIndex) {
-  // Remember the base address of the inline jump table.
-  uintptr_t JTBase = MCE.getCurrentPCValue();
-  JTI->addJumpTableBaseAddr(JTIndex, JTBase);
-  DEBUG(errs() << "  ** Jump Table #" << JTIndex << " @ " << (void*)JTBase
-               << '\n');
-
-  // Now emit the jump table entries.
-  const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs;
-  for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
-    if (IsPIC)
-      // DestBB address - JT base.
-      emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_pic_jt, JTBase);
-    else
-      // Absolute DestBB address.
-      emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_absolute);
-    emitWordLE(0);
-  }
-}
-
-void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-
-  // Handle jump tables.
-  if (MCID.Opcode == ARM::BR_JTr || MCID.Opcode == ARM::BR_JTadd) {
-    // First emit a ldr pc, [] instruction.
-    emitDataProcessingInstruction(MI, ARM::PC);
-
-    // Then emit the inline jump table.
-    unsigned JTIndex =
-      (MCID.Opcode == ARM::BR_JTr)
-      ? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex();
-    emitInlineJumpTable(JTIndex);
-    return;
-  } else if (MCID.Opcode == ARM::BR_JTm) {
-    // First emit a ldr pc, [] instruction.
-    emitLoadStoreInstruction(MI, ARM::PC);
-
-    // Then emit the inline jump table.
-    emitInlineJumpTable(MI.getOperand(3).getIndex());
-    return;
-  }
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  if (MCID.Opcode == ARM::BX_RET || MCID.Opcode == ARM::MOVPCLR)
-    // The return register is LR.
-    Binary |= II->getRegisterInfo().getEncodingValue(ARM::LR);
-  else
-    // otherwise, set the return register
-    Binary |= getMachineOpValue(MI, 0);
-
-  emitWordLE(Binary);
-}
-
-unsigned ARMCodeEmitter::encodeVFPRd(const MachineInstr &MI,
-                                     unsigned OpIdx) const {
-  unsigned RegD = MI.getOperand(OpIdx).getReg();
-  unsigned Binary = 0;
-  bool isSPVFP = ARM::SPRRegClass.contains(RegD);
-  RegD = II->getRegisterInfo().getEncodingValue(RegD);
-  if (!isSPVFP)
-    Binary |=   RegD               << ARMII::RegRdShift;
-  else {
-    Binary |= ((RegD & 0x1E) >> 1) << ARMII::RegRdShift;
-    Binary |=  (RegD & 0x01)       << ARMII::D_BitShift;
-  }
-  return Binary;
-}
-
-unsigned ARMCodeEmitter::encodeVFPRn(const MachineInstr &MI,
-                                     unsigned OpIdx) const {
-  unsigned RegN = MI.getOperand(OpIdx).getReg();
-  unsigned Binary = 0;
-  bool isSPVFP = ARM::SPRRegClass.contains(RegN);
-  RegN = II->getRegisterInfo().getEncodingValue(RegN);
-  if (!isSPVFP)
-    Binary |=   RegN               << ARMII::RegRnShift;
-  else {
-    Binary |= ((RegN & 0x1E) >> 1) << ARMII::RegRnShift;
-    Binary |=  (RegN & 0x01)       << ARMII::N_BitShift;
-  }
-  return Binary;
-}
-
-unsigned ARMCodeEmitter::encodeVFPRm(const MachineInstr &MI,
-                                     unsigned OpIdx) const {
-  unsigned RegM = MI.getOperand(OpIdx).getReg();
-  unsigned Binary = 0;
-  bool isSPVFP = ARM::SPRRegClass.contains(RegM);
-  RegM = II->getRegisterInfo().getEncodingValue(RegM);
-  if (!isSPVFP)
-    Binary |=   RegM;
-  else {
-    Binary |= ((RegM & 0x1E) >> 1);
-    Binary |=  (RegM & 0x01)       << ARMII::M_BitShift;
-  }
-  return Binary;
-}
-
-void ARMCodeEmitter::emitVFPArithInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  unsigned OpIdx = 0;
-  assert((Binary & ARMII::D_BitShift) == 0 &&
-         (Binary & ARMII::N_BitShift) == 0 &&
-         (Binary & ARMII::M_BitShift) == 0 && "VFP encoding bug!");
-
-  // Encode Dd / Sd.
-  Binary |= encodeVFPRd(MI, OpIdx++);
-
-  // If this is a two-address operand, skip it, e.g. FMACD.
-  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
-    ++OpIdx;
-
-  // Encode Dn / Sn.
-  if ((MCID.TSFlags & ARMII::FormMask) == ARMII::VFPBinaryFrm)
-    Binary |= encodeVFPRn(MI, OpIdx++);
-
-  if (OpIdx == MCID.getNumOperands() ||
-      MCID.OpInfo[OpIdx].isPredicate() ||
-      MCID.OpInfo[OpIdx].isOptionalDef()) {
-    // FCMPEZD etc. has only one operand.
-    emitWordLE(Binary);
-    return;
-  }
-
-  // Encode Dm / Sm.
-  Binary |= encodeVFPRm(MI, OpIdx);
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitVFPConversionInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-  unsigned Form = MCID.TSFlags & ARMII::FormMask;
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  switch (Form) {
-  default: break;
-  case ARMII::VFPConv1Frm:
-  case ARMII::VFPConv2Frm:
-  case ARMII::VFPConv3Frm:
-    // Encode Dd / Sd.
-    Binary |= encodeVFPRd(MI, 0);
-    break;
-  case ARMII::VFPConv4Frm:
-    // Encode Dn / Sn.
-    Binary |= encodeVFPRn(MI, 0);
-    break;
-  case ARMII::VFPConv5Frm:
-    // Encode Dm / Sm.
-    Binary |= encodeVFPRm(MI, 0);
-    break;
-  }
-
-  switch (Form) {
-  default: break;
-  case ARMII::VFPConv1Frm:
-    // Encode Dm / Sm.
-    Binary |= encodeVFPRm(MI, 1);
-    break;
-  case ARMII::VFPConv2Frm:
-  case ARMII::VFPConv3Frm:
-    // Encode Dn / Sn.
-    Binary |= encodeVFPRn(MI, 1);
-    break;
-  case ARMII::VFPConv4Frm:
-  case ARMII::VFPConv5Frm:
-    // Encode Dd / Sd.
-    Binary |= encodeVFPRd(MI, 1);
-    break;
-  }
-
-  if (Form == ARMII::VFPConv5Frm)
-    // Encode Dn / Sn.
-    Binary |= encodeVFPRn(MI, 2);
-  else if (Form == ARMII::VFPConv3Frm)
-    // Encode Dm / Sm.
-    Binary |= encodeVFPRm(MI, 2);
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitVFPLoadStoreInstruction(const MachineInstr &MI) {
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  unsigned OpIdx = 0;
-
-  // Encode Dd / Sd.
-  Binary |= encodeVFPRd(MI, OpIdx++);
-
-  // Encode address base.
-  const MachineOperand &Base = MI.getOperand(OpIdx++);
-  Binary |= getMachineOpValue(MI, Base) << ARMII::RegRnShift;
-
-  // If there is a non-zero immediate offset, encode it.
-  if (Base.isReg()) {
-    const MachineOperand &Offset = MI.getOperand(OpIdx);
-    if (unsigned ImmOffs = ARM_AM::getAM5Offset(Offset.getImm())) {
-      if (ARM_AM::getAM5Op(Offset.getImm()) == ARM_AM::add)
-        Binary |= 1 << ARMII::U_BitShift;
-      Binary |= ImmOffs;
-      emitWordLE(Binary);
-      return;
-    }
-  }
-
-  // If immediate offset is omitted, default to +0.
-  Binary |= 1 << ARMII::U_BitShift;
-
-  emitWordLE(Binary);
-}
-
-void
-ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-  bool IsUpdating = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
-
-  // Part of binary is determined by TableGn.
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
-
-  // Skip operand 0 of an instruction with base register update.
-  unsigned OpIdx = 0;
-  if (IsUpdating)
-    ++OpIdx;
-
-  // Set base address operand
-  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
-
-  // Set addressing mode by modifying bits U(23) and P(24)
-  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode());
-  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode));
-
-  // Set bit W(21)
-  if (IsUpdating)
-    Binary |= 0x1 << ARMII::W_BitShift;
-
-  // First register is encoded in Dd.
-  Binary |= encodeVFPRd(MI, OpIdx+2);
-
-  // Count the number of registers.
-  unsigned NumRegs = 1;
-  for (unsigned i = OpIdx+3, e = MI.getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI.getOperand(i);
-    if (!MO.isReg() || MO.isImplicit())
-      break;
-    ++NumRegs;
-  }
-  // Bit 8 will be set if <list> is consecutive 64-bit registers (e.g., D0)
-  // Otherwise, it will be 0, in the case of 32-bit registers.
-  if(Binary & 0x100)
-    Binary |= NumRegs * 2;
-  else
-    Binary |= NumRegs;
-
-  emitWordLE(Binary);
-}
-
-unsigned ARMCodeEmitter::encodeNEONRd(const MachineInstr &MI,
-                                      unsigned OpIdx) const {
-  unsigned RegD = MI.getOperand(OpIdx).getReg();
-  unsigned Binary = 0;
-  RegD = II->getRegisterInfo().getEncodingValue(RegD);
-  Binary |= (RegD & 0xf) << ARMII::RegRdShift;
-  Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift;
-  return Binary;
-}
-
-unsigned ARMCodeEmitter::encodeNEONRn(const MachineInstr &MI,
-                                      unsigned OpIdx) const {
-  unsigned RegN = MI.getOperand(OpIdx).getReg();
-  unsigned Binary = 0;
-  RegN = II->getRegisterInfo().getEncodingValue(RegN);
-  Binary |= (RegN & 0xf) << ARMII::RegRnShift;
-  Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift;
-  return Binary;
-}
-
-unsigned ARMCodeEmitter::encodeNEONRm(const MachineInstr &MI,
-                                      unsigned OpIdx) const {
-  unsigned RegM = MI.getOperand(OpIdx).getReg();
-  unsigned Binary = 0;
-  RegM = II->getRegisterInfo().getEncodingValue(RegM);
-  Binary |= (RegM & 0xf);
-  Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift;
-  return Binary;
-}
-
-/// convertNEONDataProcToThumb - Convert the ARM mode encoding for a NEON
-/// data-processing instruction to the corresponding Thumb encoding.
-static unsigned convertNEONDataProcToThumb(unsigned Binary) {
-  assert((Binary & 0xfe000000) == 0xf2000000 &&
-         "not an ARM NEON data-processing instruction");
-  unsigned UBit = (Binary >> 24) & 1;
-  return 0xef000000 | (UBit << 28) | (Binary & 0xffffff);
-}
-
-void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) {
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  unsigned RegTOpIdx, RegNOpIdx, LnOpIdx;
-  const MCInstrDesc &MCID = MI.getDesc();
-  if ((MCID.TSFlags & ARMII::FormMask) == ARMII::NGetLnFrm) {
-    RegTOpIdx = 0;
-    RegNOpIdx = 1;
-    LnOpIdx = 2;
-  } else { // ARMII::NSetLnFrm
-    RegTOpIdx = 2;
-    RegNOpIdx = 0;
-    LnOpIdx = 3;
-  }
-
-  // Set the conditional execution predicate
-  Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
-
-  unsigned RegT = MI.getOperand(RegTOpIdx).getReg();
-  RegT = II->getRegisterInfo().getEncodingValue(RegT);
-  Binary |= (RegT << ARMII::RegRdShift);
-  Binary |= encodeNEONRn(MI, RegNOpIdx);
-
-  unsigned LaneShift;
-  if ((Binary & (1 << 22)) != 0)
-    LaneShift = 0; // 8-bit elements
-  else if ((Binary & (1 << 5)) != 0)
-    LaneShift = 1; // 16-bit elements
-  else
-    LaneShift = 2; // 32-bit elements
-
-  unsigned Lane = MI.getOperand(LnOpIdx).getImm() << LaneShift;
-  unsigned Opc1 = Lane >> 2;
-  unsigned Opc2 = Lane & 3;
-  assert((Opc1 & 3) == 0 && "out-of-range lane number operand");
-  Binary |= (Opc1 << 21);
-  Binary |= (Opc2 << 5);
-
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) {
-  unsigned Binary = getBinaryCodeForInstr(MI);
-
-  // Set the conditional execution predicate
-  Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
-
-  unsigned RegT = MI.getOperand(1).getReg();
-  RegT = II->getRegisterInfo().getEncodingValue(RegT);
-  Binary |= (RegT << ARMII::RegRdShift);
-  Binary |= encodeNEONRn(MI, 0);
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitNEON1RegModImmInstruction(const MachineInstr &MI) {
-  unsigned Binary = getBinaryCodeForInstr(MI);
-  // Destination register is encoded in Dd.
-  Binary |= encodeNEONRd(MI, 0);
-  // Immediate fields: Op, Cmode, I, Imm3, Imm4
-  unsigned Imm = MI.getOperand(1).getImm();
-  unsigned Op = (Imm >> 12) & 1;
-  unsigned Cmode = (Imm >> 8) & 0xf;
-  unsigned I = (Imm >> 7) & 1;
-  unsigned Imm3 = (Imm >> 4) & 0x7;
-  unsigned Imm4 = Imm & 0xf;
-  Binary |= (I << 24) | (Imm3 << 16) | (Cmode << 8) | (Op << 5) | Imm4;
-  if (IsThumb)
-    Binary = convertNEONDataProcToThumb(Binary);
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitNEON2RegInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-  unsigned Binary = getBinaryCodeForInstr(MI);
-  // Destination register is encoded in Dd; source register in Dm.
-  unsigned OpIdx = 0;
-  Binary |= encodeNEONRd(MI, OpIdx++);
-  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
-    ++OpIdx;
-  Binary |= encodeNEONRm(MI, OpIdx);
-  if (IsThumb)
-    Binary = convertNEONDataProcToThumb(Binary);
-  // FIXME: This does not handle VDUPfdf or VDUPfqf.
-  emitWordLE(Binary);
-}
-
-void ARMCodeEmitter::emitNEON3RegInstruction(const MachineInstr &MI) {
-  const MCInstrDesc &MCID = MI.getDesc();
-  unsigned Binary = getBinaryCodeForInstr(MI);
-  // Destination register is encoded in Dd; source registers in Dn and Dm.
-  unsigned OpIdx = 0;
-  Binary |= encodeNEONRd(MI, OpIdx++);
-  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
-    ++OpIdx;
-  Binary |= encodeNEONRn(MI, OpIdx++);
-  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
-    ++OpIdx;
-  Binary |= encodeNEONRm(MI, OpIdx);
-  if (IsThumb)
-    Binary = convertNEONDataProcToThumb(Binary);
-  // FIXME: This does not handle VMOVDneon or VMOVQ.
-  emitWordLE(Binary);
-}
-
-#include "ARMGenCodeEmitter.inc"

diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index ce264ee..29405eb 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp

@@ -275,6 +275,7 @@
 
   private:
     void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+    bool BBHasFallthrough(MachineBasicBlock *MBB);
     CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
     unsigned getCPELogAlign(const MachineInstr *CPEMI);
     void scanFunctionJumpTables();
@@ -382,7 +383,9 @@
                << MCP->getConstants().size() << " CP entries, aligned to "
                << MCP->getConstantPoolAlignment() << " bytes *****\n");
 
-  TII = (const ARMBaseInstrInfo*)MF->getTarget().getInstrInfo();
+  TII = (const ARMBaseInstrInfo *)MF->getTarget()
+            .getSubtargetImpl()
+            ->getInstrInfo();
   AFI = MF->getInfo<ARMFunctionInfo>();
   STI = &MF->getTarget().getSubtarget<ARMSubtarget>();
 
@@ -529,7 +532,7 @@
   // identity mapping of CPI's to CPE's.
   const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
 
-  const DataLayout &TD = *MF->getTarget().getDataLayout();
+  const DataLayout &TD = *MF->getSubtarget().getDataLayout();
   for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
     unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
     assert(Size >= 4 && "Too small constant pool entry");
@@ -554,9 +557,7 @@
         InsPoint[a] = CPEMI;
 
     // Add a new CPEntry, but no corresponding CPUser yet.
-    std::vector<CPEntry> CPEs;
-    CPEs.push_back(CPEntry(CPEMI, i));
-    CPEntries.push_back(CPEs);
+    CPEntries.emplace_back(1, CPEntry(CPEMI, i));
     ++NumCPEs;
     DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
                  << Size << ", align = " << Align <<'\n');
@@ -566,7 +567,7 @@
 
 /// BBHasFallthrough - Return true if the specified basic block can fallthrough
 /// into the block immediately after it.
-static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) {
   // Get the next machine basic block in the function.
   MachineFunction::iterator MBBI = MBB;
   // Can't fall off end of function.
@@ -574,12 +575,15 @@
     return false;
 
   MachineBasicBlock *NextBB = std::next(MBBI);
-  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
-       E = MBB->succ_end(); I != E; ++I)
-    if (*I == NextBB)
-      return true;
+  if (std::find(MBB->succ_begin(), MBB->succ_end(), NextBB) == MBB->succ_end())
+    return false;
 
-  return false;
+  // Try to analyze the end of the block. A potential fallthrough may already
+  // have an unconditional branch for whatever reason.
+  MachineBasicBlock *TBB, *FBB;
+  SmallVector<MachineOperand, 4> Cond;
+  bool TooDifficult = TII->AnalyzeBranch(*MBB, TBB, FBB, Cond);
+  return TooDifficult || FBB == nullptr;
 }
 
 /// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
@@ -1203,7 +1207,8 @@
     unsigned Growth;
     if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
         (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
-         NewWaterList.count(WaterBB)) && Growth < BestGrowth) {
+         NewWaterList.count(WaterBB) || WaterBB == U.MI->getParent()) &&
+        Growth < BestGrowth) {
       // This is the least amount of required padding seen so far.
       BestGrowth = Growth;
       WaterIter = IP;
@@ -1309,7 +1314,12 @@
   // Back past any possible branches (allow for a conditional and a maximally
   // long unconditional).
   if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
-    BaseInsertOffset = UserBBI.postOffset() - UPad - 8;
+    // Ensure BaseInsertOffset is larger than the offset of the instruction
+    // following UserMI so that the loop which searches for the split point
+    // iterates at least once.
+    BaseInsertOffset =
+        std::max(UserBBI.postOffset() - UPad - 8,
+                 UserOffset + TII->GetInstSizeInBytes(UserMI) + 1);
     DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
   }
   unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
@@ -1352,6 +1362,11 @@
     if (CC != ARMCC::AL)
       MI = LastIT;
   }
+
+  // We really must not split an IT block.
+  DEBUG(unsigned PredReg;
+        assert(!isThumb || getITInstrPredicate(MI, PredReg) == ARMCC::AL));
+
   NewMBB = splitBlockBeforeInstr(MI);
 }
 

diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index c7a8415..13bef54 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
-#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
+#define LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
 
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/Support/Casting.h"

diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 51d3dbb..2d80518 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp

@@ -867,7 +867,7 @@
       if (RI.hasBasePointer(MF)) {
         int32_t NumBytes = AFI->getFramePtrSpillOffset();
         unsigned FramePtr = RI.getFrameRegister(MF);
-        assert(MF.getTarget().getFrameLowering()->hasFP(MF) &&
+        assert(MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
                "base pointer without frame pointer?");
 
         if (AFI->isThumb2Function()) {
@@ -1343,8 +1343,9 @@
 
 bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
   const TargetMachine &TM = MF.getTarget();
-  TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
-  TRI = TM.getRegisterInfo();
+  TII = static_cast<const ARMBaseInstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
+  TRI = TM.getSubtargetImpl()->getRegisterInfo();
   STI = &TM.getSubtarget<ARMSubtarget>();
   AFI = MF.getInfo<ARMFunctionInfo>();
 

diff --git a/lib/Target/ARM/ARMFPUName.def b/lib/Target/ARM/ARMFPUName.def
index 1fef3b3..34ce85d 100644
--- a/lib/Target/ARM/ARMFPUName.def
+++ b/lib/Target/ARM/ARMFPUName.def

@@ -23,6 +23,7 @@
 ARM_FPU_NAME("vfpv3-d16", VFPV3_D16)
 ARM_FPU_NAME("vfpv4", VFPV4)
 ARM_FPU_NAME("vfpv4-d16", VFPV4_D16)
+ARM_FPU_NAME("fpv5-d16", FPV5_D16)
 ARM_FPU_NAME("fp-armv8", FP_ARMV8)
 ARM_FPU_NAME("neon", NEON)
 ARM_FPU_NAME("neon-vfpv4", NEON_VFPV4)

diff --git a/lib/Target/ARM/ARMFPUName.h b/lib/Target/ARM/ARMFPUName.h
index 2a64cce..86acffb 100644
--- a/lib/Target/ARM/ARMFPUName.h
+++ b/lib/Target/ARM/ARMFPUName.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMFPUNAME_H
-#define ARMFPUNAME_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMFPUNAME_H
+#define LLVM_LIB_TARGET_ARM_ARMFPUNAME_H
 
 namespace llvm {
 namespace ARM {
@@ -23,4 +23,4 @@
 } // namespace ARM
 } // namespace llvm
 
-#endif // ARMFPUNAME_H
+#endif

diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index e2d90cd..a5f635e 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp

@@ -92,11 +92,11 @@
   public:
     explicit ARMFastISel(FunctionLoweringInfo &funcInfo,
                          const TargetLibraryInfo *libInfo)
-    : FastISel(funcInfo, libInfo),
-      M(const_cast<Module&>(*funcInfo.Fn->getParent())),
-      TM(funcInfo.MF->getTarget()),
-      TII(*TM.getInstrInfo()),
-      TLI(*TM.getTargetLowering()) {
+        : FastISel(funcInfo, libInfo),
+          M(const_cast<Module &>(*funcInfo.Fn->getParent())),
+          TM(funcInfo.MF->getTarget()),
+          TII(*TM.getSubtargetImpl()->getInstrInfo()),
+          TLI(*TM.getSubtargetImpl()->getTargetLowering()) {
       Subtarget = &TM.getSubtarget<ARMSubtarget>();
       AFI = funcInfo.MF->getInfo<ARMFunctionInfo>();
       isThumb2 = AFI->isThumbFunction();
@@ -105,39 +105,39 @@
 
     // Code from FastISel.cpp.
   private:
-    unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+    unsigned fastEmitInst_r(unsigned MachineInstOpcode,
                             const TargetRegisterClass *RC,
                             unsigned Op0, bool Op0IsKill);
-    unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+    unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
                              const TargetRegisterClass *RC,
                              unsigned Op0, bool Op0IsKill,
                              unsigned Op1, bool Op1IsKill);
-    unsigned FastEmitInst_rrr(unsigned MachineInstOpcode,
+    unsigned fastEmitInst_rrr(unsigned MachineInstOpcode,
                               const TargetRegisterClass *RC,
                               unsigned Op0, bool Op0IsKill,
                               unsigned Op1, bool Op1IsKill,
                               unsigned Op2, bool Op2IsKill);
-    unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+    unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
                              const TargetRegisterClass *RC,
                              unsigned Op0, bool Op0IsKill,
                              uint64_t Imm);
-    unsigned FastEmitInst_rri(unsigned MachineInstOpcode,
+    unsigned fastEmitInst_rri(unsigned MachineInstOpcode,
                               const TargetRegisterClass *RC,
                               unsigned Op0, bool Op0IsKill,
                               unsigned Op1, bool Op1IsKill,
                               uint64_t Imm);
-    unsigned FastEmitInst_i(unsigned MachineInstOpcode,
+    unsigned fastEmitInst_i(unsigned MachineInstOpcode,
                             const TargetRegisterClass *RC,
                             uint64_t Imm);
 
     // Backend specific FastISel code.
   private:
-    bool TargetSelectInstruction(const Instruction *I) override;
-    unsigned TargetMaterializeConstant(const Constant *C) override;
-    unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
+    bool fastSelectInstruction(const Instruction *I) override;
+    unsigned fastMaterializeConstant(const Constant *C) override;
+    unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
     bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                              const LoadInst *LI) override;
-    bool FastLowerArguments() override;
+    bool fastLowerArguments() override;
   private:
   #include "ARMGenFastISel.inc"
 
@@ -189,7 +189,9 @@
     unsigned ARMSelectCallOp(bool UseReg);
     unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT);
 
-    const TargetLowering *getTargetLowering() { return TM.getTargetLowering(); }
+    const TargetLowering *getTargetLowering() {
+      return TM.getSubtargetImpl()->getTargetLowering();
+    }
 
     // Call handling routines.
   private:
@@ -283,7 +285,7 @@
   return MIB;
 }
 
-unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
                                      const TargetRegisterClass *RC,
                                      unsigned Op0, bool Op0IsKill) {
   unsigned ResultReg = createResultReg(RC);
@@ -305,7 +307,7 @@
   return ResultReg;
 }
 
-unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
                                       const TargetRegisterClass *RC,
                                       unsigned Op0, bool Op0IsKill,
                                       unsigned Op1, bool Op1IsKill) {
@@ -333,7 +335,7 @@
   return ResultReg;
 }
 
-unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
+unsigned ARMFastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
                                        const TargetRegisterClass *RC,
                                        unsigned Op0, bool Op0IsKill,
                                        unsigned Op1, bool Op1IsKill,
@@ -365,7 +367,7 @@
   return ResultReg;
 }
 
-unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
                                       const TargetRegisterClass *RC,
                                       unsigned Op0, bool Op0IsKill,
                                       uint64_t Imm) {
@@ -391,7 +393,7 @@
   return ResultReg;
 }
 
-unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
+unsigned ARMFastISel::fastEmitInst_rri(unsigned MachineInstOpcode,
                                        const TargetRegisterClass *RC,
                                        unsigned Op0, bool Op0IsKill,
                                        unsigned Op1, bool Op1IsKill,
@@ -421,7 +423,7 @@
   return ResultReg;
 }
 
-unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode,
+unsigned ARMFastISel::fastEmitInst_i(unsigned MachineInstOpcode,
                                      const TargetRegisterClass *RC,
                                      uint64_t Imm) {
   unsigned ResultReg = createResultReg(RC);
@@ -511,7 +513,7 @@
 unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
 
   if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
-    return false;
+    return 0;
 
   // If we can do this in a single instruction without a constant pool entry
   // do so now.
@@ -534,7 +536,9 @@
       (ARM_AM::getSOImmVal(Imm) != -1);
     if (UseImm) {
       unsigned Opc = isThumb2 ? ARM::t2MVNi : ARM::MVNi;
-      unsigned ImmReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+      const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass :
+                                                 &ARM::GPRRegClass;
+      unsigned ImmReg = createResultReg(RC);
       AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                               TII.get(Opc), ImmReg)
                       .addImm(Imm));
@@ -542,11 +546,16 @@
     }
   }
 
+  unsigned ResultReg = 0;
+  if (Subtarget->useMovt(*FuncInfo.MF))
+    ResultReg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+
+  if (ResultReg)
+    return ResultReg;
+
   // Load from constant pool.  For now 32-bit only.
   if (VT != MVT::i32)
-    return false;
-
-  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+    return 0;
 
   // MachineConstantPool wants an explicit alignment.
   unsigned Align = DL.getPrefTypeAlignment(C->getType());
@@ -555,21 +564,20 @@
     Align = DL.getTypeAllocSize(C->getType());
   }
   unsigned Idx = MCP.getConstantPoolIndex(C, Align);
-
+  ResultReg = createResultReg(TLI.getRegClassFor(VT));
   if (isThumb2)
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                            TII.get(ARM::t2LDRpci), DestReg)
-                    .addConstantPoolIndex(Idx));
+                            TII.get(ARM::t2LDRpci), ResultReg)
+                      .addConstantPoolIndex(Idx));
   else {
     // The extra immediate is for addrmode2.
-    DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
+    ResultReg = constrainOperandRegClass(TII.get(ARM::LDRcp), ResultReg, 0);
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                            TII.get(ARM::LDRcp), DestReg)
-                    .addConstantPoolIndex(Idx)
-                    .addImm(0));
+                            TII.get(ARM::LDRcp), ResultReg)
+                      .addConstantPoolIndex(Idx)
+                      .addImm(0));
   }
-
-  return DestReg;
+  return ResultReg;
 }
 
 unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
@@ -679,7 +687,7 @@
   return DestReg;
 }
 
-unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) {
+unsigned ARMFastISel::fastMaterializeConstant(const Constant *C) {
   EVT CEVT = TLI.getValueType(C->getType(), true);
 
   // Only handle simple types.
@@ -698,7 +706,7 @@
 
 // TODO: unsigned ARMFastISel::TargetMaterializeFloatZero(const ConstantFP *CF);
 
-unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+unsigned ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
   // Don't handle dynamic allocas.
   if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
 
@@ -901,7 +909,7 @@
   // Since the offset is too large for the load/store instruction
   // get the reg+offset into a register.
   if (needsLowering) {
-    Addr.Base.Reg = FastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg,
+    Addr.Base.Reg = fastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg,
                                  /*Op0IsKill*/false, Addr.Offset, MVT::i32);
     Addr.Offset = 0;
   }
@@ -1074,7 +1082,7 @@
   unsigned ResultReg;
   if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
     return false;
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1276,7 +1284,7 @@
       unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
       .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR);
-      FastEmitBranch(FBB, DbgLoc);
+      fastEmitBranch(FBB, DbgLoc);
       FuncInfo.MBB->addSuccessor(TBB);
       return true;
     }
@@ -1301,7 +1309,7 @@
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
       .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
 
-      FastEmitBranch(FBB, DbgLoc);
+      fastEmitBranch(FBB, DbgLoc);
       FuncInfo.MBB->addSuccessor(TBB);
       return true;
     }
@@ -1309,7 +1317,7 @@
              dyn_cast<ConstantInt>(BI->getCondition())) {
     uint64_t Imm = CI->getZExtValue();
     MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
-    FastEmitBranch(Target, DbgLoc);
+    fastEmitBranch(Target, DbgLoc);
     return true;
   }
 
@@ -1339,7 +1347,7 @@
   unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc))
                   .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
-  FastEmitBranch(FBB, DbgLoc);
+  fastEmitBranch(FBB, DbgLoc);
   FuncInfo.MBB->addSuccessor(TBB);
   return true;
 }
@@ -1497,13 +1505,13 @@
     (const TargetRegisterClass*)&ARM::GPRRegClass;
   unsigned DestReg = createResultReg(RC);
   Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0);
-  unsigned ZeroReg = TargetMaterializeConstant(Zero);
+  unsigned ZeroReg = fastMaterializeConstant(Zero);
   // ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR.
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovCCOpc), DestReg)
           .addReg(ZeroReg).addImm(1)
           .addImm(ARMPred).addReg(ARM::CPSR);
 
-  UpdateValueMap(I, DestReg);
+  updateValueMap(I, DestReg);
   return true;
 }
 
@@ -1522,7 +1530,7 @@
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                           TII.get(ARM::VCVTDS), Result)
                   .addReg(Op));
-  UpdateValueMap(I, Result);
+  updateValueMap(I, Result);
   return true;
 }
 
@@ -1541,7 +1549,7 @@
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                           TII.get(ARM::VCVTSD), Result)
                   .addReg(Op));
-  UpdateValueMap(I, Result);
+  updateValueMap(I, Result);
   return true;
 }
 
@@ -1585,7 +1593,7 @@
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                           TII.get(Opc), ResultReg).addReg(FP));
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1617,7 +1625,7 @@
   unsigned IntReg = ARMMoveToIntReg(DstVT, ResultReg);
   if (IntReg == 0) return false;
 
-  UpdateValueMap(I, IntReg);
+  updateValueMap(I, IntReg);
   return true;
 }
 
@@ -1693,7 +1701,7 @@
         .addImm(ARMCC::EQ)
         .addReg(ARM::CPSR);
   }
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1783,7 +1791,7 @@
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                           TII.get(Opc), ResultReg)
                   .addReg(SrcReg1).addReg(SrcReg2));
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1825,7 +1833,7 @@
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                           TII.get(Opc), ResultReg)
                   .addReg(Op1).addReg(Op2));
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1883,7 +1891,7 @@
                                   unsigned &NumBytes,
                                   bool isVarArg) {
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, ArgLocs, *Context);
   CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags,
                              CCAssignFnForCall(CC, false, isVarArg));
 
@@ -1941,6 +1949,7 @@
   // Process the args.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
+    const Value *ArgVal = Args[VA.getValNo()];
     unsigned Arg = ArgRegs[VA.getValNo()];
     MVT ArgVT = ArgVTs[VA.getValNo()];
 
@@ -1967,7 +1976,7 @@
         break;
       }
       case CCValAssign::BCvt: {
-        unsigned BC = FastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg,
+        unsigned BC = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg,
                                  /*TODO: Kill=*/false);
         assert(BC != 0 && "Failed to emit a bitcast!");
         Arg = BC;
@@ -2001,6 +2010,11 @@
     } else {
       assert(VA.isMemLoc());
       // Need to store on the stack.
+
+      // Don't emit stores for undef values.
+      if (isa<UndefValue>(ArgVal))
+        continue;
+
       Address Addr;
       Addr.BaseType = Address::RegBase;
       Addr.Base.Reg = ARM::SP;
@@ -2026,7 +2040,7 @@
   // Now the return value.
   if (RetVT != MVT::isVoid) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context);
     CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
 
     // Copy all of the result registers out of their specified physreg.
@@ -2045,7 +2059,7 @@
       UsedRegs.push_back(RVLocs[1].getLocReg());
 
       // Finally update the result.
-      UpdateValueMap(I, ResultReg);
+      updateValueMap(I, ResultReg);
     } else {
       assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!");
       MVT CopyVT = RVLocs[0].getValVT();
@@ -2063,7 +2077,7 @@
       UsedRegs.push_back(RVLocs[0].getLocReg());
 
       // Finally update the result.
-      UpdateValueMap(I, ResultReg);
+      updateValueMap(I, ResultReg);
     }
   }
 
@@ -2087,7 +2101,7 @@
 
     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
-    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,I->getContext());
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
     CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */,
                                                  F.isVarArg()));
 
@@ -2192,7 +2206,7 @@
   // Can't handle non-double multi-reg retvals.
   if (RetVT != MVT::isVoid && RetVT != MVT::i32) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
     CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false));
     if (RVLocs.size() >= 2 && RetVT != MVT::f64)
       return false;
@@ -2303,7 +2317,7 @@
   if (RetVT != MVT::isVoid && RetVT != MVT::i1 && RetVT != MVT::i8 &&
       RetVT != MVT::i16 && RetVT != MVT::i32) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, RVLocs, *Context);
     CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
     if (RVLocs.size() >= 2 && RetVT != MVT::f64)
       return false;
@@ -2487,7 +2501,8 @@
     }
 
     const ARMBaseRegisterInfo *RegInfo =
-          static_cast<const ARMBaseRegisterInfo*>(TM.getRegisterInfo());
+        static_cast<const ARMBaseRegisterInfo *>(
+            TM.getSubtargetImpl()->getRegisterInfo());
     unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
     unsigned SrcReg = FramePtr;
 
@@ -2505,7 +2520,7 @@
                       .addReg(SrcReg).addImm(0));
       SrcReg = DestReg;
     }
-    UpdateValueMap(&I, SrcReg);
+    updateValueMap(&I, SrcReg);
     return true;
   }
   case Intrinsic::memcpy:
@@ -2583,7 +2598,7 @@
 
   // Because the high bits are undefined, a truncate doesn't generate
   // any code.
-  UpdateValueMap(I, SrcReg);
+  updateValueMap(I, SrcReg);
   return true;
 }
 
@@ -2745,7 +2760,7 @@
   MVT DestVT = DestEVT.getSimpleVT();
   unsigned ResultReg = ARMEmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
   if (ResultReg == 0) return false;
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -2800,12 +2815,12 @@
   }
 
   AddOptionalDefs(MIB);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
 // TODO: SoftFP support.
-bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
+bool ARMFastISel::fastSelectInstruction(const Instruction *I) {
 
   switch (I->getOpcode()) {
     case Instruction::Load:
@@ -2983,7 +2998,7 @@
   return DestReg2;
 }
 
-bool ARMFastISel::FastLowerArguments() {
+bool ARMFastISel::fastLowerArguments() {
   if (!FuncInfo.CanLowerReturn)
     return false;
 
@@ -3050,7 +3065,7 @@
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY),
             ResultReg).addReg(DstReg, getKillRegState(true));
-    UpdateValueMap(I, ResultReg);
+    updateValueMap(I, ResultReg);
   }
 
   return true;

diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
index e191a3c..0c910ab 100644
--- a/lib/Target/ARM/ARMFeatures.h
+++ b/lib/Target/ARM/ARMFeatures.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TARGET_ARM_FEATURES_H
-#define TARGET_ARM_FEATURES_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMFEATURES_H
+#define LLVM_LIB_TARGET_ARM_ARMFEATURES_H
 
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 

diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index a67b360..80add7a 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp

@@ -47,7 +47,7 @@
 /// pointer register.  This is true if the function has variable sized allocas
 /// or if frame pointer elimination is disabled.
 bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
 
   // iOS requires FP not to be clobbered for backtracing purpose.
   if (STI.isTargetIOS())
@@ -137,12 +137,27 @@
 }
 
 static int sizeOfSPAdjustment(const MachineInstr *MI) {
-  assert(MI->getOpcode() == ARM::VSTMDDB_UPD);
+  int RegSize;
+  switch (MI->getOpcode()) {
+  case ARM::VSTMDDB_UPD:
+    RegSize = 8;
+    break;
+  case ARM::STMDB_UPD:
+  case ARM::t2STMDB_UPD:
+    RegSize = 4;
+    break;
+  case ARM::t2STR_PRE:
+  case ARM::STR_PRE_IMM:
+    return 4;
+  default:
+    llvm_unreachable("Unknown push or pop like instruction");
+  }
+
   int count = 0;
   // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
   // pred) so the list starts at 4.
   for (int i = MI->getNumOperands() - 1; i >= 4; --i)
-    count += 8;
+    count += RegSize;
   return count;
 }
 
@@ -154,6 +169,46 @@
   return StackSizeInBytes >= 4096;
 }
 
+namespace {
+struct StackAdjustingInsts {
+  struct InstInfo {
+    MachineBasicBlock::iterator I;
+    unsigned SPAdjust;
+    bool BeforeFPSet;
+  };
+
+  SmallVector<InstInfo, 4> Insts;
+
+  void addInst(MachineBasicBlock::iterator I, unsigned SPAdjust,
+               bool BeforeFPSet = false) {
+    InstInfo Info = {I, SPAdjust, BeforeFPSet};
+    Insts.push_back(Info);
+  }
+
+  void addExtraBytes(const MachineBasicBlock::iterator I, unsigned ExtraBytes) {
+    auto Info = std::find_if(Insts.begin(), Insts.end(),
+                             [&](InstInfo &Info) { return Info.I == I; });
+    assert(Info != Insts.end() && "invalid sp adjusting instruction");
+    Info->SPAdjust += ExtraBytes;
+  }
+
+  void emitDefCFAOffsets(MachineModuleInfo &MMI, MachineBasicBlock &MBB,
+                         DebugLoc dl, const ARMBaseInstrInfo &TII, bool HasFP) {
+    unsigned CFAOffset = 0;
+    for (auto &Info : Insts) {
+      if (HasFP && !Info.BeforeFPSet)
+        return;
+
+      CFAOffset -= Info.SPAdjust;
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+      BuildMI(MBB, std::next(Info.I), dl,
+              TII.get(TargetOpcode::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+    }
+  }
+};
+}
+
 void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -163,20 +218,20 @@
   MCContext &Context = MMI.getContext();
   const TargetMachine &TM = MF.getTarget();
   const MCRegisterInfo *MRI = Context.getRegisterInfo();
-  const ARMBaseRegisterInfo *RegInfo =
-    static_cast<const ARMBaseRegisterInfo*>(TM.getRegisterInfo());
-  const ARMBaseInstrInfo &TII =
-    *static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
+  const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
+                                    TM.getSubtargetImpl()->getInstrInfo());
   assert(!AFI->isThumb1OnlyFunction() &&
          "This emitPrologue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
-  unsigned Align = TM.getFrameLowering()->getStackAlignment();
+  unsigned Align =
+      TM.getSubtargetImpl()->getFrameLowering()->getStackAlignment();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   unsigned NumBytes = MFI->getStackSize();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
-  int CFAOffset = 0;
 
   // Determine the sizes of each callee-save spill areas and record which frame
   // belongs to which callee-save spill areas.
@@ -189,15 +244,13 @@
   if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
     return;
 
+  StackAdjustingInsts DefCFAOffsetCandidates;
+
   // Allocate the vararg register save area.
   if (ArgRegsSaveSize) {
     emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize,
                  MachineInstr::FrameSetup);
-    CFAOffset -= ArgRegsSaveSize;
-    unsigned CFIIndex = MMI.addFrameInst(
-        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
-    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+    DefCFAOffsetCandidates.addInst(std::prev(MBBI), ArgRegsSaveSize, true);
   }
 
   if (!AFI->hasStackFrame() &&
@@ -205,11 +258,8 @@
     if (NumBytes - ArgRegsSaveSize != 0) {
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize),
                    MachineInstr::FrameSetup);
-      CFAOffset -= NumBytes - ArgRegsSaveSize;
-      unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
-      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
+      DefCFAOffsetCandidates.addInst(std::prev(MBBI),
+                                     NumBytes - ArgRegsSaveSize, true);
     }
     return;
   }
@@ -252,21 +302,23 @@
   }
 
   // Move past area 1.
-  MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push,
-      DPRCSPush;
-  if (GPRCS1Size > 0)
+  MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push;
+  if (GPRCS1Size > 0) {
     GPRCS1Push = LastPush = MBBI++;
+    DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true);
+  }
 
   // Determine starting offsets of spill areas.
   bool HasFP = hasFP(MF);
-  unsigned DPRCSOffset  = NumBytes - (ArgRegsSaveSize + GPRCS1Size
-                                      + GPRCS2Size + DPRCSSize);
-  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
-  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  unsigned GPRCS1Offset = NumBytes - ArgRegsSaveSize - GPRCS1Size;
+  unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size;
+  unsigned DPRAlign = DPRCSSize ? std::min(8U, Align) : 4U;
+  unsigned DPRGapSize = (GPRCS1Size + GPRCS2Size + ArgRegsSaveSize) % DPRAlign;
+  unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize;
   int FramePtrOffsetInPush = 0;
   if (HasFP) {
-    FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI)
-                           + GPRCS1Size + ArgRegsSaveSize;
+    FramePtrOffsetInPush =
+        MFI->getObjectOffset(FramePtrSpillFI) + ArgRegsSaveSize;
     AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
                                 NumBytes);
   }
@@ -275,16 +327,32 @@
   AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
 
   // Move past area 2.
-  if (GPRCS2Size > 0)
+  if (GPRCS2Size > 0) {
     GPRCS2Push = LastPush = MBBI++;
+    DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size);
+  }
+
+  // Prolog/epilog inserter assumes we correctly align DPRs on the stack, so our
+  // .cfi_offset operations will reflect that.
+  if (DPRGapSize) {
+    assert(DPRGapSize == 4 && "unexpected alignment requirements for DPRs");
+    if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, DPRGapSize))
+      DefCFAOffsetCandidates.addExtraBytes(LastPush, DPRGapSize);
+    else {
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize,
+                   MachineInstr::FrameSetup);
+      DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize);
+    }
+  }
 
   // Move past area 3.
   if (DPRCSSize > 0) {
-    DPRCSPush = MBBI;
     // Since vpush register list cannot have gaps, there may be multiple vpush
     // instructions in the prologue.
-    while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
+    while (MBBI->getOpcode() == ARM::VSTMDDB_UPD) {
+      DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(MBBI));
       LastPush = MBBI++;
+    }
   }
 
   // Move past the aligned DPRCS2 area.
@@ -343,18 +411,15 @@
     NumBytes = 0;
   }
 
-  unsigned adjustedGPRCS1Size = GPRCS1Size;
   if (NumBytes) {
     // Adjust SP after all the callee-save spills.
-    if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) {
-      if (LastPush == GPRCS1Push) {
-        FramePtrOffsetInPush += NumBytes;
-        adjustedGPRCS1Size += NumBytes;
-        NumBytes = 0;
-      }
-    } else
+    if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes))
+      DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes);
+    else {
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
                    MachineInstr::FrameSetup);
+      DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes);
+    }
 
     if (HasFP && isARM)
       // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
@@ -368,13 +433,40 @@
       AFI->setShouldRestoreSPFromFP(true);
   }
 
-  if (adjustedGPRCS1Size > 0) {
-    CFAOffset -= adjustedGPRCS1Size;
-    unsigned CFIIndex = MMI.addFrameInst(
-        MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
-    MachineBasicBlock::iterator Pos = ++GPRCS1Push;
-    BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For iOS, FP is R7, which has now been stored in spill area 1.
+  // Otherwise, if this is not iOS, all the callee-saved registers go
+  // into spill area 1, including the FP in R11.  In either case, it
+  // is in area one and the adjustment needs to take place just after
+  // that push.
+  if (HasFP) {
+    MachineBasicBlock::iterator AfterPush = std::next(GPRCS1Push);
+    unsigned PushSize = sizeOfSPAdjustment(GPRCS1Push);
+    emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush,
+                         dl, TII, FramePtr, ARM::SP,
+                         PushSize + FramePtrOffsetInPush,
+                         MachineInstr::FrameSetup);
+    if (FramePtrOffsetInPush + PushSize != 0) {
+      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa(
+          nullptr, MRI->getDwarfRegNum(FramePtr, true),
+          -(ArgRegsSaveSize - FramePtrOffsetInPush)));
+      BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    } else {
+      unsigned CFIIndex =
+          MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+              nullptr, MRI->getDwarfRegNum(FramePtr, true)));
+      BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    }
+  }
+
+  // Now that the prologue's actual instructions are finalised, we can insert
+  // the necessary DWARF cf instructions to describe the situation. Start by
+  // recording where each register ended up:
+  if (GPRCS1Size > 0) {
+    MachineBasicBlock::iterator Pos = std::next(GPRCS1Push);
+    int CFIIndex;
     for (const auto &Entry : CSI) {
       unsigned Reg = Entry.getReg();
       int FI = Entry.getFrameIdx();
@@ -405,41 +497,8 @@
     }
   }
 
-  // Set FP to point to the stack slot that contains the previous FP.
-  // For iOS, FP is R7, which has now been stored in spill area 1.
-  // Otherwise, if this is not iOS, all the callee-saved registers go
-  // into spill area 1, including the FP in R11.  In either case, it
-  // is in area one and the adjustment needs to take place just after
-  // that push.
-  if (HasFP) {
-    emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, GPRCS1Push, dl, TII,
-                         FramePtr, ARM::SP, FramePtrOffsetInPush,
-                         MachineInstr::FrameSetup);
-    if (FramePtrOffsetInPush) {
-      CFAOffset += FramePtrOffsetInPush;
-      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa(
-          nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
-      BuildMI(MBB, GPRCS1Push, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-
-    } else {
-      unsigned CFIIndex =
-          MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
-              nullptr, MRI->getDwarfRegNum(FramePtr, true)));
-      BuildMI(MBB, GPRCS1Push, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-    }
-  }
-
   if (GPRCS2Size > 0) {
-    MachineBasicBlock::iterator Pos = ++GPRCS2Push;
-    if (!HasFP) {
-      CFAOffset -= GPRCS2Size;
-      unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
-      BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-    }
+    MachineBasicBlock::iterator Pos = std::next(GPRCS2Push);
     for (const auto &Entry : CSI) {
       unsigned Reg = Entry.getReg();
       int FI = Entry.getFrameIdx();
@@ -465,17 +524,7 @@
   if (DPRCSSize > 0) {
     // Since vpush register list cannot have gaps, there may be multiple vpush
     // instructions in the prologue.
-    do {
-      MachineBasicBlock::iterator Push = DPRCSPush++;
-      if (!HasFP) {
-        CFAOffset -= sizeOfSPAdjustment(Push);
-        unsigned CFIIndex = MMI.addFrameInst(
-            MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
-        BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-            .addCFIIndex(CFIIndex);
-      }
-    } while (DPRCSPush->getOpcode() == ARM::VSTMDDB_UPD);
-
+    MachineBasicBlock::iterator Pos = std::next(LastPush);
     for (const auto &Entry : CSI) {
       unsigned Reg = Entry.getReg();
       int FI = Entry.getFrameIdx();
@@ -485,21 +534,17 @@
         unsigned Offset = MFI->getObjectOffset(FI);
         unsigned CFIIndex = MMI.addFrameInst(
             MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
-        BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
             .addCFIIndex(CFIIndex);
       }
     }
   }
 
-  if (NumBytes) {
-    if (!HasFP) {
-      CFAOffset -= NumBytes;
-      unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
-      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-    }
-  }
+  // Now we can emit descriptions of where the canonical frame address was
+  // throughout the process. If we have a frame pointer, it takes over the job
+  // half-way through, so only the first few .cfi_def_cfa_offset instructions
+  // actually get emitted.
+  DefCFAOffsetCandidates.emitDefCFAOffsets(MMI, MBB, dl, TII, HasFP);
 
   if (STI.isTargetELF() && hasFP(MF))
     MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
@@ -507,6 +552,7 @@
 
   AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
   AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedGapSize(DPRGapSize);
   AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
 
   // If we need dynamic stack realignment, do it here. Be paranoid and make
@@ -574,14 +620,17 @@
   DebugLoc dl = MBBI->getDebugLoc();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
   const ARMBaseInstrInfo &TII =
-    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
   assert(!AFI->isThumb1OnlyFunction() &&
          "This emitEpilogue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
 
-  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned Align = MF.getTarget()
+                       .getSubtargetImpl()
+                       ->getFrameLowering()
+                       ->getStackAlignment();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   int NumBytes = (int)MFI->getStackSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
@@ -609,6 +658,7 @@
     NumBytes -= (ArgRegsSaveSize +
                  AFI->getGPRCalleeSavedArea1Size() +
                  AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedGapSize() +
                  AFI->getDPRCalleeSavedAreaSize());
 
     // Reset SP based on frame pointer only if the stack frame extends beyond
@@ -657,6 +707,12 @@
       while (MBBI->getOpcode() == ARM::VLDMDIA_UPD)
         MBBI++;
     }
+    if (AFI->getDPRCalleeSavedGapSize()) {
+      assert(AFI->getDPRCalleeSavedGapSize() == 4 &&
+             "unexpected DPR alignment gap");
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize());
+    }
+
     if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
     if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
   }
@@ -717,8 +773,8 @@
                                              int FI, unsigned &FrameReg,
                                              int SPAdj) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const ARMBaseRegisterInfo *RegInfo =
-    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
   const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
   int FPOffset = Offset - AFI->getFramePtrSpillOffset();
@@ -803,7 +859,7 @@
                                     unsigned NumAlignedDPRCS2Regs,
                                     unsigned MIFlags) const {
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   DebugLoc DL;
   if (MI != MBB.end()) DL = MI->getDebugLoc();
@@ -876,7 +932,7 @@
                                    bool(*Func)(unsigned, bool),
                                    unsigned NumAlignedDPRCS2Regs) const {
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc DL = MI->getDebugLoc();
   unsigned RetOpcode = MI->getOpcode();
@@ -966,7 +1022,7 @@
   MachineFunction &MF = *MBB.getParent();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineFrameInfo &MFI = *MF.getFrameInfo();
 
   // Mark the D-register spill slots as properly aligned.  Since MFI computes
@@ -1125,7 +1181,7 @@
   MachineFunction &MF = *MBB.getParent();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   // Find the frame index assigned to d8.
   int D8SpillFI = 0;
@@ -1340,12 +1396,15 @@
     return;
 
   // Don't bother if the default stack alignment is sufficiently high.
-  if (MF.getTarget().getFrameLowering()->getStackAlignment() >= 8)
+  if (MF.getTarget()
+          .getSubtargetImpl()
+          ->getFrameLowering()
+          ->getStackAlignment() >= 8)
     return;
 
   // Aligned spills require stack realignment.
-  const ARMBaseRegisterInfo *RegInfo =
-    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
   if (!RegInfo->canRealignStack(MF))
     return;
 
@@ -1384,10 +1443,10 @@
   unsigned NumGPRSpills = 0;
   SmallVector<unsigned, 4> UnspilledCS1GPRs;
   SmallVector<unsigned, 4> UnspilledCS2GPRs;
-  const ARMBaseRegisterInfo *RegInfo =
-    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
   const ARMBaseInstrInfo &TII =
-    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1550,7 +1609,7 @@
     // of GPRs, spill one extra callee save GPR so we won't have to pad between
     // the integer and double callee save areas.
     unsigned TargetAlign = getStackAlignment();
-    if (TargetAlign == 8 && (NumGPRSpills & 1)) {
+    if (TargetAlign >= 8 && (NumGPRSpills & 1)) {
       if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
         for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
           unsigned Reg = UnspilledCS1GPRs[i];
@@ -1628,7 +1687,7 @@
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   const ARMBaseInstrInfo &TII =
-    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
   if (!hasReservedCallFrame(MF)) {
     // If we have alloca, convert as follows:
     // ADJCALLSTACKDOWN -> sub, sp, sp, amount
@@ -1746,7 +1805,7 @@
   MCContext &Context = MMI.getContext();
   const MCRegisterInfo *MRI = Context.getRegisterInfo();
   const ARMBaseInstrInfo &TII =
-      *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
   ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc DL;
 

diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 709afbc..a83b773 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARM_FRAMEINFO_H
-#define ARM_FRAMEINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
+#define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
 
 #include "llvm/Target/TargetFrameLowering.h"
 

diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index 0885c4e..0e4f81c 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp

@@ -46,8 +46,8 @@
       const MCInstrDesc &LastMCID = LastMI->getDesc();
       const TargetMachine &TM =
         MI->getParent()->getParent()->getTarget();
-      const ARMBaseInstrInfo &TII =
-        *static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+      const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(
+                                        TM.getSubtargetImpl()->getInstrInfo());
 
       // Skip over one non-VFP / NEON instruction.
       if (!LastMI->isBarrier() &&

diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h
index a8198e2..ccf09db 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/lib/Target/ARM/ARMHazardRecognizer.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMHAZARDRECOGNIZER_H
-#define ARMHAZARDRECOGNIZER_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H
+#define LLVM_LIB_TARGET_ARM_ARMHAZARDRECOGNIZER_H
 
 #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
 
@@ -46,4 +46,4 @@
 
 } // end namespace llvm
 
-#endif // ARMHAZARDRECOGNIZER_H
+#endif

diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 38547cf..6941579 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp

@@ -425,7 +425,7 @@
     return true;
   if (Use->isMachineOpcode()) {
     const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
-        CurDAG->getTarget().getInstrInfo());
+        CurDAG->getSubtarget().getInstrInfo());
 
     const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode());
     if (MCID.mayStore())
@@ -526,8 +526,7 @@
     if (N.getOpcode() == ISD::FrameIndex) {
       // Match frame index.
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI,
-                                         getTargetLowering()->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
       OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
       return true;
     }
@@ -542,16 +541,15 @@
   }
 
   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    int RHSC = (int)RHS->getZExtValue();
+    int RHSC = (int)RHS->getSExtValue();
     if (N.getOpcode() == ISD::SUB)
       RHSC = -RHSC;
 
-    if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
+    if (RHSC > -0x1000 && RHSC < 0x1000) { // 12 bits
       Base   = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI,
-                                           getTargetLowering()->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -697,8 +695,7 @@
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI,
-                                         getTargetLowering()->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     } else if (N.getOpcode() == ARMISD::Wrapper &&
                N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
       Base = N.getOperand(0);
@@ -718,8 +715,7 @@
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI,
-                                           getTargetLowering()->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
       }
       Offset = CurDAG->getRegister(0, MVT::i32);
 
@@ -896,8 +892,7 @@
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI,
-                                         getTargetLowering()->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     }
     Offset = CurDAG->getRegister(0, MVT::i32);
     Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
@@ -911,8 +906,7 @@
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI,
-                                         getTargetLowering()->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     }
     Offset = CurDAG->getRegister(0, MVT::i32);
 
@@ -957,8 +951,7 @@
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI,
-                                         getTargetLowering()->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     } else if (N.getOpcode() == ARMISD::Wrapper &&
                N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) {
       Base = N.getOperand(0);
@@ -975,8 +968,7 @@
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI,
-                                         getTargetLowering()->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     }
 
     ARM_AM::AddrOpc AddSub = ARM_AM::add;
@@ -1199,8 +1191,7 @@
                                             SDValue &Base, SDValue &OffImm) {
   if (N.getOpcode() == ISD::FrameIndex) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    Base = CurDAG->getTargetFrameIndex(FI,
-                                       getTargetLowering()->getPointerTy());
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     OffImm = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
   }
@@ -1217,8 +1208,7 @@
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI,
-                                           getTargetLowering()->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -1266,8 +1256,7 @@
     if (N.getOpcode() == ISD::FrameIndex) {
       // Match frame index.
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI,
-                                         getTargetLowering()->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
       OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
       return true;
     }
@@ -1296,8 +1285,7 @@
       Base   = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI,
-                                           getTargetLowering()->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -1326,8 +1314,7 @@
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI,
-                                           getTargetLowering()->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -1392,10 +1379,7 @@
         OffReg = OffReg.getOperand(0);
       else {
         ShAmt = 0;
-        ShOpcVal = ARM_AM::no_shift;
       }
-    } else {
-      ShOpcVal = ARM_AM::no_shift;
     }
   }
 
@@ -1425,7 +1409,7 @@
   Base = N.getOperand(0);
   if (Base.getOpcode() == ISD::FrameIndex) {
     int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-    Base = CurDAG->getTargetFrameIndex(FI, getTargetLowering()->getPointerTy());
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
   }
 
   OffImm = CurDAG->getTargetConstant(RHSC / 4, MVT::i32);
@@ -2361,6 +2345,25 @@
       return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
     }
   }
+
+  if (N->getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
+    unsigned LSB = 0;
+    if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, LSB) &&
+        !isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRA, LSB))
+      return nullptr;
+
+    if (LSB + Width > 32)
+      return nullptr;
+
+    SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                      CurDAG->getTargetConstant(LSB, MVT::i32),
+                      CurDAG->getTargetConstant(Width - 1, MVT::i32),
+                      getAL(CurDAG), Reg0 };
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+  }
+
   return nullptr;
 }
 
@@ -2457,10 +2460,9 @@
     }
 
     if (UseCP) {
-      SDValue CPIdx =
-        CurDAG->getTargetConstantPool(ConstantInt::get(
-                                  Type::getInt32Ty(*CurDAG->getContext()), Val),
-                                      getTargetLowering()->getPointerTy());
+      SDValue CPIdx = CurDAG->getTargetConstantPool(
+          ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
+          TLI->getPointerTy());
 
       SDNode *ResNode;
       if (Subtarget->isThumb()) {
@@ -2490,12 +2492,10 @@
   case ISD::FrameIndex: {
     // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI,
-                                           getTargetLowering()->getPointerTy());
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     if (Subtarget->isThumb1Only()) {
-      SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
-                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops);
+      return CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
+                                  CurDAG->getTargetConstant(0, MVT::i32));
     } else {
       unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
                       ARM::t2ADDri : ARM::ADDri);
@@ -2509,6 +2509,7 @@
     if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
       return I;
     break;
+  case ISD::SIGN_EXTEND_INREG:
   case ISD::SRA:
     if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true))
       return I;
@@ -3393,7 +3394,6 @@
       std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
       Ops.push_back(T1.getValue(1));
       CurDAG->UpdateNodeOperands(GU, Ops);
-      GU = T1.getNode();
     }
     else {
       // For Kind  == InlineAsm::Kind_RegUse, we first copy two GPRs into a

diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 4bfa5a8..0d0d81f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp

@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -70,9 +71,9 @@
   class ARMCCState : public CCState {
   public:
     ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
-               const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
-               LLVMContext &C, ParmContext PC)
-        : CCState(CC, isVarArg, MF, TM, locs, C) {
+               SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+               ParmContext PC)
+        : CCState(CC, isVarArg, MF, locs, C) {
       assert(((PC == Call) || (PC == Prologue)) &&
              "ARMCCState users must specify whether their context is call"
              "or prologue generation.");
@@ -155,19 +156,11 @@
   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 }
 
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
-  if (TT.isOSBinFormatMachO())
-    return new TargetLoweringObjectFileMachO();
-  if (TT.isOSWindows())
-    return new TargetLoweringObjectFileCOFF();
-  return new ARMElfTargetObjectFile();
-}
-
-ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
-    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM)
+    : TargetLowering(TM) {
   Subtarget = &TM.getSubtarget<ARMSubtarget>();
-  RegInfo = TM.getRegisterInfo();
-  Itins = TM.getInstrItineraryData();
+  RegInfo = TM.getSubtargetImpl()->getRegisterInfo();
+  Itins = TM.getSubtargetImpl()->getInstrItineraryData();
 
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
@@ -312,6 +305,7 @@
       // Conversions between floating types.
       // RTABI chapter 4.1.2, Table 7
       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 
       // Integer to floating-point conversions.
@@ -387,6 +381,19 @@
     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
   }
 
+  // The half <-> float conversion functions are always soft-float, but are
+  // needed for some targets which use a hard-float calling convention by
+  // default.
+  if (Subtarget->isAAPCS_ABI()) {
+    setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
+  } else {
+    setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
+    setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
+    setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
+  }
+
   if (Subtarget->isThumb1Only())
     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
   else
@@ -394,10 +401,7 @@
   if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
       !Subtarget->isThumb1Only()) {
     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
-    if (!Subtarget->isFPOnlySP())
-      addRegisterClass(MVT::f64, &ARM::DPRRegClass);
-
-    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+    addRegisterClass(MVT::f64, &ARM::DPRRegClass);
   }
 
   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
@@ -579,11 +583,50 @@
   if (!Subtarget->isThumb1Only())
     setTargetDAGCombine(ISD::ADDC);
 
+  if (Subtarget->isFPOnlySP()) {
+    // When targeting a floating-point unit with only single-precision
+    // operations, f64 is legal for the few double-precision instructions which
+    // are present. However, no double-precision operations other than moves,
+    // loads and stores are provided by the hardware.
+    setOperationAction(ISD::FADD,       MVT::f64, Expand);
+    setOperationAction(ISD::FSUB,       MVT::f64, Expand);
+    setOperationAction(ISD::FMUL,       MVT::f64, Expand);
+    setOperationAction(ISD::FMA,        MVT::f64, Expand);
+    setOperationAction(ISD::FDIV,       MVT::f64, Expand);
+    setOperationAction(ISD::FREM,       MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
+    setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
+    setOperationAction(ISD::FNEG,       MVT::f64, Expand);
+    setOperationAction(ISD::FABS,       MVT::f64, Expand);
+    setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
+    setOperationAction(ISD::FSIN,       MVT::f64, Expand);
+    setOperationAction(ISD::FCOS,       MVT::f64, Expand);
+    setOperationAction(ISD::FPOWI,      MVT::f64, Expand);
+    setOperationAction(ISD::FPOW,       MVT::f64, Expand);
+    setOperationAction(ISD::FLOG,       MVT::f64, Expand);
+    setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
+    setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
+    setOperationAction(ISD::FEXP,       MVT::f64, Expand);
+    setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
+    setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
+    setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
+    setOperationAction(ISD::FRINT,      MVT::f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
+    setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
+    setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
+    setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
+  }
 
   computeRegisterProperties();
 
-  // ARM does not have f32 extending load.
+  // ARM does not have floating-point extending loads.
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+
+  // ... or truncating stores
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 
   // ARM does not have i1 sign extending load.
   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -716,8 +759,12 @@
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 
   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
-  // the default expansion.
-  if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
+  // the default expansion. If we are targeting a single threaded system,
+  // then set them all for expand so we can lower them later into their
+  // non-atomic form.
+  if (TM.Options.ThreadModel == ThreadModel::Single)
+    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
+  else if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
     // to ldrex/strex loops already.
     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
@@ -725,7 +772,7 @@
     // On v8, we have particularly efficient implementations of atomic fences
     // if they can be combined with nearby atomic loads and stores.
     if (!Subtarget->hasV8Ops()) {
-      // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
+      // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
       setInsertFencesForAtomic(true);
     }
   } else {
@@ -825,10 +872,17 @@
       setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
       setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
     }
-    // Special handling for half-precision FP.
+
+    // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
+    if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
+      setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+    }
+
+    // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
     if (!Subtarget->hasFP16()) {
-      setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
-      setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
+      setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+      setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
     }
   }
 
@@ -836,7 +890,7 @@
   if (Subtarget->hasSinCos()) {
     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
     setLibcallName(RTLIB::SINCOS_F64, "sincos");
-    if (Subtarget->getTargetTriple().getOS() == Triple::IOS) {
+    if (Subtarget->getTargetTriple().isiOS()) {
       // For iOS, we don't want to the normal expansion of a libcall to
       // sincos. We want to issue a libcall to __sincos_stret.
       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
@@ -844,6 +898,23 @@
     }
   }
 
+  // FP-ARMv8 implements a lot of rounding-like FP operations.
+  if (Subtarget->hasFPARMv8()) {
+    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+    setOperationAction(ISD::FRINT, MVT::f32, Legal);
+    if (!Subtarget->isFPOnlySP()) {
+      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+      setOperationAction(ISD::FROUND, MVT::f64, Legal);
+      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+      setOperationAction(ISD::FRINT, MVT::f64, Legal);
+    }
+  }
   // We have target-specific dag combine patterns for the following nodes:
   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
   setTargetDAGCombine(ISD::ADD);
@@ -1119,7 +1190,8 @@
 
   // Load are scheduled for latency even if there instruction itinerary
   // is not available.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
 
   if (MCID.getNumDefs() == 0)
@@ -1256,8 +1328,8 @@
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext(), Call);
   CCInfo.AnalyzeCallResult(Ins,
                            CCAssignFnForNode(CallConv, /* Return*/ true,
                                              isVarArg));
@@ -1417,8 +1489,8 @@
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                    *DAG.getContext(), Call);
   CCInfo.AnalyzeCallOperands(Outs,
                              CCAssignFnForNode(CallConv, /* Return*/ false,
                                                isVarArg));
@@ -1510,7 +1582,7 @@
       // True if this byval aggregate will be split between registers
       // and memory.
       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
-      unsigned CurByValIdx = CCInfo.getInRegsParamsProceed();
+      unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
 
       if (CurByValIdx < ByValArgsCount) {
 
@@ -1646,14 +1718,17 @@
     bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
     bool isStub = (isExt && Subtarget->isTargetMachO()) &&
                    getTargetMachine().getRelocationModel() != Reloc::Static;
-    isARMFunc = !Subtarget->isThumb() || isStub;
+    isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
     // ARM call to a local ARM function is predicable.
     isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
     // tBX takes a register source operand.
     if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
       assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
       Callee = DAG.getNode(ARMISD::WrapperPIC, dl, getPointerTy(),
-                           DAG.getTargetGlobalAddress(GV, dl, getPointerTy()));
+                           DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
+                                                      0, ARMII::MO_NONLAZY));
+      Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
+                           MachinePointerInfo::getGOT(), false, false, true, 0);
     } else if (Subtarget->isTargetCOFF()) {
       assert(Subtarget->isTargetWindows() &&
              "Windows is the only supported COFF target");
@@ -1679,7 +1754,7 @@
     isDirect = true;
     bool isStub = Subtarget->isTargetMachO() &&
                   getTargetMachine().getRelocationModel() != Reloc::Static;
-    isARMFunc = !Subtarget->isThumb() || isStub;
+    isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
     // tBX takes a register source operand.
     const char *Sym = S->getSymbol();
     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
@@ -1740,7 +1815,8 @@
   // Add a register mask operand representing the call-preserved registers.
   if (!isTailCall) {
     const uint32_t *Mask;
-    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    const TargetRegisterInfo *TRI =
+        getTargetMachine().getSubtargetImpl()->getRegisterInfo();
     const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
     if (isThisReturn) {
       // For 'this' returns, use the R0-preserving mask if applicable
@@ -1940,17 +2016,30 @@
   if (Subtarget->isThumb1Only())
     return false;
 
+  // Externally-defined functions with weak linkage should not be
+  // tail-called on ARM when the OS does not support dynamic
+  // pre-emption of symbols, as the AAELF spec requires normal calls
+  // to undefined weak functions to be replaced with a NOP or jump to the
+  // next instruction. The behaviour of branch instructions in this
+  // situation (as used for tail calls) is implementation-defined, so we
+  // cannot rely on the linker replacing the tail call with a return.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    if (GV->hasExternalWeakLinkage())
+      return false;
+  }
+
   // If the calling conventions do not match, then we'd better make sure the
   // results are returned in the same way as what the caller expects.
   if (!CCMatch) {
     SmallVector<CCValAssign, 16> RVLocs1;
-    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
-                       getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
+    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+                       *DAG.getContext(), Call);
     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
 
     SmallVector<CCValAssign, 16> RVLocs2;
-    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
-                       getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
+    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+                       *DAG.getContext(), Call);
     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
 
     if (RVLocs1.size() != RVLocs2.size())
@@ -1984,8 +2073,8 @@
     // Check if stack adjustment is needed. For now, do not do this if any
     // argument is passed on the stack.
     SmallVector<CCValAssign, 16> ArgLocs;
-    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                      getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
+    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                      *DAG.getContext(), Call);
     CCInfo.AnalyzeCallOperands(Outs,
                                CCAssignFnForNode(CalleeCC, false, isVarArg));
     if (CCInfo.getNextStackOffset()) {
@@ -1995,7 +2084,8 @@
       // the caller's fixed stack objects.
       MachineFrameInfo *MFI = MF.getFrameInfo();
       const MachineRegisterInfo *MRI = &MF.getRegInfo();
-      const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+      const TargetInstrInfo *TII =
+          getTargetMachine().getSubtargetImpl()->getInstrInfo();
       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
            i != e;
            ++i, ++realArgIdx) {
@@ -2038,7 +2128,7 @@
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
                                                     isVarArg));
 }
@@ -2086,8 +2176,8 @@
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slots.
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext(), Call);
 
   // Analyze outgoing return values.
   CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
@@ -2098,6 +2188,10 @@
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   bool isLittleEndian = Subtarget->isLittle();
 
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  AFI->setReturnRegsCount(RVLocs.size());
+
   // Copy the result values into the output registers.
   for (unsigned i = 0, realRVLocIdx = 0;
        i != RVLocs.size();
@@ -2216,9 +2310,15 @@
       if (Copies.count(UseChain.getNode()))
         // Second CopyToReg
         Copy = *UI;
-      else
+      else {
+        // We are at the top of this chain.
+        // If the copy has a glue operand, we conservatively assume it
+        // isn't safe to perform a tail call.
+        if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
+          return false;
         // First CopyToReg
         TCChain = UseChain;
+      }
     }
   } else if (Copy->getOpcode() == ISD::BITCAST) {
     // f32 returned in a single GPR.
@@ -2227,6 +2327,10 @@
     Copy = *Copy->use_begin();
     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
       return false;
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
     TCChain = Copy->getOperand(0);
   } else {
     return false;
@@ -2567,9 +2671,9 @@
   switch (IntNo) {
   default: return SDValue();    // Don't custom lower most intrinsics.
   case Intrinsic::arm_rbit: {
-    assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+    assert(Op.getOperand(1).getValueType() == MVT::i32 &&
            "RBIT intrinsic must have i32 type!");
-    return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(0));
+    return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1));
   }
   case Intrinsic::arm_thread_pointer: {
     EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -2626,7 +2730,7 @@
 
   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
-  unsigned Domain = ARM_MB::ISH;
+  ARM_MB::MemBOpt Domain = ARM_MB::ISH;
   if (Subtarget->isMClass()) {
     // Only a full system barrier exists in the M-class architectures.
     Domain = ARM_MB::SY;
@@ -2739,7 +2843,10 @@
     NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
   }
 
-  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned Align = MF.getTarget()
+                       .getSubtargetImpl()
+                       ->getFrameLowering()
+                       ->getStackAlignment();
   ArgRegsSize = NumGPRs * 4;
 
   // If parameter is split between stack and GPRs...
@@ -2916,8 +3023,8 @@
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                    *DAG.getContext(), Prologue);
   CCInfo.AnalyzeFormalArguments(Ins,
                                 CCAssignFnForNode(CallConv, /* Return*/ false,
                                                   isVarArg));
@@ -2951,7 +3058,7 @@
         if (Flags.isByVal()) {
           unsigned ExtraArgRegsSize;
           unsigned ExtraArgRegsSaveSize;
-          computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsProceed(),
+          computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsProcessed(),
                          Flags.getByValSize(),
                          ExtraArgRegsSize, ExtraArgRegsSaveSize);
 
@@ -2966,7 +3073,7 @@
   }
   CCInfo.rewindByValRegsInfo();
   lastInsIndex = -1;
-  if (isVarArg) {
+  if (isVarArg && MFI->hasVAStart()) {
     unsigned ExtraArgRegsSize;
     unsigned ExtraArgRegsSaveSize;
     computeRegArea(CCInfo, MF, CCInfo.getInRegsParamsCount(), 0,
@@ -3075,7 +3182,7 @@
           // Since they could be overwritten by lowering of arguments in case of
           // a tail call.
           if (Flags.isByVal()) {
-            unsigned CurByValIndex = CCInfo.getInRegsParamsProceed();
+            unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
 
             ByValStoreOffset = RoundUpToAlignment(ByValStoreOffset, Flags.getByValAlign());
             int FrameIndex = StoreByValRegs(
@@ -3108,7 +3215,7 @@
   }
 
   // varargs
-  if (isVarArg)
+  if (isVarArg && MFI->hasVAStart())
     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
                          CCInfo.getNextStackOffset(),
                          TotalArgRegsSaveSize);
@@ -3130,6 +3237,18 @@
         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
           return CFP->getValueAPF().isPosZero();
     }
+  } else if (Op->getOpcode() == ISD::BITCAST &&
+             Op->getValueType(0) == MVT::f64) {
+    // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
+    // created by LowerConstantFP().
+    SDValue BitcastOp = Op->getOperand(0);
+    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) {
+      SDValue MoveOp = BitcastOp->getOperand(0);
+      if (MoveOp->getOpcode() == ISD::TargetConstant &&
+          cast<ConstantSDNode>(MoveOp)->getZExtValue() == 0) {
+        return true;
+      }
+    }
   }
   return false;
 }
@@ -3198,6 +3317,7 @@
 SDValue
 ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
                              SDLoc dl) const {
+  assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
   SDValue Cmp;
   if (!isFloatingPointZero(RHS))
     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
@@ -3313,9 +3433,8 @@
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
     EVT VT = Op.getValueType();
 
-    return DAG.getNode(ARMISD::CMOV, SDLoc(Op), VT, SelectTrue, SelectFalse,
-                       ARMcc, CCR, OverflowCmp);
-
+    return getCMOV(SDLoc(Op), VT, SelectTrue, SelectFalse, ARMcc, CCR,
+                   OverflowCmp, DAG);
   }
 
   // Convert:
@@ -3349,7 +3468,7 @@
         SDValue CCR = Cond.getOperand(3);
         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
         assert(True.getValueType() == VT);
-        return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
+        return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
       }
     }
   }
@@ -3419,6 +3538,32 @@
   }
 }
 
+SDValue ARMTargetLowering::getCMOV(SDLoc dl, EVT VT, SDValue FalseVal,
+                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
+                                   SDValue Cmp, SelectionDAG &DAG) const {
+  // Build a conditional move of FalseVal/TrueVal under ARMcc. An f64 on an
+  // FP-only-SP subtarget is selected as two i32 halves; every other case is a
+  // single ARMISD::CMOV node.
+  if (!Subtarget->isFPOnlySP() || VT != MVT::f64)
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
+                       Cmp);
+
+  // Split both f64 inputs into (low, high) i32 results via VMOVRRD.
+  SDVTList HalfVTs = DAG.getVTList(MVT::i32, MVT::i32);
+  SDValue FalseRR = DAG.getNode(ARMISD::VMOVRRD, dl, HalfVTs, FalseVal);
+  SDValue TrueRR = DAG.getNode(ARMISD::VMOVRRD, dl, HalfVTs, TrueVal);
+
+  // The high-half CMOV is given its own copy of the compare node.
+  SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseRR.getValue(0),
+                            TrueRR.getValue(0), ARMcc, CCR, Cmp);
+  SDValue CmpCopy = duplicateCmp(Cmp, DAG);
+  SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseRR.getValue(1),
+                             TrueRR.getValue(1), ARMcc, CCR, CmpCopy);
+
+  // Recombine the selected halves into an f64.
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
+}
+
 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
@@ -3428,6 +3573,18 @@
   SDValue FalseVal = Op.getOperand(3);
   SDLoc dl(Op);
 
+  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
+    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
+                                                    dl);
+
+    // If softenSetCCOperands only returned one value, we should compare it to
+    // zero.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
   if (LHS.getValueType() == MVT::i32) {
     // Try to generate VSEL on ARMv8.
     // The VSEL instruction can't use all the usual ARM condition
@@ -3452,8 +3609,7 @@
     SDValue ARMcc;
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
-    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
-                       Cmp);
+    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
   }
 
   ARMCC::CondCodes CondCode, CondCode2;
@@ -3468,12 +3624,18 @@
     //   select c, a, b
     // We only do this in unsafe-fp-math, because signed zeros and NaNs are
     // handled differently than the original code sequence.
-    if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal &&
-        RHS == FalseVal) {
-      if (CC == ISD::SETOGT || CC == ISD::SETUGT)
-        return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal);
-      if (CC == ISD::SETOLT || CC == ISD::SETULT)
-        return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal);
+    if (getTargetMachine().Options.UnsafeFPMath) {
+      if (LHS == TrueVal && RHS == FalseVal) {
+        if (CC == ISD::SETOGT || CC == ISD::SETUGT)
+          return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal);
+        if (CC == ISD::SETOLT || CC == ISD::SETULT)
+          return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal);
+      } else if (LHS == FalseVal && RHS == TrueVal) {
+        if (CC == ISD::SETOLT || CC == ISD::SETULT)
+          return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal);
+        if (CC == ISD::SETOGT || CC == ISD::SETUGT)
+          return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal);
+      }
     }
 
     bool swpCmpOps = false;
@@ -3492,14 +3654,12 @@
   SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
-  SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
-                               ARMcc, CCR, Cmp);
+  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
   if (CondCode2 != ARMCC::AL) {
     SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
     // FIXME: Needs another CMP because flag can have but one use.
     SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
-    Result = DAG.getNode(ARMISD::CMOV, dl, VT,
-                         Result, TrueVal, ARMcc2, CCR, Cmp2);
+    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
   }
   return Result;
 }
@@ -3632,6 +3792,18 @@
   SDValue Dest = Op.getOperand(4);
   SDLoc dl(Op);
 
+  if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
+    DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
+                                                    dl);
+
+    // If softenSetCCOperands only returned one value, we should compare it to
+    // zero.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
   if (LHS.getValueType() == MVT::i32) {
     SDValue ARMcc;
     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
@@ -3724,11 +3896,23 @@
   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
 }
 
-static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   if (VT.isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
+  if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) {
+    RTLIB::Libcall LC;
+    if (Op.getOpcode() == ISD::FP_TO_SINT)
+      LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
+                              Op.getValueType());
+    else
+      LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
+                              Op.getValueType());
+    return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1,
+                       /*isSigned*/ false, SDLoc(Op)).first;
+  }
+
   SDLoc dl(Op);
   unsigned Opc;
 
@@ -3778,11 +3962,23 @@
   return DAG.getNode(Opc, dl, VT, Op);
 }
 
-static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   if (VT.isVector())
     return LowerVectorINT_TO_FP(Op, DAG);
 
+  if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) {
+    RTLIB::Libcall LC;
+    if (Op.getOpcode() == ISD::SINT_TO_FP)
+      LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
+                              Op.getValueType());
+    else
+      LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
+                              Op.getValueType());
+    return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1,
+                       /*isSigned*/ false, SDLoc(Op)).first;
+  }
+
   SDLoc dl(Op);
   unsigned Opc;
 
@@ -4291,7 +4487,7 @@
   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   SDLoc dl(Op);
 
-  if (Op.getOperand(1).getValueType().isFloatingPoint()) {
+  if (Op1.getValueType().isFloatingPoint()) {
     switch (SetCCOpcode) {
     default: llvm_unreachable("Illegal FP comparison");
     case ISD::SETUNE:
@@ -4555,6 +4751,11 @@
   bool IsDouble = Op.getValueType() == MVT::f64;
   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
 
+  // Use the default (constant pool) lowering for double constants when we have
+  // an SP-only FPU
+  if (IsDouble && Subtarget->isFPOnlySP())
+    return SDValue();
+
   // Try splatting with a VMOV.f32...
   APFloat FPVal = CFP->getValueAPF();
   int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
@@ -5733,7 +5934,7 @@
   // operation legalization where we can't create illegal types.
   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
-                        LD->getMemoryVT(), LD->isVolatile(),
+                        LD->getMemoryVT(), LD->isVolatile(), LD->isInvariant(),
                         LD->isNonTemporal(), LD->getAlignment());
 }
 
@@ -6088,7 +6289,7 @@
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // Pair of floats / doubles used to pass the result.
-  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
 
   // Create stack object for sret.
   const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy);
@@ -6258,6 +6459,8 @@
     if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
       return LowerDYNAMIC_STACKALLOC(Op, DAG);
     llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
+  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
   }
 }
 
@@ -6294,7 +6497,8 @@
 void ARMTargetLowering::
 SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
                        MachineBasicBlock *DispatchBB, int FI) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
@@ -6377,9 +6581,9 @@
                    .addReg(NewVReg2, RegState::Kill)
                    .addReg(NewVReg3, RegState::Kill));
     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
-                   .addFrameIndex(FI)
-                   .addImm(36)); // &jbuf[1] :: pc
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
+            .addFrameIndex(FI)
+            .addImm(36); // &jbuf[1] :: pc
     AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
                    .addReg(NewVReg4, RegState::Kill)
                    .addReg(NewVReg5, RegState::Kill)
@@ -6409,7 +6613,8 @@
 
 MachineBasicBlock *ARMTargetLowering::
 EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
@@ -6738,16 +6943,14 @@
   for (std::vector<MachineBasicBlock*>::iterator
          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
     MachineBasicBlock *CurMBB = *I;
-    if (SeenMBBs.insert(CurMBB))
+    if (SeenMBBs.insert(CurMBB).second)
       DispContBB->addSuccessor(CurMBB);
   }
 
   // N.B. the order the invoke BBs are processed in doesn't matter here.
   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
   SmallVector<MachineBasicBlock*, 64> MBBLPads;
-  for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
-         I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) {
-    MachineBasicBlock *BB = *I;
+  for (MachineBasicBlock *BB : InvokeBBs) {
 
     // Remove the landing pad successor from the invoke block and replace it
     // with the new dispatch block.
@@ -6926,7 +7129,8 @@
   // This pseudo instruction has 3 operands: dst, src, size
   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
   // Otherwise, we will generate unrolled scalar copies.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction::iterator It = BB;
   ++It;
@@ -7160,7 +7364,7 @@
 ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI,
                                        MachineBasicBlock *MBB) const {
   const TargetMachine &TM = getTargetMachine();
-  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   assert(Subtarget->isTargetWindows() &&
@@ -7216,8 +7420,7 @@
 
   AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
                                       ARM::SP)
-                              .addReg(ARM::SP, RegState::Define)
-                              .addReg(ARM::R4, RegState::Kill)));
+                              .addReg(ARM::SP).addReg(ARM::R4)));
 
   MI->eraseFromParent();
   return MBB;
@@ -7226,7 +7429,8 @@
 MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
   switch (MI->getOpcode()) {
@@ -7479,12 +7683,6 @@
 
 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                       SDNode *Node) const {
-  if (!MI->hasPostISelHook()) {
-    assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
-           "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
-    return;
-  }
-
   const MCInstrDesc *MCID = &MI->getDesc();
   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
@@ -7496,8 +7694,8 @@
   // Rename pseudo opcodes.
   unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
   if (NewOpc) {
-    const ARMBaseInstrInfo *TII =
-      static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
+    const ARMBaseInstrInfo *TII = static_cast<const ARMBaseInstrInfo *>(
+        getTargetMachine().getSubtargetImpl()->getInstrInfo());
     MCID = &TII->get(NewOpc);
 
     assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
@@ -8398,10 +8596,11 @@
 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
 /// ARMISD::VMOVRRD.
 static SDValue PerformVMOVRRDCombine(SDNode *N,
-                                     TargetLowering::DAGCombinerInfo &DCI) {
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
   // vmovrrd(vmovdrr x, y) -> x,y
   SDValue InDouble = N->getOperand(0);
-  if (InDouble.getOpcode() == ARMISD::VMOVDRR)
+  if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
 
   // vmovrrd(load f64) -> (load i32), (load i32)
@@ -8432,8 +8631,6 @@
     if (DCI.DAG.getTargetLoweringInfo().isBigEndian())
       std::swap (NewLD1, NewLD2);
     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
-    DCI.RemoveFromWorklist(LD);
-    DAG.DeleteNode(LD);
     return Result;
   }
 
@@ -8596,7 +8793,7 @@
   return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                       St->getPointerInfo(), St->isVolatile(),
                       St->isNonTemporal(), St->getAlignment(),
-                      St->getTBAAInfo());
+                      St->getAAInfo());
 }
 
 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
@@ -8616,7 +8813,8 @@
 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
 /// ISD::BUILD_VECTOR.
 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
-                                          TargetLowering::DAGCombinerInfo &DCI){
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const ARMSubtarget *Subtarget) {
   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
   // into a pair of GPRs, which is fine when the value is used as a scalar,
@@ -8922,7 +9120,7 @@
       Tys[n] = VecTy;
     Tys[n++] = MVT::i32;
     Tys[n] = MVT::Other;
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs+2));
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
     SmallVector<SDValue, 8> Ops;
     Ops.push_back(N->getOperand(0)); // incoming chain
     Ops.push_back(N->getOperand(AddrOpIdx));
@@ -9001,7 +9199,7 @@
   for (n = 0; n < NumVecs; ++n)
     Tys[n] = VT;
   Tys[n] = MVT::Other;
-  SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumVecs+1));
+  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
@@ -9631,10 +9829,10 @@
   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
-  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
+  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
   case ISD::STORE:      return PerformSTORECombine(N, DCI);
-  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI);
+  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
@@ -9686,8 +9884,10 @@
   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
 }
 
-bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned,
-                                                      bool *Fast) const {
+bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                       unsigned,
+                                                       unsigned,
+                                                       bool *Fast) const {
   // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
 
@@ -9741,11 +9941,12 @@
     bool Fast;
     if (Size >= 16 &&
         (memOpAlign(SrcAlign, DstAlign, 16) ||
-         (allowsUnalignedMemoryAccesses(MVT::v2f64, 0, &Fast) && Fast))) {
+         (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) {
       return MVT::v2f64;
     } else if (Size >= 8 &&
                (memOpAlign(SrcAlign, DstAlign, 8) ||
-                (allowsUnalignedMemoryAccesses(MVT::f64, 0, &Fast) && Fast))) {
+                (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
+                 Fast))) {
       return MVT::f64;
     }
   }
@@ -10348,6 +10549,8 @@
         return RCPair(0U, &ARM::hGPRRegClass);
       break;
     case 'r':
+      if (Subtarget->isThumb1Only())
+        return RCPair(0U, &ARM::tGPRRegClass);
       return RCPair(0U, &ARM::GPRRegClass);
     case 'w':
       if (VT == MVT::Other)
@@ -10552,7 +10755,7 @@
   assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only");
   unsigned Opcode = Op->getOpcode();
   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
-      "Invalid opcode for Div/Rem lowering");
+         "Invalid opcode for Div/Rem lowering");
   bool isSigned = (Opcode == ISD::SDIVREM);
   EVT VT = Op->getValueType(0);
   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
@@ -10560,10 +10763,10 @@
   RTLIB::Libcall LC;
   switch (VT.getSimpleVT().SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
-  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
-  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
-  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
-  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+  case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
+  case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+  case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+  case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
   }
 
   SDValue InChain = DAG.getEntryNode();
@@ -10583,7 +10786,7 @@
   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                          getPointerTy());
 
-  Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL);
+  Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
 
   SDLoc dl(Op);
   TargetLowering::CallLoweringInfo CLI(DAG);
@@ -10611,7 +10814,7 @@
   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
   Flag = Chain.getValue(1);
 
-  SDVTList NodeTys = DAG.getVTList(MVT::i32, MVT::Glue);
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
 
   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
@@ -10621,6 +10824,31 @@
   return DAG.getMergeValues(Ops, DL);
 }
 
+SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+  // Custom lowering only expects extensions to f64 on FP-only-SP subtargets.
+  EVT DstVT = Op.getValueType();
+  assert(DstVT == MVT::f64 && Subtarget->isFPOnlySP() &&
+         "Unexpected type for custom-lowering FP_EXTEND");
+  // Emit the conversion as a runtime-library call on the single operand.
+  SDValue SrcVal = Op.getOperand(0);
+  RTLIB::Libcall LC = RTLIB::getFPEXT(SrcVal.getValueType(), DstVT);
+  return makeLibCall(DAG, LC, DstVT, &SrcVal, 1, /*isSigned*/ false,
+                     SDLoc(Op)).first;
+}
+
+SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+  // Custom lowering only expects rounding from f64 on FP-only-SP subtargets.
+  SDValue SrcVal = Op.getOperand(0);
+  assert(SrcVal.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
+         "Unexpected type for custom-lowering FP_ROUND");
+
+  // Emit the conversion as a runtime-library call on the single operand.
+  EVT DstVT = Op.getValueType();
+  RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVal.getValueType(), DstVT);
+  return makeLibCall(DAG, LC, DstVT, &SrcVal, 1, /*isSigned*/ false,
+                     SDLoc(Op)).first;
+}
+
 bool
 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The ARM target isn't yet aware of offsets.
@@ -10648,7 +10876,7 @@
     return false;
   if (VT == MVT::f32)
     return ARM_AM::getFP32Imm(Imm) != -1;
-  if (VT == MVT::f64)
+  if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
     return ARM_AM::getFP64Imm(Imm) != -1;
   return false;
 }
@@ -10775,31 +11003,154 @@
   return true;
 }
 
-bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
-  // Loads and stores less than 64-bits are already atomic; ones above that
-  // are doomed anyway, so defer to the default libcall and blame the OS when
-  // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
-  // anything for those.
-  bool IsMClass = Subtarget->isMClass();
-  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-    unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
-    return Size == 64 && !IsMClass;
-  } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
-    return LI->getType()->getPrimitiveSizeInBits() == 64 && !IsMClass;
-  }
+bool ARMTargetLowering::hasLoadLinkedStoreConditional() const { return true; }
 
-  // For the real atomic operations, we have ldrex/strex up to 32 bits,
-  // and up to 64 bits on the non-M profiles
-  unsigned AtomicLimit = IsMClass ? 32 : 64;
-  return Inst->getType()->getPrimitiveSizeInBits() <= AtomicLimit;
+Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
+                                        ARM_MB::MemBOpt Domain) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+
+  // First, if the target has no DMB, see what fallback we can use.
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
+      Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
+      Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
+                        Builder.getInt32(0), Builder.getInt32(7),
+                        Builder.getInt32(10), Builder.getInt32(5)};
+      return Builder.CreateCall(MCR, args);
+    } else {
+      // Instead of using barriers, atomic accesses on these subtargets use
+      // libcalls.
+      llvm_unreachable("makeDMB on a target so old that it has no barriers");
+    }
+  } else {
+    Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
+    // Only a full system barrier exists in the M-class architectures.
+    Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
+    Constant *CDomain = Builder.getInt32(Domain);
+    return Builder.CreateCall(DMB, CDomain);
+  }
+}
+
+// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+                                         AtomicOrdering Ord, bool IsStore,
+                                         bool IsLoad) const {
+  if (!getInsertFencesForAtomic())
+    return nullptr;
+
+  switch (Ord) {
+  case NotAtomic:
+  case Unordered:
+    llvm_unreachable("Invalid fence: unordered/non-atomic");
+  case Monotonic:
+  case Acquire:
+    return nullptr; // Nothing to do
+  case SequentiallyConsistent:
+    if (!IsStore)
+      return nullptr; // Nothing to do
+    /*FALLTHROUGH*/
+  case Release:
+  case AcquireRelease:
+    if (Subtarget->isSwift())
+      return makeDMB(Builder, ARM_MB::ISHST);
+    // FIXME: add a comment with a link to documentation justifying this.
+    else
+      return makeDMB(Builder, ARM_MB::ISH);
+  }
+  llvm_unreachable("Unknown fence ordering in emitLeadingFence");
+}
+
+Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+                                          AtomicOrdering Ord, bool IsStore,
+                                          bool IsLoad) const {
+  if (!getInsertFencesForAtomic())
+    return nullptr;
+
+  switch (Ord) {
+  case NotAtomic:
+  case Unordered:
+    llvm_unreachable("Invalid fence: unordered/not-atomic");
+  case Monotonic:
+  case Release:
+    return nullptr; // Nothing to do
+  case Acquire:
+  case AcquireRelease:
+  case SequentiallyConsistent:
+    return makeDMB(Builder, ARM_MB::ISH);
+  }
+  llvm_unreachable("Unknown fence ordering in emitTrailingFence");
+}
+
+// Loads and stores less than 64-bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
+// anything for those.
+bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
+  return (Size == 64) && !Subtarget->isMClass();
+}
+
+// Loads and stores less than 64-bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
+// anything for those.
+// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
+// guarantee, see DDI0406C ARM architecture reference manual,
+// sections A8.8.72-74 LDRD)
+bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
+  return (Size == 64) && !Subtarget->isMClass();
+}
+
+// For the real atomic operations, we have ldrex/strex up to 32 bits,
+// and up to 64 bits on the non-M profiles
+bool ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  return Size <= (Subtarget->isMClass() ? 32U : 64U);
+}
+
+// This has so far only been implemented for MachO.
+bool ARMTargetLowering::useLoadStackGuardNode() const {
+  return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO;
+}
+
+bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                                  unsigned &Cost) const {
+  // If we do not have NEON, vector types are not natively supported.
+  if (!Subtarget->hasNEON())
+    return false;
+
+  // Floating point values and vector values map to the same register file.
+  // Therefore, although we could do a store extract of a vector type, this is
+  // better to leave at float as we have more freedom in the addressing mode for
+  // those.
+  if (VectorTy->isFPOrFPVectorTy())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
+  // We can do a store + vector extract on any vector that fits perfectly in a D
+  // or Q register.
+  if (BitWidth == 64 || BitWidth == 128) {
+    Cost = 0;
+    return true;
+  }
+  return false;
 }
 
 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                          AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
-  bool IsAcquire =
-      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsAcquire = isAtLeastAcquire(Ord);
 
   // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
   // intrinsic must return {i32, i32} and we have to recombine them into a
@@ -10835,8 +11186,7 @@
                                                Value *Addr,
                                                AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  bool IsRelease =
-      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+  bool IsRelease = isAtLeastRelease(Ord);
 
   // Since the intrinsics must have legal type, the i64 intrinsics take two
   // parameters: "i32, i32". We must marshal Val into the appropriate form
@@ -10934,6 +11284,6 @@
   HABaseType Base = HA_UNKNOWN;
   uint64_t Members = 0;
   bool result = isHomogeneousAggregate(Ty, Base, Members);
-  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump(); dbgs() << "\n");
+  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump());
   return result;
 }

diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 1ace0f3..89b0c31 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMISELLOWERING_H
-#define ARMISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
+#define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
 
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -232,7 +232,7 @@
 
   class ARMTargetLowering : public TargetLowering {
   public:
-    explicit ARMTargetLowering(TargetMachine &TM);
+    explicit ARMTargetLowering(const TargetMachine &TM);
 
     unsigned getJumpTableEncoding() const override;
 
@@ -266,11 +266,12 @@
 
     bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;
 
-    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// allowsMisalignedMemoryAccesses - Returns true if the target allows
     /// unaligned memory accesses of the specified type. Returns whether it
     /// is "fast" by reference in the second argument.
-    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
-                                       bool *Fast) const override;
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                        unsigned Align,
+                                        bool *Fast) const override;
 
     EVT getOptimalMemOpType(uint64_t Size,
                             unsigned DstAlign, unsigned SrcAlign,
@@ -391,12 +392,26 @@
     bool functionArgumentNeedsConsecutiveRegisters(
         Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
 
+    bool hasLoadLinkedStoreConditional() const override;
+    Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
     Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                           AtomicOrdering Ord) const override;
     Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                 Value *Addr, AtomicOrdering Ord) const override;
 
-    bool shouldExpandAtomicInIR(Instruction *Inst) const override;
+    Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                          bool IsStore, bool IsLoad) const override;
+    Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                           bool IsStore, bool IsLoad) const override;
+
+    bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+    bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+    bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+    bool useLoadStackGuardNode() const override;
+
+    bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                   unsigned &Cost) const override;
 
   protected:
     std::pair<const TargetRegisterClass*, uint8_t>
@@ -473,6 +488,10 @@
     SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
 
     unsigned getRegisterByName(const char* RegName, EVT VT) const override;
 
@@ -562,6 +581,9 @@
 
     bool mayBeEmittedAsTailCall(CallInst *CI) const override;
 
+    SDValue getCMOV(SDLoc dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
+                    SDValue ARMcc, SDValue CCR, SDValue Cmp,
+                    SelectionDAG &DAG) const;
     SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                       SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const;
     SDValue getVFPCmp(SDValue LHS, SDValue RHS,

diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 59e9260..7d27cf3 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td

@@ -203,6 +203,16 @@
   let ParserMatchClass = MSRMaskOperand;
 }
 
+def BankedRegOperand : AsmOperandClass {
+  let Name = "BankedReg";
+  let ParserMethod = "parseBankedRegOperand";
+}
+def banked_reg : Operand<i32> {
+  let PrintMethod = "printBankedRegOperand";
+  let DecoderMethod = "DecodeBankedReg";
+  let ParserMatchClass = BankedRegOperand;
+}
+
 // Shift Right Immediate - A shift right immediate is encoded differently from
 // other shift immediates. The imm6 field is encoded like so:
 //

diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index f235ac2..17d1ffa 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp

@@ -90,6 +90,49 @@
   return 0;
 }
 
+void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
+                                        Reloc::Model RM) const {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const ARMSubtarget &Subtarget = MF.getTarget().getSubtarget<ARMSubtarget>();
+
+  if (!Subtarget.useMovt(MF)) {
+    if (RM == Reloc::PIC_)
+      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12, RM);
+    else
+      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12, RM);
+    return;
+  }
+
+  if (RM != Reloc::PIC_) {
+    expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12, RM);
+    return;
+  }
+
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+
+  if (!Subtarget.GVIsIndirectSymbol(GV, RM)) {
+    expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12, RM);
+    return;
+  }
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Reg = MI->getOperand(0).getReg();
+  MachineInstrBuilder MIB;
+
+  MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
+            .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
+  unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
+      MachinePointerInfo::getGOT(), Flag, 4, 4);
+  MIB.addMemOperand(MMO);
+  MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
+  MIB.addReg(Reg, RegState::Kill).addImm(0);
+  MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+  AddDefaultPred(MIB);
+}
+
 namespace {
   /// ARMCGBR - Create Global Base Reg pass. This initializes the PIC
   /// global base register for ARM ELF.
@@ -113,8 +156,9 @@
       ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
           *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj);
 
-      unsigned Align = TM->getDataLayout()
-          ->getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
+      unsigned Align =
+          TM->getSubtargetImpl()->getDataLayout()->getPrefTypeAlignment(
+              Type::getInt32PtrTy(*Context));
       unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align);
 
       MachineBasicBlock &FirstMBB = MF.front();
@@ -124,7 +168,7 @@
           MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
       unsigned Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ?
                      ARM::t2LDRpci : ARM::LDRcp;
-      const TargetInstrInfo &TII = *TM->getInstrInfo();
+      const TargetInstrInfo &TII = *TM->getSubtargetImpl()->getInstrInfo();
       MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL,
                                         TII.get(Opc), TempReg)
                                 .addConstantPoolIndex(Idx);

diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
index b09958a..90f34ea 100644
--- a/lib/Target/ARM/ARMInstrInfo.h
+++ b/lib/Target/ARM/ARMInstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMINSTRUCTIONINFO_H
-#define ARMINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMINSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMINSTRINFO_H
 
 #include "ARMBaseInstrInfo.h"
 #include "ARMRegisterInfo.h"
@@ -37,6 +37,10 @@
   /// always be able to get register info as well (through this method).
   ///
   const ARMRegisterInfo &getRegisterInfo() const override { return RI; }
+
+private:
+  void expandLoadStackGuard(MachineBasicBlock::iterator MI,
+                            Reloc::Model RM) const override;
 };
 
 }

diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 2bb8976..3177114 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td

@@ -241,6 +241,9 @@
 def HasMP            : Predicate<"Subtarget->hasMPExtension()">,
                                  AssemblerPredicate<"FeatureMP",
                                                     "mp-extensions">;
+def HasVirtualization: Predicate<"false">,
+                                 AssemblerPredicate<"FeatureVirtualization",
+                                                   "virtualization-extensions">;
 def HasTrustZone     : Predicate<"Subtarget->hasTrustZone()">,
                                  AssemblerPredicate<"FeatureTrustZone",
                                                     "TrustZone">;
@@ -633,6 +636,8 @@
   let ParserMatchClass = Imm32AsmOperand;
 }
 
+def imm8_or_16 : ImmLeaf<i32, [{ return Imm == 8 || Imm == 16;}]>;
+
 /// imm1_7 predicate - Immediate in the range [1,7].
 def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; }
 def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
@@ -1961,7 +1966,7 @@
 }
 
 def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt",
-             []>, Requires<[IsARM, HasV7]> {
+             [(int_arm_dbg imm0_15:$opt)]>, Requires<[IsARM, HasV7]> {
   bits<4> opt;
   let Inst{27-4} = 0b001100100000111100001111;
   let Inst{3-0} = opt;
@@ -2708,7 +2713,8 @@
   def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
                             (ins GPR:$Rt, addrmode_imm12_pre:$addr), IndexModePre,
                             StFrm, iii,
-                            opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+                            opc, "\t$Rt, $addr!",
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
     bits<17> addr;
     let Inst{25} = 0;
     let Inst{23}    = addr{12};     // U (add = ('U' == 1))
@@ -2720,7 +2726,8 @@
   def _PRE_REG  : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
                       (ins GPR:$Rt, ldst_so_reg:$addr),
                       IndexModePre, StFrm, iir,
-                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+                      opc, "\t$Rt, $addr!",
+                      "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
     bits<17> addr;
     let Inst{25} = 1;
     let Inst{23}    = addr{12};    // U (add = ('U' == 1))
@@ -2733,7 +2740,7 @@
                 (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
                 IndexModePost, StFrm, iir,
                 opc, "\t$Rt, $addr, $offset",
-                "$addr.base = $Rn_wb", []> {
+                "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
      // {12}     isAdd
      // {11-0}   imm12/Rm
      bits<14> offset;
@@ -2751,7 +2758,7 @@
                 (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
                 IndexModePost, StFrm, iii,
                 opc, "\t$Rt, $addr, $offset",
-                "$addr.base = $Rn_wb", []> {
+                "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
     // {12}     isAdd
     // {11-0}   imm12/Rm
     bits<14> offset;
@@ -2828,7 +2835,8 @@
 def STRH_PRE  : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
                            (ins GPR:$Rt, addrmode3_pre:$addr), IndexModePre,
                            StMiscFrm, IIC_iStore_bh_ru,
-                           "strh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+                           "strh", "\t$Rt, $addr!",
+                           "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
   bits<14> addr;
   let Inst{23}    = addr{8};      // U bit
   let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
@@ -2841,7 +2849,8 @@
 def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
                        (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset),
                        IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
-                       "strh", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+                       "strh", "\t$Rt, $addr, $offset",
+                       "$addr.base = $Rn_wb,@earlyclobber $Rn_wb",
                    [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
                                                       addr_offset_none:$addr,
                                                       am3offset:$offset))]> {
@@ -3417,7 +3426,8 @@
 def : ARMPat<(ARMadde GPR:$src, so_imm_not:$imm, CPSR),
              (SBCri   GPR:$src, so_imm_not:$imm)>;
 def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR),
-             (SBCrr   GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>;
+             (SBCrr   GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>,
+             Requires<[IsARM, HasV6T2]>;
 
 // Note: These are implemented in C++ code, because they have to generate
 // ADD/SUBrs instructions, which use a complex pattern that a xform function
@@ -3932,14 +3942,12 @@
 
   def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
               IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
-              [(set GPR:$Rd, (sra (opnode GPR:$Rn,
-                                    (sext_inreg GPR:$Rm, i16)), (i32 16)))]>,
+              []>,
            Requires<[IsARM, HasV5TE]>;
 
   def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
               IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
-              [(set GPR:$Rd, (sra (opnode GPR:$Rn,
-                                    (sra GPR:$Rm, (i32 16))), (i32 16)))]>,
+              []>,
             Requires<[IsARM, HasV5TE]>;
 }
 
@@ -3981,17 +3989,13 @@
   def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
               (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
-              [(set GPRnopc:$Rd,
-                    (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
-                                  (sext_inreg GPRnopc:$Rm, i16)), (i32 16))))]>,
+              []>,
            Requires<[IsARM, HasV5TE, UseMulOps]>;
 
   def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
               (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
-              [(set GPRnopc:$Rd,
-                 (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
-                                    (sra GPRnopc:$Rm, (i32 16))), (i32 16))))]>,
+              []>,
             Requires<[IsARM, HasV5TE, UseMulOps]>;
   }
 }
@@ -4111,7 +4115,7 @@
 //  Misc. Arithmetic Instructions.
 //
 
-def CLZ  : AMiscA1I<0b000010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
+def CLZ  : AMiscA1I<0b00010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
               IIC_iUNAr, "clz", "\t$Rd, $Rm",
               [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>,
            Sched<[WriteALU]>;
@@ -4629,7 +4633,7 @@
 class acquiring_load<PatFrag base>
   : PatFrag<(ops node:$ptr), (base node:$ptr), [{
   AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  return Ordering == Acquire || Ordering == SequentiallyConsistent;
+  return isAtLeastAcquire(Ordering);
 }]>;
 
 def atomic_load_acquire_8  : acquiring_load<atomic_load_8>;
@@ -4639,7 +4643,7 @@
 class releasing_store<PatFrag base>
   : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
   AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  return Ordering == Release || Ordering == SequentiallyConsistent;
+  return isAtLeastRelease(Ordering);
 }]>;
 
 def atomic_store_release_8  : releasing_store<atomic_store_8>;
@@ -5060,12 +5064,31 @@
   let Unpredictable{11-0} = 0b110100001111;
 }
 
+// However, the MRS (banked register) system instruction (ARMv7VE) *does* have a
+// separate encoding (distinguished by bit 5).
+def MRSbanked : ABI<0b0001, (outs GPRnopc:$Rd), (ins banked_reg:$banked),
+                    NoItinerary, "mrs", "\t$Rd, $banked", []>,
+                Requires<[IsARM, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rd;
+
+  let Inst{23} = 0;
+  let Inst{22} = banked{5}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = banked{3-0};
+  let Inst{15-12} = Rd;
+  let Inst{11-9} = 0b001;
+  let Inst{8} = banked{4};
+  let Inst{7-0} = 0b00000000;
+}
+
 // Move from ARM core register to Special Register
 //
-// No need to have both system and application versions, the encodings are the
-// same and the assembly parser has no way to distinguish between them. The mask
-// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
-// the mask with the fields to be accessed in the special register.
+// No need to have both system and application versions of MSR (immediate) or
+// MSR (register), the encodings are the same and the assembly parser has no way
+// to distinguish between them. The mask operand contains the special register
+// (R Bit) in bit 4 and bits 3-0 contains the mask with the fields to be
+// accessed in the special register.
 def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
               "msr", "\t$mask, $Rn", []> {
   bits<5> mask;
@@ -5093,6 +5116,25 @@
   let Inst{11-0} = a;
 }
 
+// However, the MSR (banked register) system instruction (ARMv7VE) *does* have a
+// separate encoding (distinguished by bit 5).
+def MSRbanked : ABI<0b0001, (outs), (ins banked_reg:$banked, GPRnopc:$Rn),
+                    NoItinerary, "msr", "\t$banked, $Rn", []>,
+                Requires<[IsARM, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rn;
+
+  let Inst{23} = 0;
+  let Inst{22} = banked{5}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = banked{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-9} = 0b001;
+  let Inst{8} = banked{4};
+  let Inst{7-4} = 0b0000;
+  let Inst{3-0} = Rn;
+}
+
 // Dynamic stack allocation yields a _chkstk for Windows targets.  These calls
 // are needed to probe the stack when allocating more than
 // 4k bytes in one go. Touching the stack at 4K increments is necessary to
@@ -5278,11 +5320,6 @@
                  (SMULTB GPR:$a, GPR:$b)>;
 def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
                 (SMULTB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
-                      (i32 16)),
-                 (SMULWB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)),
-                 (SMULWB GPR:$a, GPR:$b)>;
 
 def : ARMV5MOPat<(add GPR:$acc,
                       (mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
@@ -5305,13 +5342,6 @@
 def : ARMV5MOPat<(add GPR:$acc,
                       (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
                  (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5MOPat<(add GPR:$acc,
-                      (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
-                           (i32 16))),
-                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5MOPat<(add GPR:$acc,
-                      (sra (mul GPR:$a, sext_16_node:$b), (i32 16))),
-                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
 
 
 // Pre-v7 uses MCR for synchronization barriers.
@@ -5591,3 +5621,8 @@
 // is discarded.
 def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
          ComplexDeprecationPredicate<"IT">;
+
+let mayLoad = 1, mayStore =1, hasSideEffects = 1 in
+def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
+                       NoItinerary,
+                       [(set GPR:$Rd, (int_arm_space imm:$size, GPR:$Rn))]>;

diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index c02bb3b..a0c627c 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td

@@ -34,6 +34,14 @@
   let PrintMethod = "printNEONModImmOperand";
   let ParserMatchClass = nImmSplatI32AsmOperand;
 }
+def nImmSplatNotI16AsmOperand : AsmOperandClass { let Name = "NEONi16splatNot"; }
+def nImmSplatNotI16 : Operand<i32> {
+  let ParserMatchClass = nImmSplatNotI16AsmOperand;
+}
+def nImmSplatNotI32AsmOperand : AsmOperandClass { let Name = "NEONi32splatNot"; }
+def nImmSplatNotI32 : Operand<i32> {
+  let ParserMatchClass = nImmSplatNotI32AsmOperand;
+}
 def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; }
 def nImmVMOVI32 : Operand<i32> {
   let PrintMethod = "printNEONModImmOperand";
@@ -4376,7 +4384,7 @@
 //   VQDMLSL  : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
 defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
                             "vqdmlsl", "s", null_frag>;
-defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", null_frag>;
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>;
 
 def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
                      (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
@@ -5429,7 +5437,7 @@
                           IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
                           [(set GPR:$R, (extractelt (v2i32 DPR:$V),
                                            imm:$lane))]>,
-                Requires<[HasNEON, HasFastVGETLNi32]> {
+                Requires<[HasVFP2, HasFastVGETLNi32]> {
   let Inst{21} = lane{0};
 }
 // def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
@@ -5497,8 +5505,12 @@
                           (ins DPR:$src1, GPR:$R, VectorIndex32:$lane),
                           IIC_VMOVISL, "vmov", "32", "$V$lane, $R",
                           [(set DPR:$V, (insertelt (v2i32 DPR:$src1),
-                                           GPR:$R, imm:$lane))]> {
+                                           GPR:$R, imm:$lane))]>,
+                Requires<[HasVFP2]> {
   let Inst{21} = lane{0};
+  // This instruction is equivalent to
+  // $V = INSERT_SUBREG $src1, $R, translateImmToSubIdx($imm)
+  let isInsertSubreg = 1;
 }
 }
 def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
@@ -6635,6 +6647,16 @@
                          (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
 defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
                          (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+// ... immediates
+def : NEONInstAlias<"vand${p}.i16 $Vd, $imm",
+                    (VBICiv4i16 DPR:$Vd, nImmSplatNotI16:$imm, pred:$p)>;
+def : NEONInstAlias<"vand${p}.i32 $Vd, $imm",
+                    (VBICiv2i32 DPR:$Vd, nImmSplatNotI32:$imm, pred:$p)>;
+def : NEONInstAlias<"vand${p}.i16 $Vd, $imm",
+                    (VBICiv8i16 QPR:$Vd, nImmSplatNotI16:$imm, pred:$p)>;
+def : NEONInstAlias<"vand${p}.i32 $Vd, $imm",
+                    (VBICiv4i32 QPR:$Vd, nImmSplatNotI32:$imm, pred:$p)>;
+
 
 // VLD1 single-lane pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.

diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index e17f73a..a867844 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td

@@ -311,7 +311,7 @@
 }
 
 def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
-                  []>, T1Encoding<0b101101>, Deprecated<HasV8Ops> {
+                  []>, T1Encoding<0b101101>, Requires<[IsNotMClass]>, Deprecated<HasV8Ops> {
   bits<1> end;
   // A8.6.156
   let Inst{9-5} = 0b10010;
@@ -360,6 +360,14 @@
   let DecoderMethod = "DecodeThumbAddSpecialReg";
 }
 
+// Thumb1 frame lowering is rather fragile; we hope to be able to use
+// tADDrSPi, but we may need to insert a sequence that clobbers CPSR.
+def tADDframe : PseudoInst<(outs tGPR:$dst), (ins i32imm:$base, i32imm:$offset),
+                           NoItinerary, []>,
+                Requires<[IsThumb, IsThumb1Only]> {
+  let Defs = [CPSR];
+}
+
 // ADD sp, sp, #<imm7>
 def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
                      IIC_iALUi, "add", "\t$Rdn, $imm", []>,
@@ -466,7 +474,7 @@
                  (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br,
                    "blx${p}\t$func",
                    [(ARMcall tglobaladdr:$func)]>,
-              Requires<[IsThumb, HasV5T]>, Sched<[WriteBrL]> {
+              Requires<[IsThumb, HasV5T, IsNotMClass]>, Sched<[WriteBrL]> {
     bits<24> func;
     let Inst{26} = func{23};
     let Inst{25-16} = func{20-11};
@@ -1355,7 +1363,7 @@
       Requires<[IsThumb]>;
 
 def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>,
-      Requires<[IsThumb, HasV5T]>;
+      Requires<[IsThumb, HasV5T, IsNotMClass]>;
 
 // Indirect calls to ARM routines
 def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>,

diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 85e9351..807c252 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td

@@ -1262,15 +1262,15 @@
 
 // Loads with zero extension
 defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      GPR, UnOpFrag<(zextloadi16 node:$Src)>>;
+                      GPRnopc, UnOpFrag<(zextloadi16 node:$Src)>>;
 defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      GPR, UnOpFrag<(zextloadi8  node:$Src)>>;
+                      GPRnopc, UnOpFrag<(zextloadi8  node:$Src)>>;
 
 // Loads with sign extension
 defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      GPR, UnOpFrag<(sextloadi16 node:$Src)>>;
+                      GPRnopc, UnOpFrag<(sextloadi16 node:$Src)>>;
 defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      GPR, UnOpFrag<(sextloadi8  node:$Src)>>;
+                      GPRnopc, UnOpFrag<(sextloadi8  node:$Src)>>;
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 // Load doubleword
@@ -1973,6 +1973,16 @@
                         BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
 def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
 
+// A simple right-shift can also be used in most cases (the exception is the
+// SXTH operations with a rotate of 24: there the non-contiguous bits are
+// relevant).
+def : Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, rot_imm:$rot), i8)),
+          (t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>,
+      Requires<[HasT2ExtractPack, IsThumb2]>;
+def : Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, imm8_or_16:$rot), i16)),
+          (t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>,
+      Requires<[HasT2ExtractPack, IsThumb2]>;
+
 // Zero extenders
 
 let AddedComplexity = 16 in {
@@ -1999,8 +2009,16 @@
 def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
                            BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
 def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
+
+def : Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot), 0xFF)),
+          (t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>,
+      Requires<[HasT2ExtractPack, IsThumb2]>;
+def : Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)),
+          (t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>,
+      Requires<[HasT2ExtractPack, IsThumb2]>;
 }
 
+
 //===----------------------------------------------------------------------===//
 //  Arithmetic Instructions.
 //
@@ -2708,8 +2726,7 @@
 
   def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
               !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
-              [(set rGPR:$Rd, (sra (opnode rGPR:$Rn,
-                                    (sext_inreg rGPR:$Rm, i16)), (i32 16)))]>,
+              []>,
           Requires<[IsThumb2, HasThumb2DSP]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
@@ -2721,8 +2738,7 @@
 
   def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
               !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
-              [(set rGPR:$Rd, (sra (opnode rGPR:$Rn,
-                                    (sra rGPR:$Rm, (i32 16))), (i32 16)))]>,
+              []>,
           Requires<[IsThumb2, HasThumb2DSP]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
@@ -2791,8 +2807,7 @@
   def WB : T2FourReg<
         (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
               !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
-              [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
-                                    (sext_inreg rGPR:$Rm, i16)), (i32 16))))]>,
+              []>,
            Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
@@ -2804,8 +2819,7 @@
   def WT : T2FourReg<
         (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
               !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
-              [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
-                                      (sra rGPR:$Rm, (i32 16))), (i32 16))))]>,
+              []>,
            Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
@@ -3291,7 +3305,8 @@
                          (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "ldrexd", "\t$Rt, $Rt2, $addr", "",
-                         [], {?, ?, ?, ?}> {
+                         [], {?, ?, ?, ?}>,
+               Requires<[IsThumb2, IsNotMClass]> {
   bits<4> Rt2;
   let Inst{11-8} = Rt2;
 }
@@ -3367,7 +3382,8 @@
                          (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
-                         {?, ?, ?, ?}> {
+                         {?, ?, ?, ?}>,
+               Requires<[IsThumb2, IsNotMClass]> {
   bits<4> Rt2;
   let Inst{11-8} = Rt2;
 }
@@ -3614,7 +3630,7 @@
 // Branch and Exchange Jazelle -- for disassembly only
 // Rm = Inst{19-16}
 def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []>,
-    Sched<[WriteBr]> {
+    Sched<[WriteBr]>, Requires<[IsThumb2, IsNotMClass, PreV8]> {
   bits<4> func;
   let Inst{31-27} = 0b11110;
   let Inst{26} = 0;
@@ -3656,7 +3672,8 @@
 // operands, create 3 versions of the same instruction. Once there's a clean
 // framework to represent optional operands, change this behavior.
 class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
-            !strconcat("cps", asm_op), []> {
+            !strconcat("cps", asm_op), []>,
+          Requires<[IsThumb2, IsNotMClass]> {
   bits<2> imod;
   bits<3> iflags;
   bits<5> mode;
@@ -3702,7 +3719,8 @@
   let Predicates = [IsThumb2, HasV8];
 }
 
-def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
+def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt",
+                [(int_arm_dbg imm0_15:$opt)]> {
   bits<4> opt;
   let Inst{31-20} = 0b111100111010;
   let Inst{19-16} = 0b1111;
@@ -3739,7 +3757,8 @@
 
 class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
-  : T2I<oops, iops, itin, opc, asm, pattern> {
+  : T2I<oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsThumb2,IsNotMClass]> {
   bits<5> mode;
   let Inst{31-25} = 0b1110100;
   let Inst{24-23} = Op;
@@ -3770,7 +3789,8 @@
 // Return From Exception is a system instruction.
 class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin,
           string opc, string asm, list<dag> pattern>
-  : T2I<oops, iops, itin, opc, asm, pattern> {
+  : T2I<oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsThumb2,IsNotMClass]> {
   let Inst{31-20} = op31_20{11-0};
 
   bits<4> Rn;
@@ -3797,7 +3817,7 @@
 def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
                         "subs", "\tpc, lr, $imm",
                         [(ARMintretflag imm0_255:$imm)]>,
-                   Requires<[IsThumb2]> {
+                   Requires<[IsThumb2,IsNotMClass]> {
   let Inst{31-8} = 0b111100111101111010001111;
 
   bits<8> imm;
@@ -3941,10 +3961,10 @@
 defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl">;
 defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc">;
 defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl">;
-defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">, Requires<[PreV8]>;
-defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">, Requires<[PreV8]>;
-defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">, Requires<[PreV8]>;
-defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">, Requires<[PreV8]>;
+defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">, Requires<[PreV8,IsThumb2]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">, Requires<[PreV8,IsThumb2]>;
+defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">, Requires<[PreV8,IsThumb2]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">, Requires<[PreV8,IsThumb2]>;
 
 
 //===----------------------------------------------------------------------===//
@@ -3960,7 +3980,7 @@
   bits<4> Rd;
   let Inst{31-12} = 0b11110011111011111000;
   let Inst{11-8} = Rd;
-  let Inst{7-0} = 0b0000;
+  let Inst{7-0} = 0b00000000;
 }
 
 def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
@@ -3970,22 +3990,41 @@
   bits<4> Rd;
   let Inst{31-12} = 0b11110011111111111000;
   let Inst{11-8} = Rd;
-  let Inst{7-0} = 0b0000;
+  let Inst{7-0} = 0b00000000;
 }
 
+def t2MRSbanked : T2I<(outs rGPR:$Rd), (ins banked_reg:$banked),
+                      NoItinerary, "mrs", "\t$Rd, $banked", []>,
+                  Requires<[IsThumb, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rd;
+
+  let Inst{31-21} = 0b11110011111;
+  let Inst{20} = banked{5}; // R bit
+  let Inst{19-16} = banked{3-0};
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8} = Rd;
+  let Inst{7-5} = 0b001;
+  let Inst{4} = banked{4};
+  let Inst{3-0} = 0b0000;
+}
+
+
 // M class MRS.
 //
 // This MRS has a mask field in bits 7-0 and can take more values than
 // the A/R class (a full msr_mask).
-def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary,
-                  "mrs", "\t$Rd, $mask", []>,
+def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$SYSm), NoItinerary,
+                  "mrs", "\t$Rd, $SYSm", []>,
               Requires<[IsThumb,IsMClass]> {
   bits<4> Rd;
-  bits<8> mask;
+  bits<8> SYSm;
   let Inst{31-12} = 0b11110011111011111000;
   let Inst{11-8} = Rd;
-  let Inst{19-16} = 0b1111;
-  let Inst{7-0} = mask;
+  let Inst{7-0} = SYSm;
+
+  let Unpredictable{20-16} = 0b11111;
+  let Unpredictable{13} = 0b1;
 }
 
 
@@ -4010,6 +4049,25 @@
   let Inst{7-0}   = 0;
 }
 
+// However, the MSR (banked register) system instruction (ARMv7VE) *does* have a
+// separate encoding (distinguished by bit 5).
+def t2MSRbanked : T2I<(outs), (ins banked_reg:$banked, rGPR:$Rn),
+                      NoItinerary, "msr", "\t$banked, $Rn", []>,
+                  Requires<[IsThumb, HasVirtualization]> {
+  bits<6> banked;
+  bits<4> Rn;
+
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20} = banked{5}; // R bit
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8} = banked{3-0};
+  let Inst{7-5} = 0b001;
+  let Inst{4} = banked{4};
+  let Inst{3-0} = 0b0000;
+}
+
+
 // M class MSR.
 //
 // Move from ARM core register to Special Register
@@ -4022,7 +4080,13 @@
   let Inst{20}    = 0b0;
   let Inst{19-16} = Rn;
   let Inst{15-12} = 0b1000;
-  let Inst{11-0}  = SYSm;
+  let Inst{11-10} = SYSm{11-10};
+  let Inst{9-8}   = 0b00;
+  let Inst{7-0}   = SYSm{7-0};
+
+  let Unpredictable{20} = 0b1;
+  let Unpredictable{13} = 0b1;
+  let Unpredictable{9-8} = 0b11;
 }
 
 

diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 1d7802a..d78f2ac 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td

@@ -515,6 +515,8 @@
   let Inst{5}     = Sm{0};
   let Inst{15-12} = Dd{3-0};
   let Inst{22}    = Dd{4};
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 // Special case encoding: bits 11-8 is 0b1011.
@@ -551,12 +553,6 @@
                  /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>;
 
-def : Pat<(f32_to_f16 SPR:$a),
-          (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
-
-def : Pat<(f16_to_f32 GPR:$a),
-          (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
-
 def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>;
@@ -619,26 +615,42 @@
   let Inst{5}     = Dm{4};
 }
 
-multiclass vcvt_inst<string opc, bits<2> rm> {
+def : Pat<(fp_to_f16 SPR:$a),
+          (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+
+def : Pat<(fp_to_f16 (f64 DPR:$a)),
+          (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
+
+def : Pat<(f16_to_fp GPR:$a),
+          (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+def : Pat<(f64 (f16_to_fp GPR:$a)),
+          (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+multiclass vcvt_inst<string opc, bits<2> rm,
+                     SDPatternOperator node = null_frag> {
   let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
     def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
                     (outs SPR:$Sd), (ins SPR:$Sm),
                     NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
-                    []>, Requires<[HasFPARMv8]> {
+                    [(set SPR:$Sd, (arm_ftosi (node SPR:$Sm)))]>,
+                    Requires<[HasFPARMv8]> {
       let Inst{17-16} = rm;
     }
 
     def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
                     (outs SPR:$Sd), (ins SPR:$Sm),
                     NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"),
-                    []>, Requires<[HasFPARMv8]> {
+                    [(set SPR:$Sd, (arm_ftoui (node SPR:$Sm)))]>,
+                    Requires<[HasFPARMv8]> {
       let Inst{17-16} = rm;
     }
 
     def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
                     (outs SPR:$Sd), (ins DPR:$Dm),
                     NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"),
-                    []>, Requires<[HasFPARMv8, HasDPVFP]> {
+                    [(set SPR:$Sd, (arm_ftosi (f64 (node (f64 DPR:$Dm)))))]>,
+                    Requires<[HasFPARMv8, HasDPVFP]> {
       bits<5> Dm;
 
       let Inst{17-16} = rm;
@@ -652,7 +664,8 @@
     def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
                     (outs SPR:$Sd), (ins DPR:$Dm),
                     NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"),
-                    []>, Requires<[HasFPARMv8, HasDPVFP]> {
+                    [(set SPR:$Sd, (arm_ftoui (f64 (node (f64 DPR:$Dm)))))]>,
+                    Requires<[HasFPARMv8, HasDPVFP]> {
       bits<5> Dm;
 
       let Inst{17-16} = rm;
@@ -665,10 +678,10 @@
   }
 }
 
-defm VCVTA : vcvt_inst<"a", 0b00>;
+defm VCVTA : vcvt_inst<"a", 0b00, frnd>;
 defm VCVTN : vcvt_inst<"n", 0b01>;
-defm VCVTP : vcvt_inst<"p", 0b10>;
-defm VCVTM : vcvt_inst<"m", 0b11>;
+defm VCVTP : vcvt_inst<"p", 0b10, fceil>;
+defm VCVTM : vcvt_inst<"m", 0b11, ffloor>;
 
 def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
                   (outs DPR:$Dd), (ins DPR:$Dm),
@@ -684,18 +697,20 @@
   let D = VFPNeonA8Domain;
 }
 
-multiclass vrint_inst_zrx<string opc, bit op, bit op2> {
+multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
   def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
                (outs SPR:$Sd), (ins SPR:$Sm),
                NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
-               []>, Requires<[HasFPARMv8]> {
+               [(set (f32 SPR:$Sd), (node (f32 SPR:$Sm)))]>,
+               Requires<[HasFPARMv8]> {
     let Inst{7} = op2;
     let Inst{16} = op;
   }
   def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0,
                 (outs DPR:$Dd), (ins DPR:$Dm),
                 NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm",
-                []>, Requires<[HasFPARMv8, HasDPVFP]> {
+                [(set (f64 DPR:$Dd), (node (f64 DPR:$Dm)))]>,
+                Requires<[HasFPARMv8, HasDPVFP]> {
     let Inst{7} = op2;
     let Inst{16} = op;
   }
@@ -708,22 +723,25 @@
         Requires<[HasFPARMv8,HasDPVFP]>;
 }
 
-defm VRINTZ : vrint_inst_zrx<"z", 0, 1>;
-defm VRINTR : vrint_inst_zrx<"r", 0, 0>;
-defm VRINTX : vrint_inst_zrx<"x", 1, 0>;
+defm VRINTZ : vrint_inst_zrx<"z", 0, 1, ftrunc>;
+defm VRINTR : vrint_inst_zrx<"r", 0, 0, fnearbyint>;
+defm VRINTX : vrint_inst_zrx<"x", 1, 0, frint>;
 
-multiclass vrint_inst_anpm<string opc, bits<2> rm> {
+multiclass vrint_inst_anpm<string opc, bits<2> rm,
+                           SDPatternOperator node = null_frag> {
   let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
     def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
                    (outs SPR:$Sd), (ins SPR:$Sm),
                    NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
-                   []>, Requires<[HasFPARMv8]> {
+                   [(set (f32 SPR:$Sd), (node (f32 SPR:$Sm)))]>,
+                   Requires<[HasFPARMv8]> {
       let Inst{17-16} = rm;
     }
     def D : ADuInp<0b11101, 0b11, 0b1000, 0b01, 0,
                    (outs DPR:$Dd), (ins DPR:$Dm),
                    NoItinerary, !strconcat("vrint", opc, ".f64\t$Dd, $Dm"),
-                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+                   [(set (f64 DPR:$Dd), (node (f64 DPR:$Dm)))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]> {
       let Inst{17-16} = rm;
     }
   }
@@ -736,10 +754,10 @@
         Requires<[HasFPARMv8,HasDPVFP]>;
 }
 
-defm VRINTA : vrint_inst_anpm<"a", 0b00>;
+defm VRINTA : vrint_inst_anpm<"a", 0b00, frnd>;
 defm VRINTN : vrint_inst_anpm<"n", 0b01>;
-defm VRINTP : vrint_inst_anpm<"p", 0b10>;
-defm VRINTM : vrint_inst_anpm<"m", 0b11>;
+defm VRINTP : vrint_inst_anpm<"p", 0b10, fceil>;
+defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
 
 def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
                   (outs DPR:$Dd), (ins DPR:$Dm),
@@ -830,6 +848,11 @@
   // Some single precision VFP instructions may be executed on both NEON and VFP
   // pipelines.
   let D = VFPNeonDomain;
+
+  // This instruction is equivalent to
+  // $Rt = EXTRACT_SUBREG $Dm, ssub_0
+  // $Rt2 = EXTRACT_SUBREG $Dm, ssub_1
+  let isExtractSubreg = 1;
 }
 
 def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
@@ -878,6 +901,10 @@
   // Some single precision VFP instructions may be executed on both NEON and VFP
   // pipelines.
   let D = VFPNeonDomain;
+
+  // This instruction is equivalent to
+  // $Dm = REG_SEQUENCE $Rt, ssub_0, $Rt2, ssub_1
+  let isRegSequence = 1;
 }
 
 let neverHasSideEffects = 1 in

diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
deleted file mode 100644
index 6d1114d..0000000
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ /dev/null

@@ -1,344 +0,0 @@
-//===-- ARMJITInfo.cpp - Implement the JIT interfaces for the ARM target --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the JIT interfaces for the ARM target.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARMJITInfo.h"
-#include "ARMConstantPoolValue.h"
-#include "ARMMachineFunctionInfo.h"
-#include "ARMRelocations.h"
-#include "MCTargetDesc/ARMBaseInfo.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Memory.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdlib>
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
-  report_fatal_error("ARMJITInfo::replaceMachineCodeForFunction");
-}
-
-/// JITCompilerFunction - This contains the address of the JIT function used to
-/// compile a function lazily.
-static TargetJITInfo::JITCompilerFn JITCompilerFunction;
-
-// Get the ASMPREFIX for the current host.  This is often '_'.
-#ifndef __USER_LABEL_PREFIX__
-#define __USER_LABEL_PREFIX__
-#endif
-#define GETASMPREFIX2(X) #X
-#define GETASMPREFIX(X) GETASMPREFIX2(X)
-#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
-
-// CompilationCallback stub - We can't use a C function with inline assembly in
-// it, because the prolog/epilog inserted by GCC won't work for us. (We need
-// to preserve more context and manipulate the stack directly).  Instead,
-// write our own wrapper, which does things our way, so we have complete
-// control over register saving and restoring.
-extern "C" {
-#if defined(__arm__)
-  void ARMCompilationCallback();
-  asm(
-    ".text\n"
-    ".align 2\n"
-    ".globl " ASMPREFIX "ARMCompilationCallback\n"
-    ASMPREFIX "ARMCompilationCallback:\n"
-    // Save caller saved registers since they may contain stuff
-    // for the real target function right now. We have to act as if this
-    // whole compilation callback doesn't exist as far as the caller is
-    // concerned, so we can't just preserve the callee saved regs.
-    "stmdb sp!, {r0, r1, r2, r3, lr}\n"
-#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
-    "vstmdb sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
-#endif
-    // The LR contains the address of the stub function on entry.
-    // pass it as the argument to the C part of the callback
-    "mov  r0, lr\n"
-    "sub  sp, sp, #4\n"
-    // Call the C portion of the callback
-    "bl   " ASMPREFIX "ARMCompilationCallbackC\n"
-    "add  sp, sp, #4\n"
-    // Restoring the LR to the return address of the function that invoked
-    // the stub and de-allocating the stack space for it requires us to
-    // swap the two saved LR values on the stack, as they're backwards
-    // for what we need since the pop instruction has a pre-determined
-    // order for the registers.
-    //      +--------+
-    //   0  | LR     | Original return address
-    //      +--------+
-    //   1  | LR     | Stub address (start of stub)
-    // 2-5  | R3..R0 | Saved registers (we need to preserve all regs)
-    // 6-20 | D0..D7 | Saved VFP registers
-    //      +--------+
-    //
-#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
-    // Restore VFP caller-saved registers.
-    "vldmia sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
-#endif
-    //
-    //      We need to exchange the values in slots 0 and 1 so we can
-    //      return to the address in slot 1 with the address in slot 0
-    //      restored to the LR.
-    "ldr  r0, [sp,#20]\n"
-    "ldr  r1, [sp,#16]\n"
-    "str  r1, [sp,#20]\n"
-    "str  r0, [sp,#16]\n"
-    // Return to the (newly modified) stub to invoke the real function.
-    // The above twiddling of the saved return addresses allows us to
-    // deallocate everything, including the LR the stub saved, with two
-    // updating load instructions.
-    "ldmia  sp!, {r0, r1, r2, r3, lr}\n"
-    "ldr    pc, [sp], #4\n"
-      );
-#else  // Not an ARM host
-  void ARMCompilationCallback() {
-    llvm_unreachable("Cannot call ARMCompilationCallback() on a non-ARM arch!");
-  }
-#endif
-}
-
-/// ARMCompilationCallbackC - This is the target-specific function invoked
-/// by the function stub when we did not know the real target of a call.
-/// This function must locate the start of the stub or call site and pass
-/// it into the JIT compiler function.
-extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) {
-  // Get the address of the compiled code for this function.
-  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)StubAddr);
-
-  // Rewrite the call target... so that we don't end up here every time we
-  // execute the call. We're replacing the first two instructions of the
-  // stub with:
-  //   ldr pc, [pc,#-4]
-  //   <addr>
-  if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) {
-    llvm_unreachable("ERROR: Unable to mark stub writable");
-  }
-  *(intptr_t *)StubAddr = 0xe51ff004;  // ldr pc, [pc, #-4]
-  *(intptr_t *)(StubAddr+4) = NewVal;
-  if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) {
-    llvm_unreachable("ERROR: Unable to mark stub executable");
-  }
-}
-
-TargetJITInfo::LazyResolverFn
-ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) {
-  JITCompilerFunction = F;
-  return ARMCompilationCallback;
-}
-
-void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr,
-                                             JITCodeEmitter &JCE) {
-  uint8_t Buffer[4];
-  uint8_t *Cur = Buffer;
-  MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)Ptr);
-  void *PtrAddr = JCE.allocIndirectGV(
-      GV, Buffer, sizeof(Buffer), /*Alignment=*/4);
-  addIndirectSymAddr(Ptr, (intptr_t)PtrAddr);
-  return PtrAddr;
-}
-
-TargetJITInfo::StubLayout ARMJITInfo::getStubLayout() {
-  // The stub contains up to 3 4-byte instructions, aligned at 4 bytes, and a
-  // 4-byte address.  See emitFunctionStub for details.
-  StubLayout Result = {16, 4};
-  return Result;
-}
-
-void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
-                                   JITCodeEmitter &JCE) {
-  void *Addr;
-  // If this is just a call to an external function, emit a branch instead of a
-  // call.  The code is the same except for one bit of the last instruction.
-  if (Fn != (void*)(intptr_t)ARMCompilationCallback) {
-    // Branch to the corresponding function addr.
-    if (IsPIC) {
-      // The stub is 16-byte size and 4-aligned.
-      intptr_t LazyPtr = getIndirectSymAddr(Fn);
-      if (!LazyPtr) {
-        // In PIC mode, the function stub is loading a lazy-ptr.
-        LazyPtr= (intptr_t)emitGlobalValueIndirectSym((const GlobalValue*)F, Fn, JCE);
-        DEBUG(if (F)
-                errs() << "JIT: Indirect symbol emitted at [" << LazyPtr
-                       << "] for GV '" << F->getName() << "'\n";
-              else
-                errs() << "JIT: Stub emitted at [" << LazyPtr
-                       << "] for external function at '" << Fn << "'\n");
-      }
-      JCE.emitAlignment(4);
-      Addr = (void*)JCE.getCurrentPCValue();
-      if (!sys::Memory::setRangeWritable(Addr, 16)) {
-        llvm_unreachable("ERROR: Unable to mark stub writable");
-      }
-      JCE.emitWordLE(0xe59fc004);            // ldr ip, [pc, #+4]
-      JCE.emitWordLE(0xe08fc00c);            // L_func$scv: add ip, pc, ip
-      JCE.emitWordLE(0xe59cf000);            // ldr pc, [ip]
-      JCE.emitWordLE(LazyPtr - (intptr_t(Addr)+4+8));  // func - (L_func$scv+8)
-      sys::Memory::InvalidateInstructionCache(Addr, 16);
-      if (!sys::Memory::setRangeExecutable(Addr, 16)) {
-        llvm_unreachable("ERROR: Unable to mark stub executable");
-      }
-    } else {
-      // The stub is 8-byte size and 4-aligned.
-      JCE.emitAlignment(4);
-      Addr = (void*)JCE.getCurrentPCValue();
-      if (!sys::Memory::setRangeWritable(Addr, 8)) {
-        llvm_unreachable("ERROR: Unable to mark stub writable");
-      }
-      JCE.emitWordLE(0xe51ff004);    // ldr pc, [pc, #-4]
-      JCE.emitWordLE((intptr_t)Fn);  // addr of function
-      sys::Memory::InvalidateInstructionCache(Addr, 8);
-      if (!sys::Memory::setRangeExecutable(Addr, 8)) {
-        llvm_unreachable("ERROR: Unable to mark stub executable");
-      }
-    }
-  } else {
-    // The compilation callback will overwrite the first two words of this
-    // stub with indirect branch instructions targeting the compiled code.
-    // This stub sets the return address to restart the stub, so that
-    // the new branch will be invoked when we come back.
-    //
-    // Branch and link to the compilation callback.
-    // The stub is 16-byte size and 4-byte aligned.
-    JCE.emitAlignment(4);
-    Addr = (void*)JCE.getCurrentPCValue();
-    if (!sys::Memory::setRangeWritable(Addr, 16)) {
-      llvm_unreachable("ERROR: Unable to mark stub writable");
-    }
-    // Save LR so the callback can determine which stub called it.
-    // The compilation callback is responsible for popping this prior
-    // to returning.
-    JCE.emitWordLE(0xe92d4000); // push {lr}
-    // Set the return address to go back to the start of this stub.
-    JCE.emitWordLE(0xe24fe00c); // sub lr, pc, #12
-    // Invoke the compilation callback.
-    JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4]
-    // The address of the compilation callback.
-    JCE.emitWordLE((intptr_t)ARMCompilationCallback);
-    sys::Memory::InvalidateInstructionCache(Addr, 16);
-    if (!sys::Memory::setRangeExecutable(Addr, 16)) {
-      llvm_unreachable("ERROR: Unable to mark stub executable");
-    }
-  }
-
-  return Addr;
-}
-
-intptr_t ARMJITInfo::resolveRelocDestAddr(MachineRelocation *MR) const {
-  ARM::RelocationType RT = (ARM::RelocationType)MR->getRelocationType();
-  switch (RT) {
-  default:
-    return (intptr_t)(MR->getResultPointer());
-  case ARM::reloc_arm_pic_jt:
-    // Destination address - jump table base.
-    return (intptr_t)(MR->getResultPointer()) - MR->getConstantVal();
-  case ARM::reloc_arm_jt_base:
-    // Jump table base address.
-    return getJumpTableBaseAddr(MR->getJumpTableIndex());
-  case ARM::reloc_arm_cp_entry:
-  case ARM::reloc_arm_vfp_cp_entry:
-    // Constant pool entry address.
-    return getConstantPoolEntryAddr(MR->getConstantPoolIndex());
-  case ARM::reloc_arm_machine_cp_entry: {
-    ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)MR->getConstantVal();
-    assert((!ACPV->hasModifier() && !ACPV->mustAddCurrentAddress()) &&
-           "Can't handle this machine constant pool entry yet!");
-    intptr_t Addr = (intptr_t)(MR->getResultPointer());
-    Addr -= getPCLabelAddr(ACPV->getLabelId()) + ACPV->getPCAdjustment();
-    return Addr;
-  }
-  }
-}
-
-/// relocate - Before the JIT can run a block of code that has been emitted,
-/// it must rewrite the code to contain the actual addresses of any
-/// referenced global symbols.
-void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
-                          unsigned NumRelocs, unsigned char* GOTBase) {
-  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
-    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
-    intptr_t ResultPtr = resolveRelocDestAddr(MR);
-    switch ((ARM::RelocationType)MR->getRelocationType()) {
-    case ARM::reloc_arm_cp_entry:
-    case ARM::reloc_arm_vfp_cp_entry:
-    case ARM::reloc_arm_relative: {
-      // It is necessary to calculate the correct PC relative value. We
-      // subtract the base addr from the target addr to form a byte offset.
-      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
-      // If the result is positive, set bit U(23) to 1.
-      if (ResultPtr >= 0)
-        *((intptr_t*)RelocPos) |= 1 << ARMII::U_BitShift;
-      else {
-        // Otherwise, obtain the absolute value and set bit U(23) to 0.
-        *((intptr_t*)RelocPos) &= ~(1 << ARMII::U_BitShift);
-        ResultPtr = - ResultPtr;
-      }
-      // Set the immed value calculated.
-      // VFP immediate offset is multiplied by 4.
-      if (MR->getRelocationType() == ARM::reloc_arm_vfp_cp_entry)
-        ResultPtr = ResultPtr >> 2;
-      *((intptr_t*)RelocPos) |= ResultPtr;
-      // Set register Rn to PC (which is register 15 on all architectures).
-      // FIXME: This avoids the need for register info in the JIT class.
-      *((intptr_t*)RelocPos) |= 15 << ARMII::RegRnShift;
-      break;
-    }
-    case ARM::reloc_arm_pic_jt:
-    case ARM::reloc_arm_machine_cp_entry:
-    case ARM::reloc_arm_absolute: {
-      // These addresses have already been resolved.
-      *((intptr_t*)RelocPos) |= (intptr_t)ResultPtr;
-      break;
-    }
-    case ARM::reloc_arm_branch: {
-      // It is necessary to calculate the correct value of signed_immed_24
-      // field. We subtract the base addr from the target addr to form a
-      // byte offset, which must be inside the range -33554432 and +33554428.
-      // Then, we set the signed_immed_24 field of the instruction to bits
-      // [25:2] of the byte offset. More details ARM-ARM p. A4-11.
-      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
-      ResultPtr = (ResultPtr & 0x03FFFFFC) >> 2;
-      assert(ResultPtr >= -33554432 && ResultPtr <= 33554428);
-      *((intptr_t*)RelocPos) |= ResultPtr;
-      break;
-    }
-    case ARM::reloc_arm_jt_base: {
-      // JT base - (instruction addr + 8)
-      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
-      *((intptr_t*)RelocPos) |= ResultPtr;
-      break;
-    }
-    case ARM::reloc_arm_movw: {
-      ResultPtr = ResultPtr & 0xFFFF;
-      *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF;
-      *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16;
-      break;
-    }
-    case ARM::reloc_arm_movt: {
-      ResultPtr = (ResultPtr >> 16) & 0xFFFF;
-      *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF;
-      *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16;
-      break;
-    }
-    }
-  }
-}
-
-void ARMJITInfo::Initialize(const MachineFunction &MF, bool isPIC) {
-  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  ConstPoolId2AddrMap.resize(AFI->getNumPICLabels());
-  JumpTableId2AddrMap.resize(AFI->getNumJumpTables());
-  IsPIC = isPIC;
-}

diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h
deleted file mode 100644
index 27e2a20..0000000
--- a/lib/Target/ARM/ARMJITInfo.h
+++ /dev/null

@@ -1,177 +0,0 @@
-//===-- ARMJITInfo.h - ARM implementation of the JIT interface  -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the ARMJITInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARMJITINFO_H
-#define ARMJITINFO_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/Target/TargetJITInfo.h"
-
-namespace llvm {
-  class ARMTargetMachine;
-
-  class ARMJITInfo : public TargetJITInfo {
-    // ConstPoolId2AddrMap - A map from constant pool ids to the corresponding
-    // CONSTPOOL_ENTRY addresses.
-    SmallVector<intptr_t, 16> ConstPoolId2AddrMap;
-
-    // JumpTableId2AddrMap - A map from inline jumptable ids to the
-    // corresponding inline jump table bases.
-    SmallVector<intptr_t, 16> JumpTableId2AddrMap;
-
-    // PCLabelMap - A map from PC labels to addresses.
-    DenseMap<unsigned, intptr_t> PCLabelMap;
-
-    // Sym2IndirectSymMap - A map from symbol (GlobalValue and ExternalSymbol)
-    // addresses to their indirect symbol addresses.
-    DenseMap<void*, intptr_t> Sym2IndirectSymMap;
-
-    // IsPIC - True if the relocation model is PIC. This is used to determine
-    // how to codegen function stubs.
-    bool IsPIC;
-
-  public:
-    explicit ARMJITInfo() : IsPIC(false) { useGOT = false; }
-
-    /// replaceMachineCodeForFunction - Make it so that calling the function
-    /// whose machine code is at OLD turns into a call to NEW, perhaps by
-    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
-    /// code.
-    ///
-    void replaceMachineCodeForFunction(void *Old, void *New) override;
-
-    /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
-    /// to emit an indirect symbol which contains the address of the specified
-    /// ptr.
-    void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
-                                    JITCodeEmitter &JCE) override;
-
-    // getStubLayout - Returns the size and alignment of the largest call stub
-    // on ARM.
-    StubLayout getStubLayout() override;
-
-    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
-    /// small native function that simply calls the function at the specified
-    /// address.
-    void *emitFunctionStub(const Function* F, void *Fn,
-                           JITCodeEmitter &JCE) override;
-
-    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
-    LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
-
-    /// relocate - Before the JIT can run a block of code that has been emitted,
-    /// it must rewrite the code to contain the actual addresses of any
-    /// referenced global symbols.
-    void relocate(void *Function, MachineRelocation *MR,
-                  unsigned NumRelocs, unsigned char* GOTBase) override;
-
-    /// hasCustomConstantPool - Allows a target to specify that constant
-    /// pool address resolution is handled by the target.
-    bool hasCustomConstantPool() const override { return true; }
-
-    /// hasCustomJumpTables - Allows a target to specify that jumptables
-    /// are emitted by the target.
-    bool hasCustomJumpTables() const override { return true; }
-
-    /// allocateSeparateGVMemory - If true, globals should be placed in
-    /// separately allocated heap memory rather than in the same
-    /// code memory allocated by JITCodeEmitter.
-    bool allocateSeparateGVMemory() const override {
-#ifdef __APPLE__
-      return true;
-#else
-      return false;
-#endif
-    }
-
-    /// Initialize - Initialize internal stage for the function being JITted.
-    /// Resize constant pool ids to CONSTPOOL_ENTRY addresses map; resize
-    /// jump table ids to jump table bases map; remember if codegen relocation
-    /// model is PIC.
-    void Initialize(const MachineFunction &MF, bool isPIC);
-
-    /// getConstantPoolEntryAddr - The ARM target puts all constant
-    /// pool entries into constant islands. This returns the address of the
-    /// constant pool entry of the specified index.
-    intptr_t getConstantPoolEntryAddr(unsigned CPI) const {
-      assert(CPI < ConstPoolId2AddrMap.size());
-      return ConstPoolId2AddrMap[CPI];
-    }
-
-    /// addConstantPoolEntryAddr - Map a Constant Pool Index to the address
-    /// where its associated value is stored. When relocations are processed,
-    /// this value will be used to resolve references to the constant.
-    void addConstantPoolEntryAddr(unsigned CPI, intptr_t Addr) {
-      assert(CPI < ConstPoolId2AddrMap.size());
-      ConstPoolId2AddrMap[CPI] = Addr;
-    }
-
-    /// getJumpTableBaseAddr - The ARM target inline all jump tables within
-    /// text section of the function. This returns the address of the base of
-    /// the jump table of the specified index.
-    intptr_t getJumpTableBaseAddr(unsigned JTI) const {
-      assert(JTI < JumpTableId2AddrMap.size());
-      return JumpTableId2AddrMap[JTI];
-    }
-
-    /// addJumpTableBaseAddr - Map a jump table index to the address where
-    /// the corresponding inline jump table is emitted. When relocations are
-    /// processed, this value will be used to resolve references to the
-    /// jump table.
-    void addJumpTableBaseAddr(unsigned JTI, intptr_t Addr) {
-      assert(JTI < JumpTableId2AddrMap.size());
-      JumpTableId2AddrMap[JTI] = Addr;
-    }
-
-    /// getPCLabelAddr - Retrieve the address of the PC label of the
-    /// specified id.
-    intptr_t getPCLabelAddr(unsigned Id) const {
-      DenseMap<unsigned, intptr_t>::const_iterator I = PCLabelMap.find(Id);
-      assert(I != PCLabelMap.end());
-      return I->second;
-    }
-
-    /// addPCLabelAddr - Remember the address of the specified PC label.
-    void addPCLabelAddr(unsigned Id, intptr_t Addr) {
-      PCLabelMap.insert(std::make_pair(Id, Addr));
-    }
-
-    /// getIndirectSymAddr - Retrieve the address of the indirect symbol of the
-    /// specified symbol located at address. Returns 0 if the indirect symbol
-    /// has not been emitted.
-    intptr_t getIndirectSymAddr(void *Addr) const {
-      DenseMap<void*,intptr_t>::const_iterator I= Sym2IndirectSymMap.find(Addr);
-      if (I != Sym2IndirectSymMap.end())
-        return I->second;
-      return 0;
-    }
-
-    /// addIndirectSymAddr - Add a mapping from address of an emitted symbol to
-    /// its indirect symbol address.
-    void addIndirectSymAddr(void *SymAddr, intptr_t IndSymAddr) {
-      Sym2IndirectSymMap.insert(std::make_pair(SymAddr, IndSymAddr));
-    }
-
-  private:
-    /// resolveRelocDestAddr - Resolve the resulting address of the relocation
-    /// if it's not already solved. Constantpool entries must be resolved by
-    /// ARM target.
-    intptr_t resolveRelocDestAddr(MachineRelocation *MR) const;
-  };
-}
-
-#endif

diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index a03bcdb..c429ac1 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp

@@ -144,6 +144,46 @@
   char ARMLoadStoreOpt::ID = 0;
 }
 
+static bool definesCPSR(const MachineInstr *MI) {
+  for (const auto &MO : MI->operands()) {
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
+      // If the instruction has live CPSR def, then it's not safe to fold it
+      // into load / store.
+      return true;
+  }
+
+  return false;
+}
+
+static int getMemoryOpOffset(const MachineInstr *MI) {
+  int Opcode = MI->getOpcode();
+  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  unsigned OffField = MI->getOperand(NumOperands-3).getImm();
+
+  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
+      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
+      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
+    return OffField;
+
+  // Thumb1 immediate offsets are scaled by 4
+  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi)
+    return OffField * 4;
+
+  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
+    : ARM_AM::getAM5Offset(OffField) * 4;
+  ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField)
+    : ARM_AM::getAM5Op(OffField);
+
+  if (Op == ARM_AM::sub)
+    return -Offset;
+
+  return Offset;
+}
+
 static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
   switch (Opcode) {
   default: llvm_unreachable("Unhandled opcode!");
@@ -335,40 +375,50 @@
                                    unsigned WordOffset,
                                    ARMCC::CondCodes Pred, unsigned PredReg) {
   assert(isThumb1 && "Can only update base register uses for Thumb1!");
-
-  // Start updating any instructions with immediate offsets. Insert a sub before
+  // Start updating any instructions with immediate offsets. Insert a SUB before
   // the first non-updateable instruction (if any).
   for (; MBBI != MBB.end(); ++MBBI) {
-    if (MBBI->readsRegister(Base)) {
-      unsigned Opc = MBBI->getOpcode();
-      int Offset;
-      bool InsertSub = false;
+    bool InsertSub = false;
+    unsigned Opc = MBBI->getOpcode();
 
-      if (Opc == ARM::tLDRi  || Opc == ARM::tSTRi  ||
-          Opc == ARM::tLDRHi || Opc == ARM::tSTRHi ||
-          Opc == ARM::tLDRBi || Opc == ARM::tSTRBi) {
+    if (MBBI->readsRegister(Base)) {
+      int Offset;
+      bool IsLoad =
+        Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi;
+      bool IsStore =
+        Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi;
+
+      if (IsLoad || IsStore) {
         // Loads and stores with immediate offsets can be updated, but only if
         // the new offset isn't negative.
         // The MachineOperand containing the offset immediate is the last one
         // before predicates.
         MachineOperand &MO =
           MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
-        // The offsets are scaled by 1, 2 or 4 depending on the Opcode
+        // The offsets are scaled by 1, 2 or 4 depending on the Opcode.
         Offset = MO.getImm() - WordOffset * getImmScale(Opc);
-        if (Offset >= 0)
+
+        // If storing the base register, it needs to be reset first.
+        unsigned InstrSrcReg = MBBI->getOperand(0).getReg();
+
+        if (Offset >= 0 && !(IsStore && InstrSrcReg == Base))
           MO.setImm(Offset);
         else
           InsertSub = true;
 
-      } else if (Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) {
-        // SUB/ADD using this register. Merge it with the update.
-        // If the merged offset is too large, insert a new sub instead.
+      } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) &&
+                 !definesCPSR(MBBI)) {
+        // SUBS/ADDS using this register, with a dead def of the CPSR.
+        // Merge it with the update; if the merged offset is too large,
+        // insert a new sub instead.
         MachineOperand &MO =
           MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
         Offset = (Opc == ARM::tSUBi8) ?
           MO.getImm() + WordOffset * 4 :
           MO.getImm() - WordOffset * 4 ;
-        if (TL->isLegalAddImmediate(Offset)) {
+        if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) {
+          // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if
+          // Offset == 0.
           MO.setImm(Offset);
           // The base register has now been reset, so exit early.
           return;
@@ -381,13 +431,19 @@
         InsertSub = true;
       }
 
-      if (InsertSub) {
-        // An instruction above couldn't be updated, so insert a sub.
-        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base))
-          .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
-          .addImm(Pred).addReg(PredReg);
-        return;
-      }
+    } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) {
+      // Since SUBS sets the condition flags, we can't place the base reset
+      // after an instruction that has a live CPSR def.
+      // The base register might also contain an argument for a function call.
+      InsertSub = true;
+    }
+
+    if (InsertSub) {
+      // An instruction above couldn't be updated, so insert a sub.
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+        .addReg(Base, getKillRegState(false)).addImm(WordOffset * 4)
+        .addImm(Pred).addReg(PredReg);
+      return;
     }
 
     if (MBBI->killsRegister(Base))
@@ -395,15 +451,18 @@
       return;
   }
 
-  // The end of the block was reached. This means register liveness escapes the
-  // block, and it's necessary to insert a sub before the last instruction.
-  if (MBB.succ_size() > 0)
-    // But only insert the SUB if there is actually a successor block.
-    // FIXME: Check more carefully if register is live at this point, e.g. by
-    // also examining the successor block's register liveness information.
-    AddDefaultT1CC(BuildMI(MBB, --MBBI, dl, TII->get(ARM::tSUBi8), Base))
-      .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
+  // End of block was reached.
+  if (MBB.succ_size() > 0) {
+    // FIXME: Because of a bug, live registers are sometimes missing from
+    // the successor blocks' live-in sets. This means we can't trust that
+    // information and *always* have to reset at the end of a block.
+    // See PR21029.
+    if (MBBI != MBB.end()) --MBBI;
+    AddDefaultT1CC(
+      BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true)
+      .addReg(Base, getKillRegState(false)).addImm(WordOffset * 4)
       .addImm(Pred).addReg(PredReg);
+  }
 }
 
 /// MergeOps - Create and insert a LDM or STM with Base as base register and
@@ -422,6 +481,28 @@
   if (NumRegs <= 1)
     return false;
 
+  // For Thumb1 targets, it might be necessary to clobber the CPSR to merge.
+  // Compute liveness information for that register to make the decision.
+  bool SafeToClobberCPSR = !isThumb1 ||
+    (MBB.computeRegisterLiveness(TRI, ARM::CPSR, std::prev(MBBI), 15) ==
+     MachineBasicBlock::LQR_Dead);
+
+  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
+
+  // Exception: If the base register is in the input reglist, Thumb1 LDM is
+  // non-writeback.
+  // It's also not possible to merge an STR of the base register in Thumb1.
+  if (isThumb1)
+    for (unsigned I = 0; I < NumRegs; ++I)
+      if (Base == Regs[I].first) {
+        if (Opcode == ARM::tLDRi) {
+          Writeback = false;
+          break;
+        } else if (Opcode == ARM::tSTRi) {
+          return false;
+        }
+      }
+
   ARM_AM::AMSubMode Mode = ARM_AM::ia;
   // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
   bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
@@ -445,6 +526,11 @@
     if (NumRegs <= 2)
       return false;
 
+    // On Thumb1, it's not worth materializing a new base register without
+    // clobbering the CPSR (i.e. not using ADDS/SUBS).
+    if (!SafeToClobberCPSR)
+      return false;
+
     unsigned NewBase;
     if (isi32Load(Opcode)) {
       // If it is a load, then just use one of the destination register to
@@ -459,13 +545,15 @@
 
     int BaseOpc =
       isThumb2 ? ARM::t2ADDri :
+      (isThumb1 && Offset < 8) ? ARM::tADDi3 :
       isThumb1 ? ARM::tADDi8  : ARM::ADDri;
 
     if (Offset < 0) {
+      Offset = - Offset;
       BaseOpc =
         isThumb2 ? ARM::t2SUBri :
+        (isThumb1 && Offset < 8) ? ARM::tSUBi3 :
         isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
-      Offset = - Offset;
     }
 
     if (!TL->isLegalAddImmediate(Offset))
@@ -473,22 +561,28 @@
       return false; // Probably not worth it then.
 
     if (isThumb1) {
-      if (Base != NewBase) {
+      // Thumb1: depending on immediate size, use either
+      //   ADDS NewBase, Base, #imm3
+      // or
+      //   MOV  NewBase, Base
+      //   ADDS NewBase, #imm8.
+      if (Base != NewBase && Offset >= 8) {
         // Need to insert a MOV to the new base first.
-        // FIXME: If the immediate fits in 3 bits, use ADD instead.
         BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
           .addReg(Base, getKillRegState(BaseKill))
           .addImm(Pred).addReg(PredReg);
+        // Set up BaseKill and Base correctly to insert the ADDS/SUBS below.
+        Base = NewBase;
+        BaseKill = false;
       }
-      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase))
-        .addReg(NewBase, getKillRegState(true)).addImm(Offset)
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase), true)
+        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
         .addImm(Pred).addReg(PredReg);
     } else {
       BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
         .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
         .addImm(Pred).addReg(PredReg).addReg(0);
     }
-
     Base = NewBase;
     BaseKill = true; // New base is always killed straight away.
   }
@@ -501,16 +595,16 @@
   Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
   if (!Opcode) return false;
 
-  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
-
-  // Exception: If the base register is in the input reglist, Thumb1 LDM is
-  // non-writeback. Check for this.
-  if (Opcode == ARM::tLDMIA && isThumb1)
-    for (unsigned I = 0; I < NumRegs; ++I)
-      if (Base == Regs[I].first) {
-        Writeback = false;
-        break;
-      }
+  // Check if a Thumb1 LDM/STM merge is safe. This is the case if:
+  // - There is no writeback (LDM of base register),
+  // - the base register is killed by the merged instruction,
+  // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS
+  //   to reset the base register.
+  // Otherwise, don't merge.
+  // It's safe to return here since the code to materialize a new base register
+  // above is also conditional on SafeToClobberCPSR.
+  if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill)
+    return false;
 
   MachineInstrBuilder MIB;
 
@@ -525,11 +619,11 @@
     MIB.addReg(Base, getDefRegState(true))
        .addReg(Base, getKillRegState(BaseKill));
 
-    // The base isn't dead after a merged instruction with writeback. Update
-    // future uses of the base with the added offset (if possible), or reset
-    // the base register as necessary.
+    // The base isn't dead after a merged instruction with writeback.
+    // Insert a sub instruction after the newly formed instruction to reset.
     if (!BaseKill)
       UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
+
   } else {
     // No writeback, simply build the MachineInstr.
     MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
@@ -700,6 +794,11 @@
     memOps[i].MBBI = Merges.back();
     memOps[i].Position = insertPos;
   }
+
+  // Update memOps offsets, since they may have been modified by MergeOps.
+  for (auto &MemOp : memOps) {
+    MemOp.Offset = getMemoryOpOffset(MemOp.MBBI);
+  }
 }
 
 /// MergeLDR_STR - Merge a number of load / store instructions into one or more
@@ -721,7 +820,7 @@
   unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
   unsigned Count = 1;
   unsigned Limit = ~0U;
-
+  bool BaseKill = false;
   // vldm / vstm limit are 32 for S variants, 16 for D variants.
 
   switch (Opcode) {
@@ -760,36 +859,28 @@
       ++Count;
     } else {
       // Can't merge this in. Try merge the earlier ones first.
-      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset,
-                     Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges);
+      // We need to compute BaseKill here because the MemOps may have been
+      // reordered.
+      BaseKill = Loc->killsRegister(Base);
+
+      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, Base,
+                     BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
       MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                    MemOps, Merges);
       return;
     }
 
-    if (MemOps[i].Position > MemOps[insertAfter].Position)
+    if (MemOps[i].Position > MemOps[insertAfter].Position) {
       insertAfter = i;
+      Loc = MemOps[i].MBBI;
+    }
   }
 
-  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
+  BaseKill =  Loc->killsRegister(Base);
   MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                  Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
 }
 
-static bool definesCPSR(MachineInstr *MI) {
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg())
-      continue;
-    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
-      // If the instruction has live CPSR def, then it's not safe to fold it
-      // into load / store.
-      return true;
-  }
-
-  return false;
-}
-
 static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                 unsigned Bytes, unsigned Limit,
                                 ARMCC::CondCodes Pred, unsigned PredReg) {
@@ -1327,34 +1418,6 @@
     RS->forward(std::prev(Loc));
 }
 
-static int getMemoryOpOffset(const MachineInstr *MI) {
-  int Opcode = MI->getOpcode();
-  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
-  unsigned NumOperands = MI->getDesc().getNumOperands();
-  unsigned OffField = MI->getOperand(NumOperands-3).getImm();
-
-  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
-      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
-      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
-      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
-    return OffField;
-
-  // Thumb1 immediate offsets are scaled by 4
-  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi)
-    return OffField * 4;
-
-  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
-    : ARM_AM::getAM5Offset(OffField) * 4;
-  if (isAM3) {
-    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
-      Offset = -Offset;
-  } else {
-    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
-      Offset = -Offset;
-  }
-  return Offset;
-}
-
 static void InsertLDR_STR(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator &MBBI,
                           int Offset, bool isDef,
@@ -1725,21 +1788,15 @@
 
 bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   const TargetMachine &TM = Fn.getTarget();
-  TL = TM.getTargetLowering();
+  TL = TM.getSubtargetImpl()->getTargetLowering();
   AFI = Fn.getInfo<ARMFunctionInfo>();
-  TII = TM.getInstrInfo();
-  TRI = TM.getRegisterInfo();
+  TII = TM.getSubtargetImpl()->getInstrInfo();
+  TRI = TM.getSubtargetImpl()->getRegisterInfo();
   STI = &TM.getSubtarget<ARMSubtarget>();
   RS = new RegScavenger();
   isThumb2 = AFI->isThumb2Function();
   isThumb1 = AFI->isThumbFunction() && !isThumb2;
 
-  // FIXME: Temporarily disabling for Thumb-1 due to miscompiles
-  if (isThumb1) {
-    delete RS;
-    return false;
-  }
-
   bool Modified = false;
   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
        ++MFI) {
@@ -1793,10 +1850,10 @@
 }
 
 bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
-  TD  = Fn.getTarget().getDataLayout();
-  TII = Fn.getTarget().getInstrInfo();
-  TRI = Fn.getTarget().getRegisterInfo();
-  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
+  TD = Fn.getSubtarget().getDataLayout();
+  TII = Fn.getSubtarget().getInstrInfo();
+  TRI = Fn.getSubtarget().getRegisterInfo();
+  STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
   MRI = &Fn.getRegInfo();
   MF  = &Fn;
 
@@ -1811,7 +1868,7 @@
 static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                       MachineBasicBlock::iterator I,
                                       MachineBasicBlock::iterator E,
-                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
+                                      SmallPtrSetImpl<MachineInstr*> &MemOps,
                                       SmallSet<unsigned, 4> &MemRegs,
                                       const TargetRegisterInfo *TRI) {
   // Are there stores / loads / calls between them?

diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 44a9e34..4e67fa1 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h

@@ -11,14 +11,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMMACHINEFUNCTIONINFO_H
-#define ARMMACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
 
 #include "ARMSubtarget.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
 
 namespace llvm {
 
@@ -47,6 +48,9 @@
   ///
   unsigned ArgRegsSaveSize;
 
+  /// ReturnRegsCount - Number of registers used up in the return.
+  unsigned ReturnRegsCount;
+
   /// HasStackFrame - True if this function has a stack frame. Set by
   /// processFunctionBeforeCalleeSavedScan().
   bool HasStackFrame;
@@ -82,6 +86,7 @@
   /// areas.
   unsigned GPRCS1Size;
   unsigned GPRCS2Size;
+  unsigned DPRCSAlignGapSize;
   unsigned DPRCSSize;
 
   /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
@@ -118,14 +123,19 @@
   /// being passed on the stack
   unsigned ArgumentStackSize;
 
+  /// CoalescedWeights - mapping of basic blocks to the rolling counter of
+  /// coalesced weights.
+  DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
+
 public:
   ARMFunctionInfo() :
     isThumb(false),
     hasThumb2(false),
-    ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false),
+    ArgRegsSaveSize(0), ReturnRegsCount(0), HasStackFrame(false),
+    RestoreSPFromFP(false),
     LRSpilledForFarJump(false),
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
-    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0),
     NumAlignedDPRCS2Regs(0),
     JumpTableUId(0), PICLabelUId(0),
     VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
@@ -146,6 +156,9 @@
   }
   void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; }
 
+  unsigned getReturnRegsCount() const { return ReturnRegsCount; }
+  void setReturnRegsCount(unsigned s) { ReturnRegsCount = s; }
+
   bool hasStackFrame() const { return HasStackFrame; }
   void setHasStackFrame(bool s) { HasStackFrame = s; }
 
@@ -171,10 +184,12 @@
 
   unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
   unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+  unsigned getDPRCalleeSavedGapSize() const   { return DPRCSAlignGapSize; }
   unsigned getDPRCalleeSavedAreaSize()  const { return DPRCSSize; }
 
   void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
   void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+  void setDPRCalleeSavedGapSize(unsigned s)   { DPRCSAlignGapSize = s; }
   void setDPRCalleeSavedAreaSize(unsigned s)  { DPRCSSize = s; }
 
   unsigned getArgumentStackSize() const { return ArgumentStackSize; }
@@ -221,7 +236,16 @@
     else
       return -1U;
   }
+
+  DenseMap<const MachineBasicBlock*, unsigned>::iterator getCoalescedWeight(
+                                                  MachineBasicBlock* MBB) {
+    auto It = CoalescedWeights.find(MBB);
+    if (It == CoalescedWeights.end()) {
+      It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first;
+    }
+    return It;
+  }
 };
 } // End llvm namespace
 
-#endif // ARMMACHINEFUNCTIONINFO_H
+#endif

diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h
index efa22fb..3ff0bee 100644
--- a/lib/Target/ARM/ARMPerfectShuffle.h
+++ b/lib/Target/ARM/ARMPerfectShuffle.h

@@ -12,6 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIB_TARGET_ARM_ARMPERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_ARM_ARMPERFECTSHUFFLE_H
+
 // 31 entries have cost 0
 // 242 entries have cost 1
 // 1447 entries have cost 2
@@ -6584,3 +6587,5 @@
   835584U, // <u,u,u,u>: Cost 0 copy LHS
   0
 };
+
+#endif

diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
index 3e6af3f..b623173 100644
--- a/lib/Target/ARM/ARMRegisterInfo.h
+++ b/lib/Target/ARM/ARMRegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMREGISTERINFO_H
-#define ARMREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMREGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMREGISTERINFO_H
 
 #include "ARMBaseRegisterInfo.h"
 

diff --git a/lib/Target/ARM/ARMRelocations.h b/lib/Target/ARM/ARMRelocations.h
deleted file mode 100644
index 21877fd..0000000
--- a/lib/Target/ARM/ARMRelocations.h
+++ /dev/null

@@ -1,62 +0,0 @@
-//===-- ARMRelocations.h - ARM Code Relocations -----------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the ARM target-specific relocation types.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARMRELOCATIONS_H
-#define ARMRELOCATIONS_H
-
-#include "llvm/CodeGen/MachineRelocation.h"
-
-namespace llvm {
-  namespace ARM {
-    enum RelocationType {
-      // reloc_arm_absolute - Absolute relocation, just add the relocated value
-      // to the value already in memory.
-      reloc_arm_absolute,
-
-      // reloc_arm_relative - PC relative relocation, add the relocated value to
-      // the value already in memory, after we adjust it for where the PC is.
-      reloc_arm_relative,
-
-      // reloc_arm_cp_entry - PC relative relocation for constpool_entry's whose
-      // addresses are kept locally in a map.
-      reloc_arm_cp_entry,
-
-      // reloc_arm_vfp_cp_entry - Same as reloc_arm_cp_entry except the offset
-      // should be divided by 4.
-      reloc_arm_vfp_cp_entry,
-
-      // reloc_arm_machine_cp_entry - Relocation of a ARM machine constantpool
-      // entry.
-      reloc_arm_machine_cp_entry,
-
-      // reloc_arm_jt_base - PC relative relocation for jump tables whose
-      // addresses are kept locally in a map.
-      reloc_arm_jt_base,
-
-      // reloc_arm_pic_jt - PIC jump table entry relocation: dest bb - jt base.
-      reloc_arm_pic_jt,
-
-      // reloc_arm_branch - Branch address relocation.
-      reloc_arm_branch,
-
-      // reloc_arm_movt  - MOVT immediate relocation.
-      reloc_arm_movt,
-
-      // reloc_arm_movw  - MOVW immediate relocation.
-      reloc_arm_movw
-    };
-  }
-}
-
-#endif
-

diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 3dcc0df..fa30ac3 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp

@@ -157,7 +157,7 @@
     return SDValue();
 
   const ARMTargetLowering &TLI =
-    *static_cast<const ARMTargetLowering*>(DAG.getTarget().getTargetLowering());
+      *DAG.getTarget().getSubtarget<ARMSubtarget>().getTargetLowering();
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
 

diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 13769dc..94b98e6 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMSELECTIONDAGINFO_H
-#define ARMSELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_ARM_ARMSELECTIONDAGINFO_H
 
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/Target/TargetSelectionDAGInfo.h"

diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 0eb24ef..600f39d 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp

@@ -15,9 +15,9 @@
 #include "ARMFrameLowering.h"
 #include "ARMISelLowering.h"
 #include "ARMInstrInfo.h"
-#include "ARMJITInfo.h"
 #include "ARMSelectionDAGInfo.h"
 #include "ARMSubtarget.h"
+#include "ARMMachineFunctionInfo.h"
 #include "Thumb1FrameLowering.h"
 #include "Thumb1InstrInfo.h"
 #include "Thumb2InstrInfo.h"
@@ -27,6 +27,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 
 using namespace llvm;
 
@@ -47,11 +49,13 @@
 UseFusedMulOps("arm-use-mulops",
                cl::init(true), cl::Hidden);
 
+namespace {
 enum AlignMode {
   DefaultAlign,
   StrictAlign,
   NoStrictAlign
 };
+}
 
 static cl::opt<AlignMode>
 Align(cl::desc("Load/store alignment support"),
@@ -98,11 +102,6 @@
   // Pointers are 32 bits and aligned to 32 bits.
   Ret += "-p:32:32";
 
-  // On thumb, i16,i18 and i1 have natural aligment requirements, but we try to
-  // align to 32.
-  if (ST.isThumb())
-    Ret += "-i1:8:32-i8:8:32-i16:16:32";
-
   // ABIs other than APCS have 64 bit integers with natural alignment.
   if (!ST.isAPCS_ABI())
     Ret += "-i64:64";
@@ -119,10 +118,9 @@
   else
     Ret += "-v128:64:128";
 
-  // On thumb and APCS, only try to align aggregates to 32 bits (the default is
-  // 64 bits).
-  if (ST.isThumb() || ST.isAPCS_ABI())
-    Ret += "-a:0:32";
+  // Try to align aggregates to 32 bits (the default is 64 bits, which has no
+  // particular hardware support on 32-bit ARM).
+  Ret += "-a:0:32";
 
   // Integer registers are 32 bits.
   Ret += "-n32";
@@ -144,18 +142,18 @@
 ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                             StringRef FS) {
   initializeEnvironment();
-  resetSubtargetFeatures(CPU, FS);
+  initSubtargetFeatures(CPU, FS);
   return *this;
 }
 
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
-                           const std::string &FS, TargetMachine &TM,
-                           bool IsLittle, const TargetOptions &Options)
+                           const std::string &FS, const TargetMachine &TM,
+                           bool IsLittle)
     : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
       ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle),
-      TargetTriple(TT), Options(Options), TargetABI(ARM_ABI_UNKNOWN),
+      TargetTriple(TT), Options(TM.Options), TargetABI(ARM_ABI_UNKNOWN),
       DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS))),
-      TSInfo(DL), JITInfo(),
+      TSInfo(DL),
       InstrInfo(isThumb1Only()
                     ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this)
                     : !isThumb()
@@ -188,7 +186,6 @@
   InThumbMode = false;
   HasThumb2 = false;
   NoARM = false;
-  PostRAScheduler = false;
   IsR9Reserved = ReserveR9;
   UseMovt = false;
   SupportsTailCall = false;
@@ -217,23 +214,7 @@
   UseLong64 = false;
 }
 
-void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
-  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
-  Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
-                                           "target-cpu");
-  Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
-                                          "target-features");
-  std::string CPU =
-    !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : "";
-  std::string FS =
-    !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
-  if (!FS.empty()) {
-    initializeEnvironment();
-    resetSubtargetFeatures(CPU, FS);
-  }
-}
-
-void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
+void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   if (CPUString.empty()) {
     if (isTargetIOS() && TargetTriple.getArchName().endswith("v7s"))
       // Default to the Swift CPU when targeting armv7s/thumbv7s.
@@ -275,9 +256,8 @@
       TargetABI = ARM_ABI_AAPCS;
       break;
     default:
-      if ((isTargetIOS() && isMClass()) ||
-          (TargetTriple.isOSBinFormatMachO() &&
-           TargetTriple.getOS() == Triple::UnknownOS))
+      if (TargetTriple.isOSBinFormatMachO() &&
+          TargetTriple.getOS() == Triple::UnknownOS)
         TargetABI = ARM_ABI_AAPCS;
       else
         TargetABI = ARM_ABI_APCS;
@@ -299,49 +279,39 @@
   UseMovt = hasV6T2Ops() && ArmUseMOVT;
 
   if (isTargetMachO()) {
-    IsR9Reserved = ReserveR9 | !HasV6Ops;
+    IsR9Reserved = ReserveR9 || !HasV6Ops;
     SupportsTailCall = !isTargetIOS() || !getTargetTriple().isOSVersionLT(5, 0);
   } else {
     IsR9Reserved = ReserveR9;
     SupportsTailCall = !isThumb1Only();
   }
 
-  if (!isThumb() || hasThumb2())
-    PostRAScheduler = true;
-
-  switch (Align) {
-    case DefaultAlign:
-      // Assume pre-ARMv6 doesn't support unaligned accesses.
-      //
-      // ARMv6 may or may not support unaligned accesses depending on the
-      // SCTLR.U bit, which is architecture-specific. We assume ARMv6
-      // Darwin and NetBSD targets support unaligned accesses, and others don't.
-      //
-      // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit
-      // which raises an alignment fault on unaligned accesses. Linux
-      // defaults this bit to 0 and handles it as a system-wide (not
-      // per-process) setting. It is therefore safe to assume that ARMv7+
-      // Linux targets support unaligned accesses. The same goes for NaCl.
-      //
-      // The above behavior is consistent with GCC.
-      AllowsUnalignedMem =
-          (hasV7Ops() && (isTargetLinux() || isTargetNaCl() ||
-                          isTargetNetBSD())) ||
-          (hasV6Ops() && (isTargetMachO() || isTargetNetBSD()));
-      // The one exception is cortex-m0, which despite being v6, does not
-      // support unaligned accesses. Rather than make the above boolean
-      // expression even more obtuse, just override the value here.
-      if (isThumb1Only() && isMClass())
-        AllowsUnalignedMem = false;
-      break;
-    case StrictAlign:
-      AllowsUnalignedMem = false;
-      break;
-    case NoStrictAlign:
-      AllowsUnalignedMem = true;
-      break;
+  if (Align == DefaultAlign) {
+    // Assume pre-ARMv6 doesn't support unaligned accesses.
+    //
+    // ARMv6 may or may not support unaligned accesses depending on the
+    // SCTLR.U bit, which is architecture-specific. We assume ARMv6
+    // Darwin and NetBSD targets support unaligned accesses, and others don't.
+    //
+    // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit
+    // which raises an alignment fault on unaligned accesses. Linux
+    // defaults this bit to 0 and handles it as a system-wide (not
+    // per-process) setting. It is therefore safe to assume that ARMv7+
+    // Linux targets support unaligned accesses. The same goes for NaCl.
+    //
+    // The above behavior is consistent with GCC.
+    AllowsUnalignedMem =
+      (hasV7Ops() && (isTargetLinux() || isTargetNaCl() ||
+                      isTargetNetBSD())) ||
+      (hasV6Ops() && (isTargetMachO() || isTargetNetBSD()));
+  } else {
+    AllowsUnalignedMem = !(Align == StrictAlign);
   }
 
+  // No v6M core supports unaligned memory access (v6M ARM ARM A3.2)
+  if (isV6M())
+    AllowsUnalignedMem = false;
+
   switch (IT) {
   case DefaultIT:
     RestrictIT = hasV8Ops() ? true : false;
@@ -368,11 +338,7 @@
   if (RelocM == Reloc::Static)
     return false;
 
-  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
-  // load from stub.
-  bool isDecl = GV->hasAvailableExternallyLinkage();
-  if (GV->isDeclaration() && !GV->isMaterializable())
-    isDecl = true;
+  bool isDecl = GV->isDeclarationForLinker();
 
   if (!isTargetMachO()) {
     // Extra load is needed for all externally visible.
@@ -415,33 +381,22 @@
 }
 
 unsigned ARMSubtarget::getMispredictionPenalty() const {
-  return SchedModel->MispredictPenalty;
+  return SchedModel.MispredictPenalty;
 }
 
 bool ARMSubtarget::hasSinCos() const {
-  return getTargetTriple().getOS() == Triple::IOS &&
-    !getTargetTriple().isOSVersionLT(7, 0);
+  return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0);
 }
 
-// Enable the PostMachineScheduler if the target selects it instead of
-// PostRAScheduler. Currently only available on the command line via
-// -misched-postra.
+// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
 bool ARMSubtarget::enablePostMachineScheduler() const {
-  return PostRAScheduler;
+  return (!isThumb() || hasThumb2());
 }
 
-bool ARMSubtarget::enableAtomicExpandLoadLinked() const {
+bool ARMSubtarget::enableAtomicExpand() const {
   return hasAnyDataBarrier() && !isThumb1Only();
 }
 
-bool ARMSubtarget::enablePostRAScheduler(
-           CodeGenOpt::Level OptLevel,
-           TargetSubtargetInfo::AntiDepBreakMode& Mode,
-           RegClassVector& CriticalPathRCs) const {
-  Mode = TargetSubtargetInfo::ANTIDEP_NONE;
-  return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
-}
-
 bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
   // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
   // immediates as it is inherently position independent, and may be out of

diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 8f6c165..d5ee009 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h

@@ -11,20 +11,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMSUBTARGET_H
-#define ARMSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
+#define LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
 
 
 #include "ARMFrameLowering.h"
 #include "ARMISelLowering.h"
 #include "ARMInstrInfo.h"
-#include "ARMJITInfo.h"
 #include "ARMSelectionDAGInfo.h"
 #include "ARMSubtarget.h"
 #include "Thumb1FrameLowering.h"
 #include "Thumb1InstrInfo.h"
 #include "Thumb2InstrInfo.h"
-#include "ARMJITInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/DataLayout.h"
@@ -44,7 +42,7 @@
 protected:
   enum ARMProcFamilyEnum {
     Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
-    CortexR5, Swift, CortexA53, CortexA57, Krait
+    CortexA17, CortexR5, Swift, CortexA53, CortexA57, Krait, 
   };
   enum ARMProcClassEnum {
     None, AClass, RClass, MClass
@@ -105,9 +103,6 @@
   /// NoARM - True if subtarget does not support ARM mode execution.
   bool NoARM;
 
-  /// PostRAScheduler - True if using post-register-allocation scheduler.
-  bool PostRAScheduler;
-
   /// IsR9Reserved - True if R9 is a not available as general purpose register.
   bool IsR9Reserved;
 
@@ -191,7 +186,7 @@
 
   /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
   /// accesses for some types.  For details, see
-  /// ARMTargetLowering::allowsUnalignedMemoryAccesses().
+  /// ARMTargetLowering::allowsMisalignedMemoryAccesses().
   bool AllowsUnalignedMem;
 
   /// RestrictIT - If true, the subtarget disallows generation of deprecated IT
@@ -225,7 +220,7 @@
   Triple TargetTriple;
 
   /// SchedModel - Processor specific instruction costs.
-  const MCSchedModel *SchedModel;
+  MCSchedModel SchedModel;
 
   /// Selected instruction itineraries (one entry per itinerary class.)
   InstrItineraryData InstrItins;
@@ -244,8 +239,7 @@
   /// of the specified triple.
   ///
   ARMSubtarget(const std::string &TT, const std::string &CPU,
-               const std::string &FS, TargetMachine &TM, bool IsLittle,
-               const TargetOptions &Options);
+               const std::string &FS, const TargetMachine &TM, bool IsLittle);
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
@@ -256,27 +250,30 @@
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  /// \brief Reset the features for the ARM target.
-  void resetSubtargetFeatures(const MachineFunction *MF) override;
-
   /// initializeSubtargetDependencies - Initializes using a CPU and feature string
   /// so that we can use initializer lists for subtarget initialization.
   ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
 
-  const DataLayout *getDataLayout() const { return &DL; }
-  const ARMSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
-  ARMJITInfo *getJITInfo() { return &JITInfo; }
-  const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo.get(); }
-  const ARMTargetLowering *getTargetLowering() const { return &TLInfo; }
-  const ARMFrameLowering *getFrameLowering() const { return FrameLowering.get(); }
-  const ARMBaseRegisterInfo *getRegisterInfo() const {
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const ARMBaseInstrInfo *getInstrInfo() const override {
+    return InstrInfo.get();
+  }
+  const ARMTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const ARMFrameLowering *getFrameLowering() const override {
+    return FrameLowering.get();
+  }
+  const ARMBaseRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo->getRegisterInfo();
   }
 
 private:
   const DataLayout DL;
   ARMSelectionDAGInfo TSInfo;
-  ARMJITInfo JITInfo;
   // Either Thumb1InstrInfo or Thumb2InstrInfo.
   std::unique_ptr<ARMBaseInstrInfo> InstrInfo;
   ARMTargetLowering   TLInfo;
@@ -284,7 +281,7 @@
   std::unique_ptr<ARMFrameLowering> FrameLowering;
 
   void initializeEnvironment();
-  void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+  void initSubtargetFeatures(StringRef CPU, StringRef FS);
 public:
   void computeIssueWidth();
 
@@ -411,6 +408,10 @@
   bool isRClass() const { return ARMProcClass == RClass; }
   bool isAClass() const { return ARMProcClass == AClass; }
 
+  bool isV6M() const {
+    return isThumb1Only() && isMClass();
+  }
+
   bool isR9Reserved() const { return IsR9Reserved; }
 
   bool useMovt(const MachineFunction &MF) const;
@@ -432,19 +433,16 @@
   bool hasSinCos() const;
 
   /// True for some subtargets at > -O0.
-  bool enablePostMachineScheduler() const;
+  bool enablePostMachineScheduler() const override;
 
-  /// enablePostRAScheduler - True at 'More' optimization.
-  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
-                             RegClassVector& CriticalPathRCs) const override;
+  // enableAtomicExpand - True if we need to expand our atomics.
+  bool enableAtomicExpand() const override;
 
-  // enableAtomicExpandLoadLinked - True if we need to expand our atomics.
-  bool enableAtomicExpandLoadLinked() const override;
-
-  /// getInstrItins - Return the instruction itineraies based on subtarget
+  /// getInstrItineraryData - Return instruction itineraries based on subtarget
   /// selection.
-  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
 
   /// getStackAlignment - Returns the minimum alignment known to hold of the
   /// stack frame on entry to the function and which must be maintained by every
@@ -454,6 +452,7 @@
   /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
   /// symbol.
   bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+
 };
 } // End llvm namespace
 

diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index d85194b..88d6c5e 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp

@@ -13,7 +13,9 @@
 #include "ARM.h"
 #include "ARMTargetMachine.h"
 #include "ARMFrameLowering.h"
+#include "ARMTargetObjectFile.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
@@ -42,6 +44,13 @@
   RegisterTargetMachine<ThumbBETargetMachine> B(TheThumbBETarget);
 }
 
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO())
+    return make_unique<TargetLoweringObjectFileMachO>();
+  if (TT.isOSWindows())
+    return make_unique<TargetLoweringObjectFileCOFF>();
+  return make_unique<ARMElfTargetObjectFile>();
+}
 
 /// TargetMachine ctor - Create an ARM architecture model.
 ///
@@ -51,7 +60,8 @@
                                            Reloc::Model RM, CodeModel::Model CM,
                                            CodeGenOpt::Level OL, bool isLittle)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-      Subtarget(TT, CPU, FS, *this, isLittle, Options) {
+      TLOF(createTLOF(Triple(getTargetTriple()))),
+      Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) {
 
   // Default to triple-appropriate float ABI
   if (Options.FloatABIType == FloatABI::Default)
@@ -59,6 +69,46 @@
         Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft;
 }
 
+ARMBaseTargetMachine::~ARMBaseTargetMachine() {}
+
+const ARMSubtarget *
+ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
+  AttributeSet FnAttrs = F.getAttributes();
+  Attribute CPUAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+  Attribute FSAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  // FIXME: This is related to the code below to reset the target options;
+  // we need to know whether or not the soft float flag is set on the
+  // function before we can generate a subtarget. We also need to use
+  // it as a key for the subtarget since that can be the only difference
+  // between two functions.
+  Attribute SFAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
+  bool SoftFloat = !SFAttr.hasAttribute(Attribute::None)
+                       ? SFAttr.getValueAsString() == "true"
+                       : Options.UseSoftFloat;
+
+  auto &I = SubtargetMap[CPU + FS + (SoftFloat ? "use-soft-float=true"
+                                               : "use-soft-float=false")];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle);
+  }
+  return I.get();
+}
+
 void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
   // Add first the target-independent BasicTTI pass, then our ARM pass. This
   // allows the ARM pass to delegate to the target independent layer when
@@ -158,7 +208,10 @@
 }
 
 void ARMPassConfig::addIRPasses() {
-  addPass(createAtomicExpandLoadLinkedPass(TM));
+  if (TM->Options.ThreadModel == ThreadModel::Single)
+    addPass(createLowerAtomicPass());
+  else
+    addPass(createAtomicExpandPass(TM));
 
   // Cmpxchg instructions are often used with a subsequent comparison to
   // determine whether it succeeded. We can exploit existing control-flow in
@@ -244,10 +297,3 @@
 
   return true;
 }
-
-bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
-                                          JITCodeEmitter &JCE) {
-  // Machine code emitter pass for ARM.
-  PM.add(createARMJITCodeEmitterPass(*this, JCE));
-  return false;
-}

diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index b72b1df..fba0ec2 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMTARGETMACHINE_H
-#define ARMTARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
+#define LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
 
 #include "ARMInstrInfo.h"
 #include "ARMSubtarget.h"
@@ -23,7 +23,11 @@
 
 class ARMBaseTargetMachine : public LLVMTargetMachine {
 protected:
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   ARMSubtarget        Subtarget;
+  bool isLittle;
+  mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap;
+
 public:
   ARMBaseTargetMachine(const Target &T, StringRef TT,
                        StringRef CPU, StringRef FS,
@@ -31,30 +35,10 @@
                        Reloc::Model RM, CodeModel::Model CM,
                        CodeGenOpt::Level OL,
                        bool isLittle);
+  ~ARMBaseTargetMachine() override;
 
   const ARMSubtarget *getSubtargetImpl() const override { return &Subtarget; }
-  const ARMBaseRegisterInfo *getRegisterInfo() const override {
-    return getSubtargetImpl()->getRegisterInfo();
-  }
-  const ARMTargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
-  const ARMBaseInstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const ARMFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  const InstrItineraryData *getInstrItineraryData() const override {
-    return &getSubtargetImpl()->getInstrItineraryData();
-  }
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
-  ARMJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
+  const ARMSubtarget *getSubtargetImpl(const Function &F) const override;
 
   /// \brief Register ARM analysis passes with a pass manager.
   void addAnalysisPasses(PassManagerBase &PM) override;
@@ -62,7 +46,9 @@
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &MCE) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 };
 
 /// ARMTargetMachine - ARM target machine.

diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index c926421..98e8763 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_ARM_TARGETOBJECTFILE_H
-#define LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 

diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index a2ace62..ec834e8 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp

@@ -49,7 +49,7 @@
 
   ARMTTI(const ARMBaseTargetMachine *TM)
       : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
-        TLI(TM->getTargetLowering()) {
+        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
     initializeARMTTIPass(*PassRegistry::getPassRegistry());
   }
 
@@ -104,7 +104,7 @@
     return 32;
   }
 
-  unsigned getMaximumUnrollFactor() const override {
+  unsigned getMaxInterleaveFactor() const override {
     // These are out of order CPUs:
     if (ST->isCortexA15() || ST->isSwift())
       return 2;
@@ -126,10 +126,11 @@
   unsigned getAddressComputationCost(Type *Val,
                                      bool IsComplex) const override;
 
-  unsigned
-  getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                         OperandValueKind Op1Info = OK_AnyValue,
-                         OperandValueKind Op2Info = OK_AnyValue) const override;
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty, OperandValueKind Op1Info = OK_AnyValue,
+      OperandValueKind Op2Info = OK_AnyValue,
+      OperandValueProperties Opd1PropInfo = OP_None,
+      OperandValueProperties Opd2PropInfo = OP_None) const override;
 
   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                            unsigned AddressSpace) const override;
@@ -389,6 +390,13 @@
       ValTy->getScalarSizeInBits() <= 32)
     return 3;
 
+  // Cross-class copies are expensive on many microarchitectures,
+  // so assume they are expensive by default.
+  if ((Opcode == Instruction::InsertElement ||
+       Opcode == Instruction::ExtractElement) &&
+      ValTy->getVectorElementType()->isIntegerTy())
+    return 3;
+
   return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
 }
 
@@ -497,9 +505,10 @@
   return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
-unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                        OperandValueKind Op1Info,
-                                        OperandValueKind Op2Info) const {
+unsigned ARMTTI::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
+    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
+    OperandValueProperties Opd2PropInfo) const {
 
   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
@@ -555,8 +564,8 @@
   if (Idx != -1)
     return LT.first * CostTbl[Idx].Cost;
 
-  unsigned Cost =
-      TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+  unsigned Cost = TargetTransformInfo::getArithmeticInstrCost(
+      Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
 
   // This is somewhat of a hack. The problem that we are facing is that SROA
   // creates a sequence of shift, and, or instructions to construct values.

diff --git a/lib/Target/ARM/Android.mk b/lib/Target/ARM/Android.mk
index 095955b..55a5775 100644
--- a/lib/Target/ARM/Android.mk
+++ b/lib/Target/ARM/Android.mk

@@ -19,7 +19,6 @@
   ARMAsmPrinter.cpp \
   ARMBaseInstrInfo.cpp \
   ARMBaseRegisterInfo.cpp \
-  ARMCodeEmitter.cpp \
   ARMConstantIslandPass.cpp \
   ARMConstantPoolValue.cpp \
   ARMExpandPseudoInsts.cpp \
@@ -29,7 +28,6 @@
   ARMISelDAGToDAG.cpp \
   ARMISelLowering.cpp \
   ARMInstrInfo.cpp \
-  ARMJITInfo.cpp \
   ARMLoadStoreOptimizer.cpp \
   ARMMCInstLower.cpp \
   ARMMachineFunctionInfo.cpp \

diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index b62706c..9cc89bd 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp

@@ -129,12 +129,13 @@
 
 class ARMAsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
-  MCAsmParser &Parser;
   const MCInstrInfo &MII;
   const MCRegisterInfo *MRI;
   UnwindContext UC;
 
   ARMTargetStreamer &getTargetStreamer() {
+    assert(getParser().getStreamer().getTargetStreamer() &&
+           "do not have a target streamer");
     MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
     return static_cast<ARMTargetStreamer &>(TS);
   }
@@ -173,20 +174,16 @@
       ITState.CurPosition = ~0U; // Done with the IT block after this.
   }
 
-
-  MCAsmParser &getParser() const { return Parser; }
-  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
   void Note(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges = None) {
-    return Parser.Note(L, Msg, Ranges);
+    return getParser().Note(L, Msg, Ranges);
   }
   bool Warning(SMLoc L, const Twine &Msg,
                ArrayRef<SMRange> Ranges = None) {
-    return Parser.Warning(L, Msg, Ranges);
+    return getParser().Warning(L, Msg, Ranges);
   }
   bool Error(SMLoc L, const Twine &Msg,
              ArrayRef<SMRange> Ranges = None) {
-    return Parser.Error(L, Msg, Ranges);
+    return getParser().Error(L, Msg, Ranges);
   }
 
   int tryParseRegister();
@@ -265,9 +262,15 @@
   bool hasARM() const {
     return !(STI.getFeatureBits() & ARM::FeatureNoARM);
   }
+  bool hasThumb2DSP() const {
+    return STI.getFeatureBits() & ARM::FeatureDSPThumb2;
+  }
+  bool hasD16() const {
+    return STI.getFeatureBits() & ARM::FeatureD16;
+  }
 
   void SwitchMode() {
-    unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
+    uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
     setAvailableFeatures(FB);
   }
   bool isMClass() const {
@@ -290,6 +293,7 @@
   OperandMatchResultTy parseInstSyncBarrierOptOperand(OperandVector &);
   OperandMatchResultTy parseProcIFlagsOperand(OperandVector &);
   OperandMatchResultTy parseMSRMaskOperand(OperandVector &);
+  OperandMatchResultTy parseBankedRegOperand(OperandVector &);
   OperandMatchResultTy parsePKHImm(OperandVector &O, StringRef Op, int Low,
                                    int High);
   OperandMatchResultTy parsePKHLSLImm(OperandVector &O) {
@@ -329,10 +333,9 @@
 
   };
 
-  ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
-               const MCInstrInfo &MII,
-               const MCTargetOptions &Options)
-      : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(MII), UC(_Parser) {
+  ARMAsmParser(MCSubtargetInfo & _STI, MCAsmParser & _Parser,
+               const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(_STI), MII(MII), UC(_Parser) {
     MCAsmParserExtension::Initialize(_Parser);
 
     // Cache the MCRegisterInfo.
@@ -359,7 +362,7 @@
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
-                               unsigned &ErrorInfo,
+                               uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
   void onLabelParsed(MCSymbol *Symbol) override;
 };
@@ -383,6 +386,7 @@
     k_Memory,
     k_PostIndexRegister,
     k_MSRMask,
+    k_BankedReg,
     k_ProcIFlags,
     k_VectorIndex,
     k_Register,
@@ -435,6 +439,10 @@
     unsigned Val;
   };
 
+  struct BankedRegOp {
+    unsigned Val;
+  };
+
   struct TokOp {
     const char *Data;
     unsigned Length;
@@ -517,6 +525,7 @@
     struct ITMaskOp ITMask;
     struct IFlagsOp IFlags;
     struct MMaskOp MMask;
+    struct BankedRegOp BankedReg;
     struct TokOp Tok;
     struct RegOp Reg;
     struct VectorListOp VectorList;
@@ -585,6 +594,9 @@
     case k_MSRMask:
       MMask = o.MMask;
       break;
+    case k_BankedReg:
+      BankedReg = o.BankedReg;
+      break;
     case k_ProcIFlags:
       IFlags = o.IFlags;
       break;
@@ -679,6 +691,11 @@
     return MMask.Val;
   }
 
+  unsigned getBankedReg() const {
+    assert(Kind == k_BankedReg && "Invalid access!");
+    return BankedReg.Val;
+  }
+
   bool isCoprocNum() const { return Kind == k_CoprocNum; }
   bool isCoprocReg() const { return Kind == k_CoprocReg; }
   bool isCoprocOption() const { return Kind == k_CoprocOption; }
@@ -1384,6 +1401,7 @@
   }
 
   bool isMSRMask() const { return Kind == k_MSRMask; }
+  bool isBankedReg() const { return Kind == k_BankedReg; }
   bool isProcIFlags() const { return Kind == k_ProcIFlags; }
 
   // NEON operands.
@@ -1601,9 +1619,18 @@
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     // Must be a constant.
     if (!CE) return false;
-    int64_t Value = CE->getValue();
-    // i16 value in the range [0,255] or [0x0100, 0xff00]
-    return (Value >= 0 && Value < 256) || (Value >= 0x0100 && Value <= 0xff00);
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi16splat(Value);
+  }
+
+  bool isNEONi16splatNot() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi16splat(~Value & 0xffff);
   }
 
   bool isNEONi32splat() const {
@@ -1614,12 +1641,18 @@
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     // Must be a constant.
     if (!CE) return false;
-    int64_t Value = CE->getValue();
-    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X.
-    return (Value >= 0 && Value < 256) ||
-      (Value >= 0x0100 && Value <= 0xff00) ||
-      (Value >= 0x010000 && Value <= 0xff0000) ||
-      (Value >= 0x01000000 && Value <= 0xff000000);
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi32splat(Value);
+  }
+
+  bool isNEONi32splatNot() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    unsigned Value = CE->getValue();
+    return ARM_AM::isNEONi32splat(~Value);
   }
 
   bool isNEONByteReplicate(unsigned NumBytes) const {
@@ -1655,6 +1688,7 @@
     int64_t Value = CE->getValue();
     // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
     // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+    // FIXME: This is probably wrong and a copy and paste from previous example
     return (Value >= 0 && Value < 256) ||
       (Value >= 0x0100 && Value <= 0xff00) ||
       (Value >= 0x010000 && Value <= 0xff0000) ||
@@ -1670,6 +1704,7 @@
     int64_t Value = ~CE->getValue();
     // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
     // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+    // FIXME: This is probably wrong and a copy and paste from previous example
     return (Value >= 0 && Value < 256) ||
       (Value >= 0x0100 && Value <= 0xff00) ||
       (Value >= 0x010000 && Value <= 0xff0000) ||
@@ -2334,6 +2369,11 @@
     Inst.addOperand(MCOperand::CreateImm(unsigned(getMSRMask())));
   }
 
+  void addBankedRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getBankedReg())));
+  }
+
   void addProcIFlagsOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::CreateImm(unsigned(getProcIFlags())));
@@ -2378,10 +2418,16 @@
     // The immediate encodes the type of constant as well as the value.
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     unsigned Value = CE->getValue();
-    if (Value >= 256)
-      Value = (Value >> 8) | 0xa00;
-    else
-      Value |= 0x800;
+    Value = ARM_AM::encodeNEONi16splat(Value);
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONi16splatNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    Value = ARM_AM::encodeNEONi16splat(~Value & 0xffff);
     Inst.addOperand(MCOperand::CreateImm(Value));
   }
 
@@ -2390,12 +2436,16 @@
     // The immediate encodes the type of constant as well as the value.
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     unsigned Value = CE->getValue();
-    if (Value >= 256 && Value <= 0xff00)
-      Value = (Value >> 8) | 0x200;
-    else if (Value > 0xffff && Value <= 0xff0000)
-      Value = (Value >> 16) | 0x400;
-    else if (Value > 0xffffff)
-      Value = (Value >> 24) | 0x600;
+    Value = ARM_AM::encodeNEONi32splat(Value);
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONi32splatNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    Value = ARM_AM::encodeNEONi32splat(~Value);
     Inst.addOperand(MCOperand::CreateImm(Value));
   }
 
@@ -2736,6 +2786,14 @@
     Op->EndLoc = S;
     return Op;
   }
+
+  static std::unique_ptr<ARMOperand> CreateBankedReg(unsigned Reg, SMLoc S) {
+    auto Op = make_unique<ARMOperand>(k_BankedReg);
+    Op->BankedReg.Val = Reg;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
 };
 
 } // end anonymous namespace.
@@ -2769,6 +2827,9 @@
   case k_MSRMask:
     OS << "<mask: " << getMSRMask() << ">";
     break;
+  case k_BankedReg:
+    OS << "<banked reg: " << getBankedReg() << ">";
+    break;
   case k_Immediate:
     getImm()->print(OS);
     break;
@@ -2871,8 +2932,9 @@
 
 bool ARMAsmParser::ParseRegister(unsigned &RegNo,
                                  SMLoc &StartLoc, SMLoc &EndLoc) {
-  StartLoc = Parser.getTok().getLoc();
-  EndLoc = Parser.getTok().getEndLoc();
+  const AsmToken &Tok = getParser().getTok();
+  StartLoc = Tok.getLoc();
+  EndLoc = Tok.getEndLoc();
   RegNo = tryParseRegister();
 
   return (RegNo == (unsigned)-1);
@@ -2883,6 +2945,7 @@
 /// returned.  Otherwise return -1.
 ///
 int ARMAsmParser::tryParseRegister() {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier)) return -1;
 
@@ -2924,6 +2987,10 @@
     return Entry->getValue();
   }
 
+  // Some FPUs only have 16 D registers, so D16-D31 are invalid
+  if (hasD16() && RegNum >= ARM::D16 && RegNum <= ARM::D31)
+    return -1;
+
   Parser.Lex(); // Eat identifier token.
 
   return RegNum;
@@ -2935,6 +3002,7 @@
 // consumed in the process of trying to parse the shifter (i.e., when it is
 // indeed a shifter operand, but malformed).
 int ARMAsmParser::tryParseShiftRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
@@ -3037,6 +3105,7 @@
 /// TODO this is likely to change to allow different register types and or to
 /// parse for a specific register type.
 bool ARMAsmParser::tryParseRegisterWithWriteBack(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &RegTok = Parser.getTok();
   int RegNo = tryParseRegister();
   if (RegNo == -1)
@@ -3118,9 +3187,10 @@
       return -1;
     switch (Name[1]) {
     default:  return -1;
-    // p10 and p11 are invalid for coproc instructions (reserved for FP/NEON)
-    case '0': return CoprocOp == 'p'? -1: 10;
-    case '1': return CoprocOp == 'p'? -1: 11;
+    // CP10 and CP11 are VFP/NEON and so vector instructions should be used.
+    // However, old cores (v5/v6) did use them in that way.
+    case '0': return 10;
+    case '1': return 11;
     case '2': return 12;
     case '3': return 13;
     case '4': return 14;
@@ -3132,6 +3202,7 @@
 /// parseITCondCode - Try to parse a condition code for an IT instruction.
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseITCondCode(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (!Tok.is(AsmToken::Identifier))
@@ -3169,6 +3240,7 @@
 /// number, the token is eaten and the operand is added to the operand list.
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseCoprocNumOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
@@ -3177,6 +3249,9 @@
   int Num = MatchCoprocessorOperandName(Tok.getString(), 'p');
   if (Num == -1)
     return MatchOperand_NoMatch;
+  // ARMv7 and v8 don't allow cp10/cp11 due to VFP/NEON specific instructions
+  if ((hasV7Ops() || hasV8Ops()) && (Num == 10 || Num == 11))
+    return MatchOperand_NoMatch;
 
   Parser.Lex(); // Eat identifier token.
   Operands.push_back(ARMOperand::CreateCoprocNum(Num, S));
@@ -3188,6 +3263,7 @@
 /// number, the token is eaten and the operand is added to the operand list.
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseCoprocRegOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
@@ -3206,6 +3282,7 @@
 /// coproc_option : '{' imm0_255 '}'
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseCoprocOptionOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
 
   // If this isn't a '{', this isn't a coprocessor immediate operand.
@@ -3283,6 +3360,7 @@
 
 /// Parse a register list.
 bool ARMAsmParser::parseRegisterList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   assert(Parser.getTok().is(AsmToken::LCurly) &&
          "Token is not a Left Curly Brace");
   SMLoc S = Parser.getTok().getLoc();
@@ -3414,6 +3492,7 @@
 // Helper function to parse the lane index for vector lists.
 ARMAsmParser::OperandMatchResultTy ARMAsmParser::
 parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) {
+  MCAsmParser &Parser = getParser();
   Index = 0; // Always return a defined index value.
   if (Parser.getTok().is(AsmToken::LBrac)) {
     Parser.Lex(); // Eat the '['.
@@ -3465,6 +3544,7 @@
 // parse a vector register list
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseVectorList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   VectorLaneTy LaneKind;
   unsigned LaneIndex;
   SMLoc S = Parser.getTok().getLoc();
@@ -3716,6 +3796,7 @@
 /// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseMemBarrierOptOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   unsigned Opt;
@@ -3787,6 +3868,7 @@
 /// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options.
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseInstSyncBarrierOptOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   unsigned Opt;
@@ -3838,6 +3920,7 @@
 /// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseProcIFlagsOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (!Tok.is(AsmToken::Identifier)) 
@@ -3872,6 +3955,7 @@
 /// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (!Tok.is(AsmToken::Identifier))
@@ -3892,9 +3976,6 @@
       // should really only be allowed when writing a special register.  Note
       // they get dropped in the MRS instruction reading a special register as
       // the SYSm field is only 8 bits.
-      //
-      // FIXME: the _g and _nzcvqg versions are only allowed if the processor
-      // includes the DSP extension but that is not checked.
       .Case("apsr", 0x800)
       .Case("apsr_nzcvq", 0x800)
       .Case("apsr_g", 0x400)
@@ -3926,6 +4007,11 @@
     if (FlagsVal == ~0U)
       return MatchOperand_NoMatch;
 
+    if (!hasThumb2DSP() && (FlagsVal & 0x400))
+      // The _g and _nzcvqg versions are only valid if the DSP extension is
+      // available.
+      return MatchOperand_NoMatch;
+
     if (!hasV7Ops() && FlagsVal >= 0x811 && FlagsVal <= 0x813)
       // basepri, basepri_max and faultmask only valid for V7m.
       return MatchOperand_NoMatch;
@@ -3998,9 +4084,67 @@
   return MatchOperand_Success;
 }
 
+/// parseBankedRegOperand - Try to parse a banked register (e.g. "lr_irq") for
+/// use in the MRS/MSR instructions added to support virtualization.
+ARMAsmParser::OperandMatchResultTy
+ARMAsmParser::parseBankedRegOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef RegName = Tok.getString();
+
+  // The values here come from B9.2.3 of the ARM ARM, where bits 4-0 are SysM
+  // and bit 5 is R.
+  unsigned Encoding = StringSwitch<unsigned>(RegName.lower())
+                          .Case("r8_usr", 0x00)
+                          .Case("r9_usr", 0x01)
+                          .Case("r10_usr", 0x02)
+                          .Case("r11_usr", 0x03)
+                          .Case("r12_usr", 0x04)
+                          .Case("sp_usr", 0x05)
+                          .Case("lr_usr", 0x06)
+                          .Case("r8_fiq", 0x08)
+                          .Case("r9_fiq", 0x09)
+                          .Case("r10_fiq", 0x0a)
+                          .Case("r11_fiq", 0x0b)
+                          .Case("r12_fiq", 0x0c)
+                          .Case("sp_fiq", 0x0d)
+                          .Case("lr_fiq", 0x0e)
+                          .Case("lr_irq", 0x10)
+                          .Case("sp_irq", 0x11)
+                          .Case("lr_svc", 0x12)
+                          .Case("sp_svc", 0x13)
+                          .Case("lr_abt", 0x14)
+                          .Case("sp_abt", 0x15)
+                          .Case("lr_und", 0x16)
+                          .Case("sp_und", 0x17)
+                          .Case("lr_mon", 0x1c)
+                          .Case("sp_mon", 0x1d)
+                          .Case("elr_hyp", 0x1e)
+                          .Case("sp_hyp", 0x1f)
+                          .Case("spsr_fiq", 0x2e)
+                          .Case("spsr_irq", 0x30)
+                          .Case("spsr_svc", 0x32)
+                          .Case("spsr_abt", 0x34)
+                          .Case("spsr_und", 0x36)
+                          .Case("spsr_mon", 0x3c)
+                          .Case("spsr_hyp", 0x3e)
+                          .Default(~0U);
+
+  if (Encoding == ~0U)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateBankedReg(Encoding, S));
+  return MatchOperand_Success;
+}
+
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parsePKHImm(OperandVector &Operands, StringRef Op, int Low,
                           int High) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier)) {
     Error(Parser.getTok().getLoc(), Op + " operand expected.");
@@ -4048,6 +4192,7 @@
 
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseSetEndImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   SMLoc S = Tok.getLoc();
   if (Tok.isNot(AsmToken::Identifier)) {
@@ -4077,6 +4222,7 @@
 ///             n == 32 encoded as n == 0.
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseShifterImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   SMLoc S = Tok.getLoc();
   if (Tok.isNot(AsmToken::Identifier)) {
@@ -4147,6 +4293,7 @@
 ///     ror #n  'n' in {0, 8, 16, 24}
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseRotImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   SMLoc S = Tok.getLoc();
   if (Tok.isNot(AsmToken::Identifier))
@@ -4193,6 +4340,7 @@
 
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseBitfield(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   // The bitfield descriptor is really two operands, the LSB and the width.
   if (Parser.getTok().isNot(AsmToken::Hash) &&
@@ -4269,6 +4417,7 @@
   // This method must return MatchOperand_NoMatch without consuming any tokens
   // in the case where there is no match, as other alternatives take other
   // parse methods.
+  MCAsmParser &Parser = getParser();
   AsmToken Tok = Parser.getTok();
   SMLoc S = Tok.getLoc();
   bool haveEaten = false;
@@ -4321,6 +4470,7 @@
   // This method must return MatchOperand_NoMatch without consuming any tokens
   // in the case where there is no match, as other alternatives take other
   // parse methods.
+  MCAsmParser &Parser = getParser();
   AsmToken Tok = Parser.getTok();
   SMLoc S = Tok.getLoc();
 
@@ -4458,6 +4608,7 @@
 /// Parse an ARM memory expression, return false if successful else return true
 /// or an error.  The first token must be a '[' when called.
 bool ARMAsmParser::parseMemory(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S, E;
   assert(Parser.getTok().is(AsmToken::LBrac) &&
          "Token is not a Left Bracket");
@@ -4649,6 +4800,7 @@
 /// return true if it parses a shift otherwise it returns false.
 bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
                                           unsigned &Amount) {
+  MCAsmParser &Parser = getParser();
   SMLoc Loc = Parser.getTok().getLoc();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier))
@@ -4709,6 +4861,7 @@
 /// parseFPImm - A floating point immediate expression operand.
 ARMAsmParser::OperandMatchResultTy
 ARMAsmParser::parseFPImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   // Anything that can accept a floating point constant as an operand
   // needs to go through here, as the regular parseExpression is
   // integer only.
@@ -4789,6 +4942,7 @@
 /// Parse a arm instruction operand.  For now this parses the operand regardless
 /// of the mnemonic.
 bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+  MCAsmParser &Parser = getParser();
   SMLoc S, E;
 
   // Check if the current operand has a custom associated parser, if so, try to
@@ -4921,6 +5075,7 @@
 // parsePrefix - Parse ARM 16-bit relocations expression prefix, i.e.
 //  :lower16: and :upper16:.
 bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
+  MCAsmParser &Parser = getParser();
   RefKind = ARMMCExpr::VK_ARM_None;
 
   // consume an optional '#' (GNU compatibility)
@@ -5271,7 +5426,7 @@
 static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) {
   return Mnemonic.startswith("vldm") || Mnemonic.startswith("vstm");
 }
-static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features,
+static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features,
                                  unsigned VariantID);
 
 static bool RequiresVFPRegListValidation(StringRef Inst,
@@ -5296,6 +5451,7 @@
 /// Parse an arm instruction mnemonic followed by its operands.
 bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                                     SMLoc NameLoc, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   // FIXME: Can this be done via tablegen in some fashion?
   bool RequireVFPRegisterListCheck;
   bool AcceptSinglePrecisionOnly;
@@ -5309,7 +5465,7 @@
   // The generic tblgen'erated code does this later, at the start of
   // MatchInstructionImpl(), but that's too late for aliases that include
   // any sort of suffix.
-  unsigned AvailableFeatures = getAvailableFeatures();
+  uint64_t AvailableFeatures = getAvailableFeatures();
   unsigned AssemblerDialect = getParser().getAssemblerDialect();
   applyMnemonicAliases(Name, AvailableFeatures, AssemblerDialect);
 
@@ -5415,6 +5571,8 @@
     Operands.push_back(ARMOperand::CreateImm(
           MCConstantExpr::Create(ProcessorIMod, getContext()),
                                  NameLoc, NameLoc));
+  } else if (Mnemonic == "cps" && isMClass()) {
+    return Error(NameLoc, "instruction 'cps' requires effect for M-class");
   }
 
   // Add the remaining tokens in the mnemonic.
@@ -5546,6 +5704,48 @@
     }
   }
 
+  // If first 2 operands of a 3 operand instruction are the same
+  // then transform to 2 operand version of the same instruction
+  // e.g. 'adds r0, r0, #1' transforms to 'adds r0, #1'
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (isThumbOne() && Operands.size() == 6 &&
+       (Mnemonic == "add" || Mnemonic == "sub" || Mnemonic == "and" ||
+        Mnemonic == "eor" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+        Mnemonic == "asr" || Mnemonic == "adc" || Mnemonic == "sbc" ||
+        Mnemonic == "ror" || Mnemonic == "orr" || Mnemonic == "bic")) {
+      ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+      ARMOperand &Op4 = static_cast<ARMOperand &>(*Operands[4]);
+      ARMOperand &Op5 = static_cast<ARMOperand &>(*Operands[5]);
+
+      // If both registers are the same then remove one of them from
+      // the operand list.
+      if (Op3.isReg() && Op4.isReg() && Op3.getReg() == Op4.getReg()) {
+          // If 3rd operand (variable Op5) is a register and the instruction is adds/sub
+          // then do not transform as the backend already handles this instruction
+          // correctly.
+          if (!Op5.isReg() || !((Mnemonic == "add" && CarrySetting) || Mnemonic == "sub")) {
+              Operands.erase(Operands.begin() + 3);
+              if (Mnemonic == "add" && !CarrySetting) {
+                  // Special case for 'add' (not 'adds') instruction must
+                  // remove the CCOut operand as well.
+                  Operands.erase(Operands.begin() + 1);
+              }
+          }
+      }
+  }
+
+  // If instruction is 'add' and first two register operands
+  // use SP register, then remove one of the SP registers from
+  // the instruction.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (isThumbOne() && Operands.size() == 5 && Mnemonic == "add" && !CarrySetting) {
+      ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
+      ARMOperand &Op3 = static_cast<ARMOperand &>(*Operands[3]);
+      if (Op2.isReg() && Op3.isReg() && Op2.getReg() == ARM::SP && Op3.getReg() == ARM::SP) {
+          Operands.erase(Operands.begin() + 2);
+      }
+  }
+
   // GNU Assembler extension (compatibility)
   if ((Mnemonic == "ldrd" || Mnemonic == "strd")) {
     ARMOperand &Op2 = static_cast<ARMOperand &>(*Operands[2]);
@@ -5728,6 +5928,48 @@
                    "source operands must be sequential");
     return false;
   }
+  case ARM::STR_PRE_IMM:
+  case ARM::STR_PRE_REG:
+  case ARM::STR_POST_IMM:
+  case ARM::STR_POST_REG:
+  case ARM::STRH_PRE:
+  case ARM::STRH_POST:
+  case ARM::STRB_PRE_IMM:
+  case ARM::STRB_PRE_REG:
+  case ARM::STRB_POST_IMM:
+  case ARM::STRB_POST_REG: {
+    // Rt must be different from Rn.
+    const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+
+    if (Rt == Rn)
+      return Error(Operands[3]->getStartLoc(),
+                   "source register and base register can't be identical");
+    return false;
+  }
+  case ARM::LDR_PRE_IMM:
+  case ARM::LDR_PRE_REG:
+  case ARM::LDR_POST_IMM:
+  case ARM::LDR_POST_REG:
+  case ARM::LDRH_PRE:
+  case ARM::LDRH_POST:
+  case ARM::LDRSH_PRE:
+  case ARM::LDRSH_POST:
+  case ARM::LDRB_PRE_IMM:
+  case ARM::LDRB_PRE_REG:
+  case ARM::LDRB_POST_IMM:
+  case ARM::LDRB_POST_REG:
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSB_POST: {
+    // Rt must be different from Rn.
+    const unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+
+    if (Rt == Rn)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination register and base register can't be identical");
+    return false;
+  }
   case ARM::SBFX:
   case ARM::UBFX: {
     // Width must be in range [1, 32-lsb].
@@ -5764,7 +6006,9 @@
       return Error(Operands[3]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
-
+    if (listContainsReg(Inst, 3 + HasWritebackToken, ARM::SP))
+      return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
+                   "SP not allowed in register list");
     break;
   }
   case ARM::LDMIA_UPD:
@@ -5775,7 +6019,19 @@
     // UNPREDICTABLE on v7 upwards. Goodness knows what they did before.
     if (!hasV7Ops())
       break;
-    // Fallthrough
+    if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+      return Error(Operands.back()->getStartLoc(),
+                   "writeback register not allowed in register list");
+    break;
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB: {
+    if (listContainsReg(Inst, 3, ARM::SP))
+      return Error(Operands.back()->getStartLoc(),
+                   "SP not allowed in register list");
+    break;
+  }
   case ARM::t2LDMIA_UPD:
   case ARM::t2LDMDB_UPD:
   case ARM::t2STMIA_UPD:
@@ -5783,6 +6039,10 @@
     if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
       return Error(Operands.back()->getStartLoc(),
                    "writeback register not allowed in register list");
+
+    if (listContainsReg(Inst, 4, ARM::SP))
+      return Error(Operands.back()->getStartLoc(),
+                   "SP not allowed in register list");
     break;
   }
   case ARM::sysLDMIA_UPD:
@@ -5851,6 +6111,9 @@
       return Error(Operands[4]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
+    if (listContainsReg(Inst, 4, ARM::SP) && !inITBlock())
+      return Error(Operands.back()->getStartLoc(),
+                   "SP not allowed in register list");
     break;
   }
   case ARM::tADDrSP: {
@@ -8010,7 +8273,7 @@
   }
   // Some high-register supporting Thumb1 encodings only allow both registers
   // to be from r0-r7 when in Thumb2.
-  else if (Opc == ARM::tADDhirr && isThumbOne() &&
+  else if (Opc == ARM::tADDhirr && isThumbOne() && !hasV6MOps() &&
            isARMLowRegister(Inst.getOperand(1).getReg()) &&
            isARMLowRegister(Inst.getOperand(2).getReg()))
     return Match_RequiresThumb2;
@@ -8028,10 +8291,10 @@
 }
 }
 
-static const char *getSubtargetFeatureName(unsigned Val);
+static const char *getSubtargetFeatureName(uint64_t Val);
 bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                            OperandVector &Operands,
-                                           MCStreamer &Out, unsigned &ErrorInfo,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
                                            bool MatchingInlineAsm) {
   MCInst Inst;
   unsigned MatchResult;
@@ -8085,7 +8348,7 @@
     // Special case the error message for the very common case where only
     // a single subtarget feature is missing (Thumb vs. ARM, e.g.).
     std::string Msg = "instruction requires:";
-    unsigned Mask = 1;
+    uint64_t Mask = 1;
     for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
       if (ErrorInfo & Mask) {
         Msg += " ";
@@ -8097,7 +8360,7 @@
   }
   case Match_InvalidOperand: {
     SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0U) {
+    if (ErrorInfo != ~0ULL) {
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction");
 
@@ -8174,6 +8437,7 @@
   const MCObjectFileInfo::Environment Format =
     getContext().getObjectFileInfo()->getObjectFileType();
   bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+  bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
 
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal == ".word")
@@ -8225,7 +8489,7 @@
   else if (IDVal == ".thumb_set")
     return parseDirectiveThumbSet(DirectiveID.getLoc());
 
-  if (!IsMachO) {
+  if (!IsMachO && !IsCOFF) {
     if (IDVal == ".arch")
       return parseDirectiveArch(DirectiveID.getLoc());
     else if (IDVal == ".cpu")
@@ -8256,6 +8520,7 @@
 ///  ::= .short expression [, expression]*
 ///  ::= .word expression [, expression]*
 bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
       const MCExpr *Value;
@@ -8285,6 +8550,7 @@
 /// parseDirectiveThumb
 ///  ::= .thumb
 bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     Error(L, "unexpected token in directive");
     return false;
@@ -8306,6 +8572,7 @@
 /// parseDirectiveARM
 ///  ::= .arm
 bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     Error(L, "unexpected token in directive");
     return false;
@@ -8334,12 +8601,13 @@
 /// parseDirectiveThumbFunc
 ///  ::= .thumbfunc symbol_name
 bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
-  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI->hasSubsectionsViaSymbols();
+  MCAsmParser &Parser = getParser();
+  const auto Format = getContext().getObjectFileInfo()->getObjectFileType();
+  bool IsMachO = Format == MCObjectFileInfo::IsMachO;
 
   // Darwin asm has (optionally) function name after .thumb_func direction
   // ELF doesn't
-  if (isMachO) {
+  if (IsMachO) {
     const AsmToken &Tok = Parser.getTok();
     if (Tok.isNot(AsmToken::EndOfStatement)) {
       if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String)) {
@@ -8356,7 +8624,8 @@
   }
 
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    Error(L, "unexpected token in directive");
+    Error(Parser.getTok().getLoc(), "unexpected token in directive");
+    Parser.eatToEndOfStatement();
     return false;
   }
 
@@ -8367,6 +8636,7 @@
 /// parseDirectiveSyntax
 ///  ::= .syntax unified | divided
 bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier)) {
     Error(L, "unexpected token in .syntax directive");
@@ -8398,6 +8668,7 @@
 /// parseDirectiveCode
 ///  ::= .code 16 | 32
 bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Integer)) {
     Error(L, "unexpected token in .code directive");
@@ -8442,6 +8713,7 @@
 /// parseDirectiveReq
 ///  ::= name .req registername
 bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+  MCAsmParser &Parser = getParser();
   Parser.Lex(); // Eat the '.req' token.
   unsigned Reg;
   SMLoc SRegLoc, ERegLoc;
@@ -8460,7 +8732,7 @@
 
   Parser.Lex(); // Consume the EndOfStatement
 
-  if (RegisterReqs.GetOrCreateValue(Name, Reg).getValue() != Reg) {
+  if (RegisterReqs.insert(std::make_pair(Name, Reg)).first->second != Reg) {
     Error(SRegLoc, "redefinition of '" + Name + "' does not match original.");
     return false;
   }
@@ -8471,6 +8743,7 @@
 /// parseDirectiveUneq
 ///  ::= .unreq registername
 bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (Parser.getTok().isNot(AsmToken::Identifier)) {
     Parser.eatToEndOfStatement();
     Error(L, "unexpected input in .unreq directive.");
@@ -8507,6 +8780,7 @@
 ///  ::= .eabi_attribute int, int [, "str"]
 ///  ::= .eabi_attribute Tag_name, int [, "str"]
 bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   int64_t Tag;
   SMLoc TagLoc;
   TagLoc = Parser.getTok().getLoc();
@@ -8617,6 +8891,32 @@
   return false;
 }
 
+// FIXME: This is duplicated in getARMFPUFeatures() in
+// tools/clang/lib/Driver/Tools.cpp
+static const struct {
+  const unsigned Fpu;
+  const uint64_t Enabled;
+  const uint64_t Disabled;
+} Fpus[] = {
+      {ARM::VFP, ARM::FeatureVFP2, ARM::FeatureNEON},
+      {ARM::VFPV2, ARM::FeatureVFP2, ARM::FeatureNEON},
+      {ARM::VFPV3, ARM::FeatureVFP3, ARM::FeatureNEON},
+      {ARM::VFPV3_D16, ARM::FeatureVFP3 | ARM::FeatureD16, ARM::FeatureNEON},
+      {ARM::VFPV4, ARM::FeatureVFP4, ARM::FeatureNEON},
+      {ARM::VFPV4_D16, ARM::FeatureVFP4 | ARM::FeatureD16, ARM::FeatureNEON},
+      {ARM::FPV5_D16, ARM::FeatureFPARMv8 | ARM::FeatureD16,
+       ARM::FeatureNEON | ARM::FeatureCrypto},
+      {ARM::FP_ARMV8, ARM::FeatureFPARMv8,
+       ARM::FeatureNEON | ARM::FeatureCrypto},
+      {ARM::NEON, ARM::FeatureNEON, 0},
+      {ARM::NEON_VFPV4, ARM::FeatureVFP4 | ARM::FeatureNEON, 0},
+      {ARM::NEON_FP_ARMV8, ARM::FeatureFPARMv8 | ARM::FeatureNEON,
+       ARM::FeatureCrypto},
+      {ARM::CRYPTO_NEON_FP_ARMV8,
+       ARM::FeatureFPARMv8 | ARM::FeatureNEON | ARM::FeatureCrypto, 0},
+      {ARM::SOFTVFP, 0, 0},
+};
+
 /// parseDirectiveFPU
 ///  ::= .fpu str
 bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
@@ -8632,6 +8932,18 @@
     return false;
   }
 
+  for (const auto &Fpu : Fpus) {
+    if (Fpu.Fpu != ID)
+      continue;
+
+    // Need to toggle features that should be on but are off and that
+    // should be off but are on.
+    uint64_t Toggle = (Fpu.Enabled & ~STI.getFeatureBits()) |
+                      (Fpu.Disabled & STI.getFeatureBits());
+    setAvailableFeatures(ComputeAvailableFeatures(STI.ToggleFeature(Toggle)));
+    break;
+  }
+
   getTargetStreamer().emitFPU(ID);
   return false;
 }
@@ -8698,6 +9010,7 @@
 /// parseDirectivePersonality
 ///  ::= .personality name
 bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   bool HasExistingPersonality = UC.hasPersonality();
 
   UC.recordPersonality(L);
@@ -8761,6 +9074,7 @@
 /// parseDirectiveSetFP
 ///  ::= .setfp fpreg, spreg [, offset]
 bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   // Check the ordering of unwind directives
   if (!UC.hasFnStart()) {
     Error(L, ".fnstart must precede .setfp directive");
@@ -8838,6 +9152,7 @@
 /// parseDirective
 ///  ::= .pad offset
 bool ARMAsmParser::parseDirectivePad(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   // Check the ordering of unwind directives
   if (!UC.hasFnStart()) {
     Error(L, ".fnstart must precede .pad directive");
@@ -8912,6 +9227,7 @@
 ///  ::= .inst.n opcode [, ...]
 ///  ::= .inst.w opcode [, ...]
 bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) {
+  MCAsmParser &Parser = getParser();
   int Width;
 
   if (isThumb()) {
@@ -9008,7 +9324,7 @@
   }
 
   if (!Section) {
-    getStreamer().InitSections();
+    getStreamer().InitSections(false);
     Section = getStreamer().getCurrentSection().first;
   }
 
@@ -9024,6 +9340,7 @@
 /// parseDirectivePersonalityIndex
 ///   ::= .personalityindex index
 bool ARMAsmParser::parseDirectivePersonalityIndex(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   bool HasExistingPersonality = UC.hasPersonality();
 
   UC.recordPersonalityIndex(L);
@@ -9079,6 +9396,7 @@
 /// parseDirectiveUnwindRaw
 ///   ::= .unwind_raw offset, opcode [, opcode...]
 bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (!UC.hasFnStart()) {
     Parser.eatToEndOfStatement();
     Error(L, ".fnstart must precede .unwind_raw directives");
@@ -9160,6 +9478,8 @@
 /// parseDirectiveTLSDescSeq
 ///   ::= .tlsdescseq tls-variable
 bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
   if (getLexer().isNot(AsmToken::Identifier)) {
     TokError("expected variable after '.tlsdescseq' directive");
     Parser.eatToEndOfStatement();
@@ -9184,6 +9504,7 @@
 /// parseDirectiveMovSP
 ///  ::= .movsp reg [, #offset]
 bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (!UC.hasFnStart()) {
     Parser.eatToEndOfStatement();
     Error(L, ".fnstart must precede .movsp directives");
@@ -9247,6 +9568,7 @@
 /// parseDirectiveObjectArch
 ///   ::= .object_arch name
 bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::Identifier)) {
     Error(getLexer().getLoc(), "unexpected token");
     Parser.eatToEndOfStatement();
@@ -9303,6 +9625,8 @@
 /// parseDirectiveThumbSet
 ///  ::= .thumb_set name, value
 bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
   StringRef Name;
   if (Parser.parseIdentifier(Name)) {
     TokError("expected identifier after '.thumb_set'");
@@ -9349,8 +9673,8 @@
 #define GET_MATCHER_IMPLEMENTATION
 #include "ARMGenAsmMatcher.inc"
 
-static const struct ExtMapEntry {
-  const char *Extension;
+static const struct {
+  const char *Name;
   const unsigned ArchCheck;
   const uint64_t Features;
 } Extensions[] = {
@@ -9381,46 +9705,47 @@
 /// parseDirectiveArchExtension
 ///   ::= .arch_extension [no]feature
 bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+
   if (getLexer().isNot(AsmToken::Identifier)) {
     Error(getLexer().getLoc(), "unexpected token");
     Parser.eatToEndOfStatement();
     return false;
   }
 
-  StringRef Extension = Parser.getTok().getString();
+  StringRef Name = Parser.getTok().getString();
   SMLoc ExtLoc = Parser.getTok().getLoc();
   getLexer().Lex();
 
   bool EnableFeature = true;
-  if (Extension.startswith_lower("no")) {
+  if (Name.startswith_lower("no")) {
     EnableFeature = false;
-    Extension = Extension.substr(2);
+    Name = Name.substr(2);
   }
 
-  for (unsigned EI = 0, EE = array_lengthof(Extensions); EI != EE; ++EI) {
-    if (Extensions[EI].Extension != Extension)
+  for (const auto &Extension : Extensions) {
+    if (Extension.Name != Name)
       continue;
 
-    unsigned FB = getAvailableFeatures();
-    if ((FB & Extensions[EI].ArchCheck) != Extensions[EI].ArchCheck) {
-      Error(ExtLoc, "architectural extension '" + Extension + "' is not "
+    if (!Extension.Features)
+      report_fatal_error("unsupported architectural extension: " + Name);
+
+    if ((getAvailableFeatures() & Extension.ArchCheck) != Extension.ArchCheck) {
+      Error(ExtLoc, "architectural extension '" + Name + "' is not "
             "allowed for the current base architecture");
       return false;
     }
 
-    if (!Extensions[EI].Features)
-      report_fatal_error("unsupported architectural extension: " + Extension);
-
-    if (EnableFeature)
-      FB |= ComputeAvailableFeatures(Extensions[EI].Features);
-    else
-      FB &= ~ComputeAvailableFeatures(Extensions[EI].Features);
-
-    setAvailableFeatures(FB);
+    uint64_t ToggleFeatures = EnableFeature
+                                  ? (~STI.getFeatureBits() & Extension.Features)
+                                  : ( STI.getFeatureBits() & Extension.Features);
+    uint64_t Features =
+        ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
+    setAvailableFeatures(Features);
     return false;
   }
 
-  Error(ExtLoc, "unknown architectural extension: " + Extension);
+  Error(ExtLoc, "unknown architectural extension: " + Name);
   Parser.eatToEndOfStatement();
   return false;
 }

diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 9b5fa75..2530640 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt

@@ -2,8 +2,7 @@
 
 tablegen(LLVM ARMGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM ARMGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM ARMGenCodeEmitter.inc -gen-emitter)
-tablegen(LLVM ARMGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM ARMGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM ARMGenMCPseudoLowering.inc -gen-pseudo-lowering)
 tablegen(LLVM ARMGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM ARMGenAsmMatcher.inc -gen-asm-matcher)
@@ -19,7 +18,6 @@
   ARMAsmPrinter.cpp
   ARMBaseInstrInfo.cpp
   ARMBaseRegisterInfo.cpp
-  ARMCodeEmitter.cpp
   ARMConstantIslandPass.cpp
   ARMConstantPoolValue.cpp
   ARMExpandPseudoInsts.cpp
@@ -29,7 +27,6 @@
   ARMISelDAGToDAG.cpp
   ARMISelLowering.cpp
   ARMInstrInfo.cpp
-  ARMJITInfo.cpp
   ARMLoadStoreOptimizer.cpp
   ARMMCInstLower.cpp
   ARMMachineFunctionInfo.cpp

diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 4d4038d..ef65418 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp

@@ -20,7 +20,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include <vector>
@@ -85,42 +84,34 @@
 }
 
 namespace {
-/// ARMDisassembler - ARM disassembler for all ARM platforms.
+/// ARM disassembler for all ARM platforms.
 class ARMDisassembler : public MCDisassembler {
 public:
-  /// Constructor     - Initializes the disassembler.
-  ///
   ARMDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
     MCDisassembler(STI, Ctx) {
   }
 
-  ~ARMDisassembler() {
-  }
+  ~ARMDisassembler() {}
 
-  /// getInstruction - See MCDisassembler.
-  DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
-                              const MemoryObject &region, uint64_t address,
-                              raw_ostream &vStream,
-                              raw_ostream &cStream) const override;
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
 };
 
-/// ThumbDisassembler - Thumb disassembler for all Thumb platforms.
+/// Thumb disassembler for all Thumb platforms.
 class ThumbDisassembler : public MCDisassembler {
 public:
-  /// Constructor     - Initializes the disassembler.
-  ///
   ThumbDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
     MCDisassembler(STI, Ctx) {
   }
 
-  ~ThumbDisassembler() {
-  }
+  ~ThumbDisassembler() {}
 
-  /// getInstruction - See MCDisassembler.
-  DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
-                              const MemoryObject &region, uint64_t address,
-                              raw_ostream &vStream,
-                              raw_ostream &cStream) const override;
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
 
 private:
   mutable ITStatus ITBlock;
@@ -281,6 +272,8 @@
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn,
                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
@@ -413,103 +406,99 @@
 }
 
 DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                             const MemoryObject &Region,
-                                             uint64_t Address,
-                                             raw_ostream &os,
-                                             raw_ostream &cs) const {
-  CommentStream = &cs;
-
-  uint8_t bytes[4];
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address, raw_ostream &OS,
+                                             raw_ostream &CS) const {
+  CommentStream = &CS;
 
   assert(!(STI.getFeatureBits() & ARM::ModeThumb) &&
-         "Asked to disassemble an ARM instruction but Subtarget is in Thumb mode!");
+         "Asked to disassemble an ARM instruction but Subtarget is in Thumb "
+         "mode!");
 
   // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, bytes) == -1) {
+  if (Bytes.size() < 4) {
     Size = 0;
     return MCDisassembler::Fail;
   }
 
   // Encoded as a small-endian 32-bit word in the stream.
-  uint32_t insn = (bytes[3] << 24) |
-                  (bytes[2] << 16) |
-                  (bytes[1] <<  8) |
-                  (bytes[0] <<  0);
+  uint32_t Insn =
+      (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
 
   // Calling the auto-generated decoder function.
-  DecodeStatus result = decodeInstruction(DecoderTableARM32, MI, insn,
-                                          Address, this, STI);
-  if (result != MCDisassembler::Fail) {
+  DecodeStatus Result =
+      decodeInstruction(DecoderTableARM32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
-    return result;
+    return Result;
   }
 
   // VFP and NEON instructions, similarly, are shared between ARM
   // and Thumb modes.
   MI.clear();
-  result = decodeInstruction(DecoderTableVFP32, MI, insn, Address, this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
-    return result;
+    return Result;
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableVFPV832, MI, insn, Address, this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
-    return result;
+    return Result;
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableNEONData32, MI, insn, Address,
-                             this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result =
+      decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
     // Add a fake predicate operand, because we share these instruction
     // definitions with Thumb2 where these instructions are predicable.
     if (!DecodePredicateOperand(MI, 0xE, Address, this))
       return MCDisassembler::Fail;
-    return result;
+    return Result;
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableNEONLoadStore32, MI, insn, Address,
+  Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address,
                              this, STI);
-  if (result != MCDisassembler::Fail) {
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
     // Add a fake predicate operand, because we share these instruction
     // definitions with Thumb2 where these instructions are predicable.
     if (!DecodePredicateOperand(MI, 0xE, Address, this))
       return MCDisassembler::Fail;
-    return result;
+    return Result;
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableNEONDup32, MI, insn, Address,
-                             this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result =
+      decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
     // Add a fake predicate operand, because we share these instruction
     // definitions with Thumb2 where these instructions are predicable.
     if (!DecodePredicateOperand(MI, 0xE, Address, this))
       return MCDisassembler::Fail;
-    return result;
+    return Result;
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTablev8NEON32, MI, insn, Address,
-                             this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result =
+      decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
-    return result;
+    return Result;
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTablev8Crypto32, MI, insn, Address,
-                             this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result =
+      decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
-    return result;
+    return Result;
   }
 
   MI.clear();
@@ -681,55 +670,53 @@
 }
 
 DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                               const MemoryObject &Region,
+                                               ArrayRef<uint8_t> Bytes,
                                                uint64_t Address,
-                                               raw_ostream &os,
-                                               raw_ostream &cs) const {
-  CommentStream = &cs;
-
-  uint8_t bytes[4];
+                                               raw_ostream &OS,
+                                               raw_ostream &CS) const {
+  CommentStream = &CS;
 
   assert((STI.getFeatureBits() & ARM::ModeThumb) &&
          "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
 
   // We want to read exactly 2 bytes of data.
-  if (Region.readBytes(Address, 2, bytes) == -1) {
+  if (Bytes.size() < 2) {
     Size = 0;
     return MCDisassembler::Fail;
   }
 
-  uint16_t insn16 = (bytes[1] << 8) | bytes[0];
-  DecodeStatus result = decodeInstruction(DecoderTableThumb16, MI, insn16,
-                                          Address, this, STI);
-  if (result != MCDisassembler::Fail) {
+  uint16_t Insn16 = (Bytes[1] << 8) | Bytes[0];
+  DecodeStatus Result =
+      decodeInstruction(DecoderTableThumb16, MI, Insn16, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 2;
-    Check(result, AddThumbPredicate(MI));
-    return result;
+    Check(Result, AddThumbPredicate(MI));
+    return Result;
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableThumbSBit16, MI, insn16,
-                             Address, this, STI);
-  if (result) {
+  Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this,
+                             STI);
+  if (Result) {
     Size = 2;
     bool InITBlock = ITBlock.instrInITBlock();
-    Check(result, AddThumbPredicate(MI));
+    Check(Result, AddThumbPredicate(MI));
     AddThumb1SBit(MI, InITBlock);
-    return result;
+    return Result;
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableThumb216, MI, insn16,
-                             Address, this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result =
+      decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 2;
 
     // Nested IT blocks are UNPREDICTABLE.  Must be checked before we add
     // the Thumb predicate.
     if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
-      result = MCDisassembler::SoftFail;
+      Result = MCDisassembler::SoftFail;
 
-    Check(result, AddThumbPredicate(MI));
+    Check(Result, AddThumbPredicate(MI));
 
     // If we find an IT instruction, we need to parse its condition
     // code and mask operands so that we can apply them correctly
@@ -741,115 +728,115 @@
       ITBlock.setITState(Firstcond, Mask);
     }
 
-    return result;
+    return Result;
   }
 
   // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, bytes) == -1) {
+  if (Bytes.size() < 4) {
     Size = 0;
     return MCDisassembler::Fail;
   }
 
-  uint32_t insn32 = (bytes[3] <<  8) |
-                    (bytes[2] <<  0) |
-                    (bytes[1] << 24) |
-                    (bytes[0] << 16);
+  uint32_t Insn32 =
+      (Bytes[3] << 8) | (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16);
   MI.clear();
-  result = decodeInstruction(DecoderTableThumb32, MI, insn32, Address,
-                             this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result =
+      decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
     bool InITBlock = ITBlock.instrInITBlock();
-    Check(result, AddThumbPredicate(MI));
+    Check(Result, AddThumbPredicate(MI));
     AddThumb1SBit(MI, InITBlock);
-    return result;
+    return Result;
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableThumb232, MI, insn32, Address,
-                             this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result =
+      decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
-    Check(result, AddThumbPredicate(MI));
-    return result;
+    Check(Result, AddThumbPredicate(MI));
+    return Result;
   }
 
-  if (fieldFromInstruction(insn32, 28, 4) == 0xE) {
+  if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
     MI.clear();
-    result = decodeInstruction(DecoderTableVFP32, MI, insn32, Address, this, STI);
-    if (result != MCDisassembler::Fail) {
+    Result =
+        decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
       Size = 4;
       UpdateThumbVFPPredicate(MI);
-      return result;
+      return Result;
     }
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableVFPV832, MI, insn32, Address, this, STI);
-  if (result != MCDisassembler::Fail) {
+  Result =
+      decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
     Size = 4;
-    return result;
+    return Result;
   }
 
-  if (fieldFromInstruction(insn32, 28, 4) == 0xE) {
+  if (fieldFromInstruction(Insn32, 28, 4) == 0xE) {
     MI.clear();
-    result = decodeInstruction(DecoderTableNEONDup32, MI, insn32, Address,
-                               this, STI);
-    if (result != MCDisassembler::Fail) {
+    Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this,
+                               STI);
+    if (Result != MCDisassembler::Fail) {
       Size = 4;
-      Check(result, AddThumbPredicate(MI));
-      return result;
+      Check(Result, AddThumbPredicate(MI));
+      return Result;
     }
   }
 
-  if (fieldFromInstruction(insn32, 24, 8) == 0xF9) {
+  if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) {
     MI.clear();
-    uint32_t NEONLdStInsn = insn32;
+    uint32_t NEONLdStInsn = Insn32;
     NEONLdStInsn &= 0xF0FFFFFF;
     NEONLdStInsn |= 0x04000000;
-    result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
+    Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
                                Address, this, STI);
-    if (result != MCDisassembler::Fail) {
+    if (Result != MCDisassembler::Fail) {
       Size = 4;
-      Check(result, AddThumbPredicate(MI));
-      return result;
+      Check(Result, AddThumbPredicate(MI));
+      return Result;
     }
   }
 
-  if (fieldFromInstruction(insn32, 24, 4) == 0xF) {
+  if (fieldFromInstruction(Insn32, 24, 4) == 0xF) {
     MI.clear();
-    uint32_t NEONDataInsn = insn32;
+    uint32_t NEONDataInsn = Insn32;
     NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
     NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
     NEONDataInsn |= 0x12000000; // Set bits 28 and 25
-    result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
+    Result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
                                Address, this, STI);
-    if (result != MCDisassembler::Fail) {
+    if (Result != MCDisassembler::Fail) {
       Size = 4;
-      Check(result, AddThumbPredicate(MI));
-      return result;
+      Check(Result, AddThumbPredicate(MI));
+      return Result;
     }
 
     MI.clear();
-    uint32_t NEONCryptoInsn = insn32;
+    uint32_t NEONCryptoInsn = Insn32;
     NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
     NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
     NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
-    result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
+    Result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
                                Address, this, STI);
-    if (result != MCDisassembler::Fail) {
+    if (Result != MCDisassembler::Fail) {
       Size = 4;
-      return result;
+      return Result;
     }
 
     MI.clear();
-    uint32_t NEONv8Insn = insn32;
+    uint32_t NEONv8Insn = Insn32;
     NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
-    result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+    Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
                                this, STI);
-    if (result != MCDisassembler::Fail) {
+    if (Result != MCDisassembler::Fail) {
       Size = 4;
-      return result;
+      return Result;
     }
   }
 
@@ -1015,7 +1002,11 @@
 
 static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  bool hasD16 = featureBits & ARM::FeatureD16;
+
+  if (RegNo > 31 || (hasD16 && RegNo > 15))
     return MCDisassembler::Fail;
 
   unsigned Register = DPRDecoderTable[RegNo];
@@ -2973,11 +2964,9 @@
   if (size == 0x3) {
     if (align == 0)
       return MCDisassembler::Fail;
-    size = 4;
     align = 16;
   } else {
     if (size == 2) {
-      size = 1 << size;
       align *= 8;
     } else {
       size = 1 << size;
@@ -3267,6 +3256,11 @@
   unsigned Rt = fieldFromInstruction(Insn, 12, 4);
   unsigned Rn = fieldFromInstruction(Insn, 16, 4);
 
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  bool hasMP = featureBits & ARM::FeatureMP;
+  bool hasV7Ops = featureBits & ARM::HasV7Ops;
+
   if (Rn == 15) {
     switch (Inst.getOpcode()) {
     case ARM::t2LDRBs:
@@ -3302,11 +3296,10 @@
     case ARM::t2LDRSHs:
       return MCDisassembler::Fail;
     case ARM::t2LDRHs:
-      // FIXME: this instruction is only available with MP extensions,
-      // this should be checked first but we don't have access to the
-      // feature bits here.
       Inst.setOpcode(ARM::t2PLDWs);
       break;
+    case ARM::t2LDRSBs:
+      Inst.setOpcode(ARM::t2PLIs);
     default:
       break;
     }
@@ -3314,8 +3307,14 @@
 
   switch (Inst.getOpcode()) {
     case ARM::t2PLDs:
-    case ARM::t2PLDWs:
+      break;
     case ARM::t2PLIs:
+      if (!hasV7Ops)
+        return MCDisassembler::Fail;
+      break;
+    case ARM::t2PLDWs:
+      if (!hasV7Ops || !hasMP)
+        return MCDisassembler::Fail;
       break;
     default:
       if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
@@ -3341,6 +3340,12 @@
   unsigned imm = fieldFromInstruction(Insn, 0, 8);
   imm |= (U << 8);
   imm |= (Rn << 9);
+  unsigned add = fieldFromInstruction(Insn, 9, 1);
+
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  bool hasMP = featureBits & ARM::FeatureMP;
+  bool hasV7Ops = featureBits & ARM::HasV7Ops;
 
   if (Rn == 15) {
     switch (Inst.getOpcode()) {
@@ -3375,6 +3380,13 @@
     switch (Inst.getOpcode()) {
     case ARM::t2LDRSHi8:
       return MCDisassembler::Fail;
+    case ARM::t2LDRHi8:
+      if (!add)
+        Inst.setOpcode(ARM::t2PLDWi8);
+      break;
+    case ARM::t2LDRSBi8:
+      Inst.setOpcode(ARM::t2PLIi8);
+      break;
     default:
       break;
     }
@@ -3382,9 +3394,15 @@
 
   switch (Inst.getOpcode()) {
   case ARM::t2PLDi8:
-  case ARM::t2PLIi8:
-  case ARM::t2PLDWi8:
     break;
+  case ARM::t2PLIi8:
+    if (!hasV7Ops)
+      return MCDisassembler::Fail;
+    break;
+  case ARM::t2PLDWi8:
+      if (!hasV7Ops || !hasMP)
+        return MCDisassembler::Fail;
+      break;
   default:
     if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
       return MCDisassembler::Fail;
@@ -3404,6 +3422,11 @@
   unsigned imm = fieldFromInstruction(Insn, 0, 12);
   imm |= (Rn << 13);
 
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  bool hasMP = (featureBits & ARM::FeatureMP);
+  bool hasV7Ops = (featureBits & ARM::HasV7Ops);
+
   if (Rn == 15) {
     switch (Inst.getOpcode()) {
     case ARM::t2LDRi12:
@@ -3438,7 +3461,10 @@
     case ARM::t2LDRSHi12:
       return MCDisassembler::Fail;
     case ARM::t2LDRHi12:
-      Inst.setOpcode(ARM::t2PLDi12);
+      Inst.setOpcode(ARM::t2PLDWi12);
+      break;
+    case ARM::t2LDRSBi12:
+      Inst.setOpcode(ARM::t2PLIi12);
       break;
     default:
       break;
@@ -3447,9 +3473,15 @@
 
   switch (Inst.getOpcode()) {
   case ARM::t2PLDi12:
-  case ARM::t2PLDWi12:
-  case ARM::t2PLIi12:
     break;
+  case ARM::t2PLIi12:
+    if (!hasV7Ops)
+      return MCDisassembler::Fail;
+    break;
+  case ARM::t2PLDWi12:
+      if (!hasV7Ops || !hasMP)
+        return MCDisassembler::Fail;
+      break;
   default:
     if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
       return MCDisassembler::Fail;
@@ -3507,6 +3539,10 @@
   unsigned U = fieldFromInstruction(Insn, 23, 1);
   int imm = fieldFromInstruction(Insn, 0, 12);
 
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  bool hasV7Ops = (featureBits & ARM::HasV7Ops);
+
   if (Rt == 15) {
     switch (Inst.getOpcode()) {
       case ARM::t2LDRBpci:
@@ -3525,7 +3561,10 @@
 
   switch(Inst.getOpcode()) {
   case ARM::t2PLDpci:
+    break;
   case ARM::t2PLIpci:
+    if (!hasV7Ops)
+      return MCDisassembler::Fail;
     break;
   default:
     if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
@@ -3974,7 +4013,85 @@
 
 static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
                           uint64_t Address, const void *Decoder) {
-  if (!Val) return MCDisassembler::Fail;
+  DecodeStatus S = MCDisassembler::Success;
+  uint64_t FeatureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  if (FeatureBits & ARM::FeatureMClass) {
+    unsigned ValLow = Val & 0xff;
+
+    // Validate the SYSm value first.
+    switch (ValLow) {
+    case  0: // apsr
+    case  1: // iapsr
+    case  2: // eapsr
+    case  3: // xpsr
+    case  5: // ipsr
+    case  6: // epsr
+    case  7: // iepsr
+    case  8: // msp
+    case  9: // psp
+    case 16: // primask
+    case 20: // control
+      break;
+    case 17: // basepri
+    case 18: // basepri_max
+    case 19: // faultmask
+      if (!(FeatureBits & ARM::HasV7Ops))
+        // Values basepri, basepri_max and faultmask are only valid for v7m.
+        return MCDisassembler::Fail;
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+
+    if (Inst.getOpcode() == ARM::t2MSR_M) {
+      unsigned Mask = fieldFromInstruction(Val, 10, 2);
+      if (!(FeatureBits & ARM::HasV7Ops)) {
+        // The ARMv6-M MSR bits {11-10} can be only 0b10, other values are
+        // unpredictable.
+        if (Mask != 2)
+          S = MCDisassembler::SoftFail;
+      }
+      else {
+        // The ARMv7-M architecture stores an additional 2-bit mask value in
+        // MSR bits {11-10}. The mask is used only with apsr, iapsr, eapsr and
+        // xpsr, it has to be 0b10 in other cases. Bit mask{1} indicates if
+        // the NZCVQ bits should be moved by the instruction. Bit mask{0}
+        // indicates the move for the GE{3:0} bits, the mask{0} bit can be set
+        // only if the processor includes the DSP extension.
+        if (Mask == 0 || (Mask != 2 && ValLow > 3) ||
+            (!(FeatureBits & ARM::FeatureDSPThumb2) && (Mask & 1)))
+          S = MCDisassembler::SoftFail;
+      }
+    }
+  } else {
+    // A/R class
+    if (Val == 0)
+      return MCDisassembler::Fail;
+  }
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return S;
+}
+
+static DecodeStatus DecodeBankedReg(MCInst &Inst, unsigned Val,
+                                    uint64_t Address, const void *Decoder) {
+  // Val is the banked-register field: bit 5 is R and bits 4-0 are SYSm.
+  unsigned R = fieldFromInstruction(Val, 5, 1);
+  unsigned SysM = fieldFromInstruction(Val, 0, 5);
+
+  // The table of encodings for these banked registers comes from B9.2.3 of the
+  // ARM ARM. The values tested below are UNPREDICTABLE; no operand is added
+  // for them, so Fail is returned rather than SoftFail.
+  if (!R) {
+    if (SysM == 0x7 || SysM == 0xf || SysM == 0x18 || SysM == 0x19 ||
+        SysM == 0x1a || SysM == 0x1b)
+      return MCDisassembler::Fail;
+  } else {
+    if (SysM != 0xe && SysM != 0x10 && SysM != 0x12 && SysM != 0x14 &&
+        SysM != 0x16 && SysM != 0x1c && SysM != 0x1e)
+      return MCDisassembler::Fail;
+  }
+
   Inst.addOperand(MCOperand::CreateImm(Val));
   return MCDisassembler::Success;
 }

diff --git a/lib/Target/ARM/Disassembler/LLVMBuild.txt b/lib/Target/ARM/Disassembler/LLVMBuild.txt
index 52d8338..a64a8a9 100644
--- a/lib/Target/ARM/Disassembler/LLVMBuild.txt
+++ b/lib/Target/ARM/Disassembler/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = ARMDisassembler
 parent = ARM
-required_libraries = ARMDesc ARMInfo MC Support
+required_libraries = ARMDesc ARMInfo MCDisassembler Support
 add_to_library_groups = ARM

diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 228fb57..0570084 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp

@@ -503,30 +503,6 @@
 // Addressing Mode #3
 //===--------------------------------------------------------------------===//
 
-void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op,
-                                         raw_ostream &O) {
-  const MCOperand &MO1 = MI->getOperand(Op);
-  const MCOperand &MO2 = MI->getOperand(Op+1);
-  const MCOperand &MO3 = MI->getOperand(Op+2);
-
-  O << markup("<mem:") << "[";
-  printRegName(O, MO1.getReg());
-  O << "], " << markup(">");
-
-  if (MO2.getReg()) {
-    O << (char)ARM_AM::getAM3Op(MO3.getImm());
-    printRegName(O, MO2.getReg());
-    return;
-  }
-
-  unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
-  O << markup("<imm:")
-    << '#'
-    << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
-    << ImmOffs
-    << markup(">");
-}
-
 void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
                                                 raw_ostream &O,
                                                 bool AlwaysPrintImm0) {
@@ -568,13 +544,9 @@
     return;
   }
 
-  const MCOperand &MO3 = MI->getOperand(Op+2);
-  unsigned IdxMode = ARM_AM::getAM3IdxMode(MO3.getImm());
-
-  if (IdxMode == ARMII::IndexModePost) {
-    printAM3PostIndexOp(MI, Op, O);
-    return;
-  }
+  assert(ARM_AM::getAM3IdxMode(MI->getOperand(Op + 2).getImm()) !=
+             ARMII::IndexModePost &&
+         "unexpected idxmode");
   printAM3PreOrOffsetIndexOp(MI, Op, O, AlwaysPrintImm0);
 }
 
@@ -807,52 +779,56 @@
   const MCOperand &Op = MI->getOperand(OpNum);
   unsigned SpecRegRBit = Op.getImm() >> 4;
   unsigned Mask = Op.getImm() & 0xf;
+  uint64_t FeatureBits = getAvailableFeatures();
 
-  if (getAvailableFeatures() & ARM::FeatureMClass) {
+  if (FeatureBits & ARM::FeatureMClass) {
     unsigned SYSm = Op.getImm();
     unsigned Opcode = MI->getOpcode();
-    // For reads of the special registers ignore the "mask encoding" bits
-    // which are only for writes.
-    if (Opcode == ARM::t2MRS_M)
-      SYSm &= 0xff;
+
+    // For writes, handle extended mask bits if the DSP extension is present.
+    if (Opcode == ARM::t2MSR_M && (FeatureBits & ARM::FeatureDSPThumb2)) {
+      switch (SYSm) {
+      case 0x400: O << "apsr_g"; return;
+      case 0xc00: O << "apsr_nzcvqg"; return;
+      case 0x401: O << "iapsr_g"; return;
+      case 0xc01: O << "iapsr_nzcvqg"; return;
+      case 0x402: O << "eapsr_g"; return;
+      case 0xc02: O << "eapsr_nzcvqg"; return;
+      case 0x403: O << "xpsr_g"; return;
+      case 0xc03: O << "xpsr_nzcvqg"; return;
+      }
+    }
+
+    // Handle the basic 8-bit mask.
+    SYSm &= 0xff;
+
+    if (Opcode == ARM::t2MSR_M && (FeatureBits & ARM::HasV7Ops)) {
+      // ARMv7-M deprecates using MSR APSR without a _<bits> qualifier as an
+      // alias for MSR APSR_nzcvq.
+      switch (SYSm) {
+      case 0: O << "apsr_nzcvq"; return;
+      case 1: O << "iapsr_nzcvq"; return;
+      case 2: O << "eapsr_nzcvq"; return;
+      case 3: O << "xpsr_nzcvq"; return;
+      }
+    }
+
     switch (SYSm) {
     default: llvm_unreachable("Unexpected mask value!");
-    case     0:
-    case 0x800: O << "apsr"; return; // with _nzcvq bits is an alias for aspr
-    case 0x400: O << "apsr_g"; return;
-    case 0xc00: O << "apsr_nzcvqg"; return;
-    case     1:
-    case 0x801: O << "iapsr"; return; // with _nzcvq bits is an alias for iapsr
-    case 0x401: O << "iapsr_g"; return;
-    case 0xc01: O << "iapsr_nzcvqg"; return;
-    case     2:
-    case 0x802: O << "eapsr"; return; // with _nzcvq bits is an alias for eapsr
-    case 0x402: O << "eapsr_g"; return;
-    case 0xc02: O << "eapsr_nzcvqg"; return;
-    case     3:
-    case 0x803: O << "xpsr"; return; // with _nzcvq bits is an alias for xpsr
-    case 0x403: O << "xpsr_g"; return;
-    case 0xc03: O << "xpsr_nzcvqg"; return;
-    case     5:
-    case 0x805: O << "ipsr"; return;
-    case     6:
-    case 0x806: O << "epsr"; return;
-    case     7:
-    case 0x807: O << "iepsr"; return;
-    case     8:
-    case 0x808: O << "msp"; return;
-    case     9:
-    case 0x809: O << "psp"; return;
-    case  0x10:
-    case 0x810: O << "primask"; return;
-    case  0x11:
-    case 0x811: O << "basepri"; return;
-    case  0x12:
-    case 0x812: O << "basepri_max"; return;
-    case  0x13:
-    case 0x813: O << "faultmask"; return;
-    case  0x14:
-    case 0x814: O << "control"; return;
+    case  0: O << "apsr"; return;
+    case  1: O << "iapsr"; return;
+    case  2: O << "eapsr"; return;
+    case  3: O << "xpsr"; return;
+    case  5: O << "ipsr"; return;
+    case  6: O << "epsr"; return;
+    case  7: O << "iepsr"; return;
+    case  8: O << "msp"; return;
+    case  9: O << "psp"; return;
+    case 16: O << "primask"; return;
+    case 17: O << "basepri"; return;
+    case 18: O << "basepri_max"; return;
+    case 19: O << "faultmask"; return;
+    case 20: O << "control"; return;
     }
   }
 
@@ -882,6 +858,42 @@
   }
 }
 
+void ARMInstPrinter::printBankedRegOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  // The operand holds R in bit 5 and SYSm in bits 4-0.
+  uint32_t Banked = MI->getOperand(OpNum).getImm();
+  uint32_t R = (Banked & 0x20) >> 5;
+  uint32_t SysM = Banked & 0x1f;
+
+  // The encodings are specified in B9.2.3 of the ARM ARM v7C and are all over
+  // the shop: R set names SPSR_<mode>, otherwise SysM indexes RegNames.
+  if (R) {
+    O << "SPSR_";
+    switch(SysM) {
+    case 0x0e: O << "fiq"; return;
+    case 0x10: O << "irq"; return;
+    case 0x12: O << "svc"; return;
+    case 0x14: O << "abt"; return;
+    case 0x16: O << "und"; return;
+    case 0x1c: O << "mon"; return;
+    case 0x1e: O << "hyp"; return;
+    default: llvm_unreachable("Invalid banked SPSR register");
+    }
+  }
+
+  assert(!R && "should have dealt with SPSR regs");
+  static const char *const RegNames[] = {
+    "r8_usr", "r9_usr", "r10_usr", "r11_usr", "r12_usr", "sp_usr", "lr_usr", "",
+    "r8_fiq", "r9_fiq", "r10_fiq", "r11_fiq", "r12_fiq", "sp_fiq", "lr_fiq", "",
+    "lr_irq", "sp_irq", "lr_svc",  "sp_svc",  "lr_abt",  "sp_abt", "lr_und", "sp_und",
+    "",       "",       "",        "",        "lr_mon",  "sp_mon", "elr_hyp", "sp_hyp"
+  };
+  const char *Name = RegNames[SysM];
+  assert(Name[0] && "invalid banked register operand");
+
+  O << Name;
+}
+
 void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum,
                                            raw_ostream &O) {
   ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();

diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index f671fe4..09fd536 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMINSTPRINTER_H
-#define ARMINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
+#define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -51,7 +51,6 @@
   void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum,
                                    raw_ostream &O);
-  void printAM3PostIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O);
   void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O,
                                   bool AlwaysPrintImm0);
   void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
@@ -117,6 +116,7 @@
   void printCPSIMod(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printCPSIFlag(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printBankedRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum,
                                       raw_ostream &O);

diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index b6c85c2..f0eed9b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
-#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMADDRESSINGMODES_H
 
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
@@ -575,6 +575,53 @@
     return Val;
   }
 
+  /// True iff exactly one of the low Size bytes is non-zero (0X00, 00X0, etc).
+  static inline bool isNEONBytesplat(unsigned Value, unsigned Size) {
+    assert(Size >= 1 && Size <= 4 && "Invalid size");
+    unsigned count = 0;
+    for (unsigned i = 0; i < Size; ++i) {
+      if (Value & 0xff) count++;
+      Value >>= 8;
+    }
+    return count == 1;
+  }
+
+  /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
+  static inline bool isNEONi16splat(unsigned Value) {
+    if (Value > 0xffff)
+      return false;
+    // i16 value with set bits only in one byte X0 or 0X.
+    return Value == 0 || isNEONBytesplat(Value, 2);
+  }
+
+  /// Encode NEON 16 bits Splat immediate for instructions like VBIC/VORR.
+  static inline unsigned encodeNEONi16splat(unsigned Value) {
+    assert(isNEONi16splat(Value) && "Invalid NEON splat value");
+    if (Value >= 0x100)
+      Value = (Value >> 8) | 0xa00;
+    else
+      Value |= 0x800;
+    return Value;
+  }
+
+  /// Checks if Value is a correct immediate for instructions like VBIC/VORR.
+  static inline bool isNEONi32splat(unsigned Value) {
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X.
+    return Value == 0 || isNEONBytesplat(Value, 4);
+  }
+
+  /// Encode NEON 32 bits Splat immediate for instructions like VBIC/VORR.
+  static inline unsigned encodeNEONi32splat(unsigned Value) {
+    assert(isNEONi32splat(Value) && "Invalid NEON splat value");
+    if (Value >= 0x100 && Value <= 0xff00)
+      Value = (Value >> 8) | 0x200;
+    else if (Value > 0xffff && Value <= 0xff0000)
+      Value = (Value >> 16) | 0x400;
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    return Value;
+  }
+
   AMSubMode getLoadStoreMultipleSubMode(int Opcode);
 
   //===--------------------------------------------------------------------===//

diff --git a/lib/Target/ARM/MCTargetDesc/ARMArchName.h b/lib/Target/ARM/MCTargetDesc/ARMArchName.h
index 34b9fc1..bc05673 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMArchName.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMArchName.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMARCHNAME_H
-#define ARMARCHNAME_H
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMARCHNAME_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMARCHNAME_H
 
 namespace llvm {
 namespace ARM {
@@ -24,4 +24,4 @@
 } // namespace ARM
 } // namespace llvm
 
-#endif // ARMARCHNAME_H
+#endif

diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 7acd9cc..0b2e3b0 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp

@@ -9,6 +9,10 @@
 
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMAsmBackend.h"
+#include "MCTargetDesc/ARMAsmBackendDarwin.h"
+#include "MCTargetDesc/ARMAsmBackendELF.h"
+#include "MCTargetDesc/ARMAsmBackendWinCOFF.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "MCTargetDesc/ARMFixupKinds.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -35,164 +39,136 @@
 class ARMELFObjectWriter : public MCELFObjectTargetWriter {
 public:
   ARMELFObjectWriter(uint8_t OSABI)
-    : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM,
-                              /*HasRelocationAddend*/ false) {}
+      : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM,
+                                /*HasRelocationAddend*/ false) {}
 };
 
-class ARMAsmBackend : public MCAsmBackend {
-  const MCSubtargetInfo* STI;
-  bool isThumbMode;     // Currently emitting Thumb code.
-  bool IsLittleEndian;  // Big or little endian.
-public:
-  ARMAsmBackend(const Target &T, const StringRef TT, bool IsLittle)
-    : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
-      isThumbMode(TT.startswith("thumb")), IsLittleEndian(IsLittle) {}
+const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+  const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
+      // This table *must* be in the order that the fixup_* kinds are defined in
+      // ARMFixupKinds.h.
+      //
+      // Name                      Offset (bits) Size (bits)     Flags
+      {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_ldst_pcrel_12", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_pcrel_10", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_thumb_adr_pcrel_10", 0, 8,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_adr_pcrel_12", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_condbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_uncondbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_uncondbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_condbl", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_blx", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_cp", 0, 8,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel},
+      // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16
+      // - 19.
+      {"fixup_arm_movt_hi16", 0, 20, 0},
+      {"fixup_arm_movw_lo16", 0, 20, 0},
+      {"fixup_t2_movt_hi16", 0, 20, 0},
+      {"fixup_t2_movw_lo16", 0, 20, 0},
+  };
+  const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
+      // This table *must* be in the order that the fixup_* kinds are defined in
+      // ARMFixupKinds.h.
+      //
+      // Name                      Offset (bits) Size (bits)     Flags
+      {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_ldst_pcrel_12", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_pcrel_10", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_thumb_adr_pcrel_10", 8, 8,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_adr_pcrel_12", 0, 32,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_condbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_uncondbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_uncondbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_condbl", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_blx", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_arm_thumb_cp", 8, 8,
+       MCFixupKindInfo::FKF_IsPCRel |
+           MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+      {"fixup_arm_thumb_bcc", 8, 8, MCFixupKindInfo::FKF_IsPCRel},
+      // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16
+      // - 19.
+      {"fixup_arm_movt_hi16", 12, 20, 0},
+      {"fixup_arm_movw_lo16", 12, 20, 0},
+      {"fixup_t2_movt_hi16", 12, 20, 0},
+      {"fixup_t2_movw_lo16", 12, 20, 0},
+  };
 
-  ~ARMAsmBackend() {
-    delete STI;
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+         "Invalid kind!");
+  return (IsLittleEndian ? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
+}
+
+void ARMAsmBackend::handleAssemblerFlag(MCAssemblerFlag Flag) {
+  switch (Flag) {
+  default:
+    break;
+  case MCAF_Code16:
+    setIsThumb(true);
+    break;
+  case MCAF_Code32:
+    setIsThumb(false);
+    break;
   }
-
-  unsigned getNumFixupKinds() const override {
-    return ARM::NumTargetFixupKinds;
-  }
-
-  bool hasNOP() const {
-    return (STI->getFeatureBits() & ARM::HasV6T2Ops) != 0;
-  }
-
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
-    const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
-// This table *must* be in the order that the fixup_* kinds are defined in
-// ARMFixupKinds.h.
-//
-// Name                      Offset (bits) Size (bits)     Flags
-{ "fixup_arm_ldst_pcrel_12", 0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_ldst_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_pcrel_10_unscaled", 0,        32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_pcrel_10",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_pcrel_10",       0,            32,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_thumb_adr_pcrel_10",0,            8,   MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_adr_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_adr_pcrel_12",   0,            32,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_condbranch",    0,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_uncondbranch",  0,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_condbranch",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_uncondbranch",   0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_br",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_uncondbl",      0,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_condbl",        0,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_blx",           0,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_bl",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_blx",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_cp",      0,             8,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_thumb_bcc",     0,             8,  MCFixupKindInfo::FKF_IsPCRel },
-// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
-{ "fixup_arm_movt_hi16",     0,            20,  0 },
-{ "fixup_arm_movw_lo16",     0,            20,  0 },
-{ "fixup_t2_movt_hi16",      0,            20,  0 },
-{ "fixup_t2_movw_lo16",      0,            20,  0 },
-    };
-    const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
-// This table *must* be in the order that the fixup_* kinds are defined in
-// ARMFixupKinds.h.
-//
-// Name                      Offset (bits) Size (bits)     Flags
-{ "fixup_arm_ldst_pcrel_12", 0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_ldst_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_pcrel_10_unscaled", 0,        32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_pcrel_10",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_pcrel_10",       0,            32,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_thumb_adr_pcrel_10",8,            8,   MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_adr_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_adr_pcrel_12",   0,            32,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_condbranch",    8,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_uncondbranch",  8,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_condbranch",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_t2_uncondbranch",   0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_br",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_uncondbl",      8,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_condbl",        8,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_blx",           8,            24,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_bl",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_blx",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_cp",      8,             8,  MCFixupKindInfo::FKF_IsPCRel |
-                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
-{ "fixup_arm_thumb_bcc",     8,             8,  MCFixupKindInfo::FKF_IsPCRel },
-// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
-{ "fixup_arm_movt_hi16",     12,           20,  0 },
-{ "fixup_arm_movw_lo16",     12,           20,  0 },
-{ "fixup_t2_movt_hi16",      12,           20,  0 },
-{ "fixup_t2_movw_lo16",      12,           20,  0 },
-    };
-
-    if (Kind < FirstTargetFixupKind)
-      return MCAsmBackend::getFixupKindInfo(Kind);
-
-    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
-           "Invalid kind!");
-    return (IsLittleEndian ? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
-  }
-
-  /// processFixupValue - Target hook to process the literal value of a fixup
-  /// if necessary.
-  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
-                         const MCFixup &Fixup, const MCFragment *DF,
-                         const MCValue &Target, uint64_t &Value,
-                         bool &IsResolved) override;
-
-
-  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
-
-  bool mayNeedRelaxation(const MCInst &Inst) const override;
-
-  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
-                            const MCRelaxableFragment *DF,
-                            const MCAsmLayout &Layout) const override;
-
-  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
-
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
-
-  void handleAssemblerFlag(MCAssemblerFlag Flag) override {
-    switch (Flag) {
-    default: break;
-    case MCAF_Code16:
-      setIsThumb(true);
-      break;
-    case MCAF_Code32:
-      setIsThumb(false);
-      break;
-    }
-  }
-
-  unsigned getPointerSize() const { return 4; }
-  bool isThumb() const { return isThumbMode; }
-  void setIsThumb(bool it) { isThumbMode = it; }
-  bool isLittle() const { return IsLittleEndian; }
-};
+}
 } // end anonymous namespace
 
 static unsigned getRelaxedOpcode(unsigned Op) {
   switch (Op) {
-  default: return Op;
-  case ARM::tBcc:       return ARM::t2Bcc;
-  case ARM::tLDRpci:    return ARM::t2LDRpci;
-  case ARM::tADR:       return ARM::t2ADR;
-  case ARM::tB:         return ARM::t2B;
-  case ARM::tCBZ:       return ARM::tHINT;
-  case ARM::tCBNZ:      return ARM::tHINT;
+  default:
+    return Op;
+  case ARM::tBcc:
+    return ARM::t2Bcc;
+  case ARM::tLDRpci:
+    return ARM::t2LDRpci;
+  case ARM::tADR:
+    return ARM::t2ADR;
+  case ARM::tB:
+    return ARM::t2B;
+  case ARM::tCBZ:
+    return ARM::tHINT;
+  case ARM::tCBNZ:
+    return ARM::tHINT;
   }
 }
 
@@ -202,8 +178,7 @@
   return false;
 }
 
-bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
-                                         uint64_t Value,
+bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
                                          const MCRelaxableFragment *DF,
                                          const MCAsmLayout &Layout) const {
   switch ((unsigned)Fixup.getKind()) {
@@ -265,7 +240,7 @@
     Res.addOperand(MCOperand::CreateImm(14));
     Res.addOperand(MCOperand::CreateReg(0));
     return;
-  } 
+  }
 
   // The rest of instructions we're relaxing have the same operands.
   // We just need to update to the proper opcode.
@@ -276,11 +251,11 @@
 bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
   const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
   const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
-  const uint32_t ARMv4_NopEncoding = 0xe1a00000; // using MOV r0,r0
+  const uint32_t ARMv4_NopEncoding = 0xe1a00000;   // using MOV r0,r0
   const uint32_t ARMv6T2_NopEncoding = 0xe320f000; // NOP
   if (isThumb()) {
-    const uint16_t nopEncoding = hasNOP() ? Thumb2_16bitNopEncoding
-                                          : Thumb1_16bitNopEncoding;
+    const uint16_t nopEncoding =
+        hasNOP() ? Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding;
     uint64_t NumNops = Count / 2;
     for (uint64_t i = 0; i != NumNops; ++i)
       OW->Write16(nopEncoding);
@@ -289,18 +264,26 @@
     return true;
   }
   // ARM mode
-  const uint32_t nopEncoding = hasNOP() ? ARMv6T2_NopEncoding
-                                        : ARMv4_NopEncoding;
+  const uint32_t nopEncoding =
+      hasNOP() ? ARMv6T2_NopEncoding : ARMv4_NopEncoding;
   uint64_t NumNops = Count / 4;
   for (uint64_t i = 0; i != NumNops; ++i)
     OW->Write32(nopEncoding);
   // FIXME: should this function return false when unable to write exactly
   // 'Count' bytes with NOP encodings?
   switch (Count % 4) {
-  default: break; // No leftover bytes to write
-  case 1: OW->Write8(0); break;
-  case 2: OW->Write16(0); break;
-  case 3: OW->Write16(0); OW->Write8(0xa0); break;
+  default:
+    break; // No leftover bytes to write
+  case 1:
+    OW->Write8(0);
+    break;
+  case 2:
+    OW->Write16(0);
+    break;
+  case 3:
+    OW->Write16(0);
+    OW->Write8(0xa0);
+    break;
   }
 
   return true;
@@ -313,8 +296,7 @@
     uint32_t Swapped = (Value & 0xFFFF0000) >> 16;
     Swapped |= (Value & 0x0000FFFF) << 16;
     return Swapped;
-  }
-  else
+  } else
     return Value;
 }
 
@@ -351,7 +333,7 @@
   case ARM::fixup_arm_movt_hi16:
     if (!IsPCRel)
       Value >>= 16;
-    // Fallthrough
+  // Fallthrough
   case ARM::fixup_arm_movw_lo16: {
     unsigned Hi4 = (Value & 0xF000) >> 12;
     unsigned Lo12 = Value & 0x0FFF;
@@ -363,7 +345,7 @@
   case ARM::fixup_t2_movt_hi16:
     if (!IsPCRel)
       Value >>= 16;
-    // Fallthrough
+  // Fallthrough
   case ARM::fixup_t2_movw_lo16: {
     unsigned Hi4 = (Value & 0xF000) >> 12;
     unsigned i = (Value & 0x800) >> 11;
@@ -379,7 +361,7 @@
   case ARM::fixup_arm_ldst_pcrel_12:
     // ARM PC-relative values are offset by 8.
     Value -= 4;
-    // FALLTHROUGH
+  // FALLTHROUGH
   case ARM::fixup_t2_ldst_pcrel_12: {
     // Offset by 4, adjusted by two due to the half-word ordering of thumb.
     Value -= 4;
@@ -438,7 +420,8 @@
   case ARM::fixup_arm_blx:
     // These values don't encode the low two bits since they're always zero.
     // Offset by 8 just as above.
-    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+    if (const MCSymbolRefExpr *SRE =
+            dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
       if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
         return 0;
     return 0xffffff & ((Value - 8) >> 2);
@@ -447,17 +430,17 @@
     Value >>= 1; // Low bit is not encoded.
 
     uint32_t out = 0;
-    bool I =  Value & 0x800000;
+    bool I = Value & 0x800000;
     bool J1 = Value & 0x400000;
     bool J2 = Value & 0x200000;
     J1 ^= I;
     J2 ^= I;
 
-    out |= I  << 26; // S bit
-    out |= !J1 << 13; // J1 bit
-    out |= !J2 << 11; // J2 bit
-    out |= (Value & 0x1FF800)  << 5; // imm6 field
-    out |= (Value & 0x0007FF);        // imm11 field
+    out |= I << 26;                 // S bit
+    out |= !J1 << 13;               // J1 bit
+    out |= !J2 << 11;               // J2 bit
+    out |= (Value & 0x1FF800) << 5; // imm6 field
+    out |= (Value & 0x0007FF);      // imm11 field
 
     return swapHalfWords(out, IsLittleEndian);
   }
@@ -498,7 +481,7 @@
 
     uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
     uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
-                          (uint16_t)imm11Bits);
+                           (uint16_t)imm11Bits);
     return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
   }
   case ARM::fixup_arm_thumb_blx: {
@@ -515,7 +498,8 @@
     // Note that the halfwords are stored high first, low second; so we need
     // to transpose the fixup value here to map properly.
     uint32_t offset = (Value - 2) >> 2;
-    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
+    if (const MCSymbolRefExpr *SRE =
+            dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
       if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_TLSCALL)
         offset = 0;
     uint32_t signBit = (offset & 0x400000) >> 22;
@@ -528,7 +512,7 @@
 
     uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
     uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
-                          ((uint16_t)imm10LBits) << 1);
+                           ((uint16_t)imm10LBits) << 1);
     return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
   }
   case ARM::fixup_arm_thumb_cp:
@@ -564,7 +548,7 @@
   case ARM::fixup_arm_pcrel_10:
     Value = Value - 4; // ARM fixups offset by an additional word and don't
                        // need to adjust for the half-word ordering.
-    // Fall through.
+                       // Fall through.
   case ARM::fixup_t2_pcrel_10: {
     // Offset by 4, adjusted by two due to the half-word ordering of thumb.
     Value = Value - 4;
@@ -735,7 +719,8 @@
                                bool IsPCRel) const {
   unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
   Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian);
-  if (!Value) return;           // Doesn't change encoding.
+  if (!Value)
+    return; // Doesn't change encoding.
 
   unsigned Offset = Fixup.getOffset();
   assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
@@ -757,80 +742,36 @@
   }
 }
 
-namespace {
-// FIXME: This should be in a separate file.
-class ARMWinCOFFAsmBackend : public ARMAsmBackend {
-public:
-  ARMWinCOFFAsmBackend(const Target &T, const StringRef &Triple)
-    : ARMAsmBackend(T, Triple, true) { }
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
-    return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false);
-  }
-};
-
-// FIXME: This should be in a separate file.
-// ELF is an ELF of course...
-class ELFARMAsmBackend : public ARMAsmBackend {
-public:
-  uint8_t OSABI;
-  ELFARMAsmBackend(const Target &T, const StringRef TT,
-                   uint8_t OSABI, bool IsLittle)
-    : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) { }
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
-    return createARMELFObjectWriter(OS, OSABI, isLittle());
-  }
-};
-
-// FIXME: This should be in a separate file.
-class DarwinARMAsmBackend : public ARMAsmBackend {
-public:
-  const MachO::CPUSubTypeARM Subtype;
-  DarwinARMAsmBackend(const Target &T, const StringRef TT,
-                      MachO::CPUSubTypeARM st)
-    : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), Subtype(st) {
-      HasDataInCodeSupport = true;
-    }
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
-    return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
-                                     MachO::CPU_TYPE_ARM,
-                                     Subtype);
-  }
-};
-
-} // end anonymous namespace
-
 MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
-                                        const MCRegisterInfo &MRI,
-                                        StringRef TT, StringRef CPU,
-                                        bool isLittle) {
+                                        const MCRegisterInfo &MRI, StringRef TT,
+                                        StringRef CPU, bool isLittle) {
   Triple TheTriple(TT);
 
   switch (TheTriple.getObjectFormat()) {
-  default: llvm_unreachable("unsupported object format");
+  default:
+    llvm_unreachable("unsupported object format");
   case Triple::MachO: {
     MachO::CPUSubTypeARM CS =
-      StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName())
-      .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T)
-      .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ)
-      .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6)
-      .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M)
-      .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM)
-      .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K)
-      .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M)
-      .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S)
-      .Default(MachO::CPU_SUBTYPE_ARM_V7);
+        StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName())
+            .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T)
+            .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ)
+            .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6)
+            .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M)
+            .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM)
+            .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K)
+            .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M)
+            .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S)
+            .Default(MachO::CPU_SUBTYPE_ARM_V7);
 
-    return new DarwinARMAsmBackend(T, TT, CS);
+    return new ARMAsmBackendDarwin(T, TT, CS);
   }
   case Triple::COFF:
     assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
-    return new ARMWinCOFFAsmBackend(T, TT);
+    return new ARMAsmBackendWinCOFF(T, TT);
   case Triple::ELF:
     assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target");
     uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
-    return new ELFARMAsmBackend(T, TT, OSABI, isLittle);
+    return new ARMAsmBackendELF(T, TT, OSABI, isLittle);
   }
 }
 
@@ -847,14 +788,13 @@
 }
 
 MCAsmBackend *llvm::createThumbLEAsmBackend(const Target &T,
-                                          const MCRegisterInfo &MRI,
-                                          StringRef TT, StringRef CPU) {
+                                            const MCRegisterInfo &MRI,
+                                            StringRef TT, StringRef CPU) {
   return createARMAsmBackend(T, MRI, TT, CPU, true);
 }
 
 MCAsmBackend *llvm::createThumbBEAsmBackend(const Target &T,
-                                          const MCRegisterInfo &MRI,
-                                          StringRef TT, StringRef CPU) {
+                                            const MCRegisterInfo &MRI,
+                                            StringRef TT, StringRef CPU) {
   return createARMAsmBackend(T, MRI, TT, CPU, false);
 }
-

diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
new file mode 100644
index 0000000..f4f1082
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h

@@ -0,0 +1,77 @@
+//===-- ARMAsmBackend.h - ARM Assembler Backend -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKEND_H
+
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Assembler backend shared by the ARM and Thumb targets.  The object-format
+/// specific backends (Darwin, ELF, WinCOFF) derive from this class and supply
+/// createObjectWriter().
+class ARMAsmBackend : public MCAsmBackend {
+  const MCSubtargetInfo *STI; // Created in the constructor; owned by this.
+  bool isThumbMode;    // Currently emitting Thumb code.
+  bool IsLittleEndian; // Big or little endian.
+public:
+  /// Builds the subtarget info from the triple \p TT (empty CPU and feature
+  /// strings) and starts in Thumb mode when \p TT begins with "thumb".
+  ARMAsmBackend(const Target &T, StringRef TT, bool IsLittle)
+      : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
+        isThumbMode(TT.startswith("thumb")), IsLittleEndian(IsLittle) {}
+
+  ~ARMAsmBackend() override { delete STI; }
+
+  unsigned getNumFixupKinds() const override {
+    return ARM::NumTargetFixupKinds;
+  }
+
+  /// True when the subtarget has the HasV6T2Ops feature bit set;
+  /// writeNopData() uses this to choose between the NOP encodings.
+  bool hasNOP() const { return (STI->getFeatureBits() & ARM::HasV6T2Ops) != 0; }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+  /// processFixupValue - Target hook to process the literal value of a fixup
+  /// if necessary.
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override;
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  void handleAssemblerFlag(MCAssemblerFlag Flag) override;
+
+  unsigned getPointerSize() const { return 4; }
+  bool isThumb() const { return isThumbMode; }
+  void setIsThumb(bool it) { isThumbMode = it; }
+  bool isLittle() const { return IsLittleEndian; }
+};
+} // end anonymous namespace
+
+#endif

diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
new file mode 100644
index 0000000..3bd7ab7
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h

@@ -0,0 +1,38 @@
+//===-- ARMAsmBackendDarwin.h   ARM Asm Backend Darwin ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
+
+#include "ARMAsmBackend.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+/// ARM assembler backend for Mach-O output: always constructed little-endian,
+/// turns on data-in-code support and remembers the Mach-O CPU subtype.
+class ARMAsmBackendDarwin : public ARMAsmBackend {
+public:
+  const MachO::CPUSubTypeARM Subtype;
+  ARMAsmBackendDarwin(const Target &T, StringRef TT, MachO::CPUSubTypeARM st)
+      : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), Subtype(st) {
+    HasDataInCodeSupport = true;
+  }
+
+  /// Creates a 32-bit ARM Mach-O object writer for the stored CPU subtype.
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
+                                     Subtype);
+  }
+};
+} // end anonymous namespace
+
+#endif

diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h
new file mode 100644
index 0000000..4efd325
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendELF.h

@@ -0,0 +1,34 @@
+//===-- ARMAsmBackendELF.h  ARM Asm Backend ELF -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDELF_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDELF_H
+
+#include "ARMAsmBackend.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+
+using namespace llvm;
+
+namespace {
+/// ARM assembler backend for ELF output; carries the OSABI value that is
+/// handed to the ELF object writer.
+class ARMAsmBackendELF : public ARMAsmBackend {
+public:
+  uint8_t OSABI;
+  ARMAsmBackendELF(const Target &T, StringRef TT, uint8_t OSABI, bool IsLittle)
+      : ARMAsmBackend(T, TT, IsLittle), OSABI(OSABI) {}
+
+  /// Creates an ARM ELF object writer using the stored OSABI and endianness.
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createARMELFObjectWriter(OS, OSABI, isLittle());
+  }
+};
+} // end anonymous namespace
+
+#endif

diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h
new file mode 100644
index 0000000..33be347
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackendWinCOFF.h

@@ -0,0 +1,33 @@
+//===-- ARMAsmBackendWinCOFF.h - ARM Asm Backend WinCOFF --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
+#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDWINCOFF_H
+
+#include "ARMAsmBackend.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+
+using namespace llvm;
+
+namespace {
+/// ARM assembler backend for Windows COFF output; always constructed
+/// little-endian.
+class ARMAsmBackendWinCOFF : public ARMAsmBackend {
+public:
+  ARMAsmBackendWinCOFF(const Target &T, StringRef TT)
+      : ARMAsmBackend(T, TT, true) {}
+
+  /// Creates a 32-bit ARM Windows COFF object writer.
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false);
+  }
+};
+} // end anonymous namespace
+
+#endif

diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 1686d76..4289a73 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMBASEINFO_H
-#define ARMBASEINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMBASEINFO_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMBASEINFO_H
 
 #include "ARMMCTargetDesc.h"
 #include "llvm/Support/ErrorHandling.h"

diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 1c84263..f24b419 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp

@@ -37,7 +37,8 @@
     unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                           bool IsPCRel) const override;
 
-    bool needsRelocateWithSymbol(unsigned Type) const override;
+    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+                                 unsigned Type) const override;
   };
 }
 
@@ -48,7 +49,8 @@
 
 ARMELFObjectWriter::~ARMELFObjectWriter() {}
 
-bool ARMELFObjectWriter::needsRelocateWithSymbol(unsigned Type) const {
+bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+                                                 unsigned Type) const {
   // FIXME: This is extremelly conservative. This really needs to use a
   // whitelist with a clear explanation for why each realocation needs to
   // point to the symbol, not to the section.
@@ -100,7 +102,7 @@
     case ARM::fixup_arm_uncondbl:
       switch (Modifier) {
       case MCSymbolRefExpr::VK_PLT:
-        Type = ELF::R_ARM_PLT32;
+        Type = ELF::R_ARM_CALL;
         break;
       case MCSymbolRefExpr::VK_ARM_TLSCALL:
         Type = ELF::R_ARM_TLS_CALL;

diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 7b5d8b0..24ee537 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp

@@ -848,6 +848,14 @@
                      /* OverwriteExisting= */ false);
     break;
 
+  // FPV5_D16 is identical to FP_ARMV8 except for the number of D registers, so
+  // uses the FP_ARMV8_D16 build attribute.
+  case ARM::FPV5_D16:
+    setAttributeItem(ARMBuildAttrs::FP_arch,
+                     ARMBuildAttrs::AllowFPARMv8B,
+                     /* OverwriteExisting= */ false);
+    break;
+
   case ARM::NEON:
     setAttributeItem(ARMBuildAttrs::FP_arch,
                      ARMBuildAttrs::AllowFPv3A,
@@ -1339,10 +1347,9 @@
   return S;
 }
 
-  MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                      raw_ostream &OS, MCCodeEmitter *Emitter,
-                                      bool RelaxAll, bool NoExecStack,
-                                      bool IsThumb) {
+MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    bool RelaxAll, bool IsThumb) {
     ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb);
     new ARMTargetELFStreamer(*S);
     // FIXME: This should eventually end up somewhere else where more
@@ -1352,8 +1359,6 @@
 
     if (RelaxAll)
       S->getAssembler().setRelaxAll(true);
-    if (NoExecStack)
-      S->getAssembler().setNoExecStack(true);
     return S;
   }
 

diff --git a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
index bfd9e33..46ba571 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ARM_ARMFIXUPKINDS_H
-#define LLVM_ARM_ARMFIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMFIXUPKINDS_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMFIXUPKINDS_H
 
 #include "llvm/MC/MCFixup.h"
 

diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 7a19208..1d82099 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp

@@ -55,7 +55,6 @@
   Code16Directive = ".code\t16";
   Code32Directive = ".code\t32";
 
-  HasLEB128 = true;
   SupportsDebugInformation = true;
 
   // Exceptions handling
@@ -103,7 +102,6 @@
   Code32Directive = ".code\t32";
   PrivateGlobalPrefix = ".L";
 
-  HasLEB128 = true;
   SupportsDebugInformation = true;
   ExceptionsType = ExceptionHandling::None;
   UseParensForSymbolVariant = true;

diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index 51cfa0a..f1fef41 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ARMTARGETASMINFO_H
-#define LLVM_ARMTARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"

diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index e545e3c..68d32b2 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp

@@ -35,12 +35,6 @@
     OS << ')';
 }
 
-bool
-ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
-                                     const MCAsmLayout *Layout) const {
-  return false;
-}
-
 void ARMMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
   Streamer.visitUsedExpr(*getSubExpr());
 }

diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index c5c0b10..06bf6c9 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMMCEXPR_H
-#define ARMMCEXPR_H
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
 
 #include "llvm/MC/MCExpr.h"
 
@@ -58,8 +58,11 @@
 
   void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const override;
-  void visitUsedExpr(MCStreamer &Streamer) const override;
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override {
+    return false;
+  }
+  void visitUsedExpr(MCStreamer &Streamer) const override; 
   const MCSection *FindAssociatedSection() const override {
     return getSubExpr()->FindAssociatedSection();
   }

diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 2b3855d..98190ba 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp

@@ -84,93 +84,89 @@
 std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
   Triple triple(TT);
 
-  // Set the boolean corresponding to the current target triple, or the default
-  // if one cannot be determined, to true.
-  unsigned Len = TT.size();
-  unsigned Idx = 0;
-
-  // FIXME: Enhance Triple helper class to extract ARM version.
   bool isThumb = triple.getArch() == Triple::thumb ||
                  triple.getArch() == Triple::thumbeb;
-  if (Len >= 5 && TT.substr(0, 4) == "armv")
-    Idx = 4;
-  else if (Len >= 7 && TT.substr(0, 6) == "armebv")
-    Idx = 6;
-  else if (Len >= 7 && TT.substr(0, 6) == "thumbv")
-    Idx = 6;
-  else if (Len >= 9 && TT.substr(0, 8) == "thumbebv")
-    Idx = 8;
 
   bool NoCPU = CPU == "generic" || CPU.empty();
   std::string ARMArchFeature;
-  if (Idx) {
-    unsigned SubVer = TT[Idx];
-    if (SubVer == '8') {
-      if (NoCPU)
-        // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2,
-        //      FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone,
-        //      FeatureT2XtPk, FeatureCrypto, FeatureCRC
-        ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,"
-                         "+trustzone,+t2xtpk,+crypto,+crc";
-      else
-        // Use CPU to figure out the exact features
-        ARMArchFeature = "+v8";
-    } else if (SubVer == '7') {
-      if (Len >= Idx+2 && TT[Idx+1] == 'm') {
-        isThumb = true;
-        if (NoCPU)
-          // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
-          ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
-        else
-          // Use CPU to figure out the exact features.
-          ARMArchFeature = "+v7";
-      } else if (Len >= Idx+3 && TT[Idx+1] == 'e'&& TT[Idx+2] == 'm') {
-        if (NoCPU)
-          // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2,
-          //       FeatureT2XtPk, FeatureMClass
-          ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass";
-        else
-          // Use CPU to figure out the exact features.
-          ARMArchFeature = "+v7";
-      } else if (Len >= Idx+2 && TT[Idx+1] == 's') {
-        if (NoCPU)
-          // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS
-          //      Swift
-          ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras";
-        else
-          // Use CPU to figure out the exact features.
-          ARMArchFeature = "+v7";
-      } else {
-        // v7 CPUs have lots of different feature sets. If no CPU is specified,
-        // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return
-        // the "minimum" feature set and use CPU string to figure out the exact
-        // features.
-        if (NoCPU)
-          // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
-          ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk";
-        else
-          // Use CPU to figure out the exact features.
-          ARMArchFeature = "+v7";
-      }
-    } else if (SubVer == '6') {
-      if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
-        ARMArchFeature = "+v6t2";
-      else if (Len >= Idx+2 && TT[Idx+1] == 'm') {
-        isThumb = true;
-        if (NoCPU)
-          // v6m: FeatureNoARM, FeatureMClass
-          ARMArchFeature = "+v6m,+noarm,+mclass";
-        else
-          ARMArchFeature = "+v6";
-      } else
-        ARMArchFeature = "+v6";
-    } else if (SubVer == '5') {
-      if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e')
-        ARMArchFeature = "+v5te";
-      else
-        ARMArchFeature = "+v5t";
-    } else if (SubVer == '4' && Len >= Idx+2 && TT[Idx+1] == 't')
-      ARMArchFeature = "+v4t";
+  switch (triple.getSubArch()) {
+  default:
+    llvm_unreachable("invalid sub-architecture for ARM");
+  case Triple::ARMSubArch_v8:
+    if (NoCPU)
+      // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2,
+      //      FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone,
+      //      FeatureT2XtPk, FeatureCrypto, FeatureCRC
+      ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,"
+                       "+trustzone,+t2xtpk,+crypto,+crc";
+    else
+      // Use CPU to figure out the exact features
+      ARMArchFeature = "+v8";
+    break;
+  case Triple::ARMSubArch_v7m:
+    isThumb = true;
+    if (NoCPU)
+      // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
+      ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
+    else
+      // Use CPU to figure out the exact features.
+      ARMArchFeature = "+v7";
+    break;
+  case Triple::ARMSubArch_v7em:
+    if (NoCPU)
+      // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2,
+      //       FeatureT2XtPk, FeatureMClass
+      ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,+t2xtpk,+mclass";
+    else
+      // Use CPU to figure out the exact features.
+      ARMArchFeature = "+v7";
+    break;
+  case Triple::ARMSubArch_v7s:
+    if (NoCPU)
+      // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS
+      //      Swift
+      ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras";
+    else
+      // Use CPU to figure out the exact features.
+      ARMArchFeature = "+v7";
+    break;
+  case Triple::ARMSubArch_v7:
+    // v7 CPUs have lots of different feature sets. If no CPU is specified,
+    // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return
+    // the "minimum" feature set and use CPU string to figure out the exact
+    // features.
+    if (NoCPU)
+      // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+      ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk";
+    else
+      // Use CPU to figure out the exact features.
+      ARMArchFeature = "+v7";
+    break;
+  case Triple::ARMSubArch_v6t2:
+    ARMArchFeature = "+v6t2";
+    break;
+  case Triple::ARMSubArch_v6m:
+    isThumb = true;
+    if (NoCPU)
+      // v6m: FeatureNoARM, FeatureMClass
+      ARMArchFeature = "+v6m,+noarm,+mclass";
+    else
+      ARMArchFeature = "+v6";
+    break;
+  case Triple::ARMSubArch_v6:
+    ARMArchFeature = "+v6";
+    break;
+  case Triple::ARMSubArch_v5te:
+    ARMArchFeature = "+v5te";
+    break;
+  case Triple::ARMSubArch_v5:
+    ARMArchFeature = "+v5t";
+    break;
+  case Triple::ARMSubArch_v4t:
+    ARMArchFeature = "+v4t";
+    break;
+  case Triple::NoSubArch:
+    break;
   }
 
   if (isThumb) {
@@ -221,31 +217,14 @@
   Triple TheTriple(TT);
 
   MCAsmInfo *MAI;
-  switch (TheTriple.getOS()) {
-  case llvm::Triple::Darwin:
-  case llvm::Triple::IOS:
-  case llvm::Triple::MacOSX:
+  if (TheTriple.isOSDarwin() || TheTriple.isOSBinFormatMachO())
     MAI = new ARMMCAsmInfoDarwin(TT);
-    break;
-  case llvm::Triple::Win32:
-    switch (TheTriple.getEnvironment()) {
-    case llvm::Triple::Itanium:
-      MAI = new ARMCOFFMCAsmInfoGNU();
-      break;
-    case llvm::Triple::MSVC:
-      MAI = new ARMCOFFMCAsmInfoMicrosoft();
-      break;
-    default:
-      llvm_unreachable("invalid environment");
-    }
-    break;
-  default:
-    if (TheTriple.isOSBinFormatMachO())
-      MAI = new ARMMCAsmInfoDarwin(TT);
-    else
-      MAI = new ARMELFMCAsmInfo(TT);
-    break;
-  }
+  else if (TheTriple.isWindowsItaniumEnvironment())
+    MAI = new ARMCOFFMCAsmInfoGNU();
+  else if (TheTriple.isWindowsMSVCEnvironment())
+    MAI = new ARMCOFFMCAsmInfoMicrosoft();
+  else
+    MAI = new ARMELFMCAsmInfo(TT);
 
   unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true);
   MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0));
@@ -269,11 +248,8 @@
 // This is duplicated code. Refactor this.
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCContext &Ctx, MCAsmBackend &MAB,
-                                    raw_ostream &OS,
-                                    MCCodeEmitter *Emitter,
-                                    const MCSubtargetInfo &STI,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    const MCSubtargetInfo &STI, bool RelaxAll) {
   Triple TheTriple(TT);
 
   switch (TheTriple.getObjectFormat()) {
@@ -287,7 +263,7 @@
     assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
     return createARMWinCOFFStreamer(Ctx, MAB, *Emitter, OS);
   case Triple::ELF:
-    return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, NoExecStack,
+    return createARMELFStreamer(Ctx, MAB, OS, Emitter, false,
                                 TheTriple.getArch() == Triple::thumb);
   }
 }
@@ -362,8 +338,10 @@
   // Register the MC codegen info.
   TargetRegistry::RegisterMCCodeGenInfo(TheARMLETarget, createARMMCCodeGenInfo);
   TargetRegistry::RegisterMCCodeGenInfo(TheARMBETarget, createARMMCCodeGenInfo);
-  TargetRegistry::RegisterMCCodeGenInfo(TheThumbLETarget, createARMMCCodeGenInfo);
-  TargetRegistry::RegisterMCCodeGenInfo(TheThumbBETarget, createARMMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheThumbLETarget,
+                                        createARMMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheThumbBETarget,
+                                        createARMMCCodeGenInfo);
 
   // Register the MC instruction info.
   TargetRegistry::RegisterMCInstrInfo(TheARMLETarget, createARMMCInstrInfo);

diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 5326e56..a6c20d5 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARMMCTARGETDESC_H
-#define ARMMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCTARGETDESC_H
 
 #include "llvm/Support/DataTypes.h"
 #include <string>

diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 186776a..7da5003 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp

@@ -428,7 +428,7 @@
       // For external relocations, make sure to offset the fixup value to
       // compensate for the addend of the symbol address, if it was
       // undefined. This occurs with weak definitions, for example.
-      if (!SD->Symbol->isUndefined())
+      if (!SD->getSymbol().isUndefined())
         FixedValue -= Layout.getSymbolOffset(SD);
     } else {
       // The index is the section ordinal (1-based).

diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index ad3f1ca..8acd7af 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp

@@ -28,7 +28,7 @@
 // The constant pool handling is shared by all ARMTargetStreamer
 // implementations.
 const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr) {
-  return ConstantPools->addEntry(Streamer, Expr);
+  return ConstantPools->addEntry(Streamer, Expr, 4);
 }
 
 void ARMTargetStreamer::emitCurrentConstantPool() {

diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index cd58759..e0c113e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARM_UNWIND_OP_ASM_H
-#define ARM_UNWIND_OP_ASM_H
+#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
+#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ARMEHABI.h"
@@ -90,4 +90,4 @@
 
 } // namespace llvm
 
-#endif // ARM_UNWIND_OP_ASM_H
+#endif

diff --git a/lib/Target/ARM/MCTargetDesc/LLVMBuild.txt b/lib/Target/ARM/MCTargetDesc/LLVMBuild.txt
index 2a7fe61..db8fc92 100644
--- a/lib/Target/ARM/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/ARM/MCTargetDesc/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = ARMDesc
 parent = ARM
-required_libraries = ARMAsmPrinter ARMInfo MC Support
+required_libraries = ARMAsmPrinter ARMInfo MC MCDisassembler Support
 add_to_library_groups = ARM

diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index f6d24e9..35fe9b3 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp

@@ -378,8 +378,8 @@
 }
 
 bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
-  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
-  TRI = Fn.getTarget().getRegisterInfo();
+  TII = static_cast<const ARMBaseInstrInfo *>(Fn.getSubtarget().getInstrInfo());
+  TRI = Fn.getSubtarget().getRegisterInfo();
   MRI = &Fn.getRegInfo();
   const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
   isLikeA9 = STI->isLikeA9() || STI->isSwift();

diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
index f069535..c1601a3 100644
--- a/lib/Target/ARM/Makefile
+++ b/lib/Target/ARM/Makefile

@@ -15,7 +15,7 @@
 BUILT_SOURCES = ARMGenRegisterInfo.inc ARMGenInstrInfo.inc \
 		ARMGenAsmWriter.inc ARMGenAsmMatcher.inc \
                 ARMGenDAGISel.inc ARMGenSubtargetInfo.inc \
-                ARMGenCodeEmitter.inc ARMGenCallingConv.inc \
+                ARMGenCallingConv.inc \
                 ARMGenFastISel.inc ARMGenMCCodeEmitter.inc \
                 ARMGenMCPseudoLowering.inc ARMGenDisassemblerTables.inc
 

diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index baa97a7..6deab4f 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp

@@ -52,9 +52,9 @@
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   const Thumb1InstrInfo &TII =
-    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
-  const Thumb1RegisterInfo *RegInfo =
-    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+      *static_cast<const Thumb1InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const Thumb1RegisterInfo *RegInfo = static_cast<const Thumb1RegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
   if (!hasReservedCallFrame(MF)) {
     // If we have alloca, convert as follows:
     // ADJCALLSTACKDOWN -> sub, sp, sp, amount
@@ -89,12 +89,15 @@
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   MachineModuleInfo &MMI = MF.getMMI();
   const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
-  const Thumb1RegisterInfo *RegInfo =
-    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const Thumb1RegisterInfo *RegInfo = static_cast<const Thumb1RegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
   const Thumb1InstrInfo &TII =
-    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const Thumb1InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
-  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned Align = MF.getTarget()
+                       .getSubtargetImpl()
+                       ->getFrameLowering()
+                       ->getStackAlignment();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   unsigned NumBytes = MFI->getStackSize();
   assert(NumBytes >= ArgRegsSaveSize &&
@@ -321,12 +324,15 @@
   DebugLoc dl = MBBI->getDebugLoc();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  const Thumb1RegisterInfo *RegInfo =
-    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const Thumb1RegisterInfo *RegInfo = static_cast<const Thumb1RegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
   const Thumb1InstrInfo &TII =
-    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const Thumb1InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
-  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned Align = MF.getTarget()
+                       .getSubtargetImpl()
+                       ->getFrameLowering()
+                       ->getStackAlignment();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   int NumBytes = (int)MFI->getStackSize();
   assert((unsigned)NumBytes >= ArgRegsSaveSize &&
@@ -382,28 +388,65 @@
     }
   }
 
-  if (ArgRegsSaveSize) {
-    // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
-    // to LR, and we can't pop the value directly to the PC since
-    // we need to update the SP after popping the value. Therefore, we
-    // pop the old LR into R3 as a temporary.
+  bool IsV4PopReturn = false;
+  for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo())
+    if (CSI.getReg() == ARM::LR)
+      IsV4PopReturn = true;
+  IsV4PopReturn &= STI.hasV4TOps() && !STI.hasV5TOps();
 
+  // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+  // to LR, and we can't pop the value directly to the PC since
+  // we need to update the SP after popping the value. So instead
+  // we have to emit:
+  //   POP {r3}
+  //   ADD sp, #offset
+  //   BX r3
+  // If this would clobber a return value, then generate this sequence instead:
+  //   MOV ip, r3
+  //   POP {r3}
+  //   ADD sp, #offset
+  //   MOV lr, r3
+  //   MOV r3, ip
+  //   BX lr
+  if (ArgRegsSaveSize || IsV4PopReturn) {
     // Get the last instruction, tBX_RET
     MBBI = MBB.getLastNonDebugInstr();
     assert (MBBI->getOpcode() == ARM::tBX_RET);
-    // Epilogue for vararg functions: pop LR to R3 and branch off it.
-    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
-      .addReg(ARM::R3, RegState::Define);
+    DebugLoc dl = MBBI->getDebugLoc();
 
-    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+    if (AFI->getReturnRegsCount() <= 3) {
+      // Epilogue: pop saved LR to R3 and branch off it. 
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+        .addReg(ARM::R3, RegState::Define);
 
-    MachineInstrBuilder MIB =
-      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
-      .addReg(ARM::R3, RegState::Kill);
-    AddDefaultPred(MIB);
-    MIB.copyImplicitOps(&*MBBI);
-    // erase the old tBX_RET instruction
-    MBB.erase(MBBI);
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX))
+        .addReg(ARM::R3, RegState::Kill);
+      AddDefaultPred(MIB);
+      MIB.copyImplicitOps(&*MBBI);
+      // erase the old tBX_RET instruction
+      MBB.erase(MBBI);
+    } else {
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+        .addReg(ARM::R12, RegState::Define)
+        .addReg(ARM::R3, RegState::Kill));
+
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+        .addReg(ARM::R3, RegState::Define);
+
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+        .addReg(ARM::LR, RegState::Define)
+        .addReg(ARM::R3, RegState::Kill));
+
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+        .addReg(ARM::R3, RegState::Define)
+        .addReg(ARM::R12, RegState::Kill));
+      // Keep the tBX_RET instruction
+    }
   }
 }
 
@@ -417,7 +460,7 @@
 
   DebugLoc DL;
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   if (MI != MBB.end()) DL = MI->getDebugLoc();
 
@@ -456,7 +499,7 @@
 
   MachineFunction &MF = *MBB.getParent();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   bool isVarArg = AFI->getArgRegsSaveSize() > 0;
   DebugLoc DL = MI->getDebugLoc();
@@ -470,6 +513,9 @@
       // Special epilogue for vararg functions. See emitEpilogue
       if (isVarArg)
         continue;
+      // ARMv4T requires BX, see emitEpilogue
+      if (STI.hasV4TOps() && !STI.hasV5TOps())
+        continue;
       Reg = ARM::PC;
       (*MIB).setDesc(TII.get(ARM::tPOP_RET));
       MIB.copyImplicitOps(&*MI);

diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h
index a227f8e..b785b28 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/lib/Target/ARM/Thumb1FrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_ARM_THUMB1FRAMELOWERING_H
-#define LLVM_ARM_THUMB1FRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1FRAMELOWERING_H
+#define LLVM_LIB_TARGET_ARM_THUMB1FRAMELOWERING_H
 
 #include "ARMFrameLowering.h"
 #include "Thumb1InstrInfo.h"

diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 68cbb5c..8ea912e 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp

@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ARMSubtarget.h"
 #include "Thumb1InstrInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -41,10 +42,30 @@
                                   MachineBasicBlock::iterator I, DebugLoc DL,
                                   unsigned DestReg, unsigned SrcReg,
                                   bool KillSrc) const {
-  AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
-    .addReg(SrcReg, getKillRegState(KillSrc)));
+  // Need to check the arch.
+  MachineFunction &MF = *MBB.getParent();
+  const ARMSubtarget &st = MF.getTarget().getSubtarget<ARMSubtarget>();
+
   assert(ARM::GPRRegClass.contains(DestReg, SrcReg) &&
          "Thumb1 can only copy GPR registers");
+
+  if (st.hasV6Ops() || ARM::hGPRRegClass.contains(SrcReg)
+      || !ARM::tGPRRegClass.contains(DestReg))
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc)));
+  else {
+    // FIXME: The performance consequences of this are going to be atrocious.
+    // Some things to try that should be better:
+    //   * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11
+    //   * 'movs $dst, $src' if cpsr isn't live
+    // See: http://lists.cs.uiuc.edu/pipermail/llvmdev/2014-August/075998.html
+
+    // 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPUSH)))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPOP)))
+      .addReg(DestReg, getDefRegState(true));
+  }
 }
 
 void Thumb1InstrInfo::
@@ -101,3 +122,12 @@
                    .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
   }
 }
+
+void
+Thumb1InstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
+                                      Reloc::Model RM) const {
+  if (RM == Reloc::PIC_)
+    expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_pcrel, ARM::tLDRi, RM);
+  else
+    expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi, RM);
+}

diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
index c5845b7..9fba760 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/lib/Target/ARM/Thumb1InstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef THUMB1INSTRUCTIONINFO_H
-#define THUMB1INSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB1INSTRINFO_H
 
 #include "ARMBaseInstrInfo.h"
 #include "Thumb1RegisterInfo.h"
@@ -54,7 +54,10 @@
                             const TargetRegisterClass *RC,
                             const TargetRegisterInfo *TRI) const override;
 
+private:
+  void expandLoadStackGuard(MachineBasicBlock::iterator MI,
+                            Reloc::Model RM) const override;
 };
 }
 
-#endif // THUMB1INSTRUCTIONINFO_H
+#endif

diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index f907b14..c10c809 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp

@@ -66,8 +66,12 @@
                                       int Val,
                                       ARMCC::CondCodes Pred, unsigned PredReg,
                                       unsigned MIFlags) const {
+  assert((isARMLowRegister(DestReg) ||
+          isVirtualRegister(DestReg)) &&
+             "Thumb1 does not have ldr to high register");
+
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineConstantPool *ConstantPool = MF.getConstantPool();
   const Constant *C = ConstantInt::get(
           Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
@@ -106,15 +110,15 @@
       NumBytes = -NumBytes;
     }
     unsigned LdReg = DestReg;
-    if (DestReg == ARM::SP) {
+    if (DestReg == ARM::SP)
       assert(BaseReg == ARM::SP && "Unexpected!");
+    if (!isARMLowRegister(DestReg) && !MRI.isVirtualRegister(DestReg))
       LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
-    }
 
-    if (NumBytes <= 255 && NumBytes >= 0)
+    if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
       AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
         .addImm(NumBytes).setMIFlags(MIFlags);
-    else if (NumBytes < 0 && NumBytes >= -255) {
+    } else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
       AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
         .addImm(NumBytes).setMIFlags(MIFlags);
       AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
@@ -124,7 +128,8 @@
                             ARMCC::AL, 0, MIFlags);
 
     // Emit add / sub.
-    int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
+    int Opc = (isSub) ? ARM::tSUBrr : ((isHigh || !CanChangeCC) ? ARM::tADDhirr
+                                                                : ARM::tADDrr);
     MachineInstrBuilder MIB =
       BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
     if (Opc != ARM::tADDhirr)
@@ -136,32 +141,10 @@
     AddDefaultPred(MIB);
 }
 
-/// calcNumMI - Returns the number of instructions required to materialize
-/// the specific add / sub r, c instruction.
-static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
-                          unsigned NumBits, unsigned Scale) {
-  unsigned NumMIs = 0;
-  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
-
-  if (Opc == ARM::tADDrSPi) {
-    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
-    Bytes -= ThisVal;
-    NumMIs++;
-    NumBits = 8;
-    Scale = 1;  // Followed by a number of tADDi8.
-    Chunk = ((1 << NumBits) - 1) * Scale;
-  }
-
-  NumMIs += Bytes / Chunk;
-  if ((Bytes % Chunk) != 0)
-    NumMIs++;
-  if (ExtraOpc)
-    NumMIs++;
-  return NumMIs;
-}
-
 /// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
-/// a destreg = basereg + immediate in Thumb code.
+/// a destreg = basereg + immediate in Thumb code. Tries a series of ADDs or
+/// SUBs first, and uses a constant pool value if the instruction sequence would
+/// be too long. This is allowed to modify the condition flags.
 void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator &MBBI,
                                      DebugLoc dl,
@@ -172,151 +155,146 @@
   bool isSub = NumBytes < 0;
   unsigned Bytes = (unsigned)NumBytes;
   if (isSub) Bytes = -NumBytes;
-  bool isMul4 = (Bytes & 3) == 0;
-  bool isTwoAddr = false;
-  bool DstNotEqBase = false;
-  unsigned NumBits = 1;
-  unsigned Scale = 1;
-  int Opc = 0;
-  int ExtraOpc = 0;
-  bool NeedCC = false;
 
-  if (DestReg == BaseReg && BaseReg == ARM::SP) {
-    assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
-    NumBits = 7;
-    Scale = 4;
-    Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
-    isTwoAddr = true;
-  } else if (!isSub && BaseReg == ARM::SP) {
-    // r1 = add sp, 403
-    // =>
-    // r1 = add sp, 100 * 4
-    // r1 = add r1, 3
-    if (!isMul4) {
-      Bytes &= ~3;
-      ExtraOpc = ARM::tADDi3;
-    }
-    NumBits = 8;
-    Scale = 4;
-    Opc = ARM::tADDrSPi;
-  } else {
-    // sp = sub sp, c
-    // r1 = sub sp, c
-    // r8 = sub sp, c
-    if (DestReg != BaseReg)
-      DstNotEqBase = true;
-    NumBits = 8;
-    if (DestReg == ARM::SP) {
-      Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
-      assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
-      NumBits = 7;
-      Scale = 4;
+  int CopyOpc = 0;
+  unsigned CopyBits = 0;
+  unsigned CopyScale = 1;
+  bool CopyNeedsCC = false;
+  int ExtraOpc = 0;
+  unsigned ExtraBits = 0;
+  unsigned ExtraScale = 1;
+  bool ExtraNeedsCC = false;
+
+  // Strategy:
+  // We need to select two types of instruction, maximizing the available
+  // immediate range of each. The instructions we use will depend on whether
+  // DestReg and BaseReg are low, high or the stack pointer.
+  // * CopyOpc  - DestReg = BaseReg + imm
+  //              This will be emitted once if DestReg != BaseReg, and never if
+  //              DestReg == BaseReg.
+  // * ExtraOpc - DestReg = DestReg + imm
+  //              This will be emitted as many times as necessary to add the
+  //              full immediate.
+  // If the immediate ranges of these instructions are not large enough to cover
+  // NumBytes with a reasonable number of instructions, we fall back to using a
+  // value loaded from a constant pool.
+  if (DestReg == ARM::SP) {
+    if (BaseReg == ARM::SP) {
+      // sp -> sp
+      // Already in right reg, no copy needed
     } else {
-      Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
-      NumBits = 8;
-      NeedCC = true;
+      // low -> sp or high -> sp
+      CopyOpc = ARM::tMOVr;
+      CopyBits = 0;
     }
-    isTwoAddr = true;
+    ExtraOpc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+    ExtraBits = 7;
+    ExtraScale = 4;
+  } else if (isARMLowRegister(DestReg)) {
+    if (BaseReg == ARM::SP) {
+      // sp -> low
+      assert(!isSub && "Thumb1 does not have tSUBrSPi");
+      CopyOpc = ARM::tADDrSPi;
+      CopyBits = 8;
+      CopyScale = 4;
+    } else if (DestReg == BaseReg) {
+      // low -> same low
+      // Already in right reg, no copy needed
+    } else if (isARMLowRegister(BaseReg)) {
+      // low -> different low
+      CopyOpc = isSub ? ARM::tSUBi3 : ARM::tADDi3;
+      CopyBits = 3;
+      CopyNeedsCC = true;
+    } else {
+      // high -> low
+      CopyOpc = ARM::tMOVr;
+      CopyBits = 0;
+    }
+    ExtraOpc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+    ExtraBits = 8;
+    ExtraNeedsCC = true;
+  } else /* DestReg is high */ {
+    if (DestReg == BaseReg) {
+      // high -> same high
+      // Already in right reg, no copy needed
+    } else {
+      // {low,high,sp} -> high
+      CopyOpc = ARM::tMOVr;
+      CopyBits = 0;
+    }
+    ExtraOpc = 0;
   }
 
-  unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
+  // We could handle an unaligned immediate with an unaligned copy instruction
+  // and an aligned extra instruction, but this case is not currently needed.
+  assert(((Bytes & 3) == 0 || ExtraScale == 1) &&
+         "Unaligned offset, but all instructions require alignment");
+
+  unsigned CopyRange = ((1 << CopyBits) - 1) * CopyScale;
+  // If we would emit the copy with an immediate of 0, just use tMOVr.
+  if (CopyOpc && Bytes < CopyScale) {
+    CopyOpc = ARM::tMOVr;
+    CopyBits = 0;
+    CopyScale = 1;
+    CopyNeedsCC = false;
+    CopyRange = 0;
+  }
+  unsigned ExtraRange = ((1 << ExtraBits) - 1) * ExtraScale; // per instruction
+  unsigned RequiredCopyInstrs = CopyOpc ? 1 : 0;
+  unsigned RangeAfterCopy = (CopyRange > Bytes) ? 0 : (Bytes - CopyRange);
+
+  // We could handle this case when the copy instruction does not require an
+  // aligned immediate, but we do not currently do this.
+  assert(RangeAfterCopy % ExtraScale == 0 &&
+         "Extra instruction requires immediate to be aligned");
+
+  unsigned RequiredExtraInstrs;
+  if (ExtraRange)
+    RequiredExtraInstrs = RoundUpToAlignment(RangeAfterCopy, ExtraRange) / ExtraRange;
+  else if (RangeAfterCopy > 0)
+    // We need an extra instruction but none is available
+    RequiredExtraInstrs = 1000000;
+  else
+    RequiredExtraInstrs = 0;
+  unsigned RequiredInstrs = RequiredCopyInstrs + RequiredExtraInstrs;
   unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
-  if (NumMIs > Threshold) {
-    // This will expand into too many instructions. Load the immediate from a
-    // constpool entry.
+
+  // Use a constant pool, if the sequence of ADDs/SUBs is too expensive.
+  if (RequiredInstrs > Threshold) {
     emitThumbRegPlusImmInReg(MBB, MBBI, dl,
                              DestReg, BaseReg, NumBytes, true,
                              TII, MRI, MIFlags);
     return;
   }
 
-  if (DstNotEqBase) {
-    if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) {
-      // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7)
-      unsigned Chunk = (1 << 3) - 1;
-      unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
-      Bytes -= ThisVal;
-      const MCInstrDesc &MCID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3);
-      const MachineInstrBuilder MIB =
-        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg)
-                         .setMIFlags(MIFlags));
-      AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal));
-    } else {
-      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
-        .addReg(BaseReg, RegState::Kill))
-        .setMIFlags(MIFlags);
+  // Emit zero or one copy instructions
+  if (CopyOpc) {
+    unsigned CopyImm = std::min(Bytes, CopyRange) / CopyScale;
+    Bytes -= CopyImm * CopyScale;
+
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(CopyOpc), DestReg);
+    if (CopyNeedsCC)
+      MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(BaseReg, RegState::Kill);
+    if (CopyOpc != ARM::tMOVr) {
+      MIB.addImm(CopyImm);
     }
+    AddDefaultPred(MIB.setMIFlags(MIFlags));
+
     BaseReg = DestReg;
   }
 
-  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+  // Emit zero or more in-place add/sub instructions
   while (Bytes) {
-    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
-    Bytes -= ThisVal;
-    ThisVal /= Scale;
-    // Build the new tADD / tSUB.
-    if (isTwoAddr) {
-      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
-      if (NeedCC)
-        MIB = AddDefaultT1CC(MIB);
-      MIB.addReg(DestReg).addImm(ThisVal);
-      MIB = AddDefaultPred(MIB);
-      MIB.setMIFlags(MIFlags);
-    } else {
-      bool isKill = BaseReg != ARM::SP;
-      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
-      if (NeedCC)
-        MIB = AddDefaultT1CC(MIB);
-      MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
-      MIB = AddDefaultPred(MIB);
-      MIB.setMIFlags(MIFlags);
+    unsigned ExtraImm = std::min(Bytes, ExtraRange) / ExtraScale;
+    Bytes -= ExtraImm * ExtraScale;
 
-      BaseReg = DestReg;
-      if (Opc == ARM::tADDrSPi) {
-        // r4 = add sp, imm
-        // r4 = add r4, imm
-        // ...
-        NumBits = 8;
-        Scale = 1;
-        Chunk = ((1 << NumBits) - 1) * Scale;
-        Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
-        NeedCC = isTwoAddr = true;
-      }
-    }
-  }
-
-  if (ExtraOpc) {
-    const MCInstrDesc &MCID = TII.get(ExtraOpc);
-    AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg))
-                   .addReg(DestReg, RegState::Kill)
-                   .addImm(((unsigned)NumBytes) & 3)
-                   .setMIFlags(MIFlags));
-  }
-}
-
-/// emitThumbConstant - Emit a series of instructions to materialize a
-/// constant.
-static void emitThumbConstant(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator &MBBI,
-                              unsigned DestReg, int Imm,
-                              const TargetInstrInfo &TII,
-                              const Thumb1RegisterInfo& MRI,
-                              DebugLoc dl) {
-  bool isSub = Imm < 0;
-  if (isSub) Imm = -Imm;
-
-  int Chunk = (1 << 8) - 1;
-  int ThisVal = (Imm > Chunk) ? Chunk : Imm;
-  Imm -= ThisVal;
-  AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8),
-                                        DestReg))
-                 .addImm(ThisVal));
-  if (Imm > 0)
-    emitThumbRegPlusImmediate(MBB, MBBI, dl, DestReg, DestReg, Imm, TII, MRI);
-  if (isSub) {
-    const MCInstrDesc &MCID = TII.get(ARM::tRSB);
-    AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg))
-                   .addReg(DestReg, RegState::Kill));
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg);
+    if (ExtraNeedsCC)
+      MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(BaseReg).addImm(ExtraImm);
+    MIB = AddDefaultPred(MIB);
+    MIB.setMIFlags(MIFlags);
   }
 }
 
@@ -352,86 +330,13 @@
   const MCInstrDesc &Desc = MI.getDesc();
   unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
 
-  if (Opcode == ARM::tADDrSPi) {
+  if (Opcode == ARM::tADDframe) {
     Offset += MI.getOperand(FrameRegIdx+1).getImm();
-
-    // Can't use tADDrSPi if it's based off the frame pointer.
-    unsigned NumBits = 0;
-    unsigned Scale = 1;
-    if (FrameReg != ARM::SP) {
-      Opcode = ARM::tADDi3;
-      NumBits = 3;
-    } else {
-      NumBits = 8;
-      Scale = 4;
-      assert((Offset & 3) == 0 &&
-             "Thumb add/sub sp, #imm immediate must be multiple of 4!");
-    }
-
-    unsigned PredReg;
-    if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) {
-      // Turn it into a move.
-      MI.setDesc(TII.get(ARM::tMOVr));
-      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
-      // Remove offset
-      MI.RemoveOperand(FrameRegIdx+1);
-      return true;
-    }
-
-    // Common case: small offset, fits into instruction.
-    unsigned Mask = (1 << NumBits) - 1;
-    if (((Offset / Scale) & ~Mask) == 0) {
-      // Replace the FrameIndex with sp / fp
-      if (Opcode == ARM::tADDi3) {
-        MI.setDesc(TII.get(Opcode));
-        removeOperands(MI, FrameRegIdx);
-        AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg)
-                       .addImm(Offset / Scale));
-      } else {
-        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
-        MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset / Scale);
-      }
-      return true;
-    }
-
     unsigned DestReg = MI.getOperand(0).getReg();
-    unsigned Bytes = (Offset > 0) ? Offset : -Offset;
-    unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale);
-    // MI would expand into a large number of instructions. Don't try to
-    // simplify the immediate.
-    if (NumMIs > 2) {
-      emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
-                                *this);
-      MBB.erase(II);
-      return true;
-    }
 
-    if (Offset > 0) {
-      // Translate r0 = add sp, imm to
-      // r0 = add sp, 255*4
-      // r0 = add r0, (imm - 255*4)
-      if (Opcode == ARM::tADDi3) {
-        MI.setDesc(TII.get(Opcode));
-        removeOperands(MI, FrameRegIdx);
-        AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask));
-      } else {
-        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
-        MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Mask);
-      }
-      Offset = (Offset - Mask * Scale);
-      MachineBasicBlock::iterator NII = std::next(II);
-      emitThumbRegPlusImmediate(MBB, NII, dl, DestReg, DestReg, Offset, TII,
-                                *this);
-    } else {
-      // Translate r0 = add sp, -imm to
-      // r0 = -imm (this is then translated into a series of instructions)
-      // r0 = add r0, sp
-      emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
-
-      MI.setDesc(TII.get(ARM::tADDhirr));
-      MI.getOperand(FrameRegIdx).ChangeToRegister(DestReg, false, false, true);
-      MI.getOperand(FrameRegIdx+1).ChangeToRegister(FrameReg, false);
-    }
+    emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
+                              *this);
+    MBB.erase(II);
     return true;
   } else {
     if (AddrMode != ARMII::AddrModeT1_s)
@@ -485,8 +390,11 @@
 void Thumb1RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                                            int64_t Offset) const {
   const ARMBaseInstrInfo &TII =
-    *static_cast<const ARMBaseInstrInfo*>(
-      MI.getParent()->getParent()->getTarget().getInstrInfo());
+      *static_cast<const ARMBaseInstrInfo *>(MI.getParent()
+                                                 ->getParent()
+                                                 ->getTarget()
+                                                 .getSubtargetImpl()
+                                                 ->getInstrInfo());
   int Off = Offset; // ARM doesn't need the general 64-bit offsets
   unsigned i = 0;
 
@@ -512,7 +420,7 @@
   // off the frame pointer (if, for example, there are alloca() calls in
   // the function, the offset will be negative. Use R12 instead since that's
   // a call clobbered register that we know won't be used in Thumb1 mode.
-  const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo();
   DebugLoc DL;
   AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
     .addReg(ARM::R12, RegState::Define)
@@ -559,7 +467,7 @@
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
   const ARMBaseInstrInfo &TII =
-    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo());
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc dl = MI.getDebugLoc();
   MachineInstrBuilder MIB(*MBB.getParent(), &MI);
@@ -570,7 +478,7 @@
                MF.getFrameInfo()->getStackSize() + SPAdj;
 
   if (MF.getFrameInfo()->hasVarSizedObjects()) {
-    assert(SPAdj == 0 && MF.getTarget().getFrameLowering()->hasFP(MF) &&
+    assert(SPAdj == 0 && MF.getSubtarget().getFrameLowering()->hasFP(MF) &&
            "Unexpected");
     // There are alloca()'s in this function, must reference off the frame
     // pointer or base pointer instead.
@@ -587,7 +495,10 @@
   // when !hasReservedCallFrame().
 #ifndef NDEBUG
   if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
-    assert(MF.getTarget().getFrameLowering()->hasReservedCallFrame(MF) &&
+    assert(MF.getTarget()
+               .getSubtargetImpl()
+               ->getFrameLowering()
+               ->hasReservedCallFrame(MF) &&
            "Cannot use SP to access the emergency spill slot in "
            "functions without a reserved call frame");
     assert(!MF.getFrameInfo()->hasVarSizedObjects() &&

diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index 0c0abbe..5feaf52 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef THUMB1REGISTERINFO_H
-#define THUMB1REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_THUMB1REGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB1REGISTERINFO_H
 
 #include "ARMBaseRegisterInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -60,4 +60,4 @@
 };
 }
 
-#endif // THUMB1REGISTERINFO_H
+#endif

diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index edb9ff3..fdcb522 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp

@@ -188,7 +188,7 @@
                                              true/*isImp*/, false/*isKill*/));
 
     MachineInstr *LastITMI = MI;
-    MachineBasicBlock::iterator InsertPos = MIB;
+    MachineBasicBlock::iterator InsertPos = MIB.getInstr();
     ++MBBI;
 
     // Form IT block.
@@ -255,8 +255,9 @@
 bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
   const TargetMachine &TM = Fn.getTarget();
   AFI = Fn.getInfo<ARMFunctionInfo>();
-  TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
-  TRI = TM.getRegisterInfo();
+  TII = static_cast<const Thumb2InstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
+  TRI = TM.getSubtargetImpl()->getRegisterInfo();
   restrictIT = TM.getSubtarget<ARMSubtarget>().restrictIT();
 
   if (!AFI->isThumbFunction())

diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index a9df006..91973e1 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp

@@ -209,6 +209,15 @@
   ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
 }
 
+void
+Thumb2InstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
+                                      Reloc::Model RM) const {
+  if (RM == Reloc::PIC_)
+    expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12, RM);
+  else
+    expandLoadStackGuardBase(MI, ARM::t2MOVi32imm, ARM::t2LDRi12, RM);
+}
+
 void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator &MBBI, DebugLoc dl,
                                unsigned DestReg, unsigned BaseReg, int NumBytes,

diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 34d45d3..46a1f6d 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef THUMB2INSTRUCTIONINFO_H
-#define THUMB2INSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_THUMB2INSTRINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB2INSTRINFO_H
 
 #include "ARMBaseInstrInfo.h"
 #include "Thumb2RegisterInfo.h"
@@ -61,6 +61,10 @@
   /// always be able to get register info as well (through this method).
   ///
   const Thumb2RegisterInfo &getRegisterInfo() const override { return RI; }
+
+private:
+  void expandLoadStackGuard(MachineBasicBlock::iterator MI,
+                            Reloc::Model RM) const override;
 };
 
 /// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
@@ -71,4 +75,4 @@
 
 }
 
-#endif // THUMB2INSTRUCTIONINFO_H
+#endif

diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
index 782d81f..0d5d85a 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp

@@ -40,7 +40,7 @@
                                       ARMCC::CondCodes Pred, unsigned PredReg,
                                       unsigned MIFlags) const {
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineConstantPool *ConstantPool = MF.getConstantPool();
   const Constant *C = ConstantInt::get(
            Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);

diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h
index 8a33e6c..1dd94cc 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.h
+++ b/lib/Target/ARM/Thumb2RegisterInfo.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef THUMB2REGISTERINFO_H
-#define THUMB2REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_ARM_THUMB2REGISTERINFO_H
+#define LLVM_LIB_TARGET_ARM_THUMB2REGISTERINFO_H
 
 #include "ARMBaseRegisterInfo.h"
 
@@ -35,4 +35,4 @@
 };
 }
 
-#endif // THUMB2REGISTERINFO_H
+#endif

diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 09debe7..c51eb8b 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp

@@ -335,7 +335,7 @@
   bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA     ||
                  Opc == ARM::t2LDMDB     || Opc == ARM::t2LDMIA_UPD ||
                  Opc == ARM::t2LDMDB_UPD);
-  bool isLROk = (Opc == ARM::t2STMIA_UPD || Opc == ARM::t2STMDB_UPD);
+  bool isLROk = (Opc == ARM::t2STMDB_UPD);
   bool isSPOk = isPCOk || isLROk;
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
@@ -384,7 +384,6 @@
     if (MI->getOperand(1).getReg() == ARM::SP) {
       Opc = Entry.NarrowOpc2;
       ImmLimit = Entry.Imm2Limit;
-      HasOffReg = false;
     }
 
     Scale = 4;
@@ -1003,7 +1002,8 @@
 
 bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
   const TargetMachine &TM = MF.getTarget();
-  TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+  TII = static_cast<const Thumb2InstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
   STI = &TM.getSubtarget<ARMSubtarget>();
 
   // Optimizing / minimizing size?

diff --git a/lib/Target/Android.mk b/lib/Target/Android.mk
index 1b43ce4..4494eb0 100644
--- a/lib/Target/Android.mk
+++ b/lib/Target/Android.mk

@@ -3,7 +3,6 @@
 target_SRC_FILES := \
   Target.cpp \
   TargetIntrinsicInfo.cpp \
-  TargetJITInfo.cpp \
   TargetLibraryInfo.cpp \
   TargetLoweringObjectFile.cpp \
   TargetMachineC.cpp \

diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index 06a74d7..c61805b 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt

@@ -1,7 +1,6 @@
 add_llvm_library(LLVMTarget
   Target.cpp
   TargetIntrinsicInfo.cpp
-  TargetJITInfo.cpp
   TargetLibraryInfo.cpp
   TargetLoweringObjectFile.cpp
   TargetMachine.cpp

diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
index 673ade7..4bae7f8 100644
--- a/lib/Target/CppBackend/CPPTargetMachine.h
+++ b/lib/Target/CppBackend/CPPTargetMachine.h

@@ -11,29 +11,35 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CPPTARGETMACHINE_H
-#define CPPTARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_CPPBACKEND_CPPTARGETMACHINE_H
+#define LLVM_LIB_TARGET_CPPBACKEND_CPPTARGETMACHINE_H
 
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 namespace llvm {
 
 class formatted_raw_ostream;
 
+class CPPSubtarget : public TargetSubtargetInfo {
+};
+
 struct CPPTargetMachine : public TargetMachine {
   CPPTargetMachine(const Target &T, StringRef TT,
                    StringRef CPU, StringRef FS, const TargetOptions &Options,
                    Reloc::Model RM, CodeModel::Model CM,
                    CodeGenOpt::Level OL)
-    : TargetMachine(T, TT, CPU, FS, Options) {}
+    : TargetMachine(T, TT, CPU, FS, Options), Subtarget() {}
+private:
+  CPPSubtarget Subtarget;
 
+public:
+  const CPPSubtarget *getSubtargetImpl() const override { return &Subtarget; }
   bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out,
                            CodeGenFileType FileType, bool DisableVerify,
                            AnalysisID StartAfter,
                            AnalysisID StopAfter) override;
-
-  const DataLayout *getDataLayout() const override { return nullptr; }
 };
 
 extern Target TheCppBackendTarget;

diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 81b0e56..af7914f 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt

@@ -1,28 +1,32 @@
 set(LLVM_TARGET_DEFINITIONS Hexagon.td)
 
-tablegen(LLVM HexagonGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM HexagonGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM HexagonGenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM HexagonGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM HexagonGenCallingConv.inc -gen-callingconv)
-tablegen(LLVM HexagonGenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM HexagonGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM HexagonGenDFAPacketizer.inc -gen-dfa-packetizer)
+tablegen(LLVM HexagonGenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM HexagonGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM HexagonGenMCCodeEmitter.inc -gen-emitter)
+tablegen(LLVM HexagonGenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM HexagonGenSubtargetInfo.inc -gen-subtarget)
 add_public_tablegen_target(HexagonCommonTableGen)
 
 add_llvm_target(HexagonCodeGen
   HexagonAsmPrinter.cpp
   HexagonCallingConvLower.cpp
   HexagonCFGOptimizer.cpp
+  HexagonCopyToCombine.cpp
   HexagonExpandPredSpillCode.cpp
+  HexagonFixupHwLoops.cpp
   HexagonFrameLowering.cpp
   HexagonHardwareLoops.cpp
-  HexagonFixupHwLoops.cpp
-  HexagonMachineFunctionInfo.cpp
-  HexagonMachineScheduler.cpp
-  HexagonMCInstLower.cpp
   HexagonInstrInfo.cpp
   HexagonISelDAGToDAG.cpp
   HexagonISelLowering.cpp
+  HexagonMachineFunctionInfo.cpp
+  HexagonMachineScheduler.cpp
+  HexagonMCInstLower.cpp
+  HexagonNewValueJump.cpp
   HexagonPeephole.cpp
   HexagonRegisterInfo.cpp
   HexagonRemoveSZExtArgs.cpp
@@ -33,11 +37,8 @@
   HexagonTargetMachine.cpp
   HexagonTargetObjectFile.cpp
   HexagonVLIWPacketizer.cpp
-  HexagonNewValueJump.cpp
-  HexagonCopyToCombine.cpp
 )
 
 add_subdirectory(TargetInfo)
-add_subdirectory(InstPrinter)
 add_subdirectory(MCTargetDesc)
-
+add_subdirectory(Disassembler)

diff --git a/lib/Target/Hexagon/Disassembler/CMakeLists.txt b/lib/Target/Hexagon/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..755a45e
--- /dev/null
+++ b/lib/Target/Hexagon/Disassembler/CMakeLists.txt

@@ -0,0 +1,3 @@
+add_llvm_library(LLVMHexagonDisassembler
+  HexagonDisassembler.cpp
+  )

diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
new file mode 100644
index 0000000..bc64be1
--- /dev/null
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp

@@ -0,0 +1,114 @@
+//===-- HexagonDisassembler.cpp - Disassembler for Hexagon ISA ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Endian.h"
+
+#include <vector>
+#include <array>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-disassembler"
+
+// Pull DecodeStatus and its enum values into the global namespace.
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+/// \brief Hexagon disassembler for all Hexagon platforms.
+class HexagonDisassembler : public MCDisassembler {
+public:
+  HexagonDisassembler(MCSubtargetInfo const &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+}
+
+static const uint16_t IntRegDecoderTable[] = {
+  Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
+  Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9,
+  Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14,
+  Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
+  Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24,
+  Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29,
+  Hexagon::R30, Hexagon::R31 };
+
+static const uint16_t PredRegDecoderTable[] = { Hexagon::P0, Hexagon::P1,
+Hexagon::P2, Hexagon::P3 };
+
+static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+  uint64_t /*Address*/,
+  void const *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Register = IntRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
+  uint64_t /*Address*/,
+  void const *Decoder) {
+  if (RegNo > 3)
+    return MCDisassembler::Fail;
+
+  unsigned Register = PredRegDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+#include "HexagonGenDisassemblerTables.inc"
+
+static MCDisassembler *createHexagonDisassembler(Target const &T,
+                                                 MCSubtargetInfo const &STI,
+                                                 MCContext &Ctx) {
+  return new HexagonDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeHexagonDisassembler() {
+  TargetRegistry::RegisterMCDisassembler(TheHexagonTarget,
+                                         createHexagonDisassembler);
+}
+
+DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &os,
+                                                 raw_ostream &cs) const {
+  Size = 4;
+  if (Bytes.size() < 4)
+    return MCDisassembler::Fail;
+
+  uint32_t insn =
+      llvm::support::endian::read<uint32_t, llvm::support::little,
+                                  llvm::support::unaligned>(Bytes.data());
+
+  // Remove parse bits.
+  insn &= ~static_cast<uint32_t>(HexagonII::InstParseBits::INST_PARSE_MASK);
+  return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
+}

diff --git a/lib/Target/Hexagon/Disassembler/LLVMBuild.txt b/lib/Target/Hexagon/Disassembler/LLVMBuild.txt
new file mode 100644
index 0000000..17ad11b
--- /dev/null
+++ b/lib/Target/Hexagon/Disassembler/LLVMBuild.txt

@@ -0,0 +1,23 @@
+;===-- ./lib/Target/Hexagon/Disassembler/LLVMBuild.txt ---------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = HexagonDisassembler
+parent = Hexagon
+required_libraries = HexagonInfo MCDisassembler Support
+add_to_library_groups = Hexagon

diff --git a/lib/Target/Hexagon/Disassembler/Makefile b/lib/Target/Hexagon/Disassembler/Makefile
new file mode 100644
index 0000000..e55fd58
--- /dev/null
+++ b/lib/Target/Hexagon/Disassembler/Makefile

@@ -0,0 +1,16 @@
+##===-- lib/Target/Hexagon/Disassembler/Makefile -----------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMHexagonDisassembler
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common

diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index 5467ee3..64ae69c 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TARGET_Hexagon_H
-#define TARGET_Hexagon_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGON_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGON_H
 
 #include "MCTargetDesc/HexagonMCTargetDesc.h"
 #include "llvm/Target/TargetLowering.h"

diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 2e011bd..9240282 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp

@@ -18,7 +18,7 @@
 #include "HexagonMachineFunctionInfo.h"
 #include "HexagonSubtarget.h"
 #include "HexagonTargetMachine.h"
-#include "InstPrinter/HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonInstPrinter.h"
 #include "MCTargetDesc/HexagonMCInst.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"

diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h
index 7fe8c57..5f4c162 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HEXAGONASMPRINTER_H
-#define HEXAGONASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONASMPRINTER_H
 
 #include "Hexagon.h"
 #include "HexagonTargetMachine.h"

diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index de340e0..8a4e02c 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp

@@ -72,7 +72,7 @@
 void
 HexagonCFGOptimizer::InvertAndChangeJumpTarget(MachineInstr* MI,
                                                MachineBasicBlock* NewTarget) {
-  const HexagonInstrInfo *QII = QTM.getInstrInfo();
+  const HexagonInstrInfo *QII = QTM.getSubtargetImpl()->getInstrInfo();
   int NewOpcode = 0;
   switch(MI->getOpcode()) {
   case Hexagon::JMP_t:

diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
index f5f958c..8d78409 100644
--- a/lib/Target/Hexagon/HexagonCallingConvLower.cpp
+++ b/lib/Target/Hexagon/HexagonCallingConvLower.cpp

@@ -21,6 +21,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 Hexagon_CCState::Hexagon_CCState(CallingConv::ID CC, bool isVarArg,
@@ -31,7 +32,8 @@
   // No stack is used.
   StackOffset = 0;
 
-  UsedRegs.resize((TM.getRegisterInfo()->getNumRegs()+31)/32);
+  UsedRegs.resize(
+      (TM.getSubtargetImpl()->getRegisterInfo()->getNumRegs() + 31) / 32);
 }
 
 // HandleByVal - Allocate a stack slot large enough to pass an argument by
@@ -55,7 +57,7 @@
 
 /// MarkAllocated - Mark a register and all of its aliases as allocated.
 void Hexagon_CCState::MarkAllocated(unsigned Reg) {
-  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  const TargetRegisterInfo &TRI = *TM.getSubtargetImpl()->getRegisterInfo();
   for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
     UsedRegs[*AI/32] |= 1 << (*AI&31);
 }

diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.h b/lib/Target/Hexagon/HexagonCallingConvLower.h
index 70b8b64..738ed1a 100644
--- a/lib/Target/Hexagon/HexagonCallingConvLower.h
+++ b/lib/Target/Hexagon/HexagonCallingConvLower.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_Hexagon_CODEGEN_CALLINGCONVLOWER_H
-#define LLVM_Hexagon_CODEGEN_CALLINGCONVLOWER_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONCALLINGCONVLOWER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONCALLINGCONVLOWER_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CallingConvLower.h"

diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index aeff680..4e76698 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp

@@ -417,8 +417,8 @@
   bool HasChanged = false;
 
   // Get target info.
-  TRI = MF.getTarget().getRegisterInfo();
-  TII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
+  TRI = MF.getSubtarget().getRegisterInfo();
+  TII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   // Combine aggressively (for code size)
   ShouldCombineAggressively =

diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
index 3dafe80..8ef4c3a 100644
--- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
+++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp

@@ -72,7 +72,7 @@
 
 bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
 
-  const HexagonInstrInfo *TII = QTM.getInstrInfo();
+  const HexagonInstrInfo *TII = QTM.getSubtargetImpl()->getInstrInfo();
 
   // Loop over all of the basic blocks.
   for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
@@ -86,8 +86,10 @@
       if (Opc == Hexagon::STriw_pred) {
         // STriw_pred [R30], ofst, SrcReg;
         unsigned FP = MI->getOperand(0).getReg();
-        assert(FP == QTM.getRegisterInfo()->getFrameRegister() &&
-               "Not a Frame Pointer, Nor a Spill Slot");
+        assert(
+            FP ==
+                QTM.getSubtargetImpl()->getRegisterInfo()->getFrameRegister() &&
+            "Not a Frame Pointer, Nor a Spill Slot");
         assert(MI->getOperand(1).isImm() && "Not an offset");
         int Offset = MI->getOperand(1).getImm();
         int SrcReg = MI->getOperand(2).getReg();
@@ -98,7 +100,7 @@
             BuildMI(*MBB, MII, MI->getDebugLoc(),
                     TII->get(Hexagon::CONST32_Int_Real),
                       HEXAGON_RESERVED_REG_1).addImm(Offset);
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_rr),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_add),
                     HEXAGON_RESERVED_REG_1)
               .addReg(FP).addReg(HEXAGON_RESERVED_REG_1);
             BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
@@ -133,8 +135,10 @@
         assert(Hexagon::PredRegsRegClass.contains(DstReg) &&
                "Not a predicate register");
         unsigned FP = MI->getOperand(1).getReg();
-        assert(FP == QTM.getRegisterInfo()->getFrameRegister() &&
-               "Not a Frame Pointer, Nor a Spill Slot");
+        assert(
+            FP ==
+                QTM.getSubtargetImpl()->getRegisterInfo()->getFrameRegister() &&
+            "Not a Frame Pointer, Nor a Spill Slot");
         assert(MI->getOperand(2).isImm() && "Not an offset");
         int Offset = MI->getOperand(2).getImm();
         if (!TII->isValidOffset(Hexagon::LDriw, Offset)) {
@@ -142,7 +146,7 @@
             BuildMI(*MBB, MII, MI->getDebugLoc(),
                     TII->get(Hexagon::CONST32_Int_Real),
                       HEXAGON_RESERVED_REG_1).addImm(Offset);
-            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_rr),
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::A2_add),
                     HEXAGON_RESERVED_REG_1)
               .addReg(FP)
               .addReg(HEXAGON_RESERVED_REG_1);

diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index d41939a..5f9b927 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp

@@ -160,7 +160,7 @@
 void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF,
                                            MachineBasicBlock::iterator &MII,
                                            RegScavenger &RS) {
-  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   MachineBasicBlock *MBB = MII->getParent();
   DebugLoc DL = MII->getDebugLoc();
   unsigned Scratch = RS.scavengeRegister(&Hexagon::IntRegsRegClass, MII, 0);

diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 21df12f..356f279 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp

@@ -50,7 +50,10 @@
   unsigned FrameSize = MFI->getStackSize();
 
   // Get the alignments provided by the target.
-  unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned TargetAlign = MF.getTarget()
+                             .getSubtargetImpl()
+                             ->getFrameLowering()
+                             ->getStackAlignment();
   // Get the maximum call frame size of all the calls.
   unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
 
@@ -77,8 +80,8 @@
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineBasicBlock::iterator MBBI = MBB.begin();
-  const HexagonRegisterInfo *QRI =
-    static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
+  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   determineFrameLayout(MF);
 
@@ -115,7 +118,7 @@
     // Check for overflow.
     // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used?
     const int ALLOCFRAME_MAX = 16384;
-    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+    const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
     if (NumBytes >= ALLOCFRAME_MAX) {
       // Emit allocframe(#0).
@@ -154,12 +157,12 @@
     MachineBasicBlock::iterator MBBI = std::prev(MBB.end());
     MachineBasicBlock::iterator MBBI_end = MBB.end();
 
-    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+    const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
     // Handle EH_RETURN.
     if (MBBI->getOpcode() == Hexagon::EH_RETURN_JMPR) {
       assert(MBBI->getOperand(0).isReg() && "Offset should be in register!");
       BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME));
-      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::ADD_rr),
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::A2_add),
               Hexagon::R29).addReg(Hexagon::R29).addReg(Hexagon::R28);
       return;
     }
@@ -225,7 +228,7 @@
                                         const std::vector<CalleeSavedInfo> &CSI,
                                         const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
-  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
 
   if (CSI.empty()) {
     return false;
@@ -280,7 +283,7 @@
                                         const TargetRegisterInfo *TRI) const {
 
   MachineFunction *MF = MBB.getParent();
-  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
 
   if (CSI.empty()) {
     return false;

diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index 2d4b0b9..2d6b457 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HEXAGON_FRAMEINFO_H
-#define HEXAGON_FRAMEINFO_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONFRAMELOWERING_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONFRAMELOWERING_H
 
 #include "Hexagon.h"
 #include "llvm/Target/TargetFrameLowering.h"

diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 7f76421..e2062a3 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp

@@ -220,7 +220,7 @@
   int HexagonHardwareLoops::Counter = 0;
 #endif
 
-  /// \brief Abstraction for a trip count of a loop. A smaller vesrsion
+  /// \brief Abstraction for a trip count of a loop. A smaller version
   /// of the MachineOperand class without the concerns of changing the
   /// operand representation.
   class CountValue {
@@ -266,7 +266,8 @@
     }
 
     void print(raw_ostream &OS, const TargetMachine *TM = nullptr) const {
-      const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : nullptr;
+      const TargetRegisterInfo *TRI =
+          TM ? TM->getSubtargetImpl()->getRegisterInfo() : nullptr;
       if (isReg()) { OS << PrintReg(Contents.R.Reg, TRI, Contents.R.Sub); }
       if (isImm()) { OS << Contents.ImmVal; }
     }
@@ -302,8 +303,10 @@
   MRI = &MF.getRegInfo();
   MDT = &getAnalysis<MachineDominatorTree>();
   TM  = static_cast<const HexagonTargetMachine*>(&MF.getTarget());
-  TII = static_cast<const HexagonInstrInfo*>(TM->getInstrInfo());
-  TRI = static_cast<const HexagonRegisterInfo*>(TM->getRegisterInfo());
+  TII = static_cast<const HexagonInstrInfo *>(
+      TM->getSubtargetImpl()->getInstrInfo());
+  TRI = static_cast<const HexagonRegisterInfo *>(
+      TM->getSubtargetImpl()->getRegisterInfo());
 
   for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
        I != E; ++I) {

diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index dabe650..dc58c42 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp

@@ -446,8 +446,8 @@
 
   if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
       N1.getNode()->getValueType(0) == MVT::i32) {
-    const HexagonInstrInfo *TII =
-      static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+    const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
+        TM.getSubtargetImpl()->getInstrInfo());
     if (TII->isValidAutoIncImm(LoadedVT, Val)) {
       SDValue TargetConst = CurDAG->getTargetConstant(Val, MVT::i32);
       SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32,
@@ -513,8 +513,8 @@
 
   if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
       N1.getNode()->getValueType(0) == MVT::i32) {
-    const HexagonInstrInfo *TII =
-      static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+    const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
+        TM.getSubtargetImpl()->getInstrInfo());
     if (TII->isValidAutoIncImm(LoadedVT, Val)) {
       SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
       SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
@@ -591,8 +591,8 @@
   bool zextval = (LD->getExtensionType() == ISD::ZEXTLOAD);
 
   // Figure out the opcode.
-  const HexagonInstrInfo *TII =
-    static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+  const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
   if (LoadedVT == MVT::i64) {
     if (TII->isValidAutoIncImm(LoadedVT, Val))
       Opcode = Hexagon::POST_LDrid;
@@ -701,8 +701,8 @@
 
   // Offset value must be within representable range
   // and must have correct alignment properties.
-  const HexagonInstrInfo *TII =
-    static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+  const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
   if (TII->isValidAutoIncImm(StoredVT, Val)) {
     SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, MVT::i32), Value,
                      Chain};
@@ -1218,10 +1218,10 @@
   // as at least one of the operands.
   if (IntrinsicWithPred) {
     SmallVector<SDValue, 8> Ops;
-    const HexagonInstrInfo *TII =
-      static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
+    const HexagonInstrInfo *TII = static_cast<const HexagonInstrInfo *>(
+        TM.getSubtargetImpl()->getInstrInfo());
     const MCInstrDesc &MCID = TII->get(IntrinsicWithPred);
-    const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+    const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
 
     // Iterate over all the operands of the intrinsics.
     // For PredRegs, do the transfer.

diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index a460ea4..7646088 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp

@@ -51,9 +51,9 @@
 
 public:
   HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
-                 const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
-                 LLVMContext &C, int NumNamedVarArgParams)
-      : CCState(CC, isVarArg, MF, TM, locs, C),
+                 SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+                 int NumNamedVarArgParams)
+      : CCState(CC, isVarArg, MF, locs, C),
         NumNamedVarArgParams(NumNamedVarArgParams) {}
 
   int getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
@@ -322,8 +322,8 @@
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   // Analyze return values of ISD::RET
   CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
@@ -372,8 +372,8 @@
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
 
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
 
@@ -427,9 +427,8 @@
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  HexagonCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                        getTargetMachine(), ArgLocs, *DAG.getContext(),
-                        NumNamedVarArgParams);
+  HexagonCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                        *DAG.getContext(), NumNamedVarArgParams);
 
   if (NumNamedVarArgParams > 0)
     CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
@@ -464,7 +463,7 @@
   SmallVector<SDValue, 8> MemOpChains;
 
   const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
-      DAG.getTarget().getRegisterInfo());
+      DAG.getSubtarget().getRegisterInfo());
   SDValue StackPtr =
       DAG.getCopyFromReg(Chain, dl, QRI->getStackRegister(), getPointerTy());
 
@@ -723,7 +722,7 @@
               // Check it to be lr
               const HexagonRegisterInfo *QRI =
                   static_cast<const HexagonRegisterInfo *>(
-                      DAG.getTarget().getRegisterInfo());
+                      DAG.getSubtarget().getRegisterInfo());
               if (Reg == QRI->getRARegister()) {
                 FuncInfo->setHasClobberLR(true);
                 break;
@@ -817,7 +816,7 @@
   // The Sub result contains the new stack start address, so it
   // must be placed in the stack pointer register.
   const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
-      DAG.getTarget().getRegisterInfo());
+      DAG.getSubtarget().getRegisterInfo());
   SDValue CopyChain = DAG.getCopyToReg(Chain, dl, QRI->getStackRegister(), Sub);
 
   SDValue Ops[2] = { ArgAdjust, CopyChain };
@@ -843,8 +842,8 @@
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
 
@@ -964,7 +963,7 @@
 
 SDValue
 HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
-  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MFI->setReturnAddressIsTaken(true);
@@ -990,8 +989,8 @@
 
 SDValue
 HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
-  const HexagonRegisterInfo *TRI =
-      static_cast<const HexagonRegisterInfo *>(DAG.getTarget().getRegisterInfo());
+  const HexagonRegisterInfo *TRI = static_cast<const HexagonRegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   MFI->setFrameAddressIsTaken(true);
 
@@ -1044,7 +1043,7 @@
 //===----------------------------------------------------------------------===//
 
 HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &targetmachine)
-    : TargetLowering(targetmachine, new HexagonTargetObjectFile()),
+    : TargetLowering(targetmachine),
       TM(targetmachine) {
 
   const HexagonSubtarget &Subtarget = TM.getSubtarget<HexagonSubtarget>();
@@ -1453,8 +1452,8 @@
   setMinFunctionAlignment(2);
 
   // Needed for DYNAMIC_STACKALLOC expansion.
-  const HexagonRegisterInfo *QRI =
-      static_cast<const HexagonRegisterInfo *>(TM.getRegisterInfo());
+  const HexagonRegisterInfo *QRI = static_cast<const HexagonRegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
   setStackPointerRegisterToSaveRestore(QRI->getStackRegister());
   setSchedulingPreference(Sched::VLIW);
 }

diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index ec16cc8..63e4392 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef Hexagon_ISELLOWERING_H
-#define Hexagon_ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONISELLOWERING_H
 
 #include "Hexagon.h"
 #include "llvm/CodeGen/CallingConvLower.h"

diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index 1057343..cc27c4c 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td

@@ -92,12 +92,18 @@
   let AsmString = asmstr;
   let Pattern = pattern;
   let Constraints = cstr;
-  let Itinerary = itin;
-  let Size = 4;
-
-  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
-
-  // Instruction type according to the ISA.
+  let Itinerary = itin;
+  let Size = 4;
+
+  // SoftFail is a field the disassembler can use to provide a way for
+  // instructions to not match without killing the whole decode process. It is
+  // mainly used for ARM, but Tablegen expects this field to exist or it fails
+  // to build the decode table.
+  field bits<32> SoftFail = 0;
+
+  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+
+  // Instruction type according to the ISA.
   IType Type = type;
   let TSFlags{4-0} = Type.Value;
 
@@ -186,6 +192,7 @@
                                     "");
   let PNewValue = !if(isPredicatedNew, "new", "");
   let NValueST = !if(isNVStore, "true", "false");
+  let isCodeGenOnly = 1;
 
   // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
 }

diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 1c95e06..1688c4a 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp

@@ -1295,16 +1295,24 @@
   switch (MI->getOpcode())
   {
     default: return false;
+    case Hexagon::A2_paddf:
+    case Hexagon::A2_paddfnew:
+    case Hexagon::A2_paddt:
+    case Hexagon::A2_paddtnew:
+    case Hexagon::A2_pandf:
+    case Hexagon::A2_pandfnew:
+    case Hexagon::A2_pandt:
+    case Hexagon::A2_pandtnew:
+    case Hexagon::A2_porf:
+    case Hexagon::A2_porfnew:
+    case Hexagon::A2_port:
+    case Hexagon::A2_portnew:
+    case Hexagon::A2_pxorf:
+    case Hexagon::A2_pxorfnew:
+    case Hexagon::A2_pxort:
+    case Hexagon::A2_pxortnew:
     case Hexagon::ADD_ri_cPt:
     case Hexagon::ADD_ri_cNotPt:
-    case Hexagon::ADD_rr_cPt:
-    case Hexagon::ADD_rr_cNotPt:
-    case Hexagon::XOR_rr_cPt:
-    case Hexagon::XOR_rr_cNotPt:
-    case Hexagon::AND_rr_cPt:
-    case Hexagon::AND_rr_cNotPt:
-    case Hexagon::OR_rr_cPt:
-    case Hexagon::OR_rr_cNotPt:
     case Hexagon::SUB_rr_cPt:
     case Hexagon::SUB_rr_cNotPt:
     case Hexagon::COMBINE_rr_cPt:
@@ -1636,11 +1644,10 @@
   MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
 }
 
-DFAPacketizer *HexagonInstrInfo::
-CreateTargetScheduleState(const TargetMachine *TM,
-                           const ScheduleDAG *DAG) const {
-  const InstrItineraryData *II = TM->getInstrItineraryData();
-  return TM->getSubtarget<HexagonGenSubtargetInfo>().createDFAPacketizer(II);
+DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
+    const TargetSubtargetInfo &STI) const {
+  const InstrItineraryData *II = STI.getInstrItineraryData();
+  return static_cast<const HexagonSubtarget &>(STI).createDFAPacketizer(II);
 }
 
 bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
@@ -1765,7 +1772,7 @@
                     & HexagonII::ExtentBitsMask;
 
   if (isSigned) // if value is signed
-    return -1 << (bits - 1);
+    return -1U << (bits - 1);
   else
     return 0;
 }
@@ -1779,9 +1786,9 @@
                     & HexagonII::ExtentBitsMask;
 
   if (isSigned) // if value is signed
-    return ~(-1 << (bits - 1));
+    return ~(-1U << (bits - 1));
   else
-    return ~(-1 << bits);
+    return ~(-1U << bits);
 }
 
 // Returns true if an instruction can be converted into a non-extended

diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 6b032c9..6acfbec 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HexagonINSTRUCTIONINFO_H
-#define HexagonINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONINSTRINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONINSTRINFO_H
 
 #include "HexagonRegisterInfo.h"
 #include "MCTargetDesc/HexagonBaseInfo.h"
@@ -148,9 +148,8 @@
   bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                            const BranchProbability &Probability) const override;
 
-  DFAPacketizer*
-  CreateTargetScheduleState(const TargetMachine *TM,
-                            const ScheduleDAG *DAG) const override;
+  DFAPacketizer *
+  CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override;
 
   bool isSchedulingBoundary(const MachineInstr *MI,
                             const MachineBasicBlock *MBB,

diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index 4dcf101..4090681 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td

@@ -92,6 +92,92 @@
 def HexagonWrapperCombineRR :
   SDNode<"HexagonISD::WrapperCombineRR", SDTHexagonI64I32I32>;
 
+let hasSideEffects = 0, hasNewValue = 1, InputType = "reg" in
+class T_ALU32_3op<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit OpsRev,
+                  bit IsComm>
+  : ALU32_rr<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
+             "$Rd = "#mnemonic#"($Rs, $Rt)",
+             [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredRel {
+  let isCommutable = IsComm;
+  let BaseOpcode = mnemonic#_rr;
+  let CextOpcode = mnemonic;
+
+  bits<5> Rs;
+  bits<5> Rt;
+  bits<5> Rd;
+
+  let IClass = 0b1111;
+  let Inst{27} = 0b0;
+  let Inst{26-24} = MajOp;
+  let Inst{23-21} = MinOp;
+  let Inst{20-16} = !if(OpsRev,Rt,Rs);
+  let Inst{12-8} = !if(OpsRev,Rs,Rt);
+  let Inst{4-0} = Rd;
+}
+
+let hasSideEffects = 0, hasNewValue = 1 in
+class T_ALU32_3op_pred<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                       bit OpsRev, bit PredNot, bit PredNew>
+  : ALU32_rr<(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
+             "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") "#
+             "$Rd = "#mnemonic#"($Rs, $Rt)",
+             [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
+  let isPredicated = 1;
+  let isPredicatedFalse = PredNot;
+  let isPredicatedNew = PredNew;
+  let BaseOpcode = mnemonic#_rr;
+  let CextOpcode = mnemonic;
+
+  bits<2> Pu;
+  bits<5> Rs;
+  bits<5> Rt;
+  bits<5> Rd;
+
+  let IClass = 0b1111;
+  let Inst{27} = 0b1;
+  let Inst{26-24} = MajOp;
+  let Inst{23-21} = MinOp;
+  let Inst{20-16} = !if(OpsRev,Rt,Rs);
+  let Inst{13} = PredNew;
+  let Inst{12-8} = !if(OpsRev,Rs,Rt);
+  let Inst{7} = PredNot;
+  let Inst{6-5} = Pu;
+  let Inst{4-0} = Rd;
+}
+
+multiclass T_ALU32_3op_p<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                         bit OpsRev> {
+  def t    : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 0>;
+  def f    : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 0>;
+  def tnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 1>;
+  def fnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 1>;
+}
+
+multiclass T_ALU32_3op_A2<string mnemonic, bits<3> MajOp, bits<3> MinOp,
+                          bit OpsRev, bit IsComm> {
+  let isPredicable = 1 in
+  def  A2_#NAME  : T_ALU32_3op  <mnemonic, MajOp, MinOp, OpsRev, IsComm>;
+  defm A2_p#NAME : T_ALU32_3op_p<mnemonic, MajOp, MinOp, OpsRev>;
+}
+
+let isCodeGenOnly = 0 in
+defm add : T_ALU32_3op_A2<"add", 0b011, 0b000, 0, 1>;
+defm and : T_ALU32_3op_A2<"and", 0b001, 0b000, 0, 1>;
+defm or  : T_ALU32_3op_A2<"or",  0b001, 0b001, 0, 1>;
+defm sub : T_ALU32_3op_A2<"sub", 0b011, 0b001, 1, 0>;
+defm xor : T_ALU32_3op_A2<"xor", 0b001, 0b011, 0, 1>;
+
+// Pats for instruction selection.
+class BinOp32_pat<SDNode Op, InstHexagon MI, ValueType ResT>
+  : Pat<(ResT (Op (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))),
+        (ResT (MI IntRegs:$Rs, IntRegs:$Rt))>;
+
+def: BinOp32_pat<add, A2_add, i32>;
+def: BinOp32_pat<and, A2_and, i32>;
+def: BinOp32_pat<or,  A2_or,  i32>;
+def: BinOp32_pat<sub, A2_sub, i32>;
+def: BinOp32_pat<xor, A2_xor, i32>;
+
 multiclass ALU32_Pbase<string mnemonic, RegisterClass RC, bit isNot,
                        bit isPredNew> {
   let isPredicatedNew = isPredNew in
@@ -127,13 +213,6 @@
   }
 }
 
-let isCommutable = 1 in {
-  defm ADD_rr : ALU32_base<"add", "ADD", add>, ImmRegRel, PredNewRel;
-  defm AND_rr : ALU32_base<"and", "AND", and>, ImmRegRel, PredNewRel;
-  defm XOR_rr : ALU32_base<"xor", "XOR", xor>, ImmRegRel, PredNewRel;
-  defm OR_rr  : ALU32_base<"or", "OR", or>, ImmRegRel, PredNewRel;
-}
-
 defm SUB_rr : ALU32_base<"sub", "SUB", sub>, ImmRegRel, PredNewRel;
 
 // Combines the two integer registers SRC1 and SRC2 into a double register.
@@ -225,7 +304,7 @@
                                            s10ExtPred:$src2))]>, ImmRegRel;
 
 // Nop.
-let neverHasSideEffects = 1 in
+let neverHasSideEffects = 1, isCodeGenOnly = 0 in
 def NOP : ALU32_rr<(outs), (ins),
           "nop",
           []>;
@@ -753,7 +832,7 @@
 
 let InputType = "imm", isBarrier = 1, isPredicable = 1,
 Defs = [PC], isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
-opExtentBits = 24 in
+opExtentBits = 24, isCodeGenOnly = 0 in
 class T_JMP <dag InsDag, list<dag> JumpList = []>
             : JInst<(outs), InsDag,
             "jump $dst" , JumpList> {
@@ -2212,7 +2291,7 @@
 // Map from i1 loads to 32 bits. This assumes that the i1* is byte aligned.
 let AddedComplexity = 10 in
 def : Pat <(i32 (zextloadi1 ADDRriS11_0:$addr)),
-      (i32 (AND_rr (i32 (LDrib ADDRriS11_0:$addr)), (TFRI 0x1)))>;
+      (i32 (A2_and (i32 (LDrib ADDRriS11_0:$addr)), (TFRI 0x1)))>;
 
 // Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = SXTW(Rss.lo).
 def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i32)),

diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index db5b7ea..d39f7d7 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td

@@ -2130,6 +2130,42 @@
 // incorrect code for negative numbers.
 // Pd=cmpb.eq(Rs,#u8)
 
+let isCompare = 1, isExtendable = 1, opExtendable = 2, hasSideEffects = 0,
+    validSubTargets = HasV4SubT in
+class CMP_NOT_REG_IMM<string OpName, bits<2> op, Operand ImmOp,
+                      list<dag> Pattern>
+  : ALU32Inst <(outs PredRegs:$dst), (ins IntRegs:$src1, ImmOp:$src2),
+    "$dst = !cmp."#OpName#"($src1, #$src2)",
+    Pattern,
+    "", ALU32_2op_tc_2early_SLOT0123> {
+    bits<2> dst;
+    bits<5> src1;
+    bits<10> src2;
+
+    let IClass = 0b0111;
+    let Inst{27-24} = 0b0101;
+    let Inst{23-22} = op;
+    let Inst{20-16} = src1;
+    let Inst{21} = !if (!eq(OpName, "gtu"), 0b0, src2{9});
+    let Inst{13-5} = src2{8-0};
+    let Inst{4-2} = 0b100;
+    let Inst{1-0} = dst;
+}
+
+let opExtentBits = 10, isExtentSigned = 1 in {
+def C4_cmpneqi : CMP_NOT_REG_IMM <"eq", 0b00, s10Ext, [(set (i1 PredRegs:$dst),
+                 (setne (i32 IntRegs:$src1), s10ExtPred:$src2))]>;
+
+def C4_cmpltei : CMP_NOT_REG_IMM <"gt", 0b01, s10Ext, [(set (i1 PredRegs:$dst),
+                 (not (setgt (i32 IntRegs:$src1), s10ExtPred:$src2)))]>;
+
+}
+let opExtentBits = 9 in
+def C4_cmplteui : CMP_NOT_REG_IMM <"gtu", 0b10, u9Ext, [(set (i1 PredRegs:$dst),
+                  (not (setugt (i32 IntRegs:$src1), u9ExtPred:$src2)))]>;
+
+
+
 // p=!cmp.eq(r1,r2)
 let isCompare = 1, validSubTargets = HasV4SubT in
 def CMPnotEQ_rr : ALU32_rr<(outs PredRegs:$dst),
@@ -2139,15 +2175,6 @@
             (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2)))]>,
       Requires<[HasV4T]>;
 
-// p=!cmp.eq(r1,#s10)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPnotEQ_ri : ALU32_ri<(outs PredRegs:$dst),
-                           (ins IntRegs:$src1, s10Ext:$src2),
-      "$dst = !cmp.eq($src1, #$src2)",
-      [(set (i1 PredRegs:$dst),
-            (setne (i32 IntRegs:$src1), s10ImmPred:$src2))]>,
-      Requires<[HasV4T]>;
-
 // p=!cmp.gt(r1,r2)
 let isCompare = 1, validSubTargets = HasV4SubT in
 def CMPnotGT_rr : ALU32_rr<(outs PredRegs:$dst),
@@ -2157,14 +2184,6 @@
             (not (setgt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>,
       Requires<[HasV4T]>;
 
-// p=!cmp.gt(r1,#s10)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPnotGT_ri : ALU32_ri<(outs PredRegs:$dst),
-                           (ins IntRegs:$src1, s10Ext:$src2),
-      "$dst = !cmp.gt($src1, #$src2)",
-      [(set (i1 PredRegs:$dst),
-            (not (setgt (i32 IntRegs:$src1), s10ImmPred:$src2)))]>,
-      Requires<[HasV4T]>;
 
 // p=!cmp.gtu(r1,r2)
 let isCompare = 1, validSubTargets = HasV4SubT in
@@ -2175,15 +2194,6 @@
             (not (setugt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>,
       Requires<[HasV4T]>;
 
-// p=!cmp.gtu(r1,#u9)
-let isCompare = 1, validSubTargets = HasV4SubT in
-def CMPnotGTU_ri : ALU32_ri<(outs PredRegs:$dst),
-                            (ins IntRegs:$src1, u9Ext:$src2),
-      "$dst = !cmp.gtu($src1, #$src2)",
-      [(set (i1 PredRegs:$dst),
-            (not (setugt (i32 IntRegs:$src1), u9ImmPred:$src2)))]>,
-      Requires<[HasV4T]>;
-
 let isCompare = 1, validSubTargets = HasV4SubT in
 def CMPbEQri_V4 : MInst<(outs PredRegs:$dst),
             (ins IntRegs:$src1, u8Imm:$src2),

diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index 99f59d5..b3385d8 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td

@@ -1843,6 +1843,11 @@
              !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
              [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
 
+
+class T_RI_pat <InstHexagon MI, Intrinsic IntID>
+  : Pat<(IntID (i32 IntRegs:$Rs), imm:$It),
+        (MI IntRegs:$Rs, imm:$It)>;
+
 //
 // LDInst classes.
 //

diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/lib/Target/Hexagon/HexagonIntrinsicsV4.td
index dd28ebb..77b148b 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV4.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV4.td

@@ -217,12 +217,13 @@
 // ALU32 / PRED / Conditional Sign Extend.
 // ALU32 / PRED / Conditional Zero Extend.
 // ALU32 / PRED / Compare.
-def Hexagon_C4_cmpneq  : qi_neg_ALU32_sisi  <"cmp.eq", int_hexagon_C4_cmpneq>;
-def Hexagon_C4_cmpneqi : qi_neg_ALU32_sis10 <"cmp.eq", int_hexagon_C4_cmpneqi>;
-def Hexagon_C4_cmplte  : qi_neg_ALU32_sisi  <"cmp.gt", int_hexagon_C4_cmplte>;
 def Hexagon_C4_cmpltei : qi_neg_ALU32_sis10 <"cmp.gt", int_hexagon_C4_cmpltei>;
+def Hexagon_C4_cmplte  : qi_neg_ALU32_sisi  <"cmp.gt", int_hexagon_C4_cmplte>;
 def Hexagon_C4_cmplteu : qi_neg_ALU32_sisi  <"cmp.gtu",int_hexagon_C4_cmplteu>;
-def Hexagon_C4_cmplteui: qi_neg_ALU32_siu9  <"cmp.gtu",int_hexagon_C4_cmplteui>;
+
+def: T_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi>;
+def: T_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei>;
+def: T_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui>;
 
 // ALU32 / PRED / cmpare To General Register.
 def Hexagon_A4_rcmpneq : si_neg_ALU32_sisi <"cmp.eq", int_hexagon_A4_rcmpneq>;

diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
index d799bdb..cb18df6 100644
--- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HexagonMACHINEFUNCTIONINFO_H
-#define HexagonMACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/MachineFunction.h"
 #include <map>

diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 6fcaa20..97c626f 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp

@@ -145,7 +145,7 @@
         << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber()
         << " " << BB->getName()
         << " in_func " << BB->getParent()->getFunction()->getName()
-        << " at loop depth "  << MLI.getLoopDepth(BB)
+        << " at loop depth "  << MLI->getLoopDepth(BB)
         << " \n");
 
   buildDAGWithRegPressure();
@@ -208,8 +208,12 @@
   const TargetMachine &TM = DAG->MF.getTarget();
   delete Top.HazardRec;
   delete Bot.HazardRec;
-  Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
-  Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  Top.HazardRec =
+      TM.getSubtargetImpl()->getInstrInfo()->CreateTargetMIHazardRecognizer(
+          Itin, DAG);
+  Bot.HazardRec =
+      TM.getSubtargetImpl()->getInstrInfo()->CreateTargetMIHazardRecognizer(
+          Itin, DAG);
 
   delete Top.ResourceModel;
   delete Bot.ResourceModel;

diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 8c41086..1e023c3 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HEXAGONASMPRINTER_H
-#define HEXAGONASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H
 
 #include "llvm/ADT/PriorityQueue.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -56,7 +56,9 @@
 public:
 VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
     SchedModel(SM), TotalPackets(0) {
-    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM, nullptr);
+  ResourcesModel =
+      TM.getSubtargetImpl()->getInstrInfo()->CreateTargetScheduleState(
+          *TM.getSubtargetImpl());
 
     // This hard requirement could be relaxed,
     // but for now do not let it proceed.
@@ -99,7 +101,7 @@
 
   /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
   /// time to do some work.
-  virtual void schedule() override;
+  void schedule() override;
   /// Perform platform-specific DAG postprocessing.
   void postprocessDAG();
 };
@@ -207,15 +209,15 @@
     : DAG(nullptr), SchedModel(nullptr), Top(TopQID, "TopQ"),
       Bot(BotQID, "BotQ") {}
 
-  virtual void initialize(ScheduleDAGMI *dag) override;
+  void initialize(ScheduleDAGMI *dag) override;
 
-  virtual SUnit *pickNode(bool &IsTopNode) override;
+  SUnit *pickNode(bool &IsTopNode) override;
 
-  virtual void schedNode(SUnit *SU, bool IsTopNode) override;
+  void schedNode(SUnit *SU, bool IsTopNode) override;
 
-  virtual void releaseTopNode(SUnit *SU) override;
+  void releaseTopNode(SUnit *SU) override;
 
-  virtual void releaseBottomNode(SUnit *SU) override;
+  void releaseBottomNode(SUnit *SU) override;
 
   unsigned ReportPackets() {
     return Top.ResourceModel->getTotalPackets() +

diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index b7c03a7..782c979 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp

@@ -362,9 +362,9 @@
   LiveVariables &LVs = getAnalysis<LiveVariables>();
 #endif
 
-  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
-  QRI =
-    static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
+  QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  QRI = static_cast<const HexagonRegisterInfo *>(
+      MF.getSubtarget().getRegisterInfo());
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
 
   if (!QRI->Subtarget.hasV4TOps() ||

diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 48b6159..8912152 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp

@@ -111,10 +111,8 @@
                 false, false)
 
 bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
-  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().
-                                        getInstrInfo());
-  QRI = static_cast<const HexagonRegisterInfo *>(MF.getTarget().
-                                       getRegisterInfo());
+  QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  QRI = MF.getTarget().getSubtarget<HexagonSubtarget>().getRegisterInfo();
   MRI = &MF.getRegInfo();
 
   DenseMap<unsigned, unsigned> PeepholeMap;

diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index fb466d3..2b6741c 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp

@@ -128,12 +128,12 @@
   // Addressable stack objects are accessed using neg. offsets from %fp.
   MachineFunction &MF = *MI.getParent()->getParent();
   const HexagonInstrInfo &TII =
-    *static_cast<const HexagonInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
   int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
   MachineFrameInfo &MFI = *MF.getFrameInfo();
 
   unsigned FrameReg = getFrameRegister(MF);
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   if (!TFI->hasFP(MF)) {
     // We will not reserve space on the stack for the lr and fp registers.
     Offset -= 2 * Hexagon_WordSize;
@@ -176,7 +176,7 @@
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
                   TII.get(Hexagon::CONST32_Int_Real), dstReg).addImm(Offset);
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
-                  TII.get(Hexagon::ADD_rr),
+                  TII.get(Hexagon::A2_add),
                   dstReg).addReg(FrameReg).addReg(dstReg);
         } else {
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
@@ -205,7 +205,7 @@
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
                   TII.get(Hexagon::CONST32_Int_Real), resReg).addImm(Offset);
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
-                  TII.get(Hexagon::ADD_rr),
+                  TII.get(Hexagon::A2_add),
                   resReg).addReg(FrameReg).addReg(resReg);
         } else {
           BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
@@ -237,7 +237,7 @@
             BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
                     TII.get(Hexagon::CONST32_Int_Real), ResReg).addImm(Offset);
             BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
-                    TII.get(Hexagon::ADD_rr), ResReg).addReg(FrameReg).
+                    TII.get(Hexagon::A2_add), ResReg).addReg(FrameReg).
               addReg(ResReg);
             MI.getOperand(FIOperandNum).ChangeToRegister(ResReg, false, false,
                                                          true);
@@ -256,7 +256,7 @@
         BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
                 TII.get(Hexagon::CONST32_Int_Real), dstReg).addImm(Offset);
         BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
-                TII.get(Hexagon::ADD_rr),
+                TII.get(Hexagon::A2_add),
                 dstReg).addReg(FrameReg).addReg(dstReg);
         // Can we delete MI??? r2 = add (r2, #0).
         MI.getOperand(FIOperandNum).ChangeToRegister(dstReg, false, false,true);
@@ -278,7 +278,7 @@
 
 unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction
                                                &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   if (TFI->hasFP(MF)) {
     return Hexagon::R30;
   }

diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 648b4af..a83b502 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HexagonREGISTERINFO_H
-#define HexagonREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONREGISTERINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONREGISTERINFO_H
 
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Target/TargetRegisterInfo.h"

diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index 8ea1b7e..9750984 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td

@@ -13,46 +13,48 @@
 
 let Namespace = "Hexagon" in {
 
-  class HexagonReg<string n> : Register<n> {
+  class HexagonReg<bits<5> num, string n> : Register<n> {
     field bits<5> Num;
+    let HWEncoding{4-0} = num;
   }
 
-  class HexagonDoubleReg<string n, list<Register> subregs> :
+  class HexagonDoubleReg<bits<5> num, string n, list<Register> subregs> :
         RegisterWithSubRegs<n, subregs> {
     field bits<5> Num;
+    let HWEncoding{4-0} = num;
   }
 
   // Registers are identified with 5-bit ID numbers.
   // Ri - 32-bit integer registers.
-  class Ri<bits<5> num, string n> : HexagonReg<n> {
+  class Ri<bits<5> num, string n> : HexagonReg<num, n> {
     let Num = num;
   }
 
   // Rf - 32-bit floating-point registers.
-  class Rf<bits<5> num, string n> : HexagonReg<n> {
+  class Rf<bits<5> num, string n> : HexagonReg<num, n> {
     let Num = num;
   }
 
 
   // Rd - 64-bit registers.
   class Rd<bits<5> num, string n, list<Register> subregs> :
-        HexagonDoubleReg<n, subregs> {
+        HexagonDoubleReg<num, n, subregs> {
     let Num = num;
     let SubRegs = subregs;
   }
 
   // Rp - predicate registers
-  class Rp<bits<5> num, string n> : HexagonReg<n> {
+  class Rp<bits<5> num, string n> : HexagonReg<num, n> {
     let Num = num;
   }
 
   // Rc - control registers
-  class Rc<bits<5> num, string n> : HexagonReg<n> {
+  class Rc<bits<5> num, string n> : HexagonReg<num, n> {
     let Num = num;
   }
 
   // Rj - aliased integer registers
-  class Rj<string n, Ri R>: HexagonReg<n> {
+  class Rj<string n, Ri R>: HexagonReg<R.Num, n> {
     let Num = R.Num;
     let Aliases = [R];
   }
@@ -61,38 +63,9 @@
   def subreg_hireg  : SubRegIndex<32, 32>;
 
   // Integer registers.
-  def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
-  def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>;
-  def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
-  def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>;
-  def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>;
-  def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
-  def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>;
-  def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>;
-  def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>;
-  def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
-  def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
-  def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
-  def R12 : Ri<12, "r12">, DwarfRegNum<[12]>;
-  def R13 : Ri<13, "r13">, DwarfRegNum<[13]>;
-  def R14 : Ri<14, "r14">, DwarfRegNum<[14]>;
-  def R15 : Ri<15, "r15">, DwarfRegNum<[15]>;
-  def R16 : Ri<16, "r16">, DwarfRegNum<[16]>;
-  def R17 : Ri<17, "r17">, DwarfRegNum<[17]>;
-  def R18 : Ri<18, "r18">, DwarfRegNum<[18]>;
-  def R19 : Ri<19, "r19">, DwarfRegNum<[19]>;
-  def R20 : Ri<20, "r20">, DwarfRegNum<[20]>;
-  def R21 : Ri<21, "r21">, DwarfRegNum<[21]>;
-  def R22 : Ri<22, "r22">, DwarfRegNum<[22]>;
-  def R23 : Ri<23, "r23">, DwarfRegNum<[23]>;
-  def R24 : Ri<24, "r24">, DwarfRegNum<[24]>;
-  def R25 : Ri<25, "r25">, DwarfRegNum<[25]>;
-  def R26 : Ri<26, "r26">, DwarfRegNum<[26]>;
-  def R27 : Ri<27, "r27">, DwarfRegNum<[27]>;
-  def R28 : Ri<28, "r28">, DwarfRegNum<[28]>;
-  def R29 : Ri<29, "r29">, DwarfRegNum<[29]>;
-  def R30 : Ri<30, "r30">, DwarfRegNum<[30]>;
-  def R31 : Ri<31, "r31">, DwarfRegNum<[31]>;
+  foreach I = 0-31 in {
+    def R#I  : Ri<I, "r"#I>,  DwarfRegNum<[I]>;
+  }
 
   def SP : Rj<"sp", R29>, DwarfRegNum<[29]>;
   def FP : Rj<"fp", R30>, DwarfRegNum<[30]>;

diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index b40b303..8ac2e43 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HexagonSELECTIONDAGINFO_H
-#define HexagonSELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 

diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 247207f..8fdd493 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp

@@ -68,12 +68,13 @@
 bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
 
   const HexagonTargetObjectFile &TLOF =
-      (const HexagonTargetObjectFile &)
-      QTM.getTargetLowering()->getObjFileLowering();
+      (const HexagonTargetObjectFile &)QTM.getSubtargetImpl()
+          ->getTargetLowering()
+          ->getObjFileLowering();
   if (TLOF.IsSmallDataEnabled())
     return true;
 
-  const TargetInstrInfo *TII = QTM.getInstrInfo();
+  const TargetInstrInfo *TII = QTM.getSubtargetImpl()->getInstrInfo();
 
   // Loop over all of the basic blocks
   for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
@@ -138,10 +139,10 @@
       else if (Opc == Hexagon::CONST64_Int_Real) {
         int DestReg = MI->getOperand(0).getReg();
         int64_t ImmValue = MI->getOperand(1).getImm ();
-        unsigned DestLo =
-          QTM.getRegisterInfo()->getSubReg (DestReg, Hexagon::subreg_loreg);
-        unsigned DestHi =
-          QTM.getRegisterInfo()->getSubReg (DestReg, Hexagon::subreg_hireg);
+        unsigned DestLo = QTM.getSubtargetImpl()->getRegisterInfo()->getSubReg(
+            DestReg, Hexagon::subreg_loreg);
+        unsigned DestHi = QTM.getSubtargetImpl()->getRegisterInfo()->getSubReg(
+            DestReg, Hexagon::subreg_hireg);
 
         int32_t LowWord = (ImmValue & 0xFFFFFFFF);
         int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF;

diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
index 9601090..1052b80 100644
--- a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
+++ b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp

@@ -80,7 +80,7 @@
 
 bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
 
-  const TargetInstrInfo *TII = QTM.getInstrInfo();
+  const TargetInstrInfo *TII = QTM.getSubtargetImpl()->getInstrInfo();
 
   // Loop over all of the basic blocks.
   for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();

diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index b184e62..10776ae 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef Hexagon_SUBTARGET_H
-#define Hexagon_SUBTARGET_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
 
 #include "HexagonFrameLowering.h"
 #include "HexagonInstrInfo.h"
@@ -56,19 +56,25 @@
   HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS,
                    const TargetMachine &TM);
 
-  /// getInstrItins - Return the instruction itineraies based on subtarget
+  /// getInstrItins - Return the instruction itineraries based on subtarget
   /// selection.
-  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
-  const HexagonInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  const HexagonRegisterInfo *getRegisterInfo() const {
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+  const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const HexagonRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  const HexagonTargetLowering *getTargetLowering() const { return &TLInfo; }
-  const HexagonFrameLowering *getFrameLowering() const {
+  const HexagonTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const HexagonFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
-  const HexagonSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
-  const DataLayout *getDataLayout() const { return &DL; }
+  const HexagonSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const DataLayout *getDataLayout() const override { return &DL; }
 
   HexagonSubtarget &initializeSubtargetDependencies(StringRef CPU,
                                                     StringRef FS);

diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 7831410..cd18dfb 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp

@@ -70,10 +70,13 @@
                                            Reloc::Model RM, CodeModel::Model CM,
                                            CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      TLOF(make_unique<HexagonTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
     initAsmInfo();
 }
 
+HexagonTargetMachine::~HexagonTargetMachine() {}
+
 namespace {
 /// Hexagon Code Generator Pass Configuration Options.
 class HexagonPassConfig : public TargetPassConfig {

diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index d88178e..4a9f447 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HexagonTARGETMACHINE_H
-#define HexagonTARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETMACHINE_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETMACHINE_H
 
 #include "HexagonInstrInfo.h"
 #include "HexagonSubtarget.h"
@@ -23,6 +23,7 @@
 class Module;
 
 class HexagonTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   HexagonSubtarget Subtarget;
 
 public:
@@ -30,34 +31,18 @@
                        StringRef FS, const TargetOptions &Options,
                        Reloc::Model RM, CodeModel::Model CM,
                        CodeGenOpt::Level OL);
+  ~HexagonTargetMachine() override;
 
-  const HexagonInstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
   const HexagonSubtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
-  const HexagonRegisterInfo *getRegisterInfo() const override {
-    return getSubtargetImpl()->getRegisterInfo();
-  }
-  const InstrItineraryData* getInstrItineraryData() const override {
-    return &getSubtargetImpl()->getInstrItineraryData();
-  }
-  const HexagonTargetLowering* getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const HexagonFrameLowering* getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  const HexagonSelectionDAGInfo* getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
   static unsigned getModuleMatchQuality(const Module &M);
 
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 };
 
 extern bool flag_aligned_memcpy;

diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index c97526e..f4ab5e2 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp

@@ -31,7 +31,7 @@
 void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
                                          const TargetMachine &TM) {
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
-
+  InitializeELF(TM.Options.UseInitArray);
 
   SmallDataSection =
     getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
@@ -79,7 +79,8 @@
 
   if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) {
     Type *Ty = GV->getType()->getElementType();
-    return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
+    return IsInSmallSection(
+        TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(Ty));
   }
 
   return false;

diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h
index 1bd1272..c974204 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HexagonTARGETOBJECTFILE_H
-#define HexagonTARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/MC/MCSectionELF.h"

diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 87ce960..e7296d6 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp

@@ -118,7 +118,6 @@
   public:
     // Ctor.
     HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
-                          MachineDominatorTree &MDT,
                           const MachineBranchProbabilityInfo *MBPI);
 
     // initPacketizerState - initialize some internal flags.
@@ -146,23 +145,23 @@
     bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
                          MachineBasicBlock::iterator &MII,
                          const TargetRegisterClass* RC);
-    bool CanPromoteToDotNew(MachineInstr* MI, SUnit* PacketSU,
-                            unsigned DepReg,
-                            std::map <MachineInstr*, SUnit*> MIToSUnit,
+    bool CanPromoteToDotNew(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg,
+                            const std::map<MachineInstr *, SUnit *> &MIToSUnit,
                             MachineBasicBlock::iterator &MII,
-                            const TargetRegisterClass* RC);
-    bool CanPromoteToNewValue(MachineInstr* MI, SUnit* PacketSU,
-                              unsigned DepReg,
-                              std::map <MachineInstr*, SUnit*> MIToSUnit,
-                              MachineBasicBlock::iterator &MII);
-    bool CanPromoteToNewValueStore(MachineInstr* MI, MachineInstr* PacketMI,
-                                   unsigned DepReg,
-                                   std::map <MachineInstr*, SUnit*> MIToSUnit);
-    bool DemoteToDotOld(MachineInstr* MI);
-    bool ArePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2,
-                    std::map <MachineInstr*, SUnit*> MIToSUnit);
-    bool RestrictingDepExistInPacket(MachineInstr*,
-                    unsigned, std::map <MachineInstr*, SUnit*>);
+                            const TargetRegisterClass *RC);
+    bool
+    CanPromoteToNewValue(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg,
+                         const std::map<MachineInstr *, SUnit *> &MIToSUnit,
+                         MachineBasicBlock::iterator &MII);
+    bool CanPromoteToNewValueStore(
+        MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg,
+        const std::map<MachineInstr *, SUnit *> &MIToSUnit);
+    bool DemoteToDotOld(MachineInstr *MI);
+    bool ArePredicatesComplements(
+        MachineInstr *MI1, MachineInstr *MI2,
+        const std::map<MachineInstr *, SUnit *> &MIToSUnit);
+    bool RestrictingDepExistInPacket(MachineInstr *, unsigned,
+                                     const std::map<MachineInstr *, SUnit *> &);
     bool isNewifiable(MachineInstr* MI);
     bool isCondInst(MachineInstr* MI);
     bool tryAllocateResourcesForConstExt(MachineInstr* MI);
@@ -184,20 +183,19 @@
 
 // HexagonPacketizerList Ctor.
 HexagonPacketizerList::HexagonPacketizerList(
-  MachineFunction &MF, MachineLoopInfo &MLI,MachineDominatorTree &MDT,
-  const MachineBranchProbabilityInfo *MBPI)
-  : VLIWPacketizerList(MF, MLI, MDT, true){
+    MachineFunction &MF, MachineLoopInfo &MLI,
+    const MachineBranchProbabilityInfo *MBPI)
+    : VLIWPacketizerList(MF, MLI, true) {
   this->MBPI = MBPI;
 }
 
 bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
-  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
   MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
-  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
   const MachineBranchProbabilityInfo *MBPI =
     &getAnalysis<MachineBranchProbabilityInfo>();
   // Instantiate the packetizer.
-  HexagonPacketizerList Packetizer(Fn, MLI, MDT, MBPI);
+  HexagonPacketizerList Packetizer(Fn, MLI, MBPI);
 
   // DFA state table should not be empty.
   assert(Packetizer.getResourceTracker() && "Empty DFA table!");
@@ -324,8 +322,8 @@
                                           unsigned DepReg) {
 
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-  const HexagonRegisterInfo* QRI =
-              (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const HexagonRegisterInfo *QRI =
+      (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo();
 
   // Check for lr dependence
   if (DepReg == QRI->getRARegister()) {
@@ -536,9 +534,9 @@
 //    if there is a  new value store in the packet. Corollary, if there is
 //    already a store in a packet, there can not be a new value store.
 //    Arch Spec: 3.4.4.2
-bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
-                MachineInstr *PacketMI, unsigned DepReg,
-                std::map <MachineInstr*, SUnit*> MIToSUnit) {
+bool HexagonPacketizerList::CanPromoteToNewValueStore(
+    MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg,
+    const std::map<MachineInstr *, SUnit *> &MIToSUnit) {
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
   // Make sure we are looking at the store, that can be promoted.
   if (!QII->mayBeNewStore(MI))
@@ -549,8 +547,8 @@
       GetStoreValueOperand(MI).getReg() != DepReg)
     return false;
 
-  const HexagonRegisterInfo* QRI =
-                            (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const HexagonRegisterInfo *QRI =
+      (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo();
   const MCInstrDesc& MCID = PacketMI->getDesc();
   // first operand is always the result
 
@@ -561,7 +559,7 @@
   for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
          VE = CurrentPacketMIs.end();
        (VI != VE); ++VI) {
-    SUnit* PacketSU = MIToSUnit[*VI];
+    SUnit *PacketSU = MIToSUnit.find(*VI)->second;
     if (PacketSU->getInstr()->getDesc().mayStore() ||
         // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME,
         // then we don't need this
@@ -661,7 +659,7 @@
 
   for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end();
       (VI != VE); ++VI) {
-    SUnit* TempSU = MIToSUnit[*VI];
+    SUnit *TempSU = MIToSUnit.find(*VI)->second;
     MachineInstr* TempMI = TempSU->getInstr();
 
     // Following condition is true for all the instructions until PacketMI is
@@ -717,15 +715,14 @@
 
 // can this MI to promoted to either
 // new value store or new value jump
-bool HexagonPacketizerList::CanPromoteToNewValue( MachineInstr *MI,
-                SUnit *PacketSU, unsigned DepReg,
-                std::map <MachineInstr*, SUnit*> MIToSUnit,
-                MachineBasicBlock::iterator &MII)
-{
+bool HexagonPacketizerList::CanPromoteToNewValue(
+    MachineInstr *MI, SUnit *PacketSU, unsigned DepReg,
+    const std::map<MachineInstr *, SUnit *> &MIToSUnit,
+    MachineBasicBlock::iterator &MII) {
 
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-  const HexagonRegisterInfo* QRI =
-                            (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const HexagonRegisterInfo *QRI =
+      (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo();
   if (!QRI->Subtarget.hasV4TOps() ||
       !QII->mayBeNewStore(MI))
     return false;
@@ -746,12 +743,10 @@
 // 1. dot new on predicate - V2/V3/V4
 // 2. dot new on stores NV/ST - V4
 // 3. dot new on jump NV/J - V4 -- This is generated in a pass.
-bool HexagonPacketizerList::CanPromoteToDotNew( MachineInstr *MI,
-                              SUnit *PacketSU, unsigned DepReg,
-                              std::map <MachineInstr*, SUnit*> MIToSUnit,
-                              MachineBasicBlock::iterator &MII,
-                              const TargetRegisterClass* RC )
-{
+bool HexagonPacketizerList::CanPromoteToDotNew(
+    MachineInstr *MI, SUnit *PacketSU, unsigned DepReg,
+    const std::map<MachineInstr *, SUnit *> &MIToSUnit,
+    MachineBasicBlock::iterator &MII, const TargetRegisterClass *RC) {
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
   // Already a dot new instruction.
   if (QII->isDotNewInst(MI) && !QII->mayBeNewStore(MI))
@@ -803,12 +798,12 @@
 // The P3 from a) and d) will be complements after
 // a)'s P3 is converted to .new form
 // Anti Dep between c) and b) is irrelevant for this case
-bool HexagonPacketizerList::RestrictingDepExistInPacket (MachineInstr* MI,
-      unsigned DepReg,
-      std::map <MachineInstr*, SUnit*> MIToSUnit) {
+bool HexagonPacketizerList::RestrictingDepExistInPacket(
+    MachineInstr *MI, unsigned DepReg,
+    const std::map<MachineInstr *, SUnit *> &MIToSUnit) {
 
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-  SUnit* PacketSUDep = MIToSUnit[MI];
+  SUnit *PacketSUDep = MIToSUnit.find(MI)->second;
 
   for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
        VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
@@ -817,7 +812,7 @@
     if(!QII->isPredicated(*VIN)) continue;
 
     // Scheduling Unit for current insn in the packet
-    SUnit* PacketSU = MIToSUnit[*VIN];
+    SUnit *PacketSU = MIToSUnit.find(*VIN)->second;
 
     // Look at dependencies between current members of the packet
     // and predicate defining instruction MI.
@@ -861,8 +856,9 @@
 
 // Given two predicated instructions, this function detects whether
 // the predicates are complements
-bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
-     MachineInstr* MI2, std::map <MachineInstr*, SUnit*> MIToSUnit) {
+bool HexagonPacketizerList::ArePredicatesComplements(
+    MachineInstr *MI1, MachineInstr *MI2,
+    const std::map<MachineInstr *, SUnit *> &MIToSUnit) {
 
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
 
@@ -873,7 +869,7 @@
     return false;
 
   // Scheduling unit for candidate
-  SUnit* SU = MIToSUnit[MI1];
+  SUnit *SU = MIToSUnit.find(MI1)->second;
 
   // One corner case deals with the following scenario:
   // Trying to add
@@ -898,7 +894,7 @@
        VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
 
     // Scheduling Unit for current insn in the packet
-    SUnit* PacketSU = MIToSUnit[*VIN];
+    SUnit *PacketSU = MIToSUnit.find(*VIN)->second;
 
     // If this instruction in the packet is succeeded by the candidate...
     if (PacketSU->isSucc(SU)) {
@@ -1007,8 +1003,8 @@
   MachineBasicBlock::iterator II = I;
 
   const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
-  const HexagonRegisterInfo* QRI =
-                      (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const HexagonRegisterInfo *QRI =
+      (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo();
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
 
   // Inline asm cannot go in the packet.
@@ -1103,7 +1099,7 @@
             VI = CurrentPacketMIs.begin(),
              VE = CurrentPacketMIs.end();
            (VI != VE && maintainNewValueJump); ++VI) {
-        SUnit* PacketSU = MIToSUnit[*VI];
+        SUnit *PacketSU = MIToSUnit.find(*VI)->second;
 
         // NVJ can not be part of the dual jump - Arch Spec: section 7.8
         if (PacketSU->getInstr()->getDesc().isCall()) {
@@ -1278,9 +1274,9 @@
       }
 
       // For V4, special case ALLOCFRAME. Even though there is dependency
-      // between ALLOCAFRAME and subsequent store, allow it to be
+      // between ALLOCFRAME and subsequent store, allow it to be
       // packetized in a same packet. This implies that the store is using
-      // caller's SP. Hense, offset needs to be updated accordingly.
+      // caller's SP. Hence, offset needs to be updated accordingly.
       else if (DepType == SDep::Data
                && QRI->Subtarget.hasV4TOps()
                && J->getOpcode() == Hexagon::ALLOCFRAME

diff --git a/lib/Target/Hexagon/HexagonVarargsCallingConvention.h b/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
index 668ca98..edbe29a 100644
--- a/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
+++ b/lib/Target/Hexagon/HexagonVarargsCallingConvention.h

@@ -74,10 +74,14 @@
   }
 
   const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
-  unsigned Alignment =
-    State.getTarget().getDataLayout()->getABITypeAlignment(ArgTy);
+  unsigned Alignment = State.getTarget()
+                           .getSubtargetImpl()
+                           ->getDataLayout()
+                           ->getABITypeAlignment(ArgTy);
   unsigned Size =
-    State.getTarget().getDataLayout()->getTypeSizeInBits(ArgTy) / 8;
+      State.getTarget().getSubtargetImpl()->getDataLayout()->getTypeSizeInBits(
+          ArgTy) /
+      8;
 
   // If it's passed by value, then we need the size of the aggregate not of
   // the pointer.
@@ -129,10 +133,14 @@
   }
 
   const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
-  unsigned Alignment =
-    State.getTarget().getDataLayout()->getABITypeAlignment(ArgTy);
+  unsigned Alignment = State.getTarget()
+                           .getSubtargetImpl()
+                           ->getDataLayout()
+                           ->getABITypeAlignment(ArgTy);
   unsigned Size =
-    State.getTarget().getDataLayout()->getTypeSizeInBits(ArgTy) / 8;
+      State.getTarget().getSubtargetImpl()->getDataLayout()->getTypeSizeInBits(
+          ArgTy) /
+      8;
 
   unsigned Offset3 = State.AllocateStack(Size, Alignment);
   State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3,

diff --git a/lib/Target/Hexagon/InstPrinter/CMakeLists.txt b/lib/Target/Hexagon/InstPrinter/CMakeLists.txt
deleted file mode 100644
index 1ddaf9b..0000000
--- a/lib/Target/Hexagon/InstPrinter/CMakeLists.txt
+++ /dev/null

@@ -1,3 +0,0 @@
-add_llvm_library(LLVMHexagonAsmPrinter
-  HexagonInstPrinter.cpp
-  )

diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
deleted file mode 100644
index 9942a60..0000000
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ /dev/null

@@ -1,204 +0,0 @@
-//===- HexagonInstPrinter.cpp - Convert Hexagon MCInst to assembly syntax -===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an Hexagon MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "HexagonAsmPrinter.h"
-#include "Hexagon.h"
-#include "HexagonInstPrinter.h"
-#include "MCTargetDesc/HexagonMCInst.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "asm-printer"
-
-#define GET_INSTRUCTION_NAME
-#include "HexagonGenAsmWriter.inc"
-
-const char HexagonInstPrinter::PacketPadding = '\t';
-
-StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const {
-  return MII.getName(Opcode);
-}
-
-StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
-  return getRegisterName(RegNo);
-}
-
-void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                                   StringRef Annot) {
-  printInst((const HexagonMCInst*)(MI), O, Annot);
-}
-
-void HexagonInstPrinter::printInst(const HexagonMCInst *MI, raw_ostream &O,
-                                   StringRef Annot) {
-  const char startPacket = '{',
-             endPacket = '}';
-  // TODO: add outer HW loop when it's supported too.
-  if (MI->getOpcode() == Hexagon::ENDLOOP0) {
-    // Ending a harware loop is different from ending an regular packet.
-    assert(MI->isPacketEnd() && "Loop-end must also end the packet");
-
-    if (MI->isPacketStart()) {
-      // There must be a packet to end a loop.
-      // FIXME: when shuffling is always run, this shouldn't be needed.
-      HexagonMCInst Nop;
-      StringRef NoAnnot;
-
-      Nop.setOpcode (Hexagon::NOP);
-      Nop.setPacketStart (MI->isPacketStart());
-      printInst (&Nop, O, NoAnnot);
-    }
-
-    // Close the packet.
-    if (MI->isPacketEnd())
-      O << PacketPadding << endPacket;
-
-    printInstruction(MI, O);
-  }
-  else {
-    // Prefix the insn opening the packet.
-    if (MI->isPacketStart())
-      O << PacketPadding << startPacket << '\n';
-
-    printInstruction(MI, O);
-
-    // Suffix the insn closing the packet.
-    if (MI->isPacketEnd())
-      // Suffix the packet in a new line always, since the GNU assembler has
-      // issues with a closing brace on the same line as CONST{32,64}.
-      O << '\n' << PacketPadding << endPacket;
-  }
-
-  printAnnotation(O, Annot);
-}
-
-void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                      raw_ostream &O) const {
-  const MCOperand& MO = MI->getOperand(OpNo);
-
-  if (MO.isReg()) {
-    O << getRegisterName(MO.getReg());
-  } else if(MO.isExpr()) {
-    O << *MO.getExpr();
-  } else if(MO.isImm()) {
-    printImmOperand(MI, OpNo, O);
-  } else {
-    llvm_unreachable("Unknown operand");
-  }
-}
-
-void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo,
-                                         raw_ostream &O) const {
-  const MCOperand& MO = MI->getOperand(OpNo);
-
-  if(MO.isExpr()) {
-    O << *MO.getExpr();
-  } else if(MO.isImm()) {
-    O << MI->getOperand(OpNo).getImm();
-  } else {
-    llvm_unreachable("Unknown operand");
-  }
-}
-
-void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo,
-                                         raw_ostream &O) const {
-  const HexagonMCInst *HMCI = static_cast<const HexagonMCInst*>(MI);
-  if (HMCI->isConstExtended())
-    O << "#";
-  printOperand(MI, OpNo, O);
-}
-
-void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI,
-                                    unsigned OpNo, raw_ostream &O) const {
-  O << MI->getOperand(OpNo).getImm();
-}
-
-void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) const {
-  O << -MI->getOperand(OpNo).getImm();
-}
-
-void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) const {
-  O << -1;
-}
-
-void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo,
-                                           raw_ostream &O) const {
-  const MCOperand& MO0 = MI->getOperand(OpNo);
-  const MCOperand& MO1 = MI->getOperand(OpNo + 1);
-
-  O << getRegisterName(MO0.getReg());
-  O << " + #" << MO1.getImm();
-}
-
-void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
-                                                raw_ostream &O) const {
-  const MCOperand& MO0 = MI->getOperand(OpNo);
-  const MCOperand& MO1 = MI->getOperand(OpNo + 1);
-
-  O << getRegisterName(MO0.getReg()) << ", #" << MO1.getImm();
-}
-
-void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) const {
-  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
-
-  printOperand(MI, OpNo, O);
-}
-
-void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo,
-                                        raw_ostream &O) const {
-  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
-
-  printOperand(MI, OpNo, O);
-}
-
-void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo,
-                                           raw_ostream &O) const {
-  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
-
-  printOperand(MI, OpNo, O);
-}
-
-void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) const {
-  // Branches can take an immediate operand.  This is used by the branch
-  // selection pass to print $+8, an eight byte displacement from the PC.
-  llvm_unreachable("Unknown branch operand.");
-}
-
-void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo,
-                                          raw_ostream &O) const {
-}
-
-void HexagonInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) const {
-}
-
-void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
-                                               raw_ostream &O) const {
-}
-
-void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo,
-                                     raw_ostream &O, bool hi) const {
-  assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand");
-
-  O << '#' << (hi ? "HI" : "LO") << "(#";
-  printOperand(MI, OpNo, O);
-  O << ')';
-}

diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
deleted file mode 100644
index 09e3f88..0000000
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
+++ /dev/null

@@ -1,87 +0,0 @@
-//===-- HexagonInstPrinter.h - Convert Hexagon MCInst to assembly syntax --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an Hexagon MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef HEXAGONINSTPRINTER_H
-#define HEXAGONINSTPRINTER_H
-
-#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCInstrInfo.h"
-
-namespace llvm {
-  class HexagonMCInst;
-
-  class HexagonInstPrinter : public MCInstPrinter {
-  public:
-    explicit HexagonInstPrinter(const MCAsmInfo &MAI,
-                                const MCInstrInfo &MII,
-                                const MCRegisterInfo &MRI)
-      : MCInstPrinter(MAI, MII, MRI), MII(MII) {}
-
-    void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
-    void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot);
-    virtual StringRef getOpcodeName(unsigned Opcode) const;
-    void printInstruction(const MCInst *MI, raw_ostream &O);
-    StringRef getRegName(unsigned RegNo) const;
-    static const char *getRegisterName(unsigned RegNo);
-
-    void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
-    void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
-    void printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
-    void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo,
-                                 raw_ostream &O) const;
-    void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
-           const;
-    void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
-           const;
-    void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
-           const;
-    void printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
-                                raw_ostream &O) const;
-    void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
-           const;
-    void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
-           const;
-    void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
-           const;
-    void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
-           const;
-    void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
-           const;
-    void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
-
-    void printConstantPool(const MCInst *MI, unsigned OpNo,
-                           raw_ostream &O) const;
-
-    void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
-      { printSymbol(MI, OpNo, O, true); }
-    void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
-      { printSymbol(MI, OpNo, O, false); }
-
-    const MCInstrInfo &getMII() const {
-      return MII;
-    }
-
-  protected:
-    void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi)
-           const;
-
-    static const char PacketPadding;
-
-  private:
-    const MCInstrInfo &MII;
-
-  };
-
-} // end namespace llvm
-
-#endif

diff --git a/lib/Target/Hexagon/InstPrinter/LLVMBuild.txt b/lib/Target/Hexagon/InstPrinter/LLVMBuild.txt
deleted file mode 100644
index 59849aa..0000000
--- a/lib/Target/Hexagon/InstPrinter/LLVMBuild.txt
+++ /dev/null

@@ -1,23 +0,0 @@
-;===- ./lib/Target/Hexagon/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = HexagonAsmPrinter
-parent = Hexagon
-required_libraries = HexagonDesc MC Support
-add_to_library_groups = Hexagon

diff --git a/lib/Target/Hexagon/InstPrinter/Makefile b/lib/Target/Hexagon/InstPrinter/Makefile
deleted file mode 100644
index 20331d8..0000000
--- a/lib/Target/Hexagon/InstPrinter/Makefile
+++ /dev/null

@@ -1,15 +0,0 @@
-##===- lib/Target/Hexagon/InstPrinter/Makefile ----------------------------===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMHexagonAsmPrinter
-
-# Hack: we need to include 'main' Hexagon target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common

diff --git a/lib/Target/Hexagon/LLVMBuild.txt b/lib/Target/Hexagon/LLVMBuild.txt
index 0cf9a06..6ffd26a 100644
--- a/lib/Target/Hexagon/LLVMBuild.txt
+++ b/lib/Target/Hexagon/LLVMBuild.txt

@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = InstPrinter MCTargetDesc TargetInfo
+subdirectories = Disassembler MCTargetDesc TargetInfo
 
 [component_0]
 type = TargetGroup
@@ -28,5 +28,5 @@
 type = Library
 name = HexagonCodeGen
 parent = Hexagon
-required_libraries = Analysis AsmPrinter CodeGen Core HexagonAsmPrinter HexagonDesc HexagonInfo MC Scalar SelectionDAG Support Target TransformUtils
+required_libraries = Analysis AsmPrinter CodeGen Core HexagonDesc HexagonInfo MC SelectionDAG Support Target
 add_to_library_groups = Hexagon

diff --git a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
index eeef3ef..2a6124e 100644
--- a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt

@@ -1,5 +1,11 @@
 add_llvm_library(LLVMHexagonDesc
+  HexagonAsmBackend.cpp
+  HexagonELFObjectWriter.cpp
+  HexagonInstPrinter.cpp
   HexagonMCAsmInfo.cpp
+  HexagonMCCodeEmitter.cpp
   HexagonMCInst.cpp
   HexagonMCTargetDesc.cpp
   )
+
+add_dependencies(LLVMHexagonDesc HexagonCommonTableGen)

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
new file mode 100644
index 0000000..bdccf88
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp

@@ -0,0 +1,74 @@
+//===-- HexagonAsmBackend.cpp - Hexagon Assembler Backend -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+
+using namespace llvm;
+
+namespace {
+
+class HexagonAsmBackend : public MCAsmBackend {
+public:
+  HexagonAsmBackend(Target const & /*T*/) {}
+
+  unsigned getNumFixupKinds() const override { return 0; }
+
+  void applyFixup(MCFixup const & /*Fixup*/, char * /*Data*/,
+                  unsigned /*DataSize*/, uint64_t /*Value*/,
+                  bool /*IsPCRel*/) const override {
+    return;
+  }
+
+  bool mayNeedRelaxation(MCInst const & /*Inst*/) const override {
+    return false;
+  }
+
+  bool fixupNeedsRelaxation(MCFixup const & /*Fixup*/, uint64_t /*Value*/,
+                            MCRelaxableFragment const * /*DF*/,
+                            MCAsmLayout const & /*Layout*/) const override {
+    llvm_unreachable("fixupNeedsRelaxation() unimplemented");
+  }
+
+  void relaxInstruction(MCInst const & /*Inst*/,
+                        MCInst & /*Res*/) const override {
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+  bool writeNopData(uint64_t /*Count*/,
+                    MCObjectWriter * /*OW*/) const override {
+    return true;
+  }
+};
+} // end anonymous namespace
+
+namespace {
+class ELFHexagonAsmBackend : public HexagonAsmBackend {
+  uint8_t OSABI;
+
+public:
+  ELFHexagonAsmBackend(Target const &T, uint8_t OSABI)
+      : HexagonAsmBackend(T), OSABI(OSABI) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    StringRef CPU("HexagonV4");
+    return createHexagonELFObjectWriter(OS, OSABI, CPU);
+  }
+};
+} // end anonymous namespace
+
+namespace llvm {
+MCAsmBackend *createHexagonAsmBackend(Target const &T,
+                                      MCRegisterInfo const & /*MRI*/,
+                                      StringRef TT, StringRef /*CPU*/) {
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
+  return new ELFHexagonAsmBackend(T, OSABI);
+}
+}

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index f8be77c..c0a3fae 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h

@@ -14,12 +14,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HEXAGONBASEINFO_H
-#define HEXAGONBASEINFO_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
 
 #include "HexagonMCTargetDesc.h"
 #include "llvm/Support/ErrorHandling.h"
 
+#include <stdint.h>
+
 namespace llvm {
 
 /// HexagonII - This namespace holds all of the target specific flags that
@@ -189,6 +191,15 @@
     MO_GPREL
   };
 
+  enum class InstParseBits : uint32_t {
+    INST_PARSE_MASK       = 0x0000c000,
+    INST_PARSE_PACKET_END = 0x0000c000,
+    INST_PARSE_LOOP_END   = 0x00008000,
+    INST_PARSE_NOT_END    = 0x00004000,
+    INST_PARSE_DUPLEX     = 0x00000000,
+    INST_PARSE_EXTENDER   = 0x00000000
+  };
+
 } // End namespace HexagonII.
 
 } // End namespace llvm.

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
new file mode 100644
index 0000000..56c9dc7
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp

@@ -0,0 +1,62 @@
+//===-- HexagonELFObjectWriter.cpp - Hexagon Target Descriptions ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "hexagon-elf-writer"
+
+using namespace llvm;
+using namespace Hexagon;
+
+namespace {
+
+class HexagonELFObjectWriter : public MCELFObjectTargetWriter {
+private:
+  StringRef CPU;
+
+public:
+  HexagonELFObjectWriter(uint8_t OSABI, StringRef C);
+
+  virtual unsigned GetRelocType(MCValue const &Target, MCFixup const &Fixup,
+                                bool IsPCRel) const override;
+};
+}
+
+HexagonELFObjectWriter::HexagonELFObjectWriter(uint8_t OSABI, StringRef C)
+    : MCELFObjectTargetWriter(/*Is64bit*/ false, OSABI, ELF::EM_HEXAGON,
+                              /*HasRelocationAddend*/ true),
+      CPU(C) {}
+
+unsigned HexagonELFObjectWriter::GetRelocType(MCValue const &/*Target*/,
+                                              MCFixup const &Fixup,
+                                              bool IsPCRel) const {
+  unsigned Type = (unsigned)ELF::R_HEX_NONE;
+  llvm::MCFixupKind Kind = Fixup.getKind();
+
+  switch (Kind) {
+  default:
+    DEBUG(dbgs() << "unrecognized relocation " << Fixup.getKind() << "\n");
+    llvm_unreachable("Unimplemented Fixup kind!");
+    break;
+  case FK_Data_4:
+    Type = (IsPCRel) ? ELF::R_HEX_32_PCREL : ELF::R_HEX_32;
+    break;
+  }
+  return Type;
+}
+
+MCObjectWriter *llvm::createHexagonELFObjectWriter(raw_ostream &OS,
+                                                   uint8_t OSABI,
+                                                   StringRef CPU) {
+  MCELFObjectTargetWriter *MOTW = new HexagonELFObjectWriter(OSABI, CPU);
+  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian*/ true);
+}
\ No newline at end of file

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
new file mode 100644
index 0000000..1fd8d70
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp

@@ -0,0 +1,254 @@
+//===- HexagonInstPrinter.cpp - Convert Hexagon MCInst to assembly syntax -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Hexagon MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonAsmPrinter.h"
+#include "Hexagon.h"
+#include "HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+#define GET_INSTRUCTION_NAME
+#include "HexagonGenAsmWriter.inc"
+
+const char HexagonInstPrinter::PacketPadding = '\t';
+// Return the minimum value that a constant extendable operand can have
+// without being extended.
+static int getMinValue(uint64_t TSFlags) {
+  unsigned isSigned =
+      (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  unsigned bits =
+      (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  if (isSigned)
+    return -1U << (bits - 1);
+
+  return 0;
+}
+
+// Return the maximum value that a constant extendable operand can have
+// without being extended.
+static int getMaxValue(uint64_t TSFlags) {
+  unsigned isSigned =
+      (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
+  unsigned bits =
+      (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+
+  if (isSigned)
+    return ~(-1U << (bits - 1));
+
+  return ~(-1U << bits);
+}
+
+// Return true if the instruction must be extended.
+static bool isExtended(uint64_t TSFlags) {
+  return (TSFlags >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+}
+
+// Currently just used in an assert statement
+static bool isExtendable(uint64_t TSFlags) LLVM_ATTRIBUTE_UNUSED;
+// Return true if the instruction may be extended based on the operand value.
+static bool isExtendable(uint64_t TSFlags) {
+  return (TSFlags >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+}
+
+StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const {
+  return MII.getName(Opcode);
+}
+
+StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
+  return getRegisterName(RegNo);
+}
+
+void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  printInst((const HexagonMCInst*)(MI), O, Annot);
+}
+
+void HexagonInstPrinter::printInst(const HexagonMCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  const char startPacket = '{',
+             endPacket = '}';
+  // TODO: add outer HW loop when it's supported too.
+  if (MI->getOpcode() == Hexagon::ENDLOOP0) {
+    // Ending a hardware loop is different from ending a regular packet.
+    assert(MI->isPacketEnd() && "Loop-end must also end the packet");
+
+    if (MI->isPacketStart()) {
+      // There must be a packet to end a loop.
+      // FIXME: when shuffling is always run, this shouldn't be needed.
+      HexagonMCInst Nop;
+      StringRef NoAnnot;
+
+      Nop.setOpcode (Hexagon::NOP);
+      Nop.setPacketStart (MI->isPacketStart());
+      printInst (&Nop, O, NoAnnot);
+    }
+
+    // Close the packet.
+    if (MI->isPacketEnd())
+      O << PacketPadding << endPacket;
+
+    printInstruction(MI, O);
+  }
+  else {
+    // Prefix the insn opening the packet.
+    if (MI->isPacketStart())
+      O << PacketPadding << startPacket << '\n';
+
+    printInstruction(MI, O);
+
+    // Suffix the insn closing the packet.
+    if (MI->isPacketEnd())
+      // Suffix the packet in a new line always, since the GNU assembler has
+      // issues with a closing brace on the same line as CONST{32,64}.
+      O << '\n' << PacketPadding << endPacket;
+  }
+
+  printAnnotation(O, Annot);
+}
+
+void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) const {
+  const MCOperand& MO = MI->getOperand(OpNo);
+
+  if (MO.isReg()) {
+    O << getRegisterName(MO.getReg());
+  } else if(MO.isExpr()) {
+    O << *MO.getExpr();
+  } else if(MO.isImm()) {
+    printImmOperand(MI, OpNo, O);
+  } else {
+    llvm_unreachable("Unknown operand");
+  }
+}
+
+void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) const {
+  const MCOperand& MO = MI->getOperand(OpNo);
+
+  if(MO.isExpr()) {
+    O << *MO.getExpr();
+  } else if(MO.isImm()) {
+    O << MI->getOperand(OpNo).getImm();
+  } else {
+    llvm_unreachable("Unknown operand");
+  }
+}
+
+void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) const {
+  const MCOperand &MO = MI->getOperand(OpNo);
+  const MCInstrDesc &MII = getMII().get(MI->getOpcode());
+
+  assert((isExtendable(MII.TSFlags) || isExtended(MII.TSFlags)) &&
+         "Expecting an extendable operand");
+
+  if (MO.isExpr() || isExtended(MII.TSFlags)) {
+    O << "#";
+  } else if (MO.isImm()) {
+    int ImmValue = MO.getImm();
+    if (ImmValue < getMinValue(MII.TSFlags) ||
+        ImmValue > getMaxValue(MII.TSFlags))
+      O << "#";
+  }
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI,
+                                    unsigned OpNo, raw_ostream &O) const {
+  O << MI->getOperand(OpNo).getImm();
+}
+
+void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  O << -MI->getOperand(OpNo).getImm();
+}
+
+void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  O << -1;
+}
+
+void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) const {
+  const MCOperand& MO0 = MI->getOperand(OpNo);
+  const MCOperand& MO1 = MI->getOperand(OpNo + 1);
+
+  O << getRegisterName(MO0.getReg());
+  O << " + #" << MO1.getImm();
+}
+
+void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) const {
+  const MCOperand& MO0 = MI->getOperand(OpNo);
+  const MCOperand& MO1 = MI->getOperand(OpNo + 1);
+
+  O << getRegisterName(MO0.getReg()) << ", #" << MO1.getImm();
+}
+
+void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  // Branches can take an immediate operand.  This is used by the branch
+  // selection pass to print $+8, an eight byte displacement from the PC.
+  llvm_unreachable("Unknown branch operand.");
+}
+
+void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) const {
+}
+
+void HexagonInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+}
+
+void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                                               raw_ostream &O) const {
+}
+
+void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O, bool hi) const {
+  assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand");
+
+  O << '#' << (hi ? "HI" : "LO") << "(#";
+  printOperand(MI, OpNo, O);
+  O << ')';
+}

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
new file mode 100644
index 0000000..55ae95c
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h

@@ -0,0 +1,87 @@
+//===-- HexagonInstPrinter.h - Convert Hexagon MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Hexagon MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H
+#define LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+namespace llvm {
+  class HexagonMCInst;
+
+  class HexagonInstPrinter : public MCInstPrinter {
+  public:
+    explicit HexagonInstPrinter(const MCAsmInfo &MAI,
+                                const MCInstrInfo &MII,
+                                const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI), MII(MII) {}
+
+    void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+    void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot);
+    virtual StringRef getOpcodeName(unsigned Opcode) const;
+    void printInstruction(const MCInst *MI, raw_ostream &O);
+    StringRef getRegName(unsigned RegNo) const;
+    static const char *getRegisterName(unsigned RegNo);
+
+    void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+    void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+    void printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+    void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) const;
+    void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O) const;
+    void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+
+    void printConstantPool(const MCInst *MI, unsigned OpNo,
+                           raw_ostream &O) const;
+
+    void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
+      { printSymbol(MI, OpNo, O, true); }
+    void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
+      { printSymbol(MI, OpNo, O, false); }
+
+    const MCInstrInfo &getMII() const {
+      return MII;
+    }
+
+  protected:
+    void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi)
+           const;
+
+    static const char PacketPadding;
+
+  private:
+    const MCInstrInfo &MII;
+
+  };
+
+} // end namespace llvm
+
+#endif

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 141e514..ad5e0fb 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp

@@ -24,7 +24,6 @@
   Data64bitsDirective = nullptr;  // .xword is only supported by V9.
   ZeroDirective = "\t.skip\t";
   CommentString = "//";
-  HasLEB128 = true;
 
   LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
   InlineAsmStart = "# InlineAsm Start";

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index 953d804..ab18f0b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HexagonMCASMINFO_H
-#define HexagonMCASMINFO_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCASMINFO_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCASMINFO_H
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmInfoELF.h"

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
new file mode 100644
index 0000000..4471977
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp

@@ -0,0 +1,88 @@
+//===-- HexagonMCCodeEmitter.cpp - Hexagon Target Descriptions ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCCodeEmitter.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+using namespace llvm;
+using namespace Hexagon;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+/// \brief 10.6 Instruction Packets
+/// Possible values for instruction packet parse field.
+enum class ParseField {
+  duplex = 0x0,
+  last0 = 0x1,
+  last1 = 0x2,
+  end = 0x3
+};
+/// \brief Returns the packet bits based on instruction position.
+///
+/// Picks ParseField::end when \p HMI reports isPacketEnd() and
+/// ParseField::last0 otherwise, then shifts that value left by 14 bits.
+uint32_t getPacketBits(HexagonMCInst const &HMI) {
+  unsigned const ParseFieldOffset = 14;
+  ParseField Field = ParseField::last0;
+  if (HMI.isPacketEnd())
+    Field = ParseField::end;
+  return static_cast<uint32_t>(Field) << ParseFieldOffset;
+}
+/// \brief Write the low 32 bits of \p Binary to \p OS as four bytes, least
+/// significant byte first. Bits 32 and above of \p Binary are not emitted.
+void emitLittleEndian(uint64_t Binary, raw_ostream &OS) {
+  for (unsigned Shift = 0; Shift != 32; Shift += 8)
+    OS << static_cast<uint8_t>((Binary >> Shift) & 0xff);
+}
+}
+
+/// Construct an emitter that keeps references to \p aMST and \p aMCT.
+/// \p aMII is accepted but not stored by this constructor.
+HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
+                                           MCSubtargetInfo const &aMST,
+                                           MCContext &aMCT)
+    : MST(aMST),
+      MCT(aMCT) {
+}
+
+/// Encode \p MI as one 32-bit word and write it to \p OS.
+///
+/// \p MI is treated as a HexagonMCInst. The emitted word is the result of
+/// getBinaryCodeForInstr OR'ed with the packet parse bits, written least
+/// significant byte first. Increments the MCNumEmitted statistic.
+void HexagonMCCodeEmitter::EncodeInstruction(MCInst const &MI, raw_ostream &OS,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             MCSubtargetInfo const &STI) const {
+  HexagonMCInst const &HMI = static_cast<HexagonMCInst const &>(MI);
+  uint64_t const Encoding = getBinaryCodeForInstr(HMI, Fixups, STI);
+  uint64_t const Word = Encoding | getPacketBits(HMI);
+  assert(HMI.getDesc().getSize() == 4 && "All instructions should be 32bit");
+  emitLittleEndian(Word, OS);
+  ++MCNumEmitted;
+}
+
+/// Return the binary encoding of operand \p MO.
+///
+/// A register operand yields the encoding value reported by the context's
+/// register info; an immediate operand yields the immediate converted to
+/// unsigned. Any other operand kind reaches llvm_unreachable.
+unsigned
+HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        MCSubtargetInfo const &STI) const {
+  // Reject unsupported operand kinds up front.
+  if (!MO.isReg() && !MO.isImm())
+    llvm_unreachable("Only Immediates and Registers implemented right now");
+
+  if (MO.isReg())
+    return MCT.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  return static_cast<unsigned>(MO.getImm());
+}
+
+/// Accessor for the subtarget info reference captured at construction (MST).
+MCSubtargetInfo const &HexagonMCCodeEmitter::getSubtargetInfo() const {
+  return MST;
+}
+
+/// Factory for the Hexagon MC code emitter: returns a newly allocated
+/// HexagonMCCodeEmitter built from \p MII, \p MST and \p MCT. \p MRI is not
+/// used.
+MCCodeEmitter *llvm::createHexagonMCCodeEmitter(MCInstrInfo const &MII,
+                                                MCRegisterInfo const &MRI,
+                                                MCSubtargetInfo const &MST,
+                                                MCContext &MCT) {
+  MCCodeEmitter *Emitter = new HexagonMCCodeEmitter(MII, MST, MCT);
+  return Emitter;
+}
+
+#include "HexagonGenMCCodeEmitter.inc"

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
new file mode 100644
index 0000000..96048ad
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h

@@ -0,0 +1,60 @@
+//===-- HexagonMCCodeEmitter.h - Hexagon Target Descriptions ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Definition for classes that emit Hexagon machine code from MCInsts
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// \brief MCCodeEmitter implementation for the Hexagon target.
+///
+/// Holds references to the subtarget info and MC context supplied at
+/// construction. Copy construction and copy assignment are disabled.
+class HexagonMCCodeEmitter : public MCCodeEmitter {
+  MCSubtargetInfo const &MST;
+  MCContext &MCT;
+
+public:
+  /// \brief Construct an emitter from the instruction info, subtarget info and
+  /// MC context.
+  HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCSubtargetInfo const &aMST,
+                       MCContext &aMCT);
+
+  /// \brief Return the subtarget info this emitter was constructed with.
+  MCSubtargetInfo const &getSubtargetInfo() const;
+
+  /// \brief Encode \p MI and write the resulting bytes to \p OS.
+  void EncodeInstruction(MCInst const &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         MCSubtargetInfo const &STI) const override;
+
+  /// \brief TableGen'erated function for getting the
+  /// binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(MCInst const &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 MCSubtargetInfo const &STI) const;
+
+  /// \brief Return binary encoding of operand.
+  unsigned getMachineOpValue(MCInst const &MI, MCOperand const &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             MCSubtargetInfo const &STI) const;
+
+private:
+  // Not copyable: declared but deleted.
+  HexagonMCCodeEmitter(HexagonMCCodeEmitter const &) LLVM_DELETED_FUNCTION;
+  void operator=(HexagonMCCodeEmitter const &) LLVM_DELETED_FUNCTION;
+}; // class HexagonMCCodeEmitter
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCCODEEMITTER_H

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
index 9260b4a..c842b9b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp

@@ -20,8 +20,9 @@
 
 // Return the slots used by the insn.
 unsigned HexagonMCInst::getUnits(const HexagonTargetMachine* TM) const {
-  const HexagonInstrInfo* QII = TM->getInstrInfo();
-  const InstrItineraryData* II = TM->getInstrItineraryData();
+  const HexagonInstrInfo *QII = TM->getSubtargetImpl()->getInstrInfo();
+  const InstrItineraryData *II =
+      TM->getSubtargetImpl()->getInstrItineraryData();
   const InstrStage*
     IS = II->beginStage(QII->get(this->getOpcode()).getSchedClass());
 
@@ -154,7 +155,7 @@
                     & HexagonII::ExtentBitsMask;
 
   if (isSigned) // if value is signed
-    return -1 << (bits - 1);
+    return -1U << (bits - 1);
   else
     return 0;
 }
@@ -169,7 +170,7 @@
                     & HexagonII::ExtentBitsMask;
 
   if (isSigned) // if value is signed
-    return ~(-1 << (bits - 1));
+    return ~(-1U << (bits - 1));
   else
-    return ~(-1 << bits);
+    return ~(-1U << bits);
 }

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
index 3c52d45..90fbbf3 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HEXAGONMCINST_H
-#define HEXAGONMCINST_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINST_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINST_H
 
 #include "HexagonTargetMachine.h"
 #include "llvm/MC/MCInst.h"

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 581674d..14ddd9d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp

@@ -13,8 +13,9 @@
 
 #include "HexagonMCTargetDesc.h"
 #include "HexagonMCAsmInfo.h"
-#include "InstPrinter/HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonInstPrinter.h"
 #include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
@@ -46,9 +47,17 @@
   return X;
 }
 
-static MCSubtargetInfo *createHexagonMCSubtargetInfo(StringRef TT,
-                                                     StringRef CPU,
-                                                     StringRef FS) {
+/// Create the ELF object streamer used for Hexagon.
+///
+/// \param Context MC context handed to the streamer.
+/// \param MAB     Assembler backend handed to the streamer.
+/// \param OS      Output stream handed to the streamer.
+/// \param CE      Code emitter handed to the streamer.
+/// \param RelaxAll When true, the streamer's assembler is told to relax all
+///                 instructions via setRelaxAll().
+/// \returns the newly allocated streamer.
+static MCStreamer *
+createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                         raw_ostream &OS, MCCodeEmitter *CE,
+                         bool RelaxAll) {
+  MCELFStreamer *ES = new MCELFStreamer(Context, MAB, OS, CE);
+  // Honor the caller's request instead of ignoring the parameter.
+  if (RelaxAll)
+    ES->getAssembler().setRelaxAll(true);
+  return ES;
+}
+
+
+static MCSubtargetInfo *
+createHexagonMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) {
   MCSubtargetInfo *X = new MCSubtargetInfo();
   InitHexagonMCSubtargetInfo(X, TT, CPU, FS);
   return X;
@@ -59,22 +68,40 @@
   MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
 
   // VirtualFP = (R30 + #0).
-  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
-      nullptr, Hexagon::R30, 0);
+  MCCFIInstruction Inst =
+      MCCFIInstruction::createDefCfa(nullptr, Hexagon::R30, 0);
   MAI->addInitialFrameState(Inst);
 
   return MAI;
 }
 
+/// Build the object streamer registered for Hexagon: an ELF streamer obtained
+/// from createHexagonELFStreamer with a generic MCTargetStreamer constructed
+/// on top of it. \p T, \p TT and \p STI are not used.
+static MCStreamer *createMCStreamer(Target const &T, StringRef TT,
+                                    MCContext &Context, MCAsmBackend &MAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    MCSubtargetInfo const &STI, bool RelaxAll) {
+  MCStreamer *Streamer =
+      createHexagonELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
+  // NOTE(review): the pointer returned by new is not kept here; presumably the
+  // MCTargetStreamer constructor attaches it to *Streamer -- confirm ownership.
+  new MCTargetStreamer(*Streamer);
+  return Streamer;
+}
+
+
 static MCCodeGenInfo *createHexagonMCCodeGenInfo(StringRef TT, Reloc::Model RM,
-                                             CodeModel::Model CM,
-                                             CodeGenOpt::Level OL) {
+                                                 CodeModel::Model CM,
+                                                 CodeGenOpt::Level OL) {
   MCCodeGenInfo *X = new MCCodeGenInfo();
   // For the time being, use static relocations, since there's really no
   // support for PIC yet.
   X->InitMCCodeGenInfo(Reloc::Static, CM, OL);
   return X;
 }
+/// Factory for the Hexagon instruction printer. Only \p MAI, \p MII and
+/// \p MRI are passed on; \p T, \p SyntaxVariant and \p STI are not used.
+static MCInstPrinter *createHexagonMCInstPrinter(const Target &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI,
+                                                 const MCSubtargetInfo &STI) {
+  MCInstPrinter *Printer = new HexagonInstPrinter(MAI, MII, MRI);
+  return Printer;
+}
 
 // Force static initialization.
 extern "C" void LLVMInitializeHexagonTargetMC() {
@@ -86,7 +113,8 @@
                                         createHexagonMCCodeGenInfo);
 
   // Register the MC instruction info.
-  TargetRegistry::RegisterMCInstrInfo(TheHexagonTarget, createHexagonMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheHexagonTarget,
+                                      createHexagonMCInstrInfo);
 
   // Register the MC register info.
   TargetRegistry::RegisterMCRegInfo(TheHexagonTarget,
@@ -95,4 +123,19 @@
   // Register the MC subtarget info.
   TargetRegistry::RegisterMCSubtargetInfo(TheHexagonTarget,
                                           createHexagonMCSubtargetInfo);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheHexagonTarget,
+                                        createHexagonMCCodeEmitter);
+
+  // Register the MC Inst Printer
+  TargetRegistry::RegisterMCInstPrinter(TheHexagonTarget,
+                                        createHexagonMCInstPrinter);
+
+  // Register the asm backend
+  TargetRegistry::RegisterMCAsmBackend(TheHexagonTarget,
+                                       createHexagonAsmBackend);
+
+  // Register the obj streamer
+  TargetRegistry::RegisterMCObjectStreamer(TheHexagonTarget, createMCStreamer);
 }

diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index 2238b1a..02fd516 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h

@@ -11,15 +11,37 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef HEXAGONMCTARGETDESC_H
-#define HEXAGONMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
+#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
+
+#include <cstdint>
 
 namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
 class MCSubtargetInfo;
 class Target;
+class StringRef;
+class raw_ostream;
 
 extern Target TheHexagonTarget;
 
+MCCodeEmitter *createHexagonMCCodeEmitter(MCInstrInfo const &MCII,
+                                          MCRegisterInfo const &MRI,
+                                          MCSubtargetInfo const &MST,
+                                          MCContext &MCT);
+
+MCAsmBackend *createHexagonAsmBackend(Target const &T,
+                                      MCRegisterInfo const &MRI, StringRef TT,
+                                      StringRef CPU);
+
+MCObjectWriter *createHexagonELFObjectWriter(raw_ostream &OS, uint8_t OSABI,
+                                             StringRef CPU);
+
 } // End llvm namespace
 
 // Define symbolic names for Hexagon registers.  This defines a mapping from

diff --git a/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt b/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt
index 73c7e01..f559a21 100644
--- a/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/Hexagon/MCTargetDesc/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = HexagonDesc
 parent = Hexagon
-required_libraries = HexagonInfo MC
+required_libraries = HexagonInfo MC Support
 add_to_library_groups = Hexagon

diff --git a/lib/Target/Hexagon/Makefile b/lib/Target/Hexagon/Makefile
index dc387c5..329c9d3 100644
--- a/lib/Target/Hexagon/Makefile
+++ b/lib/Target/Hexagon/Makefile

@@ -14,10 +14,12 @@
 BUILT_SOURCES = HexagonGenRegisterInfo.inc \
                 HexagonGenInstrInfo.inc  \
                 HexagonGenAsmWriter.inc \
-                HexagonGenDAGISel.inc HexagonGenSubtargetInfo.inc \
-                HexagonGenCallingConv.inc \
-                HexagonGenDFAPacketizer.inc
-
-DIRS = InstPrinter TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
+                HexagonGenDAGISel.inc HexagonGenSubtargetInfo.inc \

+                HexagonGenCallingConv.inc \

+                HexagonGenDFAPacketizer.inc \

+                HexagonGenMCCodeEmitter.inc \

+                HexagonGenDisassemblerTables.inc

+

+DIRS = TargetInfo MCTargetDesc Disassembler

+

+include $(LEVEL)/Makefile.common


diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
index 5afbd20..7fae505 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MSP430INSTPRINTER_H
-#define MSP430INSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H
+#define LLVM_LIB_TARGET_MSP430_INSTPRINTER_MSP430INSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
 

diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
index ef805bb..2c9532d 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MSP430TARGETASMINFO_H
-#define MSP430TARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCASMINFO_H
+#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoELF.h"
 

diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index 72adb45..4c70803 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp

@@ -39,7 +39,7 @@
 
 static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) {
   MCRegisterInfo *X = new MCRegisterInfo();
-  InitMSP430MCRegisterInfo(X, MSP430::PCW);
+  InitMSP430MCRegisterInfo(X, MSP430::PC);
   return X;
 }
 

diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
index 7f3505c..586f5d9 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MSP430MCTARGETDESC_H
-#define MSP430MCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H
+#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H
 
 namespace llvm {
 class Target;

diff --git a/lib/Target/MSP430/MSP430.h b/lib/Target/MSP430/MSP430.h
index 4574ce5..796f252 100644
--- a/lib/Target/MSP430/MSP430.h
+++ b/lib/Target/MSP430/MSP430.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_MSP430_H
-#define LLVM_TARGET_MSP430_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430_H
+#define LLVM_LIB_TARGET_MSP430_MSP430_H
 
 #include "MCTargetDesc/MSP430MCTargetDesc.h"
 #include "llvm/Target/TargetMachine.h"

diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp
index a96930a..ffcf222 100644
--- a/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/lib/Target/MSP430/MSP430BranchSelector.cpp

@@ -17,6 +17,7 @@
 
 #include "MSP430.h"
 #include "MSP430InstrInfo.h"
+#include "MSP430Subtarget.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -54,7 +55,7 @@
 
 bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) {
   const MSP430InstrInfo *TII =
-             static_cast<const MSP430InstrInfo*>(Fn.getTarget().getInstrInfo());
+      static_cast<const MSP430InstrInfo *>(Fn.getSubtarget().getInstrInfo());
   // Give the blocks of the function a dense, in-order, numbering.
   Fn.RenumberBlocks();
   BlockSizes.resize(Fn.getNumBlockIDs());

diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td
index 8a69d1e..b38f578 100644
--- a/lib/Target/MSP430/MSP430CallingConv.td
+++ b/lib/Target/MSP430/MSP430CallingConv.td

@@ -17,7 +17,7 @@
   CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
 
   // i16 are returned in registers R15, R14, R13, R12
-  CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>
+  CCIfType<[i16], CCAssignToReg<[R15, R14, R13, R12]>>
 ]>;
 
 //===----------------------------------------------------------------------===//

diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index 82c8b29..d6cb9f6 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp

@@ -14,6 +14,7 @@
 #include "MSP430FrameLowering.h"
 #include "MSP430InstrInfo.h"
 #include "MSP430MachineFunctionInfo.h"
+#include "MSP430Subtarget.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -43,7 +44,7 @@
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
   const MSP430InstrInfo &TII =
-    *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -62,18 +63,18 @@
     // Update the frame offset adjustment.
     MFI->setOffsetAdjustment(-NumBytes);
 
-    // Save FPW into the appropriate stack slot...
+    // Save FP into the appropriate stack slot...
     BuildMI(MBB, MBBI, DL, TII.get(MSP430::PUSH16r))
-      .addReg(MSP430::FPW, RegState::Kill);
+      .addReg(MSP430::FP, RegState::Kill);
 
-    // Update FPW with the new base value...
-    BuildMI(MBB, MBBI, DL, TII.get(MSP430::MOV16rr), MSP430::FPW)
-      .addReg(MSP430::SPW);
+    // Update FP with the new base value...
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::MOV16rr), MSP430::FP)
+      .addReg(MSP430::SP);
 
     // Mark the FramePtr as live-in in every block except the entry.
     for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
          I != E; ++I)
-      I->addLiveIn(MSP430::FPW);
+      I->addLiveIn(MSP430::FP);
 
   } else
     NumBytes = StackSize - MSP430FI->getCalleeSavedFrameSize();
@@ -85,18 +86,18 @@
   if (MBBI != MBB.end())
     DL = MBBI->getDebugLoc();
 
-  if (NumBytes) { // adjust stack pointer: SPW -= numbytes
-    // If there is an SUB16ri of SPW immediately before this instruction, merge
+  if (NumBytes) { // adjust stack pointer: SP -= numbytes
+    // If there is an SUB16ri of SP immediately before this instruction, merge
     // the two.
     //NumBytes -= mergeSPUpdates(MBB, MBBI, true);
-    // If there is an ADD16ri or SUB16ri of SPW immediately after this
+    // If there is an ADD16ri or SUB16ri of SP immediately after this
     // instruction, merge the two instructions.
     // mergeSPUpdatesDown(MBB, MBBI, &NumBytes);
 
     if (NumBytes) {
       MachineInstr *MI =
-        BuildMI(MBB, MBBI, DL, TII.get(MSP430::SUB16ri), MSP430::SPW)
-        .addReg(MSP430::SPW).addImm(NumBytes);
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::SUB16ri), MSP430::SP)
+        .addReg(MSP430::SP).addImm(NumBytes);
       // The SRW implicit def is dead.
       MI->getOperand(3).setIsDead();
     }
@@ -108,7 +109,7 @@
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
   const MSP430InstrInfo &TII =
-    *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   unsigned RetOpcode = MBBI->getOpcode();
@@ -131,8 +132,8 @@
     uint64_t FrameSize = StackSize - 2;
     NumBytes = FrameSize - CSSize;
 
-    // pop FPW.
-    BuildMI(MBB, MBBI, DL, TII.get(MSP430::POP16r), MSP430::FPW);
+    // pop FP.
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::POP16r), MSP430::FP);
   } else
     NumBytes = StackSize - CSSize;
 
@@ -147,28 +148,28 @@
 
   DL = MBBI->getDebugLoc();
 
-  // If there is an ADD16ri or SUB16ri of SPW immediately before this
+  // If there is an ADD16ri or SUB16ri of SP immediately before this
   // instruction, merge the two instructions.
   //if (NumBytes || MFI->hasVarSizedObjects())
   //  mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
 
   if (MFI->hasVarSizedObjects()) {
     BuildMI(MBB, MBBI, DL,
-            TII.get(MSP430::MOV16rr), MSP430::SPW).addReg(MSP430::FPW);
+            TII.get(MSP430::MOV16rr), MSP430::SP).addReg(MSP430::FP);
     if (CSSize) {
       MachineInstr *MI =
         BuildMI(MBB, MBBI, DL,
-                TII.get(MSP430::SUB16ri), MSP430::SPW)
-        .addReg(MSP430::SPW).addImm(CSSize);
+                TII.get(MSP430::SUB16ri), MSP430::SP)
+        .addReg(MSP430::SP).addImm(CSSize);
       // The SRW implicit def is dead.
       MI->getOperand(3).setIsDead();
     }
   } else {
-    // adjust stack pointer back: SPW += numbytes
+    // adjust stack pointer back: SP += numbytes
     if (NumBytes) {
       MachineInstr *MI =
-        BuildMI(MBB, MBBI, DL, TII.get(MSP430::ADD16ri), MSP430::SPW)
-        .addReg(MSP430::SPW).addImm(NumBytes);
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::ADD16ri), MSP430::SP)
+        .addReg(MSP430::SP).addImm(NumBytes);
       // The SRW implicit def is dead.
       MI->getOperand(3).setIsDead();
     }
@@ -188,7 +189,7 @@
   if (MI != MBB.end()) DL = MI->getDebugLoc();
 
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MSP430MachineFunctionInfo *MFI = MF.getInfo<MSP430MachineFunctionInfo>();
   MFI->setCalleeSavedFrameSize(CSI.size() * 2);
 
@@ -214,7 +215,7 @@
   if (MI != MBB.end()) DL = MI->getDebugLoc();
 
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   for (unsigned i = 0, e = CSI.size(); i != e; ++i)
     BuildMI(MBB, MI, DL, TII.get(MSP430::POP16r), CSI[i].getReg());
@@ -226,13 +227,13 @@
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   const MSP430InstrInfo &TII =
-    *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
   unsigned StackAlign = getStackAlignment();
 
   if (!hasReservedCallFrame(MF)) {
     // If the stack pointer can be changed after prologue, turn the
-    // adjcallstackup instruction into a 'sub SPW, <amt>' and the
-    // adjcallstackdown instruction into 'add SPW, <amt>'
+    // adjcallstackup instruction into a 'sub SP, <amt>' and the
+    // adjcallstackdown instruction into 'add SP, <amt>'
     // TODO: consider using push / pop instead of sub + store / add
     MachineInstr *Old = I;
     uint64_t Amount = Old->getOperand(0).getImm();
@@ -245,8 +246,8 @@
       MachineInstr *New = nullptr;
       if (Old->getOpcode() == TII.getCallFrameSetupOpcode()) {
         New = BuildMI(MF, Old->getDebugLoc(),
-                      TII.get(MSP430::SUB16ri), MSP430::SPW)
-          .addReg(MSP430::SPW).addImm(Amount);
+                      TII.get(MSP430::SUB16ri), MSP430::SP)
+          .addReg(MSP430::SP).addImm(Amount);
       } else {
         assert(Old->getOpcode() == TII.getCallFrameDestroyOpcode());
         // factor out the amount the callee already popped.
@@ -254,8 +255,8 @@
         Amount -= CalleeAmt;
         if (Amount)
           New = BuildMI(MF, Old->getDebugLoc(),
-                        TII.get(MSP430::ADD16ri), MSP430::SPW)
-            .addReg(MSP430::SPW).addImm(Amount);
+                        TII.get(MSP430::ADD16ri), MSP430::SP)
+            .addReg(MSP430::SP).addImm(Amount);
       }
 
       if (New) {
@@ -273,7 +274,7 @@
       MachineInstr *Old = I;
       MachineInstr *New =
         BuildMI(MF, Old->getDebugLoc(), TII.get(MSP430::SUB16ri),
-                MSP430::SPW).addReg(MSP430::SPW).addImm(CalleeAmt);
+                MSP430::SP).addReg(MSP430::SP).addImm(CalleeAmt);
       // The SRW implicit def is dead.
       New->getOperand(3).setIsDead();
 
@@ -287,11 +288,11 @@
 void
 MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
                                                          RegScavenger *) const {
-  // Create a frame entry for the FPW register that must be saved.
+  // Create a frame entry for the FP register that must be saved.
   if (hasFP(MF)) {
     int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true);
     (void)FrameIdx;
     assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
-           "Slot for FPW register must be last in order to be found!");
+           "Slot for FP register must be last in order to be found!");
   }
 }

diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index fadfeed..1941af2 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MSP430_FRAMEINFO_H
-#define MSP430_FRAMEINFO_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H
+#define LLVM_LIB_TARGET_MSP430_MSP430FRAMELOWERING_H
 
 #include "MSP430.h"
 #include "llvm/Target/TargetFrameLowering.h"

diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index a9b9035..81c176b 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp

@@ -97,9 +97,9 @@
 
   public:
     MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
-      : SelectionDAGISel(TM, OptLevel),
-        Lowering(*TM.getTargetLowering()),
-        Subtarget(*TM.getSubtargetImpl()) { }
+        : SelectionDAGISel(TM, OptLevel),
+          Lowering(*TM.getSubtargetImpl()->getTargetLowering()),
+          Subtarget(*TM.getSubtargetImpl()) {}
 
     const char *getPassName() const override {
       return "MSP430 DAG->DAG Pattern Instruction Selection";

diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 3d3ee92..22936dd 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp

@@ -58,7 +58,7 @@
              clEnumValEnd));
 
 MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM)
-    : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+    : TargetLowering(TM) {
 
   // Set up the register classes.
   addRegisterClass(MVT::i8,  &MSP430::GR8RegClass);
@@ -72,7 +72,7 @@
   // Division is expensive
   setIntDivIsCheap(false);
 
-  setStackPointerRegisterToSaveRestore(MSP430::SPW);
+  setStackPointerRegisterToSaveRestore(MSP430::SP);
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
 
@@ -282,7 +282,7 @@
                              SmallVectorImpl<CCValAssign> &ArgLocs,
                              const SmallVectorImpl<ArgT> &Args) {
   static const MCPhysReg RegList[] = {
-    MSP430::R15W, MSP430::R14W, MSP430::R13W, MSP430::R12W
+    MSP430::R15, MSP430::R14, MSP430::R13, MSP430::R12
   };
   static const unsigned NbRegs = array_lengthof(RegList);
 
@@ -437,8 +437,8 @@
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
   AnalyzeArguments(CCInfo, ArgLocs, Ins);
 
   // Create frame index for the start of the first vararg value
@@ -533,8 +533,8 @@
     report_fatal_error("ISRs cannot return any value");
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   // Analize return values.
   AnalyzeReturnValues(CCInfo, RVLocs, Outs);
@@ -583,8 +583,8 @@
                                      SmallVectorImpl<SDValue> &InVals) const {
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
   AnalyzeArguments(CCInfo, ArgLocs, Outs);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -627,7 +627,7 @@
       assert(VA.isMemLoc());
 
       if (!StackPtr.getNode())
-        StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SPW, getPointerTy());
+        StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SP, getPointerTy());
 
       SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                    StackPtr,
@@ -719,8 +719,8 @@
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   AnalyzeReturnValues(CCInfo, RVLocs, Ins);
 
@@ -941,31 +941,31 @@
     Convert = false;
     break;
    case MSP430CC::COND_HS:
-     // Res = SRW & 1, no processing is required
+     // Res = SR & 1, no processing is required
      break;
    case MSP430CC::COND_LO:
-     // Res = ~(SRW & 1)
+     // Res = ~(SR & 1)
      Invert = true;
      break;
    case MSP430CC::COND_NE:
      if (andCC) {
-       // C = ~Z, thus Res = SRW & 1, no processing is required
+       // C = ~Z, thus Res = SR & 1, no processing is required
      } else {
-       // Res = ~((SRW >> 1) & 1)
+       // Res = ~((SR >> 1) & 1)
        Shift = true;
        Invert = true;
      }
      break;
    case MSP430CC::COND_E:
      Shift = true;
-     // C = ~Z for AND instruction, thus we can put Res = ~(SRW & 1), however,
-     // Res = (SRW >> 1) & 1 is 1 word shorter.
+     // C = ~Z for AND instruction, thus we can put Res = ~(SR & 1), however,
+     // Res = (SR >> 1) & 1 is 1 word shorter.
      break;
   }
   EVT VT = Op.getValueType();
   SDValue One  = DAG.getConstant(1, VT);
   if (Convert) {
-    SDValue SR = DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::SRW,
+    SDValue SR = DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::SR,
                                     MVT::i16, Flag);
     if (Shift)
       // FIXME: somewhere this is turned into a SRL, lower it MSP specific?
@@ -1074,7 +1074,7 @@
   SDLoc dl(Op);  // FIXME probably not meaningful
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
-                                         MSP430::FPW, VT);
+                                         MSP430::FP, VT);
   while (Depth--)
     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                             MachinePointerInfo(),
@@ -1199,7 +1199,8 @@
   MachineFunction *F = BB->getParent();
   MachineRegisterInfo &RI = F->getRegInfo();
   DebugLoc dl = MI->getDebugLoc();
-  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  const TargetInstrInfo &TII =
+      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
 
   unsigned Opc;
   const TargetRegisterClass * RC;
@@ -1310,7 +1311,8 @@
       Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
     return EmitShiftInstr(MI, BB);
 
-  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  const TargetInstrInfo &TII =
+      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
 
   assert((Opc == MSP430::Select16 || Opc == MSP430::Select8) &&

diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 3e2f344..073ddc9 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_MSP430_ISELLOWERING_H
-#define LLVM_TARGET_MSP430_ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430ISELLOWERING_H
+#define LLVM_LIB_TARGET_MSP430_MSP430ISELLOWERING_H
 
 #include "MSP430.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -170,4 +170,4 @@
   };
 } // namespace llvm
 
-#endif // LLVM_TARGET_MSP430_ISELLOWERING_H
+#endif

diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index ccb6c09..27681aa 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp

@@ -307,7 +307,7 @@
       return 0;
     case TargetOpcode::INLINEASM: {
       const MachineFunction *MF = MI->getParent()->getParent();
-      const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+      const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
       return TII.getInlineAsmLength(MI->getOperand(0).getSymbolName(),
                                     *MF->getTarget().getMCAsmInfo());
     }

diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index e6baaef..f9b25b6 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_MSP430INSTRINFO_H
-#define LLVM_TARGET_MSP430INSTRINFO_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430INSTRINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430INSTRINFO_H
 
 #include "MSP430RegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"

diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 50e3fda..7c5aa11 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td

@@ -111,8 +111,8 @@
 // a stack adjustment and the codegen must know that they may modify the stack
 // pointer before prolog-epilog rewriting occurs.
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
-// sub / add which can clobber SRW.
-let Defs = [SPW, SRW], Uses = [SPW] in {
+// sub / add which can clobber SR.
+let Defs = [SP, SR], Uses = [SP] in {
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
                               "#ADJCALLSTACKDOWN",
                               [(MSP430callseq_start timm:$amt)]>;
@@ -130,7 +130,7 @@
                         "# Select16 PSEUDO",
                         [(set GR16:$dst,
                           (MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>;
-  let Defs = [SRW] in {
+  let Defs = [SR] in {
   def Shl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
                         "# Shl8 PSEUDO",
                         [(set GR8:$dst, (MSP430shl GR8:$src, GR8:$cnt))]>;
@@ -192,7 +192,7 @@
 }
 
 // Conditional branches
-let Uses = [SRW] in
+let Uses = [SR] in
   def JCC : CJForm<0, 0,
                    (outs), (ins jmptarget:$dst, cc:$cc),
                    "j$cc\t$dst",
@@ -207,8 +207,8 @@
   // a use to prevent stack-pointer assignments that appear immediately
   // before calls from potentially appearing dead. Uses for argument
   // registers are added manually.
-  let Defs = [R12W, R13W, R14W, R15W, SRW],
-      Uses = [SPW] in {
+  let Defs = [R12, R13, R14, R15, SR],
+      Uses = [SP] in {
     def CALLi     : II16i<0x0,
                           (outs), (ins i16imm:$dst),
                           "call\t$dst", [(MSP430call imm:$dst)]>;
@@ -224,7 +224,7 @@
 //===----------------------------------------------------------------------===//
 //  Miscellaneous Instructions...
 //
-let Defs = [SPW], Uses = [SPW], neverHasSideEffects=1 in {
+let Defs = [SP], Uses = [SP], neverHasSideEffects=1 in {
 let mayLoad = 1 in
 def POP16r   : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
                        (outs GR16:$reg), (ins), "pop.w\t$reg", []>;
@@ -337,7 +337,7 @@
 
 let Constraints = "$src = $dst" in {
 
-let Defs = [SRW] in {
+let Defs = [SR] in {
 
 let isCommutable = 1 in { // X = ADD Y, Z  == X = ADD Z, Y
 
@@ -345,24 +345,24 @@
                    (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                    "add.b\t{$src2, $dst}",
                    [(set GR8:$dst, (add GR8:$src, GR8:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADD16rr : I16rr<0x0,
                     (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "add.w\t{$src2, $dst}",
                     [(set GR16:$dst, (add GR16:$src, GR16:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 }
 
 def ADD8rm  : I8rm<0x0,
                    (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                    "add.b\t{$src2, $dst}",
                    [(set GR8:$dst, (add GR8:$src, (load addr:$src2))),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADD16rm : I16rm<0x0,
                     (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "add.w\t{$src2, $dst}",
                     [(set GR16:$dst, (add GR16:$src, (load addr:$src2))),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
 Constraints = "$base = $base_wb, $src = $dst" in {
@@ -381,160 +381,160 @@
                    (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                    "add.b\t{$src2, $dst}",
                    [(set GR8:$dst, (add GR8:$src, imm:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADD16ri : I16ri<0x0,
                     (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "add.w\t{$src2, $dst}",
                     [(set GR16:$dst, (add GR16:$src, imm:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 let Constraints = "" in {
 def ADD8mr  : I8mr<0x0,
                    (outs), (ins memdst:$dst, GR8:$src),
                    "add.b\t{$src, $dst}",
                    [(store (add (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADD16mr : I16mr<0x0,
                     (outs), (ins memdst:$dst, GR16:$src),
                     "add.w\t{$src, $dst}",
                     [(store (add (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def ADD8mi  : I8mi<0x0,
                    (outs), (ins memdst:$dst, i8imm:$src),
                    "add.b\t{$src, $dst}",
                    [(store (add (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADD16mi : I16mi<0x0,
                     (outs), (ins memdst:$dst, i16imm:$src),
                     "add.w\t{$src, $dst}",
                     [(store (add (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def ADD8mm  : I8mm<0x0,
                    (outs), (ins memdst:$dst, memsrc:$src),
                    "add.b\t{$src, $dst}",
                    [(store (add (load addr:$dst), 
                                 (i8 (load addr:$src))), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADD16mm : I16mm<0x0,
                     (outs), (ins memdst:$dst, memsrc:$src),
                     "add.w\t{$src, $dst}",
                     [(store (add (load addr:$dst), 
                                   (i16 (load addr:$src))), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 }
 
-let Uses = [SRW] in {
+let Uses = [SR] in {
 
 let isCommutable = 1 in { // X = ADDC Y, Z  == X = ADDC Z, Y
 def ADC8rr  : I8rr<0x0,
                    (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                    "addc.b\t{$src2, $dst}",
                    [(set GR8:$dst, (adde GR8:$src, GR8:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADC16rr : I16rr<0x0,
                     (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "addc.w\t{$src2, $dst}",
                     [(set GR16:$dst, (adde GR16:$src, GR16:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 } // isCommutable
 
 def ADC8ri  : I8ri<0x0,
                    (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                    "addc.b\t{$src2, $dst}",
                    [(set GR8:$dst, (adde GR8:$src, imm:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADC16ri : I16ri<0x0,
                     (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "addc.w\t{$src2, $dst}",
                     [(set GR16:$dst, (adde GR16:$src, imm:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def ADC8rm  : I8rm<0x0,
                    (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                    "addc.b\t{$src2, $dst}",
                    [(set GR8:$dst, (adde GR8:$src, (load addr:$src2))),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADC16rm : I16rm<0x0,
                     (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "addc.w\t{$src2, $dst}",
                     [(set GR16:$dst, (adde GR16:$src, (load addr:$src2))),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 let Constraints = "" in {
 def ADC8mr  : I8mr<0x0,
                    (outs), (ins memdst:$dst, GR8:$src),
                    "addc.b\t{$src, $dst}",
                    [(store (adde (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADC16mr : I16mr<0x0,
                     (outs), (ins memdst:$dst, GR16:$src),
                     "addc.w\t{$src, $dst}",
                     [(store (adde (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def ADC8mi  : I8mi<0x0,
                    (outs), (ins memdst:$dst, i8imm:$src),
                    "addc.b\t{$src, $dst}",
                    [(store (adde (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADC16mi : I16mi<0x0,
                     (outs), (ins memdst:$dst, i16imm:$src),
                     "addc.w\t{$src, $dst}",
                     [(store (adde (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def ADC8mm  : I8mm<0x0,
                    (outs), (ins memdst:$dst, memsrc:$src),
                    "addc.b\t{$src, $dst}",
                    [(store (adde (load addr:$dst), 
                                  (i8 (load addr:$src))), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def ADC16mm : I8mm<0x0,
                    (outs), (ins memdst:$dst, memsrc:$src),
                    "addc.w\t{$src, $dst}",
                    [(store (adde (load addr:$dst), 
                                  (i16 (load addr:$src))), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 }
 
-} // Uses = [SRW]
+} // Uses = [SR]
 
 let isCommutable = 1 in { // X = AND Y, Z  == X = AND Z, Y
 def AND8rr  : I8rr<0x0,
                    (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                    "and.b\t{$src2, $dst}",
                    [(set GR8:$dst, (and GR8:$src, GR8:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def AND16rr : I16rr<0x0,
                     (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "and.w\t{$src2, $dst}",
                     [(set GR16:$dst, (and GR16:$src, GR16:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 }
 
 def AND8ri  : I8ri<0x0,
                    (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                    "and.b\t{$src2, $dst}",
                    [(set GR8:$dst, (and GR8:$src, imm:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def AND16ri : I16ri<0x0,
                     (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "and.w\t{$src2, $dst}",
                     [(set GR16:$dst, (and GR16:$src, imm:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def AND8rm  : I8rm<0x0,
                    (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                    "and.b\t{$src2, $dst}",
                    [(set GR8:$dst, (and GR8:$src, (load addr:$src2))),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def AND16rm : I16rm<0x0,
                     (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "and.w\t{$src2, $dst}",
                     [(set GR16:$dst, (and GR16:$src, (load addr:$src2))),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
 Constraints = "$base = $base_wb, $src = $dst" in {
@@ -553,36 +553,36 @@
                    (outs), (ins memdst:$dst, GR8:$src),
                    "and.b\t{$src, $dst}",
                    [(store (and (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def AND16mr : I16mr<0x0,
                     (outs), (ins memdst:$dst, GR16:$src),
                     "and.w\t{$src, $dst}",
                     [(store (and (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def AND8mi  : I8mi<0x0,
                    (outs), (ins memdst:$dst, i8imm:$src),
                    "and.b\t{$src, $dst}",
                    [(store (and (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def AND16mi : I16mi<0x0,
                     (outs), (ins memdst:$dst, i16imm:$src),
                     "and.w\t{$src, $dst}",
                     [(store (and (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def AND8mm  : I8mm<0x0,
                    (outs), (ins memdst:$dst, memsrc:$src),
                    "and.b\t{$src, $dst}",
                    [(store (and (load addr:$dst), 
                                 (i8 (load addr:$src))), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def AND16mm : I16mm<0x0,
                     (outs), (ins memdst:$dst, memsrc:$src),
                     "and.w\t{$src, $dst}",
                     [(store (and (load addr:$dst), 
                                  (i16 (load addr:$src))), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 }
 
 let isCommutable = 1 in { // X = OR Y, Z  == X = OR Z, Y
@@ -703,35 +703,35 @@
                    (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                    "xor.b\t{$src2, $dst}",
                    [(set GR8:$dst, (xor GR8:$src, GR8:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def XOR16rr : I16rr<0x0,
                     (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "xor.w\t{$src2, $dst}",
                     [(set GR16:$dst, (xor GR16:$src, GR16:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 }
 
 def XOR8ri  : I8ri<0x0,
                    (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                    "xor.b\t{$src2, $dst}",
                    [(set GR8:$dst, (xor GR8:$src, imm:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def XOR16ri : I16ri<0x0,
                     (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "xor.w\t{$src2, $dst}",
                     [(set GR16:$dst, (xor GR16:$src, imm:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def XOR8rm  : I8rm<0x0,
                    (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                    "xor.b\t{$src2, $dst}",
                    [(set GR8:$dst, (xor GR8:$src, (load addr:$src2))),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def XOR16rm : I16rm<0x0,
                     (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "xor.w\t{$src2, $dst}",
                     [(set GR16:$dst, (xor GR16:$src, (load addr:$src2))),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
 Constraints = "$base = $base_wb, $src = $dst" in {
@@ -750,34 +750,34 @@
                    (outs), (ins memdst:$dst, GR8:$src),
                    "xor.b\t{$src, $dst}",
                    [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def XOR16mr : I16mr<0x0,
                     (outs), (ins memdst:$dst, GR16:$src),
                     "xor.w\t{$src, $dst}",
                     [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def XOR8mi  : I8mi<0x0,
                    (outs), (ins memdst:$dst, i8imm:$src),
                    "xor.b\t{$src, $dst}",
                    [(store (xor (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def XOR16mi : I16mi<0x0,
                     (outs), (ins memdst:$dst, i16imm:$src),
                     "xor.w\t{$src, $dst}",
                     [(store (xor (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def XOR8mm  : I8mm<0x0,
                    (outs), (ins memdst:$dst, memsrc:$src),
                    "xor.b\t{$src, $dst}",
                    [(store (xor (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def XOR16mm : I16mm<0x0,
                     (outs), (ins memdst:$dst, memsrc:$src),
                     "xor.w\t{$src, $dst}",
                     [(store (xor (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 }
 
 
@@ -785,34 +785,34 @@
                    (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                    "sub.b\t{$src2, $dst}",
                    [(set GR8:$dst, (sub GR8:$src, GR8:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SUB16rr : I16rr<0x0,
                     (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "sub.w\t{$src2, $dst}",
                     [(set GR16:$dst, (sub GR16:$src, GR16:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SUB8ri  : I8ri<0x0,
                    (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                    "sub.b\t{$src2, $dst}",
                    [(set GR8:$dst, (sub GR8:$src, imm:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SUB16ri : I16ri<0x0,
                     (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "sub.w\t{$src2, $dst}",
                     [(set GR16:$dst, (sub GR16:$src, imm:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SUB8rm  : I8rm<0x0,
                    (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                    "sub.b\t{$src2, $dst}",
                    [(set GR8:$dst, (sub GR8:$src, (load addr:$src2))),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SUB16rm : I16rm<0x0,
                     (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "sub.w\t{$src2, $dst}",
                     [(set GR16:$dst, (sub GR16:$src, (load addr:$src2))),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
 Constraints = "$base = $base_wb, $src = $dst" in {
@@ -831,153 +831,153 @@
                    (outs), (ins memdst:$dst, GR8:$src),
                    "sub.b\t{$src, $dst}",
                    [(store (sub (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SUB16mr : I16mr<0x0,
                     (outs), (ins memdst:$dst, GR16:$src),
                     "sub.w\t{$src, $dst}",
                     [(store (sub (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SUB8mi  : I8mi<0x0,
                    (outs), (ins memdst:$dst, i8imm:$src),
                    "sub.b\t{$src, $dst}",
                    [(store (sub (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SUB16mi : I16mi<0x0,
                     (outs), (ins memdst:$dst, i16imm:$src),
                     "sub.w\t{$src, $dst}",
                     [(store (sub (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SUB8mm  : I8mm<0x0,
                    (outs), (ins memdst:$dst, memsrc:$src),
                    "sub.b\t{$src, $dst}",
                    [(store (sub (load addr:$dst), 
                                 (i8 (load addr:$src))), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SUB16mm : I16mm<0x0,
                     (outs), (ins memdst:$dst, memsrc:$src),
                     "sub.w\t{$src, $dst}",
                     [(store (sub (load addr:$dst), 
                                  (i16 (load addr:$src))), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 }
 
-let Uses = [SRW] in {
+let Uses = [SR] in {
 def SBC8rr  : I8rr<0x0,
                    (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
                    "subc.b\t{$src2, $dst}",
                    [(set GR8:$dst, (sube GR8:$src, GR8:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SBC16rr : I16rr<0x0,
                     (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
                     "subc.w\t{$src2, $dst}",
                     [(set GR16:$dst, (sube GR16:$src, GR16:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SBC8ri  : I8ri<0x0,
                    (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
                    "subc.b\t{$src2, $dst}",
                    [(set GR8:$dst, (sube GR8:$src, imm:$src2)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SBC16ri : I16ri<0x0,
                     (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
                     "subc.w\t{$src2, $dst}",
                     [(set GR16:$dst, (sube GR16:$src, imm:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SBC8rm  : I8rm<0x0,
                    (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
                    "subc.b\t{$src2, $dst}",
                    [(set GR8:$dst, (sube GR8:$src, (load addr:$src2))),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SBC16rm : I16rm<0x0,
                     (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
                     "subc.w\t{$src2, $dst}",
                     [(set GR16:$dst, (sube GR16:$src, (load addr:$src2))),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 let Constraints = "" in {
 def SBC8mr  : I8mr<0x0,
                    (outs), (ins memdst:$dst, GR8:$src),
                    "subc.b\t{$src, $dst}",
                   [(store (sube (load addr:$dst), GR8:$src), addr:$dst),
-                   (implicit SRW)]>;
+                   (implicit SR)]>;
 def SBC16mr : I16mr<0x0,
                     (outs), (ins memdst:$dst, GR16:$src),
                     "subc.w\t{$src, $dst}",
                     [(store (sube (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SBC8mi  : I8mi<0x0,
                    (outs), (ins memdst:$dst, i8imm:$src),
                    "subc.b\t{$src, $dst}",
                    [(store (sube (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SBC16mi : I16mi<0x0,
                     (outs), (ins memdst:$dst, i16imm:$src),
                     "subc.w\t{$src, $dst}",
                     [(store (sube (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SBC8mm  : I8mm<0x0,
                    (outs), (ins memdst:$dst, memsrc:$src),
                    "subc.b\t{$src, $dst}",
                    [(store (sube (load addr:$dst),
                                  (i8 (load addr:$src))), addr:$dst),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SBC16mm : I16mm<0x0,
                     (outs), (ins memdst:$dst, memsrc:$src),
                     "subc.w\t{$src, $dst}",
                     [(store (sube (load addr:$dst),
                             (i16 (load addr:$src))), addr:$dst),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 }
 
-} // Uses = [SRW]
+} // Uses = [SR]
 
 // FIXME: memory variant!
 def SAR8r1  : II8r<0x0,
                    (outs GR8:$dst), (ins GR8:$src),
                    "rra.b\t$dst",
                    [(set GR8:$dst, (MSP430rra GR8:$src)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SAR16r1 : II16r<0x0,
                     (outs GR16:$dst), (ins GR16:$src),
                     "rra.w\t$dst",
                     [(set GR16:$dst, (MSP430rra GR16:$src)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SHL8r1  : I8rr<0x0,
                    (outs GR8:$dst), (ins GR8:$src),
                    "rla.b\t$dst",
                    [(set GR8:$dst, (MSP430rla GR8:$src)),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def SHL16r1 : I16rr<0x0,
                     (outs GR16:$dst), (ins GR16:$src),
                     "rla.w\t$dst",
                     [(set GR16:$dst, (MSP430rla GR16:$src)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def SAR8r1c  : Pseudo<(outs GR8:$dst), (ins GR8:$src),
                       "clrc\n\t"
                       "rrc.b\t$dst",
                       [(set GR8:$dst, (MSP430rrc GR8:$src)),
-                       (implicit SRW)]>;
+                       (implicit SR)]>;
 def SAR16r1c : Pseudo<(outs GR16:$dst), (ins GR16:$src),
                       "clrc\n\t"
                       "rrc.w\t$dst",
                       [(set GR16:$dst, (MSP430rrc GR16:$src)),
-                       (implicit SRW)]>;
+                       (implicit SR)]>;
 
 // FIXME: Memory sext's ?
 def SEXT16r : II16r<0x0,
                     (outs GR16:$dst), (ins GR16:$src),
                     "sxt\t$dst",
                     [(set GR16:$dst, (sext_inreg GR16:$src, i8)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
-} // Defs = [SRW]
+} // Defs = [SR]
 
 def ZEXT16r : I8rr<0x0,
                    (outs GR16:$dst), (ins GR16:$src),
@@ -993,57 +993,57 @@
 } // Constraints = "$src = $dst"
 
 // Integer comparisons
-let Defs = [SRW] in {
+let Defs = [SR] in {
 def CMP8rr  : I8rr<0x0,
                    (outs), (ins GR8:$src, GR8:$src2),
                    "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp GR8:$src, GR8:$src2), (implicit SRW)]>;
+                   [(MSP430cmp GR8:$src, GR8:$src2), (implicit SR)]>;
 def CMP16rr : I16rr<0x0,
                     (outs), (ins GR16:$src, GR16:$src2),
                     "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp GR16:$src, GR16:$src2), (implicit SRW)]>;
+                    [(MSP430cmp GR16:$src, GR16:$src2), (implicit SR)]>;
 
 def CMP8ri  : I8ri<0x0,
                    (outs), (ins GR8:$src, i8imm:$src2),
                    "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp GR8:$src, imm:$src2), (implicit SRW)]>;
+                   [(MSP430cmp GR8:$src, imm:$src2), (implicit SR)]>;
 def CMP16ri : I16ri<0x0,
                     (outs), (ins GR16:$src, i16imm:$src2),
                     "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp GR16:$src, imm:$src2), (implicit SRW)]>;
+                    [(MSP430cmp GR16:$src, imm:$src2), (implicit SR)]>;
 
 def CMP8mi  : I8mi<0x0,
                    (outs), (ins memsrc:$src, i8imm:$src2),
                    "cmp.b\t{$src2, $src}",
                    [(MSP430cmp (load addr:$src),
-                               (i8 imm:$src2)), (implicit SRW)]>;
+                               (i8 imm:$src2)), (implicit SR)]>;
 def CMP16mi : I16mi<0x0,
                     (outs), (ins memsrc:$src, i16imm:$src2),
                     "cmp.w\t{$src2, $src}",
                      [(MSP430cmp (load addr:$src),
-                                 (i16 imm:$src2)), (implicit SRW)]>;
+                                 (i16 imm:$src2)), (implicit SR)]>;
 
 def CMP8rm  : I8rm<0x0,
                    (outs), (ins GR8:$src, memsrc:$src2),
                    "cmp.b\t{$src2, $src}",
                    [(MSP430cmp GR8:$src, (load addr:$src2)), 
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def CMP16rm : I16rm<0x0,
                     (outs), (ins GR16:$src, memsrc:$src2),
                     "cmp.w\t{$src2, $src}",
                     [(MSP430cmp GR16:$src, (load addr:$src2)),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def CMP8mr  : I8mr<0x0,
                    (outs), (ins memsrc:$src, GR8:$src2),
                    "cmp.b\t{$src2, $src}",
                    [(MSP430cmp (load addr:$src), GR8:$src2),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def CMP16mr : I16mr<0x0,
                     (outs), (ins memsrc:$src, GR16:$src2),
                     "cmp.w\t{$src2, $src}",
                     [(MSP430cmp (load addr:$src), GR16:$src2), 
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 
 // BIT TESTS, just sets condition codes
@@ -1053,56 +1053,56 @@
                    (outs), (ins GR8:$src, GR8:$src2),
                    "bit.b\t{$src2, $src}",
                    [(MSP430cmp (and_su GR8:$src, GR8:$src2), 0),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def BIT16rr : I16rr<0x0,
                     (outs), (ins GR16:$src, GR16:$src2),
                     "bit.w\t{$src2, $src}",
                     [(MSP430cmp (and_su GR16:$src, GR16:$src2), 0),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 }
 def BIT8ri  : I8ri<0x0,
                    (outs), (ins GR8:$src, i8imm:$src2),
                    "bit.b\t{$src2, $src}",
                    [(MSP430cmp (and_su GR8:$src, imm:$src2), 0),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def BIT16ri : I16ri<0x0,
                     (outs), (ins GR16:$src, i16imm:$src2),
                     "bit.w\t{$src2, $src}",
                     [(MSP430cmp (and_su GR16:$src, imm:$src2), 0),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def BIT8rm  : I8rm<0x0,
                    (outs), (ins GR8:$src, memdst:$src2),
                    "bit.b\t{$src2, $src}",
                    [(MSP430cmp (and_su GR8:$src,  (load addr:$src2)), 0),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def BIT16rm : I16rm<0x0,
                     (outs), (ins GR16:$src, memdst:$src2),
                     "bit.w\t{$src2, $src}",
                     [(MSP430cmp (and_su GR16:$src,  (load addr:$src2)), 0),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def BIT8mr  : I8mr<0x0,
                   (outs), (ins memsrc:$src, GR8:$src2),
                   "bit.b\t{$src2, $src}",
                   [(MSP430cmp (and_su (load addr:$src), GR8:$src2), 0),
-                   (implicit SRW)]>;
+                   (implicit SR)]>;
 def BIT16mr : I16mr<0x0,
                     (outs), (ins memsrc:$src, GR16:$src2),
                     "bit.w\t{$src2, $src}",
                     [(MSP430cmp (and_su (load addr:$src), GR16:$src2), 0),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def BIT8mi  : I8mi<0x0,
                    (outs), (ins memsrc:$src, i8imm:$src2),
                    "bit.b\t{$src2, $src}",
                    [(MSP430cmp (and_su (load addr:$src), (i8 imm:$src2)), 0),
-                    (implicit SRW)]>;
+                    (implicit SR)]>;
 def BIT16mi : I16mi<0x0,
                     (outs), (ins memsrc:$src, i16imm:$src2),
                     "bit.w\t{$src2, $src}",
                     [(MSP430cmp (and_su (load addr:$src), (i16 imm:$src2)), 0),
-                     (implicit SRW)]>;
+                     (implicit SR)]>;
 
 def BIT8mm  : I8mm<0x0,
                    (outs), (ins memsrc:$src, memsrc:$src2),
@@ -1110,15 +1110,15 @@
                    [(MSP430cmp (and_su (i8 (load addr:$src)),
                                        (load addr:$src2)),
                                  0),
-                      (implicit SRW)]>;
+                      (implicit SR)]>;
 def BIT16mm : I16mm<0x0,
                     (outs), (ins memsrc:$src, memsrc:$src2),
                     "bit.w\t{$src2, $src}",
                     [(MSP430cmp (and_su (i16 (load addr:$src)),
                                         (load addr:$src2)),
                                  0),
-                     (implicit SRW)]>;
-} // Defs = [SRW]
+                     (implicit SR)]>;
+} // Defs = [SR]
 
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns

diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index 05352a2..77b91b7 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp

@@ -26,6 +26,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 MCSymbol *MSP430MCInstLower::
@@ -50,7 +51,7 @@
 
 MCSymbol *MSP430MCInstLower::
 GetJumpTableSymbol(const MachineOperand &MO) const {
-  const DataLayout *DL = Printer.TM.getDataLayout();
+  const DataLayout *DL = Printer.TM.getSubtargetImpl()->getDataLayout();
   SmallString<256> Name;
   raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI"
                             << Printer.getFunctionNumber() << '_'
@@ -67,7 +68,7 @@
 
 MCSymbol *MSP430MCInstLower::
 GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
-  const DataLayout *DL = Printer.TM.getDataLayout();
+  const DataLayout *DL = Printer.TM.getSubtargetImpl()->getDataLayout();
   SmallString<256> Name;
   raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "CPI"
                             << Printer.getFunctionNumber() << '_'

diff --git a/lib/Target/MSP430/MSP430MCInstLower.h b/lib/Target/MSP430/MSP430MCInstLower.h
index 794aa56..ebd6397 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.h
+++ b/lib/Target/MSP430/MSP430MCInstLower.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MSP430_MCINSTLOWER_H
-#define MSP430_MCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430MCINSTLOWER_H
+#define LLVM_LIB_TARGET_MSP430_MSP430MCINSTLOWER_H
 
 #include "llvm/Support/Compiler.h"
 

diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
index d1697f4..fcc5f5b 100644
--- a/lib/Target/MSP430/MSP430MachineFunctionInfo.h
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MSP430MACHINEFUNCTIONINFO_H
-#define MSP430MACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430MACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/MachineFunction.h"
 

diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 691bcee..614467b 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp

@@ -33,32 +33,32 @@
 
 // FIXME: Provide proper call frame setup / destroy opcodes.
 MSP430RegisterInfo::MSP430RegisterInfo()
-  : MSP430GenRegisterInfo(MSP430::PCW) {}
+  : MSP430GenRegisterInfo(MSP430::PC) {}
 
 const MCPhysReg*
 MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  const TargetFrameLowering *TFI = MF->getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
   const Function* F = MF->getFunction();
   static const MCPhysReg CalleeSavedRegs[] = {
-    MSP430::FPW, MSP430::R5W, MSP430::R6W, MSP430::R7W,
-    MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
+    MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
+    MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
     0
   };
   static const MCPhysReg CalleeSavedRegsFP[] = {
-    MSP430::R5W, MSP430::R6W, MSP430::R7W,
-    MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
+    MSP430::R5, MSP430::R6, MSP430::R7,
+    MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
     0
   };
   static const MCPhysReg CalleeSavedRegsIntr[] = {
-    MSP430::FPW,  MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
-    MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
-    MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
+    MSP430::FP,  MSP430::R5,  MSP430::R6,  MSP430::R7,
+    MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+    MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15,
     0
   };
   static const MCPhysReg CalleeSavedRegsIntrFP[] = {
-    MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
-    MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
-    MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
+    MSP430::R5,  MSP430::R6,  MSP430::R7,
+    MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+    MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15,
     0
   };
 
@@ -73,22 +73,22 @@
 
 BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   // Mark 4 special registers with subregisters as reserved.
   Reserved.set(MSP430::PCB);
   Reserved.set(MSP430::SPB);
   Reserved.set(MSP430::SRB);
   Reserved.set(MSP430::CGB);
-  Reserved.set(MSP430::PCW);
-  Reserved.set(MSP430::SPW);
-  Reserved.set(MSP430::SRW);
-  Reserved.set(MSP430::CGW);
+  Reserved.set(MSP430::PC);
+  Reserved.set(MSP430::SP);
+  Reserved.set(MSP430::SR);
+  Reserved.set(MSP430::CG);
 
   // Mark frame pointer as reserved if needed.
   if (TFI->hasFP(MF)) {
     Reserved.set(MSP430::FPB);
-    Reserved.set(MSP430::FPW);
+    Reserved.set(MSP430::FP);
   }
 
   return Reserved;
@@ -109,11 +109,11 @@
   MachineInstr &MI = *II;
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   DebugLoc dl = MI.getDebugLoc();
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
 
-  unsigned BasePtr = (TFI->hasFP(MF) ? MSP430::FPW : MSP430::SPW);
+  unsigned BasePtr = (TFI->hasFP(MF) ? MSP430::FP : MSP430::SP);
   int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
 
   // Skip the saved PC
@@ -122,7 +122,7 @@
   if (!TFI->hasFP(MF))
     Offset += MF.getFrameInfo()->getStackSize();
   else
-    Offset += 2; // Skip the saved FPW
+    Offset += 2; // Skip the saved FP
 
   // Fold imm into offset
   Offset += MI.getOperand(FIOperandNum + 1).getImm();
@@ -131,7 +131,7 @@
     // This is actually "load effective address" of the stack slot
     // instruction. We have only two-address instructions, thus we need to
     // expand it into mov + add
-    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+    const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
     MI.setDesc(TII.get(MSP430::MOV16rr));
     MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
@@ -156,7 +156,7 @@
 }
 
 unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
-  return TFI->hasFP(MF) ? MSP430::FPW : MSP430::SPW;
+  return TFI->hasFP(MF) ? MSP430::FP : MSP430::SP;
 }

diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index cb01961..3f88a69 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_MSP430REGISTERINFO_H
-#define LLVM_TARGET_MSP430REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430REGISTERINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430REGISTERINFO_H
 
 #include "llvm/Target/TargetRegisterInfo.h"
 
@@ -44,4 +44,4 @@
 
 } // end namespace llvm
 
-#endif // LLVM_TARGET_MSP430REGISTERINFO_H
+#endif

diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
index 4010781..b5a6ed0 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td

@@ -46,22 +46,22 @@
 def subreg_8bit : SubRegIndex<8> { let Namespace = "MSP430"; }
 
 let SubRegIndices = [subreg_8bit] in {
-def PCW  : MSP430RegWithSubregs<0,  "r0",  [PCB]>;
-def SPW  : MSP430RegWithSubregs<1,  "r1",  [SPB]>;
-def SRW  : MSP430RegWithSubregs<2,  "r2",  [SRB]>;
-def CGW  : MSP430RegWithSubregs<3,  "r3",  [CGB]>;
-def FPW  : MSP430RegWithSubregs<4,  "r4",  [FPB]>;
-def R5W  : MSP430RegWithSubregs<5,  "r5",  [R5B]>;
-def R6W  : MSP430RegWithSubregs<6,  "r6",  [R6B]>;
-def R7W  : MSP430RegWithSubregs<7,  "r7",  [R7B]>;
-def R8W  : MSP430RegWithSubregs<8,  "r8",  [R8B]>;
-def R9W  : MSP430RegWithSubregs<9,  "r9",  [R9B]>;
-def R10W : MSP430RegWithSubregs<10, "r10", [R10B]>;
-def R11W : MSP430RegWithSubregs<11, "r11", [R11B]>;
-def R12W : MSP430RegWithSubregs<12, "r12", [R12B]>;
-def R13W : MSP430RegWithSubregs<13, "r13", [R13B]>;
-def R14W : MSP430RegWithSubregs<14, "r14", [R14B]>;
-def R15W : MSP430RegWithSubregs<15, "r15", [R15B]>;
+def PC  : MSP430RegWithSubregs<0,  "r0",  [PCB]>;
+def SP  : MSP430RegWithSubregs<1,  "r1",  [SPB]>;
+def SR  : MSP430RegWithSubregs<2,  "r2",  [SRB]>;
+def CG  : MSP430RegWithSubregs<3,  "r3",  [CGB]>;
+def FP  : MSP430RegWithSubregs<4,  "r4",  [FPB]>;
+def R5  : MSP430RegWithSubregs<5,  "r5",  [R5B]>;
+def R6  : MSP430RegWithSubregs<6,  "r6",  [R6B]>;
+def R7  : MSP430RegWithSubregs<7,  "r7",  [R7B]>;
+def R8  : MSP430RegWithSubregs<8,  "r8",  [R8B]>;
+def R9  : MSP430RegWithSubregs<9,  "r9",  [R9B]>;
+def R10 : MSP430RegWithSubregs<10, "r10", [R10B]>;
+def R11 : MSP430RegWithSubregs<11, "r11", [R11B]>;
+def R12 : MSP430RegWithSubregs<12, "r12", [R12B]>;
+def R13 : MSP430RegWithSubregs<13, "r13", [R13B]>;
+def R14 : MSP430RegWithSubregs<14, "r14", [R14B]>;
+def R15 : MSP430RegWithSubregs<15, "r15", [R15B]>;
 }
 
 def GR8 : RegisterClass<"MSP430", [i8], 8,
@@ -74,8 +74,8 @@
 
 def GR16 : RegisterClass<"MSP430", [i16], 16,
    // Volatile registers
-  (add R12W, R13W, R14W, R15W, R11W, R10W, R9W, R8W, R7W, R6W, R5W,
+  (add R12, R13, R14, R15, R11, R10, R9, R8, R7, R6, R5,
    // Frame pointer, sometimes allocable
-   FPW,
+   FP,
    // Volatile, but not allocable
-   PCW, SPW, SRW, CGW)>;
+   PC, SP, SR, CG)>;

diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/lib/Target/MSP430/MSP430SelectionDAGInfo.h
index cb04adc..61a6b19 100644
--- a/lib/Target/MSP430/MSP430SelectionDAGInfo.h
+++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MSP430SELECTIONDAGINFO_H
-#define MSP430SELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_MSP430_MSP430SELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 

diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index dbddc52..cb83b92 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp

@@ -34,6 +34,6 @@
                                  const std::string &FS, const TargetMachine &TM)
     : MSP430GenSubtargetInfo(TT, CPU, FS),
       // FIXME: Check DataLayout string.
-      DL("e-m:e-p:16:16-i32:16:32-n8:16"), FrameLowering(),
+      DL("e-m:e-p:16:16-i32:16:32-a:16-n8:16"), FrameLowering(),
       InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
       TSInfo(DL) {}

diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
index 0152ad1..d1845db 100644
--- a/lib/Target/MSP430/MSP430Subtarget.h
+++ b/lib/Target/MSP430/MSP430Subtarget.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_MSP430_SUBTARGET_H
-#define LLVM_TARGET_MSP430_SUBTARGET_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430SUBTARGET_H
+#define LLVM_LIB_TARGET_MSP430_MSP430SUBTARGET_H
 
 #include "MSP430FrameLowering.h"
 #include "MSP430InstrInfo.h"
@@ -51,14 +51,20 @@
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
-  const MSP430InstrInfo *getInstrInfo() const { return &InstrInfo; }
-  const DataLayout *getDataLayout() const { return &DL; }
-  const TargetRegisterInfo *getRegisterInfo() const {
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const MSP430InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const TargetRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  const MSP430TargetLowering *getTargetLowering() const { return &TLInfo; }
-  const MSP430SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const MSP430TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const MSP430SelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
 };
 } // End llvm namespace
 

diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 5ca36f2..8cee016 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp

@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "MSP430.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -30,10 +31,13 @@
                                          Reloc::Model RM, CodeModel::Model CM,
                                          CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      TLOF(make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }
 
+MSP430TargetMachine::~MSP430TargetMachine() {}
+
 namespace {
 /// MSP430 Code Generator Pass Configuration Options.
 class MSP430PassConfig : public TargetPassConfig {

diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index efa8403..0e54ed6 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h

@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 
-#ifndef LLVM_TARGET_MSP430_TARGETMACHINE_H
-#define LLVM_TARGET_MSP430_TARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H
+#define LLVM_LIB_TARGET_MSP430_MSP430TARGETMACHINE_H
 
 #include "MSP430Subtarget.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -24,6 +24,7 @@
 /// MSP430TargetMachine
 ///
 class MSP430TargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   MSP430Subtarget        Subtarget;
 
 public:
@@ -31,31 +32,18 @@
                       StringRef CPU, StringRef FS, const TargetOptions &Options,
                       Reloc::Model RM, CodeModel::Model CM,
                       CodeGenOpt::Level OL);
+  ~MSP430TargetMachine() override;
 
-  const TargetFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  const MSP430InstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
   const MSP430Subtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
-  const TargetRegisterInfo *getRegisterInfo() const override {
-    return getSubtargetImpl()->getRegisterInfo();
-  }
-  const MSP430TargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const MSP430SelectionDAGInfo *getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 }; // MSP430TargetMachine.
 
 } // end namespace llvm
 
-#endif // LLVM_TARGET_MSP430_TARGETMACHINE_H
+#endif

diff --git a/lib/Target/Mips/Android.mk b/lib/Target/Mips/Android.mk
index 9f437f8..18d1177 100644
--- a/lib/Target/Mips/Android.mk
+++ b/lib/Target/Mips/Android.mk

@@ -20,9 +20,10 @@
   Mips16ISelLowering.cpp \
   Mips16InstrInfo.cpp \
   Mips16RegisterInfo.cpp \
+  MipsABIInfo.cpp \
   MipsAnalyzeImmediate.cpp \
   MipsAsmPrinter.cpp \
-  MipsCodeEmitter.cpp \
+  MipsCCState.cpp \
   MipsConstantIslandPass.cpp \
   MipsDelaySlotFiller.cpp \
   MipsFastISel.cpp \
@@ -30,7 +31,6 @@
   MipsInstrInfo.cpp \
   MipsISelDAGToDAG.cpp \
   MipsISelLowering.cpp \
-  MipsJITInfo.cpp \
   MipsLongBranch.cpp \
   MipsMachineFunction.cpp \
   MipsMCInstLower.cpp \

diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 0c06be8..0c5b41f 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp

@@ -13,6 +13,7 @@
 #include "MipsTargetStreamer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
@@ -26,6 +27,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/SourceMgr.h"
+#include <memory>
 
 using namespace llvm;
 
@@ -38,36 +41,69 @@
 namespace {
 class MipsAssemblerOptions {
 public:
-  MipsAssemblerOptions() : aTReg(1), reorder(true), macro(true) {}
+  MipsAssemblerOptions(uint64_t Features_) : 
+    ATReg(1), Reorder(true), Macro(true), Features(Features_) {}
 
-  unsigned getATRegNum() { return aTReg; }
+  // Builds a copy of the options pointed to by Opts: the AT register number,
+  // the reorder and macro flags, and the feature bits. Opts is dereferenced
+  // unconditionally, so it must not be null.
+  MipsAssemblerOptions(const MipsAssemblerOptions *Opts)
+      : ATReg(Opts->getATRegNum()), Reorder(Opts->isReorder()),
+        Macro(Opts->isMacro()), Features(Opts->getFeatures()) {}
+
+  unsigned getATRegNum() const { return ATReg; }
   bool setATReg(unsigned Reg);
 
-  bool isReorder() { return reorder; }
-  void setReorder() { reorder = true; }
-  void setNoreorder() { reorder = false; }
+  bool isReorder() const { return Reorder; }
+  void setReorder() { Reorder = true; }
+  void setNoReorder() { Reorder = false; }
 
-  bool isMacro() { return macro; }
-  void setMacro() { macro = true; }
-  void setNomacro() { macro = false; }
+  bool isMacro() const { return Macro; }
+  void setMacro() { Macro = true; }
+  void setNoMacro() { Macro = false; }
+
+  uint64_t getFeatures() const { return Features; }
+  void setFeatures(uint64_t Features_) { Features = Features_; }
+
+  // Set of features that are either architecture features or referenced
+  // by them (e.g.: FeatureNaN2008 implied by FeatureMips32r6).
+  // The full table can be found in MipsGenSubtargetInfo.inc (MipsFeatureKV[]).
+  // The reason we need this mask is explained in the selectArch function.
+  // FIXME: Ideally we would like TableGen to generate this information.
+  static const uint64_t AllArchRelatedMask =
+      Mips::FeatureMips1 | Mips::FeatureMips2 | Mips::FeatureMips3 |
+      Mips::FeatureMips3_32 | Mips::FeatureMips3_32r2 | Mips::FeatureMips4 |
+      Mips::FeatureMips4_32 | Mips::FeatureMips4_32r2 | Mips::FeatureMips5 |
+      Mips::FeatureMips5_32r2 | Mips::FeatureMips32 | Mips::FeatureMips32r2 |
+      Mips::FeatureMips32r6 | Mips::FeatureMips64 | Mips::FeatureMips64r2 |
+      Mips::FeatureMips64r6 | Mips::FeatureCnMips | Mips::FeatureFP64Bit |
+      Mips::FeatureGP64Bit | Mips::FeatureNaN2008;
 
 private:
-  unsigned aTReg;
-  bool reorder;
-  bool macro;
+  unsigned ATReg;
+  bool Reorder;
+  bool Macro;
+  uint64_t Features;
 };
 }
 
 namespace {
 class MipsAsmParser : public MCTargetAsmParser {
   MipsTargetStreamer &getTargetStreamer() {
-    MCTargetStreamer &TS = *Parser.getStreamer().getTargetStreamer();
+    MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
     return static_cast<MipsTargetStreamer &>(TS);
   }
 
   MCSubtargetInfo &STI;
-  MCAsmParser &Parser;
-  MipsAssemblerOptions Options;
+  SmallVector<std::unique_ptr<MipsAssemblerOptions>, 2> AssemblerOptions;
+  MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a
+                       // nullptr, which indicates that no function is currently
+                       // selected. This usually happens after an '.end func'
+                       // directive.
+
+  // Print a warning along with its fix-it message at the given range.
+  void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
+                             SMRange Range, bool ShowColors = true);
 
 #define GET_ASSEMBLER_HEADER
 #include "MipsGenAsmMatcher.inc"
@@ -76,15 +112,15 @@
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
-                               unsigned &ErrorInfo,
+                               uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
 
   /// Parse a register as used in CFI directives
   bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
 
-  bool ParseParenSuffix(StringRef Name, OperandVector &Operands);
+  bool parseParenSuffix(StringRef Name, OperandVector &Operands);
 
-  bool ParseBracketSuffix(StringRef Name, OperandVector &Operands);
+  bool parseBracketSuffix(StringRef Name, OperandVector &Operands);
 
   bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                         SMLoc NameLoc, OperandVector &Operands) override;
@@ -94,25 +130,28 @@
   MipsAsmParser::OperandMatchResultTy parseMemOperand(OperandVector &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  MatchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+  matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
                                     StringRef Identifier, SMLoc S);
 
   MipsAsmParser::OperandMatchResultTy
-  MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S);
+  matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S);
 
-  MipsAsmParser::OperandMatchResultTy ParseAnyRegister(OperandVector &Operands);
+  MipsAsmParser::OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
 
-  MipsAsmParser::OperandMatchResultTy ParseImm(OperandVector &Operands);
+  MipsAsmParser::OperandMatchResultTy parseImm(OperandVector &Operands);
 
-  MipsAsmParser::OperandMatchResultTy ParseJumpTarget(OperandVector &Operands);
+  MipsAsmParser::OperandMatchResultTy parseJumpTarget(OperandVector &Operands);
 
   MipsAsmParser::OperandMatchResultTy parseInvNum(OperandVector &Operands);
 
-  MipsAsmParser::OperandMatchResultTy ParseLSAImm(OperandVector &Operands);
+  MipsAsmParser::OperandMatchResultTy parseLSAImm(OperandVector &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseRegisterList (OperandVector  &Operands);
 
   bool searchSymbolAlias(OperandVector &Operands);
 
-  bool ParseOperand(OperandVector &, StringRef Mnemonic);
+  bool parseOperand(OperandVector &, StringRef Mnemonic);
 
   bool needsExpansion(MCInst &Inst);
 
@@ -130,6 +169,9 @@
   bool expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
                             SmallVectorImpl<MCInst> &Instructions);
 
+  void expandLoadAddressSym(MCInst &Inst, SMLoc IDLoc,
+                            SmallVectorImpl<MCInst> &Instructions);
+
   void expandMemInst(MCInst &Inst, SMLoc IDLoc,
                      SmallVectorImpl<MCInst> &Instructions, bool isLoad,
                      bool isImmOpnd);
@@ -142,8 +184,10 @@
   const MCExpr *evaluateRelocExpr(const MCExpr *Expr, StringRef RelocStr);
 
   bool isEvaluated(const MCExpr *Expr);
+  bool parseSetMips0Directive();
+  bool parseSetArchDirective();
   bool parseSetFeature(uint64_t Feature);
-  bool parseDirectiveCPLoad(SMLoc Loc);
+  bool parseDirectiveCpLoad(SMLoc Loc);
   bool parseDirectiveCPSetup();
   bool parseDirectiveNaN();
   bool parseDirectiveSet();
@@ -153,10 +197,16 @@
   bool parseSetNoAtDirective();
   bool parseSetMacroDirective();
   bool parseSetNoMacroDirective();
+  bool parseSetMsaDirective();
+  bool parseSetNoMsaDirective();
+  bool parseSetNoDspDirective();
   bool parseSetReorderDirective();
   bool parseSetNoReorderDirective();
+  bool parseSetMips16Directive();
   bool parseSetNoMips16Directive();
   bool parseSetFpDirective();
+  bool parseSetPopDirective();
+  bool parseSetPushDirective();
 
   bool parseSetAssignment();
 
@@ -174,6 +224,8 @@
 
   int matchCPURegisterName(StringRef Symbol);
 
+  int matchHWRegsRegisterName(StringRef Symbol);
+
   int matchRegisterByNumber(unsigned RegNum, unsigned RegClass);
 
   int matchFPURegisterName(StringRef Name);
@@ -200,18 +252,51 @@
   // Example: INSERT.B $w0[n], $1 => 16 > n >= 0
   bool validateMSAIndex(int Val, int RegKind);
 
-  void setFeatureBits(unsigned Feature, StringRef FeatureString) {
+  // Selects a new architecture by updating the FeatureBits with the necessary
+  // info including implied dependencies.
+  // Internally, it clears all the feature bits related to *any* architecture
+  // and selects the new one using the ToggleFeature functionality of the
+  // MCSubtargetInfo object that handles implied dependencies. The reason we
+  // clear all the arch related bits manually is because ToggleFeature only
+  // clears the features that imply the feature being cleared and not the
+  // features implied by the feature being cleared. This is easier to see
+  // with an example:
+  //  --------------------------------------------------
+  // | Feature         | Implies                        |
+  // | -------------------------------------------------|
+  // | FeatureMips1    | None                           |
+  // | FeatureMips2    | FeatureMips1                   |
+  // | FeatureMips3    | FeatureMips2 | FeatureMipsGP64 |
+  // | FeatureMips4    | FeatureMips3                   |
+  // | ...             |                                |
+  //  --------------------------------------------------
+  //
+  // Setting Mips3 is equivalent to setting (FeatureMips3 | FeatureMips2 |
+  // FeatureMipsGP64 | FeatureMips1).
+  // Clearing Mips3 is equivalent to clearing (FeatureMips3 | FeatureMips4).
+  void selectArch(StringRef ArchFeature) {
+    // Drop every architecture-related bit, then toggle the requested feature.
+    STI.setFeatureBits(STI.getFeatureBits() &
+                       ~MipsAssemblerOptions::AllArchRelatedMask);
+    uint64_t ToggledBits = STI.ToggleFeature(ArchFeature);
+    setAvailableFeatures(ComputeAvailableFeatures(ToggledBits));
+    AssemblerOptions.back()->setFeatures(getAvailableFeatures());
+  }
+
+  void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
     if (!(STI.getFeatureBits() & Feature)) {
       setAvailableFeatures(
           ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
     }
+    AssemblerOptions.back()->setFeatures(getAvailableFeatures());
   }
 
-  void clearFeatureBits(unsigned Feature, StringRef FeatureString) {
+  void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
     if (STI.getFeatureBits() & Feature) {
       setAvailableFeatures(
           ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
     }
+    AssemblerOptions.back()->setFeatures(getAvailableFeatures());
   }
 
 public:
@@ -225,9 +310,19 @@
 
   MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
                 const MCInstrInfo &MII, const MCTargetOptions &Options)
-      : MCTargetAsmParser(), STI(sti), Parser(parser) {
+      : MCTargetAsmParser(), STI(sti) {
+    MCAsmParserExtension::Initialize(parser);
+
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+    
+    // Remember the initial assembler options. The user cannot modify these.
+    AssemblerOptions.push_back(
+                     make_unique<MipsAssemblerOptions>(getAvailableFeatures()));
+    
+    // Create an assembler options environment for the user to modify.
+    AssemblerOptions.push_back(
+                     make_unique<MipsAssemblerOptions>(getAvailableFeatures()));
 
     getTargetStreamer().updateABIInfo(*this);
 
@@ -237,12 +332,11 @@
             ((STI.getFeatureBits() & Mips::FeatureN32) != 0) +
             ((STI.getFeatureBits() & Mips::FeatureN64) != 0)) == 1);
 
-    if (!isABI_O32() && !allowOddSPReg() != 0)
+    if (!isABI_O32() && !useOddSPReg() != 0)
       report_fatal_error("-mno-odd-spreg requires the O32 ABI");
-  }
 
-  MCAsmParser &getParser() const { return Parser; }
-  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+    CurrentFn = nullptr;
+  }
 
   /// True if all of $fcc0 - $fcc7 exist for the current ISA.
   bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); }
@@ -252,9 +346,9 @@
   bool isABI_N32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
   bool isABI_N64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
   bool isABI_O32() const { return STI.getFeatureBits() & Mips::FeatureO32; }
-  bool isABI_FPXX() const { return false; } // TODO: add check for FeatureXX
+  bool isABI_FPXX() const { return STI.getFeatureBits() & Mips::FeatureFPXX; }
 
-  bool allowOddSPReg() const {
+  bool useOddSPReg() const {
     return !(STI.getFeatureBits() & Mips::FeatureNoOddSPReg);
   }
 
@@ -292,10 +386,10 @@
     return STI.getFeatureBits() & Mips::FeatureMips16;
   }
   // TODO: see how can we get this info.
-  bool mipsSEUsesSoftFloat() const { return false; }
+  bool abiUsesSoftFloat() const { return false; }
 
   /// Warn if RegNo is the current assembler temporary.
-  void WarnIfAssemblerTemporary(int RegNo, SMLoc Loc);
+  void warnIfAssemblerTemporary(int RegNo, SMLoc Loc);
 };
 }
 
@@ -333,7 +427,8 @@
     k_Memory,        /// Base + Offset Memory Address
     k_PhysRegister,  /// A physical register from the Mips namespace
     k_RegisterIndex, /// A register index in one or more RegKind.
-    k_Token          /// A simple token
+    k_Token,         /// A simple token
+    k_RegList        /// A physical register list
   } Kind;
 
 public:
@@ -368,12 +463,17 @@
     const MCExpr *Off;
   };
 
+  struct RegListOp {
+    SmallVector<unsigned, 10> *List; // heap-allocated by CreateRegList()
+  };
+
   union {
     struct Token Tok;
     struct PhysRegOp PhysReg;
     struct RegIdxOp RegIdx;
     struct ImmOp Imm;
     struct MemOp Mem;
+    struct RegListOp RegList;
   };
 
   SMLoc StartLoc, EndLoc;
@@ -397,7 +497,15 @@
   /// target.
   unsigned getGPR32Reg() const {
     assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
-    AsmParser.WarnIfAssemblerTemporary(RegIdx.Index, StartLoc);
+    AsmParser.warnIfAssemblerTemporary(RegIdx.Index, StartLoc);
+    unsigned ClassID = Mips::GPR32RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
+  /// Coerce the register to GPR32 and return the real register for the current
+  /// target. Unlike getGPR32Reg(), no assembler-temporary warning is issued.
+  unsigned getGPRMM16Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_GPR) && "Invalid access!");
     unsigned ClassID = Mips::GPR32RegClassID;
     return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
   }
@@ -550,6 +658,11 @@
     Inst.addOperand(MCOperand::CreateReg(getGPR32Reg()));
   }
 
+  void addGPRMM16AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getGPRMM16Reg()));
+  }
+
   /// Render the operand to an MCInst as a GPR64
   /// Asserts if the wrong number of operands are requested, or the operand
   /// is not a k_RegisterIndex compatible with RegKind_GPR
@@ -572,7 +685,7 @@
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::CreateReg(getFGR32Reg()));
     // FIXME: We ought to do this for -integrated-as without -via-file-asm too.
-    if (!AsmParser.allowOddSPReg() && RegIdx.Index & 1)
+    if (!AsmParser.useOddSPReg() && RegIdx.Index & 1)
       AsmParser.Error(StartLoc, "-mno-odd-spreg prohibits the use of odd FPU "
                                 "registers");
   }
@@ -647,6 +760,13 @@
     addExpr(Inst, Expr);
   }
 
+  /// Append every register of the list to \p Inst as a register operand.
+  void addRegListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    for (unsigned RegNo : getRegList())
+      Inst.addOperand(MCOperand::CreateReg(RegNo));
+  }
+
   bool isReg() const override {
     // As a special case until we sort out the definition of div/divu, pretend
     // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
@@ -679,6 +799,7 @@
     int64_t Val = getConstantImm();
     return 1 <= Val && Val <= 4;
   }
+  bool isRegList() const { return Kind == k_RegList; }
 
   StringRef getToken() const {
     assert(Kind == k_Token && "Invalid access!");
@@ -720,6 +841,11 @@
     return static_cast<const MCConstantExpr *>(getMemOff())->getValue();
   }
 
+  const SmallVectorImpl<unsigned> &getRegList() const {
+    assert(isRegList() && "Invalid access!"); // only k_RegList has a list
+    return *RegList.List;
+  }
+
   static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
                                                   MipsAsmParser &Parser) {
     auto Op = make_unique<MipsOperand>(k_Token, Parser);
@@ -733,16 +859,16 @@
   /// Create a numeric register (e.g. $1). The exact register remains
   /// unresolved until an instruction successfully matches
   static std::unique_ptr<MipsOperand>
-  CreateNumericReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+  createNumericReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
                    SMLoc E, MipsAsmParser &Parser) {
-    DEBUG(dbgs() << "CreateNumericReg(" << Index << ", ...)\n");
+    DEBUG(dbgs() << "createNumericReg(" << Index << ", ...)\n");
     return CreateReg(Index, RegKind_Numeric, RegInfo, S, E, Parser);
   }
 
   /// Create a register that is definitely a GPR.
   /// This is typically only used for named registers such as $gp.
   static std::unique_ptr<MipsOperand>
-  CreateGPRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+  createGPRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
                MipsAsmParser &Parser) {
     return CreateReg(Index, RegKind_GPR, RegInfo, S, E, Parser);
   }
@@ -750,15 +876,23 @@
   /// Create a register that is definitely a FGR.
   /// This is typically only used for named registers such as $f0.
   static std::unique_ptr<MipsOperand>
-  CreateFGRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+  createFGRReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
                MipsAsmParser &Parser) {
     return CreateReg(Index, RegKind_FGR, RegInfo, S, E, Parser);
   }
 
+  /// Create a register operand of the hardware-register kind.
+  /// Typically reached only for named registers such as $hwr_cpunum.
+  static std::unique_ptr<MipsOperand>
+  createHWRegsReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+                  SMLoc E, MipsAsmParser &Parser) {
+    return CreateReg(Index, RegKind_HWRegs, RegInfo, S, E, Parser);
+  }
+
   /// Create a register that is definitely an FCC.
   /// This is typically only used for named registers such as $fcc0.
   static std::unique_ptr<MipsOperand>
-  CreateFCCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+  createFCCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
                MipsAsmParser &Parser) {
     return CreateReg(Index, RegKind_FCC, RegInfo, S, E, Parser);
   }
@@ -766,7 +900,7 @@
   /// Create a register that is definitely an ACC.
   /// This is typically only used for named registers such as $ac0.
   static std::unique_ptr<MipsOperand>
-  CreateACCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
+  createACCReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S, SMLoc E,
                MipsAsmParser &Parser) {
     return CreateReg(Index, RegKind_ACC, RegInfo, S, E, Parser);
   }
@@ -774,7 +908,7 @@
   /// Create a register that is definitely an MSA128.
   /// This is typically only used for named registers such as $w0.
   static std::unique_ptr<MipsOperand>
-  CreateMSA128Reg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+  createMSA128Reg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
                   SMLoc E, MipsAsmParser &Parser) {
     return CreateReg(Index, RegKind_MSA128, RegInfo, S, E, Parser);
   }
@@ -782,7 +916,7 @@
   /// Create a register that is definitely an MSACtrl.
   /// This is typically only used for named registers such as $msaaccess.
   static std::unique_ptr<MipsOperand>
-  CreateMSACtrlReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
+  createMSACtrlReg(unsigned Index, const MCRegisterInfo *RegInfo, SMLoc S,
                    SMLoc E, MipsAsmParser &Parser) {
     return CreateReg(Index, RegKind_MSACtrl, RegInfo, S, E, Parser);
   }
@@ -807,9 +941,29 @@
     return Op;
   }
 
+  /// Build a k_RegList operand that holds its own heap copy of \p Regs; the
+  /// k_RegList case that deletes RegList.List is what releases that copy.
+  static std::unique_ptr<MipsOperand>
+  CreateRegList(SmallVectorImpl<unsigned> &Regs, SMLoc StartLoc, SMLoc EndLoc,
+                MipsAsmParser &Parser) {
+    assert(!Regs.empty() && "Empty list not allowed");
+    auto Operand = make_unique<MipsOperand>(k_RegList, Parser);
+    Operand->RegList.List =
+        new SmallVector<unsigned, 10>(Regs.begin(), Regs.end());
+    Operand->StartLoc = StartLoc;
+    Operand->EndLoc = EndLoc;
+    return Operand;
+  }
+
   bool isGPRAsmReg() const {
     return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31;
   }
+  /// GPR-kind index in $2-$7/$16/$17 (getGPRMM16Reg() asserts the GPR bit).
+  bool isMM16AsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_GPR &&
+           ((RegIdx.Index >= 2 && RegIdx.Index <= 7) || RegIdx.Index == 16 ||
+            RegIdx.Index == 17);
+  }
   bool isFGRAsmReg() const {
     // AFGR64 is $0-$15 but we handle this in getAFGR64()
     return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31;
@@ -855,6 +1009,8 @@
     case k_Memory:
       delete Mem.Base;
       break;
+    case k_RegList:
+      delete RegList.List;
     case k_PhysRegister:
     case k_RegisterIndex:
     case k_Token:
@@ -885,6 +1041,12 @@
     case k_Token:
       OS << Tok.Data;
       break;
+    case k_RegList:
+      OS << "RegList< ";
+      for (auto Reg : (*RegList.List))
+        OS << Reg << " ";
+      OS <<  ">";
+      break;
     }
   }
 }; // class MipsOperand
@@ -897,6 +1059,19 @@
   return MipsInsts[Opcode];
 }
 
+/// Return true when \p Opcode is one of the five microMIPS opcodes tested
+/// below (JALS, JALRS, JALRS16, BGEZALS, BLTZALS).
+/// processInstruction pads these with a MOVE16_MM rather than an SLL when it
+/// emits the nop that follows a delay-slot instruction.
+static bool hasShortDelaySlot(unsigned Opcode) {
+  const bool IsShortJump = Opcode == Mips::JALS_MM ||
+                           Opcode == Mips::JALRS_MM ||
+                           Opcode == Mips::JALRS16_MM;
+  const bool IsShortBranch = Opcode == Mips::BGEZALS_MM ||
+                             Opcode == Mips::BLTZALS_MM;
+  return IsShortJump || IsShortBranch;
+}
+
 bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
                                        SmallVectorImpl<MCInst> &Instructions) {
   const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
@@ -961,15 +1136,21 @@
                                                       "nop instruction");
   }
 
-  if (MCID.hasDelaySlot() && Options.isReorder()) {
+  if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) {
     // If this instruction has a delay slot and .set reorder is active,
     // emit a NOP after it.
     Instructions.push_back(Inst);
     MCInst NopInst;
-    NopInst.setOpcode(Mips::SLL);
-    NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
-    NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
-    NopInst.addOperand(MCOperand::CreateImm(0));
+    if (hasShortDelaySlot(Inst.getOpcode())) {
+      NopInst.setOpcode(Mips::MOVE16_MM);
+      NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+      NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+    } else {
+      NopInst.setOpcode(Mips::SLL);
+      NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+      NopInst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+      NopInst.addOperand(MCOperand::CreateImm(0));
+    }
     Instructions.push_back(NopInst);
     return false;
   }
@@ -1008,6 +1189,80 @@
     } // for
   }   // if load/store
 
+  // TODO: Handle this with the AsmOperandClass.PredicateMethod.
+  if (inMicroMipsMode()) {
+    MCOperand Opnd;
+    int Imm;
+
+    switch (Inst.getOpcode()) {
+      default:
+        break;
+      case Mips::ADDIUS5_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < -8 || Imm > 7)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::ADDIUSP_MM:
+        Opnd = Inst.getOperand(0);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < -1032 || Imm > 1028 || (Imm < 8 && Imm > -12) ||
+            Imm % 4 != 0)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::SLL16_MM:
+      case Mips::SRL16_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < 1 || Imm > 8)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::LI16_MM:
+        Opnd = Inst.getOperand(1);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (Imm < -1 || Imm > 126)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::ADDIUR2_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (!(Imm == 1 || Imm == -1 ||
+              ((Imm % 4 == 0) && Imm < 28 && Imm > 0)))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::ADDIUR1SP_MM:
+        Opnd = Inst.getOperand(1);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (OffsetToAlignment(Imm, 4LL))
+          return Error(IDLoc, "misaligned immediate operand value");
+        if (Imm < 0 || Imm > 255)
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+      case Mips::ANDI16_MM:
+        Opnd = Inst.getOperand(2);
+        if (!Opnd.isImm())
+          return Error(IDLoc, "expected immediate operand kind");
+        Imm = Opnd.getImm();
+        if (!(Imm == 128 || (Imm >= 1 && Imm <= 4) || Imm == 7 || Imm == 8 ||
+              Imm == 15 || Imm == 16 || Imm == 31 || Imm == 32 || Imm == 63 ||
+              Imm == 64 || Imm == 255 || Imm == 32768 || Imm == 65535))
+          return Error(IDLoc, "immediate operand value out of range");
+        break;
+    }
+  }
+
   if (needsExpansion(Inst))
     return expandInstruction(Inst, IDLoc, Instructions);
   else
@@ -1039,7 +1294,7 @@
     return expandLoadImm(Inst, IDLoc, Instructions);
   case Mips::LoadImm64Reg:
     if (!isGP64bit()) {
-      Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+      Error(IDLoc, "instruction requires a 64-bit architecture");
       return true;
     }
     return expandLoadImm(Inst, IDLoc, Instructions);
@@ -1051,8 +1306,8 @@
 }
 
 namespace {
-template <int Shift, bool PerformShift>
-void createShiftOr(int64_t Value, unsigned RegNo, SMLoc IDLoc,
+template <bool PerformShift>
+void createShiftOr(MCOperand Operand, unsigned RegNo, SMLoc IDLoc,
                    SmallVectorImpl<MCInst> &Instructions) {
   MCInst tmpInst;
   if (PerformShift) {
@@ -1067,11 +1322,18 @@
   tmpInst.setOpcode(Mips::ORi);
   tmpInst.addOperand(MCOperand::CreateReg(RegNo));
   tmpInst.addOperand(MCOperand::CreateReg(RegNo));
-  tmpInst.addOperand(
-      MCOperand::CreateImm(((Value & (0xffffLL << Shift)) >> Shift)));
+  tmpInst.addOperand(Operand);
   tmpInst.setLoc(IDLoc);
   Instructions.push_back(tmpInst);
 }
+
+template <int Shift, bool PerformShift>
+void createShiftOr(int64_t Value, unsigned RegNo, SMLoc IDLoc,
+                   SmallVectorImpl<MCInst> &Instructions) {
+  int64_t Field = (Value & (0xffffLL << Shift)) >> Shift; // 16 bits at Shift
+  createShiftOr<PerformShift>(MCOperand::CreateImm(Field), RegNo, IDLoc,
+                              Instructions);
+}
 }
 
 bool MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
@@ -1114,7 +1376,7 @@
     createShiftOr<0, false>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
   } else if ((ImmValue & (0xffffLL << 48)) == 0) {
     if (!isGP64bit()) {
-      Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+      Error(IDLoc, "instruction requires a 64-bit architecture");
       return true;
     }
 
@@ -1141,7 +1403,7 @@
     createShiftOr<0, true>(ImmValue, RegOp.getReg(), IDLoc, Instructions);
   } else {
     if (!isGP64bit()) {
-      Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+      Error(IDLoc, "instruction requires a 64-bit architecture");
       return true;
     }
 
@@ -1176,7 +1438,12 @@
                                     SmallVectorImpl<MCInst> &Instructions) {
   MCInst tmpInst;
   const MCOperand &ImmOp = Inst.getOperand(2);
-  assert(ImmOp.isImm() && "expected immediate operand kind");
+  assert((ImmOp.isImm() || ImmOp.isExpr()) &&
+         "expected immediate operand kind");
+  if (!ImmOp.isImm()) {
+    expandLoadAddressSym(Inst, IDLoc, Instructions);
+    return false;
+  }
   const MCOperand &SrcRegOp = Inst.getOperand(1);
   assert(SrcRegOp.isReg() && "expected register operand kind");
   const MCOperand &DstRegOp = Inst.getOperand(0);
@@ -1220,7 +1487,12 @@
                                     SmallVectorImpl<MCInst> &Instructions) {
   MCInst tmpInst;
   const MCOperand &ImmOp = Inst.getOperand(1);
-  assert(ImmOp.isImm() && "expected immediate operand kind");
+  assert((ImmOp.isImm() || ImmOp.isExpr()) &&
+         "expected immediate operand kind");
+  if (!ImmOp.isImm()) {
+    expandLoadAddressSym(Inst, IDLoc, Instructions);
+    return false;
+  }
   const MCOperand &RegOp = Inst.getOperand(0);
   assert(RegOp.isReg() && "expected register operand kind");
   int ImmValue = ImmOp.getImm();
@@ -1250,6 +1522,71 @@
   return false;
 }
 
+/// Expand a load-address pseudo whose address operand is a symbol expression
+/// into the LUi/ORi-based sequences spelled out in the body; every emitted
+/// instruction is tagged with \p IDLoc.
+void
+MipsAsmParser::expandLoadAddressSym(MCInst &Inst, SMLoc IDLoc,
+                                    SmallVectorImpl<MCInst> &Instructions) {
+  // FIXME: If we do have a valid at register to use, we should generate a
+  // slightly shorter sequence here.
+  int ExprOperandNo = 1;
+  // Sometimes the assembly parser will get the immediate expression as
+  // a $zero + an immediate.
+  if (Inst.getNumOperands() == 3) {
+    assert(Inst.getOperand(1).getReg() ==
+           (isGP64bit() ? Mips::ZERO_64 : Mips::ZERO));
+    ExprOperandNo = 2;
+  }
+  const MCOperand &SymOp = Inst.getOperand(ExprOperandNo);
+  assert(SymOp.isExpr() && "expected symbol operand kind");
+  unsigned RegNo = Inst.getOperand(0).getReg();
+  // cast<> asserts unless the expression is a plain symbol reference.
+  StringRef SymName =
+      cast<MCSymbolRefExpr>(SymOp.getExpr())->getSymbol().getName();
+  const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
+      SymName, MCSymbolRefExpr::VK_Mips_ABS_HI, getContext());
+  const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create(
+      SymName, MCSymbolRefExpr::VK_Mips_ABS_LO, getContext());
+
+  // Both sequences open with a LUi into the destination register; only the
+  // expression operand differs.
+  MCInst LuiInst;
+  LuiInst.setOpcode(Mips::LUi);
+  LuiInst.addOperand(MCOperand::CreateReg(RegNo));
+  LuiInst.setLoc(IDLoc);
+  if (isGP64bit()) {
+    // If it's a 64-bit architecture, expand to:
+    // la d,sym => lui  d,highest(sym)
+    //             ori  d,d,higher(sym)
+    //             dsll d,d,16
+    //             ori  d,d,hi16(sym)
+    //             dsll d,d,16
+    //             ori  d,d,lo16(sym)
+    const MCSymbolRefExpr *HighestExpr = MCSymbolRefExpr::Create(
+        SymName, MCSymbolRefExpr::VK_Mips_HIGHEST, getContext());
+    const MCSymbolRefExpr *HigherExpr = MCSymbolRefExpr::Create(
+        SymName, MCSymbolRefExpr::VK_Mips_HIGHER, getContext());
+
+    LuiInst.addOperand(MCOperand::CreateExpr(HighestExpr));
+    Instructions.push_back(LuiInst);
+    createShiftOr<false>(MCOperand::CreateExpr(HigherExpr), RegNo, IDLoc,
+                         Instructions);
+    createShiftOr<true>(MCOperand::CreateExpr(HiExpr), RegNo, IDLoc,
+                        Instructions);
+    createShiftOr<true>(MCOperand::CreateExpr(LoExpr), RegNo, IDLoc,
+                        Instructions);
+  } else {
+    // Otherwise, expand to:
+    // la d,sym => lui  d,hi16(sym)
+    //             ori  d,d,lo16(sym)
+    LuiInst.addOperand(MCOperand::CreateExpr(HiExpr));
+    Instructions.push_back(LuiInst);
+    createShiftOr<false>(MCOperand::CreateExpr(LoExpr), RegNo, IDLoc,
+                         Instructions);
+  }
+}
+
 void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
                                   SmallVectorImpl<MCInst> &Instructions,
                                   bool isLoad, bool isImmOpnd) {
@@ -1381,7 +1718,7 @@
 bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                             OperandVector &Operands,
                                             MCStreamer &Out,
-                                            unsigned &ErrorInfo,
+                                            uint64_t &ErrorInfo,
                                             bool MatchingInlineAsm) {
 
   MCInst Inst;
@@ -1404,7 +1741,7 @@
     return true;
   case Match_InvalidOperand: {
     SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0U) {
+    if (ErrorInfo != ~0ULL) {
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction");
 
@@ -1423,16 +1760,25 @@
   return true;
 }
 
-void MipsAsmParser::WarnIfAssemblerTemporary(int RegIndex, SMLoc Loc) {
-  if ((RegIndex != 0) && ((int)Options.getATRegNum() == RegIndex)) {
+void MipsAsmParser::warnIfAssemblerTemporary(int RegIndex, SMLoc Loc) {
+  if ((RegIndex != 0) && 
+      ((int)AssemblerOptions.back()->getATRegNum() == RegIndex)) {
     if (RegIndex == 1)
-      Warning(Loc, "Used $at without \".set noat\"");
+      Warning(Loc, "used $at without \".set noat\"");
     else
-      Warning(Loc, Twine("Used $") + Twine(RegIndex) + " with \".set at=$" +
+      Warning(Loc, Twine("used $") + Twine(RegIndex) + " with \".set at=$" +
                        Twine(RegIndex) + "\"");
   }
 }
 
+/// Emit \p Msg as a warning over \p Range with \p FixMsg attached as a fix-it.
+void MipsAsmParser::printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg,
+                                          SMRange Range, bool ShowColors) {
+  SMFixIt FixIt(Range, FixMsg);
+  getSourceManager().PrintMessage(Range.Start, SourceMgr::DK_Warning, Msg,
+                                  Range, FixIt, ShowColors);
+}
+
 int MipsAsmParser::matchCPURegisterName(StringRef Name) {
   int CC;
 
@@ -1472,24 +1818,56 @@
            .Case("t9", 25)
            .Default(-1);
 
-  if (isABI_N32() || isABI_N64()) {
-    // Although SGI documentation just cuts out t0-t3 for n32/n64,
-    // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
-    // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7.
-    if (8 <= CC && CC <= 11)
-      CC += 4;
+  if (!(isABI_N32() || isABI_N64()))
+    return CC;
 
-    if (CC == -1)
-      CC = StringSwitch<unsigned>(Name)
-               .Case("a4", 8)
-               .Case("a5", 9)
-               .Case("a6", 10)
-               .Case("a7", 11)
-               .Case("kt0", 26)
-               .Case("kt1", 27)
-               .Default(-1);
+  if (12 <= CC && CC <= 15) {
+    // Name is one of t4-t7
+    AsmToken RegTok = getLexer().peekTok();
+    SMRange RegRange = RegTok.getLocRange();
+
+    StringRef FixedName = StringSwitch<StringRef>(Name)
+                              .Case("t4", "t0")
+                              .Case("t5", "t1")
+                              .Case("t6", "t2")
+                              .Case("t7", "t3")
+                              .Default("");
+    assert(FixedName != "" &&  "Register name is not one of t4-t7.");
+
+    printWarningWithFixIt("register names $t4-$t7 are only available in O32.",
+                          "Did you mean $" + FixedName + "?", RegRange);
   }
 
+  // Although SGI documentation just cuts out t0-t3 for n32/n64,
+  // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
+  // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7.
+  if (8 <= CC && CC <= 11)
+    CC += 4;
+
+  if (CC == -1)
+    CC = StringSwitch<unsigned>(Name)
+             .Case("a4", 8)
+             .Case("a5", 9)
+             .Case("a6", 10)
+             .Case("a7", 11)
+             .Case("kt0", 26)
+             .Case("kt1", 27)
+             .Default(-1);
+
+  return CC;
+}
+
+/// Map a hardware register name (spelled without the leading '$') to its
+/// number, or return -1 when \p Name is not one of the names listed below.
+int MipsAsmParser::matchHWRegsRegisterName(StringRef Name) {
+  int CC = StringSwitch<unsigned>(Name)
+               .Case("hwr_cpunum", 0)
+               .Case("hwr_synci_step", 1)
+               .Case("hwr_cc", 2)
+               .Case("hwr_ccres", 3)
+               .Case("hwr_ulr", 29)
+               .Default(-1);
+
   return CC;
 }
 
@@ -1568,15 +1946,15 @@
   if (Reg > 31)
     return false;
 
-  aTReg = Reg;
+  ATReg = Reg;
   return true;
 }
 
 int MipsAsmParser::getATReg(SMLoc Loc) {
-  int AT = Options.getATRegNum();
+  int AT = AssemblerOptions.back()->getATRegNum();
   if (AT == 0)
     reportParseError(Loc,
-                     "Pseudo instruction requires $at, which is not available");
+                     "pseudo-instruction requires $at, which is not available");
   return AT;
 }
 
@@ -1597,8 +1975,9 @@
   return getReg(RegClass, RegNum);
 }
 
-bool MipsAsmParser::ParseOperand(OperandVector &Operands, StringRef Mnemonic) {
-  DEBUG(dbgs() << "ParseOperand\n");
+bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "parseOperand\n");
 
   // Check if the current operand has a custom associated parser, if so, try to
   // custom parse the operand, or fallback to the general approach.
@@ -1626,7 +2005,7 @@
     // for div, divu, and similar instructions because it is not an operand
     // to the instruction definition but an explicit register. Special case
     // this situation for now.
-    if (ParseAnyRegister(Operands) != MatchOperand_NoMatch)
+    if (parseAnyRegister(Operands) != MatchOperand_NoMatch)
       return false;
 
     // Maybe it is a symbol reference.
@@ -1651,7 +2030,7 @@
   case AsmToken::Tilde:
   case AsmToken::String: {
     DEBUG(dbgs() << ".. generic integer\n");
-    OperandMatchResultTy ResTy = ParseImm(Operands);
+    OperandMatchResultTy ResTy = parseImm(Operands);
     return ResTy != MatchOperand_Success;
   }
   case AsmToken::Percent: {
@@ -1696,7 +2075,7 @@
       Val = ((MCE->getValue() + 0x800080008000LL) >> 48) & 0xffff;
       break;
     default:
-      report_fatal_error("Unsupported reloc value!");
+      report_fatal_error("unsupported reloc value");
     }
     return MCConstantExpr::Create(Val, getContext());
   }
@@ -1753,6 +2132,7 @@
 }
 
 bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
+  MCAsmParser &Parser = getParser();
   Parser.Lex();                          // Eat the % token.
   const AsmToken &Tok = Parser.getTok(); // Get next token, operation.
   if (Tok.isNot(AsmToken::Identifier))
@@ -1797,7 +2177,7 @@
 bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
                                   SMLoc &EndLoc) {
   SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
-  OperandMatchResultTy ResTy = ParseAnyRegister(Operands);
+  OperandMatchResultTy ResTy = parseAnyRegister(Operands);
   if (ResTy == MatchOperand_Success) {
     assert(Operands.size() == 1);
     MipsOperand &Operand = static_cast<MipsOperand &>(*Operands.front());
@@ -1821,6 +2201,7 @@
 }
 
 bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
+  MCAsmParser &Parser = getParser();
   SMLoc S;
   bool Result = true;
 
@@ -1850,6 +2231,7 @@
 
 MipsAsmParser::OperandMatchResultTy
 MipsAsmParser::parseMemOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   DEBUG(dbgs() << "parseMemOperand\n");
   const MCExpr *IdVal = nullptr;
   SMLoc S;
@@ -1882,7 +2264,7 @@
 
         // Zero register assumed, add a memory operand with ZERO as its base.
         // "Base" will be managed by k_Memory.
-        auto Base = MipsOperand::CreateGPRReg(0, getContext().getRegisterInfo(),
+        auto Base = MipsOperand::createGPRReg(0, getContext().getRegisterInfo(),
                                               S, E, *this);
         Operands.push_back(
             MipsOperand::CreateMem(std::move(Base), IdVal, S, E, *this));
@@ -1895,7 +2277,7 @@
     Parser.Lex(); // Eat the '(' token.
   }
 
-  Res = ParseAnyRegister(Operands);
+  Res = parseAnyRegister(Operands);
   if (Res != MatchOperand_Success)
     return Res;
 
@@ -1932,7 +2314,7 @@
 }
 
 bool MipsAsmParser::searchSymbolAlias(OperandVector &Operands) {
-
+  MCAsmParser &Parser = getParser();
   MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
   if (Sym) {
     SMLoc S = Parser.getTok().getLoc();
@@ -1943,10 +2325,10 @@
       return false;
     if (Expr->getKind() == MCExpr::SymbolRef) {
       const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
-      const StringRef DefSymbol = Ref->getSymbol().getName();
+      StringRef DefSymbol = Ref->getSymbol().getName();
       if (DefSymbol.startswith("$")) {
         OperandMatchResultTy ResTy =
-            MatchAnyRegisterNameWithoutDollar(Operands, DefSymbol.substr(1), S);
+            matchAnyRegisterNameWithoutDollar(Operands, DefSymbol.substr(1), S);
         if (ResTy == MatchOperand_Success) {
           Parser.Lex();
           return true;
@@ -1966,47 +2348,54 @@
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::MatchAnyRegisterNameWithoutDollar(OperandVector &Operands,
+MipsAsmParser::matchAnyRegisterNameWithoutDollar(OperandVector &Operands,
                                                  StringRef Identifier,
                                                  SMLoc S) {
   int Index = matchCPURegisterName(Identifier);
   if (Index != -1) {
-    Operands.push_back(MipsOperand::CreateGPRReg(
+    Operands.push_back(MipsOperand::createGPRReg(
+        Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
+    return MatchOperand_Success;
+  }
+
+  Index = matchHWRegsRegisterName(Identifier);
+  if (Index != -1) {
+    Operands.push_back(MipsOperand::createHWRegsReg(
         Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
     return MatchOperand_Success;
   }
 
   Index = matchFPURegisterName(Identifier);
   if (Index != -1) {
-    Operands.push_back(MipsOperand::CreateFGRReg(
+    Operands.push_back(MipsOperand::createFGRReg(
         Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
     return MatchOperand_Success;
   }
 
   Index = matchFCCRegisterName(Identifier);
   if (Index != -1) {
-    Operands.push_back(MipsOperand::CreateFCCReg(
+    Operands.push_back(MipsOperand::createFCCReg(
         Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
     return MatchOperand_Success;
   }
 
   Index = matchACRegisterName(Identifier);
   if (Index != -1) {
-    Operands.push_back(MipsOperand::CreateACCReg(
+    Operands.push_back(MipsOperand::createACCReg(
         Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
     return MatchOperand_Success;
   }
 
   Index = matchMSA128RegisterName(Identifier);
   if (Index != -1) {
-    Operands.push_back(MipsOperand::CreateMSA128Reg(
+    Operands.push_back(MipsOperand::createMSA128Reg(
         Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
     return MatchOperand_Success;
   }
 
   Index = matchMSA128CtrlRegisterName(Identifier);
   if (Index != -1) {
-    Operands.push_back(MipsOperand::CreateMSACtrlReg(
+    Operands.push_back(MipsOperand::createMSACtrlReg(
         Index, getContext().getRegisterInfo(), S, getLexer().getLoc(), *this));
     return MatchOperand_Success;
   }
@@ -2015,18 +2404,19 @@
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::MatchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
+MipsAsmParser::matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S) {
+  MCAsmParser &Parser = getParser();
   auto Token = Parser.getLexer().peekTok(false);
 
   if (Token.is(AsmToken::Identifier)) {
     DEBUG(dbgs() << ".. identifier\n");
     StringRef Identifier = Token.getIdentifier();
     OperandMatchResultTy ResTy =
-        MatchAnyRegisterNameWithoutDollar(Operands, Identifier, S);
+        matchAnyRegisterNameWithoutDollar(Operands, Identifier, S);
     return ResTy;
   } else if (Token.is(AsmToken::Integer)) {
     DEBUG(dbgs() << ".. integer\n");
-    Operands.push_back(MipsOperand::CreateNumericReg(
+    Operands.push_back(MipsOperand::createNumericReg(
         Token.getIntVal(), getContext().getRegisterInfo(), S, Token.getLoc(),
         *this));
     return MatchOperand_Success;
@@ -2038,8 +2428,9 @@
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::ParseAnyRegister(OperandVector &Operands) {
-  DEBUG(dbgs() << "ParseAnyRegister\n");
+MipsAsmParser::parseAnyRegister(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "parseAnyRegister\n");
 
   auto Token = Parser.getTok();
 
@@ -2056,7 +2447,7 @@
   }
   DEBUG(dbgs() << ".. $\n");
 
-  OperandMatchResultTy ResTy = MatchAnyRegisterWithoutDollar(Operands, S);
+  OperandMatchResultTy ResTy = matchAnyRegisterWithoutDollar(Operands, S);
   if (ResTy == MatchOperand_Success) {
     Parser.Lex(); // $
     Parser.Lex(); // identifier
@@ -2065,7 +2456,8 @@
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::ParseImm(OperandVector &Operands) {
+MipsAsmParser::parseImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   switch (getLexer().getKind()) {
   default:
     return MatchOperand_NoMatch;
@@ -2089,18 +2481,19 @@
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::ParseJumpTarget(OperandVector &Operands) {
-  DEBUG(dbgs() << "ParseJumpTarget\n");
+MipsAsmParser::parseJumpTarget(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  DEBUG(dbgs() << "parseJumpTarget\n");
 
   SMLoc S = getLexer().getLoc();
 
   // Integers and expressions are acceptable
-  OperandMatchResultTy ResTy = ParseImm(Operands);
+  OperandMatchResultTy ResTy = parseImm(Operands);
   if (ResTy != MatchOperand_NoMatch)
     return ResTy;
 
   // Registers are a valid target and have priority over symbols.
-  ResTy = ParseAnyRegister(Operands);
+  ResTy = parseAnyRegister(Operands);
   if (ResTy != MatchOperand_NoMatch)
     return ResTy;
 
@@ -2116,6 +2509,7 @@
 
 MipsAsmParser::OperandMatchResultTy
 MipsAsmParser::parseInvNum(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   const MCExpr *IdVal;
   // If the first token is '$' we may have register operand.
   if (Parser.getTok().is(AsmToken::Dollar))
@@ -2133,7 +2527,8 @@
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::ParseLSAImm(OperandVector &Operands) {
+MipsAsmParser::parseLSAImm(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   switch (getLexer().getKind()) {
   default:
     return MatchOperand_NoMatch;
@@ -2171,6 +2566,82 @@
   return MatchOperand_Success;
 }
 
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseRegisterList(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
+  SmallVector<unsigned, 10> Regs;
+  unsigned RegNo;
+  unsigned PrevReg = Mips::NoRegister;
+  bool RegRange = false;
+  SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> TmpOperands;
+
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_ParseFail;
+
+  SMLoc S = Parser.getTok().getLoc();
+  while (parseAnyRegister(TmpOperands) == MatchOperand_Success) {
+    SMLoc E = getLexer().getLoc();
+    MipsOperand &Reg = static_cast<MipsOperand &>(*TmpOperands.back());
+    RegNo = isGP64bit() ? Reg.getGPR64Reg() : Reg.getGPR32Reg();
+    if (RegRange) {
+      // Remove last register operand because registers from register range
+      // should be inserted first.
+      if (RegNo == Mips::RA) {
+        Regs.push_back(RegNo);
+      } else {
+        unsigned TmpReg = PrevReg + 1;
+        while (TmpReg <= RegNo) {
+          if ((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) {
+            Error(E, "invalid register operand");
+            return MatchOperand_ParseFail;
+          }
+
+          PrevReg = TmpReg;
+          Regs.push_back(TmpReg++);
+        }
+      }
+
+      RegRange = false;
+    } else {
+      if ((PrevReg == Mips::NoRegister) && (RegNo != Mips::S0) &&
+          (RegNo != Mips::RA)) {
+        Error(E, "$16 or $31 expected");
+        return MatchOperand_ParseFail;
+      } else if (((RegNo < Mips::S0) || (RegNo > Mips::S7)) &&
+                 (RegNo != Mips::FP) && (RegNo != Mips::RA)) {
+        Error(E, "invalid register operand");
+        return MatchOperand_ParseFail;
+      } else if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) &&
+                 (RegNo != Mips::FP) && (RegNo != Mips::RA)) {
+        Error(E, "consecutive register numbers expected");
+        return MatchOperand_ParseFail;
+      }
+
+      Regs.push_back(RegNo);
+    }
+
+    if (Parser.getTok().is(AsmToken::Minus))
+      RegRange = true;
+
+    if (!Parser.getTok().isNot(AsmToken::Minus) &&
+        !Parser.getTok().isNot(AsmToken::Comma)) {
+      Error(E, "',' or '-' expected");
+      return MatchOperand_ParseFail;
+    }
+
+    Lex(); // Consume comma or minus
+    if (Parser.getTok().isNot(AsmToken::Dollar))
+      break;
+
+    PrevReg = RegNo;
+  }
+
+  SMLoc E = Parser.getTok().getLoc();
+  Operands.push_back(MipsOperand::CreateRegList(Regs, S, E, *this));
+  parseMemOperand(Operands);
+  return MatchOperand_Success;
+}
+
 MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
 
   MCSymbolRefExpr::VariantKind VK =
@@ -2212,12 +2683,13 @@
 /// ::= '(', register, ')'
 /// handle it before we iterate so we don't get tripped up by the lack of
 /// a comma.
-bool MipsAsmParser::ParseParenSuffix(StringRef Name, OperandVector &Operands) {
+bool MipsAsmParser::parseParenSuffix(StringRef Name, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().is(AsmToken::LParen)) {
     Operands.push_back(
         MipsOperand::CreateToken("(", getLexer().getLoc(), *this));
     Parser.Lex();
-    if (ParseOperand(Operands, Name)) {
+    if (parseOperand(Operands, Name)) {
       SMLoc Loc = getLexer().getLoc();
       Parser.eatToEndOfStatement();
       return Error(Loc, "unexpected token in argument list");
@@ -2240,13 +2712,14 @@
 /// ::= '[', integer, ']'
 /// handle it before we iterate so we don't get tripped up by the lack of
 /// a comma.
-bool MipsAsmParser::ParseBracketSuffix(StringRef Name,
+bool MipsAsmParser::parseBracketSuffix(StringRef Name,
                                        OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().is(AsmToken::LBrac)) {
     Operands.push_back(
         MipsOperand::CreateToken("[", getLexer().getLoc(), *this));
     Parser.Lex();
-    if (ParseOperand(Operands, Name)) {
+    if (parseOperand(Operands, Name)) {
       SMLoc Loc = getLexer().getLoc();
       Parser.eatToEndOfStatement();
       return Error(Loc, "unexpected token in argument list");
@@ -2265,14 +2738,16 @@
 
 bool MipsAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                                      SMLoc NameLoc, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   DEBUG(dbgs() << "ParseInstruction\n");
-  // We have reached first instruction, module directive after
-  // this is forbidden.
-  getTargetStreamer().setCanHaveModuleDir(false);
+
+  // We have reached first instruction, module directive are now forbidden.
+  getTargetStreamer().forbidModuleDirective();
+
   // Check if we have valid mnemonic
   if (!mnemonicIsValid(Name, 0)) {
     Parser.eatToEndOfStatement();
-    return Error(NameLoc, "Unknown instruction");
+    return Error(NameLoc, "unknown instruction");
   }
   // First operand in MCInst is instruction mnemonic.
   Operands.push_back(MipsOperand::CreateToken(Name, NameLoc, *this));
@@ -2280,29 +2755,29 @@
   // Read the remaining operands.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     // Read the first operand.
-    if (ParseOperand(Operands, Name)) {
+    if (parseOperand(Operands, Name)) {
       SMLoc Loc = getLexer().getLoc();
       Parser.eatToEndOfStatement();
       return Error(Loc, "unexpected token in argument list");
     }
-    if (getLexer().is(AsmToken::LBrac) && ParseBracketSuffix(Name, Operands))
+    if (getLexer().is(AsmToken::LBrac) && parseBracketSuffix(Name, Operands))
       return true;
     // AFAIK, parenthesis suffixes are never on the first operand
 
     while (getLexer().is(AsmToken::Comma)) {
       Parser.Lex(); // Eat the comma.
       // Parse and remember the operand.
-      if (ParseOperand(Operands, Name)) {
+      if (parseOperand(Operands, Name)) {
         SMLoc Loc = getLexer().getLoc();
         Parser.eatToEndOfStatement();
         return Error(Loc, "unexpected token in argument list");
       }
       // Parse bracket and parenthesis suffixes before we iterate
       if (getLexer().is(AsmToken::LBrac)) {
-        if (ParseBracketSuffix(Name, Operands))
+        if (parseBracketSuffix(Name, Operands))
           return true;
       } else if (getLexer().is(AsmToken::LParen) &&
-                 ParseParenSuffix(Name, Operands))
+                 parseParenSuffix(Name, Operands))
         return true;
     }
   }
@@ -2316,6 +2791,7 @@
 }
 
 bool MipsAsmParser::reportParseError(Twine ErrorMsg) {
+  MCAsmParser &Parser = getParser();
   SMLoc Loc = getLexer().getLoc();
   Parser.eatToEndOfStatement();
   return Error(Loc, ErrorMsg);
@@ -2326,14 +2802,15 @@
 }
 
 bool MipsAsmParser::parseSetNoAtDirective() {
+  MCAsmParser &Parser = getParser();
   // Line should look like: ".set noat".
   // set at reg to 0.
-  Options.setATReg(0);
+  AssemblerOptions.back()->setATReg(0);
   // eat noat
   Parser.Lex();
   // If this is not the end of the statement, report an error.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    reportParseError("unexpected token in statement");
+    reportParseError("unexpected token, expected end of statement");
     return false;
   }
   Parser.Lex(); // Consume the EndOfStatement.
@@ -2341,18 +2818,19 @@
 }
 
 bool MipsAsmParser::parseSetAtDirective() {
+  MCAsmParser &Parser = getParser();
   // Line can be .set at - defaults to $1
   // or .set at=$reg
   int AtRegNo;
   getParser().Lex();
   if (getLexer().is(AsmToken::EndOfStatement)) {
-    Options.setATReg(1);
+    AssemblerOptions.back()->setATReg(1);
     Parser.Lex(); // Consume the EndOfStatement.
     return false;
   } else if (getLexer().is(AsmToken::Equal)) {
     getParser().Lex(); // Eat the '='.
     if (getLexer().isNot(AsmToken::Dollar)) {
-      reportParseError("unexpected token in statement");
+      reportParseError("unexpected token, expected dollar sign '$'");
       return false;
     }
     Parser.Lex(); // Eat the '$'.
@@ -2362,7 +2840,7 @@
     } else if (Reg.is(AsmToken::Integer)) {
       AtRegNo = Reg.getIntVal();
     } else {
-      reportParseError("unexpected token in statement");
+      reportParseError("unexpected token, expected identifier or integer");
       return false;
     }
 
@@ -2371,14 +2849,14 @@
       return false;
     }
 
-    if (!Options.setATReg(AtRegNo)) {
-      reportParseError("unexpected token in statement");
+    if (!AssemblerOptions.back()->setATReg(AtRegNo)) {
+      reportParseError("invalid register");
       return false;
     }
     getParser().Lex(); // Eat the register.
 
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
-      reportParseError("unexpected token in statement");
+      reportParseError("unexpected token, expected end of statement");
       return false;
     }
     Parser.Lex(); // Consume the EndOfStatement.
@@ -2390,72 +2868,138 @@
 }
 
 bool MipsAsmParser::parseSetReorderDirective() {
+  MCAsmParser &Parser = getParser();
   Parser.Lex();
   // If this is not the end of the statement, report an error.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    reportParseError("unexpected token in statement");
+    reportParseError("unexpected token, expected end of statement");
     return false;
   }
-  Options.setReorder();
+  AssemblerOptions.back()->setReorder();
   getTargetStreamer().emitDirectiveSetReorder();
   Parser.Lex(); // Consume the EndOfStatement.
   return false;
 }
 
 bool MipsAsmParser::parseSetNoReorderDirective() {
+  MCAsmParser &Parser = getParser();
   Parser.Lex();
   // If this is not the end of the statement, report an error.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    reportParseError("unexpected token in statement");
+    reportParseError("unexpected token, expected end of statement");
     return false;
   }
-  Options.setNoreorder();
+  AssemblerOptions.back()->setNoReorder();
   getTargetStreamer().emitDirectiveSetNoReorder();
   Parser.Lex(); // Consume the EndOfStatement.
   return false;
 }
 
 bool MipsAsmParser::parseSetMacroDirective() {
+  MCAsmParser &Parser = getParser();
   Parser.Lex();
   // If this is not the end of the statement, report an error.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    reportParseError("unexpected token in statement");
+    reportParseError("unexpected token, expected end of statement");
     return false;
   }
-  Options.setMacro();
+  AssemblerOptions.back()->setMacro();
   Parser.Lex(); // Consume the EndOfStatement.
   return false;
 }
 
 bool MipsAsmParser::parseSetNoMacroDirective() {
+  MCAsmParser &Parser = getParser();
   Parser.Lex();
   // If this is not the end of the statement, report an error.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+  if (AssemblerOptions.back()->isReorder()) {
     reportParseError("`noreorder' must be set before `nomacro'");
     return false;
   }
-  if (Options.isReorder()) {
-    reportParseError("`noreorder' must be set before `nomacro'");
+  AssemblerOptions.back()->setNoMacro();
+  Parser.Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MipsAsmParser::parseSetMsaDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  setFeatureBits(Mips::FeatureMSA, "msa");
+  getTargetStreamer().emitDirectiveSetMsa();
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoMsaDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  clearFeatureBits(Mips::FeatureMSA, "msa");
+  getTargetStreamer().emitDirectiveSetNoMsa();
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoDspDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "nodsp".
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
     return false;
   }
-  Options.setNomacro();
+
+  clearFeatureBits(Mips::FeatureDSP, "dsp");
+  getTargetStreamer().emitDirectiveSetNoDsp();
+  return false;
+}
+
+bool MipsAsmParser::parseSetMips16Directive() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "mips16".
+
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  setFeatureBits(Mips::FeatureMips16, "mips16");
+  getTargetStreamer().emitDirectiveSetMips16();
   Parser.Lex(); // Consume the EndOfStatement.
   return false;
 }
 
 bool MipsAsmParser::parseSetNoMips16Directive() {
-  Parser.Lex();
+  MCAsmParser &Parser = getParser();
+  Parser.Lex(); // Eat "nomips16".
+
   // If this is not the end of the statement, report an error.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    reportParseError("unexpected token in statement");
+    reportParseError("unexpected token, expected end of statement");
     return false;
   }
-  // For now do nothing.
+
+  clearFeatureBits(Mips::FeatureMips16, "mips16");
+  getTargetStreamer().emitDirectiveSetNoMips16();
   Parser.Lex(); // Consume the EndOfStatement.
   return false;
 }
 
 bool MipsAsmParser::parseSetFpDirective() {
+  MCAsmParser &Parser = getParser();
   MipsABIFlagsSection::FpABIKind FpAbiVal;
   // Line can be: .set fp=32
   //              .set fp=xx
@@ -2463,7 +3007,7 @@
   Parser.Lex(); // Eat fp token
   AsmToken Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Equal)) {
-    reportParseError("unexpected token in statement");
+    reportParseError("unexpected token, expected equals sign '='");
     return false;
   }
   Parser.Lex(); // Eat '=' token.
@@ -2473,7 +3017,7 @@
     return false;
 
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    reportParseError("unexpected token in statement");
+    reportParseError("unexpected token, expected end of statement");
     return false;
   }
   getTargetStreamer().emitDirectiveSetFp(FpAbiVal);
@@ -2481,15 +3025,50 @@
   return false;
 }
 
+bool MipsAsmParser::parseSetPopDirective() {
+  MCAsmParser &Parser = getParser();
+  SMLoc Loc = getLexer().getLoc();
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  // Always keep an element on the options "stack" to prevent the user
+  // from changing the initial options. This is how we remember them.
+  if (AssemblerOptions.size() == 2)
+    return reportParseError(Loc, ".set pop with no .set push");
+
+  AssemblerOptions.pop_back();
+  setAvailableFeatures(AssemblerOptions.back()->getFeatures());
+
+  getTargetStreamer().emitDirectiveSetPop();
+  return false;
+}
+
+bool MipsAsmParser::parseSetPushDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
+
+  // Create a copy of the current assembler options environment and push it.
+  AssemblerOptions.push_back(
+              make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
+
+  getTargetStreamer().emitDirectiveSetPush();
+  return false;
+}
+
 bool MipsAsmParser::parseSetAssignment() {
   StringRef Name;
   const MCExpr *Value;
+  MCAsmParser &Parser = getParser();
 
   if (Parser.parseIdentifier(Name))
     reportParseError("expected identifier after .set");
 
   if (getLexer().isNot(AsmToken::Comma))
-    return reportParseError("unexpected token in .set directive");
+    return reportParseError("unexpected token, expected comma");
   Lex(); // Eat comma
 
   if (Parser.parseExpression(Value))
@@ -2505,10 +3084,61 @@
   return false;
 }
 
-bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
+bool MipsAsmParser::parseSetMips0Directive() {
+  MCAsmParser &Parser = getParser();
   Parser.Lex();
   if (getLexer().isNot(AsmToken::EndOfStatement))
-    return reportParseError("unexpected token in .set directive");
+    return reportParseError("unexpected token, expected end of statement");
+
+  // Reset assembler options to their initial values.
+  setAvailableFeatures(AssemblerOptions.front()->getFeatures());
+  AssemblerOptions.back()->setFeatures(AssemblerOptions.front()->getFeatures());
+
+  getTargetStreamer().emitDirectiveSetMips0();
+  return false;
+}
+
+bool MipsAsmParser::parseSetArchDirective() {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::Equal))
+    return reportParseError("unexpected token, expected equals sign");
+
+  Parser.Lex();
+  StringRef Arch;
+  if (Parser.parseIdentifier(Arch))
+    return reportParseError("expected arch identifier");
+
+  StringRef ArchFeatureName =
+      StringSwitch<StringRef>(Arch)
+          .Case("mips1", "mips1")
+          .Case("mips2", "mips2")
+          .Case("mips3", "mips3")
+          .Case("mips4", "mips4")
+          .Case("mips5", "mips5")
+          .Case("mips32", "mips32")
+          .Case("mips32r2", "mips32r2")
+          .Case("mips32r6", "mips32r6")
+          .Case("mips64", "mips64")
+          .Case("mips64r2", "mips64r2")
+          .Case("mips64r6", "mips64r6")
+          .Case("cnmips", "cnmips")
+          .Case("r4000", "mips3") // This is an implementation of Mips3.
+          .Default("");
+
+  if (ArchFeatureName.empty())
+    return reportParseError("unsupported architecture");
+
+  selectArch(ArchFeatureName);
+  getTargetStreamer().emitDirectiveSetArch(Arch);
+  return false;
+}
+
+bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
+  MCAsmParser &Parser = getParser();
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return reportParseError("unexpected token, expected end of statement");
 
   switch (Feature) {
   default:
@@ -2520,26 +3150,56 @@
   case Mips::FeatureMicroMips:
     getTargetStreamer().emitDirectiveSetMicroMips();
     break;
-  case Mips::FeatureMips16:
-    getTargetStreamer().emitDirectiveSetMips16();
+  case Mips::FeatureMips1:
+    selectArch("mips1");
+    getTargetStreamer().emitDirectiveSetMips1();
+    break;
+  case Mips::FeatureMips2:
+    selectArch("mips2");
+    getTargetStreamer().emitDirectiveSetMips2();
+    break;
+  case Mips::FeatureMips3:
+    selectArch("mips3");
+    getTargetStreamer().emitDirectiveSetMips3();
+    break;
+  case Mips::FeatureMips4:
+    selectArch("mips4");
+    getTargetStreamer().emitDirectiveSetMips4();
+    break;
+  case Mips::FeatureMips5:
+    selectArch("mips5");
+    getTargetStreamer().emitDirectiveSetMips5();
+    break;
+  case Mips::FeatureMips32:
+    selectArch("mips32");
+    getTargetStreamer().emitDirectiveSetMips32();
     break;
   case Mips::FeatureMips32r2:
-    setFeatureBits(Mips::FeatureMips32r2, "mips32r2");
+    selectArch("mips32r2");
     getTargetStreamer().emitDirectiveSetMips32R2();
     break;
+  case Mips::FeatureMips32r6:
+    selectArch("mips32r6");
+    getTargetStreamer().emitDirectiveSetMips32R6();
+    break;
   case Mips::FeatureMips64:
-    setFeatureBits(Mips::FeatureMips64, "mips64");
+    selectArch("mips64");
     getTargetStreamer().emitDirectiveSetMips64();
     break;
   case Mips::FeatureMips64r2:
-    setFeatureBits(Mips::FeatureMips64r2, "mips64r2");
+    selectArch("mips64r2");
     getTargetStreamer().emitDirectiveSetMips64R2();
     break;
+  case Mips::FeatureMips64r6:
+    selectArch("mips64r6");
+    getTargetStreamer().emitDirectiveSetMips64R6();
+    break;
   }
   return false;
 }
 
 bool MipsAsmParser::eatComma(StringRef ErrorStr) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::Comma)) {
     SMLoc Loc = getLexer().getLoc();
     Parser.eatToEndOfStatement();
@@ -2550,14 +3210,17 @@
   return true;
 }
 
-bool MipsAsmParser::parseDirectiveCPLoad(SMLoc Loc) {
-  if (Options.isReorder())
-    Warning(Loc, ".cpload in reorder section");
+bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) {
+  if (AssemblerOptions.back()->isReorder())
+    Warning(Loc, ".cpload should be inside a noreorder section");
 
-  // FIXME: Warn if cpload is used in Mips16 mode.
+  if (inMips16Mode()) {
+    reportParseError(".cpload is not supported in Mips16 mode");
+    return false;
+  }
 
   SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg;
-  OperandMatchResultTy ResTy = ParseAnyRegister(Reg);
+  OperandMatchResultTy ResTy = parseAnyRegister(Reg);
   if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
     reportParseError("expected register containing function address");
     return false;
@@ -2569,17 +3232,24 @@
     return false;
   }
 
-  getTargetStreamer().emitDirectiveCpload(RegOpnd.getGPR32Reg());
+  // If this is not the end of the statement, report an error.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token, expected end of statement");
+    return false;
+  }
+
+  getTargetStreamer().emitDirectiveCpLoad(RegOpnd.getGPR32Reg());
   return false;
 }
 
 bool MipsAsmParser::parseDirectiveCPSetup() {
+  MCAsmParser &Parser = getParser();
   unsigned FuncReg;
   unsigned Save;
   bool SaveIsReg = true;
 
   SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
-  OperandMatchResultTy ResTy = ParseAnyRegister(TmpReg);
+  OperandMatchResultTy ResTy = parseAnyRegister(TmpReg);
   if (ResTy == MatchOperand_NoMatch) {
     reportParseError("expected register containing function address");
     Parser.eatToEndOfStatement();
@@ -2596,10 +3266,10 @@
   FuncReg = FuncRegOpnd.getGPR32Reg();
   TmpReg.clear();
 
-  if (!eatComma("expected comma parsing directive"))
+  if (!eatComma("unexpected token, expected comma"))
     return true;
 
-  ResTy = ParseAnyRegister(TmpReg);
+  ResTy = parseAnyRegister(TmpReg);
   if (ResTy == MatchOperand_NoMatch) {
     const AsmToken &Tok = Parser.getTok();
     if (Tok.is(AsmToken::Integer)) {
@@ -2621,7 +3291,7 @@
     Save = SaveOpnd.getGPR32Reg();
   }
 
-  if (!eatComma("expected comma parsing directive"))
+  if (!eatComma("unexpected token, expected comma"))
     return true;
 
   StringRef Name;
@@ -2634,6 +3304,7 @@
 }
 
 bool MipsAsmParser::parseDirectiveNaN() {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     const AsmToken &Tok = Parser.getTok();
 
@@ -2654,7 +3325,7 @@
 }
 
 bool MipsAsmParser::parseDirectiveSet() {
-
+  MCAsmParser &Parser = getParser();
   // Get the next token.
   const AsmToken &Tok = Parser.getTok();
 
@@ -2662,8 +3333,14 @@
     return parseSetNoAtDirective();
   } else if (Tok.getString() == "at") {
     return parseSetAtDirective();
+  } else if (Tok.getString() == "arch") {
+    return parseSetArchDirective();
   } else if (Tok.getString() == "fp") {
     return parseSetFpDirective();
+  } else if (Tok.getString() == "pop") {
+    return parseSetPopDirective();
+  } else if (Tok.getString() == "push") {
+    return parseSetPushDirective();
   } else if (Tok.getString() == "reorder") {
     return parseSetReorderDirective();
   } else if (Tok.getString() == "noreorder") {
@@ -2673,7 +3350,7 @@
   } else if (Tok.getString() == "nomacro") {
     return parseSetNoMacroDirective();
   } else if (Tok.getString() == "mips16") {
-    return parseSetFeature(Mips::FeatureMips16);
+    return parseSetMips16Directive();
   } else if (Tok.getString() == "nomips16") {
     return parseSetNoMips16Directive();
   } else if (Tok.getString() == "nomicromips") {
@@ -2682,14 +3359,38 @@
     return false;
   } else if (Tok.getString() == "micromips") {
     return parseSetFeature(Mips::FeatureMicroMips);
+  } else if (Tok.getString() == "mips0") {
+    return parseSetMips0Directive();
+  } else if (Tok.getString() == "mips1") {
+    return parseSetFeature(Mips::FeatureMips1);
+  } else if (Tok.getString() == "mips2") {
+    return parseSetFeature(Mips::FeatureMips2);
+  } else if (Tok.getString() == "mips3") {
+    return parseSetFeature(Mips::FeatureMips3);
+  } else if (Tok.getString() == "mips4") {
+    return parseSetFeature(Mips::FeatureMips4);
+  } else if (Tok.getString() == "mips5") {
+    return parseSetFeature(Mips::FeatureMips5);
+  } else if (Tok.getString() == "mips32") {
+    return parseSetFeature(Mips::FeatureMips32);
   } else if (Tok.getString() == "mips32r2") {
     return parseSetFeature(Mips::FeatureMips32r2);
+  } else if (Tok.getString() == "mips32r6") {
+    return parseSetFeature(Mips::FeatureMips32r6);
   } else if (Tok.getString() == "mips64") {
     return parseSetFeature(Mips::FeatureMips64);
   } else if (Tok.getString() == "mips64r2") {
     return parseSetFeature(Mips::FeatureMips64r2);
+  } else if (Tok.getString() == "mips64r6") {
+    return parseSetFeature(Mips::FeatureMips64r6);
   } else if (Tok.getString() == "dsp") {
     return parseSetFeature(Mips::FeatureDSP);
+  } else if (Tok.getString() == "nodsp") {
+    return parseSetNoDspDirective();
+  } else if (Tok.getString() == "msa") {
+    return parseSetMsaDirective();
+  } else if (Tok.getString() == "nomsa") {
+    return parseSetNoMsaDirective();
   } else {
     // It is just an identifier, look for an assignment.
     parseSetAssignment();
@@ -2702,6 +3403,7 @@
 /// parseDataDirective
 ///  ::= .word [ expression (, expression)* ]
 bool MipsAsmParser::parseDataDirective(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
       const MCExpr *Value;
@@ -2713,9 +3415,8 @@
       if (getLexer().is(AsmToken::EndOfStatement))
         break;
 
-      // FIXME: Improve diagnostic.
       if (getLexer().isNot(AsmToken::Comma))
-        return Error(L, "unexpected token in directive");
+        return Error(L, "unexpected token, expected comma");
       Parser.Lex();
     }
   }
@@ -2727,6 +3428,7 @@
 /// parseDirectiveGpWord
 ///  ::= .gpword local_sym
 bool MipsAsmParser::parseDirectiveGpWord() {
+  MCAsmParser &Parser = getParser();
   const MCExpr *Value;
   // EmitGPRel32Value requires an expression, so we are using base class
   // method to evaluate the expression.
@@ -2735,7 +3437,8 @@
   getParser().getStreamer().EmitGPRel32Value(Value);
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
-    return Error(getLexer().getLoc(), "unexpected token in directive");
+    return Error(getLexer().getLoc(), 
+                "unexpected token, expected end of statement");
   Parser.Lex(); // Eat EndOfStatement token.
   return false;
 }
@@ -2743,6 +3446,7 @@
 /// parseDirectiveGpDWord
 ///  ::= .gpdword local_sym
 bool MipsAsmParser::parseDirectiveGpDWord() {
+  MCAsmParser &Parser = getParser();
   const MCExpr *Value;
   // EmitGPRel64Value requires an expression, so we are using base class
   // method to evaluate the expression.
@@ -2751,17 +3455,19 @@
   getParser().getStreamer().EmitGPRel64Value(Value);
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
-    return Error(getLexer().getLoc(), "unexpected token in directive");
+    return Error(getLexer().getLoc(), 
+                "unexpected token, expected end of statement");
   Parser.Lex(); // Eat EndOfStatement token.
   return false;
 }
 
 bool MipsAsmParser::parseDirectiveOption() {
+  MCAsmParser &Parser = getParser();
   // Get the option token.
   AsmToken Tok = Parser.getTok();
   // At the moment only identifiers are supported.
   if (Tok.isNot(AsmToken::Identifier)) {
-    Error(Parser.getTok().getLoc(), "unexpected token in .option directive");
+    Error(Parser.getTok().getLoc(), "unexpected token, expected identifier");
     Parser.eatToEndOfStatement();
     return false;
   }
@@ -2773,7 +3479,7 @@
     Parser.Lex();
     if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
       Error(Parser.getTok().getLoc(),
-            "unexpected token in .option pic0 directive");
+            "unexpected token, expected end of statement");
       Parser.eatToEndOfStatement();
     }
     return false;
@@ -2784,14 +3490,15 @@
     Parser.Lex();
     if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
       Error(Parser.getTok().getLoc(),
-            "unexpected token in .option pic2 directive");
+            "unexpected token, expected end of statement");
       Parser.eatToEndOfStatement();
     }
     return false;
   }
 
   // Unknown option.
-  Warning(Parser.getTok().getLoc(), "unknown option in .option directive");
+  Warning(Parser.getTok().getLoc(), 
+          "unknown option, expected 'pic0' or 'pic2'");
   Parser.eatToEndOfStatement();
   return false;
 }
@@ -2801,10 +3508,11 @@
 ///  ::= .module nooddspreg
 ///  ::= .module fp=value
 bool MipsAsmParser::parseDirectiveModule() {
+  MCAsmParser &Parser = getParser();
   MCAsmLexer &Lexer = getLexer();
   SMLoc L = Lexer.getLoc();
 
-  if (!getTargetStreamer().getCanHaveModuleDir()) {
+  if (!getTargetStreamer().isModuleDirectiveAllowed()) {
     // TODO : get a better message.
     reportParseError(".module directive must appear before any code");
     return false;
@@ -2819,7 +3527,7 @@
       clearFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
 
       if (getLexer().isNot(AsmToken::EndOfStatement)) {
-        reportParseError("Expected end of statement");
+        reportParseError("unexpected token, expected end of statement");
         return false;
       }
 
@@ -2834,7 +3542,7 @@
       setFeatureBits(Mips::FeatureNoOddSPReg, "nooddspreg");
 
       if (getLexer().isNot(AsmToken::EndOfStatement)) {
-        reportParseError("Expected end of statement");
+        reportParseError("unexpected token, expected end of statement");
         return false;
       }
 
@@ -2854,10 +3562,11 @@
 ///  ::= =xx
 ///  ::= =64
 bool MipsAsmParser::parseDirectiveModuleFP() {
+  MCAsmParser &Parser = getParser();
   MCAsmLexer &Lexer = getLexer();
 
   if (Lexer.isNot(AsmToken::Equal)) {
-    reportParseError("unexpected token in statement");
+    reportParseError("unexpected token, expected equals sign '='");
     return false;
   }
   Parser.Lex(); // Eat '=' token.
@@ -2867,7 +3576,7 @@
     return false;
 
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    reportParseError("unexpected token in statement");
+    reportParseError("unexpected token, expected end of statement");
     return false;
   }
 
@@ -2879,6 +3588,7 @@
 
 bool MipsAsmParser::parseFpABIValue(MipsABIFlagsSection::FpABIKind &FpABI,
                                     StringRef Directive) {
+  MCAsmParser &Parser = getParser();
   MCAsmLexer &Lexer = getLexer();
 
   if (Lexer.is(AsmToken::Identifier)) {
@@ -2925,30 +3635,160 @@
 }
 
 bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
+  MCAsmParser &Parser = getParser();
   StringRef IDVal = DirectiveID.getString();
 
   if (IDVal == ".cpload")
-    return parseDirectiveCPLoad(DirectiveID.getLoc());
+    return parseDirectiveCpLoad(DirectiveID.getLoc());
   if (IDVal == ".dword") {
     parseDataDirective(8, DirectiveID.getLoc());
     return false;
   }
-
   if (IDVal == ".ent") {
-    // Ignore this directive for now.
-    Parser.Lex();
+    StringRef SymbolName;
+
+    if (Parser.parseIdentifier(SymbolName)) {
+      reportParseError("expected identifier after .ent");
+      return false;
+    }
+
+    // There's an undocumented extension that allows an integer to
+    // follow the name of the procedure which AFAICS is ignored by GAS.
+    // Example: .ent foo,2
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      if (getLexer().isNot(AsmToken::Comma)) {
+        // Even though we accept this undocumented extension for compatibility
+        // reasons, the additional integer argument does not actually change
+        // the behaviour of the '.ent' directive, so we would like to discourage
+        // its use. We do this by not referring to the extended version in
+        // error messages which are not directly related to its use.
+        reportParseError("unexpected token, expected end of statement");
+        return false;
+      }
+      Parser.Lex(); // Eat the comma.
+      const MCExpr *DummyNumber;
+      int64_t DummyNumberVal;
+      // If the user was explicitly trying to use the extended version,
+      // we still give helpful extension-related error messages.
+      if (Parser.parseExpression(DummyNumber)) {
+        reportParseError("expected number after comma");
+        return false;
+      }
+      if (!DummyNumber->EvaluateAsAbsolute(DummyNumberVal)) {
+        reportParseError("expected an absolute expression after comma");
+        return false;
+      }
+    }
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    MCSymbol *Sym = getContext().GetOrCreateSymbol(SymbolName);
+
+    getTargetStreamer().emitDirectiveEnt(*Sym);
+    CurrentFn = Sym;
     return false;
   }
 
   if (IDVal == ".end") {
-    // Ignore this directive for now.
-    Parser.Lex();
+    StringRef SymbolName;
+
+    if (Parser.parseIdentifier(SymbolName)) {
+      reportParseError("expected identifier after .end");
+      return false;
+    }
+
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    if (CurrentFn == nullptr) {
+      reportParseError(".end used without .ent");
+      return false;
+    }
+
+    if ((SymbolName != CurrentFn->getName())) {
+      reportParseError(".end symbol does not match .ent symbol");
+      return false;
+    }
+
+    getTargetStreamer().emitDirectiveEnd(SymbolName);
+    CurrentFn = nullptr;
     return false;
   }
 
   if (IDVal == ".frame") {
-    // Ignore this directive for now.
-    Parser.eatToEndOfStatement();
+    // .frame $stack_reg, frame_size_in_bytes, $return_reg
+    SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> TmpReg;
+    OperandMatchResultTy ResTy = parseAnyRegister(TmpReg);
+    if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+      reportParseError("expected stack register");
+      return false;
+    }
+
+    MipsOperand &StackRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+    if (!StackRegOpnd.isGPRAsmReg()) {
+      reportParseError(StackRegOpnd.getStartLoc(),
+                       "expected general purpose register");
+      return false;
+    }
+    unsigned StackReg = StackRegOpnd.getGPR32Reg();
+
+    if (Parser.getTok().is(AsmToken::Comma))
+      Parser.Lex();
+    else {
+      reportParseError("unexpected token, expected comma");
+      return false;
+    }
+
+    // Parse the frame size.
+    const MCExpr *FrameSize;
+    int64_t FrameSizeVal;
+
+    if (Parser.parseExpression(FrameSize)) {
+      reportParseError("expected frame size value");
+      return false;
+    }
+
+    if (!FrameSize->EvaluateAsAbsolute(FrameSizeVal)) {
+      reportParseError("frame size not an absolute expression");
+      return false;
+    }
+
+    if (Parser.getTok().is(AsmToken::Comma))
+      Parser.Lex();
+    else {
+      reportParseError("unexpected token, expected comma");
+      return false;
+    }
+
+    // Parse the return register.
+    TmpReg.clear();
+    ResTy = parseAnyRegister(TmpReg);
+    if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+      reportParseError("expected return register");
+      return false;
+    }
+
+    MipsOperand &ReturnRegOpnd = static_cast<MipsOperand &>(*TmpReg[0]);
+    if (!ReturnRegOpnd.isGPRAsmReg()) {
+      reportParseError(ReturnRegOpnd.getStartLoc(),
+                       "expected general purpose register");
+      return false;
+    }
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    getTargetStreamer().emitFrame(StackReg, FrameSizeVal,
+                                  ReturnRegOpnd.getGPR32Reg());
     return false;
   }
 
@@ -2956,15 +3796,61 @@
     return parseDirectiveSet();
   }
 
-  if (IDVal == ".fmask") {
-    // Ignore this directive for now.
-    Parser.eatToEndOfStatement();
-    return false;
-  }
+  if (IDVal == ".mask" || IDVal == ".fmask") {
+    // .mask bitmask, frame_offset
+    // bitmask: One bit for each register used.
+    // frame_offset: Offset from Canonical Frame Address ($sp on entry) where
+    //               first register is expected to be saved.
+    // Examples:
+    //   .mask 0x80000000, -4
+    //   .fmask 0x80000000, -4
+    //
 
-  if (IDVal == ".mask") {
-    // Ignore this directive for now.
-    Parser.eatToEndOfStatement();
+    // Parse the bitmask
+    const MCExpr *BitMask;
+    int64_t BitMaskVal;
+
+    if (Parser.parseExpression(BitMask)) {
+      reportParseError("expected bitmask value");
+      return false;
+    }
+
+    if (!BitMask->EvaluateAsAbsolute(BitMaskVal)) {
+      reportParseError("bitmask not an absolute expression");
+      return false;
+    }
+
+    if (Parser.getTok().is(AsmToken::Comma))
+      Parser.Lex();
+    else {
+      reportParseError("unexpected token, expected comma");
+      return false;
+    }
+
+    // Parse the frame_offset
+    const MCExpr *FrameOffset;
+    int64_t FrameOffsetVal;
+
+    if (Parser.parseExpression(FrameOffset)) {
+      reportParseError("expected frame offset value");
+      return false;
+    }
+
+    if (!FrameOffset->EvaluateAsAbsolute(FrameOffsetVal)) {
+      reportParseError("frame offset not an absolute expression");
+      return false;
+    }
+
+    // If this is not the end of the statement, report an error.
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token, expected end of statement");
+      return false;
+    }
+
+    if (IDVal == ".mask")
+      getTargetStreamer().emitMask(BitMaskVal, FrameOffsetVal);
+    else
+      getTargetStreamer().emitFMask(BitMaskVal, FrameOffsetVal);
     return false;
   }
 
@@ -2992,7 +3878,8 @@
   if (IDVal == ".abicalls") {
     getTargetStreamer().emitDirectiveAbiCalls();
     if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
-      Error(Parser.getTok().getLoc(), "unexpected token in directive");
+      Error(Parser.getTok().getLoc(), 
+            "unexpected token, expected end of statement");
       // Clear line
       Parser.eatToEndOfStatement();
     }

diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index bf67d71..1f201b0 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt

@@ -3,8 +3,7 @@
 tablegen(LLVM MipsGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM MipsGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM MipsGenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM MipsGenCodeEmitter.inc -gen-emitter)
-tablegen(LLVM MipsGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM MipsGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM MipsGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM MipsGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM MipsGenFastISel.inc -gen-fast-isel)
@@ -22,13 +21,13 @@
   Mips16ISelDAGToDAG.cpp
   Mips16ISelLowering.cpp
   Mips16RegisterInfo.cpp
+  MipsABIInfo.cpp
   MipsAnalyzeImmediate.cpp
   MipsAsmPrinter.cpp
-  MipsCodeEmitter.cpp
+  MipsCCState.cpp
   MipsConstantIslandPass.cpp
   MipsDelaySlotFiller.cpp
   MipsFastISel.cpp
-  MipsJITInfo.cpp
   MipsInstrInfo.cpp
   MipsISelDAGToDAG.cpp
   MipsISelLowering.cpp

diff --git a/lib/Target/Mips/Disassembler/LLVMBuild.txt b/lib/Target/Mips/Disassembler/LLVMBuild.txt
index bb70fd3..414b4f7 100644
--- a/lib/Target/Mips/Disassembler/LLVMBuild.txt
+++ b/lib/Target/Mips/Disassembler/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = MipsDisassembler
 parent = Mips
-required_libraries = MC MipsInfo Support
+required_libraries = MCDisassembler MipsInfo Support
 add_to_library_groups = Mips

diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 902b877..48904ce 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp

@@ -20,7 +20,6 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
@@ -31,15 +30,14 @@
 
 namespace {
 
-/// MipsDisassemblerBase - a disasembler class for Mips.
+/// A disasembler class for Mips.
 class MipsDisassemblerBase : public MCDisassembler {
 public:
-  /// Constructor     - Initializes the disassembler.
-  ///
   MipsDisassemblerBase(const MCSubtargetInfo &STI, MCContext &Ctx,
-                       bool bigEndian) :
-    MCDisassembler(STI, Ctx),
-    IsN64(STI.getFeatureBits() & Mips::FeatureN64), isBigEndian(bigEndian) {}
+                       bool IsBigEndian)
+      : MCDisassembler(STI, Ctx),
+        IsN64(STI.getFeatureBits() & Mips::FeatureN64),
+        IsBigEndian(IsBigEndian) {}
 
   virtual ~MipsDisassemblerBase() {}
 
@@ -48,15 +46,13 @@
 private:
   bool IsN64;
 protected:
-  bool isBigEndian;
+  bool IsBigEndian;
 };
 
-/// MipsDisassembler - a disasembler class for Mips32.
+/// A disasembler class for Mips32.
 class MipsDisassembler : public MipsDisassemblerBase {
   bool IsMicroMips;
 public:
-  /// Constructor     - Initializes the disassembler.
-  ///
   MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool bigEndian)
       : MipsDisassemblerBase(STI, Ctx, bigEndian) {
     IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
@@ -75,32 +71,23 @@
     return !hasMips32() && !hasMips3();
   }
 
-  /// getInstruction - See MCDisassembler.
-  DecodeStatus getInstruction(MCInst &instr,
-                              uint64_t &size,
-                              const MemoryObject &region,
-                              uint64_t address,
-                              raw_ostream &vStream,
-                              raw_ostream &cStream) const override;
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
 };
 
-
-/// Mips64Disassembler - a disasembler class for Mips64.
+/// A disasembler class for Mips64.
 class Mips64Disassembler : public MipsDisassemblerBase {
 public:
-  /// Constructor     - Initializes the disassembler.
-  ///
   Mips64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
                      bool bigEndian) :
     MipsDisassemblerBase(STI, Ctx, bigEndian) {}
 
-  /// getInstruction - See MCDisassembler.
-  DecodeStatus getInstruction(MCInst &instr,
-                              uint64_t &size,
-                              const MemoryObject &region,
-                              uint64_t address,
-                              raw_ostream &vStream,
-                              raw_ostream &cStream) const override;
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
 };
 
 } // end anonymous namespace
@@ -117,6 +104,11 @@
                                                  uint64_t Address,
                                                  const void *Decoder);
 
+static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
 static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
                                              unsigned RegNo,
                                              uint64_t Address,
@@ -142,11 +134,6 @@
                                              uint64_t Address,
                                              const void *Decoder);
 
-static DecodeStatus DecodeFGRH32RegisterClass(MCInst &Inst,
-                                              unsigned RegNo,
-                                              uint64_t Address,
-                                              const void *Decoder);
-
 static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
                                            unsigned RegNo,
                                            uint64_t Address,
@@ -255,6 +242,11 @@
                               uint64_t Address,
                               const void *Decoder);
 
+static DecodeStatus DecodeCacheOp(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder);
+
 static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
                                     uint64_t Address, const void *Decoder);
 
@@ -272,6 +264,14 @@
                                uint64_t Address,
                                const void *Decoder);
 
+static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder);
+
 static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
                                        unsigned Insn,
                                        uint64_t Address,
@@ -341,6 +341,10 @@
 DecodeBlezGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
                        const void *Decoder);
 
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
 namespace llvm {
 extern Target TheMipselTarget, TheMipsTarget, TheMips64Target,
               TheMips64elTarget;
@@ -456,7 +460,7 @@
 
   InsnType Rs = fieldFromInstruction(insn, 21, 5);
   InsnType Rt = fieldFromInstruction(insn, 16, 5);
-  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
   bool HasRs = false;
 
   if (Rs >= Rt) {
@@ -495,7 +499,7 @@
 
   InsnType Rs = fieldFromInstruction(insn, 21, 5);
   InsnType Rt = fieldFromInstruction(insn, 16, 5);
-  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
   bool HasRs = false;
 
   if (Rs >= Rt) {
@@ -535,7 +539,7 @@
 
   InsnType Rs = fieldFromInstruction(insn, 21, 5);
   InsnType Rt = fieldFromInstruction(insn, 16, 5);
-  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
   bool HasRs = false;
 
   if (Rt == 0)
@@ -580,7 +584,7 @@
 
   InsnType Rs = fieldFromInstruction(insn, 21, 5);
   InsnType Rt = fieldFromInstruction(insn, 16, 5);
-  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
 
   if (Rt == 0)
     return MCDisassembler::Fail;
@@ -622,7 +626,7 @@
 
   InsnType Rs = fieldFromInstruction(insn, 21, 5);
   InsnType Rt = fieldFromInstruction(insn, 16, 5);
-  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
   bool HasRs = false;
   bool HasRt = false;
 
@@ -671,7 +675,7 @@
 
   InsnType Rs = fieldFromInstruction(insn, 21, 5);
   InsnType Rt = fieldFromInstruction(insn, 16, 5);
-  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) * 4;
   bool HasRs = false;
 
   if (Rt == 0)
@@ -696,43 +700,31 @@
   return MCDisassembler::Success;
 }
 
-  /// readInstruction - read four bytes from the MemoryObject
-  /// and return 32 bit word sorted according to the given endianess
-static DecodeStatus readInstruction32(const MemoryObject &region,
-                                      uint64_t address,
-                                      uint64_t &size,
-                                      uint32_t &insn,
-                                      bool isBigEndian,
-                                      bool IsMicroMips) {
-  uint8_t Bytes[4];
-
+/// Read four bytes from the ArrayRef and return 32 bit word sorted
+/// according to the given endianess
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn,
+                                      bool IsBigEndian, bool IsMicroMips) {
   // We want to read exactly 4 Bytes of data.
-  if (region.readBytes(address, 4, Bytes) == -1) {
-    size = 0;
+  if (Bytes.size() < 4) {
+    Size = 0;
     return MCDisassembler::Fail;
   }
 
-  if (isBigEndian) {
+  if (IsBigEndian) {
     // Encoded as a big-endian 32-bit word in the stream.
-    insn = (Bytes[3] <<  0) |
-           (Bytes[2] <<  8) |
-           (Bytes[1] << 16) |
-           (Bytes[0] << 24);
-  }
-  else {
+    Insn =
+        (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) | (Bytes[0] << 24);
+  } else {
     // Encoded as a small-endian 32-bit word in the stream.
     // Little-endian byte ordering:
     //   mips32r2:   4 | 3 | 2 | 1
     //   microMIPS:  2 | 1 | 4 | 3
     if (IsMicroMips) {
-      insn = (Bytes[2] <<  0) |
-             (Bytes[3] <<  8) |
-             (Bytes[0] << 16) |
+      Insn = (Bytes[2] << 0) | (Bytes[3] << 8) | (Bytes[0] << 16) |
              (Bytes[1] << 24);
     } else {
-      insn = (Bytes[0] <<  0) |
-             (Bytes[1] <<  8) |
-             (Bytes[2] << 16) |
+      Insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
              (Bytes[3] << 24);
     }
   }
@@ -740,24 +732,22 @@
   return MCDisassembler::Success;
 }
 
-DecodeStatus
-MipsDisassembler::getInstruction(MCInst &instr,
-                                 uint64_t &Size,
-                                 const MemoryObject &Region,
-                                 uint64_t Address,
-                                 raw_ostream &vStream,
-                                 raw_ostream &cStream) const {
+DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                              ArrayRef<uint8_t> Bytes,
+                                              uint64_t Address,
+                                              raw_ostream &VStream,
+                                              raw_ostream &CStream) const {
   uint32_t Insn;
 
-  DecodeStatus Result = readInstruction32(Region, Address, Size,
-                                          Insn, isBigEndian, IsMicroMips);
+  DecodeStatus Result =
+      readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, IsMicroMips);
   if (Result == MCDisassembler::Fail)
     return MCDisassembler::Fail;
 
   if (IsMicroMips) {
     DEBUG(dbgs() << "Trying MicroMips32 table (32-bit opcodes):\n");
     // Calling the auto-generated decoder function.
-    Result = decodeInstruction(DecoderTableMicroMips32, instr, Insn, Address,
+    Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address,
                                this, STI);
     if (Result != MCDisassembler::Fail) {
       Size = 4;
@@ -769,7 +759,7 @@
   if (hasCOP3()) {
     DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
     Result =
-        decodeInstruction(DecoderTableCOP3_32, instr, Insn, Address, this, STI);
+        decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
     if (Result != MCDisassembler::Fail) {
       Size = 4;
       return Result;
@@ -778,7 +768,7 @@
 
   if (hasMips32r6() && isGP64()) {
     DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, instr, Insn,
+    Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
                                Address, this, STI);
     if (Result != MCDisassembler::Fail) {
       Size = 4;
@@ -788,7 +778,7 @@
 
   if (hasMips32r6()) {
     DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
-    Result = decodeInstruction(DecoderTableMips32r6_64r632, instr, Insn,
+    Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
                                Address, this, STI);
     if (Result != MCDisassembler::Fail) {
       Size = 4;
@@ -798,8 +788,8 @@
 
   DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
   // Calling the auto-generated decoder function.
-  Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
-                             this, STI);
+  Result =
+      decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
   if (Result != MCDisassembler::Fail) {
     Size = 4;
     return Result;
@@ -808,30 +798,28 @@
   return MCDisassembler::Fail;
 }
 
-DecodeStatus
-Mips64Disassembler::getInstruction(MCInst &instr,
-                                   uint64_t &Size,
-                                   const MemoryObject &Region,
-                                   uint64_t Address,
-                                   raw_ostream &vStream,
-                                   raw_ostream &cStream) const {
+DecodeStatus Mips64Disassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                                ArrayRef<uint8_t> Bytes,
+                                                uint64_t Address,
+                                                raw_ostream &VStream,
+                                                raw_ostream &CStream) const {
   uint32_t Insn;
 
-  DecodeStatus Result = readInstruction32(Region, Address, Size,
-                                          Insn, isBigEndian, false);
+  DecodeStatus Result =
+      readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
   if (Result == MCDisassembler::Fail)
     return MCDisassembler::Fail;
 
   // Calling the auto-generated decoder function.
-  Result = decodeInstruction(DecoderTableMips6432, instr, Insn, Address,
-                             this, STI);
+  Result =
+      decodeInstruction(DecoderTableMips6432, Instr, Insn, Address, this, STI);
   if (Result != MCDisassembler::Fail) {
     Size = 4;
     return Result;
   }
   // If we fail to decode in Mips64 decoder space we can try in Mips32
-  Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
-                             this, STI);
+  Result =
+      decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
   if (Result != MCDisassembler::Fail) {
     Size = 4;
     return Result;
@@ -862,6 +850,13 @@
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeGPRMM16RegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return MCDisassembler::Fail;
+}
+
 static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
                                              unsigned RegNo,
                                              uint64_t Address,
@@ -914,18 +909,6 @@
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeFGRH32RegisterClass(MCInst &Inst,
-                                              unsigned RegNo,
-                                              uint64_t Address,
-                                              const void *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  unsigned Reg = getReg(Decoder, Mips::FGRH32RegClassID, RegNo);
-  Inst.addOperand(MCOperand::CreateReg(Reg));
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
                                            unsigned RegNo,
                                            uint64_t Address,
@@ -981,6 +964,23 @@
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeCacheOp(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Hint = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+  Inst.addOperand(MCOperand::CreateImm(Hint));
+
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
                                     uint64_t Address, const void *Decoder) {
   int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10));
@@ -1012,15 +1012,15 @@
     break;
   case Mips::LD_H:
   case Mips::ST_H:
-    Inst.addOperand(MCOperand::CreateImm(Offset << 1));
+    Inst.addOperand(MCOperand::CreateImm(Offset * 2));
     break;
   case Mips::LD_W:
   case Mips::ST_W:
-    Inst.addOperand(MCOperand::CreateImm(Offset << 2));
+    Inst.addOperand(MCOperand::CreateImm(Offset * 4));
     break;
   case Mips::LD_D:
   case Mips::ST_D:
-    Inst.addOperand(MCOperand::CreateImm(Offset << 3));
+    Inst.addOperand(MCOperand::CreateImm(Offset * 8));
     break;
   }
 
@@ -1038,12 +1038,23 @@
   Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
   Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
 
-  if (Inst.getOpcode() == Mips::SC_MM)
+  switch (Inst.getOpcode()) {
+  case Mips::SWM32_MM:
+  case Mips::LWM32_MM:
+    if (DecodeRegListOperand(Inst, Insn, Address, Decoder)
+        == MCDisassembler::Fail)
+      return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::CreateReg(Base));
+    Inst.addOperand(MCOperand::CreateImm(Offset));
+    break;
+  case Mips::SC_MM:
     Inst.addOperand(MCOperand::CreateReg(Reg));
-
-  Inst.addOperand(MCOperand::CreateReg(Reg));
-  Inst.addOperand(MCOperand::CreateReg(Base));
-  Inst.addOperand(MCOperand::CreateImm(Offset));
+    // fallthrough
+  default:
+    Inst.addOperand(MCOperand::CreateReg(Reg));
+    Inst.addOperand(MCOperand::CreateReg(Base));
+    Inst.addOperand(MCOperand::CreateImm(Offset));
+  }
 
   return MCDisassembler::Success;
 }
@@ -1084,6 +1095,42 @@
   return MCDisassembler::Success;
 }
 
+// Decode Insn into three operands: the COP2-class register selected by bits
+// 20-16, the GPR32 base register selected by bits 25-21, and the sign-extended
+// low 16 bits as an immediate offset. Always reports success.
+static DecodeStatus DecodeFMem2(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                const void *Decoder) {
+  const unsigned RegIdx = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseIdx = fieldFromInstruction(Insn, 21, 5);
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+
+  const unsigned DataReg = getReg(Decoder, Mips::COP2RegClassID, RegIdx);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseIdx);
+
+  Inst.addOperand(MCOperand::CreateReg(DataReg));
+  Inst.addOperand(MCOperand::CreateReg(BaseReg));
+  Inst.addOperand(MCOperand::CreateImm(Disp));
+  return MCDisassembler::Success;
+}
+
+// Decode Insn into three operands: the COP3-class register selected by bits
+// 20-16, the GPR32 base register selected by bits 25-21, and the sign-extended
+// low 16 bits as an immediate offset. Always reports success.
+static DecodeStatus DecodeFMem3(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                const void *Decoder) {
+  const unsigned RegIdx = fieldFromInstruction(Insn, 16, 5);
+  const unsigned BaseIdx = fieldFromInstruction(Insn, 21, 5);
+  const int Disp = SignExtend32<16>(Insn & 0xffff);
+
+  const unsigned DataReg = getReg(Decoder, Mips::COP3RegClassID, RegIdx);
+  const unsigned BaseReg = getReg(Decoder, Mips::GPR32RegClassID, BaseIdx);
+
+  Inst.addOperand(MCOperand::CreateReg(DataReg));
+  Inst.addOperand(MCOperand::CreateReg(BaseReg));
+  Inst.addOperand(MCOperand::CreateImm(Disp));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeSpecial3LlSc(MCInst &Inst,
                                        unsigned Insn,
                                        uint64_t Address,
@@ -1242,7 +1289,7 @@
                                        unsigned Offset,
                                        uint64_t Address,
                                        const void *Decoder) {
-  int32_t BranchOffset = (SignExtend32<16>(Offset) << 2) + 4;
+  int32_t BranchOffset = (SignExtend32<16>(Offset) * 4) + 4;
   Inst.addOperand(MCOperand::CreateImm(BranchOffset));
   return MCDisassembler::Success;
 }
@@ -1261,7 +1308,7 @@
                                          unsigned Offset,
                                          uint64_t Address,
                                          const void *Decoder) {
-  int32_t BranchOffset = SignExtend32<21>(Offset) << 2;
+  int32_t BranchOffset = SignExtend32<21>(Offset) * 4;
 
   Inst.addOperand(MCOperand::CreateImm(BranchOffset));
   return MCDisassembler::Success;
@@ -1271,7 +1318,7 @@
                                          unsigned Offset,
                                          uint64_t Address,
                                          const void *Decoder) {
-  int32_t BranchOffset = SignExtend32<26>(Offset) << 2;
+  int32_t BranchOffset = SignExtend32<26>(Offset) * 4;
 
   Inst.addOperand(MCOperand::CreateImm(BranchOffset));
   return MCDisassembler::Success;
@@ -1281,7 +1328,7 @@
                                          unsigned Offset,
                                          uint64_t Address,
                                          const void *Decoder) {
-  int32_t BranchOffset = SignExtend32<16>(Offset) << 1;
+  int32_t BranchOffset = SignExtend32<16>(Offset) * 2;
   Inst.addOperand(MCOperand::CreateImm(BranchOffset));
   return MCDisassembler::Success;
 }
@@ -1334,12 +1381,35 @@
 
 static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
                                      uint64_t Address, const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(SignExtend32<19>(Insn) << 2));
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<19>(Insn) * 4));
   return MCDisassembler::Success;
 }
 
 static DecodeStatus DecodeSimm18Lsl3(MCInst &Inst, unsigned Insn,
                                      uint64_t Address, const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(SignExtend32<18>(Insn) << 3));
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<18>(Insn) * 8));
+  return MCDisassembler::Success;
+}
+
+// Decode the 5-bit register list in Insn bits 25-21: the low four bits count
+// the registers taken in order from Regs, and bit 4 appends RA after them.
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5,
+                     Mips::S6, Mips::FP};
+  const unsigned NumRegs = sizeof(Regs) / sizeof(Regs[0]);
+  unsigned RegLst = fieldFromInstruction(Insn, 21, 5);
+  unsigned RegNum = RegLst & 0xf;
+  // Empty register lists are not allowed, and a count larger than the table
+  // would index past the end of Regs, so neither can be decoded.
+  if (RegLst == 0 || RegNum > NumRegs)
+    return MCDisassembler::Fail;
+
+  for (unsigned i = 0; i < RegNum; i++)
+    Inst.addOperand(MCOperand::CreateReg(Regs[i]));
+  if (RegLst & 0x10)
+    Inst.addOperand(MCOperand::CreateReg(Mips::RA));
+
   return MCDisassembler::Success;
 }

diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 8c79751..ab6b225 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp

@@ -225,6 +225,18 @@
   // Load/Store memory operands -- imm($reg)
   // If PIC target the target is loaded as the
   // pattern lw $25,%call16($28)
+
+  // opNum can be invalid if instruction had reglist as operand.
+  // MemOperand is always last operand of instruction (base + offset).
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::SWM32_MM:
+  case Mips::LWM32_MM:
+    opNum = MI->getNumOperands() - 2;
+    break;
+  }
+
   printOperand(MI, opNum+1, O);
   O << "(";
   printOperand(MI, opNum, O);
@@ -324,3 +336,13 @@
   }
 }
 
+// Print the register operands starting at opNum, separated by ", ", stopping
+// before the last two operands (the trailing base + offset memory operand).
+void MipsInstPrinter::
+printRegisterList(const MCInst *MI, int opNum, raw_ostream &O) {
+  const int ListEnd = MI->getNumOperands() - 2;
+  for (int Idx = opNum; Idx != ListEnd; ++Idx) {
+    O << (Idx == opNum ? "" : ", ");
+    printRegName(O, MI->getOperand(Idx).getReg());
+  }
+}

diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 550a0f1..42df013 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSINSTPRINTER_H
-#define MIPSINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H
+#define LLVM_LIB_TARGET_MIPS_INSTPRINTER_MIPSINSTPRINTER_H
 #include "llvm/MC/MCInstPrinter.h"
 
 namespace llvm {
@@ -107,6 +107,7 @@
                   unsigned OpNo1, raw_ostream &OS);
   bool printAlias(const MCInst &MI, raw_ostream &OS);
   void printSaveRestore(const MCInst *MI, raw_ostream &O);
+  void printRegisterList(const MCInst *MI, int opNum, raw_ostream &O);
 };
 } // end namespace llvm
 

diff --git a/lib/Target/Mips/LLVMBuild.txt b/lib/Target/Mips/LLVMBuild.txt
index e6d3a42..0e8d902 100644
--- a/lib/Target/Mips/LLVMBuild.txt
+++ b/lib/Target/Mips/LLVMBuild.txt

@@ -31,5 +31,5 @@
 type = Library
 name = MipsCodeGen
 parent = Mips
-required_libraries = Analysis AsmPrinter CodeGen Core MC MipsAsmPrinter MipsDesc MipsInfo Scalar SelectionDAG Support Target
+required_libraries = Analysis AsmPrinter CodeGen Core MC MipsAsmPrinter MipsDesc MipsInfo SelectionDAG Support Target
 add_to_library_groups = Mips

diff --git a/lib/Target/Mips/MCTargetDesc/Android.mk b/lib/Target/Mips/MCTargetDesc/Android.mk
index c8d18fc..89e132d 100644
--- a/lib/Target/Mips/MCTargetDesc/Android.mk
+++ b/lib/Target/Mips/MCTargetDesc/Android.mk

@@ -15,6 +15,7 @@
   MipsMCCodeEmitter.cpp \
   MipsMCExpr.cpp \
   MipsMCTargetDesc.cpp \
+  MipsOptionRecord.cpp \
   MipsNaClELFStreamer.cpp \
   MipsTargetStreamer.cpp
 

diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index c14ee35..6b3788c 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt

@@ -8,5 +8,6 @@
   MipsMCExpr.cpp
   MipsMCTargetDesc.cpp
   MipsNaClELFStreamer.cpp
+  MipsOptionRecord.cpp
   MipsTargetStreamer.cpp
   )

diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
index 52d5dd3..5b0f950 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp

@@ -41,6 +41,12 @@
   }
 }
 
+// Report AFL_REG_32 when the FP ABI is XX; otherwise the stored CPR1Size.
+uint8_t MipsABIFlagsSection::getCPR1SizeValue() {
+  const bool IsFpXX = FpABI == FpABIKind::XX;
+  return IsFpXX ? (uint8_t)AFL_REG_32 : (uint8_t)CPR1Size;
+}
+
 namespace llvm {
 MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
   // Write out a Elf_Internal_ABIFlags_v0 struct

diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
index ab18c44..8bcfb0f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSABIFLAGSSECTION_H
-#define MIPSABIFLAGSSECTION_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
 
 #include "llvm/MC/MCStreamer.h"
 
@@ -115,7 +115,7 @@
   uint8_t getISALevelValue() { return (uint8_t)ISALevel; }
   uint8_t getISARevisionValue() { return (uint8_t)ISARevision; }
   uint8_t getGPRSizeValue() { return (uint8_t)GPRSize; }
-  uint8_t getCPR1SizeValue() { return (uint8_t)CPR1Size; }
+  uint8_t getCPR1SizeValue();
   uint8_t getCPR2SizeValue() { return (uint8_t)CPR2Size; }
   uint8_t getFpABIValue();
   uint32_t getISAExtensionSetValue() { return (uint32_t)ISAExtensionSet; }
@@ -181,7 +181,7 @@
 
   template <class PredicateLibrary>
   void setCPR1SizeFromPredicates(const PredicateLibrary &P) {
-    if (P.mipsSEUsesSoftFloat())
+    if (P.abiUsesSoftFloat())
       CPR1Size = AFL_REG_NONE;
     else if (P.hasMSA())
       CPR1Size = AFL_REG_128;
@@ -212,10 +212,10 @@
     if (P.isABI_N32() || P.isABI_N64())
       FpABI = FpABIKind::S64;
     else if (P.isABI_O32()) {
-      if (P.isFP64bit())
-        FpABI = FpABIKind::S64;
-      else if (P.isABI_FPXX())
+      if (P.isABI_FPXX())
         FpABI = FpABIKind::XX;
+      else if (P.isFP64bit())
+        FpABI = FpABIKind::S64;
       else
         FpABI = FpABIKind::S32;
     }
@@ -228,6 +228,7 @@
     setCPR1SizeFromPredicates(P);
     setASESetFromPredicates(P);
     setFpAbiFromPredicates(P);
+    OddSPReg = P.useOddSPReg();
   }
 };
 

diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index d8e6128..efeb54d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp

@@ -367,7 +367,12 @@
   // Check for a less than instruction size number of bytes
   // FIXME: 16 bit instructions are not handled yet here.
   // We shouldn't be using a hard coded number for instruction size.
-  if (Count % 4) return false;
+
+  // If the count is not 4-byte aligned, we must be writing data into the text
+  // section (otherwise we have unaligned instructions, and thus have far
+  // bigger problems), so just write zeros instead.
+  for (uint64_t i = 0, e = Count % 4; i != e; ++i)
+    OW->Write8(0);
 
   uint64_t NumNops = Count / 4;
   for (uint64_t i = 0; i != NumNops; ++i)

diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index d5c3dbc..d4f4983 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h

@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 //
 
-#ifndef MIPSASMBACKEND_H
-#define MIPSASMBACKEND_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSASMBACKEND_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSASMBACKEND_H
 
 #include "MCTargetDesc/MipsFixupKinds.h"
 #include "llvm/MC/MCAsmBackend.h"

diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index d2323dc..ff7779e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h

@@ -11,8 +11,8 @@
 // the Mips target useful for the compiler back-end and the MC libraries.
 //
 //===----------------------------------------------------------------------===//
-#ifndef MIPSBASEINFO_H
-#define MIPSBASEINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSBASEINFO_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSBASEINFO_H
 
 #include "MipsFixupKinds.h"
 #include "MipsMCTargetDesc.h"

diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 49ac256..4ea7846 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp

@@ -30,7 +30,8 @@
 
     unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                           bool IsPCRel) const override;
-    bool needsRelocateWithSymbol(unsigned Type) const override;
+    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+                                 unsigned Type) const override;
   };
 }
 
@@ -216,7 +217,8 @@
 }
 
 bool
-MipsELFObjectWriter::needsRelocateWithSymbol(unsigned Type) const {
+MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+                                             unsigned Type) const {
   // FIXME: This is extremelly conservative. This really needs to use a
   // whitelist with a clear explanation for why each realocation needs to
   // point to the symbol, not to the section.

diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index fe37829..18c4a20 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp

@@ -8,12 +8,72 @@
 //===----------------------------------------------------------------------===//
 
 #include "MipsELFStreamer.h"
+#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+// Emit Inst via the base ELF streamer, then mark every register operand as
+// used in RegInfoRecord. When the target streamer reports microMIPS mode, each
+// pending label is flagged STO_MIPS_MICROMIPS; Labels is then cleared.
+void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
+                                      const MCSubtargetInfo &STI) {
+  MCELFStreamer::EmitInstruction(Inst, STI);
+
+  const MCRegisterInfo *RegInfo = getContext().getRegisterInfo();
+  auto *TargetStreamer =
+      static_cast<MipsTargetELFStreamer *>(getTargetStreamer());
+
+  for (unsigned Idx = 0, End = Inst.getNumOperands(); Idx != End; ++Idx) {
+    const MCOperand &Operand = Inst.getOperand(Idx);
+    if (Operand.isReg())
+      RegInfoRecord->SetPhysRegUsed(Operand.getReg(), RegInfo);
+  }
+
+  if (!TargetStreamer->isMicroMipsEnabled()) {
+    Labels.clear();
+    return;
+  }
+
+  for (MCSymbol *Label : Labels) {
+    // The "other" values are stored in the last 6 bits of the second byte.
+    // The traditional defines for STO values assume the full byte and thus
+    // the shift to pack it.
+    MCELF::setOther(getOrCreateSymbolData(Label),
+                    ELF::STO_MIPS_MICROMIPS >> 2);
+  }
+  Labels.clear();
+}
+
+void MipsELFStreamer::EmitLabel(MCSymbol *Symbol) {
+  MCELFStreamer::EmitLabel(Symbol);
+  Labels.push_back(Symbol); // Pending until EmitInstruction consumes Labels.
+}
+
+void MipsELFStreamer::SwitchSection(const MCSection * Section,
+                                    const MCExpr *Subsection) {
+  MCELFStreamer::SwitchSection(Section, Subsection);
+  Labels.clear(); // Labels seen before the switch are no longer pending.
+}
+
+void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+                                    const SMLoc &Loc) {
+  MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+  Labels.clear(); // Labels preceding emitted data are no longer pending.
+}
+
+void MipsELFStreamer::EmitMipsOptionRecords() {
+  for (const auto &I : MipsOptionRecords) // Visit each stored option record.
+    I->EmitMipsOptionRecord();
+}
 
 namespace llvm {
 MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
                                      raw_ostream &OS, MCCodeEmitter *Emitter,
-                                     const MCSubtargetInfo &STI, bool RelaxAll,
-                                     bool NoExecStack) {
+                                     const MCSubtargetInfo &STI,
+                                     bool RelaxAll) {
   return new MipsELFStreamer(Context, MAB, OS, Emitter, STI);
 }
 }

diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index 641f8cf..136146b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h

@@ -12,11 +12,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSELFSTREAMER_H
-#define MIPSELFSTREAMER_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
 
+#include "MipsOptionRecord.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCELFStreamer.h"
-#include "llvm/Support/raw_ostream.h"
+#include <memory>
 
 namespace llvm {
 class MCAsmBackend;
@@ -25,18 +27,48 @@
 class MCSubtargetInfo;
 
 class MipsELFStreamer : public MCELFStreamer {
+  SmallVector<std::unique_ptr<MipsOptionRecord>, 8> MipsOptionRecords;
+  MipsRegInfoRecord *RegInfoRecord;
+  SmallVector<MCSymbol*, 4> Labels;
+
 
 public:
   MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS,
                   MCCodeEmitter *Emitter, const MCSubtargetInfo &STI)
-      : MCELFStreamer(Context, MAB, OS, Emitter) {}
+      : MCELFStreamer(Context, MAB, OS, Emitter) {
 
-  virtual ~MipsELFStreamer() {}
+    RegInfoRecord = new MipsRegInfoRecord(this, Context, STI);
+    MipsOptionRecords.push_back(
+        std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
+  }
+
+  /// Overriding this function allows us to add arbitrary behaviour before the
+  /// \p Inst is actually emitted. For example, we can inspect the operands and
+  /// gather sufficient information that allows us to reason about the register
+  /// usage for the translation unit.
+  void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+
+  /// Overriding this function allows us to record all labels that should be
+  /// marked as microMIPS. Based on this data marking is done in
+  /// EmitInstruction.
+  void EmitLabel(MCSymbol *Symbol) override;
+
+  /// Overriding this function allows us to dismiss all labels that are
+  /// candidates for marking as microMIPS when .section directive is processed.
+  void SwitchSection(const MCSection *Section,
+                     const MCExpr *Subsection = nullptr) override;
+
+  /// Overriding this function allows us to dismiss all labels that are
+  /// candidates for marking as microMIPS when .word directive is emitted.
+  void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                     const SMLoc &Loc) override;
+
+  /// Emits all the option records stored up until the point it's called.
+  void EmitMipsOptionRecords();
 };
 
 MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
                                      raw_ostream &OS, MCCodeEmitter *Emitter,
-                                     const MCSubtargetInfo &STI, bool RelaxAll,
-                                     bool NoExecStack);
+                                     const MCSubtargetInfo &STI, bool RelaxAll);
 } // namespace llvm.
 #endif

diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 05080f0..317db16 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_MIPS_MIPSFIXUPKINDS_H
-#define LLVM_MIPS_MIPSFIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSFIXUPKINDS_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSFIXUPKINDS_H
 
 #include "llvm/MC/MCFixup.h"
 
@@ -199,4 +199,4 @@
 } // namespace llvm
 
 
-#endif // LLVM_MIPS_MIPSFIXUPKINDS_H
+#endif

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index e415412..2f5d196 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp

@@ -41,6 +41,5 @@
   UseAssignmentForEHBegin = true;
   SupportsDebugInformation = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
-  HasLEB128 = true;
   DwarfRegNumForCFI = true;
 }

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 37ba0c4..59ff1c4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSTARGETASMINFO_H
-#define MIPSTARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoELF.h"
 

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 43fc521..d632c27 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp

@@ -345,6 +345,67 @@
 }
 
 unsigned MipsMCCodeEmitter::
+getUImm5Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  // Immediate operands must be a multiple of 4 and are encoded with the low
+  // two bits dropped; any other operand must be an expression and encodes as 0.
+  const MCOperand &Operand = MI.getOperand(OpNo);
+  if (!Operand.isImm()) {
+    assert(Operand.isExpr() &&
+           "getUImm5Lsl2Encoding expects only expressions or an immediate");
+    return 0;
+  }
+
+  // The operand holds the field value shifted left by two; undo that shift.
+  unsigned Value = getMachineOpValue(MI, Operand, Fixups, STI);
+  assert((Value & 3) == 0);
+  return Value >> 2;
+}
+
+// Encode an immediate operand as its value, narrowed to int, shifted right by
+// two; non-immediate operands encode as 0.
+unsigned MipsMCCodeEmitter::
+getSImm3Lsa2Value(const MCInst &MI, unsigned OpNo,
+                  SmallVectorImpl<MCFixup> &Fixups,
+                  const MCSubtargetInfo &STI) const {
+  const MCOperand &Operand = MI.getOperand(OpNo);
+  if (!Operand.isImm())
+    return 0;
+
+  const int Imm = Operand.getImm();
+  return Imm >> 2;
+}
+
+// Encode an immediate operand as its value, narrowed to unsigned, shifted
+// right by two; non-immediate operands encode as 0.
+unsigned MipsMCCodeEmitter::
+getUImm6Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &Operand = MI.getOperand(OpNo);
+  if (!Operand.isImm())
+    return 0;
+
+  const unsigned Imm = Operand.getImm();
+  return Imm >> 2;
+}
+
+// Encode the addiusp immediate: drop the low two bits, then pack bit 15 of the
+// result into bit 8 above its low eight bits. Non-immediates encode as 0.
+unsigned MipsMCCodeEmitter::
+getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     const MCSubtargetInfo &STI) const {
+  const MCOperand &Operand = MI.getOperand(OpNo);
+  if (!Operand.isImm())
+    return 0;
+
+  const unsigned Scaled = (Operand.getImm() >> 2) & 0x0000ffff;
+  return ((Scaled & 0x8000) >> 7) | (Scaled & 0x00ff);
+}
+
+unsigned MipsMCCodeEmitter::
 getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups,
                const MCSubtargetInfo &STI) const {
   int64_t Res;
@@ -577,6 +638,17 @@
 getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
                       SmallVectorImpl<MCFixup> &Fixups,
                       const MCSubtargetInfo &STI) const {
+  // opNum can be invalid if instruction had reglist as operand.
+  // MemOperand is always last operand of instruction (base + offset).
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case Mips::SWM32_MM:
+  case Mips::LWM32_MM:
+    OpNo = MI.getNumOperands() - 2;
+    break;
+  }
+
   // Base register is encoded in bits 20-16, offset is encoded in bits 11-0.
   assert(MI.getOperand(OpNo).isReg());
   unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) << 16;
@@ -659,4 +731,61 @@
   return 0;
 }
 
+// Encode an immediate operand as its value modulo 8.
+unsigned
+MipsMCCodeEmitter::getUImm3Mod8Encoding(const MCInst &MI, unsigned OpNo,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isImm());
+  return MI.getOperand(OpNo).getImm() % 8;
+}
+
+unsigned
+MipsMCCodeEmitter::getUImm4AndValue(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isImm());
+  const MCOperand &MO = MI.getOperand(OpNo);
+  unsigned Value = MO.getImm();
+  switch (Value) { // Map each supported immediate to its 4-bit encoding.
+    case 128:   return 0x0;
+    case 1:     return 0x1;
+    case 2:     return 0x2;
+    case 3:     return 0x3;
+    case 4:     return 0x4;
+    case 7:     return 0x5;
+    case 8:     return 0x6;
+    case 15:    return 0x7;
+    case 16:    return 0x8;
+    case 31:    return 0x9;
+    case 32:    return 0xa;
+    case 63:    return 0xb;
+    case 64:    return 0xc;
+    case 255:   return 0xd;
+    case 32768: return 0xe;
+    case 65535: return 0xf;
+  }
+  llvm_unreachable("Unexpected value"); // No other immediate has an encoding.
+}
+
+// Encode a register list: count the listed registers whose encoding value is
+// not 31, and set bit 4 (0x10) when a register with encoding 31 is present.
+unsigned
+MipsMCCodeEmitter::getRegisterListOpValue(const MCInst &MI, unsigned OpNo,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  // The list starts at OpNo; the final two operands form the memory operand
+  // (register + imm) and are not part of the list.
+  const unsigned ListEnd = MI.getNumOperands() - 2;
+  unsigned Encoding = 0;
+  for (unsigned Idx = OpNo; Idx < ListEnd; ++Idx) {
+    const unsigned Reg = MI.getOperand(Idx).getReg();
+    if (Ctx.getRegisterInfo()->getEncodingValue(Reg) == 31)
+      Encoding |= 0x10;
+    else
+      ++Encoding;
+  }
+  return Encoding;
+}
+
 #include "MipsGenMCCodeEmitter.inc"

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 304167f..9016fcf 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h

@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 //
 
-#ifndef MIPS_MC_CODE_EMITTER_H
-#define MIPS_MC_CODE_EMITTER_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
 
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/Support/DataTypes.h"
@@ -60,7 +60,7 @@
                                  SmallVectorImpl<MCFixup> &Fixups,
                                  const MCSubtargetInfo &STI) const;
 
-  // getBranchJumpOpValue - Return binary encoding of the jump
+  // getJumpTargetOpValue - Return binary encoding of the jump
   // target operand. If the machine operand requires relocation,
   // record the relocation and return zero.
   unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
@@ -74,6 +74,26 @@
                                   SmallVectorImpl<MCFixup> &Fixups,
                                   const MCSubtargetInfo &STI) const;
 
+  // getUImm5Lsl2Encoding - Return binary encoding of an immediate operand
+  // that is a multiple of 4 (the value is encoded shifted right by two).
+  unsigned getUImm5Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  unsigned getSImm3Lsa2Value(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getUImm6Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  // getSImm9AddiuspValue - Return binary encoding of the microMIPS addiusp
+  // instruction immediate operand.
+  unsigned getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
   // getBranchTargetOpValue - Return binary encoding of the branch
   // target operand. If the machine operand requires relocation,
   // record the relocation and return zero.
@@ -145,9 +165,19 @@
                                  SmallVectorImpl<MCFixup> &Fixups,
                                  const MCSubtargetInfo &STI) const;
 
+  unsigned getUImm3Mod8Encoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  unsigned getUImm4AndValue(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+
   unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
                           const MCSubtargetInfo &STI) const;
 
+  unsigned getRegisterListOpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
 }; // class MipsMCCodeEmitter
 } // namespace llvm.
 

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 5bba3e5..74490f3 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp

@@ -80,8 +80,9 @@
 
 bool
 MipsMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
-                                      const MCAsmLayout *Layout) const {
-  return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
+                                      const MCAsmLayout *Layout,
+                                      const MCFixup *Fixup) const {
+  return getSubExpr()->EvaluateAsRelocatable(Res, Layout, Fixup);
 }
 
 void MipsMCExpr::visitUsedExpr(MCStreamer &Streamer) const {

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index f193dc9..2b8f0c8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSMCEXPR_H
-#define MIPSMCEXPR_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
 
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCExpr.h"
@@ -48,7 +48,8 @@
 
   void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const override;
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
   void visitUsedExpr(MCStreamer &Streamer) const override;
   const MCSection *FindAssociatedSection() const override {
     return getSubExpr()->FindAssociatedSection();

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
index 01d5363..e756b47 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSMCNACL_H
-#define MIPSMCNACL_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H
 
 #include "llvm/MC/MCELFStreamer.h"
 
@@ -26,8 +26,7 @@
                                          raw_ostream &OS,
                                          MCCodeEmitter *Emitter,
                                          const MCSubtargetInfo &STI,
-                                         bool RelaxAll, bool NoExecStack);
-
+                                         bool RelaxAll);
 }
 
 #endif

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index d2b929b..bab4254 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp

@@ -109,15 +109,12 @@
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCContext &Context, MCAsmBackend &MAB,
                                     raw_ostream &OS, MCCodeEmitter *Emitter,
-                                    const MCSubtargetInfo &STI,
-                                    bool RelaxAll, bool NoExecStack) {
+                                    const MCSubtargetInfo &STI, bool RelaxAll) {
   MCStreamer *S;
   if (!Triple(TT).isOSNaCl())
-    S = createMipsELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll,
-                              NoExecStack);
+    S = createMipsELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll);
   else
-    S = createMipsNaClELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll,
-                                  NoExecStack);
+    S = createMipsNaClELFStreamer(Context, MAB, OS, Emitter, STI, RelaxAll);
   new MipsTargetELFStreamer(*S, STI);
   return S;
 }

diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 161d1ea..f08a8f4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSMCTARGETDESC_H
-#define MIPSMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCTARGETDESC_H
+#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCTARGETDESC_H
 
 #include "llvm/Support/DataTypes.h"
 

diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index 6cde8f9..92b8455 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp

@@ -255,13 +255,11 @@
                                          raw_ostream &OS,
                                          MCCodeEmitter *Emitter,
                                          const MCSubtargetInfo &STI,
-                                         bool RelaxAll, bool NoExecStack) {
+                                         bool RelaxAll) {
   MipsNaClELFStreamer *S = new MipsNaClELFStreamer(Context, TAB, OS, Emitter,
                                                    STI);
   if (RelaxAll)
     S->getAssembler().setRelaxAll(true);
-  if (NoExecStack)
-    S->getAssembler().setNoExecStack(true);
 
   // Set bundle-alignment as required by the NaCl ABI for the target.
   S->EmitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN);

diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
new file mode 100644
index 0000000..0ef2208
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp

@@ -0,0 +1,92 @@
+//===-- MipsOptionRecord.cpp - Abstraction for storing information --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsOptionRecord.h"
+#include "MipsELFStreamer.h"
+#include "llvm/MC/MCSectionELF.h"
+
+using namespace llvm;
+
+/// Emit the register-usage record into its dedicated ELF section.
+///
+/// For N64 this writes a single ODK_REGINFO descriptor into .MIPS.options;
+/// for every other ABI it writes a .reginfo section. The streamer's current
+/// section is pushed on entry and popped on exit.
+void MipsRegInfoRecord::EmitMipsOptionRecord() {
+  MCAssembler &MCA = Streamer->getAssembler();
+  uint64_t Features = STI.getFeatureBits();
+
+  Streamer->PushSection();
+
+  // N64 is distinguished from the other ABIs because, at the moment,
+  // .MIPS.options is only emitted for N64 objects.
+  // Since .reginfo carries the same information as the ODK_REGINFO record of
+  // .MIPS.options, one abstraction (MipsRegInfoRecord) handles both.
+  if (Features & Mips::FeatureN64) {
+    // The EntrySize value of 1 seems strange since the records are neither
+    // 1-byte long nor fixed length but it matches the value GAS emits.
+    const MCSectionELF *Sec =
+        Context.getELFSection(".MIPS.options", ELF::SHT_MIPS_OPTIONS,
+                              ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP,
+                              SectionKind::getMetadata(), 1, "");
+    MCA.getOrCreateSectionData(*Sec).setAlignment(8);
+    Streamer->SwitchSection(Sec);
+
+    // 8-byte descriptor header followed by a 32-byte payload: 40 bytes total,
+    // which is the value written to the size field.
+    Streamer->EmitIntValue(1, 1);  // kind (ODK_REGINFO)
+    Streamer->EmitIntValue(40, 1); // size
+    Streamer->EmitIntValue(0, 2);  // section
+    Streamer->EmitIntValue(0, 4);  // info
+    Streamer->EmitIntValue(ri_gprmask, 4);
+    Streamer->EmitIntValue(0, 4); // pad
+    Streamer->EmitIntValue(ri_cprmask[0], 4);
+    Streamer->EmitIntValue(ri_cprmask[1], 4);
+    Streamer->EmitIntValue(ri_cprmask[2], 4);
+    Streamer->EmitIntValue(ri_cprmask[3], 4);
+    Streamer->EmitIntValue(ri_gp_value, 8);
+  } else {
+    const MCSectionELF *Sec =
+        Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC,
+                              SectionKind::getMetadata(), 24, "");
+    MCA.getOrCreateSectionData(*Sec)
+        .setAlignment(Features & Mips::FeatureN32 ? 8 : 4);
+    Streamer->SwitchSection(Sec);
+
+    // Six 32-bit words, matching the 24-byte EntrySize requested above.
+    Streamer->EmitIntValue(ri_gprmask, 4);
+    Streamer->EmitIntValue(ri_cprmask[0], 4);
+    Streamer->EmitIntValue(ri_cprmask[1], 4);
+    Streamer->EmitIntValue(ri_cprmask[2], 4);
+    Streamer->EmitIntValue(ri_cprmask[3], 4);
+    // Only the low 32 bits of the gp value are written here, so anything
+    // wider would be silently truncated.
+    assert((ri_gp_value & 0xffffffff) == ri_gp_value &&
+           "ri_gp_value does not fit the 32-bit .reginfo field");
+    Streamer->EmitIntValue(ri_gp_value, 4);
+  }
+
+  Streamer->PopSection();
+}
+
+/// Mark \p Reg and its sub-registers as used in the register-info masks.
+///
+/// Each register visited by the iterator contributes the bit
+/// (1 << encoding value) to the mask of the register class that contains it:
+/// ri_gprmask for GPR32/GPR64, ri_cprmask[1] for FGR32/FGR64/AFGR64/MSA128B,
+/// ri_cprmask[2] for COP2 and ri_cprmask[3] for COP3. Registers in none of
+/// these classes are ignored.
+///
+/// \param Reg        Physical register to record.
+/// \param MCRegInfo  Register info used to enumerate sub-registers and to
+///                   look up encoding values.
+void MipsRegInfoRecord::SetPhysRegUsed(unsigned Reg,
+                                       const MCRegisterInfo *MCRegInfo) {
+  // The third iterator argument requests that Reg itself is visited too.
+  for (MCSubRegIterator SubRegIt(Reg, MCRegInfo, true); SubRegIt.isValid();
+       ++SubRegIt) {
+    unsigned CurrentSubReg = *SubRegIt;
+
+    // Build the bit from an unsigned literal so that encoding 31 does not
+    // shift into the sign bit of an int. The bit is scoped to this
+    // sub-register so that it only ever lands in the mask of the class that
+    // actually contains CurrentSubReg.
+    unsigned EncVal = MCRegInfo->getEncodingValue(CurrentSubReg);
+    unsigned Bit = 1u << EncVal;
+
+    if (GPR32RegClass->contains(CurrentSubReg) ||
+        GPR64RegClass->contains(CurrentSubReg))
+      ri_gprmask |= Bit;
+    else if (FGR32RegClass->contains(CurrentSubReg) ||
+             FGR64RegClass->contains(CurrentSubReg) ||
+             AFGR64RegClass->contains(CurrentSubReg) ||
+             MSA128BRegClass->contains(CurrentSubReg))
+      ri_cprmask[1] |= Bit;
+    else if (COP2RegClass->contains(CurrentSubReg))
+      ri_cprmask[2] |= Bit;
+    else if (COP3RegClass->contains(CurrentSubReg))
+      ri_cprmask[3] |= Bit;
+  }
+}

diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index fbe375b..1e092f2 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp

@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstPrinter/MipsInstPrinter.h"
+#include "MipsELFStreamer.h"
 #include "MipsMCTargetDesc.h"
 #include "MipsTargetObjectFile.h"
 #include "MipsTargetStreamer.h"
@@ -28,17 +29,21 @@
 using namespace llvm;
 
 MipsTargetStreamer::MipsTargetStreamer(MCStreamer &S)
-    : MCTargetStreamer(S), canHaveModuleDirective(true) {}
+    : MCTargetStreamer(S), ModuleDirectiveAllowed(true) {
+  GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
+}
 void MipsTargetStreamer::emitDirectiveSetMicroMips() {}
 void MipsTargetStreamer::emitDirectiveSetNoMicroMips() {}
 void MipsTargetStreamer::emitDirectiveSetMips16() {}
-void MipsTargetStreamer::emitDirectiveSetNoMips16() {}
-void MipsTargetStreamer::emitDirectiveSetReorder() {}
+void MipsTargetStreamer::emitDirectiveSetNoMips16() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetReorder() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveSetNoReorder() {}
-void MipsTargetStreamer::emitDirectiveSetMacro() {}
-void MipsTargetStreamer::emitDirectiveSetNoMacro() {}
-void MipsTargetStreamer::emitDirectiveSetAt() {}
-void MipsTargetStreamer::emitDirectiveSetNoAt() {}
+void MipsTargetStreamer::emitDirectiveSetMacro() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoMacro() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMsa() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoMsa() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetAt() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoAt() { forbidModuleDirective(); }
 void MipsTargetStreamer::emitDirectiveEnd(StringRef Name) {}
 void MipsTargetStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {}
 void MipsTargetStreamer::emitDirectiveAbiCalls() {}
@@ -51,11 +56,26 @@
 void MipsTargetStreamer::emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) {}
 void MipsTargetStreamer::emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) {
 }
-void MipsTargetStreamer::emitDirectiveSetMips32R2() {}
-void MipsTargetStreamer::emitDirectiveSetMips64() {}
-void MipsTargetStreamer::emitDirectiveSetMips64R2() {}
-void MipsTargetStreamer::emitDirectiveSetDsp() {}
-void MipsTargetStreamer::emitDirectiveCpload(unsigned RegNo) {}
+void MipsTargetStreamer::emitDirectiveSetArch(StringRef Arch) {
+  forbidModuleDirective();
+}
+void MipsTargetStreamer::emitDirectiveSetMips0() {}
+void MipsTargetStreamer::emitDirectiveSetMips1() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips2() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips3() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips4() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips5() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32R2() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips32R6() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64R2() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips64R6() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetPop() {}
+void MipsTargetStreamer::emitDirectiveSetPush() {}
+void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {}
 void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
                                               const MCSymbol &Sym, bool IsReg) {
 }
@@ -71,52 +91,62 @@
 
 void MipsTargetAsmStreamer::emitDirectiveSetMicroMips() {
   OS << "\t.set\tmicromips\n";
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetNoMicroMips() {
   OS << "\t.set\tnomicromips\n";
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetMips16() {
   OS << "\t.set\tmips16\n";
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetNoMips16() {
   OS << "\t.set\tnomips16\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetNoMips16();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetReorder() {
   OS << "\t.set\treorder\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetReorder();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetNoReorder() {
   OS << "\t.set\tnoreorder\n";
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetMacro() {
   OS << "\t.set\tmacro\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetMacro();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetNoMacro() {
   OS << "\t.set\tnomacro\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetNoMacro();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMsa() {
+  OS << "\t.set\tmsa\n";
+  MipsTargetStreamer::emitDirectiveSetMsa();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMsa() {
+  OS << "\t.set\tnomsa\n";
+  MipsTargetStreamer::emitDirectiveSetNoMsa();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetAt() {
   OS << "\t.set\tat\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetAt();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetNoAt() {
   OS << "\t.set\tnoat\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetNoAt();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveEnd(StringRef Name) {
@@ -151,25 +181,82 @@
      << StringRef(MipsInstPrinter::getRegisterName(ReturnReg)).lower() << '\n';
 }
 
+void MipsTargetAsmStreamer::emitDirectiveSetArch(StringRef Arch) {
+  OS << "\t.set arch=" << Arch << "\n";
+  MipsTargetStreamer::emitDirectiveSetArch(Arch);
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips0() { OS << "\t.set\tmips0\n"; }
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips1() {
+  OS << "\t.set\tmips1\n";
+  MipsTargetStreamer::emitDirectiveSetMips1();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips2() {
+  OS << "\t.set\tmips2\n";
+  MipsTargetStreamer::emitDirectiveSetMips2();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips3() {
+  OS << "\t.set\tmips3\n";
+  MipsTargetStreamer::emitDirectiveSetMips3();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips4() {
+  OS << "\t.set\tmips4\n";
+  MipsTargetStreamer::emitDirectiveSetMips4();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips5() {
+  OS << "\t.set\tmips5\n";
+  MipsTargetStreamer::emitDirectiveSetMips5();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32() {
+  OS << "\t.set\tmips32\n";
+  MipsTargetStreamer::emitDirectiveSetMips32();
+}
+
 void MipsTargetAsmStreamer::emitDirectiveSetMips32R2() {
   OS << "\t.set\tmips32r2\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetMips32R2();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips32R6() {
+  OS << "\t.set\tmips32r6\n";
+  MipsTargetStreamer::emitDirectiveSetMips32R6();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetMips64() {
   OS << "\t.set\tmips64\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetMips64();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetMips64R2() {
   OS << "\t.set\tmips64r2\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetMips64R2();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetMips64R6() {
+  OS << "\t.set\tmips64r6\n";
+  MipsTargetStreamer::emitDirectiveSetMips64R6();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetDsp() {
   OS << "\t.set\tdsp\n";
-  setCanHaveModuleDir(false);
+  MipsTargetStreamer::emitDirectiveSetDsp();
 }
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoDsp() {
+  OS << "\t.set\tnodsp\n";
+  MipsTargetStreamer::emitDirectiveSetNoDsp();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetPop() { OS << "\t.set\tpop\n"; }
+
+void MipsTargetAsmStreamer::emitDirectiveSetPush() { OS << "\t.set\tpush\n"; }
+
 // Print a 32 bit hex number with all numbers.
 static void printHex32(unsigned Value, raw_ostream &OS) {
   OS << "0x";
@@ -191,10 +278,10 @@
   OS << "," << FPUTopSavedRegOff << '\n';
 }
 
-void MipsTargetAsmStreamer::emitDirectiveCpload(unsigned RegNo) {
+void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   OS << "\t.cpload\t$"
      << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n";
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
@@ -213,7 +300,7 @@
   OS << ", ";
 
   OS << Sym.getName() << "\n";
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetAsmStreamer::emitDirectiveModuleFP(
@@ -281,27 +368,29 @@
   else
     EFlags |= ELF::EF_MIPS_ARCH_1;
 
-  if (T.isArch64Bit()) {
-    if (Features & Mips::FeatureN32)
-      EFlags |= ELF::EF_MIPS_ABI2;
-    else if (Features & Mips::FeatureO32) {
-      EFlags |= ELF::EF_MIPS_ABI_O32;
-      EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */
-    }
-    // No need to set any bit for N64 which is the default ABI at the moment
-    // for 64-bit Mips architectures.
-  } else {
-    if (Features & Mips::FeatureMips64r2 || Features & Mips::FeatureMips64)
-      EFlags |= ELF::EF_MIPS_32BITMODE;
-
-    // ABI
+  // ABI
+  // N64 does not require any ABI bits.
+  if (Features & Mips::FeatureO32)
     EFlags |= ELF::EF_MIPS_ABI_O32;
-  }
+  else if (Features & Mips::FeatureN32)
+    EFlags |= ELF::EF_MIPS_ABI2;
+
+  if (Features & Mips::FeatureGP64Bit) {
+    if (Features & Mips::FeatureO32)
+      EFlags |= ELF::EF_MIPS_32BITMODE; /* Compatibility Mode */
+  } else if (Features & Mips::FeatureMips64r2 || Features & Mips::FeatureMips64)
+    EFlags |= ELF::EF_MIPS_32BITMODE;
 
   // Other options.
   if (Features & Mips::FeatureNaN2008)
     EFlags |= ELF::EF_MIPS_NAN2008;
 
+  // -mabicalls and -mplt are not implemented but we should act as if they were
+  // given.
+  EFlags |= ELF::EF_MIPS_CPIC;
+  if (Features & Mips::FeatureN64)
+    EFlags |= ELF::EF_MIPS_PIC;
+
   MCA.setELFHeaderEFlags(EFlags);
 }
 
@@ -321,41 +410,26 @@
 
 void MipsTargetELFStreamer::finish() {
   MCAssembler &MCA = getStreamer().getAssembler();
-  MCContext &Context = MCA.getContext();
-  MCStreamer &OS = getStreamer();
-  Triple T(STI.getTargetTriple());
-  uint64_t Features = STI.getFeatureBits();
+  const MCObjectFileInfo &OFI = *MCA.getContext().getObjectFileInfo();
 
-  if (T.isArch64Bit() && (Features & Mips::FeatureN64)) {
-    const MCSectionELF *Sec = Context.getELFSection(
-        ".MIPS.options", ELF::SHT_MIPS_OPTIONS,
-        ELF::SHF_ALLOC | ELF::SHF_MIPS_NOSTRIP, SectionKind::getMetadata());
-    OS.SwitchSection(Sec);
+  // .bss, .text and .data are always at least 16-byte aligned.
+  MCSectionData &TextSectionData =
+      MCA.getOrCreateSectionData(*OFI.getTextSection());
+  MCSectionData &DataSectionData =
+      MCA.getOrCreateSectionData(*OFI.getDataSection());
+  MCSectionData &BSSSectionData =
+      MCA.getOrCreateSectionData(*OFI.getBSSSection());
 
-    OS.EmitIntValue(1, 1);  // kind
-    OS.EmitIntValue(40, 1); // size
-    OS.EmitIntValue(0, 2);  // section
-    OS.EmitIntValue(0, 4);  // info
-    OS.EmitIntValue(0, 4);  // ri_gprmask
-    OS.EmitIntValue(0, 4);  // pad
-    OS.EmitIntValue(0, 4);  // ri_cpr[0]mask
-    OS.EmitIntValue(0, 4);  // ri_cpr[1]mask
-    OS.EmitIntValue(0, 4);  // ri_cpr[2]mask
-    OS.EmitIntValue(0, 4);  // ri_cpr[3]mask
-    OS.EmitIntValue(0, 8);  // ri_gp_value
-  } else {
-    const MCSectionELF *Sec =
-        Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO, ELF::SHF_ALLOC,
-                              SectionKind::getMetadata());
-    OS.SwitchSection(Sec);
+  TextSectionData.setAlignment(std::max(16u, TextSectionData.getAlignment()));
+  DataSectionData.setAlignment(std::max(16u, DataSectionData.getAlignment()));
+  BSSSectionData.setAlignment(std::max(16u, BSSSectionData.getAlignment()));
 
-    OS.EmitIntValue(0, 4); // ri_gprmask
-    OS.EmitIntValue(0, 4); // ri_cpr[0]mask
-    OS.EmitIntValue(0, 4); // ri_cpr[1]mask
-    OS.EmitIntValue(0, 4); // ri_cpr[2]mask
-    OS.EmitIntValue(0, 4); // ri_cpr[3]mask
-    OS.EmitIntValue(0, 4); // ri_gp_value
-  }
+  // Emit all the option records.
+  // At the moment we only emit .MIPS.options (ODK_REGINFO) and
+  // .reginfo.
+  MipsELFStreamer &MEF = static_cast<MipsELFStreamer &>(Streamer);
+  MEF.EmitMipsOptionRecords();
+
   emitMipsAbiFlags();
 }
 
@@ -390,11 +464,12 @@
   unsigned Flags = MCA.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_MICROMIPS;
   MCA.setELFHeaderEFlags(Flags);
+  forbidModuleDirective();
 }
 
 void MipsTargetELFStreamer::emitDirectiveSetNoMicroMips() {
   MicroMipsEnabled = false;
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetELFStreamer::emitDirectiveSetMips16() {
@@ -402,17 +477,7 @@
   unsigned Flags = MCA.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_ARCH_ASE_M16;
   MCA.setELFHeaderEFlags(Flags);
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveSetNoMips16() {
-  // FIXME: implement.
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveSetReorder() {
-  // FIXME: implement.
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetELFStreamer::emitDirectiveSetNoReorder() {
@@ -420,35 +485,49 @@
   unsigned Flags = MCA.getELFHeaderEFlags();
   Flags |= ELF::EF_MIPS_NOREORDER;
   MCA.setELFHeaderEFlags(Flags);
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveSetMacro() {
-  // FIXME: implement.
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveSetNoMacro() {
-  // FIXME: implement.
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveSetAt() {
-  // FIXME: implement.
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveSetNoAt() {
-  // FIXME: implement.
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
-  // FIXME: implement.
+  // Handle `.end Name`: append one procedure descriptor to the .pdr section,
+  // then clear the per-procedure state gathered from .mask/.fmask/.frame.
+  // The descriptor is nine 32-bit words: a reference to Name followed by the
+  // GPR, FPR and frame information (zero for any group that was not set).
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCContext &Context = MCA.getContext();
+  MCStreamer &OS = getStreamer();
+
+  // Only section *flags* belong in the third argument. ELF::SHT_REL is a
+  // section *type* (value 9); OR-ing it in here set SHF_WRITE plus an
+  // undefined flag bit on .pdr, so it is dropped.
+  const MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS,
+                                                  ELF::SHF_ALLOC,
+                                                  SectionKind::getMetadata());
+
+  const MCSymbolRefExpr *ExprRef =
+      MCSymbolRefExpr::Create(Name, MCSymbolRefExpr::VK_None, Context);
+
+  MCSectionData &SecData = MCA.getOrCreateSectionData(*Sec);
+  SecData.setAlignment(4);
+
+  // Save the current section so it can be restored once the record is out.
+  OS.PushSection();
+
+  OS.SwitchSection(Sec);
+
+  OS.EmitValueImpl(ExprRef, 4);
+
+  OS.EmitIntValue(GPRInfoSet ? GPRBitMask : 0, 4); // reg_mask
+  OS.EmitIntValue(GPRInfoSet ? GPROffset : 0, 4);  // reg_offset
+
+  OS.EmitIntValue(FPRInfoSet ? FPRBitMask : 0, 4); // fpreg_mask
+  OS.EmitIntValue(FPRInfoSet ? FPROffset : 0, 4);  // fpreg_offset
+
+  OS.EmitIntValue(FrameInfoSet ? FrameOffset : 0, 4); // frame_offset
+  OS.EmitIntValue(FrameInfoSet ? FrameReg : 0, 4);    // frame_reg
+  OS.EmitIntValue(FrameInfoSet ? ReturnReg : 0, 4);   // return_reg
+
+  // The .end directive marks the end of a procedure. Invalidate
+  // the information gathered up until this point.
+  GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
+
+  OS.PopSection();
 }
 
 void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
-  // FIXME: implement.
+  GPRInfoSet = FPRInfoSet = FrameInfoSet = false;
 }
 
 void MipsTargetELFStreamer::emitDirectiveAbiCalls() {
@@ -494,37 +573,31 @@
 }
 
 void MipsTargetELFStreamer::emitFrame(unsigned StackReg, unsigned StackSize,
-                                      unsigned ReturnReg) {
-  // FIXME: implement.
+                                      unsigned ReturnReg_) {
+  MCContext &Context = getStreamer().getAssembler().getContext();
+  const MCRegisterInfo *RegInfo = Context.getRegisterInfo();
+
+  FrameInfoSet = true;
+  FrameReg = RegInfo->getEncodingValue(StackReg);
+  FrameOffset = StackSize;
+  ReturnReg = RegInfo->getEncodingValue(ReturnReg_);
 }
 
 void MipsTargetELFStreamer::emitMask(unsigned CPUBitmask,
                                      int CPUTopSavedRegOff) {
-  // FIXME: implement.
+  GPRInfoSet = true;
+  GPRBitMask = CPUBitmask;
+  GPROffset = CPUTopSavedRegOff;
 }
 
 void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask,
                                       int FPUTopSavedRegOff) {
-  // FIXME: implement.
+  FPRInfoSet = true;
+  FPRBitMask = FPUBitmask;
+  FPROffset = FPUTopSavedRegOff;
 }
 
-void MipsTargetELFStreamer::emitDirectiveSetMips32R2() {
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveSetMips64() {
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveSetMips64R2() {
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveSetDsp() {
-  setCanHaveModuleDir(false);
-}
-
-void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) {
+void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   // .cpload $reg
   // This directive expands to:
   // lui   $gp, %hi(_gp_disp)
@@ -572,7 +645,7 @@
   TmpInst.addOperand(MCOperand::CreateReg(RegNo));
   getStreamer().EmitInstruction(TmpInst, STI);
 
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
@@ -629,7 +702,7 @@
   Inst.addOperand(MCOperand::CreateReg(RegNo));
   getStreamer().EmitInstruction(Inst, STI);
 
-  setCanHaveModuleDir(false);
+  forbidModuleDirective();
 }
 
 void MipsTargetELFStreamer::emitMipsAbiFlags() {
@@ -638,7 +711,7 @@
   MCStreamer &OS = getStreamer();
   const MCSectionELF *Sec =
       Context.getELFSection(".MIPS.abiflags", ELF::SHT_MIPS_ABIFLAGS,
-                            ELF::SHF_ALLOC, SectionKind::getMetadata());
+                            ELF::SHF_ALLOC, SectionKind::getMetadata(), 24, "");
   MCSectionData &ABIShndxSD = MCA.getOrCreateSectionData(*Sec);
   ABIShndxSD.setAlignment(8);
   OS.SwitchSection(Sec);

diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile
index 41efa47..56db450 100644
--- a/lib/Target/Mips/Makefile
+++ b/lib/Target/Mips/Makefile

@@ -13,7 +13,7 @@
 
 # Make sure that tblgen is run, first thing.
 BUILT_SOURCES = MipsGenRegisterInfo.inc MipsGenInstrInfo.inc \
-                MipsGenAsmWriter.inc MipsGenFastISel.inc MipsGenCodeEmitter.inc \
+                MipsGenAsmWriter.inc MipsGenFastISel.inc \
                 MipsGenDAGISel.inc MipsGenCallingConv.inc \
                 MipsGenSubtargetInfo.inc MipsGenMCCodeEmitter.inc \
                 MipsGenDisassemblerTables.inc \

diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
index b93017a..fae7059 100644
--- a/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td

@@ -123,10 +123,10 @@
                              II_MFC1, bitconvert>, MFC1_FM_MM<0x80>;
 def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
                              II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>;
-def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>,
-               MFC1_FM_MM<3>, ISA_MIPS32R2;
-def MTHC1_MM : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>,
-               MFC1_FM_MM<7>, ISA_MIPS32R2;
+def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+               MFC1_FM_MM<0xc0>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>;
+def MTHC1_MM : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+               MFC1_FM_MM<0xe0>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>;
 
 def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
                 MADDS_FM_MM<0x1>;

diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index 15b951d..59bf949 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td

@@ -41,6 +41,95 @@
 // MicroMIPS 16-bit Instruction Formats
 //===----------------------------------------------------------------------===//
 
+class ARITH_FM_MM16<bit funct> {
+  bits<3> rd;
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x01;
+  let Inst{9-7}   = rd;
+  let Inst{6-4}   = rt;
+  let Inst{3-1}   = rs;
+  let Inst{0}     = funct;
+}
+
+class ANDI_FM_MM16<bits<6> funct> {
+  bits<3> rd;
+  bits<3> rs;
+  bits<4> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = funct;
+  let Inst{9-7}   = rd;
+  let Inst{6-4}   = rs;
+  let Inst{3-0}   = imm;
+}
+
+class LOGIC_FM_MM16<bits<4> funct> {
+  bits<3> rt;
+  bits<3> rs;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-6}   = funct;
+  let Inst{5-3}   = rt;
+  let Inst{2-0}   = rs;
+}
+
+class SHIFT_FM_MM16<bits<1> funct> {
+  bits<3> rd;
+  bits<3> rt;
+  bits<3> shamt;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x09;
+  let Inst{9-7}   = rd;
+  let Inst{6-4}   = rt;
+  let Inst{3-1}   = shamt;
+  let Inst{0}     = funct;
+}
+
+class ADDIUR2_FM_MM16 {
+  bits<3> rd;
+  bits<3> rs;
+  bits<3> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x1b;
+  let Inst{9-7}   = rd;
+  let Inst{6-4}   = rs;
+  let Inst{3-1}   = imm;
+  let Inst{0}     = 0;
+}
+
+class ADDIUS5_FM_MM16 {
+  bits<5> rd;
+  bits<4> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x13;
+  let Inst{9-5}   = rd;
+  let Inst{4-1}   = imm;
+  let Inst{0}     = 0;
+}
+
+class ADDIUSP_FM_MM16 {
+  bits<9> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x13;
+  let Inst{9-1}   = imm;
+  let Inst{0}     = 1;
+}
+
 class MOVE_FM_MM16<bits<6> funct> {
   bits<5> rs;
   bits<5> rd;
@@ -52,6 +141,17 @@
   let Inst{4-0}   = rs;
 }
 
+class LI_FM_MM16 {
+  bits<3> rd;
+  bits<7> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x3b;
+  let Inst{9-7}   = rd;
+  let Inst{6-0}   = imm;
+}
+
 class JALR_FM_MM16<bits<5> op> {
   bits<5> rs;
 
@@ -72,6 +172,29 @@
   let Inst{4-0}   = rd;
 }
 
+// 16-bit format: fixed value 0x11 in Inst{15-10}, the sub-opcode `op` in
+// Inst{9-5} and a 5-bit immediate in Inst{4-0}.
+// NOTE(review): `rs` is declared but never assigned into Inst by this class;
+// confirm whether anything relies on the field before removing it.
+class JRADDIUSP_FM_MM16<bits<5> op> {
+  bits<5> rs;
+  bits<5> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x11;
+  let Inst{9-5}   = op;
+  let Inst{4-0}   = imm;
+}
+
+class ADDIUR1SP_FM_MM16 {
+  bits<3> rd;
+  bits<6> imm;
+
+  bits<16> Inst;
+
+  let Inst{15-10} = 0x1b;
+  let Inst{9-7}   = rd;
+  let Inst{6-1}   = imm;
+  let Inst{0}     = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // MicroMIPS 32-bit Instruction Formats
 //===----------------------------------------------------------------------===//
@@ -621,3 +744,76 @@
   let Inst{10-6}  = fr;
   let Inst{5-0}   = funct;
 }
+
+class COMPACT_BRANCH_FM_MM<bits<5> funct> { // 32-bit encoding: 0x10 | funct(5) | rs(5) | offset(16)
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10; // fixed value in the top six bits
+  let Inst{25-21} = funct; // selector supplied by the instantiating def
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class COP0_TLB_FM_MM<bits<10> op> : MMArch { // 32-bit encoding with no operand fields
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = 0x0; // together with the line above, the upper 16 bits are all zero
+  let Inst{15-6}  = op; // 10-bit selector supplied by the instantiating def
+  let Inst{5-0}   = 0x3c; // fixed value in the low six bits
+}
+
+class SDBBP_FM_MM : MMArch { // 32-bit encoding: 0 | code_(10) | 0x36d | 0x3c
+  bits<10> code_; // the only operand field of this format
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_;
+  let Inst{15-6}  = 0x36d; // fixed 10-bit value
+  let Inst{5-0}   = 0x3c; // fixed value in the low six bits
+}
+
+class RDHWR_FM_MM : MMArch { // 32-bit encoding: 0 | rt(5) | rd(5) | 0x1ac | 0x3c
+  bits<5> rt;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rd;
+  let Inst{15-6}  = 0x1ac; // fixed 10-bit value
+  let Inst{5-0}   = 0x3c; // fixed value in the low six bits
+}
+
+class LWXS_FM_MM<bits<10> funct> { // 32-bit encoding: 0 | index(5) | base(5) | rd(5) | 0 | funct(10)
+  bits<5> rd;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-21} = index; // note: index is encoded above base, rd below both
+  let Inst{20-16} = base;
+  let Inst{15-11} = rd;
+  let Inst{10}    = 0; // single fixed zero bit between rd and funct
+  let Inst{9-0}   = funct; // selector supplied by the instantiating def
+}
+
+class LWM_FM_MM<bits<4> funct> : MMArch { // 32-bit encoding: 0x8 | rt(5) | addr{20-16} | funct(4) | addr{11-0}
+  bits<5> rt;
+  bits<21> addr; // only bits 20-16 and 11-0 are encoded; addr{15-12} is not placed into Inst
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x8; // fixed value in the top six bits
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16}; // presumably the base register part of the address -- TODO confirm
+  let Inst{15-12} = funct; // selector supplied by the instantiating def
+  let Inst{11-0}  = addr{11-0}; // presumably the 12-bit offset part of the address -- TODO confirm
+}

diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 87a3a3e..e854620 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td

@@ -1,9 +1,51 @@
 def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddrMM", [frameindex]>;
 
+def simm4 : Operand<i32>;
+def simm7 : Operand<i32>;
+
 def simm12 : Operand<i32> {
   let DecoderMethod = "DecodeSimm12";
 }
 
+def uimm5_lsl2 : Operand<OtherVT> { // NOTE(review): the only operand here typed OtherVT; the rest use i32
+  let EncoderMethod = "getUImm5Lsl2Encoding"; // encoding is delegated to this named method
+}
+
+def uimm6_lsl2 : Operand<i32> { // used by AddImmUR1SP
+  let EncoderMethod = "getUImm6Lsl2Encoding";
+}
+
+def simm9_addiusp : Operand<i32> { // used by AddImmUSP
+  let EncoderMethod = "getSImm9AddiuspValue";
+}
+
+def uimm3_shift : Operand<i32> { // passed to ShiftIMM16 by SLL16_MM / SRL16_MM
+  let EncoderMethod = "getUImm3Mod8Encoding";
+}
+
+def simm3_lsa2 : Operand<i32> { // used by AddImmUR2
+  let EncoderMethod = "getSImm3Lsa2Value";
+}
+
+def uimm4_andi : Operand<i32> { // used by AndImmMM16
+  let EncoderMethod = "getUImm4AndValue";
+}
+
+def immSExtAddiur2 : ImmLeaf<i32, [{return Imm == 1 || Imm == -1 ||
+                                           ((Imm % 4 == 0) &&
+                                            Imm < 28 && Imm > 0);}]>; // accepts 1, -1, 4, 8, 12, 16, 20, 24
+
+def immSExtAddius5 : ImmLeaf<i32, [{return Imm >= -8 && Imm <= 7;}]>; // accepts -8..7 inclusive
+
+def immZExtAndi16 : ImmLeaf<i32,
+  [{return (Imm == 128 || (Imm >= 1 && Imm <= 4) || Imm == 7 || Imm == 8 ||
+            Imm == 15 || Imm == 16 || Imm == 31 || Imm == 32 || Imm == 63 ||
+            Imm == 64 || Imm == 255 || Imm == 32768 || Imm == 65535 );}]>; // fixed whitelist of 16 mask values
+
+def immZExt2Shift : ImmLeaf<i32, [{return Imm >= 1 && Imm <= 8;}]>; // accepts 1..8 inclusive
+
+def immLi16 : ImmLeaf<i32, [{return Imm >= -1 && Imm <= 126;}]>; // accepts -1..126 inclusive
+
 def mem_mm_12 : Operand<i32> {
   let PrintMethod = "printMemOperand";
   let MIOperandInfo = (ops GPR32, simm12);
@@ -26,6 +68,16 @@
   let DecoderMethod = "DecodeBranchTargetMM";
 }
 
+class CompactBranchMM<string opstr, DAGOperand opnd, PatFrag cond_op, // cond_op is accepted but not referenced below
+                      RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, opnd:$offset),
+         !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> { // empty selection-pattern list
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 0; // explicitly no delay slot
+  let Defs = [AT]; // instruction is modelled as writing AT
+}
+
 let canFoldAsLoad = 1 in
 class LoadLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
                       Operand MemOpnd> :
@@ -70,6 +122,61 @@
   let mayLoad = 1;
 }
 
+class ArithRMM16<string opstr, RegisterOperand RO, bit isComm = 0, // three-register 16-bit arithmetic: rd = OpNode(rs, rt)
+                 InstrItinClass Itin = NoItinerary,
+                 SDPatternOperator OpNode = null_frag> :
+  MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+                  !strconcat(opstr, "\t$rd, $rs, $rt"),
+                  [(set RO:$rd, (OpNode RO:$rs, RO:$rt))], Itin, FrmR> {
+  let isCommutable = isComm; // caller decides commutativity (ADDU16_MM passes 1, SUBU16_MM passes 0)
+}
+
+class AndImmMM16<string opstr, RegisterOperand RO, // 16-bit register/immediate form using the uimm4_andi operand
+                 InstrItinClass Itin = NoItinerary> :
+  MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, uimm4_andi:$imm),
+                  !strconcat(opstr, "\t$rd, $rs, $imm"), [], Itin, FrmI>; // no inline pattern
+
+class LogicRMM16<string opstr, RegisterOperand RO, // two-operand 16-bit logic op: result overwrites $rt
+                 InstrItinClass Itin = NoItinerary,
+                 SDPatternOperator OpNode = null_frag> :
+  MicroMipsInst16<(outs RO:$dst), (ins RO:$rs, RO:$rt),
+         !strconcat(opstr, "\t$rt, $rs"), // only two registers appear in the assembly string
+         [(set RO:$dst, (OpNode RO:$rs, RO:$rt))], Itin, FrmR> {
+  let isCommutable = 1; // unconditionally commutable, unlike ArithRMM16
+  let Constraints = "$rt = $dst"; // ties the output to the $rt input
+}
+
+class NotMM16<string opstr, RegisterOperand RO> : // 16-bit bitwise NOT: rt = not rs
+  MicroMipsInst16<(outs RO:$rt), (ins RO:$rs),
+         !strconcat(opstr, "\t$rt, $rs"),
+         [(set RO:$rt, (not RO:$rs))], NoItinerary, FrmR>; // pattern is hard-wired to 'not'
+
+class ShiftIMM16<string opstr, Operand ImmOpnd, RegisterOperand RO, // 16-bit shift by immediate; ImmOpnd picks the operand type
+                 InstrItinClass Itin = NoItinerary> :
+  MicroMipsInst16<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
+                  !strconcat(opstr, "\t$rd, $rt, $shamt"), [], Itin, FrmR>; // no inline pattern
+
+class AddImmUR2<string opstr, RegisterOperand RO> : // 16-bit add-immediate: rd, rs, simm3_lsa2 immediate
+  MicroMipsInst16<(outs RO:$rd), (ins RO:$rs, simm3_lsa2:$imm),
+                  !strconcat(opstr, "\t$rd, $rs, $imm"),
+                  [], NoItinerary, FrmR> {
+  let isCommutable = 1; // NOTE(review): set although the inputs are a register and an immediate -- confirm intent
+}
+
+class AddImmUS5<string opstr, RegisterOperand RO> : // 16-bit add-immediate that updates $rd in place
+  MicroMipsInst16<(outs RO:$dst), (ins RO:$rd, simm4:$imm),
+                  !strconcat(opstr, "\t$rd, $imm"), [], NoItinerary, FrmR> {
+  let Constraints = "$rd = $dst"; // ties the output to the $rd input
+}
+
+class AddImmUR1SP<string opstr, RegisterOperand RO> : // 16-bit form: one output register and a uimm6_lsl2 immediate
+  MicroMipsInst16<(outs RO:$rd), (ins uimm6_lsl2:$imm),
+                  !strconcat(opstr, "\t$rd, $imm"), [], NoItinerary, FrmR>; // no register inputs are listed
+
+class AddImmUSP<string opstr> : // 16-bit form with a single simm9_addiusp immediate and no register operands
+  MicroMipsInst16<(outs), (ins simm9_addiusp:$imm),
+                  !strconcat(opstr, "\t$imm"), [], NoItinerary, FrmI>; // no outputs, Defs or Uses are declared here
+
 class MoveFromHILOMM<string opstr, RegisterOperand RO, Register UseReg> :
       MicroMipsInst16<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"),
   [], II_MFHI_MFLO, FrmR> {
@@ -85,6 +192,13 @@
   let isReMaterializable = 1;
 }
 
+class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO, // 16-bit load-immediate: rd <- imm
+                  SDPatternOperator imm_type = null_frag> : // imm_type is accepted but not referenced below
+  MicroMipsInst16<(outs RO:$rd), (ins Od:$imm),
+                  !strconcat(opstr, "\t$rd, $imm"), [], NoItinerary, FrmI> { // empty pattern list
+  let isReMaterializable = 1;
+}
+
 // 16-bit Jump and Link (Call)
 class JumpLinkRegMM16<string opstr, RegisterOperand RO> :
   MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
@@ -94,16 +208,140 @@
   let Defs = [RA];
 }
 
+// 16-bit Jump Reg
+class JumpRegMM16<string opstr, RegisterOperand RO> : // indirect branch through $rs, with a delay slot
+  MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+           [], IIBranch, FrmR> {
+  let hasDelaySlot = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1; // NOTE(review): isTerminator/isBarrier are not set here, unlike JumpRegCMM16
+}
+
+// Base class for JRADDIUSP instruction.
+class JumpRAddiuStackMM16 : // takes no template arguments; mnemonic is hard-coded
+  MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jraddiusp\t$imm",
+                  [], IIBranch, FrmR> { // only an immediate operand; no registers are listed
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+// 16-bit Jump and Link (Call) - Short Delay Slot
+class JumpLinkRegSMM16<string opstr, RegisterOperand RO> : // call through $rs
+  MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+           [], IIBranch, FrmR> {
+  let isCall = 1;
+  let hasDelaySlot = 1;
+  let Defs = [RA]; // modelled as writing RA
+}
+
+// 16-bit Jump Register Compact - No delay slot
+class JumpRegCMM16<string opstr, RegisterOperand RO> : // indirect branch through $rs; hasDelaySlot is left unset
+  MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"),
+                  [], IIBranch, FrmR> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+// MicroMIPS Jump and Link (Call) - Short Delay Slot
+let isCall = 1, hasDelaySlot = 1, Defs = [RA] in { // these three settings apply to every class in the braces
+  class JumpLinkMM<string opstr, DAGOperand opnd> : // direct call to $target
+    InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+           [], IIBranch, FrmJ, opstr> {
+    let DecoderMethod = "DecodeJumpTargetMM"; // decoding is delegated to this named method
+  }
+
+  class JumpLinkRegMM<string opstr, RegisterOperand RO>: // register call with an explicit $rd output
+    InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+            [], IIBranch, FrmR>;
+
+  class BranchCompareToZeroLinkMM<string opstr, DAGOperand opnd, // register + branch-offset form
+                                  RegisterOperand RO> :
+    InstSE<(outs), (ins RO:$rs, opnd:$offset),
+           !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>;
+}
+
+class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO, // load with separate base and index pointer operands
+                              InstrItinClass Itin = NoItinerary,
+                              SDPatternOperator OpNode = null_frag> : // OpNode is accepted but not referenced below
+  InstSE<(outs RO:$rd), (ins PtrRC:$base, PtrRC:$index),
+         !strconcat(opstr, "\t$rd, ${index}(${base})"), [], Itin, FrmFI>; // NOTE(review): mayLoad is not set here -- confirm
+
+/// A list of registers used by load/store multiple instructions.
+def RegListAsmOperand : AsmOperandClass {
+  let Name = "RegList";
+  let ParserMethod = "parseRegisterList"; // assembly parsing is delegated to this named method
+}
+
+def reglist : Operand<i32> { // operand type used by StoreMultMM / LoadMultMM for $rt
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = RegListAsmOperand; // links the operand to the parser class defined above
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeRegListOperand";
+}
+
+class StoreMultMM<string opstr, // store-multiple: register list $rt and memory operand $addr are both inputs
+            InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> : // Addr is accepted but not referenced below
+  InstSE<(outs), (ins reglist:$rt, mem_mm_12:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  let mayStore = 1;
+}
+
+class LoadMultMM<string opstr, // load-multiple: register list $rt is the output, $addr the input
+            InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> : // Addr is accepted but not referenced below
+  InstSE<(outs reglist:$rt), (ins mem_mm_12:$addr),
+          !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI, opstr> {
+  let DecoderMethod = "DecodeMemMMImm12"; // same decoder method as StoreMultMM
+  let mayLoad = 1;
+}
+
+def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>,
+                ARITH_FM_MM16<0>;
+def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>,
+                ARITH_FM_MM16<1>;
+def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>;
+def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>,
+               LOGIC_FM_MM16<0x2>;
+def OR16_MM  : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>,
+               LOGIC_FM_MM16<0x3>;
+def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>,
+               LOGIC_FM_MM16<0x1>;
+def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>;
+def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>,
+               SHIFT_FM_MM16<0>;
+def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>,
+               SHIFT_FM_MM16<1>;
+def ADDIUR1SP_MM : AddImmUR1SP<"addiur1sp", GPRMM16Opnd>, ADDIUR1SP_FM_MM16;
+def ADDIUR2_MM : AddImmUR2<"addiur2", GPRMM16Opnd>, ADDIUR2_FM_MM16;
+def ADDIUS5_MM : AddImmUS5<"addius5", GPR32Opnd>, ADDIUS5_FM_MM16;
+def ADDIUSP_MM : AddImmUSP<"addiusp">, ADDIUSP_FM_MM16;
 def MFHI16_MM : MoveFromHILOMM<"mfhi", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x10>;
 def MFLO16_MM : MoveFromHILOMM<"mflo", GPR32Opnd, AC0>, MFHILO_FM_MM16<0x12>;
 def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>;
+def LI16_MM : LoadImmMM16<"li16", simm7, GPRMM16Opnd, immLi16>,
+              LI_FM_MM16, IsAsCheapAsAMove;
 def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>;
+def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>;
+def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>;
+def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>;
+def JR16_MM : JumpRegMM16<"jr16", GPR32Opnd>, JALR_FM_MM16<0x0c>;
 
 class WaitMM<string opstr> :
   InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [],
          NoItinerary, FrmOther, opstr>;
 
 let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
+  /// Compact Branch Instructions
+  def BEQZC_MM : CompactBranchMM<"beqzc", brtarget_mm, seteq, GPR32Opnd>,
+                 COMPACT_BRANCH_FM_MM<0x7>;
+  def BNEZC_MM : CompactBranchMM<"bnezc", brtarget_mm, setne, GPR32Opnd>,
+                 COMPACT_BRANCH_FM_MM<0x5>;
+
   /// Arithmetic Instructions (ALU Immediate)
   def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd>,
                  ADDI_FM_MM<0xc>;
@@ -179,6 +417,8 @@
     def SW_MM  : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>;
   }
 
+  def LWXS_MM : LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>;
+
   def LWU_MM : LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU>, LL_FM_MM<0xe>;
 
   /// Load and Store Instructions - unaligned
@@ -191,6 +431,10 @@
   def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>,
                LWL_FM_MM<0x9>;
 
+  /// Load and Store Instructions - multiple
+  def SWM32_MM  : StoreMultMM<"swm32">, LWM_FM_MM<0xd>;
+  def LWM32_MM  : LoadMultMM<"lwm32">, LWM_FM_MM<0x5>;
+
   /// Move Conditional
   def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd,
                   NoItinerary>, ADD_FM_MM<0, 0x58>;
@@ -247,6 +491,10 @@
   def JR_MM   : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>;
   def JALR_MM : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
 
+  /// Jump Instructions - Short Delay Slot
+  def JALS_MM   : JumpLinkMM<"jals", calltarget_mm>, J_FM_MM<0x1d>;
+  def JALRS_MM  : JumpLinkRegMM<"jalrs", GPR32Opnd>, JALR_FM_MM<0x13c>;
+
   /// Branch Instructions
   def BEQ_MM  : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
                 BEQ_FM_MM<0x25>;
@@ -265,6 +513,12 @@
   def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
                   BGEZAL_FM_MM<0x01>;
 
+  /// Branch Instructions - Short Delay Slot
+  def BGEZALS_MM : BranchCompareToZeroLinkMM<"bgezals", brtarget_mm,
+                                             GPR32Opnd>, BGEZAL_FM_MM<0x13>;
+  def BLTZALS_MM : BranchCompareToZeroLinkMM<"bltzals", brtarget_mm,
+                                             GPR32Opnd>, BGEZAL_FM_MM<0x11>;
+
   /// Control Instructions
   def SYNC_MM    : MMRel, SYNC_FT<"sync">, SYNC_FM_MM;
   def BREAK_MM   : MMRel, BRK_FT<"break">, BRK_FM_MM;
@@ -295,12 +549,47 @@
   /// Load-linked, Store-conditional
   def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>;
   def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>;
+
+  def TLBP_MM : MMRel, TLB<"tlbp">, COP0_TLB_FM_MM<0x0d>;
+  def TLBR_MM : MMRel, TLB<"tlbr">, COP0_TLB_FM_MM<0x4d>;
+  def TLBWI_MM : MMRel, TLB<"tlbwi">, COP0_TLB_FM_MM<0x8d>;
+  def TLBWR_MM : MMRel, TLB<"tlbwr">, COP0_TLB_FM_MM<0xcd>;
+
+  def SDBBP_MM : MMRel, SYS_FT<"sdbbp">, SDBBP_FM_MM;
+  def RDHWR_MM : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM_MM;
 }
 
+let Predicates = [InMicroMips] in {
+
+//===----------------------------------------------------------------------===//
+// MicroMips arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+def : MipsPat<(add GPRMM16:$src, immSExtAddiur2:$imm), // 16-bit add: GPRMM16 source, immSExtAddiur2 immediate
+              (ADDIUR2_MM GPRMM16:$src, immSExtAddiur2:$imm)>;
+def : MipsPat<(add GPR32:$src, immSExtAddius5:$imm), // 16-bit add: any GPR32 source, immSExtAddius5 immediate
+              (ADDIUS5_MM GPR32:$src, immSExtAddius5:$imm)>;
+def : MipsPat<(add GPR32:$src, immSExt16:$imm), // 32-bit form for the general immSExt16 case
+              (ADDiu_MM GPR32:$src, immSExt16:$imm)>;
+
+def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm), // 16-bit and-immediate for the immZExtAndi16 whitelist
+              (ANDI16_MM GPRMM16:$src, immZExtAndi16:$imm)>;
+def : MipsPat<(and GPR32:$src, immZExt16:$imm), // 32-bit form for the general immZExt16 case
+              (ANDi_MM GPR32:$src, immZExt16:$imm)>;
+
+def : MipsPat<(shl GPRMM16:$src, immZExt2Shift:$imm), // 16-bit left shift for immZExt2Shift amounts
+              (SLL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
+def : MipsPat<(shl GPR32:$src, immZExt5:$imm), // 32-bit form for the general immZExt5 case
+              (SLL_MM GPR32:$src, immZExt5:$imm)>;
+
+def : MipsPat<(srl GPRMM16:$src, immZExt2Shift:$imm), // 16-bit logical right shift for immZExt2Shift amounts
+              (SRL16_MM GPRMM16:$src, immZExt2Shift:$imm)>;
+def : MipsPat<(srl GPR32:$src, immZExt5:$imm), // 32-bit form for the general immZExt5 case
+              (SRL_MM GPR32:$src, immZExt5:$imm)>;
+
 //===----------------------------------------------------------------------===//
 // MicroMips instruction aliases
 //===----------------------------------------------------------------------===//
 
-let Predicates = [InMicroMips] in {
   def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
 }

diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index d512d65..87f1b04 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TARGET_MIPS_H
-#define TARGET_MIPS_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS_H
+#define LLVM_LIB_TARGET_MIPS_MIPS_H
 
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/Target/TargetMachine.h"
@@ -26,8 +26,6 @@
   FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM);
   FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
   FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
-  FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
-                                             JITCodeEmitter &JCE);
   FunctionPass *createMipsConstantIslandPass(MipsTargetMachine &tm);
 } // end namespace llvm;
 

diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index dd3bc9b..3e1d047 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td

@@ -57,6 +57,8 @@
 // Mips Subtarget features                                                    //
 //===----------------------------------------------------------------------===//
 
+def FeatureNoABICalls  : SubtargetFeature<"noabicalls", "NoABICalls", "true",
+                                "Disable SVR4-style position-independent code.">;
 def FeatureGP64Bit     : SubtargetFeature<"gp64", "IsGP64bit", "true",
                                 "General Purpose Registers are 64-bit wide.">;
 def FeatureFP64Bit     : SubtargetFeature<"fp64", "IsFP64bit", "true",
@@ -67,13 +69,13 @@
                                 "IEEE 754-2008 NaN encoding.">;
 def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
                                 "true", "Only supports single precision float">;
-def FeatureO32         : SubtargetFeature<"o32", "MipsABI", "O32",
+def FeatureO32         : SubtargetFeature<"o32", "ABI", "MipsABIInfo::O32()",
                                 "Enable o32 ABI">;
-def FeatureN32         : SubtargetFeature<"n32", "MipsABI", "N32",
+def FeatureN32         : SubtargetFeature<"n32", "ABI", "MipsABIInfo::N32()",
                                 "Enable n32 ABI">;
-def FeatureN64         : SubtargetFeature<"n64", "MipsABI", "N64",
+def FeatureN64         : SubtargetFeature<"n64", "ABI", "MipsABIInfo::N64()",
                                 "Enable n64 ABI">;
-def FeatureEABI        : SubtargetFeature<"eabi", "MipsABI", "EABI",
+def FeatureEABI        : SubtargetFeature<"eabi", "ABI", "MipsABIInfo::EABI()",
                                 "Enable eabi ABI">;
 def FeatureNoOddSPReg  : SubtargetFeature<"nooddspreg", "UseOddSPReg", "false",
                               "Disable odd numbered single-precision "

diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 93706c2..6070276 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp

@@ -36,7 +36,7 @@
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Mips16InstrInfo &TII =
-    *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   uint64_t StackSize = MFI->getStackSize();
@@ -84,7 +84,7 @@
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Mips16InstrInfo &TII =
-    *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
   DebugLoc dl = MBBI->getDebugLoc();
   uint64_t StackSize = MFI->getStackSize();
 
@@ -154,7 +154,7 @@
       Amount = -Amount;
 
     const Mips16InstrInfo &TII =
-      *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+        *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
     TII.adjustStackPtr(Mips::SP, Amount, MBB, I);
   }
@@ -174,7 +174,7 @@
 processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                      RegScavenger *RS) const {
   const Mips16InstrInfo &TII =
-    *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const Mips16InstrInfo *>(MF.getSubtarget().getInstrInfo());
   const MipsRegisterInfo &RI = TII.getRegisterInfo();
   const BitVector Reserved = RI.getReservedRegs(MF);
   bool SaveS2 = Reserved[Mips::S2];

diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index 1fb7eda..012d558 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPS16_FRAMEINFO_H
-#define MIPS16_FRAMEINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16FRAMELOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16FRAMELOWERING_H
 
 #include "MipsFrameLowering.h"
 

diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 14055d6..9488e63 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp

@@ -403,7 +403,7 @@
                            Attribute::ReadNone);
         A = A.addAttribute(C, AttributeSet::FunctionIndex,
                            Attribute::NoInline);
-        Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, NULL));
+        Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, nullptr));
         CallInst::Create(F, Params, "", &Inst );
       } else if (const CallInst *CI = dyn_cast<CallInst>(I)) {
           const Value* V = CI->getCalledValue();

diff --git a/lib/Target/Mips/Mips16HardFloat.h b/lib/Target/Mips/Mips16HardFloat.h
index 826887e..19b7bf2 100644
--- a/lib/Target/Mips/Mips16HardFloat.h
+++ b/lib/Target/Mips/Mips16HardFloat.h

@@ -12,15 +12,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOAT_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOAT_H
+
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MipsTargetMachine.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
 
-
-#ifndef MIPS16HARDFLOAT_H
-#define MIPS16HARDFLOAT_H
-
 using namespace llvm;
 
 namespace llvm {

diff --git a/lib/Target/Mips/Mips16HardFloatInfo.h b/lib/Target/Mips/Mips16HardFloatInfo.h
index 02444d9..7295c28 100644
--- a/lib/Target/Mips/Mips16HardFloatInfo.h
+++ b/lib/Target/Mips/Mips16HardFloatInfo.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPS16HARDFLOATINFO_H
-#define MIPS16HARDFLOATINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOATINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16HARDFLOATINFO_H
 
 namespace llvm {
 

diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 6672aef..7732be4 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp

@@ -37,6 +37,7 @@
 #define DEBUG_TYPE "mips-isel"
 
 bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &TM.getSubtarget<MipsSubtarget>();
   if (!Subtarget->inMips16Mode())
     return false;
   return MipsDAGToDAGISel::runOnMachineFunction(MF);
@@ -71,7 +72,7 @@
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator I = MBB.begin();
   MachineRegisterInfo &RegInfo = MF.getRegInfo();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
   unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
   const TargetRegisterClass *RC =
@@ -102,7 +103,7 @@
 
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator I = MBB.begin();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
   unsigned Mips16SPAliasReg = MipsFI->getMips16SPAliasReg();
 
@@ -134,8 +135,9 @@
         switch (SD->getMemoryVT().getSizeInBits()) {
         case 8:
         case 16:
-          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
-            AliasFPReg: getMips16SPAliasReg();
+          AliasReg = TM.getSubtargetImpl()->getFrameLowering()->hasFP(*MF)
+                         ? AliasFPReg
+                         : getMips16SPAliasReg();
           return;
         }
         break;
@@ -145,8 +147,9 @@
         switch (SD->getMemoryVT().getSizeInBits()) {
         case 8:
         case 16:
-          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
-            AliasFPReg: getMips16SPAliasReg();
+          AliasReg = TM.getSubtargetImpl()->getFrameLowering()->hasFP(*MF)
+                         ? AliasFPReg
+                         : getMips16SPAliasReg();
           return;
         }
         break;

diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.h b/lib/Target/Mips/Mips16ISelDAGToDAG.h
index e653b39..ae0e61e 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.h
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPS16ISELDAGTODAG_H
-#define MIPS16ISELDAGTODAG_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16ISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16ISELDAGTODAG_H
 
 #include "MipsISelDAGToDAG.h"
 

diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 81a05df..d4852c4 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp

@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 #include "Mips16ISelLowering.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16HardFloatInfo.h"
+#include "MipsMachineFunction.h"
 #include "MipsRegisterInfo.h"
 #include "MipsTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
@@ -27,7 +29,7 @@
 static cl::opt<bool> DontExpandCondPseudos16(
   "mips16-dont-expand-cond-pseudo",
   cl::init(false),
-  cl::desc("Dont expand conditional move related "
+  cl::desc("Don't expand conditional move related "
            "pseudos for Mips 16"),
   cl::Hidden);
 
@@ -118,13 +120,14 @@
   {"truncf", "__mips16_call_stub_sf_1"},
 };
 
-Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
-  : MipsTargetLowering(TM) {
+Mips16TargetLowering::Mips16TargetLowering(const MipsTargetMachine &TM,
+                                           const MipsSubtarget &STI)
+    : MipsTargetLowering(TM, STI) {
 
   // Set up the register classes
   addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
 
-  if (Subtarget->inMips16HardFloat())
+  if (!TM.Options.UseSoftFloat)
     setMips16HardFloatLibCalls();
 
   setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Expand);
@@ -150,14 +153,16 @@
 }
 
 const MipsTargetLowering *
-llvm::createMips16TargetLowering(MipsTargetMachine &TM) {
-  return new Mips16TargetLowering(TM);
+llvm::createMips16TargetLowering(const MipsTargetMachine &TM,
+                                 const MipsSubtarget &STI) {
+  return new Mips16TargetLowering(TM, STI);
 }
 
 bool
-Mips16TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                    unsigned,
-                                                    bool *Fast) const {
+Mips16TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                     unsigned,
+                                                     unsigned,
+                                                     bool *Fast) const {
   return false;
 }
 
@@ -239,10 +244,9 @@
   }
 }
 
-bool Mips16TargetLowering::
-isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
-                                  unsigned NextStackOffset,
-                                  const MipsFunctionInfo& FI) const {
+bool Mips16TargetLowering::isEligibleForTailCallOptimization(
+    const CCState &CCInfo, unsigned NextStackOffset,
+    const MipsFunctionInfo &FI) const {
   // No tail call optimization for mips16.
   return false;
 }
@@ -317,7 +321,7 @@
 }
 
 //
-// prefixs are attached to stub numbers depending on the return type .
+// Prefixes are attached to stub numbers depending on the return type.
 // return type: float  sf_
 //              double df_
 //              single complex sc_
@@ -328,17 +332,16 @@
 // The full name of a helper function is__mips16_call_stub +
 //    return type dependent prefix + stub number
 //
-//
-// This is something that probably should be in a different source file and
-// perhaps done differently but my main purpose is to not waste runtime
+// FIXME: This is something that probably should be in a different source file
+// and perhaps done differently but my main purpose is to not waste runtime
 // on something that we can enumerate in the source. Another possibility is
 // to have a python script to generate these mapping tables. This will do
 // for now. There are a whole series of helper function mapping arrays, one
 // for each return type class as outlined above. There there are 11 possible
-//  entries. Ones with 0 are ones which should never be selected
+// entries. Ones with 0 are ones which should never be selected.
 //
 // All the arrays are similar except for ones which return neither
-// sf, df, sc, dc, in which only care about ones which have sf or df as a
+// sf, df, sc, dc, in which we only care about ones which have sf or df as a
 // first parameter.
 //
 #define P_ "__mips16_call_stub_"
@@ -420,14 +423,15 @@
 getOpndList(SmallVectorImpl<SDValue> &Ops,
             std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
             bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
-            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+            bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+            SDValue Chain) const {
   SelectionDAG &DAG = CLI.DAG;
   MachineFunction &MF = DAG.getMachineFunction();
   MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
   const char* Mips16HelperFunction = nullptr;
   bool NeedMips16Helper = false;
 
-  if (Subtarget->inMips16HardFloat()) {
+  if (Subtarget.inMips16HardFloat()) {
     //
     // currently we don't have symbols tagged with the mips16 or mips32
     // qualifier so we will assume that we don't know what kind it is.
@@ -510,14 +514,16 @@
   Ops.push_back(JumpTarget);
 
   MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
-                                  InternalLinkage, CLI, Callee, Chain);
+                                  InternalLinkage, IsCallReloc, CLI, Callee,
+                                  Chain);
 }
 
 MachineBasicBlock *Mips16TargetLowering::
 emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   // To "insert" a SELECT_CC instruction, we actually have to insert the
   // diamond control-flow pattern.  The incoming instruction knows the
@@ -579,7 +585,8 @@
    MachineInstr *MI, MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   // To "insert" a SELECT_CC instruction, we actually have to insert the
   // diamond control-flow pattern.  The incoming instruction knows the
@@ -643,7 +650,8 @@
    MachineInstr *MI, MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   // To "insert" a SELECT_CC instruction, we actually have to insert the
   // diamond control-flow pattern.  The incoming instruction knows the
@@ -708,7 +716,8 @@
                                              MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   unsigned regX = MI->getOperand(0).getReg();
   unsigned regY = MI->getOperand(1).getReg();
   MachineBasicBlock *target = MI->getOperand(2).getMBB();
@@ -724,7 +733,8 @@
   MachineInstr *MI,  MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   unsigned regX = MI->getOperand(0).getReg();
   int64_t imm = MI->getOperand(1).getImm();
   MachineBasicBlock *target = MI->getOperand(2).getMBB();
@@ -758,7 +768,8 @@
   MachineInstr *MI,  MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   unsigned CC = MI->getOperand(0).getReg();
   unsigned regX = MI->getOperand(1).getReg();
   unsigned regY = MI->getOperand(2).getReg();
@@ -775,7 +786,8 @@
   MachineInstr *MI,  MachineBasicBlock *BB )const {
   if (DontExpandCondPseudos16)
     return BB;
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   unsigned CC = MI->getOperand(0).getReg();
   unsigned regX = MI->getOperand(1).getReg();
   int64_t Imm = MI->getOperand(2).getImm();

diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h
index 2a5eec5..d3b9f75 100644
--- a/lib/Target/Mips/Mips16ISelLowering.h
+++ b/lib/Target/Mips/Mips16ISelLowering.h

@@ -11,27 +11,29 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPS16ISELLOWERING_H
-#define MIPS16ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16ISELLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16ISELLOWERING_H
 
 #include "MipsISelLowering.h"
 
 namespace llvm {
   class Mips16TargetLowering : public MipsTargetLowering  {
   public:
-    explicit Mips16TargetLowering(MipsTargetMachine &TM);
+    explicit Mips16TargetLowering(const MipsTargetMachine &TM,
+                                  const MipsSubtarget &STI);
 
-    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
-                                       bool *Fast) const override;
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                        unsigned Align,
+                                        bool *Fast) const override;
 
     MachineBasicBlock *
     EmitInstrWithCustomInserter(MachineInstr *MI,
                                 MachineBasicBlock *MBB) const override;
 
   private:
-    bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
-                                     unsigned NextStackOffset,
-                                     const MipsFunctionInfo& FI) const override;
+    bool isEligibleForTailCallOptimization(
+        const CCState &CCInfo, unsigned NextStackOffset,
+        const MipsFunctionInfo &FI) const override;
 
     void setMips16HardFloatLibCalls();
 
@@ -45,7 +47,7 @@
     getOpndList(SmallVectorImpl<SDValue> &Ops,
                 std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
                 bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
-                CallLoweringInfo &CLI, SDValue Callee,
+                bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
                 SDValue Chain) const override;
 
     MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr *MI,
@@ -77,4 +79,4 @@
   };
 }
 
-#endif // Mips16ISELLOWERING_H
+#endif

diff --git a/lib/Target/Mips/Mips16InstrFormats.td b/lib/Target/Mips/Mips16InstrFormats.td
index da3a1f1..4ff68be 100644
--- a/lib/Target/Mips/Mips16InstrFormats.td
+++ b/lib/Target/Mips/Mips16InstrFormats.td

@@ -591,7 +591,7 @@
   bits<3> funct;
 
   let funct = _funct;
-  let I8 = 0b0110;
+  let I8 = 0b00110;
 
   let Inst{26-21} = imm16{10-5};
   let Inst{20-16} = imm16{15-11};

diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 79607de..4dd9af2 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp

@@ -31,9 +31,8 @@
 
 #define DEBUG_TYPE "mips16-instrinfo"
 
-Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
-  : MipsInstrInfo(tm, Mips::Bimm16),
-    RI(*tm.getSubtargetImpl()) {}
+Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI)
+    : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {}
 
 const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
   return RI;
@@ -44,9 +43,8 @@
 /// the destination along with the FrameIndex of the loaded stack slot.  If
 /// not, return 0.  This predicate must return 0 if the instruction has
 /// any side effects other than loading from the stack slot.
-unsigned Mips16InstrInfo::
-isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
-{
+unsigned Mips16InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
   return 0;
 }
 
@@ -55,9 +53,8 @@
 /// the source reg along with the FrameIndex of the loaded stack slot.  If
 /// not, return 0.  This predicate must return 0 if the instruction has
 /// any side effects other than storing to the stack slot.
-unsigned Mips16InstrInfo::
-isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
-{
+unsigned Mips16InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
   return 0;
 }
 
@@ -93,11 +90,12 @@
     MIB.addReg(SrcReg, getKillRegState(KillSrc));
 }
 
-void Mips16InstrInfo::
-storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                unsigned SrcReg, bool isKill, int FI,
-                const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
-                int64_t Offset) const {
+void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned SrcReg, bool isKill, int FI,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI,
+                                      int64_t Offset) const {
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
   MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
@@ -110,10 +108,12 @@
       .addMemOperand(MMO);
 }
 
-void Mips16InstrInfo::
-loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                 unsigned DestReg, int FI, const TargetRegisterClass *RC,
-                 const TargetRegisterInfo *TRI, int64_t Offset) const {
+void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned DestReg, int FI,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI,
+                                       int64_t Offset) const {
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
   MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
@@ -171,7 +171,8 @@
 }
 
 static void addSaveRestoreRegs(MachineInstrBuilder &MIB,
-                          const std::vector<CalleeSavedInfo> &CSI, unsigned Flags=0) {
+                               const std::vector<CalleeSavedInfo> &CSI,
+                               unsigned Flags = 0) {
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
     // Add the callee-saved register as live-in. Do not add if the register is
     // RA and return address is taken, because it has already been added in
@@ -195,8 +196,8 @@
 }
 // Adjust SP by FrameSize bytes. Save RA, S0, S1
 void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize,
-                    MachineBasicBlock &MBB,
-                    MachineBasicBlock::iterator I) const {
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const {
   DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo *MFI    = MF.getFrameInfo();
@@ -265,9 +266,6 @@
                                         MachineBasicBlock::iterator I,
                                         unsigned Reg1, unsigned Reg2) const {
   DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
-//  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
-//  unsigned Reg1 = RegInfo.createVirtualRegister(&Mips::CPU16RegsRegClass);
-//  unsigned Reg2 = RegInfo.createVirtualRegister(&Mips::CPU16RegsRegClass);
   //
   // li reg1, constant
   // move reg2, sp
@@ -287,9 +285,9 @@
   MIB4.addReg(Reg1, RegState::Kill);
 }
 
-void Mips16InstrInfo::adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount,
-                    MachineBasicBlock &MBB,
-                    MachineBasicBlock::iterator I) const {
+void Mips16InstrInfo::adjustStackPtrBigUnrestricted(
+    unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
    assert(false && "adjust stack pointer amount exceeded");
 }
 
@@ -305,11 +303,10 @@
 
 /// This function generates the sequence of instructions needed to get the
 /// result of adding register REG and immediate IMM.
-unsigned
-Mips16InstrInfo::loadImmediate(unsigned FrameReg,
-                               int64_t Imm, MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator II, DebugLoc DL,
-                               unsigned &NewImm) const {
+unsigned Mips16InstrInfo::loadImmediate(unsigned FrameReg, int64_t Imm,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator II,
+                                        DebugLoc DL, unsigned &NewImm) const {
   //
   // given original instruction is:
   // Instr rx, T[offset] where offset is too big.
@@ -345,7 +342,7 @@
         !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
       Candidates.reset(MO.getReg());
   }
-  //
+
   // If the same register was used and defined in an instruction, then
   // it will not be in the list of candidates.
   //
@@ -354,7 +351,6 @@
   // present as an operand of the instruction. this tells
   // whether the register is live before the instruction. if it's not
   // then we don't need to save it in case there are no free registers.
-  //
   int DefReg = 0;
   for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
     MachineOperand &MO = II->getOperand(i);
@@ -363,9 +359,8 @@
       break;
     }
   }
-  //
-  BitVector Available = rs.getRegsAvailable(&Mips::CPU16RegsRegClass);
 
+  BitVector Available = rs.getRegsAvailable(&Mips::CPU16RegsRegClass);
   Available &= Candidates;
   //
   // we use T0 for the first register, if we need to save something away.
@@ -374,7 +369,6 @@
   unsigned FirstRegSaved =0, SecondRegSaved=0;
   unsigned FirstRegSavedTo = 0, SecondRegSavedTo = 0;
 
-
   Reg = Available.find_first();
 
   if (Reg == -1) {
@@ -442,7 +436,6 @@
   BuildMI(MBB, I, I->getDebugLoc(), get(Opc));
 }
 
-
 const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const {
   if (validSpImm8(Imm))
     return get(Mips::AddiuSpImm16);
@@ -456,8 +449,8 @@
   BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm);
 }
 
-const MipsInstrInfo *llvm::createMips16InstrInfo(MipsTargetMachine &TM) {
-  return new Mips16InstrInfo(TM);
+const MipsInstrInfo *llvm::createMips16InstrInfo(const MipsSubtarget &STI) {
+  return new Mips16InstrInfo(STI);
 }
 
 bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg,
@@ -497,7 +490,6 @@
 unsigned Mips16InstrInfo::getInlineAsmLength(const char *Str,
                                              const MCAsmInfo &MAI) const {
 
-
   // Count the number of instructions in the asm.
   bool atInsnStart = true;
   unsigned Length = 0;

diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index 0dc0046..e7d0c07 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPS16INSTRUCTIONINFO_H
-#define MIPS16INSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16INSTRINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16INSTRINFO_H
 
 #include "Mips16RegisterInfo.h"
 #include "MipsInstrInfo.h"
@@ -23,7 +23,7 @@
   const Mips16RegisterInfo RI;
 
 public:
-  explicit Mips16InstrInfo(MipsTargetMachine &TM);
+  explicit Mips16InstrInfo(const MipsSubtarget &STI);
 
   const MipsRegisterInfo &getRegisterInfo() const override;
 

diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index 5e4eebb..2364f4d 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td

@@ -1771,9 +1771,9 @@
 
 //
 // For constants, llvm transforms this to:
-// x > (k -1) and then reverses the operands to use setlt. So this pattern
+// x > (k - 1) and then reverses the operands to use setlt. So this pattern
 // is not used now by the compiler. (Presumably checking that k-1 does not
-// overflow). The compiler never uses this at a the current time, due to
+// overflow). The compiler never uses this at the current time, due to
 // other optimizations.
 //
 //def: Mips16Pat

diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index dbee774..0bb452a 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp

@@ -65,7 +65,7 @@
    const TargetRegisterClass *RC,
    unsigned Reg) const {
   DebugLoc DL;
-  const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo();
   TII.copyPhysReg(MBB, I, DL, Mips::T0, Reg, true);
   TII.copyPhysReg(MBB, UseMI, DL, Reg, Mips::T0, true);
   return true;
@@ -106,7 +106,7 @@
   if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)
     FrameReg = Mips::SP;
   else {
-    const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+    const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
     if (TFI->hasFP(MF)) {
       FrameReg = Mips::S0;
     }
@@ -140,8 +140,8 @@
     DebugLoc DL = II->getDebugLoc();
     unsigned NewImm;
     const Mips16InstrInfo &TII =
-      *static_cast<const Mips16InstrInfo*>(
-        MBB.getParent()->getTarget().getInstrInfo());
+        *static_cast<const Mips16InstrInfo *>(
+            MBB.getParent()->getSubtarget().getInstrInfo());
     FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm);
     Offset = SignExtend64<16>(NewImm);
     IsKill = true;

diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h
index f59f1a7..3cdf836 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.h
+++ b/lib/Target/Mips/Mips16RegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPS16REGISTERINFO_H
-#define MIPS16REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPS16REGISTERINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPS16REGISTERINFO_H
 
 #include "MipsRegisterInfo.h"
 

diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td
index e4ec96a..e9a4289 100644
--- a/lib/Target/Mips/Mips32r6InstrFormats.td
+++ b/lib/Target/Mips/Mips32r6InstrFormats.td

@@ -403,7 +403,7 @@
   bits<32> Inst;
 
   let Inst{31-26} = funct;
-  let Inst{25-21} = 0b000000;
+  let Inst{25-21} = 0b00000;
   let Inst{20-16} = rt;
   let Inst{15-0} = offset;
 }

diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index d06e5ca..6d6735b 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td

@@ -796,8 +796,8 @@
                   (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
               ISA_MIPS32R6;
 def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f),
-              (OR (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)),
-                  (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)))>,
+              (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)),
+                  (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>,
               ISA_MIPS32R6;
 def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t,
                       i32:$f),

diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index f0b6814..4e2dcd8 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td

@@ -419,6 +419,10 @@
 defm : SetgeImmPats<GPR64, SLTi64, SLTiu64>;
 
 // truncate
+def : MipsPat<(trunc (assertsext GPR64:$src)),
+              (EXTRACT_SUBREG GPR64:$src, sub_32)>;
+def : MipsPat<(trunc (assertzext GPR64:$src)),
+              (EXTRACT_SUBREG GPR64:$src, sub_32)>;
 def : MipsPat<(i32 (trunc GPR64:$src)),
               (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
 
@@ -442,28 +446,22 @@
       GPR_64;
 def : MipsInstAlias<"daddu $rs, $rt, $imm",
                     (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
-                    0>;
+                    0>, ISA_MIPS3;
 def : MipsInstAlias<"dadd $rs, $rt, $imm",
                     (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
                     0>, ISA_MIPS3_NOT_32R6_64R6;
 def : MipsInstAlias<"daddu $rs, $imm",
                     (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
-                    0>;
+                    0>, ISA_MIPS3;
 def : MipsInstAlias<"dadd $rs, $imm",
                     (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
                     0>, ISA_MIPS3_NOT_32R6_64R6;
-def : MipsInstAlias<"add $rs, $imm",
-                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
-                    0>;
-def : MipsInstAlias<"addu $rs, $imm",
-                    (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
-                    0>;
 def : MipsInstAlias<"dsll $rd, $rt, $rs",
                     (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
                     ISA_MIPS3;
 def : MipsInstAlias<"dsubu $rt, $rs, $imm",
                     (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs,
-                            InvertedImOperand64:$imm), 0>;
+                            InvertedImOperand64:$imm), 0>, ISA_MIPS3;
 def : MipsInstAlias<"dsubi $rs, $rt, $imm",
                     (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt,
                            InvertedImOperand64:$imm),
@@ -483,7 +481,7 @@
 def : MipsInstAlias<"dsubu $rs, $imm",
                     (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs,
                             InvertedImOperand64:$imm),
-                    0>;
+                    0>, ISA_MIPS3;
 def : MipsInstAlias<"dsra $rd, $rt, $rs",
                     (DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
                     ISA_MIPS3;
@@ -510,3 +508,9 @@
 def : MipsInstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
 def : MipsInstAlias<"dmtc2 $rt, $rd", (DMTC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
 
+let Predicates = [HasMips64, HasCnMips] in {
+def : MipsInstAlias<"synciobdma", (SYNC 0x2), 0>;
+def : MipsInstAlias<"syncs",      (SYNC 0x6), 0>;
+def : MipsInstAlias<"syncw",      (SYNC 0x4), 0>;
+def : MipsInstAlias<"syncws",     (SYNC 0x5), 0>;
+}

diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td
index 63cf60b..6b546e8 100644
--- a/lib/Target/Mips/Mips64r6InstrInfo.td
+++ b/lib/Target/Mips/Mips64r6InstrInfo.td

@@ -191,9 +191,9 @@
                                                       immZExt16:$imm))))>,
               ISA_MIPS64R6;
 def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i64:$t, i64:$f),
-              (OR64 (SELNEZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
+              (OR64 (SELNEZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
                                                       immZExt16:$imm))),
-                    (SELEQZ64 i64:$t, (SLL64_32 (XORi i32:$cond,
+                    (SELEQZ64 i64:$f, (SLL64_32 (XORi i32:$cond,
                                                       immZExt16:$imm))))>,
               ISA_MIPS64R6;
 

diff --git a/lib/Target/Mips/MipsABIInfo.cpp b/lib/Target/Mips/MipsABIInfo.cpp
new file mode 100644
index 0000000..f885369
--- /dev/null
+++ b/lib/Target/Mips/MipsABIInfo.cpp

@@ -0,0 +1,52 @@
+//===---- MipsABIInfo.cpp - Information about MIPS ABI's ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsABIInfo.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+// Registers returned for the O32 ABI by GetByValArgRegs()/GetVarArgRegs().
+static const MCPhysReg O32IntRegs[4] = {Mips::A0, Mips::A1, Mips::A2, Mips::A3};
+
+// Registers returned for the N32 and N64 ABIs by the same two accessors.
+static const MCPhysReg Mips64IntRegs[8] = {
+    Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
+    Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64};
+
+/// Returns O32IntRegs for O32 and Mips64IntRegs for N32/N64. Any other ABI
+/// (Unknown, EABI) falls through to llvm_unreachable.
+const ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const {
+  if (IsO32())
+    return makeArrayRef(O32IntRegs);
+  if (IsN32() || IsN64())
+    return makeArrayRef(Mips64IntRegs);
+  llvm_unreachable("Unhandled ABI");
+}
+
+/// Currently selects exactly the same registers as GetByValArgRegs(): O32IntRegs
+/// for O32, Mips64IntRegs for N32/N64, llvm_unreachable for anything else.
+const ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const {
+  if (IsO32())
+    return makeArrayRef(O32IntRegs);
+  if (IsN32() || IsN64())
+    return makeArrayRef(Mips64IntRegs);
+  llvm_unreachable("Unhandled ABI");
+}
+
+/// Returns 16 for O32 unless CC is CallingConv::Fast (then 0), and 0 for N32,
+/// N64 and EABI. An Unknown ABI falls through to llvm_unreachable.
+unsigned MipsABIInfo::GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const {
+  if (IsO32())
+    return CC != CallingConv::Fast ? 16 : 0;
+  if (IsN32() || IsN64() || IsEABI())
+    return 0;
+  llvm_unreachable("Unhandled ABI");
+}

diff --git a/lib/Target/Mips/MipsABIInfo.h b/lib/Target/Mips/MipsABIInfo.h
new file mode 100644
index 0000000..bea585e
--- /dev/null
+++ b/lib/Target/Mips/MipsABIInfo.h

@@ -0,0 +1,66 @@
+//===---- MipsABIInfo.h - Information about MIPS ABI's --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSABIINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSABIINFO_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+
+/// Value type naming one MIPS ABI (or Unknown) and answering the
+/// ABI-dependent questions declared below.
+class MipsABIInfo {
+public:
+  enum class ABI { Unknown, O32, N32, N64, EABI };
+
+protected:
+  ABI ThisABI;
+
+public:
+  MipsABIInfo(ABI ThisABI) : ThisABI(ThisABI) {}
+
+  // Named constructors, one per enumerator of ABI.
+  static MipsABIInfo Unknown() { return MipsABIInfo(ABI::Unknown); }
+  static MipsABIInfo O32() { return MipsABIInfo(ABI::O32); }
+  static MipsABIInfo N32() { return MipsABIInfo(ABI::N32); }
+  static MipsABIInfo N64() { return MipsABIInfo(ABI::N64); }
+  static MipsABIInfo EABI() { return MipsABIInfo(ABI::EABI); }
+
+  // Predicates on the stored ABI; IsKnown() is true for anything but Unknown.
+  bool IsKnown() const { return ThisABI != ABI::Unknown; }
+  bool IsO32() const { return ThisABI == ABI::O32; }
+  bool IsN32() const { return ThisABI == ABI::N32; }
+  bool IsN64() const { return ThisABI == ABI::N64; }
+  bool IsEABI() const { return ThisABI == ABI::EABI; }
+  ABI GetEnumValue() const { return ThisABI; }
+
+  /// The registers to use for byval arguments.
+  const ArrayRef<MCPhysReg> GetByValArgRegs() const;
+
+  /// The registers to use for the variable argument list.
+  const ArrayRef<MCPhysReg> GetVarArgRegs() const;
+
+  /// Obtain the size of the area allocated by the callee for arguments.
+  /// CallingConv::Fast affects the value for O32.
+  unsigned GetCalleeAllocdArgSizeInBytes(CallingConv::ID CC) const;
+
+  /// Ordering of ABIs.
+  /// MipsGenSubtargetInfo.inc will use this to resolve conflicts when given
+  /// multiple ABI options.
+  /// Compares the enumerators, so the order is that of the ABI declaration.
+  bool operator<(const MipsABIInfo Other) const {
+    return ThisABI < Other.GetEnumValue();
+  }
+};
+} // end namespace llvm
+
+#endif

diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
index 31a9b7d..161345d 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp

@@ -72,7 +72,8 @@
   if (Imm & 0x8000) {
     InstSeqLs SeqLsORi;
     GetInstSeqLsORi(Imm, RemSize, SeqLsORi);
-    SeqLs.insert(SeqLs.end(), SeqLsORi.begin(), SeqLsORi.end());
+    SeqLs.append(std::make_move_iterator(SeqLsORi.begin()),
+                 std::make_move_iterator(SeqLsORi.end()));
   }
 }
 

diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.h b/lib/Target/Mips/MipsAnalyzeImmediate.h
index cc09034..ae3c38c 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.h
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.h

@@ -6,8 +6,8 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-#ifndef MIPS_ANALYZE_IMMEDIATE_H
-#define MIPS_ANALYZE_IMMEDIATE_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSANALYZEIMMEDIATE_H
+#define LLVM_LIB_TARGET_MIPS_MIPSANALYZEIMMEDIATE_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/DataTypes.h"

diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 1fb75a2..832fa05 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp

@@ -58,10 +58,12 @@
 }
 
 bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &TM.getSubtarget<MipsSubtarget>();
+
   // Initialize TargetLoweringObjectFile.
-  if (Subtarget->allowMixed16_32())
-    const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
+  const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
       .Initialize(OutContext, TM);
+
   MipsFI = MF.getInfo<MipsFunctionInfo>();
   if (Subtarget->inMips16Mode())
     for (std::map<
@@ -129,7 +131,7 @@
 
 void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   MipsTargetStreamer &TS = getTargetStreamer();
-  TS.setCanHaveModuleDir(false);
+  TS.forbidModuleDirective();
 
   if (MI->isDebugValue()) {
     SmallString<128> Str;
@@ -264,7 +266,8 @@
     if (Mips::GPR32RegClass.contains(Reg))
       break;
 
-    unsigned RegNum = TM.getRegisterInfo()->getEncodingValue(Reg);
+    unsigned RegNum =
+        TM.getSubtargetImpl()->getRegisterInfo()->getEncodingValue(Reg);
     if (Mips::AFGR64RegClass.contains(Reg)) {
       FPUBitmask |= (3 << RegNum);
       CSFPRegsSize += AFGR64RegSize;
@@ -279,7 +282,8 @@
   // Set CPU Bitmask.
   for (; i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
-    unsigned RegNum = TM.getRegisterInfo()->getEncodingValue(Reg);
+    unsigned RegNum =
+        TM.getSubtargetImpl()->getRegisterInfo()->getEncodingValue(Reg);
     CPUBitmask |= (1 << RegNum);
   }
 
@@ -304,7 +308,7 @@
 
 /// Frame Directive
 void MipsAsmPrinter::emitFrameDirective() {
-  const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+  const TargetRegisterInfo &RI = *TM.getSubtargetImpl()->getRegisterInfo();
 
   unsigned stackReg  = RI.getFrameRegister(*MF);
   unsigned returnReg = RI.getRARegister();
@@ -315,11 +319,11 @@
 
 /// Emit Set directives.
 const char *MipsAsmPrinter::getCurrentABIString() const {
-  switch (Subtarget->getTargetABI()) {
-  case MipsSubtarget::O32:  return "abi32";
-  case MipsSubtarget::N32:  return "abiN32";
-  case MipsSubtarget::N64:  return "abi64";
-  case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
+  switch (Subtarget->getABI().GetEnumValue()) {
+  case MipsABIInfo::ABI::O32:  return "abi32";
+  case MipsABIInfo::ABI::N32:  return "abiN32";
+  case MipsABIInfo::ABI::N64:  return "abi64";
+  case MipsABIInfo::ABI::EABI: return "eabi32"; // TODO: handle eabi64
   default: llvm_unreachable("Unknown Mips ABI");
   }
 }
@@ -469,14 +473,12 @@
       return false;
     case 'z': {
       // $0 if zero, regular printing otherwise
-      if (MO.getType() != MachineOperand::MO_Immediate)
-        return true;
-      int64_t Val = MO.getImm();
-      if (Val)
-        O << Val;
-      else
+      if (MO.getType() == MachineOperand::MO_Immediate && MO.getImm() == 0) {
         O << "$0";
-      return false;
+        return false;
+      }
+      // If not, call printOperand as normal.
+      break;
     }
     case 'D': // Second part of a double word register operand
     case 'L': // Low order register of a double word register operand
@@ -558,7 +560,7 @@
 
 void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                   raw_ostream &O) {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   const MachineOperand &MO = MI->getOperand(opNum);
   bool closeP = false;
 
@@ -643,6 +645,18 @@
   // Load/Store memory operands -- imm($reg)
   // If PIC target the target is loaded as the
   // pattern lw $25,%call16($28)
+
+  // opNum can be invalid if instruction has reglist as operand.
+  // MemOperand is always last operand of instruction (base + offset).
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::SWM32_MM:
+  case Mips::LWM32_MM:
+    opNum = MI->getNumOperands() - 2;
+    break;
+  }
+
   printOperand(MI, opNum+1, O);
   O << "(";
   printOperand(MI, opNum, O);
@@ -666,13 +680,19 @@
   O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm());
 }
 
+void MipsAsmPrinter::
+printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O) {
+  for (int i = opNum, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != opNum) O << ", ";
+    printOperand(MI, i, O);
+  }
+}
+
 void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
-  // TODO: Need to add -mabicalls and -mno-abicalls flags.
-  // Currently we assume that -mabicalls is the default.
-  bool IsABICalls = true;
+  bool IsABICalls = Subtarget->isABICalls();
   if (IsABICalls) {
     getTargetStreamer().emitDirectiveAbiCalls();
-    Reloc::Model RM = Subtarget->getRelocationModel();
+    Reloc::Model RM = TM.getRelocationModel();
     // FIXME: This condition should be a lot more complicated that it is here.
     //        Ideally it should test for properties of the ABI and not the ABI
     //        itself.
@@ -706,9 +726,19 @@
   }
 
   getTargetStreamer().updateABIInfo(*Subtarget);
-  getTargetStreamer().emitDirectiveModuleFP();
 
-  if (Subtarget->isABI_O32())
+  // We should always emit a '.module fp=...' but binutils 2.24 does not accept
+  // it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or
+  // -mfp64) and omit it otherwise.
+  if (Subtarget->isABI_O32() && (Subtarget->isABI_FPXX() ||
+                                 Subtarget->isFP64bit()))
+    getTargetStreamer().emitDirectiveModuleFP();
+
+  // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not
+  // accept it. We therefore emit it when it contradicts the default or an
+  // option has changed the default (i.e. FPXX) and omit it otherwise.
+  if (Subtarget->isABI_O32() && (!Subtarget->useOddSPReg() ||
+                                 Subtarget->isABI_FPXX()))
     getTargetStreamer().emitDirectiveModuleOddSPReg(Subtarget->useOddSPReg(),
                                                     Subtarget->isABI_O32());
 }

diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index 967aa0b..0582e21 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSASMPRINTER_H
-#define MIPSASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSASMPRINTER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSASMPRINTER_H
 
 #include "Mips16HardFloatInfo.h"
 #include "MipsMCInstLower.h"
@@ -89,11 +89,14 @@
   const MipsFunctionInfo *MipsFI;
   MipsMCInstLower MCInstLowering;
 
-  explicit MipsAsmPrinter(TargetMachine &TM,  MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer), MCP(nullptr), InConstantPool(false),
-      MCInstLowering(*this) {
-    Subtarget = &TM.getSubtarget<MipsSubtarget>();
-  }
+  // We initialize the subtarget here and in runOnMachineFunction
+  // since there are certain target specific flags (ABI) that could
+  // reside on the TargetMachine, but are on the subtarget currently
+  // and we need them for the beginning of file output before we've
+  // seen a single function.
+  explicit MipsAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer), MCP(nullptr), InConstantPool(false),
+        Subtarget(&TM.getSubtarget<MipsSubtarget>()), MCInstLowering(*this) {}
 
   const char *getPassName() const override {
     return "Mips Assembly Printer";
@@ -131,6 +134,7 @@
   void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
                        const char *Modifier = nullptr);
+  void printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O);
   void EmitStartOfAsmFile(Module &M) override;
   void EmitEndOfAsmFile(Module &M) override;
   void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);

diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp
new file mode 100644
index 0000000..e18cc8b
--- /dev/null
+++ b/lib/Target/Mips/MipsCCState.cpp

@@ -0,0 +1,142 @@
+//===---- MipsCCState.cpp - CCState with Mips specific extensions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsCCState.h"
+#include "MipsSubtarget.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+/// This function returns true if CallSym is a long double emulation routine.
+static bool isF128SoftLibCall(const char *CallSym) {
+  const char *const LibCalls[] = {
+      "__addtf3",      "__divtf3",     "__eqtf2",       "__extenddftf2",
+      "__extendsftf2", "__fixtfdi",    "__fixtfsi",     "__fixtfti",
+      "__fixunstfdi",  "__fixunstfsi", "__fixunstfti",  "__floatditf",
+      "__floatsitf",   "__floattitf",  "__floatunditf", "__floatunsitf",
+      "__floatuntitf", "__getf2",      "__gttf2",       "__letf2",
+      "__lttf2",       "__multf3",     "__netf2",       "__powitf2",
+      "__subtf3",      "__trunctfdf2", "__trunctfsf2",  "__unordtf2",
+      "ceill",         "copysignl",    "cosl",          "exp2l",
+      "expl",          "floorl",       "fmal",          "fmodl",
+      "log10l",        "log2l",        "logl",          "nearbyintl",
+      "powl",          "rintl",        "sinl",          "sqrtl",
+      "truncl"};
+
+  const char *const *End = LibCalls + array_lengthof(LibCalls);
+
+  // Check that LibCalls is sorted alphabetically.
+  MipsTargetLowering::LTStr Comp;
+
+#ifndef NDEBUG
+  for (const char *const *I = LibCalls; I < End - 1; ++I)
+    assert(Comp(*I, *(I + 1)));
+#endif
+
+  return std::binary_search(LibCalls, End, CallSym, Comp);
+}
+
+/// This function returns true if Ty is fp128, {f128} or i128 which was
+/// originally an fp128.
+static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) {
+  if (Ty->isFP128Ty())
+    return true;
+
+  if (Ty->isStructTy() && Ty->getStructNumElements() == 1 &&
+      Ty->getStructElementType(0)->isFP128Ty())
+    return true;
+
+  const ExternalSymbolSDNode *ES =
+      dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode);
+
+  // If the Ty is i128 and the function being called is a long double emulation
+  // routine, then the original type is f128.
+  return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
+}
+
+MipsCCState::SpecialCallingConvType
+MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee,
+                                            const MipsSubtarget &Subtarget) {
+  MipsCCState::SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv;
+  if (Subtarget.inMips16HardFloat()) {
+    if (const GlobalAddressSDNode *G =
+            dyn_cast<const GlobalAddressSDNode>(Callee)) {
+      llvm::StringRef Sym = G->getGlobal()->getName();
+      Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+      if (F && F->hasFnAttribute("__Mips16RetHelper")) {
+        SpecialCallingConv = Mips16RetHelperConv;
+      }
+    }
+  }
+  return SpecialCallingConv;
+}
+
+void MipsCCState::PreAnalyzeCallResultForF128(
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    const TargetLowering::CallLoweringInfo &CLI) {
+  for (unsigned i = 0; i < Ins.size(); ++i) {
+    OriginalArgWasF128.push_back(
+        originalTypeIsF128(CLI.RetTy, CLI.Callee.getNode()));
+    OriginalArgWasFloat.push_back(CLI.RetTy->isFloatingPointTy());
+  }
+}
+
+/// Identify lowered return values that originated from f128 and record this
+/// for use by RetCC_MipsN.
+void MipsCCState::PreAnalyzeReturnForF128(
+    const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  const MachineFunction &MF = getMachineFunction();
+  for (unsigned i = 0; i < Outs.size(); ++i) {
+    OriginalArgWasF128.push_back(
+        originalTypeIsF128(MF.getFunction()->getReturnType(), nullptr));
+    OriginalArgWasFloat.push_back(
+        MF.getFunction()->getReturnType()->isFloatingPointTy());
+  }
+}
+
+/// Identify lowered values that originated from f128 arguments and record
+/// this.
+void MipsCCState::PreAnalyzeCallOperands(
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    std::vector<TargetLowering::ArgListEntry> &FuncArgs,
+    const SDNode *CallNode) {
+  for (unsigned i = 0; i < Outs.size(); ++i) {
+    OriginalArgWasF128.push_back(
+        originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, CallNode));
+    OriginalArgWasFloat.push_back(
+        FuncArgs[Outs[i].OrigArgIndex].Ty->isFloatingPointTy());
+    CallOperandIsFixed.push_back(Outs[i].IsFixed);
+  }
+}
+
+/// Identify lowered values that originated from f128 arguments and record
+/// this.
+void MipsCCState::PreAnalyzeFormalArgumentsForF128(
+    const SmallVectorImpl<ISD::InputArg> &Ins) {
+  const MachineFunction &MF = getMachineFunction();
+  for (unsigned i = 0; i < Ins.size(); ++i) {
+    Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+
+    // SRet arguments cannot originate from f128 or {f128} returns so we just
+    // push false. We have to handle this specially since SRet arguments
+    // aren't mapped to an original argument.
+    if (Ins[i].Flags.isSRet()) {
+      OriginalArgWasF128.push_back(false);
+      OriginalArgWasFloat.push_back(false);
+      continue;
+    }
+
+    assert(Ins[i].OrigArgIndex < MF.getFunction()->arg_size());
+    std::advance(FuncArg, Ins[i].OrigArgIndex);
+
+    OriginalArgWasF128.push_back(
+        originalTypeIsF128(FuncArg->getType(), nullptr));
+    OriginalArgWasFloat.push_back(FuncArg->getType()->isFloatingPointTy());
+  }
+}

diff --git a/lib/Target/Mips/MipsCCState.h b/lib/Target/Mips/MipsCCState.h
new file mode 100644
index 0000000..cc4531d
--- /dev/null
+++ b/lib/Target/Mips/MipsCCState.h

@@ -0,0 +1,136 @@
+//===---- MipsCCState.h - CCState with Mips specific extensions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSCCSTATE_H
+#define MIPSCCSTATE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "MipsISelLowering.h"
+
+namespace llvm {
+class SDNode;
+class MipsSubtarget;
+
+class MipsCCState : public CCState {
+public:
+  enum SpecialCallingConvType { Mips16RetHelperConv, NoSpecialCallingConv };
+
+  /// Determine the SpecialCallingConvType for the given callee
+  static SpecialCallingConvType
+  getSpecialCallingConvForCallee(const SDNode *Callee,
+                                 const MipsSubtarget &Subtarget);
+
+private:
+  /// Identify lowered call results that originated from f128 and record this
+  /// for use by RetCC_MipsN.
+  void PreAnalyzeCallResultForF128(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   const TargetLowering::CallLoweringInfo &CLI);
+
+  /// Identify lowered return values that originated from f128 and record this
+  /// for use by RetCC_MipsN.
+  void PreAnalyzeReturnForF128(const SmallVectorImpl<ISD::OutputArg> &Outs);
+
+  /// Identify lowered values that originated from f128 arguments and record
+  /// this.
+  void
+  PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                         std::vector<TargetLowering::ArgListEntry> &FuncArgs,
+                         const SDNode *CallNode);
+
+  /// Identify lowered values that originated from f128 arguments and record
+  /// this.
+  void
+  PreAnalyzeFormalArgumentsForF128(const SmallVectorImpl<ISD::InputArg> &Ins);
+
+  /// Records whether the value has been lowered from an f128.
+  SmallVector<bool, 4> OriginalArgWasF128;
+
+  /// Records whether the value has been lowered from a floating-point type.
+  SmallVector<bool, 4> OriginalArgWasFloat;
+
+  /// Records whether the value was a fixed argument.
+  /// See ISD::OutputArg::IsFixed.
+  SmallVector<bool, 4> CallOperandIsFixed;
+
+  // Used to handle MIPS16-specific calling convention tweaks.
+  // FIXME: This should probably be a fully fledged calling convention.
+  SpecialCallingConvType SpecialCallingConv;
+
+public:
+  MipsCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+              SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
+              SpecialCallingConvType SpecialCC = NoSpecialCallingConv)
+      : CCState(CC, isVarArg, MF, locs, C), SpecialCallingConv(SpecialCC) {}
+
+  void
+  AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      CCAssignFn Fn,
+                      std::vector<TargetLowering::ArgListEntry> &FuncArgs,
+                      const SDNode *CallNode) {
+    PreAnalyzeCallOperands(Outs, FuncArgs, CallNode);
+    CCState::AnalyzeCallOperands(Outs, Fn);
+    OriginalArgWasF128.clear();
+    OriginalArgWasFloat.clear();
+    CallOperandIsFixed.clear();
+  }
+
+  // The AnalyzeCallOperands overloads in the base class are not usable since
+  // we must provide a means of accessing ArgListEntry::IsFixed. Delete them
+  // from this class, although they can still be reached via the base class.
+  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           CCAssignFn Fn) LLVM_DELETED_FUNCTION;
+  void AnalyzeCallOperands(const SmallVectorImpl<MVT> &Outs,
+                           SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                           CCAssignFn Fn) LLVM_DELETED_FUNCTION;
+
+  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                              CCAssignFn Fn) {
+    PreAnalyzeFormalArgumentsForF128(Ins);
+    CCState::AnalyzeFormalArguments(Ins, Fn);
+    OriginalArgWasFloat.clear();
+    OriginalArgWasF128.clear();
+  }
+
+  void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                         CCAssignFn Fn,
+                         const TargetLowering::CallLoweringInfo &CLI) {
+    PreAnalyzeCallResultForF128(Ins, CLI);
+    CCState::AnalyzeCallResult(Ins, Fn);
+    OriginalArgWasFloat.clear();
+    OriginalArgWasF128.clear();
+  }
+
+  void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                     CCAssignFn Fn) {
+    PreAnalyzeReturnForF128(Outs);
+    CCState::AnalyzeReturn(Outs, Fn);
+    OriginalArgWasFloat.clear();
+    OriginalArgWasF128.clear();
+  }
+
+  bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
+                   CCAssignFn Fn) {
+    PreAnalyzeReturnForF128(ArgsFlags);
+    bool Return = CCState::CheckReturn(ArgsFlags, Fn);
+    OriginalArgWasFloat.clear();
+    OriginalArgWasF128.clear();
+    return Return;
+  }
+
+  bool WasOriginalArgF128(unsigned ValNo) { return OriginalArgWasF128[ValNo]; }
+  bool WasOriginalArgFloat(unsigned ValNo) {
+      return OriginalArgWasFloat[ValNo];
+  }
+  bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; }
+  SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; }
+};
+}
+
+#endif

diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 007213c..7318de2 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td

@@ -10,13 +10,60 @@
 //===----------------------------------------------------------------------===//
 
 /// CCIfSubtarget - Match if the current subtarget has a feature F.
-class CCIfSubtarget<string F, CCAction A>:
-  CCIf<!strconcat("State.getTarget().getSubtarget<MipsSubtarget>().", F), A>;
+class CCIfSubtarget<string F, CCAction A, string Invert = "">
+    : CCIf<!strconcat(Invert,
+                      "static_cast<const MipsSubtarget&>"
+			"(State.getMachineFunction().getSubtarget()).",
+                      F),
+           A>;
+
+// The inverse of CCIfSubtarget
+class CCIfSubtargetNot<string F, CCAction A> : CCIfSubtarget<F, A, "!">;
+
+// For soft-float, f128 values are returned in A0_64 rather than V1_64.
+def RetCC_F128SoftFloat : CallingConv<[
+  CCAssignToReg<[V0_64, A0_64]>
+]>;
+
+// For hard-float, f128 values are returned as a pair of f64's rather than a
+// pair of i64's.
+def RetCC_F128HardFloat : CallingConv<[
+  CCBitConvertToType<f64>,
+
+  // Contrary to the ABI documentation, a struct containing a long double is
+  // returned in $f0, and $f1 instead of the usual $f0, and $f2. This is to
+  // match the de facto ABI as implemented by GCC.
+  CCIfInReg<CCAssignToReg<[D0_64, D1_64]>>,
+
+  CCAssignToReg<[D0_64, D2_64]>
+]>;
+
+// Handle F128 specially since we can't identify the original type during the
+// tablegen-erated code.
+def RetCC_F128 : CallingConv<[
+  CCIfSubtarget<"abiUsesSoftFloat()",
+      CCIfType<[i64], CCDelegateTo<RetCC_F128SoftFloat>>>,
+  CCIfSubtargetNot<"abiUsesSoftFloat()",
+      CCIfType<[i64], CCDelegateTo<RetCC_F128HardFloat>>>
+]>;
 
 //===----------------------------------------------------------------------===//
 // Mips O32 Calling Convention
 //===----------------------------------------------------------------------===//
 
+def CC_MipsO32 : CallingConv<[
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // i32 and f32 values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // f64 values get stored in stack slots that are 8 bytes in
+  // size and 8-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 8>>
+]>;
+
 // Only the return rules are defined here for O32. The rules for argument
 // passing are defined in MipsISelLowering.cpp.
 def RetCC_MipsO32 : CallingConv<[
@@ -26,26 +73,46 @@
   // f32 are returned in registers F0, F2
   CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
 
-  // f64 arguments are returned in D0_64 and D1_64 in FP64bit mode or
+  // f64 arguments are returned in D0_64 and D2_64 in FP64bit mode or
   // in D0 and D1 in FP32bit mode.
-  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D1_64]>>>,
-  CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()", CCAssignToReg<[D0, D1]>>>
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D2_64]>>>,
+  CCIfType<[f64], CCIfSubtargetNot<"isFP64bit()", CCAssignToReg<[D0, D1]>>>
+]>;
+
+def CC_MipsO32_FP32 : CustomCallingConv;
+def CC_MipsO32_FP64 : CustomCallingConv;
+
+def CC_MipsO32_FP : CallingConv<[
+  CCIfSubtargetNot<"isFP64bit()", CCDelegateTo<CC_MipsO32_FP32>>,
+  CCIfSubtarget<"isFP64bit()", CCDelegateTo<CC_MipsO32_FP64>>
 ]>;
 
 //===----------------------------------------------------------------------===//
 // Mips N32/64 Calling Convention
 //===----------------------------------------------------------------------===//
 
+def CC_MipsN_SoftFloat : CallingConv<[
+  CCAssignToRegWithShadow<[A0, A1, A2, A3,
+                           T0, T1, T2, T3],
+                          [D12_64, D13_64, D14_64, D15_64,
+                           D16_64, D17_64, D18_64, D19_64]>,
+  CCAssignToStack<4, 8>
+]>;
+
 def CC_MipsN : CallingConv<[
-  // Promote i8/i16 arguments to i32.
-  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i8, i16, i32],
+      CCIfSubtargetNot<"isLittle()",
+          CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
+
+  // All integers (except soft-float integers) are promoted to 64-bit.
+  CCIfType<[i8, i16, i32],
+     CCIf<"!static_cast<MipsCCState *>(&State)->WasOriginalArgFloat(ValNo)",
+          CCPromoteToType<i64>>>,
+
+  // The only i32's we have left are soft-float arguments.
+  CCIfSubtarget<"abiUsesSoftFloat()", CCIfType<[i32], CCDelegateTo<CC_MipsN_SoftFloat>>>,
 
   // Integer arguments are passed in integer registers.
-  CCIfType<[i32], CCAssignToRegWithShadow<[A0, A1, A2, A3,
-                                           T0, T1, T2, T3],
-                                          [F12, F13, F14, F15,
-                                           F16, F17, F18, F19]>>,
-
   CCIfType<[i64], CCAssignToRegWithShadow<[A0_64, A1_64, A2_64, A3_64,
                                            T0_64, T1_64, T2_64, T3_64],
                                           [D12_64, D13_64, D14_64, D15_64,
@@ -64,29 +131,49 @@
                                            T0_64, T1_64, T2_64, T3_64]>>,
 
   // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
-  CCIfType<[i32, f32], CCAssignToStack<4, 8>>,
+  CCIfType<[f32], CCAssignToStack<4, 8>>,
   CCIfType<[i64, f64], CCAssignToStack<8, 8>>
 ]>;
 
 // N32/64 variable arguments.
 // All arguments are passed in integer registers.
 def CC_MipsN_VarArg : CallingConv<[
-  // Promote i8/i16 arguments to i32.
-  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  // All integers are promoted to 64-bit.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
 
-  CCIfType<[i32, f32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
+  CCIfType<[f32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
 
   CCIfType<[i64, f64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64,
                                       T0_64, T1_64, T2_64, T3_64]>>,
 
   // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
-  CCIfType<[i32, f32], CCAssignToStack<4, 8>>,
+  CCIfType<[f32], CCAssignToStack<4, 8>>,
   CCIfType<[i64, f64], CCAssignToStack<8, 8>>
 ]>;
 
 def RetCC_MipsN : CallingConv<[
-  // i32 are returned in registers V0, V1
-  CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+  // f128 needs to be handled similarly to f32 and f64. However, f128 is not
+  // legal and is lowered to i128 which is further lowered to a pair of i64's.
+  // This presents us with a problem for the calling convention since hard-float
+  // still needs to pass them in FPU registers, and soft-float needs to use $v0,
+  // and $a0 instead of the usual $v0, and $v1. We therefore resort to a
+  // pre-analyze (see PreAnalyzeReturnForF128()) step to pass information on
+  // whether the result was originally an f128 into the tablegen-erated code.
+  //
+  // f128 should only occur for the N64 ABI where long double is 128-bit. On
+  // N32, long double is equivalent to double.
+  CCIfType<[i64],
+      CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)",
+           CCDelegateTo<RetCC_F128>>>,
+
+  // Aggregate returns are positioned at the lowest address in the slot for
+  // both little and big-endian targets. When passing in registers, this
+  // requires that big-endian targets shift the value into the upper bits.
+  CCIfSubtarget<"isLittle()",
+      CCIfType<[i8, i16, i32, i64], CCIfInReg<CCPromoteToType<i64>>>>,
+  CCIfSubtargetNot<"isLittle()",
+      CCIfType<[i8, i16, i32, i64],
+          CCIfInReg<CCPromoteToUpperBitsInType<i64>>>>,
 
   // i64 are returned in registers V0_64, V1_64
   CCIfType<[i64], CCAssignToReg<[V0_64, V1_64]>>,
@@ -98,12 +185,6 @@
   CCIfType<[f64], CCAssignToReg<[D0_64, D2_64]>>
 ]>;
 
-// In soft-mode, register A0_64, instead of V1_64, is used to return a long
-// double value.
-def RetCC_F128Soft : CallingConv<[
-  CCIfType<[i64], CCAssignToReg<[V0_64, A0_64]>>
-]>;
-
 //===----------------------------------------------------------------------===//
 // Mips EABI Calling Convention
 //===----------------------------------------------------------------------===//
@@ -119,11 +200,11 @@
   CCIfType<[f32], CCIfSubtarget<"isSingleFloat()",
                   CCAssignToReg<[F12, F13, F14, F15, F16, F17, F18, F19]>>>,
 
-  CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()",
+  CCIfType<[f32], CCIfSubtargetNot<"isSingleFloat()",
                   CCAssignToReg<[F12, F14, F16, F18]>>>,
 
   // The first 4 double fp arguments are passed in single fp registers.
-  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()",
+  CCIfType<[f64], CCIfSubtargetNot<"isSingleFloat()",
                   CCAssignToReg<[D6, D7, D8, D9]>>>,
 
   // Integer values get stored in stack slots that are 4 bytes in
@@ -132,7 +213,7 @@
 
   // Integer values get stored in stack slots that are 8 bytes in
   // size and 8-byte aligned.
-  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToStack<8, 8>>>
+  CCIfType<[f64], CCIfSubtargetNot<"isSingleFloat()", CCAssignToStack<8, 8>>>
 ]>;
 
 def RetCC_MipsEABI : CallingConv<[
@@ -143,7 +224,7 @@
   CCIfType<[f32], CCAssignToReg<[F0, F1]>>,
 
   // f64 are returned in register D0
-  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0]>>>
+  CCIfType<[f64], CCIfSubtargetNot<"isSingleFloat()", CCAssignToReg<[D0]>>>
 ]>;
 
 //===----------------------------------------------------------------------===//
@@ -151,16 +232,20 @@
 //===----------------------------------------------------------------------===//
 def CC_MipsO32_FastCC : CallingConv<[
   // f64 arguments are passed in double-precision floating pointer registers.
-  CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()",
-                                CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7,
-                                               D8, D9]>>>,
-  CCIfType<[f64], CCIfSubtarget<"isFP64bit()",
+  CCIfType<[f64], CCIfSubtargetNot<"isFP64bit()",
+                                   CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6,
+                                                  D7, D8, D9]>>>,
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCIfSubtarget<"useOddSPReg()",
                                 CCAssignToReg<[D0_64, D1_64, D2_64, D3_64,
                                                D4_64, D5_64, D6_64, D7_64,
                                                D8_64, D9_64, D10_64, D11_64,
                                                D12_64, D13_64, D14_64, D15_64,
                                                D16_64, D17_64, D18_64,
-                                               D19_64]>>>,
+                                               D19_64]>>>>,
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCIfSubtarget<"noOddSPReg()",
+                                CCAssignToReg<[D0_64, D2_64, D4_64, D6_64,
+                                               D8_64, D10_64, D12_64, D14_64,
+                                               D16_64, D18_64]>>>>,
 
   // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned.
   CCIfType<[f64], CCAssignToStack<8, 8>>
@@ -192,7 +277,7 @@
 
   // Integer arguments are passed in integer registers. All scratch registers,
   // except for AT, V0 and T9, are available to be used as argument registers.
-  CCIfType<[i32], CCIfSubtarget<"isNotTargetNaCl()",
+  CCIfType<[i32], CCIfSubtargetNot<"isTargetNaCl()",
       CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>>,
 
   // In NaCl, T6, T7 and T8 are reserved and not available as argument
@@ -203,8 +288,13 @@
       CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, V1]>>>,
 
   // f32 arguments are passed in single-precision floating pointer registers.
-  CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10,
-                                 F11, F12, F13, F14, F15, F16, F17, F18, F19]>>,
+  CCIfType<[f32], CCIfSubtarget<"useOddSPReg()",
+      CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13,
+                     F14, F15, F16, F17, F18, F19]>>>,
+
+  // Don't use odd numbered single-precision registers for -mno-odd-spreg.
+  CCIfType<[f32], CCIfSubtarget<"noOddSPReg()",
+      CCAssignToReg<[F0, F2, F4, F6, F8, F10, F12, F14, F16, F18]>>>,
 
   // Stack parameter slots for i32 and f32 are 32-bit words and 4-byte aligned.
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
@@ -214,13 +304,6 @@
   CCDelegateTo<CC_MipsN_FastCC>
 ]>;
 
-//==
-
-def CC_Mips16RetHelper : CallingConv<[
-  // Integer arguments are passed in integer registers.
-  CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>
-]>;
-
 //===----------------------------------------------------------------------===//
 // Mips Calling Convention Dispatch
 //===----------------------------------------------------------------------===//
@@ -232,6 +315,66 @@
   CCDelegateTo<RetCC_MipsO32>
 ]>;
 
+def CC_Mips_ByVal : CallingConv<[
+  CCIfSubtarget<"isABI_O32()", CCIfByVal<CCPassByVal<4, 4>>>,
+  CCIfByVal<CCPassByVal<8, 8>>
+]>;
+
+def CC_Mips16RetHelper : CallingConv<[
+  CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
+
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>
+]>;
+
+def CC_Mips_FixedArg : CallingConv<[
+  // Mips16 needs special handling on some functions.
+  CCIf<"State.getCallingConv() != CallingConv::Fast",
+      CCIf<"static_cast<MipsCCState *>(&State)->getSpecialCallingConv() == "
+               "MipsCCState::Mips16RetHelperConv",
+           CCDelegateTo<CC_Mips16RetHelper>>>,
+
+  CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
+
+  // f128 needs to be handled similarly to f32 and f64 on hard-float. However,
+  // f128 is not legal and is lowered to i128 which is further lowered to a pair
+  // of i64's.
+  // This presents us with a problem for the calling convention since hard-float
+  // still needs to pass them in FPU registers. We therefore resort to a
+  // pre-analyze step (see PreAnalyzeFormalArgumentsForF128()) to tell the
+  // tablegen-erated code whether the argument was originally an f128.
+  //
+  // f128 should only occur for the N64 ABI where long double is 128-bit. On
+  // N32, long double is equivalent to double.
+  CCIfType<[i64],
+      CCIfSubtargetNot<"abiUsesSoftFloat()",
+          CCIf<"static_cast<MipsCCState *>(&State)->WasOriginalArgF128(ValNo)",
+              CCBitConvertToType<f64>>>>,
+
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_Mips_FastCC>>,
+
+  // FIXME: There wasn't an EABI case in the original code and it seems unlikely
+  //        that it's the same as CC_MipsN
+  CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FP>>,
+  CCDelegateTo<CC_MipsN>
+]>;
+
+def CC_Mips_VarArg : CallingConv<[
+  CCIfByVal<CCDelegateTo<CC_Mips_ByVal>>,
+
+  // FIXME: There wasn't an EABI case in the original code and it seems unlikely
+  //        that it's the same as CC_MipsN_VarArg
+  CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FP>>,
+  CCDelegateTo<CC_MipsN_VarArg>
+]>;
+
+def CC_Mips : CallingConv<[
+  CCIfVarArg<
+      CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)",
+          CCDelegateTo<CC_Mips_VarArg>>>,
+  CCDelegateTo<CC_Mips_FixedArg>
+]>;
+
 //===----------------------------------------------------------------------===//
 // Callee-saved register lists.
 //===----------------------------------------------------------------------===//
@@ -247,8 +390,9 @@
 def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
                                    (sequence "S%u", 7, 0))>;
 
-def CSR_O32_FP64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 20), RA, FP,
-                                        (sequence "S%u", 7, 0))>;
+def CSR_O32_FP64 :
+  CalleeSavedRegs<(add (decimate (sequence "D%u_64", 30, 20), 2), RA, FP,
+                       (sequence "S%u", 7, 0))>;
 
 def CSR_N32 : CalleeSavedRegs<(add D20_64, D22_64, D24_64, D26_64, D28_64,
                                    D30_64, RA_64, FP_64, GP_64,

diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
deleted file mode 100644
index 151ef13..0000000
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ /dev/null

@@ -1,434 +0,0 @@
-//===-- Mips/MipsCodeEmitter.cpp - Convert Mips Code to Machine Code ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===---------------------------------------------------------------------===//
-//
-// This file contains the pass that transforms the Mips machine instructions
-// into relocatable machine code.
-//
-//===---------------------------------------------------------------------===//
-
-#include "Mips.h"
-#include "MCTargetDesc/MipsBaseInfo.h"
-#include "MipsInstrInfo.h"
-#include "MipsRelocations.h"
-#include "MipsSubtarget.h"
-#include "MipsTargetMachine.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#ifndef NDEBUG
-#include <iomanip>
-#endif
-
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-STATISTIC(NumEmitted, "Number of machine instructions emitted");
-
-namespace {
-
-class MipsCodeEmitter : public MachineFunctionPass {
-  MipsJITInfo *JTI;
-  const MipsInstrInfo *II;
-  const DataLayout *TD;
-  const MipsSubtarget *Subtarget;
-  TargetMachine &TM;
-  JITCodeEmitter &MCE;
-  const std::vector<MachineConstantPoolEntry> *MCPEs;
-  const std::vector<MachineJumpTableEntry> *MJTEs;
-  bool IsPIC;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineModuleInfo> ();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-  static char ID;
-
-public:
-  MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
-    : MachineFunctionPass(ID), JTI(nullptr), II(nullptr), TD(nullptr),
-      TM(tm), MCE(mce), MCPEs(nullptr), MJTEs(nullptr),
-      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "Mips Machine Code Emitter";
-  }
-
-  /// getBinaryCodeForInstr - This function, generated by the
-  /// CodeEmitterGenerator using TableGen, produces the binary encoding for
-  /// machine instructions.
-  uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
-
-  void emitInstruction(MachineBasicBlock::instr_iterator MI,
-                       MachineBasicBlock &MBB);
-
-private:
-
-  void emitWord(unsigned Word);
-
-  /// Routines that handle operands which add machine relocations which are
-  /// fixed up by the relocation stage.
-  void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
-                         bool MayNeedFarStub) const;
-  void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
-  void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
-  void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
-  void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
-
-  /// getMachineOpValue - Return binary encoding of operand. If the machine
-  /// operand requires relocation, record the relocation and return zero.
-  unsigned getMachineOpValue(const MachineInstr &MI,
-                             const MachineOperand &MO) const;
-
-  unsigned getRelocation(const MachineInstr &MI,
-                         const MachineOperand &MO) const;
-
-  unsigned getJumpTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getJumpTargetOpValueMM(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getBranchTargetOpValueMM(const MachineInstr &MI,
-                                    unsigned OpNo) const;
-
-  unsigned getBranchTarget21OpValue(const MachineInstr &MI,
-                                    unsigned OpNo) const;
-  unsigned getBranchTarget26OpValue(const MachineInstr &MI,
-                                    unsigned OpNo) const;
-  unsigned getJumpOffset16OpValue(const MachineInstr &MI, unsigned OpNo) const;
-
-  unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getMemEncodingMMImm12(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getMSAMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getSimm19Lsl2Encoding(const MachineInstr &MI, unsigned OpNo) const;
-  unsigned getSimm18Lsl3Encoding(const MachineInstr &MI, unsigned OpNo) const;
-
-  /// Expand pseudo instructions with accumulator register operands.
-  void expandACCInstr(MachineBasicBlock::instr_iterator MI,
-                      MachineBasicBlock &MBB, unsigned Opc) const;
-
-  /// \brief Expand pseudo instruction. Return true if MI was expanded.
-  bool expandPseudos(MachineBasicBlock::instr_iterator &MI,
-                     MachineBasicBlock &MBB) const;
-};
-}
-
-char MipsCodeEmitter::ID = 0;
-
-bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
-  MipsTargetMachine &Target = static_cast<MipsTargetMachine &>(
-                                const_cast<TargetMachine &>(MF.getTarget()));
-
-  JTI = Target.getJITInfo();
-  II = Target.getInstrInfo();
-  TD = Target.getDataLayout();
-  Subtarget = &TM.getSubtarget<MipsSubtarget> ();
-  MCPEs = &MF.getConstantPool()->getConstants();
-  MJTEs = nullptr;
-  if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
-  JTI->Initialize(MF, IsPIC, Subtarget->isLittle());
-  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
-
-  do {
-    DEBUG(errs() << "JITTing function '"
-        << MF.getName() << "'\n");
-    MCE.startFunction(MF);
-
-    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
-        MBB != E; ++MBB){
-      MCE.StartMachineBasicBlock(MBB);
-      for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
-           E = MBB->instr_end(); I != E;)
-        emitInstruction(*I++, *MBB);
-    }
-  } while (MCE.finishFunction(MF));
-
-  return false;
-}
-
-unsigned MipsCodeEmitter::getRelocation(const MachineInstr &MI,
-                                        const MachineOperand &MO) const {
-  // NOTE: This relocations are for static.
-  uint64_t TSFlags = MI.getDesc().TSFlags;
-  uint64_t Form = TSFlags & MipsII::FormMask;
-  if (Form == MipsII::FrmJ)
-    return Mips::reloc_mips_26;
-  if ((Form == MipsII::FrmI || Form == MipsII::FrmFI)
-       && MI.isBranch())
-    return Mips::reloc_mips_pc16;
-  if (Form == MipsII::FrmI && MI.getOpcode() == Mips::LUi)
-    return Mips::reloc_mips_hi;
-  return Mips::reloc_mips_lo;
-}
-
-unsigned MipsCodeEmitter::getJumpTargetOpValue(const MachineInstr &MI,
-                                               unsigned OpNo) const {
-  MachineOperand MO = MI.getOperand(OpNo);
-  if (MO.isGlobal())
-    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
-  else if (MO.isSymbol())
-    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
-  else if (MO.isMBB())
-    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
-  else
-    llvm_unreachable("Unexpected jump target operand kind.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getJumpTargetOpValueMM(const MachineInstr &MI,
-                                                 unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getBranchTargetOpValueMM(const MachineInstr &MI,
-                                                   unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getBranchTarget21OpValue(const MachineInstr &MI,
-                                                   unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getBranchTarget26OpValue(const MachineInstr &MI,
-                                                   unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getJumpOffset16OpValue(const MachineInstr &MI,
-                                                 unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
-                                                 unsigned OpNo) const {
-  MachineOperand MO = MI.getOperand(OpNo);
-  emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getMemEncoding(const MachineInstr &MI,
-                                         unsigned OpNo) const {
-  // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
-  assert(MI.getOperand(OpNo).isReg());
-  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo)) << 16;
-  return (getMachineOpValue(MI, MI.getOperand(OpNo+1)) & 0xFFFF) | RegBits;
-}
-
-unsigned MipsCodeEmitter::getMemEncodingMMImm12(const MachineInstr &MI,
-                                                unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getMSAMemEncoding(const MachineInstr &MI,
-                                            unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getSizeExtEncoding(const MachineInstr &MI,
-                                             unsigned OpNo) const {
-  // size is encoded as size-1.
-  return getMachineOpValue(MI, MI.getOperand(OpNo)) - 1;
-}
-
-unsigned MipsCodeEmitter::getSizeInsEncoding(const MachineInstr &MI,
-                                             unsigned OpNo) const {
-  // size is encoded as pos+size-1.
-  return getMachineOpValue(MI, MI.getOperand(OpNo-1)) +
-         getMachineOpValue(MI, MI.getOperand(OpNo)) - 1;
-}
-
-unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI,
-                                            unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getSimm18Lsl3Encoding(const MachineInstr &MI,
-                                                unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-unsigned MipsCodeEmitter::getSimm19Lsl2Encoding(const MachineInstr &MI,
-                                                unsigned OpNo) const {
-  llvm_unreachable("Unimplemented function.");
-  return 0;
-}
-
-/// getMachineOpValue - Return binary encoding of operand. If the machine
-/// operand requires relocation, record the relocation and return zero.
-unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
-                                            const MachineOperand &MO) const {
-  if (MO.isReg())
-    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
-  else if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
-  else if (MO.isGlobal())
-    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
-  else if (MO.isSymbol())
-    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
-  else if (MO.isCPI())
-    emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
-  else if (MO.isJTI())
-    emitJumpTableAddress(MO.getIndex(), getRelocation(MI, MO));
-  else if (MO.isMBB())
-    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
-  else
-    llvm_unreachable("Unable to encode MachineOperand!");
-  return 0;
-}
-
-void MipsCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
-                                        bool MayNeedFarStub) const {
-  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
-                                             const_cast<GlobalValue *>(GV), 0,
-                                             MayNeedFarStub));
-}
-
-void MipsCodeEmitter::
-emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
-                                                 Reloc, ES, 0, 0));
-}
-
-void MipsCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
-                                                    Reloc, CPI, 0, false));
-}
-
-void MipsCodeEmitter::
-emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
-                                                    Reloc, JTIndex, 0, false));
-}
-
-void MipsCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
-                                            unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
-                                             Reloc, BB));
-}
-
-void MipsCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI,
-                                      MachineBasicBlock &MBB) {
-  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << *MI);
-
-  // Expand pseudo instruction. Skip if MI was not expanded.
-  if (((MI->getDesc().TSFlags & MipsII::FormMask) == MipsII::Pseudo) &&
-      !expandPseudos(MI, MBB))
-    return;
-
-  MCE.processDebugLoc(MI->getDebugLoc(), true);
-
-  emitWord(getBinaryCodeForInstr(*MI));
-  ++NumEmitted;  // Keep track of the # of mi's emitted
-
-  MCE.processDebugLoc(MI->getDebugLoc(), false);
-}
-
-void MipsCodeEmitter::emitWord(unsigned Word) {
-  DEBUG(errs() << "  0x";
-        errs().write_hex(Word) << "\n");
-  if (Subtarget->isLittle())
-    MCE.emitWordLE(Word);
-  else
-    MCE.emitWordBE(Word);
-}
-
-void MipsCodeEmitter::expandACCInstr(MachineBasicBlock::instr_iterator MI,
-                                     MachineBasicBlock &MBB,
-                                     unsigned Opc) const {
-  // Expand "pseudomult $ac0, $t0, $t1" to "mult $t0, $t1".
-  BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Opc))
-    .addReg(MI->getOperand(1).getReg()).addReg(MI->getOperand(2).getReg());
-}
-
-bool MipsCodeEmitter::expandPseudos(MachineBasicBlock::instr_iterator &MI,
-                                    MachineBasicBlock &MBB) const {
-  switch (MI->getOpcode()) {
-  case Mips::NOP:
-    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::SLL), Mips::ZERO)
-      .addReg(Mips::ZERO).addImm(0);
-    break;
-  case Mips::B:
-    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BEQ)).addReg(Mips::ZERO)
-      .addReg(Mips::ZERO).addOperand(MI->getOperand(0));
-    break;
-  case Mips::TRAP:
-    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BREAK)).addImm(0)
-      .addImm(0);
-    break;
-  case Mips::JALRPseudo:
-    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::JALR), Mips::RA)
-      .addReg(MI->getOperand(0).getReg());
-    break;
-  case Mips::PseudoMULT:
-    expandACCInstr(MI, MBB, Mips::MULT);
-    break;
-  case Mips::PseudoMULTu:
-    expandACCInstr(MI, MBB, Mips::MULTu);
-    break;
-  case Mips::PseudoSDIV:
-    expandACCInstr(MI, MBB, Mips::SDIV);
-    break;
-  case Mips::PseudoUDIV:
-    expandACCInstr(MI, MBB, Mips::UDIV);
-    break;
-  case Mips::PseudoMADD:
-    expandACCInstr(MI, MBB, Mips::MADD);
-    break;
-  case Mips::PseudoMADDU:
-    expandACCInstr(MI, MBB, Mips::MADDU);
-    break;
-  case Mips::PseudoMSUB:
-    expandACCInstr(MI, MBB, Mips::MSUB);
-    break;
-  case Mips::PseudoMSUBU:
-    expandACCInstr(MI, MBB, Mips::MSUBU);
-    break;
-  default:
-    return false;
-  }
-
-  (MI--)->eraseFromBundle();
-  return true;
-}
-
-/// createMipsJITCodeEmitterPass - Return a pass that emits the collected Mips
-/// code to the specified MCE object.
-FunctionPass *llvm::createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
-                                                 JITCodeEmitter &JCE) {
-  return new MipsCodeEmitter(TM, JCE);
-}
-
-#include "MipsGenCodeEmitter.inc"

diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index a37062f..c4e5ac0 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp

@@ -28,6 +28,7 @@
 #include "MipsTargetMachine.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -343,7 +344,6 @@
 
   const TargetMachine &TM;
   bool IsPIC;
-  unsigned ABI;
   const MipsSubtarget *STI;
   const Mips16InstrInfo *TII;
   MipsFunctionInfo *MFI;
@@ -365,11 +365,9 @@
   public:
     static char ID;
     MipsConstantIslands(TargetMachine &tm)
-      : MachineFunctionPass(ID), TM(tm),
-        IsPIC(TM.getRelocationModel() == Reloc::PIC_),
-        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
-        STI(&TM.getSubtarget<MipsSubtarget>()), MF(nullptr), MCP(nullptr),
-        PrescannedForConstants(false){}
+        : MachineFunctionPass(ID), TM(tm),
+          IsPIC(TM.getRelocationModel() == Reloc::PIC_), STI(nullptr),
+          MF(nullptr), MCP(nullptr), PrescannedForConstants(false) {}
 
     const char *getPassName() const override {
       return "Mips Constant Islands";
@@ -450,12 +448,14 @@
   // FIXME:
   MF = &mf;
   MCP = mf.getConstantPool();
+  STI = &mf.getTarget().getSubtarget<MipsSubtarget>();
   DEBUG(dbgs() << "constant island machine function " << "\n");
-  if (!TM.getSubtarget<MipsSubtarget>().inMips16Mode() ||
-      !MipsSubtarget::useConstantIslands()) {
+  if (!STI->inMips16Mode() || !MipsSubtarget::useConstantIslands()) {
     return false;
   }
-  TII = (const Mips16InstrInfo*)MF->getTarget().getInstrInfo();
+  TII = (const Mips16InstrInfo *)MF->getTarget()
+            .getSubtargetImpl()
+            ->getInstrInfo();
   MFI = MF->getInfo<MipsFunctionInfo>();
   DEBUG(dbgs() << "constant island processing " << "\n");
   //
@@ -562,7 +562,7 @@
   // identity mapping of CPI's to CPE's.
   const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
 
-  const DataLayout &TD = *MF->getTarget().getDataLayout();
+  const DataLayout &TD = *MF->getSubtarget().getDataLayout();
   for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
     unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
     assert(Size >= 4 && "Too small constant pool entry");
@@ -588,9 +588,7 @@
       if (InsPoint[a] == InsAt)
         InsPoint[a] = CPEMI;
     // Add a new CPEntry, but no corresponding CPUser yet.
-    std::vector<CPEntry> CPEs;
-    CPEs.push_back(CPEntry(CPEMI, i));
-    CPEntries.push_back(CPEs);
+    CPEntries.emplace_back(1, CPEntry(CPEMI, i));
     ++NumCPEs;
     DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
                  << Size << ", align = " << Align <<'\n');

diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index bcfbc12..d7ba6d4 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp

@@ -275,7 +275,11 @@
 
 #ifndef NDEBUG
     const MachineFunction &MF = *MBB.getParent();
-    assert(MF.getTarget().getRegisterInfo()->getAllocatableSet(MF).test(R) &&
+    assert(MF.getTarget()
+               .getSubtargetImpl()
+               ->getRegisterInfo()
+               ->getAllocatableSet(MF)
+               .test(R) &&
            "Shouldn't move an instruction with unallocatable registers across "
            "basic block boundaries.");
 #endif
@@ -286,8 +290,8 @@
 }
 
 RegDefsUses::RegDefsUses(TargetMachine &TM)
-  : TRI(*TM.getRegisterInfo()), Defs(TRI.getNumRegs(), false),
-    Uses(TRI.getNumRegs(), false) {}
+    : TRI(*TM.getSubtargetImpl()->getRegisterInfo()),
+      Defs(TRI.getNumRegs(), false), Uses(TRI.getNumRegs(), false) {}
 
 void RegDefsUses::init(const MachineInstr &MI) {
   // Add all register operands which are explicit and non-variadic.
@@ -451,7 +455,8 @@
 
 bool MemDefsUses::updateDefsUses(ValueType V, bool MayStore) {
   if (MayStore)
-    return !Defs.insert(V) || Uses.count(V) || SeenNoObjStore || SeenNoObjLoad;
+    return !Defs.insert(V).second || Uses.count(V) || SeenNoObjStore ||
+           SeenNoObjLoad;
 
   Uses.insert(V);
   return Defs.count(V) || SeenNoObjStore;
@@ -493,30 +498,38 @@
 /// We assume there is only one delay slot per delayed instruction.
 bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   bool Changed = false;
+  bool InMicroMipsMode = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode();
 
   for (Iter I = MBB.begin(); I != MBB.end(); ++I) {
     if (!hasUnoccupiedSlot(&*I))
       continue;
 
-    ++FilledSlots;
-    Changed = true;
+    // For microMIPS, at the moment, do not fill delay slots of call
+    // instructions.
+    //
+    // TODO: Support for replacing regular call instructions with corresponding
+    // short delay slot instructions should be implemented.
+    if (!InMicroMipsMode || !I->isCall()) {
+      ++FilledSlots;
+      Changed = true;
 
-    // Delay slot filling is disabled at -O0.
-    if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
-      if (searchBackward(MBB, I))
-        continue;
-
-      if (I->isTerminator()) {
-        if (searchSuccBBs(MBB, I))
+      // Delay slot filling is disabled at -O0.
+      if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
+        if (searchBackward(MBB, I))
           continue;
-      } else if (searchForward(MBB, I)) {
-        continue;
+
+        if (I->isTerminator()) {
+          if (searchSuccBBs(MBB, I))
+            continue;
+        } else if (searchForward(MBB, I)) {
+          continue;
+        }
       }
     }
 
     // Bundle the NOP to the instruction with the delay slot.
-    const MipsInstrInfo *TII =
-      static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+    const MipsInstrInfo *TII = static_cast<const MipsInstrInfo *>(
+        TM.getSubtargetImpl()->getInstrInfo());
     BuildMI(MBB, std::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
     MIBundleBuilder(MBB, I, std::next(I, 2));
   }
@@ -554,9 +567,10 @@
       // branches are not checked because non-NaCl targets never put them in
       // delay slots.
       unsigned AddrIdx;
-      if ((isBasePlusOffsetMemoryAccess(I->getOpcode(), &AddrIdx)
-           && baseRegNeedsLoadStoreMask(I->getOperand(AddrIdx).getReg()))
-          || I->modifiesRegister(Mips::SP, TM.getRegisterInfo()))
+      if ((isBasePlusOffsetMemoryAccess(I->getOpcode(), &AddrIdx) &&
+           baseRegNeedsLoadStoreMask(I->getOperand(AddrIdx).getReg())) ||
+          I->modifiesRegister(Mips::SP,
+                              TM.getSubtargetImpl()->getRegisterInfo()))
         continue;
     }
 
@@ -667,7 +681,7 @@
 std::pair<MipsInstrInfo::BranchType, MachineInstr *>
 Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
   const MipsInstrInfo *TII =
-    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
   MachineBasicBlock *TrueBB = nullptr, *FalseBB = nullptr;
   SmallVector<MachineInstr*, 2> BranchInstrs;
   SmallVector<MachineOperand, 2> Cond;

diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index 617801b..2bb16e3 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp

@@ -8,6 +8,7 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLibraryInfo.h"
+#include "MipsCCState.h"
 #include "MipsRegisterInfo.h"
 #include "MipsISelLowering.h"
 #include "MipsMachineFunction.h"
@@ -18,23 +19,44 @@
 
 namespace {
 
-// All possible address modes.
-typedef struct Address {
-  enum { RegBase, FrameIndexBase } BaseType;
-
-  union {
-    unsigned Reg;
-    int FI;
-  } Base;
-
-  int64_t Offset;
-
-  // Innocuous defaults for our address.
-  Address() : BaseType(RegBase), Offset(0) { Base.Reg = 0; }
-} Address;
-
 class MipsFastISel final : public FastISel {
 
+  // All possible address modes.
+  class Address {
+  public:
+    // Selects which member of the Base union is meaningful.
+    enum BaseKind { RegBase, FrameIndexBase };
+  private:
+    BaseKind Kind;
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+
+    int64_t Offset;
+    // Optional global value attached to the address; null by default.
+    const GlobalValue *GV;
+
+  public:
+    // Innocuous defaults for our address.
+    Address() : Kind(RegBase), Offset(0), GV(nullptr) { Base.Reg = 0; }
+    void setKind(BaseKind K) { Kind = K; }
+    BaseKind getKind() const { return Kind; }
+    bool isRegBase() const { return Kind == RegBase; }
+    void setReg(unsigned Reg) {
+      assert(isRegBase() && "Invalid base register access!");
+      Base.Reg = Reg;
+    }
+    unsigned getReg() const {
+      assert(isRegBase() && "Invalid base register access!");
+      return Base.Reg;
+    }
+    void setOffset(int64_t Offset_) { Offset = Offset_; }
+    int64_t getOffset() const { return Offset; }
+    void setGlobalValue(const GlobalValue *G) { GV = G; }
+    const GlobalValue *getGlobalValue() const { return GV; }
+  };
+
   /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
   /// make the right decision when generating code for different targets.
   Module &M;
@@ -47,74 +69,267 @@
   // Convenience variables to avoid some queries.
   LLVMContext *Context;
 
+  bool fastLowerCall(CallLoweringInfo &CLI) override;
+
   bool TargetSupported;
-
-public:
-  explicit MipsFastISel(FunctionLoweringInfo &funcInfo,
-                        const TargetLibraryInfo *libInfo)
-      : FastISel(funcInfo, libInfo),
-        M(const_cast<Module &>(*funcInfo.Fn->getParent())),
-        TM(funcInfo.MF->getTarget()), TII(*TM.getInstrInfo()),
-        TLI(*TM.getTargetLowering()),
-        Subtarget(&TM.getSubtarget<MipsSubtarget>()) {
-    MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
-    Context = &funcInfo.Fn->getContext();
-    TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) &&
-                       (Subtarget->hasMips32r2() && (Subtarget->isABI_O32())));
-  }
-
-  bool TargetSelectInstruction(const Instruction *I) override;
-  unsigned TargetMaterializeConstant(const Constant *C) override;
-
-  bool ComputeAddress(const Value *Obj, Address &Addr);
+  bool UnsupportedFPMode; // To allow fast-isel to proceed and just not handle
+  // floating point but not reject doing fast-isel in other
+  // situations
 
 private:
-  bool EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
-                unsigned Alignment = 0);
-  bool EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
-                 unsigned Alignment = 0);
-  bool SelectLoad(const Instruction *I);
-  bool SelectRet(const Instruction *I);
-  bool SelectStore(const Instruction *I);
+  // Selection routines.
+  bool selectLoad(const Instruction *I);
+  bool selectStore(const Instruction *I);
+  bool selectBranch(const Instruction *I);
+  bool selectCmp(const Instruction *I);
+  bool selectFPExt(const Instruction *I);
+  bool selectFPTrunc(const Instruction *I);
+  bool selectFPToInt(const Instruction *I, bool IsSigned);
+  bool selectRet(const Instruction *I);
+  bool selectTrunc(const Instruction *I);
+  bool selectIntExt(const Instruction *I);
 
+  // Utility helper routines.
   bool isTypeLegal(Type *Ty, MVT &VT);
   bool isLoadTypeLegal(Type *Ty, MVT &VT);
+  bool computeAddress(const Value *Obj, Address &Addr);
+  bool computeCallAddress(const Value *V, Address &Addr);
 
-  unsigned MaterializeFP(const ConstantFP *CFP, MVT VT);
-  unsigned MaterializeGV(const GlobalValue *GV, MVT VT);
-  unsigned MaterializeInt(const Constant *C, MVT VT);
-  unsigned Materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC);
+  // Emit helper routines.
+  bool emitCmp(unsigned DestReg, const CmpInst *CI);
+  bool emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                unsigned Alignment = 0);
+  bool emitStore(MVT VT, unsigned SrcReg, Address Addr,
+                 MachineMemOperand *MMO = nullptr);
+  bool emitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                 unsigned Alignment = 0);
+  unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+  bool emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg,
 
+                  bool IsZExt);
+  bool emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg);
+
+  bool emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg);
+  bool emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                       unsigned DestReg);
+  bool emitIntSExt32r2(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                       unsigned DestReg);
+
+  unsigned getRegEnsuringSimpleIntegerWidening(const Value *, bool IsUnsigned);
+
+  unsigned materializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned materializeGV(const GlobalValue *GV, MVT VT);
+  unsigned materializeInt(const Constant *C, MVT VT);
+  unsigned materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC);
+
+  MachineInstrBuilder emitInst(unsigned Opc) {
+    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+  }
+  MachineInstrBuilder emitInst(unsigned Opc, unsigned DstReg) {
+    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
+                   DstReg);
+  }
+  MachineInstrBuilder emitInstStore(unsigned Opc, unsigned SrcReg,
+                                    unsigned MemReg, int64_t MemOffset) {
+    return emitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset);
+  }
+  MachineInstrBuilder emitInstLoad(unsigned Opc, unsigned DstReg,
+                                   unsigned MemReg, int64_t MemOffset) {
+    return emitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset);
+  }
   // for some reason, this default is not generated by tablegen
   // so we explicitly generate it here.
   //
-  unsigned FastEmitInst_riir(uint64_t inst, const TargetRegisterClass *RC,
+  unsigned fastEmitInst_riir(uint64_t inst, const TargetRegisterClass *RC,
                              unsigned Op0, bool Op0IsKill, uint64_t imm1,
                              uint64_t imm2, unsigned Op3, bool Op3IsKill) {
     return 0;
   }
 
-  MachineInstrBuilder EmitInst(unsigned Opc) {
-    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
+  // Call handling routines.
+private:
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
+  bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
+                       unsigned &NumBytes);
+  bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
+
+public:
+  // Backend specific FastISel code.
+  explicit MipsFastISel(FunctionLoweringInfo &funcInfo,
+                        const TargetLibraryInfo *libInfo)
+      : FastISel(funcInfo, libInfo),
+        M(const_cast<Module &>(*funcInfo.Fn->getParent())),
+        TM(funcInfo.MF->getTarget()),
+        TII(*TM.getSubtargetImpl()->getInstrInfo()),
+        TLI(*TM.getSubtargetImpl()->getTargetLowering()),
+        Subtarget(&TM.getSubtarget<MipsSubtarget>()) {
+    MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
+    Context = &funcInfo.Fn->getContext();
+    TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) &&
+                       ((Subtarget->hasMips32r2() || Subtarget->hasMips32()) &&
+                        (Subtarget->isABI_O32())));
+    UnsupportedFPMode = Subtarget->isFP64bit();
   }
 
-  MachineInstrBuilder EmitInst(unsigned Opc, unsigned DstReg) {
-    return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
-                   DstReg);
-  }
-
-  MachineInstrBuilder EmitInstStore(unsigned Opc, unsigned SrcReg,
-                                    unsigned MemReg, int64_t MemOffset) {
-    return EmitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset);
-  }
-
-  MachineInstrBuilder EmitInstLoad(unsigned Opc, unsigned DstReg,
-                                      unsigned MemReg, int64_t MemOffset) {
-    return EmitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset);
-  }
+  unsigned fastMaterializeConstant(const Constant *C) override;
+  bool fastSelectInstruction(const Instruction *I) override;
 
 #include "MipsGenFastISel.inc"
 };
+} // end anonymous namespace.
+
+static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
+                    CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                    CCState &State) LLVM_ATTRIBUTE_UNUSED;
+
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  llvm_unreachable("should not be called");
+}
+
+bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT, MVT LocVT,
+                     CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                     CCState &State) {
+  llvm_unreachable("should not be called");
+}
+
+#include "MipsGenCallingConv.inc"
+
+CCAssignFn *MipsFastISel::CCAssignFnForCall(CallingConv::ID CC) const {
+  return CC_MipsO32;
+}
+
+unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) {
+  if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  int64_t Imm;
+  if ((VT != MVT::i1) && CI->isNegative())
+    Imm = CI->getSExtValue();
+  else
+    Imm = CI->getZExtValue();
+  return materialize32BitInt(Imm, RC);
+}
+
+unsigned MipsFastISel::materialize32BitInt(int64_t Imm,
+                                           const TargetRegisterClass *RC) {
+  unsigned ResultReg = createResultReg(RC);
+
+  if (isInt<16>(Imm)) {
+    unsigned Opc = Mips::ADDiu;
+    emitInst(Opc, ResultReg).addReg(Mips::ZERO).addImm(Imm);
+    return ResultReg;
+  } else if (isUInt<16>(Imm)) {
+    emitInst(Mips::ORi, ResultReg).addReg(Mips::ZERO).addImm(Imm);
+    return ResultReg;
+  }
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+  if (Lo) {
+    // Both Lo and Hi have nonzero bits.
+    unsigned TmpReg = createResultReg(RC);
+    emitInst(Mips::LUi, TmpReg).addImm(Hi);
+    emitInst(Mips::ORi, ResultReg).addReg(TmpReg).addImm(Lo);
+  } else {
+    emitInst(Mips::LUi, ResultReg).addImm(Hi);
+  }
+  return ResultReg;
+}
+
+unsigned MipsFastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
+  if (UnsupportedFPMode)
+    return 0;
+  int64_t Imm = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+  if (VT == MVT::f32) {
+    const TargetRegisterClass *RC = &Mips::FGR32RegClass;
+    unsigned DestReg = createResultReg(RC);
+    unsigned TempReg = materialize32BitInt(Imm, &Mips::GPR32RegClass);
+    emitInst(Mips::MTC1, DestReg).addReg(TempReg);
+    return DestReg;
+  } else if (VT == MVT::f64) {
+    const TargetRegisterClass *RC = &Mips::AFGR64RegClass;
+    unsigned DestReg = createResultReg(RC);
+    unsigned TempReg1 = materialize32BitInt(Imm >> 32, &Mips::GPR32RegClass);
+    unsigned TempReg2 =
+        materialize32BitInt(Imm & 0xFFFFFFFF, &Mips::GPR32RegClass);
+    emitInst(Mips::BuildPairF64, DestReg).addReg(TempReg2).addReg(TempReg1);
+    return DestReg;
+  }
+  return 0;
+}
+
+unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  unsigned DestReg = createResultReg(RC);
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  bool IsThreadLocal = GVar && GVar->isThreadLocal();
+  // TLS not supported at this time.
+  if (IsThreadLocal)
+    return 0;
+  emitInst(Mips::LW, DestReg)
+      .addReg(MFI->getGlobalBaseReg())
+      .addGlobalAddress(GV, 0, MipsII::MO_GOT);
+  if ((GV->hasInternalLinkage() ||
+       (GV->hasLocalLinkage() && !isa<Function>(GV)))) {
+    unsigned TempReg = createResultReg(RC);
+    emitInst(Mips::ADDiu, TempReg)
+        .addReg(DestReg)
+        .addGlobalAddress(GV, 0, MipsII::MO_ABS_LO);
+    DestReg = TempReg;
+  }
+  return DestReg;
+}
+
+// Materialize a constant into a register, and return the register
+// number (or zero if we failed to handle it).
+unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return (UnsupportedFPMode) ? 0 : materializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return materializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return materializeInt(C, VT);
+
+  return 0;
+}
+
+bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
+  // This construct looks a bit awkward but it is how other ports handle this
+  // and as this function is more fully completed, these cases which
+  // return false will have additional code in them.
+  //
+  if (isa<Instruction>(Obj))
+    return false;
+  else if (isa<ConstantExpr>(Obj))
+    return false;
+  Addr.setReg(getRegForValue(Obj));
+  return Addr.getReg() != 0;
+}
+
+bool MipsFastISel::computeCallAddress(const Value *V, Address &Addr) {
+  // Only direct calls to a global value are handled.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(V);
+  if (!GV)
+    return false;
+  // Calls to intrinsics are rejected as well.
+  if (const Function *F = dyn_cast<Function>(GV))
+    if (F->isIntrinsic())
+      return false;
+  Addr.setGlobalValue(GV);
+  return true;
+}
 
 bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) {
   EVT evt = TLI.getValueType(Ty, true);
@@ -138,21 +353,134 @@
     return true;
   return false;
 }
+// Because of how emitCmp is called with fast-isel, you can
+// end up with redundant "andi" instructions after the sequences emitted below.
+// We should try and solve this issue in the future.
+//
+bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
+  const Value *Left = CI->getOperand(0), *Right = CI->getOperand(1);
+  bool IsUnsigned = CI->isUnsigned();
+  unsigned LeftReg = getRegEnsuringSimpleIntegerWidening(Left, IsUnsigned);
+  if (LeftReg == 0)
+    return false;
+  unsigned RightReg = getRegEnsuringSimpleIntegerWidening(Right, IsUnsigned);
+  if (RightReg == 0)
+    return false;
+  CmpInst::Predicate P = CI->getPredicate();
 
-bool MipsFastISel::ComputeAddress(const Value *Obj, Address &Addr) {
-  // This construct looks a big awkward but it is how other ports handle this
-  // and as this function is more fully completed, these cases which
-  // return false will have additional code in them.
-  //
-  if (isa<Instruction>(Obj))
+  switch (P) {
+  default:
     return false;
-  else if (isa<ConstantExpr>(Obj))
-    return false;
-  Addr.Base.Reg = getRegForValue(Obj);
-  return Addr.Base.Reg != 0;
+  case CmpInst::ICMP_EQ: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg);
+    emitInst(Mips::SLTiu, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::ICMP_NE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg);
+    emitInst(Mips::SLTu, ResultReg).addReg(Mips::ZERO).addReg(TempReg);
+    break;
+  }
+  case CmpInst::ICMP_UGT: {
+    emitInst(Mips::SLTu, ResultReg).addReg(RightReg).addReg(LeftReg);
+    break;
+  }
+  case CmpInst::ICMP_ULT: {
+    emitInst(Mips::SLTu, ResultReg).addReg(LeftReg).addReg(RightReg);
+    break;
+  }
+  case CmpInst::ICMP_UGE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::SLTu, TempReg).addReg(LeftReg).addReg(RightReg);
+    emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::ICMP_ULE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::SLTu, TempReg).addReg(RightReg).addReg(LeftReg);
+    emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::ICMP_SGT: {
+    emitInst(Mips::SLT, ResultReg).addReg(RightReg).addReg(LeftReg);
+    break;
+  }
+  case CmpInst::ICMP_SLT: {
+    emitInst(Mips::SLT, ResultReg).addReg(LeftReg).addReg(RightReg);
+    break;
+  }
+  case CmpInst::ICMP_SGE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::SLT, TempReg).addReg(LeftReg).addReg(RightReg);
+    emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::ICMP_SLE: {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::SLT, TempReg).addReg(RightReg).addReg(LeftReg);
+    emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
+    break;
+  }
+  case CmpInst::FCMP_OEQ:
+  case CmpInst::FCMP_UNE:
+  case CmpInst::FCMP_OLT:
+  case CmpInst::FCMP_OLE:
+  case CmpInst::FCMP_OGT:
+  case CmpInst::FCMP_OGE: {
+    if (UnsupportedFPMode)
+      return false;
+    bool IsFloat = Left->getType()->isFloatTy();
+    bool IsDouble = Left->getType()->isDoubleTy();
+    if (!IsFloat && !IsDouble)
+      return false;
+    unsigned Opc, CondMovOpc;
+    switch (P) {
+    case CmpInst::FCMP_OEQ:
+      Opc = IsFloat ? Mips::C_EQ_S : Mips::C_EQ_D32;
+      CondMovOpc = Mips::MOVT_I;
+      break;
+    case CmpInst::FCMP_UNE:
+      Opc = IsFloat ? Mips::C_EQ_S : Mips::C_EQ_D32;
+      CondMovOpc = Mips::MOVF_I;
+      break;
+    case CmpInst::FCMP_OLT:
+      Opc = IsFloat ? Mips::C_OLT_S : Mips::C_OLT_D32;
+      CondMovOpc = Mips::MOVT_I;
+      break;
+    case CmpInst::FCMP_OLE:
+      Opc = IsFloat ? Mips::C_OLE_S : Mips::C_OLE_D32;
+      CondMovOpc = Mips::MOVT_I;
+      break;
+    case CmpInst::FCMP_OGT:
+      Opc = IsFloat ? Mips::C_ULE_S : Mips::C_ULE_D32;
+      CondMovOpc = Mips::MOVF_I;
+      break;
+    case CmpInst::FCMP_OGE:
+      Opc = IsFloat ? Mips::C_ULT_S : Mips::C_ULT_D32;
+      CondMovOpc = Mips::MOVF_I;
+      break;
+    default:
+      llvm_unreachable("Only switching of a subset of CCs.");
+    }
+    unsigned RegWithZero = createResultReg(&Mips::GPR32RegClass);
+    unsigned RegWithOne = createResultReg(&Mips::GPR32RegClass);
+    emitInst(Mips::ADDiu, RegWithZero).addReg(Mips::ZERO).addImm(0);
+    emitInst(Mips::ADDiu, RegWithOne).addReg(Mips::ZERO).addImm(1);
+    emitInst(Opc).addReg(LeftReg).addReg(RightReg).addReg(
+        Mips::FCC0, RegState::ImplicitDefine);
+    MachineInstrBuilder MI = emitInst(CondMovOpc, ResultReg)
+                                 .addReg(RegWithOne)
+                                 .addReg(Mips::FCC0)
+                                 .addReg(RegWithZero, RegState::Implicit);
+    MI->tieOperands(0, 3);
+    break;
+  }
+  }
+  return true;
 }
-
-bool MipsFastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
                             unsigned Alignment) {
   //
   // more cases will be handled here in following patches.
@@ -175,11 +503,15 @@
     break;
   }
   case MVT::f32: {
+    if (UnsupportedFPMode)
+      return false;
     ResultReg = createResultReg(&Mips::FGR32RegClass);
     Opc = Mips::LWC1;
     break;
   }
   case MVT::f64: {
+    if (UnsupportedFPMode)
+      return false;
     ResultReg = createResultReg(&Mips::AFGR64RegClass);
     Opc = Mips::LDC1;
     break;
@@ -187,31 +519,11 @@
   default:
     return false;
   }
-  EmitInstLoad(Opc, ResultReg, Addr.Base.Reg, Addr.Offset);
+  emitInstLoad(Opc, ResultReg, Addr.getReg(), Addr.getOffset());
   return true;
 }
 
-// Materialize a constant into a register, and return the register
-// number (or zero if we failed to handle it).
-unsigned MipsFastISel::TargetMaterializeConstant(const Constant *C) {
-  EVT CEVT = TLI.getValueType(C->getType(), true);
-
-  // Only handle simple types.
-  if (!CEVT.isSimple())
-    return 0;
-  MVT VT = CEVT.getSimpleVT();
-
-  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
-    return MaterializeFP(CFP, VT);
-  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
-    return MaterializeGV(GV, VT);
-  else if (isa<ConstantInt>(C))
-    return MaterializeInt(C, VT);
-
-  return 0;
-}
-
-bool MipsFastISel::EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr,
                              unsigned Alignment) {
   //
   // more cases will be handled here in following patches.
@@ -228,19 +540,23 @@
     Opc = Mips::SW;
     break;
   case MVT::f32:
+    if (UnsupportedFPMode)
+      return false;
     Opc = Mips::SWC1;
     break;
   case MVT::f64:
+    if (UnsupportedFPMode)
+      return false;
     Opc = Mips::SDC1;
     break;
   default:
     return false;
   }
-  EmitInstStore(Opc, SrcReg, Addr.Base.Reg, Addr.Offset);
+  emitInstStore(Opc, SrcReg, Addr.getReg(), Addr.getOffset());
   return true;
 }
 
-bool MipsFastISel::SelectLoad(const Instruction *I) {
+bool MipsFastISel::selectLoad(const Instruction *I) {
   // Atomic loads need special handling.
   if (cast<LoadInst>(I)->isAtomic())
     return false;
@@ -252,17 +568,17 @@
 
   // See if we can handle this address.
   Address Addr;
-  if (!ComputeAddress(I->getOperand(0), Addr))
+  if (!computeAddress(I->getOperand(0), Addr))
     return false;
 
   unsigned ResultReg;
-  if (!EmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
+  if (!emitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
     return false;
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
-bool MipsFastISel::SelectStore(const Instruction *I) {
+bool MipsFastISel::selectStore(const Instruction *I) {
   Value *Op0 = I->getOperand(0);
   unsigned SrcReg = 0;
 
@@ -282,15 +598,394 @@
 
   // See if we can handle this address.
   Address Addr;
-  if (!ComputeAddress(I->getOperand(1), Addr))
+  if (!computeAddress(I->getOperand(1), Addr))
     return false;
 
-  if (!EmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+  if (!emitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
     return false;
   return true;
 }
 
-bool MipsFastISel::SelectRet(const Instruction *I) {
+//
+// This can cause a redundant sltiu to be generated.
+// FIXME: try and eliminate this in a future patch.
+//
+bool MipsFastISel::selectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *BrBB = FuncInfo.MBB;
+  //
+  // TBB is the basic block for the case where the comparison is true.
+  // FBB is the basic block for the case where the comparison is false.
+  // if (cond) goto TBB
+  // goto FBB
+  // TBB:
+  //
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+  const Value *Cond = BI->getCondition();
+  // For now, just try the simplest case where it's fed by a compare.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(Cond)) {
+    unsigned CondReg = createResultReg(&Mips::GPR32RegClass);
+    if (!emitCmp(CondReg, CI))
+      return false;
+    BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
+        .addReg(CondReg)
+        .addMBB(TBB);
+    fastEmitBranch(FBB, DbgLoc);
+    FuncInfo.MBB->addSuccessor(TBB);
+    return true;
+  }
+  return false;
+}
+
+bool MipsFastISel::selectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+  unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+  if (!emitCmp(ResultReg, CI))
+    return false;
+  updateValueMap(I, ResultReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point extend instruction.
+bool MipsFastISel::selectFPExt(const Instruction *I) {
+  if (UnsupportedFPMode)
+    return false;
+  Value *Src = I->getOperand(0);
+  EVT SrcVT = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::f32 || DestVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg =
+      getRegForValue(Src); // This must be a 32 bit floating point register class
+                           // maybe we should handle this differently
+  if (!SrcReg)
+    return false;
+
+  unsigned DestReg = createResultReg(&Mips::AFGR64RegClass);
+  emitInst(Mips::CVT_D32_S, DestReg).addReg(SrcReg);
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point truncate instruction.
+bool MipsFastISel::selectFPTrunc(const Instruction *I) {
+  if (UnsupportedFPMode)
+    return false;
+  Value *Src = I->getOperand(0);
+  EVT SrcVT = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::f64 || DestVT != MVT::f32)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  unsigned DestReg = createResultReg(&Mips::FGR32RegClass);
+  if (!DestReg)
+    return false;
+
+  emitInst(Mips::CVT_S_D32, DestReg).addReg(SrcReg);
+  updateValueMap(I, DestReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point-to-integer conversion.
+bool MipsFastISel::selectFPToInt(const Instruction *I, bool IsSigned) {
+  if (UnsupportedFPMode)
+    return false;
+  MVT DstVT, SrcVT;
+  if (!IsSigned)
+    return false; // We don't handle this case yet. There is no native
+                  // instruction for this but it can be synthesized.
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::i32)
+    return false;
+
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+  if (!isTypeLegal(SrcTy, SrcVT))
+    return false;
+
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // Determine the opcode for the conversion, which takes place
+  // entirely within FPRs.
+  unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+  unsigned TempReg = createResultReg(&Mips::FGR32RegClass);
+  unsigned Opc;
+
+  if (SrcVT == MVT::f32)
+    Opc = Mips::TRUNC_W_S;
+  else
+    Opc = Mips::TRUNC_W_D32;
+
+  // Generate the convert.
+  emitInst(Opc, TempReg).addReg(SrcReg);
+
+  emitInst(Mips::MFC1, DestReg).addReg(TempReg);
+
+  updateValueMap(I, DestReg);
+  return true;
+}
+//
+bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
+                                   SmallVectorImpl<MVT> &OutVTs,
+                                   unsigned &NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
+  // Get a count of how many bytes are to be pushed on the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+  // This is the minimum argument area used for A0-A3.
+  if (NumBytes < 16)
+    NumBytes = 16;
+
+  emitInst(Mips::ADJCALLSTACKDOWN).addImm(16);
+  // Process the args.
+  MVT firstMVT;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    const Value *ArgVal = CLI.OutVals[VA.getValNo()];
+    MVT ArgVT = OutVTs[VA.getValNo()];
+
+    if (i == 0) {
+      firstMVT = ArgVT;
+      if (ArgVT == MVT::f32) {
+        VA.convertToReg(Mips::F12);
+      } else if (ArgVT == MVT::f64) {
+        VA.convertToReg(Mips::D6);
+      }
+    } else if (i == 1) {
+      if ((firstMVT == MVT::f32) || (firstMVT == MVT::f64)) {
+        if (ArgVT == MVT::f32) {
+          VA.convertToReg(Mips::F14);
+        } else if (ArgVT == MVT::f64) {
+          VA.convertToReg(Mips::D7);
+        }
+      }
+    }
+    if (((ArgVT == MVT::i32) || (ArgVT == MVT::f32)) && VA.isMemLoc()) {
+      switch (VA.getLocMemOffset()) {
+      case 0:
+        VA.convertToReg(Mips::A0);
+        break;
+      case 4:
+        VA.convertToReg(Mips::A1);
+        break;
+      case 8:
+        VA.convertToReg(Mips::A2);
+        break;
+      case 12:
+        VA.convertToReg(Mips::A3);
+        break;
+      default:
+        break;
+      }
+    }
+    unsigned ArgReg = getRegForValue(ArgVal);
+    if (!ArgReg)
+      return false;
+
+    // Handle arg promotion: SExt, ZExt, AExt.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::AExt:
+    case CCValAssign::SExt: {
+      MVT DestVT = VA.getLocVT();
+      MVT SrcVT = ArgVT;
+      ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false);
+      if (!ArgReg)
+        return false;
+      break;
+    }
+    case CCValAssign::ZExt: {
+      MVT DestVT = VA.getLocVT();
+      MVT SrcVT = ArgVT;
+      ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true);
+      if (!ArgReg)
+        return false;
+      break;
+    }
+    default:
+      llvm_unreachable("Unknown arg promotion!");
+    }
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+      CLI.OutRegs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      llvm_unreachable("Mips does not use custom args.");
+      return false;
+    } else {
+      //
+      // FIXME: This path will currently return false. It was copied
+      // from the AArch64 port and should be essentially fine for Mips too.
+      // The work to finish up this path will be done in a follow-on patch.
+      //
+      assert(VA.isMemLoc() && "Assuming store on stack.");
+      // Don't emit stores for undef values.
+      if (isa<UndefValue>(ArgVal))
+        continue;
+
+      // Need to store on the stack.
+      // FIXME: This alignment is incorrect but this path is disabled
+      // for now (will return false). We need to determine the right alignment
+      // based on the normal alignment for the underlying machine type.
+      //
+      unsigned ArgSize = RoundUpToAlignment(ArgVT.getSizeInBits(), 4);
+
+      unsigned BEAlign = 0;
+      if (ArgSize < 8 && !Subtarget->isLittle())
+        BEAlign = 8 - ArgSize;
+
+      Address Addr;
+      Addr.setKind(Address::RegBase);
+      Addr.setReg(Mips::SP);
+      Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+          MachinePointerInfo::getStack(Addr.getOffset()),
+          MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+      (void)(MMO);
+      // if (!emitStore(ArgVT, ArgReg, Addr, MMO))
+      return false; // can't store on the stack yet.
+    }
+  }
+
+  return true;
+}
+
+bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
+                              unsigned NumBytes) {
+  CallingConv::ID CC = CLI.CallConv;
+  // ADJCALLSTACKUP carries two immediate operands (amt1, amt2).
+  emitInst(Mips::ADJCALLSTACKUP).addImm(16).addImm(0);
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_Mips);
+    // Only handle a single return value.
+    if (RVLocs.size() != 1)
+      return false;
+    // Copy all of the result registers out of their specified physreg.
+    MVT CopyVT = RVLocs[0].getValVT();
+    // Special handling for extended integers.
+    if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16)
+      CopyVT = MVT::i32;
+
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(RVLocs[0].getLocReg());
+    CLI.InRegs.push_back(RVLocs[0].getLocReg());
+
+    CLI.ResultReg = ResultReg;
+    CLI.NumResultRegs = 1;
+  }
+  return true;
+}
+
+// Lower a call directly in fast-isel.
+//
+// Returns false, leaving the call to SelectionDAG, for tail calls, vararg
+// calls, arguments flagged inreg/sret/nest/byval, unsupported argument or
+// return types and callees rejected by computeCallAddress(). On success the
+// JALR is stored in CLI.Call and finishCall() copies out any return value.
+bool MipsFastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  CallingConv::ID CC = CLI.CallConv;
+  bool IsTailCall = CLI.IsTailCall;
+  bool IsVarArg = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (IsTailCall)
+    return false;
+
+  // Let SDISel handle vararg functions.
+  if (IsVarArg)
+    return false;
+
+  // FIXME: Only handle *simple* calls for now.
+  MVT RetVT;
+  if (CLI.RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(CLI.RetTy, RetVT))
+    return false;
+
+  for (auto Flag : CLI.OutFlags)
+    if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal())
+      return false;
+
+  // Set up the argument vectors.
+  SmallVector<MVT, 16> OutVTs;
+  OutVTs.reserve(CLI.OutVals.size());
+
+  for (auto *Val : CLI.OutVals) {
+    MVT VT;
+    // i1/i8/i16 are let through although not legal; this relies on
+    // isTypeLegal() filling in VT even when it returns false.
+    if (!isTypeLegal(Val->getType(), VT) &&
+        !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16))
+      return false;
+
+    // We don't handle vector parameters yet.
+    if (VT.isVector() || VT.getSizeInBits() > 64)
+      return false;
+
+    OutVTs.push_back(VT);
+  }
+
+  Address Addr;
+  if (!computeCallAddress(Callee, Addr))
+    return false;
+
+  // Handle the arguments now that we've gotten them.
+  unsigned NumBytes;
+  if (!processCallArgs(CLI, OutVTs, NumBytes))
+    return false;
+
+  // Issue the call: materialize the callee address, copy it into T9 and
+  // emit a JALR through T9 that defines RA.
+  unsigned DestAddress = materializeGV(Addr.getGlobalValue(), MVT::i32);
+  emitInst(TargetOpcode::COPY, Mips::T9).addReg(DestAddress);
+  MachineInstrBuilder MIB =
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::JALR),
+              Mips::RA).addReg(Mips::T9);
+
+  // Add implicit physical register uses to the call. The argument registers
+  // were collected in CLI.OutRegs by processCallArgs().
+  for (auto Reg : CLI.OutRegs)
+    MIB.addReg(Reg, RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  CLI.Call = MIB;
+
+  // Finish off the call including any return values.
+  return finishCall(CLI, RetVT, NumBytes);
+}
+
+bool MipsFastISel::selectRet(const Instruction *I) {
   const ReturnInst *Ret = cast<ReturnInst>(I);
 
   if (!FuncInfo.CanLowerReturn)
@@ -298,98 +993,181 @@
   if (Ret->getNumOperands() > 0) {
     return false;
   }
-  EmitInst(Mips::RetRA);
+  emitInst(Mips::RetRA);
   return true;
 }
 
-bool MipsFastISel::TargetSelectInstruction(const Instruction *I) {
+bool MipsFastISel::selectTrunc(const Instruction *I) {
+  // For a type narrower than a register the high bits are assumed to be
+  // undefined, so a truncate can simply reuse the source register.
+  Value *Operand = I->getOperand(0);
+
+  EVT FromVT = TLI.getValueType(Operand->getType(), true);
+  EVT ToVT = TLI.getValueType(I->getType(), true);
+
+  // Only i32/i16/i8 sources narrowing to i16/i8/i1 are handled.
+  bool SrcOk = FromVT == MVT::i32 || FromVT == MVT::i16 || FromVT == MVT::i8;
+  bool DestOk = ToVT == MVT::i16 || ToVT == MVT::i8 || ToVT == MVT::i1;
+  if (!SrcOk || !DestOk)
+    return false;
+
+  unsigned OperandReg = getRegForValue(Operand);
+  if (OperandReg == 0)
+    return false;
+
+  // No instruction is emitted: the result is mapped onto the source register.
+  updateValueMap(I, OperandReg);
+  return true;
+}
+bool MipsFastISel::selectIntExt(const Instruction *I) {
+  // Select a zext/sext instruction, placing the result in a new 32-bit GPR.
+  Value *Operand = I->getOperand(0);
+  Type *FromTy = Operand->getType();
+  Type *ToTy = I->getType();
+
+  // Anything that is not a ZExtInst is treated as a sign extension.
+  const bool ZeroExtend = isa<ZExtInst>(I);
+
+  unsigned OperandReg = getRegForValue(Operand);
+  if (OperandReg == 0)
+    return false;
+
+  // Both the source and destination must map onto simple value types.
+  EVT FromEVT = TLI.getValueType(FromTy, true);
+  EVT ToEVT = TLI.getValueType(ToTy, true);
+  if (!FromEVT.isSimple() || !ToEVT.isSimple())
+    return false;
+
+  unsigned ExtReg = createResultReg(&Mips::GPR32RegClass);
+  if (!emitIntExt(FromEVT.getSimpleVT(), OperandReg, ToEVT.getSimpleVT(),
+                  ExtReg, ZeroExtend))
+    return false;
+
+  updateValueMap(I, ExtReg);
+  return true;
+}
+bool MipsFastISel::emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                   unsigned DestReg) {
+  // Sign-extend by shifting the value to the top of the register with SLL
+  // and shifting it back down with SRA. Only i8 and i16 sources are handled.
+  unsigned Shift;
+  if (SrcVT.SimpleTy == MVT::i8)
+    Shift = 24;
+  else if (SrcVT.SimpleTy == MVT::i16)
+    Shift = 16;
+  else
+    return false;
+
+  unsigned ShiftedReg = createResultReg(&Mips::GPR32RegClass);
+  emitInst(Mips::SLL, ShiftedReg).addReg(SrcReg).addImm(Shift);
+  emitInst(Mips::SRA, DestReg).addReg(ShiftedReg).addImm(Shift);
+  return true;
+}
+
+bool MipsFastISel::emitIntSExt32r2(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                   unsigned DestReg) {
+  // Sign-extend with a single SEB (i8) or SEH (i16); other widths fail.
+  unsigned Opc;
+  if (SrcVT.SimpleTy == MVT::i8)
+    Opc = Mips::SEB;
+  else if (SrcVT.SimpleTy == MVT::i16)
+    Opc = Mips::SEH;
+  else
+    return false;
+
+  emitInst(Opc, DestReg).addReg(SrcReg);
+  return true;
+}
+
+bool MipsFastISel::emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                               unsigned DestReg) {
+  // Only extensions producing i32 or i16 are supported.
+  const bool DestOk = DestVT == MVT::i32 || DestVT == MVT::i16;
+  if (!DestOk)
+    return false;
+
+  // Choose the helper according to the subtarget's MIPS32r2 support.
+  return Subtarget->hasMips32r2()
+             ? emitIntSExt32r2(SrcVT, SrcReg, DestVT, DestReg)
+             : emitIntSExt32r1(SrcVT, SrcReg, DestVT, DestReg);
+}
+
+bool MipsFastISel::emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                               unsigned DestReg) {
+  // Zero-extend by ANDing with a mask covering the source width.
+  // Only i1, i8 and i16 sources are handled.
+  int64_t Mask;
+  if (SrcVT.SimpleTy == MVT::i1)
+    Mask = 1;
+  else if (SrcVT.SimpleTy == MVT::i8)
+    Mask = 0xff;
+  else if (SrcVT.SimpleTy == MVT::i16)
+    Mask = 0xffff;
+  else
+    return false;
+
+  emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(Mask);
+  return true;
+}
+
+bool MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                              unsigned DestReg, bool IsZExt) {
+  // Forward to the zero- or sign-extension emitter and report its result.
+  return IsZExt ? emitIntZExt(SrcVT, SrcReg, DestVT, DestReg)
+                : emitIntSExt(SrcVT, SrcReg, DestVT, DestReg);
+}
+
+// Extend SrcReg from SrcVT to DestVT into a newly created 32-bit GPR.
+// Returns the new register, or 0 if the extension could not be emitted.
+unsigned MipsFastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                  bool isZExt) {
+  unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+  // The five-argument overload only reports success or failure. Returning
+  // its bool directly would hand callers "register" 1 or 0, not DestReg.
+  if (!emitIntExt(SrcVT, SrcReg, DestVT, DestReg, isZExt))
+    return 0;
+  return DestReg;
+}
+
+// Dispatch on the instruction's opcode to the matching select* routine and
+// return its result. Returns false when TargetSupported is false or when
+// the opcode has no case below.
+bool MipsFastISel::fastSelectInstruction(const Instruction *I) {
   if (!TargetSupported)
     return false;
   switch (I->getOpcode()) {
   default:
     break;
   case Instruction::Load:
-    return SelectLoad(I);
+    return selectLoad(I);
   case Instruction::Store:
-    return SelectStore(I);
+    return selectStore(I);
+  case Instruction::Br:
+    return selectBranch(I);
   case Instruction::Ret:
-    return SelectRet(I);
+    return selectRet(I);
+  case Instruction::Trunc:
+    return selectTrunc(I);
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return selectIntExt(I);
+  case Instruction::FPTrunc:
+    return selectFPTrunc(I);
+  case Instruction::FPExt:
+    return selectFPExt(I);
+  case Instruction::FPToSI:
+    return selectFPToInt(I, /*isSigned*/ true);
+  case Instruction::FPToUI:
+    return selectFPToInt(I, /*isSigned*/ false);
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return selectCmp(I);
   }
   return false;
 }
-}
 
-unsigned MipsFastISel::MaterializeFP(const ConstantFP *CFP, MVT VT) {
-  int64_t Imm = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
-  if (VT == MVT::f32) {
-    const TargetRegisterClass *RC = &Mips::FGR32RegClass;
-    unsigned DestReg = createResultReg(RC);
-    unsigned TempReg = Materialize32BitInt(Imm, &Mips::GPR32RegClass);
-    EmitInst(Mips::MTC1, DestReg).addReg(TempReg);
-    return DestReg;
-  } else if (VT == MVT::f64) {
-    const TargetRegisterClass *RC = &Mips::AFGR64RegClass;
-    unsigned DestReg = createResultReg(RC);
-    unsigned TempReg1 = Materialize32BitInt(Imm >> 32, &Mips::GPR32RegClass);
-    unsigned TempReg2 =
-        Materialize32BitInt(Imm & 0xFFFFFFFF, &Mips::GPR32RegClass);
-    EmitInst(Mips::BuildPairF64, DestReg).addReg(TempReg2).addReg(TempReg1);
-    return DestReg;
-  }
-  return 0;
-}
-
-unsigned MipsFastISel::MaterializeGV(const GlobalValue *GV, MVT VT) {
-  // For now 32-bit only.
-  if (VT != MVT::i32)
+// Return a register holding V. An i8 or i16 value is first extended to i32
+// (zero-extended when IsUnsigned is true, sign-extended otherwise). Returns
+// 0 if no register is available for V or the extension cannot be emitted.
+unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V,
+                                                           bool IsUnsigned) {
+  unsigned VReg = getRegForValue(V);
+  if (VReg == 0)
     return 0;
-  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
-  unsigned DestReg = createResultReg(RC);
-  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
-  bool IsThreadLocal = GVar && GVar->isThreadLocal();
-  // TLS not supported at this time.
-  if (IsThreadLocal)
-    return 0;
-  EmitInst(Mips::LW, DestReg).addReg(MFI->getGlobalBaseReg()).addGlobalAddress(
-      GV, 0, MipsII::MO_GOT);
-  return DestReg;
-}
-unsigned MipsFastISel::MaterializeInt(const Constant *C, MVT VT) {
-  if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
-    return 0;
-  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
-  const ConstantInt *CI = cast<ConstantInt>(C);
-  int64_t Imm;
-  if (CI->isNegative())
-    Imm = CI->getSExtValue();
-  else
-    Imm = CI->getZExtValue();
-  return Materialize32BitInt(Imm, RC);
-}
-
-unsigned MipsFastISel::Materialize32BitInt(int64_t Imm,
-                                           const TargetRegisterClass *RC) {
-  unsigned ResultReg = createResultReg(RC);
-
-  if (isInt<16>(Imm)) {
-    unsigned Opc = Mips::ADDiu;
-    EmitInst(Opc, ResultReg).addReg(Mips::ZERO).addImm(Imm);
-    return ResultReg;
-  } else if (isUInt<16>(Imm)) {
-    EmitInst(Mips::ORi, ResultReg).addReg(Mips::ZERO).addImm(Imm);
-    return ResultReg;
+  // NOTE(review): getSimpleVT() presumably requires a simple value type;
+  // confirm callers only pass values whose type is simple.
+  MVT VMVT = TLI.getValueType(V->getType(), true).getSimpleVT();
+  if ((VMVT == MVT::i8) || (VMVT == MVT::i16)) {
+    unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+    if (!emitIntExt(VMVT, VReg, MVT::i32, TempReg, IsUnsigned))
+      return 0;
+    VReg = TempReg;
   }
-  unsigned Lo = Imm & 0xFFFF;
-  unsigned Hi = (Imm >> 16) & 0xFFFF;
-  if (Lo) {
-    // Both Lo and Hi have nonzero bits.
-    unsigned TmpReg = createResultReg(RC);
-    EmitInst(Mips::LUi, TmpReg).addImm(Hi);
-    EmitInst(Mips::ORi, ResultReg).addReg(TmpReg).addImm(Lo);
-  } else {
-    EmitInst(Mips::LUi, ResultReg).addImm(Hi);
-  }
-  return ResultReg;
+  return VReg;
 }
 
 namespace llvm {

diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 8ba35fa..3014a0d 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp

@@ -82,9 +82,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-const MipsFrameLowering *MipsFrameLowering::create(MipsTargetMachine &TM,
-                                                   const MipsSubtarget &ST) {
-  if (TM.getSubtargetImpl()->inMips16Mode())
+const MipsFrameLowering *MipsFrameLowering::create(const MipsSubtarget &ST) {
+  if (ST.inMips16Mode())
     return llvm::createMips16FrameLowering(ST);
 
   return llvm::createMipsSEFrameLowering(ST);
@@ -101,7 +100,7 @@
 
 uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetRegisterInfo &TRI = *MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
 
   int64_t Offset = 0;
 

diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 8e9196c..90a8d2a 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPS_FRAMEINFO_H
-#define MIPS_FRAMEINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSFRAMELOWERING_H
 
 #include "Mips.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -28,8 +28,7 @@
   explicit MipsFrameLowering(const MipsSubtarget &sti, unsigned Alignment)
     : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {}
 
-  static const MipsFrameLowering *create(MipsTargetMachine &TM,
-                                         const MipsSubtarget &ST);
+  static const MipsFrameLowering *create(const MipsSubtarget &ST);
 
   bool hasFP(const MachineFunction &MF) const override;
 

diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index 2a6c875..ff8760d 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSISELDAGTODAG_H
-#define MIPSISELDAGTODAG_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPSISELDAGTODAG_H
 
 #include "Mips.h"
 #include "MipsSubtarget.h"
@@ -32,7 +32,7 @@
 class MipsDAGToDAGISel : public SelectionDAGISel {
 public:
   explicit MipsDAGToDAGISel(MipsTargetMachine &TM)
-    : SelectionDAGISel(TM), Subtarget(&TM.getSubtarget<MipsSubtarget>()) {}
+      : SelectionDAGISel(TM), Subtarget(nullptr) {}
 
   // Pass Name
   const char *getPassName() const override {

diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index b7af2d4..ff2bfb3 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp

@@ -14,6 +14,7 @@
 #include "MipsISelLowering.h"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsCCState.h"
 #include "MipsMachineFunction.h"
 #include "MipsSubtarget.h"
 #include "MipsTargetMachine.h"
@@ -24,6 +25,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/ValueTypes.h"
@@ -56,15 +58,6 @@
   cl::desc("Allow mips-fast-isel to be used"),
   cl::init(false));
 
-static const MCPhysReg O32IntRegs[4] = {
-  Mips::A0, Mips::A1, Mips::A2, Mips::A3
-};
-
-static const MCPhysReg Mips64IntRegs[8] = {
-  Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
-  Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64
-};
-
 static const MCPhysReg Mips64DPRegs[8] = {
   Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
   Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
@@ -208,16 +201,16 @@
   }
 }
 
-MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
-    : TargetLowering(TM, new MipsTargetObjectFile()),
-      Subtarget(&TM.getSubtarget<MipsSubtarget>()) {
+MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
+                                       const MipsSubtarget &STI)
+    : TargetLowering(TM), Subtarget(STI) {
   // Mips does not have i1 type, so use i32 for
   // setcc operations results (slt, sgt, ...).
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
   // The cmp.cond.fmt instruction in MIPS32r6/MIPS64r6 uses 0 and -1 like MSA
   // does. Integer booleans still use 0 and 1.
-  if (Subtarget->hasMips32r6())
+  if (Subtarget.hasMips32r6())
     setBooleanContents(ZeroOrOneBooleanContent,
                        ZeroOrNegativeOneBooleanContent);
 
@@ -251,12 +244,11 @@
   setOperationAction(ISD::SETCC,              MVT::f32,   Custom);
   setOperationAction(ISD::SETCC,              MVT::f64,   Custom);
   setOperationAction(ISD::BRCOND,             MVT::Other, Custom);
-  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
   setOperationAction(ISD::FCOPYSIGN,          MVT::f32,   Custom);
   setOperationAction(ISD::FCOPYSIGN,          MVT::f64,   Custom);
   setOperationAction(ISD::FP_TO_SINT,         MVT::i32,   Custom);
 
-  if (Subtarget->isGP64bit()) {
+  if (Subtarget.isGP64bit()) {
     setOperationAction(ISD::GlobalAddress,      MVT::i64,   Custom);
     setOperationAction(ISD::BlockAddress,       MVT::i64,   Custom);
     setOperationAction(ISD::GlobalTLSAddress,   MVT::i64,   Custom);
@@ -268,14 +260,14 @@
     setOperationAction(ISD::FP_TO_SINT,         MVT::i64,   Custom);
   }
 
-  if (!Subtarget->isGP64bit()) {
+  if (!Subtarget.isGP64bit()) {
     setOperationAction(ISD::SHL_PARTS,          MVT::i32,   Custom);
     setOperationAction(ISD::SRA_PARTS,          MVT::i32,   Custom);
     setOperationAction(ISD::SRL_PARTS,          MVT::i32,   Custom);
   }
 
   setOperationAction(ISD::ADD,                MVT::i32,   Custom);
-  if (Subtarget->isGP64bit())
+  if (Subtarget.isGP64bit())
     setOperationAction(ISD::ADD,                MVT::i64,   Custom);
 
   setOperationAction(ISD::SDIV, MVT::i32, Expand);
@@ -299,7 +291,7 @@
   setOperationAction(ISD::FP_TO_UINT,        MVT::i32,   Expand);
   setOperationAction(ISD::FP_TO_UINT,        MVT::i64,   Expand);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,    Expand);
-  if (Subtarget->hasCnMips()) {
+  if (Subtarget.hasCnMips()) {
     setOperationAction(ISD::CTPOP,           MVT::i32,   Legal);
     setOperationAction(ISD::CTPOP,           MVT::i64,   Legal);
   } else {
@@ -317,10 +309,10 @@
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,  Expand);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64,  Expand);
 
-  if (!Subtarget->hasMips32r2())
+  if (!Subtarget.hasMips32r2())
     setOperationAction(ISD::ROTR, MVT::i32,   Expand);
 
-  if (!Subtarget->hasMips64r2())
+  if (!Subtarget.hasMips64r2())
     setOperationAction(ISD::ROTR, MVT::i64,   Expand);
 
   setOperationAction(ISD::FSIN,              MVT::f32,   Expand);
@@ -343,7 +335,8 @@
 
   setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
 
-  setOperationAction(ISD::VAARG,             MVT::Other, Expand);
+  setOperationAction(ISD::VASTART,           MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,             MVT::Other, Custom);
   setOperationAction(ISD::VACOPY,            MVT::Other, Expand);
   setOperationAction(ISD::VAEND,             MVT::Other, Expand);
 
@@ -358,23 +351,23 @@
 
   setInsertFencesForAtomic(true);
 
-  if (!Subtarget->hasMips32r2()) {
+  if (!Subtarget.hasMips32r2()) {
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
   }
 
   // MIPS16 lacks MIPS32's clz and clo instructions.
-  if (!Subtarget->hasMips32() || Subtarget->inMips16Mode())
+  if (!Subtarget.hasMips32() || Subtarget.inMips16Mode())
     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
-  if (!Subtarget->hasMips64())
+  if (!Subtarget.hasMips64())
     setOperationAction(ISD::CTLZ, MVT::i64, Expand);
 
-  if (!Subtarget->hasMips32r2())
+  if (!Subtarget.hasMips32r2())
     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
-  if (!Subtarget->hasMips64r2())
+  if (!Subtarget.hasMips64r2())
     setOperationAction(ISD::BSWAP, MVT::i64, Expand);
 
-  if (Subtarget->isGP64bit()) {
+  if (Subtarget.isGP64bit()) {
     setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom);
     setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom);
     setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom);
@@ -390,24 +383,30 @@
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::ADD);
 
-  setMinFunctionAlignment(Subtarget->isGP64bit() ? 3 : 2);
+  setMinFunctionAlignment(Subtarget.isGP64bit() ? 3 : 2);
 
-  setStackPointerRegisterToSaveRestore(Subtarget->isABI_N64() ? Mips::SP_64
-                                                              : Mips::SP);
+  // The arguments on the stack are defined in terms of 4-byte slots on O32
+  // and 8-byte slots on N32/N64.
+  setMinStackArgumentAlignment(
+      (Subtarget.isABI_N32() || Subtarget.isABI_N64()) ? 8 : 4);
 
-  setExceptionPointerRegister(Subtarget->isABI_N64() ? Mips::A0_64 : Mips::A0);
-  setExceptionSelectorRegister(Subtarget->isABI_N64() ? Mips::A1_64 : Mips::A1);
+  setStackPointerRegisterToSaveRestore(Subtarget.isABI_N64() ? Mips::SP_64
+                                                             : Mips::SP);
+
+  setExceptionPointerRegister(Subtarget.isABI_N64() ? Mips::A0_64 : Mips::A0);
+  setExceptionSelectorRegister(Subtarget.isABI_N64() ? Mips::A1_64 : Mips::A1);
 
   MaxStoresPerMemcpy = 16;
 
-  isMicroMips = Subtarget->inMicroMipsMode();
+  isMicroMips = Subtarget.inMicroMipsMode();
 }
 
-const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM) {
-  if (TM.getSubtargetImpl()->inMips16Mode())
-    return llvm::createMips16TargetLowering(TM);
+// Factory for the lowering object: returns the Mips16 variant when STI is in
+// Mips16 mode and the MipsSE variant otherwise.
+const MipsTargetLowering *MipsTargetLowering::create(const MipsTargetMachine &TM,
+                                                     const MipsSubtarget &STI) {
+  if (STI.inMips16Mode())
+    return llvm::createMips16TargetLowering(TM, STI);
 
-  return llvm::createMipsSETargetLowering(TM);
+  return llvm::createMipsSETargetLowering(TM, STI);
 }
 
 // Create a fast isel object.
@@ -427,7 +426,7 @@
 
 static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
-                                    const MipsSubtarget *Subtarget) {
+                                    const MipsSubtarget &Subtarget) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -537,7 +536,7 @@
 
 static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
-                                    const MipsSubtarget *Subtarget) {
+                                    const MipsSubtarget &Subtarget) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -616,11 +615,11 @@
 
 static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
-                                 const MipsSubtarget *Subtarget) {
+                                 const MipsSubtarget &Subtarget) {
   // Pattern match EXT.
   //  $dst = and ((sra or srl) $src , pos), (2**size - 1)
   //  => ext $dst, $src, size, pos
-  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert())
+  if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
     return SDValue();
 
   SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
@@ -656,12 +655,12 @@
 
 static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
-                                const MipsSubtarget *Subtarget) {
+                                const MipsSubtarget &Subtarget) {
   // Pattern match INS.
   //  $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
   //  where mask1 = (2**size - 1) << pos, mask0 = ~mask1
   //  => ins $dst, $src, size, pos, $src1
-  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert())
+  if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
     return SDValue();
 
   SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
@@ -710,7 +709,7 @@
 
 static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
-                                 const MipsSubtarget *Subtarget) {
+                                 const MipsSubtarget &Subtarget) {
   // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
 
   if (DCI.isBeforeLegalizeOps())
@@ -791,6 +790,7 @@
   case ISD::SELECT_CC:          return lowerSELECT_CC(Op, DAG);
   case ISD::SETCC:              return lowerSETCC(Op, DAG);
   case ISD::VASTART:            return lowerVASTART(Op, DAG);
+  case ISD::VAARG:              return lowerVAARG(Op, DAG);
   case ISD::FCOPYSIGN:          return lowerFCOPYSIGN(Op, DAG);
   case ISD::FRAMEADDR:          return lowerFRAMEADDR(Op, DAG);
   case ISD::RETURNADDR:         return lowerRETURNADDR(Op, DAG);
@@ -933,16 +933,16 @@
   case Mips::DIVU:
   case Mips::MOD:
   case Mips::MODU:
-    return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(),
-                               false);
+    return insertDivByZeroTrap(
+        MI, *BB, *getTargetMachine().getSubtargetImpl()->getInstrInfo(), false);
   case Mips::PseudoDSDIV:
   case Mips::PseudoDUDIV:
   case Mips::DDIV:
   case Mips::DDIVU:
   case Mips::DMOD:
   case Mips::DMODU:
-    return insertDivByZeroTrap(MI, *BB, *getTargetMachine().getInstrInfo(),
-                               true);
+    return insertDivByZeroTrap(
+        MI, *BB, *getTargetMachine().getSubtargetImpl()->getInstrInfo(), true);
   case Mips::SEL_D:
     return emitSEL_D(MI, BB);
   }
@@ -959,7 +959,8 @@
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned LL, SC, AND, NOR, ZERO, BEQ;
 
@@ -968,16 +969,16 @@
       LL = Mips::LL_MM;
       SC = Mips::SC_MM;
     } else {
-      LL = Subtarget->hasMips32r6() ? Mips::LL : Mips::LL_R6;
-      SC = Subtarget->hasMips32r6() ? Mips::SC : Mips::SC_R6;
+      LL = Subtarget.hasMips32r6() ? Mips::LL_R6 : Mips::LL;
+      SC = Subtarget.hasMips32r6() ? Mips::SC_R6 : Mips::SC;
     }
     AND = Mips::AND;
     NOR = Mips::NOR;
     ZERO = Mips::ZERO;
     BEQ = Mips::BEQ;
   } else {
-    LL = Subtarget->hasMips64r6() ? Mips::LLD : Mips::LLD_R6;
-    SC = Subtarget->hasMips64r6() ? Mips::SCD : Mips::SCD_R6;
+    LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD;
+    SC = Subtarget.hasMips64r6() ? Mips::SCD_R6 : Mips::SCD;
     AND = Mips::AND64;
     NOR = Mips::NOR64;
     ZERO = Mips::ZERO_64;
@@ -1042,15 +1043,16 @@
 MachineBasicBlock *MipsTargetLowering::emitSignExtendToI32InReg(
     MachineInstr *MI, MachineBasicBlock *BB, unsigned Size, unsigned DstReg,
     unsigned SrcReg) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
-  if (Subtarget->hasMips32r2() && Size == 1) {
+  if (Subtarget.hasMips32r2() && Size == 1) {
     BuildMI(BB, DL, TII->get(Mips::SEB), DstReg).addReg(SrcReg);
     return BB;
   }
 
-  if (Subtarget->hasMips32r2() && Size == 2) {
+  if (Subtarget.hasMips32r2() && Size == 2) {
     BuildMI(BB, DL, TII->get(Mips::SEH), DstReg).addReg(SrcReg);
     return BB;
   }
@@ -1078,7 +1080,8 @@
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   unsigned Dest = MI->getOperand(0).getReg();
@@ -1140,7 +1143,7 @@
   BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
     .addReg(Ptr).addReg(MaskLSB2);
   BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
-  if (Subtarget->isLittle()) {
+  if (Subtarget.isLittle()) {
     BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
   } else {
     unsigned Off = RegInfo.createVirtualRegister(RC);
@@ -1228,7 +1231,8 @@
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned LL, SC, ZERO, BNE, BEQ;
 
@@ -1310,7 +1314,8 @@
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   unsigned Dest    = MI->getOperand(0).getReg();
@@ -1380,7 +1385,7 @@
   BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
     .addReg(Ptr).addReg(MaskLSB2);
   BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
-  if (Subtarget->isLittle()) {
+  if (Subtarget.isLittle()) {
     BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
   } else {
     unsigned Off = RegInfo.createVirtualRegister(RC);
@@ -1445,8 +1450,10 @@
 MachineBasicBlock *MipsTargetLowering::emitSEL_D(MachineInstr *MI,
                                                  MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   MachineBasicBlock::iterator II(MI);
@@ -1487,11 +1494,11 @@
   EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
   Addr = DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr,
                         MachinePointerInfo::getJumpTable(), MemVT, false, false,
-                        0);
+                        false, 0);
   Chain = Addr.getValue(1);
 
   if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) ||
-      Subtarget->isABI_N64()) {
+      Subtarget.isABI_N64()) {
     // For PIC, the sequence is:
     // BRIND(load(Jumptable + index) + RelocBase)
     // RelocBase can be JumpTable, GOT or some sort of global base.
@@ -1509,7 +1516,7 @@
   SDValue Dest = Op.getOperand(2);
   SDLoc DL(Op);
 
-  assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6());
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
   SDValue CondRes = createFPCmp(DAG, Op.getOperand(1));
 
   // Return if flag is not set by a floating point comparison.
@@ -1529,7 +1536,7 @@
 SDValue MipsTargetLowering::
 lowerSELECT(SDValue Op, SelectionDAG &DAG) const
 {
-  assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6());
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
   SDValue Cond = createFPCmp(DAG, Op.getOperand(0));
 
   // Return if flag is not set by a floating point comparison.
@@ -1555,7 +1562,7 @@
 }
 
 SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
-  assert(!Subtarget->hasMips32r6() && !Subtarget->hasMips64r6());
+  assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6());
   SDValue Cond = createFPCmp(DAG, Op);
 
   assert(Cond.getOpcode() == MipsISD::FPCmp &&
@@ -1569,26 +1576,18 @@
 
 SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
-  // FIXME there isn't actually debug info here
-  SDLoc DL(Op);
   EVT Ty = Op.getValueType();
   GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = N->getGlobal();
 
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
-      !Subtarget->isABI_N64()) {
+      !Subtarget.isABI_N64()) {
     const MipsTargetObjectFile &TLOF =
       (const MipsTargetObjectFile&)getObjFileLowering();
 
-    // %gp_rel relocation
-    if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
-      SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
-                                              MipsII::MO_GPREL);
-      SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, DL,
-                                      DAG.getVTList(MVT::i32), GA);
-      SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32);
-      return DAG.getNode(ISD::ADD, DL, MVT::i32, GPReg, GPRelNode);
-    }
+    if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine()))
+      // %gp_rel relocation
+      return getAddrGPRel(N, Ty, DAG);
 
     // %hi/%lo relocation
     return getAddrNonPIC(N, Ty, DAG);
@@ -1596,7 +1595,7 @@
 
   if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
     return getAddrLocal(N, Ty, DAG,
-                        Subtarget->isABI_N32() || Subtarget->isABI_N64());
+                        Subtarget.isABI_N32() || Subtarget.isABI_N64());
 
   if (LargeGOT)
     return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16,
@@ -1604,7 +1603,7 @@
                                  MachinePointerInfo::getGOT());
 
   return getAddrGlobal(N, Ty, DAG,
-                       (Subtarget->isABI_N32() || Subtarget->isABI_N64())
+                       (Subtarget.isABI_N32() || Subtarget.isABI_N64())
                            ? MipsII::MO_GOT_DISP
                            : MipsII::MO_GOT16,
                        DAG.getEntryNode(), MachinePointerInfo::getGOT());
@@ -1616,11 +1615,11 @@
   EVT Ty = Op.getValueType();
 
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
-      !Subtarget->isABI_N64())
+      !Subtarget.isABI_N64())
     return getAddrNonPIC(N, Ty, DAG);
 
   return getAddrLocal(N, Ty, DAG,
-                      Subtarget->isABI_N32() || Subtarget->isABI_N64());
+                      Subtarget.isABI_N32() || Subtarget.isABI_N64());
 }
 
 SDValue MipsTargetLowering::
@@ -1709,34 +1708,33 @@
   EVT Ty = Op.getValueType();
 
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
-      !Subtarget->isABI_N64())
+      !Subtarget.isABI_N64())
     return getAddrNonPIC(N, Ty, DAG);
 
   return getAddrLocal(N, Ty, DAG,
-                      Subtarget->isABI_N32() || Subtarget->isABI_N64());
+                      Subtarget.isABI_N32() || Subtarget.isABI_N64());
 }
 
 SDValue MipsTargetLowering::
 lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
 {
-  // gp_rel relocation
-  // FIXME: we should reference the constant pool using small data sections,
-  // but the asm printer currently doesn't support this feature without
-  // hacking it. This feature should come soon so we can uncomment the
-  // stuff below.
-  //if (IsInSmallSection(C->getType())) {
-  //  SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, MVT::i32, CP);
-  //  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
-  //  ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode);
   ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
   EVT Ty = Op.getValueType();
 
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ &&
-      !Subtarget->isABI_N64())
+      !Subtarget.isABI_N64()) {
+    const MipsTargetObjectFile &TLOF =
+      (const MipsTargetObjectFile&)getObjFileLowering();
+
+    if (TLOF.IsConstantInSmallSection(N->getConstVal(), getTargetMachine()))
+      // %gp_rel relocation
+      return getAddrGPRel(N, Ty, DAG);
+
     return getAddrNonPIC(N, Ty, DAG);
+  }
 
   return getAddrLocal(N, Ty, DAG,
-                      Subtarget->isABI_N32() || Subtarget->isABI_N64());
+                      Subtarget.isABI_N32() || Subtarget.isABI_N64());
 }
 
 SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -1754,6 +1752,65 @@
                       MachinePointerInfo(SV), false, false, 0);
 }
 
+SDValue MipsTargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  SDValue Chain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  unsigned Align = Node->getConstantOperandVal(3);
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc DL(Node);
+  unsigned ArgSlotSizeInBytes =
+      (Subtarget.isABI_N32() || Subtarget.isABI_N64()) ? 8 : 4;
+
+  SDValue VAListLoad = DAG.getLoad(getPointerTy(), DL, Chain, VAListPtr,
+                                   MachinePointerInfo(SV), false, false, false,
+                                   0);
+  SDValue VAList = VAListLoad;
+
+  // Re-align the pointer if necessary.
+  // It should only ever be necessary for 64-bit types on O32 since the minimum
+  // argument alignment is the same as the maximum type alignment for N32/N64.
+  //
+  // FIXME: We currently align too often. The code generator doesn't notice
+  //        when the pointer is still aligned from the last va_arg (or pair of
+  //        va_args for the i64 on O32 case).
+  if (Align > getMinStackArgumentAlignment()) {
+    assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
+
+    VAList = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
+                         DAG.getConstant(Align - 1,
+                                         VAList.getValueType()));
+
+    VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
+                         DAG.getConstant(-(int64_t)Align,
+                                         VAList.getValueType()));
+  }
+
+  // Increment the pointer, VAList, to the next vaarg.
+  unsigned ArgSizeInBytes = getDataLayout()->getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext()));
+  SDValue Tmp3 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
+                             DAG.getConstant(RoundUpToAlignment(ArgSizeInBytes, ArgSlotSizeInBytes),
+                                             VAList.getValueType()));
+  // Store the incremented VAList to the legalized pointer
+  Chain = DAG.getStore(VAListLoad.getValue(1), DL, Tmp3, VAListPtr,
+                      MachinePointerInfo(SV), false, false, 0);
+
+  // In big-endian mode we must adjust the pointer when the load size is smaller
+  // than the argument slot size. We must also reduce the known alignment to
+  // match. For example in the N64 ABI, we must add 4 bytes to the offset to get
+  // the correct half of the slot, and reduce the alignment from 8 (slot
+  // alignment) down to 4 (type alignment).
+  if (!Subtarget.isLittle() && ArgSizeInBytes < ArgSlotSizeInBytes) {
+    unsigned Adjustment = ArgSlotSizeInBytes - ArgSizeInBytes;
+    VAList = DAG.getNode(ISD::ADD, DL, VAListPtr.getValueType(), VAList,
+                         DAG.getIntPtrConstant(Adjustment));
+  }
+  // Load the actual argument out of the pointer VAList
+  return DAG.getLoad(VT, DL, Chain, VAList, MachinePointerInfo(), false, false,
+                     false, 0);
+}
+
 static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG,
                                 bool HasExtractInsert) {
   EVT TyX = Op.getOperand(0).getValueType();
@@ -1851,10 +1908,10 @@
 
 SDValue
 MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
-  if (Subtarget->isGP64bit())
-    return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasExtractInsert());
+  if (Subtarget.isGP64bit())
+    return lowerFCOPYSIGN64(Op, DAG, Subtarget.hasExtractInsert());
 
-  return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasExtractInsert());
+  return lowerFCOPYSIGN32(Op, DAG, Subtarget.hasExtractInsert());
 }
 
 SDValue MipsTargetLowering::
@@ -1869,7 +1926,7 @@
   SDLoc DL(Op);
   SDValue FrameAddr =
       DAG.getCopyFromReg(DAG.getEntryNode(), DL,
-                         Subtarget->isABI_N64() ? Mips::FP_64 : Mips::FP, VT);
+                         Subtarget.isABI_N64() ? Mips::FP_64 : Mips::FP, VT);
   return FrameAddr;
 }
 
@@ -1885,7 +1942,7 @@
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MVT VT = Op.getSimpleValueType();
-  unsigned RA = Subtarget->isABI_N64() ? Mips::RA_64 : Mips::RA;
+  unsigned RA = Subtarget.isABI_N64() ? Mips::RA_64 : Mips::RA;
   MFI->setReturnAddressIsTaken(true);
 
   // Return RA, which contains the return address. Mark it an implicit live-in.
@@ -1907,12 +1964,12 @@
   SDValue Offset    = Op.getOperand(1);
   SDValue Handler   = Op.getOperand(2);
   SDLoc DL(Op);
-  EVT Ty = Subtarget->isABI_N64() ? MVT::i64 : MVT::i32;
+  EVT Ty = Subtarget.isABI_N64() ? MVT::i64 : MVT::i32;
 
   // Store stack offset in V1, store jump target in V0. Glue CopyToReg and
   // EH_RETURN nodes, so that instructions are emitted back-to-back.
-  unsigned OffsetReg = Subtarget->isABI_N64() ? Mips::V1_64 : Mips::V1;
-  unsigned AddrReg = Subtarget->isABI_N64() ? Mips::V0_64 : Mips::V0;
+  unsigned OffsetReg = Subtarget.isABI_N64() ? Mips::V1_64 : Mips::V1;
+  unsigned AddrReg = Subtarget.isABI_N64() ? Mips::V0_64 : Mips::V0;
   Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue());
   Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
   return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
@@ -2025,7 +2082,7 @@
   LoadSDNode *LD = cast<LoadSDNode>(Op);
   EVT MemVT = LD->getMemoryVT();
 
-  if (Subtarget->systemSupportsUnalignedAccess())
+  if (Subtarget.systemSupportsUnalignedAccess())
     return Op;
 
   // Return if load is aligned or if MemVT is neither i32 nor i64.
@@ -2033,7 +2090,7 @@
       ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
     return SDValue();
 
-  bool IsLittle = Subtarget->isLittle();
+  bool IsLittle = Subtarget.isLittle();
   EVT VT = Op.getValueType();
   ISD::LoadExtType ExtType = LD->getExtensionType();
   SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT);
@@ -2151,10 +2208,10 @@
   EVT MemVT = SD->getMemoryVT();
 
   // Lower unaligned integer stores.
-  if (!Subtarget->systemSupportsUnalignedAccess() &&
+  if (!Subtarget.systemSupportsUnalignedAccess() &&
       (SD->getAlignment() < MemVT.getSizeInBits() / 8) &&
       ((MemVT == MVT::i32) || (MemVT == MVT::i64)))
-    return lowerUnalignedIntStore(SD, DAG, Subtarget->isLittle());
+    return lowerUnalignedIntStore(SD, DAG, Subtarget.isLittle());
 
   return lowerFP_TO_SINT_STORE(SD, DAG);
 }
@@ -2201,7 +2258,7 @@
 //       an argument. Otherwise, passed in A1, A2, A3 and stack.
 // f64 - Only passed in two aliased f32 registers if no int reg has been used
 //       yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
-//       not used, it must be shadowed. If only A3 is avaiable, shadow it and
+//       not used, it must be shadowed. If only A3 is available, shadow it and
 //       go to stack.
 //
 //  For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
@@ -2299,6 +2356,10 @@
   return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
 }
 
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                       CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                       CCState &State) LLVM_ATTRIBUTE_UNUSED;
+
 #include "MipsGenCallingConv.inc"
 
 //===----------------------------------------------------------------------===//
@@ -2333,15 +2394,21 @@
 getOpndList(SmallVectorImpl<SDValue> &Ops,
             std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
             bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
-            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+            bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+            SDValue Chain) const {
   // Insert node "GP copy globalreg" before call to function.
   //
   // R_MIPS_CALL* operators (emitted when non-internal functions are called
   // in PIC mode) allow symbols to be resolved via lazy binding.
   // The lazy binding stub requires GP to point to the GOT.
-  if (IsPICCall && !InternalLinkage) {
-    unsigned GPReg = Subtarget->isABI_N64() ? Mips::GP_64 : Mips::GP;
-    EVT Ty = Subtarget->isABI_N64() ? MVT::i64 : MVT::i32;
+  // Note that we don't need GP to point to the GOT for indirect calls
+  // (when R_MIPS_CALL* is not used for the call) because the Mips linker
+  // generates a lazy binding stub for a function only when R_MIPS_CALL* are
+  // the only relocs used for it (that is, the Mips linker doesn't generate a
+  // lazy binding stub for a function whose address is taken in the program).
+  if (IsPICCall && !InternalLinkage && IsCallReloc) {
+    unsigned GPReg = Subtarget.isABI_N64() ? Mips::GP_64 : Mips::GP;
+    EVT Ty = Subtarget.isABI_N64() ? MVT::i64 : MVT::i32;
     RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty)));
   }
 
@@ -2364,10 +2431,11 @@
                                       RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CLI.CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
-  if (Subtarget->inMips16HardFloat()) {
+  if (Subtarget.inMips16HardFloat()) {
     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
       llvm::StringRef Sym = G->getGlobal()->getName();
       Function *F = G->getGlobal()->getParent()->getFunction(Sym);
@@ -2400,31 +2468,30 @@
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
   MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
   bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-  MipsCC::SpecialCallingConvType SpecialCallingConv =
-    getSpecialCallingConv(Callee);
-  MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(),
-                    CCInfo, SpecialCallingConv);
+  MipsCCState CCInfo(
+      CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext(),
+      MipsCCState::getSpecialCallingConvForCallee(Callee.getNode(), Subtarget));
 
-  MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
-                                 Subtarget->mipsSEUsesSoftFloat(),
-                                 Callee.getNode(), CLI.getArgs());
+  // Allocate the reserved argument area. It seems strange to do this from the
+  // caller side but removing it breaks the frame size calculation.
+  const MipsABIInfo &ABI = Subtarget.getABI();
+  CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), Callee.getNode());
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NextStackOffset = CCInfo.getNextStackOffset();
 
   // Check if it's really possible to do a tail call.
   if (IsTailCall)
-    IsTailCall =
-      isEligibleForTailCallOptimization(MipsCCInfo, NextStackOffset,
-                                        *MF.getInfo<MipsFunctionInfo>());
+    IsTailCall = isEligibleForTailCallOptimization(
+        CCInfo, NextStackOffset, *MF.getInfo<MipsFunctionInfo>());
 
   if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
     report_fatal_error("failed to perform tail call elimination on a call "
@@ -2444,13 +2511,14 @@
     Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
 
   SDValue StackPtr = DAG.getCopyFromReg(
-      Chain, DL, Subtarget->isABI_N64() ? Mips::SP_64 : Mips::SP,
+      Chain, DL, Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP,
       getPointerTy());
 
   // With EABI is it possible to have 16 args on registers.
   std::deque< std::pair<unsigned, SDValue> > RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
-  MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin();
+
+  CCInfo.rewindByValRegsInfo();
 
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -2458,23 +2526,30 @@
     CCValAssign &VA = ArgLocs[i];
     MVT ValVT = VA.getValVT(), LocVT = VA.getLocVT();
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    bool UseUpperBits = false;
 
     // ByVal Arg.
     if (Flags.isByVal()) {
+      unsigned FirstByValReg, LastByValReg;
+      unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
+      CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
+
       assert(Flags.getByValSize() &&
              "ByVal args of size 0 should have been ignored by front-end.");
-      assert(ByValArg != MipsCCInfo.byval_end());
+      assert(ByValIdx < CCInfo.getInRegsParamsCount());
       assert(!IsTailCall &&
              "Do not tail-call optimize if there is a byval argument.");
       passByValArg(Chain, DL, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
-                   MipsCCInfo, *ByValArg, Flags, Subtarget->isLittle());
-      ++ByValArg;
+                   FirstByValReg, LastByValReg, Flags, Subtarget.isLittle(),
+                   VA);
+      CCInfo.nextInRegsParam();
       continue;
     }
 
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info!");
+    default:
+      llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full:
       if (VA.isRegLoc()) {
         if ((ValVT == MVT::f32 && LocVT == MVT::i32) ||
@@ -2486,7 +2561,7 @@
                                    Arg, DAG.getConstant(0, MVT::i32));
           SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
                                    Arg, DAG.getConstant(1, MVT::i32));
-          if (!Subtarget->isLittle())
+          if (!Subtarget.isLittle())
             std::swap(Lo, Hi);
           unsigned LocRegLo = VA.getLocReg();
           unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
@@ -2496,17 +2571,37 @@
         }
       }
       break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
+      break;
+    case CCValAssign::SExtUpper:
+      UseUpperBits = true;
+      // Fallthrough
     case CCValAssign::SExt:
       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, LocVT, Arg);
       break;
+    case CCValAssign::ZExtUpper:
+      UseUpperBits = true;
+      // Fallthrough
     case CCValAssign::ZExt:
       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, LocVT, Arg);
       break;
+    case CCValAssign::AExtUpper:
+      UseUpperBits = true;
+      // Fallthrough
     case CCValAssign::AExt:
       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, LocVT, Arg);
       break;
     }
 
+    if (UseUpperBits) {
+      unsigned ValSizeInBits = Outs[i].ArgVT.getSizeInBits();
+      unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+      Arg = DAG.getNode(
+          ISD::SHL, DL, VA.getLocVT(), Arg,
+          DAG.getConstant(LocSizeInBits - ValSizeInBits, VA.getLocVT()));
+    }
+
     // Arguments that can be passed on register must be kept at
     // RegsToPass vector
     if (VA.isRegLoc()) {
@@ -2532,9 +2627,9 @@
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
   bool IsPICCall =
-      (Subtarget->isABI_N64() || IsPIC); // true if calls are translated to
+      (Subtarget.isABI_N64() || IsPIC); // true if calls are translated to
                                          // jalr $25
-  bool GlobalOrExternal = false, InternalLinkage = false;
+  bool GlobalOrExternal = false, InternalLinkage = false, IsCallReloc = false;
   SDValue CalleeLo;
   EVT Ty = Callee.getValueType();
 
@@ -2545,14 +2640,17 @@
 
       if (InternalLinkage)
         Callee = getAddrLocal(G, Ty, DAG,
-                              Subtarget->isABI_N32() || Subtarget->isABI_N64());
-      else if (LargeGOT)
+                              Subtarget.isABI_N32() || Subtarget.isABI_N64());
+      else if (LargeGOT) {
         Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16,
                                        MipsII::MO_CALL_LO16, Chain,
                                        FuncInfo->callPtrInfo(Val));
-      else
+        IsCallReloc = true;
+      } else {
         Callee = getAddrGlobal(G, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
                                FuncInfo->callPtrInfo(Val));
+        IsCallReloc = true;
+      }
     } else
       Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
                                           MipsII::MO_NO_FLAG);
@@ -2561,16 +2659,19 @@
   else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     const char *Sym = S->getSymbol();
 
-    if (!Subtarget->isABI_N64() && !IsPIC) // !N64 && static
+    if (!Subtarget.isABI_N64() && !IsPIC) // !N64 && static
       Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(),
                                             MipsII::MO_NO_FLAG);
-    else if (LargeGOT)
+    else if (LargeGOT) {
       Callee = getAddrGlobalLargeGOT(S, Ty, DAG, MipsII::MO_CALL_HI16,
                                      MipsII::MO_CALL_LO16, Chain,
                                      FuncInfo->callPtrInfo(Sym));
-    else // N64 || PIC
+      IsCallReloc = true;
+    } else { // N64 || PIC
       Callee = getAddrGlobal(S, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
                              FuncInfo->callPtrInfo(Sym));
+      IsCallReloc = true;
+    }
 
     GlobalOrExternal = true;
   }
@@ -2579,7 +2680,7 @@
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
   getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal, InternalLinkage,
-              CLI, Callee, Chain);
+              IsCallReloc, CLI, Callee, Chain);
 
   if (IsTailCall)
     return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, Ops);
@@ -2594,39 +2695,68 @@
 
   // Handle result values, copying them out of physregs into vregs that we
   // return.
-  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg,
-                         Ins, DL, DAG, InVals, CLI.Callee.getNode(), CLI.RetTy);
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+                         InVals, CLI);
 }
 
 /// LowerCallResult - Lower the result values of a call into the
 /// appropriate copies out of appropriate physical registers.
-SDValue
-MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
-                                    CallingConv::ID CallConv, bool IsVarArg,
-                                    const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    SDLoc DL, SelectionDAG &DAG,
-                                    SmallVectorImpl<SDValue> &InVals,
-                                    const SDNode *CallNode,
-                                    const Type *RetTy) const {
+SDValue MipsTargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals,
+    TargetLowering::CallLoweringInfo &CLI) const {
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(),
-                    CCInfo);
-
-  MipsCCInfo.analyzeCallResult(Ins, Subtarget->mipsSEUsesSoftFloat(),
-                               CallNode, RetTy);
+  MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+                     *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI);
 
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
     SDValue Val = DAG.getCopyFromReg(Chain, DL, RVLocs[i].getLocReg(),
                                      RVLocs[i].getLocVT(), InFlag);
     Chain = Val.getValue(1);
     InFlag = Val.getValue(2);
 
-    if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
-      Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getValVT(), Val);
+    if (VA.isUpperBitsInLoc()) {
+      unsigned ValSizeInBits = Ins[i].ArgVT.getSizeInBits();
+      unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+      unsigned Shift =
+          VA.getLocInfo() == CCValAssign::ZExtUpper ? ISD::SRL : ISD::SRA;
+      Val = DAG.getNode(
+          Shift, DL, VA.getLocVT(), Val,
+          DAG.getConstant(LocSizeInBits - ValSizeInBits, VA.getLocVT()));
+    }
+
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+      break;
+    case CCValAssign::AExt:
+    case CCValAssign::AExtUpper:
+      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+      break;
+    case CCValAssign::ZExt:
+    case CCValAssign::ZExtUpper:
+      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+      break;
+    case CCValAssign::SExt:
+    case CCValAssign::SExtUpper:
+      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+      break;
+    }
 
     InVals.push_back(Val);
   }
@@ -2634,6 +2764,60 @@
   return Chain;
 }
 
+static SDValue UnpackFromArgumentSlot(SDValue Val, const CCValAssign &VA,
+                                      EVT ArgVT, SDLoc DL, SelectionDAG &DAG) {
+  MVT LocVT = VA.getLocVT();
+  EVT ValVT = VA.getValVT();
+
+  // Shift the value down out of the upper bits if necessary.
+  switch (VA.getLocInfo()) {
+  default:
+    break;
+  case CCValAssign::AExtUpper:
+  case CCValAssign::SExtUpper:
+  case CCValAssign::ZExtUpper: {
+    unsigned ValSizeInBits = ArgVT.getSizeInBits();
+    unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+    unsigned Opcode =
+        VA.getLocInfo() == CCValAssign::ZExtUpper ? ISD::SRL : ISD::SRA;
+    Val = DAG.getNode(
+        Opcode, DL, VA.getLocVT(), Val,
+        DAG.getConstant(LocSizeInBits - ValSizeInBits, VA.getLocVT()));
+    break;
+  }
+  }
+
+  // If this is a value smaller than the argument slot size (32-bit for O32,
+  // 64-bit for N32/N64), it has been promoted in some way to the argument slot
+  // size. Extract the value and insert any appropriate assertions regarding
+  // sign/zero extension.
+  switch (VA.getLocInfo()) {
+  default:
+    llvm_unreachable("Unknown loc info!");
+  case CCValAssign::Full:
+    break;
+  case CCValAssign::AExtUpper:
+  case CCValAssign::AExt:
+    Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+    break;
+  case CCValAssign::SExtUpper:
+  case CCValAssign::SExt:
+    Val = DAG.getNode(ISD::AssertSext, DL, LocVT, Val, DAG.getValueType(ValVT));
+    Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+    break;
+  case CCValAssign::ZExtUpper:
+  case CCValAssign::ZExt:
+    Val = DAG.getNode(ISD::AssertZext, DL, LocVT, Val, DAG.getValueType(ValVT));
+    Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+    break;
+  case CCValAssign::BCvt:
+    Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
+    break;
+  }
+
+  return Val;
+}
+
 //===----------------------------------------------------------------------===//
 //             Formal Arguments Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -2658,20 +2842,19 @@
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(),
-                    CCInfo);
+  MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                     *DAG.getContext());
+  const MipsABIInfo &ABI = Subtarget.getABI();
+  CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
   Function::const_arg_iterator FuncArg =
     DAG.getMachineFunction().getFunction()->arg_begin();
-  bool UseSoftFloat = Subtarget->mipsSEUsesSoftFloat();
 
-  MipsCCInfo.analyzeFormalArguments(Ins, UseSoftFloat, FuncArg);
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FixedArg);
   MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
-                           MipsCCInfo.hasByValArg());
+                           CCInfo.getInRegsParamsCount() > 0);
 
   unsigned CurArgIdx = 0;
-  MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin();
+  CCInfo.rewindByValRegsInfo();
 
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
@@ -2682,12 +2865,16 @@
     bool IsRegLoc = VA.isRegLoc();
 
     if (Flags.isByVal()) {
+      unsigned FirstByValReg, LastByValReg;
+      unsigned ByValIdx = CCInfo.getInRegsParamsProcessed();
+      CCInfo.getInRegsParamInfo(ByValIdx, FirstByValReg, LastByValReg);
+
       assert(Flags.getByValSize() &&
              "ByVal args of size 0 should have been ignored by front-end.");
-      assert(ByValArg != MipsCCInfo.byval_end());
+      assert(ByValIdx < CCInfo.getInRegsParamsCount());
       copyByValRegs(Chain, DL, OutChains, DAG, Flags, InVals, &*FuncArg,
-                    MipsCCInfo, *ByValArg);
-      ++ByValArg;
+                    FirstByValReg, LastByValReg, VA, CCInfo);
+      CCInfo.nextInRegsParam();
       continue;
     }
 
@@ -2702,20 +2889,7 @@
       unsigned Reg = addLiveIn(DAG.getMachineFunction(), ArgReg, RC);
       SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
 
-      // If this is an 8 or 16-bit value, it has been passed promoted
-      // to 32 bits.  Insert an assert[sz]ext to capture this, then
-      // truncate to the right size.
-      if (VA.getLocInfo() != CCValAssign::Full) {
-        unsigned Opcode = 0;
-        if (VA.getLocInfo() == CCValAssign::SExt)
-          Opcode = ISD::AssertSext;
-        else if (VA.getLocInfo() == CCValAssign::ZExt)
-          Opcode = ISD::AssertZext;
-        if (Opcode)
-          ArgValue = DAG.getNode(Opcode, DL, RegVT, ArgValue,
-                                 DAG.getValueType(ValVT));
-        ArgValue = DAG.getNode(ISD::TRUNCATE, DL, ValVT, ArgValue);
-      }
+      ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
 
       // Handle floating point arguments passed in integer registers and
       // long double arguments passed in floating point registers.
@@ -2723,12 +2897,12 @@
           (RegVT == MVT::i64 && ValVT == MVT::f64) ||
           (RegVT == MVT::f64 && ValVT == MVT::i64))
         ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
-      else if (Subtarget->isABI_O32() && RegVT == MVT::i32 &&
+      else if (Subtarget.isABI_O32() && RegVT == MVT::i32 &&
                ValVT == MVT::f64) {
         unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
                                   getNextIntArgReg(ArgReg), RC);
         SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
-        if (!Subtarget->isLittle())
+        if (!Subtarget.isLittle())
           std::swap(ArgValue, ArgValue2);
         ArgValue = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64,
                                ArgValue, ArgValue2);
@@ -2736,21 +2910,34 @@
 
       InVals.push_back(ArgValue);
     } else { // VA.isRegLoc()
+      MVT LocVT = VA.getLocVT();
+
+      if (Subtarget.isABI_O32()) {
+        // We ought to be able to use LocVT directly but O32 sets it to i32
+        // when allocating floating point values to integer registers.
+        // This shouldn't influence how we load the value into registers unless
+        // we are targeting softfloat.
+        if (VA.getValVT().isFloatingPoint() && !Subtarget.abiUsesSoftFloat())
+          LocVT = VA.getValVT();
+      }
 
       // sanity check
       assert(VA.isMemLoc());
 
       // The stack pointer offset is relative to the caller stack frame.
-      int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+      int FI = MFI->CreateFixedObject(LocVT.getSizeInBits() / 8,
                                       VA.getLocMemOffset(), true);
 
       // Create load nodes to retrieve arguments from the stack
       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      SDValue Load = DAG.getLoad(ValVT, DL, Chain, FIN,
-                                 MachinePointerInfo::getFixedStack(FI),
-                                 false, false, false, 0);
-      InVals.push_back(Load);
-      OutChains.push_back(Load.getValue(1));
+      SDValue ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
+                                     MachinePointerInfo::getFixedStack(FI),
+                                     false, false, false, 0);
+      OutChains.push_back(ArgValue.getValue(1));
+
+      ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG);
+
+      InVals.push_back(ArgValue);
     }
   }
 
@@ -2762,7 +2949,7 @@
       unsigned Reg = MipsFI->getSRetReturnReg();
       if (!Reg) {
         Reg = MF.getRegInfo().createVirtualRegister(
-            getRegClassFor(Subtarget->isABI_N64() ? MVT::i64 : MVT::i32));
+            getRegClassFor(Subtarget.isABI_N64() ? MVT::i64 : MVT::i32));
         MipsFI->setSRetReturnReg(Reg);
       }
       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]);
@@ -2772,7 +2959,7 @@
   }
 
   if (IsVarArg)
-    writeVarArgRegs(OutChains, MipsCCInfo, Chain, DL, DAG);
+    writeVarArgRegs(OutChains, Chain, DL, DAG, CCInfo);
 
   // All stores are grouped in one node to allow the matching between
   // the size of Ins and InVals. This only happens when on varg functions
@@ -2794,8 +2981,7 @@
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(),
-                 RVLocs, Context);
+  MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_Mips);
 }
 
@@ -2811,14 +2997,10 @@
   MachineFunction &MF = DAG.getMachineFunction();
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs,
-                 *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, Subtarget->isABI_O32(), Subtarget->isFP64bit(),
-                    CCInfo);
+  MipsCCState CCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
 
   // Analyze return values.
-  MipsCCInfo.analyzeReturn(Outs, Subtarget->mipsSEUsesSoftFloat(),
-                           MF.getFunction()->getReturnType());
+  CCInfo.AnalyzeReturn(Outs, RetCC_Mips);
 
   SDValue Flag;
   SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -2828,9 +3010,43 @@
     SDValue Val = OutVals[i];
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");
+    bool UseUpperBits = false;
 
-    if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
-      Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getLocVT(), Val);
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Val);
+      break;
+    case CCValAssign::AExtUpper:
+      UseUpperBits = true;
+      // Fallthrough
+    case CCValAssign::AExt:
+      Val = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Val);
+      break;
+    case CCValAssign::ZExtUpper:
+      UseUpperBits = true;
+      // Fallthrough
+    case CCValAssign::ZExt:
+      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Val);
+      break;
+    case CCValAssign::SExtUpper:
+      UseUpperBits = true;
+      // Fallthrough
+    case CCValAssign::SExt:
+      Val = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Val);
+      break;
+    }
+
+    if (UseUpperBits) {
+      unsigned ValSizeInBits = Outs[i].ArgVT.getSizeInBits();
+      unsigned LocSizeInBits = VA.getLocVT().getSizeInBits();
+      Val = DAG.getNode(
+          ISD::SHL, DL, VA.getLocVT(), Val,
+          DAG.getConstant(LocSizeInBits - ValSizeInBits, VA.getLocVT()));
+    }
 
     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
 
@@ -2850,7 +3066,7 @@
     if (!Reg)
       llvm_unreachable("sret virtual register not created in the entry block");
     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
-    unsigned V0 = Subtarget->isABI_N64() ? Mips::V0_64 : Mips::V0;
+    unsigned V0 = Subtarget.isABI_N64() ? Mips::V0_64 : Mips::V0;
 
     Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
     Flag = Chain.getValue(1);
@@ -2928,7 +3144,7 @@
       weight = CW_Register;
     break;
   case 'f': // FPU or MSA register
-    if (Subtarget->hasMSA() && type->isVectorTy() &&
+    if (Subtarget.hasMSA() && type->isVectorTy() &&
         cast<VectorType>(type)->getBitWidth() == 128)
       weight = CW_Register;
     else if (type->isFloatTy())
@@ -2962,7 +3178,7 @@
 /// that is returned indicates whether parsing was successful. The second flag
 /// is true if the numeric part exists.
 static std::pair<bool, bool>
-parsePhysicalReg(const StringRef &C, std::string &Prefix,
+parsePhysicalReg(StringRef C, std::string &Prefix,
                  unsigned long long &Reg) {
   if (C.front() != '{' || C.back() != '}')
     return std::make_pair(false, false);
@@ -2983,8 +3199,9 @@
 }
 
 std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
-parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const {
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+parseRegForInlineAsmConstraint(StringRef C, MVT VT) const {
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const TargetRegisterClass *RC;
   std::string Prefix;
   unsigned long long Reg;
@@ -3034,7 +3251,7 @@
     // If the size of FP registers is 64-bit or Reg is an even number, select
     // the 64-bit register class. Otherwise, select the 32-bit register class.
     if (VT == MVT::Other)
-      VT = (Subtarget->isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32;
+      VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32;
 
     RC = getRegClassFor(VT);
 
@@ -3067,13 +3284,13 @@
     case 'y': // Same as 'r'. Exists for compatibility.
     case 'r':
       if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
-        if (Subtarget->inMips16Mode())
+        if (Subtarget.inMips16Mode())
           return std::make_pair(0U, &Mips::CPU16RegsRegClass);
         return std::make_pair(0U, &Mips::GPR32RegClass);
       }
-      if (VT == MVT::i64 && !Subtarget->isGP64bit())
+      if (VT == MVT::i64 && !Subtarget.isGP64bit())
         return std::make_pair(0U, &Mips::GPR32RegClass);
-      if (VT == MVT::i64 && Subtarget->isGP64bit())
+      if (VT == MVT::i64 && Subtarget.isGP64bit())
         return std::make_pair(0U, &Mips::GPR64RegClass);
       // This will generate an error message
       return std::make_pair(0U, nullptr);
@@ -3088,8 +3305,8 @@
         return std::make_pair(0U, &Mips::MSA128DRegClass);
       else if (VT == MVT::f32)
         return std::make_pair(0U, &Mips::FGR32RegClass);
-      else if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
-        if (Subtarget->isFP64bit())
+      else if ((VT == MVT::f64) && (!Subtarget.isSingleFloat())) {
+        if (Subtarget.isFP64bit())
           return std::make_pair(0U, &Mips::FGR64RegClass);
         return std::make_pair(0U, &Mips::AFGR64RegClass);
       }
@@ -3245,7 +3462,7 @@
                                             bool IsMemset, bool ZeroMemset,
                                             bool MemcpyStrSrc,
                                             MachineFunction &MF) const {
-  if (Subtarget->hasMips64())
+  if (Subtarget.hasMips64())
     return MVT::i64;
 
   return MVT::i32;
@@ -3260,291 +3477,33 @@
 }
 
 unsigned MipsTargetLowering::getJumpTableEncoding() const {
-  if (Subtarget->isABI_N64())
+  if (Subtarget.isABI_N64())
     return MachineJumpTableInfo::EK_GPRel64BlockAddress;
 
   return TargetLowering::getJumpTableEncoding();
 }
 
-/// This function returns true if CallSym is a long double emulation routine.
-static bool isF128SoftLibCall(const char *CallSym) {
-  const char *const LibCalls[] =
-    {"__addtf3", "__divtf3", "__eqtf2", "__extenddftf2", "__extendsftf2",
-     "__fixtfdi", "__fixtfsi", "__fixtfti", "__fixunstfdi", "__fixunstfsi",
-     "__fixunstfti", "__floatditf", "__floatsitf", "__floattitf",
-     "__floatunditf", "__floatunsitf", "__floatuntitf", "__getf2", "__gttf2",
-     "__letf2", "__lttf2", "__multf3", "__netf2", "__powitf2", "__subtf3",
-     "__trunctfdf2", "__trunctfsf2", "__unordtf2",
-     "ceill", "copysignl", "cosl", "exp2l", "expl", "floorl", "fmal", "fmodl",
-     "log10l", "log2l", "logl", "nearbyintl", "powl", "rintl", "sinl", "sqrtl",
-     "truncl"};
-
-  const char *const *End = LibCalls + array_lengthof(LibCalls);
-
-  // Check that LibCalls is sorted alphabetically.
-  MipsTargetLowering::LTStr Comp;
-
-#ifndef NDEBUG
-  for (const char *const *I = LibCalls; I < End - 1; ++I)
-    assert(Comp(*I, *(I + 1)));
-#endif
-
-  return std::binary_search(LibCalls, End, CallSym, Comp);
-}
-
-/// This function returns true if Ty is fp128 or i128 which was originally a
-/// fp128.
-static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) {
-  if (Ty->isFP128Ty())
-    return true;
-
-  const ExternalSymbolSDNode *ES =
-    dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode);
-
-  // If the Ty is i128 and the function being called is a long double emulation
-  // routine, then the original type is f128.
-  return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
-}
-
-MipsTargetLowering::MipsCC::SpecialCallingConvType
-  MipsTargetLowering::getSpecialCallingConv(SDValue Callee) const {
-  MipsCC::SpecialCallingConvType SpecialCallingConv =
-    MipsCC::NoSpecialCallingConv;
-  if (Subtarget->inMips16HardFloat()) {
-    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-      llvm::StringRef Sym = G->getGlobal()->getName();
-      Function *F = G->getGlobal()->getParent()->getFunction(Sym);
-      if (F && F->hasFnAttribute("__Mips16RetHelper")) {
-        SpecialCallingConv = MipsCC::Mips16RetHelperConv;
-      }
-    }
-  }
-  return SpecialCallingConv;
-}
-
-MipsTargetLowering::MipsCC::MipsCC(
-  CallingConv::ID CC, bool IsO32_, bool IsFP64_, CCState &Info,
-  MipsCC::SpecialCallingConvType SpecialCallingConv_)
-  : CCInfo(Info), CallConv(CC), IsO32(IsO32_), IsFP64(IsFP64_),
-    SpecialCallingConv(SpecialCallingConv_){
-  // Pre-allocate reserved argument area.
-  CCInfo.AllocateStack(reservedArgArea(), 1);
-}
-
-
-void MipsTargetLowering::MipsCC::
-analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args,
-                    bool IsVarArg, bool IsSoftFloat, const SDNode *CallNode,
-                    std::vector<ArgListEntry> &FuncArgs) {
-  assert((CallConv != CallingConv::Fast || !IsVarArg) &&
-         "CallingConv::Fast shouldn't be used for vararg functions.");
-
-  unsigned NumOpnds = Args.size();
-  llvm::CCAssignFn *FixedFn = fixedArgFn(), *VarFn = varArgFn();
-
-  for (unsigned I = 0; I != NumOpnds; ++I) {
-    MVT ArgVT = Args[I].VT;
-    ISD::ArgFlagsTy ArgFlags = Args[I].Flags;
-    bool R;
-
-    if (ArgFlags.isByVal()) {
-      handleByValArg(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags);
-      continue;
-    }
-
-    if (IsVarArg && !Args[I].IsFixed)
-      R = VarFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
-    else {
-      MVT RegVT = getRegVT(ArgVT, FuncArgs[Args[I].OrigArgIndex].Ty, CallNode,
-                           IsSoftFloat);
-      R = FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo);
-    }
-
-    if (R) {
-#ifndef NDEBUG
-      dbgs() << "Call operand #" << I << " has unhandled type "
-             << EVT(ArgVT).getEVTString();
-#endif
-      llvm_unreachable(nullptr);
-    }
-  }
-}
-
-void MipsTargetLowering::MipsCC::
-analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args,
-                       bool IsSoftFloat, Function::const_arg_iterator FuncArg) {
-  unsigned NumArgs = Args.size();
-  llvm::CCAssignFn *FixedFn = fixedArgFn();
-  unsigned CurArgIdx = 0;
-
-  for (unsigned I = 0; I != NumArgs; ++I) {
-    MVT ArgVT = Args[I].VT;
-    ISD::ArgFlagsTy ArgFlags = Args[I].Flags;
-    std::advance(FuncArg, Args[I].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Args[I].OrigArgIndex;
-
-    if (ArgFlags.isByVal()) {
-      handleByValArg(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags);
-      continue;
-    }
-
-    MVT RegVT = getRegVT(ArgVT, FuncArg->getType(), nullptr, IsSoftFloat);
-
-    if (!FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo))
-      continue;
-
-#ifndef NDEBUG
-    dbgs() << "Formal Arg #" << I << " has unhandled type "
-           << EVT(ArgVT).getEVTString();
-#endif
-    llvm_unreachable(nullptr);
-  }
-}
-
-template<typename Ty>
-void MipsTargetLowering::MipsCC::
-analyzeReturn(const SmallVectorImpl<Ty> &RetVals, bool IsSoftFloat,
-              const SDNode *CallNode, const Type *RetTy) const {
-  CCAssignFn *Fn;
-
-  if (IsSoftFloat && originalTypeIsF128(RetTy, CallNode))
-    Fn = RetCC_F128Soft;
-  else
-    Fn = RetCC_Mips;
-
-  for (unsigned I = 0, E = RetVals.size(); I < E; ++I) {
-    MVT VT = RetVals[I].VT;
-    ISD::ArgFlagsTy Flags = RetVals[I].Flags;
-    MVT RegVT = this->getRegVT(VT, RetTy, CallNode, IsSoftFloat);
-
-    if (Fn(I, VT, RegVT, CCValAssign::Full, Flags, this->CCInfo)) {
-#ifndef NDEBUG
-      dbgs() << "Call result #" << I << " has unhandled type "
-             << EVT(VT).getEVTString() << '\n';
-#endif
-      llvm_unreachable(nullptr);
-    }
-  }
-}
-
-void MipsTargetLowering::MipsCC::
-analyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsSoftFloat,
-                  const SDNode *CallNode, const Type *RetTy) const {
-  analyzeReturn(Ins, IsSoftFloat, CallNode, RetTy);
-}
-
-void MipsTargetLowering::MipsCC::
-analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsSoftFloat,
-              const Type *RetTy) const {
-  analyzeReturn(Outs, IsSoftFloat, nullptr, RetTy);
-}
-
-void MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
-                                                MVT LocVT,
-                                                CCValAssign::LocInfo LocInfo,
-                                                ISD::ArgFlagsTy ArgFlags) {
-  assert(ArgFlags.getByValSize() && "Byval argument's size shouldn't be 0.");
-
-  struct ByValArgInfo ByVal;
-  unsigned RegSize = regSize();
-  unsigned ByValSize = RoundUpToAlignment(ArgFlags.getByValSize(), RegSize);
-  unsigned Align = std::min(std::max(ArgFlags.getByValAlign(), RegSize),
-                            RegSize * 2);
-
-  if (useRegsForByval())
-    allocateRegs(ByVal, ByValSize, Align);
-
-  // Allocate space on caller's stack.
-  ByVal.Address = CCInfo.AllocateStack(ByValSize - RegSize * ByVal.NumRegs,
-                                       Align);
-  CCInfo.addLoc(CCValAssign::getMem(ValNo, ValVT, ByVal.Address, LocVT,
-                                    LocInfo));
-  ByValArgs.push_back(ByVal);
-}
-
-unsigned MipsTargetLowering::MipsCC::numIntArgRegs() const {
-  return IsO32 ? array_lengthof(O32IntRegs) : array_lengthof(Mips64IntRegs);
-}
-
-unsigned MipsTargetLowering::MipsCC::reservedArgArea() const {
-  return (IsO32 && (CallConv != CallingConv::Fast)) ? 16 : 0;
-}
-
-const MCPhysReg *MipsTargetLowering::MipsCC::intArgRegs() const {
-  return IsO32 ? O32IntRegs : Mips64IntRegs;
-}
-
-llvm::CCAssignFn *MipsTargetLowering::MipsCC::fixedArgFn() const {
-  if (CallConv == CallingConv::Fast)
-    return CC_Mips_FastCC;
-
-  if (SpecialCallingConv == Mips16RetHelperConv)
-    return CC_Mips16RetHelper;
-  return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN;
-}
-
-llvm::CCAssignFn *MipsTargetLowering::MipsCC::varArgFn() const {
-  return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN_VarArg;
-}
-
-const MCPhysReg *MipsTargetLowering::MipsCC::shadowRegs() const {
-  return IsO32 ? O32IntRegs : Mips64DPRegs;
-}
-
-void MipsTargetLowering::MipsCC::allocateRegs(ByValArgInfo &ByVal,
-                                              unsigned ByValSize,
-                                              unsigned Align) {
-  unsigned RegSize = regSize(), NumIntArgRegs = numIntArgRegs();
-  const MCPhysReg *IntArgRegs = intArgRegs(), *ShadowRegs = shadowRegs();
-  assert(!(ByValSize % RegSize) && !(Align % RegSize) &&
-         "Byval argument's size and alignment should be a multiple of"
-         "RegSize.");
-
-  ByVal.FirstIdx = CCInfo.getFirstUnallocated(IntArgRegs, NumIntArgRegs);
-
-  // If Align > RegSize, the first arg register must be even.
-  if ((Align > RegSize) && (ByVal.FirstIdx % 2)) {
-    CCInfo.AllocateReg(IntArgRegs[ByVal.FirstIdx], ShadowRegs[ByVal.FirstIdx]);
-    ++ByVal.FirstIdx;
-  }
-
-  // Mark the registers allocated.
-  for (unsigned I = ByVal.FirstIdx; ByValSize && (I < NumIntArgRegs);
-       ByValSize -= RegSize, ++I, ++ByVal.NumRegs)
-    CCInfo.AllocateReg(IntArgRegs[I], ShadowRegs[I]);
-}
-
-MVT MipsTargetLowering::MipsCC::getRegVT(MVT VT, const Type *OrigTy,
-                                         const SDNode *CallNode,
-                                         bool IsSoftFloat) const {
-  if (IsSoftFloat || IsO32)
-    return VT;
-
-  // Check if the original type was fp128.
-  if (originalTypeIsF128(OrigTy, CallNode)) {
-    assert(VT == MVT::i64);
-    return MVT::f64;
-  }
-
-  return VT;
-}
-
-void MipsTargetLowering::
-copyByValRegs(SDValue Chain, SDLoc DL, std::vector<SDValue> &OutChains,
-              SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
-              SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
-              const MipsCC &CC, const ByValArgInfo &ByVal) const {
+void MipsTargetLowering::copyByValRegs(
+    SDValue Chain, SDLoc DL, std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+    const ISD::ArgFlagsTy &Flags, SmallVectorImpl<SDValue> &InVals,
+    const Argument *FuncArg, unsigned FirstReg, unsigned LastReg,
+    const CCValAssign &VA, MipsCCState &State) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  unsigned RegAreaSize = ByVal.NumRegs * CC.regSize();
+  unsigned GPRSizeInBytes = Subtarget.getGPRSizeInBytes();
+  unsigned NumRegs = LastReg - FirstReg;
+  unsigned RegAreaSize = NumRegs * GPRSizeInBytes;
   unsigned FrameObjSize = std::max(Flags.getByValSize(), RegAreaSize);
   int FrameObjOffset;
+  const MipsABIInfo &ABI = Subtarget.getABI();
+  ArrayRef<MCPhysReg> ByValArgRegs = ABI.GetByValArgRegs();
 
   if (RegAreaSize)
-    FrameObjOffset = (int)CC.reservedArgArea() -
-      (int)((CC.numIntArgRegs() - ByVal.FirstIdx) * CC.regSize());
+    FrameObjOffset =
+        (int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
+        (int)((ByValArgRegs.size() - FirstReg) * GPRSizeInBytes);
   else
-    FrameObjOffset = ByVal.Address;
+    FrameObjOffset = VA.getLocMemOffset();
 
   // Create frame object.
   EVT PtrTy = getPointerTy();
@@ -3552,17 +3511,17 @@
   SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
   InVals.push_back(FIN);
 
-  if (!ByVal.NumRegs)
+  if (!NumRegs)
     return;
 
   // Copy arg registers.
-  MVT RegTy = MVT::getIntegerVT(CC.regSize() * 8);
+  MVT RegTy = MVT::getIntegerVT(GPRSizeInBytes * 8);
   const TargetRegisterClass *RC = getRegClassFor(RegTy);
 
-  for (unsigned I = 0; I < ByVal.NumRegs; ++I) {
-    unsigned ArgReg = CC.intArgRegs()[ByVal.FirstIdx + I];
+  for (unsigned I = 0; I < NumRegs; ++I) {
+    unsigned ArgReg = ByValArgRegs[FirstReg + I];
     unsigned VReg = addLiveIn(MF, ArgReg, RC);
-    unsigned Offset = I * CC.regSize();
+    unsigned Offset = I * GPRSizeInBytes;
     SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
                                    DAG.getConstant(Offset, PtrTy));
     SDValue Store = DAG.getStore(Chain, DL, DAG.getRegister(VReg, RegTy),
@@ -3573,34 +3532,34 @@
 }
 
 // Copy byVal arg to registers and stack.
-void MipsTargetLowering::
-passByValArg(SDValue Chain, SDLoc DL,
-             std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
-             SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
-             MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
-             const MipsCC &CC, const ByValArgInfo &ByVal,
-             const ISD::ArgFlagsTy &Flags, bool isLittle) const {
+void MipsTargetLowering::passByValArg(
+    SDValue Chain, SDLoc DL,
+    std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
+    SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
+    MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, unsigned FirstReg,
+    unsigned LastReg, const ISD::ArgFlagsTy &Flags, bool isLittle,
+    const CCValAssign &VA) const {
   unsigned ByValSizeInBytes = Flags.getByValSize();
   unsigned OffsetInBytes = 0; // From beginning of struct
-  unsigned RegSizeInBytes = CC.regSize();
+  unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
   unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes);
   EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
+  unsigned NumRegs = LastReg - FirstReg;
 
-  if (ByVal.NumRegs) {
-    const MCPhysReg *ArgRegs = CC.intArgRegs();
-    bool LeftoverBytes = (ByVal.NumRegs * RegSizeInBytes > ByValSizeInBytes);
+  if (NumRegs) {
+    const ArrayRef<MCPhysReg> ArgRegs = Subtarget.getABI().GetByValArgRegs();
+    bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes);
     unsigned I = 0;
 
     // Copy words to registers.
-    for (; I < ByVal.NumRegs - LeftoverBytes;
-         ++I, OffsetInBytes += RegSizeInBytes) {
+    for (; I < NumRegs - LeftoverBytes; ++I, OffsetInBytes += RegSizeInBytes) {
       SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
                                     DAG.getConstant(OffsetInBytes, PtrTy));
       SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
                                     MachinePointerInfo(), false, false, false,
                                     Alignment);
       MemOpChains.push_back(LoadVal.getValue(1));
-      unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I];
+      unsigned ArgReg = ArgRegs[FirstReg + I];
       RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
     }
 
@@ -3610,9 +3569,6 @@
 
     // Copy the remainder of the byval argument with sub-word loads and shifts.
     if (LeftoverBytes) {
-      assert((ByValSizeInBytes > OffsetInBytes) &&
-             (ByValSizeInBytes < OffsetInBytes + RegSizeInBytes) &&
-             "Size of the remainder should be smaller than RegSizeInBytes.");
       SDValue Val;
 
       for (unsigned LoadSizeInBytes = RegSizeInBytes / 2, TotalBytesLoaded = 0;
@@ -3627,7 +3583,8 @@
                                       DAG.getConstant(OffsetInBytes, PtrTy));
         SDValue LoadVal = DAG.getExtLoad(
             ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
-            MVT::getIntegerVT(LoadSizeInBytes * 8), false, false, Alignment);
+            MVT::getIntegerVT(LoadSizeInBytes * 8), false, false, false,
+            Alignment);
         MemOpChains.push_back(LoadVal.getValue(1));
 
         // Shift the loaded value.
@@ -3651,7 +3608,7 @@
         Alignment = std::min(Alignment, LoadSizeInBytes);
       }
 
-      unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I];
+      unsigned ArgReg = ArgRegs[FirstReg + I];
       RegsToPass.push_back(std::make_pair(ArgReg, Val));
       return;
     }
@@ -3662,7 +3619,7 @@
   SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
                             DAG.getConstant(OffsetInBytes, PtrTy));
   SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
-                            DAG.getIntPtrConstant(ByVal.Address));
+                            DAG.getIntPtrConstant(VA.getLocMemOffset()));
   Chain = DAG.getMemcpy(Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, PtrTy),
                         Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
                         MachinePointerInfo(), MachinePointerInfo());
@@ -3670,14 +3627,13 @@
 }
 
 void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
-                                         const MipsCC &CC, SDValue Chain,
-                                         SDLoc DL, SelectionDAG &DAG) const {
-  unsigned NumRegs = CC.numIntArgRegs();
-  const MCPhysReg *ArgRegs = CC.intArgRegs();
-  const CCState &CCInfo = CC.getCCInfo();
-  unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumRegs);
-  unsigned RegSize = CC.regSize();
-  MVT RegTy = MVT::getIntegerVT(RegSize * 8);
+                                         SDValue Chain, SDLoc DL,
+                                         SelectionDAG &DAG,
+                                         CCState &State) const {
+  const ArrayRef<MCPhysReg> ArgRegs = Subtarget.getABI().GetVarArgRegs();
+  unsigned Idx = State.getFirstUnallocated(ArgRegs.data(), ArgRegs.size());
+  unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
+  MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
   const TargetRegisterClass *RC = getRegClassFor(RegTy);
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -3686,28 +3642,81 @@
   // Offset of the first variable argument from stack pointer.
   int VaArgOffset;
 
-  if (NumRegs == Idx)
-    VaArgOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), RegSize);
-  else
-    VaArgOffset = (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
+  if (ArgRegs.size() == Idx)
+    VaArgOffset =
+        RoundUpToAlignment(State.getNextStackOffset(), RegSizeInBytes);
+  else {
+    const MipsABIInfo &ABI = Subtarget.getABI();
+    VaArgOffset =
+        (int)ABI.GetCalleeAllocdArgSizeInBytes(State.getCallingConv()) -
+        (int)(RegSizeInBytes * (ArgRegs.size() - Idx));
+  }
 
   // Record the frame index of the first variable argument
   // which is a value necessary to VASTART.
-  int FI = MFI->CreateFixedObject(RegSize, VaArgOffset, true);
+  int FI = MFI->CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
   MipsFI->setVarArgsFrameIndex(FI);
 
   // Copy the integer registers that have not been used for argument passing
   // to the argument register save area. For O32, the save area is allocated
   // in the caller's stack frame, while for N32/64, it is allocated in the
   // callee's stack frame.
-  for (unsigned I = Idx; I < NumRegs; ++I, VaArgOffset += RegSize) {
+  for (unsigned I = Idx; I < ArgRegs.size();
+       ++I, VaArgOffset += RegSizeInBytes) {
     unsigned Reg = addLiveIn(MF, ArgRegs[I], RC);
     SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
-    FI = MFI->CreateFixedObject(RegSize, VaArgOffset, true);
+    FI = MFI->CreateFixedObject(RegSizeInBytes, VaArgOffset, true);
     SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
     SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
                                  MachinePointerInfo(), false, false, 0);
-    cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue((Value*)nullptr);
+    cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(
+        (Value *)nullptr);
     OutChains.push_back(Store);
   }
 }
+
+/// Override that assigns integer argument registers to a byval argument and
+/// records the assigned register range in \p State.
+///
+/// \param State  Calling-convention state. Registers are marked allocated on
+///               it and the [first, last) register index range is stored via
+///               addInRegsParamInfo().
+/// \param Size   In/out byte size of the byval argument; must be non-zero.
+///               Unless the calling convention is CallingConv::Fast, it is
+///               rounded up to a multiple of the GPR size and then reduced by
+///               one GPR size for every register allocated here.
+/// \param Align  Requested alignment in bytes; clamped to the stack alignment
+///               before use.
+void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
+                                     unsigned Align) const {
+  MachineFunction &MF = State->getMachineFunction();
+  const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
+
+  assert(Size && "Byval argument's size shouldn't be 0.");
+
+  // Never ask for more alignment than the stack itself provides.
+  Align = std::min(Align, TFL->getStackAlignment());
+
+  // With CallingConv::Fast these stay zero, so an empty register range is
+  // recorded below.
+  unsigned FirstReg = 0;
+  unsigned NumRegs = 0;
+
+  if (State->getCallingConv() != CallingConv::Fast) {
+    unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
+    const ArrayRef<MCPhysReg> IntArgRegs = Subtarget.getABI().GetByValArgRegs();
+    // FIXME: The O32 case actually describes no shadow registers.
+    const MCPhysReg *ShadowRegs =
+        Subtarget.isABI_O32() ? IntArgRegs.data() : Mips64DPRegs;
+
+    // We used to check the size as well but we can't do that anymore since
+    // CCState::HandleByVal() rounds up the size after calling this function.
+    // Note the trailing space: the two literals are concatenated verbatim.
+    assert(!(Align % RegSizeInBytes) &&
+           "Byval argument's alignment should be a multiple of "
+           "RegSizeInBytes.");
+
+    FirstReg = State->getFirstUnallocated(IntArgRegs.data(), IntArgRegs.size());
+
+    // If Align > RegSizeInBytes, the first arg register must be even.
+    // FIXME: This condition happens to do the right thing but it's not the
+    //        right way to test it. We want to check that the stack frame offset
+    //        of the register is aligned.
+    if ((Align > RegSizeInBytes) && (FirstReg % 2)) {
+      // Burn the odd register so the argument starts at an even index.
+      State->AllocateReg(IntArgRegs[FirstReg], ShadowRegs[FirstReg]);
+      ++FirstReg;
+    }
+
+    // Mark the registers allocated. Size is rounded up first so the loop
+    // subtracts whole registers and stops exactly at zero.
+    Size = RoundUpToAlignment(Size, RegSizeInBytes);
+    for (unsigned I = FirstReg; Size > 0 && (I < IntArgRegs.size());
+         Size -= RegSizeInBytes, ++I, ++NumRegs)
+      State->AllocateReg(IntArgRegs[I], ShadowRegs[I]);
+  }
+
+  // Publish the half-open register index range used by this argument.
+  State->addInRegsParamInfo(FirstReg, FirstReg + NumRegs);
+}

diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 4701bc4..60e53da 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MipsISELLOWERING_H
-#define MipsISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSISELLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSISELLOWERING_H
 
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "Mips.h"
@@ -210,13 +210,16 @@
   //===--------------------------------------------------------------------===//
   class MipsFunctionInfo;
   class MipsSubtarget;
+  class MipsCCState;
 
   class MipsTargetLowering : public TargetLowering  {
     bool isMicroMips;
   public:
-    explicit MipsTargetLowering(MipsTargetMachine &TM);
+    explicit MipsTargetLowering(const MipsTargetMachine &TM,
+                                const MipsSubtarget &STI);
 
-    static const MipsTargetLowering *create(MipsTargetMachine &TM);
+    static const MipsTargetLowering *create(const MipsTargetMachine &TM,
+                                            const MipsSubtarget &STI);
 
     /// createFastISel - This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.
@@ -257,6 +260,8 @@
       }
     };
 
+    void HandleByVal(CCState *, unsigned &, unsigned) const override;
+
   protected:
     SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
 
@@ -327,6 +332,21 @@
                          DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
     }
 
+    // This method creates the following nodes, which are necessary for
+    // computing a symbol's address using gp-relative addressing:
+    //
+    // (add $gp, %gp_rel(sym))
+    //
+    // N is forwarded to getTargetNode() (presumably the node naming the
+    // symbol), Ty is the value type of every node created here and is
+    // asserted to be MVT::i32, and DAG is the SelectionDAG the nodes are
+    // created in. Returns the resulting ISD::ADD node.
+    template<class NodeTy>
+    SDValue getAddrGPRel(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
+      SDLoc DL(N);
+      // Only a 32-bit result type is accepted.
+      assert(Ty == MVT::i32);
+      // Operand built with the MO_GPREL flag (the %gp_rel(sym) part above).
+      SDValue GPRel = getTargetNode(N, Ty, DAG, MipsII::MO_GPREL);
+      // Wrap the operand in a MipsISD::GPRel node and add it to the GP
+      // register.
+      return DAG.getNode(ISD::ADD, DL, Ty,
+                         DAG.getRegister(Mips::GP, Ty),
+                         DAG.getNode(MipsISD::GPRel, DL, DAG.getVTList(Ty),
+                                     GPRel));
+    }
+
     /// This function fills Ops, which is the list of operands that will later
     /// be used when a function call node is created. It also generates
     /// copyToReg nodes to set up argument registers.
@@ -334,109 +354,15 @@
     getOpndList(SmallVectorImpl<SDValue> &Ops,
                 std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
                 bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
-                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+                bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+                SDValue Chain) const;
 
-    /// ByValArgInfo - Byval argument information.
-    struct ByValArgInfo {
-      unsigned FirstIdx; // Index of the first register used.
-      unsigned NumRegs;  // Number of registers used for this argument.
-      unsigned Address;  // Offset of the stack area used to pass this argument.
-
-      ByValArgInfo() : FirstIdx(0), NumRegs(0), Address(0) {}
-    };
-
-    /// MipsCC - This class provides methods used to analyze formal and call
-    /// arguments and inquire about calling convention information.
-    class MipsCC {
-    public:
-      enum SpecialCallingConvType {
-        Mips16RetHelperConv, NoSpecialCallingConv
-      };
-
-      MipsCC(CallingConv::ID CallConv, bool IsO32, bool IsFP64, CCState &Info,
-             SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv);
-
-
-      void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
-                               bool IsVarArg, bool IsSoftFloat,
-                               const SDNode *CallNode,
-                               std::vector<ArgListEntry> &FuncArgs);
-      void analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
-                                  bool IsSoftFloat,
-                                  Function::const_arg_iterator FuncArg);
-
-      void analyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
-                             bool IsSoftFloat, const SDNode *CallNode,
-                             const Type *RetTy) const;
-
-      void analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
-                         bool IsSoftFloat, const Type *RetTy) const;
-
-      const CCState &getCCInfo() const { return CCInfo; }
-
-      /// hasByValArg - Returns true if function has byval arguments.
-      bool hasByValArg() const { return !ByValArgs.empty(); }
-
-      /// regSize - Size (in number of bits) of integer registers.
-      unsigned regSize() const { return IsO32 ? 4 : 8; }
-
-      /// numIntArgRegs - Number of integer registers available for calls.
-      unsigned numIntArgRegs() const;
-
-      /// reservedArgArea - The size of the area the caller reserves for
-      /// register arguments. This is 16-byte if ABI is O32.
-      unsigned reservedArgArea() const;
-
-      /// Return pointer to array of integer argument registers.
-      const MCPhysReg *intArgRegs() const;
-
-      typedef SmallVectorImpl<ByValArgInfo>::const_iterator byval_iterator;
-      byval_iterator byval_begin() const { return ByValArgs.begin(); }
-      byval_iterator byval_end() const { return ByValArgs.end(); }
-
-    private:
-      void handleByValArg(unsigned ValNo, MVT ValVT, MVT LocVT,
-                          CCValAssign::LocInfo LocInfo,
-                          ISD::ArgFlagsTy ArgFlags);
-
-      /// useRegsForByval - Returns true if the calling convention allows the
-      /// use of registers to pass byval arguments.
-      bool useRegsForByval() const { return CallConv != CallingConv::Fast; }
-
-      /// Return the function that analyzes fixed argument list functions.
-      llvm::CCAssignFn *fixedArgFn() const;
-
-      /// Return the function that analyzes variable argument list functions.
-      llvm::CCAssignFn *varArgFn() const;
-
-      const MCPhysReg *shadowRegs() const;
-
-      void allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize,
-                        unsigned Align);
-
-      /// Return the type of the register which is used to pass an argument or
-      /// return a value. This function returns f64 if the argument is an i64
-      /// value which has been generated as a result of softening an f128 value.
-      /// Otherwise, it just returns VT.
-      MVT getRegVT(MVT VT, const Type *OrigTy, const SDNode *CallNode,
-                   bool IsSoftFloat) const;
-
-      template<typename Ty>
-      void analyzeReturn(const SmallVectorImpl<Ty> &RetVals, bool IsSoftFloat,
-                         const SDNode *CallNode, const Type *RetTy) const;
-
-      CCState &CCInfo;
-      CallingConv::ID CallConv;
-      bool IsO32, IsFP64;
-      SpecialCallingConvType SpecialCallingConv;
-      SmallVector<ByValArgInfo, 2> ByValArgs;
-    };
   protected:
     SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
 
     // Subtarget Info
-    const MipsSubtarget *Subtarget;
+    const MipsSubtarget &Subtarget;
 
   private:
     // Create a TargetGlobalAddress node.
@@ -459,14 +385,12 @@
     SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
                           unsigned Flag) const;
 
-    MipsCC::SpecialCallingConvType getSpecialCallingConv(SDValue Callee) const;
     // Lower Operand helpers
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
-                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                            SDLoc dl, SelectionDAG &DAG,
-                            SmallVectorImpl<SDValue> &InVals,
-                            const SDNode *CallNode, const Type *RetTy) const;
+                            const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl,
+                            SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+                            TargetLowering::CallLoweringInfo &CLI) const;
 
     // Lower Operand specifics
     SDValue lowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
@@ -480,6 +404,7 @@
     SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerFABS(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
@@ -495,33 +420,34 @@
     /// isEligibleForTailCallOptimization - Check whether the call is eligible
     /// for tail call optimization.
     virtual bool
-    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+    isEligibleForTailCallOptimization(const CCState &CCInfo,
                                       unsigned NextStackOffset,
-                                      const MipsFunctionInfo& FI) const = 0;
+                                      const MipsFunctionInfo &FI) const = 0;
 
     /// copyByValArg - Copy argument registers which were used to pass a byval
     /// argument to the stack. Create a stack frame object for the byval
     /// argument.
-    void copyByValRegs(SDValue Chain, SDLoc DL,
-                       std::vector<SDValue> &OutChains, SelectionDAG &DAG,
-                       const ISD::ArgFlagsTy &Flags,
+    void copyByValRegs(SDValue Chain, SDLoc DL, std::vector<SDValue> &OutChains,
+                       SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
                        SmallVectorImpl<SDValue> &InVals,
-                       const Argument *FuncArg,
-                       const MipsCC &CC, const ByValArgInfo &ByVal) const;
+                       const Argument *FuncArg, unsigned FirstReg,
+                       unsigned LastReg, const CCValAssign &VA,
+                       MipsCCState &State) const;
 
     /// passByValArg - Pass a byval argument in registers or on stack.
     void passByValArg(SDValue Chain, SDLoc DL,
-                      std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                      std::deque<std::pair<unsigned, SDValue>> &RegsToPass,
                       SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
                       MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
-                      const MipsCC &CC, const ByValArgInfo &ByVal,
-                      const ISD::ArgFlagsTy &Flags, bool isLittle) const;
+                      unsigned FirstReg, unsigned LastReg,
+                      const ISD::ArgFlagsTy &Flags, bool isLittle,
+                      const CCValAssign &VA) const;
 
     /// writeVarArgRegs - Write variable function arguments passed in registers
     /// to the stack. Also create a stack frame object for the first variable
     /// argument.
-    void writeVarArgRegs(std::vector<SDValue> &OutChains, const MipsCC &CC,
-                         SDValue Chain, SDLoc DL, SelectionDAG &DAG) const;
+    void writeVarArgRegs(std::vector<SDValue> &OutChains, SDValue Chain,
+                         SDLoc DL, SelectionDAG &DAG, CCState &State) const;
 
     SDValue
       LowerFormalArguments(SDValue Chain,
@@ -560,7 +486,7 @@
     /// This function parses registers that appear in inline-asm constraints.
     /// It returns pair (0, 0) on failure.
     std::pair<unsigned, const TargetRegisterClass *>
-    parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const;
+    parseRegForInlineAsmConstraint(StringRef C, MVT VT) const;
 
     std::pair<unsigned, const TargetRegisterClass*>
               getRegForInlineAsmConstraint(const std::string &Constraint,
@@ -611,8 +537,12 @@
   };
 
   /// Create MipsTargetLowering objects.
-  const MipsTargetLowering *createMips16TargetLowering(MipsTargetMachine &TM);
-  const MipsTargetLowering *createMipsSETargetLowering(MipsTargetMachine &TM);
+  const MipsTargetLowering *
+  createMips16TargetLowering(const MipsTargetMachine &TM,
+                             const MipsSubtarget &STI);
+  const MipsTargetLowering *
+  createMipsSETargetLowering(const MipsTargetMachine &TM,
+                             const MipsSubtarget &STI);
 
   namespace Mips {
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -620,4 +550,4 @@
   }
 }
 
-#endif // MipsISELLOWERING_H
+#endif

diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 2260d53..2aa8328 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td

@@ -211,14 +211,14 @@
 }
 
 class BC1F_FT<string opstr, DAGOperand opnd, InstrItinClass Itin,
-              SDPatternOperator Op = null_frag>  :
+              SDPatternOperator Op = null_frag, bit DelaySlot = 1> :
   InstSE<(outs), (ins FCCRegsOpnd:$fcc, opnd:$offset),
          !strconcat(opstr, "\t$fcc, $offset"),
          [(MipsFPBrcond Op, FCCRegsOpnd:$fcc, bb:$offset)], Itin,
          FrmFI, opstr> {
   let isBranch = 1;
   let isTerminator = 1;
-  let hasDelaySlot = 1;
+  let hasDelaySlot = DelaySlot;
   let Defs = [AT];
 }
 
@@ -362,11 +362,15 @@
                           bitconvert>, MFC1_FM<0>;
 def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
                           bitconvert>, MFC1_FM<4>;
-def MFHC1 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>,
-            MFC1_FM<3>, ISA_MIPS32R2;
-def MTHC1_D32 : MMRel, MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
+def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
+                MFC1_FM<3>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>;
+def MFHC1_D64 : MFC1_FT<"mfhc1", GPR32Opnd, FGR64Opnd, II_MFHC1>,
+                MFC1_FM<3>, ISA_MIPS32R2, AdditionalRequires<[IsFP64bit]> {
+  let DecoderNamespace = "Mips64";
+}
+def MTHC1_D32 : MMRel, MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
                 MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[NotFP64bit]>;
-def MTHC1_D64 : MTC1_64_FT<"mthc1", AFGR64Opnd, GPR32Opnd, II_MTHC1>,
+def MTHC1_D64 : MTC1_64_FT<"mthc1", FGR64Opnd, GPR32Opnd, II_MTHC1>,
                 MFC1_FM<7>, ISA_MIPS32R2, AdditionalRequires<[IsFP64bit]> {
   let DecoderNamespace = "Mips64";
 }
@@ -400,30 +404,6 @@
 def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>,
            ISA_MIPS2, FGR_32;
 
-// Cop2 Memory Instructions
-// FIXME: These aren't really FPU instructions and as such don't belong in this
-//        file
-def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>,
-           ISA_MIPS1_NOT_32R6_64R6;
-def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>,
-           ISA_MIPS1_NOT_32R6_64R6;
-def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>,
-           ISA_MIPS2_NOT_32R6_64R6;
-def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>,
-           ISA_MIPS2_NOT_32R6_64R6;
-
-// Cop3 Memory Instructions
-// FIXME: These aren't really FPU instructions and as such don't belong in this
-//        file
-let DecoderNamespace = "COP3_" in {
-  def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>;
-  def SWC3 : SW_FT<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>;
-  def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>,
-             ISA_MIPS2;
-  def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>,
-             ISA_MIPS2;
-}
-
 // Indexed loads and stores.
 // Base register + offset register addressing mode (indicated by "x" in the
 // instruction mnemonic) is disallowed under NaCl.
@@ -526,8 +506,12 @@
 
 def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>,
            BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, IIBranch, MIPS_BRANCH_F, 0>,
+            BC1F_FM<1, 0>, ISA_MIPS2_NOT_32R6_64R6;
 def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>,
            BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6;
+def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, IIBranch, MIPS_BRANCH_T, 0>,
+            BC1F_FM<1, 1>, ISA_MIPS2_NOT_32R6_64R6;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Flag Conditions
@@ -593,8 +577,12 @@
 //===----------------------------------------------------------------------===//
 def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>,
       ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"bc1tl $offset", (BC1TL FCC0, brtarget:$offset)>,
+      ISA_MIPS2_NOT_32R6_64R6;
 def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>,
       ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"bc1fl $offset", (BC1FL FCC0, brtarget:$offset)>,
+      ISA_MIPS2_NOT_32R6_64R6;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Patterns

diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 6a01ae5..5c91fbc 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td

@@ -440,7 +440,7 @@
   let Inst{5-0}   = funct;
 }
 
-class RDHWR_FM {
+class RDHWR_FM : StdArch {
   bits<5> rt;
   bits<5> rd;
 

diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index d6da6c6..dcc0e24 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp

@@ -30,15 +30,15 @@
 // Pin the vtable to this file.
 void MipsInstrInfo::anchor() {}
 
-MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm, unsigned UncondBr)
-  : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
-    TM(tm), UncondBrOpc(UncondBr) {}
+MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr)
+    : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+      Subtarget(STI), UncondBrOpc(UncondBr) {}
 
-const MipsInstrInfo *MipsInstrInfo::create(MipsTargetMachine &TM) {
-  if (TM.getSubtargetImpl()->inMips16Mode())
-    return llvm::createMips16InstrInfo(TM);
+const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) {
+  if (STI.inMips16Mode())
+    return llvm::createMips16InstrInfo(STI);
 
-  return llvm::createMipsSEInstrInfo(TM);
+  return llvm::createMipsSEInstrInfo(STI);
 }
 
 bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const {
@@ -94,10 +94,10 @@
   return (BT == BT_None) || (BT == BT_Indirect);
 }
 
-void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB,
-                                MachineBasicBlock *TBB, DebugLoc DL,
-                                const SmallVectorImpl<MachineOperand>& Cond)
-  const {
+void
+MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           DebugLoc DL,
+                           const SmallVectorImpl<MachineOperand> &Cond) const {
   unsigned Opc = Cond[0].getImm();
   const MCInstrDesc &MCID = get(Opc);
   MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
@@ -113,11 +113,9 @@
   MIB.addMBB(TBB);
 }
 
-unsigned MipsInstrInfo::
-InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-             MachineBasicBlock *FBB,
-             const SmallVectorImpl<MachineOperand> &Cond,
-             DebugLoc DL) const {
+unsigned MipsInstrInfo::InsertBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+    const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
 
@@ -145,9 +143,7 @@
   return 1;
 }
 
-unsigned MipsInstrInfo::
-RemoveBranch(MachineBasicBlock &MBB) const
-{
+unsigned MipsInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
   MachineBasicBlock::reverse_iterator FirstBr;
   unsigned removed;
@@ -160,7 +156,7 @@
 
   // Up to 2 branches are removed.
   // Note that indirect branches are not removed.
-  for(removed = 0; I != REnd && removed < 2; ++I, ++removed)
+  for (removed = 0; I != REnd && removed < 2; ++I, ++removed)
     if (!getAnalyzableBrOpc(I->getOpcode()))
       break;
 
@@ -171,20 +167,18 @@
 
 /// ReverseBranchCondition - Return the inverse opcode of the
 /// specified Branch instruction.
-bool MipsInstrInfo::
-ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
-{
+bool MipsInstrInfo::ReverseBranchCondition(
+    SmallVectorImpl<MachineOperand> &Cond) const {
   assert( (Cond.size() && Cond.size() <= 3) &&
           "Invalid Mips branch condition!");
   Cond[0].setImm(getOppositeBranchOpc(Cond[0].getImm()));
   return false;
 }
 
-MipsInstrInfo::BranchType MipsInstrInfo::
-AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-              MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond,
-              bool AllowModify,
-              SmallVectorImpl<MachineInstr*> &BranchInstrs) const {
+MipsInstrInfo::BranchType MipsInstrInfo::AnalyzeBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+    SmallVectorImpl<MachineOperand> &Cond, bool AllowModify,
+    SmallVectorImpl<MachineInstr *> &BranchInstrs) const {
 
   MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
 

diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 742193f..db149d4 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h

@@ -15,8 +15,8 @@
 // size in bytes; MipsLongBranch only expects it to be the correct upper bound.
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSINSTRUCTIONINFO_H
-#define MIPSINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H
 
 #include "Mips.h"
 #include "MipsAnalyzeImmediate.h"
@@ -33,7 +33,7 @@
 class MipsInstrInfo : public MipsGenInstrInfo {
   virtual void anchor();
 protected:
-  MipsTargetMachine &TM;
+  const MipsSubtarget &Subtarget;
   unsigned UncondBrOpc;
 
 public:
@@ -46,9 +46,9 @@
     BT_Indirect    // One indirct branch.
   };
 
-  explicit MipsInstrInfo(MipsTargetMachine &TM, unsigned UncondBrOpc);
+  explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc);
 
-  static const MipsInstrInfo *create(MipsTargetMachine &TM);
+  static const MipsInstrInfo *create(MipsSubtarget &STI);
 
   /// Branch Analysis
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
@@ -140,8 +140,8 @@
 };
 
 /// Create MipsInstrInfo objects.
-const MipsInstrInfo *createMips16InstrInfo(MipsTargetMachine &TM);
-const MipsInstrInfo *createMipsSEInstrInfo(MipsTargetMachine &TM);
+const MipsInstrInfo *createMips16InstrInfo(const MipsSubtarget &STI);
+const MipsInstrInfo *createMipsSEInstrInfo(const MipsSubtarget &STI);
 
 }
 

diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 8e9472c..aebac34 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td

@@ -331,7 +331,7 @@
 
 def MipsJumpTargetAsmOperand : AsmOperandClass {
   let Name = "JumpTarget";
-  let ParserMethod = "ParseJumpTarget";
+  let ParserMethod = "parseJumpTarget";
   let PredicateMethod = "isImm";
   let RenderMethod = "addImmOperands";
 }
@@ -672,28 +672,62 @@
   let DecoderMethod = "DecodeMem";
 }
 
+// COP2 Load/Store
+class LW_FT2<string opstr, RegisterOperand RC, InstrItinClass Itin,
+             SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem2";
+  let mayLoad = 1;
+}
+
+class SW_FT2<string opstr, RegisterOperand RC, InstrItinClass Itin,
+             SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem2";
+  let mayStore = 1;
+}
+
+// COP3 Load/Store
+class LW_FT3<string opstr, RegisterOperand RC, InstrItinClass Itin,
+             SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem3";
+  let mayLoad = 1;
+}
+
+class SW_FT3<string opstr, RegisterOperand RC, InstrItinClass Itin,
+             SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI, opstr> {
+  let DecoderMethod = "DecodeFMem3";
+  let mayStore = 1;
+}
+
 // Conditional Branch
 class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op,
-              RegisterOperand RO> :
+              RegisterOperand RO, bit DelaySlot = 1> :
   InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset),
          !strconcat(opstr, "\t$rs, $rt, $offset"),
          [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], IIBranch,
          FrmI, opstr> {
   let isBranch = 1;
   let isTerminator = 1;
-  let hasDelaySlot = 1;
+  let hasDelaySlot = DelaySlot;
   let Defs = [AT];
 }
 
 class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
-                  RegisterOperand RO> :
+                  RegisterOperand RO, bit DelaySlot = 1> :
   InstSE<(outs), (ins RO:$rs, opnd:$offset),
          !strconcat(opstr, "\t$rs, $offset"),
          [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch,
          FrmI, opstr> {
   let isBranch = 1;
   let isTerminator = 1;
-  let hasDelaySlot = 1;
+  let hasDelaySlot = DelaySlot;
   let Defs = [AT];
 }
 
@@ -765,9 +799,12 @@
     InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
            [], IIBranch, FrmR>;
 
-  class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO> :
+  class BGEZAL_FT<string opstr, DAGOperand opnd,
+                  RegisterOperand RO, bit DelaySlot = 1> :
     InstSE<(outs), (ins RO:$rs, opnd:$offset),
-           !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>;
+           !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr> {
+    let hasDelaySlot = DelaySlot;
+  }
 
 }
 
@@ -933,7 +970,7 @@
 // Read Hardware
 class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
   InstSE<(outs CPURegOperand:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
-         II_RDHWR, FrmR>;
+         II_RDHWR, FrmR, "rdhwr">;
 
 // Ext and Ins
 class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
@@ -1059,18 +1096,20 @@
 //===----------------------------------------------------------------------===//
 
 /// Arithmetic Instructions (ALU Immediate)
+let AdditionalPredicates = [NotInMicroMips] in {
 def ADDiu : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU, immSExt16,
-                               add>,
-            ADDI_FM<0x9>, IsAsCheapAsAMove;
+                               add>, ADDI_FM<0x9>, IsAsCheapAsAMove;
+}
 def ADDi  : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>,
             ISA_MIPS1_NOT_32R6_64R6;
 def SLTi  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
             SLTI_FM<0xa>;
 def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
             SLTI_FM<0xb>;
+let AdditionalPredicates = [NotInMicroMips] in {
 def ANDi  : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd, II_ANDI, immZExt16,
-                               and>,
-            ADDI_FM<0xc>;
+                               and>, ADDI_FM<0xc>;
+}
 def ORi   : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, II_ORI, immZExt16,
                                or>,
             ADDI_FM<0xd>;
@@ -1100,10 +1139,12 @@
 def NOR   : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
 
 /// Shift Instructions
+let AdditionalPredicates = [NotInMicroMips] in {
 def SLL  : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL, shl,
                                    immZExt5>, SRA_FM<0, 0>;
 def SRL  : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, II_SRL, srl,
                                    immZExt5>, SRA_FM<2, 0>;
+}
 def SRA  : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, II_SRA, sra,
                                    immZExt5>, SRA_FM<3, 0>;
 def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, II_SLLV, shl>,
@@ -1147,13 +1188,34 @@
           ISA_MIPS1_NOT_32R6_64R6;
 }
 
+// COP2 Memory Instructions
+def LWC2 : LW_FT2<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>,
+           ISA_MIPS1_NOT_32R6_64R6;
+def SWC2 : SW_FT2<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>,
+           ISA_MIPS1_NOT_32R6_64R6;
+def LDC2 : LW_FT2<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def SDC2 : SW_FT2<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>,
+           ISA_MIPS2_NOT_32R6_64R6;
+
+// COP3 Memory Instructions
+let DecoderNamespace = "COP3_" in {
+  def LWC3 : LW_FT3<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>;
+  def SWC3 : SW_FT3<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>;
+  def LDC3 : LW_FT3<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>,
+             ISA_MIPS2;
+  def SDC3 : SW_FT3<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>,
+             ISA_MIPS2;
+}
+
 def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32;
-def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>;
-def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>;
-def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>;
-def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>;
-def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>;
-def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>;
+
+def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2;
+def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2;
+def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2;
+def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2;
+def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2;
+def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2;
 
 def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>,
            ISA_MIPS2_NOT_32R6_64R6;
@@ -1171,7 +1233,7 @@
 def BREAK : MMRel, BRK_FT<"break">, BRK_FM<0xd>;
 def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>;
 def TRAP : TrapBase<BREAK>;
-def SDBBP : SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
+def SDBBP : MMRel, SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6;
 
 def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32;
 def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32;
@@ -1193,15 +1255,27 @@
               AdditionalRequires<[RelocStatic]>, IsBranch;
 def JR      : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>;
 def BEQ     : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
+def BEQL    : MMRel, CBranch<"beql", brtarget, seteq, GPR32Opnd, 0>,
+              BEQ_FM<20>, ISA_MIPS2_NOT_32R6_64R6;
 def BNE     : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
+def BNEL    : MMRel, CBranch<"bnel", brtarget, setne, GPR32Opnd, 0>,
+              BEQ_FM<21>, ISA_MIPS2_NOT_32R6_64R6;
 def BGEZ    : MMRel, CBranchZero<"bgez", brtarget, setge, GPR32Opnd>,
               BGEZ_FM<1, 1>;
+def BGEZL   : MMRel, CBranchZero<"bgezl", brtarget, setge, GPR32Opnd, 0>,
+              BGEZ_FM<1, 3>, ISA_MIPS2_NOT_32R6_64R6;
 def BGTZ    : MMRel, CBranchZero<"bgtz", brtarget, setgt, GPR32Opnd>,
               BGEZ_FM<7, 0>;
+def BGTZL   : MMRel, CBranchZero<"bgtzl", brtarget, setgt, GPR32Opnd, 0>,
+              BGEZ_FM<23, 0>, ISA_MIPS2_NOT_32R6_64R6;
 def BLEZ    : MMRel, CBranchZero<"blez", brtarget, setle, GPR32Opnd>,
               BGEZ_FM<6, 0>;
+def BLEZL   : MMRel, CBranchZero<"blezl", brtarget, setle, GPR32Opnd, 0>,
+              BGEZ_FM<22, 0>, ISA_MIPS2_NOT_32R6_64R6;
 def BLTZ    : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
               BGEZ_FM<1, 0>;
+def BLTZL   : MMRel, CBranchZero<"bltzl", brtarget, setlt, GPR32Opnd, 0>,
+              BGEZ_FM<1, 2>, ISA_MIPS2_NOT_32R6_64R6;
 def B       : UncondBranch<BEQ>;
 
 def JAL  : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
@@ -1214,8 +1288,12 @@
 def JALX  : JumpLink<"jalx", calltarget>, FJ<0x1D>, ISA_MIPS32_NOT_32R6_64R6;
 def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>,
              ISA_MIPS1_NOT_32R6_64R6;
+def BGEZALL : MMRel, BGEZAL_FT<"bgezall", brtarget, GPR32Opnd, 0>,
+              BGEZAL_FM<0x13>, ISA_MIPS2_NOT_32R6_64R6;
 def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>,
              ISA_MIPS1_NOT_32R6_64R6;
+def BLTZALL : MMRel, BGEZAL_FT<"bltzall", brtarget, GPR32Opnd, 0>,
+              BGEZAL_FM<0x12>, ISA_MIPS2_NOT_32R6_64R6;
 def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
 def TAILCALL : TailCall<J>;
 def TAILCALL_R : TailCallReg<GPR32Opnd, JR>;
@@ -1350,7 +1428,7 @@
 def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU,
                                0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6;
 
-def RDHWR : ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
+def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
 
 def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>;
 def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>;
@@ -1408,19 +1486,21 @@
 def JALR_HB : JALR_HB_DESC, JALR_HB_ENC, ISA_MIPS32;
 
 class TLB<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
-                                      FrmOther>;
-def TLBP : TLB<"tlbp">, COP0_TLB_FM<0x08>;
-def TLBR : TLB<"tlbr">, COP0_TLB_FM<0x01>;
-def TLBWI : TLB<"tlbwi">, COP0_TLB_FM<0x02>;
-def TLBWR : TLB<"tlbwr">, COP0_TLB_FM<0x06>;
+                                      FrmOther, asmstr>;
+def TLBP : MMRel, TLB<"tlbp">, COP0_TLB_FM<0x08>;
+def TLBR : MMRel, TLB<"tlbr">, COP0_TLB_FM<0x01>;
+def TLBWI : MMRel, TLB<"tlbwi">, COP0_TLB_FM<0x02>;
+def TLBWR : MMRel, TLB<"tlbwr">, COP0_TLB_FM<0x06>;
 
-class CacheOp<string instr_asm, Operand MemOpnd, RegisterOperand GPROpnd> :
+class CacheOp<string instr_asm, Operand MemOpnd> :
     InstSE<(outs), (ins  MemOpnd:$addr, uimm5:$hint),
-           !strconcat(instr_asm, "\t$hint, $addr"), [], NoItinerary, FrmOther>;
+           !strconcat(instr_asm, "\t$hint, $addr"), [], NoItinerary, FrmOther> {
+  let DecoderMethod = "DecodeCacheOp";
+}
 
-def CACHE : CacheOp<"cache", mem, GPR32Opnd>, CACHEOP_FM<0b101111>,
+def CACHE : CacheOp<"cache", mem>, CACHEOP_FM<0b101111>,
             INSN_MIPS3_32_NOT_32R6_64R6;
-def PREF :  CacheOp<"pref", mem, GPR32Opnd>, CACHEOP_FM<0b110011>,
+def PREF :  CacheOp<"pref", mem>, CACHEOP_FM<0b110011>,
             INSN_MIPS3_32_NOT_32R6_64R6;
 
 //===----------------------------------------------------------------------===//
@@ -1435,8 +1515,14 @@
       ISA_MIPS1_NOT_32R6_64R6;
 def : MipsInstAlias<"addu $rs, $rt, $imm",
                     (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"addu $rs, $imm",
+                    (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
 def : MipsInstAlias<"add $rs, $rt, $imm",
-                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>,
+                    ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"add $rs, $imm",
+                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>,
+                    ISA_MIPS1_NOT_32R6_64R6;
 def : MipsInstAlias<"and $rs, $rt, $imm",
                     (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
 def : MipsInstAlias<"and $rs, $imm",
@@ -1480,25 +1566,30 @@
     
 def : MipsInstAlias<"break", (BREAK 0, 0), 1>;
 def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
-def : MipsInstAlias<"ei", (EI ZERO), 1>;
-def : MipsInstAlias<"di", (DI ZERO), 1>;
+def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2;
+def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2;
 
-def  : MipsInstAlias<"teq $rs, $rt", (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def  : MipsInstAlias<"tge $rs, $rt", (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def  : MipsInstAlias<"tgeu $rs, $rt", (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0),
-                     1>;
-def  : MipsInstAlias<"tlt $rs, $rt", (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def  : MipsInstAlias<"tltu $rs, $rt", (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0),
-                     1>;
-def  : MipsInstAlias<"tne $rs, $rt", (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : MipsInstAlias<"teq $rs, $rt",
+                    (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+def : MipsInstAlias<"tge $rs, $rt",
+                    (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+def : MipsInstAlias<"tgeu $rs, $rt",
+                    (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+def : MipsInstAlias<"tlt $rs, $rt",
+                    (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+def : MipsInstAlias<"tltu $rs, $rt",
+                    (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+def : MipsInstAlias<"tne $rs, $rt",
+                    (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2;
+
 def  : MipsInstAlias<"sll $rd, $rt, $rs",
                      (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
 def : MipsInstAlias<"sub, $rd, $rs, $imm",
                     (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs,
-                          InvertedImOperand:$imm), 0>;
+                          InvertedImOperand:$imm), 0>, ISA_MIPS1_NOT_32R6_64R6;
 def : MipsInstAlias<"sub $rs, $imm",
                     (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, InvertedImOperand:$imm),
-                    0>;
+                    0>, ISA_MIPS1_NOT_32R6_64R6;
 def : MipsInstAlias<"subu, $rd, $rs, $imm",
                     (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs,
                            InvertedImOperand:$imm), 0>;
@@ -1563,6 +1654,12 @@
                 (ADDiu GPR32:$src, imm:$imm)>;
 }
 
+// Support multiplication for pre-Mips32 targets that don't have
+// the MUL instruction.
+def : MipsPat<(mul GPR32:$lhs, GPR32:$rhs),
+              (PseudoMFLO (PseudoMULT GPR32:$lhs, GPR32:$rhs))>,
+      ISA_MIPS1_NOT_32R6_64R6;
+
 // SYNC
 def : MipsPat<(MipsSync (i32 immz)),
               (SYNC 0)>, ISA_MIPS2;

diff --git a/lib/Target/Mips/MipsJITInfo.cpp b/lib/Target/Mips/MipsJITInfo.cpp
deleted file mode 100644
index 2072488..0000000
--- a/lib/Target/Mips/MipsJITInfo.cpp
+++ /dev/null

@@ -1,286 +0,0 @@
-//===-- MipsJITInfo.cpp - Implement the Mips JIT Interface ----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the JIT interfaces for the Mips target.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MipsJITInfo.h"
-#include "MipsInstrInfo.h"
-#include "MipsRelocations.h"
-#include "MipsSubtarget.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Memory.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdlib>
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-
-void MipsJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
-  unsigned NewAddr = (intptr_t)New;
-  unsigned OldAddr = (intptr_t)Old;
-  const unsigned NopInstr = 0x0;
-
-  // If the functions are in the same memory segment, insert PC-region branch.
-  if ((NewAddr & 0xF0000000) == ((OldAddr + 4) & 0xF0000000)) {
-    unsigned *OldInstruction = (unsigned *)Old;
-    *OldInstruction = 0x08000000;
-    unsigned JTargetAddr = NewAddr & 0x0FFFFFFC;
-
-    JTargetAddr >>= 2;
-    *OldInstruction |= JTargetAddr;
-
-    // Insert a NOP.
-    OldInstruction++;
-    *OldInstruction = NopInstr;
-
-    sys::Memory::InvalidateInstructionCache(Old, 2 * 4);
-  } else {
-    // We need to clear hint bits from the instruction, in case it is 'jr ra'.
-    const unsigned HintMask = 0xFFFFF83F, ReturnSequence = 0x03e00008;
-    unsigned* CurrentInstr = (unsigned*)Old;
-    unsigned CurrInstrHintClear = (*CurrentInstr) & HintMask;
-    unsigned* NextInstr = CurrentInstr + 1;
-    unsigned NextInstrHintClear = (*NextInstr) & HintMask;
-
-    // Do absolute jump if there are 2 or more instructions before return from
-    // the old function.
-    if ((CurrInstrHintClear != ReturnSequence) &&
-        (NextInstrHintClear != ReturnSequence)) {
-      const unsigned LuiT0Instr = 0x3c080000, AddiuT0Instr = 0x25080000;
-      const unsigned JrT0Instr = 0x01000008;
-      // lui  t0,  high 16 bit of the NewAddr
-      (*(CurrentInstr++)) = LuiT0Instr | ((NewAddr & 0xffff0000) >> 16);
-      // addiu  t0, t0, low 16 bit of the NewAddr
-      (*(CurrentInstr++)) = AddiuT0Instr | (NewAddr & 0x0000ffff);
-      // jr t0
-      (*(CurrentInstr++)) = JrT0Instr;
-      (*CurrentInstr) = NopInstr;
-
-      sys::Memory::InvalidateInstructionCache(Old, 4 * 4);
-    } else {
-      // Unsupported case
-      report_fatal_error("MipsJITInfo::replaceMachineCodeForFunction");
-    }
-  }
-}
-
-/// JITCompilerFunction - This contains the address of the JIT function used to
-/// compile a function lazily.
-static TargetJITInfo::JITCompilerFn JITCompilerFunction;
-
-// Get the ASMPREFIX for the current host.  This is often '_'.
-#ifndef __USER_LABEL_PREFIX__
-#define __USER_LABEL_PREFIX__
-#endif
-#define GETASMPREFIX2(X) #X
-#define GETASMPREFIX(X) GETASMPREFIX2(X)
-#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
-
-// CompilationCallback stub - We can't use a C function with inline assembly in
-// it, because the prolog/epilog inserted by GCC won't work for us. Instead,
-// write our own wrapper, which does things our way, so we have complete control
-// over register saving and restoring. This code saves registers, calls
-// MipsCompilationCallbackC and restores registers.
-extern "C" {
-#if defined (__mips__)
-void MipsCompilationCallback();
-
-  asm(
-    ".text\n"
-    ".align 2\n"
-    ".globl " ASMPREFIX "MipsCompilationCallback\n"
-    ASMPREFIX "MipsCompilationCallback:\n"
-    ".ent " ASMPREFIX "MipsCompilationCallback\n"
-    ".frame  $sp, 32, $ra\n"
-    ".set  noreorder\n"
-    ".cpload $t9\n"
-
-    "addiu $sp, $sp, -64\n"
-    ".cprestore 16\n"
-
-    // Save argument registers a0, a1, a2, a3, f12, f14 since they may contain
-    // stuff for the real target function right now. We have to act as if this
-    // whole compilation callback doesn't exist as far as the caller is
-    // concerned. We also need to save the ra register since it contains the
-    // original return address, and t8 register since it contains the address
-    // of the end of function stub.
-    "sw $a0, 20($sp)\n"
-    "sw $a1, 24($sp)\n"
-    "sw $a2, 28($sp)\n"
-    "sw $a3, 32($sp)\n"
-    "sw $ra, 36($sp)\n"
-    "sw $t8, 40($sp)\n"
-    "sdc1 $f12, 48($sp)\n"
-    "sdc1 $f14, 56($sp)\n"
-
-    // t8 points at the end of function stub. Pass the beginning of the stub
-    // to the MipsCompilationCallbackC.
-    "addiu $a0, $t8, -16\n"
-    "jal " ASMPREFIX "MipsCompilationCallbackC\n"
-    "nop\n"
-
-    // Restore registers.
-    "lw $a0, 20($sp)\n"
-    "lw $a1, 24($sp)\n"
-    "lw $a2, 28($sp)\n"
-    "lw $a3, 32($sp)\n"
-    "lw $ra, 36($sp)\n"
-    "lw $t8, 40($sp)\n"
-    "ldc1 $f12, 48($sp)\n"
-    "ldc1 $f14, 56($sp)\n"
-    "addiu $sp, $sp, 64\n"
-
-    // Jump to the (newly modified) stub to invoke the real function.
-    "addiu $t8, $t8, -16\n"
-    "jr $t8\n"
-    "nop\n"
-
-    ".set  reorder\n"
-    ".end " ASMPREFIX "MipsCompilationCallback\n"
-      );
-#else  // host != Mips
-  void MipsCompilationCallback() {
-    llvm_unreachable(
-      "Cannot call MipsCompilationCallback() on a non-Mips arch!");
-  }
-#endif
-}
-
-/// MipsCompilationCallbackC - This is the target-specific function invoked
-/// by the function stub when we did not know the real target of a call.
-/// This function must locate the start of the stub or call site and pass
-/// it into the JIT compiler function.
-extern "C" void MipsCompilationCallbackC(intptr_t StubAddr) {
-  // Get the address of the compiled code for this function.
-  intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
-
-  // Rewrite the function stub so that we don't end up here every time we
-  // execute the call. We're replacing the first four instructions of the
-  // stub with code that jumps to the compiled function:
-  //   lui $t9, %hi(NewVal)
-  //   addiu $t9, $t9, %lo(NewVal)
-  //   jr $t9
-  //   nop
-
-  int Hi = ((unsigned)NewVal & 0xffff0000) >> 16;
-  if ((NewVal & 0x8000) != 0)
-    Hi++;
-  int Lo = (int)(NewVal & 0xffff);
-
-  *(intptr_t *)(StubAddr) = 0xf << 26 | 25 << 16 | Hi;
-  *(intptr_t *)(StubAddr + 4) = 9 << 26 | 25 << 21 | 25 << 16 | Lo;
-  *(intptr_t *)(StubAddr + 8) = 25 << 21 | 8;
-  *(intptr_t *)(StubAddr + 12) = 0;
-
-  sys::Memory::InvalidateInstructionCache((void*) StubAddr, 16);
-}
-
-TargetJITInfo::LazyResolverFn MipsJITInfo::getLazyResolverFunction(
-    JITCompilerFn F) {
-  JITCompilerFunction = F;
-  return MipsCompilationCallback;
-}
-
-TargetJITInfo::StubLayout MipsJITInfo::getStubLayout() {
-  // The stub contains 4 4-byte instructions, aligned at 4 bytes. See
-  // emitFunctionStub for details.
-  StubLayout Result = { 4*4, 4 };
-  return Result;
-}
-
-void *MipsJITInfo::emitFunctionStub(const Function *F, void *Fn,
-                                    JITCodeEmitter &JCE) {
-  JCE.emitAlignment(4);
-  void *Addr = (void*) (JCE.getCurrentPCValue());
-  if (!sys::Memory::setRangeWritable(Addr, 16))
-    llvm_unreachable("ERROR: Unable to mark stub writable.");
-
-  intptr_t EmittedAddr;
-  if (Fn != (void*)(intptr_t)MipsCompilationCallback)
-    EmittedAddr = (intptr_t)Fn;
-  else
-    EmittedAddr = (intptr_t)MipsCompilationCallback;
-
-
-  int Hi = ((unsigned)EmittedAddr & 0xffff0000) >> 16;
-  if ((EmittedAddr & 0x8000) != 0)
-    Hi++;
-  int Lo = (int)(EmittedAddr & 0xffff);
-
-  // lui $t9, %hi(EmittedAddr)
-  // addiu $t9, $t9, %lo(EmittedAddr)
-  // jalr $t8, $t9
-  // nop
-  if (IsLittleEndian) {
-    JCE.emitWordLE(0xf << 26 | 25 << 16 | Hi);
-    JCE.emitWordLE(9 << 26 | 25 << 21 | 25 << 16 | Lo);
-    JCE.emitWordLE(25 << 21 | 24 << 11 | 9);
-    JCE.emitWordLE(0);
-  } else {
-    JCE.emitWordBE(0xf << 26 | 25 << 16 | Hi);
-    JCE.emitWordBE(9 << 26 | 25 << 21 | 25 << 16 | Lo);
-    JCE.emitWordBE(25 << 21 | 24 << 11 | 9);
-    JCE.emitWordBE(0);
-  }
-
-  sys::Memory::InvalidateInstructionCache(Addr, 16);
-  if (!sys::Memory::setRangeExecutable(Addr, 16))
-    llvm_unreachable("ERROR: Unable to mark stub executable.");
-
-  return Addr;
-}
-
-/// relocate - Before the JIT can run a block of code that has been emitted,
-/// it must rewrite the code to contain the actual addresses of any
-/// referenced global symbols.
-void MipsJITInfo::relocate(void *Function, MachineRelocation *MR,
-                           unsigned NumRelocs, unsigned char *GOTBase) {
-  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
-
-    void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
-    intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
-
-    switch ((Mips::RelocationType) MR->getRelocationType()) {
-    case Mips::reloc_mips_pc16:
-      ResultPtr = (((ResultPtr - (intptr_t) RelocPos) - 4) >> 2) & 0xffff;
-      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
-      break;
-
-    case Mips::reloc_mips_26:
-      ResultPtr = (ResultPtr & 0x0fffffff) >> 2;
-      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
-      break;
-
-    case Mips::reloc_mips_hi:
-      ResultPtr = ResultPtr >> 16;
-      if ((((intptr_t) (MR->getResultPointer()) & 0xffff) >> 15) == 1) {
-        ResultPtr += 1;
-      }
-      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
-      break;
-
-    case Mips::reloc_mips_lo: {
-      // Addend is needed for unaligned load/store instructions, where offset
-      // for the second load/store in the expanded instruction sequence must
-      // be modified by +1 or +3. Otherwise, Addend is 0.
-      int Addend = *((unsigned*) RelocPos) & 0xffff;
-      ResultPtr = (ResultPtr + Addend) & 0xffff;
-      *((unsigned*) RelocPos) &= 0xffff0000;
-      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
-      break;
-    }
-    }
-  }
-}

diff --git a/lib/Target/Mips/MipsJITInfo.h b/lib/Target/Mips/MipsJITInfo.h
deleted file mode 100644
index c9dfd83..0000000
--- a/lib/Target/Mips/MipsJITInfo.h
+++ /dev/null

@@ -1,71 +0,0 @@
-//===- MipsJITInfo.h - Mips Implementation of the JIT Interface -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the MipsJITInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MIPSJITINFO_H
-#define MIPSJITINFO_H
-
-#include "MipsMachineFunction.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/Target/TargetJITInfo.h"
-
-namespace llvm {
-class MipsTargetMachine;
-
-class MipsJITInfo : public TargetJITInfo {
-
-  bool IsPIC;
-  bool IsLittleEndian;
-
-  public:
-    explicit MipsJITInfo() :
-      IsPIC(false), IsLittleEndian(true) {}
-
-    /// replaceMachineCodeForFunction - Make it so that calling the function
-    /// whose machine code is at OLD turns into a call to NEW, perhaps by
-    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
-    /// code.
-    ///
-    void replaceMachineCodeForFunction(void *Old, void *New) override;
-
-    // getStubLayout - Returns the size and alignment of the largest call stub
-    // on Mips.
-    StubLayout getStubLayout() override;
-
-    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
-    /// small native function that simply calls the function at the specified
-    /// address.
-    void *emitFunctionStub(const Function *F, void *Fn,
-                           JITCodeEmitter &JCE) override;
-
-    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
-    LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
-
-    /// relocate - Before the JIT can run a block of code that has been emitted,
-    /// it must rewrite the code to contain the actual addresses of any
-    /// referenced global symbols.
-    void relocate(void *Function, MachineRelocation *MR,
-                  unsigned NumRelocs, unsigned char *GOTBase) override;
-
-    /// Initialize - Initialize internal stage for the function being JITted.
-    void Initialize(const MachineFunction &MF, bool isPIC,
-                    bool isLittleEndian) {
-      IsPIC = isPIC;
-      IsLittleEndian = isLittleEndian;
-    }
-
-};
-}
-
-#endif

diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index c6838a3..e44d6ee 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp

@@ -16,6 +16,7 @@
 #include "Mips.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MCTargetDesc/MipsMCNaCl.h"
+#include "MipsMachineFunction.h"
 #include "MipsTargetMachine.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -64,8 +65,8 @@
     MipsLongBranch(TargetMachine &tm)
       : MachineFunctionPass(ID), TM(tm),
         IsPIC(TM.getRelocationModel() == Reloc::PIC_),
-        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
-        LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 10 :
+        ABI(TM.getSubtarget<MipsSubtarget>().getABI()),
+        LongBranchSeqSize(!IsPIC ? 2 : (ABI.IsN64() ? 10 :
             (!TM.getSubtarget<MipsSubtarget>().isTargetNaCl() ? 9 : 10))) {}
 
     const char *getPassName() const override {
@@ -86,7 +87,7 @@
     MachineFunction *MF;
     SmallVector<MBBInfo, 16> MBBInfos;
     bool IsPIC;
-    unsigned ABI;
+    MipsABIInfo ABI;
     unsigned LongBranchSeqSize;
   };
 
@@ -170,7 +171,7 @@
   MBBInfos.resize(MF->size());
 
   const MipsInstrInfo *TII =
-    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
   for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
     MachineBasicBlock *MBB = MF->getBlockNumbered(I);
 
@@ -217,7 +218,7 @@
 void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
                                    DebugLoc DL, MachineBasicBlock *MBBOpnd) {
   const MipsInstrInfo *TII =
-    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
   unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
   const MCInstrDesc &NewDesc = TII->get(NewOpc);
 
@@ -254,7 +255,7 @@
   MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
 
   const MipsInstrInfo *TII =
-    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
 
   MF->insert(FallThroughMBB, LongBrMBB);
   MBB->removeSuccessor(TgtMBB);
@@ -273,7 +274,7 @@
     const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
     unsigned BalOp = Subtarget.hasMips32r6() ? Mips::BAL : Mips::BAL_BR;
 
-    if (ABI != MipsSubtarget::N64) {
+    if (!ABI.IsN64()) {
       // $longbr:
       //  addiu $sp, $sp, -8
       //  sw $ra, 0($sp)
@@ -447,9 +448,10 @@
 
 bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
   const MipsInstrInfo *TII =
-    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+      static_cast<const MipsInstrInfo *>(TM.getSubtargetImpl()->getInstrInfo());
 
-  if (TM.getSubtarget<MipsSubtarget>().inMips16Mode())
+  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+  if (STI.inMips16Mode() || !STI.enableLongBranchPass())
     return false;
   if ((TM.getRelocationModel() == Reloc::PIC_) &&
       TM.getSubtarget<MipsSubtarget>().isABI_O32() &&

diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index 269190f..1ce27e4 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSMCINSTLOWER_H
-#define MIPSMCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
 #include "MCTargetDesc/MipsMCExpr.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineOperand.h"

diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index 285bb14..68230e6 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td

@@ -69,7 +69,7 @@
 // as the encoded value should be subtracted by one.
 def uimm2LSAAsmOperand : AsmOperandClass {
   let Name = "LSAImm";
-  let ParserMethod = "ParseLSAImm";
+  let ParserMethod = "parseLSAImm";
   let RenderMethod = "addImmOperands";
 }
 

diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index e30302e..a89718a 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp

@@ -24,7 +24,7 @@
                  cl::desc("Always use $gp as the global base register."));
 
 // class MipsCallEntry.
-MipsCallEntry::MipsCallEntry(const StringRef &N) {
+MipsCallEntry::MipsCallEntry(StringRef N) {
 #ifndef NDEBUG
   Name = N;
   Val = nullptr;
@@ -119,7 +119,7 @@
                         || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]);
 }
 
-MachinePointerInfo MipsFunctionInfo::callPtrInfo(const StringRef &Name) {
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(StringRef Name) {
   const MipsCallEntry *&E = ExternalCallEntries[Name];
 
   if (!E)
@@ -137,4 +137,12 @@
   return MachinePointerInfo(E);
 }
 
+int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) {
+  // Create the stack object lazily on first request; -1 means "none yet".
+  if (MoveF64ViaSpillFI == -1)
+    MoveF64ViaSpillFI = MF.getFrameInfo()->CreateStackObject(
+        RC->getSize(), RC->getAlignment(), false);
+  return MoveF64ViaSpillFI;
+}
+
 void MipsFunctionInfo::anchor() { }

diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 8c16f82..217f307 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPS_MACHINE_FUNCTION_INFO_H
-#define MIPS_MACHINE_FUNCTION_INFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
 
 #include "Mips16HardFloatInfo.h"
 #include "llvm/ADT/StringMap.h"
@@ -34,7 +34,7 @@
 /// resolved by lazy-binding.
 class MipsCallEntry : public PseudoSourceValue {
 public:
-  explicit MipsCallEntry(const StringRef &N);
+  explicit MipsCallEntry(StringRef N);
   explicit MipsCallEntry(const GlobalValue *V);
   bool isConstant(const MachineFrameInfo *) const override;
   bool isAliased(const MachineFrameInfo *) const override;
@@ -54,7 +54,8 @@
 public:
   MipsFunctionInfo(MachineFunction &MF)
       : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
-        VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false) {}
+        VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false),
+        MoveF64ViaSpillFI(-1) {}
 
   ~MipsFunctionInfo();
 
@@ -87,7 +88,7 @@
 
   /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
   /// representing a GOT entry for an external function.
-  MachinePointerInfo callPtrInfo(const StringRef &Name);
+  MachinePointerInfo callPtrInfo(StringRef Name);
 
   /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
   /// representing a GOT entry for a global function.
@@ -96,6 +97,8 @@
   void setSaveS2() { SaveS2 = true; }
   bool hasSaveS2() const { return SaveS2; }
 
+  int getMoveF64ViaSpillFI(const TargetRegisterClass *RC);
+
   std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
   StubsNeeded;
 
@@ -136,6 +139,10 @@
   // saveS2
   bool SaveS2;
 
+  /// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the
+  /// O32 FPXX ABI is enabled. -1 is used to denote invalid index.
+  int MoveF64ViaSpillFI;
+
   /// MipsCallEntry maps.
   StringMap<const MipsCallEntry *> ExternalCallEntries;
   ValueMap<const GlobalValue *, const MipsCallEntry *> GlobalCallEntries;
@@ -143,4 +150,4 @@
 
 } // end of namespace llvm
 
-#endif // MIPS_MACHINE_FUNCTION_INFO_H
+#endif

diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
index 03c76ea..b011e8f 100644
--- a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp

@@ -20,7 +20,7 @@
 
 bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
-  const_cast<MipsSubtarget&>(Subtarget).resetSubtarget(&MF);
+  TM.resetSubtarget(&MF);
   return false;
 }
 

diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.h b/lib/Target/Mips/MipsModuleISelDAGToDAG.h
index a96862a..85bae47 100644
--- a/lib/Target/Mips/MipsModuleISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSMODULEISELDAGTODAG_H
-#define MIPSMODULEISELDAGTODAG_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSMODULEISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPSMODULEISELDAGTODAG_H
 
 #include "Mips.h"
 #include "MipsSubtarget.h"
@@ -37,8 +37,7 @@
   static char ID;
 
   explicit MipsModuleDAGToDAGISel(MipsTargetMachine &TM_)
-    : MachineFunctionPass(ID),
-      TM(TM_), Subtarget(TM.getSubtarget<MipsSubtarget>()) {}
+      : MachineFunctionPass(ID), TM(TM_) {}
 
   // Pass Name
   const char *getPassName() const override {
@@ -48,10 +47,7 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
 
 protected:
-  /// Keep a pointer to the MipsSubtarget around so that we can make the right
-  /// decision when generating code for different targets.
-  const TargetMachine &TM;
-  const MipsSubtarget &Subtarget;
+  MipsTargetMachine &TM;
 };
 
 /// createMipsISelDag - This pass converts a legalized DAG into a

diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index c234049..22c524e 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp

@@ -130,7 +130,7 @@
 static void setCallTargetReg(MachineBasicBlock *MBB,
                              MachineBasicBlock::iterator I) {
   MachineFunction &MF = *MBB->getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   unsigned SrcReg = I->getOperand(0).getReg();
   unsigned DstReg = getRegTy(SrcReg, MF) == MVT::i32 ? Mips::T9 : Mips::T9_64;
   BuildMI(*MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), DstReg)

diff --git a/lib/Target/Mips/MipsOptionRecord.h b/lib/Target/Mips/MipsOptionRecord.h
new file mode 100644
index 0000000..f82544a
--- /dev/null
+++ b/lib/Target/Mips/MipsOptionRecord.h

@@ -0,0 +1,89 @@
+//===-- MipsOptionRecord.h - Abstraction for storing information ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MipsOptionRecord - Abstraction for storing arbitrary information in
+// ELF files. Arbitrary information (e.g. register usage) can be stored in
+// Mips-specific ELF sections like .Mips.options. Specific records should
+// subclass MipsOptionRecord and provide an implementation of
+// EmitMipsOptionRecord, which basically just dumps the information into an
+// ELF section. More information about .Mips.options can be found in the SysV
+// ABI and the 64-bit ELF Object File Specification.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSOPTIONRECORD_H
+#define LLVM_LIB_TARGET_MIPS_MIPSOPTIONRECORD_H
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+class MipsELFStreamer;
+class MCSubtargetInfo;
+
+/// Abstract base class for a record that can be written to a Mips-specific
+/// ELF section such as .Mips.options.
+class MipsOptionRecord {
+public:
+  virtual ~MipsOptionRecord() {}
+
+  /// Emit this record. Every concrete record type supplies its own encoding.
+  virtual void EmitMipsOptionRecord() = 0;
+};
+
+/// Option record carrying register information in the ri_gprmask, ri_cprmask
+/// and ri_gp_value fields.
+class MipsRegInfoRecord : public MipsOptionRecord {
+public:
+  /// Zero the ri_* fields and cache pointers to the register classes, looked
+  /// up through \p Context's MCRegisterInfo.
+  MipsRegInfoRecord(MipsELFStreamer *S, MCContext &Context,
+                    const MCSubtargetInfo &STI)
+      : Streamer(S), Context(Context), STI(STI) {
+    ri_gprmask = 0;
+    ri_cprmask[0] = ri_cprmask[1] = ri_cprmask[2] = ri_cprmask[3] = 0;
+    ri_gp_value = 0;
+
+    const MCRegisterInfo *TRI = Context.getRegisterInfo();
+    GPR32RegClass = &(TRI->getRegClass(Mips::GPR32RegClassID));
+    GPR64RegClass = &(TRI->getRegClass(Mips::GPR64RegClassID));
+    FGR32RegClass = &(TRI->getRegClass(Mips::FGR32RegClassID));
+    FGR64RegClass = &(TRI->getRegClass(Mips::FGR64RegClassID));
+    AFGR64RegClass = &(TRI->getRegClass(Mips::AFGR64RegClassID));
+    MSA128BRegClass = &(TRI->getRegClass(Mips::MSA128BRegClassID));
+    COP2RegClass = &(TRI->getRegClass(Mips::COP2RegClassID));
+    COP3RegClass = &(TRI->getRegClass(Mips::COP3RegClassID));
+  }
+  ~MipsRegInfoRecord() {}
+
+  void EmitMipsOptionRecord() override;
+
+  /// NOTE(review): presumably records \p Reg in the ri_* masks; defined out of
+  /// line, so confirm against the implementation.
+  void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo);
+
+private:
+  MipsELFStreamer *Streamer;
+  MCContext &Context;
+  const MCSubtargetInfo &STI;
+  // Register classes cached by the constructor.
+  const MCRegisterClass *GPR32RegClass;
+  const MCRegisterClass *GPR64RegClass;
+  const MCRegisterClass *FGR32RegClass;
+  const MCRegisterClass *FGR64RegClass;
+  const MCRegisterClass *AFGR64RegClass;
+  const MCRegisterClass *MSA128BRegClass;
+  const MCRegisterClass *COP2RegClass;
+  const MCRegisterClass *COP3RegClass;
+  uint32_t ri_gprmask;
+  uint32_t ri_cprmask[4];
+  int64_t ri_gp_value;
+};
+} // namespace llvm
+#endif

diff --git a/lib/Target/Mips/MipsOs16.h b/lib/Target/Mips/MipsOs16.h
index 55e5a81..77183ec 100644
--- a/lib/Target/Mips/MipsOs16.h
+++ b/lib/Target/Mips/MipsOs16.h

@@ -11,16 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSOS16_H
+#define LLVM_LIB_TARGET_MIPS_MIPSOS16_H
+
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MipsTargetMachine.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
 
-
-
-#ifndef MIPSOS16_H
-#define MIPSOS16_H
-
 using namespace llvm;
 
 namespace llvm {

diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 084449b..20ef3f3 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp

@@ -62,7 +62,7 @@
   case Mips::GPR32RegClassID:
   case Mips::GPR64RegClassID:
   case Mips::DSPRRegClassID: {
-    const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+    const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
     return 28 - TFI->hasFP(MF);
   }
   case Mips::FGR32RegClassID:
@@ -149,6 +149,12 @@
   for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I)
     Reserved.set(ReservedGPR64[I]);
 
+  // For mno-abicalls, GP is a program invariant!
+  if (!Subtarget.isABICalls()) {
+    Reserved.set(Mips::GP);
+    Reserved.set(Mips::GP_64);
+  }
+
   if (Subtarget.isFP64bit()) {
     // Reserve all registers in AFGR64.
     for (RegIter Reg = Mips::AFGR64RegClass.begin(),
@@ -161,7 +167,7 @@
       Reserved.set(*Reg);
   }
   // Reserve FP if this function should have a dedicated frame pointer register.
-  if (MF.getTarget().getFrameLowering()->hasFP(MF)) {
+  if (MF.getSubtarget().getFrameLowering()->hasFP(MF)) {
     if (Subtarget.inMips16Mode())
       Reserved.set(Mips::S0);
     else {
@@ -250,7 +256,7 @@
 
 unsigned MipsRegisterInfo::
 getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   bool IsN64 = Subtarget.isABI_N64();
 
   if (Subtarget.inMips16Mode())

diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index b34496f..9ec4a38 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSREGISTERINFO_H
-#define MIPSREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSREGISTERINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSREGISTERINFO_H
 
 #include "Mips.h"
 #include "llvm/Target/TargetRegisterInfo.h"

diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 6323da3..42fe76b 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td

@@ -212,8 +212,13 @@
   // PC register
   def PC : Register<"pc">;
 
-  // Hardware register $29
-  foreach I = 0-31 in
+  // Hardware registers
+  def HWR0 : MipsReg<0, "hwr_cpunum">;
+  def HWR1 : MipsReg<1, "hwr_synci_step">;
+  def HWR2 : MipsReg<2, "hwr_cc">;
+  def HWR3 : MipsReg<3, "hwr_ccres">;
+
+  foreach I = 4-31 in
   def HWR#I : MipsReg<#I, ""#I>;
 
   // Accum registers
@@ -283,6 +288,12 @@
 def GPR32 : GPR32Class<[i32]>;
 def DSPR  : GPR32Class<[v4i8, v2i16]>;
 
+def GPRMM16 : RegisterClass<"Mips", [i32], 32, (add
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Callee save
+  S0, S1)>;
+
 def GPR64 : RegisterClass<"Mips", [i64], 64, (add
 // Reserved
   ZERO_64, AT_64,
@@ -341,9 +352,12 @@
 def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
 
 // Used to reserve odd registers when given -mattr=+nooddspreg
+// FIXME: Remove double precision registers from this set.
 def OddSP : RegisterClass<"Mips", [f32], 32,
                           (add (decimate (sequence "F%u", 1, 31), 2),
-                               (decimate (sequence "F_HI%u", 1, 31), 2))>,
+                               (decimate (sequence "F_HI%u", 1, 31), 2),
+                               (decimate (sequence "D%u", 1, 15), 2),
+                               (decimate (sequence "D%u_64", 1, 31), 2))>,
             Unallocatable;
 
 // FP control registers.
@@ -414,7 +428,7 @@
 // Register Operands.
 
 class MipsAsmRegOperand : AsmOperandClass {
-  let ParserMethod = "ParseAnyRegister";
+  let ParserMethod = "parseAnyRegister";
 }
 
 def GPR64AsmOperand : MipsAsmRegOperand {
@@ -427,6 +441,11 @@
   let PredicateMethod = "isGPRAsmReg";
 }
 
+def GPRMM16AsmOperand : MipsAsmRegOperand {
+  let Name = "GPRMM16AsmReg";
+  let PredicateMethod = "isMM16AsmReg";
+}
+
 def ACC64DSPAsmOperand : MipsAsmRegOperand {
   let Name = "ACC64DSPAsmReg";
   let PredicateMethod = "isACCAsmReg";
@@ -482,6 +501,10 @@
   let ParserMatchClass = GPR32AsmOperand;
 }
 
+def GPRMM16Opnd : RegisterOperand<GPRMM16> {
+  let ParserMatchClass = GPRMM16AsmOperand;
+}
+
 def GPR64Opnd : RegisterOperand<GPR64> {
   let ParserMatchClass = GPR64AsmOperand;
 }
@@ -575,4 +598,3 @@
 def MSA128CROpnd : RegisterOperand<MSACtrl> {
   let ParserMatchClass = MSACtrlAsmOperand;
 }
-

diff --git a/lib/Target/Mips/MipsRelocations.h b/lib/Target/Mips/MipsRelocations.h
deleted file mode 100644
index 0787ed3..0000000
--- a/lib/Target/Mips/MipsRelocations.h
+++ /dev/null

@@ -1,41 +0,0 @@
-//===-- MipsRelocations.h - Mips Code Relocations ---------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the Mips target-specific relocation types
-// (for relocation-model=static).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MIPSRELOCATIONS_H_
-#define MIPSRELOCATIONS_H_
-
-#include "llvm/CodeGen/MachineRelocation.h"
-
-namespace llvm {
-  namespace Mips{
-    enum RelocationType {
-      // reloc_mips_pc16 - pc relative relocation for branches. The lower 18
-      // bits of the difference between the branch target and the branch
-      // instruction, shifted right by 2.
-      reloc_mips_pc16 = 1,
-
-      // reloc_mips_hi - upper 16 bits of the address (modified by +1 if the
-      // lower 16 bits of the address is negative).
-      reloc_mips_hi = 2,
-
-      // reloc_mips_lo - lower 16 bits of the address.
-      reloc_mips_lo = 3,
-
-      // reloc_mips_26 - lower 28 bits of the address, shifted right by 2.
-      reloc_mips_26 = 4
-    };
-  }
-}
-
-#endif /* MIPSRELOCATIONS_H_ */

diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 6573070..97d9edf 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp

@@ -64,6 +64,10 @@
   bool expandCopy(MachineBasicBlock &MBB, Iter I);
   bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
                      unsigned MFLoOpc);
+  bool expandBuildPairF64(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I, bool FP64) const;
+  bool expandExtractElementF64(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, bool FP64) const;
 
   MachineFunction &MF;
   MachineRegisterInfo &MRI;
@@ -108,6 +112,22 @@
   case Mips::STORE_ACC128:
     expandStoreACC(MBB, I, Mips::PseudoMFHI64, Mips::PseudoMFLO64, 8);
     break;
+  case Mips::BuildPairF64:
+    if (expandBuildPairF64(MBB, I, false))
+      MBB.erase(I);
+    return false;
+  case Mips::BuildPairF64_64:
+    if (expandBuildPairF64(MBB, I, true))
+      MBB.erase(I);
+    return false;
+  case Mips::ExtractElementF64:
+    if (expandExtractElementF64(MBB, I, false))
+      MBB.erase(I);
+    return false;
+  case Mips::ExtractElementF64_64:
+    if (expandExtractElementF64(MBB, I, true))
+      MBB.erase(I);
+    return false;
   case TargetOpcode::COPY:
     if (!expandCopy(MBB, I))
       return false;
@@ -127,9 +147,9 @@
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
   const MipsSEInstrInfo &TII =
-    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo =
-    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
+                                        MF.getSubtarget().getRegisterInfo());
 
   const TargetRegisterClass *RC = RegInfo.intRegClass(4);
   unsigned VR = MRI.createVirtualRegister(RC);
@@ -147,9 +167,9 @@
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
   const MipsSEInstrInfo &TII =
-    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo =
-    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
+                                        MF.getSubtarget().getRegisterInfo());
 
   const TargetRegisterClass *RC = RegInfo.intRegClass(4);
   unsigned VR = MRI.createVirtualRegister(RC);
@@ -170,9 +190,9 @@
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
   const MipsSEInstrInfo &TII =
-    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo =
-    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
+                                        MF.getSubtarget().getRegisterInfo());
 
   const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
   unsigned VR0 = MRI.createVirtualRegister(RC);
@@ -200,9 +220,9 @@
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
   const MipsSEInstrInfo &TII =
-    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo =
-    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
+                                        MF.getSubtarget().getRegisterInfo());
 
   const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
   unsigned VR0 = MRI.createVirtualRegister(RC);
@@ -235,9 +255,9 @@
   //  copy dst_hi, $vr1
 
   const MipsSEInstrInfo &TII =
-    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo =
-    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
+                                        MF.getSubtarget().getRegisterInfo());
 
   unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
   unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2;
@@ -258,6 +278,123 @@
   return true;
 }
 
+/// Expands the same pseudo that MipsSEInstrInfo::expandBuildPairF64 does, for
+/// the cases where the ABI is fpxx and mthc1 is not available, or the ABI is
+/// FP64A. It is implemented here because frame indexes are eliminated before
+/// MipsSEInstrInfo::expandBuildPairF64 is called. Returns true if the
+/// spill/reload sequence was emitted, false if the instruction was left as is.
+bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator I,
+                                      bool FP64) const {
+  // For fpxx and when mthc1 is not available, use:
+  //   spill + reload via ldc1
+  //
+  // The case where dmtc1 is available doesn't need to be handled here
+  // because it never creates a BuildPairF64 node.
+  //
+  // The FP64A ABI (fp64 with nooddspreg) must also use a spill/reload sequence
+  // for odd-numbered double precision values (because the lower 32 bits are
+  // transferred with mtc1, which is redirected to the upper half of the even
+  // register). Unfortunately, we have to make this decision before register
+  // allocation, so for now we use a spill/reload sequence for all
+  // double-precision values, whether they are in an odd or an even register.
+
+  const TargetMachine &TM = MF.getTarget();
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+  if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
+      (FP64 && !Subtarget.useOddSPReg())) {
+    const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>(
+                                     TM.getSubtargetImpl()->getInstrInfo());
+    const MipsRegisterInfo &TRI = *static_cast<const MipsRegisterInfo *>(
+                                      TM.getSubtargetImpl()->getRegisterInfo());
+
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned LoReg = I->getOperand(1).getReg();
+    unsigned HiReg = I->getOperand(2).getReg();
+
+    // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+    // the cases where mthc1 is not available). 64-bit architectures and
+    // MIPS32r2 or later can use FGR64 though.
+    assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+           !Subtarget.isFP64bit());
+
+    const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+    const TargetRegisterClass *RC2 =
+        FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+
+    // We re-use the same spill slot each time so that the stack frame doesn't
+    // grow too much in functions with a large number of moves.
+    int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC2);
+    if (!Subtarget.isLittle())
+      std::swap(LoReg, HiReg);
+    TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, &TRI,
+                        0);
+    TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, &TRI,
+                        4);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &TRI, 0);
+    return true;
+  }
+
+  return false;
+}
+
+/// Expands the same pseudo that MipsSEInstrInfo::expandExtractElementF64 does,
+/// for the cases where the ABI is fpxx and mfhc1 is not available, or the ABI
+/// is FP64A. It is implemented here because frame indexes are eliminated before
+/// MipsSEInstrInfo::expandExtractElementF64 is called. Returns true if the
+/// spill/reload sequence was emitted, false if the instruction was left as is.
+bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator I,
+                                           bool FP64) const {
+  // For fpxx and when mfhc1 is not available (checked via hasMTHC1()), use:
+  //   spill + reload via ldc1
+  //
+  // The case where dmfc1 is available doesn't need to be handled here
+  // because it never creates an ExtractElementF64 node.
+  //
+  // The FP64A ABI (fp64 with nooddspreg) must also use a spill/reload sequence
+  // for odd-numbered double precision values (because the lower 32 bits are
+  // transferred with mfc1, which is redirected to the upper half of the even
+  // register). Unfortunately, we have to make this decision before register
+  // allocation, so for now we use a spill/reload sequence for all
+  // double-precision values, whether they are in an odd or an even register.
+
+  const TargetMachine &TM = MF.getTarget();
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+  if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) ||
+      (FP64 && !Subtarget.useOddSPReg())) {
+    const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>(
+                                     TM.getSubtargetImpl()->getInstrInfo());
+    const MipsRegisterInfo &TRI = *static_cast<const MipsRegisterInfo *>(
+                                      TM.getSubtargetImpl()->getRegisterInfo());
+
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned SrcReg = I->getOperand(1).getReg();
+    unsigned N = I->getOperand(2).getImm();
+    int64_t Offset = 4 * (Subtarget.isLittle() ? N : (1 - N));
+
+    // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are
+    // the cases where mfhc1 is not available). 64-bit architectures and
+    // MIPS32r2 or later can use FGR64 though.
+    assert(Subtarget.isGP64bit() || Subtarget.hasMTHC1() ||
+           !Subtarget.isFP64bit());
+
+    const TargetRegisterClass *RC =
+        FP64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+    const TargetRegisterClass *RC2 = &Mips::GPR32RegClass;
+
+    // We re-use the same spill slot each time so that the stack frame doesn't
+    // grow too much in functions with a large number of moves.
+    int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC);
+    TII.storeRegToStack(MBB, I, SrcReg, I->getOperand(1).isKill(), FI, RC, &TRI,
+                        0);
+    TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &TRI, Offset);
+    return true;
+  }
+
+  return false;
+}
+
 MipsSEFrameLowering::MipsSEFrameLowering(const MipsSubtarget &STI)
     : MipsFrameLowering(STI, STI.stackAlignment()) {}
 
@@ -278,9 +415,9 @@
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
 
   const MipsSEInstrInfo &TII =
-    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo =
-    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
+                                        MF.getSubtarget().getRegisterInfo());
 
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -343,6 +480,22 @@
             MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
         BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
             .addCFIIndex(CFIIndex);
+      } else if (Mips::FGR64RegClass.contains(Reg)) {
+        unsigned Reg0 = MRI->getDwarfRegNum(Reg, true);
+        unsigned Reg1 = MRI->getDwarfRegNum(Reg, true) + 1;
+
+        if (!STI.isLittle())
+          std::swap(Reg0, Reg1);
+
+        unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg0, Offset));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
+
+        CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg1, Offset + 4));
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex);
       } else {
         // Reg is either in GPR32 or FGR32.
         unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
@@ -397,9 +550,9 @@
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
 
   const MipsSEInstrInfo &TII =
-    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
-  const MipsRegisterInfo &RegInfo =
-    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(
+                                        MF.getSubtarget().getRegisterInfo());
 
   DebugLoc dl = MBBI->getDebugLoc();
   unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
@@ -452,7 +605,7 @@
                           const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
   MachineBasicBlock *EntryBlock = MF->begin();
-  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
 
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
     // Add the callee-saved register as live-in. Do not add if the register is
@@ -493,7 +646,7 @@
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   const MipsSEInstrInfo &TII =
-    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const MipsSEInstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   if (!hasReservedCallFrame(MF)) {
     int64_t Amount = I->getOperand(0).getImm();

diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index e832848..0eca1df 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSSE_FRAMEINFO_H
-#define MIPSSE_FRAMEINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
 
 #include "MipsFrameLowering.h"
 

diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 6f35947..f759905 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp

@@ -37,6 +37,7 @@
 #define DEBUG_TYPE "mips-isel"
 
 bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &TM.getSubtarget<MipsSubtarget>();
   if (Subtarget->inMips16Mode())
     return false;
   return MipsDAGToDAGISel::runOnMachineFunction(MF);
@@ -129,7 +130,7 @@
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator I = MBB.begin();
   MachineRegisterInfo &RegInfo = MF.getRegInfo();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
   unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
   const TargetRegisterClass *RC;

diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 57328d2..2e11fa7 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSSEISELDAGTODAG_H
-#define MIPSSEISELDAGTODAG_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEISELDAGTODAG_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEISELDAGTODAG_H
 
 #include "MipsISelDAGToDAG.h"
 

diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index be4ca86..4a0ce09 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp

@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 #include "MipsSEISelLowering.h"
+#include "MipsMachineFunction.h"
 #include "MipsRegisterInfo.h"
 #include "MipsTargetMachine.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -34,15 +35,16 @@
                                             "stores to their single precision "
                                             "counterparts"));
 
-MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
-  : MipsTargetLowering(TM) {
+MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
+                                           const MipsSubtarget &STI)
+    : MipsTargetLowering(TM, STI) {
   // Set up the register classes
   addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
 
-  if (Subtarget->isGP64bit())
+  if (Subtarget.isGP64bit())
     addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
 
-  if (Subtarget->hasDSP() || Subtarget->hasMSA()) {
+  if (Subtarget.hasDSP() || Subtarget.hasMSA()) {
     // Expand all truncating stores and extending loads.
     unsigned FirstVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
     unsigned LastVT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
@@ -58,7 +60,7 @@
     }
   }
 
-  if (Subtarget->hasDSP()) {
+  if (Subtarget.hasDSP()) {
     MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
 
     for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
@@ -82,10 +84,10 @@
     setTargetDAGCombine(ISD::VSELECT);
   }
 
-  if (Subtarget->hasDSPR2())
+  if (Subtarget.hasDSPR2())
     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
 
-  if (Subtarget->hasMSA()) {
+  if (Subtarget.hasMSA()) {
     addMSAIntType(MVT::v16i8, &Mips::MSA128BRegClass);
     addMSAIntType(MVT::v8i16, &Mips::MSA128HRegClass);
     addMSAIntType(MVT::v4i32, &Mips::MSA128WRegClass);
@@ -101,12 +103,12 @@
     setTargetDAGCombine(ISD::XOR);
   }
 
-  if (!Subtarget->mipsSEUsesSoftFloat()) {
+  if (!Subtarget.abiUsesSoftFloat()) {
     addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
 
     // When dealing with single precision only, use libcalls
-    if (!Subtarget->isSingleFloat()) {
-      if (Subtarget->isFP64bit())
+    if (!Subtarget.isSingleFloat()) {
+      if (Subtarget.isFP64bit())
         addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
       else
         addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
@@ -118,14 +120,16 @@
   setOperationAction(ISD::MULHS,              MVT::i32, Custom);
   setOperationAction(ISD::MULHU,              MVT::i32, Custom);
 
-  if (Subtarget->hasCnMips())
+  if (Subtarget.hasCnMips())
     setOperationAction(ISD::MUL,              MVT::i64, Legal);
-  else if (Subtarget->isGP64bit())
+  else if (Subtarget.isGP64bit())
     setOperationAction(ISD::MUL,              MVT::i64, Custom);
 
-  if (Subtarget->isGP64bit()) {
+  if (Subtarget.isGP64bit()) {
     setOperationAction(ISD::MULHS,            MVT::i64, Custom);
     setOperationAction(ISD::MULHU,            MVT::i64, Custom);
+    setOperationAction(ISD::SDIVREM,          MVT::i64, Custom);
+    setOperationAction(ISD::UDIVREM,          MVT::i64, Custom);
   }
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
@@ -133,8 +137,6 @@
 
   setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
   setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
-  setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
-  setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
   setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Custom);
   setOperationAction(ISD::LOAD,               MVT::i32, Custom);
   setOperationAction(ISD::STORE,              MVT::i32, Custom);
@@ -152,7 +154,7 @@
     setOperationAction(ISD::STORE, MVT::f64, Custom);
   }
 
-  if (Subtarget->hasMips32r6()) {
+  if (Subtarget.hasMips32r6()) {
     // MIPS32r6 replaces the accumulator-based multiplies with a three register
     // instruction
     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
@@ -180,7 +182,7 @@
     setOperationAction(ISD::SELECT, MVT::f32, Legal);
     setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
 
-    assert(Subtarget->isFP64bit() && "FR=1 is required for MIPS32r6");
+    assert(Subtarget.isFP64bit() && "FR=1 is required for MIPS32r6");
     setOperationAction(ISD::SETCC, MVT::f64, Legal);
     setOperationAction(ISD::SELECT, MVT::f64, Legal);
     setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
@@ -199,7 +201,7 @@
     setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
   }
 
-  if (Subtarget->hasMips64r6()) {
+  if (Subtarget.hasMips64r6()) {
     // MIPS64r6 replaces the accumulator-based multiplies with a three register
     // instruction
     setOperationAction(ISD::MUL, MVT::i64, Legal);
@@ -226,14 +228,15 @@
 }
 
 const MipsTargetLowering *
-llvm::createMipsSETargetLowering(MipsTargetMachine &TM) {
-  return new MipsSETargetLowering(TM);
+llvm::createMipsSETargetLowering(const MipsTargetMachine &TM,
+                                 const MipsSubtarget &STI) {
+  return new MipsSETargetLowering(TM, STI);
 }
 
 const TargetRegisterClass *
 MipsSETargetLowering::getRepRegClassFor(MVT VT) const {
   if (VT == MVT::Untyped)
-    return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass : &Mips::ACC64RegClass;
+    return Subtarget.hasDSP() ? &Mips::ACC64DSPRegClass : &Mips::ACC64RegClass;
 
   return TargetLowering::getRepRegClassFor(VT);
 }
@@ -327,12 +330,13 @@
 }
 
 bool
-MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                    unsigned,
-                                                    bool *Fast) const {
+MipsSETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                     unsigned,
+                                                     unsigned,
+                                                     bool *Fast) const {
   MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
 
-  if (Subtarget->systemSupportsUnalignedAccess()) {
+  if (Subtarget.systemSupportsUnalignedAccess()) {
     // MIPS32r6/MIPS64r6 is required to support unaligned access. It's
     // implementation defined whether this is handled by hardware, software, or
     // a hybrid of the two but it's expected that most implementations will
@@ -523,11 +527,11 @@
 
 static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
-                                  const MipsSubtarget *Subtarget) {
+                                  const MipsSubtarget &Subtarget) {
   if (DCI.isBeforeLegalize())
     return SDValue();
 
-  if (Subtarget->hasMips32() && !Subtarget->hasMips32r6() &&
+  if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
       N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG))
     return SDValue(N, 0);
 
@@ -543,8 +547,8 @@
 // - Removes redundant zero extensions performed by an ISD::AND.
 static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
-                                 const MipsSubtarget *Subtarget) {
-  if (!Subtarget->hasMSA())
+                                 const MipsSubtarget &Subtarget) {
+  if (!Subtarget.hasMSA())
     return SDValue();
 
   SDValue Op0 = N->getOperand(0);
@@ -575,10 +579,9 @@
     if ((Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT && Log2 >= ExtendTySize) ||
         Log2 == ExtendTySize) {
       SDValue Ops[] = { Op0->getOperand(0), Op0->getOperand(1), Op0Op2 };
-      DAG.MorphNodeTo(Op0.getNode(), MipsISD::VEXTRACT_ZEXT_ELT,
-                      Op0->getVTList(),
-                      makeArrayRef(Ops, Op0->getNumOperands()));
-      return Op0;
+      return DAG.getNode(MipsISD::VEXTRACT_ZEXT_ELT, SDLoc(Op0),
+                         Op0->getVTList(),
+                         makeArrayRef(Ops, Op0->getNumOperands()));
     }
   }
 
@@ -659,8 +662,8 @@
 //   vector type.
 static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
-                                const MipsSubtarget *Subtarget) {
-  if (!Subtarget->hasMSA())
+                                const MipsSubtarget &Subtarget) {
+  if (!Subtarget.hasMSA())
     return SDValue();
 
   EVT Ty = N->getValueType(0);
@@ -676,7 +679,7 @@
     SDValue Op0Op1 = Op0->getOperand(1);
     SDValue Op1Op0 = Op1->getOperand(0);
     SDValue Op1Op1 = Op1->getOperand(1);
-    bool IsLittleEndian = !Subtarget->isLittle();
+    bool IsLittleEndian = !Subtarget.isLittle();
 
     SDValue IfSet, IfClr, Cond;
     bool IsConstantMask = false;
@@ -779,11 +782,11 @@
 
 static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
-                                  const MipsSubtarget *Subtarget) {
+                                  const MipsSubtarget &Subtarget) {
   if (DCI.isBeforeLegalize())
     return SDValue();
 
-  if (Subtarget->hasMips32() && N->getValueType(0) == MVT::i32 &&
+  if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 &&
       selectMSUB(N, &DAG))
     return SDValue(N, 0);
 
@@ -843,7 +846,7 @@
 
 static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
                                       SelectionDAG &DAG,
-                                      const MipsSubtarget *Subtarget) {
+                                      const MipsSubtarget &Subtarget) {
   // See if this is a vector splat immediate node.
   APInt SplatValue, SplatUndef;
   unsigned SplatBitSize;
@@ -851,12 +854,12 @@
   unsigned EltSize = Ty.getVectorElementType().getSizeInBits();
   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
 
-  if (!Subtarget->hasDSP())
+  if (!Subtarget.hasDSP())
     return SDValue();
 
   if (!BV ||
       !BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
-                           EltSize, !Subtarget->isLittle()) ||
+                           EltSize, !Subtarget.isLittle()) ||
       (SplatBitSize != EltSize) ||
       (SplatValue.getZExtValue() >= EltSize))
     return SDValue();
@@ -867,7 +870,7 @@
 
 static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
-                                 const MipsSubtarget *Subtarget) {
+                                 const MipsSubtarget &Subtarget) {
   EVT Ty = N->getValueType(0);
 
   if ((Ty != MVT::v2i16) && (Ty != MVT::v4i8))
@@ -890,10 +893,10 @@
 // used for DSPr2.
 static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
-                                 const MipsSubtarget *Subtarget) {
+                                 const MipsSubtarget &Subtarget) {
   EVT Ty = N->getValueType(0);
 
-  if (Subtarget->hasMSA()) {
+  if (Subtarget.hasMSA()) {
     SDValue Op0 = N->getOperand(0);
     SDValue Op1 = N->getOperand(1);
 
@@ -920,15 +923,14 @@
            TotalBits <= 32)) {
         SDValue Ops[] = { Op0Op0->getOperand(0), Op0Op0->getOperand(1),
                           Op0Op0->getOperand(2) };
-        DAG.MorphNodeTo(Op0Op0.getNode(), MipsISD::VEXTRACT_SEXT_ELT,
-                        Op0Op0->getVTList(),
-                        makeArrayRef(Ops, Op0Op0->getNumOperands()));
-        return Op0Op0;
+        return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, SDLoc(Op0Op0),
+                           Op0Op0->getVTList(),
+                           makeArrayRef(Ops, Op0Op0->getNumOperands()));
       }
     }
   }
 
-  if ((Ty != MVT::v2i16) && ((Ty != MVT::v4i8) || !Subtarget->hasDSPR2()))
+  if ((Ty != MVT::v2i16) && ((Ty != MVT::v4i8) || !Subtarget.hasDSPR2()))
     return SDValue();
 
   return performDSPShiftCombine(MipsISD::SHRA_DSP, N, Ty, DAG, Subtarget);
@@ -937,10 +939,10 @@
 
 static SDValue performSRLCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
-                                 const MipsSubtarget *Subtarget) {
+                                 const MipsSubtarget &Subtarget) {
   EVT Ty = N->getValueType(0);
 
-  if (((Ty != MVT::v2i16) || !Subtarget->hasDSPR2()) && (Ty != MVT::v4i8))
+  if (((Ty != MVT::v2i16) || !Subtarget.hasDSPR2()) && (Ty != MVT::v4i8))
     return SDValue();
 
   return performDSPShiftCombine(MipsISD::SHRL_DSP, N, Ty, DAG, Subtarget);
@@ -1034,10 +1036,10 @@
 }
 
 static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
-                                 const MipsSubtarget *Subtarget) {
+                                 const MipsSubtarget &Subtarget) {
   EVT Ty = N->getValueType(0);
 
-  if (Subtarget->hasMSA() && Ty.is128BitVector() && Ty.isInteger()) {
+  if (Subtarget.hasMSA() && Ty.is128BitVector() && Ty.isInteger()) {
     // Try the following combines:
     //   (xor (or $a, $b), (build_vector allones))
     //   (xor (or $a, $b), (bitcast (build_vector allones)))
@@ -1165,15 +1167,14 @@
   }
 }
 
-bool MipsSETargetLowering::
-isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
-                                  unsigned NextStackOffset,
-                                  const MipsFunctionInfo& FI) const {
+bool MipsSETargetLowering::isEligibleForTailCallOptimization(
+    const CCState &CCInfo, unsigned NextStackOffset,
+    const MipsFunctionInfo &FI) const {
   if (!EnableMipsTailCalls)
     return false;
 
   // Return false if either the callee or caller has a byval argument.
-  if (MipsCCInfo.hasByValArg() || FI.hasByvalArg())
+  if (CCInfo.getInRegsParamsCount() > 0 || FI.hasByvalArg())
     return false;
 
   // Return true if the callee's argument area is no larger than the
@@ -1185,10 +1186,12 @@
 getOpndList(SmallVectorImpl<SDValue> &Ops,
             std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
             bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
-            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+            bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
+            SDValue Chain) const {
   Ops.push_back(Callee);
   MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
-                                  InternalLinkage, CLI, Callee, Chain);
+                                  InternalLinkage, IsCallReloc, CLI, Callee,
+                                  Chain);
 }
 
 SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
@@ -1215,7 +1218,7 @@
                            Nd.isNonTemporal(), Nd.isInvariant(),
                            std::min(Nd.getAlignment(), 4U));
 
-  if (!Subtarget->isLittle())
+  if (!Subtarget.isLittle())
     std::swap(Lo, Hi);
 
   SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
@@ -1238,26 +1241,26 @@
   SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
                            Val, DAG.getConstant(1, MVT::i32));
 
-  if (!Subtarget->isLittle())
+  if (!Subtarget.isLittle())
     std::swap(Lo, Hi);
 
   // i32 store to lower address.
   Chain = DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(),
                        Nd.isVolatile(), Nd.isNonTemporal(), Nd.getAlignment(),
-                       Nd.getTBAAInfo());
+                       Nd.getAAInfo());
 
   // i32 store to higher address.
   Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, PtrVT));
   return DAG.getStore(Chain, DL, Hi, Ptr, MachinePointerInfo(),
                       Nd.isVolatile(), Nd.isNonTemporal(),
-                      std::min(Nd.getAlignment(), 4U), Nd.getTBAAInfo());
+                      std::min(Nd.getAlignment(), 4U), Nd.getAAInfo());
 }
 
 SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
                                           bool HasLo, bool HasHi,
                                           SelectionDAG &DAG) const {
   // MIPS32r6/MIPS64r6 removed accumulator based multiplies.
-  assert(!Subtarget->hasMips32r6());
+  assert(!Subtarget.hasMips32r6());
 
   EVT Ty = Op.getOperand(0).getValueType();
   SDLoc DL(Op);
@@ -1621,7 +1624,7 @@
   case Intrinsic::mips_bnegi_w:
   case Intrinsic::mips_bnegi_d:
     return lowerMSABinaryBitImmIntr(Op, DAG, ISD::XOR, Op->getOperand(2),
-                                    !Subtarget->isLittle());
+                                    !Subtarget.isLittle());
   case Intrinsic::mips_bnz_b:
   case Intrinsic::mips_bnz_h:
   case Intrinsic::mips_bnz_w:
@@ -1657,7 +1660,7 @@
   case Intrinsic::mips_bseti_w:
   case Intrinsic::mips_bseti_d:
     return lowerMSABinaryBitImmIntr(Op, DAG, ISD::OR, Op->getOperand(2),
-                                    !Subtarget->isLittle());
+                                    !Subtarget.isLittle());
   case Intrinsic::mips_bz_b:
   case Intrinsic::mips_bz_h:
   case Intrinsic::mips_bz_w:
@@ -1732,7 +1735,7 @@
   case Intrinsic::mips_copy_s_w:
     return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
   case Intrinsic::mips_copy_s_d:
-    if (Subtarget->hasMips64())
+    if (Subtarget.hasMips64())
       // Lower directly into VEXTRACT_SEXT_ELT since i64 is legal on Mips64.
       return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
     else {
@@ -1747,7 +1750,7 @@
   case Intrinsic::mips_copy_u_w:
     return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
   case Intrinsic::mips_copy_u_d:
-    if (Subtarget->hasMips64())
+    if (Subtarget.hasMips64())
       // Lower directly into VEXTRACT_ZEXT_ELT since i64 is legal on Mips64.
       return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
     else {
@@ -2324,12 +2327,12 @@
   unsigned SplatBitSize;
   bool HasAnyUndefs;
 
-  if (!Subtarget->hasMSA() || !ResTy.is128BitVector())
+  if (!Subtarget.hasMSA() || !ResTy.is128BitVector())
     return SDValue();
 
   if (Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                             HasAnyUndefs, 8,
-                            !Subtarget->isLittle()) && SplatBitSize <= 64) {
+                            !Subtarget.isLittle()) && SplatBitSize <= 64) {
     // We can only cope with 8, 16, 32, or 64-bit elements
     if (SplatBitSize != 8 && SplatBitSize != 16 && SplatBitSize != 32 &&
         SplatBitSize != 64)
@@ -2744,7 +2747,8 @@
   //  $vr0 = phi($vr2, $fbb, $vr1, $tbb)
 
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   const TargetRegisterClass *RC = &Mips::GPR32RegClass;
   DebugLoc DL = MI->getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -2809,7 +2813,8 @@
   //  $rd = phi($rd1, $fbb, $rd2, $tbb)
 
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   const TargetRegisterClass *RC = &Mips::GPR32RegClass;
   DebugLoc DL = MI->getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -2870,7 +2875,8 @@
 // for lane 1 because it would require FR=0 mode which isn't supported by MSA.
 MachineBasicBlock * MipsSETargetLowering::
 emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Fd = MI->getOperand(0).getReg();
@@ -2902,9 +2908,10 @@
 // valid because FR=1 mode which is the only supported mode in MSA.
 MachineBasicBlock * MipsSETargetLowering::
 emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
-  assert(Subtarget->isFP64bit());
+  assert(Subtarget.isFP64bit());
 
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   unsigned Fd  = MI->getOperand(0).getReg();
   unsigned Ws  = MI->getOperand(1).getReg();
@@ -2933,7 +2940,8 @@
 MachineBasicBlock *
 MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
@@ -2965,9 +2973,10 @@
 MachineBasicBlock *
 MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
-  assert(Subtarget->isFP64bit());
+  assert(Subtarget.isFP64bit());
 
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
@@ -3015,7 +3024,8 @@
                                          MachineBasicBlock *BB,
                                          unsigned EltSizeInBytes,
                                          bool IsFP) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
@@ -3025,7 +3035,7 @@
 
   const TargetRegisterClass *VecRC = nullptr;
   const TargetRegisterClass *GPRRC =
-      Subtarget->isGP64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+      Subtarget.isGP64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
   unsigned EltLog2Size;
   unsigned InsertOp = 0;
   unsigned InsveOp = 0;
@@ -3125,7 +3135,8 @@
 MachineBasicBlock *
 MipsSETargetLowering::emitFILL_FW(MachineInstr *MI,
                                   MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
@@ -3154,9 +3165,10 @@
 MachineBasicBlock *
 MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
                                   MachineBasicBlock *BB) const {
-  assert(Subtarget->isFP64bit());
+  assert(Subtarget.isFP64bit());
 
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
   unsigned Wd = MI->getOperand(0).getReg();
@@ -3184,7 +3196,8 @@
 MachineBasicBlock *
 MipsSETargetLowering::emitFEXP2_W_1(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   const TargetRegisterClass *RC = &Mips::MSA128WRegClass;
   unsigned Ws1 = RegInfo.createVirtualRegister(RC);
@@ -3213,7 +3226,8 @@
 MachineBasicBlock *
 MipsSETargetLowering::emitFEXP2_D_1(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   const TargetRegisterClass *RC = &Mips::MSA128DRegClass;
   unsigned Ws1 = RegInfo.createVirtualRegister(RC);

diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index 13ef6fc..d44f8d8 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSSEISELLOWERING_H
-#define MIPSSEISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEISELLOWERING_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEISELLOWERING_H
 
 #include "MipsISelLowering.h"
 #include "MipsRegisterInfo.h"
@@ -20,7 +20,8 @@
 namespace llvm {
   class MipsSETargetLowering : public MipsTargetLowering  {
   public:
-    explicit MipsSETargetLowering(MipsTargetMachine &TM);
+    explicit MipsSETargetLowering(const MipsTargetMachine &TM,
+                                  const MipsSubtarget &STI);
 
     /// \brief Enable MSA support for the given integer type and Register
     /// class.
@@ -30,8 +31,9 @@
     void addMSAFloatType(MVT::SimpleValueType Ty,
                          const TargetRegisterClass *RC);
 
-    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS = 0,
-                                       bool *Fast = nullptr) const override;
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS = 0,
+                                        unsigned Align = 1,
+                                        bool *Fast = nullptr) const override;
 
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
@@ -49,15 +51,15 @@
     const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
 
   private:
-    bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
-                                     unsigned NextStackOffset,
-                                     const MipsFunctionInfo& FI) const override;
+    bool isEligibleForTailCallOptimization(
+        const CCState &CCInfo, unsigned NextStackOffset,
+        const MipsFunctionInfo &FI) const override;
 
     void
     getOpndList(SmallVectorImpl<SDValue> &Ops,
                 std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
                 bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
-                CallLoweringInfo &CLI, SDValue Callee,
+                bool IsCallReloc, CallLoweringInfo &CLI, SDValue Callee,
                 SDValue Chain) const override;
 
     SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
@@ -112,4 +114,4 @@
   };
 }
 
-#endif // MipsSEISELLOWERING_H
+#endif

diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 32da749..16bea8b 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp

@@ -24,11 +24,10 @@
 
 using namespace llvm;
 
-MipsSEInstrInfo::MipsSEInstrInfo(MipsTargetMachine &tm)
-  : MipsInstrInfo(tm,
-                  tm.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J),
-    RI(*tm.getSubtargetImpl()),
-    IsN64(tm.getSubtarget<MipsSubtarget>().isABI_N64()) {}
+MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI)
+    : MipsInstrInfo(STI, STI.getRelocationModel() == Reloc::PIC_ ? Mips::B
+                                                                 : Mips::J),
+      RI(STI), IsN64(STI.isABI_N64()) {}
 
 const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
   return RI;
@@ -84,7 +83,7 @@
                                   unsigned DestReg, unsigned SrcReg,
                                   bool KillSrc) const {
   unsigned Opc = 0, ZeroReg = 0;
-  bool isMicroMips = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode();
+  bool isMicroMips = Subtarget.inMicroMipsMode();
 
   if (Mips::GPR32RegClass.contains(DestReg)) { // Copy to CPU Reg.
     if (Mips::GPR32RegClass.contains(SrcReg)) {
@@ -265,7 +264,7 @@
 
 bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   MachineBasicBlock &MBB = *MI->getParent();
-  bool isMicroMips = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode();
+  bool isMicroMips = Subtarget.inMicroMipsMode();
   unsigned Opc;
 
   switch(MI->getDesc().getOpcode()) {
@@ -360,7 +359,7 @@
 void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const {
-  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+  const MipsSubtarget &STI = Subtarget;
   DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
   unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
   unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
@@ -380,7 +379,7 @@
                                MachineBasicBlock::iterator II, DebugLoc DL,
                                unsigned *NewImm) const {
   MipsAnalyzeImmediate AnalyzeImm;
-  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+  const MipsSubtarget &STI = Subtarget;
   MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
   unsigned Size = STI.isABI_N64() ? 64 : 32;
   unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi;
@@ -429,8 +428,6 @@
 
 void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I) const {
-  const auto &Subtarget = TM.getSubtarget<MipsSubtarget>();
-
   if (Subtarget.isGP64bit())
     BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
         .addReg(Mips::RA_64);
@@ -521,8 +518,17 @@
   unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo;
   unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
 
-  if (SubIdx == Mips::sub_hi && FP64) {
-    // FIXME: The .addReg(SrcReg, RegState::Implicit) is a white lie used to
+  // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2()));
+
+  // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg()));
+
+  if (SubIdx == Mips::sub_hi && Subtarget.hasMTHC1()) {
+    // FIXME: Strictly speaking MFHC1 only reads the top 32-bits however, we
+    //        claim to read the whole 64-bits as part of a white lie used to
     //        temporarily work around a widespread bug in the -mfp64 support.
     //        The problem is that none of the 32-bit fpu ops mention the fact
     //        that they clobber the upper 32-bits of the 64-bit FPR. Fixing that
@@ -533,8 +539,8 @@
     //        We therefore pretend that it reads the bottom 32-bits to
     //        artificially create a dependency and prevent the scheduler
     //        changing the behaviour of the code.
-    BuildMI(MBB, I, dl, get(Mips::MFHC1), DstReg).addReg(SubReg).addReg(
-        SrcReg, RegState::Implicit);
+    BuildMI(MBB, I, dl, get(FP64 ? Mips::MFHC1_D64 : Mips::MFHC1_D32), DstReg)
+        .addReg(SrcReg);
   } else
     BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg);
 }
@@ -547,29 +553,34 @@
   const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
   DebugLoc dl = I->getDebugLoc();
   const TargetRegisterInfo &TRI = getRegisterInfo();
-  bool HasMTHC1 = TM.getSubtarget<MipsSubtarget>().hasMips32r2() ||
-                  TM.getSubtarget<MipsSubtarget>().hasMips32r6();
 
   // When mthc1 is available, use:
   //   mtc1 Lo, $fp
   //   mthc1 Hi, $fp
   //
-  // Otherwise, for FP64:
+  // Otherwise, for O32 FPXX ABI:
   //   spill + reload via ldc1
-  // This has not been implemented since FP64 on MIPS32 and earlier is not
-  // supported.
+  // This case is handled by the frame lowering code.
   //
   // Otherwise, for FP32:
   //   mtc1 Lo, $fp
   //   mtc1 Hi, $fp + 1
+  //
+  // The case where dmtc1 is available doesn't need to be handled here
+  // because it never creates a BuildPairF64 node.
+
+  // FPXX on MIPS-II or MIPS32r1 should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isABI_FPXX() && !Subtarget.hasMips32r2()));
+
+  // FP64A (FP64 with nooddspreg) should have been handled with a spill/reload
+  // in MipsSEFrameLowering.cpp.
+  assert(!(Subtarget.isFP64bit() && !Subtarget.useOddSPReg()));
 
   BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo))
     .addReg(LoReg);
 
-  if (HasMTHC1 || FP64) {
-    assert(TM.getSubtarget<MipsSubtarget>().hasMips32r2() &&
-           "MTHC1 requires MIPS32r2");
-
+  if (Subtarget.hasMTHC1()) {
     // FIXME: The .addReg(DstReg) is a white lie used to temporarily work
     //        around a widespread bug in the -mfp64 support.
     //        The problem is that none of the 32-bit fpu ops mention the fact
@@ -584,7 +595,9 @@
     BuildMI(MBB, I, dl, get(FP64 ? Mips::MTHC1_D64 : Mips::MTHC1_D32), DstReg)
         .addReg(DstReg)
         .addReg(HiReg);
-  } else
+  } else if (Subtarget.isABI_FPXX())
+    llvm_unreachable("BuildPairF64 not expanded in frame lowering code!");
+  else
     BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi))
       .addReg(HiReg);
 }
@@ -594,28 +607,34 @@
   // This pseudo instruction is generated as part of the lowering of
   // ISD::EH_RETURN. We convert it to a stack increment by OffsetReg, and
   // indirect jump to TargetReg
-  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
-  unsigned ADDU = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
-  unsigned SP = STI.isGP64bit() ? Mips::SP_64 : Mips::SP;
-  unsigned RA = STI.isGP64bit() ? Mips::RA_64 : Mips::RA;
-  unsigned T9 = STI.isGP64bit() ? Mips::T9_64 : Mips::T9;
-  unsigned ZERO = STI.isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
+  unsigned ADDU = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+  unsigned SP = Subtarget.isGP64bit() ? Mips::SP_64 : Mips::SP;
+  unsigned RA = Subtarget.isGP64bit() ? Mips::RA_64 : Mips::RA;
+  unsigned T9 = Subtarget.isGP64bit() ? Mips::T9_64 : Mips::T9;
+  unsigned ZERO = Subtarget.isGP64bit() ? Mips::ZERO_64 : Mips::ZERO;
   unsigned OffsetReg = I->getOperand(0).getReg();
   unsigned TargetReg = I->getOperand(1).getReg();
 
   // addu $ra, $v0, $zero
   // addu $sp, $sp, $v1
   // jr   $ra (via RetRA)
+  const TargetMachine &TM = MBB.getParent()->getTarget();
   if (TM.getRelocationModel() == Reloc::PIC_)
-    BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), T9)
-        .addReg(TargetReg).addReg(ZERO);
-  BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), RA)
-      .addReg(TargetReg).addReg(ZERO);
-  BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), SP)
-      .addReg(SP).addReg(OffsetReg);
+    BuildMI(MBB, I, I->getDebugLoc(),
+            TM.getSubtargetImpl()->getInstrInfo()->get(ADDU), T9)
+        .addReg(TargetReg)
+        .addReg(ZERO);
+  BuildMI(MBB, I, I->getDebugLoc(),
+          TM.getSubtargetImpl()->getInstrInfo()->get(ADDU), RA)
+      .addReg(TargetReg)
+      .addReg(ZERO);
+  BuildMI(MBB, I, I->getDebugLoc(),
+          TM.getSubtargetImpl()->getInstrInfo()->get(ADDU), SP)
+      .addReg(SP)
+      .addReg(OffsetReg);
   expandRetRA(MBB, I);
 }
 
-const MipsInstrInfo *llvm::createMipsSEInstrInfo(MipsTargetMachine &TM) {
-  return new MipsSEInstrInfo(TM);
+const MipsInstrInfo *llvm::createMipsSEInstrInfo(const MipsSubtarget &STI) {
+  return new MipsSEInstrInfo(STI);
 }

diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index 9ac94ce..b2d2301 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSSEINSTRUCTIONINFO_H
-#define MIPSSEINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEINSTRINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEINSTRINFO_H
 
 #include "MipsInstrInfo.h"
 #include "MipsSERegisterInfo.h"
@@ -24,7 +24,7 @@
   bool IsN64;
 
 public:
-  explicit MipsSEInstrInfo(MipsTargetMachine &TM);
+  explicit MipsSEInstrInfo(const MipsSubtarget &STI);
 
   const MipsRegisterInfo &getRegisterInfo() const override;
 

diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 0af1a6b..55c6638 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp

@@ -172,7 +172,7 @@
       unsigned Reg = RegInfo.createVirtualRegister(RC);
       const MipsSEInstrInfo &TII =
           *static_cast<const MipsSEInstrInfo *>(
-               MBB.getParent()->getTarget().getInstrInfo());
+              MBB.getParent()->getSubtarget().getInstrInfo());
       BuildMI(MBB, II, DL, TII.get(ADDiu), Reg).addReg(FrameReg).addImm(Offset);
 
       FrameReg = Reg;
@@ -187,7 +187,7 @@
       unsigned NewImm = 0;
       const MipsSEInstrInfo &TII =
           *static_cast<const MipsSEInstrInfo *>(
-               MBB.getParent()->getTarget().getInstrInfo());
+              MBB.getParent()->getSubtarget().getInstrInfo());
       unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL,
                                        OffsetBitSize == 16 ? &NewImm : nullptr);
       BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)

diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h
index f2f3a7e..6b70d07 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.h
+++ b/lib/Target/Mips/MipsSERegisterInfo.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSSEREGISTERINFO_H
-#define MIPSSEREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSEREGISTERINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSEREGISTERINFO_H
 
 #include "MipsRegisterInfo.h"
 

diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.h b/lib/Target/Mips/MipsSelectionDAGInfo.h
index 2b3d527..061423f 100644
--- a/lib/Target/Mips/MipsSelectionDAGInfo.h
+++ b/lib/Target/Mips/MipsSelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSSELECTIONDAGINFO_H
-#define MIPSSELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 

diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 693daa3..8768b12 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp

@@ -58,6 +58,10 @@
   cl::desc("MIPS: mips16 constant islands enable."),
   cl::init(true));
 
+static cl::opt<bool>
+GPOpt("mgpopt", cl::Hidden,
+      cl::desc("MIPS: Enable gp-relative addressing of small data items"));
+
 /// Select the Mips CPU for the given triple and cpu name.
 /// FIXME: Merge with the copy in MipsMCTargetDesc.cpp
 static StringRef selectMipsCPU(Triple TT, StringRef CPU) {
@@ -104,31 +108,32 @@
 
 MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
                              const std::string &FS, bool little,
-                             Reloc::Model _RM, MipsTargetMachine *_TM)
-    : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(Mips32),
-      MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false),
-      IsFPXX(false), IsFP64bit(false), UseOddSPReg(true), IsNaN2008bit(false),
-      IsGP64bit(false), HasVFPU(false), HasCnMips(false), IsLinux(true),
-      HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false),
-      HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
-      InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
-      HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
-      HasMSA(false), RM(_RM), OverrideMode(NoOverride), TM(_TM),
-      TargetTriple(TT),
+                             const MipsTargetMachine *_TM)
+    : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
+      ABI(MipsABIInfo::Unknown()), IsLittle(little), IsSingleFloat(false),
+      IsFPXX(false), NoABICalls(false), IsFP64bit(false), UseOddSPReg(true),
+      IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false), HasCnMips(false),
+      IsLinux(true), HasMips3_32(false), HasMips3_32r2(false),
+      HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false),
+      InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
+      InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
+      AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
+      HasMSA(false), TM(_TM), TargetTriple(TT),
       DL(computeDataLayout(initializeSubtargetDependencies(CPU, FS, TM))),
-      TSInfo(DL), JITInfo(), InstrInfo(MipsInstrInfo::create(*TM)),
-      FrameLowering(MipsFrameLowering::create(*TM, *this)),
-      TLInfo(MipsTargetLowering::create(*TM)) {
+      TSInfo(DL), InstrInfo(MipsInstrInfo::create(*this)),
+      FrameLowering(MipsFrameLowering::create(*this)),
+      TLInfo(MipsTargetLowering::create(*TM, *this)) {
 
   PreviousInMips16Mode = InMips16Mode;
 
-  // Don't even attempt to generate code for MIPS-I, MIPS-II, MIPS-III, and
-  // MIPS-V. They have not been tested and currently exist for the integrated
+  if (MipsArchVersion == MipsDefault)
+    MipsArchVersion = Mips32;
+
+  // Don't even attempt to generate code for MIPS-I, MIPS-III and MIPS-V.
+  // They have not been tested and currently exist for the integrated
   // assembler only.
   if (MipsArchVersion == Mips1)
     report_fatal_error("Code generation for MIPS-I is not implemented", false);
-  if (MipsArchVersion == Mips2)
-    report_fatal_error("Code generation for MIPS-II is not implemented", false);
   if (MipsArchVersion == Mips3)
     report_fatal_error("Code generation for MIPS-III is not implemented",
                        false);
@@ -136,7 +141,7 @@
     report_fatal_error("Code generation for MIPS-V is not implemented", false);
 
   // Assert exactly one ABI was chosen.
-  assert(MipsABI != UnknownABI);
+  assert(ABI.IsKnown());
   assert((((getFeatureBits() & Mips::FeatureO32) != 0) +
           ((getFeatureBits() & Mips::FeatureEABI) != 0) +
           ((getFeatureBits() & Mips::FeatureN32) != 0) +
@@ -153,9 +158,10 @@
                        false);
 
   if (!isABI_O32() && !useOddSPReg())
-    report_fatal_error("-mattr=+nooddspreg is not currently permitted for a "
-                       "the O32 ABI.",
-                       false);
+    report_fatal_error("-mattr=+nooddspreg requires the O32 ABI.", false);
+
+  if (IsFPXX && (isABI_N32() || isABI_N64()))
+    report_fatal_error("FPXX is not permitted for the N32/N64 ABI's.", false);
 
   if (hasMips32r6()) {
     StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
@@ -170,21 +176,29 @@
   if (TT.find("linux") == std::string::npos)
     IsLinux = false;
 
+  if (NoABICalls && TM->getRelocationModel() == Reloc::PIC_)
+    report_fatal_error("position-independent code requires '-mabicalls'");
+
   // Set UseSmallSection.
-  // TODO: Investigate the IsLinux check. I suspect it's really checking for
-  //       bare-metal.
-  UseSmallSection = !IsLinux && (RM == Reloc::Static);
+  UseSmallSection = GPOpt;
+  if (!NoABICalls && GPOpt) {
+    errs() << "warning: cannot use small-data accesses for '-mabicalls'"
+           << "\n";
+    UseSmallSection = false;
+  }
 }
 
-bool
-MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                                    TargetSubtargetInfo::AntiDepBreakMode &Mode,
-                                     RegClassVector &CriticalPathRCs) const {
-  Mode = TargetSubtargetInfo::ANTIDEP_NONE;
+/// This overrides the PostRAScheduler bit in the SchedModel for any CPU.
+bool MipsSubtarget::enablePostMachineScheduler() const { return true; }
+
+void MipsSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
   CriticalPathRCs.clear();
-  CriticalPathRCs.push_back(isGP64bit() ? &Mips::GPR64RegClass
-                                        : &Mips::GPR32RegClass);
-  return OptLevel >= CodeGenOpt::Aggressive;
+  CriticalPathRCs.push_back(isGP64bit() ?
+                            &Mips::GPR64RegClass : &Mips::GPR32RegClass);
+}
+
+CodeGenOpt::Level MipsSubtarget::getOptLevelToEnablePostRAScheduler() const {
+  return CodeGenOpt::Aggressive;
 }
 
 MipsSubtarget &
@@ -197,100 +211,13 @@
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUName);
 
-  if (InMips16Mode && !TM->Options.UseSoftFloat) {
-    // Hard float for mips16 means essentially to compile as soft float
-    // but to use a runtime library for soft float that is written with
-    // native mips32 floating point instructions (those runtime routines
-    // run in mips32 hard float mode).
-    TM->Options.UseSoftFloat = true;
-    TM->Options.FloatABIType = FloatABI::Soft;
+  if (InMips16Mode && !TM->Options.UseSoftFloat)
     InMips16HardFloat = true;
-  }
 
   return *this;
 }
 
-//FIXME: This logic for reseting the subtarget along with
-// the helper classes can probably be simplified but there are a lot of
-// cases so we will defer rewriting this to later.
-//
-void MipsSubtarget::resetSubtarget(MachineFunction *MF) {
-  bool ChangeToMips16 = false, ChangeToNoMips16 = false;
-  DEBUG(dbgs() << "resetSubtargetFeatures" << "\n");
-  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
-  ChangeToMips16 = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                                        "mips16");
-  ChangeToNoMips16 = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                                        "nomips16");
-  assert (!(ChangeToMips16 & ChangeToNoMips16) &&
-          "mips16 and nomips16 specified on the same function");
-  if (ChangeToMips16) {
-    if (PreviousInMips16Mode)
-      return;
-    OverrideMode = Mips16Override;
-    PreviousInMips16Mode = true;
-    setHelperClassesMips16();
-    return;
-  } else if (ChangeToNoMips16) {
-    if (!PreviousInMips16Mode)
-      return;
-    OverrideMode = NoMips16Override;
-    PreviousInMips16Mode = false;
-    setHelperClassesMipsSE();
-    return;
-  } else {
-    if (OverrideMode == NoOverride)
-      return;
-    OverrideMode = NoOverride;
-    DEBUG(dbgs() << "back to default" << "\n");
-    if (inMips16Mode() && !PreviousInMips16Mode) {
-      setHelperClassesMips16();
-      PreviousInMips16Mode = true;
-    } else if (!inMips16Mode() && PreviousInMips16Mode) {
-      setHelperClassesMipsSE();
-      PreviousInMips16Mode = false;
-    }
-    return;
-  }
-}
-
-void MipsSubtarget::setHelperClassesMips16() {
-  InstrInfoSE.swap(InstrInfo);
-  FrameLoweringSE.swap(FrameLowering);
-  TLInfoSE.swap(TLInfo);
-  if (!InstrInfo16) {
-    InstrInfo.reset(MipsInstrInfo::create(*TM));
-    FrameLowering.reset(MipsFrameLowering::create(*TM, *this));
-    TLInfo.reset(MipsTargetLowering::create(*TM));
-  } else {
-    InstrInfo16.swap(InstrInfo);
-    FrameLowering16.swap(FrameLowering);
-    TLInfo16.swap(TLInfo);
-  }
-  assert(TLInfo && "null target lowering 16");
-  assert(InstrInfo && "null instr info 16");
-  assert(FrameLowering && "null frame lowering 16");
-}
-
-void MipsSubtarget::setHelperClassesMipsSE() {
-  InstrInfo16.swap(InstrInfo);
-  FrameLowering16.swap(FrameLowering);
-  TLInfo16.swap(TLInfo);
-  if (!InstrInfoSE) {
-    InstrInfo.reset(MipsInstrInfo::create(*TM));
-    FrameLowering.reset(MipsFrameLowering::create(*TM, *this));
-    TLInfo.reset(MipsTargetLowering::create(*TM));
-  } else {
-    InstrInfoSE.swap(InstrInfo);
-    FrameLoweringSE.swap(FrameLowering);
-    TLInfoSE.swap(TLInfo);
-  }
-  assert(TLInfo && "null target lowering in SE");
-  assert(InstrInfo && "null instr info SE");
-  assert(FrameLowering && "null frame lowering SE");
-}
-
-bool MipsSubtarget::mipsSEUsesSoftFloat() const {
+bool MipsSubtarget::abiUsesSoftFloat() const {
   return TM->Options.UseSoftFloat && !InMips16HardFloat;
 }
 
@@ -298,3 +225,7 @@
   DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n");
   return Mips16ConstantIslands;
 }
+
+Reloc::Model MipsSubtarget::getRelocationModel() const {
+  return TM->getRelocationModel();
+}

diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index a3dcf03..bff9013 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h

@@ -11,18 +11,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSSUBTARGET_H
-#define MIPSSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSSUBTARGET_H
+#define LLVM_LIB_TARGET_MIPS_MIPSSUBTARGET_H
 
 #include "MipsFrameLowering.h"
 #include "MipsISelLowering.h"
 #include "MipsInstrInfo.h"
-#include "MipsJITInfo.h"
 #include "MipsSelectionDAGInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include "MipsABIInfo.h"
 #include <string>
 
 #define GET_SUBTARGETINFO_HEADER
@@ -36,14 +36,8 @@
 class MipsSubtarget : public MipsGenSubtargetInfo {
   virtual void anchor();
 
-public:
-  // NOTE: O64 will not be supported.
-  enum MipsABIEnum {
-    UnknownABI, O32, N32, N64, EABI
-  };
-
-protected:
   enum MipsArchEnum {
+    MipsDefault,
     Mips1, Mips2, Mips32, Mips32r2, Mips32r6, Mips3, Mips4, Mips5, Mips64,
     Mips64r2, Mips64r6
   };
@@ -51,8 +45,8 @@
   // Mips architecture version
   MipsArchEnum MipsArchVersion;
 
-  // Mips supported ABIs
-  MipsABIEnum MipsABI;
+  // Selected ABI
+  MipsABIInfo ABI;
 
   // IsLittle - The target is Little Endian
   bool IsLittle;
@@ -65,6 +59,9 @@
   // IsFPXX - MIPS O32 modeless ABI.
   bool IsFPXX;
 
+  // NoABICalls - Disable SVR4-style position-independent code.
+  bool NoABICalls;
+
   // IsFP64bit - The target processor has 64-bit floating point registers.
   bool IsFP64bit;
 
@@ -135,48 +132,39 @@
 
   InstrItineraryData InstrItins;
 
-  // Relocation Model
-  Reloc::Model RM;
-
   // We can override the determination of whether we are in mips16 mode
   // as from the command line
   enum {NoOverride, Mips16Override, NoMips16Override} OverrideMode;
 
-  MipsTargetMachine *TM;
+  const MipsTargetMachine *TM;
 
   Triple TargetTriple;
 
   const DataLayout DL; // Calculates type size & alignment
   const MipsSelectionDAGInfo TSInfo;
-  MipsJITInfo JITInfo;
   std::unique_ptr<const MipsInstrInfo> InstrInfo;
   std::unique_ptr<const MipsFrameLowering> FrameLowering;
   std::unique_ptr<const MipsTargetLowering> TLInfo;
-  std::unique_ptr<const MipsInstrInfo> InstrInfo16;
-  std::unique_ptr<const MipsFrameLowering> FrameLowering16;
-  std::unique_ptr<const MipsTargetLowering> TLInfo16;
-  std::unique_ptr<const MipsInstrInfo> InstrInfoSE;
-  std::unique_ptr<const MipsFrameLowering> FrameLoweringSE;
-  std::unique_ptr<const MipsTargetLowering> TLInfoSE;
 
 public:
-  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                             AntiDepBreakMode& Mode,
-                             RegClassVector& CriticalPathRCs) const override;
+  /// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+  bool enablePostMachineScheduler() const override;
+  void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
+  CodeGenOpt::Level getOptLevelToEnablePostRAScheduler() const override;
 
   /// Only O32 and EABI supported right now.
-  bool isABI_EABI() const { return MipsABI == EABI; }
-  bool isABI_N64() const { return MipsABI == N64; }
-  bool isABI_N32() const { return MipsABI == N32; }
-  bool isABI_O32() const { return MipsABI == O32; }
-  bool isABI_FPXX() const { return false; } // TODO: add check for FPXX
-  unsigned getTargetABI() const { return MipsABI; }
+  bool isABI_EABI() const { return ABI.IsEABI(); }
+  bool isABI_N64() const { return ABI.IsN64(); }
+  bool isABI_N32() const { return ABI.IsN32(); }
+  bool isABI_O32() const { return ABI.IsO32(); }
+  bool isABI_FPXX() const { return isABI_O32() && IsFPXX; }
+  const MipsABIInfo &getABI() const { return ABI; }
 
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   MipsSubtarget(const std::string &TT, const std::string &CPU,
-                const std::string &FS, bool little, Reloc::Model RM,
-                MipsTargetMachine *TM);
+                const std::string &FS, bool little,
+                const MipsTargetMachine *TM);
 
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
@@ -209,30 +197,25 @@
   bool hasCnMips() const { return HasCnMips; }
 
   bool isLittle() const { return IsLittle; }
+  bool isABICalls() const { return !NoABICalls; }
   bool isFPXX() const { return IsFPXX; }
   bool isFP64bit() const { return IsFP64bit; }
   bool useOddSPReg() const { return UseOddSPReg; }
+  bool noOddSPReg() const { return !UseOddSPReg; }
   bool isNaN2008() const { return IsNaN2008bit; }
-  bool isNotFP64bit() const { return !IsFP64bit; }
   bool isGP64bit() const { return IsGP64bit; }
   bool isGP32bit() const { return !IsGP64bit; }
+  unsigned getGPRSizeInBytes() const { return isGP64bit() ? 8 : 4; }
   bool isSingleFloat() const { return IsSingleFloat; }
-  bool isNotSingleFloat() const { return !IsSingleFloat; }
   bool hasVFPU() const { return HasVFPU; }
-  bool inMips16Mode() const {
-    switch (OverrideMode) {
-    case NoOverride:
-      return InMips16Mode;
-    case Mips16Override:
-      return true;
-    case NoMips16Override:
-      return false;
-    }
-    llvm_unreachable("Unexpected mode");
-  }
+  bool inMips16Mode() const { return InMips16Mode; }
   bool inMips16ModeDefault() const {
     return InMips16Mode;
   }
+  // Hard float for mips16 means essentially to compile as soft float
+  // but to use a runtime library for soft float that is written with
+  // native mips32 floating point instructions (those runtime routines
+  // run in mips32 hard float mode).
   bool inMips16HardFloat() const {
     return inMips16Mode() && InMips16HardFloat;
   }
@@ -245,7 +228,7 @@
 
   bool hasStandardEncoding() const { return !inMips16Mode(); }
 
-  bool mipsSEUsesSoftFloat() const;
+  bool abiUsesSoftFloat() const;
 
   bool enableLongBranchPass() const {
     return hasStandardEncoding() || allowMixed16_32();
@@ -253,15 +236,14 @@
 
   /// Features related to the presence of specific instructions.
   bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); }
+  bool hasMTHC1() const { return hasMips32r2(); }
 
-  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
   bool allowMixed16_32() const { return inMips16ModeDefault() |
                                         AllowMixed16_32;}
 
   bool os16() const { return Os16;};
 
   bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
-  bool isNotTargetNaCl() const { return !TargetTriple.isOSNaCl(); }
 
   // for now constant islands are on for the whole compilation unit but we only
   // really use them if in addition we are in mips16 mode
@@ -270,10 +252,7 @@
   unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
 
   // Grab relocation model
-  Reloc::Model getRelocationModel() const {return RM;}
-
-  /// \brief Reset the subtarget for the Mips target.
-  void resetSubtarget(MachineFunction *MF);
+  Reloc::Model getRelocationModel() const;
 
   MipsSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
                                                  const TargetMachine *TM);
@@ -289,17 +268,23 @@
   void setHelperClassesMips16();
   void setHelperClassesMipsSE();
 
-  MipsJITInfo *getJITInfo() { return &JITInfo; }
-  const MipsSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
-  const DataLayout *getDataLayout() const { return &DL; }
-  const MipsInstrInfo *getInstrInfo() const { return InstrInfo.get(); }
-  const TargetFrameLowering *getFrameLowering() const {
+  const MipsSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const MipsInstrInfo *getInstrInfo() const override { return InstrInfo.get(); }
+  const TargetFrameLowering *getFrameLowering() const override {
     return FrameLowering.get();
   }
-  const MipsRegisterInfo *getRegisterInfo() const {
+  const MipsRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo->getRegisterInfo();
   }
-  const MipsTargetLowering *getTargetLowering() const { return TLInfo.get(); }
+  const MipsTargetLowering *getTargetLowering() const override {
+    return TLInfo.get();
+  }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
 };
 } // End llvm namespace
 

diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 425dbf1..33280e3 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp

@@ -26,6 +26,7 @@
 #include "MipsSEISelDAGToDAG.h"
 #include "MipsSEISelLowering.h"
 #include "MipsSEInstrInfo.h"
+#include "MipsTargetObjectFile.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/PassManager.h"
@@ -56,10 +57,20 @@
                                      Reloc::Model RM, CodeModel::Model CM,
                                      CodeGenOpt::Level OL, bool isLittle)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-      Subtarget(TT, CPU, FS, isLittle, RM, this) {
+      isLittle(isLittle),
+      TLOF(make_unique<MipsTargetObjectFile>()),
+      Subtarget(nullptr),
+      DefaultSubtarget(TT, CPU, FS, isLittle, this),
+      NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
+                        isLittle, this),
+      Mips16Subtarget(TT, CPU, FS.empty() ? "+mips16" : FS.str() + ",+mips16",
+                      isLittle, this) {
+  Subtarget = &DefaultSubtarget;
   initAsmInfo();
 }
 
+MipsTargetMachine::~MipsTargetMachine() {}
+
 void MipsebTargetMachine::anchor() { }
 
 MipsebTargetMachine::
@@ -78,6 +89,63 @@
                     CodeGenOpt::Level OL)
   : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
 
+const MipsSubtarget *
+MipsTargetMachine::getSubtargetImpl(const Function &F) const {
+  AttributeSet FnAttrs = F.getAttributes();
+  Attribute CPUAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+  Attribute FSAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+  bool hasMips16Attr =
+      !FnAttrs.getAttribute(AttributeSet::FunctionIndex, "mips16")
+           .hasAttribute(Attribute::None);
+  bool hasNoMips16Attr =
+      !FnAttrs.getAttribute(AttributeSet::FunctionIndex, "nomips16")
+           .hasAttribute(Attribute::None);
+
+  // FIXME: This is related to the code below to reset the target options,
+  // we need to know whether or not the soft float flag is set on the
+  // function before we can generate a subtarget. We also need to use
+  // it as a key for the subtarget since that can be the only difference
+  // between two functions.
+  Attribute SFAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
+  bool softFloat = !SFAttr.hasAttribute(Attribute::None)
+                       ? SFAttr.getValueAsString() == "true"
+                       : Options.UseSoftFloat;
+
+  if (hasMips16Attr)
+    FS += FS.empty() ? "+mips16" : ",+mips16";
+  else if (hasNoMips16Attr)
+    FS += FS.empty() ? "-mips16" : ",-mips16";
+
+  auto &I = SubtargetMap[CPU + FS + (softFloat ? "use-soft-float=true"
+                                               : "use-soft-float=false")];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<MipsSubtarget>(TargetTriple, CPU, FS, isLittle, this);
+  }
+  return I.get();
+}
+
+void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
+  DEBUG(dbgs() << "resetSubtarget\n");
+
+  Subtarget = const_cast<MipsSubtarget *>(getSubtargetImpl(*MF->getFunction()));
+  MF->setSubtarget(Subtarget);
+  return;
+}
+
 namespace {
 /// Mips Code Generator Pass Configuration Options.
 class MipsPassConfig : public TargetPassConfig {
@@ -115,22 +183,18 @@
 
 void MipsPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
+  addPass(createAtomicExpandPass(&getMipsTargetMachine()));
   if (getMipsSubtarget().os16())
     addPass(createMipsOs16(getMipsTargetMachine()));
   if (getMipsSubtarget().inMips16HardFloat())
     addPass(createMips16HardFloat(getMipsTargetMachine()));
-  addPass(createPartiallyInlineLibCallsPass());
 }
 // Install an instruction selector pass using
 // the ISelDag to gen Mips code.
 bool MipsPassConfig::addInstSelector() {
-  if (getMipsSubtarget().allowMixed16_32()) {
-    addPass(createMipsModuleISelDag(getMipsTargetMachine()));
-    addPass(createMips16ISelDag(getMipsTargetMachine()));
-    addPass(createMipsSEISelDag(getMipsTargetMachine()));
-  } else {
-    addPass(createMipsISelDag(getMipsTargetMachine()));
-  }
+  addPass(createMipsModuleISelDag(getMipsTargetMachine()));
+  addPass(createMips16ISelDag(getMipsTargetMachine()));
+  addPass(createMipsSEISelDag(getMipsTargetMachine()));
   return false;
 }
 
@@ -149,7 +213,7 @@
 }
 
 void MipsTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  if (Subtarget.allowMixed16_32()) {
+  if (Subtarget->allowMixed16_32()) {
     DEBUG(errs() << "No ");
     //FIXME: The Basic Target Transform Info
     // pass needs to become a function pass instead of
@@ -166,21 +230,8 @@
 // print out the code after the passes.
 bool MipsPassConfig::addPreEmitPass() {
   MipsTargetMachine &TM = getMipsTargetMachine();
-  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
   addPass(createMipsDelaySlotFillerPass(TM));
-
-  if (Subtarget.enableLongBranchPass())
-    addPass(createMipsLongBranchPass(TM));
-  if (Subtarget.inMips16Mode() ||
-      Subtarget.allowMixed16_32())
-    addPass(createMipsConstantIslandPass(TM));
-
+  addPass(createMipsLongBranchPass(TM));
+  addPass(createMipsConstantIslandPass(TM));
   return true;
 }
-
-bool MipsTargetMachine::addCodeEmitter(PassManagerBase &PM,
-                                       JITCodeEmitter &JCE) {
-  // Machine code emitter pass for Mips.
-  PM.add(createMipsJITCodeEmitterPass(*this, JCE));
-  return false;
-}

diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index a0e7d43..1349f82 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSTARGETMACHINE_H
-#define MIPSTARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
+#define LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
 
 #include "MipsSubtarget.h"
 #include "llvm/CodeGen/Passes.h"
@@ -25,48 +25,40 @@
 class MipsRegisterInfo;
 
 class MipsTargetMachine : public LLVMTargetMachine {
-  MipsSubtarget       Subtarget;
+  bool isLittle;
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  MipsSubtarget *Subtarget;
+  MipsSubtarget DefaultSubtarget;
+  MipsSubtarget NoMips16Subtarget;
+  MipsSubtarget Mips16Subtarget;
+
+  mutable StringMap<std::unique_ptr<MipsSubtarget>> SubtargetMap;
 
 public:
   MipsTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
                     const TargetOptions &Options, Reloc::Model RM,
                     CodeModel::Model CM, CodeGenOpt::Level OL, bool isLittle);
-
-  virtual ~MipsTargetMachine() {}
+  ~MipsTargetMachine() override;
 
   void addAnalysisPasses(PassManagerBase &PM) override;
 
-  const MipsInstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
+  const MipsSubtarget *getSubtargetImpl() const override {
+    if (Subtarget)
+      return Subtarget;
+    return &DefaultSubtarget;
   }
-  const TargetFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  const MipsSubtarget *getSubtargetImpl() const override { return &Subtarget; }
-  const InstrItineraryData *getInstrItineraryData() const override {
-    return Subtarget.inMips16Mode()
-               ? nullptr
-               : &getSubtargetImpl()->getInstrItineraryData();
-  }
-  MipsJITInfo *getJITInfo() override {
-    return Subtarget.getJITInfo();
-  }
-  const MipsRegisterInfo *getRegisterInfo()  const override {
-    return getSubtargetImpl()->getRegisterInfo();
-  }
-  const MipsTargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
-  const MipsSelectionDAGInfo* getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
+
+  const MipsSubtarget *getSubtargetImpl(const Function &F) const override;
+
+  /// \brief Reset the subtarget for the Mips target.
+  void resetSubtarget(MachineFunction *MF);
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
+
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 };
 
 /// MipsebTargetMachine - Mips32/64 big endian target machine.

diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index 13f9408..b56c39b 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp

@@ -24,6 +24,17 @@
             cl::desc("Small data and bss section threshold size (default=8)"),
             cl::init(8));
 
+static cl::opt<bool>
+LocalSData("mlocal-sdata", cl::Hidden,
+           cl::desc("MIPS: Use gp_rel for object-local data."),
+           cl::init(true));
+
+static cl::opt<bool>
+ExternSData("mextern-sdata", cl::Hidden,
+            cl::desc("MIPS: Use gp_rel for data that is not defined by the "
+                     "current object."),
+            cl::init(true));
+
 void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
   InitializeELF(TM.Options.UseInitArray);
@@ -37,29 +48,46 @@
     getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
                                ELF::SHF_WRITE |ELF::SHF_ALLOC,
                                SectionKind::getBSS());
+  this->TM = &TM;
 }
 
 // A address must be loaded from a small section if its size is less than the
 // small section size threshold. Data in this section must be addressed using
 // gp_rel operator.
 static bool IsInSmallSection(uint64_t Size) {
+  // gcc has traditionally not treated zero-sized objects as small data, so this
+  // is effectively part of the ABI.
   return Size > 0 && Size <= SSThreshold;
 }
 
-bool MipsTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
-                                                const TargetMachine &TM) const {
+/// Return true if this global address should be placed into small data/bss
+/// section.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM) const {
+  // We first check the case where global is a declaration, because finding
+  // section kind using getKindForGlobal() is only allowed for global
+  // definitions.
   if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
-    return false;
+    return IsGlobalInSmallSectionImpl(GV, TM);
 
   return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
 }
 
-/// IsGlobalInSmallSection - Return true if this global address should be
-/// placed into small data/bss section.
+/// Return true if this global address should be placed into small data/bss
+/// section.
 bool MipsTargetObjectFile::
 IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
                        SectionKind Kind) const {
+  return (IsGlobalInSmallSectionImpl(GV, TM) &&
+          (Kind.isDataRel() || Kind.isBSS() || Kind.isCommon()));
+}
 
+/// Return true if this global address should be placed into small data/bss
+/// section. This method does all the work, except for checking the section
+/// kind.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSectionImpl(const GlobalValue *GV,
+                           const TargetMachine &TM) const {
   const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
 
   // Return if small section is not available.
@@ -71,21 +99,20 @@
   if (!GVA)
     return false;
 
-  // We can only do this for datarel or BSS objects for now.
-  if (!Kind.isBSS() && !Kind.isDataRel())
+  // Enforce -mlocal-sdata.
+  if (!LocalSData && GV->hasLocalLinkage())
     return false;
 
-  // If this is a internal constant string, there is a special
-  // section for it, but not in small data/bss.
-  if (Kind.isMergeable1ByteCString())
+  // Enforce -mextern-sdata.
+  if (!ExternSData && ((GV->hasExternalLinkage() && GV->isDeclaration()) ||
+                       GV->hasCommonLinkage()))
     return false;
 
   Type *Ty = GV->getType()->getElementType();
-  return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
+  return IsInSmallSection(
+      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(Ty));
 }
 
-
-
 const MCSection *MipsTargetObjectFile::
 SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
                        Mangler &Mang, const TargetMachine &TM) const {
@@ -95,9 +122,27 @@
   // Handle Small Section classification here.
   if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
     return SmallBSSSection;
-  if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
+  if (Kind.isDataRel() && IsGlobalInSmallSection(GV, TM, Kind))
     return SmallDataSection;
 
   // Otherwise, we work the same as ELF.
   return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
 }
+
+/// Return true if this constant should be placed into small data section.
+bool MipsTargetObjectFile::
+IsConstantInSmallSection(const Constant *CN, const TargetMachine &TM) const {
+  return (TM.getSubtarget<MipsSubtarget>().useSmallSection() &&
+          LocalSData &&
+          IsInSmallSection(TM.getSubtargetImpl()->getDataLayout()
+                           ->getTypeAllocSize(CN->getType())));
+}
+
+const MCSection *MipsTargetObjectFile::
+getSectionForConstant(SectionKind Kind, const Constant *C) const {
+  if (IsConstantInSmallSection(C, *TM))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::getSectionForConstant(Kind, C);
+}

diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h
index 2bf5a75..3a2b298 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.h
+++ b/lib/Target/Mips/MipsTargetObjectFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
-#define LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_MIPS_MIPSTARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 
@@ -17,21 +17,30 @@
   class MipsTargetObjectFile : public TargetLoweringObjectFileELF {
     const MCSection *SmallDataSection;
     const MCSection *SmallBSSSection;
+    const TargetMachine *TM;
   public:
 
     void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
 
-
-    /// IsGlobalInSmallSection - Return true if this global address should be
-    /// placed into small data/bss section.
-    bool IsGlobalInSmallSection(const GlobalValue *GV,
-                                const TargetMachine &TM, SectionKind Kind)const;
+    /// Return true if this global address should be placed into small data/bss
+    /// section.
+    bool IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
+                                SectionKind Kind) const;
     bool IsGlobalInSmallSection(const GlobalValue *GV,
                                 const TargetMachine &TM) const;
+    bool IsGlobalInSmallSectionImpl(const GlobalValue *GV,
+                                    const TargetMachine &TM) const;
 
     const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
                                         SectionKind Kind, Mangler &Mang,
                                         const TargetMachine &TM) const override;
+
+    /// Return true if this constant should be placed into small data section.
+    bool IsConstantInSmallSection(const Constant *CN,
+                                  const TargetMachine &TM) const;
+
+    const MCSection *getSectionForConstant(SectionKind Kind,
+                                           const Constant *C) const override;
   };
 } // end namespace llvm
 

diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index 99f7d4c..c1f17933 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h

@@ -7,10 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MIPSTARGETSTREAMER_H
-#define MIPSTARGETSTREAMER_H
+#ifndef LLVM_LIB_TARGET_MIPS_MIPSTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_MIPS_MIPSTARGETSTREAMER_H
 
 #include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "MCTargetDesc/MipsABIFlagsSection.h"
 
@@ -30,6 +31,8 @@
   virtual void emitDirectiveSetNoReorder();
   virtual void emitDirectiveSetMacro();
   virtual void emitDirectiveSetNoMacro();
+  virtual void emitDirectiveSetMsa();
+  virtual void emitDirectiveSetNoMsa();
   virtual void emitDirectiveSetAt();
   virtual void emitDirectiveSetNoAt();
   virtual void emitDirectiveEnd(StringRef Name);
@@ -45,13 +48,26 @@
   virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
   virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
 
+  virtual void emitDirectiveSetArch(StringRef Arch);
+  virtual void emitDirectiveSetMips0();
+  virtual void emitDirectiveSetMips1();
+  virtual void emitDirectiveSetMips2();
+  virtual void emitDirectiveSetMips3();
+  virtual void emitDirectiveSetMips4();
+  virtual void emitDirectiveSetMips5();
+  virtual void emitDirectiveSetMips32();
   virtual void emitDirectiveSetMips32R2();
+  virtual void emitDirectiveSetMips32R6();
   virtual void emitDirectiveSetMips64();
   virtual void emitDirectiveSetMips64R2();
+  virtual void emitDirectiveSetMips64R6();
   virtual void emitDirectiveSetDsp();
+  virtual void emitDirectiveSetNoDsp();
+  virtual void emitDirectiveSetPop();
+  virtual void emitDirectiveSetPush();
 
   // PIC support
-  virtual void emitDirectiveCpload(unsigned RegNo);
+  virtual void emitDirectiveCpLoad(unsigned RegNo);
   virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
                                     const MCSymbol &Sym, bool IsReg);
 
@@ -72,8 +88,8 @@
   virtual void emitDirectiveModuleOddSPReg(bool Enabled, bool IsO32ABI);
   virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value){};
   virtual void emitMipsAbiFlags(){};
-  void setCanHaveModuleDir(bool Can) { canHaveModuleDirective = Can; }
-  bool getCanHaveModuleDir() { return canHaveModuleDirective; }
+  void forbidModuleDirective() { ModuleDirectiveAllowed = false; }
+  bool isModuleDirectiveAllowed() { return ModuleDirectiveAllowed; }
 
   // This method enables template classes to set internal abi flags
   // structure values.
@@ -87,8 +103,21 @@
 protected:
   MipsABIFlagsSection ABIFlagsSection;
 
+  bool GPRInfoSet;
+  unsigned GPRBitMask;
+  int GPROffset;
+
+  bool FPRInfoSet;
+  unsigned FPRBitMask;
+  int FPROffset;
+
+  bool FrameInfoSet;
+  int FrameOffset;
+  unsigned FrameReg;
+  unsigned ReturnReg;
+
 private:
-  bool canHaveModuleDirective;
+  bool ModuleDirectiveAllowed;
 };
 
 // This part is for ascii assembly output
@@ -106,6 +135,8 @@
   void emitDirectiveSetNoReorder() override;
   void emitDirectiveSetMacro() override;
   void emitDirectiveSetNoMacro() override;
+  void emitDirectiveSetMsa() override;
+  void emitDirectiveSetNoMsa() override;
   void emitDirectiveSetAt() override;
   void emitDirectiveSetNoAt() override;
   void emitDirectiveEnd(StringRef Name) override;
@@ -121,13 +152,26 @@
   void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
   void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
 
+  void emitDirectiveSetArch(StringRef Arch) override;
+  void emitDirectiveSetMips0() override;
+  void emitDirectiveSetMips1() override;
+  void emitDirectiveSetMips2() override;
+  void emitDirectiveSetMips3() override;
+  void emitDirectiveSetMips4() override;
+  void emitDirectiveSetMips5() override;
+  void emitDirectiveSetMips32() override;
   void emitDirectiveSetMips32R2() override;
+  void emitDirectiveSetMips32R6() override;
   void emitDirectiveSetMips64() override;
   void emitDirectiveSetMips64R2() override;
+  void emitDirectiveSetMips64R6() override;
   void emitDirectiveSetDsp() override;
+  void emitDirectiveSetNoDsp() override;
+  void emitDirectiveSetPop() override;
+  void emitDirectiveSetPush() override;
 
   // PIC support
-  virtual void emitDirectiveCpload(unsigned RegNo);
+  void emitDirectiveCpLoad(unsigned RegNo) override;
   void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
                             const MCSymbol &Sym, bool IsReg) override;
 
@@ -157,14 +201,8 @@
   void emitDirectiveSetMicroMips() override;
   void emitDirectiveSetNoMicroMips() override;
   void emitDirectiveSetMips16() override;
-  void emitDirectiveSetNoMips16() override;
 
-  void emitDirectiveSetReorder() override;
   void emitDirectiveSetNoReorder() override;
-  void emitDirectiveSetMacro() override;
-  void emitDirectiveSetNoMacro() override;
-  void emitDirectiveSetAt() override;
-  void emitDirectiveSetNoAt() override;
   void emitDirectiveEnd(StringRef Name) override;
 
   void emitDirectiveEnt(const MCSymbol &Symbol) override;
@@ -178,13 +216,8 @@
   void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
   void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
 
-  void emitDirectiveSetMips32R2() override;
-  void emitDirectiveSetMips64() override;
-  void emitDirectiveSetMips64R2() override;
-  void emitDirectiveSetDsp() override;
-
   // PIC support
-  virtual void emitDirectiveCpload(unsigned RegNo);
+  void emitDirectiveCpLoad(unsigned RegNo) override;
   void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
                             const MCSymbol &Sym, bool IsReg) override;
 

diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 4e35b18..3a4a19d 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt

@@ -9,26 +9,28 @@
 add_public_tablegen_target(NVPTXCommonTableGen)
 
 set(NVPTXCodeGen_sources
-  NVPTXFavorNonGenericAddrSpaces.cpp
-  NVPTXFrameLowering.cpp
-  NVPTXInstrInfo.cpp
-  NVPTXISelDAGToDAG.cpp
-  NVPTXISelLowering.cpp
-  NVPTXRegisterInfo.cpp
-  NVPTXSubtarget.cpp
-  NVPTXTargetMachine.cpp
-  NVPTXLowerAggrCopies.cpp
-  NVPTXutil.cpp
   NVPTXAllocaHoisting.cpp
   NVPTXAsmPrinter.cpp
-  NVPTXUtilities.cpp
-  NVVMReflect.cpp
-  NVPTXGenericToNVVM.cpp
   NVPTXAssignValidGlobalNames.cpp
-  NVPTXPrologEpilogPass.cpp
-  NVPTXMCExpr.cpp
-  NVPTXReplaceImageHandles.cpp
+  NVPTXFavorNonGenericAddrSpaces.cpp
+  NVPTXFrameLowering.cpp
+  NVPTXGenericToNVVM.cpp
+  NVPTXISelDAGToDAG.cpp
+  NVPTXISelLowering.cpp
   NVPTXImageOptimizer.cpp
+  NVPTXInstrInfo.cpp
+  NVPTXLowerAggrCopies.cpp
+  NVPTXLowerStructArgs.cpp
+  NVPTXMCExpr.cpp
+  NVPTXPrologEpilogPass.cpp
+  NVPTXRegisterInfo.cpp
+  NVPTXReplaceImageHandles.cpp
+  NVPTXSubtarget.cpp
+  NVPTXTargetMachine.cpp
+  NVPTXTargetTransformInfo.cpp
+  NVPTXUtilities.cpp
+  NVPTXutil.cpp
+  NVVMReflect.cpp
   )
 
 add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})

diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index 9618896..80b2f62 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp

@@ -57,13 +57,13 @@
     OS << "%r";
     break;
   case 4:
-    OS << "%rl";
+    OS << "%rd";
     break;
   case 5:
     OS << "%f";
     break;
   case 6:
-    OS << "%fl";
+    OS << "%fd";
     break;
   }
 

diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
index 1fb3c57..0496964 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTX_INST_PRINTER_H
-#define NVPTX_INST_PRINTER_H
+#ifndef LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
+#define LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/Support/raw_ostream.h"

diff --git a/lib/Target/NVPTX/LLVMBuild.txt b/lib/Target/NVPTX/LLVMBuild.txt
index e805aba..bc8d82e 100644
--- a/lib/Target/NVPTX/LLVMBuild.txt
+++ b/lib/Target/NVPTX/LLVMBuild.txt

@@ -28,5 +28,5 @@
 type = Library
 name = NVPTXCodeGen
 parent = NVPTX
-required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo SelectionDAG Support Target
+required_libraries = Analysis AsmPrinter CodeGen Core MC NVPTXAsmPrinter NVPTXDesc NVPTXInfo Scalar SelectionDAG Support Target
 add_to_library_groups = NVPTX

diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
index ddb122f..a72ae2e 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTXBASEINFO_H
-#define NVPTXBASEINFO_H
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXBASEINFO_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXBASEINFO_H
 
 namespace llvm {
 
@@ -84,6 +84,17 @@
 #endif
     static const char *NamedMDForAnnotations = "nvvm.annotations";
 
+namespace NVPTXII {
+enum {
+  // These must be kept in sync with TSFlags in NVPTXInstrFormats.td
+  IsTexFlag = 0x80,
+  IsSuldMask = 0x300,
+  IsSuldShift = 8,
+  IsSustFlag = 0x400,
+  IsSurfTexQueryFlag = 0x800,
+  IsTexModeUnifiedFlag = 0x1000
+};
+}
 }
 
 #endif

diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index 366341a..4fd5bdd 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp

@@ -25,7 +25,7 @@
 
 void NVPTXMCAsmInfo::anchor() {}
 
-NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) {
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(StringRef TT) {
   Triple TheTriple(TT);
   if (TheTriple.getArch() == Triple::nvptx64) {
     PointerSize = CalleeSaveStackSlotSize = 8;
@@ -33,8 +33,6 @@
 
   CommentString = "//";
 
-  HasSetDirective = false;
-
   HasSingleParameterDotFile = false;
 
   InlineAsmStart = " inline asm";

diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 7d1633f..c324286 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTX_MCASM_INFO_H
-#define NVPTX_MCASM_INFO_H
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCASMINFO_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCASMINFO_H
 
 #include "llvm/MC/MCAsmInfo.h"
 
@@ -23,8 +23,8 @@
 class NVPTXMCAsmInfo : public MCAsmInfo {
   virtual void anchor();
 public:
-  explicit NVPTXMCAsmInfo(const StringRef &TT);
+  explicit NVPTXMCAsmInfo(StringRef TT);
 };
 } // namespace llvm
 
-#endif // NVPTX_MCASM_INFO_H
+#endif

diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
index af95c76..98821d2 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTXMCTARGETDESC_H
-#define NVPTXMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCTARGETDESC_H
+#define LLVM_LIB_TARGET_NVPTX_MCTARGETDESC_NVPTXMCTARGETDESC_H
 
 namespace llvm {
 class Target;

diff --git a/lib/Target/NVPTX/ManagedStringPool.h b/lib/Target/NVPTX/ManagedStringPool.h
index f9fb059..a2d670f 100644
--- a/lib/Target/NVPTX/ManagedStringPool.h
+++ b/lib/Target/NVPTX/ManagedStringPool.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SUPPORT_MANAGED_STRING_H
-#define LLVM_SUPPORT_MANAGED_STRING_H
+#ifndef LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H
+#define LLVM_LIB_TARGET_NVPTX_MANAGEDSTRINGPOOL_H
 
 #include "llvm/ADT/SmallVector.h"
 #include <string>

diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index e74c808..13ba57e 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_NVPTX_H
-#define LLVM_TARGET_NVPTX_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTX_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTX_H
 
 #include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "llvm/ADT/StringMap.h"
@@ -59,6 +59,7 @@
   llvm_unreachable("Unknown condition code");
 }
 
+ImmutablePass *createNVPTXTargetTransformInfoPass(const NVPTXTargetMachine *TM);
 FunctionPass *
 createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
 ModulePass *createNVPTXAssignValidGlobalNamesPass();
@@ -69,6 +70,7 @@
 MachineFunctionPass *createNVPTXPrologEpilogPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
+FunctionPass *createNVPTXLowerStructArgsPass();
 
 bool isImageOrSamplerVal(const Value *, const Module *);
 

diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
index 5b61068..69fc86e 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTX_ALLOCA_HOISTING_H_
-#define NVPTX_ALLOCA_HOISTING_H_
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXALLOCAHOISTING_H
 
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/IR/DataLayout.h"
@@ -47,4 +47,4 @@
 
 } // end namespace llvm
 
-#endif // NVPTX_ALLOCA_HOISTING_H_
+#endif

diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index decf02a..35ba4f1 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp

@@ -88,12 +88,9 @@
     return;
 
   // Do we have a circular dependency?
-  if (Visiting.count(GV))
+  if (!Visiting.insert(GV).second)
     report_fatal_error("Circular dependency found in global variable set");
 
-  // Start visiting this global
-  Visiting.insert(GV);
-
   // Make sure we visit all dependents first
   DenseSet<const GlobalVariable *> Others;
   for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
@@ -140,7 +137,8 @@
     // If the code isn't optimized, there may be outstanding folding
     // opportunities. Attempt to fold the expression using DataLayout as a
     // last resort before giving up.
-    if (Constant *C = ConstantFoldConstantExpression(CE, AP.TM.getDataLayout()))
+    if (Constant *C = ConstantFoldConstantExpression(
+            CE, AP.TM.getSubtargetImpl()->getDataLayout()))
       if (C != CE)
         return LowerConstant(C, AP);
 
@@ -169,7 +167,7 @@
     report_fatal_error(OS.str());
   }
   case Instruction::GetElementPtr: {
-    const DataLayout &TD = *AP.TM.getDataLayout();
+    const DataLayout &TD = *AP.TM.getSubtargetImpl()->getDataLayout();
     // Generate a symbolic expression for the byte address
     APInt OffsetAI(TD.getPointerSizeInBits(), 0);
     cast<GEPOperator>(CE)->accumulateConstantOffset(TD, OffsetAI);
@@ -193,7 +191,7 @@
     return LowerConstant(CE->getOperand(0), AP);
 
   case Instruction::IntToPtr: {
-    const DataLayout &TD = *AP.TM.getDataLayout();
+    const DataLayout &TD = *AP.TM.getSubtargetImpl()->getDataLayout();
     // Handle casts to pointers by changing them into casts to the appropriate
     // integer type.  This promotes constant folding and simplifies this code.
     Constant *Op = CE->getOperand(0);
@@ -203,7 +201,7 @@
   }
 
   case Instruction::PtrToInt: {
-    const DataLayout &TD = *AP.TM.getDataLayout();
+    const DataLayout &TD = *AP.TM.getSubtargetImpl()->getDataLayout();
     // Support only foldable casts to/from pointers that can be eliminated by
     // changing the pointer to the appropriately sized integer type.
     Constant *Op = CE->getOperand(0);
@@ -330,253 +328,51 @@
 bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI,
                                            unsigned OpNo, MCOperand &MCOp) {
   const MachineOperand &MO = MI->getOperand(OpNo);
+  const MCInstrDesc &MCID = MI->getDesc();
 
-  switch (MI->getOpcode()) {
-  default: return false;
-  case NVPTX::TEX_1D_F32_I32:
-  case NVPTX::TEX_1D_F32_F32:
-  case NVPTX::TEX_1D_F32_F32_LEVEL:
-  case NVPTX::TEX_1D_F32_F32_GRAD:
-  case NVPTX::TEX_1D_I32_I32:
-  case NVPTX::TEX_1D_I32_F32:
-  case NVPTX::TEX_1D_I32_F32_LEVEL:
-  case NVPTX::TEX_1D_I32_F32_GRAD:
-  case NVPTX::TEX_1D_ARRAY_F32_I32:
-  case NVPTX::TEX_1D_ARRAY_F32_F32:
-  case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL:
-  case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD:
-  case NVPTX::TEX_1D_ARRAY_I32_I32:
-  case NVPTX::TEX_1D_ARRAY_I32_F32:
-  case NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL:
-  case NVPTX::TEX_1D_ARRAY_I32_F32_GRAD:
-  case NVPTX::TEX_2D_F32_I32:
-  case NVPTX::TEX_2D_F32_F32:
-  case NVPTX::TEX_2D_F32_F32_LEVEL:
-  case NVPTX::TEX_2D_F32_F32_GRAD:
-  case NVPTX::TEX_2D_I32_I32:
-  case NVPTX::TEX_2D_I32_F32:
-  case NVPTX::TEX_2D_I32_F32_LEVEL:
-  case NVPTX::TEX_2D_I32_F32_GRAD:
-  case NVPTX::TEX_2D_ARRAY_F32_I32:
-  case NVPTX::TEX_2D_ARRAY_F32_F32:
-  case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL:
-  case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD:
-  case NVPTX::TEX_2D_ARRAY_I32_I32:
-  case NVPTX::TEX_2D_ARRAY_I32_F32:
-  case NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL:
-  case NVPTX::TEX_2D_ARRAY_I32_F32_GRAD:
-  case NVPTX::TEX_3D_F32_I32:
-  case NVPTX::TEX_3D_F32_F32:
-  case NVPTX::TEX_3D_F32_F32_LEVEL:
-  case NVPTX::TEX_3D_F32_F32_GRAD:
-  case NVPTX::TEX_3D_I32_I32:
-  case NVPTX::TEX_3D_I32_F32:
-  case NVPTX::TEX_3D_I32_F32_LEVEL:
-  case NVPTX::TEX_3D_I32_F32_GRAD:
-   {
+  if (MCID.TSFlags & NVPTXII::IsTexFlag) {
     // This is a texture fetch, so operand 4 is a texref and operand 5 is
     // a samplerref
-    if (OpNo == 4) {
+    if (OpNo == 4 && MO.isImm()) {
       lowerImageHandleSymbol(MO.getImm(), MCOp);
       return true;
     }
-    if (OpNo == 5) {
+    if (OpNo == 5 && MO.isImm() && !(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) {
       lowerImageHandleSymbol(MO.getImm(), MCOp);
       return true;
     }
 
     return false;
-  }
-  case NVPTX::SULD_1D_I8_TRAP:
-  case NVPTX::SULD_1D_I16_TRAP:
-  case NVPTX::SULD_1D_I32_TRAP:
-  case NVPTX::SULD_1D_ARRAY_I8_TRAP:
-  case NVPTX::SULD_1D_ARRAY_I16_TRAP:
-  case NVPTX::SULD_1D_ARRAY_I32_TRAP:
-  case NVPTX::SULD_2D_I8_TRAP:
-  case NVPTX::SULD_2D_I16_TRAP:
-  case NVPTX::SULD_2D_I32_TRAP:
-  case NVPTX::SULD_2D_ARRAY_I8_TRAP:
-  case NVPTX::SULD_2D_ARRAY_I16_TRAP:
-  case NVPTX::SULD_2D_ARRAY_I32_TRAP:
-  case NVPTX::SULD_3D_I8_TRAP:
-  case NVPTX::SULD_3D_I16_TRAP:
-  case NVPTX::SULD_3D_I32_TRAP: {
-    // This is a V1 surface load, so operand 1 is a surfref
-    if (OpNo == 1) {
+  } else if (MCID.TSFlags & NVPTXII::IsSuldMask) {
+    unsigned VecSize =
+      1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1);
+
+    // For a surface load of vector size N, the Nth operand will be the surfref
+    if (OpNo == VecSize && MO.isImm()) {
       lowerImageHandleSymbol(MO.getImm(), MCOp);
       return true;
     }
 
     return false;
-  }
-  case NVPTX::SULD_1D_V2I8_TRAP:
-  case NVPTX::SULD_1D_V2I16_TRAP:
-  case NVPTX::SULD_1D_V2I32_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V2I8_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V2I16_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V2I32_TRAP:
-  case NVPTX::SULD_2D_V2I8_TRAP:
-  case NVPTX::SULD_2D_V2I16_TRAP:
-  case NVPTX::SULD_2D_V2I32_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V2I8_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V2I16_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V2I32_TRAP:
-  case NVPTX::SULD_3D_V2I8_TRAP:
-  case NVPTX::SULD_3D_V2I16_TRAP:
-  case NVPTX::SULD_3D_V2I32_TRAP: {
-    // This is a V2 surface load, so operand 2 is a surfref
-    if (OpNo == 2) {
-      lowerImageHandleSymbol(MO.getImm(), MCOp);
-      return true;
-    }
-
-    return false;
-  }
-  case NVPTX::SULD_1D_V4I8_TRAP:
-  case NVPTX::SULD_1D_V4I16_TRAP:
-  case NVPTX::SULD_1D_V4I32_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V4I8_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V4I16_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V4I32_TRAP:
-  case NVPTX::SULD_2D_V4I8_TRAP:
-  case NVPTX::SULD_2D_V4I16_TRAP:
-  case NVPTX::SULD_2D_V4I32_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V4I8_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V4I16_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V4I32_TRAP:
-  case NVPTX::SULD_3D_V4I8_TRAP:
-  case NVPTX::SULD_3D_V4I16_TRAP:
-  case NVPTX::SULD_3D_V4I32_TRAP: {
-    // This is a V4 surface load, so operand 4 is a surfref
-    if (OpNo == 4) {
-      lowerImageHandleSymbol(MO.getImm(), MCOp);
-      return true;
-    }
-
-    return false;
-  }
-  case NVPTX::SUST_B_1D_B8_TRAP:
-  case NVPTX::SUST_B_1D_B16_TRAP:
-  case NVPTX::SUST_B_1D_B32_TRAP:
-  case NVPTX::SUST_B_1D_V2B8_TRAP:
-  case NVPTX::SUST_B_1D_V2B16_TRAP:
-  case NVPTX::SUST_B_1D_V2B32_TRAP:
-  case NVPTX::SUST_B_1D_V4B8_TRAP:
-  case NVPTX::SUST_B_1D_V4B16_TRAP:
-  case NVPTX::SUST_B_1D_V4B32_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_B8_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_B16_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_B32_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP:
-  case NVPTX::SUST_B_2D_B8_TRAP:
-  case NVPTX::SUST_B_2D_B16_TRAP:
-  case NVPTX::SUST_B_2D_B32_TRAP:
-  case NVPTX::SUST_B_2D_V2B8_TRAP:
-  case NVPTX::SUST_B_2D_V2B16_TRAP:
-  case NVPTX::SUST_B_2D_V2B32_TRAP:
-  case NVPTX::SUST_B_2D_V4B8_TRAP:
-  case NVPTX::SUST_B_2D_V4B16_TRAP:
-  case NVPTX::SUST_B_2D_V4B32_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_B8_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_B16_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_B32_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP:
-  case NVPTX::SUST_B_3D_B8_TRAP:
-  case NVPTX::SUST_B_3D_B16_TRAP:
-  case NVPTX::SUST_B_3D_B32_TRAP:
-  case NVPTX::SUST_B_3D_V2B8_TRAP:
-  case NVPTX::SUST_B_3D_V2B16_TRAP:
-  case NVPTX::SUST_B_3D_V2B32_TRAP:
-  case NVPTX::SUST_B_3D_V4B8_TRAP:
-  case NVPTX::SUST_B_3D_V4B16_TRAP:
-  case NVPTX::SUST_B_3D_V4B32_TRAP:
-  case NVPTX::SUST_P_1D_B8_TRAP:
-  case NVPTX::SUST_P_1D_B16_TRAP:
-  case NVPTX::SUST_P_1D_B32_TRAP:
-  case NVPTX::SUST_P_1D_V2B8_TRAP:
-  case NVPTX::SUST_P_1D_V2B16_TRAP:
-  case NVPTX::SUST_P_1D_V2B32_TRAP:
-  case NVPTX::SUST_P_1D_V4B8_TRAP:
-  case NVPTX::SUST_P_1D_V4B16_TRAP:
-  case NVPTX::SUST_P_1D_V4B32_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_B8_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_B16_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_B32_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP:
-  case NVPTX::SUST_P_2D_B8_TRAP:
-  case NVPTX::SUST_P_2D_B16_TRAP:
-  case NVPTX::SUST_P_2D_B32_TRAP:
-  case NVPTX::SUST_P_2D_V2B8_TRAP:
-  case NVPTX::SUST_P_2D_V2B16_TRAP:
-  case NVPTX::SUST_P_2D_V2B32_TRAP:
-  case NVPTX::SUST_P_2D_V4B8_TRAP:
-  case NVPTX::SUST_P_2D_V4B16_TRAP:
-  case NVPTX::SUST_P_2D_V4B32_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_B8_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_B16_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_B32_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP:
-  case NVPTX::SUST_P_3D_B8_TRAP:
-  case NVPTX::SUST_P_3D_B16_TRAP:
-  case NVPTX::SUST_P_3D_B32_TRAP:
-  case NVPTX::SUST_P_3D_V2B8_TRAP:
-  case NVPTX::SUST_P_3D_V2B16_TRAP:
-  case NVPTX::SUST_P_3D_V2B32_TRAP:
-  case NVPTX::SUST_P_3D_V4B8_TRAP:
-  case NVPTX::SUST_P_3D_V4B16_TRAP:
-  case NVPTX::SUST_P_3D_V4B32_TRAP: {
+  } else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
     // This is a surface store, so operand 0 is a surfref
-    if (OpNo == 0) {
+    if (OpNo == 0 && MO.isImm()) {
       lowerImageHandleSymbol(MO.getImm(), MCOp);
       return true;
     }
 
     return false;
-  }
-  case NVPTX::TXQ_CHANNEL_ORDER:
-  case NVPTX::TXQ_CHANNEL_DATA_TYPE:
-  case NVPTX::TXQ_WIDTH:
-  case NVPTX::TXQ_HEIGHT:
-  case NVPTX::TXQ_DEPTH:
-  case NVPTX::TXQ_ARRAY_SIZE:
-  case NVPTX::TXQ_NUM_SAMPLES:
-  case NVPTX::TXQ_NUM_MIPMAP_LEVELS:
-  case NVPTX::SUQ_CHANNEL_ORDER:
-  case NVPTX::SUQ_CHANNEL_DATA_TYPE:
-  case NVPTX::SUQ_WIDTH:
-  case NVPTX::SUQ_HEIGHT:
-  case NVPTX::SUQ_DEPTH:
-  case NVPTX::SUQ_ARRAY_SIZE: {
+  } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
     // This is a query, so operand 1 is a surfref/texref
-    if (OpNo == 1) {
+    if (OpNo == 1 && MO.isImm()) {
       lowerImageHandleSymbol(MO.getImm(), MCOp);
       return true;
     }
 
     return false;
   }
-  }
+
+  return false;
 }
 
 void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
@@ -704,8 +500,8 @@
 }
 
 void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
-  const DataLayout *TD = TM.getDataLayout();
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
+  const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
 
   Type *Ty = F->getReturnType();
 
@@ -828,13 +624,14 @@
 
 void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
   unsigned RegNo = MI->getOperand(0).getReg();
-  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
   if (TRI->isVirtualRegister(RegNo)) {
     OutStreamer.AddComment(Twine("implicit-def: ") +
                            getVirtualRegisterName(RegNo));
   } else {
-    OutStreamer.AddComment(Twine("implicit-def: ") +
-                           TM.getRegisterInfo()->getName(RegNo));
+    OutStreamer.AddComment(
+        Twine("implicit-def: ") +
+        TM.getSubtargetImpl()->getRegisterInfo()->getName(RegNo));
   }
   OutStreamer.AddBlankLine();
 }
@@ -1155,7 +952,7 @@
   const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
       .Initialize(OutContext, TM);
 
-  Mang = new Mangler(TM.getDataLayout());
+  Mang = new Mangler(TM.getSubtargetImpl()->getDataLayout());
 
   // Emit header before any dwarf directives are emitted below.
   emitHeader(M, OS1);
@@ -1356,7 +1153,7 @@
       GVar->getName().startswith("nvvm."))
     return;
 
-  const DataLayout *TD = TM.getDataLayout();
+  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
 
   // GlobalVariables are always constant pointers themselves.
   const PointerType *PTy = GVar->getType();
@@ -1659,7 +1456,7 @@
 void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
                                             raw_ostream &O) {
 
-  const DataLayout *TD = TM.getDataLayout();
+  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
 
   // GlobalVariables are always constant pointers themselves.
   const PointerType *PTy = GVar->getType();
@@ -1780,9 +1577,9 @@
 }
 
 void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
-  const DataLayout *TD = TM.getDataLayout();
+  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
   const AttributeSet &PAL = F->getAttributes();
-  const TargetLowering *TLI = TM.getTargetLowering();
+  const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
   Function::const_arg_iterator I, E;
   unsigned paramIndex = 0;
   bool first = true;
@@ -1973,7 +1770,7 @@
 
   // Map the global virtual register number to a register class specific
   // virtual register number starting from 1 with that class.
-  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   //unsigned numRegClasses = TRI->getNumRegClasses();
 
   // Emit the Fake Stack Object
@@ -2010,9 +1807,9 @@
   // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
   // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
   // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
-  // O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s64 %rd<" << NVPTXNumRegisters << ">;\n";
   // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
-  // O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .f64 %fd<" << NVPTXNumRegisters << ">;\n";
 
   // Emit declaration of the virtual registers or 'physical' registers for
   // each register class
@@ -2113,7 +1910,7 @@
 void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
                                    AggBuffer *aggBuffer) {
 
-  const DataLayout *TD = TM.getDataLayout();
+  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
 
   if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
     int s = TD->getTypeAllocSize(CPV->getType());
@@ -2237,7 +2034,7 @@
 
 void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
                                               AggBuffer *aggBuffer) {
-  const DataLayout *TD = TM.getDataLayout();
+  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
   int Bytes;
 
   // Old constants

diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index a9f9bdd..83fa5d3 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTXASMPRINTER_H
-#define NVPTXASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXASMPRINTER_H
 
 #include "NVPTX.h"
 #include "NVPTXSubtarget.h"
@@ -86,13 +86,13 @@
     // Once we have this AggBuffer setup, we can choose how to print
     // it out.
   public:
-    unsigned size;         // size of the buffer in bytes
-    unsigned char *buffer; // the buffer
     unsigned numSymbols;   // number of symbol addresses
-    SmallVector<unsigned, 4> symbolPosInBuffer;
-    SmallVector<const Value *, 4> Symbols;
 
   private:
+    const unsigned size;   // size of the buffer in bytes
+    std::vector<unsigned char> buffer; // the buffer
+    SmallVector<unsigned, 4> symbolPosInBuffer;
+    SmallVector<const Value *, 4> Symbols;
     unsigned curpos;
     raw_ostream &O;
     NVPTXAsmPrinter &AP;
@@ -100,14 +100,11 @@
 
   public:
     AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP)
-        : O(_O), AP(_AP) {
-      buffer = new unsigned char[_size];
-      size = _size;
+        : size(_size), buffer(_size), O(_O), AP(_AP) {
       curpos = 0;
       numSymbols = 0;
       EmitGeneric = AP.EmitGeneric;
     }
-    ~AggBuffer() { delete[] buffer; }
     unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
       assert((curpos + Num) <= size);
       assert((curpos + Bytes) <= size);
@@ -179,9 +176,9 @@
             else
               nextSymbolPos = symbolPosInBuffer[nSym];
           } else if (nBytes == 4)
-            O << *(unsigned int *)(buffer + pos);
+            O << *(unsigned int *)(&buffer[pos]);
           else
-            O << *(unsigned long long *)(buffer + pos);
+            O << *(unsigned long long *)(&buffer[pos]);
         }
       }
     }

diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 8b088412..314df38 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp

@@ -48,20 +48,20 @@
     if (is64bit) {
       unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
       MachineInstr *MI =
-          BuildMI(MBB, MBBI, dl,
-                  MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes_64),
+          BuildMI(MBB, MBBI, dl, MF.getSubtarget().getInstrInfo()->get(
+                                     NVPTX::cvta_local_yes_64),
                   NVPTX::VRFrame).addReg(LocalReg);
       BuildMI(MBB, MI, dl,
-              MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
+              MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
               LocalReg).addImm(MF.getFunctionNumber());
     } else {
       unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
       MachineInstr *MI =
           BuildMI(MBB, MBBI, dl,
-                  MF.getTarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
+                  MF.getSubtarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
                   NVPTX::VRFrame).addReg(LocalReg);
       BuildMI(MBB, MI, dl,
-              MF.getTarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
+              MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
               LocalReg).addImm(MF.getFunctionNumber());
     }
   }

diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 56fb673..0846b78 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTX_FRAMELOWERING_H
-#define NVPTX_FRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXFRAMELOWERING_H
 
 #include "llvm/Target/TargetFrameLowering.h"
 

diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index faa9fdb..58fa95b 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp

@@ -140,20 +140,23 @@
   for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) {
     GlobalVariable *GV = I->first;
     GlobalVariable *NewGV = I->second;
-    ++I;
+
+    // Remove GV from the map so that it can be RAUWed.  Note that
+    // DenseMap::erase() won't invalidate any iterators but this one.
+    auto Next = std::next(I);
+    GVMap.erase(I);
+    I = Next;
+
     Constant *BitCastNewGV = ConstantExpr::getPointerCast(NewGV, GV->getType());
     // At this point, the remaining uses of GV should be found only in global
     // variable initializers, as other uses have been already been removed
     // while walking through the instructions in function definitions.
-    for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end();
-         UI != UE;)
-      (UI++)->set(BitCastNewGV);
+    GV->replaceAllUsesWith(BitCastNewGV);
     std::string Name = GV->getName();
-    GV->removeDeadConstantUsers();
     GV->eraseFromParent();
     NewGV->setName(Name);
   }
-  GVMap.clear();
+  assert(GVMap.empty() && "Expected it to be empty by now");
 
   return true;
 }

diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 0dfbf10..cd0422d 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

@@ -24,19 +24,10 @@
 
 #define DEBUG_TYPE "nvptx-isel"
 
-unsigned FMAContractLevel = 0;
-
-static cl::opt<unsigned, true>
-FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
-                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
-                             " 1: do it  2: do it aggressively"),
-                    cl::location(FMAContractLevel),
-                    cl::init(2));
-
 static cl::opt<int> UsePrecDivF32(
     "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
     cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
-             " IEEE Compliant F32 div.rnd if avaiable."),
+             " IEEE Compliant F32 div.rnd if available."),
     cl::init(2));
 
 static cl::opt<bool>
@@ -61,16 +52,6 @@
                                      CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(tm, OptLevel),
       Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
-
-  doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
-  doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
-  doFMAF32AGG =
-      (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel == 2);
-  doFMAF64AGG =
-      (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
-
-  allowFMA = (FMAContractLevel >= 1);
-
   doMulWide = (OptLevel > 0);
 }
 
@@ -116,6 +97,11 @@
   }
 }
 
+bool NVPTXDAGToDAGISel::allowFMA() const {
+  const NVPTXTargetLowering *TL = Subtarget.getTargetLowering();
+  return TL->allowFMA(*MF, OptLevel);
+}
+
 /// Select - Select instructions not customized! Used for
 /// expanded, promoted and normal instructions.
 SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
@@ -170,93 +156,341 @@
   case ISD::INTRINSIC_W_CHAIN:
     ResNode = SelectIntrinsicChain(N);
     break;
-  case NVPTXISD::Tex1DFloatI32:
+  case NVPTXISD::Tex1DFloatS32:
   case NVPTXISD::Tex1DFloatFloat:
   case NVPTXISD::Tex1DFloatFloatLevel:
   case NVPTXISD::Tex1DFloatFloatGrad:
-  case NVPTXISD::Tex1DI32I32:
-  case NVPTXISD::Tex1DI32Float:
-  case NVPTXISD::Tex1DI32FloatLevel:
-  case NVPTXISD::Tex1DI32FloatGrad:
-  case NVPTXISD::Tex1DArrayFloatI32:
+  case NVPTXISD::Tex1DS32S32:
+  case NVPTXISD::Tex1DS32Float:
+  case NVPTXISD::Tex1DS32FloatLevel:
+  case NVPTXISD::Tex1DS32FloatGrad:
+  case NVPTXISD::Tex1DU32S32:
+  case NVPTXISD::Tex1DU32Float:
+  case NVPTXISD::Tex1DU32FloatLevel:
+  case NVPTXISD::Tex1DU32FloatGrad:
+  case NVPTXISD::Tex1DArrayFloatS32:
   case NVPTXISD::Tex1DArrayFloatFloat:
   case NVPTXISD::Tex1DArrayFloatFloatLevel:
   case NVPTXISD::Tex1DArrayFloatFloatGrad:
-  case NVPTXISD::Tex1DArrayI32I32:
-  case NVPTXISD::Tex1DArrayI32Float:
-  case NVPTXISD::Tex1DArrayI32FloatLevel:
-  case NVPTXISD::Tex1DArrayI32FloatGrad:
-  case NVPTXISD::Tex2DFloatI32:
+  case NVPTXISD::Tex1DArrayS32S32:
+  case NVPTXISD::Tex1DArrayS32Float:
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+  case NVPTXISD::Tex1DArrayU32S32:
+  case NVPTXISD::Tex1DArrayU32Float:
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+  case NVPTXISD::Tex2DFloatS32:
   case NVPTXISD::Tex2DFloatFloat:
   case NVPTXISD::Tex2DFloatFloatLevel:
   case NVPTXISD::Tex2DFloatFloatGrad:
-  case NVPTXISD::Tex2DI32I32:
-  case NVPTXISD::Tex2DI32Float:
-  case NVPTXISD::Tex2DI32FloatLevel:
-  case NVPTXISD::Tex2DI32FloatGrad:
-  case NVPTXISD::Tex2DArrayFloatI32:
+  case NVPTXISD::Tex2DS32S32:
+  case NVPTXISD::Tex2DS32Float:
+  case NVPTXISD::Tex2DS32FloatLevel:
+  case NVPTXISD::Tex2DS32FloatGrad:
+  case NVPTXISD::Tex2DU32S32:
+  case NVPTXISD::Tex2DU32Float:
+  case NVPTXISD::Tex2DU32FloatLevel:
+  case NVPTXISD::Tex2DU32FloatGrad:
+  case NVPTXISD::Tex2DArrayFloatS32:
   case NVPTXISD::Tex2DArrayFloatFloat:
   case NVPTXISD::Tex2DArrayFloatFloatLevel:
   case NVPTXISD::Tex2DArrayFloatFloatGrad:
-  case NVPTXISD::Tex2DArrayI32I32:
-  case NVPTXISD::Tex2DArrayI32Float:
-  case NVPTXISD::Tex2DArrayI32FloatLevel:
-  case NVPTXISD::Tex2DArrayI32FloatGrad:
-  case NVPTXISD::Tex3DFloatI32:
+  case NVPTXISD::Tex2DArrayS32S32:
+  case NVPTXISD::Tex2DArrayS32Float:
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+  case NVPTXISD::Tex2DArrayU32S32:
+  case NVPTXISD::Tex2DArrayU32Float:
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+  case NVPTXISD::Tex3DFloatS32:
   case NVPTXISD::Tex3DFloatFloat:
   case NVPTXISD::Tex3DFloatFloatLevel:
   case NVPTXISD::Tex3DFloatFloatGrad:
-  case NVPTXISD::Tex3DI32I32:
-  case NVPTXISD::Tex3DI32Float:
-  case NVPTXISD::Tex3DI32FloatLevel:
-  case NVPTXISD::Tex3DI32FloatGrad:
+  case NVPTXISD::Tex3DS32S32:
+  case NVPTXISD::Tex3DS32Float:
+  case NVPTXISD::Tex3DS32FloatLevel:
+  case NVPTXISD::Tex3DS32FloatGrad:
+  case NVPTXISD::Tex3DU32S32:
+  case NVPTXISD::Tex3DU32Float:
+  case NVPTXISD::Tex3DU32FloatLevel:
+  case NVPTXISD::Tex3DU32FloatGrad:
+  case NVPTXISD::TexCubeFloatFloat:
+  case NVPTXISD::TexCubeFloatFloatLevel:
+  case NVPTXISD::TexCubeS32Float:
+  case NVPTXISD::TexCubeS32FloatLevel:
+  case NVPTXISD::TexCubeU32Float:
+  case NVPTXISD::TexCubeU32FloatLevel:
+  case NVPTXISD::TexCubeArrayFloatFloat:
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+  case NVPTXISD::TexCubeArrayS32Float:
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+  case NVPTXISD::TexCubeArrayU32Float:
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+  case NVPTXISD::Tld4R2DFloatFloat:
+  case NVPTXISD::Tld4G2DFloatFloat:
+  case NVPTXISD::Tld4B2DFloatFloat:
+  case NVPTXISD::Tld4A2DFloatFloat:
+  case NVPTXISD::Tld4R2DS64Float:
+  case NVPTXISD::Tld4G2DS64Float:
+  case NVPTXISD::Tld4B2DS64Float:
+  case NVPTXISD::Tld4A2DS64Float:
+  case NVPTXISD::Tld4R2DU64Float:
+  case NVPTXISD::Tld4G2DU64Float:
+  case NVPTXISD::Tld4B2DU64Float:
+  case NVPTXISD::Tld4A2DU64Float:
+  case NVPTXISD::TexUnified1DFloatS32:
+  case NVPTXISD::TexUnified1DFloatFloat:
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+  case NVPTXISD::TexUnified1DS32S32:
+  case NVPTXISD::TexUnified1DS32Float:
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+  case NVPTXISD::TexUnified1DU32S32:
+  case NVPTXISD::TexUnified1DU32Float:
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+  case NVPTXISD::TexUnified1DArrayS32S32:
+  case NVPTXISD::TexUnified1DArrayS32Float:
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+  case NVPTXISD::TexUnified1DArrayU32S32:
+  case NVPTXISD::TexUnified1DArrayU32Float:
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+  case NVPTXISD::TexUnified2DFloatS32:
+  case NVPTXISD::TexUnified2DFloatFloat:
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+  case NVPTXISD::TexUnified2DS32S32:
+  case NVPTXISD::TexUnified2DS32Float:
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+  case NVPTXISD::TexUnified2DU32S32:
+  case NVPTXISD::TexUnified2DU32Float:
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+  case NVPTXISD::TexUnified2DArrayS32S32:
+  case NVPTXISD::TexUnified2DArrayS32Float:
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+  case NVPTXISD::TexUnified2DArrayU32S32:
+  case NVPTXISD::TexUnified2DArrayU32Float:
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+  case NVPTXISD::TexUnified3DFloatS32:
+  case NVPTXISD::TexUnified3DFloatFloat:
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+  case NVPTXISD::TexUnified3DS32S32:
+  case NVPTXISD::TexUnified3DS32Float:
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+  case NVPTXISD::TexUnified3DU32S32:
+  case NVPTXISD::TexUnified3DU32Float:
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+  case NVPTXISD::TexUnifiedCubeS32Float:
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+  case NVPTXISD::TexUnifiedCubeU32Float:
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
     ResNode = SelectTextureIntrinsic(N);
     break;
+  case NVPTXISD::Suld1DI8Clamp:
+  case NVPTXISD::Suld1DI16Clamp:
+  case NVPTXISD::Suld1DI32Clamp:
+  case NVPTXISD::Suld1DI64Clamp:
+  case NVPTXISD::Suld1DV2I8Clamp:
+  case NVPTXISD::Suld1DV2I16Clamp:
+  case NVPTXISD::Suld1DV2I32Clamp:
+  case NVPTXISD::Suld1DV2I64Clamp:
+  case NVPTXISD::Suld1DV4I8Clamp:
+  case NVPTXISD::Suld1DV4I16Clamp:
+  case NVPTXISD::Suld1DV4I32Clamp:
+  case NVPTXISD::Suld1DArrayI8Clamp:
+  case NVPTXISD::Suld1DArrayI16Clamp:
+  case NVPTXISD::Suld1DArrayI32Clamp:
+  case NVPTXISD::Suld1DArrayI64Clamp:
+  case NVPTXISD::Suld1DArrayV2I8Clamp:
+  case NVPTXISD::Suld1DArrayV2I16Clamp:
+  case NVPTXISD::Suld1DArrayV2I32Clamp:
+  case NVPTXISD::Suld1DArrayV2I64Clamp:
+  case NVPTXISD::Suld1DArrayV4I8Clamp:
+  case NVPTXISD::Suld1DArrayV4I16Clamp:
+  case NVPTXISD::Suld1DArrayV4I32Clamp:
+  case NVPTXISD::Suld2DI8Clamp:
+  case NVPTXISD::Suld2DI16Clamp:
+  case NVPTXISD::Suld2DI32Clamp:
+  case NVPTXISD::Suld2DI64Clamp:
+  case NVPTXISD::Suld2DV2I8Clamp:
+  case NVPTXISD::Suld2DV2I16Clamp:
+  case NVPTXISD::Suld2DV2I32Clamp:
+  case NVPTXISD::Suld2DV2I64Clamp:
+  case NVPTXISD::Suld2DV4I8Clamp:
+  case NVPTXISD::Suld2DV4I16Clamp:
+  case NVPTXISD::Suld2DV4I32Clamp:
+  case NVPTXISD::Suld2DArrayI8Clamp:
+  case NVPTXISD::Suld2DArrayI16Clamp:
+  case NVPTXISD::Suld2DArrayI32Clamp:
+  case NVPTXISD::Suld2DArrayI64Clamp:
+  case NVPTXISD::Suld2DArrayV2I8Clamp:
+  case NVPTXISD::Suld2DArrayV2I16Clamp:
+  case NVPTXISD::Suld2DArrayV2I32Clamp:
+  case NVPTXISD::Suld2DArrayV2I64Clamp:
+  case NVPTXISD::Suld2DArrayV4I8Clamp:
+  case NVPTXISD::Suld2DArrayV4I16Clamp:
+  case NVPTXISD::Suld2DArrayV4I32Clamp:
+  case NVPTXISD::Suld3DI8Clamp:
+  case NVPTXISD::Suld3DI16Clamp:
+  case NVPTXISD::Suld3DI32Clamp:
+  case NVPTXISD::Suld3DI64Clamp:
+  case NVPTXISD::Suld3DV2I8Clamp:
+  case NVPTXISD::Suld3DV2I16Clamp:
+  case NVPTXISD::Suld3DV2I32Clamp:
+  case NVPTXISD::Suld3DV2I64Clamp:
+  case NVPTXISD::Suld3DV4I8Clamp:
+  case NVPTXISD::Suld3DV4I16Clamp:
+  case NVPTXISD::Suld3DV4I32Clamp:
   case NVPTXISD::Suld1DI8Trap:
   case NVPTXISD::Suld1DI16Trap:
   case NVPTXISD::Suld1DI32Trap:
+  case NVPTXISD::Suld1DI64Trap:
   case NVPTXISD::Suld1DV2I8Trap:
   case NVPTXISD::Suld1DV2I16Trap:
   case NVPTXISD::Suld1DV2I32Trap:
+  case NVPTXISD::Suld1DV2I64Trap:
   case NVPTXISD::Suld1DV4I8Trap:
   case NVPTXISD::Suld1DV4I16Trap:
   case NVPTXISD::Suld1DV4I32Trap:
   case NVPTXISD::Suld1DArrayI8Trap:
   case NVPTXISD::Suld1DArrayI16Trap:
   case NVPTXISD::Suld1DArrayI32Trap:
+  case NVPTXISD::Suld1DArrayI64Trap:
   case NVPTXISD::Suld1DArrayV2I8Trap:
   case NVPTXISD::Suld1DArrayV2I16Trap:
   case NVPTXISD::Suld1DArrayV2I32Trap:
+  case NVPTXISD::Suld1DArrayV2I64Trap:
   case NVPTXISD::Suld1DArrayV4I8Trap:
   case NVPTXISD::Suld1DArrayV4I16Trap:
   case NVPTXISD::Suld1DArrayV4I32Trap:
   case NVPTXISD::Suld2DI8Trap:
   case NVPTXISD::Suld2DI16Trap:
   case NVPTXISD::Suld2DI32Trap:
+  case NVPTXISD::Suld2DI64Trap:
   case NVPTXISD::Suld2DV2I8Trap:
   case NVPTXISD::Suld2DV2I16Trap:
   case NVPTXISD::Suld2DV2I32Trap:
+  case NVPTXISD::Suld2DV2I64Trap:
   case NVPTXISD::Suld2DV4I8Trap:
   case NVPTXISD::Suld2DV4I16Trap:
   case NVPTXISD::Suld2DV4I32Trap:
   case NVPTXISD::Suld2DArrayI8Trap:
   case NVPTXISD::Suld2DArrayI16Trap:
   case NVPTXISD::Suld2DArrayI32Trap:
+  case NVPTXISD::Suld2DArrayI64Trap:
   case NVPTXISD::Suld2DArrayV2I8Trap:
   case NVPTXISD::Suld2DArrayV2I16Trap:
   case NVPTXISD::Suld2DArrayV2I32Trap:
+  case NVPTXISD::Suld2DArrayV2I64Trap:
   case NVPTXISD::Suld2DArrayV4I8Trap:
   case NVPTXISD::Suld2DArrayV4I16Trap:
   case NVPTXISD::Suld2DArrayV4I32Trap:
   case NVPTXISD::Suld3DI8Trap:
   case NVPTXISD::Suld3DI16Trap:
   case NVPTXISD::Suld3DI32Trap:
+  case NVPTXISD::Suld3DI64Trap:
   case NVPTXISD::Suld3DV2I8Trap:
   case NVPTXISD::Suld3DV2I16Trap:
   case NVPTXISD::Suld3DV2I32Trap:
+  case NVPTXISD::Suld3DV2I64Trap:
   case NVPTXISD::Suld3DV4I8Trap:
   case NVPTXISD::Suld3DV4I16Trap:
   case NVPTXISD::Suld3DV4I32Trap:
+  case NVPTXISD::Suld1DI8Zero:
+  case NVPTXISD::Suld1DI16Zero:
+  case NVPTXISD::Suld1DI32Zero:
+  case NVPTXISD::Suld1DI64Zero:
+  case NVPTXISD::Suld1DV2I8Zero:
+  case NVPTXISD::Suld1DV2I16Zero:
+  case NVPTXISD::Suld1DV2I32Zero:
+  case NVPTXISD::Suld1DV2I64Zero:
+  case NVPTXISD::Suld1DV4I8Zero:
+  case NVPTXISD::Suld1DV4I16Zero:
+  case NVPTXISD::Suld1DV4I32Zero:
+  case NVPTXISD::Suld1DArrayI8Zero:
+  case NVPTXISD::Suld1DArrayI16Zero:
+  case NVPTXISD::Suld1DArrayI32Zero:
+  case NVPTXISD::Suld1DArrayI64Zero:
+  case NVPTXISD::Suld1DArrayV2I8Zero:
+  case NVPTXISD::Suld1DArrayV2I16Zero:
+  case NVPTXISD::Suld1DArrayV2I32Zero:
+  case NVPTXISD::Suld1DArrayV2I64Zero:
+  case NVPTXISD::Suld1DArrayV4I8Zero:
+  case NVPTXISD::Suld1DArrayV4I16Zero:
+  case NVPTXISD::Suld1DArrayV4I32Zero:
+  case NVPTXISD::Suld2DI8Zero:
+  case NVPTXISD::Suld2DI16Zero:
+  case NVPTXISD::Suld2DI32Zero:
+  case NVPTXISD::Suld2DI64Zero:
+  case NVPTXISD::Suld2DV2I8Zero:
+  case NVPTXISD::Suld2DV2I16Zero:
+  case NVPTXISD::Suld2DV2I32Zero:
+  case NVPTXISD::Suld2DV2I64Zero:
+  case NVPTXISD::Suld2DV4I8Zero:
+  case NVPTXISD::Suld2DV4I16Zero:
+  case NVPTXISD::Suld2DV4I32Zero:
+  case NVPTXISD::Suld2DArrayI8Zero:
+  case NVPTXISD::Suld2DArrayI16Zero:
+  case NVPTXISD::Suld2DArrayI32Zero:
+  case NVPTXISD::Suld2DArrayI64Zero:
+  case NVPTXISD::Suld2DArrayV2I8Zero:
+  case NVPTXISD::Suld2DArrayV2I16Zero:
+  case NVPTXISD::Suld2DArrayV2I32Zero:
+  case NVPTXISD::Suld2DArrayV2I64Zero:
+  case NVPTXISD::Suld2DArrayV4I8Zero:
+  case NVPTXISD::Suld2DArrayV4I16Zero:
+  case NVPTXISD::Suld2DArrayV4I32Zero:
+  case NVPTXISD::Suld3DI8Zero:
+  case NVPTXISD::Suld3DI16Zero:
+  case NVPTXISD::Suld3DI32Zero:
+  case NVPTXISD::Suld3DI64Zero:
+  case NVPTXISD::Suld3DV2I8Zero:
+  case NVPTXISD::Suld3DV2I16Zero:
+  case NVPTXISD::Suld3DV2I32Zero:
+  case NVPTXISD::Suld3DV2I64Zero:
+  case NVPTXISD::Suld3DV4I8Zero:
+  case NVPTXISD::Suld3DV4I16Zero:
+  case NVPTXISD::Suld3DV4I32Zero:
     ResNode = SelectSurfaceIntrinsic(N);
     break;
   case ISD::AND:
@@ -2781,16 +3015,14 @@
 
 SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) {
   SDValue Chain = N->getOperand(0);
-  SDValue TexRef = N->getOperand(1);
-  SDValue SampRef = N->getOperand(2);
   SDNode *Ret = nullptr;
   unsigned Opc = 0;
   SmallVector<SDValue, 8> Ops;
 
   switch (N->getOpcode()) {
   default: return nullptr;
-  case NVPTXISD::Tex1DFloatI32:
-    Opc = NVPTX::TEX_1D_F32_I32;
+  case NVPTXISD::Tex1DFloatS32:
+    Opc = NVPTX::TEX_1D_F32_S32;
     break;
   case NVPTXISD::Tex1DFloatFloat:
     Opc = NVPTX::TEX_1D_F32_F32;
@@ -2801,20 +3033,32 @@
   case NVPTXISD::Tex1DFloatFloatGrad:
     Opc = NVPTX::TEX_1D_F32_F32_GRAD;
     break;
-  case NVPTXISD::Tex1DI32I32:
-    Opc = NVPTX::TEX_1D_I32_I32;
+  case NVPTXISD::Tex1DS32S32:
+    Opc = NVPTX::TEX_1D_S32_S32;
     break;
-  case NVPTXISD::Tex1DI32Float:
-    Opc = NVPTX::TEX_1D_I32_F32;
+  case NVPTXISD::Tex1DS32Float:
+    Opc = NVPTX::TEX_1D_S32_F32;
     break;
-  case NVPTXISD::Tex1DI32FloatLevel:
-    Opc = NVPTX::TEX_1D_I32_F32_LEVEL;
+  case NVPTXISD::Tex1DS32FloatLevel:
+    Opc = NVPTX::TEX_1D_S32_F32_LEVEL;
     break;
-  case NVPTXISD::Tex1DI32FloatGrad:
-    Opc = NVPTX::TEX_1D_I32_F32_GRAD;
+  case NVPTXISD::Tex1DS32FloatGrad:
+    Opc = NVPTX::TEX_1D_S32_F32_GRAD;
     break;
-  case NVPTXISD::Tex1DArrayFloatI32:
-    Opc = NVPTX::TEX_1D_ARRAY_F32_I32;
+  case NVPTXISD::Tex1DU32S32:
+    Opc = NVPTX::TEX_1D_U32_S32;
+    break;
+  case NVPTXISD::Tex1DU32Float:
+    Opc = NVPTX::TEX_1D_U32_F32;
+    break;
+  case NVPTXISD::Tex1DU32FloatLevel:
+    Opc = NVPTX::TEX_1D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DU32FloatGrad:
+    Opc = NVPTX::TEX_1D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DArrayFloatS32:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_S32;
     break;
   case NVPTXISD::Tex1DArrayFloatFloat:
     Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
@@ -2825,20 +3069,32 @@
   case NVPTXISD::Tex1DArrayFloatFloatGrad:
     Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
     break;
-  case NVPTXISD::Tex1DArrayI32I32:
-    Opc = NVPTX::TEX_1D_ARRAY_I32_I32;
+  case NVPTXISD::Tex1DArrayS32S32:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_S32;
     break;
-  case NVPTXISD::Tex1DArrayI32Float:
-    Opc = NVPTX::TEX_1D_ARRAY_I32_F32;
+  case NVPTXISD::Tex1DArrayS32Float:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_F32;
     break;
-  case NVPTXISD::Tex1DArrayI32FloatLevel:
-    Opc = NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL;
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL;
     break;
-  case NVPTXISD::Tex1DArrayI32FloatGrad:
-    Opc = NVPTX::TEX_1D_ARRAY_I32_F32_GRAD;
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD;
     break;
-  case NVPTXISD::Tex2DFloatI32:
-    Opc = NVPTX::TEX_2D_F32_I32;
+  case NVPTXISD::Tex1DArrayU32S32:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::Tex1DArrayU32Float:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DFloatS32:
+    Opc = NVPTX::TEX_2D_F32_S32;
     break;
   case NVPTXISD::Tex2DFloatFloat:
     Opc = NVPTX::TEX_2D_F32_F32;
@@ -2849,20 +3105,32 @@
   case NVPTXISD::Tex2DFloatFloatGrad:
     Opc = NVPTX::TEX_2D_F32_F32_GRAD;
     break;
-  case NVPTXISD::Tex2DI32I32:
-    Opc = NVPTX::TEX_2D_I32_I32;
+  case NVPTXISD::Tex2DS32S32:
+    Opc = NVPTX::TEX_2D_S32_S32;
     break;
-  case NVPTXISD::Tex2DI32Float:
-    Opc = NVPTX::TEX_2D_I32_F32;
+  case NVPTXISD::Tex2DS32Float:
+    Opc = NVPTX::TEX_2D_S32_F32;
     break;
-  case NVPTXISD::Tex2DI32FloatLevel:
-    Opc = NVPTX::TEX_2D_I32_F32_LEVEL;
+  case NVPTXISD::Tex2DS32FloatLevel:
+    Opc = NVPTX::TEX_2D_S32_F32_LEVEL;
     break;
-  case NVPTXISD::Tex2DI32FloatGrad:
-    Opc = NVPTX::TEX_2D_I32_F32_GRAD;
+  case NVPTXISD::Tex2DS32FloatGrad:
+    Opc = NVPTX::TEX_2D_S32_F32_GRAD;
     break;
-  case NVPTXISD::Tex2DArrayFloatI32:
-    Opc = NVPTX::TEX_2D_ARRAY_F32_I32;
+  case NVPTXISD::Tex2DU32S32:
+    Opc = NVPTX::TEX_2D_U32_S32;
+    break;
+  case NVPTXISD::Tex2DU32Float:
+    Opc = NVPTX::TEX_2D_U32_F32;
+    break;
+  case NVPTXISD::Tex2DU32FloatLevel:
+    Opc = NVPTX::TEX_2D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DU32FloatGrad:
+    Opc = NVPTX::TEX_2D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DArrayFloatS32:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_S32;
     break;
   case NVPTXISD::Tex2DArrayFloatFloat:
     Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
@@ -2873,20 +3141,32 @@
   case NVPTXISD::Tex2DArrayFloatFloatGrad:
     Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
     break;
-  case NVPTXISD::Tex2DArrayI32I32:
-    Opc = NVPTX::TEX_2D_ARRAY_I32_I32;
+  case NVPTXISD::Tex2DArrayS32S32:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_S32;
     break;
-  case NVPTXISD::Tex2DArrayI32Float:
-    Opc = NVPTX::TEX_2D_ARRAY_I32_F32;
+  case NVPTXISD::Tex2DArrayS32Float:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_F32;
     break;
-  case NVPTXISD::Tex2DArrayI32FloatLevel:
-    Opc = NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL;
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL;
     break;
-  case NVPTXISD::Tex2DArrayI32FloatGrad:
-    Opc = NVPTX::TEX_2D_ARRAY_I32_F32_GRAD;
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD;
     break;
-  case NVPTXISD::Tex3DFloatI32:
-    Opc = NVPTX::TEX_3D_F32_I32;
+  case NVPTXISD::Tex2DArrayU32S32:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::Tex2DArrayU32Float:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DFloatS32:
+    Opc = NVPTX::TEX_3D_F32_S32;
     break;
   case NVPTXISD::Tex3DFloatFloat:
     Opc = NVPTX::TEX_3D_F32_F32;
@@ -2897,25 +3177,358 @@
   case NVPTXISD::Tex3DFloatFloatGrad:
     Opc = NVPTX::TEX_3D_F32_F32_GRAD;
     break;
-  case NVPTXISD::Tex3DI32I32:
-    Opc = NVPTX::TEX_3D_I32_I32;
+  case NVPTXISD::Tex3DS32S32:
+    Opc = NVPTX::TEX_3D_S32_S32;
     break;
-  case NVPTXISD::Tex3DI32Float:
-    Opc = NVPTX::TEX_3D_I32_F32;
+  case NVPTXISD::Tex3DS32Float:
+    Opc = NVPTX::TEX_3D_S32_F32;
     break;
-  case NVPTXISD::Tex3DI32FloatLevel:
-    Opc = NVPTX::TEX_3D_I32_F32_LEVEL;
+  case NVPTXISD::Tex3DS32FloatLevel:
+    Opc = NVPTX::TEX_3D_S32_F32_LEVEL;
     break;
-  case NVPTXISD::Tex3DI32FloatGrad:
-    Opc = NVPTX::TEX_3D_I32_F32_GRAD;
+  case NVPTXISD::Tex3DS32FloatGrad:
+    Opc = NVPTX::TEX_3D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DU32S32:
+    Opc = NVPTX::TEX_3D_U32_S32;
+    break;
+  case NVPTXISD::Tex3DU32Float:
+    Opc = NVPTX::TEX_3D_U32_F32;
+    break;
+  case NVPTXISD::Tex3DU32FloatLevel:
+    Opc = NVPTX::TEX_3D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex3DU32FloatGrad:
+    Opc = NVPTX::TEX_3D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexCubeFloatFloat:
+    Opc = NVPTX::TEX_CUBE_F32_F32;
+    break;
+  case NVPTXISD::TexCubeFloatFloatLevel:
+    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeS32Float:
+    Opc = NVPTX::TEX_CUBE_S32_F32;
+    break;
+  case NVPTXISD::TexCubeS32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeU32Float:
+    Opc = NVPTX::TEX_CUBE_U32_F32;
+    break;
+  case NVPTXISD::TexCubeU32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeArrayFloatFloat:
+    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeArrayS32Float:
+    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexCubeArrayU32Float:
+    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tld4R2DFloatFloat:
+    Opc = NVPTX::TLD4_R_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4G2DFloatFloat:
+    Opc = NVPTX::TLD4_G_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4B2DFloatFloat:
+    Opc = NVPTX::TLD4_B_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4A2DFloatFloat:
+    Opc = NVPTX::TLD4_A_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4R2DS64Float:
+    Opc = NVPTX::TLD4_R_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4G2DS64Float:
+    Opc = NVPTX::TLD4_G_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4B2DS64Float:
+    Opc = NVPTX::TLD4_B_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4A2DS64Float:
+    Opc = NVPTX::TLD4_A_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4R2DU64Float:
+    Opc = NVPTX::TLD4_R_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4G2DU64Float:
+    Opc = NVPTX::TLD4_G_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4B2DU64Float:
+    Opc = NVPTX::TLD4_B_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4A2DU64Float:
+    Opc = NVPTX::TLD4_A_2D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified1DFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32;
+    break;
+  case NVPTXISD::TexUnified1DFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32;
+    break;
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DS32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32;
+    break;
+  case NVPTXISD::TexUnified1DS32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32;
+    break;
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DU32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32;
+    break;
+  case NVPTXISD::TexUnified1DU32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32S32:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32Float:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32;
+    break;
+  case NVPTXISD::TexUnified2DFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32;
+    break;
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DS32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32;
+    break;
+  case NVPTXISD::TexUnified2DS32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32;
+    break;
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DU32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32;
+    break;
+  case NVPTXISD::TexUnified2DU32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32S32:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32Float:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified3DFloatS32:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32;
+    break;
+  case NVPTXISD::TexUnified3DFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32;
+    break;
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified3DS32S32:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32;
+    break;
+  case NVPTXISD::TexUnified3DS32Float:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32;
+    break;
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnified3DU32S32:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32;
+    break;
+  case NVPTXISD::TexUnified3DU32Float:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32;
+    break;
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD;
+    break;
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeS32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeU32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32;
+    break;
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32;
+    break;
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
+    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32;
     break;
   }
 
-  Ops.push_back(TexRef);
-  Ops.push_back(SampRef);
-
-  // Copy over indices
-  for (unsigned i = 3; i < N->getNumOperands(); ++i) {
+  // Copy over operands
+  for (unsigned i = 1; i < N->getNumOperands(); ++i) {
     Ops.push_back(N->getOperand(i));
   }
 
@@ -2932,6 +3545,402 @@
   SmallVector<SDValue, 8> Ops;
   switch (N->getOpcode()) {
   default: return nullptr;
+  case NVPTXISD::Suld1DI8Clamp:
+    Opc = NVPTX::SULD_1D_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI16Clamp:
+    Opc = NVPTX::SULD_1D_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI32Clamp:
+    Opc = NVPTX::SULD_1D_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI64Clamp:
+    Opc = NVPTX::SULD_1D_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I8Clamp:
+    Opc = NVPTX::SULD_1D_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I16Clamp:
+    Opc = NVPTX::SULD_1D_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I32Clamp:
+    Opc = NVPTX::SULD_1D_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I64Clamp:
+    Opc = NVPTX::SULD_1D_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I8Clamp:
+    Opc = NVPTX::SULD_1D_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I16Clamp:
+    Opc = NVPTX::SULD_1D_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I32Clamp:
+    Opc = NVPTX::SULD_1D_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI8Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI16Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI32Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI64Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I8Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I16Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I32Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I64Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I8Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I16Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I32Clamp:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI8Clamp:
+    Opc = NVPTX::SULD_2D_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI16Clamp:
+    Opc = NVPTX::SULD_2D_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI32Clamp:
+    Opc = NVPTX::SULD_2D_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI64Clamp:
+    Opc = NVPTX::SULD_2D_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I8Clamp:
+    Opc = NVPTX::SULD_2D_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I16Clamp:
+    Opc = NVPTX::SULD_2D_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I32Clamp:
+    Opc = NVPTX::SULD_2D_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I64Clamp:
+    Opc = NVPTX::SULD_2D_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I8Clamp:
+    Opc = NVPTX::SULD_2D_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I16Clamp:
+    Opc = NVPTX::SULD_2D_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I32Clamp:
+    Opc = NVPTX::SULD_2D_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI8Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI16Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI32Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI64Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I8Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I16Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I32Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I64Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I8Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I16Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I32Clamp:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI8Clamp:
+    Opc = NVPTX::SULD_3D_I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI16Clamp:
+    Opc = NVPTX::SULD_3D_I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI32Clamp:
+    Opc = NVPTX::SULD_3D_I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI64Clamp:
+    Opc = NVPTX::SULD_3D_I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I8Clamp:
+    Opc = NVPTX::SULD_3D_V2I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I16Clamp:
+    Opc = NVPTX::SULD_3D_V2I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I32Clamp:
+    Opc = NVPTX::SULD_3D_V2I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I64Clamp:
+    Opc = NVPTX::SULD_3D_V2I64_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I8Clamp:
+    Opc = NVPTX::SULD_3D_V4I8_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I16Clamp:
+    Opc = NVPTX::SULD_3D_V4I16_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I32Clamp:
+    Opc = NVPTX::SULD_3D_V4I32_CLAMP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld1DI8Trap:
     Opc = NVPTX::SULD_1D_I8_TRAP;
     Ops.push_back(TexHandle);
@@ -2950,6 +3959,12 @@
     Ops.push_back(N->getOperand(2));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld1DI64Trap:
+    Opc = NVPTX::SULD_1D_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld1DV2I8Trap:
     Opc = NVPTX::SULD_1D_V2I8_TRAP;
     Ops.push_back(TexHandle);
@@ -2968,6 +3983,12 @@
     Ops.push_back(N->getOperand(2));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld1DV2I64Trap:
+    Opc = NVPTX::SULD_1D_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld1DV4I8Trap:
     Opc = NVPTX::SULD_1D_V4I8_TRAP;
     Ops.push_back(TexHandle);
@@ -3007,6 +4028,13 @@
     Ops.push_back(N->getOperand(3));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld1DArrayI64Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld1DArrayV2I8Trap:
     Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
     Ops.push_back(TexHandle);
@@ -3028,6 +4056,13 @@
     Ops.push_back(N->getOperand(3));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld1DArrayV2I64Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld1DArrayV4I8Trap:
     Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
     Ops.push_back(TexHandle);
@@ -3070,6 +4105,13 @@
     Ops.push_back(N->getOperand(3));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld2DI64Trap:
+    Opc = NVPTX::SULD_2D_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld2DV2I8Trap:
     Opc = NVPTX::SULD_2D_V2I8_TRAP;
     Ops.push_back(TexHandle);
@@ -3091,6 +4133,13 @@
     Ops.push_back(N->getOperand(3));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld2DV2I64Trap:
+    Opc = NVPTX::SULD_2D_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld2DV4I8Trap:
     Opc = NVPTX::SULD_2D_V4I8_TRAP;
     Ops.push_back(TexHandle);
@@ -3136,6 +4185,14 @@
     Ops.push_back(N->getOperand(4));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld2DArrayI64Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld2DArrayV2I8Trap:
     Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
     Ops.push_back(TexHandle);
@@ -3160,6 +4217,14 @@
     Ops.push_back(N->getOperand(4));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld2DArrayV2I64Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld2DArrayV4I8Trap:
     Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
     Ops.push_back(TexHandle);
@@ -3208,6 +4273,14 @@
     Ops.push_back(N->getOperand(4));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld3DI64Trap:
+    Opc = NVPTX::SULD_3D_I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld3DV2I8Trap:
     Opc = NVPTX::SULD_3D_V2I8_TRAP;
     Ops.push_back(TexHandle);
@@ -3232,6 +4305,14 @@
     Ops.push_back(N->getOperand(4));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld3DV2I64Trap:
+    Opc = NVPTX::SULD_3D_V2I64_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
   case NVPTXISD::Suld3DV4I8Trap:
     Opc = NVPTX::SULD_3D_V4I8_TRAP;
     Ops.push_back(TexHandle);
@@ -3256,11 +4337,408 @@
     Ops.push_back(N->getOperand(4));
     Ops.push_back(Chain);
     break;
+  case NVPTXISD::Suld1DI8Zero:
+    Opc = NVPTX::SULD_1D_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI16Zero:
+    Opc = NVPTX::SULD_1D_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI32Zero:
+    Opc = NVPTX::SULD_1D_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI64Zero:
+    Opc = NVPTX::SULD_1D_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I8Zero:
+    Opc = NVPTX::SULD_1D_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I16Zero:
+    Opc = NVPTX::SULD_1D_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I32Zero:
+    Opc = NVPTX::SULD_1D_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I64Zero:
+    Opc = NVPTX::SULD_1D_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I8Zero:
+    Opc = NVPTX::SULD_1D_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I16Zero:
+    Opc = NVPTX::SULD_1D_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I32Zero:
+    Opc = NVPTX::SULD_1D_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI8Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI16Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI32Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI64Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I8Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I16Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I32Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I64Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I8Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I16Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I32Zero:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI8Zero:
+    Opc = NVPTX::SULD_2D_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI16Zero:
+    Opc = NVPTX::SULD_2D_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI32Zero:
+    Opc = NVPTX::SULD_2D_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI64Zero:
+    Opc = NVPTX::SULD_2D_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I8Zero:
+    Opc = NVPTX::SULD_2D_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I16Zero:
+    Opc = NVPTX::SULD_2D_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I32Zero:
+    Opc = NVPTX::SULD_2D_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I64Zero:
+    Opc = NVPTX::SULD_2D_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I8Zero:
+    Opc = NVPTX::SULD_2D_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I16Zero:
+    Opc = NVPTX::SULD_2D_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I32Zero:
+    Opc = NVPTX::SULD_2D_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI8Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI16Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI32Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI64Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I8Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I16Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I32Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I64Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I8Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I16Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I32Zero:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI8Zero:
+    Opc = NVPTX::SULD_3D_I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI16Zero:
+    Opc = NVPTX::SULD_3D_I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI32Zero:
+    Opc = NVPTX::SULD_3D_I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI64Zero:
+    Opc = NVPTX::SULD_3D_I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I8Zero:
+    Opc = NVPTX::SULD_3D_V2I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I16Zero:
+    Opc = NVPTX::SULD_3D_V2I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I32Zero:
+    Opc = NVPTX::SULD_3D_V2I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I64Zero:
+    Opc = NVPTX::SULD_3D_V2I64_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I8Zero:
+    Opc = NVPTX::SULD_3D_V4I8_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I16Zero:
+    Opc = NVPTX::SULD_3D_V4I16_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I32Zero:
+    Opc = NVPTX::SULD_3D_V4I32_ZERO;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
   }
   Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
   return Ret;
 }
 
+
 /// SelectBFE - Look for instruction sequences that can be made more efficient
 /// by using the 'bfe' (bit-field extract) PTX instruction
 SDNode *NVPTXDAGToDAGISel::SelectBFE(SDNode *N) {
@@ -3563,17 +5041,10 @@
 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                  unsigned int spN) const {
   const Value *Src = nullptr;
-  // Even though MemIntrinsicSDNode is a subclas of MemSDNode,
-  // the classof() for MemSDNode does not include MemIntrinsicSDNode
-  // (See SelectionDAGNodes.h). So we need to check for both.
   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
     if (spN == 0 && mN->getMemOperand()->getPseudoValue())
       return true;
     Src = mN->getMemOperand()->getValue();
-  } else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) {
-    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
-      return true;
-    Src = mN->getMemOperand()->getValue();
   }
   if (!Src)
     return false;

diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index c44ccb2..69afcd7 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

@@ -11,6 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXISELDAGTODAG_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXISELDAGTODAG_H
+
 #include "NVPTX.h"
 #include "NVPTXISelLowering.h"
 #include "NVPTXRegisterInfo.h"
@@ -24,20 +27,13 @@
 
 class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
 
-  // If true, generate corresponding FPCONTRACT. This is
-  // language dependent (i.e. CUDA and OpenCL works differently).
-  bool doFMAF64;
-  bool doFMAF32;
-  bool doFMAF64AGG;
-  bool doFMAF32AGG;
-  bool allowFMA;
-
   // If true, generate mul.wide from sext and mul
   bool doMulWide;
 
   int getDivF32Level() const;
   bool usePrecSqrtF32() const;
   bool useF32FTZ() const;
+  bool allowFMA() const;
 
 public:
   explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
@@ -99,3 +95,5 @@
 
 };
 }
+
+#endif

diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index cb452ff..0b0b536 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp

@@ -48,6 +48,12 @@
     "nvptx-sched4reg",
     cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
 
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
+                             " 1: do it, 2: do it aggressively)"),
+                    cl::init(2)); // Defaults to level 2 (aggressive).
+
 static bool IsPTXVectorType(MVT VT) {
   switch (VT.SimpleTy) {
   default:
@@ -100,8 +106,8 @@
 }
 
 // NVPTXTargetLowering Constructor.
-NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
-    : TargetLowering(TM, new NVPTXTargetObjectFile()), nvTM(&TM),
+NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM)
+    : TargetLowering(TM), nvTM(&TM),
       nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
 
   // always lower memset, memcpy, and memmove intrinsics to load/store
@@ -197,8 +203,11 @@
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
   // Turn FP extload into load/fextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
   // Turn FP truncstore into trunc + store.
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
   // PTX does not support load / store predicate registers
@@ -360,73 +369,379 @@
     return "NVPTXISD::MUL_WIDE_SIGNED";
   case NVPTXISD::MUL_WIDE_UNSIGNED:
     return "NVPTXISD::MUL_WIDE_UNSIGNED";
-  case NVPTXISD::Tex1DFloatI32:        return "NVPTXISD::Tex1DFloatI32";
+  case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
   case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
   case NVPTXISD::Tex1DFloatFloatLevel:
     return "NVPTXISD::Tex1DFloatFloatLevel";
   case NVPTXISD::Tex1DFloatFloatGrad:
     return "NVPTXISD::Tex1DFloatFloatGrad";
-  case NVPTXISD::Tex1DI32I32:          return "NVPTXISD::Tex1DI32I32";
-  case NVPTXISD::Tex1DI32Float:        return "NVPTXISD::Tex1DI32Float";
-  case NVPTXISD::Tex1DI32FloatLevel:
-    return "NVPTXISD::Tex1DI32FloatLevel";
-  case NVPTXISD::Tex1DI32FloatGrad:
-    return "NVPTXISD::Tex1DI32FloatGrad";
-  case NVPTXISD::Tex1DArrayFloatI32:   return "NVPTXISD::Tex2DArrayFloatI32";
-  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+  case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
+  case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
+  case NVPTXISD::Tex1DS32FloatLevel:
+    return "NVPTXISD::Tex1DS32FloatLevel";
+  case NVPTXISD::Tex1DS32FloatGrad:
+    return "NVPTXISD::Tex1DS32FloatGrad";
+  case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
+  case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
+  case NVPTXISD::Tex1DU32FloatLevel:
+    return "NVPTXISD::Tex1DU32FloatLevel";
+  case NVPTXISD::Tex1DU32FloatGrad:
+    return "NVPTXISD::Tex1DU32FloatGrad";
+  case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
+  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
   case NVPTXISD::Tex1DArrayFloatFloatLevel:
-    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
   case NVPTXISD::Tex1DArrayFloatFloatGrad:
-    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
-  case NVPTXISD::Tex1DArrayI32I32:     return "NVPTXISD::Tex2DArrayI32I32";
-  case NVPTXISD::Tex1DArrayI32Float:   return "NVPTXISD::Tex2DArrayI32Float";
-  case NVPTXISD::Tex1DArrayI32FloatLevel:
-    return "NVPTXISD::Tex2DArrayI32FloatLevel";
-  case NVPTXISD::Tex1DArrayI32FloatGrad:
-    return "NVPTXISD::Tex2DArrayI32FloatGrad";
-  case NVPTXISD::Tex2DFloatI32:        return "NVPTXISD::Tex2DFloatI32";
+    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
+  case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
+  case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
+  case NVPTXISD::Tex1DArrayS32FloatLevel:
+    return "NVPTXISD::Tex1DArrayS32FloatLevel";
+  case NVPTXISD::Tex1DArrayS32FloatGrad:
+    return "NVPTXISD::Tex1DArrayS32FloatGrad";
+  case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
+  case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
+  case NVPTXISD::Tex1DArrayU32FloatLevel:
+    return "NVPTXISD::Tex1DArrayU32FloatLevel";
+  case NVPTXISD::Tex1DArrayU32FloatGrad:
+    return "NVPTXISD::Tex1DArrayU32FloatGrad";
+  case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
   case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
   case NVPTXISD::Tex2DFloatFloatLevel:
     return "NVPTXISD::Tex2DFloatFloatLevel";
   case NVPTXISD::Tex2DFloatFloatGrad:
     return "NVPTXISD::Tex2DFloatFloatGrad";
-  case NVPTXISD::Tex2DI32I32:          return "NVPTXISD::Tex2DI32I32";
-  case NVPTXISD::Tex2DI32Float:        return "NVPTXISD::Tex2DI32Float";
-  case NVPTXISD::Tex2DI32FloatLevel:
-    return "NVPTXISD::Tex2DI32FloatLevel";
-  case NVPTXISD::Tex2DI32FloatGrad:
-    return "NVPTXISD::Tex2DI32FloatGrad";
-  case NVPTXISD::Tex2DArrayFloatI32:   return "NVPTXISD::Tex2DArrayFloatI32";
+  case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
+  case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
+  case NVPTXISD::Tex2DS32FloatLevel:
+    return "NVPTXISD::Tex2DS32FloatLevel";
+  case NVPTXISD::Tex2DS32FloatGrad:
+    return "NVPTXISD::Tex2DS32FloatGrad";
+  case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
+  case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
+  case NVPTXISD::Tex2DU32FloatLevel:
+    return "NVPTXISD::Tex2DU32FloatLevel";
+  case NVPTXISD::Tex2DU32FloatGrad:
+    return "NVPTXISD::Tex2DU32FloatGrad";
+  case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
   case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
   case NVPTXISD::Tex2DArrayFloatFloatLevel:
     return "NVPTXISD::Tex2DArrayFloatFloatLevel";
   case NVPTXISD::Tex2DArrayFloatFloatGrad:
     return "NVPTXISD::Tex2DArrayFloatFloatGrad";
-  case NVPTXISD::Tex2DArrayI32I32:     return "NVPTXISD::Tex2DArrayI32I32";
-  case NVPTXISD::Tex2DArrayI32Float:   return "NVPTXISD::Tex2DArrayI32Float";
-  case NVPTXISD::Tex2DArrayI32FloatLevel:
-    return "NVPTXISD::Tex2DArrayI32FloatLevel";
-  case NVPTXISD::Tex2DArrayI32FloatGrad:
-    return "NVPTXISD::Tex2DArrayI32FloatGrad";
-  case NVPTXISD::Tex3DFloatI32:        return "NVPTXISD::Tex3DFloatI32";
+  case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
+  case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
+  case NVPTXISD::Tex2DArrayS32FloatLevel:
+    return "NVPTXISD::Tex2DArrayS32FloatLevel";
+  case NVPTXISD::Tex2DArrayS32FloatGrad:
+    return "NVPTXISD::Tex2DArrayS32FloatGrad";
+  case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
+  case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
+  case NVPTXISD::Tex2DArrayU32FloatLevel:
+    return "NVPTXISD::Tex2DArrayU32FloatLevel";
+  case NVPTXISD::Tex2DArrayU32FloatGrad:
+    return "NVPTXISD::Tex2DArrayU32FloatGrad";
+  case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
   case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
   case NVPTXISD::Tex3DFloatFloatLevel:
     return "NVPTXISD::Tex3DFloatFloatLevel";
   case NVPTXISD::Tex3DFloatFloatGrad:
     return "NVPTXISD::Tex3DFloatFloatGrad";
-  case NVPTXISD::Tex3DI32I32:          return "NVPTXISD::Tex3DI32I32";
-  case NVPTXISD::Tex3DI32Float:        return "NVPTXISD::Tex3DI32Float";
-  case NVPTXISD::Tex3DI32FloatLevel:
-    return "NVPTXISD::Tex3DI32FloatLevel";
-  case NVPTXISD::Tex3DI32FloatGrad:
-    return "NVPTXISD::Tex3DI32FloatGrad";
+  case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
+  case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
+  case NVPTXISD::Tex3DS32FloatLevel:
+    return "NVPTXISD::Tex3DS32FloatLevel";
+  case NVPTXISD::Tex3DS32FloatGrad:
+    return "NVPTXISD::Tex3DS32FloatGrad";
+  case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
+  case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
+  case NVPTXISD::Tex3DU32FloatLevel:
+    return "NVPTXISD::Tex3DU32FloatLevel";
+  case NVPTXISD::Tex3DU32FloatGrad:
+    return "NVPTXISD::Tex3DU32FloatGrad";
+  case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
+  case NVPTXISD::TexCubeFloatFloatLevel:
+    return "NVPTXISD::TexCubeFloatFloatLevel";
+  case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
+  case NVPTXISD::TexCubeS32FloatLevel:
+    return "NVPTXISD::TexCubeS32FloatLevel";
+  case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
+  case NVPTXISD::TexCubeU32FloatLevel:
+    return "NVPTXISD::TexCubeU32FloatLevel";
+  case NVPTXISD::TexCubeArrayFloatFloat:
+    return "NVPTXISD::TexCubeArrayFloatFloat";
+  case NVPTXISD::TexCubeArrayFloatFloatLevel:
+    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
+  case NVPTXISD::TexCubeArrayS32Float:
+    return "NVPTXISD::TexCubeArrayS32Float";
+  case NVPTXISD::TexCubeArrayS32FloatLevel:
+    return "NVPTXISD::TexCubeArrayS32FloatLevel";
+  case NVPTXISD::TexCubeArrayU32Float:
+    return "NVPTXISD::TexCubeArrayU32Float";
+  case NVPTXISD::TexCubeArrayU32FloatLevel:
+    return "NVPTXISD::TexCubeArrayU32FloatLevel";
+  case NVPTXISD::Tld4R2DFloatFloat:
+    return "NVPTXISD::Tld4R2DFloatFloat";
+  case NVPTXISD::Tld4G2DFloatFloat:
+    return "NVPTXISD::Tld4G2DFloatFloat";
+  case NVPTXISD::Tld4B2DFloatFloat:
+    return "NVPTXISD::Tld4B2DFloatFloat";
+  case NVPTXISD::Tld4A2DFloatFloat:
+    return "NVPTXISD::Tld4A2DFloatFloat";
+  case NVPTXISD::Tld4R2DS64Float:
+    return "NVPTXISD::Tld4R2DS64Float";
+  case NVPTXISD::Tld4G2DS64Float:
+    return "NVPTXISD::Tld4G2DS64Float";
+  case NVPTXISD::Tld4B2DS64Float:
+    return "NVPTXISD::Tld4B2DS64Float";
+  case NVPTXISD::Tld4A2DS64Float:
+    return "NVPTXISD::Tld4A2DS64Float";
+  case NVPTXISD::Tld4R2DU64Float:
+    return "NVPTXISD::Tld4R2DU64Float";
+  case NVPTXISD::Tld4G2DU64Float:
+    return "NVPTXISD::Tld4G2DU64Float";
+  case NVPTXISD::Tld4B2DU64Float:
+    return "NVPTXISD::Tld4B2DU64Float";
+  case NVPTXISD::Tld4A2DU64Float:
+    return "NVPTXISD::Tld4A2DU64Float";
+
+  case NVPTXISD::TexUnified1DFloatS32:
+    return "NVPTXISD::TexUnified1DFloatS32";
+  case NVPTXISD::TexUnified1DFloatFloat:
+    return "NVPTXISD::TexUnified1DFloatFloat";
+  case NVPTXISD::TexUnified1DFloatFloatLevel:
+    return "NVPTXISD::TexUnified1DFloatFloatLevel";
+  case NVPTXISD::TexUnified1DFloatFloatGrad:
+    return "NVPTXISD::TexUnified1DFloatFloatGrad";
+  case NVPTXISD::TexUnified1DS32S32:
+    return "NVPTXISD::TexUnified1DS32S32";
+  case NVPTXISD::TexUnified1DS32Float:
+    return "NVPTXISD::TexUnified1DS32Float";
+  case NVPTXISD::TexUnified1DS32FloatLevel:
+    return "NVPTXISD::TexUnified1DS32FloatLevel";
+  case NVPTXISD::TexUnified1DS32FloatGrad:
+    return "NVPTXISD::TexUnified1DS32FloatGrad";
+  case NVPTXISD::TexUnified1DU32S32:
+    return "NVPTXISD::TexUnified1DU32S32";
+  case NVPTXISD::TexUnified1DU32Float:
+    return "NVPTXISD::TexUnified1DU32Float";
+  case NVPTXISD::TexUnified1DU32FloatLevel:
+    return "NVPTXISD::TexUnified1DU32FloatLevel";
+  case NVPTXISD::TexUnified1DU32FloatGrad:
+    return "NVPTXISD::TexUnified1DU32FloatGrad";
+  case NVPTXISD::TexUnified1DArrayFloatS32:
+    return "NVPTXISD::TexUnified1DArrayFloatS32";
+  case NVPTXISD::TexUnified1DArrayFloatFloat:
+    return "NVPTXISD::TexUnified1DArrayFloatFloat";
+  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
+  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
+  case NVPTXISD::TexUnified1DArrayS32S32:
+    return "NVPTXISD::TexUnified1DArrayS32S32";
+  case NVPTXISD::TexUnified1DArrayS32Float:
+    return "NVPTXISD::TexUnified1DArrayS32Float";
+  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
+  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
+  case NVPTXISD::TexUnified1DArrayU32S32:
+    return "NVPTXISD::TexUnified1DArrayU32S32";
+  case NVPTXISD::TexUnified1DArrayU32Float:
+    return "NVPTXISD::TexUnified1DArrayU32Float";
+  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
+  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
+  case NVPTXISD::TexUnified2DFloatS32:
+    return "NVPTXISD::TexUnified2DFloatS32";
+  case NVPTXISD::TexUnified2DFloatFloat:
+    return "NVPTXISD::TexUnified2DFloatFloat";
+  case NVPTXISD::TexUnified2DFloatFloatLevel:
+    return "NVPTXISD::TexUnified2DFloatFloatLevel";
+  case NVPTXISD::TexUnified2DFloatFloatGrad:
+    return "NVPTXISD::TexUnified2DFloatFloatGrad";
+  case NVPTXISD::TexUnified2DS32S32:
+    return "NVPTXISD::TexUnified2DS32S32";
+  case NVPTXISD::TexUnified2DS32Float:
+    return "NVPTXISD::TexUnified2DS32Float";
+  case NVPTXISD::TexUnified2DS32FloatLevel:
+    return "NVPTXISD::TexUnified2DS32FloatLevel";
+  case NVPTXISD::TexUnified2DS32FloatGrad:
+    return "NVPTXISD::TexUnified2DS32FloatGrad";
+  case NVPTXISD::TexUnified2DU32S32:
+    return "NVPTXISD::TexUnified2DU32S32";
+  case NVPTXISD::TexUnified2DU32Float:
+    return "NVPTXISD::TexUnified2DU32Float";
+  case NVPTXISD::TexUnified2DU32FloatLevel:
+    return "NVPTXISD::TexUnified2DU32FloatLevel";
+  case NVPTXISD::TexUnified2DU32FloatGrad:
+    return "NVPTXISD::TexUnified2DU32FloatGrad";
+  case NVPTXISD::TexUnified2DArrayFloatS32:
+    return "NVPTXISD::TexUnified2DArrayFloatS32";
+  case NVPTXISD::TexUnified2DArrayFloatFloat:
+    return "NVPTXISD::TexUnified2DArrayFloatFloat";
+  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
+  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
+  case NVPTXISD::TexUnified2DArrayS32S32:
+    return "NVPTXISD::TexUnified2DArrayS32S32";
+  case NVPTXISD::TexUnified2DArrayS32Float:
+    return "NVPTXISD::TexUnified2DArrayS32Float";
+  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
+  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
+  case NVPTXISD::TexUnified2DArrayU32S32:
+    return "NVPTXISD::TexUnified2DArrayU32S32";
+  case NVPTXISD::TexUnified2DArrayU32Float:
+    return "NVPTXISD::TexUnified2DArrayU32Float";
+  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
+  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
+  case NVPTXISD::TexUnified3DFloatS32:
+    return "NVPTXISD::TexUnified3DFloatS32";
+  case NVPTXISD::TexUnified3DFloatFloat:
+    return "NVPTXISD::TexUnified3DFloatFloat";
+  case NVPTXISD::TexUnified3DFloatFloatLevel:
+    return "NVPTXISD::TexUnified3DFloatFloatLevel";
+  case NVPTXISD::TexUnified3DFloatFloatGrad:
+    return "NVPTXISD::TexUnified3DFloatFloatGrad";
+  case NVPTXISD::TexUnified3DS32S32:
+    return "NVPTXISD::TexUnified3DS32S32";
+  case NVPTXISD::TexUnified3DS32Float:
+    return "NVPTXISD::TexUnified3DS32Float";
+  case NVPTXISD::TexUnified3DS32FloatLevel:
+    return "NVPTXISD::TexUnified3DS32FloatLevel";
+  case NVPTXISD::TexUnified3DS32FloatGrad:
+    return "NVPTXISD::TexUnified3DS32FloatGrad";
+  case NVPTXISD::TexUnified3DU32S32:
+    return "NVPTXISD::TexUnified3DU32S32";
+  case NVPTXISD::TexUnified3DU32Float:
+    return "NVPTXISD::TexUnified3DU32Float";
+  case NVPTXISD::TexUnified3DU32FloatLevel:
+    return "NVPTXISD::TexUnified3DU32FloatLevel";
+  case NVPTXISD::TexUnified3DU32FloatGrad:
+    return "NVPTXISD::TexUnified3DU32FloatGrad";
+  case NVPTXISD::TexUnifiedCubeFloatFloat:
+    return "NVPTXISD::TexUnifiedCubeFloatFloat";
+  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
+  case NVPTXISD::TexUnifiedCubeS32Float:
+    return "NVPTXISD::TexUnifiedCubeS32Float";
+  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
+  case NVPTXISD::TexUnifiedCubeU32Float:
+    return "NVPTXISD::TexUnifiedCubeU32Float";
+  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
+  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
+  case NVPTXISD::TexUnifiedCubeArrayS32Float:
+    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
+  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
+  case NVPTXISD::TexUnifiedCubeArrayU32Float:
+    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
+  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
+  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
+  case NVPTXISD::Tld4UnifiedR2DS64Float:
+    return "NVPTXISD::Tld4UnifiedR2DS64Float";
+  case NVPTXISD::Tld4UnifiedG2DS64Float:
+    return "NVPTXISD::Tld4UnifiedG2DS64Float";
+  case NVPTXISD::Tld4UnifiedB2DS64Float:
+    return "NVPTXISD::Tld4UnifiedB2DS64Float";
+  case NVPTXISD::Tld4UnifiedA2DS64Float:
+    return "NVPTXISD::Tld4UnifiedA2DS64Float";
+  case NVPTXISD::Tld4UnifiedR2DU64Float:
+    return "NVPTXISD::Tld4UnifiedR2DU64Float";
+  case NVPTXISD::Tld4UnifiedG2DU64Float:
+    return "NVPTXISD::Tld4UnifiedG2DU64Float";
+  case NVPTXISD::Tld4UnifiedB2DU64Float:
+    return "NVPTXISD::Tld4UnifiedB2DU64Float";
+  case NVPTXISD::Tld4UnifiedA2DU64Float:
+    return "NVPTXISD::Tld4UnifiedA2DU64Float";
+
+  case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
+  case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
+  case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
+  case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
+  case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
+  case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
+  case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
+  case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
+  case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
+  case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
+  case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
+
+  case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
+  case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
+  case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
+  case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
+  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
+  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
+  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
+  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
+  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
+  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
+  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
+
+  case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
+  case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
+  case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
+  case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
+  case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
+  case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
+  case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
+  case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
+  case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
+  case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
+  case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
+
+  case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
+  case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
+  case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
+  case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
+  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
+  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
+  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
+  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
+  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
+  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
+  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
+
+  case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
+  case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
+  case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
+  case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
+  case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
+  case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
+  case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
+  case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
+  case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
+  case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
+  case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
 
   case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
   case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
   case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
+  case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
   case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
   case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
   case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
+  case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
   case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
   case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
   case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
@@ -434,9 +749,11 @@
   case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
   case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
   case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
+  case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
   case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
   case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
   case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
+  case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
   case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
   case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
   case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
@@ -444,9 +761,11 @@
   case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
   case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
   case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
+  case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
   case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
   case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
   case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
+  case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
   case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
   case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
   case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
@@ -454,9 +773,11 @@
   case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
   case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
   case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
+  case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
   case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
   case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
   case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
+  case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
   case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
   case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
   case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
@@ -464,12 +785,74 @@
   case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
   case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
   case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
+  case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
   case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
   case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
   case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
+  case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
   case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
   case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
   case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
+
+  case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
+  case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
+  case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
+  case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
+  case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
+  case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
+  case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
+  case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
+  case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
+  case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
+  case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
+
+  case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
+  case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
+  case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
+  case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
+  case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
+  case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
+  case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
+  case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
+  case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
+  case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
+  case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
+
+  case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
+  case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
+  case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
+  case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
+  case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
+  case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
+  case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
+  case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
+  case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
+  case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
+  case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
+
+  case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
+  case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
+  case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
+  case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
+  case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
+  case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
+  case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
+  case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
+  case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
+  case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
+  case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
+
+  case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
+  case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
+  case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
+  case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
+  case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
+  case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
+  case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
+  case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
+  case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
+  case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
+  case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
   }
 }
 
@@ -972,7 +1355,12 @@
     //  .param .align 16 .b8 retval0[<size-in-bytes>], or
     //  .param .b<size-in-bits> retval0
     unsigned resultsz = TD->getTypeAllocSizeInBits(retTy);
-    if (retTy->isSingleValueType()) {
+    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
+    // these three types to match the logic in
+    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
+    // Plus, this behavior is consistent with nvcc's.
+    if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
+        retTy->isPointerTy()) {
       // Scalar needs to be at least 32bit wide
       if (resultsz < 32)
         resultsz = 32;
@@ -1068,8 +1456,8 @@
       EVT ObjectVT = getValueType(retTy);
       unsigned NumElts = ObjectVT.getVectorNumElements();
       EVT EltVT = ObjectVT.getVectorElementType();
-      assert(nvTM->getTargetLowering()->getNumRegisters(F->getContext(),
-                                                        ObjectVT) == NumElts &&
+      assert(nvTM->getSubtargetImpl()->getTargetLowering()->getNumRegisters(
+                 F->getContext(), ObjectVT) == NumElts &&
              "Vector was not scalarized");
       unsigned sz = EltVT.getSizeInBits();
       bool needTruncate = sz < 8 ? true : false;
@@ -1494,6 +1882,21 @@
       break;
     }
 
+    MemSDNode *MemSD = cast<MemSDNode>(N);
+    const DataLayout *TD = getDataLayout();
+
+    unsigned Align = MemSD->getAlignment();
+    unsigned PrefAlign =
+      TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+    if (Align < PrefAlign) {
+      // This store is not sufficiently aligned, so bail out and let this vector
+      // store be scalarized.  Note that we may still be able to emit smaller
+      // vector stores.  For example, if we are storing a <4 x float> with an
+      // alignment of 8, this check will fail but the legalizer will try again
+      // with 2 x <2 x float>, which will succeed with an alignment of 8.
+      return SDValue();
+    }
+
     unsigned Opcode = 0;
     EVT EltVT = ValVT.getVectorElementType();
     unsigned NumElts = ValVT.getVectorNumElements();
@@ -1536,8 +1939,6 @@
       Ops.push_back(N->getOperand(i));
     }
 
-    MemSDNode *MemSD = cast<MemSDNode>(N);
-
     SDValue NewSt = DAG.getMemIntrinsicNode(
         Opcode, DL, DAG.getVTList(MVT::Other), Ops,
         MemSD->getMemoryVT(), MemSD->getMemOperand());
@@ -1632,7 +2033,7 @@
 
   const Function *F = MF.getFunction();
   const AttributeSet &PAL = F->getAttributes();
-  const TargetLowering *TLI = DAG.getTarget().getTargetLowering();
+  const TargetLowering *TLI = DAG.getSubtarget().getTargetLowering();
 
   SDValue Root = DAG.getRoot();
   std::vector<SDValue> OutChains;
@@ -1746,7 +2147,7 @@
                                      ISD::SEXTLOAD : ISD::ZEXTLOAD;
             p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
                                MachinePointerInfo(srcValue), partVT, false,
-                               false, partAlign);
+                               false, false, partAlign);
           } else {
             p = DAG.getLoad(partVT, dl, Root, srcAddr,
                             MachinePointerInfo(srcValue), false, false, false,
@@ -1767,7 +2168,6 @@
         unsigned NumElts = ObjectVT.getVectorNumElements();
         assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
                "Vector was not scalarized");
-        unsigned Ofst = 0;
         EVT EltVT = ObjectVT.getVectorElementType();
 
         // V1 load
@@ -1776,10 +2176,8 @@
           // We only have one element, so just directly load it
           Value *SrcValue = Constant::getNullValue(PointerType::get(
               EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
-          SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
-                                        DAG.getConstant(Ofst, getPointerTy()));
           SDValue P = DAG.getLoad(
-              EltVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+              EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false,
               false, true,
               TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
           if (P.getNode())
@@ -1788,7 +2186,6 @@
           if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
             P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
           InVals.push_back(P);
-          Ofst += TD->getTypeAllocSize(EltVT.getTypeForEVT(F->getContext()));
           ++InsIdx;
         } else if (NumElts == 2) {
           // V2 load
@@ -1796,10 +2193,8 @@
           EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
           Value *SrcValue = Constant::getNullValue(PointerType::get(
               VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
-          SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
-                                        DAG.getConstant(Ofst, getPointerTy()));
           SDValue P = DAG.getLoad(
-              VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+              VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false,
               false, true,
               TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
           if (P.getNode())
@@ -1817,7 +2212,6 @@
 
           InVals.push_back(Elt0);
           InVals.push_back(Elt1);
-          Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
           InsIdx += 2;
         } else {
           // V4 loads
@@ -1835,6 +2229,7 @@
             VecSize = 2;
           }
           EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+          unsigned Ofst = 0;
           for (unsigned i = 0; i < NumElts; i += VecSize) {
             Value *SrcValue = Constant::getNullValue(
                 PointerType::get(VecVT.getTypeForEVT(F->getContext()),
@@ -1879,6 +2274,7 @@
                                        ISD::SEXTLOAD : ISD::ZEXTLOAD;
         p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, Arg,
                            MachinePointerInfo(srcValue), ObjectVT, false, false,
+                           false,
         TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
       } else {
         p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg,
@@ -2132,90 +2528,357 @@
   default:
     return 0;
 
-  case Intrinsic::nvvm_tex_1d_v4f32_i32:
-    return NVPTXISD::Tex1DFloatI32;
+  case Intrinsic::nvvm_tex_1d_v4f32_s32:
+    return NVPTXISD::Tex1DFloatS32;
   case Intrinsic::nvvm_tex_1d_v4f32_f32:
     return NVPTXISD::Tex1DFloatFloat;
   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
     return NVPTXISD::Tex1DFloatFloatLevel;
   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
     return NVPTXISD::Tex1DFloatFloatGrad;
-  case Intrinsic::nvvm_tex_1d_v4i32_i32:
-    return NVPTXISD::Tex1DI32I32;
-  case Intrinsic::nvvm_tex_1d_v4i32_f32:
-    return NVPTXISD::Tex1DI32Float;
-  case Intrinsic::nvvm_tex_1d_level_v4i32_f32:
-    return NVPTXISD::Tex1DI32FloatLevel;
-  case Intrinsic::nvvm_tex_1d_grad_v4i32_f32:
-    return NVPTXISD::Tex1DI32FloatGrad;
+  case Intrinsic::nvvm_tex_1d_v4s32_s32:
+    return NVPTXISD::Tex1DS32S32;
+  case Intrinsic::nvvm_tex_1d_v4s32_f32:
+    return NVPTXISD::Tex1DS32Float;
+  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+    return NVPTXISD::Tex1DS32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+    return NVPTXISD::Tex1DS32FloatGrad;
+  case Intrinsic::nvvm_tex_1d_v4u32_s32:
+    return NVPTXISD::Tex1DU32S32;
+  case Intrinsic::nvvm_tex_1d_v4u32_f32:
+    return NVPTXISD::Tex1DU32Float;
+  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+    return NVPTXISD::Tex1DU32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+    return NVPTXISD::Tex1DU32FloatGrad;
 
-  case Intrinsic::nvvm_tex_1d_array_v4f32_i32:
-    return NVPTXISD::Tex1DArrayFloatI32;
+  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+    return NVPTXISD::Tex1DArrayFloatS32;
   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
     return NVPTXISD::Tex1DArrayFloatFloat;
   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
     return NVPTXISD::Tex1DArrayFloatFloatLevel;
   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
     return NVPTXISD::Tex1DArrayFloatFloatGrad;
-  case Intrinsic::nvvm_tex_1d_array_v4i32_i32:
-    return NVPTXISD::Tex1DArrayI32I32;
-  case Intrinsic::nvvm_tex_1d_array_v4i32_f32:
-    return NVPTXISD::Tex1DArrayI32Float;
-  case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32:
-    return NVPTXISD::Tex1DArrayI32FloatLevel;
-  case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32:
-    return NVPTXISD::Tex1DArrayI32FloatGrad;
+  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+    return NVPTXISD::Tex1DArrayS32S32;
+  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+    return NVPTXISD::Tex1DArrayS32Float;
+  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+    return NVPTXISD::Tex1DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+    return NVPTXISD::Tex1DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+    return NVPTXISD::Tex1DArrayU32S32;
+  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+    return NVPTXISD::Tex1DArrayU32Float;
+  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+    return NVPTXISD::Tex1DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+    return NVPTXISD::Tex1DArrayU32FloatGrad;
 
-  case Intrinsic::nvvm_tex_2d_v4f32_i32:
-    return NVPTXISD::Tex2DFloatI32;
+  case Intrinsic::nvvm_tex_2d_v4f32_s32:
+    return NVPTXISD::Tex2DFloatS32;
   case Intrinsic::nvvm_tex_2d_v4f32_f32:
     return NVPTXISD::Tex2DFloatFloat;
   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
     return NVPTXISD::Tex2DFloatFloatLevel;
   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
     return NVPTXISD::Tex2DFloatFloatGrad;
-  case Intrinsic::nvvm_tex_2d_v4i32_i32:
-    return NVPTXISD::Tex2DI32I32;
-  case Intrinsic::nvvm_tex_2d_v4i32_f32:
-    return NVPTXISD::Tex2DI32Float;
-  case Intrinsic::nvvm_tex_2d_level_v4i32_f32:
-    return NVPTXISD::Tex2DI32FloatLevel;
-  case Intrinsic::nvvm_tex_2d_grad_v4i32_f32:
-    return NVPTXISD::Tex2DI32FloatGrad;
+  case Intrinsic::nvvm_tex_2d_v4s32_s32:
+    return NVPTXISD::Tex2DS32S32;
+  case Intrinsic::nvvm_tex_2d_v4s32_f32:
+    return NVPTXISD::Tex2DS32Float;
+  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+    return NVPTXISD::Tex2DS32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+    return NVPTXISD::Tex2DS32FloatGrad;
+  case Intrinsic::nvvm_tex_2d_v4u32_s32:
+    return NVPTXISD::Tex2DU32S32;
+  case Intrinsic::nvvm_tex_2d_v4u32_f32:
+    return NVPTXISD::Tex2DU32Float;
+  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+    return NVPTXISD::Tex2DU32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+    return NVPTXISD::Tex2DU32FloatGrad;
 
-  case Intrinsic::nvvm_tex_2d_array_v4f32_i32:
-    return NVPTXISD::Tex2DArrayFloatI32;
+  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+    return NVPTXISD::Tex2DArrayFloatS32;
   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
     return NVPTXISD::Tex2DArrayFloatFloat;
   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
     return NVPTXISD::Tex2DArrayFloatFloatLevel;
   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
     return NVPTXISD::Tex2DArrayFloatFloatGrad;
-  case Intrinsic::nvvm_tex_2d_array_v4i32_i32:
-    return NVPTXISD::Tex2DArrayI32I32;
-  case Intrinsic::nvvm_tex_2d_array_v4i32_f32:
-    return NVPTXISD::Tex2DArrayI32Float;
-  case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32:
-    return NVPTXISD::Tex2DArrayI32FloatLevel;
-  case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32:
-    return NVPTXISD::Tex2DArrayI32FloatGrad;
+  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+    return NVPTXISD::Tex2DArrayS32S32;
+  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+    return NVPTXISD::Tex2DArrayS32Float;
+  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+    return NVPTXISD::Tex2DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+    return NVPTXISD::Tex2DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+    return NVPTXISD::Tex2DArrayU32S32;
+  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+    return NVPTXISD::Tex2DArrayU32Float;
+  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+    return NVPTXISD::Tex2DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+    return NVPTXISD::Tex2DArrayU32FloatGrad;
 
-  case Intrinsic::nvvm_tex_3d_v4f32_i32:
-    return NVPTXISD::Tex3DFloatI32;
+  case Intrinsic::nvvm_tex_3d_v4f32_s32:
+    return NVPTXISD::Tex3DFloatS32;
   case Intrinsic::nvvm_tex_3d_v4f32_f32:
     return NVPTXISD::Tex3DFloatFloat;
   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
     return NVPTXISD::Tex3DFloatFloatLevel;
   case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
     return NVPTXISD::Tex3DFloatFloatGrad;
-  case Intrinsic::nvvm_tex_3d_v4i32_i32:
-    return NVPTXISD::Tex3DI32I32;
-  case Intrinsic::nvvm_tex_3d_v4i32_f32:
-    return NVPTXISD::Tex3DI32Float;
-  case Intrinsic::nvvm_tex_3d_level_v4i32_f32:
-    return NVPTXISD::Tex3DI32FloatLevel;
-  case Intrinsic::nvvm_tex_3d_grad_v4i32_f32:
-    return NVPTXISD::Tex3DI32FloatGrad;
+  case Intrinsic::nvvm_tex_3d_v4s32_s32:
+    return NVPTXISD::Tex3DS32S32;
+  case Intrinsic::nvvm_tex_3d_v4s32_f32:
+    return NVPTXISD::Tex3DS32Float;
+  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+    return NVPTXISD::Tex3DS32FloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+    return NVPTXISD::Tex3DS32FloatGrad;
+  case Intrinsic::nvvm_tex_3d_v4u32_s32:
+    return NVPTXISD::Tex3DU32S32;
+  case Intrinsic::nvvm_tex_3d_v4u32_f32:
+    return NVPTXISD::Tex3DU32Float;
+  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+    return NVPTXISD::Tex3DU32FloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+    return NVPTXISD::Tex3DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_cube_v4f32_f32:
+    return NVPTXISD::TexCubeFloatFloat;
+  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+    return NVPTXISD::TexCubeFloatFloatLevel;
+  case Intrinsic::nvvm_tex_cube_v4s32_f32:
+    return NVPTXISD::TexCubeS32Float;
+  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+    return NVPTXISD::TexCubeS32FloatLevel;
+  case Intrinsic::nvvm_tex_cube_v4u32_f32:
+    return NVPTXISD::TexCubeU32Float;
+  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+    return NVPTXISD::TexCubeU32FloatLevel;
+
+  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+    return NVPTXISD::TexCubeArrayFloatFloat;
+  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+    return NVPTXISD::TexCubeArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+    return NVPTXISD::TexCubeArrayS32Float;
+  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+    return NVPTXISD::TexCubeArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+    return NVPTXISD::TexCubeArrayU32Float;
+  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+    return NVPTXISD::TexCubeArrayU32FloatLevel;
+
+  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+    return NVPTXISD::Tld4R2DFloatFloat;
+  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+    return NVPTXISD::Tld4G2DFloatFloat;
+  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+    return NVPTXISD::Tld4B2DFloatFloat;
+  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+    return NVPTXISD::Tld4A2DFloatFloat;
+  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+    return NVPTXISD::Tld4R2DS64Float;
+  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+    return NVPTXISD::Tld4G2DS64Float;
+  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+    return NVPTXISD::Tld4B2DS64Float;
+  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+    return NVPTXISD::Tld4A2DS64Float;
+  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+    return NVPTXISD::Tld4R2DU64Float;
+  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+    return NVPTXISD::Tld4G2DU64Float;
+  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+    return NVPTXISD::Tld4B2DU64Float;
+  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+    return NVPTXISD::Tld4A2DU64Float;
+
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+    return NVPTXISD::TexUnified1DFloatS32;
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+    return NVPTXISD::TexUnified1DFloatFloat;
+  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+    return NVPTXISD::TexUnified1DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+    return NVPTXISD::TexUnified1DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+    return NVPTXISD::TexUnified1DS32S32;
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+    return NVPTXISD::TexUnified1DS32Float;
+  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+    return NVPTXISD::TexUnified1DS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+    return NVPTXISD::TexUnified1DS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+    return NVPTXISD::TexUnified1DU32S32;
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+    return NVPTXISD::TexUnified1DU32Float;
+  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+    return NVPTXISD::TexUnified1DU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+    return NVPTXISD::TexUnified1DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+    return NVPTXISD::TexUnified1DArrayFloatS32;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+    return NVPTXISD::TexUnified1DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+    return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+    return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+    return NVPTXISD::TexUnified1DArrayS32S32;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+    return NVPTXISD::TexUnified1DArrayS32Float;
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+    return NVPTXISD::TexUnified1DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+    return NVPTXISD::TexUnified1DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+    return NVPTXISD::TexUnified1DArrayU32S32;
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+    return NVPTXISD::TexUnified1DArrayU32Float;
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+    return NVPTXISD::TexUnified1DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+    return NVPTXISD::TexUnified1DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+    return NVPTXISD::TexUnified2DFloatS32;
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+    return NVPTXISD::TexUnified2DFloatFloat;
+  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+    return NVPTXISD::TexUnified2DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+    return NVPTXISD::TexUnified2DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+    return NVPTXISD::TexUnified2DS32S32;
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+    return NVPTXISD::TexUnified2DS32Float;
+  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+    return NVPTXISD::TexUnified2DS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+    return NVPTXISD::TexUnified2DS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+    return NVPTXISD::TexUnified2DU32S32;
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+    return NVPTXISD::TexUnified2DU32Float;
+  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+    return NVPTXISD::TexUnified2DU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+    return NVPTXISD::TexUnified2DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+    return NVPTXISD::TexUnified2DArrayFloatS32;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+    return NVPTXISD::TexUnified2DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+    return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+    return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+    return NVPTXISD::TexUnified2DArrayS32S32;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+    return NVPTXISD::TexUnified2DArrayS32Float;
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+    return NVPTXISD::TexUnified2DArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+    return NVPTXISD::TexUnified2DArrayS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+    return NVPTXISD::TexUnified2DArrayU32S32;
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+    return NVPTXISD::TexUnified2DArrayU32Float;
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+    return NVPTXISD::TexUnified2DArrayU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+    return NVPTXISD::TexUnified2DArrayU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+    return NVPTXISD::TexUnified3DFloatS32;
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+    return NVPTXISD::TexUnified3DFloatFloat;
+  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+    return NVPTXISD::TexUnified3DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+    return NVPTXISD::TexUnified3DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+    return NVPTXISD::TexUnified3DS32S32;
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+    return NVPTXISD::TexUnified3DS32Float;
+  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+    return NVPTXISD::TexUnified3DS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+    return NVPTXISD::TexUnified3DS32FloatGrad;
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+    return NVPTXISD::TexUnified3DU32S32;
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+    return NVPTXISD::TexUnified3DU32Float;
+  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+    return NVPTXISD::TexUnified3DU32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+    return NVPTXISD::TexUnified3DU32FloatGrad;
+
+  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeFloatFloat;
+  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeS32Float;
+  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeU32Float;
+  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeU32FloatLevel;
+
+  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayS32Float;
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
+  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayU32Float;
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+    return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
+
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedR2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedG2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedB2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
+    return NVPTXISD::Tld4UnifiedA2DFloatFloat;
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedR2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedG2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedB2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+    return NVPTXISD::Tld4UnifiedA2DS64Float;
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedR2DU64Float;
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedG2DU64Float;
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedB2DU64Float;
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
+    return NVPTXISD::Tld4UnifiedA2DU64Float;
   }
 }
 
@@ -2223,18 +2886,132 @@
   switch (Intrinsic) {
   default:
     return 0;
+  case Intrinsic::nvvm_suld_1d_i8_clamp:
+    return NVPTXISD::Suld1DI8Clamp;
+  case Intrinsic::nvvm_suld_1d_i16_clamp:
+    return NVPTXISD::Suld1DI16Clamp;
+  case Intrinsic::nvvm_suld_1d_i32_clamp:
+    return NVPTXISD::Suld1DI32Clamp;
+  case Intrinsic::nvvm_suld_1d_i64_clamp:
+    return NVPTXISD::Suld1DI64Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+    return NVPTXISD::Suld1DV2I8Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+    return NVPTXISD::Suld1DV2I16Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+    return NVPTXISD::Suld1DV2I32Clamp;
+  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+    return NVPTXISD::Suld1DV2I64Clamp;
+  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+    return NVPTXISD::Suld1DV4I8Clamp;
+  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+    return NVPTXISD::Suld1DV4I16Clamp;
+  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+    return NVPTXISD::Suld1DV4I32Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+    return NVPTXISD::Suld1DArrayI8Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+    return NVPTXISD::Suld1DArrayI16Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+    return NVPTXISD::Suld1DArrayI32Clamp;
+  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+    return NVPTXISD::Suld1DArrayI64Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+    return NVPTXISD::Suld1DArrayV2I8Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+    return NVPTXISD::Suld1DArrayV2I16Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+    return NVPTXISD::Suld1DArrayV2I32Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+    return NVPTXISD::Suld1DArrayV2I64Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+    return NVPTXISD::Suld1DArrayV4I8Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+    return NVPTXISD::Suld1DArrayV4I16Clamp;
+  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+    return NVPTXISD::Suld1DArrayV4I32Clamp;
+  case Intrinsic::nvvm_suld_2d_i8_clamp:
+    return NVPTXISD::Suld2DI8Clamp;
+  case Intrinsic::nvvm_suld_2d_i16_clamp:
+    return NVPTXISD::Suld2DI16Clamp;
+  case Intrinsic::nvvm_suld_2d_i32_clamp:
+    return NVPTXISD::Suld2DI32Clamp;
+  case Intrinsic::nvvm_suld_2d_i64_clamp:
+    return NVPTXISD::Suld2DI64Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+    return NVPTXISD::Suld2DV2I8Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+    return NVPTXISD::Suld2DV2I16Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+    return NVPTXISD::Suld2DV2I32Clamp;
+  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+    return NVPTXISD::Suld2DV2I64Clamp;
+  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+    return NVPTXISD::Suld2DV4I8Clamp;
+  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+    return NVPTXISD::Suld2DV4I16Clamp;
+  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+    return NVPTXISD::Suld2DV4I32Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+    return NVPTXISD::Suld2DArrayI8Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+    return NVPTXISD::Suld2DArrayI16Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+    return NVPTXISD::Suld2DArrayI32Clamp;
+  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+    return NVPTXISD::Suld2DArrayI64Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+    return NVPTXISD::Suld2DArrayV2I8Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+    return NVPTXISD::Suld2DArrayV2I16Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+    return NVPTXISD::Suld2DArrayV2I32Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+    return NVPTXISD::Suld2DArrayV2I64Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+    return NVPTXISD::Suld2DArrayV4I8Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+    return NVPTXISD::Suld2DArrayV4I16Clamp;
+  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+    return NVPTXISD::Suld2DArrayV4I32Clamp;
+  case Intrinsic::nvvm_suld_3d_i8_clamp:
+    return NVPTXISD::Suld3DI8Clamp;
+  case Intrinsic::nvvm_suld_3d_i16_clamp:
+    return NVPTXISD::Suld3DI16Clamp;
+  case Intrinsic::nvvm_suld_3d_i32_clamp:
+    return NVPTXISD::Suld3DI32Clamp;
+  case Intrinsic::nvvm_suld_3d_i64_clamp:
+    return NVPTXISD::Suld3DI64Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+    return NVPTXISD::Suld3DV2I8Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+    return NVPTXISD::Suld3DV2I16Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+    return NVPTXISD::Suld3DV2I32Clamp;
+  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+    return NVPTXISD::Suld3DV2I64Clamp;
+  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+    return NVPTXISD::Suld3DV4I8Clamp;
+  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+    return NVPTXISD::Suld3DV4I16Clamp;
+  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+    return NVPTXISD::Suld3DV4I32Clamp;
   case Intrinsic::nvvm_suld_1d_i8_trap:
     return NVPTXISD::Suld1DI8Trap;
   case Intrinsic::nvvm_suld_1d_i16_trap:
     return NVPTXISD::Suld1DI16Trap;
   case Intrinsic::nvvm_suld_1d_i32_trap:
     return NVPTXISD::Suld1DI32Trap;
+  case Intrinsic::nvvm_suld_1d_i64_trap:
+    return NVPTXISD::Suld1DI64Trap;
   case Intrinsic::nvvm_suld_1d_v2i8_trap:
     return NVPTXISD::Suld1DV2I8Trap;
   case Intrinsic::nvvm_suld_1d_v2i16_trap:
     return NVPTXISD::Suld1DV2I16Trap;
   case Intrinsic::nvvm_suld_1d_v2i32_trap:
     return NVPTXISD::Suld1DV2I32Trap;
+  case Intrinsic::nvvm_suld_1d_v2i64_trap:
+    return NVPTXISD::Suld1DV2I64Trap;
   case Intrinsic::nvvm_suld_1d_v4i8_trap:
     return NVPTXISD::Suld1DV4I8Trap;
   case Intrinsic::nvvm_suld_1d_v4i16_trap:
@@ -2247,12 +3024,16 @@
     return NVPTXISD::Suld1DArrayI16Trap;
   case Intrinsic::nvvm_suld_1d_array_i32_trap:
     return NVPTXISD::Suld1DArrayI32Trap;
+  case Intrinsic::nvvm_suld_1d_array_i64_trap:
+    return NVPTXISD::Suld1DArrayI64Trap;
   case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
     return NVPTXISD::Suld1DArrayV2I8Trap;
   case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
     return NVPTXISD::Suld1DArrayV2I16Trap;
   case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
     return NVPTXISD::Suld1DArrayV2I32Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+    return NVPTXISD::Suld1DArrayV2I64Trap;
   case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
     return NVPTXISD::Suld1DArrayV4I8Trap;
   case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
@@ -2265,12 +3046,16 @@
     return NVPTXISD::Suld2DI16Trap;
   case Intrinsic::nvvm_suld_2d_i32_trap:
     return NVPTXISD::Suld2DI32Trap;
+  case Intrinsic::nvvm_suld_2d_i64_trap:
+    return NVPTXISD::Suld2DI64Trap;
   case Intrinsic::nvvm_suld_2d_v2i8_trap:
     return NVPTXISD::Suld2DV2I8Trap;
   case Intrinsic::nvvm_suld_2d_v2i16_trap:
     return NVPTXISD::Suld2DV2I16Trap;
   case Intrinsic::nvvm_suld_2d_v2i32_trap:
     return NVPTXISD::Suld2DV2I32Trap;
+  case Intrinsic::nvvm_suld_2d_v2i64_trap:
+    return NVPTXISD::Suld2DV2I64Trap;
   case Intrinsic::nvvm_suld_2d_v4i8_trap:
     return NVPTXISD::Suld2DV4I8Trap;
   case Intrinsic::nvvm_suld_2d_v4i16_trap:
@@ -2283,12 +3068,16 @@
     return NVPTXISD::Suld2DArrayI16Trap;
   case Intrinsic::nvvm_suld_2d_array_i32_trap:
     return NVPTXISD::Suld2DArrayI32Trap;
+  case Intrinsic::nvvm_suld_2d_array_i64_trap:
+    return NVPTXISD::Suld2DArrayI64Trap;
   case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
     return NVPTXISD::Suld2DArrayV2I8Trap;
   case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
     return NVPTXISD::Suld2DArrayV2I16Trap;
   case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
     return NVPTXISD::Suld2DArrayV2I32Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+    return NVPTXISD::Suld2DArrayV2I64Trap;
   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
     return NVPTXISD::Suld2DArrayV4I8Trap;
   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
@@ -2301,18 +3090,132 @@
     return NVPTXISD::Suld3DI16Trap;
   case Intrinsic::nvvm_suld_3d_i32_trap:
     return NVPTXISD::Suld3DI32Trap;
+  case Intrinsic::nvvm_suld_3d_i64_trap:
+    return NVPTXISD::Suld3DI64Trap;
   case Intrinsic::nvvm_suld_3d_v2i8_trap:
     return NVPTXISD::Suld3DV2I8Trap;
   case Intrinsic::nvvm_suld_3d_v2i16_trap:
     return NVPTXISD::Suld3DV2I16Trap;
   case Intrinsic::nvvm_suld_3d_v2i32_trap:
     return NVPTXISD::Suld3DV2I32Trap;
+  case Intrinsic::nvvm_suld_3d_v2i64_trap:
+    return NVPTXISD::Suld3DV2I64Trap;
   case Intrinsic::nvvm_suld_3d_v4i8_trap:
     return NVPTXISD::Suld3DV4I8Trap;
   case Intrinsic::nvvm_suld_3d_v4i16_trap:
     return NVPTXISD::Suld3DV4I16Trap;
   case Intrinsic::nvvm_suld_3d_v4i32_trap:
     return NVPTXISD::Suld3DV4I32Trap;
+  case Intrinsic::nvvm_suld_1d_i8_zero:
+    return NVPTXISD::Suld1DI8Zero;
+  case Intrinsic::nvvm_suld_1d_i16_zero:
+    return NVPTXISD::Suld1DI16Zero;
+  case Intrinsic::nvvm_suld_1d_i32_zero:
+    return NVPTXISD::Suld1DI32Zero;
+  case Intrinsic::nvvm_suld_1d_i64_zero:
+    return NVPTXISD::Suld1DI64Zero;
+  case Intrinsic::nvvm_suld_1d_v2i8_zero:
+    return NVPTXISD::Suld1DV2I8Zero;
+  case Intrinsic::nvvm_suld_1d_v2i16_zero:
+    return NVPTXISD::Suld1DV2I16Zero;
+  case Intrinsic::nvvm_suld_1d_v2i32_zero:
+    return NVPTXISD::Suld1DV2I32Zero;
+  case Intrinsic::nvvm_suld_1d_v2i64_zero:
+    return NVPTXISD::Suld1DV2I64Zero;
+  case Intrinsic::nvvm_suld_1d_v4i8_zero:
+    return NVPTXISD::Suld1DV4I8Zero;
+  case Intrinsic::nvvm_suld_1d_v4i16_zero:
+    return NVPTXISD::Suld1DV4I16Zero;
+  case Intrinsic::nvvm_suld_1d_v4i32_zero:
+    return NVPTXISD::Suld1DV4I32Zero;
+  case Intrinsic::nvvm_suld_1d_array_i8_zero:
+    return NVPTXISD::Suld1DArrayI8Zero;
+  case Intrinsic::nvvm_suld_1d_array_i16_zero:
+    return NVPTXISD::Suld1DArrayI16Zero;
+  case Intrinsic::nvvm_suld_1d_array_i32_zero:
+    return NVPTXISD::Suld1DArrayI32Zero;
+  case Intrinsic::nvvm_suld_1d_array_i64_zero:
+    return NVPTXISD::Suld1DArrayI64Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+    return NVPTXISD::Suld1DArrayV2I8Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+    return NVPTXISD::Suld1DArrayV2I16Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+    return NVPTXISD::Suld1DArrayV2I32Zero;
+  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+    return NVPTXISD::Suld1DArrayV2I64Zero;
+  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+    return NVPTXISD::Suld1DArrayV4I8Zero;
+  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+    return NVPTXISD::Suld1DArrayV4I16Zero;
+  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+    return NVPTXISD::Suld1DArrayV4I32Zero;
+  case Intrinsic::nvvm_suld_2d_i8_zero:
+    return NVPTXISD::Suld2DI8Zero;
+  case Intrinsic::nvvm_suld_2d_i16_zero:
+    return NVPTXISD::Suld2DI16Zero;
+  case Intrinsic::nvvm_suld_2d_i32_zero:
+    return NVPTXISD::Suld2DI32Zero;
+  case Intrinsic::nvvm_suld_2d_i64_zero:
+    return NVPTXISD::Suld2DI64Zero;
+  case Intrinsic::nvvm_suld_2d_v2i8_zero:
+    return NVPTXISD::Suld2DV2I8Zero;
+  case Intrinsic::nvvm_suld_2d_v2i16_zero:
+    return NVPTXISD::Suld2DV2I16Zero;
+  case Intrinsic::nvvm_suld_2d_v2i32_zero:
+    return NVPTXISD::Suld2DV2I32Zero;
+  case Intrinsic::nvvm_suld_2d_v2i64_zero:
+    return NVPTXISD::Suld2DV2I64Zero;
+  case Intrinsic::nvvm_suld_2d_v4i8_zero:
+    return NVPTXISD::Suld2DV4I8Zero;
+  case Intrinsic::nvvm_suld_2d_v4i16_zero:
+    return NVPTXISD::Suld2DV4I16Zero;
+  case Intrinsic::nvvm_suld_2d_v4i32_zero:
+    return NVPTXISD::Suld2DV4I32Zero;
+  case Intrinsic::nvvm_suld_2d_array_i8_zero:
+    return NVPTXISD::Suld2DArrayI8Zero;
+  case Intrinsic::nvvm_suld_2d_array_i16_zero:
+    return NVPTXISD::Suld2DArrayI16Zero;
+  case Intrinsic::nvvm_suld_2d_array_i32_zero:
+    return NVPTXISD::Suld2DArrayI32Zero;
+  case Intrinsic::nvvm_suld_2d_array_i64_zero:
+    return NVPTXISD::Suld2DArrayI64Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+    return NVPTXISD::Suld2DArrayV2I8Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+    return NVPTXISD::Suld2DArrayV2I16Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+    return NVPTXISD::Suld2DArrayV2I32Zero;
+  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+    return NVPTXISD::Suld2DArrayV2I64Zero;
+  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+    return NVPTXISD::Suld2DArrayV4I8Zero;
+  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+    return NVPTXISD::Suld2DArrayV4I16Zero;
+  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+    return NVPTXISD::Suld2DArrayV4I32Zero;
+  case Intrinsic::nvvm_suld_3d_i8_zero:
+    return NVPTXISD::Suld3DI8Zero;
+  case Intrinsic::nvvm_suld_3d_i16_zero:
+    return NVPTXISD::Suld3DI16Zero;
+  case Intrinsic::nvvm_suld_3d_i32_zero:
+    return NVPTXISD::Suld3DI32Zero;
+  case Intrinsic::nvvm_suld_3d_i64_zero:
+    return NVPTXISD::Suld3DI64Zero;
+  case Intrinsic::nvvm_suld_3d_v2i8_zero:
+    return NVPTXISD::Suld3DV2I8Zero;
+  case Intrinsic::nvvm_suld_3d_v2i16_zero:
+    return NVPTXISD::Suld3DV2I16Zero;
+  case Intrinsic::nvvm_suld_3d_v2i32_zero:
+    return NVPTXISD::Suld3DV2I32Zero;
+  case Intrinsic::nvvm_suld_3d_v2i64_zero:
+    return NVPTXISD::Suld3DV2I64Zero;
+  case Intrinsic::nvvm_suld_3d_v4i8_zero:
+    return NVPTXISD::Suld3DV4I8Zero;
+  case Intrinsic::nvvm_suld_3d_v4i16_zero:
+    return NVPTXISD::Suld3DV4I16Zero;
+  case Intrinsic::nvvm_suld_3d_v4i32_zero:
+    return NVPTXISD::Suld3DV4I32Zero;
   }
 }
 
@@ -2366,16 +3269,7 @@
     Info.vol = 0;
     Info.readMem = true;
     Info.writeMem = false;
-
-    // alignment is available as metadata.
-    // Grab it and set the alignment.
-    assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
-    MDNode *AlignMD = I.getMetadata("align");
-    assert(AlignMD && "Must have a non-null MDNode");
-    assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
-    Value *Align = AlignMD->getOperand(0);
-    int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
-    Info.align = Alignment;
+    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
 
     return true;
   }
@@ -2395,42 +3289,69 @@
     Info.vol = 0;
     Info.readMem = true;
     Info.writeMem = false;
-
-    // alignment is available as metadata.
-    // Grab it and set the alignment.
-    assert(I.hasMetadataOtherThanDebugLoc() && "Must have alignment metadata");
-    MDNode *AlignMD = I.getMetadata("align");
-    assert(AlignMD && "Must have a non-null MDNode");
-    assert(AlignMD->getNumOperands() == 1 && "Must have a single operand");
-    Value *Align = AlignMD->getOperand(0);
-    int64_t Alignment = cast<ConstantInt>(Align)->getZExtValue();
-    Info.align = Alignment;
+    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
 
     return true;
   }
 
-  case Intrinsic::nvvm_tex_1d_v4f32_i32:
+  case Intrinsic::nvvm_tex_1d_v4f32_s32:
   case Intrinsic::nvvm_tex_1d_v4f32_f32:
   case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
   case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
-  case Intrinsic::nvvm_tex_1d_array_v4f32_i32:
+  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
   case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
   case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
   case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
-  case Intrinsic::nvvm_tex_2d_v4f32_i32:
+  case Intrinsic::nvvm_tex_2d_v4f32_s32:
   case Intrinsic::nvvm_tex_2d_v4f32_f32:
   case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
   case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
-  case Intrinsic::nvvm_tex_2d_array_v4f32_i32:
+  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
   case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
   case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
   case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
-  case Intrinsic::nvvm_tex_3d_v4f32_i32:
+  case Intrinsic::nvvm_tex_3d_v4f32_s32:
   case Intrinsic::nvvm_tex_3d_v4f32_f32:
   case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
-  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: {
+  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
     Info.opc = getOpcForTextureInstr(Intrinsic);
-    Info.memVT = MVT::f32;
+    Info.memVT = MVT::v4f32;
     Info.ptrVal = nullptr;
     Info.offset = 0;
     Info.vol = 0;
@@ -2439,28 +3360,120 @@
     Info.align = 16;
     return true;
   }
-  case Intrinsic::nvvm_tex_1d_v4i32_i32:
-  case Intrinsic::nvvm_tex_1d_v4i32_f32:
-  case Intrinsic::nvvm_tex_1d_level_v4i32_f32:
-  case Intrinsic::nvvm_tex_1d_grad_v4i32_f32:
-  case Intrinsic::nvvm_tex_1d_array_v4i32_i32:
-  case Intrinsic::nvvm_tex_1d_array_v4i32_f32:
-  case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32:
-  case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32:
-  case Intrinsic::nvvm_tex_2d_v4i32_i32:
-  case Intrinsic::nvvm_tex_2d_v4i32_f32:
-  case Intrinsic::nvvm_tex_2d_level_v4i32_f32:
-  case Intrinsic::nvvm_tex_2d_grad_v4i32_f32:
-  case Intrinsic::nvvm_tex_2d_array_v4i32_i32:
-  case Intrinsic::nvvm_tex_2d_array_v4i32_f32:
-  case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32:
-  case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32:
-  case Intrinsic::nvvm_tex_3d_v4i32_i32:
-  case Intrinsic::nvvm_tex_3d_v4i32_f32:
-  case Intrinsic::nvvm_tex_3d_level_v4i32_f32:
-  case Intrinsic::nvvm_tex_3d_grad_v4i32_f32: {
+  case Intrinsic::nvvm_tex_1d_v4s32_s32:
+  case Intrinsic::nvvm_tex_1d_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_v4s32_s32:
+  case Intrinsic::nvvm_tex_2d_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_3d_v4s32_s32:
+  case Intrinsic::nvvm_tex_3d_v4s32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_cube_v4u32_f32:
+  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_v4u32_s32:
+  case Intrinsic::nvvm_tex_1d_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_v4u32_s32:
+  case Intrinsic::nvvm_tex_2d_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_3d_v4u32_s32:
+  case Intrinsic::nvvm_tex_3d_v4u32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
     Info.opc = getOpcForTextureInstr(Intrinsic);
-    Info.memVT = MVT::i32;
+    Info.memVT = MVT::v4i32;
     Info.ptrVal = nullptr;
     Info.offset = 0;
     Info.vol = 0;
@@ -2469,6 +3482,21 @@
     Info.align = 16;
     return true;
   }
+  case Intrinsic::nvvm_suld_1d_i8_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+  case Intrinsic::nvvm_suld_2d_i8_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+  case Intrinsic::nvvm_suld_3d_i8_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
   case Intrinsic::nvvm_suld_1d_i8_trap:
   case Intrinsic::nvvm_suld_1d_v2i8_trap:
   case Intrinsic::nvvm_suld_1d_v4i8_trap:
@@ -2483,7 +3511,22 @@
   case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
   case Intrinsic::nvvm_suld_3d_i8_trap:
   case Intrinsic::nvvm_suld_3d_v2i8_trap:
-  case Intrinsic::nvvm_suld_3d_v4i8_trap: {
+  case Intrinsic::nvvm_suld_3d_v4i8_trap:
+  case Intrinsic::nvvm_suld_1d_i8_zero:
+  case Intrinsic::nvvm_suld_1d_v2i8_zero:
+  case Intrinsic::nvvm_suld_1d_v4i8_zero:
+  case Intrinsic::nvvm_suld_1d_array_i8_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+  case Intrinsic::nvvm_suld_2d_i8_zero:
+  case Intrinsic::nvvm_suld_2d_v2i8_zero:
+  case Intrinsic::nvvm_suld_2d_v4i8_zero:
+  case Intrinsic::nvvm_suld_2d_array_i8_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+  case Intrinsic::nvvm_suld_3d_i8_zero:
+  case Intrinsic::nvvm_suld_3d_v2i8_zero:
+  case Intrinsic::nvvm_suld_3d_v4i8_zero: {
     Info.opc = getOpcForSurfaceInstr(Intrinsic);
     Info.memVT = MVT::i8;
     Info.ptrVal = nullptr;
@@ -2494,6 +3537,21 @@
     Info.align = 16;
     return true;
   }
+  case Intrinsic::nvvm_suld_1d_i16_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+  case Intrinsic::nvvm_suld_2d_i16_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+  case Intrinsic::nvvm_suld_3d_i16_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
   case Intrinsic::nvvm_suld_1d_i16_trap:
   case Intrinsic::nvvm_suld_1d_v2i16_trap:
   case Intrinsic::nvvm_suld_1d_v4i16_trap:
@@ -2508,7 +3566,22 @@
   case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
   case Intrinsic::nvvm_suld_3d_i16_trap:
   case Intrinsic::nvvm_suld_3d_v2i16_trap:
-  case Intrinsic::nvvm_suld_3d_v4i16_trap: {
+  case Intrinsic::nvvm_suld_3d_v4i16_trap:
+  case Intrinsic::nvvm_suld_1d_i16_zero:
+  case Intrinsic::nvvm_suld_1d_v2i16_zero:
+  case Intrinsic::nvvm_suld_1d_v4i16_zero:
+  case Intrinsic::nvvm_suld_1d_array_i16_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+  case Intrinsic::nvvm_suld_2d_i16_zero:
+  case Intrinsic::nvvm_suld_2d_v2i16_zero:
+  case Intrinsic::nvvm_suld_2d_v4i16_zero:
+  case Intrinsic::nvvm_suld_2d_array_i16_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+  case Intrinsic::nvvm_suld_3d_i16_zero:
+  case Intrinsic::nvvm_suld_3d_v2i16_zero:
+  case Intrinsic::nvvm_suld_3d_v4i16_zero: {
     Info.opc = getOpcForSurfaceInstr(Intrinsic);
     Info.memVT = MVT::i16;
     Info.ptrVal = nullptr;
@@ -2519,6 +3592,21 @@
     Info.align = 16;
     return true;
   }
+  case Intrinsic::nvvm_suld_1d_i32_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+  case Intrinsic::nvvm_suld_2d_i32_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+  case Intrinsic::nvvm_suld_3d_i32_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
   case Intrinsic::nvvm_suld_1d_i32_trap:
   case Intrinsic::nvvm_suld_1d_v2i32_trap:
   case Intrinsic::nvvm_suld_1d_v4i32_trap:
@@ -2533,7 +3621,22 @@
   case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
   case Intrinsic::nvvm_suld_3d_i32_trap:
   case Intrinsic::nvvm_suld_3d_v2i32_trap:
-  case Intrinsic::nvvm_suld_3d_v4i32_trap: {
+  case Intrinsic::nvvm_suld_3d_v4i32_trap:
+  case Intrinsic::nvvm_suld_1d_i32_zero:
+  case Intrinsic::nvvm_suld_1d_v2i32_zero:
+  case Intrinsic::nvvm_suld_1d_v4i32_zero:
+  case Intrinsic::nvvm_suld_1d_array_i32_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+  case Intrinsic::nvvm_suld_2d_i32_zero:
+  case Intrinsic::nvvm_suld_2d_v2i32_zero:
+  case Intrinsic::nvvm_suld_2d_v4i32_zero:
+  case Intrinsic::nvvm_suld_2d_array_i32_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+  case Intrinsic::nvvm_suld_3d_i32_zero:
+  case Intrinsic::nvvm_suld_3d_v2i32_zero:
+  case Intrinsic::nvvm_suld_3d_v4i32_zero: {
     Info.opc = getOpcForSurfaceInstr(Intrinsic);
     Info.memVT = MVT::i32;
     Info.ptrVal = nullptr;
@@ -2544,7 +3647,46 @@
     Info.align = 16;
     return true;
   }
-
+  case Intrinsic::nvvm_suld_1d_i64_clamp:
+  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+  case Intrinsic::nvvm_suld_2d_i64_clamp:
+  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+  case Intrinsic::nvvm_suld_3d_i64_clamp:
+  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+  case Intrinsic::nvvm_suld_1d_i64_trap:
+  case Intrinsic::nvvm_suld_1d_v2i64_trap:
+  case Intrinsic::nvvm_suld_1d_array_i64_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+  case Intrinsic::nvvm_suld_2d_i64_trap:
+  case Intrinsic::nvvm_suld_2d_v2i64_trap:
+  case Intrinsic::nvvm_suld_2d_array_i64_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+  case Intrinsic::nvvm_suld_3d_i64_trap:
+  case Intrinsic::nvvm_suld_3d_v2i64_trap:
+  case Intrinsic::nvvm_suld_1d_i64_zero:
+  case Intrinsic::nvvm_suld_1d_v2i64_zero:
+  case Intrinsic::nvvm_suld_1d_array_i64_zero:
+  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+  case Intrinsic::nvvm_suld_2d_i64_zero:
+  case Intrinsic::nvvm_suld_2d_v2i64_zero:
+  case Intrinsic::nvvm_suld_2d_array_i64_zero:
+  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+  case Intrinsic::nvvm_suld_3d_i64_zero:
+  case Intrinsic::nvvm_suld_3d_v2i64_zero: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i64;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
   }
   return false;
 }
@@ -2648,7 +3790,31 @@
 //                         NVPTX DAG Combining
 //===----------------------------------------------------------------------===//
 
-extern unsigned FMAContractLevel;
+/// Returns true when floating-point operation fusion (FMA contraction) is
+/// permitted for \p MF at \p OptLevel. Checks run in priority order, and
+/// the answer is false unless some source explicitly allows fusion.
+bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
+                                   CodeGenOpt::Level OptLevel) const {
+  // An explicit command-line setting overrides every other consideration.
+  if (FMAContractLevelOpt.getNumOccurrences() > 0)
+    return FMAContractLevelOpt > 0;
+
+  // Never contract when the code is not being optimized.
+  if (OptLevel == 0)
+    return false;
+
+  // TargetOptions may explicitly state that fusion is acceptable.
+  const TargetOptions &Opts = MF.getTarget().Options;
+  if (Opts.AllowFPOpFusion == FPOpFusion::Fast || Opts.UnsafeFPMath)
+    return true;
+
+  // Otherwise allow fusion only when the function carries the
+  // "unsafe-fp-math"="true" attribute (as emitted by Clang).
+  const Function *Fn = MF.getFunction();
+  if (!Fn->hasFnAttribute("unsafe-fp-math"))
+    return false;
+  return Fn->getFnAttribute("unsafe-fp-math").getValueAsString() == "true";
+}
 
 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
@@ -2682,7 +3848,9 @@
   }
   else if (N0.getOpcode() == ISD::FMUL) {
     if (VT == MVT::f32 || VT == MVT::f64) {
-      if (FMAContractLevel == 0)
+      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
+          &DAG.getTargetLoweringInfo());
+      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
         return SDValue();
 
       // For floating point:
@@ -2867,13 +4035,13 @@
   if (Op.getOpcode() == ISD::SIGN_EXTEND ||
       Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
     EVT OrigVT = Op.getOperand(0).getValueType();
-    if (OrigVT.getSizeInBits() == OptSize) {
+    if (OrigVT.getSizeInBits() <= OptSize) {
       S = Signed;
       return true;
     }
   } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
     EVT OrigVT = Op.getOperand(0).getValueType();
-    if (OrigVT.getSizeInBits() == OptSize) {
+    if (OrigVT.getSizeInBits() <= OptSize) {
       S = Unsigned;
       return true;
     }
@@ -3027,8 +4195,7 @@
 
 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
-  // FIXME: Get this from the DAG somehow
-  CodeGenOpt::Level OptLevel = CodeGenOpt::Aggressive;
+  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
   switch (N->getOpcode()) {
     default: break;
     case ISD::ADD:
@@ -3046,6 +4213,7 @@
 
 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                              const DataLayout *TD,
                               SmallVectorImpl<SDValue> &Results) {
   EVT ResVT = N->getValueType(0);
   SDLoc DL(N);
@@ -3073,6 +4241,20 @@
     break;
   }
 
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+
+  unsigned Align = LD->getAlignment();
+  unsigned PrefAlign =
+    TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+  if (Align < PrefAlign) {
+    // This load is not sufficiently aligned, so bail out and let this vector
+    // load be scalarized.  Note that we may still be able to emit smaller
+    // vector loads.  For example, if we are loading a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return;
+  }
+
   EVT EltVT = ResVT.getVectorElementType();
   unsigned NumElts = ResVT.getVectorNumElements();
 
@@ -3109,8 +4291,6 @@
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
     OtherOps.push_back(N->getOperand(i));
 
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-
   // The select routine does not have access to the LoadSDNode instance, so
   // pass along the extension information
   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
@@ -3283,7 +4463,7 @@
   default:
     report_fatal_error("Unhandled custom legalization");
   case ISD::LOAD:
-    ReplaceLoadVector(N, DAG, Results);
+    ReplaceLoadVector(N, DAG, getDataLayout(), Results);
     return;
   case ISD::INTRINSIC_W_CHAIN:
     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
@@ -3316,3 +4496,10 @@
   delete DwarfRangesSection;
   delete DwarfMacroInfoSection;
 }
+
+// Every global is placed in the data section; the arguments are not consulted.
+const MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
+    const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+    const TargetMachine &TM) const {
+  return getDataSection();
+}

diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 7b4026d..d66d81a 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTXISELLOWERING_H
-#define NVPTXISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXISELLOWERING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXISELLOWERING_H
 
 #include "NVPTX.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -77,54 +77,244 @@
   StoreRetvalV4,
 
   // Texture intrinsics
-  Tex1DFloatI32,
+  Tex1DFloatS32,
   Tex1DFloatFloat,
   Tex1DFloatFloatLevel,
   Tex1DFloatFloatGrad,
-  Tex1DI32I32,
-  Tex1DI32Float,
-  Tex1DI32FloatLevel,
-  Tex1DI32FloatGrad,
-  Tex1DArrayFloatI32,
+  Tex1DS32S32,
+  Tex1DS32Float,
+  Tex1DS32FloatLevel,
+  Tex1DS32FloatGrad,
+  Tex1DU32S32,
+  Tex1DU32Float,
+  Tex1DU32FloatLevel,
+  Tex1DU32FloatGrad,
+  Tex1DArrayFloatS32,
   Tex1DArrayFloatFloat,
   Tex1DArrayFloatFloatLevel,
   Tex1DArrayFloatFloatGrad,
-  Tex1DArrayI32I32,
-  Tex1DArrayI32Float,
-  Tex1DArrayI32FloatLevel,
-  Tex1DArrayI32FloatGrad,
-  Tex2DFloatI32,
+  Tex1DArrayS32S32,
+  Tex1DArrayS32Float,
+  Tex1DArrayS32FloatLevel,
+  Tex1DArrayS32FloatGrad,
+  Tex1DArrayU32S32,
+  Tex1DArrayU32Float,
+  Tex1DArrayU32FloatLevel,
+  Tex1DArrayU32FloatGrad,
+  Tex2DFloatS32,
   Tex2DFloatFloat,
   Tex2DFloatFloatLevel,
   Tex2DFloatFloatGrad,
-  Tex2DI32I32,
-  Tex2DI32Float,
-  Tex2DI32FloatLevel,
-  Tex2DI32FloatGrad,
-  Tex2DArrayFloatI32,
+  Tex2DS32S32,
+  Tex2DS32Float,
+  Tex2DS32FloatLevel,
+  Tex2DS32FloatGrad,
+  Tex2DU32S32,
+  Tex2DU32Float,
+  Tex2DU32FloatLevel,
+  Tex2DU32FloatGrad,
+  Tex2DArrayFloatS32,
   Tex2DArrayFloatFloat,
   Tex2DArrayFloatFloatLevel,
   Tex2DArrayFloatFloatGrad,
-  Tex2DArrayI32I32,
-  Tex2DArrayI32Float,
-  Tex2DArrayI32FloatLevel,
-  Tex2DArrayI32FloatGrad,
-  Tex3DFloatI32,
+  Tex2DArrayS32S32,
+  Tex2DArrayS32Float,
+  Tex2DArrayS32FloatLevel,
+  Tex2DArrayS32FloatGrad,
+  Tex2DArrayU32S32,
+  Tex2DArrayU32Float,
+  Tex2DArrayU32FloatLevel,
+  Tex2DArrayU32FloatGrad,
+  Tex3DFloatS32,
   Tex3DFloatFloat,
   Tex3DFloatFloatLevel,
   Tex3DFloatFloatGrad,
-  Tex3DI32I32,
-  Tex3DI32Float,
-  Tex3DI32FloatLevel,
-  Tex3DI32FloatGrad,
+  Tex3DS32S32,
+  Tex3DS32Float,
+  Tex3DS32FloatLevel,
+  Tex3DS32FloatGrad,
+  Tex3DU32S32,
+  Tex3DU32Float,
+  Tex3DU32FloatLevel,
+  Tex3DU32FloatGrad,
+  TexCubeFloatFloat,
+  TexCubeFloatFloatLevel,
+  TexCubeS32Float,
+  TexCubeS32FloatLevel,
+  TexCubeU32Float,
+  TexCubeU32FloatLevel,
+  TexCubeArrayFloatFloat,
+  TexCubeArrayFloatFloatLevel,
+  TexCubeArrayS32Float,
+  TexCubeArrayS32FloatLevel,
+  TexCubeArrayU32Float,
+  TexCubeArrayU32FloatLevel,
+  Tld4R2DFloatFloat,
+  Tld4G2DFloatFloat,
+  Tld4B2DFloatFloat,
+  Tld4A2DFloatFloat,
+  Tld4R2DS64Float,
+  Tld4G2DS64Float,
+  Tld4B2DS64Float,
+  Tld4A2DS64Float,
+  Tld4R2DU64Float,
+  Tld4G2DU64Float,
+  Tld4B2DU64Float,
+  Tld4A2DU64Float,
+  TexUnified1DFloatS32,
+  TexUnified1DFloatFloat,
+  TexUnified1DFloatFloatLevel,
+  TexUnified1DFloatFloatGrad,
+  TexUnified1DS32S32,
+  TexUnified1DS32Float,
+  TexUnified1DS32FloatLevel,
+  TexUnified1DS32FloatGrad,
+  TexUnified1DU32S32,
+  TexUnified1DU32Float,
+  TexUnified1DU32FloatLevel,
+  TexUnified1DU32FloatGrad,
+  TexUnified1DArrayFloatS32,
+  TexUnified1DArrayFloatFloat,
+  TexUnified1DArrayFloatFloatLevel,
+  TexUnified1DArrayFloatFloatGrad,
+  TexUnified1DArrayS32S32,
+  TexUnified1DArrayS32Float,
+  TexUnified1DArrayS32FloatLevel,
+  TexUnified1DArrayS32FloatGrad,
+  TexUnified1DArrayU32S32,
+  TexUnified1DArrayU32Float,
+  TexUnified1DArrayU32FloatLevel,
+  TexUnified1DArrayU32FloatGrad,
+  TexUnified2DFloatS32,
+  TexUnified2DFloatFloat,
+  TexUnified2DFloatFloatLevel,
+  TexUnified2DFloatFloatGrad,
+  TexUnified2DS32S32,
+  TexUnified2DS32Float,
+  TexUnified2DS32FloatLevel,
+  TexUnified2DS32FloatGrad,
+  TexUnified2DU32S32,
+  TexUnified2DU32Float,
+  TexUnified2DU32FloatLevel,
+  TexUnified2DU32FloatGrad,
+  TexUnified2DArrayFloatS32,
+  TexUnified2DArrayFloatFloat,
+  TexUnified2DArrayFloatFloatLevel,
+  TexUnified2DArrayFloatFloatGrad,
+  TexUnified2DArrayS32S32,
+  TexUnified2DArrayS32Float,
+  TexUnified2DArrayS32FloatLevel,
+  TexUnified2DArrayS32FloatGrad,
+  TexUnified2DArrayU32S32,
+  TexUnified2DArrayU32Float,
+  TexUnified2DArrayU32FloatLevel,
+  TexUnified2DArrayU32FloatGrad,
+  TexUnified3DFloatS32,
+  TexUnified3DFloatFloat,
+  TexUnified3DFloatFloatLevel,
+  TexUnified3DFloatFloatGrad,
+  TexUnified3DS32S32,
+  TexUnified3DS32Float,
+  TexUnified3DS32FloatLevel,
+  TexUnified3DS32FloatGrad,
+  TexUnified3DU32S32,
+  TexUnified3DU32Float,
+  TexUnified3DU32FloatLevel,
+  TexUnified3DU32FloatGrad,
+  TexUnifiedCubeFloatFloat,
+  TexUnifiedCubeFloatFloatLevel,
+  TexUnifiedCubeS32Float,
+  TexUnifiedCubeS32FloatLevel,
+  TexUnifiedCubeU32Float,
+  TexUnifiedCubeU32FloatLevel,
+  TexUnifiedCubeArrayFloatFloat,
+  TexUnifiedCubeArrayFloatFloatLevel,
+  TexUnifiedCubeArrayS32Float,
+  TexUnifiedCubeArrayS32FloatLevel,
+  TexUnifiedCubeArrayU32Float,
+  TexUnifiedCubeArrayU32FloatLevel,
+  Tld4UnifiedR2DFloatFloat,
+  Tld4UnifiedG2DFloatFloat,
+  Tld4UnifiedB2DFloatFloat,
+  Tld4UnifiedA2DFloatFloat,
+  Tld4UnifiedR2DS64Float,
+  Tld4UnifiedG2DS64Float,
+  Tld4UnifiedB2DS64Float,
+  Tld4UnifiedA2DS64Float,
+  Tld4UnifiedR2DU64Float,
+  Tld4UnifiedG2DU64Float,
+  Tld4UnifiedB2DU64Float,
+  Tld4UnifiedA2DU64Float,
 
   // Surface intrinsics
+  Suld1DI8Clamp,
+  Suld1DI16Clamp,
+  Suld1DI32Clamp,
+  Suld1DI64Clamp,
+  Suld1DV2I8Clamp,
+  Suld1DV2I16Clamp,
+  Suld1DV2I32Clamp,
+  Suld1DV2I64Clamp,
+  Suld1DV4I8Clamp,
+  Suld1DV4I16Clamp,
+  Suld1DV4I32Clamp,
+
+  Suld1DArrayI8Clamp,
+  Suld1DArrayI16Clamp,
+  Suld1DArrayI32Clamp,
+  Suld1DArrayI64Clamp,
+  Suld1DArrayV2I8Clamp,
+  Suld1DArrayV2I16Clamp,
+  Suld1DArrayV2I32Clamp,
+  Suld1DArrayV2I64Clamp,
+  Suld1DArrayV4I8Clamp,
+  Suld1DArrayV4I16Clamp,
+  Suld1DArrayV4I32Clamp,
+
+  Suld2DI8Clamp,
+  Suld2DI16Clamp,
+  Suld2DI32Clamp,
+  Suld2DI64Clamp,
+  Suld2DV2I8Clamp,
+  Suld2DV2I16Clamp,
+  Suld2DV2I32Clamp,
+  Suld2DV2I64Clamp,
+  Suld2DV4I8Clamp,
+  Suld2DV4I16Clamp,
+  Suld2DV4I32Clamp,
+
+  Suld2DArrayI8Clamp,
+  Suld2DArrayI16Clamp,
+  Suld2DArrayI32Clamp,
+  Suld2DArrayI64Clamp,
+  Suld2DArrayV2I8Clamp,
+  Suld2DArrayV2I16Clamp,
+  Suld2DArrayV2I32Clamp,
+  Suld2DArrayV2I64Clamp,
+  Suld2DArrayV4I8Clamp,
+  Suld2DArrayV4I16Clamp,
+  Suld2DArrayV4I32Clamp,
+
+  Suld3DI8Clamp,
+  Suld3DI16Clamp,
+  Suld3DI32Clamp,
+  Suld3DI64Clamp,
+  Suld3DV2I8Clamp,
+  Suld3DV2I16Clamp,
+  Suld3DV2I32Clamp,
+  Suld3DV2I64Clamp,
+  Suld3DV4I8Clamp,
+  Suld3DV4I16Clamp,
+  Suld3DV4I32Clamp,
+
   Suld1DI8Trap,
   Suld1DI16Trap,
   Suld1DI32Trap,
+  Suld1DI64Trap,
   Suld1DV2I8Trap,
   Suld1DV2I16Trap,
   Suld1DV2I32Trap,
+  Suld1DV2I64Trap,
   Suld1DV4I8Trap,
   Suld1DV4I16Trap,
   Suld1DV4I32Trap,
@@ -132,9 +322,11 @@
   Suld1DArrayI8Trap,
   Suld1DArrayI16Trap,
   Suld1DArrayI32Trap,
+  Suld1DArrayI64Trap,
   Suld1DArrayV2I8Trap,
   Suld1DArrayV2I16Trap,
   Suld1DArrayV2I32Trap,
+  Suld1DArrayV2I64Trap,
   Suld1DArrayV4I8Trap,
   Suld1DArrayV4I16Trap,
   Suld1DArrayV4I32Trap,
@@ -142,9 +334,11 @@
   Suld2DI8Trap,
   Suld2DI16Trap,
   Suld2DI32Trap,
+  Suld2DI64Trap,
   Suld2DV2I8Trap,
   Suld2DV2I16Trap,
   Suld2DV2I32Trap,
+  Suld2DV2I64Trap,
   Suld2DV4I8Trap,
   Suld2DV4I16Trap,
   Suld2DV4I32Trap,
@@ -152,9 +346,11 @@
   Suld2DArrayI8Trap,
   Suld2DArrayI16Trap,
   Suld2DArrayI32Trap,
+  Suld2DArrayI64Trap,
   Suld2DArrayV2I8Trap,
   Suld2DArrayV2I16Trap,
   Suld2DArrayV2I32Trap,
+  Suld2DArrayV2I64Trap,
   Suld2DArrayV4I8Trap,
   Suld2DArrayV4I16Trap,
   Suld2DArrayV4I32Trap,
@@ -162,12 +358,74 @@
   Suld3DI8Trap,
   Suld3DI16Trap,
   Suld3DI32Trap,
+  Suld3DI64Trap,
   Suld3DV2I8Trap,
   Suld3DV2I16Trap,
   Suld3DV2I32Trap,
+  Suld3DV2I64Trap,
   Suld3DV4I8Trap,
   Suld3DV4I16Trap,
-  Suld3DV4I32Trap
+  Suld3DV4I32Trap,
+
+  Suld1DI8Zero,
+  Suld1DI16Zero,
+  Suld1DI32Zero,
+  Suld1DI64Zero,
+  Suld1DV2I8Zero,
+  Suld1DV2I16Zero,
+  Suld1DV2I32Zero,
+  Suld1DV2I64Zero,
+  Suld1DV4I8Zero,
+  Suld1DV4I16Zero,
+  Suld1DV4I32Zero,
+
+  Suld1DArrayI8Zero,
+  Suld1DArrayI16Zero,
+  Suld1DArrayI32Zero,
+  Suld1DArrayI64Zero,
+  Suld1DArrayV2I8Zero,
+  Suld1DArrayV2I16Zero,
+  Suld1DArrayV2I32Zero,
+  Suld1DArrayV2I64Zero,
+  Suld1DArrayV4I8Zero,
+  Suld1DArrayV4I16Zero,
+  Suld1DArrayV4I32Zero,
+
+  Suld2DI8Zero,
+  Suld2DI16Zero,
+  Suld2DI32Zero,
+  Suld2DI64Zero,
+  Suld2DV2I8Zero,
+  Suld2DV2I16Zero,
+  Suld2DV2I32Zero,
+  Suld2DV2I64Zero,
+  Suld2DV4I8Zero,
+  Suld2DV4I16Zero,
+  Suld2DV4I32Zero,
+
+  Suld2DArrayI8Zero,
+  Suld2DArrayI16Zero,
+  Suld2DArrayI32Zero,
+  Suld2DArrayI64Zero,
+  Suld2DArrayV2I8Zero,
+  Suld2DArrayV2I16Zero,
+  Suld2DArrayV2I32Zero,
+  Suld2DArrayV2I64Zero,
+  Suld2DArrayV4I8Zero,
+  Suld2DArrayV4I16Zero,
+  Suld2DArrayV4I32Zero,
+
+  Suld3DI8Zero,
+  Suld3DI16Zero,
+  Suld3DI32Zero,
+  Suld3DI64Zero,
+  Suld3DV2I8Zero,
+  Suld3DV2I16Zero,
+  Suld3DV2I32Zero,
+  Suld3DV2I64Zero,
+  Suld3DV4I8Zero,
+  Suld3DV4I16Zero,
+  Suld3DV4I32Zero
 };
 }
 
@@ -178,7 +436,7 @@
 //===--------------------------------------------------------------------===//
 class NVPTXTargetLowering : public TargetLowering {
 public:
-  explicit NVPTXTargetLowering(NVPTXTargetMachine &TM);
+  explicit NVPTXTargetLowering(const NVPTXTargetMachine &TM);
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
   SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -237,7 +495,7 @@
                                     std::vector<SDValue> &Ops,
                                     SelectionDAG &DAG) const override;
 
-  NVPTXTargetMachine *nvTM;
+  const NVPTXTargetMachine *nvTM;
 
   // PTX always uses 32-bit shift amounts
   MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
@@ -245,6 +503,10 @@
   TargetLoweringBase::LegalizeTypeAction
   getPreferredVectorAction(EVT VT) const override;
 
+  bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+
+  bool isFMAFasterThanFMulAndFAdd(EVT) const override { return true; }
+
 private:
   const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
 
@@ -274,4 +536,4 @@
 };
 } // namespace llvm
 
-#endif // NVPTXISELLOWERING_H
+#endif

diff --git a/lib/Target/NVPTX/NVPTXInstrFormats.td b/lib/Target/NVPTX/NVPTXInstrFormats.td
index f11f1b8..ffcb5d5 100644
--- a/lib/Target/NVPTX/NVPTXInstrFormats.td
+++ b/lib/Target/NVPTX/NVPTXInstrFormats.td

@@ -36,8 +36,24 @@
   bit IsLoad = 0;
   bit IsStore = 0;
 
-  let TSFlags{3-0} = VecInstType;
-  let TSFlags{4-4} = IsSimpleMove;
-  let TSFlags{5-5} = IsLoad;
-  let TSFlags{6-6} = IsStore;
+  bit IsTex = 0;
+  bit IsSust = 0;
+  bit IsSurfTexQuery = 0;
+  bit IsTexModeUnified = 0;
+
+  // The following field is encoded as log2 of the vector size plus one,
+  // with 0 meaning the operation is not a surface load instruction.  E.g.,
+  // if IsSuld == 2, then the instruction is a suld instruction with vector size
+  // 2**(2-1) = 2.
+  bits<2> IsSuld = 0;
+
+  let TSFlags{3-0}   = VecInstType;
+  let TSFlags{4-4}   = IsSimpleMove;
+  let TSFlags{5-5}   = IsLoad;
+  let TSFlags{6-6}   = IsStore;
+  let TSFlags{7}     = IsTex;
+  let TSFlags{9-8}   = IsSuld;
+  let TSFlags{10}    = IsSust;
+  let TSFlags{11}    = IsSurfTexQuery;
+  let TSFlags{12}    = IsTexModeUnified;
 }

diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 2ac2974..6de7536 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTXINSTRUCTIONINFO_H
-#define NVPTXINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXINSTRINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXINSTRINFO_H
 
 #include "NVPTX.h"
 #include "NVPTXRegisterInfo.h"

diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index d2c0373..9900b8c 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td

@@ -139,17 +139,10 @@
 def doF32FTZ : Predicate<"useF32FTZ()">;
 def doNoF32FTZ : Predicate<"!useF32FTZ()">;
 
-def doFMAF32      : Predicate<"doFMAF32">;
-def doFMAF32_ftz  : Predicate<"(doFMAF32 && useF32FTZ())">;
-def doFMAF32AGG      : Predicate<"doFMAF32AGG">;
-def doFMAF32AGG_ftz  : Predicate<"(doFMAF32AGG && useF32FTZ())">;
-def doFMAF64      : Predicate<"doFMAF64">;
-def doFMAF64AGG      : Predicate<"doFMAF64AGG">;
-
 def doMulWide      : Predicate<"doMulWide">;
 
-def allowFMA : Predicate<"allowFMA">;
-def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">;
+def allowFMA : Predicate<"allowFMA()">;
+def noFMA : Predicate<"!allowFMA()">;
 
 def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
 def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
@@ -222,13 +215,13 @@
                       !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
                       [(set Float32Regs:$dst,
                         (OpNode Float32Regs:$a, Float32Regs:$b))]>,
-                      Requires<[allowFMA_ftz]>;
+                      Requires<[allowFMA, doF32FTZ]>;
    def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, f32imm:$b),
                       !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
                       [(set Float32Regs:$dst,
                         (OpNode Float32Regs:$a, fpimm:$b))]>,
-                      Requires<[allowFMA_ftz]>;
+                      Requires<[allowFMA, doF32FTZ]>;
    def f32rr : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, Float32Regs:$b),
                       !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
@@ -248,34 +241,38 @@
                       (ins Float64Regs:$a, Float64Regs:$b),
                       !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
                       [(set Float64Regs:$dst,
-                        (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+                        (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                      Requires<[noFMA]>;
    def f64ri : NVPTXInst<(outs Float64Regs:$dst),
                       (ins Float64Regs:$a, f64imm:$b),
                       !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
                       [(set Float64Regs:$dst,
-                        (OpNode Float64Regs:$a, fpimm:$b))]>;
+                        (OpNode Float64Regs:$a, fpimm:$b))]>,
+                      Requires<[noFMA]>;
    def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, Float32Regs:$b),
                       !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
                       [(set Float32Regs:$dst,
                         (OpNode Float32Regs:$a, Float32Regs:$b))]>,
-                      Requires<[doF32FTZ]>;
+                      Requires<[noFMA, doF32FTZ]>;
    def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, f32imm:$b),
                       !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
                       [(set Float32Regs:$dst,
                         (OpNode Float32Regs:$a, fpimm:$b))]>,
-                      Requires<[doF32FTZ]>;
+                      Requires<[noFMA, doF32FTZ]>;
    def f32rr : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, Float32Regs:$b),
                       !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
                       [(set Float32Regs:$dst,
-                        (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[noFMA]>;
    def f32ri : NVPTXInst<(outs Float32Regs:$dst),
                       (ins Float32Regs:$a, f32imm:$b),
                       !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
                       [(set Float32Regs:$dst,
-                        (OpNode Float32Regs:$a, fpimm:$b))]>;
+                        (OpNode Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[noFMA]>;
 }
 
 multiclass F2<string OpcStr, SDNode OpNode> {
@@ -919,8 +916,8 @@
 }
 
 defm FMA32_ftz  : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
-defm FMA32  : FPCONTRACT32<"fma.rn.f32", doNoF32FTZ>;
-defm FMA64  : FPCONTRACT64<"fma.rn.f64", doNoF32FTZ>;
+defm FMA32  : FPCONTRACT32<"fma.rn.f32", true>;
+defm FMA64  : FPCONTRACT64<"fma.rn.f64", true>;
 
 def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                       "sin.approx.f32 \t$dst, $src;",
@@ -1917,7 +1914,7 @@
 def StoreParamV4I32    : NVPTXInst<(outs), (ins Int32Regs:$val, Int32Regs:$val2,
                                                Int32Regs:$val3, Int32Regs:$val4,
                                                 i32imm:$a, i32imm:$b),
-                   "st.param.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                "st.param.v4.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
                          []>;
 
 def StoreParamV4I16    : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,

diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 0ad3dfa..14e51aa 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td

@@ -792,13 +792,18 @@
             "}}")))),
           Float32Regs, Int16Regs, int_nvvm_h2f>;
 
-def : Pat<(f32 (f16_to_f32 Int16Regs:$a)),
+def : Pat<(f32 (f16_to_fp Int16Regs:$a)),
           (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i16 (f32_to_f16 Float32Regs:$a)),
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
           (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (f32_to_f16 Float32Regs:$a)),
+def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
           (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
 
+def : Pat<(f64 (f16_to_fp Int16Regs:$a)),
+          (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (fp_to_f16 Float64Regs:$a)),
+          (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+
 //
 // Bitcast
 //
@@ -1936,9 +1941,10 @@
 // NOTE: For Fermi support, any new texture/surface/sampler intrinsics must be
 // also defined in NVPTXReplaceImageHandles.cpp
 
-
+// texmode_independent
+let IsTex = 1, IsTexModeUnified = 0 in {
 // Texture fetch instructions using handles
-def TEX_1D_F32_I32
+def TEX_1D_F32_S32
   : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
                     Float32Regs:$b, Float32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
@@ -1965,19 +1971,19 @@
               "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
               []>;
-def TEX_1D_I32_I32
+def TEX_1D_S32_S32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
               "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
               []>;
-def TEX_1D_I32_F32
+def TEX_1D_S32_F32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
               "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
               []>;
-def TEX_1D_I32_F32_LEVEL
+def TEX_1D_S32_F32_LEVEL
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
@@ -1985,7 +1991,7 @@
               "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$x\\}], $lod;",
               []>;
-def TEX_1D_I32_F32_GRAD
+def TEX_1D_S32_F32_GRAD
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
@@ -1993,8 +1999,36 @@
               "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
               []>;
+def TEX_1D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+              "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+              "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], $lod;",
+              []>;
+def TEX_1D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
 
-def TEX_1D_ARRAY_F32_I32
+def TEX_1D_ARRAY_F32_S32
   : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
                     Float32Regs:$b, Float32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
@@ -2024,21 +2058,21 @@
               "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
               []>;
-def TEX_1D_ARRAY_I32_I32
+def TEX_1D_ARRAY_S32_S32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
               "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$l, $x\\}];",
               []>;
-def TEX_1D_ARRAY_I32_F32
+def TEX_1D_ARRAY_S32_F32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
               "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$l, $x\\}];",
               []>;
-def TEX_1D_ARRAY_I32_F32_LEVEL
+def TEX_1D_ARRAY_S32_F32_LEVEL
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
@@ -2046,7 +2080,7 @@
               "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$l, $x\\}], $lod;",
               []>;
-def TEX_1D_ARRAY_I32_F32_GRAD
+def TEX_1D_ARRAY_S32_F32_GRAD
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
@@ -2054,8 +2088,38 @@
               "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
               []>;
+def TEX_1D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_1D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
 
-def TEX_2D_F32_I32
+def TEX_2D_F32_S32
   : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
                     Float32Regs:$b, Float32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
@@ -2087,21 +2151,21 @@
               "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
               "\\{$grady0, $grady1\\};",
               []>;
-def TEX_2D_I32_I32
+def TEX_2D_S32_S32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
               "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$x, $y\\}];",
               []>;
-def TEX_2D_I32_F32
+def TEX_2D_S32_F32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
               "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$x, $y\\}];",
               []>;
-def TEX_2D_I32_F32_LEVEL
+def TEX_2D_S32_F32_LEVEL
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
@@ -2109,7 +2173,7 @@
               "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$x, $y\\}], $lod;",
               []>;
-def TEX_2D_I32_F32_GRAD
+def TEX_2D_S32_F32_GRAD
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
@@ -2119,8 +2183,40 @@
               "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
               "\\{$grady0, $grady1\\};",
               []>;
+def TEX_2D_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_2D_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
 
-def TEX_2D_ARRAY_F32_I32
+def TEX_2D_ARRAY_F32_S32
   : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
                     Float32Regs:$b, Float32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
@@ -2154,7 +2250,7 @@
               "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
               "\\{$grady0, $grady1\\};",
               []>;
-def TEX_2D_ARRAY_I32_I32
+def TEX_2D_ARRAY_S32_S32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
@@ -2162,7 +2258,7 @@
               "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$l, $x, $y, $y\\}];",
               []>;
-def TEX_2D_ARRAY_I32_F32
+def TEX_2D_ARRAY_S32_F32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
@@ -2170,7 +2266,7 @@
               "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$l, $x, $y, $y\\}];",
               []>;
-def TEX_2D_ARRAY_I32_F32_LEVEL
+def TEX_2D_ARRAY_S32_F32_LEVEL
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
@@ -2178,7 +2274,7 @@
               "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
               []>;
-def TEX_2D_ARRAY_I32_F32_GRAD
+def TEX_2D_ARRAY_S32_F32_GRAD
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
@@ -2189,8 +2285,43 @@
               "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
               "\\{$grady0, $grady1\\};",
               []>;
+def TEX_2D_ARRAY_U32_S32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_U32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_U32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_2D_ARRAY_U32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
 
-def TEX_3D_F32_I32
+def TEX_3D_F32_S32
   : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
                     Float32Regs:$b, Float32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
@@ -2227,7 +2358,7 @@
               "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
               "\\{$grady0, $grady1, $grady2, $grady2\\};",
               []>;
-def TEX_3D_I32_I32
+def TEX_3D_S32_S32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
@@ -2235,7 +2366,7 @@
               "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$x, $y, $z, $z\\}];",
               []>;
-def TEX_3D_I32_F32
+def TEX_3D_S32_F32
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
@@ -2243,7 +2374,7 @@
               "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$x, $y, $z, $z\\}];",
               []>;
-def TEX_3D_I32_F32_LEVEL
+def TEX_3D_S32_F32_LEVEL
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
@@ -2251,7 +2382,7 @@
               "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
               "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
               []>;
-def TEX_3D_I32_F32_GRAD
+def TEX_3D_S32_F32_GRAD
   : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
                     Int32Regs:$b, Int32Regs:$a),
               (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
@@ -2264,9 +2395,1234 @@
               "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
               "\\{$grady0, $grady1, $grady2, $grady2\\};",
               []>;
+def TEX_3D_U32_S32  // asm: tex.3d.v4.u32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_3D_U32_F32  // asm: tex.3d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_3D_U32_F32_LEVEL  // asm: tex.level.3d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_3D_U32_F32_GRAD  // asm: tex.grad.3d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;  // $z/$gradx2/$grady2 printed twice (pad? confirm)
+
+def TEX_CUBE_F32_F32  // asm: tex.cube.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_CUBE_F32_F32_LEVEL  // asm: tex.level.cube.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_CUBE_S32_F32  // asm: tex.cube.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_CUBE_S32_F32_LEVEL  // asm: tex.level.cube.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_CUBE_U32_F32  // asm: tex.cube.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_CUBE_U32_F32_LEVEL  // asm: tex.level.cube.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+
+def TEX_CUBE_ARRAY_F32_F32  // asm: tex.acube.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+               Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}];",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_CUBE_ARRAY_F32_F32_LEVEL  // asm: tex.level.acube.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_CUBE_ARRAY_S32_F32  // asm: tex.acube.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}];",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_CUBE_ARRAY_S32_F32_LEVEL  // asm: tex.level.acube.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_CUBE_ARRAY_U32_F32  // asm: tex.acube.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}];",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_CUBE_ARRAY_U32_F32_LEVEL  // asm: tex.level.acube.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
+              []>;  // $l leads the coord vector; [] left empty
+
+def TLD4_R_2D_F32_F32  // asm: tld4.r.2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_G_2D_F32_F32  // asm: tld4.g.2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_B_2D_F32_F32  // asm: tld4.b.2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_A_2D_F32_F32  // asm: tld4.a.2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_R_2D_S32_F32  // asm: tld4.r.2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_G_2D_S32_F32  // asm: tld4.g.2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_B_2D_S32_F32  // asm: tld4.b.2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_A_2D_S32_F32  // asm: tld4.a.2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_R_2D_U32_F32  // asm: tld4.r.2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_G_2D_U32_F32  // asm: tld4.g.2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_B_2D_U32_F32  // asm: tld4.b.2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TLD4_A_2D_U32_F32  // asm: tld4.a.2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+}
 
 
-// Surface load instructions
+// texmode_unified variants: the fetches below take a single handle operand ($t).
+let IsTex = 1, IsTexModeUnified = 1 in {
+// Texture fetch instructions using handles
+def TEX_UNIFIED_1D_F32_S32  // asm: tex.1d.v4.f32.s32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x),
+              "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_F32_F32  // asm: tex.1d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x),
+              "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_F32_F32_LEVEL  // asm: tex.level.1d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod),
+              "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], $lod;",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_F32_F32_GRAD  // asm: tex.grad.1d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_S32_S32  // asm: tex.1d.v4.s32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x),
+              "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_S32_F32  // asm: tex.1d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x),
+              "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_S32_F32_LEVEL  // asm: tex.level.1d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], $lod;",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_S32_F32_GRAD  // asm: tex.grad.1d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_U32_S32  // asm: tex.1d.v4.u32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x),
+              "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_U32_F32  // asm: tex.1d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x),
+              "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_U32_F32_LEVEL  // asm: tex.level.1d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], $lod;",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_1D_U32_F32_GRAD  // asm: tex.grad.1d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;  // trailing list left empty
+
+def TEX_UNIFIED_1D_ARRAY_F32_S32  // asm: tex.a1d.v4.f32.s32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_F32_F32  // asm: tex.a1d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL  // asm: tex.level.a1d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], $lod;",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD  // asm: tex.grad.a1d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_S32_S32  // asm: tex.a1d.v4.s32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_S32_F32  // asm: tex.a1d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL  // asm: tex.level.a1d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], $lod;",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD  // asm: tex.grad.a1d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_U32_S32  // asm: tex.a1d.v4.u32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_U32_F32  // asm: tex.a1d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}];",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL  // asm: tex.level.a1d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], $lod;",
+              []>;  // $l leads the coord vector; [] left empty
+def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD  // asm: tex.grad.a1d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;  // $l leads the coord vector; [] left empty
+
+def TEX_UNIFIED_2D_F32_S32  // asm: tex.2d.v4.f32.s32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_F32_F32  // asm: tex.2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_F32_F32_LEVEL  // asm: tex.level.2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], $lod;",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_F32_F32_GRAD  // asm: tex.grad.2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_S32_S32  // asm: tex.2d.v4.s32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_S32_F32  // asm: tex.2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_S32_F32_LEVEL  // asm: tex.level.2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], $lod;",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_S32_F32_GRAD  // asm: tex.grad.2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_U32_S32  // asm: tex.2d.v4.u32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_U32_F32  // asm: tex.2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}];",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_U32_F32_LEVEL  // asm: tex.level.2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], $lod;",
+              []>;  // trailing list left empty
+def TEX_UNIFIED_2D_U32_F32_GRAD  // asm: tex.grad.2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;  // trailing list left empty
+
+def TEX_UNIFIED_2D_ARRAY_F32_S32  // asm: tex.a2d.v4.f32.s32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_F32_F32  // asm: tex.a2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL  // asm: tex.level.a2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD  // asm: tex.grad.a2d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_S32_S32  // asm: tex.a2d.v4.s32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_S32_F32  // asm: tex.a2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL  // asm: tex.level.a2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD  // asm: tex.grad.a2d.v4.s32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_U32_S32  // asm: tex.a2d.v4.u32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_U32_F32  // asm: tex.a2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}];",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL  // asm: tex.level.a2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD  // asm: tex.grad.a2d.v4.u32.f32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;  // $y printed twice (pad to 4? confirm); [] empty
+
+def TEX_UNIFIED_3D_F32_S32  // asm: tex.3d.v4.f32.s32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_3D_F32_F32  // asm: tex.3d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_3D_F32_F32_LEVEL  // asm: tex.level.3d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_3D_F32_F32_GRAD  // asm: tex.grad.3d.v4.f32.f32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;  // $z/$gradx2/$grady2 printed twice (pad? confirm)
+def TEX_UNIFIED_3D_S32_S32  // asm: tex.3d.v4.s32.s32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",
+              []>;  // $z printed twice (pad to 4? confirm); [] empty
+def TEX_UNIFIED_3D_S32_F32  // tex.3d: .s32 results, f32 x/y/z coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_3D_S32_F32_LEVEL  // tex.level.3d: .s32 results, f32 coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_3D_S32_F32_GRAD  // tex.grad.3d: .s32 results, f32 coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], "  // $z repeated as 4th coord
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "  // gradx2 repeated
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",  // grady2 repeated
+              []>;
+def TEX_UNIFIED_3D_U32_S32  // tex.3d: .u32 results, Int32Regs coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_3D_U32_F32  // tex.3d: .u32 results, f32 x/y/z coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_3D_U32_F32_LEVEL  // tex.level.3d: .u32 results, f32 coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_3D_U32_F32_GRAD  // tex.grad.3d: .u32 results, f32 coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], "  // $z repeated as 4th coord
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "  // gradx2 repeated
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",  // grady2 repeated
+              []>;
+
+def TEX_UNIFIED_CUBE_F32_F32  // tex.cube: f32 results, f32 x/y/z coords
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_CUBE_F32_F32_LEVEL  // tex.level.cube: f32 results, f32 coords
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_CUBE_S32_F32  // tex.cube: .s32 results, f32 x/y/z coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_CUBE_S32_F32_LEVEL  // tex.level.cube: .s32 results, f32 coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_CUBE_U32_F32  // tex.cube: .u32 results, f32 x/y/z coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}];",  // $z repeated as 4th coord
+              []>;
+def TEX_UNIFIED_CUBE_U32_F32_LEVEL  // tex.level.cube: .u32 results, f32 coords
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$x, $y, $z, $z\\}], $lod;",  // $z repeated as 4th coord
+              []>;
+
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32  // tex.acube: f32 results, f32 x/y/z coords
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}];",  // Int32Regs $l printed before x/y/z
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL  // tex.level.acube: f32 results
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}], $lod;",  // Int32Regs $l before x/y/z
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32  // tex.acube: .s32 results, f32 x/y/z
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}];",  // Int32Regs $l printed before x/y/z
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL  // tex.level.acube: .s32 results
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}], $lod;",  // Int32Regs $l before x/y/z
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32  // tex.acube: .u32 results, f32 x/y/z
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
+              "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}];",  // Int32Regs $l printed before x/y/z
+              []>;
+def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL  // tex.level.acube: .u32 results
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int32Regs:$l,
+                   Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
+                   Float32Regs:$lod),
+              "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, \\{$l, $x, $y, $z\\}], $lod;",  // Int32Regs $l before x/y/z
+              []>;
+
+def TLD4_UNIFIED_R_2D_F32_F32  // tld4.r.2d: four f32 results, f32 x/y coords
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_G_2D_F32_F32  // tld4.g.2d: four f32 results, f32 x/y coords
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_B_2D_F32_F32  // tld4.b.2d: four f32 results, f32 x/y coords
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_A_2D_F32_F32  // tld4.a.2d: four f32 results, f32 x/y coords
+  : NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
+                    Float32Regs:$v2, Float32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_R_2D_S32_F32  // tld4.r.2d: .s32 results (Int32Regs), f32 x/y
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_G_2D_S32_F32  // tld4.g.2d: .s32 results (Int32Regs), f32 x/y
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_B_2D_S32_F32  // tld4.b.2d: .s32 results (Int32Regs), f32 x/y
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_A_2D_S32_F32  // tld4.a.2d: .s32 results (Int32Regs), f32 x/y
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_R_2D_U32_F32  // tld4.r.2d: .u32 results (Int32Regs), f32 x/y
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_G_2D_U32_F32  // tld4.g.2d: .u32 results (Int32Regs), f32 x/y
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_B_2D_U32_F32  // tld4.b.2d: .u32 results (Int32Regs), f32 x/y
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+def TLD4_UNIFIED_A_2D_U32_F32  // tld4.a.2d: .u32 results (Int32Regs), f32 x/y
+  : NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
+                    Int32Regs:$v2, Int32Regs:$v3),
+              (ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
+              "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+              "[$t, \\{$x, $y\\}];",  // two-element coordinate vector
+              []>;
+}
+
+
+
+//=== Surface load instructions
+// .clamp variant
+let IsSuld = 1 in {  // IsSuld = 1 for all defs in this block; one result each
+def SULD_1D_I8_CLAMP  // 1d .b8 -> Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b8.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I16_CLAMP  // 1d .b16 -> Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b16.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I32_CLAMP  // 1d .b32 -> Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b32.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I64_CLAMP  // 1d .b64 -> Int64Regs
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b64.clamp \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_I8_CLAMP  // a1d .b8 -> Int16Regs; $l printed before $x
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b8.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I16_CLAMP  // a1d .b16 -> Int16Regs; $l printed before $x
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b16.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I32_CLAMP  // a1d .b32 -> Int32Regs; $l printed before $x
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b32.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I64_CLAMP  // a1d .b64 -> Int64Regs; $l printed before $x
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b64.clamp \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_CLAMP  // 2d .b8 -> Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_CLAMP  // 2d .b16 -> Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_CLAMP  // 2d .b32 -> Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I64_CLAMP  // 2d .b64 -> Int64Regs
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b64.clamp \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_CLAMP  // a2d .b8 -> Int16Regs; $y printed twice
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_CLAMP  // a2d .b16 -> Int16Regs; $y printed twice
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_CLAMP  // a2d .b32 -> Int32Regs; $y printed twice
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I64_CLAMP  // a2d .b64 -> Int64Regs; $y printed twice
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b64.clamp \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_CLAMP  // 3d .b8 -> Int16Regs; $z printed twice
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_CLAMP  // 3d .b16 -> Int16Regs; $z printed twice
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_CLAMP  // 3d .b32 -> Int32Regs; $z printed twice
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I64_CLAMP  // 3d .b64 -> Int64Regs; $z printed twice
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b64.clamp \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}  // let IsSuld = 1
+
+let IsSuld = 2 in {  // IsSuld = 2 for all defs in this block; two results each
+def SULD_1D_V2I8_CLAMP  // 1d .v2.b8 -> 2x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_CLAMP  // 1d .v2.b16 -> 2x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_CLAMP  // 1d .v2.b32 -> 2x Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I64_CLAMP  // 1d .v2.b64 -> 2x Int64Regs
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V2I8_CLAMP  // a1d .v2.b8 -> 2x Int16Regs; $l before $x
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I16_CLAMP  // a1d .v2.b16 -> 2x Int16Regs; $l before $x
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I32_CLAMP  // a1d .v2.b32 -> 2x Int32Regs; $l before $x
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I64_CLAMP  // a1d .v2.b64 -> 2x Int64Regs; $l before $x
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V2I8_CLAMP  // 2d .v2.b8 -> 2x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_CLAMP  // 2d .v2.b16 -> 2x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_CLAMP  // 2d .v2.b32 -> 2x Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I64_CLAMP  // 2d .v2.b64 -> 2x Int64Regs
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V2I8_CLAMP  // a2d .v2.b8 -> 2x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",  // $y repeated as 4th coord
+              []>;
+def SULD_2D_ARRAY_V2I16_CLAMP  // a2d .v2.b16 -> 2x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",  // $y repeated as 4th coord
+              []>;
+def SULD_2D_ARRAY_V2I32_CLAMP  // a2d .v2.b32 -> 2x Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",  // $y repeated as 4th coord
+              []>;
+def SULD_2D_ARRAY_V2I64_CLAMP  // a2d .v2.b64 -> 2x Int64Regs
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b64.clamp \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",  // $y repeated as 4th coord
+              []>;
+
+def SULD_3D_V2I8_CLAMP  // 3d .v2.b8 -> 2x Int16Regs; $z printed twice
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_CLAMP  // 3d .v2.b16 -> 2x Int16Regs; $z printed twice
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_CLAMP  // 3d .v2.b32 -> 2x Int32Regs; $z printed twice
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I64_CLAMP  // 3d .v2.b64 -> 2x Int64Regs; $z printed twice
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b64.clamp \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}  // let IsSuld = 2
+
+let IsSuld = 3 in {  // IsSuld = 3 (not 4) for all defs here; four results each
+def SULD_1D_V4I8_CLAMP  // 1d .v4.b8 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_CLAMP  // 1d .v4.b16 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_CLAMP  // 1d .v4.b32 -> 4x Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V4I8_CLAMP  // a1d .v4.b8 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",  // $l printed before $x
+              []>;
+def SULD_1D_ARRAY_V4I16_CLAMP  // a1d .v4.b16 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",  // $l printed before $x
+              []>;
+def SULD_1D_ARRAY_V4I32_CLAMP  // a1d .v4.b32 -> 4x Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",  // $l printed before $x
+              []>;
+
+def SULD_2D_V4I8_CLAMP  // 2d .v4.b8 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I16_CLAMP  // 2d .v4.b16 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I32_CLAMP  // 2d .v4.b32 -> 4x Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V4I8_CLAMP  // a2d .v4.b8 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",  // $y repeated as 4th coord
+              []>;
+def SULD_2D_ARRAY_V4I16_CLAMP  // a2d .v4.b16 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",  // $y repeated as 4th coord
+              []>;
+def SULD_2D_ARRAY_V4I32_CLAMP  // a2d .v4.b32 -> 4x Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",  // $y repeated as 4th coord
+              []>;
+
+
+def SULD_3D_V4I8_CLAMP  // 3d .v4.b8 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b8.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",  // $z repeated as 4th coord
+              []>;
+def SULD_3D_V4I16_CLAMP  // 3d .v4.b16 -> 4x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b16.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",  // $z repeated as 4th coord
+              []>;
+def SULD_3D_V4I32_CLAMP  // 3d .v4.b32 -> 4x Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b32.clamp \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",  // $z repeated as 4th coord
+              []>;
+}  // let IsSuld = 3
+
+
+// .trap variant
+let IsSuld = 1 in {
 def SULD_1D_I8_TRAP
   : NVPTXInst<(outs Int16Regs:$r),
               (ins Int64Regs:$s, Int32Regs:$x),
@@ -2282,35 +3638,10 @@
               (ins Int64Regs:$s, Int32Regs:$x),
               "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];",
               []>;
-def SULD_1D_V2I8_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+def SULD_1D_I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r),
               (ins Int64Regs:$s, Int32Regs:$x),
-              "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
-              []>;
-def SULD_1D_V2I16_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$x),
-              "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
-              []>;
-def SULD_1D_V2I32_TRAP
-  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$x),
-              "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
-              []>;
-def SULD_1D_V4I8_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-              (ins Int64Regs:$s, Int32Regs:$x),
-              "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
-              []>;
-def SULD_1D_V4I16_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-              (ins Int64Regs:$s, Int32Regs:$x),
-              "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
-              []>;
-def SULD_1D_V4I32_TRAP
-  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-              (ins Int64Regs:$s, Int32Regs:$x),
-              "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              "suld.b.1d.b64.trap \\{$r\\}, [$s, \\{$x\\}];",
               []>;
 
 def SULD_1D_ARRAY_I8_TRAP
@@ -2328,6 +3659,98 @@
               (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
               "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
               []>;
+def SULD_1D_ARRAY_I64_TRAP  // a1d .b64 .trap -> Int64Regs; $l before $x
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b64.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_TRAP  // 2d .b8 .trap -> Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_TRAP  // 2d .b16 .trap -> Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_TRAP  // 2d .b32 .trap -> Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I64_TRAP  // 2d .b64 .trap -> Int64Regs
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b64.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_TRAP  // a2d .b8 .trap -> Int16Regs; $y printed twice
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_TRAP  // a2d .b16 .trap -> Int16Regs; $y printed twice
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_TRAP  // a2d .b32 .trap -> Int32Regs; $y printed twice
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I64_TRAP  // a2d .b64 .trap -> Int64Regs; $y printed twice
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b64.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_TRAP  // 3d .b8 .trap -> Int16Regs; $z printed twice
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_TRAP  // 3d .b16 .trap -> Int16Regs; $z printed twice
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_TRAP  // 3d .b32 .trap -> Int32Regs; $z printed twice
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I64_TRAP  // 3d .b64 .trap -> Int64Regs; $z printed twice
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b64.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_TRAP  // 1d .v2.b8 .trap -> 2x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_TRAP  // 1d .v2.b16 .trap -> 2x Int16Regs
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_TRAP  // 1d .v2.b32 .trap -> 2x Int32Regs
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I64_TRAP  // 1d .v2.b64 .trap -> 2x Int64Regs
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+
 def SULD_1D_ARRAY_V2I8_TRAP
   : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
               (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
@@ -2343,6 +3766,97 @@
               (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
               "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
               []>;
+def SULD_1D_ARRAY_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b64.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I64_TRAP
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b64.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
 def SULD_1D_ARRAY_V4I8_TRAP
   : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
               (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
@@ -2362,36 +3876,6 @@
               "[$s, \\{$l, $x\\}];",
               []>;
 
-def SULD_2D_I8_TRAP
-  : NVPTXInst<(outs Int16Regs:$r),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
-              []>;
-def SULD_2D_I16_TRAP
-  : NVPTXInst<(outs Int16Regs:$r),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
-              []>;
-def SULD_2D_I32_TRAP
-  : NVPTXInst<(outs Int32Regs:$r),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
-              []>;
-def SULD_2D_V2I8_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
-              []>;
-def SULD_2D_V2I16_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
-              []>;
-def SULD_2D_V2I32_TRAP
-  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
-              []>;
 def SULD_2D_V4I8_TRAP
   : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
               (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
@@ -2408,39 +3892,6 @@
               "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
               []>;
 
-def SULD_2D_ARRAY_I8_TRAP
-  : NVPTXInst<(outs Int16Regs:$r),
-              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
-              []>;
-def SULD_2D_ARRAY_I16_TRAP
-  : NVPTXInst<(outs Int16Regs:$r),
-              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
-              []>;
-def SULD_2D_ARRAY_I32_TRAP
-  : NVPTXInst<(outs Int32Regs:$r),
-              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
-              []>;
-def SULD_2D_ARRAY_V2I8_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, "
-              "[$s, \\{$l, $x, $y, $y\\}];",
-              []>;
-def SULD_2D_ARRAY_V2I16_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, "
-              "[$s, \\{$l, $x, $y, $y\\}];",
-              []>;
-def SULD_2D_ARRAY_V2I32_TRAP
-  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
-              "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, "
-              "[$s, \\{$l, $x, $y, $y\\}];",
-              []>;
 def SULD_2D_ARRAY_V4I8_TRAP
   : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
               (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
@@ -2460,36 +3911,7 @@
               "[$s, \\{$l, $x, $y, $y\\}];",
               []>;
 
-def SULD_3D_I8_TRAP
-  : NVPTXInst<(outs Int16Regs:$r),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
-              "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
-              []>;
-def SULD_3D_I16_TRAP
-  : NVPTXInst<(outs Int16Regs:$r),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
-              "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
-              []>;
-def SULD_3D_I32_TRAP
-  : NVPTXInst<(outs Int32Regs:$r),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
-              "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
-              []>;
-def SULD_3D_V2I8_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
-              "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
-              []>;
-def SULD_3D_V2I16_TRAP
-  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
-              "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
-              []>;
-def SULD_3D_V2I32_TRAP
-  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
-              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
-              "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
-              []>;
+
 def SULD_3D_V4I8_TRAP
   : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
               (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
@@ -2508,11 +3930,324 @@
               "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
               "[$s, \\{$x, $y, $z, $z\\}];",
               []>;
+}
 
+// .zero variant
+let IsSuld = 1 in {
+def SULD_1D_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b8.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b16.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b32.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b64.zero \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b8.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b16.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b32.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b64.zero \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b64.zero \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b64.zero \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b64.zero \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 2 in {
+def SULD_1D_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b64.zero \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_V2I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I64_ZERO
+  : NVPTXInst<(outs Int64Regs:$r, Int64Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b64.zero \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
+
+let IsSuld = 3 in {
+def SULD_1D_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b8.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b16.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b32.zero \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+
+def SULD_3D_V4I8_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b8.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I16_ZERO
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b16.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I32_ZERO
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b32.zero \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+}
 
 //-----------------------------------
 // Texture Query Intrinsics
 //-----------------------------------
+
+let IsSurfTexQuery = 1 in {
 def TXQ_CHANNEL_ORDER
   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
               "txq.channel_order.b32 \t$d, [$a];",
@@ -2545,6 +4280,7 @@
   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
               "txq.num_mipmap_levels.b32 \t$d, [$a];",
               []>;
+}
 
 def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a),
           (TXQ_CHANNEL_ORDER Int64Regs:$a)>;
@@ -2567,6 +4303,8 @@
 //-----------------------------------
 // Surface Query Intrinsics
 //-----------------------------------
+
+let IsSurfTexQuery = 1 in {
 def SUQ_CHANNEL_ORDER
   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
               "suq.channel_order.b32 \t$d, [$a];",
@@ -2591,6 +4329,7 @@
   : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
               "suq.array_size.b32 \t$d, [$a];",
               []>;
+}
 
 def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a),
           (SUQ_CHANNEL_ORDER Int64Regs:$a)>;
@@ -2624,8 +4363,354 @@
 
 //===- Surface Stores -----------------------------------------------------===//
 
+let IsSust = 1 in {
 // Unformatted
+// .clamp variant
+def SUST_B_1D_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b8.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b16.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.1d.b32.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.1d.b64.clamp \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.b.1d.v2.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+              "sust.b.1d.v2.b64.clamp \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b8.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b16.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.b.1d.v4.b32.clamp \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
 
+
+def SUST_B_1D_ARRAY_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.a1d.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.a1d.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b8.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b16.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.a1d.v2.b32.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.a1d.v2.b64.clamp \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.a1d.v4.b8.clamp \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.a1d.v4.b16.clamp \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.a1d.v4.b32.clamp \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B16_CLAMP  // (outs) is empty: the store defines no result
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.b.2d.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+              "sust.b.2d.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_V2B8_CLAMP  // b8 components are carried in Int16Regs
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b8.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b16.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.2d.v2.b32.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.2d.v2.b64.clamp \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V4B8_CLAMP  // v4 forms store four components: $r, $g, $b, $a
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.2d.v4.b8.clamp \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.2d.v4.b16.clamp \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.2d.v4.b32.clamp \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+// 2D array, .clamp. NOTE(review): $y fills the 4th coordinate slot - confirm.
+def SUST_B_2D_ARRAY_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.b.a2d.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r),
+              "sust.b.a2d.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.a2d.v2.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.b.a2d.v2.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.b.a2d.v2.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r, Int64Regs:$g),
+             "sust.b.a2d.v2.b64.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.b.a2d.v4.b8.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.b.a2d.v4.b16.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.b.a2d.v4.b32.clamp \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+// 3D, .clamp. NOTE(review): $z fills the 4th coordinate slot - confirm.
+def SUST_B_3D_B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.b.3d.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r),
+              "sust.b.3d.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_V2B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.b.3d.v2.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B64_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r, Int64Regs:$g),
+              "sust.b.3d.v2.b64.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V4B8_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.b.3d.v4.b8.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B16_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.b.3d.v4.b16.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B32_CLAMP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.b.3d.v4.b32.clamp \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+// .trap variant
 def SUST_B_1D_B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
@@ -2641,6 +4726,11 @@
               (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
               "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
               []>;
+def SUST_B_1D_B64_TRAP  // 1D, one 64-bit component ($r), .trap
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.1d.b64.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
 def SUST_B_1D_V2B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
@@ -2656,6 +4746,11 @@
               (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
               "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
               []>;
+def SUST_B_1D_V2B64_TRAP  // 1D, two 64-bit components ($r, $g), .trap
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+              "sust.b.1d.v2.b64.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
 def SUST_B_1D_V4B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
@@ -2691,6 +4786,11 @@
               (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
               "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
               []>;
+def SUST_B_1D_ARRAY_B64_TRAP  // 1D array ($idx, $x), one 64-bit component
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.a1d.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
 def SUST_B_1D_ARRAY_V2B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
@@ -2709,6 +4809,12 @@
                    Int32Regs:$g),
               "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
               []>;
+def SUST_B_1D_ARRAY_V2B64_TRAP  // 1D array ($idx, $x), two 64-bit components
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.a1d.v2.b64.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
 def SUST_B_1D_ARRAY_V4B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
@@ -2747,6 +4853,11 @@
               (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
               "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
               []>;
+def SUST_B_2D_B64_TRAP  // 2D ($x, $y), one 64-bit component, .trap
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+              "sust.b.2d.b64.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
 def SUST_B_2D_V2B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
@@ -2765,6 +4876,12 @@
                    Int32Regs:$g),
               "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
               []>;
+def SUST_B_2D_V2B64_TRAP  // 2D ($x, $y), two 64-bit components, .trap
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.2d.v2.b64.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
 def SUST_B_2D_V4B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
@@ -2806,6 +4923,12 @@
                    Int32Regs:$r),
               "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
               []>;
+def SUST_B_2D_ARRAY_B64_TRAP  // asm repeats $y in the 4th coordinate slot
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r),
+              "sust.b.a2d.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
 def SUST_B_2D_ARRAY_V2B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
@@ -2827,6 +4950,13 @@
              "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
              "\\{$r, $g\\};",
               []>;
+def SUST_B_2D_ARRAY_V2B64_TRAP  // asm repeats $y in the 4th coordinate slot
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r, Int64Regs:$g),
+             "sust.b.a2d.v2.b64.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
 def SUST_B_2D_ARRAY_V4B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
@@ -2868,6 +4998,12 @@
                    Int32Regs:$r),
               "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
               []>;
+def SUST_B_3D_B64_TRAP  // asm repeats $z in the 4th coordinate slot
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r),
+              "sust.b.3d.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
 def SUST_B_3D_V2B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
@@ -2889,6 +5025,13 @@
               "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
               "\\{$r, $g\\};",
               []>;
+def SUST_B_3D_V2B64_TRAP  // asm repeats $z in the 4th coordinate slot
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r, Int64Regs:$g),
+              "sust.b.3d.v2.b64.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
 def SUST_B_3D_V4B8_TRAP
   : NVPTXInst<(outs),
               (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
@@ -2911,6 +5054,353 @@
         "\\{$r, $g, $b, $a\\};",
               []>;
 
+
+// .zero variant: same operand lists as the .trap defs, "zero" in the asm.
+def SUST_B_1D_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b8.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b16.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.1d.b32.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.1d.b64.zero \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b8.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b16.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.b.1d.v2.b32.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+              "sust.b.1d.v2.b64.zero \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b8.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b16.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.b.1d.v4.b32.zero \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+// 1D array, .zero: the coordinate vector is {$idx, $x}.
+def SUST_B_1D_ARRAY_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.a1d.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r),
+              "sust.b.a1d.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b8.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b16.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.a1d.v2.b32.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.a1d.v2.b64.zero \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.a1d.v4.b8.zero \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.a1d.v4.b16.zero \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.a1d.v4.b32.zero \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+// 2D, .zero: the coordinate vector is {$x, $y}.
+def SUST_B_2D_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b8.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b16.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.b.2d.b32.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+              "sust.b.2d.b64.zero \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b8.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b16.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.2d.v2.b32.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+                   Int64Regs:$g),
+              "sust.b.2d.v2.b64.zero \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.2d.v4.b8.zero \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.2d.v4.b16.zero \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.2d.v4.b32.zero \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+// 2D array, .zero. NOTE(review): $y fills the 4th coordinate slot - confirm.
+def SUST_B_2D_ARRAY_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.b.a2d.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r),
+              "sust.b.a2d.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.a2d.v2.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.b.a2d.v2.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.b.a2d.v2.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int64Regs:$r, Int64Regs:$g),
+             "sust.b.a2d.v2.b64.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.b.a2d.v4.b8.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.b.a2d.v4.b16.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.b.a2d.v4.b32.zero \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+// 3D, .zero. NOTE(review): $z fills the 4th coordinate slot - confirm.
+def SUST_B_3D_B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.b.3d.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r),
+              "sust.b.3d.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_V2B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.b.3d.v2.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B64_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int64Regs:$r, Int64Regs:$g),
+              "sust.b.3d.v2.b64.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V4B8_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.b.3d.v4.b8.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B16_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.b.3d.v4.b16.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B32_ZERO
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.b.3d.v4.b32.zero \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+
 // Formatted
 
 def SUST_P_1D_B8_TRAP
@@ -3197,12 +5687,341 @@
         "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
         "\\{$r, $g, $b, $a\\};",
               []>;
-
+}
 
 // Surface store instruction patterns
 // I'm not sure why we can't just include these in the instruction definitions,
 // but TableGen complains of type errors :(
 
+// .clamp variant
+def : Pat<(int_nvvm_sust_b_1d_i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_ARRAY_B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_ARRAY_B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B8_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B16_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_2D_V2B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_2D_V2B64_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B8_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B16_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_V4B32_CLAMP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B8_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B16_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_ARRAY_B32_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_ARRAY_B64_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B8_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B16_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_B_2D_ARRAY_V2B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+           Int64Regs:$g),
+          (SUST_B_2D_ARRAY_V2B64_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B8_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B16_CLAMP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_ARRAY_V4B32_CLAMP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B8_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B16_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_B_3D_B32_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r),
+          (SUST_B_3D_B64_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B8_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B16_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_3D_V2B32_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_3D_V2B64_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B8_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B16_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_3D_V4B32_CLAMP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+// .trap variant
 def : Pat<(int_nvvm_sust_b_1d_i8_trap
            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
           (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
@@ -3215,6 +6034,10 @@
            Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
           (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
 
+def : Pat<(int_nvvm_sust_b_1d_i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
 def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
            Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
           (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
@@ -3230,6 +6053,11 @@
           (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
            Int32Regs:$r, Int32Regs:$g)>;
 
+def : Pat<(int_nvvm_sust_b_1d_v2i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
 def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
            Int64Regs:$s, Int32Regs:$x,
            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
@@ -3265,6 +6093,11 @@
           (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
            Int32Regs:$r)>;
 
+def : Pat<(int_nvvm_sust_b_1d_array_i64_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_ARRAY_B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r)>;
+
 def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
           (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
@@ -3280,6 +6113,11 @@
           (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
            Int32Regs:$r, Int32Regs:$g)>;
 
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
 def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
@@ -3315,6 +6153,11 @@
           (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
            Int32Regs:$r)>;
 
+def : Pat<(int_nvvm_sust_b_2d_i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
 def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
           (SUST_B_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
@@ -3330,6 +6173,11 @@
           (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
            Int32Regs:$r, Int32Regs:$g)>;
 
+def : Pat<(int_nvvm_sust_b_2d_v2i64_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_2D_V2B64_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r, Int64Regs:$g)>;
+
 def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
@@ -3368,6 +6216,12 @@
            Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
            Int32Regs:$r)>;
 
+def : Pat<(int_nvvm_sust_b_2d_array_i64_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_ARRAY_B64_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
 def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
            Int16Regs:$r, Int16Regs:$g),
@@ -3388,6 +6242,12 @@
           (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
            Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
 
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+           Int64Regs:$g),
+          (SUST_B_2D_ARRAY_V2B64_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
 def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
            Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
@@ -3432,6 +6292,13 @@
            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
            Int32Regs:$r)>;
 
+def : Pat<(int_nvvm_sust_b_3d_i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r),
+          (SUST_B_3D_B64_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r)>;
+
 def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
            Int16Regs:$r, Int16Regs:$g),
@@ -3453,6 +6320,13 @@
            Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
            Int32Regs:$r, Int32Regs:$g)>;
 
+def : Pat<(int_nvvm_sust_b_3d_v2i64_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_3D_V2B64_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g)>;
+
 def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
            Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
            Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
@@ -3475,6 +6349,334 @@
            Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
 
 
+// .zero variant
+def : Pat<(int_nvvm_sust_b_1d_i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_zero
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_zero
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_zero
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_ARRAY_B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i64_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
+          (SUST_B_1D_ARRAY_B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_1D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B8_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B16_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_2D_V2B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i64_zero
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_2D_V2B64_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B8_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B16_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_V4B32_ZERO Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B8_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B16_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_ARRAY_B32_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i64_zero
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
+          (SUST_B_2D_ARRAY_B64_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B8_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B16_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_B_2D_ARRAY_V2B32_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
+           Int64Regs:$g),
+          (SUST_B_2D_ARRAY_V2B64_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B8_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B16_ZERO Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_ARRAY_V4B32_ZERO Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B8_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B16_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_B_3D_B32_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r),
+          (SUST_B_3D_B64_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B8_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B16_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_3D_V2B32_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i64_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g),
+          (SUST_B_3D_V2B64_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int64Regs:$r, Int64Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B8_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B16_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_3D_V4B32_ZERO Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
 
 
 def : Pat<(int_nvvm_sust_p_1d_i8_trap

diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
index 5ec1fc9..8759406 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTX_LOWER_AGGR_COPIES_H
-#define NVPTX_LOWER_AGGR_COPIES_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXLOWERAGGRCOPIES_H
 
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/IR/DataLayout.h"

diff --git a/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp b/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp
new file mode 100644
index 0000000..3149399
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXLowerStructArgs.cpp

@@ -0,0 +1,134 @@
+//===-- NVPTXLowerStructArgs.cpp - Copy struct args to local memory =====--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Copy struct args to local memory. This is needed for kernel functions only.
+// This is a preparation for handling cases like
+//
+// kernel void foo(struct A arg, ...)
+// {
+//     struct A *p = &arg;
+//     ...
+//     ... = p->field1 ...  (there is no generic address for .param)
+//     p->field2 = ...      (there is no write access to .param)
+// }
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXLowerStructArgsPass(PassRegistry &);
+}
+
+class LLVM_LIBRARY_VISIBILITY NVPTXLowerStructArgs : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXLowerStructArgs() : FunctionPass(ID) {}
+  const char *getPassName() const override {
+    return "Copy structure (byval *) arguments to stack";
+  }
+
+private:
+  bool runOnFunction(Function &F) override; // Main function for this pass.
+  void handleStructPtrArgs(Function &);     // Visits each byval pointer arg.
+  void handleParam(Argument *);             // Copies one argument to an alloca.
+};
+
+char NVPTXLowerStructArgs::ID = 1;
+
+INITIALIZE_PASS(NVPTXLowerStructArgs, "nvptx-lower-struct-args",
+                "Lower structure arguments (NVPTX)", false, false)
+
+void NVPTXLowerStructArgs::handleParam(Argument *Arg) {
+  Function *Func = Arg->getParent();
+  Module *Mod = Func->getParent();
+  LLVMContext &Ctx = Mod->getContext();
+  // New instructions are all inserted before the entry block's first one.
+  Instruction *InsertPt = &(Func->getEntryBlock().front());
+  PointerType *ArgPtrTy = dyn_cast<PointerType>(Arg->getType());
+  assert(ArgPtrTy && "Expecting pointer type in handleParam");
+  Type *StructTy = ArgPtrTy->getElementType();
+
+  // Local copy of the struct. It takes the byval parameter's alignment because
+  // the later loads/stores assume that alignment and are about to be
+  // redirected from the parameter to this alloca.
+  AllocaInst *LocalCopy = new AllocaInst(StructTy, Arg->getName(), InsertPt);
+  LocalCopy->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
+
+  // Redirect the existing uses first, so that the uses of Arg created below
+  // keep referring to the incoming parameter itself.
+  Arg->replaceAllUsesWith(LocalCopy);
+
+  // Declaration of Intrinsic::nvvm_ptr_gen_to_param (generic -> param space).
+  Type *GenericI8PtrTy = Type::getInt8PtrTy(Ctx, ADDRESS_SPACE_GENERIC);
+  Type *ParamI8PtrTy = Type::getInt8PtrTy(Ctx, ADDRESS_SPACE_PARAM);
+  Type *CvtTypes[] = {ParamI8PtrTy, GenericI8PtrTy};
+  Function *CvtFunc = Intrinsic::getDeclaration(
+      Mod, Intrinsic::nvvm_ptr_gen_to_param, CvtTypes);
+
+  // Arg as generic i8* -> param-space i8* -> param-space struct pointer.
+  Value *CvtArgs[] = {
+      new BitCastInst(Arg, GenericI8PtrTy, Arg->getName(), InsertPt)};
+  CallInst *ParamI8Ptr =
+      CallInst::Create(CvtFunc, CvtArgs, "cvt_to_param", InsertPt);
+  BitCastInst *ParamStructPtr = new BitCastInst(
+      ParamI8Ptr, PointerType::get(StructTy, ADDRESS_SPACE_PARAM),
+      Arg->getName(), InsertPt);
+  // Load the struct through the param-space pointer and store it to the alloca.
+  LoadInst *Loaded = new LoadInst(ParamStructPtr, Arg->getName(), InsertPt);
+  new StoreInst(Loaded, LocalCopy, InsertPt);
+}
+
+// =============================================================================
+// If the function had a struct ptr arg, say foo(%struct.x *byval %d), then
+// add the following instructions to the first basic block :
+//
+// %temp = alloca %struct.x, align 8
+// %tt1 = bitcast %struct.x * %d to i8 *
+// %tt2 = llvm.nvvm.cvt.gen.to.param %tt1
+// %tempd = bitcast i8 addrspace(101) * %tt2 to %struct.x addrspace(101) *
+// %tv = load %struct.x addrspace(101) * %tempd
+// store %struct.x %tv, %struct.x * %temp, align 8
+//
+// The above code allocates some space in the stack and copies the incoming
+// struct from param space to local space.
+// Then replace all occurrences of %d by %temp.
+// =============================================================================
+void NVPTXLowerStructArgs::handleStructPtrArgs(Function &F) {
+  for (Argument &Param : F.args()) {
+    if (!Param.getType()->isPointerTy() || !Param.hasByValAttr())
+      continue; // Not a byval pointer argument: nothing to copy.
+    handleParam(&Param);
+  }
+}
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXLowerStructArgs::runOnFunction(Function &F) {
+  // Only kernels are processed (see the comments at the top of this file);
+  // any other function is left untouched and reported as unchanged.
+  const bool IsKernel = isKernelFunction(F);
+  if (IsKernel)
+    handleStructPtrArgs(F);
+  // A kernel is always reported as modified, even with no byval arguments.
+  return IsKernel;
+}
+
+FunctionPass *llvm::createNVPTXLowerStructArgsPass() {
+  return new NVPTXLowerStructArgs; // Heap-allocates a fresh pass instance.
+}

diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 5547649..d39a394 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h

@@ -9,8 +9,8 @@
 
 // Modeled after ARMMCExpr
 
-#ifndef NVPTXMCEXPR_H
-#define NVPTXMCEXPR_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXMCEXPR_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXMCEXPR_H
 
 #include "llvm/ADT/APFloat.h"
 #include "llvm/MC/MCExpr.h"
@@ -63,7 +63,8 @@
 
   void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const override {
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override {
     return false;
   }
   void visitUsedExpr(MCStreamer &Streamer) const override {};

diff --git a/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
index 67fb390..10f1135 100644
--- a/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
+++ b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h

@@ -12,6 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
+
 #include "llvm/CodeGen/MachineFunction.h"
 
 namespace llvm {
@@ -44,3 +47,5 @@
   }
 };
 }
+
+#endif

diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 348ab0c..a1e1b9e 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp

@@ -22,6 +22,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -48,8 +49,8 @@
 
 bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
   const TargetMachine &TM = MF.getTarget();
-  const TargetFrameLowering &TFI = *TM.getFrameLowering();
-  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
+  const TargetRegisterInfo &TRI = *TM.getSubtargetImpl()->getRegisterInfo();
   bool Modified = false;
 
   calculateFrameObjectOffsets(MF);
@@ -108,8 +109,8 @@
 
 void
 NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
-  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
-  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
 
   bool StackGrowsDown =
     TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;

diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 62f288b..358ccce 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp

@@ -53,9 +53,9 @@
     return "%f";
   }
   if (RC == &NVPTX::Float64RegsRegClass) {
-    return "%fl";
+    return "%fd";
   } else if (RC == &NVPTX::Int64RegsRegClass) {
-    return "%rl";
+    return "%rd";
   } else if (RC == &NVPTX::Int32RegsRegClass) {
     return "%r";
   } else if (RC == &NVPTX::Int16RegsRegClass) {

diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h
index a7594be..d2e6733 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTXREGISTERINFO_H
-#define NVPTXREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXREGISTERINFO_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXREGISTERINFO_H
 
 #include "ManagedStringPool.h"
 #include "llvm/Target/TargetRegisterInfo.h"

diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 3482248..efcee6b 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td

@@ -35,9 +35,9 @@
   def P#i  : NVPTXReg<"%p"#i>;  // Predicate
   def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
   def R#i  : NVPTXReg<"%r"#i>;  // 32-bit
-  def RL#i : NVPTXReg<"%rl"#i>; // 64-bit
+  def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
   def F#i  : NVPTXReg<"%f"#i>;  // 32-bit float
-  def FL#i : NVPTXReg<"%fl"#i>; // 64-bit float
+  def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
 
   // Arguments
   def ia#i : NVPTXReg<"%ia"#i>;

diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index afd53a6..324420d 100644
--- a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp

@@ -15,6 +15,7 @@
 
 #include "NVPTX.h"
 #include "NVPTXMachineFunctionInfo.h"
+#include "NVPTXSubtarget.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -33,9 +34,15 @@
   NVPTXReplaceImageHandles();
 
   bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "NVPTX Replace Image Handles";
+  }
 private:
   bool processInstr(MachineInstr &MI);
   void replaceImageHandle(MachineOperand &Op, MachineFunction &MF);
+  bool findIndexForHandle(MachineOperand &Op, MachineFunction &MF,
+                          unsigned &Idx);
 };
 }
 
@@ -65,242 +72,43 @@
        E = InstrsToRemove.end(); I != E; ++I) {
     (*I)->eraseFromParent();
   }
-
   return Changed;
 }
 
 bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
   MachineFunction &MF = *MI.getParent()->getParent();
-  // Check if we have a surface/texture instruction
-  switch (MI.getOpcode()) {
-  default: return false;
-  case NVPTX::TEX_1D_F32_I32:
-  case NVPTX::TEX_1D_F32_F32:
-  case NVPTX::TEX_1D_F32_F32_LEVEL:
-  case NVPTX::TEX_1D_F32_F32_GRAD:
-  case NVPTX::TEX_1D_I32_I32:
-  case NVPTX::TEX_1D_I32_F32:
-  case NVPTX::TEX_1D_I32_F32_LEVEL:
-  case NVPTX::TEX_1D_I32_F32_GRAD:
-  case NVPTX::TEX_1D_ARRAY_F32_I32:
-  case NVPTX::TEX_1D_ARRAY_F32_F32:
-  case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL:
-  case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD:
-  case NVPTX::TEX_1D_ARRAY_I32_I32:
-  case NVPTX::TEX_1D_ARRAY_I32_F32:
-  case NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL:
-  case NVPTX::TEX_1D_ARRAY_I32_F32_GRAD:
-  case NVPTX::TEX_2D_F32_I32:
-  case NVPTX::TEX_2D_F32_F32:
-  case NVPTX::TEX_2D_F32_F32_LEVEL:
-  case NVPTX::TEX_2D_F32_F32_GRAD:
-  case NVPTX::TEX_2D_I32_I32:
-  case NVPTX::TEX_2D_I32_F32:
-  case NVPTX::TEX_2D_I32_F32_LEVEL:
-  case NVPTX::TEX_2D_I32_F32_GRAD:
-  case NVPTX::TEX_2D_ARRAY_F32_I32:
-  case NVPTX::TEX_2D_ARRAY_F32_F32:
-  case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL:
-  case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD:
-  case NVPTX::TEX_2D_ARRAY_I32_I32:
-  case NVPTX::TEX_2D_ARRAY_I32_F32:
-  case NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL:
-  case NVPTX::TEX_2D_ARRAY_I32_F32_GRAD:
-  case NVPTX::TEX_3D_F32_I32:
-  case NVPTX::TEX_3D_F32_F32:
-  case NVPTX::TEX_3D_F32_F32_LEVEL:
-  case NVPTX::TEX_3D_F32_F32_GRAD:
-  case NVPTX::TEX_3D_I32_I32:
-  case NVPTX::TEX_3D_I32_F32:
-  case NVPTX::TEX_3D_I32_F32_LEVEL:
-  case NVPTX::TEX_3D_I32_F32_GRAD: {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  if (MCID.TSFlags & NVPTXII::IsTexFlag) {
     // This is a texture fetch, so operand 4 is a texref and operand 5 is
     // a samplerref
     MachineOperand &TexHandle = MI.getOperand(4);
-    MachineOperand &SampHandle = MI.getOperand(5);
-
     replaceImageHandle(TexHandle, MF);
-    replaceImageHandle(SampHandle, MF);
+
+    if (!(MCID.TSFlags & NVPTXII::IsTexModeUnifiedFlag)) {
+      MachineOperand &SampHandle = MI.getOperand(5);
+      replaceImageHandle(SampHandle, MF);
+    }
 
     return true;
-  }
-  case NVPTX::SULD_1D_I8_TRAP:
-  case NVPTX::SULD_1D_I16_TRAP:
-  case NVPTX::SULD_1D_I32_TRAP:
-  case NVPTX::SULD_1D_ARRAY_I8_TRAP:
-  case NVPTX::SULD_1D_ARRAY_I16_TRAP:
-  case NVPTX::SULD_1D_ARRAY_I32_TRAP:
-  case NVPTX::SULD_2D_I8_TRAP:
-  case NVPTX::SULD_2D_I16_TRAP:
-  case NVPTX::SULD_2D_I32_TRAP:
-  case NVPTX::SULD_2D_ARRAY_I8_TRAP:
-  case NVPTX::SULD_2D_ARRAY_I16_TRAP:
-  case NVPTX::SULD_2D_ARRAY_I32_TRAP:
-  case NVPTX::SULD_3D_I8_TRAP:
-  case NVPTX::SULD_3D_I16_TRAP:
-  case NVPTX::SULD_3D_I32_TRAP: {
-    // This is a V1 surface load, so operand 1 is a surfref
-    MachineOperand &SurfHandle = MI.getOperand(1);
+  } else if (MCID.TSFlags & NVPTXII::IsSuldMask) {
+    unsigned VecSize =
+      1 << (((MCID.TSFlags & NVPTXII::IsSuldMask) >> NVPTXII::IsSuldShift) - 1);
+
+    // For a surface load of vector size N, the Nth operand will be the surfref
+    MachineOperand &SurfHandle = MI.getOperand(VecSize);
 
     replaceImageHandle(SurfHandle, MF);
 
     return true;
-  }
-  case NVPTX::SULD_1D_V2I8_TRAP:
-  case NVPTX::SULD_1D_V2I16_TRAP:
-  case NVPTX::SULD_1D_V2I32_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V2I8_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V2I16_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V2I32_TRAP:
-  case NVPTX::SULD_2D_V2I8_TRAP:
-  case NVPTX::SULD_2D_V2I16_TRAP:
-  case NVPTX::SULD_2D_V2I32_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V2I8_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V2I16_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V2I32_TRAP:
-  case NVPTX::SULD_3D_V2I8_TRAP:
-  case NVPTX::SULD_3D_V2I16_TRAP:
-  case NVPTX::SULD_3D_V2I32_TRAP: {
-    // This is a V2 surface load, so operand 2 is a surfref
-    MachineOperand &SurfHandle = MI.getOperand(2);
-
-    replaceImageHandle(SurfHandle, MF);
-
-    return true;
-  }
-  case NVPTX::SULD_1D_V4I8_TRAP:
-  case NVPTX::SULD_1D_V4I16_TRAP:
-  case NVPTX::SULD_1D_V4I32_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V4I8_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V4I16_TRAP:
-  case NVPTX::SULD_1D_ARRAY_V4I32_TRAP:
-  case NVPTX::SULD_2D_V4I8_TRAP:
-  case NVPTX::SULD_2D_V4I16_TRAP:
-  case NVPTX::SULD_2D_V4I32_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V4I8_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V4I16_TRAP:
-  case NVPTX::SULD_2D_ARRAY_V4I32_TRAP:
-  case NVPTX::SULD_3D_V4I8_TRAP:
-  case NVPTX::SULD_3D_V4I16_TRAP:
-  case NVPTX::SULD_3D_V4I32_TRAP: {
-    // This is a V4 surface load, so operand 4 is a surfref
-    MachineOperand &SurfHandle = MI.getOperand(4);
-
-    replaceImageHandle(SurfHandle, MF);
-
-    return true;
-  }
-  case NVPTX::SUST_B_1D_B8_TRAP:
-  case NVPTX::SUST_B_1D_B16_TRAP:
-  case NVPTX::SUST_B_1D_B32_TRAP:
-  case NVPTX::SUST_B_1D_V2B8_TRAP:
-  case NVPTX::SUST_B_1D_V2B16_TRAP:
-  case NVPTX::SUST_B_1D_V2B32_TRAP:
-  case NVPTX::SUST_B_1D_V4B8_TRAP:
-  case NVPTX::SUST_B_1D_V4B16_TRAP:
-  case NVPTX::SUST_B_1D_V4B32_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_B8_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_B16_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_B32_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP:
-  case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP:
-  case NVPTX::SUST_B_2D_B8_TRAP:
-  case NVPTX::SUST_B_2D_B16_TRAP:
-  case NVPTX::SUST_B_2D_B32_TRAP:
-  case NVPTX::SUST_B_2D_V2B8_TRAP:
-  case NVPTX::SUST_B_2D_V2B16_TRAP:
-  case NVPTX::SUST_B_2D_V2B32_TRAP:
-  case NVPTX::SUST_B_2D_V4B8_TRAP:
-  case NVPTX::SUST_B_2D_V4B16_TRAP:
-  case NVPTX::SUST_B_2D_V4B32_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_B8_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_B16_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_B32_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP:
-  case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP:
-  case NVPTX::SUST_B_3D_B8_TRAP:
-  case NVPTX::SUST_B_3D_B16_TRAP:
-  case NVPTX::SUST_B_3D_B32_TRAP:
-  case NVPTX::SUST_B_3D_V2B8_TRAP:
-  case NVPTX::SUST_B_3D_V2B16_TRAP:
-  case NVPTX::SUST_B_3D_V2B32_TRAP:
-  case NVPTX::SUST_B_3D_V4B8_TRAP:
-  case NVPTX::SUST_B_3D_V4B16_TRAP:
-  case NVPTX::SUST_B_3D_V4B32_TRAP:
-  case NVPTX::SUST_P_1D_B8_TRAP:
-  case NVPTX::SUST_P_1D_B16_TRAP:
-  case NVPTX::SUST_P_1D_B32_TRAP:
-  case NVPTX::SUST_P_1D_V2B8_TRAP:
-  case NVPTX::SUST_P_1D_V2B16_TRAP:
-  case NVPTX::SUST_P_1D_V2B32_TRAP:
-  case NVPTX::SUST_P_1D_V4B8_TRAP:
-  case NVPTX::SUST_P_1D_V4B16_TRAP:
-  case NVPTX::SUST_P_1D_V4B32_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_B8_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_B16_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_B32_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP:
-  case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP:
-  case NVPTX::SUST_P_2D_B8_TRAP:
-  case NVPTX::SUST_P_2D_B16_TRAP:
-  case NVPTX::SUST_P_2D_B32_TRAP:
-  case NVPTX::SUST_P_2D_V2B8_TRAP:
-  case NVPTX::SUST_P_2D_V2B16_TRAP:
-  case NVPTX::SUST_P_2D_V2B32_TRAP:
-  case NVPTX::SUST_P_2D_V4B8_TRAP:
-  case NVPTX::SUST_P_2D_V4B16_TRAP:
-  case NVPTX::SUST_P_2D_V4B32_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_B8_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_B16_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_B32_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP:
-  case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP:
-  case NVPTX::SUST_P_3D_B8_TRAP:
-  case NVPTX::SUST_P_3D_B16_TRAP:
-  case NVPTX::SUST_P_3D_B32_TRAP:
-  case NVPTX::SUST_P_3D_V2B8_TRAP:
-  case NVPTX::SUST_P_3D_V2B16_TRAP:
-  case NVPTX::SUST_P_3D_V2B32_TRAP:
-  case NVPTX::SUST_P_3D_V4B8_TRAP:
-  case NVPTX::SUST_P_3D_V4B16_TRAP:
-  case NVPTX::SUST_P_3D_V4B32_TRAP: {
+  } else if (MCID.TSFlags & NVPTXII::IsSustFlag) {
     // This is a surface store, so operand 0 is a surfref
     MachineOperand &SurfHandle = MI.getOperand(0);
 
     replaceImageHandle(SurfHandle, MF);
 
     return true;
-  }
-  case NVPTX::TXQ_CHANNEL_ORDER:
-  case NVPTX::TXQ_CHANNEL_DATA_TYPE:
-  case NVPTX::TXQ_WIDTH:
-  case NVPTX::TXQ_HEIGHT:
-  case NVPTX::TXQ_DEPTH:
-  case NVPTX::TXQ_ARRAY_SIZE:
-  case NVPTX::TXQ_NUM_SAMPLES:
-  case NVPTX::TXQ_NUM_MIPMAP_LEVELS:
-  case NVPTX::SUQ_CHANNEL_ORDER:
-  case NVPTX::SUQ_CHANNEL_DATA_TYPE:
-  case NVPTX::SUQ_WIDTH:
-  case NVPTX::SUQ_HEIGHT:
-  case NVPTX::SUQ_DEPTH:
-  case NVPTX::SUQ_ARRAY_SIZE: {
+  } else if (MCID.TSFlags & NVPTXII::IsSurfTexQueryFlag) {
     // This is a query, so operand 1 is a surfref/texref
     MachineOperand &Handle = MI.getOperand(1);
 
@@ -308,22 +116,38 @@
 
     return true; 
   }
-  }
+
+  return false;
 }
 
 void NVPTXReplaceImageHandles::
 replaceImageHandle(MachineOperand &Op, MachineFunction &MF) {
+  unsigned Idx;
+  if (findIndexForHandle(Op, MF, Idx)) {
+    Op.ChangeToImmediate(Idx);
+  }
+}
+
+bool NVPTXReplaceImageHandles::
+findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   NVPTXMachineFunctionInfo *MFI = MF.getInfo<NVPTXMachineFunctionInfo>();
+
+  assert(Op.isReg() && "Handle is not in a reg?");
+
   // Which instruction defines the handle?
-  MachineInstr *MI = MRI.getVRegDef(Op.getReg());
-  assert(MI && "No def for image handle vreg?");
-  MachineInstr &TexHandleDef = *MI;
+  MachineInstr &TexHandleDef = *MRI.getVRegDef(Op.getReg());
 
   switch (TexHandleDef.getOpcode()) {
   case NVPTX::LD_i64_avar: {
     // The handle is a parameter value being loaded, replace with the
     // parameter symbol
+    const NVPTXSubtarget &ST = MF.getTarget().getSubtarget<NVPTXSubtarget>();
+    if (ST.getDrvInterface() == NVPTX::CUDA) {
+      // For CUDA, we preserve the param loads coming from function arguments
+      return false;
+    }
+
     assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
     StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
     std::string ParamBaseName = MF.getName();
@@ -333,19 +157,27 @@
     std::string NewSym;
     raw_string_ostream NewSymStr(NewSym);
     NewSymStr << MF.getFunction()->getName() << "_param_" << Param;
-    Op.ChangeToImmediate(
-      MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str()));
+
     InstrsToRemove.insert(&TexHandleDef);
-    break;
+    Idx = MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str());
+    return true;
   }
   case NVPTX::texsurf_handles: {
     // The handle is a global variable, replace with the global variable name
     assert(TexHandleDef.getOperand(1).isGlobal() && "Load is not a global!");
     const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal();
     assert(GV->hasName() && "Global sampler must be named!");
-    Op.ChangeToImmediate(MFI->getImageHandleSymbolIndex(GV->getName().data()));
     InstrsToRemove.insert(&TexHandleDef);
-    break;
+    Idx = MFI->getImageHandleSymbolIndex(GV->getName().data());
+    return true;
+  }
+  case NVPTX::nvvm_move_i64:
+  case TargetOpcode::COPY: {
+    bool Res = findIndexForHandle(TexHandleDef.getOperand(1), MF, Idx);
+    if (Res) {
+      InstrsToRemove.insert(&TexHandleDef);
+    }
+    return Res;
   }
   default:
     llvm_unreachable("Unknown instruction operating on handle");

diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index aa0436b..f1d3cb4 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_NVPTXSECTION_H
-#define LLVM_NVPTXSECTION_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXSECTION_H
 
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCSection.h"

diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index d5cded2..3d52532 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp

@@ -59,7 +59,8 @@
     : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
       SmVersion(20), DL(computeDataLayout(is64Bit)),
       InstrInfo(initializeSubtargetDependencies(CPU, FS)),
-      TLInfo((NVPTXTargetMachine &)TM), TSInfo(&DL), FrameLowering(*this) {
+      TLInfo((const NVPTXTargetMachine &)TM), TSInfo(&DL),
+      FrameLowering(*this) {
 
   Triple T(TT);
 

diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 3ed5747..fb2d404 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTXSUBTARGET_H
-#define NVPTXSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
 
 #include "NVPTX.h"
 #include "NVPTXFrameLowering.h"
@@ -57,14 +57,20 @@
   NVPTXSubtarget(const std::string &TT, const std::string &CPU,
                  const std::string &FS, const TargetMachine &TM, bool is64Bit);
 
-  const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
-  const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  const DataLayout *getDataLayout() const { return &DL; }
-  const NVPTXRegisterInfo *getRegisterInfo() const {
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const NVPTXRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  const NVPTXTargetLowering *getTargetLowering() const { return &TLInfo; }
-  const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const NVPTXTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
 
   bool hasBrkPt() const { return SmVersion >= 11; }
   bool hasAtomRedG32() const { return SmVersion >= 11; }
@@ -91,7 +97,12 @@
   inline bool hasROT64() const { return SmVersion >= 20; }
 
   bool hasImageHandles() const {
-    // Currently disabled
+    // Enable handles for Kepler+, where CUDA supports indirect surfaces and
+    // textures
+    if (getDrvInterface() == NVPTX::CUDA)
+      return (SmVersion >= 30);
+
+    // Disabled, otherwise
     return false;
   }
   bool is64Bit() const { return Is64Bit; }
@@ -108,4 +119,4 @@
 
 } // End llvm namespace
 
-#endif // NVPTXSUBTARGET_H
+#endif

diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 069a1b9..d87693f 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp

@@ -16,6 +16,7 @@
 #include "NVPTX.h"
 #include "NVPTXAllocaHoisting.h"
 #include "NVPTXLowerAggrCopies.h"
+#include "NVPTXTargetObjectFile.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -50,6 +51,7 @@
 void initializeGenericToNVVMPass(PassRegistry&);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
 void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+void initializeNVPTXLowerStructArgsPass(PassRegistry &);
 }
 
 extern "C" void LLVMInitializeNVPTXTarget() {
@@ -64,6 +66,7 @@
   initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
   initializeNVPTXFavorNonGenericAddrSpacesPass(
     *PassRegistry::getPassRegistry());
+  initializeNVPTXLowerStructArgsPass(*PassRegistry::getPassRegistry());
 }
 
 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, StringRef TT,
@@ -72,10 +75,13 @@
                                        Reloc::Model RM, CodeModel::Model CM,
                                        CodeGenOpt::Level OL, bool is64bit)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      TLOF(make_unique<NVPTXTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this, is64bit) {
   initAsmInfo();
 }
 
+NVPTXTargetMachine::~NVPTXTargetMachine() {}
+
 void NVPTXTargetMachine32::anchor() {}
 
 NVPTXTargetMachine32::NVPTXTargetMachine32(
@@ -119,6 +125,14 @@
   return PassConfig;
 }
 
+void NVPTXTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our NVPTX pass. This
+  // allows the NVPTX pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createNVPTXTargetTransformInfoPass(this));
+}
+
 void NVPTXPassConfig::addIRPasses() {
   // The following passes are known to not play well with virtual regs hanging
   // around after register allocation (which in our case, is *all* registers).

diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index a7a1c8f..a726bd1 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTX_TARGETMACHINE_H
-#define NVPTX_TARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
 
 #include "NVPTXSubtarget.h"
 #include "ManagedStringPool.h"
@@ -25,6 +25,7 @@
 /// NVPTXTargetMachine
 ///
 class NVPTXTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   NVPTXSubtarget Subtarget;
 
   // Hold Strings that can be free'd all together with NVPTXTargetMachine
@@ -35,27 +36,9 @@
                      const TargetOptions &Options, Reloc::Model RM,
                      CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit);
 
-  const TargetFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  const NVPTXInstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
+  ~NVPTXTargetMachine() override;
+
   const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; }
-  const NVPTXRegisterInfo *getRegisterInfo() const override {
-    return getSubtargetImpl()->getRegisterInfo();
-  }
-
-  const NVPTXTargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-
-  const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
 
   ManagedStringPool *getManagedStrPool() const {
     return const_cast<ManagedStringPool *>(&ManagedStrPool);
@@ -63,17 +46,17 @@
 
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  // Emission of machine code through JITCodeEmitter is not supported.
-  bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &,
-                                  bool = true) override {
-    return true;
-  }
-
   // Emission of machine code through MCJIT is not supported.
   bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &,
                          bool = true) override {
     return true;
   }
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
+
+  /// \brief Register NVPTX analysis passes with a pass manager.
+  void addAnalysisPasses(PassManagerBase &PM) override;
 
 }; // NVPTXTargetMachine.
 

diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index 0b438c5..00ceca5 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
-#define LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETOBJECTFILE_H
 
 #include "NVPTXSection.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -87,7 +87,8 @@
         new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
   }
 
-  const MCSection *getSectionForConstant(SectionKind Kind) const override {
+  const MCSection *getSectionForConstant(SectionKind Kind,
+                                         const Constant *C) const override {
     return ReadOnlySection;
   }
 
@@ -97,6 +98,9 @@
     return DataSection;
   }
 
+  const MCSection *
+  SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+                         const TargetMachine &TM) const override;
 };
 
 } // end namespace llvm

diff --git a/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
new file mode 100644
index 0000000..b09d0d4
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp

@@ -0,0 +1,115 @@
+//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI pass ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// NVPTX target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/CostTable.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "NVPTXtti"
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeNVPTXTTIPass(PassRegistry &);
+}
+
+namespace {
+/// NVPTX implementation of TargetTransformInfo, packaged as an ImmutablePass.
+class NVPTXTTI final : public ImmutablePass, public TargetTransformInfo {
+  const NVPTXTargetLowering *TLI; // Lowering info used by the cost queries.
+public:
+  NVPTXTTI() : ImmutablePass(ID), TLI(nullptr) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+  /// Caches the target lowering of \p TM's subtarget and initializes the pass.
+  NVPTXTTI(const NVPTXTargetMachine *TM)
+      : ImmutablePass(ID), TLI(TM->getSubtargetImpl()->getTargetLowering()) {
+    initializeNVPTXTTIPass(*PassRegistry::getPassRegistry());
+  }
+  /// Pushes this implementation onto the TTI stack.
+  void initializePass() override { pushTTIStack(this); }
+  /// Forwards to the TargetTransformInfo base-class implementation.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification; defined (as 0) after the class.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo *)this; // TargetTransformInfo base subobject.
+    return this;
+  }
+  /// Reports branch divergence for NVPTX; the definition returns true.
+  bool hasBranchDivergence() const override;
+  /// Cost of an arithmetic instruction; see the out-of-line definition.
+  unsigned getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
+      OperandValueKind Opd2Info = OK_AnyValue,
+      OperandValueProperties Opd1PropInfo = OP_None,
+      OperandValueProperties Opd2PropInfo = OP_None) const override;
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(NVPTXTTI, TargetTransformInfo, "NVPTXtti",
+                   "NVPTX Target Transform Info", true, true, false)
+char NVPTXTTI::ID = 0;
+
+ImmutablePass *
+llvm::createNVPTXTargetTransformInfoPass(const NVPTXTargetMachine *TM) {
+  return new NVPTXTTI(TM);
+}
+
+bool NVPTXTTI::hasBranchDivergence() const { return true; }
+
+unsigned NVPTXTTI::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
+    OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
+    OperandValueProperties Opd2PropInfo) const {
+  // Only ADD/MUL/XOR/OR/AND on a type that legalizes to i64 are priced here;
+  // every other query is forwarded to the TargetTransformInfo base class.
+  std::pair<unsigned, MVT> Legalized = TLI->getTypeLegalizationCost(Ty);
+  const int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
+  bool IsI64IntegerOp;
+  switch (ISDOpcode) {
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::XOR:
+  case ISD::OR:
+  case ISD::AND:
+    IsI64IntegerOp = Legalized.second.SimpleTy == MVT::i64;
+    break;
+  default:
+    IsI64IntegerOp = false;
+  }
+  // The machine code (SASS) simulates an i64 with two i32. Therefore, such
+  // operations are estimated to be twice as expensive as those on types that
+  // can fit into one machine register.
+  if (IsI64IntegerOp)
+    return 2 * Legalized.first;
+  return TargetTransformInfo::getArithmeticInstrCost(
+      Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+}

diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index a9fd190b..5caa8bd 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp

@@ -90,11 +90,11 @@
     return;
 
   if ((*annotationCache).find(m) != (*annotationCache).end())
-    (*annotationCache)[m][gv] = tmp;
+    (*annotationCache)[m][gv] = std::move(tmp);
   else {
     global_val_annot_t tmp1;
-    tmp1[gv] = tmp;
-    (*annotationCache)[m] = tmp1;
+    tmp1[gv] = std::move(tmp);
+    (*annotationCache)[m] = std::move(tmp1);
   }
 }
 

diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
index 446bfa1..7e2ce73 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/lib/Target/NVPTX/NVPTXUtilities.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTXUTILITIES_H
-#define NVPTXUTILITIES_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"

diff --git a/lib/Target/NVPTX/NVPTXutil.h b/lib/Target/NVPTX/NVPTXutil.h
index d1d1171..1915dac 100644
--- a/lib/Target/NVPTX/NVPTXutil.h
+++ b/lib/Target/NVPTX/NVPTXutil.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_NVPTX_UTIL_H
-#define LLVM_TARGET_NVPTX_UTIL_H
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTIL_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXUTIL_H
 
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"

diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 2f562ca..06bb968 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp

@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -213,16 +214,12 @@
 
 class PPCAsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
-  MCAsmParser &Parser;
   const MCInstrInfo &MII;
   bool IsPPC64;
   bool IsDarwin;
 
-  MCAsmParser &getParser() const { return Parser; }
-  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
-  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
-  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+  void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); }
 
   bool isPPC64() const { return IsPPC64; }
   bool isDarwin() const { return IsDarwin; }
@@ -244,10 +241,12 @@
   bool ParseDirectiveTC(unsigned Size, SMLoc L);
   bool ParseDirectiveMachine(SMLoc L);
   bool ParseDarwinDirectiveMachine(SMLoc L);
+  bool ParseDirectiveAbiVersion(SMLoc L);
+  bool ParseDirectiveLocalEntry(SMLoc L);
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
-                               unsigned &ErrorInfo,
+                               uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
 
   void ProcessInstruction(MCInst &Inst, const OperandVector &Ops);
@@ -263,9 +262,8 @@
 
 public:
   PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
-               const MCInstrInfo &_MII,
-               const MCTargetOptions &Options)
-      : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(_MII) {
+               const MCInstrInfo &_MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(_STI), MII(_MII) {
     // Check for 64-bit vs. 32-bit pointer mode.
     Triple TheTriple(STI.getTargetTriple());
     IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
@@ -294,6 +292,7 @@
   enum KindTy {
     Token,
     Immediate,
+    ContextImmediate,
     Expression,
     TLSRegister
   } Kind;
@@ -338,6 +337,7 @@
       Tok = o.Tok;
       break;
     case Immediate:
+    case ContextImmediate:
       Imm = o.Imm;
       break;
     case Expression:
@@ -362,6 +362,16 @@
     assert(Kind == Immediate && "Invalid access!");
     return Imm.Val;
   }
+  /// Immediate as seen by a signed 16-bit use: a ContextImmediate is narrowed
+  /// to int16_t (then widened again); a plain Immediate is returned unchanged.
+  int64_t getImmS16Context() const {
+    assert((Kind == Immediate || Kind == ContextImmediate) && "Invalid access!");
+    return Kind == Immediate ? Imm.Val : static_cast<int16_t>(Imm.Val);
+  }
+  int64_t getImmU16Context() const { // Immediate for an unsigned 16-bit use.
+    assert((Kind == Immediate || Kind == ContextImmediate) && "Invalid access!");
+    return Imm.Val; // Returned as stored, for both accepted kinds.
+  }
 
   const MCExpr *getExpr() const {
     assert(Kind == Expression && "Invalid access!");
@@ -406,22 +416,73 @@
   bool isToken() const override { return Kind == Token; }
   bool isImm() const override { return Kind == Immediate || Kind == Expression; }
   bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); }
+  bool isU4Imm() const { return Kind == Immediate && isUInt<4>(getImm()); }
   bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); }
   bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); }
   bool isU6Imm() const { return Kind == Immediate && isUInt<6>(getImm()); }
-  bool isU16Imm() const { return Kind == Expression ||
-                                 (Kind == Immediate && isUInt<16>(getImm())); }
-  bool isS16Imm() const { return Kind == Expression ||
-                                 (Kind == Immediate && isInt<16>(getImm())); }
+  bool isU6ImmX2() const { // Unsigned 6-bit immediate that is a multiple of 2.
+    return Kind == Immediate && isUInt<6>(getImm()) && (getImm() & 1) == 0;
+  }
+  bool isU7ImmX4() const { // Unsigned 7-bit immediate that is a multiple of 4.
+    return Kind == Immediate && isUInt<7>(getImm()) && (getImm() & 3) == 0;
+  }
+  bool isU8ImmX8() const { // Unsigned 8-bit immediate that is a multiple of 8.
+    return Kind == Immediate && isUInt<8>(getImm()) && (getImm() & 7) == 0;
+  }
+  /// True if this operand can be matched as an unsigned 16-bit immediate.
+  bool isU16Imm() const {
+    // Expressions are accepted without a range check.
+    if (Kind == Expression)
+      return true;
+    // Anything that is not a plain or context immediate is rejected.
+    if (Kind != Immediate && Kind != ContextImmediate)
+      return false;
+    // Range-check the value as seen in an unsigned 16-bit context.
+    return isUInt<16>(getImmU16Context());
+  }
+  /// True if this operand can be matched as a signed 16-bit immediate.
+  bool isS16Imm() const {
+    // Expressions are accepted without a range check.
+    if (Kind == Expression)
+      return true;
+    // Anything that is not a plain or context immediate is rejected.
+    if (Kind != Immediate && Kind != ContextImmediate)
+      return false;
+    // Range-check the value as seen in a signed 16-bit context.
+    return isInt<16>(getImmS16Context());
+  }
   bool isS16ImmX4() const { return Kind == Expression ||
                                    (Kind == Immediate && isInt<16>(getImm()) &&
                                     (getImm() & 3) == 0); }
-  bool isS17Imm() const { return Kind == Expression ||
-                                 (Kind == Immediate && isInt<17>(getImm())); }
+  /// True if this operand can be matched as a signed 17-bit immediate.
+  bool isS17Imm() const {
+    // Expressions are accepted without a range check.
+    if (Kind == Expression)
+      return true;
+    // Anything that is not a plain or context immediate is rejected.
+    if (Kind != Immediate && Kind != ContextImmediate)
+      return false;
+    // The value is read through the signed 16-bit accessor, then range-checked.
+    return isInt<17>(getImmS16Context());
+  }
   bool isTLSReg() const { return Kind == TLSRegister; }
-  bool isDirectBr() const { return Kind == Expression ||
-                                   (Kind == Immediate && isInt<26>(getImm()) &&
-                                    (getImm() & 3) == 0); }
+  /// True if this operand is a valid direct-branch target: any Expression, or
+  /// an Immediate with the low two bits clear that fits in 26 signed bits.
+  bool isDirectBr() const {
+    if (Kind == Expression)
+      return true;
+    if (Kind != Immediate)
+      return false;
+    const auto Target = getImm();
+    if ((Target & 3) != 0)
+      return false;
+    if (isInt<26>(Target))
+      return true;
+    // In 32-bit mode, large 32-bit quantities wrap around: also accept a
+    // value whose 32-bit pattern, read as int32_t, fits in 26 signed bits.
+    return !IsPPC64 && isUInt<32>(Target) &&
+           isInt<26>(static_cast<int32_t>(Target));
+  }
   bool isCondBr() const { return Kind == Expression ||
                                  (Kind == Immediate && isInt<16>(getImm()) &&
                                   (getImm() & 3) == 0); }
@@ -526,6 +587,36 @@
       Inst.addOperand(MCOperand::CreateExpr(getExpr()));
   }
 
+  /// Append this operand to \p Inst for a signed 16-bit immediate field.
+  /// \p N must be 1.
+  void addS16ImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (Kind == Immediate) {
+      Inst.addOperand(MCOperand::CreateImm(getImm()));
+    } else if (Kind == ContextImmediate) {
+      // Context immediates are read through the signed 16-bit accessor.
+      Inst.addOperand(MCOperand::CreateImm(getImmS16Context()));
+    } else {
+      // Every other kind is emitted as an expression operand.
+      Inst.addOperand(MCOperand::CreateExpr(getExpr()));
+    }
+  }
+
+  /// Append this operand to \p Inst for an unsigned 16-bit immediate field.
+  /// \p N must be 1.
+  void addU16ImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (Kind == Immediate) {
+      Inst.addOperand(MCOperand::CreateImm(getImm()));
+    } else if (Kind == ContextImmediate) {
+      // Context immediates are read through the unsigned 16-bit accessor.
+      Inst.addOperand(MCOperand::CreateImm(getImmU16Context()));
+    } else {
+      // Every other kind is emitted as an expression operand.
+      Inst.addOperand(MCOperand::CreateExpr(getExpr()));
+    }
+  }
+
   void addBranchTargetOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     if (Kind == Immediate)
@@ -566,9 +657,9 @@
     // explicitly.
     void *Mem = ::operator new(sizeof(PPCOperand) + Str.size());
     std::unique_ptr<PPCOperand> Op(new (Mem) PPCOperand(Token));
-    Op->Tok.Data = (const char *)(Op.get() + 1);
+    Op->Tok.Data = reinterpret_cast<const char *>(Op.get() + 1);
     Op->Tok.Length = Str.size();
-    std::memcpy((void *)Op->Tok.Data, Str.data(), Str.size());
+    std::memcpy(const_cast<char *>(Op->Tok.Data), Str.data(), Str.size());
     Op->StartLoc = S;
     Op->EndLoc = S;
     Op->IsPPC64 = IsPPC64;
@@ -607,6 +698,16 @@
   }
 
   static std::unique_ptr<PPCOperand>
+  CreateContextImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) {
+    auto Op = make_unique<PPCOperand>(ContextImmediate); // Kind tag.
+    Op->Imm.Val = Val; // Stored as-is; getImmS16Context() narrows it on read.
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static std::unique_ptr<PPCOperand>
   CreateFromMCExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) {
     if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Val))
       return CreateImm(CE->getValue(), S, E, IsPPC64);
@@ -615,6 +716,12 @@
       if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS)
         return CreateTLSReg(SRE, S, E, IsPPC64);
 
+    if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
+      int64_t Res;
+      if (TE->EvaluateAsConstant(Res))
+        return CreateContextImm(Res, S, E, IsPPC64);
+    }
+
     return CreateExpr(Val, S, E, IsPPC64);
   }
 };
@@ -627,6 +734,7 @@
     OS << "'" << getToken() << "'";
     break;
   case Immediate:
+  case ContextImmediate:
     OS << getImm();
     break;
   case Expression:
@@ -638,6 +746,29 @@
   }
 }
 
+/// Append the arithmetic negation of \p Op to \p Inst. Immediates are negated
+/// directly; "-x" becomes "x", "a - b" becomes "b - a", and any other
+/// expression is wrapped in a unary minus created in \p Ctx.
+static void addNegOperand(MCInst &Inst, MCOperand &Op, MCContext &Ctx) {
+  if (Op.isImm()) {
+    Inst.addOperand(MCOperand::CreateImm(-Op.getImm()));
+    return;
+  }
+
+  const MCExpr *Orig = Op.getExpr();
+  const MCExpr *Negated = nullptr;
+  const MCUnaryExpr *Unary = dyn_cast<MCUnaryExpr>(Orig);
+  const MCBinaryExpr *Binary = Unary ? nullptr : dyn_cast<MCBinaryExpr>(Orig);
+  if (Unary && Unary->getOpcode() == MCUnaryExpr::Minus)
+    Negated = Unary->getSubExpr();
+  else if (Binary && Binary->getOpcode() == MCBinaryExpr::Sub)
+    Negated =
+        MCBinaryExpr::CreateSub(Binary->getRHS(), Binary->getLHS(), Ctx);
+  else
+    Negated = MCUnaryExpr::CreateMinus(Orig, Ctx);
+  Inst.addOperand(MCOperand::CreateExpr(Negated));
+}
+
 void PPCAsmParser::ProcessInstruction(MCInst &Inst,
                                       const OperandVector &Operands) {
   int Opcode = Inst.getOpcode();
@@ -653,41 +784,37 @@
   }
   case PPC::SUBI: {
     MCInst TmpInst;
-    int64_t N = Inst.getOperand(2).getImm();
     TmpInst.setOpcode(PPC::ADDI);
     TmpInst.addOperand(Inst.getOperand(0));
     TmpInst.addOperand(Inst.getOperand(1));
-    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
     Inst = TmpInst;
     break;
   }
   case PPC::SUBIS: {
     MCInst TmpInst;
-    int64_t N = Inst.getOperand(2).getImm();
     TmpInst.setOpcode(PPC::ADDIS);
     TmpInst.addOperand(Inst.getOperand(0));
     TmpInst.addOperand(Inst.getOperand(1));
-    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
     Inst = TmpInst;
     break;
   }
   case PPC::SUBIC: {
     MCInst TmpInst;
-    int64_t N = Inst.getOperand(2).getImm();
     TmpInst.setOpcode(PPC::ADDIC);
     TmpInst.addOperand(Inst.getOperand(0));
     TmpInst.addOperand(Inst.getOperand(1));
-    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
     Inst = TmpInst;
     break;
   }
   case PPC::SUBICo: {
     MCInst TmpInst;
-    int64_t N = Inst.getOperand(2).getImm();
     TmpInst.setOpcode(PPC::ADDICo);
     TmpInst.addOperand(Inst.getOperand(0));
     TmpInst.addOperand(Inst.getOperand(1));
-    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    addNegOperand(TmpInst, Inst.getOperand(2), getContext());
     Inst = TmpInst;
     break;
   }
@@ -921,7 +1048,7 @@
 
 bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                            OperandVector &Operands,
-                                           MCStreamer &Out, unsigned &ErrorInfo,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
                                            bool MatchingInlineAsm) {
   MCInst Inst;
 
@@ -939,7 +1066,7 @@
       return Error(IDLoc, "unrecognized instruction mnemonic");
   case Match_InvalidOperand: {
     SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0U) {
+    if (ErrorInfo != ~0ULL) {
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction");
 
@@ -995,6 +1122,7 @@
 
 bool PPCAsmParser::
 ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   StartLoc = Tok.getLoc();
   EndLoc = Tok.getEndLoc();
@@ -1176,6 +1304,7 @@
 /// for this to be done at a higher level.
 bool PPCAsmParser::
 ParseDarwinExpression(const MCExpr *&EVal) {
+  MCAsmParser &Parser = getParser();
   PPCMCExpr::VariantKind Variant = PPCMCExpr::VK_PPC_None;
   switch (getLexer().getKind()) {
   default:
@@ -1218,6 +1347,7 @@
 /// This handles registers in the form 'NN', '%rNN' for ELF platforms and
 /// rNN for MachO.
 bool PPCAsmParser::ParseOperand(OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   SMLoc S = Parser.getTok().getLoc();
   SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
   const MCExpr *EVal;
@@ -1412,6 +1542,10 @@
       return ParseDirectiveTC(isPPC64()? 8 : 4, DirectiveID.getLoc());
     if (IDVal == ".machine")
       return ParseDirectiveMachine(DirectiveID.getLoc());
+    if (IDVal == ".abiversion")
+      return ParseDirectiveAbiVersion(DirectiveID.getLoc());
+    if (IDVal == ".localentry")
+      return ParseDirectiveLocalEntry(DirectiveID.getLoc());
   } else {
     if (IDVal == ".machine")
       return ParseDarwinDirectiveMachine(DirectiveID.getLoc());
@@ -1422,6 +1556,7 @@
 /// ParseDirectiveWord
 ///  ::= .word [ expression (, expression)* ]
 bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
       const MCExpr *Value;
@@ -1446,6 +1581,7 @@
 /// ParseDirectiveTC
 ///  ::= .tc [ symbol (, expression)* ]
 bool PPCAsmParser::ParseDirectiveTC(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
   // Skip TC symbol, which is only used with XCOFF.
   while (getLexer().isNot(AsmToken::EndOfStatement)
          && getLexer().isNot(AsmToken::Comma))
@@ -1466,6 +1602,7 @@
 /// ParseDirectiveMachine (ELF platforms)
 ///  ::= .machine [ cpu | "push" | "pop" ]
 bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::Identifier) &&
       getLexer().isNot(AsmToken::String)) {
     Error(L, "unexpected token in directive");
@@ -1500,6 +1637,7 @@
 /// ParseDarwinDirectiveMachine (Mach-o platforms)
 ///  ::= .machine cpu-identifier
 bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::Identifier) &&
       getLexer().isNot(AsmToken::String)) {
     Error(L, "unexpected token in directive");
@@ -1534,6 +1672,64 @@
   return false;
 }
 
+/// ParseDirectiveAbiVersion
+///  ::= .abiversion constant-expression
+bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  int64_t AbiVersion;
+  // Diagnostics are reported through Error(); false is returned either way.
+  if (Parser.parseAbsoluteExpression(AbiVersion)) {
+    Error(L, "expected constant expression");
+    return false;
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  auto *TStreamer = static_cast<PPCTargetStreamer *>(
+      Parser.getStreamer().getTargetStreamer());
+  TStreamer->emitAbiVersion(AbiVersion);
+  return false;
+}
+
+/// ParseDirectiveLocalEntry
+///  ::= .localentry symbol, expression
+bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) {
+  MCAsmParser &Parser = getParser();
+  // Symbol operand; it is looked up (or created) before the rest is parsed.
+  StringRef SymName;
+  if (Parser.parseIdentifier(SymName)) {
+    Error(L, "expected identifier in directive");
+    return false;
+  }
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(SymName);
+
+  if (getLexer().isNot(AsmToken::Comma)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+  Lex();
+
+  // Expression operand, which must end the statement.
+  const MCExpr *EntryExpr;
+  if (Parser.parseExpression(EntryExpr)) {
+    Error(L, "expected expression");
+    return false;
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Error(L, "unexpected token in directive");
+    return false;
+  }
+
+  auto *TStreamer = static_cast<PPCTargetStreamer *>(
+      Parser.getStreamer().getTargetStreamer());
+  TStreamer->emitLocalEntry(Sym, EntryExpr);
+  return false;
+}
+
+
+
 /// Force static initialization.
 extern "C" void LLVMInitializePowerPCAsmParser() {
   RegisterMCAsmParser<PPCAsmParser> A(ThePPC32Target);
@@ -1558,6 +1754,10 @@
     case MCK_1: ImmVal = 1; break;
     case MCK_2: ImmVal = 2; break;
     case MCK_3: ImmVal = 3; break;
+    case MCK_4: ImmVal = 4; break;
+    case MCK_5: ImmVal = 5; break;
+    case MCK_6: ImmVal = 6; break;
+    case MCK_7: ImmVal = 7; break;
     default: return Match_InvalidOperand;
   }
 

diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index ea4de63..47a9474 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt

@@ -2,9 +2,8 @@
 
 tablegen(LLVM PPCGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM PPCGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM PPCGenCodeEmitter.inc -gen-emitter)
 tablegen(LLVM PPCGenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM PPCGenDAGISel.inc -gen-dag-isel)
@@ -16,7 +15,6 @@
 add_llvm_target(PowerPCCodeGen
   PPCAsmPrinter.cpp
   PPCBranchSelector.cpp
-  PPCCodeEmitter.cpp
   PPCCTRLoops.cpp
   PPCHazardRecognizers.cpp
   PPCInstrInfo.cpp
@@ -24,7 +22,6 @@
   PPCISelLowering.cpp
   PPCFastISel.cpp
   PPCFrameLowering.cpp
-  PPCJITInfo.cpp
   PPCMCInstLower.cpp
   PPCMachineFunctionInfo.cpp
   PPCRegisterInfo.cpp

diff --git a/lib/Target/PowerPC/Disassembler/LLVMBuild.txt b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
index c1011ff..ea3e7ea 100644
--- a/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
+++ b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = PowerPCDisassembler
 parent = PowerPC
-required_libraries = MC PowerPCDesc PowerPCInfo Support
+required_libraries = MCDisassembler PowerPCInfo Support
 add_to_library_groups = PowerPC

diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index a2305a9..5251b60 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp

@@ -12,7 +12,6 @@
 #include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
@@ -28,13 +27,10 @@
     : MCDisassembler(STI, Ctx) {}
   virtual ~PPCDisassembler() {}
 
-  // Override MCDisassembler.
-  virtual DecodeStatus getInstruction(MCInst &instr,
-                                      uint64_t &size,
-                                      const MemoryObject &region,
-                                      uint64_t address,
-                                      raw_ostream &vStream,
-                                      raw_ostream &cStream) const override;
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
 };
 } // end anonymous namespace
 
@@ -325,23 +321,19 @@
 #include "PPCGenDisassemblerTables.inc"
 
 DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                                 const MemoryObject &Region,
-                                                 uint64_t Address,
-                                                 raw_ostream &os,
-                                                 raw_ostream &cs) const {
+                                             ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address, raw_ostream &OS,
+                                             raw_ostream &CS) const {
   // Get the four bytes of the instruction.
-  uint8_t Bytes[4];
   Size = 4;
-  if (Region.readBytes(Address, Size, Bytes) == -1) {
+  if (Bytes.size() < 4) {
     Size = 0;
     return MCDisassembler::Fail;
   }
 
   // The instruction is big-endian encoded.
-  uint32_t Inst = (Bytes[0] << 24) |
-                  (Bytes[1] << 16) |
-                  (Bytes[2] <<  8) |
-                  (Bytes[3] <<  0);
+  uint32_t Inst =
+      (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 8) | (Bytes[3] << 0);
 
   return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI);
 }

diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 7279b09..670c40a 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp

@@ -17,6 +17,7 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOpcodes.h"
@@ -207,6 +208,13 @@
   O << (unsigned int)Value;
 }
 
+void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const unsigned Imm = MI->getOperand(OpNo).getImm(); // Narrowed to unsigned.
+  assert(Imm <= 15 && "Invalid u4imm argument!");
+  O << Imm; // Printed as an unsigned decimal value.
+}
+
 void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
                                        raw_ostream &O) {
   int Value = MI->getOperand(OpNo).getImm();
@@ -260,7 +268,7 @@
   if (!MI->getOperand(OpNo).isImm())
     return printOperand(MI, OpNo, O);
 
-  O << (int)MI->getOperand(OpNo).getImm()*4;
+  O << SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2);
 }
 
 
@@ -308,10 +316,16 @@
 
 void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
                                   raw_ostream &O) {
-  printBranchOperand(MI, OpNo, O);
+  // On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must
+  // come at the _end_ of the expression.
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*Op.getExpr());
+  O << refExp.getSymbol().getName();
   O << '(';
   printOperand(MI, OpNo+1, O);
   O << ')';
+  if (refExp.getKind() != MCSymbolRefExpr::VK_None)
+    O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind());
 }
 
 

diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 211a628..b21aa22 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef PPCINSTPRINTER_H
-#define PPCINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
+#define LLVM_LIB_TARGET_POWERPC_INSTPRINTER_PPCINSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
 
@@ -44,6 +44,7 @@
                              raw_ostream &O, const char *Modifier = nullptr);
 
   void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 12584be..c54d5e7 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp

@@ -9,7 +9,9 @@
 
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCMachObjectWriter.h"
@@ -128,6 +130,30 @@
     }
   }
 
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override {
+    switch ((PPC::Fixups)Fixup.getKind()) {
+    default: break;
+    case PPC::fixup_ppc_br24:
+    case PPC::fixup_ppc_br24abs:
+      // If the target symbol has a local entry point we must not attempt
+      // to resolve the fixup directly.  Emit a relocation and leave
+      // resolution of the final target address to the linker.
+      if (const MCSymbolRefExpr *A = Target.getSymA()) {
+        const MCSymbolData &Data = Asm.getSymbolData(A->getSymbol());
+        // The "other" values are stored in the last 6 bits of the second byte.
+        // The traditional defines for STO values assume the full byte and thus
+        // the shift to pack it.
+        unsigned Other = MCELF::getOther(Data) << 2;
+        if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
+          IsResolved = false;
+      }
+      break;
+    }
+  }
+
   bool mayNeedRelaxation(const MCInst &Inst) const override {
     // FIXME.
     return false;

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index cd3b4f4..b817394 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp

@@ -11,6 +11,7 @@
 #include "MCTargetDesc/PPCFixupKinds.h"
 #include "MCTargetDesc/PPCMCExpr.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCValue.h"
@@ -23,13 +24,12 @@
   public:
     PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI);
 
-    virtual ~PPCELFObjectWriter();
   protected:
-    virtual unsigned getRelocTypeInner(const MCValue &Target,
-                                       const MCFixup &Fixup,
-                                       bool IsPCRel) const;
     unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                           bool IsPCRel) const override;
+
+    bool needsRelocateWithSymbol(const MCSymbolData &SD,
+                                 unsigned Type) const override;
   };
 }
 
@@ -38,9 +38,6 @@
                             Is64Bit ?  ELF::EM_PPC64 : ELF::EM_PPC,
                             /*HasRelocationAddend*/ true) {}
 
-PPCELFObjectWriter::~PPCELFObjectWriter() {
-}
-
 static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target,
                                                      const MCFixup &Fixup) {
   const MCExpr *Expr = Fixup.getValue();
@@ -69,10 +66,9 @@
   llvm_unreachable("unknown PPCMCExpr kind");
 }
 
-unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
-                                               const MCFixup &Fixup,
-                                               bool IsPCRel) const
-{
+unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const {
   MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup);
 
   // determine the type of the relocation
@@ -83,7 +79,18 @@
       llvm_unreachable("Unimplemented");
     case PPC::fixup_ppc_br24:
     case PPC::fixup_ppc_br24abs:
-      Type = ELF::R_PPC_REL24;
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_REL24;
+        break;
+      case MCSymbolRefExpr::VK_PLT:
+        Type = ELF::R_PPC_PLTREL24;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LOCAL:
+        Type = ELF::R_PPC_LOCAL24PC;
+        break;
+      }
       break;
     case PPC::fixup_ppc_brcond14:
     case PPC::fixup_ppc_brcond14abs:
@@ -224,7 +231,10 @@
         Type = ELF::R_PPC64_DTPREL16_HIGHESTA;
         break;
       case MCSymbolRefExpr::VK_PPC_GOT_TLSGD:
-        Type = ELF::R_PPC64_GOT_TLSGD16;
+        if (is64Bit())
+          Type = ELF::R_PPC64_GOT_TLSGD16;
+        else
+          Type = ELF::R_PPC_GOT_TLSGD16;
         break;
       case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO:
         Type = ELF::R_PPC64_GOT_TLSGD16_LO;
@@ -236,7 +246,10 @@
         Type = ELF::R_PPC64_GOT_TLSGD16_HA;
         break;
       case MCSymbolRefExpr::VK_PPC_GOT_TLSLD:
-        Type = ELF::R_PPC64_GOT_TLSLD16;
+        if (is64Bit())
+          Type = ELF::R_PPC64_GOT_TLSLD16;
+        else
+          Type = ELF::R_PPC_GOT_TLSLD16;
         break;
       case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO:
         Type = ELF::R_PPC64_GOT_TLSLD16_LO;
@@ -332,13 +345,22 @@
       switch (Modifier) {
       default: llvm_unreachable("Unsupported Modifier");
       case MCSymbolRefExpr::VK_PPC_TLSGD:
-        Type = ELF::R_PPC64_TLSGD;
+        if (is64Bit())
+          Type = ELF::R_PPC64_TLSGD;
+        else
+          Type = ELF::R_PPC_TLSGD;
         break;
       case MCSymbolRefExpr::VK_PPC_TLSLD:
-        Type = ELF::R_PPC64_TLSLD;
+        if (is64Bit())
+          Type = ELF::R_PPC64_TLSLD;
+        else
+          Type = ELF::R_PPC_TLSLD;
         break;
       case MCSymbolRefExpr::VK_PPC_TLS:
-        Type = ELF::R_PPC64_TLS;
+        if (is64Bit())
+          Type = ELF::R_PPC64_TLS;
+        else
+          Type = ELF::R_PPC_TLS;
         break;
       }
       break;
@@ -373,10 +395,21 @@
   return Type;
 }
 
-unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
-                                          const MCFixup &Fixup,
-                                          bool IsPCRel) const {
-  return getRelocTypeInner(Target, Fixup, IsPCRel);
+/// Reports whether a relocation of the given type must reference the symbol
+/// described by SD.  Only R_PPC_REL24 can answer true, and only when the
+/// symbol's STO_PPC64_LOCAL_MASK bits are non-zero (it has a local entry
+/// point); every other relocation type answers false.
+bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD,
+                                                 unsigned Type) const {
+  if (Type != ELF::R_PPC_REL24)
+    return false;
+
+  // The symbol must be kept so the linker still sees its local entry point.
+  // MCELF keeps the "other" value shifted down by two bits, while the STO_*
+  // masks are defined against the full byte; shift back before masking.
+  const unsigned StOther = MCELF::getOther(SD) << 2;
+  const bool HasLocalEntry = (StOther & ELF::STO_PPC64_LOCAL_MASK) != 0;
+  return HasLocalEntry;
 }
 
 MCObjectWriter *llvm::createPPCELFObjectWriter(raw_ostream &OS,

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 68de8c1..ae43e59 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PPC_PPCFIXUPKINDS_H
-#define LLVM_PPC_PPCFIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCFIXUPKINDS_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCFIXUPKINDS_H
 
 #include "llvm/MC/MCFixup.h"
 

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index b95a2ac..893aae3 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp

@@ -42,9 +42,9 @@
   UseIntegratedAssembler = true;
 }
 
-void PPCLinuxMCAsmInfo::anchor() { }
+void PPCELFMCAsmInfo::anchor() { }
 
-PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit, const Triple& T) {
+PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
   if (is64Bit) {
     PointerSize = CalleeSaveStackSlotSize = 8;
   }
@@ -64,7 +64,6 @@
   DollarIsPC = true;
 
   // Set up DWARF directives
-  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
   MinInstAlignment = 4;
 
   // Exceptions handling
@@ -73,6 +72,7 @@
   ZeroDirective = "\t.space\t";
   Data64bitsDirective = is64Bit ? "\t.quad\t" : nullptr;
   AssemblerDialect = 1;           // New-Style mnemonics.
+  LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
 
   if (T.getOS() == llvm::Triple::FreeBSD ||
       (T.getOS() == llvm::Triple::NetBSD && !is64Bit) ||

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 754330b..9f0294d 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef PPCTARGETASMINFO_H
-#define PPCTARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
@@ -26,10 +26,10 @@
     explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&);
   };
 
-  class PPCLinuxMCAsmInfo : public MCAsmInfoELF {
+  class PPCELFMCAsmInfo : public MCAsmInfoELF {
     void anchor() override;
   public:
-    explicit PPCLinuxMCAsmInfo(bool is64Bit, const Triple&);
+    explicit PPCELFMCAsmInfo(bool is64Bit, const Triple&);
   };
 
 } // namespace llvm

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 435a93f..786b7fe 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp

@@ -66,6 +66,15 @@
   unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
                              SmallVectorImpl<MCFixup> &Fixups,
                              const MCSubtargetInfo &STI) const;
+  unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+  unsigned getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
   unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
                              SmallVectorImpl<MCFixup> &Fixups,
                              const MCSubtargetInfo &STI) const;
@@ -260,6 +269,54 @@
 }
 
 
+unsigned
+PPCMCCodeEmitter::getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  // A spe8dis operand is the pair (imm, reg): operand OpNo holds the
+  // immediate and operand OpNo+1 the register.  The displacement field is
+  // the low 5 bits of (imm / 8); the register number takes the next 5 bits.
+  const MCOperand &RegOp = MI.getOperand(OpNo + 1);
+  assert(RegOp.isReg());
+  uint32_t RegBits = getMachineOpValue(MI, RegOp, Fixups, STI) << 5;
+
+  const MCOperand &ImmOp = MI.getOperand(OpNo);
+  assert(ImmOp.isImm());
+  uint32_t Imm = getMachineOpValue(MI, ImmOp, Fixups, STI) >> 3;
+
+  // Return the ten low bits of the packed value in reversed bit order.
+  uint32_t Packed = Imm | RegBits;
+  return reverseBits(Packed) >> 22;
+}
+
+
+unsigned
+PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  // A spe4dis operand is the pair (imm, reg): operand OpNo holds the
+  // immediate and operand OpNo+1 the register.  The displacement field is
+  // the low 5 bits of (imm / 4); the register number takes the next 5 bits.
+  const MCOperand &RegOp = MI.getOperand(OpNo + 1);
+  assert(RegOp.isReg());
+  uint32_t RegBits = getMachineOpValue(MI, RegOp, Fixups, STI) << 5;
+
+  const MCOperand &ImmOp = MI.getOperand(OpNo);
+  assert(ImmOp.isImm());
+  uint32_t Imm = getMachineOpValue(MI, ImmOp, Fixups, STI) >> 2;
+
+  // Return the ten low bits of the packed value in reversed bit order.
+  uint32_t Packed = Imm | RegBits;
+  return reverseBits(Packed) >> 22;
+}
+
+
+unsigned
+PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+  // A spe2dis operand is the pair (imm, reg): operand OpNo holds the
+  // immediate and operand OpNo+1 the register.  The displacement field is
+  // the low 5 bits of (imm / 2); the register number takes the next 5 bits.
+  const MCOperand &RegOp = MI.getOperand(OpNo + 1);
+  assert(RegOp.isReg());
+  uint32_t RegBits = getMachineOpValue(MI, RegOp, Fixups, STI) << 5;
+
+  const MCOperand &ImmOp = MI.getOperand(OpNo);
+  assert(ImmOp.isImm());
+  uint32_t Imm = getMachineOpValue(MI, ImmOp, Fixups, STI) >> 1;
+
+  // Return the ten low bits of the packed value in reversed bit order.
+  uint32_t Packed = Imm | RegBits;
+  return reverseBits(Packed) >> 22;
+}
+
+
 unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
                                        SmallVectorImpl<MCFixup> &Fixups,
                                        const MCSubtargetInfo &STI) const {

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 3ac0aca..7204bef 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp

@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "PPCFixupKinds.h"
 #include "PPCMCExpr.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
@@ -52,40 +53,56 @@
 }
 
 bool
-PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
-                                     const MCAsmLayout *Layout) const {
+PPCMCExpr::EvaluateAsConstant(int64_t &Res) const {
   MCValue Value;
 
-  if (!getSubExpr()->EvaluateAsRelocatable(Value, Layout))
+  if (!getSubExpr()->EvaluateAsRelocatable(Value, nullptr, nullptr))
+    return false;
+
+  if (!Value.isAbsolute())
+    return false;
+
+  Res = EvaluateAsInt64(Value.getConstant());
+  return true;
+}
+
+int64_t
+PPCMCExpr::EvaluateAsInt64(int64_t Value) const {
+  // Applies this expression's variant kind to an already-evaluated constant.
+  // Every kind selects one 16-bit field of Value; the kinds ending in "A"
+  // add 0x8000 to Value before the field is extracted.  VK_PPC_None has no
+  // meaning here and falls through to the unreachable below.
+  auto Field = [](int64_t V, unsigned Shift) -> int64_t {
+    return (V >> Shift) & 0xffff;
+  };
+
+  switch (Kind) {
+    case VK_PPC_None:
+      break;
+    case VK_PPC_LO:
+      return Field(Value, 0);
+    case VK_PPC_HI:
+      return Field(Value, 16);
+    case VK_PPC_HA:
+      return Field(Value + 0x8000, 16);
+    case VK_PPC_HIGHER:
+      return Field(Value, 32);
+    case VK_PPC_HIGHERA:
+      return Field(Value + 0x8000, 32);
+    case VK_PPC_HIGHEST:
+      return Field(Value, 48);
+    case VK_PPC_HIGHESTA:
+      return Field(Value + 0x8000, 48);
+  }
+  llvm_unreachable("Invalid kind!");
+}
+
+bool
+PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                     const MCAsmLayout *Layout,
+                                     const MCFixup *Fixup) const {
+  MCValue Value;
+
+  if (!getSubExpr()->EvaluateAsRelocatable(Value, Layout, Fixup))
     return false;
 
   if (Value.isAbsolute()) {
-    int64_t Result = Value.getConstant();
-    switch (Kind) {
-      default:
-        llvm_unreachable("Invalid kind!");
-      case VK_PPC_LO:
-        Result = Result & 0xffff;
-        break;
-      case VK_PPC_HI:
-        Result = (Result >> 16) & 0xffff;
-        break;
-      case VK_PPC_HA:
-        Result = ((Result + 0x8000) >> 16) & 0xffff;
-        break;
-      case VK_PPC_HIGHER:
-        Result = (Result >> 32) & 0xffff;
-        break;
-      case VK_PPC_HIGHERA:
-        Result = ((Result + 0x8000) >> 32) & 0xffff;
-        break;
-      case VK_PPC_HIGHEST:
-        Result = (Result >> 48) & 0xffff;
-        break;
-      case VK_PPC_HIGHESTA:
-        Result = ((Result + 0x8000) >> 48) & 0xffff;
-        break;
-    }
+    int64_t Result = EvaluateAsInt64(Value.getConstant());
+    if ((Fixup == nullptr || (unsigned)Fixup->getKind() != PPC::fixup_ppc_half16) &&
+        (Result >= 0x8000))
+      return false;
     Res = MCValue::get(Result);
   } else {
     if (!Layout)

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index bca4085..f0a6bb9 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef PPCMCEXPR_H
-#define PPCMCEXPR_H
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
 
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCExpr.h"
@@ -34,6 +34,8 @@
   const MCExpr *Expr;
   bool IsDarwin;
 
+  int64_t EvaluateAsInt64(int64_t Value) const;
+
   explicit PPCMCExpr(VariantKind _Kind, const MCExpr *_Expr,
                      bool _IsDarwin)
     : Kind(_Kind), Expr(_Expr), IsDarwin(_IsDarwin) {}
@@ -78,7 +80,8 @@
 
   void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const override;
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
   void visitUsedExpr(MCStreamer &Streamer) const override;
   const MCSection *FindAssociatedSection() const override {
     return getSubExpr()->FindAssociatedSection();
@@ -87,6 +90,8 @@
   // There are no TLS PPCMCExprs at the moment.
   void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
 
+  bool EvaluateAsConstant(int64_t &Res) const;
+
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
   }

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 7057797..00be8f4 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp

@@ -16,12 +16,16 @@
 #include "PPCMCAsmInfo.h"
 #include "PPCTargetStreamer.h"
 #include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -75,7 +79,7 @@
   if (TheTriple.isOSDarwin())
     MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple);
   else
-    MAI = new PPCLinuxMCAsmInfo(isPPC64, TheTriple);
+    MAI = new PPCELFMCAsmInfo(isPPC64, TheTriple);
 
   // Initial state of the frame pointer is R1.
   unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1;
@@ -125,11 +129,20 @@
   void emitMachine(StringRef CPU) override {
     OS << "\t.machine " << CPU << '\n';
   }
+  /// Writes a tab-indented ".abiversion <AbiVersion>" line to the output
+  /// stream.
+  void emitAbiVersion(int AbiVersion) override {
+    OS << "\t.abiversion ";
+    OS << AbiVersion;
+    OS << '\n';
+  }
+  /// Writes a ".localentry <symbol>, <offset-expression>" line to the output
+  /// stream.  Both pointers are dereferenced unconditionally.
+  void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) override {
+    OS << "\t.localentry\t";
+    OS << *S;
+    OS << ", ";
+    OS << *LocalOffset;
+    OS << '\n';
+  }
 };
 
 class PPCTargetELFStreamer : public PPCTargetStreamer {
 public:
   PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+  /// Returns the attached streamer viewed as an MCELFStreamer.
+  /// The static_cast is unchecked.
+  /// NOTE(review): presumably safe because this class is only constructed on
+  /// streamers returned by createELFStreamer -- confirm no other call sites.
+  MCELFStreamer &getStreamer() {
+    return static_cast<MCELFStreamer &>(Streamer);
+  }
   void emitTCEntry(const MCSymbol &S) override {
     // Creates a R_PPC64_TOC relocation
     Streamer.EmitSymbolValue(&S, 8);
@@ -138,6 +151,39 @@
     // FIXME: Is there anything to do in here or does this directive only
     // limit the parser?
   }
+  /// Records the ABI version in the ELF header e_flags: the EF_PPC64_ABI
+  /// bits are replaced by the matching bits of AbiVersion, and every other
+  /// flag bit is left as it was.
+  void emitAbiVersion(int AbiVersion) override {
+    MCAssembler &Assembler = getStreamer().getAssembler();
+    const unsigned KeptBits =
+        Assembler.getELFHeaderEFlags() & ~ELF::EF_PPC64_ABI;
+    const unsigned AbiBits = AbiVersion & ELF::EF_PPC64_ABI;
+    Assembler.setELFHeaderEFlags(KeptBits | AbiBits);
+  }
+  /// Handles .localentry for ELF output: stores the encoded local entry
+  /// offset in the STO_PPC64_LOCAL_MASK bits of the symbol's "other" value.
+  /// Aborts via report_fatal_error when LocalOffset is not an absolute
+  /// expression or its value does not survive an encode/decode round trip.
+  /// If no EF_PPC64_ABI bit is set in e_flags yet, e_flags is set to 2.
+  void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) override {
+    MCAssembler &Assembler = getStreamer().getAssembler();
+    MCSymbolData &SymData = getStreamer().getOrCreateSymbolData(S);
+
+    int64_t Offset;
+    if (!LocalOffset->EvaluateAsAbsolute(Offset, Assembler))
+      report_fatal_error(".localentry expression must be absolute.");
+
+    const unsigned Encoded = ELF::encodePPC64LocalEntryOffset(Offset);
+    if (ELF::decodePPC64LocalEntryOffset(Encoded) != Offset)
+      report_fatal_error(".localentry expression cannot be encoded.");
+
+    // MCELF keeps the "other" value shifted down by two bits, while the
+    // STO_* masks are defined against the full byte; shift up to edit the
+    // local-entry bits and back down to store the result.
+    unsigned StOther = MCELF::getOther(SymData) << 2;
+    StOther = (StOther & ~ELF::STO_PPC64_LOCAL_MASK) | Encoded;
+    MCELF::setOther(SymData, StOther >> 2);
+
+    // GAS compatibility: when no .abiversion directive has set the ABI bits
+    // yet, mark the object as ELFv2.
+    const unsigned EFlags = Assembler.getELFHeaderEFlags();
+    if (!(EFlags & ELF::EF_PPC64_ABI))
+      Assembler.setELFHeaderEFlags(EFlags | 2);
+  }
 };
 
 class PPCTargetMachOStreamer : public PPCTargetStreamer {
@@ -150,25 +196,27 @@
     // FIXME: We should update the CPUType, CPUSubType in the Object file if
     // the new values are different from the defaults.
   }
+  // The Mach-O target streamer does not implement .abiversion; reaching this
+  // override is treated as a programming error (llvm_unreachable).
+  void emitAbiVersion(int AbiVersion) override {
+    llvm_unreachable("Unknown pseudo-op: .abiversion");
+  }
+  // The Mach-O target streamer does not implement .localentry; reaching this
+  // override is treated as a programming error (llvm_unreachable).
+  void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) override {
+    llvm_unreachable("Unknown pseudo-op: .localentry");
+  }
 };
 }
 
 // This is duplicated code. Refactor this.
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCContext &Ctx, MCAsmBackend &MAB,
-                                    raw_ostream &OS,
-                                    MCCodeEmitter *Emitter,
-                                    const MCSubtargetInfo &STI,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    const MCSubtargetInfo &STI, bool RelaxAll) {
   if (Triple(TT).isOSDarwin()) {
     MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
     new PPCTargetMachOStreamer(*S);
     return S;
   }
 
-  MCStreamer *S =
-      createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+  MCStreamer *S = createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
   new PPCTargetELFStreamer(*S);
   return S;
 }

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 474395b..68f7f7a 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef PPCMCTARGETDESC_H
-#define PPCMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H
 
 // GCC #defines PPC on Linux but we use it as our namespace name
 #undef PPC

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index cff27ba..df2f14a 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp

@@ -80,7 +80,7 @@
 }
 
 /// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum.
-/// Outline based on PPCELFObjectWriter::getRelocTypeInner().
+/// Outline based on PPCELFObjectWriter::GetRelocType().
 static unsigned getRelocType(const MCValue &Target,
                              const MCFixupKind FixupKind, // from
                                                           // Fixup.getKind()
@@ -360,7 +360,7 @@
       // For external relocations, make sure to offset the fixup value to
       // compensate for the addend of the symbol address, if it was
       // undefined. This occurs with weak definitions, for example.
-      if (!SD->Symbol->isUndefined())
+      if (!SD->getSymbol().isUndefined())
         FixedValue -= Layout.getSymbolOffset(SD);
     } else {
       // The index is the section ordinal (1-based).

diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index 10e328a..6075631 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H
-#define LLVM_TARGET_POWERPC_PPCPREDICATES_H
+#ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCPREDICATES_H
+#define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCPREDICATES_H
 
 // GCC #defines PPC on Linux but we use it as our namespace name
 #undef PPC

diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile
index c966748..cf516f4 100644
--- a/lib/Target/PowerPC/Makefile
+++ b/lib/Target/PowerPC/Makefile

@@ -13,7 +13,7 @@
 
 # Make sure that tblgen is run, first thing.
 BUILT_SOURCES = PPCGenRegisterInfo.inc PPCGenAsmMatcher.inc \
-                PPCGenAsmWriter.inc  PPCGenCodeEmitter.inc \
+                PPCGenAsmWriter.inc  \
                 PPCGenInstrInfo.inc PPCGenDAGISel.inc \
                 PPCGenSubtargetInfo.inc PPCGenCallingConv.inc \
                 PPCGenMCCodeEmitter.inc PPCGenFastISel.inc \

diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index c42c5be..8fb33df 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_POWERPC_H
-#define LLVM_TARGET_POWERPC_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPC_H
+#define LLVM_LIB_TARGET_POWERPC_PPC_H
 
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include <string>
@@ -26,7 +26,6 @@
   class PassRegistry;
   class FunctionPass;
   class ImmutablePass;
-  class JITCodeEmitter;
   class MachineInstr;
   class AsmPrinter;
   class MCInst;
@@ -41,8 +40,6 @@
   FunctionPass *createPPCVSXFMAMutatePass();
   FunctionPass *createPPCBranchSelectionPass();
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
-  FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
-                                            JITCodeEmitter &MCE);
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                     AsmPrinter &AP, bool isDarwin);
 
@@ -60,10 +57,11 @@
     // PPC Specific MachineOperand flags.
     MO_NO_FLAG,
     
-    /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
-    /// reference is actually to the "FOO$stub" symbol.  This is used for calls
-    /// and jumps to external functions on Tiger and earlier.
-    MO_DARWIN_STUB = 1,
+    /// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$stub" or "FOO@plt" symbol.  This is
+    /// used for calls and jumps to external functions on Tiger and earlier, and
+    /// for PIC calls on Linux and ELF systems.
+    MO_PLT_OR_STUB = 1,
     
     /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
     /// the function's picbase, e.g. lo16(symbol-picbase).
@@ -95,7 +93,12 @@
     MO_TOC_LO    = 7 << 4,
 
     // Symbol for VK_PPC_TLS fixup attached to an ADD instruction
-    MO_TLS       = 8 << 4
+    MO_TLS       = 8 << 4,
+
+    // Symbols for VK_PPC_TLSGD and VK_PPC_TLSLD in __tls_get_addr
+    // call sequences.
+    MO_TLSLD     = 9 << 4,
+    MO_TLSGD     = 10 << 4
   };
   } // end namespace PPCII
   

diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index a9842b2..46d56a4 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td

@@ -56,6 +56,8 @@
                               "Use condition-register bits individually">;
 def FeatureAltivec   : SubtargetFeature<"altivec","HasAltivec", "true",
                                         "Enable Altivec instructions">;
+def FeatureSPE       : SubtargetFeature<"spe","HasSPE", "true",
+                                        "Enable SPE instructions">;
 def FeatureMFOCRF    : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
                                         "Enable the MFOCRF instruction">;
 def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
@@ -88,11 +90,23 @@
                                         "Enable the ldbrx instruction">;
 def FeatureBookE     : SubtargetFeature<"booke", "IsBookE", "true",
                                         "Enable Book E instructions">;
+def FeatureMSYNC     : SubtargetFeature<"msync", "HasOnlyMSYNC", "true",
+                              "Has only the msync instruction instead of sync",
+                              [FeatureBookE]>;
+def FeatureE500      : SubtargetFeature<"e500", "IsE500", "true",
+                                        "Enable E500/E500mc instructions">;
+def FeaturePPC4xx    : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true",
+                                        "Enable PPC 4xx instructions">;
+def FeaturePPC6xx    : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true",
+                                        "Enable PPC 6xx instructions">;
 def FeatureQPX       : SubtargetFeature<"qpx","HasQPX", "true",
                                         "Enable QPX instructions">;
 def FeatureVSX       : SubtargetFeature<"vsx","HasVSX", "true",
                                         "Enable VSX instructions",
                                         [FeatureAltivec]>;
+def FeatureP8Vector  : SubtargetFeature<"power8-vector", "HasP8Vector", "true",
+                                        "Enable POWER8 vector instructions",
+                                        [FeatureVSX, FeatureAltivec]>;
 
 def DeprecatedMFTB   : SubtargetFeature<"", "DeprecatedMFTB", "true",
                                         "Treat mftb as deprecated">;
@@ -105,7 +119,16 @@
 // CMPB         p6, p6x, p7        cmpb
 // DFP          p6, p6x, p7        decimal floating-point instructions
 // POPCNTB      p5 through p7      popcntb and related instructions
-// VSX          p7                 vector-scalar instruction set
+
+//===----------------------------------------------------------------------===//
+// ABI Selection                                                              //
+//===----------------------------------------------------------------------===//
+
+def FeatureELFv1 : SubtargetFeature<"elfv1", "TargetABI", "PPC_ABI_ELFv1",
+                                    "Use the ELFv1 ABI">;
+
+def FeatureELFv2 : SubtargetFeature<"elfv2", "TargetABI", "PPC_ABI_ELFv2",
+                                    "Use the ELFv2 ABI">;
 
 //===----------------------------------------------------------------------===//
 // Classes used for relation maps.
@@ -178,10 +201,12 @@
 def : Processor<"generic", G3Itineraries, [Directive32]>;
 def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL,
                                           FeatureFRES, FeatureFRSQRTE,
-                                          FeatureBookE, DeprecatedMFTB]>;
+                                          FeatureBookE, FeatureMSYNC,
+                                          DeprecatedMFTB]>;
 def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
                                           FeatureFRES, FeatureFRSQRTE,
-                                          FeatureBookE, DeprecatedMFTB]>;
+                                          FeatureBookE, FeatureMSYNC,
+                                          DeprecatedMFTB]>;
 def : Processor<"601", G3Itineraries, [Directive601]>;
 def : Processor<"602", G3Itineraries, [Directive602]>;
 def : Processor<"603", G3Itineraries, [Directive603,

diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index fd044d9..5648873 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp

@@ -18,6 +18,7 @@
 
 #include "PPC.h"
 #include "InstPrinter/PPCInstPrinter.h"
+#include "PPCMachineFunctionInfo.h"
 #include "MCTargetDesc/PPCMCExpr.h"
 #include "MCTargetDesc/PPCPredicates.h"
 #include "PPCSubtarget.h"
@@ -27,10 +28,12 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
@@ -100,9 +103,11 @@
     }
 
     bool doFinalization(Module &M) override;
+    void EmitStartOfAsmFile(Module &M) override;
 
     void EmitFunctionEntryLabel() override;
 
+    void EmitFunctionBodyStart() override;
     void EmitFunctionBodyEnd() override;
   };
 
@@ -142,7 +147,7 @@
 
 void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
                                  raw_ostream &O) {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   const MachineOperand &MO = MI->getOperand(OpNo);
   
   switch (MO.getType()) {
@@ -270,6 +275,18 @@
         printOperand(MI, OpNo, O);
         return false;
       }
+    case 'U': // Print 'u' for update form.
+    case 'X': // Print 'x' for indexed form.
+      {
+	// FIXME: Currently for PowerPC memory operands are always loaded
+	// into a register, so we never get an update or indexed form.
+	// This is bad even for offset forms, since even if we know we
+	// have a value in -16(r1), we will generate a load into r<n>
+	// and then load from 0(r<n>).  Until that issue is fixed,
+	// tolerate 'U' and 'X' but don't output anything.
+	assert(MI->getOperand(OpNo).isReg());
+	return false;
+      }
     }
   }
 
@@ -285,7 +302,7 @@
 /// exists for it.  If not, create one.  Then return a symbol that references
 /// the TOC entry.
 MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   MCSymbol *&TOCEntry = TOC[Sym];
 
   // To avoid name clash check if the name already exists.
@@ -306,12 +323,35 @@
 void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   MCInst TmpInst;
   bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwin = Triple(TM.getTargetTriple()).isOSDarwin();
+  const Module *M = MF->getFunction()->getParent();
+  PICLevel::Level PL = M->getPICLevel();
   
   // Lower multi-instruction pseudo operations.
   switch (MI->getOpcode()) {
   default: break;
   case TargetOpcode::DBG_VALUE:
     llvm_unreachable("Should be handled target independently");
+  case PPC::MoveGOTtoLR: {
+    // Transform:  %LR = MoveGOTtoLR
+    // Into:       bl _GLOBAL_OFFSET_TABLE_@local-4
+    // The word at _GLOBAL_OFFSET_TABLE_@local-4, i.e. the one immediately
+    // before _GLOBAL_OFFSET_TABLE_, is expected to hold a single instruction:
+    //      blrl
+    // so the call returns with LR pointing at _GLOBAL_OFFSET_TABLE_@local.
+    MCSymbol *GOTSymbol =
+      OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+    const MCExpr *OffsExpr =
+      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(GOTSymbol,
+                                                      MCSymbolRefExpr::VK_PPC_LOCAL,
+                                                      OutContext),
+                              MCConstantExpr::Create(4, OutContext),
+                              OutContext);
+
+    // Emit 'bl' whose target is OffsExpr (_GLOBAL_OFFSET_TABLE_@local - 4).
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL).addExpr(OffsExpr));
+    return;
+  }
   case PPC::MovePCtoLR:
   case PPC::MovePCtoLR8: {
     // Transform %LR = MovePCtoLR
@@ -330,19 +370,49 @@
     OutStreamer.EmitLabel(PICBase);
     return;
   }
-  case PPC::LDtocJTI:
-  case PPC::LDtocCPT:
-  case PPC::LDtoc: {
-    // Transform %X3 = LDtoc <ga:@min1>, %X2
-    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+  case PPC::UpdateGBR: {
+    // Transform %Rd = UpdateGBR(%Rt, %Ri)
+    // Into: lwz %Rt, .L0$poff - .L0$pb(%Ri)
+    //       add %Rd, %Rt, %Ri
+    // Get the offset from the GOT Base Register to the GOT
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+    MCSymbol *PICOffset =
+      MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
+    TmpInst.setOpcode(PPC::LWZ);
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
+    const MCExpr *PB =
+      MCSymbolRefExpr::Create(MF->getPICBaseSymbol(),
+                              MCSymbolRefExpr::VK_None,
+                              OutContext);
+    const MCOperand TR = TmpInst.getOperand(1);
+    const MCOperand PICR = TmpInst.getOperand(0);
 
-    // Change the opcode to LD, and the global address operand to be a
-    // reference to the TOC entry we will synthesize later.
-    TmpInst.setOpcode(PPC::LD);
+    // Step 1: lwz %Rt, .L$poff - .L$pb(%Ri)
+    TmpInst.getOperand(1) =
+        MCOperand::CreateExpr(MCBinaryExpr::CreateSub(Exp, PB, OutContext));
+    TmpInst.getOperand(0) = TR;
+    TmpInst.getOperand(2) = PICR;
+    EmitToStreamer(OutStreamer, TmpInst);
+
+    TmpInst.setOpcode(PPC::ADD4);
+    TmpInst.getOperand(0) = PICR;
+    TmpInst.getOperand(1) = TR;
+    TmpInst.getOperand(2) = PICR;
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::LWZtoc: {
+    // Transform %R3 = LWZtoc <ga:@min1>, %R2
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+    // Change the opcode to LWZ, and the global address operand to be a
+    // reference to the GOT entry we will synthesize later.
+    TmpInst.setOpcode(PPC::LWZ);
     const MachineOperand &MO = MI->getOperand(1);
 
     // Map symbol -> label of TOC entry
-    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
     MCSymbol *MOSymbol = nullptr;
     if (MO.isGlobal())
       MOSymbol = getSymbol(MO.getGlobal());
@@ -350,6 +420,52 @@
       MOSymbol = GetCPISymbol(MO.getIndex());
     else if (MO.isJTI())
       MOSymbol = GetJTISymbol(MO.getIndex());
+    else if (MO.isBlockAddress())
+      MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+
+    if (PL == PICLevel::Small) {
+      const MCExpr *Exp =
+        MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_GOT,
+                                OutContext);
+      TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
+    } else {
+      MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+
+      const MCExpr *Exp =
+        MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_None,
+                                OutContext);
+      const MCExpr *PB =
+        MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".LTOC")),
+                                                             OutContext);
+      Exp = MCBinaryExpr::CreateSub(Exp, PB, OutContext);
+      TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
+    }
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case PPC::LDtocJTI:
+  case PPC::LDtocCPT:
+  case PPC::LDtocBA:
+  case PPC::LDtoc: {
+    // Transform %X3 = LDtoc <ga:@min1>, %X2
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
+
+    // Change the opcode to LD, and the global address operand to be a
+    // reference to the TOC entry we will synthesize later.
+    TmpInst.setOpcode(PPC::LD);
+    const MachineOperand &MO = MI->getOperand(1);
+
+    // Map symbol -> label of TOC entry
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress());
+    MCSymbol *MOSymbol = nullptr;
+    if (MO.isGlobal())
+      MOSymbol = getSymbol(MO.getGlobal());
+    else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = GetJTISymbol(MO.getIndex());
+    else if (MO.isBlockAddress())
+      MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
 
     MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
 
@@ -363,7 +479,7 @@
       
   case PPC::ADDIStocHA: {
     // Transform %Xd = ADDIStocHA %X2, <ga:@sym>
-    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
 
     // Change the opcode to ADDIS8.  If the global address is external, has
     // common linkage, is a non-local function address, or is a jump table
@@ -371,7 +487,8 @@
     // reference the symbol directly.
     TmpInst.setOpcode(PPC::ADDIS8);
     const MachineOperand &MO = MI->getOperand(2);
-    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI()) &&
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
+            MO.isBlockAddress()) &&
            "Invalid operand for ADDIStocHA!");
     MCSymbol *MOSymbol = nullptr;
     bool IsExternal = false;
@@ -391,9 +508,12 @@
       MOSymbol = GetCPISymbol(MO.getIndex());
     else if (MO.isJTI())
       MOSymbol = GetJTISymbol(MO.getIndex());
+    else if (MO.isBlockAddress())
+      MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
 
     if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt ||
-        MO.isJTI() || TM.getCodeModel() == CodeModel::Large)
+        MO.isJTI() || MO.isBlockAddress() ||
+        TM.getCodeModel() == CodeModel::Large)
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
 
     const MCExpr *Exp =
@@ -405,19 +525,24 @@
   }
   case PPC::LDtocL: {
     // Transform %Xd = LDtocL <ga:@sym>, %Xs
-    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
 
     // Change the opcode to LD.  If the global address is external, has
     // common linkage, or is a jump table address, then reference the
     // associated TOC entry.  Otherwise reference the symbol directly.
     TmpInst.setOpcode(PPC::LD);
     const MachineOperand &MO = MI->getOperand(1);
-    assert((MO.isGlobal() || MO.isJTI() || MO.isCPI()) &&
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() ||
+            MO.isBlockAddress()) &&
            "Invalid operand for LDtocL!");
     MCSymbol *MOSymbol = nullptr;
 
     if (MO.isJTI())
       MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
+    else if (MO.isBlockAddress()) {
+      MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress());
+      MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+    }
     else if (MO.isCPI()) {
       MOSymbol = GetCPISymbol(MO.getIndex());
       if (TM.getCodeModel() == CodeModel::Large)
@@ -442,7 +567,7 @@
   }
   case PPC::ADDItocL: {
     // Transform %Xd = ADDItocL %Xs, <ga:@sym>
-    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
 
     // Change the opcode to ADDI8.  If the global address is external, then
     // generate a TOC entry and reference that.  Otherwise reference the
@@ -493,7 +618,7 @@
   case PPC::LDgotTprelL:
   case PPC::LDgotTprelL32: {
     // Transform %Xd = LDgotTprelL <ga:@sym>, %Xs
-    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
 
     // Change the opcode to LD.
     TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ);
@@ -508,6 +633,34 @@
     return;
   }
 
+  case PPC::PPC32PICGOT: { // Expands to: bl; inline GOT offset; mflr/lwz/add.
+    MCSymbol *GOTSymbol = OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
+    MCSymbol *GOTRef = OutContext.CreateTempSymbol();
+    MCSymbol *NextInstr = OutContext.CreateTempSymbol();
+    // 'bl NextInstr' branches over the 4-byte data word emitted right after it.
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL)
+      // FIXME: We would like an efficient form for this, so we don't have to do
+      // a lot of extra uniquing.
+      .addExpr(MCSymbolRefExpr::Create(NextInstr, OutContext)));
+    const MCExpr *OffsExpr =
+      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(GOTSymbol, OutContext),
+                                MCSymbolRefExpr::Create(GOTRef, OutContext),
+        OutContext);
+    OutStreamer.EmitLabel(GOTRef);      // GOTRef: labels the data word
+    OutStreamer.EmitValue(OffsExpr, 4); // 4 bytes: _GLOBAL_OFFSET_TABLE_ - GOTRef
+    OutStreamer.EmitLabel(NextInstr);   // NextInstr: where the 'bl' lands
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR) // mflr <op0>
+                                .addReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LWZ) // lwz <op1>, 0(<op0>)
+                                .addReg(MI->getOperand(1).getReg())
+                                .addImm(0)
+                                .addReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADD4) // add <op0>, <op1>, <op0>
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(MI->getOperand(1).getReg())
+                                .addReg(MI->getOperand(0).getReg()));
+    return;
+  }
   case PPC::PPC32GOT: {
     MCSymbol *GOTSymbol = OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
     const MCExpr *SymGotTlsL =
@@ -541,40 +694,25 @@
                                 .addExpr(SymGotTlsGD));
     return;
   }
-  case PPC::ADDItlsgdL: {
+  case PPC::ADDItlsgdL:
     // Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym>
     // Into:      %Xd = ADDI8 %Xs, sym@got@tlsgd@l
-    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+  case PPC::ADDItlsgdL32: {
+    // Transform: %Rd = ADDItlsgdL32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDI %Rs, sym@got@tlsgd
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsGD =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO,
+      MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ?
+                                         MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO :
+                                         MCSymbolRefExpr::VK_PPC_GOT_TLSGD,
                               OutContext);
-    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI8)
-                                .addReg(MI->getOperand(0).getReg())
-                                .addReg(MI->getOperand(1).getReg())
-                                .addExpr(SymGotTlsGD));
-    return;
-  }
-  case PPC::GETtlsADDR: {
-    // Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
-    // Into:      BL8_NOP_TLS __tls_get_addr(sym@tlsgd)
-    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
-
-    StringRef Name = "__tls_get_addr";
-    MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name);
-    const MCSymbolRefExpr *TlsRef = 
-      MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
-    const MachineOperand &MO = MI->getOperand(2);
-    const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymVar =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD,
-                              OutContext);
-    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP_TLS)
-                                .addExpr(TlsRef)
-                                .addExpr(SymVar));
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                   .addReg(MI->getOperand(0).getReg())
+                   .addReg(MI->getOperand(1).getReg())
+                   .addExpr(SymGotTlsGD));
     return;
   }
   case PPC::ADDIStlsldHA: {
@@ -593,72 +731,63 @@
                                 .addExpr(SymGotTlsLD));
     return;
   }
-  case PPC::ADDItlsldL: {
+  case PPC::ADDItlsldL:
     // Transform: %Xd = ADDItlsldL %Xs, <ga:@sym>
     // Into:      %Xd = ADDI8 %Xs, sym@got@tlsld@l
-    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+  case PPC::ADDItlsldL32: {
+    // Transform: %Rd = ADDItlsldL32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDI %Rs, sym@got@tlsld
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsLD =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO,
+      MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ?
+                                         MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO :
+                                         MCSymbolRefExpr::VK_PPC_GOT_TLSLD,
                               OutContext);
-    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI8)
-                                .addReg(MI->getOperand(0).getReg())
-                                .addReg(MI->getOperand(1).getReg())
-                                .addExpr(SymGotTlsLD));
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                   .addReg(MI->getOperand(0).getReg())
+                   .addReg(MI->getOperand(1).getReg())
+                   .addExpr(SymGotTlsLD));
     return;
   }
-  case PPC::GETtlsldADDR: {
-    // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
-    // Into:      BL8_NOP_TLS __tls_get_addr(sym@tlsld)
-    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
-
-    StringRef Name = "__tls_get_addr";
-    MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name);
-    const MCSymbolRefExpr *TlsRef = 
-      MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
-    const MachineOperand &MO = MI->getOperand(2);
-    const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymVar =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSLD,
-                              OutContext);
-    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP_TLS)
-                                .addExpr(TlsRef)
-                                .addExpr(SymVar));
-    return;
-  }
-  case PPC::ADDISdtprelHA: {
+  case PPC::ADDISdtprelHA:
     // Transform: %Xd = ADDISdtprelHA %X3, <ga:@sym>
     // Into:      %Xd = ADDIS8 %X3, sym@dtprel@ha
-    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+  case PPC::ADDISdtprelHA32: {
+    // Transform: %Rd = ADDISdtprelHA32 %R3, <ga:@sym>
+    // Into:      %Rd = ADDIS %R3, sym@dtprel@ha
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymDtprel =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
                               OutContext);
-    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8)
-                                .addReg(MI->getOperand(0).getReg())
-                                .addReg(PPC::X3)
-                                .addExpr(SymDtprel));
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDIS8 : PPC::ADDIS)
+                   .addReg(MI->getOperand(0).getReg())
+                   .addReg(Subtarget.isPPC64() ? PPC::X3 : PPC::R3)
+                   .addExpr(SymDtprel));
     return;
   }
-  case PPC::ADDIdtprelL: {
+  case PPC::ADDIdtprelL:
     // Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym>
     // Into:      %Xd = ADDI8 %Xs, sym@dtprel@l
-    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+  case PPC::ADDIdtprelL32: {
+    // Transform: %Rd = ADDIdtprelL32 %Rs, <ga:@sym>
+    // Into:      %Rd = ADDI %Rs, sym@dtprel@l
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymDtprel =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
                               OutContext);
-    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI8)
-                                .addReg(MI->getOperand(0).getReg())
-                                .addReg(MI->getOperand(1).getReg())
-                                .addExpr(SymDtprel));
+    EmitToStreamer(OutStreamer,
+                   MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI)
+                   .addReg(MI->getOperand(0).getReg())
+                   .addReg(MI->getOperand(1).getReg())
+                   .addExpr(SymDtprel));
     return;
   }
   case PPC::MFOCRF:
@@ -713,14 +842,77 @@
   }
   }
 
-  LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+  LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin);
   EmitToStreamer(OutStreamer, TmpInst);
 }
 
+void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) { // Start-of-file hook.
+  if (Subtarget.isELFv2ABI()) { // ELFv2 only: announce ABI version 2.
+    PPCTargetStreamer *TS =
+      static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
+    // TS is null-checked, so nothing is emitted without a target streamer.
+    if (TS)
+      TS->emitAbiVersion(2);
+  }
+
+  if (Subtarget.isPPC64() || TM.getRelocationModel() != Reloc::PIC_)
+    return AsmPrinter::EmitStartOfAsmFile(M); // 64-bit or non-PIC: base only.
+
+  if (M.getPICLevel() == PICLevel::Small)
+    return AsmPrinter::EmitStartOfAsmFile(M); // Small PIC level: base only.
+  // Reached only for 32-bit PIC, PIC level != Small; the base is not called.
+  OutStreamer.SwitchSection(OutContext.getELFSection(".got2",
+         ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
+         SectionKind::getReadOnly()));
+  // NOTE(review): above, SHF_WRITE is paired with getReadOnly() - confirm.
+  MCSymbol *TOCSym = OutContext.GetOrCreateSymbol(Twine(".LTOC"));
+  MCSymbol *CurrentPos = OutContext.CreateTempSymbol();
+
+  OutStreamer.EmitLabel(CurrentPos); // marks the current position in .got2
+
+  // The GOT pointer points to the middle of the GOT, in order to reference the
+  // entire 64kB range.  0x8000 is the midpoint.
+  const MCExpr *tocExpr =
+    MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(CurrentPos, OutContext),
+                            MCConstantExpr::Create(0x8000, OutContext),
+                            OutContext);
+
+  OutStreamer.EmitAssignment(TOCSym, tocExpr); // .LTOC = CurrentPos + 0x8000
+
+  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+}
+
 void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
-  if (!Subtarget.isPPC64())  // linux/ppc32 - Normal entry label.
+  // linux/ppc32 - Normal entry label.
+  if (!Subtarget.isPPC64() && 
+      (TM.getRelocationModel() != Reloc::PIC_ || 
+       MF->getFunction()->getParent()->getPICLevel() == PICLevel::Small))
     return AsmPrinter::EmitFunctionEntryLabel();
-    
+
+  if (!Subtarget.isPPC64()) {
+    const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
+  	if (PPCFI->usesPICBase()) {
+      MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
+      MCSymbol *PICBase = MF->getPICBaseSymbol();
+      OutStreamer.EmitLabel(RelocSymbol);
+
+      const MCExpr *OffsExpr =
+        MCBinaryExpr::CreateSub(
+          MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".LTOC")),
+                                                               OutContext),
+                                  MCSymbolRefExpr::Create(PICBase, OutContext),
+          OutContext);
+      OutStreamer.EmitValue(OffsExpr, 4);
+      OutStreamer.EmitLabel(CurrentFnSym);
+      return;
+    } else
+      return AsmPrinter::EmitFunctionEntryLabel();
+  }
+
+  // ELFv2 ABI - Normal entry label.
+  if (Subtarget.isELFv2ABI())
+    return AsmPrinter::EmitFunctionEntryLabel();
+
   // Emit an official procedure descriptor.
   MCSectionSubPair Current = OutStreamer.getCurrentSection();
   const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".opd",
@@ -752,15 +944,22 @@
 
 
 bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
-  const DataLayout *TD = TM.getDataLayout();
+  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
 
   bool isPPC64 = TD->getPointerSizeInBits() == 64;
 
   PPCTargetStreamer &TS =
       static_cast<PPCTargetStreamer &>(*OutStreamer.getTargetStreamer());
 
-  if (isPPC64 && !TOC.empty()) {
-    const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".toc",
+  if (!TOC.empty()) {
+    const MCSectionELF *Section;
+    
+    if (isPPC64)
+      Section = OutStreamer.getContext().getELFSection(".toc",
+        ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
+        SectionKind::getReadOnly());
+	else
+      Section = OutStreamer.getContext().getELFSection(".got2",
         ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
         SectionKind::getReadOnly());
     OutStreamer.SwitchSection(Section);
@@ -768,8 +967,11 @@
     for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
          E = TOC.end(); I != E; ++I) {
       OutStreamer.EmitLabel(I->second);
-      MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName());
-      TS.emitTCEntry(*S);
+      MCSymbol *S = I->first;
+      if (isPPC64)
+        TS.emitTCEntry(*S);
+      else
+        OutStreamer.EmitSymbolValue(S, 4);
     }
   }
 
@@ -795,6 +997,68 @@
   return AsmPrinter::doFinalization(M);
 }
 
+/// EmitFunctionBodyStart - Emit the ELFv2 global-entry prefix when r2 is used.
+void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
+  // In the ELFv2 ABI, in functions that use the TOC register, we need to
+  // provide two entry points.  The ABI guarantees that when calling the
+  // local entry point, r2 is set up by the caller to contain the TOC base
+  // for this function, and when calling the global entry point, r12 is set
+  // up by the caller to hold the address of the global entry point.  We
+  // thus emit a prefix sequence along the following lines:
+  //
+  // func:
+  //         # global entry point
+  //         addis r2,r12,(.TOC.-func)@ha
+  //         addi  r2,r2,(.TOC.-func)@l
+  //         .localentry func, .-func
+  //         # local entry point, followed by function body
+  //
+  // This ensures we have r2 set up correctly while executing the function
+  // body, no matter which entry point is called.
+  if (Subtarget.isELFv2ABI()
+      // Only do all that if the function uses r2 in the first place.
+      && !MF->getRegInfo().use_empty(PPC::X2)) {
+    // A temp label marks the global entry point; it stands in for 'func' above.
+    MCSymbol *GlobalEntryLabel = OutContext.CreateTempSymbol();
+    OutStreamer.EmitLabel(GlobalEntryLabel);
+    const MCSymbolRefExpr *GlobalEntryLabelExp =
+      MCSymbolRefExpr::Create(GlobalEntryLabel, OutContext);
+    // TOCDeltaExpr = .TOC. - <global entry label>
+    MCSymbol *TOCSymbol = OutContext.GetOrCreateSymbol(StringRef(".TOC."));
+    const MCExpr *TOCDeltaExpr =
+      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(TOCSymbol, OutContext),
+                              GlobalEntryLabelExp, OutContext);
+    // First prefix instruction: addis r2, r12, <high part of TOCDeltaExpr>
+    const MCExpr *TOCDeltaHi =
+      PPCMCExpr::CreateHa(TOCDeltaExpr, false, OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS)
+                                .addReg(PPC::X2)
+                                .addReg(PPC::X12)
+                                .addExpr(TOCDeltaHi));
+    // Second prefix instruction: addi r2, r2, <low part of TOCDeltaExpr>
+    const MCExpr *TOCDeltaLo =
+      PPCMCExpr::CreateLo(TOCDeltaExpr, false, OutContext);
+    EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI)
+                                .addReg(PPC::X2)
+                                .addReg(PPC::X2)
+                                .addExpr(TOCDeltaLo));
+    // Label the local entry point, then form its offset from the global one.
+    MCSymbol *LocalEntryLabel = OutContext.CreateTempSymbol();
+    OutStreamer.EmitLabel(LocalEntryLabel);
+    const MCSymbolRefExpr *LocalEntryLabelExp =
+       MCSymbolRefExpr::Create(LocalEntryLabel, OutContext);
+    const MCExpr *LocalOffsetExp =
+      MCBinaryExpr::CreateSub(LocalEntryLabelExp,
+                              GlobalEntryLabelExp, OutContext);
+
+    PPCTargetStreamer *TS =
+      static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer());
+    // TS is null-checked: without a target streamer no local entry is emitted.
+    if (TS)
+      TS->emitLocalEntry(CurrentFnSym, LocalOffsetExp);
+  }
+}
+
 /// EmitFunctionBodyEnd - Print the traceback table before the .size
 /// directive.
 ///
@@ -886,7 +1150,8 @@
 
 void PPCDarwinAsmPrinter::
 EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
-  bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+  bool isPPC64 =
+      TM.getSubtargetImpl()->getDataLayout()->getPointerSizeInBits() == 64;
   bool isDarwin = Subtarget.isDarwin();
   
   const TargetLoweringObjectFileMachO &TLOFMacho = 
@@ -1022,7 +1287,8 @@
 
 
 bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
-  bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+  bool isPPC64 =
+      TM.getSubtargetImpl()->getDataLayout()->getPointerSizeInBits() == 64;
 
   // Darwin/PPC always uses mach-o.
   const TargetLoweringObjectFileMachO &TLOFMacho = 

diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index ee90671..41594be 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp

@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "ppc-branch-select"
@@ -64,7 +65,7 @@
 
 bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
   const PPCInstrInfo *TII =
-                static_cast<const PPCInstrInfo*>(Fn.getTarget().getInstrInfo());
+      static_cast<const PPCInstrInfo *>(Fn.getSubtarget().getInstrInfo());
   // Give the blocks of the function a dense, in-order, numbering.
   Fn.RenumberBlocks();
   BlockSizes.resize(Fn.getNumBlockIDs());

diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index ec1e34d..5f3b176 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp

@@ -214,7 +214,7 @@
 
       if (!TM)
         return true;
-      const TargetLowering *TLI = TM->getTargetLowering();
+      const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
 
       if (Function *F = CI->getCalledFunction()) {
         // Most intrinsics don't become function calls, but some might.
@@ -384,10 +384,9 @@
     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
       if (!TM)
         return true;
-      const TargetLowering *TLI = TM->getTargetLowering();
+      const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
 
-      if (TLI->supportJumpTables() &&
-          SI->getNumCases()+1 >= (unsigned) TLI->getMinimumJumpTableEntries())
+      if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
         return true;
     }
   }

diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index d48164d..cf8fee4 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td

@@ -14,9 +14,15 @@
 
 /// CCIfSubtarget - Match if the current subtarget has a feature F.
 class CCIfSubtarget<string F, CCAction A>
- : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+    : CCIf<!strconcat("static_cast<const PPCSubtarget&>"
+                       "(State.getMachineFunction().getSubtarget()).",
+                     F),
+          A>;
 class CCIfNotSubtarget<string F, CCAction A>
- : CCIf<!strconcat("!State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+    : CCIf<!strconcat("!static_cast<const PPCSubtarget&>"
+                       "(State.getMachineFunction().getSubtarget()).",
+                     F),
+          A>;
 
 //===----------------------------------------------------------------------===//
 // Return Value Calling Convention
@@ -31,13 +37,18 @@
   CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
   CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
   CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+
+  // Floating point types returned as "direct" go into F1 .. F8; note that
+  // only the ELFv2 ABI fully utilizes all these registers.
+  CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
   
-  CCIfType<[f32], CCAssignToReg<[F1, F2]>>,
-  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
-  
-  // Vector types are always returned in V2.
-  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>,
-  CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>>
+  // Vector types returned as "direct" go into V2 .. V9; note that only the
+  // ELFv2 ABI fully utilizes all these registers.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
+  CCIfType<[v2f64, v2i64],
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
 ]>;
 
 
@@ -69,10 +80,12 @@
   CCIfType<[i32],  CCPromoteToType<i64>>,
   CCIfType<[i64],  CCAssignToReg<[X3, X4]>>,
   CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
-  CCIfType<[f32],  CCAssignToReg<[F1, F2]>>,
-  CCIfType<[f64],  CCAssignToReg<[F1, F2, F3, F4]>>,
-  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>,
-  CCIfType<[v2f64, v2i64], CCAssignToReg<[VSH2]>>
+  CCIfType<[f32],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f64],  CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>,
+  CCIfType<[v2f64, v2i64],
+           CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>>
 ]>;
 
 //===----------------------------------------------------------------------===//

diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
deleted file mode 100644
index 0875523..0000000
--- a/lib/Target/PowerPC/PPCCodeEmitter.cpp
+++ /dev/null

@@ -1,293 +0,0 @@
-//===-- PPCCodeEmitter.cpp - JIT Code Emitter for PowerPC -----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the PowerPC 32-bit CodeEmitter and associated machinery to
-// JIT-compile bitcode to native PowerPC.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PPC.h"
-#include "PPCRelocations.h"
-#include "PPCTargetMachine.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOptions.h"
-using namespace llvm;
-
-namespace {
-  class PPCCodeEmitter : public MachineFunctionPass {
-    TargetMachine &TM;
-    JITCodeEmitter &MCE;
-    MachineModuleInfo *MMI;
-    
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineModuleInfo>();
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-    
-    static char ID;
-    
-    /// MovePCtoLROffset - When/if we see a MovePCtoLR instruction, we record
-    /// its address in the function into this pointer.
-    void *MovePCtoLROffset;
-  public:
-    
-    PPCCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
-      : MachineFunctionPass(ID), TM(tm), MCE(mce) {}
-
-    /// getBinaryCodeForInstr - This function, generated by the
-    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
-    /// machine instructions.
-    uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
-
-    
-    MachineRelocation GetRelocation(const MachineOperand &MO,
-                                    unsigned RelocID) const;
-    
-    /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr
-    unsigned getMachineOpValue(const MachineInstr &MI,
-                               const MachineOperand &MO) const;
-
-    unsigned get_crbitm_encoding(const MachineInstr &MI, unsigned OpNo) const;
-    unsigned getDirectBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
-    unsigned getCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
-    unsigned getAbsDirectBrEncoding(const MachineInstr &MI,
-                                    unsigned OpNo) const;
-    unsigned getAbsCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
-
-    unsigned getImm16Encoding(const MachineInstr &MI, unsigned OpNo) const;
-    unsigned getMemRIEncoding(const MachineInstr &MI, unsigned OpNo) const;
-    unsigned getMemRIXEncoding(const MachineInstr &MI, unsigned OpNo) const;
-    unsigned getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const;
-    unsigned getTLSCallEncoding(const MachineInstr &MI, unsigned OpNo) const;
-
-    const char *getPassName() const override {
-      return "PowerPC Machine Code Emitter";
-    }
-
-    /// runOnMachineFunction - emits the given MachineFunction to memory
-    ///
-    bool runOnMachineFunction(MachineFunction &MF) override;
-
-    /// emitBasicBlock - emits the given MachineBasicBlock to memory
-    ///
-    void emitBasicBlock(MachineBasicBlock &MBB);
-  };
-}
-
-char PPCCodeEmitter::ID = 0;
-
-/// createPPCCodeEmitterPass - Return a pass that emits the collected PPC code
-/// to the specified MCE object.
-FunctionPass *llvm::createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
-                                                JITCodeEmitter &JCE) {
-  return new PPCCodeEmitter(TM, JCE);
-}
-
-bool PPCCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
-  assert((MF.getTarget().getRelocationModel() != Reloc::Default ||
-          MF.getTarget().getRelocationModel() != Reloc::Static) &&
-         "JIT relocation model must be set to static or default!");
-
-  MMI = &getAnalysis<MachineModuleInfo>();
-  MCE.setModuleInfo(MMI);
-  do {
-    MovePCtoLROffset = nullptr;
-    MCE.startFunction(MF);
-    for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
-      emitBasicBlock(*BB);
-  } while (MCE.finishFunction(MF));
-
-  return false;
-}
-
-void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
-  MCE.StartMachineBasicBlock(&MBB);
-
-  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I){
-    const MachineInstr &MI = *I;
-    MCE.processDebugLoc(MI.getDebugLoc(), true);
-    switch (MI.getOpcode()) {
-    default:
-      MCE.emitWordBE(getBinaryCodeForInstr(MI));
-      break;
-    case TargetOpcode::CFI_INSTRUCTION:
-      break;
-    case TargetOpcode::EH_LABEL:
-      MCE.emitLabel(MI.getOperand(0).getMCSymbol());
-      break;
-    case TargetOpcode::IMPLICIT_DEF:
-    case TargetOpcode::KILL:
-      break; // pseudo opcode, no side effects
-    case PPC::MovePCtoLR:
-    case PPC::MovePCtoLR8:
-      assert(TM.getRelocationModel() == Reloc::PIC_);
-      MovePCtoLROffset = (void*)MCE.getCurrentPCValue();
-      MCE.emitWordBE(0x48000005);   // bl 1
-      break;
-    }
-    MCE.processDebugLoc(MI.getDebugLoc(), false);
-  }
-}
-
-unsigned PPCCodeEmitter::get_crbitm_encoding(const MachineInstr &MI,
-                                             unsigned OpNo) const {
-  const MachineOperand &MO = MI.getOperand(OpNo);
-  assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 ||
-          MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) &&
-         (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
-  return 0x80 >> TM.getRegisterInfo()->getEncodingValue(MO.getReg());
-}
-
-MachineRelocation PPCCodeEmitter::GetRelocation(const MachineOperand &MO, 
-                                                unsigned RelocID) const {
-  // If in PIC mode, we need to encode the negated address of the
-  // 'movepctolr' into the unrelocated field.  After relocation, we'll have
-  // &gv-&movepctolr-4 in the imm field.  Once &movepctolr is added to the imm
-  // field, we get &gv.  This doesn't happen for branch relocations, which are
-  // always implicitly pc relative.
-  intptr_t Cst = 0;
-  if (TM.getRelocationModel() == Reloc::PIC_) {
-    assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
-    Cst = -(intptr_t)MovePCtoLROffset - 4;
-  }
-  
-  if (MO.isGlobal())
-    return MachineRelocation::getGV(MCE.getCurrentPCOffset(), RelocID,
-                                    const_cast<GlobalValue *>(MO.getGlobal()),
-                                    Cst, isa<Function>(MO.getGlobal()));
-  if (MO.isSymbol())
-    return MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
-                                        RelocID, MO.getSymbolName(), Cst);
-  if (MO.isCPI())
-    return MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
-                                           RelocID, MO.getIndex(), Cst);
-
-  if (MO.isMBB())
-    return MachineRelocation::getBB(MCE.getCurrentPCOffset(),
-                                    RelocID, MO.getMBB());
-  
-  assert(MO.isJTI());
-  return MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
-                                         RelocID, MO.getIndex(), Cst);
-}
-
-unsigned PPCCodeEmitter::getDirectBrEncoding(const MachineInstr &MI,
-                                             unsigned OpNo) const {
-  const MachineOperand &MO = MI.getOperand(OpNo);
-  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
-  
-  MCE.addRelocation(GetRelocation(MO, PPC::reloc_pcrel_bx));
-  return 0;
-}
-
-unsigned PPCCodeEmitter::getCondBrEncoding(const MachineInstr &MI,
-                                           unsigned OpNo) const {
-  const MachineOperand &MO = MI.getOperand(OpNo);
-  MCE.addRelocation(GetRelocation(MO, PPC::reloc_pcrel_bcx));
-  return 0;
-}
-
-unsigned PPCCodeEmitter::getAbsDirectBrEncoding(const MachineInstr &MI,
-                                                unsigned OpNo) const {
-  const MachineOperand &MO = MI.getOperand(OpNo);
-  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
-
-  llvm_unreachable("Absolute branch relocations unsupported on the old JIT.");
-}
-
-unsigned PPCCodeEmitter::getAbsCondBrEncoding(const MachineInstr &MI,
-                                              unsigned OpNo) const {
-  llvm_unreachable("Absolute branch relocations unsupported on the old JIT.");
-}
-
-unsigned PPCCodeEmitter::getImm16Encoding(const MachineInstr &MI,
-                                          unsigned OpNo) const {
-  const MachineOperand &MO = MI.getOperand(OpNo);
-  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
-
-  unsigned RelocID;
-  switch (MO.getTargetFlags() & PPCII::MO_ACCESS_MASK) {
-    default: llvm_unreachable("Unsupported target operand flags!");
-    case PPCII::MO_LO: RelocID = PPC::reloc_absolute_low; break;
-    case PPCII::MO_HA: RelocID = PPC::reloc_absolute_high; break;
-  }
-
-  MCE.addRelocation(GetRelocation(MO, RelocID));
-  return 0;
-}
-
-unsigned PPCCodeEmitter::getMemRIEncoding(const MachineInstr &MI,
-                                          unsigned OpNo) const {
-  // Encode (imm, reg) as a memri, which has the low 16-bits as the
-  // displacement and the next 5 bits as the register #.
-  assert(MI.getOperand(OpNo+1).isReg());
-  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1)) << 16;
-  
-  const MachineOperand &MO = MI.getOperand(OpNo);
-  if (MO.isImm())
-    return (getMachineOpValue(MI, MO) & 0xFFFF) | RegBits;
-  
-  // Add a fixup for the displacement field.
-  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low));
-  return RegBits;
-}
-
-unsigned PPCCodeEmitter::getMemRIXEncoding(const MachineInstr &MI,
-                                           unsigned OpNo) const {
-  // Encode (imm, reg) as a memrix, which has the low 14-bits as the
-  // displacement and the next 5 bits as the register #.
-  assert(MI.getOperand(OpNo+1).isReg());
-  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1)) << 14;
-  
-  const MachineOperand &MO = MI.getOperand(OpNo);
-  if (MO.isImm())
-    return ((getMachineOpValue(MI, MO) >> 2) & 0x3FFF) | RegBits;
-  
-  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low_ix));
-  return RegBits;
-}
-
-
-unsigned PPCCodeEmitter::getTLSRegEncoding(const MachineInstr &MI,
-                                           unsigned OpNo) const {
-  llvm_unreachable("TLS not supported on the old JIT.");
-  return 0;
-}
-
-unsigned PPCCodeEmitter::getTLSCallEncoding(const MachineInstr &MI,
-                                            unsigned OpNo) const {
-  llvm_unreachable("TLS not supported on the old JIT.");
-  return 0;
-}
-
-unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
-                                           const MachineOperand &MO) const {
-
-  if (MO.isReg()) {
-    // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
-    // The GPR operand should come through here though.
-    assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 &&
-            MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) ||
-           MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
-    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
-  }
-  
-  assert(MO.isImm() &&
-         "Relocation required in an instruction that we cannot encode!");
-  return MO.getImm();
-}
-
-#include "PPCGenCodeEmitter.inc"

diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 92a0ec1..1149354 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp

@@ -39,7 +39,7 @@
 //===----------------------------------------------------------------------===//
 //
 // TBD:
-//   FastLowerArguments: Handle simple cases.
+//   fastLowerArguments: Handle simple cases.
 //   PPCMaterializeGV: Handle TLS.
 //   SelectCall: Handle function pointers.
 //   SelectCall: Handle multi-register return values.
@@ -92,30 +92,29 @@
   public:
     explicit PPCFastISel(FunctionLoweringInfo &FuncInfo,
                          const TargetLibraryInfo *LibInfo)
-    : FastISel(FuncInfo, LibInfo),
-      TM(FuncInfo.MF->getTarget()),
-      TII(*TM.getInstrInfo()),
-      TLI(*TM.getTargetLowering()),
-      PPCSubTarget(&TM.getSubtarget<PPCSubtarget>()),
-      Context(&FuncInfo.Fn->getContext()) { }
+        : FastISel(FuncInfo, LibInfo), TM(FuncInfo.MF->getTarget()),
+          TII(*TM.getSubtargetImpl()->getInstrInfo()),
+          TLI(*TM.getSubtargetImpl()->getTargetLowering()),
+          PPCSubTarget(&TM.getSubtarget<PPCSubtarget>()),
+          Context(&FuncInfo.Fn->getContext()) {}
 
   // Backend specific FastISel code.
   private:
-    bool TargetSelectInstruction(const Instruction *I) override;
-    unsigned TargetMaterializeConstant(const Constant *C) override;
-    unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
+    bool fastSelectInstruction(const Instruction *I) override;
+    unsigned fastMaterializeConstant(const Constant *C) override;
+    unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
     bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                              const LoadInst *LI) override;
-    bool FastLowerArguments() override;
-    unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override;
-    unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+    bool fastLowerArguments() override;
+    unsigned fastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override;
+    unsigned fastEmitInst_ri(unsigned MachineInstOpcode,
                              const TargetRegisterClass *RC,
                              unsigned Op0, bool Op0IsKill,
                              uint64_t Imm);
-    unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+    unsigned fastEmitInst_r(unsigned MachineInstOpcode,
                             const TargetRegisterClass *RC,
                             unsigned Op0, bool Op0IsKill);
-    unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+    unsigned fastEmitInst_rr(unsigned MachineInstOpcode,
                              const TargetRegisterClass *RC,
                              unsigned Op0, bool Op0IsKill,
                              unsigned Op1, bool Op1IsKill);
@@ -153,7 +152,7 @@
                            unsigned DestReg, bool IsZExt);
     unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
     unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
-    unsigned PPCMaterializeInt(const Constant *C, MVT VT);
+    unsigned PPCMaterializeInt(const Constant *C, MVT VT, bool UseSExt = true);
     unsigned PPCMaterialize32BitInt(int64_t Imm,
                                     const TargetRegisterClass *RC);
     unsigned PPCMaterialize64BitInt(int64_t Imm,
@@ -560,7 +559,7 @@
   unsigned ResultReg = 0;
   if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
     return false;
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -707,7 +706,7 @@
 
     BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
       .addImm(PPCPred).addReg(CondReg).addMBB(TBB);
-    FastEmitBranch(FBB, DbgLoc);
+    fastEmitBranch(FBB, DbgLoc);
     FuncInfo.MBB->addSuccessor(TBB);
     return true;
 
@@ -715,7 +714,7 @@
              dyn_cast<ConstantInt>(BI->getCondition())) {
     uint64_t Imm = CI->getZExtValue();
     MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
-    FastEmitBranch(Target, DbgLoc);
+    fastEmitBranch(Target, DbgLoc);
     return true;
   }
 
@@ -838,7 +837,7 @@
     return false;
 
   // No code is generated for a FP extend.
-  UpdateValueMap(I, SrcReg);
+  updateValueMap(I, SrcReg);
   return true;
 }
 
@@ -860,12 +859,12 @@
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), DestReg)
     .addReg(SrcReg);
 
-  UpdateValueMap(I, DestReg);
+  updateValueMap(I, DestReg);
   return true;
 }
 
 // Move an i32 or i64 value in a GPR to an f64 value in an FPR.
-// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// FIXME: When direct register moves are implemented (see PowerISA 2.07),
 // those should be used instead of moving via a stack slot when the
 // subtarget permits.
 // FIXME: The code here is sloppy for the 4-byte case.  Can use a 4-byte
@@ -898,10 +897,10 @@
   if (SrcVT == MVT::i32) {
     if (!IsSigned) {
       LoadOpc = PPC::LFIWZX;
-      Addr.Offset = 4;
+      Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
     } else if (PPCSubTarget->hasLFIWAX()) {
       LoadOpc = PPC::LFIWAX;
-      Addr.Offset = 4;
+      Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
     }
   }
 
@@ -979,13 +978,13 @@
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
     .addReg(FPReg);
 
-  UpdateValueMap(I, DestReg);
+  updateValueMap(I, DestReg);
   return true;
 }
 
 // Move the floating-point value in SrcReg into an integer destination
 // register, and return the register (or zero if we can't handle it).
-// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// FIXME: When direct register moves are implemented (see PowerISA 2.07),
 // those should be used instead of moving via a stack slot when the
 // subtarget permits.
 unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
@@ -1080,7 +1079,7 @@
   if (IntReg == 0)
     return false;
 
-  UpdateValueMap(I, IntReg);
+  updateValueMap(I, IntReg);
   return true;
 }
 
@@ -1169,7 +1168,7 @@
                 ResultReg)
             .addReg(SrcReg1)
             .addImm(Imm);
-        UpdateValueMap(I, ResultReg);
+        updateValueMap(I, ResultReg);
         return true;
       }
     }
@@ -1185,7 +1184,7 @@
 
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
     .addReg(SrcReg1).addReg(SrcReg2);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1200,10 +1199,12 @@
                                   unsigned &NumBytes,
                                   bool IsVarArg) {
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, *Context);
 
   // Reserve space for the linkage area on the stack.
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
+  bool isELFv2ABI = PPCSubTarget->isELFv2ABI();
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+                                                          isELFv2ABI);
   CCInfo.AllocateStack(LinkageSize, 8);
 
   CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
@@ -1232,6 +1233,7 @@
   // Because we cannot tell if this is needed on the caller side, we have to
   // conservatively assume that it is needed.  As such, make sure we have at
   // least enough stack space for the caller to store the 8 GPRs.
+  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
   NumBytes = std::max(NumBytes, LinkageSize + 64);
 
   // Issue CALLSEQ_START.
@@ -1318,7 +1320,7 @@
   // any real difficulties there.
   if (RetVT != MVT::isVoid) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context);
     CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
     CCValAssign &VA = RVLocs[0];
     assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
@@ -1364,7 +1366,7 @@
 
     assert(ResultReg && "ResultReg unset!");
     UsedRegs.push_back(SourcePhysReg);
-    UpdateValueMap(I, ResultReg);
+    updateValueMap(I, ResultReg);
   }
 }
 
@@ -1408,7 +1410,7 @@
       RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
       RetVT != MVT::f64) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, *Context);
     CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
     if (RVLocs.size() > 1)
       return false;
@@ -1498,6 +1500,10 @@
   for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
     MIB.addReg(RegArgs[II], RegState::Implicit);
 
+  // Direct calls in the ELFv2 ABI need the TOC register live into the call.
+  if (PPCSubTarget->isELFv2ABI())
+    MIB.addReg(PPC::X2, RegState::Implicit);
+
   // Add a register mask with the call-preserved registers.  Proper
   // defs for return values will be added by setPhysRegsDeadExcept().
   MIB.addRegMask(TRI.getCallPreservedMask(CC));
@@ -1531,7 +1537,7 @@
 
     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
-    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, *Context);
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, *Context);
     CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS);
     const Value *RV = Ret->getOperand(0);
     
@@ -1541,13 +1547,23 @@
 
     // Special case for returning a constant integer of any size.
     // Materialize the constant as an i64 and copy it to the return
-    // register.  This avoids an unnecessary extend or truncate.
+    // register. We still need to worry about properly extending the sign. E.g:
+    // If the constant has only one bit, it means it is a boolean. Therefore
+    // we can't use PPCMaterializeInt because it extends the sign which will
+    // cause negations of the returned value to be incorrect as they are
+    // implemented as the flip of the least significant bit.
     if (isa<ConstantInt>(*RV)) {
       const Constant *C = cast<Constant>(RV);
-      unsigned SrcReg = PPCMaterializeInt(C, MVT::i64);
-      unsigned RetReg = ValLocs[0].getLocReg();
+
+      CCValAssign &VA = ValLocs[0];
+
+      unsigned RetReg = VA.getLocReg();
+      unsigned SrcReg = PPCMaterializeInt(C, MVT::i64,
+                                          VA.getLocInfo() == CCValAssign::SExt);
+
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
+            TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg);
+
       RetRegs.push_back(RetReg);
 
     } else {
@@ -1714,7 +1730,7 @@
     SrcReg = ResultReg;
   }
 
-  UpdateValueMap(I, SrcReg);
+  updateValueMap(I, SrcReg);
   return true;
 }
 
@@ -1753,13 +1769,13 @@
   if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt))
     return false;
 
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
 // Attempt to fast-select an instruction that wasn't handled by
 // the table-generated machinery.
-bool PPCFastISel::TargetSelectInstruction(const Instruction *I) {
+bool PPCFastISel::fastSelectInstruction(const Instruction *I) {
 
   switch (I->getOpcode()) {
     case Instruction::Load:
@@ -2007,7 +2023,8 @@
 
 // Materialize an integer constant into a register, and return
 // the register number (or zero if we failed to handle it).
-unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT) {
+unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT,
+                                                           bool UseSExt) {
   // If we're using CR bit registers for i1 values, handle that as a special
   // case first.
   if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
@@ -2031,7 +2048,7 @@
     unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
     unsigned ImmReg = createResultReg(RC);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
-      .addImm(CI->getSExtValue());
+      .addImm( (UseSExt) ? CI->getSExtValue() : CI->getZExtValue() );
     return ImmReg;
   }
 
@@ -2048,7 +2065,7 @@
 
 // Materialize a constant into a register, and return the register
 // number (or zero if we failed to handle it).
-unsigned PPCFastISel::TargetMaterializeConstant(const Constant *C) {
+unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) {
   EVT CEVT = TLI.getValueType(C->getType(), true);
 
   // Only handle simple types.
@@ -2067,7 +2084,7 @@
 
 // Materialize the address created by an alloca into a register, and
 // return the register number (or zero if we failed to handle it).
-unsigned PPCFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+unsigned PPCFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
   // Don't handle dynamic allocas.
   if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
 
@@ -2167,7 +2184,7 @@
 
 // Attempt to lower call arguments in a faster way than done by
 // the selection DAG code.
-bool PPCFastISel::FastLowerArguments() {
+bool PPCFastISel::fastLowerArguments() {
   // Defer to normal argument lowering for now.  It's reasonably
   // efficient.  Consider doing something like ARM to handle the
   // case where all args fit in registers, no varargs, no float
@@ -2177,7 +2194,7 @@
 
 // Handle materializing integer constants into a register.  This is not
 // automatically generated for PowerPC, so must be explicitly created here.
-unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
+unsigned PPCFastISel::fastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
   
   if (Opc != ISD::Constant)
     return 0;
@@ -2214,7 +2231,7 @@
 // assigning R0 or X0 to the output register for GPRC and G8RC
 // register classes, as any such result could be used in ADDI, etc.,
 // where those regs have another meaning.
-unsigned PPCFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+unsigned PPCFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
                                       const TargetRegisterClass *RC,
                                       unsigned Op0, bool Op0IsKill,
                                       uint64_t Imm) {
@@ -2227,27 +2244,27 @@
     (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
      (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
 
-  return FastISel::FastEmitInst_ri(MachineInstOpcode, UseRC,
+  return FastISel::fastEmitInst_ri(MachineInstOpcode, UseRC,
                                    Op0, Op0IsKill, Imm);
 }
 
 // Override for instructions with one register operand to avoid use of
 // R0/X0.  The automatic infrastructure isn't aware of the context so
 // we must be conservative.
-unsigned PPCFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+unsigned PPCFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
                                      const TargetRegisterClass* RC,
                                      unsigned Op0, bool Op0IsKill) {
   const TargetRegisterClass *UseRC =
     (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
      (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
 
-  return FastISel::FastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
+  return FastISel::fastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
 }
 
 // Override for instructions with two register operands to avoid use
 // of R0/X0.  The automatic infrastructure isn't aware of the context
 // so we must be conservative.
-unsigned PPCFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+unsigned PPCFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
                                       const TargetRegisterClass* RC,
                                       unsigned Op0, bool Op0IsKill,
                                       unsigned Op1, bool Op1IsKill) {
@@ -2255,7 +2272,7 @@
     (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
      (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
 
-  return FastISel::FastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
+  return FastISel::fastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
                                    Op1, Op1IsKill);
 }
 

diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 65e9cf2..dc87a6c 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp

@@ -254,7 +254,7 @@
 // transform this into the appropriate ORI instruction.
 static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
   MachineFunction *MF = MI->getParent()->getParent();
-  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
   DebugLoc dl = MI->getDebugLoc();
 
   unsigned UsedRegMask = 0;
@@ -372,7 +372,7 @@
   unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
 
   const PPCRegisterInfo *RegInfo =
-    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 
   // If we are a leaf function, and use up to 224 bytes of stack space,
   // don't have a frame pointer, calls, or dynamic alloca then we do not need
@@ -400,7 +400,8 @@
 
   // Maximum call frame needs to be at least big enough for linkage area.
   unsigned minCallFrameSize = getLinkageSize(Subtarget.isPPC64(),
-                                             Subtarget.isDarwinABI());
+                                             Subtarget.isDarwinABI(),
+                                             Subtarget.isELFv2ABI());
   maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
 
   // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
@@ -459,9 +460,9 @@
   unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
 
   const PPCRegisterInfo *RegInfo =
-    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   bool HasBP = RegInfo->hasBasePointer(MF);
-  unsigned BPReg  = HasBP ? (unsigned) PPC::R30 : FPReg;
+  unsigned BPReg  = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg;
   unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -497,21 +498,23 @@
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const PPCInstrInfo &TII =
-    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo());
   const PPCRegisterInfo *RegInfo =
-    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 
   MachineModuleInfo &MMI = MF.getMMI();
   const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
   DebugLoc dl;
   bool needsFrameMoves = MMI.hasDebugInfo() ||
     MF.getFunction()->needsUnwindTableEntry();
+  bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_;
 
   // Get processor type.
   bool isPPC64 = Subtarget.isPPC64();
   // Get the ABI.
   bool isDarwinABI = Subtarget.isDarwinABI();
   bool isSVR4ABI = Subtarget.isSVR4ABI();
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
   assert((isDarwinABI || isSVR4ABI) &&
          "Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
 
@@ -546,7 +549,7 @@
   bool HasBP = RegInfo->hasBasePointer(MF);
 
   unsigned SPReg       = isPPC64 ? PPC::X1  : PPC::R1;
-  unsigned BPReg       = isPPC64 ? PPC::X30 : PPC::R30;
+  unsigned BPReg       = RegInfo->getBaseRegister(MF);
   unsigned FPReg       = isPPC64 ? PPC::X31 : PPC::R31;
   unsigned LRReg       = isPPC64 ? PPC::LR8 : PPC::LR;
   unsigned ScratchReg  = isPPC64 ? PPC::X0  : PPC::R0;
@@ -602,7 +605,9 @@
       BPOffset = FFI->getObjectOffset(BPIndex);
     } else {
       BPOffset =
-        PPCFrameLowering::getBasePointerSaveOffset(isPPC64, isDarwinABI);
+        PPCFrameLowering::getBasePointerSaveOffset(isPPC64,
+                                                   isDarwinABI,
+                                                   isPIC);
     }
   }
 
@@ -623,6 +628,9 @@
          "Prologue CR saving supported only in 64-bit mode");
 
   if (!MustSaveCRs.empty()) { // will only occur for PPC64
+    // FIXME: In the ELFv2 ABI, we are not required to save all CR fields.
+    // If only one or two CR fields are clobbered, it could be more
+    // efficient to use mfocrf to selectively save just those fields.
     MachineInstrBuilder MIB =
       BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg);
     for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
@@ -791,8 +799,12 @@
       // For 64-bit SVR4 when we have spilled CRs, the spill location
       // is SP+8, not a frame-relative slot.
       if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+        // In the ELFv1 ABI, only CR2 is noted in CFI and stands in for
+        // the whole CR word.  In the ELFv2 ABI, every CR that was
+        // actually saved gets its own CFI record.
+        unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2;
         unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
-            nullptr, MRI->getDwarfRegNum(PPC::CR2, true), 8));
+            nullptr, MRI->getDwarfRegNum(CRReg, true), 8));
         BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
             .addCFIIndex(CFIIndex);
         continue;
@@ -812,9 +824,9 @@
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   assert(MBBI != MBB.end() && "Returning block has no terminator");
   const PPCInstrInfo &TII =
-    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo());
   const PPCRegisterInfo *RegInfo =
-    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 
   unsigned RetOpcode = MBBI->getOpcode();
   DebugLoc dl;
@@ -839,6 +851,7 @@
   // Get the ABI.
   bool isDarwinABI = Subtarget.isDarwinABI();
   bool isSVR4ABI = Subtarget.isSVR4ABI();
+  bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_;
 
   // Check if the link register (LR) has been saved.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
@@ -849,7 +862,7 @@
   bool HasBP = RegInfo->hasBasePointer(MF);
 
   unsigned SPReg      = isPPC64 ? PPC::X1  : PPC::R1;
-  unsigned BPReg      = isPPC64 ? PPC::X30 : PPC::R30;
+  unsigned BPReg      = RegInfo->getBaseRegister(MF);
   unsigned FPReg      = isPPC64 ? PPC::X31 : PPC::R31;
   unsigned ScratchReg  = isPPC64 ? PPC::X0  : PPC::R0;
   unsigned TempReg     = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
@@ -890,7 +903,9 @@
       BPOffset = FFI->getObjectOffset(BPIndex);
     } else {
       BPOffset =
-        PPCFrameLowering::getBasePointerSaveOffset(isPPC64, isDarwinABI);
+        PPCFrameLowering::getBasePointerSaveOffset(isPPC64,
+                                                   isDarwinABI,
+                                                   isPIC);
     }
   }
 
@@ -1054,7 +1069,7 @@
 PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                                    RegScavenger *) const {
   const PPCRegisterInfo *RegInfo =
-    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 
   //  Save and clear the LR state.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
@@ -1067,6 +1082,7 @@
   int FPSI = FI->getFramePointerSaveIndex();
   bool isPPC64 = Subtarget.isPPC64();
   bool isDarwinABI  = Subtarget.isDarwinABI();
+  bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_;
   MachineFrameInfo *MFI = MF.getFrameInfo();
 
   // If the frame pointer save index hasn't been defined yet.
@@ -1081,7 +1097,7 @@
 
   int BPSI = FI->getBasePointerSaveIndex();
   if (!BPSI && RegInfo->hasBasePointer(MF)) {
-    int BPOffset = getBasePointerSaveOffset(isPPC64, isDarwinABI);
+    int BPOffset = getBasePointerSaveOffset(isPPC64, isDarwinABI, isPIC);
     // Allocate the frame index for the base pointer save area.
     BPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, BPOffset, true);
     // Save the result.
@@ -1185,7 +1201,7 @@
   }
 
   PPCFunctionInfo *PFI = MF.getInfo<PPCFunctionInfo>();
-  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
   int64_t LowerBound = 0;
 
@@ -1220,7 +1236,7 @@
   }
 
   const PPCRegisterInfo *RegInfo =
-    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+      static_cast<const PPCRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   if (RegInfo->hasBasePointer(MF)) {
     HasGPSaveArea = true;
 
@@ -1368,7 +1384,7 @@
 
   MachineFunction *MF = MBB.getParent();
   const PPCInstrInfo &TII =
-    *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(MF->getSubtarget().getInstrInfo());
   DebugLoc DL;
   bool CRSpilled = false;
   MachineInstrBuilder CRMIB;
@@ -1430,7 +1446,7 @@
 
   MachineFunction *MF = MBB.getParent();
   const PPCInstrInfo &TII =
-    *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(MF->getSubtarget().getInstrInfo());
   DebugLoc DL;
   unsigned RestoreOp, MoveReg;
 
@@ -1463,7 +1479,7 @@
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   const PPCInstrInfo &TII =
-    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(MF.getSubtarget().getInstrInfo());
   if (MF.getTarget().Options.GuaranteedTailCallOpt &&
       I->getOpcode() == PPC::ADJCALLSTACKUP) {
     // Add (actually subtract) back the amount the callee popped on return.
@@ -1513,7 +1529,7 @@
 
   MachineFunction *MF = MBB.getParent();
   const PPCInstrInfo &TII =
-    *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+      *static_cast<const PPCInstrInfo *>(MF->getSubtarget().getInstrInfo());
   bool CR2Spilled = false;
   bool CR3Spilled = false;
   bool CR4Spilled = false;

diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 7a226f7..c482588 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h

@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef POWERPC_FRAMEINFO_H
-#define POWERPC_FRAMEINFO_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
+#define LLVM_LIB_TARGET_POWERPC_PPCFRAMELOWERING_H
 
 #include "PPC.h"
 #include "llvm/ADT/STLExtras.h"
@@ -76,8 +76,8 @@
 
   /// getTOCSaveOffset - Return the previous frame offset to save the
   /// TOC register -- 64-bit SVR4 ABI only.
-  static unsigned getTOCSaveOffset(void) {
-    return 40;
+  static unsigned getTOCSaveOffset(bool isELFv2ABI) {
+    return isELFv2ABI ? 24 : 40;
   }
 
   /// getFramePointerSaveOffset - Return the previous frame offset to save the
@@ -97,19 +97,22 @@
 
   /// getBasePointerSaveOffset - Return the previous frame offset to save the
   /// base pointer.
-  static unsigned getBasePointerSaveOffset(bool isPPC64, bool isDarwinABI) {
+  static unsigned getBasePointerSaveOffset(bool isPPC64,
+                                           bool isDarwinABI,
+                                           bool isPIC) {
     if (isDarwinABI)
       return isPPC64 ? -16U : -8U;
 
     // SVR4 ABI: First slot in the general register save area.
-    return isPPC64 ? -16U : -8U;
+    return isPPC64 ? -16U : isPIC ? -12U : -8U;
   }
 
   /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
   ///
-  static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI) {
+  static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI,
+                                 bool isELFv2ABI) {
     if (isDarwinABI || isPPC64)
-      return 6 * (isPPC64 ? 8 : 4);
+      return (isELFv2ABI ? 4 : 6) * (isPPC64 ? 8 : 4);
 
     // SVR4 ABI:
     return 8;

diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
index 23f76c1..4b50214 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.h
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef PPCHAZRECS_H
-#define PPCHAZRECS_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCHAZARDRECOGNIZERS_H
+#define LLVM_LIB_TARGET_POWERPC_PPCHAZARDRECOGNIZERS_H
 
 #include "PPCInstrInfo.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
@@ -76,10 +76,10 @@
 
 public:
   PPCHazardRecognizer970(const ScheduleDAG &DAG);
-  virtual HazardType getHazardType(SUnit *SU, int Stalls) override;
-  virtual void EmitInstruction(SUnit *SU) override;
-  virtual void AdvanceCycle() override;
-  virtual void Reset() override;
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void Reset() override;
 
 private:
   /// EndDispatchGroup - Called when we are finishing a new dispatch group.

diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 4881b3f..49ba58b 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp

@@ -14,6 +14,7 @@
 
 #include "PPC.h"
 #include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
 #include "PPCTargetMachine.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -26,6 +27,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -56,16 +58,16 @@
     unsigned GlobalBaseReg;
   public:
     explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
-      : SelectionDAGISel(tm), TM(tm),
-        PPCLowering(TM.getTargetLowering()),
-        PPCSubTarget(TM.getSubtargetImpl()) {
+        : SelectionDAGISel(tm), TM(tm),
+          PPCLowering(TM.getSubtargetImpl()->getTargetLowering()),
+          PPCSubTarget(TM.getSubtargetImpl()) {
       initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry());
     }
 
     bool runOnMachineFunction(MachineFunction &MF) override {
       // Make sure we re-emit a set of the global base reg if necessary
       GlobalBaseReg = 0;
-      PPCLowering = TM.getTargetLowering();
+      PPCLowering = TM.getSubtargetImpl()->getTargetLowering();
       PPCSubTarget = TM.getSubtargetImpl();
       SelectionDAGISel::runOnMachineFunction(MF);
 
@@ -232,7 +234,7 @@
   unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
   unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
 
-  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
   MachineBasicBlock &EntryBB = *Fn.begin();
   DebugLoc dl;
   // Emit the following code into the entry block:
@@ -268,16 +270,34 @@
 ///
 SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
   if (!GlobalBaseReg) {
-    const TargetInstrInfo &TII = *TM.getInstrInfo();
+    const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
     // Insert the set of GlobalBaseReg into the first MBB of the function
     MachineBasicBlock &FirstMBB = MF->front();
     MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    const Module *M = MF->getFunction()->getParent();
     DebugLoc dl;
 
     if (PPCLowering->getPointerTy() == MVT::i32) {
-      GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass);
-      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
-      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+      if (PPCSubTarget->isTargetELF()) {
+        GlobalBaseReg = PPC::R30;
+        if (M->getPICLevel() == PICLevel::Small) {
+          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR));
+          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+        } else {
+          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
+          BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+          unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+          BuildMI(FirstMBB, MBBI, dl,
+                  TII.get(PPC::UpdateGBR)).addReg(GlobalBaseReg)
+                  .addReg(TempReg, RegState::Define).addReg(GlobalBaseReg);
+          MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true);
+        }
+      } else {
+        GlobalBaseReg =
+          RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass);
+        BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
+        BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+      }
     } else {
       GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_NOX0RegClass);
       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
@@ -650,94 +670,105 @@
 // getVCmpInst: return the vector compare instruction for the specified
 // vector type and condition code. Since this is for altivec specific code,
 // only support the altivec types (v16i8, v8i16, v4i32, and v4f32).
-static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC,
-                                bool HasVSX) {
-  switch (CC) {
-    case ISD::SETEQ:
-    case ISD::SETUEQ:
-    case ISD::SETNE:
-    case ISD::SETUNE:
-      if (VecVT == MVT::v16i8)
-        return PPC::VCMPEQUB;
-      else if (VecVT == MVT::v8i16)
-        return PPC::VCMPEQUH;
-      else if (VecVT == MVT::v4i32)
-        return PPC::VCMPEQUW;
-      // v4f32 != v4f32 could be translate to unordered not equal
-      else if (VecVT == MVT::v4f32)
-        return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
-      else if (VecVT == MVT::v2f64)
-        return PPC::XVCMPEQDP;
-      break;
-    case ISD::SETLT:
-    case ISD::SETGT:
-    case ISD::SETLE:
-    case ISD::SETGE:
-      if (VecVT == MVT::v16i8)
-        return PPC::VCMPGTSB;
-      else if (VecVT == MVT::v8i16)
-        return PPC::VCMPGTSH;
-      else if (VecVT == MVT::v4i32)
-        return PPC::VCMPGTSW;
-      else if (VecVT == MVT::v4f32)
-        return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
-      else if (VecVT == MVT::v2f64)
-        return PPC::XVCMPGTDP;
-      break;
-    case ISD::SETULT:
-    case ISD::SETUGT:
-    case ISD::SETUGE:
-    case ISD::SETULE:
-      if (VecVT == MVT::v16i8)
-        return PPC::VCMPGTUB;
-      else if (VecVT == MVT::v8i16)
-        return PPC::VCMPGTUH;
-      else if (VecVT == MVT::v4i32)
-        return PPC::VCMPGTUW;
-      break;
-    case ISD::SETOEQ:
-      if (VecVT == MVT::v4f32)
-        return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
-      else if (VecVT == MVT::v2f64)
-        return PPC::XVCMPEQDP;
-      break;
-    case ISD::SETOLT:
-    case ISD::SETOGT:
-    case ISD::SETOLE:
-      if (VecVT == MVT::v4f32)
-        return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
-      else if (VecVT == MVT::v2f64)
-        return PPC::XVCMPGTDP;
-      break;
-    case ISD::SETOGE:
-      if (VecVT == MVT::v4f32)
-        return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP;
-      else if (VecVT == MVT::v2f64)
-        return PPC::XVCMPGEDP;
-      break;
-    default:
-      break;
-  }
-  llvm_unreachable("Invalid integer vector compare condition");
-}
+static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC,
+                                bool HasVSX, bool &Swap, bool &Negate) {
+  Swap = false;
+  Negate = false;
 
-// getVCmpEQInst: return the equal compare instruction for the specified vector
-// type. Since this is for altivec specific code, only support the altivec
-// types (v16i8, v8i16, v4i32, and v4f32).
-static unsigned int getVCmpEQInst(MVT::SimpleValueType VecVT, bool HasVSX) {
-  switch (VecVT) {
-    case MVT::v16i8:
-      return PPC::VCMPEQUB;
-    case MVT::v8i16:
-      return PPC::VCMPEQUH;
-    case MVT::v4i32:
-      return PPC::VCMPEQUW;
-    case MVT::v4f32:
-      return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
-    case MVT::v2f64:
-      return PPC::XVCMPEQDP;
-    default:
-      llvm_unreachable("Invalid integer vector compare condition");
+  if (VecVT.isFloatingPoint()) {
+    /* Handle some cases by swapping input operands.  */
+    switch (CC) {
+      case ISD::SETLE: CC = ISD::SETGE; Swap = true; break;
+      case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
+      case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break;
+      case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break;
+      case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
+      case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break;
+      default: break;
+    }
+    /* Handle some cases by negating the result.  */
+    switch (CC) {
+      case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
+      case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break;
+      case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break;
+      case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break;
+      default: break;
+    }
+    /* We have instructions implementing the remaining cases.  */
+    switch (CC) {
+      case ISD::SETEQ:
+      case ISD::SETOEQ:
+        if (VecVT == MVT::v4f32)
+          return HasVSX ? PPC::XVCMPEQSP : PPC::VCMPEQFP;
+        else if (VecVT == MVT::v2f64)
+          return PPC::XVCMPEQDP;
+        break;
+      case ISD::SETGT:
+      case ISD::SETOGT:
+        if (VecVT == MVT::v4f32)
+          return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP;
+        else if (VecVT == MVT::v2f64)
+          return PPC::XVCMPGTDP;
+        break;
+      case ISD::SETGE:
+      case ISD::SETOGE:
+        if (VecVT == MVT::v4f32)
+          return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP;
+        else if (VecVT == MVT::v2f64)
+          return PPC::XVCMPGEDP;
+        break;
+      default:
+        break;
+    }
+    llvm_unreachable("Invalid floating-point vector compare condition");
+  } else {
+    /* Handle some cases by swapping input operands.  */
+    switch (CC) {
+      case ISD::SETGE: CC = ISD::SETLE; Swap = true; break;
+      case ISD::SETLT: CC = ISD::SETGT; Swap = true; break;
+      case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break;
+      case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break;
+      default: break;
+    }
+    /* Handle some cases by negating the result.  */
+    switch (CC) {
+      case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break;
+      case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break;
+      case ISD::SETLE: CC = ISD::SETGT; Negate = true; break;
+      case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break;
+      default: break;
+    }
+    /* We have instructions implementing the remaining cases.  */
+    switch (CC) {
+      case ISD::SETEQ:
+      case ISD::SETUEQ:
+        if (VecVT == MVT::v16i8)
+          return PPC::VCMPEQUB;
+        else if (VecVT == MVT::v8i16)
+          return PPC::VCMPEQUH;
+        else if (VecVT == MVT::v4i32)
+          return PPC::VCMPEQUW;
+        break;
+      case ISD::SETGT:
+        if (VecVT == MVT::v16i8)
+          return PPC::VCMPGTSB;
+        else if (VecVT == MVT::v8i16)
+          return PPC::VCMPGTSH;
+        else if (VecVT == MVT::v4i32)
+          return PPC::VCMPGTSW;
+        break;
+      case ISD::SETUGT:
+        if (VecVT == MVT::v16i8)
+          return PPC::VCMPGTUB;
+        else if (VecVT == MVT::v8i16)
+          return PPC::VCMPGTUH;
+        else if (VecVT == MVT::v4i32)
+          return PPC::VCMPGTUW;
+        break;
+      default:
+        break;
+    }
+    llvm_unreachable("Invalid integer vector compare condition");
   }
 }
 
@@ -829,60 +860,20 @@
   // vector compare operations return the same type as the operands.
   if (LHS.getValueType().isVector()) {
     EVT VecVT = LHS.getValueType();
-    MVT::SimpleValueType VT = VecVT.getSimpleVT().SimpleTy;
-    unsigned int VCmpInst = getVCmpInst(VT, CC, PPCSubTarget->hasVSX());
+    bool Swap, Negate;
+    unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC,
+                                        PPCSubTarget->hasVSX(), Swap, Negate);
+    if (Swap)
+      std::swap(LHS, RHS);
 
-    switch (CC) {
-      case ISD::SETEQ:
-      case ISD::SETOEQ:
-      case ISD::SETUEQ:
-        return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
-      case ISD::SETNE:
-      case ISD::SETONE:
-      case ISD::SETUNE: {
-        SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
-        return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR :
-                                                               PPC::VNOR,
-                                    VecVT, VCmp, VCmp);
-      } 
-      case ISD::SETLT:
-      case ISD::SETOLT:
-      case ISD::SETULT:
-        return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, RHS, LHS);
-      case ISD::SETGT:
-      case ISD::SETOGT:
-      case ISD::SETUGT:
-        return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
-      case ISD::SETGE:
-      case ISD::SETOGE:
-      case ISD::SETUGE: {
-        // Small optimization: Altivec provides a 'Vector Compare Greater Than
-        // or Equal To' instruction (vcmpgefp), so in this case there is no
-        // need for extra logic for the equal compare.
-        if (VecVT.getSimpleVT().isFloatingPoint()) {
-          return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
-        } else {
-          SDValue VCmpGT(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
-          unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget->hasVSX());
-          SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
-          return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLOR :
-                                                                 PPC::VOR,
-                                      VecVT, VCmpGT, VCmpEQ);
-        }
-      }
-      case ISD::SETLE:
-      case ISD::SETOLE:
-      case ISD::SETULE: {
-        SDValue VCmpLE(CurDAG->getMachineNode(VCmpInst, dl, VecVT, RHS, LHS), 0);
-        unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget->hasVSX());
-        SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
-        return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLOR :
-                                                               PPC::VOR,
-                                    VecVT, VCmpLE, VCmpEQ);
-      }
-      default:
-        llvm_unreachable("Invalid vector compare type: should be expanded by legalize");
+    if (Negate) {
+      SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
+      return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR :
+                                                              PPC::VNOR,
+                                  VecVT, VCmp, VCmp);
     }
+
+    return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
   }
 
   if (PPCSubTarget->useCRBits())
@@ -924,6 +915,13 @@
     return nullptr;   // Already selected.
   }
 
+  // In case any misguided DAG-level optimizations form an ADD with a
+  // TargetConstant operand, crash here instead of miscompiling (by selecting
+  // an r+r add instead of some kind of r+i add).
+  if (N->getOpcode() == ISD::ADD &&
+      N->getOperand(1).getOpcode() == ISD::TargetConstant)
+    llvm_unreachable("Invalid ADD with TargetConstant operand");
+
   switch (N->getOpcode()) {
   default: break;
 
@@ -1331,7 +1329,13 @@
     else if (N->getValueType(0) == MVT::f32)
       SelectCCOp = PPC::SELECT_CC_F4;
     else if (N->getValueType(0) == MVT::f64)
-      SelectCCOp = PPC::SELECT_CC_F8;
+      if (PPCSubTarget->hasVSX())
+        SelectCCOp = PPC::SELECT_CC_VSFRC;
+      else
+        SelectCCOp = PPC::SELECT_CC_F8;
+    else if (N->getValueType(0) == MVT::v2f64 ||
+             N->getValueType(0) == MVT::v2i64)
+      SelectCCOp = PPC::SELECT_CC_VSRC;
     else
       SelectCCOp = PPC::SELECT_CC_VRRC;
 
@@ -1445,11 +1449,17 @@
     return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
   }
   case PPCISD::TOC_ENTRY: {
-    assert (PPCSubTarget->isPPC64() && "Only supported for 64-bit ABI");
+    assert ((PPCSubTarget->isPPC64() || PPCSubTarget->isSVR4ABI()) &&
+            "Only supported for 64-bit ABI and 32-bit SVR4");
+    if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) {
+      SDValue GA = N->getOperand(0);
+      return CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA,
+                                    N->getOperand(1));
+    }
 
     // For medium and large code model, we generate two instructions as
     // described below.  Otherwise we allow SelectCodeCommon to handle this,
-    // selecting one of LDtoc, LDtocJTI, and LDtocCPT.
+    // selecting one of LDtoc, LDtocJTI, LDtocCPT, and LDtocBA.
     CodeModel::Model CModel = TM.getCodeModel();
     if (CModel != CodeModel::Medium && CModel != CodeModel::Large)
       break;
@@ -1466,7 +1476,8 @@
     SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
                                         TOCbase, GA);
 
-    if (isa<JumpTableSDNode>(GA) || CModel == CodeModel::Large)
+    if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA) ||
+        CModel == CodeModel::Large)
       return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
                                     SDValue(Tmp, 0));
 
@@ -1483,6 +1494,12 @@
     return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
                                   SDValue(Tmp, 0), GA);
   }
+  case PPCISD::PPC32_PICGOT: {
+    // Generate a PIC-safe GOT reference.
+    assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
+      "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
+    return CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(),  MVT::i32);
+  }
   case PPCISD::VADD_SPLAT: {
     // This expands into one of three sequences, depending on whether
     // the first operand is odd or even, positive or negative.
@@ -1683,7 +1700,9 @@
       case PPC::SELECT_I8:
       case PPC::SELECT_F4:
       case PPC::SELECT_F8:
-      case PPC::SELECT_VRRC: {
+      case PPC::SELECT_VRRC:
+      case PPC::SELECT_VSFRC:
+      case PPC::SELECT_VSRC: {
         SDValue Op = MachineNode->getOperand(0);
         if (Op.isMachineOpcode()) {
           if (Op.getMachineOpcode() == PPC::CRSET)
@@ -1989,6 +2008,8 @@
       case PPC::SELECT_F4:
       case PPC::SELECT_F8:
       case PPC::SELECT_VRRC:
+      case PPC::SELECT_VSFRC:
+      case PPC::SELECT_VSRC:
         if (Op1Set)
           ResNode = MachineNode->getOperand(1).getNode();
         else if (Op1Unset)

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index bc057bf..e93bdaf 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp

@@ -39,6 +39,10 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
+// FIXME: Remove this once soft-float is supported.
+static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic",
+cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden);
+
 static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
 cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
 
@@ -51,19 +55,10 @@
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
 
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
-  // If it isn't a Mach-O file then it's going to be a linux ELF
-  // object file.
-  if (TT.isOSDarwin())
-    return new TargetLoweringObjectFileMachO();
-
-  return new PPC64LinuxTargetObjectFile();
-}
-
-PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
-    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))),
+PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM)
+    : TargetLowering(TM),
       Subtarget(*TM.getSubtargetImpl()) {
-  setPow2DivIsCheap();
+  setPow2SDivIsCheap();
 
   // Use _setjmp/_longjmp instead of setjmp/longjmp.
   setUseUnderscoreSetJmp(true);
@@ -453,6 +448,8 @@
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
       setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::MULHS, VT, Expand);
       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
       setOperationAction(ISD::UDIVREM, VT, Expand);
@@ -526,11 +523,6 @@
     // Altivec does not contain unordered floating-point compare instructions
     setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
     setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
-    setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand);
-    setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand);
-    setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand);
-    setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand);
-
     setCondCodeAction(ISD::SETO,   MVT::v4f32, Expand);
     setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
 
@@ -561,11 +553,6 @@
       // Share the Altivec comparison restrictions.
       setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
       setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
-      setCondCodeAction(ISD::SETUGT, MVT::v2f64, Expand);
-      setCondCodeAction(ISD::SETUGE, MVT::v2f64, Expand);
-      setCondCodeAction(ISD::SETULT, MVT::v2f64, Expand);
-      setCondCodeAction(ISD::SETULE, MVT::v2f64, Expand);
-
       setCondCodeAction(ISD::SETO,   MVT::v2f64, Expand);
       setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
 
@@ -617,15 +604,22 @@
     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
   }
 
-  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i32, Expand);
-  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i64, Expand);
-  setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+  if (!isPPC64) {
+    setOperationAction(ISD::ATOMIC_LOAD,  MVT::i64, Expand);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+  }
 
   setBooleanContents(ZeroOrOneBooleanContent);
   // Altivec instructions set fields to all zeros or all ones.
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
+  if (!isPPC64) {
+    // These libcalls are not available in 32-bit.
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
+  }
+
   if (isPPC64) {
     setStackPointerRegisterToSaveRestore(PPC::X1);
     setExceptionPointerRegister(PPC::X3);
@@ -685,11 +679,6 @@
   if (Subtarget.isDarwin())
     setPrefFunctionAlignment(4);
 
-  if (isPPC64 && Subtarget.isJITCodeModel())
-    // Temporary workaround for the inability of PPC64 JIT to handle jump
-    // tables.
-    setSupportJumpTables(false);
-
   setInsertFencesForAtomic(true);
 
   if (Subtarget.enableMachineScheduler())
@@ -782,6 +771,8 @@
   case PPCISD::SHL:             return "PPCISD::SHL";
   case PPCISD::CALL:            return "PPCISD::CALL";
   case PPCISD::CALL_NOP:        return "PPCISD::CALL_NOP";
+  case PPCISD::CALL_TLS:        return "PPCISD::CALL_TLS";
+  case PPCISD::CALL_NOP_TLS:    return "PPCISD::CALL_NOP_TLS";
   case PPCISD::MTCTR:           return "PPCISD::MTCTR";
   case PPCISD::BCTRL:           return "PPCISD::BCTRL";
   case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
@@ -811,10 +802,8 @@
   case PPCISD::ADD_TLS:         return "PPCISD::ADD_TLS";
   case PPCISD::ADDIS_TLSGD_HA:  return "PPCISD::ADDIS_TLSGD_HA";
   case PPCISD::ADDI_TLSGD_L:    return "PPCISD::ADDI_TLSGD_L";
-  case PPCISD::GET_TLS_ADDR:    return "PPCISD::GET_TLS_ADDR";
   case PPCISD::ADDIS_TLSLD_HA:  return "PPCISD::ADDIS_TLSLD_HA";
   case PPCISD::ADDI_TLSLD_L:    return "PPCISD::ADDI_TLSLD_L";
-  case PPCISD::GET_TLSLD_ADDR:  return "PPCISD::GET_TLSLD_ADDR";
   case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
   case PPCISD::ADDI_DTPREL_L:   return "PPCISD::ADDI_DTPREL_L";
   case PPCISD::VADD_SPLAT:      return "PPCISD::VADD_SPLAT";
@@ -828,6 +817,11 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
+bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Node matching predicates, for use by the tblgen matching code.
 //===----------------------------------------------------------------------===//
@@ -853,14 +847,27 @@
 
 /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
 /// VPKUHUM instruction.
-bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operations with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                                SelectionDAG &DAG) {
-  unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1;
-  if (!isUnary) {
+  bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
+  if (ShuffleKind == 0) {
+    if (IsLE)
+      return false;
     for (unsigned i = 0; i != 16; ++i)
-      if (!isConstantOrUndef(N->getMaskElt(i),  i*2+j))
+      if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
         return false;
-  } else {
+  } else if (ShuffleKind == 2) {
+    if (!IsLE)
+      return false;
+    for (unsigned i = 0; i != 16; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i), i*2))
+        return false;
+  } else if (ShuffleKind == 1) {
+    unsigned j = IsLE ? 0 : 1;
     for (unsigned i = 0; i != 8; ++i)
       if (!isConstantOrUndef(N->getMaskElt(i),    i*2+j) ||
           !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j))
@@ -871,27 +878,34 @@
 
 /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
 /// VPKUWUM instruction.
-bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operations with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                                SelectionDAG &DAG) {
-  unsigned j, k;
-  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
-    j = 0;
-    k = 1;
-  } else {
-    j = 2;
-    k = 3;
-  }
-  if (!isUnary) {
+  bool IsLE = DAG.getSubtarget().getDataLayout()->isLittleEndian();
+  if (ShuffleKind == 0) {
+    if (IsLE)
+      return false;
     for (unsigned i = 0; i != 16; i += 2)
-      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+j) ||
-          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+k))
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+3))
         return false;
-  } else {
+  } else if (ShuffleKind == 2) {
+    if (!IsLE)
+      return false;
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+1))
+        return false;
+  } else if (ShuffleKind == 1) {
+    unsigned j = IsLE ? 0 : 2;
     for (unsigned i = 0; i != 8; i += 2)
-      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+j) ||
-          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+k) ||
-          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j) ||
-          !isConstantOrUndef(N->getMaskElt(i+9),  i*2+k))
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+j+1) ||
+          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+9),  i*2+j+1))
         return false;
   }
   return true;
@@ -919,38 +933,63 @@
 
 /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
 /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two 
+/// different inputs (0), either-endian merges with two identical inputs (1),
+/// and little-endian merges with two different inputs (2).  For the latter,
+/// the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
-                             bool isUnary, SelectionDAG &DAG) {
-  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
-    if (!isUnary)
+                             unsigned ShuffleKind, SelectionDAG &DAG) {
+  if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 0, 0);
+    else if (ShuffleKind == 2) // swapped
       return isVMerge(N, UnitSize, 0, 16);
-    return isVMerge(N, UnitSize, 0, 0);
+    else
+      return false;
   } else {
-    if (!isUnary)
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 8, 8);
+    else if (ShuffleKind == 0) // normal
       return isVMerge(N, UnitSize, 8, 24);
-    return isVMerge(N, UnitSize, 8, 8);
+    else
+      return false;
   }
 }
 
 /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
 /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two 
+/// different inputs (0), either-endian merges with two identical inputs (1),
+/// and little-endian merges with two different inputs (2).  For the latter,
+/// the input operands are swapped (see PPCInstrAltivec.td).
 bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
-                             bool isUnary, SelectionDAG &DAG) {
-  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
-    if (!isUnary)
+                             unsigned ShuffleKind, SelectionDAG &DAG) {
+  if (DAG.getSubtarget().getDataLayout()->isLittleEndian()) {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 8, 8);
+    else if (ShuffleKind == 2) // swapped
       return isVMerge(N, UnitSize, 8, 24);
-    return isVMerge(N, UnitSize, 8, 8);
+    else
+      return false;
   } else {
-    if (!isUnary)
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 0, 0);
+    else if (ShuffleKind == 0) // normal
       return isVMerge(N, UnitSize, 0, 16);
-    return isVMerge(N, UnitSize, 0, 0);
+    else
+      return false;
   }
 }
 
 
 /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
 /// amount, otherwise return -1.
-int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) {
+/// The ShuffleKind distinguishes between big-endian operations with two 
+/// different inputs (0), either-endian operations with two identical inputs
+/// (1), and little-endian operations with two different inputs (2).  For the
+/// latter, the input operands are swapped (see PPCInstrAltivec.td).
+int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+                             SelectionDAG &DAG) {
   if (N->getValueType(0) != MVT::v16i8)
     return -1;
 
@@ -968,38 +1007,26 @@
   unsigned ShiftAmt = SVOp->getMaskElt(i);
   if (ShiftAmt < i) return -1;
 
-  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+  ShiftAmt -= i;
+  bool isLE = DAG.getTarget().getSubtargetImpl()->getDataLayout()->
+    isLittleEndian();
 
-    ShiftAmt += i;
+  if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+        return -1;
+  } else if (ShuffleKind == 1) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
+        return -1;
+  } else
+    return -1;
 
-    if (!isUnary) {
-      // Check the rest of the elements to see if they are consecutive.
-      for (++i; i != 16; ++i)
-        if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt - i))
-          return -1;
-    } else {
-      // Check the rest of the elements to see if they are consecutive.
-      for (++i; i != 16; ++i)
-        if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt - i) & 15))
-          return -1;
-    }
+  if (ShuffleKind == 2 && isLE)
+    ShiftAmt = 16 - ShiftAmt;
 
-  } else {  // Big Endian
-
-    ShiftAmt -= i;
-
-    if (!isUnary) {
-      // Check the rest of the elements to see if they are consecutive.
-      for (++i; i != 16; ++i)
-        if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
-          return -1;
-    } else {
-      // Check the rest of the elements to see if they are consecutive.
-      for (++i; i != 16; ++i)
-        if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
-          return -1;
-    }
-  }
   return ShiftAmt;
 }
 
@@ -1055,7 +1082,7 @@
                                 SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   assert(isSplatShuffleMask(SVOp, EltSize));
-  if (DAG.getTarget().getDataLayout()->isLittleEndian())
+  if (DAG.getSubtarget().getDataLayout()->isLittleEndian())
     return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
   else
     return SVOp->getMaskElt(0) / EltSize;
@@ -1331,7 +1358,13 @@
       if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
         // If all of the bits are known zero on the LHS or RHS, the add won't
         // carry.
-        Base = N.getOperand(0);
+        if (FrameIndexSDNode *FI =
+              dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+          fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+        } else {
+          Base = N.getOperand(0);
+        }
         Disp = DAG.getTargetConstant(imm, N.getValueType());
         return true;
       }
@@ -1491,10 +1524,9 @@
   HiOpFlags = PPCII::MO_HA;
   LoOpFlags = PPCII::MO_LO;
 
-  // Don't use the pic base if not in PIC relocation model.  Or if we are on a
-  // non-darwin platform.  We don't support PIC on other platforms yet.
-  bool isPIC = TM.getRelocationModel() == Reloc::PIC_ &&
-               TM.getSubtarget<PPCSubtarget>().isDarwin();
+  // Don't use the pic base if not in PIC relocation model.
+  bool isPIC = TM.getRelocationModel() == Reloc::PIC_;
+
   if (isPIC) {
     HiOpFlags |= PPCII::MO_PIC_FLAG;
     LoOpFlags |= PPCII::MO_PIC_FLAG;
@@ -1550,6 +1582,15 @@
 
   unsigned MOHiFlag, MOLoFlag;
   bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+
+  if (isPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
+                                           PPCII::MO_PIC_FLAG);
+    SDLoc DL(CP);
+    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA,
+                       DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT));
+  }
+
   SDValue CPIHi =
     DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
   SDValue CPILo =
@@ -1571,6 +1612,15 @@
 
   unsigned MOHiFlag, MOLoFlag;
   bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+
+  if (isPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                                        PPCII::MO_PIC_FLAG);
+    SDLoc DL(GA);
+    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), PtrVT, GA,
+                       DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT));
+  }
+
   SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
   SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
   return LowerLabelRef(JTIHi, JTILo, isPIC, DAG);
@@ -1579,8 +1629,16 @@
 SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
   EVT PtrVT = Op.getValueType();
+  BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
+  const BlockAddress *BA = BASDN->getBlockAddress();
 
-  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual BlockAddress is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
+    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(BASDN), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
 
   unsigned MOHiFlag, MOLoFlag;
   bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
@@ -1589,6 +1647,27 @@
   return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
 }
 
+// Generate a call to __tls_get_addr for the given GOT entry Op.
+std::pair<SDValue,SDValue>
+PPCTargetLowering::lowerTLSCall(SDValue Op, SDLoc dl,
+                                SelectionDAG &DAG) const {
+
+  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Node = Op;
+  Entry.Ty = IntPtrTy;
+  Args.push_back(Entry);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::C, IntPtrTy,
+               DAG.getTargetExternalSymbol("__tls_get_addr", getPointerTy()),
+               std::move(Args), 0);
+
+  return LowerCallTo(CLI);
+}
+
 SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
 
@@ -1601,6 +1680,8 @@
   const GlobalValue *GV = GA->getGlobal();
   EVT PtrVT = getPointerTy();
   bool is64bit = Subtarget.isPPC64();
+  const Module *M = DAG.getMachineFunction().getFunction()->getParent();
+  PICLevel::Level picLevel = M->getPICLevel();
 
   TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
 
@@ -1632,50 +1713,46 @@
   }
 
   if (Model == TLSModel::GeneralDynamic) {
-    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
-    SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
-    SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
-                                     GOTReg, TGA);
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                             PPCII::MO_TLSGD);
+    SDValue GOTPtr;
+    if (is64bit) {
+      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
+                                   GOTReg, TGA);
+    } else {
+      if (picLevel == PICLevel::Small)
+        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+      else
+        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+    }
     SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT,
-                                   GOTEntryHi, TGA);
-
-    // We need a chain node, and don't have one handy.  The underlying
-    // call has no side effects, so using the function entry node
-    // suffices.
-    SDValue Chain = DAG.getEntryNode();
-    Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry);
-    SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64);
-    SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl,
-                                  PtrVT, ParmReg, TGA);
-    // The return value from GET_TLS_ADDR really is in X3 already, but
-    // some hacks are needed here to tie everything together.  The extra
-    // copies dissolve during subsequent transforms.
-    Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr);
-    return DAG.getCopyFromReg(Chain, dl, PPC::X3, PtrVT);
+                                   GOTPtr, TGA);
+    std::pair<SDValue, SDValue> CallResult = lowerTLSCall(GOTEntry, dl, DAG);
+    return CallResult.first;
   }
 
   if (Model == TLSModel::LocalDynamic) {
-    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
-    SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
-    SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
-                                     GOTReg, TGA);
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                             PPCII::MO_TLSLD);
+    SDValue GOTPtr;
+    if (is64bit) {
+      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
+                           GOTReg, TGA);
+    } else {
+      if (picLevel == PICLevel::Small)
+        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
+      else
+        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
+    }
     SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT,
-                                   GOTEntryHi, TGA);
-
-    // We need a chain node, and don't have one handy.  The underlying
-    // call has no side effects, so using the function entry node
-    // suffices.
-    SDValue Chain = DAG.getEntryNode();
-    Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry);
-    SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64);
-    SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl,
-                                  PtrVT, ParmReg, TGA);
-    // The return value from GET_TLSLD_ADDR really is in X3 already, but
-    // some hacks are needed here to tie everything together.  The extra
-    // copies dissolve during subsequent transforms.
-    Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr);
+                                   GOTPtr, TGA);
+    std::pair<SDValue, SDValue> CallResult = lowerTLSCall(GOTEntry, dl, DAG);
+    SDValue TLSAddr = CallResult.first;
+    SDValue Chain = CallResult.second;
     SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT,
-                                      Chain, ParmReg, TGA);
+                                      Chain, TLSAddr, TGA);
     return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
   }
 
@@ -1700,6 +1777,14 @@
   unsigned MOHiFlag, MOLoFlag;
   bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV);
 
+  if (isPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
+                                            GSDN->getOffset(),
+                                            PPCII::MO_PIC_FLAG);
+    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA,
+                       DAG.getNode(PPCISD::GlobalBaseReg, DL, MVT::i32));
+  }
+
   SDValue GAHi =
     DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
   SDValue GALo =
@@ -1794,7 +1879,7 @@
   // gpr_index
   SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                     VAListPtr, MachinePointerInfo(SV), MVT::i8,
-                                    false, false, 0);
+                                    false, false, false, 0);
   InChain = GprIndex.getValue(1);
 
   if (VT == MVT::i64) {
@@ -1817,7 +1902,7 @@
   // fpr
   SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                     FprPtr, MachinePointerInfo(SV), MVT::i8,
-                                    false, false, 0);
+                                    false, false, false, 0);
   InChain = FprIndex.getValue(1);
 
   SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
@@ -2127,14 +2212,19 @@
   unsigned ArgSize = ArgVT.getStoreSize();
   if (Flags.isByVal())
     ArgSize = Flags.getByValSize();
-  ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+
+  // Round up to multiples of the pointer size, except for array members,
+  // which are always packed.
+  if (!Flags.isInConsecutiveRegs())
+    ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
 
   return ArgSize;
 }
 
 /// CalculateStackSlotAlignment - Calculates the alignment of this argument
 /// on the stack.
-static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags,
+static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+                                            ISD::ArgFlagsTy Flags,
                                             unsigned PtrByteSize) {
   unsigned Align = PtrByteSize;
 
@@ -2156,14 +2246,78 @@
     }
   }
 
+  // Array members are always packed to their original alignment.
+  if (Flags.isInConsecutiveRegs()) {
+    // If the array member was split into multiple registers, the first
+    // needs to be aligned to the size of the full type.  (Except for
+    // ppcf128, which is only aligned as its f64 components.)
+    if (Flags.isSplit() && OrigVT != MVT::ppcf128)
+      Align = OrigVT.getStoreSize();
+    else
+      Align = ArgVT.getStoreSize();
+  }
+
   return Align;
 }
 
+/// CalculateStackSlotUsed - Return whether this argument will use its
+/// stack slot (instead of being passed in registers).  ArgOffset,
+/// AvailableFPRs, and AvailableVRs must hold the current argument
+/// position, and will be updated to account for this argument.
+static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
+                                   ISD::ArgFlagsTy Flags,
+                                   unsigned PtrByteSize,
+                                   unsigned LinkageSize,
+                                   unsigned ParamAreaSize,
+                                   unsigned &ArgOffset,
+                                   unsigned &AvailableFPRs,
+                                   unsigned &AvailableVRs) {
+  bool UseMemory = false;
+
+  // Respect alignment of argument on the stack.
+  unsigned Align =
+    CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+  // If there's no space left in the argument save area, we must
+  // use memory (this check also catches zero-sized arguments).
+  if (ArgOffset >= LinkageSize + ParamAreaSize)
+    UseMemory = true;
+
+  // Allocate argument on the stack.
+  ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+  if (Flags.isInConsecutiveRegsLast())
+    ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+  // If we overran the argument save area, we must use memory
+  // (this check catches arguments passed partially in memory)
+  if (ArgOffset > LinkageSize + ParamAreaSize)
+    UseMemory = true;
+
+  // However, if the argument is actually passed in an FPR or a VR,
+  // we don't use memory after all.
+  if (!Flags.isByVal()) {
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
+      if (AvailableFPRs > 0) {
+        --AvailableFPRs;
+        return false;
+      }
+    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
+      if (AvailableVRs > 0) {
+        --AvailableVRs;
+        return false;
+      }
+  }
+
+  return UseMemory;
+}
+
 /// EnsureStackAlignment - Round stack frame size up from NumBytes to
 /// ensure minimum alignment required for target.
 static unsigned EnsureStackAlignment(const TargetMachine &Target,
                                      unsigned NumBytes) {
-  unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment();
+  unsigned TargetAlign =
+      Target.getSubtargetImpl()->getFrameLowering()->getStackAlignment();
   unsigned AlignMask = TargetAlign - 1;
   NumBytes = (NumBytes + AlignMask) & ~AlignMask;
   return NumBytes;
@@ -2240,11 +2394,11 @@
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   // Reserve space for the linkage area on the stack.
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false);
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false);
   CCInfo.AllocateStack(LinkageSize, PtrByteSize);
 
   CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
@@ -2315,7 +2469,7 @@
   // caller's stack frame, right above the parameter list area.
   SmallVector<CCValAssign, 16> ByValArgLocs;
   CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      ByValArgLocs, *DAG.getContext());
 
   // Reserve stack space for the allocations in CCInfo.
   CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -2348,7 +2502,9 @@
       PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
       PPC::F8
     };
-    const unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+    unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+    if (DisablePPCFloatInVariadic)
+      NumFPArgRegs = 0;
 
     FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs,
                                                           NumGPArgRegs));
@@ -2357,7 +2513,7 @@
 
     // Make room for NumGPArgRegs and NumFPArgRegs.
     int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
-                NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8;
+                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
 
     FuncInfo->setVarArgsStackOffset(
       MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
@@ -2399,7 +2555,7 @@
                                    MachinePointerInfo(), false, false, 0);
       MemOps.push_back(Store);
       // Increment the address by eight for the next argument to store
-      SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
+      SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8,
                                          PtrVT);
       FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
     }
@@ -2437,6 +2593,7 @@
                                       SmallVectorImpl<SDValue> &InVals) const {
   // TODO: add description of PPC stack frame format, or at least some docs.
   //
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
   bool isLittleEndian = Subtarget.isLittleEndian();
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -2448,8 +2605,8 @@
                        (CallConv == CallingConv::Fast));
   unsigned PtrByteSize = 8;
 
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
-  unsigned ArgOffset = LinkageSize;
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+                                                          isELFv2ABI);
 
   static const MCPhysReg GPR[] = {
     PPC::X3, PPC::X4, PPC::X5, PPC::X6,
@@ -2471,12 +2628,29 @@
   const unsigned Num_FPR_Regs = 13;
   const unsigned Num_VR_Regs  = array_lengthof(VR);
 
-  unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
+  // Do a first pass over the arguments to determine whether the ABI
+  // guarantees that our caller has allocated the parameter save area
+  // on its stack frame.  In the ELFv1 ABI, this is always the case;
+  // in the ELFv2 ABI, it is true if this is a vararg function or if
+  // any parameter is located in a stack slot.
+
+  bool HasParameterArea = !isELFv2ABI || isVarArg;
+  unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
+  unsigned NumBytes = LinkageSize;
+  unsigned AvailableFPRs = Num_FPR_Regs;
+  unsigned AvailableVRs = Num_VR_Regs;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i)
+    if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
+                               PtrByteSize, LinkageSize, ParamAreaSize,
+                               NumBytes, AvailableFPRs, AvailableVRs))
+      HasParameterArea = true;
 
   // Add DAG nodes to load the arguments or copy them out of registers.  On
   // entry to a function on PPC, the arguments start after the linkage area,
   // although the first ones are often in registers.
 
+  unsigned ArgOffset = LinkageSize;
+  unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
   SmallVector<SDValue, 8> MemOps;
   Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
   unsigned CurArgIdx = 0;
@@ -2484,6 +2658,7 @@
     SDValue ArgVal;
     bool needsLoad = false;
     EVT ObjectVT = Ins[ArgNo].VT;
+    EVT OrigVT = Ins[ArgNo].ArgVT;
     unsigned ObjSize = ObjectVT.getStoreSize();
     unsigned ArgSize = ObjSize;
     ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
@@ -2492,7 +2667,7 @@
 
     /* Respect alignment of argument on the stack.  */
     unsigned Align =
-      CalculateStackSlotAlignment(ObjectVT, Flags, PtrByteSize);
+      CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
     ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
     unsigned CurArgOffset = ArgOffset;
 
@@ -2520,15 +2695,31 @@
         continue;
       }
 
-      // All aggregates smaller than 8 bytes must be passed right-justified.
-      if (ObjSize < PtrByteSize && !isLittleEndian)
-        CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
-      // The value of the object is its address.
-      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+      // Create a stack object covering all stack doublewords occupied
+      // by the argument.  If the argument is (fully or partially) on
+      // the stack, or if the argument is fully in registers but the
+      // caller has allocated the parameter save area anyway, we can refer
+      // directly to the caller's stack frame.  Otherwise, create a
+      // local copy in our own frame.
+      int FI;
+      if (HasParameterArea ||
+          ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
+        FI = MFI->CreateFixedObject(ArgSize, ArgOffset, false, true);
+      else
+        FI = MFI->CreateStackObject(ArgSize, Align, false);
       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
-      InVals.push_back(FIN);
 
-      if (ObjSize < 8) {
+      // Handle aggregates smaller than 8 bytes.
+      if (ObjSize < PtrByteSize) {
+        // The value of the object is its address, which differs from the
+        // address of the enclosing doubleword on big-endian systems.
+        SDValue Arg = FIN;
+        if (!isLittleEndian) {
+          SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, PtrVT);
+          Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
+        }
+        InVals.push_back(Arg);
+
         if (GPR_idx != Num_GPR_Regs) {
           unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
@@ -2537,18 +2728,13 @@
           if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
             EVT ObjType = (ObjSize == 1 ? MVT::i8 :
                            (ObjSize == 2 ? MVT::i16 : MVT::i32));
-            Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+            Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
                                       MachinePointerInfo(FuncArg),
                                       ObjType, false, false, 0);
           } else {
             // For sizes that don't fit a truncating store (3, 5, 6, 7),
             // store the whole register as-is to the parameter save area
-            // slot.  The address of the parameter was already calculated
-            // above (InVals.push_back(FIN)) to be the right-justified
-            // offset within the slot.  For this store, we need a new
-            // frame index that points at the beginning of the slot.
-            int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
-            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+            // slot.
             Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                  MachinePointerInfo(FuncArg),
                                  false, false, 0);
@@ -2562,27 +2748,29 @@
         continue;
       }
 
+      // The value of the object is its address, which is the address of
+      // its first stack doubleword.
+      InVals.push_back(FIN);
+
+      // Store whatever pieces of the object are in registers to memory.
       for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
-        // Store whatever pieces of the object are in registers
-        // to memory.  ArgOffset will be the address of the beginning
-        // of the object.
-        if (GPR_idx != Num_GPR_Regs) {
-          unsigned VReg;
-          VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
-          int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
-          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
-          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
-          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                                       MachinePointerInfo(FuncArg, j),
-                                       false, false, 0);
-          MemOps.push_back(Store);
-          ++GPR_idx;
-          ArgOffset += PtrByteSize;
-        } else {
-          ArgOffset += ArgSize - j;
+        if (GPR_idx == Num_GPR_Regs)
           break;
+
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+        SDValue Addr = FIN;
+        if (j) {
+          SDValue Off = DAG.getConstant(j, PtrVT);
+          Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
         }
+        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
+                                     MachinePointerInfo(FuncArg, j),
+                                     false, false, 0);
+        MemOps.push_back(Store);
+        ++GPR_idx;
       }
+      ArgOffset += ArgSize;
       continue;
     }
 
@@ -2591,6 +2779,9 @@
     case MVT::i1:
     case MVT::i32:
     case MVT::i64:
+      // These can be scalar arguments or elements of an integer array type
+      // passed directly.  Clang may use those instead of "byval" aggregate
+      // types to avoid forcing arguments to memory unnecessarily.
       if (GPR_idx != Num_GPR_Regs) {
         unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
@@ -2608,6 +2799,9 @@
 
     case MVT::f32:
     case MVT::f64:
+      // These can be scalar arguments or elements of a float array type
+      // passed directly.  The latter are used to implement ELFv2 homogeneous
+      // float aggregates.
       if (FPR_idx != Num_FPR_Regs) {
         unsigned VReg;
 
@@ -2620,12 +2814,32 @@
 
         ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
         ++FPR_idx;
+      } else if (GPR_idx != Num_GPR_Regs) {
+        // This can only ever happen in the presence of f32 array types,
+        // since otherwise we never run out of FPRs before running out
+        // of GPRs.
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::f32) {
+          if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
+            ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
+                                 DAG.getConstant(32, MVT::i32));
+          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+        }
+
+        ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
       } else {
         needsLoad = true;
-        ArgSize = PtrByteSize;
       }
 
-      ArgOffset += 8;
+      // When passing an array of floats, the array occupies consecutive
+      // space in the argument area; only round up to the next doubleword
+      // at the end of the array.  Otherwise, each float takes 8 bytes.
+      ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
+      ArgOffset += ArgSize;
+      if (Flags.isInConsecutiveRegsLast())
+        ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
       break;
     case MVT::v4f32:
     case MVT::v4i32:
@@ -2633,6 +2847,9 @@
     case MVT::v16i8:
     case MVT::v2f64:
     case MVT::v2i64:
+      // These can be scalar arguments or elements of a vector array type
+      // passed directly.  The latter are used to implement ELFv2 homogeneous
+      // vector aggregates.
       if (VR_idx != Num_VR_Regs) {
         unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
                         MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
@@ -2662,7 +2879,10 @@
 
   // Area that is at least reserved in the caller of this function.
   unsigned MinReservedArea;
-  MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
+  if (HasParameterArea)
+    MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
+  else
+    MinReservedArea = LinkageSize;
 
   // Set the size that is at least reserved in caller of this function.  Tail
   // call optimized functions' reserved stack space needs to be aligned so that
@@ -2723,7 +2943,8 @@
                        (CallConv == CallingConv::Fast));
   unsigned PtrByteSize = isPPC64 ? 8 : 4;
 
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true);
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
+                                                          false);
   unsigned ArgOffset = LinkageSize;
   // Area that is at least reserved in caller of this function.
   unsigned MinReservedArea = ArgOffset;
@@ -2849,7 +3070,7 @@
         CurArgOffset = CurArgOffset + (4 - ObjSize);
       }
       // The value of the object is its address.
-      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, false, true);
       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
       InVals.push_back(FIN);
       if (ObjSize==1 || ObjSize==2) {
@@ -3336,6 +3557,7 @@
 
   bool isPPC64 = Subtarget.isPPC64();
   bool isSVR4ABI = Subtarget.isSVR4ABI();
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
 
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   NodeTys.push_back(MVT::Other);   // Returns a chain
@@ -3352,42 +3574,41 @@
     }
 
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201
-    // Use indirect calls for ALL functions calls in JIT mode, since the
-    // far-call stubs may be outside relocation limits for a BL instruction.
-    if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
-      unsigned OpFlags = 0;
-      if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
-          (Subtarget.getTargetTriple().isMacOSX() &&
-           Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
-          (G->getGlobal()->isDeclaration() ||
-           G->getGlobal()->isWeakForLinker())) {
-        // PC-relative references to external symbols should go through $stub,
-        // unless we're building with the leopard linker or later, which
-        // automatically synthesizes these stubs.
-        OpFlags = PPCII::MO_DARWIN_STUB;
-      }
-
-      // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
-      // every direct call is) turn it into a TargetGlobalAddress /
-      // TargetExternalSymbol node so that legalize doesn't hack it.
-      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
-                                          Callee.getValueType(),
-                                          0, OpFlags);
-      needIndirectCall = false;
+    unsigned OpFlags = 0;
+    if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
+         (Subtarget.getTargetTriple().isMacOSX() &&
+          Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
+         (G->getGlobal()->isDeclaration() ||
+          G->getGlobal()->isWeakForLinker())) ||
+        (Subtarget.isTargetELF() && !isPPC64 &&
+         !G->getGlobal()->hasLocalLinkage() &&
+         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = PPCII::MO_PLT_OR_STUB;
     }
+
+    // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+    // every direct call is) turn it into a TargetGlobalAddress /
+    // TargetExternalSymbol node so that legalize doesn't hack it.
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                        Callee.getValueType(), 0, OpFlags);
+    needIndirectCall = false;
   }
 
   if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     unsigned char OpFlags = 0;
 
-    if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
-        (Subtarget.getTargetTriple().isMacOSX() &&
-         Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) {
+    if ((DAG.getTarget().getRelocationModel() != Reloc::Static &&
+         (Subtarget.getTargetTriple().isMacOSX() &&
+          Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) ||
+        (Subtarget.isTargetELF() && !isPPC64 &&
+         DAG.getTarget().getRelocationModel() == Reloc::PIC_)	) {
       // PC-relative references to external symbols should go through $stub,
       // unless we're building with the leopard linker or later, which
       // automatically synthesizes these stubs.
-      OpFlags = PPCII::MO_DARWIN_STUB;
+      OpFlags = PPCII::MO_PLT_OR_STUB;
     }
 
     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
@@ -3400,7 +3621,7 @@
     // to do the call, we can't use PPCISD::CALL.
     SDValue MTCTROps[] = {Chain, Callee, InFlag};
 
-    if (isSVR4ABI && isPPC64) {
+    if (isSVR4ABI && isPPC64 && !isELFv2ABI) {
       // Function pointers in the 64-bit SVR4 ABI do not point to the function
       // entry point, but to the function descriptor (the function entry point
       // address is part of the function descriptor though).
@@ -3480,7 +3701,7 @@
     CallOpc = PPCISD::BCTRL;
     Callee.setNode(nullptr);
     // Add use of X11 (holding environment pointer)
-    if (isSVR4ABI && isPPC64)
+    if (isSVR4ABI && isPPC64 && !isELFv2ABI)
       Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
     // Add CTR register as callee so a bctr can be emitted later.
     if (isTailCall)
@@ -3491,6 +3712,23 @@
   if (Callee.getNode()) {
     Ops.push_back(Chain);
     Ops.push_back(Callee);
+
+    // If this is a call to __tls_get_addr, find the symbol whose address
+    // is to be taken and add it to the list.  This will be used to 
+    // generate __tls_get_addr(<sym>@tlsgd) or __tls_get_addr(<sym>@tlsld).
+    // We find the symbol by walking the chain to the CopyFromReg, walking
+    // back from the CopyFromReg to the ADDI_TLSGD_L or ADDI_TLSLD_L, and
+    // pulling the symbol from that node.
+    if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+      if (!strcmp(S->getSymbol(), "__tls_get_addr")) {
+        assert(!needIndirectCall && "Indirect call to __tls_get_addr???");
+        SDNode *AddI = Chain.getNode()->getOperand(2).getNode();
+        SDValue TGTAddr = AddI->getOperand(1);
+        assert(TGTAddr.getNode()->getOpcode() == ISD::TargetGlobalTLSAddress &&
+               "Didn't find target global TLS address where we expected one");
+        Ops.push_back(TGTAddr);
+        CallOpc = PPCISD::CALL_TLS;
+      }
   }
   // If this is a tail call add stack pointer delta.
   if (isTailCall)
@@ -3502,6 +3740,10 @@
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));
 
+  // Direct calls in the ELFv2 ABI need the TOC register live into the call.
+  if (Callee.getNode() && isELFv2ABI)
+    Ops.push_back(DAG.getRegister(PPC::X2, PtrVT));
+
   return CallOpc;
 }
 
@@ -3522,8 +3764,8 @@
                                    SmallVectorImpl<SDValue> &InVals) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext());
   CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
 
   // Copy all of the result registers out of their specified physreg.
@@ -3571,6 +3813,8 @@
                               int SPDiff, unsigned NumBytes,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
                               SmallVectorImpl<SDValue> &InVals) const {
+
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
   std::vector<EVT> NodeTys;
   SmallVector<SDValue, 8> Ops;
   unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
@@ -3589,7 +3833,8 @@
      getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
 
   // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3636,7 +3881,9 @@
                 DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
       // Otherwise insert NOP for non-local calls.
       CallOpc = PPCISD::CALL_NOP;
-    }
+    } else if (CallOpc == PPCISD::CALL_TLS)
+      // For 64-bit SVR4, TLS calls are always non-local.
+      CallOpc = PPCISD::CALL_NOP_TLS;
   }
 
   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
@@ -3646,7 +3893,7 @@
     SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
     EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
     SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
-    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset();
+    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
     SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
     SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
     Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag);
@@ -3735,11 +3982,12 @@
 
   // Assign locations to all of the outgoing arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   // Reserve space for the linkage area on the stack.
-  CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
+  CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false),
+                       PtrByteSize);
 
   if (isVarArg) {
     // Handle fixed and variable vector arguments differently.
@@ -3776,7 +4024,7 @@
   // Assign locations to all of the outgoing aggregate by value arguments.
   SmallVector<CCValAssign, 16> ByValArgLocs;
   CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+                      ByValArgLocs, *DAG.getContext());
 
   // Reserve stack space for the allocations in CCInfo.
   CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -3948,6 +4196,7 @@
                                     SDLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
 
+  bool isELFv2ABI = Subtarget.isELFv2ABI();
   bool isLittleEndian = Subtarget.isLittleEndian();
   unsigned NumOps = Outs.size();
 
@@ -3966,21 +4215,27 @@
     MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
 
   // Count how many bytes are to be pushed on the stack, including the linkage
-  // area, and parameter passing area.  We start with at least 48 bytes, which
-  // is reserved space for [SP][CR][LR][3 x unused].
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
+  // area, and parameter passing area.  On ELFv1, the linkage area is 48 bytes
+  // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
+  // area is 32 bytes reserved space for [SP][CR][LR][TOC].
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+                                                          isELFv2ABI);
   unsigned NumBytes = LinkageSize;
 
   // Add up all the space actually used.
   for (unsigned i = 0; i != NumOps; ++i) {
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
     EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
 
     /* Respect alignment of argument on the stack.  */
-    unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize);
+    unsigned Align =
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
     NumBytes = ((NumBytes + Align - 1) / Align) * Align;
 
     NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+    if (Flags.isInConsecutiveRegsLast())
+      NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
   }
 
   unsigned NumBytesActuallyUsed = NumBytes;
@@ -3990,6 +4245,7 @@
   // Because we cannot tell if this is needed on the caller side, we have to
   // conservatively assume that it is needed.  As such, make sure we have at
   // least enough stack space for the caller to store the 8 GPRs.
+  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
   NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
 
   // Tail call needs the stack to be aligned.
@@ -4056,10 +4312,12 @@
   for (unsigned i = 0; i != NumOps; ++i) {
     SDValue Arg = OutVals[i];
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    EVT OrigVT = Outs[i].ArgVT;
 
     /* Respect alignment of argument on the stack.  */
     unsigned Align =
-      CalculateStackSlotAlignment(Outs[i].VT, Flags, PtrByteSize);
+      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
     ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
 
     /* Compute GPR index associated with argument offset.  */
@@ -4103,7 +4361,7 @@
         if (GPR_idx != NumGPRs) {
           SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                         MachinePointerInfo(), VT,
-                                        false, false, 0);
+                                        false, false, false, 0);
           MemOpChains.push_back(Load.getValue(1));
           RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
 
@@ -4199,6 +4457,9 @@
     case MVT::i1:
     case MVT::i32:
     case MVT::i64:
+      // These can be scalar arguments or elements of an integer array type
+      // passed directly.  Clang may use those instead of "byval" aggregate
+      // types to avoid forcing arguments to memory unnecessarily.
       if (GPR_idx != NumGPRs) {
         RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
       } else {
@@ -4209,39 +4470,70 @@
       ArgOffset += PtrByteSize;
       break;
     case MVT::f32:
-    case MVT::f64:
-      if (FPR_idx != NumFPRs) {
+    case MVT::f64: {
+      // These can be scalar arguments or elements of a float array type
+      // passed directly.  The latter are used to implement ELFv2 homogeneous
+      // float aggregates.
+
+      // Named arguments go into FPRs first, and once they overflow, the
+      // remaining arguments go into GPRs and then the parameter save area.
+      // Unnamed arguments for vararg functions always go to GPRs and
+      // then the parameter save area.  For now, put all arguments to vararg
+      // routines always in both locations (FPR *and* GPR or stack slot).
+      bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+
+      // First load the argument into the next available FPR.
+      if (FPR_idx != NumFPRs)
         RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
 
-        if (isVarArg) {
-          // A single float or an aggregate containing only a single float
-          // must be passed right-justified in the stack doubleword, and
-          // in the GPR, if one is available.
-          SDValue StoreOff;
-          if (Arg.getSimpleValueType().SimpleTy == MVT::f32 &&
-              !isLittleEndian) {
-            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
-            StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
-          } else
-            StoreOff = PtrOff;
+      // Next, load the argument into GPR or stack slot if needed.
+      if (!NeedGPROrStack)
+        ;
+      else if (GPR_idx != NumGPRs) {
+        // In the non-vararg case, this can only ever happen in the
+        // presence of f32 array types, since otherwise we never run
+        // out of FPRs before running out of GPRs.
+        SDValue ArgVal;
 
-          SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
-                                       MachinePointerInfo(), false, false, 0);
-          MemOpChains.push_back(Store);
+        // Double values are always passed in a single GPR.
+        if (Arg.getValueType() != MVT::f32) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
 
-          // Float varargs are always shadowed in available integer registers
-          if (GPR_idx != NumGPRs) {
-            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
-                                       MachinePointerInfo(), false, false,
-                                       false, 0);
-            MemOpChains.push_back(Load.getValue(1));
-            RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
-          }
-        }
+        // Non-array float values are extended and passed in a GPR.
+        } else if (!Flags.isInConsecutiveRegs()) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+
+        // If we have an array of floats, we collect every odd element
+        // together with its predecessor into one GPR.
+        } else if (ArgOffset % PtrByteSize != 0) {
+          SDValue Lo, Hi;
+          Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+          Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          if (!isLittleEndian)
+            std::swap(Lo, Hi);
+          ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+        // The final element, if even, goes into the first half of a GPR.
+        } else if (Flags.isInConsecutiveRegsLast()) {
+          ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+          ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+          if (!isLittleEndian)
+            ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+                                 DAG.getConstant(32, MVT::i32));
+
+        // Non-final even elements are skipped; they will be handled
+        // together with the subsequent argument on the next go-around.
+        } else
+          ArgVal = SDValue();
+
+        if (ArgVal.getNode())
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal));
       } else {
         // Single-precision floating-point values are mapped to the
         // second (rightmost) word of the stack doubleword.
-        if (Arg.getValueType() == MVT::f32 && !isLittleEndian) {
+        if (Arg.getValueType() == MVT::f32 &&
+            !isLittleEndian && !Flags.isInConsecutiveRegs()) {
           SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
           PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
         }
@@ -4250,14 +4542,25 @@
                          true, isTailCall, false, MemOpChains,
                          TailCallArguments, dl);
       }
-      ArgOffset += 8;
+      // When passing an array of floats, the array occupies consecutive
+      // space in the argument area; only round up to the next doubleword
+      // at the end of the array.  Otherwise, each float takes 8 bytes.
+      ArgOffset += (Arg.getValueType() == MVT::f32 &&
+                    Flags.isInConsecutiveRegs()) ? 4 : 8;
+      if (Flags.isInConsecutiveRegsLast())
+        ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
       break;
+    }
     case MVT::v4f32:
     case MVT::v4i32:
     case MVT::v8i16:
     case MVT::v16i8:
     case MVT::v2f64:
     case MVT::v2i64:
+      // These can be scalar arguments or elements of a vector array type
+      // passed directly.  The latter are used to implement ELFv2 homogeneous
+      // vector aggregates.
+
       // For a varargs call, named arguments go into VRs or on the stack as
       // usual; unnamed arguments always go to the stack or the corresponding
       // GPRs when within range.  For now, we always put the value in both
@@ -4328,11 +4631,16 @@
     // Load r2 into a virtual register and store it to the TOC save area.
     SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
     // TOC save area offset.
-    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset();
+    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
     SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset);
     SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
     Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
                          false, false, 0);
+    // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
+    // This does not mean the MTCTR instruction must use R12; it's easier
+    // to model this as an extra parameter, so do that.
+    if (isELFv2ABI)
+      RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
   }
 
   // Build a sequence of copy-to-reg nodes chained together with token chain
@@ -4383,7 +4691,8 @@
   // Count how many bytes are to be pushed on the stack, including the linkage
   // area, and parameter passing area.  We start with 24/48 bytes, which is
   // prereserved space for [SP][CR][LR][3 x unused].
-  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true);
+  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
+                                                          false);
   unsigned NumBytes = LinkageSize;
 
   // Add up all the space actually used.
@@ -4522,7 +4831,7 @@
         if (GPR_idx != NumGPRs) {
           SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                         MachinePointerInfo(), VT,
-                                        false, false, 0);
+                                        false, false, false, 0);
           MemOpChains.push_back(Load.getValue(1));
           RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
 
@@ -4751,8 +5060,7 @@
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
-                 RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_PPC);
 }
 
@@ -4764,8 +5072,8 @@
                                SDLoc dl, SelectionDAG &DAG) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
 
   SDValue Flag;
@@ -5773,15 +6081,15 @@
     if (PPC::isSplatShuffleMask(SVOp, 1) ||
         PPC::isSplatShuffleMask(SVOp, 2) ||
         PPC::isSplatShuffleMask(SVOp, 4) ||
-        PPC::isVPKUWUMShuffleMask(SVOp, true, DAG) ||
-        PPC::isVPKUHUMShuffleMask(SVOp, true, DAG) ||
-        PPC::isVSLDOIShuffleMask(SVOp, true, DAG) != -1 ||
-        PPC::isVMRGLShuffleMask(SVOp, 1, true, DAG) ||
-        PPC::isVMRGLShuffleMask(SVOp, 2, true, DAG) ||
-        PPC::isVMRGLShuffleMask(SVOp, 4, true, DAG) ||
-        PPC::isVMRGHShuffleMask(SVOp, 1, true, DAG) ||
-        PPC::isVMRGHShuffleMask(SVOp, 2, true, DAG) ||
-        PPC::isVMRGHShuffleMask(SVOp, 4, true, DAG)) {
+        PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
+        PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
+        PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
+        PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
+        PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
+        PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
+        PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
+        PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
+        PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG)) {
       return Op;
     }
   }
@@ -5789,15 +6097,16 @@
   // Altivec has a variety of "shuffle immediates" that take two vector inputs
   // and produce a fixed permutation.  If any of these match, do not lower to
   // VPERM.
-  if (PPC::isVPKUWUMShuffleMask(SVOp, false, DAG) ||
-      PPC::isVPKUHUMShuffleMask(SVOp, false, DAG) ||
-      PPC::isVSLDOIShuffleMask(SVOp, false, DAG) != -1 ||
-      PPC::isVMRGLShuffleMask(SVOp, 1, false, DAG) ||
-      PPC::isVMRGLShuffleMask(SVOp, 2, false, DAG) ||
-      PPC::isVMRGLShuffleMask(SVOp, 4, false, DAG) ||
-      PPC::isVMRGHShuffleMask(SVOp, 1, false, DAG) ||
-      PPC::isVMRGHShuffleMask(SVOp, 2, false, DAG) ||
-      PPC::isVMRGHShuffleMask(SVOp, 4, false, DAG))
+  unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
+  if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+      PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
+      PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
+      PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+      PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+      PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
+      PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
+      PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
+      PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG))
     return Op;
 
   // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
@@ -6252,11 +6561,44 @@
 //  Other Lowering Code
 //===----------------------------------------------------------------------===//
 
+static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
+  // Fetch the declaration from the enclosing module and emit a call to it.
+  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
+  return Builder.CreateCall(Intrinsic::getDeclaration(Mod, Id));
+}
+
+// The mappings for emitLeading/TrailingFence are taken from
+// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+                                         AtomicOrdering Ord, bool IsStore,
+                                         bool IsLoad) const {
+  // Emits sync for seq_cst, lwsync when isAtLeastRelease(Ord), else nothing.
+  if (Ord == SequentiallyConsistent)
+    return callIntrinsic(Builder, Intrinsic::ppc_sync);
+  if (!isAtLeastRelease(Ord))
+    return nullptr;
+  return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+}
+
+Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+                                          AtomicOrdering Ord, bool IsStore,
+                                          bool IsLoad) const {
+  // Only an IsLoad operation satisfying isAtLeastAcquire(Ord) gets an lwsync.
+  if (!IsLoad || !isAtLeastAcquire(Ord))
+    return nullptr;
+  // FIXME: this is too conservative, a dependent branch + isync is enough.
+  // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+  // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+  // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+  return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
+}
+
 MachineBasicBlock *
 PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                     bool is64bit, unsigned BinOpcode) const {
   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
 
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction *F = BB->getParent();
@@ -6318,7 +6660,8 @@
                                             bool is8bit,    // operation
                                             unsigned BinOpcode) const {
   // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
   // In 64 bit mode we have to use 64 bits for addresses, even though the
   // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
   // registers without caring whether they're 32 or 64, but here we're
@@ -6446,7 +6789,8 @@
 PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
                                     MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
 
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -6545,7 +6889,7 @@
   // Setup
   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
   const PPCRegisterInfo *TRI =
-    static_cast<const PPCRegisterInfo*>(getTargetMachine().getRegisterInfo());
+      getTargetMachine().getSubtarget<PPCSubtarget>().getRegisterInfo();
   MIB.addRegMask(TRI->getNoPreservedMask());
 
   BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
@@ -6594,7 +6938,8 @@
 PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
                                      MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
 
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -6613,7 +6958,10 @@
   // Since FP is only updated here but NOT referenced, it's treated as GPR.
   unsigned FP  = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
   unsigned SP  = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
-  unsigned BP  = (PVT == MVT::i64) ? PPC::X30 : PPC::R30;
+  unsigned BP  = (PVT == MVT::i64) ? PPC::X30 :
+                  (Subtarget.isSVR4ABI() &&
+                   MF->getTarget().getRelocationModel() == Reloc::PIC_ ?
+                     PPC::R29 : PPC::R30);
 
   MachineInstrBuilder MIB;
 
@@ -6703,7 +7051,8 @@
     return emitEHSjLjLongJmp(MI, BB);
   }
 
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetInstrInfo *TII =
+      getTargetMachine().getSubtargetImpl()->getInstrInfo();
 
   // To "insert" these instructions we actually have to insert their
   // control-flow patterns.
@@ -6726,7 +7075,8 @@
     Cond.push_back(MI->getOperand(1));
 
     DebugLoc dl = MI->getDebugLoc();
-    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+    const TargetInstrInfo *TII =
+        getTargetMachine().getSubtargetImpl()->getInstrInfo();
     TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(),
                       Cond, MI->getOperand(2).getReg(),
                       MI->getOperand(3).getReg());
@@ -6735,11 +7085,15 @@
              MI->getOpcode() == PPC::SELECT_CC_F4 ||
              MI->getOpcode() == PPC::SELECT_CC_F8 ||
              MI->getOpcode() == PPC::SELECT_CC_VRRC ||
+             MI->getOpcode() == PPC::SELECT_CC_VSFRC ||
+             MI->getOpcode() == PPC::SELECT_CC_VSRC ||
              MI->getOpcode() == PPC::SELECT_I4 ||
              MI->getOpcode() == PPC::SELECT_I8 ||
              MI->getOpcode() == PPC::SELECT_F4 ||
              MI->getOpcode() == PPC::SELECT_F8 ||
-             MI->getOpcode() == PPC::SELECT_VRRC) {
+             MI->getOpcode() == PPC::SELECT_VRRC ||
+             MI->getOpcode() == PPC::SELECT_VSFRC ||
+             MI->getOpcode() == PPC::SELECT_VSRC) {
     // The incoming instruction knows the destination vreg to set, the
     // condition code register to branch on, the true/false values to
     // select between, and a branch opcode to use.
@@ -6770,7 +7124,9 @@
         MI->getOpcode() == PPC::SELECT_I8 ||
         MI->getOpcode() == PPC::SELECT_F4 ||
         MI->getOpcode() == PPC::SELECT_F8 ||
-        MI->getOpcode() == PPC::SELECT_VRRC) {
+        MI->getOpcode() == PPC::SELECT_VRRC ||
+        MI->getOpcode() == PPC::SELECT_VSFRC ||
+        MI->getOpcode() == PPC::SELECT_VSRC) {
       BuildMI(BB, dl, TII->get(PPC::BC))
         .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
     } else {
@@ -7131,151 +7487,54 @@
 // Target Optimization Hooks
 //===----------------------------------------------------------------------===//
 
-SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
-                                               DAGCombinerInfo &DCI) const {
-  if (DCI.isAfterLegalizeVectorOps())
-    return SDValue();
-
-  EVT VT = Op.getValueType();
-
-  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
-      (VT == MVT::f64 && Subtarget.hasFRE())  ||
-      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
-      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
-
-    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
-    // For the reciprocal, we need to find the zero of the function:
-    //   F(X) = A X - 1 [which has a zero at X = 1/A]
-    //     =>
-    //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
-    //     does not require additional intermediate precision]
-
-    // Convergence is quadratic, so we essentially double the number of digits
-    // correct after every iteration. The minimum architected relative
-    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
-    // 23 digits and double has 52 digits.
-    int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
-    if (VT.getScalarType() == MVT::f64)
-      ++Iterations;
-
-    SelectionDAG &DAG = DCI.DAG;
-    SDLoc dl(Op);
-
-    SDValue FPOne =
-      DAG.getConstantFP(1.0, VT.getScalarType());
-    if (VT.isVector()) {
-      assert(VT.getVectorNumElements() == 4 &&
-             "Unknown vector type");
-      FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                          FPOne, FPOne, FPOne, FPOne);
-    }
-
-    SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op);
-    DCI.AddToWorklist(Est.getNode());
-
-    // Newton iterations: Est = Est + Est (1 - Arg * Est)
-    for (int i = 0; i < Iterations; ++i) {
-      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst);
-      DCI.AddToWorklist(Est.getNode());
-    }
-
-    return Est;
-  }
-
-  return SDValue();
-}
-
-SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
-                                             DAGCombinerInfo &DCI) const {
-  if (DCI.isAfterLegalizeVectorOps())
-    return SDValue();
-
-  EVT VT = Op.getValueType();
-
+SDValue PPCTargetLowering::getRsqrtEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
+                                            bool &UseOneConstNR) const {
+  EVT VT = Operand.getValueType();
   if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
       (VT == MVT::f64 && Subtarget.hasFRSQRTE())  ||
       (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
       (VT == MVT::v2f64 && Subtarget.hasVSX())) {
-
-    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
-    // For the reciprocal sqrt, we need to find the zero of the function:
-    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
-    //     =>
-    //   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
-    // As a result, we precompute A/2 prior to the iteration loop.
-
     // Convergence is quadratic, so we essentially double the number of digits
-    // correct after every iteration. The minimum architected relative
-    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
-    // 23 digits and double has 52 digits.
-    int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
+    // correct after every iteration. For both FRE and FRSQRTE, the minimum
+    // architected relative accuracy is 2^-5. When hasRecipPrec(), this is
+    // 2^-14. IEEE float has 23 digits and double has 52 digits.
+    RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
     if (VT.getScalarType() == MVT::f64)
-      ++Iterations;
-
-    SelectionDAG &DAG = DCI.DAG;
-    SDLoc dl(Op);
-
-    SDValue FPThreeHalves =
-      DAG.getConstantFP(1.5, VT.getScalarType());
-    if (VT.isVector()) {
-      assert(VT.getVectorNumElements() == 4 &&
-             "Unknown vector type");
-      FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                                  FPThreeHalves, FPThreeHalves,
-                                  FPThreeHalves, FPThreeHalves);
-    }
-
-    SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op);
-    DCI.AddToWorklist(Est.getNode());
-
-    // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
-    // this entire sequence requires only one FP constant.
-    SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op);
-    DCI.AddToWorklist(HalfArg.getNode());
-
-    HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op);
-    DCI.AddToWorklist(HalfArg.getNode());
-
-    // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
-    for (int i = 0; i < Iterations; ++i) {
-      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst);
-      DCI.AddToWorklist(NewEst.getNode());
-
-      Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
-      DCI.AddToWorklist(Est.getNode());
-    }
-
-    return Est;
+      ++RefinementSteps;
+    UseOneConstNR = true;
+    return DCI.DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
   }
-
   return SDValue();
 }
 
-// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
-// not enforce equality of the chain operands.
-static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
+SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const {
+  EVT VT = Operand.getValueType();
+  bool HasEstimate = (VT == MVT::f32 && Subtarget.hasFRES()) ||
+                     (VT == MVT::f64 && Subtarget.hasFRE()) ||
+                     (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
+                     (VT == MVT::v2f64 && Subtarget.hasVSX());
+  if (!HasEstimate)
+    return SDValue();
+
+  // Each refinement step roughly doubles the number of correct digits. The
+  // minimum architected relative accuracy of FRE and FRSQRTE is 2^-5, or
+  // 2^-14 when hasRecipPrec(). IEEE float has 23 digits and double has 52
+  // digits, so f64 element types take one extra step.
+  const bool IsF64 = VT.getScalarType() == MVT::f64;
+  RefinementSteps = (Subtarget.hasRecipPrec() ? 1 : 3) + (IsF64 ? 1 : 0);
+  return DCI.DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
+}
+
+static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
                             unsigned Bytes, int Dist,
                             SelectionDAG &DAG) {
-  EVT VT = LS->getMemoryVT();
   if (VT.getSizeInBits() / 8 != Bytes)
     return false;
 
-  SDValue Loc = LS->getBasePtr();
   SDValue BaseLoc = Base->getBasePtr();
   if (Loc.getOpcode() == ISD::FrameIndex) {
     if (BaseLoc.getOpcode() != ISD::FrameIndex)
@@ -7306,11 +7565,77 @@
   return false;
 }
 
+// Like SelectionDAG::isConsecutiveLoad, but also works for stores and the PPC
+// load/store intrinsics listed below, and does not require equal chains.
+static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
+                            unsigned Bytes, int Dist,
+                            SelectionDAG &DAG) {
+  if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
+    EVT VT = LS->getMemoryVT();
+    SDValue Loc = LS->getBasePtr();
+    return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
+  }
+  // Chained intrinsics: the memory type is implied by the intrinsic ID.
+  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+    EVT VT;
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default: return false;
+    case Intrinsic::ppc_altivec_lvx:
+    case Intrinsic::ppc_altivec_lvxl:
+    case Intrinsic::ppc_vsx_lxvw4x:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_altivec_lvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_lvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_lvewx:
+      VT = MVT::i32;
+      break;
+    }
+    // Operand 2 is used as the address of the intrinsic load.
+    return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
+  }
+  // Void intrinsics: the same mapping for the store counterparts.
+  if (N->getOpcode() == ISD::INTRINSIC_VOID) {
+    EVT VT;
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    default: return false;
+    case Intrinsic::ppc_altivec_stvx:
+    case Intrinsic::ppc_altivec_stvxl:
+    case Intrinsic::ppc_vsx_stxvw4x:
+      VT = MVT::v4i32;
+      break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
+    case Intrinsic::ppc_altivec_stvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_stvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_stvewx:
+      VT = MVT::i32;
+      break;
+    }
+    // Operand 3 is used as the address of the intrinsic store.
+    return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
+  }
+  // Any other node kind is not treated as a consecutive access.
+  return false;
+}
+
 // Return true is there is a nearyby consecutive load to the one provided
 // (regardless of alignment). We search up and down the chain, looking though
-// token factors and other loads (but nothing else). As a result, a true
-// results indicates that it is safe to create a new consecutive load adjacent
-// to the load provided.
+// token factors and other loads (but nothing else). As a result, a true result
+// indicates that it is safe to create a new consecutive load adjacent to the
+// load provided.
 static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
   SDValue Chain = LD->getChain();
   EVT VT = LD->getMemoryVT();
@@ -7324,10 +7649,10 @@
   // nodes just above the top-level loads and token factors.
   while (!Queue.empty()) {
     SDNode *ChainNext = Queue.pop_back_val();
-    if (!Visited.insert(ChainNext))
+    if (!Visited.insert(ChainNext).second)
       continue;
 
-    if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
+    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
       if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
         return true;
 
@@ -7355,17 +7680,17 @@
        
     while (!Queue.empty()) {
       SDNode *LoadRoot = Queue.pop_back_val();
-      if (!Visited.insert(LoadRoot))
+      if (!Visited.insert(LoadRoot).second)
         continue;
 
-      if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
+      if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
         if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
           return true;
 
       for (SDNode::use_iterator UI = LoadRoot->use_begin(),
            UE = LoadRoot->use_end(); UI != UE; ++UI)
-        if (((isa<LoadSDNode>(*UI) &&
-            cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+        if (((isa<MemSDNode>(*UI) &&
+            cast<MemSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
             UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
           Queue.push_back(*UI);
     }
@@ -7485,7 +7810,7 @@
     SDValue BinOp = BinOps.back();
     BinOps.pop_back();
 
-    if (!Visited.insert(BinOp.getNode()))
+    if (!Visited.insert(BinOp.getNode()).second)
       continue;
 
     PromOps.push_back(BinOp);
@@ -7699,7 +8024,7 @@
     SDValue BinOp = BinOps.back();
     BinOps.pop_back();
 
-    if (!Visited.insert(BinOp.getNode()))
+    if (!Visited.insert(BinOp.getNode()).second)
       continue;
 
     PromOps.push_back(BinOp);
@@ -7936,92 +8261,6 @@
   case ISD::SETCC:
   case ISD::SELECT_CC:
     return DAGCombineTruncBoolExt(N, DCI);
-  case ISD::FDIV: {
-    assert(TM.Options.UnsafeFPMath &&
-           "Reciprocal estimates require UnsafeFPMath");
-
-    if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
-      SDValue RV =
-        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI);
-      if (RV.getNode()) {
-        DCI.AddToWorklist(RV.getNode());
-        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                           N->getOperand(0), RV);
-      }
-    } else if (N->getOperand(1).getOpcode() == ISD::FP_EXTEND &&
-               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
-      SDValue RV =
-        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
-                                 DCI);
-      if (RV.getNode()) {
-        DCI.AddToWorklist(RV.getNode());
-        RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)),
-                         N->getValueType(0), RV);
-        DCI.AddToWorklist(RV.getNode());
-        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                           N->getOperand(0), RV);
-      }
-    } else if (N->getOperand(1).getOpcode() == ISD::FP_ROUND &&
-               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
-      SDValue RV =
-        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
-                                 DCI);
-      if (RV.getNode()) {
-        DCI.AddToWorklist(RV.getNode());
-        RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)),
-                         N->getValueType(0), RV,
-                         N->getOperand(1).getOperand(1));
-        DCI.AddToWorklist(RV.getNode());
-        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                           N->getOperand(0), RV);
-      }
-    }
-
-    SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI);
-    if (RV.getNode()) {
-      DCI.AddToWorklist(RV.getNode());
-      return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
-                         N->getOperand(0), RV);
-    }
-
-    }
-    break;
-  case ISD::FSQRT: {
-    assert(TM.Options.UnsafeFPMath &&
-           "Reciprocal estimates require UnsafeFPMath");
-
-    // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
-    // reciprocal sqrt.
-    SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI);
-    if (RV.getNode()) {
-      DCI.AddToWorklist(RV.getNode());
-      RV = DAGCombineFastRecip(RV, DCI);
-      if (RV.getNode()) {
-        // Unfortunately, RV is now NaN if the input was exactly 0. Select out
-        // this case and force the answer to 0.
-
-        EVT VT = RV.getValueType();
-
-        SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType());
-        if (VT.isVector()) {
-          assert(VT.getVectorNumElements() == 4 && "Unknown vector type");
-          Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero);
-        }
-
-        SDValue ZeroCmp =
-          DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT),
-                       N->getOperand(0), Zero, ISD::SETEQ);
-        DCI.AddToWorklist(ZeroCmp.getNode());
-        DCI.AddToWorklist(RV.getNode());
-
-        RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT,
-                         ZeroCmp, Zero, RV);
-        return RV;
-      }
-    }
-
-    }
-    break;
   case ISD::SINT_TO_FP:
     if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
       if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
@@ -8112,6 +8351,8 @@
     unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
     if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
         TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
+        // P8 and later hardware should just use LOAD.
+        !TM.getSubtarget<PPCSubtarget>().hasP8Vector() &&
         (VT == MVT::v16i8 || VT == MVT::v8i16 ||
          VT == MVT::v4i32 || VT == MVT::v4f32) &&
         LD->getAlignment() < ABIAlignment) {
@@ -8149,17 +8390,25 @@
                             Intrinsic::ppc_altivec_lvsl);
       SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);
 
-      // Refine the alignment of the original load (a "new" load created here
-      // which was identical to the first except for the alignment would be
-      // merged with the existing node regardless).
+      // Create the new MMO for the new base load. It is like the original MMO,
+      // but represents an area in memory almost twice the vector size centered
+      // on the original address. If the address is unaligned, we might start
+      // reading up to (sizeof(vector)-1) bytes below the address of the
+      // original unaligned load.
       MachineFunction &MF = DAG.getMachineFunction();
-      MachineMemOperand *MMO =
-        MF.getMachineMemOperand(LD->getPointerInfo(),
-                                LD->getMemOperand()->getFlags(),
-                                LD->getMemoryVT().getStoreSize(),
-                                ABIAlignment);
-      LD->refineAlignment(MMO);
-      SDValue BaseLoad = SDValue(LD, 0);
+      MachineMemOperand *BaseMMO =
+        MF.getMachineMemOperand(LD->getMemOperand(),
+                                -LD->getMemoryVT().getStoreSize()+1,
+                                2*LD->getMemoryVT().getStoreSize()-1);
+
+      // Create the new base load.
+      SDValue LDXIntID = DAG.getTargetConstant(Intrinsic::ppc_altivec_lvx,
+                                               getPointerTy());
+      SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
+      SDValue BaseLoad =
+        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+                                DAG.getVTList(MVT::v4i32, MVT::Other),
+                                BaseLoadOps, MVT::v4i32, BaseMMO);
 
       // Note that the value of IncOffset (which is provided to the next
       // load's pointer info offset value, and thus used to calculate the
@@ -8181,21 +8430,18 @@
       SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
 
+      MachineMemOperand *ExtraMMO =
+        MF.getMachineMemOperand(LD->getMemOperand(),
+                                1, 2*LD->getMemoryVT().getStoreSize()-1);
+      SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
       SDValue ExtraLoad =
-        DAG.getLoad(VT, dl, Chain, Ptr,
-                    LD->getPointerInfo().getWithOffset(IncOffset),
-                    LD->isVolatile(), LD->isNonTemporal(),
-                    LD->isInvariant(), ABIAlignment);
+        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
+                                DAG.getVTList(MVT::v4i32, MVT::Other),
+                                ExtraLoadOps, MVT::v4i32, ExtraMMO);
 
       SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
         BaseLoad.getValue(1), ExtraLoad.getValue(1));
 
-      if (BaseLoad.getValueType() != MVT::v4i32)
-        BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad);
-
-      if (ExtraLoad.getValueType() != MVT::v4i32)
-        ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
-
       // Because vperm has a big-endian bias, we must reverse the order
       // of the input vectors and complement the permute control vector
       // when generating little endian code.  We have already handled the
@@ -8212,36 +8458,9 @@
       if (VT != MVT::v4i32)
         Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
 
-      // Now we need to be really careful about how we update the users of the
-      // original load. We cannot just call DCI.CombineTo (or
-      // DAG.ReplaceAllUsesWith for that matter), because the load still has
-      // uses created here (the permutation for example) that need to stay.
-      SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
-      while (UI != UE) {
-        SDUse &Use = UI.getUse();
-        SDNode *User = *UI;
-        // Note: BaseLoad is checked here because it might not be N, but a
-        // bitcast of N.
-        if (User == Perm.getNode() || User == BaseLoad.getNode() ||
-            User == TF.getNode() || Use.getResNo() > 1) {
-          ++UI;
-          continue;
-        }
-
-        SDValue To = Use.getResNo() ? TF : Perm;
-        ++UI;
-
-        SmallVector<SDValue, 8> Ops;
-        for (const SDUse &O : User->ops()) {
-          if (O == Use)
-            Ops.push_back(To);
-          else
-            Ops.push_back(O);
-        }
-
-        DAG.UpdateNodeOperands(User, Ops);
-      }
-
+      // The output of the permutation is our loaded result, the TokenFactor is
+      // our new chain.
+      DCI.CombineTo(N, Perm, TF);
       return SDValue(N, 0);
     }
     }
@@ -8659,7 +8878,8 @@
   // the AsmName field from *RegisterInfo.td, then this would not be necessary.
   if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
       PPC::GPRCRegClass.contains(R.first)) {
-    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    const TargetRegisterInfo *TRI =
+        getTargetMachine().getSubtargetImpl()->getRegisterInfo();
     return std::make_pair(TRI->getMatchingSuperReg(R.first,
                             PPC::sub_32, &PPC::G8RCRegClass),
                           &PPC::G8RCRegClass);
@@ -8872,6 +9092,92 @@
   return false;
 }
 
+bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+  // Fills Info for the PPC vector load/store intrinsics; false otherwise.
+  switch (Intrinsic) {
+  case Intrinsic::ppc_altivec_lvx:
+  case Intrinsic::ppc_altivec_lvxl:
+  case Intrinsic::ppc_altivec_lvebx:
+  case Intrinsic::ppc_altivec_lvehx:
+  case Intrinsic::ppc_altivec_lvewx:
+  case Intrinsic::ppc_vsx_lxvd2x:
+  case Intrinsic::ppc_vsx_lxvw4x: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_altivec_lvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_lvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_lvewx:
+      VT = MVT::i32;
+      break;
+    case Intrinsic::ppc_vsx_lxvd2x:
+      VT = MVT::v2f64;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+    // Window of 2*size-1 bytes beginning size-1 bytes below the pointer.
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = -VT.getStoreSize()+1;
+    Info.size = 2*VT.getStoreSize()-1;
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::ppc_altivec_stvx:
+  case Intrinsic::ppc_altivec_stvxl:
+  case Intrinsic::ppc_altivec_stvebx:
+  case Intrinsic::ppc_altivec_stvehx:
+  case Intrinsic::ppc_altivec_stvewx:
+  case Intrinsic::ppc_vsx_stxvd2x:
+  case Intrinsic::ppc_vsx_stxvw4x: {
+    EVT VT;
+    switch (Intrinsic) {
+    case Intrinsic::ppc_altivec_stvebx:
+      VT = MVT::i8;
+      break;
+    case Intrinsic::ppc_altivec_stvehx:
+      VT = MVT::i16;
+      break;
+    case Intrinsic::ppc_altivec_stvewx:
+      VT = MVT::i32;
+      break;
+    case Intrinsic::ppc_vsx_stxvd2x:
+      VT = MVT::v2f64;
+      break;
+    default:
+      VT = MVT::v4i32;
+      break;
+    }
+    // Same window as for the loads; the pointer is call argument 1 here.
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = VT;
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = -VT.getStoreSize()+1;
+    Info.size = 2*VT.getStoreSize()-1;
+    Info.align = 1;
+    Info.vol = false;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
+  }
+  // Any other intrinsic is not described here.
+  return false;
+}
+
 /// getOptimalMemOpType - Returns the target specific optimal type for load
 /// and store operations as a result of memset, memcpy, and memmove
 /// lowering. If DstAlign is zero that means it's safe to destination
@@ -8931,9 +9237,10 @@
   return isInt<16>(Imm) || isUInt<16>(Imm);
 }
 
-bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                      unsigned,
-                                                      bool *Fast) const {
+bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                       unsigned,
+                                                       unsigned,
+                                                       bool *Fast) const {
   if (DisablePPCUnaligned)
     return false;
 
@@ -8948,7 +9255,8 @@
 
   if (VT.getSimpleVT().isVector()) {
     if (Subtarget.hasVSX()) {
-      if (VT != MVT::v2f64 && VT != MVT::v2i64)
+      if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
+          VT != MVT::v4f32 && VT != MVT::v4i32)
         return false;
     } else {
       return false;

diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index df05aa5..bb4d1f1 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
-#define LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
+#define LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
 
 #include "PPC.h"
 #include "PPCInstrInfo.h"
@@ -99,6 +99,10 @@
       /// SVR4 calls.
       CALL, CALL_NOP,
 
+      /// CALL_TLS and CALL_NOP_TLS - Versions of CALL and CALL_NOP used
+      /// to access TLS variables.
+      CALL_TLS, CALL_NOP_TLS,
+
       /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
       /// MTCTR instruction.
       MTCTR,
@@ -181,6 +185,10 @@
       /// on PPC32.
       PPC32_GOT,
 
+      /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
+      /// local dynamic TLS on PPC32.
+      PPC32_PICGOT,
+
       /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
       /// TLS model, produces an ADDIS8 instruction that adds the GOT
       /// base to sym\@got\@tprel\@ha.
@@ -210,10 +218,6 @@
       /// sym\@got\@tlsgd\@l.
       ADDI_TLSGD_L,
 
-      /// G8RC = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS
-      /// model, produces a call to __tls_get_addr(sym\@tlsgd).
-      GET_TLS_ADDR,
-
       /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS
       /// model, produces an ADDIS8 instruction that adds the GOT base
       /// register to sym\@got\@tlsld\@ha.
@@ -224,10 +228,6 @@
       /// sym\@got\@tlsld\@l.
       ADDI_TLSLD_L,
 
-      /// G8RC = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS
-      /// model, produces a call to __tls_get_addr(sym\@tlsld).
-      GET_TLSLD_ADDR,
-
       /// G8RC = ADDIS_DTPREL_HA %X3, Symbol, Chain - For the
       /// local-dynamic TLS model, produces an ADDIS8 instruction
       /// that adds X3 to sym\@dtprel\@ha. The Chain operand is needed
@@ -297,27 +297,28 @@
   namespace PPC {
     /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
     /// VPKUHUM instruction.
-    bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+    bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG);
 
     /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
     /// VPKUWUM instruction.
-    bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
+    bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
                               SelectionDAG &DAG);
 
     /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
     /// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
     bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
-                            bool isUnary, SelectionDAG &DAG);
+                            unsigned ShuffleKind, SelectionDAG &DAG);
 
     /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
     /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
     bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
-                            bool isUnary, SelectionDAG &DAG);
+                            unsigned ShuffleKind, SelectionDAG &DAG);
 
-    /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
-    /// amount, otherwise return -1.
-    int isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG);
+    /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
+    /// shift amount, otherwise return -1.
+    int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+                            SelectionDAG &DAG);
 
     /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
     /// specifies a splat of a single element that is suitable for input to
@@ -344,7 +345,7 @@
     const PPCSubtarget &Subtarget;
 
   public:
-    explicit PPCTargetLowering(PPCTargetMachine &TM);
+    explicit PPCTargetLowering(const PPCTargetMachine &TM);
 
     /// getTargetNodeName() - This method returns the name of a target specific
     /// DAG node.
@@ -355,6 +356,11 @@
     /// getSetCCResultType - Return the ISD::SETCC ValueType
     EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
+    /// Return true if target always benefits from combining into FMA for a
+    /// given value type. This must typically return false on targets where FMA
+    /// takes more cycles to execute than FADD.
+    bool enableAggressiveFMAFusion(EVT VT) const override;
+
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
     /// can be legally represented as pre-indexed load / store address.
@@ -403,6 +409,11 @@
                                        const SelectionDAG &DAG,
                                        unsigned Depth = 0) const override;
 
+    Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                                  bool IsStore, bool IsLoad) const override;
+    Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
+                                   bool IsStore, bool IsLoad) const override;
+
     MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const override;
@@ -472,6 +483,10 @@
 
     bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
 
+    bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+                            const CallInst &I,
+                            unsigned Intrinsic) const override;
+
     /// getOptimalMemOpType - Returns the target specific optimal type for load
     /// and store operations as a result of memset, memcpy, and memmove
     /// lowering. If DstAlign is zero that means it's safe to destination
@@ -490,9 +505,10 @@
 
     /// Is unaligned memory access allowed for the given type, and is it fast
     /// relative to software emulation.
-    bool allowsUnalignedMemoryAccesses(EVT VT,
-                                       unsigned AddrSpace,
-                                       bool *Fast = nullptr) const override;
+    bool allowsMisalignedMemoryAccesses(EVT VT,
+                                        unsigned AddrSpace,
+                                        unsigned Align = 1,
+                                        bool *Fast = nullptr) const override;
 
     /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
     /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
@@ -510,6 +526,20 @@
     FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
                              const TargetLibraryInfo *LibInfo) const override;
 
+    /// \brief Returns true if an argument of type Ty needs to be passed in a
+    /// contiguous block of registers in calling convention CallConv.
+    bool functionArgumentNeedsConsecutiveRegisters(
+      Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override {
+      // We support any array type as "consecutive" block in the parameter
+      // save area.  The element type defines the alignment requirement and
+      // whether the argument should go in GPRs, FPRs, or VRs if available.
+      //
+      // Note that clang uses this capability both to implement the ELFv2
+      // homogeneous float/vector aggregate ABI, and to avoid having to use
+      // "byval" when passing aggregates that might fully fit in registers.
+      return Ty->isArrayTy();
+    }
+
   private:
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
@@ -533,6 +563,8 @@
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    std::pair<SDValue,SDValue> lowerTLSCall(SDValue Op, SDLoc dl,
+                                            SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
@@ -666,8 +698,12 @@
 
     SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
-    SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const;
-    SDValue DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const;
+
+    SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                             unsigned &RefinementSteps,
+                             bool &UseOneConstNR) const override;
+    SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                             unsigned &RefinementSteps) const override;
 
     CCAssignFn *useFastISelCCs(unsigned Flag) const;
   };

diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 9318f70..9a19abb 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td

@@ -188,6 +188,9 @@
 def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
           (BL8_NOP texternalsym:$dst)>;
 
+def : Pat<(PPCcall_nop_tls texternalsym:$func, tglobaltlsaddr:$sym),
+          (BL8_NOP_TLS texternalsym:$func, tglobaltlsaddr:$sym)>;
+
 // Atomic operations
 let usesCustomInserter = 1 in {
   let Defs = [CR0] in {
@@ -786,7 +789,7 @@
 def LD   : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src),
                     "ld $rD, $src", IIC_LdStLD,
                     [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64;
-// The following three definitions are selected for small code model only.
+// The following four definitions are selected for small code model only.
 // Otherwise, we need to create two instructions to form a 32-bit offset,
 // so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select().
 def LDtoc: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
@@ -801,8 +804,12 @@
                   "#LDtocCPT",
                   [(set i64:$rD,
                      (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64;
+def LDtocBA: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg),
+                  "#LDtocBA",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tblockaddress:$disp, i64:$reg))]>, isPPC64;
 
-let hasSideEffects = 1, isCodeGenOnly = 1, RST = 2 in
+let hasSideEffects = 1, isCodeGenOnly = 1, RST = 2, Defs = [X2] in
 def LDinto_toc: DSForm_1<58, 0, (outs), (ins memrix:$src),
                     "ld 2, $src", IIC_LdStLD,
                     [(PPCload_toc ixaddr:$src)]>, isPPC64;
@@ -872,11 +879,6 @@
                        [(set i64:$rD,
                          (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
                  isPPC64;
-def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
-                        "#GETtlsADDR",
-                        [(set i64:$rD,
-                          (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
-                 isPPC64;
 def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDIStlsldHA",
                          [(set i64:$rD,
@@ -887,11 +889,6 @@
                        [(set i64:$rD,
                          (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
                  isPPC64;
-def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
-                          "#GETtlsldADDR",
-                          [(set i64:$rD,
-                            (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
-                   isPPC64;
 def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                           "#ADDISdtprelHA",
                           [(set i64:$rD,
@@ -1135,3 +1132,9 @@
 def : Pat<(unaligned4store i64:$rS, xoaddr:$dst),
           (STDX $rS, xoaddr:$dst)>;
 
+// 64-bit atomic loads and stores
+def : Pat<(atomic_load_64 ixaddr:$src), (LD  memrix:$src)>;
+def : Pat<(atomic_load_64 xaddr:$src),  (LDX memrr:$src)>;
+
+def : Pat<(atomic_store_64 ixaddr:$ptr, i64:$val), (STD  g8rc:$val, memrix:$ptr)>;
+def : Pat<(atomic_store_64 xaddr:$ptr,  i64:$val), (STDX g8rc:$val, memrr:$ptr)>;

diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index dce46d8..4ef08eb 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td

@@ -22,110 +22,143 @@
 
 def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                               (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), false,
-                                   *CurDAG);
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
 }]>;
 def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                               (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), false,
-                                   *CurDAG);
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG);
 }]>;
 def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                                     (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), true,
-                                   *CurDAG);
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
 }]>;
 def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                                     (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), true,
-                                   *CurDAG);
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG);
 }]>;
 
+// These fragments are provided for little-endian, where the inputs must be
+// swapped for correct semantics.
+def vpkuhum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
+def vpkuwum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG);
+}]>;
 
 def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
-  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false,
-                                 *CurDAG);
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
 }]>;
 def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
-  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false,
-                                 *CurDAG);
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
 }]>;
 def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
-  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false,
-                                 *CurDAG);
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
 }]>;
 def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
-  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false,
-                                 *CurDAG);
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG);
 }]>;
 def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
-  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false,
-                                 *CurDAG);
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG);
 }]>;
 def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
-  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false,
-                                 *CurDAG);
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG);
 }]>;
 
 
 def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                                (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
-  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true,
-                                 *CurDAG);
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
 }]>;
 def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                                    (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true,
-                                 *CurDAG);
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
 }]>;
 def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                                    (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true,
-                                 *CurDAG);
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
 }]>;
 def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                                    (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true,
-                                 *CurDAG);
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG);
 }]>;
 def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                                    (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true,
-                                 *CurDAG);
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG);
 }]>;
 def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                                    (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true,
-                                 *CurDAG);
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG);
+}]>;
+
+
+// These fragments are provided for little-endian, where the inputs must be
+// swapped for correct semantics.
+def vmrglb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                               (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
+}]>;
+def vmrglh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
+}]>;
+def vmrglw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
+}]>;
+def vmrghb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG);
+}]>;
+def vmrghh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG);
+}]>;
+def vmrghw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG);
 }]>;
 
 
 def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{
-  return getI32Imm(PPC::isVSLDOIShuffleMask(N, false, *CurDAG));
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, 0, *CurDAG));
 }]>;
 def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVSLDOIShuffleMask(N, false, *CurDAG) != -1;
+  return PPC::isVSLDOIShuffleMask(N, 0, *CurDAG) != -1;
 }], VSLDOI_get_imm>;
 
 
 /// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
 /// vector_shuffle(X,undef,mask) by the dag combiner.
 def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{
-  return getI32Imm(PPC::isVSLDOIShuffleMask(N, true, *CurDAG));
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, 1, *CurDAG));
 }]>;
 def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                                    (vector_shuffle node:$lhs, node:$rhs), [{
-  return PPC::isVSLDOIShuffleMask(N, true, *CurDAG) != -1;
+  return PPC::isVSLDOIShuffleMask(N, 1, *CurDAG) != -1;
 }], VSLDOI_unary_get_imm>;
 
 
+/// VSLDOI_swapped* - These fragments are provided for little-endian, where
+/// the inputs must be swapped for correct semantics.
+def VSLDOI_swapped_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, 2, *CurDAG));
+}]>;
+def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, 2, *CurDAG) != -1;
+}], VSLDOI_swapped_get_imm>;  // xform uses ShuffleKind 2, like the predicate.
+
+
 // VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
 def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
   return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG));
@@ -242,48 +275,64 @@
 def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">;
 let Predicates = [HasAltivec] in {
 
-let isCodeGenOnly = 1 in {
-def DSS      : DSS_Form<822, (outs),
-                        (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
-                        "dss $STRM", IIC_LdStLoad /*FIXME*/, []>,
-                        Deprecated<DeprecatedDST>;
-def DSSALL   : DSS_Form<822, (outs),
-                        (ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2),
-                        "dssall", IIC_LdStLoad /*FIXME*/, []>,
-                        Deprecated<DeprecatedDST>;
-def DST      : DSS_Form<342, (outs),
-                        (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
-                        Deprecated<DeprecatedDST>;
-def DSTT     : DSS_Form<342, (outs),
-                        (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
-                        Deprecated<DeprecatedDST>;
-def DSTST    : DSS_Form<374, (outs),
-                        (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
-                        Deprecated<DeprecatedDST>;
-def DSTSTT   : DSS_Form<374, (outs),
-                        (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+def DSS      : DSS_Form<0, 822, (outs), (ins u5imm:$STRM),
+                        "dss $STRM", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dss imm:$STRM)]>,
+                        Deprecated<DeprecatedDST> {
+  let A = 0;
+  let B = 0;
+}
+
+def DSSALL   : DSS_Form<1, 822, (outs), (ins),
+                        "dssall", IIC_LdStLoad /*FIXME*/, [(int_ppc_altivec_dssall)]>,
+                        Deprecated<DeprecatedDST> {
+  let STRM = 0;
+  let A = 0;
+  let B = 0;
+}
+
+def DST      : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                        [(int_ppc_altivec_dst i32:$rA, i32:$rB, imm:$STRM)]>,
                         Deprecated<DeprecatedDST>;
 
-def DST64    : DSS_Form<342, (outs),
-                        (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+def DSTT     : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                        [(int_ppc_altivec_dstt i32:$rA, i32:$rB, imm:$STRM)]>,
                         Deprecated<DeprecatedDST>;
-def DSTT64   : DSS_Form<342, (outs),
-                        (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+
+def DSTST    : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                        [(int_ppc_altivec_dstst i32:$rA, i32:$rB, imm:$STRM)]>,
                         Deprecated<DeprecatedDST>;
-def DSTST64  : DSS_Form<374, (outs),
-                        (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
+
+def DSTSTT   : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, gprc:$rA, gprc:$rB),
+                        "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                        [(int_ppc_altivec_dststt i32:$rA, i32:$rB, imm:$STRM)]>,
                         Deprecated<DeprecatedDST>;
-def DSTSTT64 : DSS_Form<374, (outs),
-                        (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>,
-                        Deprecated<DeprecatedDST>;
+
+let isCodeGenOnly = 1 in {
+  // The very same instructions as above, but formally matching 64bit registers.
+  def DST64    : DSS_Form<0, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                          "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                          [(int_ppc_altivec_dst i64:$rA, i32:$rB, imm:$STRM)]>,
+                          Deprecated<DeprecatedDST>;
+
+  def DSTT64   : DSS_Form<1, 342, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                          "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                          [(int_ppc_altivec_dstt i64:$rA, i32:$rB, imm:$STRM)]>,
+                          Deprecated<DeprecatedDST>;
+
+  def DSTST64  : DSS_Form<0, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                          "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                          [(int_ppc_altivec_dstst i64:$rA, i32:$rB,
+                                                  imm:$STRM)]>,
+                          Deprecated<DeprecatedDST>;
+
+  def DSTSTT64 : DSS_Form<1, 374, (outs), (ins u5imm:$STRM, g8rc:$rA, gprc:$rB),
+                          "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/,
+                          [(int_ppc_altivec_dststt i64:$rA, i32:$rB,
+                                                   imm:$STRM)]>,
+                          Deprecated<DeprecatedDST>;
 }
 
 def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
@@ -731,30 +780,6 @@
 // Additional Altivec Patterns
 //
 
-// DS* intrinsics
-def : Pat<(int_ppc_altivec_dssall), (DSSALL 1, 0, 0, 0)>;
-def : Pat<(int_ppc_altivec_dss imm:$STRM), (DSS 0, imm:$STRM, 0, 0)>;
-
-//  * 32-bit
-def : Pat<(int_ppc_altivec_dst i32:$rA, i32:$rB, imm:$STRM),
-          (DST 0, imm:$STRM, $rA, $rB)>;
-def : Pat<(int_ppc_altivec_dstt i32:$rA, i32:$rB, imm:$STRM),
-          (DSTT 1, imm:$STRM, $rA, $rB)>;
-def : Pat<(int_ppc_altivec_dstst i32:$rA, i32:$rB, imm:$STRM),
-          (DSTST 0, imm:$STRM, $rA, $rB)>;
-def : Pat<(int_ppc_altivec_dststt i32:$rA, i32:$rB, imm:$STRM),
-          (DSTSTT 1, imm:$STRM, $rA, $rB)>;
-
-//  * 64-bit
-def : Pat<(int_ppc_altivec_dst i64:$rA, i32:$rB, imm:$STRM),
-          (DST64 0, imm:$STRM, $rA, $rB)>;
-def : Pat<(int_ppc_altivec_dstt i64:$rA, i32:$rB, imm:$STRM),
-          (DSTT64 1, imm:$STRM, $rA, $rB)>;
-def : Pat<(int_ppc_altivec_dstst i64:$rA, i32:$rB, imm:$STRM),
-          (DSTST64 0, imm:$STRM, $rA, $rB)>;
-def : Pat<(int_ppc_altivec_dststt i64:$rA, i32:$rB, imm:$STRM),
-          (DSTSTT64 1, imm:$STRM, $rA, $rB)>;
-
 // Loads.
 def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
 
@@ -789,6 +814,16 @@
 def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef),
         (VPKUHUM $vA, $vA)>;
 
+// Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands.
+// These fragments are matched for little-endian, where the inputs must
+// be swapped for correct semantics.
+def:Pat<(vsldoi_swapped_shuffle:$in v16i8:$vA, v16i8:$vB),
+        (VSLDOI $vB, $vA, (VSLDOI_swapped_get_imm $in))>;
+def:Pat<(vpkuwum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VPKUWUM $vB, $vA)>;
+def:Pat<(vpkuhum_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VPKUHUM $vB, $vA)>;
+
 // Match vmrg*(x,x)
 def:Pat<(vmrglb_unary_shuffle v16i8:$vA, undef),
         (VMRGLB $vA, $vA)>;
@@ -803,6 +838,22 @@
 def:Pat<(vmrghw_unary_shuffle v16i8:$vA, undef),
         (VMRGHW $vA, $vA)>;
 
+// Match vmrg*(y,x), i.e., swapped operands.  These fragments
+// are matched for little-endian, where the inputs must be
+// swapped for correct semantics.
+def:Pat<(vmrglb_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGLB $vB, $vA)>;
+def:Pat<(vmrglh_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGLH $vB, $vA)>;
+def:Pat<(vmrglw_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGLW $vB, $vA)>;
+def:Pat<(vmrghb_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGHB $vB, $vA)>;
+def:Pat<(vmrghh_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGHH $vB, $vA)>;
+def:Pat<(vmrghw_swapped_shuffle v16i8:$vA, v16i8:$vB),
+        (VMRGHW $vB, $vA)>;
+
 // Logical Operations
 def : Pat<(vnot_ppc v4i32:$vA), (VNOR $vA, $vA)>;
 

diff --git a/lib/Target/PowerPC/PPCInstrBuilder.h b/lib/Target/PowerPC/PPCInstrBuilder.h
index b424d11..cf71b1c 100644
--- a/lib/Target/PowerPC/PPCInstrBuilder.h
+++ b/lib/Target/PowerPC/PPCInstrBuilder.h

@@ -17,8 +17,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef POWERPC_INSTRBUILDER_H
-#define POWERPC_INSTRBUILDER_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRBUILDER_H
+#define LLVM_LIB_TARGET_POWERPC_PPCINSTRBUILDER_H
 
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 

diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index 1e4396c..aa68497 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td

@@ -380,6 +380,11 @@
   let Inst{31}    = RC;
 }
 
+class XForm_tlb<bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> : XForm_base_r3xo<31, xo, OOL, IOL, asmstr, itin, []> {
+  let RST = 0;
+}
+
 // This is the same as XForm_base_r3xo, but the first two operands are swapped
 // when code is emitted.
 class XForm_base_r3xo_swapped
@@ -417,6 +422,22 @@
   let B = 0;
 }
 
+class XForm_tlbws<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RST;
+  bits<5> A;
+  bits<1> WS;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{20}    = WS;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
 class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
               InstrItinClass itin, list<dag> pattern> 
   : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
@@ -457,6 +478,52 @@
   let Inst{31}    = 0;
 }
 
+class XForm_icbt<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<4> CT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Inst{6} = 0;
+  let Inst{7-10} = CT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+  let Inst{31} = 0;
+}
+
+class XForm_sr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RS;
+  bits<4> SR;
+
+  let Inst{6-10} = RS;
+  let Inst{12-15} = SR;
+  let Inst{21-30} = xo;
+}
+
+class XForm_mbar<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> MO;
+
+  let Inst{6-10} = MO;
+  let Inst{21-30} = xo;
+}
+
+class XForm_srin<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RS;
+  bits<5> RB;
+
+  let Inst{6-10} = RS;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+}
+
 class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                 InstrItinClass itin>
          : I<opcode, OOL, IOL, asmstr, itin> {
@@ -764,10 +831,9 @@
 
 
 // DSS_Form - Form X instruction, used for altivec dss* instructions.
-class DSS_Form<bits<10> xo, dag OOL, dag IOL, string asmstr, 
+class DSS_Form<bits<1> T, bits<10> xo, dag OOL, dag IOL, string asmstr,
                       InstrItinClass itin, list<dag> pattern>
   : I<31, OOL, IOL, asmstr, itin> {
-  bits<1> T;
   bits<2> STRM;
   bits<5> A;
   bits<5> B;

diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 9bac91d..daf8790 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp

@@ -75,7 +75,7 @@
   if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
       Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
     const InstrItineraryData *II =
-        &static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData();
+        static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData();
     return new ScoreboardHazardRecognizer(II, DAG);
   }
 
@@ -331,6 +331,11 @@
   BuildMI(MBB, MI, DL, get(Opcode));
 }
 
+/// getNoopForMachoTarget - Set the opcode of NopInst to the PowerPC NOP.
+void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(PPC::NOP);
+}
+
 // Branch analysis.
 // Note: If the condition register is set to CTR or CTR8 then this is a
 // BDNZ (imm == 1) or BDZ (imm == 0) branch.
@@ -1617,6 +1622,7 @@
       bool Changed = false;
 
       MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+      const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
       for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
            I != IE; ++I) {
         MachineInstr *MI = I;
@@ -1682,16 +1688,26 @@
         // In theory, there could be other uses of the addend copy before this
         // fma.  We could deal with this, but that would require additional
         // logic below and I suspect it will not occur in any relevant
-        // situations.
-        bool OtherUsers = false;
+        // situations.  Additionally, check whether the copy source is killed
+        // prior to the fma.  In order to replace the addend here with the
+        // source of the copy, it must still be live here.  We can't use
+        // interval testing for a physical register, so as long as we're
+        // walking the MIs we may as well test liveness here.
+        bool OtherUsers = false, KillsAddendSrc = false;
         for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
-             J != JE; --J)
+             J != JE; --J) {
           if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) {
             OtherUsers = true;
             break;
           }
+          if (J->modifiesRegister(AddendSrcReg, TRI) ||
+              J->killsRegister(AddendSrcReg, TRI)) {
+            KillsAddendSrc = true;
+            break;
+          }
+        }
 
-        if (OtherUsers)
+        if (OtherUsers || KillsAddendSrc)
           continue;
 
         // Find one of the product operands that is killed by this instruction.
@@ -1712,10 +1728,11 @@
         if (!KilledProdOp)
           continue;
 
-        // In order to replace the addend here with the source of the copy,
-        // it must still be live here.
-        if (!LIS->getInterval(AddendMI->getOperand(1).getReg()).liveAt(FMAIdx))
-          continue;
+        // For virtual registers, verify that the addend source register
+        // is live here (as should have been assured above).
+        assert((!TargetRegisterInfo::isVirtualRegister(AddendSrcReg) ||
+                LIS->getInterval(AddendSrcReg).liveAt(FMAIdx)) &&
+               "Addend source register is not live!");
 
         // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
 
@@ -1737,6 +1754,12 @@
 
         unsigned OldFMAReg = MI->getOperand(0).getReg();
 
+        // The transformation doesn't work well with things like:
+        //    %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
+        // so leave such things alone.
+        if (OldFMAReg == KilledProdReg)
+          continue;
+
         assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
                "Addend copy not tied to old FMA output!");
 
@@ -1827,7 +1850,7 @@
 
       LIS = &getAnalysis<LiveIntervals>();
 
-      TII = TM->getInstrInfo();
+      TII = TM->getSubtargetImpl()->getInstrInfo();
 
       bool Changed = false;
 
@@ -1980,7 +2003,7 @@
       // If we don't have VSX on the subtarget, don't do anything.
       if (!TM->getSubtargetImpl()->hasVSX())
         return false;
-      TII = TM->getInstrInfo();
+      TII = TM->getSubtargetImpl()->getInstrInfo();
 
       bool Changed = false;
 
@@ -2057,7 +2080,7 @@
       // If we don't have VSX don't bother doing anything here.
       if (!TM->getSubtargetImpl()->hasVSX())
         return false;
-      TII = TM->getInstrInfo();
+      TII = TM->getSubtargetImpl()->getInstrInfo();
 
       bool Changed = false;
 
@@ -2214,7 +2237,7 @@
 public:
     bool runOnMachineFunction(MachineFunction &MF) override {
       TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
-      TII = TM->getInstrInfo();
+      TII = TM->getSubtargetImpl()->getInstrInfo();
 
       bool Changed = false;
 

diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 83f14c6..4d310fe 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef POWERPC_INSTRUCTIONINFO_H
-#define POWERPC_INSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCINSTRINFO_H
 
 #include "PPC.h"
 #include "PPCRegisterInfo.h"
@@ -228,6 +228,8 @@
   /// instruction may be.  This returns the maximum number of bytes.
   ///
   unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
 };
 
 }

diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index c2e3382..8c76c46 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td

@@ -57,6 +57,9 @@
   SDTCisPtrTy<0>, SDTCisVT<1, i32>
 ]>;
 
+def tocentry32 : Operand<iPTR> {
+  let MIOperandInfo = (ops i32imm:$imm);
+}
 
 //===----------------------------------------------------------------------===//
 // PowerPC specific DAG Nodes.
@@ -107,10 +110,8 @@
 def PPCaddTls     : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>;
 def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>;
 def PPCaddiTlsgdL   : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>;
-def PPCgetTlsAddr   : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>;
 def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
 def PPCaddiTlsldL   : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
-def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;
 def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp,
                               [SDNPHasChain]>;
 def PPCaddiDtprelL   : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
@@ -133,9 +134,15 @@
 def PPCcall  : SDNode<"PPCISD::CALL", SDT_PPCCall,
                       [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                        SDNPVariadic]>;
+def PPCcall_tls : SDNode<"PPCISD::CALL_TLS", SDT_PPCCall,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                          SDNPVariadic]>;
 def PPCcall_nop  : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                            SDNPVariadic]>;
+def PPCcall_nop_tls : SDNode<"PPCISD::CALL_NOP_TLS", SDT_PPCCall,
+                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                              SDNPVariadic]>;
 def PPCload   : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
@@ -417,6 +424,15 @@
   let PrintMethod = "printU2ImmOperand";
   let ParserMatchClass = PPCU2ImmAsmOperand;
 }
+
+def PPCU4ImmAsmOperand : AsmOperandClass {
+  let Name = "U4Imm"; let PredicateMethod = "isU4Imm";
+  let RenderMethod = "addImmOperands";
+}
+def u4imm   : Operand<i32> {
+  let PrintMethod = "printU4ImmOperand";
+  let ParserMatchClass = PPCU4ImmAsmOperand;
+}
 def PPCS5ImmAsmOperand : AsmOperandClass {
   let Name = "S5Imm"; let PredicateMethod = "isS5Imm";
   let RenderMethod = "addImmOperands";
@@ -446,7 +462,7 @@
 }
 def PPCS16ImmAsmOperand : AsmOperandClass {
   let Name = "S16Imm"; let PredicateMethod = "isS16Imm";
-  let RenderMethod = "addImmOperands";
+  let RenderMethod = "addS16ImmOperands";
 }
 def s16imm  : Operand<i32> {
   let PrintMethod = "printS16ImmOperand";
@@ -456,7 +472,7 @@
 }
 def PPCU16ImmAsmOperand : AsmOperandClass {
   let Name = "U16Imm"; let PredicateMethod = "isU16Imm";
-  let RenderMethod = "addImmOperands";
+  let RenderMethod = "addU16ImmOperands";
 }
 def u16imm  : Operand<i32> {
   let PrintMethod = "printU16ImmOperand";
@@ -466,7 +482,7 @@
 }
 def PPCS17ImmAsmOperand : AsmOperandClass {
   let Name = "S17Imm"; let PredicateMethod = "isS17Imm";
-  let RenderMethod = "addImmOperands";
+  let RenderMethod = "addS16ImmOperands";
 }
 def s17imm  : Operand<i32> {
   // This operand type is used for addis/lis to allow the assembler parser
@@ -542,7 +558,7 @@
 
 def PPCDispRIOperand : AsmOperandClass {
  let Name = "DispRI"; let PredicateMethod = "isS16Imm";
- let RenderMethod = "addImmOperands";
+ let RenderMethod = "addS16ImmOperands";
 }
 def dispRI : Operand<iPTR> {
   let ParserMatchClass = PPCDispRIOperand;
@@ -554,6 +570,27 @@
 def dispRIX : Operand<iPTR> {
   let ParserMatchClass = PPCDispRIXOperand;
 }
+def PPCDispSPE8Operand : AsmOperandClass {
+ let Name = "DispSPE8"; let PredicateMethod = "isU8ImmX8";
+ let RenderMethod = "addImmOperands";
+}
+def dispSPE8 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE8Operand;
+}
+def PPCDispSPE4Operand : AsmOperandClass {
+ let Name = "DispSPE4"; let PredicateMethod = "isU7ImmX4";
+ let RenderMethod = "addImmOperands";
+}
+def dispSPE4 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE4Operand;
+}
+def PPCDispSPE2Operand : AsmOperandClass {
+ let Name = "DispSPE2"; let PredicateMethod = "isU6ImmX2";
+ let RenderMethod = "addImmOperands";
+}
+def dispSPE2 : Operand<iPTR> {
+  let ParserMatchClass = PPCDispSPE2Operand;
+}
 
 def memri : Operand<iPTR> {
   let PrintMethod = "printMemRegImm";
@@ -571,6 +608,21 @@
   let EncoderMethod = "getMemRIXEncoding";
   let DecoderMethod = "decodeMemRIXOperands";
 }
+def spe8dis : Operand<iPTR> {   // SPE displacement where the imm is 8-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getSPE8DisEncoding";
+}
+def spe4dis : Operand<iPTR> {   // SPE displacement where the imm is 4-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getSPE4DisEncoding";
+}
+def spe2dis : Operand<iPTR> {   // SPE displacement where the imm is 2-aligned.
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getSPE2DisEncoding";
+}
 
 // A single-register address. This is used with the SjLj
 // pseudo-instructions.
@@ -585,6 +637,12 @@
   let EncoderMethod = "getTLSRegEncoding";
   let ParserMatchClass = PPCTLSRegOperand;
 }
+def tlsgd32 : Operand<i32> {}
+def tlscall32 : Operand<i32> {
+  let PrintMethod = "printTLSCall";
+  let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym);
+  let EncoderMethod = "getTLSCallEncoding";
+}
 
 // PowerPC Predicate operand.
 def pred : Operand<OtherVT> {
@@ -611,6 +669,12 @@
 def In64BitMode  : Predicate<"PPCSubTarget->isPPC64()">;
 def IsBookE  : Predicate<"PPCSubTarget->isBookE()">;
 def IsNotBookE  : Predicate<"!PPCSubTarget->isBookE()">;
+def HasOnlyMSYNC : Predicate<"PPCSubTarget->hasOnlyMSYNC()">;
+def HasSYNC   : Predicate<"!PPCSubTarget->hasOnlyMSYNC()">;
+def IsPPC4xx  : Predicate<"PPCSubTarget->isPPC4xx()">;
+def IsPPC6xx  : Predicate<"PPCSubTarget->isPPC6xx()">;
+def IsE500  : Predicate<"PPCSubTarget->isE500()">;
+def HasSPE  : Predicate<"PPCSubTarget->hasSPE()">;
 
 //===----------------------------------------------------------------------===//
 // PowerPC Multiclass Definitions.
@@ -967,6 +1031,9 @@
 let Defs = [LR] in
   def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>,
                    PPC970_Unit_BRU;
+let Defs = [LR] in
+  def MoveGOTtoLR : Pseudo<(outs), (ins), "#MoveGOTtoLR", []>,
+                    PPC970_Unit_BRU;
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
   let isBarrier = 1 in {
@@ -1068,6 +1135,8 @@
                     "bla $func", IIC_BrB, [(PPCcall (i32 imm:$func))]>;
 
     let isCodeGenOnly = 1 in {
+      def BL_TLS  : IForm<18, 0, 1, (outs), (ins tlscall32:$func),
+                          "bl $func", IIC_BrB, []>;
       def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst),
                        "b${cond:cc}l${cond:pm} ${cond:reg}, $dst">;
       def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst),
@@ -1243,8 +1312,15 @@
                       IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
                       PPC970_DGroup_Single;
 
+def ICBT  : XForm_icbt<31, 22, (outs), (ins u4imm:$CT, memrr:$src),
+                       "icbt $CT, $src", IIC_LdStLoad>, Requires<[IsBookE]>;
+
 def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
-          (DCBT xoaddr:$dst)>;
+          (DCBT xoaddr:$dst)>;   // data prefetch for loads
+def : Pat<(prefetch xoaddr:$dst, (i32 1), imm, (i32 1)),
+          (DCBTST xoaddr:$dst)>; // data prefetch for stores
+def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)),
+          (ICBT 0, xoaddr:$dst)>; // inst prefetch (for read)
 
 // Atomic operations
 let usesCustomInserter = 1 in {
@@ -1628,17 +1704,19 @@
                    "stmw $rS, $dst", IIC_LdStLMW, []>;
 
 def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L),
-                        "sync $L", IIC_LdStSync, []>, Requires<[IsNotBookE]>;
+                        "sync $L", IIC_LdStSync, []>;
 
 let isCodeGenOnly = 1 in {
   def MSYNC : XForm_24_sync<31, 598, (outs), (ins),
-                           "msync", IIC_LdStSync, []>, Requires<[IsBookE]> {
+                           "msync", IIC_LdStSync, []> {
     let L = 0;
   }
 }
 
-def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[IsNotBookE]>;
-def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[IsBookE]>;
+def : Pat<(int_ppc_sync),   (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(int_ppc_lwsync), (SYNC 1)>, Requires<[HasSYNC]>;
+def : Pat<(int_ppc_sync),   (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+def : Pat<(int_ppc_lwsync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
 
 //===----------------------------------------------------------------------===//
 // PPC32 Arithmetic Instructions.
@@ -2355,6 +2433,8 @@
 def : Pat<(PPCcall (i32 texternalsym:$dst)),
           (BL texternalsym:$dst)>;
 
+def : Pat<(PPCcall_tls texternalsym:$func, tglobaltlsaddr:$sym),
+          (BL_TLS texternalsym:$func, tglobaltlsaddr:$sym)>;
 
 def : Pat<(PPCtc_return (i32 tglobaladdr:$dst),  imm:$imm),
           (TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
@@ -2393,13 +2473,47 @@
 def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT", 
                 [(set i32:$rD, (PPCppc32GOT))]>;
 
+// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
+// This uses two output registers, the first as the real output, the second as a
+// temporary register, used internally in code generation.
+def PPC32PICGOT: Pseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT", 
+                []>, NoEncode<"$rT">;
+
 def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
-                        "#LDgotTprelL32",
-                        [(set i32:$rD,
-                          (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
+                           "#LDgotTprelL32",
+                           [(set i32:$rD,
+                             (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
 def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g),
           (ADD4TLS $in, tglobaltlsaddr:$g)>;
 
+def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                         "#ADDItlsgdL32",
+                         [(set i32:$rD,
+                           (PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>;
+def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                          "#ADDItlsldL32",
+                          [(set i32:$rD,
+                            (PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>;
+def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                           "#ADDIdtprelL32",
+                           [(set i32:$rD,
+                             (PPCaddiDtprelL i32:$reg, tglobaltlsaddr:$disp))]>;
+def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp),
+                            "#ADDISdtprelHA32",
+                            [(set i32:$rD,
+                              (PPCaddisDtprelHA i32:$reg,
+                                                tglobaltlsaddr:$disp))]>;
+
+// Support for Position-independent code
+def LWZtoc : Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
+                   "#LWZtoc",
+                   [(set i32:$rD,
+                      (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>;
+// Get Global (GOT) Base Register offset, from the word immediately preceding
+// the function label.
+def UpdateGBR : Pseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>;
+
+
 // Standard shifts.  These are represented separately from the real shifts above
 // so that we can distinguish between shifts that allow 5-bit and 6-bit shift
 // amounts.
@@ -2434,8 +2548,15 @@
 def : Pat<(f64 (fextend f32:$src)),
           (COPY_TO_REGCLASS $src, F8RC)>;
 
-def : Pat<(atomic_fence (imm), (imm)), (SYNC 0)>, Requires<[IsNotBookE]>;
-def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[IsBookE]>;
+// Only seq_cst fences require the heavyweight sync (SYNC 0).
+// All others can use the lightweight sync (SYNC 1).
+// source: http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+// The rule for seq_cst is duplicated to work with both the 64-bit and 32-bit
+// versions of Power.
+def : Pat<(atomic_fence (i64 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(atomic_fence (i32 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>;
+def : Pat<(atomic_fence (imm),   (imm)), (SYNC 1)>, Requires<[HasSYNC]>;
+def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
 
 // Additional FNMSUB patterns: -a*c + b == -(a*c - b)
 def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
@@ -2454,6 +2575,7 @@
           (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
 
 include "PPCInstrAltivec.td"
+include "PPCInstrSPE.td"
 include "PPCInstr64Bit.td"
 include "PPCInstrVSX.td"
 
@@ -2970,6 +3092,16 @@
 // PowerPC Instructions used for assembler/disassembler only
 //
 
+// FIXME: For B=0 or B > 8, the registers following RT are used.
+// WARNING: Do not add patterns for this instruction without fixing this.
+def LSWI  : XForm_base_r3xo<31, 597, (outs gprc:$RT), (ins gprc:$A, u5imm:$B),
+                            "lswi $RT, $A, $B", IIC_LdStLoad, []>;
+
+// FIXME: For B=0 or B > 8, the registers following RT are used.
+// WARNING: Do not add patterns for this instruction without fixing this.
+def STSWI : XForm_base_r3xo<31, 725, (outs), (ins gprc:$RT, gprc:$A, u5imm:$B),
+                            "stswi $RT, $A, $B", IIC_LdStLoad, []>;
+
 def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
                          "isync", IIC_SprISYNC, []>;
 
@@ -2982,9 +3114,47 @@
 def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
                          "wait $L", IIC_LdStLoad, []>;
 
+def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO),
+                         "mbar $MO", IIC_LdStLoad>, Requires<[IsBookE]>;
+
+def MTSR: XForm_sr<31, 210, (outs), (ins gprc:$RS, u4imm:$SR),
+            "mtsr $SR, $RS", IIC_SprMTSR>;
+
+def MFSR: XForm_sr<31, 595, (outs gprc:$RS), (ins u4imm:$SR),
+            "mfsr $RS, $SR", IIC_SprMFSR>;
+
+def MTSRIN: XForm_srin<31, 242, (outs), (ins gprc:$RS, gprc:$RB),
+            "mtsrin $RS, $RB", IIC_SprMTSR>;
+
+def MFSRIN: XForm_srin<31, 659, (outs gprc:$RS), (ins gprc:$RB),
+            "mfsrin $RS, $RB", IIC_SprMFSR>;
+
 def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
                     "mtmsr $RS, $L", IIC_SprMTMSR>;
 
+def WRTEE: XForm_mtmsr<31, 131, (outs), (ins gprc:$RS),
+                    "wrtee $RS", IIC_SprMTMSR>, Requires<[IsBookE]> {
+  let L = 0;
+}
+
+def WRTEEI: I<31, (outs), (ins i1imm:$E), "wrteei $E", IIC_SprMTMSR>,
+              Requires<[IsBookE]> {
+  bits<1> E;
+
+  let Inst{16} = E;
+  let Inst{21-30} = 163;
+}
+
+def DCCCI : XForm_tlb<454, (outs), (ins gprc:$A, gprc:$B),
+               "dccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;
+def ICCCI : XForm_tlb<966, (outs), (ins gprc:$A, gprc:$B),
+               "iccci $A, $B", IIC_LdStLoad>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"dci 0", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"dccci", (DCCCI R0, R0)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"ici 0", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"iccci", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>;
+
 def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
                   "mfmsr $RT", IIC_SprMFMSR, []>;
 
@@ -3002,15 +3172,66 @@
 
 def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>;
 
+def TLBIA : XForm_0<31, 370, (outs), (ins),
+                        "tlbia", IIC_SprTLBIA, []>;
+
 def TLBSYNC : XForm_0<31, 566, (outs), (ins),
                         "tlbsync", IIC_SprTLBSYNC, []>;
 
 def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
                           "tlbiel $RB", IIC_SprTLBIEL, []>;
 
+def TLBLD : XForm_16b<31, 978, (outs), (ins gprc:$RB),
+                          "tlbld $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
+def TLBLI : XForm_16b<31, 1010, (outs), (ins gprc:$RB),
+                          "tlbli $RB", IIC_LdStLoad, []>, Requires<[IsPPC6xx]>;
+
 def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
                           "tlbie $RB,$RS", IIC_SprTLBIE, []>;
 
+def TLBSX : XForm_tlb<914, (outs), (ins gprc:$A, gprc:$B), "tlbsx $A, $B",
+                IIC_LdStLoad>, Requires<[IsBookE]>;
+
+def TLBIVAX : XForm_tlb<786, (outs), (ins gprc:$A, gprc:$B), "tlbivax $A, $B",
+                IIC_LdStLoad>, Requires<[IsBookE]>;
+
+def TLBRE : XForm_24_eieio<31, 946, (outs), (ins),
+                           "tlbre", IIC_LdStLoad, []>, Requires<[IsBookE]>;
+
+def TLBWE : XForm_24_eieio<31, 978, (outs), (ins),
+                           "tlbwe", IIC_LdStLoad, []>, Requires<[IsBookE]>;
+
+def TLBRE2 : XForm_tlbws<31, 946, (outs gprc:$RS), (ins gprc:$A, i1imm:$WS),
+               "tlbre $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;
+
+def TLBWE2 : XForm_tlbws<31, 978, (outs), (ins gprc:$RS, gprc:$A, i1imm:$WS),
+               "tlbwe $RS, $A, $WS", IIC_LdStLoad, []>, Requires<[IsPPC4xx]>;
+
+def TLBSX2 : XForm_base_r3xo<31, 914, (outs), (ins gprc:$RST, gprc:$A, gprc:$B),
+                             "tlbsx $RST, $A, $B", IIC_LdStLoad, []>,
+                             Requires<[IsPPC4xx]>;
+def TLBSX2D : XForm_base_r3xo<31, 914, (outs),
+                              (ins gprc:$RST, gprc:$A, gprc:$B),
+                              "tlbsx. $RST, $A, $B", IIC_LdStLoad, []>,
+                              Requires<[IsPPC4xx]>, isDOT;
+
+def RFID : XForm_0<19, 18, (outs), (ins), "rfid", IIC_IntRFID, []>;
+
+def RFI : XForm_0<19, 50, (outs), (ins), "rfi", IIC_SprRFI, []>,
+                  Requires<[IsBookE]>;
+def RFCI : XForm_0<19, 51, (outs), (ins), "rfci", IIC_BrB, []>,
+                   Requires<[IsBookE]>;
+
+def RFDI : XForm_0<19, 39, (outs), (ins), "rfdi", IIC_BrB, []>,
+                   Requires<[IsE500]>;
+def RFMCI : XForm_0<19, 38, (outs), (ins), "rfmci", IIC_BrB, []>,
+                    Requires<[IsE500]>;
+
+def MFDCR : XFXForm_1<31, 323, (outs gprc:$RT), (ins i32imm:$SPR),
+                      "mfdcr $RT, $SPR", IIC_SprMFSPR>, Requires<[IsPPC4xx]>;
+def MTDCR : XFXForm_1<31, 451, (outs), (ins gprc:$RT, i32imm:$SPR),
+                      "mtdcr $SPR, $RT", IIC_SprMTSPR>, Requires<[IsPPC4xx]>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC Assembler Instruction Aliases
 //
@@ -3033,15 +3254,17 @@
 
 def : InstAlias<"sc", (SC 0)>;
 
-def : InstAlias<"sync", (SYNC 0)>, Requires<[IsNotBookE]>;
-def : InstAlias<"msync", (SYNC 0)>, Requires<[IsNotBookE]>;
-def : InstAlias<"lwsync", (SYNC 1)>, Requires<[IsNotBookE]>;
-def : InstAlias<"ptesync", (SYNC 2)>, Requires<[IsNotBookE]>;
+def : InstAlias<"sync", (SYNC 0)>, Requires<[HasSYNC]>;
+def : InstAlias<"msync", (SYNC 0)>, Requires<[HasSYNC]>;
+def : InstAlias<"lwsync", (SYNC 1)>, Requires<[HasSYNC]>;
+def : InstAlias<"ptesync", (SYNC 2)>, Requires<[HasSYNC]>;
 
 def : InstAlias<"wait", (WAIT 0)>;
 def : InstAlias<"waitrsv", (WAIT 1)>;
 def : InstAlias<"waitimpl", (WAIT 2)>;
 
+def : InstAlias<"mbar", (MBAR 0)>, Requires<[IsBookE]>;
+
 def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
 def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
 def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
@@ -3050,9 +3273,57 @@
 def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
 def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;
 
+def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>;
+def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>;
+
+def : InstAlias<"mtdscr $Rx", (MTSPR 17, gprc:$Rx)>;
+def : InstAlias<"mfdscr $Rx", (MFSPR gprc:$Rx, 17)>;
+
+def : InstAlias<"mtdsisr $Rx", (MTSPR 18, gprc:$Rx)>;
+def : InstAlias<"mfdsisr $Rx", (MFSPR gprc:$Rx, 18)>;
+
+def : InstAlias<"mtdar $Rx", (MTSPR 19, gprc:$Rx)>;
+def : InstAlias<"mfdar $Rx", (MFSPR gprc:$Rx, 19)>;
+
+def : InstAlias<"mtdec $Rx", (MTSPR 22, gprc:$Rx)>;
+def : InstAlias<"mfdec $Rx", (MFSPR gprc:$Rx, 22)>;
+
+def : InstAlias<"mtsdr1 $Rx", (MTSPR 25, gprc:$Rx)>;
+def : InstAlias<"mfsdr1 $Rx", (MFSPR gprc:$Rx, 25)>;
+
+def : InstAlias<"mtsrr0 $Rx", (MTSPR 26, gprc:$Rx)>;
+def : InstAlias<"mfsrr0 $Rx", (MFSPR gprc:$Rx, 26)>;
+
+def : InstAlias<"mtsrr1 $Rx", (MTSPR 27, gprc:$Rx)>;
+def : InstAlias<"mfsrr1 $Rx", (MFSPR gprc:$Rx, 27)>;
+
+def : InstAlias<"mtsrr2 $Rx", (MTSPR 990, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfsrr2 $Rx", (MFSPR gprc:$Rx, 990)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtsrr3 $Rx", (MTSPR 991, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfsrr3 $Rx", (MFSPR gprc:$Rx, 991)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtcfar $Rx", (MTSPR 28, gprc:$Rx)>;
+def : InstAlias<"mfcfar $Rx", (MFSPR gprc:$Rx, 28)>;
+
+def : InstAlias<"mtamr $Rx", (MTSPR 29, gprc:$Rx)>;
+def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;
+
+def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>;
+def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>;
+
 def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>;
+def : InstAlias<"mftbl $Rx", (MFTB gprc:$Rx, 268)>;
 def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>;
 
+def : InstAlias<"mttbl $Rx", (MTSPR 284, gprc:$Rx)>;
+def : InstAlias<"mttbu $Rx", (MTSPR 285, gprc:$Rx)>;
+
+def : InstAlias<"mftblo $Rx", (MFSPR gprc:$Rx, 989)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mttblo $Rx", (MTSPR 989, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mftbhi $Rx", (MFSPR gprc:$Rx, 988)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mttbhi $Rx", (MTSPR 988, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+
 def : InstAlias<"xnop", (XORI R0, R0, 0)>;
 
 def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
@@ -3063,6 +3334,60 @@
 
 def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>;
 
+foreach BATR = 0-3 in {
+    def : InstAlias<"mtdbatu "#BATR#", $Rx",
+                    (MTSPR !add(BATR, !add(BATR, 536)), gprc:$Rx)>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mfdbatu $Rx, "#BATR,
+                    (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 536)))>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mtdbatl "#BATR#", $Rx",
+                    (MTSPR !add(BATR, !add(BATR, 537)), gprc:$Rx)>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mfdbatl $Rx, "#BATR,
+                    (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 537)))>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mtibatu "#BATR#", $Rx",
+                    (MTSPR !add(BATR, !add(BATR, 528)), gprc:$Rx)>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mfibatu $Rx, "#BATR,
+                    (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 528)))>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mtibatl "#BATR#", $Rx",
+                    (MTSPR !add(BATR, !add(BATR, 529)), gprc:$Rx)>,
+                    Requires<[IsPPC6xx]>;
+    def : InstAlias<"mfibatl $Rx, "#BATR,
+                    (MFSPR gprc:$Rx, !add(BATR, !add(BATR, 529)))>,
+                    Requires<[IsPPC6xx]>;
+}
+
+foreach BR = 0-7 in {
+    def : InstAlias<"mfbr"#BR#" $Rx",
+                    (MFDCR gprc:$Rx, !add(BR, 0x80))>,
+                    Requires<[IsPPC4xx]>;
+    def : InstAlias<"mtbr"#BR#" $Rx",
+                    (MTDCR gprc:$Rx, !add(BR, 0x80))>,
+                    Requires<[IsPPC4xx]>;
+}
+
+def : InstAlias<"mtdccr $Rx", (MTSPR 1018, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfdccr $Rx", (MFSPR gprc:$Rx, 1018)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mticcr $Rx", (MTSPR 1019, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtdear $Rx", (MTSPR 981, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfdear $Rx", (MFSPR gprc:$Rx, 981)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mtesr $Rx", (MTSPR 980, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfesr $Rx", (MFSPR gprc:$Rx, 980)>, Requires<[IsPPC4xx]>;
+
+def : InstAlias<"mfspefscr $Rx", (MFSPR gprc:$Rx, 512)>;
+def : InstAlias<"mtspefscr $Rx", (MTSPR 512, gprc:$Rx)>;
+
+def : InstAlias<"mttcr $Rx", (MTSPR 986, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mftcr $Rx", (MFSPR gprc:$Rx, 986)>, Requires<[IsPPC4xx]>;
+
 def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>;
 
 def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm",
@@ -3082,25 +3407,25 @@
 def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
 def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
 
-def : InstAlias<"mfsprg $RT, 0", (MFSPR gprc:$RT, 272)>;
-def : InstAlias<"mfsprg $RT, 1", (MFSPR gprc:$RT, 273)>;
-def : InstAlias<"mfsprg $RT, 2", (MFSPR gprc:$RT, 274)>;
-def : InstAlias<"mfsprg $RT, 3", (MFSPR gprc:$RT, 275)>;
+def : InstAlias<"mfasr $RT", (MFSPR gprc:$RT, 280)>;
+def : InstAlias<"mtasr $RT", (MTSPR 280, gprc:$RT)>;
 
-def : InstAlias<"mfsprg0 $RT", (MFSPR gprc:$RT, 272)>;
-def : InstAlias<"mfsprg1 $RT", (MFSPR gprc:$RT, 273)>;
-def : InstAlias<"mfsprg2 $RT", (MFSPR gprc:$RT, 274)>;
-def : InstAlias<"mfsprg3 $RT", (MFSPR gprc:$RT, 275)>;
-
-def : InstAlias<"mtsprg 0, $RT", (MTSPR 272, gprc:$RT)>;
-def : InstAlias<"mtsprg 1, $RT", (MTSPR 273, gprc:$RT)>;
-def : InstAlias<"mtsprg 2, $RT", (MTSPR 274, gprc:$RT)>;
-def : InstAlias<"mtsprg 3, $RT", (MTSPR 275, gprc:$RT)>;
-
-def : InstAlias<"mtsprg0 $RT", (MTSPR 272, gprc:$RT)>;
-def : InstAlias<"mtsprg1 $RT", (MTSPR 273, gprc:$RT)>;
-def : InstAlias<"mtsprg2 $RT", (MTSPR 274, gprc:$RT)>;
-def : InstAlias<"mtsprg3 $RT", (MTSPR 275, gprc:$RT)>;
+foreach SPRG = 0-3 in {
+  def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 272))>;
+  def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 272))>;
+  def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
+  def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
+}
+foreach SPRG = 4-7 in {
+  def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>,
+                  Requires<[IsBookE]>;
+  def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 256))>,
+                  Requires<[IsBookE]>;
+  def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
+                  Requires<[IsBookE]>;
+  def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
+                  Requires<[IsBookE]>;
+}
 
 def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;
 
@@ -3119,6 +3444,15 @@
 
 def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
 
+// Split spellings of TLBRE2/TLBWE2, accepted only under the IsPPC4xx
+// predicate: the "hi" mnemonics pass 0 and the "lo" mnemonics pass 1 as the
+// final operand.
+def : InstAlias<"tlbrehi $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 0)>,
+                Requires<[IsPPC4xx]>;
+def : InstAlias<"tlbrelo $RS, $A", (TLBRE2 gprc:$RS, gprc:$A, 1)>,
+                Requires<[IsPPC4xx]>;
+def : InstAlias<"tlbwehi $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 0)>,
+                Requires<[IsPPC4xx]>;
+def : InstAlias<"tlbwelo $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 1)>,
+                Requires<[IsPPC4xx]>;
+
 def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
 def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
@@ -3367,3 +3701,18 @@
 defm : TrapExtendedMnemonic<"lng", 6>;
 defm : TrapExtendedMnemonic<"u", 31>;
 
+// Atomic loads
+// 8/16/32-bit atomic_load nodes select the ordinary load instructions:
+// LBZ/LHZ/LWZ (memri operand) for iaddr addresses and LBZX/LHZX/LWZX (memrr
+// operand) for xaddr addresses.
+// NOTE(review): no ordering/barrier instruction is emitted by these patterns;
+// presumably that is handled before selection - confirm.
+def : Pat<(atomic_load_8  iaddr:$src), (LBZ  memri:$src)>;
+def : Pat<(atomic_load_16 iaddr:$src), (LHZ  memri:$src)>;
+def : Pat<(atomic_load_32 iaddr:$src), (LWZ  memri:$src)>;
+def : Pat<(atomic_load_8  xaddr:$src), (LBZX memrr:$src)>;
+def : Pat<(atomic_load_16 xaddr:$src), (LHZX memrr:$src)>;
+def : Pat<(atomic_load_32 xaddr:$src), (LWZX memrr:$src)>;
+
+// Atomic stores
+// Same scheme for atomic_store of an i32 value: STB/STH/STW for iaddr
+// addresses, STBX/STHX/STWX for xaddr addresses.
+def : Pat<(atomic_store_8  iaddr:$ptr, i32:$val), (STB  gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_16 iaddr:$ptr, i32:$val), (STH  gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_32 iaddr:$ptr, i32:$val), (STW  gprc:$val, memri:$ptr)>;
+def : Pat<(atomic_store_8  xaddr:$ptr, i32:$val), (STBX gprc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_16 xaddr:$ptr, i32:$val), (STHX gprc:$val, memrr:$ptr)>;
+def : Pat<(atomic_store_32 xaddr:$ptr, i32:$val), (STWX gprc:$val, memrr:$ptr)>;

diff --git a/lib/Target/PowerPC/PPCInstrSPE.td b/lib/Target/PowerPC/PPCInstrSPE.td
new file mode 100644
index 0000000..cc3a4d2
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrSPE.td

@@ -0,0 +1,447 @@
+//=======-- PPCInstrSPE.td - The PowerPC SPE Extension -*- tablegen -*-=======//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Signal Processing Engine extension to
+// the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// EVXForm_1 - Three-field SPE instruction form, derived from I<4, ...>.
+//   xo      11-bit extended opcode, placed in Inst{21-31}.
+//   OOL/IOL output and input operand lists, forwarded to I.
+//   asmstr  assembly string, forwarded to I.
+//   itin    itinerary class, forwarded to I.
+// The 5-bit RT, RA and RB fields occupy Inst{6-10}, Inst{11-15} and
+// Inst{16-20}.  Pattern is left empty, so records of this form carry no
+// selection pattern.
+class EVXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = [];
+  
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-31} = xo;
+}
+
+// EVXForm_2 - EVXForm_1 with the RB field fixed to 0, for records whose
+// operand lists name only RT and RA.
+class EVXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin> : EVXForm_1<xo, OOL, IOL, asmstr, itin> {
+  let RB = 0;
+}
+
+// EVXForm_3 - SPE form whose first field is a 3-bit crD instead of a 5-bit
+// RT.  crD occupies Inst{6-8}; the two remaining bits of that slot,
+// Inst{9-10}, are fixed to 0.  RA, RB and xo are placed exactly as in
+// EVXForm_1, and Pattern is likewise empty.
+class EVXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+  bits<3> crD;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = [];
+  
+  let Inst{6-8}  = crD;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-31} = xo;
+}
+
+// EVXForm_D - SPE form for records that take one combined memory operand D
+// in place of separate RA/RB fields, derived from I<4, ...>.
+//   xo      11-bit extended opcode, placed in Inst{21-31}.
+//   OOL/IOL output and input operand lists, forwarded to I.
+//   asmstr  assembly string, forwarded to I.
+//   itin    itinerary class, forwarded to I.
+// RT occupies Inst{6-10}.  Pattern is left empty.
+class EVXForm_D<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin> : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<21> D;
+
+  let Pattern = [];
+
+  let Inst{6-10}  = RT;
+  // The two ranges pair up positionally: Inst{11} takes D{0}, Inst{12} takes
+  // D{1}, ... Inst{20} takes D{9}.  This single range assignment is the only
+  // definition of these bits; a per-bit list in the opposite order would be
+  // overridden by it, because body-level lets are applied in source order.
+  // NOTE(review): only D{0-9} is encoded although D is declared 21 bits wide;
+  // confirm the operand encoder for spe2dis/spe4dis/spe8dis produces its
+  // value in this bit order.
+  let Inst{11-20} = D{0-9};
+  let Inst{21-31} = xo;
+}
+
+// Every record up to the closing "} // HasSPE" is gated on the HasSPE
+// predicate and flagged isAsmParserOnly.
+let Predicates = [HasSPE], isAsmParserOnly = 1 in {
+
+// Loads in EVXForm_D: each defines RT and takes one memory operand whose
+// operand class is spe8dis, spe2dis or spe4dis.
+// NOTE(review): the memory operand is named $dst even though these records
+// read from it.
+def EVLDD          : EVXForm_D<769, (outs gprc:$RT), (ins spe8dis:$dst),
+                               "evldd $RT, $dst", IIC_VecFP>;
+def EVLDW          : EVXForm_D<771, (outs gprc:$RT), (ins spe8dis:$dst),
+                               "evldw $RT, $dst", IIC_VecFP>;
+def EVLDH          : EVXForm_D<773, (outs gprc:$RT), (ins spe8dis:$dst),
+                               "evldh $RT, $dst", IIC_VecFP>;
+def EVLHHESPLAT    : EVXForm_D<777, (outs gprc:$RT), (ins spe2dis:$dst),
+                               "evlhhesplat $RT, $dst", IIC_VecFP>;
+def EVLHHOUSPLAT   : EVXForm_D<781, (outs gprc:$RT), (ins spe2dis:$dst),
+                               "evlhhousplat $RT, $dst", IIC_VecFP>;
+def EVLHHOSSPLAT   : EVXForm_D<783, (outs gprc:$RT), (ins spe2dis:$dst),
+                               "evlhhossplat $RT, $dst", IIC_VecFP>;
+def EVLWHE         : EVXForm_D<785, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwhe $RT, $dst", IIC_VecFP>;
+def EVLWHOU        : EVXForm_D<789, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwhou $RT, $dst", IIC_VecFP>;
+def EVLWHOS        : EVXForm_D<791, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwhos $RT, $dst", IIC_VecFP>;
+def EVLWWSPLAT     : EVXForm_D<793, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwwsplat $RT, $dst", IIC_VecFP>;
+def EVLWHSPLAT     : EVXForm_D<797, (outs gprc:$RT), (ins spe4dis:$dst),
+                               "evlwhsplat $RT, $dst", IIC_VecFP>;
+
+// Stores in EVXForm_D: no outputs; RT is an input alongside the spe8dis or
+// spe4dis memory operand.
+def EVSTDD         : EVXForm_D<801, (outs), (ins gprc:$RT, spe8dis:$dst),
+                               "evstdd $RT, $dst", IIC_VecFP>;
+def EVSTDH         : EVXForm_D<805, (outs), (ins gprc:$RT, spe8dis:$dst),
+                               "evstdh $RT, $dst", IIC_VecFP>;
+def EVSTDW         : EVXForm_D<803, (outs), (ins gprc:$RT, spe8dis:$dst),
+                               "evstdw $RT, $dst", IIC_VecFP>;
+def EVSTWHE        : EVXForm_D<817, (outs), (ins gprc:$RT, spe4dis:$dst),
+                               "evstwhe $RT, $dst", IIC_VecFP>;
+def EVSTWHO        : EVXForm_D<821, (outs), (ins gprc:$RT, spe4dis:$dst),
+                               "evstwho $RT, $dst", IIC_VecFP>;
+def EVSTWWE        : EVXForm_D<825, (outs), (ins gprc:$RT, spe4dis:$dst),
+                               "evstwwe $RT, $dst", IIC_VecFP>;
+def EVSTWWO        : EVXForm_D<829, (outs), (ins gprc:$RT, spe4dis:$dst),
+                               "evstwwo $RT, $dst", IIC_VecFP>;
+
+// EVMRA names no RB operand; it derives from EVXForm_1 and zeroes the RB
+// field itself, which yields the same field values as deriving from
+// EVXForm_2.
+def EVMRA : EVXForm_1<1220, (outs gprc:$RT), (ins gprc:$RA),
+                      "evmra $RT, $RA", IIC_VecFP> {
+  let RB = 0;
+}
+
+// BRINC is a three-register record; EVABS is a two-operand record (RB = 0
+// via EVXForm_2).
+def BRINC          : EVXForm_1<527, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "brinc $RT, $RA, $RB", IIC_VecFP>;
+def EVABS          : EVXForm_2<520, (outs gprc:$RT), (ins gprc:$RA),
+                               "evabs $RT, $RA", IIC_VecFP>;
+
+// evaddiw: the assembly string lists $RB before $RA, so the second assembly
+// operand is the source register (RB field, Inst{16-20}) and the third is the
+// 5-bit unsigned immediate (RA field, Inst{11-15}).  The immediate occupies
+// the RA field just as it does in EVSUBIFW; only the printed order differs.
+def EVADDIW        : EVXForm_1<514, (outs gprc:$RT), (ins gprc:$RB, u5imm:$RA),
+                               "evaddiw $RT, $RB, $RA", IIC_VecFP>;
+// The four evadd*iaaw records name only RT and RA (RB = 0 via EVXForm_2);
+// evaddw is the three-register form.
+def EVADDSMIAAW    : EVXForm_2<1225, (outs gprc:$RT), (ins gprc:$RA),
+                               "evaddsmiaaw $RT, $RA", IIC_VecFP>;
+def EVADDSSIAAW    : EVXForm_2<1217, (outs gprc:$RT), (ins gprc:$RA),
+                               "evaddssiaaw $RT, $RA", IIC_VecFP>;
+def EVADDUSIAAW    : EVXForm_2<1216, (outs gprc:$RT), (ins gprc:$RA),
+                               "evaddusiaaw $RT, $RA", IIC_VecFP>;
+def EVADDUMIAAW    : EVXForm_2<1224, (outs gprc:$RT), (ins gprc:$RA),
+                               "evaddumiaaw $RT, $RA", IIC_VecFP>;
+def EVADDW         : EVXForm_1<512, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evaddw $RT, $RA, $RB", IIC_VecFP>;
+
+// Three-register records for evand / evandc.
+def EVAND          : EVXForm_1<529, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evand $RT, $RA, $RB", IIC_VecFP>;
+def EVANDC         : EVXForm_1<530, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evandc $RT, $RA, $RB", IIC_VecFP>;
+
+// Compares use EVXForm_3: the result operand is a crrc register placed in
+// the 3-bit crD field rather than a gprc in RT.
+def EVCMPEQ        : EVXForm_3<564, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmpeq $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPGTS       : EVXForm_3<561, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmpgts $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPGTU       : EVXForm_3<560, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmpgtu $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPLTS       : EVXForm_3<563, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmplts $crD, $RA, $RB", IIC_VecFP>;
+def EVCMPLTU       : EVXForm_3<562, (outs crrc:$crD), (ins gprc:$RA, gprc:$RB),
+                               "evcmpltu $crD, $RA, $RB", IIC_VecFP>;
+
+// Two-operand records (RB = 0 via EVXForm_2).
+def EVCNTLSW       : EVXForm_2<526, (outs gprc:$RT), (ins gprc:$RA),
+                               "evcntlsw $RT, $RA", IIC_VecFP>;
+def EVCNTLZW       : EVXForm_2<525, (outs gprc:$RT), (ins gprc:$RA),
+                               "evcntlzw $RT, $RA", IIC_VecFP>;
+
+// Three-register records for evdivws / evdivwu / eveqv.
+def EVDIVWS        : EVXForm_1<1222, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evdivws $RT, $RA, $RB", IIC_VecFP>;
+def EVDIVWU        : EVXForm_1<1223, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evdivwu $RT, $RA, $RB", IIC_VecFP>;
+
+def EVEQV          : EVXForm_1<537, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "eveqv $RT, $RA, $RB", IIC_VecFP>;
+
+// Two-operand records (RB = 0 via EVXForm_2).
+def EVEXTSB        : EVXForm_2<522, (outs gprc:$RT), (ins gprc:$RA),
+                               "evextsb $RT, $RA", IIC_VecFP>;
+def EVEXTSH        : EVXForm_2<523, (outs gprc:$RT), (ins gprc:$RA),
+                               "evextsh $RT, $RA", IIC_VecFP>;
+
+// "x"-suffixed loads: three-register EVXForm_1 records that take RA and RB
+// as plain gprc inputs instead of a combined spe*dis memory operand.  Each
+// extended opcode here is one less than that of the same-named EVXForm_D
+// load (e.g. 768 for evlddx vs. 769 for evldd).
+def EVLDDX         : EVXForm_1<768, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlddx $RT, $RA, $RB", IIC_VecFP>;
+def EVLDWX         : EVXForm_1<770, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evldwx $RT, $RA, $RB", IIC_VecFP>;
+def EVLDHX         : EVXForm_1<772, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evldhx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHESPLATX   : EVXForm_1<776, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlhhesplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHOUSPLATX  : EVXForm_1<780, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlhhousplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLHHOSSPLATX  : EVXForm_1<782, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlhhossplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHEX        : EVXForm_1<784, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwhex $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHOUX       : EVXForm_1<788, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwhoux $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHOSX       : EVXForm_1<790, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwhosx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWWSPLATX    : EVXForm_1<792, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwwsplatx $RT, $RA, $RB", IIC_VecFP>;
+def EVLWHSPLATX    : EVXForm_1<796, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evlwhsplatx $RT, $RA, $RB", IIC_VecFP>;
+
+// evmerge* records: four three-register EVXForm_1 definitions with
+// consecutive extended opcodes 556-559.
+def EVMERGEHI      : EVXForm_1<556, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmergehi $RT, $RA, $RB", IIC_VecFP>;
+def EVMERGELO      : EVXForm_1<557, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmergelo $RT, $RA, $RB", IIC_VecFP>;
+def EVMERGEHILO    : EVXForm_1<558, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmergehilo $RT, $RA, $RB", IIC_VecFP>;
+def EVMERGELOHI    : EVXForm_1<559, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmergelohi $RT, $RA, $RB", IIC_VecFP>;
+
+// evmhe* / evmho* records.  All are three-register EVXForm_1 definitions
+// (RT output; RA and RB gprc inputs) that differ only in mnemonic and
+// extended opcode.
+// NOTE(review): presumably the SPE half-word multiply family - check the
+// opcode values against the SPE programming manual when touching them.
+def EVMHEGSMFAA    : EVXForm_1<1323, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegsmfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGSMFAN    : EVXForm_1<1451, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegsmfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGSMIAA    : EVXForm_1<1321, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegsmiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGSMIAN    : EVXForm_1<1449, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegsmian $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGUMIAA    : EVXForm_1<1320, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegumiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEGUMIAN    : EVXForm_1<1448, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhegumian $RT, $RA, $RB", IIC_VecFP>;
+
+def EVMHESMF       : EVXForm_1<1035, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMFA      : EVXForm_1<1067, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMFAAW    : EVXForm_1<1291, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMFANW    : EVXForm_1<1419, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMI       : EVXForm_1<1033, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMIA      : EVXForm_1<1065, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMIAAW    : EVXForm_1<1289, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESMIANW    : EVXForm_1<1417, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhesmianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSF       : EVXForm_1<1027, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSFA      : EVXForm_1<1059, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSFAAW    : EVXForm_1<1283, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSFANW    : EVXForm_1<1411, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSIAAW    : EVXForm_1<1281, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHESSIANW    : EVXForm_1<1409, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhessianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMI       : EVXForm_1<1032, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMIA      : EVXForm_1<1064, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMIAAW    : EVXForm_1<1288, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheumiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUMIANW    : EVXForm_1<1416, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheumianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUSIAAW    : EVXForm_1<1280, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheusiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHEUSIANW    : EVXForm_1<1408, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmheusianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMFAA    : EVXForm_1<1327, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogsmfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMFAN    : EVXForm_1<1455, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogsmfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMIAA    : EVXForm_1<1325, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogsmiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGSMIAN    : EVXForm_1<1453, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogsmian $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGUMIAA    : EVXForm_1<1324, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogumiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOGUMIAN    : EVXForm_1<1452, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhogumian $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMF       : EVXForm_1<1039, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMFA      : EVXForm_1<1071, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMFAAW    : EVXForm_1<1295, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMFANW    : EVXForm_1<1423, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMI       : EVXForm_1<1037, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMIA      : EVXForm_1<1069, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMIAAW    : EVXForm_1<1293, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSMIANW    : EVXForm_1<1421, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhosmianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSF       : EVXForm_1<1031, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossf $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSFA      : EVXForm_1<1063, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSFAAW    : EVXForm_1<1287, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossfaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSFANW    : EVXForm_1<1415, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossfanw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSIAAW    : EVXForm_1<1285, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOSSIANW    : EVXForm_1<1413, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhossianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMI       : EVXForm_1<1036, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhoumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMIA      : EVXForm_1<1068, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhoumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMIAAW    : EVXForm_1<1292, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhoumiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUMIANW    : EVXForm_1<1420, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhoumianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUSIAAW    : EVXForm_1<1284, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhousiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMHOUSIANW    : EVXForm_1<1412, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmhousianw $RT, $RA, $RB", IIC_VecFP>;
+
+
+// evmw* records.  As with the evmhe*/evmho* group, every definition is a
+// three-register EVXForm_1 record distinguished only by mnemonic and
+// extended opcode.
+// NOTE(review): presumably the SPE word multiply family - check the opcode
+// values against the SPE programming manual when touching them.
+def EVMWHSMF       : EVXForm_1<1103, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhsmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSMFA      : EVXForm_1<1135, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhsmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSMI       : EVXForm_1<1101, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhsmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSMIA      : EVXForm_1<1133, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhsmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSSF       : EVXForm_1<1095, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhssf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHSSFA      : EVXForm_1<1127, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhssfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHUMI       : EVXForm_1<1100, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWHUMIA      : EVXForm_1<1132, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwhumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSMIAAW    : EVXForm_1<1353, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlsmiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSMIANW    : EVXForm_1<1481, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlsmianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSSIAAW    : EVXForm_1<1345, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlssiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLSSIANW    : EVXForm_1<1473, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlssianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMI       : EVXForm_1<1096, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMIA      : EVXForm_1<1128, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMIAAW    : EVXForm_1<1352, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlumiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUMIANW    : EVXForm_1<1480, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlumianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUSIAAW    : EVXForm_1<1344, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlusiaaw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWLUSIANW    : EVXForm_1<1472, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwlusianw $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMF        : EVXForm_1<1115, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMFA       : EVXForm_1<1147, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMFAA      : EVXForm_1<1371, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMFAN      : EVXForm_1<1499, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMI        : EVXForm_1<1113, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMIA       : EVXForm_1<1145, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMIAA      : EVXForm_1<1369, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSMIAN      : EVXForm_1<1497, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwsmian $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSF        : EVXForm_1<1107, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwssf $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSFA       : EVXForm_1<1139, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwssfa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSFAA      : EVXForm_1<1363, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwssfaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWSSFAN      : EVXForm_1<1491, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwssfan $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMI        : EVXForm_1<1112, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwumi $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMIA       : EVXForm_1<1144, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwumia $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMIAA      : EVXForm_1<1368, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwumiaa $RT, $RA, $RB", IIC_VecFP>;
+def EVMWUMIAN      : EVXForm_1<1496, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evmwumian $RT, $RA, $RB", IIC_VecFP>;
+
+
+// Remaining logical, rotate, shift and splat records.  Register forms use
+// EVXForm_1 with gprc RA/RB; two-operand forms use EVXForm_2 (RB = 0).
+def EVNAND         : EVXForm_1<542, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evnand $RT, $RA, $RB", IIC_VecFP>;
+
+def EVNEG          : EVXForm_2<521, (outs gprc:$RT), (ins gprc:$RA),
+                               "evneg $RT, $RA", IIC_VecFP>;
+
+def EVNOR          : EVXForm_1<536, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evnor $RT, $RA, $RB", IIC_VecFP>;
+def EVOR           : EVXForm_1<535, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evor $RT, $RA, $RB", IIC_VecFP>;
+def EVORC          : EVXForm_1<539, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evorc $RT, $RA, $RB", IIC_VecFP>;
+
+// The "i"-suffixed rotate/shift records put a u5imm operand in the RB field
+// in place of a register.
+def EVRLWI         : EVXForm_1<554, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+                               "evrlwi $RT, $RA, $RB", IIC_VecFP>;
+def EVRLW          : EVXForm_1<552, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evrlw $RT, $RA, $RB", IIC_VecFP>;
+
+def EVRNDW         : EVXForm_2<524, (outs gprc:$RT), (ins gprc:$RA),
+                               "evrndw $RT, $RA", IIC_VecFP>;
+
+def EVSLWI         : EVXForm_1<550, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+                               "evslwi $RT, $RA, $RB", IIC_VecFP>;
+def EVSLW          : EVXForm_1<548, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evslw $RT, $RA, $RB", IIC_VecFP>;
+
+// The splat records place an immediate operand in the 5-bit RA field.
+// NOTE(review): the operand class is i32imm, wider than the field; confirm
+// whether out-of-range values are rejected or silently truncated.
+def EVSPLATFI      : EVXForm_2<555, (outs gprc:$RT), (ins i32imm:$RA),
+                               "evsplatfi $RT, $RA", IIC_VecFP>;
+def EVSPLATI       : EVXForm_2<553, (outs gprc:$RT), (ins i32imm:$RA),
+                               "evsplati $RT, $RA", IIC_VecFP>;
+
+def EVSRWIS        : EVXForm_1<547, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+                               "evsrwis $RT, $RA, $RB", IIC_VecFP>;
+def EVSRWIU        : EVXForm_1<546, (outs gprc:$RT), (ins gprc:$RA, u5imm:$RB),
+                               "evsrwiu $RT, $RA, $RB", IIC_VecFP>;
+def EVSRWS         : EVXForm_1<545, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evsrws $RT, $RA, $RB", IIC_VecFP>;
+def EVSRWU         : EVXForm_1<544, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evsrwu $RT, $RA, $RB", IIC_VecFP>;
+
+// "x"-suffixed stores: no outputs; RT, RA and RB are all gprc inputs.  Each
+// extended opcode is one less than that of the same-named EVXForm_D store
+// (e.g. 800 for evstddx vs. 801 for evstdd).
+def EVSTDDX        : EVXForm_1<800, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstddx $RT, $RA, $RB", IIC_VecFP>;
+def EVSTDHX        : EVXForm_1<804, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstdhx $RT, $RA, $RB", IIC_VecFP>;
+def EVSTDWX        : EVXForm_1<802, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstdwx $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWHEX       : EVXForm_1<816, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstwhex $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWHOX       : EVXForm_1<820, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstwhox $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWWEX       : EVXForm_1<824, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstwwex $RT, $RA, $RB", IIC_VecFP>;
+def EVSTWWOX       : EVXForm_1<828, (outs), (ins gprc:$RT, gprc:$RA, gprc:$RB),
+                               "evstwwox $RT, $RA, $RB", IIC_VecFP>;
+
+// Subtract records and evxor.  The evsubf*iaaw forms name only RT and RA
+// (RB = 0 via EVXForm_2).
+def EVSUBFSSIAAW   : EVXForm_2<1219, (outs gprc:$RT), (ins gprc:$RA),
+                               "evsubfssiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFSMIAAW   : EVXForm_2<1227, (outs gprc:$RT), (ins gprc:$RA),
+                               "evsubfsmiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFUMIAAW   : EVXForm_2<1226, (outs gprc:$RT), (ins gprc:$RA),
+                               "evsubfumiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFUSIAAW   : EVXForm_2<1218, (outs gprc:$RT), (ins gprc:$RA),
+                               "evsubfusiaaw $RT, $RA", IIC_VecFP>;
+def EVSUBFW        : EVXForm_1<516, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evsubfw $RT, $RA, $RB", IIC_VecFP>;
+// evsubifw carries its u5imm operand in the RA field and the register in RB.
+def EVSUBIFW       : EVXForm_1<518, (outs gprc:$RT), (ins u5imm:$RA, gprc:$RB),
+                               "evsubifw $RT, $RA, $RB", IIC_VecFP>;
+def EVXOR          : EVXForm_1<534, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB),
+                               "evxor $RT, $RA, $RB", IIC_VecFP>;
+
+} // HasSPE

diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 49bcc48..2c8f998 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td

@@ -47,23 +47,24 @@
 
   // Load indexed instructions
   let mayLoad = 1, canFoldAsLoad = 1 in {
-    def LXSDX : XForm_1<31, 588,
+    def LXSDX : XX1Form<31, 588,
                         (outs vsfrc:$XT), (ins memrr:$src),
                         "lxsdx $XT, $src", IIC_LdStLFD,
                         [(set f64:$XT, (load xoaddr:$src))]>;
 
-    def LXVD2X : XForm_1<31, 844,
+    def LXVD2X : XX1Form<31, 844,
                          (outs vsrc:$XT), (ins memrr:$src),
                          "lxvd2x $XT, $src", IIC_LdStLFD,
-                         [(set v2f64:$XT, (load xoaddr:$src))]>;
+                         [(set v2f64:$XT, (int_ppc_vsx_lxvd2x xoaddr:$src))]>;
 
-    def LXVDSX : XForm_1<31, 332,
+    def LXVDSX : XX1Form<31, 332,
                          (outs vsrc:$XT), (ins memrr:$src),
                          "lxvdsx $XT, $src", IIC_LdStLFD, []>;
 
-    def LXVW4X : XForm_1<31, 780,
+    def LXVW4X : XX1Form<31, 780,
                          (outs vsrc:$XT), (ins memrr:$src),
-                         "lxvw4x $XT, $src", IIC_LdStLFD, []>;
+                         "lxvw4x $XT, $src", IIC_LdStLFD,
+                         [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
   }
 
   // Store indexed instructions
@@ -76,11 +77,12 @@
     def STXVD2X : XX1Form<31, 972,
                          (outs), (ins vsrc:$XT, memrr:$dst),
                          "stxvd2x $XT, $dst", IIC_LdStSTFD,
-                         [(store v2f64:$XT, xoaddr:$dst)]>;
+                         [(int_ppc_vsx_stxvd2x v2f64:$XT, xoaddr:$dst)]>;
 
     def STXVW4X : XX1Form<31, 908,
                          (outs), (ins vsrc:$XT, memrr:$dst),
-                         "stxvw4x $XT, $dst", IIC_LdStSTFD, []>;
+                         "stxvw4x $XT, $dst", IIC_LdStSTFD,
+                         [(int_ppc_vsx_stxvw4x v4i32:$XT, xoaddr:$dst)]>;
   }
 
   // Add/Mul Instructions
@@ -641,24 +643,36 @@
   let isCommutable = 1 in {
   def XSMAXDP : XX3Form<60, 160,
                         (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
-                        "xsmaxdp $XT, $XA, $XB", IIC_VecFP, []>;
+                        "xsmaxdp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsfrc:$XT,
+                              (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>;
   def XSMINDP : XX3Form<60, 168,
                         (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
-                        "xsmindp $XT, $XA, $XB", IIC_VecFP, []>;
+                        "xsmindp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsfrc:$XT,
+                              (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>;
 
   def XVMAXDP : XX3Form<60, 224,
                         (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
-                        "xvmaxdp $XT, $XA, $XB", IIC_VecFP, []>;
+                        "xvmaxdp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsrc:$XT,
+                              (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>;
   def XVMINDP : XX3Form<60, 232,
                         (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
-                        "xvmindp $XT, $XA, $XB", IIC_VecFP, []>;
+                        "xvmindp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsrc:$XT,
+                              (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>;
 
   def XVMAXSP : XX3Form<60, 192,
                         (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
-                        "xvmaxsp $XT, $XA, $XB", IIC_VecFP, []>;
+                        "xvmaxsp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsrc:$XT,
+                              (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>;
   def XVMINSP : XX3Form<60, 200,
                         (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
-                        "xvminsp $XT, $XA, $XB", IIC_VecFP, []>;
+                        "xvminsp $XT, $XA, $XB", IIC_VecFP,
+                        [(set vsrc:$XT,
+                              (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>;
   } // isCommutable
 } // Uses = [RM]
 
@@ -715,6 +729,31 @@
                        (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM),
                        "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
 } // neverHasSideEffects
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1,    // Expanded after instruction selection.
+    PPC970_Single = 1 in {
+
+  def SELECT_CC_VSRC: Pseudo<(outs vsrc:$dst),
+                             (ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC),
+                             "#SELECT_CC_VSRC",
+                             []>;
+  def SELECT_VSRC: Pseudo<(outs vsrc:$dst),
+                          (ins crbitrc:$cond, vsrc:$T, vsrc:$F),
+                          "#SELECT_VSRC",
+                          [(set v2f64:$dst,
+                                (select i1:$cond, v2f64:$T, v2f64:$F))]>;
+  def SELECT_CC_VSFRC: Pseudo<(outs f8rc:$dst),
+                              (ins crrc:$cond, f8rc:$T, f8rc:$F,
+                               i32imm:$BROPC), "#SELECT_CC_VSFRC",
+                              []>;
+  def SELECT_VSFRC: Pseudo<(outs f8rc:$dst),
+                           (ins crbitrc:$cond, f8rc:$T, f8rc:$F),
+                           "#SELECT_VSFRC",
+                           [(set f64:$dst,
+                                 (select i1:$cond, f64:$T, f64:$F))]>;
+} // usesCustomInserter
 } // AddedComplexity
 
 def : InstAlias<"xvmovdp $XT, $XB",
@@ -811,6 +850,49 @@
 def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))),
           (XVCVSXWDP (XXSLDWI $C, $C, 1))>;
 
+// Loads.
+def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+
+// Stores.
+def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
+
+// Selects.
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
+          (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLE)),
+          (SELECT_VSRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETEQ)),
+          (SELECT_VSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGE)),
+          (SELECT_VSRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGT)),
+          (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETNE)),
+          (SELECT_VSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+          (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+          (SELECT_VSFRC (CRORC  $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+          (SELECT_VSFRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+          (SELECT_VSFRC (CRORC  $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+          (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+          (SELECT_VSFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+// Divides.
+def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B),
+          (XVDIVSP $A, $B)>;
+def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B),
+          (XVDIVDP $A, $B)>;
+
 } // AddedComplexity
 } // HasVSX
 

diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
deleted file mode 100644
index e5f113a..0000000
--- a/lib/Target/PowerPC/PPCJITInfo.cpp
+++ /dev/null

@@ -1,482 +0,0 @@
-//===-- PPCJITInfo.cpp - Implement the JIT interfaces for the PowerPC -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the JIT interfaces for the 32-bit PowerPC target.
-//
-//===----------------------------------------------------------------------===//
-
-#include "PPCJITInfo.h"
-#include "PPCRelocations.h"
-#include "PPCSubtarget.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Memory.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-static TargetJITInfo::JITCompilerFn JITCompilerFunction;
-
-PPCJITInfo::PPCJITInfo(PPCSubtarget &STI)
-    : Subtarget(STI), is64Bit(STI.isPPC64()) {
-  useGOT = 0;
-}
-
-#define BUILD_ADDIS(RD,RS,IMM16) \
-  ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
-#define BUILD_ORI(RD,RS,UIMM16) \
-  ((24 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
-#define BUILD_ORIS(RD,RS,UIMM16) \
-  ((25 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
-#define BUILD_RLDICR(RD,RS,SH,ME) \
-  ((30 << 26) | ((RS) << 21) | ((RD) << 16) | (((SH) & 31) << 11) | \
-   (((ME) & 63) << 6) | (1 << 2) | ((((SH) >> 5) & 1) << 1))
-#define BUILD_MTSPR(RS,SPR)      \
-  ((31 << 26) | ((RS) << 21) | ((SPR) << 16) | (467 << 1))
-#define BUILD_BCCTRx(BO,BI,LINK) \
-  ((19 << 26) | ((BO) << 21) | ((BI) << 16) | (528 << 1) | ((LINK) & 1))
-#define BUILD_B(TARGET, LINK) \
-  ((18 << 26) | (((TARGET) & 0x00FFFFFF) << 2) | ((LINK) & 1))
-
-// Pseudo-ops
-#define BUILD_LIS(RD,IMM16)    BUILD_ADDIS(RD,0,IMM16)
-#define BUILD_SLDI(RD,RS,IMM6) BUILD_RLDICR(RD,RS,IMM6,63-IMM6)
-#define BUILD_MTCTR(RS)        BUILD_MTSPR(RS,9)
-#define BUILD_BCTR(LINK)       BUILD_BCCTRx(20,0,LINK)
-
-static void EmitBranchToAt(uint64_t At, uint64_t To, bool isCall, bool is64Bit){
-  intptr_t Offset = ((intptr_t)To - (intptr_t)At) >> 2;
-  unsigned *AtI = (unsigned*)(intptr_t)At;
-
-  if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
-    AtI[0] = BUILD_B(Offset, isCall);     // b/bl target
-  } else if (!is64Bit) {
-    AtI[0] = BUILD_LIS(12, To >> 16);     // lis r12, hi16(address)
-    AtI[1] = BUILD_ORI(12, 12, To);       // ori r12, r12, lo16(address)
-    AtI[2] = BUILD_MTCTR(12);             // mtctr r12
-    AtI[3] = BUILD_BCTR(isCall);          // bctr/bctrl
-  } else {
-    AtI[0] = BUILD_LIS(12, To >> 48);      // lis r12, hi16(address)
-    AtI[1] = BUILD_ORI(12, 12, To >> 32);  // ori r12, r12, lo16(address)
-    AtI[2] = BUILD_SLDI(12, 12, 32);       // sldi r12, r12, 32
-    AtI[3] = BUILD_ORIS(12, 12, To >> 16); // oris r12, r12, hi16(address)
-    AtI[4] = BUILD_ORI(12, 12, To);        // ori r12, r12, lo16(address)
-    AtI[5] = BUILD_MTCTR(12);              // mtctr r12
-    AtI[6] = BUILD_BCTR(isCall);           // bctr/bctrl
-  }
-}
-
-extern "C" void PPC32CompilationCallback();
-extern "C" void PPC64CompilationCallback();
-
-// The first clause of the preprocessor directive looks wrong, but it is
-// necessary when compiling this code on non-PowerPC hosts.
-#if (!defined(__ppc__) && !defined(__powerpc__)) || defined(__powerpc64__) || defined(__ppc64__)
-void PPC32CompilationCallback() {
-  llvm_unreachable("This is not a 32bit PowerPC, you can't execute this!");
-}
-#elif !defined(__ELF__)
-// CompilationCallback stub - We can't use a C function with inline assembly in
-// it, because we the prolog/epilog inserted by GCC won't work for us.  Instead,
-// write our own wrapper, which does things our way, so we have complete control
-// over register saving and restoring.
-asm(
-    ".text\n"
-    ".align 2\n"
-    ".globl _PPC32CompilationCallback\n"
-"_PPC32CompilationCallback:\n"
-    // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
-    // FIXME: need to save v[0-19] for altivec?
-    // FIXME: could shrink frame
-    // Set up a proper stack frame
-    // FIXME Layout
-    //   PowerPC32 ABI linkage    -  24 bytes
-    //                 parameters -  32 bytes
-    //   13 double registers      - 104 bytes
-    //   8 int registers          -  32 bytes
-    "mflr r0\n"
-    "stw r0,  8(r1)\n"
-    "stwu r1, -208(r1)\n"
-    // Save all int arg registers
-    "stw r10, 204(r1)\n"    "stw r9,  200(r1)\n"
-    "stw r8,  196(r1)\n"    "stw r7,  192(r1)\n"
-    "stw r6,  188(r1)\n"    "stw r5,  184(r1)\n"
-    "stw r4,  180(r1)\n"    "stw r3,  176(r1)\n"
-    // Save all call-clobbered FP regs.
-    "stfd f13, 168(r1)\n"   "stfd f12, 160(r1)\n"
-    "stfd f11, 152(r1)\n"   "stfd f10, 144(r1)\n"
-    "stfd f9,  136(r1)\n"   "stfd f8,  128(r1)\n"
-    "stfd f7,  120(r1)\n"   "stfd f6,  112(r1)\n"
-    "stfd f5,  104(r1)\n"   "stfd f4,   96(r1)\n"
-    "stfd f3,   88(r1)\n"   "stfd f2,   80(r1)\n"
-    "stfd f1,   72(r1)\n"
-    // Arguments to Compilation Callback:
-    // r3 - our lr (address of the call instruction in stub plus 4)
-    // r4 - stub's lr (address of instruction that called the stub plus 4)
-    // r5 - is64Bit - always 0.
-    "mr   r3, r0\n"
-    "lwz  r2, 208(r1)\n" // stub's frame
-    "lwz  r4, 8(r2)\n" // stub's lr
-    "li   r5, 0\n"       // 0 == 32 bit
-    "bl _LLVMPPCCompilationCallback\n"
-    "mtctr r3\n"
-    // Restore all int arg registers
-    "lwz r10, 204(r1)\n"    "lwz r9,  200(r1)\n"
-    "lwz r8,  196(r1)\n"    "lwz r7,  192(r1)\n"
-    "lwz r6,  188(r1)\n"    "lwz r5,  184(r1)\n"
-    "lwz r4,  180(r1)\n"    "lwz r3,  176(r1)\n"
-    // Restore all FP arg registers
-    "lfd f13, 168(r1)\n"    "lfd f12, 160(r1)\n"
-    "lfd f11, 152(r1)\n"    "lfd f10, 144(r1)\n"
-    "lfd f9,  136(r1)\n"    "lfd f8,  128(r1)\n"
-    "lfd f7,  120(r1)\n"    "lfd f6,  112(r1)\n"
-    "lfd f5,  104(r1)\n"    "lfd f4,   96(r1)\n"
-    "lfd f3,   88(r1)\n"    "lfd f2,   80(r1)\n"
-    "lfd f1,   72(r1)\n"
-    // Pop 3 frames off the stack and branch to target
-    "lwz  r1, 208(r1)\n"
-    "lwz  r2, 8(r1)\n"
-    "mtlr r2\n"
-    "bctr\n"
-    );
-
-#else
-// ELF PPC 32 support
-
-// CompilationCallback stub - We can't use a C function with inline assembly in
-// it, because we the prolog/epilog inserted by GCC won't work for us.  Instead,
-// write our own wrapper, which does things our way, so we have complete control
-// over register saving and restoring.
-asm(
-    ".text\n"
-    ".align 2\n"
-    ".globl PPC32CompilationCallback\n"
-"PPC32CompilationCallback:\n"
-    // Make space for 8 ints r[3-10] and 8 doubles f[1-8] and the 
-    // FIXME: need to save v[0-19] for altivec?
-    // FIXME: could shrink frame
-    // Set up a proper stack frame
-    // FIXME Layout
-    //   8 double registers       -  64 bytes
-    //   8 int registers          -  32 bytes
-    "mflr 0\n"
-    "stw 0,  4(1)\n"
-    "stwu 1, -104(1)\n"
-    // Save all int arg registers
-    "stw 10, 100(1)\n"   "stw 9,  96(1)\n"
-    "stw 8,  92(1)\n"    "stw 7,  88(1)\n"
-    "stw 6,  84(1)\n"    "stw 5,  80(1)\n"
-    "stw 4,  76(1)\n"    "stw 3,  72(1)\n"
-    // Save all call-clobbered FP regs.
-    "stfd 8,  64(1)\n"
-    "stfd 7,  56(1)\n"   "stfd 6,  48(1)\n"
-    "stfd 5,  40(1)\n"   "stfd 4,  32(1)\n"
-    "stfd 3,  24(1)\n"   "stfd 2,  16(1)\n"
-    "stfd 1,  8(1)\n"
-    // Arguments to Compilation Callback:
-    // r3 - our lr (address of the call instruction in stub plus 4)
-    // r4 - stub's lr (address of instruction that called the stub plus 4)
-    // r5 - is64Bit - always 0.
-    "mr   3, 0\n"
-    "lwz  5, 104(1)\n" // stub's frame
-    "lwz  4, 4(5)\n" // stub's lr
-    "li   5, 0\n"       // 0 == 32 bit
-    "bl LLVMPPCCompilationCallback\n"
-    "mtctr 3\n"
-    // Restore all int arg registers
-    "lwz 10, 100(1)\n"   "lwz 9,  96(1)\n"
-    "lwz 8,  92(1)\n"    "lwz 7,  88(1)\n"
-    "lwz 6,  84(1)\n"    "lwz 5,  80(1)\n"
-    "lwz 4,  76(1)\n"    "lwz 3,  72(1)\n"
-    // Restore all FP arg registers
-    "lfd 8,  64(1)\n"
-    "lfd 7,  56(1)\n"    "lfd 6,  48(1)\n"
-    "lfd 5,  40(1)\n"    "lfd 4,  32(1)\n"
-    "lfd 3,  24(1)\n"    "lfd 2,  16(1)\n"
-    "lfd 1,  8(1)\n"
-    // Pop 3 frames off the stack and branch to target
-    "lwz  1, 104(1)\n"
-    "lwz  0, 4(1)\n"
-    "mtlr 0\n"
-    "bctr\n"
-    );
-#endif
-
-#if !defined(__powerpc64__) && !defined(__ppc64__)
-void PPC64CompilationCallback() {
-  llvm_unreachable("This is not a 64bit PowerPC, you can't execute this!");
-}
-#else
-#  ifdef __ELF__
-asm(
-    ".text\n"
-    ".align 2\n"
-    ".globl PPC64CompilationCallback\n"
-#if _CALL_ELF == 2
-    ".type PPC64CompilationCallback,@function\n"
-"PPC64CompilationCallback:\n"
-#else
-    ".section \".opd\",\"aw\",@progbits\n"
-    ".align 3\n"
-"PPC64CompilationCallback:\n"
-    ".quad .L.PPC64CompilationCallback,.TOC.@tocbase,0\n"
-    ".size PPC64CompilationCallback,24\n"
-    ".previous\n"
-    ".align 4\n"
-    ".type PPC64CompilationCallback,@function\n"
-".L.PPC64CompilationCallback:\n"
-#endif
-#  else
-asm(
-    ".text\n"
-    ".align 2\n"
-    ".globl _PPC64CompilationCallback\n"
-"_PPC64CompilationCallback:\n"
-#  endif
-    // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
-    // FIXME: need to save v[0-19] for altivec?
-    // Set up a proper stack frame
-    // Layout
-    //   PowerPC64 ABI linkage    -  48 bytes
-    //                 parameters -  64 bytes
-    //   13 double registers      - 104 bytes
-    //   8 int registers          -  64 bytes
-    "mflr 0\n"
-    "std  0,  16(1)\n"
-    "stdu 1, -280(1)\n"
-    // Save all int arg registers
-    "std 10, 272(1)\n"    "std 9,  264(1)\n"
-    "std 8,  256(1)\n"    "std 7,  248(1)\n"
-    "std 6,  240(1)\n"    "std 5,  232(1)\n"
-    "std 4,  224(1)\n"    "std 3,  216(1)\n"
-    // Save all call-clobbered FP regs.
-    "stfd 13, 208(1)\n"    "stfd 12, 200(1)\n"
-    "stfd 11, 192(1)\n"    "stfd 10, 184(1)\n"
-    "stfd 9,  176(1)\n"    "stfd 8,  168(1)\n"
-    "stfd 7,  160(1)\n"    "stfd 6,  152(1)\n"
-    "stfd 5,  144(1)\n"    "stfd 4,  136(1)\n"
-    "stfd 3,  128(1)\n"    "stfd 2,  120(1)\n"
-    "stfd 1,  112(1)\n"
-    // Arguments to Compilation Callback:
-    // r3 - our lr (address of the call instruction in stub plus 4)
-    // r4 - stub's lr (address of instruction that called the stub plus 4)
-    // r5 - is64Bit - always 1.
-    "mr   3, 0\n"      // return address (still in r0)
-    "ld   5, 280(1)\n" // stub's frame
-    "ld   4, 16(5)\n"  // stub's lr
-    "li   5, 1\n"      // 1 == 64 bit
-#  ifdef __ELF__
-    "bl LLVMPPCCompilationCallback\n"
-    "nop\n"
-#  else
-    "bl _LLVMPPCCompilationCallback\n"
-#  endif
-    "mtctr 3\n"
-    // Restore all int arg registers
-    "ld 10, 272(1)\n"    "ld 9,  264(1)\n"
-    "ld 8,  256(1)\n"    "ld 7,  248(1)\n"
-    "ld 6,  240(1)\n"    "ld 5,  232(1)\n"
-    "ld 4,  224(1)\n"    "ld 3,  216(1)\n"
-    // Restore all FP arg registers
-    "lfd 13, 208(1)\n"    "lfd 12, 200(1)\n"
-    "lfd 11, 192(1)\n"    "lfd 10, 184(1)\n"
-    "lfd 9,  176(1)\n"    "lfd 8,  168(1)\n"
-    "lfd 7,  160(1)\n"    "lfd 6,  152(1)\n"
-    "lfd 5,  144(1)\n"    "lfd 4,  136(1)\n"
-    "lfd 3,  128(1)\n"    "lfd 2,  120(1)\n"
-    "lfd 1,  112(1)\n"
-    // Pop 3 frames off the stack and branch to target
-    "ld  1, 280(1)\n"
-    "ld  0, 16(1)\n"
-    "mtlr 0\n"
-    // XXX: any special TOC handling in the ELF case for JIT?
-    "bctr\n"
-    );
-#endif
-
-extern "C" {
-LLVM_LIBRARY_VISIBILITY void *
-LLVMPPCCompilationCallback(unsigned *StubCallAddrPlus4,
-                           unsigned *OrigCallAddrPlus4,
-                           bool is64Bit) {
-  // Adjust the pointer to the address of the call instruction in the stub
-  // emitted by emitFunctionStub, rather than the instruction after it.
-  unsigned *StubCallAddr = StubCallAddrPlus4 - 1;
-  unsigned *OrigCallAddr = OrigCallAddrPlus4 - 1;
-
-  void *Target = JITCompilerFunction(StubCallAddr);
-
-  // Check to see if *OrigCallAddr is a 'bl' instruction, and if we can rewrite
-  // it to branch directly to the destination.  If so, rewrite it so it does not
-  // need to go through the stub anymore.
-  unsigned OrigCallInst = *OrigCallAddr;
-  if ((OrigCallInst >> 26) == 18) {     // Direct call.
-    intptr_t Offset = ((intptr_t)Target - (intptr_t)OrigCallAddr) >> 2;
-    
-    if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
-      // Clear the original target out.
-      OrigCallInst &= (63 << 26) | 3;
-      // Fill in the new target.
-      OrigCallInst |= (Offset & ((1 << 24)-1)) << 2;
-      // Replace the call.
-      *OrigCallAddr = OrigCallInst;
-    }
-  }
-
-  // Assert that we are coming from a stub that was created with our
-  // emitFunctionStub.
-  if ((*StubCallAddr >> 26) == 18)
-    StubCallAddr -= 3;
-  else {
-  assert((*StubCallAddr >> 26) == 19 && "Call in stub is not indirect!");
-    StubCallAddr -= is64Bit ? 9 : 6;
-  }
-
-  // Rewrite the stub with an unconditional branch to the target, for any users
-  // who took the address of the stub.
-  EmitBranchToAt((intptr_t)StubCallAddr, (intptr_t)Target, false, is64Bit);
-  sys::Memory::InvalidateInstructionCache(StubCallAddr, 7*4);
-
-  // Put the address of the target function to call and the address to return to
-  // after calling the target function in a place that is easy to get on the
-  // stack after we restore all regs.
-  return Target;
-}
-}
-
-
-
-TargetJITInfo::LazyResolverFn
-PPCJITInfo::getLazyResolverFunction(JITCompilerFn Fn) {
-  JITCompilerFunction = Fn;
-  return is64Bit ? PPC64CompilationCallback : PPC32CompilationCallback;
-}
-
-TargetJITInfo::StubLayout PPCJITInfo::getStubLayout() {
-  // The stub contains up to 10 4-byte instructions, aligned at 4 bytes: 3
-  // instructions to save the caller's address if this is a lazy-compilation
-  // stub, plus a 1-, 4-, or 7-instruction sequence to load an arbitrary address
-  // into a register and jump through it.
-  StubLayout Result = {10*4, 4};
-  return Result;
-}
-
-#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
-defined(__APPLE__)
-extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
-#endif
-
-void *PPCJITInfo::emitFunctionStub(const Function* F, void *Fn,
-                                   JITCodeEmitter &JCE) {
-  // If this is just a call to an external function, emit a branch instead of a
-  // call.  The code is the same except for one bit of the last instruction.
-  if (Fn != (void*)(intptr_t)PPC32CompilationCallback && 
-      Fn != (void*)(intptr_t)PPC64CompilationCallback) {
-    void *Addr = (void*)JCE.getCurrentPCValue();
-    JCE.emitWordBE(0);
-    JCE.emitWordBE(0);
-    JCE.emitWordBE(0);
-    JCE.emitWordBE(0);
-    JCE.emitWordBE(0);
-    JCE.emitWordBE(0);
-    JCE.emitWordBE(0);
-    EmitBranchToAt((intptr_t)Addr, (intptr_t)Fn, false, is64Bit);
-    sys::Memory::InvalidateInstructionCache(Addr, 7*4);
-    return Addr;
-  }
-
-  void *Addr = (void*)JCE.getCurrentPCValue();
-  if (is64Bit) {
-    JCE.emitWordBE(0xf821ffb1);     // stdu r1,-80(r1)
-    JCE.emitWordBE(0x7d6802a6);     // mflr r11
-    JCE.emitWordBE(0xf9610060);     // std r11, 96(r1)
-  } else if (Subtarget.isDarwinABI()){
-    JCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
-    JCE.emitWordBE(0x7d6802a6);     // mflr r11
-    JCE.emitWordBE(0x91610028);     // stw r11, 40(r1)
-  } else {
-    JCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
-    JCE.emitWordBE(0x7d6802a6);     // mflr r11
-    JCE.emitWordBE(0x91610024);     // stw r11, 36(r1)
-  }
-  intptr_t BranchAddr = (intptr_t)JCE.getCurrentPCValue();
-  JCE.emitWordBE(0);
-  JCE.emitWordBE(0);
-  JCE.emitWordBE(0);
-  JCE.emitWordBE(0);
-  JCE.emitWordBE(0);
-  JCE.emitWordBE(0);
-  JCE.emitWordBE(0);
-  EmitBranchToAt(BranchAddr, (intptr_t)Fn, true, is64Bit);
-  sys::Memory::InvalidateInstructionCache(Addr, 10*4);
-  return Addr;
-}
-
-
-void PPCJITInfo::relocate(void *Function, MachineRelocation *MR,
-                          unsigned NumRelocs, unsigned char* GOTBase) {
-  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
-    unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
-    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
-    switch ((PPC::RelocationType)MR->getRelocationType()) {
-    default: llvm_unreachable("Unknown relocation type!");
-    case PPC::reloc_pcrel_bx:
-      // PC-relative relocation for b and bl instructions.
-      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
-      assert(ResultPtr >= -(1 << 23) && ResultPtr < (1 << 23) &&
-             "Relocation out of range!");
-      *RelocPos |= (ResultPtr & ((1 << 24)-1))  << 2;
-      break;
-    case PPC::reloc_pcrel_bcx:
-      // PC-relative relocation for BLT,BLE,BEQ,BGE,BGT,BNE, or other
-      // bcx instructions.
-      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
-      assert(ResultPtr >= -(1 << 13) && ResultPtr < (1 << 13) &&
-             "Relocation out of range!");
-      *RelocPos |= (ResultPtr & ((1 << 14)-1))  << 2;
-      break;
-    case PPC::reloc_absolute_high:     // high bits of ref -> low 16 of instr
-    case PPC::reloc_absolute_low: {    // low bits of ref  -> low 16 of instr
-      ResultPtr += MR->getConstantVal();
-
-      // If this is a high-part access, get the high-part.
-      if (MR->getRelocationType() == PPC::reloc_absolute_high) {
-        // If the low part will have a carry (really a borrow) from the low
-        // 16-bits into the high 16, add a bit to borrow from.
-        if (((int)ResultPtr << 16) < 0)
-          ResultPtr += 1 << 16;
-        ResultPtr >>= 16;
-      }
-
-      // Do the addition then mask, so the addition does not overflow the 16-bit
-      // immediate section of the instruction.
-      unsigned LowBits  = (*RelocPos + ResultPtr) & 65535;
-      unsigned HighBits = *RelocPos & ~65535;
-      *RelocPos = LowBits | HighBits;  // Slam into low 16-bits
-      break;
-    }
-    case PPC::reloc_absolute_low_ix: {  // low bits of ref  -> low 14 of instr
-      ResultPtr += MR->getConstantVal();
-      // Do the addition then mask, so the addition does not overflow the 16-bit
-      // immediate section of the instruction.
-      unsigned LowBits  = (*RelocPos + ResultPtr) & 0xFFFC;
-      unsigned HighBits = *RelocPos & 0xFFFF0003;
-      *RelocPos = LowBits | HighBits;  // Slam into low 14-bits.
-      break;
-    }
-    }
-  }
-}
-
-void PPCJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
-  EmitBranchToAt((intptr_t)Old, (intptr_t)New, false, is64Bit);
-  sys::Memory::InvalidateInstructionCache(Old, 7*4);
-}

diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h
deleted file mode 100644
index b6b37ff..0000000
--- a/lib/Target/PowerPC/PPCJITInfo.h
+++ /dev/null

@@ -1,46 +0,0 @@
-//===-- PPCJITInfo.h - PowerPC impl. of the JIT interface -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the PowerPC implementation of the TargetJITInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef POWERPC_JITINFO_H
-#define POWERPC_JITINFO_H
-
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/Target/TargetJITInfo.h"
-
-namespace llvm {
-class PPCSubtarget;
-class PPCJITInfo : public TargetJITInfo {
-protected:
-  PPCSubtarget &Subtarget;
-  bool is64Bit;
-
-public:
-  PPCJITInfo(PPCSubtarget &STI);
-
-  StubLayout getStubLayout() override;
-  void *emitFunctionStub(const Function *F, void *Fn,
-                         JITCodeEmitter &JCE) override;
-  LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
-  void relocate(void *Function, MachineRelocation *MR, unsigned NumRelocs,
-                unsigned char *GOTBase) override;
-
-  /// replaceMachineCodeForFunction - Make it so that calling the function
-  /// whose machine code is at OLD turns into a call to NEW, perhaps by
-  /// overwriting OLD with a branch to NEW.  This is used for self-modifying
-  /// code.
-  ///
-  void replaceMachineCodeForFunction(void *Old, void *New) override;
-};
-}
-
-#endif

diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index f8e84a5..880b520 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp

@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPC.h"
+#include "PPCSubtarget.h"
 #include "MCTargetDesc/PPCMCExpr.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
@@ -37,14 +38,16 @@
 static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
   const TargetMachine &TM = AP.TM;
   Mangler *Mang = AP.Mang;
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   MCContext &Ctx = AP.OutContext;
+  bool isDarwin = Triple(TM.getTargetTriple()).isOSDarwin();
 
   SmallString<128> Name;
   StringRef Suffix;
-  if (MO.getTargetFlags() == PPCII::MO_DARWIN_STUB)
-    Suffix = "$stub";
-  else if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG)
+  if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB) {
+    if (isDarwin)
+      Suffix = "$stub";
+  } else if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG)
     Suffix = "$non_lazy_ptr";
 
   if (!Suffix.empty())
@@ -68,7 +71,7 @@
 
   // If the target flags on the operand changes the name of the symbol, do that
   // before we return the symbol.
-  if (MO.getTargetFlags() == PPCII::MO_DARWIN_STUB) {
+  if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && isDarwin) {
     MachineModuleInfoImpl::StubValueTy &StubSym =
       getMachOMMI(AP).getFnStubEntry(Sym);
     if (StubSym.getPointer())
@@ -134,8 +137,17 @@
     case PPCII::MO_TLS:
       RefKind = MCSymbolRefExpr::VK_PPC_TLS;
       break;
+    case PPCII::MO_TLSGD:
+      RefKind = MCSymbolRefExpr::VK_PPC_TLSGD;
+      break;
+    case PPCII::MO_TLSLD:
+      RefKind = MCSymbolRefExpr::VK_PPC_TLSLD;
+      break;
   }
 
+  if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && !isDarwin)
+    RefKind = MCSymbolRefExpr::VK_PLT;
+
   const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, RefKind, Ctx);
 
   if (!MO.isJTI() && MO.getOffset())

diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index 6a0aec8..4aff95a 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp

@@ -8,8 +8,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPCMachineFunctionInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
 void PPCFunctionInfo::anchor() { }
 
+MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const {
+  return MF.getContext().GetOrCreateSymbol(
+      Twine(MF.getSubtarget().getDataLayout()->getPrivateGlobalPrefix()) +
+      Twine(MF.getFunctionNumber()) + "$poff"); // <prefix><fn number>$poff
+}

diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 33f843d..83de799 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef PPC_MACHINE_FUNCTION_INFO_H
-#define PPC_MACHINE_FUNCTION_INFO_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/MachineFunction.h"
 
@@ -92,6 +92,12 @@
   /// 64-bit SVR4 ABI.
   SmallVector<unsigned, 3> MustSaveCRs;
 
+  /// Hold onto our MachineFunction context.
+  MachineFunction &MF;
+
+  /// Whether this uses the PIC Base register or not.
+  bool UsesPICBase;
+
 public:
   explicit PPCFunctionInfo(MachineFunction &MF) 
     : FramePointerSaveIndex(0),
@@ -109,7 +115,9 @@
       VarArgsStackOffset(0),
       VarArgsNumGPR(0),
       VarArgsNumFPR(0),
-      CRSpillFrameIndex(0) {}
+      CRSpillFrameIndex(0),
+      MF(MF), // Owning function; read by getPICOffsetSymbol().
+      UsesPICBase(false) {}
 
   int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
   void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
@@ -170,6 +178,11 @@
   const SmallVectorImpl<unsigned> &
     getMustSaveCRs() const { return MustSaveCRs; }
   void addMustSaveCR(unsigned Reg) { MustSaveCRs.push_back(Reg); }
+
+  void setUsesPICBase(bool uses) { UsesPICBase = uses; }
+  bool usesPICBase() const { return UsesPICBase; }
+
+  MCSymbol *getPICOffsetSymbol() const;
 };
 
 } // end of namespace llvm

diff --git a/lib/Target/PowerPC/PPCPerfectShuffle.h b/lib/Target/PowerPC/PPCPerfectShuffle.h
index 17b836d..8a1d680 100644
--- a/lib/Target/PowerPC/PPCPerfectShuffle.h
+++ b/lib/Target/PowerPC/PPCPerfectShuffle.h

@@ -12,6 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCPERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCPERFECTSHUFFLE_H
+
 // 31 entries have cost 0
 // 292 entries have cost 1
 // 1384 entries have cost 2
@@ -6584,3 +6587,5 @@
   835584U,	// <u,u,u,u>: Cost 0 copy LHS
   0
 };
+
+#endif

diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index eca774e..9b9966f 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp

@@ -140,8 +140,8 @@
 
 BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
-  const PPCFrameLowering *PPCFI =
-    static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
+  const PPCFrameLowering *PPCFI = static_cast<const PPCFrameLowering *>(
+      MF.getSubtarget().getFrameLowering());
 
   // The ZERO register is not really a register, but the representation of r0
   // when used in instructions that treat r0 as the constant 0.
@@ -199,7 +199,16 @@
   if (PPCFI->needsFP(MF))
     Reserved.set(PPC::R31);
 
-  if (hasBasePointer(MF))
+  if (hasBasePointer(MF)) {
+  	if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() &&
+        MF.getTarget().getRelocationModel() == Reloc::PIC_)
+      Reserved.set(PPC::R29);
+    else
+      Reserved.set(PPC::R30);
+  }
+
+  if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() &&
+      MF.getTarget().getRelocationModel() == Reloc::PIC_)
     Reserved.set(PPC::R30);
 
   // Reserve Altivec registers when Altivec is unavailable.
@@ -214,7 +223,7 @@
 unsigned
 PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                          MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   const unsigned DefaultSafety = 1;
 
   switch (RC->getID()) {
@@ -278,7 +287,7 @@
   // Get the frame info.
   MachineFrameInfo *MFI = MF.getFrameInfo();
   // Get the instruction info.
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   // Determine whether 64-bit pointers are used.
   bool LP64 = Subtarget.isPPC64();
   DebugLoc dl = MI.getDebugLoc();
@@ -289,7 +298,10 @@
   unsigned FrameSize = MFI->getStackSize();
   
   // Get stack alignments.
-  unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned TargetAlign = MF.getTarget()
+                             .getSubtargetImpl()
+                             ->getFrameLowering()
+                             ->getStackAlignment();
   unsigned MaxAlign = MFI->getMaxAlignment();
   assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
          "Maximum call-frame size not sufficiently aligned");
@@ -394,7 +406,7 @@
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -438,7 +450,7 @@
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -511,7 +523,7 @@
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -554,7 +566,7 @@
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -601,7 +613,7 @@
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
@@ -626,7 +638,7 @@
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
@@ -706,7 +718,7 @@
   // Get the basic block's function.
   MachineFunction &MF = *MBB.getParent();
   // Get the instruction info.
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   // Get the frame info.
   MachineFrameInfo *MFI = MF.getFrameInfo();
   DebugLoc dl = MI.getDebugLoc();
@@ -831,7 +843,7 @@
 }
 
 unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   if (!Subtarget.isPPC64())
     return TFI->hasFP(MF) ? PPC::R31 : PPC::R1;
@@ -843,7 +855,14 @@
   if (!hasBasePointer(MF))
     return getFrameRegister(MF);
 
-  return Subtarget.isPPC64() ? PPC::X30 : PPC::R30;
+  if (Subtarget.isPPC64())
+    return PPC::X30;
+
+  if (Subtarget.isSVR4ABI() &&
+      MF.getTarget().getRelocationModel() == Reloc::PIC_)
+    return PPC::R29;
+
+  return PPC::R30;
 }
 
 bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
@@ -868,7 +887,10 @@
 bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *F = MF.getFunction();
-  unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned StackAlign = MF.getTarget()
+                            .getSubtargetImpl()
+                            ->getFrameLowering()
+                            ->getStackAlignment();
   bool requiresRealignment =
     ((MFI->getMaxAlignment() > StackAlign) ||
      F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
@@ -885,16 +907,6 @@
 needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   assert(Offset < 0 && "Local offset must be negative");
 
-  unsigned FIOperandNum = 0;
-  while (!MI->getOperand(FIOperandNum).isFI()) {
-    ++FIOperandNum;
-    assert(FIOperandNum < MI->getNumOperands() &&
-           "Instr doesn't have FrameIndex operand!");
-  }
-
-  unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum);
-  Offset += MI->getOperand(OffsetOperandNo).getImm();
-
   // It's the load/store FI references that cause issues, as it can be difficult
   // to materialize the offset if it won't fit in the literal field. Estimate
   // based on the size of the local frame and some conservative assumptions
@@ -916,8 +928,8 @@
   MachineBasicBlock &MBB = *MI->getParent();
   MachineFunction &MF = *MBB.getParent();
 
-  const PPCFrameLowering *PPCFI =
-    static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
+  const PPCFrameLowering *PPCFI = static_cast<const PPCFrameLowering *>(
+      MF.getSubtarget().getFrameLowering());
   unsigned StackEst =
     PPCFI->determineFrameLayout(MF, false, true);
 
@@ -951,7 +963,7 @@
     DL = Ins->getDebugLoc();
 
   const MachineFunction &MF = *MBB->getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const MCInstrDesc &MCID = TII.get(ADDriOpc);
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
@@ -976,7 +988,7 @@
 
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const MCInstrDesc &MCID = MI.getDesc();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   MRI.constrainRegClass(BaseReg,
@@ -985,6 +997,16 @@
 
 bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                          int64_t Offset) const {
+  unsigned FIOperandNum = 0;
+  while (!MI->getOperand(FIOperandNum).isFI()) {
+    ++FIOperandNum;
+    assert(FIOperandNum < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum);
+  Offset += MI->getOperand(OffsetOperandNo).getImm();
+
   return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
          (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0));
 }

diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 13a35f6..c182f95 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef POWERPC32_REGISTERINFO_H
-#define POWERPC32_REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCREGISTERINFO_H
 
 #include "PPC.h"
 #include "llvm/ADT/DenseMap.h"

diff --git a/lib/Target/PowerPC/PPCRelocations.h b/lib/Target/PowerPC/PPCRelocations.h
deleted file mode 100644
index 0b392f9..0000000
--- a/lib/Target/PowerPC/PPCRelocations.h
+++ /dev/null

@@ -1,56 +0,0 @@
-//===-- PPCRelocations.h - PPC Code Relocations -----------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the PowerPC 32-bit target-specific relocation types.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PPCRELOCATIONS_H
-#define PPCRELOCATIONS_H
-
-#include "llvm/CodeGen/MachineRelocation.h"
-
-// Hack to rid us of a PPC pre-processor symbol which is erroneously
-// defined in a PowerPC header file (bug in Linux/PPC)
-#ifdef PPC
-#undef PPC
-#endif
-
-namespace llvm {
-  namespace PPC {
-    enum RelocationType {
-      // reloc_vanilla - A standard relocation, where the address of the
-      // relocated object completely overwrites the address of the relocation.
-      reloc_vanilla,
-    
-      // reloc_pcrel_bx - PC relative relocation, for the b or bl instructions.
-      reloc_pcrel_bx,
-
-      // reloc_pcrel_bcx - PC relative relocation, for BLT,BLE,BEQ,BGE,BGT,BNE,
-      // and other bcx instructions.
-      reloc_pcrel_bcx,
-
-      // reloc_absolute_high - Absolute relocation, for the loadhi instruction
-      // (which is really addis).  Add the high 16-bits of the specified global
-      // address into the low 16-bits of the instruction.
-      reloc_absolute_high,
-
-      // reloc_absolute_low - Absolute relocation, for the la instruction (which
-      // is really an addi).  Add the low 16-bits of the specified global
-      // address into the low 16-bits of the instruction.
-      reloc_absolute_low,
-      
-      // reloc_absolute_low_ix - Absolute relocation for the 64-bit load/store
-      // instruction which have two implicit zero bits.
-      reloc_absolute_low_ix
-    };
-  }
-}
-
-#endif

diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 1221d41..7f80121 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td

@@ -106,6 +106,7 @@
 def IIC_SprSLBMTE    : InstrItinClass;
 def IIC_SprSLBMFEE   : InstrItinClass;
 def IIC_SprSLBIA     : InstrItinClass;
+def IIC_SprTLBIA     : InstrItinClass;
 def IIC_SprTLBIEL    : InstrItinClass;
 def IIC_SprTLBIE     : InstrItinClass;
 

diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/lib/Target/PowerPC/PPCSelectionDAGInfo.h
index b2e7f3b..2c1378d 100644
--- a/lib/Target/PowerPC/PPCSelectionDAGInfo.h
+++ b/lib/Target/PowerPC/PPCSelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef POWERPCCSELECTIONDAGINFO_H
-#define POWERPCCSELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_POWERPC_PPCSELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 

diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 2e1b74a..04e7ec6 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp

@@ -33,13 +33,12 @@
 #include "PPCGenSubtargetInfo.inc"
 
 /// Return the datalayout string of a subtarget.
-static std::string getDataLayoutString(const PPCSubtarget &ST) {
-  const Triple &T = ST.getTargetTriple();
-
+static std::string getDataLayoutString(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
   std::string Ret;
 
   // Most PPC* platforms are big endian, PPC64LE is little endian.
-  if (ST.isLittleEndian())
+  if (T.getArch() == Triple::ppc64le)
     Ret = "e";
   else
     Ret = "E";
@@ -48,18 +47,18 @@
 
   // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit
   // pointers.
-  if (!ST.isPPC64() || T.getOS() == Triple::Lv2)
+  if (!is64Bit || T.getOS() == Triple::Lv2)
     Ret += "-p:32:32";
 
   // Note, the alignment values for f64 and i64 on ppc64 in Darwin
   // documentation are wrong; these are correct (i.e. "what gcc does").
-  if (ST.isPPC64() || ST.isSVR4ABI())
+  if (is64Bit || !T.isOSDarwin())
     Ret += "-i64:64";
   else
     Ret += "-f64:32:64";
 
   // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones.
-  if (ST.isPPC64())
+  if (is64Bit)
     Ret += "-n32:64";
   else
     Ret += "-n32";
@@ -70,47 +69,20 @@
 PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU,
                                                             StringRef FS) {
   initializeEnvironment();
-  resetSubtargetFeatures(CPU, FS);
+  initSubtargetFeatures(CPU, FS);
   return *this;
 }
 
 PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
-                           const std::string &FS, PPCTargetMachine &TM,
-                           bool is64Bit, CodeGenOpt::Level OptLevel)
-    : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT),
-      OptLevel(OptLevel),
-      FrameLowering(initializeSubtargetDependencies(CPU, FS)),
-      DL(getDataLayoutString(*this)), InstrInfo(*this), JITInfo(*this),
+                           const std::string &FS, const PPCTargetMachine &TM)
+    : PPCGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT),
+      DL(getDataLayoutString(TargetTriple)),
+      IsPPC64(TargetTriple.getArch() == Triple::ppc64 ||
+              TargetTriple.getArch() == Triple::ppc64le),
+      TargetABI(PPC_ABI_UNKNOWN),
+      FrameLowering(initializeSubtargetDependencies(CPU, FS)), InstrInfo(*this),
       TLInfo(TM), TSInfo(&DL) {}
 
-/// SetJITMode - This is called to inform the subtarget info that we are
-/// producing code for the JIT.
-void PPCSubtarget::SetJITMode() {
-  // JIT mode doesn't want lazy resolver stubs, it knows exactly where
-  // everything is.  This matters for PPC64, which codegens in PIC mode without
-  // stubs.
-  HasLazyResolverStubs = false;
-
-  // Calls to external functions need to use indirect calls
-  IsJITCodeModel = true;
-}
-
-void PPCSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
-  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
-  Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
-                                           "target-cpu");
-  Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
-                                          "target-features");
-  std::string CPU =
-    !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : "";
-  std::string FS =
-    !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
-  if (!FS.empty()) {
-    initializeEnvironment();
-    resetSubtargetFeatures(CPU, FS);
-  }
-}
-
 void PPCSubtarget::initializeEnvironment() {
   StackAlignment = 16;
   DarwinDirective = PPC::DIR_NONE;
@@ -119,8 +91,10 @@
   Use64BitRegs = false;
   UseCRBits = false;
   HasAltivec = false;
+  HasSPE = false;
   HasQPX = false;
   HasVSX = false;
+  HasP8Vector = false;
   HasFCPSGN = false;
   HasFSQRT = false;
   HasFRE = false;
@@ -136,13 +110,16 @@
   HasPOPCNTD = false;
   HasLDBRX = false;
   IsBookE = false;
+  HasOnlyMSYNC = false;
+  IsPPC4xx = false;
+  IsPPC6xx = false;
+  IsE500 = false;
   DeprecatedMFTB = false;
   DeprecatedDST = false;
   HasLazyResolverStubs = false;
-  IsJITCodeModel = false;
 }
 
-void PPCSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
+void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   // Determine default and user specified characteristics
   std::string CPUName = CPU;
   if (CPUName.empty())
@@ -156,35 +133,13 @@
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUName);
 
-  // Make sure 64-bit features are available when CPUname is generic
-  std::string FullFS = FS;
-
-  // If we are generating code for ppc64, verify that options make sense.
-  if (IsPPC64) {
-    Has64BitSupport = true;
-    // Silently force 64-bit register use on ppc64.
-    Use64BitRegs = true;
-    if (!FullFS.empty())
-      FullFS = "+64bit," + FullFS;
-    else
-      FullFS = "+64bit";
-  }
-
-  // At -O2 and above, track CR bits as individual registers.
-  if (OptLevel >= CodeGenOpt::Default) {
-    if (!FullFS.empty())
-      FullFS = "+crbits," + FullFS;
-    else
-      FullFS = "+crbits";
-  }
-
   // Parse features string.
-  ParseSubtargetFeatures(CPUName, FullFS);
+  ParseSubtargetFeatures(CPUName, FS);
 
   // If the user requested use of 64-bit regs, but the cpu selected doesn't
   // support it, ignore.
-  if (use64BitRegs() && !has64BitSupport())
-    Use64BitRegs = false;
+  if (IsPPC64 && has64BitSupport())
+    Use64BitRegs = true;
 
   // Set up darwin-specific properties.
   if (isDarwin())
@@ -201,8 +156,20 @@
 
   // FIXME: For now, we disable VSX in little-endian mode until endian
   // issues in those instructions can be addressed.
-  if (IsLittleEndian)
+  if (IsLittleEndian) {
     HasVSX = false;
+    HasP8Vector = false;
+  }
+
+  // Determine default ABI.
+  if (TargetABI == PPC_ABI_UNKNOWN) {
+    if (!isDarwin() && IsPPC64) {
+      if (IsLittleEndian)
+        TargetABI = PPC_ABI_ELFv2;
+      else
+        TargetABI = PPC_ABI_ELFv1;
+    }
+  }
 }
 
 /// hasLazyResolverStub - Return true if accesses to the specified global have
@@ -213,31 +180,13 @@
   // We never have stubs if HasLazyResolverStubs=false or if in static mode.
   if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static)
     return false;
-  // If symbol visibility is hidden, the extra load is not needed if
-  // the symbol is definitely defined in the current translation unit.
-  bool isDecl = GV->isDeclaration() && !GV->isMaterializable();
+  bool isDecl = GV->isDeclaration();
   if (GV->hasHiddenVisibility() && !isDecl && !GV->hasCommonLinkage())
     return false;
   return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
          GV->hasCommonLinkage() || isDecl;
 }
 
-bool PPCSubtarget::enablePostRAScheduler(
-           CodeGenOpt::Level OptLevel,
-           TargetSubtargetInfo::AntiDepBreakMode& Mode,
-           RegClassVector& CriticalPathRCs) const {
-  Mode = TargetSubtargetInfo::ANTIDEP_ALL;
-
-  CriticalPathRCs.clear();
-
-  if (isPPC64())
-    CriticalPathRCs.push_back(&PPC::G8RCRegClass);
-  else
-    CriticalPathRCs.push_back(&PPC::GPRCRegClass);
-    
-  return OptLevel >= CodeGenOpt::Default;
-}
-
 // Embedded cores need aggressive scheduling (and some others also benefit).
 static bool needsAggressiveScheduling(unsigned Directive) {
   switch (Directive) {
@@ -259,6 +208,19 @@
   return needsAggressiveScheduling(DarwinDirective);
 }
 
+// This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+bool PPCSubtarget::enablePostMachineScheduler() const { return true; }
+
+PPCGenSubtargetInfo::AntiDepBreakMode PPCSubtarget::getAntiDepBreakMode() const {
+  return TargetSubtargetInfo::ANTIDEP_ALL;
+}
+
+void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(isPPC64() ?
+                            &PPC::G8RCRegClass : &PPC::GPRCRegClass);
+}
+
 void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                        MachineInstr *begin,
                                        MachineInstr *end,

diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 2a16699..1df19c3 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h

@@ -11,13 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef POWERPCSUBTARGET_H
-#define POWERPCSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H
+#define LLVM_LIB_TARGET_POWERPC_PPCSUBTARGET_H
 
 #include "PPCFrameLowering.h"
 #include "PPCInstrInfo.h"
 #include "PPCISelLowering.h"
-#include "PPCJITInfo.h"
 #include "PPCSelectionDAGInfo.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/DataLayout.h"
@@ -66,6 +65,12 @@
 
 class PPCSubtarget : public PPCGenSubtargetInfo {
 protected:
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  // Calculates type size & alignment
+  const DataLayout DL;
+
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned StackAlignment;
@@ -83,8 +88,10 @@
   bool UseCRBits;
   bool IsPPC64;
   bool HasAltivec;
+  bool HasSPE;
   bool HasQPX;
   bool HasVSX;
+  bool HasP8Vector;
   bool HasFCPSGN;
   bool HasFSQRT;
   bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
@@ -97,22 +104,23 @@
   bool HasPOPCNTD;
   bool HasLDBRX;
   bool IsBookE;
+  bool HasOnlyMSYNC;
+  bool IsE500;
+  bool IsPPC4xx;
+  bool IsPPC6xx;
   bool DeprecatedMFTB;
   bool DeprecatedDST;
   bool HasLazyResolverStubs;
-  bool IsJITCodeModel;
   bool IsLittleEndian;
 
-  /// TargetTriple - What processor and OS we're targeting.
-  Triple TargetTriple;
-
-  /// OptLevel - What default optimization level we're emitting code for.
-  CodeGenOpt::Level OptLevel;
+  enum {
+    PPC_ABI_UNKNOWN,
+    PPC_ABI_ELFv1,
+    PPC_ABI_ELFv2
+  } TargetABI;
 
   PPCFrameLowering FrameLowering;
-  const DataLayout DL;
   PPCInstrInfo InstrInfo;
-  PPCJITInfo JITInfo;
   PPCTargetLowering TLInfo;
   PPCSelectionDAGInfo TSInfo;
 
@@ -121,17 +129,12 @@
   /// of the specified triple.
   ///
   PPCSubtarget(const std::string &TT, const std::string &CPU,
-               const std::string &FS, PPCTargetMachine &TM, bool is64Bit,
-               CodeGenOpt::Level OptLevel);
+               const std::string &FS, const PPCTargetMachine &TM);
 
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  /// SetJITMode - This is called to inform the subtarget info that we are
-  /// producing code for the JIT.
-  void SetJITMode();
-
   /// getStackAlignment - Returns the minimum alignment known to hold of the
   /// stack frame on entry to the function and which must be maintained by every
   /// function for this subtarget.
@@ -143,24 +146,32 @@
 
   /// getInstrItins - Return the instruction itineraries based on subtarget
   /// selection.
-  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
 
-  const PPCFrameLowering *getFrameLowering() const { return &FrameLowering; }
-  const DataLayout *getDataLayout() const { return &DL; }
-  const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  PPCJITInfo *getJITInfo() { return &JITInfo; }
-  const PPCTargetLowering *getTargetLowering() const { return &TLInfo; }
-  const PPCSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const PPCFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const PPCInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const PPCTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const PPCSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const PPCRegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
 
   /// initializeSubtargetDependencies - Initializes using a CPU and feature string
   /// so that we can use initializer lists for subtarget initialization.
   PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
 
-  /// \brief Reset the features for the PowerPC target.
-  void resetSubtargetFeatures(const MachineFunction *MF) override;
 private:
   void initializeEnvironment();
-  void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+  void initSubtargetFeatures(StringRef CPU, StringRef FS);
 
 public:
   /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
@@ -186,9 +197,6 @@
   bool hasLazyResolverStub(const GlobalValue *GV,
                            const TargetMachine &TM) const;
 
-  // isJITCodeModel - True if we're generating code for the JIT
-  bool isJITCodeModel() const { return IsJITCodeModel; }
-
   // isLittleEndian - True if generating little-endian code
   bool isLittleEndian() const { return IsLittleEndian; }
 
@@ -205,13 +213,19 @@
   bool hasFPRND() const { return HasFPRND; }
   bool hasFPCVT() const { return HasFPCVT; }
   bool hasAltivec() const { return HasAltivec; }
+  bool hasSPE() const { return HasSPE; }
   bool hasQPX() const { return HasQPX; }
   bool hasVSX() const { return HasVSX; }
+  bool hasP8Vector() const { return HasP8Vector; }
   bool hasMFOCRF() const { return HasMFOCRF; }
   bool hasISEL() const { return HasISEL; }
   bool hasPOPCNTD() const { return HasPOPCNTD; }
   bool hasLDBRX() const { return HasLDBRX; }
   bool isBookE() const { return IsBookE; }
+  bool hasOnlyMSYNC() const { return HasOnlyMSYNC; }
+  bool isPPC4xx() const { return IsPPC4xx; }
+  bool isPPC6xx() const { return IsPPC6xx; }
+  bool isE500() const { return IsE500; }
   bool isDeprecatedMFTB() const { return DeprecatedMFTB; }
   bool isDeprecatedDST() const { return DeprecatedDST; }
 
@@ -222,18 +236,22 @@
   /// isBGQ - True if this is a BG/Q platform.
   bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
 
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
   bool isDarwinABI() const { return isDarwin(); }
   bool isSVR4ABI() const { return !isDarwin(); }
-
-  /// enablePostRAScheduler - True at 'More' optimization.
-  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
-                             RegClassVector& CriticalPathRCs) const override;
+  bool isELFv2ABI() const { return TargetABI == PPC_ABI_ELFv2; }
 
   bool enableEarlyIfConversion() const override { return hasISEL(); }
 
   // Scheduling customization.
   bool enableMachineScheduler() const override;
+  // This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+  bool enablePostMachineScheduler() const override;
+  AntiDepBreakMode getAntiDepBreakMode() const override;
+  void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override;
+
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            MachineInstr *begin,
                            MachineInstr *end,

diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 9563b90..f15189c 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp

@@ -12,8 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPCTargetMachine.h"
+#include "PPCTargetObjectFile.h"
 #include "PPC.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
@@ -37,15 +39,54 @@
   RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
 }
 
+static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, StringRef TT) {
+  std::string FullFS = FS;
+  Triple TargetTriple(TT);
+
+  // Make sure 64-bit features are available when CPUname is generic
+  if (TargetTriple.getArch() == Triple::ppc64 ||
+      TargetTriple.getArch() == Triple::ppc64le) {
+    if (!FullFS.empty())
+      FullFS = "+64bit," + FullFS;
+    else
+      FullFS = "+64bit";
+  }
+
+  if (OL >= CodeGenOpt::Default) {
+    if (!FullFS.empty())
+      FullFS = "+crbits," + FullFS;
+    else
+      FullFS = "+crbits";
+  }
+  return FullFS;
+}
+
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  // If it isn't a Mach-O file then it's going to be a Linux ELF
+  // object file.
+  if (TT.isOSDarwin())
+    return make_unique<TargetLoweringObjectFileMachO>();
+
+  return make_unique<PPC64LinuxTargetObjectFile>();
+}
+
+// The feature string here is a little subtle: computeFSAdditions() augments it
+// with (currently) non-function-specific overrides before it is passed to the
+// LLVMTargetMachine constructor, and the Subtarget initializer below then uses
+// the stored result (TargetFS).
 PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU,
                                    StringRef FS, const TargetOptions &Options,
                                    Reloc::Model RM, CodeModel::Model CM,
-                                   CodeGenOpt::Level OL, bool is64Bit)
-    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-      Subtarget(TT, CPU, FS, *this, is64Bit, OL) {
+                                   CodeGenOpt::Level OL)
+    : LLVMTargetMachine(T, TT, CPU, computeFSAdditions(FS, OL, TT), Options, RM,
+                        CM, OL),
+      TLOF(createTLOF(Triple(getTargetTriple()))),
+      Subtarget(TT, CPU, TargetFS, *this) {
   initAsmInfo();
 }
 
+PPCTargetMachine::~PPCTargetMachine() {}
+
 void PPC32TargetMachine::anchor() { }
 
 PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT,
@@ -53,7 +94,7 @@
                                        const TargetOptions &Options,
                                        Reloc::Model RM, CodeModel::Model CM,
                                        CodeGenOpt::Level OL)
-  : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {
+  : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
 }
 
 void PPC64TargetMachine::anchor() { }
@@ -63,9 +104,34 @@
                                        const TargetOptions &Options,
                                        Reloc::Model RM, CodeModel::Model CM,
                                        CodeGenOpt::Level OL)
-  : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {
+  : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
 }
 
+const PPCSubtarget *
+PPCTargetMachine::getSubtargetImpl(const Function &F) const {
+  // Function attributes override the machine-wide CPU and feature defaults.
+  const unsigned FnIdx = AttributeSet::FunctionIndex;
+  Attribute CPUAttr = F.getAttributes().getAttribute(FnIdx, "target-cpu");
+  Attribute FSAttr = F.getAttributes().getAttribute(FnIdx, "target-features");
+
+  std::string CPU = TargetCPU;
+  if (!CPUAttr.hasAttribute(Attribute::None))
+    CPU = CPUAttr.getValueAsString().str();
+  std::string FS = TargetFS;
+  if (!FSAttr.hasAttribute(Attribute::None))
+    FS = FSAttr.getValueAsString().str();
+
+  // One subtarget is cached per distinct CPU + feature-string combination.
+  std::unique_ptr<PPCSubtarget> &Cached = SubtargetMap[CPU + FS];
+  if (Cached)
+    return Cached.get();
+
+  // Options must be reset before construction: subtarget creation depends on
+  // the TM and on the function's code generation flags in TargetOptions.
+  resetTargetOptions(F);
+  Cached = llvm::make_unique<PPCSubtarget>(TargetTriple, CPU, FS, *this);
+  return Cached.get();
+}
 
 //===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
@@ -86,6 +152,7 @@
     return *getPPCTargetMachine().getSubtargetImpl();
   }
 
+  void addIRPasses() override;
   bool addPreISel() override;
   bool addILPOpts() override;
   bool addInstSelector() override;
@@ -99,6 +166,11 @@
   return new PPCPassConfig(this, PM);
 }
 
+void PPCPassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getPPCTargetMachine()));
+  TargetPassConfig::addIRPasses();
+}
+
 bool PPCPassConfig::addPreISel() {
   if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
     addPass(createPPCCTRLoops(getPPCTargetMachine()));
@@ -148,18 +220,6 @@
   return false;
 }
 
-bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
-                                      JITCodeEmitter &JCE) {
-  // Inform the subtarget that we are in JIT mode.  FIXME: does this break macho
-  // writing?
-  Subtarget.SetJITMode();
-
-  // Machine code emitter pass for PowerPC.
-  PM.add(createPPCJITCodeEmitterPass(*this, JCE));
-
-  return false;
-}
-
 void PPCTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
   // Add first the target-independent BasicTTI pass, then our PPC pass. This
   // allows the PPC pass to delegate to the target independent layer when

diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 4c7029c..5095d73 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef PPC_TARGETMACHINE_H
-#define PPC_TARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETMACHINE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETMACHINE_H
 
 #include "PPCInstrInfo.h"
 #include "PPCSubtarget.h"
@@ -24,46 +24,30 @@
 /// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets.
 ///
 class PPCTargetMachine : public LLVMTargetMachine {
-  PPCSubtarget        Subtarget;
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
+  PPCSubtarget Subtarget;
+
+  mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap;
 
 public:
   PPCTargetMachine(const Target &T, StringRef TT,
                    StringRef CPU, StringRef FS, const TargetOptions &Options,
                    Reloc::Model RM, CodeModel::Model CM,
-                   CodeGenOpt::Level OL, bool is64Bit);
+                   CodeGenOpt::Level OL);
 
-  const PPCInstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const PPCFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  PPCJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
-  const PPCTargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const PPCSelectionDAGInfo* getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
-  const PPCRegisterInfo *getRegisterInfo() const override {
-    return &getInstrInfo()->getRegisterInfo();
-  }
+  ~PPCTargetMachine() override;
 
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
-  const PPCSubtarget  *getSubtargetImpl() const override { return &Subtarget; }
-  const InstrItineraryData *getInstrItineraryData() const override {
-    return &getSubtargetImpl()->getInstrItineraryData();
-  }
+  const PPCSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+  const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-  bool addCodeEmitter(PassManagerBase &PM,
-                      JITCodeEmitter &JCE) override;
 
   /// \brief Register PPC analysis passes with a pass manager.
   void addAnalysisPasses(PassManagerBase &PM) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 };
 
 /// PPC32TargetMachine - PowerPC 32-bit target machine.

diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h
index 3e71bbc..cd84da2 100644
--- a/lib/Target/PowerPC/PPCTargetObjectFile.h
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_PPC_TARGETOBJECTFILE_H
-#define LLVM_TARGET_PPC_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"

diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h
index 74b5f45..6493713 100644
--- a/lib/Target/PowerPC/PPCTargetStreamer.h
+++ b/lib/Target/PowerPC/PPCTargetStreamer.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef PPCTARGETSTREAMER_H
-#define PPCTARGETSTREAMER_H
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
 
 #include "llvm/MC/MCStreamer.h"
 
@@ -19,6 +19,8 @@
   virtual ~PPCTargetStreamer();
   virtual void emitTCEntry(const MCSymbol &S) = 0;
   virtual void emitMachine(StringRef CPU) = 0;
+  virtual void emitAbiVersion(int AbiVersion) = 0;
+  virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) = 0;
 };
 }
 

diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 007901b..37624ed 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

@@ -38,6 +38,7 @@
 namespace {
 
 class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
+  const TargetMachine *TM;
   const PPCSubtarget *ST;
   const PPCTargetLowering *TLI;
 
@@ -47,16 +48,16 @@
   }
 
   PPCTTI(const PPCTargetMachine *TM)
-      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
-        TLI(TM->getTargetLowering()) {
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
     initializePPCTTIPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual void initializePass() override {
+  void initializePass() override {
     pushTTIStack(this);
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     TargetTransformInfo::getAnalysisUsage(AU);
   }
 
@@ -64,7 +65,7 @@
   static char ID;
 
   /// Provide necessary pointer adjustments for the two base classes.
-  virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+  void *getAdjustedAnalysisPointer(const void *ID) override {
     if (ID == &TargetTransformInfo::ID)
       return (TargetTransformInfo*)this;
     return this;
@@ -79,33 +80,31 @@
   unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                          Type *Ty) const override;
 
-  virtual PopcntSupportKind
-  getPopcntSupport(unsigned TyWidth) const override;
-  virtual void getUnrollingPreferences(
-    Loop *L, UnrollingPreferences &UP) const override;
+  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+  void getUnrollingPreferences(const Function *F, Loop *L,
+                               UnrollingPreferences &UP) const override;
 
   /// @}
 
   /// \name Vector TTI Implementations
   /// @{
 
-  virtual unsigned getNumberOfRegisters(bool Vector) const override;
-  virtual unsigned getRegisterBitWidth(bool Vector) const override;
-  virtual unsigned getMaximumUnrollFactor() const override;
-  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                          OperandValueKind,
-                                          OperandValueKind) const override;
-  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
-                                  int Index, Type *SubTp) const override;
-  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                                    Type *Src) const override;
-  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                      Type *CondTy) const override;
-  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                                      unsigned Index) const override;
-  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
-                                   unsigned Alignment,
-                                   unsigned AddressSpace) const override;
+  unsigned getNumberOfRegisters(bool Vector) const override;
+  unsigned getRegisterBitWidth(bool Vector) const override;
+  unsigned getMaxInterleaveFactor() const override;
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
+                                  OperandValueKind, OperandValueProperties,
+                                  OperandValueProperties) const override;
+  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                          int Index, Type *SubTp) const override;
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                            Type *Src) const override;
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                              Type *CondTy) const override;
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                              unsigned Index) const override;
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const override;
 
   /// @}
 };
@@ -271,8 +270,9 @@
   return PPCTTI::getIntImmCost(Imm, Ty);
 }
 
-void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
-  if (ST->getDarwinDirective() == PPC::DIR_A2) {
+void PPCTTI::getUnrollingPreferences(const Function *F, Loop *L,
+                                     UnrollingPreferences &UP) const {
+  if (TM->getSubtarget<PPCSubtarget>(F).getDarwinDirective() == PPC::DIR_A2) {
     // The A2 is in-order with a deep pipeline, and concatenation unrolling
     // helps expose latency-hiding opportunities to the instruction scheduler.
     UP.Partial = UP.Runtime = true;
@@ -297,7 +297,7 @@
 
 }
 
-unsigned PPCTTI::getMaximumUnrollFactor() const {
+unsigned PPCTTI::getMaxInterleaveFactor() const {
   unsigned Directive = ST->getDarwinDirective();
   // The 440 has no SIMD support, but floating-point instructions
   // have a 5-cycle latency, so unroll by 5x for latency hiding.
@@ -318,14 +318,15 @@
   return 2;
 }
 
-unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                        OperandValueKind Op1Info,
-                                        OperandValueKind Op2Info) const {
+unsigned PPCTTI::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
+    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
+    OperandValueProperties Opd2PropInfo) const {
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
   // Fallback to the default implementation.
-  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
-                                                     Op2Info);
+  return TargetTransformInfo::getArithmeticInstrCost(
+      Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo);
 }
 
 unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,

diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 713fc4b..261075e 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h

@@ -8,8 +8,8 @@
 /// \file
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPU_H
-#define AMDGPU_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H
+#define LLVM_LIB_TARGET_R600_AMDGPU_H
 
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetMachine.h"
@@ -39,6 +39,8 @@
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIShrinkInstructionsPass();
+FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
 FunctionPass *createSIFixSGPRLiveRangesPass();
@@ -48,10 +50,14 @@
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
+void initializeSILoadStoreOptimizerPass(PassRegistry &);
+extern char &SILoadStoreOptimizerID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
+ModulePass *createAMDGPUAlwaysInlinePass();
 
 /// \brief Creates an AMDGPU-specific Target Transformation Info pass.
 ImmutablePass *
@@ -63,6 +69,14 @@
 
 extern Target TheAMDGPUTarget;
 
+namespace AMDGPU {
+enum TargetIndex {
+  TI_CONSTDATA_START
+};
+}
+
+#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel"
+
 } // End namespace llvm
 
 namespace ShaderType {
@@ -118,4 +132,4 @@
 
 } // namespace AMDGPUAS
 
-#endif // AMDGPU_H
+#endif

diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 6ff9ab7..4cf1243 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td

@@ -25,6 +25,11 @@
         "false",
         "Disable IR Structurizer">;
 
+def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
+        "EnablePromoteAlloca",
+        "true",
+        "Enable promote alloca pass">;
+
 // Target features
 
 def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
@@ -37,6 +42,20 @@
         "true",
         "Enable double precision operations">;
 
+def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
+        "FP64Denormals",
+        "true",
+        "Enable double precision denormal handling",
+        [FeatureFP64]>;
+
+// Some instructions do not support denormals despite this flag. Using
+// fp32 denormals also causes instructions to run at the double
+// precision rate for the device.
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
+        "FP32Denormals",
+        "true",
+        "Enable single precision denormal handling">;
+
 def Feature64BitPtr : SubtargetFeature<"64BitPtr",
         "Is64bit",
         "true",
@@ -62,6 +81,17 @@
         "true",
         "GPU has CF_ALU bug">;
 
+// XXX - This should probably be removed once enabled by default
+def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt",
+        "EnableLoadStoreOpt",
+        "true",
+        "Enable SI load/store optimizer pass">;
+
+def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
+        "FlatAddressSpace",
+        "true",
+        "Support flat address space">;
+
 class SubtargetFeatureFetchLimit <string Value> :
                           SubtargetFeature <"fetch"#Value,
         "TexVTXClauseSize",
@@ -111,19 +141,28 @@
 >;
 
 def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
-        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>;
+        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768,
+         FeatureWavefrontSize64]>;
 
 def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
-        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>;
+        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
+         FeatureWavefrontSize64, FeatureFlatAddressSpace]>;
 //===----------------------------------------------------------------------===//
 
 def AMDGPUInstrInfo : InstrInfo {
   let guessInstructionProperties = 1;
 }
 
+def AMDGPUAsmParser : AsmParser {
+  // Some of the R600 registers have the same name, so this crashes.
+  // For example T0_XYZW and T0_XY both have the asm name T0.
+  let ShouldEmitMatchRegisterName = 0;
+}
+
 def AMDGPU : Target {
   // Pull in Instruction Info:
   let InstructionSet = AMDGPUInstrInfo;
+  let AssemblyParsers = [AMDGPUAsmParser];
 }
 
 // Dummy Instruction itineraries for pseudo instructions

diff --git a/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp
new file mode 100644
index 0000000..b545b45
--- /dev/null
+++ b/lib/Target/R600/AMDGPUAlwaysInlinePass.cpp

@@ -0,0 +1,66 @@
+//===-- AMDGPUAlwaysInlinePass.cpp - Inline Functions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass marks all internal functions as always_inline and creates
+/// duplicates of all other functions and marks the duplicates as always_inline.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAlwaysInline : public ModulePass {
+
+  static char ID;
+
+public:
+  AMDGPUAlwaysInline() : ModulePass(ID) { }
+  bool runOnModule(Module &M) override;
+  const char *getPassName() const override { return "AMDGPU Always Inline Pass"; }
+};
+
+} // End anonymous namespace
+
+char AMDGPUAlwaysInline::ID = 0;
+
+bool AMDGPUAlwaysInline::runOnModule(Module &M) {
+  // A ModulePass must return true whenever it modifies the module.
+  bool Changed = false;
+
+  // Collect first: cloning appends to the function list being walked.
+  std::vector<Function*> FuncsToClone;
+  for (Function &F : M)
+    if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty())
+      FuncsToClone.push_back(&F);
+  // Redirect all uses of each collected function to an internal clone.
+  for (Function *F : FuncsToClone) {
+    ValueToValueMapTy VMap;
+    Function *NewFunc = CloneFunction(F, VMap, false);
+    NewFunc->setLinkage(GlobalValue::InternalLinkage);
+    F->getParent()->getFunctionList().push_back(NewFunc);
+    F->replaceAllUsesWith(NewFunc);
+  }
+  // Mark every local-linkage function, clones included, always_inline.
+  for (Function &F : M)
+    if (F.hasLocalLinkage()) {
+      F.addFnAttr(Attribute::AlwaysInline);
+      Changed = true;
+    }
+  return Changed;
+}
+
+ModulePass *llvm::createAMDGPUAlwaysInlinePass() {
+  return new AMDGPUAlwaysInline();
+}

diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index a6e217b..5511d7c 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp

@@ -16,7 +16,6 @@
 //===----------------------------------------------------------------------===//
 //
 
-
 #include "AMDGPUAsmPrinter.h"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
@@ -26,6 +25,7 @@
 #include "SIDefines.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
@@ -48,11 +48,28 @@
 // precision, and leaves single precision to flush all and does not report
 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
 // CL_FP_DENORM for both.
-static uint32_t getFPMode(MachineFunction &) {
+//
+// FIXME: It seems some instructions do not support single precision denormals
+// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
+// and sin_f32, cos_f32 on most parts).
+
+// We want to use these instructions, and using fp32 denormals also causes
+// instructions to run at the double precision rate for the device so it's
+// probably best to just report no single precision denormals.
+static uint32_t getFPMode(const MachineFunction &F) {
+  const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>();
+  // TODO: Is there any real use for the flush in only / flush out only modes?
+
+  uint32_t FP32Denormals =
+    ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+  uint32_t FP64Denormals =
+    ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
-         FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) |
-         FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE);
+         FP_DENORM_MODE_SP(FP32Denormals) |
+         FP_DENORM_MODE_DP(FP64Denormals);
 }
 
 static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
@@ -69,10 +86,24 @@
   DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
 }
 
+void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  // Switch to the text section and emit the END_OF_TEXT_LABEL_NAME symbol
+  // there, marking the end of .text.
+  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
+  StringRef LabelName(END_OF_TEXT_LABEL_NAME);
+  MCSymbol *TextEnd = OutContext.GetOrCreateSymbol(LabelName);
+  OutStreamer.EmitLabel(TextEnd);
+}
+
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+
+  // The starting address of all shader programs must be 256 bytes aligned.
+  MF.setAlignment(8);
+
   SetupMachineFunction(MF);
 
-  OutStreamer.emitRawComment(Twine('@') + MF.getName() + Twine(':'));
+  EmitFunctionHeader();
 
   MCContext &Context = getObjFileLowering().getContext();
   const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
@@ -115,6 +146,8 @@
                                  false);
       OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
                                  false);
+      OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
+                                 false);
     } else {
       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
       OutStreamer.emitRawComment(
@@ -145,25 +178,21 @@
   return false;
 }
 
-void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
+void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
   unsigned MaxGPR = 0;
   bool killPixel = false;
-  const R600RegisterInfo * RI =
-                static_cast<const R600RegisterInfo*>(TM.getRegisterInfo());
-  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+  const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
+  const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-                                                    I != E; ++I) {
-      MachineInstr &MI = *I;
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
       if (MI.getOpcode() == AMDGPU::KILLGT)
         killPixel = true;
       unsigned numOperands = MI.getNumOperands();
       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
-        MachineOperand & MO = MI.getOperand(op_idx);
+        const MachineOperand &MO = MI.getOperand(op_idx);
         if (!MO.isReg())
           continue;
         unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
@@ -179,7 +208,7 @@
   unsigned RsrcReg;
   if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
     // Evergreen / Northern Islands
-    switch (MFI->ShaderType) {
+    switch (MFI->getShaderType()) {
     default: // Fall through
     case ShaderType::COMPUTE:  RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
     case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
@@ -188,7 +217,7 @@
     }
   } else {
     // R600 / R700
-    switch (MFI->ShaderType) {
+    switch (MFI->getShaderType()) {
     default: // Fall through
     case ShaderType::GEOMETRY: // Fall through
     case ShaderType::COMPUTE:  // Fall through
@@ -203,34 +232,30 @@
   OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
   OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
 
-  if (MFI->ShaderType == ShaderType::COMPUTE) {
+  if (MFI->getShaderType() == ShaderType::COMPUTE) {
     OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
     OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
   }
 }
 
 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
-                                        MachineFunction &MF) const {
+                                        const MachineFunction &MF) const {
   uint64_t CodeSize = 0;
   unsigned MaxSGPR = 0;
   unsigned MaxVGPR = 0;
   bool VCCUsed = false;
-  const SIRegisterInfo * RI =
-                static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+  bool FlatUsed = false;
+  const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
 
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-                                                    I != E; ++I) {
-      MachineInstr &MI = *I;
-
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
       // TODO: CodeSize should account for multiple functions.
       CodeSize += MI.getDesc().Size;
 
       unsigned numOperands = MI.getNumOperands();
       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
-        MachineOperand &MO = MI.getOperand(op_idx);
+        const MachineOperand &MO = MI.getOperand(op_idx);
         unsigned width = 0;
         bool isSGPR = false;
 
@@ -242,6 +267,11 @@
 	    reg == AMDGPU::VCC_HI) {
           VCCUsed = true;
           continue;
+        } else if (reg == AMDGPU::FLAT_SCR ||
+                   reg == AMDGPU::FLAT_SCR_LO ||
+                   reg == AMDGPU::FLAT_SCR_HI) {
+          FlatUsed = true;
+          continue;
         }
 
         switch (reg) {
@@ -302,8 +332,13 @@
   if (VCCUsed)
     MaxSGPR += 2;
 
-  ProgInfo.NumVGPR = MaxVGPR;
-  ProgInfo.NumSGPR = MaxSGPR;
+  if (FlatUsed)
+    MaxSGPR += 2;
+
+  // We found the maximum register index. They start at 0, so add one to get the
+  // number of registers.
+  ProgInfo.NumVGPR = MaxVGPR + 1;
+  ProgInfo.NumSGPR = MaxSGPR + 1;
 
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
   // register.
@@ -315,16 +350,21 @@
   // Do not clamp NAN to 0.
   ProgInfo.DX10Clamp = 0;
 
+  const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+  ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
+
+  ProgInfo.FlatUsed = FlatUsed;
+  ProgInfo.VCCUsed = VCCUsed;
   ProgInfo.CodeLen = CodeSize;
 }
 
-void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
+void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                          const SIProgramInfo &KernelInfo) {
   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   unsigned RsrcReg;
-  switch (MFI->ShaderType) {
+  switch (MFI->getShaderType()) {
   default: // Fall through
   case ShaderType::COMPUTE:  RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
   case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
@@ -341,15 +381,31 @@
     LDSAlignShift = 9;
   }
 
-  unsigned LDSBlocks =
-    RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+  unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
+                          MFI->getMaximumWorkGroupSize(MF);
 
-  if (MFI->ShaderType == ShaderType::COMPUTE) {
+  unsigned LDSBlocks =
+     RoundUpToAlignment(MFI->LDSSize + LDSSpillSize,
+	                      1 << LDSAlignShift) >> LDSAlignShift;
+
+  // Scratch is allocated in 256 dword blocks.
+  unsigned ScratchAlignShift = 10;
+  // We need to program the hardware with the amount of scratch memory that
+  // is used by the entire wave.  KernelInfo.ScratchSize is the amount of
+  // scratch memory used per thread.
+  unsigned ScratchBlocks =
+    RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
+                       1 << ScratchAlignShift) >> ScratchAlignShift;
+
+  unsigned VGPRBlocks = (KernelInfo.NumVGPR - 1) / 4;
+  unsigned SGPRBlocks = (KernelInfo.NumSGPR - 1) / 8;
+
+  if (MFI->getShaderType() == ShaderType::COMPUTE) {
     OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
 
     const uint32_t ComputePGMRSrc1 =
-      S_00B848_VGPRS(KernelInfo.NumVGPR / 4) |
-      S_00B848_SGPRS(KernelInfo.NumSGPR / 8) |
+      S_00B848_VGPRS(VGPRBlocks) |
+      S_00B848_SGPRS(SGPRBlocks) |
       S_00B848_PRIORITY(KernelInfo.Priority) |
       S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
       S_00B848_PRIV(KernelInfo.Priv) |
@@ -360,14 +416,24 @@
     OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
 
     OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
-    OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
+    const uint32_t ComputePGMRSrc2 =
+      S_00B84C_LDS_SIZE(LDSBlocks) |
+      S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
+
+    OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
+
+    OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
+    OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
+
+    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
+    // 0" comment but I don't see a corresponding field in the register spec.
   } else {
     OutStreamer.EmitIntValue(RsrcReg, 4);
-    OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
-                             S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
+    OutStreamer.EmitIntValue(S_00B028_VGPRS(VGPRBlocks) |
+                             S_00B028_SGPRS(SGPRBlocks), 4);
   }
 
-  if (MFI->ShaderType == ShaderType::PIXEL) {
+  if (MFI->getShaderType() == ShaderType::PIXEL) {
     OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
     OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
     OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);

diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index c1acb6e..b9a0767 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h

@@ -12,11 +12,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPU_ASMPRINTER_H
-#define AMDGPU_ASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
+#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H
 
 #include "llvm/CodeGen/AsmPrinter.h"
-#include <string>
 #include <vector>
 
 namespace llvm {
@@ -33,6 +32,9 @@
       DX10Clamp(0),
       DebugMode(0),
       IEEEMode(0),
+      ScratchSize(0),
+      FlatUsed(false),
+      VCCUsed(false),
       CodeLen(0) {}
 
     // Fields set in PGM_RSRC1 pm4 packet.
@@ -44,20 +46,24 @@
     uint32_t DX10Clamp;
     uint32_t DebugMode;
     uint32_t IEEEMode;
+    uint32_t ScratchSize;
+
+    bool FlatUsed;
 
     // Bonus information for debugging.
+    bool VCCUsed;
     uint64_t CodeLen;
   };
 
-  void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const;
-  void findNumUsedRegistersSI(MachineFunction &MF,
+  void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+  void findNumUsedRegistersSI(const MachineFunction &MF,
                               unsigned &NumSGPR,
                               unsigned &NumVGPR) const;
 
   /// \brief Emit register usage information so that the GPU driver
   /// can correctly setup the GPU state.
-  void EmitProgramInfoR600(MachineFunction &MF);
-  void EmitProgramInfoSI(MachineFunction &MF, const SIProgramInfo &KernelInfo);
+  void EmitProgramInfoR600(const MachineFunction &MF);
+  void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
 
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
@@ -71,6 +77,8 @@
   /// Implemented in AMDGPUMCInstLower.cpp
   void EmitInstruction(const MachineInstr *MI) override;
 
+  void EmitEndOfAsmFile(Module &M) override;
+
 protected:
   bool DisasmEnabled;
   std::vector<std::string> DisasmLines, HexLines;
@@ -79,4 +87,4 @@
 
 } // End namespace llvm
 
-#endif //AMDGPU_ASMPRINTER_H
+#endif

diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 5f8ad8c..6ffa7a0 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td

@@ -59,16 +59,24 @@
 ]>;
 
+// Top-level calling convention. Each CCIf tests the subtarget generation
+// (and, for the first two entries, the function's shader type) and delegates
+// to the convention named in its CCDelegateTo.
 def CC_AMDGPU : CallingConv<[
-  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() >= "
-       "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
-       "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"#
-       "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
-  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() < "
-       "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
-       "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()->"
-       "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
-  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
-       ".getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>,
-  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
-       ".getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_R600>>
+  // Generation >= SOUTHERN_ISLANDS and a COMPUTE shader (read from
+  // SIMachineFunctionInfo) -> CC_AMDGPU_Kernel.
+  CCIf<"static_cast<const AMDGPUSubtarget&>"
+        "(State.getMachineFunction().getSubtarget()).getGeneration() >="
+          "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+        "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()"
+         "->getShaderType() == ShaderType::COMPUTE",
+       CCDelegateTo<CC_AMDGPU_Kernel>>,
+  // Generation < SOUTHERN_ISLANDS and a COMPUTE shader (read from
+  // R600MachineFunctionInfo) -> CC_AMDGPU_Kernel.
+  CCIf<"static_cast<const AMDGPUSubtarget&>"
+        "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+          "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+         "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()"
+          "->getShaderType() == ShaderType::COMPUTE",
+        CCDelegateTo<CC_AMDGPU_Kernel>>,
+  // Any other function on generation >= SOUTHERN_ISLANDS -> CC_SI.
+   CCIf<"static_cast<const AMDGPUSubtarget&>"
+         "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
+           "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+        CCDelegateTo<CC_SI>>,
+  // Any other function on generation < SOUTHERN_ISLANDS -> CC_R600.
+   CCIf<"static_cast<const AMDGPUSubtarget&>"
+          "(State.getMachineFunction().getSubtarget()).getGeneration() < "
+            "AMDGPUSubtarget::SOUTHERN_ISLANDS",
+        CCDelegateTo<CC_R600>>
 ]>;

diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h
index d18ede5..15a6636 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.h
+++ b/lib/Target/R600/AMDGPUFrameLowering.h

@@ -12,8 +12,8 @@
 /// machine.
 //
 //===----------------------------------------------------------------------===//
-#ifndef AMDILFRAME_LOWERING_H
-#define AMDILFRAME_LOWERING_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
+#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H
 
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -42,4 +42,4 @@
   bool hasFP(const MachineFunction &MF) const override;
 };
 } // namespace llvm
-#endif // AMDILFRAME_LOWERING_H
+#endif

diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index b4d79e5..90b6672 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp

@@ -16,9 +16,13 @@
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "R600InstrInfo.h"
+#include "SIDefines.h"
 #include "SIISelLowering.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Function.h"
@@ -61,6 +65,7 @@
   static bool checkPrivateAddress(const MachineMemOperand *Op);
 
   static bool isGlobalStore(const StoreSDNode *N);
+  static bool isFlatStore(const StoreSDNode *N);
   static bool isPrivateStore(const StoreSDNode *N);
   static bool isLocalStore(const StoreSDNode *N);
   static bool isRegionStore(const StoreSDNode *N);
@@ -68,24 +73,46 @@
   bool isCPLoad(const LoadSDNode *N) const;
   bool isConstantLoad(const LoadSDNode *N, int cbID) const;
   bool isGlobalLoad(const LoadSDNode *N) const;
+  bool isFlatLoad(const LoadSDNode *N) const;
   bool isParamLoad(const LoadSDNode *N) const;
   bool isPrivateLoad(const LoadSDNode *N) const;
   bool isLocalLoad(const LoadSDNode *N) const;
   bool isRegionLoad(const LoadSDNode *N) const;
 
-  /// \returns True if the current basic block being selected is at control
-  ///          flow depth 0.  Meaning that the current block dominates the
-  //           exit block.
-  bool isCFDepth0() const;
-
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
   bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
                                        SDValue& Offset);
   bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr, SDValue &Offset,
-                        SDValue &ImmOffset) const;
+  bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+                       unsigned OffsetBits) const;
+  bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
+  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
+                                 SDValue &Offset1) const;
+  void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
+                   SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
+                   SDValue &TFE) const;
+  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+                         SDValue &Offset) const;
+  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+                         SDValue &VAddr, SDValue &Offset,
+                         SDValue &SLC) const;
+  bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+                          SDValue &SOffset, SDValue &ImmOffset) const;
+  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
+                         SDValue &Offset, SDValue &GLC, SDValue &SLC,
+                         SDValue &TFE) const;
+  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+                         SDValue &Offset, SDValue &GLC) const;
+  SDNode *SelectAddrSpaceCast(SDNode *N);
+  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                       SDValue &Clamp, SDValue &Omod) const;
+
+  bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
+                            SDValue &Omod) const;
 
   SDNode *SelectADD_SUB_I64(SDNode *N);
   SDNode *SelectDIV_SCALE(SDNode *N);
@@ -125,7 +152,8 @@
 
   switch (N->getMachineOpcode()) {
   default: {
-    const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode());
+    const MCInstrDesc &Desc =
+        TM.getSubtargetImpl()->getInstrInfo()->get(N->getMachineOpcode());
     unsigned OpIdx = Desc.getNumDefs() + OpNo;
     if (OpIdx >= Desc.getNumOperands())
       return nullptr;
@@ -133,15 +161,17 @@
     if (RegClass == -1)
       return nullptr;
 
-    return TM.getRegisterInfo()->getRegClass(RegClass);
+    return TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RegClass);
   }
   case AMDGPU::REG_SEQUENCE: {
     unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
-    const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID);
+    const TargetRegisterClass *SuperRC =
+        TM.getSubtargetImpl()->getRegisterInfo()->getRegClass(RCID);
 
     SDValue SubRegOp = N->getOperand(OpNo + 1);
     unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
-    return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx);
+    return TM.getSubtargetImpl()->getRegisterInfo()->getSubClassWithSubReg(
+        SuperRC, SubRegIdx);
   }
   }
 }
@@ -229,10 +259,10 @@
   case AMDGPUISD::BUILD_VERTICAL_VECTOR:
   case ISD::BUILD_VECTOR: {
     unsigned RegClassID;
-    const AMDGPURegisterInfo *TRI =
-                   static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
-    const SIRegisterInfo *SIRI =
-                   static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+    const AMDGPURegisterInfo *TRI = static_cast<const AMDGPURegisterInfo *>(
+        TM.getSubtargetImpl()->getRegisterInfo());
+    const SIRegisterInfo *SIRI = static_cast<const SIRegisterInfo *>(
+        TM.getSubtargetImpl()->getRegisterInfo());
     EVT VT = N->getValueType(0);
     unsigned NumVectorElts = VT.getVectorNumElements();
     EVT EltVT = VT.getVectorElementType();
@@ -460,7 +490,16 @@
   case AMDGPUISD::DIV_SCALE: {
     return SelectDIV_SCALE(N);
   }
+  case ISD::CopyToReg: {
+    const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
+    Lowering.legalizeTargetIndependentNode(N, *CurDAG);
+    break;
   }
+  case ISD::ADDRSPACECAST:
+    return SelectAddrSpaceCast(N);
+  }
+
   return SelectCode(N);
 }
 
@@ -498,6 +537,10 @@
   return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
 }
 
+// Returns the result of checkType() for the store's memory operand value
+// against AMDGPUAS::FLAT_ADDRESS.
+bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) {
+  const Value *StoredTo = N->getMemOperand()->getValue();
+  return checkType(StoredTo, AMDGPUAS::FLAT_ADDRESS);
+}
+
 bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
   return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
 }
@@ -529,6 +572,10 @@
   return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
 }
 
+// Returns the result of checkType() for the load's memory operand value
+// against AMDGPUAS::FLAT_ADDRESS.
+bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const {
+  const Value *LoadedFrom = N->getMemOperand()->getValue();
+  return checkType(LoadedFrom, AMDGPUAS::FLAT_ADDRESS);
+}
+
 bool AMDGPUDAGToDAGISel::isRegionLoad(const  LoadSDNode *N) const {
   return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
 }
@@ -558,23 +605,16 @@
   const Value *MemVal = N->getMemOperand()->getValue();
   if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
       !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) &&
       !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
       !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
       !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
-      !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){
+      !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) {
     return true;
   }
   return false;
 }
 
-bool AMDGPUDAGToDAGISel::isCFDepth0() const {
-  // FIXME: Figure out a way to use DominatorTree analysis here.
-  const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock();
-  const Function *Fn = FuncInfo->Fn;
-  return &Fn->front() == CurBlock || &Fn->back() == CurBlock;
-}
-
-
 const char *AMDGPUDAGToDAGISel::getPassName() const {
   return "AMDGPU DAG->DAG Pattern Instruction Selection";
 }
@@ -677,14 +717,9 @@
   SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
 
 
-  unsigned Opc = IsAdd ? AMDGPU::S_ADD_I32 : AMDGPU::S_SUB_I32;
+  unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
 
-  if (!isCFDepth0()) {
-    Opc = IsAdd ? AMDGPU::V_ADD_I32_e32 : AMDGPU::V_SUB_I32_e32;
-    CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e32 : AMDGPU::V_SUBB_U32_e32;
-  }
-
   SDNode *AddLo = CurDAG->getMachineNode( Opc, DL, VTList, AddLoArgs);
   SDValue Carry(AddLo, 1);
   SDNode *AddHi
@@ -711,71 +746,401 @@
     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
 
   const SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
-
+  const SDValue False = CurDAG->getTargetConstant(0, MVT::i1);
   SDValue Ops[] = {
-    N->getOperand(0),
-    N->getOperand(1),
-    N->getOperand(2),
-    Zero,
-    Zero,
-    Zero,
-    Zero
+    Zero,             // src0_modifiers
+    N->getOperand(0), // src0
+    Zero,             // src1_modifiers
+    N->getOperand(1), // src1
+    Zero,             // src2_modifiers
+    N->getOperand(2), // src2
+    False,            // clamp
+    Zero              // omod
   };
 
   return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
 }
 
-static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) {
-  return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32,
-                                     Ptr), 0);
+/// Returns true if \p Offset can be paired with \p Base as an immediate
+/// offset that is \p OffsetBits wide.
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+                                         unsigned OffsetBits) const {
+  // Only 16-bit and 8-bit widths are range checked; any other width skips
+  // the range check entirely.
+  switch (OffsetBits) {
+  case 16:
+    if (!isUInt<16>(Offset))
+      return false;
+    break;
+  case 8:
+    if (!isUInt<8>(Offset))
+      return false;
+    break;
+  default:
+    break;
+  }
+
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+  const bool OlderThanSeaIslands =
+      ST.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS;
+
+  // On Southern Islands, instructions with a negative base value and an
+  // offset don't seem to work, so there the base's sign bit must be known
+  // to be zero. Newer generations accept any base.
+  return !OlderThanSeaIslands || CurDAG->SignBitIsZero(Base);
 }
 
-bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &Ptr,
-                                           SDValue &Offset,
-                                           SDValue &ImmOffset) const {
+/// Splits \p Addr into a \p Base and an \p Offset operand. Every path
+/// returns true.
+bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
+                                              SDValue &Offset) const {
+  // (add n0, c0): use the constant operand as the offset when
+  // isDSOffsetLegal() accepts it for a 16-bit field.
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue BaseOp = Addr.getOperand(0);
+    SDValue ImmOp = Addr.getOperand(1);
+    ConstantSDNode *Imm = cast<ConstantSDNode>(ImmOp);
+    if (isDSOffsetLegal(BaseOp, Imm->getSExtValue(), 16)) {
+      Base = BaseOp;
+      Offset = ImmOp;
+      return true;
+    }
+  }
+
+  // For a constant address, prefer to place the constant in the offset and
+  // use a zero base register. Several operations can then share that zero
+  // base, which saves moves for materializing the address and enables merging
+  // into read2 / write2 instructions.
+  const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
+  if (CAddr && isUInt<16>(CAddr->getZExtValue())) {
+    SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
+    MachineSDNode *ZeroBase = CurDAG->getMachineNode(
+        AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, Zero);
+    Base = SDValue(ZeroBase, 0);
+    Offset = Addr;
+    return true;
+  }
+
+  // Fallback: the whole address is the base and the offset is zero.
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i16);
+  return true;
+}
+
+/// Splits \p Addr into a \p Base plus two consecutive dword offsets
+/// (\p Offset0 and \p Offset1 = Offset0 + 1), each emitted as an i8 target
+/// constant. Every path returns true.
+bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
+                                                   SDValue &Offset0,
+                                                   SDValue &Offset1) const {
+  // (add n0, c0): scale the byte offset down to dwords and accept it when the
+  // second (larger) offset is legal for an 8-bit field.
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue BaseOp = Addr.getOperand(0);
+    ConstantSDNode *ByteOffset = cast<ConstantSDNode>(Addr.getOperand(1));
+    unsigned DWordOffset0 = ByteOffset->getZExtValue() / 4;
+    unsigned DWordOffset1 = DWordOffset0 + 1;
+    if (isDSOffsetLegal(BaseOp, DWordOffset1, 8)) {
+      Base = BaseOp;
+      Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8);
+      return true;
+    }
+  }
+
+  // Constant address: use a zero base register and put the whole address,
+  // in dwords, into the two offsets when both fit in 8 bits.
+  const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
+  if (CAddr) {
+    unsigned DWordOffset0 = CAddr->getZExtValue() / 4;
+    unsigned DWordOffset1 = DWordOffset0 + 1;
+    // The constant address is expected to be a multiple of four bytes.
+    assert(4 * DWordOffset0 == CAddr->getZExtValue());
+
+    if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) {
+      SDValue Zero = CurDAG->getTargetConstant(0, MVT::i32);
+      MachineSDNode *ZeroBase = CurDAG->getMachineNode(
+          AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, Zero);
+      Base = SDValue(ZeroBase, 0);
+      Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(DWordOffset1, MVT::i8);
+      return true;
+    }
+  }
+
+  // Fallback: the whole address is the base; offsets are dwords 0 and 1.
+  Base = Addr;
+  Offset0 = CurDAG->getTargetConstant(0, MVT::i8);
+  Offset1 = CurDAG->getTargetConstant(1, MVT::i8);
+  return true;
+}
+
+/// Returns true if the zero-extended value of \p Imm fits in 12 unsigned bits.
+static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
+  const uint64_t ImmValue = Imm->getZExtValue();
+  return isUInt<12>(ImmValue);
+}
+
+/// Decomposes \p Addr into the MUBUF operand set. Every output is assigned on
+/// every path. \p Addr64 is set to 1 for the "addr64" forms (where \p VAddr
+/// holds one addend of an add) and left at 0 for the "offset" forms (where
+/// \p VAddr is a zero constant). \p Ptr receives the remaining pointer value
+/// and \p Offset an i16 immediate.
+void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+                                     SDValue &VAddr, SDValue &SOffset,
+                                     SDValue &Offset, SDValue &Offen,
+                                     SDValue &Idxen, SDValue &Addr64,
+                                     SDValue &GLC, SDValue &SLC,
+                                     SDValue &TFE) const {
+  // NOTE(review): DL appears to be unused in the new body now that the
+  // wrapAddr64Rsrc() calls are removed from this function - confirm.
   SDLoc DL(Addr);
 
+  // These flag operands are always zero here.
+  GLC = CurDAG->getTargetConstant(0, MVT::i1);
+  SLC = CurDAG->getTargetConstant(0, MVT::i1);
+  TFE = CurDAG->getTargetConstant(0, MVT::i1);
+
+  // Defaults; only Addr64 is overwritten below (in the addr64 forms).
+  Idxen = CurDAG->getTargetConstant(0, MVT::i1);
+  Offen = CurDAG->getTargetConstant(0, MVT::i1);
+  Addr64 = CurDAG->getTargetConstant(0, MVT::i1);
+  SOffset = CurDAG->getTargetConstant(0, MVT::i32);
+
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
 
-    if (isUInt<12>(C1->getZExtValue())) {
+    // The constant is folded into Offset only when it fits the immediate.
+    if (isLegalMUBUFImmOffset(C1)) {
 
       if (N0.getOpcode() == ISD::ADD) {
-        // (add (add N2, N3), C1)
+        // (add (add N2, N3), C1) -> addr64
         SDValue N2 = N0.getOperand(0);
         SDValue N3 = N0.getOperand(1);
-        Ptr = wrapAddr64Rsrc(CurDAG, DL, N2);
-        Offset = N3;
-        ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
-        return true;
+        Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
+        Ptr = N2;
+        VAddr = N3;
+        Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+        return;
       }
 
-      // (add N0, C1)
-      Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getTargetConstant(0, MVT::i64));;
-      Offset = N0;
+      // (add N0, C1) -> offset
+      VAddr = CurDAG->getTargetConstant(0, MVT::i32);
+      Ptr = N0;
+      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
+      return;
+    }
+  }
+  // Reached for any add not handled above, including a base-plus-constant
+  // whose constant does not fit the immediate field.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // (add N0, N1) -> addr64
+    SDValue N0 = Addr.getOperand(0);
+    SDValue N1 = Addr.getOperand(1);
+    Addr64 = CurDAG->getTargetConstant(1, MVT::i1);
+    Ptr = N0;
+    VAddr = N1;
+    Offset = CurDAG->getTargetConstant(0, MVT::i16);
+    return;
+  }
+
+  // default case -> offset
+  VAddr = CurDAG->getTargetConstant(0, MVT::i32);
+  Ptr = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i16);
+
+}
+
+/// Matches \p Addr only when SelectMUBUF() classifies it as an addr64 form.
+/// On success \p SRsrc is the pointer wrapped by wrapAddr64Rsrc(), and
+/// \p VAddr / \p Offset are the values SelectMUBUF() produced.
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &VAddr,
+                                           SDValue &Offset) const {
+  // Outputs of SelectMUBUF() that this selector does not hand back.
+  SDValue Ptr, SOffset, Offen, Idxen, Addr64, GLC, SLC, TFE;
+  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+              GLC, SLC, TFE);
+
+  // Reject everything that is not an addr64 form.
+  if (!cast<ConstantSDNode>(Addr64)->getSExtValue())
+    return false;
+
+  SDLoc DL(Addr);
+  const SITargetLowering *SILowering =
+      static_cast<const SITargetLowering *>(getTargetLowering());
+  SRsrc = SDValue(SILowering->wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
+  return true;
+}
+
+/// Overload that also produces \p SLC, which is always the i1 constant 0;
+/// the matching itself is done by the four-output overload.
+bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &VAddr, SDValue &Offset,
+                                           SDValue &SLC) const {
+  const SDValue NoSLC = CurDAG->getTargetConstant(0, MVT::i1);
+  SLC = NoSLC;
+
+  return SelectMUBUFAddr64(Addr, SRsrc, VAddr, Offset);
+}
+
+/// Builds the MUBUF operands for a scratch access. \p Rsrc is built from the
+/// preloaded SCRATCH_PTR register via buildScratchRSRC(), \p SOffset is a copy
+/// from the preloaded SCRATCH_WAVE_OFFSET register, and \p Addr is split into
+/// \p VAddr and \p ImmOffset. Every path returns true.
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
+                                            SDValue &VAddr, SDValue &SOffset,
+                                            SDValue &ImmOffset) const {
+
+  SDLoc DL(Addr);
+  MachineFunction &MF = CurDAG->getMachineFunction();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SITargetLowering& Lowering =
+    *static_cast<const SITargetLowering*>(getTargetLowering());
+
+  unsigned ScratchPtrReg =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+  unsigned ScratchOffsetReg =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+  // NOTE(review): only the wave-offset register is registered as a live-in
+  // here; ScratchPtrReg is presumably made live-in elsewhere - confirm.
+  Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
+                                ScratchOffsetReg, MVT::i32);
+
+  // Both copies hang off the DAG entry node.
+  SDValue ScratchPtr =
+    CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                           MRI.getLiveInVirtReg(ScratchPtrReg), MVT::i64);
+  Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
+  SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+      MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
+
+  // (add n0, c1)
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue N1 = Addr.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+    // Fold the constant into the immediate only when it fits.
+    if (isLegalMUBUFImmOffset(C1)) {
+      VAddr = Addr.getOperand(0);
       ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), MVT::i16);
       return true;
     }
   }
-  if (Addr.getOpcode() == ISD::ADD) {
-    // (add N0, N1)
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-    Ptr = wrapAddr64Rsrc(CurDAG, DL, N0);
-    Offset = N1;
-    ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
+
+  // (add FI, n0)
+  // The frame index (operand 0 of an ADD or OR) is passed through as the
+  // immediate offset operand; the other operand becomes VAddr.
+  if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+       isa<FrameIndexSDNode>(Addr.getOperand(0))) {
+    VAddr = Addr.getOperand(1);
+    ImmOffset = Addr.getOperand(0);
     return true;
   }
 
-  // default case
-  Ptr = wrapAddr64Rsrc(CurDAG, DL, CurDAG->getConstant(0, MVT::i64));
-  Offset = Addr;
+  // (FI)
+  // A bare frame index: VAddr is a V_MOV_B32 of zero and the frame index is
+  // the immediate offset operand.
+  if (isa<FrameIndexSDNode>(Addr)) {
+    VAddr = SDValue(CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
+                                          CurDAG->getConstant(0, MVT::i32)), 0);
+    ImmOffset = Addr;
+    return true;
+  }
+
+  // (node)
+  // Anything else: the whole address is VAddr with a zero immediate.
+  VAddr = Addr;
   ImmOffset = CurDAG->getTargetConstant(0, MVT::i16);
   return true;
 }
 
+/// Matches \p Addr only when SelectMUBUF() leaves Offen, Idxen and Addr64 all
+/// zero. On success \p SRsrc is the resource built by buildRSRC() around the
+/// pointer; the remaining outputs are the values SelectMUBUF() produced.
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &SOffset, SDValue &Offset,
+                                           SDValue &GLC, SDValue &SLC,
+                                           SDValue &TFE) const {
+  SDValue Ptr, VAddr, Offen, Idxen, Addr64;
+  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+              GLC, SLC, TFE);
+
+  // Any of these flags being set means this is not the plain offset form.
+  if (cast<ConstantSDNode>(Offen)->getSExtValue() ||
+      cast<ConstantSDNode>(Idxen)->getSExtValue() ||
+      cast<ConstantSDNode>(Addr64)->getSExtValue())
+    return false;
+
+  // Resource constant: the data format combined with an all-ones 32-bit size.
+  const uint64_t SizeBits = APInt::getAllOnesValue(32).getZExtValue();
+  uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | SizeBits;
+
+  SDLoc DL(Addr);
+  const SITargetLowering *SILowering =
+      static_cast<const SITargetLowering *>(getTargetLowering());
+  SRsrc = SDValue(SILowering->buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
+  return true;
+}
+
+/// Overload for callers that only want \p GLC: forwards to the seven-output
+/// overload and discards the SLC and TFE values it produces.
+bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
+                                           SDValue &Soffset, SDValue &Offset,
+                                           SDValue &GLC) const {
+  SDValue DiscardedSLC;
+  SDValue DiscardedTFE;
+
+  return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, DiscardedSLC,
+                           DiscardedTFE);
+}
+
+// FIXME: This is incorrect and only enough to be able to compile.
+//
+// Selects an address space cast by resizing the pointer value: 64 -> 32 bits
+// extracts sub0, 32 -> 64 bits pairs the source with a zero high half, and
+// equal sizes become a bitcast.
+SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
+  AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
+  SDLoc DL(N);
+
+  assert(Subtarget.hasFlatAddressSpace() &&
+         "addrspacecast only supported with flat address space!");
+
+  assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+          ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) &&
+         "Cannot cast address space to / from constant address!");
+
+  assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+          ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
+         "Can only cast to / from flat address space!");
+
+  // Flat instructions take the address from the VGPR that holds it, so a cast
+  // only has to reinterpret that base register: truncate, bitcast or
+  // zero-extend depending on the two pointer widths.
+
+  SDValue Src = ASC->getOperand(0);
+  EVT DestVT = ASC->getValueType(0);
+  EVT SrcVT = Src.getValueType();
+
+  unsigned SrcSize = SrcVT.getSizeInBits();
+  unsigned DestSize = DestVT.getSizeInBits();
+
+  // Same width: a bitcast is enough.
+  if (SrcSize == DestSize) {
+    assert(SrcSize == 64 && DestSize == 64);
+    return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
+  }
+
+  // Narrowing: keep the low 32 bits.
+  if (SrcSize > DestSize) {
+    assert(SrcSize == 64 && DestSize == 32);
+    SDValue LowSubReg = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+    return CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, DestVT,
+                                  Src, LowSubReg);
+  }
+
+  // Widening: build a 64-bit register from the source and a zero high half.
+  assert(SrcSize == 32 && DestSize == 64);
+
+  SDValue RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
+  SDValue LowSubReg = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+  SDValue ZeroHigh =
+      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                                     CurDAG->getConstant(0, MVT::i32)), 0);
+  SDValue HighSubReg = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+
+  const SDValue Ops[] = { RC, Src, LowSubReg, ZeroHigh, HighSubReg };
+
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DestVT, Ops);
+}
+
+/// Peels an outer FNEG and then an FABS off \p In, recording each as a
+/// SISrcMods bit. \p Src receives the remaining value and \p SrcMods the
+/// accumulated bits as an i32 target constant. Always returns true.
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
+                                        SDValue &SrcMods) const {
+  unsigned ModBits = 0;
+  SDValue Stripped = In;
+
+  // The negate is looked for first, so (fneg (fabs x)) yields both bits.
+  if (Stripped.getOpcode() == ISD::FNEG) {
+    ModBits |= SISrcMods::NEG;
+    Stripped = Stripped.getOperand(0);
+  }
+
+  if (Stripped.getOpcode() == ISD::FABS) {
+    ModBits |= SISrcMods::ABS;
+    Stripped = Stripped.getOperand(0);
+  }
+
+  Src = Stripped;
+  SrcMods = CurDAG->getTargetConstant(ModBits, MVT::i32);
+  return true;
+}
+
+/// Like SelectVOP3Mods(), but also produces \p Clamp and \p Omod, both of
+/// which are currently always the i32 constant 0.
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
+                                         SDValue &SrcMods, SDValue &Clamp,
+                                         SDValue &Omod) const {
+  // FIXME: Handle Clamp and Omod
+  const SDValue NoClamp = CurDAG->getTargetConstant(0, MVT::i32);
+  const SDValue NoOmod = CurDAG->getTargetConstant(0, MVT::i32);
+  Clamp = NoClamp;
+  Omod = NoOmod;
+
+  return SelectVOP3Mods(In, Src, SrcMods);
+}
+
+/// Like SelectVOP3Mods(), but also produces \p Omod, which is currently
+/// always the i32 constant 0.
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
+                                              SDValue &SrcMods,
+                                              SDValue &Omod) const {
+  // FIXME: Handle Omod
+  const SDValue NoOmod = CurDAG->getTargetConstant(0, MVT::i32);
+  Omod = NoOmod;
+
+  return SelectVOP3Mods(In, Src, SrcMods);
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 0ada7a3..2f95b74 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp

@@ -21,7 +21,6 @@
 #include "AMDGPUSubtarget.h"
 #include "R600MachineFunctionInfo.h"
 #include "SIMachineFunctionInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -104,7 +103,7 @@
 }
 
 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
-  TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+  TargetLowering(TM) {
 
   Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
 
@@ -131,6 +130,9 @@
   setOperationAction(ISD::FROUND, MVT::f32, Legal);
   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
 
+  setOperationAction(ISD::FREM, MVT::f32, Custom);
+  setOperationAction(ISD::FREM, MVT::f64, Custom);
+
   // Lower floating point store/load to integer store/load to reduce the number
   // of patterns in tablegen.
   setOperationAction(ISD::STORE, MVT::f32, Promote);
@@ -242,6 +244,12 @@
     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
   }
 
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+
   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
   for (MVT VT : ScalarIntVTs) {
     setOperationAction(ISD::SREM, VT, Expand);
@@ -271,15 +279,23 @@
   setOperationAction(ISD::ROTL, MVT::i64, Expand);
   setOperationAction(ISD::ROTR, MVT::i64, Expand);
 
-  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
   setOperationAction(ISD::MUL, MVT::i64, Expand);
   setOperationAction(ISD::MULHU, MVT::i64, Expand);
   setOperationAction(ISD::MULHS, MVT::i64, Expand);
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
   setOperationAction(ISD::UREM, MVT::i32, Expand);
   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
 
+  if (!Subtarget->hasFFBH())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+
+  if (!Subtarget->hasFFBL())
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+
   static const MVT::SimpleValueType VectorIntTypes[] = {
     MVT::v2i32, MVT::v4i32
   };
@@ -300,7 +316,6 @@
     setOperationAction(ISD::SUB,  VT, Expand);
     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
-    // TODO: Implement custom UREM / SREM routines.
     setOperationAction(ISD::SDIV, VT, Expand);
     setOperationAction(ISD::UDIV, VT, Expand);
     setOperationAction(ISD::SREM, VT, Expand);
@@ -332,12 +347,15 @@
 
   for (MVT VT : FloatVectorTypes) {
     setOperationAction(ISD::FABS, VT, Expand);
+    setOperationAction(ISD::FMINNUM, VT, Expand);
+    setOperationAction(ISD::FMAXNUM, VT, Expand);
     setOperationAction(ISD::FADD, VT, Expand);
     setOperationAction(ISD::FCEIL, VT, Expand);
     setOperationAction(ISD::FCOS, VT, Expand);
     setOperationAction(ISD::FDIV, VT, Expand);
     setOperationAction(ISD::FEXP2, VT, Expand);
     setOperationAction(ISD::FLOG2, VT, Expand);
+    setOperationAction(ISD::FREM, VT, Expand);
     setOperationAction(ISD::FPOW, VT, Expand);
     setOperationAction(ISD::FFLOOR, VT, Expand);
     setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -360,21 +378,25 @@
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 
   setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::STORE);
 
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
 
+  // SI at least has hardware support for floating point exceptions, but no way
+  // of using or handling them is implemented. They are also optional in OpenCL
+  // (Section 7.3)
+  setHasFloatingPointExceptions(false);
+
   setSelectIsExpensive(false);
   PredictableSelectIsExpensive = false;
 
   // There are no integer divide instructions, and these expand to a pretty
   // large sequence of instructions.
   setIntDivIsCheap(false);
-  setPow2DivIsCheap(false);
-
-  // TODO: Investigate this when 64-bit divides are implemented.
-  addBypassSlowDiv(64, 32);
+  setPow2SDivIsCheap(false);
 
   // FIXME: Need to really handle these.
   MaxStoresPerMemcpy  = 4096;
@@ -426,12 +448,12 @@
 
 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
   assert(VT.isFloatingPoint());
-  return VT == MVT::f32;
+  return VT == MVT::f32 || VT == MVT::f64;
 }
 
 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
   assert(VT.isFloatingPoint());
-  return VT == MVT::f32;
+  return VT == MVT::f32 || VT == MVT::f64;
 }
 
 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
@@ -531,16 +553,18 @@
   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
-  case ISD::SDIV: return LowerSDIV(Op, DAG);
-  case ISD::SREM: return LowerSREM(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
+  case ISD::FREM: return LowerFREM(Op, DAG);
   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
   case ISD::FRINT: return LowerFRINT(Op, DAG);
   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
+  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
   }
   return Op;
 }
@@ -595,7 +619,7 @@
                                                        const SDValue &InitPtr,
                                                        SDValue Chain,
                                                        SelectionDAG &DAG) const {
-  const DataLayout *TD = getTargetMachine().getDataLayout();
+  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
   SDLoc DL(InitPtr);
   Type *InitTy = Init->getType();
 
@@ -668,22 +692,35 @@
   llvm_unreachable("Unhandled constant initializer");
 }
 
+/// Return true when \p GV is a GlobalVariable that has an initializer and that
+/// initializer is something other than undef. Any other kind of global value,
+/// or a variable without an initializer, yields false.
+static bool hasDefinedInitializer(const GlobalValue *GV) {
+  if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV))
+    return Var->hasInitializer() && !isa<UndefValue>(Var->getInitializer());
+
+  return false;
+}
+
 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                  SDValue Op,
                                                  SelectionDAG &DAG) const {
 
-  const DataLayout *TD = getTargetMachine().getDataLayout();
+  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = G->getGlobal();
 
   switch (G->getAddressSpace()) {
-  default: llvm_unreachable("Global Address lowering not implemented for this "
-                            "address space");
   case AMDGPUAS::LOCAL_ADDRESS: {
     // XXX: What does the value of G->getOffset() mean?
     assert(G->getOffset() == 0 &&
          "Do not know what to do with an non-zero offset");
 
+    // TODO: We could emit code to handle the initialization somewhere.
+    if (hasDefinedInitializer(GV))
+      break;
+
     unsigned Offset;
     if (MFI->LocalMemoryObjects.count(GV) == 0) {
       uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
@@ -695,7 +732,7 @@
       Offset = MFI->LocalMemoryObjects[GV];
     }
 
-    return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+    return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
   }
   case AMDGPUAS::CONSTANT_ADDRESS: {
     MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
@@ -737,6 +774,12 @@
     return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
   }
   }
+
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+  DiagnosticInfoUnsupported BadInit(Fn,
+                                    "initializer for address space");
+  DAG.getContext()->diagnose(BadInit);
+  return SDValue();
 }
 
 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
@@ -767,8 +810,8 @@
                                               SelectionDAG &DAG) const {
 
   MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL =
-   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
+      getTargetMachine().getSubtargetImpl()->getFrameLowering());
 
   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
 
@@ -810,13 +853,21 @@
       // first parameter must be the same as the first instruction.
       SDValue Numerator = Op.getOperand(1);
       SDValue Denominator = Op.getOperand(2);
+
+      // Note this order is opposite of the machine instruction's operations,
+      // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
+      // intrinsic has the numerator as the first operand to match a normal
+      // division operation.
+
       SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
 
-      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
-                         Src0, Denominator, Numerator);
+      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
+                         Denominator, Numerator);
     }
 
     case Intrinsic::AMDGPU_div_fmas:
+      // FIXME: Dropping bool parameter. Work is needed to support the implicit
+      // read from VCC.
       return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                          Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
 
@@ -840,6 +891,10 @@
     case Intrinsic::AMDGPU_rsq_clamped:
       return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
 
+    case Intrinsic::AMDGPU_ldexp:
+      return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
+                                                   Op.getOperand(2));
+
     case AMDGPUIntrinsic::AMDGPU_imax:
       return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
                                                   Op.getOperand(2));
@@ -945,21 +1000,16 @@
 }
 
 /// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
-                                            SelectionDAG &DAG) const {
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  SDValue LHS = N->getOperand(0);
-  SDValue RHS = N->getOperand(1);
-  SDValue True = N->getOperand(2);
-  SDValue False = N->getOperand(3);
-  SDValue CC = N->getOperand(4);
-
-  if (VT != MVT::f32 ||
-      !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
+SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL,
+                                             EVT VT,
+                                             SDValue LHS,
+                                             SDValue RHS,
+                                             SDValue True,
+                                             SDValue False,
+                                             SDValue CC,
+                                             SelectionDAG &DAG) const {
+  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
     return SDValue();
-  }
 
   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   switch (CCOpcode) {
@@ -975,15 +1025,19 @@
   case ISD::SETTRUE2:
   case ISD::SETUO:
   case ISD::SETO:
-    llvm_unreachable("Operation should already be optimised!");
+    break;
   case ISD::SETULE:
   case ISD::SETULT:
   case ISD::SETOLE:
   case ISD::SETOLT:
   case ISD::SETLE:
   case ISD::SETLT: {
-    unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
+    // We need to permute the operands to get the correct NaN behavior. The
+    // selected operand is the second one based on the failing compare with NaN,
+    // so permute it based on the compare type the hardware uses.
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
   }
   case ISD::SETGT:
   case ISD::SETGE:
@@ -991,8 +1045,9 @@
   case ISD::SETOGE:
   case ISD::SETUGT:
   case ISD::SETOGT: {
-    unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
   }
   case ISD::SETCC_INVALID:
     llvm_unreachable("Invalid setcc condcode!");
@@ -1000,12 +1055,53 @@
   return SDValue();
 }
 
-SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
-                                              SelectionDAG &DAG) const {
-  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
-  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+/// \brief Fold an integer select_cc whose selected values are its own compare
+/// operands into a signed or unsigned min/max node.
+///
+/// Returns an empty SDValue when True/False are not exactly LHS/RHS (in either
+/// order), or when the condition code is not one of the signed/unsigned
+/// ordering comparisons handled below.
+SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL,
+                                             EVT VT,
+                                             SDValue LHS,
+                                             SDValue RHS,
+                                             SDValue True,
+                                             SDValue False,
+                                             SDValue CC,
+                                             SelectionDAG &DAG) const {
+  const bool SelectsLHS = (LHS == True && RHS == False);
+  const bool SelectsRHS = (LHS == False && RHS == True);
+  if (!SelectsLHS && !SelectsRHS)
+    return SDValue();
+
+  // Classify the comparison: signedness and direction.
+  bool IsUnsigned = false;
+  bool IsLess = false;
+  switch (cast<CondCodeSDNode>(CC)->get()) {
+  case ISD::SETULT:
+  case ISD::SETULE:
+    IsUnsigned = true;
+    IsLess = true;
+    break;
+  case ISD::SETLT:
+  case ISD::SETLE:
+    IsLess = true;
+    break;
+  case ISD::SETGT:
+  case ISD::SETGE:
+    break;
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    IsUnsigned = true;
+    break;
+  default:
+    return SDValue();
+  }
+
+  // "LHS < RHS ? LHS : RHS" is a min. Flipping either the comparison direction
+  // or the operand that is selected turns it into a max; flipping both gives a
+  // min again.
+  const bool IsMin = (IsLess == SelectsLHS);
+
+  unsigned Opc;
+  if (IsUnsigned)
+    Opc = IsMin ? AMDGPUISD::UMIN : AMDGPUISD::UMAX;
+  else
+    Opc = IsMin ? AMDGPUISD::SMIN : AMDGPUISD::SMAX;
+
+  return DAG.getNode(Opc, DL, VT, LHS, RHS);
+}
+
+SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  EVT MemVT = Load->getMemoryVT();
+  EVT MemEltVT = MemVT.getVectorElementType();
+
   EVT LoadVT = Op.getValueType();
-  EVT EltVT = Op.getValueType().getVectorElementType();
+  EVT EltVT = LoadVT.getVectorElementType();
   EVT PtrVT = Load->getBasePtr().getValueType();
 
   unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
@@ -1013,17 +1109,19 @@
   SmallVector<SDValue, 8> Chains;
 
   SDLoc SL(Op);
+  unsigned MemEltSize = MemEltVT.getStoreSize();
+  MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
 
-  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+  for (unsigned i = 0; i < NumElts; ++i) {
     SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
-                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
+                              DAG.getConstant(i * MemEltSize, PtrVT));
 
     SDValue NewLoad
       = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
                        Load->getChain(), Ptr,
-                       MachinePointerInfo(Load->getMemOperand()->getValue()),
+                       SrcValue.getWithOffset(i * MemEltSize),
                        MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
-                       Load->getAlignment());
+                       Load->isInvariant(), Load->getAlignment());
     Loads.push_back(NewLoad.getValue(0));
     Chains.push_back(NewLoad.getValue(1));
   }
@@ -1036,6 +1134,55 @@
   return DAG.getMergeValues(Ops, SL);
 }
 
+/// \brief Lower a vector load as two half-width loads joined by CONCAT_VECTORS.
+///
+/// Two-element vectors are handed to ScalarizeVectorLoad so that no
+/// one-element vectors are created. The returned value merges the concatenated
+/// vector with a TokenFactor of the chains of both new loads.
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  // If this is a 2 element vector, we really want to scalarize and not create
+  // weird 1 element vectors.
+  if (VT.getVectorNumElements() == 2)
+    return ScalarizeVectorLoad(Op, DAG);
+
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  SDValue BasePtr = Load->getBasePtr();
+  EVT PtrVT = BasePtr.getValueType();
+  EVT MemVT = Load->getMemoryVT();
+  SDLoc SL(Op);
+  MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
+
+  EVT LoVT, HiVT;
+  EVT LoMemVT, HiMemVT;
+
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
+
+  // The high half lives LoSize bytes past the base pointer, so it can only be
+  // promised the largest power of two that divides both the base alignment
+  // and that offset. Reusing the base alignment would overstate it, e.g. for
+  // a 16-byte aligned access whose high half starts at byte 8.
+  unsigned LoSize = LoMemVT.getStoreSize();
+  unsigned BaseAlign = Load->getAlignment();
+  unsigned AlignBits = BaseAlign | LoSize;
+  unsigned HiAlign = AlignBits & (1 + ~AlignBits);
+
+  SDValue LoLoad
+    = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
+                     Load->getChain(), BasePtr,
+                     SrcValue,
+                     LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
+                     Load->isInvariant(), BaseAlign);
+
+  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+                              DAG.getConstant(LoSize, PtrVT));
+
+  SDValue HiLoad
+    = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
+                     Load->getChain(), HiPtr,
+                     SrcValue.getWithOffset(LoSize),
+                     HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
+                     Load->isInvariant(), HiAlign);
+
+  // Value 0 is the reassembled vector, value 1 joins both load chains.
+  SDValue Ops[] = {
+    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
+    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+                LoLoad.getValue(1), HiLoad.getValue(1))
+  };
+
+  return DAG.getMergeValues(Ops, SL);
+}
+
 SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                                                SelectionDAG &DAG) const {
   StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1094,8 +1241,8 @@
                       Store->getAlignment());
 }
 
-SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
-                                            SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op,
+                                                   SelectionDAG &DAG) const {
   StoreSDNode *Store = cast<StoreSDNode>(Op);
   EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
   EVT EltVT = Store->getValue().getValueType().getVectorElementType();
@@ -1105,21 +1252,77 @@
 
   SmallVector<SDValue, 8> Chains;
 
+  unsigned EltSize = MemEltVT.getStoreSize();
+  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
+
   for (unsigned i = 0, e = NumElts; i != e; ++i) {
     SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
-                              Store->getValue(), DAG.getConstant(i, MVT::i32));
-    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
-                              Store->getBasePtr(),
-                            DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
-                                            PtrVT));
-    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
-                         MachinePointerInfo(Store->getMemOperand()->getValue()),
-                         MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
-                         Store->getAlignment()));
+                              Store->getValue(),
+                              DAG.getConstant(i, MVT::i32));
+
+    SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT);
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
+    SDValue NewStore =
+      DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
+                        SrcValue.getWithOffset(i * EltSize),
+                        MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
+                        Store->getAlignment());
+    Chains.push_back(NewStore);
   }
+
   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
 }
 
+/// \brief Lower a vector store as two half-width (truncating) stores.
+///
+/// Two-element vectors are handed to ScalarizeVectorStore so that no
+/// one-element vectors are created. Returns a TokenFactor joining the chains
+/// of both new stores.
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  SDValue Val = Store->getValue();
+  EVT VT = Val.getValueType();
+
+  // If this is a 2 element vector, we really want to scalarize and not create
+  // weird 1 element vectors.
+  if (VT.getVectorNumElements() == 2)
+    return ScalarizeVectorStore(Op, DAG);
+
+  EVT MemVT = Store->getMemoryVT();
+  SDValue Chain = Store->getChain();
+  SDValue BasePtr = Store->getBasePtr();
+  SDLoc SL(Op);
+
+  EVT LoVT, HiVT;
+  EVT LoMemVT, HiMemVT;
+  SDValue Lo, Hi;
+
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
+  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
+
+  // The high half lives LoSize bytes past the base pointer, so it can only be
+  // promised the largest power of two that divides both the base alignment
+  // and that offset. Reusing the base alignment would overstate it.
+  unsigned LoSize = LoMemVT.getStoreSize();
+  unsigned BaseAlign = Store->getAlignment();
+  unsigned AlignBits = BaseAlign | LoSize;
+  unsigned HiAlign = AlignBits & (1 + ~AlignBits);
+
+  EVT PtrVT = BasePtr.getValueType();
+  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+                              DAG.getConstant(LoSize, PtrVT));
+
+  // The volatile flag is passed ahead of the non-temporal flag, matching the
+  // getExtLoad calls in this file; the opposite order would exchange the two
+  // properties on the new stores.
+  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
+  SDValue LoStore
+    = DAG.getTruncStore(Chain, SL, Lo,
+                        BasePtr,
+                        SrcValue,
+                        LoMemVT,
+                        Store->isVolatile(),
+                        Store->isNonTemporal(),
+                        BaseAlign);
+  SDValue HiStore
+    = DAG.getTruncStore(Chain, SL, Hi,
+                        HiPtr,
+                        SrcValue.getWithOffset(LoSize),
+                        HiMemVT,
+                        Store->isVolatile(),
+                        Store->isNonTemporal(),
+                        HiAlign);
+
+  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
+}
+
+
 SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -1165,22 +1368,8 @@
     return DAG.getMergeValues(Ops, DL);
   }
 
-  // Lower loads constant address space global variable loads
-  if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
-      isa<GlobalVariable>(
-          GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
-
-
-    SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
-        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
-    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
-        DAG.getConstant(2, MVT::i32));
-    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
-                       Load->getChain(), Ptr,
-                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
-  }
-
-  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+      Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
       ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
     return SDValue();
 
@@ -1231,7 +1420,7 @@
   if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
        Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
       Store->getValue().getValueType().isVector()) {
-    return SplitVectorStore(Op, DAG);
+    return ScalarizeVectorStore(Op, DAG);
   }
 
   EVT MemVT = Store->getMemoryVT();
@@ -1276,249 +1465,179 @@
   return SDValue();
 }
 
-SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
+// This is a shortcut for integer division because we have fast i32<->f32
+// conversions, and fast f32 reciprocal instructions. The fractional part of a
+// float is enough to accurately represent up to a 24-bit integer.
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const {
   SDLoc DL(Op);
-  EVT OVT = Op.getValueType();
+  EVT VT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
-  MVT INTTY;
-  MVT FLTTY;
-  if (!OVT.isVector()) {
-    INTTY = MVT::i32;
-    FLTTY = MVT::f32;
-  } else if (OVT.getVectorNumElements() == 2) {
-    INTTY = MVT::v2i32;
-    FLTTY = MVT::v2f32;
-  } else if (OVT.getVectorNumElements() == 4) {
-    INTTY = MVT::v4i32;
-    FLTTY = MVT::v4f32;
+  MVT IntVT = MVT::i32;
+  MVT FltVT = MVT::f32;
+
+  ISD::NodeType ToFp  = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+  ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+
+  if (VT.isVector()) {
+    unsigned NElts = VT.getVectorNumElements();
+    IntVT = MVT::getVectorVT(MVT::i32, NElts);
+    FltVT = MVT::getVectorVT(MVT::f32, NElts);
   }
-  unsigned bitsize = OVT.getScalarType().getSizeInBits();
-  // char|short jq = ia ^ ib;
-  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
 
-  // jq = jq >> (bitsize - 2)
-  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
+  unsigned BitSize = VT.getScalarType().getSizeInBits();
 
-  // jq = jq | 0x1
-  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
+  SDValue jq = DAG.getConstant(1, IntVT);
 
-  // jq = (int)jq
-  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
+  if (sign) {
+    // char|short jq = ia ^ ib;
+    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
+
+    // jq = jq >> (bitsize - 2)
+    jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT));
+
+    // jq = jq | 0x1
+    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT));
+
+    // jq = (int)jq
+    jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
+  }
 
   // int ia = (int)LHS;
-  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+  SDValue ia = sign ?
+    DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);
 
   // int ib, (int)RHS;
-  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+  SDValue ib = sign ?
+    DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);
 
   // float fa = (float)ia;
-  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
 
   // float fb = (float)ib;
-  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
 
   // float fq = native_divide(fa, fb);
-  SDValue fq = DAG.getNode(ISD::FMUL, DL, FLTTY,
-                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FLTTY, fb));
+  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
+                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
 
   // fq = trunc(fq);
-  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
 
   // float fqneg = -fq;
-  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
+  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
 
   // float fr = mad(fqneg, fb, fa);
-  SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
-      DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa);
+  SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT,
+                           DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
 
   // int iq = (int)fq;
-  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
 
   // fr = fabs(fr);
-  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
+  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
 
   // fb = fabs(fb);
-  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
+  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
 
   // int cv = fr >= fb;
-  SDValue cv;
-  if (INTTY == MVT::i32) {
-    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
-  } else {
-    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
-  }
+  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
+
   // jq = (cv ? jq : 0);
-  jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
-      DAG.getConstant(0, OVT));
+  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT));
+
+  // dst = trunc/extend to legal type
+  iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);
+
   // dst = iq + jq;
-  iq = DAG.getSExtOrTrunc(iq, DL, OVT);
-  iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
-  return iq;
+  SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
+
+  // Rem needs compensation, it's easier to recompute it
+  SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
+  Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
+
+  SDValue Res[2] = {
+    Div,
+    Rem
+  };
+  return DAG.getMergeValues(Res, DL);
 }
 
-SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
+void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
+                                      SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &Results) const {
+  assert(Op.getValueType() == MVT::i64);
+
   SDLoc DL(Op);
-  EVT OVT = Op.getValueType();
+  EVT VT = Op.getValueType();
+  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+  SDValue one = DAG.getConstant(1, HalfVT);
+  SDValue zero = DAG.getConstant(0, HalfVT);
+
+  //HiLo split
   SDValue LHS = Op.getOperand(0);
+  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
+  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
+
   SDValue RHS = Op.getOperand(1);
-  // The LowerSDIV32 function generates equivalent to the following IL.
-  // mov r0, LHS
-  // mov r1, RHS
-  // ilt r10, r0, 0
-  // ilt r11, r1, 0
-  // iadd r0, r0, r10
-  // iadd r1, r1, r11
-  // ixor r0, r0, r10
-  // ixor r1, r1, r11
-  // udiv r0, r0, r1
-  // ixor r10, r10, r11
-  // iadd r0, r0, r10
-  // ixor DST, r0, r10
+  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
+  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
 
-  // mov r0, LHS
-  SDValue r0 = LHS;
+  // Get Speculative values
+  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
+  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
 
-  // mov r1, RHS
-  SDValue r1 = RHS;
+  SDValue REM_Hi = zero;
+  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
 
-  // ilt r10, r0, 0
-  SDValue r10 = DAG.getSelectCC(DL,
-      r0, DAG.getConstant(0, OVT),
-      DAG.getConstant(-1, OVT),
-      DAG.getConstant(0, OVT),
-      ISD::SETLT);
+  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
+  SDValue DIV_Lo = zero;
 
-  // ilt r11, r1, 0
-  SDValue r11 = DAG.getSelectCC(DL,
-      r1, DAG.getConstant(0, OVT),
-      DAG.getConstant(-1, OVT),
-      DAG.getConstant(0, OVT),
-      ISD::SETLT);
+  const unsigned halfBitWidth = HalfVT.getSizeInBits();
 
-  // iadd r0, r0, r10
-  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+  for (unsigned i = 0; i < halfBitWidth; ++i) {
+    SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
+    // Get Value of high bit
+    SDValue HBit;
+    if (halfBitWidth == 32 && Subtarget->hasBFE()) {
+      HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
+    } else {
+      HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+      HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+    }
 
-  // iadd r1, r1, r11
-  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+    SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
+      DAG.getConstant(halfBitWidth - 1, HalfVT));
+    REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
+    REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
 
-  // ixor r0, r0, r10
-  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+    REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
+    REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
 
-  // ixor r1, r1, r11
-  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
 
-  // udiv r0, r0, r1
-  r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 
-  // ixor r10, r10, r11
-  r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
+    SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
 
-  // iadd r0, r0, r10
-  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
 
-  // ixor DST, r0, r10
-  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
-  return DST;
-}
+    // Update REM
 
-SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
-  return SDValue(Op.getNode(), 0);
-}
+    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
 
-SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
-  EVT OVT = Op.getValueType().getScalarType();
-
-  if (OVT == MVT::i64)
-    return LowerSDIV64(Op, DAG);
-
-  if (OVT.getScalarType() == MVT::i32)
-    return LowerSDIV32(Op, DAG);
-
-  if (OVT == MVT::i16 || OVT == MVT::i8) {
-    // FIXME: We should be checking for the masked bits. This isn't reached
-    // because i8 and i16 are not legal types.
-    return LowerSDIV24(Op, DAG);
+    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
+    REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
+    REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
   }
 
-  return SDValue(Op.getNode(), 0);
-}
-
-SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  EVT OVT = Op.getValueType();
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  // The LowerSREM32 function generates equivalent to the following IL.
-  // mov r0, LHS
-  // mov r1, RHS
-  // ilt r10, r0, 0
-  // ilt r11, r1, 0
-  // iadd r0, r0, r10
-  // iadd r1, r1, r11
-  // ixor r0, r0, r10
-  // ixor r1, r1, r11
-  // udiv r20, r0, r1
-  // umul r20, r20, r1
-  // sub r0, r0, r20
-  // iadd r0, r0, r10
-  // ixor DST, r0, r10
-
-  // mov r0, LHS
-  SDValue r0 = LHS;
-
-  // mov r1, RHS
-  SDValue r1 = RHS;
-
-  // ilt r10, r0, 0
-  SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
-
-  // ilt r11, r1, 0
-  SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
-
-  // iadd r0, r0, r10
-  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
-  // iadd r1, r1, r11
-  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
-
-  // ixor r0, r0, r10
-  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
-
-  // ixor r1, r1, r11
-  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
-
-  // udiv r20, r0, r1
-  SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
-
-  // umul r20, r20, r1
-  r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
-
-  // sub r0, r0, r20
-  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
-
-  // iadd r0, r0, r10
-  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
-
-  // ixor DST, r0, r10
-  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
-  return DST;
-}
-
-SDValue AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
-  return SDValue(Op.getNode(), 0);
-}
-
-SDValue AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
-  EVT OVT = Op.getValueType();
-
-  if (OVT.getScalarType() == MVT::i64)
-    return LowerSREM64(Op, DAG);
-
-  if (OVT.getScalarType() == MVT::i32)
-    return LowerSREM32(Op, DAG);
-
-  return SDValue(Op.getNode(), 0);
+  SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+  SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+  Results.push_back(DIV);
+  Results.push_back(REM);
 }
 
 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
@@ -1526,15 +1645,31 @@
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
+  if (VT == MVT::i64) {
+    SmallVector<SDValue, 2> Results;
+    LowerUDIVREM64(Op, DAG, Results);
+    return DAG.getMergeValues(Results, DL);
+  }
+
   SDValue Num = Op.getOperand(0);
   SDValue Den = Op.getOperand(1);
 
+  if (VT == MVT::i32) {
+    if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) &&
+        DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) {
+      // TODO: We technically could do this for i64, but shouldn't that just be
+      // handled by something generally reducing 64-bit division on 32-bit
+      // values to 32-bit?
+      return LowerDIVREM24(Op, DAG, false);
+    }
+  }
+
   // RCP =  URECIP(Den) = 2^32 / Den + e
   // e is rounding error.
   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
 
-  // RCP_LO = umulo(RCP, Den) */
-  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
+  // RCP_LO = mul(RCP, Den) */
+  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
 
   // RCP_HI = mulhu (RCP, Den) */
   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
@@ -1565,7 +1700,7 @@
   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
 
   // Num_S_Remainder = Quotient * Den
-  SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
+  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
 
   // Remainder = Num - Num_S_Remainder
   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
@@ -1630,12 +1765,22 @@
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
-  SDValue Zero = DAG.getConstant(0, VT);
-  SDValue NegOne = DAG.getConstant(-1, VT);
-
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
 
+  if (VT == MVT::i32) {
+    if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 &&
+        DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) {
+      // TODO: We technically could do this for i64, but shouldn't that just be
+      // handled by something generally reducing 64-bit division on 32-bit
+      // values to 32-bit?
+      return LowerDIVREM24(Op, DAG, true);
+    }
+  }
+
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDValue NegOne = DAG.getConstant(-1, VT);
+
   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
@@ -1663,6 +1808,20 @@
   return DAG.getMergeValues(Res, DL);
 }
 
+// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
+SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  EVT VT = Op.getValueType();
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+
+  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
+  SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
+
+  return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
+}
+
 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
@@ -1705,7 +1864,7 @@
   const unsigned ExpBits = 11;
 
   // Extract the exponent.
-  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32,
+  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                 Hi,
                                 DAG.getConstant(FractBits - 32, MVT::i32),
                                 DAG.getConstant(ExpBits, MVT::i32));
@@ -1796,13 +1955,43 @@
   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
 }
 
+SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
+                                               bool Signed) const {
+  SDLoc SL(Op);
+  SDValue Src = Op.getOperand(0);
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
+                           DAG.getConstant(0, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
+                           DAG.getConstant(1, MVT::i32));
+
+  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
+                              SL, MVT::f64, Hi);
+
+  SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
+
+  SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
+                              DAG.getConstant(32, MVT::i32));
+
+  return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
+}
+
 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
                                                SelectionDAG &DAG) const {
   SDValue S0 = Op.getOperand(0);
-  SDLoc DL(Op);
-  if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
+  if (S0.getValueType() != MVT::i64)
     return SDValue();
 
+  EVT DestVT = Op.getValueType();
+  if (DestVT == MVT::f64)
+    return LowerINT_TO_FP64(Op, DAG, false);
+
+  assert(DestVT == MVT::f32);
+
+  SDLoc DL(Op);
+
   // f32 uint_to_fp i64
   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
                            DAG.getConstant(0, MVT::i32));
@@ -1815,16 +2004,62 @@
   return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
 }
 
-SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
-                                                      unsigned BitsDiff,
-                                                      SelectionDAG &DAG) const {
-  MVT VT = Op.getSimpleValueType();
-  SDLoc DL(Op);
-  SDValue Shift = DAG.getConstant(BitsDiff, VT);
-  // Shift left by 'Shift' bits.
-  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
-  // Signed shift Right by 'Shift' bits.
-  return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
+SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue Src = Op.getOperand(0);
+  if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64)
+    return LowerINT_TO_FP64(Op, DAG, true);
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
+                                               bool Signed) const {
+  SDLoc SL(Op);
+
+  SDValue Src = Op.getOperand(0);
+
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+
+  SDValue K0
+    = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), MVT::f64);
+  SDValue K1
+    = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), MVT::f64);
+
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
+
+  SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
+
+
+  SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
+
+  SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
+                           MVT::i32, FloorMul);
+  SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
+
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi);
+
+  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
+}
+
+SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue Src = Op.getOperand(0);
+
+  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
+    return LowerFP64_TO_INT(Op, DAG, true);
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue Src = Op.getOperand(0);
+
+  if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
+    return LowerFP64_TO_INT(Op, DAG, false);
+
+  return SDValue();
 }
 
 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
@@ -1890,13 +2125,64 @@
 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
                                uint32_t Offset, uint32_t Width) {
   if (Width + Offset < 32) {
-    IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
+    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
+    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
     return DAG.getConstant(Result, MVT::i32);
   }
 
   return DAG.getConstant(Src0 >> Offset, MVT::i32);
 }
 
+static bool usesAllNormalStores(SDNode *LoadVal) {
+  for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
+    if (!ISD::isNormalStore(*I))
+      return false;
+  }
+
+  return true;
+}
+
+// If we have a copy of an illegal type, replace it with a load / store of an
+// equivalently sized legal type. This avoids intermediate bit pack / unpack
+// instructions emitted when handling extloads and truncstores. Ideally we could
+// recognize the pack / unpack pattern to eliminate it.
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *SN = cast<StoreSDNode>(N);
+  SDValue Value = SN->getValue();
+  EVT VT = Value.getValueType();
+
+  if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
+    return SDValue();
+
+  LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
+  if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+    return SDValue();
+
+  EVT MemVT = LoadVal->getMemoryVT();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
+
+  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+                                LoadVT, SL,
+                                LoadVal->getChain(),
+                                LoadVal->getBasePtr(),
+                                LoadVal->getOffset(),
+                                LoadVT,
+                                LoadVal->getMemOperand());
+
+  SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
+  DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+
+  return DAG.getStore(SN->getChain(), SL, NewLoad,
+                      SN->getBasePtr(), SN->getMemOperand());
+}
+
 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
@@ -1929,7 +2215,7 @@
 }
 
 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
-                                            DAGCombinerInfo &DCI) const {
+                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
 
@@ -1945,9 +2231,51 @@
       simplifyI24(N1, DCI);
       return SDValue();
     }
-    case ISD::SELECT_CC: {
-      return CombineMinMax(N, DAG);
+  case ISD::SELECT_CC: {
+    SDLoc DL(N);
+    EVT VT = N->getValueType(0);
+
+    if (VT == MVT::f32 ||
+        (VT == MVT::f64 &&
+         Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
+      SDValue LHS = N->getOperand(0);
+      SDValue RHS = N->getOperand(1);
+      SDValue True = N->getOperand(2);
+      SDValue False = N->getOperand(3);
+      SDValue CC = N->getOperand(4);
+
+      return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
     }
+
+    break;
+  }
+  case ISD::SELECT: {
+    SDValue Cond = N->getOperand(0);
+    if (Cond.getOpcode() == ISD::SETCC) {
+      SDLoc DL(N);
+      EVT VT = N->getValueType(0);
+      SDValue LHS = Cond.getOperand(0);
+      SDValue RHS = Cond.getOperand(1);
+      SDValue CC = Cond.getOperand(2);
+
+      SDValue True = N->getOperand(1);
+      SDValue False = N->getOperand(2);
+
+      if (VT == MVT::f32 ||
+          (VT == MVT::f64 &&
+           Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
+        return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
+      }
+
+      // TODO: Implement min / max Evergreen instructions.
+      if (VT == MVT::i32 &&
+          Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+        return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
+      }
+    }
+
+    break;
+  }
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     assert(!N->getValueType(0).isVector() &&
@@ -1992,41 +2320,47 @@
       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
     }
 
-    if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
       if (Signed) {
         return constantFoldBFE<int32_t>(DAG,
-                                        Val->getSExtValue(),
+                                        CVal->getSExtValue(),
                                         OffsetVal,
                                         WidthVal);
       }
 
       return constantFoldBFE<uint32_t>(DAG,
-                                       Val->getZExtValue(),
+                                       CVal->getZExtValue(),
                                        OffsetVal,
                                        WidthVal);
     }
 
-    APInt Demanded = APInt::getBitsSet(32,
-                                       OffsetVal,
-                                       OffsetVal + WidthVal);
-
     if ((OffsetVal + WidthVal) >= 32) {
       SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                          BitsFrom, ShiftVal);
     }
 
-    APInt KnownZero, KnownOne;
-    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
-                                          !DCI.isBeforeLegalizeOps());
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
-        TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) {
-      DCI.CommitTargetLoweringOpt(TLO);
+    if (BitsFrom.hasOneUse()) {
+      APInt Demanded = APInt::getBitsSet(32,
+                                         OffsetVal,
+                                         OffsetVal + WidthVal);
+
+      APInt KnownZero, KnownOne;
+      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                            !DCI.isBeforeLegalizeOps());
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+          TLI.SimplifyDemandedBits(BitsFrom, Demanded,
+                                   KnownZero, KnownOne, TLO)) {
+        DCI.CommitTargetLoweringOpt(TLO);
+      }
     }
 
     break;
   }
+
+  case ISD::STORE:
+    return performStoreCombine(N, DCI);
   }
   return SDValue();
 }
@@ -2117,12 +2451,19 @@
   NODE_NAME_CASE(DWORDADDR)
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(CLAMP)
-  NODE_NAME_CASE(FMAX)
+  NODE_NAME_CASE(MAD)
+  NODE_NAME_CASE(FMAX_LEGACY)
   NODE_NAME_CASE(SMAX)
   NODE_NAME_CASE(UMAX)
-  NODE_NAME_CASE(FMIN)
+  NODE_NAME_CASE(FMIN_LEGACY)
   NODE_NAME_CASE(SMIN)
   NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(FMAX3)
+  NODE_NAME_CASE(SMAX3)
+  NODE_NAME_CASE(UMAX3)
+  NODE_NAME_CASE(FMIN3)
+  NODE_NAME_CASE(SMIN3)
+  NODE_NAME_CASE(UMIN3)
   NODE_NAME_CASE(URECIP)
   NODE_NAME_CASE(DIV_SCALE)
   NODE_NAME_CASE(DIV_FMAS)
@@ -2132,6 +2473,7 @@
   NODE_NAME_CASE(RSQ)
   NODE_NAME_CASE(RSQ_LEGACY)
   NODE_NAME_CASE(RSQ_CLAMPED)
+  NODE_NAME_CASE(LDEXP)
   NODE_NAME_CASE(DOT4)
   NODE_NAME_CASE(BFE_U32)
   NODE_NAME_CASE(BFE_I32)
@@ -2157,6 +2499,7 @@
   NODE_NAME_CASE(CVT_F32_UBYTE2)
   NODE_NAME_CASE(CVT_F32_UBYTE3)
   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
+  NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   }
@@ -2225,17 +2568,8 @@
 
     unsigned BitWidth = 32;
     uint32_t Width = CWidth->getZExtValue() & 0x1f;
-    if (Width == 0) {
-      KnownZero = APInt::getAllOnesValue(BitWidth);
-      KnownOne = APInt::getNullValue(BitWidth);
-      return;
-    }
 
-    // FIXME: This could do a lot more. If offset is 0, should be the same as
-    // sign_extend_inreg implementation, but that involves duplicating it.
-    if (Opc == AMDGPUISD::BFE_I32)
-      KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
-    else
+    if (Opc == AMDGPUISD::BFE_U32)
       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
 
     break;

diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 98a92ad..36b4ee6 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPUISELLOWERING_H
-#define AMDGPUISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H
+#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H
 
 #include "llvm/Target/TargetLowering.h"
 
@@ -43,48 +43,52 @@
   /// \brief Split a vector store into multiple scalar stores.
   /// \returns The resulting chain.
 
-  SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
   SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
 
-  SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
-                                  unsigned BitsDiff,
-                                  SelectionDAG &DAG) const;
+  SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
+  SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
 protected:
   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
   static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
 
-  /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
-  /// MachineFunction.
-  ///
-  /// \returns a RegisterSDNode representing Reg.
-  virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
-                                       const TargetRegisterClass *RC,
-                                       unsigned Reg, EVT VT) const;
-  SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
-                             SelectionDAG &DAG) const;
-  /// \brief Split a vector load into multiple scalar loads.
-  SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
+  virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                                     SelectionDAG &DAG) const;
+
+  /// \brief Split a vector load into a scalar load of each component.
+  SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+
+  /// \brief Split a vector load into 2 loads of half the vector.
+  SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+
+  /// \brief Split a vector store into a scalar store of each component.
+  SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const;
+
+  /// \brief Split a vector store into 2 stores of half the vector.
   SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
+  void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &Results) const;
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
 
@@ -138,7 +142,23 @@
 
   SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const;
+  SDValue CombineFMinMax(SDLoc DL,
+                         EVT VT,
+                         SDValue LHS,
+                         SDValue RHS,
+                         SDValue True,
+                         SDValue False,
+                         SDValue CC,
+                         SelectionDAG &DAG) const;
+  SDValue CombineIMinMax(SDLoc DL,
+                         EVT VT,
+                         SDValue LHS,
+                         SDValue RHS,
+                         SDValue True,
+                         SDValue False,
+                         SDValue CC,
+                         SelectionDAG &DAG) const;
+
   const char* getTargetNodeName(unsigned Opcode) const override;
 
   virtual SDNode *PostISelFolding(MachineSDNode *N,
@@ -155,10 +175,16 @@
                                      const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
 
-  virtual unsigned ComputeNumSignBitsForTargetNode(
-    SDValue Op,
-    const SelectionDAG &DAG,
-    unsigned Depth = 0) const override;
+  unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG,
+                                           unsigned Depth = 0) const override;
+
+  /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+  /// MachineFunction.
+  ///
+  /// \returns a RegisterSDNode representing Reg.
+  virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Reg, EVT VT) const;
 };
 
 namespace AMDGPUISD {
@@ -174,17 +200,24 @@
   DWORDADDR,
   FRACT,
   CLAMP,
+  MAD, // Multiply + add with same result as the separate operations.
 
   // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
   // Denormals handled on some parts.
   COS_HW,
   SIN_HW,
-  FMAX,
+  FMAX_LEGACY,
   SMAX,
   UMAX,
-  FMIN,
+  FMIN_LEGACY,
   SMIN,
   UMIN,
+  FMAX3,
+  SMAX3,
+  UMAX3,
+  FMIN3,
+  SMIN3,
+  UMIN3,
   URECIP,
   DIV_SCALE,
   DIV_FMAS,
@@ -197,6 +230,7 @@
   RSQ,
   RSQ_LEGACY,
   RSQ_CLAMPED,
+  LDEXP,
   DOT4,
   BFE_U32, // Extract range of bits with zero extension to 32-bits.
   BFE_I32, // Extract range of bits with sign extension to 32-bits.
@@ -232,6 +266,8 @@
   /// T2|v.z| | | |
   /// T3|v.w| | | |
   BUILD_VERTICAL_VECTOR,
+  /// Pointer to the start of the shader's constant data.
+  CONST_DATA_PTR,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   STORE_MSKOR,
   LOAD_CONSTANT,
@@ -244,4 +280,4 @@
 
 } // End namespace llvm
 
-#endif // AMDGPUISELLOWERING_H
+#endif

diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index fef5b8c..a8fc614 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp

@@ -86,21 +86,6 @@
 // TODO: Implement this function
   return nullptr;
 }
-bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
-                                        MachineBasicBlock &MBB) const {
-  while (iter != MBB.end()) {
-    switch (iter->getOpcode()) {
-    default:
-      break;
-    case AMDGPU::BRANCH_COND_i32:
-    case AMDGPU::BRANCH_COND_f32:
-    case AMDGPU::BRANCH:
-      return true;
-    };
-    ++iter;
-  }
-  return false;
-}
 
 void
 AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -147,7 +132,6 @@
   } else if (isRegisterStore(*MI)) {
     int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                               AMDGPU::OpName::val);
-    AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
     unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
     unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
     unsigned Address = calculateIndirectAddress(RegIndex, Channel);
@@ -215,15 +199,30 @@
   return 0;
 }
 
-bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
-                                             int64_t Offset1, int64_t Offset2,
-                                             unsigned NumLoads) const {
-  assert(Offset2 > Offset1
-         && "Second offset should be larger than first offset!");
-  // If we have less than 16 loads in a row, and the offsets are within 16,
-  // then schedule together.
-  // TODO: Make the loads schedule near if it fits in a cacheline
-  return (NumLoads < 16 && (Offset2 - Offset1) < 16);
+bool AMDGPUInstrInfo::enableClusterLoads() const {
+  return true;
+}
+
+// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into two 16-store batches.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to deal with saying different
+// address space loads should be clustered, and ones which might cause bank
+// conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+                                              int64_t Offset0, int64_t Offset1,
+                                              unsigned NumLoads) const {
+  assert(Offset1 > Offset0 &&
+         "Second offset should be larger than first offset!");
+  // If we have at most 16 loads in a row, and the offsets are less than 64
+  // bytes apart, then schedule together.
+
+  // A cacheline is 64 bytes (for global memory).
+  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
 }
 
 bool
@@ -320,7 +319,10 @@
     return -1;
   }
 
-  Offset = MF.getTarget().getFrameLowering()->getFrameIndexOffset(MF, -1);
+  Offset = MF.getTarget()
+               .getSubtargetImpl()
+               ->getFrameLowering()
+               ->getFrameIndexOffset(MF, -1);
 
   return getIndirectIndexBegin(MF) + Offset;
 }
@@ -335,7 +337,7 @@
 }
 
 // Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
-// header files, so we need to wrap it in a function that takes unsigned 
+// header files, so we need to wrap it in a function that takes unsigned
 // instead.
 namespace llvm {
 namespace AMDGPU {

diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 95dc8c1..da9833d 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h

@@ -13,10 +13,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPUINSTRUCTIONINFO_H
-#define AMDGPUINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H
+#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H
 
-#include "AMDGPUInstrInfo.h"
 #include "AMDGPURegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include <map>
@@ -41,8 +40,6 @@
 class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
 private:
   const AMDGPURegisterInfo RI;
-  bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
-                          MachineBasicBlock &MBB) const;
   virtual void anchor();
 protected:
   const AMDGPUSubtarget &ST;
@@ -74,11 +71,6 @@
                         LiveVariables *LV) const override;
 
 
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const = 0;
-
   bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 
   void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -101,6 +93,7 @@
                                       MachineInstr *MI,
                                       const SmallVectorImpl<unsigned> &Ops,
                                       MachineInstr *LoadMI) const override;
+public:
   /// \returns the smallest register index that will be accessed by an indirect
   /// read or write or -1 if indirect addressing is not used by this program.
   int getIndirectIndexBegin(const MachineFunction &MF) const;
@@ -109,7 +102,6 @@
   /// read or write or -1 if indirect addressing is not used by this program.
   int getIndirectIndexEnd(const MachineFunction &MF) const;
 
-public:
   bool canFoldMemoryOperand(const MachineInstr *MI,
                            const SmallVectorImpl<unsigned> &Ops) const override;
   bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
@@ -120,6 +112,9 @@
   unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
                                bool UnfoldLoad, bool UnfoldStore,
                                unsigned *LoadRegIndex = nullptr) const override;
+
+  bool enableClusterLoads() const override;
+
   bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                int64_t Offset1, int64_t Offset2,
                                unsigned NumLoads) const override;
@@ -144,7 +139,6 @@
 // Pure virtual funtions to be implemented by sub-classes.
 //===---------------------------------------------------------------------===//
 
-  virtual unsigned getIEQOpcode() const = 0;
   virtual bool isMov(unsigned opcode) const = 0;
 
   /// \brief Calculate the "Indirect Address" for the given \p RegIndex and
@@ -197,4 +191,4 @@
 #define AMDGPU_FLAG_REGISTER_LOAD  (UINT64_C(1) << 63)
 #define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62)
 
-#endif // AMDGPUINSTRINFO_H
+#endif

diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index 934d59d..4ee0f2b 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td

@@ -23,6 +23,10 @@
   [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
 >;
 
+def AMDGPULdExpOp : SDTypeProfile<1, 2,
+  [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
+>;
+
 def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
   [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
 >;
@@ -34,6 +38,9 @@
 // This argument to this node is a dword address.
 def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
 
+def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
+def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
+
 // out = a - floor(a)
 def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
 
@@ -49,12 +56,18 @@
 // out = 1.0 / sqrt(a) result clamped to +/- max_float.
 def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
 
-// out = max(a, b) a and b are floats
-def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
-  [SDNPCommutative, SDNPAssociative]
+def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+
+// out = max(a, b) a and b are floats, where a nan comparison fails.
+// This is not commutative because this gives the second operand:
+//   x < nan ? x : nan -> nan
+//   nan < x ? nan : x -> x
+def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
+  [SDNPAssociative]
 >;
 
 def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>;
 
 // out = max(a, b) a and b are signed ints
 def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
@@ -66,12 +79,12 @@
   [SDNPCommutative, SDNPAssociative]
 >;
 
-// out = min(a, b) a and b are floats
-def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp,
-  [SDNPCommutative, SDNPAssociative]
+// out = min(a, b) a and b are floats, where a nan comparison fails.
+def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
+  [SDNPAssociative]
 >;
 
-// out = min(a, b) a snd b are signed ints
+// out = min(a, b) a and b are signed ints
 def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
   [SDNPCommutative, SDNPAssociative]
 >;
@@ -81,6 +94,37 @@
   [SDNPCommutative, SDNPAssociative]
 >;
 
+// FIXME: TableGen doesn't like commutative instructions with more
+// than 2 operands.
+// out = max(a, b, c) a, b and c are floats
+def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b, and c are signed ints
+def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = max(a, b, c) a, b and c are unsigned ints
+def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are floats
+def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are signed ints
+def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
+// out = min(a, b, c) a, b and c are unsigned ints
+def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp,
+  [/*SDNPCommutative, SDNPAssociative*/]
+>;
 
 def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
   SDTIntToFPOp, []>;
@@ -127,7 +171,7 @@
 // MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src)
 //
 // src0: vec4(src, 0, 0, mask)
-// src1: dst - rat offset (aka pointer) in dwords  
+// src1: dst - rat offset (aka pointer) in dwords
 def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
                         SDTypeProfile<0, 2, []>,
                         [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index b86b781..c215865 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td

@@ -23,6 +23,8 @@
   let Pattern = pattern;
   let Itinerary = NullALU;
 
+  let isCodeGenOnly = 1;
+
   let TSFlags{63} = isRegisterLoad;
   let TSFlags{62} = isRegisterStore;
 }
@@ -34,9 +36,15 @@
 
 }
 
+def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
+def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">;
+def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
+
 def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
 def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
 
+let OperandType = "OPERAND_IMMEDIATE" in {
+
 def u32imm : Operand<i32> {
   let PrintMethod = "printU32ImmOperand";
 }
@@ -49,6 +57,8 @@
   let PrintMethod = "printU8ImmOperand";
 }
 
+} // End OperandType = "OPERAND_IMMEDIATE"
+
 //===--------------------------------------------------------------------===//
 // Custom Operands
 //===--------------------------------------------------------------------===//
@@ -125,13 +135,35 @@
 
 def COND_NULL : PatLeaf <
   (cond),
-  [{return false;}]
+  [{(void)N; return false;}]
 >;
 
 //===----------------------------------------------------------------------===//
 // Load/Store Pattern Fragments
 //===----------------------------------------------------------------------===//
 
+class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+}]>;
+
+class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
+  (ops node:$ptr), (op node:$ptr)
+>;
+
+class PrivateStore <SDPatternOperator op> : PrivateMemOp <
+  (ops node:$value, node:$ptr), (op node:$value, node:$ptr)
+>;
+
+def extloadi8_private : PrivateLoad <extloadi8>;
+def sextloadi8_private : PrivateLoad <sextloadi8>;
+def extloadi16_private : PrivateLoad <extloadi16>;
+def sextloadi16_private : PrivateLoad <sextloadi16>;
+def load_private : PrivateLoad <load>;
+
+def truncstorei8_private : PrivateStore <truncstorei8>;
+def truncstorei16_private : PrivateStore <truncstorei16>;
+def store_private : PrivateStore <store>;
+
 def global_store : PatFrag<(ops node:$val, node:$ptr),
     (store node:$val, node:$ptr), [{
         return isGlobalStore(dyn_cast<StoreSDNode>(N));
@@ -165,6 +197,14 @@
     return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+    return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+    return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
 def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
     return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
@@ -193,6 +233,14 @@
     return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+    return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+    return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
 def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
     return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
@@ -218,6 +266,11 @@
   return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def az_extloadi32_flat : PatFrag<(ops node:$ptr),
+                                   (az_extloadi32 node:$ptr), [{
+  return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
 def az_extloadi32_constant : PatFrag<(ops node:$ptr),
                                      (az_extloadi32 node:$ptr), [{
   return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
@@ -233,6 +286,16 @@
   return isGlobalStore(dyn_cast<StoreSDNode>(N));
 }]>;
 
+def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei8 node:$val, node:$ptr), [{
+  return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei16 node:$val, node:$ptr), [{
+  return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
 def local_store : PatFrag<(ops node:$val, node:$ptr),
                              (store node:$val, node:$ptr), [{
   return isLocalStore(dyn_cast<StoreSDNode>(N));
@@ -252,6 +315,17 @@
     return isLocalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+    return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
+}]>;
+
+def local_load_aligned8bytes : Aligned8Bytes <
+  (ops node:$ptr), (local_load node:$ptr)
+>;
+
+def local_store_aligned8bytes : Aligned8Bytes <
+  (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr)
+>;
 
 class local_binary_atomic_op<SDNode atomic_op> :
   PatFrag<(ops node:$ptr, node:$value),
@@ -277,6 +351,7 @@
   return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
 }]>;
 
+
 def atomic_cmp_swap_32_local :
   PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
           (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
@@ -293,6 +368,45 @@
          AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
 }]>;
 
+def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isFlatLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def flat_store : PatFrag<(ops node:$val, node:$ptr),
+                         (store node:$val, node:$ptr), [{
+  return isFlatStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
+                            (AMDGPUstore_mskor node:$val, node:$ptr), [{
+  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
+}]>;
+
+class global_binary_atomic_op<SDNode atomic_op> : PatFrag<
+  (ops node:$ptr, node:$value),
+  (atomic_op node:$ptr, node:$value),
+  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]
+>;
+
+def atomic_swap_global : global_binary_atomic_op<atomic_swap>;
+def atomic_add_global : global_binary_atomic_op<atomic_load_add>;
+def atomic_and_global : global_binary_atomic_op<atomic_load_and>;
+def atomic_max_global : global_binary_atomic_op<atomic_load_max>;
+def atomic_min_global : global_binary_atomic_op<atomic_load_min>;
+def atomic_or_global : global_binary_atomic_op<atomic_load_or>;
+def atomic_sub_global : global_binary_atomic_op<atomic_load_sub>;
+def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
+def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
+def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
+
+//===----------------------------------------------------------------------===//
+// Misc Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def fmad : PatFrag <
+  (ops node:$src0, node:$src1, node:$src2),
+  (fadd (fmul node:$src0, node:$src1), node:$src2)
+>;
 
 class Constants {
 int TWO_PI = 0x40c90fdb;
@@ -412,8 +526,9 @@
 
 // BFI_INT patterns
 
-multiclass BFIPatterns <Instruction BFI_INT, Instruction LoadImm32> {
-
+multiclass BFIPatterns <Instruction BFI_INT,
+                        Instruction LoadImm32,
+                        RegisterClass RC64> {
   // Definition from ISA doc:
   // (y & x) | (z & ~x)
   def : Pat <
@@ -435,8 +550,8 @@
 
   def : Pat <
     (f64 (fcopysign f64:$src0, f64:$src1)),
-      (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-      (i32 (EXTRACT_SUBREG $src0, sub0)), sub0),
+    (REG_SEQUENCE RC64,
+      (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
       (BFI_INT (LoadImm32 0x7fffffff),
                (i32 (EXTRACT_SUBREG $src0, sub1)),
                (i32 (EXTRACT_SUBREG $src1, sub1))), sub1)

diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
index 58916a9..e94bb60 100644
--- a/lib/Target/R600/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/R600/AMDGPUIntrinsicInfo.cpp

@@ -24,7 +24,7 @@
 #include "AMDGPUGenIntrinsics.inc"
 #undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
 
-AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo()
     : TargetIntrinsicInfo() {}
 
 std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,

diff --git a/lib/Target/R600/AMDGPUIntrinsicInfo.h b/lib/Target/R600/AMDGPUIntrinsicInfo.h
index 5be68a2..4c95b5e 100644
--- a/lib/Target/R600/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/R600/AMDGPUIntrinsicInfo.h

@@ -11,8 +11,8 @@
 /// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
 //
 //===-----------------------------------------------------------------------===//
-#ifndef AMDGPU_INTRINSICINFO_H
-#define AMDGPU_INTRINSICINFO_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
+#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H
 
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
@@ -33,7 +33,7 @@
 
 class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
 public:
-  AMDGPUIntrinsicInfo(TargetMachine *tm);
+  AMDGPUIntrinsicInfo();
   std::string getName(unsigned IntrId, Type **Tys = nullptr,
                       unsigned numTys = 0) const override;
   unsigned lookupName(const char *Name, unsigned Len) const override;
@@ -45,4 +45,4 @@
 
 } // end namespace llvm
 
-#endif // AMDGPU_INTRINSICINFO_H
+#endif

diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index d934676..eee9c29 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td

@@ -13,9 +13,6 @@
 
 let TargetPrefix = "AMDGPU", isTarget = 1 in {
 
-  def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
   def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;

diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index ac82e88..bca027f 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp

@@ -22,7 +22,9 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectStreamer.h"
@@ -77,6 +79,20 @@
     case MachineOperand::MO_MachineBasicBlock:
       MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
                                    MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress: {
+      const GlobalValue *GV = MO.getGlobal();
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(GV->getName()));
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(Sym, Ctx));
+      break;
+    }
+    case MachineOperand::MO_TargetIndex: {
+      assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
+      const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      MCOp = MCOperand::CreateExpr(Expr);
+      break;
+    }
     }
     OutMI.addOperand(MCOp);
   }
@@ -88,7 +104,7 @@
 
 #ifdef _DEBUG
   StringRef Err;
-  if (!TM.getInstrInfo()->verifyInstruction(MI, Err)) {
+  if (!TM.getSubtargetImpl()->getInstrInfo()->verifyInstruction(MI, Err)) {
     errs() << "Warning: Illegal instruction detected: " << Err << "\n";
     MI->dump();
   }
@@ -112,8 +128,9 @@
       std::string &DisasmLine = DisasmLines.back();
       raw_string_ostream DisasmStream(DisasmLine);
 
-      AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *TM.getInstrInfo(),
-                                    *TM.getRegisterInfo());
+      AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(),
+                                    *TM.getSubtargetImpl()->getInstrInfo(),
+                                    *TM.getSubtargetImpl()->getRegisterInfo());
       InstPrinter.printInst(&TmpInst, DisasmStream, StringRef());
 
       // Disassemble instruction/operands to hex representation.

diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index 58fe34d..00d1f1b 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h

@@ -8,8 +8,8 @@
 /// \file
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPU_MCINSTLOWER_H
-#define AMDGPU_MCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H
+#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H
 
 namespace llvm {
 
@@ -45,4 +45,4 @@
 
 } // End namespace llvm
 
-#endif //AMDGPU_MCINSTLOWER_H
+#endif

diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index 14171f4..0f3f9e2 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++ b/lib/Target/R600/AMDGPUMachineFunction.cpp

@@ -10,9 +10,11 @@
 void AMDGPUMachineFunction::anchor() {}
 
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
-    MachineFunctionInfo() {
-  ShaderType = ShaderType::COMPUTE;
-  LDSSize = 0;
+  MachineFunctionInfo(),
+  ShaderType(ShaderType::COMPUTE),
+  LDSSize(0),
+  ScratchSize(0),
+  IsKernel(true) {
   AttributeSet Set = MF.getFunction()->getAttributes();
   Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
                                  ShaderTypeAttribute);

diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h
index fea0b39..f5e4694 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.h
+++ b/lib/Target/R600/AMDGPUMachineFunction.h

@@ -10,8 +10,8 @@
 /// \file
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPUMACHINEFUNCTION_H
-#define AMDGPUMACHINEFUNCTION_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
+#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H
 
 #include "llvm/CodeGen/MachineFunction.h"
 #include <map>
@@ -20,15 +20,26 @@
 
 class AMDGPUMachineFunction : public MachineFunctionInfo {
   virtual void anchor();
+  unsigned ShaderType;
+
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
-  unsigned ShaderType;
   /// A map to keep track of local memory objects and their offsets within
   /// the local memory space.
   std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
   /// Number of bytes in the LDS that are being used.
   unsigned LDSSize;
+
+  /// Start of implicit kernel args
+  unsigned ABIArgOffset;
+
+  unsigned getShaderType() const {
+    return ShaderType;
+  }
+
+  unsigned ScratchSize;
+  bool IsKernel;
 };
 
 }
-#endif // AMDGPUMACHINEFUNCTION_H
+#endif

diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
index 218750d..b81fef4 100644
--- a/lib/Target/R600/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp

@@ -36,11 +36,9 @@
 public:
   AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
                                                    LocalMemAvailable(0) { }
-  virtual bool doInitialization(Module &M);
-  virtual bool runOnFunction(Function &F);
-  virtual const char *getPassName() const {
-    return "AMDGPU Promote Alloca";
-  }
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+  const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
   void visitAlloca(AllocaInst &I);
 };
 
@@ -107,14 +105,16 @@
                          ArrayTy->getArrayNumElements());
 }
 
-static Value* calculateVectorIndex(Value *Ptr,
-                                  std::map<GetElementPtrInst*, Value*> GEPIdx) {
+static Value *
+calculateVectorIndex(Value *Ptr,
+                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
   if (isa<AllocaInst>(Ptr))
     return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
 
   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
 
-  return GEPIdx[GEP];
+  auto I = GEPIdx.find(GEP);
+  return I == GEPIdx.end() ? nullptr : I->second;
 }
 
 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
@@ -234,7 +234,8 @@
   return true;
 }
 
-static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+  bool Success = true;
   for (User *User : Val->users()) {
     if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
       continue;
@@ -242,11 +243,20 @@
       WorkList.push_back(User);
       continue;
     }
+
+    // FIXME: Correctly handle ptrtoint instructions.
+    Instruction *UseInst = dyn_cast<Instruction>(User);
+    if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
+      return false;
+
     if (!User->getType()->isPointerTy())
       continue;
+
     WorkList.push_back(User);
-    collectUsesWithPtrTypes(User, WorkList);
+
+    Success &= collectUsesWithPtrTypes(User, WorkList);
   }
+  return Success;
 }
 
 void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
@@ -274,6 +284,13 @@
     return;
   }
 
+  std::vector<Value*> WorkList;
+
+  if (!collectUsesWithPtrTypes(&I, WorkList)) {
+    DEBUG(dbgs() << " Do not know how to convert all uses\n");
+    return;
+  }
+
   DEBUG(dbgs() << "Promoting alloca to local memory\n");
   LocalMemAvailable -= AllocaSize;
 
@@ -320,10 +337,6 @@
   I.replaceAllUsesWith(Offset);
   I.eraseFromParent();
 
-  std::vector<Value*> WorkList;
-
-  collectUsesWithPtrTypes(Offset, WorkList);
-
   for (std::vector<Value*>::iterator i = WorkList.begin(),
                                      e = WorkList.end(); i != e; ++i) {
     Value *V = *i;
@@ -331,6 +344,13 @@
     if (!Call) {
       Type *EltTy = V->getType()->getPointerElementType();
       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+
+      // The operand's value should be corrected on its own.
+      if (isa<AddrSpaceCastInst>(V))
+        continue;
+
+      // FIXME: It doesn't really make sense to try to do this for all
+      // instructions.
       V->mutateType(NewTy);
       continue;
     }

diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 4731595..f27576a 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPUREGISTERINFO_H
-#define AMDGPUREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H
+#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H
 
 #include "llvm/ADT/BitVector.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -62,4 +62,4 @@
 
 } // End namespace llvm
 
-#endif // AMDIDSAREGISTERINFO_H
+#endif

diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index b83c290..9d09a19 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp

@@ -13,8 +13,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUSubtarget.h"
+#include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
+#include "R600MachineScheduler.h"
 #include "SIInstrInfo.h"
+#include "SIISelLowering.h"
+#include "llvm/ADT/SmallString.h"
+
+#include "llvm/ADT/SmallString.h"
 
 using namespace llvm;
 
@@ -25,29 +31,66 @@
 #define GET_SUBTARGETINFO_CTOR
 #include "AMDGPUGenSubtargetInfo.inc"
 
-AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS) :
-  AMDGPUGenSubtargetInfo(TT, GPU, FS),
-  DevName(GPU),
-  Is64bit(false),
-  DumpCode(false),
-  R600ALUInst(false),
-  HasVertexCache(false),
-  TexVTXClauseSize(0),
-  Gen(AMDGPUSubtarget::R600),
-  FP64(false),
-  CaymanISA(false),
-  EnableIRStructurizer(true),
-  EnableIfCvt(true),
-  WavefrontSize(0),
-  CFALUBug(false),
-  LocalMemorySize(0),
-  InstrItins(getInstrItineraryForCPU(GPU)) {
-  ParseSubtargetFeatures(GPU, FS);
+static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
+  std::string Ret = "e-p:32:32";
 
+  if (ST.is64bit()) {
+    // 32-bit private, local, and region pointers. 64-bit global and constant.
+    Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
+  }
+
+  Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
+         "-v512:512-v1024:1024-v2048:2048-n32:64";
+
+  return Ret;
+}
+
+AMDGPUSubtarget &
+AMDGPUSubtarget::initializeSubtargetDependencies(StringRef GPU, StringRef FS) {
+  // Determine default and user-specified characteristics
+  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
+  // enabled, but some instructions do not respect them and they run at the
+  // double precision rate, so don't enable by default.
+  //
+  // We want to be able to turn these off, but making this a subtarget feature
+  // for SI has the unhelpful behavior that it unsets everything else if you
+  // disable it.
+
+  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+  FullFS += FS;
+
+  ParseSubtargetFeatures(GPU, FullFS);
+
+  // FIXME: I don't think Evergreen has any useful support for
+  // denormals, but should be checked. Should we issue a warning somewhere
+  // if someone tries to enable these?
+  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    FP32Denormals = false;
+    FP64Denormals = false;
+  }
+  return *this;
+}
+
+AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
+                                 TargetMachine &TM)
+    : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false),
+      DumpCode(false), R600ALUInst(false), HasVertexCache(false),
+      TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
+      FP64Denormals(false), FP32Denormals(false), CaymanISA(false),
+      FlatAddressSpace(false), EnableIRStructurizer(true),
+      EnablePromoteAlloca(false), EnableIfCvt(true),
+      EnableLoadStoreOpt(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
+      DL(computeDataLayout(initializeSubtargetDependencies(GPU, FS))),
+      FrameLowering(TargetFrameLowering::StackGrowsUp,
+                    64 * 16, // Maximum stack alignment (long16)
+                    0),
+      InstrItins(getInstrItineraryForCPU(GPU)) {
   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
     InstrInfo.reset(new R600InstrInfo(*this));
+    TLInfo.reset(new R600TargetLowering(TM));
   } else {
     InstrInfo.reset(new SIInstrInfo(*this));
+    TLInfo.reset(new SITargetLowering(TM));
   }
 }
 

diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 0c388b3..f71d80a 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h

@@ -12,10 +12,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPUSUBTARGET_H
-#define AMDGPUSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
+#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H
 #include "AMDGPU.h"
+#include "AMDGPUFrameLowering.h"
 #include "AMDGPUInstrInfo.h"
+#include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "R600ISelLowering.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -23,14 +28,10 @@
 #define GET_SUBTARGETINFO_HEADER
 #include "AMDGPUGenSubtargetInfo.inc"
 
-#define MAX_CB_SIZE (1 << 16)
-
 namespace llvm {
 
 class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
 
-  std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
-
 public:
   enum Generation {
     R600 = 0,
@@ -50,24 +51,43 @@
   short TexVTXClauseSize;
   Generation Gen;
   bool FP64;
+  bool FP64Denormals;
+  bool FP32Denormals;
   bool CaymanISA;
+  bool FlatAddressSpace;
   bool EnableIRStructurizer;
+  bool EnablePromoteAlloca;
   bool EnableIfCvt;
+  bool EnableLoadStoreOpt;
   unsigned WavefrontSize;
   bool CFALUBug;
   int LocalMemorySize;
 
+  const DataLayout DL;
+  AMDGPUFrameLowering FrameLowering;
+  std::unique_ptr<AMDGPUTargetLowering> TLInfo;
+  std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
   InstrItineraryData InstrItins;
 
 public:
-  AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
+  AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM);
+  AMDGPUSubtarget &initializeSubtargetDependencies(StringRef GPU, StringRef FS);
 
-  const AMDGPUInstrInfo *getInstrInfo() const {
+  const AMDGPUFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const AMDGPUInstrInfo *getInstrInfo() const override {
     return InstrInfo.get();
   }
-
-  const InstrItineraryData &getInstrItineraryData() const {
-    return InstrItins;
+  const AMDGPURegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo->getRegisterInfo();
+  }
+  AMDGPUTargetLowering *getTargetLowering() const override {
+    return TLInfo.get();
+  }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
   }
 
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -81,7 +101,7 @@
   }
 
   short getTexVTXClauseSize() const {
-      return TexVTXClauseSize;
+    return TexVTXClauseSize;
   }
 
   Generation getGeneration() const {
@@ -96,6 +116,18 @@
     return CaymanISA;
   }
 
+  bool hasFP32Denormals() const {
+    return FP32Denormals;
+  }
+
+  bool hasFP64Denormals() const {
+    return FP64Denormals;
+  }
+
+  bool hasFlatAddressSpace() const {
+    return FlatAddressSpace;
+  }
+
   bool hasBFE() const {
     return (getGeneration() >= EVERGREEN);
   }
@@ -112,8 +144,10 @@
     if (Size == 32)
       return (getGeneration() >= EVERGREEN);
 
-    assert(Size == 64);
-    return (getGeneration() >= SOUTHERN_ISLANDS);
+    if (Size == 64)
+      return (getGeneration() >= SOUTHERN_ISLANDS);
+
+    return false;
   }
 
   bool hasMulU24() const {
@@ -125,14 +159,30 @@
             hasCaymanISA());
   }
 
+  bool hasFFBL() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasFFBH() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
   bool IsIRStructurizerEnabled() const {
     return EnableIRStructurizer;
   }
 
+  bool isPromoteAllocaEnabled() const {
+    return EnablePromoteAlloca;
+  }
+
   bool isIfCvtEnabled() const {
     return EnableIfCvt;
   }
 
+  bool loadStoreOptEnabled() const {
+    return EnableLoadStoreOpt;
+  }
+
   unsigned getWavefrontSize() const {
     return WavefrontSize;
   }
@@ -171,4 +221,4 @@
 
 } // End namespace llvm
 
-#endif // AMDGPUSUBTARGET_H
+#endif

diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 8aab944..b2cd988 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp

@@ -22,6 +22,7 @@
 #include "SIInstrInfo.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Verifier.h"
@@ -33,7 +34,6 @@
 #include "llvm/Transforms/Scalar.h"
 #include <llvm/CodeGen/Passes.h>
 
-
 using namespace llvm;
 
 extern "C" void LLVMInitializeR600Target() {
@@ -49,46 +49,20 @@
 SchedCustomRegistry("r600", "Run R600's custom scheduler",
                     createR600MachineScheduler);
 
-static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
-  std::string Ret = "e-p:32:32";
-
-  if (ST.is64bit()) {
-    // 32-bit private, local, and region pointers. 64-bit global and constant.
-    Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
-  }
-
-  Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
-         "-v512:512-v1024:1024-v2048:2048-n32:64";
-
-  return Ret;
-}
-
 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
-    StringRef CPU, StringRef FS,
-  TargetOptions Options,
-  Reloc::Model RM, CodeModel::Model CM,
-  CodeGenOpt::Level OptLevel
-)
-:
-  LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
-  Subtarget(TT, CPU, FS),
-  Layout(computeDataLayout(Subtarget)),
-  FrameLowering(TargetFrameLowering::StackGrowsUp,
-                64 * 16 // Maximum stack alignment (long16)
-               , 0),
-  IntrinsicInfo(this),
-  InstrItins(&Subtarget.getInstrItineraryData()) {
-  // TLInfo uses InstrInfo so it must be initialized after.
-  if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
-    TLInfo.reset(new R600TargetLowering(*this));
-  } else {
-    TLInfo.reset(new SITargetLowering(*this));
-  }
+                                         StringRef CPU, StringRef FS,
+                                         TargetOptions Options, Reloc::Model RM,
+                                         CodeModel::Model CM,
+                                         CodeGenOpt::Level OptLevel)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
+      TLOF(new TargetLoweringObjectFileELF()),
+      Subtarget(TT, CPU, FS, *this), IntrinsicInfo() {
   setRequiresStructuredCFG(true);
   initAsmInfo();
 }
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() {
+  delete TLOF;
 }
 
 namespace {
@@ -109,7 +83,8 @@
     return nullptr;
   }
 
-  virtual void addCodeGenPrepare();
+  void addIRPasses() override;
+  void addCodeGenPrepare() override;
   bool addPreISel() override;
   bool addInstSelector() override;
   bool addPreRegAlloc() override;
@@ -135,10 +110,26 @@
   PM.add(createAMDGPUTargetTransformInfoPass(this));
 }
 
+void AMDGPUPassConfig::addIRPasses() {
+  // Function calls are not supported, so make sure we inline everything.
+  addPass(createAMDGPUAlwaysInlinePass());
+  addPass(createAlwaysInlinerPass());
+  // We need to add the barrier noop pass, otherwise adding the function
+  // inlining pass will cause all of the PassConfigs passes to be run
+  // one function at a time, which means if we have a module with two
+  // functions, then we will generate code for the first function
+  // without ever running any passes on the second.
+  addPass(createBarrierNoopPass());
+  TargetPassConfig::addIRPasses();
+}
+
 void AMDGPUPassConfig::addCodeGenPrepare() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-  addPass(createAMDGPUPromoteAlloca(ST));
-  addPass(createSROAPass());
+  if (ST.isPromoteAllocaEnabled()) {
+    addPass(createAMDGPUPromoteAlloca(ST));
+    addPass(createSROAPass());
+  }
+
   TargetPassConfig::addCodeGenPrepare();
 }
 
@@ -159,8 +150,15 @@
 }
 
 bool AMDGPUPassConfig::addInstSelector() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
-  addPass(createSILowerI1CopiesPass());
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    addPass(createSILowerI1CopiesPass());
+    addPass(createSIFixSGPRCopiesPass(*TM));
+  }
+
   return false;
 }
 
@@ -170,12 +168,18 @@
   if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
     addPass(createR600VectorRegMerger(*TM));
   } else {
-    addPass(createSIFixSGPRCopiesPass(*TM));
-    // SIFixSGPRCopies can generate a lot of duplicate instructions,
-    // so we need to run MachineCSE afterwards.
-    addPass(&MachineCSEID);
-    initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
-    insertPass(&RegisterCoalescerID, &SIFixSGPRLiveRangesID);
+     if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
+      // Don't do this with no optimizations since it throws away debug info by
+      // merging nonadjacent loads.
+
+      // This should be run after scheduling, but before register allocation. It
+      // also needs extra copies to the address operand to be eliminated.
+      initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
+      insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
+    }
+
+    addPass(createSIShrinkInstructionsPass());
+    addPass(createSIFixSGPRLiveRangesPass());
   }
   return false;
 }
@@ -183,6 +187,7 @@
 bool AMDGPUPassConfig::addPostRegAlloc() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
 
+  addPass(createSIShrinkInstructionsPass());
   if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
     addPass(createSIInsertWaits(*TM));
   }

diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index 3bb15be..1b3dbce 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPU_TARGET_MACHINE_H
-#define AMDGPU_TARGET_MACHINE_H
+#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
+#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H
 
 #include "AMDGPUFrameLowering.h"
 #include "AMDGPUInstrInfo.h"
@@ -25,47 +25,30 @@
 namespace llvm {
 
 class AMDGPUTargetMachine : public LLVMTargetMachine {
-
+  TargetLoweringObjectFile *TLOF;
   AMDGPUSubtarget Subtarget;
-  const DataLayout Layout;
-  AMDGPUFrameLowering FrameLowering;
   AMDGPUIntrinsicInfo IntrinsicInfo;
-  std::unique_ptr<AMDGPUTargetLowering> TLInfo;
-  const InstrItineraryData *InstrItins;
 
 public:
   AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
                       StringRef CPU, TargetOptions Options, Reloc::Model RM,
                       CodeModel::Model CM, CodeGenOpt::Level OL);
   ~AMDGPUTargetMachine();
-  const AMDGPUFrameLowering *getFrameLowering() const override {
-    return &FrameLowering;
+  const AMDGPUSubtarget *getSubtargetImpl() const override {
+    return &Subtarget;
   }
   const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
     return &IntrinsicInfo;
   }
-  const AMDGPUInstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const AMDGPUSubtarget *getSubtargetImpl() const override {
-    return &Subtarget;
-  }
-  const AMDGPURegisterInfo *getRegisterInfo() const override {
-    return &getInstrInfo()->getRegisterInfo();
-  }
-  AMDGPUTargetLowering *getTargetLowering() const override {
-    return TLInfo.get();
-  }
-  const InstrItineraryData *getInstrItineraryData() const override {
-    return InstrItins;
-  }
-  const DataLayout *getDataLayout() const override { return &Layout; }
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
   /// \brief Register R600 analysis passes with a pass manager.
   void addAnalysisPasses(PassManagerBase &PM) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF;
+  }
 };
 
 } // End namespace llvm
 
-#endif // AMDGPU_TARGET_MACHINE_H
+#endif

diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
index ea78f43..e7bc006 100644
--- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp

@@ -52,7 +52,7 @@
 
   AMDGPUTTI(const AMDGPUTargetMachine *TM)
       : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
-        TLI(TM->getTargetLowering()) {
+        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
     initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
   }
 
@@ -74,10 +74,14 @@
 
   bool hasBranchDivergence() const override;
 
-  void getUnrollingPreferences(Loop *L,
+  void getUnrollingPreferences(const Function *F, Loop *L,
                                UnrollingPreferences &UP) const override;
 
-  /// @}
+  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override;
+
+  unsigned getNumberOfRegisters(bool Vector) const override;
+  unsigned getRegisterBitWidth(bool Vector) const override;
+  unsigned getMaxInterleaveFactor() const override;
 };
 
 } // end anonymous namespace
@@ -93,16 +97,20 @@
 
 bool AMDGPUTTI::hasBranchDivergence() const { return true; }
 
-void AMDGPUTTI::getUnrollingPreferences(Loop *L,
+void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L,
                                         UnrollingPreferences &UP) const {
-  for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
-                                                  BI != BE; ++BI) {
-    BasicBlock *BB = *BI;
-    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
-                                                      I != E; ++I) {
-      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I);
-      if (!GEP)
+  UP.Threshold = 300; // Twice the default.
+  UP.Count = UINT_MAX;
+  UP.Partial = true;
+
+  // TODO: Do we want runtime unrolling?
+
+  for (const BasicBlock *BB : L->getBlocks()) {
+    for (const Instruction &I : *BB) {
+      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
+      if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
         continue;
+
       const Value *Ptr = GEP->getPointerOperand();
       const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr));
       if (Alloca) {
@@ -116,8 +124,34 @@
         //
         // Don't use the maximum allowed value here as it will make some
         // programs way too big.
-        UP.Threshold = 500;
+        UP.Threshold = 800;
       }
     }
   }
 }
+
+AMDGPUTTI::PopcntSupportKind
+AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software;
+}
+
+unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const {
+  if (Vec)
+    return 0;
+
+  // Number of VGPRs on SI.
+  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    return 256;
+
+  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
+}
+
+unsigned AMDGPUTTI::getRegisterBitWidth(bool) const {
+  return 32;
+}
+
+unsigned AMDGPUTTI::getMaxInterleaveFactor() const {
+  // Semi-arbitrary large amount.
+  return 64;
+}

diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index f3a0391..ee6e8ec 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp

@@ -11,6 +11,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
 #include "R600InstrInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SmallVector.h"
@@ -160,7 +161,7 @@
   bool prepare();
 
   bool runOnMachineFunction(MachineFunction &MF) override {
-    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
     TRI = &TII->getRegisterInfo();
     DEBUG(MF.dump(););
     OrderedBlks.clear();
@@ -337,7 +338,7 @@
   void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);
 
   MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
-  /// This is work around solution for findNearestCommonDominator not avaiable
+  /// This is work around solution for findNearestCommonDominator not available
   /// to post dom a proper fix should go to Dominators.h.
   MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
       MachineBasicBlock *MBB2);

diff --git a/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp
new file mode 100644
index 0000000..7ad815d
--- /dev/null
+++ b/lib/Target/R600/AsmParser/AMDGPUAsmParser.cpp

@@ -0,0 +1,320 @@
+//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUAsmParser : public MCTargetAsmParser {
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "AMDGPUGenAsmMatcher.inc"
+
+  /// }
+
+public:
+  AMDGPUAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+               const MCInstrInfo &_MII,
+               const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+  OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic);
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool parseCnt(int64_t &IntVal);
+  OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
+};
+
+class AMDGPUOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Immediate
+  } Kind;
+
+public:
+  AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct ImmOp {
+    int64_t Val;
+  };
+
+  union {
+    TokOp Tok;
+    ImmOp Imm;
+  };
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    Inst.addOperand(MCOperand::CreateImm(getImm()));
+  }
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    llvm_unreachable("addRegOperands");
+  }
+  StringRef getToken() const {
+    return StringRef(Tok.Data, Tok.Length);
+  }
+  bool isToken() const override {
+    return Kind == Token;
+  }
+
+  bool isImm() const override {
+    return Kind == Immediate;
+  }
+
+  int64_t getImm() const {
+    return Imm.Val;
+  }
+
+  bool isReg() const override {
+    return false;
+  }
+
+  unsigned getReg() const override {
+    return 0;
+  }
+
+  bool isMem() const override {
+    return false;
+  }
+
+  SMLoc getStartLoc() const override {
+    return SMLoc();
+  }
+
+  SMLoc getEndLoc() const override {
+    return SMLoc();
+  }
+
+  void print(raw_ostream &OS) const override { }
+
+  static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val) {
+    auto Op = llvm::make_unique<AMDGPUOperand>(Immediate);
+    Op->Imm.Val = Val;
+    return Op;
+  }
+
+  static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc) {
+    auto Res = llvm::make_unique<AMDGPUOperand>(Token);
+    Res->Tok.Data = Str.data();
+    Res->Tok.Length = Str.size();
+    return Res;
+  }
+
+  bool isSWaitCnt() const;
+};
+
+}
+
+bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+  return true;
+}
+
+
+bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                              OperandVector &Operands,
+                                              MCStreamer &Out,
+                                              uint64_t &ErrorInfo,
+                                              bool MatchingInlineAsm) {
+  MCInst Inst;
+
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
+    default: break;
+    case Match_Success:
+      Inst.setLoc(IDLoc);
+      Out.EmitInstruction(Inst, STI);
+      return false;
+    case Match_MissingFeature:
+      return Error(IDLoc, "instruction use requires an option to be enabled");
+    case Match_MnemonicFail:
+        return Error(IDLoc, "unrecognized instruction mnemonic");
+    case Match_InvalidOperand: {
+      if (ErrorInfo != ~0ULL) {
+        if (ErrorInfo >= Operands.size())
+          return Error(IDLoc, "too few operands for instruction");
+
+      }
+      return Error(IDLoc, "invalid operand for instruction");
+    }
+  }
+  llvm_unreachable("Implement any new match types added!");
+}
+
+bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
+  return true;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
+
+  // Try to parse with a custom parser
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // If we successfully parsed the operand or if there was an error parsing,
+  // we are done.
+  if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
+    return ResTy;
+
+  switch(getLexer().getKind()) {
+    case AsmToken::Integer: {
+      int64_t IntVal;
+      if (getParser().parseAbsoluteExpression(IntVal))
+        return MatchOperand_ParseFail;
+      Operands.push_back(AMDGPUOperand::CreateImm(IntVal));
+      return MatchOperand_Success;
+    }
+    default:
+      return MatchOperand_NoMatch;
+  }
+}
+
+bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                       StringRef Name,
+                                       SMLoc NameLoc, OperandVector &Operands) {
+  // Add the instruction mnemonic
+  Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc));
+
+  if (getLexer().is(AsmToken::EndOfStatement))
+    return false;
+
+  AMDGPUAsmParser::OperandMatchResultTy Res = parseOperand(Operands, Name);
+  switch (Res) {
+    case MatchOperand_Success: return false;
+    case MatchOperand_ParseFail: return Error(NameLoc,
+                                              "Failed parsing operand");
+    case MatchOperand_NoMatch: return Error(NameLoc, "Not a valid operand");
+  }
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// s_waitcnt
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
+  StringRef CntName = Parser.getTok().getString();
+  int64_t CntVal;
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::LParen))
+    return true;
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::Integer))
+    return true;
+
+  if (getParser().parseAbsoluteExpression(CntVal))
+    return true;
+
+  if (getLexer().isNot(AsmToken::RParen))
+    return true;
+
+  Parser.Lex();
+  if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma))
+    Parser.Lex();
+
+  int CntShift;
+  int CntMask;
+
+  if (CntName == "vmcnt") {
+    CntMask = 0xf;
+    CntShift = 0;
+  } else if (CntName == "expcnt") {
+    CntMask = 0x7;
+    CntShift = 4;
+  } else if (CntName == "lgkmcnt") {
+    CntMask = 0x7;
+    CntShift = 8;
+  } else {
+    return true;
+  }
+
+  IntVal &= ~(CntMask << CntShift);
+  IntVal |= (CntVal << CntShift);
+  return false;
+}
+
+AMDGPUAsmParser::OperandMatchResultTy
+AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
+  // Disable all counters by default.
+  // vmcnt   [3:0]
+  // expcnt  [6:4]
+  // lgkmcnt [10:8]
+  int64_t CntVal = 0x77f;
+
+  switch(getLexer().getKind()) {
+    default: return MatchOperand_ParseFail;
+    case AsmToken::Integer:
+      // The operand can be an integer value.
+      if (getParser().parseAbsoluteExpression(CntVal))
+        return MatchOperand_ParseFail;
+      break;
+
+    case AsmToken::Identifier:
+      do {
+        if (parseCnt(CntVal))
+          return MatchOperand_ParseFail;
+      } while(getLexer().isNot(AsmToken::EndOfStatement));
+      break;
+  }
+  Operands.push_back(AMDGPUOperand::CreateImm(CntVal));
+  return MatchOperand_Success;
+}
+
+bool AMDGPUOperand::isSWaitCnt() const {
+  return isImm();
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeR600AsmParser() {
+  RegisterMCAsmParser<AMDGPUAsmParser> A(TheAMDGPUTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AMDGPUGenAsmMatcher.inc"
+

diff --git a/lib/Target/R600/AsmParser/CMakeLists.txt b/lib/Target/R600/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..1b42af7
--- /dev/null
+++ b/lib/Target/R600/AsmParser/CMakeLists.txt

@@ -0,0 +1,3 @@
+add_llvm_library(LLVMR600AsmParser
+  AMDGPUAsmParser.cpp
+  )

diff --git a/lib/Target/R600/AsmParser/LLVMBuild.txt b/lib/Target/R600/AsmParser/LLVMBuild.txt
new file mode 100644
index 0000000..940e4ce
--- /dev/null
+++ b/lib/Target/R600/AsmParser/LLVMBuild.txt

@@ -0,0 +1,23 @@
+;===- ./lib/Target/R600/AsmParser/LLVMBuild.txt -------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = R600AsmParser
+parent = R600
+required_libraries = MC MCParser R600Desc R600Info Support
+add_to_library_groups = R600

diff --git a/lib/Target/R600/AsmParser/Makefile b/lib/Target/R600/AsmParser/Makefile
new file mode 100644
index 0000000..e6689b5
--- /dev/null
+++ b/lib/Target/R600/AsmParser/Makefile

@@ -0,0 +1,15 @@
+##===- lib/Target/R600/AsmParser/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMR600AsmParser
+
+# Hack: we need to include 'main' R600 target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common

diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 4d16082..ed0a216 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt

@@ -6,13 +6,15 @@
 tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget)
 tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic)
-tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer)
 tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher)
 add_public_tablegen_target(AMDGPUCommonTableGen)
 
 add_llvm_target(R600CodeGen
   AMDILCFGStructurizer.cpp
+  AMDGPUAlwaysInlinePass.cpp
   AMDGPUAsmPrinter.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUIntrinsicInfo.cpp
@@ -44,13 +46,16 @@
   SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
+  SILoadStoreOptimizer.cpp
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
   SIMachineFunctionInfo.cpp
   SIRegisterInfo.cpp
+  SIShrinkInstructions.cpp
   SITypeRewriter.cpp
   )
 
+add_subdirectory(AsmParser)
 add_subdirectory(InstPrinter)
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)

diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td
index 2630345..58b5ce2 100644
--- a/lib/Target/R600/CaymanInstructions.td
+++ b/lib/Target/R600/CaymanInstructions.td

@@ -46,6 +46,8 @@
 def COS_cm : COS_Common<0x8E>;
 } // End isVector = 1
 
+defm : RsqPat<RECIPSQRT_IEEE_cm, f32>;
+
 def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
 
 defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;

diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
index dcb7e98..f24f76b 100644
--- a/lib/Target/R600/EvergreenInstructions.td
+++ b/lib/Target/R600/EvergreenInstructions.td

@@ -69,6 +69,7 @@
 def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
 def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
 def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
+defm : RsqPat<RECIPSQRT_IEEE_eg, f32>;
 def SIN_eg : SIN_Common<0x8D>;
 def COS_eg : COS_Common<0x8E>;
 
@@ -256,6 +257,12 @@
 
 let Predicates = [isEGorCayman] in {
 
+// Should be predicated on FeatureFP64
+// def FMA_64 : R600_3OP <
+//   0xA, "FMA_64",
+//   [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
+// >;
+
 // BFE_UINT - bit_extract, an optimization for mask and shift
 // Src0 = Input
 // Src1 = Offset
@@ -295,7 +302,7 @@
 def : Pat<(i32 (sext_inreg i32:$src, i16)),
   (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
 
-defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32>;
+defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>;
 
 def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
   [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))],
@@ -312,6 +319,7 @@
 def : ROTRPattern <BIT_ALIGN_INT_eg>;
 def MULADD_eg : MULADD_Common<0x14>;
 def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
+def FMA_eg : FMA_Common<0x7>;
 def ASHR_eg : ASHR_Common<0x15>;
 def LSHR_eg : LSHR_Common<0x16>;
 def LSHL_eg : LSHL_Common<0x17>;
@@ -328,6 +336,9 @@
 
 def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
 
+def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>;
+def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
+
 let hasSideEffects = 1 in {
   def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>;
 }
@@ -463,21 +474,47 @@
   let DisableEncoding = "$dst";
 }
 
-class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
+class R600_LDS_1A2D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+                     string dst =""> :
     R600_LDS <
-  lds_op,
-  (outs),
+  lds_op, outs,
   (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
        R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
        R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
        LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
-  "  "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
+  "  "#name# "$last "#dst#"$src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
   pattern> {
+
+  field string BaseOp;
+
+  let LDS_1A1D = 0;
   let LDS_1A2D = 1;
 }
 
+class R600_LDS_1A2D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A2D <lds_op, (outs), name, pattern> {
+  let BaseOp = name;
+}
+
+class R600_LDS_1A2D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A2D <lds_op, (outs R600_Reg32:$dst), name, pattern> {
+
+  let BaseOp = name;
+  let usesCustomInserter = 1;
+  let DisableEncoding = "$dst";
+}
+
 def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
 def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
+def LDS_AND : R600_LDS_1A1D_NORET <0x9, "LDS_AND", [] >;
+def LDS_OR : R600_LDS_1A1D_NORET <0xa, "LDS_OR", [] >;
+def LDS_XOR : R600_LDS_1A1D_NORET <0xb, "LDS_XOR", [] >;
+def LDS_WRXCHG: R600_LDS_1A1D_NORET <0xd, "LDS_WRXCHG", [] >;
+def LDS_CMPST: R600_LDS_1A2D_NORET <0x10, "LDS_CMPST", [] >;
+def LDS_MIN_INT : R600_LDS_1A1D_NORET <0x5, "LDS_MIN_INT", [] >;
+def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >;
+def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >;
+def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >;
 def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
   [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
 >;
@@ -493,6 +530,33 @@
 def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
   [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
 >;
+def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND",
+  [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))]
+>;
+def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR",
+  [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))]
+>;
+def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR",
+  [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))]
+>;
+def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT",
+  [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))]
+>;
+def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT",
+  [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))]
+>;
+def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT",
+  [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))]
+>;
+def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT",
+  [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))]
+>;
+def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
+  [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))]
+>;
+def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
+  [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, i32:$src1, i32:$src2))]
+>;
 def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
   [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
 >;
@@ -526,7 +590,7 @@
 // SHA-256 Patterns
 def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
 
-def : FROUNDPat <CNDGE_eg>;
+def : FROUNDPat <CNDGE_eg, CNDGT_eg>;
 
 def EG_ExportSwz : ExportSwzInst {
   let Word1{19-16} = 0; // BURST_COUNT

diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 0927040..64fe726 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp

@@ -9,6 +9,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUInstPrinter.h"
+#include "SIDefines.h"
+
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
@@ -40,6 +42,81 @@
   O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
 }
 
+void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  O << formatDec(MI->getOperand(OpNo).getImm() & 0xff);
+}
+
+void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
+                                              raw_ostream &O) {
+  O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
+}
+
+void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " offen";
+}
+
+void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " idxen";
+}
+
+void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " addr64";
+}
+
+void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm()) {
+    O << " offset:";
+    printU16ImmOperand(MI, OpNo, O);
+  }
+}
+
+void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  uint16_t Imm = MI->getOperand(OpNo).getImm();
+  if (Imm != 0) {
+    O << " offset:";
+    printU16ImmDecOperand(MI, OpNo, O);
+  }
+}
+
+void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  O << " offset0:";
+  printU8ImmDecOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  O << " offset1:";
+  printU8ImmDecOperand(MI, OpNo, O);
+}
+
+void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " glc";
+}
+
+void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " slc";
+}
+
+void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  if (MI->getOperand(OpNo).getImm())
+    O << " tfe";
+}
+
 void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
   switch (reg) {
   case AMDGPU::VCC:
@@ -54,6 +131,27 @@
   case AMDGPU::M0:
     O << "m0";
     return;
+  case AMDGPU::FLAT_SCR:
+    O << "flat_scratch";
+    return;
+  case AMDGPU::VCC_LO:
+    O << "vcc_lo";
+    return;
+  case AMDGPU::VCC_HI:
+    O << "vcc_hi";
+    return;
+  case AMDGPU::EXEC_LO:
+    O << "exec_lo";
+    return;
+  case AMDGPU::EXEC_HI:
+    O << "exec_hi";
+    return;
+  case AMDGPU::FLAT_SCR_LO:
+    O << "flat_scratch_lo";
+    return;
+  case AMDGPU::FLAT_SCR_HI:
+    O << "flat_scratch_hi";
+    return;
   default:
     break;
   }
@@ -117,19 +215,27 @@
     return;
   }
 
-  if (Imm == FloatToBits(1.0f) ||
-      Imm == FloatToBits(-1.0f) ||
-      Imm == FloatToBits(0.5f) ||
-      Imm == FloatToBits(-0.5f) ||
-      Imm == FloatToBits(2.0f) ||
-      Imm == FloatToBits(-2.0f) ||
-      Imm == FloatToBits(4.0f) ||
-      Imm == FloatToBits(-4.0f)) {
-    O << BitsToFloat(Imm);
-    return;
+  if (Imm == FloatToBits(0.0f))
+    O << "0.0";
+  else if (Imm == FloatToBits(1.0f))
+    O << "1.0";
+  else if (Imm == FloatToBits(-1.0f))
+    O << "-1.0";
+  else if (Imm == FloatToBits(0.5f))
+    O << "0.5";
+  else if (Imm == FloatToBits(-0.5f))
+    O << "-0.5";
+  else if (Imm == FloatToBits(2.0f))
+    O << "2.0";
+  else if (Imm == FloatToBits(-2.0f))
+    O << "-2.0";
+  else if (Imm == FloatToBits(4.0f))
+    O << "4.0";
+  else if (Imm == FloatToBits(-4.0f))
+    O << "-4.0";
+  else {
+    O << formatHex(static_cast<uint64_t>(Imm));
   }
-
-  O << formatHex(static_cast<uint64_t>(Imm));
 }
 
 void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -149,25 +255,30 @@
   } else if (Op.isImm()) {
     printImmediate(Op.getImm(), O);
   } else if (Op.isFPImm()) {
-    O << Op.getFPImm();
+
+    // We special case 0.0 because otherwise it will be printed as an integer.
+    if (Op.getFPImm() == 0.0)
+      O << "0.0";
+    else
+      printImmediate(FloatToBits(Op.getFPImm()), O);
   } else if (Op.isExpr()) {
     const MCExpr *Exp = Op.getExpr();
     Exp->print(O);
   } else {
-    assert(!"unknown operand type in printOperand");
+    llvm_unreachable("unknown operand type in printOperand");
   }
 }
 
 void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo,
                                             raw_ostream &O) {
   unsigned InputModifiers = MI->getOperand(OpNo).getImm();
-  if (InputModifiers & 0x1)
-    O << "-";
-  if (InputModifiers & 0x2)
-    O << "|";
+  if (InputModifiers & SISrcMods::NEG)
+    O << '-';
+  if (InputModifiers & SISrcMods::ABS)
+    O << '|';
   printOperand(MI, OpNo + 1, O);
-  if (InputModifiers & 0x2)
-    O << "|";
+  if (InputModifiers & SISrcMods::ABS)
+    O << '|';
 }
 
 void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
@@ -181,7 +292,7 @@
   } else if (Imm == 0) {
     O << "P10";
   } else {
-    assert(!"Invalid interpolation parameter slot");
+    llvm_unreachable("Invalid interpolation parameter slot");
   }
 }
 
@@ -214,6 +325,23 @@
   printIfSet(MI, OpNo, O, "_SAT");
 }
 
+// Append the " clamp" modifier when the operand's immediate is nonzero.
+void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  O << (MI->getOperand(OpNo).getImm() ? " clamp" : "");
+}
+
+// Append " mul:2", " mul:4" or " div:2"; any other value prints nothing.
+void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  const int Mod = MI->getOperand(OpNo).getImm();
+  const char *Suffix = Mod == SIOutMods::MUL2   ? " mul:2"
+                       : Mod == SIOutMods::MUL4 ? " mul:4"
+                       : Mod == SIOutMods::DIV2 ? " div:2"
+                                                : "";
+  O << Suffix;
+}
+
 void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
                                      raw_ostream &O) {
   int32_t Imm = MI->getOperand(OpNo).getImm();
@@ -281,7 +409,7 @@
     sel -= 512;
     int cb = sel >> 12;
     sel &= 4095;
-    O << cb << "[" << sel << "]";
+    O << cb << '[' << sel << ']';
   } else if (sel >= 448) {
     sel -= 448;
     O << sel;
@@ -290,7 +418,7 @@
   }
 
   if (sel >= 0)
-    O << "." << chans[chan];
+    O << '.' << chans[chan];
 }
 
 void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
@@ -323,25 +451,25 @@
   unsigned Sel = MI->getOperand(OpNo).getImm();
   switch (Sel) {
   case 0:
-    O << "X";
+    O << 'X';
     break;
   case 1:
-    O << "Y";
+    O << 'Y';
     break;
   case 2:
-    O << "Z";
+    O << 'Z';
     break;
   case 3:
-    O << "W";
+    O << 'W';
     break;
   case 4:
-    O << "0";
+    O << '0';
     break;
   case 5:
-    O << "1";
+    O << '1';
     break;
   case 7:
-    O << "_";
+    O << '_';
     break;
   default:
     break;
@@ -353,10 +481,10 @@
   unsigned CT = MI->getOperand(OpNo).getImm();
   switch (CT) {
   case 0:
-    O << "U";
+    O << 'U';
     break;
   case 1:
-    O << "N";
+    O << 'N';
     break;
   default:
     break;
@@ -368,10 +496,10 @@
   int KCacheMode = MI->getOperand(OpNo).getImm();
   if (KCacheMode > 0) {
     int KCacheBank = MI->getOperand(OpNo - 2).getImm();
-    O << "CB" << KCacheBank <<":";
+    O << "CB" << KCacheBank << ':';
     int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
-    int LineSize = (KCacheMode == 1)?16:32;
-    O << KCacheAddr * 16 << "-" << KCacheAddr * 16 + LineSize;
+    int LineSize = (KCacheMode == 1) ? 16 : 32;
+    O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize;
   }
 }
 
@@ -415,12 +543,26 @@
   unsigned Vmcnt = SImm16 & 0xF;
   unsigned Expcnt = (SImm16 >> 4) & 0xF;
   unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
-  if (Vmcnt != 0xF)
-    O << "vmcnt(" << Vmcnt << ") ";
-  if (Expcnt != 0x7)
-    O << "expcnt(" << Expcnt << ") ";
-  if (Lgkmcnt != 0x7)
-    O << "lgkmcnt(" << Lgkmcnt << ")";
+
+  bool NeedSpace = false;
+
+  if (Vmcnt != 0xF) {
+    O << "vmcnt(" << Vmcnt << ')';
+    NeedSpace = true;
+  }
+
+  if (Expcnt != 0x7) {
+    if (NeedSpace)
+      O << ' ';
+    O << "expcnt(" << Expcnt << ')';
+    NeedSpace = true;
+  }
+
+  if (Lgkmcnt != 0x7) {
+    if (NeedSpace)
+      O << ' ';
+    O << "lgkmcnt(" << Lgkmcnt << ')';
+  }
 }
 
 #include "AMDGPUGenAsmWriter.inc"

diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 6ca7170..4c06ac0 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h

@@ -10,8 +10,8 @@
 /// \file
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPUINSTPRINTER_H
-#define AMDGPUINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
+#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCInstPrinter.h"
@@ -34,7 +34,19 @@
 private:
   void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printRegOperand(unsigned RegNo, raw_ostream &O);
   void printImmediate(uint32_t Imm, raw_ostream &O);
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -45,6 +57,8 @@
                          StringRef Asm, StringRef Default = "");
   static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -65,4 +79,4 @@
 
 } // End namespace llvm
 
-#endif // AMDGPUINSTRPRINTER_H
+#endif

diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt
index 408ed75..f3f254f 100644
--- a/lib/Target/R600/LLVMBuild.txt
+++ b/lib/Target/R600/LLVMBuild.txt

@@ -16,17 +16,18 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = InstPrinter MCTargetDesc TargetInfo
+subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo
 
 [component_0]
 type = TargetGroup
 name = R600
 parent = Target
+has_asmparser = 1
 has_asmprinter = 1
 
 [component_1]
 type = Library
 name = R600CodeGen
 parent = R600
-required_libraries = Analysis AsmPrinter CodeGen Core MC R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC R600AsmParser R600AsmPrinter R600Desc R600Info Scalar SelectionDAG Support Target TransformUtils
 add_to_library_groups = R600

diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index 489cec7..5fb311b 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp

@@ -9,9 +9,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -43,7 +45,7 @@
   AMDGPUAsmBackend(const Target &T)
     : MCAsmBackend() {}
 
-  unsigned getNumFixupKinds() const override { return 0; };
+  unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
                   uint64_t Value, bool IsPCRel) const override;
   bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -55,9 +57,9 @@
     assert(!"Not implemented");
   }
   bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
-    return true;
-  }
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
 };
 
 } //End anonymous namespace
@@ -73,9 +75,50 @@
                                   unsigned DataSize, uint64_t Value,
                                   bool IsPCRel) const {
 
-  uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
-  assert(Fixup.getKind() == FK_PCRel_4);
-  *Dst = (Value - 4) / 4;
+  switch ((unsigned)Fixup.getKind()) {
+    default: llvm_unreachable("Unknown fixup kind");
+    case AMDGPU::fixup_si_sopp_br: {
+      uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
+      *Dst = (Value - 4) / 4;
+      break;
+    }
+
+    case AMDGPU::fixup_si_rodata: {
+      uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
+      *Dst = Value;
+      break;
+    }
+
+    case AMDGPU::fixup_si_end_of_text: {
+      uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset());
+      // The value points to the last instruction in the text section, so we
+      // need to add 4 bytes to get to the start of the constants.
+      *Dst = Value + 4;
+      break;
+    }
+  }
+}
+
+const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
+                                                       MCFixupKind Kind) const {
+  const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = {
+    // name                   offset bits  flags
+    { "fixup_si_sopp_br",     0,     16,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_si_rodata",      0,     32,   0 },
+    { "fixup_si_end_of_text", 0,     32,   MCFixupKindInfo::FKF_IsPCRel }
+  };
+  if (Kind < FirstTargetFixupKind) // Generic kinds: defer to the base class.
+    return MCAsmBackend::getFixupKindInfo(Kind);
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+         "Invalid kind!"); // Infos has exactly one row per target fixup.
+  return Infos[Kind - FirstTargetFixupKind];
+}
+
+// Write Count zero bytes to OW as padding; always returns true.
+bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  for (uint64_t i = 0; i < Count; ++i) // 64-bit index: Count is uint64_t.
+    OW->Write8(0);
+  return true;
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 53b0e85..5fb94d5 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp

@@ -10,6 +10,7 @@
 
 #include "AMDGPUMCTargetDesc.h"
 #include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
 
 using namespace llvm;
 
@@ -21,7 +22,7 @@
 protected:
   unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                         bool IsPCRel) const override {
-    llvm_unreachable("Not implemented");
+    return Fixup.getKind();
   }
 
 };

diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h
new file mode 100644
index 0000000..01021d6
--- /dev/null
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUFixupKinds.h

@@ -0,0 +1,34 @@
+//===-- AMDGPUFixupKinds.h - AMDGPU Specific Fixup Entries ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace AMDGPU {
+enum Fixups {
+  /// 16-bit PC relative fixup for SOPP branch instructions.
+  fixup_si_sopp_br = FirstTargetFixupKind,
+
+  /// Fixup for global addresses with constant initializers.
+  fixup_si_rodata,
+
+  /// Fixup for the offset from an instruction to the end of the text section.
+  fixup_si_end_of_text,
+
+  // Marker: one past the last real fixup kind; add new kinds above this line.
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // End namespace AMDGPU
+} // End namespace llvm
+
+#endif

diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 78bbe0a..3c2b889 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp

@@ -11,21 +11,14 @@
 #include "AMDGPUMCAsmInfo.h"
 
 using namespace llvm;
-AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfoELF() {
   HasSingleParameterDotFile = false;
   //===------------------------------------------------------------------===//
-  HasSubsectionsViaSymbols = true;
-  HasMachoZeroFillDirective = false;
-  HasMachoTBSSDirective = false;
-  HasStaticCtorDtorReferenceInStaticMode = false;
-  LinkerRequiresNonEmptyDwarfLines = true;
   MaxInstLength = 16;
   SeparatorString = "\n";
   CommentString = ";";
-  LabelSuffix = ":";
   InlineAsmStart = ";#ASMSTART";
   InlineAsmEnd = ";#ASMEND";
-  AssemblerDialect = 0;
 
   //===--- Data Emission Directives -------------------------------------===//
   ZeroDirective = ".zero";
@@ -35,28 +28,15 @@
   Data16bitsDirective = ".short\t";
   Data32bitsDirective = ".long\t";
   Data64bitsDirective = ".quad\t";
-  GPRel32Directive = nullptr;
   SunStyleELFSectionSwitchSyntax = true;
   UsesELFSectionDirectiveForBSS = true;
 
-  //===--- Alignment Information ----------------------------------------===//
-  AlignmentIsInBytes = true;
-  TextAlignFillValue = 0;
-
   //===--- Global Variable Emission Directives --------------------------===//
-  GlobalDirective = ".global";
-  HasSetDirective = false;
   HasAggressiveSymbolFolding = true;
   COMMDirectiveAlignmentIsInBytes = false;
   HasDotTypeDotSizeDirective = false;
   HasNoDeadStrip = true;
   WeakRefDirective = ".weakref\t";
   //===--- Dwarf Emission Directives -----------------------------------===//
-  HasLEB128 = true;
   SupportsDebugInformation = true;
 }
-
-const MCSection*
-AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
-  return nullptr;
-}

diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
index 59aebec..8f75c76 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h

@@ -1,4 +1,4 @@
-//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface  ----------===//
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,18 +11,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPUMCASMINFO_H
-#define AMDGPUMCASMINFO_H
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 namespace llvm {
 
 class StringRef;
 
-class AMDGPUMCAsmInfo : public MCAsmInfo {
+// If you need to create another MCAsmInfo class, which inherits from MCAsmInfo,
+// you will need to make sure your new class sets PrivateGlobalPrefix to
+// a prefix that won't appear in a function name.  The default value
+// for PrivateGlobalPrefix is 'L', so it will consider any function starting
+// with 'L' as a local symbol.
+class AMDGPUMCAsmInfo : public MCAsmInfoELF {
 public:
   explicit AMDGPUMCAsmInfo(StringRef &TT);
-  const MCSection* getNonexecutableStackSection(MCContext &CTX) const override;
 };
 } // namespace llvm
-#endif // AMDGPUMCASMINFO_H
+#endif

diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 6a5cd67..c957427 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AMDGPUCODEEMITTER_H
-#define AMDGPUCODEEMITTER_H
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
 
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/Support/raw_ostream.h"
@@ -37,8 +37,14 @@
                                      const MCSubtargetInfo &STI) const {
     return 0;
   }
+
+  virtual unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const {
+    return 0;
+  }
 };
 
 } // End namespace llvm
 
-#endif // AMDGPUCODEEMITTER_H
+#endif

diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 38a2956..8731055 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp

@@ -84,12 +84,9 @@
 
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCContext &Ctx, MCAsmBackend &MAB,
-                                    raw_ostream &_OS,
-                                    MCCodeEmitter *_Emitter,
-                                    const MCSubtargetInfo &STI,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
-  return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false);
+                                    raw_ostream &_OS, MCCodeEmitter *_Emitter,
+                                    const MCSubtargetInfo &STI, bool RelaxAll) {
+  return createELFStreamer(Ctx, MAB, _OS, _Emitter, false);
 }
 
 extern "C" void LLVMInitializeR600TargetMC() {

diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index f6b3376..c019766 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h

@@ -13,8 +13,8 @@
 //===----------------------------------------------------------------------===//
 //
 
-#ifndef AMDGPUMCTARGETDESC_H
-#define AMDGPUMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
+#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H
 
 #include "llvm/ADT/StringRef.h"
 
@@ -55,4 +55,4 @@
 #define GET_SUBTARGETINFO_ENUM
 #include "AMDGPUGenSubtargetInfo.inc"
 
-#endif // AMDGPUMCTARGETDESC_H
+#endif

diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
index ee02111..999fd0d 100644
--- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp

@@ -13,8 +13,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPU.h"
+#include "SIDefines.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCFixup.h"
@@ -39,6 +42,7 @@
   void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
   const MCInstrInfo &MCII;
   const MCRegisterInfo &MRI;
+  MCContext &Ctx;
 
   /// \brief Can this operand also contain immediate values?
   bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
@@ -49,7 +53,7 @@
 public:
   SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
                   MCContext &ctx)
-    : MCII(mcii), MRI(mri) { }
+    : MCII(mcii), MRI(mri), Ctx(ctx) { }
 
   ~SIMCCodeEmitter() { }
 
@@ -62,6 +66,12 @@
   uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                              SmallVectorImpl<MCFixup> &Fixups,
                              const MCSubtargetInfo &STI) const override;
+
+  /// \brief Use a fixup to encode the simm16 field for SOPP branch
+  ///        instructions.
+  unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const override;
 };
 
 } // End anonymous namespace
@@ -75,12 +85,13 @@
 
 bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
                                    unsigned OpNo) const {
-
   unsigned RegClass = Desc.OpInfo[OpNo].RegClass;
   return (AMDGPU::SSrc_32RegClassID == RegClass) ||
          (AMDGPU::SSrc_64RegClassID == RegClass) ||
          (AMDGPU::VSrc_32RegClassID == RegClass) ||
-         (AMDGPU::VSrc_64RegClassID == RegClass);
+         (AMDGPU::VSrc_64RegClassID == RegClass) ||
+         (AMDGPU::VCSrc_32RegClassID == RegClass) ||
+         (AMDGPU::VCSrc_64RegClassID == RegClass);
 }
 
 uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const {
@@ -90,6 +101,8 @@
     Imm.I = MO.getImm();
   else if (MO.isFPImm())
     Imm.F = MO.getFPImm();
+  else if (MO.isExpr())
+    return 255;
   else
     return ~0;
 
@@ -157,8 +170,13 @@
     IntFloatUnion Imm;
     if (Op.isImm())
       Imm.I = Op.getImm();
-    else
+    else if (Op.isFPImm())
       Imm.F = Op.getFPImm();
+    else {
+      assert(Op.isExpr());
+      // This will be replaced with a fixup value.
+      Imm.I = 0;
+    }
 
     for (unsigned j = 0; j < 4; j++) {
       OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff));
@@ -169,6 +187,21 @@
   }
 }
 
+unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &Target = MI.getOperand(OpNo);
+  // Anything that is not an expression takes the generic encoding path.
+  if (!Target.isExpr())
+    return getMachineOpValue(MI, Target, Fixups, STI);
+
+  // Expression operand: record a SOPP branch fixup at offset 0 and encode a
+  // zero placeholder in the field for now.
+  const MCFixupKind Kind = static_cast<MCFixupKind>(AMDGPU::fixup_si_sopp_br);
+  Fixups.push_back(MCFixup::Create(0, Target.getExpr(), Kind, MI.getLoc()));
+  return 0;
+}
+
 uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
                                             const MCOperand &MO,
                                        SmallVectorImpl<MCFixup> &Fixups,
@@ -177,10 +210,19 @@
     return MRI.getEncodingValue(MO.getReg());
 
   if (MO.isExpr()) {
-    const MCExpr *Expr = MO.getExpr();
-    MCFixupKind Kind = MCFixupKind(FK_PCRel_4);
-    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
-    return 0;
+    const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr());
+    MCFixupKind Kind;
+    const MCSymbol *Sym =
+        Ctx.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
+
+    if (&Expr->getSymbol() == Sym) {
+      // Add the offset to the beginning of the constant values.
+      Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text;
+    } else {
+      // This is used for constant data stored in .rodata.
+     Kind = (MCFixupKind)AMDGPU::fixup_si_rodata;
+    }
+    Fixups.push_back(MCFixup::Create(4, Expr, Kind, MI.getLoc()));
   }
 
   // Figure out the operand number, needed for isSrcOperand check

diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile
index 1b3ebbe..64a7c8c 100644
--- a/lib/Target/R600/Makefile
+++ b/lib/Target/R600/Makefile

@@ -16,8 +16,8 @@
 		AMDGPUGenDAGISel.inc  AMDGPUGenSubtargetInfo.inc \
 		AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \
 		AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \
-		AMDGPUGenAsmWriter.inc
+		AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc
 
-DIRS = InstPrinter TargetInfo MCTargetDesc
+DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc
 
 include $(LEVEL)/Makefile.common

diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp
index 92bf0df..f07be00 100644
--- a/lib/Target/R600/R600ClauseMergePass.cpp
+++ b/lib/Target/R600/R600ClauseMergePass.cpp

@@ -18,6 +18,7 @@
 #include "R600InstrInfo.h"
 #include "R600MachineFunctionInfo.h"
 #include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -167,7 +168,7 @@
 }
 
 bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+  TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
   for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
                                                   BB != BB_E; ++BB) {
     MachineBasicBlock &MBB = *BB;

diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index d98a6db..edaf278 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp

@@ -336,7 +336,7 @@
         getHWInstrDesc(IsTex?CF_TC:CF_VC))
         .addImm(0) // ADDR
         .addImm(AluInstCount - 1); // COUNT
-    return ClauseFile(MIb, ClauseContent);
+    return ClauseFile(MIb, std::move(ClauseContent));
   }
 
   void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
@@ -426,7 +426,7 @@
     }
     assert(ClauseContent.size() < 128 && "ALU clause is too big");
     ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
-    return ClauseFile(ClauseHead, ClauseContent);
+    return ClauseFile(ClauseHead, std::move(ClauseContent));
   }
 
   void
@@ -459,11 +459,9 @@
   void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
     MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
   }
-  void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr)
-      const {
-    for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end();
-        It != E; ++It) {
-      MachineInstr *MI = *It;
+  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
+                            unsigned Addr) const {
+    for (MachineInstr *MI : MIs) {
       CounterPropagateAddr(MI, Addr);
     }
   }
@@ -477,18 +475,19 @@
   }
 
   bool runOnMachineFunction(MachineFunction &MF) override {
-    TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
-    TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
+    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+    TRI = static_cast<const R600RegisterInfo *>(
+        MF.getSubtarget().getRegisterInfo());
     R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 
-    CFStack CFStack(ST, MFI->ShaderType);
+    CFStack CFStack(ST, MFI->getShaderType());
     for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
       MachineBasicBlock &MBB = *MB;
       unsigned CfCount = 0;
       std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
       std::vector<MachineInstr * > IfThenElseStack;
-      if (MFI->ShaderType == 1) {
+      if (MFI->getShaderType() == ShaderType::VERTEX) {
         BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
             getHWInstrDesc(CF_CALL_FS));
         CfCount++;
@@ -542,7 +541,7 @@
           std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
               std::set<MachineInstr *>());
           Pair.second.insert(MIb);
-          LoopStack.push_back(Pair);
+          LoopStack.push_back(std::move(Pair));
           MI->eraseFromParent();
           CfCount++;
           break;
@@ -550,7 +549,7 @@
         case AMDGPU::ENDLOOP: {
           CFStack.popLoop();
           std::pair<unsigned, std::set<MachineInstr *> > Pair =
-              LoopStack.back();
+              std::move(LoopStack.back());
           LoopStack.pop_back();
           CounterPropagateAddr(Pair.second, CfCount);
           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))

diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index f2f28fe..51d87ed 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h

@@ -8,8 +8,8 @@
 /// \file
 //===----------------------------------------------------------------------===//
 
-#ifndef R600DEFINES_H_
-#define R600DEFINES_H_
+#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H
+#define LLVM_LIB_TARGET_R600_R600DEFINES_H
 
 #include "llvm/MC/MCRegisterInfo.h"
 
@@ -168,4 +168,4 @@
 
 #define R_0288E8_SQ_LDS_ALLOC                        0x0288E8
 
-#endif // R600DEFINES_H_
+#endif

diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index 38afebe..fdc2030 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp

@@ -19,6 +19,7 @@
 #include "R600InstrInfo.h"
 #include "R600MachineFunctionInfo.h"
 #include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -297,7 +298,7 @@
   }
 
   bool runOnMachineFunction(MachineFunction &MF) override {
-    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
     for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
                                                     BB != BB_E; ++BB) {

diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index 732b06d..211d392 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp

@@ -19,6 +19,7 @@
 #include "R600InstrInfo.h"
 #include "R600MachineFunctionInfo.h"
 #include "R600RegisterInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -65,7 +66,7 @@
 }
 
 bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+  TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   const R600RegisterInfo &TRI = TII->getRegisterInfo();
 

diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 7f3560a..a214e53 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp

@@ -19,6 +19,7 @@
 #include "R600Defines.h"
 #include "R600InstrInfo.h"
 #include "R600MachineFunctionInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -82,6 +83,8 @@
   setOperationAction(ISD::SETCC, MVT::i32, Expand);
   setOperationAction(ISD::SETCC, MVT::f32, Expand);
   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 
   setOperationAction(ISD::SELECT, MVT::i32, Expand);
   setOperationAction(ISD::SELECT, MVT::f32, Expand);
@@ -189,7 +192,7 @@
   MachineRegisterInfo &MRI = MF->getRegInfo();
   MachineBasicBlock::iterator I = *MI;
   const R600InstrInfo *TII =
-    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
+      static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
 
   switch (MI->getOpcode()) {
   default:
@@ -199,7 +202,10 @@
       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
       assert(DstIdx != -1);
       MachineInstrBuilder NewMI;
-      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
+      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
+      //        LDS_1A2D support and remove this special case.
+      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
+           MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
         return BB;
 
       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
@@ -642,8 +648,8 @@
       MachineSDNode *interp;
       if (ijb < 0) {
         const MachineFunction &MF = DAG.getMachineFunction();
-        const R600InstrInfo *TII =
-          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
+        const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
+            MF.getSubtarget().getInstrInfo());
         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
         return DAG.getTargetExtractSubreg(
@@ -803,6 +809,9 @@
     case Intrinsic::r600_read_local_size_z:
       return LowerImplicitParameter(DAG, VT, DL, 8);
 
+    case Intrinsic::AMDGPU_read_workdim:
+      return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
+
     case Intrinsic::r600_read_tgid_x:
       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                   AMDGPU::T1_X, VT);
@@ -839,8 +848,20 @@
   default:
     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
     return;
-  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
+  case ISD::FP_TO_UINT:
+    if (N->getValueType(0) == MVT::i1) {
+      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
+      return;
+    }
+    // Fall-through. Since we don't care about out of bounds values
+    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
+    // considers some extra cases which are not necessary here.
+  case ISD::FP_TO_SINT: {
+    SDValue Result;
+    if (expandFP_TO_SINT(N, Result, DAG))
+      Results.push_back(Result);
     return;
+  }
   case ISD::UDIV: {
     SDValue Op = SDValue(N, 0);
     SDLoc DL(Op);
@@ -886,74 +907,7 @@
   }
   case ISD::UDIVREM: {
     SDValue Op = SDValue(N, 0);
-    SDLoc DL(Op);
-    EVT VT = Op.getValueType();
-    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
-
-    SDValue one = DAG.getConstant(1, HalfVT);
-    SDValue zero = DAG.getConstant(0, HalfVT);
-
-    //HiLo split
-    SDValue LHS = N->getOperand(0);
-    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
-    SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
-
-    SDValue RHS = N->getOperand(1);
-    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
-    SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
-
-    // Get Speculative values
-    SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
-    SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
-
-    SDValue REM_Hi = zero;
-    SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
-
-    SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
-    SDValue DIV_Lo = zero;
-
-    const unsigned halfBitWidth = HalfVT.getSizeInBits();
-
-    for (unsigned i = 0; i < halfBitWidth; ++i) {
-      SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
-      // Get Value of high bit
-      SDValue HBit;
-      if (halfBitWidth == 32 && Subtarget->hasBFE()) {
-        HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
-      } else {
-        HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
-        HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
-      }
-
-      SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
-        DAG.getConstant(halfBitWidth - 1, HalfVT));
-      REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
-      REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
-
-      REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
-      REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
-
-
-      SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
-
-      SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
-      SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
-
-      DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
-
-      // Update REM
-
-      SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
-
-      REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
-      REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
-      REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
-    }
-
-    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
-    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
-    Results.push_back(DIV);
-    Results.push_back(REM);
+    LowerUDIVREM64(Op, DAG, Results);
     break;
   }
   }
@@ -1415,8 +1369,8 @@
   // Lowering for indirect addressing
 
   const MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
-                                         getTargetMachine().getFrameLowering());
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
+      getTargetMachine().getSubtargetImpl()->getFrameLowering());
   unsigned StackWidth = TFL->getStackWidth(MF);
 
   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1512,10 +1466,23 @@
     return DAG.getMergeValues(Ops, DL);
   }
 
+  // Lower loads of global variables in the constant address space.
+  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+      isa<GlobalVariable>(
+          GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
+
+    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
+        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
+    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
+        DAG.getConstant(2, MVT::i32));
+    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
+                       LoadNode->getChain(), Ptr,
+                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
+  }
 
   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
     SDValue MergedValues[2] = {
-      SplitVectorLoad(Op, DAG),
+      ScalarizeVectorLoad(Op, DAG),
       Chain
     };
     return DAG.getMergeValues(MergedValues, DL);
@@ -1585,6 +1552,7 @@
                                   LoadNode->getPointerInfo(), MemVT,
                                   LoadNode->isVolatile(),
                                   LoadNode->isNonTemporal(),
+                                  LoadNode->isInvariant(),
                                   LoadNode->getAlignment());
     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
@@ -1599,8 +1567,8 @@
 
   // Lowering for indirect addressing
   const MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
-                                         getTargetMachine().getFrameLowering());
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
+      getTargetMachine().getSubtargetImpl()->getFrameLowering());
   unsigned StackWidth = TFL->getStackWidth(MF);
 
   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
@@ -1663,10 +1631,10 @@
                                       SDLoc DL, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
   MachineFunction &MF = DAG.getMachineFunction();
-  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
+  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 
   SmallVector<ISD::InputArg, 8> LocalIns;
 
@@ -1676,10 +1644,15 @@
 
   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    EVT VT = Ins[i].VT;
-    EVT MemVT = LocalIns[i].VT;
+    const ISD::InputArg &In = Ins[i];
+    EVT VT = In.VT;
+    EVT MemVT = VA.getLocVT();
+    if (!VT.isVector() && MemVT.isVector()) {
+      // Get load source type if scalarized.
+      MemVT = MemVT.getVectorElementType();
+    }
 
-    if (ShaderType != ShaderType::COMPUTE) {
+    if (MFI->getShaderType() != ShaderType::COMPUTE) {
       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
       InVals.push_back(Register);
@@ -1687,7 +1660,7 @@
     }
 
     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                                   AMDGPUAS::CONSTANT_BUFFER_0);
+                                          AMDGPUAS::CONSTANT_BUFFER_0);
 
     // i64 isn't a legal type, so the register type used ends up as i32, which
     // isn't expected here. It attempts to create this sextload, but it ends up
@@ -1696,18 +1669,33 @@
 
     // The first 36 bytes of the input buffer contains information about
     // thread group and global sizes.
+    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
+    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
+      // FIXME: This should really check the extload type, but the handling of
+      // extload vector parameters seems to be broken.
 
-    // FIXME: This should really check the extload type, but the handling of
-    // extload vecto parameters seems to be broken.
-    //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
-    ISD::LoadExtType Ext = ISD::SEXTLOAD;
-    SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
-                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
-                                 MachinePointerInfo(UndefValue::get(PtrTy)),
-                                 MemVT, false, false, 4);
+      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+      Ext = ISD::SEXTLOAD;
+    }
+
+    // Compute the offset from the value.
+    // XXX - I think PartOffset should give you this, but it seems to give the
+    // size of the register which isn't useful.
+
+    unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
+    unsigned PartOffset = VA.getLocMemOffset();
+    unsigned Offset = 36 + VA.getLocMemOffset();
+
+    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
+    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
+                              DAG.getConstant(Offset, MVT::i32),
+                              DAG.getUNDEF(MVT::i32),
+                              PtrInfo,
+                              MemVT, false, true, true, 4);
 
     // 4 is the preferred alignment for the CONSTANT memory space.
     InVals.push_back(Arg);
+    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
   }
   return Chain;
 }
@@ -2053,7 +2041,7 @@
 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
   const R600InstrInfo *TII =
-      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
   if (!Src.isMachineOpcode())
     return false;
   switch (Src.getMachineOpcode()) {
@@ -2178,7 +2166,7 @@
 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                             SelectionDAG &DAG) const {
   const R600InstrInfo *TII =
-      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
   if (!Node->isMachineOpcode())
     return Node;
   unsigned Opcode = Node->getMachineOpcode();

diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index d22c8c9..10ebc10 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef R600ISELLOWERING_H
-#define R600ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
+#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H
 
 #include "AMDGPUISelLowering.h"
 
@@ -74,4 +74,4 @@
 
 } // End namespace llvm;
 
-#endif // R600ISELLOWERING_H
+#endif

diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td
index 9428bab..0ffd485 100644
--- a/lib/Target/R600/R600InstrFormats.td
+++ b/lib/Target/R600/R600InstrFormats.td

@@ -38,6 +38,9 @@
   let Pattern = pattern;
   let Itinerary = itin;
 
+  // No AsmMatcher support.
+  let isCodeGenOnly = 1;
+
   let TSFlags{4} = Trig;
   let TSFlags{5} = Op3;
 

diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 3972e2f..653fd0d 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp

@@ -92,10 +92,6 @@
   return true;
 }
 
-unsigned R600InstrInfo::getIEQOpcode() const {
-  return AMDGPU::SETE_INT;
-}
-
 bool R600InstrInfo::isMov(unsigned Opcode) const {
 
 
@@ -209,8 +205,10 @@
 }
 
 bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const {
-  const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>();
-  return MFI->ShaderType != ShaderType::COMPUTE && usesVertexCache(MI->getOpcode());
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+  return MFI->getShaderType() != ShaderType::COMPUTE &&
+    usesVertexCache(MI->getOpcode());
 }
 
 bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
@@ -218,9 +216,11 @@
 }
 
 bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const {
-  const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>();
-  return (MFI->ShaderType == ShaderType::COMPUTE && usesVertexCache(MI->getOpcode())) ||
-         usesTextureCache(MI->getOpcode());
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+  return (MFI->getShaderType() == ShaderType::COMPUTE &&
+          usesVertexCache(MI->getOpcode())) ||
+    usesTextureCache(MI->getOpcode());
 }
 
 bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
@@ -319,7 +319,7 @@
         Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
         continue;
       }
-      
+
     }
     return Result;
   }
@@ -571,7 +571,7 @@
   if (!isLastAluTrans)
     return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
 
-  TransOps = IGSrcs.back();
+  TransOps = std::move(IGSrcs.back());
   IGSrcs.pop_back();
   ValidSwizzle.pop_back();
 
@@ -654,10 +654,10 @@
   return fitsConstReadLimitations(Consts);
 }
 
-DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
-    const ScheduleDAG *DAG) const {
-  const InstrItineraryData *II = TM->getInstrItineraryData();
-  return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II);
+DFAPacketizer *
+R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const {
+  const InstrItineraryData *II = STI.getInstrItineraryData();
+  return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II);
 }
 
 static bool
@@ -1082,9 +1082,8 @@
 
 void  R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
                                              const MachineFunction &MF) const {
-  const AMDGPUFrameLowering *TFL =
-    static_cast<const AMDGPUFrameLowering*>(
-    MF.getTarget().getFrameLowering());
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
+      MF.getSubtarget().getFrameLowering());
 
   unsigned StackWidth = TFL->getStackWidth(MF);
   int End = getIndirectIndexEnd(MF);

diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index 45a57d3..d3dc0e5 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef R600INSTRUCTIONINFO_H_
-#define R600INSTRUCTIONINFO_H_
+#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H
+#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H
 
 #include "AMDGPUInstrInfo.h"
 #include "R600Defines.h"
@@ -152,11 +152,10 @@
   /// instruction slots within an instruction group.
   bool isVector(const MachineInstr &MI) const;
 
-  unsigned getIEQOpcode() const override;
   bool isMov(unsigned Opcode) const override;
 
-  DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
-                                           const ScheduleDAG *DAG) const override;
+  DFAPacketizer *
+  CreateTargetScheduleState(const TargetSubtargetInfo &) const override;
 
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
 
@@ -207,7 +206,7 @@
   int getInstrLatency(const InstrItineraryData *ItinData,
                       SDNode *Node) const override { return 1;}
 
-  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 
   /// \brief Reserve the registers that may be accesed using indirect addressing.
   void reserveIndirectRegisters(BitVector &Reserved,
@@ -299,4 +298,4 @@
 
 } // End llvm namespace
 
-#endif // R600INSTRINFO_H_
+#endif

diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 73fa345..b6c00f8 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td

@@ -216,7 +216,7 @@
 def TEX_SHADOW : PatLeaf<
   (imm),
   [{uint32_t TType = (uint32_t)N->getZExtValue();
-    return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13);
+    return (TType >= 6 && TType <= 8) || TType == 13;
   }]
 >;
 
@@ -475,13 +475,13 @@
 multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
   def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
     (ExportInst
-        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
+        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
         0, 61, 0, 7, 7, 7, cf_inst, 0)
   >;
 
   def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
     (ExportInst
-        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
+        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0),
         0, 61, 7, 0, 7, 7, cf_inst, 0)
   >;
 
@@ -513,17 +513,17 @@
 // Stream1
   def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
       (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
-      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      (ExportInst $src, 0, imm:$arraybase,
       4095, imm:$mask, buf1inst, 0)>;
 // Stream2
   def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
       (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
-      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      (ExportInst $src, 0, imm:$arraybase,
       4095, imm:$mask, buf2inst, 0)>;
 // Stream3
   def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
       (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
-      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      (ExportInst $src, 0, imm:$arraybase,
       4095, imm:$mask, buf3inst, 0)>;
 }
 
@@ -674,8 +674,9 @@
 // Non-IEEE MUL: 0 * anything = 0
 def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
 def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
-def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>;
-def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
+// TODO: Do these actually match the regular fmin/fmax behavior?
+def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>;
+def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>;
 
 // For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
 // so some of the instruction names don't match the asm string.
@@ -915,6 +916,11 @@
   [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
 >;
 
+class FMA_Common <bits<5> inst> : R600_3OP <
+  inst, "FMA",
+  [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU
+>;
+
 class CNDE_Common <bits<5> inst> : R600_3OP <
   inst, "CNDE",
   [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
@@ -1068,7 +1074,7 @@
 }
 
 class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
-  inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
+  inst, "RECIP_IEEE", [(set f32:$dst, (AMDGPUrcp f32:$src0))]
 > {
   let Itinerary = TransALU;
 }
@@ -1114,6 +1120,7 @@
 // Helper patterns for complex intrinsics
 //===----------------------------------------------------------------------===//
 
+// FIXME: Should be predicated on unsafe fp math.
 multiclass DIV_Common <InstR600 recip_ieee> {
 def : Pat<
   (int_AMDGPU_div f32:$src0, f32:$src1),
@@ -1124,6 +1131,8 @@
   (fdiv f32:$src0, f32:$src1),
   (MUL_IEEE $src0, (recip_ieee $src1))
 >;
+
+def : RcpPat<recip_ieee, f32>;
 }
 
 class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee>
@@ -1133,9 +1142,12 @@
 >;
 
 // FROUND pattern
-class FROUNDPat<Instruction CNDGE> : Pat <
+class FROUNDPat<Instruction CNDGE, Instruction CNDGT> : Pat <
   (AMDGPUround f32:$x),
-  (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
+  (CNDGE $x,
+  (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)),
+  (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
+  )
 >;
 
 
@@ -1180,7 +1192,9 @@
   def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
 
   def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
-  def : FROUNDPat <CNDGE_r600>;
+  defm : RsqPat<RECIPSQRT_IEEE_r600, f32>;
+
+  def : FROUNDPat <CNDGE_r600, CNDGT_r600>;
 
   def R600_ExportSwz : ExportSwzInst {
     let Word1{20-17} = 0; // BURST_COUNT
@@ -1482,6 +1496,7 @@
      let mayLoad = 0;
      let mayStore = 0;
      let hasSideEffects = 0;
+     let isCodeGenOnly = 1;
 }
 
 multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {

diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index b0ae22e..263561e 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h

@@ -10,8 +10,8 @@
 /// \file
 //===----------------------------------------------------------------------===//
 
-#ifndef R600MACHINEFUNCTIONINFO_H
-#define R600MACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H
 
 #include "AMDGPUMachineFunction.h"
 #include "llvm/ADT/BitVector.h"
@@ -31,4 +31,4 @@
 
 } // End llvm namespace
 
-#endif //R600MACHINEFUNCTIONINFO_H
+#endif

diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 7ea654c..d782713 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp

@@ -14,7 +14,6 @@
 
 #include "R600MachineScheduler.h"
 #include "AMDGPUSubtarget.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/PassManager.h"
@@ -76,21 +75,25 @@
     float ALUFetchRationEstimate =
         (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
         (FetchInstCount + Available[IDFetch].size());
-    unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
-    DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
-    // We assume the local GPR requirements to be "dominated" by the requirement
-    // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
-    // after TEX are indeed likely to consume or generate values from/for the
-    // TEX clause.
-    // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
-    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
-    // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
-    // (TODO : use RegisterPressure)
-    // If we are going too use too many GPR, we flush Fetch instruction to lower
-    // register pressure on 128 bits regs.
-    unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
-    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+    if (ALUFetchRationEstimate == 0) {
       AllowSwitchFromAlu = true;
+    } else {
+      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+      DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+      // We assume the local GPR requirements to be "dominated" by the requirement
+      // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+      // after TEX are indeed likely to consume or generate values from/for the
+      // TEX clause.
+      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+      // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+      // (TODO : use RegisterPressure)
+      // If we are going to use too many GPR, we flush Fetch instruction to lower
+      // register pressure on 128 bits regs.
+      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+        AllowSwitchFromAlu = true;
+    }
   }
 
   if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||

diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index fd475af..fc5b95c 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef R600MACHINESCHEDULER_H_
-#define R600MACHINESCHEDULER_H_
+#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H
 
 #include "R600InstrInfo.h"
 #include "llvm/ADT/PriorityQueue.h"

diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
index 2314136..742c0e0 100644
--- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp

@@ -30,6 +30,7 @@
 #include "llvm/Support/Debug.h"
 #include "AMDGPU.h"
 #include "R600InstrInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/CodeGen/DFAPacketizer.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -279,9 +280,8 @@
       continue;
     if (PreviousRegSeqByReg[MOp->getReg()].empty())
       continue;
-    std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()];
-    for (unsigned i = 0, e = MIs.size(); i < e; i++) {
-      CompatibleRSI = PreviousRegSeq[MIs[i]];
+    for (MachineInstr *MI : PreviousRegSeqByReg[MOp->getReg()]) {
+      CompatibleRSI = PreviousRegSeq[MI];
       if (RSI == CompatibleRSI)
         continue;
       if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
@@ -314,7 +314,7 @@
 }
 
 bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
-  TII = static_cast<const R600InstrInfo *>(Fn.getTarget().getInstrInfo());
+  TII = static_cast<const R600InstrInfo *>(Fn.getSubtarget().getInstrInfo());
   MRI = &(Fn.getRegInfo());
   for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
        MBB != MBBe; ++MBB) {

diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index 74cf309..ddf68c9 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp

@@ -148,11 +148,11 @@
   }
 public:
   // Ctor.
-  R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
-                        MachineDominatorTree &MDT)
-  : VLIWPacketizerList(MF, MLI, MDT, true),
-    TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())),
-    TRI(TII->getRegisterInfo()) {
+  R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI)
+      : VLIWPacketizerList(MF, MLI, true),
+        TII(static_cast<const R600InstrInfo *>(
+            MF.getSubtarget().getInstrInfo())),
+        TRI(TII->getRegisterInfo()) {
     VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
   }
 
@@ -328,12 +328,11 @@
 };
 
 bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
-  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
   MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
-  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
 
   // Instantiate the packetizer.
-  R600PacketizerList Packetizer(Fn, MLI, MDT);
+  R600PacketizerList Packetizer(Fn, MLI);
 
   // DFA state table should not be empty.
   assert(Packetizer.getResourceTracker() && "Empty DFA table!");

diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index 247808b..f1a8a41 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef R600REGISTERINFO_H_
-#define R600REGISTERINFO_H_
+#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
+#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H
 
 #include "AMDGPURegisterInfo.h"
 
@@ -46,4 +46,4 @@
 
 } // End namespace llvm
 
-#endif // AMDIDSAREGISTERINFO_H_
+#endif

diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 4d31a11..2e7dab6 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h

@@ -8,10 +8,11 @@
 /// \file
 //===----------------------------------------------------------------------===//
 
-#ifndef SIDEFINES_H_
-#define SIDEFINES_H_
+#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H
+#define LLVM_LIB_TARGET_R600_SIDEFINES_H
 
 namespace SIInstrFlags {
+// This needs to be kept in sync with the field bits in InstSI.
 enum {
   MIMG = 1 << 3,
   SMRD = 1 << 4,
@@ -19,10 +20,38 @@
   VOP2 = 1 << 6,
   VOP3 = 1 << 7,
   VOPC = 1 << 8,
-  SALU = 1 << 9
+  SALU = 1 << 9,
+  MUBUF = 1 << 10,
+  MTBUF = 1 << 11,
+  FLAT = 1 << 12
 };
 }
 
+namespace SIInstrFlags {
+  enum Flags {
+    // First 4 bits are the instruction encoding
+    VM_CNT = 1 << 0,
+    EXP_CNT = 1 << 1,
+    LGKM_CNT = 1 << 2
+  };
+}
+
+namespace SISrcMods {
+  enum {
+   NEG = 1 << 0,
+   ABS = 1 << 1
+  };
+}
+
+namespace SIOutMods {
+  enum {
+    NONE = 0,
+    MUL2 = 1,
+    MUL4 = 2,
+    DIV2 = 3
+  };
+}
+
 #define R_00B028_SPI_SHADER_PGM_RSRC1_PS                                0x00B028
 #define R_00B02C_SPI_SHADER_PGM_RSRC2_PS                                0x00B02C
 #define   S_00B02C_EXTRA_LDS_SIZE(x)                                  (((x) & 0xFF) << 8)
@@ -32,6 +61,7 @@
 #define   S_00B028_VGPRS(x)                                           (((x) & 0x3F) << 0)
 #define   S_00B028_SGPRS(x)                                           (((x) & 0x0F) << 6)
 #define R_00B84C_COMPUTE_PGM_RSRC2                                      0x00B84C
+#define   S_00B02C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
 #define   S_00B84C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 15)
 #define R_0286CC_SPI_PS_INPUT_ENA                                       0x0286CC
 
@@ -85,4 +115,7 @@
 #define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
 #define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
 
-#endif // SIDEFINES_H_
+#define R_00B860_COMPUTE_TMPRING_SIZE                                   0x00B860
+#define   S_00B860_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
+
+#endif

diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
index 5f71453..d6f4b4c 100644
--- a/lib/Target/R600/SIFixSGPRCopies.cpp
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp

@@ -66,6 +66,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -195,10 +196,10 @@
 
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
-      MF.getTarget().getRegisterInfo());
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      MF.getTarget().getInstrInfo());
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                   BI != BE; ++BI) {
 
@@ -237,14 +238,66 @@
 
         // If a PHI node defines an SGPR and any of its operands are VGPRs,
         // then we need to move it to the VALU.
+        //
+        // Also, if a PHI node defines an SGPR and has all SGPR operands
+        // we must move it to the VALU, because the SGPR operands will
+        // all end up being assigned the same register, which means
+        // there is a potential for a conflict if different threads take
+        // different control flow paths.
+        //
+        // For Example:
+        //
+        // sgpr0 = def;
+        // ...
+        // sgpr1 = def;
+        // ...
+        // sgpr2 = PHI sgpr0, sgpr1
+        // use sgpr2;
+        //
+        // Will Become:
+        //
+        // sgpr2 = def;
+        // ...
+        // sgpr2 = def;
+        // ...
+        // use sgpr2
+        //
+        // FIXME: This is OK if the branching decision is made based on an
+        // SGPR value.
+        bool SGPRBranch = false;
+
+        // The one exception to this rule is when one of the operands
+        // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
+        // instruction.  In this case, we know the program will
+        // never enter the second block (the loop) without entering
+        // the first block (where the condition is computed), so there
+        // is no chance for values to be over-written.
+
+        bool HasBreakDef = false;
         for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
           unsigned Reg = MI.getOperand(i).getReg();
           if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
             TII->moveToVALU(MI);
             break;
           }
+          MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
+          assert(DefInstr);
+          switch(DefInstr->getOpcode()) {
+
+          case AMDGPU::SI_BREAK:
+          case AMDGPU::SI_IF_BREAK:
+          case AMDGPU::SI_ELSE_BREAK:
+          // If we see a PHI instruction that defines an SGPR, then that PHI
+          // instruction has already been considered and should have
+          // a *_BREAK as an operand.
+          case AMDGPU::PHI:
+            HasBreakDef = true;
+            break;
+          }
         }
 
+        if (!SGPRBranch && !HasBreakDef)
+          TII->moveToVALU(MI);
         break;
       }
       case AMDGPU::REG_SEQUENCE: {
@@ -252,8 +305,7 @@
             !hasVGPROperands(MI, TRI))
           continue;
 
-        DEBUG(dbgs() << "Fixing REG_SEQUENCE:\n");
-        DEBUG(MI.print(dbgs()));
+        DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
 
         TII->moveToVALU(MI);
         break;
@@ -265,8 +317,7 @@
         Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
         if (TRI->isSGPRClass(DstRC) &&
             (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
-          DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n");
-          DEBUG(MI.print(dbgs()));
+          DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
           TII->moveToVALU(MI);
         }
         break;
@@ -274,5 +325,6 @@
       }
     }
   }
-  return false;
+
+  return true;
 }

diff --git a/lib/Target/R600/SIFixSGPRLiveRanges.cpp b/lib/Target/R600/SIFixSGPRLiveRanges.cpp
index 7d116ee..f34c375 100644
--- a/lib/Target/R600/SIFixSGPRLiveRanges.cpp
+++ b/lib/Target/R600/SIFixSGPRLiveRanges.cpp

@@ -9,18 +9,49 @@
 //
 /// \file
 /// SALU instructions ignore control flow, so we need to modify the live ranges
-/// of the registers they define.
+/// of the registers they define in some cases.
 ///
-/// The strategy is to view the entire program as if it were a single basic
-/// block and calculate the intervals accordingly.  We implement this
-/// by walking this list of segments for each LiveRange and setting the
-/// end of each segment equal to the start of the segment that immediately
-/// follows it.
+/// The main case we need to handle is when a def is used in one side of a
+/// branch and not another.  For example:
+///
+/// %def
+/// IF
+///   ...
+///   ...
+/// ELSE
+///   %use
+///   ...
+/// ENDIF
+///
+/// Here we need the register allocator to avoid assigning any of the defs
+/// inside of the IF to the same register as %def.  In traditional live
+/// interval analysis %def is not live inside the IF branch, however, since
+/// SALU instructions inside of IF will be executed even if the branch is not
+/// taken, there is the chance that one of the instructions will overwrite the
+/// value of %def, so the use in ELSE will see the wrong value.
+///
+/// The strategy we use for solving this is to add an extra use after the ENDIF:
+///
+/// %def
+/// IF
+///   ...
+///   ...
+/// ELSE
+///   %use
+///   ...
+/// ENDIF
+/// %use
+///
+/// Adding this use will make the def live throughout the IF branch, which is
+/// what we want.
 
 #include "AMDGPU.h"
+#include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
@@ -40,16 +71,15 @@
     initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF) override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  virtual const char *getPassName() const override {
+  const char *getPassName() const override {
     return "SI Fix SGPR live ranges";
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveIntervals>();
-    AU.addPreserved<LiveIntervals>();
-    AU.addPreserved<SlotIndexes>();
+    AU.addRequired<MachinePostDominatorTree>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -60,6 +90,7 @@
 INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE,
                       "SI Fix SGPR Live Ranges", false, false)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE,
                     "SI Fix SGPR Live Ranges", false, false)
 
@@ -73,38 +104,88 @@
 
 bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
-      MF.getTarget().getRegisterInfo());
+      MF.getSubtarget().getRegisterInfo());
   LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
+ MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>();
+  std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges;
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-                                                  BI != BE; ++BI) {
-
-    MachineBasicBlock &MBB = *BI;
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-                                                      I != E; ++I) {
-      MachineInstr &MI = *I;
-      MachineOperand *ExecUse = MI.findRegisterUseOperand(AMDGPU::EXEC);
-      if (ExecUse)
-        continue;
-
-      for (const MachineOperand &Def : MI.operands()) {
-        if (!Def.isReg() || !Def.isDef() ||!TargetRegisterInfo::isVirtualRegister(Def.getReg()))
+  // First pass, collect all live intervals for SGPRs
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      for (const MachineOperand &MO : MI.defs()) {
+        if (MO.isImplicit())
           continue;
-
-        const TargetRegisterClass *RC = MRI.getRegClass(Def.getReg());
-
-        if (!TRI->isSGPRClass(RC))
-          continue;
-        LiveInterval &LI = LIS->getInterval(Def.getReg());
-        for (unsigned i = 0, e = LI.size() - 1; i != e; ++i) {
-          LiveRange::Segment &Seg = LI.segments[i];
-          LiveRange::Segment &Next = LI.segments[i + 1];
-          Seg.end = Next.start;
+        unsigned Def = MO.getReg();
+        if (TargetRegisterInfo::isVirtualRegister(Def)) {
+          if (TRI->isSGPRClass(MRI.getRegClass(Def)))
+            SGPRLiveRanges.push_back(
+                std::make_pair(Def, &LIS->getInterval(Def)));
+        } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) {
+            SGPRLiveRanges.push_back(
+                std::make_pair(Def, &LIS->getRegUnit(Def)));
         }
       }
     }
   }
 
+  // Second pass fix the intervals
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+    MachineBasicBlock &MBB = *BI;
+    if (MBB.succ_size() < 2)
+      continue;
+
+    // We have structured control flow, so number of successors should be two.
+    assert(MBB.succ_size() == 2);
+    MachineBasicBlock *SuccA = *MBB.succ_begin();
+    MachineBasicBlock *SuccB = *(++MBB.succ_begin());
+    MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB);
+
+    if (!NCD)
+      continue;
+
+    MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator();
+
+    if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) {
+      assert(NCD->succ_size() == 2);
+      // We want to make sure we insert the Use after the ENDIF, not after
+      // the ELSE.
+      NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(),
+                                            *(++NCD->succ_begin()));
+    }
+    assert(SuccA && SuccB);
+    for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) {
+      unsigned Reg = RegLR.first;
+      LiveRange *LR = RegLR.second;
+
+      // FIXME: We could be smarter here.  If the register is Live-In to
+      // one block, but the other doesn't have any SGPR defs, then there
+      // won't be a conflict.  Also, if the branch decision is based on
+      // a value in an SGPR, then there will be no conflict.
+      bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA);
+      bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB);
+
+      if ((!LiveInToA && !LiveInToB) ||
+          (LiveInToA && LiveInToB))
+        continue;
+
+      // This interval is live in to one successor, but not the other, so
+      // we need to update its range so it is live in to both.
+      DEBUG(dbgs() << "Possible SGPR conflict detected " <<  " in " << *LR <<
+                      " BB#" << SuccA->getNumber() << ", BB#" <<
+                      SuccB->getNumber() <<
+                      " with NCD = " << NCD->getNumber() << '\n');
+
+      // FIXME: Need to figure out how to update LiveRange here so this pass
+      // will be able to preserve LiveInterval analysis.
+      BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(),
+              TII->get(AMDGPU::SGPR_USE))
+              .addReg(Reg, RegState::Implicit);
+      DEBUG(NCD->getFirstNonPHI()->dump());
+    }
+  }
+
   return false;
 }

diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index b13c3b8..8d4164a 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp

@@ -12,6 +12,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifdef _MSC_VER
+// Provide M_PI.
+#define _USE_MATH_DEFINES
+#include <cmath>
+#endif
+
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
@@ -19,6 +25,7 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -46,10 +53,10 @@
   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
 
-  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
+  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
 
-  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
+  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
 
   computeRegisterProperties();
@@ -80,8 +87,15 @@
   setOperationAction(ISD::SUBC, MVT::i32, Legal);
   setOperationAction(ISD::SUBE, MVT::i32, Legal);
 
+  setOperationAction(ISD::FSIN, MVT::f32, Custom);
+  setOperationAction(ISD::FCOS, MVT::f32, Custom);
+
+  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+
   // We need to custom lower vector stores from local memory
-  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
@@ -89,12 +103,6 @@
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
 
-  // We need to custom lower loads/stores from private memory
-  setOperationAction(ISD::LOAD, MVT::i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
-
   setOperationAction(ISD::STORE, MVT::i1, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
@@ -114,6 +122,8 @@
   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
 
+  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
@@ -126,8 +136,7 @@
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
 
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
-
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -179,6 +188,9 @@
     MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
   };
 
+  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+  setOperationAction(ISD::SELECT, MVT::i1, Promote);
+
   for (MVT VT : VecTypes) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch(Op) {
@@ -188,10 +200,12 @@
       case ISD::BITCAST:
       case ISD::EXTRACT_VECTOR_ELT:
       case ISD::INSERT_VECTOR_ELT:
-      case ISD::CONCAT_VECTORS:
       case ISD::INSERT_SUBVECTOR:
       case ISD::EXTRACT_SUBVECTOR:
         break;
+      case ISD::CONCAT_VECTORS:
+        setOperationAction(Op, VT, Custom);
+        break;
       default:
         setOperationAction(Op, VT, Expand);
         break;
@@ -213,16 +227,37 @@
     setOperationAction(ISD::FRINT, MVT::f64, Legal);
   }
 
-  // FIXME: These should be removed and handled the same was as f32 fneg. Source
-  // modifiers also work for the double instructions.
-  setOperationAction(ISD::FNEG, MVT::f64, Expand);
-  setOperationAction(ISD::FABS, MVT::f64, Expand);
+  setOperationAction(ISD::FDIV, MVT::f32, Custom);
 
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
+  setTargetDAGCombine(ISD::FMINNUM);
+  setTargetDAGCombine(ISD::FMAXNUM);
   setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::SETCC);
 
   setTargetDAGCombine(ISD::UINT_TO_FP);
 
+  // All memory operations. Some folding on the pointer operand is done to help
+  // matching the constant offsets in the addressing modes.
+  setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD);
+  setTargetDAGCombine(ISD::ATOMIC_STORE);
+  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
+  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+  setTargetDAGCombine(ISD::ATOMIC_SWAP);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
+
   setSchedulingPreference(Sched::RegPressure);
 }
 
@@ -230,15 +265,63 @@
 // TargetLowering queries
 //===----------------------------------------------------------------------===//
 
-bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT  VT,
-                                                     unsigned AddrSpace,
-                                                     bool *IsFast) const {
+bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
+                                          EVT) const {
+  // SI has some legal vector types, but no legal vector operations. Say no
+  // shuffles are legal in order to prefer scalarizing some vector operations.
+  return false;
+}
+
+// FIXME: This really needs an address space argument. The immediate offset
+// size is different for the different kinds of memory instructions.
+
+// The single offset DS instructions have a 16-bit unsigned byte offset.
+//
+// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
+// r + i with addr64. 32-bit has more addressing mode options. Depending on the
+// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
+//
+// SMRD instructions have an 8-bit, dword offset.
+//
+bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                             Type *Ty) const {
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // Allow a 16-bit unsigned immediate field, since this is what DS instructions
+  // use.
+  if (!isUInt<16>(AM.BaseOffs))
+    return false;
+
+  // Only support r+i, r+r, and 2*r (accepted as r+r).
+  switch (AM.Scale) {
+  case 0:  // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  default: // Don't allow n * r
+    return false;
+  }
+
+  return true;
+}
+
+bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT  VT,
+                                                      unsigned AddrSpace,
+                                                      unsigned Align,
+                                                      bool *IsFast) const {
   if (IsFast)
     *IsFast = false;
 
-  // XXX: This depends on the address space and also we may want to revist
-  // the alignment values we specify in the DataLayout.
-
   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
   // which isn't a simple VT.
   if (!VT.isSimple() || VT == MVT::Other)
@@ -248,28 +331,44 @@
   // see what for specifically. The wording everywhere else seems to be the
   // same.
 
-  // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have
-  // no alignment restrictions.
-  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
-    // Using any pair of GPRs should be the same as any other pair.
-    if (IsFast)
-      *IsFast = true;
-    return VT.bitsGE(MVT::i64);
-  }
-
   // XXX - The only mention I see of this in the ISA manual is for LDS direct
   // reads the "byte address and must be dword aligned". Is it also true for the
   // normal loads and stores?
-  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
-    return false;
+  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
+    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
+    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
+    // with adjacent offsets.
+    return Align % 4 == 0;
+  }
 
   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
   // byte-address are ignored, thus forcing Dword alignment.
+  // This applies to private, global, and constant memory.
   if (IsFast)
     *IsFast = true;
   return VT.bitsGT(MVT::i32);
 }
 
+EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                          unsigned SrcAlign, bool IsMemset,
+                                          bool ZeroMemset,
+                                          bool MemcpyStrSrc,
+                                          MachineFunction &MF) const {
+  // FIXME: Should account for address space here.
+
+  // The default fallback uses the private pointer size as a guess for a type to
+  // use. Make sure we switch these to 64-bit accesses.
+
+  if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
+    return MVT::v4i32;
+
+  if (Size >= 8 && DstAlign >= 4)
+    return MVT::v2i32;
+
+  // Use the default.
+  return MVT::Other;
+}
+
 TargetLoweringBase::LegalizeTypeAction
 SITargetLowering::getPreferredVectorAction(EVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
@@ -280,25 +379,37 @@
 
 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
-  const SIInstrInfo *TII =
-    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      getTargetMachine().getSubtargetImpl()->getInstrInfo());
   return TII->isInlineConstant(Imm);
 }
 
 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
-                                         SDLoc DL, SDValue Chain,
+                                         SDLoc SL, SDValue Chain,
                                          unsigned Offset, bool Signed) const {
-  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                            AMDGPUAS::CONSTANT_ADDRESS);
-  SDValue BasePtr =  DAG.getCopyFromReg(Chain, DL,
-                           MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
-  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
-                                             DAG.getConstant(Offset, MVT::i64));
-  return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
-                            MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
-                            false, false, MemVT.getSizeInBits() >> 3);
+  const DataLayout *DL = getDataLayout();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
+  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
 
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+  SDValue BasePtr =  DAG.getCopyFromReg(Chain, SL,
+                           MRI.getLiveInVirtReg(InputPtrReg), MVT::i64);
+  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr,
+                                             DAG.getConstant(Offset, MVT::i64));
+  SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+
+  return DAG.getLoad(ISD::UNINDEXED, Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD,
+                     VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
+                     false, // isVolatile
+                     true, // isNonTemporal
+                     true, // isInvariant
+                     DL->getABITypeAlignment(Ty)); // Alignment
 }
 
 SDValue SITargetLowering::LowerFormalArguments(
@@ -309,7 +420,9 @@
                                       SDLoc DL, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
 
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetMachine &TM = getTargetMachine();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo());
 
   MachineFunction &MF = DAG.getMachineFunction();
   FunctionType *FType = MF.getFunction()->getFunctionType();
@@ -318,20 +431,20 @@
   assert(CallConv == CallingConv::C);
 
   SmallVector<ISD::InputArg, 16> Splits;
-  uint32_t Skipped = 0;
+  BitVector Skipped(Ins.size());
 
   for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
 
     // First check if it's a PS input addr
-    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+    if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
         !Arg.Flags.isByVal()) {
 
       assert((PSInputNum <= 15) && "Too many PS inputs!");
 
       if (!Arg.Used) {
         // We can savely skip PS inputs
-        Skipped |= 1 << i;
+        Skipped.set(i);
         ++PSInputNum;
         continue;
       }
@@ -340,7 +453,7 @@
     }
 
     // Second split vertices into their elements
-    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
+    if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
       ISD::InputArg NewArg = Arg;
       NewArg.Flags.setSplit();
       NewArg.VT = Arg.VT.getVectorElementType();
@@ -356,30 +469,51 @@
         NewArg.PartOffset += NewArg.VT.getStoreSize();
       }
 
-    } else if (Info->ShaderType != ShaderType::COMPUTE) {
+    } else if (Info->getShaderType() != ShaderType::COMPUTE) {
       Splits.push_back(Arg);
     }
   }
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   // At least one interpolation mode must be enabled or else the GPU will hang.
-  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
+  if (Info->getShaderType() == ShaderType::PIXEL &&
+      (Info->PSInputAddr & 0x7F) == 0) {
     Info->PSInputAddr |= 1;
     CCInfo.AllocateReg(AMDGPU::VGPR0);
     CCInfo.AllocateReg(AMDGPU::VGPR1);
   }
 
   // The pointer to the list of arguments is stored in SGPR0, SGPR1
-  if (Info->ShaderType == ShaderType::COMPUTE) {
-    CCInfo.AllocateReg(AMDGPU::SGPR0);
-    CCInfo.AllocateReg(AMDGPU::SGPR1);
-    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+  // The pointer to the scratch buffer is stored in SGPR2, SGPR3
+  if (Info->getShaderType() == ShaderType::COMPUTE) {
+    Info->NumUserSGPRs = 4;
+
+    unsigned InputPtrReg =
+        TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+    unsigned InputPtrRegLo =
+        TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
+    unsigned InputPtrRegHi =
+        TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
+
+    unsigned ScratchPtrReg =
+        TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
+    unsigned ScratchPtrRegLo =
+        TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
+    unsigned ScratchPtrRegHi =
+        TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);
+
+    CCInfo.AllocateReg(InputPtrRegLo);
+    CCInfo.AllocateReg(InputPtrRegHi);
+    CCInfo.AllocateReg(ScratchPtrRegLo);
+    CCInfo.AllocateReg(ScratchPtrRegHi);
+    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+    MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
   }
 
-  if (Info->ShaderType == ShaderType::COMPUTE) {
+  if (Info->getShaderType() == ShaderType::COMPUTE) {
     getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                             Splits);
   }
@@ -389,23 +523,36 @@
   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
 
     const ISD::InputArg &Arg = Ins[i];
-    if (Skipped & (1 << i)) {
+    if (Skipped[i]) {
       InVals.push_back(DAG.getUNDEF(Arg.VT));
       continue;
     }
 
     CCValAssign &VA = ArgLocs[ArgIdx++];
-    EVT VT = VA.getLocVT();
+    MVT VT = VA.getLocVT();
 
     if (VA.isMemLoc()) {
       VT = Ins[i].VT;
       EVT MemVT = Splits[i].VT;
+      const unsigned Offset = 36 + VA.getLocMemOffset();
       // The first 36 bytes of the input buffer contains information about
       // thread group and global sizes.
       SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, DAG.getRoot(),
-                                   36 + VA.getLocMemOffset(),
-                                   Ins[i].Flags.isSExt());
+                                   Offset, Ins[i].Flags.isSExt());
+
+      const PointerType *ParamTy =
+          dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
+      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+        // On SI, local pointers are just offsets into LDS, so they always fit
+        // in 16 bits.  On CI and newer they could potentially be real
+        // pointers, so we can't guarantee their size.
+        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
+                          DAG.getValueType(MVT::i16));
+      }
+
       InVals.push_back(Arg);
+      Info->ABIArgOffset = Offset + MemVT.getStoreSize();
       continue;
     }
     assert(VA.isRegLoc() && "Parameter must be in a register!");
@@ -458,39 +605,13 @@
     MachineInstr * MI, MachineBasicBlock * BB) const {
 
   MachineBasicBlock::iterator I = *MI;
-  const SIInstrInfo *TII =
-    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      getTargetMachine().getSubtargetImpl()->getInstrInfo());
 
   switch (MI->getOpcode()) {
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case AMDGPU::BRANCH: return BB;
-  case AMDGPU::SI_ADDR64_RSRC: {
-    unsigned SuperReg = MI->getOperand(0).getReg();
-    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
-    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
-    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
-            .addOperand(MI->getOperand(1));
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
-            .addImm(0);
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
-            .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
-            .addReg(SubRegHiLo)
-            .addImm(AMDGPU::sub0)
-            .addReg(SubRegHiHi)
-            .addImm(AMDGPU::sub1);
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
-            .addReg(SubRegLo)
-            .addImm(AMDGPU::sub0_sub1)
-            .addReg(SubRegHi)
-            .addImm(AMDGPU::sub2_sub3);
-    MI->eraseFromParent();
-    break;
-  }
   case AMDGPU::V_SUB_F64: {
     unsigned DestReg = MI->getOperand(0).getReg();
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
@@ -498,8 +619,6 @@
       .addReg(MI->getOperand(1).getReg())
       .addImm(1)  // SRC1 modifiers
       .addReg(MI->getOperand(2).getReg())
-      .addImm(0)  // SRC2 modifiers
-      .addImm(0)  // src2
       .addImm(0)  // CLAMP
       .addImm(0); // OMOD
     MI->eraseFromParent();
@@ -517,49 +636,6 @@
     MI->eraseFromParent();
     break;
   }
-  case AMDGPU::FABS_SI: {
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
-            Reg)
-            .addImm(0x7fffffff);
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32),
-            MI->getOperand(0).getReg())
-            .addReg(MI->getOperand(1).getReg())
-            .addReg(Reg);
-    MI->eraseFromParent();
-    break;
-  }
-  case AMDGPU::FNEG_SI: {
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
-            Reg)
-            .addImm(0x80000000);
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32),
-            MI->getOperand(0).getReg())
-            .addReg(MI->getOperand(1).getReg())
-            .addReg(Reg);
-    MI->eraseFromParent();
-    break;
-  }
-  case AMDGPU::FCLAMP_SI: {
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64),
-            MI->getOperand(0).getReg())
-            .addImm(0) // SRC0 modifiers
-            .addOperand(MI->getOperand(1))
-            .addImm(0) // SRC1 modifiers
-            .addImm(0) // SRC1
-            .addImm(1) // CLAMP
-            .addImm(0); // OMOD
-    MI->eraseFromParent();
-  }
   }
   return BB;
 }
@@ -598,148 +674,31 @@
 //===----------------------------------------------------------------------===//
 
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
-    LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
-    EVT VT = Op.getValueType();
-
-    // These loads are legal.
-    if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
-        VT.isVector() && VT.getVectorNumElements() == 2 &&
-        VT.getVectorElementType() == MVT::i32)
-      return SDValue();
-
-    if (Op.getValueType().isVector() &&
-        (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
-         Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
-         (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
-          Op.getValueType().getVectorNumElements() > 4))) {
-      return SplitVectorLoad(Op, DAG);
-    } else {
-      SDValue Result = LowerLOAD(Op, DAG);
-      assert((!Result.getNode() ||
-              Result.getNode()->getNumValues() == 2) &&
-             "Load should return a value and a chain");
-      return Result;
-    }
+    SDValue Result = LowerLOAD(Op, DAG);
+    assert((!Result.getNode() ||
+            Result.getNode()->getNumValues() == 2) &&
+           "Load should return a value and a chain");
+    return Result;
   }
 
+  case ISD::FSIN:
+  case ISD::FCOS:
+    return LowerTrig(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::FDIV: return LowerFDIV(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
-  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
-  case ISD::INTRINSIC_WO_CHAIN: {
-    unsigned IntrinsicID =
-                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-    EVT VT = Op.getValueType();
-    SDLoc DL(Op);
-    //XXX: Hardcoded we only use two to store the pointer to the parameters.
-    unsigned NumUserSGPRs = 2;
-    switch (IntrinsicID) {
-    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-    case Intrinsic::r600_read_ngroups_x:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false);
-    case Intrinsic::r600_read_ngroups_y:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false);
-    case Intrinsic::r600_read_ngroups_z:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false);
-    case Intrinsic::r600_read_global_size_x:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false);
-    case Intrinsic::r600_read_global_size_y:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false);
-    case Intrinsic::r600_read_global_size_z:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false);
-    case Intrinsic::r600_read_local_size_x:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false);
-    case Intrinsic::r600_read_local_size_y:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false);
-    case Intrinsic::r600_read_local_size_z:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
-    case Intrinsic::r600_read_tgid_x:
-      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
-    case Intrinsic::r600_read_tgid_y:
-      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
-    case Intrinsic::r600_read_tgid_z:
-      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
-                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
-    case Intrinsic::r600_read_tidig_x:
-      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
-                                  AMDGPU::VGPR0, VT);
-    case Intrinsic::r600_read_tidig_y:
-      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
-                                  AMDGPU::VGPR1, VT);
-    case Intrinsic::r600_read_tidig_z:
-      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
-                                  AMDGPU::VGPR2, VT);
-    case AMDGPUIntrinsic::SI_load_const: {
-      SDValue Ops [] = {
-        Op.getOperand(1),
-        Op.getOperand(2)
-      };
-
-      MachineMemOperand *MMO = MF.getMachineMemOperand(
-          MachinePointerInfo(),
-          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
-          VT.getSizeInBits() / 8, 4);
-      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
-                                     Op->getVTList(), Ops, VT, MMO);
-    }
-    case AMDGPUIntrinsic::SI_sample:
-      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
-    case AMDGPUIntrinsic::SI_sampleb:
-      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
-    case AMDGPUIntrinsic::SI_sampled:
-      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
-    case AMDGPUIntrinsic::SI_samplel:
-      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
-    case AMDGPUIntrinsic::SI_vs_load_input:
-      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
-                         Op.getOperand(1),
-                         Op.getOperand(2),
-                         Op.getOperand(3));
-    }
+  case ISD::GlobalAddress: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    return LowerGlobalAddress(MFI, Op, DAG);
   }
-
-  case ISD::INTRINSIC_VOID:
-    SDValue Chain = Op.getOperand(0);
-    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-
-    switch (IntrinsicID) {
-      case AMDGPUIntrinsic::SI_tbuffer_store: {
-        SDLoc DL(Op);
-        SDValue Ops [] = {
-          Chain,
-          Op.getOperand(2),
-          Op.getOperand(3),
-          Op.getOperand(4),
-          Op.getOperand(5),
-          Op.getOperand(6),
-          Op.getOperand(7),
-          Op.getOperand(8),
-          Op.getOperand(9),
-          Op.getOperand(10),
-          Op.getOperand(11),
-          Op.getOperand(12),
-          Op.getOperand(13),
-          Op.getOperand(14)
-        };
-        EVT VT = Op.getOperand(3).getValueType();
-
-        MachineMemOperand *MMO = MF.getMachineMemOperand(
-            MachinePointerInfo(),
-            MachineMemOperand::MOStore,
-            VT.getSizeInBits() / 8, 4);
-        return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
-                                       Op->getVTList(), Ops, VT, MMO);
-      }
-      default:
-        break;
-    }
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
   }
   return SDValue();
 }
@@ -760,6 +719,14 @@
   return nullptr;
 }
 
+SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+
+  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
+  unsigned FrameIndex = FINode->getIndex();
+
+  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+}
+
 /// This transforms the control flow intrinsics to get the branch destination as
 /// last parameter, also switches branch target with BR if the need arise
 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -810,7 +777,9 @@
       BR->getOperand(0),
       BRCOND.getOperand(2)
     };
-    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops);
+    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
+    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
+    BR = NewBR.getNode();
   }
 
   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
@@ -838,56 +807,190 @@
   return Chain;
 }
 
+SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
+                                             SDValue Op,
+                                             SelectionDAG &DAG) const {
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+
+  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
+
+  SDLoc DL(GSD);
+  const GlobalValue *GV = GSD->getGlobal();
+  MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+
+  SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
+  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
+
+  SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
+                              DAG.getConstant(0, MVT::i32));
+  SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
+                              DAG.getConstant(1, MVT::i32));
+
+  SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
+                           PtrLo, GA);
+  SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
+                           PtrHi, DAG.getConstant(0, MVT::i32),
+                           SDValue(Lo.getNode(), 1));
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+}
+
+SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo*>(MF.getSubtarget().getRegisterInfo());
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  switch (IntrinsicID) {
+  case Intrinsic::r600_read_ngroups_x:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::NGROUPS_X, false);
+  case Intrinsic::r600_read_ngroups_y:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::NGROUPS_Y, false);
+  case Intrinsic::r600_read_ngroups_z:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::NGROUPS_Z, false);
+  case Intrinsic::r600_read_global_size_x:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+  case Intrinsic::r600_read_global_size_y:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+  case Intrinsic::r600_read_global_size_z:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+  case Intrinsic::r600_read_local_size_x:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::LOCAL_SIZE_X, false);
+  case Intrinsic::r600_read_local_size_y:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
+  case Intrinsic::r600_read_local_size_z:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
+
+  case Intrinsic::AMDGPU_read_workdim:
+    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                          MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
+                          false);
+
+  case Intrinsic::r600_read_tgid_x:
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
+  case Intrinsic::r600_read_tgid_y:
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
+  case Intrinsic::r600_read_tgid_z:
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
+  case Intrinsic::r600_read_tidig_x:
+    return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
+  case Intrinsic::r600_read_tidig_y:
+    return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
+  case Intrinsic::r600_read_tidig_z:
+    return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+      TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
+  case AMDGPUIntrinsic::SI_load_const: {
+    SDValue Ops[] = {
+      Op.getOperand(1),
+      Op.getOperand(2)
+    };
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
+      VT.getStoreSize(), 4);
+    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
+  case AMDGPUIntrinsic::SI_sample:
+    return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
+  case AMDGPUIntrinsic::SI_sampleb:
+    return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
+  case AMDGPUIntrinsic::SI_sampled:
+    return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
+  case AMDGPUIntrinsic::SI_samplel:
+    return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
+  case AMDGPUIntrinsic::SI_vs_load_input:
+    return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+                       Op.getOperand(1),
+                       Op.getOperand(2),
+                       Op.getOperand(3));
+  default:
+    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  }
+}
+
+SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDValue Chain = Op.getOperand(0);
+  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+  switch (IntrinsicID) {
+  case AMDGPUIntrinsic::SI_tbuffer_store: {
+    SDLoc DL(Op);
+    SDValue Ops[] = {
+      Chain,
+      Op.getOperand(2),
+      Op.getOperand(3),
+      Op.getOperand(4),
+      Op.getOperand(5),
+      Op.getOperand(6),
+      Op.getOperand(7),
+      Op.getOperand(8),
+      Op.getOperand(9),
+      Op.getOperand(10),
+      Op.getOperand(11),
+      Op.getOperand(12),
+      Op.getOperand(13),
+      Op.getOperand(14)
+    };
+
+    EVT VT = Op.getOperand(3).getValueType();
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOStore,
+      VT.getStoreSize(), 4);
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
+  default:
+    return SDValue();
+  }
+}
+
 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   LoadSDNode *Load = cast<LoadSDNode>(Op);
-  SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
-  if (Lowered.getNode())
-    return Lowered;
 
-  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
-    return SDValue();
+  if (Op.getValueType().isVector()) {
+    assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
+           "Custom lowering for non-i32 vectors hasn't been implemented.");
+    unsigned NumElements = Op.getValueType().getVectorNumElements();
+    assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+    switch (Load->getAddressSpace()) {
+      default: break;
+      case AMDGPUAS::GLOBAL_ADDRESS:
+      case AMDGPUAS::PRIVATE_ADDRESS:
+        // v4 loads are supported for private and global memory.
+        if (NumElements <= 4)
+          break;
+        // fall-through
+      case AMDGPUAS::LOCAL_ADDRESS:
+        return ScalarizeVectorLoad(Op, DAG);
+    }
   }
 
-  EVT MemVT = Load->getMemoryVT();
-
-  assert(!MemVT.isVector() && "Private loads should be scalarized");
-  assert(!MemVT.isFloatingPoint() && "FP loads should be promoted to int");
-
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
-                            DAG.getConstant(2, MVT::i32));
-
-  // FIXME: REGISTER_LOAD should probably have a chain result.
-  SDValue Chain = Load->getChain();
-  SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
-                               Chain, Ptr,
-                               DAG.getTargetConstant(0, MVT::i32),
-                               Op.getOperand(2));
-
-  SDValue Ret = LoLoad.getValue(0);
-  if (MemVT.getSizeInBits() == 64) {
-    // TODO: This needs a test to make sure the right thing is happening with
-    // the chain. That is hard without general function support.
-
-    SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
-                                 DAG.getConstant(1, MVT::i32));
-
-    SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
-                                 Chain, IncPtr,
-                                 DAG.getTargetConstant(0, MVT::i32),
-                                 Op.getOperand(2));
-
-    Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad);
-    // Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-    //                     LoLoad.getValue(1), HiLoad.getValue(1));
-  }
-
-  SDValue Ops[] = {
-    Ret,
-    Chain
-  };
-
-  return DAG.getMergeValues(Ops, DL);
+  return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
 }
 
 SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
@@ -926,6 +1029,100 @@
   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
 }
 
+// Catch division cases where we can use shortcuts with rcp and rsq
+// instructions.
+SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+
+  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+    if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
+        CLHS->isExactlyValue(1.0)) {
+      // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
+      // the CI documentation, have a worst-case error of 1 ulp.
+      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+      // use it as long as we aren't trying to use denormals.
+
+      // 1.0 / sqrt(x) -> rsq(x)
+      //
+      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+      // error seems really high at 2^29 ULP.
+      if (RHS.getOpcode() == ISD::FSQRT)
+        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+      // 1.0 / x -> rcp(x)
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    }
+  }
+
+  if (Unsafe) {
+    // Turn into multiply by the reciprocal.
+    // x / y -> x * (1.0 / y)
+    SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+  }
+
+  return SDValue();
+}
+
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  SDValue FastLowered = LowerFastFDIV(Op, DAG);
+  if (FastLowered.getNode())
+    return FastLowered;
+
+  // This uses v_rcp_f32 which does not handle denormals. Let this hit a
+  // selection error for now rather than do something incorrect.
+  if (Subtarget->hasFP32Denormals())
+    return SDValue();
+
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+
+  const APFloat K0Val(BitsToFloat(0x6f800000));
+  const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32);
+
+  const APFloat K1Val(BitsToFloat(0x2f800000));
+  const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);
+
+  const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+
+  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+}
+
+SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue();
+}
+
+SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFDIV32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFDIV64(Op, DAG);
+
+  llvm_unreachable("Unexpected type for fdiv");
+}
+
 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -937,79 +1134,42 @@
       VT.getVectorElementType() == MVT::i32)
     return SDValue();
 
+  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    if (VT.isVector() && VT.getVectorNumElements() > 4)
+      return ScalarizeVectorStore(Op, DAG);
+    return SDValue();
+  }
+
   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
   if (Ret.getNode())
     return Ret;
 
   if (VT.isVector() && VT.getVectorNumElements() >= 8)
-      return SplitVectorStore(Op, DAG);
+      return ScalarizeVectorStore(Op, DAG);
 
   if (VT == MVT::i1)
     return DAG.getTruncStore(Store->getChain(), DL,
                         DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
                         Store->getBasePtr(), MVT::i1, Store->getMemOperand());
 
-  if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
-    return SDValue();
+  return SDValue();
+}
 
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Store->getBasePtr(),
-                            DAG.getConstant(2, MVT::i32));
-  SDValue Chain = Store->getChain();
-  SmallVector<SDValue, 8> Values;
+SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDValue Arg = Op.getOperand(0);
+  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
+        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
+          DAG.getConstantFP(0.5 / M_PI, VT)));
 
-  if (Store->isTruncatingStore()) {
-    unsigned Mask = 0;
-    if (Store->getMemoryVT() == MVT::i8) {
-      Mask = 0xff;
-    } else if (Store->getMemoryVT() == MVT::i16) {
-      Mask = 0xffff;
-    }
-    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
-                              Chain, Store->getBasePtr(),
-                              DAG.getConstant(0, MVT::i32));
-    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getBasePtr(),
-                                  DAG.getConstant(0x3, MVT::i32));
-    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
-                                   DAG.getConstant(3, MVT::i32));
-    SDValue MaskedValue = DAG.getNode(ISD::AND, DL, MVT::i32, Store->getValue(),
-                                      DAG.getConstant(Mask, MVT::i32));
-    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
-                                       MaskedValue, ShiftAmt);
-    SDValue RotrAmt = DAG.getNode(ISD::SUB, DL, MVT::i32,
-                                  DAG.getConstant(32, MVT::i32), ShiftAmt);
-    SDValue DstMask = DAG.getNode(ISD::ROTR, DL, MVT::i32,
-                                  DAG.getConstant(Mask, MVT::i32),
-                                  RotrAmt);
-    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
-    Dst = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
-
-    Values.push_back(Dst);
-  } else if (VT == MVT::i64) {
-    for (unsigned i = 0; i < 2; ++i) {
-      Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
-                       Store->getValue(), DAG.getConstant(i, MVT::i32)));
-    }
-  } else if (VT == MVT::i128) {
-    for (unsigned i = 0; i < 2; ++i) {
-      for (unsigned j = 0; j < 2; ++j) {
-        Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
-                           DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
-                           Store->getValue(), DAG.getConstant(i, MVT::i32)),
-                         DAG.getConstant(j, MVT::i32)));
-      }
-    }
-  } else {
-    Values.push_back(Store->getValue());
+  switch (Op.getOpcode()) {
+  case ISD::FCOS:
+    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
+  case ISD::FSIN:
+    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
+  default:
+    llvm_unreachable("Wrong trig opcode");
   }
-
-  for (unsigned i = 0; i < Values.size(); ++i) {
-    SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
-                                  Ptr, DAG.getConstant(i, MVT::i32));
-    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
-                        Chain, Values[i], PartPtr,
-                        DAG.getTargetConstant(0, MVT::i32));
-  }
-  return Chain;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1106,6 +1266,111 @@
   return SDValue();
 }
 
+// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
+
+// This is a variant of
+// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
+//
+// The normal DAG combiner will do this, but only if the add has one use, since
+// otherwise it would increase the number of instructions.
+//
+// This prevents us from seeing a constant offset that can be folded into a
+// memory instruction's addressing mode. If we know the resulting add offset of
+// a pointer can be folded into an addressing offset, we can replace the pointer
+// operand with the add of new constant offset. This eliminates one of the uses,
+// and may allow the remaining use to also be simplified.
+//
+SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
+                                               unsigned AddrSpace,
+                                               DAGCombinerInfo &DCI) const {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (N0.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
+  if (!CN1)
+    return SDValue();
+
+  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  if (!CAdd)
+    return SDValue();
+
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      getTargetMachine().getSubtargetImpl()->getInstrInfo());
+
+  // If the resulting offset is too large, we can't fold it into the addressing
+  // mode offset.
+  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
+  if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  EVT VT = N->getValueType(0);
+
+  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
+  SDValue COffset = DAG.getConstant(Offset, MVT::i32);
+
+  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
+}
+
+static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
+  switch (Opc) {
+  case ISD::FMAXNUM:
+    return AMDGPUISD::FMAX3;
+  case AMDGPUISD::SMAX:
+    return AMDGPUISD::SMAX3;
+  case AMDGPUISD::UMAX:
+    return AMDGPUISD::UMAX3;
+  case ISD::FMINNUM:
+    return AMDGPUISD::FMIN3;
+  case AMDGPUISD::SMIN:
+    return AMDGPUISD::SMIN3;
+  case AMDGPUISD::UMIN:
+    return AMDGPUISD::UMIN3;
+  default:
+    llvm_unreachable("Not a min/max opcode");
+  }
+}
+
+SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  unsigned Opc = N->getOpcode();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Only do this if the inner op has one use, since otherwise this would just
+  // increase register pressure for no benefit.
+
+  // max(max(a, b), c)
+  if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
+    SDLoc DL(N);
+    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+                       DL,
+                       N->getValueType(0),
+                       Op0.getOperand(0),
+                       Op0.getOperand(1),
+                       Op1);
+  }
+
+  // max(a, max(b, c))
+  if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
+    SDLoc DL(N);
+    return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+                       DL,
+                       N->getValueType(0),
+                       Op0,
+                       Op1.getOperand(0),
+                       Op1.getOperand(1));
+  }
+
+  return SDValue();
+}
+
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -1114,20 +1379,6 @@
 
   switch (N->getOpcode()) {
     default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
-    case ISD::SELECT_CC: {
-      ConstantSDNode *True, *False;
-      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
-      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
-          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
-          && True->isAllOnesValue()
-          && False->isNullValue()
-          && VT == MVT::i1) {
-        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
-                           N->getOperand(1), N->getOperand(4));
-
-      }
-      break;
-    }
     case ISD::SETCC: {
       SDValue Arg0 = N->getOperand(0);
       SDValue Arg1 = N->getOperand(1);
@@ -1147,6 +1398,17 @@
       }
       break;
     }
+  case ISD::FMAXNUM: // TODO: What about fmax_legacy?
+  case ISD::FMINNUM:
+  case AMDGPUISD::SMAX:
+  case AMDGPUISD::SMIN:
+  case AMDGPUISD::UMAX:
+  case AMDGPUISD::UMIN: {
+    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+        getTargetMachine().getOptLevel() > CodeGenOpt::None)
+      return performMin3Max3Combine(N, DCI);
+    break;
+  }
 
   case AMDGPUISD::CVT_F32_UBYTE0:
   case AMDGPUISD::CVT_F32_UBYTE1:
@@ -1171,16 +1433,151 @@
 
   case ISD::UINT_TO_FP: {
     return performUCharToFloatCombine(N, DCI);
-  }
-  }
 
+  case ISD::FADD: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::f32)
+      break;
+
+    SDValue LHS = N->getOperand(0);
+    SDValue RHS = N->getOperand(1);
+
+    // These should really be instruction patterns, but writing patterns with
+    // source modifiers is a pain.
+
+    // fadd (fadd (a, a), b) -> mad 2.0, a, b
+    if (LHS.getOpcode() == ISD::FADD) {
+      SDValue A = LHS.getOperand(0);
+      if (A == LHS.getOperand(1)) {
+        const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32);
+        return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS);
+      }
+    }
+
+    // fadd (b, fadd (a, a)) -> mad 2.0, a, b
+    if (RHS.getOpcode() == ISD::FADD) {
+      SDValue A = RHS.getOperand(0);
+      if (A == RHS.getOperand(1)) {
+        const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32);
+        return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS);
+      }
+    }
+
+    break;
+  }
+  case ISD::FSUB: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    EVT VT = N->getValueType(0);
+
+    // Try to get the fneg to fold into the source modifier. This undoes generic
+    // DAG combines and folds them into the mad.
+    if (VT == MVT::f32) {
+      SDValue LHS = N->getOperand(0);
+      SDValue RHS = N->getOperand(1);
+
+      if (LHS.getOpcode() == ISD::FMUL) {
+        // (fsub (fmul a, b), c) -> mad a, b, (fneg c)
+
+        SDValue A = LHS.getOperand(0);
+        SDValue B = LHS.getOperand(1);
+        SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS);
+
+        return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
+      }
+
+      if (RHS.getOpcode() == ISD::FMUL) {
+        // (fsub c, (fmul a, b)) -> mad (fneg a), b, c
+
+        SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0));
+        SDValue B = RHS.getOperand(1);
+        SDValue C = LHS;
+
+        return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
+      }
+
+      if (LHS.getOpcode() == ISD::FADD) {
+        // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
+
+        SDValue A = LHS.getOperand(0);
+        if (A == LHS.getOperand(1)) {
+          const SDValue Two = DAG.getTargetConstantFP(2.0, MVT::f32);
+          SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);
+
+          return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS);
+        }
+      }
+
+      if (RHS.getOpcode() == ISD::FADD) {
+        // (fsub c, (fadd a, a)) -> mad -2.0, a, c
+
+        SDValue A = RHS.getOperand(0);
+        if (A == RHS.getOperand(1)) {
+          const SDValue NegTwo = DAG.getTargetConstantFP(-2.0, MVT::f32);
+          return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS);
+        }
+      }
+    }
+
+    break;
+  }
+  }
+  case ISD::LOAD:
+  case ISD::STORE:
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE:
+  case ISD::ATOMIC_CMP_SWAP:
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
+    if (DCI.isBeforeLegalize())
+      break;
+
+    MemSDNode *MemNode = cast<MemSDNode>(N);
+    SDValue Ptr = MemNode->getBasePtr();
+
+    // TODO: We could also do this for multiplies.
+    unsigned AS = MemNode->getAddressSpace();
+    if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
+      SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
+      if (NewPtr) {
+        SmallVector<SDValue, 8> NewOps;
+        for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I)
+          NewOps.push_back(MemNode->getOperand(I));
+
+        NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
+        return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
+      }
+    }
+    break;
+  }
+  }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
 
 /// \brief Test if RegClass is one of the VSrc classes
 static bool isVSrc(unsigned RegClass) {
-  return AMDGPU::VSrc_32RegClassID == RegClass ||
-         AMDGPU::VSrc_64RegClassID == RegClass;
+  switch(RegClass) {
+    default: return false;
+    case AMDGPU::VSrc_32RegClassID:
+    case AMDGPU::VCSrc_32RegClassID:
+    case AMDGPU::VSrc_64RegClassID:
+    case AMDGPU::VCSrc_64RegClassID:
+      return true;
+  }
 }
 
 /// \brief Test if RegClass is one of the SSrc classes
@@ -1227,8 +1624,8 @@
                                bool &ScalarSlotUsed) const {
 
   MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
-  const SIInstrInfo *TII =
-    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      getTargetMachine().getSubtargetImpl()->getInstrInfo());
   if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
     return false;
 
@@ -1262,8 +1659,8 @@
 
 const TargetRegisterClass *SITargetLowering::getRegClassForNode(
                                    SelectionDAG &DAG, const SDValue &Op) const {
-  const SIInstrInfo *TII =
-    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      getTargetMachine().getSubtargetImpl()->getInstrInfo());
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
 
   if (!Op->isMachineOpcode()) {
@@ -1292,10 +1689,9 @@
     // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
     // class, then the register class for the value could be either a
     // VReg or and SReg.  In order to get a more accurate
-    if (OpClassID == AMDGPU::VSrc_32RegClassID ||
-        OpClassID == AMDGPU::VSrc_64RegClassID) {
+    if (isVSrc(OpClassID))
       return getRegClassForNode(DAG, Op.getOperand(0));
-    }
+
     return TRI.getRegClass(OpClassID);
   case AMDGPU::EXTRACT_SUBREG: {
     int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -1315,7 +1711,8 @@
 /// \brief Does "Op" fit into register class "RegClass" ?
 bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                     unsigned RegClass) const {
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
   if (!RC) {
     return false;
@@ -1323,37 +1720,6 @@
   return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
 }
 
-/// \brief Make sure that we don't exeed the number of allowed scalars
-void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
-                                       unsigned RegClass,
-                                       bool &ScalarSlotUsed) const {
-
-  // First map the operands register class to a destination class
-  if (RegClass == AMDGPU::VSrc_32RegClassID)
-    RegClass = AMDGPU::VReg_32RegClassID;
-  else if (RegClass == AMDGPU::VSrc_64RegClassID)
-    RegClass = AMDGPU::VReg_64RegClassID;
-  else
-    return;
-
-  // Nothing to do if they fit naturally
-  if (fitsRegClass(DAG, Operand, RegClass))
-    return;
-
-  // If the scalar slot isn't used yet use it now
-  if (!ScalarSlotUsed) {
-    ScalarSlotUsed = true;
-    return;
-  }
-
-  // This is a conservative aproach. It is possible that we can't determine the
-  // correct register class and copy too often, but better safe than sorry.
-  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
-  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
-                                    Operand.getValueType(), Operand, RC);
-  Operand = SDValue(Node, 0);
-}
-
 /// \returns true if \p Node's operands are different from the SDValue list
 /// \p Ops
 static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
@@ -1365,14 +1731,15 @@
   return false;
 }
 
-/// \brief Try to fold the Nodes operands into the Node
-SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
-                                       SelectionDAG &DAG) const {
-
+/// TODO: This needs to be removed. Its current primary purpose is to fold
+/// immediates into operands when legal. The legalization parts are redundant
+/// with SIInstrInfo::legalizeOperands which is called in a post-isel hook.
+SDNode *SITargetLowering::legalizeOperands(MachineSDNode *Node,
+                                           SelectionDAG &DAG) const {
   // Original encoding (either e32 or e64)
   int Opcode = Node->getMachineOpcode();
-  const SIInstrInfo *TII =
-    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      getTargetMachine().getSubtargetImpl()->getInstrInfo());
   const MCInstrDesc *Desc = &TII->get(Opcode);
 
   unsigned NumDefs = Desc->getNumDefs();
@@ -1385,13 +1752,6 @@
   assert(!DescRev || DescRev->getNumDefs() == NumDefs);
   assert(!DescRev || DescRev->getNumOperands() == NumOps);
 
-  // e64 version if available, -1 otherwise
-  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
-  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64);
-  int InputModifiers[3] = {0};
-
-  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
-
   int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
   bool HaveVSrc = false, HaveSSrc = false;
 
@@ -1421,9 +1781,17 @@
   // No scalar allowed when we have both VSrc and SSrc
   bool ScalarSlotUsed = HaveVSrc && HaveSSrc;
 
+  // If this instruction has an implicit use of VCC, then it can't use the
+  // constant bus.
+  for (unsigned i = 0, e = Desc->getNumImplicitUses(); i != e; ++i) {
+    if (Desc->ImplicitUses[i] == AMDGPU::VCC) {
+      ScalarSlotUsed = true;
+      break;
+    }
+  }
+
   // Second go over the operands and try to fold them
   std::vector<SDValue> Ops;
-  bool Promote2e64 = false;
   for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
        i != e && Op < NumOps; ++i, ++Op) {
 
@@ -1438,11 +1806,9 @@
     // Is this a VSrc or SSrc operand?
     unsigned RegClass = Desc->OpInfo[Op].RegClass;
     if (isVSrc(RegClass) || isSSrc(RegClass)) {
-      // Try to fold the immediates
-      if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
-        // Folding didn't work, make sure we don't hit the SReg limit.
-        ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
-      }
+      // Try to fold the immediates. If this ends up with multiple constant bus
+      // uses, it will be legalized later.
+      foldImm(Ops[i], Immediate, ScalarSlotUsed);
       continue;
     }
 
@@ -1464,66 +1830,6 @@
         continue;
       }
     }
-
-    if (Immediate)
-      continue;
-
-    if (DescE64) {
-      // Test if it makes sense to switch to e64 encoding
-      unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
-      if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
-        continue;
-
-      int32_t TmpImm = -1;
-      if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
-          (!fitsRegClass(DAG, Ops[i], RegClass) &&
-           fitsRegClass(DAG, Ops[1], OtherRegClass))) {
-
-        // Switch to e64 encoding
-        Immediate = -1;
-        Promote2e64 = true;
-        Desc = DescE64;
-        DescE64 = nullptr;
-      }
-    }
-
-    if (!DescE64 && !Promote2e64)
-      continue;
-    if (!Operand.isMachineOpcode())
-      continue;
-    if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) {
-      Ops.pop_back();
-      Ops.push_back(Operand.getOperand(0));
-      InputModifiers[i] = 1;
-      Promote2e64 = true;
-      if (!DescE64)
-        continue;
-      Desc = DescE64;
-      DescE64 = nullptr;
-    }
-    else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
-      Ops.pop_back();
-      Ops.push_back(Operand.getOperand(0));
-      InputModifiers[i] = 2;
-      Promote2e64 = true;
-      if (!DescE64)
-        continue;
-      Desc = DescE64;
-      DescE64 = nullptr;
-    }
-  }
-
-  if (Promote2e64) {
-    std::vector<SDValue> OldOps(Ops);
-    Ops.clear();
-    for (unsigned i = 0; i < OldOps.size(); ++i) {
-      // src_modifier
-      Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32));
-      Ops.push_back(OldOps[i]);
-    }
-    // Add the modifier flags while promoting
-    for (unsigned i = 0; i < 2; ++i)
-      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
   }
 
   // Add optional chain and glue
@@ -1632,46 +1938,182 @@
   }
 }
 
+/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
+/// with frame index operands.
+/// LLVM assumes that inputs to these instructions are registers.
+void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
+                                                     SelectionDAG &DAG) const {
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
+    if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+      Ops.push_back(Node->getOperand(i));
+      continue;
+    }
+
+    SDLoc DL(Node);
+    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
+                                     Node->getOperand(i).getValueType(),
+                                     Node->getOperand(i)), 0));
+  }
+
+  DAG.UpdateNodeOperands(Node, Ops);
+}
+
 /// \brief Fold the instructions after selecting them.
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      getTargetMachine().getSubtargetImpl()->getInstrInfo());
   Node = AdjustRegClass(Node, DAG);
 
   if (TII->isMIMG(Node->getMachineOpcode()))
     adjustWritemask(Node, DAG);
 
-  return foldOperands(Node, DAG);
+  if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
+      Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
+    legalizeTargetIndependentNode(Node, DAG);
+    return Node;
+  }
+
+  return legalizeOperands(Node, DAG);
 }
 
 /// \brief Assign the register class depending on the number of
 /// bits set in the writemask
 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                      SDNode *Node) const {
-  const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-  if (!TII->isMIMG(MI->getOpcode()))
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      getTargetMachine().getSubtargetImpl()->getInstrInfo());
+
+  TII->legalizeOperands(MI);
+
+  if (TII->isMIMG(MI->getOpcode())) {
+    unsigned VReg = MI->getOperand(0).getReg();
+    unsigned Writemask = MI->getOperand(1).getImm();
+    unsigned BitsSet = 0;
+    for (unsigned i = 0; i < 4; ++i)
+      BitsSet += Writemask & (1 << i) ? 1 : 0;
+
+    const TargetRegisterClass *RC;
+    switch (BitsSet) {
+    default: return;
+    case 1:  RC = &AMDGPU::VReg_32RegClass; break;
+    case 2:  RC = &AMDGPU::VReg_64RegClass; break;
+    case 3:  RC = &AMDGPU::VReg_96RegClass; break;
+    }
+
+    unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
+    MI->setDesc(TII->get(NewOpcode));
+    MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+    MRI.setRegClass(VReg, RC);
     return;
-
-  unsigned VReg = MI->getOperand(0).getReg();
-  unsigned Writemask = MI->getOperand(1).getImm();
-  unsigned BitsSet = 0;
-  for (unsigned i = 0; i < 4; ++i)
-    BitsSet += Writemask & (1 << i) ? 1 : 0;
-
-  const TargetRegisterClass *RC;
-  switch (BitsSet) {
-  default: return;
-  case 1:  RC = &AMDGPU::VReg_32RegClass; break;
-  case 2:  RC = &AMDGPU::VReg_64RegClass; break;
-  case 3:  RC = &AMDGPU::VReg_96RegClass; break;
   }
 
-  unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
-  MI->setDesc(TII->get(NewOpcode));
-  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
-  MRI.setRegClass(VReg, RC);
+  // Replace unused atomics with the no return version.
+  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
+  if (NoRetAtomicOp != -1) {
+    if (!Node->hasAnyUseOfValue(0)) {
+      MI->setDesc(TII->get(NoRetAtomicOp));
+      MI->RemoveOperand(0);
+    }
+
+    return;
+  }
+}
+
+static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
+  SDValue K = DAG.getTargetConstant(Val, MVT::i32);
+  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
+}
+
+MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
+                                                SDLoc DL,
+                                                SDValue Ptr) const {
+#if 1
+    // XXX - Workaround for moveToVALU not handling different register class
+    // inserts for REG_SEQUENCE.
+
+    // Build the half of the subregister with the constants.
+    const SDValue Ops0[] = {
+      DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32),
+      buildSMovImm32(DAG, DL, 0),
+      DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
+      buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32),
+      DAG.getTargetConstant(AMDGPU::sub1, MVT::i32)
+    };
+
+    SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                                  MVT::v2i32, Ops0), 0);
+
+    // Combine the constants and the pointer.
+    const SDValue Ops1[] = {
+      DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
+      Ptr,
+      DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
+      SubRegHi,
+      DAG.getTargetConstant(AMDGPU::sub2_sub3, MVT::i32)
+    };
+
+    return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
+#else
+    const SDValue Ops[] = {
+      DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
+      Ptr,
+      DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
+      buildSMovImm32(DAG, DL, 0),
+      DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
+      buildSMovImm32(DAG, DL, AMDGPU::RSRC_DATA_FORMAT >> 32),
+      DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
+    };
+
+    return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+
+#endif
+}
+
+/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+///        of the resource descriptor) to create an offset, which is added to the
+///        resource pointer.
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
+                                           SDLoc DL,
+                                           SDValue Ptr,
+                                           uint32_t RsrcDword1,
+                                           uint64_t RsrcDword2And3) const {
+  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
+  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
+  if (RsrcDword1) {
+    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
+                                     DAG.getConstant(RsrcDword1, MVT::i32)), 0);
+  }
+
+  SDValue DataLo = buildSMovImm32(DAG, DL,
+                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
+  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
+
+  const SDValue Ops[] = {
+    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
+    PtrLo,
+    DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
+    PtrHi,
+    DAG.getTargetConstant(AMDGPU::sub1, MVT::i32),
+    DataLo,
+    DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
+    DataHi,
+    DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
+  };
+
+  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+}
+
+MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
+                                                  SDLoc DL,
+                                                  SDValue Ptr) const {
+  uint64_t Rsrc = AMDGPU::RSRC_DATA_FORMAT | AMDGPU::RSRC_TID_ENABLE |
+                  0xffffffff; // Size
+
+  return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
 }
 
 MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
@@ -1699,12 +2141,21 @@
       return N;
     }
     ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
-    SDValue Ops[] = {
-      SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
-                                 DAG.getConstant(0, MVT::i64)), 0),
-      N->getOperand(0),
-      DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
-    };
+
+    const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64);
+    SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0);
+    MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr);
+
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(SDValue(RSrc, 0));
+    Ops.push_back(N->getOperand(0));
+    Ops.push_back(DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32));
+
+    // Copy remaining operands so we keep any chain and glue nodes that follow
+    // the normal operands.
+    for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I)
+      Ops.push_back(N->getOperand(I));
+
     return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
   }
   }

diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index e25323a..7bf406e 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SIISELLOWERING_H
-#define SIISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H
+#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H
 
 #include "AMDGPUISelLowering.h"
 #include "SIInstrInfo.h"
@@ -25,9 +25,21 @@
                          SDValue Chain, unsigned Offset, bool Signed) const;
   SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
                                SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                             SelectionDAG &DAG) const override;
+
+  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
   bool foldImm(SDValue &Operand, int32_t &Immediate,
@@ -36,20 +48,37 @@
                                                 const SDValue &Op) const;
   bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                     unsigned RegClass) const;
-  void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
-                       unsigned RegClass, bool &ScalarSlotUsed) const;
 
-  SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
+  SDNode *legalizeOperands(MachineSDNode *N, SelectionDAG &DAG) const;
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
   MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
 
   static SDValue performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI);
+  SDValue performSHLPtrCombine(SDNode *N,
+                               unsigned AS,
+                               DAGCombinerInfo &DCI) const;
+
+  SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
 
 public:
   SITargetLowering(TargetMachine &tm);
-  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
-                                     bool *IsFast) const override;
+
+  bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
+                          EVT /*VT*/) const override;
+
+  bool isLegalAddressingMode(const AddrMode &AM,
+                             Type *Ty) const override;
+
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+                                      unsigned Align,
+                                      bool *IsFast) const override;
+
+  EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                          unsigned SrcAlign, bool IsMemset,
+                          bool ZeroMemset,
+                          bool MemcpyStrSrc,
+                          MachineFunction &MF) const override;
 
   TargetLoweringBase::LegalizeTypeAction
   getPreferredVectorAction(EVT VT) const override;
@@ -77,8 +106,19 @@
   int32_t analyzeImmediate(const SDNode *N) const;
   SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
                                unsigned Reg, EVT VT) const override;
+  void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
+
+  MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const;
+  MachineSDNode *buildRSRC(SelectionDAG &DAG,
+                           SDLoc DL,
+                           SDValue Ptr,
+                           uint32_t RsrcDword1,
+                           uint64_t RsrcDword2And3) const;
+  MachineSDNode *buildScratchRSRC(SelectionDAG &DAG,
+                                  SDLoc DL,
+                                  SDValue Ptr) const;
 };
 
 } // End namespace llvm
 
-#endif //SIISELLOWERING_H
+#endif

diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 1733326..712d97d 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp

@@ -17,6 +17,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -273,17 +275,17 @@
       continue;
 
     NeedWait = true;
-    
+
     if (Ordered[i]) {
       unsigned Value = LastIssued.Array[i] - Required.Array[i];
 
-      // adjust the value to the real hardware posibilities
+      // Adjust the value to the real hardware possibilities.
       Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
 
     } else
       Counts.Array[i] = 0;
 
-    // Remember on what we have waited on
+    // Remember what we have waited on.
     WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
   }
 
@@ -346,8 +348,9 @@
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
   bool Changes = false;
 
-  TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
-  TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TRI =
+      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 
   MRI = &MF.getRegInfo();
 

diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 7cae9fc..10e0a3f 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td

@@ -24,7 +24,11 @@
   field bits<1> VOP3 = 0;
   field bits<1> VOPC = 0;
   field bits<1> SALU = 0;
+  field bits<1> MUBUF = 0;
+  field bits<1> MTBUF = 0;
+  field bits<1> FLAT = 0;
 
+  // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = VM_CNT;
   let TSFlags{1} = EXP_CNT;
   let TSFlags{2} = LGKM_CNT;
@@ -35,38 +39,60 @@
   let TSFlags{7} = VOP3;
   let TSFlags{8} = VOPC;
   let TSFlags{9} = SALU;
+  let TSFlags{10} = MUBUF;
+  let TSFlags{11} = MTBUF;
+  let TSFlags{12} = FLAT;
+
+  // Most instructions require adjustments after selection to satisfy
+  // operand requirements.
+  let hasPostISelHook = 1;
 }
 
-class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
+class Enc32 {
 
   field bits<32> Inst;
-  let Size = 4;
+  int Size = 4;
 }
 
-class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
-    InstSI <outs, ins, asm, pattern> {
+class Enc64 {
 
   field bits<64> Inst;
-  let Size = 8;
+  int Size = 8;
+}
+
+class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP1 = 1;
 }
 
 class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc64 <outs, ins, asm, pattern> {
+    InstSI <outs, ins, asm, pattern> {
 
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
   let UseNamedOperandTable = 1;
+  // Using complex patterns gives VOP3 patterns a very high complexity rating,
+  // but standalone patterns are almost always preferred, so we need to adjust
+  // the priority lower.  The goal is to use a high number to reduce complexity
+  // to zero (or less than zero).
+  let AddedComplexity = -1000;
+
   let VOP3 = 1;
+
+  int Size = 8;
+  let Uses = [EXEC];
 }
 
 //===----------------------------------------------------------------------===//
 // Scalar operations
 //===----------------------------------------------------------------------===//
 
-class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc32<outs, ins, asm, pattern> {
+class SOP1e <bits<8> op> : Enc32 {
 
   bits<7> SDST;
   bits<8> SSRC0;
@@ -75,16 +101,10 @@
   let Inst{15-8} = op;
   let Inst{22-16} = SDST;
   let Inst{31-23} = 0x17d; //encoding;
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
 }
 
-class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc32 <outs, ins, asm, pattern> {
-  
+class SOP2e <bits<7> op> : Enc32 {
+
   bits<7> SDST;
   bits<8> SSRC0;
   bits<8> SSRC1;
@@ -94,15 +114,9 @@
   let Inst{22-16} = SDST;
   let Inst{29-23} = op;
   let Inst{31-30} = 0x2; // encoding
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
 }
 
-class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-  Enc32<outs, ins, asm, pattern> {
+class SOPCe <bits<7> op> : Enc32 {
 
   bits<8> SSRC0;
   bits<8> SSRC1;
@@ -111,113 +125,137 @@
   let Inst{15-8} = SSRC1;
   let Inst{22-16} = op;
   let Inst{31-23} = 0x17e;
-
-  let DisableEncoding = "$dst";
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
 }
 
-class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
-   Enc32 <outs, ins , asm, pattern> {
+class SOPKe <bits<5> op> : Enc32 {
 
   bits <7> SDST;
   bits <16> SIMM16;
-  
+
   let Inst{15-0} = SIMM16;
   let Inst{22-16} = SDST;
   let Inst{27-23} = op;
   let Inst{31-28} = 0xb; //encoding
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
 }
 
-class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
-  (outs),
-  ins,
-  asm,
-  pattern > {
+class SOPPe <bits<7> op> : Enc32 {
 
-  bits <16> SIMM16;
+  bits <16> simm16;
 
-  let Inst{15-0} = SIMM16;
+  let Inst{15-0} = simm16;
   let Inst{22-16} = op;
   let Inst{31-23} = 0x17f; // encoding
-
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let SALU = 1;
 }
 
-class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
-            list<dag> pattern> : Enc32<outs, ins, asm, pattern> {
+class SMRDe <bits<5> op, bits<1> imm> : Enc32 {
 
   bits<7> SDST;
   bits<7> SBASE;
   bits<8> OFFSET;
-  
+
   let Inst{7-0} = OFFSET;
   let Inst{8} = imm;
   let Inst{14-9} = SBASE{6-1};
   let Inst{21-15} = SDST;
   let Inst{26-22} = op;
   let Inst{31-27} = 0x18; //encoding
+}
+
+class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern>, SOP1e <op> {
+  // SALU-flagged format: no loads, no stores and no side effects.
+  let SALU = 1;
+  let hasSideEffects = 0;
+  let mayStore = 0;
+  let mayLoad = 0;
+}
+
+class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, SOP2e<op> {
+  // SALU-flagged format with the named operand table enabled.
+  let UseNamedOperandTable = 1;
+
+  let SALU = 1;
+  let hasSideEffects = 0;
+  let mayStore = 0;
+  let mayLoad = 0;
+}
+
+class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+  InstSI<outs, ins, asm, pattern>, SOPCe <op> {
+  // SALU-flagged format; $dst is kept out of the instruction encoding.
+  let DisableEncoding = "$dst";
+  let UseNamedOperandTable = 1;
+
+  let SALU = 1;
+  let hasSideEffects = 0;
+  let mayStore = 0;
+  let mayLoad = 0;
+}
+
+class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, SOPKe<op> {
+  // SALU-flagged format with the named operand table enabled.
+  let UseNamedOperandTable = 1;
+
+  let SALU = 1;
+  let hasSideEffects = 0;
+  let mayStore = 0;
+  let mayLoad = 0;
+}
+
+class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern = []> :
+    InstSI <(outs), ins, asm, pattern>, SOPPe <op> {
+  // SALU-flagged format with no outputs; the pattern list defaults to empty.
+  let UseNamedOperandTable = 1;
+  let isCodeGenOnly = 0;
+
+  let SALU = 1;
+  let hasSideEffects = 0;
+  let mayStore = 0;
+  let mayLoad = 0;
+}
+
+class SMRD <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern> {
 
   let LGKM_CNT = 1;
   let SMRD = 1;
+  let mayStore = 0;
+  let mayLoad = 1;
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
 }
 
 //===----------------------------------------------------------------------===//
 // Vector ALU operations
 //===----------------------------------------------------------------------===//
-    
-let Uses = [EXEC] in {
 
-class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc32 <outs, ins, asm, pattern> {
+class VOP1e <bits<8> op> : Enc32 {
 
   bits<8> VDST;
   bits<9> SRC0;
-  
+
   let Inst{8-0} = SRC0;
   let Inst{16-9} = op;
   let Inst{24-17} = VDST;
   let Inst{31-25} = 0x3f; //encoding
-  
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let VOP1 = 1;
 }
 
-class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc32 <outs, ins, asm, pattern> {
+class VOP2e <bits<6> op> : Enc32 {
 
   bits<8> VDST;
   bits<9> SRC0;
   bits<8> VSRC1;
-  
+
   let Inst{8-0} = SRC0;
   let Inst{16-9} = VSRC1;
   let Inst{24-17} = VDST;
   let Inst{30-25} = op;
   let Inst{31} = 0x0; //encoding
-  
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let UseNamedOperandTable = 1;
-  let VOP2 = 1;
 }
 
-class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    VOP3Common <outs, ins, asm, pattern> {
+class VOP3e <bits<9> op> : Enc64 {
 
   bits<8> dst;
   bits<2> src0_modifiers;
@@ -243,11 +281,9 @@
   let Inst{61} = src0_modifiers{0};
   let Inst{62} = src1_modifiers{0};
   let Inst{63} = src2_modifiers{0};
-
 }
 
-class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    VOP3Common <outs, ins, asm, pattern> {
+class VOP3be <bits<9> op> : Enc64 {
 
   bits<8> dst;
   bits<2> src0_modifiers;
@@ -270,11 +306,9 @@
   let Inst{61} = src0_modifiers{0};
   let Inst{62} = src1_modifiers{0};
   let Inst{63} = src2_modifiers{0};
-
 }
 
-class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
-    Enc32 <(outs VCCReg:$dst), ins, asm, pattern> {
+class VOPCe <bits<8> op> : Enc32 {
 
   bits<9> SRC0;
   bits<8> VSRC1;
@@ -283,16 +317,9 @@
   let Inst{16-9} = VSRC1;
   let Inst{24-17} = op;
   let Inst{31-25} = 0x3e;
- 
-  let DisableEncoding = "$dst";
-  let mayLoad = 0;
-  let mayStore = 0;
-  let hasSideEffects = 0;
-  let VOPC = 1;
 }
 
-class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc32 <outs, ins, asm, pattern> {
+class VINTRPe <bits<2> op> : Enc32 {
 
   bits<8> VDST;
   bits<8> VSRC;
@@ -305,22 +332,9 @@
   let Inst{17-16} = op;
   let Inst{25-18} = VDST;
   let Inst{31-26} = 0x32; // encoding
-
-  let neverHasSideEffects = 1;
-  let mayLoad = 1;
-  let mayStore = 0;
 }
 
-} // End Uses = [EXEC]
-
-//===----------------------------------------------------------------------===//
-// Vector I/O operations
-//===----------------------------------------------------------------------===//
-
-let Uses = [EXEC] in {
-
-class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc64 <outs, ins, asm, pattern> {
+class DSe <bits<8> op> : Enc64 {
 
   bits<8> vdst;
   bits<1> gds;
@@ -339,12 +353,9 @@
   let Inst{47-40} = data0;
   let Inst{55-48} = data1;
   let Inst{63-56} = vdst;
-
-  let LGKM_CNT = 1;
 }
 
-class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc64<outs, ins, asm, pattern> {
+class MUBUFe <bits<7> op> : Enc64 {
 
   bits<12> offset;
   bits<1> offen;
@@ -373,16 +384,9 @@
   let Inst{54} = slc;
   let Inst{55} = tfe;
   let Inst{63-56} = soffset;
-
-  let VM_CNT = 1;
-  let EXP_CNT = 1;
-
-  let neverHasSideEffects = 1;
-  let UseNamedOperandTable = 1;
 }
 
-class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc64<outs, ins, asm, pattern> {
+class MTBUFe <bits<3> op> : Enc64 {
 
   bits<8> VDATA;
   bits<12> OFFSET;
@@ -413,15 +417,9 @@
   let Inst{54} = SLC;
   let Inst{55} = TFE;
   let Inst{63-56} = SOFFSET;
-
-  let VM_CNT = 1;
-  let EXP_CNT = 1;
-
-  let neverHasSideEffects = 1;
 }
 
-class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
-    Enc64 <outs, ins, asm, pattern> {
+class MIMGe <bits<7> op> : Enc64 {
 
   bits<8> VDATA;
   bits<4> DMASK;
@@ -434,7 +432,7 @@
   bits<1> SLC;
   bits<8> VADDR;
   bits<7> SRSRC;
-  bits<7> SSAMP; 
+  bits<7> SSAMP;
 
   let Inst{11-8} = DMASK;
   let Inst{12} = UNORM;
@@ -450,19 +448,29 @@
   let Inst{47-40} = VDATA;
   let Inst{52-48} = SRSRC{6-2};
   let Inst{57-53} = SSAMP{6-2};
-
-  let VM_CNT = 1;
-  let EXP_CNT = 1;
-  let MIMG = 1;
 }
 
-def EXP : Enc64<
-  (outs),
-  (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
-       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
-  "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
-  [] > {
+class FLATe<bits<7> op> : Enc64 {
+  bits<8> addr;
+  bits<8> data;
+  bits<8> vdst;
+  bits<1> slc;
+  bits<1> glc;
+  bits<1> tfe;
 
+  // 15-0 is reserved.
+  let Inst{16} = glc;
+  let Inst{17} = slc;
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x37; // Encoding.
+  let Inst{39-32} = addr;
+  let Inst{47-40} = data;
+  // 54-48 is reserved.
+  let Inst{55} = tfe;
+  let Inst{63-56} = vdst;
+}
+
+class EXPe : Enc64 {
   bits<4> EN;
   bits<6> TGT;
   bits<1> COMPR;
@@ -483,8 +491,110 @@
   let Inst{47-40} = VSRC1;
   let Inst{55-48} = VSRC2;
   let Inst{63-56} = VSRC3;
-
-  let EXP_CNT = 1;
 }
 
+let Uses = [EXEC] in {
+
+class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    VOP1Common <outs, ins, asm, pattern>,
+    VOP1e<op>;
+
+class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, VOP2e<op> {
+  // VOP2-flagged format: no memory access and no side effects.
+  let VOP2 = 1;
+  let UseNamedOperandTable = 1;
+  let hasSideEffects = 0;
+  let mayStore = 0;
+  let mayLoad = 0;
+}
+
+class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    VOP3Common <outs, ins, asm, pattern>, VOP3e<op>;
+
+class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    VOP3Common <outs, ins, asm, pattern>, VOP3be<op>;
+
+class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
+    InstSI <(outs VCCReg:$dst), ins, asm, pattern>, VOPCe <op> {
+  // The only output is VCCReg:$dst, which is left out of the encoding.
+  let DisableEncoding = "$dst";
+  let VOPC = 1;
+  let UseNamedOperandTable = 1;
+  let hasSideEffects = 0;
+  let mayStore = 0;
+  let mayLoad = 0;
+}
+
+class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, VINTRPe<op> {
+  let hasSideEffects = 0;
+  let mayStore = 0;
+  let mayLoad = 1;
+}
+
+} // End Uses = [EXEC]
+
+//===----------------------------------------------------------------------===//
+// Vector I/O operations
+//===----------------------------------------------------------------------===//
+
+let Uses = [EXEC] in {
+
+class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, DSe<op> {
+  // Sets the LGKM_CNT flag and enables the named operand table.
+  let UseNamedOperandTable = 1;
+  let LGKM_CNT = 1;
+}
+
+class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern>, MUBUFe <op> {
+  // MUBUF-flagged format; also sets the VM_CNT and EXP_CNT flags.
+  let UseNamedOperandTable = 1;
+  let hasSideEffects = 0;
+
+  let MUBUF = 1;
+  let EXP_CNT = 1;
+  let VM_CNT = 1;
+}
+
+class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern> {
+  // MTBUF-flagged format; also sets the VM_CNT and EXP_CNT flags.
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+  let MTBUF = 1;
+
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+}
+
+class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI<outs, ins, asm, pattern>, FLATe <op> {
+  let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+
+  let Uses = [EXEC, FLAT_SCR]; // M0
+
+  // Internally, FLAT instructions are executed as both an LDS and a
+  // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT
+  // and are not considered done until both have been decremented.
+  let VM_CNT = 1;
+  let LGKM_CNT = 1;
+  let FLAT = 1;
+}
+
+class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern>, MIMGe <op> {
+  // MIMG-flagged format; also sets the VM_CNT and EXP_CNT flags.
+  let MIMG = 1;
+  let EXP_CNT = 1;
+  let VM_CNT = 1;
+
+  let hasSideEffects = 0; // XXX ????
+}
+
+
+
 } // End Uses = [EXEC]

diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 455c890..8343362 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp

@@ -17,10 +17,13 @@
 #include "AMDGPUTargetMachine.h"
 #include "SIDefines.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
 
 using namespace llvm;
 
@@ -32,6 +35,259 @@
 // TargetInstrInfo callbacks
 //===----------------------------------------------------------------------===//
 
+static unsigned getNumOperandsNoGlue(SDNode *Node) {
+  unsigned NumOps = Node->getNumOperands(); // Trailing glue is not counted.
+  while (NumOps && Node->getOperand(NumOps - 1).getValueType() == MVT::Glue)
+    --NumOps;
+  return NumOps;
+}
+
+static SDValue findChainOperand(SDNode *Load) {
+  SDValue Chain = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
+  assert(Chain.getValueType() == MVT::Other && "Chain missing from load node");
+  return Chain; // The last operand that is not glue.
+}
+
+/// \brief Returns true if both nodes have the same value for the operand
+///        named \p OpName, or if neither node has that operand.
+static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
+  int Idx0 = AMDGPU::getNamedOperandIdx(N0->getMachineOpcode(), OpName);
+  int Idx1 = AMDGPU::getNamedOperandIdx(N1->getMachineOpcode(), OpName);
+
+  const bool Missing0 = Idx0 == -1;
+  const bool Missing1 = Idx1 == -1;
+
+  // Neither opcode has the operand, which counts as a match.
+  if (Missing0 && Missing1)
+    return true;
+
+  // Exactly one opcode has the operand, so the values cannot be compared.
+  if (Missing0 || Missing1)
+    return false;
+
+  // getNamedOperandIdx numbers the operands of the MachineInstr, where the
+  // result is the first operand. The MachineSDNode's operand list does not
+  // contain the result, so the matching SDNode operand sits one index
+  // lower.
+  SDValue Val0 = N0->getOperand(Idx0 - 1);
+  SDValue Val1 = N1->getOperand(Idx1 - 1);
+
+  return Val0 == Val1;
+}
+
+bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
+                                          int64_t &Offset0,
+                                          int64_t &Offset1) const {
+  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
+    return false;
+
+  unsigned Opc0 = Load0->getMachineOpcode();
+  unsigned Opc1 = Load1->getMachineOpcode();
+
+  // Make sure both are actually loads.
+  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
+    return false;
+
+  if (isDS(Opc0) && isDS(Opc1)) {
+    // DS loads: SDNode operand 1 is the base, operand 2 the immediate offset.
+    // FIXME: Handle loads whose operand counts differ.
+    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
+      return false;
+
+    // Check base reg.
+    if (Load0->getOperand(1) != Load1->getOperand(1))
+      return false;
+
+    // Check chain.
+    if (findChainOperand(Load0) != findChainOperand(Load1))
+      return false;
+
+    // Skip read2 / write2 variants for simplicity.
+    // TODO: We should report true if the used offsets are adjacent (excluding
+    // st64 versions).
+    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
+        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
+      return false;
+
+    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
+    return true;
+  }
+
+  if (isSMRD(Opc0) && isSMRD(Opc1)) {
+    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
+
+    // Check base reg.
+    if (Load0->getOperand(0) != Load1->getOperand(0))
+      return false;
+
+    // Check chain.
+    if (findChainOperand(Load0) != findChainOperand(Load1))
+      return false;
+
+    Offset0 = cast<ConstantSDNode>(Load0->getOperand(1))->getZExtValue();
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue();
+    return true;
+  }
+
+  // MUBUF and MTBUF can access the same addresses.
+  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
+    // Operands are compared by name rather than by position, because
+    // MUBUF and MTBUF have vaddr at different indices.
+    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
+        findChainOperand(Load0) != findChainOperand(Load1) ||
+        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
+        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
+      return false;
+
+    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
+    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
+
+    if (OffIdx0 == -1 || OffIdx1 == -1)
+      return false;
+
+    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
+    // include the output in the operand list, but SDNodes don't, we need to
+    // subtract one from the index.
+    --OffIdx0;
+    --OffIdx1;
+
+    SDValue Off0 = Load0->getOperand(OffIdx0);
+    SDValue Off1 = Load1->getOperand(OffIdx1);
+
+    // The offset might be a FrameIndexSDNode.
+    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
+      return false;
+
+    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
+    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+/// Returns true if \p Opc is one of the DS read2/write2 "st64" opcodes.
+/// getLdStBaseRegImmOfs scales the element size by 64 for these.
+static bool isStride64(unsigned Opc) {
+  const bool IsRead2ST64 = Opc == AMDGPU::DS_READ2ST64_B32 ||
+                           Opc == AMDGPU::DS_READ2ST64_B64;
+
+  const bool IsWrite2ST64 = Opc == AMDGPU::DS_WRITE2ST64_B32 ||
+                            Opc == AMDGPU::DS_WRITE2ST64_B64;
+
+  return IsRead2ST64 || IsWrite2ST64;
+}
+
+/// Split \p LdSt into a base register (\p BaseReg) plus an immediate
+/// (\p Offset). Returns false when no such decomposition is produced.
+bool SIInstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt,
+                                       unsigned &BaseReg, unsigned &Offset,
+                                       const TargetRegisterInfo *TRI) const {
+  unsigned Opc = LdSt->getOpcode();
+  if (isDS(Opc)) {
+    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
+                                                      AMDGPU::OpName::offset);
+    if (OffsetImm) {
+      // Normal, single offset LDS instruction.
+      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
+                                                      AMDGPU::OpName::addr);
+      BaseReg = AddrReg->getReg();
+      Offset = OffsetImm->getImm();
+      return true;
+    }
+
+    // The 2 offset instructions use offset0 and offset1 instead. We can treat
+    // these as a load with a single offset if the 2 offsets are consecutive. We
+    // will use this for some partially aligned loads.
+    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
+                                                       AMDGPU::OpName::offset0);
+    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
+                                                       AMDGPU::OpName::offset1);
+
+    uint8_t Offset0 = Offset0Imm->getImm();
+    uint8_t Offset1 = Offset1Imm->getImm();
+
+    // Do not assume offset1 > offset0: bail out below instead of asserting.
+    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
+      // Each of these offsets is in element sized units, so we need to convert
+      // to bytes of the individual reads.
+      unsigned EltSize;
+      if (LdSt->mayLoad())
+        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
+      else {
+        assert(LdSt->mayStore());
+        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
+      }
+
+      if (isStride64(Opc))
+        EltSize *= 64;
+
+      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
+                                                      AMDGPU::OpName::addr);
+      BaseReg = AddrReg->getReg();
+      Offset = EltSize * Offset0;
+      return true;
+    }
+
+    return false;
+  }
+
+  if (isMUBUF(Opc) || isMTBUF(Opc)) {
+    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
+      return false;
+
+    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
+                                                    AMDGPU::OpName::vaddr);
+    if (!AddrReg)
+      return false;
+
+    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
+                                                      AMDGPU::OpName::offset);
+    BaseReg = AddrReg->getReg();
+    Offset = OffsetImm->getImm();
+    return true;
+  }
+
+  if (isSMRD(Opc)) {
+    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
+                                                      AMDGPU::OpName::offset);
+    if (!OffsetImm)
+      return false;
+
+    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
+                                                     AMDGPU::OpName::sbase);
+    BaseReg = SBaseReg->getReg();
+    Offset = OffsetImm->getImm();
+    return true;
+  }
+
+  return false;
+}
+
+/// Cluster at most four loads, and only when both belong to the same group:
+/// DS with DS, SMRD with SMRD, or any mix of MUBUF and MTBUF.
+bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
+                                     MachineInstr *SecondLdSt,
+                                     unsigned NumLoads) const {
+  // TODO: This needs finer tuning
+  if (NumLoads > 4)
+    return false;
+
+  const unsigned FirstOpc = FirstLdSt->getOpcode();
+  const unsigned SecondOpc = SecondLdSt->getOpcode();
+
+  // A pair qualifies only if both opcodes fall into the same group below.
+  const bool BothDS = isDS(FirstOpc) && isDS(SecondOpc);
+  const bool BothSMRD = isSMRD(FirstOpc) && isSMRD(SecondOpc);
+  // MUBUF and MTBUF are interchangeable for this purpose.
+  const bool BothBuffer = (isMUBUF(FirstOpc) || isMTBUF(FirstOpc)) &&
+                          (isMUBUF(SecondOpc) || isMTBUF(SecondOpc));
+
+  return BothDS || BothSMRD || BothBuffer;
+}
+
 void
 SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MI, DebugLoc DL,
@@ -182,6 +438,19 @@
   return Opcode;
 }
 
+/// VGPR spilling is attempted only for compute shaders compiled at -O0.
+static bool shouldTryToSpillVGPRs(MachineFunction *MF) {
+  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+  const bool IsCompute = Info->getShaderType() == ShaderType::COMPUTE;
+
+  // FIXME: Even though it can cause problems, we need to enable
+  // spilling at -O0, since the fast register allocator always
+  // spills registers that are live at the end of blocks.
+  const bool IsO0 = MF->getTarget().getOptLevel() == CodeGenOpt::None;
+
+  return IsCompute && IsO0;
+}
+
 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned SrcReg, bool isKill,
@@ -189,50 +458,43 @@
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
-  unsigned KillFlag = isKill ? RegState::Kill : 0;
+  int Opcode = -1;
 
-  if (RI.hasVGPRs(RC)) {
+  if (RI.isSGPRClass(RC)) {
+    // We are only allowed to create one new instruction when spilling
+    // registers, so we need to use a pseudo instruction for spilling
+    // SGPRs.
+    switch (RC->getSize() * 8) {
+      case 32:  Opcode = AMDGPU::SI_SPILL_S32_SAVE;  break;
+      case 64:  Opcode = AMDGPU::SI_SPILL_S64_SAVE;  break;
+      case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
+      case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
+      case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
+    }
+  } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+    switch(RC->getSize() * 8) {
+      case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
+      case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
+      case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
+      case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
+      case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
+      case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
+    }
+  }
+
+  if (Opcode != -1) {
+    FrameInfo->setObjectAlignment(FrameIndex, 4);
+    BuildMI(MBB, MI, DL, get(Opcode))
+            .addReg(SrcReg)
+            .addFrameIndex(FrameIndex);
+  } else {
     LLVMContext &Ctx = MF->getFunction()->getContext();
-    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!");
+    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
+                  " spill register");
     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
             .addReg(SrcReg);
-  } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
-    unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF);
-    unsigned TgtReg = MFI->SpillTracker.LaneVGPR;
-
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg)
-            .addReg(SrcReg, KillFlag)
-            .addImm(Lane);
-    MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane);
-  } else if (RI.isSGPRClass(RC)) {
-    // We are only allowed to create one new instruction when spilling
-    // registers, so we need to use pseudo instruction for vector
-    // registers.
-    //
-    // Reserve a spot in the spill tracker for each sub-register of
-    // the vector register.
-    unsigned NumSubRegs = RC->getSize() / 4;
-    unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs);
-    MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
-                                    FirstLane);
-
-    unsigned Opcode;
-    switch (RC->getSize() * 8) {
-    case 64:  Opcode = AMDGPU::SI_SPILL_S64_SAVE;  break;
-    case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
-    case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
-    case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
-    default: llvm_unreachable("Cannot spill register class");
-    }
-
-    BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
-            .addReg(SrcReg)
-            .addImm(FrameIndex);
-  } else {
-    llvm_unreachable("VGPR spilling not supported");
   }
 }
 
@@ -242,55 +504,138 @@
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
   MachineFunction *MF = MBB.getParent();
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
+  int Opcode = -1;
 
-  if (RI.hasVGPRs(RC)) {
-    LLVMContext &Ctx = MF->getFunction()->getContext();
-    Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!");
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
-            .addImm(0);
-  } else if (RI.isSGPRClass(RC)){
-    unsigned Opcode;
+  if (RI.isSGPRClass(RC)){
     switch(RC->getSize() * 8) {
-    case 32:  Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
-    case 64:  Opcode = AMDGPU::SI_SPILL_S64_RESTORE;  break;
-    case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
-    case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
-    case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
-    default: llvm_unreachable("Cannot spill register class");
+      case 32:  Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break;
+      case 64:  Opcode = AMDGPU::SI_SPILL_S64_RESTORE;  break;
+      case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
+      case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
+      case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
     }
+  } else if(shouldTryToSpillVGPRs(MF) && RI.hasVGPRs(RC)) {
+    switch(RC->getSize() * 8) {
+      case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
+      case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
+      case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
+      case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
+      case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
+      case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
+    }
+  }
 
-    SIMachineFunctionInfo::SpilledReg Spill =
-        MFI->SpillTracker.getSpilledReg(FrameIndex);
-
+  if (Opcode != -1) {
+    FrameInfo->setObjectAlignment(FrameIndex, 4);
     BuildMI(MBB, MI, DL, get(Opcode), DestReg)
-            .addReg(Spill.VGPR)
-            .addImm(FrameIndex);
+            .addFrameIndex(FrameIndex);
   } else {
-    llvm_unreachable("VGPR spilling not supported");
+    LLVMContext &Ctx = MF->getFunction()->getContext();
+    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
+                  " restore register");
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+            .addReg(AMDGPU::VGPR0);
   }
 }
 
-static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
+unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                               RegScavenger *RS, unsigned TmpReg,
+                                               unsigned FrameOffset,
+                                               unsigned Size) const {
+  MachineFunction *MF = MBB.getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &ST = MF->getTarget().getSubtarget<AMDGPUSubtarget>();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
+  unsigned WavefrontSize = ST.getWavefrontSize();
 
-  switch (Op) {
-  case AMDGPU::SI_SPILL_S512_SAVE:
-  case AMDGPU::SI_SPILL_S512_RESTORE:
-    return 16;
-  case AMDGPU::SI_SPILL_S256_SAVE:
-  case AMDGPU::SI_SPILL_S256_RESTORE:
-    return 8;
-  case AMDGPU::SI_SPILL_S128_SAVE:
-  case AMDGPU::SI_SPILL_S128_RESTORE:
-    return 4;
-  case AMDGPU::SI_SPILL_S64_SAVE:
-  case AMDGPU::SI_SPILL_S64_RESTORE:
-    return 2;
-  case AMDGPU::SI_SPILL_S32_RESTORE:
-    return 1;
-  default: llvm_unreachable("Invalid spill opcode");
+  unsigned TIDReg = MFI->getTIDReg();
+  if (!MFI->hasCalculatedTID()) {
+    MachineBasicBlock &Entry = MBB.getParent()->front();
+    MachineBasicBlock::iterator Insert = Entry.front();
+    DebugLoc DL = Insert->getDebugLoc();
+
+    TIDReg = RI.findUnusedVGPR(MF->getRegInfo());
+    if (TIDReg == AMDGPU::NoRegister)
+      return TIDReg;
+
+
+    if (MFI->getShaderType() == ShaderType::COMPUTE &&
+        WorkGroupSize > WavefrontSize) {
+
+      unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X);
+      unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y);
+      unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z);
+      unsigned InputPtrReg =
+          TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR);
+      static const unsigned TIDIGRegs[3] = {
+        TIDIGXReg, TIDIGYReg, TIDIGZReg
+      };
+      for (unsigned Reg : TIDIGRegs) {
+        if (!Entry.isLiveIn(Reg))
+          Entry.addLiveIn(Reg);
+      }
+
+      RS->enterBasicBlock(&Entry);
+      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
+              .addReg(InputPtrReg)
+              .addImm(SI::KernelInputOffsets::NGROUPS_Z);
+      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
+              .addReg(InputPtrReg)
+              .addImm(SI::KernelInputOffsets::NGROUPS_Y);
+
+      // NGROUPS.X * NGROUPS.Y
+      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
+              .addReg(STmp1)
+              .addReg(STmp0);
+      // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
+              .addReg(STmp1)
+              .addReg(TIDIGXReg);
+      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
+              .addReg(STmp0)
+              .addReg(TIDIGYReg)
+              .addReg(TIDReg);
+      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
+              .addReg(TIDReg)
+              .addReg(TIDIGZReg);
+    } else {
+      // Get the wave id
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
+              TIDReg)
+              .addImm(-1)
+              .addImm(0);
+
+      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
+              TIDReg)
+              .addImm(-1)
+              .addReg(TIDReg);
+    }
+
+    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
+            TIDReg)
+            .addImm(2)
+            .addReg(TIDReg);
+    MFI->setTIDReg(TIDReg);
   }
+
+  // Add FrameIndex to LDS offset
+  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
+  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
+          .addImm(LDSOffset)
+          .addReg(TIDReg);
+
+  return TmpReg;
 }
 
 void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
@@ -308,95 +653,102 @@
 }
 
 bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
-  SIMachineFunctionInfo *MFI =
-      MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
   switch (MI->getOpcode()) {
   default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
 
-  // SGPR register spill
-  case AMDGPU::SI_SPILL_S512_SAVE:
-  case AMDGPU::SI_SPILL_S256_SAVE:
-  case AMDGPU::SI_SPILL_S128_SAVE:
-  case AMDGPU::SI_SPILL_S64_SAVE: {
-    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
-    unsigned FrameIndex = MI->getOperand(2).getImm();
+  case AMDGPU::SI_CONSTDATA_PTR: {
+    unsigned Reg = MI->getOperand(0).getReg();
+    unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
+    unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
 
-    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
-      SIMachineFunctionInfo::SpilledReg Spill;
-      unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(),
-                                            &AMDGPU::SGPR_32RegClass, i);
-      Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg);
 
-      BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
-              MI->getOperand(0).getReg())
-              .addReg(SubReg)
-              .addImm(Spill.Lane + i);
-    }
+    // Add 32-bit offset from this instruction to the start of the constant data.
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo)
+            .addReg(RegLo)
+            .addTargetIndex(AMDGPU::TI_CONSTDATA_START)
+            .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit);
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi)
+            .addReg(RegHi)
+            .addImm(0)
+            .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit)
+            .addReg(AMDGPU::SCC, RegState::Implicit);
     MI->eraseFromParent();
     break;
   }
-
-  // SGPR register restore
-  case AMDGPU::SI_SPILL_S512_RESTORE:
-  case AMDGPU::SI_SPILL_S256_RESTORE:
-  case AMDGPU::SI_SPILL_S128_RESTORE:
-  case AMDGPU::SI_SPILL_S64_RESTORE:
-  case AMDGPU::SI_SPILL_S32_RESTORE: {
-    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
-
-    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
-      SIMachineFunctionInfo::SpilledReg Spill;
-      unsigned FrameIndex = MI->getOperand(2).getImm();
-      unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(),
-                                   &AMDGPU::SGPR_32RegClass, i);
-      Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
-
-      BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg)
-              .addReg(MI->getOperand(1).getReg())
-              .addImm(Spill.Lane + i);
-    }
-    insertNOPs(MI, 3);
+  case AMDGPU::SGPR_USE:
+    // This is just a placeholder for register allocation.
     MI->eraseFromParent();
     break;
   }
-  }
   return true;
 }
 
 MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                               bool NewMI) const {
-
-  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
-  if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg())
+  if (MI->getNumOperands() < 3)
     return nullptr;
 
-  // Cannot commute VOP2 if src0 is SGPR.
-  if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() &&
-      RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg())))
-   return nullptr;
+  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src0);
+  assert(Src0Idx != -1 && "Should always have src0 operand");
 
-  if (!MI->getOperand(2).isReg()) {
-    // XXX: Commute instructions with FPImm operands
-    if (NewMI || MI->getOperand(2).isFPImm() ||
+  MachineOperand &Src0 = MI->getOperand(Src0Idx);
+  if (!Src0.isReg())
+    return nullptr;
+
+  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src1);
+  if (Src1Idx == -1)
+    return nullptr;
+
+  MachineOperand &Src1 = MI->getOperand(Src1Idx);
+
+  // Make sure it's legal to commute operands for VOP2.
+  if (isVOP2(MI->getOpcode()) &&
+      (!isOperandLegal(MI, Src0Idx, &Src1) ||
+       !isOperandLegal(MI, Src1Idx, &Src0)))
+    return nullptr;
+
+  if (!Src1.isReg()) {
+    // Allow commuting instructions with Imm or FPImm operands.
+    if (NewMI || (!Src1.isImm() && !Src1.isFPImm()) ||
        (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
       return nullptr;
     }
 
-    // XXX: Commute VOP3 instructions with abs and neg set.
-    if (isVOP3(MI->getOpcode()) &&
-        (MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                        AMDGPU::OpName::abs)).getImm() ||
-         MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                        AMDGPU::OpName::neg)).getImm()))
-      return nullptr;
+    // Be sure to copy the source modifiers to the right place.
+    if (MachineOperand *Src0Mods
+          = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
+      MachineOperand *Src1Mods
+        = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);
 
-    unsigned Reg = MI->getOperand(1).getReg();
-    unsigned SubReg = MI->getOperand(1).getSubReg();
-    MI->getOperand(1).ChangeToImmediate(MI->getOperand(2).getImm());
-    MI->getOperand(2).ChangeToRegister(Reg, false);
-    MI->getOperand(2).setSubReg(SubReg);
+      int Src0ModsVal = Src0Mods->getImm();
+      if (!Src1Mods && Src0ModsVal != 0)
+        return nullptr;
+
+      // XXX - This assert might be a lie. It might be useful to have a neg
+      // modifier with 0.0.
+      int Src1ModsVal = Src1Mods->getImm();
+      assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");
+
+      Src1Mods->setImm(Src0ModsVal);
+      Src0Mods->setImm(Src1ModsVal);
+    }
+
+    unsigned Reg = Src0.getReg();
+    unsigned SubReg = Src0.getSubReg();
+    if (Src1.isImm())
+      Src0.ChangeToImmediate(Src1.getImm());
+    else if (Src1.isFPImm())
+      Src0.ChangeToFPImmediate(Src1.getFPImm());
+    else
+      llvm_unreachable("Should only have immediates");
+
+    Src1.ChangeToRegister(Reg, false);
+    Src1.setSubReg(SubReg);
   } else {
     MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
   }
@@ -407,6 +759,44 @@
   return MI;
 }
 
+// Implemented here because the source modifiers may be inserted between the
+// true commutable operands; the base TargetInstrInfo::commuteInstruction
+// consults this hook. On success the indices of the named src0 and src1
+// operands are written to SrcOpIdx1 and SrcOpIdx2.
+bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
+                                        unsigned &SrcOpIdx1,
+                                        unsigned &SrcOpIdx2) const {
+  // Only instructions whose descriptor is marked commutable qualify.
+  if (!MI->getDesc().isCommutable())
+    return false;
+
+  const unsigned Opcode = MI->getOpcode();
+
+  // FIXME: The isReg() test on src0 works around
+  // TargetInstrInfo::commuteInstruction asserting on an immediate.
+  const int Idx0 = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+  if (Idx0 == -1 || !MI->getOperand(Idx0).isReg())
+    return false;
+
+  // src1 must be present and must be a register as well.
+  const int Idx1 = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+  if (Idx1 == -1 || !MI->getOperand(Idx1).isReg())
+    return false;
+
+  // If any source modifiers are set, the generic instruction commuting won't
+  // understand how to copy them, so refuse.
+  const bool AnyModifierSet =
+      hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
+      hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers);
+  if (AnyModifierSet)
+    return false;
+
+  // Report the named source operands as the commutable pair.
+  SrcOpIdx1 = Idx0;
+  SrcOpIdx2 = Idx1;
+  return true;
+}
+
 MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
                                          MachineBasicBlock::iterator I,
                                          unsigned DstReg,
@@ -443,10 +833,92 @@
   }
 }
 
+// True when the access at the lower offset ends at or before the higher one.
+static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
+                                int WidthB, int OffsetB) {
+  if (OffsetA <= OffsetB) // Equal offsets are judged using WidthA.
+    return OffsetA + WidthA <= OffsetB;
+  return OffsetB + WidthB <= OffsetA;
+}
+
+// True only if getLdStBaseRegImmOfs succeeds for both instructions, the base
+// registers match, and offsetsDoNotOverlap accepts the offsets and sizes.
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
+                                               MachineInstr *MIb) const {
+  unsigned BaseA, OffA;
+  unsigned BaseB, OffB;
+
+  // Short-circuits: MIb is only queried when the query on MIa succeeded.
+  if (!getLdStBaseRegImmOfs(MIa, BaseA, OffA, &RI) ||
+      !getLdStBaseRegImmOfs(MIb, BaseB, OffB, &RI))
+    return false;
+
+  assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
+         "read2 / write2 not expected here yet");
+  unsigned SizeA = (*MIa->memoperands_begin())->getSize();
+  unsigned SizeB = (*MIb->memoperands_begin())->getSize();
+
+  return BaseA == BaseB && offsetsDoNotOverlap(SizeA, OffA, SizeB, OffB);
+}
+
+// Conservatively reports whether two memory instructions are trivially
+// disjoint; returns false whenever that cannot be shown.
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+                                                  MachineInstr *MIb,
+                                                  AliasAnalysis *AA) const {
+  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+         "MIa must load from or modify a memory location");
+  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+         "MIb must load from or modify a memory location");
+
+  // Only read the opcodes once the pointers have been checked above.
+  unsigned Opc0 = MIa->getOpcode();
+  unsigned Opc1 = MIb->getOpcode();
+
+  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
+    return false;
+
+  // XXX - Can we relax this between address spaces?
+  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+    return false;
+
+  // TODO: Should we check the address space from the MachineMemOperand? That
+  // would allow us to distinguish objects we know don't alias based on the
+  // underlying address space, even if it was lowered to a different one,
+  // e.g. private accesses lowered to use MUBUF instructions on a scratch
+  // buffer.
+  if (isDS(Opc0)) {
+    if (isDS(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(Opc1);
+  }
+
+  if (isMUBUF(Opc0) || isMTBUF(Opc0)) {
+    if (isMUBUF(Opc1) || isMTBUF(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+    return !isFLAT(Opc1) && !isSMRD(Opc1);
+  }
+
+  if (isSMRD(Opc0)) {
+    if (isSMRD(Opc1))
+      return checkInstOffsetsDoNotOverlap(MIa, MIb);
+    // Test the other instruction (Opc1); Opc0 is already known to be SMRD.
+    return !isFLAT(Opc1) && !isMUBUF(Opc1) && !isMTBUF(Opc1);
+  }
+
+  // A FLAT access can only be separated from another FLAT access, by offset.
+  if (isFLAT(Opc0) && isFLAT(Opc1))
+    return checkInstOffsetsDoNotOverlap(MIa, MIb);
+
+  return false;
+}
+
 namespace llvm {
 namespace AMDGPU {
 // Helper function generated by tablegen.  We are wrapping this with
-// an SIInstrInfo function that reutrns bool rather than int.
+// an SIInstrInfo function that returns bool rather than int.
 int isDS(uint16_t Opcode);
 }
 }
@@ -455,14 +927,26 @@
   return ::AMDGPU::isDS(Opcode) != -1;
 }
 
-int SIInstrInfo::isMIMG(uint16_t Opcode) const {
+bool SIInstrInfo::isMIMG(uint16_t Opcode) const {
   return get(Opcode).TSFlags & SIInstrFlags::MIMG;
 }
 
-int SIInstrInfo::isSMRD(uint16_t Opcode) const {
+bool SIInstrInfo::isSMRD(uint16_t Opcode) const {
   return get(Opcode).TSFlags & SIInstrFlags::SMRD;
 }
 
+bool SIInstrInfo::isMUBUF(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::MUBUF;
+}
+
+bool SIInstrInfo::isMTBUF(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::MTBUF;
+}
+
+bool SIInstrInfo::isFLAT(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::FLAT;
+}
+
 bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
   return get(Opcode).TSFlags & SIInstrFlags::VOP1;
 }
@@ -541,9 +1025,99 @@
   }
 }
 
+// Decides whether the immediate-like operand MO may occupy operand OpNo of MI.
+bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
+                                 const MachineOperand &MO) const {
+  assert(MO.isImm() || MO.isFPImm() || MO.isTargetIndex() || MO.isFI());
+  const MCOperandInfo &Info = get(MI->getOpcode()).OpInfo[OpNo];
+  // A slot declared as an immediate accepts any of these operand kinds.
+  if (Info.OperandType == MCOI::OPERAND_IMMEDIATE)
+    return true;
+
+  // Otherwise the slot needs a register class to judge the constant against.
+  if (Info.RegClass < 0)
+    return false;
+
+  return isLiteralConstant(MO)
+             ? RI.regClassCanUseLiteralConstant(Info.RegClass)
+             : RI.regClassCanUseInlineConstant(Info.RegClass);
+}
+
+// Whether OffsetSize fits the offset encoding noted for address space AS.
+bool SIInstrInfo::canFoldOffset(unsigned OffsetSize, unsigned AS) {
+  switch (AS) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+    // MUBUF instructions have a 12-bit offset in bytes.
+    return isUInt<12>(OffsetSize);
+
+  case AMDGPUAS::CONSTANT_ADDRESS:
+    // SMRD instructions have an 8-bit offset in dwords.
+    return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
+
+  case AMDGPUAS::LOCAL_ADDRESS:
+  case AMDGPUAS::REGION_ADDRESS:
+    // The single offset versions have a 16-bit offset in bytes.
+    return isUInt<16>(OffsetSize);
+
+  default:
+    // Covers PRIVATE_ADDRESS: indirect register addressing uses no offsets.
+    return false;
+  }
+}
+
+bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
+  return AMDGPU::getVOPe32(Opcode) != -1;
+}
+
+bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
+  // The src0_modifier operand is present on all instructions
+  // that have modifiers.
+
+  return AMDGPU::getNamedOperandIdx(Opcode,
+                                    AMDGPU::OpName::src0_modifiers) != -1;
+}
+
+bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
+                                  unsigned OpName) const {
+  const MachineOperand *Mods = getNamedOperand(MI, OpName);
+  return Mods && Mods->getImm();
+}
+
+// Tells whether operand MO counts as a use of the constant bus: literal
+// constants, SGPR-class virtual registers, and certain physical registers.
+bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
+                                  const MachineOperand &MO) const {
+  // Literal constants use the constant bus.
+  if (isLiteralConstant(MO))
+    return true;
+
+  // Beyond literals, only register uses are considered.
+  if (!(MO.isReg() && MO.isUse()))
+    return false;
+
+  // Virtual registers are judged by their register class.
+  const unsigned Reg = MO.getReg();
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return RI.isSGPRClass(MRI.getRegClass(Reg));
+
+  // M0 and VCC count even when they are implicit operands.
+  if (Reg == AMDGPU::M0 || Reg == AMDGPU::VCC)
+    return true;
+
+  // Every remaining case applies to explicit operands only.
+  if (MO.isImplicit())
+    return false;
+
+  // FLAT_SCR (just an SGPR pair), EXEC, and the 32/64-bit SGPRs.
+  return Reg == AMDGPU::FLAT_SCR || Reg == AMDGPU::EXEC ||
+         AMDGPU::SGPR_32RegClass.contains(Reg) ||
+         AMDGPU::SGPR_64RegClass.contains(Reg);
+}
+
 bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
                                     StringRef &ErrInfo) const {
   uint16_t Opcode = MI->getOpcode();
+  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
@@ -557,19 +1131,22 @@
   }
 
   // Make sure the register classes are correct
-  for (unsigned i = 0, e = Desc.getNumOperands(); i != e; ++i) {
+  for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
     switch (Desc.OpInfo[i].OperandType) {
     case MCOI::OPERAND_REGISTER: {
-      int RegClass = Desc.OpInfo[i].RegClass;
-      if (!RI.regClassCanUseImmediate(RegClass) &&
-          (MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm())) {
-        ErrInfo = "Expected register, but got immediate";
-        return false;
+      if ((MI->getOperand(i).isImm() || MI->getOperand(i).isFPImm()) &&
+          !isImmOperandLegal(MI, i, MI->getOperand(i))) {
+          ErrInfo = "Illegal immediate value for operand.";
+          return false;
+        }
       }
-    }
       break;
     case MCOI::OPERAND_IMMEDIATE:
-      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm()) {
+      // Check if this operand is an immediate.
+      // FrameIndex operands will be replaced by immediates, so they are
+      // allowed.
+      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFPImm() &&
+          !MI->getOperand(i).isFI()) {
         ErrInfo = "Expected immediate, but got non-immediate";
         return false;
       }
@@ -602,27 +1179,15 @@
     unsigned SGPRUsed = AMDGPU::NoRegister;
     for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
       const MachineOperand &MO = MI->getOperand(i);
-      if (MO.isReg() && MO.isUse() &&
-          !TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
-
-        // EXEC register uses the constant bus.
-        if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
-          ++ConstantBusCount;
-
-        // SGPRs use the constant bus
-        if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
-            (!MO.isImplicit() &&
-            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
-            AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
-          if (SGPRUsed != MO.getReg()) {
+      if (usesConstantBus(MRI, MO)) {
+        if (MO.isReg()) {
+          if (MO.getReg() != SGPRUsed)
             ++ConstantBusCount;
-            SGPRUsed = MO.getReg();
-          }
+          SGPRUsed = MO.getReg();
+        } else {
+          ++ConstantBusCount;
         }
       }
-      // Literal constants use the constant bus.
-      if (isLiteralConstant(MO))
-        ++ConstantBusCount;
     }
     if (ConstantBusCount > 1) {
       ErrInfo = "VOP* instruction uses the constant bus more than once";
@@ -658,11 +1223,9 @@
   // Verify misc. restrictions on specific instructions.
   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
-    MI->dump();
-
-    const MachineOperand &Src0 = MI->getOperand(2);
-    const MachineOperand &Src1 = MI->getOperand(3);
-    const MachineOperand &Src2 = MI->getOperand(4);
+    const MachineOperand &Src0 = MI->getOperand(Src0Idx);
+    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
+    const MachineOperand &Src2 = MI->getOperand(Src2Idx);
     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
       if (!compareMachineOp(Src0, Src1) &&
           !compareMachineOp(Src0, Src2)) {
@@ -685,10 +1248,13 @@
   case AMDGPU::S_MOV_B32:
     return MI.getOperand(1).isReg() ?
            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
-  case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32;
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
   case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
-  case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32;
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
+  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
@@ -757,21 +1323,28 @@
 
 void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
   MachineBasicBlock::iterator I = MI;
+  MachineBasicBlock *MBB = MI->getParent();
   MachineOperand &MO = MI->getOperand(OpIdx);
-  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
   const TargetRegisterClass *RC = RI.getRegClass(RCID);
   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
-  if (MO.isReg()) {
+  if (MO.isReg())
     Opcode = AMDGPU::COPY;
-  } else if (RI.isSGPRClass(RC)) {
+  else if (RI.isSGPRClass(RC))
     Opcode = AMDGPU::S_MOV_B32;
-  }
+
 
   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
+  if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
+    VRC = &AMDGPU::VReg_64RegClass;
+  else
+    VRC = &AMDGPU::VReg_32RegClass;
+
   unsigned Reg = MRI.createVirtualRegister(VRC);
-  BuildMI(*MI->getParent(), I, MI->getParent()->findDebugLoc(I), get(Opcode),
-          Reg).addOperand(MO);
+  DebugLoc DL = MBB->findDebugLoc(I);
+  BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
+    .addOperand(MO);
   MO.ChangeToRegister(Reg, false);
 }
 
@@ -791,13 +1364,15 @@
   // value so we don't need to worry about merging its subreg index with the
   // SubIdx passed to this function. The register coalescer should be able to
   // eliminate this extra copy.
-  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
-          NewSuperReg)
-          .addOperand(SuperReg);
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
 
-  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
-          SubReg)
-          .addReg(NewSuperReg, 0, SubIdx);
+  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
+    .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
+
+  BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
+    .addReg(NewSuperReg, 0, SubIdx);
+
   return SubReg;
 }
 
@@ -853,8 +1428,59 @@
   return Dst;
 }
 
+// Checks whether MO would be a legal operand at position OpIdx of MI. A null
+// MO means "check the operand that is currently at OpIdx". The check covers
+// the constant bus, the register class, and immediate legality.
+bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+                                 const MachineOperand *MO) const {
+  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpIdx];
+  const TargetRegisterClass *DefinedRC =
+      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
+  // Fall back to the operand already in place when none is supplied.
+  const MachineOperand &Cand = MO ? *MO : MI->getOperand(OpIdx);
+
+  // A constant-bus candidate is rejected if any other operand brings a
+  // different register onto the bus.
+  if (usesConstantBus(MRI, Cand)) {
+    const unsigned BusReg =
+        Cand.isReg() ? Cand.getReg() : (unsigned)AMDGPU::NoRegister;
+    for (unsigned Idx = 0, End = MI->getNumOperands(); Idx != End; ++Idx) {
+      if (Idx == OpIdx)
+        continue;
+      const MachineOperand &Other = MI->getOperand(Idx);
+      if (usesConstantBus(MRI, Other) && Other.isReg() &&
+          Other.getReg() != BusReg)
+        return false;
+    }
+  }
+
+  if (Cand.isReg()) {
+    assert(DefinedRC);
+    const TargetRegisterClass *RC = MRI.getRegClass(Cand.getReg());
+
+    // In order to be legal, the common sub-class must be equal to the
+    // class of the current operand.  For example:
+    //
+    // v_mov_b32 s0 ; Operand defined as vsrc_32
+    //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
+    //
+    // s_sendmsg 0, s0 ; Operand defined as m0reg
+    //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+    return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
+  }
+
+  // Handle non-register types that are treated like immediates.
+  assert(Cand.isImm() || Cand.isFPImm() || Cand.isTargetIndex() ||
+         Cand.isFI());
+
+  // Without a register class the operand simply expects an immediate.
+  return !DefinedRC || isImmOperandLegal(MI, OpIdx, Cand);
+}
+
 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
   int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::src0);
   int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
@@ -864,45 +1490,40 @@
 
   // Legalize VOP2
   if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
-    MachineOperand &Src0 = MI->getOperand(Src0Idx);
-    MachineOperand &Src1 = MI->getOperand(Src1Idx);
-
-    // If the instruction implicitly reads VCC, we can't have any SGPR operands,
-    // so move any.
-    bool ReadsVCC = MI->readsRegister(AMDGPU::VCC, &RI);
-    if (ReadsVCC && Src0.isReg() &&
-        RI.isSGPRClass(MRI.getRegClass(Src0.getReg()))) {
+    // Legalize src0
+    if (!isOperandLegal(MI, Src0Idx))
       legalizeOpWithMove(MI, Src0Idx);
+
+    // Legalize src1
+    if (isOperandLegal(MI, Src1Idx))
       return;
+
+    // Usually src0 of VOP2 instructions allow more types of inputs
+    // than src1, so try to commute the instruction to decrease our
+    // chances of having to insert a MOV instruction to legalize src1.
+    if (MI->isCommutable()) {
+      if (commuteInstruction(MI))
+        // If we are successful in commuting, then we know MI is legal, so
+        // we are done.
+        return;
     }
 
-    if (ReadsVCC && Src1.isReg() &&
-        RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
-      legalizeOpWithMove(MI, Src1Idx);
-      return;
-    }
-
-    // Legalize VOP2 instructions where src1 is not a VGPR. An SGPR input must
-    // be the first operand, and there can only be one.
-    if (Src1.isImm() || Src1.isFPImm() ||
-        (Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())))) {
-      if (MI->isCommutable()) {
-        if (commuteInstruction(MI))
-          return;
-      }
-      legalizeOpWithMove(MI, Src1Idx);
-    }
+    legalizeOpWithMove(MI, Src1Idx);
+    return;
   }
 
   // XXX - Do any VOP3 instructions read VCC?
   // Legalize VOP3
   if (isVOP3(MI->getOpcode())) {
-    int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx};
-    unsigned SGPRReg = AMDGPU::NoRegister;
+    int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx };
+
+    // Find the one SGPR operand we are allowed to use.
+    unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
+
     for (unsigned i = 0; i < 3; ++i) {
       int Idx = VOP3Idx[i];
       if (Idx == -1)
-        continue;
+        break;
       MachineOperand &MO = MI->getOperand(Idx);
 
       if (MO.isReg()) {
@@ -1002,108 +1623,214 @@
   // Legalize MUBUF* instructions
   // FIXME: If we start using the non-addr64 instructions for compute, we
   // may need to legalize them here.
+  int SRsrcIdx =
+      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
+  if (SRsrcIdx != -1) {
+    // We have an MUBUF instruction
+    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
+    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
+    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
+                                             RI.getRegClass(SRsrcRC))) {
+      // The operands are legal.
+      // FIXME: We may need to legalize operands besides srsrc.
+      return;
+    }
 
-  int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                            AMDGPU::OpName::srsrc);
-  int VAddrIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                             AMDGPU::OpName::vaddr);
-  if (SRsrcIdx != -1 && VAddrIdx != -1) {
-    const TargetRegisterClass *VAddrRC =
-        RI.getRegClass(get(MI->getOpcode()).OpInfo[VAddrIdx].RegClass);
+    MachineBasicBlock &MBB = *MI->getParent();
+    // Extract the ptr from the resource descriptor.
 
-    if(VAddrRC->getSize() == 8 &&
-       MRI.getRegClass(MI->getOperand(SRsrcIdx).getReg()) != VAddrRC) {
-      // We have a MUBUF instruction that uses a 64-bit vaddr register and
-      // srsrc has the incorrect register class.  In order to fix this, we
-      // need to extract the pointer from the resource descriptor (srsrc),
-      // add it to the value of vadd,  then store the result in the vaddr
-      // operand.  Then, we need to set the pointer field of the resource
-      // descriptor to zero.
+    // SRsrcPtrLo = srsrc:sub0
+    unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
+        &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
 
-      MachineBasicBlock &MBB = *MI->getParent();
-      MachineOperand &SRsrcOp = MI->getOperand(SRsrcIdx);
-      MachineOperand &VAddrOp = MI->getOperand(VAddrIdx);
-      unsigned SRsrcPtrLo, SRsrcPtrHi, VAddrLo, VAddrHi;
-      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
-      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
-      unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-      unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-      unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-      unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-      unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+    // SRsrcPtrHi = srsrc:sub1
+    unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
+        &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
 
-      // SRsrcPtrLo = srsrc:sub0
-      SRsrcPtrLo = buildExtractSubReg(MI, MRI, SRsrcOp,
-          &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+    // Create an empty resource descriptor
+    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
 
-      // SRsrcPtrHi = srsrc:sub1
-      SRsrcPtrHi = buildExtractSubReg(MI, MRI, SRsrcOp,
-          &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+    // Zero64 = 0
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
+            Zero64)
+            .addImm(0);
 
-      // VAddrLo = vaddr:sub0
-      VAddrLo = buildExtractSubReg(MI, MRI, VAddrOp,
-          &AMDGPU::VReg_64RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+            SRsrcFormatLo)
+            .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
 
-      // VAddrHi = vaddr:sub1
-      VAddrHi = buildExtractSubReg(MI, MRI, VAddrOp,
-          &AMDGPU::VReg_64RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+            SRsrcFormatHi)
+            .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
 
-      // NewVaddrLo = SRsrcPtrLo + VAddrLo
+    // NewSRsrc = {Zero64, SRsrcFormat}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+            NewSRsrc)
+            .addReg(Zero64)
+            .addImm(AMDGPU::sub0_sub1)
+            .addReg(SRsrcFormatLo)
+            .addImm(AMDGPU::sub2)
+            .addReg(SRsrcFormatHi)
+            .addImm(AMDGPU::sub3);
+
+    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
+    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+    unsigned NewVAddrLo;
+    unsigned NewVAddrHi;
+    if (VAddr) {
+      // This is already an ADDR64 instruction so we need to add the pointer
+      // extracted from the resource descriptor to the current value of VAddr.
+      NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+      NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+
+      // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
               NewVAddrLo)
               .addReg(SRsrcPtrLo)
-              .addReg(VAddrLo)
-              .addReg(AMDGPU::VCC, RegState::Define | RegState::Implicit);
+              .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
+              .addReg(AMDGPU::VCC, RegState::ImplicitDefine);
 
-      // NewVaddrHi = SRsrcPtrHi + VAddrHi
+      // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
               NewVAddrHi)
               .addReg(SRsrcPtrHi)
-              .addReg(VAddrHi)
+              .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
               .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
               .addReg(AMDGPU::VCC, RegState::Implicit);
 
-      // NewVaddr = {NewVaddrHi, NewVaddrLo}
-      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-              NewVAddr)
-              .addReg(NewVAddrLo)
-              .addImm(AMDGPU::sub0)
-              .addReg(NewVAddrHi)
-              .addImm(AMDGPU::sub1);
+    } else {
+      // This instruction is the _OFFSET variant, so we need to convert it to
+      // ADDR64.
+      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
+      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
+      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
+      assert(SOffset->isImm() && SOffset->getImm() == 0 && "Legalizing MUBUF "
+             "with non-zero soffset is not implemented");
+      (void)SOffset;
 
-      // Zero64 = 0
-      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
-              Zero64)
-              .addImm(0);
+      // Create the new instruction.
+      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
+      MachineInstr *Addr64 =
+          BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
+                  .addOperand(*VData)
+                  .addOperand(*SRsrc)
+                  .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
+                                              // This will be replaced later
+                                              // with the new value of vaddr.
+                  .addOperand(*Offset);
 
-      // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
-      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-              SRsrcFormatLo)
-              .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+      MI->removeFromParent();
+      MI = Addr64;
 
-      // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
-      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
-              SRsrcFormatHi)
-              .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
-
-      // NewSRsrc = {Zero64, SRsrcFormat}
-      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-              NewSRsrc)
-              .addReg(Zero64)
-              .addImm(AMDGPU::sub0_sub1)
-              .addReg(SRsrcFormatLo)
-              .addImm(AMDGPU::sub2)
-              .addReg(SRsrcFormatHi)
-              .addImm(AMDGPU::sub3);
-
-      // Update the instruction to use NewVaddr
-      MI->getOperand(VAddrIdx).setReg(NewVAddr);
-      // Update the instruction to use NewSRsrc
-      MI->getOperand(SRsrcIdx).setReg(NewSRsrc);
+      NewVAddrLo = SRsrcPtrLo;
+      NewVAddrHi = SRsrcPtrHi;
+      VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
+      SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
     }
+
+    // NewVaddr = {NewVaddrHi, NewVaddrLo}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+            NewVAddr)
+            .addReg(NewVAddrLo)
+            .addImm(AMDGPU::sub0)
+            .addReg(NewVAddrHi)
+            .addImm(AMDGPU::sub1);
+
+
+    // Update the instruction to use NewVaddr
+    VAddr->setReg(NewVAddr);
+    // Update the instruction to use NewSRsrc
+    SRsrc->setReg(NewSRsrc);
   }
 }
 
+void SIInstrInfo::splitSMRD(MachineInstr *MI,
+                            const TargetRegisterClass *HalfRC,
+                            unsigned HalfImmOp, unsigned HalfSGPROp,
+                            MachineInstr *&Lo, MachineInstr *&Hi) const {
+  // Emit half-size loads Lo/Hi before MI, then REG_SEQUENCE them into operand 0.
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  unsigned RegLo = MRI.createVirtualRegister(HalfRC);
+  unsigned RegHi = MRI.createVirtualRegister(HalfRC);
+  unsigned HalfSize = HalfRC->getSize();
+  const MachineOperand *OffOp =
+      getNamedOperand(*MI, AMDGPU::OpName::offset);
+  const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
+
+  if (OffOp) {
+    // Handle the _IMM variant
+    unsigned LoOffset = OffOp->getImm();
+    unsigned HiOffset = LoOffset + (HalfSize / 4);
+    Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
+                  .addOperand(*SBase)
+                  .addImm(LoOffset);
+
+    if (!isUInt<8>(HiOffset)) {
+      unsigned OffsetSGPR =
+          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
+              .addImm(HiOffset << 2);  // The immediate offset is in dwords,
+                                       // but offset in register is in bytes.
+      Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
+                    .addOperand(*SBase)
+                    .addReg(OffsetSGPR);
+    } else {
+      Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
+                     .addOperand(*SBase)
+                     .addImm(HiOffset);
+    }
+  } else {
+    // Handle the _SGPR variant
+    MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
+    Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
+                  .addOperand(*SBase)
+                  .addOperand(*SOff);
+    unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
+            .addOperand(*SOff)
+            .addImm(HalfSize);
+    Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
+                  .addOperand(*SBase)
+                  .addReg(OffsetSGPR);
+  }
+  // Subregister indices that place each half inside the full-width result.
+  unsigned SubLo, SubHi;
+  switch (HalfSize) {
+    case 4:
+      SubLo = AMDGPU::sub0;
+      SubHi = AMDGPU::sub1;
+      break;
+    case 8:
+      SubLo = AMDGPU::sub0_sub1;
+      SubHi = AMDGPU::sub2_sub3;
+      break;
+    case 16:
+      SubLo = AMDGPU::sub0_sub1_sub2_sub3;
+      SubHi = AMDGPU::sub4_sub5_sub6_sub7;
+      break;
+    case 32:
+      SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
+      SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
+      break;
+    default:
+      llvm_unreachable("Unhandled HalfSize");
+  }
+
+  BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE))
+          .addOperand(MI->getOperand(0))
+          .addReg(RegLo)
+          .addImm(SubLo)
+          .addReg(RegHi)
+          .addImm(SubHi);
+}
+
 void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
   MachineBasicBlock *MBB = MI->getParent();
   switch (MI->getOpcode()) {
@@ -1112,7 +1839,7 @@
     case AMDGPU::S_LOAD_DWORDX2_IMM:
     case AMDGPU::S_LOAD_DWORDX2_SGPR:
     case AMDGPU::S_LOAD_DWORDX4_IMM:
-    case AMDGPU::S_LOAD_DWORDX4_SGPR:
+    case AMDGPU::S_LOAD_DWORDX4_SGPR: {
       unsigned NewOpcode = getVALUOp(*MI);
       unsigned RegOffset;
       unsigned ImmOffset;
@@ -1159,14 +1886,44 @@
               .addImm(AMDGPU::sub2)
               .addReg(DWord3)
               .addImm(AMDGPU::sub3);
-     MI->setDesc(get(NewOpcode));
-     if (MI->getOperand(2).isReg()) {
-       MI->getOperand(2).setReg(MI->getOperand(1).getReg());
-     } else {
-       MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false);
-     }
-     MI->getOperand(1).setReg(SRsrc);
-     MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
+      MI->setDesc(get(NewOpcode));
+      if (MI->getOperand(2).isReg()) {
+        MI->getOperand(2).setReg(MI->getOperand(1).getReg());
+      } else {
+        MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false);
+      }
+      MI->getOperand(1).setReg(SRsrc);
+      MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
+
+      const TargetRegisterClass *NewDstRC =
+          RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass);
+
+      unsigned DstReg = MI->getOperand(0).getReg();
+      unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
+      MRI.replaceRegWith(DstReg, NewDstReg);
+      break;
+    }
+    case AMDGPU::S_LOAD_DWORDX8_IMM:
+    case AMDGPU::S_LOAD_DWORDX8_SGPR: {
+      MachineInstr *Lo, *Hi;
+      splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
+                AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
+      MI->eraseFromParent();
+      moveSMRDToVALU(Lo, MRI);
+      moveSMRDToVALU(Hi, MRI);
+      break;
+    }
+
+    case AMDGPU::S_LOAD_DWORDX16_IMM:
+    case AMDGPU::S_LOAD_DWORDX16_SGPR: {
+      MachineInstr *Lo, *Hi;
+      splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
+                AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
+      MI->eraseFromParent();
+      moveSMRDToVALU(Lo, MRI);
+      moveSMRDToVALU(Hi, MRI);
+      break;
+    }
   }
 }
 
@@ -1238,8 +1995,13 @@
       Inst->eraseFromParent();
       continue;
 
+    case AMDGPU::S_BFE_I64: {
+      splitScalar64BitBFE(Worklist, Inst);
+      Inst->eraseFromParent();
+      continue;
+    }
+
     case AMDGPU::S_BFE_U64:
-    case AMDGPU::S_BFE_I64:
     case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
     }
@@ -1268,17 +2030,9 @@
       // We are converting these to a BFE, so we need to add the missing
       // operands for the size and offset.
       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
-      Inst->addOperand(Inst->getOperand(1));
-      Inst->getOperand(1).ChangeToImmediate(0);
-      Inst->addOperand(MachineOperand::CreateImm(0));
-      Inst->addOperand(MachineOperand::CreateImm(0));
       Inst->addOperand(MachineOperand::CreateImm(0));
       Inst->addOperand(MachineOperand::CreateImm(Size));
 
-      // XXX - Other pointless operands. There are 4, but it seems you only need
-      // 3 to not hit an assertion later in MCInstLower.
-      Inst->addOperand(MachineOperand::CreateImm(0));
-      Inst->addOperand(MachineOperand::CreateImm(0));
     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
       // The VALU version adds the second operand to the result, so insert an
       // extra 0 operand.
@@ -1297,16 +2051,9 @@
 
       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
-
       Inst->RemoveOperand(2); // Remove old immediate.
-      Inst->addOperand(Inst->getOperand(1));
-      Inst->getOperand(1).ChangeToImmediate(0);
-      Inst->addOperand(MachineOperand::CreateImm(0));
       Inst->addOperand(MachineOperand::CreateImm(Offset));
-      Inst->addOperand(MachineOperand::CreateImm(0));
       Inst->addOperand(MachineOperand::CreateImm(BitWidth));
-      Inst->addOperand(MachineOperand::CreateImm(0));
-      Inst->addOperand(MachineOperand::CreateImm(0));
     }
 
     // Update the destination register class.
@@ -1519,6 +2266,67 @@
   Worklist.push_back(Second);
 }
 
+void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+                                      MachineInstr *Inst) const {
+  MachineBasicBlock &MBB = *Inst->getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  DebugLoc DL = Inst->getDebugLoc();
+
+  MachineOperand &Dest = Inst->getOperand(0);
+  uint32_t Imm = Inst->getOperand(2).getImm();
+  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+  (void) Offset;
+
+  // Only sext_inreg cases handled.
+  assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
+         BitWidth <= 32 &&
+         Offset == 0 &&
+         "Not implemented");
+
+  if (BitWidth < 32) {
+    unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
+      .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
+      .addImm(0)
+      .addImm(BitWidth);
+
+    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
+      .addImm(31)
+      .addReg(MidRegLo);
+
+    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+      .addReg(MidRegLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(MidRegHi)
+      .addImm(AMDGPU::sub1);
+
+    MRI.replaceRegWith(Dest.getReg(), ResultReg);
+    return;
+  }
+
+  MachineOperand &Src = Inst->getOperand(1);
+  unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
+    .addImm(31)
+    .addReg(Src.getReg(), 0, AMDGPU::sub0);
+
+  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
+    .addReg(Src.getReg(), 0, AMDGPU::sub0)
+    .addImm(AMDGPU::sub0)
+    .addReg(TmpReg)
+    .addImm(AMDGPU::sub1);
+
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+}
+
 void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
                                         MachineInstr *Inst) const {
   // Add the implict and explicit register definitions.
@@ -1537,6 +2345,74 @@
   }
 }
 
+unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
+                                   int OpIndices[3]) const {
+  const MCInstrDesc &Desc = get(MI->getOpcode());
+
+  // Find the one SGPR operand we are allowed to use.
+  unsigned SGPRReg = AMDGPU::NoRegister;
+
+  // First we need to consider the instruction's operand requirements before
+  // legalizing. Some operands are required to be SGPRs, such as implicit uses
+  // of VCC, but we are still bound by the constant bus requirement to only use
+  // one.
+  //
+  // If the operand's class is an SGPR, we can never move it.
+
+  for (const MachineOperand &MO : MI->implicit_operands()) {
+    // We only care about reads.
+    if (MO.isDef())
+      continue;
+
+    if (MO.getReg() == AMDGPU::VCC)
+      return AMDGPU::VCC;
+
+    if (MO.getReg() == AMDGPU::FLAT_SCR)
+      return AMDGPU::FLAT_SCR;
+  }
+
+  unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
+  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+  for (unsigned i = 0; i < 3; ++i) {
+    int Idx = OpIndices[i];
+    if (Idx == -1)
+      break;
+
+    const MachineOperand &MO = MI->getOperand(Idx);
+    if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass))
+      SGPRReg = MO.getReg();
+
+    if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+      UsedSGPRs[i] = MO.getReg();
+  }
+
+  if (SGPRReg != AMDGPU::NoRegister)
+    return SGPRReg;
+
+  // We don't have a required SGPR operand, so we have a bit more freedom in
+  // selecting operands to move.
+
+  // Try to select the most used SGPR. If an SGPR is equal to one of the
+  // others, we choose that.
+  //
+  // e.g.
+  // V_FMA_F32 v0, s0, s0, s0 -> No moves
+  // V_FMA_F32 v0, s0, s1, s0 -> Move s1
+
+  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
+    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
+      SGPRReg = UsedSGPRs[0];
+  }
+
+  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
+    if (UsedSGPRs[1] == UsedSGPRs[2])
+      SGPRReg = UsedSGPRs[1];
+  }
+
+  return SGPRReg;
+}
+
 MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
                                    MachineBasicBlock *MBB,
                                    MachineBasicBlock::iterator I,
@@ -1600,3 +2476,12 @@
   for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
     Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
 }
+
+MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
+                                             unsigned OperandName) const {
+  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
+  if (Idx == -1)
+    return nullptr;
+
+  return &MI.getOperand(Idx);
+}

diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 4c204d8..3bdbc9b 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h

@@ -13,8 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 
-#ifndef SIINSTRINFO_H
-#define SIINSTRINFO_H
+#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H
+#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H
 
 #include "AMDGPUInstrInfo.h"
 #include "SIRegisterInfo.h"
@@ -52,9 +52,16 @@
 
   void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
                             MachineInstr *Inst) const;
+  void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+                           MachineInstr *Inst) const;
 
   void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
 
+  bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
+                                    MachineInstr *MIb) const;
+
+  unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const;
+
 public:
   explicit SIInstrInfo(const AMDGPUSubtarget &st);
 
@@ -62,11 +69,30 @@
     return RI;
   }
 
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                               int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
+  bool getLdStBaseRegImmOfs(MachineInstr *LdSt,
+                            unsigned &BaseReg, unsigned &Offset,
+                            const TargetRegisterInfo *TRI) const final;
+
+  bool shouldClusterLoads(MachineInstr *FirstLdSt,
+                          MachineInstr *SecondLdSt,
+                          unsigned NumLoads) const final;
+
   void copyPhysReg(MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MI, DebugLoc DL,
                    unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
 
+  unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    RegScavenger *RS,
+                                    unsigned TmpReg,
+                                    unsigned Offset,
+                                    unsigned Size) const;
+
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
                            unsigned SrcReg, bool isKill, int FrameIndex,
@@ -79,19 +105,22 @@
                             const TargetRegisterClass *RC,
                             const TargetRegisterInfo *TRI) const override;
 
-  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 
   unsigned commuteOpcode(unsigned Opcode) const;
 
   MachineInstr *commuteInstruction(MachineInstr *MI,
-                                   bool NewMI=false) const override;
+                                   bool NewMI = false) const override;
+  bool findCommutedOpIndices(MachineInstr *MI,
+                             unsigned &SrcOpIdx1,
+                             unsigned &SrcOpIdx2) const override;
 
   bool isTriviallyReMaterializable(const MachineInstr *MI,
                                    AliasAnalysis *AA = nullptr) const;
 
-  unsigned getIEQOpcode() const override {
-    llvm_unreachable("Unimplemented");
-  }
+  bool areMemAccessesTriviallyDisjoint(
+    MachineInstr *MIa, MachineInstr *MIb,
+    AliasAnalysis *AA = nullptr) const override;
 
   MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
@@ -100,16 +129,42 @@
 
   bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
   bool isDS(uint16_t Opcode) const;
-  int isMIMG(uint16_t Opcode) const;
-  int isSMRD(uint16_t Opcode) const;
+  bool isMIMG(uint16_t Opcode) const;
+  bool isSMRD(uint16_t Opcode) const;
+  bool isMUBUF(uint16_t Opcode) const;
+  bool isMTBUF(uint16_t Opcode) const;
+  bool isFLAT(uint16_t Opcode) const;
   bool isVOP1(uint16_t Opcode) const;
   bool isVOP2(uint16_t Opcode) const;
   bool isVOP3(uint16_t Opcode) const;
   bool isVOPC(uint16_t Opcode) const;
+
   bool isInlineConstant(const APInt &Imm) const;
   bool isInlineConstant(const MachineOperand &MO) const;
   bool isLiteralConstant(const MachineOperand &MO) const;
 
+  bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
+                         const MachineOperand &MO) const;
+
+  /// \brief Return true if the given offset Size in bytes can be folded into
+  /// the immediate offsets of a memory instruction for the given address space.
+  static bool canFoldOffset(unsigned OffsetSize, unsigned AS) LLVM_READNONE;
+
+  /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding.
+  /// This function will return false if you pass it a 32-bit instruction.
+  bool hasVALU32BitEncoding(unsigned Opcode) const;
+
+  /// \brief Returns true if this operand uses the constant bus.
+  bool usesConstantBus(const MachineRegisterInfo &MRI,
+                       const MachineOperand &MO) const;
+
+  /// \brief Return true if this instruction has any modifiers.
+  ///  e.g. src[012]_mod, omod, clamp.
+  bool hasModifiers(unsigned Opcode) const;
+
+  bool hasModifiersSet(const MachineInstr &MI,
+                       unsigned OpName) const;
+
   bool verifyInstruction(const MachineInstr *MI,
                          StringRef &ErrInfo) const override;
 
@@ -141,10 +196,21 @@
   /// instead of MOV.
   void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
 
+  /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand
+  /// for \p MI.
+  bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+                      const MachineOperand *MO = nullptr) const;
+
   /// \brief Legalize all operands in this instruction.  This function may
   /// create new instruction and insert them before \p MI.
   void legalizeOperands(MachineInstr *MI) const;
 
+  /// \brief Split an SMRD instruction into two smaller loads of half the
+  /// size, storing the results in \p Lo and \p Hi.
+  void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
+                 unsigned HalfImmOp, unsigned HalfSGPROp,
+                 MachineInstr *&Lo, MachineInstr *&Hi) const;
+
   void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
 
   /// \brief Replace this instruction's opcode with the equivalent VALU
@@ -175,29 +241,52 @@
               unsigned SavReg, unsigned IndexReg) const;
 
   void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
+
+  /// \brief Returns the operand named \p OperandName.  If \p MI has no such
+  /// operand, this function returns nullptr.
+  MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const;
+
+  const MachineOperand *getNamedOperand(const MachineInstr &MI,
+                                        unsigned OpName) const {
+    return getNamedOperand(const_cast<MachineInstr &>(MI), OpName);
+  }
 };
 
 namespace AMDGPU {
 
   int getVOPe64(uint16_t Opcode);
+  int getVOPe32(uint16_t Opcode);
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
   int getMCOpcode(uint16_t Opcode, unsigned Gen);
+  int getAddr64Inst(uint16_t Opcode);
+  int getAtomicRetOp(uint16_t Opcode);
+  int getAtomicNoRetOp(uint16_t Opcode);
 
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
-
+  const uint64_t RSRC_TID_ENABLE = 1LL << 55;
 
 } // End namespace AMDGPU
 
+namespace SI {
+namespace KernelInputOffsets {
+
+/// Offsets in bytes from the start of the input buffer
+enum Offsets {
+  NGROUPS_X = 0,
+  NGROUPS_Y = 4,
+  NGROUPS_Z = 8,
+  GLOBAL_SIZE_X = 12,
+  GLOBAL_SIZE_Y = 16,
+  GLOBAL_SIZE_Z = 20,
+  LOCAL_SIZE_X = 24,
+  LOCAL_SIZE_Y = 28,
+  LOCAL_SIZE_Z = 32
+};
+
+} // End namespace KernelInputOffsets
+} // End namespace SI
+
 } // End namespace llvm
 
-namespace SIInstrFlags {
-  enum Flags {
-    // First 4 bits are the instruction encoding
-    VM_CNT = 1 << 0,
-    EXP_CNT = 1 << 1,
-    LGKM_CNT = 1 << 2
-  };
-}
-
-#endif //SIINSTRINFO_H
+#endif

diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 774c9d1..713e84e 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td

@@ -7,6 +7,32 @@
 //
 //===----------------------------------------------------------------------===//
 
+class vop {
+  field bits<9> SI3;
+}
+
+class vopc <bits<8> si> : vop {
+  field bits<8> SI = si;
+
+  field bits<9> SI3 = {0, si{7-0}};
+}
+
+class vop1 <bits<8> si> : vop {
+  field bits<8> SI  = si;
+
+  field bits<9> SI3 = {1, 1, si{6-0}};
+}
+
+class vop2 <bits<6> si> : vop {
+  field bits<6> SI = si;
+
+  field bits<9> SI3 = {1, 0, 0, si{5-0}};
+}
+
+class vop3 <bits<9> si> : vop {
+  field bits<9> SI3 = si;
+}
+
 // Execpt for the NONE field, this must be kept in sync with the SISubtarget enum
 // in AMDGPUMCInstLower.h
 def SISubtarget {
@@ -57,6 +83,10 @@
 def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
 def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
 
+def SIconstdata_ptr : SDNode<
+  "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
+>;
+
 // Transformation function, extract the lower 32bit of a 64bit immediate
 def LO32 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
@@ -132,7 +162,7 @@
     return false;
   }
   const SIRegisterInfo *SIRI =
-                       static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+                       static_cast<const SIRegisterInfo*>(TM.getSubtargetImpl()->getRegisterInfo());
   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
                                                 U != E; ++U) {
     if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
@@ -142,15 +172,81 @@
   return false;
 }]>;
 
+//===----------------------------------------------------------------------===//
+// Custom Operands
+//===----------------------------------------------------------------------===//
+
 def FRAMEri32 : Operand<iPTR> {
   let MIOperandInfo = (ops i32:$ptr, i32imm:$index);
 }
 
+def sopp_brtarget : Operand<OtherVT> {  // Operand of type OtherVT; NOTE(review): presumably a SOPP branch target, going by the name.
+  let EncoderMethod = "getSOPPBrEncoding";  // Names the method that encodes this operand.
+  let OperandType = "OPERAND_PCREL";  // Tagged as a PC-relative operand.
+}
+
+include "SIInstrFormats.td"
+
+let OperandType = "OPERAND_IMMEDIATE" in {  // Applies to every operand defined before the closing brace.
+
+def offen : Operand<i1> {  // i1 operand; like each def in this block it only names its own print method.
+  let PrintMethod = "printOffen";
+}
+def idxen : Operand<i1> {  // i1 operand.
+  let PrintMethod = "printIdxen";
+}
+def addr64 : Operand<i1> {  // i1 operand.
+  let PrintMethod = "printAddr64";
+}
+def mbuf_offset : Operand<i16> {  // i16 operand.
+  let PrintMethod = "printMBUFOffset";
+}
+def ds_offset : Operand<i16> {  // i16 operand; the DS load/store helpers use it for $offset.
+  let PrintMethod = "printDSOffset";
+}
+def ds_offset0 : Operand<i8> {  // i8 operand; the two-offset DS helpers use it for $offset0.
+  let PrintMethod = "printDSOffset0";
+}
+def ds_offset1 : Operand<i8> {  // i8 operand; the two-offset DS helpers use it for $offset1.
+  let PrintMethod = "printDSOffset1";
+}
+def glc : Operand <i1> {  // i1 operand.
+  let PrintMethod = "printGLC";
+}
+def slc : Operand <i1> {  // i1 operand.
+  let PrintMethod = "printSLC";
+}
+def tfe : Operand <i1> {  // i1 operand.
+  let PrintMethod = "printTFE";
+}
+
+def omod : Operand <i32> {  // i32 operand; bound to $omod in the VOP3 ins lists.
+  let PrintMethod = "printOModSI";
+}
+
+def ClampMod : Operand <i1> {  // i1 operand; bound to $clamp in the VOP3 ins lists.
+  let PrintMethod = "printClampSI";
+}
+
+} // End OperandType = "OPERAND_IMMEDIATE"
+
 //===----------------------------------------------------------------------===//
 // Complex patterns
 //===----------------------------------------------------------------------===//
 
+def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">;  // i32 value, 2 operands.
+def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;  // i32 value, 3 operands.
+
+def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;  // i64 value, 9 operands.
 def MUBUFAddr64 : ComplexPattern<i64, 3, "SelectMUBUFAddr64">;
+def MUBUFAddr64Atomic : ComplexPattern<i64, 4, "SelectMUBUFAddr64">;  // Same selector name as MUBUFAddr64 but 4 operands; NOTE(review): presumably an overload - confirm.
+def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;  // i64 value, 4 operands.
+def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;  // i64 value, 6 operands.
+def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;  // Same selector name as MUBUFOffset but 4 operands; NOTE(review): presumably an overload - confirm.
+
+def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;  // 4 operands: the patterns in this file bind src, src_modifiers, clamp, omod.
+def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;  // 3 operands.
+def VOP3Mods  : ComplexPattern<untyped, 2, "SelectVOP3Mods">;  // 2 operands: the patterns in this file bind src, src_modifiers.
 
 //===----------------------------------------------------------------------===//
 // SI assembler operands
@@ -159,9 +255,20 @@
 def SIOperand {
   int ZERO = 0x80;
   int VCC = 0x6A;
+  int FLAT_SCR = 0x68;
 }
 
-include "SIInstrFormats.td"
+def SRCMODS {  // Named constants; NOTE(review): presumably source-modifier values, going by the name.
+  int NONE = 0;
+}
+
+def DSTCLAMP {  // Named constants; NOTE(review): presumably values for the clamp operand - confirm at use sites.
+  int NONE = 0;
+}
+
+def DSTOMOD {  // Named constants; NOTE(review): presumably values for the omod operand - confirm at use sites.
+  int NONE = 0;
+}
 
 //===----------------------------------------------------------------------===//
 //
@@ -179,6 +286,35 @@
 //
 //===----------------------------------------------------------------------===//
 
+class SIMCInstr <string pseudo, int subtarget> {  // Tags an instruction with a pseudo name and a subtarget id.
+  string PseudoInstr = pseudo;  // The multiclasses here give a pseudo and its _si def the same name.
+  int Subtarget = subtarget;  // Visible users pass SISubtarget.NONE (pseudo) or SISubtarget.SI (_si def).
+}
+
+//===----------------------------------------------------------------------===//
+// EXP classes
+//===----------------------------------------------------------------------===//
+
+class EXPCommon : InstSI<  // Operands, asm string and flags shared by both defs in EXP_m.
+  (outs),  // No outputs.
+  (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
+       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),  // Five i32 immediates, then four VReg_32 sources.
+  "exp $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
+  [] > {  // No selection pattern.
+
+  let EXP_CNT = 1;
+  let Uses = [EXEC];  // Declares an implicit use of EXEC.
+}
+
+multiclass EXP_m {  // Emits a pseudo (no name suffix) and a "_si" def; both are tagged with pseudo name "exp".
+
+  let isPseudo = 1 in {
+    def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ;  // Pseudo: carries no EXPe encoding.
+  }
+
+  def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe;  // SI def: adds the EXPe class.
+}
+
 //===----------------------------------------------------------------------===//
 // Scalar classes
 //===----------------------------------------------------------------------===//
@@ -204,11 +340,21 @@
   opName#" $dst, $src0, $src1", pattern
 >;
 
+class SOP2_SELECT_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <  // SOP2 form with an extra SCCReg input.
+  op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc),  // $scc is an explicit operand.
+  opName#" $dst, $src0, $src1 [$scc]", pattern  // $scc is printed in square brackets after the sources.
+>;
+
 class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
   op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
   opName#" $dst, $src0, $src1", pattern
 >;
 
+class SOP2_64_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <  // SOP2 form: 64-bit dst and src0, 32-bit src1.
+  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),  // Same outs/ins as SOP2_SHIFT_64.
+  opName#" $dst, $src0, $src1", pattern
+>;
+
 class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
   op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
   opName#" $dst, $src0, $src1", pattern
@@ -227,27 +373,52 @@
   : SOPC_Helper<op, SSrc_64, i64, opName, cond>;
 
 class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK <
-  op, (outs SReg_32:$dst), (ins i16imm:$src0),
+  op, (outs SReg_32:$dst), (ins u16imm:$src0),
   opName#" $dst, $src0", pattern
 >;
 
 class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK <
-  op, (outs SReg_64:$dst), (ins i16imm:$src0),
+  op, (outs SReg_64:$dst), (ins u16imm:$src0),
   opName#" $dst, $src0", pattern
 >;
 
-multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
+//===----------------------------------------------------------------------===//
+// SMRD classes
+//===----------------------------------------------------------------------===//
+
+class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :  // Pseudo half of an SMRD instruction pair.
+  SMRD <outs, ins, "", pattern>,  // Empty asm string; this def carries the selection pattern.
+  SIMCInstr<opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+}
+
+class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins,  // SI half of an SMRD instruction pair.
+                    string asm> :
+  SMRD <outs, ins, asm, []>,  // Has the asm string but an empty pattern.
+  SMRDe <op, imm>,  // op and imm are handed to the SMRDe class.
+  SIMCInstr<opName, SISubtarget.SI>;
+
+multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins,  // Emits the pseudo / _si pair for one SMRD variant.
+                   string asm, list<dag> pattern> {
+
+  def "" : SMRD_Pseudo <opName, outs, ins, pattern>;  // Gets the pattern, not the asm string.
+
+  def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>;  // Gets op, imm and the asm string, not the pattern.
+
+}
+
+multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass,
                         RegisterClass dstClass> {
-  def _IMM : SMRD <
-    op, 1, (outs dstClass:$dst),
+  defm _IMM : SMRD_m <
+    op, opName#"_IMM", 1, (outs dstClass:$dst),
     (ins baseClass:$sbase, u32imm:$offset),
-    asm#" $dst, $sbase, $offset", []
+    opName#" $dst, $sbase, $offset", []
   >;
 
-  def _SGPR : SMRD <
-    op, 0, (outs dstClass:$dst),
+  defm _SGPR : SMRD_m <
+    op, opName#"_SGPR", 0, (outs dstClass:$dst),
     (ins baseClass:$sbase, SReg_32:$soff),
-    asm#" $dst, $sbase, $soff", []
+    opName#" $dst, $sbase, $soff", []
   >;
 }
 
@@ -255,6 +426,197 @@
 // Vector ALU classes
 //===----------------------------------------------------------------------===//
 
+// This must always be right before the operand being input modified.
+def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> {  // i32 operand with default value (i32 0).
+  let PrintMethod = "printOperandAndMods";
+}
+def InputModsNoDefault : Operand <i32> {  // Same print method as InputMods, but a plain Operand with no default.
+  let PrintMethod = "printOperandAndMods";
+}
+
+class getNumSrcArgs<ValueType Src1, ValueType Src2> {  // Source count, from which of the two given types is `untyped`.
+  int ret =
+    !if (!eq(Src1.Value, untyped.Value),      1,   // VOP1
+         !if (!eq(Src2.Value, untyped.Value), 2,   // VOP2
+                                              3)); // VOP3
+}
+
+// Returns the register class to use for the destination of VOP[123C]
+// instructions for the given VT.
+class getVALUDstForVT<ValueType VT> {
+  RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64);  // Any size other than 32 selects VReg_64.
+}
+
+// Returns the register class to use for source 0 of VOP[12C]
+// instructions for the given VT.
+class getVOPSrc0ForVT<ValueType VT> {
+  RegisterClass ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64);  // Any size other than 32 selects VSrc_64.
+}
+
+// Returns the register class to use for source 1 of VOP[12C] for the
+// given VT.
+class getVOPSrc1ForVT<ValueType VT> {
+  RegisterClass ret = !if(!eq(VT.Size, 32), VReg_32, VReg_64);  // VReg_* here, whereas source 0 uses VSrc_*.
+}
+
+// Returns the register classes for the source arguments of a VOP[12C]
+// instruction for the given SrcVTs.
+class getInRC32 <list<ValueType> SrcVT> {  // Only SrcVT[0] and SrcVT[1] are read.
+  list<RegisterClass> ret = [
+    getVOPSrc0ForVT<SrcVT[0]>.ret,
+    getVOPSrc1ForVT<SrcVT[1]>.ret
+  ];
+}
+
+// Returns the register class to use for sources of VOP3 instructions for the
+// given VT.
+class getVOP3SrcForVT<ValueType VT> {
+  RegisterClass ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64);  // Any size other than 32 selects VCSrc_64.
+}
+
+// Returns the register classes for the source arguments of a VOP3
+// instruction for the given SrcVTs.
+class getInRC64 <list<ValueType> SrcVT> {  // Indexes SrcVT[0], SrcVT[1] and SrcVT[2].
+  list<RegisterClass> ret = [
+    getVOP3SrcForVT<SrcVT[0]>.ret,
+    getVOP3SrcForVT<SrcVT[1]>.ret,
+    getVOP3SrcForVT<SrcVT[2]>.ret
+  ];
+}
+
+// Returns 1 if a source of type SrcVT has modifiers (f32 or f64), 0 if not.
+class hasModifiers<ValueType SrcVT> {  // VOPProfile passes only its Src0VT here.
+  bit ret = !if(!eq(SrcVT.Value, f32.Value), 1,
+            !if(!eq(SrcVT.Value, f64.Value), 1, 0));
+}
+
+// Returns the input arguments for VOP[12C] instructions for the given source register classes and count.
+class getIns32 <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
+  dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0),               // VOP1
+            !if(!eq(NumSrcArgs, 2), (ins Src0RC:$src0, Src1RC:$src1), // VOP2
+                                    (ins)));  // Any other count: empty ins list.
+}
+
+// Returns the input arguments for VOP3 instructions for the given source register classes and count.
+class getIns64 <RegisterClass Src0RC, RegisterClass Src1RC,
+                RegisterClass Src2RC, int NumSrcArgs,
+                bit HasModifiers> {  // With modifiers, each source is preceded by its *_modifiers operand.
+
+  dag ret =
+    !if (!eq(NumSrcArgs, 1),
+      !if (!eq(HasModifiers, 1),
+        // VOP1 with modifiers
+        (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
+             ClampMod:$clamp, omod:$omod)
+      /* else */,
+        // VOP1 without modifiers
+        (ins Src0RC:$src0)
+      /* endif */ ),
+    !if (!eq(NumSrcArgs, 2),
+      !if (!eq(HasModifiers, 1),
+        // VOP2 with modifiers
+        (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
+             InputModsNoDefault:$src1_modifiers, Src1RC:$src1,
+             ClampMod:$clamp, omod:$omod)
+      /* else */,
+        // VOP2 without modifiers
+        (ins Src0RC:$src0, Src1RC:$src1)
+      /* endif */ )
+    /* NumSrcArgs == 3 */,
+      !if (!eq(HasModifiers, 1),
+        // VOP3 with modifiers
+        (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0,
+             InputModsNoDefault:$src1_modifiers, Src1RC:$src1,
+             InputModsNoDefault:$src2_modifiers, Src2RC:$src2,
+             ClampMod:$clamp, omod:$omod)
+      /* else */,
+        // VOP3 without modifiers
+        (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
+      /* endif */ )));
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP[12C]
+// instruction.  This does not add the _e32 suffix, so it can be reused
+// by getAsm64.
+class getAsm32 <int NumSrcArgs> {
+  string src1 = ", $src1";
+  string src2 = ", $src2";
+  string ret = " $dst, $src0"#
+               !if(!eq(NumSrcArgs, 1), "", src1)#  // $src1 is appended for any count other than 1.
+               !if(!eq(NumSrcArgs, 3), src2, "");  // $src2 is appended only for exactly 3 sources.
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP3
+// instruction.  $clamp and $omod are appended with no separator of their own.
+class getAsm64 <int NumSrcArgs, bit HasModifiers> {
+  string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");  // Comma only when src1 follows.
+  string src1 = !if(!eq(NumSrcArgs, 1), "",
+                   !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
+                                           " $src1_modifiers,"));  // Comma only when src2 follows.
+  string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
+  string ret =
+  !if(!eq(HasModifiers, 0),
+      getAsm32<NumSrcArgs>.ret,  // Without modifiers, reuse the VOP[12C] operand string.
+      " $dst, "#src0#src1#src2#"$clamp"#"$omod");
+}
+
+
+class VOPProfile <list<ValueType> _ArgVT> {  // Derives register classes, ins/outs dags and asm strings from a 4-entry type list.
+
+  field list<ValueType> ArgVT = _ArgVT;  // Order: [dst, src0, src1, src2].
+
+  field ValueType DstVT = ArgVT[0];
+  field ValueType Src0VT = ArgVT[1];
+  field ValueType Src1VT = ArgVT[2];
+  field ValueType Src2VT = ArgVT[3];
+  field RegisterClass DstRC = getVALUDstForVT<DstVT>.ret;
+  field RegisterClass Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;  // *RC32: classes for the 32-bit (_e32) form.
+  field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret;
+  field RegisterClass Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;  // *RC64: classes for the VOP3 form.
+  field RegisterClass Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
+  field RegisterClass Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
+
+  field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret;  // 1, 2 or 3, from which source types are `untyped`.
+  field bit HasModifiers = hasModifiers<Src0VT>.ret;  // Decided by the type of source 0 alone.
+
+  field dag Outs = (outs DstRC:$dst);
+
+  field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
+  field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+                             HasModifiers>.ret;
+
+  field string Asm32 = "_e32"#getAsm32<NumSrcArgs>.ret;  // Includes the "_e32" suffix.
+  field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret;  // No suffix; helpers that want "_e64" add it themselves.
+}
+
+def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;  // One-source profiles; list order is [dst, src0, src1, src2].
+def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>;
+def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>;
+def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>;
+def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>;
+def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>;
+def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
+def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
+def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+
+def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;  // Two-source profiles.
+def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>;
+def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>;
+def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
+def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
+def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> {  // Same types as VOP_I32_I32_I32.
+  let Src0RC32 = VCSrc_32;  // Overrides the computed 32-bit src0 class (VSrc_32 for a 32-bit type).
+}
+def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
+def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
+
+def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;  // Three-source profiles.
+def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
+def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
+def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>;
+
+
 class VOP <string opName> {
   string OpName = opName;
 }
@@ -264,197 +626,310 @@
   bit IsOrig = isOrig;
 }
 
-class SIMCInstr <string pseudo, int subtarget> {
-  string PseudoInstr = pseudo;
-  int Subtarget = subtarget;
+class AtomicNoRet <string noRetOp, bit isRet> {
+  string NoRetOp = noRetOp;
+  bit IsRet = isRet;
 }
 
-multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern,
+class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :
+  VOP1Common <outs, ins, "", pattern>,
+  SIMCInstr<opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+}
+
+multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern,
                    string opName> {
+  def "" : VOP1_Pseudo <outs, ins, pattern, opName>;
 
-  def "" : VOP3Common <outs, ins, "", pattern>, VOP <opName>,
-           SIMCInstr<OpName, SISubtarget.NONE> {
-    let isPseudo = 1;
-  }
+  def _si : VOP1<op.SI, outs, ins, asm, []>,
+            SIMCInstr <opName, SISubtarget.SI>;
+}
 
-  def _si : VOP3 <op, outs, ins, asm, []>, SIMCInstr<opName, SISubtarget.SI>;
+class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> {  // Sets each listed field to 0 when disabled, or leaves it unset (`?`).
+
+  bits<2> src0_modifiers = !if(HasModifiers, ?, 0);
+  bits<2> src1_modifiers = !if(HasModifiers, !if(HasSrc1, ?, 0), 0);  // Unset only with both modifiers and src1.
+  bits<2> src2_modifiers = !if(HasModifiers, !if(HasSrc2, ? ,0) ,0);  // Unset only with both modifiers and src2.
+  bits<2> omod = !if(HasModifiers, ?, 0);
+  bits<1> clamp = !if(HasModifiers, ?, 0);
+  bits<9> src1 = !if(HasSrc1, ?, 0);
+  bits<9> src2 = !if(HasSrc2, ?, 0);
+}
+
+class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> :  // Pseudo half of a VOP3 instruction pair.
+  VOP3Common <outs, ins, "", pattern>,  // Empty asm string; this def carries the selection pattern.
+  VOP <opName>,
+  SIMCInstr<opName, SISubtarget.NONE> {
+  let isPseudo = 1;
+}
+
+class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> :  // SI half of a VOP3 instruction pair.
+  VOP3 <op, outs, ins, asm, []>,  // Has the 9-bit op and asm string, but an empty pattern.
+  SIMCInstr<opName, SISubtarget.SI>;
+
+multiclass VOP3_m <vop3 op, dag outs, dag ins, string asm, list<dag> pattern,
+                   string opName, int NumSrcArgs, bit HasMods = 1> {
+
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+  def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+            VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1),
+                              !if(!eq(NumSrcArgs, 2), 0, 1),
+                              HasMods>;
 
 }
 
-// This must always be right before the operand being input modified.
-def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> {
-  let PrintMethod = "printOperandAndMods";
+multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm,
+                     list<dag> pattern, string opName, bit HasMods = 1> {
+
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
+
+  def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+            VOP3DisableFields<0, 0, HasMods>;
 }
 
-multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
-                        string opName, list<dag> pattern> {
+multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm,
+                     list<dag> pattern, string opName, string revOp,
+                     bit HasMods = 1, bit UseFullOp = 0> {
 
-  def _e32 : VOP1 <
-    op, (outs drc:$dst), (ins src:$src0),
-    opName#"_e32 $dst, $src0", pattern
-  >, VOP <opName>;
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+           VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
 
-  def _e64 : VOP3 <
-    {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
-    (outs drc:$dst),
-    (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod),
-    opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", []
-  >, VOP <opName> {
-    let src1 = SIOperand.ZERO;
-    let src2 = SIOperand.ZERO;
-  }
+  def _si : VOP3_Real_si <op.SI3,
+              outs, ins, asm, opName>,
+            VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>,
+            VOP3DisableFields<1, 0, HasMods>;
 }
 
-multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern>
-  : VOP1_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm,
+                      list<dag> pattern, string opName, string revOp,
+                      bit HasMods = 1, bit UseFullOp = 0> {
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>,
+           VOP2_REV<revOp#"_e64", !eq(revOp, opName)>;
 
-multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern>
-  : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>;
-
-multiclass VOP1_32_64 <bits<8> op, string opName, list<dag> pattern>
-  : VOP1_Helper <op, VReg_32, VSrc_64, opName, pattern>;
-
-multiclass VOP1_64_32 <bits<8> op, string opName, list<dag> pattern>
-  : VOP1_Helper <op, VReg_64, VSrc_32, opName, pattern>;
-
-multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
-                        string opName, list<dag> pattern, string revOp> {
-  def _e32 : VOP2 <
-    op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1),
-    opName#"_e32 $dst, $src0, $src1", pattern
-  >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
-
-  def _e64 : VOP3 <
-    {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
-    (outs vrc:$dst),
-    (ins InputMods:$src0_modifiers, arc:$src0,
-         InputMods:$src1_modifiers, arc:$src1,
-         i32imm:$clamp, i32imm:$omod),
-    opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", []
-  >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
-    let src2 = SIOperand.ZERO;
-  }
+  // The VOP2 variant puts the carry out into VCC, the VOP3 variant
+  // can write it into any SGPR. We currently don't use the carry out,
+  // so for now hardcode it to VCC as well.
+  let sdst = SIOperand.VCC, Defs = [VCC] in {
+    def _si : VOP3b <op.SI3, outs, ins, asm, pattern>,
+              VOP3DisableFields<1, 0, HasMods>,
+              SIMCInstr<opName, SISubtarget.SI>,
+              VOP2_REV<revOp#"_e64_si", !eq(revOp, opName)>;
+  } // End sdst = SIOperand.VCC, Defs = [VCC]
 }
 
-multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern,
-                    string revOp = opName>
-  : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern, revOp>;
+multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm,
+                     list<dag> pattern, string opName,
+                     bit HasMods, bit defExec> {
 
-multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern,
-                    string revOp = opName>
-  : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern, revOp>;
+  def "" : VOP3_Pseudo <outs, ins, pattern, opName>;
 
-multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
-                     RegisterClass src0_rc, string revOp = opName> {
-
-  def _e32 : VOP2 <
-    op, (outs VReg_32:$dst), (ins src0_rc:$src0, VReg_32:$src1),
-    opName#"_e32 $dst, $src0, $src1", pattern
-  >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
-
-  def _e64 : VOP3b <
-    {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
-    (outs VReg_32:$dst),
-    (ins InputMods: $src0_modifiers, VSrc_32:$src0,
-         InputMods:$src1_modifiers, VSrc_32:$src1,
-         i32imm:$clamp, i32imm:$omod),
-    opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", []
-  >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
-    let src2 = SIOperand.ZERO;
-    /* the VOP2 variant puts the carry out into VCC, the VOP3 variant
-       can write it into any SGPR. We currently don't use the carry out,
-       so for now hardcode it to VCC as well */
-    let sdst = SIOperand.VCC;
-  }
-}
-
-multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
-                        string opName, ValueType vt, PatLeaf cond, bit defExec = 0> {
-  def _e32 : VOPC <
-    op, (ins arc:$src0, vrc:$src1),
-    opName#"_e32 $dst, $src0, $src1", []
-  >, VOP <opName> {
-    let Defs = !if(defExec, [VCC, EXEC], [VCC]);
-  }
-
-  def _e64 : VOP3 <
-    {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
-    (outs SReg_64:$dst),
-    (ins InputMods:$src0_modifiers, arc:$src0,
-         InputMods:$src1_modifiers, arc:$src1,
-         InstFlag:$clamp, InstFlag:$omod),
-    opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod",
-    !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>,
-      [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
-    )
-  >, VOP <opName> {
+  def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>,
+              VOP3DisableFields<1, 0, HasMods> {
     let Defs = !if(defExec, [EXEC], []);
-    let src2 = SIOperand.ZERO;
-    let src2_modifiers = 0;
   }
 }
 
-multiclass VOPC_32 <bits<8> op, string opName,
-  ValueType vt = untyped, PatLeaf cond = COND_NULL>
-  : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond>;
+multiclass VOP1_Helper <vop1 op, string opName, dag outs,
+                        dag ins32, string asm32, list<dag> pat32,
+                        dag ins64, string asm64, list<dag> pat64,
+                        bit HasMods> {
 
-multiclass VOPC_64 <bits<8> op, string opName,
-  ValueType vt = untyped, PatLeaf cond = COND_NULL>
-  : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>;
+  def _e32 : VOP1 <op.SI, outs, ins32, opName#asm32, pat32>, VOP<opName>;
 
-multiclass VOPCX_32 <bits<8> op, string opName,
-  ValueType vt = untyped, PatLeaf cond = COND_NULL>
-  : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond, 1>;
+  defm _e64 : VOP3_1_m <op, outs, ins64, opName#"_e64"#asm64, pat64, opName, HasMods>;
+}
 
-multiclass VOPCX_64 <bits<8> op, string opName,
-  ValueType vt = untyped, PatLeaf cond = COND_NULL>
-  : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond, 1>;
-
-multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
-  op, (outs VReg_32:$dst),
-  (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers,
-   VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2,
-   InstFlag:$clamp, InstFlag:$omod),
-  opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName
+multiclass VOP1Inst <vop1 op, string opName, VOPProfile P,
+                     SDPatternOperator node = null_frag> : VOP1_Helper <
+  op, opName, P.Outs,
+  P.Ins32, P.Asm32, [],
+  P.Ins64, P.Asm64,
+  !if(P.HasModifiers,
+      [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+                                i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
+      [(set P.DstVT:$dst, (node P.Src0VT:$src0))]),
+  P.HasModifiers
 >;
 
-class VOP3_64_32 <bits <9> op, string opName, list<dag> pattern> : VOP3 <
-  op, (outs VReg_64:$dst),
-  (ins VSrc_64:$src0, VSrc_32:$src1),
-  opName#" $dst, $src0, $src1", pattern
->, VOP <opName> {
+class VOP2_e32 <bits<6> op, string opName, dag outs, dag ins, string asm,  // 32-bit VOP2 instruction.
+                list<dag> pattern, string revOp> :
+  VOP2 <op, outs, ins, opName#asm, pattern>,  // The asm string is opName followed by `asm`.
+  VOP <opName>,
+  VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;  // Second argument is 1 when revOp equals opName.
 
-  let src2 = SIOperand.ZERO;
-  let src0_modifiers = 0;
-  let clamp = 0;
-  let omod = 0;
+multiclass VOP2_Helper <vop2 op, string opName, dag outs,
+                        dag ins32, string asm32, list<dag> pat32,
+                        dag ins64, string asm64, list<dag> pat64,
+                        string revOp, bit HasMods> {
+  def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>;
+
+  defm _e64 : VOP3_2_m <op,
+    outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods
+  >;
 }
 
-class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
-  op, (outs VReg_64:$dst),
-  (ins InputMods:$src0_modifiers, VSrc_64:$src0,
-       InputMods:$src1_modifiers, VSrc_64:$src1,
-       InputMods:$src2_modifiers, VSrc_64:$src2,
-       InstFlag:$clamp, InstFlag:$omod),
-  opName#" $dst, $src0_modifiers, $src1_modifiers, $src2_modifiers, $clamp, $omod", pattern
->, VOP <opName>;
+multiclass VOP2Inst <vop2 op, string opName, VOPProfile P,  // Instantiates VOP2_Helper from a VOPProfile.
+                     SDPatternOperator node = null_frag,
+                     string revOp = opName> : VOP2_Helper <  // revOp defaults to the instruction's own name.
+  op, opName, P.Outs,
+  P.Ins32, P.Asm32, [],  // The 32-bit form gets an empty pattern list.
+  P.Ins64, P.Asm64,
+  !if(P.HasModifiers,  // 64-bit pattern: with modifiers the sources go through VOP3Mods0 / VOP3Mods.
+      [(set P.DstVT:$dst,
+           (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                      i1:$clamp, i32:$omod)),
+                 (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+      [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+  revOp, P.HasModifiers
+>;
+
+multiclass VOP2b_Helper <vop2 op, string opName, dag outs,  // Emits the _e32 def plus the _e64 defs from VOP3b_2_m.
+                         dag ins32, string asm32, list<dag> pat32,
+                         dag ins64, string asm64, list<dag> pat64,
+                         string revOp, bit HasMods> {
+
+  def _e32 : VOP2_e32 <op.SI, opName, outs, ins32, asm32, pat32, revOp>;  // Uses the 6-bit opcode.
+
+  defm _e64 : VOP3b_2_m <op,  // The whole wrapper is passed on; VOP3b_2_m reads op.SI3.
+    outs, ins64, opName#"_e64"#asm64, pat64, opName, revOp, HasMods
+  >;
+}
+
+multiclass VOP2bInst <vop2 op, string opName, VOPProfile P,  // Instantiates VOP2b_Helper from a VOPProfile.
+                      SDPatternOperator node = null_frag,
+                      string revOp = opName> : VOP2b_Helper <  // Arguments mirror those VOP2Inst passes to VOP2_Helper.
+  op, opName, P.Outs,
+  P.Ins32, P.Asm32, [],  // The 32-bit form gets an empty pattern list.
+  P.Ins64, P.Asm64,
+  !if(P.HasModifiers,  // 64-bit pattern: with modifiers the sources go through VOP3Mods0 / VOP3Mods.
+      [(set P.DstVT:$dst,
+           (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                      i1:$clamp, i32:$omod)),
+                 (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+      [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]),
+  revOp, P.HasModifiers
+>;
+
+multiclass VOPC_Helper <vopc op, string opName,  // Emits the _e32 def plus the _e64 defs from VOP3_C_m.
+                        dag ins32, string asm32, list<dag> pat32,
+                        dag out64, dag ins64, string asm64, list<dag> pat64,
+                        bit HasMods, bit DefExec> {
+  def _e32 : VOPC <op.SI, ins32, opName#asm32, pat32>, VOP <opName> {  // No outs dag is passed for the 32-bit form.
+    let Defs = !if(DefExec, [EXEC], []);  // NOTE(review): the definition this replaces used [VCC, EXEC] / [VCC]; confirm VCC is still recorded.
+  }
+
+  defm _e64 : VOP3_C_m <op, out64, ins64, opName#"_e64"#asm64, pat64, opName,
+                        HasMods, DefExec>;
+}
+
+multiclass VOPCInst <vopc op, string opName,  // Instantiates VOPC_Helper from a VOPProfile and a condition.
+                     VOPProfile P, PatLeaf cond = COND_NULL,
+                     bit DefExec = 0> : VOPC_Helper <
+  op, opName,
+  P.Ins32, P.Asm32, [],  // The 32-bit form gets an empty pattern list.
+  (outs SReg_64:$dst), P.Ins64, P.Asm64,  // The 64-bit outs is fixed to SReg_64; P.Outs is not used.
+  !if(P.HasModifiers,  // 64-bit pattern: setcc on the two sources with `cond`, producing an i1.
+      [(set i1:$dst,
+          (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                      i1:$clamp, i32:$omod)),
+                 (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+                 cond))],
+      [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]),
+  P.HasModifiers, DefExec
+>;
+
+multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> :  // VOPCInst with two f32 sources.
+  VOPCInst <op, opName, VOP_F32_F32_F32, cond>;
+
+multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> :  // VOPCInst with two f64 sources.
+  VOPCInst <op, opName, VOP_F64_F64_F64, cond>;
+
+multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> :  // VOPCInst with two i32 sources.
+  VOPCInst <op, opName, VOP_I32_I32_I32, cond>;
+
+multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> :  // VOPCInst with two i64 sources.
+  VOPCInst <op, opName, VOP_I64_I64_I64, cond>;
 
 
-class VOP3b_Helper <bits<9> op, RegisterClass vrc, RegisterClass arc,
-                    string opName, list<dag> pattern> : VOP3 <
-  op, (outs vrc:$dst0, SReg_64:$dst1),
-  (ins arc:$src0, arc:$src1, arc:$src2,
-   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
-  opName#" $dst0, $dst1, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
->, VOP <opName>;
+multiclass VOPCX <vopc op, string opName, VOPProfile P,
+                  PatLeaf cond = COND_NULL>
+  : VOPCInst <op, opName, P, cond, 1>;
 
+multiclass VOPCX_F32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+  VOPCX <op, opName, VOP_F32_F32_F32, cond>;
 
-class VOP3b_64 <bits<9> op, string opName, list<dag> pattern> :
+multiclass VOPCX_F64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+  VOPCX <op, opName, VOP_F64_F64_F64, cond>;
+
+multiclass VOPCX_I32 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+  VOPCX <op, opName, VOP_I32_I32_I32, cond>;
+
+multiclass VOPCX_I64 <vopc op, string opName, PatLeaf cond = COND_NULL> :
+  VOPCX <op, opName, VOP_I64_I64_I64, cond>;
+
+multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm,  // Thin wrapper over VOP3_m.
+                        list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m <
+    op, outs, ins, opName#asm, pat, opName, NumSrcArgs, HasMods  // Prefixes asm with opName; no "_e64" is added here.
+>;
+
+multiclass VOP3Inst <vop3 op, string opName, VOPProfile P,  // Instantiates VOP3_Helper from a VOPProfile.
+                     SDPatternOperator node = null_frag> : VOP3_Helper <
+  op, opName, P.Outs, P.Ins64, P.Asm64,  // Only the 64-bit ins/asm of the profile are used.
+  !if(!eq(P.NumSrcArgs, 3),  // The pattern is chosen by source count, then by P.HasModifiers.
+    !if(P.HasModifiers,
+        [(set P.DstVT:$dst,
+            (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                       i1:$clamp, i32:$omod)),
+                  (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+                  (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))],
+        [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1,
+                                  P.Src2VT:$src2))]),
+  !if(!eq(P.NumSrcArgs, 2),
+    !if(P.HasModifiers,
+        [(set P.DstVT:$dst,
+            (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                       i1:$clamp, i32:$omod)),
+                  (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
+        [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))])
+  /* P.NumSrcArgs == 1 */,
+    !if(P.HasModifiers,
+        [(set P.DstVT:$dst,
+            (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
+                                       i1:$clamp, i32:$omod))))],
+        [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))),
+  P.NumSrcArgs, P.HasModifiers
+>;
+
+multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterClass arc,  // Three-source VOP3b form with two outputs.
+                    string opName, list<dag> pattern> :
+  VOP3b_2_m <  // NOTE(review): VOP3b_2_m's _si def applies VOP3DisableFields<1, 0, ...> (src2 forced to 0) and hardcodes sdst to VCC, yet this form has $src2 and $sdst operands - confirm intended.
+  op, (outs vrc:$vdst, SReg_64:$sdst),
+      (ins InputModsNoDefault:$src0_modifiers, arc:$src0,
+           InputModsNoDefault:$src1_modifiers, arc:$src1,
+           InputModsNoDefault:$src2_modifiers, arc:$src2,
+           ClampMod:$clamp, omod:$omod),
+  opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern,
+  opName, opName, 1, 1  // revOp = opName, HasMods = 1, UseFullOp = 1.
+>;
+
+multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> :
   VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>;
 
-class VOP3b_32 <bits<9> op, string opName, list<dag> pattern> :
+multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> :
   VOP3b_Helper <op, VReg_32, VSrc_32, opName, pattern>;
 
+
+class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat<  // Maps a three-source `node` with modifiers onto Inst.
+  (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),  // Source 0 also yields clamp and omod.
+        (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+        (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))),
+  (Inst i32:$src0_modifiers, P.Src0VT:$src0,  // Each modifiers operand precedes its source, then clamp, then omod.
+        i32:$src1_modifiers, P.Src1VT:$src1,
+        i32:$src2_modifiers, P.Src2VT:$src2,
+        i1:$clamp,
+        i32:$omod)>;
+
 //===----------------------------------------------------------------------===//
 // Vector I/O classes
 //===----------------------------------------------------------------------===//
@@ -466,13 +941,15 @@
   // Single load interpret the 2 i8imm operands as a single i16 offset.
   let offset0 = offset{7-0};
   let offset1 = offset{15-8};
+
+  let hasSideEffects = 0;
 }
 
 class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
   op,
   (outs regClass:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset),
-  asm#" $vdst, $addr, $offset, [M0]",
+  (ins i1imm:$gds, VReg_32:$addr, ds_offset:$offset),
+  asm#" $vdst, $addr"#"$offset"#" [M0]",
   []> {
   let data0 = 0;
   let data1 = 0;
@@ -483,20 +960,21 @@
 class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
   op,
   (outs regClass:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1),
-  asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]",
+  (ins i1imm:$gds, VReg_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1),
+  asm#" $vdst, $addr"#"$offset0"#"$offset1 [M0]",
   []> {
   let data0 = 0;
   let data1 = 0;
   let mayLoad = 1;
   let mayStore = 0;
+  let hasSideEffects = 0;
 }
 
 class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
   op,
   (outs),
-  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset),
-  asm#" $addr, $data0, $offset [M0]",
+  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, ds_offset:$offset),
+  asm#" $addr, $data0"#"$offset"#" [M0]",
   []> {
   let data1 = 0;
   let mayStore = 1;
@@ -504,76 +982,204 @@
   let vdst = 0;
 }
 
-class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
+class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
   op,
   (outs),
-  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1),
-  asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]",
+  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, regClass:$data1,
+       ds_offset0:$offset0, ds_offset1:$offset1),
+  asm#" $addr, $data0, $data1"#"$offset0"#"$offset1 [M0]",
   []> {
   let mayStore = 1;
   let mayLoad = 0;
+  let hasSideEffects = 0;
   let vdst = 0;
 }
 
 // 1 address, 1 data.
-class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A <
   op,
   (outs rc:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
-  asm#" $vdst, $addr, $data0, $offset, [M0]",
-  []> {
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset),
+  asm#" $vdst, $addr, $data0"#"$offset"#" [M0]", []>,
+  AtomicNoRet<noRetOp, 1> {
 
   let data1 = 0;
   let mayStore = 1;
   let mayLoad = 1;
+
+  let hasPostISelHook = 1; // Adjusted to no return version.
 }
 
 // 1 address, 2 data.
-class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+class DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = ""> : DS_1A <
   op,
   (outs rc:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
-  asm#" $vdst, $addr, $data0, $data1, $offset, [M0]",
-  []> {
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset),
+  asm#" $vdst, $addr, $data0, $data1"#"$offset"#" [M0]",
+  []>,
+  AtomicNoRet<noRetOp, 1> {
   let mayStore = 1;
   let mayLoad = 1;
+  let hasPostISelHook = 1; // Adjusted to no return version.
 }
 
 // 1 address, 2 data.
-class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+class DS_1A2D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A <
   op,
   (outs),
-  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, u16imm:$offset),
-  asm#" $addr, $data0, $data1, $offset, [M0]",
-  []> {
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, rc:$data1, ds_offset:$offset),
+  asm#" $addr, $data0, $data1"#"$offset"#" [M0]",
+  []>,
+  AtomicNoRet<noRetOp, 0> {
   let mayStore = 1;
   let mayLoad = 1;
 }
 
 // 1 address, 1 data.
-class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
+class DS_1A1D_NORET <bits<8> op, string asm, RegisterClass rc, string noRetOp = asm> : DS_1A <
   op,
   (outs),
-  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, u16imm:$offset),
-  asm#" $addr, $data0, $offset, [M0]",
-  []> {
+  (ins i1imm:$gds, VReg_32:$addr, rc:$data0, ds_offset:$offset),
+  asm#" $addr, $data0"#"$offset"#" [M0]",
+  []>,
+  AtomicNoRet<noRetOp, 0> {
 
   let data1 = 0;
   let mayStore = 1;
   let mayLoad = 1;
 }
 
-class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
-  op,
-  (outs),
+//===----------------------------------------------------------------------===//
+// MTBUF classes
+//===----------------------------------------------------------------------===//
+
+class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : // Pseudo half of an MTBUF_m pair.
+  MTBUF <outs, ins, "", pattern>, // Empty asm string; the selection pattern is attached here.
+  SIMCInstr<opName, SISubtarget.NONE> { // Tagged with SISubtarget.NONE under opName.
+  let isPseudo = 1;
+}
+
+class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, // SI half of an MTBUF_m pair.
+                    string asm> :
+  MTBUF <outs, ins, asm, []>, // Carries the asm string; the pattern list is empty.
+  MTBUFe <op>, // NOTE(review): presumably the encoding mixin parameterized by the opcode - confirm against MTBUFe.
+  SIMCInstr<opName, SISubtarget.SI>; // Tagged with SISubtarget.SI under opName.
+
+multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm, // Emits an MTBUF pseudo plus its SI counterpart.
+                    list<dag> pattern> {
+
+  def "" : MTBUF_Pseudo <opName, outs, ins, pattern>; // Unsuffixed record: receives the pattern, no asm string.
+
+  def _si : MTBUF_Real_si <op, opName, outs, ins, asm>; // `_si` record: receives the opcode and asm string, no pattern.
+
+}
+
+let mayStore = 1, mayLoad = 0 in { // Applies to both records produced by MTBUF_m below.
+
+multiclass MTBUF_Store_Helper <bits<3> op, string opName, // MTBUF store: no results, $vdata is the first input operand.
+                               RegisterClass regClass> : MTBUF_m <
+  op, opName, (outs),
   (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
    i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
    SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
-  asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
-     #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
-  []> {
-  let mayStore = 1;
-  let mayLoad = 0;
+  opName#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
+        #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
+>; // The trailing [] is the (empty) pattern list passed to MTBUF_m.
+
+} // mayStore = 1, mayLoad = 0
+
+let mayLoad = 1, mayStore = 0 in { // Applies to both records produced by MTBUF_m below.
+
+multiclass MTBUF_Load_Helper <bits<3> op, string opName, // MTBUF load: a single result $dst in regClass.
+                              RegisterClass regClass> : MTBUF_m <
+  op, opName, (outs regClass:$dst),
+  (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, // Every encoding field is an explicit operand.
+       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
+       i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+  opName#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
+        #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []
+>; // The trailing [] is the (empty) pattern list passed to MTBUF_m.
+
+} // mayLoad = 1, mayStore = 0
+
+class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { // Tag class; it is the FilterClass of the getAddr64Inst mapping.
+
+  bit IsAddr64 = is_addr64; // Column field of getAddr64Inst.
+  string OpName = NAME # suffix; // Row field of getAddr64Inst: NAME with `suffix` appended.
+}
+
+class MUBUFAtomicAddr64 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> // MUBUF atomic with the addr64 field set.
+    : MUBUF <op, outs, ins, asm, pattern> {
+
+  let offen = 0;
+  let idxen = 0;
+  let addr64 = 1;
+  let tfe = 0;
+  let lds = 0;
+  let soffset = 128; // Fixed value; other `soffset = 128` sites in this file annotate it as ZERO.
+}
+
+class MUBUFAtomicOffset <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> // MUBUF atomic with addr64, offen and idxen all cleared.
+    : MUBUF <op, outs, ins, asm, pattern> {
+
+  let offen = 0;
+  let idxen = 0;
+  let addr64 = 0;
+  let tfe = 0;
+  let lds = 0;
+  let vaddr = 0; // The _OFFSET records in MUBUF_Atomic pass no $vaddr operand, so the field is pinned to 0.
+}
+
+multiclass MUBUF_Atomic <bits<7> op, string name, RegisterClass rc, // Emits four records per atomic: {no-return, _RTN} x {_ADDR64, _OFFSET}.
+                         ValueType vt, SDPatternOperator atomic> {
+
+  let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in {
+
+    // No return variants
+    let glc = 0 in { // These records have no result and an empty pattern list.
+
+      def _ADDR64 : MUBUFAtomicAddr64 <
+        op, (outs),
+        (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
+             mbuf_offset:$offset, slc:$slc),
+        name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#"$slc", []
+      >, MUBUFAddr64Table<1>, AtomicNoRet<NAME#"_ADDR64", 0>; // Shares its AtomicNoRet key with _RTN_ADDR64.
+
+      def _OFFSET : MUBUFAtomicOffset <
+        op, (outs),
+        (ins rc:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
+             SSrc_32:$soffset, slc:$slc),
+        name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", []
+      >, MUBUFAddr64Table<0>, AtomicNoRet<NAME#"_OFFSET", 0>; // Shares its AtomicNoRet key with _RTN_OFFSET.
+    } // glc = 0
+
+    // Variants that return values
+    let glc = 1, Constraints = "$vdata = $vdata_in", // The result $vdata is tied to the input $vdata_in.
+        DisableEncoding = "$vdata_in"  in {
+
+      def _RTN_ADDR64 : MUBUFAtomicAddr64 <
+        op, (outs rc:$vdata),
+        (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr,
+             mbuf_offset:$offset, slc:$slc),
+        name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset"#" glc"#"$slc",
+        [(set vt:$vdata,
+         (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i16:$offset,
+                                    i1:$slc), vt:$vdata_in))]
+      >, MUBUFAddr64Table<1, "_RTN">, AtomicNoRet<NAME#"_ADDR64", 1>;
+
+      def _RTN_OFFSET : MUBUFAtomicOffset <
+        op, (outs rc:$vdata),
+        (ins rc:$vdata_in, SReg_128:$srsrc, mbuf_offset:$offset,
+             SSrc_32:$soffset, slc:$slc),
+        name#" $vdata, $srsrc, $soffset"#"$offset"#" glc"#"$slc", // `$slc` is appended without a literal space, matching _RTN_ADDR64 and the no-return variants.
+        [(set vt:$vdata,
+         (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
+                                    i1:$slc), vt:$vdata_in))]
+      >, MUBUFAddr64Table<0, "_RTN">, AtomicNoRet<NAME#"_OFFSET", 1>;
+
+    } // glc = 1
+
+  } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1
 }
 
 multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass,
@@ -584,81 +1190,137 @@
 
     let addr64 = 0 in {
 
-      let offen = 0, idxen = 0 in {
+      let offen = 0, idxen = 0, vaddr = 0 in {
         def _OFFSET : MUBUF <op, (outs regClass:$vdata),
-                             (ins SReg_128:$srsrc, VReg_32:$vaddr,
-                             u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
-                             i1imm:$slc, i1imm:$tfe),
-                             asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+                             (ins SReg_128:$srsrc,
+                             mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc,
+                             slc:$slc, tfe:$tfe),
+                             asm#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+                             [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc,
+                                                       i32:$soffset, i16:$offset,
+                                                       i1:$glc, i1:$slc, i1:$tfe)))]>,
+                     MUBUFAddr64Table<0>;
       }
 
-      let offen = 1, idxen = 0, offset = 0 in {
+      let offen = 1, idxen = 0  in {
         def _OFFEN  : MUBUF <op, (outs regClass:$vdata),
                              (ins SReg_128:$srsrc, VReg_32:$vaddr,
-                             SSrc_32:$soffset, i1imm:$glc, i1imm:$slc,
-                             i1imm:$tfe),
-                             asm#" $vdata, $srsrc + $vaddr + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+                             SSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc,
+                             tfe:$tfe),
+                             asm#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
       }
 
       let offen = 0, idxen = 1 in {
         def _IDXEN  : MUBUF <op, (outs regClass:$vdata),
                              (ins SReg_128:$srsrc, VReg_32:$vaddr,
-                             u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
-                             i1imm:$slc, i1imm:$tfe),
-                             asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+                             mbuf_offset:$offset, SSrc_32:$soffset, glc:$glc,
+                             slc:$slc, tfe:$tfe),
+                             asm#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>;
       }
 
       let offen = 1, idxen = 1 in {
         def _BOTHEN : MUBUF <op, (outs regClass:$vdata),
                              (ins SReg_128:$srsrc, VReg_64:$vaddr,
-                             SSrc_32:$soffset, i1imm:$glc,
-                             i1imm:$slc, i1imm:$tfe),
-                             asm#" $vdata, $srsrc[$vaddr[0]] + $vaddr[1] + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
+                             SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
+                             asm#" $vdata, $vaddr, $srsrc, $soffset, idxen offen"#"$glc"#"$slc"#"$tfe", []>;
       }
     }
 
     let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
       def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
-                           (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
-                           asm#" $vdata, $srsrc + $vaddr + $offset",
+                           (ins SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
+                           asm#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
                            [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc,
-                                                  i64:$vaddr, u16imm:$offset)))]>;
+                                                  i64:$vaddr, i16:$offset)))]>, MUBUFAddr64Table<1>;
     }
   }
 }
 
-class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
-                          ValueType store_vt, SDPatternOperator st> :
-    MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
-                            u16imm:$offset),
-          name#" $vdata, $srsrc + $vaddr + $offset",
-          [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, u16imm:$offset))]> {
+multiclass MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass, // Emits four store records: unsuffixed, _OFFSET, _OFFEN and _ADDR64.
+                          ValueType store_vt, SDPatternOperator st> {
+
+  let addr64 = 0, lds = 0 in { // NOTE(review): unlike _ADDR64 below, records in this scope set neither mayLoad nor mayStore explicitly - confirm that is intended.
+
+    def "" : MUBUF < // Unsuffixed form: offen and idxen are explicit operands; no pattern.
+      op, (outs),
+      (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
+           mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc,
+           tfe:$tfe),
+      name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"#
+           "$glc"#"$slc"#"$tfe",
+      []
+    >;
+
+    let offen = 0, idxen = 0, vaddr = 0 in {
+      def _OFFSET : MUBUF < // No $vaddr operand; selected through the MUBUFOffset pattern below.
+        op, (outs),
+        (ins vdataClass:$vdata, SReg_128:$srsrc, mbuf_offset:$offset,
+              SSrc_32:$soffset, glc:$glc, slc:$slc, tfe:$tfe),
+        name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe",
+        [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+                                           i16:$offset, i1:$glc, i1:$slc,
+                                           i1:$tfe))]
+      >, MUBUFAddr64Table<0>; // IsAddr64 = 0 entry of the getAddr64Inst mapping.
+    } // offen = 0, idxen = 0, vaddr = 0
+
+    let offen = 1, idxen = 0  in {
+      def _OFFEN  : MUBUF < // offen form; the asm string prints a literal "offen"; no pattern.
+        op, (outs),
+        (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_32:$vaddr, SSrc_32:$soffset,
+             mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe),
+        name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#
+            "$glc"#"$slc"#"$tfe",
+        []
+      >;
+    } // end offen = 1, idxen = 0
+
+  } // End addr64 = 0, lds = 0
+
+  def _ADDR64 : MUBUF < // addr64 form; selected through the MUBUFAddr64 pattern below.
+    op, (outs),
+    (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, mbuf_offset:$offset),
+    name#" $vdata, $vaddr, $srsrc, 0 addr64"#"$offset",
+    [(st store_vt:$vdata,
+     (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))]>, MUBUFAddr64Table<1>
+     {
+
+      let mayLoad = 0;
+      let mayStore = 1;
+
+      // Encoding
+      let offen = 0;
+      let idxen = 0;
+      let glc = 0;
+      let addr64 = 1;
+      let lds = 0;
+      let slc = 0;
+      let tfe = 0;
+      let soffset = 128; // ZERO
+   }
+}
+
+class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : // FLAT instruction marked mayLoad, with one result and one address operand.
+      FLAT <op, (outs regClass:$data),
+                (ins VReg_64:$addr), // The only input is the VReg_64 address.
+            asm#" $data, $addr, [M0, FLAT_SCRATCH]", []> { // M0 and FLAT_SCRATCH appear as literal text in the asm string; no pattern.
+  let glc = 0;
+  let slc = 0;
+  let tfe = 0;
+  let mayLoad = 1;
+}
+
+class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+      FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr),
+          name#" $data, $addr, [M0, FLAT_SCRATCH]",
+         []> {
 
   let mayLoad = 0;
   let mayStore = 1;
 
   // Encoding
-  let offen = 0;
-  let idxen = 0;
   let glc = 0;
-  let addr64 = 1;
-  let lds = 0;
   let slc = 0;
   let tfe = 0;
-  let soffset = 128; // ZERO
-}
-
-class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
-  op,
-  (outs regClass:$dst),
-  (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
-       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
-       i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
-  asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
-     #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
-  []> {
-  let mayLoad = 1;
-  let mayStore = 0;
 }
 
 class MIMG_Mask <string op, int channels> {
@@ -799,6 +1461,15 @@
   let ValueCols = [["8"]];
 }
 
+// Maps an opcode in e64 form to its e32 equivalent
+def getVOPe32 : InstrMapping {
+  let FilterClass = "VOP"; // Only records deriving from VOP take part.
+  let RowFields = ["OpName"]; // Records with the same OpName form one row.
+  let ColFields = ["Size"]; // Columns are distinguished by the Size field.
+  let KeyCol = ["8"]; // Lookup key: the Size = 8 record of a row.
+  let ValueCols = [["4"]]; // Result: the Size = 4 record of the same row.
+}
+
 // Maps an original opcode to its commuted version
 def getCommuteRev : InstrMapping {
   let FilterClass = "VOP2_REV";
@@ -841,4 +1512,30 @@
   let ValueCols = [[!cast<string>(SISubtarget.SI)]];
 }
 
+def getAddr64Inst : InstrMapping { // Maps a MUBUF opcode with IsAddr64 = 0 to the IsAddr64 = 1 opcode with the same OpName.
+  let FilterClass = "MUBUFAddr64Table"; // Only records tagged with MUBUFAddr64Table take part.
+  let RowFields = ["OpName"]; // Records with the same OpName form one row.
+  let ColFields = ["IsAddr64"]; // Columns are distinguished by the IsAddr64 bit.
+  let KeyCol = ["0"]; // Lookup key: the IsAddr64 = 0 record.
+  let ValueCols = [["1"]]; // Result: the IsAddr64 = 1 record.
+}
+
+// Maps an atomic opcode to its version with a return value.
+def getAtomicRetOp : InstrMapping {
+  let FilterClass = "AtomicNoRet"; // Only records tagged with AtomicNoRet take part.
+  let RowFields = ["NoRetOp"]; // Records with the same NoRetOp string form one row.
+  let ColFields = ["IsRet"]; // Columns are distinguished by the IsRet field.
+  let KeyCol = ["0"]; // Lookup key: the IsRet = 0 record.
+  let ValueCols = [["1"]]; // Result: the IsRet = 1 record.
+}
+
+// Maps an atomic opcode to its returnless version.
+def getAtomicNoRetOp : InstrMapping {
+  let FilterClass = "AtomicNoRet"; // Only records tagged with AtomicNoRet take part.
+  let RowFields = ["NoRetOp"]; // Records with the same NoRetOp string form one row.
+  let ColFields = ["IsRet"]; // Columns are distinguished by the IsRet field.
+  let KeyCol = ["1"]; // Lookup key: the IsRet = 1 record.
+  let ValueCols = [["0"]]; // Result: the IsRet = 0 record.
+}
+
 include "SIInstructions.td"

diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index b3b44e2..90da7a9 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td

@@ -31,13 +31,25 @@
 
 def isCI : Predicate<"Subtarget.getGeneration() "
                       ">= AMDGPUSubtarget::SEA_ISLANDS">;
+def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">;
 
-def isCFDepth0 : Predicate<"isCFDepth0()">;
+def SWaitMatchClass : AsmOperandClass { // Assembly-parser operand class; WAIT_FLAG uses it as its ParserMatchClass.
+  let Name = "SWaitCnt";
+  let RenderMethod = "addImmOperands"; // Name of the method that adds the parsed operand to the instruction.
+  let ParserMethod = "parseSWaitCntOps"; // Name of the custom parsing method for this operand.
+}
 
-def WAIT_FLAG : InstFlag<"printWaitFlag">;
+def WAIT_FLAG : InstFlag<"printWaitFlag"> { // Operand printed through printWaitFlag.
+  let ParserMatchClass = SWaitMatchClass; // The assembly parser matches it through SWaitMatchClass.
+}
 
 let SubtargetPredicate = isSI in {
-let OtherPredicates  = [isCFDepth0] in {
+
+//===----------------------------------------------------------------------===//
+// EXP Instructions
+//===----------------------------------------------------------------------===//
+
+defm EXP : EXP_m;
 
 //===----------------------------------------------------------------------===//
 // SMRD Instructions
@@ -48,125 +60,126 @@
 // We are using the SGPR_32 and not the SReg_32 register class for 32-bit
 // SMRD instructions, because the SGPR_32 register class does not include M0
 // and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
-defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
-defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
-defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
-defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>;
 
 defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
-  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
+  0x08, "s_buffer_load_dword", SReg_128, SGPR_32
 >;
 
 defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
-  0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
+  0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64
 >;
 
 defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
-  0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
+  0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128
 >;
 
 defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
-  0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
+  0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256
 >;
 
 defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
-  0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
+  0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512
 >;
 
 } // mayLoad = 1
 
-//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
-//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
+//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
+//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>;
 
 //===----------------------------------------------------------------------===//
 // SOP1 Instructions
 //===----------------------------------------------------------------------===//
 
-let neverHasSideEffects = 1 in {
-
 let isMoveImm = 1 in {
-def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>;
-def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>;
-def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
-def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
+def S_MOV_B32 : SOP1_32 <0x00000003, "s_mov_b32", []>;
+def S_MOV_B64 : SOP1_64 <0x00000004, "s_mov_b64", []>;
+def S_CMOV_B32 : SOP1_32 <0x00000005, "s_cmov_b32", []>;
+def S_CMOV_B64 : SOP1_64 <0x00000006, "s_cmov_b64", []>;
 } // End isMoveImm = 1
 
-def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32",
+def S_NOT_B32 : SOP1_32 <0x00000007, "s_not_b32",
   [(set i32:$dst, (not i32:$src0))]
 >;
 
-def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64",
+def S_NOT_B64 : SOP1_64 <0x00000008, "s_not_b64",
   [(set i64:$dst, (not i64:$src0))]
 >;
-def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
-def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
-def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32",
+def S_WQM_B32 : SOP1_32 <0x00000009, "s_wqm_b32", []>;
+def S_WQM_B64 : SOP1_64 <0x0000000a, "s_wqm_b64", []>;
+def S_BREV_B32 : SOP1_32 <0x0000000b, "s_brev_b32",
   [(set i32:$dst, (AMDGPUbrev i32:$src0))]
 >;
-def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
-} // End neverHasSideEffects = 1
+def S_BREV_B64 : SOP1_64 <0x0000000c, "s_brev_b64", []>;
 
-////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
-////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
-def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "S_BCNT1_I32_B32",
+////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "s_bcnt0_i32_b32", []>;
+////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "s_bcnt0_i32_b64", []>;
+def S_BCNT1_I32_B32 : SOP1_32 <0x0000000f, "s_bcnt1_i32_b32",
   [(set i32:$dst, (ctpop i32:$src0))]
 >;
-def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "S_BCNT1_I32_B64", []>;
+def S_BCNT1_I32_B64 : SOP1_32_64 <0x00000010, "s_bcnt1_i32_b64", []>;
 
-////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "S_FF0_I32_B32", []>;
-////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
-def S_FF1_I32_B32 : SOP1_32 <0x00000013, "S_FF1_I32_B32",
+////def S_FF0_I32_B32 : SOP1_32 <0x00000011, "s_ff0_i32_b32", []>;
+////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "s_ff0_i32_b64", []>;
+def S_FF1_I32_B32 : SOP1_32 <0x00000013, "s_ff1_i32_b32",
   [(set i32:$dst, (cttz_zero_undef i32:$src0))]
 >;
-////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
+////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "s_ff1_i32_b64", []>;
 
-def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32",
+def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "s_flbit_i32_b32",
   [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
 >;
 
-//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
-def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
-//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
-def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8",
+//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "s_flbit_i32_b64", []>;
+def S_FLBIT_I32 : SOP1_32 <0x00000017, "s_flbit_i32", []>;
+//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "s_flbit_i32_i64", []>;
+def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "s_sext_i32_i8",
   [(set i32:$dst, (sext_inreg i32:$src0, i8))]
 >;
-def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16",
+def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "s_sext_i32_i16",
   [(set i32:$dst, (sext_inreg i32:$src0, i16))]
 >;
 
-////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
-////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
-////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
-////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
-def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>;
-def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
-def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
-def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
+////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "s_bitset0_b32", []>;
+////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "s_bitset0_b64", []>;
+////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "s_bitset1_b32", []>;
+////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "s_bitset1_b64", []>;
+def S_GETPC_B64 : SOP1 < // Uses SOP1 directly, with an empty `ins` list and a single SReg_64 result.
+  0x0000001f, (outs SReg_64:$dst), (ins), "s_getpc_b64 $dst", []
+> {
+  let SSRC0 = 0; // No source operand is declared, so the SSRC0 field gets a fixed value.
+}
+def S_SETPC_B64 : SOP1_64 <0x00000020, "s_setpc_b64", []>;
+def S_SWAPPC_B64 : SOP1_64 <0x00000021, "s_swappc_b64", []>;
+def S_RFE_B64 : SOP1_64 <0x00000022, "s_rfe_b64", []>;
 
 let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in {
 
-def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>;
-def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>;
-def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>;
-def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>;
-def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>;
-def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>;
-def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>;
-def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>;
+def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "s_and_saveexec_b64", []>;
+def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "s_or_saveexec_b64", []>;
+def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "s_xor_saveexec_b64", []>;
+def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "s_andn2_saveexec_b64", []>;
+def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "s_orn2_saveexec_b64", []>;
+def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "s_nand_saveexec_b64", []>;
+def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "s_nor_saveexec_b64", []>;
+def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "s_xnor_saveexec_b64", []>;
 
 } // End hasSideEffects = 1
 
-def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>;
-def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>;
-def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>;
-def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>;
-def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>;
-def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
-//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>;
-def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
-def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
-def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
+def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "s_quadmask_b32", []>;
+def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "s_quadmask_b64", []>;
+def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "s_movrels_b32", []>;
+def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "s_movrels_b64", []>;
+def S_MOVRELD_B32 : SOP1_32 <0x00000030, "s_movreld_b32", []>;
+def S_MOVRELD_B64 : SOP1_64 <0x00000031, "s_movreld_b64", []>;
+//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "s_cbranch_join", []>;
+def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "s_mov_regrd_b32", []>;
+def S_ABS_I32 : SOP1_32 <0x00000034, "s_abs_i32", []>;
+def S_MOV_FED_B32 : SOP1_32 <0x00000035, "s_mov_fed_b32", []>;
 
 //===----------------------------------------------------------------------===//
 // SOP2 Instructions
@@ -174,145 +187,150 @@
 
 let Defs = [SCC] in { // Carry out goes to SCC
 let isCommutable = 1 in {
-def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
-def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
+def S_ADD_U32 : SOP2_32 <0x00000000, "s_add_u32", []>;
+def S_ADD_I32 : SOP2_32 <0x00000002, "s_add_i32",
   [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
 >;
 } // End isCommutable = 1
 
-def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
-def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
+def S_SUB_U32 : SOP2_32 <0x00000001, "s_sub_u32", []>;
+def S_SUB_I32 : SOP2_32 <0x00000003, "s_sub_i32",
   [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
 >;
 
 let Uses = [SCC] in { // Carry in comes from SCC
 let isCommutable = 1 in {
-def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
+def S_ADDC_U32 : SOP2_32 <0x00000004, "s_addc_u32",
   [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
 } // End isCommutable = 1
 
-def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
+def S_SUBB_U32 : SOP2_32 <0x00000005, "s_subb_u32",
   [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
 } // End Uses = [SCC]
 } // End Defs = [SCC]
 
-def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32",
+def S_MIN_I32 : SOP2_32 <0x00000006, "s_min_i32",
   [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
 >;
-def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32",
+def S_MIN_U32 : SOP2_32 <0x00000007, "s_min_u32",
   [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
 >;
-def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32",
+def S_MAX_I32 : SOP2_32 <0x00000008, "s_max_i32",
   [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
 >;
-def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32",
+def S_MAX_U32 : SOP2_32 <0x00000009, "s_max_u32",
   [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
 >;
 
-def S_CSELECT_B32 : SOP2 <
-  0x0000000a, (outs SReg_32:$dst),
-  (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
+def S_CSELECT_B32 : SOP2_SELECT_32 <
+  0x0000000a, "s_cselect_b32",
   []
 >;
 
-def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
+def S_CSELECT_B64 : SOP2_64 <0x0000000b, "s_cselect_b64", []>;
 
-def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32",
+def S_AND_B32 : SOP2_32 <0x0000000e, "s_and_b32",
   [(set i32:$dst, (and i32:$src0, i32:$src1))]
 >;
 
-def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
+def S_AND_B64 : SOP2_64 <0x0000000f, "s_and_b64",
   [(set i64:$dst, (and i64:$src0, i64:$src1))]
 >;
 
-def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
+def S_OR_B32 : SOP2_32 <0x00000010, "s_or_b32",
   [(set i32:$dst, (or i32:$src0, i32:$src1))]
 >;
 
-def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
+def S_OR_B64 : SOP2_64 <0x00000011, "s_or_b64",
   [(set i64:$dst, (or i64:$src0, i64:$src1))]
 >;
 
-def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32",
+def S_XOR_B32 : SOP2_32 <0x00000012, "s_xor_b32",
   [(set i32:$dst, (xor i32:$src0, i32:$src1))]
 >;
 
-def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
+def S_XOR_B64 : SOP2_64 <0x00000013, "s_xor_b64",
   [(set i64:$dst, (xor i64:$src0, i64:$src1))]
 >;
-def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
-def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
-def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
-def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
-def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
-def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
-def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
-def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
-def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
-def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
+def S_ANDN2_B32 : SOP2_32 <0x00000014, "s_andn2_b32", []>;
+def S_ANDN2_B64 : SOP2_64 <0x00000015, "s_andn2_b64", []>;
+def S_ORN2_B32 : SOP2_32 <0x00000016, "s_orn2_b32", []>;
+def S_ORN2_B64 : SOP2_64 <0x00000017, "s_orn2_b64", []>;
+def S_NAND_B32 : SOP2_32 <0x00000018, "s_nand_b32", []>;
+def S_NAND_B64 : SOP2_64 <0x00000019, "s_nand_b64", []>;
+def S_NOR_B32 : SOP2_32 <0x0000001a, "s_nor_b32", []>;
+def S_NOR_B64 : SOP2_64 <0x0000001b, "s_nor_b64", []>;
+def S_XNOR_B32 : SOP2_32 <0x0000001c, "s_xnor_b32", []>;
+def S_XNOR_B64 : SOP2_64 <0x0000001d, "s_xnor_b64", []>;
 
 // Use added complexity so these patterns are preferred to the VALU patterns.
 let AddedComplexity = 1 in {
 
-def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "s_lshl_b32",
   [(set i32:$dst, (shl i32:$src0, i32:$src1))]
 >;
-def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
+def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "s_lshl_b64",
   [(set i64:$dst, (shl i64:$src0, i32:$src1))]
 >;
-def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
+def S_LSHR_B32 : SOP2_32 <0x00000020, "s_lshr_b32",
   [(set i32:$dst, (srl i32:$src0, i32:$src1))]
 >;
-def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
+def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "s_lshr_b64",
   [(set i64:$dst, (srl i64:$src0, i32:$src1))]
 >;
-def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
+def S_ASHR_I32 : SOP2_32 <0x00000022, "s_ashr_i32",
   [(set i32:$dst, (sra i32:$src0, i32:$src1))]
 >;
-def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
+def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "s_ashr_i64",
   [(set i64:$dst, (sra i64:$src0, i32:$src1))]
 >;
 
+
+def S_BFM_B32 : SOP2_32 <0x00000024, "s_bfm_b32", []>;
+def S_BFM_B64 : SOP2_64 <0x00000025, "s_bfm_b64", []>;
+def S_MUL_I32 : SOP2_32 <0x00000026, "s_mul_i32",
+  [(set i32:$dst, (mul i32:$src0, i32:$src1))]
+>;
+
 } // End AddedComplexity = 1
 
-def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
-def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
-def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
-def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
-def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
-def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
-def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
-//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
-def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
+def S_BFE_U32 : SOP2_32 <0x00000027, "s_bfe_u32", []>;
+def S_BFE_I32 : SOP2_32 <0x00000028, "s_bfe_i32", []>;
+def S_BFE_U64 : SOP2_64 <0x00000029, "s_bfe_u64", []>;
+def S_BFE_I64 : SOP2_64_32 <0x0000002a, "s_bfe_i64", []>;
+//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "s_cbranch_g_fork", []>;
+def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "s_absdiff_i32", []>;
 
 //===----------------------------------------------------------------------===//
 // SOPC Instructions
 //===----------------------------------------------------------------------===//
 
-def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">;
-def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">;
-def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">;
-def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">;
-def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">;
-def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">;
-def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">;
-def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">;
-def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">;
-def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32">;
-def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">;
-def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">;
-////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
-////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
-////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
-////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
-//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
+def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">;
+def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">;
+def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">;
+def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">;
+def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">;
+def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">;
+def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">;
+def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">;
+def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">;
+def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">;
+def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">;
+def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">;
+////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>;
+////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>;
+////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>;
+////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>;
+//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>;
 
 //===----------------------------------------------------------------------===//
 // SOPK Instructions
 //===----------------------------------------------------------------------===//
 
-def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
-def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
+let isReMaterializable = 1 in {
+def S_MOVK_I32 : SOPK_32 <0x00000000, "s_movk_i32", []>;
+} // End isReMaterializable = 1
+def S_CMOVK_I32 : SOPK_32 <0x00000002, "s_cmovk_i32", []>;
 
 /*
 This instruction is disabled for now until we can figure out how to teach
@@ -328,94 +346,87 @@
 
 def S_CMPK_EQ_I32 : SOPK <
   0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
-  "S_CMPK_EQ_I32",
+  "s_cmpk_eq_i32",
   [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
 >;
 */
 
 let isCompare = 1, Defs = [SCC] in {
-def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
-def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
-def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
-def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>;
-def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>;
-def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
-def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
-def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
-def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
-def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
-def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
+def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "s_cmpk_lg_i32", []>;
+def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "s_cmpk_gt_i32", []>;
+def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "s_cmpk_ge_i32", []>;
+def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "s_cmpk_lt_i32", []>;
+def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "s_cmpk_le_i32", []>;
+def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "s_cmpk_eq_u32", []>;
+def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "s_cmpk_lg_u32", []>;
+def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "s_cmpk_gt_u32", []>;
+def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "s_cmpk_ge_u32", []>;
+def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "s_cmpk_lt_u32", []>;
+def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "s_cmpk_le_u32", []>;
 } // End isCompare = 1, Defs = [SCC]
 
 let Defs = [SCC], isCommutable = 1 in {
-  def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
-  def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+  def S_ADDK_I32 : SOPK_32 <0x0000000f, "s_addk_i32", []>;
+  def S_MULK_I32 : SOPK_32 <0x00000010, "s_mulk_i32", []>;
 }
 
-//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
-def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
-def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
-def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
-//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
-//def EXP : EXP_ <0x00000000, "EXP", []>;
-
-} // End let OtherPredicates = [isCFDepth0]
+//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "s_cbranch_i_fork", []>;
+def S_GETREG_B32 : SOPK_32 <0x00000012, "s_getreg_b32", []>;
+def S_SETREG_B32 : SOPK_32 <0x00000013, "s_setreg_b32", []>;
+def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "s_getreg_regrd_b32", []>;
+//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "s_setreg_imm32_b32", []>;
+//def EXP : EXP_ <0x00000000, "exp", []>;
 
 //===----------------------------------------------------------------------===//
 // SOPP Instructions
 //===----------------------------------------------------------------------===//
 
-def S_NOP : SOPP <0x00000000, (ins i16imm:$SIMM16), "S_NOP $SIMM16", []>;
+def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
 
 let isTerminator = 1 in {
 
-def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
+def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
   [(IL_retflag)]> {
-  let SIMM16 = 0;
+  let simm16 = 0;
   let isBarrier = 1;
   let hasCtrlDep = 1;
 }
 
 let isBranch = 1 in {
 def S_BRANCH : SOPP <
-  0x00000002, (ins brtarget:$target), "S_BRANCH $target",
-  [(br bb:$target)]> {
+  0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
+  [(br bb:$simm16)]> {
   let isBarrier = 1;
 }
 
 let DisableEncoding = "$scc" in {
 def S_CBRANCH_SCC0 : SOPP <
-  0x00000004, (ins brtarget:$target, SCCReg:$scc),
-  "S_CBRANCH_SCC0 $target", []
+  0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc),
+  "s_cbranch_scc0 $simm16"
 >;
 def S_CBRANCH_SCC1 : SOPP <
-  0x00000005, (ins brtarget:$target, SCCReg:$scc),
-  "S_CBRANCH_SCC1 $target",
-  []
+  0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc),
+  "s_cbranch_scc1 $simm16"
 >;
 } // End DisableEncoding = "$scc"
 
 def S_CBRANCH_VCCZ : SOPP <
-  0x00000006, (ins brtarget:$target, VCCReg:$vcc),
-  "S_CBRANCH_VCCZ $target",
-  []
+  0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
+  "s_cbranch_vccz $simm16"
 >;
 def S_CBRANCH_VCCNZ : SOPP <
-  0x00000007, (ins brtarget:$target, VCCReg:$vcc),
-  "S_CBRANCH_VCCNZ $target",
-  []
+  0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc),
+  "s_cbranch_vccnz $simm16"
 >;
 
 let DisableEncoding = "$exec" in {
 def S_CBRANCH_EXECZ : SOPP <
-  0x00000008, (ins brtarget:$target, EXECReg:$exec),
-  "S_CBRANCH_EXECZ $target",
-  []
+  0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec),
+  "s_cbranch_execz $simm16"
 >;
 def S_CBRANCH_EXECNZ : SOPP <
-  0x00000009, (ins brtarget:$target, EXECReg:$exec),
-  "S_CBRANCH_EXECNZ $target",
-  []
+  0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec),
+  "s_cbranch_execnz $simm16"
 >;
 } // End DisableEncoding = "$exec"
 
@@ -424,37 +435,39 @@
 } // End isTerminator = 1
 
 let hasSideEffects = 1 in {
-def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
+def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
   [(int_AMDGPU_barrier_local)]
 > {
-  let SIMM16 = 0;
+  let simm16 = 0;
   let isBarrier = 1;
   let hasCtrlDep = 1;
   let mayLoad = 1;
   let mayStore = 1;
 }
 
-def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
-  []
->;
-//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
-//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
-//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
+def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
+def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">;
+def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">; // operand named simm16 to match the SOPP simm16 field
 
 let Uses = [EXEC] in {
-  def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16",
+  def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "s_sendmsg $simm16",
       [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
   > {
     let DisableEncoding = "$m0";
   }
 } // End Uses = [EXEC]
 
-//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
-//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
-//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
-//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
-//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
-//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
+def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">;
+def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
+def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
+  let simm16 = 0; // no operands are declared, so the simm16 field is fixed to zero
+}
+def S_INCPERFLEVEL : SOPP <0x00000014, (ins i16imm:$simm16), "s_incperflevel $simm16">;
+def S_DECPERFLEVEL : SOPP <0x00000015, (ins i16imm:$simm16), "s_decperflevel $simm16">;
+def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
+  let simm16 = 0;
+}
 } // End hasSideEffects
 
 //===----------------------------------------------------------------------===//
@@ -463,256 +476,256 @@
 
 let isCompare = 1 in {
 
-defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">;
-defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_OLT>;
-defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_OEQ>;
-defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_OLE>;
-defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_OGT>;
-defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32">;
-defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_OGE>;
-defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", f32, COND_O>;
-defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", f32, COND_UO>;
-defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">;
-defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">;
-defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">;
-defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">;
-defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>;
-defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
-defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;
+defm V_CMP_F_F32 : VOPC_F32 <vopc<0x0>, "v_cmp_f_f32">;
+defm V_CMP_LT_F32 : VOPC_F32 <vopc<0x1>, "v_cmp_lt_f32", COND_OLT>;
+defm V_CMP_EQ_F32 : VOPC_F32 <vopc<0x2>, "v_cmp_eq_f32", COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_F32 <vopc<0x3>, "v_cmp_le_f32", COND_OLE>;
+defm V_CMP_GT_F32 : VOPC_F32 <vopc<0x4>, "v_cmp_gt_f32", COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_F32 <vopc<0x5>, "v_cmp_lg_f32">;
+defm V_CMP_GE_F32 : VOPC_F32 <vopc<0x6>, "v_cmp_ge_f32", COND_OGE>;
+defm V_CMP_O_F32 : VOPC_F32 <vopc<0x7>, "v_cmp_o_f32", COND_O>;
+defm V_CMP_U_F32 : VOPC_F32 <vopc<0x8>, "v_cmp_u_f32", COND_UO>;
+defm V_CMP_NGE_F32 : VOPC_F32 <vopc<0x9>, "v_cmp_nge_f32">;
+defm V_CMP_NLG_F32 : VOPC_F32 <vopc<0xa>, "v_cmp_nlg_f32">;
+defm V_CMP_NGT_F32 : VOPC_F32 <vopc<0xb>, "v_cmp_ngt_f32">;
+defm V_CMP_NLE_F32 : VOPC_F32 <vopc<0xc>, "v_cmp_nle_f32">;
+defm V_CMP_NEQ_F32 : VOPC_F32 <vopc<0xd>, "v_cmp_neq_f32", COND_UNE>;
+defm V_CMP_NLT_F32 : VOPC_F32 <vopc<0xe>, "v_cmp_nlt_f32">;
+defm V_CMP_TRU_F32 : VOPC_F32 <vopc<0xf>, "v_cmp_tru_f32">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_F32 : VOPCX_32 <0x00000010, "V_CMPX_F_F32">;
-defm V_CMPX_LT_F32 : VOPCX_32 <0x00000011, "V_CMPX_LT_F32">;
-defm V_CMPX_EQ_F32 : VOPCX_32 <0x00000012, "V_CMPX_EQ_F32">;
-defm V_CMPX_LE_F32 : VOPCX_32 <0x00000013, "V_CMPX_LE_F32">;
-defm V_CMPX_GT_F32 : VOPCX_32 <0x00000014, "V_CMPX_GT_F32">;
-defm V_CMPX_LG_F32 : VOPCX_32 <0x00000015, "V_CMPX_LG_F32">;
-defm V_CMPX_GE_F32 : VOPCX_32 <0x00000016, "V_CMPX_GE_F32">;
-defm V_CMPX_O_F32 : VOPCX_32 <0x00000017, "V_CMPX_O_F32">;
-defm V_CMPX_U_F32 : VOPCX_32 <0x00000018, "V_CMPX_U_F32">;
-defm V_CMPX_NGE_F32 : VOPCX_32 <0x00000019, "V_CMPX_NGE_F32">;
-defm V_CMPX_NLG_F32 : VOPCX_32 <0x0000001a, "V_CMPX_NLG_F32">;
-defm V_CMPX_NGT_F32 : VOPCX_32 <0x0000001b, "V_CMPX_NGT_F32">;
-defm V_CMPX_NLE_F32 : VOPCX_32 <0x0000001c, "V_CMPX_NLE_F32">;
-defm V_CMPX_NEQ_F32 : VOPCX_32 <0x0000001d, "V_CMPX_NEQ_F32">;
-defm V_CMPX_NLT_F32 : VOPCX_32 <0x0000001e, "V_CMPX_NLT_F32">;
-defm V_CMPX_TRU_F32 : VOPCX_32 <0x0000001f, "V_CMPX_TRU_F32">;
+defm V_CMPX_F_F32 : VOPCX_F32 <vopc<0x10>, "v_cmpx_f_f32">;
+defm V_CMPX_LT_F32 : VOPCX_F32 <vopc<0x11>, "v_cmpx_lt_f32">;
+defm V_CMPX_EQ_F32 : VOPCX_F32 <vopc<0x12>, "v_cmpx_eq_f32">;
+defm V_CMPX_LE_F32 : VOPCX_F32 <vopc<0x13>, "v_cmpx_le_f32">;
+defm V_CMPX_GT_F32 : VOPCX_F32 <vopc<0x14>, "v_cmpx_gt_f32">;
+defm V_CMPX_LG_F32 : VOPCX_F32 <vopc<0x15>, "v_cmpx_lg_f32">;
+defm V_CMPX_GE_F32 : VOPCX_F32 <vopc<0x16>, "v_cmpx_ge_f32">;
+defm V_CMPX_O_F32 : VOPCX_F32 <vopc<0x17>, "v_cmpx_o_f32">;
+defm V_CMPX_U_F32 : VOPCX_F32 <vopc<0x18>, "v_cmpx_u_f32">;
+defm V_CMPX_NGE_F32 : VOPCX_F32 <vopc<0x19>, "v_cmpx_nge_f32">;
+defm V_CMPX_NLG_F32 : VOPCX_F32 <vopc<0x1a>, "v_cmpx_nlg_f32">;
+defm V_CMPX_NGT_F32 : VOPCX_F32 <vopc<0x1b>, "v_cmpx_ngt_f32">;
+defm V_CMPX_NLE_F32 : VOPCX_F32 <vopc<0x1c>, "v_cmpx_nle_f32">;
+defm V_CMPX_NEQ_F32 : VOPCX_F32 <vopc<0x1d>, "v_cmpx_neq_f32">;
+defm V_CMPX_NLT_F32 : VOPCX_F32 <vopc<0x1e>, "v_cmpx_nlt_f32">;
+defm V_CMPX_TRU_F32 : VOPCX_F32 <vopc<0x1f>, "v_cmpx_tru_f32">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
-defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>;
-defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_OEQ>;
-defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_OLE>;
-defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_OGT>;
-defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">;
-defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_OGE>;
-defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", f64, COND_O>;
-defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", f64, COND_UO>;
-defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">;
-defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">;
-defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">;
-defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">;
-defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>;
-defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">;
-defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">;
+defm V_CMP_F_F64 : VOPC_F64 <vopc<0x20>, "v_cmp_f_f64">;
+defm V_CMP_LT_F64 : VOPC_F64 <vopc<0x21>, "v_cmp_lt_f64", COND_OLT>;
+defm V_CMP_EQ_F64 : VOPC_F64 <vopc<0x22>, "v_cmp_eq_f64", COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_F64 <vopc<0x23>, "v_cmp_le_f64", COND_OLE>;
+defm V_CMP_GT_F64 : VOPC_F64 <vopc<0x24>, "v_cmp_gt_f64", COND_OGT>;
+defm V_CMP_LG_F64 : VOPC_F64 <vopc<0x25>, "v_cmp_lg_f64">;
+defm V_CMP_GE_F64 : VOPC_F64 <vopc<0x26>, "v_cmp_ge_f64", COND_OGE>;
+defm V_CMP_O_F64 : VOPC_F64 <vopc<0x27>, "v_cmp_o_f64", COND_O>;
+defm V_CMP_U_F64 : VOPC_F64 <vopc<0x28>, "v_cmp_u_f64", COND_UO>;
+defm V_CMP_NGE_F64 : VOPC_F64 <vopc<0x29>, "v_cmp_nge_f64">;
+defm V_CMP_NLG_F64 : VOPC_F64 <vopc<0x2a>, "v_cmp_nlg_f64">;
+defm V_CMP_NGT_F64 : VOPC_F64 <vopc<0x2b>, "v_cmp_ngt_f64">;
+defm V_CMP_NLE_F64 : VOPC_F64 <vopc<0x2c>, "v_cmp_nle_f64">;
+defm V_CMP_NEQ_F64 : VOPC_F64 <vopc<0x2d>, "v_cmp_neq_f64", COND_UNE>;
+defm V_CMP_NLT_F64 : VOPC_F64 <vopc<0x2e>, "v_cmp_nlt_f64">;
+defm V_CMP_TRU_F64 : VOPC_F64 <vopc<0x2f>, "v_cmp_tru_f64">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_F64 : VOPCX_64 <0x00000030, "V_CMPX_F_F64">;
-defm V_CMPX_LT_F64 : VOPCX_64 <0x00000031, "V_CMPX_LT_F64">;
-defm V_CMPX_EQ_F64 : VOPCX_64 <0x00000032, "V_CMPX_EQ_F64">;
-defm V_CMPX_LE_F64 : VOPCX_64 <0x00000033, "V_CMPX_LE_F64">;
-defm V_CMPX_GT_F64 : VOPCX_64 <0x00000034, "V_CMPX_GT_F64">;
-defm V_CMPX_LG_F64 : VOPCX_64 <0x00000035, "V_CMPX_LG_F64">;
-defm V_CMPX_GE_F64 : VOPCX_64 <0x00000036, "V_CMPX_GE_F64">;
-defm V_CMPX_O_F64 : VOPCX_64 <0x00000037, "V_CMPX_O_F64">;
-defm V_CMPX_U_F64 : VOPCX_64 <0x00000038, "V_CMPX_U_F64">;
-defm V_CMPX_NGE_F64 : VOPCX_64 <0x00000039, "V_CMPX_NGE_F64">;
-defm V_CMPX_NLG_F64 : VOPCX_64 <0x0000003a, "V_CMPX_NLG_F64">;
-defm V_CMPX_NGT_F64 : VOPCX_64 <0x0000003b, "V_CMPX_NGT_F64">;
-defm V_CMPX_NLE_F64 : VOPCX_64 <0x0000003c, "V_CMPX_NLE_F64">;
-defm V_CMPX_NEQ_F64 : VOPCX_64 <0x0000003d, "V_CMPX_NEQ_F64">;
-defm V_CMPX_NLT_F64 : VOPCX_64 <0x0000003e, "V_CMPX_NLT_F64">;
-defm V_CMPX_TRU_F64 : VOPCX_64 <0x0000003f, "V_CMPX_TRU_F64">;
+defm V_CMPX_F_F64 : VOPCX_F64 <vopc<0x30>, "v_cmpx_f_f64">;
+defm V_CMPX_LT_F64 : VOPCX_F64 <vopc<0x31>, "v_cmpx_lt_f64">;
+defm V_CMPX_EQ_F64 : VOPCX_F64 <vopc<0x32>, "v_cmpx_eq_f64">;
+defm V_CMPX_LE_F64 : VOPCX_F64 <vopc<0x33>, "v_cmpx_le_f64">;
+defm V_CMPX_GT_F64 : VOPCX_F64 <vopc<0x34>, "v_cmpx_gt_f64">;
+defm V_CMPX_LG_F64 : VOPCX_F64 <vopc<0x35>, "v_cmpx_lg_f64">;
+defm V_CMPX_GE_F64 : VOPCX_F64 <vopc<0x36>, "v_cmpx_ge_f64">;
+defm V_CMPX_O_F64 : VOPCX_F64 <vopc<0x37>, "v_cmpx_o_f64">;
+defm V_CMPX_U_F64 : VOPCX_F64 <vopc<0x38>, "v_cmpx_u_f64">;
+defm V_CMPX_NGE_F64 : VOPCX_F64 <vopc<0x39>, "v_cmpx_nge_f64">;
+defm V_CMPX_NLG_F64 : VOPCX_F64 <vopc<0x3a>, "v_cmpx_nlg_f64">;
+defm V_CMPX_NGT_F64 : VOPCX_F64 <vopc<0x3b>, "v_cmpx_ngt_f64">;
+defm V_CMPX_NLE_F64 : VOPCX_F64 <vopc<0x3c>, "v_cmpx_nle_f64">;
+defm V_CMPX_NEQ_F64 : VOPCX_F64 <vopc<0x3d>, "v_cmpx_neq_f64">;
+defm V_CMPX_NLT_F64 : VOPCX_F64 <vopc<0x3e>, "v_cmpx_nlt_f64">;
+defm V_CMPX_TRU_F64 : VOPCX_F64 <vopc<0x3f>, "v_cmpx_tru_f64">;
 
 } // End hasSideEffects = 1
 
-defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">;
-defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">;
-defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32">;
-defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32">;
-defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32">;
-defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32">;
-defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32">;
-defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32">;
-defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32">;
-defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32">;
-defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32">;
-defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32">;
-defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32">;
-defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">;
-defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">;
-defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">;
+defm V_CMPS_F_F32 : VOPC_F32 <vopc<0x40>, "v_cmps_f_f32">;
+defm V_CMPS_LT_F32 : VOPC_F32 <vopc<0x41>, "v_cmps_lt_f32">;
+defm V_CMPS_EQ_F32 : VOPC_F32 <vopc<0x42>, "v_cmps_eq_f32">;
+defm V_CMPS_LE_F32 : VOPC_F32 <vopc<0x43>, "v_cmps_le_f32">;
+defm V_CMPS_GT_F32 : VOPC_F32 <vopc<0x44>, "v_cmps_gt_f32">;
+defm V_CMPS_LG_F32 : VOPC_F32 <vopc<0x45>, "v_cmps_lg_f32">;
+defm V_CMPS_GE_F32 : VOPC_F32 <vopc<0x46>, "v_cmps_ge_f32">;
+defm V_CMPS_O_F32 : VOPC_F32 <vopc<0x47>, "v_cmps_o_f32">;
+defm V_CMPS_U_F32 : VOPC_F32 <vopc<0x48>, "v_cmps_u_f32">;
+defm V_CMPS_NGE_F32 : VOPC_F32 <vopc<0x49>, "v_cmps_nge_f32">;
+defm V_CMPS_NLG_F32 : VOPC_F32 <vopc<0x4a>, "v_cmps_nlg_f32">;
+defm V_CMPS_NGT_F32 : VOPC_F32 <vopc<0x4b>, "v_cmps_ngt_f32">;
+defm V_CMPS_NLE_F32 : VOPC_F32 <vopc<0x4c>, "v_cmps_nle_f32">;
+defm V_CMPS_NEQ_F32 : VOPC_F32 <vopc<0x4d>, "v_cmps_neq_f32">;
+defm V_CMPS_NLT_F32 : VOPC_F32 <vopc<0x4e>, "v_cmps_nlt_f32">;
+defm V_CMPS_TRU_F32 : VOPC_F32 <vopc<0x4f>, "v_cmps_tru_f32">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPSX_F_F32 : VOPCX_32 <0x00000050, "V_CMPSX_F_F32">;
-defm V_CMPSX_LT_F32 : VOPCX_32 <0x00000051, "V_CMPSX_LT_F32">;
-defm V_CMPSX_EQ_F32 : VOPCX_32 <0x00000052, "V_CMPSX_EQ_F32">;
-defm V_CMPSX_LE_F32 : VOPCX_32 <0x00000053, "V_CMPSX_LE_F32">;
-defm V_CMPSX_GT_F32 : VOPCX_32 <0x00000054, "V_CMPSX_GT_F32">;
-defm V_CMPSX_LG_F32 : VOPCX_32 <0x00000055, "V_CMPSX_LG_F32">;
-defm V_CMPSX_GE_F32 : VOPCX_32 <0x00000056, "V_CMPSX_GE_F32">;
-defm V_CMPSX_O_F32 : VOPCX_32 <0x00000057, "V_CMPSX_O_F32">;
-defm V_CMPSX_U_F32 : VOPCX_32 <0x00000058, "V_CMPSX_U_F32">;
-defm V_CMPSX_NGE_F32 : VOPCX_32 <0x00000059, "V_CMPSX_NGE_F32">;
-defm V_CMPSX_NLG_F32 : VOPCX_32 <0x0000005a, "V_CMPSX_NLG_F32">;
-defm V_CMPSX_NGT_F32 : VOPCX_32 <0x0000005b, "V_CMPSX_NGT_F32">;
-defm V_CMPSX_NLE_F32 : VOPCX_32 <0x0000005c, "V_CMPSX_NLE_F32">;
-defm V_CMPSX_NEQ_F32 : VOPCX_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
-defm V_CMPSX_NLT_F32 : VOPCX_32 <0x0000005e, "V_CMPSX_NLT_F32">;
-defm V_CMPSX_TRU_F32 : VOPCX_32 <0x0000005f, "V_CMPSX_TRU_F32">;
+defm V_CMPSX_F_F32 : VOPCX_F32 <vopc<0x50>, "v_cmpsx_f_f32">;
+defm V_CMPSX_LT_F32 : VOPCX_F32 <vopc<0x51>, "v_cmpsx_lt_f32">;
+defm V_CMPSX_EQ_F32 : VOPCX_F32 <vopc<0x52>, "v_cmpsx_eq_f32">;
+defm V_CMPSX_LE_F32 : VOPCX_F32 <vopc<0x53>, "v_cmpsx_le_f32">;
+defm V_CMPSX_GT_F32 : VOPCX_F32 <vopc<0x54>, "v_cmpsx_gt_f32">;
+defm V_CMPSX_LG_F32 : VOPCX_F32 <vopc<0x55>, "v_cmpsx_lg_f32">;
+defm V_CMPSX_GE_F32 : VOPCX_F32 <vopc<0x56>, "v_cmpsx_ge_f32">;
+defm V_CMPSX_O_F32 : VOPCX_F32 <vopc<0x57>, "v_cmpsx_o_f32">;
+defm V_CMPSX_U_F32 : VOPCX_F32 <vopc<0x58>, "v_cmpsx_u_f32">;
+defm V_CMPSX_NGE_F32 : VOPCX_F32 <vopc<0x59>, "v_cmpsx_nge_f32">;
+defm V_CMPSX_NLG_F32 : VOPCX_F32 <vopc<0x5a>, "v_cmpsx_nlg_f32">;
+defm V_CMPSX_NGT_F32 : VOPCX_F32 <vopc<0x5b>, "v_cmpsx_ngt_f32">;
+defm V_CMPSX_NLE_F32 : VOPCX_F32 <vopc<0x5c>, "v_cmpsx_nle_f32">;
+defm V_CMPSX_NEQ_F32 : VOPCX_F32 <vopc<0x5d>, "v_cmpsx_neq_f32">;
+defm V_CMPSX_NLT_F32 : VOPCX_F32 <vopc<0x5e>, "v_cmpsx_nlt_f32">;
+defm V_CMPSX_TRU_F32 : VOPCX_F32 <vopc<0x5f>, "v_cmpsx_tru_f32">;
 
 } // End hasSideEffects = 1
 
-defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">;
-defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">;
-defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64">;
-defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64">;
-defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64">;
-defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64">;
-defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64">;
-defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64">;
-defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64">;
-defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64">;
-defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64">;
-defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64">;
-defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64">;
-defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64">;
-defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64">;
-defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64">;
+defm V_CMPS_F_F64 : VOPC_F64 <vopc<0x60>, "v_cmps_f_f64">;
+defm V_CMPS_LT_F64 : VOPC_F64 <vopc<0x61>, "v_cmps_lt_f64">;
+defm V_CMPS_EQ_F64 : VOPC_F64 <vopc<0x62>, "v_cmps_eq_f64">;
+defm V_CMPS_LE_F64 : VOPC_F64 <vopc<0x63>, "v_cmps_le_f64">;
+defm V_CMPS_GT_F64 : VOPC_F64 <vopc<0x64>, "v_cmps_gt_f64">;
+defm V_CMPS_LG_F64 : VOPC_F64 <vopc<0x65>, "v_cmps_lg_f64">;
+defm V_CMPS_GE_F64 : VOPC_F64 <vopc<0x66>, "v_cmps_ge_f64">;
+defm V_CMPS_O_F64 : VOPC_F64 <vopc<0x67>, "v_cmps_o_f64">;
+defm V_CMPS_U_F64 : VOPC_F64 <vopc<0x68>, "v_cmps_u_f64">;
+defm V_CMPS_NGE_F64 : VOPC_F64 <vopc<0x69>, "v_cmps_nge_f64">;
+defm V_CMPS_NLG_F64 : VOPC_F64 <vopc<0x6a>, "v_cmps_nlg_f64">;
+defm V_CMPS_NGT_F64 : VOPC_F64 <vopc<0x6b>, "v_cmps_ngt_f64">;
+defm V_CMPS_NLE_F64 : VOPC_F64 <vopc<0x6c>, "v_cmps_nle_f64">;
+defm V_CMPS_NEQ_F64 : VOPC_F64 <vopc<0x6d>, "v_cmps_neq_f64">;
+defm V_CMPS_NLT_F64 : VOPC_F64 <vopc<0x6e>, "v_cmps_nlt_f64">;
+defm V_CMPS_TRU_F64 : VOPC_F64 <vopc<0x6f>, "v_cmps_tru_f64">;
 
 let hasSideEffects = 1, Defs = [EXEC] in {
 
-defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64">;
-defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64">;
-defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64">;
-defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64">;
-defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64">;
-defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64">;
-defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64">;
-defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64">;
-defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64">;
-defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64">;
-defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64">;
-defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64">;
-defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64">;
-defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64">;
-defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64">;
-defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">;
+defm V_CMPSX_F_F64 : VOPC_F64 <vopc<0x70>, "v_cmpsx_f_f64">;
+defm V_CMPSX_LT_F64 : VOPC_F64 <vopc<0x71>, "v_cmpsx_lt_f64">;
+defm V_CMPSX_EQ_F64 : VOPC_F64 <vopc<0x72>, "v_cmpsx_eq_f64">;
+defm V_CMPSX_LE_F64 : VOPC_F64 <vopc<0x73>, "v_cmpsx_le_f64">;
+defm V_CMPSX_GT_F64 : VOPC_F64 <vopc<0x74>, "v_cmpsx_gt_f64">;
+defm V_CMPSX_LG_F64 : VOPC_F64 <vopc<0x75>, "v_cmpsx_lg_f64">;
+defm V_CMPSX_GE_F64 : VOPC_F64 <vopc<0x76>, "v_cmpsx_ge_f64">;
+defm V_CMPSX_O_F64 : VOPC_F64 <vopc<0x77>, "v_cmpsx_o_f64">;
+defm V_CMPSX_U_F64 : VOPC_F64 <vopc<0x78>, "v_cmpsx_u_f64">;
+defm V_CMPSX_NGE_F64 : VOPC_F64 <vopc<0x79>, "v_cmpsx_nge_f64">;
+defm V_CMPSX_NLG_F64 : VOPC_F64 <vopc<0x7a>, "v_cmpsx_nlg_f64">;
+defm V_CMPSX_NGT_F64 : VOPC_F64 <vopc<0x7b>, "v_cmpsx_ngt_f64">;
+defm V_CMPSX_NLE_F64 : VOPC_F64 <vopc<0x7c>, "v_cmpsx_nle_f64">;
+defm V_CMPSX_NEQ_F64 : VOPC_F64 <vopc<0x7d>, "v_cmpsx_neq_f64">;
+defm V_CMPSX_NLT_F64 : VOPC_F64 <vopc<0x7e>, "v_cmpsx_nlt_f64">;
+defm V_CMPSX_TRU_F64 : VOPC_F64 <vopc<0x7f>, "v_cmpsx_tru_f64">;
 
 } // End hasSideEffects = 1, Defs = [EXEC]
 
-defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">;
-defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_SLT>;
-defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_SLE>;
-defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_SGT>;
-defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
-defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>;
-defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;
+defm V_CMP_F_I32 : VOPC_I32 <vopc<0x80>, "v_cmp_f_i32">;
+defm V_CMP_LT_I32 : VOPC_I32 <vopc<0x81>, "v_cmp_lt_i32", COND_SLT>;
+defm V_CMP_EQ_I32 : VOPC_I32 <vopc<0x82>, "v_cmp_eq_i32", COND_EQ>;
+defm V_CMP_LE_I32 : VOPC_I32 <vopc<0x83>, "v_cmp_le_i32", COND_SLE>;
+defm V_CMP_GT_I32 : VOPC_I32 <vopc<0x84>, "v_cmp_gt_i32", COND_SGT>;
+defm V_CMP_NE_I32 : VOPC_I32 <vopc<0x85>, "v_cmp_ne_i32", COND_NE>;
+defm V_CMP_GE_I32 : VOPC_I32 <vopc<0x86>, "v_cmp_ge_i32", COND_SGE>;
+defm V_CMP_T_I32 : VOPC_I32 <vopc<0x87>, "v_cmp_t_i32">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_I32 : VOPCX_32 <0x00000090, "V_CMPX_F_I32">;
-defm V_CMPX_LT_I32 : VOPCX_32 <0x00000091, "V_CMPX_LT_I32">;
-defm V_CMPX_EQ_I32 : VOPCX_32 <0x00000092, "V_CMPX_EQ_I32">;
-defm V_CMPX_LE_I32 : VOPCX_32 <0x00000093, "V_CMPX_LE_I32">;
-defm V_CMPX_GT_I32 : VOPCX_32 <0x00000094, "V_CMPX_GT_I32">;
-defm V_CMPX_NE_I32 : VOPCX_32 <0x00000095, "V_CMPX_NE_I32">;
-defm V_CMPX_GE_I32 : VOPCX_32 <0x00000096, "V_CMPX_GE_I32">;
-defm V_CMPX_T_I32 : VOPCX_32 <0x00000097, "V_CMPX_T_I32">;
+defm V_CMPX_F_I32 : VOPCX_I32 <vopc<0x90>, "v_cmpx_f_i32">;
+defm V_CMPX_LT_I32 : VOPCX_I32 <vopc<0x91>, "v_cmpx_lt_i32">;
+defm V_CMPX_EQ_I32 : VOPCX_I32 <vopc<0x92>, "v_cmpx_eq_i32">;
+defm V_CMPX_LE_I32 : VOPCX_I32 <vopc<0x93>, "v_cmpx_le_i32">;
+defm V_CMPX_GT_I32 : VOPCX_I32 <vopc<0x94>, "v_cmpx_gt_i32">;
+defm V_CMPX_NE_I32 : VOPCX_I32 <vopc<0x95>, "v_cmpx_ne_i32">;
+defm V_CMPX_GE_I32 : VOPCX_I32 <vopc<0x96>, "v_cmpx_ge_i32">;
+defm V_CMPX_T_I32 : VOPCX_I32 <vopc<0x97>, "v_cmpx_t_i32">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
-defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>;
-defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", i64, COND_EQ>;
-defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", i64, COND_SLE>;
-defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", i64, COND_SGT>;
-defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>;
-defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>;
-defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;
+defm V_CMP_F_I64 : VOPC_I64 <vopc<0xa0>, "v_cmp_f_i64">;
+defm V_CMP_LT_I64 : VOPC_I64 <vopc<0xa1>, "v_cmp_lt_i64", COND_SLT>;
+defm V_CMP_EQ_I64 : VOPC_I64 <vopc<0xa2>, "v_cmp_eq_i64", COND_EQ>;
+defm V_CMP_LE_I64 : VOPC_I64 <vopc<0xa3>, "v_cmp_le_i64", COND_SLE>;
+defm V_CMP_GT_I64 : VOPC_I64 <vopc<0xa4>, "v_cmp_gt_i64", COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_I64 <vopc<0xa5>, "v_cmp_ne_i64", COND_NE>;
+defm V_CMP_GE_I64 : VOPC_I64 <vopc<0xa6>, "v_cmp_ge_i64", COND_SGE>;
+defm V_CMP_T_I64 : VOPC_I64 <vopc<0xa7>, "v_cmp_t_i64">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_I64 : VOPCX_64 <0x000000b0, "V_CMPX_F_I64">;
-defm V_CMPX_LT_I64 : VOPCX_64 <0x000000b1, "V_CMPX_LT_I64">;
-defm V_CMPX_EQ_I64 : VOPCX_64 <0x000000b2, "V_CMPX_EQ_I64">;
-defm V_CMPX_LE_I64 : VOPCX_64 <0x000000b3, "V_CMPX_LE_I64">;
-defm V_CMPX_GT_I64 : VOPCX_64 <0x000000b4, "V_CMPX_GT_I64">;
-defm V_CMPX_NE_I64 : VOPCX_64 <0x000000b5, "V_CMPX_NE_I64">;
-defm V_CMPX_GE_I64 : VOPCX_64 <0x000000b6, "V_CMPX_GE_I64">;
-defm V_CMPX_T_I64 : VOPCX_64 <0x000000b7, "V_CMPX_T_I64">;
+defm V_CMPX_F_I64 : VOPCX_I64 <vopc<0xb0>, "v_cmpx_f_i64">;
+defm V_CMPX_LT_I64 : VOPCX_I64 <vopc<0xb1>, "v_cmpx_lt_i64">;
+defm V_CMPX_EQ_I64 : VOPCX_I64 <vopc<0xb2>, "v_cmpx_eq_i64">;
+defm V_CMPX_LE_I64 : VOPCX_I64 <vopc<0xb3>, "v_cmpx_le_i64">;
+defm V_CMPX_GT_I64 : VOPCX_I64 <vopc<0xb4>, "v_cmpx_gt_i64">;
+defm V_CMPX_NE_I64 : VOPCX_I64 <vopc<0xb5>, "v_cmpx_ne_i64">;
+defm V_CMPX_GE_I64 : VOPCX_I64 <vopc<0xb6>, "v_cmpx_ge_i64">;
+defm V_CMPX_T_I64 : VOPCX_I64 <vopc<0xb7>, "v_cmpx_t_i64">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
-defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>;
-defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", i32, COND_EQ>;
-defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", i32, COND_ULE>;
-defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", i32, COND_UGT>;
-defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>;
-defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>;
-defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;
+defm V_CMP_F_U32 : VOPC_I32 <vopc<0xc0>, "v_cmp_f_u32">;
+defm V_CMP_LT_U32 : VOPC_I32 <vopc<0xc1>, "v_cmp_lt_u32", COND_ULT>;
+defm V_CMP_EQ_U32 : VOPC_I32 <vopc<0xc2>, "v_cmp_eq_u32", COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_I32 <vopc<0xc3>, "v_cmp_le_u32", COND_ULE>;
+defm V_CMP_GT_U32 : VOPC_I32 <vopc<0xc4>, "v_cmp_gt_u32", COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_I32 <vopc<0xc5>, "v_cmp_ne_u32", COND_NE>;
+defm V_CMP_GE_U32 : VOPC_I32 <vopc<0xc6>, "v_cmp_ge_u32", COND_UGE>;
+defm V_CMP_T_U32 : VOPC_I32 <vopc<0xc7>, "v_cmp_t_u32">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_U32 : VOPCX_32 <0x000000d0, "V_CMPX_F_U32">;
-defm V_CMPX_LT_U32 : VOPCX_32 <0x000000d1, "V_CMPX_LT_U32">;
-defm V_CMPX_EQ_U32 : VOPCX_32 <0x000000d2, "V_CMPX_EQ_U32">;
-defm V_CMPX_LE_U32 : VOPCX_32 <0x000000d3, "V_CMPX_LE_U32">;
-defm V_CMPX_GT_U32 : VOPCX_32 <0x000000d4, "V_CMPX_GT_U32">;
-defm V_CMPX_NE_U32 : VOPCX_32 <0x000000d5, "V_CMPX_NE_U32">;
-defm V_CMPX_GE_U32 : VOPCX_32 <0x000000d6, "V_CMPX_GE_U32">;
-defm V_CMPX_T_U32 : VOPCX_32 <0x000000d7, "V_CMPX_T_U32">;
+defm V_CMPX_F_U32 : VOPCX_I32 <vopc<0xd0>, "v_cmpx_f_u32">;
+defm V_CMPX_LT_U32 : VOPCX_I32 <vopc<0xd1>, "v_cmpx_lt_u32">;
+defm V_CMPX_EQ_U32 : VOPCX_I32 <vopc<0xd2>, "v_cmpx_eq_u32">;
+defm V_CMPX_LE_U32 : VOPCX_I32 <vopc<0xd3>, "v_cmpx_le_u32">;
+defm V_CMPX_GT_U32 : VOPCX_I32 <vopc<0xd4>, "v_cmpx_gt_u32">;
+defm V_CMPX_NE_U32 : VOPCX_I32 <vopc<0xd5>, "v_cmpx_ne_u32">;
+defm V_CMPX_GE_U32 : VOPCX_I32 <vopc<0xd6>, "v_cmpx_ge_u32">;
+defm V_CMPX_T_U32 : VOPCX_I32 <vopc<0xd7>, "v_cmpx_t_u32">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
-defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>;
-defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", i64, COND_EQ>;
-defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", i64, COND_ULE>;
-defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", i64, COND_UGT>;
-defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>;
-defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>;
-defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;
+defm V_CMP_F_U64 : VOPC_I64 <vopc<0xe0>, "v_cmp_f_u64">;
+defm V_CMP_LT_U64 : VOPC_I64 <vopc<0xe1>, "v_cmp_lt_u64", COND_ULT>;
+defm V_CMP_EQ_U64 : VOPC_I64 <vopc<0xe2>, "v_cmp_eq_u64", COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_I64 <vopc<0xe3>, "v_cmp_le_u64", COND_ULE>;
+defm V_CMP_GT_U64 : VOPC_I64 <vopc<0xe4>, "v_cmp_gt_u64", COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_I64 <vopc<0xe5>, "v_cmp_ne_u64", COND_NE>;
+defm V_CMP_GE_U64 : VOPC_I64 <vopc<0xe6>, "v_cmp_ge_u64", COND_UGE>;
+defm V_CMP_T_U64 : VOPC_I64 <vopc<0xe7>, "v_cmp_t_u64">;
 
 let hasSideEffects = 1 in {
 
-defm V_CMPX_F_U64 : VOPCX_64 <0x000000f0, "V_CMPX_F_U64">;
-defm V_CMPX_LT_U64 : VOPCX_64 <0x000000f1, "V_CMPX_LT_U64">;
-defm V_CMPX_EQ_U64 : VOPCX_64 <0x000000f2, "V_CMPX_EQ_U64">;
-defm V_CMPX_LE_U64 : VOPCX_64 <0x000000f3, "V_CMPX_LE_U64">;
-defm V_CMPX_GT_U64 : VOPCX_64 <0x000000f4, "V_CMPX_GT_U64">;
-defm V_CMPX_NE_U64 : VOPCX_64 <0x000000f5, "V_CMPX_NE_U64">;
-defm V_CMPX_GE_U64 : VOPCX_64 <0x000000f6, "V_CMPX_GE_U64">;
-defm V_CMPX_T_U64 : VOPCX_64 <0x000000f7, "V_CMPX_T_U64">;
+defm V_CMPX_F_U64 : VOPCX_I64 <vopc<0xf0>, "v_cmpx_f_u64">;
+defm V_CMPX_LT_U64 : VOPCX_I64 <vopc<0xf1>, "v_cmpx_lt_u64">;
+defm V_CMPX_EQ_U64 : VOPCX_I64 <vopc<0xf2>, "v_cmpx_eq_u64">;
+defm V_CMPX_LE_U64 : VOPCX_I64 <vopc<0xf3>, "v_cmpx_le_u64">;
+defm V_CMPX_GT_U64 : VOPCX_I64 <vopc<0xf4>, "v_cmpx_gt_u64">;
+defm V_CMPX_NE_U64 : VOPCX_I64 <vopc<0xf5>, "v_cmpx_ne_u64">;
+defm V_CMPX_GE_U64 : VOPCX_I64 <vopc<0xf6>, "v_cmpx_ge_u64">;
+defm V_CMPX_T_U64 : VOPCX_I64 <vopc<0xf7>, "v_cmpx_t_u64">;
 
 } // End hasSideEffects = 1
 
-defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">;
+defm V_CMP_CLASS_F32 : VOPC_F32 <vopc<0x88>, "v_cmp_class_f32">;
 
 let hasSideEffects = 1 in {
-defm V_CMPX_CLASS_F32 : VOPCX_32 <0x00000098, "V_CMPX_CLASS_F32">;
+defm V_CMPX_CLASS_F32 : VOPCX_F32 <vopc<0x98>, "v_cmpx_class_f32">;
 } // End hasSideEffects = 1
 
-defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">;
+defm V_CMP_CLASS_F64 : VOPC_F64 <vopc<0xa8>, "v_cmp_class_f64">;
 
 let hasSideEffects = 1 in {
-defm V_CMPX_CLASS_F64 : VOPCX_64 <0x000000b8, "V_CMPX_CLASS_F64">;
+defm V_CMPX_CLASS_F64 : VOPCX_F64 <vopc<0xb8>, "v_cmpx_class_f64">;
 } // End hasSideEffects = 1
 
 } // End isCompare = 1
@@ -722,88 +735,88 @@
 //===----------------------------------------------------------------------===//
 
 
-def DS_ADD_U32 : DS_1A1D_NORET <0x0, "DS_ADD_U32", VReg_32>;
-def DS_SUB_U32 : DS_1A1D_NORET <0x1, "DS_SUB_U32", VReg_32>;
-def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "DS_RSUB_U32", VReg_32>;
-def DS_INC_U32 : DS_1A1D_NORET <0x3, "DS_INC_U32", VReg_32>;
-def DS_DEC_U32 : DS_1A1D_NORET <0x4, "DS_DEC_U32", VReg_32>;
-def DS_MIN_I32 : DS_1A1D_NORET <0x5, "DS_MIN_I32", VReg_32>;
-def DS_MAX_I32 : DS_1A1D_NORET <0x6, "DS_MAX_I32", VReg_32>;
-def DS_MIN_U32 : DS_1A1D_NORET <0x7, "DS_MIN_U32", VReg_32>;
-def DS_MAX_U32 : DS_1A1D_NORET <0x8, "DS_MAX_U32", VReg_32>;
-def DS_AND_B32 : DS_1A1D_NORET <0x9, "DS_AND_B32", VReg_32>;
-def DS_OR_B32 : DS_1A1D_NORET <0xa, "DS_OR_B32", VReg_32>;
-def DS_XOR_B32 : DS_1A1D_NORET <0xb, "DS_XOR_B32", VReg_32>;
-def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "DS_MSKOR_B32", VReg_32>;
-def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "DS_CMPST_B32", VReg_32>;
-def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "DS_CMPST_F32", VReg_32>;
-def DS_MIN_F32 : DS_1A1D_NORET <0x12, "DS_MIN_F32", VReg_32>;
-def DS_MAX_F32 : DS_1A1D_NORET <0x13, "DS_MAX_F32", VReg_32>;
+def DS_ADD_U32 : DS_1A1D_NORET <0x0, "ds_add_u32", VReg_32>;
+def DS_SUB_U32 : DS_1A1D_NORET <0x1, "ds_sub_u32", VReg_32>;
+def DS_RSUB_U32 : DS_1A1D_NORET <0x2, "ds_rsub_u32", VReg_32>;
+def DS_INC_U32 : DS_1A1D_NORET <0x3, "ds_inc_u32", VReg_32>;
+def DS_DEC_U32 : DS_1A1D_NORET <0x4, "ds_dec_u32", VReg_32>;
+def DS_MIN_I32 : DS_1A1D_NORET <0x5, "ds_min_i32", VReg_32>;
+def DS_MAX_I32 : DS_1A1D_NORET <0x6, "ds_max_i32", VReg_32>;
+def DS_MIN_U32 : DS_1A1D_NORET <0x7, "ds_min_u32", VReg_32>;
+def DS_MAX_U32 : DS_1A1D_NORET <0x8, "ds_max_u32", VReg_32>;
+def DS_AND_B32 : DS_1A1D_NORET <0x9, "ds_and_b32", VReg_32>;
+def DS_OR_B32 : DS_1A1D_NORET <0xa, "ds_or_b32", VReg_32>;
+def DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VReg_32>;
+def DS_MSKOR_B32 : DS_1A1D_NORET <0xc, "ds_mskor_b32", VReg_32>;
+def DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VReg_32>;
+def DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VReg_32>;
+def DS_MIN_F32 : DS_1A1D_NORET <0x12, "ds_min_f32", VReg_32>;
+def DS_MAX_F32 : DS_1A1D_NORET <0x13, "ds_max_f32", VReg_32>;
 
-def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "DS_ADD_RTN_U32", VReg_32>;
-def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "DS_SUB_RTN_U32", VReg_32>;
-def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "DS_RSUB_RTN_U32", VReg_32>;
-def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "DS_INC_RTN_U32", VReg_32>;
-def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "DS_DEC_RTN_U32", VReg_32>;
-def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "DS_MIN_RTN_I32", VReg_32>;
-def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "DS_MAX_RTN_I32", VReg_32>;
-def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "DS_MIN_RTN_U32", VReg_32>;
-def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "DS_MAX_RTN_U32", VReg_32>;
-def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "DS_AND_RTN_B32", VReg_32>;
-def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "DS_OR_RTN_B32", VReg_32>;
-def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "DS_XOR_RTN_B32", VReg_32>;
-def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "DS_MSKOR_RTN_B32", VReg_32>;
-def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "DS_WRXCHG_RTN_B32", VReg_32>;
-//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "DS_WRXCHG2_RTN_B32", VReg_32>;
-//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "DS_WRXCHG2_RTN_B32", VReg_32>;
-def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "DS_CMPST_RTN_B32", VReg_32>;
-def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "DS_CMPST_RTN_F32", VReg_32>;
-def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "DS_MIN_RTN_F32", VReg_32>;
-def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "DS_MAX_RTN_F32", VReg_32>;
+def DS_ADD_RTN_U32 : DS_1A1D_RET <0x20, "ds_add_rtn_u32", VReg_32, "ds_add_u32">;
+def DS_SUB_RTN_U32 : DS_1A1D_RET <0x21, "ds_sub_rtn_u32", VReg_32, "ds_sub_u32">;
+def DS_RSUB_RTN_U32 : DS_1A1D_RET <0x22, "ds_rsub_rtn_u32", VReg_32, "ds_rsub_u32">;
+def DS_INC_RTN_U32 : DS_1A1D_RET <0x23, "ds_inc_rtn_u32", VReg_32, "ds_inc_u32">;
+def DS_DEC_RTN_U32 : DS_1A1D_RET <0x24, "ds_dec_rtn_u32", VReg_32, "ds_dec_u32">;
+def DS_MIN_RTN_I32 : DS_1A1D_RET <0x25, "ds_min_rtn_i32", VReg_32, "ds_min_i32">;
+def DS_MAX_RTN_I32 : DS_1A1D_RET <0x26, "ds_max_rtn_i32", VReg_32, "ds_max_i32">;
+def DS_MIN_RTN_U32 : DS_1A1D_RET <0x27, "ds_min_rtn_u32", VReg_32, "ds_min_u32">;
+def DS_MAX_RTN_U32 : DS_1A1D_RET <0x28, "ds_max_rtn_u32", VReg_32, "ds_max_u32">;
+def DS_AND_RTN_B32 : DS_1A1D_RET <0x29, "ds_and_rtn_b32", VReg_32, "ds_and_b32">;
+def DS_OR_RTN_B32 : DS_1A1D_RET <0x2a, "ds_or_rtn_b32", VReg_32, "ds_or_b32">;
+def DS_XOR_RTN_B32 : DS_1A1D_RET <0x2b, "ds_xor_rtn_b32", VReg_32, "ds_xor_b32">;
+def DS_MSKOR_RTN_B32 : DS_1A1D_RET <0x2c, "ds_mskor_rtn_b32", VReg_32, "ds_mskor_b32">;
+def DS_WRXCHG_RTN_B32 : DS_1A1D_RET <0x2d, "ds_wrxchg_rtn_b32", VReg_32>;
+//def DS_WRXCHG2_RTN_B32 : DS_2A0D_RET <0x2e, "ds_wrxchg2_rtn_b32", VReg_32, "ds_wrxchg2_b32">;
+//def DS_WRXCHG2ST64_RTN_B32 : DS_2A0D_RET <0x2f, "ds_wrxchg2st64_rtn_b32", VReg_32, "ds_wrxchg2st64_b32">;
+def DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VReg_32, "ds_cmpst_b32">;
+def DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VReg_32, "ds_cmpst_f32">;
+def DS_MIN_RTN_F32 : DS_1A1D_RET <0x32, "ds_min_rtn_f32", VReg_32, "ds_min_f32">;
+def DS_MAX_RTN_F32 : DS_1A1D_RET <0x33, "ds_max_rtn_f32", VReg_32, "ds_max_f32">;
 
 let SubtargetPredicate = isCI in {
-def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "DS_WRAP_RTN_F32", VReg_32>;
+def DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VReg_32, "ds_wrap_f32">;
 } // End isCI
 
 
-def DS_ADD_U64 : DS_1A1D_NORET <0x40, "DS_ADD_U64", VReg_32>;
-def DS_SUB_U64 : DS_1A1D_NORET <0x41, "DS_SUB_U64", VReg_32>;
-def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "DS_RSUB_U64", VReg_32>;
-def DS_INC_U64 : DS_1A1D_NORET <0x43, "DS_INC_U64", VReg_32>;
-def DS_DEC_U64 : DS_1A1D_NORET <0x44, "DS_DEC_U64", VReg_32>;
-def DS_MIN_I64 : DS_1A1D_NORET <0x45, "DS_MIN_I64", VReg_64>;
-def DS_MAX_I64 : DS_1A1D_NORET <0x46, "DS_MAX_I64", VReg_64>;
-def DS_MIN_U64 : DS_1A1D_NORET <0x47, "DS_MIN_U64", VReg_64>;
-def DS_MAX_U64 : DS_1A1D_NORET <0x48, "DS_MAX_U64", VReg_64>;
-def DS_AND_B64 : DS_1A1D_NORET <0x49, "DS_AND_B64", VReg_64>;
-def DS_OR_B64 : DS_1A1D_NORET <0x4a, "DS_OR_B64", VReg_64>;
-def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "DS_XOR_B64", VReg_64>;
-def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "DS_MSKOR_B64", VReg_64>;
-def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "DS_CMPST_B64", VReg_64>;
-def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "DS_CMPST_F64", VReg_64>;
-def DS_MIN_F64 : DS_1A1D_NORET <0x52, "DS_MIN_F64", VReg_64>;
-def DS_MAX_F64 : DS_1A1D_NORET <0x53, "DS_MAX_F64", VReg_64>;
+def DS_ADD_U64 : DS_1A1D_NORET <0x40, "ds_add_u64", VReg_64>;
+def DS_SUB_U64 : DS_1A1D_NORET <0x41, "ds_sub_u64", VReg_64>;
+def DS_RSUB_U64 : DS_1A1D_NORET <0x42, "ds_rsub_u64", VReg_64>;
+def DS_INC_U64 : DS_1A1D_NORET <0x43, "ds_inc_u64", VReg_64>;
+def DS_DEC_U64 : DS_1A1D_NORET <0x44, "ds_dec_u64", VReg_64>;
+def DS_MIN_I64 : DS_1A1D_NORET <0x45, "ds_min_i64", VReg_64>;
+def DS_MAX_I64 : DS_1A1D_NORET <0x46, "ds_max_i64", VReg_64>;
+def DS_MIN_U64 : DS_1A1D_NORET <0x47, "ds_min_u64", VReg_64>;
+def DS_MAX_U64 : DS_1A1D_NORET <0x48, "ds_max_u64", VReg_64>;
+def DS_AND_B64 : DS_1A1D_NORET <0x49, "ds_and_b64", VReg_64>;
+def DS_OR_B64 : DS_1A1D_NORET <0x4a, "ds_or_b64", VReg_64>;
+def DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>;
+def DS_MSKOR_B64 : DS_1A1D_NORET <0x4c, "ds_mskor_b64", VReg_64>;
+def DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>;
+def DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>;
+def DS_MIN_F64 : DS_1A1D_NORET <0x52, "ds_min_f64", VReg_64>;
+def DS_MAX_F64 : DS_1A1D_NORET <0x53, "ds_max_f64", VReg_64>;
 
-def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "DS_ADD_RTN_U64", VReg_64>;
-def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "DS_SUB_RTN_U64", VReg_64>;
-def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "DS_RSUB_RTN_U64", VReg_64>;
-def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "DS_INC_RTN_U64", VReg_64>;
-def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "DS_DEC_RTN_U64", VReg_64>;
-def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "DS_MIN_RTN_I64", VReg_64>;
-def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "DS_MAX_RTN_I64", VReg_64>;
-def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "DS_MIN_RTN_U64", VReg_64>;
-def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "DS_MAX_RTN_U64", VReg_64>;
-def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "DS_AND_RTN_B64", VReg_64>;
-def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "DS_OR_RTN_B64", VReg_64>;
-def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "DS_XOR_RTN_B64", VReg_64>;
-def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "DS_MSKOR_RTN_B64", VReg_64>;
-def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "DS_WRXCHG_RTN_B64", VReg_64>;
-//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "DS_WRXCHG2_RTN_B64", VReg_64>;
-//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "DS_WRXCHG2_RTN_B64", VReg_64>;
-def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "DS_CMPST_RTN_B64", VReg_64>;
-def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "DS_CMPST_RTN_F64", VReg_64>;
-def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "DS_MIN_F64", VReg_64>;
-def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "DS_MAX_F64", VReg_64>;
+def DS_ADD_RTN_U64 : DS_1A1D_RET <0x60, "ds_add_rtn_u64", VReg_64, "ds_add_u64">;
+def DS_SUB_RTN_U64 : DS_1A1D_RET <0x61, "ds_sub_rtn_u64", VReg_64, "ds_sub_u64">;
+def DS_RSUB_RTN_U64 : DS_1A1D_RET <0x62, "ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">;
+def DS_INC_RTN_U64 : DS_1A1D_RET <0x63, "ds_inc_rtn_u64", VReg_64, "ds_inc_u64">;
+def DS_DEC_RTN_U64 : DS_1A1D_RET <0x64, "ds_dec_rtn_u64", VReg_64, "ds_dec_u64">;
+def DS_MIN_RTN_I64 : DS_1A1D_RET <0x65, "ds_min_rtn_i64", VReg_64, "ds_min_i64">;
+def DS_MAX_RTN_I64 : DS_1A1D_RET <0x66, "ds_max_rtn_i64", VReg_64, "ds_max_i64">;
+def DS_MIN_RTN_U64 : DS_1A1D_RET <0x67, "ds_min_rtn_u64", VReg_64, "ds_min_u64">;
+def DS_MAX_RTN_U64 : DS_1A1D_RET <0x68, "ds_max_rtn_u64", VReg_64, "ds_max_u64">;
+def DS_AND_RTN_B64 : DS_1A1D_RET <0x69, "ds_and_rtn_b64", VReg_64, "ds_and_b64">;
+def DS_OR_RTN_B64 : DS_1A1D_RET <0x6a, "ds_or_rtn_b64", VReg_64, "ds_or_b64">;
+def DS_XOR_RTN_B64 : DS_1A1D_RET <0x6b, "ds_xor_rtn_b64", VReg_64, "ds_xor_b64">;
+def DS_MSKOR_RTN_B64 : DS_1A1D_RET <0x6c, "ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">;
+def DS_WRXCHG_RTN_B64 : DS_1A1D_RET <0x6d, "ds_wrxchg_rtn_b64", VReg_64, "ds_wrxchg_b64">;
+//def DS_WRXCHG2_RTN_B64 : DS_2A0D_RET <0x6e, "ds_wrxchg2_rtn_b64", VReg_64, "ds_wrxchg2_b64">;
+//def DS_WRXCHG2ST64_RTN_B64 : DS_2A0D_RET <0x6f, "ds_wrxchg2st64_rtn_b64", VReg_64, "ds_wrxchg2st64_b64">;
+def DS_CMPST_RTN_B64 : DS_1A2D_RET <0x70, "ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">;
+def DS_CMPST_RTN_F64 : DS_1A2D_RET <0x71, "ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">;
+def DS_MIN_RTN_F64 : DS_1A1D_RET <0x72, "ds_min_rtn_f64", VReg_64, "ds_min_f64">;
+def DS_MAX_RTN_F64 : DS_1A1D_RET <0x73, "ds_max_rtn_f64", VReg_64, "ds_max_f64">;
 
 //let SubtargetPredicate = isCI in {
 // DS_CONDXCHG32_RTN_B64
@@ -812,240 +825,336 @@
 
 // TODO: _SRC2_* forms
 
-def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
-def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
-def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
-def DS_WRITE_B64 : DS_Store_Helper <0x00000004d, "DS_WRITE_B64", VReg_64>;
+def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "ds_write_b32", VReg_32>;
+def DS_WRITE_B8 : DS_Store_Helper <0x0000001e, "ds_write_b8", VReg_32>;
+def DS_WRITE_B16 : DS_Store_Helper <0x0000001f, "ds_write_b16", VReg_32>;
+def DS_WRITE_B64 : DS_Store_Helper <0x0000004d, "ds_write_b64", VReg_64>;
 
-def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>;
-def DS_READ_I8 : DS_Load_Helper <0x00000039, "DS_READ_I8", VReg_32>;
-def DS_READ_U8 : DS_Load_Helper <0x0000003a, "DS_READ_U8", VReg_32>;
-def DS_READ_I16 : DS_Load_Helper <0x0000003b, "DS_READ_I16", VReg_32>;
-def DS_READ_U16 : DS_Load_Helper <0x0000003c, "DS_READ_U16", VReg_32>;
-def DS_READ_B64 : DS_Load_Helper <0x00000076, "DS_READ_B64", VReg_64>;
+def DS_READ_B32 : DS_Load_Helper <0x00000036, "ds_read_b32", VReg_32>;
+def DS_READ_I8 : DS_Load_Helper <0x00000039, "ds_read_i8", VReg_32>;
+def DS_READ_U8 : DS_Load_Helper <0x0000003a, "ds_read_u8", VReg_32>;
+def DS_READ_I16 : DS_Load_Helper <0x0000003b, "ds_read_i16", VReg_32>;
+def DS_READ_U16 : DS_Load_Helper <0x0000003c, "ds_read_u16", VReg_32>;
+def DS_READ_B64 : DS_Load_Helper <0x00000076, "ds_read_b64", VReg_64>;
 
 // 2 forms.
-def DS_WRITE2_B32 : DS_Load2_Helper <0x0000000E, "DS_WRITE2_B32", VReg_64>;
-def DS_WRITE2_B64 : DS_Load2_Helper <0x0000004E, "DS_WRITE2_B64", VReg_128>;
+def DS_WRITE2_B32 : DS_Store2_Helper <0x0000000E, "ds_write2_b32", VReg_32>;
+def DS_WRITE2ST64_B32 : DS_Store2_Helper <0x0000000F, "ds_write2st64_b32", VReg_32>;
+def DS_WRITE2_B64 : DS_Store2_Helper <0x0000004E, "ds_write2_b64", VReg_64>;
+def DS_WRITE2ST64_B64 : DS_Store2_Helper <0x0000004F, "ds_write2st64_b64", VReg_64>;
 
-def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "DS_READ2_B32", VReg_64>;
-def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>;
-
-// TODO: DS_READ2ST64_B32, DS_READ2ST64_B64,
-// DS_WRITE2ST64_B32, DS_WRITE2ST64_B64
+def DS_READ2_B32 : DS_Load2_Helper <0x00000037, "ds_read2_b32", VReg_64>;
+def DS_READ2ST64_B32 : DS_Load2_Helper <0x00000038, "ds_read2st64_b32", VReg_64>;
+def DS_READ2_B64 : DS_Load2_Helper <0x00000077, "ds_read2_b64", VReg_128>;
+def DS_READ2ST64_B64 : DS_Load2_Helper <0x00000078, "ds_read2st64_b64", VReg_128>;
 
 //===----------------------------------------------------------------------===//
 // MUBUF Instructions
 //===----------------------------------------------------------------------===//
 
-//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
-//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
-//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
-//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
-//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
-//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
-//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
+//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "buffer_load_format_x", []>;
+//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "buffer_load_format_xy", []>;
+//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "buffer_load_format_xyz", []>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "buffer_load_format_xyzw", VReg_128>;
+//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "buffer_store_format_x", []>;
+//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "buffer_store_format_xy", []>;
+//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "buffer_store_format_xyz", []>;
+//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "buffer_store_format_xyzw", []>;
 defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <
-  0x00000008, "BUFFER_LOAD_UBYTE", VReg_32, i32, az_extloadi8_global
+  0x00000008, "buffer_load_ubyte", VReg_32, i32, az_extloadi8_global
 >;
 defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <
-  0x00000009, "BUFFER_LOAD_SBYTE", VReg_32, i32, sextloadi8_global
+  0x00000009, "buffer_load_sbyte", VReg_32, i32, sextloadi8_global
 >;
 defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <
-  0x0000000a, "BUFFER_LOAD_USHORT", VReg_32, i32, az_extloadi16_global
+  0x0000000a, "buffer_load_ushort", VReg_32, i32, az_extloadi16_global
 >;
 defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
-  0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32, i32, sextloadi16_global
+  0x0000000b, "buffer_load_sshort", VReg_32, i32, sextloadi16_global
 >;
 defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
-  0x0000000c, "BUFFER_LOAD_DWORD", VReg_32, i32, global_load
+  0x0000000c, "buffer_load_dword", VReg_32, i32, global_load
 >;
 defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
-  0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64, v2i32, global_load
+  0x0000000d, "buffer_load_dwordx2", VReg_64, v2i32, global_load
 >;
 defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
-  0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128, v4i32, global_load
+  0x0000000e, "buffer_load_dwordx4", VReg_128, v4i32, global_load
 >;
 
-def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
-  0x00000018, "BUFFER_STORE_BYTE", VReg_32, i32, truncstorei8_global
+defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
+  0x00000018, "buffer_store_byte", VReg_32, i32, truncstorei8_global
 >;
 
-def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
-  0x0000001a, "BUFFER_STORE_SHORT", VReg_32, i32, truncstorei16_global
+defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
+  0x0000001a, "buffer_store_short", VReg_32, i32, truncstorei16_global
 >;
 
-def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
-  0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32, global_store
+defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
+  0x0000001c, "buffer_store_dword", VReg_32, i32, global_store
 >;
 
-def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
-  0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, v2i32, global_store
+defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
+  0x0000001d, "buffer_store_dwordx2", VReg_64, v2i32, global_store
 >;
 
-def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
-  0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32, global_store
+defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
+  0x0000001e, "buffer_store_dwordx4", VReg_128, v4i32, global_store
 >;
-//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
-//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
-//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
-//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>;
-//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>;
-//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>;
-//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>;
-//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>;
-//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>;
-//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>;
-//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>;
-//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>;
-//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>;
-//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>;
-//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>;
-//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>;
-//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>;
-//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>;
-//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>;
-//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>;
-//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>;
-//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>;
-//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>;
-//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>;
-//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>;
-//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>;
-//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>;
-//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>;
-//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>;
-//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>;
-//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>;
-//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>;
-//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
-//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
-//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
+//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "buffer_atomic_swap", []>;
+defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
+  0x00000030, "buffer_atomic_swap", VReg_32, i32, atomic_swap_global
+>;
+//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "buffer_atomic_cmpswap", []>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
+  0x00000032, "buffer_atomic_add", VReg_32, i32, atomic_add_global
+>;
+defm BUFFER_ATOMIC_SUB : MUBUF_Atomic <
+  0x00000033, "buffer_atomic_sub", VReg_32, i32, atomic_sub_global
+>;
+//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "buffer_atomic_rsub", []>;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic <
+  0x00000035, "buffer_atomic_smin", VReg_32, i32, atomic_min_global
+>;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic <
+  0x00000036, "buffer_atomic_umin", VReg_32, i32, atomic_umin_global
+>;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic <
+  0x00000037, "buffer_atomic_smax", VReg_32, i32, atomic_max_global
+>;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic <
+  0x00000038, "buffer_atomic_umax", VReg_32, i32, atomic_umax_global
+>;
+defm BUFFER_ATOMIC_AND : MUBUF_Atomic <
+  0x00000039, "buffer_atomic_and", VReg_32, i32, atomic_and_global
+>;
+defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
+  0x0000003a, "buffer_atomic_or", VReg_32, i32, atomic_or_global
+>;
+defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
+  0x0000003b, "buffer_atomic_xor", VReg_32, i32, atomic_xor_global
+>;
+//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "buffer_atomic_inc", []>;
+//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "buffer_atomic_dec", []>;
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "buffer_atomic_fcmpswap", []>;
+//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "buffer_atomic_fmin", []>;
+//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "buffer_atomic_fmax", []>;
+//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "buffer_atomic_swap_x2", []>;
+//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "buffer_atomic_cmpswap_x2", []>;
+//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "buffer_atomic_add_x2", []>;
+//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "buffer_atomic_sub_x2", []>;
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "buffer_atomic_rsub_x2", []>;
+//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "buffer_atomic_smin_x2", []>;
+//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "buffer_atomic_umin_x2", []>;
+//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "buffer_atomic_smax_x2", []>;
+//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "buffer_atomic_umax_x2", []>;
+//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "buffer_atomic_and_x2", []>;
+//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "buffer_atomic_or_x2", []>;
+//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "buffer_atomic_xor_x2", []>;
+//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "buffer_atomic_inc_x2", []>;
+//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "buffer_atomic_dec_x2", []>;
+//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "buffer_atomic_fcmpswap_x2", []>;
+//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "buffer_atomic_fmin_x2", []>;
+//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "buffer_atomic_fmax_x2", []>;
+//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "buffer_wbinvl1_sc", []>;
+//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "buffer_wbinvl1", []>;
 
 //===----------------------------------------------------------------------===//
 // MTBUF Instructions
 //===----------------------------------------------------------------------===//
 
-//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
-//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
-//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
-def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
-def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "TBUFFER_STORE_FORMAT_X", VReg_32>;
-def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FORMAT_XY", VReg_64>;
-def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>;
-def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>;
+//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "tbuffer_load_format_x", []>;
+//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "tbuffer_load_format_xy", []>;
+//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "tbuffer_load_format_xyz", []>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "tbuffer_load_format_xyzw", VReg_128>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "tbuffer_store_format_x", VReg_32>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "tbuffer_store_format_xy", VReg_64>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "tbuffer_store_format_xyz", VReg_128>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "tbuffer_store_format_xyzw", VReg_128>;
 
 //===----------------------------------------------------------------------===//
 // MIMG Instructions
 //===----------------------------------------------------------------------===//
 
-defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">;
-//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
-//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
-//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
-//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>;
-//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>;
-//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
-//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
-//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "IMAGE_GET_RESINFO">;
-//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
-//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
-//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
-//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>;
-//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>;
-//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>;
-//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>;
-//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>;
-//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>;
-//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>;
-//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>;
-//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>;
-//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>;
-//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>;
-//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
-//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
-//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
-defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">;
-//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">;
-//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">;
-defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">;
-//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
-//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
-defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">;
-//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">;
-//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
-//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
-//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
-//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
-//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>;
-//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>;
-//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>;
-//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>;
-//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>;
-//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>;
-//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>;
-//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>;
-//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>;
-//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>;
-//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>;
-//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>;
-//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
-//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
-//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
-defm IMAGE_GATHER4          : MIMG_Gather <0x00000040, "IMAGE_GATHER4">;
-defm IMAGE_GATHER4_CL       : MIMG_Gather <0x00000041, "IMAGE_GATHER4_CL">;
-defm IMAGE_GATHER4_L        : MIMG_Gather <0x00000044, "IMAGE_GATHER4_L">;
-defm IMAGE_GATHER4_B        : MIMG_Gather <0x00000045, "IMAGE_GATHER4_B">;
-defm IMAGE_GATHER4_B_CL     : MIMG_Gather <0x00000046, "IMAGE_GATHER4_B_CL">;
-defm IMAGE_GATHER4_LZ       : MIMG_Gather <0x00000047, "IMAGE_GATHER4_LZ">;
-defm IMAGE_GATHER4_C        : MIMG_Gather <0x00000048, "IMAGE_GATHER4_C">;
-defm IMAGE_GATHER4_C_CL     : MIMG_Gather <0x00000049, "IMAGE_GATHER4_C_CL">;
-defm IMAGE_GATHER4_C_L      : MIMG_Gather <0x0000004c, "IMAGE_GATHER4_C_L">;
-defm IMAGE_GATHER4_C_B      : MIMG_Gather <0x0000004d, "IMAGE_GATHER4_C_B">;
-defm IMAGE_GATHER4_C_B_CL   : MIMG_Gather <0x0000004e, "IMAGE_GATHER4_C_B_CL">;
-defm IMAGE_GATHER4_C_LZ     : MIMG_Gather <0x0000004f, "IMAGE_GATHER4_C_LZ">;
-defm IMAGE_GATHER4_O        : MIMG_Gather <0x00000050, "IMAGE_GATHER4_O">;
-defm IMAGE_GATHER4_CL_O     : MIMG_Gather <0x00000051, "IMAGE_GATHER4_CL_O">;
-defm IMAGE_GATHER4_L_O      : MIMG_Gather <0x00000054, "IMAGE_GATHER4_L_O">;
-defm IMAGE_GATHER4_B_O      : MIMG_Gather <0x00000055, "IMAGE_GATHER4_B_O">;
-defm IMAGE_GATHER4_B_CL_O   : MIMG_Gather <0x00000056, "IMAGE_GATHER4_B_CL_O">;
-defm IMAGE_GATHER4_LZ_O     : MIMG_Gather <0x00000057, "IMAGE_GATHER4_LZ_O">;
-defm IMAGE_GATHER4_C_O      : MIMG_Gather <0x00000058, "IMAGE_GATHER4_C_O">;
-defm IMAGE_GATHER4_C_CL_O   : MIMG_Gather <0x00000059, "IMAGE_GATHER4_C_CL_O">;
-defm IMAGE_GATHER4_C_L_O    : MIMG_Gather <0x0000005c, "IMAGE_GATHER4_C_L_O">;
-defm IMAGE_GATHER4_C_B_O    : MIMG_Gather <0x0000005d, "IMAGE_GATHER4_C_B_O">;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "IMAGE_GATHER4_C_B_CL_O">;
-defm IMAGE_GATHER4_C_LZ_O   : MIMG_Gather <0x0000005f, "IMAGE_GATHER4_C_LZ_O">;
-defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, "IMAGE_GET_LOD">;
-//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
-//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
-//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>;
-//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>;
-//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>;
-//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>;
-//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>;
-//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>;
-//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
-//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">;
+//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>;
+//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>;
+//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>;
+//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>;
+//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>;
+//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>;
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>;
+//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">;
+//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>;
+//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>;
+//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>;
+//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>;
+//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>;
+//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>;
+//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>;
+//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>;
+//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>;
+//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>;
+//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>;
+//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>;
+//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>;
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>;
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>;
+defm IMAGE_SAMPLE           : MIMG_Sampler <0x00000020, "image_sample">;
+defm IMAGE_SAMPLE_CL        : MIMG_Sampler <0x00000021, "image_sample_cl">;
+defm IMAGE_SAMPLE_D         : MIMG_Sampler <0x00000022, "image_sample_d">;
+defm IMAGE_SAMPLE_D_CL      : MIMG_Sampler <0x00000023, "image_sample_d_cl">;
+defm IMAGE_SAMPLE_L         : MIMG_Sampler <0x00000024, "image_sample_l">;
+defm IMAGE_SAMPLE_B         : MIMG_Sampler <0x00000025, "image_sample_b">;
+defm IMAGE_SAMPLE_B_CL      : MIMG_Sampler <0x00000026, "image_sample_b_cl">;
+defm IMAGE_SAMPLE_LZ        : MIMG_Sampler <0x00000027, "image_sample_lz">;
+defm IMAGE_SAMPLE_C         : MIMG_Sampler <0x00000028, "image_sample_c">;
+defm IMAGE_SAMPLE_C_CL      : MIMG_Sampler <0x00000029, "image_sample_c_cl">;
+defm IMAGE_SAMPLE_C_D       : MIMG_Sampler <0x0000002a, "image_sample_c_d">;
+defm IMAGE_SAMPLE_C_D_CL    : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">;
+defm IMAGE_SAMPLE_C_L       : MIMG_Sampler <0x0000002c, "image_sample_c_l">;
+defm IMAGE_SAMPLE_C_B       : MIMG_Sampler <0x0000002d, "image_sample_c_b">;
+defm IMAGE_SAMPLE_C_B_CL    : MIMG_Sampler <0x0000002e, "image_sample_c_b_cl">;
+defm IMAGE_SAMPLE_C_LZ      : MIMG_Sampler <0x0000002f, "image_sample_c_lz">;
+defm IMAGE_SAMPLE_O         : MIMG_Sampler <0x00000030, "image_sample_o">;
+defm IMAGE_SAMPLE_CL_O      : MIMG_Sampler <0x00000031, "image_sample_cl_o">;
+defm IMAGE_SAMPLE_D_O       : MIMG_Sampler <0x00000032, "image_sample_d_o">;
+defm IMAGE_SAMPLE_D_CL_O    : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">;
+defm IMAGE_SAMPLE_L_O       : MIMG_Sampler <0x00000034, "image_sample_l_o">;
+defm IMAGE_SAMPLE_B_O       : MIMG_Sampler <0x00000035, "image_sample_b_o">;
+defm IMAGE_SAMPLE_B_CL_O    : MIMG_Sampler <0x00000036, "image_sample_b_cl_o">;
+defm IMAGE_SAMPLE_LZ_O      : MIMG_Sampler <0x00000037, "image_sample_lz_o">;
+defm IMAGE_SAMPLE_C_O       : MIMG_Sampler <0x00000038, "image_sample_c_o">;
+defm IMAGE_SAMPLE_C_CL_O    : MIMG_Sampler <0x00000039, "image_sample_c_cl_o">;
+defm IMAGE_SAMPLE_C_D_O     : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">;
+defm IMAGE_SAMPLE_C_D_CL_O  : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">;
+defm IMAGE_SAMPLE_C_L_O     : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">;
+defm IMAGE_SAMPLE_C_B_O     : MIMG_Sampler <0x0000003d, "image_sample_c_b_o">;
+defm IMAGE_SAMPLE_C_B_CL_O  : MIMG_Sampler <0x0000003e, "image_sample_c_b_cl_o">;
+defm IMAGE_SAMPLE_C_LZ_O    : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">;
+defm IMAGE_GATHER4          : MIMG_Gather <0x00000040, "image_gather4">;
+defm IMAGE_GATHER4_CL       : MIMG_Gather <0x00000041, "image_gather4_cl">;
+defm IMAGE_GATHER4_L        : MIMG_Gather <0x00000044, "image_gather4_l">;
+defm IMAGE_GATHER4_B        : MIMG_Gather <0x00000045, "image_gather4_b">;
+defm IMAGE_GATHER4_B_CL     : MIMG_Gather <0x00000046, "image_gather4_b_cl">;
+defm IMAGE_GATHER4_LZ       : MIMG_Gather <0x00000047, "image_gather4_lz">;
+defm IMAGE_GATHER4_C        : MIMG_Gather <0x00000048, "image_gather4_c">;
+defm IMAGE_GATHER4_C_CL     : MIMG_Gather <0x00000049, "image_gather4_c_cl">;
+defm IMAGE_GATHER4_C_L      : MIMG_Gather <0x0000004c, "image_gather4_c_l">;
+defm IMAGE_GATHER4_C_B      : MIMG_Gather <0x0000004d, "image_gather4_c_b">;
+defm IMAGE_GATHER4_C_B_CL   : MIMG_Gather <0x0000004e, "image_gather4_c_b_cl">;
+defm IMAGE_GATHER4_C_LZ     : MIMG_Gather <0x0000004f, "image_gather4_c_lz">;
+defm IMAGE_GATHER4_O        : MIMG_Gather <0x00000050, "image_gather4_o">;
+defm IMAGE_GATHER4_CL_O     : MIMG_Gather <0x00000051, "image_gather4_cl_o">;
+defm IMAGE_GATHER4_L_O      : MIMG_Gather <0x00000054, "image_gather4_l_o">;
+defm IMAGE_GATHER4_B_O      : MIMG_Gather <0x00000055, "image_gather4_b_o">;
+defm IMAGE_GATHER4_B_CL_O   : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">;
+defm IMAGE_GATHER4_LZ_O     : MIMG_Gather <0x00000057, "image_gather4_lz_o">;
+defm IMAGE_GATHER4_C_O      : MIMG_Gather <0x00000058, "image_gather4_c_o">;
+defm IMAGE_GATHER4_C_CL_O   : MIMG_Gather <0x00000059, "image_gather4_c_cl_o">;
+defm IMAGE_GATHER4_C_L_O    : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">;
+defm IMAGE_GATHER4_C_B_O    : MIMG_Gather <0x0000005d, "image_gather4_c_b_o">;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather <0x0000005e, "image_gather4_c_b_cl_o">;
+defm IMAGE_GATHER4_C_LZ_O   : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">;
+defm IMAGE_GET_LOD          : MIMG_Sampler <0x00000060, "image_get_lod">;
+defm IMAGE_SAMPLE_CD        : MIMG_Sampler <0x00000068, "image_sample_cd">;
+defm IMAGE_SAMPLE_CD_CL     : MIMG_Sampler <0x00000069, "image_sample_cd_cl">;
+defm IMAGE_SAMPLE_C_CD      : MIMG_Sampler <0x0000006a, "image_sample_c_cd">;
+defm IMAGE_SAMPLE_C_CD_CL   : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">;
+defm IMAGE_SAMPLE_CD_O      : MIMG_Sampler <0x0000006c, "image_sample_cd_o">;
+defm IMAGE_SAMPLE_CD_CL_O   : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">;
+defm IMAGE_SAMPLE_C_CD_O    : MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">;
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
 
 //===----------------------------------------------------------------------===//
+// Flat Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasFlatAddressSpace] in {
+def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "flat_load_ubyte", VReg_32>;
+def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "flat_load_sbyte", VReg_32>;
+def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "flat_load_ushort", VReg_32>;
+def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "flat_load_sshort", VReg_32>;
+def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "flat_load_dword", VReg_32>;
+def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "flat_load_dwordx2", VReg_64>;
+def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "flat_load_dwordx4", VReg_128>;
+def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x0000000f, "flat_load_dwordx3", VReg_96>; // opcode follows DWORDX4 (0x0e); was 0x10
+
+def FLAT_STORE_BYTE : FLAT_Store_Helper <
+  0x00000018, "flat_store_byte", VReg_32
+>;
+
+def FLAT_STORE_SHORT : FLAT_Store_Helper <
+  0x0000001a, "flat_store_short", VReg_32
+>;
+
+def FLAT_STORE_DWORD : FLAT_Store_Helper <
+  0x0000001c, "flat_store_dword", VReg_32
+>;
+
+def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
+  0x0000001d, "flat_store_dwordx2", VReg_64
+>;
+
+def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
+  0x0000001e, "flat_store_dwordx4", VReg_128
+>;
+
+def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < // 96-bit flat store; needs its own opcode
+  0x0000001f, "flat_store_dwordx3", VReg_96 // was 0x1e, which collides with FLAT_STORE_DWORDX4
+>;
+
+//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "flat_atomic_swap", []>;
+//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "flat_atomic_cmpswap", []>;
+//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "flat_atomic_add", []>;
+//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "flat_atomic_sub", []>;
+//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "flat_atomic_rsub", []>;
+//def FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "flat_atomic_smin", []>;
+//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "flat_atomic_umin", []>;
+//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "flat_atomic_smax", []>;
+//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "flat_atomic_umax", []>;
+//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "flat_atomic_and", []>;
+//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "flat_atomic_or", []>;
+//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "flat_atomic_xor", []>;
+//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "flat_atomic_inc", []>;
+//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "flat_atomic_dec", []>;
+//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "flat_atomic_fcmpswap", []>;
+//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "flat_atomic_fmin", []>;
+//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "flat_atomic_fmax", []>;
+//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "flat_atomic_swap_x2", []>;
+//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "flat_atomic_cmpswap_x2", []>;
+//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "flat_atomic_add_x2", []>;
+//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "flat_atomic_sub_x2", []>;
+//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "flat_atomic_rsub_x2", []>;
+//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "flat_atomic_smin_x2", []>;
+//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "flat_atomic_umin_x2", []>;
+//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "flat_atomic_smax_x2", []>;
+//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "flat_atomic_umax_x2", []>;
+//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "flat_atomic_and_x2", []>;
+//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "flat_atomic_or_x2", []>;
+//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "flat_atomic_xor_x2", []>;
+//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "flat_atomic_inc_x2", []>;
+//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "flat_atomic_dec_x2", []>;
+//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "flat_atomic_fcmpswap_x2", []>;
+//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "flat_atomic_fmin_x2", []>;
+//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "flat_atomic_fmax_x2", []>;
+
+} // End HasFlatAddressSpace predicate
+//===----------------------------------------------------------------------===//
 // VOP1 Instructions
 //===----------------------------------------------------------------------===//
 
-//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
+//def V_NOP : VOP1_ <0x00000000, "v_nop", []>;
 
-let neverHasSideEffects = 1, isMoveImm = 1 in {
-defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
-} // End neverHasSideEffects = 1, isMoveImm = 1
+let isMoveImm = 1 in {
+defm V_MOV_B32 : VOP1Inst <vop1<0x1>, "v_mov_b32", VOP_I32_I32>;
+} // End isMoveImm = 1
 
 let Uses = [EXEC] in {
 
@@ -1053,136 +1162,139 @@
   0x00000002,
   (outs SReg_32:$vdst),
   (ins VReg_32:$src0),
-  "V_READFIRSTLANE_B32 $vdst, $src0",
+  "v_readfirstlane_b32 $vdst, $src0",
   []
 >;
 
 }
 
-defm V_CVT_I32_F64 : VOP1_32_64 <0x00000003, "V_CVT_I32_F64",
-  [(set i32:$dst, (fp_to_sint f64:$src0))]
+defm V_CVT_I32_F64 : VOP1Inst <vop1<0x3>, "v_cvt_i32_f64",
+  VOP_I32_F64, fp_to_sint
 >;
-defm V_CVT_F64_I32 : VOP1_64_32 <0x00000004, "V_CVT_F64_I32",
-  [(set f64:$dst, (sint_to_fp i32:$src0))]
+defm V_CVT_F64_I32 : VOP1Inst <vop1<0x4>, "v_cvt_f64_i32",
+  VOP_F64_I32, sint_to_fp
 >;
-defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
-  [(set f32:$dst, (sint_to_fp i32:$src0))]
+defm V_CVT_F32_I32 : VOP1Inst <vop1<0x5>, "v_cvt_f32_i32",
+  VOP_F32_I32, sint_to_fp
 >;
-defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32",
-  [(set f32:$dst, (uint_to_fp i32:$src0))]
+defm V_CVT_F32_U32 : VOP1Inst <vop1<0x6>, "v_cvt_f32_u32",
+  VOP_F32_I32, uint_to_fp
 >;
-defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32",
-  [(set i32:$dst, (fp_to_uint f32:$src0))]
+defm V_CVT_U32_F32 : VOP1Inst <vop1<0x7>, "v_cvt_u32_f32",
+  VOP_I32_F32, fp_to_uint
 >;
-defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
-  [(set i32:$dst, (fp_to_sint f32:$src0))]
+defm V_CVT_I32_F32 : VOP1Inst <vop1<0x8>, "v_cvt_i32_f32",
+  VOP_I32_F32, fp_to_sint
 >;
-defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
-defm V_CVT_F16_F32 : VOP1_32 <0x0000000a, "V_CVT_F16_F32",
-  [(set i32:$dst, (f32_to_f16 f32:$src0))]
+defm V_MOV_FED_B32 : VOP1Inst <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>;
+defm V_CVT_F16_F32 : VOP1Inst <vop1<0xa>, "v_cvt_f16_f32",
+  VOP_I32_F32, fp_to_f16
 >;
-defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16",
-  [(set f32:$dst, (f16_to_f32 i32:$src0))]
+defm V_CVT_F32_F16 : VOP1Inst <vop1<0xb>, "v_cvt_f32_f16",
+  VOP_F32_I32, f16_to_fp
 >;
-//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
-//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
-//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
-defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
-  [(set f32:$dst, (fround f64:$src0))]
+//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "v_cvt_rpi_i32_f32", []>;
+//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "v_cvt_flr_i32_f32", []>;
+//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "v_cvt_off_f32_i4", []>;
+defm V_CVT_F32_F64 : VOP1Inst <vop1<0xf>, "v_cvt_f32_f64",
+  VOP_F32_F64, fround
 >;
-defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
-  [(set f64:$dst, (fextend f32:$src0))]
+defm V_CVT_F64_F32 : VOP1Inst <vop1<0x10>, "v_cvt_f64_f32",
+  VOP_F64_F32, fextend
 >;
-defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0",
-  [(set f32:$dst, (AMDGPUcvt_f32_ubyte0 i32:$src0))]
+defm V_CVT_F32_UBYTE0 : VOP1Inst <vop1<0x11>, "v_cvt_f32_ubyte0",
+  VOP_F32_I32, AMDGPUcvt_f32_ubyte0
 >;
-defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1",
-  [(set f32:$dst, (AMDGPUcvt_f32_ubyte1 i32:$src0))]
+defm V_CVT_F32_UBYTE1 : VOP1Inst <vop1<0x12>, "v_cvt_f32_ubyte1",
+  VOP_F32_I32, AMDGPUcvt_f32_ubyte1
 >;
-defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2",
-  [(set f32:$dst, (AMDGPUcvt_f32_ubyte2 i32:$src0))]
+defm V_CVT_F32_UBYTE2 : VOP1Inst <vop1<0x13>, "v_cvt_f32_ubyte2",
+  VOP_F32_I32, AMDGPUcvt_f32_ubyte2
 >;
-defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3",
-  [(set f32:$dst, (AMDGPUcvt_f32_ubyte3 i32:$src0))]
+defm V_CVT_F32_UBYTE3 : VOP1Inst <vop1<0x14>, "v_cvt_f32_ubyte3",
+  VOP_F32_I32, AMDGPUcvt_f32_ubyte3
 >;
-defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64",
-  [(set i32:$dst, (fp_to_uint f64:$src0))]
+defm V_CVT_U32_F64 : VOP1Inst <vop1<0x15>, "v_cvt_u32_f64",
+  VOP_I32_F64, fp_to_uint
 >;
-defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32",
-  [(set f64:$dst, (uint_to_fp i32:$src0))]
+defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32",
+  VOP_F64_I32, uint_to_fp
 >;
 
-defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
-  [(set f32:$dst, (AMDGPUfract f32:$src0))]
+defm V_FRACT_F32 : VOP1Inst <vop1<0x20>, "v_fract_f32",
+  VOP_F32_F32, AMDGPUfract
 >;
-defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32",
-  [(set f32:$dst, (ftrunc f32:$src0))]
+defm V_TRUNC_F32 : VOP1Inst <vop1<0x21>, "v_trunc_f32",
+  VOP_F32_F32, ftrunc
 >;
-defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
-  [(set f32:$dst, (fceil f32:$src0))]
+defm V_CEIL_F32 : VOP1Inst <vop1<0x22>, "v_ceil_f32",
+  VOP_F32_F32, fceil
 >;
-defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32",
-  [(set f32:$dst, (frint f32:$src0))]
+defm V_RNDNE_F32 : VOP1Inst <vop1<0x23>, "v_rndne_f32",
+  VOP_F32_F32, frint
 >;
-defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32",
-  [(set f32:$dst, (ffloor f32:$src0))]
+defm V_FLOOR_F32 : VOP1Inst <vop1<0x24>, "v_floor_f32",
+  VOP_F32_F32, ffloor
 >;
-defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32",
-  [(set f32:$dst, (fexp2 f32:$src0))]
+defm V_EXP_F32 : VOP1Inst <vop1<0x25>, "v_exp_f32",
+  VOP_F32_F32, fexp2
 >;
-defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
-defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
-  [(set f32:$dst, (flog2 f32:$src0))]
+defm V_LOG_CLAMP_F32 : VOP1Inst <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>;
+defm V_LOG_F32 : VOP1Inst <vop1<0x27>, "v_log_f32",
+  VOP_F32_F32, flog2
 >;
 
-defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
-defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
-defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
-  [(set f32:$dst, (AMDGPUrcp f32:$src0))]
+defm V_RCP_CLAMP_F32 : VOP1Inst <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>;
+defm V_RCP_LEGACY_F32 : VOP1Inst <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>;
+defm V_RCP_F32 : VOP1Inst <vop1<0x2a>, "v_rcp_f32",
+  VOP_F32_F32, AMDGPUrcp
 >;
-defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
-defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32",
-  [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))]
+defm V_RCP_IFLAG_F32 : VOP1Inst <vop1<0x2b>, "v_rcp_iflag_f32", VOP_F32_F32>;
+defm V_RSQ_CLAMP_F32 : VOP1Inst <vop1<0x2c>, "v_rsq_clamp_f32",
+  VOP_F32_F32, AMDGPUrsq_clamped
 >;
-defm V_RSQ_LEGACY_F32 : VOP1_32 <
-  0x0000002d, "V_RSQ_LEGACY_F32",
-  [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))]
+defm V_RSQ_LEGACY_F32 : VOP1Inst <vop1<0x2d>, "v_rsq_legacy_f32",
+  VOP_F32_F32, AMDGPUrsq_legacy
 >;
-defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32",
-  [(set f32:$dst, (AMDGPUrsq f32:$src0))]
+defm V_RSQ_F32 : VOP1Inst <vop1<0x2e>, "v_rsq_f32",
+  VOP_F32_F32, AMDGPUrsq
 >;
-defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
-  [(set f64:$dst, (AMDGPUrcp f64:$src0))]
+defm V_RCP_F64 : VOP1Inst <vop1<0x2f>, "v_rcp_f64",
+  VOP_F64_F64, AMDGPUrcp
 >;
-defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
-defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
-  [(set f64:$dst, (AMDGPUrsq f64:$src0))]
+defm V_RCP_CLAMP_F64 : VOP1Inst <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>;
+defm V_RSQ_F64 : VOP1Inst <vop1<0x31>, "v_rsq_f64",
+  VOP_F64_F64, AMDGPUrsq
 >;
-defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64",
-  [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))]
+defm V_RSQ_CLAMP_F64 : VOP1Inst <vop1<0x32>, "v_rsq_clamp_f64",
+  VOP_F64_F64, AMDGPUrsq_clamped
 >;
-defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32",
-  [(set f32:$dst, (fsqrt f32:$src0))]
+defm V_SQRT_F32 : VOP1Inst <vop1<0x33>, "v_sqrt_f32",
+  VOP_F32_F32, fsqrt
 >;
-defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64",
-  [(set f64:$dst, (fsqrt f64:$src0))]
+defm V_SQRT_F64 : VOP1Inst <vop1<0x34>, "v_sqrt_f64",
+  VOP_F64_F64, fsqrt
 >;
-defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>;
-defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>;
-defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>;
-defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>;
-defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>;
-defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>;
-defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>;
-//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>;
-defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>;
-defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>;
-//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>;
-defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>;
-//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>;
-defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
-defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
-defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
+defm V_SIN_F32 : VOP1Inst <vop1<0x35>, "v_sin_f32",
+  VOP_F32_F32, AMDGPUsin
+>;
+defm V_COS_F32 : VOP1Inst <vop1<0x36>, "v_cos_f32",
+  VOP_F32_F32, AMDGPUcos
+>;
+defm V_NOT_B32 : VOP1Inst <vop1<0x37>, "v_not_b32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <vop1<0x38>, "v_bfrev_b32", VOP_I32_I32>;
+defm V_FFBH_U32 : VOP1Inst <vop1<0x39>, "v_ffbh_u32", VOP_I32_I32>;
+defm V_FFBL_B32 : VOP1Inst <vop1<0x3a>, "v_ffbl_b32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <vop1<0x3b>, "v_ffbh_i32", VOP_I32_I32>;
+//defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c>, "v_frexp_exp_i32_f64", VOP_I32_F64>;
+defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d>, "v_frexp_mant_f64", VOP_F64_F64>;
+defm V_FRACT_F64 : VOP1Inst <vop1<0x3e>, "v_fract_f64", VOP_F64_F64>;
+//defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f>, "v_frexp_exp_i32_f32", VOP_I32_F32>;
+defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40>, "v_frexp_mant_f32", VOP_F32_F32>;
+//def V_CLREXCP : VOP1_ <0x00000041, "v_clrexcp", []>;
+defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42>, "v_movreld_b32", VOP_I32_I32>;
+defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43>, "v_movrels_b32", VOP_I32_I32>;
+defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44>, "v_movrelsd_b32", VOP_I32_I32>;
 
 
 //===----------------------------------------------------------------------===//
@@ -1193,7 +1305,7 @@
   0x00000000,
   (outs VReg_32:$dst),
   (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
-  "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]",
+  "v_interp_p1_f32 $dst, $i, $attr_chan, $attr, [$m0]",
   []> {
   let DisableEncoding = "$m0";
 }
@@ -1202,7 +1314,7 @@
   0x00000001,
   (outs VReg_32:$dst),
   (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
-  "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]",
+  "v_interp_p2_f32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]",
   []> {
 
   let Constraints = "$src0 = $dst";
@@ -1214,7 +1326,7 @@
   0x00000002,
   (outs VReg_32:$dst),
   (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
-  "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]",
+  "v_interp_mov_f32 $dst, $src0, $attr_chan, $attr, [$m0]",
   []> {
   let DisableEncoding = "$m0";
 }
@@ -1225,16 +1337,15 @@
 
 def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
   (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
-  "V_CNDMASK_B32_e32 $dst, $src0, $src1, [$vcc]",
+  "v_cndmask_b32_e32 $dst, $src0, $src1, [$vcc]",
   []
 >{
   let DisableEncoding = "$vcc";
 }
 
 def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
-  (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2,
-   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
-  "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg",
+  (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2),
+  "v_cndmask_b32_e64 $dst, $src0, $src1, $src2",
   [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
 > {
   let src0_modifiers = 0;
@@ -1246,7 +1357,7 @@
   0x00000001,
   (outs SReg_32:$vdst),
   (ins VReg_32:$src0, SSrc_32:$vsrc1),
-  "V_READLANE_B32 $vdst, $src0, $vsrc1",
+  "v_readlane_b32 $vdst, $src0, $vsrc1",
   []
 >;
 
@@ -1254,245 +1365,320 @@
   0x00000002,
   (outs VReg_32:$vdst),
   (ins SReg_32:$src0, SSrc_32:$vsrc1),
-  "V_WRITELANE_B32 $vdst, $src0, $vsrc1",
+  "v_writelane_b32 $vdst, $src0, $vsrc1",
   []
 >;
 
 let isCommutable = 1 in {
-defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32",
-  [(set f32:$dst, (fadd f32:$src0, f32:$src1))]
+defm V_ADD_F32 : VOP2Inst <vop2<0x3>, "v_add_f32",
+  VOP_F32_F32_F32, fadd
 >;
 
-defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32",
-  [(set f32:$dst, (fsub f32:$src0, f32:$src1))]
+defm V_SUB_F32 : VOP2Inst <vop2<0x4>, "v_sub_f32", VOP_F32_F32_F32, fsub>;
+defm V_SUBREV_F32 : VOP2Inst <vop2<0x5>, "v_subrev_f32",
+  VOP_F32_F32_F32, null_frag, "v_sub_f32"
 >;
-defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", [], "V_SUB_F32">;
 } // End isCommutable = 1
 
-defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>;
-
 let isCommutable = 1 in {
 
-defm V_MUL_LEGACY_F32 : VOP2_32 <
-  0x00000007, "V_MUL_LEGACY_F32",
-  [(set f32:$dst, (int_AMDGPU_mul f32:$src0, f32:$src1))]
+defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
+  VOP_F32_F32_F32
 >;
 
-defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
-  [(set f32:$dst, (fmul f32:$src0, f32:$src1))]
+defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "v_mul_legacy_f32",
+  VOP_F32_F32_F32, int_AMDGPU_mul
 >;
 
-
-defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24",
-  [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))]
->;
-//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
-defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24",
-  [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))]
->;
-//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
-
-
-defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
-  [(set f32:$dst, (AMDGPUfmin f32:$src0, f32:$src1))]
+defm V_MUL_F32 : VOP2Inst <vop2<0x8>, "v_mul_f32",
+  VOP_F32_F32_F32, fmul
 >;
 
-defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
-  [(set f32:$dst, (AMDGPUfmax f32:$src0, f32:$src1))]
+defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "v_mul_i32_i24",
+  VOP_I32_I32_I32, AMDGPUmul_i24
+>;
+//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "v_mul_hi_i32_i24", []>;
+defm V_MUL_U32_U24 : VOP2Inst <vop2<0xb>, "v_mul_u32_u24",
+  VOP_I32_I32_I32, AMDGPUmul_u24
+>;
+//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "v_mul_hi_u32_u24", []>;
+
+
+defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32",
+  VOP_F32_F32_F32, AMDGPUfmin_legacy
 >;
 
-defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
-defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
-defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32",
-  [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>;
-defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32",
-  [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>;
-defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32",
-  [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>;
-defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32",
-  [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>;
-
-defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32",
-  [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32",
+  VOP_F32_F32_F32, AMDGPUfmax_legacy
 >;
 
-defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">;
+defm V_MIN_F32 : VOP2Inst <vop2<0xf>, "v_min_f32", VOP_F32_F32_F32, fminnum>;
+defm V_MAX_F32 : VOP2Inst <vop2<0x10>, "v_max_f32", VOP_F32_F32_F32, fmaxnum>;
+defm V_MIN_I32 : VOP2Inst <vop2<0x11>, "v_min_i32", VOP_I32_I32_I32, AMDGPUsmin>;
+defm V_MAX_I32 : VOP2Inst <vop2<0x12>, "v_max_i32", VOP_I32_I32_I32, AMDGPUsmax>;
+defm V_MIN_U32 : VOP2Inst <vop2<0x13>, "v_min_u32", VOP_I32_I32_I32, AMDGPUumin>;
+defm V_MAX_U32 : VOP2Inst <vop2<0x14>, "v_max_u32", VOP_I32_I32_I32, AMDGPUumax>;
 
-defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
-  [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>;
+
+defm V_LSHRREV_B32 : VOP2Inst <
+  vop2<0x16>, "v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32"
 >;
-defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
+
+defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32",
+  VOP_I32_I32_I32, sra
+>;
+defm V_ASHRREV_I32 : VOP2Inst <
+  vop2<0x18>, "v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32"
+>;
 
 let hasPostISelHook = 1 in {
 
-defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
-  [(set i32:$dst, (shl i32:$src0, i32:$src1))]
->;
+defm V_LSHL_B32 : VOP2Inst <vop2<0x19>, "v_lshl_b32", VOP_I32_I32_I32, shl>;
 
 }
-defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
-
-defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
-  [(set i32:$dst, (and i32:$src0, i32:$src1))]>;
-defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
-  [(set i32:$dst, (or i32:$src0, i32:$src1))]
+defm V_LSHLREV_B32 : VOP2Inst <
+  vop2<0x1a>, "v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32"
 >;
-defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
-  [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+
+defm V_AND_B32 : VOP2Inst <vop2<0x1b>, "v_and_b32",
+  VOP_I32_I32_I32, and>;
+defm V_OR_B32 : VOP2Inst <vop2<0x1c>, "v_or_b32",
+  VOP_I32_I32_I32, or
+>;
+defm V_XOR_B32 : VOP2Inst <vop2<0x1d>, "v_xor_b32",
+  VOP_I32_I32_I32, xor
 >;
 
 } // End isCommutable = 1
 
-defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32",
-  [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>;
-defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
-defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
-defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
-defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
-defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
-defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
+defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32",
+  VOP_I32_I32_I32, AMDGPUbfm>;
+
+let isCommutable = 1 in {
+defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "v_mac_f32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
+defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "v_madmk_f32", VOP_F32_F32_F32>;
+
+let isCommutable = 1 in {
+defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "v_madak_f32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
+
+defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
+defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32",
+
+  VOP_I32_I32_I32
+>;
+defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32",
+  VOP_I32_I32_I32
+>;
 
 let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
 // No patterns so that the scalar instructions are always selected.
 // The scalar versions will be replaced with vector when needed later.
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
-  [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>;
-defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
-  [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>;
-defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32,
-                              "V_SUB_I32">;
+defm V_ADD_I32 : VOP2bInst <vop2<0x25>, "v_add_i32",
+  VOP_I32_I32_I32, add
+>;
+defm V_SUB_I32 : VOP2bInst <vop2<0x26>, "v_sub_i32",
+  VOP_I32_I32_I32, sub
+>;
+defm V_SUBREV_I32 : VOP2bInst <vop2<0x27>, "v_subrev_i32",
+  VOP_I32_I32_I32, null_frag, "v_sub_i32"
+>;
 
 let Uses = [VCC] in { // Carry-in comes from VCC
-defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32",
-  [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>;
-defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32",
-  [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>;
-defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32,
-                               "V_SUBB_U32">;
+defm V_ADDC_U32 : VOP2bInst <vop2<0x28>, "v_addc_u32",
+  VOP_I32_I32_I32_VCC, adde
+>;
+defm V_SUBB_U32 : VOP2bInst <vop2<0x29>, "v_subb_u32",
+  VOP_I32_I32_I32_VCC, sube
+>;
+defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a>, "v_subbrev_u32",
+  VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32"
+>;
+
 } // End Uses = [VCC]
 } // End isCommutable = 1, Defs = [VCC]
 
-defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
-////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>;
-////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
-////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
- [(set i32:$dst, (int_SI_packf16 f32:$src0, f32:$src1))]
+defm V_LDEXP_F32 : VOP2Inst <vop2<0x2b>, "v_ldexp_f32",
+  VOP_F32_F32_I32, AMDGPUldexp
 >;
-////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
-////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
+////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "v_cvt_pkaccum_u8_f32", []>;
+////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "v_cvt_pknorm_i16_f32", []>;
+////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "v_cvt_pknorm_u16_f32", []>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32",
+ VOP_I32_F32_F32, int_SI_packf16
+>;
+////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "v_cvt_pk_u16_u32", []>;
+////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "v_cvt_pk_i16_i32", []>;
 
 //===----------------------------------------------------------------------===//
 // VOP3 Instructions
 //===----------------------------------------------------------------------===//
 
-let neverHasSideEffects = 1 in {
-
-defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
-defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32",
-  [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
->;
-defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
-  [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))]
->;
-defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
-  [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))]
+let isCommutable = 1 in {
+defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "v_mad_legacy_f32",
+  VOP_F32_F32_F32_F32
 >;
 
-} // End neverHasSideEffects
-
-defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
-defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
-defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
-defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
-
-let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
-defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
-  [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>;
-defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
-  [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>;
-}
-
-defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32",
-  [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>;
-defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
-  [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))]
->;
-def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64",
-  [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
->;
-//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
-defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
-
-defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
-defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
-////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
-////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
-////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
-////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
-////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
-////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
-////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
-////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
-////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
-//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
-//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
-//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
-defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
-////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
-defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32",
-  [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))]
->;
-def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64",
-  [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))]
+defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "v_mad_f32",
+  VOP_F32_F32_F32_F32, fmad
 >;
 
-def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64",
-  [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "v_mad_i32_i24",
+  VOP_I32_I32_I32_I32, AMDGPUmad_i24
 >;
-def V_LSHR_B64 : VOP3_64_32 <0x00000162, "V_LSHR_B64",
-  [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "v_mad_u32_u24",
+  VOP_I32_I32_I32_I32, AMDGPUmad_u24
 >;
-def V_ASHR_I64 : VOP3_64_32 <0x00000163, "V_ASHR_I64",
-  [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+} // End isCommutable = 1
+
+defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "v_cubeid_f32",
+  VOP_F32_F32_F32_F32
+>;
+defm V_CUBESC_F32 : VOP3Inst <vop3<0x145>, "v_cubesc_f32",
+  VOP_F32_F32_F32_F32
+>;
+defm V_CUBETC_F32 : VOP3Inst <vop3<0x146>, "v_cubetc_f32",
+  VOP_F32_F32_F32_F32
+>;
+defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147>, "v_cubema_f32",
+  VOP_F32_F32_F32_F32
+>;
+defm V_BFE_U32 : VOP3Inst <vop3<0x148>, "v_bfe_u32",
+  VOP_I32_I32_I32_I32, AMDGPUbfe_u32
+>;
+defm V_BFE_I32 : VOP3Inst <vop3<0x149>, "v_bfe_i32",
+  VOP_I32_I32_I32_I32, AMDGPUbfe_i32
+>;
+defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "v_bfi_b32",
+  VOP_I32_I32_I32_I32, AMDGPUbfi
+>;
+
+let isCommutable = 1 in {
+defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "v_fma_f32",
+  VOP_F32_F32_F32_F32, fma
+>;
+defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "v_fma_f64",
+  VOP_F64_F64_F64_F64, fma
+>;
+} // End isCommutable = 1
+
+//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
+defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "v_alignbit_b32",
+  VOP_I32_I32_I32_I32
+>;
+defm V_ALIGNBYTE_B32 : VOP3Inst <vop3<0x14f>, "v_alignbyte_b32",
+  VOP_I32_I32_I32_I32
+>;
+defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32",
+  VOP_F32_F32_F32_F32>;
+defm V_MIN3_F32 : VOP3Inst <vop3<0x151>, "v_min3_f32",
+  VOP_F32_F32_F32_F32, AMDGPUfmin3>;
+
+defm V_MIN3_I32 : VOP3Inst <vop3<0x152>, "v_min3_i32",
+  VOP_I32_I32_I32_I32, AMDGPUsmin3
+>;
+defm V_MIN3_U32 : VOP3Inst <vop3<0x153>, "v_min3_u32",
+  VOP_I32_I32_I32_I32, AMDGPUumin3
+>;
+defm V_MAX3_F32 : VOP3Inst <vop3<0x154>, "v_max3_f32",
+  VOP_F32_F32_F32_F32, AMDGPUfmax3
+>;
+defm V_MAX3_I32 : VOP3Inst <vop3<0x155>, "v_max3_i32",
+  VOP_I32_I32_I32_I32, AMDGPUsmax3
+>;
+defm V_MAX3_U32 : VOP3Inst <vop3<0x156>, "v_max3_u32",
+  VOP_I32_I32_I32_I32, AMDGPUumax3
+>;
+//def V_MED3_F32 : VOP3_MED3 <0x00000157, "v_med3_f32", []>;
+//def V_MED3_I32 : VOP3_MED3 <0x00000158, "v_med3_i32", []>;
+//def V_MED3_U32 : VOP3_MED3 <0x00000159, "v_med3_u32", []>;
+//def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>;
+//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "v_sad_hi_u8", []>;
+//def V_SAD_U16 : VOP3_U16 <0x0000015c, "v_sad_u16", []>;
+defm V_SAD_U32 : VOP3Inst <vop3<0x15d>, "v_sad_u32",
+  VOP_I32_I32_I32_I32
+>;
+////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>;
+defm V_DIV_FIXUP_F32 : VOP3Inst <
+  vop3<0x15f>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup
+>;
+defm V_DIV_FIXUP_F64 : VOP3Inst <
+  vop3<0x160>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup
+>;
+
+defm V_LSHL_B64 : VOP3Inst <vop3<0x161>, "v_lshl_b64",
+  VOP_I64_I64_I32, shl
+>;
+defm V_LSHR_B64 : VOP3Inst <vop3<0x162>, "v_lshr_b64",
+  VOP_I64_I64_I32, srl
+>;
+defm V_ASHR_I64 : VOP3Inst <vop3<0x163>, "v_ashr_i64",
+  VOP_I64_I64_I32, sra
 >;
 
 let isCommutable = 1 in {
 
-def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
-def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
-def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
-def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
+defm V_ADD_F64 : VOP3Inst <vop3<0x164>, "v_add_f64",
+  VOP_F64_F64_F64, fadd
+>;
+defm V_MUL_F64 : VOP3Inst <vop3<0x165>, "v_mul_f64",
+  VOP_F64_F64_F64, fmul
+>;
+
+defm V_MIN_F64 : VOP3Inst <vop3<0x166>, "v_min_f64",
+  VOP_F64_F64_F64, fminnum
+>;
+defm V_MAX_F64 : VOP3Inst <vop3<0x167>, "v_max_f64",
+  VOP_F64_F64_F64, fmaxnum
+>;
 
 } // isCommutable = 1
 
-def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
+defm V_LDEXP_F64 : VOP3Inst <vop3<0x168>, "v_ldexp_f64",
+  VOP_F64_F64_I32, AMDGPUldexp
+>;
 
 let isCommutable = 1 in {
 
-defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
-defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
-defm V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
-defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
+defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169>, "v_mul_lo_u32",
+  VOP_I32_I32_I32
+>;
+defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a>, "v_mul_hi_u32",
+  VOP_I32_I32_I32
+>;
+defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b>, "v_mul_lo_i32",
+  VOP_I32_I32_I32
+>;
+defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c>, "v_mul_hi_i32",
+  VOP_I32_I32_I32
+>;
 
 } // isCommutable = 1
 
-def V_DIV_SCALE_F32 : VOP3b_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d>, "v_div_scale_f32", []>;
 
 // Double precision division pre-scale.
-def V_DIV_SCALE_F64 : VOP3b_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
+defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>;
 
-defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
-  [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
+let isCommutable = 1 in {
+defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32",
+  VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
 >;
-def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64",
-  [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))]
+defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64",
+  VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
 >;
-//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
-//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
-//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
-def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64",
-  [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))]
+} // End isCommutable = 1
+
+//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
+//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
+//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
+
+defm V_TRIG_PREOP_F64 : VOP3Inst <
+  vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
 >;
 
 //===----------------------------------------------------------------------===//
@@ -1517,6 +1703,15 @@
    [(set i1:$dst, (or i1:$src0, i1:$src1))]
 >;
 
+def V_XOR_I1 : InstSI < // i1 xor record; all operands use the VReg_1 class
+  (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "", // empty asm string
+  [(set i1:$dst, (xor i1:$src0, i1:$src1))] // matched for (xor i1, i1)
+>;
+
+let hasSideEffects = 1 in {
+def SGPR_USE : InstSI <(outs),(ins), "", []>;
+}
+
 // SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
@@ -1544,7 +1739,7 @@
 def SI_LOOP : InstSI <
   (outs),
   (ins SReg_64:$saved, brtarget:$target),
-  "SI_LOOP $saved, $target",
+  "si_loop $saved, $target",
   [(int_SI_loop i64:$saved, bb:$target)]
 >;
 
@@ -1553,35 +1748,35 @@
 def SI_BREAK : InstSI <
   (outs SReg_64:$dst),
   (ins SReg_64:$src),
-  "SI_ELSE $dst, $src",
+  "si_break $dst, $src",
   [(set i64:$dst, (int_SI_break i64:$src))]
 >;
 
 def SI_IF_BREAK : InstSI <
   (outs SReg_64:$dst),
   (ins SReg_64:$vcc, SReg_64:$src),
-  "SI_IF_BREAK $dst, $vcc, $src",
+  "si_if_break $dst, $vcc, $src",
   [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))]
 >;
 
 def SI_ELSE_BREAK : InstSI <
   (outs SReg_64:$dst),
   (ins SReg_64:$src0, SReg_64:$src1),
-  "SI_ELSE_BREAK $dst, $src0, $src1",
+  "si_else_break $dst, $src0, $src1",
   [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))]
 >;
 
 def SI_END_CF : InstSI <
   (outs),
   (ins SReg_64:$saved),
-  "SI_END_CF $saved",
+  "si_end_cf $saved",
   [(int_SI_end_cf i64:$saved)]
 >;
 
 def SI_KILL : InstSI <
   (outs),
   (ins VSrc_32:$src),
-  "SI_KILL $src",
+  "si_kill $src",
   [(int_AMDGPU_kill f32:$src)]
 >;
 
@@ -1623,14 +1818,14 @@
 def SI_INDIRECT_SRC : InstSI <
   (outs VReg_32:$dst, SReg_64:$temp),
   (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
-  "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off",
+  "si_indirect_src $dst, $temp, $src, $idx, $off",
   []
 >;
 
 class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
   (outs rc:$dst, SReg_64:$temp),
   (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
-  "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val",
+  "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
   []
 > {
   let Constraints = "$src = $dst";
@@ -1646,18 +1841,10 @@
 
 let usesCustomInserter = 1 in {
 
-// This pseudo instruction takes a pointer as input and outputs a resource
-// constant that can be used with the ADDR64 MUBUF instructions.
-def SI_ADDR64_RSRC : InstSI <
-  (outs SReg_128:$srsrc),
-  (ins SSrc_64:$ptr),
-  "", []
->;
-
 def V_SUB_F64 : InstSI <
   (outs VReg_64:$dst),
   (ins VReg_64:$src0, VReg_64:$src1),
-  "V_SUB_F64 $dst, $src0, $src1",
+  "v_sub_f64 $dst, $src0, $src1",
   [(set f64:$dst, (fsub f64:$src0, f64:$src1))]
 >;
 
@@ -1666,14 +1853,14 @@
 multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
 
   def _SAVE : InstSI <
-    (outs VReg_32:$dst),
+    (outs),
     (ins sgpr_class:$src, i32imm:$frame_idx),
     "", []
   >;
 
   def _RESTORE : InstSI <
     (outs sgpr_class:$dst),
-    (ins VReg_32:$src, i32imm:$frame_idx),
+    (ins i32imm:$frame_idx),
     "", []
   >;
 
@@ -1685,6 +1872,37 @@
 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
 
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { // defines a _SAVE/_RESTORE record pair for vgpr_class
+  def _SAVE : InstSI <
+    (outs), // produces no register result
+    (ins vgpr_class:$src, i32imm:$frame_idx), // register operand plus an immediate frame index
+    "", [] // empty asm string, empty pattern list
+  >;
+
+  def _RESTORE : InstSI <
+    (outs vgpr_class:$dst),
+    (ins i32imm:$frame_idx), // only input is the immediate frame index
+    "", [] // empty asm string, empty pattern list
+  >;
+}
+
+defm SI_SPILL_V32  : SI_SPILL_VGPR <VReg_32>;
+defm SI_SPILL_V64  : SI_SPILL_VGPR <VReg_64>;
+defm SI_SPILL_V96  : SI_SPILL_VGPR <VReg_96>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+
+let Defs = [SCC] in { // the record below is declared as defining SCC
+
+def SI_CONSTDATA_PTR : InstSI <
+  (outs SReg_64:$dst), // 64-bit SGPR result
+  (ins), // takes no operands
+  "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] // empty asm string; matched for the SIconstdata_ptr node
+>;
+
+} // End Defs = [SCC]
+
 } // end IsCodeGenOnly, isPseudo
 
 } // end SubtargetPredicate = SI
@@ -1693,7 +1911,9 @@
 
 def : Pat<
   (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
-  (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0))
+  (V_CNDMASK_B32_e64 $src2, $src1,
+                     (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0,
+                                       DSTCLAMP.NONE, DSTOMOD.NONE))
 >;
 
 def : Pat <
@@ -1766,27 +1986,26 @@
 // SOP1 Patterns
 //===----------------------------------------------------------------------===//
 
-let Predicates = [isSI, isCFDepth0] in {
-
 def : Pat <
   (i64 (ctpop i64:$src)),
-  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-    (S_BCNT1_I32_B64 $src), sub0),
-    (S_MOV_B32 0), sub1)
+    (i64 (REG_SEQUENCE SReg_64,
+     (S_BCNT1_I32_B64 $src), sub0,
+     (S_MOV_B32 0), sub1))
 >;
 
-} // Predicates = [isSI, isCFDepth0]
-
-let  Predicates = [isSI] in {
 //===----------------------------------------------------------------------===//
 // SOP2 Patterns
 //===----------------------------------------------------------------------===//
 
+// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
+// case, the sgpr-copies pass will fix this to use the vector version.
 def : Pat <
-  (i1 (xor i1:$src0, i1:$src1)),
-  (S_XOR_B64 $src0, $src1)
+  (i32 (addc i32:$src0, i32:$src1)),
+  (S_ADD_U32 $src0, $src1)
 >;
 
+let  Predicates = [isSI] in {
+
 //===----------------------------------------------------------------------===//
 // SOPP Patterns
 //===----------------------------------------------------------------------===//
@@ -1800,67 +2019,106 @@
 // VOP1 Patterns
 //===----------------------------------------------------------------------===//
 
-def : RcpPat<V_RCP_F32_e32, f32>;
+let Predicates = [UnsafeFPMath] in {
 def : RcpPat<V_RCP_F64_e32, f64>;
-defm : RsqPat<V_RSQ_F32_e32, f32>;
 defm : RsqPat<V_RSQ_F64_e32, f64>;
+defm : RsqPat<V_RSQ_F32_e32, f32>;
+}
 
 //===----------------------------------------------------------------------===//
 // VOP2 Patterns
 //===----------------------------------------------------------------------===//
 
-class BinOp64Pat <SDNode node, Instruction inst> : Pat <
-  (node i64:$src0, i64:$src1),
-  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-    (inst  (EXTRACT_SUBREG i64:$src0, sub0),
-                  (EXTRACT_SUBREG i64:$src1, sub0)), sub0),
-    (inst (EXTRACT_SUBREG i64:$src0, sub1),
-                  (EXTRACT_SUBREG i64:$src1, sub1)), sub1)
->;
-
-def : BinOp64Pat <or, V_OR_B32_e32>;
-def : BinOp64Pat <xor, V_XOR_B32_e32>;
-
-class SextInReg <ValueType vt, int ShiftAmt> : Pat <
-  (sext_inreg i32:$src0, vt),
-  (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0))
->;
-
-def : SextInReg <i8, 24>;
-def : SextInReg <i16, 16>;
-
 def : Pat <
   (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)),
-  (V_BCNT_U32_B32_e32 $popcnt, $val)
->;
-
-def : Pat <
-   (i32 (ctpop i32:$popcnt)),
-   (V_BCNT_U32_B32_e64 $popcnt, 0, 0, 0)
->;
-
-def : Pat <
-  (i64 (ctpop i64:$src)),
-  (INSERT_SUBREG
-    (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-      (V_BCNT_U32_B32_e32 (EXTRACT_SUBREG $src, sub1),
-        (V_BCNT_U32_B32_e64 (EXTRACT_SUBREG $src, sub0), 0, 0, 0)),
-      sub0),
-    (V_MOV_B32_e32 0), sub1)
+  (V_BCNT_U32_B32_e64 $popcnt, $val)
 >;
 
 /********** ======================= **********/
 /********** Image sampling patterns **********/
 /********** ======================= **********/
 
+// Image + sampler
 class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
-  (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, i32:$dmask, i32:$unorm,
+  (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm,
         i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
   (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da),
           (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
           $addr, $rsrc, $sampler)
 >;
 
+multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { // five SampleRawPattern records, one per address type
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; // MIMG record is looked up by the pasted name
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>;
+  def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>;
+}
+
+// Image only: matches `name` applied to an address and a v8i32 resource, with no sampler operand
+class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat <
+  (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm,
+        i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe),
+  (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), // flag order differs from the source pattern
+          (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc),
+          $addr, $rsrc) // address and resource are passed last
+>;
+
+multiclass ImagePatterns<SDPatternOperator name, string opcode> { // three ImagePattern records: i32, v2i32, v4i32 addresses
+  def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; // MIMG record is looked up by the pasted name
+  def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>;
+  def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>;
+}
+
+// Basic sample
+defm : SampleRawPatterns<int_SI_image_sample,           "IMAGE_SAMPLE">;
+defm : SampleRawPatterns<int_SI_image_sample_cl,        "IMAGE_SAMPLE_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_d,         "IMAGE_SAMPLE_D">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl,      "IMAGE_SAMPLE_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_l,         "IMAGE_SAMPLE_L">;
+defm : SampleRawPatterns<int_SI_image_sample_b,         "IMAGE_SAMPLE_B">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl,      "IMAGE_SAMPLE_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_lz,        "IMAGE_SAMPLE_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_cd,        "IMAGE_SAMPLE_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl,     "IMAGE_SAMPLE_CD_CL">;
+
+// Sample with comparison
+defm : SampleRawPatterns<int_SI_image_sample_c,         "IMAGE_SAMPLE_C">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl,      "IMAGE_SAMPLE_C_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d,       "IMAGE_SAMPLE_C_D">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl,    "IMAGE_SAMPLE_C_D_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l,       "IMAGE_SAMPLE_C_L">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b,       "IMAGE_SAMPLE_C_B">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl,    "IMAGE_SAMPLE_C_B_CL">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz,      "IMAGE_SAMPLE_C_LZ">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd,      "IMAGE_SAMPLE_C_CD">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl,   "IMAGE_SAMPLE_C_CD_CL">;
+
+// Sample with offsets
+defm : SampleRawPatterns<int_SI_image_sample_o,         "IMAGE_SAMPLE_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cl_o,      "IMAGE_SAMPLE_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_o,       "IMAGE_SAMPLE_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_d_cl_o,    "IMAGE_SAMPLE_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_l_o,       "IMAGE_SAMPLE_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_o,       "IMAGE_SAMPLE_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_b_cl_o,    "IMAGE_SAMPLE_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_lz_o,      "IMAGE_SAMPLE_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_o,      "IMAGE_SAMPLE_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o,   "IMAGE_SAMPLE_CD_CL_O">;
+
+// Sample with comparison and offsets
+defm : SampleRawPatterns<int_SI_image_sample_c_o,       "IMAGE_SAMPLE_C_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cl_o,    "IMAGE_SAMPLE_C_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_o,     "IMAGE_SAMPLE_C_D_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o,  "IMAGE_SAMPLE_C_D_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_l_o,     "IMAGE_SAMPLE_C_L_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_o,     "IMAGE_SAMPLE_C_B_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o,  "IMAGE_SAMPLE_C_B_CL_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_lz_o,    "IMAGE_SAMPLE_C_LZ_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_o,    "IMAGE_SAMPLE_C_CD_O">;
+defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
+
+// Gather opcodes
 // Only the variants which make sense are defined.
 def : SampleRawPattern<int_SI_gather4,           IMAGE_GATHER4_V4_V2,        v2i32>;
 def : SampleRawPattern<int_SI_gather4,           IMAGE_GATHER4_V4_V4,        v4i32>;
@@ -1905,6 +2163,10 @@
 def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
 def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
 
+def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
+defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
+defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
+
 /* SIsample for simple 1D texture lookup */
 def : Pat <
   (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
@@ -2143,62 +2405,63 @@
 /********** Src & Dst modifiers **********/
 /********** =================== **********/
 
-def FCLAMP_SI : AMDGPUShaderInst <
-  (outs VReg_32:$dst),
-  (ins VSrc_32:$src0),
-  "FCLAMP_SI $dst, $src0",
-  []
-> {
-  let usesCustomInserter = 1;
-}
-
 def : Pat <
-  (AMDGPUclamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
-  (FCLAMP_SI f32:$src)
+  (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
+               (f32 FP_ZERO), (f32 FP_ONE)),
+  (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod)
 >;
 
 /********** ================================ **********/
 /********** Floating point absolute/negative **********/
 /********** ================================ **********/
 
-// Manipulate the sign bit directly, as e.g. using the source negation modifier
-// in V_ADD_F32_e64 $src, 0, [...] does not result in -0.0 for $src == +0.0,
-// breaking the piglit *s-floatBitsToInt-neg* tests
+// Prevent expanding both fneg and fabs.
 
-// TODO: Look into not implementing isFNegFree/isFAbsFree for SI, and possibly
-// removing these patterns
-
+// FIXME: Should use S_OR_B32
 def : Pat <
   (fneg (fabs f32:$src)),
   (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
 >;
 
-def FABS_SI : AMDGPUShaderInst <
-  (outs VReg_32:$dst),
-  (ins VSrc_32:$src0),
-  "FABS_SI $dst, $src0",
-  []
-> {
-  let usesCustomInserter = 1;
-}
+// FIXME: Should use S_OR_B32
+def : Pat <
+  (fneg (fabs f64:$src)),
+  (REG_SEQUENCE VReg_64,
+    (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+    sub0,
+    (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
+                  (V_MOV_B32_e32 0x80000000)), // Set sign bit.
+    sub1)
+>;
 
 def : Pat <
   (fabs f32:$src),
-  (FABS_SI f32:$src)
+  (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff))
 >;
 
-def FNEG_SI : AMDGPUShaderInst <
-  (outs VReg_32:$dst),
-  (ins VSrc_32:$src0),
-  "FNEG_SI $dst, $src0",
-  []
-> {
-  let usesCustomInserter = 1;
-}
-
 def : Pat <
   (fneg f32:$src),
-  (FNEG_SI f32:$src)
+  (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000))
+>;
+
+def : Pat <
+  (fabs f64:$src),
+  (REG_SEQUENCE VReg_64,
+    (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+    sub0,
+    (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
+                   (V_MOV_B32_e32 0x7fffffff)), // Clear sign bit.
+     sub1)
+>;
+
+def : Pat <
+  (fneg f64:$src),
+  (REG_SEQUENCE VReg_64,
+    (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+    sub0,
+    (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
+                   (V_MOV_B32_e32 0x80000000)),
+    sub1)
 >;
 
 /********** ================== **********/
@@ -2260,44 +2523,31 @@
 >;
 
 def : Pat<
-  (fdiv f32:$src0, f32:$src1),
-  (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1))
->;
-
-def : Pat<
   (fdiv f64:$src0, f64:$src1),
-  (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0))
->;
-
-def : Pat <
-  (fcos f32:$src0),
-  (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
->;
-
-def : Pat <
-  (fsin f32:$src0),
-  (V_SIN_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
+  (V_MUL_F64 0 /* src0_modifiers */, $src0,
+             0 /* src1_modifiers */, (V_RCP_F64_e32 $src1),
+             0 /* clamp */, 0 /* omod */)
 >;
 
 def : Pat <
   (int_AMDGPU_cube v4f32:$src),
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
-    (V_CUBETC_F32 (EXTRACT_SUBREG $src, sub0),
-                  (EXTRACT_SUBREG $src, sub1),
-                  (EXTRACT_SUBREG $src, sub2)),
-                   sub0),
-    (V_CUBESC_F32 (EXTRACT_SUBREG $src, sub0),
-                  (EXTRACT_SUBREG $src, sub1),
-                  (EXTRACT_SUBREG $src, sub2)),
-                   sub1),
-    (V_CUBEMA_F32 (EXTRACT_SUBREG $src, sub0),
-                  (EXTRACT_SUBREG $src, sub1),
-                  (EXTRACT_SUBREG $src, sub2)),
-                   sub2),
-    (V_CUBEID_F32 (EXTRACT_SUBREG $src, sub0),
-                  (EXTRACT_SUBREG $src, sub1),
-                  (EXTRACT_SUBREG $src, sub2)),
-                   sub3)
+  (REG_SEQUENCE VReg_128,
+    (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
+                  0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1),
+                  0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2),
+                  0 /* clamp */, 0 /* omod */), sub0,
+    (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
+                  0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
+                  0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2),
+                  0 /* clamp */, 0 /* omod */), sub1,
+    (V_CUBEMA_F32 0 /* src0_modifiers */,(EXTRACT_SUBREG $src, sub0),
+                  0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
+                  0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2),
+                  0 /* clamp */, 0 /* omod */), sub2,
+    (V_CUBEID_F32 0 /* src0_modifiers */,(EXTRACT_SUBREG $src, sub0),
+                  0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
+                  0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2),
+                  0 /* clamp */, 0 /* omod */), sub3)
 >;
 
 def : Pat <
@@ -2316,7 +2566,7 @@
 // Offset in an 32Bit VGPR
 def : Pat <
   (SIload_constant v4i32:$sbase, i32:$voff),
-  (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0)
+  (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0, 0)
 >;
 
 // The multiplication scales from [0,1] to the unsigned integer range
@@ -2330,7 +2580,7 @@
 def : Pat <
   (int_SI_tid),
   (V_MBCNT_HI_U32_B32_e32 0xffffffff,
-                          (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0))
+                          (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0))
 >;
 
 //===----------------------------------------------------------------------===//
@@ -2341,84 +2591,73 @@
 def : UMad24Pat<V_MAD_U32_U24>;
 
 def : Pat <
-  (fadd f64:$src0, f64:$src1),
-  (V_ADD_F64 $src0, $src1, (i64 0))
->;
-
-def : Pat <
-  (fmul f64:$src0, f64:$src1),
-  (V_MUL_F64 $src0, $src1, (i64 0))
->;
-
-def : Pat <
-  (mul i32:$src0, i32:$src1),
-  (V_MUL_LO_I32 $src0, $src1, (i32 0))
->;
-
-def : Pat <
   (mulhu i32:$src0, i32:$src1),
-  (V_MUL_HI_U32 $src0, $src1, (i32 0))
+  (V_MUL_HI_U32 $src0, $src1)
 >;
 
 def : Pat <
   (mulhs i32:$src0, i32:$src1),
-  (V_MUL_HI_I32 $src0, $src1, (i32 0))
+  (V_MUL_HI_I32 $src0, $src1)
 >;
 
-defm : BFIPatterns <V_BFI_B32, S_MOV_B32>;
+def : Vop3ModPat<V_MAD_F32, VOP_F32_F32_F32_F32, AMDGPUmad>;
+
+
+defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
 def : ROTRPattern <V_ALIGNBIT_B32>;
 
 /********** ======================= **********/
 /**********   Load/Store Patterns   **********/
 /********** ======================= **********/
 
-multiclass DSReadPat <DS inst, ValueType vt, PatFrag frag> {
-  def : Pat <
-    (vt (frag (add i32:$ptr, (i32 IMM16bit:$offset)))),
-    (inst (i1 0), $ptr, (as_i16imm $offset))
-  >;
+class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
+  (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
+  (inst (i1 0), $ptr, (as_i16imm $offset))
+>;
 
-  def : Pat <
-    (frag i32:$src0),
-    (vt (inst 0, $src0, 0))
-  >;
-}
+def : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
+def : DSReadPat <DS_READ_U8,  i32, az_extloadi8_local>;
+def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
+def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
+def : DSReadPat <DS_READ_B32, i32, local_load>;
 
-defm : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
-defm : DSReadPat <DS_READ_U8,  i32, az_extloadi8_local>;
-defm : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
-defm : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
-defm : DSReadPat <DS_READ_B32, i32, local_load>;
-defm : DSReadPat <DS_READ_B64, v2i32, local_load>;
+let AddedComplexity = 100 in {
 
-multiclass DSWritePat <DS inst, ValueType vt, PatFrag frag> {
-  def : Pat <
-    (frag vt:$value, (add i32:$ptr, (i32 IMM16bit:$offset))),
-    (inst (i1 0), $ptr, $value, (as_i16imm $offset))
-  >;
+def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>;
 
-  def : Pat <
-    (frag vt:$val, i32:$ptr),
-    (inst 0, $ptr, $val, 0)
-  >;
-}
+} // End AddedComplexity = 100
 
-defm : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
-defm : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
-defm : DSWritePat <DS_WRITE_B32, i32, local_store>;
-defm : DSWritePat <DS_WRITE_B64, v2i32, local_store>;
+def : Pat <
+  (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+                                                    i8:$offset1))),
+  (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1)
+>;
 
-multiclass DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> {
-  def : Pat <
-    (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$value),
-    (inst (i1 0), $ptr, $value, (as_i16imm $offset))
-  >;
+class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
+  (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
+  (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+>;
 
-  def : Pat <
-    (frag i32:$ptr, vt:$val),
-    (inst 0, $ptr, $val, 0)
-  >;
-}
+def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
+def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
+def : DSWritePat <DS_WRITE_B32, i32, local_store>;
+
+let AddedComplexity = 100 in {
+
+def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>;
+} // End AddedComplexity = 100
+
+def : Pat <
+  (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+                                                            i8:$offset1)),
+  (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0),
+                        (EXTRACT_SUBREG $value, sub1), $offset0, $offset1)
+>;
+
+class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat <
+  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
+  (inst (i1 0), $ptr, $value, (as_i16imm $offset))
+>;
 
 // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
 //
@@ -2430,69 +2669,56 @@
 // We also load this -1 with s_mov_b32 / s_mov_b64 even though this
 // needs to be a VGPR. The SGPR copy pass will fix this, and it's
 // easier since there is no v_mov_b64.
-multiclass DSAtomicIncRetPat<DS inst, ValueType vt,
-                             Instruction LoadImm, PatFrag frag> {
-  def : Pat <
-    (frag (add i32:$ptr, (i32 IMM16bit:$offset)), (vt 1)),
-    (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
-  >;
+class DSAtomicIncRetPat<DS inst, ValueType vt,
+                        Instruction LoadImm, PatFrag frag> : Pat <
+  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
+  (inst (i1 0), $ptr, (LoadImm (vt -1)), (as_i16imm $offset))
+>;
 
-  def : Pat <
-    (frag i32:$ptr, (vt 1)),
-    (inst 0, $ptr, (LoadImm (vt -1)), 0)
-  >;
-}
 
-multiclass DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> {
-  def : Pat <
-    (frag (add i32:$ptr, (i32 IMM16bit:$offset)), vt:$cmp, vt:$swap),
-    (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
-  >;
-
-  def : Pat <
-    (frag i32:$ptr, vt:$cmp, vt:$swap),
-    (inst 0, $ptr, $cmp, $swap, 0)
-  >;
-}
+class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
+  (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
+  (inst (i1 0), $ptr, $cmp, $swap, (as_i16imm $offset))
+>;
 
 
 // 32-bit atomics.
-defm : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
-                         S_MOV_B32, atomic_load_add_local>;
-defm : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
-                         S_MOV_B32, atomic_load_sub_local>;
+def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
+                        S_MOV_B32, atomic_load_add_local>;
+def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
+                        S_MOV_B32, atomic_load_sub_local>;
 
-defm : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
-defm : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
-defm : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
-defm : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
-defm : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
-defm : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
-defm : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
-defm : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
-defm : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
-defm : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
 
-defm : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
 
 // 64-bit atomics.
-defm : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
-                         S_MOV_B64, atomic_load_add_local>;
-defm : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
-                         S_MOV_B64, atomic_load_sub_local>;
+def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
+                        S_MOV_B64, atomic_load_add_local>;
+def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
+                        S_MOV_B64, atomic_load_sub_local>;
 
-defm : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
-defm : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
-defm : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
-defm : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
-defm : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
-defm : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
-defm : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
-defm : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
-defm : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
-defm : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
 
-defm : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
 
 
 //===----------------------------------------------------------------------===//
@@ -2502,43 +2728,50 @@
 multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
                               PatFrag constant_ld> {
   def : Pat <
-     (vt (constant_ld (add i64:$ptr, i64:$offset))),
-     (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+     (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i16:$offset))),
+     (Instr_ADDR64 $srsrc, $vaddr, $offset)
   >;
 }
 
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32,
-                          sextloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
-                          az_extloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32,
-                          sextloadi16_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32,
-                          az_extloadi16_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
-                          constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
-                          constant_load>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
-                          constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32, constant_load>;
+
+class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat <
+  (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
+                        i32:$soffset, u16imm:$offset))),
+  (Instr $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
+>;
+
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
 
 // BUFFER_LOAD_DWORD*, addr64=0
 multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxen,
                              MUBUF bothen> {
 
   def : Pat <
-    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset,
                                   imm:$offset, 0, 0, imm:$glc, imm:$slc,
                                   imm:$tfe)),
-    (offset $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc),
+    (offset $rsrc, (as_i16imm $offset), $soffset, (as_i1imm $glc),
             (as_i1imm $slc), (as_i1imm $tfe))
   >;
 
   def : Pat <
     (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
-                                  imm, 1, 0, imm:$glc, imm:$slc,
+                                  imm:$offset, 1, 0, imm:$glc, imm:$slc,
                                   imm:$tfe)),
-    (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
+    (offen $rsrc, $vaddr, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
            (as_i1imm $tfe))
   >;
 
@@ -2566,6 +2799,32 @@
 defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN,
                          BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>;
 
+class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
+  (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
+                               u16imm:$offset)),
+  (Instr $value, $srsrc, $vaddr, $soffset, $offset, 0, 0, 0)
+>;
+
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
+
+/*
+class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
+  (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)),
+  (Instr $value, $srsrc, $vaddr, $offset)
+>;
+
+def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>;
+def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>;
+
+*/
+
 //===----------------------------------------------------------------------===//
 // MTBUF Patterns
 //===----------------------------------------------------------------------===//
@@ -2590,28 +2849,39 @@
 let SubtargetPredicate = isCI in {
 
 // Sea island new arithmetic instructinos
-let neverHasSideEffects = 1 in {
-defm V_TRUNC_F64 : VOP1_64 <0x00000017, "V_TRUNC_F64",
-  [(set f64:$dst, (ftrunc f64:$src0))]
+defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64",
+  VOP_F64_F64, ftrunc
 >;
-defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64",
-  [(set f64:$dst, (fceil f64:$src0))]
+defm V_CEIL_F64 : VOP1Inst <vop1<0x18>, "v_ceil_f64",
+  VOP_F64_F64, fceil
 >;
-defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64",
-  [(set f64:$dst, (ffloor f64:$src0))]
+defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64",
+  VOP_F64_F64, ffloor
 >;
-defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64",
-  [(set f64:$dst, (frint f64:$src0))]
+defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64",
+  VOP_F64_F64, frint
 >;
 
-defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>;
-defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>;
-defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>;
-def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>;
+defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8",
+  VOP_I32_I32_I32
+>;
+defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8",
+  VOP_I32_I32_I32
+>;
+defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
+  VOP_I32_I32_I32
+>;
+
+let isCommutable = 1 in {
+defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
+  VOP_I64_I32_I32_I64
+>;
 
 // XXX - Does this set VCC?
-def V_MAD_I64_I32 : VOP3_64 <0x00000177, "V_MAD_I64_I32", []>;
-} // End neverHasSideEffects = 1
+defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
+  VOP_I64_I32_I32_I64
+>;
+} // End isCommutable = 1
 
 // Remaining instructions:
 // FLAT_*
@@ -2636,6 +2906,37 @@
 
 } // End iSCI
 
+//===----------------------------------------------------------------------===//
+// Flat Patterns
+//===----------------------------------------------------------------------===//
+
+class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt,
+                             PatFrag flat_ld> :
+  Pat <(vt (flat_ld i64:$ptr)),
+       (Instr_ADDR64 $ptr)
+>;
+
+def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>;
+def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>;
+
+class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> :
+  Pat <(st vt:$value, i64:$ptr),
+        (Instr $value, $ptr)
+  >;
+
+def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>;
+def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>;
+def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>;
+def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>;
+def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>;
+def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
 
 /********** ====================== **********/
 /**********   Indirect adressing   **********/
@@ -2685,44 +2986,37 @@
 def : Pat<(i32 (sext_inreg i32:$src, i1)),
   (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
 
-// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
-// might not be worth the effort, and will need to expand to shifts when
-// fixing SGPR copies.
-
 // Handle sext_inreg in i64
 def : Pat <
   (i64 (sext_inreg i64:$src, i1)),
-  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-    (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16
-    (S_MOV_B32 -1), sub1)
+  (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
 >;
 
 def : Pat <
   (i64 (sext_inreg i64:$src, i8)),
-  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-    (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
-    (S_MOV_B32 -1), sub1)
+  (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
 >;
 
 def : Pat <
   (i64 (sext_inreg i64:$src, i16)),
-  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-    (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
-    (S_MOV_B32 -1), sub1)
+  (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i32)),
+  (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
 >;
 
 class ZExt_i64_i32_Pat <SDNode ext> : Pat <
   (i64 (ext i32:$src)),
-  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
-    (S_MOV_B32 0), sub1)
+  (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1)
 >;
 
 class ZExt_i64_i1_Pat <SDNode ext> : Pat <
   (i64 (ext i1:$src)),
-  (INSERT_SUBREG
-    (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-      (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0),
-    (S_MOV_B32 0), sub1)
+    (REG_SEQUENCE VReg_64,
+      (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
+      (S_MOV_B32 0), sub1)
 >;
 
 
@@ -2733,17 +3027,14 @@
 
 def : Pat <
   (i64 (sext i32:$src)),
-    (INSERT_SUBREG
-      (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $src, sub0),
-      (S_ASHR_I32 $src, 31), sub1)
+    (REG_SEQUENCE SReg_64, $src, sub0,
+    (S_ASHR_I32 $src, 31), sub1)
 >;
 
 def : Pat <
   (i64 (sext i1:$src)),
-  (INSERT_SUBREG
-    (INSERT_SUBREG
-      (i64 (IMPLICIT_DEF)),
-      (V_CNDMASK_B32_e64 0, -1, $src), sub0),
+  (REG_SEQUENCE VReg_64,
+    (V_CNDMASK_B32_e64 0, -1, $src), sub0,
     (V_CNDMASK_B32_e64 0, -1, $src), sub1)
 >;
 
@@ -2778,20 +3069,20 @@
 
 def : Pat <
   (i1 (trunc i32:$a)),
-  (V_CMP_EQ_I32_e64 (V_AND_B32_e32 (i32 1), $a), 1)
+  (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1)
 >;
 
-// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector
-// case, the sgpr-copies pass will fix this to use the vector version.
 def : Pat <
-  (i32 (addc i32:$src0, i32:$src1)),
-  (S_ADD_I32 $src0, $src1)
+  (i32 (bswap i32:$a)),
+  (V_BFI_B32 (S_MOV_B32 0x00ff00ff),
+             (V_ALIGNBIT_B32 $a, $a, 24),
+             (V_ALIGNBIT_B32 $a, $a, 8))
 >;
 
 //============================================================================//
 // Miscellaneous Optimization Patterns
 //============================================================================//
 
-def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>;
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
 
 } // End isSI predicate

diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index df690a4..027a0a2 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td

@@ -54,14 +54,12 @@
 
   def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
 
-  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
-
   // Fully-flexible SAMPLE instruction.
   class SampleRaw : Intrinsic <
     [llvm_v4f32_ty],    // vdata(VGPR)
     [llvm_anyint_ty,    // vaddr(VGPR)
-     llvm_v32i8_ty,     // rsrc(SGPR)
-     llvm_v16i8_ty,     // sampler(SGPR)
+     llvm_v8i32_ty,     // rsrc(SGPR)
+     llvm_v4i32_ty,     // sampler(SGPR)
      llvm_i32_ty,       // dmask(imm)
      llvm_i32_ty,       // unorm(imm)
      llvm_i32_ty,       // r128(imm)
@@ -72,10 +70,68 @@
      llvm_i32_ty],      // lwe(imm)
     [IntrNoMem]>;
 
-  def int_SI_sample : Sample;
-  def int_SI_sampleb : Sample;
-  def int_SI_sampled : Sample;
-  def int_SI_samplel : Sample;
+  // Image instruction without a sampler.
+  class Image : Intrinsic <
+    [llvm_v4f32_ty],    // vdata(VGPR)
+    [llvm_anyint_ty,    // vaddr(VGPR)
+     llvm_v8i32_ty,     // rsrc(SGPR)
+     llvm_i32_ty,       // dmask(imm)
+     llvm_i32_ty,       // unorm(imm)
+     llvm_i32_ty,       // r128(imm)
+     llvm_i32_ty,       // da(imm)
+     llvm_i32_ty,       // glc(imm)
+     llvm_i32_ty,       // slc(imm)
+     llvm_i32_ty,       // tfe(imm)
+     llvm_i32_ty],      // lwe(imm)
+    [IntrNoMem]>;
+
+  // Basic sample
+  def int_SI_image_sample : SampleRaw;
+  def int_SI_image_sample_cl : SampleRaw;
+  def int_SI_image_sample_d : SampleRaw;
+  def int_SI_image_sample_d_cl : SampleRaw;
+  def int_SI_image_sample_l : SampleRaw;
+  def int_SI_image_sample_b : SampleRaw;
+  def int_SI_image_sample_b_cl : SampleRaw;
+  def int_SI_image_sample_lz : SampleRaw;
+  def int_SI_image_sample_cd : SampleRaw;
+  def int_SI_image_sample_cd_cl : SampleRaw;
+
+  // Sample with comparison
+  def int_SI_image_sample_c : SampleRaw;
+  def int_SI_image_sample_c_cl : SampleRaw;
+  def int_SI_image_sample_c_d : SampleRaw;
+  def int_SI_image_sample_c_d_cl : SampleRaw;
+  def int_SI_image_sample_c_l : SampleRaw;
+  def int_SI_image_sample_c_b : SampleRaw;
+  def int_SI_image_sample_c_b_cl : SampleRaw;
+  def int_SI_image_sample_c_lz : SampleRaw;
+  def int_SI_image_sample_c_cd : SampleRaw;
+  def int_SI_image_sample_c_cd_cl : SampleRaw;
+
+  // Sample with offsets
+  def int_SI_image_sample_o : SampleRaw;
+  def int_SI_image_sample_cl_o : SampleRaw;
+  def int_SI_image_sample_d_o : SampleRaw;
+  def int_SI_image_sample_d_cl_o : SampleRaw;
+  def int_SI_image_sample_l_o : SampleRaw;
+  def int_SI_image_sample_b_o : SampleRaw;
+  def int_SI_image_sample_b_cl_o : SampleRaw;
+  def int_SI_image_sample_lz_o : SampleRaw;
+  def int_SI_image_sample_cd_o : SampleRaw;
+  def int_SI_image_sample_cd_cl_o : SampleRaw;
+
+  // Sample with comparison and offsets
+  def int_SI_image_sample_c_o : SampleRaw;
+  def int_SI_image_sample_c_cl_o : SampleRaw;
+  def int_SI_image_sample_c_d_o : SampleRaw;
+  def int_SI_image_sample_c_d_cl_o : SampleRaw;
+  def int_SI_image_sample_c_l_o : SampleRaw;
+  def int_SI_image_sample_c_b_o : SampleRaw;
+  def int_SI_image_sample_c_b_cl_o : SampleRaw;
+  def int_SI_image_sample_c_lz_o : SampleRaw;
+  def int_SI_image_sample_c_cd_o : SampleRaw;
+  def int_SI_image_sample_c_cd_cl_o : SampleRaw;
 
   // Basic gather4
   def int_SI_gather4 : SampleRaw;
@@ -111,8 +167,19 @@
 
   def int_SI_getlod : SampleRaw;
 
-  def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  // Image intrinsics.
+  def int_SI_image_load : Image;
+  def int_SI_image_load_mip : Image;
+  def int_SI_getresinfo : Image;
 
+  // Deprecated image and sample intrinsics.
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_SI_sample : Sample;
+  def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
+  def int_SI_samplel : Sample;
+  def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
 
   /* Interpolation Intrinsics */

diff --git a/lib/Target/R600/SILoadStoreOptimizer.cpp b/lib/Target/R600/SILoadStoreOptimizer.cpp
new file mode 100644
index 0000000..4140196
--- /dev/null
+++ b/lib/Target/R600/SILoadStoreOptimizer.cpp

@@ -0,0 +1,417 @@
+//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to fuse DS instructions with close by immediate offsets.
+// This will fuse operations such as
+//  ds_read_b32 v0, v2 offset:16
+//  ds_read_b32 v1, v2 offset:32
+// ==>
+//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
+//
+//
+// Future improvements:
+//
+// - This currently relies on the scheduler to place loads and stores next to
+//   each other, and then only merges adjacent pairs of instructions. It would
+//   be good to be more flexible with interleaved instructions, and possibly run
+//   before scheduling. It currently misses stores of constants because loading
+//   the constant into the data register is placed between the stores, although
+//   this is arguably a scheduling problem.
+//
+// - Live interval recomputing seems inefficient. This currently only matches
+//   one pair, and recomputes live intervals and moves on to the next pair. It
+//   would be better to compute a list of all merges that need to occur.
+//
+// - With a list of instructions to process, we can also merge more. If a
+//   cluster of loads has offsets that are too large to fit in the 8-bit
+//   offset fields, but that are close enough to each other for their
+//   differences to fit in 8 bits, we can add to the base pointer and use the
+//   new reduced offsets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-load-store-opt"
+
+namespace {
+
+/// Machine function pass that fuses pairs of neighbouring DS read or DS write
+/// instructions that share a base address into a single read2 / write2
+/// instruction (or its st64 variant).
+class SILoadStoreOptimizer : public MachineFunctionPass {
+private:
+  const TargetMachine *TM;
+
+  // The members below are (re)assigned at the start of runOnMachineFunction().
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  LiveIntervals *LIS;
+
+  /// \returns true if the byte offsets \p Offset0 and \p Offset1 can both be
+  /// encoded in one merged instruction with elements of \p EltSize bytes.
+  static bool offsetsCanBeCombined(unsigned Offset0,
+                                   unsigned Offset1,
+                                   unsigned EltSize);
+
+  /// \returns the instruction following \p I if it can be merged with \p I,
+  /// otherwise the end iterator of the block containing \p I.
+  MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
+                                                 unsigned EltSize);
+
+  /// Rewrite every operand referring to \p SrcReg so that it refers to
+  /// subregister \p SubIdx of \p DstReg instead.
+  void updateRegDefsUses(unsigned SrcReg,
+                         unsigned DstReg,
+                         unsigned SubIdx);
+
+  /// Replace the reads \p I and \p Paired by one read2 instruction.
+  /// \returns an iterator to the new instruction.
+  MachineBasicBlock::iterator mergeRead2Pair(
+    MachineBasicBlock::iterator I,
+    MachineBasicBlock::iterator Paired,
+    unsigned EltSize);
+
+  /// Replace the writes \p I and \p Paired by one write2 instruction.
+  /// \returns an iterator to the new instruction.
+  MachineBasicBlock::iterator mergeWrite2Pair(
+    MachineBasicBlock::iterator I,
+    MachineBasicBlock::iterator Paired,
+    unsigned EltSize);
+
+public:
+  static char ID;
+
+  SILoadStoreOptimizer() :
+    MachineFunctionPass(ID),
+    TM(nullptr),
+    TII(nullptr),
+    TRI(nullptr),
+    MRI(nullptr),
+    LIS(nullptr) {
+
+  }
+
+  /// Construct the pass for \p TM_ and register it with the pass registry.
+  /// Every pointer member is given a defined value here, matching the default
+  /// constructor, so that none of them is ever read uninitialized.
+  SILoadStoreOptimizer(const TargetMachine &TM_) :
+    MachineFunctionPass(ID),
+    TM(&TM_),
+    TII(static_cast<const SIInstrInfo*>(TM->getSubtargetImpl()->getInstrInfo())),
+    TRI(nullptr),
+    MRI(nullptr),
+    LIS(nullptr) {
+    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Merge all matching pairs in \p MBB. \returns true if anything changed.
+  bool optimizeBlock(MachineBasicBlock &MBB);
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Load / Store Optimizer";
+  }
+
+  /// Requires LiveIntervals, and declares the CFG, SlotIndexes, LiveIntervals
+  /// and LiveVariables as preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreserved<LiveVariables>();
+    AU.addRequired<LiveIntervals>();
+
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+// Register the pass under DEBUG_TYPE ("si-load-store-opt") and list the
+// analyses whose initialization it depends on.
+INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
+                      "SI Load / Store Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
+                    "SI Load / Store Optimizer", false, false)
+
+// Pass identification object, handed to MachineFunctionPass by the
+// constructors.
+char SILoadStoreOptimizer::ID = 0;
+
+// Exposes the pass ID under the llvm namespace so code outside this file can
+// refer to the pass.
+char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
+
+/// Factory function: \returns a newly allocated SILoadStoreOptimizer for
+/// \p TM. The caller receives ownership of the raw pointer.
+FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
+  return new SILoadStoreOptimizer(TM);
+}
+
+/// Decide whether two byte offsets can be encoded together in one merged
+/// instruction whose elements are \p Size bytes wide.
+///
+/// The offsets must differ and each must be a multiple of \p Size. They are
+/// accepted when both element offsets fit in 8 bits, or when both element
+/// offsets are multiples of 64 and fit in 8 bits after division by 64 (the
+/// stride 64 forms).
+bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
+                                                unsigned Offset1,
+                                                unsigned Size) {
+  // XXX - Would the same offset be OK? Is there any reason this would happen or
+  // be useful?
+  if (Offset0 == Offset1)
+    return false;
+
+  // Offsets that are not a whole number of elements cannot be encoded.
+  if (Offset0 % Size != 0)
+    return false;
+  if (Offset1 % Size != 0)
+    return false;
+
+  const unsigned Elt0 = Offset0 / Size;
+  const unsigned Elt1 = Offset1 / Size;
+
+  // Plain form: both element offsets fit in the reduced 8-bit range.
+  const bool FitsPlain = isUInt<8>(Elt0) && isUInt<8>(Elt1);
+  if (FitsPlain)
+    return true;
+
+  // Stride 64 form: both element offsets must be multiples of 64 whose
+  // quotients fit in 8 bits.
+  const bool BothStride64 = (Elt0 % 64 == 0) && (Elt1 % 64 == 0);
+  return BothStride64 && isUInt<8>(Elt0 / 64) && isUInt<8>(Elt1 / 64);
+}
+
+/// Examine the instruction immediately following \p I and decide whether it
+/// can be merged with \p I.
+///
+/// \param I       the first DS instruction of the candidate pair.
+/// \param EltSize element size in bytes used to validate the offsets.
+/// \returns an iterator to the following instruction when it has the same
+/// opcode as \p I, has no ordered memory reference, uses the same address
+/// register and subregister, and its offset can be combined with the offset
+/// of \p I; otherwise the end iterator of the block containing \p I.
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
+                                         unsigned EltSize){
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator MBBI = I;
+  ++MBBI;
+
+  // I may be the last instruction of its block. There is then nothing to pair
+  // it with, and the end iterator must not be dereferenced.
+  if (MBBI == E)
+    return E;
+
+  if (MBBI->getOpcode() != I->getOpcode())
+    return E;
+
+  // Don't merge volatiles.
+  if (MBBI->hasOrderedMemoryRef())
+    return E;
+
+  int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+  const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+  const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+  // Check same base pointer. Be careful of subregisters, which can occur with
+  // vectors of pointers.
+  if (AddrReg0.getReg() == AddrReg1.getReg() &&
+      AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+    int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+                                               AMDGPU::OpName::offset);
+    // Only the low 16 bits of the offset immediates are considered.
+    unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+    unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+    // Check both offsets fit in the reduced range.
+    if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
+      return MBBI;
+  }
+
+  return E;
+}
+
+/// Redirect every operand that refers to \p SrcReg so that it refers to
+/// subregister \p SubIdx of \p DstReg.
+void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg,
+                                             unsigned DstReg,
+                                             unsigned SubIdx) {
+  MachineRegisterInfo::reg_iterator It = MRI->reg_begin(SrcReg);
+  MachineRegisterInfo::reg_iterator End = MRI->reg_end();
+
+  while (It != End) {
+    MachineOperand &Operand = *It;
+    // Step to the next operand first: the substitution below changes which
+    // register Operand refers to.
+    ++It;
+    Operand.substVirtReg(DstReg, SubIdx, *TRI);
+  }
+}
+
+/// Replace the two DS reads \p I and \p Paired by a single read2 instruction
+/// inserted before \p I.
+///
+/// A new virtual register wide enough for both results is created, and every
+/// operand that referred to one of the old destination registers is rewritten
+/// to the matching subregister of it. Both original instructions are erased.
+///
+/// \param EltSize size in bytes of one loaded element; 4 selects the B32
+///                opcodes, any other value the B64 opcodes.
+/// \returns an iterator to the newly inserted instruction.
+MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
+  MachineBasicBlock::iterator I,
+  MachineBasicBlock::iterator Paired,
+  unsigned EltSize) {
+  MachineBasicBlock *MBB = I->getParent();
+
+  // Be careful, since the addresses could be subregisters themselves in weird
+  // cases, like vectors of pointers.
+  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+
+  // AddrReg points into the operand list of I, so remember the register number
+  // now; the operand must not be read after I has been erased below.
+  const unsigned AddrRegNo = AddrReg->getReg();
+
+  unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
+  unsigned DestReg1
+    = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg();
+
+  unsigned Offset0
+          = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+  unsigned Offset1
+    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
+
+  // The merged instruction takes its offsets in units of elements, not bytes.
+  unsigned NewOffset0 = Offset0 / EltSize;
+  unsigned NewOffset1 = Offset1 / EltSize;
+  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
+
+  // Prefer the st64 form if we can use it, even if we can fit the offset in the
+  // non st64 version. I'm not sure if there's any real reason to do this.
+  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
+  if (UseST64) {
+    NewOffset0 /= 64;
+    NewOffset1 /= 64;
+    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
+  }
+
+  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
+         (NewOffset0 != NewOffset1) &&
+         "Computed offset doesn't fit");
+
+  const MCInstrDesc &Read2Desc = TII->get(Opc);
+
+  // The combined result lives in a VReg_64 for 4-byte elements and in a
+  // VReg_128 otherwise.
+  const TargetRegisterClass *SuperRC
+    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+
+  DebugLoc DL = I->getDebugLoc();
+  MachineInstrBuilder Read2
+    = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
+    .addImm(0) // gds
+    .addOperand(*AddrReg) // addr
+    .addImm(NewOffset0) // offset0
+    .addImm(NewOffset1) // offset1
+    .addMemOperand(*I->memoperands_begin())
+    .addMemOperand(*Paired->memoperands_begin());
+
+  LIS->InsertMachineInstrInMaps(Read2);
+
+  // Point all operands of the old destination registers at the halves of the
+  // new wide register.
+  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+  updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
+  updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);
+
+  LIS->RemoveMachineInstrFromMaps(I);
+  LIS->RemoveMachineInstrFromMaps(Paired);
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  // I and Paired are gone; use the register number saved above.
+  LiveInterval &AddrRegLI = LIS->getInterval(AddrRegNo);
+  LIS->shrinkToUses(&AddrRegLI);
+
+  LIS->getInterval(DestReg); // Create new LI
+
+  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
+  return Read2.getInstr();
+}
+
+/// Replace the two DS writes \p I and \p Paired by a single write2 instruction
+/// inserted before \p I, then erase both original instructions.
+///
+/// \param EltSize size in bytes of one stored element; 4 selects the B32
+///                opcodes, any other value the B64 opcodes.
+/// \returns an iterator to the newly inserted instruction.
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
+  MachineBasicBlock::iterator I,
+  MachineBasicBlock::iterator Paired,
+  unsigned EltSize) {
+  MachineBasicBlock *MBB = I->getParent();
+
+  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
+  // sure we preserve the subregister index and any register flags set on them.
+  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
+  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
+  const MachineOperand *Data1
+    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
+
+
+  // Only the low 16 bits of the offset immediates are considered.
+  unsigned Offset0
+    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
+  unsigned Offset1
+    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
+
+  // The merged instruction takes its offsets in units of elements, not bytes.
+  unsigned NewOffset0 = Offset0 / EltSize;
+  unsigned NewOffset1 = Offset1 / EltSize;
+  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
+
+  // Prefer the st64 form if we can use it, even if we can fit the offset in the
+  // non st64 version. I'm not sure if there's any real reason to do this.
+  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
+  if (UseST64) {
+    NewOffset0 /= 64;
+    NewOffset1 /= 64;
+    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
+  }
+
+  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
+         (NewOffset0 != NewOffset1) &&
+         "Computed offset doesn't fit");
+
+  const MCInstrDesc &Write2Desc = TII->get(Opc);
+  DebugLoc DL = I->getDebugLoc();
+
+  MachineInstrBuilder Write2
+    = BuildMI(*MBB, I, DL, Write2Desc)
+    .addImm(0) // gds
+    .addOperand(*Addr) // addr
+    .addOperand(*Data0) // data0
+    .addOperand(*Data1) // data1
+    .addImm(NewOffset0) // offset0
+    .addImm(NewOffset1) // offset1
+    .addMemOperand(*I->memoperands_begin())
+    .addMemOperand(*Paired->memoperands_begin());
+
+  // Snapshot the register numbers before the instructions owning the Data0,
+  // Data1 and Addr operands are erased.
+  // XXX - How do we express subregisters here?
+  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+
+  LIS->RemoveMachineInstrFromMaps(I);
+  LIS->RemoveMachineInstrFromMaps(Paired);
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  // Have LiveIntervals repair the intervals of the snapshotted registers
+  // around the new instruction.
+  LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
+
+  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
+  return Write2.getInstr();
+}
+
+// Walk the block looking for adjacent LDS reads or writes that use constant
+// offsets from the same base register, and merge each such pair. The scheduler
+// is relied upon to have clustered nearby accesses, so only the directly
+// following instruction is ever considered as a partner.
+//
+// Returns true if at least one pair was merged.
+bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  MachineBasicBlock::iterator It = MBB.begin();
+  MachineBasicBlock::iterator End = MBB.end();
+
+  while (It != End) {
+    // Don't combine if volatile.
+    if (It->hasOrderedMemoryRef()) {
+      ++It;
+      continue;
+    }
+
+    const unsigned Opcode = It->getOpcode();
+    const bool IsRead =
+      Opcode == AMDGPU::DS_READ_B32 || Opcode == AMDGPU::DS_READ_B64;
+    const bool IsWrite =
+      Opcode == AMDGPU::DS_WRITE_B32 || Opcode == AMDGPU::DS_WRITE_B64;
+
+    // Anything that is not a plain DS read or write is skipped.
+    if (!IsRead && !IsWrite) {
+      ++It;
+      continue;
+    }
+
+    const bool Is64Bit =
+      Opcode == AMDGPU::DS_READ_B64 || Opcode == AMDGPU::DS_WRITE_B64;
+    const unsigned EltSize = Is64Bit ? 8 : 4;
+
+    MachineBasicBlock::iterator Partner = findMatchingDSInst(It, EltSize);
+    if (Partner == End) {
+      ++It;
+      continue;
+    }
+
+    // Continue scanning from the merged instruction.
+    Changed = true;
+    if (IsRead)
+      It = mergeRead2Pair(It, Partner, EltSize);
+    else
+      It = mergeWrite2Pair(It, Partner, EltSize);
+  }
+
+  return Changed;
+}
+
+/// Pass entry point: cache the subtarget's register and instruction info, the
+/// function's register info and the LiveIntervals analysis, then run
+/// optimizeBlock() on every basic block of \p MF.
+///
+/// \returns true if any block was modified.
+bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  const TargetSubtargetInfo *Subtarget = MF.getTarget().getSubtargetImpl();
+  TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
+  TII = static_cast<const SIInstrInfo*>(Subtarget->getInstrInfo());
+  MRI = &MF.getRegInfo();
+
+  LIS = &getAnalysis<LiveIntervals>();
+
+  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
+
+  // The function is expected to have left SSA form already.
+  assert(!MRI->isSSA());
+
+  bool AnyChange = false;
+  for (MachineBasicBlock &Block : MF) {
+    // Every block is visited, even after an earlier block has changed.
+    if (optimizeBlock(Block))
+      AnyChange = true;
+  }
+
+  return AnyChange;
+}

diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 9f5ff29..9702565 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp

@@ -49,8 +49,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -147,7 +149,7 @@
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
 
-  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType !=
+  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
       ShaderType::PIXEL ||
       !shouldSkip(&MBB, &MBB.getParent()->back()))
     return;
@@ -298,11 +300,13 @@
   DebugLoc DL = MI.getDebugLoc();
   const MachineOperand &Op = MI.getOperand(0);
 
-  // Kill is only allowed in pixel / geometry shaders
-  assert(MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
-         ShaderType::PIXEL ||
-         MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
-         ShaderType::GEOMETRY);
+#ifndef NDEBUG
+  const SIMachineFunctionInfo *MFI
+    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+  // Kill is only allowed in pixel / geometry shaders.
+  assert(MFI->getShaderType() == ShaderType::PIXEL ||
+         MFI->getShaderType() == ShaderType::GEOMETRY);
+#endif
 
   // Clear this thread from the exec mask if the operand is negative
   if ((Op.isImm() || Op.isFPImm())) {
@@ -440,13 +444,15 @@
 }
 
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
-  TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TRI =
+      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   bool HaveKill = false;
   bool NeedM0 = false;
   bool NeedWQM = false;
+  bool NeedFlat = false;
   unsigned Depth = 0;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -463,6 +469,12 @@
         NeedWQM = true;
       }
 
+      // Flat uses m0 in case it needs to access LDS.
+      if (TII->isFLAT(MI.getOpcode())) {
+        NeedM0 = true;
+        NeedFlat = true;
+      }
+
       switch (MI.getOpcode()) {
         default: break;
         case AMDGPU::SI_IF:
@@ -528,7 +540,6 @@
         case AMDGPU::V_INTERP_MOV_F32:
           NeedWQM = true;
           break;
-
       }
     }
   }
@@ -540,11 +551,50 @@
     InitM0ForLDS(MBB.getFirstNonPHI());
   }
 
-  if (NeedWQM && MFI->ShaderType == ShaderType::PIXEL) {
+  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
     MachineBasicBlock &MBB = MF.front();
     BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
             AMDGPU::EXEC).addReg(AMDGPU::EXEC);
   }
 
+  // FIXME: This seems inappropriate to do here.
+  if (NeedFlat && MFI->IsKernel) {
+    // Insert the prologue initializing the SGPRs pointing to the scratch space
+    // for flat accesses.
+    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+    // TODO: What to use with function calls?
+
+    // FIXME: This is reporting stack size that is used in a scratch buffer
+    // rather than registers as well.
+    uint64_t StackSizeBytes = FrameInfo->getStackSize();
+
+    int IndirectBegin
+      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
+    // Convert register index to 256-byte unit.
+    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
+
+    assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
+           "Stack limits should be smaller than 16-bits");
+
+    // Initialize the flat scratch register pair.
+    // TODO: Can we use one s_mov_b64 here?
+
+    // Offset is in units of 256-bytes.
+    MachineBasicBlock &MBB = MF.front();
+    DebugLoc NoDL;
+    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
+    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
+
+    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
+
+    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
+      .addImm(StackOffset);
+
+    // Documentation says size is "per-thread scratch size in bytes"
+    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
+      .addImm(StackSizeBytes);
+  }
+
   return true;
 }

diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp
index 738c90b..65b892c 100644
--- a/lib/Target/R600/SILowerI1Copies.cpp
+++ b/lib/Target/R600/SILowerI1Copies.cpp

@@ -15,6 +15,7 @@
 
 #define DEBUG_TYPE "si-i1-copies"
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -39,14 +40,14 @@
     initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF) override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  virtual const char *getPassName() const override {
-    return "SI Lower il Copies";
+  const char *getPassName() const override {
+    return "SI Lower i1 Copies";
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
-  AU.addRequired<MachineDominatorTree>();
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -55,10 +56,10 @@
 } // End anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
-                      "SI Lower il Copies", false, false)
+                      "SI Lower i1 Copies", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
-                    "SI Lower il Copies", false, false)
+                    "SI Lower i1 Copies", false, false)
 
 char SILowerI1Copies::ID = 0;
 
@@ -70,9 +71,9 @@
 
 bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
-      MF.getTarget().getInstrInfo());
-  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   std::vector<unsigned> I1Defs;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
@@ -102,6 +103,20 @@
         continue;
       }
 
+      if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32));
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
+        unsigned Reg = MI.getOperand(0).getReg();
+        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+        if (RC == &AMDGPU::VReg_1RegClass)
+          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
+        continue;
+      }
+
       if (MI.getOpcode() != AMDGPU::COPY ||
           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
@@ -120,21 +135,13 @@
                 .addOperand(MI.getOperand(0))
                 .addImm(0)
                 .addImm(-1)
-                .addOperand(MI.getOperand(1))
-                .addImm(0)
-                .addImm(0)
-                .addImm(0)
-                .addImm(0);
+                .addOperand(MI.getOperand(1));
         MI.eraseFromParent();
       } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                  SrcRC == &AMDGPU::VReg_1RegClass) {
         BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
                 .addOperand(MI.getOperand(0))
-                .addImm(0)
                 .addOperand(MI.getOperand(1))
-                .addImm(0)
-                .addImm(0)
-                .addImm(0)
                 .addImm(0);
         MI.eraseFromParent();
       }

diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index e2df950..d58f31d 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp

@@ -10,8 +10,10 @@
 
 
 #include "SIMachineFunctionInfo.h"
+#include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
@@ -26,71 +28,49 @@
 
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
+    TIDReg(AMDGPU::NoRegister),
     PSInputAddr(0),
-    SpillTracker() { }
+    NumUserSGPRs(0),
+    LDSWaveSpillSize(0) { }
 
-static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
-  unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
+                                                       MachineFunction *MF,
+                                                       unsigned FrameIndex,
+                                                       unsigned SubIdx) {
+  const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(
+      MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  int64_t Offset = FrameInfo->getObjectOffset(FrameIndex);
+  Offset += SubIdx * 4;
 
-  // We need to add this register as live out for the function, in order to
-  // have the live range calculated directly.
-  //
-  // When register spilling begins, we have already calculated the live
-  // live intervals for all the registers.  Since we are spilling SGPRs to
-  // VGPRs, we need to update the Lane VGPR's live interval every time we
-  // spill or restore a register.
-  //
-  // Unfortunately, there is no good way to update the live interval as
-  // the TargetInstrInfo callbacks for spilling and restoring don't give
-  // us access to the live interval information.
-  //
-  // We are lucky, though, because the InlineSpiller calls
-  // LiveRangeEdit::calculateRegClassAndHint() which iterates through
-  // all the new register that have been created when restoring a register
-  // and calls LiveIntervals::getInterval(), which creates and computes
-  // the live interval for the newly created register.  However, once this
-  // live intervals is created, it doesn't change and since we usually reuse
-  // the Lane VGPR multiple times, this means any uses after the first aren't
-  // added to the live interval.
-  //
-  // To work around this, we add Lane VGPRs to the functions live out list,
-  // so that we can guarantee its live range will cover all of its uses.
+  unsigned LaneVGPRIdx = Offset / (64 * 4);
+  unsigned Lane = (Offset / 4) % 64;
 
-  for (MachineBasicBlock &MBB : *MF) {
-    if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) {
-      MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true));
-      return VGPR;
+  struct SpilledReg Spill;
+
+  if (!LaneVGPRs.count(LaneVGPRIdx)) {
+    unsigned LaneVGPR = TRI->findUnusedVGPR(MRI);
+    LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
+    MRI.setPhysRegUsed(LaneVGPR);
+
+    // Add this register as live-in to all blocks to avoid machine verifier
+    // complaining about use of an undefined physical register.
+    for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
+         BI != BE; ++BI) {
+      BI->addLiveIn(LaneVGPR);
     }
   }
 
-  LLVMContext &Ctx = MF->getFunction()->getContext();
-  Ctx.emitError("Could not find S_ENDPGM instruction.");
-
-  return VGPR;
+  Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
+  Spill.Lane = Lane;
+  return Spill;
 }
 
-unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes(
-    MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) {
-  unsigned StartLane = CurrentLane;
-  CurrentLane += NumRegs;
-  if (!LaneVGPR) {
-    LaneVGPR = createLaneVGPR(MRI, MF);
-  } else {
-    if (CurrentLane >= MAX_LANES) {
-      StartLane = CurrentLane = 0;
-      LaneVGPR = createLaneVGPR(MRI, MF);
-    }
-  }
-  return StartLane;
-}
-
-void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex,
-                                                           unsigned Reg,
-                                                           int Lane) {
-  SpilledRegisters[FrameIndex] = SpilledReg(Reg, Lane);
-}
-
-const SIMachineFunctionInfo::SpilledReg&
-SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) {
-  return SpilledRegisters[FrameIndex];
+/// \returns 256 when the shader type is ShaderType::COMPUTE, and the wavefront
+/// size reported by the AMDGPU subtarget of \p MF for every other shader type.
+unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
+                                              const MachineFunction &MF) const {
+  const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>();
+  // FIXME: We should get this information from kernel attributes if it
+  // is available.
+  return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize();
 }

diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 96e619b..6bb8f9d 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h

@@ -12,10 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 
-#ifndef SIMACHINEFUNCTIONINFO_H_
-#define SIMACHINEFUNCTIONINFO_H_
+#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H
 
 #include "AMDGPUMachineFunction.h"
+#include "SIRegisterInfo.h"
 #include <map>
 
 namespace llvm {
@@ -26,6 +27,9 @@
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo : public AMDGPUMachineFunction {
   void anchor() override;
+
+  unsigned TIDReg;
+
 public:
 
   struct SpilledReg {
@@ -36,32 +40,23 @@
     bool hasLane() { return Lane != -1;}
   };
 
-  struct RegSpillTracker {
-  private:
-    unsigned CurrentLane;
-    std::map<unsigned, SpilledReg> SpilledRegisters;
-  public:
-    unsigned LaneVGPR;
-    RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { }
-    /// \p NumRegs The number of consecutive registers what need to be spilled.
-    ///            This function will ensure that all registers are stored in
-    ///            the same VGPR.
-    /// \returns The lane to be used for storing the first register.
-    unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF,
-                          unsigned NumRegs = 1);
-    void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1);
-    const SpilledReg& getSpilledReg(unsigned FrameIndex);
-    bool programSpillsRegisters() { return !SpilledRegisters.empty(); }
-  };
-
   // SIMachineFunctionInfo definition
 
   SIMachineFunctionInfo(const MachineFunction &MF);
+  SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
+                           unsigned SubIdx);
   unsigned PSInputAddr;
-  struct RegSpillTracker SpillTracker;
+  unsigned NumUserSGPRs;
+  std::map<unsigned, unsigned> LaneVGPRs;
+  unsigned LDSWaveSpillSize;
+  /// \returns true if TIDReg holds anything other than AMDGPU::NoRegister.
+  bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }
+  /// \returns the register currently stored in TIDReg.
+  unsigned getTIDReg() const { return TIDReg; }
+  /// Store \p Reg in TIDReg.
+  void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+
+  unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
 };
 
 } // End namespace llvm
 
 
-#endif //_SIMACHINEFUNCTIONINFO_H_
+#endif

diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index d0b677a..cffea12 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp

@@ -16,6 +16,12 @@
 #include "SIRegisterInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
 
 using namespace llvm;
 
@@ -26,9 +32,19 @@
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   Reserved.set(AMDGPU::EXEC);
+
+  // EXEC_LO and EXEC_HI could be allocated and used as regular register,
+  // but this seems likely to result in bugs, so I'm marking them as reserved.
+  Reserved.set(AMDGPU::EXEC_LO);
+  Reserved.set(AMDGPU::EXEC_HI);
+
   Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
-  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
-  TII->reserveIndirectRegisters(Reserved, MF);
+  Reserved.set(AMDGPU::FLAT_SCR);
+
+  // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs
+  Reserved.set(AMDGPU::VGPR255);
+  Reserved.set(AMDGPU::VGPR254);
+
   return Reserved;
 }
 
@@ -37,6 +53,213 @@
   return RC->getNumRegs();
 }
 
+bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
+  return Fn.getFrameInfo()->hasStackObjects();
+}
+
+static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+  // Number of 32-bit sub-registers moved by spill pseudo Op (width / 32).
+  switch (Op) {
+  case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S512_RESTORE:
+  case AMDGPU::SI_SPILL_V512_SAVE:
+  case AMDGPU::SI_SPILL_V512_RESTORE:
+    return 16;
+  case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V256_RESTORE:
+    return 8;
+  case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S128_RESTORE:
+  case AMDGPU::SI_SPILL_V128_SAVE:
+  case AMDGPU::SI_SPILL_V128_RESTORE:
+    return 4;
+  case AMDGPU::SI_SPILL_V96_SAVE:
+  case AMDGPU::SI_SPILL_V96_RESTORE:
+    return 3;
+  case AMDGPU::SI_SPILL_S64_SAVE:
+  case AMDGPU::SI_SPILL_S64_RESTORE:
+  case AMDGPU::SI_SPILL_V64_SAVE:
+  case AMDGPU::SI_SPILL_V64_RESTORE:
+    return 2;
+  case AMDGPU::SI_SPILL_S32_SAVE:
+  case AMDGPU::SI_SPILL_S32_RESTORE:
+  case AMDGPU::SI_SPILL_V32_SAVE:
+  case AMDGPU::SI_SPILL_V32_RESTORE:
+    return 1;
+  default: llvm_unreachable("Invalid spill opcode");
+  }
+}
+
+/// Lower the frame index operand \p FIOperandNum of \p MI.  SI_SPILL_*
+/// pseudos are expanded (SGPRs to VGPR lanes, VGPRs to LDS) and erased.
+void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                        int SPAdj, unsigned FIOperandNum,
+                                        RegScavenger *RS) const {
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineBasicBlock *MBB = MI->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
+  DebugLoc DL = MI->getDebugLoc();
+  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
+  int Index = FIOp.getIndex();
+
+  switch (MI->getOpcode()) {
+    // SGPR register spill
+    case AMDGPU::SI_SPILL_S512_SAVE:
+    case AMDGPU::SI_SPILL_S256_SAVE:
+    case AMDGPU::SI_SPILL_S128_SAVE:
+    case AMDGPU::SI_SPILL_S64_SAVE:
+    case AMDGPU::SI_SPILL_S32_SAVE: {
+      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+
+      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
+                                           &AMDGPU::SGPR_32RegClass, i);
+        struct SIMachineFunctionInfo::SpilledReg Spill =
+            MFI->getSpilledReg(MF, Index, i);
+
+        if (Spill.VGPR == AMDGPU::NoRegister) {
+           LLVMContext &Ctx = MF->getFunction()->getContext();
+           Ctx.emitError("Ran out of VGPRs for spilling SGPR");
+        }
+
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
+                .addReg(SubReg)
+                .addImm(Spill.Lane);
+      }
+      MI->eraseFromParent();
+      break;
+    }
+
+    // SGPR register restore
+    case AMDGPU::SI_SPILL_S512_RESTORE:
+    case AMDGPU::SI_SPILL_S256_RESTORE:
+    case AMDGPU::SI_SPILL_S128_RESTORE:
+    case AMDGPU::SI_SPILL_S64_RESTORE:
+    case AMDGPU::SI_SPILL_S32_RESTORE: {
+      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+
+      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
+                                           &AMDGPU::SGPR_32RegClass, i);
+        bool isM0 = SubReg == AMDGPU::M0;
+        struct SIMachineFunctionInfo::SpilledReg Spill =
+            MFI->getSpilledReg(MF, Index, i);
+
+        if (Spill.VGPR == AMDGPU::NoRegister) {
+           LLVMContext &Ctx = MF->getFunction()->getContext();
+           Ctx.emitError("Ran out of VGPRs for spilling SGPR");
+        }
+
+        if (isM0) { // Read into a scratch SGPR, then copy it to M0 below.
+          SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
+        }
+
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
+                .addReg(Spill.VGPR)
+                .addImm(Spill.Lane);
+        if (isM0) {
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+                  .addReg(SubReg);
+        }
+      }
+      TII->insertNOPs(MI, 3);
+      MI->eraseFromParent();
+      break;
+    }
+
+    // VGPR register spill
+    case AMDGPU::SI_SPILL_V512_SAVE:
+    case AMDGPU::SI_SPILL_V256_SAVE:
+    case AMDGPU::SI_SPILL_V128_SAVE:
+    case AMDGPU::SI_SPILL_V96_SAVE:
+    case AMDGPU::SI_SPILL_V64_SAVE:
+    case AMDGPU::SI_SPILL_V32_SAVE: {
+      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+      unsigned SrcReg = MI->getOperand(0).getReg();
+      int64_t Offset = FrameInfo->getObjectOffset(Index);
+      unsigned Size = NumSubRegs * 4;
+      unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+
+      for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+        unsigned SubReg = NumSubRegs > 1 ?
+            getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) :
+            SrcReg;
+        const int64_t SubOffset = Offset + int64_t(i) * 4; // i-th dword
+        MFI->LDSWaveSpillSize = std::max((unsigned)SubOffset + 4, (unsigned)MFI->LDSWaveSpillSize);
+
+        unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
+                                                         SubOffset, Size);
+        if (AddrReg == AMDGPU::NoRegister) {
+           LLVMContext &Ctx = MF->getFunction()->getContext();
+           Ctx.emitError("Ran out of VGPRs for spilling VGPRS");
+           AddrReg = AMDGPU::VGPR0;
+        }
+
+        // Store the value in LDS
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_B32))
+                .addImm(0) // gds
+                .addReg(AddrReg, RegState::Kill) // addr
+                .addReg(SubReg) // data0
+                .addImm(0); // offset
+      }
+
+      MI->eraseFromParent();
+      break;
+    }
+    // VGPR register restore
+    case AMDGPU::SI_SPILL_V32_RESTORE:
+    case AMDGPU::SI_SPILL_V64_RESTORE:
+    case AMDGPU::SI_SPILL_V96_RESTORE:
+    case AMDGPU::SI_SPILL_V128_RESTORE:
+    case AMDGPU::SI_SPILL_V256_RESTORE:
+    case AMDGPU::SI_SPILL_V512_RESTORE: {
+      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+      unsigned DstReg = MI->getOperand(0).getReg();
+      int64_t Offset = FrameInfo->getObjectOffset(Index);
+      unsigned Size = NumSubRegs * 4;
+      unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+
+      // FIXME: We could use DS_READ_B64 here to optimize for larger registers.
+      for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
+        unsigned SubReg = NumSubRegs > 1 ?
+            getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) :
+            DstReg;
+        const int64_t SubOffset = Offset + int64_t(i) * 4; // i-th dword
+        unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
+                                                         SubOffset, Size);
+        if (AddrReg == AMDGPU::NoRegister) {
+           LLVMContext &Ctx = MF->getFunction()->getContext();
+           Ctx.emitError("Ran out of VGPRs for spilling VGPRs");
+           AddrReg = AMDGPU::VGPR0;
+        }
+
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_READ_B32), SubReg)
+                .addImm(0) // gds
+                .addReg(AddrReg, RegState::Kill) // addr
+                .addImm(0); //offset
+      }
+      MI->eraseFromParent();
+      break;
+    }
+
+    default: { // Not a spill pseudo: fold the object offset into the operand.
+      int64_t Offset = FrameInfo->getObjectOffset(Index);
+      FIOp.ChangeToImmediate(Offset);
+      if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
+        unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI, SPAdj);
+        BuildMI(*MBB, MI, MI->getDebugLoc(),
+                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+                .addImm(Offset);
+        FIOp.ChangeToRegister(TmpReg, false);
+      }
+    }
+  }
+}
+
 const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
                                                                    MVT VT) const {
   switch(VT.SimpleTy) {
@@ -52,13 +275,17 @@
 const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
   assert(!TargetRegisterInfo::isVirtualRegister(Reg));
 
-  const TargetRegisterClass *BaseClasses[] = {
+  static const TargetRegisterClass *BaseClasses[] = {
     &AMDGPU::VReg_32RegClass,
     &AMDGPU::SReg_32RegClass,
     &AMDGPU::VReg_64RegClass,
     &AMDGPU::SReg_64RegClass,
+    &AMDGPU::VReg_96RegClass,
+    &AMDGPU::VReg_128RegClass,
     &AMDGPU::SReg_128RegClass,
-    &AMDGPU::SReg_256RegClass
+    &AMDGPU::VReg_256RegClass,
+    &AMDGPU::SReg_256RegClass,
+    &AMDGPU::VReg_512RegClass
   };
 
   for (const TargetRegisterClass *BaseClass : BaseClasses) {
@@ -69,13 +296,6 @@
   return nullptr;
 }
 
-bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const {
-  if (!RC) {
-    return false;
-  }
-  return !hasVGPRs(RC);
-}
-
 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
   return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) ||
          getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
@@ -122,11 +342,53 @@
 unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
                                           const TargetRegisterClass *SubRC,
                                           unsigned Channel) const {
+
+  switch (Reg) {
+    case AMDGPU::VCC:
+      switch(Channel) {
+        case 0: return AMDGPU::VCC_LO;
+        case 1: return AMDGPU::VCC_HI;
+        default: llvm_unreachable("Invalid SubIdx for VCC");
+      }
+
+  case AMDGPU::FLAT_SCR:
+    switch (Channel) {
+    case 0:
+      return AMDGPU::FLAT_SCR_LO;
+    case 1:
+      return AMDGPU::FLAT_SCR_HI;
+    default:
+      llvm_unreachable("Invalid SubIdx for FLAT_SCR");
+    }
+    break;
+
+  case AMDGPU::EXEC:
+    switch (Channel) {
+    case 0:
+      return AMDGPU::EXEC_LO;
+    case 1:
+      return AMDGPU::EXEC_HI;
+    default:
+      llvm_unreachable("Invalid SubIdx for EXEC");
+    }
+    break;
+  }
+
+  const TargetRegisterClass *RC = getPhysRegClass(Reg);
+  // 32-bit registers don't have sub-registers, so we can just return the
+  // Reg.  We need to have this check here, because the calculation below
+  // using getHWRegIndex() will fail with special 32-bit registers like
+  // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0.
+  if (RC->getSize() == 4) {
+    assert(Channel == 0);
+    return Reg;
+  }
+
   unsigned Index = getHWRegIndex(Reg);
   return SubRC->getRegister(Index + Channel);
 }
 
-bool SIRegisterInfo::regClassCanUseImmediate(int RCID) const {
+bool SIRegisterInfo::regClassCanUseLiteralConstant(int RCID) const {
   switch (RCID) {
   default: return false;
   case AMDGPU::SSrc_32RegClassID:
@@ -137,7 +399,68 @@
   }
 }
 
-bool SIRegisterInfo::regClassCanUseImmediate(
+bool SIRegisterInfo::regClassCanUseLiteralConstant(
                              const TargetRegisterClass *RC) const {
-  return regClassCanUseImmediate(RC->getID());
+  return regClassCanUseLiteralConstant(RC->getID());
 }
+
+bool SIRegisterInfo::regClassCanUseInlineConstant(int RCID) const {
+  if (regClassCanUseLiteralConstant(RCID))
+    return true;
+
+  switch (RCID) {
+  default: return false;
+  case AMDGPU::VCSrc_32RegClassID:
+  case AMDGPU::VCSrc_64RegClassID:
+    return true;
+  }
+}
+
+bool SIRegisterInfo::regClassCanUseInlineConstant(
+                            const TargetRegisterClass *RC) const {
+  return regClassCanUseInlineConstant(RC->getID());
+}
+
+
+unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
+                                           enum PreloadedValue Value) const {
+
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  switch (Value) {
+  case SIRegisterInfo::TGID_X:
+    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
+  case SIRegisterInfo::TGID_Y:
+    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
+  case SIRegisterInfo::TGID_Z:
+    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
+  case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
+    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
+  case SIRegisterInfo::SCRATCH_PTR:
+    return AMDGPU::SGPR2_SGPR3;
+  case SIRegisterInfo::INPUT_PTR:
+    return AMDGPU::SGPR0_SGPR1;
+  case SIRegisterInfo::TIDIG_X:
+    return AMDGPU::VGPR0;
+  case SIRegisterInfo::TIDIG_Y:
+    return AMDGPU::VGPR1;
+  case SIRegisterInfo::TIDIG_Z:
+    return AMDGPU::VGPR2;
+  }
+  llvm_unreachable("unexpected preloaded value type");
+}
+
+/// \brief Returns a register that is not used at any point in the function.
+///        If all registers are used, then this function will return
+///        AMDGPU::NoRegister.
+unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const {
+  // Only registers of the VGPR_32 class are considered.
+  const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass;
+
+  for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+       I != E; ++I) {
+    if (!MRI.isPhysRegUsed(*I))
+      return *I;
+  }
+  return AMDGPU::NoRegister;
+}
+

diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index c9305fb..c7e54db 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h

@@ -13,8 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 
-#ifndef SIREGISTERINFO_H_
-#define SIREGISTERINFO_H_
+#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
+#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H
 
 #include "AMDGPURegisterInfo.h"
 
@@ -29,6 +29,12 @@
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const override;
 
+  bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS) const override;
+
   /// \brief get the register class of the specified type to use in the
   /// CFGStructurizer
   const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
@@ -40,7 +46,20 @@
   const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
 
   /// \returns true if this class contains only SGPR registers
-  bool isSGPRClass(const TargetRegisterClass *RC) const;
+  bool isSGPRClass(const TargetRegisterClass *RC) const {
+    if (!RC)
+      return false;
+
+    return !hasVGPRs(RC);
+  }
+
+  /// \returns true if this class ID contains only SGPR registers
+  bool isSGPRClassID(unsigned RCID) const {
+    if (static_cast<int>(RCID) == -1)
+      return false;
+
+    return isSGPRClass(getRegClass(RCID));
+  }
 
   /// \returns true if this class contains VGPR registers.
   bool hasVGPRs(const TargetRegisterClass *RC) const;
@@ -62,14 +81,41 @@
                             unsigned Channel) const;
 
   /// \returns True if operands defined with this register class can accept
-  /// inline immediates.
-  bool regClassCanUseImmediate(int RCID) const;
+  /// a literal constant (i.e. any 32-bit immediate).
+  bool regClassCanUseLiteralConstant(int RCID) const;
 
   /// \returns True if operands defined with this register class can accept
-  /// inline immediates.
-  bool regClassCanUseImmediate(const TargetRegisterClass *RC) const;
+  /// a literal constant (i.e. any 32-bit immediate).
+  bool regClassCanUseLiteralConstant(const TargetRegisterClass *RC) const;
+
+  /// \returns True if operands defined with this register class can accept
+  /// an inline constant. i.e. An integer value in the range (-16, 64) or
+  /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. 
+  bool regClassCanUseInlineConstant(int RCID) const;
+
+  /// \returns True if operands defined with this register class can accept
+  /// an inline constant. i.e. A value in the range (-16, 64).
+  bool regClassCanUseInlineConstant(const TargetRegisterClass *RC) const;
+
+  enum PreloadedValue {
+    TGID_X,
+    TGID_Y,
+    TGID_Z,
+    SCRATCH_WAVE_OFFSET,
+    SCRATCH_PTR,
+    INPUT_PTR,
+    TIDIG_X,
+    TIDIG_Y,
+    TIDIG_Z
+  };
+
+  /// \brief Returns the physical register that \p Value is stored in.
+  unsigned getPreloadedValue(const MachineFunction &MF,
+                             enum PreloadedValue Value) const;
+
+  unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const;
 };
 
 } // End namespace llvm
 
-#endif // SIREGISTERINFO_H_
+#endif

diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 8974b63..45c2b41 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td

@@ -27,10 +27,28 @@
   let HWEncoding = 106;
 }
 
-def EXEC : SIReg<"EXEC", 126>;
+def EXEC_LO : SIReg<"exec_lo", 126>;
+def EXEC_HI : SIReg<"exec_hi", 127>;
+
+def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = 126;
+}
+
 def SCC : SIReg<"SCC", 253>;
 def M0 : SIReg <"M0", 124>;
 
+def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes.
+def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes.
+
+// Pair to indicate location of scratch space for flat accesses.
+def FLAT_SCR : RegisterWithSubRegs <"FLAT_SCR", [FLAT_SCR_LO, FLAT_SCR_HI]> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = 104;
+}
+
 // SGPR registers
 foreach Index = 0-101 in {
   def SGPR#Index : SIReg <"SGPR"#Index, Index>;
@@ -152,20 +170,24 @@
 //===----------------------------------------------------------------------===//
 
 // Special register classes for predicates and the M0 register
-def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)>;
+def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> {
+  let CopyCost = -1; // Theoretically it is possible to read from SCC,
+                     // but it should never be necessary.
+}
+
 def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
 def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
 def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
 
 // Register class for all scalar registers (SGPRs + Special Registers)
 def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
-  (add SGPR_32, M0Reg, VCC_LO)
+  (add SGPR_32, M0Reg, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)
 >;
 
 def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
 
 def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
-  (add SGPR_64Regs, VCCReg, EXECReg)
+  (add SGPR_64, VCCReg, EXECReg, FLAT_SCR)
 >;
 
 def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>;
@@ -192,18 +214,30 @@
 def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>;
 
 //===----------------------------------------------------------------------===//
-//  [SV]Src_(32|64) register classes, can have either an immediate or an register
+//  SSrc_* Operands with an SGPR or a 32-bit immediate
 //===----------------------------------------------------------------------===//
 
 def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
 
 def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
 
+//===----------------------------------------------------------------------===//
+//  VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
+//===----------------------------------------------------------------------===//
+
 def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
 
 def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
 
 //===----------------------------------------------------------------------===//
+//  VCSrc_* Operands with an SGPR, VGPR or an inline constant
+//===----------------------------------------------------------------------===//
+
+def VCSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
+
+def VCSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+
+//===----------------------------------------------------------------------===//
 // SGPR and VGPR register classes
 //===----------------------------------------------------------------------===//
 

diff --git a/lib/Target/R600/SIShrinkInstructions.cpp b/lib/Target/R600/SIShrinkInstructions.cpp
new file mode 100644
index 0000000..45e83f5
--- /dev/null
+++ b/lib/Target/R600/SIShrinkInstructions.cpp

@@ -0,0 +1,271 @@
+//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// The pass tries to use the 32-bit encoding for instructions when possible.
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-shrink-instructions"
+
+STATISTIC(NumInstructionsShrunk,
+          "Number of 64-bit instruction reduced to 32-bit.");
+STATISTIC(NumLiteralConstantsFolded,
+          "Number of literal constants folded into 32-bit instructions.");
+
+namespace llvm {
+  void initializeSIShrinkInstructionsPass(PassRegistry&);
+}
+
+using namespace llvm;
+
+namespace {
+
+class SIShrinkInstructions : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIShrinkInstructions() : MachineFunctionPass(ID) {
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Shrink Instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE,
+                      "SI Shrink Instructions", false, false)
+INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE,
+                    "SI Shrink Instructions", false, false)
+
+char SIShrinkInstructions::ID = 0;
+
+FunctionPass *llvm::createSIShrinkInstructionsPass() {
+  return new SIShrinkInstructions();
+}
+
+static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
+                   const MachineRegisterInfo &MRI) {
+  if (!MO->isReg())
+    return false;
+
+  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
+    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));
+
+  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
+}
+
+static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
+                      const SIRegisterInfo &TRI,
+                      const MachineRegisterInfo &MRI) {
+
+  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+  // Can't shrink instruction with three operands.
+  if (Src2)
+    return false;
+
+  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src1Mod =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+  if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
+    return false;
+
+  // We don't need to check src0, all input types are legal, so just make sure
+  // src0 isn't using any modifiers.
+  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
+    return false;
+
+  // Check output modifiers
+  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+    return false;
+
+  if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+    return false;
+
+  return true;
+}
+
+/// \brief This function checks \p MI for operands defined by a move immediate
+/// instruction and then folds the literal constant into the instruction if it
+/// can.  This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
+/// and will only fold literal constants if we are still in SSA.
+static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
+                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
+
+  if (!MRI.isSSA())
+    return;
+
+  assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) ||
+         TII->isVOPC(MI.getOpcode()));
+
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+
+  // Only one literal constant is allowed per instruction, so if src0 is a
+  // literal constant then we can't do any folding.
+  if ((Src0->isImm() || Src0->isFPImm()) && TII->isLiteralConstant(*Src0))
+    return;
+
+
+  // Literal constants and SGPRs can only be used in Src0, so if Src0 is an
+  // SGPR, we cannot commute the instruction, so we can't fold any literal
+  // constants.
+  if (Src0->isReg() && !isVGPR(Src0, TRI, MRI))
+    return;
+
+  // Try to fold Src0
+  if (Src0->isReg()) {
+    unsigned Reg = Src0->getReg();
+    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+    if (Def && Def->isMoveImmediate()) {
+      MachineOperand &MovSrc = Def->getOperand(1);
+      bool ConstantFolded = false;
+
+      if (MovSrc.isImm() && isUInt<32>(MovSrc.getImm())) {
+        Src0->ChangeToImmediate(MovSrc.getImm());
+        ConstantFolded = true;
+      } else if (MovSrc.isFPImm()) {
+        const ConstantFP *CFP = MovSrc.getFPImm();
+        if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle) {
+          Src0->ChangeToFPImmediate(CFP);
+          ConstantFolded = true;
+        }
+      }
+      if (ConstantFolded) {
+        if (MRI.use_empty(Reg))
+          Def->eraseFromParent();
+        ++NumLiteralConstantsFolded;
+        return;
+      }
+    }
+  }
+
+  // We have failed to fold src0, so commute the instruction and try again.
+  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+    foldImmediates(MI, TII, MRI, false);
+
+}
+
+/// Rewrite S_MOV_B32 of small immediates to S_MOVK_I32 and replace VALU
+/// instructions by their 32-bit encoding.  \returns true if \p MF was changed.
+bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  bool Changed = false;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
+      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
+        const MachineOperand &Src = MI.getOperand(1);
+
+        // TODO: Handle FPImm?
+        if (Src.isImm()) {
+          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src)) {
+            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
+            Changed = true;
+            continue;
+          }
+        }
+      }
+
+      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
+        continue;
+
+      if (!canShrink(MI, TII, TRI, MRI)) {
+        // Try commuting the instruction to see if that lets us shrink it.
+        if (!MI.isCommutable() || !TII->commuteInstruction(&MI))
+          continue;
+        Changed = true; // MI has been commuted, even if we stop here.
+        if (!canShrink(MI, TII, TRI, MRI))
+          continue;
+      }
+
+      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
+
+      // Op32 could be -1 here if we started with an instruction that had a
+      // 32-bit encoding and then commuted it to an instruction that did not.
+      if (Op32 == -1)
+        continue;
+
+      if (TII->isVOPC(Op32)) {
+        unsigned DstReg = MI.getOperand(0).getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+          // VOPC instructions can only write to the VCC register.  We can't
+          // force them to use VCC here, because the register allocator has
+          // trouble with sequences like this, which cause the allocator to run
+          // out of registers if vreg0 and vreg1 belong to the VCCReg register
+          // class:
+          // vreg0 = VOPC;
+          // vreg1 = VOPC;
+          // S_AND_B64 vreg0, vreg1
+          //
+          // So, instead of forcing the instruction to write to VCC, we provide
+          // a hint to the register allocator to use VCC and then we will run
+          // this pass again after RA and shrink it if it outputs to VCC.
+          MRI.setRegAllocationHint(DstReg, 0, AMDGPU::VCC);
+          continue;
+        }
+        if (DstReg != AMDGPU::VCC)
+          continue;
+      }
+
+      // We can shrink this instruction
+      DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);
+
+      MachineInstrBuilder Inst32 =
+          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));
+
+      // dst, src0 and (when present) src1
+      Inst32.addOperand(MI.getOperand(0));
+      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+      const MachineOperand *Src1 =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+      if (Src1)
+        Inst32.addOperand(*Src1);
+
+      ++NumInstructionsShrunk;
+      Changed = true;
+      MI.eraseFromParent();
+
+      foldImmediates(*Inst32, TII, MRI);
+      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
+    }
+  }
+  return Changed;
+}

diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index 367963a..9318dc1 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp

@@ -87,7 +87,7 @@
     Value *BitCast = Builder.CreateBitCast(Ptr,
         PointerType::get(v4i32,PtrTy->getPointerAddressSpace()));
     LoadInst *Load = Builder.CreateLoad(BitCast);
-    SmallVector <std::pair<unsigned, MDNode*>, 8> MD;
+    SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
     I.getAllMetadataOtherThanDebugLoc(MD);
     for (unsigned i = 0, e = MD.size(); i != e; ++i) {
       Load->setMetadata(MD[i].first, MD[i].second);

diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index a9aab86..0fa56e6 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt

@@ -95,44 +95,6 @@
 
 //===---------------------------------------------------------------------===//
 
-This function: (derived from GCC PR19988)
-double foo(double x, double y) {
-  return ((x + 0.1234 * y) * (x + -0.1234 * y));
-}
-
-compiles to:
-_foo:
-	movapd	%xmm1, %xmm2
-	mulsd	LCPI1_1(%rip), %xmm1
-	mulsd	LCPI1_0(%rip), %xmm2
-	addsd	%xmm0, %xmm1
-	addsd	%xmm0, %xmm2
-	movapd	%xmm1, %xmm0
-	mulsd	%xmm2, %xmm0
-	ret
-
-Reassociate should be able to turn it into:
-
-double foo(double x, double y) {
-  return ((x + 0.1234 * y) * (x - 0.1234 * y));
-}
-
-Which allows the multiply by constant to be CSE'd, producing:
-
-_foo:
-	mulsd	LCPI1_0(%rip), %xmm1
-	movapd	%xmm1, %xmm2
-	addsd	%xmm0, %xmm2
-	subsd	%xmm1, %xmm0
-	mulsd	%xmm2, %xmm0
-	ret
-
-This doesn't need -ffast-math support at all.  This is particularly bad because
-the llvm-gcc frontend is canonicalizing the later into the former, but clang
-doesn't have this problem.
-
-//===---------------------------------------------------------------------===//
-
 These two functions should generate the same code on big-endian systems:
 
 int g(int *j,int *l)  {  return memcmp(j,l,4);  }
@@ -771,7 +733,7 @@
   return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
 }
 Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
-"clang -emit-llvm-bc | opt -std-compile-opts".
+"clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -784,7 +746,7 @@
 }
 The expression should optimize to something like
 "!((start|end)&~PMD_MASK). Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -803,7 +765,7 @@
  return (abs(x)) >= 0;
 }
 This should optimize to x == INT_MIN. (With -fwrapv.)  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -841,117 +803,117 @@
 
 All should simplify to a single comparison.  All of these are
 currently not optimized with "clang -emit-llvm-bc | opt
--std-compile-opts".
+-O3".
 
 //===---------------------------------------------------------------------===//
 
 From GCC Bug 32605:
 int c(int* x) {return (char*)x+2 == (char*)x;}
 Should combine to 0.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
+-emit-llvm-bc | opt -O3" (although llc can optimize it).
 
 //===---------------------------------------------------------------------===//
 
 int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
 Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
-with "clang -emit-llvm-bc | opt -std-compile-opts".
+with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
 Should combine to "x | (y & 3)".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
 Should fold to "(~a & c) | (a & b)".  Currently not optimized with
-"clang -emit-llvm-bc | opt -std-compile-opts".
+"clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a,int b) {return (~(a|b))|a;}
 Should fold to "a|~b".  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b) {return (a&&b) || (a&&!b);}
 Should fold to "a".  Currently not optimized with "clang -emit-llvm-bc
-| opt -std-compile-opts".
+| opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
 Should fold to "a ? b : c", or at least something sane.  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
 Should fold to a && (b || c).  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int x) {return x | ((x & 8) ^ 8);}
 Should combine to x | 8.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int x) {return x ^ ((x & 8) ^ 8);}
 Should also combine to x | 8.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int a(int x) {return ((x | -9) ^ 8) & x;}
 Should combine to x & -9.  Currently not optimized with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
 Should combine to "a * 0x88888888 >> 31".  Currently not optimized
-with "clang -emit-llvm-bc | opt -std-compile-opts".
+with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(char* x) {if ((*x & 32) == 0) return b();}
 There's an unnecessary zext in the generated code with "clang
--emit-llvm-bc | opt -std-compile-opts".
+-emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned a(unsigned long long x) {return 40 * (x >> 1);}
 Should combine to "20 * (((unsigned)x) & -2)".  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int g(int x) { return (x - 10) < 0; }
 Should combine to "x <= 9" (the sub has nsw).  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int g(int x) { return (x + 10) < 0; }
 Should combine to "x < -10" (the add has nsw).  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 int f(int i, int j) { return i < j + 1; }
 int g(int i, int j) { return j > i - 1; }
 Should combine to "i <= j" (the add/sub has nsw).  Currently not
-optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
 unsigned f(unsigned x) { return ((x & 7) + 1) & 15; }
 The & 15 part should be optimized away, it doesn't change the result. Currently
-not optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+not optimized with "clang -emit-llvm-bc | opt -O3".
 
 //===---------------------------------------------------------------------===//
 
@@ -1163,7 +1125,7 @@
 GCC testsuite, ones we don't get yet are (checked through loadpre25):
 
 [CRIT EDGE BREAKING]
-loadpre3.c predcom-4.c
+predcom-4.c
 
 [PRE OF READONLY CALL]
 loadpre5.c

diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 9df0054..d0b362c 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp

@@ -48,7 +48,7 @@
   // public interface of the MCTargetAsmParser.
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
-                               unsigned &ErrorInfo,
+                               uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
   bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
   bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
@@ -386,7 +386,7 @@
 bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                              OperandVector &Operands,
                                              MCStreamer &Out,
-                                             unsigned &ErrorInfo,
+                                             uint64_t &ErrorInfo,
                                              bool MatchingInlineAsm) {
   MCInst Inst;
   SmallVector<MCInst, 8> Instructions;
@@ -408,7 +408,7 @@
 
   case Match_InvalidOperand: {
     SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0U) {
+    if (ErrorInfo != ~0ULL) {
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction");
 
@@ -444,7 +444,7 @@
   return Error(StartLoc, "invalid register name");
 }
 
-static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features,
+static void applyMnemonicAliases(StringRef &Mnemonic, uint64_t Features,
                                  unsigned VariantID);
 
 bool SparcAsmParser::ParseInstruction(ParseInstructionInfo &Info,

diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index cebda92..c486411 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt

@@ -2,9 +2,8 @@
 
 tablegen(LLVM SparcGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM SparcGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM SparcGenCodeEmitter.inc -gen-emitter)
 tablegen(LLVM SparcGenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM SparcGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM SparcGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM SparcGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM SparcGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM SparcGenDAGISel.inc -gen-dag-isel)
@@ -24,8 +23,6 @@
   SparcSubtarget.cpp
   SparcTargetMachine.cpp
   SparcSelectionDAGInfo.cpp
-  SparcJITInfo.cpp
-  SparcCodeEmitter.cpp
   SparcMCInstLower.cpp
   SparcTargetObjectFile.cpp
   )

diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index f3441ff..28369fd 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp

@@ -110,7 +110,7 @@
 bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   bool Changed = false;
 
-  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
 
   for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
     MachineBasicBlock::iterator MI = I;
@@ -187,7 +187,7 @@
     if (J->getOpcode() == SP::RESTORErr
         || J->getOpcode() == SP::RESTOREri) {
       // change retl to ret.
-      slot->setDesc(TM.getInstrInfo()->get(SP::RET));
+      slot->setDesc(TM.getSubtargetImpl()->getInstrInfo()->get(SP::RET));
       return J;
     }
   }
@@ -329,7 +329,8 @@
 bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
 {
   // Check Reg and all aliased Registers.
-  for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true);
+  for (MCRegAliasIterator AI(Reg, TM.getSubtargetImpl()->getRegisterInfo(),
+                             true);
        AI.isValid(); ++AI)
     if (RegSet.count(*AI))
       return true;
@@ -482,7 +483,7 @@
   if (PrevInst->isBundledWithSucc())
     return false;
 
-  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
 
   switch (PrevInst->getOpcode()) {
   default: break;

diff --git a/lib/Target/Sparc/Disassembler/LLVMBuild.txt b/lib/Target/Sparc/Disassembler/LLVMBuild.txt
index c27398f..bd5397d 100644
--- a/lib/Target/Sparc/Disassembler/LLVMBuild.txt
+++ b/lib/Target/Sparc/Disassembler/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = SparcDisassembler
 parent = Sparc
-required_libraries = MC SparcInfo Support
+required_libraries = MCDisassembler SparcInfo Support
 add_to_library_groups = Sparc

diff --git a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index 4df0990..8bc4ca9 100644
--- a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp

@@ -16,7 +16,6 @@
 #include "SparcSubtarget.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCFixedLenDisassembler.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
@@ -27,23 +26,17 @@
 
 namespace {
 
-/// SparcDisassembler - a disassembler class for Sparc.
+/// A disassembler class for Sparc.
 class SparcDisassembler : public MCDisassembler {
 public:
-  /// Constructor     - Initializes the disassembler.
-  ///
-  SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
-    MCDisassembler(STI, Ctx)
-  {}
+  SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
   virtual ~SparcDisassembler() {}
 
-  /// getInstruction - See MCDisassembler.
-  DecodeStatus getInstruction(MCInst &instr,
-                              uint64_t &size,
-                              const MemoryObject &region,
-                              uint64_t address,
-                              raw_ostream &vStream,
-                              raw_ostream &cStream) const override;
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
 };
 
 }
@@ -213,47 +206,37 @@
 
 #include "SparcGenDisassemblerTables.inc"
 
-/// readInstruction - read four bytes from the MemoryObject
-/// and return 32 bit word.
-static DecodeStatus readInstruction32(const MemoryObject &region,
-                                      uint64_t address,
-                                      uint64_t &size,
-                                      uint32_t &insn) {
-  uint8_t Bytes[4];
-
+/// Read four bytes from the ArrayRef and return 32 bit word.
+static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                      uint64_t &Size, uint32_t &Insn) {
   // We want to read exactly 4 Bytes of data.
-  if (region.readBytes(address, 4, Bytes) == -1) {
-    size = 0;
+  if (Bytes.size() < 4) {
+    Size = 0;
     return MCDisassembler::Fail;
   }
 
   // Encoded as a big-endian 32-bit word in the stream.
-  insn = (Bytes[3] <<  0) |
-    (Bytes[2] <<  8) |
-    (Bytes[1] << 16) |
-    (Bytes[0] << 24);
+  Insn =
+      (Bytes[3] << 0) | (Bytes[2] << 8) | (Bytes[1] << 16) | (Bytes[0] << 24);
 
   return MCDisassembler::Success;
 }
 
-
-DecodeStatus
-SparcDisassembler::getInstruction(MCInst &instr,
-                                 uint64_t &Size,
-                                 const MemoryObject &Region,
-                                 uint64_t Address,
-                                 raw_ostream &vStream,
-                                 raw_ostream &cStream) const {
+DecodeStatus SparcDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+                                               ArrayRef<uint8_t> Bytes,
+                                               uint64_t Address,
+                                               raw_ostream &VStream,
+                                               raw_ostream &CStream) const {
   uint32_t Insn;
 
-  DecodeStatus Result = readInstruction32(Region, Address, Size, Insn);
+  DecodeStatus Result = readInstruction32(Bytes, Address, Size, Insn);
   if (Result == MCDisassembler::Fail)
     return MCDisassembler::Fail;
 
 
   // Calling the auto-generated decoder function.
-  Result = decodeInstruction(DecoderTableSparc32, instr, Insn, Address,
-                             this, STI);
+  Result =
+      decodeInstruction(DecoderTableSparc32, Instr, Insn, Address, this, STI);
 
   if (Result != MCDisassembler::Fail) {
     Size = 4;

diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
index 8fe4075..c96d5ad 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SparcINSTPRINTER_H
-#define SparcINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H
+#define LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCSubtargetInfo.h"

diff --git a/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
index d42bcee..8d79396 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcFixupKinds.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SPARC_FIXUPKINDS_H
-#define LLVM_SPARC_FIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCFIXUPKINDS_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCFIXUPKINDS_H
 
 #include "llvm/MC/MCFixup.h"
 

diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 6875fc6..3a9c987 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp

@@ -35,7 +35,6 @@
   Data64bitsDirective = (isV9) ? "\t.xword\t" : nullptr;
   ZeroDirective = "\t.skip\t";
   CommentString = "!";
-  HasLEB128 = true;
   SupportsDebugInformation = true;
 
   ExceptionsType = ExceptionHandling::DwarfCFI;
@@ -43,7 +42,8 @@
   SunStyleELFSectionSwitchSyntax = true;
   UsesELFSectionDirectiveForBSS = true;
 
-  if (TheTriple.getOS() == llvm::Triple::Solaris)
+  if (TheTriple.getOS() == llvm::Triple::Solaris ||
+      TheTriple.getOS() == llvm::Triple::OpenBSD)
     UseIntegratedAssembler = true;
 }
 

diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index e126b68..84de551 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARCTARGETASMINFO_H
-#define SPARCTARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoELF.h"
 

diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 7f01ab0..d97e3a2 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp

@@ -161,8 +161,9 @@
 
 bool
 SparcMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
-                                       const MCAsmLayout *Layout) const {
-  return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
+                                       const MCAsmLayout *Layout,
+                                       const MCFixup *Fixup) const {
+  return getSubExpr()->EvaluateAsRelocatable(Res, Layout, Fixup);
 }
 
 static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {

diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index f0d0ef3..f72c6c4 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SPARCMCEXPR_H
-#define LLVM_SPARCMCEXPR_H
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
 
 #include "SparcFixupKinds.h"
 #include "llvm/MC/MCExpr.h"
@@ -87,7 +87,8 @@
   /// @}
   void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const override;
+                                 const MCAsmLayout *Layout,
+                                 const MCFixup *Fixup) const override;
   void visitUsedExpr(MCStreamer &Streamer) const override;
   const MCSection *FindAssociatedSection() const override {
     return getSubExpr()->FindAssociatedSection();

diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 571017d..3cc4314 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp

@@ -125,10 +125,8 @@
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCContext &Context, MCAsmBackend &MAB,
                                     raw_ostream &OS, MCCodeEmitter *Emitter,
-                                    const MCSubtargetInfo &STI, bool RelaxAll,
-                                    bool NoExecStack) {
-  MCStreamer *S =
-      createELFStreamer(Context, MAB, OS, Emitter, RelaxAll, NoExecStack);
+                                    const MCSubtargetInfo &STI, bool RelaxAll) {
+  MCStreamer *S = createELFStreamer(Context, MAB, OS, Emitter, RelaxAll);
   new SparcTargetELFStreamer(*S);
   return S;
 }

diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index c8029a8..c31943d 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARCMCTARGETDESC_H
-#define SPARCMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCTARGETDESC_H
+#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCTARGETDESC_H
 
 #include "llvm/Support/DataTypes.h"
 

diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
index bcc0291..c2a95b4 100644
--- a/lib/Target/Sparc/Makefile
+++ b/lib/Target/Sparc/Makefile

@@ -16,7 +16,7 @@
 		SparcGenAsmWriter.inc SparcGenAsmMatcher.inc \
 		SparcGenDAGISel.inc SparcGenDisassemblerTables.inc \
 		SparcGenSubtargetInfo.inc SparcGenCallingConv.inc \
-		SparcGenCodeEmitter.inc SparcGenMCCodeEmitter.inc
+		SparcGenMCCodeEmitter.inc
 
 DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
 

diff --git a/lib/Target/Sparc/README.txt b/lib/Target/Sparc/README.txt
index 34e68cf..647c276 100644
--- a/lib/Target/Sparc/README.txt
+++ b/lib/Target/Sparc/README.txt

@@ -56,6 +56,4 @@
   leaf fns.
 * Fill delay slots
 
-* Implement JIT support
-
 * Use %g0 directly to materialize 0. No instruction is required.

diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index de20aaa..96378d5 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TARGET_SPARC_H
-#define TARGET_SPARC_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARC_H
+#define LLVM_LIB_TARGET_SPARC_SPARC_H
 
 #include "MCTargetDesc/SparcMCTargetDesc.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -29,8 +29,6 @@
 
   FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
   FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
-  FunctionPass *createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
-                                              JITCodeEmitter &JCE);
 
   void LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
                                       MCInst &OutMI,

diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index 1b7330e..6432003 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp

@@ -296,7 +296,7 @@
 
 void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                    raw_ostream &O) {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   const MachineOperand &MO = MI->getOperand (opNum);
   SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags();
 
@@ -450,7 +450,8 @@
   MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
   if (!Stubs.empty()) {
     OutStreamer.SwitchSection(TLOFELF.getDataSection());
-    unsigned PtrSize = TM.getDataLayout()->getPointerSize(0);
+    unsigned PtrSize =
+        TM.getSubtargetImpl()->getDataLayout()->getPointerSize(0);
     for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
       OutStreamer.EmitLabel(Stubs[i].first);
       OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), PtrSize);

diff --git a/lib/Target/Sparc/SparcCodeEmitter.cpp b/lib/Target/Sparc/SparcCodeEmitter.cpp
deleted file mode 100644
index 247da2a..0000000
--- a/lib/Target/Sparc/SparcCodeEmitter.cpp
+++ /dev/null

@@ -1,280 +0,0 @@
-//===-- Sparc/SparcCodeEmitter.cpp - Convert Sparc Code to Machine Code ---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===---------------------------------------------------------------------===//
-//
-// This file contains the pass that transforms the Sparc machine instructions
-// into relocatable machine code.
-//
-//===---------------------------------------------------------------------===//
-
-#include "Sparc.h"
-#include "MCTargetDesc/SparcMCExpr.h"
-#include "SparcRelocations.h"
-#include "SparcTargetMachine.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-STATISTIC(NumEmitted, "Number of machine instructions emitted");
-
-namespace {
-
-class SparcCodeEmitter : public MachineFunctionPass {
-  SparcJITInfo *JTI;
-  const SparcInstrInfo *II;
-  const DataLayout *TD;
-  const SparcSubtarget *Subtarget;
-  TargetMachine &TM;
-  JITCodeEmitter &MCE;
-  const std::vector<MachineConstantPoolEntry> *MCPEs;
-  bool IsPIC;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineModuleInfo> ();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-  static char ID;
-
-public:
-  SparcCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
-    : MachineFunctionPass(ID), JTI(nullptr), II(nullptr), TD(nullptr),
-      TM(tm), MCE(mce), MCPEs(nullptr),
-      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  const char *getPassName() const override {
-    return "Sparc Machine Code Emitter";
-  }
-
-  /// getBinaryCodeForInstr - This function, generated by the
-  /// CodeEmitterGenerator using TableGen, produces the binary encoding for
-  /// machine instructions.
-  uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
-
-  void emitInstruction(MachineBasicBlock::instr_iterator MI,
-                       MachineBasicBlock &MBB);
-
-private:
-  /// getMachineOpValue - Return binary encoding of operand. If the machine
-  /// operand requires relocation, record the relocation and return zero.
-  unsigned getMachineOpValue(const MachineInstr &MI,
-                             const MachineOperand &MO) const;
-
-  unsigned getCallTargetOpValue(const MachineInstr &MI,
-                                unsigned) const;
-  unsigned getBranchTargetOpValue(const MachineInstr &MI,
-                                  unsigned) const;
-  unsigned getBranchPredTargetOpValue(const MachineInstr &MI,
-                                      unsigned) const;
-  unsigned getBranchOnRegTargetOpValue(const MachineInstr &MI,
-                                       unsigned) const;
-
-  void emitWord(unsigned Word);
-
-  unsigned getRelocation(const MachineInstr &MI,
-                         const MachineOperand &MO) const;
-
-  void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc) const;
-  void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
-  void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
-  void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
-};
-}  // end anonymous namespace.
-
-char SparcCodeEmitter::ID = 0;
-
-bool SparcCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
-  SparcTargetMachine &Target = static_cast<SparcTargetMachine &>(
-                                const_cast<TargetMachine &>(MF.getTarget()));
-
-  JTI = Target.getJITInfo();
-  II = Target.getInstrInfo();
-  TD = Target.getDataLayout();
-  Subtarget = &TM.getSubtarget<SparcSubtarget> ();
-  MCPEs = &MF.getConstantPool()->getConstants();
-  JTI->Initialize(MF, IsPIC);
-  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
-
-  do {
-    DEBUG(errs() << "JITTing function '"
-        << MF.getName() << "'\n");
-    MCE.startFunction(MF);
-
-    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
-        MBB != E; ++MBB){
-      MCE.StartMachineBasicBlock(MBB);
-      for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
-           E = MBB->instr_end(); I != E;)
-        emitInstruction(*I++, *MBB);
-    }
-  } while (MCE.finishFunction(MF));
-
-  return false;
-}
-
-void SparcCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI,
-                                      MachineBasicBlock &MBB) {
-  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << *MI);
-
-  MCE.processDebugLoc(MI->getDebugLoc(), true);
-
-  ++NumEmitted;
-
-  switch (MI->getOpcode()) {
-  default: {
-    emitWord(getBinaryCodeForInstr(*MI));
-    break;
-  }
-  case TargetOpcode::INLINEASM: {
-    // We allow inline assembler nodes with empty bodies - they can
-    // implicitly define registers, which is ok for JIT.
-    if (MI->getOperand(0).getSymbolName()[0]) {
-      report_fatal_error("JIT does not support inline asm!");
-    }
-    break;
-  }
-  case TargetOpcode::CFI_INSTRUCTION:
-    break;
-  case TargetOpcode::EH_LABEL: {
-    MCE.emitLabel(MI->getOperand(0).getMCSymbol());
-    break;
-  }
-  case TargetOpcode::IMPLICIT_DEF:
-  case TargetOpcode::KILL: {
-    // Do nothing.
-    break;
-  }
-  case SP::GETPCX: {
-    report_fatal_error("JIT does not support pseudo instruction GETPCX yet!");
-    break;
-  }
-  }
-
-  MCE.processDebugLoc(MI->getDebugLoc(), false);
-}
-
-void SparcCodeEmitter::emitWord(unsigned Word) {
-  DEBUG(errs() << "  0x";
-        errs().write_hex(Word) << "\n");
-  MCE.emitWordBE(Word);
-}
-
-/// getMachineOpValue - Return binary encoding of operand. If the machine
-/// operand requires relocation, record the relocation and return zero.
-unsigned SparcCodeEmitter::getMachineOpValue(const MachineInstr &MI,
-                                             const MachineOperand &MO) const {
-  if (MO.isReg())
-    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
-  else if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
-  else if (MO.isGlobal())
-    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO));
-  else if (MO.isSymbol())
-    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
-  else if (MO.isCPI())
-    emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
-  else if (MO.isMBB())
-    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
-  else
-    llvm_unreachable("Unable to encode MachineOperand!");
-  return 0;
-}
-unsigned SparcCodeEmitter::getCallTargetOpValue(const MachineInstr &MI,
-                                                unsigned opIdx) const {
-  const MachineOperand MO = MI.getOperand(opIdx);
-  return getMachineOpValue(MI, MO);
-}
-
-unsigned SparcCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
-                                                  unsigned opIdx) const {
-  const MachineOperand MO = MI.getOperand(opIdx);
-  return getMachineOpValue(MI, MO);
-}
-
-unsigned SparcCodeEmitter::getBranchPredTargetOpValue(const MachineInstr &MI,
-                                                      unsigned opIdx) const {
-  const MachineOperand MO = MI.getOperand(opIdx);
-  return getMachineOpValue(MI, MO);
-}
-
-unsigned SparcCodeEmitter::getBranchOnRegTargetOpValue(const MachineInstr &MI,
-                                                       unsigned opIdx) const {
-  const MachineOperand MO = MI.getOperand(opIdx);
-  return getMachineOpValue(MI, MO);
-}
-
-unsigned SparcCodeEmitter::getRelocation(const MachineInstr &MI,
-                                         const MachineOperand &MO) const {
-
-  unsigned TF = MO.getTargetFlags();
-  switch (TF) {
-  default:
-  case SparcMCExpr::VK_Sparc_None:  break;
-  case SparcMCExpr::VK_Sparc_LO:    return SP::reloc_sparc_lo;
-  case SparcMCExpr::VK_Sparc_HI:    return SP::reloc_sparc_hi;
-  case SparcMCExpr::VK_Sparc_H44:   return SP::reloc_sparc_h44;
-  case SparcMCExpr::VK_Sparc_M44:   return SP::reloc_sparc_m44;
-  case SparcMCExpr::VK_Sparc_L44:   return SP::reloc_sparc_l44;
-  case SparcMCExpr::VK_Sparc_HH:    return SP::reloc_sparc_hh;
-  case SparcMCExpr::VK_Sparc_HM:    return SP::reloc_sparc_hm;
-  }
-
-  unsigned Opc = MI.getOpcode();
-  switch (Opc) {
-  default: break;
-  case SP::CALL:    return SP::reloc_sparc_pc30;
-  case SP::BA:
-  case SP::BCOND:
-  case SP::FBCOND:  return SP::reloc_sparc_pc22;
-  case SP::BPXCC:   return SP::reloc_sparc_pc19;
-  }
-  llvm_unreachable("unknown reloc!");
-}
-
-void SparcCodeEmitter::emitGlobalAddress(const GlobalValue *GV,
-                                        unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
-                                             const_cast<GlobalValue *>(GV), 0,
-                                             true));
-}
-
-void SparcCodeEmitter::
-emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
-                                                 Reloc, ES, 0, 0));
-}
-
-void SparcCodeEmitter::
-emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
-                                                    Reloc, CPI, 0, false));
-}
-
-void SparcCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
-                                            unsigned Reloc) const {
-  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
-                                             Reloc, BB));
-}
-
-
-/// createSparcJITCodeEmitterPass - Return a pass that emits the collected Sparc
-/// code to the specified MCE object.
-FunctionPass *llvm::createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
-                                                 JITCodeEmitter &JCE) {
-  return new SparcCodeEmitter(TM, JCE);
-}
-
-#include "SparcGenCodeEmitter.inc"

diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 3cdfda3..1b67b4b 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp

@@ -46,7 +46,7 @@
 
   DebugLoc dl = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc();
   const SparcInstrInfo &TII =
-    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   if (NumBytes >= -4096 && NumBytes < 4096) {
     BuildMI(MBB, MBBI, dl, TII.get(ADDri), SP::O6)
@@ -88,7 +88,7 @@
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const SparcInstrInfo &TII =
-    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
 
@@ -153,7 +153,7 @@
   SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   const SparcInstrInfo &TII =
-    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo());
   DebugLoc dl = MBBI->getDebugLoc();
   assert(MBBI->getOpcode() == SP::RETL &&
          "Can only put epilog before 'retl' instruction!");

diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index a7d1b89..9e53994 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARC_FRAMEINFO_H
-#define SPARC_FRAMEINFO_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H
+#define LLVM_LIB_TARGET_SPARC_SPARCFRAMELOWERING_H
 
 #include "Sparc.h"
 #include "llvm/Target/TargetFrameLowering.h"

diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index 2fade27..b3b029e 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp

@@ -66,16 +66,15 @@
 }  // end anonymous namespace
 
 SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
-  unsigned GlobalBaseReg = TM.getInstrInfo()->getGlobalBaseReg(MF);
-  return CurDAG->getRegister(GlobalBaseReg,
-                             getTargetLowering()->getPointerTy()).getNode();
+  unsigned GlobalBaseReg =
+      TM.getSubtargetImpl()->getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
 }
 
 bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
                                      SDValue &Base, SDValue &Offset) {
   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(),
-                                       getTargetLowering()->getPointerTy());
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI->getPointerTy());
     Offset = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
   }
@@ -90,8 +89,8 @@
         if (FrameIndexSDNode *FIN =
                 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
           // Constant offset from frame ref.
-          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(),
-                                           getTargetLowering()->getPointerTy());
+          Base =
+              CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI->getPointerTy());
         } else {
           Base = Addr.getOperand(0);
         }
@@ -135,7 +134,7 @@
   }
 
   R1 = Addr;
-  R2 = CurDAG->getRegister(SP::G0, getTargetLowering()->getPointerTy());
+  R2 = CurDAG->getRegister(SP::G0, TLI->getPointerTy());
   return true;
 }
 

diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 990f52a..e6a69d2 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp

@@ -190,8 +190,8 @@
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 DAG.getTarget(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   // Analyze return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
@@ -250,8 +250,8 @@
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 DAG.getTarget(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   // Analyze return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Sparc64);
@@ -349,8 +349,8 @@
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
 
   const unsigned StackOffset = 92;
@@ -474,7 +474,7 @@
                           DAG.getConstant(Offset, MVT::i32));
       Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr,
                             MachinePointerInfo(),
-                            VA.getValVT(), false, false,0);
+                            VA.getValVT(), false, false, false,0);
       Load = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Load);
     }
     InVals.push_back(Load);
@@ -549,8 +549,8 @@
 
   // Analyze arguments according to CC_Sparc64.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc64);
 
   // The argument array begins at %fp+BIAS+128, after the register save area.
@@ -698,8 +698,8 @@
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 DAG.getTarget(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
 
   // Get the size of the outgoing arguments stack space requirement.
@@ -915,7 +915,7 @@
 
   // Add a register mask operand representing the call-preserved registers.
   const SparcRegisterInfo *TRI =
-    ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
+      getTargetMachine().getSubtarget<SparcSubtarget>().getRegisterInfo();
   const uint32_t *Mask = ((hasReturnsTwice)
                           ? TRI->getRTCallPreservedMask(CallConv)
                           : TRI->getCallPreservedMask(CallConv));
@@ -934,8 +934,8 @@
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 DAG.getTarget(), RVLocs, *DAG.getContext());
+  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
 
@@ -1061,8 +1061,8 @@
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
-                 DAG.getTarget(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeCallOperands(CLI.Outs, CC_Sparc64);
 
   // Get the size of the outgoing arguments stack space requirement.
@@ -1228,10 +1228,10 @@
 
   // Add a register mask operand representing the call-preserved registers.
   const SparcRegisterInfo *TRI =
-    ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
-  const uint32_t *Mask = ((hasReturnsTwice)
-                          ? TRI->getRTCallPreservedMask(CLI.CallConv)
-                          : TRI->getCallPreservedMask(CLI.CallConv));
+      getTargetMachine().getSubtarget<SparcSubtarget>().getRegisterInfo();
+  const uint32_t *Mask =
+      ((hasReturnsTwice) ? TRI->getRTCallPreservedMask(CLI.CallConv)
+                         : TRI->getCallPreservedMask(CLI.CallConv));
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
 
@@ -1255,8 +1255,8 @@
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
-                 DAG.getTarget(), RVLocs, *DAG.getContext());
+  CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   // Set inreg flag manually for codegen generated library calls that
   // return float.
@@ -1366,7 +1366,7 @@
 }
 
 SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
-  : TargetLowering(TM, new SparcELFTargetObjectFile()) {
+  : TargetLowering(TM) {
   Subtarget = &TM.getSubtarget<SparcSubtarget>();
 
   // Set up the register classes.
@@ -1905,7 +1905,9 @@
     Ops.push_back(Symbol);
     Ops.push_back(DAG.getRegister(SP::O0, PtrVT));
     const uint32_t *Mask = getTargetMachine()
-      .getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+                               .getSubtargetImpl()
+                               ->getRegisterInfo()
+                               ->getCallPreservedMask(CallingConv::C);
     assert(Mask && "Missing call preserved mask for calling convention");
     Ops.push_back(DAG.getRegisterMask(Mask));
     Ops.push_back(InFlag);
@@ -2754,9 +2756,10 @@
                            ISD::SETNE);
   }
   // MulResult is a node with an illegal type. Because such things are not
-  // generally permitted during this phase of legalization, delete the
-  // node. The above EXTRACT_ELEMENT nodes should have been folded.
-  DAG.DeleteNode(MulResult.getNode());
+  // generally permitted during this phase of legalization, ensure that
+  // nothing is left using the node. The above EXTRACT_ELEMENT nodes should have
+  // been folded.
+  assert(MulResult->use_empty() && "Illegally typed node still in use!");
 
   SDValue Ops[2] = { BottomHalf, TopHalf } ;
   return DAG.getMergeValues(Ops, dl);
@@ -2900,7 +2903,8 @@
 SparcTargetLowering::expandSelectCC(MachineInstr *MI,
                                     MachineBasicBlock *BB,
                                     unsigned BROpcode) const {
-  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  const TargetInstrInfo &TII =
+      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   unsigned CC = (SPCC::CondCodes)MI->getOperand(3).getImm();
 
@@ -2961,7 +2965,8 @@
                                      MachineBasicBlock *MBB,
                                      unsigned Opcode,
                                      unsigned CondCode) const {
-  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  const TargetInstrInfo &TII =
+      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
 

diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index a24cc82..a62d569 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARC_ISELLOWERING_H
-#define SPARC_ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCISELLOWERING_H
+#define LLVM_LIB_TARGET_SPARC_SPARCISELLOWERING_H
 
 #include "Sparc.h"
 #include "llvm/Target/TargetLowering.h"

diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index 3a1472e..fe93ed7 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARCINSTRUCTIONINFO_H
-#define SPARCINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCINSTRINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCINSTRINFO_H
 
 #include "SparcRegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"

diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 960261c..c320239 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td

@@ -331,7 +331,7 @@
                    [(flushw)]>;
 }
 
-let isBarrier = 1, isTerminator = 1, rd = 0b1000, rs1 = 0, simm13 = 5 in
+let isBarrier = 1, isTerminator = 1, rd = 0b01000, rs1 = 0, simm13 = 5 in
   def TA5 : F3_2<0b10, 0b111010, (outs), (ins), "ta 5", [(trap)]>;
 
 let rd = 0 in

diff --git a/lib/Target/Sparc/SparcInstrVIS.td b/lib/Target/Sparc/SparcInstrVIS.td
index 3e2b49d..d9adf3e 100644
--- a/lib/Target/Sparc/SparcInstrVIS.td
+++ b/lib/Target/Sparc/SparcInstrVIS.td

@@ -71,13 +71,13 @@
 def FEXPAND     : VISInst2<0b001001101, "fexpand">;
 def FPMERGE     : VISInst <0b001001011, "fpmerge">;
 
-def FMUL8X16    : VISInst<0b00110001, "fmul8x16">;
-def FMUL8X16AU  : VISInst<0b00110011, "fmul8x16au">;
-def FMUL8X16AL  : VISInst<0b00110101, "fmul8x16al">;
-def FMUL8SUX16  : VISInst<0b00110110, "fmul8sux16">;
-def FMUL8ULX16  : VISInst<0b00110111, "fmul8ulx16">;
-def FMULD8SUX16 : VISInst<0b00111000, "fmuld8sux16">;
-def FMULD8ULX16 : VISInst<0b00111001, "fmuld8ulx16">;
+def FMUL8X16    : VISInst<0b000110001, "fmul8x16">;
+def FMUL8X16AU  : VISInst<0b000110011, "fmul8x16au">;
+def FMUL8X16AL  : VISInst<0b000110101, "fmul8x16al">;
+def FMUL8SUX16  : VISInst<0b000110110, "fmul8sux16">;
+def FMUL8ULX16  : VISInst<0b000110111, "fmul8ulx16">;
+def FMULD8SUX16 : VISInst<0b000111000, "fmuld8sux16">;
+def FMULD8ULX16 : VISInst<0b000111001, "fmuld8ulx16">;
 
 def ALIGNADDR   : VISInst<0b000011000, "alignaddr", I64Regs>;
 def ALIGNADDRL  : VISInst<0b000011010, "alignaddrl", I64Regs>;
@@ -134,7 +134,7 @@
 def EDGE32      : VISInst<0b000001000,  "edge32",  I64Regs>;
 def EDGE32L     : VISInst<0b000001010,  "edge32l", I64Regs>;
 
-def PDIST       : VISInst<0b00111110, "pdist">;
+def PDIST       : VISInst<0b000111110, "pdist">;
 
 def ARRAY8      : VISInst<0b000010000, "array8",  I64Regs>;
 def ARRAY16     : VISInst<0b000010010, "array16", I64Regs>;
@@ -181,7 +181,7 @@
 
 }
 
-def FCHKSM16 : VISInst<0b01000100, "fchksm16">;
+def FCHKSM16 : VISInst<0b001000100, "fchksm16">;
 
 def FHADDS   : F3_3<0b10, 0b110100, 0b001100001,
                     (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2),
@@ -229,14 +229,14 @@
 
 def FPADD64   : VISInst<0b001000010, "fpadd64">;
 
-def FSLL16    : VISInst<0b00100001, "fsll16">;
-def FSRL16    : VISInst<0b00100011, "fsrl16">;
-def FSLL32    : VISInst<0b00100101, "fsll32">;
-def FSRL32    : VISInst<0b00100111, "fsrl32">;
-def FSLAS16   : VISInst<0b00101001, "fslas16">;
-def FSRA16    : VISInst<0b00101011, "fsra16">;
-def FSLAS32   : VISInst<0b00101101, "fslas32">;
-def FSRA32    : VISInst<0b00101111, "fsra32">;
+def FSLL16    : VISInst<0b000100001, "fsll16">;
+def FSRL16    : VISInst<0b000100011, "fsrl16">;
+def FSLL32    : VISInst<0b000100101, "fsll32">;
+def FSRL32    : VISInst<0b000100111, "fsrl32">;
+def FSLAS16   : VISInst<0b000101001, "fslas16">;
+def FSRA16    : VISInst<0b000101011, "fsra16">;
+def FSLAS32   : VISInst<0b000101101, "fslas32">;
+def FSRA32    : VISInst<0b000101111, "fsra32">;
 
 let rs1 = 0 in
 def LZCNT     : VISInstFormat<0b000010111, (outs I64Regs:$rd),

diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp
deleted file mode 100644
index d0eec98..0000000
--- a/lib/Target/Sparc/SparcJITInfo.cpp
+++ /dev/null

@@ -1,326 +0,0 @@
-//===-- SparcJITInfo.cpp - Implement the Sparc JIT Interface --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the JIT interfaces for the Sparc target.
-//
-//===----------------------------------------------------------------------===//
-#include "SparcJITInfo.h"
-#include "Sparc.h"
-#include "SparcRelocations.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/Support/Memory.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-/// JITCompilerFunction - This contains the address of the JIT function used to
-/// compile a function lazily.
-static TargetJITInfo::JITCompilerFn JITCompilerFunction;
-
-extern "C" void SparcCompilationCallback();
-
-extern "C" {
-#if defined (__sparc__)
-
-#if defined(__arch64__)
-#define FRAME_PTR(X) #X "+2047"
-#else
-#define FRAME_PTR(X) #X
-#endif
-
-  asm(
-      ".text\n"
-      "\t.align 4\n"
-      "\t.global SparcCompilationCallback\n"
-      "\t.type SparcCompilationCallback, #function\n"
-      "SparcCompilationCallback:\n"
-      // Save current register window and create stack.
-      // 128 (save area) + 6*8 (for arguments) + 16*8 (for float regfile) = 304
-      "\tsave %sp, -304, %sp\n"
-      // save float regfile to the stack.
-      "\tstd %f0,  [" FRAME_PTR(%fp) "-0]\n"
-      "\tstd %f2,  [" FRAME_PTR(%fp) "-8]\n"
-      "\tstd %f4,  [" FRAME_PTR(%fp) "-16]\n"
-      "\tstd %f6,  [" FRAME_PTR(%fp) "-24]\n"
-      "\tstd %f8,  [" FRAME_PTR(%fp) "-32]\n"
-      "\tstd %f10, [" FRAME_PTR(%fp) "-40]\n"
-      "\tstd %f12, [" FRAME_PTR(%fp) "-48]\n"
-      "\tstd %f14, [" FRAME_PTR(%fp) "-56]\n"
-      "\tstd %f16, [" FRAME_PTR(%fp) "-64]\n"
-      "\tstd %f18, [" FRAME_PTR(%fp) "-72]\n"
-      "\tstd %f20, [" FRAME_PTR(%fp) "-80]\n"
-      "\tstd %f22, [" FRAME_PTR(%fp) "-88]\n"
-      "\tstd %f24, [" FRAME_PTR(%fp) "-96]\n"
-      "\tstd %f26, [" FRAME_PTR(%fp) "-104]\n"
-      "\tstd %f28, [" FRAME_PTR(%fp) "-112]\n"
-      "\tstd %f30, [" FRAME_PTR(%fp) "-120]\n"
-      // stubaddr is in %g1.
-      "\tcall SparcCompilationCallbackC\n"
-      "\t  mov %g1, %o0\n"
-      // restore float regfile from the stack.
-      "\tldd [" FRAME_PTR(%fp) "-0],   %f0\n"
-      "\tldd [" FRAME_PTR(%fp) "-8],   %f2\n"
-      "\tldd [" FRAME_PTR(%fp) "-16],  %f4\n"
-      "\tldd [" FRAME_PTR(%fp) "-24],  %f6\n"
-      "\tldd [" FRAME_PTR(%fp) "-32],  %f8\n"
-      "\tldd [" FRAME_PTR(%fp) "-40],  %f10\n"
-      "\tldd [" FRAME_PTR(%fp) "-48],  %f12\n"
-      "\tldd [" FRAME_PTR(%fp) "-56],  %f14\n"
-      "\tldd [" FRAME_PTR(%fp) "-64],  %f16\n"
-      "\tldd [" FRAME_PTR(%fp) "-72],  %f18\n"
-      "\tldd [" FRAME_PTR(%fp) "-80],  %f20\n"
-      "\tldd [" FRAME_PTR(%fp) "-88],  %f22\n"
-      "\tldd [" FRAME_PTR(%fp) "-96],  %f24\n"
-      "\tldd [" FRAME_PTR(%fp) "-104], %f26\n"
-      "\tldd [" FRAME_PTR(%fp) "-112], %f28\n"
-      "\tldd [" FRAME_PTR(%fp) "-120], %f30\n"
-      // restore original register window and
-      // copy %o0 to %g1
-      "\trestore %o0, 0, %g1\n"
-      // call the new stub
-      "\tjmp %g1\n"
-      "\t  nop\n"
-      "\t.size   SparcCompilationCallback, .-SparcCompilationCallback"
-      );
-#else
-  void SparcCompilationCallback() {
-    llvm_unreachable(
-      "Cannot call SparcCompilationCallback() on a non-sparc arch!");
-  }
-#endif
-}
-
-
-#define SETHI_INST(imm, rd)    (0x01000000 | ((rd) << 25) | ((imm) & 0x3FFFFF))
-#define JMP_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x38 << 19) \
-                                | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
-#define NOP_INST               SETHI_INST(0, 0)
-#define OR_INST_I(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x02 << 19) \
-                                 | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
-#define OR_INST_R(rs1, rs2, rd) (0x80000000 | ((rd) << 25) | (0x02 << 19) \
-                                 | ((rs1) << 14) | (0 << 13) | ((rs2) & 0x1F))
-#define RDPC_INST(rd)           (0x80000000 | ((rd) << 25) | (0x28 << 19) \
-                                 | (5 << 14))
-#define LDX_INST(rs1, imm, rd)  (0xC0000000 | ((rd) << 25) | (0x0B << 19) \
-                                 | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
-#define SLLX_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x25 << 19) \
-                                 | ((rs1) << 14) | (3 << 12) | ((imm) & 0x3F))
-#define SUB_INST(rs1, imm, rd)  (0x80000000 | ((rd) << 25) | (0x04 << 19) \
-                                 | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
-#define XOR_INST(rs1, imm, rd)  (0x80000000 | ((rd) << 25) | (0x03 << 19) \
-                                 | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
-#define BA_INST(tgt)             (0x10800000 | ((tgt) & 0x3FFFFF))
-
-// Emit instructions to jump to Addr and store the starting address of
-// the instructions emitted in the scratch register.
-static void emitInstrForIndirectJump(intptr_t Addr,
-                                     unsigned scratch,
-                                     SmallVectorImpl<uint32_t> &Insts) {
-
-  if (isInt<13>(Addr)) {
-    // Emit: jmpl %g0+Addr, <scratch>
-    //         nop
-    Insts.push_back(JMP_INST(0, LO10(Addr), scratch));
-    Insts.push_back(NOP_INST);
-    return;
-  }
-
-  if (isUInt<32>(Addr)) {
-    // Emit: sethi %hi(Addr), scratch
-    //       jmpl scratch+%lo(Addr), scratch
-    //         sub scratch, 4, scratch
-    Insts.push_back(SETHI_INST(HI22(Addr), scratch));
-    Insts.push_back(JMP_INST(scratch, LO10(Addr), scratch));
-    Insts.push_back(SUB_INST(scratch, 4, scratch));
-    return;
-  }
-
-  if (Addr < 0 && isInt<33>(Addr)) {
-    // Emit: sethi %hix(Addr), scratch)
-    //       xor   scratch, %lox(Addr), scratch
-    //       jmpl scratch+0, scratch
-    //         sub scratch, 8, scratch
-    Insts.push_back(SETHI_INST(HIX22(Addr), scratch));
-    Insts.push_back(XOR_INST(scratch, LOX10(Addr), scratch));
-    Insts.push_back(JMP_INST(scratch, 0, scratch));
-    Insts.push_back(SUB_INST(scratch, 8, scratch));
-    return;
-  }
-
-  // Emit: rd %pc, scratch
-  //       ldx [scratch+16], scratch
-  //       jmpl scratch+0, scratch
-  //         sub scratch, 8, scratch
-  //       <Addr: 8 byte>
-  Insts.push_back(RDPC_INST(scratch));
-  Insts.push_back(LDX_INST(scratch, 16, scratch));
-  Insts.push_back(JMP_INST(scratch, 0, scratch));
-  Insts.push_back(SUB_INST(scratch, 8, scratch));
-  Insts.push_back((uint32_t)(((int64_t)Addr) >> 32) & 0xffffffff);
-  Insts.push_back((uint32_t)(Addr & 0xffffffff));
-
-  // Instruction sequence without rdpc instruction
-  // 7 instruction and 2 scratch register
-  // Emit: sethi %hh(Addr), scratch
-  //       or scratch, %hm(Addr), scratch
-  //       sllx scratch, 32, scratch
-  //       sethi %hi(Addr), scratch2
-  //       or scratch, scratch2, scratch
-  //       jmpl scratch+%lo(Addr), scratch
-  //         sub scratch, 20, scratch
-  // Insts.push_back(SETHI_INST(HH22(Addr), scratch));
-  // Insts.push_back(OR_INST_I(scratch, HM10(Addr), scratch));
-  // Insts.push_back(SLLX_INST(scratch, 32, scratch));
-  // Insts.push_back(SETHI_INST(HI22(Addr), scratch2));
-  // Insts.push_back(OR_INST_R(scratch, scratch2, scratch));
-  // Insts.push_back(JMP_INST(scratch, LO10(Addr), scratch));
-  // Insts.push_back(SUB_INST(scratch, 20, scratch));
-}
-
-extern "C" void *SparcCompilationCallbackC(intptr_t StubAddr) {
-  // Get the address of the compiled code for this function.
-  intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
-
-  // Rewrite the function stub so that we don't end up here every time we
-  // execute the call. We're replacing the stub instructions with code
-  // that jumps to the compiled function:
-
-  SmallVector<uint32_t, 8> Insts;
-  intptr_t diff = (NewVal - StubAddr) >> 2;
-  if (isInt<22>(diff)) {
-    // Use branch instruction to jump
-    Insts.push_back(BA_INST(diff));
-    Insts.push_back(NOP_INST);
-  } else {
-    // Otherwise, use indirect jump to the compiled function
-    emitInstrForIndirectJump(NewVal, 1, Insts);
-  }
-
-  for (unsigned i = 0, e = Insts.size(); i != e; ++i)
-    *(uint32_t *)(StubAddr + i*4) = Insts[i];
-
-  sys::Memory::InvalidateInstructionCache((void*) StubAddr, Insts.size() * 4);
-  return (void*)StubAddr;
-}
-
-
-void SparcJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
-  llvm_unreachable("FIXME: Implement SparcJITInfo::"
-                   "replaceMachineCodeForFunction");
-}
-
-
-TargetJITInfo::StubLayout SparcJITInfo::getStubLayout() {
-  // The stub contains maximum of 4 4-byte instructions and 8 bytes for address,
-  // aligned at 32 bytes.
-  // See emitFunctionStub and emitInstrForIndirectJump for details.
-  StubLayout Result = { 4*4 + 8, 32 };
-  return Result;
-}
-
-void *SparcJITInfo::emitFunctionStub(const Function *F, void *Fn,
-                                     JITCodeEmitter &JCE)
-{
-  JCE.emitAlignment(32);
-  void *Addr = (void*) (JCE.getCurrentPCValue());
-
-  intptr_t CurrentAddr = (intptr_t)Addr;
-  intptr_t EmittedAddr;
-  SmallVector<uint32_t, 8> Insts;
-  if (Fn != (void*)(intptr_t)SparcCompilationCallback) {
-    EmittedAddr = (intptr_t)Fn;
-    intptr_t diff = (EmittedAddr - CurrentAddr) >> 2;
-    if (isInt<22>(diff)) {
-      Insts.push_back(BA_INST(diff));
-      Insts.push_back(NOP_INST);
-    }
-  } else {
-    EmittedAddr = (intptr_t)SparcCompilationCallback;
-  }
-
-  if (Insts.size() == 0)
-    emitInstrForIndirectJump(EmittedAddr, 1, Insts);
-
-
-  if (!sys::Memory::setRangeWritable(Addr, 4 * Insts.size()))
-    llvm_unreachable("ERROR: Unable to mark stub writable.");
-
-  for (unsigned i = 0, e = Insts.size(); i != e; ++i)
-    JCE.emitWordBE(Insts[i]);
-
-  sys::Memory::InvalidateInstructionCache(Addr, 4 * Insts.size());
-  if (!sys::Memory::setRangeExecutable(Addr, 4 * Insts.size()))
-    llvm_unreachable("ERROR: Unable to mark stub executable.");
-
-  return Addr;
-}
-
-
-TargetJITInfo::LazyResolverFn
-SparcJITInfo::getLazyResolverFunction(JITCompilerFn F) {
-  JITCompilerFunction = F;
-  return SparcCompilationCallback;
-}
-
-/// relocate - Before the JIT can run a block of code that has been emitted,
-/// it must rewrite the code to contain the actual addresses of any
-/// referenced global symbols.
-void SparcJITInfo::relocate(void *Function, MachineRelocation *MR,
-                            unsigned NumRelocs, unsigned char *GOTBase) {
-  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
-    void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
-    intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
-
-    switch ((SP::RelocationType) MR->getRelocationType()) {
-    case SP::reloc_sparc_hi:
-      ResultPtr = (ResultPtr >> 10) & 0x3fffff;
-      break;
-
-    case SP::reloc_sparc_lo:
-      ResultPtr = (ResultPtr & 0x3ff);
-      break;
-
-    case SP::reloc_sparc_pc30:
-      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffffff;
-      break;
-
-    case SP::reloc_sparc_pc22:
-      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffff;
-      break;
-
-    case SP::reloc_sparc_pc19:
-      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x7ffff;
-      break;
-
-    case SP::reloc_sparc_h44:
-      ResultPtr = (ResultPtr >> 22) & 0x3fffff;
-      break;
-
-    case SP::reloc_sparc_m44:
-      ResultPtr = (ResultPtr >> 12) & 0x3ff;
-      break;
-
-    case SP::reloc_sparc_l44:
-      ResultPtr = (ResultPtr & 0xfff);
-      break;
-
-    case SP::reloc_sparc_hh:
-      ResultPtr = (((int64_t)ResultPtr) >> 42) & 0x3fffff;
-      break;
-
-    case SP::reloc_sparc_hm:
-      ResultPtr = (((int64_t)ResultPtr) >> 32) & 0x3ff;
-      break;
-
-    }
-    *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
-  }
-}

diff --git a/lib/Target/Sparc/SparcJITInfo.h b/lib/Target/Sparc/SparcJITInfo.h
deleted file mode 100644
index ff1b43a..0000000
--- a/lib/Target/Sparc/SparcJITInfo.h
+++ /dev/null

@@ -1,67 +0,0 @@
-//==- SparcJITInfo.h - Sparc Implementation of the JIT Interface -*- C++ -*-==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the SparcJITInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef SPARCJITINFO_H
-#define SPARCJITINFO_H
-
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Target/TargetJITInfo.h"
-
-namespace llvm {
-class SparcTargetMachine;
-
-class SparcJITInfo : public TargetJITInfo {
-
-  bool IsPIC;
-
-  public:
-  explicit SparcJITInfo()
-    :  IsPIC(false) {}
-
-  /// replaceMachineCodeForFunction - Make it so that calling the function
-  /// whose machine code is at OLD turns into a call to NEW, perhaps by
-  /// overwriting OLD with a branch to NEW.  This is used for self-modifying
-  /// code.
-  ///
-  void replaceMachineCodeForFunction(void *Old, void *New) override;
-
-  // getStubLayout - Returns the size and alignment of the largest call stub
-  // on Sparc.
-  StubLayout getStubLayout() override;
-
-
-  /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
-  /// small native function that simply calls the function at the specified
-  /// address.
-  void *emitFunctionStub(const Function *F, void *Fn,
-                         JITCodeEmitter &JCE) override;
-
-  /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
-  LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
-
-  /// relocate - Before the JIT can run a block of code that has been emitted,
-  /// it must rewrite the code to contain the actual addresses of any
-  /// referenced global symbols.
-  void relocate(void *Function, MachineRelocation *MR,
-                unsigned NumRelocs, unsigned char *GOTBase) override;
-
-  /// Initialize - Initialize internal stage for the function being JITted.
-  void Initialize(const MachineFunction &MF, bool isPIC) {
-    IsPIC = isPIC;
-  }
-
-};
-}
-
-#endif

diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.h b/lib/Target/Sparc/SparcMachineFunctionInfo.h
index 3783c16..1047442 100644
--- a/lib/Target/Sparc/SparcMachineFunctionInfo.h
+++ b/lib/Target/Sparc/SparcMachineFunctionInfo.h

@@ -10,8 +10,8 @@
 // This file declares  Sparc specific per-machine-function information.
 //
 //===----------------------------------------------------------------------===//
-#ifndef SPARCMACHINEFUNCTIONINFO_H
-#define SPARCMACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCMACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/MachineFunction.h"
 

diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index dc1ec7c..3cca98f 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp

@@ -108,7 +108,7 @@
     return;
   }
 
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   // FIXME: it would be better to scavenge a register here instead of
   // reserving G1 all of the time.
@@ -174,7 +174,7 @@
 
   if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) {
     if (MI.getOpcode() == SP::STQFri) {
-      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
       unsigned SrcReg = MI.getOperand(2).getReg();
       unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64);
       unsigned SrcOddReg  = getSubReg(SrcReg, SP::sub_odd64);
@@ -186,7 +186,7 @@
       MI.getOperand(2).setReg(SrcOddReg);
       Offset += 8;
     } else if (MI.getOpcode() == SP::LDQFri) {
-      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
       unsigned DestReg     = MI.getOperand(0).getReg();
       unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64);
       unsigned DestOddReg  = getSubReg(DestReg, SP::sub_odd64);

diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index 77f879a..63567b0 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARCREGISTERINFO_H
-#define SPARCREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCREGISTERINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCREGISTERINFO_H
 
 #include "llvm/Target/TargetRegisterInfo.h"
 

diff --git a/lib/Target/Sparc/SparcRelocations.h b/lib/Target/Sparc/SparcRelocations.h
deleted file mode 100644
index c1ff78d..0000000
--- a/lib/Target/Sparc/SparcRelocations.h
+++ /dev/null

@@ -1,56 +0,0 @@
-//===-- SparcRelocations.h - Sparc Code Relocations -------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the Sparc target-specific relocation types
-// (for relocation-model=static).
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef SPARC_RELOCATIONS_H
-#define SPARC_RELOCATIONS_H
-
-#include "llvm/CodeGen/MachineRelocation.h"
-
-namespace llvm {
-  namespace SP {
-    enum RelocationType {
-      // reloc_sparc_hi - upper 22 bits
-      reloc_sparc_hi = 1,
-
-      // reloc_sparc_lo - lower 10 bits
-      reloc_sparc_lo = 2,
-
-      // reloc_sparc_pc30 - pc rel. 30 bits for call
-      reloc_sparc_pc30 = 3,
-
-     // reloc_sparc_pc22 - pc rel. 22 bits for branch
-      reloc_sparc_pc22 = 4,
-
-      // reloc_sparc_pc22 - pc rel. 19 bits for branch with icc/xcc
-      reloc_sparc_pc19 = 5,
-
-      // reloc_sparc_h44 - 43-22 bits
-      reloc_sparc_h44 = 6,
-
-      // reloc_sparc_m44 - 21-12 bits
-      reloc_sparc_m44 = 7,
-
-      // reloc_sparc_l44 - lower 12 bits
-      reloc_sparc_l44 = 8,
-
-      // reloc_sparc_hh - 63-42 bits
-      reloc_sparc_hh  = 9,
-
-      // reloc_sparc_hm - 41-32 bits
-      reloc_sparc_hm  = 10
-    };
-  }
-}
-
-#endif

diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.h b/lib/Target/Sparc/SparcSelectionDAGInfo.h
index 2346f41..a3a21d6 100644
--- a/lib/Target/Sparc/SparcSelectionDAGInfo.h
+++ b/lib/Target/Sparc/SparcSelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARCSELECTIONDAGINFO_H
-#define SPARCSELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_SPARC_SPARCSELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 

diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index a335778..d503b2b 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h

@@ -11,13 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARC_SUBTARGET_H
-#define SPARC_SUBTARGET_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H
+#define LLVM_LIB_TARGET_SPARC_SPARCSUBTARGET_H
 
 #include "SparcFrameLowering.h"
 #include "SparcInstrInfo.h"
 #include "SparcISelLowering.h"
-#include "SparcJITInfo.h"
 #include "SparcSelectionDAGInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -43,21 +42,25 @@
   SparcTargetLowering TLInfo;
   SparcSelectionDAGInfo TSInfo;
   SparcFrameLowering FrameLowering;
-  SparcJITInfo JITInfo;
 
 public:
   SparcSubtarget(const std::string &TT, const std::string &CPU,
                  const std::string &FS, TargetMachine &TM, bool is64bit);
 
-  const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
-  const SparcRegisterInfo *getRegisterInfo() const {
+  const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const SparcRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  const SparcTargetLowering *getTargetLowering() const { return &TLInfo; }
-  const SparcSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
-  SparcJITInfo *getJITInfo() { return &JITInfo; }
-  const DataLayout *getDataLayout() const { return &DL; }
+  const SparcTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const SparcSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const DataLayout *getDataLayout() const override { return &DL; }
 
   bool isV9() const { return IsV9; }
   bool isVIS() const { return IsVIS; }

diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 0130fac..489bb69 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp

@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SparcTargetMachine.h"
+#include "SparcTargetObjectFile.h"
 #include "Sparc.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/PassManager.h"
@@ -32,10 +33,13 @@
                                        CodeGenOpt::Level OL,
                                        bool is64bit)
   : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    TLOF(make_unique<SparcELFTargetObjectFile>()),
     Subtarget(TT, CPU, FS, *this, is64bit) {
   initAsmInfo();
 }
 
+SparcTargetMachine::~SparcTargetMachine() {}
+
 namespace {
 /// Sparc Code Generator Pass Configuration Options.
 class SparcPassConfig : public TargetPassConfig {
@@ -47,6 +51,7 @@
     return getTM<SparcTargetMachine>();
   }
 
+  void addIRPasses() override;
   bool addInstSelector() override;
   bool addPreEmitPass() override;
 };
@@ -56,15 +61,14 @@
   return new SparcPassConfig(this, PM);
 }
 
-bool SparcPassConfig::addInstSelector() {
-  addPass(createSparcISelDag(getSparcTargetMachine()));
-  return false;
+void SparcPassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getSparcTargetMachine()));
+
+  TargetPassConfig::addIRPasses();
 }
 
-bool SparcTargetMachine::addCodeEmitter(PassManagerBase &PM,
-                                        JITCodeEmitter &JCE) {
-  // Machine code emitter pass for Sparc.
-  PM.add(createSparcJITCodeEmitterPass(*this, JCE));
+bool SparcPassConfig::addInstSelector() {
+  addPass(createSparcISelDag(getSparcTargetMachine()));
   return false;
 }
 

diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 03b5137..096e7c8 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARCTARGETMACHINE_H
-#define SPARCTARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCTARGETMACHINE_H
+#define LLVM_LIB_TARGET_SPARC_SPARCTARGETMACHINE_H
 
 #include "SparcInstrInfo.h"
 #include "SparcSubtarget.h"
@@ -21,37 +21,22 @@
 namespace llvm {
 
 class SparcTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   SparcSubtarget Subtarget;
 public:
   SparcTargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS, const TargetOptions &Options,
                      Reloc::Model RM, CodeModel::Model CM,
                      CodeGenOpt::Level OL, bool is64bit);
+  ~SparcTargetMachine() override;
 
-  const SparcInstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const TargetFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
   const SparcSubtarget *getSubtargetImpl() const override { return &Subtarget; }
-  const SparcRegisterInfo *getRegisterInfo() const override {
-    return getSubtargetImpl()->getRegisterInfo();
-  }
-  const SparcTargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const SparcSelectionDAGInfo *getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
-  SparcJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 };
 
 /// SparcV8TargetMachine - Sparc 32-bit target machine

diff --git a/lib/Target/Sparc/SparcTargetObjectFile.h b/lib/Target/Sparc/SparcTargetObjectFile.h
index c60675b..76c8cca 100644
--- a/lib/Target/Sparc/SparcTargetObjectFile.h
+++ b/lib/Target/Sparc/SparcTargetObjectFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_SPARC_TARGETOBJECTFILE_H
-#define LLVM_TARGET_SPARC_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCTARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_SPARC_SPARCTARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 

diff --git a/lib/Target/Sparc/SparcTargetStreamer.h b/lib/Target/Sparc/SparcTargetStreamer.h
index 3767d8e..3b50350 100644
--- a/lib/Target/Sparc/SparcTargetStreamer.h
+++ b/lib/Target/Sparc/SparcTargetStreamer.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SPARCTARGETSTREAMER_H
-#define SPARCTARGETSTREAMER_H
+#ifndef LLVM_LIB_TARGET_SPARC_SPARCTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_SPARC_SPARCTARGETSTREAMER_H
 
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCStreamer.h"

diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 758be41..0955f4a 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp

@@ -345,7 +345,7 @@
                         SMLoc NameLoc, OperandVector &Operands) override;
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
-                               unsigned &ErrorInfo,
+                               uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
 
   // Used by the TableGen code to parse particular operand types.
@@ -677,7 +677,7 @@
 bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                                OperandVector &Operands,
                                                MCStreamer &Out,
-                                               unsigned &ErrorInfo,
+                                               uint64_t &ErrorInfo,
                                                bool MatchingInlineAsm) {
   MCInst Inst;
   unsigned MatchResult;
@@ -696,7 +696,7 @@
     // Special case the error message for the very common case where only
     // a single subtarget feature is missing
     std::string Msg = "instruction requires:";
-    unsigned Mask = 1;
+    uint64_t Mask = 1;
     for (unsigned I = 0; I < sizeof(ErrorInfo) * 8 - 1; ++I) {
       if (ErrorInfo & Mask) {
         Msg += " ";
@@ -709,7 +709,7 @@
 
   case Match_InvalidOperand: {
     SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0U) {
+    if (ErrorInfo != ~0ULL) {
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction");
 

diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index 4da2d0f..41a614d 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt

@@ -5,7 +5,7 @@
 tablegen(LLVM SystemZGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM SystemZGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM SystemZGenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM SystemZGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM SystemZGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM SystemZGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM SystemZGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM SystemZGenSubtargetInfo.inc -gen-subtarget)

diff --git a/lib/Target/SystemZ/Disassembler/LLVMBuild.txt b/lib/Target/SystemZ/Disassembler/LLVMBuild.txt
index c3081f5..fd78269 100644
--- a/lib/Target/SystemZ/Disassembler/LLVMBuild.txt
+++ b/lib/Target/SystemZ/Disassembler/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = SystemZDisassembler
 parent = SystemZ
-required_libraries = MC Support SystemZDesc SystemZInfo
+required_libraries = MC MCDisassembler Support SystemZDesc SystemZInfo
 add_to_library_groups = SystemZ

diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index 2350776..23173bf 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp

@@ -12,7 +12,6 @@
 #include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
@@ -28,11 +27,10 @@
     : MCDisassembler(STI, Ctx) {}
   virtual ~SystemZDisassembler() {}
 
-  // Override MCDisassembler.
-  DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
-                              const MemoryObject &region, uint64_t address,
-                              raw_ostream &vStream,
-                              raw_ostream &cStream) const override;
+  DecodeStatus getInstruction(MCInst &instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
 };
 } // end anonymous namespace
 
@@ -288,14 +286,13 @@
 #include "SystemZGenDisassemblerTables.inc"
 
 DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                                 const MemoryObject &Region,
+                                                 ArrayRef<uint8_t> Bytes,
                                                  uint64_t Address,
-                                                 raw_ostream &os,
-                                                 raw_ostream &cs) const {
+                                                 raw_ostream &OS,
+                                                 raw_ostream &CS) const {
   // Get the first two bytes of the instruction.
-  uint8_t Bytes[6];
   Size = 0;
-  if (Region.readBytes(Address, 2, Bytes) == -1)
+  if (Bytes.size() < 2)
     return MCDisassembler::Fail;
 
   // The top 2 bits of the first byte specify the size.
@@ -312,7 +309,7 @@
   }
 
   // Read any remaining bytes.
-  if (Size > 2 && Region.readBytes(Address + 2, Size - 2, Bytes + 2) == -1)
+  if (Bytes.size() < Size)
     return MCDisassembler::Fail;
 
   // Construct the instruction.

diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index dce482b..753903c 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SYSTEMZINSTPRINTER_H
-#define LLVM_SYSTEMZINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/Support/Compiler.h"

diff --git a/lib/Target/SystemZ/LLVMBuild.txt b/lib/Target/SystemZ/LLVMBuild.txt
index 7781318..542aaee 100644
--- a/lib/Target/SystemZ/LLVMBuild.txt
+++ b/lib/Target/SystemZ/LLVMBuild.txt

@@ -31,5 +31,5 @@
 type = Library
 name = SystemZCodeGen
 parent = SystemZ
-required_libraries = AsmPrinter CodeGen Core MC Scalar SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target
+required_libraries = AsmPrinter CodeGen Core MC SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target
 add_to_library_groups = SystemZ

diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index c46a36b..35887fa 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp

@@ -23,12 +23,5 @@
   Data64bitsDirective = "\t.quad\t";
   UsesELFSectionDirectiveForBSS = true;
   SupportsDebugInformation = true;
-  HasLEB128 = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
 }
-
-const MCSection *
-SystemZMCAsmInfo::getNonexecutableStackSection(MCContext &Ctx) const {
-  return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
-                           0, SectionKind::getMetadata());
-}

diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index 1de97af..19b5b4b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SystemZTARGETASMINFO_H
-#define SystemZTARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCASMINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoELF.h"
 #include "llvm/Support/Compiler.h"
@@ -19,9 +19,6 @@
 class SystemZMCAsmInfo : public MCAsmInfoELF {
 public:
   explicit SystemZMCAsmInfo(StringRef TT);
-
-  // Override MCAsmInfo;
-  const MCSection *getNonexecutableStackSection(MCContext &Ctx) const override;
 };
 
 } // end namespace llvm

diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
index a3aab71..52a8d1d 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCFixups.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SYSTEMZMCFIXUPS_H
-#define LLVM_SYSTEMZMCFIXUPS_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCFIXUPS_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCFIXUPS_H
 
 #include "llvm/MC/MCFixup.h"
 

diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index cc94869..6e82b6d 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp

@@ -181,15 +181,12 @@
   return new SystemZInstPrinter(MAI, MII, MRI);
 }
 
-static MCStreamer *createSystemZMCObjectStreamer(const Target &T, StringRef TT,
-                                                 MCContext &Ctx,
-                                                 MCAsmBackend &MAB,
-                                                 raw_ostream &OS,
-                                                 MCCodeEmitter *Emitter,
-                                                 const MCSubtargetInfo &STI,
-                                                 bool RelaxAll,
-                                                 bool NoExecStack) {
-  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+static MCStreamer *
+createSystemZMCObjectStreamer(const Target &T, StringRef TT, MCContext &Ctx,
+                              MCAsmBackend &MAB, raw_ostream &OS,
+                              MCCodeEmitter *Emitter,
+                              const MCSubtargetInfo &STI, bool RelaxAll) {
+  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
 }
 
 extern "C" void LLVMInitializeSystemZTargetMC() {

diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index cbaf9a8..5eb6526 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZMCTARGETDESC_H
-#define SYSTEMZMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCTARGETDESC_H
+#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCTARGETDESC_H
 
 #include "llvm/Support/DataTypes.h"
 

diff --git a/lib/Target/SystemZ/Makefile b/lib/Target/SystemZ/Makefile
index 445725b..732c317 100644
--- a/lib/Target/SystemZ/Makefile
+++ b/lib/Target/SystemZ/Makefile

@@ -15,7 +15,6 @@
 BUILT_SOURCES = SystemZGenRegisterInfo.inc \
 		SystemZGenAsmWriter.inc \
 		SystemZGenAsmMatcher.inc \
-		SystemZGenCodeEmitter.inc \
 		SystemZGenDisassemblerTables.inc \
 		SystemZGenInstrInfo.inc \
 		SystemZGenDAGISel.inc \

diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index 1579249..c8b95b2 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZ_H
-#define SYSTEMZ_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZ_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZ_H
 
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "llvm/Support/CodeGen.h"

diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 8b18bc1..f4f3ec7 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp

@@ -185,7 +185,8 @@
     MCSymbolRefExpr::Create(getSymbol(ZCPV->getGlobalValue()),
                             getModifierVariantKind(ZCPV->getModifier()),
                             OutContext);
-  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType());
+  uint64_t Size =
+      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(ZCPV->getType());
 
   OutStreamer.EmitValue(Expr, Size);
 }
@@ -229,7 +230,7 @@
     MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
     if (!Stubs.empty()) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getDataLayout();
+      const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
 
       for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
         OutStreamer.EmitLabel(Stubs[i].first);

diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.h b/lib/Target/SystemZ/SystemZAsmPrinter.h
index 20093bc..6467279 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZASMPRINTER_H
-#define SYSTEMZASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
 
 #include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/AsmPrinter.h"

diff --git a/lib/Target/SystemZ/SystemZCallingConv.h b/lib/Target/SystemZ/SystemZCallingConv.h
index 4b1569d..71605ac 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/lib/Target/SystemZ/SystemZCallingConv.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZCALLINGCONV_H
-#define SYSTEMZCALLINGCONV_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCALLINGCONV_H
 
 namespace llvm {
 namespace SystemZ {

diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.h b/lib/Target/SystemZ/SystemZConstantPoolValue.h
index 699718f..0bd8c20 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.h
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZCONSTANTPOOLVALUE_H
-#define SYSTEMZCONSTANTPOOLVALUE_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCONSTANTPOOLVALUE_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZCONSTANTPOOLVALUE_H
 
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/Support/ErrorHandling.h"

diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index dc210d6..ce99ee5 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp

@@ -47,7 +47,7 @@
     return *this;
   }
 
-  operator bool() const { return Def || Use; }
+  LLVM_EXPLICIT operator bool() const { return Def || Use; }
 
   // True if the register is defined or used in some form, either directly or
   // via a sub- or super-register.
@@ -458,7 +458,7 @@
 }
 
 bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
-  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+  TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
   TRI = &TII->getRegisterInfo();
 
   bool Changed = false;

diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 055dbe9..eff4ae3 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp

@@ -13,6 +13,7 @@
 #include "SystemZInstrInfo.h"
 #include "SystemZMachineFunctionInfo.h"
 #include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
@@ -65,7 +66,7 @@
                                      RegScavenger *RS) const {
   MachineFrameInfo *MFFrame = MF.getFrameInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   bool HasFP = hasFP(MF);
   SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
   bool IsVarArg = MF.getFunction()->isVarArg();
@@ -108,7 +109,8 @@
 // and end registers.
 static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
                         unsigned GPR64, bool IsImplicit) {
-  const TargetRegisterInfo *RI = MBB.getParent()->getTarget().getRegisterInfo();
+  const TargetRegisterInfo *RI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
   unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
   bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
   if (!IsLive || !IsImplicit) {
@@ -127,7 +129,7 @@
     return false;
 
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
   bool IsVarArg = MF.getFunction()->isVarArg();
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -216,7 +218,7 @@
     return false;
 
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
   bool HasFP = hasFP(MF);
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -292,7 +294,7 @@
     else {
       Opcode = SystemZ::AGFI;
       // Make sure we maintain 8-byte stack alignment.
-      int64_t MinVal = -int64_t(1) << 31;
+      int64_t MinVal = -uint64_t(1) << 31;
       int64_t MaxVal = (int64_t(1) << 31) - 8;
       if (ThisVal < MinVal)
         ThisVal = MinVal;
@@ -311,7 +313,7 @@
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFFrame = MF.getFrameInfo();
   auto *ZII =
-    static_cast<const SystemZInstrInfo*>(MF.getTarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineModuleInfo &MMI = MF.getMMI();
@@ -408,7 +410,7 @@
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   auto *ZII =
-    static_cast<const SystemZInstrInfo*>(MF.getTarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
 
   // Skip the return instruction.

diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 4d5fe6d..cefa56f 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZFRAMELOWERING_H
-#define SYSTEMZFRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
 
 #include "llvm/ADT/IndexedMap.h"
 #include "llvm/Target/TargetFrameLowering.h"

diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 24f7584..5f84624 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp

@@ -140,7 +140,7 @@
   }
 
   const SystemZInstrInfo *getInstrInfo() const {
-    return getTargetMachine().getInstrInfo();
+    return getTargetMachine().getSubtargetImpl()->getInstrInfo();
   }
 
   // Try to fold more of the base or index of AM into AM, where IsBase
@@ -315,9 +315,9 @@
 
 public:
   SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
-    : SelectionDAGISel(TM, OptLevel),
-      Lowering(*TM.getTargetLowering()),
-      Subtarget(*TM.getSubtargetImpl()) { }
+      : SelectionDAGISel(TM, OptLevel),
+        Lowering(*TM.getSubtargetImpl()->getTargetLowering()),
+        Subtarget(*TM.getSubtargetImpl()) {}
 
   // Override MachineFunctionPass.
   const char *getPassName() const override {
@@ -1000,8 +1000,8 @@
   if (V1 == V2 && End1 == End2)
     return false;
 
-  return !AA->alias(AliasAnalysis::Location(V1, End1, Load->getTBAAInfo()),
-                    AliasAnalysis::Location(V2, End2, Store->getTBAAInfo()));
+  return !AA->alias(AliasAnalysis::Location(V1, End1, Load->getAAInfo()),
+                    AliasAnalysis::Location(V2, End2, Store->getAAInfo()));
 }
 
 bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {

diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 00c65f5..b282fca 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp

@@ -81,7 +81,7 @@
 }
 
 SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm)
-    : TargetLowering(tm, new TargetLoweringObjectFileELF()),
+    : TargetLowering(tm),
       Subtarget(tm.getSubtarget<SystemZSubtarget>()) {
   MVT PtrVT = getPointerTy();
 
@@ -339,9 +339,10 @@
   return Imm.isZero() || Imm.isNegZero();
 }
 
-bool SystemZTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                          unsigned,
-                                                          bool *Fast) const {
+bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                           unsigned,
+                                                           unsigned,
+                                                           bool *Fast) const {
   // Unaligned accesses should never be slower than the expanded version.
   // We check specifically for aligned accesses in the few cases where
   // they are required.
@@ -674,12 +675,11 @@
   SystemZMachineFunctionInfo *FuncInfo =
     MF.getInfo<SystemZMachineFunctionInfo>();
   auto *TFL = static_cast<const SystemZFrameLowering *>(
-      DAG.getTarget().getFrameLowering());
+      DAG.getSubtarget().getFrameLowering());
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
 
   unsigned NumFixedGPRs = 0;
@@ -782,7 +782,7 @@
   return Chain;
 }
 
-static bool canUseSiblingCall(CCState ArgCCInfo,
+static bool canUseSiblingCall(const CCState &ArgCCInfo,
                               SmallVectorImpl<CCValAssign> &ArgLocs) {
   // Punt if there are any indirect or stack arguments, or if the call
   // needs the call-saved argument register R6.
@@ -817,8 +817,7 @@
 
   // Analyze the operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState ArgCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), ArgLocs,
-                    *DAG.getContext());
+  CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
 
   // We don't support GuaranteedTailCallOpt, only automatically-detected
@@ -915,7 +914,8 @@
                                   RegsToPass[I].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
@@ -940,8 +940,7 @@
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RetLocs;
-  CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs,
-                    *DAG.getContext());
+  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
   RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
 
   // Copy all of the result registers out of their specified physreg.
@@ -972,8 +971,7 @@
 
   // Assign locations to each returned value.
   SmallVector<CCValAssign, 16> RetLocs;
-  CCState RetCCInfo(CallConv, IsVarArg, MF, DAG.getTarget(), RetLocs,
-                    *DAG.getContext());
+  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
   RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
 
   // Quick exit for void returns
@@ -1191,7 +1189,7 @@
                            Load->getChain(), Load->getBasePtr(),
                            Load->getPointerInfo(), Load->getMemoryVT(),
                            Load->isVolatile(), Load->isNonTemporal(),
-                           Load->getAlignment());
+                           Load->isInvariant(), Load->getAlignment());
 
   // Make sure that the second operand is an i32 with the right value.
   if (C.Op1.getValueType() != MVT::i32 ||
@@ -2614,7 +2612,7 @@
 SystemZTargetLowering::emitSelect(MachineInstr *MI,
                                   MachineBasicBlock *MBB) const {
   const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(
-      MBB->getParent()->getTarget().getInstrInfo());
+      MBB->getParent()->getSubtarget().getInstrInfo());
 
   unsigned DestReg  = MI->getOperand(0).getReg();
   unsigned TrueReg  = MI->getOperand(1).getReg();
@@ -2663,7 +2661,7 @@
                                      unsigned StoreOpcode, unsigned STOCOpcode,
                                      bool Invert) const {
   const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(
-      MBB->getParent()->getTarget().getInstrInfo());
+      MBB->getParent()->getSubtarget().getInstrInfo());
 
   unsigned SrcReg     = MI->getOperand(0).getReg();
   MachineOperand Base = MI->getOperand(1);
@@ -2732,7 +2730,7 @@
                                             bool Invert) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   bool IsSubWord = (BitSize < 32);
 
@@ -2802,14 +2800,10 @@
     unsigned Tmp = MRI.createVirtualRegister(RC);
     BuildMI(MBB, DL, TII->get(BinOpcode), Tmp)
       .addReg(RotatedOldVal).addOperand(Src2);
-    if (BitSize < 32)
+    if (BitSize <= 32)
       // XILF with the upper BitSize bits set.
       BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
-        .addReg(Tmp).addImm(uint32_t(~0 << (32 - BitSize)));
-    else if (BitSize == 32)
-      // XILF with every bit set.
-      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
-        .addReg(Tmp).addImm(~uint32_t(0));
+        .addReg(Tmp).addImm(-1U << (32 - BitSize));
     else {
       // Use LCGR and add -1 to the result, which is more compact than
       // an XILF, XILH pair.
@@ -2856,7 +2850,7 @@
                                             unsigned BitSize) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   bool IsSubWord = (BitSize < 32);
 
@@ -2968,7 +2962,7 @@
                                           MachineBasicBlock *MBB) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
   // Extract the operands.  Base can be a register or a frame index.
@@ -3085,7 +3079,7 @@
                                   bool ClearEven, unsigned SubReg) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -3117,7 +3111,7 @@
                                          unsigned Opcode) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
 
@@ -3287,7 +3281,7 @@
                                          unsigned Opcode) const {
   MachineFunction &MF = *MBB->getParent();
   const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
   MachineRegisterInfo &MRI = MF.getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
 

diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index e21b050..887c236 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_SystemZ_ISELLOWERING_H
-#define LLVM_TARGET_SystemZ_ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZISELLOWERING_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZISELLOWERING_H
 
 #include "SystemZ.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -208,8 +208,9 @@
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
   bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
-  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
-                                     bool *Fast) const override;
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
+                                      unsigned Align,
+                                      bool *Fast) const override;
   bool isTruncateFree(Type *, Type *) const override;
   bool isTruncateFree(EVT, EVT) const override;
   const char *getTargetNodeName(unsigned Opcode) const override;
@@ -321,4 +322,4 @@
 };
 } // end namespace llvm
 
-#endif // LLVM_TARGET_SystemZ_ISELLOWERING_H
+#endif

diff --git a/lib/Target/SystemZ/SystemZInstrBuilder.h b/lib/Target/SystemZ/SystemZInstrBuilder.h
index 84196e9..464f79a 100644
--- a/lib/Target/SystemZ/SystemZInstrBuilder.h
+++ b/lib/Target/SystemZ/SystemZInstrBuilder.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZINSTRBUILDER_H
-#define SYSTEMZINSTRBUILDER_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRBUILDER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRBUILDER_H
 
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"

diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index f58ab47..8ff9553 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp

@@ -633,7 +633,7 @@
   LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
     : RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
 
-  operator bool() const { return RegSize; }
+  LLVM_EXPLICIT operator bool() const { return RegSize; }
 
   unsigned RegSize, ImmLSB, ImmSize;
 };

diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 83009cb..d2e3f54 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_SYSTEMZINSTRINFO_H
-#define LLVM_TARGET_SYSTEMZINSTRINFO_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRINFO_H
 
 #include "SystemZ.h"
 #include "SystemZRegisterInfo.h"

diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index 8081334..8dab44e 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp

@@ -448,7 +448,7 @@
 }
 
 bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) {
-  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+  TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
   MF = &F;
   uint64_t Size = initMBBInfo();
   if (Size <= MaxForwardRange || !mustRelaxABranch())

diff --git a/lib/Target/SystemZ/SystemZMCInstLower.h b/lib/Target/SystemZ/SystemZMCInstLower.h
index 90447ff..7173cfa 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.h
+++ b/lib/Target/SystemZ/SystemZMCInstLower.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SYSTEMZMCINSTLOWER_H
-#define LLVM_SYSTEMZMCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
 
 #include "llvm/MC/MCExpr.h"
 #include "llvm/Support/Compiler.h"

diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index 50865f1..92c2ce7 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZMACHINEFUNCTIONINFO_H
-#define SYSTEMZMACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/MachineFunction.h"
 

diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index f03bcc4..64f5eeb 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp

@@ -35,7 +35,7 @@
 BitVector
 SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   if (TFI->hasFP(MF)) {
     // R11D is the frame pointer.  Reserve all aliases.
@@ -62,8 +62,8 @@
   MachineBasicBlock &MBB = *MI->getParent();
   MachineFunction &MF = *MBB.getParent();
   auto *TII =
-      static_cast<const SystemZInstrInfo *>(MF.getTarget().getInstrInfo());
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   DebugLoc DL = MI->getDebugLoc();
 
   // Decompose the frame index into a base and offset.
@@ -134,6 +134,6 @@
 
 unsigned
 SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   return TFI->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D;
 }

diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 9bffa46..212fe91 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SystemZREGISTERINFO_H
-#define SystemZREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
 
 #include "SystemZ.h"
 #include "llvm/Target/TargetRegisterInfo.h"

diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index e9de146..a257d6b 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZSELECTIONDAGINFO_H
-#define SYSTEMZSELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 

diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index aad899c..ec7a8c4 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp

@@ -150,7 +150,7 @@
 }
 
 bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
-  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+  TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
 
   bool Changed = false;
   for (auto &MBB : F)

diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index 4e8c710..f881552 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef SYSTEMZSUBTARGET_H
-#define SYSTEMZSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSUBTARGET_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZSUBTARGET_H
 
 #include "SystemZFrameLowering.h"
 #include "SystemZISelLowering.h"
@@ -55,14 +55,20 @@
   SystemZSubtarget(const std::string &TT, const std::string &CPU,
                    const std::string &FS, const TargetMachine &TM);
 
-  const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; }
-  const SystemZInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  const DataLayout *getDataLayout() const { return &DL; }
-  const SystemZRegisterInfo *getRegisterInfo() const {
+  const TargetFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const SystemZInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const SystemZRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  const SystemZTargetLowering *getTargetLowering() const { return &TLInfo; }
-  const TargetSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
+  const SystemZTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
 
   // This is important for reducing register pressure in vector code.
   bool useAA() const override { return true; }

diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 0122e99..d7c432e 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp

@@ -11,6 +11,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 
 using namespace llvm;
 
@@ -25,10 +26,13 @@
                                            Reloc::Model RM, CodeModel::Model CM,
                                            CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      TLOF(make_unique<TargetLoweringObjectFileELF>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }
 
+SystemZTargetMachine::~SystemZTargetMachine() {}
+
 namespace {
 /// SystemZ Code Generator Pass Configuration Options.
 class SystemZPassConfig : public TargetPassConfig {
@@ -49,7 +53,6 @@
 
 void SystemZPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
-  addPass(createPartiallyInlineLibCallsPass());
 }
 
 bool SystemZPassConfig::addInstSelector() {

diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index ded07e9..9fae5e4 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h

@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 
-#ifndef SYSTEMZTARGETMACHINE_H
-#define SYSTEMZTARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
+#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
 
 #include "SystemZSubtarget.h"
 #include "llvm/Target/TargetMachine.h"
@@ -23,6 +23,7 @@
 class TargetFrameLowering;
 
 class SystemZTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   SystemZSubtarget        Subtarget;
 
 public:
@@ -30,32 +31,17 @@
                        StringRef FS, const TargetOptions &Options,
                        Reloc::Model RM, CodeModel::Model CM,
                        CodeGenOpt::Level OL);
+  ~SystemZTargetMachine() override;
 
   // Override TargetMachine.
-  const TargetFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  const SystemZInstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
   const SystemZSubtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
-  const SystemZRegisterInfo *getRegisterInfo() const override {
-    return getSubtargetImpl()->getRegisterInfo();
-  }
-  const SystemZTargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
-
   // Override LLVMTargetMachine
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 };
 
 } // end namespace llvm

diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index d277f82..4b51b3f 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp

@@ -49,7 +49,7 @@
 void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
   // The DataLayoutPass must now be in sync with the module. Unfortunatelly we
   // cannot enforce that from the C api.
-  unwrap(PM)->add(new DataLayoutPass(*unwrap(TD)));
+  unwrap(PM)->add(new DataLayoutPass());
 }
 
 void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,

diff --git a/lib/Target/TargetJITInfo.cpp b/lib/Target/TargetJITInfo.cpp
deleted file mode 100644
index aafedf8..0000000
--- a/lib/Target/TargetJITInfo.cpp
+++ /dev/null

@@ -1,14 +0,0 @@
-//===- Target/TargetJITInfo.h - Target Information for JIT ------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/TargetJITInfo.h"
-
-using namespace llvm;
-
-void TargetJITInfo::anchor() { }

diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 6ec0b1f..bca56b5 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp

@@ -28,8 +28,12 @@
     "_IO_putc",
     "_ZdaPv",
     "_ZdaPvRKSt9nothrow_t",
+    "_ZdaPvj",
+    "_ZdaPvm",
     "_ZdlPv",
     "_ZdlPvRKSt9nothrow_t",
+    "_ZdlPvj",
+    "_ZdlPvm",
     "_Znaj",
     "_ZnajRKSt9nothrow_t",
     "_Znam",
@@ -47,6 +51,8 @@
     "__isoc99_scanf",
     "__isoc99_sscanf",
     "__memcpy_chk",
+    "__memmove_chk",
+    "__memset_chk",
     "__sincospi_stret",
     "__sincospif_stret",
     "__sinpi",
@@ -54,7 +60,11 @@
     "__sqrt_finite",
     "__sqrtf_finite",
     "__sqrtl_finite",
+    "__stpcpy_chk",
+    "__stpncpy_chk",
+    "__strcpy_chk",
     "__strdup",
+    "__strncpy_chk",
     "__strndup",
     "__strtok_r",
     "abs",
@@ -348,7 +358,7 @@
 
 static bool hasSinCosPiStret(const Triple &T) {
   // Only Darwin variants have _stret versions of combined trig functions.
-  if (!T.isMacOSX() && T.getOS() != Triple::IOS)
+  if (!T.isOSDarwin())
     return false;
 
   // The ABI is rather complicated on x86, so don't do anything special there.
@@ -358,7 +368,7 @@
   if (T.isMacOSX() && T.isMacOSXVersionLT(10, 9))
     return false;
 
-  if (T.getOS() == Triple::IOS && T.isOSVersionLT(7, 0))
+  if (T.isiOS() && T.isOSVersionLT(7, 0))
     return false;
 
   return true;
@@ -426,7 +436,7 @@
     TLI.setUnavailable(LibFunc::fiprintf);
   }
 
-  if (T.isKnownWindowsMSVCEnvironment()) {
+  if (T.isOSWindows() && !T.isOSCygMing()) {
     // Win32 does not support long double
     TLI.setUnavailable(LibFunc::acosl);
     TLI.setUnavailable(LibFunc::asinl);

diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 39e0459..01139fb 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp

@@ -30,6 +30,7 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -42,7 +43,7 @@
 void TargetLoweringObjectFile::Initialize(MCContext &ctx,
                                           const TargetMachine &TM) {
   Ctx = &ctx;
-  DL = TM.getDataLayout();
+  DL = TM.getSubtargetImpl()->getDataLayout();
   InitMCObjectFileInfo(TM.getTargetTriple(),
                        TM.getRelocationModel(), TM.getCodeModel(), *Ctx);
 }
@@ -199,7 +200,8 @@
       // Otherwise, just drop it into a mergable constant section.  If we have
       // a section for this size, use it, otherwise use the arbitrary sized
       // mergable section.
-      switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) {
+      switch (TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+          C->getType())) {
       case 4:  return SectionKind::getMergeableConst4();
       case 8:  return SectionKind::getMergeableConst8();
       case 16: return SectionKind::getMergeableConst16();
@@ -273,31 +275,13 @@
   return false;
 }
 
-// Lame default implementation. Calculate the section name for global.
-const MCSection *
-TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
-                                                 SectionKind Kind,
-                                                 Mangler &Mang,
-                                                 const TargetMachine &TM) const{
-  assert(!Kind.isThreadLocal() && "Doesn't support TLS");
-
-  if (Kind.isText())
-    return getTextSection();
-
-  if (Kind.isBSS() && BSSSection != nullptr)
-    return BSSSection;
-
-  if (Kind.isReadOnly() && ReadOnlySection != nullptr)
-    return ReadOnlySection;
-
-  return getDataSection();
-}
 
 /// getSectionForConstant - Given a mergable constant with the
 /// specified size and relocation information, return a section that it
 /// should be placed in.
 const MCSection *
-TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind) const {
+TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind,
+                                                const Constant *C) const {
   if (Kind.isReadOnly() && ReadOnlySection != nullptr)
     return ReadOnlySection;
 

diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 95c8cb6..309e1bf 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp

@@ -26,6 +26,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 //---------------------------------------------------------------------------
@@ -47,17 +48,13 @@
 }
 
 /// \brief Reset the target options based on the function's attributes.
-void TargetMachine::resetTargetOptions(const MachineFunction *MF) const {
-  const Function *F = MF->getFunction();
-  TargetOptions &TO = MF->getTarget().Options;
-
-#define RESET_OPTION(X, Y)                                              \
-  do {                                                                  \
-    if (F->hasFnAttribute(Y))                                           \
-      TO.X =                                                            \
-        (F->getAttributes().                                            \
-           getAttribute(AttributeSet::FunctionIndex,                    \
-                        Y).getValueAsString() == "true");               \
+void TargetMachine::resetTargetOptions(const Function &F) const {
+#define RESET_OPTION(X, Y)                                                     \
+  do {                                                                         \
+    if (F.hasFnAttribute(Y))                                                  \
+      Options.X = (F.getAttributes()                                          \
+                       .getAttribute(AttributeSet::FunctionIndex, Y)           \
+                       .getValueAsString() == "true");                         \
   } while (0)
 
   RESET_OPTION(NoFramePointerElim, "no-frame-pointer-elim");
@@ -68,7 +65,7 @@
   RESET_OPTION(UseSoftFloat, "use-soft-float");
   RESET_OPTION(DisableTailCalls, "disable-tail-calls");
 
-  TO.MCOptions.SanitizeAddress = F->hasFnAttribute(Attribute::SanitizeAddress);
+  Options.MCOptions.SanitizeAddress = F.hasFnAttribute(Attribute::SanitizeAddress);
 }
 
 /// getRelocationModel - Returns the code generation relocation model. The
@@ -183,7 +180,7 @@
   }
   SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, *this);
   const TargetLoweringObjectFile &TLOF =
-      getTargetLowering()->getObjFileLowering();
+      getSubtargetImpl()->getTargetLowering()->getObjFileLowering();
   const MCSection *TheSection = TLOF.SectionForGlobal(GV, GVKind, Mang, *this);
   bool CannotUsePrivateLabel = TLOF.isSectionAtomizableBySymbols(*TheSection);
   Mang.getNameWithPrefix(Name, GV, CannotUsePrivateLabel);
@@ -193,6 +190,6 @@
   SmallString<60> NameStr;
   getNameWithPrefix(NameStr, GV, Mang);
   const TargetLoweringObjectFile &TLOF =
-      getTargetLowering()->getObjFileLowering();
+      getSubtargetImpl()->getTargetLowering()->getObjFileLowering();
   return TLOF.getContext().GetOrCreateSymbol(NameStr.str());
 }

diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 20923c9..b3e07df 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp

@@ -24,6 +24,7 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
@@ -172,7 +173,7 @@
 }
 
 LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
-  return wrap(unwrap(T)->getDataLayout());
+  return wrap(unwrap(T)->getSubtargetImpl()->getDataLayout());
 }
 
 void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
@@ -189,7 +190,7 @@
 
   std::string error;
 
-  const DataLayout* td = TM->getDataLayout();
+  const DataLayout *td = TM->getSubtargetImpl()->getDataLayout();
 
   if (!td) {
     error = "No DataLayout in TargetMachine";
@@ -197,7 +198,7 @@
     return true;
   }
   Mod->setDataLayout(td);
-  pass.add(new DataLayoutPass(Mod));
+  pass.add(new DataLayoutPass());
 
   TargetMachine::CodeGenFileType ft;
   switch (codegen) {
@@ -222,10 +223,10 @@
 
 LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
   char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
-  std::string error;
-  raw_fd_ostream dest(Filename, error, sys::fs::F_None);
-  if (!error.empty()) {
-    *ErrorMessage = strdup(error.c_str());
+  std::error_code EC;
+  raw_fd_ostream dest(Filename, EC, sys::fs::F_None);
+  if (EC) {
+    *ErrorMessage = strdup(EC.message().c_str());
     return true;
   }
   formatted_raw_ostream destf(dest);

diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp
index 87b6b66..10597a8 100644
--- a/lib/Target/TargetSubtargetInfo.cpp
+++ b/lib/Target/TargetSubtargetInfo.cpp

@@ -39,7 +39,7 @@
   return enableMachineScheduler();
 }
 
-bool TargetSubtargetInfo::enableAtomicExpandLoadLinked() const {
+bool TargetSubtargetInfo::enableAtomicExpand() const {
   return true;
 }
 
@@ -53,16 +53,7 @@
 }
 
 bool TargetSubtargetInfo::enablePostMachineScheduler() const {
-  return false;
-}
-
-bool TargetSubtargetInfo::enablePostRAScheduler(
-          CodeGenOpt::Level OptLevel,
-          AntiDepBreakMode& Mode,
-          RegClassVector& CriticalPathRCs) const {
-  Mode = ANTIDEP_NONE;
-  CriticalPathRCs.clear();
-  return false;
+  return getSchedModel().PostRAScheduler;
 }
 
 bool TargetSubtargetInfo::useAA() const {

diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index e2c4be7..861a41d 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk

@@ -12,8 +12,6 @@
 
 x86_codegen_SRC_FILES := \
   X86AsmPrinter.cpp \
-  X86AtomicExpandPass.cpp \
-  X86CodeEmitter.cpp \
   X86FastISel.cpp \
   X86FixupLEAs.cpp \
   X86FloatingPoint.cpp \
@@ -21,7 +19,6 @@
   X86ISelDAGToDAG.cpp \
   X86ISelLowering.cpp \
   X86InstrInfo.cpp \
-  X86JITInfo.cpp \
   X86MachineFunctionInfo.cpp \
   X86MCInstLower.cpp \
   X86PadShortFunction.cpp \

diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt
index b022a41..2c1926e 100644
--- a/lib/Target/X86/AsmParser/CMakeLists.txt
+++ b/lib/Target/X86/AsmParser/CMakeLists.txt

@@ -1,4 +1,7 @@
 add_llvm_library(LLVMX86AsmParser
   X86AsmInstrumentation.cpp
   X86AsmParser.cpp
+
+  LINK_LIBS
+  LLVMX86CodeGen
   )

diff --git a/lib/Target/X86/AsmParser/LLVMBuild.txt b/lib/Target/X86/AsmParser/LLVMBuild.txt
index 9f94d5d..284bfd0 100644
--- a/lib/Target/X86/AsmParser/LLVMBuild.txt
+++ b/lib/Target/X86/AsmParser/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = X86AsmParser
 parent = X86
-required_libraries = MC MCParser Support X86Desc X86Info
+required_libraries = MC MCParser Support X86CodeGen X86Desc X86Info
 add_to_library_groups = X86

diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index a365f62..9c49a11 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp

@@ -10,9 +10,12 @@
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "X86AsmInstrumentation.h"
 #include "X86Operand.h"
+#include "X86RegisterInfo.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
@@ -23,6 +26,73 @@
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/Support/CommandLine.h"
+#include <algorithm>
+#include <cassert>
+#include <vector>
+
+// Following comment describes how assembly instrumentation works.
+// Currently we have only AddressSanitizer instrumentation, but we're
+// planning to implement MemorySanitizer for inline assembly too. If
+// you're not familiar with the AddressSanitizer algorithm, please read
+// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm.
+//
+// When inline assembly is parsed by an instance of X86AsmParser, all
+// instructions are emitted via EmitInstruction method. That's the
+// place where X86AsmInstrumentation analyzes an instruction and
+// decides, whether the instruction should be emitted as is or
+// instrumentation is required. The latter case happens when an
+// instruction reads from or writes to memory. Now instruction opcode
+// is explicitly checked, and if an instruction has a memory operand
+// (for instance, movq (%rsi, %rcx, 8), %rax) - it should be
+// instrumented.  There also exist instructions that modify
+// memory but don't have explicit memory operands, for instance,
+// movs.
+//
+// Let's consider at first 8-byte memory accesses when an instruction
+// has an explicit memory operand. In this case we need two registers -
+// AddressReg to compute the address of the memory cells which are accessed
+// and ShadowReg to compute corresponding shadow address. So, we need
+// to spill both registers before instrumentation code and restore them
+// after instrumentation. Thus, in general, instrumentation code will
+// look like this:
+// PUSHF  # Store flags, otherwise they will be overwritten
+// PUSH AddressReg  # spill AddressReg
+// PUSH ShadowReg   # spill ShadowReg
+// LEA MemOp, AddressReg  # compute address of the memory operand
+// MOV AddressReg, ShadowReg
+// SHR ShadowReg, 3
+// # ShadowOffset(AddressReg >> 3) contains address of a shadow
+// # corresponding to MemOp.
+// CMP ShadowOffset(ShadowReg), 0  # test shadow value
+// JZ .Done  # when shadow equals zero, everything is fine
+// MOV AddressReg, RDI
+// # Call __asan_report function with AddressReg as an argument
+// CALL __asan_report
+// .Done:
+// POP ShadowReg  # Restore ShadowReg
+// POP AddressReg  # Restore AddressReg
+// POPF  # Restore flags
+//
+// Memory accesses of different sizes (1-, 2-, 4- and 16-byte) are
+// handled in a similar manner, but small memory accesses (less than 8
+// bytes) require an additional ScratchReg, which is used for shadow value.
+//
+// If, suppose, we're instrumenting an instruction like movs, only
+// contents of RDI, RDI + AccessSize * RCX, RSI, RSI + AccessSize *
+// RCX are checked.  In this case there's no need to spill and restore
+// AddressReg, ShadowReg or flags four times; they're saved on the stack
+// just once, before instrumentation of these four addresses, and restored
+// at the end of the instrumentation.
+//
+// There exist several things which complicate this simple algorithm.
+// * Instrumented memory operand can have RSP as a base or an index
+//   register.  So we need to add a constant offset before computation
+//   of memory address, since flags, AddressReg, ShadowReg, etc. were
+//   already stored on stack and RSP was modified.
+// * Debug info (usually, DWARF) should be adjusted, because sometimes
+//   RSP is used as a frame register. So, we need to select some
+//   register as a frame register and temporarily override the current CFA
+//   register.
 
 namespace llvm {
 namespace {
@@ -32,10 +102,23 @@
     cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
     cl::init(false));
 
-bool IsStackReg(unsigned Reg) {
-  return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP;
+const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min();
+const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max();
+
+int64_t ApplyDisplacementBounds(int64_t Displacement) {
+  return std::max(std::min(MaxAllowedDisplacement, Displacement),
+                  MinAllowedDisplacement);
 }
 
+void CheckDisplacementBounds(int64_t Displacement) {
+  assert(Displacement >= MinAllowedDisplacement &&
+         Displacement <= MaxAllowedDisplacement);
+}
+
+bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; }
+
+bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+
 std::string FuncName(unsigned AccessSize, bool IsWrite) {
   return std::string("__asan_report_") + (IsWrite ? "store" : "load") +
          utostr(AccessSize);
@@ -43,60 +126,245 @@
 
 class X86AddressSanitizer : public X86AsmInstrumentation {
 public:
-  X86AddressSanitizer(const MCSubtargetInfo &STI) : STI(STI) {}
+  struct RegisterContext {
+  private:
+    enum RegOffset {
+      REG_OFFSET_ADDRESS = 0,
+      REG_OFFSET_SHADOW,
+      REG_OFFSET_SCRATCH
+    };
+
+  public:
+    RegisterContext(unsigned AddressReg, unsigned ShadowReg,
+                    unsigned ScratchReg) {
+      BusyRegs.push_back(convReg(AddressReg, MVT::i64));
+      BusyRegs.push_back(convReg(ShadowReg, MVT::i64));
+      BusyRegs.push_back(convReg(ScratchReg, MVT::i64));
+    }
+
+    unsigned AddressReg(MVT::SimpleValueType VT) const {
+      return convReg(BusyRegs[REG_OFFSET_ADDRESS], VT);
+    }
+
+    unsigned ShadowReg(MVT::SimpleValueType VT) const {
+      return convReg(BusyRegs[REG_OFFSET_SHADOW], VT);
+    }
+
+    unsigned ScratchReg(MVT::SimpleValueType VT) const {
+      return convReg(BusyRegs[REG_OFFSET_SCRATCH], VT);
+    }
+
+    void AddBusyReg(unsigned Reg) {
+      if (Reg != X86::NoRegister)
+        BusyRegs.push_back(convReg(Reg, MVT::i64));
+    }
+
+    void AddBusyRegs(const X86Operand &Op) {
+      AddBusyReg(Op.getMemBaseReg());
+      AddBusyReg(Op.getMemIndexReg());
+    }
+
+    unsigned ChooseFrameReg(MVT::SimpleValueType VT) const {
+      static const unsigned Candidates[] = { X86::RBP, X86::RAX, X86::RBX,
+                                             X86::RCX, X86::RDX, X86::RDI,
+                                             X86::RSI };
+      for (unsigned Reg : Candidates) {
+        if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg))
+          return convReg(Reg, VT);
+      }
+      return X86::NoRegister;
+    }
+
+  private:
+    unsigned convReg(unsigned Reg, MVT::SimpleValueType VT) const {
+      return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, VT);
+    }
+
+    std::vector<unsigned> BusyRegs;
+  };
+
+  X86AddressSanitizer(const MCSubtargetInfo &STI)
+      : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
+
   virtual ~X86AddressSanitizer() {}
 
   // X86AsmInstrumentation implementation:
-  virtual void InstrumentInstruction(
-      const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
-      const MCInstrInfo &MII, MCStreamer &Out) override {
+  virtual void InstrumentAndEmitInstruction(const MCInst &Inst,
+                                            OperandVector &Operands,
+                                            MCContext &Ctx,
+                                            const MCInstrInfo &MII,
+                                            MCStreamer &Out) override {
+    InstrumentMOVS(Inst, Operands, Ctx, MII, Out);
+    if (RepPrefix)
+      EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX));
+
     InstrumentMOV(Inst, Operands, Ctx, MII, Out);
+
+    RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX);
+    if (!RepPrefix)
+      EmitInstruction(Out, Inst);
   }
 
-  // Should be implemented differently in x86_32 and x86_64 subclasses.
-  virtual void InstrumentMemOperandSmallImpl(
-      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-      MCStreamer &Out) = 0;
-  virtual void InstrumentMemOperandLargeImpl(
-      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-      MCStreamer &Out) = 0;
+  // Adjusts up stack and saves all registers used in instrumentation.
+  virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) = 0;
 
-  void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize,
-                            bool IsWrite, MCContext &Ctx, MCStreamer &Out);
+  // Restores all registers used in instrumentation and adjusts stack.
+  virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) = 0;
+
+  virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx, MCStreamer &Out) = 0;
+  virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx, MCStreamer &Out) = 0;
+
+  virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+                                  MCStreamer &Out) = 0;
+
+  void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite,
+                            const RegisterContext &RegCtx, MCContext &Ctx,
+                            MCStreamer &Out);
+  void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg,
+                          unsigned AccessSize, MCContext &Ctx, MCStreamer &Out);
+
+  void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands,
+                      MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
   void InstrumentMOV(const MCInst &Inst, OperandVector &Operands,
                      MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
-  void EmitInstruction(MCStreamer &Out, const MCInst &Inst) {
-    Out.EmitInstruction(Inst, STI);
-  }
-
-  void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
 
 protected:
-  const MCSubtargetInfo &STI;
+  void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); }
+
+  void EmitLEA(X86Operand &Op, MVT::SimpleValueType VT, unsigned Reg,
+               MCStreamer &Out) {
+    assert(VT == MVT::i32 || VT == MVT::i64);
+    MCInst Inst;
+    Inst.setOpcode(VT == MVT::i32 ? X86::LEA32r : X86::LEA64r);
+    Inst.addOperand(MCOperand::CreateReg(getX86SubSuperRegister(Reg, VT)));
+    Op.addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  void ComputeMemOperandAddress(X86Operand &Op, MVT::SimpleValueType VT,
+                                unsigned Reg, MCContext &Ctx, MCStreamer &Out);
+
+  // Creates a new memory operand with Displacement added to the original
+  // displacement. *Residue receives the part of the total displacement that
+  // could not be folded into the operand (e.g. it exceeds the 32-bit range).
+  std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op,
+                                              int64_t Displacement,
+                                              MCContext &Ctx, int64_t *Residue);
+
+  // True when previous instruction was actually REP prefix.
+  bool RepPrefix;
+
+  // Offset from the original SP register.
+  int64_t OrigSPOffset;
 };
 
 void X86AddressSanitizer::InstrumentMemOperand(
-    MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-    MCStreamer &Out) {
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
   assert(Op.isMem() && "Op should be a memory operand.");
   assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 &&
          "AccessSize should be a power of two, less or equal than 16.");
-
-  X86Operand &MemOp = static_cast<X86Operand &>(Op);
-  // FIXME: get rid of this limitation.
-  if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg()))
-    return;
-
   // FIXME: take into account load/store alignment.
-  if (AccessSize < 8)
-    InstrumentMemOperandSmallImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
+  if (IsSmallMemAccess(AccessSize))
+    InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
   else
-    InstrumentMemOperandLargeImpl(MemOp, AccessSize, IsWrite, Ctx, Out);
+    InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out);
 }
 
-void X86AddressSanitizer::InstrumentMOV(
-    const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
-    const MCInstrInfo &MII, MCStreamer &Out) {
+void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg,
+                                             unsigned CntReg,
+                                             unsigned AccessSize,
+                                             MCContext &Ctx, MCStreamer &Out) {
+  // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)]
+  // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)].
+  RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */,
+                         IsSmallMemAccess(AccessSize)
+                             ? X86::RBX
+                             : X86::NoRegister /* ScratchReg */);
+  RegCtx.AddBusyReg(DstReg);
+  RegCtx.AddBusyReg(SrcReg);
+  RegCtx.AddBusyReg(CntReg);
+
+  InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
+
+  // Test (%SrcReg)
+  {
+    const MCExpr *Disp = MCConstantExpr::Create(0, Ctx);
+    std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+        0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc()));
+    InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
+                         Out);
+  }
+
+  // Test -1(%SrcReg, %CntReg, AccessSize)
+  {
+    const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx);
+    std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+        0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), SMLoc()));
+    InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx,
+                         Out);
+  }
+
+  // Test (%DstReg)
+  {
+    const MCExpr *Disp = MCConstantExpr::Create(0, Ctx);
+    std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+        0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc()));
+    InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
+  }
+
+  // Test -1(%DstReg, %CntReg, AccessSize)
+  {
+    const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx);
+    std::unique_ptr<X86Operand> Op(X86Operand::CreateMem(
+        0, Disp, DstReg, CntReg, AccessSize, SMLoc(), SMLoc()));
+    InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out);
+  }
+
+  InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst,
+                                         OperandVector &Operands,
+                                         MCContext &Ctx, const MCInstrInfo &MII,
+                                         MCStreamer &Out) {
+  // Access size in bytes.
+  unsigned AccessSize = 0;
+
+  switch (Inst.getOpcode()) {
+  case X86::MOVSB:
+    AccessSize = 1;
+    break;
+  case X86::MOVSW:
+    AccessSize = 2;
+    break;
+  case X86::MOVSL:
+    AccessSize = 4;
+    break;
+  case X86::MOVSQ:
+    AccessSize = 8;
+    break;
+  default:
+    return;
+  }
+
+  InstrumentMOVSImpl(AccessSize, Ctx, Out);
+}
+
+void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst,
+                                        OperandVector &Operands, MCContext &Ctx,
+                                        const MCInstrInfo &MII,
+                                        MCStreamer &Out) {
   // Access size in bytes.
   unsigned AccessSize = 0;
 
@@ -132,41 +400,199 @@
   }
 
   const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
+
   for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
     assert(Operands[Ix]);
     MCParsedAsmOperand &Op = *Operands[Ix];
-    if (Op.isMem())
-      InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out);
+    if (Op.isMem()) {
+      X86Operand &MemOp = static_cast<X86Operand &>(Op);
+      RegisterContext RegCtx(
+          X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */,
+          IsSmallMemAccess(AccessSize) ? X86::RCX
+                                       : X86::NoRegister /* ScratchReg */);
+      RegCtx.AddBusyRegs(MemOp);
+      InstrumentMemOperandPrologue(RegCtx, Ctx, Out);
+      InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out);
+      InstrumentMemOperandEpilogue(RegCtx, Ctx, Out);
+    }
   }
 }
 
+void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op,
+                                                   MVT::SimpleValueType VT,
+                                                   unsigned Reg, MCContext &Ctx,
+                                                   MCStreamer &Out) {
+  int64_t Displacement = 0;
+  if (IsStackReg(Op.getMemBaseReg()))
+    Displacement -= OrigSPOffset;
+  if (IsStackReg(Op.getMemIndexReg()))
+    Displacement -= OrigSPOffset * Op.getMemScale();
+
+  assert(Displacement >= 0);
+
+  // Emit Op as is.
+  if (Displacement == 0) {
+    EmitLEA(Op, VT, Reg, Out);
+    return;
+  }
+
+  int64_t Residue;
+  std::unique_ptr<X86Operand> NewOp =
+      AddDisplacement(Op, Displacement, Ctx, &Residue);
+  EmitLEA(*NewOp, VT, Reg, Out);
+
+  while (Residue != 0) {
+    const MCConstantExpr *Disp =
+        MCConstantExpr::Create(ApplyDisplacementBounds(Residue), Ctx);
+    std::unique_ptr<X86Operand> DispOp =
+        X86Operand::CreateMem(0, Disp, Reg, 0, 1, SMLoc(), SMLoc());
+    EmitLEA(*DispOp, VT, Reg, Out);
+    Residue -= Disp->getValue();
+  }
+}
+
+std::unique_ptr<X86Operand>
+X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement,
+                                     MCContext &Ctx, int64_t *Residue) {
+  assert(Displacement >= 0);
+
+  if (Displacement == 0 ||
+      (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) {
+    *Residue = Displacement;
+    return X86Operand::CreateMem(Op.getMemSegReg(), Op.getMemDisp(),
+                                 Op.getMemBaseReg(), Op.getMemIndexReg(),
+                                 Op.getMemScale(), SMLoc(), SMLoc());
+  }
+
+  int64_t OrigDisplacement =
+      static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue();
+  CheckDisplacementBounds(OrigDisplacement);
+  Displacement += OrigDisplacement;
+
+  int64_t NewDisplacement = ApplyDisplacementBounds(Displacement);
+  CheckDisplacementBounds(NewDisplacement);
+
+  *Residue = Displacement - NewDisplacement;
+  const MCExpr *Disp = MCConstantExpr::Create(NewDisplacement, Ctx);
+  return X86Operand::CreateMem(Op.getMemSegReg(), Disp, Op.getMemBaseReg(),
+                               Op.getMemIndexReg(), Op.getMemScale(), SMLoc(),
+                               SMLoc());
+}
+
 class X86AddressSanitizer32 : public X86AddressSanitizer {
 public:
   static const long kShadowOffset = 0x20000000;
 
   X86AddressSanitizer32(const MCSubtargetInfo &STI)
       : X86AddressSanitizer(STI) {}
+
   virtual ~X86AddressSanitizer32() {}
 
-  virtual void InstrumentMemOperandSmallImpl(
-      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-      MCStreamer &Out) override;
-  virtual void InstrumentMemOperandLargeImpl(
-      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-      MCStreamer &Out) override;
+  unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
+    unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
+    if (FrameReg == X86::NoRegister)
+      return FrameReg;
+    return getX86SubSuperRegister(FrameReg, MVT::i32);
+  }
 
- private:
-  void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize,
-                          bool IsWrite, unsigned AddressReg) {
+  void SpillReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg));
+    OrigSPOffset -= 4;
+  }
+
+  void RestoreReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg));
+    OrigSPOffset += 4;
+  }
+
+  void StoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+    OrigSPOffset -= 4;
+  }
+
+  void RestoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::POPF32));
+    OrigSPOffset += 4;
+  }
+
+  virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (MRI && FrameReg != X86::NoRegister) {
+      SpillReg(Out, LocalFrameReg);
+      if (FrameReg == X86::ESP) {
+        Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */);
+        Out.EmitCFIRelOffset(
+            MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
+      }
+      EmitInstruction(
+          Out,
+          MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg));
+      Out.EmitCFIRememberState();
+      Out.EmitCFIDefCfaRegister(
+          MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
+    }
+
+    SpillReg(Out, RegCtx.AddressReg(MVT::i32));
+    SpillReg(Out, RegCtx.ShadowReg(MVT::i32));
+    if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister)
+      SpillReg(Out, RegCtx.ScratchReg(MVT::i32));
+    StoreFlags(Out);
+  }
+
+  virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    RestoreFlags(Out);
+    if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister)
+      RestoreReg(Out, RegCtx.ScratchReg(MVT::i32));
+    RestoreReg(Out, RegCtx.ShadowReg(MVT::i32));
+    RestoreReg(Out, RegCtx.AddressReg(MVT::i32));
+
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
+      RestoreReg(Out, LocalFrameReg);
+      Out.EmitCFIRestoreState();
+      if (FrameReg == X86::ESP)
+        Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */);
+    }
+  }
+
+  virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx,
+                                         MCStreamer &Out) override;
+  virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx,
+                                         MCStreamer &Out) override;
+  virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+                                  MCStreamer &Out) override;
+
+private:
+  void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+                          MCStreamer &Out, const RegisterContext &RegCtx) {
     EmitInstruction(Out, MCInstBuilder(X86::CLD));
     EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
 
-    EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::ESP)
-                             .addReg(X86::ESP).addImm(-16));
-    EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(AddressReg));
+    EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
+                             .addReg(X86::ESP)
+                             .addReg(X86::ESP)
+                             .addImm(-16));
+    EmitInstruction(
+        Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(MVT::i32)));
 
-
-    const std::string& Fn = FuncName(AccessSize, IsWrite);
+    const std::string &Fn = FuncName(AccessSize, IsWrite);
     MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn));
     const MCSymbolRefExpr *FnExpr =
         MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
@@ -174,67 +600,64 @@
   }
 };
 
-void X86AddressSanitizer32::InstrumentMemOperandSmallImpl(
-    X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-    MCStreamer &Out) {
-  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
-  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX));
-  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EDX));
-  EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+void X86AddressSanitizer32::InstrumentMemOperandSmall(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32);
+  unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32);
+  unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8);
 
-  {
-    MCInst Inst;
-    Inst.setOpcode(X86::LEA32r);
-    Inst.addOperand(MCOperand::CreateReg(X86::EAX));
-    Op.addMemOperands(Inst, 5);
-    EmitInstruction(Out, Inst);
-  }
+  assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister);
+  unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32);
 
-  EmitInstruction(
-      Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX));
-  EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX)
-                           .addReg(X86::ECX).addImm(3));
+  ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out);
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
+                           AddressRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
+                           .addReg(ShadowRegI32)
+                           .addReg(ShadowRegI32)
+                           .addImm(3));
 
   {
     MCInst Inst;
     Inst.setOpcode(X86::MOV8rm);
-    Inst.addOperand(MCOperand::CreateReg(X86::CL));
+    Inst.addOperand(MCOperand::CreateReg(ShadowRegI8));
     const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
+        X86Operand::CreateMem(0, Disp, ShadowRegI32, 0, 1, SMLoc(), SMLoc()));
     Op->addMemOperands(Inst, 5);
     EmitInstruction(Out, Inst);
   }
 
-  EmitInstruction(Out,
-                  MCInstBuilder(X86::TEST8rr).addReg(X86::CL).addReg(X86::CL));
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
   MCSymbol *DoneSym = Ctx.CreateTempSymbol();
   const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
   EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
 
-  EmitInstruction(
-      Out, MCInstBuilder(X86::MOV32rr).addReg(X86::EDX).addReg(X86::EAX));
-  EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::EDX)
-                           .addReg(X86::EDX).addImm(7));
+  EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
+                           AddressRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
+                           .addReg(ScratchRegI32)
+                           .addReg(ScratchRegI32)
+                           .addImm(7));
 
   switch (AccessSize) {
   case 1:
     break;
   case 2: {
-    MCInst Inst;
-    Inst.setOpcode(X86::LEA32r);
-    Inst.addOperand(MCOperand::CreateReg(X86::EDX));
-
     const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, X86::EDX, 0, 1, SMLoc(), SMLoc()));
-    Op->addMemOperands(Inst, 5);
-    EmitInstruction(Out, Inst);
+        X86Operand::CreateMem(0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc()));
+    EmitLEA(*Op, MVT::i32, ScratchRegI32, Out);
     break;
   }
   case 4:
-    EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::EDX)
-                             .addReg(X86::EDX).addImm(3));
+    EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
+                             .addReg(ScratchRegI32)
+                             .addReg(ScratchRegI32)
+                             .addImm(3));
     break;
   default:
     assert(false && "Incorrect access size");
@@ -242,211 +665,30 @@
   }
 
   EmitInstruction(
-      Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::ECX).addReg(X86::CL));
-  EmitInstruction(
-      Out, MCInstBuilder(X86::CMP32rr).addReg(X86::EDX).addReg(X86::ECX));
+      Out,
+      MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
+  EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
+                           ShadowRegI32));
   EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
 
-  EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX);
+  EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
   EmitLabel(Out, DoneSym);
-
-  EmitInstruction(Out, MCInstBuilder(X86::POPF32));
-  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EDX));
-  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX));
-  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
 }
 
-void X86AddressSanitizer32::InstrumentMemOperandLargeImpl(
-    X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-    MCStreamer &Out) {
-  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX));
-  EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX));
-  EmitInstruction(Out, MCInstBuilder(X86::PUSHF32));
+void X86AddressSanitizer32::InstrumentMemOperandLarge(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32);
+  unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32);
 
-  {
-    MCInst Inst;
-    Inst.setOpcode(X86::LEA32r);
-    Inst.addOperand(MCOperand::CreateReg(X86::EAX));
-    Op.addMemOperands(Inst, 5);
-    EmitInstruction(Out, Inst);
-  }
-  EmitInstruction(
-      Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX));
-  EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX)
-                           .addReg(X86::ECX).addImm(3));
-  {
-    MCInst Inst;
-    switch (AccessSize) {
-      case 8:
-        Inst.setOpcode(X86::CMP8mi);
-        break;
-      case 16:
-        Inst.setOpcode(X86::CMP16mi);
-        break;
-      default:
-        assert(false && "Incorrect access size");
-        break;
-    }
-    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
-    std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
-    Op->addMemOperands(Inst, 5);
-    Inst.addOperand(MCOperand::CreateImm(0));
-    EmitInstruction(Out, Inst);
-  }
-  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
-  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
-  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+  ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out);
 
-  EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX);
-  EmitLabel(Out, DoneSym);
-
-  EmitInstruction(Out, MCInstBuilder(X86::POPF32));
-  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX));
-  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
-}
-
-class X86AddressSanitizer64 : public X86AddressSanitizer {
-public:
-  static const long kShadowOffset = 0x7fff8000;
-
-  X86AddressSanitizer64(const MCSubtargetInfo &STI)
-      : X86AddressSanitizer(STI) {}
-  virtual ~X86AddressSanitizer64() {}
-
-  virtual void InstrumentMemOperandSmallImpl(
-      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-      MCStreamer &Out) override;
-  virtual void InstrumentMemOperandLargeImpl(
-      X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-      MCStreamer &Out) override;
-
-private:
-  void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
-    MCInst Inst;
-    Inst.setOpcode(X86::LEA64r);
-    Inst.addOperand(MCOperand::CreateReg(X86::RSP));
-
-    const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx);
-    std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
-    Op->addMemOperands(Inst, 5);
-    EmitInstruction(Out, Inst);
-  }
-
-  void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize,
-                          bool IsWrite) {
-    EmitInstruction(Out, MCInstBuilder(X86::CLD));
-    EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
-
-    EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::RSP)
-                             .addReg(X86::RSP).addImm(-16));
-
-    const std::string& Fn = FuncName(AccessSize, IsWrite);
-    MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn));
-    const MCSymbolRefExpr *FnExpr =
-        MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
-    EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
-  }
-};
-
-void X86AddressSanitizer64::InstrumentMemOperandSmallImpl(
-    X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-    MCStreamer &Out) {
-  EmitAdjustRSP(Ctx, Out, -128);
-  EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
-  EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RCX));
-  EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI));
-  EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
-  {
-    MCInst Inst;
-    Inst.setOpcode(X86::LEA64r);
-    Inst.addOperand(MCOperand::CreateReg(X86::RDI));
-    Op.addMemOperands(Inst, 5);
-    EmitInstruction(Out, Inst);
-  }
-  EmitInstruction(
-      Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(X86::RDI));
-  EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX)
-                           .addReg(X86::RAX).addImm(3));
-  {
-    MCInst Inst;
-    Inst.setOpcode(X86::MOV8rm);
-    Inst.addOperand(MCOperand::CreateReg(X86::AL));
-    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
-    std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc()));
-    Op->addMemOperands(Inst, 5);
-    EmitInstruction(Out, Inst);
-  }
-
-  EmitInstruction(Out,
-                  MCInstBuilder(X86::TEST8rr).addReg(X86::AL).addReg(X86::AL));
-  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
-  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
-  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
-
-  EmitInstruction(
-      Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EDI));
-  EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::ECX)
-                           .addReg(X86::ECX).addImm(7));
-
-  switch (AccessSize) {
-  case 1:
-    break;
-  case 2: {
-    MCInst Inst;
-    Inst.setOpcode(X86::LEA32r);
-    Inst.addOperand(MCOperand::CreateReg(X86::ECX));
-
-    const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
-    std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc()));
-    Op->addMemOperands(Inst, 5);
-    EmitInstruction(Out, Inst);
-    break;
-  }
-  case 4:
-    EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::ECX)
-                             .addReg(X86::ECX).addImm(3));
-    break;
-  default:
-    assert(false && "Incorrect access size");
-    break;
-  }
-
-  EmitInstruction(
-      Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::EAX).addReg(X86::AL));
-  EmitInstruction(
-      Out, MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::EAX));
-  EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
-
-  EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite);
-  EmitLabel(Out, DoneSym);
-
-  EmitInstruction(Out, MCInstBuilder(X86::POPF64));
-  EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI));
-  EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RCX));
-  EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX));
-  EmitAdjustRSP(Ctx, Out, 128);
-}
-
-void X86AddressSanitizer64::InstrumentMemOperandLargeImpl(
-    X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-    MCStreamer &Out) {
-  EmitAdjustRSP(Ctx, Out, -128);
-  EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
-  EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
-
-  {
-    MCInst Inst;
-    Inst.setOpcode(X86::LEA64r);
-    Inst.addOperand(MCOperand::CreateReg(X86::RAX));
-    Op.addMemOperands(Inst, 5);
-    EmitInstruction(Out, Inst);
-  }
-  EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX)
-                           .addReg(X86::RAX).addImm(3));
+  EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg(
+                           AddressRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR32ri)
+                           .addReg(ShadowRegI32)
+                           .addReg(ShadowRegI32)
+                           .addImm(3));
   {
     MCInst Inst;
     switch (AccessSize) {
@@ -462,7 +704,278 @@
     }
     const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
     std::unique_ptr<X86Operand> Op(
-        X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc()));
+        X86Operand::CreateMem(0, Disp, ShadowRegI32, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    Inst.addOperand(MCOperand::CreateImm(0));
+    EmitInstruction(Out, Inst);
+  }
+  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+  EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+  EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned AccessSize,
+                                               MCContext &Ctx,
+                                               MCStreamer &Out) {
+  StoreFlags(Out);
+
+  // No need to test when ECX is equal to zero.
+  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+  // Instrument first and last elements in src and dst range.
+  InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */,
+                     X86::ECX /* CntReg */, AccessSize, Ctx, Out);
+
+  EmitLabel(Out, DoneSym);
+  RestoreFlags(Out);
+}
+
+class X86AddressSanitizer64 : public X86AddressSanitizer {
+public:
+  static const long kShadowOffset = 0x7fff8000;
+
+  X86AddressSanitizer64(const MCSubtargetInfo &STI)
+      : X86AddressSanitizer(STI) {}
+
+  virtual ~X86AddressSanitizer64() {}
+
+  unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
+    unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
+    if (FrameReg == X86::NoRegister)
+      return FrameReg;
+    return getX86SubSuperRegister(FrameReg, MVT::i64);
+  }
+
+  void SpillReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg));
+    OrigSPOffset -= 8;
+  }
+
+  void RestoreReg(MCStreamer &Out, unsigned Reg) {
+    EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg));
+    OrigSPOffset += 8;
+  }
+
+  void StoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::PUSHF64));
+    OrigSPOffset -= 8;
+  }
+
+  void RestoreFlags(MCStreamer &Out) {
+    EmitInstruction(Out, MCInstBuilder(X86::POPF64));
+    OrigSPOffset += 8;
+  }
+
+  virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (MRI && FrameReg != X86::NoRegister) {
+      SpillReg(Out, X86::RBP);
+      if (FrameReg == X86::RSP) {
+        Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */);
+        Out.EmitCFIRelOffset(
+            MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0);
+      }
+      EmitInstruction(
+          Out,
+          MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg));
+      Out.EmitCFIRememberState();
+      Out.EmitCFIDefCfaRegister(
+          MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */));
+    }
+
+    EmitAdjustRSP(Ctx, Out, -128);
+    SpillReg(Out, RegCtx.ShadowReg(MVT::i64));
+    SpillReg(Out, RegCtx.AddressReg(MVT::i64));
+    if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister)
+      SpillReg(Out, RegCtx.ScratchReg(MVT::i64));
+    StoreFlags(Out);
+  }
+
+  virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx,
+                                            MCContext &Ctx,
+                                            MCStreamer &Out) override {
+    unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64);
+    assert(LocalFrameReg != X86::NoRegister);
+
+    RestoreFlags(Out);
+    if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister)
+      RestoreReg(Out, RegCtx.ScratchReg(MVT::i64));
+    RestoreReg(Out, RegCtx.AddressReg(MVT::i64));
+    RestoreReg(Out, RegCtx.ShadowReg(MVT::i64));
+    EmitAdjustRSP(Ctx, Out, 128);
+
+    unsigned FrameReg = GetFrameReg(Ctx, Out);
+    if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) {
+      RestoreReg(Out, LocalFrameReg);
+      Out.EmitCFIRestoreState();
+      if (FrameReg == X86::RSP)
+        Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */);
+    }
+  }
+
+  virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx,
+                                         MCStreamer &Out) override;
+  virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize,
+                                         bool IsWrite,
+                                         const RegisterContext &RegCtx,
+                                         MCContext &Ctx,
+                                         MCStreamer &Out) override;
+  virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx,
+                                  MCStreamer &Out) override;
+
+private:
+  void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) {
+    const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+    EmitLEA(*Op, MVT::i64, X86::RSP, Out);
+    OrigSPOffset += Offset;
+  }
+
+  void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx,
+                          MCStreamer &Out, const RegisterContext &RegCtx) {
+    EmitInstruction(Out, MCInstBuilder(X86::CLD));
+    EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
+
+    EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
+                             .addReg(X86::RSP)
+                             .addReg(X86::RSP)
+                             .addImm(-16));
+
+    if (RegCtx.AddressReg(MVT::i64) != X86::RDI) {
+      EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
+                               RegCtx.AddressReg(MVT::i64)));
+    }
+    const std::string &Fn = FuncName(AccessSize, IsWrite);
+    MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn));
+    const MCSymbolRefExpr *FnExpr =
+        MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
+    EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
+  }
+};
+
+void X86AddressSanitizer64::InstrumentMemOperandSmall(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64);
+  unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32);
+  unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64);
+  unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32);
+  unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8);
+
+  assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister);
+  unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32);
+
+  ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out);
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
+                           AddressRegI64));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
+                           .addReg(ShadowRegI64)
+                           .addReg(ShadowRegI64)
+                           .addImm(3));
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::MOV8rm);
+    Inst.addOperand(MCOperand::CreateReg(ShadowRegI8));
+    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, ShadowRegI64, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
+
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8));
+  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg(
+                           AddressRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::AND32ri)
+                           .addReg(ScratchRegI32)
+                           .addReg(ScratchRegI32)
+                           .addImm(7));
+
+  switch (AccessSize) {
+  case 1:
+    break;
+  case 2: {
+    const MCExpr *Disp = MCConstantExpr::Create(1, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc()));
+    EmitLEA(*Op, MVT::i32, ScratchRegI32, Out);
+    break;
+  }
+  case 4:
+    EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8)
+                             .addReg(ScratchRegI32)
+                             .addReg(ScratchRegI32)
+                             .addImm(3));
+    break;
+  default:
+    assert(false && "Incorrect access size");
+    break;
+  }
+
+  EmitInstruction(
+      Out,
+      MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8));
+  EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg(
+                           ShadowRegI32));
+  EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr));
+
+  EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
+  EmitLabel(Out, DoneSym);
+}
+
+void X86AddressSanitizer64::InstrumentMemOperandLarge(
+    X86Operand &Op, unsigned AccessSize, bool IsWrite,
+    const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
+  unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64);
+  unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64);
+
+  ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out);
+
+  EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg(
+                           AddressRegI64));
+  EmitInstruction(Out, MCInstBuilder(X86::SHR64ri)
+                           .addReg(ShadowRegI64)
+                           .addReg(ShadowRegI64)
+                           .addImm(3));
+  {
+    MCInst Inst;
+    switch (AccessSize) {
+    case 8:
+      Inst.setOpcode(X86::CMP8mi);
+      break;
+    case 16:
+      Inst.setOpcode(X86::CMP16mi);
+      break;
+    default:
+      assert(false && "Incorrect access size");
+      break;
+    }
+    const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, ShadowRegI64, 0, 1, SMLoc(), SMLoc()));
     Op->addMemOperands(Inst, 5);
     Inst.addOperand(MCOperand::CreateImm(0));
     EmitInstruction(Out, Inst);
@@ -472,22 +985,66 @@
   const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
   EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
 
-  EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite);
+  EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx);
   EmitLabel(Out, DoneSym);
+}
 
-  EmitInstruction(Out, MCInstBuilder(X86::POPF64));
-  EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX));
-  EmitAdjustRSP(Ctx, Out, 128);
+void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
+                                               MCContext &Ctx,
+                                               MCStreamer &Out) {
+  StoreFlags(Out);
+
+  // No need to test when RCX is equal to zero.
+  MCSymbol *DoneSym = Ctx.CreateTempSymbol();
+  const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx);
+  EmitInstruction(
+      Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX));
+  EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr));
+
+  // Instrument first and last elements in src and dst range.
+  InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */,
+                     X86::RCX /* CntReg */, AccessSize, Ctx, Out);
+
+  EmitLabel(Out, DoneSym);
+  RestoreFlags(Out);
 }
 
 } // End anonymous namespace
 
-X86AsmInstrumentation::X86AsmInstrumentation() {}
+X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo &STI)
+    : STI(STI), InitialFrameReg(0) {}
+
 X86AsmInstrumentation::~X86AsmInstrumentation() {}
 
-void X86AsmInstrumentation::InstrumentInstruction(
+void X86AsmInstrumentation::InstrumentAndEmitInstruction(
     const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
-    const MCInstrInfo &MII, MCStreamer &Out) {}
+    const MCInstrInfo &MII, MCStreamer &Out) {
+  EmitInstruction(Out, Inst);
+}
+
+void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out,
+                                            const MCInst &Inst) {
+  Out.EmitInstruction(Inst, STI);
+}
+
+unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
+                                                   MCStreamer &Out) {
+  if (!Out.getNumFrameInfos()) // No active dwarf frame
+    return X86::NoRegister;
+  const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back();
+  if (Frame.End) // Active dwarf frame is closed
+    return X86::NoRegister;
+  const MCRegisterInfo *MRI = Ctx.getRegisterInfo();
+  if (!MRI) // No register info
+    return X86::NoRegister;
+
+  if (InitialFrameReg) {
+    // FrameReg is set explicitly, we're instrumenting a MachineFunction.
+    return InitialFrameReg;
+  }
+
+  return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */);
+}
 
 X86AsmInstrumentation *
 CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
@@ -501,7 +1058,7 @@
     if ((STI.getFeatureBits() & X86::Mode64Bit) != 0)
       return new X86AddressSanitizer64(STI);
   }
-  return new X86AsmInstrumentation();
+  return new X86AsmInstrumentation(STI);
 }
 
 } // End llvm namespace

diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index 1bc3c09..19ebcc4 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86_ASM_INSTRUMENTATION_H
-#define X86_ASM_INSTRUMENTATION_H
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
 
 #include "llvm/ADT/SmallVector.h"
 
@@ -34,11 +34,15 @@
 public:
   virtual ~X86AsmInstrumentation();
 
-  // Instruments Inst. Should be called just before the original
-  // instruction is sent to Out.
-  virtual void InstrumentInstruction(
+  // Sets the frame register corresponding to the current frame.
+  void SetInitialFrameRegister(unsigned RegNo) {
+    InitialFrameReg = RegNo;
+  }
+
+  // Tries to instrument and emit the instruction.
+  virtual void InstrumentAndEmitInstruction(
       const MCInst &Inst,
-      SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
+      SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands,
       MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
 
 protected:
@@ -46,9 +50,17 @@
   CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
                               const MCContext &Ctx, const MCSubtargetInfo &STI);
 
-  X86AsmInstrumentation();
+  X86AsmInstrumentation(const MCSubtargetInfo &STI);
+
+  unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out);
+
+  void EmitInstruction(MCStreamer &Out, const MCInst &Inst);
+
+  const MCSubtargetInfo &STI;
+
+  unsigned InitialFrameReg;
 };
 
 } // End llvm namespace
 
-#endif // X86_ASM_INSTRUMENTATION_H
+#endif

diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index f0765ed..8ef2a55 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp

@@ -32,6 +32,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <memory>
 
 using namespace llvm;
@@ -55,12 +56,12 @@
 
 class X86AsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
-  MCAsmParser &Parser;
   const MCInstrInfo &MII;
   ParseInstructionInfo *InstInfo;
   std::unique_ptr<X86AsmInstrumentation> Instrumentation;
 private:
   SMLoc consumeToken() {
+    MCAsmParser &Parser = getParser();
     SMLoc Result = Parser.getTok().getLoc();
     Parser.Lex();
     return Result;
@@ -630,13 +631,10 @@
     }
   };
 
-  MCAsmParser &getParser() const { return Parser; }
-
-  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
   bool Error(SMLoc L, const Twine &Msg,
              ArrayRef<SMRange> Ranges = None,
              bool MatchingInlineAsm = false) {
+    MCAsmParser &Parser = getParser();
     if (MatchingInlineAsm) return true;
     return Parser.Error(L, Msg, Ranges);
   }
@@ -644,8 +642,9 @@
   bool ErrorAndEatStatement(SMLoc L, const Twine &Msg,
           ArrayRef<SMRange> Ranges = None,
           bool MatchingInlineAsm = false) {
-      Parser.eatToEndOfStatement();
-      return Error(L, Msg, Ranges, MatchingInlineAsm);
+    MCAsmParser &Parser = getParser();
+    Parser.eatToEndOfStatement();
+    return Error(L, Msg, Ranges, MatchingInlineAsm);
   }
 
   std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) {
@@ -693,9 +692,34 @@
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
-                               unsigned &ErrorInfo,
+                               uint64_t &ErrorInfo,
                                bool MatchingInlineAsm) override;
 
+  void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands,
+                         MCStreamer &Out, bool MatchingInlineAsm);
+
+  bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+                           bool MatchingInlineAsm);
+
+  bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                  OperandVector &Operands, MCStreamer &Out,
+                                  uint64_t &ErrorInfo,
+                                  bool MatchingInlineAsm);
+
+  bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                    OperandVector &Operands, MCStreamer &Out,
+                                    uint64_t &ErrorInfo,
+                                    bool MatchingInlineAsm);
+
+  unsigned getPointerSize() {
+    if (is16BitMode()) return 16;
+    if (is32BitMode()) return 32;
+    if (is64BitMode()) return 64;
+    llvm_unreachable("invalid mode");
+  }
+
+  bool OmitRegisterFromClobberLists(unsigned RegNo) override;
+
   /// doSrcDstMatch - Returns true if operands are matching in their
   /// word size (%si and %di, %esi and %edi, etc.). Order depends on
   /// the parsing mode (Intel vs. AT&T).
@@ -728,6 +752,13 @@
                     (X86::Mode64Bit | X86::Mode32Bit | X86::Mode16Bit)));
   }
 
+  unsigned getPointerWidth() {
+    if (is16BitMode()) return 16;
+    if (is32BitMode()) return 32;
+    if (is64BitMode()) return 64;
+    llvm_unreachable("invalid mode");
+  }
+
   bool isParsingIntelSyntax() {
     return getParser().getAssemblerDialect();
   }
@@ -741,11 +772,9 @@
   /// }
 
 public:
-  X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
-               const MCInstrInfo &mii,
-               const MCTargetOptions &Options)
-      : MCTargetAsmParser(), STI(sti), Parser(parser), MII(mii),
-        InstInfo(nullptr) {
+  X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &Parser,
+               const MCInstrInfo &mii, const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(sti), MII(mii), InstInfo(nullptr) {
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -755,6 +784,8 @@
 
   bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
 
+  void SetFrameRegister(unsigned RegNo) override;
+
   bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                         SMLoc NameLoc, OperandVector &Operands) override;
 
@@ -830,6 +861,7 @@
 
 bool X86AsmParser::ParseRegister(unsigned &RegNo,
                                  SMLoc &StartLoc, SMLoc &EndLoc) {
+  MCAsmParser &Parser = getParser();
   RegNo = 0;
   const AsmToken &PercentTok = Parser.getTok();
   StartLoc = PercentTok.getLoc();
@@ -937,6 +969,10 @@
   return false;
 }
 
+void X86AsmParser::SetFrameRegister(unsigned RegNo) {
+  Instrumentation->SetInitialFrameRegister(RegNo);
+}
+
 std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
   unsigned basereg =
     is64BitMode() ? X86::RSI : (is32BitMode() ? X86::ESI : X86::SI);
@@ -979,15 +1015,20 @@
     unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
     unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
     InlineAsmIdentifierInfo &Info) {
-  // If this is not a VarDecl then assume it is a FuncDecl or some other label
-  // reference.  We need an 'r' constraint here, so we need to create register
-  // operand to ensure proper matching.  Just pick a GPR based on the size of
-  // a pointer.
-  if (isa<MCSymbolRefExpr>(Disp) && !Info.IsVarDecl) {
-    unsigned RegNo =
-        is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX);
-    return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true,
-                                 SMLoc(), Identifier, Info.OpDecl);
+  // If we found a decl other than a VarDecl, then assume it is a FuncDecl or
+  // some other label reference.
+  if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) {
+    // Insert an explicit size if the user didn't have one.
+    if (!Size) {
+      Size = getPointerWidth();
+      InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start,
+                                                  /*Len=*/0, Size));
+    }
+
+    // Create an absolute memory reference in order to match against
+    // instructions taking a PC relative operand.
+    return X86Operand::CreateMem(Disp, Start, End, Size, Identifier,
+                                 Info.OpDecl);
   }
 
   // We either have a direct symbol reference, or an offset from a symbol.  The
@@ -1076,6 +1117,7 @@
 }
 
 bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
 
   bool Done = false;
@@ -1197,6 +1239,7 @@
 std::unique_ptr<X86Operand>
 X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
                                        int64_t ImmDisp, unsigned Size) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
   if (getLexer().isNot(AsmToken::LBrac))
@@ -1272,13 +1315,16 @@
                                         StringRef &Identifier,
                                         InlineAsmIdentifierInfo &Info,
                                         bool IsUnevaluatedOperand, SMLoc &End) {
+  MCAsmParser &Parser = getParser();
   assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
   Val = nullptr;
 
   StringRef LineBuf(Identifier.data());
-  SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
+  void *Result =
+    SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
 
   const AsmToken &Tok = Parser.getTok();
+  SMLoc Loc = Tok.getLoc();
 
   // Advance the token stream until the end of the current token is
   // after the end of what the frontend claimed.
@@ -1290,9 +1336,22 @@
     assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?");
     if (End.getPointer() == EndPtr) break;
   }
+  Identifier = LineBuf;
+
+  // If the identifier lookup was unsuccessful, assume that we are dealing with
+  // a label.
+  if (!Result) {
+    StringRef InternalName =
+      SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(),
+                                         Loc, false);
+    assert(InternalName.size() && "We should have an internal name here.");
+    // Push a rewrite for replacing the identifier name with the internal name.
+    InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Label, Loc,
+                                                Identifier.size(),
+                                                InternalName));
+  }
 
   // Create the symbol reference.
-  Identifier = LineBuf;
   MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
   MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
   Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext());
@@ -1303,6 +1362,7 @@
 std::unique_ptr<X86Operand>
 X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
                                         unsigned Size) {
+  MCAsmParser &Parser = getParser();
   assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
   const AsmToken &Tok = Parser.getTok(); // Eat colon.
   if (Tok.isNot(AsmToken::Colon))
@@ -1354,6 +1414,7 @@
 std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp,
                                                                SMLoc Start,
                                                                unsigned Size) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   SMLoc End;
 
@@ -1413,6 +1474,7 @@
 /// Parse the '.' operator.
 bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
                                                 const MCExpr *&NewDisp) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   int64_t OrigDispVal, DotDispVal;
 
@@ -1457,6 +1519,7 @@
 /// Parse the 'offset' operator.  This operator is used to specify the
 /// location rather then the content of a variable.
 std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   SMLoc OffsetOfLoc = Tok.getLoc();
   Parser.Lex(); // Eat offset.
@@ -1494,6 +1557,7 @@
 /// TYPE operator returns the size of a C or C++ type or variable. If the
 /// variable is an array, TYPE returns the size of a single element.
 std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   SMLoc TypeLoc = Tok.getLoc();
   Parser.Lex(); // Eat operator.
@@ -1527,6 +1591,7 @@
 }
 
 std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
+  MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   SMLoc Start, End;
 
@@ -1547,7 +1612,7 @@
   if (Size) {
     Parser.Lex(); // Eat operand size (e.g., byte, word).
     if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
-      return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
+      return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
     Parser.Lex(); // Eat ptr.
   }
   Start = Tok.getLoc();
@@ -1609,6 +1674,7 @@
 }
 
 std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
+  MCAsmParser &Parser = getParser();
   switch (getLexer().getKind()) {
   default:
     // Parse a memory operand with no segment register.
@@ -1629,6 +1695,9 @@
     if (getLexer().isNot(AsmToken::Colon))
       return X86Operand::CreateReg(RegNo, Start, End);
 
+    if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
+      return ErrorOperand(Start, "invalid segment register");
+
     getParser().Lex(); // Eat the colon.
     return ParseMemOperand(RegNo, Start);
   }
@@ -1646,6 +1715,7 @@
 
 bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
                                        const MCParsedAsmOperand &Op) {
+  MCAsmParser &Parser = getParser();
   if(STI.getFeatureBits() & X86::FeatureAVX512) {
     if (getLexer().is(AsmToken::LCurly)) {
       // Eat "{" and mark the current place.
@@ -1664,6 +1734,8 @@
         // Recognize only reasonable suffixes.
         const char *BroadcastPrimitive =
           StringSwitch<const char*>(getLexer().getTok().getIdentifier())
+            .Case("to2",  "{1to2}")
+            .Case("to4",  "{1to4}")
             .Case("to8",  "{1to8}")
             .Case("to16", "{1to16}")
             .Default(nullptr);
@@ -1715,6 +1787,7 @@
 std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
                                                           SMLoc MemStart) {
 
+  MCAsmParser &Parser = getParser();
   // We have to disambiguate a parenthesized expression "(4+5)" from the start
   // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)".  The
   // only way to do this without lookahead is to eat the '(' and see what is
@@ -1872,12 +1945,15 @@
     return nullptr;
   }
 
-  return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
-                               MemStart, MemEnd);
+  if (SegReg || BaseReg || IndexReg)
+    return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
+                                 MemStart, MemEnd);
+  return X86Operand::CreateMem(Disp, MemStart, MemEnd);
 }
 
 bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                                     SMLoc NameLoc, OperandVector &Operands) {
+  MCAsmParser &Parser = getParser();
   InstInfo = &Info;
   StringRef PatchedName = Name;
 
@@ -2275,51 +2351,79 @@
   }
 }
 
-static const char *getSubtargetFeatureName(unsigned Val);
+static const char *getSubtargetFeatureName(uint64_t Val);
 
 void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
                                    MCStreamer &Out) {
-  Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII,
-                                         Out);
-  Out.EmitInstruction(Inst, STI);
+  Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(),
+                                                MII, Out);
 }
 
 bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                            OperandVector &Operands,
-                                           MCStreamer &Out, unsigned &ErrorInfo,
+                                           MCStreamer &Out, uint64_t &ErrorInfo,
                                            bool MatchingInlineAsm) {
+  if (isParsingIntelSyntax())
+    return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
+                                        MatchingInlineAsm);
+  return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo,
+                                    MatchingInlineAsm);
+}
+
+void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
+                                     OperandVector &Operands, MCStreamer &Out,
+                                     bool MatchingInlineAsm) {
+  // FIXME: This should be replaced with a real .td file alias mechanism.
+  // Also, MatchInstructionImpl should actually *do* the EmitInstruction
+  // call.
+  const char *Repl = StringSwitch<const char *>(Op.getToken())
+                         .Case("finit", "fninit")
+                         .Case("fsave", "fnsave")
+                         .Case("fstcw", "fnstcw")
+                         .Case("fstcww", "fnstcw")
+                         .Case("fstenv", "fnstenv")
+                         .Case("fstsw", "fnstsw")
+                         .Case("fstsww", "fnstsw")
+                         .Case("fclex", "fnclex")
+                         .Default(nullptr);
+  if (Repl) {
+    MCInst Inst;
+    Inst.setOpcode(X86::WAIT);
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      EmitInstruction(Inst, Operands, Out);
+    Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
+  }
+}
+
+bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
+                                       bool MatchingInlineAsm) {
+  assert(ErrorInfo && "Unknown missing feature!");
+  ArrayRef<SMRange> EmptyRanges = None;
+  SmallString<126> Msg;
+  raw_svector_ostream OS(Msg);
+  OS << "instruction requires:";
+  uint64_t Mask = 1;
+  for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+    if (ErrorInfo & Mask)
+      OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask);
+    Mask <<= 1;
+  }
+  return Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
+}
+
+bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                              OperandVector &Operands,
+                                              MCStreamer &Out,
+                                              uint64_t &ErrorInfo,
+                                              bool MatchingInlineAsm) {
   assert(!Operands.empty() && "Unexpect empty operand list!");
   X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
   assert(Op.isToken() && "Leading operand should always be a mnemonic!");
   ArrayRef<SMRange> EmptyRanges = None;
 
   // First, handle aliases that expand to multiple instructions.
-  // FIXME: This should be replaced with a real .td file alias mechanism.
-  // Also, MatchInstructionImpl should actually *do* the EmitInstruction
-  // call.
-  if (Op.getToken() == "fstsw" || Op.getToken() == "fstcw" ||
-      Op.getToken() == "fstsww" || Op.getToken() == "fstcww" ||
-      Op.getToken() == "finit" || Op.getToken() == "fsave" ||
-      Op.getToken() == "fstenv" || Op.getToken() == "fclex") {
-    MCInst Inst;
-    Inst.setOpcode(X86::WAIT);
-    Inst.setLoc(IDLoc);
-    if (!MatchingInlineAsm)
-      EmitInstruction(Inst, Operands, Out);
-
-    const char *Repl = StringSwitch<const char *>(Op.getToken())
-                           .Case("finit", "fninit")
-                           .Case("fsave", "fnsave")
-                           .Case("fstcw", "fnstcw")
-                           .Case("fstcww", "fnstcw")
-                           .Case("fstenv", "fnstenv")
-                           .Case("fstsw", "fnstsw")
-                           .Case("fstsww", "fnstsw")
-                           .Case("fclex", "fnclex")
-                           .Default(nullptr);
-    assert(Repl && "Unknown wait-prefixed instruction");
-    Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
-  }
+  MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
 
   bool WasOriginallyInvalidOperand = false;
   MCInst Inst;
@@ -2342,21 +2446,8 @@
       EmitInstruction(Inst, Operands, Out);
     Opcode = Inst.getOpcode();
     return false;
-  case Match_MissingFeature: {
-    assert(ErrorInfo && "Unknown missing feature!");
-    // Special case the error message for the very common case where only
-    // a single subtarget feature is missing.
-    std::string Msg = "instruction requires:";
-    unsigned Mask = 1;
-    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
-      if (ErrorInfo & Mask) {
-        Msg += " ";
-        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
-      }
-      Mask <<= 1;
-    }
-    return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm);
-  }
+  case Match_MissingFeature:
+    return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm);
   case Match_InvalidOperand:
     WasOriginallyInvalidOperand = true;
     break;
@@ -2385,34 +2476,18 @@
   const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
 
   // Check for the various suffix matches.
-  Tmp[Base.size()] = Suffixes[0];
-  unsigned ErrorInfoIgnore;
-  unsigned ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
-  unsigned Match1, Match2, Match3, Match4;
+  uint64_t ErrorInfoIgnore;
+  uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
+  unsigned Match[4];
 
-  Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
-                                MatchingInlineAsm, isParsingIntelSyntax());
-  // If this returned as a missing feature failure, remember that.
-  if (Match1 == Match_MissingFeature)
-    ErrorInfoMissingFeature = ErrorInfoIgnore;
-  Tmp[Base.size()] = Suffixes[1];
-  Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
-                                MatchingInlineAsm, isParsingIntelSyntax());
-  // If this returned as a missing feature failure, remember that.
-  if (Match2 == Match_MissingFeature)
-    ErrorInfoMissingFeature = ErrorInfoIgnore;
-  Tmp[Base.size()] = Suffixes[2];
-  Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
-                                MatchingInlineAsm, isParsingIntelSyntax());
-  // If this returned as a missing feature failure, remember that.
-  if (Match3 == Match_MissingFeature)
-    ErrorInfoMissingFeature = ErrorInfoIgnore;
-  Tmp[Base.size()] = Suffixes[3];
-  Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
-                                MatchingInlineAsm, isParsingIntelSyntax());
-  // If this returned as a missing feature failure, remember that.
-  if (Match4 == Match_MissingFeature)
-    ErrorInfoMissingFeature = ErrorInfoIgnore;
+  for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
+    Tmp.back() = Suffixes[I];
+    Match[I] = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                  MatchingInlineAsm, isParsingIntelSyntax());
+    // If this returned as a missing feature failure, remember that.
+    if (Match[I] == Match_MissingFeature)
+      ErrorInfoMissingFeature = ErrorInfoIgnore;
+  }
 
   // Restore the old token.
   Op.setTokenValue(Base);
@@ -2421,8 +2496,7 @@
   // instruction will already have been filled in correctly, since the failing
   // matches won't have modified it).
   unsigned NumSuccessfulMatches =
-    (Match1 == Match_Success) + (Match2 == Match_Success) +
-    (Match3 == Match_Success) + (Match4 == Match_Success);
+      std::count(std::begin(Match), std::end(Match), Match_Success);
   if (NumSuccessfulMatches == 1) {
     Inst.setLoc(IDLoc);
     if (!MatchingInlineAsm)
@@ -2438,10 +2512,9 @@
   if (NumSuccessfulMatches > 1) {
     char MatchChars[4];
     unsigned NumMatches = 0;
-    if (Match1 == Match_Success) MatchChars[NumMatches++] = Suffixes[0];
-    if (Match2 == Match_Success) MatchChars[NumMatches++] = Suffixes[1];
-    if (Match3 == Match_Success) MatchChars[NumMatches++] = Suffixes[2];
-    if (Match4 == Match_Success) MatchChars[NumMatches++] = Suffixes[3];
+    for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I)
+      if (Match[I] == Match_Success)
+        MatchChars[NumMatches++] = Suffixes[I];
 
     SmallString<126> Msg;
     raw_svector_ostream OS(Msg);
@@ -2462,8 +2535,7 @@
 
   // If all of the instructions reported an invalid mnemonic, then the original
   // mnemonic was invalid.
-  if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
-      (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
+  if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) {
     if (!WasOriginallyInvalidOperand) {
       ArrayRef<SMRange> Ranges =
           MatchingInlineAsm ? EmptyRanges : Op.getLocRange();
@@ -2472,7 +2544,7 @@
     }
 
     // Recover location info for the operand if we know which was the problem.
-    if (ErrorInfo != ~0U) {
+    if (ErrorInfo != ~0ULL) {
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction",
                      EmptyRanges, MatchingInlineAsm);
@@ -2491,27 +2563,19 @@
 
   // If one instruction matched with a missing feature, report this as a
   // missing feature.
-  if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) +
-      (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){
-    std::string Msg = "instruction requires:";
-    unsigned Mask = 1;
-    for (unsigned i = 0; i < (sizeof(ErrorInfoMissingFeature)*8-1); ++i) {
-      if (ErrorInfoMissingFeature & Mask) {
-        Msg += " ";
-        Msg += getSubtargetFeatureName(ErrorInfoMissingFeature & Mask);
-      }
-      Mask <<= 1;
-    }
-    return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm);
+  if (std::count(std::begin(Match), std::end(Match),
+                 Match_MissingFeature) == 1) {
+    ErrorInfo = ErrorInfoMissingFeature;
+    return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+                               MatchingInlineAsm);
   }
 
   // If one instruction matched with an invalid operand, report this as an
   // operand failure.
-  if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) +
-      (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){
-    Error(IDLoc, "invalid operand for instruction", EmptyRanges,
-          MatchingInlineAsm);
-    return true;
+  if (std::count(std::begin(Match), std::end(Match),
+                 Match_InvalidOperand) == 1) {
+    return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+                 MatchingInlineAsm);
   }
 
   // If all of these were an outright failure, report it in a useless way.
@@ -2520,22 +2584,173 @@
   return true;
 }
 
+bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                                OperandVector &Operands,
+                                                MCStreamer &Out,
+                                                uint64_t &ErrorInfo,
+                                                bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpect empty operand list!");
+  X86Operand &Op = static_cast<X86Operand &>(*Operands[0]);
+  assert(Op.isToken() && "Leading operand should always be a mnemonic!");
+  StringRef Mnemonic = Op.getToken();
+  ArrayRef<SMRange> EmptyRanges = None;
+
+  // First, handle aliases that expand to multiple instructions.
+  MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
+
+  MCInst Inst;
+
+  // Find one unsized memory operand, if present.
+  X86Operand *UnsizedMemOp = nullptr;
+  for (const auto &Op : Operands) {
+    X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+    if (X86Op->isMemUnsized())
+      UnsizedMemOp = X86Op;
+  }
+
+  // Allow some instructions to have implicitly pointer-sized operands.  This is
+  // compatible with gas.
+  if (UnsizedMemOp) {
+    static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"};
+    for (const char *Instr : PtrSizedInstrs) {
+      if (Mnemonic == Instr) {
+        UnsizedMemOp->Mem.Size = getPointerSize();
+        break;
+      }
+    }
+  }
+
+  // If an unsized memory operand is present, try to match with each memory
+  // operand size.  In Intel assembly, the size is not part of the instruction
+  // mnemonic.
+  SmallVector<unsigned, 8> Match;
+  uint64_t ErrorInfoMissingFeature = 0;
+  if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
+    static const unsigned MopSizes[] = {8, 16, 32, 64, 80};
+    for (unsigned Size : MopSizes) {
+      UnsizedMemOp->Mem.Size = Size;
+      uint64_t ErrorInfoIgnore;
+      unsigned LastOpcode = Inst.getOpcode();
+      unsigned M =
+          MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                               MatchingInlineAsm, isParsingIntelSyntax());
+      if (Match.empty() || LastOpcode != Inst.getOpcode())
+        Match.push_back(M);
+
+      // If this returned as a missing feature failure, remember that.
+      if (Match.back() == Match_MissingFeature)
+        ErrorInfoMissingFeature = ErrorInfoIgnore;
+    }
+
+    // Restore the size of the unsized memory operand if we modified it.
+    if (UnsizedMemOp)
+      UnsizedMemOp->Mem.Size = 0;
+  }
+
+  // If we haven't matched anything yet, this is not a basic integer or FPU
+  // operation.  There shouldn't be any ambiguity in our mneumonic table, so try
+  // matching with the unsized operand.
+  if (Match.empty()) {
+    Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                         MatchingInlineAsm,
+                                         isParsingIntelSyntax()));
+    // If this returned as a missing feature failure, remember that.
+    if (Match.back() == Match_MissingFeature)
+      ErrorInfoMissingFeature = ErrorInfo;
+  }
+
+  // Restore the size of the unsized memory operand if we modified it.
+  if (UnsizedMemOp)
+    UnsizedMemOp->Mem.Size = 0;
+
+  // If it's a bad mnemonic, all results will be the same.
+  if (Match.back() == Match_MnemonicFail) {
+    ArrayRef<SMRange> Ranges =
+        MatchingInlineAsm ? EmptyRanges : Op.getLocRange();
+    return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'",
+                 Ranges, MatchingInlineAsm);
+  }
+
+  // If exactly one matched, then we treat that as a successful match (and the
+  // instruction will already have been filled in correctly, since the failing
+  // matches won't have modified it).
+  unsigned NumSuccessfulMatches =
+      std::count(std::begin(Match), std::end(Match), Match_Success);
+  if (NumSuccessfulMatches == 1) {
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected. Loop on it while changes happen so the individual
+    // transformations can chain off each other.
+    if (!MatchingInlineAsm)
+      while (processInstruction(Inst, Operands))
+        ;
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      EmitInstruction(Inst, Operands, Out);
+    Opcode = Inst.getOpcode();
+    return false;
+  } else if (NumSuccessfulMatches > 1) {
+    assert(UnsizedMemOp &&
+           "multiple matches only possible with unsized memory operands");
+    ArrayRef<SMRange> Ranges =
+        MatchingInlineAsm ? EmptyRanges : UnsizedMemOp->getLocRange();
+    return Error(UnsizedMemOp->getStartLoc(),
+                 "ambiguous operand size for instruction '" + Mnemonic + "\'",
+                 Ranges, MatchingInlineAsm);
+  }
+
+  // If one instruction matched with a missing feature, report this as a
+  // missing feature.
+  if (std::count(std::begin(Match), std::end(Match),
+                 Match_MissingFeature) == 1) {
+    ErrorInfo = ErrorInfoMissingFeature;
+    return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature,
+                               MatchingInlineAsm);
+  }
+
+  // If one instruction matched with an invalid operand, report this as an
+  // operand failure.
+  if (std::count(std::begin(Match), std::end(Match),
+                 Match_InvalidOperand) == 1) {
+    return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+                 MatchingInlineAsm);
+  }
+
+  // If all of these were an outright failure, report it in a useless way.
+  return Error(IDLoc, "unknown instruction mnemonic", EmptyRanges,
+               MatchingInlineAsm);
+}
+
+bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) {
+  return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo);
+}
 
 bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
+  MCAsmParser &Parser = getParser();
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal == ".word")
     return ParseDirectiveWord(2, DirectiveID.getLoc());
   else if (IDVal.startswith(".code"))
     return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
   else if (IDVal.startswith(".att_syntax")) {
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      if (Parser.getTok().getString() == "prefix")
+        Parser.Lex();
+      else if (Parser.getTok().getString() == "noprefix")
+        return Error(DirectiveID.getLoc(), "'.att_syntax noprefix' is not "
+                                           "supported: registers must have a "
+                                           "'%' prefix in .att_syntax");
+    }
     getParser().setAssemblerDialect(0);
     return false;
   } else if (IDVal.startswith(".intel_syntax")) {
     getParser().setAssemblerDialect(1);
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
-      // FIXME: Handle noprefix
       if (Parser.getTok().getString() == "noprefix")
         Parser.Lex();
+      else if (Parser.getTok().getString() == "prefix")
+        return Error(DirectiveID.getLoc(), "'.intel_syntax prefix' is not "
+                                           "supported: registers must not have "
+                                           "a '%' prefix in .intel_syntax");
     }
     return false;
   }
@@ -2545,6 +2760,7 @@
 /// ParseDirectiveWord
 ///  ::= .word [ expression (, expression)* ]
 bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
       const MCExpr *Value;
@@ -2572,6 +2788,7 @@
 /// ParseDirectiveCode
 ///  ::= .code16 | .code32 | .code64
 bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+  MCAsmParser &Parser = getParser();
   if (IDVal == ".code16") {
     Parser.Lex();
     if (!is16BitMode()) {

diff --git a/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/lib/Target/X86/AsmParser/X86AsmParserCommon.h
index ef1565f..72aeeaa 100644
--- a/lib/Target/X86/AsmParser/X86AsmParserCommon.h
+++ b/lib/Target/X86/AsmParser/X86AsmParserCommon.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86_ASM_PARSER_COMMON_H
-#define X86_ASM_PARSER_COMMON_H
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H
 
 namespace llvm {
 
@@ -24,10 +24,6 @@
           (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
 }
 
-inline bool isImmZExtu32u8Value(uint64_t Value) {
-    return (Value <= 0x00000000000000FFULL);
-}
-
 inline bool isImmSExti64i8Value(uint64_t Value) {
   return ((                                  Value <= 0x000000000000007FULL)||
           (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
@@ -40,4 +36,4 @@
 
 } // End of namespace llvm
 
-#endif // X86_ASM_PARSER_COMMON_H
+#endif

diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 1bbfc11..e0fab8d 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86_OPERAND_H
-#define X86_OPERAND_H
+#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
 
 #include "X86AsmParserCommon.h"
 #include "llvm/MC/MCExpr.h"
@@ -153,20 +153,6 @@
     // extension.
     return isImmSExti32i8Value(CE->getValue());
   }
-  bool isImmZExtu32u8() const {
-    if (!isImm())
-      return false;
-
-    // If this isn't a constant expr, just assume it fits and let relaxation
-    // handle it.
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE)
-      return true;
-
-    // Otherwise, check the value is in a range that makes sense for this
-    // extension.
-    return isImmZExtu32u8Value(CE->getValue());
-  }
   bool isImmSExti64i8() const {
     if (!isImm())
       return false;
@@ -205,6 +191,9 @@
   }
 
   bool isMem() const override { return Kind == Memory; }
+  bool isMemUnsized() const {
+    return Kind == Memory && Mem.Size == 0;
+  }
   bool isMem8() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 8);
   }
@@ -485,4 +474,4 @@
 
 } // End of namespace llvm
 
-#endif // X86_OPERAND
+#endif

diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index a09767e..1083fad 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt

@@ -14,15 +14,12 @@
 
 set(sources
   X86AsmPrinter.cpp
-  X86AtomicExpandPass.cpp
-  X86CodeEmitter.cpp
   X86FastISel.cpp
   X86FloatingPoint.cpp
   X86FrameLowering.cpp
   X86ISelDAGToDAG.cpp
   X86ISelLowering.cpp
   X86InstrInfo.cpp
-  X86JITInfo.cpp
   X86MCInstLower.cpp
   X86MachineFunctionInfo.cpp
   X86PadShortFunction.cpp

diff --git a/lib/Target/X86/Disassembler/LLVMBuild.txt b/lib/Target/X86/Disassembler/LLVMBuild.txt
index cac7adf..e003fc9 100644
--- a/lib/Target/X86/Disassembler/LLVMBuild.txt
+++ b/lib/Target/X86/Disassembler/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = X86Disassembler
 parent = X86
-required_libraries = MC Support X86Info
+required_libraries = MCDisassembler Support X86Info
 add_to_library_groups = X86

diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index c366725..5e8c2d6 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp

@@ -23,7 +23,6 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -97,16 +96,26 @@
   }
 }
 
-/// regionReader - a callback function that wraps the readByte method from
-///   MemoryObject.
+struct Region {
+  ArrayRef<uint8_t> Bytes;
+  uint64_t Base;
+  Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {}
+};
+
+/// A callback function that wraps the readByte method from Region.
 ///
-/// @param arg      - The generic callback parameter.  In this case, this should
-///                   be a pointer to a MemoryObject.
-/// @param byte     - A pointer to the byte to be read.
-/// @param address  - The address to be read.
-static int regionReader(const void* arg, uint8_t* byte, uint64_t address) {
-  const MemoryObject* region = static_cast<const MemoryObject*>(arg);
-  return region->readByte(address, byte);
+/// @param Arg      - The generic callback parameter.  In this case, this should
+///                   be a pointer to a Region.
+/// @param Byte     - A pointer to the byte to be read.
+/// @param Address  - The address to be read.
+static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) {
+  auto *R = static_cast<const Region *>(Arg);
+  ArrayRef<uint8_t> Bytes = R->Bytes;
+  unsigned Index = Address - R->Base;
+  if (Bytes.size() <= Index)
+    return -1;
+  *Byte = Bytes[Index];
+  return 0;
 }
 
 /// logger - a callback function that wraps the operator<< method from
@@ -127,38 +136,29 @@
 // Public interface for the disassembler
 //
 
-MCDisassembler::DecodeStatus
-X86GenericDisassembler::getInstruction(MCInst &instr,
-                                       uint64_t &size,
-                                       const MemoryObject &region,
-                                       uint64_t address,
-                                       raw_ostream &vStream,
-                                       raw_ostream &cStream) const {
-  CommentStream = &cStream;
+MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
+    MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream &VStream, raw_ostream &CStream) const {
+  CommentStream = &CStream;
 
-  InternalInstruction internalInstr;
+  InternalInstruction InternalInstr;
 
-  dlog_t loggerFn = logger;
-  if (&vStream == &nulls())
-    loggerFn = nullptr; // Disable logging completely if it's going to nulls().
-  
-  int ret = decodeInstruction(&internalInstr,
-                              regionReader,
-                              (const void*)&region,
-                              loggerFn,
-                              (void*)&vStream,
-                              (const void*)MII.get(),
-                              address,
-                              fMode);
+  dlog_t LoggerFn = logger;
+  if (&VStream == &nulls())
+    LoggerFn = nullptr; // Disable logging completely if it's going to nulls().
 
-  if (ret) {
-    size = internalInstr.readerCursor - address;
+  Region R(Bytes, Address);
+
+  int Ret = decodeInstruction(&InternalInstr, regionReader, (const void *)&R,
+                              LoggerFn, (void *)&VStream,
+                              (const void *)MII.get(), Address, fMode);
+
+  if (Ret) {
+    Size = InternalInstr.readerCursor - Address;
     return Fail;
-  }
-  else {
-    size = internalInstr.length;
-    return (!translateInstruction(instr, internalInstr, this)) ?
-            Success : Fail;
+  } else {
+    Size = InternalInstr.length;
+    return (!translateInstruction(Instr, InternalInstr, this)) ? Success : Fail;
   }
 }
 
@@ -717,7 +717,7 @@
     return false;
   case ENCODING_WRITEMASK:
     return translateMaskRegister(mcInst, insn.writemask);
-  case ENCODING_RM:
+  CASE_ENCODING_RM:
     return translateRM(mcInst, operand, insn, Dis);
   case ENCODING_CB:
   case ENCODING_CW:

diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index 4dc7c29..d7f426b 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h

@@ -71,8 +71,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86DISASSEMBLER_H
-#define X86DISASSEMBLER_H
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H
 
 #include "X86DisassemblerDecoderCommon.h"
 #include "llvm/MC/MCDisassembler.h"
@@ -87,21 +87,17 @@
 
 namespace X86Disassembler {
 
-/// X86GenericDisassembler - Generic disassembler for all X86 platforms.
-///   All each platform class should have to do is subclass the constructor, and
-///   provide a different disassemblerMode value.
+/// Generic disassembler for all X86 platforms. All that each platform class
+/// should have to do is subclass the constructor and provide a different
+/// disassemblerMode value.
 class X86GenericDisassembler : public MCDisassembler {
   std::unique_ptr<const MCInstrInfo> MII;
 public:
-  /// Constructor     - Initializes the disassembler.
-  ///
   X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
                          std::unique_ptr<const MCInstrInfo> MII);
 public:
-
-  /// getInstruction - See MCDisassembler.
   DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
-                              const MemoryObject &region, uint64_t address,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
                               raw_ostream &vStream,
                               raw_ostream &cStream) const override;
 

diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 55587d4..98b3440 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp

@@ -1,4 +1,4 @@
-//===-- X86DisassemblerDecoder.c - Disassembler decoder -------------------===//
+//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,10 +13,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <stdarg.h>   /* for va_*()       */
-#include <stdio.h>    /* for vsnprintf()  */
-#include <stdlib.h>   /* for exit()       */
-#include <string.h>   /* for memset()     */
+#include <cstdarg>   /* for va_*()       */
+#include <cstdio>    /* for vsnprintf()  */
+#include <cstdlib>   /* for exit()       */
+#include <cstring>   /* for memset()     */
 
 #include "X86DisassemblerDecoder.h"
 
@@ -472,8 +472,7 @@
     if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
        ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
       insn->vectorExtensionType = TYPE_EVEX;
-    }
-    else {
+    } else {
       unconsumeByte(insn); /* unconsume byte1 */
       unconsumeByte(insn); /* unconsume byte  */
       insn->necessaryPrefixLocation = insn->readerCursor - 2;
@@ -504,8 +503,7 @@
               insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
               insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]);
     }
-  }
-  else if (byte == 0xc4) {
+  } else if (byte == 0xc4) {
     uint8_t byte1;
 
     if (lookAtByte(insn, &byte1)) {
@@ -516,8 +514,7 @@
     if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
       insn->vectorExtensionType = TYPE_VEX_3B;
       insn->necessaryPrefixLocation = insn->readerCursor - 1;
-    }
-    else {
+    } else {
       unconsumeByte(insn);
       insn->necessaryPrefixLocation = insn->readerCursor - 1;
     }
@@ -541,8 +538,7 @@
                 insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
                 insn->vectorExtensionPrefix[2]);
     }
-  }
-  else if (byte == 0xc5) {
+  } else if (byte == 0xc5) {
     uint8_t byte1;
 
     if (lookAtByte(insn, &byte1)) {
@@ -552,8 +548,7 @@
 
     if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
       insn->vectorExtensionType = TYPE_VEX_2B;
-    }
-    else {
+    } else {
       unconsumeByte(insn);
     }
 
@@ -566,8 +561,7 @@
                         | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
       }
 
-      switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1]))
-      {
+      switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
       default:
         break;
       case VEX_PREFIX_66:
@@ -579,8 +573,7 @@
                 insn->vectorExtensionPrefix[0],
                 insn->vectorExtensionPrefix[1]);
     }
-  }
-  else if (byte == 0x8f) {
+  } else if (byte == 0x8f) {
     uint8_t byte1;
 
     if (lookAtByte(insn, &byte1)) {
@@ -591,8 +584,7 @@
     if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
       insn->vectorExtensionType = TYPE_XOP;
       insn->necessaryPrefixLocation = insn->readerCursor - 1;
-    }
-    else {
+    } else {
       unconsumeByte(insn);
       insn->necessaryPrefixLocation = insn->readerCursor - 1;
     }
@@ -612,8 +604,7 @@
                         | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
       }
 
-      switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2]))
-      {
+      switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
       default:
         break;
       case VEX_PREFIX_66:
@@ -625,8 +616,7 @@
                 insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
                 insn->vectorExtensionPrefix[2]);
     }
-  }
-  else {
+  } else {
     if (insn->mode == MODE_64BIT) {
       if ((byte & 0xf0) == 0x40) {
         uint8_t opcodeByte;
@@ -698,8 +688,7 @@
 
   insn->opcodeType = ONEBYTE;
 
-  if (insn->vectorExtensionType == TYPE_EVEX)
-  {
+  if (insn->vectorExtensionType == TYPE_EVEX) {
     switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
     default:
       dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
@@ -715,8 +704,7 @@
       insn->opcodeType = THREEBYTE_3A;
       return consumeByte(insn, &insn->opcode);
     }
-  }
-  else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+  } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
     switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
     default:
       dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
@@ -732,12 +720,10 @@
       insn->opcodeType = THREEBYTE_3A;
       return consumeByte(insn, &insn->opcode);
     }
-  }
-  else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+  } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
     insn->opcodeType = TWOBYTE;
     return consumeByte(insn, &insn->opcode);
-  }
-  else if (insn->vectorExtensionType == TYPE_XOP) {
+  } else if (insn->vectorExtensionType == TYPE_XOP) {
     switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
     default:
       dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
@@ -866,6 +852,22 @@
 }
 
 /*
+ * is64Bit - Determines whether this instruction is a 64-bit instruction.
+ *
+ * @param name - The instruction name, searched for the substring "64"
+ */
+static bool is64Bit(const char* name) {
+  off_t i;
+
+  for (i = 0;; ++i) {
+    if (name[i] == '\0')
+      return false;
+    if (name[i] == '6' && name[i+1] == '4')
+      return true;
+  }
+}
+
+/*
  * getID - Determines the ID of an instruction, consuming the ModR/M byte as
  *   appropriate for extended and escape opcodes.  Determines the attributes and
  *   context for the instruction before doing so.
@@ -911,8 +913,7 @@
         attrMask |= ATTR_EVEXL;
       if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
         attrMask |= ATTR_EVEXL2;
-    }
-    else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+    } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
       switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
       case VEX_PREFIX_66:
         attrMask |= ATTR_OPSIZE;
@@ -927,8 +928,7 @@
 
       if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
         attrMask |= ATTR_VEXL;
-    }
-    else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+    } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
       switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
       case VEX_PREFIX_66:
         attrMask |= ATTR_OPSIZE;
@@ -943,8 +943,7 @@
 
       if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
         attrMask |= ATTR_VEXL;
-    }
-    else if (insn->vectorExtensionType == TYPE_XOP) {
+    } else if (insn->vectorExtensionType == TYPE_XOP) {
       switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
       case VEX_PREFIX_66:
         attrMask |= ATTR_OPSIZE;
@@ -959,12 +958,10 @@
 
       if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
         attrMask |= ATTR_VEXL;
-    }
-    else {
+    } else {
       return -1;
     }
-  }
-  else {
+  } else {
     if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
       attrMask |= ATTR_OPSIZE;
     else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
@@ -1002,6 +999,37 @@
 
   /* The following clauses compensate for limitations of the tables. */
 
+  if (insn->mode != MODE_64BIT &&
+      insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+    /*
+     * The tables can't distinguish between cases where the W-bit is used to
+     * select register size and cases where it's a required part of the opcode.
+     */
+    if ((insn->vectorExtensionType == TYPE_EVEX &&
+         wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
+        (insn->vectorExtensionType == TYPE_VEX_3B &&
+         wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
+        (insn->vectorExtensionType == TYPE_XOP &&
+         wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {
+
+      uint16_t instructionIDWithREXW;
+      if (getIDWithAttrMask(&instructionIDWithREXW,
+                            insn, attrMask | ATTR_REXW)) {
+        insn->instructionID = instructionID;
+        insn->spec = specifierForUID(instructionID);
+        return 0;
+      }
+
+      const char *SpecName = GetInstrName(instructionIDWithREXW, miiArg);
+      // If this is not a 64-bit instruction, switch the opcode.
+      if (!is64Bit(SpecName)) {
+        insn->instructionID = instructionIDWithREXW;
+        insn->spec = specifierForUID(instructionIDWithREXW);
+        return 0;
+      }
+    }
+  }
+
   if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
       !(attrMask & ATTR_OPSIZE)) {
     /*
@@ -1488,7 +1516,7 @@
     if (!valid)
       return -1;
     break;
-  case ENCODING_RM:
+  CASE_ENCODING_RM:
     if (insn->eaBase >= insn->eaRegBase) {
       insn->eaBase = (EABase)fixupRMValue(insn,
                                           (OperandType)op->type,
@@ -1681,11 +1709,14 @@
     case ENCODING_DI:
       break;
     case ENCODING_REG:
-    case ENCODING_RM:
+    CASE_ENCODING_RM:
       if (readModRM(insn))
         return -1;
       if (fixupReg(insn, &Op))
         return -1;
+      // Apply the AVX512 compressed displacement scaling factor.
+      if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+        insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
       break;
     case ENCODING_CB:
     case ENCODING_CW:

diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 8c45402..457b382 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86DISASSEMBLERDECODER_H
-#define X86DISASSEMBLERDECODER_H
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H
 
 #include "X86DisassemblerDecoderCommon.h"
 #include "llvm/ADT/ArrayRef.h"

diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index f59e0b6..bec4f0e 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86DISASSEMBLERDECODERCOMMON_H
-#define X86DISASSEMBLERDECODERCOMMON_H
+#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
+#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H
 
 #include "llvm/Support/DataTypes.h"
 
@@ -265,7 +265,7 @@
   ENUM_ENTRY(IC_EVEX_L2_W_KZ,        3,  "requires EVEX_KZ, L2 and W")               \
   ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ,     4,  "requires EVEX_KZ, L2, W and XS prefix")    \
   ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ,     4,  "requires EVEX_KZ, L2, W and XD prefix")    \
-  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4,  "requires EVEX_KZ, L2, W and OpSize")     
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4,  "requires EVEX_KZ, L2, W and OpSize")
 
 #define ENUM_ENTRY(n, r, d) n,
 enum InstructionContext {
@@ -325,11 +325,26 @@
 };
 #undef ENUM_ENTRY
 
+#define CASE_ENCODING_RM     \
+    case ENCODING_RM:        \
+    case ENCODING_RM_CD2:    \
+    case ENCODING_RM_CD4:    \
+    case ENCODING_RM_CD8:    \
+    case ENCODING_RM_CD16:   \
+    case ENCODING_RM_CD32:   \
+    case ENCODING_RM_CD64
+
 // Physical encodings of instruction operands.
 #define ENCODINGS                                                              \
   ENUM_ENTRY(ENCODING_NONE,   "")                                              \
   ENUM_ENTRY(ENCODING_REG,    "Register operand in ModR/M byte.")              \
   ENUM_ENTRY(ENCODING_RM,     "R/M operand in ModR/M byte.")                   \
+  ENUM_ENTRY(ENCODING_RM_CD2, "R/M operand with CDisp scaling of 2")           \
+  ENUM_ENTRY(ENCODING_RM_CD4, "R/M operand with CDisp scaling of 4")           \
+  ENUM_ENTRY(ENCODING_RM_CD8, "R/M operand with CDisp scaling of 8")           \
+  ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16")          \
+  ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32")          \
+  ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64")          \
   ENUM_ENTRY(ENCODING_VVVV,   "Register operand in VEX.vvvv byte.")            \
   ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.")         \
   ENUM_ENTRY(ENCODING_CB,     "1-byte code offset (possible new CS value)")    \
@@ -438,8 +453,12 @@
   ENUM_ENTRY(TYPE_XMM256,     "32-byte")                                       \
   ENUM_ENTRY(TYPE_XMM512,     "64-byte")                                       \
   ENUM_ENTRY(TYPE_VK1,        "1-bit")                                         \
+  ENUM_ENTRY(TYPE_VK2,        "2-bit")                                         \
+  ENUM_ENTRY(TYPE_VK4,        "4-bit")                                         \
   ENUM_ENTRY(TYPE_VK8,        "8-bit")                                         \
   ENUM_ENTRY(TYPE_VK16,       "16-bit")                                        \
+  ENUM_ENTRY(TYPE_VK32,       "32-bit")                                        \
+  ENUM_ENTRY(TYPE_VK64,       "64-bit")                                        \
   ENUM_ENTRY(TYPE_XMM0,       "Implicit use of XMM0")                          \
   ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand")                      \
   ENUM_ENTRY(TYPE_DEBUGREG,   "Debug register operand")                        \
@@ -481,7 +500,7 @@
 };
 #undef ENUM_ENTRY
 
-static const unsigned X86_MAX_OPERANDS = 5;
+static const unsigned X86_MAX_OPERANDS = 6;
 
 /// Decoding mode for the Intel disassembler.  16-bit, 32-bit, and 64-bit mode
 /// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,

diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index b45b118..b72730c 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp

@@ -45,19 +45,31 @@
   const MCInstrDesc &Desc = MII.get(MI->getOpcode());
   uint64_t TSFlags = Desc.TSFlags;
 
+  // If verbose assembly is enabled, we can print some informative comments.
+  if (CommentStream)
+    HasCustomInstComment =
+        EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+
   if (TSFlags & X86II::LOCK)
     OS << "\tlock\n";
 
+  // Output CALLpcrel32 as "callq" in 64-bit mode.
+  // In Intel annotation it's always emitted as "call".
+  //
+  // TODO: Probably this hack should be redesigned via InstAlias in
+  // InstrInfo.td as soon as Requires clause is supported properly
+  // for InstAlias.
+  if (MI->getOpcode() == X86::CALLpcrel32 &&
+      (getAvailableFeatures() & X86::Mode64Bit) != 0) {
+    OS << "\tcallq\t";
+    printPCRelImm(MI, 0, OS);
+  }
   // Try to print any aliases first.
-  if (!printAliasInstr(MI, OS))
+  else if (!printAliasInstr(MI, OS))
     printInstruction(MI, OS);
 
   // Next always print the annotation.
   printAnnotation(OS, Annot);
-
-  // If verbose assembly is enabled, we can print some informative comments.
-  if (CommentStream)
-    EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
 }
 
 void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
@@ -170,7 +182,11 @@
       << '$' << formatImm((int64_t)Op.getImm())
       << markup(">");
 
-    if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
+    // If there are no instruction-specific comments, add a comment clarifying
+    // the hex value of the immediate operand when it isn't in the range
+    // [-256,255].
+    if (CommentStream && !HasCustomInstComment &&
+        (Op.getImm() > 255 || Op.getImm() < -256))
       *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
 
   } else {

diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 531183b..41be14b 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h

@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86_ATT_INST_PRINTER_H
-#define X86_ATT_INST_PRINTER_H
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 
 namespace llvm {
 
@@ -23,8 +24,11 @@
 class X86ATTInstPrinter final : public MCInstPrinter {
 public:
   X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                    const MCRegisterInfo &MRI)
-    : MCInstPrinter(MAI, MII, MRI) {}
+                    const MCRegisterInfo &MRI, const MCSubtargetInfo &STI)
+    : MCInstPrinter(MAI, MII, MRI) {
+    // Initialize the set of available features.
+    setAvailableFeatures(STI.getFeatureBits());
+  }
 
   void printRegName(raw_ostream &OS, unsigned RegNo) const override;
   void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override;
@@ -129,6 +133,9 @@
   void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemOffset(MI, OpNo, O);
   }
+
+private:
+  bool HasCustomInstComment;
 };
   
 }

diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index baf6507..a8f15e6 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp

@@ -28,13 +28,116 @@
 /// EmitAnyX86InstComments - This function decodes x86 instructions and prints
 /// newline terminated strings to the specified string if desired.  This
 /// information is shown in disassembly dumps when verbose assembly is enabled.
-void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
                                   const char *(*getRegName)(unsigned)) {
   // If this is a shuffle operation, the switch should fill in this state.
   SmallVector<int, 8> ShuffleMask;
   const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
 
   switch (MI->getOpcode()) {
+  default:
+    // Not an instruction for which we can decode comments.
+    return false;
+
+  case X86::BLENDPDrri:
+  case X86::VBLENDPDrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::BLENDPDrmi:
+  case X86::VBLENDPDrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v2f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VBLENDPDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VBLENDPDYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v4f64,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::BLENDPSrri:
+  case X86::VBLENDPSrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::BLENDPSrmi:
+  case X86::VBLENDPSrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v4f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VBLENDPSYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VBLENDPSYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v8f32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::PBLENDWrri:
+  case X86::VPBLENDWrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PBLENDWrmi:
+  case X86::VPBLENDWrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v8i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPBLENDWYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPBLENDWYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v16i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::VPBLENDDrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPBLENDDrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v4i32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::VPBLENDDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPBLENDDYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v8i32,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
   case X86::INSERTPSrr:
   case X86::VINSERTPSrr:
     DestName = getRegName(MI->getOperand(0).getReg());
@@ -60,6 +163,80 @@
     DecodeMOVHLPSMask(2, ShuffleMask);
     break;
 
+  case X86::MOVSLDUPrr:
+  case X86::VMOVSLDUPrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSLDUPrm:
+  case X86::VMOVSLDUPrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask);
+    break;
+
+  case X86::VMOVSHDUPYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VMOVSHDUPYrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask);
+    break;
+
+  case X86::VMOVSLDUPYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VMOVSLDUPYrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask);
+    break;
+
+  case X86::MOVSHDUPrr:
+  case X86::VMOVSHDUPrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSHDUPrm:
+  case X86::VMOVSHDUPrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask);
+    break;
+
+  case X86::PSLLDQri:
+  case X86::VPSLLDQri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSLLDQMask(MVT::v16i8,
+                       MI->getOperand(MI->getNumOperands()-1).getImm(),
+                       ShuffleMask);
+    break;
+
+  case X86::VPSLLDQYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSLLDQMask(MVT::v32i8,
+                       MI->getOperand(MI->getNumOperands()-1).getImm(),
+                       ShuffleMask);
+    break;
+
+  case X86::PSRLDQri:
+  case X86::VPSRLDQri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSRLDQMask(MVT::v16i8,
+                       MI->getOperand(MI->getNumOperands()-1).getImm(),
+                       ShuffleMask);
+    break;
+
+  case X86::VPSRLDQYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodePSRLDQMask(MVT::v32i8,
+                       MI->getOperand(MI->getNumOperands()-1).getImm(),
+                       ShuffleMask);
+    break;
+
   case X86::PALIGNR128rr:
   case X86::VPALIGNR128rr:
     Src1Name = getRegName(MI->getOperand(2).getReg());
@@ -489,54 +666,59 @@
     break;
   }
 
+  // The only comments we decode are shuffles, so give up if we were unable to
+  // decode a shuffle mask.
+  if (ShuffleMask.empty())
+    return false;
 
-  // If this was a shuffle operation, print the shuffle mask.
-  if (!ShuffleMask.empty()) {
-    if (!DestName) DestName = Src1Name;
-    OS << (DestName ? DestName : "mem") << " = ";
+  if (!DestName) DestName = Src1Name;
+  OS << (DestName ? DestName : "mem") << " = ";
 
-    // If the two sources are the same, canonicalize the input elements to be
-    // from the first src so that we get larger element spans.
-    if (Src1Name == Src2Name) {
-      for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
-        if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
-            ShuffleMask[i] >= (int)e)        // From second mask.
-          ShuffleMask[i] -= e;
-      }
-    }
-
-    // The shuffle mask specifies which elements of the src1/src2 fill in the
-    // destination, with a few sentinel values.  Loop through and print them
-    // out.
+  // If the two sources are the same, canonicalize the input elements to be
+  // from the first src so that we get larger element spans.
+  if (Src1Name == Src2Name) {
     for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
-      if (i != 0)
-        OS << ',';
-      if (ShuffleMask[i] == SM_SentinelZero) {
-        OS << "zero";
-        continue;
-      }
-
-      // Otherwise, it must come from src1 or src2.  Print the span of elements
-      // that comes from this src.
-      bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
-      const char *SrcName = isSrc1 ? Src1Name : Src2Name;
-      OS << (SrcName ? SrcName : "mem") << '[';
-      bool IsFirst = true;
-      while (i != e &&
-             (int)ShuffleMask[i] >= 0 &&
-             (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
-        if (!IsFirst)
-          OS << ',';
-        else
-          IsFirst = false;
-        OS << ShuffleMask[i] % ShuffleMask.size();
-        ++i;
-      }
-      OS << ']';
-      --i;  // For loop increments element #.
+      if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+          ShuffleMask[i] >= (int)e)        // From second mask.
+        ShuffleMask[i] -= e;
     }
-    //MI->print(OS, 0);
-    OS << "\n";
   }
 
+  // The shuffle mask specifies which elements of the src1/src2 fill in the
+  // destination, with a few sentinel values.  Loop through and print them
+  // out.
+  for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+    if (i != 0)
+      OS << ',';
+    if (ShuffleMask[i] == SM_SentinelZero) {
+      OS << "zero";
+      continue;
+    }
+
+    // Otherwise, it must come from src1 or src2.  Print the span of elements
+    // that comes from this src.
+    bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+    const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+    OS << (SrcName ? SrcName : "mem") << '[';
+    bool IsFirst = true;
+    while (i != e && (int)ShuffleMask[i] != SM_SentinelZero &&
+           (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+      if (!IsFirst)
+        OS << ',';
+      else
+        IsFirst = false;
+      if (ShuffleMask[i] == SM_SentinelUndef)
+        OS << "u";
+      else
+        OS << ShuffleMask[i] % ShuffleMask.size();
+      ++i;
+    }
+    OS << ']';
+    --i;  // For loop increments element #.
+  }
+  //MI->print(OS, 0);
+  OS << "\n";
+
+  // We successfully added a comment to this instruction.
+  return true;
 }

diff --git a/lib/Target/X86/InstPrinter/X86InstComments.h b/lib/Target/X86/InstPrinter/X86InstComments.h
index 13fdf9a..687581b 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.h
+++ b/lib/Target/X86/InstPrinter/X86InstComments.h

@@ -12,13 +12,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86_INST_COMMENTS_H
-#define X86_INST_COMMENTS_H
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
 
 namespace llvm {
   class MCInst;
   class raw_ostream;
-  void EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+  bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
                               const char *(*getRegName)(unsigned));
 }
 

diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 4d9b481..d082f0b 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86_INTEL_INST_PRINTER_H
-#define X86_INTEL_INST_PRINTER_H
+#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
+#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/Support/raw_ostream.h"

diff --git a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt
index 146d111..b9fdc9c 100644
--- a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = X86Desc
 parent = X86
-required_libraries = MC Object Support X86AsmPrinter X86Info
+required_libraries = MC MCDisassembler Object Support X86AsmPrinter X86Info
 add_to_library_groups = X86

diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 23bca0d..befa6c2 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp

@@ -11,7 +11,6 @@
 #include "MCTargetDesc/X86FixupKinds.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixupKindInfo.h"
@@ -437,10 +436,30 @@
   bool Is64Bit;
 
   unsigned OffsetSize;                   ///< Offset of a "push" instruction.
-  unsigned PushInstrSize;                ///< Size of a "push" instruction.
   unsigned MoveInstrSize;                ///< Size of a "move" instruction.
-  unsigned StackDivide;                  ///< Amount to adjust stack stize by.
+  unsigned StackDivide;                  ///< Amount to adjust stack size by.
 protected:
+  /// \brief Size of a "push" instruction for the given register: 2 for R12-R15, 1 for any other register.
+  unsigned PushInstrSize(unsigned Reg) const {
+    switch (Reg) {
+      case X86::EBX:
+      case X86::ECX:
+      case X86::EDX:
+      case X86::EDI:
+      case X86::ESI:
+      case X86::EBP:
+      case X86::RBX:
+      case X86::RBP:
+        return 1;
+      case X86::R12:
+      case X86::R13:
+      case X86::R14:
+      case X86::R15:
+        return 2;  // NOTE(review): presumably the extra byte is a REX prefix -- confirm.
+    }
+    return 1;  // Registers not listed in the switch are treated as size 1.
+  }
+
   /// \brief Implementation of algorithm to generate the compact unwind encoding
   /// for the CFI instructions.
   uint32_t
@@ -530,7 +549,7 @@
         unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
         SavedRegs[SavedRegIdx++] = Reg;
         StackAdjust += OffsetSize;
-        InstrOffset += PushInstrSize;
+        InstrOffset += PushInstrSize(Reg);
         break;
       }
       }
@@ -724,7 +743,6 @@
     OffsetSize = Is64Bit ? 8 : 4;
     MoveInstrSize = Is64Bit ? 3 : 2;
     StackDivide = Is64Bit ? 8 : 4;
-    PushInstrSize = 1;
   }
 };
 

diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 6aeb1f2..365cf0c 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86BASEINFO_H
-#define X86BASEINFO_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H
 
 #include "X86MCTargetDesc.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -216,7 +216,7 @@
     MO_SECREL
   };
 
-  enum {
+  enum : uint64_t {
     //===------------------------------------------------------------------===//
     // Instruction encodings.  These are the standard/most common forms for X86
     // instructions.
@@ -303,17 +303,18 @@
     //// MRM_XX - A mod/rm byte of exactly 0xXX.
     MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35,
     MRM_C4 = 36, MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39,
-    MRM_CB = 40, MRM_D0 = 41, MRM_D1 = 42, MRM_D4 = 43,
-    MRM_D5 = 44, MRM_D6 = 45, MRM_D8 = 46, MRM_D9 = 47,
-    MRM_DA = 48, MRM_DB = 49, MRM_DC = 50, MRM_DD = 51,
-    MRM_DE = 52, MRM_DF = 53, MRM_E0 = 54, MRM_E1 = 55,
-    MRM_E2 = 56, MRM_E3 = 57, MRM_E4 = 58, MRM_E5 = 59,
-    MRM_E8 = 60, MRM_E9 = 61, MRM_EA = 62, MRM_EB = 63,
-    MRM_EC = 64, MRM_ED = 65, MRM_EE = 66, MRM_F0 = 67,
-    MRM_F1 = 68, MRM_F2 = 69, MRM_F3 = 70, MRM_F4 = 71,
-    MRM_F5 = 72, MRM_F6 = 73, MRM_F7 = 74, MRM_F8 = 75,
-    MRM_F9 = 76, MRM_FA = 77, MRM_FB = 78, MRM_FC = 79,
-    MRM_FD = 80, MRM_FE = 81, MRM_FF = 82,
+    MRM_CB = 40, MRM_CF = 41, MRM_D0 = 42, MRM_D1 = 43,
+    MRM_D4 = 44, MRM_D5 = 45, MRM_D6 = 46, MRM_D7 = 47,
+    MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50, MRM_DB = 51,
+    MRM_DC = 52, MRM_DD = 53, MRM_DE = 54, MRM_DF = 55,
+    MRM_E0 = 56, MRM_E1 = 57, MRM_E2 = 58, MRM_E3 = 59,
+    MRM_E4 = 60, MRM_E5 = 61, MRM_E8 = 62, MRM_E9 = 63,
+    MRM_EA = 64, MRM_EB = 65, MRM_EC = 66, MRM_ED = 67,
+    MRM_EE = 68, MRM_F0 = 69, MRM_F1 = 70, MRM_F2 = 71,
+    MRM_F3 = 72, MRM_F4 = 73, MRM_F5 = 74, MRM_F6 = 75,
+    MRM_F7 = 76, MRM_F8 = 77, MRM_F9 = 78, MRM_FA = 79,
+    MRM_FB = 80, MRM_FC = 81, MRM_FD = 82, MRM_FE = 83,
+    MRM_FF = 84,
 
     FormMask       = 127,
 
@@ -327,8 +328,8 @@
     OpSizeShift = 7,
     OpSizeMask = 0x3 << OpSizeShift,
 
-    OpSize16 = 1,
-    OpSize32 = 2,
+    OpSize16 = 1 << OpSizeShift,
+    OpSize32 = 2 << OpSizeShift,
 
     // AsSize - Set if this instruction requires an operand size prefix (0x67),
     // which most often indicates that the instruction address 16 bit address
@@ -454,51 +455,53 @@
     EncodingMask = 0x3 << EncodingShift,
 
     // VEX - encoding using 0xC4/0xC5
-    VEX = 1,
+    VEX = 1 << EncodingShift,
 
     /// XOP - Opcode prefix used by XOP instructions.
-    XOP = 2,
+    XOP = 2 << EncodingShift,
 
     // VEX_EVEX - Specifies that this instruction use EVEX form which provides
     // syntax support up to 32 512-bit register operands and up to 7 16-bit
     // mask operands as well as source operand data swizzling/memory operand
     // conversion, eviction hint, and rounding mode.
-    EVEX = 3,
+    EVEX = 3 << EncodingShift,
 
     // Opcode
     OpcodeShift   = EncodingShift + 2,
 
-    //===------------------------------------------------------------------===//
-    /// VEX - The opcode prefix used by AVX instructions
-    VEXShift = OpcodeShift + 8,
-
     /// VEX_W - Has a opcode specific functionality, but is used in the same
     /// way as REX_W is for regular SSE instructions.
-    VEX_W       = 1U << 0,
+    VEX_WShift  = OpcodeShift + 8,
+    VEX_W       = 1ULL << VEX_WShift,
 
     /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
     /// address instructions in SSE are represented as 3 address ones in AVX
     /// and the additional register is encoded in VEX_VVVV prefix.
-    VEX_4V      = 1U << 1,
+    VEX_4VShift = VEX_WShift + 1,
+    VEX_4V      = 1ULL << VEX_4VShift,
 
     /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode
     /// operand 3 with VEX.vvvv.
-    VEX_4VOp3   = 1U << 2,
+    VEX_4VOp3Shift = VEX_4VShift + 1,
+    VEX_4VOp3   = 1ULL << VEX_4VOp3Shift,
 
     /// VEX_I8IMM - Specifies that the last register used in a AVX instruction,
     /// must be encoded in the i8 immediate field. This usually happens in
     /// instructions with 4 operands.
-    VEX_I8IMM   = 1U << 3,
+    VEX_I8IMMShift = VEX_4VOp3Shift + 1,
+    VEX_I8IMM   = 1ULL << VEX_I8IMMShift,
 
     /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
     /// instruction uses 256-bit wide registers. This is usually auto detected
     /// if a VR256 register is used, but some AVX instructions also have this
     /// field marked when using a f256 memory references.
-    VEX_L       = 1U << 4,
+    VEX_LShift = VEX_I8IMMShift + 1,
+    VEX_L       = 1ULL << VEX_LShift,
 
     // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
     // prefix. Usually used for scalar instructions. Needed by disassembler.
-    VEX_LIG     = 1U << 5,
+    VEX_LIGShift = VEX_LShift + 1,
+    VEX_LIG     = 1ULL << VEX_LIGShift,
 
     // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field
     // with following encoding:
@@ -509,24 +512,24 @@
     // this will save 1 tsflag bit
 
     // EVEX_K - Set if this instruction requires masking
-    EVEX_K      = 1U << 6,
+    EVEX_KShift = VEX_LIGShift + 1,
+    EVEX_K      = 1ULL << EVEX_KShift,
 
     // EVEX_Z - Set if this instruction has EVEX.Z field set.
-    EVEX_Z      = 1U << 7,
+    EVEX_ZShift = EVEX_KShift + 1,
+    EVEX_Z      = 1ULL << EVEX_ZShift,
 
     // EVEX_L2 - Set if this instruction has EVEX.L' field set.
-    EVEX_L2     = 1U << 8,
+    EVEX_L2Shift = EVEX_ZShift + 1,
+    EVEX_L2     = 1ULL << EVEX_L2Shift,
 
     // EVEX_B - Set if this instruction has EVEX.B field set.
-    EVEX_B      = 1U << 9,
+    EVEX_BShift = EVEX_L2Shift + 1,
+    EVEX_B      = 1ULL << EVEX_BShift,
 
-    // EVEX_CD8E - compressed disp8 form, element-size
-    EVEX_CD8EShift = VEXShift + 10,
-    EVEX_CD8EMask = 3,
-
-    // EVEX_CD8V - compressed disp8 form, vector-width
-    EVEX_CD8VShift = EVEX_CD8EShift + 2,
-    EVEX_CD8VMask = 7,
+    // The scaling factor for the AVX512's 8-bit compressed displacement.
+    CD8_Scale_Shift = EVEX_BShift + 1,
+    CD8_Scale_Mask = 127ULL << CD8_Scale_Shift,
 
     /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
     /// wacky 0x0F 0x0F prefix for 3DNow! instructions.  The manual documents
@@ -534,14 +537,17 @@
     /// storing a classifier in the imm8 field.  To simplify our implementation,
     /// we handle this by storeing the classifier in the opcode field and using
     /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
-    Has3DNow0F0FOpcode = 1U << 15,
+    Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7,
+    Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift,
 
     /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
     /// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
-    MemOp4 = 1U << 16,
+    MemOp4Shift = Has3DNow0F0FOpcodeShift + 1,
+    MemOp4 = 1ULL << MemOp4Shift,
 
     /// Explicitly specified rounding control
-    EVEX_RC = 1U << 17
+    EVEX_RCShift = MemOp4Shift + 1,
+    EVEX_RC = 1ULL << EVEX_RCShift
   };
 
   // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
@@ -643,10 +649,10 @@
   /// counted as one operand.
   ///
   inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
-    bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
-    bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
-    bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
-    
+    bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+    bool HasMemOp4 = TSFlags & X86II::MemOp4;
+    bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+
     switch (TSFlags & X86II::FormMask) {
     default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
     case X86II::Pseudo:
@@ -687,7 +693,7 @@
     case X86II::MRM2m: case X86II::MRM3m:
     case X86II::MRM4m: case X86II::MRM5m:
     case X86II::MRM6m: case X86II::MRM7m: {
-      bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+      bool HasVEX_4V = TSFlags & X86II::VEX_4V;
       unsigned FirstMemOp = 0;
       if (HasVEX_4V)
         ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
@@ -698,20 +704,21 @@
     case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
     case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
     case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
-    case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
-    case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
-    case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
-    case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
-    case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1:
-    case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4:
-    case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9:
-    case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
-    case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0:
-    case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3:
-    case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6:
-    case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9:
-    case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC:
-    case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF:
+    case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
+    case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6:
+    case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9:
+    case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC:
+    case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF:
+    case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2:
+    case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5:
+    case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA:
+    case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED:
+    case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1:
+    case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4:
+    case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7:
+    case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA:
+    case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD:
+    case X86II::MRM_FE: case X86II::MRM_FF:
       return -1;
     }
   }

diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 3fdec87..be6a8e4 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp

@@ -77,7 +77,7 @@
           break;
         case MCSymbolRefExpr::VK_GOTTPOFF:
           Type = ELF::R_X86_64_GOTTPOFF;
-        break;
+          break;
         case MCSymbolRefExpr::VK_TLSGD:
           Type = ELF::R_X86_64_TLSGD;
           break;

diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index 09396b7..4899900 100644
--- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_X86_X86FIXUPKINDS_H
-#define LLVM_X86_X86FIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H
 
 #include "llvm/MC/MCFixup.h"
 

diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 83b2777..5679d63 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp

@@ -72,11 +72,10 @@
   if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6))
     HasWeakDefCanBeHiddenDirective = false;
 
-  // FIXME: this should not depend on the target OS version, but on the ld64
-  // version in use.  From at least >= ld64-97.17 (Xcode 3.2.6) the abs-ified
-  // FDE relocs may be used. We also use them for the ios simulator.
-  DwarfFDESymbolsUseAbsDiff = (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6))
-    || T.isiOS();
+  // Assume ld64 is new enough that the abs-ified FDE relocs may be used
+  // (actually, must, since otherwise the non-extern relocations we produce
+  // overwhelm ld64's tiny little mind and it fails).
+  DwarfFDESymbolsUseAbsDiff = true;
 
   UseIntegratedAssembler = true;
 }
@@ -103,9 +102,6 @@
 
   TextAlignFillValue = 0x90;
 
-  // Set up DWARF directives
-  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
-
   // Debug Information
   SupportsDebugInformation = true;
 
@@ -134,19 +130,14 @@
   return MCBinaryExpr::CreateAdd(Res, Four, Context);
 }
 
-const MCSection *X86ELFMCAsmInfo::
-getNonexecutableStackSection(MCContext &Ctx) const {
-  return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
-                           0, SectionKind::getMetadata());
-}
-
 void X86MCAsmInfoMicrosoft::anchor() { }
 
 X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
   if (Triple.getArch() == Triple::x86_64) {
     PrivateGlobalPrefix = ".L";
     PointerSize = 8;
-    ExceptionsType = ExceptionHandling::WinEH;
+    WinEHEncodingType = WinEH::EncodingType::Itanium;
+    ExceptionsType = ExceptionHandling::ItaniumWinEH;
   }
 
   AssemblerDialect = AsmWriterFlavor;
@@ -165,7 +156,8 @@
   if (Triple.getArch() == Triple::x86_64) {
     PrivateGlobalPrefix = ".L";
     PointerSize = 8;
-    ExceptionsType = ExceptionHandling::WinEH;
+    WinEHEncodingType = WinEH::EncodingType::Itanium;
+    ExceptionsType = ExceptionHandling::ItaniumWinEH;
   } else {
     ExceptionsType = ExceptionHandling::DwarfCFI;
   }

diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index a7509b0..f2f06c3 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86TARGETASMINFO_H
-#define X86TARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
 
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
@@ -39,8 +39,6 @@
     void anchor() override;
   public:
     explicit X86ELFMCAsmInfo(const Triple &Triple);
-    const MCSection *
-    getNonexecutableStackSection(MCContext &Ctx) const override;
   };
 
   class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {

diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 2152b21..31b8e2d 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp

@@ -185,42 +185,21 @@
 /// isCDisp8 - Return true if this signed displacement fits in a 8-bit
 /// compressed dispacement field.
 static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
-  assert((TSFlags & X86II::EncodingMask) >> X86II::EncodingShift == X86II::EVEX &&
+  assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
          "Compressed 8-bit displacement is only valid for EVEX inst.");
 
-  unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask;
-  unsigned CD8V = (TSFlags >> X86II::EVEX_CD8VShift) & X86II::EVEX_CD8VMask;
-
-  if (CD8V == 0 && CD8E == 0) {
+  unsigned CD8_Scale =
+    (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;
+  if (CD8_Scale == 0) {
     CValue = Value;
     return isDisp8(Value);
   }
-  
-  unsigned MemObjSize = 1U << CD8E;
-  if (CD8V & 4) {
-    // Fixed vector length
-    MemObjSize *= 1U << (CD8V & 0x3);
-  } else {
-    // Modified vector length
-    bool EVEX_b = (TSFlags >> X86II::VEXShift) & X86II::EVEX_B;
-    if (!EVEX_b) {
-      unsigned EVEX_LL = ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) ? 1 : 0;
-      EVEX_LL += ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2) ? 2 : 0;
-      assert(EVEX_LL < 3 && "");
 
-      unsigned NumElems = (1U << (EVEX_LL + 4)) / MemObjSize;
-      NumElems /= 1U << (CD8V & 0x3);
-
-      MemObjSize *= NumElems;
-    }
-  }
-
-  unsigned MemObjMask = MemObjSize - 1;
-  assert((MemObjSize & MemObjMask) == 0 && "Invalid memory object size.");
-
-  if (Value & MemObjMask) // Unaligned offset
+  unsigned Mask = CD8_Scale - 1;
+  assert((CD8_Scale & Mask) == 0 && "Invalid memory object size.");
+  if (Value & Mask) // Unaligned offset
     return false;
-  Value /= (int)MemObjSize;
+  Value /= (int)CD8_Scale;
   bool Ret = (Value == (signed char)Value);
 
   if (Ret)
@@ -393,9 +372,7 @@
   const MCOperand &Scale    = MI.getOperand(Op+X86::AddrScaleAmt);
   const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
   unsigned BaseReg = Base.getReg();
-  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
-                           X86II::EncodingShift;
-  bool HasEVEX = (Encoding == X86II::EVEX);
+  bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
 
   // Handle %rip relative addressing.
   if (BaseReg == X86::RIP) {    // [disp32+RIP] in X86-64 mode
@@ -613,13 +590,12 @@
                                            int MemOperand, const MCInst &MI,
                                            const MCInstrDesc &Desc,
                                            raw_ostream &OS) const {
-  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
-                           X86II::EncodingShift;
-  bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
-  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
-  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
-  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
-  bool HasEVEX_RC = (TSFlags >> X86II::VEXShift) & X86II::EVEX_RC;
+  uint64_t Encoding = TSFlags & X86II::EncodingMask;
+  bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+  bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3;
+  bool HasMemOp4 = TSFlags & X86II::MemOp4;
+  bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
 
   // VEX_R: opcode externsion equivalent to REX.R in
   // 1's complement (inverted) form
@@ -700,18 +676,18 @@
 
   bool EncodeRC = false;
 
-  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+  if (TSFlags & X86II::VEX_W)
     VEX_W = 1;
 
-  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+  if (TSFlags & X86II::VEX_L)
     VEX_L = 1;
-  if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2))
+  if (TSFlags & X86II::EVEX_L2)
     EVEX_L2 = 1;
 
-  if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z))
+  if (HasEVEX_K && (TSFlags & X86II::EVEX_Z))
     EVEX_z = 1;
 
-  if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_B))
+  if ((TSFlags & X86II::EVEX_B))
     EVEX_b = 1;
 
   switch (TSFlags & X86II::OpPrefixMask) {
@@ -1129,8 +1105,8 @@
                                         raw_ostream &OS) const {
 
   // Emit the operand size opcode prefix as needed.
-  unsigned char OpSize = (TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift;
-  if (OpSize == (is16BitMode(STI) ? X86II::OpSize32 : X86II::OpSize16))
+  if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32
+                                                         : X86II::OpSize16))
     EmitByte(0x66, CurByte, OS);
 
   switch (TSFlags & X86II::OpPrefixMask) {
@@ -1190,19 +1166,18 @@
   unsigned CurByte = 0;
 
   // Encoding type for this instruction.
-  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
-                           X86II::EncodingShift;
+  uint64_t Encoding = TSFlags & X86II::EncodingMask;
 
   // It uses the VEX.VVVV field?
-  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
-  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
-  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+  bool HasVEX_4V = TSFlags & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3;
+  bool HasMemOp4 = TSFlags & X86II::MemOp4;
   const unsigned MemOp4_I8IMMOperand = 2;
 
   // It uses the EVEX.aaa field?
-  bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
-  bool HasEVEX_RC = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_RC);
-  
+  bool HasEVEX_K = TSFlags & X86II::EVEX_K;
+  bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;
+
   // Determine where the memory operand starts, if present.
   int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
   if (MemoryOperand != -1) MemoryOperand += CurOp;
@@ -1257,7 +1232,7 @@
 
   unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
 
-  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+  if (TSFlags & X86II::Has3DNow0F0FOpcode)
     BaseOpcode = 0x0F;   // Weird 3DNow! encoding.
 
   unsigned SrcRegNum = 0;
@@ -1457,20 +1432,21 @@
   case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
   case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
   case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
-  case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
-  case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
-  case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
-  case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
-  case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1:
-  case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4:
-  case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9:
-  case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
-  case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0:
-  case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3:
-  case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6:
-  case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9:
-  case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC:
-  case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF:
+  case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
+  case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6:
+  case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9:
+  case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC:
+  case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF:
+  case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2:
+  case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5:
+  case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA:
+  case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED:
+  case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1:
+  case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4:
+  case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7:
+  case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA:
+  case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD:
+  case X86II::MRM_FE: case X86II::MRM_FF:
     EmitByte(BaseOpcode, CurByte, OS);
 
     unsigned char MRM;
@@ -1485,11 +1461,13 @@
     case X86II::MRM_C9: MRM = 0xC9; break;
     case X86II::MRM_CA: MRM = 0xCA; break;
     case X86II::MRM_CB: MRM = 0xCB; break;
+    case X86II::MRM_CF: MRM = 0xCF; break;
     case X86II::MRM_D0: MRM = 0xD0; break;
     case X86II::MRM_D1: MRM = 0xD1; break;
     case X86II::MRM_D4: MRM = 0xD4; break;
     case X86II::MRM_D5: MRM = 0xD5; break;
     case X86II::MRM_D6: MRM = 0xD6; break;
+    case X86II::MRM_D7: MRM = 0xD7; break;
     case X86II::MRM_D8: MRM = 0xD8; break;
     case X86II::MRM_D9: MRM = 0xD9; break;
     case X86II::MRM_DA: MRM = 0xDA; break;
@@ -1538,7 +1516,7 @@
   while (CurOp != NumOps && NumOps - CurOp <= 2) {
     // The last source register of a 4 operand instruction in AVX is encoded
     // in bits[7:4] of a immediate byte.
-    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+    if (TSFlags & X86II::VEX_I8IMM) {
       const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
                                                     : CurOp);
       ++CurOp;
@@ -1564,7 +1542,7 @@
     }
   }
 
-  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+  if (TSFlags & X86II::Has3DNow0F0FOpcode)
     EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
 
 #ifndef NDEBUG

diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 5e29e5c..5a9181d 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp

@@ -272,7 +272,8 @@
     MAI = new X86ELFMCAsmInfo(TheTriple);
   } else if (TheTriple.isWindowsMSVCEnvironment()) {
     MAI = new X86MCAsmInfoMicrosoft(TheTriple);
-  } else if (TheTriple.isOSCygMing()) {
+  } else if (TheTriple.isOSCygMing() ||
+             TheTriple.isWindowsItaniumEnvironment()) {
     MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
   } else {
     // The default is ELF.
@@ -350,11 +351,8 @@
 
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCContext &Ctx, MCAsmBackend &MAB,
-                                    raw_ostream &_OS,
-                                    MCCodeEmitter *_Emitter,
-                                    const MCSubtargetInfo &STI,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
+                                    raw_ostream &_OS, MCCodeEmitter *_Emitter,
+                                    const MCSubtargetInfo &STI, bool RelaxAll) {
   Triple TheTriple(TT);
 
   switch (TheTriple.getObjectFormat()) {
@@ -365,7 +363,7 @@
     assert(TheTriple.isOSWindows() && "only Windows COFF is supported");
     return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll);
   case Triple::ELF:
-    return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+    return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
   }
 }
 
@@ -376,7 +374,7 @@
                                              const MCRegisterInfo &MRI,
                                              const MCSubtargetInfo &STI) {
   if (SyntaxVariant == 0)
-    return new X86ATTInstPrinter(MAI, MII, MRI);
+    return new X86ATTInstPrinter(MAI, MII, MRI, STI);
   if (SyntaxVariant == 1)
     return new X86IntelInstPrinter(MAI, MII, MRI);
   return nullptr;

diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index ebe74cf..aef9571 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86MCTARGETDESC_H
-#define X86MCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
 
 #include "llvm/Support/DataTypes.h"
 #include <string>

diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index ead3338..5685a7f 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp

@@ -179,11 +179,14 @@
     if (A_Base == B_Base && A_Base)
       report_fatal_error("unsupported relocation with identical base", false);
 
-    // A subtraction expression where both symbols are undefined is a
+    // A subtraction expression where either symbol is undefined is a
     // non-relocatable expression.
-    if (A->isUndefined() && B->isUndefined())
-      report_fatal_error("unsupported relocation with subtraction expression",
-                         false);
+    if (A->isUndefined() || B->isUndefined()) {
+      StringRef Name = A->isUndefined() ? A->getName() : B->getName();
+      Asm.getContext().FatalError(Fixup.getLoc(),
+        "unsupported relocation with subtraction expression, symbol '" + 
+        Name + "' can not be undefined in a subtraction expression");
+    }
 
     Value += Writer->getSymbolAddress(&A_SD, Layout) -
       (!A_Base ? 0 : Writer->getSymbolAddress(A_Base, Layout));
@@ -572,7 +575,7 @@
       // For external relocations, make sure to offset the fixup value to
       // compensate for the addend of the symbol address, if it was
       // undefined. This occurs with weak definitions, for example.
-      if (!SD->Symbol->isUndefined())
+      if (!SD->getSymbol().isUndefined())
         FixedValue -= Layout.getSymbolOffset(SD);
     } else {
       // The index is the section ordinal (1-based).

diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index 7fa4180..5f1596c 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp

@@ -8,18 +8,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86MCTargetDesc.h"
+#include "llvm/MC/MCWin64EH.h"
 #include "llvm/MC/MCWinCOFFStreamer.h"
 
 using namespace llvm;
 
 namespace {
 class X86WinCOFFStreamer : public MCWinCOFFStreamer {
+  Win64EH::UnwindEmitter EHStreamer;
 public:
   X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE,
                      raw_ostream &OS)
     : MCWinCOFFStreamer(C, AB, *CE, OS) { }
 
   void EmitWinEHHandlerData() override;
+  void EmitWindowsUnwindTables() override;
   void FinishImpl() override;
 };
 
@@ -28,12 +31,18 @@
 
   // We have to emit the unwind info now, because this directive
   // actually switches to the .xdata section!
-  MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo());
+  EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+}
+
+/// Emit the Windows unwind tables for this streamer through EHStreamer.
+/// Nothing is emitted when getNumWinFrameInfos() reports zero frame infos.
+void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
+  if (!getNumWinFrameInfos())
+    return;
+  EHStreamer.Emit(*this);
 }
 
 void X86WinCOFFStreamer::FinishImpl() {
   EmitFrames(nullptr);
-  EmitW64Tables();
+  EmitWindowsUnwindTables();
 
   MCWinCOFFStreamer::FinishImpl();
 }

diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 52d3c01..19a1832 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt

@@ -2,17 +2,6 @@
 // Random ideas for the X86 backend.
 //===---------------------------------------------------------------------===//
 
-This should be one DIV/IDIV instruction, not a libcall:
-
-unsigned test(unsigned long long X, unsigned Y) {
-        return X/Y;
-}
-
-This can be done trivially with a custom legalizer.  What about overflow 
-though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
-
-//===---------------------------------------------------------------------===//
-
 Improvements to the multiply -> shift/add algorithm:
 http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
 
@@ -83,43 +72,6 @@
 
 //===---------------------------------------------------------------------===//
 
-This:
-
-void foo(void);
-void bar(int x, int *P) { 
-  x >>= 2;
-  if (x) 
-    foo();
-  *P = x;
-}
-
-compiles into:
-
-	movq	%rsi, %rbx
-	movl	%edi, %r14d
-	sarl	$2, %r14d
-	testl	%r14d, %r14d
-	je	LBB0_2
-
-Instead of doing an explicit test, we can use the flags off the sar.  This
-occurs in a bigger testcase like this, which is pretty common:
-
-#include <vector>
-int test1(std::vector<int> &X) {
-  int Sum = 0;
-  for (long i = 0, e = X.size(); i != e; ++i)
-    X[i] = 0;
-  return Sum;
-}
-
-//===---------------------------------------------------------------------===//
-
-Only use inc/neg/not instructions on processors where they are faster than
-add/sub/xor.  They are slower on the P4 due to only updating some processor
-flags.
-
-//===---------------------------------------------------------------------===//
-
 The instruction selector sometimes misses folding a load into a compare.  The
 pattern is written as (cmp reg, (load p)).  Because the compare isn't 
 commutative, it is not matched with the load on both sides.  The dag combiner
@@ -303,42 +255,6 @@
 
 //===---------------------------------------------------------------------===//
 
-__builtin_ffs codegen is messy.
-
-int ffs_(unsigned X) { return __builtin_ffs(X); }
-
-llvm produces:
-ffs_:
-        movl    4(%esp), %ecx
-        bsfl    %ecx, %eax
-        movl    $32, %edx
-        cmove   %edx, %eax
-        incl    %eax
-        xorl    %edx, %edx
-        testl   %ecx, %ecx
-        cmove   %edx, %eax
-        ret
-
-vs gcc:
-
-_ffs_:
-        movl    $-1, %edx
-        bsfl    4(%esp), %eax
-        cmove   %edx, %eax
-        addl    $1, %eax
-        ret
-
-Another example of __builtin_ffs (use predsimplify to eliminate a select):
-
-int foo (unsigned long j) {
-  if (j)
-    return __builtin_ffs (j) - 1;
-  else
-    return 0;
-}
-
-//===---------------------------------------------------------------------===//
-
 It appears gcc place string data with linkonce linkage in
 .section __TEXT,__const_coal,coalesced instead of
 .section __DATA,__const_coal,coalesced.
@@ -466,85 +382,6 @@
 
 //===---------------------------------------------------------------------===//
 
-Use the FLAGS values from arithmetic instructions more.  For example, compile:
-
-int add_zf(int *x, int y, int a, int b) {
-     if ((*x += y) == 0)
-          return a;
-     else
-          return b;
-}
-
-to:
-       addl    %esi, (%rdi)
-       movl    %edx, %eax
-       cmovne  %ecx, %eax
-       ret
-instead of:
-
-_add_zf:
-        addl (%rdi), %esi
-        movl %esi, (%rdi)
-        testl %esi, %esi
-        cmove %edx, %ecx
-        movl %ecx, %eax
-        ret
-
-As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
-without a test instruction.
-
-//===---------------------------------------------------------------------===//
-
-These two functions have identical effects:
-
-unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
-unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
-
-We currently compile them to:
-
-_f:
-        movl 4(%esp), %eax
-        movl %eax, %ecx
-        incl %ecx
-        movl 8(%esp), %edx
-        cmpl %edx, %ecx
-        jne LBB1_2      #UnifiedReturnBlock
-LBB1_1: #cond_true
-        addl $2, %eax
-        ret
-LBB1_2: #UnifiedReturnBlock
-        movl %ecx, %eax
-        ret
-_f2:
-        movl 4(%esp), %eax
-        movl %eax, %ecx
-        incl %ecx
-        cmpl 8(%esp), %ecx
-        sete %cl
-        movzbl %cl, %ecx
-        leal 1(%ecx,%eax), %eax
-        ret
-
-both of which are inferior to GCC's:
-
-_f:
-        movl    4(%esp), %edx
-        leal    1(%edx), %eax
-        addl    $2, %edx
-        cmpl    8(%esp), %eax
-        cmove   %edx, %eax
-        ret
-_f2:
-        movl    4(%esp), %eax
-        addl    $1, %eax
-        xorl    %edx, %edx
-        cmpl    8(%esp), %eax
-        sete    %dl
-        addl    %edx, %eax
-        ret
-
-//===---------------------------------------------------------------------===//
-
 This code:
 
 void test(int X) {
@@ -1398,20 +1235,6 @@
 
 //===---------------------------------------------------------------------===//
 
-These should compile to the same code, but the later codegen's to useless
-instructions on X86. This may be a trivial dag combine (GCC PR7061):
-
-struct s1 { unsigned char a, b; };
-unsigned long f1(struct s1 x) {
-    return x.a + x.b;
-}
-struct s2 { unsigned a: 8, b: 8; };
-unsigned long f2(struct s2 x) {
-    return x.a + x.b;
-}
-
-//===---------------------------------------------------------------------===//
-
 We currently compile this:
 
 define i32 @func1(i32 %v1, i32 %v2) nounwind {

diff --git a/lib/Target/X86/Utils/LLVMBuild.txt b/lib/Target/X86/Utils/LLVMBuild.txt
index fdb886f..de0a30f 100644
--- a/lib/Target/X86/Utils/LLVMBuild.txt
+++ b/lib/Target/X86/Utils/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = X86Utils
 parent = X86
-required_libraries = Support
+required_libraries = Core Support
 add_to_library_groups = X86

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 5f2441c..ba6cbc8 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp

@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86ShuffleDecode.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/CodeGen/MachineValueType.h"
 
 //===----------------------------------------------------------------------===//
@@ -62,6 +63,51 @@
     ShuffleMask.push_back(NElts+i);
 }
 
+/// Decode a MOVSLDUP shuffle: each even-indexed element is written twice, so
+/// the resulting mask has the form <0,0,2,2,...>.
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  const int NumPairs = VT.getVectorNumElements() / 2;
+  for (int Pair = 0; Pair < NumPairs; ++Pair) {
+    // Both slots of the pair read the even (low) element.
+    const int EvenIdx = 2 * Pair;
+    ShuffleMask.push_back(EvenIdx);
+    ShuffleMask.push_back(EvenIdx);
+  }
+}
+
+/// Decode a MOVSHDUP shuffle: each odd-indexed element is written twice, so
+/// the resulting mask has the form <1,1,3,3,...>.
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  const int NumPairs = VT.getVectorNumElements() / 2;
+  for (int Pair = 0; Pair < NumPairs; ++Pair) {
+    // Both slots of the pair read the odd (high) element.
+    const int OddIdx = 2 * Pair + 1;
+    ShuffleMask.push_back(OddIdx);
+    ShuffleMask.push_back(OddIdx);
+  }
+}
+
+/// Decode a PSLLDQ byte shift by \p Imm into a byte-granular shuffle mask.
+/// The shift is applied independently inside each 128-bit lane; the low
+/// \p Imm bytes of every lane become SM_SentinelZero.
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  const unsigned SizeInBits = VT.getSizeInBits();
+  const unsigned NumBytes = SizeInBits / 8;
+  const unsigned LaneCount = SizeInBits / 128;
+  const unsigned BytesPerLane = NumBytes / LaneCount;
+
+  for (unsigned LaneStart = 0; LaneStart < NumBytes;
+       LaneStart += BytesPerLane) {
+    for (unsigned Byte = 0; Byte < BytesPerLane; ++Byte) {
+      if (Byte < Imm) {
+        // Nothing is shifted into this position from inside the lane.
+        ShuffleMask.push_back(SM_SentinelZero);
+        continue;
+      }
+      int SrcByte = Byte - Imm + LaneStart;
+      ShuffleMask.push_back(SrcByte);
+    }
+  }
+}
+
+/// Decode a PSRLDQ byte shift by \p Imm into a byte-granular shuffle mask.
+/// The shift is applied independently inside each 128-bit lane; positions
+/// whose source byte would lie past the end of the lane become
+/// SM_SentinelZero.
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  const unsigned SizeInBits = VT.getSizeInBits();
+  const unsigned NumBytes = SizeInBits / 8;
+  const unsigned LaneCount = SizeInBits / 128;
+  const unsigned BytesPerLane = NumBytes / LaneCount;
+
+  for (unsigned LaneStart = 0; LaneStart < NumBytes;
+       LaneStart += BytesPerLane) {
+    for (unsigned Byte = 0; Byte < BytesPerLane; ++Byte) {
+      const unsigned SrcInLane = Byte + Imm;
+      if (SrcInLane >= BytesPerLane) {
+        ShuffleMask.push_back(SM_SentinelZero);
+      } else {
+        int SrcByte = SrcInLane + LaneStart;
+        ShuffleMask.push_back(SrcByte);
+      }
+    }
+  }
+}
+
 void DecodePALIGNRMask(MVT VT, unsigned Imm,
                        SmallVectorImpl<int> &ShuffleMask) {
   unsigned NumElts = VT.getVectorNumElements();
@@ -207,6 +253,97 @@
   }
 }
 
+/// Decode a PSHUFB control mask held in the IR constant \p C and append one
+/// entry per byte to \p ShuffleMask.
+///
+/// Each entry is either a source byte index (relative to the start of the
+/// vector), SM_SentinelZero when bit 7 of the control byte is set, or
+/// SM_SentinelUndef for an undef operand of a ConstantVector. If \p C is
+/// neither a ConstantDataSequential nor a ConstantVector, nothing is appended.
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
+  assert(MaskTy->getVectorElementType()->isIntegerTy(8) &&
+         "Expected i8 constant mask elements!");
+  int NumElements = MaskTy->getVectorNumElements();
+  // FIXME: Add support for AVX-512.
+  assert((NumElements == 16 || NumElements == 32) &&
+         "Only 128-bit and 256-bit vectors supported!");
+  ShuffleMask.reserve(NumElements);
+
+  // Packed constant data: every element is a concrete integer, so no undef
+  // handling is needed on this path.
+  if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+    assert((unsigned)NumElements == CDS->getNumElements() &&
+           "Constant mask has a different number of elements!");
+
+    for (int i = 0; i < NumElements; ++i) {
+      // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
+      // lane of the vector we're inside.
+      int Base = i < 16 ? 0 : 16;
+      uint64_t Element = CDS->getElementAsInteger(i);
+      // If the high bit (7) of the byte is set, the element is zeroed.
+      if (Element & (1 << 7))
+        ShuffleMask.push_back(SM_SentinelZero);
+      else {
+        // Only the least significant 4 bits of the byte are used.
+        int Index = Base + (Element & 0xf);
+        ShuffleMask.push_back(Index);
+      }
+    }
+  } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+    // General constant vector: operands may individually be undef.
+    assert((unsigned)NumElements == CV->getNumOperands() &&
+           "Constant mask has a different number of elements!");
+
+    for (int i = 0; i < NumElements; ++i) {
+      // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
+      // lane of the vector we're inside.
+      int Base = i < 16 ? 0 : 16;
+      Constant *COp = CV->getOperand(i);
+      if (isa<UndefValue>(COp)) {
+        ShuffleMask.push_back(SM_SentinelUndef);
+        continue;
+      }
+      // Any non-undef operand is expected to be a ConstantInt; cast<> asserts
+      // otherwise.
+      uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+      // If the high bit (7) of the byte is set, the element is zeroed.
+      if (Element & (1 << 7))
+        ShuffleMask.push_back(SM_SentinelZero);
+      else {
+        // Only the least significant 4 bits of the byte are used.
+        int Index = Base + (Element & 0xf);
+        ShuffleMask.push_back(Index);
+      }
+    }
+  }
+}
+
+/// Decode a PSHUFB control mask given as raw per-byte values in \p RawMask.
+/// A raw value equal to SM_SentinelUndef (widened to 64 bits) is passed
+/// through as SM_SentinelUndef; a value with bit 7 set yields SM_SentinelZero;
+/// otherwise the low four bits select a byte inside the current 16-byte lane.
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  const int NumBytes = RawMask.size();
+  for (int Pos = 0; Pos < NumBytes; ++Pos) {
+    const uint64_t Raw = RawMask[Pos];
+
+    if (Raw == (uint64_t)SM_SentinelUndef) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    // If the high bit (7) of the byte is set, the element is zeroed.
+    if (Raw & 0x80) {
+      ShuffleMask.push_back(SM_SentinelZero);
+      continue;
+    }
+
+    // For AVX vectors with 32 bytes the base of the shuffle is the half of
+    // the vector we're inside. Only the low 4 bits of the byte are used.
+    const int LaneBase = Pos >= 16 ? 16 : 0;
+    int Index = LaneBase + (Raw & 0xf);
+    ShuffleMask.push_back(Index);
+  }
+}
+
+/// Decode an immediate blend: for every element, the matching bit of \p Imm
+/// picks between the first operand (index i) and the second operand
+/// (index NumElements + i).
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  const int EltBits = VT.getScalarSizeInBits();
+  const int EltCount = VT.getVectorNumElements();
+  for (int Elt = 0; Elt < EltCount; ++Elt) {
+    // If there are more than 8 elements in the vector, then any immediate blend
+    // mask applies to each 128-bit lane. There can never be more than
+    // 8 elements in a 128-bit lane with an immediate blend.
+    int ImmBit = Elt;
+    if (EltCount > 8)
+      ImmBit = Elt % (128 / EltBits);
+    assert(ImmBit < 8 &&
+           "Immediate blends only operate over 8 elements at a time!");
+    const bool FromSecondOperand = (Imm >> ImmBit) & 1;
+    ShuffleMask.push_back(FromSecondOperand ? EltCount + Elt : Elt);
+  }
+}
+
 /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
 /// No VT provided since it only works on 256-bit, 4 element vectors.
 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
@@ -215,4 +352,44 @@
   }
 }
 
+/// Decode a variable VPERMILPS/VPERMILPD control mask held in the IR constant
+/// \p C and append one entry per element to \p ShuffleMask.
+///
+/// Each entry is a source element index inside the same 128-bit lane as the
+/// destination element, or SM_SentinelUndef for an undef operand of a
+/// ConstantVector. If \p C is neither a ConstantDataSequential nor a
+/// ConstantVector, nothing is appended.
+void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
+  assert(MaskTy->getVectorElementType()->isIntegerTy() &&
+         "Expected integer constant mask elements!");
+  int ElementBits = MaskTy->getScalarSizeInBits();
+  int NumElements = MaskTy->getVectorNumElements();
+  assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+         "Unexpected number of vector elements.");
+  ShuffleMask.reserve(NumElements);
+
+  // Extract the in-lane selector from one control element. With 64-bit
+  // elements (VPERMILPD) a lane only holds two elements and the hardware
+  // reads the selector from bit 1; otherwise (VPERMILPS) the low two bits
+  // are used. Masking with 0x3 for 64-bit elements would yield indices that
+  // escape the lane.
+  auto SelectorOf = [ElementBits](uint64_t Element) -> int {
+    return ElementBits == 64 ? ((Element >> 1) & 0x1) : (Element & 0x3);
+  };
+
+  if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+    assert((unsigned)NumElements == CDS->getNumElements() &&
+           "Constant mask has a different number of elements!");
+
+    for (int i = 0; i < NumElements; ++i) {
+      // Index of the first element of the 128-bit lane containing element i.
+      int Base = (i * ElementBits / 128) * (128 / ElementBits);
+      uint64_t Element = CDS->getElementAsInteger(i);
+      int Index = Base + SelectorOf(Element);
+      ShuffleMask.push_back(Index);
+    }
+  } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+    assert((unsigned)NumElements == CV->getNumOperands() &&
+           "Constant mask has a different number of elements!");
+
+    for (int i = 0; i < NumElements; ++i) {
+      // Index of the first element of the 128-bit lane containing element i.
+      int Base = (i * ElementBits / 128) * (128 / ElementBits);
+      Constant *COp = CV->getOperand(i);
+      if (isa<UndefValue>(COp)) {
+        ShuffleMask.push_back(SM_SentinelUndef);
+        continue;
+      }
+      uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
+      int Index = Base + SelectorOf(Element);
+      ShuffleMask.push_back(Index);
+    }
+  }
+}
+
 } // llvm namespace

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 9e75b6b..6ba3c64 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h

@@ -12,21 +12,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86_SHUFFLE_DECODE_H
-#define X86_SHUFFLE_DECODE_H
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
 
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ArrayRef.h"
 
 //===----------------------------------------------------------------------===//
 //  Vector Mask Decoding
 //===----------------------------------------------------------------------===//
 
 namespace llvm {
+class Constant;
 class MVT;
 
-enum {
-  SM_SentinelZero = -1
-};
+/// Sentinel values that the decoders may place in a shuffle mask instead of a
+/// source element index: SM_SentinelUndef marks an undefined element and
+/// SM_SentinelZero marks an element that is zeroed.
+enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
 
 void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
@@ -36,6 +36,14 @@
 // <0,2> or <0,1,4,5>
 void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
 
+/// \brief Decode a MOVSLDUP mask: each even element is duplicated into the
+/// following odd position, e.g. <0,0,2,2>.
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a MOVSHDUP mask: each odd element is duplicated into the
+/// preceding even position, e.g. <1,1,3,3>.
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a PSLLDQ byte shift by \p Imm into a byte-granular shuffle
+/// mask. The shift is applied within each 128-bit lane and vacated bytes are
+/// SM_SentinelZero.
+void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a PSRLDQ byte shift by \p Imm into a byte-granular shuffle
+/// mask. The shift is applied within each 128-bit lane and vacated bytes are
+/// SM_SentinelZero.
+void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
 void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
 void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
@@ -59,6 +67,16 @@
 /// different datatypes and vector widths.
 void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
 
+/// \brief Decode a PSHUFB mask from an IR-level vector constant.
+void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a PSHUFB mask from a raw array of constants such as from
+/// BUILD_VECTOR.
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+                      SmallVectorImpl<int> &ShuffleMask);
+
+/// \brief Decode a BLEND immediate mask into a shuffle mask.
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
 void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
                           SmallVectorImpl<int> &ShuffleMask);
@@ -67,6 +85,9 @@
 /// No VT provided since it only works on 256-bit, 4 element vectors.
 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
+/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+
 } // llvm namespace
 
 #endif

diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index d5522ed..8bd5817 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TARGET_X86_H
-#define TARGET_X86_H
+#ifndef LLVM_LIB_TARGET_X86_X86_H
+#define LLVM_LIB_TARGET_X86_X86_H
 
 #include "llvm/Support/CodeGen.h"
 
@@ -21,13 +21,8 @@
 
 class FunctionPass;
 class ImmutablePass;
-class JITCodeEmitter;
 class X86TargetMachine;
 
-/// createX86AtomicExpandPass - This pass expands atomic operations that cannot
-/// be handled natively in terms of a loop using cmpxchg.
-FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM);
-
 /// createX86ISelDag - This pass converts a legalized DAG into a
 /// X86-specific DAG, ready for instruction scheduling.
 ///
@@ -54,11 +49,6 @@
 /// AVX and SSE.
 FunctionPass *createX86IssueVZeroUpperPass();
 
-/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
-/// to the specified MCE object.
-FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
-                                          JITCodeEmitter &JCE);
-
 /// createX86EmitCodeToMemory - Returns a pass that converts a register
 /// allocated function into raw machine code in a dynamically
 /// allocated chunk of memory.

diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 93f516a..83f55d3 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td

@@ -104,7 +104,15 @@
 def FeaturePFI      : SubtargetFeature<"avx512pf", "HasPFI", "true",
                       "Enable AVX-512 PreFetch Instructions",
                                       [FeatureAVX512]>;
-
+def FeatureDQI     : SubtargetFeature<"avx512dq", "HasDQI", "true",
+                      "Enable AVX-512 Doubleword and Quadword Instructions",
+                                      [FeatureAVX512]>;
+def FeatureBWI     : SubtargetFeature<"avx512bw", "HasBWI", "true",
+                      "Enable AVX-512 Byte and Word Instructions",
+                                      [FeatureAVX512]>;
+def FeatureVLX     : SubtargetFeature<"avx512vl", "HasVLX", "true",
+                      "Enable AVX-512 Vector Length eXtensions",
+                                      [FeatureAVX512]>;
 def FeaturePCLMUL  : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
                          "Enable packed carry-less multiplication instructions",
                                [FeatureSSE2]>;
@@ -149,10 +157,14 @@
 def FeatureSHA     : SubtargetFeature<"sha", "HasSHA", "true",
                                       "Enable SHA instructions",
                                       [FeatureSSE2]>;
+def FeatureSGX     : SubtargetFeature<"sgx", "HasSGX", "true",
+                                      "Support SGX instructions">;
 def FeaturePRFCHW  : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
                                       "Support PRFCHW instructions">;
 def FeatureRDSEED  : SubtargetFeature<"rdseed", "HasRDSEED", "true",
                                       "Support RDSEED instruction">;
+def FeatureSMAP    : SubtargetFeature<"smap", "HasSMAP", "true",
+                                      "Support SMAP instructions">;
 def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
                                      "Use LEA for adjusting the stack pointer">;
 def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
@@ -170,6 +182,10 @@
                                    "LEA instruction with certain arguments is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                    "INC and DEC instructions are slower than ADD and SUB">;
+def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true",
+                            "Use RSQRT* to optimize square root calculations">;
+def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst",
+                          "true", "Use RCP* to optimize division calculations">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -264,8 +280,16 @@
                       FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
                       FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT,
                       FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
-                      FeatureHLE]>;
+                      FeatureHLE, FeatureSlowIncDec]>;
 
+// Broadwell
+def : ProcessorModel<"broadwell", HaswellModel,
+                     [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem,
+                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
+                      FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT,
+                      FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
+                      FeatureHLE, FeatureADX, FeatureRDSEED, FeatureSMAP,
+                      FeatureSlowIncDec]>;
 // KNL
 // FIXME: define KNL model
 def : ProcessorModel<"knl", HaswellModel,
@@ -276,6 +300,17 @@
                       FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
                       FeatureSlowIncDec]>;
 
+// SKX
+// FIXME: define SKX model
+def : ProcessorModel<"skx", HaswellModel,
+                     [FeatureAVX512, FeatureCDI,
+                      FeatureDQI, FeatureBWI, FeatureVLX,
+                      FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
+                      FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+                      FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
+                      FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE,
+                      FeatureSlowIncDec, FeatureSGX]>;
+
 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [Feature3DNow]>;
 def : Proc<"k6-3",            [Feature3DNow]>;
@@ -311,35 +346,42 @@
 def : Proc<"btver1",          [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
                                FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
                                FeatureSlowSHLD]>;
+
 // Jaguar
-def : Proc<"btver2",          [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
-                               FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
-                               FeatureBMI, FeatureF16C, FeatureMOVBE,
-                               FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
+def : ProcessorModel<"btver2", BtVer2Model,
+                     [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
+                      FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
+                      FeatureBMI, FeatureF16C, FeatureMOVBE,
+                      FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD,
+                      FeatureUseSqrtEst, FeatureUseRecipEst]>;
+
 // Bulldozer
 def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                                FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
-                               FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
+                               FeatureAVX, FeatureSSE4A, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureSlowSHLD]>;
 // Piledriver
 def : Proc<"bdver2",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                                FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
-                               FeatureF16C, FeatureLZCNT,
-                               FeaturePOPCNT, FeatureBMI, FeatureTBM,
-                               FeatureFMA, FeatureSlowSHLD]>;
+                               FeatureAVX, FeatureSSE4A, FeatureF16C,
+                               FeatureLZCNT, FeaturePOPCNT, FeatureBMI,
+                               FeatureTBM, FeatureFMA, FeatureSlowSHLD]>;
 
 // Steamroller
 def : Proc<"bdver3",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                                FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
-                               FeatureF16C, FeatureLZCNT,
-                               FeaturePOPCNT, FeatureBMI,  FeatureTBM,
-                               FeatureFMA, FeatureFSGSBase]>;
+                               FeatureAVX, FeatureSSE4A, FeatureF16C,
+                               FeatureLZCNT, FeaturePOPCNT, FeatureBMI,
+                               FeatureTBM, FeatureFMA, FeatureSlowSHLD,
+                               FeatureFSGSBase]>;
 
 // Excavator
 def : Proc<"bdver4",          [FeatureAVX2, FeatureXOP, FeatureFMA4,
                                FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW,
                                FeaturePCLMUL, FeatureF16C, FeatureLZCNT,
                                FeaturePOPCNT, FeatureBMI, FeatureBMI2,
-                               FeatureTBM, FeatureFMA, FeatureFSGSBase]>;
+                               FeatureTBM, FeatureFMA, FeatureSSE4A,
+                               FeatureFSGSBase]>;
 
 def : Proc<"geode",           [Feature3DNowA]>;
 

diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 1dca568..4e5b7b8 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp

@@ -18,6 +18,7 @@
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -29,6 +30,7 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -45,6 +47,8 @@
 /// runOnMachineFunction - Emit the function body.
 ///
 bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SMShadowTracker.startFunction(MF);
+
   SetupMachineFunction(MF);
 
   if (Subtarget->isTargetCOFF()) {
@@ -549,6 +553,28 @@
         4 /*size*/);
 }
 
+/// Return the symbol used to reference constant pool entry \p CPID.
+///
+/// On known-MSVC Windows targets, an ordinary (non machine-specific) constant
+/// whose section is a COFF section carrying a COMDAT symbol is referenced
+/// through that COMDAT symbol. Every other case defers to
+/// AsmPrinter::GetCPISymbol.
+MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
+  if (Subtarget->isTargetKnownWindowsMSVC()) {
+    const MachineConstantPoolEntry &CPE =
+        MF->getConstantPool()->getConstants()[CPID];
+    // Machine constant pool entries do not hold an IR Constant, so only plain
+    // entries are looked up by section.
+    if (!CPE.isMachineConstantPoolEntry()) {
+      SectionKind Kind =
+          CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout());
+      const Constant *C = CPE.Val.ConstVal;
+      if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
+            getObjFileLowering().getSectionForConstant(Kind, C))) {
+        if (MCSymbol *Sym = S->getCOMDATSymbol()) {
+          // Only emit the global attribute while the symbol is still
+          // undefined.
+          if (Sym->isUndefined())
+            OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
+          return Sym;
+        }
+      }
+    }
+  }
+
+  // No COMDAT symbol applies; use the generic constant pool label.
+  return AsmPrinter::GetCPISymbol(CPID);
+}
+
 void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) {
   SmallString<128> Directive;
   raw_svector_ostream OS(Directive);
@@ -703,7 +729,7 @@
     MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
     if (!Stubs.empty()) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getDataLayout();
+      const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
 
       for (const auto &Stub : Stubs) {
         OutStreamer.EmitLabel(Stub.first);
@@ -712,6 +738,8 @@
       }
       Stubs.clear();
     }
+
+    SM.serializeToStackMapSection();
   }
 }
 

diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index e4eef5d..748b948 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h

@@ -7,14 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86ASMPRINTER_H
-#define X86ASMPRINTER_H
+#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
+#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
 
 #include "X86Subtarget.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/Target/TargetMachine.h"
 
+// Implemented in X86MCInstLower.cpp
+namespace {
+  class X86MCInstLower;
+}
+
 namespace llvm {
 class MCStreamer;
 class MCSymbol;
@@ -25,9 +30,63 @@
 
   void GenerateExportDirective(const MCSymbol *Sym, bool IsData);
 
+  // This utility class tracks the length of a stackmap instruction's 'shadow'.
+  // It is used by the X86AsmPrinter to ensure that the stackmap shadow
+  // invariants (i.e. no other stackmaps, patchpoints, or control flow within
+  // the shadow) are met, while outputting a minimal number of NOPs for padding.
+  //
+  // To minimise the number of NOPs used, the shadow tracker counts the number
+  // of instruction bytes output since the last stackmap. Only if there are too
+  // few instruction bytes to cover the shadow are NOPs used for padding.
+  class StackMapShadowTracker {
+  public:
+    StackMapShadowTracker(TargetMachine &TM);
+    ~StackMapShadowTracker();
+    void startFunction(MachineFunction &MF);
+    void count(MCInst &Inst, const MCSubtargetInfo &STI);
+
+    // Called to signal the start of a shadow of RequiredSize bytes.
+    void reset(unsigned RequiredSize) {
+      RequiredShadowSize = RequiredSize;
+      CurrentShadowSize = 0;
+      InShadow = true;
+    }
+
+    // Called before every stackmap/patchpoint, and at the end of basic blocks,
+    // to emit any necessary padding-NOPs.
+    void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI);
+  private:
+    TargetMachine &TM;
+    std::unique_ptr<MCCodeEmitter> CodeEmitter;
+    bool InShadow;
+
+    // RequiredShadowSize holds the length of the shadow specified in the most
+    // recently encountered STACKMAP instruction.
+    // CurrentShadowSize counts the number of bytes encoded since the most
+    // recently encountered STACKMAP, stopping when that number is greater than
+    // or equal to RequiredShadowSize.
+    unsigned RequiredShadowSize, CurrentShadowSize;
+  };
+
+  StackMapShadowTracker SMShadowTracker;
+
+  // All instructions emitted by the X86AsmPrinter should use this helper
+  // method.
+  //
+  // This helper function invokes the SMShadowTracker on each instruction before
+  // outputting it to OutStreamer. This allows the shadow tracker to minimise
+  // the number of NOPs used for stackmap padding.
+  void EmitAndCountInstruction(MCInst &Inst);
+
+  void InsertStackMapShadows(MachineFunction &MF);
+  void LowerSTACKMAP(const MachineInstr &MI);
+  void LowerPATCHPOINT(const MachineInstr &MI);
+
+  void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
+
  public:
   explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer), SM(*this) {
+    : AsmPrinter(TM, Streamer), SM(*this), SMShadowTracker(TM) {
     Subtarget = &TM.getSubtarget<X86Subtarget>();
   }
 
@@ -43,6 +102,10 @@
 
   void EmitInstruction(const MachineInstr *MI) override;
 
+  void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override {
+    SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo());
+  }
+
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        unsigned AsmVariant, const char *ExtraCode,
                        raw_ostream &OS) override;
@@ -50,6 +113,15 @@
                              unsigned AsmVariant, const char *ExtraCode,
                              raw_ostream &OS) override;
 
+  /// \brief Return the symbol for the specified constant pool entry.
+  MCSymbol *GetCPISymbol(unsigned CPID) const override;
+
+  bool doInitialization(Module &M) override {
+    SMShadowTracker.reset(0);
+    SM.reset();
+    return AsmPrinter::doInitialization(M);
+  }
+
   bool runOnMachineFunction(MachineFunction &F) override;
 };
 

diff --git a/lib/Target/X86/X86AtomicExpandPass.cpp b/lib/Target/X86/X86AtomicExpandPass.cpp
deleted file mode 100644
index 61eefbb..0000000
--- a/lib/Target/X86/X86AtomicExpandPass.cpp
+++ /dev/null

@@ -1,287 +0,0 @@
-//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions --0---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a pass (at IR level) to replace atomic instructions which
-// cannot be implemented as a single instruction with cmpxchg-based loops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86TargetMachine.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-atomic-expand"
-
-namespace {
-  class X86AtomicExpandPass : public FunctionPass {
-    const X86TargetMachine *TM;
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    explicit X86AtomicExpandPass(const X86TargetMachine *TM)
-      : FunctionPass(ID), TM(TM) {}
-
-    bool runOnFunction(Function &F) override;
-    bool expandAtomicInsts(Function &F);
-
-    bool needsCmpXchgNb(Type *MemType);
-
-    /// There are four kinds of atomic operations. Two never need expanding:
-    /// cmpxchg is what we expand the others *to*, and loads are easily handled
-    /// by ISelLowering. Atomicrmw and store can need expanding in some
-    /// circumstances.
-    bool shouldExpand(Instruction *Inst);
-
-    /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms
-    /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic.
-    bool shouldExpandStore(StoreInst *SI);
-
-    /// Only some atomicrmw instructions need expanding -- some operations
-    /// (e.g. max) have absolutely no architectural support; some (e.g. or) have
-    /// limited support but can't return the previous value; some (e.g. add)
-    /// have complete support in the instruction set.
-    ///
-    /// Also, naturally, 128-bit operations always need to be expanded.
-    bool shouldExpandAtomicRMW(AtomicRMWInst *AI);
-
-    bool expandAtomicRMW(AtomicRMWInst *AI);
-    bool expandAtomicStore(StoreInst *SI);
-  };
-}
-
-char X86AtomicExpandPass::ID = 0;
-
-FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) {
-  return new X86AtomicExpandPass(TM);
-}
-
-bool X86AtomicExpandPass::runOnFunction(Function &F) {
-  SmallVector<Instruction *, 1> AtomicInsts;
-
-  // Changing control-flow while iterating through it is a bad idea, so gather a
-  // list of all atomic instructions before we start.
-  for (BasicBlock &BB : F)
-    for (Instruction &Inst : BB) {
-      if (isa<AtomicRMWInst>(&Inst) ||
-          (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic()))
-        AtomicInsts.push_back(&Inst);
-    }
-
-  bool MadeChange = false;
-  for (Instruction *Inst : AtomicInsts) {
-    if (!shouldExpand(Inst))
-      continue;
-
-    if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
-      MadeChange |= expandAtomicRMW(AI);
-    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
-      MadeChange |= expandAtomicStore(SI);
-
-    assert(MadeChange && "Atomic inst not expanded when it should be?");
-    Inst->eraseFromParent();
-  }
-
-  return MadeChange;
-}
-
-/// Returns true if operations on the given type will need to use either
-/// cmpxchg8b or cmpxchg16b. This occurs if the type is 1 step up from the
-/// native width, and the instructions are available (otherwise we leave them
-/// alone to become __sync_fetch_and_... calls).
-bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) {
-  const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
-  if (!Subtarget.hasCmpxchg16b())
-    return false;
-
-  unsigned CmpXchgNbWidth = Subtarget.is64Bit() ? 128 : 64;
-
-  unsigned OpWidth = MemType->getPrimitiveSizeInBits();
-  if (OpWidth == CmpXchgNbWidth)
-    return true;
-
-  return false;
-}
-
-
-bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) {
-  const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>();
-  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
-
-  if (needsCmpXchgNb(AI->getType()))
-    return true;
-
-  if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth)
-    return false;
-
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-  switch (Op) {
-  default:
-    llvm_unreachable("Unknown atomic operation");
-  case AtomicRMWInst::Xchg:
-  case AtomicRMWInst::Add:
-  case AtomicRMWInst::Sub:
-    // It's better to use xadd, xsub or xchg for these in all cases.
-    return false;
-  case AtomicRMWInst::Or:
-  case AtomicRMWInst::And:
-  case AtomicRMWInst::Xor:
-    // If the atomicrmw's result isn't actually used, we can just add a "lock"
-    // prefix to a normal instruction for these operations.
-    return !AI->use_empty();
-  case AtomicRMWInst::Nand:
-  case AtomicRMWInst::Max:
-  case AtomicRMWInst::Min:
-  case AtomicRMWInst::UMax:
-  case AtomicRMWInst::UMin:
-    // These always require a non-trivial set of data operations on x86. We must
-    // use a cmpxchg loop.
-    return true;
-  }
-}
-
-bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) {
-  if (needsCmpXchgNb(SI->getValueOperand()->getType()))
-    return true;
-
-  return false;
-}
-
-bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) {
-  if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
-    return shouldExpandAtomicRMW(AI);
-  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
-    return shouldExpandStore(SI);
-  return false;
-}
-
-/// Emit IR to implement the given atomicrmw operation on values in registers,
-/// returning the new value.
-static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
-                              Value *Loaded, Value *Inc) {
-  Value *NewVal;
-  switch (Op) {
-  case AtomicRMWInst::Xchg:
-    return Inc;
-  case AtomicRMWInst::Add:
-    return Builder.CreateAdd(Loaded, Inc, "new");
-  case AtomicRMWInst::Sub:
-    return Builder.CreateSub(Loaded, Inc, "new");
-  case AtomicRMWInst::And:
-    return Builder.CreateAnd(Loaded, Inc, "new");
-  case AtomicRMWInst::Nand:
-    return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
-  case AtomicRMWInst::Or:
-    return Builder.CreateOr(Loaded, Inc, "new");
-  case AtomicRMWInst::Xor:
-    return Builder.CreateXor(Loaded, Inc, "new");
-  case AtomicRMWInst::Max:
-    NewVal = Builder.CreateICmpSGT(Loaded, Inc);
-    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
-  case AtomicRMWInst::Min:
-    NewVal = Builder.CreateICmpSLE(Loaded, Inc);
-    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
-  case AtomicRMWInst::UMax:
-    NewVal = Builder.CreateICmpUGT(Loaded, Inc);
-    return  Builder.CreateSelect(NewVal, Loaded, Inc, "new");
-  case AtomicRMWInst::UMin:
-    NewVal = Builder.CreateICmpULE(Loaded, Inc);
-    return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
-  default:
-    break;
-  }
-  llvm_unreachable("Unknown atomic op");
-}
-
-bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) {
-  AtomicOrdering Order =
-      AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering();
-  Value *Addr = AI->getPointerOperand();
-  BasicBlock *BB = AI->getParent();
-  Function *F = BB->getParent();
-  LLVMContext &Ctx = F->getContext();
-
-  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
-  //
-  // The standard expansion we produce is:
-  //     [...]
-  //     %init_loaded = load atomic iN* %addr
-  //     br label %loop
-  // loop:
-  //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
-  //     %new = some_op iN %loaded, %incr
-  //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
-  //     %new_loaded = extractvalue { iN, i1 } %pair, 0
-  //     %success = extractvalue { iN, i1 } %pair, 1
-  //     br i1 %success, label %atomicrmw.end, label %loop
-  // atomicrmw.end:
-  //     [...]
-  BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
-  BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
-
-  // This grabs the DebugLoc from AI.
-  IRBuilder<> Builder(AI);
-
-  // The split call above "helpfully" added a branch at the end of BB (to the
-  // wrong place), but we want a load. It's easiest to just remove
-  // the branch entirely.
-  std::prev(BB->end())->eraseFromParent();
-  Builder.SetInsertPoint(BB);
-  LoadInst *InitLoaded = Builder.CreateLoad(Addr);
-  InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits());
-  Builder.CreateBr(LoopBB);
-
-  // Start the main loop block now that we've taken care of the preliminaries.
-  Builder.SetInsertPoint(LoopBB);
-  PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded");
-  Loaded->addIncoming(InitLoaded, BB);
-
-  Value *NewVal =
-      performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand());
-
-  Value *Pair = Builder.CreateAtomicCmpXchg(
-      Addr, Loaded, NewVal, Order,
-      AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
-  Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
-  Loaded->addIncoming(NewLoaded, LoopBB);
-
-  Value *Success = Builder.CreateExtractValue(Pair, 1, "success");
-  Builder.CreateCondBr(Success, ExitBB, LoopBB);
-
-  AI->replaceAllUsesWith(NewLoaded);
-
-  return true;
-}
-
-bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) {
-  // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express
-  // this in terms of the usual expansion to "atomicrmw xchg".
-  IRBuilder<> Builder(SI);
-  AtomicOrdering Order =
-      SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering();
-  AtomicRMWInst *AI =
-      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
-                              SI->getValueOperand(), Order);
-
-  // Now we have an appropriate swap instruction, lower it as usual.
-  if (shouldExpandAtomicRMW(AI)) {
-    expandAtomicRMW(AI);
-    AI->eraseFromParent();
-    return true;
-  }
-
-  return AI;
-}

diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
index e76f9fd..0eb2494 100644
--- a/lib/Target/X86/X86CallingConv.h
+++ b/lib/Target/X86/X86CallingConv.h

@@ -12,14 +12,27 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86CALLINGCONV_H
-#define X86CALLINGCONV_H
+#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
+#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H
 
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/IR/CallingConv.h"
 
 namespace llvm {
 
+inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
+                                         MVT &LocVT,
+                                         CCValAssign::LocInfo &LocInfo,
+                                         ISD::ArgFlagsTy &ArgFlags,
+                                         CCState &State) {
+  // Similar to CCPassIndirect (i32 location, Indirect), but also sets inreg.
+  LocVT = MVT::i32;
+  LocInfo = CCValAssign::Indirect;
+  ArgFlags.setInReg();
+  return false; // Not assigned yet: continue the search, but now for i32.
+}
+
+
 inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
                                 CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
                                 CCState &) {

diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 0824d4e..75a2ec0 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td

@@ -14,7 +14,9 @@
 
 /// CCIfSubtarget - Match if the current subtarget has a feature F.
 class CCIfSubtarget<string F, CCAction A>
- : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+    : CCIf<!strconcat("static_cast<const X86Subtarget&>"
+                       "(State.getMachineFunction().getSubtarget()).", F),
+           A>;
 
 //===----------------------------------------------------------------------===//
 // Return Value Calling Conventions
@@ -52,27 +54,27 @@
   // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
   // can only be used by ABI non-compliant code. This vector type is only
   // supported while using the AVX-512 target feature.
-  CCIfType<[v16i32, v8i64, v16f32, v8f64],
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
             CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
 
   // MMX vector types are always returned in MM0. If the target doesn't have
   // MM0, it doesn't support these vector types.
   CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
 
-  // Long double types are always returned in ST0 (even with SSE).
-  CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+  // Long double types are always returned in FP0 (even with SSE).
+  CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>
 ]>;
 
 // X86-32 C return-value convention.
 def RetCC_X86_32_C : CallingConv<[
-  // The X86-32 calling convention returns FP values in ST0, unless marked
+  // The X86-32 calling convention returns FP values in FP0, unless marked
   // with "inreg" (used here to distinguish one kind of reg from another,
   // weirdly; this is really the sse-regparm calling convention) in which
   // case they use XMM0, otherwise it is the same as the common X86 calling
   // conv.
   CCIfInReg<CCIfSubtarget<"hasSSE2()",
     CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
-  CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+  CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>,
   CCDelegateTo<RetCC_X86Common>
 ]>;
 
@@ -122,6 +124,24 @@
   CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
 ]>;
 
+// X86-32 VectorCall return-value convention.
+def RetCC_X86_32_VectorCall : CallingConv<[
+  // f32/f64 and 128-bit vector types are returned in XMM0-XMM3.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit vectors (integer and FP) are returned in YMM0-YMM3.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // 512-bit vectors (integer and FP) are returned in ZMM0-ZMM3.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
+  // Return integers in the standard way.
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
 // X86-64 C return-value convention.
 def RetCC_X86_64_C : CallingConv<[
   // The X86-64 calling convention always returns FP values in XMM0.
@@ -177,6 +197,7 @@
   CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
   // If HiPE, use RetCC_X86_32_HiPE.
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
 
   // Otherwise, use RetCC_X86_32_C.
   CCDelegateTo<RetCC_X86_32_C>
@@ -224,6 +245,7 @@
   CCIfType<[i8, i16], CCPromoteToType<i32>>,
 
   // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>,
   CCIfNest<CCAssignToReg<[R10]>>,
 
   // The first 6 integer arguments are passed in integer registers.
@@ -252,7 +274,7 @@
                                          YMM4, YMM5, YMM6, YMM7]>>>>,
 
   // The first 8 512-bit vector arguments are passed in ZMM registers.
-  CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64],
+  CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
             CCIfSubtarget<"hasAVX512()",
             CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
 
@@ -327,6 +349,25 @@
   CCIfType<[f80], CCAssignToStack<0, 0>>
 ]>;
 
+def CC_X86_Win64_VectorCall : CallingConv<[
+  // The first 6 floating point and vector types of 128 bits or less use
+  // XMM0-XMM5.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
+
+  // 256-bit vectors use YMM registers.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
+
+  // 512-bit vectors use ZMM registers.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+
+  // Delegate to the Win64 C convention to handle the remaining types.
+  CCDelegateTo<CC_X86_Win64_C>
+]>;
+
+
 def CC_X86_64_GHC : CallingConv<[
   // Promote i8/i16/i32 arguments to i64.
   CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
@@ -460,6 +501,30 @@
   CCDelegateTo<CC_X86_32_Common>
 ]>;
 
+def CC_X86_32_VectorCall : CallingConv<[
+  // The first 6 floating point and vector types of 128 bits or less use
+  // XMM0-XMM5.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
+
+  // 256-bit vectors use YMM registers.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
+
+  // 512-bit vectors use ZMM registers.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+
+  // Vector types that did not get a register above are passed indirectly.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
+            v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
+            v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCCustom<"CC_X86_32_VectorCallIndirect">>,
+
+  // Delegate to fastcall to handle integer types.
+  CCDelegateTo<CC_X86_32_FastCall>
+]>;
+
 def CC_X86_32_ThisCall_Common : CallingConv<[
   // The first integer argument is passed in ECX
   CCIfType<[i32], CCAssignToReg<[ECX]>>,
@@ -573,6 +638,7 @@
 // This is the root argument convention for the X86-32 backend.
 def CC_X86_32 : CallingConv<[
   CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
   CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
   CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
@@ -590,6 +656,7 @@
   CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
   CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
   CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
 
   // Mingw64 and native Win64 use Win64 CC
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,

diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
deleted file mode 100644
index a3ae7ee..0000000
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ /dev/null

@@ -1,1498 +0,0 @@
-//===-- X86CodeEmitter.cpp - Convert X86 code to machine code -------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the pass that transforms the X86 machine instructions into
-// relocatable machine code.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrInfo.h"
-#include "X86JITInfo.h"
-#include "X86Relocations.h"
-#include "X86Subtarget.h"
-#include "X86TargetMachine.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOptions.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-emitter"
-
-STATISTIC(NumEmitted, "Number of machine instructions emitted");
-
-namespace {
-  template<class CodeEmitter>
-  class Emitter : public MachineFunctionPass {
-    const X86InstrInfo  *II;
-    const DataLayout    *TD;
-    X86TargetMachine    &TM;
-    CodeEmitter         &MCE;
-    MachineModuleInfo   *MMI;
-    intptr_t PICBaseOffset;
-    bool Is64BitMode;
-    bool IsPIC;
-  public:
-    static char ID;
-    explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
-      : MachineFunctionPass(ID), II(nullptr), TD(nullptr), TM(tm),
-        MCE(mce), PICBaseOffset(0), Is64BitMode(false),
-        IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
-
-    bool runOnMachineFunction(MachineFunction &MF) override;
-
-    const char *getPassName() const override {
-      return "X86 Machine Code Emitter";
-    }
-
-    void emitOpcodePrefix(uint64_t TSFlags, int MemOperand,
-                          const MachineInstr &MI,
-                          const MCInstrDesc *Desc) const;
-
-    void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand,
-                             const MachineInstr &MI,
-                             const MCInstrDesc *Desc) const;
-
-    void emitSegmentOverridePrefix(uint64_t TSFlags,
-                                   int MemOperand,
-                                   const MachineInstr &MI) const;
-
-    void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc);
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.setPreservesAll();
-      AU.addRequired<MachineModuleInfo>();
-      MachineFunctionPass::getAnalysisUsage(AU);
-    }
-
-  private:
-    void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
-    void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
-                           intptr_t Disp = 0, intptr_t PCAdj = 0,
-                           bool Indirect = false);
-    void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
-    void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0,
-                              intptr_t PCAdj = 0);
-    void emitJumpTableAddress(unsigned JTI, unsigned Reloc,
-                              intptr_t PCAdj = 0);
-
-    void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
-                               intptr_t Adj = 0, bool IsPCRel = true);
-
-    void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
-    void emitRegModRMByte(unsigned RegOpcodeField);
-    void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
-    void emitConstant(uint64_t Val, unsigned Size);
-
-    void emitMemModRMByte(const MachineInstr &MI,
-                          unsigned Op, unsigned RegOpcodeField,
-                          intptr_t PCAdj = 0);
-
-    unsigned getX86RegNum(unsigned RegNo) const {
-      const TargetRegisterInfo *TRI = TM.getRegisterInfo();
-      return TRI->getEncodingValue(RegNo) & 0x7;
-    }
-
-    unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
-                                         unsigned OpNum) const;
-  };
-
-template<class CodeEmitter>
-  char Emitter<CodeEmitter>::ID = 0;
-} // end anonymous namespace.
-
-/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
-/// to the specified JITCodeEmitter object.
-FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM,
-                                                JITCodeEmitter &JCE) {
-  return new Emitter<JITCodeEmitter>(TM, JCE);
-}
-
-template<class CodeEmitter>
-bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
-  MMI = &getAnalysis<MachineModuleInfo>();
-  MCE.setModuleInfo(MMI);
-
-  II = TM.getInstrInfo();
-  TD = TM.getDataLayout();
-  Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
-  IsPIC = TM.getRelocationModel() == Reloc::PIC_;
-
-  do {
-    DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n");
-    MCE.startFunction(MF);
-    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
-         MBB != E; ++MBB) {
-      MCE.StartMachineBasicBlock(MBB);
-      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
-           I != E; ++I) {
-        const MCInstrDesc &Desc = I->getDesc();
-        emitInstruction(*I, &Desc);
-        // MOVPC32r is basically a call plus a pop instruction.
-        if (Desc.getOpcode() == X86::MOVPC32r)
-          emitInstruction(*I, &II->get(X86::POP32r));
-        ++NumEmitted;  // Keep track of the # of mi's emitted
-      }
-    }
-  } while (MCE.finishFunction(MF));
-
-  return false;
-}
-
-/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64
-/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
-/// size, and 3) use of X86-64 extended registers.
-static unsigned determineREX(const MachineInstr &MI) {
-  unsigned REX = 0;
-  const MCInstrDesc &Desc = MI.getDesc();
-
-  // Pseudo instructions do not need REX prefix byte.
-  if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
-    return 0;
-  if (Desc.TSFlags & X86II::REX_W)
-    REX |= 1 << 3;
-
-  unsigned NumOps = Desc.getNumOperands();
-  if (NumOps) {
-    bool isTwoAddr = NumOps > 1 &&
-      Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
-
-    // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
-    unsigned i = isTwoAddr ? 1 : 0;
-    for (unsigned e = NumOps; i != e; ++i) {
-      const MachineOperand& MO = MI.getOperand(i);
-      if (MO.isReg()) {
-        unsigned Reg = MO.getReg();
-        if (X86II::isX86_64NonExtLowByteReg(Reg))
-          REX |= 0x40;
-      }
-    }
-
-    switch (Desc.TSFlags & X86II::FormMask) {
-      case X86II::MRMSrcReg: {
-        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
-          REX |= 1 << 2;
-        i = isTwoAddr ? 2 : 1;
-        for (unsigned e = NumOps; i != e; ++i) {
-          const MachineOperand& MO = MI.getOperand(i);
-          if (X86InstrInfo::isX86_64ExtendedReg(MO))
-            REX |= 1 << 0;
-        }
-        break;
-      }
-      case X86II::MRMSrcMem: {
-        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
-          REX |= 1 << 2;
-        unsigned Bit = 0;
-        i = isTwoAddr ? 2 : 1;
-        for (; i != NumOps; ++i) {
-          const MachineOperand& MO = MI.getOperand(i);
-          if (MO.isReg()) {
-            if (X86InstrInfo::isX86_64ExtendedReg(MO))
-              REX |= 1 << Bit;
-            Bit++;
-          }
-        }
-        break;
-      }
-      case X86II::MRMXm:
-      case X86II::MRM0m: case X86II::MRM1m:
-      case X86II::MRM2m: case X86II::MRM3m:
-      case X86II::MRM4m: case X86II::MRM5m:
-      case X86II::MRM6m: case X86II::MRM7m:
-      case X86II::MRMDestMem: {
-        unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
-        i = isTwoAddr ? 1 : 0;
-        if (NumOps > e && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e)))
-          REX |= 1 << 2;
-        unsigned Bit = 0;
-        for (; i != e; ++i) {
-          const MachineOperand& MO = MI.getOperand(i);
-          if (MO.isReg()) {
-            if (X86InstrInfo::isX86_64ExtendedReg(MO))
-              REX |= 1 << Bit;
-            Bit++;
-          }
-        }
-        break;
-      }
-      default: {
-        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
-          REX |= 1 << 0;
-        i = isTwoAddr ? 2 : 1;
-        for (unsigned e = NumOps; i != e; ++i) {
-          const MachineOperand& MO = MI.getOperand(i);
-          if (X86InstrInfo::isX86_64ExtendedReg(MO))
-            REX |= 1 << 2;
-        }
-        break;
-      }
-    }
-  }
-  return REX;
-}
-
-
-/// emitPCRelativeBlockAddress - This method keeps track of the information
-/// necessary to resolve the address of this block later and emits a dummy
-/// value.
-///
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
-  // Remember where this reference was and where it is to so we can
-  // deal with it later.
-  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
-                                             X86::reloc_pcrel_word, MBB));
-  MCE.emitWordLE(0);
-}
-
-/// emitGlobalAddress - Emit the specified address to the code stream assuming
-/// this is part of a "take the address of a global" instruction.
-///
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitGlobalAddress(const GlobalValue *GV,
-                                unsigned Reloc,
-                                intptr_t Disp /* = 0 */,
-                                intptr_t PCAdj /* = 0 */,
-                                bool Indirect /* = false */) {
-  intptr_t RelocCST = Disp;
-  if (Reloc == X86::reloc_picrel_word)
-    RelocCST = PICBaseOffset;
-  else if (Reloc == X86::reloc_pcrel_word)
-    RelocCST = PCAdj;
-  MachineRelocation MR = Indirect
-    ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
-                                           const_cast<GlobalValue *>(GV),
-                                           RelocCST, false)
-    : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
-                               const_cast<GlobalValue *>(GV), RelocCST, false);
-  MCE.addRelocation(MR);
-  // The relocated value will be added to the displacement
-  if (Reloc == X86::reloc_absolute_dword)
-    MCE.emitDWordLE(Disp);
-  else
-    MCE.emitWordLE((int32_t)Disp);
-}
-
-/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
-/// be emitted to the current location in the function, and allow it to be PC
-/// relative.
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
-                                                     unsigned Reloc) {
-  intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0;
-
-  // X86 never needs stubs because instruction selection will always pick
-  // an instruction sequence that is large enough to hold any address
-  // to a symbol.
-  // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall)
-  bool NeedStub = false;
-  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
-                                                 Reloc, ES, RelocCST,
-                                                 0, NeedStub));
-  if (Reloc == X86::reloc_absolute_dword)
-    MCE.emitDWordLE(0);
-  else
-    MCE.emitWordLE(0);
-}
-
-/// emitConstPoolAddress - Arrange for the address of an constant pool
-/// to be emitted to the current location in the function, and allow it to be PC
-/// relative.
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
-                                   intptr_t Disp /* = 0 */,
-                                   intptr_t PCAdj /* = 0 */) {
-  intptr_t RelocCST = 0;
-  if (Reloc == X86::reloc_picrel_word)
-    RelocCST = PICBaseOffset;
-  else if (Reloc == X86::reloc_pcrel_word)
-    RelocCST = PCAdj;
-  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
-                                                    Reloc, CPI, RelocCST));
-  // The relocated value will be added to the displacement
-  if (Reloc == X86::reloc_absolute_dword)
-    MCE.emitDWordLE(Disp);
-  else
-    MCE.emitWordLE((int32_t)Disp);
-}
-
-/// emitJumpTableAddress - Arrange for the address of a jump table to
-/// be emitted to the current location in the function, and allow it to be PC
-/// relative.
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
-                                   intptr_t PCAdj /* = 0 */) {
-  intptr_t RelocCST = 0;
-  if (Reloc == X86::reloc_picrel_word)
-    RelocCST = PICBaseOffset;
-  else if (Reloc == X86::reloc_pcrel_word)
-    RelocCST = PCAdj;
-  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
-                                                    Reloc, JTI, RelocCST));
-  // The relocated value will be added to the displacement
-  if (Reloc == X86::reloc_absolute_dword)
-    MCE.emitDWordLE(0);
-  else
-    MCE.emitWordLE(0);
-}
-
-inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
-                                      unsigned RM) {
-  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
-  return RM | (RegOpcode << 3) | (Mod << 6);
-}
-
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
-                                            unsigned RegOpcodeFld){
-  MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
-}
-
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
-  MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0));
-}
-
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
-                                       unsigned Index,
-                                       unsigned Base) {
-  // SIB byte is in the same format as the ModRMByte...
-  MCE.emitByte(ModRMByte(SS, Index, Base));
-}
-
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
-  // Output the constant in little endian byte order...
-  for (unsigned i = 0; i != Size; ++i) {
-    MCE.emitByte(Val & 255);
-    Val >>= 8;
-  }
-}
-
-/// isDisp8 - Return true if this signed displacement fits in a 8-bit
-/// sign-extended field.
-static bool isDisp8(int Value) {
-  return Value == (signed char)Value;
-}
-
-static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
-                              const TargetMachine &TM) {
-  // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
-  // mechanism as 32-bit mode.
-  if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
-      !TM.getSubtarget<X86Subtarget>().isTargetDarwin())
-    return false;
-
-  // Return true if this is a reference to a stub containing the address of the
-  // global, not the global itself.
-  return isGlobalStubReference(GVOp.getTargetFlags());
-}
-
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
-                                                 int DispVal,
-                                                 intptr_t Adj /* = 0 */,
-                                                 bool IsPCRel /* = true */) {
-  // If this is a simple integer displacement that doesn't require a relocation,
-  // emit it now.
-  if (!RelocOp) {
-    emitConstant(DispVal, 4);
-    return;
-  }
-
-  // Otherwise, this is something that requires a relocation.  Emit it as such
-  // now.
-  unsigned RelocType = Is64BitMode ?
-    (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext)
-    : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
-  if (RelocOp->isGlobal()) {
-    // In 64-bit static small code model, we could potentially emit absolute.
-    // But it's probably not beneficial. If the MCE supports using RIP directly
-    // do it, otherwise fallback to absolute (this is determined by IsPCRel).
-    //  89 05 00 00 00 00     mov    %eax,0(%rip)  # PC-relative
-    //  89 04 25 00 00 00 00  mov    %eax,0x0      # Absolute
-    bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
-    emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(),
-                      Adj, Indirect);
-  } else if (RelocOp->isSymbol()) {
-    emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType);
-  } else if (RelocOp->isCPI()) {
-    emitConstPoolAddress(RelocOp->getIndex(), RelocType,
-                         RelocOp->getOffset(), Adj);
-  } else {
-    assert(RelocOp->isJTI() && "Unexpected machine operand!");
-    emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj);
-  }
-}
-
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
-                                            unsigned Op,unsigned RegOpcodeField,
-                                            intptr_t PCAdj) {
-  const MachineOperand &Op3 = MI.getOperand(Op+3);
-  int DispVal = 0;
-  const MachineOperand *DispForReloc = nullptr;
-
-  // Figure out what sort of displacement we have to handle here.
-  if (Op3.isGlobal()) {
-    DispForReloc = &Op3;
-  } else if (Op3.isSymbol()) {
-    DispForReloc = &Op3;
-  } else if (Op3.isCPI()) {
-    if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
-      DispForReloc = &Op3;
-    } else {
-      DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
-      DispVal += Op3.getOffset();
-    }
-  } else if (Op3.isJTI()) {
-    if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
-      DispForReloc = &Op3;
-    } else {
-      DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
-    }
-  } else {
-    DispVal = Op3.getImm();
-  }
-
-  const MachineOperand &Base     = MI.getOperand(Op);
-  const MachineOperand &Scale    = MI.getOperand(Op+1);
-  const MachineOperand &IndexReg = MI.getOperand(Op+2);
-
-  unsigned BaseReg = Base.getReg();
-
-  // Handle %rip relative addressing.
-  if (BaseReg == X86::RIP ||
-      (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode
-    assert(IndexReg.getReg() == 0 && Is64BitMode &&
-           "Invalid rip-relative address");
-    MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
-    emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
-    return;
-  }
-
-  // Indicate that the displacement will use an pcrel or absolute reference
-  // by default. MCEs able to resolve addresses on-the-fly use pcrel by default
-  // while others, unless explicit asked to use RIP, use absolute references.
-  bool IsPCRel = MCE.earlyResolveAddresses() ? true : false;
-
-  // Is a SIB byte needed?
-  // If no BaseReg, issue a RIP relative instruction only if the MCE can
-  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
-  // 2-7) and absolute references.
-  unsigned BaseRegNo = -1U;
-  if (BaseReg != 0 && BaseReg != X86::RIP)
-    BaseRegNo = getX86RegNum(BaseReg);
-
-  if (// The SIB byte must be used if there is an index register.
-      IndexReg.getReg() == 0 &&
-      // The SIB byte must be used if the base is ESP/RSP/R12, all of which
-      // encode to an R/M value of 4, which indicates that a SIB byte is
-      // present.
-      BaseRegNo != N86::ESP &&
-      // If there is no base register and we're in 64-bit mode, we need a SIB
-      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
-      (!Is64BitMode || BaseReg != 0)) {
-    if (BaseReg == 0 ||          // [disp32]     in X86-32 mode
-        BaseReg == X86::RIP) {   // [disp32+RIP] in X86-64 mode
-      MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
-      emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
-      return;
-    }
-
-    // If the base is not EBP/ESP and there is no displacement, use simple
-    // indirect register encoding, this handles addresses like [EAX].  The
-    // encoding for [EBP] with no displacement means [disp32] so we handle it
-    // by emitting a displacement of 0 below.
-    if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
-      MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
-      return;
-    }
-
-    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
-    if (!DispForReloc && isDisp8(DispVal)) {
-      MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
-      emitConstant(DispVal, 1);
-      return;
-    }
-
-    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
-    MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
-    emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
-    return;
-  }
-
-  // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first.
-  assert(IndexReg.getReg() != X86::ESP &&
-         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
-
-  bool ForceDisp32 = false;
-  bool ForceDisp8  = false;
-  if (BaseReg == 0) {
-    // If there is no base register, we emit the special case SIB byte with
-    // MOD=0, BASE=4, to JUST get the index, scale, and displacement.
-    MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
-    ForceDisp32 = true;
-  } else if (DispForReloc) {
-    // Emit the normal disp32 encoding.
-    MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
-    ForceDisp32 = true;
-  } else if (DispVal == 0 && BaseRegNo != N86::EBP) {
-    // Emit no displacement ModR/M byte
-    MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
-  } else if (isDisp8(DispVal)) {
-    // Emit the disp8 encoding...
-    MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
-    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
-  } else {
-    // Emit the normal disp32 encoding...
-    MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
-  }
-
-  // Calculate what the SS field value should be...
-  static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
-  unsigned SS = SSTable[Scale.getImm()];
-
-  if (BaseReg == 0) {
-    // Handle the SIB byte for the case where there is no base, see Intel
-    // Manual 2A, table 2-7. The displacement has already been output.
-    unsigned IndexRegNo;
-    if (IndexReg.getReg())
-      IndexRegNo = getX86RegNum(IndexReg.getReg());
-    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
-      IndexRegNo = 4;
-    emitSIBByte(SS, IndexRegNo, 5);
-  } else {
-    unsigned BaseRegNo = getX86RegNum(BaseReg);
-    unsigned IndexRegNo;
-    if (IndexReg.getReg())
-      IndexRegNo = getX86RegNum(IndexReg.getReg());
-    else
-      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
-    emitSIBByte(SS, IndexRegNo, BaseRegNo);
-  }
-
-  // Do we need to output a displacement?
-  if (ForceDisp8) {
-    emitConstant(DispVal, 1);
-  } else if (DispVal != 0 || ForceDisp32) {
-    emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
-  }
-}
-
-static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II,
-                                   unsigned Opcode) {
-  const MCInstrDesc *Desc = &II->get(Opcode);
-  MI.setDesc(*Desc);
-  return Desc;
-}
-
-/// Is16BitMemOperand - Return true if the specified instruction has
-/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
-static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) {
-  const MachineOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
-  const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
-
-  if ((BaseReg.getReg() != 0 &&
-       X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
-      (IndexReg.getReg() != 0 &&
-       X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
-    return true;
-  return false;
-}
-
-/// Is32BitMemOperand - Return true if the specified instruction has
-/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
-static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) {
-  const MachineOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
-  const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
-
-  if ((BaseReg.getReg() != 0 &&
-       X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
-      (IndexReg.getReg() != 0 &&
-       X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
-    return true;
-  return false;
-}
-
-/// Is64BitMemOperand - Return true if the specified instruction has
-/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
-#ifndef NDEBUG
-static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) {
-  const MachineOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
-  const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
-
-  if ((BaseReg.getReg() != 0 &&
-       X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
-      (IndexReg.getReg() != 0 &&
-       X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
-    return true;
-  return false;
-}
-#endif
-
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
-                                            int MemOperand,
-                                            const MachineInstr &MI,
-                                            const MCInstrDesc *Desc) const {
-  // Emit the operand size opcode prefix as needed.
-  if (((TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift) == X86II::OpSize16)
-    MCE.emitByte(0x66);
-
-  switch (Desc->TSFlags & X86II::OpPrefixMask) {
-  case X86II::PD:   // 66
-    MCE.emitByte(0x66);
-    break;
-  case X86II::XS:   // F3
-    MCE.emitByte(0xF3);
-    break;
-  case X86II::XD:   // F2
-    MCE.emitByte(0xF2);
-    break;
-  }
-
-  // Handle REX prefix.
-  if (Is64BitMode) {
-    if (unsigned REX = determineREX(MI))
-      MCE.emitByte(0x40 | REX);
-  }
-
-  // 0x0F escape code must be emitted just before the opcode.
-  switch (Desc->TSFlags & X86II::OpMapMask) {
-  case X86II::TB:  // Two-byte opcode map
-  case X86II::T8:  // 0F 38
-  case X86II::TA:  // 0F 3A
-    MCE.emitByte(0x0F);
-    break;
-  }
-
-  switch (Desc->TSFlags & X86II::OpMapMask) {
-  case X86II::T8:    // 0F 38
-    MCE.emitByte(0x38);
-    break;
-  case X86II::TA:    // 0F 3A
-    MCE.emitByte(0x3A);
-    break;
-  }
-}
-
-// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
-// 0-7 and the difference between the 2 groups is given by the REX prefix.
-// In the VEX prefix, registers are seen sequencially from 0-15 and encoded
-// in 1's complement form, example:
-//
-//  ModRM field => XMM9 => 1
-//  VEX.VVVV    => XMM9 => ~9
-//
-// See table 4-35 of Intel AVX Programming Reference for details.
-template<class CodeEmitter>
-unsigned char
-Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI,
-                                             unsigned OpNum) const {
-  unsigned SrcReg = MI.getOperand(OpNum).getReg();
-  unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg());
-  if (X86II::isX86_64ExtendedReg(SrcReg))
-    SrcRegNum |= 8;
-
-  // The registers represented through VEX_VVVV should
-  // be encoded in 1's complement form.
-  return (~SrcRegNum) & 0xf;
-}
-
-/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags,
-                                                 int MemOperand,
-                                                 const MachineInstr &MI) const {
-  if (MemOperand < 0)
-    return; // No memory operand
-
-  // Check for explicit segment override on memory operand.
-  switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
-  default: llvm_unreachable("Unknown segment register!");
-  case 0: break;
-  case X86::CS: MCE.emitByte(0x2E); break;
-  case X86::SS: MCE.emitByte(0x36); break;
-  case X86::DS: MCE.emitByte(0x3E); break;
-  case X86::ES: MCE.emitByte(0x26); break;
-  case X86::FS: MCE.emitByte(0x64); break;
-  case X86::GS: MCE.emitByte(0x65); break;
-  }
-}
-
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
-                                               int MemOperand,
-                                               const MachineInstr &MI,
-                                               const MCInstrDesc *Desc) const {
-  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
-                           X86II::EncodingShift;
-  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
-  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
-  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
-
-  // VEX_R: opcode externsion equivalent to REX.R in
-  // 1's complement (inverted) form
-  //
-  //  1: Same as REX_R=0 (must be 1 in 32-bit mode)
-  //  0: Same as REX_R=1 (64 bit mode only)
-  //
-  unsigned char VEX_R = 0x1;
-
-  // VEX_X: equivalent to REX.X, only used when a
-  // register is used for index in SIB Byte.
-  //
-  //  1: Same as REX.X=0 (must be 1 in 32-bit mode)
-  //  0: Same as REX.X=1 (64-bit mode only)
-  unsigned char VEX_X = 0x1;
-
-  // VEX_B:
-  //
-  //  1: Same as REX_B=0 (ignored in 32-bit mode)
-  //  0: Same as REX_B=1 (64 bit mode only)
-  //
-  unsigned char VEX_B = 0x1;
-
-  // VEX_W: opcode specific (use like REX.W, or used for
-  // opcode extension, or ignored, depending on the opcode byte)
-  unsigned char VEX_W = 0;
-
-  // VEX_5M (VEX m-mmmmm field):
-  //
-  //  0b00000: Reserved for future use
-  //  0b00001: implied 0F leading opcode
-  //  0b00010: implied 0F 38 leading opcode bytes
-  //  0b00011: implied 0F 3A leading opcode bytes
-  //  0b00100-0b11111: Reserved for future use
-  //  0b01000: XOP map select - 08h instructions with imm byte
-  //  0b01001: XOP map select - 09h instructions with no imm byte
-  //  0b01010: XOP map select - 0Ah instructions with imm dword
-  unsigned char VEX_5M = 0;
-
-  // VEX_4V (VEX vvvv field): a register specifier
-  // (in 1's complement form) or 1111 if unused.
-  unsigned char VEX_4V = 0xf;
-
-  // VEX_L (Vector Length):
-  //
-  //  0: scalar or 128-bit vector
-  //  1: 256-bit vector
-  //
-  unsigned char VEX_L = 0;
-
-  // VEX_PP: opcode extension providing equivalent
-  // functionality of a SIMD prefix
-  //
-  //  0b00: None
-  //  0b01: 66
-  //  0b10: F3
-  //  0b11: F2
-  //
-  unsigned char VEX_PP = 0;
-
-  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
-    VEX_W = 1;
-
-  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
-    VEX_L = 1;
-
-  switch (TSFlags & X86II::OpPrefixMask) {
-  default: break; // VEX_PP already correct
-  case X86II::PD: VEX_PP = 0x1; break; // 66
-  case X86II::XS: VEX_PP = 0x2; break; // F3
-  case X86II::XD: VEX_PP = 0x3; break; // F2
-  }
-
-  switch (TSFlags & X86II::OpMapMask) {
-  default: llvm_unreachable("Invalid prefix!");
-  case X86II::TB:   VEX_5M = 0x1; break; // 0F
-  case X86II::T8:   VEX_5M = 0x2; break; // 0F 38
-  case X86II::TA:   VEX_5M = 0x3; break; // 0F 3A
-  case X86II::XOP8: VEX_5M = 0x8; break;
-  case X86II::XOP9: VEX_5M = 0x9; break;
-  case X86II::XOPA: VEX_5M = 0xA; break;
-  }
-
-  // Classify VEX_B, VEX_4V, VEX_R, VEX_X
-  unsigned NumOps = Desc->getNumOperands();
-  unsigned CurOp = 0;
-  if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
-    ++CurOp;
-  else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
-    assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
-    // Special case for GATHER with 2 TIED_TO operands
-    // Skip the first 2 operands: dst, mask_wb
-    CurOp += 2;
-  }
-
-  switch (TSFlags & X86II::FormMask) {
-    default: llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!");
-    case X86II::RawFrm:
-      break;
-    case X86II::MRMDestMem: {
-      // MRMDestMem instructions forms:
-      //  MemAddr, src1(ModR/M)
-      //  MemAddr, src1(VEX_4V), src2(ModR/M)
-      //  MemAddr, src1(ModR/M), imm8
-      //
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
-        VEX_B = 0x0;
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
-        VEX_X = 0x0;
-
-      CurOp = X86::AddrNumOperands;
-      if (HasVEX_4V)
-        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
-
-      const MachineOperand &MO = MI.getOperand(CurOp);
-      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
-        VEX_R = 0x0;
-      break;
-    }
-    case X86II::MRMSrcMem:
-      // MRMSrcMem instructions forms:
-      //  src1(ModR/M), MemAddr
-      //  src1(ModR/M), src2(VEX_4V), MemAddr
-      //  src1(ModR/M), MemAddr, imm8
-      //  src1(ModR/M), MemAddr, src2(VEX_I8IMM)
-      //
-      //  FMA4:
-      //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
-      //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
-        VEX_R = 0x0;
-      CurOp++;
-
-      if (HasVEX_4V) {
-        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
-        CurOp++;
-      }
-
-      if (X86II::isX86_64ExtendedReg(
-                          MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
-        VEX_B = 0x0;
-      if (X86II::isX86_64ExtendedReg(
-                          MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
-        VEX_X = 0x0;
-
-      if (HasVEX_4VOp3)
-        VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
-      break;
-    case X86II::MRM0m: case X86II::MRM1m:
-    case X86II::MRM2m: case X86II::MRM3m:
-    case X86II::MRM4m: case X86II::MRM5m:
-    case X86II::MRM6m: case X86II::MRM7m: {
-      // MRM[0-9]m instructions forms:
-      //  MemAddr
-      //  src1(VEX_4V), MemAddr
-      if (HasVEX_4V)
-        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
-
-      if (X86II::isX86_64ExtendedReg(
-                          MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
-        VEX_B = 0x0;
-      if (X86II::isX86_64ExtendedReg(
-                          MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
-        VEX_X = 0x0;
-      break;
-    }
-    case X86II::MRMSrcReg:
-      // MRMSrcReg instructions forms:
-      //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
-      //  dst(ModR/M), src1(ModR/M)
-      //  dst(ModR/M), src1(ModR/M), imm8
-      //
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
-        VEX_R = 0x0;
-      CurOp++;
-
-      if (HasVEX_4V)
-        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
-
-      if (HasMemOp4) // Skip second register source (encoded in I8IMM)
-        CurOp++;
-
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
-        VEX_B = 0x0;
-      CurOp++;
-      if (HasVEX_4VOp3)
-        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
-      break;
-    case X86II::MRMDestReg:
-      // MRMDestReg instructions forms:
-      //  dst(ModR/M), src(ModR/M)
-      //  dst(ModR/M), src(ModR/M), imm8
-      //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
-        VEX_B = 0x0;
-      CurOp++;
-
-      if (HasVEX_4V)
-        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
-
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
-        VEX_R = 0x0;
-      break;
-    case X86II::MRM0r: case X86II::MRM1r:
-    case X86II::MRM2r: case X86II::MRM3r:
-    case X86II::MRM4r: case X86II::MRM5r:
-    case X86II::MRM6r: case X86II::MRM7r:
-      // MRM0r-MRM7r instructions forms:
-      //  dst(VEX_4V), src(ModR/M), imm8
-      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
-      CurOp++;
-
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
-        VEX_B = 0x0;
-      break;
-  }
-
-  // Emit segment override opcode prefix as needed.
-  emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
-
-  // VEX opcode prefix can have 2 or 3 bytes
-  //
-  //  3 bytes:
-  //    +-----+ +--------------+ +-------------------+
-  //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
-  //    +-----+ +--------------+ +-------------------+
-  //  2 bytes:
-  //    +-----+ +-------------------+
-  //    | C5h | | R | vvvv | L | pp |
-  //    +-----+ +-------------------+
-  //
-  //  XOP uses a similar prefix:
-  //    +-----+ +--------------+ +-------------------+
-  //    | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp |
-  //    +-----+ +--------------+ +-------------------+
-  unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
-
-  // Can this use the 2 byte VEX prefix?
-  if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
-    MCE.emitByte(0xC5);
-    MCE.emitByte(LastByte | (VEX_R << 7));
-    return;
-  }
-
-  // 3 byte VEX prefix
-  MCE.emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4);
-  MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M);
-  MCE.emitByte(LastByte | (VEX_W << 7));
-}
-
-template<class CodeEmitter>
-void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
-                                           const MCInstrDesc *Desc) {
-  DEBUG(dbgs() << MI);
-
-  // If this is a pseudo instruction, lower it.
-  switch (Desc->getOpcode()) {
-  case X86::ADD16rr_DB:      Desc = UpdateOp(MI, II, X86::OR16rr); break;
-  case X86::ADD32rr_DB:      Desc = UpdateOp(MI, II, X86::OR32rr); break;
-  case X86::ADD64rr_DB:      Desc = UpdateOp(MI, II, X86::OR64rr); break;
-  case X86::ADD16ri_DB:      Desc = UpdateOp(MI, II, X86::OR16ri); break;
-  case X86::ADD32ri_DB:      Desc = UpdateOp(MI, II, X86::OR32ri); break;
-  case X86::ADD64ri32_DB:    Desc = UpdateOp(MI, II, X86::OR64ri32); break;
-  case X86::ADD16ri8_DB:     Desc = UpdateOp(MI, II, X86::OR16ri8); break;
-  case X86::ADD32ri8_DB:     Desc = UpdateOp(MI, II, X86::OR32ri8); break;
-  case X86::ADD64ri8_DB:     Desc = UpdateOp(MI, II, X86::OR64ri8); break;
-  case X86::ACQUIRE_MOV8rm:  Desc = UpdateOp(MI, II, X86::MOV8rm); break;
-  case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break;
-  case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break;
-  case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break;
-  case X86::RELEASE_MOV8mr:  Desc = UpdateOp(MI, II, X86::MOV8mr); break;
-  case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break;
-  case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break;
-  case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break;
-  }
-
-
-  MCE.processDebugLoc(MI.getDebugLoc(), true);
-
-  unsigned Opcode = Desc->Opcode;
-
-  // If this is a two-address instruction, skip one of the register operands.
-  unsigned NumOps = Desc->getNumOperands();
-  unsigned CurOp = 0;
-  if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
-    ++CurOp;
-  else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
-    assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
-    // Special case for GATHER with 2 TIED_TO operands
-    // Skip the first 2 operands: dst, mask_wb
-    CurOp += 2;
-  }
-
-  uint64_t TSFlags = Desc->TSFlags;
-
-  // Encoding type for this instruction.
-  unsigned char Encoding = (TSFlags & X86II::EncodingMask) >>
-                           X86II::EncodingShift;
-
-  // It uses the VEX.VVVV field?
-  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
-  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
-  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
-  const unsigned MemOp4_I8IMMOperand = 2;
-
-  // Determine where the memory operand starts, if present.
-  int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
-  if (MemoryOperand != -1) MemoryOperand += CurOp;
-
-  // Emit the lock opcode prefix as needed.
-  if (Desc->TSFlags & X86II::LOCK)
-    MCE.emitByte(0xF0);
-
-  // Emit segment override opcode prefix as needed.
-  emitSegmentOverridePrefix(TSFlags, MemoryOperand, MI);
-
-  // Emit the repeat opcode prefix as needed.
-  if (Desc->TSFlags & X86II::REP)
-    MCE.emitByte(0xF3);
-
-  // Emit the address size opcode prefix as needed.
-  bool need_address_override;
-  if (TSFlags & X86II::AdSize) {
-    need_address_override = true;
-  } else if (MemoryOperand < 0) {
-    need_address_override = false;
-  } else if (Is64BitMode) {
-    assert(!Is16BitMemOperand(MI, MemoryOperand));
-    need_address_override = Is32BitMemOperand(MI, MemoryOperand);
-  } else {
-    assert(!Is64BitMemOperand(MI, MemoryOperand));
-    need_address_override = Is16BitMemOperand(MI, MemoryOperand);
-  }
-
-  if (need_address_override)
-    MCE.emitByte(0x67);
-
-  if (Encoding == 0)
-    emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
-  else
-    emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
-
-  unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags);
-  switch (TSFlags & X86II::FormMask) {
-  default:
-    llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
-  case X86II::Pseudo:
-    // Remember the current PC offset, this is the PIC relocation
-    // base address.
-    switch (Opcode) {
-    default:
-      llvm_unreachable("pseudo instructions should be removed before code"
-                       " emission");
-    // Do nothing for Int_MemBarrier - it's just a comment.  Add a debug
-    // to make it slightly easier to see.
-    case X86::Int_MemBarrier:
-      DEBUG(dbgs() << "#MEMBARRIER\n");
-      break;
-
-    case TargetOpcode::INLINEASM:
-      // We allow inline assembler nodes with empty bodies - they can
-      // implicitly define registers, which is ok for JIT.
-      if (MI.getOperand(0).getSymbolName()[0]) {
-        DebugLoc DL = MI.getDebugLoc();
-        DL.print(MI.getParent()->getParent()->getFunction()->getContext(),
-                 llvm::errs());
-        report_fatal_error("JIT does not support inline asm!");
-      }
-      break;
-    case TargetOpcode::DBG_VALUE:
-    case TargetOpcode::CFI_INSTRUCTION:
-      break;
-    case TargetOpcode::GC_LABEL:
-    case TargetOpcode::EH_LABEL:
-      MCE.emitLabel(MI.getOperand(0).getMCSymbol());
-      break;
-
-    case TargetOpcode::IMPLICIT_DEF:
-    case TargetOpcode::KILL:
-      break;
-
-    case X86::SEH_PushReg:
-    case X86::SEH_SaveReg:
-    case X86::SEH_SaveXMM:
-    case X86::SEH_StackAlloc:
-    case X86::SEH_SetFrame:
-    case X86::SEH_PushFrame:
-    case X86::SEH_EndPrologue:
-      break;
-
-    case X86::MOVPC32r: {
-      // This emits the "call" portion of this pseudo instruction.
-      MCE.emitByte(BaseOpcode);
-      emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags));
-      // Remember PIC base.
-      PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset();
-      X86JITInfo *JTI = TM.getJITInfo();
-      JTI->setPICBase(MCE.getCurrentPCValue());
-      break;
-    }
-    }
-    CurOp = NumOps;
-    break;
-  case X86II::RawFrm: {
-    MCE.emitByte(BaseOpcode);
-
-    if (CurOp == NumOps)
-      break;
-
-    const MachineOperand &MO = MI.getOperand(CurOp++);
-
-    DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n");
-    DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n");
-    DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n");
-    DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n");
-    DEBUG(dbgs() << "isImm " << MO.isImm() << "\n");
-
-    if (MO.isMBB()) {
-      emitPCRelativeBlockAddress(MO.getMBB());
-      break;
-    }
-
-    if (MO.isGlobal()) {
-      emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
-                        MO.getOffset(), 0);
-      break;
-    }
-
-    if (MO.isSymbol()) {
-      emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
-      break;
-    }
-
-    // FIXME: Only used by hackish MCCodeEmitter, remove when dead.
-    if (MO.isJTI()) {
-      emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word);
-      break;
-    }
-
-    assert(MO.isImm() && "Unknown RawFrm operand!");
-    if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
-      // Fix up immediate operand for pc relative calls.
-      intptr_t Imm = (intptr_t)MO.getImm();
-      Imm = Imm - MCE.getCurrentPCValue() - 4;
-      emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags));
-    } else
-      emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags));
-    break;
-  }
-
-  case X86II::AddRegFrm: {
-    MCE.emitByte(BaseOpcode +
-                 getX86RegNum(MI.getOperand(CurOp++).getReg()));
-
-    if (CurOp == NumOps)
-      break;
-
-    const MachineOperand &MO1 = MI.getOperand(CurOp++);
-    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
-    if (MO1.isImm()) {
-      emitConstant(MO1.getImm(), Size);
-      break;
-    }
-
-    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
-      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
-    if (Opcode == X86::MOV32ri64)
-      rt = X86::reloc_absolute_word;  // FIXME: add X86II flag?
-    // This should not occur on Darwin for relocatable objects.
-    if (Opcode == X86::MOV64ri)
-      rt = X86::reloc_absolute_dword;  // FIXME: add X86II flag?
-    if (MO1.isGlobal()) {
-      bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
-      emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
-                        Indirect);
-    } else if (MO1.isSymbol())
-      emitExternalSymbolAddress(MO1.getSymbolName(), rt);
-    else if (MO1.isCPI())
-      emitConstPoolAddress(MO1.getIndex(), rt);
-    else if (MO1.isJTI())
-      emitJumpTableAddress(MO1.getIndex(), rt);
-    break;
-  }
-
-  case X86II::MRMDestReg: {
-    MCE.emitByte(BaseOpcode);
-
-    unsigned SrcRegNum = CurOp+1;
-    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
-      SrcRegNum++;
-
-    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
-                     getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
-    CurOp = SrcRegNum + 1;
-    break;
-  }
-  case X86II::MRMDestMem: {
-    MCE.emitByte(BaseOpcode);
-
-    unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
-    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
-      SrcRegNum++;
-    emitMemModRMByte(MI, CurOp,
-                     getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
-    CurOp = SrcRegNum + 1;
-    break;
-  }
-
-  case X86II::MRMSrcReg: {
-    MCE.emitByte(BaseOpcode);
-
-    unsigned SrcRegNum = CurOp+1;
-    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
-      ++SrcRegNum;
-
-    if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
-      ++SrcRegNum;
-
-    emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(),
-                     getX86RegNum(MI.getOperand(CurOp).getReg()));
-    // 2 operands skipped with HasMemOp4, compensate accordingly
-    CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
-    if (HasVEX_4VOp3)
-      ++CurOp;
-    break;
-  }
-  case X86II::MRMSrcMem: {
-    int AddrOperands = X86::AddrNumOperands;
-    unsigned FirstMemOp = CurOp+1;
-    if (HasVEX_4V) {
-      ++AddrOperands;
-      ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
-    }
-    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
-      ++FirstMemOp;
-
-    MCE.emitByte(BaseOpcode);
-
-    intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
-      X86II::getSizeOfImm(Desc->TSFlags) : 0;
-    emitMemModRMByte(MI, FirstMemOp,
-                     getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
-    CurOp += AddrOperands + 1;
-    if (HasVEX_4VOp3)
-      ++CurOp;
-    break;
-  }
-
-  case X86II::MRMXr:
-  case X86II::MRM0r: case X86II::MRM1r:
-  case X86II::MRM2r: case X86II::MRM3r:
-  case X86II::MRM4r: case X86II::MRM5r:
-  case X86II::MRM6r: case X86II::MRM7r: {
-    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
-      ++CurOp;
-    MCE.emitByte(BaseOpcode);
-    uint64_t Form = (Desc->TSFlags & X86II::FormMask);
-    emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
-                     (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r);
-
-    if (CurOp == NumOps)
-      break;
-
-    const MachineOperand &MO1 = MI.getOperand(CurOp++);
-    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
-    if (MO1.isImm()) {
-      emitConstant(MO1.getImm(), Size);
-      break;
-    }
-
-    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
-      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
-    if (Opcode == X86::MOV64ri32)
-      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
-    if (MO1.isGlobal()) {
-      bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
-      emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
-                        Indirect);
-    } else if (MO1.isSymbol())
-      emitExternalSymbolAddress(MO1.getSymbolName(), rt);
-    else if (MO1.isCPI())
-      emitConstPoolAddress(MO1.getIndex(), rt);
-    else if (MO1.isJTI())
-      emitJumpTableAddress(MO1.getIndex(), rt);
-    break;
-  }
-
-  case X86II::MRMXm:
-  case X86II::MRM0m: case X86II::MRM1m:
-  case X86II::MRM2m: case X86II::MRM3m:
-  case X86II::MRM4m: case X86II::MRM5m:
-  case X86II::MRM6m: case X86II::MRM7m: {
-    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
-      ++CurOp;
-    intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ?
-      (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ?
-          X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0;
-
-    MCE.emitByte(BaseOpcode);
-    uint64_t Form = (Desc->TSFlags & X86II::FormMask);
-    emitMemModRMByte(MI, CurOp, (Form==X86II::MRMXm) ? 0 : Form - X86II::MRM0m,
-                     PCAdj);
-    CurOp += X86::AddrNumOperands;
-
-    if (CurOp == NumOps)
-      break;
-
-    const MachineOperand &MO = MI.getOperand(CurOp++);
-    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
-    if (MO.isImm()) {
-      emitConstant(MO.getImm(), Size);
-      break;
-    }
-
-    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
-      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
-    if (Opcode == X86::MOV64mi32)
-      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
-    if (MO.isGlobal()) {
-      bool Indirect = gvNeedsNonLazyPtr(MO, TM);
-      emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
-                        Indirect);
-    } else if (MO.isSymbol())
-      emitExternalSymbolAddress(MO.getSymbolName(), rt);
-    else if (MO.isCPI())
-      emitConstPoolAddress(MO.getIndex(), rt);
-    else if (MO.isJTI())
-      emitJumpTableAddress(MO.getIndex(), rt);
-    break;
-  }
-
-  case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
-  case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8:
-  case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
-  case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
-  case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
-  case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
-  case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
-  case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1:
-  case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4:
-  case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9:
-  case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
-  case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0:
-  case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3:
-  case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6:
-  case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9:
-  case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC:
-  case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF:
-    MCE.emitByte(BaseOpcode);
-
-    unsigned char MRM;
-    switch (TSFlags & X86II::FormMask) {
-    default: llvm_unreachable("Invalid Form");
-    case X86II::MRM_C0: MRM = 0xC0; break;
-    case X86II::MRM_C1: MRM = 0xC1; break;
-    case X86II::MRM_C2: MRM = 0xC2; break;
-    case X86II::MRM_C3: MRM = 0xC3; break;
-    case X86II::MRM_C4: MRM = 0xC4; break;
-    case X86II::MRM_C8: MRM = 0xC8; break;
-    case X86II::MRM_C9: MRM = 0xC9; break;
-    case X86II::MRM_CA: MRM = 0xCA; break;
-    case X86II::MRM_CB: MRM = 0xCB; break;
-    case X86II::MRM_D0: MRM = 0xD0; break;
-    case X86II::MRM_D1: MRM = 0xD1; break;
-    case X86II::MRM_D4: MRM = 0xD4; break;
-    case X86II::MRM_D5: MRM = 0xD5; break;
-    case X86II::MRM_D6: MRM = 0xD6; break;
-    case X86II::MRM_D8: MRM = 0xD8; break;
-    case X86II::MRM_D9: MRM = 0xD9; break;
-    case X86II::MRM_DA: MRM = 0xDA; break;
-    case X86II::MRM_DB: MRM = 0xDB; break;
-    case X86II::MRM_DC: MRM = 0xDC; break;
-    case X86II::MRM_DD: MRM = 0xDD; break;
-    case X86II::MRM_DE: MRM = 0xDE; break;
-    case X86II::MRM_DF: MRM = 0xDF; break;
-    case X86II::MRM_E0: MRM = 0xE0; break;
-    case X86II::MRM_E1: MRM = 0xE1; break;
-    case X86II::MRM_E2: MRM = 0xE2; break;
-    case X86II::MRM_E3: MRM = 0xE3; break;
-    case X86II::MRM_E4: MRM = 0xE4; break;
-    case X86II::MRM_E5: MRM = 0xE5; break;
-    case X86II::MRM_E8: MRM = 0xE8; break;
-    case X86II::MRM_E9: MRM = 0xE9; break;
-    case X86II::MRM_EA: MRM = 0xEA; break;
-    case X86II::MRM_EB: MRM = 0xEB; break;
-    case X86II::MRM_EC: MRM = 0xEC; break;
-    case X86II::MRM_ED: MRM = 0xED; break;
-    case X86II::MRM_EE: MRM = 0xEE; break;
-    case X86II::MRM_F0: MRM = 0xF0; break;
-    case X86II::MRM_F1: MRM = 0xF1; break;
-    case X86II::MRM_F2: MRM = 0xF2; break;
-    case X86II::MRM_F3: MRM = 0xF3; break;
-    case X86II::MRM_F4: MRM = 0xF4; break;
-    case X86II::MRM_F5: MRM = 0xF5; break;
-    case X86II::MRM_F6: MRM = 0xF6; break;
-    case X86II::MRM_F7: MRM = 0xF7; break;
-    case X86II::MRM_F8: MRM = 0xF8; break;
-    case X86II::MRM_F9: MRM = 0xF9; break;
-    case X86II::MRM_FA: MRM = 0xFA; break;
-    case X86II::MRM_FB: MRM = 0xFB; break;
-    case X86II::MRM_FC: MRM = 0xFC; break;
-    case X86II::MRM_FD: MRM = 0xFD; break;
-    case X86II::MRM_FE: MRM = 0xFE; break;
-    case X86II::MRM_FF: MRM = 0xFF; break;
-    }
-    MCE.emitByte(MRM);
-    break;
-  }
-
-  while (CurOp != NumOps && NumOps - CurOp <= 2) {
-    // The last source register of a 4 operand instruction in AVX is encoded
-    // in bits[7:4] of a immediate byte.
-    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
-      const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
-                                                         : CurOp);
-      ++CurOp;
-      unsigned RegNum = getX86RegNum(MO.getReg()) << 4;
-      if (X86II::isX86_64ExtendedReg(MO.getReg()))
-        RegNum |= 1 << 7;
-      // If there is an additional 5th operand it must be an immediate, which
-      // is encoded in bits[3:0]
-      if (CurOp != NumOps) {
-        const MachineOperand &MIMM = MI.getOperand(CurOp++);
-        if (MIMM.isImm()) {
-          unsigned Val = MIMM.getImm();
-          assert(Val < 16 && "Immediate operand value out of range");
-          RegNum |= Val;
-        }
-      }
-      emitConstant(RegNum, 1);
-    } else {
-      emitConstant(MI.getOperand(CurOp++).getImm(),
-                   X86II::getSizeOfImm(Desc->TSFlags));
-    }
-  }
-
-  if (!MI.isVariadic() && CurOp != NumOps) {
-#ifndef NDEBUG
-    dbgs() << "Cannot encode all operands of: " << MI << "\n";
-#endif
-    llvm_unreachable(nullptr);
-  }
-
-  MCE.processDebugLoc(MI.getDebugLoc(), false);
-}

diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index ce554ba..95cb718 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp

@@ -64,7 +64,7 @@
     X86ScalarSSEf32 = Subtarget->hasSSE1();
   }
 
-  bool TargetSelectInstruction(const Instruction *I) override;
+  bool fastSelectInstruction(const Instruction *I) override;
 
   /// \brief The specified machine instr operand is a vreg, and that
   /// vreg is being provided by the specified load instruction.  If possible,
@@ -73,7 +73,9 @@
   bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                            const LoadInst *LI) override;
 
-  bool FastLowerArguments() override;
+  bool fastLowerArguments() override;
+  bool fastLowerCall(CallLoweringInfo &CLI) override;
+  bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
 
 #include "X86GenFastISel.inc"
 
@@ -124,13 +126,8 @@
   bool X86SelectFPExt(const Instruction *I);
   bool X86SelectFPTrunc(const Instruction *I);
 
-  bool X86VisitIntrinsicCall(const IntrinsicInst &I);
-  bool X86SelectCall(const Instruction *I);
-
-  bool DoSelectCall(const Instruction *I, const char *MemIntName);
-
   const X86InstrInfo *getInstrInfo() const {
-    return getTargetMachine()->getInstrInfo();
+    return getTargetMachine()->getSubtargetImpl()->getInstrInfo();
   }
   const X86TargetMachine *getTargetMachine() const {
     return static_cast<const X86TargetMachine *>(&TM);
@@ -138,11 +135,14 @@
 
   bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
 
-  unsigned TargetMaterializeConstant(const Constant *C) override;
+  unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
+  unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned X86MaterializeGV(const GlobalValue *GV,MVT VT);
+  unsigned fastMaterializeConstant(const Constant *C) override;
 
-  unsigned TargetMaterializeAlloca(const AllocaInst *C) override;
+  unsigned fastMaterializeAlloca(const AllocaInst *C) override;
 
-  unsigned TargetMaterializeFloatZero(const ConstantFP *CF) override;
+  unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
 
   /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
   /// computed in an SSE register, not on the X87 floating point stack.
@@ -164,46 +164,6 @@
 
 } // end anonymous namespace.
 
-static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) {
-  // If both operands are the same, then try to optimize or fold the cmp.
-  CmpInst::Predicate Predicate = CI->getPredicate();
-  if (CI->getOperand(0) != CI->getOperand(1))
-    return Predicate;
-
-  switch (Predicate) {
-  default: llvm_unreachable("Invalid predicate!");
-  case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break;
-  case CmpInst::FCMP_OEQ:   Predicate = CmpInst::FCMP_ORD;   break;
-  case CmpInst::FCMP_OGT:   Predicate = CmpInst::FCMP_FALSE; break;
-  case CmpInst::FCMP_OGE:   Predicate = CmpInst::FCMP_ORD;   break;
-  case CmpInst::FCMP_OLT:   Predicate = CmpInst::FCMP_FALSE; break;
-  case CmpInst::FCMP_OLE:   Predicate = CmpInst::FCMP_ORD;   break;
-  case CmpInst::FCMP_ONE:   Predicate = CmpInst::FCMP_FALSE; break;
-  case CmpInst::FCMP_ORD:   Predicate = CmpInst::FCMP_ORD;   break;
-  case CmpInst::FCMP_UNO:   Predicate = CmpInst::FCMP_UNO;   break;
-  case CmpInst::FCMP_UEQ:   Predicate = CmpInst::FCMP_TRUE;  break;
-  case CmpInst::FCMP_UGT:   Predicate = CmpInst::FCMP_UNO;   break;
-  case CmpInst::FCMP_UGE:   Predicate = CmpInst::FCMP_TRUE;  break;
-  case CmpInst::FCMP_ULT:   Predicate = CmpInst::FCMP_UNO;   break;
-  case CmpInst::FCMP_ULE:   Predicate = CmpInst::FCMP_TRUE;  break;
-  case CmpInst::FCMP_UNE:   Predicate = CmpInst::FCMP_UNO;   break;
-  case CmpInst::FCMP_TRUE:  Predicate = CmpInst::FCMP_TRUE;  break;
-
-  case CmpInst::ICMP_EQ:    Predicate = CmpInst::FCMP_TRUE;  break;
-  case CmpInst::ICMP_NE:    Predicate = CmpInst::FCMP_FALSE; break;
-  case CmpInst::ICMP_UGT:   Predicate = CmpInst::FCMP_FALSE; break;
-  case CmpInst::ICMP_UGE:   Predicate = CmpInst::FCMP_TRUE;  break;
-  case CmpInst::ICMP_ULT:   Predicate = CmpInst::FCMP_FALSE; break;
-  case CmpInst::ICMP_ULE:   Predicate = CmpInst::FCMP_TRUE;  break;
-  case CmpInst::ICMP_SGT:   Predicate = CmpInst::FCMP_FALSE; break;
-  case CmpInst::ICMP_SGE:   Predicate = CmpInst::FCMP_TRUE;  break;
-  case CmpInst::ICMP_SLT:   Predicate = CmpInst::FCMP_FALSE; break;
-  case CmpInst::ICMP_SLE:   Predicate = CmpInst::FCMP_TRUE;  break;
-  }
-
-  return Predicate;
-}
-
 static std::pair<X86::CondCode, bool>
 getX86ConditionCode(CmpInst::Predicate Predicate) {
   X86::CondCode CC = X86::COND_INVALID;
@@ -532,7 +492,7 @@
 bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
                                     unsigned Src, EVT SrcVT,
                                     unsigned &ResultReg) {
-  unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+  unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
                            Src, /*TODO: Kill=*/false);
   if (RR == 0)
     return false;
@@ -996,8 +956,7 @@
 
     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
-    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
-                   I->getContext());
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
     CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 
     const Value *RV = Ret->getOperand(0);
@@ -1020,7 +979,7 @@
 
     // The calling-convention tables for x87 returns don't tell
     // the whole story.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+    if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
       return false;
 
     unsigned SrcReg = Reg + VA.getValNo();
@@ -1039,12 +998,12 @@
       if (SrcVT == MVT::i1) {
         if (Outs[0].Flags.isSExt())
           return false;
-        SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+        SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
         SrcVT = MVT::i8;
       }
       unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
                                              ISD::SIGN_EXTEND;
-      SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
+      SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
                           SrcReg, /*TODO: Kill=*/false);
     }
 
@@ -1107,7 +1066,7 @@
   if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg))
     return false;
 
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1197,7 +1156,7 @@
     ResultReg = createResultReg(&X86::GR32RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
             ResultReg);
-    ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
                                            X86::sub_8bit);
     if (!ResultReg)
       return false;
@@ -1212,7 +1171,7 @@
   }
 
   if (ResultReg) {
-    UpdateValueMap(I, ResultReg);
+    updateValueMap(I, ResultReg);
     return true;
   }
 
@@ -1253,7 +1212,7 @@
             FlagReg2);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
             ResultReg).addReg(FlagReg1).addReg(FlagReg2);
-    UpdateValueMap(I, ResultReg);
+    updateValueMap(I, ResultReg);
     return true;
   }
 
@@ -1271,7 +1230,7 @@
     return false;
 
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1288,7 +1247,7 @@
   MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
   if (SrcVT.SimpleTy == MVT::i1) {
     // Set the high bits to zero.
-    ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+    ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
     SrcVT = MVT::i8;
 
     if (ResultReg == 0)
@@ -1315,13 +1274,13 @@
             ResultReg)
       .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
   } else if (DstVT != MVT::i8) {
-    ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
+    ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
                            ResultReg, /*Kill=*/true);
     if (ResultReg == 0)
       return false;
   }
 
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1345,8 +1304,8 @@
       CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
       switch (Predicate) {
       default: break;
-      case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true;
-      case CmpInst::FCMP_TRUE:  FastEmitBranch(TrueMBB, DbgLoc); return true;
+      case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
+      case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, DbgLoc); return true;
       }
 
       const Value *CmpLHS = CI->getOperand(0);
@@ -1416,7 +1375,7 @@
 
       // Emits an unconditional branch to the FalseBB, obtains the branch
       // weight, and adds it to the successor list.
-      FastEmitBranch(FalseMBB, DbgLoc);
+      fastEmitBranch(FalseMBB, DbgLoc);
 
       return true;
     }
@@ -1448,7 +1407,7 @@
 
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
           .addMBB(TrueMBB);
-        FastEmitBranch(FalseMBB, DbgLoc);
+        fastEmitBranch(FalseMBB, DbgLoc);
         uint32_t BranchWeight = 0;
         if (FuncInfo.BPI)
           BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
@@ -1468,7 +1427,7 @@
 
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
       .addMBB(TrueMBB);
-    FastEmitBranch(FalseMBB, DbgLoc);
+    fastEmitBranch(FalseMBB, DbgLoc);
     uint32_t BranchWeight = 0;
     if (FuncInfo.BPI)
       BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
@@ -1487,7 +1446,7 @@
     .addReg(OpReg).addImm(1);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4))
     .addMBB(TrueMBB);
-  FastEmitBranch(FalseMBB, DbgLoc);
+  fastEmitBranch(FalseMBB, DbgLoc);
   uint32_t BranchWeight = 0;
   if (FuncInfo.BPI)
     BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
@@ -1561,7 +1520,7 @@
   unsigned ResultReg = createResultReg(RC);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
     .addReg(Op0Reg);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1715,7 +1674,7 @@
             ResultSuperReg).addReg(SourceSuperReg).addImm(8);
 
     // Now reference the 8-bit subreg of the result.
-    ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
+    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
                                            /*Kill=*/true, X86::sub_8bit);
   }
   // Copy the result out of the physreg if we haven't already.
@@ -1724,7 +1683,7 @@
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
         .addReg(OpEntry.DivRemResultReg);
   }
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
 
   return true;
 }
@@ -1840,9 +1799,9 @@
     return false;
 
   unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
-  unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
+  unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
                                        LHSReg, LHSIsKill);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1920,15 +1879,15 @@
     return false;
 
   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
-  unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+  unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
                                      CmpRHSReg, CmpRHSIsKill, CC);
-  unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
+  unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
                                     LHSReg, LHSIsKill);
-  unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
+  unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
                                      RHSReg, RHSIsKill);
-  unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
+  unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
                                        AndReg, /*IsKill=*/true);
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -1991,8 +1950,8 @@
   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
 
   unsigned ResultReg =
-    FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
-  UpdateValueMap(I, ResultReg);
+    fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -2021,7 +1980,7 @@
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), ResultReg)
         .addReg(OpReg, getKillRegState(OpIsKill));
-      UpdateValueMap(I, ResultReg);
+      updateValueMap(I, ResultReg);
       return true;
     }
   }
@@ -2054,7 +2013,7 @@
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(X86::CVTSS2SDrr), ResultReg)
         .addReg(OpReg);
-      UpdateValueMap(I, ResultReg);
+      updateValueMap(I, ResultReg);
       return true;
     }
   }
@@ -2073,7 +2032,7 @@
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(X86::CVTSD2SSrr), ResultReg)
           .addReg(OpReg);
-        UpdateValueMap(I, ResultReg);
+        updateValueMap(I, ResultReg);
         return true;
       }
     }
@@ -2099,7 +2058,7 @@
 
   if (SrcVT == MVT::i8) {
     // Truncate from i8 to i1; no code needed.
-    UpdateValueMap(I, InputReg);
+    updateValueMap(I, InputReg);
     return true;
   }
 
@@ -2116,13 +2075,13 @@
   }
 
   // Issue an extract_subreg.
-  unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
+  unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
                                                   InputReg, /*Kill=*/true,
                                                   X86::sub_8bit);
   if (!ResultReg)
     return false;
 
-  UpdateValueMap(I, ResultReg);
+  updateValueMap(I, ResultReg);
   return true;
 }
 
@@ -2166,24 +2125,12 @@
   return true;
 }
 
-static bool isCommutativeIntrinsic(IntrinsicInst const &I) {
-  switch (I.getIntrinsicID()) {
-  case Intrinsic::sadd_with_overflow:
-  case Intrinsic::uadd_with_overflow:
-  case Intrinsic::smul_with_overflow:
-  case Intrinsic::umul_with_overflow:
-    return true;
-  default:
-    return false;
-  }
-}
-
-bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
+bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
   // FIXME: Handle more intrinsics.
-  switch (I.getIntrinsicID()) {
+  switch (II->getIntrinsicID()) {
   default: return false;
   case Intrinsic::frameaddress: {
-    Type *RetTy = I.getCalledFunction()->getReturnType();
+    Type *RetTy = II->getCalledFunction()->getReturnType();
 
     MVT VT;
     if (!isTypeLegal(RetTy, VT))
@@ -2203,8 +2150,8 @@
     MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
     MFI->setFrameAddressIsTaken(true);
 
-    const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+        TM.getSubtargetImpl()->getRegisterInfo());
     unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF));
     assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
             (FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -2223,7 +2170,7 @@
     // movq (%rax), %rax
     // ...
     unsigned DestReg;
-    unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue();
+    unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
     while (Depth--) {
       DestReg = createResultReg(RC);
       addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -2231,23 +2178,23 @@
       SrcReg = DestReg;
     }
 
-    UpdateValueMap(&I, SrcReg);
+    updateValueMap(II, SrcReg);
     return true;
   }
   case Intrinsic::memcpy: {
-    const MemCpyInst &MCI = cast<MemCpyInst>(I);
+    const MemCpyInst *MCI = cast<MemCpyInst>(II);
     // Don't handle volatile or variable length memcpys.
-    if (MCI.isVolatile())
+    if (MCI->isVolatile())
       return false;
 
-    if (isa<ConstantInt>(MCI.getLength())) {
+    if (isa<ConstantInt>(MCI->getLength())) {
       // Small memcpy's are common enough that we want to do them
       // without a call if possible.
-      uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue();
+      uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
       if (IsMemcpySmall(Len)) {
         X86AddressMode DestAM, SrcAM;
-        if (!X86SelectAddress(MCI.getRawDest(), DestAM) ||
-            !X86SelectAddress(MCI.getRawSource(), SrcAM))
+        if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
+            !X86SelectAddress(MCI->getRawSource(), SrcAM))
           return false;
         TryEmitSmallMemcpy(DestAM, SrcAM, Len);
         return true;
@@ -2255,35 +2202,35 @@
     }
 
     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
-    if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth))
+    if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
       return false;
 
-    if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255)
+    if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
       return false;
 
-    return DoSelectCall(&I, "memcpy");
+    return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
   }
   case Intrinsic::memset: {
-    const MemSetInst &MSI = cast<MemSetInst>(I);
+    const MemSetInst *MSI = cast<MemSetInst>(II);
 
-    if (MSI.isVolatile())
+    if (MSI->isVolatile())
       return false;
 
     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
-    if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth))
+    if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
       return false;
 
-    if (MSI.getDestAddressSpace() > 255)
+    if (MSI->getDestAddressSpace() > 255)
       return false;
 
-    return DoSelectCall(&I, "memset");
+    return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
   }
   case Intrinsic::stackprotector: {
     // Emit code to store the stack guard onto the stack.
     EVT PtrTy = TLI.getPointerTy();
 
-    const Value *Op1 = I.getArgOperand(0); // The guard's value.
-    const AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
+    const Value *Op1 = II->getArgOperand(0); // The guard's value.
+    const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
 
     MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
 
@@ -2294,7 +2241,7 @@
     return true;
   }
   case Intrinsic::dbg_declare: {
-    const DbgDeclareInst *DI = cast<DbgDeclareInst>(&I);
+    const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
     X86AddressMode AM;
     assert(DI->getAddress() && "Null address should be checked earlier!");
     if (!X86SelectAddress(DI->getAddress(), AM))
@@ -2302,8 +2249,10 @@
     const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
     // FIXME may need to add RegState::Debug to any registers produced,
     // although ESP/EBP should be the only ones at the moment.
-    addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM).
-      addImm(0).addMetadata(DI->getVariable());
+    addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
+        .addImm(0)
+        .addMetadata(DI->getVariable())
+        .addMetadata(DI->getExpression());
     return true;
   }
   case Intrinsic::trap: {
@@ -2314,13 +2263,13 @@
     if (!Subtarget->hasSSE1())
       return false;
 
-    Type *RetTy = I.getCalledFunction()->getReturnType();
+    Type *RetTy = II->getCalledFunction()->getReturnType();
 
     MVT VT;
     if (!isTypeLegal(RetTy, VT))
       return false;
 
-    // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT
+    // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
     // is not generated by FastISel yet.
     // FIXME: Update this code once tablegen can handle it.
     static const unsigned SqrtOpc[2][2] = {
@@ -2336,7 +2285,7 @@
     case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
     }
 
-    const Value *SrcVal = I.getArgOperand(0);
+    const Value *SrcVal = II->getArgOperand(0);
     unsigned SrcReg = getRegForValue(SrcVal);
 
     if (SrcReg == 0)
@@ -2359,7 +2308,7 @@
 
     MIB.addReg(SrcReg);
 
-    UpdateValueMap(&I, ResultReg);
+    updateValueMap(II, ResultReg);
     return true;
   }
   case Intrinsic::sadd_with_overflow:
@@ -2370,7 +2319,7 @@
   case Intrinsic::umul_with_overflow: {
     // This implements the basic lowering of the xalu with overflow intrinsics
     // into add/sub/mul followed by either seto or setb.
-    const Function *Callee = I.getCalledFunction();
+    const Function *Callee = II->getCalledFunction();
     auto *Ty = cast<StructType>(Callee->getReturnType());
     Type *RetTy = Ty->getTypeAtIndex(0U);
     Type *CondTy = Ty->getTypeAtIndex(1);
@@ -2382,23 +2331,31 @@
     if (VT < MVT::i8 || VT > MVT::i64)
       return false;
 
-    const Value *LHS = I.getArgOperand(0);
-    const Value *RHS = I.getArgOperand(1);
+    const Value *LHS = II->getArgOperand(0);
+    const Value *RHS = II->getArgOperand(1);
 
     // Canonicalize immediate to the RHS.
     if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
-        isCommutativeIntrinsic(I))
+        isCommutativeIntrinsic(II))
       std::swap(LHS, RHS);
 
+    bool UseIncDec = false;
+    if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
+      UseIncDec = true;
+
     unsigned BaseOpc, CondOpc;
-    switch (I.getIntrinsicID()) {
+    switch (II->getIntrinsicID()) {
     default: llvm_unreachable("Unexpected intrinsic!");
     case Intrinsic::sadd_with_overflow:
-      BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break;
+      BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
+      CondOpc = X86::SETOr;
+      break;
     case Intrinsic::uadd_with_overflow:
       BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
     case Intrinsic::ssub_with_overflow:
-      BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break;
+      BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
+      CondOpc = X86::SETOr;
+      break;
     case Intrinsic::usub_with_overflow:
       BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
     case Intrinsic::smul_with_overflow:
@@ -2414,9 +2371,24 @@
 
     unsigned ResultReg = 0;
     // Check if we have an immediate version.
-    if (auto const *C = dyn_cast<ConstantInt>(RHS)) {
-      ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
-                              C->getZExtValue());
+    if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
+      static const unsigned Opc[2][2][4] = {
+        { { X86::INC8r, X86::INC16r,    X86::INC32r,    X86::INC64r },
+          { X86::DEC8r, X86::DEC16r,    X86::DEC32r,    X86::DEC64r }  },
+        { { X86::INC8r, X86::INC64_16r, X86::INC64_32r, X86::INC64r },
+          { X86::DEC8r, X86::DEC64_16r, X86::DEC64_32r, X86::DEC64r }  }
+      };
+
+      if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
+        ResultReg = createResultReg(TLI.getRegClassFor(VT));
+        bool Is64Bit = Subtarget->is64Bit();
+        bool IsDec = BaseOpc == X86ISD::DEC;
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(Opc[Is64Bit][IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
+          .addReg(LHSReg, getKillRegState(LHSIsKill));
+      } else
+        ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
+                                CI->getZExtValue());
     }
 
     unsigned RHSReg;
@@ -2426,7 +2398,7 @@
       if (RHSReg == 0)
         return false;
       RHSIsKill = hasTrivialKill(RHS);
-      ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
+      ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
                               RHSIsKill);
     }
 
@@ -2441,7 +2413,7 @@
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
         .addReg(LHSReg, getKillRegState(LHSIsKill));
-      ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
+      ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
                                  TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
     } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
       static const unsigned MULOpc[] =
@@ -2452,10 +2424,10 @@
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                TII.get(TargetOpcode::COPY), X86::AL)
           .addReg(LHSReg, getKillRegState(LHSIsKill));
-        ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
+        ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
                                    RHSIsKill);
       } else
-        ResultReg = FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
+        ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
                                     TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
                                     RHSReg, RHSIsKill);
     }
@@ -2468,7 +2440,7 @@
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
             ResultReg2);
 
-    UpdateValueMap(&I, ResultReg, 2);
+    updateValueMap(II, ResultReg, 2);
     return true;
   }
   case Intrinsic::x86_sse_cvttss2si:
@@ -2476,7 +2448,7 @@
   case Intrinsic::x86_sse2_cvttsd2si:
   case Intrinsic::x86_sse2_cvttsd2si64: {
     bool IsInputDouble;
-    switch (I.getIntrinsicID()) {
+    switch (II->getIntrinsicID()) {
     default: llvm_unreachable("Unexpected intrinsic.");
     case Intrinsic::x86_sse_cvttss2si:
     case Intrinsic::x86_sse_cvttss2si64:
@@ -2492,7 +2464,7 @@
       break;
     }
 
-    Type *RetTy = I.getCalledFunction()->getReturnType();
+    Type *RetTy = II->getCalledFunction()->getReturnType();
     MVT VT;
     if (!isTypeLegal(RetTy, VT))
       return false;
@@ -2512,7 +2484,7 @@
     }
 
     // Check if we can fold insertelement instructions into the convert.
-    const Value *Op = I.getArgOperand(0);
+    const Value *Op = II->getArgOperand(0);
     while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
       const Value *Index = IE->getOperand(2);
       if (!isa<ConstantInt>(Index))
@@ -2534,13 +2506,13 @@
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
       .addReg(Reg);
 
-    UpdateValueMap(&I, ResultReg);
+    updateValueMap(II, ResultReg);
     return true;
   }
   }
 }
 
-bool X86FastISel::FastLowerArguments() {
+bool X86FastISel::fastLowerArguments() {
   if (!FuncInfo.CanLowerReturn)
     return false;
 
@@ -2630,58 +2602,57 @@
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg)
       .addReg(DstReg, getKillRegState(true));
-    UpdateValueMap(&Arg, ResultReg);
+    updateValueMap(&Arg, ResultReg);
   }
   return true;
 }
 
-bool X86FastISel::X86SelectCall(const Instruction *I) {
-  const CallInst *CI = cast<CallInst>(I);
-  const Value *Callee = CI->getCalledValue();
-
-  // Can't handle inline asm yet.
-  if (isa<InlineAsm>(Callee))
-    return false;
-
-  // Handle intrinsic calls.
-  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI))
-    return X86VisitIntrinsicCall(*II);
-
-  // Allow SelectionDAG isel to handle tail calls.
-  if (cast<CallInst>(I)->isTailCall())
-    return false;
-
-  return DoSelectCall(I, nullptr);
-}
-
-static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget,
-                                           const ImmutableCallSite &CS) {
-  if (Subtarget.is64Bit())
+static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
+                                           CallingConv::ID CC,
+                                           ImmutableCallSite *CS) {
+  if (Subtarget->is64Bit())
     return 0;
-  if (Subtarget.getTargetTriple().isOSMSVCRT())
+  if (Subtarget->getTargetTriple().isOSMSVCRT())
     return 0;
-  CallingConv::ID CC = CS.getCallingConv();
-  if (CC == CallingConv::Fast || CC == CallingConv::GHC)
+  if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+      CC == CallingConv::HiPE)
     return 0;
-  if (!CS.paramHasAttr(1, Attribute::StructRet))
+  if (CS && !CS->paramHasAttr(1, Attribute::StructRet))
     return 0;
-  if (CS.paramHasAttr(1, Attribute::InReg))
+  if (CS && CS->paramHasAttr(1, Attribute::InReg))
     return 0;
   return 4;
 }
 
-// Select either a call, or an llvm.memcpy/memmove/memset intrinsic
-bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
-  const CallInst *CI = cast<CallInst>(I);
-  const Value *Callee = CI->getCalledValue();
+bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+  auto &OutVals       = CLI.OutVals;
+  auto &OutFlags      = CLI.OutFlags;
+  auto &OutRegs       = CLI.OutRegs;
+  auto &Ins           = CLI.Ins;
+  auto &InRegs        = CLI.InRegs;
+  CallingConv::ID CC  = CLI.CallConv;
+  bool &IsTailCall    = CLI.IsTailCall;
+  bool IsVarArg       = CLI.IsVarArg;
+  const Value *Callee = CLI.Callee;
+  const char *SymName = CLI.SymName;
 
-  // Handle only C and fastcc calling conventions for now.
-  ImmutableCallSite CS(CI);
-  CallingConv::ID CC = CS.getCallingConv();
-  bool isWin64 = Subtarget->isCallingConvWin64(CC);
-  if (CC != CallingConv::C && CC != CallingConv::Fast &&
-      CC != CallingConv::X86_FastCall && CC != CallingConv::X86_64_Win64 &&
-      CC != CallingConv::X86_64_SysV)
+  bool Is64Bit        = Subtarget->is64Bit();
+  bool IsWin64        = Subtarget->isCallingConvWin64(CC);
+
+  // Handle only C, fastcc, and webkit_js calling conventions for now.
+  switch (CC) {
+  default: return false;
+  case CallingConv::C:
+  case CallingConv::Fast:
+  case CallingConv::WebKit_JS:
+  case CallingConv::X86_FastCall:
+  case CallingConv::X86_64_Win64:
+  case CallingConv::X86_64_SysV:
+    break;
+  }
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (IsTailCall)
     return false;
 
   // fastcc with -tailcallopt is intended to provide a guaranteed
@@ -2689,150 +2660,77 @@
   if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
     return false;
 
-  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
-  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
-  bool isVarArg = FTy->isVarArg();
-
   // Don't know how to handle Win64 varargs yet.  Nothing special needed for
-  // x86-32.  Special handling for x86-64 is implemented.
-  if (isVarArg && isWin64)
+  // x86-32. Special handling for x86-64 is implemented.
+  if (IsVarArg && IsWin64)
     return false;
 
   // Don't know about inalloca yet.
-  if (CS.hasInAllocaArgument())
+  if (CLI.CS && CLI.CS->hasInAllocaArgument())
     return false;
 
   // Fast-isel doesn't know about callee-pop yet.
-  if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg,
+  if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
                        TM.Options.GuaranteedTailCallOpt))
     return false;
 
-  // Check whether the function can return without sret-demotion.
-  SmallVector<ISD::OutputArg, 4> Outs;
-  GetReturnInfo(I->getType(), CS.getAttributes(), Outs, TLI);
-  bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
-                                           *FuncInfo.MF, FTy->isVarArg(),
-                                           Outs, FTy->getContext());
-  if (!CanLowerReturn)
-    return false;
-
-  // Materialize callee address in a register. FIXME: GV address can be
-  // handled with a CALLpcrel32 instead.
-  X86AddressMode CalleeAM;
-  if (!X86SelectCallAddress(Callee, CalleeAM))
-    return false;
-  unsigned CalleeOp = 0;
-  const GlobalValue *GV = nullptr;
-  if (CalleeAM.GV != nullptr) {
-    GV = CalleeAM.GV;
-  } else if (CalleeAM.Base.Reg != 0) {
-    CalleeOp = CalleeAM.Base.Reg;
-  } else
-    return false;
-
-  // Deal with call operands first.
-  SmallVector<const Value *, 8> ArgVals;
-  SmallVector<unsigned, 8> Args;
-  SmallVector<MVT, 8> ArgVTs;
-  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
-  unsigned arg_size = CS.arg_size();
-  Args.reserve(arg_size);
-  ArgVals.reserve(arg_size);
-  ArgVTs.reserve(arg_size);
-  ArgFlags.reserve(arg_size);
-  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
-       i != e; ++i) {
-    // If we're lowering a mem intrinsic instead of a regular call, skip the
-    // last two arguments, which should not passed to the underlying functions.
-    if (MemIntName && e-i <= 2)
-      break;
-    Value *ArgVal = *i;
-    ISD::ArgFlagsTy Flags;
-    unsigned AttrInd = i - CS.arg_begin() + 1;
-    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
-      Flags.setSExt();
-    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
-      Flags.setZExt();
-
-    if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
-      PointerType *Ty = cast<PointerType>(ArgVal->getType());
-      Type *ElementTy = Ty->getElementType();
-      unsigned FrameSize = DL.getTypeAllocSize(ElementTy);
-      unsigned FrameAlign = CS.getParamAlignment(AttrInd);
-      if (!FrameAlign)
-        FrameAlign = TLI.getByValTypeAlignment(ElementTy);
-      Flags.setByVal();
-      Flags.setByValSize(FrameSize);
-      Flags.setByValAlign(FrameAlign);
-      if (!IsMemcpySmall(FrameSize))
-        return false;
-    }
-
-    if (CS.paramHasAttr(AttrInd, Attribute::InReg))
-      Flags.setInReg();
-    if (CS.paramHasAttr(AttrInd, Attribute::Nest))
-      Flags.setNest();
-
-    // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra
-    // instruction.  This is safe because it is common to all fastisel supported
-    // calling conventions on x86.
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) {
-      if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 ||
-          CI->getBitWidth() == 16) {
+  // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
+  // instruction. This is safe because it is common to all FastISel supported
+  // calling conventions on x86.
+  for (int i = 0, e = OutVals.size(); i != e; ++i) {
+    Value *&Val = OutVals[i];
+    ISD::ArgFlagsTy Flags = OutFlags[i];
+    if (auto *CI = dyn_cast<ConstantInt>(Val)) {
+      if (CI->getBitWidth() < 32) {
         if (Flags.isSExt())
-          ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext()));
+          Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
         else
-          ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext()));
+          Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
       }
     }
 
-    unsigned ArgReg;
-
     // Passing bools around ends up doing a trunc to i1 and passing it.
     // Codegen this as an argument + "and 1".
-    if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) &&
-        cast<TruncInst>(ArgVal)->getParent() == I->getParent() &&
-        ArgVal->hasOneUse()) {
-      ArgVal = cast<TruncInst>(ArgVal)->getOperand(0);
-      ArgReg = getRegForValue(ArgVal);
-      if (ArgReg == 0) return false;
+    if (auto *TI = dyn_cast<TruncInst>(Val)) {
+      if (TI->getType()->isIntegerTy(1) && CLI.CS &&
+          (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
+          TI->hasOneUse()) {
+        Val = cast<TruncInst>(Val)->getOperand(0);
+        unsigned ResultReg = getRegForValue(Val);
 
-      MVT ArgVT;
-      if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false;
+        if (!ResultReg)
+          return false;
 
-      ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg,
-                           ArgVal->hasOneUse(), 1);
-    } else {
-      ArgReg = getRegForValue(ArgVal);
+        MVT ArgVT;
+        if (!isTypeLegal(Val->getType(), ArgVT))
+          return false;
+
+        ResultReg =
+          fastEmit_ri(ArgVT, ArgVT, ISD::AND, ResultReg, Val->hasOneUse(), 1);
+
+        if (!ResultReg)
+          return false;
+        updateValueMap(Val, ResultReg);
+      }
     }
-
-    if (ArgReg == 0) return false;
-
-    Type *ArgTy = ArgVal->getType();
-    MVT ArgVT;
-    if (!isTypeLegal(ArgTy, ArgVT))
-      return false;
-    if (ArgVT == MVT::x86mmx)
-      return false;
-    unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
-    Flags.setOrigAlign(OriginalAlignment);
-
-    Args.push_back(ArgReg);
-    ArgVals.push_back(ArgVal);
-    ArgVTs.push_back(ArgVT);
-    ArgFlags.push_back(Flags);
   }
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs,
-                 I->getParent()->getContext());
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
 
   // Allocate shadow area for Win64
-  if (isWin64)
+  if (IsWin64)
     CCInfo.AllocateStack(32, 8);
 
-  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86);
+  SmallVector<MVT, 16> OutVTs;
+  for (auto *Val : OutVals) {
+    MVT VT;
+    if (!isTypeLegal(Val->getType(), VT))
+      return false;
+    OutVTs.push_back(VT);
+  }
+  CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -2842,13 +2740,20 @@
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
     .addImm(NumBytes);
 
-  // Process argument: walk the register/memloc assignments, inserting
-  // copies / loads.
-  SmallVector<unsigned, 4> RegArgs;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-    unsigned Arg = Args[VA.getValNo()];
-    EVT ArgVT = ArgVTs[VA.getValNo()];
+    CCValAssign const &VA = ArgLocs[i];
+    const Value *ArgVal = OutVals[VA.getValNo()];
+    MVT ArgVT = OutVTs[VA.getValNo()];
+
+    if (ArgVT == MVT::x86mmx)
+      return false;
+
+    unsigned ArgReg = getRegForValue(ArgVal);
+    if (!ArgReg)
+      return false;
 
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
@@ -2856,8 +2761,8 @@
     case CCValAssign::SExt: {
       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
              "Unexpected extend");
-      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
-                                       Arg, ArgVT, Arg);
+      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
       assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
       ArgVT = VA.getLocVT();
       break;
@@ -2865,8 +2770,8 @@
     case CCValAssign::ZExt: {
       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
              "Unexpected extend");
-      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
-                                       Arg, ArgVT, Arg);
+      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
       assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
       ArgVT = VA.getLocVT();
       break;
@@ -2874,66 +2779,75 @@
     case CCValAssign::AExt: {
       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
              "Unexpected extend");
-      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
-                                       Arg, ArgVT, Arg);
+      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
+                                       ArgVT, ArgReg);
       if (!Emitted)
-        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
-                                    Arg, ArgVT, Arg);
+        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
+                                    ArgVT, ArgReg);
       if (!Emitted)
-        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
-                                    Arg, ArgVT, Arg);
+        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
+                                    ArgVT, ArgReg);
 
       assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
       ArgVT = VA.getLocVT();
       break;
     }
     case CCValAssign::BCvt: {
-      unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT(),
-                               ISD::BITCAST, Arg, /*TODO: Kill=*/false);
-      assert(BC != 0 && "Failed to emit a bitcast!");
-      Arg = BC;
+      ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
+                          /*TODO: Kill=*/false);
+      assert(ArgReg && "Failed to emit a bitcast!");
       ArgVT = VA.getLocVT();
       break;
     }
-    case CCValAssign::VExt: 
+    case CCValAssign::VExt:
       // VExt has not been implemented, so this should be impossible to reach
       // for now.  However, fallback to Selection DAG isel once implemented.
       return false;
+    case CCValAssign::AExtUpper:
+    case CCValAssign::SExtUpper:
+    case CCValAssign::ZExtUpper:
+    case CCValAssign::FPExt:
+      llvm_unreachable("Unexpected loc info!");
     case CCValAssign::Indirect:
       // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
       // support this.
       return false;
-    case CCValAssign::FPExt:
-      llvm_unreachable("Unexpected loc info!");
     }
 
     if (VA.isRegLoc()) {
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
-      RegArgs.push_back(VA.getLocReg());
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+      OutRegs.push_back(VA.getLocReg());
     } else {
+      assert(VA.isMemLoc());
+
+      // Don't emit stores for undef values.
+      if (isa<UndefValue>(ArgVal))
+        continue;
+
       unsigned LocMemOffset = VA.getLocMemOffset();
       X86AddressMode AM;
-      const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>(
-          getTargetMachine()->getRegisterInfo());
       AM.Base.Reg = RegInfo->getStackRegister();
       AM.Disp = LocMemOffset;
-      const Value *ArgVal = ArgVals[VA.getValNo()];
-      ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()];
-
+      ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
+      unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+      MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore,
+        ArgVT.getStoreSize(), Alignment);
       if (Flags.isByVal()) {
         X86AddressMode SrcAM;
-        SrcAM.Base.Reg = Arg;
-        bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize());
-        assert(Res && "memcpy length already checked!"); (void)Res;
+        SrcAM.Base.Reg = ArgReg;
+        if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
+          return false;
       } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
         // If this is a really simple value, emit this with the Value* version
         // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
         // as it can cause us to reevaluate the argument.
-        if (!X86FastEmitStore(ArgVT, ArgVal, AM))
+        if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
           return false;
       } else {
-        if (!X86FastEmitStore(ArgVT, Arg, /*ValIsKill=*/false, AM))
+        bool ValIsKill = hasTrivialKill(ArgVal);
+        if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
           return false;
       }
     }
@@ -2947,37 +2861,53 @@
             TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
   }
 
-  if (Subtarget->is64Bit() && isVarArg && !isWin64) {
+  if (Is64Bit && IsVarArg && !IsWin64) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an upper bound on the number of SSE
+    // registers used and is in the range 0 - 8 inclusive.
+
     // Count the number of XMM registers allocated.
     static const MCPhysReg XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
+           && "SSE registers cannot be used when SSE is disabled");
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
             X86::AL).addImm(NumXMMRegs);
   }
 
+  // Materialize callee address in a register. FIXME: GV address can be
+  // handled with a CALLpcrel32 instead.
+  X86AddressMode CalleeAM;
+  if (!X86SelectCallAddress(Callee, CalleeAM))
+    return false;
+
+  unsigned CalleeOp = 0;
+  const GlobalValue *GV = nullptr;
+  if (CalleeAM.GV != nullptr) {
+    GV = CalleeAM.GV;
+  } else if (CalleeAM.Base.Reg != 0) {
+    CalleeOp = CalleeAM.Base.Reg;
+  } else
+    return false;
+
   // Issue the call.
   MachineInstrBuilder MIB;
   if (CalleeOp) {
     // Register-indirect call.
-    unsigned CallOpc;
-    if (Subtarget->is64Bit())
-      CallOpc = X86::CALL64r;
-    else
-      CallOpc = X86::CALL32r;
+    unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
       .addReg(CalleeOp);
-
   } else {
     // Direct call.
     assert(GV && "Not a direct call");
-    unsigned CallOpc;
-    if (Subtarget->is64Bit())
-      CallOpc = X86::CALL64pcrel32;
-    else
-      CallOpc = X86::CALLpcrel32;
+    unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
 
     // See if we need any target-specific flags on the GV operand.
     unsigned char OpFlags = 0;
@@ -3000,92 +2930,72 @@
       OpFlags = X86II::MO_DARWIN_STUB;
     }
 
-
     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
-    if (MemIntName)
-      MIB.addExternalSymbol(MemIntName, OpFlags);
+    if (SymName)
+      MIB.addExternalSymbol(SymName, OpFlags);
     else
       MIB.addGlobalAddress(GV, 0, OpFlags);
   }
 
-  // Add a register mask with the call-preserved registers.
+  // Add a register mask operand representing the call-preserved registers.
   // Proper defs for return values will be added by setPhysRegsDeadExcept().
-  MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
 
   // Add an implicit use GOT pointer in EBX.
   if (Subtarget->isPICStyleGOT())
     MIB.addReg(X86::EBX, RegState::Implicit);
 
-  if (Subtarget->is64Bit() && isVarArg && !isWin64)
+  if (Is64Bit && IsVarArg && !IsWin64)
     MIB.addReg(X86::AL, RegState::Implicit);
 
   // Add implicit physical register uses to the call.
-  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
-    MIB.addReg(RegArgs[i], RegState::Implicit);
+  for (auto Reg : OutRegs)
+    MIB.addReg(Reg, RegState::Implicit);
 
   // Issue CALLSEQ_END
+  unsigned NumBytesForCalleeToPop =
+    computeBytesPoppedByCallee(Subtarget, CC, CLI.CS);
   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
-  const unsigned NumBytesCallee = computeBytesPoppedByCallee(*Subtarget, CS);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
-    .addImm(NumBytes).addImm(NumBytesCallee);
-
-  // Build info for return calling conv lowering code.
-  // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo.
-  SmallVector<ISD::InputArg, 32> Ins;
-  SmallVector<EVT, 4> RetTys;
-  ComputeValueVTs(TLI, I->getType(), RetTys);
-  for (unsigned i = 0, e = RetTys.size(); i != e; ++i) {
-    EVT VT = RetTys[i];
-    MVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT);
-    unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT);
-    for (unsigned j = 0; j != NumRegs; ++j) {
-      ISD::InputArg MyFlags;
-      MyFlags.VT = RegisterVT;
-      MyFlags.Used = !CS.getInstruction()->use_empty();
-      if (CS.paramHasAttr(0, Attribute::SExt))
-        MyFlags.Flags.setSExt();
-      if (CS.paramHasAttr(0, Attribute::ZExt))
-        MyFlags.Flags.setZExt();
-      if (CS.paramHasAttr(0, Attribute::InReg))
-        MyFlags.Flags.setInReg();
-      Ins.push_back(MyFlags);
-    }
-  }
+    .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
 
   // Now handle call return values.
-  SmallVector<unsigned, 4> UsedRegs;
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs,
-                    I->getParent()->getContext());
-  unsigned ResultReg = FuncInfo.CreateRegs(I->getType());
+  CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
+                    CLI.RetTy->getContext());
   CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+  // Copy all of the result registers out of their specified physreg.
+  unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
-    EVT CopyVT = RVLocs[i].getValVT();
+    CCValAssign &VA = RVLocs[i];
+    EVT CopyVT = VA.getValVT();
     unsigned CopyReg = ResultReg + i;
 
-    // If this is a call to a function that returns an fp value on the x87 fp
-    // stack, but where we prefer to use the value in xmm registers, copy it
-    // out as F80 and use a truncate to move it from fp stack reg to xmm reg.
-    if ((RVLocs[i].getLocReg() == X86::ST0 ||
-         RVLocs[i].getLocReg() == X86::ST1)) {
-      if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) {
-        CopyVT = MVT::f80;
-        CopyReg = createResultReg(&X86::RFP80RegClass);
-      }
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(X86::FpPOP_RETVAL), CopyReg);
-    } else {
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::COPY),
-              CopyReg).addReg(RVLocs[i].getLocReg());
-      UsedRegs.push_back(RVLocs[i].getLocReg());
+    // If this is x86-64, and we disabled SSE, we can't return FP values
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
     }
 
-    if (CopyVT != RVLocs[i].getValVT()) {
-      // Round the F80 the right size, which also moves to the appropriate xmm
-      // register. This is accomplished by storing the F80 value in memory and
-      // then loading it back. Ewww...
-      EVT ResVT = RVLocs[i].getValVT();
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT())) {
+      CopyVT = MVT::f80;
+      CopyReg = createResultReg(&X86::RFP80RegClass);
+    }
+
+    // Copy out the result.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+    InRegs.push_back(VA.getLocReg());
+
+    // Round the f80 to the right size, which also moves it to the appropriate
+    // xmm register. This is accomplished by storing the f80 value in memory
+    // and then loading it back.
+    if (CopyVT != VA.getValVT()) {
+      EVT ResVT = VA.getValVT();
       unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
       unsigned MemSize = ResVT.getSizeInBits()/8;
       int FI = MFI.CreateStackObject(MemSize, MemSize, false);
@@ -3098,18 +3008,15 @@
     }
   }
 
-  if (RVLocs.size())
-    UpdateValueMap(I, ResultReg, RVLocs.size());
-
-  // Set all unused physreg defs as dead.
-  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+  CLI.ResultReg = ResultReg;
+  CLI.NumResultRegs = RVLocs.size();
+  CLI.Call = MIB;
 
   return true;
 }
 
-
 bool
-X86FastISel::TargetSelectInstruction(const Instruction *I)  {
+X86FastISel::fastSelectInstruction(const Instruction *I)  {
   switch (I->getOpcode()) {
   default: break;
   case Instruction::Load:
@@ -3125,8 +3032,6 @@
     return X86SelectZExt(I);
   case Instruction::Br:
     return X86SelectBranch(I);
-  case Instruction::Call:
-    return X86SelectCall(I);
   case Instruction::LShr:
   case Instruction::AShr:
   case Instruction::Shl:
@@ -3154,7 +3059,7 @@
       return X86SelectTrunc(I);
     unsigned Reg = getRegForValue(I->getOperand(0));
     if (Reg == 0) return false;
-    UpdateValueMap(I, Reg);
+    updateValueMap(I, Reg);
     return true;
   }
   }
@@ -3162,13 +3067,69 @@
   return false;
 }
 
-unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
-  MVT VT;
-  if (!isTypeLegal(C->getType(), VT))
+unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
+  if (VT > MVT::i64)
     return 0;
 
+  uint64_t Imm = CI->getZExtValue();
+  if (Imm == 0) {
+    unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type");
+    case MVT::i1:
+    case MVT::i8:
+      return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
+                                        X86::sub_8bit);
+    case MVT::i16:
+      return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
+                                        X86::sub_16bit);
+    case MVT::i32:
+      return SrcReg;
+    case MVT::i64: {
+      unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+        .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+      return ResultReg;
+    }
+    }
+  }
+
+  unsigned Opc = 0;
+  switch (VT.SimpleTy) {
+  default: llvm_unreachable("Unexpected value type");
+  case MVT::i1:  VT = MVT::i8; // fall-through
+  case MVT::i8:  Opc = X86::MOV8ri;  break;
+  case MVT::i16: Opc = X86::MOV16ri; break;
+  case MVT::i32: Opc = X86::MOV32ri; break;
+  case MVT::i64: {
+    if (isUInt<32>(Imm))
+      Opc = X86::MOV32ri;
+    else if (isInt<32>(Imm))
+      Opc = X86::MOV64ri32;
+    else
+      Opc = X86::MOV64ri;
+    break;
+  }
+  }
+  if (VT == MVT::i64 && Opc == X86::MOV32ri) {
+    unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm);
+    unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
+      .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
+    return ResultReg;
+  }
+  return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
+}
+
+unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
+  if (CFP->isNullValue())
+    return fastMaterializeFloatZero(CFP);
+
   // Can't handle alternate code models yet.
-  if (TM.getCodeModel() != CodeModel::Small)
+  CodeModel::Model CM = TM.getCodeModel();
+  if (CM != CodeModel::Small && CM != CodeModel::Large)
     return 0;
 
   // Get opcode and regclass of the output for the given load instruction.
@@ -3176,23 +3137,6 @@
   const TargetRegisterClass *RC = nullptr;
   switch (VT.SimpleTy) {
   default: return 0;
-  case MVT::i8:
-    Opc = X86::MOV8rm;
-    RC  = &X86::GR8RegClass;
-    break;
-  case MVT::i16:
-    Opc = X86::MOV16rm;
-    RC  = &X86::GR16RegClass;
-    break;
-  case MVT::i32:
-    Opc = X86::MOV32rm;
-    RC  = &X86::GR32RegClass;
-    break;
-  case MVT::i64:
-    // Must be in x86-64 mode.
-    Opc = X86::MOV64rm;
-    RC  = &X86::GR64RegClass;
-    break;
   case MVT::f32:
     if (X86ScalarSSEf32) {
       Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
@@ -3216,39 +3160,11 @@
     return 0;
   }
 
-  // Materialize addresses with LEA/MOV instructions.
-  if (isa<GlobalValue>(C)) {
-    X86AddressMode AM;
-    if (X86SelectAddress(C, AM)) {
-      // If the expression is just a basereg, then we're done, otherwise we need
-      // to emit an LEA.
-      if (AM.BaseType == X86AddressMode::RegBase &&
-          AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
-        return AM.Base.Reg;
-
-      unsigned ResultReg = createResultReg(RC);
-      if (TM.getRelocationModel() == Reloc::Static &&
-          TLI.getPointerTy() == MVT::i64) {
-        // The displacement code be more than 32 bits away so we need to use
-        // an instruction with a 64 bit immediate
-        Opc = X86::MOV64ri;
-        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(Opc), ResultReg).addGlobalAddress(cast<GlobalValue>(C));
-      } else {
-        Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
-        addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                             TII.get(Opc), ResultReg), AM);
-      }
-      return ResultReg;
-    }
-    return 0;
-  }
-
   // MachineConstantPool wants an explicit alignment.
-  unsigned Align = DL.getPrefTypeAlignment(C->getType());
+  unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
   if (Align == 0) {
-    // Alignment of vector types.  FIXME!
-    Align = DL.getTypeAllocSize(C->getType());
+    // Alignment of vector types. FIXME!
+    Align = DL.getTypeAllocSize(CFP->getType());
   }
 
   // x86-32 PIC requires a PIC base register for constant pools.
@@ -3266,23 +3182,88 @@
   }
 
   // Create the load from the constant pool.
-  unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
+  unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
   unsigned ResultReg = createResultReg(RC);
+
+  if (CM == CodeModel::Large) {
+    unsigned AddrReg = createResultReg(&X86::GR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+            AddrReg)
+      .addConstantPoolIndex(CPI, 0, OpFlag);
+    MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                                      TII.get(Opc), ResultReg);
+    addDirectMem(MIB, AddrReg);
+    MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+      MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
+      TM.getSubtargetImpl()->getDataLayout()->getPointerSize(), Align);
+    MIB->addMemOperand(*FuncInfo.MF, MMO);
+    return ResultReg;
+  }
+
   addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                    TII.get(Opc), ResultReg),
-                           MCPOffset, PICBase, OpFlag);
-
+                           CPI, PICBase, OpFlag);
   return ResultReg;
 }
 
-unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
+unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
+  // Can't handle alternate code models yet.
+  if (TM.getCodeModel() != CodeModel::Small)
+    return 0;
+
+  // Materialize addresses with LEA/MOV instructions.
+  X86AddressMode AM;
+  if (X86SelectAddress(GV, AM)) {
+    // If the expression is just a basereg, then we're done, otherwise we need
+    // to emit an LEA.
+    if (AM.BaseType == X86AddressMode::RegBase &&
+        AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
+      return AM.Base.Reg;
+
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    if (TM.getRelocationModel() == Reloc::Static &&
+        TLI.getPointerTy() == MVT::i64) {
+      // The displacement code could be more than 32 bits away so we need to use
+      // an instruction with a 64 bit immediate
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
+              ResultReg)
+        .addGlobalAddress(GV);
+    } else {
+      unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
+      addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                             TII.get(Opc), ResultReg), AM);
+    }
+    return ResultReg;
+  }
+  return 0;
+}
+
+unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const auto *CI = dyn_cast<ConstantInt>(C))
+    return X86MaterializeInt(CI, VT);
+  else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return X86MaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return X86MaterializeGV(GV, VT);
+
+  return 0;
+}
+
+unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
   // Fail on dynamic allocas. At this point, getRegForValue has already
   // checked its CSE maps, so if we're here trying to handle a dynamic
   // alloca, we're not going to succeed. X86SelectAddress has a
   // check for dynamic allocas, because it's called directly from
-  // various places, but TargetMaterializeAlloca also needs a check
+  // various places, but fastMaterializeAlloca also needs a check
   // in order to avoid recursion between getRegForValue,
-  // X86SelectAddrss, and TargetMaterializeAlloca.
+  // X86SelectAddress, and fastMaterializeAlloca.
   if (!FuncInfo.StaticAllocaMap.count(C))
     return 0;
   assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
@@ -3290,7 +3271,7 @@
   X86AddressMode AM;
   if (!X86SelectAddress(C, AM))
     return 0;
-  unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+  unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
   const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
   unsigned ResultReg = createResultReg(RC);
   addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -3298,7 +3279,7 @@
   return ResultReg;
 }
 
-unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
+unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
   MVT VT;
   if (!isTypeLegal(CF->getType(), VT))
     return 0;
@@ -3356,7 +3337,8 @@
   AM.getFullAddress(AddrOps);
 
   MachineInstr *Result =
-    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
+    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps,
+                              Size, Alignment, /*AllowCommute=*/true);
   if (!Result)
     return false;
 

diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 4be766a..02736ac 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp

@@ -7,9 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the pass which will find  instructions  which
-// can be re-written as LEA instructions in order to reduce pipeline
-// delays for some models of the Intel Atom family.
+// This file defines the pass that finds instructions that can be
+// re-written as LEA instructions in order to reduce pipeline delays.
 //
 //===----------------------------------------------------------------------===//
 
@@ -40,7 +39,7 @@
   /// where appropriate.
   bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
 
-  const char *getPassName() const override { return "X86 Atom LEA Fixup"; }
+  const char *getPassName() const override { return "X86 LEA Fixup"; }
 
   /// \brief Given a machine register, look for the instruction
   /// which writes it in the current basic block. If found,
@@ -156,7 +155,8 @@
   if (!ST.LEAusesAG() && !ST.slowLEA())
     return false;
 
-  TII = static_cast<const X86InstrInfo *>(TM->getInstrInfo());
+  TII =
+      static_cast<const X86InstrInfo *>(TM->getSubtargetImpl()->getInstrInfo());
 
   DEBUG(dbgs() << "Start X86FixupLEAs\n";);
   // Process all basic blocks.
@@ -218,7 +218,8 @@
     if (usesRegister(p, CurInst) == RU_Write) {
       return CurInst;
     }
-    InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst);
+    InstrDistance += TII->getInstrLatency(
+        TM->getSubtargetImpl()->getInstrItineraryData(), CurInst);
     Found = getPreviousInstr(CurInst, MFI);
   }
   return nullptr;

diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index c8a3ab3..6189109 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp

@@ -28,12 +28,14 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/EdgeBundles.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/Support/Debug.h"
@@ -41,7 +43,9 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
+#include <bitset>
 using namespace llvm;
 
 #define DEBUG_TYPE "x86-codegen"
@@ -50,6 +54,8 @@
 STATISTIC(NumFP  , "Number of floating point instructions");
 
 namespace {
+  const unsigned ScratchFPReg = 7;
+
   struct FPS : public MachineFunctionPass {
     static char ID;
     FPS() : MachineFunctionPass(ID) {
@@ -137,7 +143,7 @@
     unsigned StackTop;          // The current top of the FP stack.
 
     enum {
-      NumFPRegs = 16            // Including scratch pseudo-registers.
+      NumFPRegs = 8             // Including scratch pseudo-registers.
     };
 
     // For each live FP<n> register, point to its Stack[] entry.
@@ -146,27 +152,6 @@
     // register allocator thinks.
     unsigned RegMap[NumFPRegs];
 
-    // Pending fixed registers - Inline assembly needs FP registers to appear
-    // in fixed stack slot positions. This is handled by copying FP registers
-    // to ST registers before the instruction, and copying back after the
-    // instruction.
-    //
-    // This is modeled with pending ST registers. NumPendingSTs is the number
-    // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
-    // register that holds the ST value. The ST registers are not moved into
-    // place until immediately before the instruction that needs them.
-    //
-    // It can happen that we need an ST register to be live when no FP register
-    // holds the value:
-    //
-    //   %ST0 = COPY %FP4<kill>
-    //
-    // When that happens, we allocate a scratch FP register to hold the ST
-    // value. That means every register in PendingST must be live.
-
-    unsigned NumPendingSTs;
-    unsigned char PendingST[8];
-
     // Set up our stack model to match the incoming registers to MBB.
     void setupBlockStack();
 
@@ -180,9 +165,6 @@
         dbgs() << " FP" << Stack[i];
         assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
       }
-      for (unsigned i = 0; i != NumPendingSTs; ++i)
-        dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
-      dbgs() << "\n";
     }
 #endif
 
@@ -199,19 +181,6 @@
       return Slot < StackTop && Stack[Slot] == RegNo;
     }
 
-    /// getScratchReg - Return an FP register that is not currently in use.
-    unsigned getScratchReg() const {
-      for (int i = NumFPRegs - 1; i >= 8; --i)
-        if (!isLive(i))
-          return i;
-      llvm_unreachable("Ran out of scratch FP registers");
-    }
-
-    /// isScratchReg - Returns trus if RegNo is a scratch FP register.
-    static bool isScratchReg(unsigned RegNo) {
-      return RegNo > 8 && RegNo < NumFPRegs;
-    }
-
     /// getStackEntry - Return the X86::FP<n> register in register ST(i).
     unsigned getStackEntry(unsigned STi) const {
       if (STi >= StackTop)
@@ -263,21 +232,6 @@
       BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
     }
 
-    /// duplicatePendingSTBeforeKill - The instruction at I is about to kill
-    /// RegNo. If any PendingST registers still need the RegNo value, duplicate
-    /// them to new scratch registers.
-    void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) {
-      for (unsigned i = 0; i != NumPendingSTs; ++i) {
-        if (PendingST[i] != RegNo)
-          continue;
-        unsigned SR = getScratchReg();
-        DEBUG(dbgs() << "Duplicating pending ST" << i
-                     << " in FP" << RegNo << " to FP" << SR << '\n');
-        duplicateToTop(RegNo, SR, I);
-        PendingST[i] = SR;
-      }
-    }
-
     /// popStackAfter - Pop the current value off of the top of the FP stack
     /// after the specified instruction.
     void popStackAfter(MachineBasicBlock::iterator &I);
@@ -304,6 +258,7 @@
 
     bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
 
+    void handleCall(MachineBasicBlock::iterator &I);
     void handleZeroArgFP(MachineBasicBlock::iterator &I);
     void handleOneArgFP(MachineBasicBlock::iterator &I);
     void handleOneArgFPRW(MachineBasicBlock::iterator &I);
@@ -320,6 +275,8 @@
       return X86::RFP80RegClass.contains(DstReg) ||
         X86::RFP80RegClass.contains(SrcReg);
     }
+
+    void setKillFlags(MachineBasicBlock &MBB) const;
   };
   char FPS::ID = 0;
 }
@@ -354,7 +311,7 @@
   if (!FPIsUsed) return false;
 
   Bundles = &getAnalysis<EdgeBundles>();
-  TII = MF.getTarget().getInstrInfo();
+  TII = MF.getSubtarget().getInstrInfo();
 
   // Prepare cross-MBB liveness.
   bundleCFG(MF);
@@ -367,15 +324,13 @@
   MachineBasicBlock *Entry = MF.begin();
 
   bool Changed = false;
-  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> >
-         I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
-       I != E; ++I)
-    Changed |= processBasicBlock(MF, **I);
+  for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed))
+    Changed |= processBasicBlock(MF, *BB);
 
   // Process any unreachable blocks in arbitrary order now.
   if (MF.size() != Processed.size())
     for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
-      if (Processed.insert(BB))
+      if (Processed.insert(BB).second)
         Changed |= processBasicBlock(MF, *BB);
 
   LiveBundles.clear();
@@ -409,8 +364,8 @@
 bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
   bool Changed = false;
   MBB = &BB;
-  NumPendingSTs = 0;
 
+  setKillFlags(BB);
   setupBlockStack();
 
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
@@ -428,6 +383,9 @@
         X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
       FPInstClass = X86II::SpecialFP;
 
+    if (MI->isCall())
+      FPInstClass = X86II::SpecialFP;
+
     if (FPInstClass == X86II::NotFP)
       continue;  // Efficiently ignore non-fp insts!
 
@@ -462,7 +420,9 @@
     // after definition.  If so, pop them.
     for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
       unsigned Reg = DeadRegs[i];
-      if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+      // Check if Reg is live on the stack. An inline-asm register operand that
+      // is in the clobber list and marked dead might not be live on the stack.
+      if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
         DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
         freeStackSlotAfter(I, Reg-X86::FP0);
       }
@@ -874,7 +834,9 @@
   RegMap[TopReg]    = OldSlot;
   RegMap[FPRegNo]   = ~0;
   Stack[--StackTop] = ~0;
-  return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)).addReg(STReg);
+  return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr))
+      .addReg(STReg)
+      .getInstr();
 }
 
 /// adjustLiveRegs - Kill and revive registers such that exactly the FP
@@ -966,6 +928,31 @@
 // Instruction transformation implementation
 //===----------------------------------------------------------------------===//
 
+void FPS::handleCall(MachineBasicBlock::iterator &I) {
+  unsigned STReturns = 0;
+
+  for (const auto &MO : I->operands()) {
+    if (!MO.isReg())
+      continue;
+
+    unsigned R = MO.getReg() - X86::FP0;
+
+    if (R < 8) {
+      assert(MO.isDef() && MO.isImplicit());
+      STReturns |= 1 << R;
+    }
+  }
+
+  unsigned N = CountTrailingOnes_32(STReturns);
+
+  // FP registers used for function return must be consecutive starting at
+  // FP0.
+  assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2));
+
+  for (unsigned I = 0; I < N; ++I)
+    pushReg(N - I - 1);
+}
+
 /// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
 ///
 void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
@@ -992,9 +979,6 @@
   unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
   bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
 
-  if (KillsSrc)
-    duplicatePendingSTBeforeKill(Reg, I);
-
   // FISTP64m is strange because there isn't a non-popping versions.
   // If we have one _and_ we don't want to pop the operand, duplicate the value
   // on the stack instead of moving it.  This ensure that popping the value is
@@ -1015,7 +999,7 @@
        MI->getOpcode() == X86::ISTT_Fp32m80 ||
        MI->getOpcode() == X86::ISTT_Fp64m80 ||
        MI->getOpcode() == X86::ST_FpP80m)) {
-    duplicateToTop(Reg, getScratchReg(), I);
+    duplicateToTop(Reg, ScratchFPReg, I);
   } else {
     moveToTop(Reg, I);            // Move to the top of the stack...
   }
@@ -1058,7 +1042,6 @@
   bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
 
   if (KillsSrc) {
-    duplicatePendingSTBeforeKill(Reg, I);
     // If this is the last use of the source register, just make sure it's on
     // the top of the stack.
     moveToTop(Reg, I);
@@ -1314,71 +1297,22 @@
 /// floating point instructions.  This is primarily intended for use by pseudo
 /// instructions.
 ///
-void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
-  MachineInstr *MI = I;
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
+  MachineInstr *MI = Inst;
+
+  if (MI->isCall()) {
+    handleCall(Inst);
+    return;
+  }
+
   switch (MI->getOpcode()) {
   default: llvm_unreachable("Unknown SpecialFP instruction!");
   case TargetOpcode::COPY: {
     // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
     const MachineOperand &MO1 = MI->getOperand(1);
     const MachineOperand &MO0 = MI->getOperand(0);
-    unsigned DstST = MO0.getReg() - X86::ST0;
-    unsigned SrcST = MO1.getReg() - X86::ST0;
     bool KillsSrc = MI->killsRegister(MO1.getReg());
 
-    // ST = COPY FP. Set up a pending ST register.
-    if (DstST < 8) {
-      unsigned SrcFP = getFPReg(MO1);
-      assert(isLive(SrcFP) && "Cannot copy dead register");
-      assert(!MO0.isDead() && "Cannot copy to dead ST register");
-
-      // Unallocated STs are marked as the nonexistent FP255.
-      while (NumPendingSTs <= DstST)
-        PendingST[NumPendingSTs++] = NumFPRegs;
-
-      // STi could still be live from a previous inline asm.
-      if (isScratchReg(PendingST[DstST])) {
-        DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST])
-                     << '\n');
-        freeStackSlotBefore(MI, PendingST[DstST]);
-      }
-
-      // When the source is killed, allocate a scratch FP register.
-      if (KillsSrc) {
-        duplicatePendingSTBeforeKill(SrcFP, I);
-        unsigned Slot = getSlot(SrcFP);
-        unsigned SR = getScratchReg();
-        PendingST[DstST] = SR;
-        Stack[Slot] = SR;
-        RegMap[SR] = Slot;
-      } else
-        PendingST[DstST] = SrcFP;
-      break;
-    }
-
-    // FP = COPY ST. Extract fixed stack value.
-    // Any instruction defining ST registers must have assigned them to a
-    // scratch register.
-    if (SrcST < 8) {
-      unsigned DstFP = getFPReg(MO0);
-      assert(!isLive(DstFP) && "Cannot copy ST to live FP register");
-      assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register");
-      unsigned SrcFP = PendingST[SrcST];
-      assert(isScratchReg(SrcFP) && "Expected ST in a scratch register");
-      assert(isLive(SrcFP) && "Scratch holding ST is dead");
-
-      // DstFP steals the stack slot from SrcFP.
-      unsigned Slot = getSlot(SrcFP);
-      Stack[Slot] = DstFP;
-      RegMap[DstFP] = Slot;
-
-      // Always treat the ST as killed.
-      PendingST[SrcST] = NumFPRegs;
-      while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
-        --NumPendingSTs;
-      break;
-    }
-
     // FP <- FP copy.
     unsigned DstFP = getFPReg(MO0);
     unsigned SrcFP = getFPReg(MO1);
@@ -1392,7 +1326,7 @@
     } else {
       // For COPY we just duplicate the specified value to a new stack slot.
       // This could be made better, but would require substantial changes.
-      duplicateToTop(SrcFP, DstFP, I);
+      duplicateToTop(SrcFP, DstFP, Inst);
     }
     break;
   }
@@ -1401,41 +1335,11 @@
     // All FP registers must be explicitly defined, so load a 0 instead.
     unsigned Reg = MI->getOperand(0).getReg() - X86::FP0;
     DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
-    BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+    BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::LD_F0));
     pushReg(Reg);
     break;
   }
 
-  case X86::FpPOP_RETVAL: {
-    // The FpPOP_RETVAL instruction is used after calls that return a value on
-    // the floating point stack. We cannot model this with ST defs since CALL
-    // instructions have fixed clobber lists. This instruction is interpreted
-    // to mean that there is one more live register on the stack than we
-    // thought.
-    //
-    // This means that StackTop does not match the hardware stack between a
-    // call and the FpPOP_RETVAL instructions.  We do tolerate FP instructions
-    // between CALL and FpPOP_RETVAL as long as they don't overflow the
-    // hardware stack.
-    unsigned DstFP = getFPReg(MI->getOperand(0));
-
-    // Move existing stack elements up to reflect reality.
-    assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL");
-    if (StackTop) {
-      std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1);
-      for (unsigned i = 0; i != NumFPRegs; ++i)
-        ++RegMap[i];
-    }
-    ++StackTop;
-
-    // DstFP is the new bottom of the stack.
-    Stack[0] = DstFP;
-    RegMap[DstFP] = 0;
-
-    // DstFP will be killed by processBasicBlock if this was a dead def.
-    break;
-  }
-
   case TargetOpcode::INLINEASM: {
     // The inline asm MachineInstr currently only *uses* FP registers for the
     // 'f' constraint.  These should be turned into the current ST(x) register
@@ -1472,19 +1376,30 @@
     // only tell clobbers from defs by looking at the asm descriptor.
     unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
     unsigned NumOps = 0;
+    SmallSet<unsigned, 1> FRegIdx;
+    unsigned RCID;
+
     for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
          i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
       unsigned Flags = MI->getOperand(i).getImm();
+
       NumOps = InlineAsm::getNumOperandRegisters(Flags);
       if (NumOps != 1)
         continue;
       const MachineOperand &MO = MI->getOperand(i + 1);
       if (!MO.isReg())
         continue;
-      unsigned STReg = MO.getReg() - X86::ST0;
+      unsigned STReg = MO.getReg() - X86::FP0;
       if (STReg >= 8)
         continue;
 
+      // If the flag has a register class constraint, this must be an operand
+      // with constraint "f". Record its index and continue.
+      if (InlineAsm::hasRegClassConstraint(Flags, RCID)) {
+        FRegIdx.insert(i + 1);
+        continue;
+      }
+
       switch (InlineAsm::getKind(Flags)) {
       case InlineAsm::Kind_RegUse:
         STUses |= (1u << STReg);
@@ -1527,71 +1442,42 @@
     DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
                  << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
 
-    // Scan the instruction for FP uses corresponding to "f" constraints.
-    // Collect FP registers to kill afer the instruction.
-    // Always kill all the scratch regs.
+#ifndef NDEBUG
+    // If any input operand uses constraint "f", all output register
+    // constraints must be early-clobber defs.
+    for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I)
+      if (FRegIdx.count(I)) {
+        assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 &&
+               "Operands with constraint \"f\" cannot overlap with defs");
+      }
+#endif
+
+    // Collect all FP registers (register operands with constraints "t", "u",
+    // and "f") to kill after the instruction.
     unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
-    unsigned FPUsed = 0;
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
       MachineOperand &Op = MI->getOperand(i);
       if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
         continue;
-      if (!Op.isUse())
-        MI->emitError("illegal \"f\" output constraint");
       unsigned FPReg = getFPReg(Op);
-      FPUsed |= 1U << FPReg;
 
       // If we kill this operand, make sure to pop it from the stack after the
       // asm.  We just remember it for now, and pop them all off at the end in
       // a batch.
-      if (Op.isKill())
+      if (Op.isUse() && Op.isKill())
         FPKills |= 1U << FPReg;
     }
 
-    // The popped inputs will be killed by the instruction, so duplicate them
-    // if the FP register needs to be live after the instruction, or if it is
-    // used in the instruction itself. We effectively treat the popped inputs
-    // as early clobbers.
-    for (unsigned i = 0; i < NumSTPopped; ++i) {
-      if ((FPKills & ~FPUsed) & (1u << PendingST[i]))
-        continue;
-      unsigned SR = getScratchReg();
-      duplicateToTop(PendingST[i], SR, I);
-      DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
-                   << unsigned(PendingST[i]) << " to avoid clobbering it.\n");
-      PendingST[i] = SR;
-    }
-
-    // Make sure we have a unique live register for every fixed use. Some of
-    // them could be undef uses, and we need to emit LD_F0 instructions.
-    for (unsigned i = 0; i < NumSTUses; ++i) {
-      if (i < NumPendingSTs && PendingST[i] < NumFPRegs) {
-        // Check for shared assignments.
-        for (unsigned j = 0; j < i; ++j) {
-          if (PendingST[j] != PendingST[i])
-            continue;
-          // STi and STj are inn the same register, create a copy.
-          unsigned SR = getScratchReg();
-          duplicateToTop(PendingST[i], SR, I);
-          DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
-                       << unsigned(PendingST[i])
-                       << " to avoid collision with ST" << j << '\n');
-          PendingST[i] = SR;
-        }
-        continue;
-      }
-      unsigned SR = getScratchReg();
-      DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n');
-      BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
-      pushReg(SR);
-      PendingST[i] = SR;
-      if (NumPendingSTs == i)
-        ++NumPendingSTs;
-    }
-    assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned");
+    // Do not include registers that are implicitly popped by defs/clobbers.
+    FPKills &= ~(STDefs | STClobbers);
 
     // Now we can rearrange the live registers to match what was requested.
-    shuffleStackTop(PendingST, NumPendingSTs, I);
+    unsigned char STUsesArray[8];
+
+    for (unsigned I = 0; I < NumSTUses; ++I)
+      STUsesArray[I] = I;
+
+    shuffleStackTop(STUsesArray, NumSTUses, Inst);
     DEBUG({dbgs() << "Before asm: "; dumpStack();});
 
     // With the stack layout fixed, rewrite the FP registers.
@@ -1599,36 +1485,22 @@
       MachineOperand &Op = MI->getOperand(i);
       if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
         continue;
+
       unsigned FPReg = getFPReg(Op);
-      Op.setReg(getSTReg(FPReg));
+
+      if (FRegIdx.count(i))
+        // Operand with constraint "f".
+        Op.setReg(getSTReg(FPReg));
+      else
+        // Operand with a single register class constraint ("t" or "u").
+        Op.setReg(X86::ST0 + FPReg);
     }
 
     // Simulate the inline asm popping its inputs and pushing its outputs.
     StackTop -= NumSTPopped;
 
-    // Hold the fixed output registers in scratch FP registers. They will be
-    // transferred to real FP registers by copies.
-    NumPendingSTs = 0;
-    for (unsigned i = 0; i < NumSTDefs; ++i) {
-      unsigned SR = getScratchReg();
-      pushReg(SR);
-      FPKills &= ~(1u << SR);
-    }
     for (unsigned i = 0; i < NumSTDefs; ++i)
-      PendingST[NumPendingSTs++] = getStackEntry(i);
-    DEBUG({dbgs() << "After asm: "; dumpStack();});
-
-    // If any of the ST defs were dead, pop them immediately. Our caller only
-    // handles dead FP defs.
-    MachineBasicBlock::iterator InsertPt = MI;
-    for (unsigned i = 0; STDefs & (1u << i); ++i) {
-      if (!(STDeadDefs & (1u << i)))
-        continue;
-      freeStackSlotAfter(InsertPt, PendingST[i]);
-      PendingST[i] = NumFPRegs;
-    }
-    while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
-      --NumPendingSTs;
+      pushReg(NumSTDefs - i - 1);
 
     // If this asm kills any FP registers (is the last use of them) we must
     // explicitly emit pop instructions for them.  Do this now after the asm has
@@ -1640,9 +1512,10 @@
     while (FPKills) {
       unsigned FPReg = countTrailingZeros(FPKills);
       if (isLive(FPReg))
-        freeStackSlotAfter(InsertPt, FPReg);
+        freeStackSlotAfter(Inst, FPReg);
       FPKills &= ~(1U << FPReg);
     }
+
     // Don't delete the inline asm!
     return;
   }
@@ -1655,12 +1528,12 @@
       Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6);
     unsigned FPReg = getFPReg(Op);
     if (Op.isKill())
-      moveToTop(FPReg, I);
+      moveToTop(FPReg, Inst);
     else
-      duplicateToTop(FPReg, FPReg, I);
+      duplicateToTop(FPReg, FPReg, Inst);
 
     // Emit the call. This will pop the operand.
-    BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
+    BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
       .addExternalSymbol("_ftol2")
       .addReg(X86::ST0, RegState::ImplicitKill)
       .addReg(X86::ECX, RegState::ImplicitDefine)
@@ -1738,7 +1611,7 @@
 
       // Duplicate the TOS so that we return it twice.  Just pick some other FPx
       // register to hold it.
-      unsigned NewReg = getScratchReg();
+      unsigned NewReg = ScratchFPReg;
       duplicateToTop(FirstFPRegOp, NewReg, MI);
       FirstFPRegOp = NewReg;
     }
@@ -1761,13 +1634,54 @@
     return;
   }
 
-  I = MBB->erase(I);  // Remove the pseudo instruction
+  Inst = MBB->erase(Inst);  // Remove the pseudo instruction
 
   // We want to leave I pointing to the previous instruction, but what if we
   // just erased the first instruction?
-  if (I == MBB->begin()) {
+  if (Inst == MBB->begin()) {
     DEBUG(dbgs() << "Inserting dummy KILL\n");
-    I = BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL));
+    Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL));
   } else
-    --I;
+    --Inst;
+}
+
+void FPS::setKillFlags(MachineBasicBlock &MBB) const {
+  const TargetRegisterInfo *TRI =
+      MBB.getParent()->getSubtarget().getRegisterInfo();
+  LivePhysRegs LPR(TRI);
+  // Set kill/dead flags on the FP register operands of MBB, bottom-up.
+  LPR.addLiveOuts(&MBB);
+  // Visit instructions last to first; LPR is stepped back after each one.
+  for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+       I != E; ++I) {
+    if (I->isDebugValue())
+      continue;
+    // Defs: FP indices defined here. Uses: the non-def FP operands.
+    std::bitset<8> Defs;
+    SmallVector<MachineOperand *, 2> Uses;
+    MachineInstr &MI = *I;
+
+    for (auto &MO : I->operands()) {
+      if (!MO.isReg())
+        continue;
+      // Index from FP0; unsigned wrap-around rejects lower registers too.
+      unsigned Reg = MO.getReg() - X86::FP0;
+
+      if (Reg >= 8)
+        continue;
+      // A def whose register is not in LPR is flagged dead.
+      if (MO.isDef()) {
+        Defs.set(Reg);
+        if (!LPR.contains(MO.getReg()))
+          MO.setIsDead();
+      } else
+        Uses.push_back(&MO);
+    }
+    // Kill a use if Defs has its getFPReg() index or LPR lacks its register.
+    for (auto *MO : Uses)
+      if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg()))
+        MO->setIsKill();
+
+    LPR.stepBackward(MI);
+  }
 }

diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 8c029a8..b9920b1 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp

@@ -30,6 +30,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Support/Debug.h"
+#include <cstdlib>
 
 using namespace llvm;
 
@@ -46,14 +47,15 @@
 bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const MachineModuleInfo &MMI = MF.getMMI();
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
 
   return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
           RegInfo->needsStackRealignment(MF) ||
           MFI->hasVarSizedObjects() ||
           MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() ||
           MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
-          MMI.callsUnwindInit() || MMI.callsEHReturn());
+          MMI.callsUnwindInit() || MMI.callsEHReturn() ||
+          MFI->hasStackMap() || MFI->hasPatchPoint());
 }
 
 static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
@@ -80,6 +82,17 @@
   }
 }
 
+/// Select the X86 AND-with-immediate opcode.
+/// IsLP64 chooses the 64-bit opcodes over the 32-bit ones; an immediate
+/// that satisfies isInt<8> chooses the ri8 form over the wider
+/// immediate form.
+static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
+  const bool FitsInByte = isInt<8>(Imm);
+  if (!IsLP64)
+    return FitsInByte ? X86::AND32ri8 : X86::AND32ri;
+  return FitsInByte ? X86::AND64ri8 : X86::AND64ri32;
+}
+
 static unsigned getLEArOpcode(unsigned IsLP64) {
   return IsLP64 ? X86::LEA64r : X86::LEA32r;
 }
@@ -148,32 +161,32 @@
 static
 void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
                   unsigned StackPtr, int64_t NumBytes,
-                  bool Is64Bit, bool IsLP64, bool UseLEA,
+                  bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA,
                   const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) {
   bool isSub = NumBytes < 0;
   uint64_t Offset = isSub ? -NumBytes : NumBytes;
   unsigned Opc;
   if (UseLEA)
-    Opc = getLEArOpcode(IsLP64);
+    Opc = getLEArOpcode(Is64BitStackPtr);
   else
     Opc = isSub
-      ? getSUBriOpcode(IsLP64, Offset)
-      : getADDriOpcode(IsLP64, Offset);
+      ? getSUBriOpcode(Is64BitStackPtr, Offset)
+      : getADDriOpcode(Is64BitStackPtr, Offset);
 
   uint64_t Chunk = (1LL << 31) - 1;
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
   while (Offset) {
     uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
-    if (ThisVal == (Is64Bit ? 8 : 4)) {
+    if (ThisVal == (Is64BitTarget ? 8 : 4)) {
       // Use push / pop instead.
       unsigned Reg = isSub
-        ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
-        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+        ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX)
+        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget);
       if (Reg) {
         Opc = isSub
-          ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
-          : (Is64Bit ? X86::POP64r  : X86::POP32r);
+          ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r)
+          : (Is64BitTarget ? X86::POP64r  : X86::POP32r);
         MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
           .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
         if (isSub)
@@ -314,7 +327,7 @@
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   // Add callee saved registers to move list.
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
@@ -352,6 +365,23 @@
   return false;
 }
 
+/// Report how the stack probe routine is called for subtarget STI.
+/// CallOp receives W64ALLOCA when STI.is64Bit() and CALLpcrel32 otherwise.
+/// Symbol receives the routine name, chosen from STI.is64Bit() and
+/// STI.isTargetCygMing().
+void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI,
+                                             unsigned &CallOp,
+                                             const char *&Symbol) {
+  const bool Is64Bit = STI.is64Bit();
+  const bool IsCygMing = STI.isTargetCygMing();
+
+  CallOp = Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32;
+  if (Is64Bit)
+    Symbol = IsCygMing ? "___chkstk_ms" : "__chkstk";
+  else
+    Symbol = IsCygMing ? "_alloca" : "_chkstk";
+}
+
 /// emitPrologue - Push callee-saved registers onto the stack, which
 /// automatically adjust the stack pointer. Adjust the stack pointer to allocate
 /// space for local variables. Also emit labels used by the exception handler to
@@ -440,8 +470,8 @@
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *Fn = MF.getFunction();
   const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment.
@@ -449,11 +479,12 @@
   bool HasFP = hasFP(MF);
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool Is64Bit = STI.is64Bit();
-  bool IsLP64 = STI.isTarget64BitLP64();
+  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
+  const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
   bool IsWin64 = STI.isTargetWin64();
-  bool IsWinEH =
-      MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() ==
-      ExceptionHandling::WinEH; // Not necessarily synonymous with IsWin64.
+  // Not necessarily synonymous with IsWin64.
+  bool IsWinEH = MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() ==
+                 ExceptionHandling::ItaniumWinEH;
   bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry();
   bool NeedsDwarfCFI =
       !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
@@ -461,6 +492,8 @@
   unsigned StackAlign = getStackAlignment();
   unsigned SlotSize = RegInfo->getSlotSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  const unsigned MachineFramePtr = STI.isTarget64BitILP32() ?
+                 getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
   unsigned StackPtr = RegInfo->getStackRegister();
   unsigned BasePtr = RegInfo->getBaseRegister();
   DebugLoc DL;
@@ -482,6 +515,8 @@
     X86FI->setCalleeSavedFrameSize(
       X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
 
+  bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMacho());
+  
   // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
   // function, and use up to 128 bytes of stack space, don't have a frame
   // pointer, calls, or dynamic alloca then we do not need to adjust the
@@ -507,7 +542,7 @@
   if (TailCallReturnAddrDelta < 0) {
     MachineInstr *MI =
       BuildMI(MBB, MBBI, DL,
-              TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)),
+              TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)),
               StackPtr)
         .addReg(StackPtr)
         .addImm(-TailCallReturnAddrDelta)
@@ -551,7 +586,7 @@
 
     // Save EBP/RBP into the appropriate stack slot.
     BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
-      .addReg(FramePtr, RegState::Kill)
+      .addReg(MachineFramePtr, RegState::Kill)
       .setMIFlag(MachineInstr::FrameSetup);
 
     if (NeedsDwarfCFI) {
@@ -564,7 +599,7 @@
           .addCFIIndex(CFIIndex);
 
       // Change the rule for the FramePtr to be an "offset" rule.
-      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true);
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr,
                                          DwarfFramePtr, 2 * stackGrowth));
@@ -580,14 +615,14 @@
 
     // Update EBP with the new base value.
     BuildMI(MBB, MBBI, DL,
-            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+            TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr)
         .addReg(StackPtr)
         .setMIFlag(MachineInstr::FrameSetup);
 
     if (NeedsDwarfCFI) {
       // Mark effective beginning of when frame pointer becomes valid.
       // Define the current CFA to use the EBP/RBP register.
-      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true);
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
       BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -596,7 +631,7 @@
 
     // Mark the FramePtr as live-in in every block.
     for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
-      I->addLiveIn(FramePtr);
+      I->addLiveIn(MachineFramePtr);
   } else {
     NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
   }
@@ -633,11 +668,12 @@
   // able to calculate their offsets from the frame pointer).
   if (RegInfo->needsStackRealignment(MF)) {
     assert(HasFP && "There should be a frame pointer if stack is realigned.");
+    uint64_t Val = -MaxAlign;
     MachineInstr *MI =
       BuildMI(MBB, MBBI, DL,
-              TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+              TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr)
       .addReg(StackPtr)
-      .addImm(-MaxAlign)
+      .addImm(Val)
       .setMIFlag(MachineInstr::FrameSetup);
 
     // The EFLAGS implicit def is dead.
@@ -655,6 +691,8 @@
 
   // Adjust stack pointer: ESP -= numbytes.
 
+  static const size_t PageSize = 4096;
+
   // Windows and cygwin/mingw require a prologue helper routine when allocating
   // more than 4K bytes on the stack.  Windows uses __chkstk and cygwin/mingw
   // uses __alloca.  __alloca and the 32-bit version of __chkstk will probe the
@@ -663,19 +701,11 @@
   // responsible for adjusting the stack pointer.  Touching the stack at 4K
   // increments is necessary to ensure that the guard pages used by the OS
   // virtual memory manager are allocated in correct sequence.
-  if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetMacho()) {
+  if (NumBytes >= PageSize && UseStackProbe) {
     const char *StackProbeSymbol;
+    unsigned CallOp;
 
-    if (Is64Bit) {
-      if (STI.isTargetCygMing()) {
-        StackProbeSymbol = "___chkstk_ms";
-      } else {
-        StackProbeSymbol = "__chkstk";
-      }
-    } else if (STI.isTargetCygMing())
-      StackProbeSymbol = "_alloca";
-    else
-      StackProbeSymbol = "_chkstk";
+    getStackProbeFunction(STI, CallOp, StackProbeSymbol);
 
     // Check whether EAX is livein for this function.
     bool isEAXAlive = isEAXLiveIn(MF);
@@ -706,7 +736,7 @@
     }
 
     BuildMI(MBB, MBBI, DL,
-            TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32))
+            TII.get(CallOp))
       .addExternalSymbol(StackProbeSymbol)
       .addReg(StackPtr,    RegState::Define | RegState::Implicit)
       .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
@@ -722,15 +752,15 @@
         .setMIFlag(MachineInstr::FrameSetup);
     }
     if (isEAXAlive) {
-        // Restore EAX
-        MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
-                                                X86::EAX),
-                                        StackPtr, false, NumBytes - 4);
-        MI->setFlag(MachineInstr::FrameSetup);
-        MBB.insert(MBBI, MI);
+      // Restore EAX
+      MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
+                                              X86::EAX),
+                                      StackPtr, false, NumBytes - 4);
+      MI->setFlag(MachineInstr::FrameSetup);
+      MBB.insert(MBBI, MI);
     }
   } else if (NumBytes) {
-    emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
+    emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr,
                  UseLEA, TII, *RegInfo);
   }
 
@@ -746,7 +776,7 @@
       // will restore SP to (BP - SEHFrameOffset)
       for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
         int offset = MFI->getObjectOffset(Info.getFrameIdx());
-        SEHFrameOffset = std::max(SEHFrameOffset, abs(offset));
+        SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset));
       }
       SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant
 
@@ -804,7 +834,7 @@
   // to reference locals.
   if (RegInfo->hasBasePointer(MF)) {
     // Update the base pointer with the current stack pointer.
-    unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr;
+    unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
     BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
       .addReg(StackPtr)
       .setMIFlag(MachineInstr::FrameSetup);
@@ -834,21 +864,29 @@
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   assert(MBBI != MBB.end() && "Returning block has no instructions");
   unsigned RetOpcode = MBBI->getOpcode();
   DebugLoc DL = MBBI->getDebugLoc();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool Is64Bit = STI.is64Bit();
-  bool IsLP64 = STI.isTarget64BitLP64();
+  // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
+  const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
+  const bool Is64BitILP32 = STI.isTarget64BitILP32();
   bool UseLEA = STI.useLeaForSP();
   unsigned StackAlign = getStackAlignment();
   unsigned SlotSize = RegInfo->getSlotSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned MachineFramePtr = Is64BitILP32 ?
+             getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
   unsigned StackPtr = RegInfo->getStackRegister();
 
+  bool IsWinEH = MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() ==
+                 ExceptionHandling::ItaniumWinEH;
+  bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry();
+
   switch (RetOpcode) {
   default:
     llvm_unreachable("Can only insert epilog into returning blocks");
@@ -898,7 +936,7 @@
 
     // Pop EBP.
     BuildMI(MBB, MBBI, DL,
-            TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+            TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr);
   } else {
     NumBytes = StackSize - CSSize;
   }
@@ -930,27 +968,39 @@
     if (RegInfo->needsStackRealignment(MF))
       MBBI = FirstCSPop;
     if (CSSize != 0) {
-      unsigned Opc = getLEArOpcode(IsLP64);
+      unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
       addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
                    FramePtr, false, -CSSize);
+      --MBBI;
     } else {
-      unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+      unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
       BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
         .addReg(FramePtr);
+      --MBBI;
     }
   } else if (NumBytes) {
     // Adjust stack pointer back: ESP += numbytes.
-    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA,
+    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA,
                  TII, *RegInfo);
+    --MBBI;
   }
 
+  // Windows unwinder will not invoke function's exception handler if IP is
+  // either in prologue or in epilogue.  This behavior causes a problem when a
+  // call immediately precedes an epilogue, because the return address points
+  // into the epilogue.  To cope with that, we insert an epilogue marker here,
+  // then replace it with a 'nop' if it ends up immediately after a CALL in the
+  // final emitted code.
+  if (NeedsWinEH)
+    BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
+
   // We're returning from function via eh_return.
   if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
     MBBI = MBB.getLastNonDebugInstr();
     MachineOperand &DestAddr  = MBBI->getOperand(0);
     assert(DestAddr.isReg() && "Offset should be in register!");
     BuildMI(MBB, MBBI, DL,
-            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+            TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
             StackPtr).addReg(DestAddr.getReg());
   } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
              RetOpcode == X86::TCRETURNmi ||
@@ -976,7 +1026,7 @@
     if (Offset) {
       // Check for possible merge with preceding ADD instruction.
       Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
-      emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64,
+      emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr,
                    UseLEA, TII, *RegInfo);
     }
 
@@ -1021,7 +1071,7 @@
 
     // Check for possible merge with preceding ADD instruction.
     delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
-    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII,
+    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII,
                  *RegInfo);
   }
 }
@@ -1029,7 +1079,7 @@
 int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
                                           int FI) const {
   const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
   uint64_t StackSize = MFI->getStackSize();
@@ -1072,7 +1122,7 @@
 int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                              unsigned &FrameReg) const {
   const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   // We can't calculate offset from frame pointer if the stack is realigned,
   // so enforce usage of stack/base pointer.  The base pointer is used when we
   // have dynamic allocas in addition to dynamic realignment.
@@ -1090,7 +1140,7 @@
     std::vector<CalleeSavedInfo> &CSI) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   unsigned SlotSize = RegInfo->getSlotSize();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 
@@ -1107,7 +1157,7 @@
     // about avoiding it later.
     unsigned FPReg = RegInfo->getFrameRegister(MF);
     for (unsigned i = 0; i < CSI.size(); ++i) {
-      if (CSI[i].getReg() == FPReg) {
+      if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
         CSI.erase(CSI.begin() + i);
         break;
       }
@@ -1138,7 +1188,7 @@
 
     const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
     // ensure alignment
-    SpillSlotOffset -= abs(SpillSlotOffset) % RC->getAlignment();
+    SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment();
     // spill into slot
     SpillSlotOffset -= RC->getSize();
     int SlotIndex =
@@ -1157,7 +1207,7 @@
   DebugLoc DL = MBB.findDebugLoc(MI);
 
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 
   // Push GPRs. It increases frame size.
@@ -1205,7 +1255,7 @@
   DebugLoc DL = MBB.findDebugLoc(MI);
 
   MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 
   // Reload XMMs from stack frame.
@@ -1237,7 +1287,7 @@
                                                        RegScavenger *RS) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   unsigned SlotSize = RegInfo->getSlotSize();
 
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -1278,7 +1328,7 @@
 /// and the properties of the function either one or two registers will be
 /// needed. Set primary to true for the first register, false for the second.
 static unsigned
-GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) {
+GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
   CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
 
   // Erlang stuff.
@@ -1289,8 +1339,12 @@
       return Primary ? X86::EBX : X86::EDI;
   }
 
-  if (Is64Bit)
-    return Primary ? X86::R11 : X86::R12;
+  if (Is64Bit) {
+    if (IsLP64)
+      return Primary ? X86::R11 : X86::R12;
+    else
+      return Primary ? X86::R11D : X86::R12D;
+  }
 
   bool IsNested = HasNestArgument(&MF);
 
@@ -1314,14 +1368,15 @@
 X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   MachineBasicBlock &prologueMBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   uint64_t StackSize;
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool Is64Bit = STI.is64Bit();
+  const bool IsLP64 = STI.isTarget64BitLP64();
   unsigned TlsReg, TlsOffset;
   DebugLoc DL;
 
-  unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+  unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
   assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
          "Scratch register is live-in");
 
@@ -1359,7 +1414,7 @@
   }
 
   if (IsNested)
-    allocMBB->addLiveIn(X86::R10);
+    allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
 
   MF.push_front(allocMBB);
   MF.push_front(checkMBB);
@@ -1372,7 +1427,7 @@
   if (Is64Bit) {
     if (STI.isTargetLinux()) {
       TlsReg = X86::FS;
-      TlsOffset = 0x70;
+      TlsOffset = IsLP64 ? 0x70 : 0x40;
     } else if (STI.isTargetDarwin()) {
       TlsReg = X86::GS;
       TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
@@ -1387,12 +1442,12 @@
     }
 
     if (CompareStackPointer)
-      ScratchReg = X86::RSP;
+      ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
     else
-      BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP)
+      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
         .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
 
-    BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg)
+    BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
       .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
   } else {
     if (STI.isTargetLinux()) {
@@ -1426,11 +1481,11 @@
       bool SaveScratch2;
       if (CompareStackPointer) {
         // The primary scratch register is available for holding the TLS offset.
-        ScratchReg2 = GetScratchRegister(Is64Bit, MF, true);
+        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
         SaveScratch2 = false;
       } else {
         // Need to use a second register to hold the TLS offset
-        ScratchReg2 = GetScratchRegister(Is64Bit, MF, false);
+        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
 
         // Unfortunately, with fastcc the second scratch register may hold an
         // argument.
@@ -1468,15 +1523,21 @@
     // Functions with nested arguments use R10, so it needs to be saved across
     // the call to _morestack
 
-    if (IsNested)
-      BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10);
+    const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
+    const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
+    const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
+    const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
+    const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
 
-    BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10)
+    if (IsNested)
+      BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
+
+    BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
       .addImm(StackSize);
-    BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11)
+    BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
       .addImm(X86FI->getArgumentStackSize());
-    MF.getRegInfo().setPhysRegUsed(X86::R10);
-    MF.getRegInfo().setPhysRegUsed(X86::R11);
+    MF.getRegInfo().setPhysRegUsed(Reg10);
+    MF.getRegInfo().setPhysRegUsed(Reg11);
   } else {
     BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
       .addImm(X86FI->getArgumentStackSize());
@@ -1523,13 +1584,14 @@
 ///       temp0 = sp - MaxStack
 ///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
 void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const unsigned SlotSize =
-      static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo())
+      static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo())
           ->getSlotSize();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   const bool Is64Bit = STI.is64Bit();
+  const bool IsLP64 = STI.isTarget64BitLP64();
   DebugLoc DL;
   // HiPE-specific values
   const unsigned HipeLeafWords = 24;
@@ -1623,7 +1685,7 @@
       SPLimitOffset = 0x4c;
     }
 
-    ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+    ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
     assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
            "HiPE prologue scratch register is live-in");
 
@@ -1657,9 +1719,9 @@
 void X86FrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  const X86RegisterInfo &RegInfo =
-      *static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo());
+  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
+                                       MF.getSubtarget().getRegisterInfo());
   unsigned StackPtr = RegInfo.getStackRegister();
   bool reseveCallFrame = hasReservedCallFrame(MF);
   int Opcode = I->getOpcode();
@@ -1682,8 +1744,10 @@
     // We need to keep the stack aligned properly.  To do this, we round the
     // amount of space needed for the outgoing arguments up to the next
     // alignment boundary.
-    unsigned StackAlign =
-        MF.getTarget().getFrameLowering()->getStackAlignment();
+    unsigned StackAlign = MF.getTarget()
+                              .getSubtargetImpl()
+                              ->getFrameLowering()
+                              ->getStackAlignment();
     Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
 
     MachineInstr *New = nullptr;

diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 5ad3d4d..7740c3a 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86_FRAMELOWERING_H
-#define X86_FRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
+#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
 
 #include "llvm/Target/TargetFrameLowering.h"
 
@@ -20,12 +20,17 @@
 
 class MCSymbol;
 class X86TargetMachine;
+class X86Subtarget;
 
 class X86FrameLowering : public TargetFrameLowering {
 public:
   explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO)
     : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {}
 
+  static void getStackProbeFunction(const X86Subtarget &STI,
+                                    unsigned &CallOp,
+                                    const char *&Symbol);
+
   void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  DebugLoc DL) const;

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index ba2f5f6..3ef7b2c 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp

@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
@@ -33,6 +34,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
+#include <stdint.h>
 using namespace llvm;
 
 #define DEBUG_TYPE "x86-isel"
@@ -192,7 +194,6 @@
   private:
     SDNode *Select(SDNode *N) override;
     SDNode *SelectGather(SDNode *N, unsigned Opc);
-    SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
     SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT);
 
     bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
@@ -237,10 +238,10 @@
     inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
                                    SDValue &Scale, SDValue &Index,
                                    SDValue &Disp, SDValue &Segment) {
-      Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
-        CurDAG->getTargetFrameIndex(AM.Base_FrameIndex,
-                                    getTargetLowering()->getPointerTy()) :
-        AM.Base_Reg;
+      Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+                 ? CurDAG->getTargetFrameIndex(AM.Base_FrameIndex,
+                                               TLI->getPointerTy())
+                 : AM.Base_Reg;
       Scale = getI8Imm(AM.Scale);
       Index = AM.IndexReg;
       // These are 32-bit even in 64-bit mode since RIP relative offset
@@ -297,7 +298,14 @@
     /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
     /// to the target-specific type.
     const X86InstrInfo *getInstrInfo() const {
-      return getTargetMachine().getInstrInfo();
+      return getTargetMachine().getSubtargetImpl()->getInstrInfo();
+    }
+
+    /// \brief Address-mode matching performs shift-of-and to and-of-shift
+    /// reassociation in order to expose more scaled addressing
+    /// opportunities.
+    bool ComplexPatternFuncMutatesDAG() const override {
+      return true;
     }
   };
 }
@@ -510,7 +518,7 @@
     // If the source and destination are SSE registers, then this is a legal
     // conversion that should not be lowered.
     const X86TargetLowering *X86Lowering =
-        static_cast<const X86TargetLowering *>(getTargetLowering());
+        static_cast<const X86TargetLowering *>(TLI);
     bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
     bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
     if (SrcIsSSE && DstIsSSE)
@@ -544,7 +552,7 @@
                                           false, false, 0);
     SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                         MachinePointerInfo(),
-                                        MemVT, false, false, 0);
+                                        MemVT, false, false, false, 0);
 
     // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
     // extload we created.  This will cause general havok on the dag because
@@ -565,7 +573,7 @@
 /// the main function.
 void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
                                              MachineFrameInfo *MFI) {
-  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
   if (Subtarget->isTargetCygMing()) {
     unsigned CallOp =
       Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
@@ -775,9 +783,10 @@
   }
 }
 
-// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This
-// allows us to convert the shift and and into an h-register extract and
-// a scaled index. Returns false if the simplification is performed.
+// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
+// safe. This allows us to convert the shift and and into an h-register
+// extract and a scaled index. Returns false if the simplification is
+// performed.
 static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                       uint64_t Mask,
                                       SDValue Shift, SDValue X,
@@ -1429,7 +1438,7 @@
   RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
   if (RN && RN->getReg() == 0)
     Base = CurDAG->getRegister(0, MVT::i64);
-  else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(N)) {
+  else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) {
     // Base could already be %rip, particularly in the x32 ABI.
     Base = SDValue(CurDAG->getMachineNode(
                        TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
@@ -1563,26 +1572,7 @@
 ///
 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
   unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
-  return CurDAG->getRegister(GlobalBaseReg,
-                             getTargetLowering()->getPointerTy()).getNode();
-}
-
-SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
-  SDValue Chain = Node->getOperand(0);
-  SDValue In1 = Node->getOperand(1);
-  SDValue In2L = Node->getOperand(2);
-  SDValue In2H = Node->getOperand(3);
-
-  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
-    return nullptr;
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
-  const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
-  SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node),
-                                           MVT::i32, MVT::i32, MVT::Other, Ops);
-  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
-  return ResNode;
+  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode();
 }
 
 /// Atomic opcode table
@@ -1716,16 +1706,23 @@
 static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
                                                 SDLoc dl,
                                                 enum AtomicOpc &Op, MVT NVT,
-                                                SDValue Val) {
+                                                SDValue Val,
+                                                const X86Subtarget *Subtarget) {
   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
     int64_t CNVal = CN->getSExtValue();
     // Quit if not 32-bit imm.
     if ((int32_t)CNVal != CNVal)
       return Val;
+    // Quit if INT32_MIN: being negative, it would be negated, and the result
+    // would overflow the 32 bits available for an immediate operand to sub.
+    // However, it still fits in 32 bits for the add (since it is not negated)
+    // so we can return target-constant.
+    if (CNVal == INT32_MIN)
+      return CurDAG->getTargetConstant(CNVal, NVT);
     // For atomic-load-add, we could do some optimizations.
     if (Op == ADD) {
       // Translate to INC/DEC if ADD by 1 or -1.
-      if ((CNVal == 1) || (CNVal == -1)) {
+      if (((CNVal == 1) || (CNVal == -1)) && !Subtarget->slowIncDec()) {
         Op = (CNVal == 1) ? INC : DEC;
         // No more constant operand after being translated into INC/DEC.
         return SDValue();
@@ -1774,8 +1771,8 @@
   SDValue Chain = Node->getOperand(0);
   SDValue Ptr = Node->getOperand(1);
   SDValue Val = Node->getOperand(2);
-  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+  SDValue Base, Scale, Index, Disp, Segment;
+  if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment))
     return nullptr;
 
   // Which index into the table.
@@ -1797,7 +1794,7 @@
       break;
   }
 
-  Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val);
+  Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val, Subtarget);
   bool isUnOp = !Val.getNode();
   bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
 
@@ -1829,31 +1826,40 @@
         Opc = AtomicOpcTbl[Op][I32];
       break;
     case MVT::i64:
-      Opc = AtomicOpcTbl[Op][I64];
       if (isCN) {
         if (immSext8(Val.getNode()))
           Opc = AtomicOpcTbl[Op][SextConstantI64];
         else if (i64immSExt32(Val.getNode()))
           Opc = AtomicOpcTbl[Op][ConstantI64];
-      }
+        else
+          llvm_unreachable("True 64 bits constant in SelectAtomicLoadArith");
+      } else
+        Opc = AtomicOpcTbl[Op][I64];
       break;
   }
 
   assert(Opc != 0 && "Invalid arith lock transform!");
 
+  // Building the new node.
   SDValue Ret;
-  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                 dl, NVT), 0);
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
   if (isUnOp) {
-    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
+    SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Chain };
     Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
   } else {
-    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+    SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Val, Chain };
     Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0);
   }
+
+  // Copying the MachineMemOperand.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
   cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+
+  // We need to have two outputs as that is what the original instruction had.
+  // So we add a dummy, undefined output. This is safe as we checked first
+  // that no-one uses our output anyway.
+  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                 dl, NVT), 0);
   SDValue RetVals[] = { Undef, Ret };
   return CurDAG->getMergeValues(RetVals, dl).getNode();
 }
@@ -2125,6 +2131,16 @@
   case X86ISD::GlobalBaseReg:
     return getGlobalBaseReg();
 
+  case X86ISD::SHRUNKBLEND: {
+    // SHRUNKBLEND selects like a regular VSELECT.
+    SDValue VSelect = CurDAG->getNode(
+        ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+        Node->getOperand(1), Node->getOperand(2));
+    ReplaceUses(SDValue(Node, 0), VSelect);
+    SelectCode(VSelect.getNode());
+    // We already called ReplaceUses.
+    return nullptr;
+  }
 
   case ISD::ATOMIC_LOAD_XOR:
   case ISD::ATOMIC_LOAD_AND:
@@ -2212,6 +2228,25 @@
     return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
                                 getI8Imm(ShlVal));
   }
+  case X86ISD::UMUL8:
+  case X86ISD::SMUL8: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
+
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
+                                          N0, SDValue()).getValue(1);
+
+    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32);
+    SDValue Ops[] = {N1, InFlag};
+    SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+
+    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+    ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
+    return nullptr;
+  }
+
   case X86ISD::UMUL: {
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
@@ -2387,11 +2422,14 @@
   }
 
   case ISD::SDIVREM:
-  case ISD::UDIVREM: {
+  case ISD::UDIVREM:
+  case X86ISD::SDIVREM8_SEXT_HREG:
+  case X86ISD::UDIVREM8_ZEXT_HREG: {
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
 
-    bool isSigned = Opcode == ISD::SDIVREM;
+    bool isSigned = (Opcode == ISD::SDIVREM ||
+                     Opcode == X86ISD::SDIVREM8_SEXT_HREG);
     if (!isSigned) {
       switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
@@ -2507,33 +2545,43 @@
         SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
     }
 
-    // Prevent use of AH in a REX instruction by referencing AX instead.
-    // Shift it down 8 bits.
+    // Prevent use of AH in a REX instruction by explicitly copying it to
+    // an ABCD_L register.
     //
     // The current assumption of the register allocator is that isel
-    // won't generate explicit references to the GPR8_NOREX registers. If
+    // won't generate explicit references to the GR8_ABCD_H registers. If
     // the allocator and/or the backend get enhanced to be more robust in
     // that regard, this can be, and should be, removed.
-    if (HiReg == X86::AH && Subtarget->is64Bit() &&
-        !SDValue(Node, 1).use_empty()) {
-      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
-                                              X86::AX, MVT::i16, InFlag);
-      InFlag = Result.getValue(2);
+    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
+      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
+      unsigned AHExtOpcode =
+          isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
 
-      // If we also need AL (the quotient), get it by extracting a subreg from
-      // Result. The fast register allocator does not like multiple CopyFromReg
-      // nodes using aliasing registers.
-      if (!SDValue(Node, 0).use_empty())
-        ReplaceUses(SDValue(Node, 0),
-          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
+                                             MVT::Glue, AHCopy, InFlag);
+      SDValue Result(RNode, 0);
+      InFlag = SDValue(RNode, 1);
 
-      // Shift AX right by 8 bits instead of using AH.
-      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
-                                         Result,
-                                         CurDAG->getTargetConstant(8, MVT::i8)),
-                       0);
-      ReplaceUses(SDValue(Node, 1),
-        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+      if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
+          Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
+        if (Node->getValueType(1) == MVT::i64) {
+          // It's not possible to directly movsx AH to a 64bit register, because
+          // the latter needs the REX prefix, but the former can't have it.
+          assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG &&
+                 "Unexpected i64 sext of h-register");
+          Result =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+                          CurDAG->getTargetConstant(0, MVT::i64), Result,
+                          CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+                      0);
+        }
+      } else {
+        Result =
+            CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
+      }
+      ReplaceUses(SDValue(Node, 1), Result);
+      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
     }
     // Copy the division (low) result, if it is needed.
     if (!SDValue(Node, 0).use_empty()) {
@@ -2563,12 +2611,30 @@
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
 
+    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+        HasNoSignedComparisonUses(Node)) {
+      // Look for (X86cmp (truncate $op, i1), 0) and try to convert to a
+      // smaller encoding
+      if (Opcode == X86ISD::CMP && N0.getValueType() == MVT::i1 &&
+          X86::isZeroNode(N1)) {
+        SDValue Reg = N0.getOperand(0);
+        SDValue Imm = CurDAG->getTargetConstant(1, MVT::i8);
+
+        // Narrow wider operands to their low 8 bits first.
+        if (Reg.getScalarValueSizeInBits() > 8)
+          Reg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Reg);
+        // Emit a testb.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+                                                Reg, Imm);
+        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+        return nullptr;
+      }
+
+      N0 = N0.getOperand(0);
+    }
     // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
     // use a smaller encoding.
-    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
-        HasNoSignedComparisonUses(Node))
-      // Look past the truncate if CMP is the only use of it.
-      N0 = N0.getOperand(0);
+    // (A single-use truncate, if any, was already looked through above.)
     if ((N0.getNode()->getOpcode() == ISD::AND ||
          (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
         N0.getNode()->hasOneUse() &&

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5ccff20..f05b6c6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp

@@ -19,6 +19,7 @@
 #include "X86MachineFunctionInfo.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
@@ -49,6 +50,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetOptions.h"
+#include "X86IntrinsicsInfo.h"
 #include <bitset>
 #include <numeric>
 #include <cctype>
@@ -65,10 +67,16 @@
     cl::Hidden);
 
 static cl::opt<bool> ExperimentalVectorShuffleLowering(
-    "x86-experimental-vector-shuffle-lowering", cl::init(false),
+    "x86-experimental-vector-shuffle-lowering", cl::init(true),
     cl::desc("Enable an experimental vector shuffle lowering code path."),
     cl::Hidden);
 
+static cl::opt<int> ReciprocalEstimateRefinementSteps(
+    "x86-recip-refinement-steps", cl::init(1),
+    cl::desc("Specify the number of Newton-Raphson iterations applied to the "
+             "result of the hardware reciprocal estimate instruction."),
+    cl::NotHidden);
+
 // Forward declarations.
 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
@@ -191,28 +199,10 @@
   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 }
 
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
-  if (TT.isOSBinFormatMachO()) {
-    if (TT.getArch() == Triple::x86_64)
-      return new X86_64MachoTargetObjectFile();
-    return new TargetLoweringObjectFileMachO();
-  }
-
-  if (TT.isOSLinux())
-    return new X86LinuxTargetObjectFile();
-  if (TT.isOSBinFormatELF())
-    return new TargetLoweringObjectFileELF();
-  if (TT.isKnownWindowsMSVCEnvironment())
-    return new X86WindowsTargetObjectFile();
-  if (TT.isOSBinFormatCOFF())
-    return new TargetLoweringObjectFileCOFF();
-  llvm_unreachable("unknown subtarget type");
-}
-
 // FIXME: This should stop caching the target machine as soon as
 // we can remove resetOperationActions et al.
-X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
-  : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
+    : TargetLowering(TM) {
   Subtarget = &TM.getSubtarget<X86Subtarget>();
   X86ScalarSSEf64 = Subtarget->hasSSE2();
   X86ScalarSSEf32 = Subtarget->hasSSE1();
@@ -255,7 +245,7 @@
   else
     setSchedulingPreference(Sched::RegPressure);
   const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+      TM.getSubtarget<X86Subtarget>().getRegisterInfo();
   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 
   // Bypass expensive divides on Atom when compiling with O2
@@ -316,6 +306,8 @@
   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
   // SETOEQ and SETUNE require checking two conditions.
   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
@@ -519,10 +511,21 @@
   // If we don't have F16C support, then lower half float conversions
   // into library calls.
   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
-    setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
-    setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
+    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
   }
 
+  // There's never any support for operations beyond MVT::f32.
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+
   if (Subtarget->hasPOPCNT()) {
     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
   } else {
@@ -648,8 +651,7 @@
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
-                     MVT::i64 : MVT::i32, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 
   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
     // f32 and f64 use SSE.
@@ -797,6 +799,8 @@
   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
   setOperationAction(ISD::FEXP, MVT::f80, Expand);
   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
+  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 
   // First set operation action for all vector types to either promote
   // (for widening) or expand (for scalarization). Then we will selectively
@@ -878,7 +882,12 @@
                           (MVT::SimpleValueType)InnerVT, Expand);
     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
-    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+
+    // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
+    // we have to deal with them whether we ask for Expansion or not. Setting
+    // Expand causes its own optimisation problems though, so leave them legal.
+    if (VT.getVectorElementType() == MVT::i1)
+      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
   }
 
   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
@@ -935,12 +944,13 @@
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
   }
 
   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 
-    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
+    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
     // registers cannot be used even for integer operations.
     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
@@ -995,6 +1005,20 @@
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
+    // We support custom legalizing of sext and anyext loads for specific
+    // memory vector types which we can load as a scalar (or sequence of
+    // scalars) and extend in-register to a legal 128-bit vector type. For sext
+    // loads these must work with a single scalar load.
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
+
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
@@ -1027,8 +1051,6 @@
       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
     }
 
-    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
     // Custom lower v2i64 and v2f64 selects.
     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
@@ -1090,7 +1112,13 @@
     // some vselects for now.
     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
 
-    // i8 and i16 vectors are custom , because the source register and source
+    // SSE41 brings specific instructions for doing vector sign extend even in
+    // cases where we don't have SRA.
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
+
+    // i8 and i16 vectors are custom because the source register and
     // source memory operand types are not the same width.  f32 vectors are
     // custom since the immediate controlling the insert encodes additional
     // information.
@@ -1104,7 +1132,7 @@
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 
-    // FIXME: these should be Legal but thats only for the case where
+    // FIXME: these should be Legal, but that's only for the case where
     // the index is constant.  For now custom expand to deal with that.
     if (Subtarget->is64Bit()) {
       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
@@ -1254,6 +1282,10 @@
 
       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
+
+      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
+      // when we have a 256bit-wide blend with immediate.
+      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
     } else {
       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
@@ -1378,6 +1410,10 @@
     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
@@ -1489,6 +1525,43 @@
     }
   }// has  AVX-512
 
+  if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
+    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+    addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
+
+    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
+    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
+
+    setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
+    setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
+
+    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
+      const MVT VT = (MVT::SimpleValueType)i;
+
+      const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+      // Only 512-bit vector types are handled here; skip all others.
+      if (!VT.is512BitVector())
+        continue;
+
+      if ( EltSize < 32) {
+        setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
+        setOperationAction(ISD::VSELECT,             VT, Legal);
+      }
+    }
+  }
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
+    addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
+    addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
+
+    setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
+  }
+
   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
   // of this type with custom code.
   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1521,9 +1594,6 @@
     setOperationAction(ISD::UMULO, VT, Custom);
   }
 
-  // There are no 8-bit 3-address imul/mul instructions
-  setOperationAction(ISD::SMULO, MVT::i8, Expand);
-  setOperationAction(ISD::UMULO, MVT::i8, Expand);
 
   if (!Subtarget->is64Bit()) {
     // These libcalls are not available in 32-bit.
@@ -1600,6 +1670,14 @@
   PredictableSelectIsExpensive = !Subtarget->isAtom();
 
   setPrefFunctionAlignment(4); // 2^4 bytes.
+
+  verifyIntrinsicTables();
+}
+
+// This has so far only been implemented for 64-bit MachO.
+bool X86TargetLowering::useLoadStackGuardNode() const {
+  return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
+         Subtarget->is64Bit();
 }
 
 TargetLoweringBase::LegalizeTypeAction
@@ -1616,10 +1694,40 @@
   if (!VT.isVector())
     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
 
-  if (Subtarget->hasAVX512())
-    switch(VT.getVectorNumElements()) {
-    case  8: return MVT::v8i1;
-    case 16: return MVT::v16i1;
+  const unsigned NumElts = VT.getVectorNumElements();
+  const EVT EltVT = VT.getVectorElementType();
+  if (VT.is512BitVector()) {
+    if (Subtarget->hasAVX512())
+      if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+          EltVT == MVT::f32 || EltVT == MVT::f64)
+        switch(NumElts) {
+        case  8: return MVT::v8i1;
+        case 16: return MVT::v16i1;
+      }
+    if (Subtarget->hasBWI())
+      if (EltVT == MVT::i8 || EltVT == MVT::i16)
+        switch(NumElts) {
+        case 32: return MVT::v32i1;
+        case 64: return MVT::v64i1;
+      }
+  }
+
+  if (VT.is256BitVector() || VT.is128BitVector()) {
+    if (Subtarget->hasVLX())
+      if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
+          EltVT == MVT::f32 || EltVT == MVT::f64)
+        switch(NumElts) {
+        case 2: return MVT::v2i1;
+        case 4: return MVT::v4i1;
+        case 8: return MVT::v8i1;
+      }
+    if (Subtarget->hasBWI() && Subtarget->hasVLX())
+      if (EltVT == MVT::i8 || EltVT == MVT::i16)
+        switch(NumElts) {
+        case  8: return MVT::v8i1;
+        case 16: return MVT::v16i1;
+        case 32: return MVT::v32i1;
+      }
   }
 
   return VT.changeVectorElementTypeToInteger();
@@ -1726,9 +1834,10 @@
 }
 
 bool
-X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                 unsigned,
-                                                 bool *Fast) const {
+X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                  unsigned,
+                                                  unsigned,
+                                                  bool *Fast) const {
   if (Fast)
     *Fast = Subtarget->isUnalignedMemAccessFast();
   return true;
@@ -1794,9 +1903,7 @@
   default:
     return TargetLowering::findRepresentativeClass(VT);
   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
-    RRC = Subtarget->is64Bit() ?
-      (const TargetRegisterClass*)&X86::GR64RegClass :
-      (const TargetRegisterClass*)&X86::GR32RegClass;
+    RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
     break;
   case MVT::x86mmx:
     RRC = &X86::VR64RegClass;
@@ -1851,8 +1958,7 @@
                         const SmallVectorImpl<ISD::OutputArg> &Outs,
                         LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
-                 RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
 
@@ -1871,8 +1977,7 @@
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 
   SDValue Flag;
@@ -1918,8 +2023,8 @@
 
     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
     // the RET instruction and handled by the FP Stackifier.
-    if (VA.getLocReg() == X86::ST0 ||
-        VA.getLocReg() == X86::ST1) {
+    if (VA.getLocReg() == X86::FP0 ||
+        VA.getLocReg() == X86::FP1) {
       // If this is a copy from an xmm register to ST(0), use an FPExtend to
       // change the value to the FP stack register class.
       if (isScalarFPTypeInSSEReg(VA.getValVT()))
@@ -2005,6 +2110,13 @@
        UI != UE; ++UI) {
     if (UI->getOpcode() != X86ISD::RET_FLAG)
       return false;
+    // If we are returning more than one value, we can definitely
+    // not make a tail call (see PR19530).
+    if (UI->getNumOperands() > 4)
+      return false;
+    if (UI->getNumOperands() == 4 &&
+        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
+      return false;
     HasRet = true;
   }
 
@@ -2015,8 +2127,8 @@
   return true;
 }
 
-MVT
-X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
+EVT
+X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
                                             ISD::NodeType ExtendKind) const {
   MVT ReturnMVT;
   // TODO: Is this also valid on 32-bit?
@@ -2025,7 +2137,7 @@
   else
     ReturnMVT = MVT::i32;
 
-  MVT MinVT = getRegisterType(ReturnMVT);
+  EVT MinVT = getRegisterType(Context, ReturnMVT);
   return VT.bitsLT(MinVT) ? MinVT : VT;
 }
 
@@ -2042,8 +2154,8 @@
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   bool Is64Bit = Subtarget->is64Bit();
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 DAG.getTarget(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
 
   // Copy all of the result registers out of their specified physreg.
@@ -2057,33 +2169,21 @@
       report_fatal_error("SSE register return with SSE disabled");
     }
 
-    SDValue Val;
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT()))
+      CopyVT = MVT::f80;
 
-    // If this is a call to a function that returns an fp value on the floating
-    // point stack, we must guarantee the value is popped from the stack, so
-    // a CopyFromReg is not good enough - the copy instruction may be eliminated
-    // if the return value is not used. We use the FpPOP_RETVAL instruction
-    // instead.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
-      // If we prefer to use the value in xmm registers, copy it out as f80 and
-      // use a truncate to move it from fp stack reg to xmm reg.
-      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
-      SDValue Ops[] = { Chain, InFlag };
-      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
-                                         MVT::Other, MVT::Glue, Ops), 1);
-      Val = Chain.getValue(0);
+    Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                               CopyVT, InFlag).getValue(1);
+    SDValue Val = Chain.getValue(0);
 
-      // Round the f80 to the right size, which also moves it to the appropriate
-      // xmm register.
-      if (CopyVT != VA.getValVT())
-        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
-                          // This truncation won't change the value.
-                          DAG.getIntPtrConstant(1));
-    } else {
-      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
-                                 CopyVT, InFlag).getValue(1);
-      Val = Chain.getValue(0);
-    }
+    if (CopyVT != VA.getValVT())
+      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                        // This truncation won't change the value.
+                        DAG.getIntPtrConstant(1));
+
     InFlag = Chain.getValue(2);
     InVals.push_back(Val);
   }
@@ -2224,6 +2324,55 @@
   }
 }
 
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
+                                                const X86Subtarget *Subtarget) {
+  assert(Subtarget->is64Bit());
+
+  if (Subtarget->isCallingConvWin64(CallConv)) {
+    static const MCPhysReg GPR64ArgRegsWin64[] = {
+      X86::RCX, X86::RDX, X86::R8,  X86::R9
+    };
+    return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
+  }
+
+  static const MCPhysReg GPR64ArgRegs64Bit[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+  };
+  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
+}
+
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
+                                                CallingConv::ID CallConv,
+                                                const X86Subtarget *Subtarget) {
+  assert(Subtarget->is64Bit());
+  if (Subtarget->isCallingConvWin64(CallConv)) {
+    // The XMM registers which might contain var arg parameters are shadowed
+    // in their paired GPR.  So we only need to save the GPR to their home
+    // slots.
+    // TODO: __vectorcall will change this.
+    return None;
+  }
+
+  const Function *Fn = MF.getFunction();
+  bool NoImplicitFloatOps = Fn->getAttributes().
+      hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
+         "SSE register cannot be used when SSE is disabled!");
+  if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+      !Subtarget->hasSSE1())
+    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
+    // registers.
+    return None;
+
+  static const MCPhysReg XMMArgRegs64Bit[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
+}
+
 SDValue
 X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                         CallingConv::ID CallConv,
@@ -2251,8 +2400,7 @@
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
 
   // Allocate shadow area for Win64
   if (IsWin64)
@@ -2296,6 +2444,10 @@
         RC = &X86::VK8RegClass;
       else if (RegVT == MVT::v16i1)
         RC = &X86::VK16RegClass;
+      else if (RegVT == MVT::v32i1)
+        RC = &X86::VK32RegClass;
+      else if (RegVT == MVT::v64i1)
+        RC = &X86::VK64RegClass;
       else
         llvm_unreachable("Unknown argument type!");
 
@@ -2362,60 +2514,53 @@
     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
 
   // If the function takes variable number of arguments, make a frame index for
-  // the start of the first vararg value... for expansion of llvm.va_start.
-  if (isVarArg) {
-    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
-                    CallConv != CallingConv::X86_ThisCall)) {
-      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
+  // the start of the first vararg value... for expansion of llvm.va_start. We
+  // can skip this if there are no va_start calls.
+  if (MFI->hasVAStart() &&
+      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+                   CallConv != CallingConv::X86_ThisCall))) {
+    FuncInfo->setVarArgsFrameIndex(
+        MFI->CreateFixedObject(1, StackSize, true));
+  }
+
+  // 64-bit calling conventions support varargs and register parameters, so we
+  // have to do extra work to spill them in the prologue or forward them to
+  // musttail calls.
+  if (Is64Bit && isVarArg &&
+      (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
+    // Find the first unallocated argument registers.
+    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
+    unsigned NumIntRegs =
+        CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
+    unsigned NumXMMRegs =
+        CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
+    assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+           "SSE register cannot be used when SSE is disabled!");
+
+    // Gather all the live in physical registers.
+    SmallVector<SDValue, 6> LiveGPRs;
+    SmallVector<SDValue, 8> LiveXMMRegs;
+    SDValue ALVal;
+    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
+      LiveGPRs.push_back(
+          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
     }
-    if (Is64Bit) {
-      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
-
-      // FIXME: We should really autogenerate these arrays
-      static const MCPhysReg GPR64ArgRegsWin64[] = {
-        X86::RCX, X86::RDX, X86::R8,  X86::R9
-      };
-      static const MCPhysReg GPR64ArgRegs64Bit[] = {
-        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
-      };
-      static const MCPhysReg XMMArgRegs64Bit[] = {
-        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
-        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
-      };
-      const MCPhysReg *GPR64ArgRegs;
-      unsigned NumXMMRegs = 0;
-
-      if (IsWin64) {
-        // The XMM registers which might contain var arg parameters are shadowed
-        // in their paired GPR.  So we only need to save the GPR to their home
-        // slots.
-        TotalNumIntRegs = 4;
-        GPR64ArgRegs = GPR64ArgRegsWin64;
-      } else {
-        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
-        GPR64ArgRegs = GPR64ArgRegs64Bit;
-
-        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
-                                                TotalNumXMMRegs);
+    if (!ArgXMMs.empty()) {
+      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
+      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
+        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
+        LiveXMMRegs.push_back(
+            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
       }
-      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
-                                                       TotalNumIntRegs);
+    }
 
-      bool NoImplicitFloatOps = Fn->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
-      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
-             "SSE register cannot be used when SSE is disabled!");
-      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
-               NoImplicitFloatOps) &&
-             "SSE register cannot be used when SSE is disabled!");
-      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
-          !Subtarget->hasSSE1())
-        // Kernel mode asks for SSE to be disabled, so don't push them
-        // on the stack.
-        TotalNumXMMRegs = 0;
-
+    // Store them to the va_list returned by va_start.
+    if (MFI->hasVAStart()) {
       if (IsWin64) {
-        const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
+        const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
         // Get to the caller-allocated home save location.  Add 8 to account
         // for the return address.
         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2429,10 +2574,9 @@
         // registers, then we must store them to their spots on the stack so
         // they may be loaded by deferencing the result of va_next.
         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
-        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
-        FuncInfo->setRegSaveFrameIndex(
-          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
-                               false));
+        FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+        FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
+            ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
       }
 
       // Store the integer parameter registers.
@@ -2440,12 +2584,9 @@
       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                         getPointerTy());
       unsigned Offset = FuncInfo->getVarArgsGPOffset();
-      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+      for (SDValue Val : LiveGPRs) {
         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                   DAG.getIntPtrConstant(Offset));
-        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
-                                     &X86::GR64RegClass);
-        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
         SDValue Store =
           DAG.getStore(Val.getValue(1), dl, Val, FIN,
                        MachinePointerInfo::getFixedStack(
@@ -2455,32 +2596,51 @@
         Offset += 8;
       }
 
-      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+      if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
         // Now store the XMM (fp + vector) parameter registers.
-        SmallVector<SDValue, 11> SaveXMMOps;
+        SmallVector<SDValue, 12> SaveXMMOps;
         SaveXMMOps.push_back(Chain);
-
-        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
-        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
         SaveXMMOps.push_back(ALVal);
-
         SaveXMMOps.push_back(DAG.getIntPtrConstant(
                                FuncInfo->getRegSaveFrameIndex()));
         SaveXMMOps.push_back(DAG.getIntPtrConstant(
                                FuncInfo->getVarArgsFPOffset()));
-
-        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
-          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
-                                       &X86::VR128RegClass);
-          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
-          SaveXMMOps.push_back(Val);
-        }
+        SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+                          LiveXMMRegs.end());
         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                      MVT::Other, SaveXMMOps));
       }
 
       if (!MemOps.empty())
         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
+    } else {
+      // Add all GPRs, al, and XMMs to the list of forwards.  We will add them
+      // to the liveout set on a musttail call.
+      assert(MFI->hasMustTailInVarArgFunc());
+      auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+      typedef X86MachineFunctionInfo::Forward Forward;
+
+      for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
+        unsigned VReg =
+            MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
+        Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
+        Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
+      }
+
+      if (!ArgXMMs.empty()) {
+        unsigned ALVReg =
+            MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
+        Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
+        Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
+
+        for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
+          unsigned VReg =
+              MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
+          Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
+          Forwards.push_back(
+              Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
+        }
+      }
     }
   }
 
@@ -2583,6 +2743,7 @@
   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
   StructReturnType SR = callIsStructReturn(Outs);
   bool IsSibcall      = false;
+  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
 
   if (MF.getTarget().Options.DisableTailCalls)
     isTailCall = false;
@@ -2614,8 +2775,7 @@
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
 
   // Allocate shadow area for Win64
   if (IsWin64)
@@ -2636,7 +2796,6 @@
   int FPDiff = 0;
   if (isTailCall && !IsSibcall && !IsMustTail) {
     // Lower arguments at fp - stackoffset + fpdiff.
-    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
 
     FPDiff = NumBytesCallerPushed - NumBytes;
@@ -2655,8 +2814,12 @@
   // arguments passed in memory when using inalloca.
   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
     NumBytesToPush = 0;
-    assert(ArgLocs.back().getLocMemOffset() == 0 &&
-           "an inalloca argument must be the only memory argument");
+    if (!ArgLocs.back().isMemLoc())
+      report_fatal_error("cannot use inalloca attribute on a register "
+                         "parameter");
+    if (ArgLocs.back().getLocMemOffset() != 0)
+      report_fatal_error("any parameter with the inalloca attribute must be "
+                         "the only memory argument");
   }
 
   if (!IsSibcall)
@@ -2675,8 +2838,8 @@
 
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   // of tail call optimization arguments are handle later.
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     // Skip inalloca arguments, they have already been written.
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -2775,7 +2938,7 @@
     }
   }
 
-  if (Is64Bit && isVarArg && !IsWin64) {
+  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
     // From AMD64 ABI document:
     // For calls that may call functions that use varargs or stdargs
     // (prototype-less calls or calls to functions containing ellipsis (...) in
@@ -2797,6 +2960,14 @@
                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   }
 
+  if (Is64Bit && isVarArg && IsMustTail) {
+    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
+    for (const auto &F : Forwards) {
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
+      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+    }
+  }
+
   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
   // don't need this because the eligibility check rejects calls that require
   // shuffling arguments passed in memory.
@@ -2946,6 +3117,9 @@
 
     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                          OpFlags);
+  } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
+    // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI.
+    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
   }
 
   // Returns a chain & a flag for retval copy to use.
@@ -2972,7 +3146,7 @@
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3043,7 +3217,7 @@
 //  If a tail called function callee has more arguments than the caller the
 //  caller needs to make sure that there is room to move the RETADDR to. This is
 //  achieved by reserving an area the size of the argument delta right after the
-//  original REtADDR, but before the saved framepointer or the spilled registers
+//  original RETADDR, but before the saved framepointer or the spilled registers
 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
 //  stack layout:
 //    arg1
@@ -3063,9 +3237,9 @@
                                                SelectionDAG& DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   const TargetMachine &TM = MF.getTarget();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
-  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
+  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
   unsigned StackAlignment = TFI.getStackAlignment();
   uint64_t AlignMask = StackAlignment - 1;
   int64_t Offset = StackSize;
@@ -3178,8 +3352,8 @@
 
   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   // emit a special epilogue.
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
   if (RegInfo->needsStackRealignment(MF))
     return false;
 
@@ -3207,8 +3381,8 @@
       return false;
 
     SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                   DAG.getTarget(), ArgLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                   *DAG.getContext());
 
     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -3228,12 +3402,12 @@
   }
   if (Unused) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
-                   DAG.getTarget(), RVLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
+                   *DAG.getContext());
     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
       CCValAssign &VA = RVLocs[i];
-      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
         return false;
     }
   }
@@ -3242,13 +3416,13 @@
   // results are returned in the same way as what the caller expects.
   if (!CCMatch) {
     SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
-                    DAG.getTarget(), RVLocs1, *DAG.getContext());
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+                    *DAG.getContext());
     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
 
     SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
-                    DAG.getTarget(), RVLocs2, *DAG.getContext());
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+                    *DAG.getContext());
     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
 
     if (RVLocs1.size() != RVLocs2.size())
@@ -3274,8 +3448,8 @@
     // Check if stack adjustment is needed. For now, do not do this if any
     // argument is passed on the stack.
     SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                   DAG.getTarget(), ArgLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                   *DAG.getContext());
 
     // Allocate shadow area for Win64
     if (IsCalleeWin64)
@@ -3292,7 +3466,7 @@
       MachineFrameInfo *MFI = MF.getFrameInfo();
       const MachineRegisterInfo *MRI = &MF.getRegInfo();
       const X86InstrInfo *TII =
-          static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
+          static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
         CCValAssign &VA = ArgLocs[i];
         SDValue Arg = OutVals[i];
@@ -3362,6 +3536,8 @@
 static bool isTargetShuffle(unsigned Opcode) {
   switch(Opcode) {
   default: return false;
+  case X86ISD::BLENDI:
+  case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
@@ -3379,7 +3555,7 @@
   case X86ISD::MOVSD:
   case X86ISD::UNPCKL:
   case X86ISD::UNPCKH:
-  case X86ISD::VPERMILP:
+  case X86ISD::VPERMILPI:
   case X86ISD::VPERM2X128:
   case X86ISD::VPERMI:
     return true;
@@ -3405,7 +3581,7 @@
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
-  case X86ISD::VPERMILP:
+  case X86ISD::VPERMILPI:
   case X86ISD::VPERMI:
     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
   }
@@ -3417,6 +3593,7 @@
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
   case X86ISD::PALIGNR:
+  case X86ISD::VALIGN:
   case X86ISD::SHUFP:
   case X86ISD::VPERM2X128:
     return DAG.getNode(Opc, dl, VT, V1, V2,
@@ -3443,8 +3620,8 @@
 
 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   int ReturnAddrIndex = FuncInfo->getRAIndex();
 
@@ -3494,23 +3671,18 @@
 /// own arguments. Callee pop is necessary to support tail calls.
 bool X86::isCalleePop(CallingConv::ID CallingConv,
                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
-  if (IsVarArg)
-    return false;
-
   switch (CallingConv) {
   default:
     return false;
   case CallingConv::X86_StdCall:
-    return !is64Bit;
   case CallingConv::X86_FastCall:
-    return !is64Bit;
   case CallingConv::X86_ThisCall:
     return !is64Bit;
   case CallingConv::Fast:
-    return TailCallOpt;
   case CallingConv::GHC:
-    return TailCallOpt;
   case CallingConv::HiPE:
+    if (IsVarArg)
+      return false;
     return TailCallOpt;
   }
 }
@@ -3687,14 +3859,23 @@
 }
 
 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
-/// the second operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
-  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
-    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
-  if (VT == MVT::v2f64 || VT == MVT::v2i64)
-    return (Mask[0] < 2 && Mask[1] < 2);
-  return false;
+/// is suitable for input to PSHUFD. That is, all non-undef elements come from
+/// one operand: the first by default, the second if TestSecondOperand is set.
+static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
+                         bool TestSecondOperand = false) {
+  if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
+      VT != MVT::v2f64 && VT != MVT::v2i64)
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned Lo = TestSecondOperand ? NumElems : 0;
+  unsigned Hi = Lo + NumElems;
+
+  for (unsigned i = 0; i < NumElems; ++i)
+    if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
+      return false;
+
+  return true;
 }
 
 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
@@ -3755,16 +3936,12 @@
   return true;
 }
 
-/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
-/// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
-                          const X86Subtarget *Subtarget) {
-  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
-      (VT.is256BitVector() && !Subtarget->hasInt256()))
-    return false;
-
+/// \brief Return true if the mask specifies a shuffle of elements that is
+/// suitable for input to intralane (palignr) or interlane (valign) vector
+/// right-shift.
+static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
   unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
+  unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   // Do not handle 64-bit element shuffles with palignr.
@@ -3828,6 +4005,29 @@
   return true;
 }
 
+/// \brief Return true if \p Mask is suitable for input to PALIGNR: it must be
+/// an intralane alignr mask and the subtarget must support PALIGNR for \p VT.
+static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
+                          const X86Subtarget *Subtarget) {
+  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
+      (VT.is256BitVector() && !Subtarget->hasInt256()) ||
+      VT.is512BitVector())
+    // FIXME: Add AVX512BW; until then every 512-bit type is rejected.
+    return false;
+
+  return isAlignrMask(Mask, VT, false);
+}
+
+/// \brief Return true if \p Mask is suitable for input to VALIGN: it must be
+/// an interlane alignr mask on a 512-bit \p VT with AVX-512 available.
+static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
+                          const X86Subtarget *Subtarget) {
+  // FIXME: Add AVX512VL; for now only 512-bit vectors are accepted.
+  if (!VT.is512BitVector() || !Subtarget->hasAVX512())
+    return false;
+  return isAlignrMask(Mask, VT, true);
+}
+
 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
 /// the two vector operands have swapped position.
 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
@@ -4070,43 +4270,34 @@
   assert(VT.getSizeInBits() >= 128 &&
          "Unsupported vector type for unpckl");
 
-  // AVX defines UNPCK* to operate independently on 128-bit lanes.
-  unsigned NumLanes;
-  unsigned NumOf256BitLanes;
   unsigned NumElts = VT.getVectorNumElements();
-  if (VT.is256BitVector()) {
-    if (NumElts != 4 && NumElts != 8 &&
-        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
-    NumLanes = 2;
-    NumOf256BitLanes = 1;
-  } else if (VT.is512BitVector()) {
-    assert(VT.getScalarType().getSizeInBits() >= 32 &&
-           "Unsupported vector type for unpckh");
-    NumLanes = 2;
-    NumOf256BitLanes = 2;
-  } else {
-    NumLanes = 1;
-    NumOf256BitLanes = 1;
-  }
 
-  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
-  unsigned NumLaneElts = NumEltsInStride/NumLanes;
+  assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
+         "Unsupported vector type for unpckl");
 
-  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
-    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
-      for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
-        int BitI  = Mask[l256*NumEltsInStride+l+i];
-        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
-        if (!isUndefOrEqual(BitI, j+l256*NumElts))
+  // UNPCK* works independently on each 128-bit lane, so check lane by lane.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+      int BitI  = Mask[l+i];
+      int BitI1 = Mask[l+i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (V2IsSplat) { // Splat V2: odd slots are undef or index NumElts.
+        if (!isUndefOrEqual(BitI1, NumElts))
           return false;
-        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
-          return false;
-        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
+      } else {
+        if (!isUndefOrEqual(BitI1, j + NumElts))
           return false;
       }
     }
   }
+
   return true;
 }
 
@@ -4117,39 +4308,29 @@
   assert(VT.getSizeInBits() >= 128 &&
          "Unsupported vector type for unpckh");
 
-  // AVX defines UNPCK* to operate independently on 128-bit lanes.
-  unsigned NumLanes;
-  unsigned NumOf256BitLanes;
   unsigned NumElts = VT.getVectorNumElements();
-  if (VT.is256BitVector()) {
-    if (NumElts != 4 && NumElts != 8 &&
-        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
-    NumLanes = 2;
-    NumOf256BitLanes = 1;
-  } else if (VT.is512BitVector()) {
-    assert(VT.getScalarType().getSizeInBits() >= 32 &&
-           "Unsupported vector type for unpckh");
-    NumLanes = 2;
-    NumOf256BitLanes = 2;
-  } else {
-    NumLanes = 1;
-    NumOf256BitLanes = 1;
-  }
 
-  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
-  unsigned NumLaneElts = NumEltsInStride/NumLanes;
+  assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
+         "Unsupported vector type for unpckh");
 
-  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
-    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
-      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
-        int BitI  = Mask[l256*NumEltsInStride+l+i];
-        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
-        if (!isUndefOrEqual(BitI, j+l256*NumElts))
+  // UNPCK* works independently on each 128-bit lane, so check lane by lane.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+      int BitI  = Mask[l+i];
+      int BitI1 = Mask[l+i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (V2IsSplat) { // Splat V2: odd slots are undef or index NumElts.
+        if (!isUndefOrEqual(BitI1, NumElts))
           return false;
-        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
-          return false;
-        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
+      } else {
+        if (!isUndefOrEqual(BitI1, j+NumElts))
           return false;
       }
     }
@@ -4652,11 +4833,13 @@
   return Mask;
 }
 
-/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
-static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+/// \brief Return the appropriate immediate to shuffle the specified
+/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
+/// VALIGN (if InterLane is true) instructions.
+static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
+                                           bool InterLane) {
   MVT VT = SVOp->getSimpleValueType(0);
-  unsigned EltSize = VT.is512BitVector() ? 1 :
+  unsigned EltSize = InterLane ? 1 :
     VT.getVectorElementType().getSizeInBits() >> 3;
 
   unsigned NumElts = VT.getVectorNumElements();
@@ -4677,6 +4860,19 @@
   return (Val - i) * EltSize;
 }
 
+/// \brief Return the PALIGNR immediate for the specified VECTOR_SHUFFLE mask;
+/// forwards to getShuffleAlignrImmediate with InterLane == false.
+static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+  return getShuffleAlignrImmediate(SVOp, false);
+}
+
+/// \brief Return the VALIGN immediate for the specified VECTOR_SHUFFLE mask;
+/// forwards to getShuffleAlignrImmediate with InterLane == true.
+static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
+  return getShuffleAlignrImmediate(SVOp, true);
+}
+
+
 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
@@ -4751,28 +4947,6 @@
   return false;
 }
 
-/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
-/// their permute mask.
-static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
-                                    SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-  SmallVector<int, 8> MaskVec;
-
-  for (unsigned i = 0; i != NumElems; ++i) {
-    int Idx = SVOp->getMaskElt(i);
-    if (Idx >= 0) {
-      if (Idx < (int)NumElems)
-        Idx += NumElems;
-      else
-        Idx -= NumElems;
-    }
-    MaskVec.push_back(Idx);
-  }
-  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
-                              SVOp->getOperand(0), &MaskVec[0]);
-}
-
 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
 /// match movhlps. The lower half elements should come from upper half of
 /// V1 (and in order), and the upper half elements should come from the upper
@@ -4897,32 +5071,32 @@
   SDValue Vec;
   if (VT.is128BitVector()) {  // SSE
     if (Subtarget->hasSSE2()) {  // SSE2
-      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      SDValue Cst = DAG.getConstant(0, MVT::i32);
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
     } else { // SSE1
-      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+      SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
     }
   } else if (VT.is256BitVector()) { // AVX
     if (Subtarget->hasInt256()) { // AVX2
-      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      SDValue Cst = DAG.getConstant(0, MVT::i32);
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
     } else {
       // 256-bit logic and arithmetic instructions in AVX are all
       // floating-point, no support for integer ops. Emit fp zeroed vectors.
-      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+      SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
     }
   } else if (VT.is512BitVector()) { // AVX-512
-      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      SDValue Cst = DAG.getConstant(0, MVT::i32);
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
   } else if (VT.getScalarType() == MVT::i1) {
     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
-    SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+    SDValue Cst = DAG.getConstant(0, MVT::i1);
     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   } else
@@ -4939,7 +5113,7 @@
                              SDLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
 
-  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDValue Cst = DAG.getConstant(~0U, MVT::i32);
   SDValue Vec;
   if (VT.is256BitVector()) {
     if (HasInt256) { // AVX2
@@ -5109,37 +5283,49 @@
 }
 
 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
-/// target specific opcode. Returns true if the Mask could be calculated.
-/// Sets IsUnary to true if only uses one source.
+/// target specific opcode. Returns true if the Mask could be calculated. Sets
+/// IsUnary to true if only uses one source. Note that this will set IsUnary for
+/// shuffles which use a single input multiple times, and in those cases it will
+/// adjust the mask to only have indices within that single input.
 static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   unsigned NumElems = VT.getVectorNumElements();
   SDValue ImmN;
 
   IsUnary = false;
+  bool IsFakeUnary = false;
   switch(N->getOpcode()) {
+  case X86ISD::BLENDI:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
   case X86ISD::SHUFP:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::UNPCKH:
     DecodeUNPCKHMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::UNPCKL:
     DecodeUNPCKLMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::MOVHLPS:
     DecodeMOVHLPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::MOVLHPS:
     DecodeMOVLHPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::PALIGNR:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     break;
   case X86ISD::PSHUFD:
-  case X86ISD::VPERMILP:
+  case X86ISD::VPERMILPI:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = true;
@@ -5154,6 +5340,72 @@
     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = true;
     break;
+  case X86ISD::PSHUFB: {
+    IsUnary = true;
+    SDValue MaskNode = N->getOperand(1);
+    while (MaskNode->getOpcode() == ISD::BITCAST)
+      MaskNode = MaskNode->getOperand(0);
+
+    if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+      // If we have a build-vector, then things are easy.
+      EVT VT = MaskNode.getValueType();
+      assert(VT.isVector() &&
+             "Can't produce a non-vector with a build_vector!");
+      if (!VT.isInteger())
+        return false;
+
+      int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
+
+      SmallVector<uint64_t, 32> RawMask;
+      for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
+        SDValue Op = MaskNode->getOperand(i);
+        if (Op->getOpcode() == ISD::UNDEF) {
+          RawMask.push_back((uint64_t)SM_SentinelUndef);
+          continue;
+        }
+        auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
+        if (!CN)
+          return false;
+        APInt MaskElement = CN->getAPIntValue();
+
+        // We now have to decode the element which could be any integer size and
+        // extract each byte of it.
+        for (int j = 0; j < NumBytesPerElement; ++j) {
+          // Note that this is x86 and so always little endian: the low byte is
+          // the first byte of the mask.
+          RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
+          MaskElement = MaskElement.lshr(8);
+        }
+      }
+      DecodePSHUFBMask(RawMask, Mask);
+      break;
+    }
+
+    auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+    if (!MaskLoad)
+      return false;
+
+    SDValue Ptr = MaskLoad->getBasePtr();
+    if (Ptr->getOpcode() == X86ISD::Wrapper)
+      Ptr = Ptr->getOperand(0);
+
+    auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+    if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+      return false;
+
+    if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
+      // FIXME: Support AVX-512 here.
+      Type *Ty = C->getType();
+      if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
+                                Ty->getVectorNumElements() != 32))
+        return false;
+
+      DecodePSHUFBMask(C, Mask);
+      break;
+    }
+
+    return false;
+  }
   case X86ISD::VPERMI:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
@@ -5175,17 +5427,29 @@
     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     if (Mask.empty()) return false;
     break;
+  case X86ISD::MOVSLDUP:
+    DecodeMOVSLDUPMask(VT, Mask);
+    break;
+  case X86ISD::MOVSHDUP:
+    DecodeMOVSHDUPMask(VT, Mask);
+    break;
   case X86ISD::MOVDDUP:
   case X86ISD::MOVLHPD:
   case X86ISD::MOVLPD:
   case X86ISD::MOVLPS:
-  case X86ISD::MOVSHDUP:
-  case X86ISD::MOVSLDUP:
     // Not yet implemented
     return false;
   default: llvm_unreachable("unknown target shuffle node");
   }
 
+  // If we have a fake unary shuffle, the shuffle mask is spread across two
+  // inputs that are actually the same node. Re-map the mask to always point
+  // into the first input.
+  if (IsFakeUnary)
+    for (int &M : Mask)
+      if (M >= (int)Mask.size())
+        M -= Mask.size();
+
   return true;
 }
 
@@ -5476,76 +5740,109 @@
 }
 
 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
-static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
-                                     unsigned NonZeros, unsigned NumNonZero,
-                                     unsigned NumZero, SelectionDAG &DAG,
+static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget,
                                      const TargetLowering &TLI) {
-  // We know there's at least one non-zero element
-  unsigned FirstNonZeroIdx = 0;
-  SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
-  while (FirstNonZero.getOpcode() == ISD::UNDEF ||
-         X86::isZeroNode(FirstNonZero)) {
-    ++FirstNonZeroIdx;
-    FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  // Find all zeroable elements (operands that are undef or zero).
+  bool Zeroable[4];
+  for (int i=0; i < 4; ++i) {
+    SDValue Elt = Op->getOperand(i);
+    Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
   }
+  assert(std::count_if(&Zeroable[0], &Zeroable[4],
+                       [](bool M) { return !M; }) > 1 &&
+         "We expect at least two non-zero elements!");
 
-  if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-      !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
-    return SDValue();
-
-  SDValue V = FirstNonZero.getOperand(0);
-  MVT VVT = V.getSimpleValueType();
-  if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
-    return SDValue();
-
-  unsigned FirstNonZeroDst =
-      cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
-  unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
-  unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
-  unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
-
-  for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
-    SDValue Elem = Op.getOperand(Idx);
-    if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
+  // We only know how to deal with build_vector nodes where elements are either
+  // zeroable or an extract_vector_elt with a constant index.
+  SDValue FirstNonZero;
+  for (int i=0; i < 4; ++i) {
+    if (Zeroable[i])
       continue;
-
-    // TODO: What else can be here? Deal with it.
-    if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    SDValue Elt = Op->getOperand(i);
+    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Elt.getOperand(1)))
       return SDValue();
-
-    // TODO: Some optimizations are still possible here
-    // ex: Getting one element from a vector, and the rest from another.
-    if (Elem.getOperand(0) != V)
+    // Make sure that this node is extracting from a 128-bit vector.
+    MVT VT = Elt.getOperand(0).getSimpleValueType();
+    if (!VT.is128BitVector())
       return SDValue();
-
-    unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
-    if (Dst == Idx)
-      ++CorrectIdx;
-    else if (IncorrectIdx == -1U) {
-      IncorrectIdx = Idx;
-      IncorrectDst = Dst;
-    } else
-      // There was already one element with an incorrect index.
-      // We can't optimize this case to an insertps.
-      return SDValue();
+    if (!FirstNonZero.getNode())
+      FirstNonZero = Elt;
   }
 
-  if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
-    SDLoc dl(Op);
-    EVT VT = Op.getSimpleValueType();
-    unsigned ElementMoveMask = 0;
-    if (IncorrectIdx == -1U)
-      ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
-    else
-      ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
+  SDValue V1 = FirstNonZero.getOperand(0);
+  MVT VT = V1.getSimpleValueType();
 
-    SDValue InsertpsMask =
-        DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
+  // See if this build_vector can be lowered as a blend of V1 with zero.
+  SDValue Elt;
+  unsigned EltMaskIdx, EltIdx;
+  int Mask[4];
+  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
+    if (Zeroable[EltIdx]) {
+      // The zero vector is the second shuffle operand (indices 4-7).
+      Mask[EltIdx] = EltIdx+4;
+      continue;
+    }
+
+    Elt = Op->getOperand(EltIdx);
+    // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
+    EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
+      break;
+    Mask[EltIdx] = EltIdx;
   }
 
-  return SDValue();
+  if (EltIdx == 4) {
+    // Let the shuffle legalizer deal with blend operations.
+    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
+    if (V1.getSimpleValueType() != VT)
+      V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
+    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
+  }
+
+  // See if we can lower this build_vector to an INSERTPS.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  SDValue V2 = Elt.getOperand(0);
+  if (Elt == FirstNonZero)
+    V1 = SDValue();
+
+  bool CanFold = true;
+  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
+    if (Zeroable[i])
+      continue;
+    
+    SDValue Current = Op->getOperand(i);
+    SDValue SrcVector = Current->getOperand(0);
+    if (!V1.getNode())
+      V1 = SrcVector;
+    CanFold = SrcVector == V1 &&
+      cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
+  }
+
+  if (!CanFold)
+    return SDValue();
+
+  assert(V1.getNode() && "Expected at least two non-zero elements!");
+  if (V1.getSimpleValueType() != MVT::v4f32)
+    V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
+  if (V2.getSimpleValueType() != MVT::v4f32)
+    V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
+
+  // Ok, we can emit an INSERTPS instruction; collect its zero mask first.
+  unsigned ZMask = 0;
+  for (int i = 0; i < 4; ++i)
+    if (Zeroable[i])
+      ZMask |= 1 << i;
+
+  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
+  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+  SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
+                               DAG.getIntPtrConstant(InsertPSMask));
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
 }
 
 /// getVShift - Return a vector logical shift node.
@@ -5748,7 +6045,10 @@
 /// or SDValue() otherwise.
 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
                                     SelectionDAG &DAG) {
-  if (!Subtarget->hasFp256())
+  // VBROADCAST requires AVX.
+  // TODO: Splats could be generated for non-AVX CPUs using SSE
+  // instructions, but there's less potential gain for only 128-bit vectors.
+  if (!Subtarget->hasAVX())
     return SDValue();
 
   MVT VT = Op.getSimpleValueType();
@@ -5825,17 +6125,34 @@
     }
   }
 
+  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
   bool IsGE256 = (VT.getSizeInBits() >= 256);
 
-  // Handle the broadcasting a single constant scalar from the constant pool
-  // into a vector. On Sandybridge it is still better to load a constant vector
+  // When optimizing for size, generate up to 5 extra bytes for a broadcast
+  // instruction to save 8 or more bytes of constant pool data.
+  // TODO: If multiple splats are generated to load the same constant,
+  // it may be detrimental to overall size. There needs to be a way to detect
+  // that condition to know if this is truly a size win.
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool OptForSize = F->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  // Handle broadcasting a single constant scalar from the constant pool
+  // into a vector.
+  // On Sandybridge (no AVX2), it is still better to load a constant vector
   // from the constant pool and not to broadcast it from a scalar.
-  if (ConstSplatVal && Subtarget->hasInt256()) {
+  // But override that restriction when optimizing for size.
+  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
+  if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
     EVT CVT = Ld.getValueType();
     assert(!CVT.isVector() && "Must not broadcast a vector type");
-    unsigned ScalarSize = CVT.getSizeInBits();
 
-    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
+    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+    // For size optimization, also splat v2f64 and v2i64, and for size opt
+    // with AVX2, also splat i8 and i16.
+    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
+    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
+        (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
       const Constant *C = nullptr;
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
         C = CI->getConstantIntValue();
@@ -5856,7 +6173,6 @@
   }
 
   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
-  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
 
   // Handle AVX2 in-register broadcasts.
   if (!IsLoad && Subtarget->hasInt256() &&
@@ -6241,11 +6557,6 @@
   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
           VT == MVT::v2f64) && "build_vector with an invalid type found!");
 
-  // Don't try to emit a VSELECT that cannot be lowered into a blend.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
-    return SDValue();
-
   // Odd-numbered elements in the input build vector are obtained from
   // adding two integer/float elements.
   // Even-numbered elements in the input build vector are obtained from
@@ -6257,14 +6568,14 @@
 
   for (unsigned i = 0, e = NumElts; i != e; i++) {
     SDValue Op = BV->getOperand(i);
-      
+
     // Skip 'undef' values.
     unsigned Opcode = Op.getOpcode();
     if (Opcode == ISD::UNDEF) {
       std::swap(ExpectedOpcode, NextExpectedOpcode);
       continue;
     }
-      
+
     // Early exit if we found an unexpected opcode.
     if (Opcode != ExpectedOpcode)
       return SDValue();
@@ -6318,34 +6629,11 @@
     std::swap(ExpectedOpcode, NextExpectedOpcode);
   }
 
-  // Don't try to fold this build_vector into a VSELECT if it has
-  // too many UNDEF operands.
+  // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
-      InVec1.getOpcode() != ISD::UNDEF) {
-    // Emit a sequence of vector add and sub followed by a VSELECT.
-    // The new VSELECT will be lowered into a BLENDI.
-    // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
-    // and emit a single ADDSUB instruction.
-    SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
-    SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);
+      InVec1.getOpcode() != ISD::UNDEF)
+    return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
 
-    // Construct the VSELECT mask.
-    EVT MaskVT = VT.changeVectorElementTypeToInteger();
-    EVT SVT = MaskVT.getVectorElementType();
-    unsigned SVTBits = SVT.getSizeInBits();
-    SmallVector<SDValue, 8> Ops;
-
-    for (unsigned i = 0, e = NumElts; i != e; ++i) {
-      APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
-                            APInt::getAllOnesValue(SVTBits);
-      SDValue Constant = DAG.getConstant(Value, SVT);
-      Ops.push_back(Constant);
-    }
-
-    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
-    return DAG.getSelect(DL, VT, Mask, Sub, Add);
-  }
-  
   return SDValue();
 }
 
@@ -6581,6 +6869,13 @@
         // convert it to a vector with movd (S2V+shuffle to zero extend).
         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+
+        // If using the new shuffle lowering, just directly insert this.
+        if (ExperimentalVectorShuffleLowering)
+          return DAG.getNode(
+              ISD::BITCAST, dl, VT,
+              getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
+
         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
 
         // Now we have our 32-bit value zero extended in the low element of
@@ -6654,6 +6949,10 @@
     if (EVTBits == 32) {
       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
 
+      // If using the new shuffle lowering, just directly insert this.
+      if (ExperimentalVectorShuffleLowering)
+        return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
+
       // Turn it into a shuffle of zero and zero-extended scalar to vector.
       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
       SmallVector<int, 8> MaskVec;
@@ -6731,8 +7030,7 @@
 
   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
   if (EVTBits == 32 && NumElems == 4) {
-    SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
-                                      NumZero, DAG, Subtarget, *this);
+    SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
     if (V.getNode())
       return V;
   }
@@ -6923,6 +7221,89 @@
   return true;
 }
 
+/// \brief Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+///
+/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
+/// and we routinely test for these. Negative (undef) mask entries are ignored.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+      return true;
+  return false;
+}
+
+/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// *not* suitable for use with existing 128-bit shuffles as it will contain
+/// entries from both V1 and V2 inputs to the wider mask.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+                                SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  RepeatedMask.assign(LaneSize, -1); // Drop any stale caller-provided entries.
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    if (RepeatedMask[i % LaneSize] == -1)
+      // This is the first non-undef entry in this slot of a 128-bit lane.
+      RepeatedMask[i % LaneSize] =
+          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
+  return true;
+}
+
+// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
+// 2013 will allow us to use it as a non-type template parameter.
+namespace {
+
+/// \brief Implementation of the \c isShuffleEquivalent variadic functor.
+///
+/// Returns false on a size mismatch; a mask entry of -1 matches any argument.
+bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
+  if (Mask.size() != Args.size())
+    return false;
+  for (int i = 0, e = Mask.size(); i < e; ++i) {
+    assert(*Args[i] >= 0 && "Arguments must be positive integers!");
+    if (Mask[i] != -1 && Mask[i] != *Args[i])
+      return false;
+  }
+  return true;
+}
+
+} // namespace
+
+/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
+/// arguments.
+///
+/// This is a fast way to test a shuffle mask against a fixed pattern:
+///
+///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
+///
+/// It returns true if the mask is exactly as wide as the argument list, and
+/// each element of the mask is either -1 (signifying undef) or the value given
+/// in the argument. Every argument must be non-negative (asserted).
+static const VariadicFunction1<
+    bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
+
 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
 ///
 /// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -6947,6 +7328,764 @@
   return DAG.getConstant(Imm, MVT::i8);
 }
 
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is in fact a blend.
+static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
+                                         SDValue V2, ArrayRef<int> Mask,
+                                         const X86Subtarget *Subtarget,
+                                         SelectionDAG &DAG) {
+
+  unsigned BlendMask = 0;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] >= Size) {
+      if (Mask[i] != i + Size)
+        return SDValue(); // Shuffled V2 input!
+      BlendMask |= 1u << i;
+      continue;
+    }
+    if (Mask[i] >= 0 && Mask[i] != i)
+      return SDValue(); // Shuffled V1 input!
+  }
+  switch (VT.SimpleTy) {
+  case MVT::v2f64:
+  case MVT::v4f32:
+  case MVT::v4f64:
+  case MVT::v8f32:
+    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+                       DAG.getConstant(BlendMask, MVT::i8));
+
+  case MVT::v4i64:
+  case MVT::v8i32:
+    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+    // FALLTHROUGH
+  case MVT::v2i64:
+  case MVT::v4i32:
+    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
+    // that instruction.
+    if (Subtarget->hasAVX2()) {
+      // Scale the blend by the number of 32-bit dwords per element.
+      int Scale =  VT.getScalarSizeInBits() / 32;
+      BlendMask = 0;
+      for (int i = 0, Size = Mask.size(); i < Size; ++i)
+        if (Mask[i] >= Size)
+          for (int j = 0; j < Scale; ++j)
+            BlendMask |= 1u << (i * Scale + j);
+
+      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
+      V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
+      V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
+      return DAG.getNode(ISD::BITCAST, DL, VT,
+                         DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
+                                     DAG.getConstant(BlendMask, MVT::i8)));
+    }
+    // FALLTHROUGH
+  case MVT::v8i16: {
+    // For integer shuffles we need to expand the mask and cast the inputs to
+    // v8i16s prior to blending.
+    int Scale = 8 / VT.getVectorNumElements();
+    BlendMask = 0;
+    for (int i = 0, Size = Mask.size(); i < Size; ++i)
+      if (Mask[i] >= Size)
+        for (int j = 0; j < Scale; ++j)
+          BlendMask |= 1u << (i * Scale + j);
+
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
+                                   DAG.getConstant(BlendMask, MVT::i8)));
+  }
+
+  case MVT::v16i16: {
+    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+    SmallVector<int, 8> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+      BlendMask = 0;
+      for (int i = 0; i < 8; ++i)
+        if (RepeatedMask[i] >= 16)
+          BlendMask |= 1u << i;
+      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+                         DAG.getConstant(BlendMask, MVT::i8));
+    }
+  }
+    // FALLTHROUGH
+  case MVT::v32i8: {
+    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+    // Scale the blend by the number of bytes per element.
+    int Scale =  VT.getScalarSizeInBits() / 8;
+    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+
+    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
+    // mix of LLVM's code generator and the x86 backend. We tell the code
+    // generator that boolean values in the elements of an x86 vector register
+    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
+    // mapping a select to operand #1, and 'false' mapping to operand #2. The
+    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
+    // of the element (the remaining are ignored) and 0 in that high bit would
+    // mean operand #1 while 1 in the high bit would mean operand #2. So while
+    // the LLVM model for boolean values in vector elements gets the relevant
+    // bit set, it is set backwards and over constrained relative to x86's
+    // actual model.
+    SDValue VSELECTMask[32];
+    for (int i = 0, Size = Mask.size(); i < Size; ++i)
+      for (int j = 0; j < Scale; ++j)
+        VSELECTMask[Scale * i + j] =
+            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
+
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+    return DAG.getNode(
+        ISD::BITCAST, DL, VT,
+        DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
+                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
+                    V1, V2));
+  }
+
+  default:
+    llvm_unreachable("Not a supported integer vector type!");
+  }
+}
+
+/// \brief Generic routine to lower a shuffle and blend as a decomposed set of
+/// unblended shuffles followed by an unshuffled blend.
+///
+/// This matches the extremely common pattern for handling combined
+/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
+/// operations.
+static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
+                                                          SDValue V1,
+                                                          SDValue V2,
+                                                          ArrayRef<int> Mask,
+                                                          SelectionDAG &DAG) {
+  // Shuffle the input elements into the desired positions in V1 and V2 and
+  // blend them together.
+  SmallVector<int, 32> V1Mask(Mask.size(), -1);
+  SmallVector<int, 32> V2Mask(Mask.size(), -1);
+  SmallVector<int, 32> BlendMask(Mask.size(), -1);
+  for (int i = 0, Size = Mask.size(); i < Size; ++i)
+    if (Mask[i] >= 0 && Mask[i] < Size) {
+      V1Mask[i] = Mask[i];
+      BlendMask[i] = i;
+    } else if (Mask[i] >= Size) {
+      V2Mask[i] = Mask[i] - Size;
+      BlendMask[i] = i + Size;
+    }
+
+  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
+  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
+  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
+}
+
+/// \brief Try to lower a vector shuffle as a byte rotation.
+///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such a pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
+///
+///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+///
+/// Note that this only handles 128-bit vector widths currently.
+static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
+                                              SDValue V2,
+                                              ArrayRef<int> Mask,
+                                              const X86Subtarget *Subtarget,
+                                              SelectionDAG &DAG) {
+  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+  // We need to detect various ways of spelling a rotation:
+  //   [11, 12, 13, 14, 15,  0,  1,  2]
+  //   [-1, 12, 13, 14, -1, -1,  1, -1]
+  //   [-1, -1, -1, -1, -1, -1,  1,  2]
+  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
+  //   [-1,  4,  5,  6, -1, -1,  9, -1]
+  //   [-1,  4,  5,  6, -1, -1, -1, -1]
+  int Rotation = 0;
+  SDValue Lo, Hi;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] == -1)
+      continue;
+    assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
+
+    // Based on the mod-Size value of this mask element determine where
+    // a rotated vector would have started.
+    int StartIdx = i - (Mask[i] % Size);
+    if (StartIdx == 0)
+      // The identity rotation isn't interesting, stop.
+      return SDValue();
+
+    // If we found the tail of a vector the rotation must be the missing
+    // front. If we found the head of a vector, it must be how much of the head.
+    int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
+
+    if (Rotation == 0)
+      Rotation = CandidateRotation;
+    else if (Rotation != CandidateRotation)
+      // The rotations don't match, so we can't match this mask.
+      return SDValue();
+
+    // Compute which value this mask is pointing at.
+    SDValue MaskV = Mask[i] < Size ? V1 : V2;
+
+    // Compute which of the two target values this index should be assigned to.
+    // This reflects whether the high elements are remaining or the low elements
+    // are remaining.
+    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
+
+    // Either set up this value if we've not encountered it before, or check
+    // that it remains consistent.
+    if (!TargetV)
+      TargetV = MaskV;
+    else if (TargetV != MaskV)
+      // This may be a rotation, but it pulls from the inputs in some
+      // unsupported interleaving.
+      return SDValue();
+  }
+
+  // Check that we successfully analyzed the mask, and normalize the results.
+  assert(Rotation != 0 && "Failed to locate a viable rotation!");
+  assert((Lo || Hi) && "Failed to find a rotated input vector!");
+  if (!Lo)
+    Lo = Hi;
+  else if (!Hi)
+    Hi = Lo;
+
+  assert(VT.getSizeInBits() == 128 &&
+         "Rotate-based lowering only supports 128-bit lowering!");
+  assert(Mask.size() <= 16 &&
+         "Can shuffle at most 16 bytes in a 128-bit vector!");
+
+  // The actual rotate instruction rotates bytes, so we need to scale the
+  // rotation based on how many bytes are in the vector.
+  int Scale = 16 / Mask.size();
+
+  // SSSE3 targets can use the palignr instruction
+  if (Subtarget->hasSSSE3()) {
+    // Cast the inputs to v16i8 to match PALIGNR.
+    Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
+    Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
+
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
+                                   DAG.getConstant(Rotation * Scale, MVT::i8)));
+  }
+
+  // Default SSE2 implementation
+  int LoByteShift = 16 - Rotation * Scale;
+  int HiByteShift = Rotation * Scale;
+
+  // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
+  Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
+  Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
+
+  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
+                                DAG.getConstant(8 * LoByteShift, MVT::i8));
+  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
+                                DAG.getConstant(8 * HiByteShift, MVT::i8));
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
+}
+
+/// \brief Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
+static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
+                                                     SDValue V1, SDValue V2) {
+  SmallBitVector Zeroable(Mask.size(), false);
+
+  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    int M = Mask[i];
+    // Handle the easy cases.
+    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+      Zeroable[i] = true;
+      continue;
+    }
+
+    // If this is an index into a build_vector node, dig out the input value and
+    // use it.
+    SDValue V = M < Size ? V1 : V2;
+    if (V.getOpcode() != ISD::BUILD_VECTOR)
+      continue;
+
+    SDValue Input = V.getOperand(M % Size);
+    // The UNDEF opcode check really should be dead code here, but not quite
+    // worth asserting on (it isn't invalid, just unexpected).
+    if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
+      Zeroable[i] = true;
+  }
+
+  return Zeroable;
+}
+
+/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
+/// byte-shift instructions. The mask must consist of a shifted sequential
+/// shuffle from one of the input vectors and zeroable elements for the
+/// remaining 'shifted in' elements.
+///
+/// Note that this only handles 128-bit vector widths currently.
+static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
+                                             SDValue V2, ArrayRef<int> Mask,
+                                             SelectionDAG &DAG) {
+  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+  int Size = Mask.size();
+  int Scale = 16 / Size;
+
+  auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset,
+                         ArrayRef<int> Mask) {
+    for (int i = StartIndex; i < EndIndex; i++) {
+      if (Mask[i] < 0)
+        continue;
+      if (i + Base != Mask[i] - MaskOffset)
+        return false;
+    }
+    return true;
+  };
+
+  for (int Shift = 1; Shift < Size; Shift++) {
+    int ByteShift = Shift * Scale;
+
+    // PSRLDQ : (little-endian) right byte shift
+    // [ 5,  6,  7, zz, zz, zz, zz, zz]
+    // [ -1, 5,  6,  7, zz, zz, zz, zz]
+    // [  1, 2, -1, -1, -1, -1, zz, zz]
+    bool ZeroableRight = true;
+    for (int i = Size - Shift; i < Size; i++) {
+      ZeroableRight &= Zeroable[i];
+    }
+
+    if (ZeroableRight) {
+      bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask);
+      bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask);
+
+      if (ValidShiftRight1 || ValidShiftRight2) {
+        // Cast the inputs to v2i64 to match PSRLDQ.
+        SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
+        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
+        SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
+                                      DAG.getConstant(ByteShift * 8, MVT::i8));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
+      }
+    }
+
+    // PSLLDQ : (little-endian) left byte shift
+    // [ zz,  0,  1,  2,  3,  4,  5,  6]
+    // [ zz, zz, -1, -1,  2,  3,  4, -1]
+    // [ zz, zz, zz, zz, zz, zz, -1,  1]
+    bool ZeroableLeft = true;
+    for (int i = 0; i < Shift; i++) {
+      ZeroableLeft &= Zeroable[i];
+    }
+
+    if (ZeroableLeft) {
+      bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask);
+      bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask);
+
+      if (ValidShiftLeft1 || ValidShiftLeft2) {
+        // Cast the inputs to v2i64 to match PSLLDQ.
+        SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
+        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
+        SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
+                                      DAG.getConstant(ByteShift * 8, MVT::i8));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+/// \brief Lower a vector shuffle as a zero or any extension.
+///
+/// Given a specific number of elements, element bit width, and extension
+/// stride, produce either a zero or any extension based on the available
+/// features of the subtarget.
+static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+    SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  assert(Scale > 1 && "Need a scale to extend.");
+  int EltBits = VT.getSizeInBits() / NumElements;
+  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+         "Only 8, 16, and 32 bit elements can be extended.");
+  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
+
+  // Found a valid zext mask! Try various lowering strategies based on the
+  // input type and available ISA extensions.
+  if (Subtarget->hasSSE41()) {
+    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
+                                 NumElements / Scale);
+    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
+  }
+
+  // For any extends we can cheat for larger element sizes and use shuffle
+  // instructions that can fold with a load and/or copy.
+  if (AnyExt && EltBits == 32) {
+    int PSHUFDMask[4] = {0, -1, 1, -1};
+    return DAG.getNode(
+        ISD::BITCAST, DL, VT,
+        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                    DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+  }
+  if (AnyExt && EltBits == 16 && Scale > 2) {
+    int PSHUFDMask[4] = {0, -1, 0, -1};
+    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
+                         DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
+                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
+    int PSHUFHWMask[4] = {1, -1, -1, -1};
+    return DAG.getNode(
+        ISD::BITCAST, DL, VT,
+        DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
+                    DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
+                    getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
+  }
+
+  // If this would require more than 2 unpack instructions to expand, use
+  // pshufb when available. We can only use more than 2 unpack instructions
+  // when zero extending i8 elements which also makes it easier to use pshufb.
+  if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
+    assert(NumElements == 16 && "Unexpected byte vector width!");
+    SDValue PSHUFBMask[16];
+    for (int i = 0; i < 16; ++i)
+      PSHUFBMask[i] =
+          DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
+    InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
+                                   DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                               MVT::v16i8, PSHUFBMask)));
+  }
+
+  // Otherwise emit a sequence of unpacks.
+  do {
+    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
+    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
+                         : getZeroVector(InputVT, Subtarget, DAG, DL);
+    InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
+    InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
+    Scale /= 2;
+    EltBits *= 2;
+    NumElements /= 2;
+  } while (Scale > 1);
+  return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
+}
+
+/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
+///
+/// This routine will try to do everything in its power to cleverly lower
+/// a shuffle which happens to match the pattern of a zero extend. It doesn't
+/// check for the profitability of this lowering; it tries to aggressively
+/// match this pattern. It will use all of the micro-architectural details it
+/// can to emit an efficient lowering. It handles both blends with all-zero
+/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
+/// masking out later).
+///
+/// The reason we have dedicated lowering for zext-style shuffles is that they
+/// are both incredibly common and often quite performance sensitive.
+static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
+    SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+  int Bits = VT.getSizeInBits();
+  int NumElements = Mask.size();
+
+  // Define a helper function to check a particular ext-scale and lower to it if
+  // valid.
+  auto Lower = [&](int Scale) -> SDValue {
+    SDValue InputV;
+    bool AnyExt = true;
+    for (int i = 0; i < NumElements; ++i) {
+      if (Mask[i] == -1)
+        continue; // Valid anywhere but doesn't tell us anything.
+      if (i % Scale != 0) {
+        // Each of the extend elements needs to be zeroable.
+        if (!Zeroable[i])
+          return SDValue();
+
+        // We are no longer in the anyext case.
+        AnyExt = false;
+        continue;
+      }
+
+      // Each of the base elements needs to be consecutive indices into the
+      // same input vector.
+      SDValue V = Mask[i] < NumElements ? V1 : V2;
+      if (!InputV)
+        InputV = V;
+      else if (InputV != V)
+        return SDValue(); // Flip-flopping inputs.
+
+      if (Mask[i] % NumElements != i / Scale)
+        return SDValue(); // Non-consecutive strided elements.
+    }
+
+    // If we fail to find an input, we have a zero-shuffle which should always
+    // have already been handled.
+    // FIXME: Maybe handle this here in case during blending we end up with one?
+    if (!InputV)
+      return SDValue();
+
+    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
+        DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
+  };
+
+  // The widest scale possible for extending is to a 64-bit integer.
+  assert(Bits % 64 == 0 &&
+         "The number of bits in a vector must be divisible by 64 on x86!");
+  int NumExtElements = Bits / 64;
+
+  // Each iteration, try extending the elements half as much, but into twice as
+  // many elements.
+  for (; NumExtElements < NumElements; NumExtElements *= 2) {
+    assert(NumElements % NumExtElements == 0 &&
+           "The input vector size must be divisble by the extended size.");
+    if (SDValue V = Lower(NumElements / NumExtElements))
+      return V;
+  }
+
+  // No viable ext lowering found.
+  return SDValue();
+}
+
+/// \brief Try to get a scalar value for a specific element of a vector.
+///
+/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
+static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
+                                              SelectionDAG &DAG) {
+  MVT VT = V.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+  // If the bitcasts shift the element size, we can't extract an equivalent
+  // element from it.
+  MVT NewVT = V.getSimpleValueType();
+  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+    return SDValue();
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
+    return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
+
+  return SDValue();
+}
+
+/// \brief Helper to test for a load that can be folded with x86 shuffles.
+///
+/// This is particularly important because the set of instructions varies
+/// significantly based on whether the operand is a load or not.
+static bool isShuffleFoldableLoad(SDValue V) {
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  return ISD::isNON_EXTLoad(V.getNode());
+}
+
+/// \brief Try to lower insertion of a single element into a zero vector.
+///
+/// This is a common pattern that we have especially efficient patterns to lower
+/// across all subtarget feature sets.
+static SDValue lowerVectorShuffleAsElementInsertion(
+    MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  MVT ExtVT = VT;
+  MVT EltVT = VT.getVectorElementType();
+
+  int V2Index = std::find_if(Mask.begin(), Mask.end(),
+                             [&Mask](int M) { return M >= (int)Mask.size(); }) -
+                Mask.begin();
+  bool IsV1Zeroable = true;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i)
+    if (i != V2Index && !Zeroable[i]) {
+      IsV1Zeroable = false;
+      break;
+    }
+
+  // Check for a single input from a SCALAR_TO_VECTOR node.
+  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
+  // all the smarts here sunk into that routine. However, the current
+  // lowering of BUILD_VECTOR makes that nearly impossible until the old
+  // vector shuffle lowering is dead.
+  if (SDValue V2S = getScalarValueForVectorElement(
+          V2, Mask[V2Index] - Mask.size(), DAG)) {
+    // We need to zext the scalar if it is smaller than an i32.
+    V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
+    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+      // Using zext to expand a narrow element won't work for non-zero
+      // insertions.
+      if (!IsV1Zeroable)
+        return SDValue();
+
+      // Zero-extend directly to i32.
+      ExtVT = MVT::v4i32;
+      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
+    }
+    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
+  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
+             EltVT == MVT::i16) {
+    // Either not inserting from the low element of the input or the input
+    // element size is too small to use VZEXT_MOVL to clear the high bits.
+    return SDValue();
+  }
+
+  if (!IsV1Zeroable) {
+    // If V1 can't be treated as a zero vector we have fewer options to lower
+    // this. We can't support integer vectors or non-zero targets cheaply, and
+    // the V1 elements can't be permuted in any way.
+    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
+    if (!VT.isFloatingPoint() || V2Index != 0)
+      return SDValue();
+    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
+    V1Mask[V2Index] = -1;
+    if (!isNoopShuffleMask(V1Mask))
+      return SDValue();
+    // This is essentially a special case blend operation, but if we have
+    // general purpose blend operations, they are always faster. Bail and let
+    // the rest of the lowering handle these as blends.
+    if (Subtarget->hasSSE41())
+      return SDValue();
+
+    // Otherwise, use MOVSD or MOVSS.
+    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
+           "Only two types of floating point element types to handle!");
+    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
+                       ExtVT, V1, V2);
+  }
+
+  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
+  if (ExtVT != VT)
+    V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+
+  if (V2Index != 0) {
+    // If we have 4 or fewer lanes we can cheaply shuffle the element into
+    // the desired position. Otherwise it is more efficient to do a vector
+    // shift left. We know that we can do a vector shift left because all
+    // the inputs are zero.
+    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
+      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
+      V2Shuffle[V2Index] = 0;
+      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
+    } else {
+      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
+      V2 = DAG.getNode(
+          X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
+          DAG.getConstant(
+              V2Index * EltVT.getSizeInBits(),
+              DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
+      V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
+    }
+  }
+  return V2;
+}
+
+/// \brief Try to lower broadcast of a single element.
+///
+/// For convenience, this code also bundles all of the subtarget feature set
+/// filtering. While a little annoying to re-dispatch on type here, there isn't
+/// a convenient way to factor it out.
+static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
+                                             ArrayRef<int> Mask,
+                                             const X86Subtarget *Subtarget,
+                                             SelectionDAG &DAG) {
+  if (!Subtarget->hasAVX())
+    return SDValue();
+  if (VT.isInteger() && !Subtarget->hasAVX2())
+    return SDValue();
+
+  // Check that the mask is a broadcast.
+  int BroadcastIdx = -1;
+  for (int M : Mask)
+    if (M >= 0 && BroadcastIdx == -1)
+      BroadcastIdx = M;
+    else if (M >= 0 && M != BroadcastIdx)
+      return SDValue();
+
+  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
+                                            "a sorted mask where the broadcast "
+                                            "comes from V1.");
+
+  // Go up the chain of (vector) values to try and find a scalar load that
+  // we can combine with the broadcast.
+  for (;;) {
+    switch (V.getOpcode()) {
+    case ISD::CONCAT_VECTORS: {
+      int OperandSize = Mask.size() / V.getNumOperands();
+      V = V.getOperand(BroadcastIdx / OperandSize);
+      BroadcastIdx %= OperandSize;
+      continue;
+    }
+
+    case ISD::INSERT_SUBVECTOR: {
+      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
+      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
+      if (!ConstantIdx)
+        break;
+
+      int BeginIdx = (int)ConstantIdx->getZExtValue();
+      int EndIdx =
+          BeginIdx + (int)VInner.getValueType().getVectorNumElements();
+      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
+        BroadcastIdx -= BeginIdx;
+        V = VInner;
+      } else {
+        V = VOuter;
+      }
+      continue;
+    }
+    }
+    break;
+  }
+
+  // Check if this is a broadcast of a scalar. We special case lowering
+  // for scalars so that we can more effectively fold with loads.
+  if (V.getOpcode() == ISD::BUILD_VECTOR ||
+      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
+    V = V.getOperand(BroadcastIdx);
+
+    // If the scalar isn't a load we can't broadcast from it in AVX1, only with
+    // AVX2.
+    if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
+      return SDValue();
+  } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
+    // We can't broadcast from a vector register w/o AVX2, and we can only
+    // broadcast from the zero-element of a vector register.
+    return SDValue();
+  }
+
+  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
@@ -6969,12 +8108,56 @@
     // Straight shuffle of a single input vector. Simulate this by using the
     // single input as both of the "inputs" to this instruction..
     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
+
+    if (Subtarget->hasAVX()) {
+      // If we have AVX, we can use VPERMILPD which will allow folding a load
+      // into the shuffle.
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
+                         DAG.getConstant(SHUFPDMask, MVT::i8));
+    }
+
     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
                        DAG.getConstant(SHUFPDMask, MVT::i8));
   }
   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
 
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 2))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
+  if (isShuffleEquivalent(Mask, 1, 3))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+
+  // If we have a single input, insert that into V1 if we can do so cheaply.
+  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
+      return Insertion;
+    // Try inverting the insertion since for v2 masks it is easy to do and we
+    // can't reliably sort the mask one way or the other.
+    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
+      return Insertion;
+  }
+
+  // Try to use one of the special instruction patterns to handle two common
+  // blend patterns if a zero-blend above didn't work.
+  if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
+    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
+      // We can either use a special instruction to load over the low double or
+      // to move just the low double.
+      return DAG.getNode(
+          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
+          DL, MVT::v2f64, V2,
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
+
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
                      DAG.getConstant(SHUFPDMask, MVT::i8));
@@ -6998,6 +8181,11 @@
   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
 
   if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
     // We have to map the mask as it is actually a v4i32 shuffle instruction.
@@ -7011,6 +8199,44 @@
                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
   }
 
+  // If we have a single input from V2 insert that into V1 if we can do so
+  // cheaply.
+  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
+      return Insertion;
+    // Try inverting the insertion since for v2 masks it is easy to do and we
+    // can't reliably sort the mask one way or the other.
+    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
+                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
+    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+            MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
+      return Insertion;
+  }
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 2))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
+  if (isShuffleEquivalent(Mask, 1, 3))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
+
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v2i64, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
+  if (Subtarget->hasSSSE3())
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
+      return Rotate;
+
   // We implement this with SHUFPD which is pretty lame because it will likely
   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
   // However, all the alternatives are still more cycles and newer chips don't
@@ -7021,6 +8247,96 @@
                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
 }
 
+/// \brief Lower a 4-element shuffle of \p V1 and \p V2 using SHUFPS (SHUFP).
+///
+/// \p Mask values 0-3 pick from \p V1, 4-7 from \p V2, and -1 is undef. Masks
+/// with one or two V2 elements are remapped; any other mask is used unchanged.
+/// \returns the final SHUFP node; no claim is made that this is the *best* way.
+static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
+                                            ArrayRef<int> Mask, SDValue V1,
+                                            SDValue V2, SelectionDAG &DAG) {
+  SDValue LowV = V1, HighV = V2;
+  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
+
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 1) {
+    int V2Index =
+        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
+        Mask.begin();
+
+    // Compute the index adjacent to V2Index and in the same half by toggling
+    // the low bit.
+    int V2AdjIndex = V2Index ^ 1;
+
+    if (Mask[V2AdjIndex] == -1) {
+      // Handles all the cases where we have a single V2 element and an undef.
+      // NOTE(review): this is said to only occur in the high lanes because the
+      // shuffle is commuted otherwise; the low-lane case swaps inputs anyway.
+      if (V2Index < 2)
+        std::swap(LowV, HighV);
+      NewMask[V2Index] -= 4;
+    } else {
+      // Handle the case where the V2 element ends up adjacent to a V1 element.
+      // To make this work, blend them together as the first step.
+      int V1Index = V2AdjIndex;
+      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
+      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+      // Now proceed to reconstruct the final blend as we have the necessary
+      // high or low half formed.
+      if (V2Index < 2) {
+        LowV = V2;
+        HighV = V1;
+      } else {
+        HighV = V2;
+      }
+      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
+      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
+    }
+  } else if (NumV2Elements == 2) {
+    if (Mask[0] < 4 && Mask[1] < 4) {
+      // Handle the easy case where we have V1 in the low lanes and V2 in the
+      // high lanes.
+      NewMask[2] -= 4;
+      NewMask[3] -= 4;
+    } else if (Mask[2] < 4 && Mask[3] < 4) {
+      // We also handle the reversed case because this utility may get called
+      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
+      // arrange things in the right direction.
+      NewMask[0] -= 4;
+      NewMask[1] -= 4;
+      HighV = V1;
+      LowV = V2;
+    } else {
+      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
+      // trying to place elements directly, just blend them and set up the final
+      // shuffle to place them.
+
+      // The first two blend mask elements are for V1, the second two are for
+      // V2.
+      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
+                          Mask[2] < 4 ? Mask[2] : Mask[3],
+                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
+                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
+      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+
+      // Now we do a normal shuffle of V1 by giving V1 as both operands to
+      // a blend.
+      LowV = HighV = V1;
+      NewMask[0] = Mask[0] < 4 ? 0 : 2;
+      NewMask[1] = Mask[0] < 4 ? 2 : 0;
+      NewMask[2] = Mask[2] < 4 ? 1 : 3;
+      NewMask[3] = Mask[2] < 4 ? 3 : 1;
+    }
+  }
+  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
+                     getV4X86ShuffleImm8ForMask(NewMask, DAG));
+}
+
 /// \brief Lower 4-lane 32-bit floating point shuffles.
 ///
 /// Uses instructions exclusively from the floating point unit to minimize
@@ -7037,83 +8353,94 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
 
-  SDValue LowV = V1, HighV = V2;
-  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
-
   int NumV2Elements =
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
-  if (NumV2Elements == 0)
-    // Straight shuffle of a single input vector. We pass the input vector to
-    // both operands to simulate this with a SHUFPS.
+  if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
+    if (Subtarget->hasAVX()) {
+      // If we have AVX, we can use VPERMILPS which will allow folding a load
+      // into the shuffle.
+      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
+                         getV4X86ShuffleImm8ForMask(Mask, DAG));
+    }
+
+    // Otherwise, use a straight shuffle of a single input vector. We pass the
+    // input vector to both operands to simulate this with a SHUFPS.
     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
+  }
 
-  if (NumV2Elements == 1) {
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
+  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+
+  // There are special ways we can lower some single-element blends. However, we
+  // have custom ways we can lower more complex single-element blends below that
+  // we defer to if both this and BLENDPS fail to match, so restrict this to
+  // when the V2 input is targeting element 0 of the mask -- that is the fast
+  // case here.
+  if (NumV2Elements == 1 && Mask[0] >= 4)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
+      return V;
+
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
+  // Check for whether we can use INSERTPS to perform the blend. We only use
+  // INSERTPS when the V1 elements are already in the correct locations
+  // because otherwise we can just always use two SHUFPS instructions which
+  // are much smaller to encode than a SHUFPS and an INSERTPS.
+  if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
     int V2Index =
         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
         Mask.begin();
-    // Compute the index adjacent to V2Index and in the same half by toggling
-    // the low bit.
-    int V2AdjIndex = V2Index ^ 1;
 
-    if (Mask[V2AdjIndex] == -1) {
-      // Handles all the cases where we have a single V2 element and an undef.
-      // This will only ever happen in the high lanes because we commute the
-      // vector otherwise.
-      if (V2Index < 2)
-        std::swap(LowV, HighV);
-      NewMask[V2Index] -= 4;
-    } else {
-      // Handle the case where the V2 element ends up adjacent to a V1 element.
-      // To make this work, blend them together as the first step.
-      int V1Index = V2AdjIndex;
-      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
-      V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1,
-                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+    // When using INSERTPS we can zero any lane of the destination. Collect
+    // the zero inputs into a mask and drop them from the lanes of V1 which
+    // actually need to be present as inputs to the INSERTPS.
+    SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
-      // Now proceed to reconstruct the final blend as we have the necessary
-      // high or low half formed.
-      if (V2Index < 2) {
-        LowV = V2;
-        HighV = V1;
-      } else {
-        HighV = V2;
+    // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
+    bool InsertNeedsShuffle = false;
+    unsigned ZMask = 0;
+    for (int i = 0; i < 4; ++i)
+      if (i != V2Index) {
+        if (Zeroable[i]) {
+          ZMask |= 1 << i;
+        } else if (Mask[i] != i) {
+          InsertNeedsShuffle = true;
+          break;
+        }
       }
-      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
-      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
-    }
-  } else if (NumV2Elements == 2) {
-    if (Mask[0] < 4 && Mask[1] < 4) {
-      // Handle the easy case where we have V1 in the low lanes and V2 in the
-      // high lanes. We never see this reversed because we sort the shuffle.
-      NewMask[2] -= 4;
-      NewMask[3] -= 4;
-    } else {
-      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
-      // trying to place elements directly, just blend them and set up the final
-      // shuffle to place them.
 
-      // The first two blend mask elements are for V1, the second two are for
-      // V2.
-      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
-                          Mask[2] < 4 ? Mask[2] : Mask[3],
-                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
-                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
-      V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2,
-                       getV4X86ShuffleImm8ForMask(BlendMask, DAG));
+    // We don't want to use INSERTPS or other insertion techniques if it will
+    // require shuffling anyways.
+    if (!InsertNeedsShuffle) {
+      // If all of V1 is zeroable, replace it with undef.
+      if ((ZMask | 1 << V2Index) == 0xF)
+        V1 = DAG.getUNDEF(MVT::v4f32);
 
-      // Now we do a normal shuffle of V1 by giving V1 as both operands to
-      // a blend.
-      LowV = HighV = V1;
-      NewMask[0] = Mask[0] < 4 ? 0 : 2;
-      NewMask[1] = Mask[0] < 4 ? 2 : 0;
-      NewMask[2] = Mask[2] < 4 ? 1 : 3;
-      NewMask[3] = Mask[2] < 4 ? 3 : 1;
+      unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
+      assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+      // Insert the V2 element into the desired position.
+      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+                         DAG.getConstant(InsertPSMask, MVT::i8));
     }
   }
-  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV,
-                     getV4X86ShuffleImm8ForMask(NewMask, DAG));
+
+  // Otherwise fall back to a SHUFPS lowering strategy.
+  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
 }
 
 /// \brief Lower 4-lane i32 vector shuffles.
@@ -7131,11 +8458,66 @@
   ArrayRef<int> Mask = SVOp->getMask();
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
 
-  if (isSingleInputShuffleMask(Mask))
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
+
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
+
+  if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
+    // We coerce the shuffle pattern to be compatible with UNPCK instructions
+    // but we aren't actually going to use the UNPCK instruction because doing
+    // so prevents folding a load into this instruction or making a copy.
+    const int UnpackLoMask[] = {0, 0, 1, 1};
+    const int UnpackHiMask[] = {2, 2, 3, 3};
+    if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+      Mask = UnpackLoMask;
+    else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+      Mask = UnpackHiMask;
+
     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
+  }
+
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
+      return V;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
+  if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
+
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
+  if (Subtarget->hasSSSE3())
+    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
+      return Rotate;
 
   // We implement this with SHUFPS because it can blend from two vectors.
   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
@@ -7188,6 +8570,27 @@
   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
 
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
+                                                        Mask, Subtarget, DAG))
+    return Broadcast;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
+  if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
+
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V, V, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
+    return Rotate;
+
   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
   // such inputs we can swap two of the dwords across the half mark and end up
   // with <=2 inputs to each half in each half. Once there, we can fall through
@@ -7196,22 +8599,126 @@
   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
   //
-  // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2
-  // and 2-2.
-  auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput,
-                          int ThreeInputHalfSum, int OneInputHalfOffset) {
+  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
+  // and an existing 2-into-2 on the other half. In this case we may have to
+  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
+  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
+  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
+  // because any other situation (including a 3-into-1 or 1-into-3 in the other
+  // half than the one we target for fixing) will be fixed when we re-enter this
+  // path. We will also combine away any sequence of PSHUFD instructions that
+  // result into a single instruction. Here is an example of the tricky case:
+  //
+  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
+  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
+  //
+  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
+  //
+  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
+  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
+  //
+  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
+  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
+  //
+  // The result is fine to be handled by the generic logic.
+  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
+                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
+                          int AOffset, int BOffset) {
+    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
+           "Must call this with A having 3 or 1 inputs from the A half.");
+    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
+           "Must call this with B having 1 or 3 inputs from the B half.");
+    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
+           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
+
     // Compute the index of dword with only one word among the three inputs in
     // a half by taking the sum of the half with three inputs and subtracting
     // the sum of the actual three inputs. The difference is the remaining
     // slot.
-    int DWordA = (ThreeInputHalfSum -
-                  std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) /
-                 2;
-    int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2;
+    int ADWord, BDWord;
+    int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
+    int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
+    int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
+    ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
+    int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
+    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
+    int TripleNonInputIdx =
+        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
+    TripleDWord = TripleNonInputIdx / 2;
+
+    // We use xor with one to compute the adjacent DWord to whichever one the
+    // OneInput is in.
+    OneInputDWord = (OneInput / 2) ^ 1;
+
+    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
+    // and BToA inputs. If there is also such a problem with the BToB and AToB
+    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
+    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
+    // is essential that we don't *create* a 3<-1 as then we might oscillate.
+    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
+      // Compute how many inputs will be flipped by swapping these DWords. We
+      // need to balance this to ensure we don't form a 3-1 shuffle in the
+      // other half, which could make the lowering oscillate between fixing
+      // each half in turn.
+      int NumFlippedAToBInputs =
+          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
+          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
+      int NumFlippedBToBInputs =
+          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
+          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
+      if ((NumFlippedAToBInputs == 1 &&
+           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
+          (NumFlippedBToBInputs == 1 &&
+           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
+        // We choose whether to fix the A half or B half based on whether that
+        // half has zero flipped inputs. At zero, we may not be able to fix it
+        // with that half. We also bias towards fixing the B half because that
+        // will more commonly be the high half, and we have to bias one way.
+        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
+                                                       ArrayRef<int> Inputs) {
+          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
+          bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
+                                         PinnedIdx ^ 1) != Inputs.end();
+          // Determine whether the free index is in the flipped dword or the
+          // unflipped dword based on where the pinned index is. We use this bit
+          // in an xor to conditionally select the adjacent dword.
+          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
+          bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
+                                             FixFreeIdx) != Inputs.end();
+          if (IsFixIdxInput == IsFixFreeIdxInput)
+            FixFreeIdx += 1;
+          IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
+                                        FixFreeIdx) != Inputs.end();
+          assert(IsFixIdxInput != IsFixFreeIdxInput &&
+                 "We need to be changing the number of flipped inputs!");
+          int PSHUFHalfMask[] = {0, 1, 2, 3};
+          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
+          V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
+                          MVT::v8i16, V,
+                          getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
+
+          for (int &M : Mask)
+            if (M != -1 && M == FixIdx)
+              M = FixFreeIdx;
+            else if (M != -1 && M == FixFreeIdx)
+              M = FixIdx;
+        };
+        if (NumFlippedBToBInputs != 0) {
+          int BPinnedIdx =
+              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
+          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
+        } else {
+          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
+          int APinnedIdx =
+              AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
+          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
+        }
+      }
+    }
 
     int PSHUFDMask[] = {0, 1, 2, 3};
-    PSHUFDMask[DWordA] = DWordB;
-    PSHUFDMask[DWordB] = DWordA;
+    PSHUFDMask[ADWord] = BDWord;
+    PSHUFDMask[BDWord] = ADWord;
     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
@@ -7219,24 +8726,20 @@
 
     // Adjust the mask to match the new locations of A and B.
     for (int &M : Mask)
-      if (M != -1 && M/2 == DWordA)
-        M = 2 * DWordB + M % 2;
-      else if (M != -1 && M/2 == DWordB)
-        M = 2 * DWordA + M % 2;
+      if (M != -1 && M/2 == ADWord)
+        M = 2 * BDWord + M % 2;
+      else if (M != -1 && M/2 == BDWord)
+        M = 2 * ADWord + M % 2;
 
     // Recurse back into this routine to re-compute state now that this isn't
     // a 3 and 1 problem.
     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
                                 Mask);
   };
-  if (NumLToL == 3 && NumHToL == 1)
-    return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4);
-  else if (NumLToL == 1 && NumHToL == 3)
-    return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0);
-  else if (NumLToH == 1 && NumHToH == 3)
-    return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0);
-  else if (NumLToH == 3 && NumHToH == 1)
-    return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4);
+  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
+    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
+  else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
 
   // At this point there are at most two inputs to the low and high halves from
   // each half. That means the inputs can always be grouped into dwords and
@@ -7250,9 +8753,10 @@
   // First fix the masks for all the inputs that are staying in their
   // original halves. This will then dictate the targets of the cross-half
   // shuffles.
-  auto fixInPlaceInputs = [&PSHUFDMask](
-      ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask,
-      MutableArrayRef<int> HalfMask, int HalfOffset) {
+  auto fixInPlaceInputs =
+      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
+                    MutableArrayRef<int> SourceHalfMask,
+                    MutableArrayRef<int> HalfMask, int HalfOffset) {
     if (InPlaceInputs.empty())
       return;
     if (InPlaceInputs.size() == 1) {
@@ -7261,6 +8765,14 @@
       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
       return;
     }
+    if (IncomingInputs.empty()) {
+      // Just fix all of the in place inputs.
+      for (int Input : InPlaceInputs) {
+        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+        PSHUFDMask[Input / 2] = Input / 2;
+      }
+      return;
+    }
 
     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
@@ -7272,10 +8784,8 @@
     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
   };
-  if (!HToLInputs.empty())
-    fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0);
-  if (!LToHInputs.empty())
-    fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4);
+  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
+  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
 
   // Now gather the cross-half inputs and place them into a free dword of
   // their target half.
@@ -7284,7 +8794,8 @@
   auto moveInputsToRightHalf = [&PSHUFDMask](
       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
-      int SourceOffset, int DestOffset) {
+      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
+      int DestOffset) {
     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
     };
@@ -7310,7 +8821,7 @@
                 Input - SourceOffset;
             // We have to swap the uses in our half mask in one sweep.
             for (int &M : HalfMask)
-              if (M == SourceHalfMask[Input - SourceOffset])
+              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                 M = Input;
               else if (M == Input)
                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
@@ -7362,18 +8873,68 @@
     } else if (IncomingInputs.size() == 2) {
       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
-        int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2;
-        assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) &&
-               "Not all dwords can be clobbered!");
-        SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset;
-        SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset;
+        // We have two non-adjacent or clobbered inputs we need to extract from
+        // the source half. To do this, we need to map them into some adjacent
+        // dword slot in the source mask.
+        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
+                              IncomingInputs[1] - SourceOffset};
+
+        // If there is a free slot in the source half mask adjacent to one of
+        // the inputs, place the other input in it. We use (Index XOR 1) to
+        // compute an adjacent index.
+        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
+            SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
+          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
+          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+          InputsFixed[1] = InputsFixed[0] ^ 1;
+        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
+                   SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
+          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
+          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
+          InputsFixed[0] = InputsFixed[1] ^ 1;
+        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
+                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
+          // The two inputs are in the same DWord but it is clobbered and the
+          // adjacent DWord isn't used at all. Move both inputs to the free
+          // slot.
+          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
+          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
+          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
+          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
+        } else {
+          // The only way we hit this point is if there is no clobbering
+          // (because there are no off-half inputs to this half) and there is no
+          // free slot adjacent to one of the inputs. In this case, we have to
+          // swap an input with a non-input.
+          for (int i = 0; i < 4; ++i)
+            assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
+                   "We can't handle any clobbers here!");
+          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
+                 "Cannot have adjacent inputs here!");
+
+          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
+          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
+
+          // We also have to update the final source mask in this case because
+          // it may need to undo the above swap.
+          for (int &M : FinalSourceHalfMask)
+            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
+              M = InputsFixed[1] + SourceOffset;
+            else if (M == InputsFixed[1] + SourceOffset)
+              M = (InputsFixed[0] ^ 1) + SourceOffset;
+
+          InputsFixed[1] = InputsFixed[0] ^ 1;
+        }
+
+        // Point everything at the fixed inputs.
         for (int &M : HalfMask)
           if (M == IncomingInputs[0])
-            M = SourceDWordBase + SourceOffset;
+            M = InputsFixed[0] + SourceOffset;
           else if (M == IncomingInputs[1])
-            M = SourceDWordBase + 1 + SourceOffset;
-        IncomingInputs[0] = SourceDWordBase + SourceOffset;
-        IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset;
+            M = InputsFixed[1] + SourceOffset;
+
+        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
+        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
       }
     } else {
       llvm_unreachable("Unhandled input size!");
@@ -7383,13 +8944,14 @@
     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
-    for (int Input : IncomingInputs)
-      std::replace(HalfMask.begin(), HalfMask.end(), Input,
-                   FreeDWord * 2 + Input % 2);
+    for (int &M : HalfMask)
+      for (int Input : IncomingInputs)
+        if (M == Input)
+          M = FreeDWord * 2 + Input % 2;
   };
-  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask,
+  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
-  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask,
+  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
 
   // Now enact all the shuffles we've computed to move the inputs into their
@@ -7526,34 +9088,37 @@
       if (GoodInputs.size() == 2) {
         // If the low inputs are spread across two dwords, pack them into
         // a single dword.
-        MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
-            Mask[GoodInputs[0]] - MaskOffset;
-        MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
-            Mask[GoodInputs[1]] - MaskOffset;
-        Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
-        Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+        MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
+        MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
+        Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
+        Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
       } else {
-        // Otherwise pin the low inputs.
+        // Otherwise pin the good inputs.
         for (int GoodInput : GoodInputs)
           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
       }
 
-      int MoveMaskIdx =
-          std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
-          std::begin(MoveMask);
-      assert(MoveMaskIdx >= MoveOffset && "Established above");
-
       if (BadInputs.size() == 2) {
+        // If we have two bad inputs then there may be either one or two good
+        // inputs fixed in place. Find a fixed input, and then find the *other*
+        // two adjacent indices by using modular arithmetic.
+        int GoodMaskIdx =
+            std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
+                         [](int M) { return M >= 0; }) -
+            std::begin(MoveMask);
+        int MoveMaskIdx =
+            ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
-        MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] =
-            Mask[BadInputs[0]] - MaskOffset;
-        MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] =
-            Mask[BadInputs[1]] - MaskOffset;
-        Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset;
-        Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset;
+        MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
+        MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
+        Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
+        Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
       } else {
         assert(BadInputs.size() == 1 && "All sizes handled");
+        int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
+                                    std::end(MoveMask), -1) -
+                          std::begin(MoveMask);
         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
       }
@@ -7609,6 +9174,12 @@
 
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
+    return ZExt;
+
   auto isV1 = [](int M) { return M >= 0 && M < 8; };
   auto isV2 = [](int M) { return M >= 8; };
 
@@ -7621,6 +9192,33 @@
   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
                             "to be V1-input shuffles.");
 
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Inputs == 1)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
+      return V;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
+    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
+  if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
+    return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
+
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
+
   if (NumV1Inputs + NumV2Inputs <= 4)
     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
 
@@ -7664,6 +9262,74 @@
                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
 }
 
+/// \brief Check whether a compaction lowering can be done by dropping even
+/// elements and compute how many times even elements must be dropped.
+///
+/// This handles shuffles which take every 2^N-th element, i.e. which have a
+/// power-of-two stride. Example shuffle masks:
+///
+///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
+///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
+///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
+///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
+///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
+///
+/// Any of these lanes can of course be undef.
+///
+/// This routine only supports N <= 3.
+/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
+/// for larger N.
+///
+/// \returns the smallest viable N above, i.e. the number of times even
+/// elements must be dropped, if there is one. Otherwise returns zero.
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
+  // Figure out whether the mask reads from two inputs or just one.
+  bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+  // The modulus for the shuffle vector entries is based on whether this is
+  // a single input or not.
+  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
+  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+         "We should only be called with masks with a power-of-2 size!");
+
+  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
+  // and 2^3 simultaneously. This is because we may have ambiguity with
+  // partially undef inputs.
+  bool ViableForN[3] = {true, true, true};
+
+  for (int i = 0, e = Mask.size(); i < e; ++i) {
+    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
+    // want.
+    if (Mask[i] == -1)
+      continue;
+
+    bool IsAnyViable = false;
+    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+      if (ViableForN[j]) {
+        uint64_t N = j + 1;
+
+        // The shuffle mask must be equal to (i * 2^N) % ShuffleModulus.
+        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+          IsAnyViable = true;
+        else
+          ViableForN[j] = false;
+      }
+    // Early exit if we exhaust the possible powers of two.
+    if (!IsAnyViable)
+      break;
+  }
+
+  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+    if (ViableForN[j])
+      return j + 1;
+
+  // Return 0 as there is no viable power-of-two stride.
+  return 0;
+}
+
 /// \brief Generic lowering of v16i8 shuffles.
 ///
 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
@@ -7681,6 +9347,22 @@
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   ArrayRef<int> OrigMask = SVOp->getMask();
   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to use a zext lowering.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
+          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+    return ZExt;
+
   int MaskStorage[16] = {
       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
@@ -7690,8 +9372,16 @@
   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
 
+  int NumV2Elements =
+      std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
+
   // For single-input shuffles, there are some nicer lowering tricks we can use.
-  if (isSingleInputShuffleMask(Mask)) {
+  if (NumV2Elements == 0) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
+                                                          Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
     // Notably, this handles splat and partial-splat shuffles more efficiently.
     // However, it only makes sense if the pre-duplication shuffle simplifies
@@ -7701,10 +9391,10 @@
     // FIXME: We should check for other patterns which can be widened into an
     // i16 shuffle as well.
     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
-      for (int i = 0; i < 16; i += 2) {
-        if (Mask[i] != Mask[i + 1])
+      for (int i = 0; i < 16; i += 2)
+        if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
           return false;
-      }
+
       return true;
     };
     auto tryToWidenViaDuplication = [&]() -> SDValue {
@@ -7765,11 +9455,16 @@
                        MVT::v16i8, V1, V1);
 
       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
-      for (int i = 0; i < 16; i += 2) {
-        if (Mask[i] != -1)
-          PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
-        assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
-      }
+      for (int i = 0; i < 16; ++i)
+        if (Mask[i] != -1) {
+          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
+          if (PostDupI16Shuffle[i / 2] == -1)
+            PostDupI16Shuffle[i / 2] = MappedMask;
+          else
+            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
+                   "Conflicting entrties in the original shuffle!");
+        }
       return DAG.getNode(
           ISD::BITCAST, DL, MVT::v16i8,
           DAG.getVectorShuffle(MVT::v8i16, DL,
@@ -7786,21 +9481,108 @@
   //
   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
   if (shouldLowerAsInterleaving(Mask)) {
-    // FIXME: Figure out whether we should pack these into the low or high
-    // halves.
-
-    int EMask[16], OMask[16];
+    int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
+      return (M >= 0 && M < 8) || (M >= 16 && M < 24);
+    });
+    int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
+      return (M >= 8 && M < 16) || M >= 24;
+    });
+    int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
+                     -1, -1, -1, -1, -1, -1, -1, -1};
+    int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
+                     -1, -1, -1, -1, -1, -1, -1, -1};
+    bool UnpackLo = NumLoHalf >= NumHiHalf;
+    MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
+    MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
     for (int i = 0; i < 8; ++i) {
-      EMask[i] = Mask[2*i];
-      OMask[i] = Mask[2*i + 1];
-      EMask[i + 8] = -1;
-      OMask[i + 8] = -1;
+      TargetEMask[i] = Mask[2 * i];
+      TargetOMask[i] = Mask[2 * i + 1];
     }
 
     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
 
-    return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
+    return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                       MVT::v16i8, Evens, Odds);
+  }
+
+  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+  // with PSHUFB. It is important to do this before we attempt to generate any
+  // blends but after all of the single-input lowerings. If the single input
+  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
+  // want to preserve that and we can DAG combine any longer sequences into
+  // a PSHUFB in the end. But once we start blending from multiple inputs,
+  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+  // and there are *very* few patterns that would actually be faster than the
+  // PSHUFB approach because of its ability to zero lanes.
+  //
+  // FIXME: The only exceptions to the above are blends which are exact
+  // interleavings with direct instructions supporting them. We currently don't
+  // handle those well here.
+  if (Subtarget->hasSSSE3()) {
+    SDValue V1Mask[16];
+    SDValue V2Mask[16];
+    for (int i = 0; i < 16; ++i)
+      if (Mask[i] == -1) {
+        V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
+      } else {
+        V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
+        V2Mask[i] =
+            DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
+      }
+    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+    if (isSingleInputShuffleMask(Mask))
+      return V1; // Single inputs are easy.
+
+    // Otherwise, blend the two.
+    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+    return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+  }
+
+  // There are special ways we can lower some single-element blends.
+  if (NumV2Elements == 1)
+    if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
+                                                         Mask, Subtarget, DAG))
+      return V;
+
+  // Check whether a compaction lowering can be done. This handles shuffles
+  // which take every Nth element for some even N. See the helper function for
+  // details.
+  //
+  // We special case these as they can be particularly efficiently handled with
+  // the PACKUSWB instruction on x86 and they show up in common patterns of
+  // rearranging bytes to truncate wide elements.
+  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
+    // NumEvenDrops is the power of two stride of the elements. Another way of
+    // thinking about it is that we need to drop the even elements this many
+    // times to get the original input.
+    bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+    // First we need to zero all the dropped bytes.
+    assert(NumEvenDrops <= 3 &&
+           "No support for dropping even elements more than 3 times.");
+    // We use the mask type to pick which bytes are preserved based on how many
+    // elements are dropped.
+    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
+    SDValue ByteClearMask =
+        DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
+                    DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
+    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
+    if (!IsSingleInput)
+      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
+
+    // Now pack things back together.
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+    V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
+    for (int i = 1; i < NumEvenDrops; ++i) {
+      Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
+      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
+    }
+
+    return Result;
   }
 
   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
@@ -7899,15 +9681,933 @@
   }
 }
 
-/// \brief Tiny helper function to test whether adjacent masks are sequential.
-static bool areAdjacentMasksSequential(ArrayRef<int> Mask) {
-  for (int i = 0, Size = Mask.size(); i < Size; i += 2)
-    if (Mask[i] + 1 != Mask[i+1])
+/// \brief Helper function to test whether a shuffle mask could be
+/// simplified by widening the elements being shuffled.
+///
+/// Appends the mask for wider elements to WidenedMask (which is expected to be
+/// empty on entry) if valid. Otherwise leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zero-ed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+                                    SmallVectorImpl<int> &WidenedMask) {
+  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+    // If both elements are undef, it's trivial.
+    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+      WidenedMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
+    // Check for an undef mask and a mask value properly aligned to fit with
+    // a pair of values. If we find such a case, use the non-undef mask's value.
+    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
+      WidenedMask.push_back(Mask[i + 1] / 2);
+      continue;
+    }
+    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // When zeroing, we need to spread the zeroing across both lanes to widen.
+    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
+      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
+          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+        WidenedMask.push_back(SM_SentinelZero);
+        continue;
+      }
       return false;
+    }
+
+    // Finally check if the two mask values are adjacent and aligned with
+    // a pair.
+    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // Otherwise we can't safely widen the elements used in this shuffle.
+    return false;
+  }
+  assert(WidenedMask.size() == Mask.size() / 2 &&
+         "Incorrect size of mask after widening the elements!");
 
   return true;
 }
 
+/// \brief Generic routine to split a vector shuffle into half-sized shuffles.
+///
+/// This routine just extracts two subvectors, shuffles them independently, and
+/// then concatenates them back together. This should work effectively with all
+/// AVX vector shuffle types.
+static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+                                          SDValue V2, ArrayRef<int> Mask,
+                                          SelectionDAG &DAG) {
+  assert(VT.getSizeInBits() >= 256 &&
+         "Only for 256-bit or wider vector shuffles!");
+  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
+  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
+
+  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
+  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
+
+  int NumElements = VT.getVectorNumElements();
+  int SplitNumElements = NumElements / 2;
+  MVT ScalarVT = VT.getScalarType();
+  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
+
+  SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
+                             DAG.getIntPtrConstant(0));
+  SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
+                             DAG.getIntPtrConstant(SplitNumElements));
+  SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
+                             DAG.getIntPtrConstant(0));
+  SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
+                             DAG.getIntPtrConstant(SplitNumElements));
+
+  // Now create two 4-way blends of these half-width vectors.
+  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
+    SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
+    for (int i = 0; i < SplitNumElements; ++i) {
+      int M = HalfMask[i];
+      if (M >= NumElements) {
+        if (M >= NumElements + SplitNumElements)
+          UseHiV2 = true;
+        else
+          UseLoV2 = true;
+        V2BlendMask.push_back(M - NumElements);
+        V1BlendMask.push_back(-1);
+        BlendMask.push_back(SplitNumElements + i);
+      } else if (M >= 0) {
+        if (M >= SplitNumElements)
+          UseHiV1 = true;
+        else
+          UseLoV1 = true;
+        V2BlendMask.push_back(-1);
+        V1BlendMask.push_back(M);
+        BlendMask.push_back(i);
+      } else {
+        V2BlendMask.push_back(-1);
+        V1BlendMask.push_back(-1);
+        BlendMask.push_back(-1);
+      }
+    }
+
+    // Because the lowering happens after all combining takes place, we need to
+    // manually combine these blend masks as much as possible so that we create
+    // a minimal number of high-level vector shuffle nodes.
+
+    // First try just blending the halves of V1 or V2.
+    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
+      return DAG.getUNDEF(SplitVT);
+    if (!UseLoV2 && !UseHiV2)
+      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+    if (!UseLoV1 && !UseHiV1)
+      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+
+    SDValue V1Blend, V2Blend;
+    if (UseLoV1 && UseHiV1) {
+      V1Blend =
+        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+    } else {
+      // We only use half of V1 so map the usage down into the final blend mask.
+      V1Blend = UseLoV1 ? LoV1 : HiV1;
+      for (int i = 0; i < SplitNumElements; ++i)
+        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
+          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
+    }
+    if (UseLoV2 && UseHiV2) {
+      V2Blend =
+        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+    } else {
+      // We only use half of V2 so map the usage down into the final blend mask.
+      V2Blend = UseLoV2 ? LoV2 : HiV2;
+      for (int i = 0; i < SplitNumElements; ++i)
+        if (BlendMask[i] >= SplitNumElements)
+          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
+    }
+    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
+  };
+  SDValue Lo = HalfBlend(LoMask);
+  SDValue Hi = HalfBlend(HiMask);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+}
+
+/// \brief Either split a vector in halves or decompose the shuffles and the
+/// blend.
+///
+/// This is provided as a good fallback for many lowerings of non-single-input
+/// shuffles with more than one 128-bit lane. In those cases, we want to select
+/// between splitting the shuffle into 128-bit components and stitching those
+/// back together vs. extracting the single-input shuffles and blending those
+/// results.
+static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
+                                                SDValue V2, ArrayRef<int> Mask,
+                                                SelectionDAG &DAG) {
+  assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
+                                            "lower single-input shuffles as it "
+                                            "could then recurse on itself.");
+  int Size = Mask.size();
+
+  // If this can be modeled as a broadcast of two elements followed by a blend,
+  // prefer that lowering. This is especially important because broadcasts can
+  // often fold with memory operands.
+  auto DoBothBroadcast = [&] {
+    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
+    for (int M : Mask)
+      if (M >= Size) {
+        if (V2BroadcastIdx == -1)
+          V2BroadcastIdx = M - Size;
+        else if (M - Size != V2BroadcastIdx)
+          return false;
+      } else if (M >= 0) {
+        if (V1BroadcastIdx == -1)
+          V1BroadcastIdx = M;
+        else if (M != V1BroadcastIdx)
+          return false;
+      }
+    return true;
+  };
+  if (DoBothBroadcast())
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                      DAG);
+
+  // If the inputs stem from at most one 128-bit lane of each input, then we
+  // split them rather than blending because the split will decompose to
+  // unusually few instructions.
+  int LaneCount = VT.getSizeInBits() / 128;
+  int LaneSize = Size / LaneCount;
+  SmallBitVector LaneInputs[2];
+  LaneInputs[0].resize(LaneCount, false);
+  LaneInputs[1].resize(LaneCount, false);
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0)
+      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
+  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
+    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
+  // that the decomposed single-input shuffles don't end up here.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
+/// \brief Lower a lane-crossing 256-bit shuffle by swapping the 128-bit lanes
+/// of the input and blending the swapped copy with the original.
+///
+/// Every element that has to move into the other 128-bit lane is read from a
+/// lane-swapped copy of the vector, so the final shuffle only moves elements
+/// within a lane. The original author notes this takes four instructions in
+/// the worst case for a single-input cross-lane shuffle. Cheaper, pattern
+/// specific lowerings should be tried before falling back to this routine.
+static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
+                                                       SDValue V1, SDValue V2,
+                                                       ArrayRef<int> Mask,
+                                                       SelectionDAG &DAG) {
+  // FIXME: This should probably be generalized for 512-bit vectors as well.
+  assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
+  const int NumElts = Mask.size();
+  const int LaneSize = NumElts / 2;
+
+  // For each source lane, note whether any element read from it ends up in
+  // the other lane. When at most one lane has such elements, splitting the
+  // shuffle is cheaper than permuting and blending.
+  bool LaneCrossing[2] = {false, false};
+  for (int i = 0; i < NumElts; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    int SrcLane = (Mask[i] % NumElts) / LaneSize;
+    if (SrcLane != i / LaneSize)
+      LaneCrossing[SrcLane] = true;
+  }
+  if (!(LaneCrossing[0] && LaneCrossing[1]))
+    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+  // With two inputs this reduces to two single-input shuffles of V1 and V2
+  // (which at worst come back through the single-input path below) and a
+  // blend of the results, much like other patterns in AVX.
+  if (!isSingleInputShuffleMask(Mask))
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                      DAG);
+
+  // Build a two-input mask over (V1, Flipped): in-lane elements keep reading
+  // V1, lane-crossing elements read the same in-lane offset of the swapped
+  // copy, which is addressed as the second shuffle operand (+ NumElts).
+  SmallVector<int, 32> FlippedBlendMask;
+  for (int i = 0; i < NumElts; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      M = -1;
+    else if ((M % NumElts) / LaneSize != i / LaneSize)
+      M = M % LaneSize + (i / LaneSize) * LaneSize + NumElts;
+    FlippedBlendMask.push_back(M);
+  }
+
+  // Flip the vector, and blend the results which should now be in-lane. The
+  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
+  // 5 for the high source. The value 3 selects the high half of source 2 and
+  // the value 2 selects the low half of source 2. We only use source 2 to
+  // allow folding it into a memory operand.
+  unsigned PERMMask = 3 | (2 << 4);
+  SDValue PermImm = DAG.getConstant(PERMMask, MVT::i8);
+  SDValue Flipped =
+      DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), V1, PermImm);
+  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
+}
+
+/// \brief Handle lowering 2-lane 128-bit shuffles.
+///
+/// \p Mask is the original, un-widened four element mask. The callers in this
+/// file only get here after canWidenShuffleElements() accepted the mask, so
+/// each pair of mask elements is expected to address one 128-bit half of V1 or
+/// V2. Either element of a pair (or both) may be undef (-1), so no decision
+/// here may be based on a single element of a pair.
+static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
+                                        SDValue V2, ArrayRef<int> Mask,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  // Blends are faster and handle all the non-lane-crossing cases.
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
+                                                Subtarget, DAG))
+    return Blend;
+
+  MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+                               VT.getVectorNumElements() / 2);
+  // Check for patterns which can be matched with a single insert of a 128-bit
+  // subvector. The source of the high half is taken from the pattern that
+  // matched rather than from Mask[2]: Mask[2] may be undef while Mask[3]
+  // still selects an element of V2 (e.g. <0, 1, undef, 5>).
+  const bool HiFromV1 = isShuffleEquivalent(Mask, 0, 1, 0, 1);
+  if (HiFromV1 || isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
+    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                              DAG.getIntPtrConstant(0));
+    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+                              HiFromV1 ? V1 : V2, DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+  }
+  if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
+    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+                              DAG.getIntPtrConstant(0));
+    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
+                              DAG.getIntPtrConstant(2));
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+  }
+
+  // Otherwise form a 128-bit permutation. Each result half gets a selector in
+  // [0, 4): the index of the 128-bit half within the concatenation of V1 and
+  // V2. The selector comes from whichever element of the pair is defined, so
+  // an undef first element does not silently pick half 0; a fully undef pair
+  // may use any half and defaults to 0.
+  // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
+  auto HalfSelector = [&Mask](int Lo) -> unsigned {
+    int M = Mask[Lo] >= 0 ? Mask[Lo] : Mask[Lo + 1];
+    return M < 0 ? 0 : M / 2;
+  };
+  unsigned PermMask = HalfSelector(0) | HalfSelector(2) << 4;
+  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
+                     DAG.getConstant(PermMask, MVT::i8));
+}
+
+/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+  const MVT VT = MVT::v4f64;
+
+  // A mask that can be widened to two elements is handed to the 2 x 128-bit
+  // lowering.
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    return lowerV2X128VectorShuffle(DL, VT, V1, V2, Mask, Subtarget, DAG);
+
+  if (isSingleInputShuffleMask(Mask)) {
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast =
+            lowerVectorShuffleAsBroadcast(VT, DL, V1, Mask, Subtarget, DAG))
+      return Broadcast;
+
+    if (!is128BitLaneCrossingShuffleMask(VT, Mask)) {
+      // Non-lane-crossing single input shuffles can be lowered with an
+      // in-lane permutation. Immediate bit i is set when result element i
+      // takes the odd element of its own lane (index 1 or 3).
+      unsigned VPERMILPMask = 0;
+      for (int i = 0; i < 4; ++i)
+        if (Mask[i] == (i < 2 ? 1 : 3))
+          VPERMILPMask |= 1u << i;
+      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
+                         DAG.getConstant(VPERMILPMask, MVT::i8));
+    }
+
+    // With AVX2 we have direct support for this permutation.
+    if (Subtarget->hasAVX2())
+      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
+                         getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+    // Otherwise, fall back.
+    return lowerVectorShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG);
+  }
+
+  // X86 has dedicated unpack instructions that can handle specific blend
+  // operations: UNPCKH and UNPCKL.
+  if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+  if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+  // If the only V2 element goes into result element zero, try inserting it
+  // into V1 when that can be done cheaply.
+  int NumV2Elements = 0;
+  for (int M : Mask)
+    if (M >= 4)
+      ++NumV2Elements;
+  if (NumV2Elements == 1 && Mask[0] >= 4) {
+    SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+        VT, DL, V1, V2, Mask, Subtarget, DAG);
+    if (Insertion)
+      return Insertion;
+  }
+
+  if (SDValue Blend =
+          lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return Blend;
+
+  // Check if the blend happens to exactly fit that of SHUFPD: even result
+  // elements from one input and odd result elements from the other, each
+  // staying inside its own 128-bit lane.
+  const bool EvenFromV1 = (Mask[0] == -1 || Mask[0] < 2) &&
+                          (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
+                          (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
+                          (Mask[3] == -1 || Mask[3] >= 6);
+  if (EvenFromV1) {
+    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
+                          ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
+    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
+                       DAG.getConstant(SHUFPDMask, MVT::i8));
+  }
+  // Same pattern with the roles of V1 and V2 exchanged.
+  const bool EvenFromV2 = (Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
+                          (Mask[1] == -1 || Mask[1] < 2) &&
+                          (Mask[2] == -1 || Mask[2] >= 6) &&
+                          (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4));
+  if (EvenFromV2) {
+    unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
+                          ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
+    return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
+                       DAG.getConstant(SHUFPDMask, MVT::i8));
+  }
+
+  // If we have AVX2 then we always want to lower with a blend because at v4
+  // we can fully permute the elements.
+  if (Subtarget->hasAVX2())
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                      DAG);
+
+  // Otherwise fall back on generic lowering.
+  return lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v4i64 shuffling.
+static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+  assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
+  const MVT VT = MVT::v4i64;
+
+  // A mask that can be widened to two elements is handed to the 2 x 128-bit
+  // lowering.
+  SmallVector<int, 4> WidenedMask;
+  if (canWidenShuffleElements(Mask, WidenedMask))
+    return lowerV2X128VectorShuffle(DL, VT, V1, V2, Mask, Subtarget, DAG);
+
+  if (SDValue Blend =
+          lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return Blend;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast =
+          lowerVectorShuffleAsBroadcast(VT, DL, V1, Mask, Subtarget, DAG))
+    return Broadcast;
+
+  const bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+  // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
+  // use lower latency instructions that will operate on both 128-bit lanes.
+  SmallVector<int, 2> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
+    if (IsSingleInput) {
+      // Express the per-lane i64 shuffle as a PSHUFD on v8i32: each 64-bit
+      // element becomes a pair of adjacent 32-bit elements.
+      int PSHUFDMask[] = {-1, -1, -1, -1};
+      for (int i = 0; i < 2; ++i) {
+        int M = RepeatedMask[i];
+        if (M < 0)
+          continue;
+        PSHUFDMask[2 * i] = 2 * M;
+        PSHUFDMask[2 * i + 1] = 2 * M + 1;
+      }
+      return DAG.getNode(
+          ISD::BITCAST, DL, VT,
+          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
+                      DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
+                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
+    }
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
+      return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+    if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
+      return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+  }
+
+  // Two inputs with no better match: fall back on generic blend lowering.
+  if (!IsSingleInput)
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                      DAG);
+
+  // AVX2 provides a direct instruction for permuting a single input across
+  // lanes.
+  return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
+                     getV4X86ShuffleImm8ForMask(Mask, DAG));
+}
+
+/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
+///
+/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
+/// isn't available.
+static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+  const MVT VT = MVT::v8f32;
+
+  if (SDValue Blend =
+          lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return Blend;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast =
+          lowerVectorShuffleAsBroadcast(VT, DL, V1, Mask, Subtarget, DAG))
+    return Broadcast;
+
+  const bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+  // If the shuffle mask is repeated in each 128-bit lane, we have many more
+  // options to efficiently lower the shuffle.
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
+    assert(RepeatedMask.size() == 4 &&
+           "Repeated masks must be half the mask width!");
+    if (IsSingleInput)
+      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+      return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+      return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
+    // have already handled any direct blends. We also need to squash the
+    // repeated mask into a simulated v4f32 mask, where second-input elements
+    // start at index 4 rather than 8.
+    for (int &M : RepeatedMask)
+      if (M >= 8)
+        M -= 4;
+    return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG);
+  }
+
+  if (!IsSingleInput) {
+    // If we have AVX2 then we always want to lower with a blend because at v8
+    // we can fully permute the elements.
+    if (Subtarget->hasAVX2())
+      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                        DAG);
+
+    // Otherwise fall back on generic lowering.
+    return lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG);
+  }
+
+  // A single input shuffle with different shuffle patterns in the two 128-bit
+  // lanes: materialize the mask as a vector of i32 indices so it can feed a
+  // variable-mask permute.
+  SDValue VPermMask[8];
+  for (int i = 0; i < 8; ++i) {
+    if (Mask[i] < 0)
+      VPermMask[i] = DAG.getUNDEF(MVT::i32);
+    else
+      VPermMask[i] = DAG.getConstant(Mask[i], MVT::i32);
+  }
+
+  // In-lane patterns use the variable mask form of VPERMILPS.
+  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
+    return DAG.getNode(
+        X86ISD::VPERMILPV, DL, VT, V1,
+        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
+
+  // Lane-crossing patterns can use the AVX2 variable permute.
+  if (Subtarget->hasAVX2())
+    return DAG.getNode(X86ISD::VPERMV, DL, VT,
+                       DAG.getNode(ISD::BITCAST, DL, VT,
+                                   DAG.getNode(ISD::BUILD_VECTOR, DL,
+                                               MVT::v8i32, VPermMask)),
+                       V1);
+
+  // Otherwise, fall back.
+  return lowerVectorShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v8i32 shuffling.
+static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+  assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
+  const MVT VT = MVT::v8i32;
+
+  if (SDValue Blend =
+          lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return Blend;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast =
+          lowerVectorShuffleAsBroadcast(VT, DL, V1, Mask, Subtarget, DAG))
+    return Broadcast;
+
+  const bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+  // If the shuffle mask is repeated in each 128-bit lane we can use more
+  // efficient instructions that mirror the shuffles across the two 128-bit
+  // lanes.
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
+    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+    if (IsSingleInput)
+      return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1,
+                         getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
+      return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+    if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
+      return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+  }
+
+  // Two inputs with no better match: fall back on generic blend lowering.
+  if (!IsSingleInput)
+    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
+                                                      DAG);
+
+  // The shuffle patterns aren't repeated but it is a single input, so
+  // directly generate a cross-lane VPERMD instruction whose index vector is
+  // the shuffle mask itself.
+  SDValue VPermMask[8];
+  for (int i = 0; i < 8; ++i) {
+    if (Mask[i] < 0)
+      VPermMask[i] = DAG.getUNDEF(MVT::i32);
+    else
+      VPermMask[i] = DAG.getConstant(Mask[i], MVT::i32);
+  }
+  return DAG.getNode(
+      X86ISD::VPERMV, DL, VT,
+      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
+}
+
+/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v16i16 shuffling.
+static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+  assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
+  const MVT VT = MVT::v16i16;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast =
+          lowerVectorShuffleAsBroadcast(VT, DL, V1, Mask, Subtarget, DAG))
+    return Broadcast;
+
+  // There are no generalized cross-lane shuffle operations available on i16
+  // element types.
+  if (is128BitLaneCrossingShuffleMask(VT, Mask))
+    return lowerVectorShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG);
+
+  if (SDValue Blend =
+          lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return Blend;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  const bool IsUnpackLo =
+      isShuffleEquivalent(Mask,
+                          // First 128-bit lane:
+                          0, 16, 1, 17, 2, 18, 3, 19,
+                          // Second 128-bit lane:
+                          8, 24, 9, 25, 10, 26, 11, 27);
+  if (IsUnpackLo)
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+  const bool IsUnpackHi =
+      isShuffleEquivalent(Mask,
+                          // First 128-bit lane:
+                          4, 20, 5, 21, 6, 22, 7, 23,
+                          // Second 128-bit lane:
+                          12, 28, 13, 29, 14, 30, 15, 31);
+  if (IsUnpackHi)
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+  // Two inputs with no better match: fall back on generic lowering.
+  if (!isSingleInputShuffleMask(Mask))
+    return lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG);
+
+  // Single input and (per the check above) no lane crossing: lower as a byte
+  // shuffle on v32i8. Each i16 mask element expands to two byte indices, made
+  // relative to the start of the element's own 128-bit lane.
+  SDValue PSHUFBMask[32];
+  for (int i = 0; i < 16; ++i) {
+    SDValue &LoByte = PSHUFBMask[2 * i];
+    SDValue &HiByte = PSHUFBMask[2 * i + 1];
+    if (Mask[i] == -1) {
+      LoByte = HiByte = DAG.getUNDEF(MVT::i8);
+      continue;
+    }
+
+    int M = Mask[i] - (i < 8 ? 0 : 8);
+    assert(M >= 0 && M < 8 && "Invalid single-input mask!");
+    LoByte = DAG.getConstant(2 * M, MVT::i8);
+    HiByte = DAG.getConstant(2 * M + 1, MVT::i8);
+  }
+  return DAG.getNode(
+      ISD::BITCAST, DL, VT,
+      DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8,
+                  DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
+                  DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
+}
+
+/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
+///
+/// This routine is only called when we have AVX2 and thus a reasonable
+/// instruction set for v32i8 shuffling.
+static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+  assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
+  const MVT VT = MVT::v32i8;
+
+  // Check for being able to broadcast a single element.
+  if (SDValue Broadcast =
+          lowerVectorShuffleAsBroadcast(VT, DL, V1, Mask, Subtarget, DAG))
+    return Broadcast;
+
+  // There are no generalized cross-lane shuffle operations available on i8
+  // element types.
+  if (is128BitLaneCrossingShuffleMask(VT, Mask))
+    return lowerVectorShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG);
+
+  if (SDValue Blend =
+          lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return Blend;
+
+  // Use dedicated unpack instructions for masks that match their pattern.
+  // Note that these are repeated 128-bit lane unpacks, not unpacks across all
+  // 256-bit lanes.
+  const bool IsUnpackLo = isShuffleEquivalent(
+      Mask,
+      // First 128-bit lane:
+      0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+      // Second 128-bit lane:
+      16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55);
+  if (IsUnpackLo)
+    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+  const bool IsUnpackHi = isShuffleEquivalent(
+      Mask,
+      // First 128-bit lane:
+      8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
+      // Second 128-bit lane:
+      24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63);
+  if (IsUnpackHi)
+    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
+
+  // Two inputs with no better match: fall back on generic lowering.
+  if (!isSingleInputShuffleMask(Mask))
+    return lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG);
+
+  // Single input: build a byte shuffle control vector. Indices of 16 and
+  // above are rebased by 16 so that they are relative to the second 128-bit
+  // lane.
+  SDValue PSHUFBMask[32];
+  for (int i = 0; i < 32; ++i) {
+    int M = Mask[i];
+    if (M < 0) {
+      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
+      continue;
+    }
+    if (M >= 16)
+      M -= 16;
+    PSHUFBMask[i] = DAG.getConstant(M, MVT::i8);
+  }
+
+  return DAG.getNode(X86ISD::PSHUFB, DL, VT, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
+}
+
+/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
+///
+/// Dispatches on the vector type to the matching per-type lowering routine.
+/// Integer vectors on subtargets without AVX2 are either split (elements
+/// narrower than 32 bits) or re-expressed as a shuffle of the floating point
+/// vector type with the same element width.
+static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        MVT VT, const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
+  // can check for those subtargets here and avoid much of the subtarget
+  // querying in the per-vector-type lowering routines. With AVX1 we have
+  // essentially *zero* ability to manipulate a 256-bit vector with integer
+  // types. Since we'll use floating point types there eventually, just
+  // immediately cast everything to a float and operate entirely in that
+  // domain.
+  const bool IntegerWithoutAVX2 = VT.isInteger() && !Subtarget->hasAVX2();
+  if (IntegerWithoutAVX2) {
+    int ElementBits = VT.getScalarSizeInBits();
+    // No floating point type available, decompose into 128-bit vectors.
+    if (ElementBits < 32)
+      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+
+    MVT FpEltVT = MVT::getFloatingPointVT(ElementBits);
+    MVT FpVT = MVT::getVectorVT(FpEltVT, VT.getVectorNumElements());
+    V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
+    SDValue FpShuffle = DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask);
+    return DAG.getNode(ISD::BITCAST, DL, VT, FpShuffle);
+  }
+
+  switch (VT.SimpleTy) {
+  // 64-bit elements.
+  case MVT::v4f64:
+    return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v4i64:
+    return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  // 32-bit elements.
+  case MVT::v8f32:
+    return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v8i32:
+    return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  // 16-bit and 8-bit elements.
+  case MVT::v16i16:
+    return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v32i8:
+    return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  default:
+    llvm_unreachable("Not a valid 256-bit x86 vector type!");
+  }
+}
+
+/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
+///
+/// There is no v8f64-specific lowering yet: after validating its operands the
+/// routine always defers to splitAndLowerVectorShuffle().
+static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  // FIXME: Implement direct support for this type!
+  SDLoc DL(Op);
+  return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
+///
+/// There is no v16f32-specific lowering yet: after validating its operands
+/// the routine always defers to splitAndLowerVectorShuffle().
+static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+  // FIXME: Implement direct support for this type!
+  SDLoc DL(Op);
+  return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
+///
+/// There is no v8i64-specific lowering yet: after validating its operands the
+/// routine always defers to splitAndLowerVectorShuffle().
+static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+
+  // FIXME: Implement direct support for this type!
+  SDLoc DL(Op);
+  return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
+///
+/// There is no v16i32-specific lowering yet: after validating its operands
+/// the routine always defers to splitAndLowerVectorShuffle().
+static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
+
+  // FIXME: Implement direct support for this type!
+  SDLoc DL(Op);
+  return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
+///
+/// Asserts that the subtarget has BWI. There is no v32i16-specific lowering
+/// yet: the routine always defers to splitAndLowerVectorShuffle().
+static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
+  assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
+
+  // FIXME: Implement direct support for this type!
+  SDLoc DL(Op);
+  return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
+///
+/// Asserts that the subtarget has BWI. There is no v64i8-specific lowering
+/// yet: the routine always defers to splitAndLowerVectorShuffle().
+static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                       const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
+  assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
+
+  // FIXME: Implement direct support for this type!
+  SDLoc DL(Op);
+  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+}
+
+/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
+///
+/// After trying a broadcast, this dispatches on the vector type to the
+/// matching per-type lowering routine. The 16-bit and 8-bit element types are
+/// only dispatched when the subtarget has BWI; without it they are handed to
+/// splitAndLowerVectorShuffle() directly.
+static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                        MVT VT, const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+  assert(Subtarget->hasAVX512() &&
+         "Cannot lower 512-bit vectors w/ basic ISA!");
+
+  // Check for being able to broadcast a single element.
+  SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1, Mask,
+                                                    Subtarget, DAG);
+  if (Broadcast)
+    return Broadcast;
+
+  // Dispatch to each element type for lowering. If we don't have support for
+  // specific element type shuffles at 512 bits, immediately split them and
+  // lower them. Each lowering routine of a given type is allowed to assume that
+  // the requisite ISA extensions for that element type are available.
+  switch (VT.SimpleTy) {
+  case MVT::v8f64:
+    return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v16f32:
+    return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v8i64:
+    return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v16i32:
+    return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v32i16:
+    if (!Subtarget->hasBWI())
+      break;
+    return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
+  case MVT::v64i8:
+    if (!Subtarget->hasBWI())
+      break;
+    return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  default:
+    llvm_unreachable("Not a valid 512-bit x86 vector type!");
+  }
+
+  // Otherwise fall back on splitting.
+  return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+}
+
 /// \brief Top-level lowering for x86 vector shuffles.
 ///
 /// This handles decomposition, canonicalization, and lowering of all x86
@@ -7936,7 +10636,7 @@
   // but in some cases the first operand may be transformed to UNDEF.
   // In this case we should just commute the node.
   if (V1IsUndef)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   // Check for non-undef masks pointing at an undef vector and make the masks
   // undef as well. This makes it easier to match the shuffle based solely on
@@ -7951,22 +10651,25 @@
         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
       }
 
-  // For integer vector shuffles, try to collapse them into a shuffle of fewer
-  // lanes but wider integers. We cap this to not form integers larger than i64
-  // but it might be interesting to form i128 integers to handle flipping the
-  // low and high halves of AVX 256-bit vectors.
-  if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
-      areAdjacentMasksSequential(Mask)) {
-    SmallVector<int, 8> NewMask;
-    for (int i = 0, Size = Mask.size(); i < Size; i += 2)
-      NewMask.push_back(Mask[i] / 2);
-    MVT NewVT =
-        MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
-                         VT.getVectorNumElements() / 2);
-    V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
-    V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
-    return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+  // Try to collapse shuffles into using a vector type with fewer elements but
+  // wider element types. We cap this to not form integers or floating point
+  // elements wider than 64 bits, but it might be interesting to form i128
+  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
+  SmallVector<int, 16> WidenedMask;
+  if (VT.getScalarSizeInBits() < 64 &&
+      canWidenShuffleElements(Mask, WidenedMask)) {
+    MVT NewEltVT = VT.isFloatingPoint()
+                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
+                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
+    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+    // Make sure that the new vector type is legal. For example, v2f64 isn't
+    // legal on SSE1.
+    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+      V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
+      V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
+      return DAG.getNode(ISD::BITCAST, dl, VT,
+                         DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
+    }
   }
 
   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
@@ -7982,10 +10685,12 @@
   // V2. This allows us to match the shuffle pattern strictly on how many
   // elements come from V1 without handling the symmetric cases.
   if (NumV2Elements > NumV1Elements)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   // When the number of V1 and V2 elements are the same, try to minimize the
-  // number of uses of V2 in the low half of the vector.
+  // number of uses of V2 in the low half of the vector. When that is tied,
+  // ensure that the sum of indices for V1 is equal to or lower than the sum
+  // of indices for V2.
   if (NumV1Elements == NumV2Elements) {
     int LowV1Elements = 0, LowV2Elements = 0;
     for (int M : SVOp->getMask().slice(0, NumElements / 2))
@@ -7993,14 +10698,32 @@
         ++LowV2Elements;
       else if (M >= 0)
         ++LowV1Elements;
-    if (LowV2Elements > LowV1Elements)
-      return CommuteVectorShuffle(SVOp, DAG);
+    if (LowV2Elements > LowV1Elements) {
+      return DAG.getCommutedVectorShuffle(*SVOp);
+    } else if (LowV2Elements == LowV1Elements) {
+      int SumV1Indices = 0, SumV2Indices = 0;
+      for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
+        if (SVOp->getMask()[i] >= NumElements)
+          SumV2Indices += i;
+        else if (SVOp->getMask()[i] >= 0)
+          SumV1Indices += i;
+      if (SumV2Indices < SumV1Indices)
+        return DAG.getCommutedVectorShuffle(*SVOp);
+    }
   }
 
   // For each vector width, delegate to a specialized lowering routine.
   if (VT.getSizeInBits() == 128)
     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
 
+  if (VT.getSizeInBits() == 256)
+    return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
+  // 512-bit vectors go to the AVX-512 lowering, which falls back to splitting
+  // the shuffle for element types it cannot lower directly.
+  if (VT.getSizeInBits() == 512)
+    return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
+
   llvm_unreachable("Unimplemented!");
 }
 
@@ -9060,6 +11783,20 @@
     To = V2;
     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
                 Mask.begin();
+
+    // If we have 1 element from each vector, we have to check if we're
+    // changing V1's element's place. If so, we're done. Otherwise, we
+    // should assume we're changing V2's element's place and behave
+    // accordingly.
+    int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
+    assert(DestIndex <= INT32_MAX && "truncated destination index");
+    if (FromV1 == FromV2 &&
+        static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
+      From = V2;
+      To = V1;
+      DestIndex =
+          std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
+    }
   } else {
     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
            "More than one element from V1 and from V2, or no elements from one "
@@ -9071,6 +11808,8 @@
         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
   }
 
+  // Get an index into the source vector in the range [0,4) (the mask is
+  // in the range [0,8) because it can address V1 and V2)
   unsigned SrcIndex = Mask[DestIndex] % 4;
   if (MayFoldLoad(From)) {
     // Trivial case, when From comes from a load and is only used by the
@@ -9155,37 +11894,6 @@
   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
     return SDValue();
 
-  // Simplify the operand as it's prepared to be fed into shuffle.
-  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
-  if (V1.getOpcode() == ISD::BITCAST &&
-      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
-      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      V1.getOperand(0).getOperand(0)
-        .getSimpleValueType().getSizeInBits() == SignificantBits) {
-    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
-    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
-    ConstantSDNode *CIdx =
-      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
-    // If it's foldable, i.e. normal load with single use, we will let code
-    // selection to fold it. Otherwise, we will short the conversion sequence.
-    if (CIdx && CIdx->getZExtValue() == 0 &&
-        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
-      MVT FullVT = V.getSimpleValueType();
-      MVT V1VT = V1.getSimpleValueType();
-      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
-        // The "ext_vec_elt" node is wider than the result node.
-        // In this case we should extract subvector from V.
-        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
-        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
-        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
-                                        FullVT.getVectorNumElements()/Ratio);
-        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
-                        DAG.getIntPtrConstant(0));
-      }
-      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
-    }
-  }
-
   return DAG.getNode(ISD::BITCAST, DL, VT,
                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
 }
@@ -9278,7 +11986,7 @@
   // but in some cases the first operand may be transformed to UNDEF.
   // In this case we should just commute the node.
   if (V1IsUndef)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   // Vector shuffle lowering takes 3 steps:
   //
@@ -9335,7 +12043,7 @@
       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
 
     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
-      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
+      return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
                                   DAG);
 
     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
@@ -9347,6 +12055,11 @@
                                 getShufflePALIGNRImmediate(SVOp),
                                 DAG);
 
+  if (isVALIGNMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
+                                getShuffleVALIGNImmediate(SVOp),
+                                DAG);
+
   // Check if this can be converted into a logical shift.
   bool isLeft = false;
   unsigned ShAmt = 0;
@@ -9390,7 +12103,7 @@
 
   if (ShouldXformToMOVHLPS(M, VT) ||
       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   if (isShift) {
     // No better options. Use a vshldq / vsrldq.
@@ -9462,7 +12175,7 @@
 
   // Normalize the node to match x86 shuffle ops if needed
   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
   // inlined here right now to enable us to directly emit target specific
@@ -9512,7 +12225,7 @@
     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                   getShuffleSHUFImmediate(SVOp), DAG);
-    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
+    return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
                                 getShuffleSHUFImmediate(SVOp), DAG);
   }
 
@@ -9631,9 +12344,10 @@
   return true;
 }
 
-// Try to lower a vselect node into a simple blend instruction.
-static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
-                                   SelectionDAG &DAG) {
+/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
+/// instruction.
+static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
   SDValue Cond = Op.getOperand(0);
   SDValue LHS = Op.getOperand(1);
   SDValue RHS = Op.getOperand(2);
@@ -9675,7 +12389,14 @@
 }
 
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
-  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  // A vselect where all conditions and data are constants can be optimized into
+  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
+    return SDValue();
+
+  SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
   if (BlendOp.getNode())
     return BlendOp;
 
@@ -9688,6 +12409,8 @@
     break;
   case MVT::v8i16:
   case MVT::v16i16:
+    if (Subtarget->hasBWI() && Subtarget->hasVLX())
+      break;
     return SDValue();
   }
 
@@ -9906,59 +12629,6 @@
   return SDValue();
 }
 
-static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT.getVectorElementType();
-  SDLoc dl(Op);
-
-  SDValue N0 = Op.getOperand(0);
-  SDValue N1 = Op.getOperand(1);
-  SDValue N2 = Op.getOperand(2);
-
-  if (!VT.is128BitVector())
-    return SDValue();
-
-  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
-      isa<ConstantSDNode>(N2)) {
-    unsigned Opc;
-    if (VT == MVT::v8i16)
-      Opc = X86ISD::PINSRW;
-    else if (VT == MVT::v16i8)
-      Opc = X86ISD::PINSRB;
-    else
-      Opc = X86ISD::PINSRB;
-
-    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
-    // argument.
-    if (N1.getValueType() != MVT::i32)
-      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
-    if (N2.getValueType() != MVT::i32)
-      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
-    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
-  }
-
-  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
-    // Bits [7:6] of the constant are the source select.  This will always be
-    //  zero here.  The DAG Combiner may combine an extract_elt index into these
-    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
-    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
-    // Bits [5:4] of the constant are the destination select.  This is the
-    //  value of the incoming immediate.
-    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
-    //   combine either bitwise AND or insert of float 0.0 to set these bits.
-    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
-    // Create this as a scalar to vector..
-    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
-    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
-  }
-
-  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
-    // PINSR* works with constant index.
-    return Op;
-  }
-  return SDValue();
-}
-
 /// Insert one bit to mask vector, like v16i1 or v8i1.
 /// AVX-512 feature.
 SDValue 
@@ -9993,11 +12663,12 @@
                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
 }
-SDValue
-X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+
+SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                  SelectionDAG &DAG) const {
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
-  
+
   if (EltVT == MVT::i1)
     return InsertBitToMaskVector(Op, DAG);
 
@@ -10005,20 +12676,20 @@
   SDValue N0 = Op.getOperand(0);
   SDValue N1 = Op.getOperand(1);
   SDValue N2 = Op.getOperand(2);
+  if (!isa<ConstantSDNode>(N2))
+    return SDValue();
+  auto *N2C = cast<ConstantSDNode>(N2);
+  unsigned IdxVal = N2C->getZExtValue();
 
-  // If this is a 256-bit vector result, first extract the 128-bit vector,
-  // insert the element into the extracted half and then place it back.
+  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
+  // into that, and then insert the subvector back into the result.
   if (VT.is256BitVector() || VT.is512BitVector()) {
-    if (!isa<ConstantSDNode>(N2))
-      return SDValue();
-
     // Get the desired 128-bit vector half.
-    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
 
     // Insert the element into the desired half.
-    unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
-    unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
+    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
+    unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
 
     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                     DAG.getConstant(IdxIn128, MVT::i32));
@@ -10026,20 +12697,60 @@
     // Insert the changed part back to the 256-bit vector
     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
   }
+  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
 
-  if (Subtarget->hasSSE41())
-    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+  if (Subtarget->hasSSE41()) {
+    if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
+      unsigned Opc;
+      if (VT == MVT::v8i16) {
+        Opc = X86ISD::PINSRW;
+      } else {
+        assert(VT == MVT::v16i8);
+        Opc = X86ISD::PINSRB;
+      }
+
+      // Transform it so it matches pinsr{b,w} which expects a GR32 as its
+      // second argument.
+      if (N1.getValueType() != MVT::i32)
+        N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+      if (N2.getValueType() != MVT::i32)
+        N2 = DAG.getIntPtrConstant(IdxVal);
+      return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+    }
+
+    if (EltVT == MVT::f32) {
+      // Bits [7:6] of the constant are the source select.  This will always be
+      //  zero here.  The DAG Combiner may combine an extract_elt index into
+      //  these
+      //  bits.  For example (insert (extract, 3), 2) could be matched by
+      //  putting
+      //  the '3' into bits [7:6] of X86ISD::INSERTPS.
+      // Bits [5:4] of the constant are the destination select.  This is the
+      //  value of the incoming immediate.
+      // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
+      //   combine either bitwise AND or insert of float 0.0 to set these bits.
+      N2 = DAG.getIntPtrConstant(IdxVal << 4);
+      // Create this as a scalar to vector..
+      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+    }
+
+    if (EltVT == MVT::i32 || EltVT == MVT::i64) {
+      // PINSR* works with constant index.
+      return Op;
+    }
+  }
 
   if (EltVT == MVT::i8)
     return SDValue();
 
-  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
+  if (EltVT.getSizeInBits() == 16) {
     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
     // as its second argument.
     if (N1.getValueType() != MVT::i32)
       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
     if (N2.getValueType() != MVT::i32)
-      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+      N2 = DAG.getIntPtrConstant(IdxVal);
     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
   }
   return SDValue();
@@ -10352,6 +13063,7 @@
 
   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
   MFI->setAdjustsStack(true);
+  MFI->setHasCalls(true);
 
   SDValue Flag = Chain.getValue(1);
   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
@@ -10585,7 +13297,7 @@
     if (Subtarget->is64Bit())
       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
                            IDX, MachinePointerInfo(), MVT::i32,
-                           false, false, 0);
+                           false, false, false, 0);
     else
       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
                         false, false, false, 0);
@@ -10669,10 +13381,18 @@
 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+  SDLoc dl(Op);
 
-  if (SrcVT.isVector())
+  if (SrcVT.isVector()) {
+    if (SrcVT.getVectorElementType() == MVT::i1) {
+      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
+      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
+                                     Op.getOperand(0)));
+    }
     return SDValue();
-
+  }
+  
   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
          "Unknown SINT_TO_FP to lower!");
 
@@ -10685,7 +13405,6 @@
     return Op;
   }
 
-  SDLoc dl(Op);
   unsigned Size = SrcVT.getSizeInBits()/8;
   MachineFunction &MF = DAG.getMachineFunction();
   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
@@ -10872,19 +13591,135 @@
   return Sub;
 }
 
+static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  // Lower UINT_TO_FP from v4i32/v8i32 to v4f32/v8f32 with this algorithm:
+  // #ifdef __SSE4_1__
+  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+  //                                 (uint4) 0x53000000, 0xaa);
+  // #else
+  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
+  // #endif
+  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+  //     return (float4) lo + fhi;
+
+  SDLoc DL(Op);
+  SDValue V = Op->getOperand(0);
+  EVT VecIntVT = V.getValueType();
+  bool Is128 = VecIntVT == MVT::v4i32;
+  EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
+  // If we convert to something else than the supported type, e.g., to v4f64,
+  // abort early.
+  if (VecFloatVT != Op->getValueType(0))
+    return SDValue();
+
+  unsigned NumElts = VecIntVT.getVectorNumElements();
+  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+         "Unsupported custom type");
+  assert(NumElts <= 8 && "The size of the constant array must be fixed");
+
+  // In the #ifdef/#else code, we have in common:
+  // - The vector of constants:
+  // -- 0x4b000000
+  // -- 0x53000000
+  // - A shift:
+  // -- v >> 16
+
+  // Create the splat vector for 0x4b000000.
+  SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
+  SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
+                           CstLow, CstLow, CstLow, CstLow};
+  SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                  makeArrayRef(&CstLowArray[0], NumElts));
+  // Create the splat vector for 0x53000000.
+  SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
+  SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
+                            CstHigh, CstHigh, CstHigh, CstHigh};
+  SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                   makeArrayRef(&CstHighArray[0], NumElts));
+
+  // Create the right shift.
+  SDValue CstShift = DAG.getConstant(16, MVT::i32);
+  SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
+                             CstShift, CstShift, CstShift, CstShift};
+  SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
+                                    makeArrayRef(&CstShiftArray[0], NumElts));
+  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
+
+  SDValue Low, High;
+  if (Subtarget.hasSSE41()) {
+    EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
+    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
+    SDValue VecCstLowBitcast =
+        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
+    SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
+    // Low will be bitcasted right away, so do not bother bitcasting back to its
+    // original type.
+    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
+                      VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
+    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
+    //                                 (uint4) 0x53000000, 0xaa);
+    SDValue VecCstHighBitcast =
+        DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
+    SDValue VecShiftBitcast =
+        DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
+    // High will be bitcasted right away, so do not bother bitcasting back to
+    // its original type.
+    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
+                       VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
+  } else {
+    SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
+    SmallVector<SDValue, 8> MaskOps(NumElts, CstMask); // one per lane
+    SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, MaskOps);
+    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
+    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
+    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
+
+    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
+    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
+  }
+
+  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
+  SDValue CstFAdd = DAG.getConstantFP(
+      APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
+  SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
+                            CstFAdd, CstFAdd, CstFAdd, CstFAdd};
+  SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
+                                   makeArrayRef(&CstFAddArray[0], NumElts));
+
+  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+  SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
+  SDValue FHigh =
+      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
+  //     return (float4) lo + fhi;
+  SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
+  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
+}
+
 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                                SelectionDAG &DAG) const {
   SDValue N0 = Op.getOperand(0);
   MVT SVT = N0.getSimpleValueType();
   SDLoc dl(Op);
 
-  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
-          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
-         "Custom UINT_TO_FP is not supported!");
-
-  MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
-  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
-                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+  switch (SVT.SimpleTy) {
+  default:
+    llvm_unreachable("Custom UINT_TO_FP is not supported!");
+  case MVT::v4i8:
+  case MVT::v4i16:
+  case MVT::v8i8:
+  case MVT::v8i16: {
+    MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
+    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+  }
+  case MVT::v4i32:
+  case MVT::v8i32:
+    return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
+  }
+  llvm_unreachable(nullptr);
 }
 
 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
@@ -10970,7 +13805,7 @@
   // FIXME: Avoid the extend by constructing the right constant pool?
   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
                                  FudgePtr, MachinePointerInfo::getConstantPool(),
-                                 MVT::f32, false, false, 4);
+                                 MVT::f32, false, false, false, 4);
   // Extend everything to 80 bits to force it to be done on x87.
   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
@@ -11184,12 +14019,9 @@
   if (VT == MVT::i1) {
     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
            "Invalid scalar TRUNCATE operation");
-    if (InVT == MVT::i32)
+    if (InVT.getSizeInBits() >= 32)
       return SDValue();
-    if (InVT.getSizeInBits() == 64)
-      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In);
-    else if (InVT.getSizeInBits() < 32)
-      In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
+    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
   }
   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
@@ -11367,23 +14199,47 @@
                                  In, DAG.getUNDEF(SVT)));
 }
 
-static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
-  LLVMContext *Context = DAG.getContext();
+/// The only differences between FABS and FNEG are the mask and the logic op.
+/// FNEG also has a folding opportunity for FNEG(FABS(x)).
+static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
+  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
+         "Wrong opcode for lowering FABS or FNEG.");
+
+  bool IsFABS = (Op.getOpcode() == ISD::FABS);
+
+  // If this is a FABS and it has an FNEG user, bail out to fold the combination
+  // into an FNABS. We'll lower the FABS after that if it is still in use.
+  if (IsFABS)
+    for (SDNode *User : Op->uses())
+      if (User->getOpcode() == ISD::FNEG)
+        return Op;
+
+  SDValue Op0 = Op.getOperand(0);
+  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
+
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
+  // Assume scalar op for initialization; update for vector if needed.
+  // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
+  // generate a 16-byte vector constant and logic op even for the scalar case.
+  // Using a 16-byte mask allows folding the load of the mask with
+  // the logic op, so it can save (~4 bytes) on code size.
   MVT EltVT = VT;
   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
+  // decide if we should generate a 16-byte constant mask when we only need 4 or
+  // 8 bytes for the scalar case.
   if (VT.isVector()) {
     EltVT = VT.getVectorElementType();
     NumElts = VT.getVectorNumElements();
   }
-  Constant *C;
-  if (EltVT == MVT::f64)
-    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
-                                          APInt(64, ~(1ULL << 63))));
-  else
-    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
-                                          APInt(32, ~(1U << 31))));
+  
+  unsigned EltBits = EltVT.getSizeInBits();
+  LLVMContext *Context = DAG.getContext();
+  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
+  APInt MaskElt =
+    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
+  Constant *C = ConstantInt::get(*Context, MaskElt);
   C = ConstantVector::getSplat(NumElts, C);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
@@ -11391,51 +14247,24 @@
   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, Alignment);
-  if (VT.isVector()) {
-    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
-    return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getNode(ISD::AND, dl, ANDVT,
-                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
-                                               Op.getOperand(0)),
-                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
-  }
-  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
-}
 
-static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) {
-  LLVMContext *Context = DAG.getContext();
-  SDLoc dl(Op);
-  MVT VT = Op.getSimpleValueType();
-  MVT EltVT = VT;
-  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   if (VT.isVector()) {
-    EltVT = VT.getVectorElementType();
-    NumElts = VT.getVectorNumElements();
-  }
-  Constant *C;
-  if (EltVT == MVT::f64)
-    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
-                                          APInt(64, 1ULL << 63)));
-  else
-    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
-                                          APInt(32, 1U << 31)));
-  C = ConstantVector::getSplat(NumElts, C);
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
-  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
-                             MachinePointerInfo::getConstantPool(),
-                             false, false, false, Alignment);
-  if (VT.isVector()) {
-    MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
+    // For a vector, cast operands to a vector type, perform the logic op,
+    // and cast the result back to the original value type.
+    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
+    SDValue Operand = IsFNABS ?
+      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
+      DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
+    unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
     return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getNode(ISD::XOR, dl, XORVT,
-                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
-                                               Op.getOperand(0)),
-                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
+                       DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
   }
-
-  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+  
+  // If not vector, then scalar.
+  unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
+  return DAG.getNode(BitOp, dl, VT, Operand, Mask);
 }
 
 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
@@ -11529,8 +14358,7 @@
   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
 }
 
-// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
-//
+// Check whether an OR'd tree is PTEST-able.
 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
@@ -11938,6 +14766,66 @@
   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
 }
 
+/// Build an X86ISD::FRSQRT estimate. Minimum architected relative accuracy is
+/// 2^-12, so one Newton-Raphson step yields a good float result (24 bits).
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps,
+                                            bool &UseOneConstNR) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor and/or sqrt operand.
+  if (!Subtarget->useSqrtEst())
+    return SDValue(); // Subtarget does not want sqrt estimates.
+
+  EVT VT = Op.getValueType();
+  
+  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
+  // instructions: convert to single, rsqrtss, convert back to double, refine
+  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+    RefinementSteps = 1; // One step, per the accuracy note above.
+    UseOneConstNR = false;
+    return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
+  }
+  return SDValue(); // Unsupported type/feature combination.
+}
+
+/// The minimum architected relative accuracy is 2^-12, so one Newton-Raphson
+/// step gives 24 bits; the count comes from ReciprocalEstimateRefinementSteps.
+SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
+                                            DAGCombinerInfo &DCI,
+                                            unsigned &RefinementSteps) const {
+  // FIXME: We should use instruction latency models to calculate the cost of
+  // each potential sequence, but this is very hard to do reliably because
+  // at least Intel's Core* chips have variable timing based on the number of
+  // significant digits in the divisor.
+  if (!Subtarget->useReciprocalEst())
+    return SDValue(); // Subtarget does not want reciprocal estimates.
+  
+  EVT VT = Op.getValueType();
+  
+  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
+  // TODO: Add support for AVX512 (v16f32).
+  // It is likely not profitable to do this for f64 because a double-precision
+  // reciprocal estimate with refinement on x86 prior to FMA requires
+  // 15 instructions: convert to single, rcpss, convert back to double, refine
+  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
+  // along with FMA, this could be a throughput win.
+  if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
+      (Subtarget->hasAVX() && VT == MVT::v8f32)) {
+    RefinementSteps = ReciprocalEstimateRefinementSteps;
+    return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
+  }
+  return SDValue(); // Unsupported type/feature combination.
+}
+
 static bool isAllOnes(SDValue V) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   return C && C->isAllOnesValue();
@@ -12097,7 +14985,7 @@
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
-  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
+  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
          Op.getValueType().getScalarType() == MVT::i1 &&
          "Cannot set masked compare for this operation");
 
@@ -12211,11 +15099,12 @@
   EVT OpVT = Op1.getValueType();
   if (Subtarget->hasAVX512()) {
     if (Op1.getValueType().is512BitVector() ||
+        (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
 
     // In AVX-512 architecture setcc returns mask with i1 elements,
-    // But there is no compare instruction for i8 and i16 elements.
+    // But there is no compare instruction for i8 and i16 elements in KNL.
     // We are not talking about 512-bit operands in this case, these
     // types are illegal.
     if (MaskResult &&
@@ -12721,18 +15610,40 @@
   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
 }
 
-static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
   MVT VT = Op->getSimpleValueType(0);
   SDValue In = Op->getOperand(0);
   MVT InVT = In.getSimpleValueType();
+  MVT VTElt = VT.getVectorElementType();
+  MVT InVTElt = InVT.getVectorElementType();
   SDLoc dl(Op);
 
+  // SKX processor
+  if ((InVTElt == MVT::i1) &&
+      (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
+        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
+
+       ((Subtarget->hasBWI() && VT.is512BitVector() &&
+        VTElt.getSizeInBits() <= 16)) ||
+
+       ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
+        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+    
+       ((Subtarget->hasDQI() && VT.is512BitVector() &&
+        VTElt.getSizeInBits() >= 32))))
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+    
   unsigned int NumElts = VT.getVectorNumElements();
+
   if (NumElts != 8 && NumElts != 16)
     return SDValue();
 
-  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
+    if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
+      return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+  }
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
@@ -12760,7 +15671,7 @@
   SDLoc dl(Op);
 
   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
-    return LowerSIGN_EXTEND_AVX512(Op, DAG);
+    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
 
   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
@@ -12803,6 +15714,210 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
 
+// Lower vector extended loads using a shuffle. If SSSE3 is not available we
+// may emit an illegal shuffle but the expansion is still better than scalar
+// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
+// we'll emit a shuffle and an arithmetic shift.
+// TODO: It is possible to support ZExt by zeroing the undef values during
+// the shuffle phase or after the shuffle.
+static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
+  MVT RegVT = Op.getSimpleValueType();
+  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
+  assert(RegVT.isInteger() &&
+         "We only custom lower integer vector sext loads.");
+
+  // Nothing useful we can do without SSE2 shuffles.
+  assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
+
+  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
+  SDLoc dl(Ld);
+  EVT MemVT = Ld->getMemoryVT();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned RegSz = RegVT.getSizeInBits();
+
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+
+  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
+         && "Only anyext and sext are currently implemented.");
+  assert(MemVT != RegVT && "Cannot extend to the same type");
+  assert(MemVT.isVector() && "Must load a vector from memory");
+
+  unsigned NumElems = RegVT.getVectorNumElements();
+  unsigned MemSz = MemVT.getSizeInBits();
+  assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
+    // The only way in which we have a legal 256-bit vector result but not the
+    // integer 256-bit operations needed to directly lower a sextload is if we
+    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
+    // a 128-bit vector and a normal sign_extend to 256-bits that should get
+    // correctly legalized. We do this late to allow the canonical form of
+    // sextload to persist throughout the rest of the DAG combiner -- it wants
+    // to fold together any extensions it can, and so will fuse a sign_extend
+    // of an sextload into a sextload targeting a wider value.
+    SDValue Load;
+    if (MemSz == 128) {
+      // Just switch this to a normal load.
+      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
+                                       "it must be a legal 128-bit vector "
+                                       "type!");
+      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
+                  Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
+                  Ld->isInvariant(), Ld->getAlignment());
+    } else {
+      assert(MemSz < 128 &&
+             "Can't extend a type wider than 128 bits to a 256 bit vector!");
+      // Do an sext load to a 128-bit vector type. We want to use the same
+      // number of elements, but elements half as wide. This will end up being
+      // recursively lowered by this routine, but will succeed as we definitely
+      // have all the necessary features if we're using AVX1.
+      EVT HalfEltVT =
+          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
+      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
+      Load =
+          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
+                         Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
+                         Ld->isNonTemporal(), Ld->isInvariant(),
+                         Ld->getAlignment());
+    }
+
+    // Replace chain users with the new chain.
+    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
+
+    // Finally, do a normal sign-extend to the desired register.
+    return DAG.getSExtOrTrunc(Load, dl, RegVT);
+  }
+
+  // All sizes must be a power of two.
+  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
+         "Non-power-of-two elements are not custom lowered!");
+
+  // Attempt to load the original value using scalar loads.
+  // Find the largest scalar type that divides the total loaded size.
+  MVT SclrLoadTy = MVT::i8;
+  for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+       tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+    MVT Tp = (MVT::SimpleValueType)tp;
+    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+      SclrLoadTy = Tp;
+    }
+  }
+
+  // On 32-bit systems, we can't load 64-bit integers. Try using f64 instead.
+  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+      (64 <= MemSz))
+    SclrLoadTy = MVT::f64;
+
+  // Calculate the number of scalar loads that we need to perform
+  // in order to load our vector from memory.
+  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+
+  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
+         "Can only lower sext loads with a single scalar load!");
+
+  unsigned loadRegZize = RegSz;
+  if (Ext == ISD::SEXTLOAD && RegSz == 256)
+    loadRegZize /= 2;
+
+  // Represent our vector as a sequence of elements which are the
+  // largest scalar that we can load.
+  EVT LoadUnitVecVT = EVT::getVectorVT(
+      *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
+
+  // Represent the data using the same element type that is stored in
+  // memory. In practice, we "widen" MemVT.
+  EVT WideVecVT =
+      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       loadRegZize / MemVT.getScalarType().getSizeInBits());
+
+  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+         "Invalid vector type");
+
+  // We can't shuffle using an illegal type.
+  assert(TLI.isTypeLegal(WideVecVT) &&
+         "We only lower types that form legal widened vector types");
+
+  SmallVector<SDValue, 8> Chains;
+  SDValue Ptr = Ld->getBasePtr();
+  SDValue Increment =
+      DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
+  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    // Perform a single load. NOTE(review): pointer info/alignment not offset.
+    SDValue ScalarLoad =
+        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
+                    Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
+                    Ld->getAlignment());
+    Chains.push_back(ScalarLoad.getValue(1));
+    // Create the first element using SCALAR_TO_VECTOR in order to avoid
+    // another round of DAGCombining.
+    if (i == 0)
+      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+    else
+      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+                        ScalarLoad, DAG.getIntPtrConstant(i));
+
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+  }
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+
+  // Bitcast the loaded value to a vector of the original element type, in
+  // the size of the target vector type.
+  SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
+  unsigned SizeRatio = RegSz / MemSz;
+
+  if (Ext == ISD::SEXTLOAD) {
+    // If we have SSE4.1, we can directly emit a VSEXT node.
+    if (Subtarget->hasSSE41()) {
+      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+      return Sext;
+    }
+
+    // Otherwise we'll shuffle the small elements in the high bits of the
+    // larger type and perform an arithmetic shift. If the shift is not legal
+    // it's better to scalarize.
+    assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
+           "We can't implement a sext load without an arithmetic right shift!");
+
+    // Redistribute the loaded elements into the different locations.
+    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
+
+    SDValue Shuff = DAG.getVectorShuffle(
+        WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+
+    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+    // Build the arithmetic shift.
+    unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+                   MemVT.getVectorElementType().getSizeInBits();
+    Shuff =
+        DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
+
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+    return Shuff;
+  }
+
+  // Redistribute the loaded elements into the different locations.
+  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i != NumElems; ++i)
+    ShuffleVec[i * SizeRatio] = i;
+
+  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                       DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
+
+  // Bitcast to the requested type.
+  Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+  return Shuff;
+}
+
 // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
 // from the AND / OR.
@@ -13108,7 +16223,7 @@
 }
 
 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
-// Calls to _alloca is needed to probe the stack when allocating more than 4k
+// Calls to _alloca are needed to probe the stack when allocating more than 4k
 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
 // that the guard pages used by the OS virtual memory manager are allocated in
 // correct sequence.
@@ -13143,7 +16258,7 @@
     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
     Chain = SP.getValue(1);
     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
-    const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering();
+    const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
     unsigned StackAlign = TFI.getStackAlignment();
     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
     if (Align > StackAlign)
@@ -13166,7 +16281,7 @@
   EVT VT = Op.getNode()->getValueType(0);
 
   bool Is64Bit = Subtarget->is64Bit();
-  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
+  EVT SPTy = getPointerTy();
 
   if (SplitStack) {
     MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -13184,7 +16299,7 @@
     }
 
     const TargetRegisterClass *AddrRegClass =
-      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+      getRegClassFor(getPointerTy());
     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
@@ -13193,7 +16308,7 @@
     return DAG.getMergeValues(Ops1, dl);
   } else {
     SDValue Flag;
-    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+    const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
 
     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
     Flag = Chain.getValue(1);
@@ -13201,8 +16316,8 @@
 
     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
 
-    const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+        DAG.getSubtarget().getRegisterInfo());
     unsigned SPReg = RegInfo->getStackRegister();
     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
     Chain = SP.getValue(1);
@@ -13475,112 +16590,178 @@
   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
 }
 
-static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
-  SDLoc dl(Op);
-  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  switch (IntNo) {
-  default: return SDValue();    // Don't custom lower most intrinsics.
-  // Comparison intrinsics.
-  case Intrinsic::x86_sse_comieq_ss:
-  case Intrinsic::x86_sse_comilt_ss:
-  case Intrinsic::x86_sse_comile_ss:
-  case Intrinsic::x86_sse_comigt_ss:
-  case Intrinsic::x86_sse_comige_ss:
-  case Intrinsic::x86_sse_comineq_ss:
-  case Intrinsic::x86_sse_ucomieq_ss:
-  case Intrinsic::x86_sse_ucomilt_ss:
-  case Intrinsic::x86_sse_ucomile_ss:
-  case Intrinsic::x86_sse_ucomigt_ss:
-  case Intrinsic::x86_sse_ucomige_ss:
-  case Intrinsic::x86_sse_ucomineq_ss:
-  case Intrinsic::x86_sse2_comieq_sd:
-  case Intrinsic::x86_sse2_comilt_sd:
-  case Intrinsic::x86_sse2_comile_sd:
-  case Intrinsic::x86_sse2_comigt_sd:
-  case Intrinsic::x86_sse2_comige_sd:
-  case Intrinsic::x86_sse2_comineq_sd:
-  case Intrinsic::x86_sse2_ucomieq_sd:
-  case Intrinsic::x86_sse2_ucomilt_sd:
-  case Intrinsic::x86_sse2_ucomile_sd:
-  case Intrinsic::x86_sse2_ucomigt_sd:
-  case Intrinsic::x86_sse2_ucomige_sd:
-  case Intrinsic::x86_sse2_ucomineq_sd: {
-    unsigned Opc;
-    ISD::CondCode CC;
+/// \brief Return (and \p Op, \p Mask) for compare instructions or
+/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
+/// necessary casting for \p Mask when lowering masking intrinsics.
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
+    EVT VT = Op.getValueType();
+    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
+                                  MVT::i1, VT.getVectorNumElements());
+    EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     Mask.getValueType().getSizeInBits());
+    SDLoc dl(Op);
+
+    assert(MaskVT.isSimple() && "invalid mask type");
+
+    if (isAllOnes(Mask))
+      return Op; // All-ones mask: nothing to mask off.
+
+    // When MaskVT is v2i1 or v4i1, only the low 2 or 4 elements of the
+    // bitcast mask are extracted by EXTRACT_SUBVECTOR.
+    SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+                              DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
+                              DAG.getIntPtrConstant(0));
+
+    switch (Op.getOpcode()) {
+      default: break;
+      case X86ISD::PCMPEQM:
+      case X86ISD::PCMPGTM:
+      case X86ISD::CMPM:
+      case X86ISD::CMPMU:
+        return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
+    }
+    if (PreservedSrc.getOpcode() == ISD::UNDEF) // Masked-off lanes become 0.
+      PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+    return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
+}
+
+static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_sse_comieq_ss:
-    case Intrinsic::x86_sse2_comieq_sd:
-      Opc = X86ISD::COMI;
-      CC = ISD::SETEQ;
-      break;
-    case Intrinsic::x86_sse_comilt_ss:
-    case Intrinsic::x86_sse2_comilt_sd:
-      Opc = X86ISD::COMI;
-      CC = ISD::SETLT;
-      break;
-    case Intrinsic::x86_sse_comile_ss:
-    case Intrinsic::x86_sse2_comile_sd:
-      Opc = X86ISD::COMI;
-      CC = ISD::SETLE;
-      break;
-    case Intrinsic::x86_sse_comigt_ss:
-    case Intrinsic::x86_sse2_comigt_sd:
-      Opc = X86ISD::COMI;
-      CC = ISD::SETGT;
-      break;
-    case Intrinsic::x86_sse_comige_ss:
-    case Intrinsic::x86_sse2_comige_sd:
-      Opc = X86ISD::COMI;
-      CC = ISD::SETGE;
-      break;
-    case Intrinsic::x86_sse_comineq_ss:
-    case Intrinsic::x86_sse2_comineq_sd:
-      Opc = X86ISD::COMI;
-      CC = ISD::SETNE;
-      break;
-    case Intrinsic::x86_sse_ucomieq_ss:
-    case Intrinsic::x86_sse2_ucomieq_sd:
-      Opc = X86ISD::UCOMI;
-      CC = ISD::SETEQ;
-      break;
-    case Intrinsic::x86_sse_ucomilt_ss:
-    case Intrinsic::x86_sse2_ucomilt_sd:
-      Opc = X86ISD::UCOMI;
-      CC = ISD::SETLT;
-      break;
-    case Intrinsic::x86_sse_ucomile_ss:
-    case Intrinsic::x86_sse2_ucomile_sd:
-      Opc = X86ISD::UCOMI;
-      CC = ISD::SETLE;
-      break;
-    case Intrinsic::x86_sse_ucomigt_ss:
-    case Intrinsic::x86_sse2_ucomigt_sd:
-      Opc = X86ISD::UCOMI;
-      CC = ISD::SETGT;
-      break;
-    case Intrinsic::x86_sse_ucomige_ss:
-    case Intrinsic::x86_sse2_ucomige_sd:
-      Opc = X86ISD::UCOMI;
-      CC = ISD::SETGE;
-      break;
-    case Intrinsic::x86_sse_ucomineq_ss:
-    case Intrinsic::x86_sse2_ucomineq_sd:
-      Opc = X86ISD::UCOMI;
-      CC = ISD::SETNE;
+    case Intrinsic::x86_fma_vfmadd_ps:
+    case Intrinsic::x86_fma_vfmadd_pd:
+    case Intrinsic::x86_fma_vfmadd_ps_256:
+    case Intrinsic::x86_fma_vfmadd_pd_256:
+    case Intrinsic::x86_fma_mask_vfmadd_ps_512:
+    case Intrinsic::x86_fma_mask_vfmadd_pd_512:
+      return X86ISD::FMADD;
+    case Intrinsic::x86_fma_vfmsub_ps:
+    case Intrinsic::x86_fma_vfmsub_pd:
+    case Intrinsic::x86_fma_vfmsub_ps_256:
+    case Intrinsic::x86_fma_vfmsub_pd_256:
+    case Intrinsic::x86_fma_mask_vfmsub_ps_512:
+    case Intrinsic::x86_fma_mask_vfmsub_pd_512:
+      return X86ISD::FMSUB;
+    case Intrinsic::x86_fma_vfnmadd_ps:
+    case Intrinsic::x86_fma_vfnmadd_pd:
+    case Intrinsic::x86_fma_vfnmadd_ps_256:
+    case Intrinsic::x86_fma_vfnmadd_pd_256:
+    case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
+    case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
+      return X86ISD::FNMADD;
+    case Intrinsic::x86_fma_vfnmsub_ps:
+    case Intrinsic::x86_fma_vfnmsub_pd:
+    case Intrinsic::x86_fma_vfnmsub_ps_256:
+    case Intrinsic::x86_fma_vfnmsub_pd_256:
+    case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
+    case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
+      return X86ISD::FNMSUB;
+    case Intrinsic::x86_fma_vfmaddsub_ps:
+    case Intrinsic::x86_fma_vfmaddsub_pd:
+    case Intrinsic::x86_fma_vfmaddsub_ps_256:
+    case Intrinsic::x86_fma_vfmaddsub_pd_256:
+    case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
+    case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
+      return X86ISD::FMADDSUB;
+    case Intrinsic::x86_fma_vfmsubadd_ps:
+    case Intrinsic::x86_fma_vfmsubadd_pd:
+    case Intrinsic::x86_fma_vfmsubadd_ps_256:
+    case Intrinsic::x86_fma_vfmsubadd_pd_256:
+    case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
+    case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
+      return X86ISD::FMSUBADD;
+    }
+}
+
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  EVT VT = Op.getValueType();
+  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+  if (IntrData) {
+    switch(IntrData->Type) {
+    case INTR_TYPE_1OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+    case INTR_TYPE_2OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+        Op.getOperand(2));
+    case INTR_TYPE_3OP:
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
+        Op.getOperand(2), Op.getOperand(3));
+    case INTR_TYPE_1OP_MASK_RM: {
+      SDValue Src = Op.getOperand(1);
+      SDValue Src0 = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+      SDValue RoundingMode = Op.getOperand(4);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
+                                              RoundingMode),
+                                  Mask, Src0, Subtarget, DAG);
+    }
+                                              
+    case CMP_MASK:
+    case CMP_MASK_CC: {
+      // Comparison intrinsics with masks.
+      // Example of transformation:
+      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
+      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
+      // (i8 (bitcast
+      //   (v8i1 (insert_subvector undef,
+      //           (v2i1 (and (PCMPEQM %a, %b),
+      //                      (extract_subvector
+      //                         (v8i1 (bitcast %mask)), 0))), 0))))
+      EVT VT = Op.getOperand(1).getValueType();
+      EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                    VT.getVectorNumElements());
+      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
+      EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                       Mask.getValueType().getSizeInBits());
+      SDValue Cmp;
+      if (IntrData->Type == CMP_MASK_CC) {
+        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                    Op.getOperand(2), Op.getOperand(3));
+      } else {
+        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
+        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
+                    Op.getOperand(2));
+      }
+      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
+                                             DAG.getTargetConstant(0, MaskVT),
+                                             Subtarget, DAG);
+      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
+                                DAG.getUNDEF(BitcastVT), CmpMask,
+                                DAG.getIntPtrConstant(0));
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+    case COMI: { // Comparison intrinsics
+      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
+      SDValue LHS = Op.getOperand(1);
+      SDValue RHS = Op.getOperand(2);
+      unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
+      assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
+      SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
+      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                  DAG.getConstant(X86CC, MVT::i8), Cond);
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+    }
+    case VSHIFT:
+      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
+                                 Op.getOperand(1), Op.getOperand(2), DAG);
+    case VSHIFT_MASK:
+      return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
+                                                      Op.getOperand(1), Op.getOperand(2), DAG),
+                                  Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);;
+    default:
       break;
     }
-
-    SDValue LHS = Op.getOperand(1);
-    SDValue RHS = Op.getOperand(2);
-    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
-    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
-    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
-    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
-                                DAG.getConstant(X86CC, MVT::i8), Cond);
-    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
 
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+
   // Arithmetic intrinsics.
   case Intrinsic::x86_sse2_pmulu_dq:
   case Intrinsic::x86_avx2_pmulu_dq:
@@ -13602,128 +16783,6 @@
     return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
 
-  // SSE2/AVX2 sub with unsigned saturation intrinsics
-  case Intrinsic::x86_sse2_psubus_b:
-  case Intrinsic::x86_sse2_psubus_w:
-  case Intrinsic::x86_avx2_psubus_b:
-  case Intrinsic::x86_avx2_psubus_w:
-    return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-
-  // SSE3/AVX horizontal add/sub intrinsics
-  case Intrinsic::x86_sse3_hadd_ps:
-  case Intrinsic::x86_sse3_hadd_pd:
-  case Intrinsic::x86_avx_hadd_ps_256:
-  case Intrinsic::x86_avx_hadd_pd_256:
-  case Intrinsic::x86_sse3_hsub_ps:
-  case Intrinsic::x86_sse3_hsub_pd:
-  case Intrinsic::x86_avx_hsub_ps_256:
-  case Intrinsic::x86_avx_hsub_pd_256:
-  case Intrinsic::x86_ssse3_phadd_w_128:
-  case Intrinsic::x86_ssse3_phadd_d_128:
-  case Intrinsic::x86_avx2_phadd_w:
-  case Intrinsic::x86_avx2_phadd_d:
-  case Intrinsic::x86_ssse3_phsub_w_128:
-  case Intrinsic::x86_ssse3_phsub_d_128:
-  case Intrinsic::x86_avx2_phsub_w:
-  case Intrinsic::x86_avx2_phsub_d: {
-    unsigned Opcode;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_sse3_hadd_ps:
-    case Intrinsic::x86_sse3_hadd_pd:
-    case Intrinsic::x86_avx_hadd_ps_256:
-    case Intrinsic::x86_avx_hadd_pd_256:
-      Opcode = X86ISD::FHADD;
-      break;
-    case Intrinsic::x86_sse3_hsub_ps:
-    case Intrinsic::x86_sse3_hsub_pd:
-    case Intrinsic::x86_avx_hsub_ps_256:
-    case Intrinsic::x86_avx_hsub_pd_256:
-      Opcode = X86ISD::FHSUB;
-      break;
-    case Intrinsic::x86_ssse3_phadd_w_128:
-    case Intrinsic::x86_ssse3_phadd_d_128:
-    case Intrinsic::x86_avx2_phadd_w:
-    case Intrinsic::x86_avx2_phadd_d:
-      Opcode = X86ISD::HADD;
-      break;
-    case Intrinsic::x86_ssse3_phsub_w_128:
-    case Intrinsic::x86_ssse3_phsub_d_128:
-    case Intrinsic::x86_avx2_phsub_w:
-    case Intrinsic::x86_avx2_phsub_d:
-      Opcode = X86ISD::HSUB;
-      break;
-    }
-    return DAG.getNode(Opcode, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-  }
-
-  // SSE2/SSE41/AVX2 integer max/min intrinsics.
-  case Intrinsic::x86_sse2_pmaxu_b:
-  case Intrinsic::x86_sse41_pmaxuw:
-  case Intrinsic::x86_sse41_pmaxud:
-  case Intrinsic::x86_avx2_pmaxu_b:
-  case Intrinsic::x86_avx2_pmaxu_w:
-  case Intrinsic::x86_avx2_pmaxu_d:
-  case Intrinsic::x86_sse2_pminu_b:
-  case Intrinsic::x86_sse41_pminuw:
-  case Intrinsic::x86_sse41_pminud:
-  case Intrinsic::x86_avx2_pminu_b:
-  case Intrinsic::x86_avx2_pminu_w:
-  case Intrinsic::x86_avx2_pminu_d:
-  case Intrinsic::x86_sse41_pmaxsb:
-  case Intrinsic::x86_sse2_pmaxs_w:
-  case Intrinsic::x86_sse41_pmaxsd:
-  case Intrinsic::x86_avx2_pmaxs_b:
-  case Intrinsic::x86_avx2_pmaxs_w:
-  case Intrinsic::x86_avx2_pmaxs_d:
-  case Intrinsic::x86_sse41_pminsb:
-  case Intrinsic::x86_sse2_pmins_w:
-  case Intrinsic::x86_sse41_pminsd:
-  case Intrinsic::x86_avx2_pmins_b:
-  case Intrinsic::x86_avx2_pmins_w:
-  case Intrinsic::x86_avx2_pmins_d: {
-    unsigned Opcode;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_sse2_pmaxu_b:
-    case Intrinsic::x86_sse41_pmaxuw:
-    case Intrinsic::x86_sse41_pmaxud:
-    case Intrinsic::x86_avx2_pmaxu_b:
-    case Intrinsic::x86_avx2_pmaxu_w:
-    case Intrinsic::x86_avx2_pmaxu_d:
-      Opcode = X86ISD::UMAX;
-      break;
-    case Intrinsic::x86_sse2_pminu_b:
-    case Intrinsic::x86_sse41_pminuw:
-    case Intrinsic::x86_sse41_pminud:
-    case Intrinsic::x86_avx2_pminu_b:
-    case Intrinsic::x86_avx2_pminu_w:
-    case Intrinsic::x86_avx2_pminu_d:
-      Opcode = X86ISD::UMIN;
-      break;
-    case Intrinsic::x86_sse41_pmaxsb:
-    case Intrinsic::x86_sse2_pmaxs_w:
-    case Intrinsic::x86_sse41_pmaxsd:
-    case Intrinsic::x86_avx2_pmaxs_b:
-    case Intrinsic::x86_avx2_pmaxs_w:
-    case Intrinsic::x86_avx2_pmaxs_d:
-      Opcode = X86ISD::SMAX;
-      break;
-    case Intrinsic::x86_sse41_pminsb:
-    case Intrinsic::x86_sse2_pmins_w:
-    case Intrinsic::x86_sse41_pminsd:
-    case Intrinsic::x86_avx2_pmins_b:
-    case Intrinsic::x86_avx2_pmins_w:
-    case Intrinsic::x86_avx2_pmins_d:
-      Opcode = X86ISD::SMIN;
-      break;
-    }
-    return DAG.getNode(Opcode, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-  }
-
   // SSE/SSE2/AVX floating point max/min intrinsics.
   case Intrinsic::x86_sse_max_ps:
   case Intrinsic::x86_sse2_max_pd:
@@ -13828,17 +16887,6 @@
     return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
 
-  case Intrinsic::x86_sse41_insertps:
-    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
-  case Intrinsic::x86_avx_vperm2f128_ps_256:
-  case Intrinsic::x86_avx_vperm2f128_pd_256:
-  case Intrinsic::x86_avx_vperm2f128_si_256:
-  case Intrinsic::x86_avx2_vperm2i128:
-    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
   case Intrinsic::x86_avx2_permd:
   case Intrinsic::x86_avx2_permps:
     // Operands intentionally swapped. Mask is last operand to intrinsic,
@@ -13846,11 +16894,15 @@
     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                        Op.getOperand(2), Op.getOperand(1));
 
-  case Intrinsic::x86_sse_sqrt_ps:
-  case Intrinsic::x86_sse2_sqrt_pd:
-  case Intrinsic::x86_avx_sqrt_ps_256:
-  case Intrinsic::x86_avx_sqrt_pd_256:
-    return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
+  case Intrinsic::x86_avx512_mask_valign_q_512:
+  case Intrinsic::x86_avx512_mask_valign_d_512:
+    // Vector source operands are swapped.
+    return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
+                                            Op.getValueType(), Op.getOperand(2),
+                                            Op.getOperand(1),
+                                            Op.getOperand(3)),
+                                Op.getOperand(5), Op.getOperand(4),
+                                Subtarget, DAG);
 
   // ptest and testp intrinsics. The intrinsic these come from are designed to
   // return an integer value, not just an instruction so lower it to the ptest
@@ -13928,100 +16980,6 @@
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
 
-  // SSE/AVX shift intrinsics
-  case Intrinsic::x86_sse2_psll_w:
-  case Intrinsic::x86_sse2_psll_d:
-  case Intrinsic::x86_sse2_psll_q:
-  case Intrinsic::x86_avx2_psll_w:
-  case Intrinsic::x86_avx2_psll_d:
-  case Intrinsic::x86_avx2_psll_q:
-  case Intrinsic::x86_sse2_psrl_w:
-  case Intrinsic::x86_sse2_psrl_d:
-  case Intrinsic::x86_sse2_psrl_q:
-  case Intrinsic::x86_avx2_psrl_w:
-  case Intrinsic::x86_avx2_psrl_d:
-  case Intrinsic::x86_avx2_psrl_q:
-  case Intrinsic::x86_sse2_psra_w:
-  case Intrinsic::x86_sse2_psra_d:
-  case Intrinsic::x86_avx2_psra_w:
-  case Intrinsic::x86_avx2_psra_d: {
-    unsigned Opcode;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_sse2_psll_w:
-    case Intrinsic::x86_sse2_psll_d:
-    case Intrinsic::x86_sse2_psll_q:
-    case Intrinsic::x86_avx2_psll_w:
-    case Intrinsic::x86_avx2_psll_d:
-    case Intrinsic::x86_avx2_psll_q:
-      Opcode = X86ISD::VSHL;
-      break;
-    case Intrinsic::x86_sse2_psrl_w:
-    case Intrinsic::x86_sse2_psrl_d:
-    case Intrinsic::x86_sse2_psrl_q:
-    case Intrinsic::x86_avx2_psrl_w:
-    case Intrinsic::x86_avx2_psrl_d:
-    case Intrinsic::x86_avx2_psrl_q:
-      Opcode = X86ISD::VSRL;
-      break;
-    case Intrinsic::x86_sse2_psra_w:
-    case Intrinsic::x86_sse2_psra_d:
-    case Intrinsic::x86_avx2_psra_w:
-    case Intrinsic::x86_avx2_psra_d:
-      Opcode = X86ISD::VSRA;
-      break;
-    }
-    return DAG.getNode(Opcode, dl, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2));
-  }
-
-  // SSE/AVX immediate shift intrinsics
-  case Intrinsic::x86_sse2_pslli_w:
-  case Intrinsic::x86_sse2_pslli_d:
-  case Intrinsic::x86_sse2_pslli_q:
-  case Intrinsic::x86_avx2_pslli_w:
-  case Intrinsic::x86_avx2_pslli_d:
-  case Intrinsic::x86_avx2_pslli_q:
-  case Intrinsic::x86_sse2_psrli_w:
-  case Intrinsic::x86_sse2_psrli_d:
-  case Intrinsic::x86_sse2_psrli_q:
-  case Intrinsic::x86_avx2_psrli_w:
-  case Intrinsic::x86_avx2_psrli_d:
-  case Intrinsic::x86_avx2_psrli_q:
-  case Intrinsic::x86_sse2_psrai_w:
-  case Intrinsic::x86_sse2_psrai_d:
-  case Intrinsic::x86_avx2_psrai_w:
-  case Intrinsic::x86_avx2_psrai_d: {
-    unsigned Opcode;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_sse2_pslli_w:
-    case Intrinsic::x86_sse2_pslli_d:
-    case Intrinsic::x86_sse2_pslli_q:
-    case Intrinsic::x86_avx2_pslli_w:
-    case Intrinsic::x86_avx2_pslli_d:
-    case Intrinsic::x86_avx2_pslli_q:
-      Opcode = X86ISD::VSHLI;
-      break;
-    case Intrinsic::x86_sse2_psrli_w:
-    case Intrinsic::x86_sse2_psrli_d:
-    case Intrinsic::x86_sse2_psrli_q:
-    case Intrinsic::x86_avx2_psrli_w:
-    case Intrinsic::x86_avx2_psrli_d:
-    case Intrinsic::x86_avx2_psrli_q:
-      Opcode = X86ISD::VSRLI;
-      break;
-    case Intrinsic::x86_sse2_psrai_w:
-    case Intrinsic::x86_sse2_psrai_d:
-    case Intrinsic::x86_avx2_psrai_w:
-    case Intrinsic::x86_avx2_psrai_d:
-      Opcode = X86ISD::VSRAI;
-      break;
-    }
-    return getTargetVShiftNode(Opcode, dl, Op.getSimpleValueType(),
-                               Op.getOperand(1), Op.getOperand(2), DAG);
-  }
-
   case Intrinsic::x86_sse42_pcmpistria128:
   case Intrinsic::x86_sse42_pcmpestria128:
   case Intrinsic::x86_sse42_pcmpistric128:
@@ -14098,6 +17056,32 @@
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
     return DAG.getNode(Opcode, dl, VTs, NewOps);
   }
+
+  case Intrinsic::x86_fma_mask_vfmadd_ps_512:
+  case Intrinsic::x86_fma_mask_vfmadd_pd_512:
+  case Intrinsic::x86_fma_mask_vfmsub_ps_512:
+  case Intrinsic::x86_fma_mask_vfmsub_pd_512:
+  case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
+  case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
+  case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
+  case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
+  case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
+  case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
+  case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
+  case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
+    auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
+    if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
+      return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
+                                              dl, Op.getValueType(),
+                                              Op.getOperand(1),
+                                              Op.getOperand(2),
+                                              Op.getOperand(3)),
+                                  Op.getOperand(4), Op.getOperand(1),
+                                  Subtarget, DAG);
+    else
+      return SDValue();
+  }
+
   case Intrinsic::x86_fma_vfmadd_ps:
   case Intrinsic::x86_fma_vfmadd_pd:
   case Intrinsic::x86_fma_vfmsub_ps:
@@ -14122,74 +17106,8 @@
   case Intrinsic::x86_fma_vfmaddsub_pd_256:
   case Intrinsic::x86_fma_vfmsubadd_ps_256:
   case Intrinsic::x86_fma_vfmsubadd_pd_256:
-  case Intrinsic::x86_fma_vfmadd_ps_512:
-  case Intrinsic::x86_fma_vfmadd_pd_512:
-  case Intrinsic::x86_fma_vfmsub_ps_512:
-  case Intrinsic::x86_fma_vfmsub_pd_512:
-  case Intrinsic::x86_fma_vfnmadd_ps_512:
-  case Intrinsic::x86_fma_vfnmadd_pd_512:
-  case Intrinsic::x86_fma_vfnmsub_ps_512:
-  case Intrinsic::x86_fma_vfnmsub_pd_512:
-  case Intrinsic::x86_fma_vfmaddsub_ps_512:
-  case Intrinsic::x86_fma_vfmaddsub_pd_512:
-  case Intrinsic::x86_fma_vfmsubadd_ps_512:
-  case Intrinsic::x86_fma_vfmsubadd_pd_512: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_fma_vfmadd_ps:
-    case Intrinsic::x86_fma_vfmadd_pd:
-    case Intrinsic::x86_fma_vfmadd_ps_256:
-    case Intrinsic::x86_fma_vfmadd_pd_256:
-    case Intrinsic::x86_fma_vfmadd_ps_512:
-    case Intrinsic::x86_fma_vfmadd_pd_512:
-      Opc = X86ISD::FMADD;
-      break;
-    case Intrinsic::x86_fma_vfmsub_ps:
-    case Intrinsic::x86_fma_vfmsub_pd:
-    case Intrinsic::x86_fma_vfmsub_ps_256:
-    case Intrinsic::x86_fma_vfmsub_pd_256:
-    case Intrinsic::x86_fma_vfmsub_ps_512:
-    case Intrinsic::x86_fma_vfmsub_pd_512:
-      Opc = X86ISD::FMSUB;
-      break;
-    case Intrinsic::x86_fma_vfnmadd_ps:
-    case Intrinsic::x86_fma_vfnmadd_pd:
-    case Intrinsic::x86_fma_vfnmadd_ps_256:
-    case Intrinsic::x86_fma_vfnmadd_pd_256:
-    case Intrinsic::x86_fma_vfnmadd_ps_512:
-    case Intrinsic::x86_fma_vfnmadd_pd_512:
-      Opc = X86ISD::FNMADD;
-      break;
-    case Intrinsic::x86_fma_vfnmsub_ps:
-    case Intrinsic::x86_fma_vfnmsub_pd:
-    case Intrinsic::x86_fma_vfnmsub_ps_256:
-    case Intrinsic::x86_fma_vfnmsub_pd_256:
-    case Intrinsic::x86_fma_vfnmsub_ps_512:
-    case Intrinsic::x86_fma_vfnmsub_pd_512:
-      Opc = X86ISD::FNMSUB;
-      break;
-    case Intrinsic::x86_fma_vfmaddsub_ps:
-    case Intrinsic::x86_fma_vfmaddsub_pd:
-    case Intrinsic::x86_fma_vfmaddsub_ps_256:
-    case Intrinsic::x86_fma_vfmaddsub_pd_256:
-    case Intrinsic::x86_fma_vfmaddsub_ps_512:
-    case Intrinsic::x86_fma_vfmaddsub_pd_512:
-      Opc = X86ISD::FMADDSUB;
-      break;
-    case Intrinsic::x86_fma_vfmsubadd_ps:
-    case Intrinsic::x86_fma_vfmsubadd_pd:
-    case Intrinsic::x86_fma_vfmsubadd_ps_256:
-    case Intrinsic::x86_fma_vfmsubadd_pd_256:
-    case Intrinsic::x86_fma_vfmsubadd_ps_512:
-    case Intrinsic::x86_fma_vfmsubadd_pd_512:
-      Opc = X86ISD::FMSUBADD;
-      break;
-    }
-
-    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
-                       Op.getOperand(2), Op.getOperand(3));
-  }
+    return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   }
 }
 
@@ -14374,122 +17292,25 @@
   return DAG.getMergeValues(Results, DL);
 }
 
-enum IntrinsicType {
-  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST
-};
-
-struct IntrinsicData {
-  IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1)
-    :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
-  IntrinsicType Type;
-  unsigned      Opc0;
-  unsigned      Opc1;
-};
-
-std::map < unsigned, IntrinsicData> IntrMap;
-static void InitIntinsicsMap() {
-  static bool Initialized = false;
-  if (Initialized) 
-    return;
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
-                                IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
-                                IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512,
-                                IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512,
-                                IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512,
-                                IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, 
-                                IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, 
-                                IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, 
-                                IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, 
-                                IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0)));
-
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512,
-                                IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, 
-                                IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, 
-                                IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, 
-                                IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, 
-                                IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, 
-                                IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, 
-                                IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, 
-                                IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0)));
-   
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, 
-                                IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm,
-                                                        X86::VGATHERPF1QPSm)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, 
-                                IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm,
-                                                        X86::VGATHERPF1QPDm)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, 
-                                IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm,
-                                                        X86::VGATHERPF1DPDm)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, 
-                                IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm,
-                                                        X86::VGATHERPF1DPSm)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512, 
-                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
-                                                        X86::VSCATTERPF1QPSm)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512, 
-                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
-                                                        X86::VSCATTERPF1QPDm)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512, 
-                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
-                                                        X86::VSCATTERPF1DPDm)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512, 
-                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
-                                                        X86::VSCATTERPF1DPSm)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,
-                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
-                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
-                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,
-                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
-                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
-                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
-                                IntrinsicData(XTEST,  X86ISD::XTEST,  0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,
-                                IntrinsicData(RDTSC,  X86ISD::RDTSC_DAG, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
-                                IntrinsicData(RDTSC,  X86ISD::RDTSCP_DAG, 0)));
-  IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc,
-                                IntrinsicData(RDPMC,  X86ISD::RDPMC_DAG, 0)));
-  Initialized = true;
-}
 
 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
-  InitIntinsicsMap();
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
-  if (itr == IntrMap.end())
+
+  const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
+  if (!IntrData)
     return SDValue();
 
   SDLoc dl(Op);
-  IntrinsicData Intr = itr->second;
-  switch(Intr.Type) {
+  switch(IntrData->Type) {
+  default:
+    llvm_unreachable("Unknown Intrinsic Type");
+    break;    
   case RDSEED:
   case RDRAND: {
     // Emit the node with the right value type.
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
-    SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));
+    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
 
     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
     // Otherwise return the value from Rand, which is always 0, casted to i32.
@@ -14513,7 +17334,7 @@
     SDValue Index = Op.getOperand(4);
     SDValue Mask  = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
-    return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
                           Subtarget);
   }
   case SCATTER: {
@@ -14524,7 +17345,7 @@
     SDValue Index = Op.getOperand(4);
     SDValue Src   = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
-    return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
   }
   case PREFETCH: {
     SDValue Hint = Op.getOperand(6);
@@ -14532,7 +17353,7 @@
     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
-    unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
+    unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
     SDValue Chain = Op.getOperand(0);
     SDValue Mask  = Op.getOperand(2);
     SDValue Index = Op.getOperand(3);
@@ -14543,7 +17364,7 @@
   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
   case RDTSC: {
     SmallVector<SDValue, 2> Results;
-    getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
+    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
     return DAG.getMergeValues(Results, dl);
   }
   // Read Performance Monitoring Counters.
@@ -14555,7 +17376,7 @@
   // XTEST intrinsics.
   case XTEST: {
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
-    SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
+    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                 DAG.getConstant(X86::COND_NE, MVT::i8),
                                 InTrans);
@@ -14563,8 +17384,26 @@
     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                        Ret, SDValue(InTrans.getNode(), 1));
   }
+  // ADC/ADCX/SBB
+  case ADX: {
+    SmallVector<SDValue, 2> Results;
+    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
+    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
+                                DAG.getConstant(-1, MVT::i8));
+    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
+                              Op.getOperand(4), GenCF.getValue(1));
+    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
+                                 Op.getOperand(5), MachinePointerInfo(),
+                                 false, false, 0);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                DAG.getConstant(X86::COND_B, MVT::i8),
+                                Res.getValue(1));
+    Results.push_back(SetCC);
+    Results.push_back(Store);
+    return DAG.getMergeValues(Results, dl);
   }
-  llvm_unreachable("Unknown Intrinsic Type");
+  }
 }
 
 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
@@ -14581,8 +17420,8 @@
 
   if (Depth > 0) {
     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
-    const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+        DAG.getSubtarget().getRegisterInfo());
     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                        DAG.getNode(ISD::ADD, dl, PtrVT,
@@ -14603,8 +17442,8 @@
   EVT VT = Op.getValueType();
   SDLoc dl(Op);  // FIXME probably not meaningful
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
           (FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -14632,8 +17471,8 @@
 
 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
 }
 
@@ -14644,8 +17483,8 @@
   SDLoc dl      (Op);
 
   EVT PtrVT = getPointerTy();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -14692,7 +17531,7 @@
   SDLoc dl (Op);
 
   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
-  const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
 
   if (Subtarget->is64Bit()) {
     SDValue OutChains[6];
@@ -14856,7 +17695,7 @@
 
   MachineFunction &MF = DAG.getMachineFunction();
   const TargetMachine &TM = MF.getTarget();
-  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
   unsigned StackAlignment = TFI.getStackAlignment();
   MVT VT = Op.getSimpleValueType();
   SDLoc DL(Op);
@@ -15156,10 +17995,23 @@
   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
          (VT == MVT::v8i32 && Subtarget->hasInt256()));
 
-  // Get the high parts.
+  // PMULxD operations multiply each even value (starting at 0) of LHS with
+  // the related value of RHS and produce a widened result.
+  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  //
+  // In other words, to have all the results, we need to perform two PMULxD:
+  // 1. one with the even values.
+  // 2. one with the odd values.
+  // To achieve #2, we need to place the odd values at an even position.
+  //
+  // Place the odd value at an even position (basically, shift all values 1
+  // step to the left):
   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
-  SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
-  SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+  // <a|b|c|d> => <b|undef|d|undef>
+  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+  // <e|f|g|h> => <f|undef|h|undef>
+  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
 
   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
   // ints.
@@ -15167,10 +18019,14 @@
   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
   unsigned Opcode =
       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+  // => <2 x i64> <bf|dh>
   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
 
   // Shuffle it back into the right order.
   SDValue Highs, Lows;
@@ -15200,7 +18056,10 @@
     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
   }
 
-  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
+  // The first result of MUL_LOHI is actually the low value, followed by the
+  // high value.
+  SDValue Ops[] = {Lows, Highs};
+  return DAG.getMergeValues(Ops, dl);
 }
 
 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
@@ -15811,10 +18670,15 @@
     Cond = X86::COND_B;
     break;
   case ISD::SMULO:
-    BaseOp = X86ISD::SMUL;
+    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
     Cond = X86::COND_O;
     break;
   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+    if (N->getValueType(0) == MVT::i8) {
+      BaseOp = X86ISD::UMUL8;
+      Cond = X86::COND_O;
+      break;
+    }
     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                  MVT::i32);
     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
@@ -15840,6 +18704,11 @@
   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
 }
 
+// Sign extension of the low part of vector elements. This may be used either
+// when sign extend instructions are not available or if the vector element
+// sizes already match the sign-extended size. If the vector elements are in
+// their pre-extended size and sign extend instructions are available, that will
+// be handled by LowerSIGN_EXTEND.
 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -15885,37 +18754,151 @@
     case MVT::v4i32:
     case MVT::v8i16: {
       SDValue Op0 = Op.getOperand(0);
-      SDValue Op00 = Op0.getOperand(0);
-      SDValue Tmp1;
-      // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
-      if (Op0.getOpcode() == ISD::BITCAST &&
-          Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
-        // (sext (vzext x)) -> (vsext x)
-        Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
-        if (Tmp1.getNode()) {
-          EVT ExtraEltVT = ExtraVT.getVectorElementType();
-          // This folding is only valid when the in-reg type is a vector of i8,
-          // i16, or i32.
-          if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
-              ExtraEltVT == MVT::i32) {
-            SDValue Tmp1Op0 = Tmp1.getOperand(0);
-            assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
-                   "This optimization is invalid without a VZEXT.");
-            return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
-          }
-          Op0 = Tmp1;
-        }
-      }
 
-      // If the above didn't work, then just use Shift-Left + Shift-Right.
-      Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
-                                        DAG);
-      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
+      // This is a sign extension of some low part of vector elements without
+      // changing the size of the vector elements themselves:
+      // Shift-Left + Shift-Right-Algebraic.
+      SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
+                                               BitsDiff, DAG);
+      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
                                         DAG);
     }
   }
 }
 
+/// Returns true if an atomic access to \p MemType has to go through cmpxchg8b
+/// or cmpxchg16b: a 64-bit type on a non-64-bit subtarget, or a 128-bit type
+/// when cmpxchg16b is available. Every other width returns false; callers use
+/// this to decide whether to expand (else __sync_fetch_and_... calls result).
+bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
+  const X86Subtarget &Subtarget =
+      getTargetMachine().getSubtarget<X86Subtarget>();
+  unsigned OpWidth = MemType->getPrimitiveSizeInBits();
+
+  if (OpWidth == 64)
+    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
+  else if (OpWidth == 128)
+    return Subtarget.hasCmpxchg16b();
+  else
+    return false;
+}
+
+bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  return needsCmpXchgNb(SI->getValueOperand()->getType()); // by stored type
+}
+
+// Note: this turns large loads (by pointee type) into lock cmpxchg8b/16b.
+// FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
+bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
+  return needsCmpXchgNb(PTy->getElementType());
+}
+
+bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  const X86Subtarget &Subtarget =
+      getTargetMachine().getSubtarget<X86Subtarget>();
+  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+  const Type *MemType = AI->getType();
+
+  // If the operand is wider than the native width, we must see if cmpxchg8/16b
+  // is available and default to library calls otherwise.
+  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
+    return needsCmpXchgNb(MemType);
+
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+  switch (Op) {
+  default:
+    llvm_unreachable("Unknown atomic operation");
+  case AtomicRMWInst::Xchg:
+  case AtomicRMWInst::Add:
+  case AtomicRMWInst::Sub:
+    // It's better to use xadd or xchg for these in all cases (x86 has no xsub).
+    return false;
+  case AtomicRMWInst::Or:
+  case AtomicRMWInst::And:
+  case AtomicRMWInst::Xor:
+    // If the atomicrmw's result isn't actually used, we can just add a "lock"
+    // prefix to a normal instruction for these operations; else expand.
+    return !AI->use_empty();
+  case AtomicRMWInst::Nand:
+  case AtomicRMWInst::Max:
+  case AtomicRMWInst::Min:
+  case AtomicRMWInst::UMax:
+  case AtomicRMWInst::UMin:
+    // These always require a non-trivial set of data operations on x86. We must
+    // use a cmpxchg loop.
+    return true;
+  }
+}
+
+static bool hasMFENCE(const X86Subtarget& Subtarget) {
+  // True when mfence may be emitted: we have SSE2 or we're on x86-64 (even if
+  // we asked for no-sse2). There isn't any reason to disable it if the target
+  // processor supports it.
+  return Subtarget.hasSSE2() || Subtarget.is64Bit();
+}
+
+LoadInst *
+X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+  const X86Subtarget &Subtarget =
+      getTargetMachine().getSubtarget<X86Subtarget>();
+  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
+  const Type *MemType = AI->getType();
+  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
+  // there is no benefit in turning such RMWs into loads, and it is actually
+  // harmful as it introduces a mfence. Returning nullptr leaves AI untouched.
+  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
+    return nullptr;
+
+  auto Builder = IRBuilder<>(AI);
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  auto SynchScope = AI->getSynchScope();
+  // We must restrict the ordering to avoid generating loads with Release or
+  // AcquireRelease orderings.
+  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+  auto Ptr = AI->getPointerOperand();
+
+  // Before the load we need a fence. Here is an example lifted from
+  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
+  // is required:
+  // Thread 0:
+  //   x.store(1, relaxed);
+  //   r1 = y.fetch_add(0, release);
+  // Thread 1:
+  //   y.fetch_add(42, acquire);
+  //   r2 = x.load(relaxed);
+  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
+  // lowered to just a load without a fence. A mfence flushes the store buffer,
+  // making the optimization clearly correct.
+  // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
+  // otherwise, we might be able to be more aggressive on relaxed idempotent
+  // rmw. In practice, they do not look useful, so we don't try to be
+  // especially clever.
+  if (SynchScope == SingleThread) {
+    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
+    // the IR level, so we must wrap it in an intrinsic.
+    return nullptr;
+  } else if (hasMFENCE(Subtarget)) {
+    Function *MFence = llvm::Intrinsic::getDeclaration(M,
+            Intrinsic::x86_sse2_mfence);
+    Builder.CreateCall(MFence);
+  } else {
+    // FIXME: it might make sense to use a locked operation here but on a
+    // different cache-line to prevent cache-line bouncing. In practice it
+    // is probably a small win, and x86 processors without mfence are rare
+    // enough that we do not bother.
+    return nullptr;
+  }
+
+  // Finally emit the atomic load; the alignment argument is in bytes, not bits.
+  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
+          AI->getType()->getPrimitiveSizeInBits() / 8);
+  Loaded->setAtomic(Order, SynchScope);
+  AI->replaceAllUsesWith(Loaded);
+  AI->eraseFromParent();
+  return Loaded;
+}
+
 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
                                  SelectionDAG &DAG) {
   SDLoc dl(Op);
@@ -15927,10 +18910,7 @@
   // The only fence that needs an instruction is a sequentially-consistent
   // cross-thread fence.
   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
-    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
-    // no-sse2). There isn't any reason to disable it if the target processor
-    // supports it.
-    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
+    if (hasMFENCE(*Subtarget))
       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 
     SDValue Chain = Op.getOperand(0);
@@ -16141,7 +19121,7 @@
   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
 
   Type *RetTy = isF64
-    ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
+    ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
     : (Type*)VectorType::get(ArgTy, 4);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
@@ -16200,8 +19180,9 @@
   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
-  case ISD::FABS:               return LowerFABS(Op, DAG);
-  case ISD::FNEG:               return LowerFNEG(Op, DAG);
+  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
+  case ISD::FABS:
+  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   case ISD::SETCC:              return LowerSETCC(Op, DAG);
@@ -16211,7 +19192,7 @@
   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
-  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
@@ -16252,29 +19233,6 @@
   }
 }
 
-static void ReplaceATOMIC_LOAD(SDNode *Node,
-                               SmallVectorImpl<SDValue> &Results,
-                               SelectionDAG &DAG) {
-  SDLoc dl(Node);
-  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
-
-  // Convert wide load -> cmpxchg8b/cmpxchg16b
-  // FIXME: On 32-bit, load -> fild or movq would be more efficient
-  //        (The only way to get a 16-byte load is cmpxchg16b)
-  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
-  SDValue Zero = DAG.getConstant(0, VT);
-  SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
-  SDValue Swap =
-      DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs,
-                           Node->getOperand(0), Node->getOperand(1), Zero, Zero,
-                           cast<AtomicSDNode>(Node)->getMemOperand(),
-                           cast<AtomicSDNode>(Node)->getOrdering(),
-                           cast<AtomicSDNode>(Node)->getOrdering(),
-                           cast<AtomicSDNode>(Node)->getSynchScope());
-  Results.push_back(Swap.getValue(0));
-  Results.push_back(Swap.getValue(2));
-}
-
 /// ReplaceNodeResults - Replace a node with an illegal result type
 /// with a new node built out of custom code.
 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@@ -16433,12 +19391,10 @@
   case ISD::ATOMIC_LOAD_MAX:
   case ISD::ATOMIC_LOAD_UMIN:
   case ISD::ATOMIC_LOAD_UMAX:
-    // Delegate to generic TypeLegalization. Situations we can really handle
-    // should have already been dealt with by X86AtomicExpand.cpp.
-    break;
   case ISD::ATOMIC_LOAD: {
-    ReplaceATOMIC_LOAD(N, Results, DAG);
-    return;
+    // Delegate to generic TypeLegalization. Situations we can really handle
+    // should have already been dealt with by AtomicExpandPass.cpp.
+    break;
   }
   case ISD::BITCAST: {
     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
@@ -16521,8 +19477,8 @@
   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
-  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
+  case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   case X86ISD::HADD:               return "X86ISD::HADD";
   case X86ISD::HSUB:               return "X86ISD::HSUB";
@@ -16578,6 +19534,10 @@
   case X86ISD::SBB:                return "X86ISD::SBB";
   case X86ISD::SMUL:               return "X86ISD::SMUL";
   case X86ISD::UMUL:               return "X86ISD::UMUL";
+  case X86ISD::SMUL8:              return "X86ISD::SMUL8";
+  case X86ISD::UMUL8:              return "X86ISD::UMUL8";
+  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
+  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
   case X86ISD::INC:                return "X86ISD::INC";
   case X86ISD::DEC:                return "X86ISD::DEC";
   case X86ISD::OR:                 return "X86ISD::OR";
@@ -16593,6 +19553,7 @@
   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
+  case X86ISD::VALIGN:             return "X86ISD::VALIGN";
   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
@@ -16612,7 +19573,7 @@
   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
-  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
+  case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
@@ -16848,8 +19809,11 @@
   return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
           isMOVLMask(M, SVT) ||
+          isMOVHLPSMask(M, SVT) ||
           isSHUFPMask(M, SVT) ||
+          isSHUFPMask(M, SVT, /* Commuted */ true) ||
           isPSHUFDMask(M, SVT) ||
+          isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
           isPALIGNRMask(M, SVT, Subtarget) ||
@@ -16857,7 +19821,8 @@
           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
-          isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()));
+          isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
+          (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
 }
 
 bool
@@ -16875,7 +19840,9 @@
     return (isMOVLMask(Mask, SVT)  ||
             isCommutedMOVLMask(Mask, SVT, true) ||
             isSHUFPMask(Mask, SVT) ||
-            isSHUFPMask(Mask, SVT, /* Commuted */ true));
+            isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
+            isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
+                        Subtarget->hasInt256()));
   }
   return false;
 }
@@ -17073,7 +20040,7 @@
   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
 
   // Machine Information
-  const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
@@ -17329,7 +20296,7 @@
   XMMSaveMBB->addSuccessor(EndMBB);
 
   // Now add the instructions.
-  const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   unsigned CountReg = MI->getOperand(0).getReg();
@@ -17412,7 +20379,7 @@
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                      MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   // To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -17438,7 +20405,8 @@
 
   // If the EFLAGS register isn't dead in the terminator, then claim that it's
   // live into the sink and copy blocks.
-  const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      BB->getParent()->getSubtarget().getRegisterInfo();
   if (!MI->killsRegister(X86::EFLAGS) &&
       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
     copy0MBB->addLiveIn(X86::EFLAGS);
@@ -17477,17 +20445,20 @@
 }
 
 MachineBasicBlock *
-X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
-                                        bool Is64Bit) const {
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
+                                        MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
 
   assert(MF->shouldSplitStack());
 
-  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
-  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+  const bool Is64Bit = Subtarget->is64Bit();
+  const bool IsLP64 = Subtarget->isTarget64BitLP64();
+
+  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
 
   // BB:
   //  ... [Till the alloca]
@@ -17511,14 +20482,14 @@
 
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const TargetRegisterClass *AddrRegClass =
-    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+    getRegClassFor(getPointerTy());
 
   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
     sizeVReg = MI->getOperand(1).getReg(),
-    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+    physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
 
   MachineFunction::iterator MBBIter = BB;
   ++MBBIter;
@@ -17534,9 +20505,9 @@
   // Add code to the main basic block to check if the stack limit has been hit,
   // and if so, jump to mallocMBB otherwise to bumpMBB.
   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
-  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
     .addReg(tmpSPVReg).addReg(sizeVReg);
-  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
     .addReg(SPLimitVReg);
   BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
@@ -17550,9 +20521,11 @@
   BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
 
   // Calls into a routine in libgcc to allocate more space from the heap.
-  const uint32_t *RegMask =
-    MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
-  if (Is64Bit) {
+  const uint32_t *RegMask = MF->getTarget()
+                                .getSubtargetImpl()
+                                ->getRegisterInfo()
+                                ->getCallPreservedMask(CallingConv::C);
+  if (IsLP64) {
     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
       .addReg(sizeVReg);
     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
@@ -17560,6 +20533,14 @@
       .addRegMask(RegMask)
       .addReg(X86::RDI, RegState::Implicit)
       .addReg(X86::RAX, RegState::ImplicitDefine);
+  } else if (Is64Bit) {
+    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
+      .addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::EDI, RegState::Implicit)
+      .addReg(X86::EAX, RegState::ImplicitDefine);
   } else {
     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
       .addImm(12);
@@ -17575,7 +20556,7 @@
       .addImm(16);
 
   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
-    .addReg(Is64Bit ? X86::RAX : X86::EAX);
+    .addReg(IsLP64 ? X86::RAX : X86::EAX);
   BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
 
   // Set up the CFG correctly.
@@ -17600,7 +20581,7 @@
 MachineBasicBlock *
 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                         MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
 
   assert(!Subtarget->isTargetMacho());
@@ -17633,8 +20614,10 @@
         .addReg(X86::RAX);
     }
   } else {
-    const char *StackProbeSymbol =
-      Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca";
+    const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
+                                    Subtarget->isTargetWindowsItanium())
+                                       ? "_chkstk"
+                                       : "_alloca";
 
     BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
       .addExternalSymbol(StackProbeSymbol)
@@ -17657,8 +20640,8 @@
   // or EAX and doing an indirect call.  The return value will then
   // be in the normal return register.
   MachineFunction *F = BB->getParent();
-  const X86InstrInfo *TII
-    = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo());
+  const X86InstrInfo *TII =
+      static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
   DebugLoc DL = MI->getDebugLoc();
 
   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
@@ -17667,8 +20650,10 @@
   // Get a register mask for the lowered call.
   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   // proper register mask.
-  const uint32_t *RegMask =
-    F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+  const uint32_t *RegMask = F->getTarget()
+                                .getSubtargetImpl()
+                                ->getRegisterInfo()
+                                ->getCallPreservedMask(CallingConv::C);
   if (Subtarget->is64Bit()) {
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                       TII->get(X86::MOV64rm), X86::RDI)
@@ -17713,7 +20698,7 @@
                                     MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
   MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();
 
   const BasicBlock *BB = MBB->getBasicBlock();
@@ -17819,8 +20804,8 @@
   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
           .addMBB(restoreMBB);
 
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      MF->getSubtarget().getRegisterInfo());
   MIB.addRegMask(RegInfo->getNoPreservedMask());
   thisMBB->addSuccessor(mainMBB);
   thisMBB->addSuccessor(restoreMBB);
@@ -17850,7 +20835,7 @@
                                      MachineBasicBlock *MBB) const {
   DebugLoc DL = MI->getDebugLoc();
   MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();
 
   // Memory Reference
@@ -17865,8 +20850,8 @@
     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   unsigned Tmp = MRI.createVirtualRegister(RC);
   // Since FP is only updated here but NOT referenced, it's treated as GPR.
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      MF->getSubtarget().getRegisterInfo());
   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
   unsigned SP = RegInfo->getStackRegister();
 
@@ -17965,6 +20950,11 @@
         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
+        case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
+        case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
+        case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
+        case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
+
         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
@@ -17973,10 +20963,14 @@
         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
+        case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
+        case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
+        case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
+        case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
         default: llvm_unreachable("Unrecognized FMA variant.");
       }
 
-      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
       MachineInstrBuilder MIB =
         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
         .addOperand(MI->getOperand(0))
@@ -18007,9 +21001,8 @@
   case X86::WIN_ALLOCA:
     return EmitLoweredWinAlloca(MI, BB);
   case X86::SEG_ALLOCA_32:
-    return EmitLoweredSegAlloca(MI, BB, false);
   case X86::SEG_ALLOCA_64:
-    return EmitLoweredSegAlloca(MI, BB, true);
+    return EmitLoweredSegAlloca(MI, BB);
   case X86::TLSCall_32:
   case X86::TLSCall_64:
     return EmitLoweredTLSCall(MI, BB);
@@ -18042,7 +21035,7 @@
   case X86::FP80_TO_INT32_IN_MEM:
   case X86::FP80_TO_INT64_IN_MEM: {
     MachineFunction *F = BB->getParent();
-    const TargetInstrInfo *TII = F->getTarget().getInstrInfo();
+    const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
     DebugLoc DL = MI->getDebugLoc();
 
     // Change the floating point control register to use "round towards zero"
@@ -18126,7 +21119,7 @@
   case X86::VPCMPESTRM128MEM:
     assert(Subtarget->hasSSE42() &&
            "Target must have SSE4.2 or AVX features enabled");
-    return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+    return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
 
   // String/text processing lowering.
   case X86::PCMPISTRIREG:
@@ -18139,15 +21132,16 @@
   case X86::VPCMPESTRIMEM:
     assert(Subtarget->hasSSE42() &&
            "Target must have SSE4.2 or AVX features enabled");
-    return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+    return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
 
   // Thread synchronization.
   case X86::MONITOR:
-    return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget);
+    return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
+                       Subtarget);
 
   // xbegin
   case X86::XBEGIN:
-    return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+    return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
 
   case X86::VASTART_SAVE_XMM_REGS:
     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -18183,6 +21177,10 @@
   case X86::VFNMSUBPSr213r:
   case X86::VFNMSUBSDr213r:
   case X86::VFNMSUBSSr213r:
+  case X86::VFMADDSUBPDr213r:
+  case X86::VFMADDSUBPSr213r:
+  case X86::VFMSUBADDPDr213r:
+  case X86::VFMSUBADDPSr213r:
   case X86::VFMADDPDr213rY:
   case X86::VFMADDPSr213rY:
   case X86::VFMSUBPDr213rY:
@@ -18191,6 +21189,10 @@
   case X86::VFNMADDPSr213rY:
   case X86::VFNMSUBPDr213rY:
   case X86::VFNMSUBPSr213rY:
+  case X86::VFMADDSUBPDr213rY:
+  case X86::VFMADDSUBPSr213rY:
+  case X86::VFMSUBADDPDr213rY:
+  case X86::VFMSUBADDPSr213rY:
     return emitFMA3Instr(MI, BB);
   }
 }
@@ -18420,6 +21422,329 @@
   return SDValue();
 }
 
+/// \brief Combine an arbitrary chain of shuffles into a single instruction if
+/// possible.
+///
+/// This is the leaf of the recursive combine below. When we have found some
+/// chain of single-use x86 shuffle instructions and accumulated the combined
+/// shuffle mask represented by them, this will try to pattern match that mask
+/// into either a single instruction if there is a special purpose instruction
+/// for this operation, or into a PSHUFB instruction which is a fully general
+/// instruction but should only be used to replace chains over a certain depth.
+static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
+                                   int Depth, bool HasPSHUFB, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
+
+  // Find the operand that enters the chain. Note that multiple uses are OK
+  // here, we're not going to remove the operand we find.
+  SDValue Input = Op.getOperand(0);
+  while (Input.getOpcode() == ISD::BITCAST)
+    Input = Input.getOperand(0);
+
+  MVT VT = Input.getSimpleValueType();
+  MVT RootVT = Root.getSimpleValueType();
+  SDLoc DL(Root);
+
+  // Just remove no-op shuffle masks (a mask with a single element).
+  if (Mask.size() == 1) {
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // Use the float domain if the operand type is a floating point type.
+  bool FloatDomain = VT.isFloatingPoint();
+
+  // For floating point shuffles, we don't have free copies in the shuffle
+  // instructions or the ability to load as part of the instruction, so
+  // canonicalize their shuffles to UNPCK or MOV variants.
+  //
+  // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
+  // vectors because it can have a load folded into it that UNPCK cannot. This
+  // doesn't preclude something switching to the shorter encoding post-RA.
+  if (FloatDomain) {
+    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
+      bool Lo = Mask.equals(0, 0);
+      unsigned Shuffle;
+      MVT ShuffleVT;
+      // Check if we have SSE3 which will let us use MOVDDUP. That instruction
+      // is no slower than UNPCKLPD but has the option to fold the input operand
+      // into even an unaligned memory load.
+      if (Lo && Subtarget->hasSSE3()) {
+        Shuffle = X86ISD::MOVDDUP;
+        ShuffleVT = MVT::v2f64;
+      } else {
+        // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
+        // than the UNPCK variants.
+        Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
+        ShuffleVT = MVT::v4f32;
+      }
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
+      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      DCI.AddToWorklist(Op.getNode());
+      if (Shuffle == X86ISD::MOVDDUP)
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+      else
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+      DCI.AddToWorklist(Op.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                    /*AddTo*/ true);
+      return true;
+    }
+    if (Subtarget->hasSSE3() &&
+        (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
+      bool Lo = Mask.equals(0, 0, 2, 2);
+      unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
+      MVT ShuffleVT = MVT::v4f32;
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
+      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      DCI.AddToWorklist(Op.getNode());
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+      DCI.AddToWorklist(Op.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                    /*AddTo*/ true);
+      return true;
+    }
+    if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
+      bool Lo = Mask.equals(0, 0, 1, 1);
+      unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+      MVT ShuffleVT = MVT::v4f32;
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
+      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      DCI.AddToWorklist(Op.getNode());
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+      DCI.AddToWorklist(Op.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                    /*AddTo*/ true);
+      return true;
+    }
+  }
+
+  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
+  // variants as none of these have single-instruction variants that are
+  // superior to the UNPCK formulation.
+  if (!FloatDomain &&
+      (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
+       Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
+                   15))) {
+    bool Lo = Mask[0] == 0;
+    unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+    if (Depth == 1 && Root->getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    MVT ShuffleVT;
+    switch (Mask.size()) {
+    case 8:
+      ShuffleVT = MVT::v8i16;
+      break;
+    case 16:
+      ShuffleVT = MVT::v16i8;
+      break;
+    default:
+      llvm_unreachable("Impossible mask size!");
+    };
+    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    DCI.AddToWorklist(Op.getNode());
+    Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // Don't try to re-form single instruction chains under any circumstances now
+  // that we've done encoding canonicalization for them.
+  if (Depth < 2)
+    return false;
+
+  // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
+  // can replace them with a single PSHUFB instruction profitably. Intel's
+  // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
+  // in practice PSHUFB tends to be *very* fast so we're more aggressive.
+  if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
+    SmallVector<SDValue, 16> PSHUFBMask;
+    assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
+    int Ratio = 16 / Mask.size(); // PSHUFB byte entries per mask element.
+    for (unsigned i = 0; i < 16; ++i) {
+      if (Mask[i / Ratio] == SM_SentinelUndef) {
+        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
+        continue;
+      }
+      int M = Mask[i / Ratio] != SM_SentinelZero
+                  ? Ratio * Mask[i / Ratio] + i % Ratio
+                  : 255; // Byte used for zeroed (SM_SentinelZero) lanes.
+      PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
+    }
+    Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
+    DCI.AddToWorklist(Op.getNode());
+    SDValue PSHUFBMaskOp =
+        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
+    DCI.AddToWorklist(PSHUFBMaskOp.getNode());
+    Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
+  }
+
+  // Failed to find any combines.
+  return false;
+}
+
+/// \brief Fully generic combining of x86 shuffle instructions.
+///
+/// This should be the last combine run over the x86 shuffle instructions. Once
+/// they have been fully optimized, this will recursively consider all chains
+/// of single-use shuffle instructions, build a generic model of the cumulative
+/// shuffle operation, and check for simpler instructions which implement this
+/// operation. We use this primarily for two purposes:
+///
+/// 1) Collapse generic shuffles to specialized single instructions when
+///    equivalent. In most cases, this is just an encoding size win, but
+///    sometimes we will collapse multiple generic shuffles into a single
+///    special-purpose shuffle.
+/// 2) Look for sequences of shuffle instructions with 3 or more total
+///    instructions, and replace them with the slightly more expensive SSSE3
+///    PSHUFB instruction if available. We do this as the last combining step
+///    to ensure we avoid using PSHUFB if we can implement the shuffle with
+///    a suitable short sequence of other instructions. The PSHUFB will either
+///    use a register or have to read from memory and so is slightly (but only
+///    slightly) more expensive than the other shuffle instructions.
+///
+/// Because this is inherently a quadratic operation (for each shuffle in
+/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
+/// This should never be an issue in practice as the shuffle lowering doesn't
+/// produce sequences of more than 8 instructions.
+///
+/// FIXME: We will currently miss some cases where the redundant shuffling
+/// would simplify under the threshold for PSHUFB formation because of
+/// combine-ordering. To fix this, we should do the redundant instruction
+/// combining in this recursive walk.
+static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
+                                          ArrayRef<int> RootMask,
+                                          int Depth, bool HasPSHUFB,
+                                          SelectionDAG &DAG,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const X86Subtarget *Subtarget) {
+  // Bound the depth of our recursive combine because this is ultimately
+  // quadratic in nature.
+  if (Depth > 8)
+    return false;
+
+  // Directly rip through bitcasts to find the underlying operand.
+  while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
+    Op = Op.getOperand(0);
+
+  MVT VT = Op.getSimpleValueType();
+  if (!VT.isVector())
+    return false; // Bail if we hit a non-vector.
+  // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
+  // version should be added.
+  if (VT.getSizeInBits() != 128)
+    return false;
+
+  assert(Root.getSimpleValueType().isVector() &&
+         "Shuffles operate on vector types!");
+  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
+         "Can only combine shuffles of the same vector register size.");
+
+  if (!isTargetShuffle(Op.getOpcode()))
+    return false;
+  SmallVector<int, 16> OpMask;
+  bool IsUnary;
+  bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
+  // We can only combine unary shuffles which we can decode the mask for.
+  if (!HaveMask || !IsUnary)
+    return false;
+
+  assert(VT.getVectorNumElements() == OpMask.size() &&
+         "Different mask size from vector size!");
+  assert(((RootMask.size() > OpMask.size() &&
+           RootMask.size() % OpMask.size() == 0) ||
+          (OpMask.size() > RootMask.size() &&
+           OpMask.size() % RootMask.size() == 0) ||
+          OpMask.size() == RootMask.size()) &&
+         "The smaller number of elements must divide the larger.");
+  int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
+  int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
+  assert(((RootRatio == 1 && OpRatio == 1) ||
+          (RootRatio == 1) != (OpRatio == 1)) &&
+         "Must not have a ratio for both incoming and op masks!");
+
+  SmallVector<int, 16> Mask;
+  Mask.reserve(std::max(OpMask.size(), RootMask.size()));
+
+  // Merge this shuffle operation's mask into our accumulated mask. Note that
+  // this shuffle's mask will be the first applied to the input, followed by the
+  // root mask to get us all the way to the root value arrangement. The reason
+  // for this order is that we are recursing up the operation chain.
+  for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
+    int RootIdx = i / RootRatio;
+    if (RootMask[RootIdx] < 0) {
+      // This is a zero or undef lane, we're done.
+      Mask.push_back(RootMask[RootIdx]);
+      continue;
+    }
+
+    int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+    int OpIdx = RootMaskedIdx / OpRatio;
+    if (OpMask[OpIdx] < 0) {
+      // The incoming lanes are zero or undef, it doesn't matter which ones we
+      // are using.
+      Mask.push_back(OpMask[OpIdx]);
+      continue;
+    }
+
+    // Ok, we have non-zero lanes, map them through.
+    Mask.push_back(OpMask[OpIdx] * OpRatio +
+                   RootMaskedIdx % OpRatio);
+  }
+
+  // See if we can recurse into the operand to combine more things.
+  switch (Op.getOpcode()) {
+    case X86ISD::PSHUFB:
+      HasPSHUFB = true; // Fallthrough!
+    case X86ISD::PSHUFD:
+    case X86ISD::PSHUFHW:
+    case X86ISD::PSHUFLW:
+      if (Op.getOperand(0).hasOneUse() &&
+          combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                        HasPSHUFB, DAG, DCI, Subtarget))
+        return true;
+      break;
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
+      // Can't check for single use; check this shuffle is the only user.
+      if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
+          combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
+                                        HasPSHUFB, DAG, DCI, Subtarget))
+          return true;
+      break;
+  }
+
+  // Minor canonicalization of the accumulated shuffle mask to make it easier
+  // to match below. All this does is detect masks with sequential pairs of
+  // elements, and shrink them to the half-width mask. It does this in a loop
+  // so it will reduce the size of the mask to the minimal width mask which
+  // performs an equivalent shuffle.
+  SmallVector<int, 16> WidenedMask;
+  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+    Mask = std::move(WidenedMask);
+    WidenedMask.clear();
+  }
+
+  return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
+                                Subtarget);
+}
+
 /// \brief Get the PSHUF-style mask from PSHUF node.
 ///
 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
@@ -18452,19 +21777,23 @@
 /// We walk up the chain and look for a combinable shuffle, skipping over
 /// shuffles that we could hoist this shuffle's transformation past without
 /// altering anything.
-static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
-                                         SelectionDAG &DAG,
-                                         TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue
+combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                             SelectionDAG &DAG,
+                             TargetLowering::DAGCombinerInfo &DCI) {
   assert(N.getOpcode() == X86ISD::PSHUFD &&
          "Called with something other than an x86 128-bit half shuffle!");
   SDLoc DL(N);
 
-  // Walk up a single-use chain looking for a combinable shuffle.
+  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
+  // of the shuffles in the chain so that we can form a fresh chain to replace
+  // this one.
+  SmallVector<SDValue, 8> Chain;
   SDValue V = N.getOperand(0);
   for (; V.hasOneUse(); V = V.getOperand(0)) {
     switch (V.getOpcode()) {
     default:
-      return false; // Nothing combined!
+      return SDValue(); // Nothing combined!
 
     case ISD::BITCAST:
       // Skip bitcasts as we always know the type for the target specific
@@ -18480,8 +21809,9 @@
       // dword shuffle, and the high words are self-contained.
       if (Mask[0] != 0 || Mask[1] != 1 ||
           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
-        return false;
+        return SDValue();
 
+      Chain.push_back(V);
       continue;
 
     case X86ISD::PSHUFHW:
@@ -18489,8 +21819,9 @@
       // dword shuffle, and the low words are self-contained.
       if (Mask[2] != 2 || Mask[3] != 3 ||
           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
-        return false;
+        return SDValue();
 
+      Chain.push_back(V);
       continue;
 
     case X86ISD::UNPCKL:
@@ -18498,25 +21829,28 @@
       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
       // shuffle into a preceding word shuffle.
       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
-        return false;
+        return SDValue();
 
       // Search for a half-shuffle which we can combine with.
       unsigned CombineOp =
           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
       if (V.getOperand(0) != V.getOperand(1) ||
           !V->isOnlyUserOf(V.getOperand(0).getNode()))
-        return false;
+        return SDValue();
+      Chain.push_back(V);
       V = V.getOperand(0);
       do {
         switch (V.getOpcode()) {
         default:
-          return false; // Nothing to combine.
+          return SDValue(); // Nothing to combine.
 
         case X86ISD::PSHUFLW:
         case X86ISD::PSHUFHW:
           if (V.getOpcode() == CombineOp)
             break;
 
+          Chain.push_back(V);
+
           // Fallthrough!
         case ISD::BITCAST:
           V = V.getOperand(0);
@@ -18532,10 +21866,7 @@
 
   if (!V.hasOneUse())
     // We fell out of the loop without finding a viable combining instruction.
-    return false;
-
-  // Record the old value to use in RAUW-ing.
-  SDValue Old = V;
+    return SDValue();
 
   // Merge this node's mask and our incoming mask.
   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
@@ -18544,20 +21875,34 @@
   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                   getV4X86ShuffleImm8ForMask(Mask, DAG));
 
-  // It is possible that one of the combinable shuffles was completely absorbed
-  // by the other, just replace it and revisit all users in that case.
-  if (Old.getNode() == V.getNode()) {
-    DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true);
-    return true;
+  // Rebuild the chain around this new shuffle.
+  while (!Chain.empty()) {
+    SDValue W = Chain.pop_back_val();
+
+    if (V.getValueType() != W.getOperand(0).getValueType())
+      V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
+
+    switch (W.getOpcode()) {
+    default:
+      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
+      break;
+
+    case X86ISD::PSHUFD:
+    case X86ISD::PSHUFLW:
+    case X86ISD::PSHUFHW:
+      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
+      break;
+    }
   }
+  if (V.getValueType() != N.getValueType())
+    V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
 
-  // Replace N with its operand as we're going to combine that shuffle away.
-  DAG.ReplaceAllUsesWith(N, N.getOperand(0));
-
-  // Replace the combinable shuffle with the combined one, updating all users
-  // so that we re-evaluate the chain here.
-  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
-  return true;
+  // Return the new chain to replace N.
+  return V;
 }
 
 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
@@ -18593,26 +21938,6 @@
 
       // Other-half shuffles are no-ops.
       continue;
-
-    case X86ISD::PSHUFD: {
-      // We can only handle pshufd if the half we are combining either stays in
-      // its half, or switches to the other half. Bail if one of these isn't
-      // true.
-      SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
-      int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
-      if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
-            (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
-        return false;
-
-      // Map the mask through the pshufd and keep walking up the chain.
-      for (int i = 0; i < 4; ++i)
-        Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
-
-      // Switch halves if the pshufd does.
-      CombineOpcode =
-          VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
-      continue;
-    }
     }
     // Break out of the loop if we break out of the switch.
     break;
@@ -18622,7 +21947,11 @@
     // We fell out of the loop without finding a viable combining instruction.
     return false;
 
-  // Record the old value to use in RAUW-ing.
+  // Combine away the bottom node as its shuffle will be accumulated into
+  // a preceding shuffle.
+  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+  // Record the old value.
   SDValue Old = V;
 
   // Merge this node's mask and our incoming mask (adjusted to account for all
@@ -18633,12 +21962,13 @@
   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
                   getV4X86ShuffleImm8ForMask(Mask, DAG));
 
-  // Replace N with its operand as we're going to combine that shuffle away.
-  DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+  // Check that the shuffles didn't cancel each other out. If not, we need to
+  // combine to the new one.
+  if (Old != V)
+    // Replace the combinable shuffle with the combined one, updating all users
+    // so that we re-evaluate the chain here.
+    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
 
-  // Replace the combinable shuffle with the combined one, updating all users
-  // so that we re-evaluate the chain here.
-  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
   return true;
 }
 
@@ -18679,13 +22009,13 @@
       return SDValue(); // We combined away this shuffle, so we're done.
 
     // See if this reduces to a PSHUFD which is no more expensive and can
-    // combine with more operations.
-    if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
-        areAdjacentMasksSequential(Mask)) {
-      int DMask[] = {-1, -1, -1, -1};
+    // combine with more operations. Note that it has to at least flip the
+    // dwords as otherwise it would have been removed as a no-op.
+    if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
+      int DMask[] = {0, 1, 2, 3};
       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
-      DMask[DOffset + 0] = DOffset + Mask[0] / 2;
-      DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+      DMask[DOffset + 0] = DOffset + 1;
+      DMask[DOffset + 1] = DOffset + 0;
       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
       DCI.AddToWorklist(V.getNode());
       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
@@ -18738,8 +22068,8 @@
     break;
 
   case X86ISD::PSHUFD:
-    if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
-      return SDValue(); // We combined away this shuffle.
+    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+      return NewN;
 
     break;
   }
@@ -18747,6 +22077,61 @@
   return SDValue();
 }
 
+/// \brief Try to combine a shuffle into a target-specific add-sub node.
+///
+/// We combine this directly on the abstract vector shuffle nodes so it is
+/// easier to generically match. We also insert dummy vector shuffle nodes for
+/// the operands which explicitly discard the lanes which are unused by this
+/// operation to try to flow through the rest of the combiner the fact that
+/// they're unused.
+static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  // We only handle target-independent shuffles.
+  // FIXME: It would be easy and harmless to use the target shuffle mask
+  // extraction tool to support more.
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  auto *SVN = cast<ShuffleVectorSDNode>(N);
+  ArrayRef<int> Mask = SVN->getMask();
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+
+  // We require the first shuffle operand to be the SUB node, and the second to
+  // be the ADD node.
+  // FIXME: We should support the commuted patterns.
+  if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
+    return SDValue();
+
+  // If there are other uses of these operations we can't fold them.
+  if (!V1->hasOneUse() || !V2->hasOneUse())
+    return SDValue();
+
+  // Ensure that both operations have the same operands. Note that we can
+  // commute the FADD operands.
+  SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
+  if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
+      (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
+    return SDValue();
+
+  // We're looking for blends between FADD and FSUB nodes. We insist on the
+  // even lanes coming from V1 (FSUB) and the odd lanes from V2 (FADD).
+  if (!(isShuffleEquivalent(Mask, 0, 3) ||
+        isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
+        isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
+    return SDValue();
+
+  // Only specific types are legal at this point, assert so we notice if and
+  // when these change.
+  assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
+          VT == MVT::v4f64) &&
+         "Unknown vector type encountered!");
+
+  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
+}
+
 /// PerformShuffleCombine - Performs several different shuffle combines.
 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
@@ -18756,54 +22141,17 @@
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
-  // Canonicalize shuffles that perform 'addsub' on packed float vectors
-  // according to the rule:
-  //  (shuffle (FADD A, B), (FSUB A, B), Mask) ->
-  //  (shuffle (FSUB A, -B), (FADD A, -B), Mask)
-  //
-  // Where 'Mask' is:
-  //  <0,5,2,7>             -- for v4f32 and v4f64 shuffles;
-  //  <0,3>                 -- for v2f64 shuffles;
-  //  <0,9,2,11,4,13,6,15>  -- for v8f32 shuffles.
-  //
-  // This helps pattern-matching more SSE3/AVX ADDSUB instructions
-  // during ISel stage.
-  if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
-      ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
-       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
-      // Operands to the FADD and FSUB must be the same.
-      ((N0->getOperand(0) == N1->getOperand(0) &&
-        N0->getOperand(1) == N1->getOperand(1)) ||
-       // FADD is commutable. See if by commuting the operands of the FADD
-       // we would still be able to match the operands of the FSUB dag node.
-       (N0->getOperand(1) == N1->getOperand(0) &&
-        N0->getOperand(0) == N1->getOperand(1))) &&
-      N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
-      N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
-    
-    ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
-    unsigned NumElts = VT.getVectorNumElements();
-    ArrayRef<int> Mask = SV->getMask();
-    bool CanFold = true;
-
-    for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
-      CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i);
-
-    if (CanFold) {
-      SDValue Op0 = N1->getOperand(0);
-      SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
-      SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
-      SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
-      return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
-    }
-  }
-
   // Don't create instructions with illegal types after legalize types has run.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
     return SDValue();
 
+  // If we have legalized the vector types, look for blends of FADD and FSUB
+  // nodes that we can fuse into an ADDSUB node.
+  if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
+    if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
+      return AddSub;
+
   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
   if (Subtarget->hasFp256() && VT.is256BitVector() &&
       N->getOpcode() == ISD::VECTOR_SHUFFLE)
@@ -18880,6 +22228,18 @@
         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
     if (Shuffle.getNode())
       return Shuffle;
+
+    // Try recursively combining arbitrary sequences of x86 shuffle
+    // instructions into higher-order shuffles. We do this after combining
+    // specific PSHUF instruction sequences into their minimal form so that we
+    // can evaluate how many specialized shuffle instructions are involved in
+    // a particular chain.
+    SmallVector<int, 1> NonceMask; // Just a placeholder.
+    NonceMask.push_back(0);
+    if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
+                                      /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
+                                      DCI, Subtarget))
+      return SDValue(); // This routine will use CombineTo to replace N.
   }
 
   return SDValue();
@@ -18897,7 +22257,7 @@
 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
 /// specific shuffle of a load can be folded into a single element load.
 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
-/// shuffles have been customed lowered so we need to handle those here.
+/// shuffles have been custom lowered so we need to handle those here.
 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI) {
   if (DCI.isBeforeLegalizeOps())
@@ -18909,20 +22269,20 @@
   if (!isa<ConstantSDNode>(EltNo))
     return SDValue();
 
-  EVT VT = InVec.getValueType();
+  EVT OriginalVT = InVec.getValueType();
 
-  bool HasShuffleIntoBitcast = false;
   if (InVec.getOpcode() == ISD::BITCAST) {
     // Don't duplicate a load with other uses.
     if (!InVec.hasOneUse())
       return SDValue();
     EVT BCVT = InVec.getOperand(0).getValueType();
-    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+    if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
       return SDValue();
     InVec = InVec.getOperand(0);
-    HasShuffleIntoBitcast = true;
   }
 
+  EVT CurrentVT = InVec.getValueType();
+
   if (!isTargetShuffle(InVec.getOpcode()))
     return SDValue();
 
@@ -18932,12 +22292,12 @@
 
   SmallVector<int, 16> ShuffleMask;
   bool UnaryShuffle;
-  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
-                            UnaryShuffle))
+  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
+                            ShuffleMask, UnaryShuffle))
     return SDValue();
 
   // Select the input vector, guarding against out of range extract vector.
-  unsigned NumElems = VT.getVectorNumElements();
+  unsigned NumElems = CurrentVT.getVectorNumElements();
   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
@@ -18963,28 +22323,28 @@
   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
     return SDValue();
 
-  if (HasShuffleIntoBitcast) {
-    // If there's a bitcast before the shuffle, check if the load type and
-    // alignment is valid.
-    unsigned Align = LN0->getAlignment();
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    unsigned NewAlign = TLI.getDataLayout()->
-      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+  EVT EltVT = N->getValueType(0);
+  // Bail out unless the load is at least ABI-aligned for the extracted scalar
+  // type and a load of that type is legal or custom for the target.
+  unsigned Align = LN0->getAlignment();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
+      EltVT.getTypeForEVT(*DAG.getContext()));
 
-    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
-      return SDValue();
-  }
+  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
+    return SDValue();
 
   // All checks match so transform back to vector_shuffle so that DAG combiner
   // can finish the job
   SDLoc dl(N);
 
   // Create shuffle node taking into account the case that its a unary shuffle
-  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
-  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
+                                   : InVec.getOperand(1);
+  Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
                                  InVec.getOperand(0), Shuffle,
                                  &ShuffleMask[0]);
-  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                      EltNo);
 }
@@ -19190,6 +22550,12 @@
   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
     return SDValue();
 
+  // A vselect where all conditions and data are constants can be optimized into
+  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
+  if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
+    return SDValue();
+
   unsigned MaskValue = 0;
   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
     return SDValue();
@@ -19367,13 +22733,15 @@
   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
       CondVT.getVectorElementType() == MVT::i1) {
     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
-    // lowering on AVX-512. In this case we convert it to
+    // lowering on KNL. In this case we convert it to
     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
-    // The same situation for all 128 and 256-bit vectors of i8 and i16
+    // The same situation for all 128 and 256-bit vectors of i8 and i16.
+    // Since SKX these selects have a proper lowering.
     EVT OpVT = LHS.getValueType();
     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
         (OpVT.getVectorElementType() == MVT::i8 ||
-         OpVT.getVectorElementType() == MVT::i16)) {
+         OpVT.getVectorElementType() == MVT::i16) &&
+        !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
       DCI.AddToWorklist(Cond.getNode());
       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
@@ -19593,22 +22961,22 @@
       return DAG.getNode(Opc, DL, VT, LHS, RHS);
   }
 
-  // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
-  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
-      // Check if SETCC has already been promoted
-      TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT &&
-      // Check that condition value type matches vselect operand type
-      CondVT == VT) { 
-
+  // Simplify vector selection if condition value type matches vselect
+  // operand type
+  if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
     assert(Cond.getValueType().isVector() &&
            "vector select expects a vector selector!");
 
     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
 
-    if (!TValIsAllOnes && !FValIsAllZeros) {
-      // Try invert the condition if true value is not all 1s and false value
-      // is not all 0s.
+    // Try to invert the condition if true value is not all 1s and false value
+    // is not all 0s.
+    if (!TValIsAllOnes && !FValIsAllZeros &&
+        // Check if the selector will be produced by CMPP*/PCMP*
+        Cond.getOpcode() == ISD::SETCC &&
+        // Check if SETCC has already been promoted
+        TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
 
@@ -19726,22 +23094,17 @@
       // build_vector of constants. This will be taken care in a later
       // condition.
       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
-       VT != MVT::v8i16)) {
+       VT != MVT::v8i16) &&
+      // Don't optimize vector of constants. Those are handled by
+      // the generic code and all the bits must be properly set for
+      // the generic optimizer.
+      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
 
     // Don't optimize vector selects that map to mask-registers.
     if (BitWidth == 1)
       return SDValue();
 
-    // Check all uses of that condition operand to check whether it will be
-    // consumed by non-BLEND instructions, which may depend on all bits are set
-    // properly.
-    for (SDNode::use_iterator I = Cond->use_begin(),
-                              E = Cond->use_end(); I != E; ++I)
-      if (I->getOpcode() != ISD::VSELECT)
-        // TODO: Add other opcodes eventually lowered into BLEND.
-        return SDValue();
-
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
 
@@ -19749,8 +23112,45 @@
     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                           DCI.isBeforeLegalizeOps());
     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
-        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
-      DCI.CommitTargetLoweringOpt(TLO);
+        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
+                                 TLO)) {
+      // If we changed the computation somewhere in the DAG, this change
+      // will affect all users of Cond.
+      // Make sure it is fine and update all the nodes so that we do not
+      // use the generic VSELECT anymore. Otherwise, we may perform
+      // wrong optimizations as we messed up with the actual expectation
+      // for the vector boolean values.
+      if (Cond != TLO.Old) {
+        // Check all uses of that condition operand to check whether it will be
+        // consumed by non-BLEND instructions, which may depend on all bits are
+        // set properly.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          if (I->getOpcode() != ISD::VSELECT)
+            // TODO: Add other opcodes eventually lowered into BLEND.
+            return SDValue();
+
+        // Update all the users of the condition, before committing the change,
+        // so that the VSELECT optimizations that expect the correct vector
+        // boolean value will not be triggered.
+        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
+             I != E; ++I)
+          DAG.ReplaceAllUsesOfValueWith(
+              SDValue(*I, 0),
+              DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
+                          Cond, I->getOperand(1), I->getOperand(2)));
+        DCI.CommitTargetLoweringOpt(TLO);
+        return SDValue();
+      }
+      // At this point, only Cond is changed. Change the condition
+      // just for N to keep the opportunity to optimize all other
+      // users their own way.
+      DAG.ReplaceAllUsesOfValueWith(
+          SDValue(N, 0),
+          DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
+                      TLO.New, N->getOperand(1), N->getOperand(2)));
+      return SDValue();
+    }
   }
 
   // We should generate an X86ISD::BLENDI from a vselect if its argument
@@ -19764,7 +23164,9 @@
   // Iff we find this pattern and the build_vectors are built from
   // constants, we translate the vselect into a shuffle_vector that we
   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
-  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+  if ((N->getOpcode() == ISD::VSELECT ||
+       N->getOpcode() == X86ISD::SHRUNKBLEND) &&
+      !DCI.isBeforeLegalize()) {
     SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
     if (Shuffle.getNode())
       return Shuffle;
@@ -20830,7 +24232,6 @@
   EVT MemVT = Ld->getMemoryVT();
   SDLoc dl(Ld);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  unsigned RegSz = RegVT.getSizeInBits();
 
   // On Sandybridge unaligned 256bit loads are inefficient.
   ISD::LoadExtType Ext = Ld->getExtensionType();
@@ -20866,153 +24267,6 @@
     return DCI.CombineTo(N, NewVec, TF, true);
   }
 
-  // If this is a vector EXT Load then attempt to optimize it using a
-  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
-  // expansion is still better than scalar code.
-  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
-  // emit a shuffle and a arithmetic shift.
-  // TODO: It is possible to support ZExt by zeroing the undef values
-  // during the shuffle phase or after the shuffle.
-  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
-      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
-    assert(MemVT != RegVT && "Cannot extend to the same type");
-    assert(MemVT.isVector() && "Must load a vector from memory");
-
-    unsigned NumElems = RegVT.getVectorNumElements();
-    unsigned MemSz = MemVT.getSizeInBits();
-    assert(RegSz > MemSz && "Register size must be greater than the mem size");
-
-    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
-      return SDValue();
-
-    // All sizes must be a power of two.
-    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
-      return SDValue();
-
-    // Attempt to load the original value using scalar loads.
-    // Find the largest scalar type that divides the total loaded size.
-    MVT SclrLoadTy = MVT::i8;
-    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
-         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
-      MVT Tp = (MVT::SimpleValueType)tp;
-      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
-        SclrLoadTy = Tp;
-      }
-    }
-
-    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
-    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
-        (64 <= MemSz))
-      SclrLoadTy = MVT::f64;
-
-    // Calculate the number of scalar loads that we need to perform
-    // in order to load our vector from memory.
-    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
-    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
-      return SDValue();
-
-    unsigned loadRegZize = RegSz;
-    if (Ext == ISD::SEXTLOAD && RegSz == 256)
-      loadRegZize /= 2;
-
-    // Represent our vector as a sequence of elements which are the
-    // largest scalar that we can load.
-    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
-      loadRegZize/SclrLoadTy.getSizeInBits());
-
-    // Represent the data using the same element type that is stored in
-    // memory. In practice, we ''widen'' MemVT.
-    EVT WideVecVT =
-          EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
-                       loadRegZize/MemVT.getScalarType().getSizeInBits());
-
-    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
-      "Invalid vector type");
-
-    // We can't shuffle using an illegal type.
-    if (!TLI.isTypeLegal(WideVecVT))
-      return SDValue();
-
-    SmallVector<SDValue, 8> Chains;
-    SDValue Ptr = Ld->getBasePtr();
-    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
-                                        TLI.getPointerTy());
-    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
-
-    for (unsigned i = 0; i < NumLoads; ++i) {
-      // Perform a single load.
-      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
-                                       Ptr, Ld->getPointerInfo(),
-                                       Ld->isVolatile(), Ld->isNonTemporal(),
-                                       Ld->isInvariant(), Ld->getAlignment());
-      Chains.push_back(ScalarLoad.getValue(1));
-      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
-      // another round of DAGCombining.
-      if (i == 0)
-        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
-      else
-        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
-                          ScalarLoad, DAG.getIntPtrConstant(i));
-
-      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
-    }
-
-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
-
-    // Bitcast the loaded value to a vector of the original element type, in
-    // the size of the target vector type.
-    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
-    unsigned SizeRatio = RegSz/MemSz;
-
-    if (Ext == ISD::SEXTLOAD) {
-      // If we have SSE4.1 we can directly emit a VSEXT node.
-      if (Subtarget->hasSSE41()) {
-        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
-        return DCI.CombineTo(N, Sext, TF, true);
-      }
-
-      // Otherwise we'll shuffle the small elements in the high bits of the
-      // larger type and perform an arithmetic shift. If the shift is not legal
-      // it's better to scalarize.
-      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
-        return SDValue();
-
-      // Redistribute the loaded elements into the different locations.
-      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
-      for (unsigned i = 0; i != NumElems; ++i)
-        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
-
-      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
-                                           DAG.getUNDEF(WideVecVT),
-                                           &ShuffleVec[0]);
-
-      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
-
-      // Build the arithmetic shift.
-      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
-                     MemVT.getVectorElementType().getSizeInBits();
-      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
-                          DAG.getConstant(Amt, RegVT));
-
-      return DCI.CombineTo(N, Shuff, TF, true);
-    }
-
-    // Redistribute the loaded elements into the different locations.
-    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
-    for (unsigned i = 0; i != NumElems; ++i)
-      ShuffleVec[i*SizeRatio] = i;
-
-    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
-                                         DAG.getUNDEF(WideVecVT),
-                                         &ShuffleVec[0]);
-
-    // Bitcast to the requested type.
-    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
-    // Replace the original load with the new sequence
-    // and return the new chain.
-    return DCI.CombineTo(N, Shuff, TF, true);
-  }
-
   return SDValue();
 }
 
@@ -21535,13 +24789,29 @@
 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // (i8,i32 sext (sdivrem (i8 x, i8 y))) ->
+  // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)))
+  // This exposes the sext to the sdivrem lowering, so that it directly extends
+  // from AH (which we otherwise need to do contortions to access).
+  if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
+      N0.getValueType() == MVT::i8 && VT == MVT::i32) {
+    SDLoc dl(N);
+    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+    SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
+                            N0.getOperand(0), N0.getOperand(1));
+    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+    return R.getValue(1);
+  }
+
   if (!DCI.isBeforeLegalizeOps())
     return SDValue();
 
   if (!Subtarget->hasFp256())
     return SDValue();
 
-  EVT VT = N->getValueType(0);
   if (VT.isVector() && VT.getSizeInBits() == 256) {
     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
     if (R.getNode())
@@ -21634,6 +24904,20 @@
       return R;
   }
 
+  // (i8,i32 zext (udivrem (i8 x, i8 y))) ->
+  // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)))
+  // This exposes the zext to the udivrem lowering, so that it directly extends
+  // from AH (which we otherwise need to do contortions to access).
+  if (N0.getOpcode() == ISD::UDIVREM &&
+      N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
+      (VT == MVT::i32 || VT == MVT::i64)) {
+    SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+    SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
+                            N0.getOperand(0), N0.getOperand(1));
+    DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+    return R.getValue(1);
+  }
+
   return SDValue();
 }
 
@@ -21803,8 +25087,61 @@
   return SDValue();
 }
 
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation, the operand of the
+  // unary operation isn't a bitwise AND, or if the sizes of the operations
+  // aren't the same.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
+      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant.
+    if (!BV->isConstant())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                         const X86TargetLowering *XTLI) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
+  // Now move on to more general possibilities.
   SDValue Op0 = N->getOperand(0);
   EVT InVT = Op0->getValueType(0);
 
@@ -21950,18 +25287,68 @@
 
 /// performVZEXTCombine - Performs build vector combines
 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
-                                        TargetLowering::DAGCombinerInfo &DCI,
-                                        const X86Subtarget *Subtarget) {
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  SDLoc DL(N);
+  MVT VT = N->getSimpleValueType(0);
+  SDValue Op = N->getOperand(0);
+  MVT OpVT = Op.getSimpleValueType();
+  MVT OpEltVT = OpVT.getVectorElementType();
+  unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+
   // (vzext (bitcast (vzext (x)) -> (vzext x)
-  SDValue In = N->getOperand(0);
-  while (In.getOpcode() == ISD::BITCAST)
-    In = In.getOperand(0);
+  SDValue V = Op;
+  while (V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
 
-  if (In.getOpcode() != X86ISD::VZEXT)
-    return SDValue();
+  if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
+    MVT InnerVT = V.getSimpleValueType();
+    MVT InnerEltVT = InnerVT.getVectorElementType();
 
-  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
-                     In.getOperand(0));
+    // If the element sizes match exactly, we can just do one larger vzext. This
+    // is always an exact type match as vzext operates on integer types.
+    if (OpEltVT == InnerEltVT) {
+      assert(OpVT == InnerVT && "Types must match for vzext!");
+      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
+    }
+
+    // The only other way we can combine them is if only a single element of the
+    // inner vzext is used in the input to the outer vzext.
+    if (InnerEltVT.getSizeInBits() < InputBits)
+      return SDValue();
+
+    // In this case, the inner vzext is completely dead because we're going to
+    // only look at bits inside of the low element. Just do the outer vzext on
+    // a bitcast of the input to the inner.
+    return DAG.getNode(X86ISD::VZEXT, DL, VT,
+                       DAG.getNode(ISD::BITCAST, DL, OpVT, V));
+  }
+
+  // Check if we can bypass extracting and re-inserting an element of an input
+  // vector. Essentially:
+  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
+    SDValue ExtractedV = V.getOperand(0);
+    SDValue OrigV = ExtractedV.getOperand(0);
+    if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
+      if (ExtractIdx->getZExtValue() == 0) {
+        MVT OrigVT = OrigV.getSimpleValueType();
+        // Extract a subvector if necessary...
+        if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
+          int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
+          OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
+                                    OrigVT.getVectorNumElements() / Ratio);
+          OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
+                              DAG.getIntPtrConstant(0));
+        }
+        Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
+        return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
+      }
+  }
+
+  return SDValue();
 }
 
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
@@ -21972,7 +25359,9 @@
   case ISD::EXTRACT_VECTOR_ELT:
     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
   case ISD::VSELECT:
-  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+  case ISD::SELECT:
+  case X86ISD::SHRUNKBLEND:
+    return PerformSELECTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
@@ -22013,12 +25402,13 @@
   case X86ISD::UNPCKL:
   case X86ISD::MOVHLPS:
   case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
-  case X86ISD::VPERMILP:
+  case X86ISD::VPERMILPI:
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
@@ -22668,14 +26058,14 @@
         Constraint[5] == ')' &&
         Constraint[6] == '}') {
 
-      Res.first = X86::ST0+Constraint[4]-'0';
+      Res.first = X86::FP0+Constraint[4]-'0';
       Res.second = &X86::RFP80RegClass;
       return Res;
     }
 
     // GCC allows "st(0)" to be called just plain "st".
     if (StringRef("{st}").equals_lower(Constraint)) {
-      Res.first = X86::ST0;
+      Res.first = X86::FP0;
       Res.second = &X86::RFP80RegClass;
       return Res;
     }

diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index c8cdce7..7c6ffa2 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86ISELLOWERING_H
-#define X86ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
+#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
 
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -187,12 +187,17 @@
       /// PSIGN - Copy integer sign.
       PSIGN,
 
-      /// BLENDV - Blend where the selector is a register.
-      BLENDV,
-
       /// BLENDI - Blend where the selector is an immediate.
       BLENDI,
 
+      /// SHRUNKBLEND - Blend where the condition has been shrunk.
+      /// This is used to emphasize that the condition mask is
+      /// no longer valid for generic VSELECT optimizations.
+      SHRUNKBLEND,
+
+      /// ADDSUB - Combined add and sub on an FP vector.
+      ADDSUB,
+
       // SUBUS - Integer sub with unsigned saturation.
       SUBUS,
 
@@ -301,6 +306,13 @@
 
       UMUL, // LOW, HI, FLAGS = umul LHS, RHS
 
+      // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS
+      SMUL8, UMUL8,
+
+      // 8-bit divrem that zero-/sign-extend the high result (AH).
+      UDIVREM8_ZEXT_HREG,
+      SDIVREM8_SEXT_HREG,
+
       // MUL_IMM - X86 specific multiply by immediate.
       MUL_IMM,
 
@@ -320,7 +332,10 @@
       // Several flavors of instructions with vector shuffle behaviors.
       PACKSS,
       PACKUS,
+      // Intra-lane alignr
       PALIGNR,
+      // AVX512 inter-lane alignr
+      VALIGN,
       PSHUFD,
       PSHUFHW,
       PSHUFLW,
@@ -337,7 +352,8 @@
       MOVSS,
       UNPCKL,
       UNPCKH,
-      VPERMILP,
+      VPERMILPV,
+      VPERMILPI,
       VPERMV,
       VPERMV3,
       VPERMIV3,
@@ -350,9 +366,9 @@
       VINSERT,
       VEXTRACT,
 
-      // PMULUDQ - Vector multiply packed unsigned doubleword integers
+      // Vector multiply packed unsigned doubleword integers
       PMULUDQ,
-      // PMULUDQ - Vector multiply packed signed doubleword integers
+      // Vector multiply packed signed doubleword integers
       PMULDQ,
 
       // FMA nodes
@@ -363,20 +379,19 @@
       FMADDSUB,
       FMSUBADD,
 
-      // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
-      // according to %al. An operator is needed so that this can be expanded
-      // with control flow.
+      // Save xmm argument registers to the stack, according to %al. An operator
+      // is needed so that this can be expanded with control flow.
       VASTART_SAVE_XMM_REGS,
 
-      // WIN_ALLOCA - Windows's _chkstk call to do stack probing.
+      // Windows's _chkstk call to do stack probing.
       WIN_ALLOCA,
 
-      // SEG_ALLOCA - For allocating variable amounts of stack space when using
+      // For allocating variable amounts of stack space when using
       // segmented stacks. Check if the current stacklet has enough space, and
       // falls back to heap allocation if not.
       SEG_ALLOCA,
 
-      // WIN_FTOL - Windows's _ftol2 runtime routine to do fptoui.
+      // Windows's _ftol2 runtime routine to do fptoui.
       WIN_FTOL,
 
       // Memory barrier
@@ -385,38 +400,40 @@
       SFENCE,
       LFENCE,
 
-      // FNSTSW16r - Store FP status word into i16 register.
+      // Store FP status word into i16 register.
       FNSTSW16r,
 
-      // SAHF - Store contents of %ah into %eflags.
+      // Store contents of %ah into %eflags.
       SAHF,
 
-      // RDRAND - Get a random integer and indicate whether it is valid in CF.
+      // Get a random integer and indicate whether it is valid in CF.
       RDRAND,
 
-      // RDSEED - Get a NIST SP800-90B & C compliant random integer and
+      // Get a NIST SP800-90B & C compliant random integer and
       // indicate whether it is valid in CF.
       RDSEED,
 
-      // PCMP*STRI
       PCMPISTRI,
       PCMPESTRI,
 
-      // XTEST - Test if in transactional execution.
+      // Test if in transactional execution.
       XTEST,
 
-      // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
+      // ERI instructions
+      RSQRT28, RCP28, EXP2,
+
+      // Compare and swap.
       LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
       LCMPXCHG8_DAG,
       LCMPXCHG16_DAG,
 
-      // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+      // Load, scalar_to_vector, and zero extend.
       VZEXT_LOAD,
 
-      // FNSTCW16m - Store FP control world into i16 memory.
+      // Store FP control word into i16 memory.
       FNSTCW16m,
 
-      /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+      /// This instruction implements FP_TO_SINT with the
       /// integer destination in memory and a FP reg source.  This corresponds
       /// to the X86::FIST*m instructions and the rounding mode change stuff. It
       /// has two inputs (token chain and address) and two outputs (int value
@@ -425,7 +442,7 @@
       FP_TO_INT32_IN_MEM,
       FP_TO_INT64_IN_MEM,
 
-      /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+      /// This instruction implements SINT_TO_FP with the
       /// integer source in memory and FP reg result.  This corresponds to the
       /// X86::FILD*m instructions. It has three inputs (token chain, address,
       /// and source type) and two outputs (FP value and token chain). FILD_FLAG
@@ -433,19 +450,19 @@
       FILD,
       FILD_FLAG,
 
-      /// FLD - This instruction implements an extending load to FP stack slots.
+      /// This instruction implements an extending load to FP stack slots.
       /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
       /// operand, ptr to load from, and a ValueType node indicating the type
       /// to load to.
       FLD,
 
-      /// FST - This instruction implements a truncating store to FP stack
+      /// This instruction implements a truncating store to FP stack
       /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
       /// chain operand, value to store, address, and a ValueType to store it
       /// as.
       FST,
 
-      /// VAARG_64 - This instruction grabs the address of the next argument
+      /// This instruction grabs the address of the next argument
       /// from a va_list. (reads and modifies the va_list in memory)
       VAARG_64
 
@@ -457,67 +474,76 @@
 
   /// Define some predicates that are used for node matching.
   namespace X86 {
-    /// isVEXTRACT128Index - Return true if the specified
+    /// Return true if the specified
     /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
     /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions.
     bool isVEXTRACT128Index(SDNode *N);
 
-    /// isVINSERT128Index - Return true if the specified
+    /// Return true if the specified
     /// INSERT_SUBVECTOR operand specifies a subvector insert that is
     /// suitable for input to VINSERTF128, VINSERTI128 instructions.
     bool isVINSERT128Index(SDNode *N);
 
-    /// isVEXTRACT256Index - Return true if the specified
+    /// Return true if the specified
     /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
     /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions.
     bool isVEXTRACT256Index(SDNode *N);
 
-    /// isVINSERT256Index - Return true if the specified
+    /// Return true if the specified
     /// INSERT_SUBVECTOR operand specifies a subvector insert that is
     /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions.
     bool isVINSERT256Index(SDNode *N);
 
-    /// getExtractVEXTRACT128Immediate - Return the appropriate
+    /// Return the appropriate
     /// immediate to extract the specified EXTRACT_SUBVECTOR index
     /// with VEXTRACTF128, VEXTRACTI128 instructions.
     unsigned getExtractVEXTRACT128Immediate(SDNode *N);
 
-    /// getInsertVINSERT128Immediate - Return the appropriate
+    /// Return the appropriate
     /// immediate to insert at the specified INSERT_SUBVECTOR index
     /// with VINSERTF128, VINSERT128 instructions.
     unsigned getInsertVINSERT128Immediate(SDNode *N);
 
-    /// getExtractVEXTRACT256Immediate - Return the appropriate
+    /// Return the appropriate
     /// immediate to extract the specified EXTRACT_SUBVECTOR index
     /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions.
     unsigned getExtractVEXTRACT256Immediate(SDNode *N);
 
-    /// getInsertVINSERT256Immediate - Return the appropriate
+    /// Return the appropriate
     /// immediate to insert at the specified INSERT_SUBVECTOR index
     /// with VINSERTF64x4, VINSERTI64x4 instructions.
     unsigned getInsertVINSERT256Immediate(SDNode *N);
 
-    /// isZeroNode - Returns true if Elt is a constant zero or a floating point
-    /// constant +0.0.
+    /// Returns true if Elt is a constant zero or floating point constant +0.0.
     bool isZeroNode(SDValue Elt);
 
-    /// isOffsetSuitableForCodeModel - Returns true of the given offset can be
+    /// Returns true if the given offset can be
     /// fit into displacement field of the instruction.
     bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement = true);
 
 
-    /// isCalleePop - Determines whether the callee is required to pop its
+    /// Determines whether the callee is required to pop its
     /// own arguments. Callee pop is necessary to support tail calls.
     bool isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool TailCallOpt);
+
+    /// AVX512 static rounding constants.  These need to match the values in
+    /// avx512fintrin.h.
+    enum STATIC_ROUNDING {
+      TO_NEAREST_INT = 0,
+      TO_NEG_INF = 1,
+      TO_POS_INF = 2,
+      TO_ZERO = 3,
+      CUR_DIRECTION = 4
+    };
   }
 
   //===--------------------------------------------------------------------===//
-  //  X86TargetLowering - X86 Implementation of the TargetLowering interface
+  //  X86 Implementation of the TargetLowering interface
   class X86TargetLowering final : public TargetLowering {
   public:
-    explicit X86TargetLowering(X86TargetMachine &TM);
+    explicit X86TargetLowering(const X86TargetMachine &TM);
 
     unsigned getJumpTableEncoding() const override;
 
@@ -528,21 +554,20 @@
                               const MachineBasicBlock *MBB, unsigned uid,
                               MCContext &Ctx) const override;
 
-    /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
-    /// jumptable.
+    /// Returns relocation base for the given PIC jumptable.
     SDValue getPICJumpTableRelocBase(SDValue Table,
                                      SelectionDAG &DAG) const override;
     const MCExpr *
     getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
                                  unsigned JTI, MCContext &Ctx) const override;
 
-    /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+    /// Return the desired alignment for ByVal aggregate
     /// function arguments in the caller parameter area. For X86, aggregates
     /// that contains are placed at 16-byte boundaries while the rest are at
     /// 4-byte boundaries.
     unsigned getByValTypeAlignment(Type *Ty) const override;
 
-    /// getOptimalMemOpType - Returns the target specific optimal type for load
+    /// Returns the target specific optimal type for load
     /// and store operations as a result of memset, memcpy, and memmove
     /// lowering. If DstAlign is zero that means it's safe to destination
     /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
@@ -557,7 +582,7 @@
                             bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
                             MachineFunction &MF) const override;
 
-    /// isSafeMemOpType - Returns true if it's safe to use load / store of the
+    /// Returns true if it's safe to use load / store of the
     /// specified type to expand memcpy / memset inline. This is mostly true
     /// for all types except for some special cases. For example, on X86
     /// targets without SSE2 f64 load / store are done with fldl / fstpl which
@@ -565,17 +590,17 @@
     /// legal as the hook is used before type legalization.
     bool isSafeMemOpType(MVT VT) const override;
 
-    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// Returns true if the target allows
     /// unaligned memory accesses. of the specified type. Returns whether it
     /// is "fast" by reference in the second argument.
-    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
+    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
                                        bool *Fast) const override;
 
-    /// LowerOperation - Provide custom lowering hooks for some operations.
+    /// Provide custom lowering hooks for some operations.
     ///
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
-    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// Replace the results of node with an illegal result
     /// type with new values built out of custom code.
     ///
     void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
@@ -584,13 +609,13 @@
 
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-    /// isTypeDesirableForOp - Return true if the target has native support for
+    /// Return true if the target has native support for
     /// the specified value type and it is 'desirable' to use the type for the
     /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
     /// instruction encodings are longer and some i16 instructions are slow.
     bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
 
-    /// isTypeDesirable - Return true if the target has native support for the
+    /// Return true if the target has native support for the
     /// specified value type and it is 'desirable' to use the type. e.g. On x86
     /// i16 is legal, but undesirable since i16 instruction encodings are longer
     /// and some i16 instructions are slow.
@@ -601,24 +626,21 @@
                                   MachineBasicBlock *MBB) const override;
 
 
-    /// getTargetNodeName - This method returns the name of a target specific
-    /// DAG node.
+    /// This method returns the name of a target specific DAG node.
     const char *getTargetNodeName(unsigned Opcode) const override;
 
-    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+    /// Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
-    /// computeKnownBitsForTargetNode - Determine which of the bits specified
-    /// in Mask are known to be either zero or one and return them in the
-    /// KnownZero/KnownOne bitsets.
+    /// Determine which of the bits specified in Mask are known to be either
+    /// zero or one and return them in the KnownZero/KnownOne bitsets.
     void computeKnownBitsForTargetNode(const SDValue Op,
                                        APInt &KnownZero,
                                        APInt &KnownOne,
                                        const SelectionDAG &DAG,
                                        unsigned Depth = 0) const override;
 
-    // ComputeNumSignBitsForTargetNode - Determine the number of bits in the
-    // operation that are sign bits.
+    /// Determine the number of bits in the operation that are sign bits.
     unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
                                              const SelectionDAG &DAG,
                                              unsigned Depth) const override;
@@ -641,16 +663,15 @@
 
     const char *LowerXConstraint(EVT ConstraintVT) const override;
 
-    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
-    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
-    /// true it means one of the asm constraint of the inline asm instruction
-    /// being processed is 'm'.
+    /// Lower the specified operand into the Ops vector. If it is invalid, don't
+    /// add anything to Ops. If hasMemory is true it means one of the asm
+    /// constraints of the inline asm instruction being processed is 'm'.
     void LowerAsmOperandForConstraint(SDValue Op,
                                       std::string &Constraint,
                                       std::vector<SDValue> &Ops,
                                       SelectionDAG &DAG) const override;
 
-    /// getRegForInlineAsmConstraint - Given a physical register constraint
+    /// Given a physical register constraint
     /// (e.g. {edx}), return the register number and the register class for the
     /// register.  This should only be used for C_Register constraints.  On
     /// error, this returns a register number of 0.
@@ -658,17 +679,17 @@
       getRegForInlineAsmConstraint(const std::string &Constraint,
                                    MVT VT) const override;
 
-    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
     bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
 
-    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// Return true if the specified immediate is legal
     /// icmp immediate, that is the target has icmp instructions which can
     /// compare a register against the immediate without having to materialize
     /// the immediate into a register.
     bool isLegalICmpImmediate(int64_t Imm) const override;
 
-    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// Return true if the specified immediate is legal
     /// add immediate, that is the target has add instructions which can
     /// add a register and the immediate without having to materialize
     /// the immediate into a register.
@@ -683,7 +704,7 @@
 
     bool isVectorShiftByScalarCheap(Type *Ty) const override;
 
-    /// isTruncateFree - Return true if it's free to truncate a value of
+    /// Return true if it's free to truncate a value of
     /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
     /// register EAX to i16 by referencing its sub-register AX.
     bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
@@ -691,7 +712,7 @@
 
     bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
 
-    /// isZExtFree - Return true if any actual instruction that defines a
+    /// Return true if any actual instruction that defines a
     /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
     /// register. This does not necessarily include registers defined in
     /// unknown ways, such as incoming arguments, or copies from unknown
@@ -703,37 +724,35 @@
     bool isZExtFree(EVT VT1, EVT VT2) const override;
     bool isZExtFree(SDValue Val, EVT VT2) const override;
 
-    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
-    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
-    /// expanded to FMAs when this method returns true, otherwise fmuladd is
-    /// expanded to fmul + fadd.
+    /// Return true if an FMA operation is faster than a pair of fmul and fadd
+    /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+    /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
     bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
 
-    /// isNarrowingProfitable - Return true if it's profitable to narrow
+    /// Return true if it's profitable to narrow
     /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
     /// from i32 to i8 but not from i32 to i16.
     bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
 
-    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// Returns true if the target can instruction select the
     /// specified FP immediate natively. If false, the legalizer will
     /// materialize the FP immediate as a load from a constant pool.
     bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 
-    /// isShuffleMaskLegal - Targets can use this to indicate that they only
-    /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
-    /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
-    /// values are assumed to be legal.
+    /// Targets can use this to indicate that they only support *some*
+    /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
+    /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
+    /// be legal.
     bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
                             EVT VT) const override;
 
-    /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. This is
-    /// used by Targets can use this to indicate if there is a suitable
-    /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
-    /// pool entry.
+    /// Similar to isShuffleMaskLegal. Targets can use this to indicate if
+    /// there is a suitable VECTOR_SHUFFLE that can be used to replace a VAND
+    /// with a constant pool entry.
     bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                 EVT VT) const override;
 
-    /// ShouldShrinkFPConstant - If true, then instruction selection should
+    /// If true, then instruction selection should
     /// seek to shrink the FP constant of the specified type to a smaller type
     /// in order to save space and / or reduce runtime.
     bool ShouldShrinkFPConstant(EVT VT) const override {
@@ -747,19 +766,18 @@
       return Subtarget;
     }
 
-    /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
-    /// computed in an SSE register, not on the X87 floating point stack.
+    /// Return true if the specified scalar FP type is computed in an SSE
+    /// register, not on the X87 floating point stack.
     bool isScalarFPTypeInSSEReg(EVT VT) const {
       return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
       (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
     }
 
-    /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine
-    /// for fptoui.
+    /// Return true if the target uses the MSVC _ftol2 routine for fptoui.
     bool isTargetFTOL() const;
 
-    /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be
-    /// used for fptoui to the given type.
+    /// Return true if the MSVC _ftol2 routine should be used for fptoui to the
+    /// given type.
     bool isIntegerTypeFTOL(EVT VT) const {
       return isTargetFTOL() && VT == MVT::i64;
     }
@@ -776,15 +794,14 @@
 
     unsigned getRegisterByName(const char* RegName, EVT VT) const override;
 
-    /// createFastISel - This method returns a target specific FastISel object,
+    /// This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                              const TargetLibraryInfo *libInfo) const override;
 
-    /// getStackCookieLocation - Return true if the target stores stack
-    /// protector cookies at a fixed offset in some non-standard address
-    /// space, and populates the address space and offset as
-    /// appropriate.
+    /// Return true if the target stores stack protector cookies at a fixed
+    /// offset in some non-standard address space, and populates the address
+    /// space and offset as appropriate.
     bool getStackCookieLocation(unsigned &AddressSpace,
                                 unsigned &Offset) const override;
 
@@ -796,6 +813,7 @@
     /// \brief Reset the operation actions based on target options.
     void resetOperationActions() override;
 
+    bool useLoadStackGuardNode() const override;
     /// \brief Customize the preferred legalization strategy for certain types.
     LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
 
@@ -804,7 +822,7 @@
     findRepresentativeClass(MVT VT) const override;
 
   private:
-    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// Keep a pointer to the X86Subtarget around so that we can
     /// make the right decision when generating code for different targets.
     const X86Subtarget *Subtarget;
     const DataLayout *TD;
@@ -813,17 +831,16 @@
     /// the operation actions unless we have to.
     TargetOptions TO;
 
-    /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
-    /// floating point ops.
+    /// Select between SSE or x87 floating point ops.
     /// When SSE is available, use it for f32 operations.
     /// When SSE2 is available, use it for f64 operations.
     bool X86ScalarSSEf32;
     bool X86ScalarSSEf64;
 
-    /// LegalFPImmediates - A list of legal fp immediates.
+    /// A list of legal FP immediates.
     std::vector<APFloat> LegalFPImmediates;
 
-    /// addLegalFPImmediate - Indicate that this x86 target can instruction
+    /// Indicate that this x86 target can instruction
     /// select the specified FP immediate natively.
     void addLegalFPImmediate(const APFloat& Imm) {
       LegalFPImmediates.push_back(Imm);
@@ -847,9 +864,8 @@
 
     // Call lowering helpers.
 
-    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
-    /// for tail call optimization. Targets which want to do tail call
-    /// optimization should implement this function.
+    /// Check whether the call is eligible for tail call optimization. Targets
+    /// that want to do tail call optimization should implement this function.
     bool IsEligibleForTailCallOptimization(SDValue Callee,
                                            CallingConv::ID CalleeCC,
                                            bool isVarArg,
@@ -936,7 +952,7 @@
 
     bool mayBeEmittedAsTailCall(CallInst *CI) const override;
 
-    MVT getTypeForExtArgOrReturn(MVT VT,
+    EVT getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
                                  ISD::NodeType ExtendKind) const override;
 
     bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
@@ -946,6 +962,15 @@
 
     const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
 
+    bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
+    bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+    bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+
+    LoadInst *
+    lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+
+    bool needsCmpXchgNb(const Type *MemType) const;
+
     /// Utility function to emit atomic-load-arith operations (and, or, xor,
     /// nand, max, min, umax, umin). It takes the corresponding instruction to
     /// expand, the associated machine basic block, and the associated X86
@@ -975,8 +1000,7 @@
                                               MachineBasicBlock *BB) const;
 
     MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
-                                            MachineBasicBlock *BB,
-                                            bool Is64Bit) const;
+                                            MachineBasicBlock *BB) const;
 
     MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
                                           MachineBasicBlock *BB) const;
@@ -1005,6 +1029,15 @@
 
     /// Convert a comparison if required by the subtarget.
     SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+
+    /// Use rsqrt* to speed up sqrt calculations.
+    SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                             unsigned &RefinementSteps,
+                             bool &UseOneConstNR) const override;
+
+    /// Use rcp* to speed up fdiv calculations.
+    SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                             unsigned &RefinementSteps) const override;
   };
 
   namespace X86 {

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 41e900e..b188cd5 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td

@@ -1,19 +1,277 @@
+// Group template arguments that can be derived from the vector type (EltNum x
+// EltVT).  These are things like the register class for the writemask, etc.
+// The idea is to pass one of these as the template argument rather than the
+// individual arguments.
+class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc,
+                      string suffix = ""> {
+  RegisterClass RC = rc;
+  int NumElts = numelts;
+
+  // Corresponding mask register class.
+  RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts);
+
+  // Corresponding write-mask register class.
+  RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
+
+  // The GPR register class that can hold the write mask.  Use GR8 for fewer
+  // than 8 elements.  Use shift-right and equal to work around the lack of
+  // !lt in tablegen.
+  RegisterClass MRC =
+    !cast<RegisterClass>("GR" #
+                         !if (!eq (!srl(NumElts, 3), 0), 8, NumElts));
+
+  // Suffix used in the instruction mnemonic.
+  string Suffix = suffix;
+
+  string VTName = "v" # NumElts # EltVT;
+
+  // The vector VT.
+  ValueType VT = !cast<ValueType>(VTName);
+
+  string EltTypeName = !cast<string>(EltVT);
+  // Size of the element type in bits, e.g. 32 for v16i32.
+  string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName));
+  int EltSize = EltVT.Size;
+
+  // "i" for integer types and "f" for floating-point types
+  string TypeVariantName = !subst(EltSizeName, "", EltTypeName);
+
+  // Size of RC in bits, e.g. 512 for VR512.
+  int Size = VT.Size;
+
+  // The corresponding memory operand, e.g. i512mem for VR512.
+  X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
+  X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
+
+  // Load patterns
+  // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
+  //       due to load promotion during legalization
+  PatFrag LdFrag = !cast<PatFrag>("load" #
+                                  !if (!eq (TypeVariantName, "i"),
+                                       !if (!eq (Size, 128), "v2i64",
+                                       !if (!eq (Size, 256), "v4i64",
+                                            VTName)), VTName));
+  PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
+
+  // Load patterns used for memory operands.  For integer vectors these are
+  // only defined for i64 element types and for v16i32; MemOpFrag is left
+  // undefined (?) for the remaining integer types for now.
+  PatFrag MemOpFrag =
+    !if (!eq (TypeVariantName, "f"), !cast<PatFrag>("memop" # VTName),
+    !if (!eq (EltTypeName, "i64"),   !cast<PatFrag>("memop" # VTName),
+    !if (!eq (VTName, "v16i32"),     !cast<PatFrag>("memop" # VTName), ?)));
+
+  // The corresponding float type, e.g. v16f32 for v16i32
+  // Note: For EltSize < 32, FloatVT is illegal and TableGen
+  //       fails to compile, so we choose FloatVT = VT
+  ValueType FloatVT = !cast<ValueType>(
+                        !if (!eq (!srl(EltSize,5),0),
+                             VTName,
+                             !if (!eq(TypeVariantName, "i"),
+                                  "v" # NumElts # "f" # EltSize,
+                                  VTName)));
+
+  // The string to specify embedded broadcast in assembly.
+  string BroadcastStr = "{1to" # NumElts # "}";
+
+  // 8-bit compressed displacement tuple/subvector format.  This is only
+  // defined for NumElts <= 8.
+  CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0),
+                               !cast<CD8VForm>("CD8VT" # NumElts), ?);
+
+  SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm,
+                          !if (!eq (Size, 256), sub_ymm, ?));
+
+  Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle,
+                     !if (!eq (EltTypeName, "f64"), SSEPackedDouble,
+                     SSEPackedInt));
+
+  // A vector type of the same width with element type i32.  This is used to
+  // create the canonical constant zero node ImmAllZerosV.
+  ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
+  dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV)));
+}
+
+def v64i8_info  : X86VectorVTInfo<64,  i8, VR512, "b">;
+def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">;
+def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">;
+def v8i64_info  : X86VectorVTInfo<8,  i64, VR512, "q">;
+def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">;
+def v8f64_info  : X86VectorVTInfo<8,  f64, VR512, "pd">;
+
+// "x" in v32i8x_info means RC = VR256X
+def v32i8x_info  : X86VectorVTInfo<32,  i8, VR256X, "b">;
+def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">;
+def v8i32x_info  : X86VectorVTInfo<8,  i32, VR256X, "d">;
+def v4i64x_info  : X86VectorVTInfo<4,  i64, VR256X, "q">;
+def v8f32x_info  : X86VectorVTInfo<8,  f32, VR256X, "ps">;
+def v4f64x_info  : X86VectorVTInfo<4,  f64, VR256X, "pd">;
+
+def v16i8x_info  : X86VectorVTInfo<16,  i8, VR128X, "b">;
+def v8i16x_info  : X86VectorVTInfo<8,  i16, VR128X, "w">;
+def v4i32x_info  : X86VectorVTInfo<4,  i32, VR128X, "d">;
+def v2i64x_info  : X86VectorVTInfo<2,  i64, VR128X, "q">;
+def v4f32x_info  : X86VectorVTInfo<4,  f32, VR128X, "ps">;
+def v2f64x_info  : X86VectorVTInfo<2,  f64, VR128X, "pd">;
+
+class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256,
+                           X86VectorVTInfo i128> {
+  X86VectorVTInfo info512 = i512;
+  X86VectorVTInfo info256 = i256;
+  X86VectorVTInfo info128 = i128;
+}
+
+def avx512vl_i8_info  : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info,
+                                             v16i8x_info>;
+def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info,
+                                             v8i16x_info>;
+def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info,
+                                             v4i32x_info>;
+def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info,
+                                             v2i64x_info>;
+def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
+                                             v4f32x_info>;
+def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
+                                             v2f64x_info>;
+
+// This multiclass generates the masking variants from the non-masking
+// variant.  It only provides the assembly pieces for the masking variants.
+// It assumes custom ISel patterns for masking which can be provided as
+// template arguments.
+multiclass AVX512_maskable_custom<bits<8> O, Format F,
+                                  dag Outs,
+                                  dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+                                  string OpcodeStr,
+                                  string AttSrcAsm, string IntelSrcAsm,
+                                  list<dag> Pattern,
+                                  list<dag> MaskingPattern,
+                                  list<dag> ZeroMaskingPattern,
+                                  string Round = "",
+                                  string MaskingConstraint = "",
+                                  InstrItinClass itin = NoItinerary,
+                                  bit IsCommutable = 0> {
+  let isCommutable = IsCommutable in
+    def NAME: AVX512<O, F, Outs, Ins,
+                       OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"#
+                                     "$dst "#Round#", "#IntelSrcAsm#"}",
+                       Pattern, itin>;
+
+  // Prefer over VMOV*rrk Pat<>
+  let AddedComplexity = 20 in
+    def NAME#k: AVX512<O, F, Outs, MaskingIns,
+                       OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}"#Round#"|"#
+                                     "$dst {${mask}}"#Round#", "#IntelSrcAsm#"}",
+                       MaskingPattern, itin>,
+              EVEX_K {
+      // In case of the 3src subclass this is overridden with a let.
+      string Constraints = MaskingConstraint;
+  }
+  let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
+    def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
+                       OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}"#Round#"|"#
+                                     "$dst {${mask}} {z}"#Round#", "#IntelSrcAsm#"}",
+                       ZeroMaskingPattern,
+                       itin>,
+              EVEX_KZ;
+}
+
+
+// Common base class of AVX512_maskable and AVX512_maskable_3src.
+multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
+                                  dag Outs,
+                                  dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+                                  string OpcodeStr,
+                                  string AttSrcAsm, string IntelSrcAsm,
+                                  dag RHS, dag MaskingRHS,
+                                  string Round = "",
+                                  string MaskingConstraint = "",
+                                  InstrItinClass itin = NoItinerary,
+                                  bit IsCommutable = 0> :
+  AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+                         AttSrcAsm, IntelSrcAsm,
+                         [(set _.RC:$dst, RHS)],
+                         [(set _.RC:$dst, MaskingRHS)],
+                         [(set _.RC:$dst,
+                               (vselect _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
+                         Round, MaskingConstraint, NoItinerary, IsCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the instruction.  In the masking case, the
+// preserved vector elements come from a new dummy input operand tied to $dst.
+multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
+                           dag Outs, dag Ins, string OpcodeStr,
+                           string AttSrcAsm, string IntelSrcAsm,
+                           dag RHS, string Round = "",
+                           InstrItinClass itin = NoItinerary,
+                           bit IsCommutable = 0> :
+   AVX512_maskable_common<O, F, _, Outs, Ins,
+                          !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+                          !con((ins _.KRCWM:$mask), Ins),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+                          (vselect _.KRCWM:$mask, RHS, _.RC:$src0), Round,
+                          "$src0 = $dst", itin, IsCommutable>;
+
+// Similar to AVX512_maskable but in this case one of the source operands
+// ($src1) is already tied to $dst so we just use that for the preserved
+// vector elements.  NOTE that the NonTiedIns (the ins dag) should exclude
+// $src1.
+multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
+                                dag Outs, dag NonTiedIns, string OpcodeStr,
+                                string AttSrcAsm, string IntelSrcAsm,
+                                dag RHS> :
+   AVX512_maskable_common<O, F, _, Outs,
+                          !con((ins _.RC:$src1), NonTiedIns),
+                          !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+                          !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
+                          (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>;
+
+
+multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
+                                  dag Outs, dag Ins,
+                                  string OpcodeStr,
+                                  string AttSrcAsm, string IntelSrcAsm,
+                                  list<dag> Pattern> :
+   AVX512_maskable_custom<O, F, Outs, Ins,
+                          !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+                          !con((ins _.KRCWM:$mask), Ins),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "",
+                          "$src0 = $dst">;
+
 // Bitcasts between 512-bit vector types. Return the original type since
 // no instruction is needed for the conversion
 let Predicates = [HasAVX512] in {
-  def : Pat<(v8f64  (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v8i64 VR512:$src))),  (v8f64 VR512:$src)>;
   def : Pat<(v8f64  (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
-  def : Pat<(v8f64  (bitconvert (v8i64 VR512:$src))),  (v8f64 VR512:$src)>;
-  def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v32i16 VR512:$src))),  (v8f64 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
   def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))),  (v16f32 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
   def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))),  (v16f32 VR512:$src)>;
-  def : Pat<(v8i64  (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
   def : Pat<(v8i64  (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
   def : Pat<(v8i64  (bitconvert (v8f64 VR512:$src))),  (v8i64 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
   def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
-  def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))),  (v16i32 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))),  (v16i32 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))),  (v16i32 VR512:$src)>;
   def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))),  (v16i32 VR512:$src)>;
-  def : Pat<(v8f64  (bitconvert (v8i64 VR512:$src))),  (v8f64 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))),  (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))),  (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))),  (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+  def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v8f64 VR512:$src))),  (v64i8 VR512:$src)>;
+  def : Pat<(v64i8  (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;
 
   def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>;
   def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>;
@@ -99,120 +357,92 @@
 //===----------------------------------------------------------------------===//
 // AVX-512 - VECTOR INSERT
 //
-// -- 32x8 form --
-let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
-def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst),
-          (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
-          "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, EVEX_4V, EVEX_V512;
-let mayLoad = 1 in
-def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst),
-          (ins VR512:$src1, f128mem:$src2, i8imm:$src3),
-          "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+multiclass vinsert_for_size_no_alt<int Opcode,
+                                   X86VectorVTInfo From, X86VectorVTInfo To,
+                                   PatFrag vinsert_insert,
+                                   SDNodeXForm INSERT_get_vinsert_imm> {
+  let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+    def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst),
+               (ins VR512:$src1, From.RC:$src2, i8imm:$src3),
+               "vinsert" # From.EltTypeName # "x" # From.NumElts #
+                                                "\t{$src3, $src2, $src1, $dst|"
+                                                   "$dst, $src1, $src2, $src3}",
+               [(set To.RC:$dst, (vinsert_insert:$src3 (To.VT VR512:$src1),
+                                                       (From.VT From.RC:$src2),
+                                                       (iPTR imm)))]>,
+             EVEX_4V, EVEX_V512;
+
+    let mayLoad = 1 in
+    def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst),
+               (ins VR512:$src1, From.MemOp:$src2, i8imm:$src3),
+               "vinsert" # From.EltTypeName # "x" # From.NumElts #
+                                                "\t{$src3, $src2, $src1, $dst|"
+                                                   "$dst, $src1, $src2, $src3}",
+               []>,
+             EVEX_4V, EVEX_V512, EVEX_CD8<From.EltSize, From.CD8TupleForm>;
+  }
 }
 
-// -- 64x4 fp form --
-let hasSideEffects = 0, ExeDomain = SSEPackedDouble in {
-def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst),
-          (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
-          "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, EVEX_4V, EVEX_V512, VEX_W;
-let mayLoad = 1 in
-def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst),
-          (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
-          "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
-}
-// -- 32x4 integer form --
-let hasSideEffects = 0 in {
-def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst),
-          (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
-          "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, EVEX_4V, EVEX_V512;
-let mayLoad = 1 in
-def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst),
-          (ins VR512:$src1, i128mem:$src2, i8imm:$src3),
-          "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
-
+multiclass vinsert_for_size<int Opcode,
+                            X86VectorVTInfo From, X86VectorVTInfo To,
+                            X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo,
+                            PatFrag vinsert_insert,
+                            SDNodeXForm INSERT_get_vinsert_imm> :
+  vinsert_for_size_no_alt<Opcode, From, To,
+                          vinsert_insert, INSERT_get_vinsert_imm> {
+  // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for
+  // vinserti32x4.  Only add this if 64x2 and friends are not supported
+  // natively via AVX512DQ.
+  let Predicates = [NoDQI] in
+    def : Pat<(vinsert_insert:$ins
+                 (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)),
+              (AltTo.VT (!cast<Instruction>(NAME # From.EltSize # "x4rr")
+                            VR512:$src1, From.RC:$src2,
+                            (INSERT_get_vinsert_imm VR512:$ins)))>;
 }
 
-let hasSideEffects = 0 in {
-// -- 64x4 form --
-def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst),
-          (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
-          "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, EVEX_4V, EVEX_V512, VEX_W;
-let mayLoad = 1 in
-def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst),
-          (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
-          "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
+                            ValueType EltVT64, int Opcode256> {
+  defm NAME # "32x4" : vinsert_for_size<Opcode128,
+                                 X86VectorVTInfo< 4, EltVT32, VR128X>,
+                                 X86VectorVTInfo<16, EltVT32, VR512>,
+                                 X86VectorVTInfo< 2, EltVT64, VR128X>,
+                                 X86VectorVTInfo< 8, EltVT64, VR512>,
+                                 vinsert128_insert,
+                                 INSERT_get_vinsert128_imm>;
+  let Predicates = [HasDQI] in
+    defm NAME # "64x2" : vinsert_for_size_no_alt<Opcode128,
+                                 X86VectorVTInfo< 2, EltVT64, VR128X>,
+                                 X86VectorVTInfo< 8, EltVT64, VR512>,
+                                 vinsert128_insert,
+                                 INSERT_get_vinsert128_imm>, VEX_W;
+  defm NAME # "64x4" : vinsert_for_size<Opcode256,
+                                 X86VectorVTInfo< 4, EltVT64, VR256X>,
+                                 X86VectorVTInfo< 8, EltVT64, VR512>,
+                                 X86VectorVTInfo< 8, EltVT32, VR256>,
+                                 X86VectorVTInfo<16, EltVT32, VR512>,
+                                 vinsert256_insert,
+                                 INSERT_get_vinsert256_imm>, VEX_W;
+  let Predicates = [HasDQI] in
+    defm NAME # "32x8" : vinsert_for_size_no_alt<Opcode256,
+                                 X86VectorVTInfo< 8, EltVT32, VR256X>,
+                                 X86VectorVTInfo<16, EltVT32, VR512>,
+                                 vinsert256_insert,
+                                 INSERT_get_vinsert256_imm>;
 }
 
-def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2),
-           (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
-                        (INSERT_get_vinsert128_imm VR512:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8f64  VR512:$src1), (v2f64 VR128X:$src2),
-           (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
-                        (INSERT_get_vinsert128_imm VR512:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i64  VR512:$src1), (v2i64 VR128X:$src2),
-           (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
-                        (INSERT_get_vinsert128_imm VR512:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2),
-           (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
-                        (INSERT_get_vinsert128_imm VR512:$ins))>;
-
-def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2),
-           (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
-                        (INSERT_get_vinsert128_imm VR512:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1),
-                  (bc_v4i32 (loadv2i64 addr:$src2)),
-           (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
-                        (INSERT_get_vinsert128_imm VR512:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8f64  VR512:$src1), (loadv2f64 addr:$src2),
-           (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
-                        (INSERT_get_vinsert128_imm VR512:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i64  VR512:$src1), (loadv2i64 addr:$src2),
-           (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
-                        (INSERT_get_vinsert128_imm VR512:$ins))>;
-
-def : Pat<(vinsert256_insert:$ins (v16f32  VR512:$src1), (v8f32 VR256X:$src2),
-           (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
-                        (INSERT_get_vinsert256_imm VR512:$ins))>;
-def : Pat<(vinsert256_insert:$ins (v8f64  VR512:$src1), (v4f64 VR256X:$src2),
-           (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
-                        (INSERT_get_vinsert256_imm VR512:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8i64  VR512:$src1), (v4i64 VR256X:$src2),
-           (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
-                        (INSERT_get_vinsert256_imm VR512:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2),
-           (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
-                        (INSERT_get_vinsert256_imm VR512:$ins))>;
-
-def : Pat<(vinsert256_insert:$ins (v16f32  VR512:$src1), (loadv8f32 addr:$src2),
-           (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
-                        (INSERT_get_vinsert256_imm VR512:$ins))>;
-def : Pat<(vinsert256_insert:$ins (v8f64  VR512:$src1), (loadv4f64 addr:$src2),
-           (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
-                        (INSERT_get_vinsert256_imm VR512:$ins))>;
-def : Pat<(vinsert256_insert:$ins (v8i64  VR512:$src1), (loadv4i64 addr:$src2),
-           (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
-                        (INSERT_get_vinsert256_imm VR512:$ins))>;
-def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1),
-	                (bc_v8i32 (loadv4i64 addr:$src2)),
-           (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
-                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>;
+defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
 
 // vinsertps - insert f32 to XMM
 def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
-      (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3),
+      (ins VR128X:$src1, VR128X:$src2, i8imm:$src3),
       "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
       [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
       EVEX_4V;
 def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
-      (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3),
+      (ins VR128X:$src1, f32mem:$src2, i8imm:$src3),
       "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
       [(set VR128X:$dst, (X86insertps VR128X:$src1,
                           (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
@@ -221,106 +451,90 @@
 //===----------------------------------------------------------------------===//
 // AVX-512 VECTOR EXTRACT
 //---
-let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
-// -- 32x4 form --
-def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst),
-          (ins VR512:$src1, i8imm:$src2),
-          "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, EVEX, EVEX_V512;
-def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs),
-          (ins f128mem:$dst, VR512:$src1, i8imm:$src2),
-          "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
 
-// -- 64x4 form --
-def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst),
-          (ins VR512:$src1, i8imm:$src2),
-          "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, EVEX, EVEX_V512, VEX_W;
-let mayStore = 1 in
-def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs),
-          (ins f256mem:$dst, VR512:$src1, i8imm:$src2),
-          "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+multiclass vextract_for_size<int Opcode,
+                             X86VectorVTInfo From, X86VectorVTInfo To,
+                             X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo,
+                             PatFrag vextract_extract,
+                             SDNodeXForm EXTRACT_get_vextract_imm> {
+  let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+    defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
+                (ins VR512:$src1, i8imm:$idx),
+                "vextract" # To.EltTypeName # "x4",
+                "$idx, $src1", "$src1, $idx",
+                [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1),
+                                                         (iPTR imm)))]>,
+              AVX512AIi8Base, EVEX, EVEX_V512;
+    let mayStore = 1 in
+    def rm : AVX512AIi8<Opcode, MRMDestMem, (outs),
+            (ins To.MemOp:$dst, VR512:$src1, i8imm:$src2),
+            "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|"
+                                               "$dst, $src1, $src2}",
+            []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>;
+  }
+
+  // Codegen pattern with the alternative types, e.g. v8i64 -> v2i64 for
+  // vextracti32x4
+  def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)),
+            (AltTo.VT (!cast<Instruction>(NAME # To.EltSize # "x4rr")
+                          VR512:$src1,
+                          (EXTRACT_get_vextract_imm To.RC:$ext)))>;
+
+  // A 128/256-bit subvector extract from the first 512-bit vector position is
+  // a subregister copy that needs no instruction.
+  def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))),
+            (To.VT
+               (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>;
+
+  // And for the alternative types.
+  def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))),
+            (AltTo.VT
+               (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>;
+
+  // Intrinsic call with masking.
+  def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+                              "x4_512")
+                VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask),
+            (!cast<Instruction>(NAME # To.EltSize # "x4rrk") To.RC:$src0,
+                (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)),
+                VR512:$src1, imm:$idx)>;
+
+  // Intrinsic call with zero-masking.
+  def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+                              "x4_512")
+                VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask),
+            (!cast<Instruction>(NAME # To.EltSize # "x4rrkz")
+                (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)),
+                VR512:$src1, imm:$idx)>;
+
+  // Intrinsic call without masking.
+  def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
+                              "x4_512")
+                VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)),
+            (!cast<Instruction>(NAME # To.EltSize # "x4rr")
+                VR512:$src1, imm:$idx)>;
 }
 
-let hasSideEffects = 0 in {
-// -- 32x4 form --
-def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst),
-          (ins VR512:$src1, i8imm:$src2),
-          "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, EVEX, EVEX_V512;
-def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs),
-          (ins i128mem:$dst, VR512:$src1, i8imm:$src2),
-          "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
-
-// -- 64x4 form --
-def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst),
-          (ins VR512:$src1, i8imm:$src2),
-          "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, EVEX, EVEX_V512, VEX_W;
-let mayStore = 1 in
-def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs),
-          (ins i256mem:$dst, VR512:$src1, i8imm:$src2),
-          "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-          []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+multiclass vextract_for_type<ValueType EltVT32, int Opcode32,
+                             ValueType EltVT64, int Opcode64> {
+  defm NAME # "32x4" : vextract_for_size<Opcode32,
+                                 X86VectorVTInfo<16, EltVT32, VR512>,
+                                 X86VectorVTInfo< 4, EltVT32, VR128X>,
+                                 X86VectorVTInfo< 8, EltVT64, VR512>,
+                                 X86VectorVTInfo< 2, EltVT64, VR128X>,
+                                 vextract128_extract,
+                                 EXTRACT_get_vextract128_imm>;
+  defm NAME # "64x4" : vextract_for_size<Opcode64,
+                                 X86VectorVTInfo< 8, EltVT64, VR512>,
+                                 X86VectorVTInfo< 4, EltVT64, VR256X>,
+                                 X86VectorVTInfo<16, EltVT32, VR512>,
+                                 X86VectorVTInfo< 8, EltVT32, VR256>,
+                                 vextract256_extract,
+                                 EXTRACT_get_vextract256_imm>, VEX_W;
 }
 
-def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
-          (v4f32 (VEXTRACTF32x4rr VR512:$src1,
-                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
-
-def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)),
-          (v4i32 (VEXTRACTF32x4rr VR512:$src1,
-                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
-
-def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
-          (v2f64 (VEXTRACTF32x4rr VR512:$src1,
-                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
-
-def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
-          (v2i64 (VEXTRACTI32x4rr VR512:$src1,
-                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
-
-
-def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
-          (v8f32 (VEXTRACTF64x4rr VR512:$src1,
-                  (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
-
-def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)),
-          (v8i32 (VEXTRACTI64x4rr VR512:$src1,
-                    (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
-
-def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
-          (v4f64 (VEXTRACTF64x4rr VR512:$src1,
-                  (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
-
-def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
-          (v4i64 (VEXTRACTI64x4rr VR512:$src1,
-                  (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
-
-// A 256-bit subvector extract from the first 512-bit vector position
-// is a subregister copy that needs no instruction.
-def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
-          (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
-def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
-          (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
-def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
-          (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
-def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
-          (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
-
-// zmm -> xmm
-def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
-          (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
-def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
-          (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
-def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
-          (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
-def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
-          (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
-
+defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>;
+defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
 
 // A 128-bit subvector insert to the first 512-bit vector position
 // is a subregister copy that needs no instruction.
@@ -352,13 +566,13 @@
 
 // vextractps - extract 32 bits from XMM
 def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
-      (ins VR128X:$src1, u32u8imm:$src2),
+      (ins VR128X:$src1, i32i8imm:$src2),
       "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
       EVEX;
 
 def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
-      (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2),
+      (ins f32mem:$dst, VR128X:$src1, i32i8imm:$src2),
       "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
                           addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>;
@@ -366,36 +580,57 @@
 //===---------------------------------------------------------------------===//
 // AVX-512 BROADCAST
 //---
-multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr, 
-                         RegisterClass DestRC,
-                         RegisterClass SrcRC, X86MemOperand x86memop> {
-  def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src),
-         !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
-         []>, EVEX;
-  def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src),
-        !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),[]>, EVEX;
+multiclass avx512_fp_broadcast<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
+                              ValueType svt, X86VectorVTInfo _> {
+  defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                   (ins SrcRC:$src), "vbroadcast"## !subst("p", "s", _.Suffix),
+                   "$src", "$src", (_.VT (OpNode (svt SrcRC:$src)))>,
+                   T8PD, EVEX;
+
+  let mayLoad = 1 in {
+    defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                     (ins _.ScalarMemOp:$src),
+                     "vbroadcast"##!subst("p", "s", _.Suffix), "$src", "$src",
+                     (_.VT (OpNode (_.ScalarLdFrag addr:$src)))>,
+                     T8PD, EVEX;
+  }
 }
+
+multiclass avx512_fp_broadcast_vl<bits<8> opc, SDNode OpNode,
+                                  AVX512VLVectorVTInfo _> {
+  defm Z  : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info512>,
+                             EVEX_V512;
+
+  let Predicates = [HasVLX] in {
+    defm Z256  : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info256>,
+                                  EVEX_V256;
+  }
+}
+
 let ExeDomain = SSEPackedSingle in {
-  defm VBROADCASTSSZ  : avx512_fp_broadcast<0x18, "vbroadcastss", VR512,
-                                       VR128X, f32mem>,
-                                       EVEX_V512, EVEX_CD8<32, CD8VT1>;
+  defm VBROADCASTSS  : avx512_fp_broadcast_vl<0x18, X86VBroadcast,
+                              avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>;
+   let Predicates = [HasVLX] in {
+     defm VBROADCASTSSZ128  : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X,
+                                     v4f32, v4f32x_info>, EVEX_V128,
+                                     EVEX_CD8<32, CD8VT1>;
+   }
 }
 
 let ExeDomain = SSEPackedDouble in {
-  defm VBROADCASTSDZ  : avx512_fp_broadcast<0x19, "vbroadcastsd", VR512,
-                                       VR128X, f64mem>,
-                                       EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+  defm VBROADCASTSD  : avx512_fp_broadcast_vl<0x19, X86VBroadcast,
+                              avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>;
 }
 
 def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
-          (VBROADCASTSSZrm addr:$src)>;
+          (VBROADCASTSSZm addr:$src)>;
 def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
-          (VBROADCASTSDZrm addr:$src)>;
+          (VBROADCASTSDZm addr:$src)>;
 
 def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
-          (VBROADCASTSSZrm addr:$src)>;
+          (VBROADCASTSSZm addr:$src)>;
 def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
-          (VBROADCASTSDZrm addr:$src)>;
+          (VBROADCASTSDZm addr:$src)>;
 
 multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
                           RegisterClass SrcRC, RegisterClass KRC> {
@@ -503,22 +738,27 @@
 def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),
           (VPBROADCASTQZrr VR128X:$src)>;
 
-def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
-          (VBROADCASTSSZrr VR128X:$src)>;
-def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
-          (VBROADCASTSDZrr VR128X:$src)>;
+def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
+          (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))),
+          (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+
+def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))),
+          (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
+def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))),
+          (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
 
 def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),
-          (VBROADCASTSSZrr VR128X:$src)>;
+          (VBROADCASTSSZr VR128X:$src)>;
 def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),
-          (VBROADCASTSDZrr VR128X:$src)>;
+          (VBROADCASTSDZr VR128X:$src)>;
     
 // Provide fallback in case the load node that is used in the patterns above
 // is used by additional users, which prevents the pattern selection.
 def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
-          (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+          (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
 def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
-          (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+          (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
 
 
 let Predicates = [HasAVX512] in {
@@ -532,48 +772,91 @@
 //---
 
 multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
-                       RegisterClass DstRC, RegisterClass KRC,
-                       ValueType OpVT, ValueType SrcVT> {
-def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src),
+                       RegisterClass KRC> {
+let Predicates = [HasCDI] in
+def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src),
                   !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
-                  []>, EVEX;
+                  []>, EVEX, EVEX_V512;
+                  
+let Predicates = [HasCDI, HasVLX] in {
+def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src),
+                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  []>, EVEX, EVEX_V128;
+def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs VR256:$dst), (ins KRC:$src),
+                  !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
+                  []>, EVEX, EVEX_V256;
+}
 }
 
 let Predicates = [HasCDI] in {
-defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
-                                             VK16, v16i32, v16i1>, EVEX_V512;
-defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
-                                            VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d",
+                                             VK16>;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
+                                             VK8>, VEX_W;
 }
 
 //===----------------------------------------------------------------------===//
 // AVX-512 - VPERM
 //
 // -- immediate form --
-multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
-                         SDNode OpNode, PatFrag mem_frag, 
-                         X86MemOperand x86memop, ValueType OpVT> {
-  def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst),
-                     (ins RC:$src1, i8imm:$src2),
+multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
+  def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst),
+                     (ins _.RC:$src1, i8imm:$src2),
                      !strconcat(OpcodeStr,
                          " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                     [(set RC:$dst,
-                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+                     [(set _.RC:$dst,
+                       (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>,
                      EVEX;
-  def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst),
-                     (ins x86memop:$src1, i8imm:$src2),
+  def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst),
+                     (ins _.MemOp:$src1, i8imm:$src2),
                      !strconcat(OpcodeStr,
                          " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                     [(set RC:$dst,
-                       (OpVT (OpNode (mem_frag addr:$src1),
-                              (i8 imm:$src2))))]>, EVEX;
+                     [(set _.RC:$dst,
+                       (_.VT (OpNode (_.MemOpFrag addr:$src1),
+                              (i8 imm:$src2))))]>,
+           EVEX, EVEX_CD8<_.EltSize, CD8VF>;
+}
 }
 
-defm VPERMQZ  : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64,
-                        i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-let ExeDomain = SSEPackedDouble in 
-defm VPERMPDZ  : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, 
-                        f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _,
+                         X86VectorVTInfo Ctrl> :
+     avx512_perm_imm<OpcImm, "vpermil" # _.Suffix, X86VPermilpi, _> {
+  let ExeDomain = _.ExeDomain in {
+    def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst),
+                     (ins _.RC:$src1, _.RC:$src2),
+                     !strconcat("vpermil" # _.Suffix,
+                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set _.RC:$dst,
+                         (_.VT (X86VPermilpv _.RC:$src1,
+                                  (Ctrl.VT Ctrl.RC:$src2))))]>,
+             EVEX_4V;
+    def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst),
+                     (ins _.RC:$src1, Ctrl.MemOp:$src2),
+                     !strconcat("vpermil" # _.Suffix,
+                         " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set _.RC:$dst,
+                         (_.VT (X86VPermilpv _.RC:$src1,
+                                  (Ctrl.VT (Ctrl.MemOpFrag addr:$src2)))))]>,
+             EVEX_4V;
+  }
+}
+
+defm VPERMQZ :    avx512_perm_imm<0x00, "vpermq", X86VPermi, v8i64_info>,
+                  EVEX_V512, VEX_W;
+defm VPERMPDZ :   avx512_perm_imm<0x01, "vpermpd", X86VPermi, v8f64_info>,
+                  EVEX_V512, VEX_W;
+
+defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>,
+                  EVEX_V512;
+defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>,
+                  EVEX_V512, VEX_W;
+
+def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPSZri VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPDZri VR512:$src1, imm:$imm)>;
 
 // -- VPERM - register form --
 multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, 
@@ -834,98 +1117,295 @@
                  XD, VEX_W;
 }
 
-multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC, 
-              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
-              SDNode OpNode, ValueType vt> {
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+              X86VectorVTInfo _> {
   def rr : AVX512BI<opc, MRMSrcReg,
-             (outs KRC:$dst), (ins RC:$src1, RC:$src2), 
-             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], 
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))],
              IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  let mayLoad = 1 in
   def rm : AVX512BI<opc, MRMSrcMem,
-             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), 
-             !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))],
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+                                     (_.VT (bitconvert (_.LdFrag addr:$src2)))))],
              IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  def rrk : AVX512BI<opc, MRMSrcReg,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, $src2}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                   (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))],
+              IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+  let mayLoad = 1 in
+  def rmk : AVX512BI<opc, MRMSrcMem,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
+              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, $src2}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                   (OpNode (_.VT _.RC:$src1),
+                                       (_.VT (bitconvert
+                                              (_.LdFrag addr:$src2))))))],
+              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
 }
 
-defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem, 
-                           memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512,
-                           EVEX_CD8<32, CD8VF>;
-defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem, 
-                           memopv8i64, X86pcmpeqm, v8i64>, T8PD, EVEX_V512,
-                           VEX_W, EVEX_CD8<64, CD8VF>;
+multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+              X86VectorVTInfo _> :
+           avx512_icmp_packed<opc, OpcodeStr, OpNode, _> {
+  let mayLoad = 1 in {
+  def rmb : AVX512BI<opc, MRMSrcMem,
+              (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
+              !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
+                                    "|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+                              (X86VBroadcast (_.ScalarLdFrag addr:$src2))))],
+              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+  def rmbk : AVX512BI<opc, MRMSrcMem,
+               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+                                       _.ScalarMemOp:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+               [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                      (OpNode (_.VT _.RC:$src1),
+                                        (X86VBroadcast
+                                          (_.ScalarLdFrag addr:$src2)))))],
+               IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+  }
+}
 
-defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem, 
-                           memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512,
-                           EVEX_CD8<32, CD8VF>;
-defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem, 
-                           memopv8i64, X86pcmpgtm, v8i64>, T8PD, EVEX_V512,
-                           VEX_W, EVEX_CD8<64, CD8VF>;
+multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512>,
+           EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256>,
+                EVEX_V256;
+    defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128>,
+                EVEX_V128;
+  }
+}
+
+multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, AVX512VLVectorVTInfo VTInfo,
+                                  Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>,
+           EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>,
+                EVEX_V256;
+    defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>,
+                EVEX_V128;
+  }
+}
+
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
+                      avx512vl_i8_info, HasBWI>,
+                EVEX_CD8<8, CD8VF>;
+
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
+                      avx512vl_i16_info, HasBWI>,
+                EVEX_CD8<16, CD8VF>;
+
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
+                      avx512vl_i32_info, HasAVX512>,
+                EVEX_CD8<32, CD8VF>;
+
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
+                      avx512vl_i64_info, HasAVX512>,
+                T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
+                      avx512vl_i8_info, HasBWI>,
+                EVEX_CD8<8, CD8VF>;
+
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
+                      avx512vl_i16_info, HasBWI>,
+                EVEX_CD8<16, CD8VF>;
+
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
+                      avx512vl_i32_info, HasAVX512>,
+                EVEX_CD8<32, CD8VF>;
+
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
+                      avx512vl_i64_info, HasAVX512>,
+                T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
 
 def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
-            (COPY_TO_REGCLASS (VPCMPGTDZrr 
+            (COPY_TO_REGCLASS (VPCMPGTDZrr
             (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
             (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
 
 def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
-            (COPY_TO_REGCLASS (VPCMPEQDZrr 
+            (COPY_TO_REGCLASS (VPCMPEQDZrr
             (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
             (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
 
-multiclass avx512_icmp_cc<bits<8> opc, RegisterClass WMRC, RegisterClass KRC,
-              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
-              SDNode OpNode, ValueType vt, Operand CC, string Suffix> {
+multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
+                          X86VectorVTInfo _> {
   def rri : AVX512AIi8<opc, MRMSrcReg,
-             (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc),
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], 
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                       imm:$cc))],
              IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  let mayLoad = 1 in
   def rmi : AVX512AIi8<opc, MRMSrcMem,
-             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc),
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
              !strconcat("vpcmp${cc}", Suffix,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
-                              imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+                              (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                              imm:$cc))],
+             IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  def rrik : AVX512AIi8<opc, MRMSrcReg,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
+                                      AVXCC:$cc),
+              !strconcat("vpcmp${cc}", Suffix,
+                         "\t{$src2, $src1, $dst {${mask}}|",
+                         "$dst {${mask}}, $src1, $src2}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                  (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                                          imm:$cc)))],
+              IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+  let mayLoad = 1 in
+  def rmik : AVX512AIi8<opc, MRMSrcMem,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
+                                    AVXCC:$cc),
+              !strconcat("vpcmp${cc}", Suffix,
+                         "\t{$src2, $src1, $dst {${mask}}|",
+                         "$dst {${mask}}, $src1, $src2}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                   (OpNode (_.VT _.RC:$src1),
+                                      (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                                      imm:$cc)))],
+              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+
   // Accept explicit immediate argument form instead of comparison code.
   let isAsmParserOnly = 1, hasSideEffects = 0 in {
     def rri_alt : AVX512AIi8<opc, MRMSrcReg,
-               (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
-               !strconcat("vpcmp", Suffix,
-                  "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+               (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, i8imm:$cc),
+               !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
+                          "$dst, $src1, $src2, $cc}"),
                [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
-    def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
-               (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, RC:$src2, i8imm:$cc),
-               !strconcat("vpcmp", Suffix,
-                  "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"),
-               [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
     def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
-               (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
-               !strconcat("vpcmp", Suffix,
-                  "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"),
+               (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i8imm:$cc),
+               !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
+                          "$dst, $src1, $src2, $cc}"),
                [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
-    def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
-               (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, x86memop:$src2, i8imm:$cc),
+    def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
+               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
+                                       i8imm:$cc),
                !strconcat("vpcmp", Suffix,
-                  "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"),
+                          "\t{$cc, $src2, $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, $src2, $cc}"),
+               [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+    def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
+                                       i8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                          "\t{$cc, $src2, $src1, $dst {${mask}}|",
+                          "$dst {${mask}}, $src1, $src2, $cc}"),
                [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
   }
 }
 
-defm VPCMPDZ :  avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32,
-                               X86cmpm, v16i32, AVXCC, "d">,
-                EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32,
-                               X86cmpmu, v16i32, AVXCC, "ud">,
-                EVEX_V512, EVEX_CD8<32, CD8VF>;
+multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
+                              X86VectorVTInfo _> :
+           avx512_icmp_cc<opc, Suffix, OpNode, _> {
+  let mayLoad = 1 in {
+  def rmib : AVX512AIi8<opc, MRMSrcMem,
+             (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
+                                     AVXCC:$cc),
+             !strconcat("vpcmp${cc}", Suffix,
+                        "\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
+                        "$dst, $src1, ${src2}", _.BroadcastStr, "}"),
+             [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
+                               (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+                               imm:$cc))],
+             IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+  def rmibk : AVX512AIi8<opc, MRMSrcMem,
+              (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+                                       _.ScalarMemOp:$src2, AVXCC:$cc),
+              !strconcat("vpcmp${cc}", Suffix,
+                       "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+                       "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
+              [(set _.KRC:$dst, (and _.KRCWM:$mask,
+                                  (OpNode (_.VT _.RC:$src1),
+                                    (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+                                    imm:$cc)))],
+              IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+  }
 
-defm VPCMPQZ :  avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64,
-                               X86cmpm, v8i64, AVXCC, "q">,
-                VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64,
-                               X86cmpmu, v8i64, AVXCC, "uq">,
-                VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+  // Accept explicit immediate argument form instead of comparison code.
+  let isAsmParserOnly = 1, hasSideEffects = 0 in {
+    def rmib_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
+                                       i8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                   "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
+                   "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+    def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
+                                       _.ScalarMemOp:$src2, i8imm:$cc),
+               !strconcat("vpcmp", Suffix,
+                  "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
+                  "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
+               [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+  }
+}
+
+multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
+                             AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256;
+    defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128;
+  }
+}
+
+multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
+                                AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>,
+           EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>,
+                EVEX_V256;
+    defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>,
+                EVEX_V128;
+  }
+}
+
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info,
+                                HasBWI>, EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info,
+                                 HasBWI>, EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info,
+                                HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info,
+                                 HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info,
+                                    HasAVX512>, EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info,
+                                     HasAVX512>, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
+                                    HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
+                                     HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
 
 // avx512_cmp_packed - compare packed instructions
 multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
@@ -1015,14 +1495,14 @@
 //
 multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
                          string OpcodeStr, RegisterClass KRC,
-                         ValueType vt, X86MemOperand x86memop> {
+                         ValueType vvt, ValueType ivt, X86MemOperand x86memop> {
   let hasSideEffects = 0 in {
     def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
                !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
     let mayLoad = 1 in
     def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
                !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
-               [(set KRC:$dst, (vt (load addr:$src)))]>;
+               [(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>;
     let mayStore = 1 in
     def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
                !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>;
@@ -1040,32 +1520,82 @@
   }
 }
 
-let Predicates = [HasAVX512] in {
-  defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
+let Predicates = [HasDQI] in
+  defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8,
+                               i8mem>,
+               avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>,
+               VEX, PD;
+
+let Predicates = [HasAVX512] in
+  defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16,
+                               i16mem>,
+               avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
                VEX, PS;
-  defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
-               VEX, PS;
+
+let Predicates = [HasBWI] in {
+  defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32,
+                               i32mem>, VEX, PD, VEX_W;
+  defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>,
+               VEX, XD;
 }
 
+let Predicates = [HasBWI] in {
+  defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64,
+                               i64mem>, VEX, PS, VEX_W;
+  defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>,
+               VEX, XD, VEX_W;
+}
+
+// GR from/to mask register
+let Predicates = [HasDQI] in {
+  def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+            (KMOVBkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit))>;
+  def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+            (EXTRACT_SUBREG (KMOVBrk VK8:$src), sub_8bit)>;
+}
 let Predicates = [HasAVX512] in {
-  // GR16 from/to 16-bit mask
   def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
             (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>;
   def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
             (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>;
+}
+let Predicates = [HasBWI] in {
+  def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (KMOVDkr GR32:$src)>;
+  def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), (KMOVDrk VK32:$src)>;
+}
+let Predicates = [HasBWI] in {
+  def : Pat<(v64i1 (bitconvert (i64 GR64:$src))), (KMOVQkr GR64:$src)>;
+  def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), (KMOVQrk VK64:$src)>;
+}
 
-  // Store kreg in memory
-  def : Pat<(store (v16i1 VK16:$src), addr:$dst),
+// Load/store kreg
+let Predicates = [HasDQI] in {
+  def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
+            (KMOVBmk addr:$dst, VK8:$src)>;
+}
+let Predicates = [HasAVX512] in {
+  def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
             (KMOVWmk addr:$dst, VK16:$src)>;
-
-  def : Pat<(store VK8:$src, addr:$dst),
+  def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst),
             (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>;
-
   def : Pat<(i1 (load addr:$src)),
             (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>;
-
-  def : Pat<(v8i1 (load addr:$src)),
+  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
             (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>;
+}
+let Predicates = [HasBWI] in {
+  def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst),
+            (KMOVDmk addr:$dst, VK32:$src)>;
+}
+let Predicates = [HasBWI] in {
+  def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst),
+            (KMOVQmk addr:$dst, VK64:$src)>;
+}
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(i1 (trunc (i64 GR64:$src))),
+            (COPY_TO_REGCLASS (KMOVWkr (AND32ri (EXTRACT_SUBREG $src, sub_32bit),
+                                        (i32 1))), VK1)>;
 
   def : Pat<(i1 (trunc (i32 GR32:$src))),
             (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>;
@@ -1078,7 +1608,7 @@
        (COPY_TO_REGCLASS
         (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))),
        VK1)>;
-            
+
   def : Pat<(i32 (zext VK1:$src)),
             (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
   def : Pat<(i8 (zext VK1:$src)),
@@ -1097,6 +1627,14 @@
   def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
             (COPY_TO_REGCLASS VK1:$src, VK8)>;
 }
+let Predicates = [HasBWI] in {
+  def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
+            (COPY_TO_REGCLASS VK1:$src, VK32)>;
+  def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
+            (COPY_TO_REGCLASS VK1:$src, VK64)>;
+}
+
+
 // With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
 let Predicates = [HasAVX512] in {
   // GR from/to 8-bit mask without native support
@@ -1113,26 +1651,38 @@
             (COPY_TO_REGCLASS VK16:$src, VK1)>;
   def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))),
             (COPY_TO_REGCLASS VK8:$src, VK1)>;
-
+}
+let Predicates = [HasBWI] in {
+  def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))),
+            (COPY_TO_REGCLASS VK32:$src, VK1)>;
+  def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))),
+            (COPY_TO_REGCLASS VK64:$src, VK1)>;
 }
 
 // Mask unary operation
 // - KNOT
 multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
-                         RegisterClass KRC, SDPatternOperator OpNode> {
-  let Predicates = [HasAVX512] in
+                            RegisterClass KRC, SDPatternOperator OpNode,
+                            Predicate prd> {
+  let Predicates = [prd] in
     def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
                !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
                [(set KRC:$dst, (OpNode KRC:$src))]>;
 }
 
-multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr,
-                               SDPatternOperator OpNode> {
-  defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
-                          VEX, PS;
+multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
+                                SDPatternOperator OpNode> {
+  defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+                            HasDQI>, VEX, PD;
+  defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+                            HasAVX512>, VEX, PS;
+  defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+                            HasBWI>, VEX, PD, VEX_W;
+  defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+                            HasBWI>, VEX, PS, VEX_W;
 }
 
-defm KNOT : avx512_mask_unop_w<0x44, "knot", not>;
+defm KNOT : avx512_mask_unop_all<0x44, "knot", not>;
 
 multiclass avx512_mask_unop_int<string IntName, string InstName> {
   let Predicates = [HasAVX512] in
@@ -1143,43 +1693,60 @@
 }
 defm : avx512_mask_unop_int<"knot", "KNOT">;
 
+let Predicates = [HasDQI] in
+def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (KNOTBrr VK8:$src1)>;
+let Predicates = [HasAVX512] in
 def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>;
+let Predicates = [HasBWI] in
+def : Pat<(xor VK32:$src1, (v32i1 immAllOnesV)), (KNOTDrr VK32:$src1)>;
+let Predicates = [HasBWI] in
+def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>;
+
+// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit
+let Predicates = [HasAVX512] in {
 def : Pat<(xor VK8:$src1,  (v8i1 immAllOnesV)),
           (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>;
 
-// With AVX-512, 8-bit mask is promoted to 16-bit mask.
 def : Pat<(not VK8:$src),
           (COPY_TO_REGCLASS
             (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
+}
 
 // Mask binary operation
 // - KAND, KANDN, KOR, KXNOR, KXOR
 multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
-                           RegisterClass KRC, SDPatternOperator OpNode> {
-  let Predicates = [HasAVX512] in
+                           RegisterClass KRC, SDPatternOperator OpNode,
+                           Predicate prd> {
+  let Predicates = [prd] in
     def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
                !strconcat(OpcodeStr,
                           " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
 }
 
-multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr,
-                             SDPatternOperator OpNode> {
-  defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
-                           VEX_4V, VEX_L, PS;
+multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
+                               SDPatternOperator OpNode> {
+  defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+                             HasDQI>, VEX_4V, VEX_L, PD;
+  defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+                             HasAVX512>, VEX_4V, VEX_L, PS;
+  defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+                             HasBWI>, VEX_4V, VEX_L, VEX_W, PD;
+  defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+                             HasBWI>, VEX_4V, VEX_L, VEX_W, PS;
 }
 
 def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
 def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
 
 let isCommutable = 1 in {
-  defm KAND  : avx512_mask_binop_w<0x41, "kand",  and>;
-  let isCommutable = 0 in
-  defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>;
-  defm KOR   : avx512_mask_binop_w<0x45, "kor",   or>;
-  defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>;
-  defm KXOR  : avx512_mask_binop_w<0x47, "kxor",  xor>;
+  defm KAND  : avx512_mask_binop_all<0x41, "kand",  and>;
+  defm KOR   : avx512_mask_binop_all<0x45, "kor",   or>;
+  defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor>;
+  defm KXOR  : avx512_mask_binop_all<0x47, "kxor",  xor>;
 }
+let isCommutable = 0 in
+  defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn>;
 
 def : Pat<(xor VK1:$src1, VK1:$src2),
      (COPY_TO_REGCLASS (KXORWrr (COPY_TO_REGCLASS VK1:$src1, VK16),
@@ -1325,6 +1892,17 @@
 def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
           (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
 
+let Predicates = [HasVLX] in {
+  def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))),
+            (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>;
+  def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))),
+            (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>;
+  def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
+            (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>;
+  def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
+            (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>;
+}
+
 def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))),
           (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>;
 
@@ -1334,104 +1912,176 @@
 // AVX-512 - Aligned and unaligned load and store
 //
 
-multiclass avx512_load<bits<8> opc, RegisterClass RC, RegisterClass KRC,
-                            X86MemOperand x86memop, PatFrag ld_frag, 
-                            string asm, Domain d,
-                            ValueType vt, bit IsReMaterializable = 1> {
+multiclass avx512_load<bits<8> opc, string OpcodeStr, PatFrag ld_frag,
+                       RegisterClass KRC, RegisterClass RC,
+                       ValueType vt, ValueType zvt, X86MemOperand memop,
+                       Domain d, bit IsReMaterializable = 1> {
 let hasSideEffects = 0 in {
   def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-              !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>,
-              EVEX;
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
+                    d>, EVEX;
   def rrkz : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src),
-               !strconcat(asm,
-               " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
-               [], d>, EVEX, EVEX_KZ;
+                      !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+                       "${dst} {${mask}} {z}, $src}"), [], d>, EVEX, EVEX_KZ;
   }
-  let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
-  def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-              !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
-               [(set (vt RC:$dst), (ld_frag addr:$src))], d>, EVEX;
-  let Constraints = "$src1 = $dst",  hasSideEffects = 0 in {
-  def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), 
-                                     (ins RC:$src1, KRC:$mask, RC:$src2),
-              !strconcat(asm, 
-              " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
-              EVEX, EVEX_K;
-  let mayLoad = 1 in
-  def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
-                                (ins RC:$src1, KRC:$mask, x86memop:$src2),
-              !strconcat(asm, 
-              " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
-               [], d>, EVEX, EVEX_K;
+  let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable,
+      SchedRW = [WriteLoad] in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst, (vt (bitconvert (ld_frag addr:$src))))],
+                    d>, EVEX;
+
+  let AddedComplexity = 20 in {
+  let Constraints = "$src0 = $dst",  hasSideEffects = 0 in {
+  let hasSideEffects = 0 in
+    def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src0, KRC:$mask, RC:$src1),
+                     !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+                      "${dst} {${mask}}, $src1}"),
+                     [(set RC:$dst, (vt (vselect KRC:$mask,
+                                          (vt RC:$src1),
+                                          (vt RC:$src0))))],
+                     d>, EVEX, EVEX_K;
+  let mayLoad = 1, SchedRW = [WriteLoad] in
+    def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
+                     (ins RC:$src0, KRC:$mask, memop:$src1),
+                     !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+                      "${dst} {${mask}}, $src1}"),
+                     [(set RC:$dst, (vt
+                         (vselect KRC:$mask,
+                                 (vt (bitconvert (ld_frag addr:$src1))),
+                                 (vt RC:$src0))))],
+                     d>, EVEX, EVEX_K;
   }
-  let mayLoad = 1 in
-  def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
-                      (ins KRC:$mask, x86memop:$src2),
-              !strconcat(asm,
-              " \t{$src2, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src2}"),
-               [], d>, EVEX, EVEX_KZ;
+  let mayLoad = 1, SchedRW = [WriteLoad] in
+    def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
+                      (ins KRC:$mask, memop:$src),
+                      !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+                       "${dst} {${mask}} {z}, $src}"),
+                      [(set RC:$dst, (vt
+                           (vselect KRC:$mask,
+                                     (vt (bitconvert (ld_frag addr:$src))),
+                                     (vt (bitconvert (zvt immAllZerosV))))))],
+                      d>, EVEX, EVEX_KZ;
+  }
 }
 
-multiclass avx512_store<bits<8> opc, RegisterClass RC, RegisterClass KRC,
-                            X86MemOperand x86memop, PatFrag store_frag,
-                            string asm, Domain d, ValueType vt> {
+multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, string ld_pat,
+                          string elty, string elsz, string vsz512,
+                          string vsz256, string vsz128, Domain d,
+                          Predicate prd, bit IsReMaterializable = 1> {
+  let Predicates = [prd] in
+  defm Z : avx512_load<opc, OpcodeStr,
+                       !cast<PatFrag>(ld_pat##"v"##vsz512##elty##elsz),
+                       !cast<RegisterClass>("VK"##vsz512##"WM"), VR512,
+                       !cast<ValueType>("v"##vsz512##elty##elsz), v16i32,
+                       !cast<X86MemOperand>(elty##"512mem"), d,
+                       IsReMaterializable>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_load<opc, OpcodeStr,
+                       !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"),
+                             "v"##vsz256##elty##elsz, "v4i64")),
+                       !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X,
+                       !cast<ValueType>("v"##vsz256##elty##elsz), v8i32,
+                       !cast<X86MemOperand>(elty##"256mem"), d,
+                       IsReMaterializable>, EVEX_V256;
+
+    defm Z128 : avx512_load<opc, OpcodeStr,
+                       !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"),
+                             "v"##vsz128##elty##elsz, "v2i64")),
+                       !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X,
+                       !cast<ValueType>("v"##vsz128##elty##elsz), v4i32,
+                       !cast<X86MemOperand>(elty##"128mem"), d,
+                       IsReMaterializable>, EVEX_V128;
+  }
+}
+
+
+multiclass avx512_store<bits<8> opc, string OpcodeStr, PatFrag st_frag,
+                        ValueType OpVT, RegisterClass KRC, RegisterClass RC,
+                        X86MemOperand memop, Domain d> {
   let isAsmParserOnly = 1, hasSideEffects = 0 in {
   def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src),
-              !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>,
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], d>,
               EVEX;
   let Constraints = "$src1 = $dst" in
-  def alt_rrk : AVX512PI<opc, MRMDestReg, (outs  RC:$dst),
-                                     (ins RC:$src1, KRC:$mask, RC:$src2),
-              !strconcat(asm,
-              " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
+  def rrk_alt : AVX512PI<opc, MRMDestReg, (outs  RC:$dst),
+                                          (ins RC:$src1, KRC:$mask, RC:$src2),
+              !strconcat(OpcodeStr,
+              "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
               EVEX, EVEX_K;
-  def alt_rrkz : AVX512PI<opc, MRMDestReg, (outs  RC:$dst),
+  def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs  RC:$dst),
                                            (ins KRC:$mask, RC:$src),
-              !strconcat(asm,
-              " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+              !strconcat(OpcodeStr,
+              "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
               [], d>, EVEX, EVEX_KZ;
   }
   let mayStore = 1 in {
-  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
-              !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
-               [(store_frag (vt RC:$src), addr:$dst)], d>, EVEX;
+  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(st_frag (OpVT RC:$src), addr:$dst)], d>, EVEX;
   def mrk : AVX512PI<opc, MRMDestMem, (outs),
-                                (ins x86memop:$dst, KRC:$mask, RC:$src),
-              !strconcat(asm,
-              " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+                                      (ins memop:$dst, KRC:$mask, RC:$src),
+              !strconcat(OpcodeStr,
+              "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
                [], d>, EVEX, EVEX_K;
-  def mrkz : AVX512PI<opc, MRMDestMem, (outs),
-                      (ins x86memop:$dst, KRC:$mask, RC:$src),
-              !strconcat(asm,
-              " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
-               [], d>, EVEX, EVEX_KZ;
   }
 }
 
-defm VMOVAPSZ : avx512_load<0x28, VR512, VK16WM, f512mem, alignedloadv16f32,
-                              "vmovaps", SSEPackedSingle, v16f32>,
-                avx512_store<0x29, VR512, VK16WM, f512mem, alignedstore512,
-                              "vmovaps", SSEPackedSingle, v16f32>,
-                               PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMOVAPDZ : avx512_load<0x28, VR512, VK8WM, f512mem, alignedloadv8f64,
-                              "vmovapd", SSEPackedDouble, v8f64>,
-                avx512_store<0x29, VR512, VK8WM, f512mem, alignedstore512,
-                              "vmovapd", SSEPackedDouble, v8f64>,
-                              PD, EVEX_V512, VEX_W,
-                              EVEX_CD8<64, CD8VF>;
-defm VMOVUPSZ : avx512_load<0x10, VR512, VK16WM, f512mem, loadv16f32,
-                              "vmovups", SSEPackedSingle, v16f32>,
-                avx512_store<0x11, VR512, VK16WM, f512mem, store,
-                              "vmovups", SSEPackedSingle, v16f32>,
-                              PS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMOVUPDZ : avx512_load<0x10, VR512, VK8WM, f512mem, loadv8f64,
-                              "vmovupd", SSEPackedDouble, v8f64, 0>,
-                avx512_store<0x11, VR512, VK8WM, f512mem, store,
-                              "vmovupd", SSEPackedDouble, v8f64>,
-                               PD, EVEX_V512, VEX_W,
-                               EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_store_vl<bits<8> opc, string OpcodeStr, string st_pat,
+                           string st_suff_512, string st_suff_256,
+                           string st_suff_128, string elty, string elsz,
+                           string vsz512, string vsz256, string vsz128,
+                           Domain d, Predicate prd> {
+  let Predicates = [prd] in
+  defm Z : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_512),
+                        !cast<ValueType>("v"##vsz512##elty##elsz),
+                        !cast<RegisterClass>("VK"##vsz512##"WM"), VR512,
+                        !cast<X86MemOperand>(elty##"512mem"), d>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_256),
+                             !cast<ValueType>("v"##vsz256##elty##elsz),
+                             !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X,
+                             !cast<X86MemOperand>(elty##"256mem"), d>, EVEX_V256;
+
+    defm Z128 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_128),
+                             !cast<ValueType>("v"##vsz128##elty##elsz),
+                             !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X,
+                             !cast<X86MemOperand>(elty##"128mem"), d>, EVEX_V128;
+  }
+}
+
+defm VMOVAPS : avx512_load_vl<0x28, "vmovaps", "alignedload", "f", "32",
+                              "16", "8", "4", SSEPackedSingle, HasAVX512>,
+               avx512_store_vl<0x29, "vmovaps", "alignedstore",
+                               "512", "256", "", "f", "32", "16", "8", "4",
+                               SSEPackedSingle, HasAVX512>,
+                              PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVAPD : avx512_load_vl<0x28, "vmovapd", "alignedload", "f", "64",
+                              "8", "4", "2", SSEPackedDouble, HasAVX512>,
+               avx512_store_vl<0x29, "vmovapd", "alignedstore",
+                               "512", "256", "", "f", "64", "8", "4", "2",
+                               SSEPackedDouble, HasAVX512>,
+                              PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVUPS : avx512_load_vl<0x10, "vmovups", "load", "f", "32",
+                              "16", "8", "4", SSEPackedSingle, HasAVX512>,
+               avx512_store_vl<0x11, "vmovups", "store", "", "", "", "f", "32",
+                              "16", "8", "4", SSEPackedSingle, HasAVX512>,
+                              PS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", "load", "f", "64",
+                              "8", "4", "2", SSEPackedDouble, HasAVX512, 0>,
+               avx512_store_vl<0x11, "vmovupd", "store", "", "", "", "f", "64",
+                              "8", "4", "2", SSEPackedDouble, HasAVX512>,
+                             PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
 def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr,
-                 (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
+                (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
        (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
 
 def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr,
@@ -1447,75 +2097,80 @@
          (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
             VR512:$src)>;
 
-defm VMOVDQA32: avx512_load<0x6F, VR512, VK16WM, i512mem, alignedloadv16i32,
-                              "vmovdqa32", SSEPackedInt, v16i32>,
-                avx512_store<0x7F, VR512, VK16WM, i512mem, alignedstore512,
-                              "vmovdqa32", SSEPackedInt, v16i32>,
-                               PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMOVDQA64: avx512_load<0x6F, VR512, VK8WM, i512mem, alignedloadv8i64,
-                              "vmovdqa64", SSEPackedInt, v8i64>,
-                avx512_store<0x7F, VR512, VK8WM, i512mem, alignedstore512,
-                              "vmovdqa64", SSEPackedInt, v8i64>,
-                               PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-defm VMOVDQU32: avx512_load<0x6F, VR512, VK16WM, i512mem, load,
-                              "vmovdqu32", SSEPackedInt, v16i32>,
-                avx512_store<0x7F, VR512, VK16WM, i512mem, store,
-                              "vmovdqu32", SSEPackedInt, v16i32>,
-                               XS, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load,
-                              "vmovdqu64", SSEPackedInt, v8i64>,
-                avx512_store<0x7F, VR512, VK8WM, i512mem, store,
-                              "vmovdqu64", SSEPackedInt, v8i64>,
-                               XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32",
+                                "16", "8", "4", SSEPackedInt, HasAVX512>,
+                 avx512_store_vl<0x7F, "vmovdqa32", "alignedstore",
+                                 "512", "256", "", "i", "32", "16", "8", "4",
+                                 SSEPackedInt, HasAVX512>,
+                                PD, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQA64 : avx512_load_vl<0x6F, "vmovdqa64", "alignedload", "i", "64",
+                                "8", "4", "2", SSEPackedInt, HasAVX512>,
+                 avx512_store_vl<0x7F, "vmovdqa64", "alignedstore",
+                                 "512", "256", "", "i", "64", "8", "4", "2",
+                                 SSEPackedInt, HasAVX512>,
+                                PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", "load", "i", "8",
+                               "64", "32", "16", SSEPackedInt, HasBWI>,
+                 avx512_store_vl<0x7F, "vmovdqu8", "store", "", "", "",
+                                 "i", "8", "64", "32", "16", SSEPackedInt,
+                                 HasBWI>, XD, EVEX_CD8<8, CD8VF>;
+
+defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", "load", "i", "16",
+                                "32", "16", "8", SSEPackedInt, HasBWI>,
+                 avx512_store_vl<0x7F, "vmovdqu16", "store", "", "", "",
+                                 "i", "16", "32", "16", "8", SSEPackedInt,
+                                 HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", "load", "i", "32",
+                                "16", "8", "4", SSEPackedInt, HasAVX512>,
+                 avx512_store_vl<0x7F, "vmovdqu32", "store", "", "", "",
+                                 "i", "32", "16", "8", "4", SSEPackedInt,
+                                 HasAVX512>, XS, EVEX_CD8<32, CD8VF>;
+
+defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", "load", "i", "64",
+                                "8", "4", "2", SSEPackedInt, HasAVX512>,
+                 avx512_store_vl<0x7F, "vmovdqu64", "store", "", "", "",
+                                 "i", "64", "8", "4", "2", SSEPackedInt,
+                                 HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
 
 def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr,
                  (v16i32 immAllZerosV), GR16:$mask)),
-       (VMOVDQU32rmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+       (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
 
 def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr,
-                 (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
-       (VMOVDQU64rmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+                (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
+       (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
 
 def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src),
-          GR16:$mask),
-         (VMOVDQU32mrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+            GR16:$mask),
+         (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
             VR512:$src)>;
 def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src),
-          GR8:$mask),
-         (VMOVDQU64mrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+            GR8:$mask),
+         (VMOVDQU64Zmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
             VR512:$src)>;
 
 let AddedComplexity = 20 in {
 def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
-                           (bc_v8i64 (v16i32 immAllZerosV)))),
-                  (VMOVDQU64rrkz VK8WM:$mask, VR512:$src)>;
+                          (bc_v8i64 (v16i32 immAllZerosV)))),
+                  (VMOVDQU64Zrrkz VK8WM:$mask, VR512:$src)>;
 
 def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
-                  (v8i64 VR512:$src))),
-   (VMOVDQU64rrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
+                          (v8i64 VR512:$src))),
+   (VMOVDQU64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
                                               VK8), VR512:$src)>;
 
 def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src),
                            (v16i32 immAllZerosV))),
-                  (VMOVDQU32rrkz VK16WM:$mask, VR512:$src)>;
+                  (VMOVDQU32Zrrkz VK16WM:$mask, VR512:$src)>;
 
 def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV),
-                   (v16i32 VR512:$src))),
-   (VMOVDQU32rrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
-                                              
-def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1), 
-                           (v16f32 VR512:$src2))),
-                  (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
-def : Pat<(v8f64 (vselect VK8WM:$mask, (v8f64 VR512:$src1), 
-                           (v8f64 VR512:$src2))),
-                  (VMOVUPDZrrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
-def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src1), 
-                           (v16i32 VR512:$src2))),
-                  (VMOVDQU32rrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
-def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1), 
-                           (v8i64 VR512:$src2))),
-                  (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+                           (v16i32 VR512:$src))),
+                  (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
 }
+
 // Move Int Doubleword to Packed Double Int
 //
 def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
@@ -1641,10 +2296,16 @@
               !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
               [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
               EVEX, VEX_LIG;
+  let mayStore = 1 in {
   def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
              !strconcat(asm, " \t{$src, $dst|$dst, $src}"),
              [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
              EVEX, VEX_LIG;
+  def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src),
+             !strconcat(asm, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
+             [], IIC_SSE_MOV_S_MR>,
+             EVEX, VEX_LIG, EVEX_K;
+  } // mayStore
   } //hasSideEffects = 0
 }
 
@@ -1664,6 +2325,10 @@
           (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
            VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
 
+def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
+          (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+           (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+
 // For the disassembler
 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
   def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
@@ -1882,136 +2547,201 @@
 //===----------------------------------------------------------------------===//
 // AVX-512 - Non-temporals
 //===----------------------------------------------------------------------===//
+let SchedRW = [WriteLoad] in {
+  def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
+                        (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
+                        [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],
+                        SSEPackedInt>, EVEX, T8PD, EVEX_V512,
+                        EVEX_CD8<64, CD8VF>;
 
-def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst),
-                            (ins i512mem:$src),
-                            "vmovntdqa\t{$src, $dst|$dst, $src}",
-                            [(set VR512:$dst,
-                              (int_x86_avx512_movntdqa addr:$src))]>,
-                   EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+  let Predicates = [HasAVX512, HasVLX] in {
+    def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
+                             (ins i256mem:$src),
+                             "vmovntdqa\t{$src, $dst|$dst, $src}", [],
+                             SSEPackedInt>, EVEX, T8PD, EVEX_V256,
+                             EVEX_CD8<64, CD8VF>;
 
-// Prefer non-temporal over temporal versions
-let AddedComplexity = 400, SchedRW = [WriteStore] in {
-
-def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs),
-                            (ins f512mem:$dst, VR512:$src),
-                            "vmovntps\t{$src, $dst|$dst, $src}",
-                            [(alignednontemporalstore (v16f32 VR512:$src),
-                                                      addr:$dst)],
-                            IIC_SSE_MOVNT>,
-                  EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs),
-                            (ins f512mem:$dst, VR512:$src),
-                            "vmovntpd\t{$src, $dst|$dst, $src}",
-                            [(alignednontemporalstore (v8f64 VR512:$src),
-                                                      addr:$dst)],
-			    IIC_SSE_MOVNT>,
-                  EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-
-
-def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs),
-                           (ins i512mem:$dst, VR512:$src),
-                           "vmovntdq\t{$src, $dst|$dst, $src}",
-                           [(alignednontemporalstore (v8i64 VR512:$src),
-                                                     addr:$dst)],
-                           IIC_SSE_MOVNT>,
-                  EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+    def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
+                             (ins i128mem:$src),
+                             "vmovntdqa\t{$src, $dst|$dst, $src}", [],
+                             SSEPackedInt>, EVEX, T8PD, EVEX_V128,
+                             EVEX_CD8<64, CD8VF>;
+  }
 }
 
+multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag,
+                        ValueType OpVT, RegisterClass RC, X86MemOperand memop,
+                        Domain d, InstrItinClass itin = IIC_SSE_MOVNT> {
+  let SchedRW = [WriteStore], mayStore = 1,
+      AddedComplexity = 400 in // Prefer non-temporal over temporal versions
+  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(st_frag (OpVT RC:$src), addr:$dst)], d, itin>, EVEX;
+}
+
+multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag,
+                           string elty, string elsz, string vsz512,
+                           string vsz256, string vsz128, Domain d,
+                           Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> {
+  let Predicates = [prd] in
+  defm Z : avx512_movnt<opc, OpcodeStr, st_frag,
+                        !cast<ValueType>("v"##vsz512##elty##elsz), VR512,
+                        !cast<X86MemOperand>(elty##"512mem"), d, itin>,
+                        EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag,
+                             !cast<ValueType>("v"##vsz256##elty##elsz), VR256X,
+                             !cast<X86MemOperand>(elty##"256mem"), d, itin>,
+                             EVEX_V256;
+
+    defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag,
+                             !cast<ValueType>("v"##vsz128##elty##elsz), VR128X,
+                             !cast<X86MemOperand>(elty##"128mem"), d, itin>,
+                             EVEX_V128;
+  }
+}
+
+defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore,
+                                "i", "64", "8", "4", "2", SSEPackedInt,
+                                HasAVX512>, PD, EVEX_CD8<64, CD8VF>;
+
+defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore,
+                                "f", "64", "8", "4", "2", SSEPackedDouble,
+                                HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore,
+                                "f", "32", "16", "8", "4", SSEPackedSingle,
+                                HasAVX512>, PS, EVEX_CD8<32, CD8VF>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - Integer arithmetic
 //
 multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                        ValueType OpVT, RegisterClass KRC,
-                        RegisterClass RC, PatFrag memop_frag,
-                        X86MemOperand x86memop, PatFrag scalar_mfrag,
-                        X86MemOperand x86scalar_mop, string BrdcstStr,
-                        OpndItins itins, bit IsCommutable = 0> {
-  let isCommutable = IsCommutable in
-    def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
-              (ins RC:$src1, RC:$src2),
-              !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-              [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
-              itins.rr>, EVEX_4V;
-  let AddedComplexity = 30 in {
-    let Constraints = "$src0 = $dst" in
-      def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
-                 (ins RC:$src0, KRC:$mask, RC:$src1, RC:$src2),
-                 !strconcat(OpcodeStr,
-                    " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
-                 [(set RC:$dst, (OpVT (vselect KRC:$mask,
-                                  (OpNode (OpVT RC:$src1), (OpVT RC:$src2)),
-                                  RC:$src0)))],
-                 itins.rr>, EVEX_4V, EVEX_K;
-    def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
-                (ins KRC:$mask, RC:$src1, RC:$src2),
-                !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" ,
-                    "|$dst {${mask}} {z}, $src1, $src2}"),
-                [(set RC:$dst, (OpVT (vselect KRC:$mask,
-                                  (OpNode (OpVT RC:$src1), (OpVT RC:$src2)),
-                                  (OpVT immAllZerosV))))],
-                itins.rr>, EVEX_4V, EVEX_KZ;
-  }
+                           X86VectorVTInfo _, OpndItins itins,
+                           bit IsCommutable = 0> {
+  defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+                    "$src2, $src1", "$src1, $src2",
+                    (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+                    "", itins.rr, IsCommutable>,
+            AVX512BIBase, EVEX_4V;
 
-  let mayLoad = 1 in {
-    def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
-              (ins RC:$src1, x86memop:$src2),
-              !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-              [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))],
-              itins.rm>, EVEX_4V;
-    let AddedComplexity = 30 in {
-    let Constraints = "$src0 = $dst" in
-      def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
-                 (ins RC:$src0, KRC:$mask, RC:$src1, x86memop:$src2),
-                 !strconcat(OpcodeStr,
-                     " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
-                 [(set RC:$dst, (OpVT (vselect KRC:$mask,
-                                    (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)),
-                                    RC:$src0)))],
-                 itins.rm>, EVEX_4V, EVEX_K;
-    def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
-                (ins KRC:$mask, RC:$src1, x86memop:$src2),
-                !strconcat(OpcodeStr,
-                    " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
-                [(set RC:$dst, (OpVT (vselect KRC:$mask,
-                                    (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)),
-                                    (OpVT immAllZerosV))))],
-                itins.rm>, EVEX_4V, EVEX_KZ;
-    }
-    def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
-               (ins RC:$src1, x86scalar_mop:$src2),
-               !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
-                          ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
-               [(set RC:$dst, (OpNode RC:$src1,
-                               (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))],
-               itins.rm>, EVEX_4V, EVEX_B;
-    let AddedComplexity = 30 in {
-    let Constraints = "$src0 = $dst" in
-      def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
-                  (ins RC:$src0, KRC:$mask, RC:$src1, x86scalar_mop:$src2),
-                  !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
-                             ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}",
-                             BrdcstStr, "}"),
-                  [(set RC:$dst, (OpVT (vselect KRC:$mask,
-                                    (OpNode (OpVT RC:$src1),
-                                     (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))),
-                                    RC:$src0)))],
-                  itins.rm>, EVEX_4V, EVEX_B, EVEX_K;
-    def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
-                 (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2),
-                 !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
-                            ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}",
-                            BrdcstStr, "}"),
-                 [(set RC:$dst, (OpVT (vselect KRC:$mask,
-                                    (OpNode (OpVT RC:$src1),
-                                     (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))),
-                                    (OpVT immAllZerosV))))],
-                 itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ;
-    }
+  let mayLoad = 1 in
+    defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+                    "$src2, $src1", "$src1, $src2",
+                    (_.VT (OpNode _.RC:$src1,
+                                  (bitconvert (_.LdFrag addr:$src2)))),
+                    "", itins.rm>,
+              AVX512BIBase, EVEX_4V;
+}
+
+multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                            X86VectorVTInfo _, OpndItins itins,
+                            bit IsCommutable = 0> :
+           avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+  let mayLoad = 1 in
+    defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+                    "${src2}"##_.BroadcastStr##", $src1",
+                    "$src1, ${src2}"##_.BroadcastStr,
+                    (_.VT (OpNode _.RC:$src1,
+                                  (X86VBroadcast
+                                      (_.ScalarLdFrag addr:$src2)))),
+                    "", itins.rm>,
+               AVX512BIBase, EVEX_4V, EVEX_B;
+}
+
+multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+                              Predicate prd, bit IsCommutable = 0> {
+  let Predicates = [prd] in
+    defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+                             IsCommutable>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+                             IsCommutable>, EVEX_V256;
+    defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+                             IsCommutable>, EVEX_V128;
   }
 }
 
+multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               AVX512VLVectorVTInfo VTInfo, OpndItins itins,
+                               Predicate prd, bit IsCommutable = 0> {
+  let Predicates = [prd] in
+    defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+                             IsCommutable>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+                             IsCommutable>, EVEX_V256;
+    defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+                             IsCommutable>, EVEX_V128;
+  }
+}
+
+multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+                               itins, prd, IsCommutable>,
+                               VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+                               itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+                              itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                OpndItins itins, Predicate prd,
+                                bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info,
+                              itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>;
+}
+
+multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
+                                 SDNode OpNode, OpndItins itins, Predicate prd,
+                                 bit IsCommutable = 0> {
+  defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr, OpNode, itins, prd,
+                                   IsCommutable>;
+
+  defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr, OpNode, itins, prd,
+                                   IsCommutable>;
+}
+
+multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
+                                 SDNode OpNode, OpndItins itins, Predicate prd,
+                                 bit IsCommutable = 0> {
+  defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr, OpNode, itins, prd,
+                                   IsCommutable>;
+
+  defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr, OpNode, itins, prd,
+                                   IsCommutable>;
+}
+
+multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
+                                  bits<8> opc_d, bits<8> opc_q,
+                                  string OpcodeStr, SDNode OpNode,
+                                  OpndItins itins, bit IsCommutable = 0> {
+  defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
+                                    itins, HasAVX512, IsCommutable>,
+              avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
+                                    itins, HasBWI, IsCommutable>;
+}
+
 multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT,
                             ValueType SrcVT, RegisterClass KRC, RegisterClass RC,
                             PatFrag memop_frag, X86MemOperand x86memop,
@@ -2069,25 +2799,16 @@
   }
 }
 
-defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VK16WM, VR512,
-                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
-                   SSE_INTALU_ITINS_P, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VK16WM, VR512,
-                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
-                   SSE_INTALU_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VK16WM, VR512,
-                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
-                   SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VK8WM, VR512,
-                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
-                   SSE_INTALU_ITINS_P, 1>, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W;
-
-defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VK8WM, VR512,
-                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
-                   SSE_INTALU_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
+                                    SSE_INTALU_ITINS_P, 1>;
+defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub,
+                                    SSE_INTALU_ITINS_P, 0>;
+defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul,
+                                   SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
+defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul,
+                                   SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul,
+                                   SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD;
 
 defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512,
                    memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
@@ -2108,41 +2829,33 @@
            (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))),
           (VPMULDQZrr VR512:$src1, VR512:$src2)>;
 
-defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VK16WM, VR512,
-                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
-                   SSE_INTALU_ITINS_P, 1>,
-                   T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VK8WM, VR512,
-                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
-                   SSE_INTALU_ITINS_P, 0>,
-                   T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", X86smax,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", X86smax,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", X86smax,
+                                     SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
 
-defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VK16WM, VR512,
-                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
-                   SSE_INTALU_ITINS_P, 1>,
-                   T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VK8WM, VR512,
-                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
-                   SSE_INTALU_ITINS_P, 0>,
-                   T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", X86umax,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", X86umax,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", X86umax,
+                                     SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
 
-defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VK16WM, VR512,
-                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
-                   SSE_INTALU_ITINS_P, 1>,
-                   T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VK8WM, VR512,
-                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
-                   SSE_INTALU_ITINS_P, 0>,
-                   T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", X86smin,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", X86smin,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", X86smin,
+                                     SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
 
-defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VK16WM, VR512,
-                   memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
-                   SSE_INTALU_ITINS_P, 1>,
-                   T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VK8WM, VR512,
-                   memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
-                   SSE_INTALU_ITINS_P, 0>,
-                   T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", X86umin,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>;
+defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", X86umin,
+                                     SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
+defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", X86umin,
+                                     SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD;
 
 def : Pat <(v16i32 (int_x86_avx512_mask_pmaxs_d_512 (v16i32 VR512:$src1),
                     (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))),
@@ -2255,48 +2968,18 @@
 defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
                       i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
 
-let ExeDomain = SSEPackedSingle in
-defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
-                      memopv16f32, i512mem, v16f32>, TAPD, EVEX_V512,
-                      EVEX_CD8<32, CD8VF>;
-let ExeDomain = SSEPackedDouble in
-defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
-                      memopv8f64, i512mem, v8f64>, TAPD, EVEX_V512,
-                      VEX_W, EVEX_CD8<32, CD8VF>;
-
-def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
-          (VPERMILPSZri VR512:$src1, imm:$imm)>;
-def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
-          (VPERMILPDZri VR512:$src1, imm:$imm)>;
-
 //===----------------------------------------------------------------------===//
 // AVX-512  Logical Instructions
 //===----------------------------------------------------------------------===//
 
-defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VK16WM, VR512, memopv16i32,
-                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
-                      EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VK8WM, VR512, memopv8i64,
-                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
-                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPORDZ  : avx512_binop_rm<0xEB, "vpord", or, v16i32, VK16WM, VR512, memopv16i32,
-                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
-                      EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPORQZ  : avx512_binop_rm<0xEB, "vporq", or, v8i64, VK8WM, VR512, memopv8i64,
-                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
-                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VK16WM, VR512, memopv16i32,
-                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
-                      EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VK8WM, VR512, memopv8i64,
-                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
-                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VK16WM, VR512,
-                      memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
-                      SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VK8WM, VR512,
-                      memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
-                      SSE_BIT_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+                                  SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+                                  SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+                                  SSE_INTALU_ITINS_P, HasAVX512, 1>;
+defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+    SSE_INTALU_ITINS_P, HasAVX512, 0>; // ~src1 & src2 is not commutable
 
 //===----------------------------------------------------------------------===//
 // AVX-512  FP arithmetic
@@ -2324,118 +3007,58 @@
 }
 
 multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                           RegisterClass KRC,
-                           RegisterClass RC, ValueType vt,
-                           X86MemOperand x86memop, PatFrag mem_frag,
-                           X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
-                           string BrdcstStr,
-                           Domain d, OpndItins itins, bit commutable> {
-  let isCommutable = commutable in {
-    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
-       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
-       EVEX_4V;
-
-    def rrk: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2),
-       !strconcat(OpcodeStr,
-           " \t{$src2, $src1, $dst {${mask}} |$dst {${mask}}, $src1, $src2}"),
-       [], itins.rr, d>, EVEX_4V, EVEX_K;
-
-    def rrkz: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2),
-       !strconcat(OpcodeStr,
-           " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
-       [], itins.rr, d>, EVEX_4V, EVEX_KZ;
-  }
-
+                            X86VectorVTInfo _, bit IsCommutable> {
+  defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+                  "$src2, $src1", "$src1, $src2",
+                  (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V;
   let mayLoad = 1 in {
-    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
-       !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
-          itins.rm, d>, EVEX_4V;
+    defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+                    "$src2, $src1", "$src1, $src2",
+                    (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V;
+    defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                     (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+                     "${src2}"##_.BroadcastStr##", $src1",
+                     "$src1, ${src2}"##_.BroadcastStr,
+                     (OpNode  _.RC:$src1, (_.VT (X86VBroadcast
+                                                (_.ScalarLdFrag addr:$src2))))>,
+                     EVEX_4V, EVEX_B;
+  }//let mayLoad = 1
+}
 
-    def rmb : PI<opc, MRMSrcMem, (outs RC:$dst),
-       (ins RC:$src1, x86scalar_mop:$src2),
-       !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
-           ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
-       [(set RC:$dst, (OpNode RC:$src1, 
-                       (vt (X86VBroadcast (scalar_mfrag addr:$src2)))))],
-       itins.rm, d>, EVEX_4V, EVEX_B;
+multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                             bit IsCommutable = 0> {
+  defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
+                              IsCommutable>, EVEX_V512, PS,
+                              EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
+                              IsCommutable>, EVEX_V512, PD, VEX_W,
+                              EVEX_CD8<64, CD8VF>;
 
-    def rmk : PI<opc, MRMSrcMem, (outs RC:$dst),
-       (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr,
-           "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
-       [], itins.rm, d>, EVEX_4V, EVEX_K;
-
-    def rmkz : PI<opc, MRMSrcMem, (outs RC:$dst),
-       (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr,
-           "\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"),
-       [], itins.rm, d>, EVEX_4V, EVEX_KZ;
-
-    def rmbk : PI<opc, MRMSrcMem, (outs RC:$dst),
-       (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr,
-           " \t{${src2}", BrdcstStr,
-           ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", BrdcstStr, "}"),
-       [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_K;
-
-    def rmbkz : PI<opc, MRMSrcMem, (outs RC:$dst),
-       (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr,
-           " \t{${src2}", BrdcstStr,
-           ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}",
-           BrdcstStr, "}"),
-       [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_KZ;
+  // Define the 128/256-bit forms only if the AVX512VL feature is present.
+  let Predicates = [HasVLX] in {
+    defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
+                                   IsCommutable>, EVEX_V128, PS,
+                                   EVEX_CD8<32, CD8VF>;
+    defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
+                                   IsCommutable>, EVEX_V256, PS,
+                                   EVEX_CD8<32, CD8VF>;
+    defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
+                                   IsCommutable>, EVEX_V128, PD, VEX_W,
+                                   EVEX_CD8<64, CD8VF>;
+    defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
+                                   IsCommutable>, EVEX_V256, PD, VEX_W,
+                                   EVEX_CD8<64, CD8VF>;
   }
 }
 
-defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VK16WM, VR512, v16f32, f512mem,
-                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, 
-                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-                   
-defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VK8WM, VR512, v8f64, f512mem,
-                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
-                   SSE_ALU_ITINS_P.d, 1>,
-                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VK16WM, VR512, v16f32, f512mem,
-                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
-                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VK8WM, VR512, v8f64, f512mem,
-                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
-                   SSE_ALU_ITINS_P.d, 1>,
-                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VK16WM, VR512, v16f32, f512mem,
-                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
-                   SSE_ALU_ITINS_P.s, 1>,
-                   EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VK16WM, VR512, v16f32, f512mem,
-                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
-                   SSE_ALU_ITINS_P.s, 1>,
-                   EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-
-defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VK8WM, VR512, v8f64, f512mem,
-                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
-                   SSE_ALU_ITINS_P.d, 1>,
-                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VK8WM, VR512, v8f64, f512mem,
-                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
-                   SSE_ALU_ITINS_P.d, 1>,
-                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-
-defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VK16WM, VR512, v16f32, f512mem,
-                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
-                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VK16WM, VR512, v16f32, f512mem,
-                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
-                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
-
-defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VK8WM, VR512, v8f64, f512mem,
-                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
-                   SSE_ALU_ITINS_P.d, 0>, 
-                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VK8WM, VR512, v8f64, f512mem,
-                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
-                   SSE_ALU_ITINS_P.d, 0>, 
-                   EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>;
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>;
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin>; // operand-order sensitive
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax>; // operand-order sensitive
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>;
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>;
 
 def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1),
                    (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)),
@@ -2502,29 +3125,17 @@
 // AVX-512  Shift instructions
 //===----------------------------------------------------------------------===//
 multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
-                         string OpcodeStr, SDNode OpNode, RegisterClass RC,
-                         ValueType vt, X86MemOperand x86memop, PatFrag mem_frag,
-                         RegisterClass KRC> {
-  def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
-       (ins RC:$src1, i8imm:$src2),
-           !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-       [(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))],
-        SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
-  def rik : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
-       (ins KRC:$mask, RC:$src1, i8imm:$src2),
-           !strconcat(OpcodeStr,
-                " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
-       [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
-  def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
-       (ins x86memop:$src1, i8imm:$src2),
-           !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-       [(set RC:$dst, (OpNode (mem_frag addr:$src1),
-                     (i8 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
-  def mik: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
-       (ins KRC:$mask, x86memop:$src1, i8imm:$src2),
-           !strconcat(OpcodeStr,
-                " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
-       [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+                         string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { 
+  defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), // reg, imm8
+                   (ins _.RC:$src1, i8imm:$src2), OpcodeStr,
+                      "$src2, $src1", "$src1, $src2", // the two operand orders of the old "{a|b}" asm string
+                   (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
+                   " ",  SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V;
+  defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), // mem, imm8
+                   (ins _.MemOp:$src1, i8imm:$src2), OpcodeStr,
+                       "$src2, $src1", "$src1, $src2",
+                   (_.VT (OpNode (_.MemOpFrag addr:$src1), (i8 imm:$src2))),
+                   " ",  SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V;
 }
 
 multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -2555,42 +3166,42 @@
 }
 
 defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli,
-                           VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+                           v16i32_info>,
                            EVEX_V512, EVEX_CD8<32, CD8VF>;
 defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl,
                            VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
                            EVEX_CD8<32, CD8VQ>;
                            
 defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli,
-                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           v8i64_info>, EVEX_V512,
                            EVEX_CD8<64, CD8VF>, VEX_W;
 defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl,
                            VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
                            EVEX_CD8<64, CD8VQ>, VEX_W;
 
 defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli,
-                           VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512,
+                           v16i32_info>, EVEX_V512,
                            EVEX_CD8<32, CD8VF>;
 defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl,
                            VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
                            EVEX_CD8<32, CD8VQ>;
                            
 defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli,
-                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           v8i64_info>, EVEX_V512,
                            EVEX_CD8<64, CD8VF>, VEX_W;
 defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl,
                            VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
                            EVEX_CD8<64, CD8VQ>, VEX_W;
 
 defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
-                           VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+                           v16i32_info>,
                            EVEX_V512, EVEX_CD8<32, CD8VF>;
 defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra,
                            VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
                            EVEX_CD8<32, CD8VQ>;
                            
 defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
-                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           v8i64_info>, EVEX_V512,
                            EVEX_CD8<64, CD8VF>, VEX_W;
 defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra,
                            VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
@@ -2713,155 +3324,133 @@
 //===----------------------------------------------------------------------===//
 // FMA - Fused Multiply Operations
 //
+
 let Constraints = "$src1 = $dst" in {
-multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr,
-            RegisterClass RC, X86MemOperand x86memop,
-            PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
-            string BrdcstStr, SDNode OpNode, ValueType OpVT> {
-  def r: AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
-          (ins RC:$src1, RC:$src2, RC:$src3),
-          !strconcat(OpcodeStr," \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-          [(set RC:$dst, (OpVT(OpNode RC:$src1, RC:$src2, RC:$src3)))]>;
+// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching.
+multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                           SDPatternOperator OpNode = null_frag> {
+  defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), // reg form
+          (ins _.RC:$src2, _.RC:$src3), // $src1 is absent here yet used in the pattern; presumably AVX512_maskable_3src adds it - confirm
+          OpcodeStr, "$src3, $src2", "$src2, $src3",
+          (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+         AVX512FMA3Base;
 
   let mayLoad = 1 in
-  def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
-          (ins RC:$src1, RC:$src2, x86memop:$src3),
+  def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), // full-vector memory form
+          (ins _.RC:$src1, _.RC:$src2, _.MemOp:$src3),
           !strconcat(OpcodeStr, " \t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-          [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
-                                               (mem_frag addr:$src3))))]>;
-   def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
-           (ins RC:$src1, RC:$src2, x86scalar_mop:$src3),
-           !strconcat(OpcodeStr, " \t{${src3}", BrdcstStr,
-            ", $src2, $dst|$dst, $src2, ${src3}", BrdcstStr, "}"),
-           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
-           (OpVT (X86VBroadcast (scalar_mfrag addr:$src3)))))]>, EVEX_B;
+          [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2,
+                                               (_.MemOpFrag addr:$src3))))]>;
+   def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), // broadcast memory form
+           (ins _.RC:$src1, _.RC:$src2, _.ScalarMemOp:$src3), // NOTE(review): the brace-less 'let mayLoad = 1 in' above covers only 'm'; confirm 'mb' still gets mayLoad when OpNode is null_frag
+           !strconcat(OpcodeStr, " \t{${src3}", _.BroadcastStr,
+            ", $src2, $dst|$dst, $src2, ${src3}", _.BroadcastStr, "}"),
+           [(set _.RC:$dst, (OpNode _.RC:$src1, _.RC:$src2,
+           (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))]>, EVEX_B;
 }
 } // Constraints = "$src1 = $dst"
 
+multiclass avx512_fma3p_forms<bits<8> opc213, bits<8> opc231,
+                              string OpcodeStr, X86VectorVTInfo VTI,
+                              SDPatternOperator OpNode> { // 213 + 231 forms
+  defm v213 : avx512_fma3p_rm<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix),
+                              VTI, OpNode>,
+              EVEX_V512, EVEX_CD8<VTI.EltSize, CD8VF>;
+  // The 231 form omits OpNode, so avx512_fma3p_rm's null_frag default applies.
+  defm v231 : avx512_fma3p_rm<opc231, !strconcat(OpcodeStr, "231", VTI.Suffix),
+                              VTI>,
+              EVEX_V512, EVEX_CD8<VTI.EltSize, CD8VF>;
+}
+
 let ExeDomain = SSEPackedSingle in {
-  defm VFMADD213PSZ    : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fmadd, v16f32>, EVEX_V512,
-                                    EVEX_CD8<32, CD8VF>;
-  defm VFMSUB213PSZ    : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fmsub, v16f32>, EVEX_V512,
-                                    EVEX_CD8<32, CD8VF>;
-  defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fmaddsub, v16f32>,
-                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fmsubadd, v16f32>,
-                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm VFNMADD213PSZ   : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fnmadd, v16f32>, EVEX_V512,
-                                    EVEX_CD8<32, CD8VF>;
-  defm VFNMSUB213PSZ   : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fnmsub, v16f32>, EVEX_V512,
-                                    EVEX_CD8<32, CD8VF>;
+  defm VFMADDPSZ    : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd",
+                                         v16f32_info, X86Fmadd>;
+  defm VFMSUBPSZ    : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub",
+                                         v16f32_info, X86Fmsub>;
+  defm VFMADDSUBPSZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub",
+                                         v16f32_info, X86Fmaddsub>;
+  defm VFMSUBADDPSZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd",
+                                         v16f32_info, X86Fmsubadd>;
+  defm VFNMADDPSZ   : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd",
+                                         v16f32_info, X86Fnmadd>;
+  defm VFNMSUBPSZ   : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub",
+                                         v16f32_info, X86Fnmsub>;
 }
 let ExeDomain = SSEPackedDouble in {
-  defm VFMADD213PDZ    : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem,
-                                    memopv8f64, f64mem, loadf64, "{1to8}",
-                                    X86Fmadd, v8f64>, EVEX_V512,
-                                    VEX_W, EVEX_CD8<64, CD8VF>;
-  defm VFMSUB213PDZ    : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem,
-                                    memopv8f64, f64mem, loadf64, "{1to8}",
-                                    X86Fmsub, v8f64>, EVEX_V512, VEX_W,
-                                    EVEX_CD8<64, CD8VF>;
-  defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem,
-                                    memopv8f64, f64mem, loadf64, "{1to8}",
-                                    X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
-                                    EVEX_CD8<64, CD8VF>;
-  defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem,
-                                    memopv8f64, f64mem, loadf64, "{1to8}",
-                                    X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
-                                    EVEX_CD8<64, CD8VF>;
-  defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem,
-                                  memopv8f64, f64mem, loadf64, "{1to8}",
-                                  X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
-                                  EVEX_CD8<64, CD8VF>;
-  defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem,
-                                  memopv8f64, f64mem, loadf64, "{1to8}",
-                                  X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
-                                  EVEX_CD8<64, CD8VF>;
+  defm VFMADDPDZ    : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd",
+                                         v8f64_info, X86Fmadd>, VEX_W;
+  defm VFMSUBPDZ    : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub",
+                                         v8f64_info, X86Fmsub>, VEX_W;
+  defm VFMADDSUBPDZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub",
+                                         v8f64_info, X86Fmaddsub>, VEX_W;
+  defm VFMSUBADDPDZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd",
+                                         v8f64_info, X86Fmsubadd>, VEX_W;
+  defm VFNMADDPDZ :   avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd",
+                                         v8f64_info, X86Fnmadd>, VEX_W;
+  defm VFNMSUBPDZ :   avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub",
+                                         v8f64_info, X86Fnmsub>, VEX_W;
 }
 
 let Constraints = "$src1 = $dst" in {
-multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr,
-            RegisterClass RC, X86MemOperand x86memop,
-            PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
-            string BrdcstStr, SDNode OpNode, ValueType OpVT> {
+multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                             X86VectorVTInfo _> { // memory-only 132 forms: m, mb
   let mayLoad = 1 in
-  def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
-          (ins RC:$src1, RC:$src3, x86memop:$src2),
+  def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), // full-vector load
+          (ins _.RC:$src1, _.RC:$src3, _.MemOp:$src2), // the load is $src2, the middle OpNode operand
           !strconcat(OpcodeStr, " \t{$src2, $src3, $dst|$dst, $src3, $src2}"),
-          [(set RC:$dst, (OpVT (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3)))]>;
-   def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
-           (ins RC:$src1, RC:$src3, x86scalar_mop:$src2),
-           !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr,
-            ", $src3, $dst|$dst, $src3, ${src2}", BrdcstStr, "}"),
-           [(set RC:$dst, (OpNode RC:$src1, 
-           (OpVT (X86VBroadcast (scalar_mfrag addr:$src2))), RC:$src3))]>, EVEX_B;
+          [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.MemOpFrag addr:$src2),
+                                                    _.RC:$src3)))]>;
+   def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), // broadcast load
+           (ins _.RC:$src1, _.RC:$src3, _.ScalarMemOp:$src2),
+           !strconcat(OpcodeStr, " \t{${src2}", _.BroadcastStr,
+            ", $src3, $dst|$dst, $src3, ${src2}", _.BroadcastStr, "}"),
+           [(set _.RC:$dst,
+               (OpNode _.RC:$src1, (_.VT (X86VBroadcast
+                                            (_.ScalarLdFrag addr:$src2))),
+                                   _.RC:$src3))]>, EVEX_B;
 }
 } // Constraints = "$src1 = $dst"
 
 
 let ExeDomain = SSEPackedSingle in {
-  defm VFMADD132PSZ    : avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fmadd, v16f32>, EVEX_V512,
-                                    EVEX_CD8<32, CD8VF>;
-  defm VFMSUB132PSZ    : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fmsub, v16f32>, EVEX_V512,
-                                    EVEX_CD8<32, CD8VF>;
-  defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fmaddsub, v16f32>,
-                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fmsubadd, v16f32>,
-                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
-  defm VFNMADD132PSZ   : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fnmadd, v16f32>, EVEX_V512,
-                                    EVEX_CD8<32, CD8VF>;
-  defm VFNMSUB132PSZ   : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem,
-                                    memopv16f32, f32mem, loadf32, "{1to16}",
-                                    X86Fnmsub, v16f32>, EVEX_V512,
-                                    EVEX_CD8<32, CD8VF>;
+  defm VFMADD132PSZ    : avx512_fma3p_m132<0x98, "vfmadd132ps", X86Fmadd,
+                                           v16f32_info>,
+                         EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFMSUB132PSZ    : avx512_fma3p_m132<0x9A, "vfmsub132ps", X86Fmsub,
+                                           v16f32_info>,
+                         EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", X86Fmaddsub,
+                                           v16f32_info>,
+                         EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", X86Fmsubadd,
+                                           v16f32_info>,
+                         EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFNMADD132PSZ   : avx512_fma3p_m132<0x9C, "vfnmadd132ps", X86Fnmadd,
+                                           v16f32_info>,
+                         EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFNMSUB132PSZ   : avx512_fma3p_m132<0x9E, "vfnmsub132ps", X86Fnmsub,
+                                           v16f32_info>,
+                         EVEX_V512, EVEX_CD8<32, CD8VF>;
 }
 let ExeDomain = SSEPackedDouble in {
-  defm VFMADD132PDZ    : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem,
-                                    memopv8f64, f64mem, loadf64, "{1to8}",
-                                    X86Fmadd, v8f64>, EVEX_V512,
-                                    VEX_W, EVEX_CD8<64, CD8VF>;
-  defm VFMSUB132PDZ    : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem,
-                                    memopv8f64, f64mem, loadf64, "{1to8}",
-                                    X86Fmsub, v8f64>, EVEX_V512, VEX_W,
-                                    EVEX_CD8<64, CD8VF>;
-  defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem,
-                                    memopv8f64, f64mem, loadf64, "{1to8}",
-                                    X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
-                                    EVEX_CD8<64, CD8VF>;
-  defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem,
-                                    memopv8f64, f64mem, loadf64, "{1to8}",
-                                    X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
-                                    EVEX_CD8<64, CD8VF>;
-  defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem,
-                                  memopv8f64, f64mem, loadf64, "{1to8}",
-                                  X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
-                                  EVEX_CD8<64, CD8VF>;
-  defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem,
-                                  memopv8f64, f64mem, loadf64, "{1to8}",
-                                  X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
-                                  EVEX_CD8<64, CD8VF>;
+  defm VFMADD132PDZ    : avx512_fma3p_m132<0x98, "vfmadd132pd", X86Fmadd,
+                                           v8f64_info>,
+                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFMSUB132PDZ    : avx512_fma3p_m132<0x9A, "vfmsub132pd", X86Fmsub,
+                                           v8f64_info>,
+                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", X86Fmaddsub,
+                                           v8f64_info>,
+                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", X86Fmsubadd,
+                                           v8f64_info>,
+                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFNMADD132PDZ :   avx512_fma3p_m132<0x9C, "vfnmadd132pd", X86Fnmadd,
+                                           v8f64_info>,
+                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFNMSUB132PDZ :   avx512_fma3p_m132<0x9E, "vfnmsub132pd", X86Fnmsub,
+                                           v8f64_info>,
+                         EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
 }
 
 // Scalar FMA
@@ -3482,26 +4071,49 @@
 
 /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
 multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                         RegisterClass RC, X86MemOperand x86memop,
-                         PatFrag mem_frag, ValueType OpVt> {
-  def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                        !strconcat(OpcodeStr,
-                                   " \t{$src, $dst|$dst, $src}"),
-                        [(set RC:$dst, (OpVt (OpNode RC:$src)))]>,
-                        EVEX;
-  def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-                        !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
-                        [(set RC:$dst, (OpVt (OpNode (mem_frag addr:$src))))]>,
-                        EVEX;
+                         X86VectorVTInfo _> { // emits r, m and mb forms
+  defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), // register
+                         (ins _.RC:$src), OpcodeStr, "$src", "$src",
+                         (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
+  let mayLoad = 1 in { // braces: applies to both memory forms below
+    defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // vector load
+                           (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+                           (OpNode (_.FloatVT
+                             (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD;
+    defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // broadcast
+                            (ins _.ScalarMemOp:$src), OpcodeStr,
+                            "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+                            (OpNode (_.FloatVT
+                              (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+                            EVEX, T8PD, EVEX_B;
+  }
 }
-defm VRSQRT14PSZ : avx512_fp14_p<0x4E, "vrsqrt14ps", X86frsqrt, VR512, f512mem,
-                        memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VRSQRT14PDZ : avx512_fp14_p<0x4E, "vrsqrt14pd", X86frsqrt, VR512, f512mem,
-                        memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-defm VRCP14PSZ : avx512_fp14_p<0x4C, "vrcp14ps", X86frcp, VR512, f512mem,
-                        memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VRCP14PDZ : avx512_fp14_p<0x4C, "vrcp14pd", X86frcp, VR512, f512mem,
-                        memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>,
+                          EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>,
+                          EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+  // PSZ/PDZ above are the EVEX_V512 forms; EVEX_V128/EVEX_V256 forms follow.
+  // Define only if AVX512VL feature is present.
+  let Predicates = [HasVLX] in {
+    defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+                                OpNode, v4f32x_info>,
+                               EVEX_V128, EVEX_CD8<32, CD8VF>;
+    defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
+                                OpNode, v8f32x_info>,
+                               EVEX_V256, EVEX_CD8<32, CD8VF>;
+    defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+                                OpNode, v2f64x_info>,
+                               EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; // pd: VEX_W + 64-bit CD8
+    defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
+                                OpNode, v4f64x_info>,
+                               EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
+  }
+}
+
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
 
 def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src),
               (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
@@ -3573,93 +4185,63 @@
                        (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
 
 /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
-multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr,
-                         RegisterClass RC, X86MemOperand x86memop> {
-  let hasSideEffects = 0, Predicates = [HasERI] in {
-  def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                        !strconcat(OpcodeStr,
-                                   " \t{$src, $dst|$dst, $src}"),
-                        []>, EVEX;
-  def rb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                        !strconcat(OpcodeStr,
-                                   " \t{{sae}, $src, $dst|$dst, $src, {sae}}"),
-                        []>, EVEX, EVEX_B;
-  def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-                        !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
-                        []>, EVEX;
-  }
+// Packed op taking an i32 rounding-mode operand: r, rb ({sae}), m, mb forms.
+multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         SDNode OpNode> {
+  // Register source, FROUND_CURRENT.
+  defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                         (ins _.RC:$src), OpcodeStr, "$src", "$src",
+                         (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
+  // Register source with FROUND_NO_EXC; tagged with "{sae}" and EVEX_B.
+  defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                        (ins _.RC:$src), OpcodeStr,
+                        "$src", "$src",
+                        (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;
+  // Full-vector memory source.
+  defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+                         (OpNode (_.FloatVT
+                             (bitconvert (_.LdFrag addr:$src))), (i32 FROUND_CURRENT))>;
+  // Broadcast memory source: scalar operand plus broadcast suffix in the asm.
+  defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                (ins _.ScalarMemOp:$src), OpcodeStr,
+                "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+                (OpNode (_.FloatVT (X86VBroadcast (_.ScalarLdFrag addr:$src))),
+                        (i32 FROUND_CURRENT))>, EVEX_B;
 }
-defm VRSQRT28PSZ : avx512_fp28_p<0xCC, "vrsqrt28ps", VR512, f512mem>,
-                        EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VRSQRT28PDZ : avx512_fp28_p<0xCC, "vrsqrt28pd", VR512, f512mem>,
-                        VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-defm VRCP28PSZ : avx512_fp28_p<0xCA, "vrcp28ps", VR512, f512mem>,
-                        EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VRCP28PDZ : avx512_fp28_p<0xCA, "vrcp28pd", VR512, f512mem>,
-                        VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
 
-def : Pat <(v16f32 (int_x86_avx512_rsqrt28_ps (v16f32 VR512:$src),
-              (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
-           (VRSQRT28PSZrb VR512:$src)>;
-def : Pat <(v8f64 (int_x86_avx512_rsqrt28_pd (v8f64 VR512:$src),
-              (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
-           (VRSQRT28PDZrb VR512:$src)>;
+multiclass  avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> { // PS + PD
+   defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
+                     EVEX_CD8<32, CD8VF>; // 32-bit elements (v16f32)
+   defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
+                     VEX_W, EVEX_CD8<64, CD8VF>; // 64-bit elements (v8f64)
+}
 
-def : Pat <(v16f32 (int_x86_avx512_rcp28_ps (v16f32 VR512:$src),
-              (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
-           (VRCP28PSZrb VR512:$src)>;
-def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src),
-              (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
-           (VRCP28PDZrb VR512:$src)>;
+let Predicates = [HasERI], hasSideEffects = 0 in {
+  // Each defm expands avx512_eri into its PS and PD variants, all EVEX_V512.
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD;
+ defm VRCP28   : avx512_eri<0xCA, "vrcp28",   X86rcp28>,   EVEX, EVEX_V512, T8PD;
+ defm VEXP2    : avx512_eri<0xC8, "vexp2",    X86exp2>,    EVEX, EVEX_V512, T8PD;
+}
 
-multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                              Intrinsic V16F32Int, Intrinsic V8F64Int,
-                              OpndItins itins_s, OpndItins itins_d> {
-  def PSZrr :AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
-             !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-             [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))], itins_s.rr>,
-             EVEX, EVEX_V512;
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
+                              SDNode OpNode, X86VectorVTInfo _>{ // r, m, mb
+  defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), // register
+                         (ins _.RC:$src), OpcodeStr, "$src", "$src",
+                         (_.FloatVT (OpNode _.RC:$src))>, EVEX;
+  let mayLoad = 1 in { // braces: applies to both memory forms below
+    defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // vector load
+                           (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+                           (OpNode (_.FloatVT
+                             (bitconvert (_.LdFrag addr:$src))))>, EVEX;
 
-  let mayLoad = 1 in
-  def PSZrm : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
-              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-              [(set VR512:$dst, 
-                (OpNode (v16f32 (bitconvert (memopv16f32 addr:$src)))))],
-              itins_s.rm>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
-
-  def PDZrr : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
-              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-              [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))], itins_d.rr>,
-              EVEX, EVEX_V512;
-
-  let mayLoad = 1 in
-    def PDZrm : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
-                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-                [(set VR512:$dst, (OpNode
-                  (v8f64 (bitconvert (memopv16f32 addr:$src)))))],
-                itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
-let isCodeGenOnly = 1 in {
-  def PSZr_Int : AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
-                           !strconcat(OpcodeStr,
-                                      "ps\t{$src, $dst|$dst, $src}"),
-                           [(set VR512:$dst, (V16F32Int VR512:$src))]>, 
-                           EVEX, EVEX_V512;
-  def PSZm_Int : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
-                          !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
-                          [(set VR512:$dst, 
-                           (V16F32Int (memopv16f32 addr:$src)))]>, EVEX,
-                          EVEX_V512, EVEX_CD8<32, CD8VF>;
-  def PDZr_Int : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
-                           !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
-                           [(set VR512:$dst, (V8F64Int VR512:$src))]>, 
-                           EVEX, EVEX_V512, VEX_W;
-  def PDZm_Int : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
-                         !strconcat(OpcodeStr,
-                         "pd\t{$src, $dst|$dst, $src}"),
-                         [(set VR512:$dst, (V8F64Int (memopv8f64 addr:$src)))]>,
-                         EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; 
-} // isCodeGenOnly = 1
+    defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), // broadcast
+                            (ins _.ScalarMemOp:$src), OpcodeStr,
+                            "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+                            (OpNode (_.FloatVT
+                              (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
+                            EVEX, EVEX_B;
+  }
 }
 
 multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
@@ -3723,15 +4305,45 @@
   }
 }
 
+multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode> {
+  defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+                                v16f32_info>,
+                                EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
+  defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+                                v8f64_info>,
+                                EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+  // Define only if AVX512VL feature is present.
+  let Predicates = [HasVLX] in {
+    defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+                                     OpNode, v4f32x_info>,
+                                     EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
+    defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
+                                     OpNode, v8f32x_info>,
+                                     EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
+    defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+                                     OpNode, v2f64x_info>,
+                                     EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+    defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
+                                     OpNode, v4f64x_info>,
+                                     EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>;
+  }
+}
+
+defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>;
 
 defm VSQRT  : avx512_sqrt_scalar<0x51, "sqrt", 
                 int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, 
-                SSE_SQRTSS, SSE_SQRTSD>,
-              avx512_sqrt_packed<0x51, "vsqrt", fsqrt,
-                int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512,
-                SSE_SQRTPS, SSE_SQRTPD>;
+                SSE_SQRTSS, SSE_SQRTSD>;
 
 let Predicates = [HasAVX512] in {
+  def : Pat<(v16f32 (int_x86_avx512_sqrt_ps_512 (v16f32 VR512:$src1),
+                    (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_CURRENT)),
+                   (VSQRTPSZr VR512:$src1)>;
+  def : Pat<(v8f64 (int_x86_avx512_sqrt_pd_512 (v8f64 VR512:$src1),
+                    (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_CURRENT)),
+                   (VSQRTPDZr VR512:$src1)>;
+  
   def : Pat<(f32 (fsqrt FR32X:$src)),
             (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
   def : Pat<(f32 (fsqrt (load addr:$src))),
@@ -4301,33 +4913,29 @@
                             (memopv8i64 addr:$src2), (i8 imm:$imm))),
           (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
 
-multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
-                       X86MemOperand x86memop> {
-  def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
-                     (ins RC:$src1, RC:$src2, i8imm:$src3),
-                     !strconcat(OpcodeStr,
-                     " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                     []>, EVEX_4V;
+multiclass avx512_valign<X86VectorVTInfo _> {
+  defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst),
+                     (ins _.RC:$src1, _.RC:$src2, i8imm:$src3),
+                     "valign"##_.Suffix,
+                     "$src3, $src2, $src1", "$src1, $src2, $src3",
+                     (_.VT (X86VAlign _.RC:$src2, _.RC:$src1,
+                                      (i8 imm:$src3)))>,
+             AVX512AIi8Base, EVEX_4V;
+
+  // Also match valign of packed floats.
+  def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
+            (!cast<Instruction>(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>;
+
   let mayLoad = 1 in
-  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
-                     (ins RC:$src1, x86memop:$src2, i8imm:$src3),
-                     !strconcat(OpcodeStr,
-                     " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst),
+                     (ins _.RC:$src1, _.MemOp:$src2, i8imm:$src3),
+                     !strconcat("valign"##_.Suffix,
+                     " \t{$src3, $src2, $src1, $dst|"
+                         "$dst, $src1, $src2, $src3}"),
                      []>, EVEX_4V;
 }
-defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>, 
-                 EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>, 
-                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
-def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
-          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
-def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
-          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
-def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
-          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
-def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
-          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+defm VALIGND : avx512_valign<v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ : avx512_valign<v8i64_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
 
 // Helper fragments to match sext vXi1 to vXiY.
 def v16i1sextv16i32  : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
@@ -4525,3 +5133,32 @@
 def : Pat<(truncstorei1 GR8:$src, addr:$dst),
           (MOV8mr addr:$dst, GR8:$src)>;
 
+multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
+def rr : AVX512XS8I<opc, MRMDestReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
+                  !strconcat(OpcodeStr##Vec.Suffix, " \t{$src, $dst|$dst, $src}"),
+                  [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
+}
+          
+multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
+                                 string OpcodeStr, Predicate prd> {
+let Predicates = [prd] in
+  defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512;
+
+  let Predicates = [prd, HasVLX] in {
+    defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
+    defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
+  }
+}
+
+multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
+  defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info,  OpcodeStr,
+                                       HasBWI>;
+  defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr,
+                                       HasBWI>, VEX_W;
+  defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr,
+                                       HasDQI>;
+  defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr,
+                                       HasDQI>, VEX_W;
+}
+          
+defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;

diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index f2574cc..25e1e80 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td

@@ -1355,49 +1355,57 @@
 //===----------------------------------------------------------------------===//
 // ADCX Instruction
 //
-let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS],
+    Constraints = "$src0 = $dst", AddedComplexity = 10 in {
   let SchedRW = [WriteALU] in {
-  def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
-             "adcx{l}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_NONMEM>, T8PD;
-
-  def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
-             "adcx{q}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_NONMEM>, T8PD, Requires<[In64BitMode]>;
+  def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst),
+             (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
+             [(set GR32:$dst, EFLAGS,
+                 (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))],
+             IIC_BIN_CARRY_NONMEM>, T8PD, Requires<[HasADX]>;
+  def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst),
+             (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
+             [(set GR64:$dst, EFLAGS,
+                 (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))],
+             IIC_BIN_CARRY_NONMEM>, T8PD, Requires<[HasADX, In64BitMode]>;
   } // SchedRW
 
   let mayLoad = 1, SchedRW = [WriteALULd] in {
-  def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
-             "adcx{l}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_MEM>, T8PD;
+  def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst),
+             (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}",
+             [(set GR32:$dst, EFLAGS,
+                 (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))],
+             IIC_BIN_CARRY_MEM>, T8PD, Requires<[HasADX]>;
 
-  def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
-             "adcx{q}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_MEM>, T8PD, Requires<[In64BitMode]>;
+  def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst),
+             (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}",
+             [(set GR64:$dst, EFLAGS,
+                 (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))],
+             IIC_BIN_CARRY_MEM>, T8PD, Requires<[HasADX, In64BitMode]>;
   }
 }
 
 //===----------------------------------------------------------------------===//
 // ADOX Instruction
 //
-let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+let hasSideEffects = 0, Defs = [EFLAGS], Uses = [EFLAGS] in {
   let SchedRW = [WriteALU] in {
   def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
              "adox{l}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_NONMEM>, T8XS;
+             [], IIC_BIN_NONMEM>, T8XS, Requires<[HasADX]>;
 
   def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
              "adox{q}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_NONMEM>, T8XS, Requires<[In64BitMode]>;
+             [], IIC_BIN_NONMEM>, T8XS, Requires<[HasADX, In64BitMode]>;
   } // SchedRW
 
   let mayLoad = 1, SchedRW = [WriteALULd] in {
   def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
              "adox{l}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_MEM>, T8XS;
+             [], IIC_BIN_MEM>, T8XS, Requires<[HasADX]>;
 
   def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
              "adox{q}\t{$src, $dst|$dst, $src}",
-             [], IIC_BIN_MEM>, T8XS, Requires<[In64BitMode]>;
+             [], IIC_BIN_MEM>, T8XS, Requires<[HasADX, In64BitMode]>;
   }
 }

diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index e421f8c..2056056 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h

@@ -21,8 +21,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86INSTRBUILDER_H
-#define X86INSTRBUILDER_H
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
+#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H
 
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"

diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index ca4f608..117b6ff 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td

@@ -46,11 +46,11 @@
 def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
                            "#ADJCALLSTACKDOWN",
                            [(X86callseq_start timm:$amt)]>,
-                          Requires<[Not64BitMode]>;
+                          Requires<[NotLP64]>;
 def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKUP",
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
-                          Requires<[Not64BitMode]>;
+                          Requires<[NotLP64]>;
 }
 
 // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
@@ -62,11 +62,11 @@
 def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
                            "#ADJCALLSTACKDOWN",
                            [(X86callseq_start timm:$amt)]>,
-                          Requires<[In64BitMode]>;
+                          Requires<[IsLP64]>;
 def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKUP",
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
-                          Requires<[In64BitMode]>;
+                          Requires<[IsLP64]>;
 }
 
 
@@ -118,7 +118,7 @@
                       "# variable sized alloca for segmented stacks",
                       [(set GR32:$dst,
                          (X86SegAlloca GR32:$size))]>,
-                    Requires<[Not64BitMode]>;
+                    Requires<[NotLP64]>;
 
 let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
 def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
@@ -214,6 +214,8 @@
                             "#SEH_PushFrame $mode", []>;
   def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
                             "#SEH_EndPrologue", []>;
+  def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
+                            "#SEH_Epilogue", []>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -407,7 +409,8 @@
 // All calls clobber the non-callee saved registers. ESP is marked as
 // a use to prevent stack-pointer assignments that appear immediately
 // before calls from potentially appearing dead.
-let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
             MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
             XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
             XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
@@ -426,7 +429,8 @@
 // a use to prevent stack-pointer assignments that appear immediately
 // before calls from potentially appearing dead.
 let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
-            FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+            FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
+            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
             MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
             XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
             XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
@@ -747,18 +751,88 @@
                                IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
              TB, LOCK;
 
-def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
-                      "#ACQUIRE_MOV PSEUDO!",
-                      [(set GR8:$dst,  (atomic_load_8  addr:$src))]>;
-def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
-                      "#ACQUIRE_MOV PSEUDO!",
-                      [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
-def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
-                      "#ACQUIRE_MOV PSEUDO!",
-                      [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
-def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
-                      "#ACQUIRE_MOV PSEUDO!",
-                      [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+/* The following multiclass tries to make sure that in code like
+ *    x.store (immediate op x.load(acquire), release)
+ * an operation directly on memory is generated instead of wasting a register.
+ * It is not automatic as atomic_store/load are only lowered to MOV instructions
+ * extremely late to prevent them from being accidentally reordered in the backend
+ * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
+ */
+multiclass RELEASE_BINOP_MI<string op> {
+    def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
+        "#RELEASE_BINOP PSEUDO!",
+        [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
+            (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
+    // NAME#16 is not generated because 16-bit arithmetic instructions are
+    // considered costly and are avoided as far as possible by this backend.
+    def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
+        "#RELEASE_BINOP PSEUDO!",
+        [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
+            (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
+    def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
+        "#RELEASE_BINOP PSEUDO!",
+        [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
+            (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
+}
+defm RELEASE_ADD : RELEASE_BINOP_MI<"add">;
+defm RELEASE_AND : RELEASE_BINOP_MI<"and">;
+defm RELEASE_OR  : RELEASE_BINOP_MI<"or">;
+defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">;
+// Note: we don't deal with sub, because subtractions of constants are
+// optimized into additions before this code can run.
+
+multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
+    def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
+        "#RELEASE_UNOP PSEUDO!",
+        [(atomic_store_8 addr:$dst, dag8)]>;
+    def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
+        "#RELEASE_UNOP PSEUDO!",
+        [(atomic_store_16 addr:$dst, dag16)]>;
+    def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
+        "#RELEASE_UNOP PSEUDO!",
+        [(atomic_store_32 addr:$dst, dag32)]>;
+    def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
+        "#RELEASE_UNOP PSEUDO!",
+        [(atomic_store_64 addr:$dst, dag64)]>;
+}
+
+defm RELEASE_INC : RELEASE_UNOP<
+    (add (atomic_load_8  addr:$dst), (i8 1)),
+    (add (atomic_load_16 addr:$dst), (i16 1)),
+    (add (atomic_load_32 addr:$dst), (i32 1)),
+    (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
+defm RELEASE_DEC : RELEASE_UNOP<
+    (add (atomic_load_8  addr:$dst), (i8 -1)),
+    (add (atomic_load_16 addr:$dst), (i16 -1)),
+    (add (atomic_load_32 addr:$dst), (i32 -1)),
+    (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
+/*
+TODO: These don't work because the type inference of TableGen fails.
+TODO: find a way to fix it.
+defm RELEASE_NEG : RELEASE_UNOP<
+    (ineg (atomic_load_8  addr:$dst)),
+    (ineg (atomic_load_16 addr:$dst)),
+    (ineg (atomic_load_32 addr:$dst)),
+    (ineg (atomic_load_64 addr:$dst))>;
+defm RELEASE_NOT : RELEASE_UNOP<
+    (not (atomic_load_8  addr:$dst)),
+    (not (atomic_load_16 addr:$dst)),
+    (not (atomic_load_32 addr:$dst)),
+    (not (atomic_load_64 addr:$dst))>;
+*/
+
+def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
+			"#RELEASE_MOV PSEUDO !",
+			[(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
+def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
+			"#RELEASE_MOV PSEUDO !",
+			[(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
+def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
+			"#RELEASE_MOV PSEUDO !",
+			[(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
+def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
+			"#RELEASE_MOV PSEUDO !",
+			[(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
 
 def RELEASE_MOV8mr  : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
                         "#RELEASE_MOV PSEUDO!",
@@ -773,11 +847,22 @@
                         "#RELEASE_MOV PSEUDO!",
                         [(atomic_store_64 addr:$dst, GR64:$src)]>;
 
+def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR8:$dst,  (atomic_load_8  addr:$src))]>;
+def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
+def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
+def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
 //===----------------------------------------------------------------------===//
 // Conditional Move Pseudo Instructions.
 //===----------------------------------------------------------------------===//
 
-
 // CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded after
 // instruction selection into a branch sequence.
 let Uses = [EFLAGS], usesCustomInserter = 1 in {
@@ -1106,6 +1191,7 @@
   return N->getOpcode() != ISD::TRUNCATE &&
          N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
          N->getOpcode() != ISD::CopyFromReg &&
+         N->getOpcode() != ISD::AssertSext &&
          N->getOpcode() != X86ISD::CMOV;
 }]>;
 

diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 6be6a1f..b38129a 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td

@@ -97,13 +97,23 @@
 let neverHasSideEffects = 1, isCodeGenOnly = 1 in {
 def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
                          (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
-                         "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
                          [], IIC_MOVZX>, TB, Sched<[WriteALU]>;
 let mayLoad = 1 in
 def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
                          (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
-                         "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
                          [], IIC_MOVZX>, TB, Sched<[WriteALULd]>;
+
+def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg,
+                         (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+                         "movs{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         [], IIC_MOVSX>, TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem,
+                         (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+                         "movs{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         [], IIC_MOVSX>, TB, Sched<[WriteALULd]>;
 }
 
 // MOVSX64rr8 always has a REX prefix and it has an 8-bit register

diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 4ad7b7e..d9f173e 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td

@@ -114,9 +114,6 @@
 // a pattern) and the FPI instruction should have emission info (e.g. opcode
 // encoding and asm printing info).
 
-// Pseudo Instruction for FP stack return values.
-def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>;
-
 // FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
 // f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
 // f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.

diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index cc30266..fe4ead1 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td

@@ -36,20 +36,21 @@
 def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>;
 def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C8 : Format<37>;
 def MRM_C9 : Format<38>; def MRM_CA : Format<39>; def MRM_CB : Format<40>;
-def MRM_D0 : Format<41>; def MRM_D1 : Format<42>; def MRM_D4 : Format<43>;
-def MRM_D5 : Format<44>; def MRM_D6 : Format<45>; def MRM_D8 : Format<46>;
-def MRM_D9 : Format<47>; def MRM_DA : Format<48>; def MRM_DB : Format<49>;
-def MRM_DC : Format<50>; def MRM_DD : Format<51>; def MRM_DE : Format<52>;
-def MRM_DF : Format<53>; def MRM_E0 : Format<54>; def MRM_E1 : Format<55>;
-def MRM_E2 : Format<56>; def MRM_E3 : Format<57>; def MRM_E4 : Format<58>;
-def MRM_E5 : Format<59>; def MRM_E8 : Format<60>; def MRM_E9 : Format<61>;
-def MRM_EA : Format<62>; def MRM_EB : Format<63>; def MRM_EC : Format<64>;
-def MRM_ED : Format<65>; def MRM_EE : Format<66>; def MRM_F0 : Format<67>;
-def MRM_F1 : Format<68>; def MRM_F2 : Format<69>; def MRM_F3 : Format<70>;
-def MRM_F4 : Format<71>; def MRM_F5 : Format<72>; def MRM_F6 : Format<73>;
-def MRM_F7 : Format<74>; def MRM_F8 : Format<75>; def MRM_F9 : Format<76>;
-def MRM_FA : Format<77>; def MRM_FB : Format<78>; def MRM_FC : Format<79>;
-def MRM_FD : Format<80>; def MRM_FE : Format<81>; def MRM_FF : Format<82>;
+def MRM_CF : Format<41>; def MRM_D0 : Format<42>; def MRM_D1 : Format<43>;
+def MRM_D4 : Format<44>; def MRM_D5 : Format<45>; def MRM_D6 : Format<46>;
+def MRM_D7 : Format<47>; def MRM_D8 : Format<48>; def MRM_D9 : Format<49>;
+def MRM_DA : Format<50>; def MRM_DB : Format<51>; def MRM_DC : Format<52>;
+def MRM_DD : Format<53>; def MRM_DE : Format<54>; def MRM_DF : Format<55>;
+def MRM_E0 : Format<56>; def MRM_E1 : Format<57>; def MRM_E2 : Format<58>;
+def MRM_E3 : Format<59>; def MRM_E4 : Format<60>; def MRM_E5 : Format<61>;
+def MRM_E8 : Format<62>; def MRM_E9 : Format<63>; def MRM_EA : Format<64>;
+def MRM_EB : Format<65>; def MRM_EC : Format<66>; def MRM_ED : Format<67>;
+def MRM_EE : Format<68>; def MRM_F0 : Format<69>; def MRM_F1 : Format<70>;
+def MRM_F2 : Format<71>; def MRM_F3 : Format<72>; def MRM_F4 : Format<73>;
+def MRM_F5 : Format<74>; def MRM_F6 : Format<75>; def MRM_F7 : Format<76>;
+def MRM_F8 : Format<77>; def MRM_F9 : Format<78>; def MRM_FA : Format<79>;
+def MRM_FB : Format<80>; def MRM_FC : Format<81>; def MRM_FD : Format<82>;
+def MRM_FE : Format<83>; def MRM_FF : Format<84>;
 
 // ImmType - This specifies the immediate type used by an instruction. This is
 // part of the ad-hoc solution used to emit machine instruction encodings by our
@@ -100,6 +101,7 @@
 def CD8VH  : CD8VForm<1>;  // v := VL/2
 def CD8VQ  : CD8VForm<2>;  // v := VL/4
 def CD8VO  : CD8VForm<3>;  // v := VL/8
+// The tuple (subvector) forms.
 def CD8VT1 : CD8VForm<4>;  // v := 1
 def CD8VT2 : CD8VForm<5>;  // v := 2
 def CD8VT4 : CD8VForm<6>;  // v := 4
@@ -184,13 +186,16 @@
 class EVEX_B { bit hasEVEX_B = 1; }
 class EVEX_RC { bit hasEVEX_RC = 1; }
 class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
+class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
+class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; }
+
+// Specify AVX512 8-bit compressed displacement encoding based on the vector
+// element size in bits (8, 16, 32, 64) and the CDisp8 form.
 class EVEX_CD8<int esize, CD8VForm form> {
-  bits<2> EVEX_CD8E = !if(!eq(esize, 8),  0b00,
-                      !if(!eq(esize, 16), 0b01,
-                      !if(!eq(esize, 32), 0b10,
-                      !if(!eq(esize, 64), 0b11, ?))));
-  bits<3> EVEX_CD8V = form.Value;
+  int CD8_EltSize = !srl(esize, 3);
+  bits<3> CD8_Form = form.Value;
 }
+
 class Has3DNow0F0FOpcode  { bit has3DNow0F0FOpcode = 1; }
 class MemOp4 { bit hasMemOp4Prefix = 1; }
 class XOP { Encoding OpEnc = EncXOP; }
@@ -253,12 +258,32 @@
   bit hasEVEX_Z = 0;        // Does this inst set the EVEX_Z field?
   bit hasEVEX_L2 = 0;       // Does this inst set the EVEX_L2 field?
   bit hasEVEX_B = 0;        // Does this inst set the EVEX_B field?
-  bits<2> EVEX_CD8E = 0;    // Compressed disp8 form - element-size.
-  bits<3> EVEX_CD8V = 0;    // Compressed disp8 form - vector-width.
+  bits<3> CD8_Form = 0;     // Compressed disp8 form - vector-width.
+  // Declare it int rather than bits<4> so that all bits are defined when
+  // assigning to bits<7>.
+  int CD8_EltSize = 0;      // Compressed disp8 form - element-size in bytes.
   bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
   bit hasMemOp4Prefix = 0;  // Same bit as VEX_W, but used for swapping operands
   bit hasEVEX_RC = 0;       // Explicitly specified rounding control in FP instruction.
 
+  bits<2> EVEX_LL;
+  let EVEX_LL{0} = hasVEX_L;
+  let EVEX_LL{1} = hasEVEX_L2;
+  // Vector size in bytes.
+  bits<7> VectSize = !shl(16, EVEX_LL);
+
+  // The scaling factor for AVX512's compressed displacement is either
+  //   - the size of a power-of-two number of elements or
+  //   - the size of a single element for broadcasts or
+  //   - the total vector size divided by a power-of-two number.
+  // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64.
+  bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value),
+                           !if (CD8_Form{2},
+                                !shl(CD8_EltSize, CD8_Form{1-0}),
+                                !if (hasEVEX_B,
+                                     CD8_EltSize,
+                                     !srl(VectSize, CD8_Form{1-0}))), 0);
+
   // TSFlags layout should be kept in sync with X86InstrInfo.h.
   let TSFlags{6-0}   = FormBits;
   let TSFlags{8-7}   = OpSizeBits;
@@ -283,11 +308,11 @@
   let TSFlags{45}    = hasEVEX_Z;
   let TSFlags{46}    = hasEVEX_L2;
   let TSFlags{47}    = hasEVEX_B;
-  let TSFlags{49-48} = EVEX_CD8E;
-  let TSFlags{52-50} = EVEX_CD8V;
-  let TSFlags{53}    = has3DNow0F0FOpcode;
-  let TSFlags{54}    = hasMemOp4Prefix;
-  let TSFlags{55}    = hasEVEX_RC;
+  // If we run out of TSFlags bits, it's possible to encode this in 3 bits.
+  let TSFlags{54-48} = CD8_Scale;
+  let TSFlags{55}    = has3DNow0F0FOpcode;
+  let TSFlags{56}    = hasMemOp4Prefix;
+  let TSFlags{57}    = hasEVEX_RC;
 }
 
 class PseudoI<dag oops, dag iops, list<dag> pattern>
@@ -690,14 +715,25 @@
             list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
         Requires<[HasAVX512]>;
+class AVX512BIBase : PD {
+  Domain ExeDomain = SSEPackedInt;
+}
 class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
               list<dag> pattern, InstrItinClass itin = NoItinerary>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD,
         Requires<[HasAVX512]>;
+class AVX512BIi8Base : PD {
+  Domain ExeDomain = SSEPackedInt;
+  ImmType ImmT = Imm8;
+}
 class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
               list<dag> pattern, InstrItinClass itin = NoItinerary>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
         Requires<[HasAVX512]>;
+class AVX512AIi8Base : TAPD {
+  Domain ExeDomain = SSEPackedInt;
+  ImmType ImmT = Imm8;
+}
 class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
               list<dag> pattern, InstrItinClass itin = NoItinerary>
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
@@ -720,6 +756,11 @@
            list<dag>pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
         EVEX_4V, Requires<[HasAVX512]>;
+class AVX512FMA3Base : T8PD, EVEX_4V;
+
+class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>;
 
 // AES Instruction Templates:
 //

diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 6f0fa94..1c7215c 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td

@@ -83,7 +83,7 @@
                                       SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
 def X86insertps : SDNode<"X86ISD::INSERTPS",
                  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
-                                      SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
+                                      SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>;
 def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
                  SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
 
@@ -188,6 +188,8 @@
 def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                 SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
 
+def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                        SDTCisVec<2>]>;
 def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
                                  SDTCisSameAs<0,1>, SDTCisInt<2>]>;
 def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@@ -197,12 +199,15 @@
 def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
 
 def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-                             SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
+                             SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
 
 def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
                            SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>,
+                           SDTCisVec<0>, SDTCisInt<2>]>;
 
 def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
+def X86VAlign  : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
 
 def X86PShufd  : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
 def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
@@ -231,10 +236,11 @@
 def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
 def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
 
-def X86VPermilp  : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
-def X86VPermv    : SDNode<"X86ISD::VPERMV",   SDTShuff2Op>;
-def X86VPermi    : SDNode<"X86ISD::VPERMI",   SDTShuff2OpI>;
-def X86VPermv3   : SDNode<"X86ISD::VPERMV3",  SDTShuff3Op>;
+def X86VPermilpv  : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
+def X86VPermilpi  : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
+def X86VPermv     : SDNode<"X86ISD::VPERMV",    SDTShuff2Op>;
+def X86VPermi     : SDNode<"X86ISD::VPERMI",    SDTShuff2OpI>;
+def X86VPermv3    : SDNode<"X86ISD::VPERMV3",   SDTShuff3Op>;
 def X86VPermiv3   : SDNode<"X86ISD::VPERMIV3",  SDTShuff3Op>;
 
 def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
@@ -247,6 +253,9 @@
                               [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>;
 
 def X86Blendi    : SDNode<"X86ISD::BLENDI",   SDTBlend>;
+
+def X86Addsub    : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
+
 def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
 def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
 def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFma>;
@@ -254,6 +263,10 @@
 def X86Fmaddsub  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
 def X86Fmsubadd  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
 
+def X86rsqrt28   : SDNode<"X86ISD::RSQRT28",  STDFp1SrcRm>;
+def X86rcp28     : SDNode<"X86ISD::RCP28",    STDFp1SrcRm>;
+def X86exp2      : SDNode<"X86ISD::EXP2",  STDFp1SrcRm>;
+
 def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                          SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
                                          SDTCisVT<4, i8>]>;
@@ -311,6 +324,8 @@
 // 512-bit load pattern fragments
 def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
 def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv64i8    : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>;
+def loadv32i16   : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
 def loadv16i32   : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
 def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
 
@@ -509,7 +524,9 @@
 }]>;
 
 def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>;
-def FROUND_CURRENT : ImmLeaf<i32, [{ return Imm == 4; }]>;
+def FROUND_CURRENT : ImmLeaf<i32, [{
+  return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION;
+}]>;
 
 // BYTE_imm - Transform bit immediates into byte immediates.
 def BYTE_imm  : SDNodeXForm<imm, [{

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 0d3afc4..7f87bdd 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp

@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
@@ -100,8 +101,8 @@
 
 X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     : X86GenInstrInfo(
-          (STI.is64Bit() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
-          (STI.is64Bit() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
+          (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
+          (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
       Subtarget(STI), RI(STI) {
 
   static const X86OpTblEntry OpTbl2Addr[] = {
@@ -377,7 +378,39 @@
     { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
     { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE },
     // AVX-512 foldable instructions
-    { X86::VMOVPDI2DIZrr,X86::VMOVPDI2DIZmr,  TB_FOLDED_STORE }
+    { X86::VMOVPDI2DIZrr,   X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+    { X86::VMOVAPDZrr,      X86::VMOVAPDZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
+    { X86::VMOVAPSZrr,      X86::VMOVAPSZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
+    { X86::VMOVDQA32Zrr,    X86::VMOVDQA32Zmr,  TB_FOLDED_STORE | TB_ALIGN_64 },
+    { X86::VMOVDQA64Zrr,    X86::VMOVDQA64Zmr,  TB_FOLDED_STORE | TB_ALIGN_64 },
+    { X86::VMOVUPDZrr,      X86::VMOVUPDZmr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSZrr,      X86::VMOVUPSZmr,    TB_FOLDED_STORE },
+    { X86::VMOVDQU8Zrr,     X86::VMOVDQU8Zmr,   TB_FOLDED_STORE },
+    { X86::VMOVDQU16Zrr,    X86::VMOVDQU16Zmr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU32Zrr,    X86::VMOVDQU32Zmr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU64Zrr,    X86::VMOVDQU64Zmr,  TB_FOLDED_STORE },
+    // AVX-512 foldable instructions (256-bit versions)
+    { X86::VMOVAPDZ256rr,      X86::VMOVAPDZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVAPSZ256rr,      X86::VMOVAPSZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVDQA32Z256rr,    X86::VMOVDQA32Z256mr,  TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVDQA64Z256rr,    X86::VMOVDQA64Z256mr,  TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVUPDZ256rr,      X86::VMOVUPDZ256mr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSZ256rr,      X86::VMOVUPSZ256mr,    TB_FOLDED_STORE },
+    { X86::VMOVDQU8Z256rr,     X86::VMOVDQU8Z256mr,   TB_FOLDED_STORE },
+    { X86::VMOVDQU16Z256rr,    X86::VMOVDQU16Z256mr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU32Z256rr,    X86::VMOVDQU32Z256mr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU64Z256rr,    X86::VMOVDQU64Z256mr,  TB_FOLDED_STORE },
+    // AVX-512 foldable instructions (128-bit versions)
+    { X86::VMOVAPDZ128rr,      X86::VMOVAPDZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPSZ128rr,      X86::VMOVAPSZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVDQA32Z128rr,    X86::VMOVDQA32Z128mr,  TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVDQA64Z128rr,    X86::VMOVDQA64Z128mr,  TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVUPDZ128rr,      X86::VMOVUPDZ128mr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSZ128rr,      X86::VMOVUPSZ128mr,    TB_FOLDED_STORE },
+    { X86::VMOVDQU8Z128rr,     X86::VMOVDQU8Z128mr,   TB_FOLDED_STORE },
+    { X86::VMOVDQU16Z128rr,    X86::VMOVDQU16Z128mr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU32Z128rr,    X86::VMOVDQU32Z128mr,  TB_FOLDED_STORE },
+    { X86::VMOVDQU64Z128rr,    X86::VMOVDQU64Z128mr,  TB_FOLDED_STORE }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
@@ -415,6 +448,9 @@
     { X86::CVTSD2SIrr,      X86::CVTSD2SIrm,          0 },
     { X86::CVTSS2SI64rr,    X86::CVTSS2SI64rm,        0 },
     { X86::CVTSS2SIrr,      X86::CVTSS2SIrm,          0 },
+    { X86::CVTDQ2PSrr,      X86::CVTDQ2PSrm,          TB_ALIGN_16 },
+    { X86::CVTPD2DQrr,      X86::CVTPD2DQrm,          TB_ALIGN_16 },
+    { X86::CVTPS2DQrr,      X86::CVTPS2DQrm,          TB_ALIGN_16 },
     { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm,         TB_ALIGN_16 },
     { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm,         TB_ALIGN_16 },
     { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm,  0 },
@@ -493,6 +529,11 @@
     { X86::VCVTSD2SIrr,     X86::VCVTSD2SIrm,         0 },
     { X86::VCVTSS2SI64rr,   X86::VCVTSS2SI64rm,       0 },
     { X86::VCVTSS2SIrr,     X86::VCVTSS2SIrm,         0 },
+    { X86::VCVTDQ2PSrr,     X86::VCVTDQ2PSrm,         0 },
+    { X86::VCVTPD2DQrr,     X86::VCVTPD2DQXrm,        0 },
+    { X86::VCVTPS2DQrr,     X86::VCVTPS2DQrm,         0 },
+    { X86::VCVTTPD2DQrr,    X86::VCVTTPD2DQXrm,       0 },
+    { X86::VCVTTPS2DQrr,    X86::VCVTTPS2DQrm,        0 },
     { X86::VMOV64toPQIrr,   X86::VMOVQI2PQIrm,        0 },
     { X86::VMOV64toSDrr,    X86::VMOV64toSDrm,        0 },
     { X86::VMOVAPDrr,       X86::VMOVAPDrm,           TB_ALIGN_16 },
@@ -526,6 +567,11 @@
     { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
 
     // AVX 256-bit foldable instructions
+    { X86::VCVTDQ2PSYrr,    X86::VCVTDQ2PSYrm,        0 },
+    { X86::VCVTPD2DQYrr,    X86::VCVTPD2DQYrm,        0 },
+    { X86::VCVTPS2DQYrr,    X86::VCVTPS2DQYrm,        0 },
+    { X86::VCVTTPD2DQYrr,   X86::VCVTTPD2DQYrm,       0 },
+    { X86::VCVTTPS2DQYrr,   X86::VCVTTPS2DQYrm,       0 },
     { X86::VMOVAPDYrr,      X86::VMOVAPDYrm,          TB_ALIGN_32 },
     { X86::VMOVAPSYrr,      X86::VMOVAPSYrm,          TB_ALIGN_32 },
     { X86::VMOVDQAYrr,      X86::VMOVDQAYrm,          TB_ALIGN_32 },
@@ -533,6 +579,13 @@
     { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
     { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        0 },
     { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        0 },
+    { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
+    { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
+    { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
+    { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
+    { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
+    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
 
     // AVX2 foldable instructions
     { X86::VPABSBrr256,     X86::VPABSBrm256,         0 },
@@ -541,13 +594,6 @@
     { X86::VPSHUFDYri,      X86::VPSHUFDYmi,          0 },
     { X86::VPSHUFHWYri,     X86::VPSHUFHWYmi,         0 },
     { X86::VPSHUFLWYri,     X86::VPSHUFLWYmi,         0 },
-    { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
-    { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
-    { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
-    { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
-    { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
-    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
-    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
 
     // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
     { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
@@ -601,18 +647,46 @@
     // AVX-512 foldable instructions
     { X86::VMOV64toPQIZrr,  X86::VMOVQI2PQIZrm,       0 },
     { X86::VMOVDI2SSZrr,    X86::VMOVDI2SSZrm,        0 },
-    { X86::VMOVDQA32rr,     X86::VMOVDQA32rm,         TB_ALIGN_64 },
-    { X86::VMOVDQA64rr,     X86::VMOVDQA64rm,         TB_ALIGN_64 },
-    { X86::VMOVDQU32rr,     X86::VMOVDQU32rm,         0 },
-    { X86::VMOVDQU64rr,     X86::VMOVDQU64rm,         0 },
+    { X86::VMOVAPDZrr,      X86::VMOVAPDZrm,          TB_ALIGN_64 },
+    { X86::VMOVAPSZrr,      X86::VMOVAPSZrm,          TB_ALIGN_64 },
+    { X86::VMOVDQA32Zrr,    X86::VMOVDQA32Zrm,        TB_ALIGN_64 },
+    { X86::VMOVDQA64Zrr,    X86::VMOVDQA64Zrm,        TB_ALIGN_64 },
+    { X86::VMOVDQU8Zrr,     X86::VMOVDQU8Zrm,         0 },
+    { X86::VMOVDQU16Zrr,    X86::VMOVDQU16Zrm,        0 },
+    { X86::VMOVDQU32Zrr,    X86::VMOVDQU32Zrm,        0 },
+    { X86::VMOVDQU64Zrr,    X86::VMOVDQU64Zrm,        0 },
+    { X86::VMOVUPDZrr,      X86::VMOVUPDZrm,          0 },
+    { X86::VMOVUPSZrr,      X86::VMOVUPSZrm,          0 },
     { X86::VPABSDZrr,       X86::VPABSDZrm,           0 },
     { X86::VPABSQZrr,       X86::VPABSQZrm,           0 },
+    // AVX-512 foldable instructions (256-bit versions)
+    { X86::VMOVAPDZ256rr,      X86::VMOVAPDZ256rm,          TB_ALIGN_32 },
+    { X86::VMOVAPSZ256rr,      X86::VMOVAPSZ256rm,          TB_ALIGN_32 },
+    { X86::VMOVDQA32Z256rr,    X86::VMOVDQA32Z256rm,        TB_ALIGN_32 },
+    { X86::VMOVDQA64Z256rr,    X86::VMOVDQA64Z256rm,        TB_ALIGN_32 },
+    { X86::VMOVDQU8Z256rr,     X86::VMOVDQU8Z256rm,         0 },
+    { X86::VMOVDQU16Z256rr,    X86::VMOVDQU16Z256rm,        0 },
+    { X86::VMOVDQU32Z256rr,    X86::VMOVDQU32Z256rm,        0 },
+    { X86::VMOVDQU64Z256rr,    X86::VMOVDQU64Z256rm,        0 },
+    { X86::VMOVUPDZ256rr,      X86::VMOVUPDZ256rm,          0 },
+    { X86::VMOVUPSZ256rr,      X86::VMOVUPSZ256rm,          0 },
+    // AVX-512 foldable instructions (128-bit versions)
+    { X86::VMOVAPDZ128rr,      X86::VMOVAPDZ128rm,          TB_ALIGN_16 },
+    { X86::VMOVAPSZ128rr,      X86::VMOVAPSZ128rm,          TB_ALIGN_16 },
+    { X86::VMOVDQA32Z128rr,    X86::VMOVDQA32Z128rm,        TB_ALIGN_16 },
+    { X86::VMOVDQA64Z128rr,    X86::VMOVDQA64Z128rm,        TB_ALIGN_16 },
+    { X86::VMOVDQU8Z128rr,     X86::VMOVDQU8Z128rm,         0 },
+    { X86::VMOVDQU16Z128rr,    X86::VMOVDQU16Z128rm,        0 },
+    { X86::VMOVDQU32Z128rr,    X86::VMOVDQU32Z128rm,        0 },
+    { X86::VMOVDQU64Z128rr,    X86::VMOVDQU64Z128rm,        0 },
+    { X86::VMOVUPDZ128rr,      X86::VMOVUPDZ128rm,          0 },
+    { X86::VMOVUPSZ128rr,      X86::VMOVUPSZ128rm,          0 },
 
     // AES foldable instructions
     { X86::AESIMCrr,              X86::AESIMCrm,              TB_ALIGN_16 },
     { X86::AESKEYGENASSIST128rr,  X86::AESKEYGENASSIST128rm,  TB_ALIGN_16 },
     { X86::VAESIMCrr,             X86::VAESIMCrm,             TB_ALIGN_16 },
-    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 },
+    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -869,8 +943,6 @@
     { X86::Int_VCVTSI2SSrr,   X86::Int_VCVTSI2SSrm,    0 },
     { X86::VCVTSS2SDrr,       X86::VCVTSS2SDrm,        0 },
     { X86::Int_VCVTSS2SDrr,   X86::Int_VCVTSS2SDrm,    0 },
-    { X86::VCVTTPD2DQrr,      X86::VCVTTPD2DQXrm,      0 },
-    { X86::VCVTTPS2DQrr,      X86::VCVTTPS2DQrm,       0 },
     { X86::VRSQRTSSr,         X86::VRSQRTSSm,          0 },
     { X86::VSQRTSDr,          X86::VSQRTSDm,           0 },
     { X86::VSQRTSSr,          X86::VSQRTSSm,           0 },
@@ -1543,8 +1615,11 @@
   case X86::VMOVAPSrm:
   case X86::VMOVAPDrm:
   case X86::VMOVDQArm:
+  case X86::VMOVUPSYrm:
   case X86::VMOVAPSYrm:
+  case X86::VMOVUPDYrm:
   case X86::VMOVAPDYrm:
+  case X86::VMOVDQUYrm:
   case X86::VMOVDQAYrm:
   case X86::MMX_MOVD64rm:
   case X86::MMX_MOVQ64rm:
@@ -1572,8 +1647,11 @@
   case X86::VMOVAPSmr:
   case X86::VMOVAPDmr:
   case X86::VMOVDQAmr:
+  case X86::VMOVUPSYmr:
   case X86::VMOVAPSYmr:
+  case X86::VMOVUPDYmr:
   case X86::VMOVAPDYmr:
+  case X86::VMOVDQUYmr:
   case X86::VMOVDQAYmr:
   case X86::VMOVUPSZmr:
   case X86::VMOVAPSZmr:
@@ -2078,34 +2156,6 @@
 
   unsigned MIOpc = MI->getOpcode();
   switch (MIOpc) {
-  case X86::SHUFPSrri: {
-    assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
-    if (!Subtarget.hasSSE2()) return nullptr;
-
-    unsigned B = MI->getOperand(1).getReg();
-    unsigned C = MI->getOperand(2).getReg();
-    if (B != C) return nullptr;
-    unsigned M = MI->getOperand(3).getImm();
-    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
-      .addOperand(Dest).addOperand(Src).addImm(M);
-    break;
-  }
-  case X86::SHUFPDrri: {
-    assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
-    if (!Subtarget.hasSSE2()) return nullptr;
-
-    unsigned B = MI->getOperand(1).getReg();
-    unsigned C = MI->getOperand(2).getReg();
-    if (B != C) return nullptr;
-    unsigned M = MI->getOperand(3).getImm();
-
-    // Convert to PSHUFD mask.
-    M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44;
-
-    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
-      .addOperand(Dest).addOperand(Src).addImm(M);
-    break;
-  }
   case X86::SHL64ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
@@ -2387,6 +2437,42 @@
     MI->getOperand(3).setImm(Size-Amt);
     return TargetInstrInfo::commuteInstruction(MI, NewMI);
   }
+  case X86::BLENDPDrri:
+  case X86::BLENDPSrri:
+  case X86::PBLENDWrri:
+  case X86::VBLENDPDrri:
+  case X86::VBLENDPSrri:
+  case X86::VBLENDPDYrri:
+  case X86::VBLENDPSYrri:
+  case X86::VPBLENDDrri:
+  case X86::VPBLENDWrri:
+  case X86::VPBLENDDYrri:
+  case X86::VPBLENDWYrri:{
+    unsigned Mask;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::BLENDPDrri:    Mask = 0x03; break;
+    case X86::BLENDPSrri:    Mask = 0x0F; break;
+    case X86::PBLENDWrri:    Mask = 0xFF; break;
+    case X86::VBLENDPDrri:   Mask = 0x03; break;
+    case X86::VBLENDPSrri:   Mask = 0x0F; break;
+    case X86::VBLENDPDYrri:  Mask = 0x0F; break;
+    case X86::VBLENDPSYrri:  Mask = 0xFF; break;
+    case X86::VPBLENDDrri:   Mask = 0x0F; break;
+    case X86::VPBLENDWrri:   Mask = 0xFF; break;
+    case X86::VPBLENDDYrri:  Mask = 0xFF; break;
+    case X86::VPBLENDWYrri:  Mask = 0xFF; break;
+    }
+    // Only the least significant bits of Imm are used.
+    unsigned Imm = MI->getOperand(3).getImm() & Mask;
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
+    }
+    MI->getOperand(3).setImm(Mask ^ Imm);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
   case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
   case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
   case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
@@ -2471,6 +2557,20 @@
 bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                                          unsigned &SrcOpIdx2) const {
   switch (MI->getOpcode()) {
+    case X86::BLENDPDrri:
+    case X86::BLENDPSrri:
+    case X86::PBLENDWrri:
+    case X86::VBLENDPDrri:
+    case X86::VBLENDPSrri:
+    case X86::VBLENDPDYrri:
+    case X86::VBLENDPSYrri:
+    case X86::VPBLENDDrri:
+    case X86::VPBLENDDYrri:
+    case X86::VPBLENDWrri:
+    case X86::VPBLENDWYrri:
+      SrcOpIdx1 = 1;
+      SrcOpIdx2 = 2;
+      return true;
     case X86::VFMADDPDr231r:
     case X86::VFMADDPSr231r:
     case X86::VFMADDSDr231r:
@@ -3067,6 +3167,8 @@
 inline static bool MaskRegClassContains(unsigned Reg) {
   return X86::VK8RegClass.contains(Reg) ||
          X86::VK16RegClass.contains(Reg) ||
+         X86::VK32RegClass.contains(Reg) ||
+         X86::VK64RegClass.contains(Reg) ||
          X86::VK1RegClass.contains(Reg);
 }
 static
@@ -3143,7 +3245,7 @@
 
   // Moving EFLAGS to / from another register requires a push and a pop.
   // Notice that we have to adjust the stack if we don't want to clobber the
-  // first frame index. See X86FrameLowering.cpp - colobbersTheStack.
+  // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
   if (SrcReg == X86::EFLAGS) {
     if (X86::GR64RegClass.contains(DestReg)) {
       BuildMI(MBB, MI, DL, get(X86::PUSHF64));
@@ -3287,9 +3389,11 @@
   assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
          "Stack slot too small for store");
   unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
-  bool isAligned =
-      (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) ||
-      RI.canRealignStack(MF);
+  bool isAligned = (MF.getTarget()
+                        .getSubtargetImpl()
+                        ->getFrameLowering()
+                        ->getStackAlignment() >= Alignment) ||
+                   RI.canRealignStack(MF);
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
   DebugLoc DL = MBB.findDebugLoc(MI);
   addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
@@ -3324,9 +3428,11 @@
                                         const TargetRegisterInfo *TRI) const {
   const MachineFunction &MF = *MBB.getParent();
   unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
-  bool isAligned =
-      (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) ||
-      RI.canRealignStack(MF);
+  bool isAligned = (MF.getTarget()
+                        .getSubtargetImpl()
+                        ->getFrameLowering()
+                        ->getStackAlignment() >= Alignment) ||
+                   RI.canRealignStack(MF);
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
   DebugLoc DL = MBB.findDebugLoc(MI);
   addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
@@ -3868,10 +3974,10 @@
 /// operand at the use. We fold the load instructions if load defines a virtual
 /// register, the virtual register is used once in the same BB, and the
 /// instructions in-between do not load or store, and have no side effects.
-MachineInstr* X86InstrInfo::
-optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
-                  unsigned &FoldAsLoadDefReg,
-                  MachineInstr *&DefMI) const {
+MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
+                                              const MachineRegisterInfo *MRI,
+                                              unsigned &FoldAsLoadDefReg,
+                                              MachineInstr *&DefMI) const {
   if (FoldAsLoadDefReg == 0)
     return nullptr;
   // To be conservative, if there exists another load, clear the load candidate.
@@ -3887,55 +3993,35 @@
   if (!DefMI->isSafeToMove(this, nullptr, SawStore))
     return nullptr;
 
-  // We try to commute MI if possible.
-  unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
-  for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
-    // Collect information about virtual register operands of MI.
-    unsigned SrcOperandId = 0;
-    bool FoundSrcOperand = false;
-    for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = MI->getOperand(i);
-      if (!MO.isReg())
-        continue;
-      unsigned Reg = MO.getReg();
-      if (Reg != FoldAsLoadDefReg)
-        continue;
-      // Do not fold if we have a subreg use or a def or multiple uses.
-      if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
-        return nullptr;
-
-      SrcOperandId = i;
-      FoundSrcOperand = true;
-    }
-    if (!FoundSrcOperand) return nullptr;
-
-    // Check whether we can fold the def into SrcOperandId.
-    SmallVector<unsigned, 8> Ops;
-    Ops.push_back(SrcOperandId);
-    MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
-    if (FoldMI) {
-      FoldAsLoadDefReg = 0;
-      return FoldMI;
-    }
-
-    if (Idx == 1) {
-      // MI was changed but it didn't help, commute it back!
-      commuteInstruction(MI, false);
+  // Collect information about virtual register operands of MI.
+  unsigned SrcOperandId = 0;
+  bool FoundSrcOperand = false;
+  for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg != FoldAsLoadDefReg)
+      continue;
+    // Do not fold if we have a subreg use or a def or multiple uses.
+    if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
       return nullptr;
-    }
 
-    // Check whether we can commute MI and enable folding.
-    if (MI->isCommutable()) {
-      MachineInstr *NewMI = commuteInstruction(MI, false);
-      // Unable to commute.
-      if (!NewMI) return nullptr;
-      if (NewMI != MI) {
-        // New instruction. It doesn't need to be kept.
-        NewMI->eraseFromParent();
-        return nullptr;
-      }
-    }
+    SrcOperandId = i;
+    FoundSrcOperand = true;
   }
+  if (!FoundSrcOperand)
+    return nullptr;
+
+  // Check whether we can fold the def into SrcOperandId.
+  SmallVector<unsigned, 8> Ops;
+  Ops.push_back(SrcOperandId);
+  MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+  if (FoldMI) {
+    FoldAsLoadDefReg = 0;
+    return FoldMI;
+  }
+
   return nullptr;
 }
 
@@ -3961,6 +4047,28 @@
   return true;
 }
 
+// Expand LOAD_STACK_GUARD into two MOV64rm: load the GOT entry, then through
+// it. Only 64-bit MachO is implemented; other targets need another sequence.
+static void expandLoadStackGuard(MachineInstrBuilder &MIB,
+                                 const TargetInstrInfo &TII) {
+  MachineBasicBlock &MBB = *MIB->getParent();
+  DebugLoc DL = MIB->getDebugLoc();
+  unsigned Reg = MIB->getOperand(0).getReg(); // reused as first load's def
+  const GlobalValue *GV =
+      cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
+  unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+  MachineMemOperand *MMO = MBB.getParent()->
+      getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8);
+  MachineBasicBlock::iterator I = MIB.getInstr();
+
+  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
+      .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
+      .addMemOperand(MMO); // first load: RIP-relative GOTPCREL slot of GV
+  MIB->setDebugLoc(DL);
+  MIB->setDesc(TII.get(X86::MOV64rm)); // pseudo itself becomes the second load
+  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
 bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   bool HasAVX = Subtarget.hasAVX();
   MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
@@ -3995,6 +4103,9 @@
   case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
   case X86::KSET1B:
   case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
+  case TargetOpcode::LOAD_STACK_GUARD:
+    expandLoadStackGuard(MIB, *this);
+    return true;
   }
   return false;
 }
@@ -4070,7 +4181,8 @@
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                     MachineInstr *MI, unsigned i,
                                     const SmallVectorImpl<MachineOperand> &MOs,
-                                    unsigned Size, unsigned Align) const {
+                                    unsigned Size, unsigned Align,
+                                    bool AllowCommute) const {
   const DenseMap<unsigned,
                  std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   bool isCallRegIndirect = Subtarget.callRegIndirect();
@@ -4138,8 +4250,8 @@
           if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
             return nullptr;
           // If this is a 64-bit load, but the spill slot is 32, then we can do
-          // a 32-bit load which is implicitly zero-extended. This likely is due
-          // to liveintervalanalysis remat'ing a load from stack slot.
+          // a 32-bit load which is implicitly zero-extended. This likely is
+          // due to live interval analysis remat'ing a load from stack slot.
           if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
             return nullptr;
           Opcode = X86::MOV32rm;
@@ -4158,8 +4270,7 @@
         // to a 32-bit one.
         unsigned DstReg = NewMI->getOperand(0).getReg();
         if (TargetRegisterInfo::isPhysicalRegister(DstReg))
-          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
-                                                   X86::sub_32bit));
+          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
         else
           NewMI->getOperand(0).setSubReg(X86::sub_32bit);
       }
@@ -4167,6 +4278,65 @@
     }
   }
 
+  // If the instruction and target operand are commutable, commute the
+  // instruction and try again.
+  if (AllowCommute) {
+    unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
+    if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
+      bool HasDef = MI->getDesc().getNumDefs();
+      unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
+      unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
+      unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
+      bool Tied0 =
+          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+      bool Tied1 =
+          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
+
+      // If either of the commutable operands is tied to the destination,
+      // then we cannot commute + fold.
+      if ((HasDef && Reg0 == Reg1 && Tied0) ||
+          (HasDef && Reg0 == Reg2 && Tied1))
+        return nullptr;
+
+      if ((CommuteOpIdx1 == OriginalOpIdx) ||
+          (CommuteOpIdx2 == OriginalOpIdx)) {
+        MachineInstr *CommutedMI = commuteInstruction(MI, false);
+        if (!CommutedMI) {
+          // Unable to commute.
+          return nullptr;
+        }
+        if (CommutedMI != MI) {
+          // New instruction. We can't fold from this.
+          CommutedMI->eraseFromParent();
+          return nullptr;
+        }
+
+        // Attempt to fold with the commuted version of the instruction.
+        unsigned CommuteOp =
+            (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
+        NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
+                                      /*AllowCommute=*/false);
+        if (NewMI)
+          return NewMI;
+
+        // Folding failed again - undo the commute before returning.
+        MachineInstr *UncommutedMI = commuteInstruction(MI, false);
+        if (!UncommutedMI) {
+          // Unable to commute.
+          return nullptr;
+        }
+        if (UncommutedMI != MI) {
+          // New instruction. It doesn't need to be kept.
+          UncommutedMI->eraseFromParent();
+          return nullptr;
+        }
+
+        // Return here to prevent duplicate fuse failure report.
+        return nullptr;
+      }
+    }
+  }
+
   // No fusion
   if (PrintFailedFusing && !MI->isCopy())
     dbgs() << "We failed to fuse operand " << i << " in " << *MI;
@@ -4350,8 +4520,10 @@
   // If the function stack isn't realigned we don't want to fold instructions
   // that need increased alignment.
   if (!RI.needsStackRealignment(MF))
-    Alignment = std::min(
-        Alignment, MF.getTarget().getFrameLowering()->getStackAlignment());
+    Alignment = std::min(Alignment, MF.getTarget()
+                                        .getSubtargetImpl()
+                                        ->getFrameLowering()
+                                        ->getStackAlignment());
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
     unsigned RCSize = 0;
@@ -4374,7 +4546,27 @@
 
   SmallVector<MachineOperand,4> MOs;
   MOs.push_back(MachineOperand::CreateFI(FrameIndex));
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+                               Size, Alignment, /*AllowCommute=*/true);
+}
+
+static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
+                                  const MachineFunction &MF) {
+  unsigned Opc = LoadMI.getOpcode();
+  unsigned RegSize = // size in bytes of the load's destination register class
+      MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
+
+  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+    // These instructions only load 32 bits, so we can't fold them if the
+    // destination register is wider than 32 bits (4 bytes).
+    return true;
+
+  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+    // These instructions only load 64 bits, so we can't fold them if the
+    // destination register is wider than 64 bits (8 bytes).
+    return true;
+
+  return false; // every other opcode/size combination is not flagged
+}
 
 MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
@@ -4384,8 +4576,11 @@
   // If loading from a FrameIndex, fold directly from the FrameIndex.
   unsigned NumOps = LoadMI->getDesc().getNumOperands();
   int FrameIndex;
-  if (isLoadFromStackSlot(LoadMI, FrameIndex))
+  if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
+    if (isPartialRegisterLoad(*LoadMI, MF))
+      return nullptr;
     return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+  }
 
   // Check switch flag
   if (NoFusing) return nullptr;
@@ -4496,19 +4691,7 @@
     break;
   }
   default: {
-    if ((LoadMI->getOpcode() == X86::MOVSSrm ||
-         LoadMI->getOpcode() == X86::VMOVSSrm) &&
-        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
-          > 4)
-      // These instructions only load 32 bits, we can't fold them if the
-      // destination register is wider than 32 bits (4 bytes).
-      return nullptr;
-    if ((LoadMI->getOpcode() == X86::MOVSDrm ||
-         LoadMI->getOpcode() == X86::VMOVSDrm) &&
-        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
-          > 8)
-      // These instructions only load 64 bits, we can't fold them if the
-      // destination register is wider than 64 bits (8 bytes).
+    if (isPartialRegisterLoad(*LoadMI, MF))
       return nullptr;
 
     // Folding a normal load. Just copy the load's address operands.
@@ -4517,7 +4700,8 @@
     break;
   }
   }
-  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs,
+                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
 }
 
 
@@ -5299,16 +5483,32 @@
   NopInst.setOpcode(X86::NOOP);
 }
 
+// This code must remain in sync with getJumpInstrTableEntryBound in this class!
+// In particular, getJumpInstrTableEntryBound must always return an upper bound
+// on the encoding lengths of the instructions generated by
+// getUnconditionalBranch and getTrap.
 void X86InstrInfo::getUnconditionalBranch(
     MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
   Branch.setOpcode(X86::JMP_4);
   Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
 }
 
+// This code must remain in sync with getJumpInstrTableEntryBound in this class!
+// In particular, getJumpInstrTableEntryBound must always return an upper bound
+// on the encoding lengths of the instructions generated by
+// getUnconditionalBranch and getTrap.
 void X86InstrInfo::getTrap(MCInst &MI) const {
   MI.setOpcode(X86::TRAP);
 }
 
+// See getTrap and getUnconditionalBranch for conditions on the value returned
+// by this function.
+unsigned X86InstrInfo::getJumpInstrTableEntryBound() const {
+  // 5 bytes suffice: JMP_4 Symbol@PLT uses 1 byte (E9) for the JMP_4 and 4
+  // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B).
+  return 5;
+}
+
 bool X86InstrInfo::isHighLatencyDef(int opc) const {
   switch (opc) {
   default: return false;
@@ -5351,10 +5551,10 @@
   case X86::VSQRTSSm:
   case X86::VSQRTSSm_Int:
   case X86::VSQRTSSr:
-  case X86::VSQRTPDZrm:
-  case X86::VSQRTPDZrr:
-  case X86::VSQRTPSZrm:
-  case X86::VSQRTPSZrr:
+  case X86::VSQRTPDZm:
+  case X86::VSQRTPDZr:
+  case X86::VSQRTPSZm:
+  case X86::VSQRTPSZr:
   case X86::VSQRTSDZm:
   case X86::VSQRTSDZm_Int:
   case X86::VSQRTSDZr:
@@ -5426,7 +5626,7 @@
       MachineBasicBlock::iterator MBBI = FirstMBB.begin();
       DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
       MachineRegisterInfo &RegInfo = MF.getRegInfo();
-      const X86InstrInfo *TII = TM->getInstrInfo();
+      const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
 
       unsigned PC;
       if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
@@ -5524,7 +5724,7 @@
       const X86TargetMachine *TM =
           static_cast<const X86TargetMachine *>(&MF->getTarget());
       const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
-      const X86InstrInfo *TII = TM->getInstrInfo();
+      const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
 
       // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
       MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
@@ -5545,7 +5745,7 @@
       const X86TargetMachine *TM =
           static_cast<const X86TargetMachine *>(&MF->getTarget());
       const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
-      const X86InstrInfo *TII = TM->getInstrInfo();
+      const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
 
       // Create a virtual register for the TLS base address.
       MachineRegisterInfo &RegInfo = MF->getRegInfo();

diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index c177e3a..57b1958 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86INSTRUCTIONINFO_H
-#define X86INSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H
+#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
 
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "X86RegisterInfo.h"
@@ -404,7 +404,8 @@
                                       MachineInstr* MI,
                                       unsigned OpNum,
                                       const SmallVectorImpl<MachineOperand> &MOs,
-                                      unsigned Size, unsigned Alignment) const;
+                                      unsigned Size, unsigned Alignment,
+                                      bool AllowCommute) const;
 
   void
   getUnconditionalBranch(MCInst &Branch,
@@ -412,6 +413,8 @@
 
   void getTrap(MCInst &MI) const override;
 
+  unsigned getJumpInstrTableEntryBound() const override;
+
   bool isHighLatencyDef(int opc) const override;
 
   bool hasHighOperandLatency(const InstrItineraryData *ItinData,

diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index e7b532c..3dbf819 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td

@@ -551,11 +551,6 @@
   let RenderMethod = "addImmOperands";
 }
 
-class ImmZExtAsmOperandClass : AsmOperandClass {
-  let SuperClasses = [ImmAsmOperand];
-  let RenderMethod = "addImmOperands";
-}
-
 def X86GR32orGR64AsmOperand : AsmOperandClass {
   let Name = "GR32orGR64";
 }
@@ -568,6 +563,7 @@
   let PrintMethod = "printRoundingControl";
   let OperandType = "OPERAND_IMMEDIATE";
 }
+
 // Sign-extended immediate classes. We don't need to define the full lattice
 // here because there is no instruction with an ambiguity between ImmSExti64i32
 // and ImmSExti32i8.
@@ -595,12 +591,6 @@
   let Name = "ImmSExti32i8";
 }
 
-// [0, 0x000000FF]
-def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass {
-  let Name = "ImmZExtu32u8";
-}
-
-
 // [0, 0x0000007F]                                            |
 //   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
 def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
@@ -620,11 +610,6 @@
   let ParserMatchClass = ImmSExti32i8AsmOperand;
   let OperandType = "OPERAND_IMMEDIATE";
 }
-// 32-bits but only 8 bits are significant, and those 8 bits are unsigned.
-def u32u8imm  : Operand<i32> {
-  let ParserMatchClass = ImmZExtu32u8AsmOperand;
-  let OperandType = "OPERAND_IMMEDIATE";
-}
 
 // 64-bits but only 32 bits are significant.
 def i64i32imm  : Operand<i64> {
@@ -708,6 +693,7 @@
 def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
 def UseSSSE3     : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
 def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
+def NoSSE41      : Predicate<"!Subtarget->hasSSE41()">;
 def UseSSE41     : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
 def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
 def UseSSE42     : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
@@ -719,10 +705,16 @@
                      AssemblerPredicate<"FeatureAVX512", "AVX-512 ISA">;
 def UseAVX       : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
 def UseAVX2      : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
-def NoAVX512       : Predicate<"!Subtarget->hasAVX512()">;
+def NoAVX512     : Predicate<"!Subtarget->hasAVX512()">;
 def HasCDI       : Predicate<"Subtarget->hasCDI()">;
 def HasPFI       : Predicate<"Subtarget->hasPFI()">;
 def HasERI       : Predicate<"Subtarget->hasERI()">;
+def HasDQI       : Predicate<"Subtarget->hasDQI()">;
+def NoDQI        : Predicate<"!Subtarget->hasDQI()">;
+def HasBWI       : Predicate<"Subtarget->hasBWI()">;
+def HasVLX       : Predicate<"Subtarget->hasVLX()">,
+                     AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">;
+def NoVLX        : Predicate<"!Subtarget->hasVLX()">;
 
 def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
 def HasAES       : Predicate<"Subtarget->hasAES()">;
@@ -744,8 +736,10 @@
 def HasTSX       : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
 def HasADX       : Predicate<"Subtarget->hasADX()">;
 def HasSHA       : Predicate<"Subtarget->hasSHA()">;
+def HasSGX       : Predicate<"Subtarget->hasSGX()">;
 def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;
 def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
+def HasSMAP      : Predicate<"Subtarget->hasSMAP()">;
 def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
 def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
 def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
@@ -754,6 +748,8 @@
                              AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
 def In64BitMode  : Predicate<"Subtarget->is64Bit()">,
                              AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+def IsLP64  : Predicate<"Subtarget->isTarget64BitLP64()">;
+def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
 def In16BitMode  : Predicate<"Subtarget->is16Bit()">,
                              AssemblerPredicate<"Mode16Bit", "16-bit mode">;
 def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
@@ -2396,6 +2392,7 @@
 include "X86InstrSVM.td"
 
 include "X86InstrTSX.td"
+include "X86InstrSGX.td"
 
 // System instructions.
 include "X86InstrSystem.td"
@@ -2514,7 +2511,7 @@
 def : MnemonicAlias<"fnstcww",  "fnstcw",   "att">;
 def : MnemonicAlias<"fnstsww",  "fnstsw",   "att">;
 def : MnemonicAlias<"fucomip",  "fucompi",  "att">;
-def : MnemonicAlias<"fwait",    "wait",     "att">;
+def : MnemonicAlias<"fwait",    "wait">;
 
 
 class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,

diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index ecf80a1..9001fba 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td

@@ -38,12 +38,17 @@
 >;
 }
 
+let Sched = WriteVecLogic in
+def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins<
+  IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
 let Sched = WriteVecIMul in
 def MMX_PMUL_ITINS : OpndItins<
   IIC_MMX_PMUL, IIC_MMX_PMUL
 >;
 
-let Sched = WriteVecALU in {
+let Sched = WriteVecIMul in {
 def MMX_PSADBW_ITINS : OpndItins<
   IIC_MMX_PSADBW, IIC_MMX_PSADBW
 >;
@@ -167,12 +172,14 @@
   def R64irr  : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
       (ins VR64:$src1, VR64:$src2, i8imm:$src3),
       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 
-      [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>;
+      [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>,
+      Sched<[WriteShuffle]>;
   def R64irm  : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
       (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
       [(set VR64:$dst, (IntId VR64:$src1,
-                       (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>;
+                       (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
 multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
@@ -192,11 +199,11 @@
   def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
                   (ins DstRC:$src1, SrcRC:$src2), asm,
                   [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
-                  NoItinerary, d>;
+                  NoItinerary, d>, Sched<[WriteCvtI2F]>;
   def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
                   (ins DstRC:$src1, x86memop:$src2), asm,
                   [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
-                  NoItinerary, d>;
+                  NoItinerary, d>, Sched<[WriteCvtI2FLd]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -427,13 +434,13 @@
 
 // Logical Instructions
 defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
-                                  MMX_INTALU_ITINS, 1>;
+                                  MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
 defm MMX_POR  : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
-                                  MMX_INTALU_ITINS, 1>;
+                                  MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
 defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
-                                  MMX_INTALU_ITINS, 1>;
+                                  MMX_INTALU_ITINS_VECLOGICSCHED, 1>;
 defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
-                                  MMX_INTALU_ITINS>;
+                                  MMX_INTALU_ITINS_VECLOGICSCHED>;
 
 // Shift Instructions
 defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",

diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td
new file mode 100644
index 0000000..47c5dc5
--- /dev/null
+++ b/lib/Target/X86/X86InstrSGX.td

@@ -0,0 +1,24 @@
+//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel SGX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SGX instructions
+
+// ENCLS - Execute an Enclave System Function of Specified Leaf Number
+def ENCLS : I<0x01, MRM_CF, (outs), (ins),
+             "encls", []>, TB, Requires<[HasSGX]>;
+
+// ENCLU - Execute an Enclave User Function of Specified Leaf Number
+def ENCLU : I<0x01, MRM_D7, (outs), (ins),
+             "enclu", []>, TB, Requires<[HasSGX]>;

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f9a5ae1..cc896f0 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td

@@ -181,6 +181,7 @@
   IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
 >;
 
+let Sched = WriteVecIMul in
 def SSE_PMULLD_ITINS : OpndItins<
   IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
 >;
@@ -218,11 +219,21 @@
   IIC_ALU_NONMEM, IIC_ALU_MEM
 >;
 
+let Sched = WriteVarBlend in
+def DEFAULT_ITINS_VARBLENDSCHED :  OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
 let Sched = WriteFBlend in
 def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
   IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
 >;
 
+let Sched = WriteBlend in
+def SSE_INTALU_ITINS_BLEND_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 Instructions Classes
 //===----------------------------------------------------------------------===//
@@ -601,29 +612,6 @@
 
 // Patterns
 let Predicates = [UseAVX] in {
-  let AddedComplexity = 15 in {
-  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
-  // MOVS{S,D} to the lower bits.
-  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
-            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
-  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
-  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
-            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
-
-  // Move low f32 and clear high bits.
-  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSSrr (v4f32 (V_SET0)),
-                       (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSSrr (v4i32 (V_SET0)),
-                       (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
-  }
-
   let AddedComplexity = 20 in {
   // MOVSSrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
@@ -659,31 +647,10 @@
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   }
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
-            (SUBREG_TO_REG (i32 0),
-                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
-                           sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
-            (SUBREG_TO_REG (i64 0),
-                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
-                           sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                    (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
 
-  // Move low f64 and clear high bits.
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSDrr (v2f64 (V_SET0)),
-                       (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
-
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (VMOVSDrr (v2i64 (V_SET0)),
-                       (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
-
   // Extract and store.
   def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                    addr:$dst),
@@ -734,7 +701,6 @@
                         (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
               sub_xmm)>;
 
-
   // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
   // is during lowering, where it's not possible to recognize the fold cause
   // it has two uses through a bitcast. One use disappears at isel time and the
@@ -750,7 +716,7 @@
 }
 
 let Predicates = [UseSSE1] in {
-  let AddedComplexity = 15 in {
+  let Predicates = [NoSSE41], AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
@@ -784,7 +750,7 @@
 }
 
 let Predicates = [UseSSE2] in {
-  let AddedComplexity = 15 in {
+  let Predicates = [NoSSE41], AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSD to the lower bits.
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
@@ -854,6 +820,7 @@
            Sched<[WriteLoad]>;
 }
 
+let Predicates = [HasAVX, NoVLX] in {
 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                               "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                               PS, VEX;
@@ -879,20 +846,26 @@
 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                               "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                               PD, VEX, VEX_L;
+}
+
+let Predicates = [UseSSE1] in {
 defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                               "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                               PS;
-defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
-                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
-                              PD;
 defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                               "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                               PS;
+}
+let Predicates = [UseSSE2] in {
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              PD;
 defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                               "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                               PD;
+}
 
-let SchedRW = [WriteStore] in {
+let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX]  in {
 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movaps\t{$src, $dst|$dst, $src}",
                    [(alignedstore (v4f32 VR128:$src), addr:$dst)],
@@ -1006,7 +979,7 @@
 
 // For disassembler
 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
-    SchedRW = [WriteMove] in {
+    SchedRW = [WriteFShuffle] in {
   def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>;
@@ -1036,7 +1009,7 @@
             (MOVUPDmr addr:$dst, VR128:$src)>;
 
 // Use vmovaps/vmovups for AVX integer load/store.
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
   // 128-bit load/store
   def : Pat<(alignedloadv2i64 addr:$src),
             (VMOVAPSrm addr:$src)>;
@@ -1251,6 +1224,9 @@
             (VMOVLPDrm VR128:$src1, addr:$src2)>;
   def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
             (VMOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1,
+                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
 
   // Store patterns
   def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
@@ -1298,6 +1274,9 @@
             (MOVLPDrm VR128:$src1, addr:$src2)>;
   def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
             (MOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1,
+                             (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
 
   // Store patterns
   def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
@@ -1360,6 +1339,11 @@
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                       (scalar_to_vector (loadf64 addr:$src2)))),
             (VMOVHPDrm VR128:$src1, addr:$src2)>;
+  // Also handle an i64 load because that may get selected as a faster way to
+  // load the data.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+            (VMOVHPDrm VR128:$src1, addr:$src2)>;
 }
 
 let Predicates = [UseSSE1] in {
@@ -1380,6 +1364,11 @@
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                       (scalar_to_vector (loadf64 addr:$src2)))),
             (MOVHPDrm VR128:$src1, addr:$src2)>;
+  // Also handle an i64 load because that may get selected as a faster way to
+  // load the data.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
+            (MOVHPDrm VR128:$src1, addr:$src2)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2577,18 +2566,17 @@
 /// sse12_shuffle - sse 1 & 2 fp shuffle instructions
 multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                          ValueType vt, string asm, PatFrag mem_frag,
-                         Domain d, bit IsConvertibleToThreeAddress = 0> {
+                         Domain d> {
   def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
                    [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                        (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
             Sched<[WriteFShuffleLd, ReadAfterLd]>;
-  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
-    def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
-                   (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
-                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
-                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
-              Sched<[WriteFShuffle]>;
+  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                 (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
+                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                     (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+            Sched<[WriteFShuffle]>;
 }
 
 defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2607,10 +2595,10 @@
 let Constraints = "$src1 = $dst" in {
   defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                     "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS;
+                    memopv4f32, SSEPackedSingle>, PS;
   defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                     "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                    memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD;
+                    memopv2f64, SSEPackedDouble>, PD;
 }
 
 let Predicates = [HasAVX] in {
@@ -3136,7 +3124,6 @@
 
 let Predicates = [UseSSE2] in {
   // SSE2 patterns to select scalar double-precision fp arithmetic instructions
-
   def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
                       (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                       FR64:$src))))),
@@ -3156,10 +3143,10 @@
 }
 
 let Predicates = [UseSSE41] in {
-  // If the subtarget has SSE4.1 but not AVX, the vector insert
-  // instruction is lowered into a X86insertps rather than a X86Movss.
-  // When selecting SSE scalar single-precision fp arithmetic instructions,
-  // make sure that we correctly match the X86insertps.
+  // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is
+  // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When
+  // selecting SSE scalar single-precision fp arithmetic instructions, make
+  // sure that we correctly match them.
 
   def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                   (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
@@ -3177,6 +3164,57 @@
                   (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                     FR32:$src))), (iPTR 0))),
             (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))), (i8 1))),
+            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))), (i8 1))),
+            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))), (i8 1))),
+            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))), (i8 1))),
+            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (i8 1))),
+            (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (i8 1))),
+            (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (i8 1))),
+            (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (i8 1))),
+            (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+            (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+            (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+            (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+            (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
 }
 
 let Predicates = [HasAVX] in {
@@ -3215,6 +3253,57 @@
                  (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                        FR32:$src))), (iPTR 0))),
             (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))), (i8 1))),
+            (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))), (i8 1))),
+            (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))), (i8 1))),
+            (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
+                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+                      FR32:$src))), (i8 1))),
+            (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (i8 1))),
+            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (i8 1))),
+            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (i8 1))),
+            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (i8 1))),
+            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
+                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+                      FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
 }
 
 // Patterns used to select SSE scalar fp arithmetic instructions from
@@ -3269,6 +3358,49 @@
             (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
 }
 
+let Predicates = [UseSSE41] in {
+  // With SSE4.1 we may see these operations using X86Blendi rather than
+  // X86Movs{s,d}.
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 
+                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 
+                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+
+  def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+                              (v2f64 VR128:$dst), (i8 2))),
+            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+                   (v2f64 VR128:$dst), (i8 2))),
+            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+                   (v2f64 VR128:$dst), (i8 2))),
+            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+                   (v2f64 VR128:$dst), (i8 2))),
+            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
+
 let Predicates = [HasAVX] in {
   // The following patterns select AVX Scalar single/double precision fp
   // arithmetic instructions from a packed single precision fp instruction
@@ -3298,6 +3430,46 @@
   def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                    (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
             (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+
+  // Also handle X86Blendi-based patterns.
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+            (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 
+                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+            (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+            (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), 
+                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+            (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+
+  def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+                              (v2f64 VR128:$dst), (i8 2))),
+            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+                   (v2f64 VR128:$dst), (i8 2))),
+            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+                   (v2f64 VR128:$dst), (i8 2))),
+            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+  def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+                   (v2f64 VR128:$dst), (i8 2))),
+            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
 }
 
 /// Unop Arithmetic
@@ -3326,6 +3498,16 @@
 >;
 }
 
+let Sched = WriteFRsqrt in {
+def SSE_RSQRTPS : OpndItins<
+  IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
+>;
+
+def SSE_RSQRTSS : OpndItins<
+  IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
+>;
+}
+
 let Sched = WriteFRcp in {
 def SSE_RCPP : OpndItins<
   IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
@@ -3604,10 +3786,10 @@
 
 // Reciprocal approximations. Note that these typically require refinement
 // in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
-             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
+             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
              sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
-                                int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
+                                int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
 defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
              sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
              sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
@@ -3686,6 +3868,7 @@
 
 let AddedComplexity = 400 in { // Prefer non-temporal versions
 let SchedRW = [WriteStore] in {
+let Predicates = [HasAVX, NoVLX] in {
 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                      (ins f128mem:$dst, VR128:$src),
                      "movntps\t{$src, $dst|$dst, $src}",
@@ -3726,6 +3909,7 @@
                     [(alignednontemporalstore (v4i64 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+}
 
 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
@@ -3755,6 +3939,14 @@
                   PS, Requires<[HasSSE2]>;
 } // SchedRW = [WriteStore]
 
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+            (VMOVNTPSmr addr:$dst, VR128:$src)>;
+}
+
+def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+          (MOVNTPSmr addr:$dst, VR128:$src)>;
+
 } // AddedComplexity
 
 //===----------------------------------------------------------------------===//
@@ -5277,6 +5469,13 @@
             (VMOVDDUPYrr VR256:$src)>;
 }
 
+let Predicates = [UseAVX, OptForSize] in {
+  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+  (VMOVDDUPrm addr:$src)>;
+  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+  (VMOVDDUPrm addr:$src)>;
+}
+
 let Predicates = [UseSSE3] in {
   def : Pat<(X86Movddup (memopv2f64 addr:$src)),
             (MOVDDUPrm addr:$src)>;
@@ -5357,56 +5556,34 @@
 
 // Patterns used to select 'addsub' instructions.
 let Predicates = [HasAVX] in {
-  // Constant 170 corresponds to the binary mask '10101010'.
-  // When used as a blend mask, it allows selecting eight elements from two
-  // input vectors as follow:
-  // - Even-numbered values in the destination are copied from
-  //   the corresponding elements in the first input vector;
-  // - Odd-numbered values in the destination are copied from
-  //   the corresponding elements in the second input vector.
-
-  def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)),
-                              (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))),
-            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
-
-  // Constant 10 corresponds to the binary mask '1010'.
-  // In the two pattens below, constant 10 is used as a blend mask to select
-  // - the 1st and 3rd element from the first input vector (the 'fsub' node);
-  // - the 2nd and 4th element from the second input vector (the 'fadd' node).
-
-  def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
-                             (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
-            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
-  def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
-                              (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
-            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
-  def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
-                              (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
             (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
-  def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
-                              (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), 
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))),
+            (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
             (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
-                             (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
-            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))),
+            (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
+
+  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
+            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
+  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))),
+            (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
+  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
+            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))),
+            (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
 }
 
 let Predicates = [UseSSE3] in {
-  // Constant 10 corresponds to the binary mask '1010'.
-  // In the pattern below, it is used as a blend mask to select:
-  // - the 1st and 3rd element from the first input vector (the fsub node);
-  // - the 2nd and 4th element from the second input vector (the fadd node).
-
-  def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
-                              (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
             (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
-
-  def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
-                              (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), 
+  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))),
+            (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
             (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
-  def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
-                             (v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
-            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
+  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))),
+            (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -6692,7 +6869,7 @@
 multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
                            OpndItins itins = DEFAULT_ITINS> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
+      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -6701,7 +6878,7 @@
         (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
       Sched<[WriteFShuffle]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
-      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
+      (ins VR128:$src1, f32mem:$src2, i8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
@@ -7308,7 +7485,7 @@
 
 let Predicates = [HasAVX] in {
   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
-                                 memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
+                                 memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
                                  VEX_4V;
   defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                  memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
@@ -7316,7 +7493,7 @@
 }
 let Predicates = [HasAVX2] in {
   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
-                                  memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
+                                  memopv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
                                   VEX_4V, VEX_L;
   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                   memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
@@ -7337,7 +7514,7 @@
                  OpndItins itins = DEFAULT_ITINS> {
   let isCommutable = 1 in
   def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
-        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
+        (ins RC:$src1, RC:$src2, i8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -7346,7 +7523,7 @@
         [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
         Sched<[itins.Sched]>;
   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
-        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
+        (ins RC:$src1, x86memop:$src2, i8imm:$src3),
         !if(Is2Addr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -7360,31 +7537,33 @@
 
 let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
-    let ExeDomain = SSEPackedSingle in {
-    defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
-                                        VR128, loadv4f32, f128mem, 0,
-                                        DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
-    defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
-                                    int_x86_avx_blend_ps_256, VR256, loadv8f32,
-                                    f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
-                                    VEX_4V, VEX_L;
-    }
-    let ExeDomain = SSEPackedDouble in {
-    defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
-                                        VR128, loadv2f64, f128mem, 0,
-                                        DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
-    defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
-                                     int_x86_avx_blend_pd_256,VR256, loadv4f64,
-                                     f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
-                                     VEX_4V, VEX_L;
-    }
+    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
+                                        VR128, loadv2i64, i128mem, 0,
+                                        DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
+  }
+
+  let ExeDomain = SSEPackedSingle in {
+  defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
+                                      VR128, loadv4f32, f128mem, 0,
+                                      DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+  defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
+                                  int_x86_avx_blend_ps_256, VR256, loadv8f32,
+                                  f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
+                                  VEX_4V, VEX_L;
+  }
+  let ExeDomain = SSEPackedDouble in {
+  defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
+                                      VR128, loadv2f64, f128mem, 0,
+                                      DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+  defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
+                                   int_x86_avx_blend_pd_256,VR256, loadv4f64,
+                                   f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
+                                   VEX_4V, VEX_L;
+  }
   defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
                                       VR128, loadv2i64, i128mem, 0,
                                       DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
-  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
-                                      VR128, loadv2i64, i128mem, 0,
-                                      DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
-  }
+
   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                    VR128, loadv4f32, f128mem, 0,
@@ -7412,6 +7591,10 @@
 
 let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in {
+  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
+                                     VR128, memopv2i64, i128mem,
+                                     1, SSE_MPSADBW_ITINS>;
+  }
   let ExeDomain = SSEPackedSingle in
   defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
                                      VR128, memopv4f32, f128mem,
@@ -7422,11 +7605,7 @@
                                      1, SSE_INTALU_ITINS_FBLEND_P>;
   defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
                                      VR128, memopv2i64, i128mem,
-                                     1, SSE_INTALU_ITINS_FBLEND_P>;
-  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
-                                     VR128, memopv2i64, i128mem,
-                                     1, SSE_MPSADBW_ITINS>;
-  }
+                                     1, SSE_INTALU_ITINS_BLEND_P>;
   let ExeDomain = SSEPackedSingle in
   defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                   VR128, memopv4f32, f128mem, 1,
@@ -7545,6 +7724,57 @@
             (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
 }
 
+// Patterns for zero-extending low-element moves (X86vzmovl).
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (VBLENDPSrri (v4i32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+  }
+
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0),
+                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
+                           sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0),
+                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
+                           sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
+
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+}
+
+let Predicates = [UseSSE41] in {
+  // With SSE41 we can use blends for these patterns.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+            (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>;
+}
+
+
 /// SS41I_ternary_int - SSE 4.1 ternary operator
 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
   multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
@@ -7555,7 +7785,7 @@
                     !strconcat(OpcodeStr,
                      "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
-                    itins.rr>;
+                    itins.rr>, Sched<[itins.Sched]>;
 
     def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, x86memop:$src2),
@@ -7564,18 +7794,21 @@
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
                        (bitconvert (mem_frag addr:$src2)), XMM0))],
-                       itins.rm>;
+                       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
   }
 }
 
 let ExeDomain = SSEPackedDouble in
 defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
-                                  int_x86_sse41_blendvpd>;
+                                  int_x86_sse41_blendvpd,
+                                  DEFAULT_ITINS_FBLENDSCHED>;
 let ExeDomain = SSEPackedSingle in
 defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
-                                  int_x86_sse41_blendvps>;
+                                  int_x86_sse41_blendvps,
+                                  DEFAULT_ITINS_FBLENDSCHED>;
 defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
-                                  int_x86_sse41_pblendvb>;
+                                  int_x86_sse41_pblendvb,
+                                  DEFAULT_ITINS_VARBLENDSCHED>;
 
 // Aliases with the implicit xmm0 argument
 def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
@@ -8393,13 +8626,13 @@
   def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
              (ins RC:$src1, i8imm:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-             [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX,
+             [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
              Sched<[WriteFShuffle]>;
   def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
              (ins x86memop_f:$src1, i8imm:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set RC:$dst,
-               (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX,
+               (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX,
              Sched<[WriteFShuffleLd]>;
 }
 
@@ -8417,19 +8650,37 @@
 }
 
 let Predicates = [HasAVX] in {
-def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
+          (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+          (VPERMILPSYrm VR256:$src1, addr:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
+          (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
+          (VPERMILPDYrm VR256:$src1, addr:$src2)>;
+
+def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
           (VPERMILPSYri VR256:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
+def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
           (VPERMILPDYri VR256:$src1, imm:$imm)>;
-def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)),
+def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
                                (i8 imm:$imm))),
           (VPERMILPSYmi addr:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))),
+def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
           (VPERMILPDYmi addr:$src1, imm:$imm)>;
 
-def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
+          (VPERMILPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
+          (VPERMILPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
+          (VPERMILPDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
+          (VPERMILPDrm VR128:$src1, addr:$src2)>;
+
+def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
           (VPERMILPDri VR128:$src1, imm:$imm)>;
-def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))),
+def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
           (VPERMILPDmi addr:$src1, imm:$imm)>;
 }
 
@@ -8540,15 +8791,15 @@
 
 // Patterns for  matching conversions from float to half-float and vice versa.
 let Predicates = [HasF16C] in {
-  def : Pat<(f32_to_f16 FR32:$src),
+  def : Pat<(fp_to_f16 FR32:$src),
             (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
               (COPY_TO_REGCLASS FR32:$src, VR128), 0)), sub_16bit))>;
 
-  def : Pat<(f16_to_f32 GR16:$src),
+  def : Pat<(f16_to_fp GR16:$src),
             (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
               (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;
 
-  def : Pat<(f16_to_f32 (i16 (f32_to_f16 FR32:$src))),
+  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
             (f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
               (VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 0)), FR32)) >;
 }
@@ -8563,13 +8814,13 @@
                  X86MemOperand x86memop> {
   let isCommutable = 1 in
   def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
-        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
+        (ins RC:$src1, RC:$src2, i8imm:$src3),
         !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
         [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
         Sched<[WriteBlend]>, VEX_4V;
   def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
-        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
+        (ins RC:$src1, x86memop:$src2, i8imm:$src3),
         !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
         [(set RC:$dst,
@@ -8578,12 +8829,10 @@
         Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
 }
 
-let isCommutable = 0 in {
 defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
                                    VR128, loadv2i64, i128mem>;
 defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
                                     VR256, loadv4i64, i256mem>, VEX_L;
-}
 
 def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
                   imm:$mask)),
@@ -8675,6 +8924,27 @@
   def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
           (VBROADCASTSDYrr VR128:$src)>;
 
+  // Provide aliases for broadcast from the same register class that
+  // automatically do the extract.
+  def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
+            (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))),
+            (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))),
+            (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))),
+            (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
+            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
+                                                    sub_xmm)))>;
+  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
+            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
+                                                    sub_xmm)))>;
+
   // Provide fallback in case the load node that is used in the patterns above
   // is used by additional users, which prevents the pattern selection.
   let AddedComplexity = 20 in {
@@ -8756,6 +9026,9 @@
               (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
               (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
   }
+
+  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
+            (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -8763,14 +9036,14 @@
 //
 
 multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                     ValueType OpVT> {
+                     ValueType OpVT, X86FoldableSchedWrite Sched> {
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2),
                    !strconcat(OpcodeStr,
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR256:$dst,
                      (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
-                   Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
+                   Sched<[Sched]>, VEX_4V, VEX_L;
   def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins VR256:$src1, i256mem:$src2),
                    !strconcat(OpcodeStr,
@@ -8778,22 +9051,22 @@
                    [(set VR256:$dst,
                      (OpVT (X86VPermv VR256:$src1,
                             (bitconvert (mem_frag addr:$src2)))))]>,
-                   Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
+                   Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
 }
 
-defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>;
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
 
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                         ValueType OpVT> {
+                         ValueType OpVT, X86FoldableSchedWrite Sched> {
   def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                      (ins VR256:$src1, i8imm:$src2),
                      !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
-                     Sched<[WriteShuffle256]>, VEX, VEX_L;
+                     Sched<[Sched]>, VEX, VEX_L;
   def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                      (ins i256mem:$src1, i8imm:$src2),
                      !strconcat(OpcodeStr,
@@ -8801,12 +9074,14 @@
                      [(set VR256:$dst,
                        (OpVT (X86VPermi (mem_frag addr:$src1),
                               (i8 imm:$src2))))]>,
-                     Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L;
+                     Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
 }
 
-defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W;
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
+                            WriteShuffle256>, VEX_W;
 let ExeDomain = SSEPackedDouble in
-defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W;
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
+                             WriteFShuffle256>, VEX_W;
 
 //===----------------------------------------------------------------------===//
 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks

diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 5402780..8cabdd0 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td

@@ -462,11 +462,7 @@
                 "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
 
 let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in
-  def CPUID32 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB,
-  Requires<[Not64BitMode]>;
-let Defs = [RAX, RBX, RCX, RDX], Uses = [RAX, RCX] in
-  def CPUID64 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB,
-  Requires<[In64BitMode]>;
+  def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
 } // SchedRW
 
 //===----------------------------------------------------------------------===//
@@ -479,10 +475,10 @@
 //===----------------------------------------------------------------------===//
 // XSAVE instructions
 let SchedRW = [WriteSystem] in {
-let Defs = [RDX, RAX], Uses = [RCX] in
+let Defs = [EDX, EAX], Uses = [ECX] in
   def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
 
-let Uses = [RDX, RAX, RCX] in
+let Uses = [EDX, EAX, ECX] in
   def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB;
 
 let Uses = [RDX, RAX] in {
@@ -563,7 +559,7 @@
 
 //===----------------------------------------------------------------------===//
 // SMAP Instruction
-let Defs = [EFLAGS], Uses = [EFLAGS] in {
+let Predicates = [HasSMAP], Defs = [EFLAGS] in {
   def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
   def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
 }

diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
new file mode 100644
index 0000000..d252f72
--- /dev/null
+++ b/lib/Target/X86/X86IntrinsicsInfo.h

@@ -0,0 +1,320 @@
+//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the tables that describe how X86 intrinsics are lowered.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
+
+namespace llvm {
+
+enum IntrinsicType {
+  INTR_NO_TYPE,
+  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
+  INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
+  CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, 
+  INTR_TYPE_1OP_MASK_RM
+};
+
+struct IntrinsicData {
+
+  unsigned      Id;
+  IntrinsicType Type;
+  unsigned      Opc0;
+  unsigned      Opc1;
+
+  bool operator<(const IntrinsicData &RHS) const {
+    return Id < RHS.Id;
+  }
+  bool operator==(const IntrinsicData &RHS) const {
+    return RHS.Id == Id;
+  }
+};
+
+#define X86_INTRINSIC_DATA(id, type, op0, op1) \
+  { Intrinsic::x86_##id, type, op0, op1 }
+
+/*
+ * IntrinsicsWithChain - the table must be sorted by Intrinsic ID (i.e. in
+ * alphabetical order of the intrinsic names); lookups use binary search.
+ */
+static const IntrinsicData IntrinsicsWithChain[] = {
+  X86_INTRINSIC_DATA(addcarry_u32,  ADX, X86ISD::ADC, 0),
+  X86_INTRINSIC_DATA(addcarry_u64,  ADX, X86ISD::ADC, 0),
+  X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
+  X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
+  
+  X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
+  X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
+  
+  X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
+                     X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
+  X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH,
+                     X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm),
+  X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH,
+                     X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
+  X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
+                     X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
+  
+  X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
+  X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
+  
+  X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH,
+                     X86::VSCATTERPF0DPDm, X86::VSCATTERPF1DPDm),
+  X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH,
+                     X86::VSCATTERPF0DPSm, X86::VSCATTERPF1DPSm),
+  X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH,
+                     X86::VSCATTERPF0QPDm, X86::VSCATTERPF1QPDm),
+  X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH,
+                     X86::VSCATTERPF0QPSm, X86::VSCATTERPF1QPSm),
+  
+  X86_INTRINSIC_DATA(rdpmc,     RDPMC,  X86ISD::RDPMC_DAG, 0),
+  X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
+  X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
+  X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
+  X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0),
+  X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0),
+  X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0),
+  X86_INTRINSIC_DATA(rdtsc,     RDTSC,  X86ISD::RDTSC_DAG, 0),
+  X86_INTRINSIC_DATA(rdtscp,    RDTSC,  X86ISD::RDTSCP_DAG, 0),
+  
+  X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0),
+  X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0),
+  X86_INTRINSIC_DATA(xtest,     XTEST,  X86ISD::XTEST,  0),
+};
+
+/*
+ * Look up IntNo in IntrinsicsWithChain; return nullptr if it is not found.
+ */
+static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) {
+
+  IntrinsicData IntrinsicToFind = {IntNo, INTR_NO_TYPE, 0, 0 };
+  const IntrinsicData *Data =  std::lower_bound(std::begin(IntrinsicsWithChain),
+                                                std::end(IntrinsicsWithChain),
+                                                IntrinsicToFind);
+  if (Data != std::end(IntrinsicsWithChain) && *Data == IntrinsicToFind)
+    return Data;
+  return nullptr;
+}
+
+/*
+ * IntrinsicsWithoutChain - the table must be sorted by Intrinsic ID (i.e. in
+ * alphabetical order of the intrinsic names); lookups use binary search.
+ */
+static const IntrinsicData  IntrinsicsWithoutChain[] = {
+  X86_INTRINSIC_DATA(avx2_phadd_d,      INTR_TYPE_2OP, X86ISD::HADD, 0),
+  X86_INTRINSIC_DATA(avx2_phadd_w,      INTR_TYPE_2OP, X86ISD::HADD, 0),
+  X86_INTRINSIC_DATA(avx2_phsub_d,      INTR_TYPE_2OP, X86ISD::HSUB, 0),
+  X86_INTRINSIC_DATA(avx2_phsub_w,      INTR_TYPE_2OP, X86ISD::HSUB, 0),
+  X86_INTRINSIC_DATA(avx2_pmaxs_b,      INTR_TYPE_2OP, X86ISD::SMAX, 0),
+  X86_INTRINSIC_DATA(avx2_pmaxs_d,      INTR_TYPE_2OP, X86ISD::SMAX, 0),
+  X86_INTRINSIC_DATA(avx2_pmaxs_w,      INTR_TYPE_2OP, X86ISD::SMAX, 0),
+  X86_INTRINSIC_DATA(avx2_pmaxu_b,      INTR_TYPE_2OP, X86ISD::UMAX, 0),
+  X86_INTRINSIC_DATA(avx2_pmaxu_d,      INTR_TYPE_2OP, X86ISD::UMAX, 0),
+  X86_INTRINSIC_DATA(avx2_pmaxu_w,      INTR_TYPE_2OP, X86ISD::UMAX, 0),
+  X86_INTRINSIC_DATA(avx2_pmins_b,      INTR_TYPE_2OP, X86ISD::SMIN, 0),
+  X86_INTRINSIC_DATA(avx2_pmins_d,      INTR_TYPE_2OP, X86ISD::SMIN, 0),
+  X86_INTRINSIC_DATA(avx2_pmins_w,      INTR_TYPE_2OP, X86ISD::SMIN, 0),
+  X86_INTRINSIC_DATA(avx2_pminu_b,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
+  X86_INTRINSIC_DATA(avx2_pminu_d,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
+  X86_INTRINSIC_DATA(avx2_pminu_w,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
+  X86_INTRINSIC_DATA(avx2_psll_d,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx2_psll_q,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx2_psll_w,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(avx2_pslli_d,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx2_pslli_q,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx2_pslli_w,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx2_psra_d,       INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx2_psra_w,       INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(avx2_psrai_d,      VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx2_psrai_w,      VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx2_psrl_d,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrl_q,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrl_w,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(avx2_psrli_d,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx2_psrli_q,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx2_psrli_w,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx2_psubus_b,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx2_psubus_w,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(avx2_vperm2i128,   INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(avx512_exp2_pd,    INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0),
+  X86_INTRINSIC_DATA(avx512_exp2_ps,    INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_b_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_b_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_b_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_d_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_d_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_d_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_q_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_q_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_q_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_w_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_w_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_cmp_w_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_128,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_256,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_512,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_128,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_256,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_512,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_128,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_256,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_512,  CMP_MASK,  X86ISD::PCMPEQM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_128,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_256,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_512,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_128,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_256,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_512,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_128,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_256,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_512,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512,  CMP_MASK,  X86ISD::PCMPGTM, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pslli_d,       VSHIFT_MASK, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pslli_q,       VSHIFT_MASK, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrai_d,       VSHIFT_MASK, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrai_q,       VSHIFT_MASK, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrli_d,       VSHIFT_MASK, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_psrli_q,       VSHIFT_MASK, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_pd,   INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_ps,   INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
+  X86_INTRINSIC_DATA(avx_hadd_pd_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
+  X86_INTRINSIC_DATA(avx_hadd_ps_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
+  X86_INTRINSIC_DATA(avx_hsub_pd_256,   INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(avx_hsub_ps_256,   INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(avx_sqrt_pd_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx_sqrt_ps_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(sse2_comieq_sd,    COMI, X86ISD::COMI, ISD::SETEQ),
+  X86_INTRINSIC_DATA(sse2_comige_sd,    COMI, X86ISD::COMI, ISD::SETGE),
+  X86_INTRINSIC_DATA(sse2_comigt_sd,    COMI, X86ISD::COMI, ISD::SETGT),
+  X86_INTRINSIC_DATA(sse2_comile_sd,    COMI, X86ISD::COMI, ISD::SETLE),
+  X86_INTRINSIC_DATA(sse2_comilt_sd,    COMI, X86ISD::COMI, ISD::SETLT),
+  X86_INTRINSIC_DATA(sse2_comineq_sd,   COMI, X86ISD::COMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse2_pmaxs_w,      INTR_TYPE_2OP, X86ISD::SMAX, 0),
+  X86_INTRINSIC_DATA(sse2_pmaxu_b,      INTR_TYPE_2OP, X86ISD::UMAX, 0),
+  X86_INTRINSIC_DATA(sse2_pmins_w,      INTR_TYPE_2OP, X86ISD::SMIN, 0),
+  X86_INTRINSIC_DATA(sse2_pminu_b,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
+  X86_INTRINSIC_DATA(sse2_psll_d,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(sse2_psll_q,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(sse2_psll_w,       INTR_TYPE_2OP, X86ISD::VSHL, 0),
+  X86_INTRINSIC_DATA(sse2_pslli_d,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(sse2_pslli_q,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(sse2_pslli_w,      VSHIFT, X86ISD::VSHLI, 0),
+  X86_INTRINSIC_DATA(sse2_psra_d,       INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(sse2_psra_w,       INTR_TYPE_2OP, X86ISD::VSRA, 0),
+  X86_INTRINSIC_DATA(sse2_psrai_d,      VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(sse2_psrai_w,      VSHIFT, X86ISD::VSRAI, 0),
+  X86_INTRINSIC_DATA(sse2_psrl_d,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(sse2_psrl_q,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(sse2_psrl_w,       INTR_TYPE_2OP, X86ISD::VSRL, 0),
+  X86_INTRINSIC_DATA(sse2_psrli_d,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(sse2_psrli_q,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(sse2_psrli_w,      VSHIFT, X86ISD::VSRLI, 0),
+  X86_INTRINSIC_DATA(sse2_psubus_b,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(sse2_psubus_w,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
+  X86_INTRINSIC_DATA(sse2_sqrt_pd,      INTR_TYPE_1OP, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(sse2_ucomieq_sd,   COMI, X86ISD::UCOMI, ISD::SETEQ),
+  X86_INTRINSIC_DATA(sse2_ucomige_sd,   COMI, X86ISD::UCOMI, ISD::SETGE),
+  X86_INTRINSIC_DATA(sse2_ucomigt_sd,   COMI, X86ISD::UCOMI, ISD::SETGT),
+  X86_INTRINSIC_DATA(sse2_ucomile_sd,   COMI, X86ISD::UCOMI, ISD::SETLE),
+  X86_INTRINSIC_DATA(sse2_ucomilt_sd,   COMI, X86ISD::UCOMI, ISD::SETLT),
+  X86_INTRINSIC_DATA(sse2_ucomineq_sd,  COMI, X86ISD::UCOMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse3_hadd_pd,      INTR_TYPE_2OP, X86ISD::FHADD, 0),
+  X86_INTRINSIC_DATA(sse3_hadd_ps,      INTR_TYPE_2OP, X86ISD::FHADD, 0),
+  X86_INTRINSIC_DATA(sse3_hsub_pd,      INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(sse3_hsub_ps,      INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(sse41_insertps,    INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
+  X86_INTRINSIC_DATA(sse41_pmaxsb,      INTR_TYPE_2OP, X86ISD::SMAX, 0),
+  X86_INTRINSIC_DATA(sse41_pmaxsd,      INTR_TYPE_2OP, X86ISD::SMAX, 0),
+  X86_INTRINSIC_DATA(sse41_pmaxud,      INTR_TYPE_2OP, X86ISD::UMAX, 0),
+  X86_INTRINSIC_DATA(sse41_pmaxuw,      INTR_TYPE_2OP, X86ISD::UMAX, 0),
+  X86_INTRINSIC_DATA(sse41_pminsb,      INTR_TYPE_2OP, X86ISD::SMIN, 0),
+  X86_INTRINSIC_DATA(sse41_pminsd,      INTR_TYPE_2OP, X86ISD::SMIN, 0),
+  X86_INTRINSIC_DATA(sse41_pminud,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
+  X86_INTRINSIC_DATA(sse41_pminuw,      INTR_TYPE_2OP, X86ISD::UMIN, 0),
+  X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
+  X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
+  X86_INTRINSIC_DATA(sse_comigt_ss,     COMI, X86ISD::COMI, ISD::SETGT),
+  X86_INTRINSIC_DATA(sse_comile_ss,     COMI, X86ISD::COMI, ISD::SETLE),
+  X86_INTRINSIC_DATA(sse_comilt_ss,     COMI, X86ISD::COMI, ISD::SETLT),
+  X86_INTRINSIC_DATA(sse_comineq_ss,    COMI, X86ISD::COMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse_sqrt_ps,       INTR_TYPE_1OP, ISD::FSQRT, 0),
+  X86_INTRINSIC_DATA(sse_ucomieq_ss,    COMI, X86ISD::UCOMI, ISD::SETEQ),
+  X86_INTRINSIC_DATA(sse_ucomige_ss,    COMI, X86ISD::UCOMI, ISD::SETGE),
+  X86_INTRINSIC_DATA(sse_ucomigt_ss,    COMI, X86ISD::UCOMI, ISD::SETGT),
+  X86_INTRINSIC_DATA(sse_ucomile_ss,    COMI, X86ISD::UCOMI, ISD::SETLE),
+  X86_INTRINSIC_DATA(sse_ucomilt_ss,    COMI, X86ISD::UCOMI, ISD::SETLT),
+  X86_INTRINSIC_DATA(sse_ucomineq_ss,   COMI, X86ISD::UCOMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+  X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+  X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+  X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0)
+};
+
+/*
+ * Retrieve data for Intrinsic without chain.
+ * Return nullptr if intrinsic is not defined in the table.
+ */
+static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) {
+  IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 };
+  const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain),
+                                               std::end(IntrinsicsWithoutChain),
+                                               IntrinsicToFind);
+  if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind)
+    return Data;
+  return nullptr;
+}
+
+static void verifyIntrinsicTables() {
+  assert(std::is_sorted(std::begin(IntrinsicsWithoutChain),
+                        std::end(IntrinsicsWithoutChain)) &&
+         std::is_sorted(std::begin(IntrinsicsWithChain),
+                        std::end(IntrinsicsWithChain)) &&
+         "Intrinsic data tables should be sorted by Intrinsic ID");
+}
+
+} // End llvm namespace
+
+#endif

diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
deleted file mode 100644
index a082c4f..0000000
--- a/lib/Target/X86/X86JITInfo.cpp
+++ /dev/null

@@ -1,588 +0,0 @@
-//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the JIT interfaces for the X86 target.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86JITInfo.h"
-#include "X86Relocations.h"
-#include "X86Subtarget.h"
-#include "X86TargetMachine.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Valgrind.h"
-#include <cstdlib>
-#include <cstring>
-using namespace llvm;
-
-#define DEBUG_TYPE "jit"
-
-// Determine the platform we're running on
-#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
-# define X86_64_JIT
-#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
-# define X86_32_JIT
-#endif
-
-void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
-  unsigned char *OldByte = (unsigned char *)Old;
-  *OldByte++ = 0xE9;                // Emit JMP opcode.
-  unsigned *OldWord = (unsigned *)OldByte;
-  unsigned NewAddr = (intptr_t)New;
-  unsigned OldAddr = (intptr_t)OldWord;
-  *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
-
-  // X86 doesn't need to invalidate the processor cache, so just invalidate
-  // Valgrind's cache directly.
-  sys::ValgrindDiscardTranslations(Old, 5);
-}
-
-
-/// JITCompilerFunction - This contains the address of the JIT function used to
-/// compile a function lazily.
-static TargetJITInfo::JITCompilerFn JITCompilerFunction;
-
-// Get the ASMPREFIX for the current host.  This is often '_'.
-#ifndef __USER_LABEL_PREFIX__
-#define __USER_LABEL_PREFIX__
-#endif
-#define GETASMPREFIX2(X) #X
-#define GETASMPREFIX(X) GETASMPREFIX2(X)
-#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
-
-// For ELF targets, use a .size and .type directive, to let tools
-// know the extent of functions defined in assembler.
-#if defined(__ELF__)
-# define SIZE(sym) ".size " #sym ", . - " #sym "\n"
-# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n"
-#else
-# define SIZE(sym)
-# define TYPE_FUNCTION(sym)
-#endif
-
-// Provide a convenient way for disabling usage of CFI directives.
-// This is needed for old/broken assemblers (for example, gas on
-// Darwin is pretty old and doesn't support these directives)
-#if defined(__APPLE__)
-# define CFI(x)
-#else
-// FIXME: Disable this until we really want to use it. Also, we will
-//        need to add some workarounds for compilers, which support
-//        only subset of these directives.
-# define CFI(x)
-#endif
-
-// Provide a wrapper for LLVMX86CompilationCallback2 that saves non-traditional
-// callee saved registers, for the fastcc calling convention.
-extern "C" {
-#if defined(X86_64_JIT)
-# ifndef _MSC_VER
-  // No need to save EAX/EDX for X86-64.
-  void X86CompilationCallback(void);
-  asm(
-    ".text\n"
-    ".align 8\n"
-    ".globl " ASMPREFIX "X86CompilationCallback\n"
-    TYPE_FUNCTION(X86CompilationCallback)
-  ASMPREFIX "X86CompilationCallback:\n"
-    CFI(".cfi_startproc\n")
-    // Save RBP
-    "pushq   %rbp\n"
-    CFI(".cfi_def_cfa_offset 16\n")
-    CFI(".cfi_offset %rbp, -16\n")
-    // Save RSP
-    "movq    %rsp, %rbp\n"
-    CFI(".cfi_def_cfa_register %rbp\n")
-    // Save all int arg registers
-    "pushq   %rdi\n"
-    CFI(".cfi_rel_offset %rdi, 0\n")
-    "pushq   %rsi\n"
-    CFI(".cfi_rel_offset %rsi, 8\n")
-    "pushq   %rdx\n"
-    CFI(".cfi_rel_offset %rdx, 16\n")
-    "pushq   %rcx\n"
-    CFI(".cfi_rel_offset %rcx, 24\n")
-    "pushq   %r8\n"
-    CFI(".cfi_rel_offset %r8, 32\n")
-    "pushq   %r9\n"
-    CFI(".cfi_rel_offset %r9, 40\n")
-    // Align stack on 16-byte boundary. ESP might not be properly aligned
-    // (8 byte) if this is called from an indirect stub.
-    "andq    $-16, %rsp\n"
-    // Save all XMM arg registers
-    "subq    $128, %rsp\n"
-    "movaps  %xmm0, (%rsp)\n"
-    "movaps  %xmm1, 16(%rsp)\n"
-    "movaps  %xmm2, 32(%rsp)\n"
-    "movaps  %xmm3, 48(%rsp)\n"
-    "movaps  %xmm4, 64(%rsp)\n"
-    "movaps  %xmm5, 80(%rsp)\n"
-    "movaps  %xmm6, 96(%rsp)\n"
-    "movaps  %xmm7, 112(%rsp)\n"
-    // JIT callee
-#if defined(_WIN64) || defined(__CYGWIN__)
-    "subq    $32, %rsp\n"
-    "movq    %rbp, %rcx\n"    // Pass prev frame and return address
-    "movq    8(%rbp), %rdx\n"
-    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
-    "addq    $32, %rsp\n"
-#else
-    "movq    %rbp, %rdi\n"    // Pass prev frame and return address
-    "movq    8(%rbp), %rsi\n"
-    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
-#endif
-    // Restore all XMM arg registers
-    "movaps  112(%rsp), %xmm7\n"
-    "movaps  96(%rsp), %xmm6\n"
-    "movaps  80(%rsp), %xmm5\n"
-    "movaps  64(%rsp), %xmm4\n"
-    "movaps  48(%rsp), %xmm3\n"
-    "movaps  32(%rsp), %xmm2\n"
-    "movaps  16(%rsp), %xmm1\n"
-    "movaps  (%rsp), %xmm0\n"
-    // Restore RSP
-    "movq    %rbp, %rsp\n"
-    CFI(".cfi_def_cfa_register %rsp\n")
-    // Restore all int arg registers
-    "subq    $48, %rsp\n"
-    CFI(".cfi_adjust_cfa_offset 48\n")
-    "popq    %r9\n"
-    CFI(".cfi_adjust_cfa_offset -8\n")
-    CFI(".cfi_restore %r9\n")
-    "popq    %r8\n"
-    CFI(".cfi_adjust_cfa_offset -8\n")
-    CFI(".cfi_restore %r8\n")
-    "popq    %rcx\n"
-    CFI(".cfi_adjust_cfa_offset -8\n")
-    CFI(".cfi_restore %rcx\n")
-    "popq    %rdx\n"
-    CFI(".cfi_adjust_cfa_offset -8\n")
-    CFI(".cfi_restore %rdx\n")
-    "popq    %rsi\n"
-    CFI(".cfi_adjust_cfa_offset -8\n")
-    CFI(".cfi_restore %rsi\n")
-    "popq    %rdi\n"
-    CFI(".cfi_adjust_cfa_offset -8\n")
-    CFI(".cfi_restore %rdi\n")
-    // Restore RBP
-    "popq    %rbp\n"
-    CFI(".cfi_adjust_cfa_offset -8\n")
-    CFI(".cfi_restore %rbp\n")
-    "ret\n"
-    CFI(".cfi_endproc\n")
-    SIZE(X86CompilationCallback)
-  );
-# else
-  // No inline assembler support on this platform. The routine is in external
-  // file.
-  void X86CompilationCallback();
-
-# endif
-#elif defined (X86_32_JIT)
-# ifndef _MSC_VER
-  void X86CompilationCallback(void);
-  asm(
-    ".text\n"
-    ".align 8\n"
-    ".globl " ASMPREFIX "X86CompilationCallback\n"
-    TYPE_FUNCTION(X86CompilationCallback)
-  ASMPREFIX "X86CompilationCallback:\n"
-    CFI(".cfi_startproc\n")
-    "pushl   %ebp\n"
-    CFI(".cfi_def_cfa_offset 8\n")
-    CFI(".cfi_offset %ebp, -8\n")
-    "movl    %esp, %ebp\n"    // Standard prologue
-    CFI(".cfi_def_cfa_register %ebp\n")
-    "pushl   %eax\n"
-    CFI(".cfi_rel_offset %eax, 0\n")
-    "pushl   %edx\n"          // Save EAX/EDX/ECX
-    CFI(".cfi_rel_offset %edx, 4\n")
-    "pushl   %ecx\n"
-    CFI(".cfi_rel_offset %ecx, 8\n")
-#  if defined(__APPLE__)
-    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
-#  endif
-    "subl    $16, %esp\n"
-    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
-    "movl    %eax, 4(%esp)\n"
-    "movl    %ebp, (%esp)\n"
-    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
-    "movl    %ebp, %esp\n"    // Restore ESP
-    CFI(".cfi_def_cfa_register %esp\n")
-    "subl    $12, %esp\n"
-    CFI(".cfi_adjust_cfa_offset 12\n")
-    "popl    %ecx\n"
-    CFI(".cfi_adjust_cfa_offset -4\n")
-    CFI(".cfi_restore %ecx\n")
-    "popl    %edx\n"
-    CFI(".cfi_adjust_cfa_offset -4\n")
-    CFI(".cfi_restore %edx\n")
-    "popl    %eax\n"
-    CFI(".cfi_adjust_cfa_offset -4\n")
-    CFI(".cfi_restore %eax\n")
-    "popl    %ebp\n"
-    CFI(".cfi_adjust_cfa_offset -4\n")
-    CFI(".cfi_restore %ebp\n")
-    "ret\n"
-    CFI(".cfi_endproc\n")
-    SIZE(X86CompilationCallback)
-  );
-
-  // Same as X86CompilationCallback but also saves XMM argument registers.
-  void X86CompilationCallback_SSE(void);
-  asm(
-    ".text\n"
-    ".align 8\n"
-    ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
-    TYPE_FUNCTION(X86CompilationCallback_SSE)
-  ASMPREFIX "X86CompilationCallback_SSE:\n"
-    CFI(".cfi_startproc\n")
-    "pushl   %ebp\n"
-    CFI(".cfi_def_cfa_offset 8\n")
-    CFI(".cfi_offset %ebp, -8\n")
-    "movl    %esp, %ebp\n"    // Standard prologue
-    CFI(".cfi_def_cfa_register %ebp\n")
-    "pushl   %eax\n"
-    CFI(".cfi_rel_offset %eax, 0\n")
-    "pushl   %edx\n"          // Save EAX/EDX/ECX
-    CFI(".cfi_rel_offset %edx, 4\n")
-    "pushl   %ecx\n"
-    CFI(".cfi_rel_offset %ecx, 8\n")
-    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
-    // Save all XMM arg registers
-    "subl    $64, %esp\n"
-    // FIXME: provide frame move information for xmm registers.
-    // This can be tricky, because CFA register is ebp (unaligned)
-    // and we need to produce offsets relative to it.
-    "movaps  %xmm0, (%esp)\n"
-    "movaps  %xmm1, 16(%esp)\n"
-    "movaps  %xmm2, 32(%esp)\n"
-    "movaps  %xmm3, 48(%esp)\n"
-    "subl    $16, %esp\n"
-    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
-    "movl    %eax, 4(%esp)\n"
-    "movl    %ebp, (%esp)\n"
-    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
-    "addl    $16, %esp\n"
-    "movaps  48(%esp), %xmm3\n"
-    CFI(".cfi_restore %xmm3\n")
-    "movaps  32(%esp), %xmm2\n"
-    CFI(".cfi_restore %xmm2\n")
-    "movaps  16(%esp), %xmm1\n"
-    CFI(".cfi_restore %xmm1\n")
-    "movaps  (%esp), %xmm0\n"
-    CFI(".cfi_restore %xmm0\n")
-    "movl    %ebp, %esp\n"    // Restore ESP
-    CFI(".cfi_def_cfa_register esp\n")
-    "subl    $12, %esp\n"
-    CFI(".cfi_adjust_cfa_offset 12\n")
-    "popl    %ecx\n"
-    CFI(".cfi_adjust_cfa_offset -4\n")
-    CFI(".cfi_restore %ecx\n")
-    "popl    %edx\n"
-    CFI(".cfi_adjust_cfa_offset -4\n")
-    CFI(".cfi_restore %edx\n")
-    "popl    %eax\n"
-    CFI(".cfi_adjust_cfa_offset -4\n")
-    CFI(".cfi_restore %eax\n")
-    "popl    %ebp\n"
-    CFI(".cfi_adjust_cfa_offset -4\n")
-    CFI(".cfi_restore %ebp\n")
-    "ret\n"
-    CFI(".cfi_endproc\n")
-    SIZE(X86CompilationCallback_SSE)
-  );
-# else
-  void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
-
-  _declspec(naked) void X86CompilationCallback(void) {
-    __asm {
-      push  ebp
-      mov   ebp, esp
-      push  eax
-      push  edx
-      push  ecx
-      and   esp, -16
-      sub   esp, 16
-      mov   eax, dword ptr [ebp+4]
-      mov   dword ptr [esp+4], eax
-      mov   dword ptr [esp], ebp
-      call  LLVMX86CompilationCallback2
-      mov   esp, ebp
-      sub   esp, 12
-      pop   ecx
-      pop   edx
-      pop   eax
-      pop   ebp
-      ret
-    }
-  }
-
-# endif // _MSC_VER
-
-#else // Not an i386 host
-  void X86CompilationCallback() {
-    llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!");
-  }
-#endif
-}
-
-/// This is the target-specific function invoked by the
-/// function stub when we did not know the real target of a call.  This function
-/// must locate the start of the stub or call site and pass it into the JIT
-/// compiler function.
-extern "C" {
-LLVM_ATTRIBUTE_USED // Referenced from inline asm.
-LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr,
-                                                         intptr_t RetAddr) {
-  intptr_t *RetAddrLoc = &StackPtr[1];
-  // We are reading raw stack data here. Tell MemorySanitizer that it is
-  // sufficiently initialized.
-  __msan_unpoison(RetAddrLoc, sizeof(*RetAddrLoc));
-  assert(*RetAddrLoc == RetAddr &&
-         "Could not find return address on the stack!");
-
-  // It's a stub if there is an interrupt marker after the call.
-  bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE;
-
-  // The call instruction should have pushed the return value onto the stack...
-#if defined (X86_64_JIT)
-  RetAddr--;     // Backtrack to the reference itself...
-#else
-  RetAddr -= 4;  // Backtrack to the reference itself...
-#endif
-
-#if 0
-  DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr
-               << " ESP=" << (void*)StackPtr
-               << ": Resolving call to function: "
-               << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n");
-#endif
-
-  // Sanity check to make sure this really is a call instruction.
-#if defined (X86_64_JIT)
-  assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
-  assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
-#else
-  assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
-#endif
-
-  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
-
-  // Rewrite the call target... so that we don't end up here every time we
-  // execute the call.
-#if defined (X86_64_JIT)
-  assert(isStub &&
-         "X86-64 doesn't support rewriting non-stub lazy compilation calls:"
-         " the call instruction varies too much.");
-#else
-  *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
-#endif
-
-  if (isStub) {
-    // If this is a stub, rewrite the call into an unconditional branch
-    // instruction so that two return addresses are not pushed onto the stack
-    // when the requested function finally gets called.  This also makes the
-    // 0xCE byte (interrupt) dead, so the marker doesn't effect anything.
-#if defined (X86_64_JIT)
-    // If the target address is within 32-bit range of the stub, use a
-    // PC-relative branch instead of loading the actual address.  (This is
-    // considerably shorter than the 64-bit immediate load already there.)
-    // We assume here intptr_t is 64 bits.
-    intptr_t diff = NewVal-RetAddr+7;
-    if (diff >= -2147483648LL && diff <= 2147483647LL) {
-      *(unsigned char*)(RetAddr-0xc) = 0xE9;
-      *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff;
-    } else {
-      *(intptr_t *)(RetAddr - 0xa) = NewVal;
-      ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
-    }
-    sys::ValgrindDiscardTranslations((void*)(RetAddr-0xc), 0xd);
-#else
-    ((unsigned char*)RetAddr)[-1] = 0xE9;
-    sys::ValgrindDiscardTranslations((void*)(RetAddr-1), 5);
-#endif
-  }
-
-  // Change the return address to reexecute the call instruction...
-#if defined (X86_64_JIT)
-  *RetAddrLoc -= 0xd;
-#else
-  *RetAddrLoc -= 5;
-#endif
-}
-}
-
-TargetJITInfo::LazyResolverFn
-X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
-  TsanIgnoreWritesBegin();
-  JITCompilerFunction = F;
-  TsanIgnoreWritesEnd();
-
-#if defined (X86_32_JIT) && !defined (_MSC_VER)
-#if defined(__SSE__)
-  // SSE Callback should be called for SSE-enabled LLVM.
-  return X86CompilationCallback_SSE;
-#else
-  if (useSSE)
-    return X86CompilationCallback_SSE;
-#endif
-#endif
-
-  return X86CompilationCallback;
-}
-
-X86JITInfo::X86JITInfo(bool UseSSE) {
-  useSSE = UseSSE;
-  useGOT = 0;
-  TLSOffset = nullptr;
-}
-
-void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
-                                             JITCodeEmitter &JCE) {
-#if defined (X86_64_JIT)
-  const unsigned Alignment = 8;
-  uint8_t Buffer[8];
-  uint8_t *Cur = Buffer;
-  MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr);
-  MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32));
-#else
-  const unsigned Alignment = 4;
-  uint8_t Buffer[4];
-  uint8_t *Cur = Buffer;
-  MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr);
-#endif
-  return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment);
-}
-
-TargetJITInfo::StubLayout X86JITInfo::getStubLayout() {
-  // The 64-bit stub contains:
-  //   movabs r10 <- 8-byte-target-address  # 10 bytes
-  //   call|jmp *r10  # 3 bytes
-  // The 32-bit stub contains a 5-byte call|jmp.
-  // If the stub is a call to the compilation callback, an extra byte is added
-  // to mark it as a stub.
-  StubLayout Result = {14, 4};
-  return Result;
-}
-
-void *X86JITInfo::emitFunctionStub(const Function* F, void *Target,
-                                   JITCodeEmitter &JCE) {
-  // Note, we cast to intptr_t here to silence a -pedantic warning that
-  // complains about casting a function pointer to a normal pointer.
-#if defined (X86_32_JIT) && !defined (_MSC_VER)
-  bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback &&
-                Target != (void*)(intptr_t)X86CompilationCallback_SSE);
-#else
-  bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback;
-#endif
-  JCE.emitAlignment(4);
-  void *Result = (void*)JCE.getCurrentPCValue();
-  if (NotCC) {
-#if defined (X86_64_JIT)
-    JCE.emitByte(0x49);          // REX prefix
-    JCE.emitByte(0xB8+2);        // movabsq r10
-    JCE.emitWordLE((unsigned)(intptr_t)Target);
-    JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
-    JCE.emitByte(0x41);          // REX prefix
-    JCE.emitByte(0xFF);          // jmpq *r10
-    JCE.emitByte(2 | (4 << 3) | (3 << 6));
-#else
-    JCE.emitByte(0xE9);
-    JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
-#endif
-    return Result;
-  }
-
-#if defined (X86_64_JIT)
-  JCE.emitByte(0x49);          // REX prefix
-  JCE.emitByte(0xB8+2);        // movabsq r10
-  JCE.emitWordLE((unsigned)(intptr_t)Target);
-  JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
-  JCE.emitByte(0x41);          // REX prefix
-  JCE.emitByte(0xFF);          // callq *r10
-  JCE.emitByte(2 | (2 << 3) | (3 << 6));
-#else
-  JCE.emitByte(0xE8);   // Call with 32 bit pc-rel destination...
-
-  JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
-#endif
-
-  // This used to use 0xCD, but that value is used by JITMemoryManager to
-  // initialize the buffer with garbage, which means it may follow a
-  // noreturn function call, confusing LLVMX86CompilationCallback2.  PR 4929.
-  JCE.emitByte(0xCE);   // Interrupt - Just a marker identifying the stub!
-  return Result;
-}
-
-/// getPICJumpTableEntry - Returns the value of the jumptable entry for the
-/// specific basic block.
-uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
-#if defined(X86_64_JIT)
-  return BB - Entry;
-#else
-  return BB - PICBase;
-#endif
-}
-
-template<typename T> static void addUnaligned(void *Pos, T Delta) {
-  T Value;
-  std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos),
-              sizeof(T));
-  Value += Delta;
-  std::memcpy(reinterpret_cast<char*>(Pos), reinterpret_cast<char*>(&Value),
-              sizeof(T));
-}
-
-/// relocate - Before the JIT can run a block of code that has been emitted,
-/// it must rewrite the code to contain the actual addresses of any
-/// referenced global symbols.
-void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
-                          unsigned NumRelocs, unsigned char* GOTBase) {
-  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
-    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
-    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
-    switch ((X86::RelocationType)MR->getRelocationType()) {
-    case X86::reloc_pcrel_word: {
-      // PC relative relocation, add the relocated value to the value already in
-      // memory, after we adjust it for where the PC is.
-      ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
-      addUnaligned<unsigned>(RelocPos, ResultPtr);
-      break;
-    }
-    case X86::reloc_picrel_word: {
-      // PIC base relative relocation, add the relocated value to the value
-      // already in memory, after we adjust it for where the PIC base is.
-      ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
-      addUnaligned<unsigned>(RelocPos, ResultPtr);
-      break;
-    }
-    case X86::reloc_absolute_word:
-    case X86::reloc_absolute_word_sext:
-      // Absolute relocation, just add the relocated value to the value already
-      // in memory.
-      addUnaligned<unsigned>(RelocPos, ResultPtr);
-      break;
-    case X86::reloc_absolute_dword:
-      addUnaligned<intptr_t>(RelocPos, ResultPtr);
-      break;
-    }
-  }
-}
-
-char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
-#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER)
-  TLSOffset -= size;
-  return TLSOffset;
-#else
-  llvm_unreachable("Cannot allocate thread local storage on this arch!");
-#endif
-}

diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
deleted file mode 100644
index 564343f..0000000
--- a/lib/Target/X86/X86JITInfo.h
+++ /dev/null

@@ -1,79 +0,0 @@
-//===-- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the X86 implementation of the TargetJITInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef X86JITINFO_H
-#define X86JITINFO_H
-
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Target/TargetJITInfo.h"
-
-namespace llvm {
-  class X86Subtarget;
-
-  class X86JITInfo : public TargetJITInfo {
-    uintptr_t PICBase;
-    char *TLSOffset;
-    bool useSSE;
-  public:
-    explicit X86JITInfo(bool UseSSE);
-
-    /// replaceMachineCodeForFunction - Make it so that calling the function
-    /// whose machine code is at OLD turns into a call to NEW, perhaps by
-    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
-    /// code.
-    ///
-    void replaceMachineCodeForFunction(void *Old, void *New) override;
-
-    /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
-    /// to emit an indirect symbol which contains the address of the specified
-    /// ptr.
-    void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
-                                     JITCodeEmitter &JCE) override;
-
-    // getStubLayout - Returns the size and alignment of the largest call stub
-    // on X86.
-    StubLayout getStubLayout() override;
-
-    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
-    /// small native function that simply calls the function at the specified
-    /// address.
-    void *emitFunctionStub(const Function* F, void *Target,
-                           JITCodeEmitter &JCE) override;
-
-    /// getPICJumpTableEntry - Returns the value of the jumptable entry for the
-    /// specific basic block.
-    uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase) override;
-
-    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
-    LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
-
-    /// relocate - Before the JIT can run a block of code that has been emitted,
-    /// it must rewrite the code to contain the actual addresses of any
-    /// referenced global symbols.
-    void relocate(void *Function, MachineRelocation *MR,
-                  unsigned NumRelocs, unsigned char* GOTBase) override;
-
-    /// allocateThreadLocalMemory - Each target has its own way of
-    /// handling thread local variables. This method returns a value only
-    /// meaningful to the target.
-    char* allocateThreadLocalMemory(size_t size) override;
-
-    /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute
-    /// PIC jumptable entry.
-    void setPICBase(uintptr_t Base) { PICBase = Base; }
-    uintptr_t getPICBase() const { return PICBase; }
-  };
-}
-
-#endif

diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 2bd70a9..4e0d594 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp

@@ -16,20 +16,25 @@
 #include "X86RegisterInfo.h"
 #include "InstPrinter/X86ATTInstPrinter.h"
 #include "MCTargetDesc/X86BaseInfo.h"
+#include "Utils/X86ShuffleDecode.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
 namespace {
@@ -58,6 +63,53 @@
 
 } // end anonymous namespace
 
+// Emit a minimal sequence of nops spanning NumBytes bytes.
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
+                     const MCSubtargetInfo &STI);
+
+namespace llvm {
+   X86AsmPrinter::StackMapShadowTracker::StackMapShadowTracker(TargetMachine &TM)
+     : TM(TM), InShadow(false), RequiredShadowSize(0), CurrentShadowSize(0) {}
+
+  X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {}
+
+  void
+  X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &MF) {
+    CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
+        *TM.getSubtargetImpl()->getInstrInfo(),
+        *TM.getSubtargetImpl()->getRegisterInfo(), *TM.getSubtargetImpl(),
+        MF.getContext()));
+  }
+
+  void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
+                                                   const MCSubtargetInfo &STI) {
+    if (InShadow) {
+      SmallString<256> Code;
+      SmallVector<MCFixup, 4> Fixups;
+      raw_svector_ostream VecOS(Code);
+      CodeEmitter->EncodeInstruction(Inst, VecOS, Fixups, STI);
+      VecOS.flush();
+      CurrentShadowSize += Code.size();
+      if (CurrentShadowSize >= RequiredShadowSize)
+        InShadow = false; // The shadow is big enough. Stop counting.
+    }
+  }
+
+  void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
+    MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
+    if (InShadow && CurrentShadowSize < RequiredShadowSize) {
+      InShadow = false;
+      EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
+               TM.getSubtarget<X86Subtarget>().is64Bit(), STI);
+    }
+  }
+
+  void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
+    OutStreamer.EmitInstruction(Inst, getSubtargetInfo());
+    SMShadowTracker.count(Inst, getSubtargetInfo());
+  }
+} // end llvm namespace
+
 X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
                                X86AsmPrinter &asmprinter)
 : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()),
@@ -72,7 +124,7 @@
 /// operand to an MCSymbol.
 MCSymbol *X86MCInstLower::
 GetSymbolFromOperand(const MachineOperand &MO) const {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
 
   SmallString<128> Name;
@@ -212,7 +264,8 @@
     Expr = MCBinaryExpr::CreateSub(Expr,
                             MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx),
                                    Ctx);
-    if (MO.isJTI() && MAI.hasSetDirective()) {
+    if (MO.isJTI()) {
+      assert(MAI.doesSetDirectiveSuppressesReloc());
       // If .set directive is supported, use it to reduce the number of
       // relocations the assembler will generate for differences between
       // local labels. This is only safe when the symbols are in the same
@@ -531,14 +584,38 @@
   // Atomic load and store require a separate pseudo-inst because Acquire
   // implies mayStore and Release implies mayLoad; fix these to regular MOV
   // instructions here
-  case X86::ACQUIRE_MOV8rm:  OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
-  case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
-  case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
-  case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
-  case X86::RELEASE_MOV8mr:  OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
-  case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
-  case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
-  case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+  case X86::ACQUIRE_MOV8rm:    OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV16rm:   OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV32rm:   OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV64rm:   OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+  case X86::RELEASE_MOV8mr:    OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+  case X86::RELEASE_MOV16mr:   OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+  case X86::RELEASE_MOV32mr:   OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+  case X86::RELEASE_MOV64mr:   OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+  case X86::RELEASE_MOV8mi:    OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
+  case X86::RELEASE_MOV16mi:   OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
+  case X86::RELEASE_MOV32mi:   OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
+  case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
+  case X86::RELEASE_ADD8mi:    OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
+  case X86::RELEASE_ADD32mi:   OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
+  case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
+  case X86::RELEASE_AND8mi:    OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
+  case X86::RELEASE_AND32mi:   OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
+  case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
+  case X86::RELEASE_OR8mi:     OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
+  case X86::RELEASE_OR32mi:    OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
+  case X86::RELEASE_OR64mi32:  OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
+  case X86::RELEASE_XOR8mi:    OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
+  case X86::RELEASE_XOR32mi:   OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
+  case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
+  case X86::RELEASE_INC8m:     OutMI.setOpcode(X86::INC8m); goto ReSimplify;
+  case X86::RELEASE_INC16m:    OutMI.setOpcode(X86::INC16m); goto ReSimplify;
+  case X86::RELEASE_INC32m:    OutMI.setOpcode(X86::INC32m); goto ReSimplify;
+  case X86::RELEASE_INC64m:    OutMI.setOpcode(X86::INC64m); goto ReSimplify;
+  case X86::RELEASE_DEC8m:     OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
+  case X86::RELEASE_DEC16m:    OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
+  case X86::RELEASE_DEC32m:    OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
+  case X86::RELEASE_DEC64m:    OutMI.setOpcode(X86::DEC64m); goto ReSimplify;
 
   // We don't currently select the correct instruction form for instructions
   // which have a short %eax, etc. form. Handle this by custom lowering, for
@@ -602,10 +679,8 @@
   }
 }
 
-static void LowerTlsAddr(MCStreamer &OutStreamer,
-                         X86MCInstLower &MCInstLowering,
-                         const MachineInstr &MI,
-                         const MCSubtargetInfo& STI) {
+void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
+                                 const MachineInstr &MI) {
 
   bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
                   MI.getOpcode() == X86::TLS_base_addr64;
@@ -615,7 +690,7 @@
   MCContext &context = OutStreamer.getContext();
 
   if (needsPadding)
-    OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI);
+    EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
 
   MCSymbolRefExpr::VariantKind SRVK;
   switch (MI.getOpcode()) {
@@ -662,12 +737,12 @@
     LEA.addOperand(MCOperand::CreateExpr(symRef));  // disp
     LEA.addOperand(MCOperand::CreateReg(0));        // seg
   }
-  OutStreamer.EmitInstruction(LEA, STI);
+  EmitAndCountInstruction(LEA);
 
   if (needsPadding) {
-    OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI);
-    OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI);
-    OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX), STI);
+    EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+    EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+    EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
   }
 
   StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
@@ -677,9 +752,9 @@
                             MCSymbolRefExpr::VK_PLT,
                             context);
 
-  OutStreamer.EmitInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
-                                                     : X86::CALLpcrel32)
-    .addExpr(tlsRef), STI);
+  EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
+                                                 : X86::CALLpcrel32)
+                            .addExpr(tlsRef));
 }
 
 /// \brief Emit the optimal amount of multi-byte nops on X86.
@@ -725,10 +800,9 @@
       break;
     case X86::NOOPL:
     case X86::NOOPW:
-      OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg).addImm(ScaleVal)
-                                           .addReg(IndexReg)
-                                           .addImm(Displacement)
-                                           .addReg(SegmentReg), STI);
+      OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg)
+                         .addImm(ScaleVal).addReg(IndexReg)
+                         .addImm(Displacement).addReg(SegmentReg), STI);
       break;
     }
   } // while (NumBytes)
@@ -736,22 +810,20 @@
 
 // Lower a stackmap of the form:
 // <id>, <shadowBytes>, ...
-static void LowerSTACKMAP(MCStreamer &OS, StackMaps &SM,
-                          const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) {
-  unsigned NumBytes = MI.getOperand(1).getImm();
+void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
+  SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo());
   SM.recordStackMap(MI);
-  // Emit padding.
-  // FIXME: These nops ensure that the stackmap's shadow is covered by
-  // instructions from the same basic block, but the nops should not be
-  // necessary if instructions from the same block follow the stackmap.
-  EmitNops(OS, NumBytes, Is64Bit, STI);
+  unsigned NumShadowBytes = MI.getOperand(1).getImm();
+  SMShadowTracker.reset(NumShadowBytes);
 }
 
 // Lower a patchpoint of the form:
 // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
-static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM,
-                            const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) {
-  assert(Is64Bit && "Patchpoint currently only supports X86-64");
+void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI) {
+  assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64");
+
+  SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo());
+
   SM.recordPatchPoint(MI);
 
   PatchPointOpers opers(&MI);
@@ -766,22 +838,111 @@
       EncodedBytes = 13;
     else
       EncodedBytes = 12;
-    OS.EmitInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg)
-                                                  .addImm(CallTarget), STI);
-    OS.EmitInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg), STI);
+    EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg)
+                                                       .addImm(CallTarget));
+    EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
   }
   // Emit padding.
   unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
   assert(NumBytes >= EncodedBytes &&
          "Patchpoint can't request size less than the length of a call.");
 
-  EmitNops(OS, NumBytes - EncodedBytes, Is64Bit, STI);
+  EmitNops(OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
+           getSubtargetInfo());
+}
+
+// Returns instruction preceding MBBI in MachineFunction.
+// If MBBI is the first instruction of the first basic block, returns null.
+static MachineBasicBlock::const_iterator
+PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
+  const MachineBasicBlock *MBB = MBBI->getParent();
+  while (MBBI == MBB->begin()) {
+    if (MBB == MBB->getParent()->begin())
+      return nullptr;
+    MBB = MBB->getPrevNode();
+    MBBI = MBB->end();
+  }
+  return --MBBI;
+}
+
+static const Constant *getConstantFromPool(const MachineInstr &MI,
+                                           const MachineOperand &Op) {
+  if (!Op.isCPI())
+    return nullptr;
+
+  ArrayRef<MachineConstantPoolEntry> Constants =
+      MI.getParent()->getParent()->getConstantPool()->getConstants();
+  const MachineConstantPoolEntry &ConstantEntry =
+      Constants[Op.getIndex()];
+
+  // Bail if this is a machine constant pool entry; we won't be able to dig out
+  // anything useful.
+  if (ConstantEntry.isMachineConstantPoolEntry())
+    return nullptr;
+
+  auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal);
+  assert((!C || ConstantEntry.getType() == C->getType()) &&
+         "Expected a constant of the same type!");
+  return C;
+}
+
+static std::string getShuffleComment(const MachineOperand &DstOp,
+                                     const MachineOperand &SrcOp,
+                                     ArrayRef<int> Mask) {
+  std::string Comment;
+
+  // Compute the name for a register. This is really goofy because we have
+  // multiple instruction printers that could (in theory) use different
+  // names. Fortunately most people use the ATT style (outside of Windows)
+  // and they actually agree on register naming here. Ultimately, this is
+  // a comment, and so it's OK if it isn't perfect.
+  auto GetRegisterName = [](unsigned RegNum) -> StringRef {
+    return X86ATTInstPrinter::getRegisterName(RegNum);
+  };
+
+  StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
+  StringRef SrcName = SrcOp.isReg() ? GetRegisterName(SrcOp.getReg()) : "mem";
+
+  raw_string_ostream CS(Comment);
+  CS << DstName << " = ";
+  bool NeedComma = false;
+  bool InSrc = false;
+  for (int M : Mask) {
+    // Wrap up any prior entry...
+    if (M == SM_SentinelZero && InSrc) {
+      InSrc = false;
+      CS << "]";
+    }
+    if (NeedComma)
+      CS << ",";
+    else
+      NeedComma = true;
+
+    // Print this shuffle...
+    if (M == SM_SentinelZero) {
+      CS << "zero";
+    } else {
+      if (!InSrc) {
+        InSrc = true;
+        CS << SrcName << "[";
+      }
+      if (M == SM_SentinelUndef)
+        CS << "u";
+      else
+        CS << M;
+    }
+  }
+  if (InSrc)
+    CS << "]";
+  CS.flush();
+
+  return Comment;
 }
 
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   X86MCInstLower MCInstLowering(*MF, *this);
-  const X86RegisterInfo *RI =
-      static_cast<const X86RegisterInfo *>(TM.getRegisterInfo());
+  const X86RegisterInfo *RI = static_cast<const X86RegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
 
   switch (MI->getOpcode()) {
   case TargetOpcode::DBG_VALUE:
@@ -812,7 +973,7 @@
   case X86::TLS_addr64:
   case X86::TLS_base_addr32:
   case X86::TLS_base_addr64:
-    return LowerTlsAddr(OutStreamer, MCInstLowering, *MI, getSubtargetInfo());
+    return LowerTlsAddr(MCInstLowering, *MI);
 
   case X86::MOVPC32r: {
     // This is a pseudo op for a two instruction sequence with a label, which
@@ -825,15 +986,15 @@
     MCSymbol *PICBase = MF->getPICBaseSymbol();
     // FIXME: We would like an efficient form for this, so we don't have to do a
     // lot of extra uniquing.
-    EmitToStreamer(OutStreamer, MCInstBuilder(X86::CALLpcrel32)
+    EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
       .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
 
     // Emit the label.
     OutStreamer.EmitLabel(PICBase);
 
     // popl $reg
-    EmitToStreamer(OutStreamer, MCInstBuilder(X86::POP32r)
-      .addReg(MI->getOperand(0).getReg()));
+    EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
+                            .addReg(MI->getOperand(0).getReg()));
     return;
   }
 
@@ -863,7 +1024,7 @@
     DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
                                       DotExpr, OutContext);
 
-    EmitToStreamer(OutStreamer, MCInstBuilder(X86::ADD32ri)
+    EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
       .addReg(MI->getOperand(0).getReg())
       .addReg(MI->getOperand(1).getReg())
       .addExpr(DotExpr));
@@ -871,21 +1032,21 @@
   }
 
   case TargetOpcode::STACKMAP:
-    return LowerSTACKMAP(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo());
+    return LowerSTACKMAP(*MI);
 
   case TargetOpcode::PATCHPOINT:
-    return LowerPATCHPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo());
+    return LowerPATCHPOINT(*MI);
 
   case X86::MORESTACK_RET:
-    EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget)));
+    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
     return;
 
   case X86::MORESTACK_RET_RESTORE_R10:
     // Return, then restore R10.
-    EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget)));
-    EmitToStreamer(OutStreamer, MCInstBuilder(X86::MOV64rr)
-      .addReg(X86::R10)
-      .addReg(X86::RAX));
+    EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+    EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
+                            .addReg(X86::R10)
+                            .addReg(X86::RAX));
     return;
 
   case X86::SEH_PushReg:
@@ -918,9 +1079,151 @@
   case X86::SEH_EndPrologue:
     OutStreamer.EmitWinCFIEndProlog();
     return;
+
+  case X86::SEH_Epilogue: {
+    MachineBasicBlock::const_iterator MBBI(MI);
+    // Check if preceded by a call and emit nop if so.
+    for (MBBI = PrevCrossBBInst(MBBI); MBBI; MBBI = PrevCrossBBInst(MBBI)) {
+      // Conservatively assume that pseudo instructions don't emit code and keep
+      // looking for a call. We may emit an unnecessary nop in some cases.
+      if (!MBBI->isPseudo()) {
+        if (MBBI->isCall())
+          EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+        break;
+      }
+    }
+    return;
+  }
+
+    // Lower PSHUFB and VPERMILP normally (both cases break out to the common
+    // lowering) but, in verbose asm, add a comment if we can find a constant
+    // shuffle mask. The MC layer can't do this: the mask isn't an immediate.
+  case X86::PSHUFBrm:
+  case X86::VPSHUFBrm:
+  case X86::VPSHUFBYrm: {
+    if (!OutStreamer.isVerboseAsm())
+      break;
+    assert(MI->getNumOperands() > 5 &&
+           "We should always have at least 6 operands!");
+    const MachineOperand &DstOp = MI->getOperand(0);
+    const MachineOperand &SrcOp = MI->getOperand(1);
+    const MachineOperand &MaskOp = MI->getOperand(5);
+    // Operand 5 is read above, hence six operands; no constant, no comment.
+    if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      SmallVector<int, 16> Mask;
+      DecodePSHUFBMask(C, Mask);
+      if (!Mask.empty())
+        OutStreamer.AddComment(getShuffleComment(DstOp, SrcOp, Mask));
+    }
+    break;
+  }
+  case X86::VPERMILPSrm:
+  case X86::VPERMILPDrm:
+  case X86::VPERMILPSYrm:
+  case X86::VPERMILPDYrm: {
+    if (!OutStreamer.isVerboseAsm())
+      break;
+    assert(MI->getNumOperands() > 5 &&
+           "We should always have at least 6 operands!");
+    const MachineOperand &DstOp = MI->getOperand(0);
+    const MachineOperand &SrcOp = MI->getOperand(1);
+    const MachineOperand &MaskOp = MI->getOperand(5);
+    // Operand 5 is read above, hence six operands; no constant, no comment.
+    if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      SmallVector<int, 16> Mask;
+      DecodeVPERMILPMask(C, Mask);
+      if (!Mask.empty())
+        OutStreamer.AddComment(getShuffleComment(DstOp, SrcOp, Mask));
+    }
+    break;
+  }
+
+    // For loads from a constant pool to a vector register, print the constant
+    // loaded: "[..]" for sequential data, "<..>" for ConstantVector elements.
+  case X86::MOVAPDrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVAPDYrm:
+  case X86::MOVUPDrm:
+  case X86::VMOVUPDrm:
+  case X86::VMOVUPDYrm:
+  case X86::MOVAPSrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVAPSYrm:
+  case X86::MOVUPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVUPSYrm:
+  case X86::MOVDQArm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQAYrm:
+  case X86::MOVDQUrm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVDQUYrm:
+    if (!OutStreamer.isVerboseAsm())
+      break;
+    if (MI->getNumOperands() > 4) // guards the getOperand(4) read below
+    if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+      std::string Comment;
+      raw_string_ostream CS(Comment);
+      const MachineOperand &DstOp = MI->getOperand(0);
+      CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+      if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+        CS << "[";
+        for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
+          if (i != 0)
+            CS << ",";
+          if (CDS->getElementType()->isIntegerTy())
+            CS << CDS->getElementAsInteger(i);
+          else if (CDS->getElementType()->isFloatTy())
+            CS << CDS->getElementAsFloat(i);
+          else if (CDS->getElementType()->isDoubleTy())
+            CS << CDS->getElementAsDouble(i);
+          else
+            CS << "?";
+        }
+        CS << "]";
+        OutStreamer.AddComment(CS.str());
+      } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
+        CS << "<";
+        for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
+          if (i != 0)
+            CS << ",";
+          Constant *COp = CV->getOperand(i);
+          if (isa<UndefValue>(COp)) {
+            CS << "u";
+          } else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
+            CS << CI->getZExtValue(); // NOTE(review): assumes <= 64 bits; confirm
+          } else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
+            SmallString<32> Str;
+            CF->getValueAPF().toString(Str);
+            CS << Str;
+          } else {
+            CS << "?";
+          }
+        }
+        CS << ">";
+        OutStreamer.AddComment(CS.str());
+      }
+    }
+    break;
   }
 
   MCInst TmpInst;
   MCInstLowering.Lower(MI, TmpInst);
-  EmitToStreamer(OutStreamer, TmpInst);
+
+  // Stackmap shadows cannot include branch targets, so we can count the bytes
+  // in a call towards the shadow, but must ensure that no thread returns
+  // into the stackmap shadow.  The only way to achieve this is if the call
+  // is at the end of the shadow.
+  if (MI->isCall()) {
+    // Count the size of the call towards the shadow
+    SMShadowTracker.count(TmpInst, getSubtargetInfo());
+    // Then flush the shadow so that we fill with nops before the call, not
+    // after it.
+    SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo());
+    // Then emit the call
+    OutStreamer.EmitInstruction(TmpInst, getSubtargetInfo());
+    return;
+  }
+
+  EmitAndCountInstruction(TmpInst);
 }

diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index 78d20ce..79a51b3 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h

@@ -11,10 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86MACHINEFUNCTIONINFO_H
-#define X86MACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include <vector>
 
 namespace llvm {
 
@@ -70,6 +72,22 @@
   unsigned NumLocalDynamics;
 
 public:
+  /// One register forwarded from the prologue to a musttail call: a virtual
+  /// register (VReg), a physical register (PReg) and an MVT (VT).
+  struct Forward {
+    Forward(unsigned VReg, MCPhysReg PReg, MVT VT)
+        : VReg(VReg), PReg(PReg), VT(VT) {}
+    unsigned VReg;
+    MCPhysReg PReg;
+    MVT VT;
+  };
+
+private:
+  /// ForwardedMustTailRegParms - A list of virtual and physical registers
+  /// that must be forwarded to every musttail call.
+  std::vector<Forward> ForwardedMustTailRegParms;
+
+public:
   X86MachineFunctionInfo() : ForceFramePointer(false),
                              CalleeSavedFrameSize(0),
                              BytesToPopOnReturn(0),
@@ -138,6 +156,9 @@
   unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
 
+  std::vector<Forward> &getForwardedMustTailRegParms() {
+    return ForwardedMustTailRegParms;
+  }
 };
 
 } // End llvm namespace

diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 6639875..adc05b2 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp

@@ -105,7 +105,7 @@
   if (!TM->getSubtarget<X86Subtarget>().padShortFunctions())
     return false;
 
-  TII = TM->getInstrInfo();
+  TII = TM->getSubtargetImpl()->getInstrInfo();
 
   // Search through basic blocks and mark the ones that have early returns
   ReturnBBs.clear();
@@ -195,7 +195,8 @@
       return true;
     }
 
-    CyclesToEnd += TII->getInstrLatency(TM->getInstrItineraryData(), MI);
+    CyclesToEnd += TII->getInstrLatency(
+        TM->getSubtargetImpl()->getInstrItineraryData(), MI);
   }
 
   VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);

diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index e8a7e84..a4a366d 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp

@@ -68,8 +68,10 @@
 
   if (Is64Bit) {
     SlotSize = 8;
-    StackPtr = X86::RSP;
-    FramePtr = X86::RBP;
+    StackPtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ?
+        X86::RSP : X86::ESP;
+    FramePtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ?
+        X86::RBP : X86::EBP;
   } else {
     SlotSize = 4;
     StackPtr = X86::ESP;
@@ -120,7 +122,7 @@
 const TargetRegisterClass*
 X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
   // Don't allow super-classes of GR8_NOREX.  This class is only used after
-  // extrating sub_8bit_hi sub-registers.  The H sub-registers cannot be copied
+  // extracting sub_8bit_hi sub-registers.  The H sub-registers cannot be copied
   // to the full GR8 register class in 64-bit mode, so we cannot allow the
   // reigster class inflation.
   //
@@ -196,7 +198,7 @@
 unsigned
 X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                      MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
   switch (RC->getID()) {
@@ -324,7 +326,7 @@
 
 BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   // Set the stack-pointer register and its aliases as reserved.
   for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
@@ -441,7 +443,8 @@
 bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *F = MF.getFunction();
-  unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned StackAlign =
+    MF.getSubtarget().getFrameLowering()->getStackAlignment();
   bool requiresRealignment =
     ((MFI->getMaxAlignment() > StackAlign) ||
      F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
@@ -456,13 +459,9 @@
 
 bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
                                            unsigned Reg, int &FrameIdx) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
-  if (Reg == FramePtr && TFI->hasFP(MF)) {
-    FrameIdx = MF.getFrameInfo()->getObjectIndexBegin();
-    return true;
-  }
-  return false;
+  // X86 defines assignCalleeSavedSpillSlots, which always returns true, so
+  // this function is neither used nor tested.
+  llvm_unreachable("Unused function on X86. Otherwise need a test case.");
 }
 
 void
@@ -473,7 +472,7 @@
 
   MachineInstr &MI = *II;
   MachineFunction &MF = *MI.getParent()->getParent();
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
   unsigned BasePtr;
 
@@ -488,6 +487,12 @@
   else
     BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
 
+  // For LEA64_32r with a 32-bit BasePtr (X32), use the full 64-bit register as
+  // the source operand: the semantics are the same and the destination is still
+  // 32 bits. This saves one byte per lea because the 0x67 prefix is avoided.
+  if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
+    BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false);
+
   // This must be part of a four operand memory reference.  Replace the
   // FrameIndex with base register with EBP.  Add an offset to the offset.
   MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
@@ -526,7 +531,7 @@
 }
 
 unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   return TFI->hasFP(MF) ? FramePtr : StackPtr;
 }
 

diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 74efd1f..cc0a7b2 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86REGISTERINFO_H
-#define X86REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
 
 #include "llvm/Target/TargetRegisterInfo.h"
 

diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 33c402b..311a717 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td

@@ -166,6 +166,7 @@
 def FP4 : X86Reg<"fp4", 0>;
 def FP5 : X86Reg<"fp5", 0>;
 def FP6 : X86Reg<"fp6", 0>;
+def FP7 : X86Reg<"fp7", 0>;
 
 // XMM Registers, used by the various SSE instruction set extensions.
 def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
@@ -234,22 +235,18 @@
   def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
   def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
 
-class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
-  let Aliases = A;
-}
-
 // Floating point stack registers. These don't map one-to-one to the FP
 // pseudo registers, but we still mark them as aliasing FP registers. That
 // way both kinds can be live without exceeding the stack depth. ST registers
 // are only live around inline assembly.
-def ST0 : STRegister<"st(0)", 0, []>,    DwarfRegNum<[33, 12, 11]>;
-def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>;
-def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>;
-def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>;
-def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>;
-def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>;
-def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>;
-def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>;
+def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>;
 
 // Floating-point status word
 def FPSW : X86Reg<"fpsw", 0>;
@@ -449,7 +446,7 @@
 }
 
 // AVX-512 vector/mask registers.
-def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512,
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512,
     (sequence "ZMM%u", 0, 31)>;
 
 // Scalar AVX-512 floating point registers.
@@ -463,13 +460,19 @@
 def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
                           256, (sequence "YMM%u", 0, 31)>;
 
-// The size of the all masked registers is 16 bit because we have only one
-// KMOVW istruction that can store this register in memory, and it writes 2 bytes
-def VK1     : RegisterClass<"X86", [i1],    16, (sequence "K%u", 0, 7)>;
-def VK8     : RegisterClass<"X86", [v8i1],  16, (add VK1)> {let Size = 16;}
+// Mask registers
+def VK1     : RegisterClass<"X86", [i1],    16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK2     : RegisterClass<"X86", [v2i1],  16, (add VK1)> {let Size = 16;}
+def VK4     : RegisterClass<"X86", [v4i1],  16, (add VK2)> {let Size = 16;}
+def VK8     : RegisterClass<"X86", [v8i1],  16, (add VK4)> {let Size = 16;}
 def VK16    : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
+def VK32    : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
+def VK64    : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
 
 def VK1WM   : RegisterClass<"X86", [i1],    16, (sub VK1, K0)> {let Size = 16;}
+def VK2WM   : RegisterClass<"X86", [v2i1],  16, (sub VK2, K0)> {let Size = 16;}
+def VK4WM   : RegisterClass<"X86", [v4i1],  16, (sub VK4, K0)> {let Size = 16;}
 def VK8WM   : RegisterClass<"X86", [v8i1],  16, (sub VK8, K0)> {let Size = 16;}
-def VK16WM  : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>;
-
+def VK16WM  : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>   {let Size = 16;}
+def VK32WM  : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;}
+def VK64WM  : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}

diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
deleted file mode 100644
index 0333056..0000000
--- a/lib/Target/X86/X86Relocations.h
+++ /dev/null

@@ -1,52 +0,0 @@
-//===-- X86Relocations.h - X86 Code Relocations -----------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the X86 target-specific relocation types.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef X86RELOCATIONS_H
-#define X86RELOCATIONS_H
-
-#include "llvm/CodeGen/MachineRelocation.h"
-
-namespace llvm {
-  namespace X86 {
-    /// RelocationType - An enum for the x86 relocation codes. Note that
-    /// the terminology here doesn't follow x86 convention - word means
-    /// 32-bit and dword means 64-bit. The relocations will be treated
-    /// by JIT or ObjectCode emitters, this is transparent to the x86 code
-    /// emitter but JIT and ObjectCode will treat them differently
-    enum RelocationType {
-      /// reloc_pcrel_word - PC relative relocation, add the relocated value to
-      /// the value already in memory, after we adjust it for where the PC is.
-      reloc_pcrel_word = 0,
-
-      /// reloc_picrel_word - PIC base relative relocation, add the relocated
-      /// value to the value already in memory, after we adjust it for where the
-      /// PIC base is.
-      reloc_picrel_word = 1,
-
-      /// reloc_absolute_word - absolute relocation, just add the relocated
-      /// value to the value already in memory.
-      reloc_absolute_word = 2,
-
-      /// reloc_absolute_word_sext - absolute relocation, just add the relocated
-      /// value to the value already in memory. In object files, it represents a
-      /// value which must be sign-extended when resolving the relocation.
-      reloc_absolute_word_sext = 3,
-
-      /// reloc_absolute_dword - absolute relocation, just add the relocated
-      /// value to the value already in memory.
-      reloc_absolute_dword = 4
-    };
-  }
-}
-
-#endif

diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 6966d61..73a3230 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td

@@ -48,13 +48,17 @@
 def HWPort7 : ProcResource<1>;
 
 // Many micro-ops are capable of issuing on multiple ports.
+def HWPort01  : ProcResGroup<[HWPort0, HWPort1]>;
 def HWPort23  : ProcResGroup<[HWPort2, HWPort3]>;
 def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
+def HWPort04  : ProcResGroup<[HWPort0, HWPort4]>;
 def HWPort05  : ProcResGroup<[HWPort0, HWPort5]>;
-def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>;
+def HWPort06  : ProcResGroup<[HWPort0, HWPort6]>;
 def HWPort15  : ProcResGroup<[HWPort1, HWPort5]>;
 def HWPort16  : ProcResGroup<[HWPort1, HWPort6]>;
+def HWPort56  : ProcResGroup<[HWPort5, HWPort6]>;
 def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
+def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>;
 def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
 
 // 60 Entry Unified Scheduler
@@ -125,6 +129,7 @@
 defm : HWWriteResPair<WriteFMul,   HWPort0, 5>;
 defm : HWWriteResPair<WriteFDiv,   HWPort0, 12>; // 10-14 cycles.
 defm : HWWriteResPair<WriteFRcp,   HWPort0, 5>;
+defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
 defm : HWWriteResPair<WriteFSqrt,  HWPort0, 15>;
 defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
 defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
@@ -261,4 +266,1882 @@
 def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
 def : WriteRes<WriteFence,  [HWPort23, HWPort4]>;
 def : WriteRes<WriteNop, []>;
+
+//================ Exceptions ================//
+
+//-- Specific Scheduling Models --//
+
+// Starting with P0.
+def WriteP0 : SchedWriteRes<[HWPort0]>;
+
+def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+
+def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+
+def WriteP01 : SchedWriteRes<[HWPort01]>;
+
+def Write2P01 : SchedWriteRes<[HWPort01]> {
+  let NumMicroOps = 2;
+}
+def Write3P01 : SchedWriteRes<[HWPort01]> {
+  let NumMicroOps = 3;
+}
+
+def WriteP015 : SchedWriteRes<[HWPort015]>;
+
+def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> {
+  let NumMicroOps = 2;
+}
+def WriteP06 : SchedWriteRes<[HWPort06]>;
+
+def Write2P06 : SchedWriteRes<[HWPort06]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+
+def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [3];
+}
+
+def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+  let NumMicroOps = 2;
+}
+
+def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+
+def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [2, 1];
+}
+
+def Write5P0156 : SchedWriteRes<[HWPort0156]> {
+  let NumMicroOps = 5;
+  let ResourceCycles = [5];
+}
+
+def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+  let Latency = 1;
+  let ResourceCycles = [1, 2, 1];
+}
+
+def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+  let Latency = 1;
+  let ResourceCycles = [2, 2, 1];
+}
+
+def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> {
+  let Latency = 1;
+  let ResourceCycles = [3, 2, 1];
+}
+
+// Starting with P1.
+def WriteP1 : SchedWriteRes<[HWPort1]>;
+
+def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
+  let NumMicroOps = 2;
+}
+def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> {
+  let Latency = 3;
+}
+def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> {
+  let Latency = 7;
+}
+
+def Write2P1 : SchedWriteRes<[HWPort1]> {
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> {
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def WriteP15 : SchedWriteRes<[HWPort15]>;
+def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> {
+  let Latency = 4;
+}
+
+def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+
+def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+
+def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+
+def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+
+// Starting with P2.
+def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> {
+  let Latency = 1;
+  let ResourceCycles = [2, 1];
+}
+
+// Starting with P5.
+def WriteP5 : SchedWriteRes<[HWPort5]>;
+def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+
+// Notation:
+// - r = register.
+// - mm = 64 bit mmx register.
+// - x = 128 bit xmm register.
+// - (x)mm = mmx or xmm register.
+// - y = 256 bit ymm register.
+// - v = any vector register.
+// - m = memory.
+
+//=== Integer Instructions ===//
+//-- Move instructions --//
+
+// MOV.
+// r16,m.
+def : InstRW<[WriteALULd], (instregex "MOV16rm")>;
+
+// MOVSX, MOVZX.
+// r,m.
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
+
+// CMOVcc.
+// r,r.
+def : InstRW<[Write2P0156_Lat2],
+      (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd],
+      (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
+
+// XCHG.
+// r,r.
+def WriteXCHG : SchedWriteRes<[HWPort0156]> {
+  let Latency = 2;
+  let ResourceCycles = [3];
+}
+
+def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+
+// r,m.
+def WriteXCHGrm : SchedWriteRes<[]> {
+  let Latency = 21;
+  let NumMicroOps = 8;
+}
+def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>;
+
+// XLAT.
+def WriteXLAT : SchedWriteRes<[]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteXLAT], (instregex "XLAT")>;
+
+// PUSH.
+// m.
+def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>;
+
+// PUSHF.
+def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> {
+  let NumMicroOps = 4;
+}
+def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>;
+
+// PUSHA.
+def WritePushA : SchedWriteRes<[]> {
+  let NumMicroOps = 19;
+}
+def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>;
+
+// POP.
+// m.
+def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>;
+
+// POPF.
+def WritePopF : SchedWriteRes<[]> {
+  let NumMicroOps = 9;
+}
+def : InstRW<[WritePopF], (instregex "POPF(16|32)")>;
+
+// POPA.
+def WritePopA : SchedWriteRes<[]> {
+  let NumMicroOps = 18;
+}
+def : InstRW<[WritePopA], (instregex "POPA(16|32)")>;
+
+// LAHF SAHF.
+def : InstRW<[WriteP06], (instregex "(S|L)AHF")>;
+
+// BSWAP.
+// r32.
+def WriteBSwap32 : SchedWriteRes<[HWPort15]>;
+def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>;
+
+// r64.
+def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>;
+
+// MOVBE.
+// r16,m16 / r64,m64.
+def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>;
+
+// r32, m32.
+def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>;
+
+// m16,r16.
+def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>;
+
+// m32,r32.
+def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>;
+
+// m64,r64.
+def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> {
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>;
+
+//-- Arithmetic instructions --//
+
+// ADD SUB.
+// m,r/i.
+def : InstRW<[Write2P0156_2P237_P4],
+              (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
+              "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>;
+
+// ADC SBB.
+// r,r/i.
+def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
+                           "(ADC|SBB)(16|32|64)ri8",
+                           "(ADC|SBB)64ri32",
+                           "(ADC|SBB)(8|16|32|64)rr_REV")>;
+
+// r,m.
+def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>;
+
+// m,r/i.
+def : InstRW<[Write3P0156_2P237_P4],
+             (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
+              "(ADC|SBB)(16|32|64)mi8",
+              "(ADC|SBB)64mi32")>;
+
+// INC DEC NOT NEG.
+// m.
+def : InstRW<[WriteP0156_2P237_P4],
+             (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
+              "(INC|DEC)64(16|32)m")>;
+
+// MUL IMUL.
+// r16.
+def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>;
+
+// m16.
+def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>;
+
+// r32.
+def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>;
+
+// m32.
+def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>;
+
+// r64.
+def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>;
+
+// m64.
+def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>;
+
+// r16,r16,i.
+def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
+
+// r16,m16,i.
+def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>;
+
+// MULX.
+// r32,r32,r32.
+def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteMulX32], (instregex "MULX32rr")>;
+
+// r32,r32,m32.
+def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>;
+
+// r64,r64,r64.
+def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMulX64], (instregex "MULX64rr")>;
+
+// r64,r64,m64.
+def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>;
+
+// DIV.
+// r8.
+def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 22;
+  let NumMicroOps = 9;
+}
+def : InstRW<[WriteDiv8], (instregex "DIV8r")>;
+
+// r16.
+def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 23;
+  let NumMicroOps = 10;
+}
+def : InstRW<[WriteDiv16], (instregex "DIV16r")>;
+
+// r32.
+def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 22;
+  let NumMicroOps = 10;
+}
+def : InstRW<[WriteDiv32], (instregex "DIV32r")>;
+
+// r64.
+def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 32;
+  let NumMicroOps = 36;
+}
+def : InstRW<[WriteDiv64], (instregex "DIV64r")>;
+
+// IDIV.
+// r8.
+def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 23;
+  let NumMicroOps = 9;
+}
+def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>;
+
+// r16.
+def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 23;
+  let NumMicroOps = 10;
+}
+def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>;
+
+// r32.
+def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 22;
+  let NumMicroOps = 9;
+}
+def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>;
+
+// r64.
+def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 39;
+  let NumMicroOps = 59;
+}
+def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>;
+
+//-- Logic instructions --//
+
+// AND OR XOR.
+// m,r/i.
+def : InstRW<[Write2P0156_2P237_P4],
+             (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
+              "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
+
+// SHR SHL SAR.
+// m,i.
+def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
+
+// r,cl.
+def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>;
+
+// m,cl.
+def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> {
+  let NumMicroOps = 6;
+  let ResourceCycles = [3, 2, 1];
+}
+def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>;
+
+// ROR ROL.
+// r,1.
+def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>;
+
+// m,i.
+def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+  let NumMicroOps = 5;
+  let ResourceCycles = [2, 2, 1];
+}
+def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>;
+
+// r,cl.
+def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>;
+
+// m,cl.
+// No processor resources are listed for this write; only the micro-op
+// count (6) is specified.
+def WriteRotateRMWCL : SchedWriteRes<[]> {
+  let NumMicroOps = 6;
+}
+def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>;
+
+// RCR RCL.
+// r,1.
+def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>;
+
+// m,1.
+def WriteRCm1 : SchedWriteRes<[]> {
+  let NumMicroOps = 6;
+}
+def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>;
+
+// r,i.
+def WriteRCri : SchedWriteRes<[HWPort0156]> {
+  let Latency = 6;
+  let NumMicroOps = 8;
+}
+def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
+
+// m,i.
+def WriteRCmi : SchedWriteRes<[]> {
+  let NumMicroOps = 11;
+}
+def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
+
+// SHRD SHLD.
+// r,r,i.
+def WriteShDrr : SchedWriteRes<[HWPort1]> {
+  let Latency = 3;
+}
+def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>;
+
+// m,r,i.
+def WriteShDmr : SchedWriteRes<[]> {
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>;
+
+// r,r,cl.
+def WriteShlDCL : SchedWriteRes<[HWPort0156]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>;
+
+// r,r,cl.
+def WriteShrDCL : SchedWriteRes<[HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>;
+
+// m,r,cl.
+def WriteShDmrCL : SchedWriteRes<[]> {
+  let NumMicroOps = 7;
+}
+def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>;
+
+// BT.
+// r,r/i.
+def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
+
+// m,r.
+def WriteBTmr : SchedWriteRes<[]> {
+  let NumMicroOps = 10;
+}
+def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>;
+
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
+
+// BTR BTS BTC.
+// r,r,i.
+def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
+
+// m,r.
+def WriteBTRSCmr : SchedWriteRes<[]> {
+  let NumMicroOps = 11;
+}
+def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
+
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>;
+
+// BSF BSR.
+// r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>;
+
+// SETcc.
+// r.
+def : InstRW<[WriteShift],
+             (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
+// m.
+def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteSetCCm],
+             (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
+
+// CLD STD.
+// Direction-flag set/clear: three micro-ops on HWPort15 and HWPort6.
+def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteCldStd],
+             (instregex "STD", "CLD")>;
+
+// LZCNT TZCNT.
+// The alternation covers only the leading letter, so the patterns expand to
+// "LZCNT..." and "TZCNT...". (Writing "(L|TZCNT)" would match "L16rr" and
+// "TZCNT16rr", leaving LZCNT without an entry.)
+// r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "(L|T)ZCNT(16|32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|T)ZCNT(16|32|64)rm")>;
+
+// ANDN.
+// r,r.
+def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>;
+
+// BLSI BLSMSK BLSR.
+// r,r.
+def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>;
+// r,m.
+def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>;
+
+// BEXTR.
+// r,r,r.
+def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>;
+// r,m,r.
+def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>;
+
+// BZHI.
+// r,r,r.
+def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>;
+
+// PDEP PEXT.
+// r,r,r.
+def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
+
+//-- Control transfer instructions --//
+
+// J(E|R)CXZ.
+def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
+
+// LOOP.
+// No processor resources are listed; only the micro-op count is specified.
+// NOTE(review): if instregex matches by prefix, "LOOP" also covers LOOPE and
+// LOOPNE, which are assigned separately below -- confirm which entry wins.
+def WriteLOOP : SchedWriteRes<[]> {
+  let NumMicroOps = 7;
+}
+def : InstRW<[WriteLOOP], (instregex "LOOP")>;
+
+// LOOP(N)E
+// No processor resources are listed; only the micro-op count is specified.
+def WriteLOOPE : SchedWriteRes<[]> {
+  let NumMicroOps = 11;
+}
+def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>;
+
+// CALL.
+// r.
+// Register-indirect call: 3 micro-ops on HWPort237, HWPort4 and HWPort6.
+// NOTE(review): only the 16- and 32-bit forms are listed here; confirm
+// whether a 64-bit form should be covered as well.
+def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>;
+
+// m.
+// Memory-indirect call: 4 micro-ops, with HWPort237 used for two cycles.
+def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>;
+
+// RET.
+def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>;
+
+// i.
+def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
+
+// BOUND.
+// r,m.
+def WriteBOUND : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>;
+
+// INTO.
+def WriteINTO : SchedWriteRes<[]> {
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteINTO], (instregex "INTO")>;
+
+//-- String instructions --//
+
+// LODSB/W.
+def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>;
+
+// STOS.
+def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>;
+
+// MOVS.
+def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2, 1, 2];
+}
+def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>;
+
+// SCAS.
+def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>;
+
+// CMPS.
+def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2, 3];
+}
+def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
+
+//-- Synchronization instructions --//
+
+// XADD.
+def WriteXADD : SchedWriteRes<[]> {
+  let NumMicroOps = 5;
+}
+def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>;
+
+// CMPXCHG.
+def WriteCMPXCHG : SchedWriteRes<[]> {
+  let NumMicroOps = 6;
+}
+def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
+
+// CMPXCHG8B.
+def WriteCMPXCHG8B : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
+
+// CMPXCHG16B.
+def WriteCMPXCHG16B : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>;
+
+//-- Other --//
+
+// PAUSE.
+// Five micro-ops: one cycle on HWPort05 and three on HWPort6.
+def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> {
+  let ResourceCycles = [1, 3];
+  let NumMicroOps = 5;
+}
+def : InstRW<[WritePAUSE],
+             (instregex "PAUSE")>;
+
+// LEAVE.
+def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>;
+
+// XGETBV.
+def WriteXGETBV : SchedWriteRes<[]> {
+  let NumMicroOps = 8;
+}
+def : InstRW<[WriteXGETBV], (instregex "XGETBV")>;
+
+// RDTSC.
+def WriteRDTSC : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteRDTSC], (instregex "RDTSC")>;
+
+// RDPMC.
+def WriteRDPMC : SchedWriteRes<[]> {
+  let NumMicroOps = 34;
+}
+def : InstRW<[WriteRDPMC], (instregex "RDPMC")>;
+
+// RDRAND.
+def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
+  let NumMicroOps = 17;
+  let ResourceCycles = [1, 16];
+}
+def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+// FLD.
+// r.
+def : InstRW<[WriteP01], (instregex "LD_Frr")>;
+
+// m80.
+// 4 micro-ops, two cycles each on HWPort01 and HWPort23, latency 4.
+def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 2];
+}
+def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>;
+
+// FBLD.
+// m80.
+def WriteFBLD : SchedWriteRes<[]> {
+  let Latency = 47;
+  let NumMicroOps = 43;
+}
+def : InstRW<[WriteFBLD], (instregex "FBLDm")>;
+
+// FST(P).
+// r.
+def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>;
+
+// m80.
+def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> {
+  let NumMicroOps = 7;
+  let ResourceCycles = [3, 2, 2];
+}
+def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>;
+
+// FBSTP.
+// m80.
+def WriteFBSTP : SchedWriteRes<[]> {
+  let NumMicroOps = 226;
+}
+def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>;
+
+// FXCHG.
+def : InstRW<[WriteNop], (instregex "XCH_F")>;
+
+// FILD.
+def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>;
+
+// FIST(P) FISTTP.
+def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>;
+
+// FLDZ.
+def : InstRW<[WriteP01], (instregex "LD_F0")>;
+
+// FLD1.
+def : InstRW<[Write2P01], (instregex "LD_F1")>;
+
+// FLDPI FLDL2E etc.
+// Each pattern must be its own comma-separated argument: TableGen joins
+// adjacent string literals into a single string, which would turn the last
+// two patterns into one regex that matches no instruction.
+def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)", "FLDL(G|N)2")>;
+
+// FCMOVcc.
+def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>;
+
+// FNSTSW.
+// AX.
+def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>;
+
+// m16.
+def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>;
+
+// FLDCW.
+def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>;
+
+// FNSTCW.
+def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
+  let NumMicroOps = 3;
+}
+def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>;
+
+// FINCSTP FDECSTP.
+def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>;
+
+// FFREE.
+def : InstRW<[WriteP01], (instregex "FFREE")>;
+
+// FNSAVE.
+def WriteFNSAVE : SchedWriteRes<[]> {
+  let NumMicroOps = 147;
+}
+def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>;
+
+// FRSTOR.
+def WriteFRSTOR : SchedWriteRes<[]> {
+  let NumMicroOps = 90;
+}
+def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>;
+
+//-- Arithmetic instructions --//
+
+// FABS.
+def : InstRW<[WriteP0], (instregex "ABS_F")>;
+
+// FCHS.
+def : InstRW<[WriteP0], (instregex "CHS_F")>;
+
+// FCOM(P) FUCOM(P).
+// r.
+def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
+                         "UCOM_FPr")>;
+// m.
+def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
+
+// FCOMI(P) FUCOMI(P).
+// r.
+def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
+                           "UCOM_FIPr")>;
+
+// FICOM(P).
+def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
+
+// FTST.
+def : InstRW<[WriteP1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[Write2P1], (instregex "FXAM")>;
+
+// FPREM.
+def WriteFPREM : SchedWriteRes<[]> {
+  let Latency = 19;
+  let NumMicroOps = 28;
+}
+def : InstRW<[WriteFPREM], (instregex "FPREM")>;
+
+// FPREM1.
+def WriteFPREM1 : SchedWriteRes<[]> {
+  let Latency = 27;
+  let NumMicroOps = 41;
+}
+def : InstRW<[WriteFPREM1], (instregex "FPREM1")>;
+
+// FRNDINT.
+def WriteFRNDINT : SchedWriteRes<[]> {
+  let Latency = 11;
+  let NumMicroOps = 17;
+}
+def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>;
+
+//-- Math instructions --//
+
+// FSCALE.
+def WriteFSCALE : SchedWriteRes<[]> {
+  let Latency = 75; // 49-125
+  let NumMicroOps = 50; // 25-75
+}
+def : InstRW<[WriteFSCALE], (instregex "FSCALE")>;
+
+// FXTRACT.
+def WriteFXTRACT : SchedWriteRes<[]> {
+  let Latency = 15;
+  let NumMicroOps = 17;
+}
+def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>;
+
+//-- Other instructions --//
+
+// FNOP.
+def : InstRW<[WriteP01], (instregex "FNOP")>;
+
+// WAIT.
+def : InstRW<[Write2P01], (instregex "WAIT")>;
+
+// FNCLEX.
+def : InstRW<[Write5P0156], (instregex "FNCLEX")>;
+
+// FNINIT.
+def WriteFNINIT : SchedWriteRes<[]> {
+  let NumMicroOps = 26;
+}
+def : InstRW<[WriteFNINIT], (instregex "FNINIT")>;
+
+//=== Integer MMX and XMM Instructions ===//
+//-- Move instructions --//
+
+// MOVD.
+// r32/64 <- (x)mm.
+def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
+                         "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
+
+// (x)mm <- r32/64.
+def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
+                         "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
+
+// MOVQ.
+// r64 <- (x)mm.
+def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>;
+
+// (x)mm <- r64.
+def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
+
+// (x)mm <- (x)mm.
+def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>;
+
+// (V)MOVDQA/U.
+// x <- x.
+def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
+                           "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV",
+                           "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
+
+// MOVDQ2Q.
+def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>;
+
+// MOVQ2DQ.
+def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>;
+
+
+// PACKSSWB/DW.
+// mm <- mm.
+def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [3];
+}
+def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr",
+                                  "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
+
+// mm <- m64.
+def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 3];
+}
+def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm",
+                                  "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+
+// VPMOVSX/ZX BW BD BQ DW DQ.
+// y <- x.
+// Single micro-op on HWPort5 with latency 3.
+// NOTE(review): the header above lists BD, but the regex alternation below
+// only has BW|BQ|DW|DQ -- confirm which element-size suffixes are intended.
+def WriteVPMOVSX : SchedWriteRes<[HWPort5]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+}
+def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>;
+
+// PBLENDW.
+// x,x,i / v,v,v,i
+def WritePBLENDWr : SchedWriteRes<[HWPort5]>;
+def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>;
+
+// x,m,i / v,v,m,i
+def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> {
+  let NumMicroOps = 2;
+  let Latency = 4;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>;
+
+// VPBLENDD.
+// v,v,v,i.
+def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>;
+def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>;
+
+// v,v,m,i
+def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> {
+  let NumMicroOps = 2;
+  let Latency = 4;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>;
+
+// MASKMOVQ.
+// Four micro-ops: one cycle each on HWPort0 and HWPort4, two on HWPort23;
+// latency 13.
+def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> {
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 2];
+  let Latency = 13;
+}
+def : InstRW<[WriteMASKMOVQ],
+             (instregex "MMX_MASKMOVQ(64)?")>;
+
+// MASKMOVDQU.
+def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> {
+  let Latency = 14;
+  let NumMicroOps = 10;
+  let ResourceCycles = [4, 2, 4];
+}
+def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>;
+
+// VPMASKMOV D/Q.
+// v,v,m.
+def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVPMASKMOVr, ReadAfterLd],
+                               (instregex "VPMASKMOV(D|Q)(Y?)rm")>;
+
+// m, v,v.
+def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+  let Latency = 13;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+
+// PMOVMSKB.
+def WritePMOVMSKB : SchedWriteRes<[HWPort0]> {
+  let Latency = 3;
+}
+def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>;
+
+// PEXTR B/W/D/Q.
+// r32,x,i.
+def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
+
+// m8,x,i.
+def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> {
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
+
+// VPBROADCAST B/W.
+// x, m8/16.
+def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd],
+                                     (instregex "VPBROADCAST(B|W)rm")>;
+
+// y, m8/16
+def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd],
+                                     (instregex "VPBROADCAST(B|W)Yrm")>;
+
+// VPGATHERDD.
+// x.
+def WriteVPGATHERDD128 : SchedWriteRes<[]> {
+  let NumMicroOps = 20;
+}
+def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>;
+
+// y.
+def WriteVPGATHERDD256 : SchedWriteRes<[]> {
+  let NumMicroOps = 34;
+}
+def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>;
+
+// VPGATHERQD.
+// x.
+def WriteVPGATHERQD128 : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>;
+
+// y.
+def WriteVPGATHERQD256 : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>;
+
+// VPGATHERDQ.
+// x.
+def WriteVPGATHERDQ128 : SchedWriteRes<[]> {
+  let NumMicroOps = 12;
+}
+def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>;
+
+// y.
+def WriteVPGATHERDQ256 : SchedWriteRes<[]> {
+  let NumMicroOps = 20;
+}
+def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>;
+
+// VPGATHERQQ.
+// x.
+def WriteVPGATHERQQ128 : SchedWriteRes<[]> {
+  let NumMicroOps = 14;
+}
+def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>;
+
+// y.
+def WriteVPGATHERQQ256 : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>;
+
+//-- Arithmetic instructions --//
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64",
+                               "MMX_PHADDSWrr64",
+                               "MMX_PHSUB(W|D)rr64",
+                               "MMX_PHSUBSWrr64",
+                               "(V?)PH(ADD|SUB)(W|D)(Y?)rr",
+                               "(V?)PH(ADD|SUB)SWrr(256)?")>;
+
+// v <- v,m.
+def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WritePHADDSUBm, ReadAfterLd],
+                              (instregex "MMX_PHADD(W?)rm64",
+                               "MMX_PHADDSWrm64",
+                               "MMX_PHSUB(W|D)rm64",
+                               "MMX_PHSUBSWrm64",
+                               "(V?)PH(ADD|SUB)(W|D)(Y?)rm",
+                               "(V?)PH(ADD|SUB)SWrm(128|256)?")>;
+
+// PCMPGTQ.
+// v <- v,v.
+def WritePCMPGTQr : SchedWriteRes<[HWPort0]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+}
+def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+
+// v <- v,m.
+def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>;
+
+// PMULLD.
+// x,x / y,y,y.
+def WritePMULLDr : SchedWriteRes<[HWPort0]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>;
+
+// x,m / y,y,m.
+def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>;
+
+//-- Logic instructions --//
+
+// PTEST.
+// v,v.
+def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
+
+// v,m.
+// Memory-source PTEST: 3 micro-ops, one cycle each on HWPort0, HWPort5 and
+// HWPort23, latency 6. The "rm" patterns are bound to this write rather
+// than to the register-only WritePTESTr.
+def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WritePTESTm], (instregex "(V?)PTEST(Y?)rm")>;
+
+// PSLL,PSRL,PSRA W/D/Q.
+// x,x / v,v,x.
+def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>;
+
+// PSLL,PSRL DQ.
+def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>;
+
+//-- Other --//
+
+// EMMS.
+def WriteEMMS : SchedWriteRes<[]> {
+  let Latency = 13;
+  let NumMicroOps = 31;
+}
+def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+//-- Move instructions --//
+
+// MOVMSKP S/D.
+// r32 <- x.
+def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> {
+  let Latency = 3;
+}
+def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>;
+
+// r32 <- y.
+def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> {
+  let Latency = 2;
+}
+def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>;
+
+// VPERM2F128.
+def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>;
+def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>;
+
+// BLENDVP S/D.
+def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>;
+def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
+
+// VBROADCASTF128.
+def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>;
+
+// EXTRACTPS.
+// r32,x,i.
+def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+
+// m32,x,i.
+def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+
+// VEXTRACTF128.
+// x,y,i.
+def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>;
+
+// m128,y,i.
+def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>;
+
+// VINSERTF128.
+// y,y,x,i.
+def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>;
+
+// y,y,m128,i.
+// Memory-source VINSERTF128: 2 micro-ops, one cycle each on HWPort015 and
+// HWPort23, latency 4. The "rm" pattern is bound to this write rather than
+// to WriteFShuffle256, which is what the register form uses.
+def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVINSERTF128m, ReadAfterLd], (instregex "VINSERTF128rm")>;
+
+// VMASKMOVP S/D.
+// v,v,m.
+def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>;
+
+// m128,x,x.
+def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+  let Latency = 13;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>;
+
+// m256,y,y.
+def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+  let Latency = 14;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
+
+// VGATHERDPS.
+// x.
+def WriteVGATHERDPS128 : SchedWriteRes<[]> {
+  let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>;
+
+// y.
+def WriteVGATHERDPS256 : SchedWriteRes<[]> {
+  let NumMicroOps = 34;
+}
+def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>;
+
+// VGATHERQPS.
+// x.
+def WriteVGATHERQPS128 : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>;
+
+// y.
+def WriteVGATHERQPS256 : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>;
+
+// VGATHERDPD.
+// x.
+def WriteVGATHERDPD128 : SchedWriteRes<[]> {
+  let NumMicroOps = 12;
+}
+def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>;
+
+// y.
+def WriteVGATHERDPD256 : SchedWriteRes<[]> {
+  let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>;
+
+// VGATHERQPD.
+// x.
+def WriteVGATHERQPD128 : SchedWriteRes<[]> {
+  let NumMicroOps = 14;
+}
+def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>;
+
+// y.
+def WriteVGATHERQPD256 : SchedWriteRes<[]> {
+  let NumMicroOps = 22;
+}
+def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>;
+
+//-- Conversion instructions --//
+
+// CVTPD2PS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>;
+
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+
+// x,y.
+def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+
+// x,m256.
+def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+
+// CVTSD2SS.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+
+// x,m64.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+
+// CVTPS2PD.
+// x,x.
+def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+
+// x,m64.
+// y,m128.
+def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+
+// y,x.
+def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+
+// CVTSS2SD.
+// x,x.
+def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+
+// x,m32.
+def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
+
+// y,x.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
+
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// x,y.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
+
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+// CVTSI2SS.
+// x,r32.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
+
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SD(64)?rr")>;
+
+// CVTSD2SI.
+// r32/64
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32/64,m64.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
+// m,v,i.
+def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
+
+// VCVTPH2PS.
+// v,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+
+// x,m / v,v,m.
+def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+
+// MUL SS/SD PS/PD.
+// x,x / v,v,v.
+def WriteMULr : SchedWriteRes<[HWPort01]> {
+  let Latency = 5;
+}
+def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+
+// x,m / v,v,m.
+def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 9; // 5 cycles (register form) + 4 cycles for the folded load.
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+
+// VDIVPS.
+// y,y,y.
+def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 19; // 18-21 cycles.
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+
+// y,y,m256.
+def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 23; // 18-21 + 4 cycles.
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+
+// VDIVPD.
+// y,y,y.
+def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 27; // 19-35 cycles.
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>;
+
+// y,y,m256.
+def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 31; // 19-35 + 4 cycles.
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>;
+
+// VRCPPS.
+// y,y.
+def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+
+// y,m256.
+def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 11;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>;
+
+// ROUND SS/SD PS/PD.
+// v,v,i.
+def WriteROUNDr : SchedWriteRes<[HWPort1]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+
+// v,m,i.
+def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> {
+  let Latency = 10;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+  let Latency = 14;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>;
+
+// x,m,i / v,v,m,i.
+def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> {
+  let Latency = 18;
+  let NumMicroOps = 6;
+  let ResourceCycles = [2, 1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>;
+
+// DPPD.
+// x,x,i.
+def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+  let Latency = 9;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>;
+
+// x,m,i.
+def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> {
+  let Latency = 13;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>;
+
+// VFMADD.
+// v,v,v.
+def WriteFMADDr : SchedWriteRes<[HWPort01]> {
+  let Latency = 5;
+  let NumMicroOps = 1;
+}
+def : InstRW<[WriteFMADDr],
+    (instregex
+    // 3p forms.
+    "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?",
+    // 3s forms (every operand-order suffix carries the leading 'r').
+    "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r",
+    // 4s/4s_int forms.
+    "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
+    // 4p forms.
+    "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
+
+// v,v,m.
+def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteFMADDm],
+    (instregex
+    // 3p forms.
+    "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?",
+    // 3s forms (every operand-order suffix carries the leading 'r').
+    "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m",
+    // 4s/4s_int forms.
+    "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
+    // 4p forms.
+    "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
+
+//-- Math instructions --//
+
+// VSQRTPS.
+// y,y.
+def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 19;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
+
+// y,m256.
+def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 23;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>;
+
+// VSQRTPD.
+// y,y.
+def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 28;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
+
+// y,m256.
+def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 32;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>;
+
+// RSQRT SS/PS.
+// x,x.
+def WriteRSQRTr : SchedWriteRes<[HWPort0]> {
+  let Latency = 5;
+}
+def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>;
+
+// x,m128.
+def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>;
+
+// RSQRTPS 256.
+// y,y.
+def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+
+// y,m256.
+def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 11;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>;
+
+//-- Logic instructions --//
+
+// AND, ANDN, OR, XOR PS/PD.
+// x,x / v,v,v.
+def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
+// x,m / v,v,m.
+def : InstRW<[WriteP5Ld, ReadAfterLd],
+                         (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
+
+//-- Other instructions --//
+
+// VZEROUPPER.
+def WriteVZEROUPPER : SchedWriteRes<[]> {
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>;
+
+// VZEROALL.
+def WriteVZEROALL : SchedWriteRes<[]> {
+  let NumMicroOps = 12;
+}
+def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>;
+
+// LDMXCSR.
+def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>;
+
+// STMXCSR.
+def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>;
+
 } // SchedModel

diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index 83f0534..eca65c2 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td

@@ -117,6 +117,7 @@
 defm : SBWriteResPair<WriteFMul,   SBPort0, 5>;
 defm : SBWriteResPair<WriteFDiv,   SBPort0, 12>; // 10-14 cycles.
 defm : SBWriteResPair<WriteFRcp,   SBPort0, 5>;
+defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
 defm : SBWriteResPair<WriteFSqrt,  SBPort0, 15>;
 defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
 defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;

diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 25c5a6b..a261356 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td

@@ -63,12 +63,13 @@
 defm WriteJump : X86SchedWritePair;
 
 // Floating point. This covers both scalar and vector operations.
-defm WriteFAdd  : X86SchedWritePair; // Floating point add/sub/compare.
-defm WriteFMul  : X86SchedWritePair; // Floating point multiplication.
-defm WriteFDiv  : X86SchedWritePair; // Floating point division.
-defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
-defm WriteFRcp  : X86SchedWritePair; // Floating point reciprocal.
-defm WriteFMA   : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFAdd   : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul   : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv   : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt  : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp   : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFMA    : X86SchedWritePair; // Fused Multiply Add.
 defm WriteFShuffle  : X86SchedWritePair; // Floating point vector shuffles.
 defm WriteFBlend  : X86SchedWritePair; // Floating point vector blends.
 defm WriteFVarBlend  : X86SchedWritePair; // Fp vector variable blends.
@@ -314,6 +315,11 @@
 def IIC_SSE_SQRTSD_RR : InstrItinClass;
 def IIC_SSE_SQRTSD_RM : InstrItinClass;
 
+def IIC_SSE_RSQRTPS_RR : InstrItinClass;
+def IIC_SSE_RSQRTPS_RM : InstrItinClass;
+def IIC_SSE_RSQRTSS_RR : InstrItinClass;
+def IIC_SSE_RSQRTSS_RM : InstrItinClass;
+
 def IIC_SSE_RCPP_RR : InstrItinClass;
 def IIC_SSE_RCPP_RM : InstrItinClass;
 def IIC_SSE_RCPS_RR : InstrItinClass;
@@ -633,9 +639,12 @@
   let MicroOpBufferSize = 32;
   let LoadLatency = 4;
   let HighLatency = 10;
+  let PostRAScheduler = 0;
 }
 
 include "X86ScheduleAtom.td"
 include "X86SchedSandyBridge.td"
 include "X86SchedHaswell.td"
 include "X86ScheduleSLM.td"
+include "X86ScheduleBtVer2.td"
+

diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 3256ee7..4c559c9 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td

@@ -224,6 +224,11 @@
   InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
   InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
 
+  InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
+
   InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
   InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
   InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
@@ -538,6 +543,7 @@
   // On the Atom, the throughput for taken branches is 2 cycles. For small
   // simple loops, expand by a small factor to hide the backedge cost.
   let LoopMicroOpBufferSize = 10;
+  let PostRAScheduler = 1;
 
   let Itineraries = AtomItineraries;
 }

diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
new file mode 100644
index 0000000..ce1ece3
--- /dev/null
+++ b/lib/Target/X86/X86ScheduleBtVer2.td

@@ -0,0 +1,341 @@
+//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AMD btver2 (Jaguar) to support
+// instruction scheduling and other instruction cost heuristics. Based off AMD Software
+// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix.
+//
+//===----------------------------------------------------------------------===//
+
+def BtVer2Model : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and btver2 can
+  // decode 2 instructions per cycle.
+  let IssueWidth = 2;
+  let MicroOpBufferSize = 64; // Retire Control Unit
+  let LoadLatency = 5; // FPU latency (worst case; integer loads are 3 cycles)
+  let HighLatency = 25;
+  let MispredictPenalty = 14; // Minimum branch misdirection penalty
+  let PostRAScheduler = 1;
+
+  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+}
+
+let SchedModel = BtVer2Model in {
+
+// Jaguar can issue up to 6 micro-ops in one cycle
+def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam)
+def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
+def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
+def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
+def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
+def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
+
+// Any pipe - FIXME we need this until we can discriminate between int/fpu load/store/moves properly
+def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>;
+
+// Integer Pipe Scheduler
+def JALU01 : ProcResGroup<[JALU0, JALU1]> {
+  let BufferSize=20;
+}
+
+// AGU Pipe Scheduler
+def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
+  let BufferSize=12;
+}
+
+// Fpu Pipe Scheduler
+def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
+  let BufferSize=18;
+}
+
+def JDiv    : ProcResource<1>; // integer division
+def JMul    : ProcResource<1>; // integer multiplication
+def JVALU0  : ProcResource<1>; // vector integer
+def JVALU1  : ProcResource<1>; // vector integer
+def JVIMUL  : ProcResource<1>; // vector integer multiplication
+def JSTC    : ProcResource<1>; // vector store/convert
+def JFPM    : ProcResource<1>; // FP multiplication
+def JFPA    : ProcResource<1>; // FP addition
+
+// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
+     let Latency = !add(Lat, 3);
+  }
+}
+
+multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> {
+     let Latency = !add(Lat, 5);
+  }
+}
+
+// A folded store needs a cycle on the SAGU for the store data.
+def : WriteRes<WriteRMW, [JSAGU]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteALU,   JALU01, 1>;
+defm : JWriteResIntPair<WriteIMul,  JALU1,  3>;
+
+def  : WriteRes<WriteIMulH, [JALU1]> {
+  let Latency = 6;
+  let ResourceCycles = [4];
+}
+
+// FIXME 8/16 bit divisions
+def : WriteRes<WriteIDiv, [JALU1, JDiv]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> {
+  let Latency = 41;
+  let ResourceCycles = [1, 1, 25];
+}
+
+// This is for simple LEAs with one or two input operands.
+// FIXME: SAGU 3-operand LEA
+def : WriteRes<WriteLEA, [JALU01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteShift, JALU01, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+// FIXME: Split x86 and SSE load/store/moves
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad,  [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteStore, [JSAGU]>;
+def : WriteRes<WriteMove,  [JAny]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero,  []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResIntPair<WriteJump,  JALU01, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
+// FIXME: Double precision latencies
+// FIXME: SS vs PS latencies
+// FIXME: ymm latencies
+////////////////////////////////////////////////////////////////////////////////
+
+defm : JWriteResFpuPair<WriteFAdd,        JFPU0,  3>;
+defm : JWriteResFpuPair<WriteFMul,        JFPU1,  2>;
+defm : JWriteResFpuPair<WriteFRcp,        JFPU1,  2>;
+defm : JWriteResFpuPair<WriteFRsqrt,      JFPU1,  2>;
+defm : JWriteResFpuPair<WriteFShuffle,   JFPU01,  1>;
+defm : JWriteResFpuPair<WriteFBlend,     JFPU01,  1>;
+defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
+
+def : WriteRes<WriteFSqrt, [JFPU1, JFPM]> {
+  let Latency = 21;
+  let ResourceCycles = [1, 21]; // Register form: no load, so no JLAGU cycle.
+}
+def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> {
+  let Latency = 26;
+  let ResourceCycles = [1, 1, 21];
+}
+
+def : WriteRes<WriteFDiv, [JFPU1, JFPM]> {
+  let Latency = 19;
+  let ResourceCycles = [1, 19]; // Register form: no load, so no JLAGU cycle.
+}
+def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> {
+  let Latency = 24;
+  let ResourceCycles = [1, 1, 19];
+}
+
+// FIXME: integer pipes
+defm : JWriteResFpuPair<WriteCvtF2I,    JFPU1,  3>; // Float -> Integer.
+defm : JWriteResFpuPair<WriteCvtI2F,    JFPU1,  3>; // Integer -> Float.
+defm : JWriteResFpuPair<WriteCvtF2F,    JFPU1,  3>; // Float -> Float size conversion.
+
+def : WriteRes<WriteFVarBlend, [JFPU01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 2];
+}
+
+// Vector integer operations.
+defm : JWriteResFpuPair<WriteVecALU,   JFPU01,  1>;
+defm : JWriteResFpuPair<WriteVecShift, JFPU01,  1>;
+defm : JWriteResFpuPair<WriteVecIMul,  JFPU0,   2>;
+defm : JWriteResFpuPair<WriteShuffle,  JFPU01,  1>;
+defm : JWriteResFpuPair<WriteBlend,    JFPU01,  1>;
+defm : JWriteResFpuPair<WriteVecLogic, JFPU01,  1>;
+defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>;
+
+def : WriteRes<WriteVarBlend, [JFPU01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 2];
+}
+
+// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2?
+def : WriteRes<WriteVarVecShift, [JFPU01]> {
+  let Latency = 1;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteMPSAD, [JFPU0]> {
+  let Latency = 3;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> {
+  let Latency = 8;
+  let ResourceCycles = [1, 2];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+// FIXME: approximate latencies + pipe dependencies
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WritePCmpIStrM, [JFPU01]> {
+  let Latency = 7;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> {
+  let Latency = 12;
+  let ResourceCycles = [1, 2];
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [JFPU01]> {
+  let Latency = 13;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> {
+  let Latency = 18;
+  let ResourceCycles = [1, 5];
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [JFPU01]> {
+  let Latency = 6;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> {
+  let Latency = 11;
+  let ResourceCycles = [1, 2];
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [JFPU01]> {
+  let Latency = 13;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> {
+  let Latency = 18;
+  let ResourceCycles = [1, 5];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// AES Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> {
+  let Latency = 3;
+  let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> {
+  let Latency = 8;
+  let ResourceCycles = [1, 1, 1];
+}
+
+def : WriteRes<WriteAESIMC, [JVIMUL]> {
+  let Latency = 2;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+def : WriteRes<WriteAESKeyGen, [JVIMUL]> {
+  let Latency = 2;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteCLMul, [JVIMUL]> {
+  let Latency = 2;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+// FIXME: pipe for system/microcode?
+def : WriteRes<WriteSystem,     [JAny]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; }
+def : WriteRes<WriteFence,  [JSAGU]>;
+def : WriteRes<WriteNop, []>;
+} // SchedModel
+

diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index 823d101..f95d4fa 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td

@@ -19,6 +19,7 @@
   let MicroOpBufferSize = 32; // Based on the reorder buffer.
   let LoadLatency = 3;
   let MispredictPenalty = 10;
+  let PostRAScheduler = 1;
 
   // For small loops, expand by a small factor to hide the backedge cost.
   let LoopMicroOpBufferSize = 10;
@@ -100,6 +101,7 @@
 // Scalar and vector floating point.
 defm : SMWriteResPair<WriteFAdd,   FPC_RSV1, 3>;
 defm : SMWriteResPair<WriteFRcp,   FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
 defm : SMWriteResPair<WriteFSqrt,  FPC_RSV0, 15>;
 defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
 defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;

diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index a83dd9b..821044f 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp

@@ -29,6 +29,26 @@
 
 X86SelectionDAGInfo::~X86SelectionDAGInfo() {}
 
+bool X86SelectionDAGInfo::isBaseRegConflictPossible(
+    SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const {
+  // TRI->hasBasePointer() cannot be used until *after* all basic blocks are
+  // selected: legalization may still add stack temporaries with large
+  // alignment.  So report a possible conflict whenever the function has
+  // dynamic stack adjustments (hopefully rare) and a clobber is the base reg.
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  const bool HasDynamicSPAdjust =
+      FrameInfo->hasVarSizedObjects() || FrameInfo->hasInlineAsmWithSPAdjust();
+  if (!HasDynamicSPAdjust)
+    return false;
+  const auto *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
+  const unsigned BasePtrReg = RegInfo->getBaseRegister();
+  for (unsigned ClobberedReg : ClobberSet)
+    if (ClobberedReg == BasePtrReg)
+      return true;
+  return false;
+}
+
 SDValue
 X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                                              SDValue Chain,
@@ -39,6 +59,13 @@
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>();
 
+#ifndef NDEBUG
+  // If the base register might conflict with our physical registers, bail out.
+  unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                           X86::ECX, X86::EAX, X86::EDI};
+  assert(!isBaseRegConflictPossible(DAG, ClobberSet));
+#endif
+
   // If to a segment-relative address space, use the default lowering.
   if (DstPtrInfo.getAddrSpace() >= 256)
     return SDValue();
@@ -201,12 +228,10 @@
       SrcPtrInfo.getAddrSpace() >= 256)
     return SDValue();
 
-  // ESI might be used as a base pointer, in that case we can't simply overwrite
-  // the register.  Fall back to generic code.
-  const X86RegisterInfo *TRI =
-      static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo());
-  if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
-      TRI->getBaseRegister() == X86::ESI)
+  // If the base register might conflict with our physical registers, bail out.
+  unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
+                           X86::ECX, X86::ESI, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
     return SDValue();
 
   MVT AVT;

diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index c12555a..eb7e0ed 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86SELECTIONDAGINFO_H
-#define X86SELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 
@@ -23,6 +23,11 @@
 class X86Subtarget;
 
 class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
+  /// Returns true if it is possible for the base register to conflict with the
+  /// given set of clobbers for a memory intrinsic.
+  bool isBaseRegConflictPossible(SelectionDAG &DAG,
+                                 ArrayRef<unsigned> ClobberSet) const;
+
 public:
   explicit X86SelectionDAGInfo(const DataLayout &DL);
   ~X86SelectionDAGInfo();

diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 79b7e68..9d877c9 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp

@@ -13,6 +13,7 @@
 
 #include "X86Subtarget.h"
 #include "X86InstrInfo.h"
+#include "X86TargetMachine.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
@@ -67,12 +68,7 @@
   if (GV->hasDLLImportStorageClass())
     return X86II::MO_DLLIMPORT;
 
-  // Determine whether this is a reference to a definition or a declaration.
-  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
-  // load from stub.
-  bool isDecl = GV->hasAvailableExternallyLinkage();
-  if (GV->isDeclaration() && !GV->isMaterializable())
-    isDecl = true;
+  bool isDecl = GV->isDeclarationForLinker();
 
   // X86-64 in PIC mode.
   if (isPICStyleRIPRel()) {
@@ -182,23 +178,7 @@
   return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
 }
 
-void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
-  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
-  Attribute CPUAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
-  Attribute FSAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
-  std::string CPU =
-      !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : "";
-  std::string FS =
-      !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
-  if (!FS.empty()) {
-    initializeEnvironment();
-    resetSubtargetFeatures(CPU, FS);
-  }
-}
-
-void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
+void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   std::string CPUName = CPU;
   if (CPUName.empty())
     CPUName = "generic";
@@ -219,9 +199,6 @@
   // Make sure the right MCSchedModel is used.
   InitCPUSchedModel(CPUName);
 
-  if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM)
-    PostRAScheduler = true;
-
   InstrItins = getInstrItineraryForCPU(CPUName);
 
   // It's important to keep the MCSubtargetInfo feature bits in sync with
@@ -275,10 +252,15 @@
   HasERI = false;
   HasCDI = false;
   HasPFI = false;
+  HasDQI = false;
+  HasBWI = false;
+  HasVLX = false;
   HasADX = false;
   HasSHA = false;
+  HasSGX = false;
   HasPRFCHW = false;
   HasRDSEED = false;
+  HasSMAP = false;
   IsBTMemSlow = false;
   IsSHLDSlow = false;
   IsUAMemFast = false;
@@ -286,48 +268,51 @@
   HasCmpxchg16b = false;
   UseLeaForSP = false;
   HasSlowDivide = false;
-  PostRAScheduler = false;
   PadShortFunctions = false;
   CallRegIndirect = false;
   LEAUsesAG = false;
   SlowLEA = false;
   SlowIncDec = false;
+  UseSqrtEst = false;
+  UseReciprocalEst = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
 }
 
-static std::string computeDataLayout(const X86Subtarget &ST) {
+static std::string computeDataLayout(const Triple &TT) {
   // X86 is little endian
   std::string Ret = "e";
 
-  Ret += DataLayout::getManglingComponent(ST.getTargetTriple());
+  Ret += DataLayout::getManglingComponent(TT);
   // X86 and x32 have 32 bit pointers.
-  if (ST.isTarget64BitILP32() || !ST.is64Bit())
+  if ((TT.isArch64Bit() &&
+       (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) ||
+      !TT.isArch64Bit())
     Ret += "-p:32:32";
 
   // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
-  if (ST.is64Bit() || ST.isOSWindows() || ST.isTargetNaCl())
+  if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
     Ret += "-i64:64";
   else
     Ret += "-f64:32:64";
 
   // Some ABIs align long double to 128 bits, others to 32.
-  if (ST.isTargetNaCl())
+  if (TT.isOSNaCl())
     ; // No f80
-  else if (ST.is64Bit() || ST.isTargetDarwin())
+  else if (TT.isArch64Bit() || TT.isOSDarwin())
     Ret += "-f80:128";
   else
     Ret += "-f80:32";
 
   // The registers can hold 8, 16, 32 or, in x86-64, 64 bits.
-  if (ST.is64Bit())
+  if (TT.isArch64Bit())
     Ret += "-n8:16:32:64";
   else
     Ret += "-n8:16:32";
 
   // The stack is aligned to 32 bits on some ABIs and 128 bits on others.
-  if (!ST.is64Bit() && ST.isOSWindows())  
+  if (!TT.isArch64Bit() && TT.isOSWindows())
     Ret += "-S32";
   else
     Ret += "-S128";
@@ -338,37 +323,47 @@
 X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
                                                             StringRef FS) {
   initializeEnvironment();
-  resetSubtargetFeatures(CPU, FS);
+  initSubtargetFeatures(CPU, FS);
   return *this;
 }
 
 X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
-                           const std::string &FS, X86TargetMachine &TM,
+                           const std::string &FS, const X86TargetMachine &TM,
                            unsigned StackAlignOverride)
     : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
       PICStyle(PICStyles::None), TargetTriple(TT),
+      DL(computeDataLayout(TargetTriple)),
       StackAlignOverride(StackAlignOverride),
       In64BitMode(TargetTriple.getArch() == Triple::x86_64),
       In32BitMode(TargetTriple.getArch() == Triple::x86 &&
                   TargetTriple.getEnvironment() != Triple::CODE16),
       In16BitMode(TargetTriple.getArch() == Triple::x86 &&
                   TargetTriple.getEnvironment() == Triple::CODE16),
-      DL(computeDataLayout(*this)), TSInfo(DL),
-      InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM),
-      FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(),
-                    is64Bit() ? -8 : -4),
-      JITInfo(hasSSE1()) {}
-
-bool
-X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                                    TargetSubtargetInfo::AntiDepBreakMode &Mode,
-                                    RegClassVector &CriticalPathRCs) const {
-  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
-  CriticalPathRCs.clear();
-  return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
+      TSInfo(DL), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+      TLInfo(TM), FrameLowering(TargetFrameLowering::StackGrowsDown,
+                                getStackAlignment(), is64Bit() ? -8 : -4) {
+  // Determine the PICStyle based on the target selected.
+  if (TM.getRelocationModel() == Reloc::Static) {
+    // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
+    setPICStyle(PICStyles::None);
+  } else if (is64Bit()) {
+    // PIC in 64 bit mode is always rip-rel.
+    setPICStyle(PICStyles::RIPRel);
+  } else if (isTargetCOFF()) {
+    setPICStyle(PICStyles::None);
+  } else if (isTargetDarwin()) {
+    if (TM.getRelocationModel() == Reloc::PIC_)
+      setPICStyle(PICStyles::StubPIC);
+    else {
+      assert(TM.getRelocationModel() == Reloc::DynamicNoPIC);
+      setPICStyle(PICStyles::StubDynamicNoPIC);
+    }
+  } else if (isTargetELF()) {
+    setPICStyle(PICStyles::GOT);
+  }
 }
 
-bool
-X86Subtarget::enableEarlyIfConversion() const {
+bool X86Subtarget::enableEarlyIfConversion() const {
   return hasCMov() && X86EarlyIfConv;
 }
+

diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 09db0eb..091b6c4 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h

@@ -11,13 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86SUBTARGET_H
-#define X86SUBTARGET_H
+#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H
+#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H
 
 #include "X86FrameLowering.h"
 #include "X86ISelLowering.h"
 #include "X86InstrInfo.h"
-#include "X86JITInfo.h"
 #include "X86SelectionDAGInfo.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/CallingConv.h"
@@ -139,12 +138,18 @@
   /// HasSHA - Processor has SHA instructions.
   bool HasSHA;
 
+  /// HasSGX - Processor has SGX instructions.
+  bool HasSGX;
+
   /// HasPRFCHW - Processor has PRFCHW instructions.
   bool HasPRFCHW;
 
   /// HasRDSEED - Processor has RDSEED instructions.
   bool HasRDSEED;
 
+  /// HasSMAP - Processor has SMAP instructions.
+  bool HasSMAP;
+
   /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
   bool IsBTMemSlow;
 
@@ -170,9 +175,6 @@
   /// full divides and should be used when possible.
   bool HasSlowDivide;
 
-  /// PostRAScheduler - True if using post-register-allocation scheduler.
-  bool PostRAScheduler;
-
   /// PadShortFunctions - True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
@@ -190,15 +192,34 @@
   /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
+  /// Use the RSQRT* instructions to optimize square root calculations.
+  /// For this to be profitable, the cost of FSQRT and FDIV must be
+  /// substantially higher than normal FP ops like FADD and FMUL.
+  bool UseSqrtEst;
+
+  /// Use the RCP* instructions to optimize FP division calculations.
+  /// For this to be profitable, the cost of FDIV must be
+  /// substantially higher than normal FP ops like FADD and FMUL.
+  bool UseReciprocalEst;
+  
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
-  
+
   /// Processor has AVX-512 Exponential and Reciprocal Instructions
   bool HasERI;
-  
+
   /// Processor has AVX-512 Conflict Detection Instructions
   bool HasCDI;
-  
+
+  /// Processor has AVX-512 Doubleword and Quadword instructions
+  bool HasDQI;
+
+  /// Processor has AVX-512 Byte and Word instructions
+  bool HasBWI;
+
+  /// Processor has AVX-512 Vector Length eXtensions
+  bool HasVLX;
+
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment;
@@ -214,6 +235,9 @@
   InstrItineraryData InstrItins;
 
 private:
+  // Calculates type size & alignment
+  const DataLayout DL;
+
   /// StackAlignOverride - Override the stack alignment.
   unsigned StackAlignOverride;
 
@@ -226,30 +250,35 @@
   /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit.
   bool In16BitMode;
 
-  // Calculates type size & alignment
-  const DataLayout DL;
   X86SelectionDAGInfo TSInfo;
   // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
   // X86TargetLowering needs.
   X86InstrInfo InstrInfo;
   X86TargetLowering TLInfo;
   X86FrameLowering FrameLowering;
-  X86JITInfo JITInfo;
 
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   ///
   X86Subtarget(const std::string &TT, const std::string &CPU,
-               const std::string &FS, X86TargetMachine &TM,
+               const std::string &FS, const X86TargetMachine &TM,
                unsigned StackAlignOverride);
 
-  const X86TargetLowering *getTargetLowering() const { return &TLInfo; }
-  const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
-  const DataLayout *getDataLayout() const { return &DL; }
-  const X86FrameLowering *getFrameLowering() const { return &FrameLowering; }
-  const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
-  X86JITInfo *getJITInfo() { return &JITInfo; }
+  const X86TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const X86FrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const X86RegisterInfo *getRegisterInfo() const override {
+    return &getInstrInfo()->getRegisterInfo();
+  }
 
   /// getStackAlignment - Returns the minimum alignment known to hold of the
   /// stack frame on entry to the function and which must be maintained by every
@@ -264,14 +293,12 @@
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  /// \brief Reset the features for the X86 target.
-  void resetSubtargetFeatures(const MachineFunction *MF) override;
 private:
   /// \brief Initialize the full set of dependencies so we can use an initializer
   /// list for X86Subtarget.
   X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
   void initializeEnvironment();
-  void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+  void initSubtargetFeatures(StringRef CPU, StringRef FS);
 public:
   /// Is this x86_64? (disregarding specific ABI / programming model)
   bool is64Bit() const {
@@ -294,7 +321,8 @@
 
   /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
   bool isTarget64BitLP64() const {
-    return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32);
+    return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 &&
+                           TargetTriple.getOS() != Triple::NaCl);
   }
 
   PICStyles::Style getPICStyle() const { return PICStyle; }
@@ -335,8 +363,10 @@
   bool hasHLE() const { return HasHLE; }
   bool hasADX() const { return HasADX; }
   bool hasSHA() const { return HasSHA; }
+  bool hasSGX() const { return HasSGX; }
   bool hasPRFCHW() const { return HasPRFCHW; }
   bool hasRDSEED() const { return HasRDSEED; }
+  bool hasSMAP() const { return HasSMAP; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
@@ -349,9 +379,14 @@
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
   bool slowIncDec() const { return SlowIncDec; }
+  bool useSqrtEst() const { return UseSqrtEst; }
+  bool useReciprocalEst() const { return UseReciprocalEst; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
+  bool hasDQI() const { return HasDQI; }
+  bool hasBWI() const { return HasBWI; }
+  bool hasVLX() const { return HasVLX; }
 
   bool isAtom() const { return X86ProcFamily == IntelAtom; }
   bool isSLM() const { return X86ProcFamily == IntelSLM; }
@@ -391,6 +426,10 @@
     return TargetTriple.isWindowsGNUEnvironment();
   }
 
+  bool isTargetWindowsItanium() const {
+    return TargetTriple.isWindowsItaniumEnvironment();
+  }
+
   bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
 
   bool isOSWindows() const { return TargetTriple.isOSWindows(); }
@@ -453,18 +492,17 @@
   /// Enable the MachineScheduler pass for all X86 subtargets.
   bool enableMachineScheduler() const override { return true; }
 
-  /// enablePostRAScheduler - run for Atom optimization.
-  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
-                             RegClassVector& CriticalPathRCs) const override;
-
-  bool postRAScheduler() const { return PostRAScheduler; }
-
   bool enableEarlyIfConversion() const override;
 
   /// getInstrItins = Return the instruction itineraries based on the
   /// subtarget selection.
-  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return &InstrItins;
+  }
+
+  AntiDepBreakMode getAntiDepBreakMode() const override {
+    return TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  }
 };
 
 } // End llvm namespace

diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index f12140f..8802feb 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp

@@ -13,7 +13,9 @@
 
 #include "X86TargetMachine.h"
 #include "X86.h"
+#include "X86TargetObjectFile.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
@@ -27,7 +29,23 @@
   RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target);
 }
 
-void X86TargetMachine::anchor() { }
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+  if (TT.isOSBinFormatMachO()) {
+    if (TT.getArch() == Triple::x86_64)
+      return make_unique<X86_64MachoTargetObjectFile>();
+    return make_unique<TargetLoweringObjectFileMachO>();
+  }
+
+  if (TT.isOSLinux())
+    return make_unique<X86LinuxTargetObjectFile>();
+  if (TT.isOSBinFormatELF())
+    return make_unique<TargetLoweringObjectFileELF>();
+  if (TT.isKnownWindowsMSVCEnvironment())
+    return make_unique<X86WindowsTargetObjectFile>();
+  if (TT.isOSBinFormatCOFF())
+    return make_unique<TargetLoweringObjectFileCOFF>();
+  llvm_unreachable("unknown subtarget type");
+}
 
 /// X86TargetMachine ctor - Create an X86 target.
 ///
@@ -36,27 +54,8 @@
                                    Reloc::Model RM, CodeModel::Model CM,
                                    CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      TLOF(createTLOF(Triple(getTargetTriple()))),
       Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
-  // Determine the PICStyle based on the target selected.
-  if (getRelocationModel() == Reloc::Static) {
-    // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
-    Subtarget.setPICStyle(PICStyles::None);
-  } else if (Subtarget.is64Bit()) {
-    // PIC in 64 bit mode is always rip-rel.
-    Subtarget.setPICStyle(PICStyles::RIPRel);
-  } else if (Subtarget.isTargetCOFF()) {
-    Subtarget.setPICStyle(PICStyles::None);
-  } else if (Subtarget.isTargetDarwin()) {
-    if (getRelocationModel() == Reloc::PIC_)
-      Subtarget.setPICStyle(PICStyles::StubPIC);
-    else {
-      assert(getRelocationModel() == Reloc::DynamicNoPIC);
-      Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC);
-    }
-  } else if (Subtarget.isTargetELF()) {
-    Subtarget.setPICStyle(PICStyles::GOT);
-  }
-
   // default to hard float ABI
   if (Options.FloatABIType == FloatABI::Default)
     this->Options.FloatABIType = FloatABI::Hard;
@@ -71,6 +70,47 @@
   initAsmInfo();
 }
 
+X86TargetMachine::~X86TargetMachine() {}
+
+const X86Subtarget *
+X86TargetMachine::getSubtargetImpl(const Function &F) const {
+  AttributeSet FnAttrs = F.getAttributes();
+  Attribute CPUAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+  Attribute FSAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+  std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+                        ? CPUAttr.getValueAsString().str()
+                        : TargetCPU;
+  std::string FS = !FSAttr.hasAttribute(Attribute::None)
+                       ? FSAttr.getValueAsString().str()
+                       : TargetFS;
+
+  // FIXME: This is related to the code below that resets the target options:
+  // we need to know whether or not the soft float flag is set on the
+  // function before we can generate a subtarget. We also need to use
+  // it as a key for the subtarget since that can be the only difference
+  // between two functions.
+  Attribute SFAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
+  bool SoftFloat = !SFAttr.hasAttribute(Attribute::None)
+                       ? SFAttr.getValueAsString() == "true"
+                       : Options.UseSoftFloat;
+
+  auto &I = SubtargetMap[CPU + FS + (SoftFloat ? "use-soft-float=true"
+                                               : "use-soft-float=false")];
+  if (!I) {
+    // This needs to be done before we create a new subtarget since any
+    // creation will depend on the TM and the code generation flags on the
+    // function that reside in TargetOptions.
+    resetTargetOptions(F);
+    I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
+                                        Options.StackAlignmentOverride);
+  }
+  return I.get();
+}
+
 //===----------------------------------------------------------------------===//
 // Command line options for x86
 //===----------------------------------------------------------------------===//
@@ -125,7 +165,7 @@
 }
 
 void X86PassConfig::addIRPasses() {
-  addPass(createX86AtomicExpandPass(&getX86TargetMachine()));
+  addPass(createAtomicExpandPass(&getX86TargetMachine()));
 
   TargetPassConfig::addIRPasses();
 }
@@ -177,10 +217,3 @@
 
   return ShouldPrint;
 }
-
-bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
-                                      JITCodeEmitter &JCE) {
-  PM.add(createX86JITCodeEmitterPass(*this, JCE));
-
-  return false;
-}

diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 41d5157..916278c 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86TARGETMACHINE_H
-#define X86TARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/IR/DataLayout.h"
@@ -23,46 +23,29 @@
 class StringRef;
 
 class X86TargetMachine final : public LLVMTargetMachine {
-  virtual void anchor();
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   X86Subtarget       Subtarget;
 
+  mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
+
 public:
   X86TargetMachine(const Target &T, StringRef TT,
                    StringRef CPU, StringRef FS, const TargetOptions &Options,
                    Reloc::Model RM, CodeModel::Model CM,
                    CodeGenOpt::Level OL);
+  ~X86TargetMachine() override;
 
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
-  const X86InstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const TargetFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
-  X86JITInfo *getJITInfo() override { return Subtarget.getJITInfo(); }
   const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; }
-  const X86TargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
-  const X86RegisterInfo  *getRegisterInfo() const override {
-    return &getInstrInfo()->getRegisterInfo();
-  }
-  const InstrItineraryData *getInstrItineraryData() const override {
-    return &getSubtargetImpl()->getInstrItineraryData();
-  }
+  const X86Subtarget *getSubtargetImpl(const Function &F) const override;
 
   /// \brief Register X86 analysis passes with a pass manager.
   void addAnalysisPasses(PassManagerBase &PM) override;
 
   // Set up the pass pipeline.
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-
-  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 };
 
 } // End llvm namespace

diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 8157085..f8bcd61 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp

@@ -8,10 +8,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86TargetObjectFile.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Target/TargetLowering.h"
@@ -106,3 +108,64 @@
                                  MCSymbolRefExpr::VK_COFF_IMGREL32,
                                  getContext());
 }
+
+static std::string APIntToHexString(const APInt &AI) {
+  unsigned Width = (AI.getBitWidth() / 8) * 2;
+  std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true);
+  unsigned Size = HexString.size();
+  assert(Width >= Size && "hex string is too large!");
+  HexString.insert(HexString.begin(), Width - Size, '0');
+
+  return HexString;
+}
+
+
+static std::string scalarConstantToHexString(const Constant *C) {
+  Type *Ty = C->getType();
+  APInt AI;
+  if (isa<UndefValue>(C)) {
+    AI = APInt(Ty->getPrimitiveSizeInBits(), /*val=*/0);
+  } else if (Ty->isFloatTy() || Ty->isDoubleTy()) {
+    const auto *CFP = cast<ConstantFP>(C);
+    AI = CFP->getValueAPF().bitcastToAPInt();
+  } else if (Ty->isIntegerTy()) {
+    const auto *CI = cast<ConstantInt>(C);
+    AI = CI->getValue();
+  } else {
+    llvm_unreachable("unexpected constant pool element type!");
+  }
+  return APIntToHexString(AI);
+}
+
+const MCSection *
+X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind,
+                                                  const Constant *C) const {
+  if (Kind.isReadOnly()) {
+    if (C) {
+      Type *Ty = C->getType();
+      SmallString<32> COMDATSymName;
+      if (Ty->isFloatTy() || Ty->isDoubleTy()) {
+        COMDATSymName = "__real@";
+        COMDATSymName += scalarConstantToHexString(C);
+      } else if (const auto *VTy = dyn_cast<VectorType>(Ty)) {
+        uint64_t NumBits = VTy->getBitWidth();
+        if (NumBits == 128 || NumBits == 256) {
+          COMDATSymName = NumBits == 128 ? "__xmm@" : "__ymm@";
+          for (int I = VTy->getNumElements() - 1, E = -1; I != E; --I)
+            COMDATSymName +=
+                scalarConstantToHexString(C->getAggregateElement(I));
+        }
+      }
+      if (!COMDATSymName.empty()) {
+        unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                   COFF::IMAGE_SCN_MEM_READ |
+                                   COFF::IMAGE_SCN_LNK_COMDAT;
+        return getContext().getCOFFSection(".rdata", Characteristics, Kind,
+                                           COMDATSymName,
+                                           COFF::IMAGE_COMDAT_SELECT_ANY);
+      }
+    }
+  }
+
+  return TargetLoweringObjectFile::getSectionForConstant(Kind, C);
+}

diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index a08ed09..6a6988a 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H
-#define LLVM_TARGET_X86_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -46,6 +46,11 @@
     const MCExpr *
     getExecutableRelativeSymbol(const ConstantExpr *CE, Mangler &Mang,
                                 const TargetMachine &TM) const override;
+
+    /// \brief Given a mergeable constant with the specified size and relocation
+    /// information, return a section that it should be placed in.
+    const MCSection *getSectionForConstant(SectionKind Kind,
+                                           const Constant *C) const override;
   };
 
 } // end namespace llvm

diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index c961e2f..2b70fd0 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp

@@ -48,8 +48,8 @@
   }
 
   X86TTI(const X86TargetMachine *TM)
-    : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
-      TLI(TM->getTargetLowering()) {
+      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
     initializeX86TTIPass(*PassRegistry::getPassRegistry());
   }
 
@@ -82,9 +82,10 @@
 
   unsigned getNumberOfRegisters(bool Vector) const override;
   unsigned getRegisterBitWidth(bool Vector) const override;
-  unsigned getMaximumUnrollFactor() const override;
+  unsigned getMaxInterleaveFactor() const override;
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
-                                  OperandValueKind) const override;
+                                  OperandValueKind, OperandValueProperties,
+                                  OperandValueProperties) const override;
   unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                           int Index, Type *SubTp) const override;
   unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
@@ -166,7 +167,7 @@
 
 }
 
-unsigned X86TTI::getMaximumUnrollFactor() const {
+unsigned X86TTI::getMaxInterleaveFactor() const {
   if (ST->isAtom())
     return 1;
 
@@ -178,15 +179,37 @@
   return 2;
 }
 
-unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                        OperandValueKind Op1Info,
-                                        OperandValueKind Op2Info) const {
+unsigned X86TTI::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
+    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
+    OperandValueProperties Opd2PropInfo) const {
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  if (ISD == ISD::SDIV &&
+      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+    // On X86, vector signed division by a power-of-two constant is
+    // normally expanded to the sequence SRA + SRL + ADD + SRA.
+    // The OperandValue properties may not be the same as those of the
+    // previous operation; conservatively assume OP_None.
+    unsigned Cost =
+        2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+                                   TargetTransformInfo::OP_None,
+                                   TargetTransformInfo::OP_None);
+
+    return Cost;
+  }
+
   static const CostTblEntry<MVT::SimpleValueType>
   AVX2UniformConstCostTable[] = {
     { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
@@ -202,6 +225,15 @@
       return LT.first * AVX2UniformConstCostTable[Idx].Cost;
   }
 
+  static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = {
+    { ISD::SHL,     MVT::v16i32,    1 },
+    { ISD::SRL,     MVT::v16i32,    1 },
+    { ISD::SRA,     MVT::v16i32,    1 },
+    { ISD::SHL,     MVT::v8i64,    1 },
+    { ISD::SRL,     MVT::v8i64,    1 },
+    { ISD::SRA,     MVT::v8i64,    1 },
+  };
+
   static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
     // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
     // customize them to detect the cases where shift amount is a scalar one.
@@ -237,6 +269,11 @@
     { ISD::UDIV,  MVT::v4i64,  4*20 },
   };
 
+  if (ST->hasAVX512()) {
+    int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX512CostTable[Idx].Cost;
+  }
   // Look for AVX2 lowering tricks.
   if (ST->hasAVX2()) {
     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
@@ -541,7 +578,7 @@
     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
     // There are faster sequences for float conversions.
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
@@ -557,6 +594,45 @@
       return LTSrc.first * SSE2ConvTbl[Idx].Cost;
   }
 
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  AVX512ConversionTbl[] = {
+    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },
+    { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 },
+    { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },
+    { ISD::FP_ROUND,  MVT::v16f32,  MVT::v8f64,  3 },
+
+    { ISD::TRUNCATE,  MVT::v16i8,   MVT::v16i32, 1 },
+    { ISD::TRUNCATE,  MVT::v16i16,  MVT::v16i32, 1 },
+    { ISD::TRUNCATE,  MVT::v8i16,   MVT::v8i64,  1 },
+    { ISD::TRUNCATE,  MVT::v8i32,   MVT::v8i64,  1 },
+    { ISD::TRUNCATE,  MVT::v16i32,  MVT::v8i64,  4 },
+
+    // v16i1 -> v16i32 - load + broadcast
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
+
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
+
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i8,  2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i16,  2 },
+    { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
+  };
+
+  if (ST->hasAVX512()) {
+    int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second,
+                                     LTSrc.second);
+    if (Idx != -1)
+      return AVX512ConversionTbl[Idx].Cost;
+  }
   EVT SrcTy = TLI->getValueType(Src);
   EVT DstTy = TLI->getValueType(Dst);
 
@@ -589,6 +665,11 @@
     { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  2 },
     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  2 },
     { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  4 },
+
+    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32,  3 },
+    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64,  3 },
+
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32,  8 },
   };
 
   static const TypeConversionCostTblEntry<MVT::SimpleValueType>
@@ -715,6 +796,19 @@
     { ISD::SETCC,   MVT::v32i8,   1 },
   };
 
+  static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = {
+    { ISD::SETCC,   MVT::v8i64,   1 },
+    { ISD::SETCC,   MVT::v16i32,  1 },
+    { ISD::SETCC,   MVT::v8f64,   1 },
+    { ISD::SETCC,   MVT::v16f32,  1 },
+  };
+
+  if (ST->hasAVX512()) {
+    int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX512CostTbl[Idx].Cost;
+  }
+
   if (ST->hasAVX2()) {
     int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
     if (Idx != -1)

diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 0bb5f99..d93baeb 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp

@@ -250,7 +250,7 @@
   const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
   if (!ST.hasAVX() || ST.hasAVX512())
     return false;
-  TII = MF.getTarget().getInstrInfo();
+  TII = MF.getSubtarget().getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   EverMadeChange = false;
 

diff --git a/lib/Target/XCore/Disassembler/LLVMBuild.txt b/lib/Target/XCore/Disassembler/LLVMBuild.txt
index 028de2c..cc30d04 100644
--- a/lib/Target/XCore/Disassembler/LLVMBuild.txt
+++ b/lib/Target/XCore/Disassembler/LLVMBuild.txt

@@ -19,5 +19,5 @@
 type = Library
 name = XCoreDisassembler
 parent = XCore
-required_libraries = MC Support XCoreInfo
+required_libraries = MCDisassembler Support XCoreInfo
 add_to_library_groups = XCore

diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 7fef796..640e6b0 100644
--- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp

@@ -19,7 +19,6 @@
 #include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
@@ -36,47 +35,35 @@
   XCoreDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
     MCDisassembler(STI, Ctx) {}
 
-  /// \brief See MCDisassembler.
-  virtual DecodeStatus getInstruction(MCInst &instr,
-                                      uint64_t &size,
-                                      const MemoryObject &region,
-                                      uint64_t address,
-                                      raw_ostream &vStream,
-                                      raw_ostream &cStream) const override;
-
+  DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
 };
 }
 
-static bool readInstruction16(const MemoryObject &region,
-                              uint64_t address,
-                              uint64_t &size,
-                              uint16_t &insn) {
-  uint8_t Bytes[4];
-
+static bool readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              uint64_t &Size, uint16_t &Insn) {
   // We want to read exactly 2 Bytes of data.
-  if (region.readBytes(address, 2, Bytes) == -1) {
-    size = 0;
+  if (Bytes.size() < 2) {
+    Size = 0;
     return false;
   }
   // Encoded as a little-endian 16-bit word in the stream.
-  insn = (Bytes[0] <<  0) | (Bytes[1] <<  8);
+  Insn = (Bytes[0] << 0) | (Bytes[1] << 8);
   return true;
 }
 
-static bool readInstruction32(const MemoryObject &region,
-                              uint64_t address,
-                              uint64_t &size,
-                              uint32_t &insn) {
-  uint8_t Bytes[4];
-
+static bool readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              uint64_t &Size, uint32_t &Insn) {
   // We want to read exactly 4 Bytes of data.
-  if (region.readBytes(address, 4, Bytes) == -1) {
-    size = 0;
+  if (Bytes.size() < 4) {
+    Size = 0;
     return false;
   }
   // Encoded as a little-endian 32-bit word in the stream.
-  insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
-         (Bytes[3] << 24);
+  Insn =
+      (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) | (Bytes[3] << 24);
   return true;
 }
 
@@ -748,16 +735,12 @@
   return S;
 }
 
-MCDisassembler::DecodeStatus
-XCoreDisassembler::getInstruction(MCInst &instr,
-                                  uint64_t &Size,
-                                  const MemoryObject &Region,
-                                  uint64_t Address,
-                                  raw_ostream &vStream,
-                                  raw_ostream &cStream) const {
+MCDisassembler::DecodeStatus XCoreDisassembler::getInstruction(
+    MCInst &instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
+    raw_ostream &vStream, raw_ostream &cStream) const {
   uint16_t insn16;
 
-  if (!readInstruction16(Region, Address, Size, insn16)) {
+  if (!readInstruction16(Bytes, Address, Size, insn16)) {
     return Fail;
   }
 
@@ -771,7 +754,7 @@
 
   uint32_t insn32;
 
-  if (!readInstruction32(Region, Address, Size, insn32)) {
+  if (!readInstruction32(Bytes, Address, Size, insn32)) {
     return Fail;
   }
 

diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
index 98e7c98..78521fd 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h

@@ -13,8 +13,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef XCOREINSTPRINTER_H
-#define XCOREINSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+#define LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
 #include "llvm/MC/MCInstPrinter.h"
 
 namespace llvm {

diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index 5665911..f2d2b37 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp

@@ -28,7 +28,6 @@
   ProtectedVisibilityAttr = MCSA_Invalid;
 
   // Debug
-  HasLEB128 = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
   DwarfRegNumForCFI = true;
 }

diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
index da2689a..26df211 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCORETARGETASMINFO_H
-#define XCORETARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCASMINFO_H
+#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoELF.h"
 

diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index d54e94f..4073549 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp

@@ -99,10 +99,10 @@
   formatted_raw_ostream &OS;
 public:
   XCoreTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
-  virtual void emitCCTopData(StringRef Name) override;
-  virtual void emitCCTopFunction(StringRef Name) override;
-  virtual void emitCCBottomData(StringRef Name) override;
-  virtual void emitCCBottomFunction(StringRef Name) override;
+  void emitCCTopData(StringRef Name) override;
+  void emitCCTopFunction(StringRef Name) override;
+  void emitCCBottomData(StringRef Name) override;
+  void emitCCBottomFunction(StringRef Name) override;
 };
 
 XCoreTargetAsmStreamer::XCoreTargetAsmStreamer(MCStreamer &S,

diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
index a255adb..0ff5961 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCOREMCTARGETDESC_H
-#define XCOREMCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
+#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
 
 namespace llvm {
 class Target;

diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index d707edc..140ba2a 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TARGET_XCORE_H
-#define TARGET_XCORE_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCORE_H
+#define LLVM_LIB_TARGET_XCORE_XCORE_H
 
 #include "MCTargetDesc/XCoreMCTargetDesc.h"
 #include "llvm/Target/TargetMachine.h"

diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index e98d4f9..82e4e36 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp

@@ -117,7 +117,7 @@
       EmitSpecialLLVMGlobal(GV))
     return;
 
-  const DataLayout *TD = TM.getDataLayout();
+  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
   OutStreamer.SwitchSection(
       getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
 
@@ -210,7 +210,7 @@
 
 void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                    raw_ostream &O) {
-  const DataLayout *DL = TM.getDataLayout();
+  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
   const MachineOperand &MO = MI->getOperand(opNum);
   switch (MO.getType()) {
   case MachineOperand::MO_Register:

diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index e694736..7c74340 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp

@@ -16,6 +16,7 @@
 #include "XCore.h"
 #include "XCoreInstrInfo.h"
 #include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -226,7 +227,7 @@
   MachineModuleInfo *MMI = &MF.getMMI();
   const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
   const XCoreInstrInfo &TII =
-    *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
   XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   // Debug location must be unknown since the first debug location is used
   // to determine the end of the prologue.
@@ -262,7 +263,8 @@
     MBB.addLiveIn(XCore::LR);
     MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opcode));
     MIB.addImm(Adjusted);
-    MIB->addRegisterKilled(XCore::LR, MF.getTarget().getRegisterInfo(), true);
+    MIB->addRegisterKilled(XCore::LR, MF.getSubtarget().getRegisterInfo(),
+                           true);
     if (emitFrameMoves) {
       EmitDefCfaOffset(MBB, MBBI, dl, TII, MMI, Adjusted*4);
       unsigned DRegNum = MRI->getDwarfRegNum(XCore::LR, true);
@@ -310,11 +312,10 @@
 
   if (emitFrameMoves) {
     // Frame moves for callee saved.
-    auto SpillLabels = XFI->getSpillLabels();
-    for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
-      MachineBasicBlock::iterator Pos = SpillLabels[I].first;
+    for (const auto &SpillLabel : XFI->getSpillLabels()) {
+      MachineBasicBlock::iterator Pos = SpillLabel.first;
       ++Pos;
-      CalleeSavedInfo &CSI = SpillLabels[I].second;
+      const CalleeSavedInfo &CSI = SpillLabel.second;
       int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
       unsigned DRegNum = MRI->getDwarfRegNum(CSI.getReg(), true);
       EmitCfiOffset(MBB, Pos, dl, TII, MMI, DRegNum, Offset);
@@ -323,7 +324,8 @@
       // The unwinder requires stack slot & CFI offsets for the exception info.
       // We do not save/spill these registers.
       SmallVector<StackSlotInfo,2> SpillList;
-      GetEHSpillList(SpillList, MFI, XFI, MF.getTarget().getTargetLowering());
+      GetEHSpillList(SpillList, MFI, XFI,
+                     MF.getSubtarget().getTargetLowering());
       assert(SpillList.size()==2 && "Unexpected SpillList size");
       EmitCfiOffset(MBB, MBBI, dl, TII, MMI,
                     MRI->getDwarfRegNum(SpillList[0].Reg, true),
@@ -340,7 +342,7 @@
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   const XCoreInstrInfo &TII =
-    *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
   XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   DebugLoc dl = MBBI->getDebugLoc();
   unsigned RetOpcode = MBBI->getOpcode();
@@ -355,7 +357,7 @@
     // 'Restore' the exception info the unwinder has placed into the stack
     // slots.
     SmallVector<StackSlotInfo,2> SpillList;
-    GetEHSpillList(SpillList, MFI, XFI, MF.getTarget().getTargetLowering());
+    GetEHSpillList(SpillList, MFI, XFI, MF.getSubtarget().getTargetLowering());
     RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
 
     // Return to the landing pad.
@@ -413,7 +415,7 @@
     return true;
 
   MachineFunction *MF = MBB.getParent();
-  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
   XCoreFunctionInfo *XFI = MF->getInfo<XCoreFunctionInfo>();
   bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
 
@@ -446,7 +448,7 @@
                             const std::vector<CalleeSavedInfo> &CSI,
                             const TargetRegisterInfo *TRI) const{
   MachineFunction *MF = MBB.getParent();
-  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+  const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
   bool AtStart = MI == MBB.begin();
   MachineBasicBlock::iterator BeforeI = MI;
   if (!AtStart)
@@ -479,7 +481,7 @@
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   const XCoreInstrInfo &TII =
-    *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
   if (!hasReservedCallFrame(MF)) {
     // Turn the adjcallstackdown instruction into 'extsp <amt>' and the
     // adjcallstackup instruction into 'ldaw sp, sp[<amt>]'

diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index e4f806a..7b169c2 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCOREFRAMEINFO_H
-#define XCOREFRAMEINFO_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
+#define LLVM_LIB_TARGET_XCORE_XCOREFRAMELOWERING_H
 
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
@@ -59,4 +59,4 @@
   };
 }
 
-#endif // XCOREFRAMEINFO_H
+#endif

diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
index 30c7b59..77292c4 100644
--- a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
+++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp

@@ -13,6 +13,7 @@
 
 #include "XCore.h"
 #include "XCoreInstrInfo.h"
+#include "XCoreSubtarget.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -43,7 +44,7 @@
 
 bool XCoreFTAOElim::runOnMachineFunction(MachineFunction &MF) {
   const XCoreInstrInfo &TII =
-          *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
   unsigned StackSize = MF.getFrameInfo()->getStackSize();
   for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
        ++MFI) {

diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index be7ef64..96c43ae 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp

@@ -69,7 +69,7 @@
 }
 
 XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM)
-    : TargetLowering(TM, new XCoreTargetObjectFile()), TM(TM),
+    : TargetLowering(TM), TM(TM),
       Subtarget(TM.getSubtarget<XCoreSubtarget>()) {
 
   // Set up the register classes.
@@ -426,7 +426,9 @@
   assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
          "Unexpected extension type");
   assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
-  if (allowsUnalignedMemoryAccesses(LD->getMemoryVT()))
+  if (allowsMisalignedMemoryAccesses(LD->getMemoryVT(),
+                                     LD->getAddressSpace(),
+                                     LD->getAlignment()))
     return SDValue();
 
   unsigned ABIAlignment = getDataLayout()->
@@ -461,14 +463,15 @@
   if (LD->getAlignment() == 2) {
     SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain,
                                  BasePtr, LD->getPointerInfo(), MVT::i16,
-                                 LD->isVolatile(), LD->isNonTemporal(), 2);
+                                 LD->isVolatile(), LD->isNonTemporal(),
+                                 LD->isInvariant(), 2);
     SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                    DAG.getConstant(2, MVT::i32));
     SDValue High = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   HighAddr,
                                   LD->getPointerInfo().getWithOffset(2),
                                   MVT::i16, LD->isVolatile(),
-                                  LD->isNonTemporal(), 2);
+                                  LD->isNonTemporal(), LD->isInvariant(), 2);
     SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High,
                                       DAG.getConstant(16, MVT::i32));
     SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Low, HighShifted);
@@ -504,7 +507,9 @@
   StoreSDNode *ST = cast<StoreSDNode>(Op);
   assert(!ST->isTruncatingStore() && "Unexpected store type");
   assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
-  if (allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+  if (allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
+                                     ST->getAddressSpace(),
+                                     ST->getAlignment())) {
     return SDValue();
   }
   unsigned ABIAlignment = getDataLayout()->
@@ -800,7 +805,8 @@
     return SDValue();
 
   MachineFunction &MF = DAG.getMachineFunction();
-  const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *RegInfo =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op),
                             RegInfo->getFrameRegister(MF), MVT::i32);
 }
@@ -846,7 +852,8 @@
   SDLoc dl(Op);
 
   // Absolute SP = (FP + FrameToArgs) + Offset
-  const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo();
+  const TargetRegisterInfo *RegInfo =
+      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   SDValue Stack = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
                             RegInfo->getFrameRegister(MF), MVT::i32);
   SDValue FrameToArgs = DAG.getNode(XCoreISD::FRAME_TO_ARGS_OFFSET, dl,
@@ -969,7 +976,7 @@
                        N->getBasePtr(), N->getPointerInfo(),
                        N->isVolatile(), N->isNonTemporal(),
                        N->isInvariant(), N->getAlignment(),
-                       N->getTBAAInfo(), N->getRanges());
+                       N->getAAInfo(), N->getRanges());
   }
   if (N->getMemoryVT() == MVT::i16) {
     if (N->getAlignment() < 2)
@@ -977,13 +984,13 @@
     return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
                           N->getBasePtr(), N->getPointerInfo(), MVT::i16,
                           N->isVolatile(), N->isNonTemporal(),
-                          N->getAlignment(), N->getTBAAInfo());
+                          N->isInvariant(), N->getAlignment(), N->getAAInfo());
   }
   if (N->getMemoryVT() == MVT::i8)
     return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), MVT::i32, N->getChain(),
                           N->getBasePtr(), N->getPointerInfo(), MVT::i8,
                           N->isVolatile(), N->isNonTemporal(),
-                          N->getAlignment(), N->getTBAAInfo());
+                          N->isInvariant(), N->getAlignment(), N->getAAInfo());
   return SDValue();
 }
 
@@ -999,7 +1006,7 @@
     return DAG.getStore(N->getChain(), SDLoc(Op), N->getVal(),
                         N->getBasePtr(), N->getPointerInfo(),
                         N->isVolatile(), N->isNonTemporal(),
-                        N->getAlignment(), N->getTBAAInfo());
+                        N->getAlignment(), N->getAAInfo());
   }
   if (N->getMemoryVT() == MVT::i16) {
     if (N->getAlignment() < 2)
@@ -1007,13 +1014,13 @@
     return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
                              N->getBasePtr(), N->getPointerInfo(), MVT::i16,
                              N->isVolatile(), N->isNonTemporal(),
-                             N->getAlignment(), N->getTBAAInfo());
+                             N->getAlignment(), N->getAAInfo());
   }
   if (N->getMemoryVT() == MVT::i8)
     return DAG.getTruncStore(N->getChain(), SDLoc(Op), N->getVal(),
                              N->getBasePtr(), N->getPointerInfo(), MVT::i8,
                              N->isVolatile(), N->isNonTemporal(),
-                             N->getAlignment(), N->getTBAAInfo());
+                             N->getAlignment(), N->getAAInfo());
   return SDValue();
 }
 
@@ -1118,8 +1125,8 @@
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   // The ABI dictates there should be one stack slot available to the callee
   // on function entry (for saving lr).
@@ -1129,8 +1136,8 @@
 
   SmallVector<CCValAssign, 16> RVLocs;
   // Analyze return values to determine the number of bytes of stack required.
-  CCState RetCCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                    getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState RetCCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                    *DAG.getContext());
   RetCCInfo.AllocateStack(CCInfo.getNextStackOffset(), 4);
   RetCCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
 
@@ -1284,8 +1291,8 @@
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
 
   CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
 
@@ -1443,7 +1450,7 @@
                const SmallVectorImpl<ISD::OutputArg> &Outs,
                LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
   if (!CCInfo.CheckReturn(Outs, RetCC_XCore))
     return false;
   if (CCInfo.getNextStackOffset() != 0 && isVarArg)
@@ -1467,8 +1474,8 @@
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   // Analyze return values.
   if (!isVarArg)
@@ -1541,7 +1548,8 @@
 MachineBasicBlock *
 XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                  MachineBasicBlock *BB) const {
-  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  const TargetInstrInfo &TII =
+      *getTargetMachine().getSubtargetImpl()->getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   assert((MI->getOpcode() == XCore::SELECT_CC) &&
          "Unexpected instr type to insert");
@@ -1803,7 +1811,9 @@
     // Replace unaligned store of unaligned load with memmove.
     StoreSDNode *ST  = cast<StoreSDNode>(N);
     if (!DCI.isBeforeLegalize() ||
-        allowsUnalignedMemoryAccesses(ST->getMemoryVT()) ||
+        allowsMisalignedMemoryAccesses(ST->getMemoryVT(),
+                                       ST->getAddressSpace(),
+                                       ST->getAlignment()) ||
         ST->isVolatile() || ST->isIndexed()) {
       break;
     }
@@ -1912,7 +1922,7 @@
   if (Ty->getTypeID() == Type::VoidTyID)
     return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
 
-  const DataLayout *TD = TM.getDataLayout();
+  const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
   unsigned Size = TD->getTypeAllocSize(Ty);
   if (AM.BaseGV) {
     return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&

diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 62b89c3..13154c6 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCOREISELLOWERING_H
-#define XCOREISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREISELLOWERING_H
+#define LLVM_LIB_TARGET_XCORE_XCOREISELLOWERING_H
 
 #include "XCore.h"
 #include "llvm/CodeGen/SelectionDAG.h"
@@ -215,4 +215,4 @@
   };
 }
 
-#endif // XCOREISELLOWERING_H
+#endif

diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index 36ea9a0..c310aa3 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp

@@ -446,16 +446,19 @@
     dl = MI->getDebugLoc();
   if (isImmMskBitp(Value)) {
     int N = Log2_32(Value) + 1;
-    return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg).addImm(N);
+    return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg)
+        .addImm(N)
+        .getInstr();
   }
   if (isImmU16(Value)) {
     int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
-    return BuildMI(MBB, MI, dl, get(Opcode), Reg).addImm(Value);
+    return BuildMI(MBB, MI, dl, get(Opcode), Reg).addImm(Value).getInstr();
   }
   MachineConstantPool *ConstantPool = MBB.getParent()->getConstantPool();
   const Constant *C = ConstantInt::get(
         Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Value);
   unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
   return BuildMI(MBB, MI, dl, get(XCore::LDWCP_lru6), Reg)
-            .addConstantPoolIndex(Idx);
+      .addConstantPoolIndex(Idx)
+      .getInstr();
 }

diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index e0be96b..60bb3f8 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCOREINSTRUCTIONINFO_H
-#define XCOREINSTRUCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREINSTRINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCOREINSTRINFO_H
 
 #include "XCoreRegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"

diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 00cb705..d34ed7a 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td

@@ -412,7 +412,7 @@
                     (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
                     "stw $val, $addr[$offset]", []>;
 
-def STW_2rus : _F2RUS<0b0000, (outs),
+def STW_2rus : _F2RUS<0b00000, (outs),
                       (ins GRRegs:$val, GRRegs:$addr, i32imm:$offset),
                       "stw $val, $addr[$offset]", []>;
 }
@@ -902,7 +902,7 @@
                         "byterev $dst, $src",
                         [(set GRRegs:$dst, (bswap GRRegs:$src))]>;
 
-def CLZ_l2r : _FL2R<0b000111000, (outs GRRegs:$dst), (ins GRRegs:$src),
+def CLZ_l2r : _FL2R<0b0000111000, (outs GRRegs:$dst), (ins GRRegs:$src),
                     "clz $dst, $src",
                     [(set GRRegs:$dst, (ctlz GRRegs:$src))]>;
 

diff --git a/lib/Target/XCore/XCoreMCInstLower.h b/lib/Target/XCore/XCoreMCInstLower.h
index 28e702b..5691478 100644
--- a/lib/Target/XCore/XCoreMCInstLower.h
+++ b/lib/Target/XCore/XCoreMCInstLower.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCOREMCINSTLOWER_H
-#define XCOREMCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREMCINSTLOWER_H
+#define LLVM_LIB_TARGET_XCORE_XCOREMCINSTLOWER_H
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/Support/Compiler.h"
 

diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.h b/lib/Target/XCore/XCoreMachineFunctionInfo.h
index 212a5cf..078ffde 100644
--- a/lib/Target/XCore/XCoreMachineFunctionInfo.h
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCOREMACHINEFUNCTIONINFO_H
-#define XCOREMACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -103,4 +103,4 @@
 };
 } // End llvm namespace
 
-#endif // XCOREMACHINEFUNCTIONINFO_H
+#endif

diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 316c82c..5c666ae 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp

@@ -15,6 +15,7 @@
 #include "XCore.h"
 #include "XCoreInstrInfo.h"
 #include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -98,7 +99,7 @@
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc dl = MI.getDebugLoc();
   unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
-  RS->setUsed(ScratchOffset);
+  RS->setRegUsed(ScratchOffset);
   TII.loadImmediate(MBB, II, ScratchOffset, Offset);
 
   switch (MI.getOpcode()) {
@@ -170,12 +171,12 @@
   unsigned ScratchBase;
   if (OpCode==XCore::STWFI) {
     ScratchBase = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
-    RS->setUsed(ScratchBase);
+    RS->setRegUsed(ScratchBase);
   } else
     ScratchBase = Reg;
   BuildMI(MBB, II, dl, TII.get(XCore::LDAWSP_ru6), ScratchBase).addImm(0);
   unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
-  RS->setUsed(ScratchOffset);
+  RS->setRegUsed(ScratchOffset);
   TII.loadImmediate(MBB, II, ScratchOffset, Offset);
 
   switch (OpCode) {
@@ -221,7 +222,7 @@
     XCore::R8, XCore::R9,
     0
   };
-  const TargetFrameLowering *TFI = MF->getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
   if (TFI->hasFP(*MF))
     return CalleeSavedRegsFP;
   return CalleeSavedRegs;
@@ -229,7 +230,7 @@
 
 BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   Reserved.set(XCore::CP);
   Reserved.set(XCore::DP);
@@ -267,9 +268,9 @@
 
   MachineFunction &MF = *MI.getParent()->getParent();
   const XCoreInstrInfo &TII =
-          *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+      *static_cast<const XCoreInstrInfo *>(MF.getSubtarget().getInstrInfo());
 
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
   int StackSize = MF.getFrameInfo()->getStackSize();
 
@@ -323,7 +324,7 @@
 
 
 unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
 }

diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index aa617a0..5d7721c 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCOREREGISTERINFO_H
-#define XCOREREGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCOREREGISTERINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCOREREGISTERINFO_H
 
 #include "llvm/Target/TargetRegisterInfo.h"
 

diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index 91b33fd..a348844 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp

@@ -33,7 +33,7 @@
   // Call __memcpy_4 if the src, dst and size are all 4 byte aligned.
   if (!AlwaysInline && (Align & 3) == 0 &&
       DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) {
-    const TargetLowering &TLI = *DAG.getTarget().getTargetLowering();
+    const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
     TargetLowering::ArgListTy Args;
     TargetLowering::ArgListEntry Entry;
     Entry.Ty = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());

diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index 0079de1..cfd80b3 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCORESELECTIONDAGINFO_H
-#define XCORESELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCORESELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_XCORE_XCORESELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 

diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h
index 1e9810b..695578d 100644
--- a/lib/Target/XCore/XCoreSubtarget.h
+++ b/lib/Target/XCore/XCoreSubtarget.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCORESUBTARGET_H
-#define XCORESUBTARGET_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCORESUBTARGET_H
+#define LLVM_LIB_TARGET_XCORE_XCORESUBTARGET_H
 
 #include "XCoreFrameLowering.h"
 #include "XCoreISelLowering.h"
@@ -48,14 +48,20 @@
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  const XCoreFrameLowering *getFrameLowering() const { return &FrameLowering; }
-  const XCoreTargetLowering *getTargetLowering() const { return &TLInfo; }
-  const XCoreSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
-  const TargetRegisterInfo *getRegisterInfo() const {
+  const XCoreInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const XCoreFrameLowering *getFrameLowering() const override {
+    return &FrameLowering;
+  }
+  const XCoreTargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const XCoreSelectionDAGInfo *getSelectionDAGInfo() const override {
+    return &TSInfo;
+  }
+  const TargetRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  const DataLayout *getDataLayout() const { return &DL; }
+  const DataLayout *getDataLayout() const override { return &DL; }
 };
 } // End llvm namespace
 

diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 8d8bb38..0fa8c21 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp

@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "XCoreTargetMachine.h"
+#include "XCoreTargetObjectFile.h"
 #include "XCore.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Module.h"
@@ -26,10 +27,13 @@
                                        Reloc::Model RM, CodeModel::Model CM,
                                        CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      TLOF(make_unique<XCoreTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
   initAsmInfo();
 }
 
+XCoreTargetMachine::~XCoreTargetMachine() {}
+
 namespace {
 /// XCore Code Generator Pass Configuration Options.
 class XCorePassConfig : public TargetPassConfig {
@@ -41,6 +45,7 @@
     return getTM<XCoreTargetMachine>();
   }
 
+  void addIRPasses() override;
   bool addPreISel() override;
   bool addInstSelector() override;
   bool addPreEmitPass() override;
@@ -51,6 +56,12 @@
   return new XCorePassConfig(this, PM);
 }
 
+void XCorePassConfig::addIRPasses() {
+  addPass(createAtomicExpandPass(&getXCoreTargetMachine()));
+
+  TargetPassConfig::addIRPasses();
+}
+
 bool XCorePassConfig::addPreISel() {
   addPass(createXCoreLowerThreadLocalPass());
   return false;

diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 14c43bf..8ff9269 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCORETARGETMACHINE_H
-#define XCORETARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
 
 #include "XCoreSubtarget.h"
 #include "llvm/Target/TargetMachine.h"
@@ -20,37 +20,24 @@
 namespace llvm {
 
 class XCoreTargetMachine : public LLVMTargetMachine {
+  std::unique_ptr<TargetLoweringObjectFile> TLOF;
   XCoreSubtarget Subtarget;
 public:
   XCoreTargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS, const TargetOptions &Options,
                      Reloc::Model RM, CodeModel::Model CM,
                      CodeGenOpt::Level OL);
+  ~XCoreTargetMachine() override;
 
-  const XCoreInstrInfo *getInstrInfo() const override {
-    return getSubtargetImpl()->getInstrInfo();
-  }
-  const XCoreFrameLowering *getFrameLowering() const override {
-    return getSubtargetImpl()->getFrameLowering();
-  }
   const XCoreSubtarget *getSubtargetImpl() const override { return &Subtarget; }
-  const XCoreTargetLowering *getTargetLowering() const override {
-    return getSubtargetImpl()->getTargetLowering();
-  }
-  const XCoreSelectionDAGInfo* getSelectionDAGInfo() const override {
-    return getSubtargetImpl()->getSelectionDAGInfo();
-  }
-  const TargetRegisterInfo *getRegisterInfo() const override {
-    return getSubtargetImpl()->getRegisterInfo();
-  }
-  const DataLayout *getDataLayout() const override {
-    return getSubtargetImpl()->getDataLayout();
-  }
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
   void addAnalysisPasses(PassManagerBase &PM) override;
+  TargetLoweringObjectFile *getObjFileLowering() const override {
+    return TLOF.get();
+  }
 };
 
 } // end namespace llvm

diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp
index ab0f7ad..86d0de6 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp

@@ -145,9 +145,9 @@
     if (Kind.isMergeableConst16())      return MergeableConst16Section;
   }
   Type *ObjType = GV->getType()->getPointerElementType();
-  if (TM.getCodeModel() == CodeModel::Small ||
-      !ObjType->isSized() ||
-      TM.getDataLayout()->getTypeAllocSize(ObjType) < CodeModelLargeSize) {
+  if (TM.getCodeModel() == CodeModel::Small || !ObjType->isSized() ||
+      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(ObjType) <
+          CodeModelLargeSize) {
     if (Kind.isReadOnly())              return UseCPRel? ReadOnlySection
                                                        : DataRelROSection;
     if (Kind.isBSS() || Kind.isCommon())return BSSSection;
@@ -165,8 +165,9 @@
   report_fatal_error("Target does not support TLS or Common sections");
 }
 
-const MCSection *XCoreTargetObjectFile::
-getSectionForConstant(SectionKind Kind) const {
+const MCSection *
+XCoreTargetObjectFile::getSectionForConstant(SectionKind Kind,
+                                             const Constant *C) const {
   if (Kind.isMergeableConst4())           return MergeableConst4Section;
   if (Kind.isMergeableConst8())           return MergeableConst8Section;
   if (Kind.isMergeableConst16())          return MergeableConst16Section;

diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h
index 34d756e..7d3f49d 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.h
+++ b/lib/Target/XCore/XCoreTargetObjectFile.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
-#define LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 
@@ -34,7 +34,8 @@
                              Mangler &Mang,
                              const TargetMachine &TM) const override;
 
-    const MCSection *getSectionForConstant(SectionKind Kind) const override;
+    const MCSection *getSectionForConstant(SectionKind Kind,
+                                           const Constant *C) const override;
   };
 } // end namespace llvm
 

diff --git a/lib/Target/XCore/XCoreTargetStreamer.h b/lib/Target/XCore/XCoreTargetStreamer.h
index 0a394da..48bf0fa 100644
--- a/lib/Target/XCore/XCoreTargetStreamer.h
+++ b/lib/Target/XCore/XCoreTargetStreamer.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef XCORETARGETSTREAMER_H
-#define XCORETARGETSTREAMER_H
+#ifndef LLVM_LIB_TARGET_XCORE_XCORETARGETSTREAMER_H
+#define LLVM_LIB_TARGET_XCORE_XCORETARGETSTREAMER_H
 
 #include "llvm/MC/MCStreamer.h"
 

diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
index 80d193d..da232da 100644
--- a/lib/Target/XCore/XCoreTargetTransformInfo.cpp
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.cpp

@@ -43,17 +43,17 @@
     initializeXCoreTTIPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual void initializePass() override {
+  void initializePass() override {
     pushTTIStack(this);
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     TargetTransformInfo::getAnalysisUsage(AU);
   }
 
   static char ID;
 
-  virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+  void *getAdjustedAnalysisPointer(const void *ID) override {
     if (ID == &TargetTransformInfo::ID)
       return (TargetTransformInfo*)this;
     return this;

diff --git a/lib/Transforms/Hello/CMakeLists.txt b/lib/Transforms/Hello/CMakeLists.txt
index e724dbc..3851b35 100644
--- a/lib/Transforms/Hello/CMakeLists.txt
+++ b/lib/Transforms/Hello/CMakeLists.txt

@@ -6,6 +6,10 @@
   endif()
 endif()
 
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core Support)
+endif()
+
 add_llvm_loadable_module( LLVMHello
   Hello.cpp
   )

diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index f9de54a..c4706e8 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp

@@ -78,11 +78,15 @@
 
     const DataLayout *DL;
   private:
+    bool isDenselyPacked(Type *type);
+    bool canPaddingBeAccessed(Argument *Arg);
     CallGraphNode *PromoteArguments(CallGraphNode *CGN);
     bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const;
     CallGraphNode *DoPromotion(Function *F,
-                               SmallPtrSet<Argument*, 8> &ArgsToPromote,
-                               SmallPtrSet<Argument*, 8> &ByValArgsToTransform);
+                              SmallPtrSetImpl<Argument*> &ArgsToPromote,
+                              SmallPtrSetImpl<Argument*> &ByValArgsToTransform);
+    
+    using llvm::Pass::doInitialization;
     bool doInitialization(CallGraph &CG) override;
     /// The maximum number of elements to expand, or 0 for unlimited.
     unsigned maxElements;
@@ -123,6 +127,78 @@
   return Changed;
 }
 
+/// \brief Returns true if \p type is known to contain no padding bytes.
+bool ArgPromotion::isDenselyPacked(Type *type) {
+
+  // There is no size information, so be conservative.
+  if (!type->isSized())
+    return false;
+
+  // Without DataLayout, or if alloc size differs from storage size (e.g.
+  // x86_fp80 on x86-64: size 80, alloc size 128), assume there is padding.
+  if (!DL || DL->getTypeSizeInBits(type) != DL->getTypeAllocSizeInBits(type))
+    return false;
+
+  if (!isa<CompositeType>(type))
+    return true;
+
+  // For homogeneous sequential types, check for padding within members.
+  if (SequentialType *seqTy = dyn_cast<SequentialType>(type))
+    return isa<PointerType>(seqTy) || isDenselyPacked(seqTy->getElementType());
+
+  // Check for padding within and between elements of a struct.
+  StructType *StructTy = cast<StructType>(type);
+  const StructLayout *Layout = DL->getStructLayout(StructTy);
+  uint64_t StartPos = 0;
+  for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
+    Type *ElTy = StructTy->getElementType(i);
+    if (!isDenselyPacked(ElTy))
+      return false;
+    if (StartPos != Layout->getElementOffsetInBits(i))
+      return false;
+    StartPos += DL->getTypeAllocSizeInBits(ElTy);
+  }
+
+  return true;
+}
+
+/// \brief Returns true if padding bytes of a byval argument might be accessed.
+bool ArgPromotion::canPaddingBeAccessed(Argument *arg) {
+
+  assert(arg->hasByValAttr());
+
+  // Track all the pointers to the argument to make sure they are not captured.
+  SmallPtrSet<Value *, 16> PtrValues;
+  PtrValues.insert(arg);
+
+  // Track all of the stores.
+  SmallVector<StoreInst *, 16> Stores;
+
+  // Walk the users transitively through GEPs and PHIs; any user other than a
+  // GEP, PHI, load or store makes us conservatively answer "yes".
+  SmallVector<Value *, 16> WorkList;
+  WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end());
+  while (!WorkList.empty()) {
+    Value *V = WorkList.back();
+    WorkList.pop_back();
+    if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
+      if (PtrValues.insert(V).second)
+        WorkList.insert(WorkList.end(), V->user_begin(), V->user_end());
+    } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
+      Stores.push_back(Store);
+    } else if (!isa<LoadInst>(V)) {
+      return true;
+    }
+  }
+
+  // A store whose value operand is a tracked pointer captures that pointer.
+  for (StoreInst *Store : Stores)
+    if (PtrValues.count(Store->getValueOperand()))
+      return true;
+
+  return false;
+}
+
 /// PromoteArguments - This method checks the specified function to see if there
 /// are any promotable arguments and if it is safe to promote the function (for
 /// example, all callers are direct).  If safe to promote some arguments, it
@@ -154,6 +230,13 @@
       isSelfRecursive = true;
   }
   
+  // Don't promote arguments for variadic functions. Adding, removing, or
+  // changing non-pack parameters can change the classification of pack
+  // parameters. Frontends encode that classification at the call site in the
+  // IR, while in the callee the classification is determined dynamically based
+  // on the number of registers consumed so far.
+  if (F->isVarArg()) return nullptr;
+
   // Check to see which arguments are promotable.  If an argument is promotable,
   // add it to ArgsToPromote.
   SmallPtrSet<Argument*, 8> ArgsToPromote;
@@ -163,9 +246,13 @@
     Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
 
     // If this is a byval argument, and if the aggregate type is small, just
-    // pass the elements, which is always safe.  This does not apply to
-    // inalloca.
-    if (PtrArg->hasByValAttr()) {
+    // pass the elements, which is always safe, if the passed value is densely
+    // packed or if we can prove the padding bytes are never accessed. This does
+    // not apply to inalloca.
+    bool isSafeToPromote =
+      PtrArg->hasByValAttr() &&
+      (isDenselyPacked(AgTy) || !canPaddingBeAccessed(PtrArg));
+    if (isSafeToPromote) {
       if (StructType *STy = dyn_cast<StructType>(AgTy)) {
         if (maxElements > 0 && STy->getNumElements() > maxElements) {
           DEBUG(dbgs() << "argpromotion disable promoting argument '"
@@ -443,7 +530,7 @@
         // of elements of the aggregate.
         return false;
       }
-      ToPromote.insert(Operands);
+      ToPromote.insert(std::move(Operands));
     }
   }
 
@@ -475,10 +562,8 @@
     // loading block.
     for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
       BasicBlock *P = *PI;
-      for (idf_ext_iterator<BasicBlock*, SmallPtrSet<BasicBlock*, 16> >
-             I = idf_ext_begin(P, TranspBlocks),
-             E = idf_ext_end(P, TranspBlocks); I != E; ++I)
-        if (AA.canBasicBlockModify(**I, Loc))
+      for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
+        if (AA.canBasicBlockModify(*TranspBB, Loc))
           return false;
     }
   }
@@ -493,8 +578,8 @@
 /// arguments, and returns the new function.  At this point, we know that it's
 /// safe to do so.
 CallGraphNode *ArgPromotion::DoPromotion(Function *F,
-                               SmallPtrSet<Argument*, 8> &ArgsToPromote,
-                              SmallPtrSet<Argument*, 8> &ByValArgsToTransform) {
+                             SmallPtrSetImpl<Argument*> &ArgsToPromote,
+                             SmallPtrSetImpl<Argument*> &ByValArgsToTransform) {
 
   // Start by computing a new prototype for the function, which is the same as
   // the old function, but has modified arguments.
@@ -615,9 +700,15 @@
 
   // Patch the pointer to LLVM function in debug info descriptor.
   auto DI = FunctionDIs.find(F);
-  if (DI != FunctionDIs.end())
-    DI->second.replaceFunction(NF);
-  
+  if (DI != FunctionDIs.end()) {
+    DISubprogram SP = DI->second;
+    SP.replaceFunction(NF);
+    // Ensure the map is updated so it can be reused on subsequent argument
+    // promotions of the same function.
+    FunctionDIs.erase(DI);
+    FunctionDIs[NF] = SP;
+  }
+
   DEBUG(dbgs() << "ARG PROMOTION:  Promoting to:" << *NF << "\n"
         << "From: " << *F);
   
@@ -716,9 +807,11 @@
           // of the previous load.
           LoadInst *newLoad = new LoadInst(V, V->getName()+".val", Call);
           newLoad->setAlignment(OrigLoad->getAlignment());
-          // Transfer the TBAA info too.
-          newLoad->setMetadata(LLVMContext::MD_tbaa,
-                               OrigLoad->getMetadata(LLVMContext::MD_tbaa));
+          // Transfer the AA info too.
+          AAMDNodes AAInfo;
+          OrigLoad->getAAMetadata(AAInfo);
+          newLoad->setAAMetadata(AAInfo);
+
           Args.push_back(newLoad);
           AA.copyValue(OrigLoad, Args.back());
         }

diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index 23be081..0b6ade9 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp

@@ -66,7 +66,7 @@
 
 /// Find values that are marked as llvm.used.
 static void FindUsedValues(GlobalVariable *LLVMUsed,
-                           SmallPtrSet<const GlobalValue*, 8> &UsedValues) {
+                           SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
   if (!LLVMUsed) return;
   ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
 

diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index ac3853d..4045c09 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp

@@ -199,10 +199,15 @@
     return false;
 
   // Okay, we know we can transform this function if safe.  Scan its body
-  // looking for calls to llvm.vastart.
+  // looking for calls marked musttail or calls to llvm.vastart.
   for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      CallInst *CI = dyn_cast<CallInst>(I);
+      if (!CI)
+        continue;
+      if (CI->isMustTailCall())
+        return false;
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
         if (II->getIntrinsicID() == Intrinsic::vastart)
           return false;
       }
@@ -297,8 +302,14 @@
 
   // Patch the pointer to LLVM function in debug info descriptor.
   auto DI = FunctionDIs.find(&Fn);
-  if (DI != FunctionDIs.end())
-    DI->second.replaceFunction(NF);
+  if (DI != FunctionDIs.end()) {
+    DISubprogram SP = DI->second;
+    SP.replaceFunction(NF);
+    // Ensure the map is updated so it can be reused on non-varargs argument
+    // eliminations of the same function.
+    FunctionDIs.erase(DI);
+    FunctionDIs[NF] = SP;
+  }
 
   // Fix up any BlockAddresses that refer to the function.
   Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
@@ -1088,8 +1099,8 @@
   // determine that dead arguments passed into recursive functions are dead).
   //
   DEBUG(dbgs() << "DAE - Determining liveness\n");
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
-    SurveyFunction(*I);
+  for (auto &F : M)
+    SurveyFunction(F);
 
   // Now, remove all dead arguments and return values from each function in
   // turn.
@@ -1102,11 +1113,8 @@
 
   // Finally, look for any unused parameters in functions with non-local
   // linkage and replace the passed in parameters with undef.
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
-    Function& F = *I;
-
+  for (auto &F : M)
     Changed |= RemoveDeadArgumentsFromCallers(F);
-  }
 
   return Changed;
 }

diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index 40ec9fa..2f8c7d9 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp

@@ -91,7 +91,7 @@
             continue;
         }
 
-	makeVisible(*I, Delete);
+        makeVisible(*I, Delete);
 
         if (Delete)
           I->setInitializer(nullptr);
@@ -106,7 +106,7 @@
             continue;
         }
 
-	makeVisible(*I, Delete);
+        makeVisible(*I, Delete);
 
         if (Delete)
           I->deleteBody();
@@ -118,8 +118,8 @@
         Module::alias_iterator CurI = I;
         ++I;
 
-	bool Delete = deleteStuff == (bool)Named.count(CurI);
-	makeVisible(*CurI, Delete);
+        bool Delete = deleteStuff == (bool)Named.count(CurI);
+        makeVisible(*CurI, Delete);
 
         if (Delete) {
           Type *Ty =  CurI->getType()->getElementType();
@@ -148,7 +148,7 @@
   char GVExtractorPass::ID = 0;
 }
 
-ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue*>& GVs, 
+ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue *> &GVs,
                                          bool deleteFn) {
   return new GVExtractorPass(GVs, deleteFn);
 }

diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 8174df9..823ae53 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp

@@ -161,8 +161,9 @@
   for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
     Function *F = (*I)->getFunction();
 
-    if (!F)
-      // External node - may write memory.  Just give up.
+    if (!F || F->hasFnAttribute(Attribute::OptimizeNone))
+      // External node or node we don't want to optimize - assume it may write
+      // memory and give up.
       return false;
 
     AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(F);
@@ -204,9 +205,11 @@
                  CI != CE; ++CI) {
               Value *Arg = *CI;
               if (Arg->getType()->isPointerTy()) {
+                AAMDNodes AAInfo;
+                I->getAAMetadata(AAInfo);
+
                 AliasAnalysis::Location Loc(Arg,
-                                            AliasAnalysis::UnknownSize,
-                                            I->getMetadata(LLVMContext::MD_tbaa));
+                                            AliasAnalysis::UnknownSize, AAInfo);
                 if (!AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) {
                   if (MRB & AliasAnalysis::Mod)
                     // Writes non-local memory.  Give up.
@@ -443,7 +446,7 @@
     case Instruction::AddrSpaceCast:
       // The original value is not read/written via this if the new value isn't.
       for (Use &UU : I->uses())
-        if (Visited.insert(&UU))
+        if (Visited.insert(&UU).second)
           Worklist.push_back(&UU);
       break;
 
@@ -457,7 +460,7 @@
       auto AddUsersToWorklistIfCapturing = [&] {
         if (Captures)
           for (Use &UU : I->uses())
-            if (Visited.insert(&UU))
+            if (Visited.insert(&UU).second)
               Worklist.push_back(&UU);
       };
 
@@ -525,7 +528,8 @@
   // looking up whether a given CallGraphNode is in this SCC.
   for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
     Function *F = (*I)->getFunction();
-    if (F && !F->isDeclaration() && !F->mayBeOverridden())
+    if (F && !F->isDeclaration() && !F->mayBeOverridden() &&
+        !F->hasFnAttribute(Attribute::OptimizeNone))
       SCCNodes.insert(F);
   }
 
@@ -539,8 +543,9 @@
   for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
     Function *F = (*I)->getFunction();
 
-    if (!F)
-      // External node - only a problem for arguments that we pass to it.
+    if (!F || F->hasFnAttribute(Attribute::OptimizeNone))
+      // External node or function we're trying not to optimize - only a problem
+      // for arguments that we pass to it.
       continue;
 
     // Definitions with weak linkage may be overridden at linktime with
@@ -792,8 +797,8 @@
   for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
     Function *F = (*I)->getFunction();
 
-    if (!F)
-      // External node - skip it;
+    if (!F || F->hasFnAttribute(Attribute::OptimizeNone))
+      // External node or node we don't want to optimize - skip it;
       return false;
 
     // Already noalias.
@@ -832,6 +837,9 @@
 /// given function and set any applicable attributes.  Returns true
 /// if any attributes were set and false otherwise.
 bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
+  if (F.hasFnAttribute(Attribute::OptimizeNone))
+    return false;
+
   FunctionType *FTy = F.getFunctionType();
   LibFunc::Func TheLibFunc;
   if (!(TLI->getLibFunc(F.getName(), TheLibFunc) && TLI->has(TheLibFunc)))

diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 7e7a4c0..705e929 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp

@@ -22,6 +22,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
 #include "llvm/Pass.h"
 using namespace llvm;
 
@@ -77,9 +78,6 @@
   // Remove empty functions from the global ctors list.
   Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
 
-  typedef std::multimap<const Comdat *, GlobalValue *> ComdatGVPairsTy;
-  ComdatGVPairsTy ComdatGVPairs;
-
   // Loop over the module, adding globals which are obviously necessary.
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
     Changed |= RemoveUnusedGlobalValue(*I);
@@ -87,8 +85,6 @@
     if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) {
       if (!I->isDiscardableIfUnused())
         GlobalIsNeeded(I);
-      else if (const Comdat *C = I->getComdat())
-        ComdatGVPairs.insert(std::make_pair(C, I));
     }
   }
 
@@ -100,8 +96,6 @@
     if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage()) {
       if (!I->isDiscardableIfUnused())
         GlobalIsNeeded(I);
-      else if (const Comdat *C = I->getComdat())
-        ComdatGVPairs.insert(std::make_pair(C, I));
     }
   }
 
@@ -111,26 +105,9 @@
     // Externally visible aliases are needed.
     if (!I->isDiscardableIfUnused()) {
       GlobalIsNeeded(I);
-    } else if (const Comdat *C = I->getComdat()) {
-      ComdatGVPairs.insert(std::make_pair(C, I));
     }
   }
 
-  for (ComdatGVPairsTy::iterator I = ComdatGVPairs.begin(),
-                                 E = ComdatGVPairs.end();
-       I != E;) {
-    ComdatGVPairsTy::iterator UB = ComdatGVPairs.upper_bound(I->first);
-    bool CanDiscard = std::all_of(I, UB, [](ComdatGVPairsTy::value_type Pair) {
-      return Pair.second->isDiscardableIfUnused();
-    });
-    if (!CanDiscard) {
-      std::for_each(I, UB, [this](ComdatGVPairsTy::value_type Pair) {
-        GlobalIsNeeded(Pair.second);
-      });
-    }
-    I = UB;
-  }
-
   // Now that all globals which are needed are in the AliveGlobals set, we loop
   // through the program, deleting those which are not alive.
   //
@@ -141,7 +118,12 @@
        I != E; ++I)
     if (!AliveGlobals.count(I)) {
       DeadGlobalVars.push_back(I);         // Keep track of dead globals
-      I->setInitializer(nullptr);
+      if (I->hasInitializer()) {
+        Constant *Init = I->getInitializer();
+        I->setInitializer(nullptr);
+        if (isSafeToDestroyConstant(Init))
+          Init->destroyConstant();
+      }
     }
 
   // The second pass drops the bodies of functions which are dead...
@@ -203,9 +185,22 @@
 /// recursively mark anything that it uses as also needed.
 void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
   // If the global is already in the set, no need to reprocess it.
-  if (!AliveGlobals.insert(G))
+  if (!AliveGlobals.insert(G).second)
     return;
-  
+
+  Module *M = G->getParent();
+  if (Comdat *C = G->getComdat()) {
+    for (Function &F : *M)
+      if (F.getComdat() == C)
+        GlobalIsNeeded(&F);
+    for (GlobalVariable &GV : M->globals())
+      if (GV.getComdat() == C)
+        GlobalIsNeeded(&GV);
+    for (GlobalAlias &GA : M->aliases())
+      if (GA.getComdat() == C)
+        GlobalIsNeeded(&GA);
+  }
+
   if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) {
     // If this is a global variable, we must make sure to add any global values
     // referenced by the initializer to the alive set.
@@ -243,7 +238,7 @@
   for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I) {
     // If we've already processed this constant there's no need to do it again.
     Constant *Op = dyn_cast<Constant>(*I);
-    if (Op && SeenConstants.insert(Op))
+    if (Op && SeenConstants.insert(Op).second)
       MarkUsedGlobalsAsNeeded(Op);
   }
 }

diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index c1d0d3b..6e0ae83 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp

@@ -88,6 +88,7 @@
 
     const DataLayout *DL;
     TargetLibraryInfo *TLI;
+    SmallSet<const Comdat *, 8> NotDiscardableComdats;
   };
 }
 
@@ -612,7 +613,7 @@
 /// value will trap if the value is dynamically null.  PHIs keeps track of any
 /// phi nodes we've seen to avoid reprocessing them.
 static bool AllUsesOfValueWillTrapIfNull(const Value *V,
-                                         SmallPtrSet<const PHINode*, 8> &PHIs) {
+                                        SmallPtrSetImpl<const PHINode*> &PHIs) {
   for (const User *U : V->users())
     if (isa<LoadInst>(U)) {
       // Will trap.
@@ -638,7 +639,7 @@
     } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
       // If we've already seen this phi node, ignore it, it has already been
       // checked.
-      if (PHIs.insert(PN) && !AllUsesOfValueWillTrapIfNull(PN, PHIs))
+      if (PHIs.insert(PN).second && !AllUsesOfValueWillTrapIfNull(PN, PHIs))
         return false;
     } else if (isa<ICmpInst>(U) &&
                isa<ConstantPointerNull>(U->getOperand(1))) {
@@ -957,7 +958,7 @@
 /// it is to the specified global.
 static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
                                                       const GlobalVariable *GV,
-                                         SmallPtrSet<const PHINode*, 8> &PHIs) {
+                                        SmallPtrSetImpl<const PHINode*> &PHIs) {
   for (const User *U : V->users()) {
     const Instruction *Inst = cast<Instruction>(U);
 
@@ -981,7 +982,7 @@
     if (const PHINode *PN = dyn_cast<PHINode>(Inst)) {
       // PHIs are ok if all uses are ok.  Don't infinitely recurse through PHI
       // cycles.
-      if (PHIs.insert(PN))
+      if (PHIs.insert(PN).second)
         if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
           return false;
       continue;
@@ -1047,8 +1048,8 @@
 /// of a load) are simple enough to perform heap SRA on.  This permits GEP's
 /// that index through the array and struct field, icmps of null, and PHIs.
 static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
-                        SmallPtrSet<const PHINode*, 32> &LoadUsingPHIs,
-                        SmallPtrSet<const PHINode*, 32> &LoadUsingPHIsPerLoad) {
+                        SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs,
+                        SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) {
   // We permit two users of the load: setcc comparing against the null
   // pointer, and a getelementptr of a specific form.
   for (const User *U : V->users()) {
@@ -1072,11 +1073,11 @@
     }
 
     if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
-      if (!LoadUsingPHIsPerLoad.insert(PN))
+      if (!LoadUsingPHIsPerLoad.insert(PN).second)
         // This means some phi nodes are dependent on each other.
         // Avoid infinite looping!
         return false;
-      if (!LoadUsingPHIs.insert(PN))
+      if (!LoadUsingPHIs.insert(PN).second)
         // If we have already analyzed this PHI, then it is safe.
         continue;
 
@@ -1115,9 +1116,7 @@
   // that all inputs the to the PHI nodes are in the same equivalence sets.
   // Check to verify that all operands of the PHIs are either PHIS that can be
   // transformed, loads from GV, or MI itself.
-  for (SmallPtrSet<const PHINode*, 32>::const_iterator I = LoadUsingPHIs.begin()
-       , E = LoadUsingPHIs.end(); I != E; ++I) {
-    const PHINode *PN = *I;
+  for (const PHINode *PN : LoadUsingPHIs) {
     for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
       Value *InVal = PN->getIncomingValue(op);
 
@@ -1910,8 +1909,11 @@
     // Functions without names cannot be referenced outside this module.
     if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
       F->setLinkage(GlobalValue::InternalLinkage);
+
+    const Comdat *C = F->getComdat();
+    bool inComdat = C && NotDiscardableComdats.count(C);
     F->removeDeadConstantUsers();
-    if (F->isDefTriviallyDead()) {
+    if ((!inComdat || F->hasLocalLinkage()) && F->isDefTriviallyDead()) {
       F->eraseFromParent();
       Changed = true;
       ++NumFnDeleted;
@@ -1943,12 +1945,6 @@
 bool GlobalOpt::OptimizeGlobalVars(Module &M) {
   bool Changed = false;
 
-  SmallSet<const Comdat *, 8> NotDiscardableComdats;
-  for (const GlobalVariable &GV : M.globals())
-    if (const Comdat *C = GV.getComdat())
-      if (!GV.isDiscardableIfUnused())
-        NotDiscardableComdats.insert(C);
-
   for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
        GVI != E; ) {
     GlobalVariable *GV = GVI++;
@@ -1965,7 +1961,7 @@
 
     if (GV->isDiscardableIfUnused()) {
       if (const Comdat *C = GV->getComdat())
-        if (NotDiscardableComdats.count(C))
+        if (NotDiscardableComdats.count(C) && !GV->hasLocalLinkage())
           continue;
       Changed |= ProcessGlobal(GV, GVI);
     }
@@ -1975,7 +1971,7 @@
 
 static inline bool
 isSimpleEnoughValueToCommit(Constant *C,
-                            SmallPtrSet<Constant*, 8> &SimpleConstants,
+                            SmallPtrSetImpl<Constant*> &SimpleConstants,
                             const DataLayout *DL);
 
 
@@ -1988,7 +1984,7 @@
 /// in SimpleConstants to avoid having to rescan the same constants all the
 /// time.
 static bool isSimpleEnoughValueToCommitHelper(Constant *C,
-                                   SmallPtrSet<Constant*, 8> &SimpleConstants,
+                                   SmallPtrSetImpl<Constant*> &SimpleConstants,
                                    const DataLayout *DL) {
   // Simple global addresses are supported, do not allow dllimport or
   // thread-local globals.
@@ -2046,10 +2042,11 @@
 
 static inline bool
 isSimpleEnoughValueToCommit(Constant *C,
-                            SmallPtrSet<Constant*, 8> &SimpleConstants,
+                            SmallPtrSetImpl<Constant*> &SimpleConstants,
                             const DataLayout *DL) {
   // If we already checked this constant, we win.
-  if (!SimpleConstants.insert(C)) return true;
+  if (!SimpleConstants.insert(C).second)
+    return true;
   // Check the constant.
   return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
 }
@@ -2217,7 +2214,7 @@
     return MutatedMemory;
   }
 
-  const SmallPtrSet<GlobalVariable*, 8> &getInvariants() const {
+  const SmallPtrSetImpl<GlobalVariable*> &getInvariants() const {
     return Invariants;
   }
 
@@ -2394,6 +2391,17 @@
                                            getVal(SI->getOperand(2)));
       DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
             << "\n");
+    } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
+      InstResult = ConstantExpr::getExtractValue(
+          getVal(EVI->getAggregateOperand()), EVI->getIndices());
+      DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: " << *InstResult
+                   << "\n");
+    } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
+      InstResult = ConstantExpr::getInsertValue(
+          getVal(IVI->getAggregateOperand()),
+          getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
+      DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: " << *InstResult
+                   << "\n");
     } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
       Constant *P = getVal(GEP->getOperand(0));
       SmallVector<Constant*, 8> GEPOps;
@@ -2663,7 +2671,7 @@
     // Okay, we succeeded in evaluating this control flow.  See if we have
     // executed the new block before.  If so, we have a looping function,
     // which we cannot evaluate in reasonable time.
-    if (!ExecutedBlocks.insert(NextBB))
+    if (!ExecutedBlocks.insert(NextBB).second)
       return false;  // looped!
 
     // Okay, we have never been in this block before.  Check to see if there
@@ -2700,10 +2708,8 @@
            Eval.getMutatedMemory().begin(), E = Eval.getMutatedMemory().end();
          I != E; ++I)
       CommitValueTo(I->second, I->first);
-    for (SmallPtrSet<GlobalVariable*, 8>::const_iterator I =
-           Eval.getInvariants().begin(), E = Eval.getInvariants().end();
-         I != E; ++I)
-      (*I)->setConstant(true);
+    for (GlobalVariable *GV : Eval.getInvariants())
+      GV->setConstant(true);
   }
 
   return EvalSuccess;
@@ -2714,7 +2720,7 @@
 }
 
 static void setUsedInitializer(GlobalVariable &V,
-                               SmallPtrSet<GlobalValue *, 8> Init) {
+                               const SmallPtrSet<GlobalValue *, 8> &Init) {
   if (Init.empty()) {
     V.eraseFromParent();
     return;
@@ -2724,10 +2730,9 @@
   PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0);
 
   SmallVector<llvm::Constant *, 8> UsedArray;
-  for (SmallPtrSet<GlobalValue *, 8>::iterator I = Init.begin(), E = Init.end();
-       I != E; ++I) {
+  for (GlobalValue *GV : Init) {
     Constant *Cast
-      = ConstantExpr::getPointerBitCastOrAddrSpaceCast(*I, Int8PtrTy);
+      = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy);
     UsedArray.push_back(Cast);
   }
   // Sort to get deterministic order.
@@ -2758,18 +2763,27 @@
     CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true);
   }
   typedef SmallPtrSet<GlobalValue *, 8>::iterator iterator;
+  typedef iterator_range<iterator> used_iterator_range;
   iterator usedBegin() { return Used.begin(); }
   iterator usedEnd() { return Used.end(); }
+  used_iterator_range used() {
+    return used_iterator_range(usedBegin(), usedEnd());
+  }
   iterator compilerUsedBegin() { return CompilerUsed.begin(); }
   iterator compilerUsedEnd() { return CompilerUsed.end(); }
+  used_iterator_range compilerUsed() {
+    return used_iterator_range(compilerUsedBegin(), compilerUsedEnd());
+  }
   bool usedCount(GlobalValue *GV) const { return Used.count(GV); }
   bool compilerUsedCount(GlobalValue *GV) const {
     return CompilerUsed.count(GV);
   }
   bool usedErase(GlobalValue *GV) { return Used.erase(GV); }
   bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); }
-  bool usedInsert(GlobalValue *GV) { return Used.insert(GV); }
-  bool compilerUsedInsert(GlobalValue *GV) { return CompilerUsed.insert(GV); }
+  bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; }
+  bool compilerUsedInsert(GlobalValue *GV) {
+    return CompilerUsed.insert(GV).second;
+  }
 
   void syncVariablesAndSets() {
     if (UsedV)
@@ -2814,7 +2828,8 @@
   return U.usedCount(&GA) || U.compilerUsedCount(&GA);
 }
 
-static bool hasUsesToReplace(GlobalAlias &GA, LLVMUsed &U, bool &RenameTarget) {
+static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
+                             bool &RenameTarget) {
   RenameTarget = false;
   bool Ret = false;
   if (hasUseOtherThanLLVMUsed(GA, U))
@@ -2849,10 +2864,8 @@
   bool Changed = false;
   LLVMUsed Used(M);
 
-  for (SmallPtrSet<GlobalValue *, 8>::iterator I = Used.usedBegin(),
-                                               E = Used.usedEnd();
-       I != E; ++I)
-    Used.compilerUsedErase(*I);
+  for (GlobalValue *GV : Used.used())
+    Used.compilerUsedErase(GV);
 
   for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
        I != E;) {
@@ -2963,7 +2976,7 @@
       SmallPtrSet<const Function *, 8> NewCalledFunctions(CalledFunctions);
 
       // Don't treat recursive functions as empty.
-      if (!NewCalledFunctions.insert(CalledFn))
+      if (!NewCalledFunctions.insert(CalledFn).second)
         return false;
 
       if (!cxxDtorIsEmpty(*CalledFn, NewCalledFunctions))
@@ -3035,6 +3048,20 @@
   while (LocalChange) {
     LocalChange = false;
 
+    NotDiscardableComdats.clear();
+    for (const GlobalVariable &GV : M.globals())
+      if (const Comdat *C = GV.getComdat())
+        if (!GV.isDiscardableIfUnused() || !GV.use_empty())
+          NotDiscardableComdats.insert(C);
+    for (Function &F : M)
+      if (const Comdat *C = F.getComdat())
+        if (!F.isDefTriviallyDead())
+          NotDiscardableComdats.insert(C);
+    for (GlobalAlias &GA : M.aliases())
+      if (const Comdat *C = GA.getComdat())
+        if (!GA.isDiscardableIfUnused() || !GA.use_empty())
+          NotDiscardableComdats.insert(C);
+
     // Delete functions that are trivially dead, ccc -> fastcc
     LocalChange |= OptimizeFunctions(M);
 

diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index 624cb90..819b2e0 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp

@@ -14,6 +14,8 @@
 
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/IR/CallSite.h"
@@ -65,6 +67,8 @@
 char AlwaysInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
                 "Inliner for always_inline functions", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(AlwaysInliner, "always-inline",

diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index d189756..d9a2b9e 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp

@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/IR/CallSite.h"
@@ -73,6 +75,8 @@
 char SimpleInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
                 "Function Integration/Inlining", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(SimpleInliner, "inline",

diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 9087ab2..3abe7a8 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp

@@ -16,6 +16,8 @@
 #include "llvm/Transforms/IPO/InlinerPass.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/IR/CallSite.h"
@@ -74,6 +76,8 @@
 /// the call graph.  If the derived class implements this method, it should
 /// always explicitly call the implementation here.
 void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AliasAnalysis>();
+  AU.addRequired<AssumptionTracker>();
   CallGraphSCCPass::getAnalysisUsage(AU);
 }
 
@@ -215,7 +219,7 @@
       
       // If the inlined function already uses this alloca then we can't reuse
       // it.
-      if (!UsedAllocas.insert(AvailableAlloca))
+      if (!UsedAllocas.insert(AvailableAlloca).second)
         continue;
       
       // Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
@@ -357,8 +361,7 @@
   // FIXME: All of this logic should be sunk into getInlineCost. It relies on
   // the internal implementation of the inline cost metrics rather than
   // treating them as truly abstract units etc.
-  if (Caller->hasLocalLinkage() ||
-      Caller->getLinkage() == GlobalValue::LinkOnceODRLinkage) {
+  if (Caller->hasLocalLinkage() || Caller->hasLinkOnceODRLinkage()) {
     int TotalSecondaryCost = 0;
     // The candidate cost to be imposed upon the current function.
     int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1);
@@ -440,9 +443,11 @@
 
 bool Inliner::runOnSCC(CallGraphSCC &SCC) {
   CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
   const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+  AliasAnalysis *AA = &getAnalysis<AliasAnalysis>();
 
   SmallPtrSet<Function*, 8> SCCFunctions;
   DEBUG(dbgs() << "Inliner visiting SCC:");
@@ -501,7 +506,7 @@
 
   
   InlinedArrayAllocasTy InlinedArrayAllocas;
-  InlineFunctionInfo InlineInfo(&CG, DL);
+  InlineFunctionInfo InlineInfo(&CG, DL, AA, AT);
   
   // Now that we have all of the call sites, loop over them and inline them if
   // it looks profitable to do so.
@@ -664,6 +669,13 @@
 
     if (!F->isDefTriviallyDead())
       continue;
+
+    // It is unsafe to drop a function with discardable linkage from a COMDAT
+    // without also dropping the other members of the COMDAT.
+    // The inliner doesn't visit non-function entities which are in COMDAT
+    // groups so it is unsafe to do so *unless* the linkage is local.
+    if (!F->hasLocalLinkage() && F->hasComdat())
+      continue;
     
     // Remove any call graph edges from the function to its callees.
     CGN->removeAllCalledFunctions();

diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index c970a1a..7950163 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp

@@ -148,9 +148,7 @@
   // we don't see references from function local inline assembly. To be
   // conservative, we internalize symbols in llvm.compiler.used, but we
   // keep llvm.compiler.used so that the symbol is not deleted by llvm.
-  for (SmallPtrSet<GlobalValue *, 8>::iterator I = Used.begin(), E = Used.end();
-       I != E; ++I) {
-    GlobalValue *V = *I;
+  for (GlobalValue *V : Used) {
     ExternalNames.insert(V->getName());
   }
 

diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 559ef0b..b91ebf2 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp

@@ -286,7 +286,7 @@
   /// 6.4.Load: range metadata (as integer numbers)
   /// On this stage its better to see the code, since its not more than 10-15
   /// strings for particular instruction, and could change sometimes.
-  int cmpOperation(const Instruction *L, const Instruction *R) const;
+  int cmpOperations(const Instruction *L, const Instruction *R) const;
 
   /// Compare two GEPs for equivalent pointer arithmetic.
   /// Parts to be compared for each comparison stage,
@@ -297,9 +297,9 @@
   /// 3. Pointer operand type (using cmpType method).
   /// 4. Number of operands.
   /// 5. Compare operands, using cmpValues method.
-  int cmpGEP(const GEPOperator *GEPL, const GEPOperator *GEPR);
-  int cmpGEP(const GetElementPtrInst *GEPL, const GetElementPtrInst *GEPR) {
-    return cmpGEP(cast<GEPOperator>(GEPL), cast<GEPOperator>(GEPR));
+  int cmpGEPs(const GEPOperator *GEPL, const GEPOperator *GEPR);
+  int cmpGEPs(const GetElementPtrInst *GEPL, const GetElementPtrInst *GEPR) {
+    return cmpGEPs(cast<GEPOperator>(GEPL), cast<GEPOperator>(GEPR));
   }
 
   /// cmpType - compares two types,
@@ -342,12 +342,12 @@
   /// be checked with the same way. If we get Res != 0 on some stage, return it.
   /// Otherwise return 0.
   /// 6. For all other cases put llvm_unreachable.
-  int cmpType(Type *TyL, Type *TyR) const;
+  int cmpTypes(Type *TyL, Type *TyR) const;
 
   int cmpNumbers(uint64_t L, uint64_t R) const;
 
-  int cmpAPInt(const APInt &L, const APInt &R) const;
-  int cmpAPFloat(const APFloat &L, const APFloat &R) const;
+  int cmpAPInts(const APInt &L, const APInt &R) const;
+  int cmpAPFloats(const APFloat &L, const APFloat &R) const;
   int cmpStrings(StringRef L, StringRef R) const;
   int cmpAttrs(const AttributeSet L, const AttributeSet R) const;
 
@@ -392,15 +392,15 @@
   DenseMap<const Value*, int> sn_mapL, sn_mapR;
 };
 
-class FunctionPtr {
+class FunctionNode {
   AssertingVH<Function> F;
   const DataLayout *DL;
 
 public:
-  FunctionPtr(Function *F, const DataLayout *DL) : F(F), DL(DL) {}
+  FunctionNode(Function *F, const DataLayout *DL) : F(F), DL(DL) {}
   Function *getFunc() const { return F; }
   void release() { F = 0; }
-  bool operator<(const FunctionPtr &RHS) const {
+  bool operator<(const FunctionNode &RHS) const {
     return (FunctionComparator(DL, F, RHS.getFunc()).compare()) == -1;
   }
 };
@@ -412,7 +412,7 @@
   return 0;
 }
 
-int FunctionComparator::cmpAPInt(const APInt &L, const APInt &R) const {
+int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
   if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
     return Res;
   if (L.ugt(R)) return 1;
@@ -420,11 +420,11 @@
   return 0;
 }
 
-int FunctionComparator::cmpAPFloat(const APFloat &L, const APFloat &R) const {
+int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
   if (int Res = cmpNumbers((uint64_t)&L.getSemantics(),
                            (uint64_t)&R.getSemantics()))
     return Res;
-  return cmpAPInt(L.bitcastToAPInt(), R.bitcastToAPInt());
+  return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt());
 }
 
 int FunctionComparator::cmpStrings(StringRef L, StringRef R) const {
@@ -474,7 +474,7 @@
   // Check whether types are bitcastable. This part is just re-factored
   // Type::canLosslesslyBitCastTo method, but instead of returning true/false,
   // we also pack into result which type is "less" for us.
-  int TypesRes = cmpType(TyL, TyR);
+  int TypesRes = cmpTypes(TyL, TyR);
   if (TypesRes != 0) {
     // Types are different, but check whether we can bitcast them.
     if (!TyL->isFirstClassType()) {
@@ -541,12 +541,12 @@
   case Value::ConstantIntVal: {
     const APInt &LInt = cast<ConstantInt>(L)->getValue();
     const APInt &RInt = cast<ConstantInt>(R)->getValue();
-    return cmpAPInt(LInt, RInt);
+    return cmpAPInts(LInt, RInt);
   }
   case Value::ConstantFPVal: {
     const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF();
     const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF();
-    return cmpAPFloat(LAPF, RAPF);
+    return cmpAPFloats(LAPF, RAPF);
   }
   case Value::ConstantArrayVal: {
     const ConstantArray *LA = cast<ConstantArray>(L);
@@ -615,7 +615,7 @@
 /// cmpType - compares two types,
 /// defines total ordering among the types set.
 /// See method declaration comments for more details.
-int FunctionComparator::cmpType(Type *TyL, Type *TyR) const {
+int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
 
   PointerType *PTyL = dyn_cast<PointerType>(TyL);
   PointerType *PTyR = dyn_cast<PointerType>(TyR);
@@ -665,8 +665,7 @@
       return cmpNumbers(STyL->isPacked(), STyR->isPacked());
 
     for (unsigned i = 0, e = STyL->getNumElements(); i != e; ++i) {
-      if (int Res = cmpType(STyL->getElementType(i),
-                            STyR->getElementType(i)))
+      if (int Res = cmpTypes(STyL->getElementType(i), STyR->getElementType(i)))
         return Res;
     }
     return 0;
@@ -681,11 +680,11 @@
     if (FTyL->isVarArg() != FTyR->isVarArg())
       return cmpNumbers(FTyL->isVarArg(), FTyR->isVarArg());
 
-    if (int Res = cmpType(FTyL->getReturnType(), FTyR->getReturnType()))
+    if (int Res = cmpTypes(FTyL->getReturnType(), FTyR->getReturnType()))
       return Res;
 
     for (unsigned i = 0, e = FTyL->getNumParams(); i != e; ++i) {
-      if (int Res = cmpType(FTyL->getParamType(i), FTyR->getParamType(i)))
+      if (int Res = cmpTypes(FTyL->getParamType(i), FTyR->getParamType(i)))
         return Res;
     }
     return 0;
@@ -696,7 +695,7 @@
     ArrayType *ATyR = cast<ArrayType>(TyR);
     if (ATyL->getNumElements() != ATyR->getNumElements())
       return cmpNumbers(ATyL->getNumElements(), ATyR->getNumElements());
-    return cmpType(ATyL->getElementType(), ATyR->getElementType());
+    return cmpTypes(ATyL->getElementType(), ATyR->getElementType());
   }
   }
 }
@@ -705,8 +704,8 @@
 // and pointer-to-B are equivalent. This should be kept in sync with
 // Instruction::isSameOperationAs.
 // Read method declaration comments for more details.
-int FunctionComparator::cmpOperation(const Instruction *L,
-                                     const Instruction *R) const {
+int FunctionComparator::cmpOperations(const Instruction *L,
+                                      const Instruction *R) const {
   // Differences from Instruction::isSameOperationAs:
   //  * replace type comparison with calls to isEquivalentType.
   //  * we test for I->hasSameSubclassOptionalData (nuw/nsw/tail) at the top
@@ -717,7 +716,7 @@
   if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
     return Res;
 
-  if (int Res = cmpType(L->getType(), R->getType()))
+  if (int Res = cmpTypes(L->getType(), R->getType()))
     return Res;
 
   if (int Res = cmpNumbers(L->getRawSubclassOptionalData(),
@@ -728,7 +727,7 @@
   // if all operands are the same type
   for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) {
     if (int Res =
-            cmpType(L->getOperand(i)->getType(), R->getOperand(i)->getType()))
+            cmpTypes(L->getOperand(i)->getType(), R->getOperand(i)->getType()))
       return Res;
   }
 
@@ -766,13 +765,23 @@
     if (int Res = cmpNumbers(CI->getCallingConv(),
                              cast<CallInst>(R)->getCallingConv()))
       return Res;
-    return cmpAttrs(CI->getAttributes(), cast<CallInst>(R)->getAttributes());
+    if (int Res =
+            cmpAttrs(CI->getAttributes(), cast<CallInst>(R)->getAttributes()))
+      return Res;
+    return cmpNumbers(
+        (uint64_t)CI->getMetadata(LLVMContext::MD_range),
+        (uint64_t)cast<CallInst>(R)->getMetadata(LLVMContext::MD_range));
   }
   if (const InvokeInst *CI = dyn_cast<InvokeInst>(L)) {
     if (int Res = cmpNumbers(CI->getCallingConv(),
                              cast<InvokeInst>(R)->getCallingConv()))
       return Res;
-    return cmpAttrs(CI->getAttributes(), cast<InvokeInst>(R)->getAttributes());
+    if (int Res =
+            cmpAttrs(CI->getAttributes(), cast<InvokeInst>(R)->getAttributes()))
+      return Res;
+    return cmpNumbers(
+        (uint64_t)CI->getMetadata(LLVMContext::MD_range),
+        (uint64_t)cast<InvokeInst>(R)->getMetadata(LLVMContext::MD_range));
   }
   if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) {
     ArrayRef<unsigned> LIndices = IVI->getIndices();
@@ -835,7 +844,7 @@
 
 // Determine whether two GEP operations perform the same underlying arithmetic.
 // Read method declaration comments for more details.
-int FunctionComparator::cmpGEP(const GEPOperator *GEPL,
+int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
                                const GEPOperator *GEPR) {
 
   unsigned int ASL = GEPL->getPointerAddressSpace();
@@ -851,7 +860,7 @@
     APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
     if (GEPL->accumulateConstantOffset(*DL, OffsetL) &&
         GEPR->accumulateConstantOffset(*DL, OffsetR))
-      return cmpAPInt(OffsetL, OffsetR);
+      return cmpAPInts(OffsetL, OffsetR);
   }
 
   if (int Res = cmpNumbers((uint64_t)GEPL->getPointerOperand()->getType(),
@@ -935,10 +944,10 @@
       if (int Res =
               cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand()))
         return Res;
-      if (int Res = cmpGEP(GEPL, GEPR))
+      if (int Res = cmpGEPs(GEPL, GEPR))
         return Res;
     } else {
-      if (int Res = cmpOperation(InstL, InstR))
+      if (int Res = cmpOperations(InstL, InstR))
         return Res;
       assert(InstL->getNumOperands() == InstR->getNumOperands());
 
@@ -950,7 +959,7 @@
         if (int Res = cmpNumbers(OpL->getValueID(), OpR->getValueID()))
           return Res;
         // TODO: Already checked in cmpOperation
-        if (int Res = cmpType(OpL->getType(), OpR->getType()))
+        if (int Res = cmpTypes(OpL->getType(), OpR->getType()))
           return Res;
       }
     }
@@ -998,7 +1007,7 @@
   if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv()))
     return Res;
 
-  if (int Res = cmpType(FnL->getFunctionType(), FnR->getFunctionType()))
+  if (int Res = cmpTypes(FnL->getFunctionType(), FnR->getFunctionType()))
     return Res;
 
   assert(FnL->arg_size() == FnR->arg_size() &&
@@ -1040,7 +1049,7 @@
 
     assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
     for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
-      if (!VisitedBBs.insert(TermL->getSuccessor(i)))
+      if (!VisitedBBs.insert(TermL->getSuccessor(i)).second)
         continue;
 
       FnLBBs.push_back(TermL->getSuccessor(i));
@@ -1068,7 +1077,7 @@
   bool runOnModule(Module &M) override;
 
 private:
-  typedef std::set<FunctionPtr> FnTreeType;
+  typedef std::set<FunctionNode> FnTreeType;
 
   /// A work queue of functions that may have been modified and should be
   /// analyzed again.
@@ -1291,11 +1300,11 @@
     Value *Result = UndefValue::get(DestTy);
     for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
       Value *Element = createCast(
-          Builder, Builder.CreateExtractValue(V, ArrayRef<unsigned int>(I)),
+          Builder, Builder.CreateExtractValue(V, makeArrayRef(I)),
           DestTy->getStructElementType(I));
 
       Result =
-          Builder.CreateInsertValue(Result, Element, ArrayRef<unsigned int>(I));
+          Builder.CreateInsertValue(Result, Element, makeArrayRef(I));
     }
     return Result;
   }
@@ -1411,14 +1420,14 @@
 // that was already inserted.
 bool MergeFunctions::insert(Function *NewFunction) {
   std::pair<FnTreeType::iterator, bool> Result =
-      FnTree.insert(FunctionPtr(NewFunction, DL));
+      FnTree.insert(FunctionNode(NewFunction, DL));
 
   if (Result.second) {
     DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName() << '\n');
     return false;
   }
 
-  const FunctionPtr &OldF = *Result.first;
+  const FunctionNode &OldF = *Result.first;
 
   // Don't merge tiny functions, since it can just end up making the function
   // larger.
@@ -1448,7 +1457,7 @@
 void MergeFunctions::remove(Function *F) {
   // We need to make sure we remove F, not a function "equal" to F per the
   // function equality comparator.
-  FnTreeType::iterator found = FnTree.find(FunctionPtr(F, DL));
+  FnTreeType::iterator found = FnTree.find(FunctionNode(F, DL));
   size_t Erased = 0;
   if (found != FnTree.end() && found->getFunc() == F) {
     Erased = 1;

diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 46a3187..da85a91 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp

@@ -17,11 +17,14 @@
 #include "llvm-c/Transforms/PassManagerBuilder.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/Passes.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Vectorize.h"
@@ -45,6 +48,10 @@
   cl::init(false), cl::Hidden,
   cl::desc("Run GVN instead of Early CSE after vectorization passes"));
 
+static cl::opt<bool> ExtraVectorizerPasses(
+    "extra-vectorizer-passes", cl::init(false), cl::Hidden,
+    cl::desc("Run cleanup optimization passes after vectorization."));
+
 static cl::opt<bool> UseNewSROA("use-new-sroa",
   cl::init(true), cl::Hidden,
   cl::desc("Enable the new, experimental SROA pass"));
@@ -57,6 +64,20 @@
                                     cl::Hidden,
                                     cl::desc("Run the load combining pass"));
 
+static cl::opt<bool>
+RunSLPAfterLoopVectorization("run-slp-after-loop-vectorization",
+  cl::init(true), cl::Hidden,
+  cl::desc("Run the SLP vectorizer (and BB vectorizer) after the Loop "
+           "vectorizer instead of before"));
+
+static cl::opt<bool> UseCFLAA("use-cfl-aa",
+  cl::init(false), cl::Hidden,
+  cl::desc("Enable the new, experimental CFL alias analysis"));
+
+static cl::opt<bool>
+EnableMLSM("mlsm", cl::init(true), cl::Hidden,
+           cl::desc("Enable motion of merged load and store"));
+
 PassManagerBuilder::PassManagerBuilder() {
     OptLevel = 2;
     SizeLevel = 0;
@@ -70,6 +91,11 @@
     LoopVectorize = RunLoopVectorization;
     RerollLoops = RunLoopRerolling;
     LoadCombine = RunLoadCombine;
+    DisableGVNLoadPRE = false;
+    VerifyInput = false;
+    VerifyOutput = false;
+    StripDebug = false;
+    MergeFunctions = false;
 }
 
 PassManagerBuilder::~PassManagerBuilder() {
@@ -106,7 +132,10 @@
   // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
   // BasicAliasAnalysis wins if they disagree. This is intended to help
   // support "obvious" type-punning idioms.
+  if (UseCFLAA)
+    PM.add(createCFLAliasAnalysisPass());
   PM.add(createTypeBasedAliasAnalysisPass());
+  PM.add(createScopedNoAliasAAPass());
   PM.add(createBasicAliasAnalysisPass());
 }
 
@@ -130,18 +159,22 @@
 }
 
 void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
-  // If all optimizations are disabled, just run the always-inline pass.
+  // If all optimizations are disabled, just run the always-inline pass and,
+  // if enabled, the function merging pass.
   if (OptLevel == 0) {
     if (Inliner) {
       MPM.add(Inliner);
       Inliner = nullptr;
     }
 
-    // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
-    // pass manager, but we don't want to add extensions into that pass manager.
-    // To prevent this we must insert a no-op module pass to reset the pass
-    // manager to get the same behavior as EP_OptimizerLast in non-O0 builds.
-    if (!GlobalExtensions->empty() || !Extensions.empty())
+    // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly
+    // creates a CGSCC pass manager, but we don't want to add extensions into
+    // that pass manager. To prevent this we insert a no-op module pass to reset
+    // the pass manager to get the same behavior as EP_OptimizerLast in non-O0
+    // builds. The function merging pass, when enabled, takes that barrier's place.
+    if (MergeFunctions)
+      MPM.add(createMergeFunctionsPass());
+    else if (!GlobalExtensions->empty() || !Extensions.empty())
       MPM.add(createBarrierNoopPass());
 
     addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
@@ -207,8 +240,11 @@
     MPM.add(createSimpleLoopUnrollPass());    // Unroll small loops
   addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
 
-  if (OptLevel > 1)
-    MPM.add(createGVNPass());                 // Remove redundancies
+  if (OptLevel > 1) {
+    if (EnableMLSM)
+      MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
+    MPM.add(createGVNPass(DisableGVNLoadPRE));  // Remove redundancies
+  }
   MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
   MPM.add(createSCCPPass());                  // Constant prop with SCCP
 
@@ -224,21 +260,23 @@
 
   if (RerollLoops)
     MPM.add(createLoopRerollPass());
-  if (SLPVectorize)
-    MPM.add(createSLPVectorizerPass());   // Vectorize parallel scalar chains.
+  if (!RunSLPAfterLoopVectorization) {
+    if (SLPVectorize)
+      MPM.add(createSLPVectorizerPass());   // Vectorize parallel scalar chains.
 
-  if (BBVectorize) {
-    MPM.add(createBBVectorizePass());
-    MPM.add(createInstructionCombiningPass());
-    addExtensionsToPM(EP_Peephole, MPM);
-    if (OptLevel > 1 && UseGVNAfterVectorization)
-      MPM.add(createGVNPass());           // Remove redundancies
-    else
-      MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
+    if (BBVectorize) {
+      MPM.add(createBBVectorizePass());
+      MPM.add(createInstructionCombiningPass());
+      addExtensionsToPM(EP_Peephole, MPM);
+      if (OptLevel > 1 && UseGVNAfterVectorization)
+        MPM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
+      else
+        MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
 
-    // BBVectorize may have significantly shortened a loop body; unroll again.
-    if (!DisableUnrollLoops)
-      MPM.add(createLoopUnrollPass());
+      // BBVectorize may have significantly shortened a loop body; unroll again.
+      if (!DisableUnrollLoops)
+        MPM.add(createLoopUnrollPass());
+    }
   }
 
   if (LoadCombine)
@@ -253,6 +291,13 @@
   // pass manager that we are specifically trying to avoid. To prevent this
   // we must insert a no-op module pass to reset the pass manager.
   MPM.add(createBarrierNoopPass());
+
+  // Re-rotate loops in all our loop nests. These may have fallen out of
+  // rotated form due to GVN or other transformations, and the vectorizer relies
+  // on the rotated form.
+  if (ExtraVectorizerPasses)
+    MPM.add(createLoopRotatePass());
+
   MPM.add(createLoopVectorizePass(DisableUnrollLoops, LoopVectorize));
   // FIXME: Because of #pragma vectorize enable, the passes below are always
   // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
@@ -260,12 +305,56 @@
   // as function calls, so that we can only pass them when the vectorizer
   // changed the code.
   MPM.add(createInstructionCombiningPass());
+  if (OptLevel > 1 && ExtraVectorizerPasses) {
+    // At higher optimization levels, try to clean up any runtime overlap and
+    // alignment checks inserted by the vectorizer. We want to track correlated
+    // runtime checks for two inner loops in the same outer loop, fold any
+    // common computations, hoist loop-invariant aspects out of any outer loop,
+    // and unswitch the runtime checks if possible. Once hoisted, we may have
+    // dead (or speculatable) control flows or more combining opportunities.
+    MPM.add(createEarlyCSEPass());
+    MPM.add(createCorrelatedValuePropagationPass());
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createLICMPass());
+    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+    MPM.add(createCFGSimplificationPass());
+    MPM.add(createInstructionCombiningPass());
+  }
+
+  if (RunSLPAfterLoopVectorization) {
+    if (SLPVectorize) {
+      MPM.add(createSLPVectorizerPass());   // Vectorize parallel scalar chains.
+      if (OptLevel > 1 && ExtraVectorizerPasses) {
+        MPM.add(createEarlyCSEPass());
+      }
+    }
+
+    if (BBVectorize) {
+      MPM.add(createBBVectorizePass());
+      MPM.add(createInstructionCombiningPass());
+      addExtensionsToPM(EP_Peephole, MPM);
+      if (OptLevel > 1 && UseGVNAfterVectorization)
+        MPM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
+      else
+        MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
+
+      // BBVectorize may have significantly shortened a loop body; unroll again.
+      if (!DisableUnrollLoops)
+        MPM.add(createLoopUnrollPass());
+    }
+  }
+
   addExtensionsToPM(EP_Peephole, MPM);
   MPM.add(createCFGSimplificationPass());
+  MPM.add(createInstructionCombiningPass());
 
   if (!DisableUnrollLoops)
     MPM.add(createLoopUnrollPass());    // Unroll small loops
 
+  // After vectorization and unrolling, assume intrinsics may tell us more
+  // about pointer alignments.
+  MPM.add(createAlignmentFromAssumptionsPass());
+
   if (!DisableUnitAtATime) {
     // FIXME: We shouldn't bother with this anymore.
     MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
@@ -277,22 +366,17 @@
       MPM.add(createConstantMergePass());     // Merge dup global constants
     }
   }
+
+  if (MergeFunctions)
+    MPM.add(createMergeFunctionsPass());
+
   addExtensionsToPM(EP_OptimizerLast, MPM);
 }
 
-void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
-                                                bool Internalize,
-                                                bool RunInliner,
-                                                bool DisableGVNLoadPRE) {
+void PassManagerBuilder::addLTOOptimizationPasses(PassManagerBase &PM) {
   // Provide AliasAnalysis services for optimizations.
   addInitialAliasAnalysisPasses(PM);
 
-  // Now that composite has been compiled, scan through the module, looking
-  // for a main function.  If main is defined, mark all other functions
-  // internal.
-  if (Internalize)
-    PM.add(createInternalizePass("main"));
-
   // Propagate constants at call sites into the functions they call.  This
   // opens opportunities for globalopt (and inlining) by substituting function
   // pointers passed as arguments to direct uses of functions.
@@ -316,8 +400,11 @@
   addExtensionsToPM(EP_Peephole, PM);
 
   // Inline small functions
-  if (RunInliner)
-    PM.add(createFunctionInliningPass());
+  bool RunInliner = Inliner;
+  if (RunInliner) {
+    PM.add(Inliner);
+    Inliner = nullptr;
+  }
 
   PM.add(createPruneEHPass());   // Remove dead EH info.
 
@@ -346,6 +433,8 @@
   PM.add(createGlobalsModRefPass()); // IP alias analysis.
 
   PM.add(createLICMPass());                 // Hoist loop invariants.
+  if (EnableMLSM)
+    PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds.
   PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
   PM.add(createMemCpyOptPass());            // Remove dead memcpys.
 
@@ -355,10 +444,16 @@
   // More loops are countable; try to optimize them.
   PM.add(createIndVarSimplifyPass());
   PM.add(createLoopDeletionPass());
-  PM.add(createLoopVectorizePass(true, true));
+  PM.add(createLoopVectorizePass(true, LoopVectorize));
 
   // More scalar chains could be vectorized due to more alias information
-  PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+  if (RunSLPAfterLoopVectorization)
+    if (SLPVectorize)
+      PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+
+  // After vectorization, assume intrinsics may tell us more about pointer
+  // alignments.
+  PM.add(createAlignmentFromAssumptionsPass());
 
   if (LoadCombine)
     PM.add(createLoadCombinePass());
@@ -374,6 +469,39 @@
 
   // Now that we have optimized the program, discard unreachable functions.
   PM.add(createGlobalDCEPass());
+
+  // FIXME: this is profitable (for compiler time) to do at -O0 too, but
+  // currently it damages debug info.
+  if (MergeFunctions)
+    PM.add(createMergeFunctionsPass());
+}
+
+void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
+                                                TargetMachine *TM) {
+  if (TM) {
+    PM.add(new DataLayoutPass());
+    TM->addAnalysisPasses(PM);
+  }
+
+  if (LibraryInfo)
+    PM.add(new TargetLibraryInfo(*LibraryInfo));
+
+  if (VerifyInput)
+    PM.add(createVerifierPass());
+
+  if (StripDebug)
+    PM.add(createStripSymbolsPass(true));
+
+  if (VerifyInput)
+    PM.add(createDebugInfoVerifierPass());
+
+  if (OptLevel != 0)
+    addLTOOptimizationPasses(PM);
+
+  if (VerifyOutput) {
+    PM.add(createVerifierPass());
+    PM.add(createDebugInfoVerifierPass());
+  }
 }
 
 inline PassManagerBuilder *unwrap(LLVMPassManagerBuilderRef P) {
@@ -457,5 +585,11 @@
                                                   LLVMBool RunInliner) {
   PassManagerBuilder *Builder = unwrap(PMB);
   PassManagerBase *LPM = unwrap(PM);
-  Builder->populateLTOPassManager(*LPM, Internalize != 0, RunInliner != 0);
+
+  // A small backwards compatibility hack. populateLTOPassManager used to take
+  // a RunInliner option.
+  if (RunInliner && !Builder->Inliner)
+    Builder->Inliner = createFunctionInliningPass();
+
+  Builder->populateLTOPassManager(*LPM);
 }

diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 1abbccc..3412b9e 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp

@@ -154,9 +154,8 @@
       C->destroyConstant();
 
   // If the constant referenced anything, see if we can delete it as well.
-  for (SmallPtrSet<Constant*, 4>::iterator OI = Operands.begin(),
-         OE = Operands.end(); OI != OE; ++OI)
-    RemoveDeadConstant(*OI);
+  for (Constant *O : Operands)
+    RemoveDeadConstant(O);
 }
 
 // Strip the symbol table of its names.
@@ -191,7 +190,7 @@
 
 /// Find values that are marked as llvm.used.
 static void findUsedValues(GlobalVariable *LLVMUsed,
-                           SmallPtrSet<const GlobalValue*, 8> &UsedValues) {
+                           SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
   if (!LLVMUsed) return;
   UsedValues.insert(LLVMUsed);
 
@@ -350,28 +349,12 @@
     // subprogram list/global variable list with our new live subprogram/global
     // variable list.
     if (SubprogramChange) {
-      // Make sure that 9 is still the index of the subprograms. This is to make
-      // sure that an assert is hit if the location of the subprogram array
-      // changes. This is just to make sure that this is updated if such an
-      // event occurs.
-      assert(DIC->getNumOperands() >= 10 &&
-             SPs == DIC->getOperand(9) &&
-             "DICompileUnits is expected to store Subprograms in operand "
-             "9.");
-      DIC->replaceOperandWith(9, MDNode::get(C, LiveSubprograms));
+      DIC.replaceSubprograms(DIArray(MDNode::get(C, LiveSubprograms)));
       Changed = true;
     }
 
     if (GlobalVariableChange) {
-      // Make sure that 10 is still the index of global variables. This is to
-      // make sure that an assert is hit if the location of the subprogram array
-      // changes. This is just to make sure that this index is updated if such
-      // an event occurs.
-      assert(DIC->getNumOperands() >= 11 &&
-             GVs == DIC->getOperand(10) &&
-             "DICompileUnits is expected to store Global Variables in operand "
-             "10.");
-      DIC->replaceOperandWith(10, MDNode::get(C, LiveGlobalVariables));
+      DIC.replaceGlobalVariables(DIArray(MDNode::get(C, LiveGlobalVariables)));
       Changed = true;
     }
 

diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index ab4dc1c..d4b252b 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h

@@ -7,16 +7,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef INSTCOMBINE_INSTCOMBINE_H
-#define INSTCOMBINE_INSTCOMBINE_H
+#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H
+#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINE_H
 
 #include "InstCombineWorklist.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/TargetFolder.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 
@@ -25,6 +27,7 @@
 namespace llvm {
 class CallSite;
 class DataLayout;
+class DominatorTree;
 class TargetLibraryInfo;
 class DbgDeclareInst;
 class MemIntrinsic;
@@ -71,14 +74,20 @@
 class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
     : public IRBuilderDefaultInserter<true> {
   InstCombineWorklist &Worklist;
+  AssumptionTracker *AT;
 
 public:
-  InstCombineIRInserter(InstCombineWorklist &WL) : Worklist(WL) {}
+  InstCombineIRInserter(InstCombineWorklist &WL, AssumptionTracker *AT)
+    : Worklist(WL), AT(AT) {}
 
   void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
                     BasicBlock::iterator InsertPt) const {
     IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt);
     Worklist.Add(I);
+
+    using namespace llvm::PatternMatch;
+    if (match(I, m_Intrinsic<Intrinsic::assume>()))
+      AT->registerAssumption(cast<CallInst>(I));
   }
 };
 
@@ -86,8 +95,10 @@
 class LLVM_LIBRARY_VISIBILITY InstCombiner
     : public FunctionPass,
       public InstVisitor<InstCombiner, Instruction *> {
+  AssumptionTracker *AT;
   const DataLayout *DL;
   TargetLibraryInfo *TLI;
+  DominatorTree *DT; // not required
   bool MadeIRChange;
   LibCallSimplifier *Simplifier;
   bool MinimizeSize;
@@ -114,7 +125,11 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 
+  AssumptionTracker *getAssumptionTracker() const { return AT; }
+
   const DataLayout *getDataLayout() const { return DL; }
+  
+  DominatorTree *getDominatorTree() const { return DT; }
 
   TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; }
 
@@ -148,10 +163,12 @@
   Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
   Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
   Instruction *visitAnd(BinaryOperator &I);
-  Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+  Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI);
   Value *FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
   Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A,
                                    Value *B, Value *C);
+  Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A,
+                                    Value *B, Value *C);
   Instruction *visitOr(BinaryOperator &I);
   Instruction *visitXor(BinaryOperator &I);
   Instruction *visitShl(BinaryOperator &I);
@@ -172,6 +189,10 @@
                               ConstantInt *DivRHS);
   Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI,
                               ConstantInt *DivRHS);
+  Instruction *FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
+                                 ConstantInt *CI1, ConstantInt *CI2);
+  Instruction *FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
+                                 ConstantInt *CI1, ConstantInt *CI2);
   Instruction *FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI,
                                 ICmpInst::Predicate Pred);
   Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
@@ -213,6 +234,7 @@
   Instruction *visitStoreInst(StoreInst &SI);
   Instruction *visitBranchInst(BranchInst &BI);
   Instruction *visitSwitchInst(SwitchInst &SI);
+  Instruction *visitReturnInst(ReturnInst &RI);
   Instruction *visitInsertValueInst(InsertValueInst &IV);
   Instruction *visitInsertElementInst(InsertElementInst &IE);
   Instruction *visitExtractElementInst(ExtractElementInst &EI);
@@ -246,8 +268,10 @@
   Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
                                  bool DoXform = true);
   Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
-  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS);
-  bool WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS);
+  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction *CxtI);
+  bool WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction *CxtI);
+  bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
+  bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction *CxtI);
   Value *EmitGEPOffset(User *GEP);
   Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
   Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
@@ -316,16 +340,19 @@
   }
 
   void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
-                        unsigned Depth = 0) const {
-    return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth);
+                        unsigned Depth = 0, Instruction *CxtI = nullptr) const {
+    return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth,
+                                  AT, CxtI, DT);
   }
 
   bool MaskedValueIsZero(Value *V, const APInt &Mask,
-                         unsigned Depth = 0) const {
-    return llvm::MaskedValueIsZero(V, Mask, DL, Depth);
+                         unsigned Depth = 0,
+                         Instruction *CxtI = nullptr) const {
+    return llvm::MaskedValueIsZero(V, Mask, DL, Depth, AT, CxtI, DT);
   }
-  unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0) const {
-    return llvm::ComputeNumSignBits(Op, DL, Depth);
+  unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0,
+                              Instruction *CxtI = nullptr) const {
+    return llvm::ComputeNumSignBits(Op, DL, Depth, AT, CxtI, DT);
   }
 
 private:
@@ -343,7 +370,8 @@
   /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
   /// based on the demanded bits.
   Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero,
-                                 APInt &KnownOne, unsigned Depth);
+                                 APInt &KnownOne, unsigned Depth,
+                                 Instruction *CxtI = nullptr);
   bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt &KnownZero,
                             APInt &KnownOne, unsigned Depth = 0);
   /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded

diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 99f0f1f..902b640 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp

@@ -32,7 +32,7 @@
   ///
   class FAddendCoef {
   public:
-    // The constructor has to initialize a APFloat, which is uncessary for
+    // The constructor has to initialize a APFloat, which is unnecessary for
     // most addends which have coefficient either 1 or -1. So, the constructor
     // is expensive. In order to avoid the cost of the constructor, we should
     // reuse some instances whenever possible. The pre-created instances
@@ -895,7 +895,8 @@
 /// This basically requires proving that the add in the original type would not
 /// overflow to change the sign bit or have a carry out.
 /// TODO: Handle this for Vectors.
-bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) {
+bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
+                                            Instruction *CxtI) {
   // There are different heuristics we can use for this.  Here are some simple
   // ones.
 
@@ -913,18 +914,19 @@
   //
   // Since the carry into the most significant position is always equal to
   // the carry out of the addition, there is no signed overflow.
-  if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1)
+  if (ComputeNumSignBits(LHS, 0, CxtI) > 1 &&
+      ComputeNumSignBits(RHS, 0, CxtI) > 1)
     return true;
 
   if (IntegerType *IT = dyn_cast<IntegerType>(LHS->getType())) {
     int BitWidth = IT->getBitWidth();
     APInt LHSKnownZero(BitWidth, 0);
     APInt LHSKnownOne(BitWidth, 0);
-    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
+    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI);
 
     APInt RHSKnownZero(BitWidth, 0);
     APInt RHSKnownOne(BitWidth, 0);
-    computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
+    computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI);
 
     // Addition of two 2's compliment numbers having opposite signs will never
     // overflow.
@@ -943,19 +945,69 @@
 
 /// WillNotOverflowUnsignedAdd - Return true if we can prove that:
 ///    (zext (add LHS, RHS))  === (add (zext LHS), (zext RHS))
-bool InstCombiner::WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS) {
+bool InstCombiner::WillNotOverflowUnsignedAdd(Value *LHS, Value *RHS,
+                                              Instruction *CxtI) {
   // There are different heuristics we can use for this. Here is a simple one.
   // If the sign bit of LHS and that of RHS are both zero, no unsigned wrap.
   bool LHSKnownNonNegative, LHSKnownNegative;
   bool RHSKnownNonNegative, RHSKnownNegative;
-  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0);
-  ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0);
+  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0, AT, CxtI, DT);
+  ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0, AT, CxtI, DT);
   if (LHSKnownNonNegative && RHSKnownNonNegative)
     return true;
 
   return false;
 }
 
+/// \brief Return true if we can prove that:
+///    (sub LHS, RHS)  === (sub nsw LHS, RHS)
+/// This basically requires proving that the sub in the original type would not
+/// overflow to change the sign bit or have a carry out.
+/// TODO: Handle this for Vectors.
+bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
+                                            Instruction *CxtI) {
+  // If LHS and RHS each have at least two sign bits, the subtraction
+  // cannot overflow.
+  if (ComputeNumSignBits(LHS, 0, CxtI) > 1 &&
+      ComputeNumSignBits(RHS, 0, CxtI) > 1)
+    return true;
+
+  if (IntegerType *IT = dyn_cast<IntegerType>(LHS->getType())) {
+    unsigned BitWidth = IT->getBitWidth();
+    APInt LHSKnownZero(BitWidth, 0);
+    APInt LHSKnownOne(BitWidth, 0);
+    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, CxtI);
+
+    APInt RHSKnownZero(BitWidth, 0);
+    APInt RHSKnownOne(BitWidth, 0);
+    computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, CxtI);
+
+    // Subtraction of two 2's complement numbers having identical signs will
+    // never overflow.
+    if ((LHSKnownOne[BitWidth - 1] && RHSKnownOne[BitWidth - 1]) ||
+        (LHSKnownZero[BitWidth - 1] && RHSKnownZero[BitWidth - 1]))
+      return true;
+
+    // TODO: implement logic similar to checkRippleForAdd
+  }
+  return false;
+}
+
+/// \brief Return true if we can prove that:
+///    (sub LHS, RHS)  === (sub nuw LHS, RHS)
+bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
+                                              Instruction *CxtI) {
+  // If the LHS is negative and the RHS is non-negative, no unsigned wrap.
+  bool LHSKnownNonNegative, LHSKnownNegative;
+  bool RHSKnownNonNegative, RHSKnownNegative;
+  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, 0, AT, CxtI, DT);
+  ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, 0, AT, CxtI, DT);
+  if (LHSKnownNegative && RHSKnownNonNegative)
+    return true;
+
+  return false;
+}
+
 // Checks if any operand is negative and we can convert add to sub.
 // This function checks for following negative patterns
 //   ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C))
@@ -1025,7 +1077,7 @@
      return ReplaceInstUsesWith(I, V);
 
    if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
-                                  I.hasNoUnsignedWrap(), DL))
+                                  I.hasNoUnsignedWrap(), DL, TLI, DT, AT))
      return ReplaceInstUsesWith(I, V);
 
    // (A*B)+(A*C) -> A*(B+C) etc
@@ -1064,7 +1116,7 @@
 
       if (ExtendAmt) {
         APInt Mask = APInt::getHighBitsSet(TySizeBits, ExtendAmt);
-        if (!MaskedValueIsZero(XorLHS, Mask))
+        if (!MaskedValueIsZero(XorLHS, Mask, 0, &I))
           ExtendAmt = 0;
       }
 
@@ -1080,7 +1132,7 @@
         IntegerType *IT = cast<IntegerType>(I.getType());
         APInt LHSKnownOne(IT->getBitWidth(), 0);
         APInt LHSKnownZero(IT->getBitWidth(), 0);
-        computeKnownBits(XorLHS, LHSKnownZero, LHSKnownOne);
+        computeKnownBits(XorLHS, LHSKnownZero, LHSKnownOne, 0, &I);
         if ((XorRHS->getValue() | LHSKnownZero).isAllOnesValue())
           return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI),
                                            XorLHS);
@@ -1133,11 +1185,11 @@
   if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
     APInt LHSKnownOne(IT->getBitWidth(), 0);
     APInt LHSKnownZero(IT->getBitWidth(), 0);
-    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
+    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &I);
     if (LHSKnownZero != 0) {
       APInt RHSKnownOne(IT->getBitWidth(), 0);
       APInt RHSKnownZero(IT->getBitWidth(), 0);
-      computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
+      computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &I);
 
       // No bits in common -> bitwise or.
       if ((LHSKnownZero|RHSKnownZero).isAllOnesValue())
@@ -1215,7 +1267,7 @@
         ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
       if (LHSConv->hasOneUse() &&
           ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
-          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, &I)) {
         // Insert the new, smaller add.
         Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
                                               CI, "addconv");
@@ -1231,7 +1283,7 @@
       if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
           (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
           WillNotOverflowSignedAdd(LHSConv->getOperand(0),
-                                   RHSConv->getOperand(0))) {
+                                   RHSConv->getOperand(0), &I)) {
         // Insert the new integer add.
         Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
                                              RHSConv->getOperand(0), "addconv");
@@ -1240,7 +1292,7 @@
     }
   }
 
-  // Check for (x & y) + (x ^ y)
+  // (add (xor A, B) (and A, B)) --> (or A, B)
   {
     Value *A = nullptr, *B = nullptr;
     if (match(RHS, m_Xor(m_Value(A), m_Value(B))) &&
@@ -1254,14 +1306,36 @@
       return BinaryOperator::CreateOr(A, B);
   }
 
+  // (add (or A, B) (and A, B)) --> (add A, B)
+  {
+    Value *A = nullptr, *B = nullptr;
+    if (match(RHS, m_Or(m_Value(A), m_Value(B))) &&
+        (match(LHS, m_And(m_Specific(A), m_Specific(B))) ||
+         match(LHS, m_And(m_Specific(B), m_Specific(A))))) {
+      auto *New = BinaryOperator::CreateAdd(A, B);
+      New->setHasNoSignedWrap(I.hasNoSignedWrap());
+      New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+      return New;
+    }
+
+    if (match(LHS, m_Or(m_Value(A), m_Value(B))) &&
+        (match(RHS, m_And(m_Specific(A), m_Specific(B))) ||
+         match(RHS, m_And(m_Specific(B), m_Specific(A))))) {
+      auto *New = BinaryOperator::CreateAdd(A, B);
+      New->setHasNoSignedWrap(I.hasNoSignedWrap());
+      New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+      return New;
+    }
+  }
+
   // TODO(jingyue): Consider WillNotOverflowSignedAdd and
   // WillNotOverflowUnsignedAdd to reduce the number of invocations of
   // computeKnownBits.
-  if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS)) {
+  if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, &I)) {
     Changed = true;
     I.setHasNoSignedWrap(true);
   }
-  if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedAdd(LHS, RHS)) {
+  if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedAdd(LHS, RHS, &I)) {
     Changed = true;
     I.setHasNoUnsignedWrap(true);
   }
@@ -1276,7 +1350,8 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL))
+  if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL,
+                                  TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   if (isa<Constant>(RHS)) {
@@ -1318,7 +1393,7 @@
       ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
       if (LHSConv->hasOneUse() &&
           ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
-          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, &I)) {
         // Insert the new integer add.
         Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
                                               CI, "addconv");
@@ -1334,7 +1409,7 @@
       if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
           (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
           WillNotOverflowSignedAdd(LHSConv->getOperand(0),
-                                   RHSConv->getOperand(0))) {
+                                   RHSConv->getOperand(0), &I)) {
         // Insert the new integer add.
         Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
                                               RHSConv->getOperand(0),"addconv");
@@ -1356,11 +1431,11 @@
             Z2 = dyn_cast<Constant>(B2); B = B1;
         } else if (match(B1, m_AnyZero()) && match(A2, m_AnyZero())) {
             Z1 = dyn_cast<Constant>(B1); B = B2;
-            Z2 = dyn_cast<Constant>(A2); A = A1; 
+            Z2 = dyn_cast<Constant>(A2); A = A1;
         }
-        
-        if (Z1 && Z2 && 
-            (I.hasNoSignedZeros() || 
+
+        if (Z1 && Z2 &&
+            (I.hasNoSignedZeros() ||
              (Z1->isNegativeZeroValue() && Z2->isNegativeZeroValue()))) {
           return SelectInst::Create(C, A, B);
         }
@@ -1447,7 +1522,6 @@
   return Builder->CreateIntCast(Result, Ty, true);
 }
 
-
 Instruction *InstCombiner::visitSub(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
@@ -1455,18 +1529,27 @@
     return ReplaceInstUsesWith(I, V);
 
   if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(),
-                                 I.hasNoUnsignedWrap(), DL))
+                                 I.hasNoUnsignedWrap(), DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // (A*B)-(A*C) -> A*(B-C) etc
   if (Value *V = SimplifyUsingDistributiveLaws(I))
     return ReplaceInstUsesWith(I, V);
 
-  // If this is a 'B = x-(-A)', change to B = x+A.  This preserves NSW/NUW.
+  // If this is a 'B = x-(-A)', change to B = x+A.
   if (Value *V = dyn_castNegVal(Op1)) {
     BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
-    Res->setHasNoSignedWrap(I.hasNoSignedWrap());
-    Res->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+
+    if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) {
+      assert(BO->getOpcode() == Instruction::Sub &&
+             "Expected a subtraction operator!");
+      if (BO->hasNoSignedWrap() && I.hasNoSignedWrap())
+        Res->setHasNoSignedWrap(true);
+    } else {
+      if (cast<Constant>(Op1)->isNotMinSignedValue() && I.hasNoSignedWrap())
+        Res->setHasNoSignedWrap(true);
+    }
+
     return Res;
   }
 
@@ -1511,21 +1594,23 @@
     // -(X >>u 31) -> (X >>s 31)
     // -(X >>s 31) -> (X >>u 31)
     if (C->isZero()) {
-      Value *X; ConstantInt *CI;
+      Value *X;
+      ConstantInt *CI;
       if (match(Op1, m_LShr(m_Value(X), m_ConstantInt(CI))) &&
           // Verify we are shifting out everything but the sign bit.
-          CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1)
+          CI->getValue() == I.getType()->getPrimitiveSizeInBits() - 1)
         return BinaryOperator::CreateAShr(X, CI);
 
       if (match(Op1, m_AShr(m_Value(X), m_ConstantInt(CI))) &&
           // Verify we are shifting out everything but the sign bit.
-          CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1)
+          CI->getValue() == I.getType()->getPrimitiveSizeInBits() - 1)
         return BinaryOperator::CreateLShr(X, CI);
     }
   }
 
 
-  { Value *Y;
+  {
+    Value *Y;
     // X-(X+Y) == -Y    X-(Y+X) == -Y
     if (match(Op1, m_Add(m_Specific(Op0), m_Value(Y))) ||
         match(Op1, m_Add(m_Value(Y), m_Specific(Op0))))
@@ -1536,6 +1621,24 @@
       return BinaryOperator::CreateNeg(Y);
   }
 
+  // (sub (or A, B) (xor A, B)) --> (and A, B)
+  {
+    Value *A = nullptr, *B = nullptr;
+    if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+        (match(Op0, m_Or(m_Specific(A), m_Specific(B))) ||
+         match(Op0, m_Or(m_Specific(B), m_Specific(A)))))
+      return BinaryOperator::CreateAnd(A, B);
+  }
+
+  if (Op0->hasOneUse()) {
+    Value *Y = nullptr;
+    // ((X | Y) - X) --> (~X & Y)
+    if (match(Op0, m_Or(m_Value(Y), m_Specific(Op1))) ||
+        match(Op0, m_Or(m_Specific(Op1), m_Value(Y))))
+      return BinaryOperator::CreateAnd(
+          Y, Builder->CreateNot(Op1, Op1->getName() + ".not"));
+  }
+
   if (Op1->hasOneUse()) {
     Value *X = nullptr, *Y = nullptr, *Z = nullptr;
     Constant *C = nullptr;
@@ -1555,7 +1658,7 @@
 
     // 0 - (X sdiv C)  -> (X sdiv -C)  provided the negation doesn't overflow.
     if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && match(Op0, m_Zero()) &&
-        !C->isMinSignedValue())
+        C->isNotMinSignedValue() && !C->isOneValue())
       return BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(C));
 
     // 0 - (X << Y)  -> (-X << Y)   when X is freely negatable.
@@ -1595,7 +1698,17 @@
         return ReplaceInstUsesWith(I, Res);
       }
 
-  return nullptr;
+  bool Changed = false;
+  if (!I.hasNoSignedWrap() && WillNotOverflowSignedSub(Op0, Op1, &I)) {
+    Changed = true;
+    I.setHasNoSignedWrap(true);
+  }
+  if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedSub(Op0, Op1, &I)) {
+    Changed = true;
+    I.setHasNoUnsignedWrap(true);
+  }
+
+  return Changed ? &I : nullptr;
 }
 
 Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
@@ -1604,7 +1717,8 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL))
+  if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL,
+                                  TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   if (isa<Constant>(Op0))

diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b23a606..55ebced 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp

@@ -355,7 +355,7 @@
       if (isRunOfOnes(Mask, MB, ME)) {  // begin/end bit of run, inclusive
         uint32_t BitWidth = cast<IntegerType>(RHS->getType())->getBitWidth();
         APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1));
-        if (MaskedValueIsZero(RHS, Mask))
+        if (MaskedValueIsZero(RHS, Mask, 0, &I))
           break;
       }
     }
@@ -614,7 +614,7 @@
   } else if (R1->getType()->isIntegerTy()) {
     if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
       // As before, model no mask as a trivial mask if it'll let us do an
-      // optimisation.
+      // optimization.
       R11 = R1;
       R12 = Constant::getAllOnesValue(R1->getType());
     }
@@ -665,8 +665,8 @@
 /// foldLogOpOfMaskedICmps:
 /// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
 /// into a single (icmp(A & X) ==/!= Y)
-static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
-                                     llvm::InstCombiner::BuilderTy* Builder) {
+static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
+                                     llvm::InstCombiner::BuilderTy *Builder) {
   Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
   unsigned mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS,
@@ -697,26 +697,26 @@
   if (mask & FoldMskICmp_Mask_AllZeroes) {
     // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
     // -> (icmp eq (A & (B|D)), 0)
-    Value* newOr = Builder->CreateOr(B, D);
-    Value* newAnd = Builder->CreateAnd(A, newOr);
+    Value *newOr = Builder->CreateOr(B, D);
+    Value *newAnd = Builder->CreateAnd(A, newOr);
     // we can't use C as zero, because we might actually handle
     //   (icmp ne (A & B), B) & (icmp ne (A & D), D)
     // with B and D, having a single bit set
-    Value* zero = Constant::getNullValue(A->getType());
+    Value *zero = Constant::getNullValue(A->getType());
     return Builder->CreateICmp(NEWCC, newAnd, zero);
   }
   if (mask & FoldMskICmp_BMask_AllOnes) {
     // (icmp eq (A & B), B) & (icmp eq (A & D), D)
     // -> (icmp eq (A & (B|D)), (B|D))
-    Value* newOr = Builder->CreateOr(B, D);
-    Value* newAnd = Builder->CreateAnd(A, newOr);
+    Value *newOr = Builder->CreateOr(B, D);
+    Value *newAnd = Builder->CreateAnd(A, newOr);
     return Builder->CreateICmp(NEWCC, newAnd, newOr);
   }
   if (mask & FoldMskICmp_AMask_AllOnes) {
     // (icmp eq (A & B), A) & (icmp eq (A & D), A)
     // -> (icmp eq (A & (B&D)), A)
-    Value* newAnd1 = Builder->CreateAnd(B, D);
-    Value* newAnd = Builder->CreateAnd(A, newAnd1);
+    Value *newAnd1 = Builder->CreateAnd(B, D);
+    Value *newAnd = Builder->CreateAnd(A, newAnd1);
     return Builder->CreateICmp(NEWCC, newAnd, A);
   }
 
@@ -766,19 +766,17 @@
     // with B and D, having a single bit set
     ConstantInt *CCst = dyn_cast<ConstantInt>(C);
     if (!CCst) return nullptr;
-    if (LHSCC != NEWCC)
-      CCst = dyn_cast<ConstantInt>( ConstantExpr::getXor(BCst, CCst) );
     ConstantInt *ECst = dyn_cast<ConstantInt>(E);
     if (!ECst) return nullptr;
+    if (LHSCC != NEWCC)
+      CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
     if (RHSCC != NEWCC)
-      ECst = dyn_cast<ConstantInt>( ConstantExpr::getXor(DCst, ECst) );
-    ConstantInt* MCst = dyn_cast<ConstantInt>(
-      ConstantExpr::getAnd(ConstantExpr::getAnd(BCst, DCst),
-                           ConstantExpr::getXor(CCst, ECst)) );
+      ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
     // if there is a conflict we should actually return a false for the
     // whole construct
-    if (!MCst->isZero())
-      return nullptr;
+    if (((BCst->getValue() & DCst->getValue()) &
+         (CCst->getValue() ^ ECst->getValue())) != 0)
+      return ConstantInt::get(LHS->getType(), !IsAnd);
     Value *newOr1 = Builder->CreateOr(B, D);
     Value *newOr2 = ConstantExpr::getOr(CCst, ECst);
     Value *newAnd = Builder->CreateAnd(A, newOr1);
@@ -930,6 +928,8 @@
     case ICmpInst::ICMP_ULT:
       if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
         return Builder->CreateICmpULT(Val, LHSCst);
+      if (LHSCst->isNullValue())    // (X !=  0 & X u< 14) -> X-1 u< 13
+        return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, false, true);
       break;                        // (X != 13 & X u< 15) -> no change
     case ICmpInst::ICMP_SLT:
       if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
@@ -1101,7 +1101,6 @@
   return nullptr;
 }
 
-
 Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -1109,7 +1108,7 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyAndInst(Op0, Op1, DL))
+  if (Value *V = SimplifyAndInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // (A|B)&(A|C) -> A|(B&C) etc
@@ -1136,14 +1135,14 @@
         if (!Op0I->hasOneUse()) break;
 
         APInt NotAndRHS(~AndRHSMask);
-        if (MaskedValueIsZero(Op0LHS, NotAndRHS)) {
+        if (MaskedValueIsZero(Op0LHS, NotAndRHS, 0, &I)) {
           // Not masking anything out for the LHS, move to RHS.
           Value *NewRHS = Builder->CreateAnd(Op0RHS, AndRHS,
                                              Op0RHS->getName()+".masked");
           return BinaryOperator::Create(Op0I->getOpcode(), Op0LHS, NewRHS);
         }
         if (!isa<Constant>(Op0RHS) &&
-            MaskedValueIsZero(Op0RHS, NotAndRHS)) {
+            MaskedValueIsZero(Op0RHS, NotAndRHS, 0, &I)) {
           // Not masking anything out for the RHS, move to LHS.
           Value *NewLHS = Builder->CreateAnd(Op0LHS, AndRHS,
                                              Op0LHS->getName()+".masked");
@@ -1176,7 +1175,7 @@
           uint32_t Zeros = AndRHSMask.countLeadingZeros();
           APInt Mask = APInt::getLowBitsSet(BitWidth, BitWidth - Zeros);
 
-          if (MaskedValueIsZero(Op0LHS, Mask)) {
+          if (MaskedValueIsZero(Op0LHS, Mask, 0, &I)) {
             Value *NewNeg = Builder->CreateNeg(Op0RHS);
             return BinaryOperator::CreateAnd(NewNeg, AndRHS);
           }
@@ -1283,13 +1282,58 @@
     if (match(Op1, m_Or(m_Not(m_Specific(Op0)), m_Value(A))) ||
         match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0)))))
       return BinaryOperator::CreateAnd(A, Op0);
+
+    // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C
+    if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
+      if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
+        if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse())
+          return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(C));
+
+    // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C
+    if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
+      if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
+        if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse())
+          return BinaryOperator::CreateAnd(Op1, Builder->CreateNot(C));
+
+    // (A | B) & ((~A) ^ B) -> (A & B)
+    if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+        match(Op1, m_Xor(m_Not(m_Specific(A)), m_Specific(B))))
+      return BinaryOperator::CreateAnd(A, B);
+
+    // ((~A) ^ B) & (A | B) -> (A & B)
+    if (match(Op0, m_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+        match(Op1, m_Or(m_Specific(A), m_Specific(B))))
+      return BinaryOperator::CreateAnd(A, B);
   }
 
-  if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1))
-    if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0))
+  {
+    ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
+    ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
+    if (LHS && RHS)
       if (Value *Res = FoldAndOfICmps(LHS, RHS))
         return ReplaceInstUsesWith(I, Res);
 
+    // TODO: Make this recursive; it's a little tricky because an arbitrary
+    // number of 'and' instructions might have to be created.
+    Value *X, *Y;
+    if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
+      if (auto *Cmp = dyn_cast<ICmpInst>(X))
+        if (Value *Res = FoldAndOfICmps(LHS, Cmp))
+          return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
+      if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+        if (Value *Res = FoldAndOfICmps(LHS, Cmp))
+          return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, X));
+    }
+    if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
+      if (auto *Cmp = dyn_cast<ICmpInst>(X))
+        if (Value *Res = FoldAndOfICmps(Cmp, RHS))
+          return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
+      if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+        if (Value *Res = FoldAndOfICmps(Cmp, RHS))
+          return ReplaceInstUsesWith(I, Builder->CreateAnd(Res, X));
+    }
+  }
+
   // If and'ing two fcmp, try combine them into one.
   if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
     if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
@@ -1329,20 +1373,6 @@
       }
     }
 
-  // (X >> Z) & (Y >> Z)  -> (X&Y) >> Z  for all shifts.
-  if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
-    if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
-      if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
-          SI0->getOperand(1) == SI1->getOperand(1) &&
-          (SI0->hasOneUse() || SI1->hasOneUse())) {
-        Value *NewOp =
-          Builder->CreateAnd(SI0->getOperand(0), SI1->getOperand(0),
-                             SI0->getName());
-        return BinaryOperator::Create(SI1->getOpcode(), NewOp,
-                                      SI1->getOperand(1));
-      }
-  }
-
   {
     Value *X = nullptr;
     bool OpsSwapped = false;
@@ -1554,7 +1584,8 @@
 }
 
 /// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
-Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
+                                   Instruction *CxtI) {
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
 
   // Fold (iszero(A & K1) | iszero(A & K2)) ->  (A & (K1 | K2)) != (K1 | K2)
@@ -1574,13 +1605,15 @@
       Value *Mask = nullptr;
       Value *Masked = nullptr;
       if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
-          isKnownToBeAPowerOfTwo(LAnd->getOperand(1)) &&
-          isKnownToBeAPowerOfTwo(RAnd->getOperand(1))) {
+          isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, AT, CxtI, DT) &&
+          isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, AT, CxtI, DT)) {
         Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
         Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
       } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
-                 isKnownToBeAPowerOfTwo(LAnd->getOperand(0)) &&
-                 isKnownToBeAPowerOfTwo(RAnd->getOperand(0))) {
+                 isKnownToBeAPowerOfTwo(LAnd->getOperand(0),
+                                        false, 0, AT, CxtI, DT) &&
+                 isKnownToBeAPowerOfTwo(RAnd->getOperand(0),
+                                        false, 0, AT, CxtI, DT)) {
         Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
         Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
       }
@@ -1590,6 +1623,61 @@
     }
   }
 
+  // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
+  //                   -->  (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
+  // The original condition actually refers to the following two ranges:
+  // [MAX_UINT-C1+1, MAX_UINT-C1+1+C3] and [MAX_UINT-C2+1, MAX_UINT-C2+1+C3]
+  // We can fold these two ranges if:
+  // 1) C1 and C2 are both unsigned greater than C3.
+  // 2) The two ranges are separated.
+  // 3) C1 ^ C2 is a one-bit mask.
+  // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit masks.
+  // This implies all values in the two ranges differ by exactly one bit.
+
+  if ((LHSCC == ICmpInst::ICMP_ULT || LHSCC == ICmpInst::ICMP_ULE) &&
+      LHSCC == RHSCC && LHSCst && RHSCst && LHS->hasOneUse() &&
+      RHS->hasOneUse() && LHSCst->getType() == RHSCst->getType() &&
+      LHSCst->getValue() == (RHSCst->getValue())) {
+
+    Value *LAdd = LHS->getOperand(0);
+    Value *RAdd = RHS->getOperand(0);
+
+    Value *LAddOpnd, *RAddOpnd;
+    ConstantInt *LAddCst, *RAddCst;
+    if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddCst))) &&
+        match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddCst))) &&
+        LAddCst->getValue().ugt(LHSCst->getValue()) &&
+        RAddCst->getValue().ugt(LHSCst->getValue())) {
+
+      APInt DiffCst = LAddCst->getValue() ^ RAddCst->getValue();
+      if (LAddOpnd == RAddOpnd && DiffCst.isPowerOf2()) {
+        ConstantInt *MaxAddCst = nullptr;
+        if (LAddCst->getValue().ult(RAddCst->getValue()))
+          MaxAddCst = RAddCst;
+        else
+          MaxAddCst = LAddCst;
+
+        APInt RRangeLow = -RAddCst->getValue();
+        APInt RRangeHigh = RRangeLow + LHSCst->getValue();
+        APInt LRangeLow = -LAddCst->getValue();
+        APInt LRangeHigh = LRangeLow + LHSCst->getValue();
+        APInt LowRangeDiff = RRangeLow ^ LRangeLow;
+        APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
+        APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
+                                                   : RRangeLow - LRangeLow;
+
+        if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
+            RangeDiff.ugt(LHSCst->getValue())) {
+          Value *MaskCst = ConstantInt::get(LAddCst->getType(), ~DiffCst);
+
+          Value *NewAnd = Builder->CreateAnd(LAddOpnd, MaskCst);
+          Value *NewAdd = Builder->CreateAdd(NewAnd, MaxAddCst);
+          return (Builder->CreateICmp(LHS->getPredicate(), NewAdd, LHSCst));
+        }
+      }
+    }
+  }
+
   // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
   if (PredicatesFoldable(LHSCC, RHSCC)) {
     if (LHS->getOperand(0) == RHS->getOperand(1) &&
@@ -1906,6 +1994,38 @@
   return nullptr;
 }
 
+/// \brief This helper function folds the 'or' pattern:
+///
+///     ((A ^ B) & C1) | (B & C2)   -->   (A & C1) ^ B
+///
+/// when the XOR of the two constants is "all ones" (-1), i.e. C2 == ~C1.
+/// \p C is the candidate C1 and \p Op is the candidate (B & C2) operand; the
+/// (A ^ B) side is expected to have been matched by the caller. The mirrored
+/// case, where \p Op masks A instead of B, yields (B & C1) ^ A.
+/// Returns the new xor instruction, or nullptr when the fold does not apply.
+Instruction *InstCombiner::FoldXorWithConstants(BinaryOperator &I, Value *Op,
+                                                Value *A, Value *B, Value *C) {
+  ConstantInt *CI1 = dyn_cast<ConstantInt>(C);
+  if (!CI1)
+    return nullptr;
+  // Op must have the form (V1 & CI2) with a constant-integer mask CI2.
+  Value *V1 = nullptr;
+  ConstantInt *CI2 = nullptr;
+  if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2))))
+    return nullptr;
+  // Only fold when CI1 ^ CI2 has every bit set (the masks are complements).
+  APInt Xor = CI1->getValue() ^ CI2->getValue();
+  if (!Xor.isAllOnesValue())
+    return nullptr;
+  // V1 is xor'ed in unmasked; whichever of A/B is not V1 is masked by CI1.
+  if (V1 == A || V1 == B) {
+    Value *NewOp = Builder->CreateAnd(V1 == A ? B : A, CI1);
+    return BinaryOperator::CreateXor(NewOp, V1);
+  }
+
+  return nullptr;
+}
+
 Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -1913,7 +2033,7 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyOrInst(Op0, Op1, DL))
+  if (Value *V = SimplifyOrInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // (A&B)|(A&C) -> A&(B|C) etc
@@ -1973,7 +2093,7 @@
   // (X^C)|Y -> (X|Y)^C iff Y&C == 0
   if (Op0->hasOneUse() &&
       match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
-      MaskedValueIsZero(Op1, C1->getValue())) {
+      MaskedValueIsZero(Op1, C1->getValue(), 0, &I)) {
     Value *NOr = Builder->CreateOr(A, Op1);
     NOr->takeName(Op0);
     return BinaryOperator::CreateXor(NOr, C1);
@@ -1982,12 +2102,32 @@
   // Y|(X^C) -> (X|Y)^C iff Y&C == 0
   if (Op1->hasOneUse() &&
       match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
-      MaskedValueIsZero(Op0, C1->getValue())) {
+      MaskedValueIsZero(Op0, C1->getValue(), 0, &I)) {
     Value *NOr = Builder->CreateOr(A, Op0);
     NOr->takeName(Op0);
     return BinaryOperator::CreateXor(NOr, C1);
   }
 
+  // ((~A & B) | A) -> (A | B)
+  if (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
+      match(Op1, m_Specific(A)))
+    return BinaryOperator::CreateOr(A, B);
+
+  // ((A & B) | ~A) -> (~A | B)
+  if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+      match(Op1, m_Not(m_Specific(A))))
+    return BinaryOperator::CreateOr(Builder->CreateNot(A), B);
+
+  // (A & (~B)) | (A ^ B) -> (A ^ B)
+  if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+      match(Op1, m_Xor(m_Specific(A), m_Specific(B))))
+    return BinaryOperator::CreateXor(A, B);
+
+  // (A ^ B) | ( A & (~B)) -> (A ^ B)
+  if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+      match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B)))))
+    return BinaryOperator::CreateXor(A, B);
+
   // (A & C)|(B & D)
   Value *C = nullptr, *D = nullptr;
   if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
@@ -2000,14 +2140,18 @@
         // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
         // iff (C1&C2) == 0 and (N&~C1) == 0
         if (match(A, m_Or(m_Value(V1), m_Value(V2))) &&
-            ((V1 == B && MaskedValueIsZero(V2, ~C1->getValue())) ||  // (V|N)
-             (V2 == B && MaskedValueIsZero(V1, ~C1->getValue()))))   // (N|V)
+            ((V1 == B &&
+              MaskedValueIsZero(V2, ~C1->getValue(), 0, &I)) || // (V|N)
+             (V2 == B &&
+              MaskedValueIsZero(V1, ~C1->getValue(), 0, &I))))  // (N|V)
           return BinaryOperator::CreateAnd(A,
                                 Builder->getInt(C1->getValue()|C2->getValue()));
         // Or commutes, try both ways.
         if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
-            ((V1 == A && MaskedValueIsZero(V2, ~C2->getValue())) ||  // (V|N)
-             (V2 == A && MaskedValueIsZero(V1, ~C2->getValue()))))   // (N|V)
+            ((V1 == A &&
+              MaskedValueIsZero(V2, ~C2->getValue(), 0, &I)) || // (V|N)
+             (V2 == A &&
+              MaskedValueIsZero(V1, ~C2->getValue(), 0, &I))))  // (N|V)
           return BinaryOperator::CreateAnd(B,
                                 Builder->getInt(C1->getValue()|C2->getValue()));
 
@@ -2068,20 +2212,35 @@
       Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D);
       if (Ret) return Ret;
     }
+    // ((A^B)&1)|(B&-2) -> (A&1) ^ B
+    if (match(A, m_Xor(m_Value(V1), m_Specific(B))) ||
+        match(A, m_Xor(m_Specific(B), m_Value(V1)))) {
+      Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C);
+      if (Ret) return Ret;
+    }
+    // (B&-2)|((A^B)&1) -> (A&1) ^ B
+    if (match(B, m_Xor(m_Specific(A), m_Value(V1))) ||
+        match(B, m_Xor(m_Value(V1), m_Specific(A)))) {
+      Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D);
+      if (Ret) return Ret;
+    }
   }
 
-  // (X >> Z) | (Y >> Z)  -> (X|Y) >> Z  for all shifts.
-  if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
-    if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
-      if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
-          SI0->getOperand(1) == SI1->getOperand(1) &&
-          (SI0->hasOneUse() || SI1->hasOneUse())) {
-        Value *NewOp = Builder->CreateOr(SI0->getOperand(0), SI1->getOperand(0),
-                                         SI0->getName());
-        return BinaryOperator::Create(SI1->getOpcode(), NewOp,
-                                      SI1->getOperand(1));
-      }
-  }
+  // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
+  if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
+    if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
+      if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse())
+        return BinaryOperator::CreateOr(Op0, C);
+
+  // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
+  if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
+    if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
+      if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse())
+        return BinaryOperator::CreateOr(Op1, C);
+
+  // ((B | C) & A) | B -> B | (A & C)
+  if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
+    return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C));
 
   // (~A | ~B) == (~(A & B)) - De Morgan's Law
   if (Value *Op0NotVal = dyn_castNotVal(Op0))
@@ -2133,12 +2292,22 @@
         return BinaryOperator::CreateOr(Not, Op0);
       }
 
+  // (A & B) | ((~A) ^ B) -> (~A ^ B)
+  if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+      match(Op1, m_Xor(m_Not(m_Specific(A)), m_Specific(B))))
+    return BinaryOperator::CreateXor(Builder->CreateNot(A), B);
+
+  // ((~A) ^ B) | (A & B) -> (~A ^ B)
+  if (match(Op0, m_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+      match(Op1, m_And(m_Specific(A), m_Specific(B))))
+    return BinaryOperator::CreateXor(Builder->CreateNot(A), B);
+
   if (SwappedForXor)
     std::swap(Op0, Op1);
 
   if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
     if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
-      if (Value *Res = FoldOrOfICmps(LHS, RHS))
+      if (Value *Res = FoldOrOfICmps(LHS, RHS, &I))
         return ReplaceInstUsesWith(I, Res);
 
   // (fcmp uno x, c) | (fcmp uno y, c)  -> (fcmp uno x, y)
@@ -2169,7 +2338,7 @@
         // cast is otherwise not optimizable.  This happens for vector sexts.
         if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp))
           if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp))
-            if (Value *Res = FoldOrOfICmps(LHS, RHS))
+            if (Value *Res = FoldOrOfICmps(LHS, RHS, &I))
               return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
 
         // If this is or(cast(fcmp), cast(fcmp)), try to fold this even if the
@@ -2225,7 +2394,7 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyXorInst(Op0, Op1, DL))
+  if (Value *V = SimplifyXorInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // (A&B)^(A&C) -> A&(B^C) etc
@@ -2327,7 +2496,8 @@
           }
         } else if (Op0I->getOpcode() == Instruction::Or) {
           // (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
-          if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue())) {
+          if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue(),
+                                0, &I)) {
             Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
             // Anything in both C1 and C2 is known to be zero, remove it from
             // NewRHS.
@@ -2418,18 +2588,6 @@
     }
   }
 
-  // (X >> Z) ^ (Y >> Z)  -> (X^Y) >> Z  for all shifts.
-  if (Op0I && Op1I && Op0I->isShift() &&
-      Op0I->getOpcode() == Op1I->getOpcode() &&
-      Op0I->getOperand(1) == Op1I->getOperand(1) &&
-      (Op0I->hasOneUse() || Op1I->hasOneUse())) {
-    Value *NewOp =
-      Builder->CreateXor(Op0I->getOperand(0), Op1I->getOperand(0),
-                         Op0I->getName());
-    return BinaryOperator::Create(Op1I->getOpcode(), NewOp,
-                                  Op1I->getOperand(1));
-  }
-
   if (Op0I && Op1I) {
     Value *A, *B, *C, *D;
     // (A & B)^(A | B) -> A ^ B
@@ -2444,8 +2602,62 @@
       if ((A == C && B == D) || (A == D && B == C))
         return BinaryOperator::CreateXor(A, B);
     }
+    // (A | ~B) ^ (~A | B) -> A ^ B
+    if (match(Op0I, m_Or(m_Value(A), m_Not(m_Value(B)))) &&
+        match(Op1I, m_Or(m_Not(m_Specific(A)), m_Specific(B)))) {
+      return BinaryOperator::CreateXor(A, B);
+    }
+    // (~A | B) ^ (A | ~B) -> A ^ B
+    if (match(Op0I, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
+        match(Op1I, m_Or(m_Specific(A), m_Not(m_Specific(B))))) {
+      return BinaryOperator::CreateXor(A, B);
+    }
+    // (A & ~B) ^ (~A & B) -> A ^ B
+    if (match(Op0I, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+        match(Op1I, m_And(m_Not(m_Specific(A)), m_Specific(B)))) {
+      return BinaryOperator::CreateXor(A, B);
+    }
+    // (~A & B) ^ (A & ~B) -> A ^ B
+    if (match(Op0I, m_And(m_Not(m_Value(A)), m_Value(B))) &&
+        match(Op1I, m_And(m_Specific(A), m_Not(m_Specific(B))))) {
+      return BinaryOperator::CreateXor(A, B);
+    }
+    // (A ^ C)^(A | B) -> ((~A) & B) ^ C
+    if (match(Op0I, m_Xor(m_Value(D), m_Value(C))) &&
+        match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
+      if (D == A)
+        return BinaryOperator::CreateXor(
+            Builder->CreateAnd(Builder->CreateNot(A), B), C);
+      if (D == B)
+        return BinaryOperator::CreateXor(
+            Builder->CreateAnd(Builder->CreateNot(B), A), C);
+    }
+    // (A | B)^(A ^ C) -> ((~A) & B) ^ C
+    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_Xor(m_Value(D), m_Value(C)))) {
+      if (D == A)
+        return BinaryOperator::CreateXor(
+            Builder->CreateAnd(Builder->CreateNot(A), B), C);
+      if (D == B)
+        return BinaryOperator::CreateXor(
+            Builder->CreateAnd(Builder->CreateNot(B), A), C);
+    }
+    // (A & B) ^ (A ^ B) -> (A | B)
+    if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_Xor(m_Specific(A), m_Specific(B))))
+      return BinaryOperator::CreateOr(A, B);
+    // (A ^ B) ^ (A & B) -> (A | B)
+    if (match(Op0I, m_Xor(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_And(m_Specific(A), m_Specific(B))))
+      return BinaryOperator::CreateOr(A, B);
   }
 
+  Value *A = nullptr, *B = nullptr;
+  // (A & ~B) ^ (~A) -> ~(A & B)
+  if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+      match(Op1, m_Not(m_Specific(A))))
+    return BinaryOperator::CreateNot(Builder->CreateAnd(A, B));
+
   // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
   if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
     if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))

diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 658178d..87e49a1 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp

@@ -16,6 +16,7 @@
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -58,8 +59,8 @@
 }
 
 Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
-  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL);
-  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL);
+  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, AT, MI, DT);
+  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, AT, MI, DT);
   unsigned MinAlign = std::min(DstAlign, SrcAlign);
   unsigned CopyAlign = MI->getAlignment();
 
@@ -154,7 +155,7 @@
 }
 
 Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
-  unsigned Alignment = getKnownAlignment(MI->getDest(), DL);
+  unsigned Alignment = getKnownAlignment(MI->getDest(), DL, AT, MI, DT);
   if (MI->getAlignment() < Alignment) {
     MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
                                              Alignment, false));
@@ -322,7 +323,7 @@
     uint32_t BitWidth = IT->getBitWidth();
     APInt KnownZero(BitWidth, 0);
     APInt KnownOne(BitWidth, 0);
-    computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne);
+    computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne, 0, II);
     unsigned TrailingZeros = KnownOne.countTrailingZeros();
     APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros));
     if ((Mask & KnownZero) == Mask)
@@ -340,7 +341,7 @@
     uint32_t BitWidth = IT->getBitWidth();
     APInt KnownZero(BitWidth, 0);
     APInt KnownOne(BitWidth, 0);
-    computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne);
+    computeKnownBits(II->getArgOperand(0), KnownZero, KnownOne, 0, II);
     unsigned LeadingZeros = KnownOne.countLeadingZeros();
     APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros));
     if ((Mask & KnownZero) == Mask)
@@ -355,14 +356,14 @@
     uint32_t BitWidth = IT->getBitWidth();
     APInt LHSKnownZero(BitWidth, 0);
     APInt LHSKnownOne(BitWidth, 0);
-    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
+    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, II);
     bool LHSKnownNegative = LHSKnownOne[BitWidth - 1];
     bool LHSKnownPositive = LHSKnownZero[BitWidth - 1];
 
     if (LHSKnownNegative || LHSKnownPositive) {
       APInt RHSKnownZero(BitWidth, 0);
       APInt RHSKnownOne(BitWidth, 0);
-      computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
+      computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, II);
       bool RHSKnownNegative = RHSKnownOne[BitWidth - 1];
       bool RHSKnownPositive = RHSKnownZero[BitWidth - 1];
       if (LHSKnownNegative && RHSKnownNegative) {
@@ -426,7 +427,7 @@
     // can prove that it will never overflow.
     if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow) {
       Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
-      if (WillNotOverflowSignedAdd(LHS, RHS)) {
+      if (WillNotOverflowSignedAdd(LHS, RHS, II)) {
         Value *Add = Builder->CreateNSWAdd(LHS, RHS);
         Add->takeName(&CI);
         Constant *V[] = {UndefValue::get(Add->getType()), Builder->getFalse()};
@@ -464,10 +465,10 @@
 
     APInt LHSKnownZero(BitWidth, 0);
     APInt LHSKnownOne(BitWidth, 0);
-    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne);
+    computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, II);
     APInt RHSKnownZero(BitWidth, 0);
     APInt RHSKnownOne(BitWidth, 0);
-    computeKnownBits(RHS, RHSKnownZero, RHSKnownOne);
+    computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, II);
 
     // Get the largest possible values for each operand.
     APInt LHSMax = ~LHSKnownZero;
@@ -518,30 +519,131 @@
       }
     }
     break;
+  case Intrinsic::minnum:
+  case Intrinsic::maxnum: {
+    Value *Arg0 = II->getArgOperand(0);
+    Value *Arg1 = II->getArgOperand(1);
+
+    // fmin(x, x) -> x
+    if (Arg0 == Arg1)
+      return ReplaceInstUsesWith(CI, Arg0);
+
+    const ConstantFP *C0 = dyn_cast<ConstantFP>(Arg0);
+    const ConstantFP *C1 = dyn_cast<ConstantFP>(Arg1);
+
+    // Canonicalize constants into the RHS.
+    if (C0 && !C1) {
+      II->setArgOperand(0, Arg1);
+      II->setArgOperand(1, Arg0);
+      return II;
+    }
+
+    // fmin(x, nan) -> x
+    if (C1 && C1->isNaN())
+      return ReplaceInstUsesWith(CI, Arg0);
+
+    // This is the value because if undef were NaN, we would return the other
+    // value and cannot return a NaN unless both operands are.
+    //
+    // fmin(undef, x) -> x
+    if (isa<UndefValue>(Arg0))
+      return ReplaceInstUsesWith(CI, Arg1);
+
+    // fmin(x, undef) -> x
+    if (isa<UndefValue>(Arg1))
+      return ReplaceInstUsesWith(CI, Arg0);
+
+    Value *X = nullptr;
+    Value *Y = nullptr;
+    if (II->getIntrinsicID() == Intrinsic::minnum) {
+      // fmin(x, fmin(x, y)) -> fmin(x, y)
+      // fmin(y, fmin(x, y)) -> fmin(x, y)
+      if (match(Arg1, m_FMin(m_Value(X), m_Value(Y)))) {
+        if (Arg0 == X || Arg0 == Y)
+          return ReplaceInstUsesWith(CI, Arg1);
+      }
+
+      // fmin(fmin(x, y), x) -> fmin(x, y)
+      // fmin(fmin(x, y), y) -> fmin(x, y)
+      if (match(Arg0, m_FMin(m_Value(X), m_Value(Y)))) {
+        if (Arg1 == X || Arg1 == Y)
+          return ReplaceInstUsesWith(CI, Arg0);
+      }
+
+      // TODO: fmin(nnan x, inf) -> x
+      // TODO: fmin(nnan ninf x, flt_max) -> x
+      if (C1 && C1->isInfinity()) {
+        // fmin(x, -inf) -> -inf
+        if (C1->isNegative())
+          return ReplaceInstUsesWith(CI, Arg1);
+      }
+    } else {
+      assert(II->getIntrinsicID() == Intrinsic::maxnum);
+      // fmax(x, fmax(x, y)) -> fmax(x, y)
+      // fmax(y, fmax(x, y)) -> fmax(x, y)
+      if (match(Arg1, m_FMax(m_Value(X), m_Value(Y)))) {
+        if (Arg0 == X || Arg0 == Y)
+          return ReplaceInstUsesWith(CI, Arg1);
+      }
+
+      // fmax(fmax(x, y), x) -> fmax(x, y)
+      // fmax(fmax(x, y), y) -> fmax(x, y)
+      if (match(Arg0, m_FMax(m_Value(X), m_Value(Y)))) {
+        if (Arg1 == X || Arg1 == Y)
+          return ReplaceInstUsesWith(CI, Arg0);
+      }
+
+      // TODO: fmax(nnan x, -inf) -> x
+      // TODO: fmax(nnan ninf x, -flt_max) -> x
+      if (C1 && C1->isInfinity()) {
+        // fmax(x, inf) -> inf
+        if (!C1->isNegative())
+          return ReplaceInstUsesWith(CI, Arg1);
+      }
+    }
+    break;
+  }
   case Intrinsic::ppc_altivec_lvx:
   case Intrinsic::ppc_altivec_lvxl:
     // Turn PPC lvx -> load if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL) >= 16) {
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16,
+                                   DL, AT, II, DT) >= 16) {
       Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
                                          PointerType::getUnqual(II->getType()));
       return new LoadInst(Ptr);
     }
     break;
+  case Intrinsic::ppc_vsx_lxvw4x:
+  case Intrinsic::ppc_vsx_lxvd2x: {
+    // Turn PPC VSX loads into normal loads.
+    Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+                                        PointerType::getUnqual(II->getType()));
+    return new LoadInst(Ptr, Twine(""), false, 1);
+  }
   case Intrinsic::ppc_altivec_stvx:
   case Intrinsic::ppc_altivec_stvxl:
     // Turn stvx -> store if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL) >= 16) {
+    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16,
+                                   DL, AT, II, DT) >= 16) {
       Type *OpPtrTy =
         PointerType::getUnqual(II->getArgOperand(0)->getType());
       Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
       return new StoreInst(II->getArgOperand(0), Ptr);
     }
     break;
+  case Intrinsic::ppc_vsx_stxvw4x:
+  case Intrinsic::ppc_vsx_stxvd2x: {
+    // Turn PPC VSX stores into normal stores.
+    Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
+    Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+    return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
+  }
   case Intrinsic::x86_sse_storeu_ps:
   case Intrinsic::x86_sse2_storeu_pd:
   case Intrinsic::x86_sse2_storeu_dq:
     // Turn X86 storeu -> store if the pointer is known aligned.
-    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL) >= 16) {
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16,
+                                   DL, AT, II, DT) >= 16) {
       Type *OpPtrTy =
         PointerType::getUnqual(II->getArgOperand(1)->getType());
       Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
@@ -680,7 +782,7 @@
               CI,
               Builder->CreateShuffleVector(
                   Vec, Undef, ConstantDataVector::get(
-                                  II->getContext(), ArrayRef<uint32_t>(Mask))));
+                                  II->getContext(), makeArrayRef(Mask))));
 
         } else if (auto Source =
                        dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
@@ -886,7 +988,7 @@
   case Intrinsic::arm_neon_vst2lane:
   case Intrinsic::arm_neon_vst3lane:
   case Intrinsic::arm_neon_vst4lane: {
-    unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL);
+    unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), DL, AT, II, DT);
     unsigned AlignArg = II->getNumArgOperands() - 1;
     ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
     if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
@@ -994,6 +1096,55 @@
       return EraseInstFromFunction(CI);
     break;
   }
+  case Intrinsic::assume: {
+    // Canonicalize assume(a && b) -> assume(a); assume(b);
+    // Note: New assumption intrinsics created here are registered by
+    // the InstCombineIRInserter object.
+    Value *IIOperand = II->getArgOperand(0), *A, *B,
+          *AssumeIntrinsic = II->getCalledValue();
+    if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) {
+      Builder->CreateCall(AssumeIntrinsic, A, II->getName());
+      Builder->CreateCall(AssumeIntrinsic, B, II->getName());
+      return EraseInstFromFunction(*II);
+    }
+    // assume(!(a || b)) -> assume(!a); assume(!b);
+    if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) {
+      Builder->CreateCall(AssumeIntrinsic, Builder->CreateNot(A),
+                          II->getName());
+      Builder->CreateCall(AssumeIntrinsic, Builder->CreateNot(B),
+                          II->getName());
+      return EraseInstFromFunction(*II);
+    }
+
+    // assume( (load addr) != null ) -> add 'nonnull' metadata to load
+    // (if assume is valid at the load)
+    if (ICmpInst* ICmp = dyn_cast<ICmpInst>(IIOperand)) {
+      Value *LHS = ICmp->getOperand(0);
+      Value *RHS = ICmp->getOperand(1);
+      if (ICmpInst::ICMP_NE == ICmp->getPredicate() &&
+          isa<LoadInst>(LHS) &&
+          isa<Constant>(RHS) &&
+          RHS->getType()->isPointerTy() &&
+          cast<Constant>(RHS)->isNullValue()) {
+        LoadInst* LI = cast<LoadInst>(LHS);
+        if (isValidAssumeForContext(II, LI, DL, DT)) {
+          MDNode* MD = MDNode::get(II->getContext(), ArrayRef<Value*>());
+          LI->setMetadata(LLVMContext::MD_nonnull, MD);
+          return EraseInstFromFunction(*II);
+        }
+      }
+      // TODO: apply nonnull return attributes to calls and invokes
+      // TODO: apply range metadata for range check patterns?
+    }
+    // If there is a dominating assume with the same condition as this one,
+    // then this one is redundant, and should be removed.
+    APInt KnownZero(1, 0), KnownOne(1, 0);
+    computeKnownBits(IIOperand, KnownZero, KnownOne, 0, II);
+    if (KnownOne.isAllOnesValue())
+      return EraseInstFromFunction(*II);
+
+    break;
+  }
   }
 
   return visitCallSite(II);
@@ -1253,7 +1404,7 @@
       if (!Caller->use_empty() &&
           // void -> non-void is handled specially
           !NewRetTy->isVoidTy())
-      return false;   // Cannot transform this return value.
+        return false;   // Cannot transform this return value.
     }
 
     if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
@@ -1472,8 +1623,14 @@
 
   if (!Caller->use_empty())
     ReplaceInstUsesWith(*Caller, NV);
-  else if (Caller->hasValueHandle())
-    ValueHandleBase::ValueIsRAUWd(Caller, NV);
+  else if (Caller->hasValueHandle()) {
+    if (OldRetTy == NV->getType())
+      ValueHandleBase::ValueIsRAUWd(Caller, NV);
+    else
+      // We cannot call ValueIsRAUWd with a different type, and the
+      // actual tracked value will disappear.
+      ValueHandleBase::ValueIsDeleted(Caller);
+  }
 
   EraseInstFromFunction(*Caller);
   return true;

diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index ff083d7..aba77bb 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp

@@ -335,7 +335,8 @@
 ///
 /// This function works on both vectors and scalars.
 ///
-static bool CanEvaluateTruncated(Value *V, Type *Ty) {
+static bool CanEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
+                                 Instruction *CxtI) {
   // We can always evaluate constants in another type.
   if (isa<Constant>(V))
     return true;
@@ -364,8 +365,8 @@
   case Instruction::Or:
   case Instruction::Xor:
     // These operators can all arbitrarily be extended or truncated.
-    return CanEvaluateTruncated(I->getOperand(0), Ty) &&
-           CanEvaluateTruncated(I->getOperand(1), Ty);
+    return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+           CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
 
   case Instruction::UDiv:
   case Instruction::URem: {
@@ -374,10 +375,10 @@
     uint32_t BitWidth = Ty->getScalarSizeInBits();
     if (BitWidth < OrigBitWidth) {
       APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth);
-      if (MaskedValueIsZero(I->getOperand(0), Mask) &&
-          MaskedValueIsZero(I->getOperand(1), Mask)) {
-        return CanEvaluateTruncated(I->getOperand(0), Ty) &&
-               CanEvaluateTruncated(I->getOperand(1), Ty);
+      if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
+          IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
+        return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+               CanEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
       }
     }
     break;
@@ -388,7 +389,7 @@
     if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
       uint32_t BitWidth = Ty->getScalarSizeInBits();
       if (CI->getLimitedValue(BitWidth) < BitWidth)
-        return CanEvaluateTruncated(I->getOperand(0), Ty);
+        return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
     }
     break;
   case Instruction::LShr:
@@ -398,10 +399,10 @@
     if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
       uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
       uint32_t BitWidth = Ty->getScalarSizeInBits();
-      if (MaskedValueIsZero(I->getOperand(0),
-            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
+      if (IC.MaskedValueIsZero(I->getOperand(0),
+            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth), 0, CxtI) &&
           CI->getLimitedValue(BitWidth) < BitWidth) {
-        return CanEvaluateTruncated(I->getOperand(0), Ty);
+        return CanEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
       }
     }
     break;
@@ -415,8 +416,8 @@
     return true;
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
-    return CanEvaluateTruncated(SI->getTrueValue(), Ty) &&
-           CanEvaluateTruncated(SI->getFalseValue(), Ty);
+    return CanEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) &&
+           CanEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI);
   }
   case Instruction::PHI: {
     // We can change a phi if we can change all operands.  Note that we never
@@ -424,7 +425,7 @@
     // instructions with a single use.
     PHINode *PN = cast<PHINode>(I);
     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (!CanEvaluateTruncated(PN->getIncomingValue(i), Ty))
+      if (!CanEvaluateTruncated(PN->getIncomingValue(i), Ty, IC, CxtI))
         return false;
     return true;
   }
@@ -453,7 +454,7 @@
   // expression tree to something weird like i93 unless the source is also
   // strange.
   if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
-      CanEvaluateTruncated(Src, DestTy)) {
+      CanEvaluateTruncated(Src, DestTy, *this, &CI)) {
 
     // If this cast is a truncate, evaluting in a different type always
     // eliminates the cast, so it is always a win.
@@ -553,7 +554,7 @@
       // If Op1C some other power of two, convert:
       uint32_t BitWidth = Op1C->getType()->getBitWidth();
       APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-      computeKnownBits(ICI->getOperand(0), KnownZero, KnownOne);
+      computeKnownBits(ICI->getOperand(0), KnownZero, KnownOne, 0, &CI);
 
       APInt KnownZeroMask(~KnownZero);
       if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
@@ -601,8 +602,8 @@
 
       APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0);
       APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0);
-      computeKnownBits(LHS, KnownZeroLHS, KnownOneLHS);
-      computeKnownBits(RHS, KnownZeroRHS, KnownOneRHS);
+      computeKnownBits(LHS, KnownZeroLHS, KnownOneLHS, 0, &CI);
+      computeKnownBits(RHS, KnownZeroRHS, KnownOneRHS, 0, &CI);
 
       if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) {
         APInt KnownBits = KnownZeroLHS | KnownOneLHS;
@@ -651,7 +652,8 @@
 /// clear the top bits anyway, doing this has no extra cost.
 ///
 /// This function works on both vectors and scalars.
-static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
+static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
+                             InstCombiner &IC, Instruction *CxtI) {
   BitsToClear = 0;
   if (isa<Constant>(V))
     return true;
@@ -680,8 +682,8 @@
   case Instruction::Add:
   case Instruction::Sub:
   case Instruction::Mul:
-    if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear) ||
-        !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp))
+    if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) ||
+        !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI))
       return false;
     // These can all be promoted if neither operand has 'bits to clear'.
     if (BitsToClear == 0 && Tmp == 0)
@@ -695,8 +697,9 @@
       // We use MaskedValueIsZero here for generality, but the case we care
       // about the most is constant RHS.
       unsigned VSize = V->getType()->getScalarSizeInBits();
-      if (MaskedValueIsZero(I->getOperand(1),
-                            APInt::getHighBitsSet(VSize, BitsToClear)))
+      if (IC.MaskedValueIsZero(I->getOperand(1),
+                               APInt::getHighBitsSet(VSize, BitsToClear),
+                               0, CxtI))
         return true;
     }
 
@@ -707,7 +710,7 @@
     // We can promote shl(x, cst) if we can promote x.  Since shl overwrites the
     // upper bits we can reduce BitsToClear by the shift amount.
     if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear))
+      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
         return false;
       uint64_t ShiftAmt = Amt->getZExtValue();
       BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0;
@@ -718,7 +721,7 @@
     // We can promote lshr(x, cst) if we can promote x.  This requires the
     // ultimate 'and' to clear out the high zero bits we're clearing out though.
     if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
-      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear))
+      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
         return false;
       BitsToClear += Amt->getZExtValue();
       if (BitsToClear > V->getType()->getScalarSizeInBits())
@@ -728,8 +731,8 @@
     // Cannot promote variable LSHR.
     return false;
   case Instruction::Select:
-    if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp) ||
-        !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear) ||
+    if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) ||
+        !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) ||
         // TODO: If important, we could handle the case when the BitsToClear are
         // known zero in the disagreeing side.
         Tmp != BitsToClear)
@@ -741,10 +744,10 @@
     // get into trouble with cyclic PHIs here because we only consider
     // instructions with a single use.
     PHINode *PN = cast<PHINode>(I);
-    if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear))
+    if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI))
       return false;
     for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp) ||
+      if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) ||
           // TODO: If important, we could handle the case when the BitsToClear
           // are known zero in the disagreeing input.
           Tmp != BitsToClear)
@@ -781,7 +784,7 @@
   // strange.
   unsigned BitsToClear;
   if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
-      CanEvaluateZExtd(Src, DestTy, BitsToClear)) {
+      CanEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
     assert(BitsToClear < SrcTy->getScalarSizeInBits() &&
            "Unreasonable BitsToClear");
 
@@ -796,8 +799,10 @@
 
     // If the high bits are already filled with zeros, just replace this
     // cast with the result.
-    if (MaskedValueIsZero(Res, APInt::getHighBitsSet(DestBitSize,
-                                                     DestBitSize-SrcBitsKept)))
+    if (MaskedValueIsZero(Res,
+                          APInt::getHighBitsSet(DestBitSize,
+                                                DestBitSize-SrcBitsKept),
+                             0, &CI))
       return ReplaceInstUsesWith(CI, Res);
 
     // We need to emit an AND to clear the high bits.
@@ -895,6 +900,10 @@
   Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
   ICmpInst::Predicate Pred = ICI->getPredicate();
 
+  // Don't bother if Op1 isn't of vector or integer type.
+  if (!Op1->getType()->isIntOrIntVectorTy())
+    return nullptr;
+
   if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
     // (x <s  0) ? -1 : 0 -> ashr x, 31        -> all ones if negative
     // (x >s -1) ? -1 : 0 -> not (ashr x, 31)  -> all ones if positive
@@ -921,7 +930,7 @@
         ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
       unsigned BitWidth = Op1C->getType()->getBitWidth();
       APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-      computeKnownBits(Op0, KnownZero, KnownOne);
+      computeKnownBits(Op0, KnownZero, KnownOne, 0, &CI);
 
       APInt KnownZeroMask(~KnownZero);
       if (KnownZeroMask.isPowerOf2()) {
@@ -1072,7 +1081,7 @@
 
     // If the high bits are already filled with sign bit, just replace this
     // cast with the result.
-    if (ComputeNumSignBits(Res) > DestBitSize - SrcBitSize)
+    if (ComputeNumSignBits(Res, 0, &CI) > DestBitSize - SrcBitSize)
       return ReplaceInstUsesWith(CI, Res);
 
     // We need to emit a shl + ashr to do the sign extend.
@@ -1264,10 +1273,12 @@
           LHSOrig = Builder->CreateFPExt(LHSOrig, RHSOrig->getType());
         else if (RHSWidth <= SrcWidth)
           RHSOrig = Builder->CreateFPExt(RHSOrig, LHSOrig->getType());
-        Value *ExactResult = Builder->CreateFRem(LHSOrig, RHSOrig);
-        if (Instruction *RI = dyn_cast<Instruction>(ExactResult))
-          RI->copyFastMathFlags(OpI);
-        return CastInst::CreateFPCast(ExactResult, CI.getType());
+        if (LHSOrig != OpI->getOperand(0) || RHSOrig != OpI->getOperand(1)) {
+          Value *ExactResult = Builder->CreateFRem(LHSOrig, RHSOrig);
+          if (Instruction *RI = dyn_cast<Instruction>(ExactResult))
+            RI->copyFastMathFlags(OpI);
+          return CastInst::CreateFPCast(ExactResult, CI.getType());
+        }
     }
 
     // (fptrunc (fneg x)) -> (fneg (fptrunc x))
@@ -1312,42 +1323,6 @@
     }
   }
 
-  // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x)
-  // Note that we restrict this transformation based on
-  // TLI->has(LibFunc::sqrtf), even for the sqrt intrinsic, because
-  // TLI->has(LibFunc::sqrtf) is sufficient to guarantee that the
-  // single-precision intrinsic can be expanded in the backend.
-  CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0));
-  if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) &&
-      (Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) ||
-       Call->getCalledFunction()->getIntrinsicID() == Intrinsic::sqrt) &&
-      Call->getNumArgOperands() == 1 &&
-      Call->hasOneUse()) {
-    CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0));
-    if (Arg && Arg->getOpcode() == Instruction::FPExt &&
-        CI.getType()->isFloatTy() &&
-        Call->getType()->isDoubleTy() &&
-        Arg->getType()->isDoubleTy() &&
-        Arg->getOperand(0)->getType()->isFloatTy()) {
-      Function *Callee = Call->getCalledFunction();
-      Module *M = CI.getParent()->getParent()->getParent();
-      Constant *SqrtfFunc = (Callee->getIntrinsicID() == Intrinsic::sqrt) ?
-        Intrinsic::getDeclaration(M, Intrinsic::sqrt, Builder->getFloatTy()) :
-        M->getOrInsertFunction("sqrtf", Callee->getAttributes(),
-                               Builder->getFloatTy(), Builder->getFloatTy(),
-                               NULL);
-      CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0),
-                                       "sqrtfcall");
-      ret->setAttributes(Callee->getAttributes());
-
-
-      // Remove the old Call.  With -fmath-errno, it won't get marked readnone.
-      ReplaceInstUsesWith(*Call, UndefValue::get(Call->getType()));
-      EraseInstFromFunction(*Call);
-      return ret;
-    }
-  }
-
   return nullptr;
 }
 
@@ -1909,9 +1884,9 @@
 }
 
 Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
-  // If the destination pointer element type is not the the same as the source's
-  // do the addrspacecast to the same type, and then the bitcast in the new
-  // address space. This allows the cast to be exposed to other transforms.
+  // If the destination pointer element type is not the same as the source's
+  // first do a bitcast to the destination type, and then the addrspacecast.
+  // This allows the cast to be exposed to other transforms.
   Value *Src = CI.getOperand(0);
   PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType());
   PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType());

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 5e71c5c..399f1c3 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp

@@ -740,21 +740,6 @@
 Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,
                                             Value *X, ConstantInt *CI,
                                             ICmpInst::Predicate Pred) {
-  // If we have X+0, exit early (simplifying logic below) and let it get folded
-  // elsewhere.   icmp X+0, X  -> icmp X, X
-  if (CI->isZero()) {
-    bool isTrue = ICmpInst::isTrueWhenEqual(Pred);
-    return ReplaceInstUsesWith(ICI, ConstantInt::get(ICI.getType(), isTrue));
-  }
-
-  // (X+4) == X -> false.
-  if (Pred == ICmpInst::ICMP_EQ)
-    return ReplaceInstUsesWith(ICI, Builder->getFalse());
-
-  // (X+4) != X -> true.
-  if (Pred == ICmpInst::ICMP_NE)
-    return ReplaceInstUsesWith(ICI, Builder->getTrue());
-
   // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
   // so the values can never be equal.  Similarly for all other "or equals"
   // operators.
@@ -1044,6 +1029,111 @@
   return nullptr;
 }
 
+/// FoldICmpCstShrCst - Fold "(icmp eq/ne (ashr/lshr const2, A), const1)" into
+/// a compare of A against the shift amount that maps const2 onto const1, or
+/// into a constant when no amount can.  CI1 is const1 and CI2 is const2.
+/// Returns nullptr for the cases that are deliberately not handled here.
+Instruction *InstCombiner::FoldICmpCstShrCst(ICmpInst &I, Value *Op, Value *A,
+                                             ConstantInt *CI1,
+                                             ConstantInt *CI2) {
+  assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+  // Both helpers are phrased for 'eq' and flip their answer for 'ne'.
+  auto getConstant = [&I, this](bool IsTrue) {
+    if (I.getPredicate() == I.ICMP_NE)
+      IsTrue = !IsTrue;
+    return ReplaceInstUsesWith(I, ConstantInt::get(I.getType(), IsTrue));
+  };
+  auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
+    if (I.getPredicate() == I.ICMP_NE)
+      Pred = CmpInst::getInversePredicate(Pred);
+    return new ICmpInst(Pred, LHS, RHS);
+  };
+
+  APInt AP1 = CI1->getValue();
+  APInt AP2 = CI2->getValue();
+
+  // Don't bother doing any work for cases which InstSimplify handles.
+  if (AP2 == 0)
+    return nullptr;
+  bool IsAShr = isa<AShrOperator>(Op);
+  if (IsAShr) {
+    if (AP2.isAllOnesValue())
+      return nullptr;
+    if (AP2.isNegative() != AP1.isNegative())
+      return nullptr;
+    if (AP2.sgt(AP1))
+      return nullptr;
+  }
+
+  if (!AP1)
+    // 'A' must be large enough to shift out the highest set bit.
+    return getICmp(I.ICMP_UGT, A,
+                   ConstantInt::get(A->getType(), AP2.logBase2()));
+
+  if (AP1 == AP2)
+    return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+
+  // Distance between the leading runs of sign bits (ashr of negative
+  // constants) or of zero bits.  Counting bits keeps AP1 == -1 well defined.
+  bool BothNegative = IsAShr && AP1.isNegative();
+  int Shift = BothNegative ? AP1.countLeadingOnes() - AP2.countLeadingOnes()
+                           : AP1.countLeadingZeros() - AP2.countLeadingZeros();
+  if (Shift > 0 && AP1 == (IsAShr ? AP2.ashr(Shift) : AP2.lshr(Shift))) {
+    // An ashr that has reached -1 stays at -1 for every larger amount, so any
+    // A >= Shift matches unless const2 has only its sign bit set.
+    bool Saturates = IsAShr && AP1.isAllOnesValue() && !AP2.isPowerOf2();
+    return getICmp(Saturates ? I.ICMP_UGE : I.ICMP_EQ, A,
+                   ConstantInt::get(A->getType(), Shift));
+  }
+  // Shifting const2 will never be equal to const1.
+  return getConstant(false);
+}
+
+/// FoldICmpCstShlCst - Fold "(icmp eq/ne (shl const2, A), const1)" into a
+/// compare of A against the shift amount that maps const2 onto const1, or
+/// into a constant when no amount can.  CI1 is const1 and CI2 is const2.
+/// Returns nullptr when const2 is zero.
+Instruction *InstCombiner::FoldICmpCstShlCst(ICmpInst &I, Value *Op, Value *A,
+                                             ConstantInt *CI1,
+                                             ConstantInt *CI2) {
+  assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+  // Every result is built for 'eq' and inverted when the predicate is 'ne'.
+  const bool IsNE = I.getPredicate() == I.ICMP_NE;
+  auto MakeCmp = [IsNE](CmpInst::Predicate P, Value *L, Value *R) {
+    return new ICmpInst(IsNE ? CmpInst::getInversePredicate(P) : P, L, R);
+  };
+
+  const APInt &Cmp = CI1->getValue();
+  const APInt &Base = CI2->getValue();
+  Type *AmtTy = A->getType();
+
+  // A zero shifted constant is one of the cases InstSimplify handles.
+  if (!Base)
+    return nullptr;
+
+  const unsigned Width = Base.getBitWidth();
+  const unsigned BaseTZ = Base.countTrailingZeros();
+
+  // Comparing against zero: A must shift every set bit of const2 out.
+  if (Cmp == 0 && BaseTZ != 0)
+    return MakeCmp(I.ICMP_UGE, A, ConstantInt::get(AmtTy, Width - BaseTZ));
+
+  // Identical constants compare equal exactly when nothing is shifted.
+  if (Cmp == Base)
+    return MakeCmp(I.ICMP_EQ, A, ConstantInt::getNullValue(AmtTy));
+
+  // Get the distance between the lowest bits that are set.
+  int Shift = Cmp.countTrailingZeros() - BaseTZ;
+  if (Shift > 0 && Base.shl(Shift) == Cmp)
+    return MakeCmp(I.ICMP_EQ, A, ConstantInt::get(AmtTy, Shift));
+
+  // Shifting const2 will never be equal to const1, so 'eq' folds to false
+  // and 'ne' folds to true.
+  Constant *Never = ConstantInt::get(I.getType(), IsNE);
+  return ReplaceInstUsesWith(I, Never);
+}
 
 /// visitICmpInstWithInstAndIntCst - Handle "icmp (instr, intcst)".
 ///
@@ -1060,7 +1150,7 @@
       unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(),
              SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
       APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
-      computeKnownBits(LHSI->getOperand(0), KnownZero, KnownOne);
+      computeKnownBits(LHSI->getOperand(0), KnownZero, KnownOne, 0, &ICI);
 
       // If all the high bits are known, we can do this xform.
       if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) {
@@ -1282,6 +1372,48 @@
         return &ICI;
       }
 
+      // (icmp pred (and (or (lshr X, Y), X), 1), 0) -->
+      //    (icmp pred (and X, (or (shl 1, Y), 1)), 0)
+      //
+      // iff pred isn't signed
+      {
+        Value *X, *Y, *LShr;
+        if (!ICI.isSigned() && RHSV == 0) {
+          if (match(LHSI->getOperand(1), m_One())) {
+            Constant *One = cast<Constant>(LHSI->getOperand(1));
+            Value *Or = LHSI->getOperand(0);
+            if (match(Or, m_Or(m_Value(LShr), m_Value(X))) &&
+                match(LShr, m_LShr(m_Specific(X), m_Value(Y)))) {
+              unsigned UsesRemoved = 0;
+              if (LHSI->hasOneUse())
+                ++UsesRemoved;
+              if (Or->hasOneUse())
+                ++UsesRemoved;
+              if (LShr->hasOneUse())
+                ++UsesRemoved;
+              Value *NewOr = nullptr;
+              // Compute X & ((1 << Y) | 1)
+              if (auto *C = dyn_cast<Constant>(Y)) {
+                if (UsesRemoved >= 1)
+                  NewOr =
+                      ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
+              } else {
+                if (UsesRemoved >= 3)
+                  NewOr = Builder->CreateOr(Builder->CreateShl(One, Y,
+                                                               LShr->getName(),
+                                                               /*HasNUW=*/true),
+                                            One, Or->getName());
+              }
+              if (NewOr) {
+                Value *NewAnd = Builder->CreateAnd(X, NewOr, LHSI->getName());
+                ICI.setOperand(0, NewAnd);
+                return &ICI;
+              }
+            }
+          }
+        }
+      }
+
       // Replace ((X & AndCst) > RHSV) with ((X & AndCst) != 0), if any
       // bit set in (X & AndCst) will produce a result greater than RHSV.
       if (ICI.getPredicate() == ICmpInst::ICMP_UGT) {
@@ -1377,16 +1509,10 @@
           unsigned RHSLog2 = RHSV.logBase2();
 
           // (1 << X) >= 2147483648 -> X >= 31 -> X == 31
-          // (1 << X) >  2147483648 -> X >  31 -> false
-          // (1 << X) <= 2147483648 -> X <= 31 -> true
           // (1 << X) <  2147483648 -> X <  31 -> X != 31
           if (RHSLog2 == TypeBits-1) {
             if (Pred == ICmpInst::ICMP_UGE)
               Pred = ICmpInst::ICMP_EQ;
-            else if (Pred == ICmpInst::ICMP_UGT)
-              return ReplaceInstUsesWith(ICI, Builder->getFalse());
-            else if (Pred == ICmpInst::ICMP_ULE)
-              return ReplaceInstUsesWith(ICI, Builder->getTrue());
             else if (Pred == ICmpInst::ICMP_ULT)
               Pred = ICmpInst::ICMP_NE;
           }
@@ -1421,10 +1547,6 @@
           if (RHSVIsPowerOf2)
             return new ICmpInst(
                 Pred, X, ConstantInt::get(RHS->getType(), RHSV.logBase2()));
-
-          return ReplaceInstUsesWith(
-              ICI, Pred == ICmpInst::ICMP_EQ ? Builder->getFalse()
-                                             : Builder->getTrue());
         }
       }
       break;
@@ -1932,8 +2054,8 @@
   // sign-extended; check for that condition. For example, if CI2 is 2^31 and
   // the operands of the add are 64 bits wide, we need at least 33 sign bits.
   unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
-  if (IC.ComputeNumSignBits(A) < NeededSignBits ||
-      IC.ComputeNumSignBits(B) < NeededSignBits)
+  if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
+      IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
     return nullptr;
 
   // In order to replace the original add with a narrower
@@ -2038,8 +2160,8 @@
   Instruction *MulInstr = cast<Instruction>(MulVal);
   assert(MulInstr->getOpcode() == Instruction::Mul);
 
-  Instruction *LHS = cast<Instruction>(MulInstr->getOperand(0)),
-              *RHS = cast<Instruction>(MulInstr->getOperand(1));
+  auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)),
+       *RHS = cast<ZExtOperator>(MulInstr->getOperand(1));
   assert(LHS->getOpcode() == Instruction::ZExt);
   assert(RHS->getOpcode() == Instruction::ZExt);
   Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);
@@ -2341,7 +2463,7 @@
     Changed = true;
   }
 
-  if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL))
+  if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // comparing -val or val with non-zero is the same as just comparing val
@@ -2469,6 +2591,21 @@
                           Builder->getInt(CI->getValue()-1));
     }
 
+    if (I.isEquality()) {
+      ConstantInt *CI2;
+      if (match(Op0, m_AShr(m_ConstantInt(CI2), m_Value(A))) ||
+          match(Op0, m_LShr(m_ConstantInt(CI2), m_Value(A)))) {
+        // (icmp eq/ne (ashr/lshr const2, A), const1)
+        if (Instruction *Inst = FoldICmpCstShrCst(I, Op0, A, CI, CI2))
+          return Inst;
+      }
+      if (match(Op0, m_Shl(m_ConstantInt(CI2), m_Value(A)))) {
+        // (icmp eq/ne (shl const2, A), const1)
+        if (Instruction *Inst = FoldICmpCstShlCst(I, Op0, A, CI, CI2))
+          return Inst;
+      }
+    }
+
     // If this comparison is a normal comparison, it demands all
     // bits, if it is a sign bit comparison, it only demands the sign bit.
     bool UnusedBit;
@@ -2878,6 +3015,12 @@
     if (BO1 && BO1->getOpcode() == Instruction::Add)
       C = BO1->getOperand(0), D = BO1->getOperand(1);
 
+    // icmp (X+cst) < 0 --> X < -cst
+    if (NoOp0WrapProblem && ICmpInst::isSigned(Pred) && match(Op1, m_Zero()))
+      if (ConstantInt *RHSC = dyn_cast_or_null<ConstantInt>(B))
+        if (!RHSC->isMinValue(/*isSigned=*/true))
+          return new ICmpInst(Pred, A, ConstantExpr::getNeg(RHSC));
+
     // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
     if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
       return new ICmpInst(Pred, A == Op1 ? B : A,
@@ -3112,7 +3255,9 @@
     // and       (A & ~B) != 0 --> (A & B) == 0
     // if A is a power of 2.
     if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
-        match(Op1, m_Zero()) && isKnownToBeAPowerOfTwo(A) && I.isEquality())
+        match(Op1, m_Zero()) && isKnownToBeAPowerOfTwo(A, false,
+                                                       0, AT, &I, DT) &&
+                                I.isEquality())
       return new ICmpInst(I.getInversePredicate(),
                           Builder->CreateAnd(A, B),
                           Op1);
@@ -3273,6 +3418,22 @@
     }
   }
 
+  // The 'cmpxchg' instruction returns an aggregate containing the old value and
+  // an i1 which indicates whether or not we successfully did the swap.
+  //
+  // Replace comparisons between the old value and the expected value with the
+  // indicator that 'cmpxchg' returns.
+  //
+  // N.B.  This transform is only valid when the 'cmpxchg' is not permitted to
+  // spuriously fail.  A weak 'cmpxchg' may fail spuriously: the old value may
+  // equal the expected value even though the swap did not occur.
+  if (I.getPredicate() == ICmpInst::ICMP_EQ)
+    if (auto *EVI = dyn_cast<ExtractValueInst>(Op0))
+      if (auto *ACXI = dyn_cast<AtomicCmpXchgInst>(EVI->getAggregateOperand()))
+        if (EVI->getIndices()[0] == 0 && ACXI->getCompareOperand() == Op1 &&
+            !ACXI->isWeak())
+          return ExtractValueInst::Create(ACXI, 1);
+
   {
     Value *X; ConstantInt *Cst;
     // icmp X+Cst, X
@@ -3502,7 +3663,7 @@
 
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
-  if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL))
+  if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // Simplify 'fcmp pred X, X'

diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index c10e92a..f3ac44c 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp

@@ -15,6 +15,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -268,7 +269,8 @@
     SmallVector<Instruction *, 4> ToDelete;
     if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
       unsigned SourceAlign = getOrEnforceKnownAlignment(Copy->getSource(),
-                                                        AI.getAlignment(), DL);
+                                                        AI.getAlignment(),
+                                                        DL, AT, &AI, DT);
       if (AI.getAlignment() <= SourceAlign) {
         DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
         DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
@@ -290,80 +292,112 @@
   return visitAllocSite(AI);
 }
 
+/// \brief Helper to combine a load to a new type.
+///
+/// This just does the work of combining a load to a new type. It handles
+/// metadata, etc., and returns the new instruction. The \c NewTy should be the
+/// loaded *value* type. This will convert it to a pointer, cast the operand to
+/// that pointer type, load it, etc.
+///
+/// Note that this will create all of the instructions with whatever insert
+/// point the \c InstCombiner currently is using.
+static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy) {
+  Value *Ptr = LI.getPointerOperand();
+  unsigned AS = LI.getPointerAddressSpace();
+  SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+  LI.getAllMetadata(MD);
 
-/// InstCombineLoadCast - Fold 'load (cast P)' -> cast (load P)' when possible.
-static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
-                                        const DataLayout *DL) {
-  User *CI = cast<User>(LI.getOperand(0));
-  Value *CastOp = CI->getOperand(0);
+  LoadInst *NewLoad = IC.Builder->CreateAlignedLoad(
+      IC.Builder->CreateBitCast(Ptr, NewTy->getPointerTo(AS)),
+      LI.getAlignment(), LI.getName());
+  for (const auto &MDPair : MD) {
+    unsigned ID = MDPair.first;
+    MDNode *N = MDPair.second;
+    // Note, essentially every kind of metadata should be preserved here! This
+    // routine is supposed to clone a load instruction changing *only its type*.
+    // The only metadata it makes sense to drop is metadata which is invalidated
+    // when the pointer type changes. This should essentially never be the case
+    // in LLVM, but we explicitly switch over only known metadata to be
+    // conservatively correct. If you are adding metadata to LLVM which pertains
+    // to loads, you almost certainly want to add it here.
+    switch (ID) {
+    case LLVMContext::MD_dbg:
+    case LLVMContext::MD_tbaa:
+    case LLVMContext::MD_prof:
+    case LLVMContext::MD_fpmath:
+    case LLVMContext::MD_tbaa_struct:
+    case LLVMContext::MD_invariant_load:
+    case LLVMContext::MD_alias_scope:
+    case LLVMContext::MD_noalias:
+    case LLVMContext::MD_nontemporal:
+    case LLVMContext::MD_mem_parallel_loop_access:
+    case LLVMContext::MD_nonnull:
+      // All of these directly apply.
+      NewLoad->setMetadata(ID, N);
+      break;
 
-  PointerType *DestTy = cast<PointerType>(CI->getType());
-  Type *DestPTy = DestTy->getElementType();
-  if (PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
-
-    // If the address spaces don't match, don't eliminate the cast.
-    if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
-      return nullptr;
-
-    Type *SrcPTy = SrcTy->getElementType();
-
-    if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() ||
-         DestPTy->isVectorTy()) {
-      // If the source is an array, the code below will not succeed.  Check to
-      // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
-      // constants.
-      if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
-        if (Constant *CSrc = dyn_cast<Constant>(CastOp))
-          if (ASrcTy->getNumElements() != 0) {
-            Type *IdxTy = DL
-                        ? DL->getIntPtrType(SrcTy)
-                        : Type::getInt64Ty(SrcTy->getContext());
-            Value *Idx = Constant::getNullValue(IdxTy);
-            Value *Idxs[2] = { Idx, Idx };
-            CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs);
-            SrcTy = cast<PointerType>(CastOp->getType());
-            SrcPTy = SrcTy->getElementType();
-          }
-
-      if (IC.getDataLayout() &&
-          (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() ||
-            SrcPTy->isVectorTy()) &&
-          // Do not allow turning this into a load of an integer, which is then
-          // casted to a pointer, this pessimizes pointer analysis a lot.
-          (SrcPTy->isPtrOrPtrVectorTy() ==
-           LI.getType()->isPtrOrPtrVectorTy()) &&
-          IC.getDataLayout()->getTypeSizeInBits(SrcPTy) ==
-               IC.getDataLayout()->getTypeSizeInBits(DestPTy)) {
-
-        // Okay, we are casting from one integer or pointer type to another of
-        // the same size.  Instead of casting the pointer before the load, cast
-        // the result of the loaded value.
-        LoadInst *NewLoad =
-          IC.Builder->CreateLoad(CastOp, LI.isVolatile(), CI->getName());
-        NewLoad->setAlignment(LI.getAlignment());
-        NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope());
-        // Now cast the result of the load.
-        PointerType *OldTy = dyn_cast<PointerType>(NewLoad->getType());
-        PointerType *NewTy = dyn_cast<PointerType>(LI.getType());
-        if (OldTy && NewTy &&
-            OldTy->getAddressSpace() != NewTy->getAddressSpace()) {
-          return new AddrSpaceCastInst(NewLoad, LI.getType());
-        }
-
-        return new BitCastInst(NewLoad, LI.getType());
-      }
+    case LLVMContext::MD_range:
+      // FIXME: It would be nice to propagate this in some way, but the type
+      // conversions make it hard.
+      break;
     }
   }
+  return NewLoad;
+}
+
+/// \brief Combine loads to match the type of their uses' value after looking
+/// through intervening bitcasts.
+///
+/// The core idea here is that if the result of a load is used in an operation,
+/// we should load the type most conducive to that operation. For example, when
+/// loading an integer and converting that immediately to a pointer, we should
+/// instead directly load a pointer.
+///
+/// However, this routine must never change the width of a load or the number of
+/// loads as that would introduce a semantic change. This combine is expected to
+/// be a semantic no-op which just allows loads to more closely model the types
+/// of their consuming operations.
+///
+/// Currently, we also refuse to change the precise type used for an atomic load
+/// or a volatile load. This is debatable, and might be reasonable to change
+/// later. However, it is risky in case some backend or other part of LLVM is
+/// relying on the exact type loaded to select appropriate atomic operations.
+static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
+  // FIXME: We could probably with some care handle both volatile and atomic
+  // loads here but it isn't clear that this is important.
+  if (!LI.isSimple())
+    return nullptr;
+
+  if (LI.use_empty())
+    return nullptr;
+
+
+  // Fold away bit casts of the loaded value by loading the desired type.
+  if (LI.hasOneUse())
+    if (auto *BC = dyn_cast<BitCastInst>(LI.user_back())) {
+      LoadInst *NewLoad = combineLoadToNewType(IC, LI, BC->getDestTy());
+      BC->replaceAllUsesWith(NewLoad);
+      IC.EraseInstFromFunction(*BC);
+      return &LI;
+    }
+
+  // FIXME: We should also canonicalize loads of vectors when their elements are
+  // cast to other types.
   return nullptr;
 }
 
 Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
   Value *Op = LI.getOperand(0);
 
+  // Try to canonicalize the loaded type.
+  if (Instruction *Res = combineLoadToOperationType(*this, LI))
+    return Res;
+
   // Attempt to improve the alignment.
   if (DL) {
     unsigned KnownAlign =
-      getOrEnforceKnownAlignment(Op, DL->getPrefTypeAlignment(LI.getType()),DL);
+      getOrEnforceKnownAlignment(Op, DL->getPrefTypeAlignment(LI.getType()),
+                                 DL, AT, &LI, DT);
     unsigned LoadAlign = LI.getAlignment();
     unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign :
       DL->getABITypeAlignment(LI.getType());
@@ -374,11 +408,6 @@
       LI.setAlignment(EffectiveLoadAlign);
   }
 
-  // load (cast X) --> cast (load X) iff safe.
-  if (isa<CastInst>(Op))
-    if (Instruction *Res = InstCombineLoadCast(*this, LI, DL))
-      return Res;
-
   // None of the following transforms are legal for volatile/atomic loads.
   // FIXME: Some of it is okay for atomic loads; needs refactoring.
   if (!LI.isSimple()) return nullptr;
@@ -388,7 +417,8 @@
   // separated by a few arithmetic operations.
   BasicBlock::iterator BBI = &LI;
   if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
-    return ReplaceInstUsesWith(LI, AvailableVal);
+    return ReplaceInstUsesWith(
+        LI, Builder->CreateBitCast(AvailableVal, LI.getType()));
 
   // load(gep null, ...) -> unreachable
   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
@@ -417,12 +447,6 @@
     return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
   }
 
-  // Instcombine load (constantexpr_cast global) -> cast (load global)
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op))
-    if (CE->isCast())
-      if (Instruction *Res = InstCombineLoadCast(*this, LI, DL))
-        return Res;
-
   if (Op->hasOneUse()) {
     // Change select and PHI nodes to select values instead of addresses: this
     // helps alias analysis out a lot, allows many others simplifications, and
@@ -473,7 +497,7 @@
   User *CI = cast<User>(SI.getOperand(1));
   Value *CastOp = CI->getOperand(0);
 
-  Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
+  Type *DestPTy = CI->getType()->getPointerElementType();
   PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
   if (!SrcTy) return nullptr;
 
@@ -518,8 +542,7 @@
 
   // If the pointers point into different address spaces don't do the
   // transformation.
-  if (SrcTy->getAddressSpace() !=
-      cast<PointerType>(CI->getType())->getAddressSpace())
+  if (SrcTy->getAddressSpace() != CI->getType()->getPointerAddressSpace())
     return nullptr;
 
   // If the pointers point to values of different sizes don't do the
@@ -602,7 +625,7 @@
   if (DL) {
     unsigned KnownAlign =
       getOrEnforceKnownAlignment(Ptr, DL->getPrefTypeAlignment(Val->getType()),
-                                 DL);
+                                 DL, AT, &SI, DT);
     unsigned StoreAlign = SI.getAlignment();
     unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign :
       DL->getABITypeAlignment(Val->getType());
@@ -837,12 +860,13 @@
   InsertNewInstBefore(NewSI, *BBI);
   NewSI->setDebugLoc(OtherStore->getDebugLoc());
 
-  // If the two stores had the same TBAA tag, preserve it.
-  if (MDNode *TBAATag = SI.getMetadata(LLVMContext::MD_tbaa))
-    if ((TBAATag = MDNode::getMostGenericTBAA(TBAATag,
-                               OtherStore->getMetadata(LLVMContext::MD_tbaa))))
-      NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
-
+  // If the two stores had AA tags, merge them.
+  AAMDNodes AATags;
+  SI.getAAMetadata(AATags);
+  if (AATags) {
+    OtherStore->getAAMetadata(AATags, /* Merge = */ true);
+    NewSI->setAAMetadata(AATags);
+  }
 
   // Nuke the old stores.
   EraseInstFromFunction(SI);

diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 6c6e7d8..8c48dce 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp

@@ -25,7 +25,8 @@
 /// simplifyValueKnownNonZero - The specific integer value is used in a context
 /// where it is known to be non-zero.  If this allows us to simplify the
 /// computation, do so and return the new operand, otherwise return null.
-static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
+static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
+                                        Instruction *CxtI) {
   // If V has multiple uses, then we would have to do more analysis to determine
   // if this is safe.  For example, the use could be in dynamically unreached
   // code.
@@ -35,22 +36,23 @@
 
   // ((1 << A) >>u B) --> (1 << (A-B))
   // Because V cannot be zero, we know that B is less than A.
-  Value *A = nullptr, *B = nullptr, *PowerOf2 = nullptr;
-  if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))),
-                      m_Value(B))) &&
-      // The "1" can be any value known to be a power of 2.
-      isKnownToBeAPowerOfTwo(PowerOf2)) {
+  Value *A = nullptr, *B = nullptr, *One = nullptr;
+  if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(One), m_Value(A))), m_Value(B))) &&
+      match(One, m_One())) {
     A = IC.Builder->CreateSub(A, B);
-    return IC.Builder->CreateShl(PowerOf2, A);
+    return IC.Builder->CreateShl(One, A);
   }
 
   // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
   // inexact.  Similarly for <<.
   if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
-    if (I->isLogicalShift() && isKnownToBeAPowerOfTwo(I->getOperand(0))) {
+    if (I->isLogicalShift() && isKnownToBeAPowerOfTwo(I->getOperand(0), false,
+                                                      0, IC.getAssumptionTracker(),
+                                                      CxtI,
+                                                      IC.getDominatorTree())) {
       // We know that this is an exact/nuw shift and that the input is a
       // non-zero context as well.
-      if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC)) {
+      if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
         I->setOperand(0, V2);
         MadeChange = true;
       }
@@ -76,25 +78,30 @@
 
 /// MultiplyOverflows - True if the multiply can not be expressed in an int
 /// this size.
-static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) {
-  uint32_t W = C1->getBitWidth();
-  APInt LHSExt = C1->getValue(), RHSExt = C2->getValue();
-  if (sign) {
-    LHSExt = LHSExt.sext(W * 2);
-    RHSExt = RHSExt.sext(W * 2);
-  } else {
-    LHSExt = LHSExt.zext(W * 2);
-    RHSExt = RHSExt.zext(W * 2);
-  }
+static bool MultiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
+                              bool IsSigned) {
+  bool Overflow;
+  if (IsSigned)
+    Product = C1.smul_ov(C2, Overflow);
+  else
+    Product = C1.umul_ov(C2, Overflow);
 
-  APInt MulExt = LHSExt * RHSExt;
+  return Overflow;
+}
 
-  if (!sign)
-    return MulExt.ugt(APInt::getLowBitsSet(W * 2, W));
+/// \brief True if C1 is a multiple of C2. Quotient contains C1/C2.
+static bool IsMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
+                       bool IsSigned) {
+  assert(C1.getBitWidth() == C2.getBitWidth() &&
+         "Inconsistent width of constants!");
 
-  APInt Min = APInt::getSignedMinValue(W).sext(W * 2);
-  APInt Max = APInt::getSignedMaxValue(W).sext(W * 2);
-  return MulExt.slt(Min) || MulExt.sgt(Max);
+  APInt Remainder(C1.getBitWidth(), /*Val=*/0ULL, IsSigned);
+  if (IsSigned)
+    APInt::sdivrem(C1, C2, Quotient, Remainder);
+  else
+    APInt::udivrem(C1, C2, Quotient, Remainder);
+
+  return Remainder.isMinValue();
 }
 
 /// \brief A helper routine of InstCombiner::visitMul().
@@ -123,7 +130,7 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyMulInst(Op0, Op1, DL))
+  if (Value *V = SimplifyMulInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   if (Value *V = SimplifyUsingDistributiveLaws(I))
@@ -155,8 +162,10 @@
 
       if (NewCst) {
         BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);
-        if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap();
-        if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap();
+
+        if (I.hasNoUnsignedWrap())
+          Shl->setHasNoUnsignedWrap();
+
         return Shl;
       }
     }
@@ -277,9 +286,9 @@
     APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true);
 
     Value *BoolCast = nullptr, *OtherOp = nullptr;
-    if (MaskedValueIsZero(Op0, Negative2))
+    if (MaskedValueIsZero(Op0, Negative2, 0, &I))
       BoolCast = Op0, OtherOp = Op1;
-    else if (MaskedValueIsZero(Op1, Negative2))
+    else if (MaskedValueIsZero(Op1, Negative2, 0, &I))
       BoolCast = Op1, OtherOp = Op0;
 
     if (BoolCast) {
@@ -292,40 +301,32 @@
   return Changed ? &I : nullptr;
 }
 
-//
-// Detect pattern:
-//
-// log2(Y*0.5)
-//
-// And check for corresponding fast math flags
-//
-
+/// Detect pattern log2(Y * 0.5) with corresponding fast math flags.
 static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
+  if (!Op->hasOneUse())
+    return;
 
-   if (!Op->hasOneUse())
-     return;
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
+  if (!II)
+    return;
+  if (II->getIntrinsicID() != Intrinsic::log2 || !II->hasUnsafeAlgebra())
+    return;
+  Log2 = II;
 
-   IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
-   if (!II)
-     return;
-   if (II->getIntrinsicID() != Intrinsic::log2 || !II->hasUnsafeAlgebra())
-     return;
-   Log2 = II;
+  Value *OpLog2Of = II->getArgOperand(0);
+  if (!OpLog2Of->hasOneUse())
+    return;
 
-   Value *OpLog2Of = II->getArgOperand(0);
-   if (!OpLog2Of->hasOneUse())
-     return;
+  Instruction *I = dyn_cast<Instruction>(OpLog2Of);
+  if (!I)
+    return;
+  if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+    return;
 
-   Instruction *I = dyn_cast<Instruction>(OpLog2Of);
-   if (!I)
-     return;
-   if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
-     return;
-
-   if (match(I->getOperand(0), m_SpecificFP(0.5)))
-     Y = I->getOperand(1);
-   else if (match(I->getOperand(1), m_SpecificFP(0.5)))
-     Y = I->getOperand(0);
+  if (match(I->getOperand(0), m_SpecificFP(0.5)))
+    Y = I->getOperand(1);
+  else if (match(I->getOperand(1), m_SpecificFP(0.5)))
+    Y = I->getOperand(0);
 }
 
 static bool isFiniteNonZeroFp(Constant *C) {
@@ -440,7 +441,8 @@
   if (isa<Constant>(Op0))
     std::swap(Op0, Op1);
 
-  if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL))
+  if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL, TLI,
+                                  DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   bool AllowReassociate = I.hasUnsafeAlgebra();
@@ -510,10 +512,15 @@
     }
   }
 
+  // sqrt(X) * sqrt(X) -> X
+  if (AllowReassociate && (Op0 == Op1))
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0))
+      if (II->getIntrinsicID() == Intrinsic::sqrt)
+        return ReplaceInstUsesWith(I, II->getOperand(0));
 
   // Under unsafe algebra do:
   // X * log2(0.5*Y) = X*log2(Y) - X
-  if (I.hasUnsafeAlgebra()) {
+  if (AllowReassociate) {
     Value *OpX = nullptr;
     Value *OpY = nullptr;
     IntrinsicInst *Log2;
@@ -596,36 +603,6 @@
       }
     }
 
-    // B * (uitofp i1 C) -> select C, B, 0
-    if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) {
-      Value *LHS = Op0, *RHS = Op1;
-      Value *B, *C;
-      if (!match(RHS, m_UIToFP(m_Value(C))))
-        std::swap(LHS, RHS);
-
-      if (match(RHS, m_UIToFP(m_Value(C))) &&
-          C->getType()->getScalarType()->isIntegerTy(1)) {
-        B = LHS;
-        Value *Zero = ConstantFP::getNegativeZero(B->getType());
-        return SelectInst::Create(C, B, Zero);
-      }
-    }
-
-    // A * (1 - uitofp i1 C) -> select C, 0, A
-    if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) {
-      Value *LHS = Op0, *RHS = Op1;
-      Value *A, *C;
-      if (!match(RHS, m_FSub(m_FPOne(), m_UIToFP(m_Value(C)))))
-        std::swap(LHS, RHS);
-
-      if (match(RHS, m_FSub(m_FPOne(), m_UIToFP(m_Value(C)))) &&
-          C->getType()->getScalarType()->isIntegerTy(1)) {
-        A = LHS;
-        Value *Zero = ConstantFP::getNegativeZero(A->getType());
-        return SelectInst::Create(C, Zero, A);
-      }
-    }
-
     if (!isa<Constant>(Op1))
       std::swap(Opnd0, Opnd1);
     else
@@ -714,7 +691,7 @@
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
   // The RHS is known non-zero.
-  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) {
+  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, &I)) {
     I.setOperand(1, V);
     return &I;
   }
@@ -724,25 +701,83 @@
   if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
     return &I;
 
-  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
-    // (X / C1) / C2  -> X / (C1*C2)
-    if (Instruction *LHS = dyn_cast<Instruction>(Op0))
-      if (Instruction::BinaryOps(LHS->getOpcode()) == I.getOpcode())
-        if (ConstantInt *LHSRHS = dyn_cast<ConstantInt>(LHS->getOperand(1))) {
-          if (MultiplyOverflows(RHS, LHSRHS,
-                                I.getOpcode() == Instruction::SDiv))
-            return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
-          return BinaryOperator::Create(I.getOpcode(), LHS->getOperand(0),
-                                        ConstantExpr::getMul(RHS, LHSRHS));
+  if (Instruction *LHS = dyn_cast<Instruction>(Op0)) {
+    const APInt *C2;
+    if (match(Op1, m_APInt(C2))) {
+      Value *X;
+      const APInt *C1;
+      bool IsSigned = I.getOpcode() == Instruction::SDiv;
+
+      // (X / C1) / C2  -> X / (C1*C2)
+      if ((IsSigned && match(LHS, m_SDiv(m_Value(X), m_APInt(C1)))) ||
+          (!IsSigned && match(LHS, m_UDiv(m_Value(X), m_APInt(C1))))) {
+        APInt Product(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+        if (!MultiplyOverflows(*C1, *C2, Product, IsSigned))
+          return BinaryOperator::Create(I.getOpcode(), X,
+                                        ConstantInt::get(I.getType(), Product));
+      }
+
+      if ((IsSigned && match(LHS, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
+          (!IsSigned && match(LHS, m_NUWMul(m_Value(X), m_APInt(C1))))) {
+        APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+
+        // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
+        if (IsMultiple(*C2, *C1, Quotient, IsSigned)) {
+          BinaryOperator *BO = BinaryOperator::Create(
+              I.getOpcode(), X, ConstantInt::get(X->getType(), Quotient));
+          BO->setIsExact(I.isExact());
+          return BO;
         }
 
-    if (!RHS->isZero()) { // avoid X udiv 0
-      if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
-        if (Instruction *R = FoldOpIntoSelect(I, SI))
-          return R;
-      if (isa<PHINode>(Op0))
-        if (Instruction *NV = FoldOpIntoPhi(I))
-          return NV;
+        // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
+        if (IsMultiple(*C1, *C2, Quotient, IsSigned)) {
+          BinaryOperator *BO = BinaryOperator::Create(
+              Instruction::Mul, X, ConstantInt::get(X->getType(), Quotient));
+          BO->setHasNoUnsignedWrap(
+              !IsSigned &&
+              cast<OverflowingBinaryOperator>(LHS)->hasNoUnsignedWrap());
+          BO->setHasNoSignedWrap(
+              cast<OverflowingBinaryOperator>(LHS)->hasNoSignedWrap());
+          return BO;
+        }
+      }
+
+      if ((IsSigned && match(LHS, m_NSWShl(m_Value(X), m_APInt(C1))) &&
+           *C1 != C1->getBitWidth() - 1) ||
+          (!IsSigned && match(LHS, m_NUWShl(m_Value(X), m_APInt(C1))))) {
+        APInt Quotient(C1->getBitWidth(), /*Val=*/0ULL, IsSigned);
+        APInt C1Shifted = APInt::getOneBitSet(
+            C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
+
+        // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1.
+        if (IsMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
+          BinaryOperator *BO = BinaryOperator::Create(
+              I.getOpcode(), X, ConstantInt::get(X->getType(), Quotient));
+          BO->setIsExact(I.isExact());
+          return BO;
+        }
+
+        // (X << C1) / C2 -> X * ((1 << C1) / C2) if C2 divides 1 << C1.
+        if (IsMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
+          BinaryOperator *BO = BinaryOperator::Create(
+              Instruction::Mul, X, ConstantInt::get(X->getType(), Quotient));
+          BO->setHasNoUnsignedWrap(
+              !IsSigned &&
+              cast<OverflowingBinaryOperator>(LHS)->hasNoUnsignedWrap());
+          BO->setHasNoSignedWrap(
+              cast<OverflowingBinaryOperator>(LHS)->hasNoSignedWrap());
+          return BO;
+        }
+      }
+
+      if (*C2 != 0) { // avoid X udiv 0
+        if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+          if (Instruction *R = FoldOpIntoSelect(I, SI))
+            return R;
+        if (isa<PHINode>(Op0))
+          if (Instruction *NV = FoldOpIntoPhi(I))
+            return NV;
+      }
     }
   }
 
@@ -828,7 +863,8 @@
   const APInt &C = cast<Constant>(Op1)->getUniqueInteger();
   BinaryOperator *LShr = BinaryOperator::CreateLShr(
       Op0, ConstantInt::get(Op0->getType(), C.logBase2()));
-  if (I.isExact()) LShr->setIsExact();
+  if (I.isExact())
+    LShr->setIsExact();
   return LShr;
 }
 
@@ -856,7 +892,8 @@
   if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1))
     N = IC.Builder->CreateZExt(N, Z->getDestTy());
   BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
-  if (I.isExact()) LShr->setIsExact();
+  if (I.isExact())
+    LShr->setIsExact();
   return LShr;
 }
 
@@ -893,10 +930,10 @@
     return 0;
 
   if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
-    if (size_t LHSIdx = visitUDivOperand(Op0, SI->getOperand(1), I, Actions))
-      if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions)) {
-        Actions.push_back(UDivFoldAction((FoldUDivOperandCb)nullptr, Op1,
-                                         LHSIdx-1));
+    if (size_t LHSIdx =
+            visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth))
+      if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) {
+        Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1));
         return Actions.size();
       }
 
@@ -909,7 +946,7 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyUDivInst(Op0, Op1, DL))
+  if (Value *V = SimplifyUDivInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // Handle the integer div common cases
@@ -917,19 +954,25 @@
     return Common;
 
   // (x lshr C1) udiv C2 --> x udiv (C2 << C1)
-  if (Constant *C2 = dyn_cast<Constant>(Op1)) {
+  {
     Value *X;
-    Constant *C1;
-    if (match(Op0, m_LShr(m_Value(X), m_Constant(C1))))
-      return BinaryOperator::CreateUDiv(X, ConstantExpr::getShl(C2, C1));
+    const APInt *C1, *C2;
+    if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) &&
+        match(Op1, m_APInt(C2))) {
+      bool Overflow;
+      APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
+      if (!Overflow)
+        return BinaryOperator::CreateUDiv(
+            X, ConstantInt::get(X->getType(), C2ShlC1));
+    }
   }
 
   // (zext A) udiv (zext B) --> zext (A udiv B)
   if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
     if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
-      return new ZExtInst(Builder->CreateUDiv(ZOp0->getOperand(0), ZOp1, "div",
-                                              I.isExact()),
-                          I.getType());
+      return new ZExtInst(
+          Builder->CreateUDiv(ZOp0->getOperand(0), ZOp1, "div", I.isExact()),
+          I.getType());
 
   // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
   SmallVector<UDivFoldAction, 6> UDivActions;
@@ -971,7 +1014,7 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifySDivInst(Op0, Op1, DL))
+  if (Value *V = SimplifySDivInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // Handle the integer div common cases
@@ -1008,8 +1051,8 @@
   // unsigned inputs), turn this into a udiv.
   if (I.getType()->isIntegerTy()) {
     APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
-    if (MaskedValueIsZero(Op0, Mask)) {
-      if (MaskedValueIsZero(Op1, Mask)) {
+    if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
+      if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
         // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
         return BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
       }
@@ -1034,8 +1077,7 @@
 /// If the conversion was successful, the simplified expression "X * 1/C" is
 /// returned; otherwise, NULL is returned.
 ///
-static Instruction *CvtFDivConstToReciprocal(Value *Dividend,
-                                             Constant *Divisor,
+static Instruction *CvtFDivConstToReciprocal(Value *Dividend, Constant *Divisor,
                                              bool AllowReciprocal) {
   if (!isa<ConstantFP>(Divisor)) // TODO: handle vectors.
     return nullptr;
@@ -1064,7 +1106,7 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyFDivInst(Op0, Op1, DL))
+  if (Value *V = SimplifyFDivInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   if (isa<Constant>(Op0))
@@ -1195,7 +1237,7 @@
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
   // The RHS is known non-zero.
-  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) {
+  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, &I)) {
     I.setOperand(1, V);
     return &I;
   }
@@ -1229,7 +1271,7 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyURemInst(Op0, Op1, DL))
+  if (Value *V = SimplifyURemInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   if (Instruction *common = commonIRemTransforms(I))
@@ -1242,7 +1284,7 @@
                           I.getType());
 
   // X urem Y -> X and Y-1, where Y is a power of 2,
-  if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true)) {
+  if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true, 0, AT, &I, DT)) {
     Constant *N1 = Constant::getAllOnesValue(I.getType());
     Value *Add = Builder->CreateAdd(Op1, N1);
     return BinaryOperator::CreateAnd(Op0, Add);
@@ -1264,28 +1306,29 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifySRemInst(Op0, Op1, DL))
+  if (Value *V = SimplifySRemInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // Handle the integer rem common cases
   if (Instruction *Common = commonIRemTransforms(I))
     return Common;
 
-  if (Value *RHSNeg = dyn_castNegVal(Op1))
-    if (!isa<Constant>(RHSNeg) ||
-        (isa<ConstantInt>(RHSNeg) &&
-         cast<ConstantInt>(RHSNeg)->getValue().isStrictlyPositive())) {
-      // X % -Y -> X % Y
+  {
+    const APInt *Y;
+    // X % -Y -> X % Y
+    if (match(Op1, m_APInt(Y)) && Y->isNegative() && !Y->isMinSignedValue()) {
       Worklist.AddValue(I.getOperand(1));
-      I.setOperand(1, RHSNeg);
+      I.setOperand(1, ConstantInt::get(I.getType(), -*Y));
       return &I;
     }
+  }
 
   // If the sign bits of both operands are zero (i.e. we can prove they are
   // unsigned inputs), turn this into a urem.
   if (I.getType()->isIntegerTy()) {
     APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
-    if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) {
+    if (MaskedValueIsZero(Op1, Mask, 0, &I) &&
+        MaskedValueIsZero(Op0, Mask, 0, &I)) {
       // X srem Y -> X urem Y, iff X and Y don't have sign bit set
       return BinaryOperator::CreateURem(Op0, Op1, I.getName());
     }
@@ -1338,7 +1381,7 @@
   if (Value *V = SimplifyVectorOp(I))
     return ReplaceInstUsesWith(I, V);
 
-  if (Value *V = SimplifyFRemInst(Op0, Op1, DL))
+  if (Value *V = SimplifyFRemInst(Op0, Op1, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   // Handle cases involving: rem X, (select Cond, Y, Z)

diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 46f7b8a..794263a 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp

@@ -506,12 +506,12 @@
 /// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle
 /// that is dead.
 static bool DeadPHICycle(PHINode *PN,
-                         SmallPtrSet<PHINode*, 16> &PotentiallyDeadPHIs) {
+                         SmallPtrSetImpl<PHINode*> &PotentiallyDeadPHIs) {
   if (PN->use_empty()) return true;
   if (!PN->hasOneUse()) return false;
 
   // Remember this node, and if we find the cycle, return.
-  if (!PotentiallyDeadPHIs.insert(PN))
+  if (!PotentiallyDeadPHIs.insert(PN).second)
     return true;
 
   // Don't scan crazily complex things.
@@ -528,9 +528,9 @@
 /// NonPhiInVal.  This happens with mutually cyclic phi nodes like:
 ///   z = some value; x = phi (y, z); y = phi (x, z)
 static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
-                           SmallPtrSet<PHINode*, 16> &ValueEqualPHIs) {
+                           SmallPtrSetImpl<PHINode*> &ValueEqualPHIs) {
   // See if we already saw this PHI node.
-  if (!ValueEqualPHIs.insert(PN))
+  if (!ValueEqualPHIs.insert(PN).second)
     return true;
 
   // Don't scan crazily complex things.
@@ -654,7 +654,7 @@
 
       // If the user is a PHI, inspect its uses recursively.
       if (PHINode *UserPN = dyn_cast<PHINode>(UserI)) {
-        if (PHIsInspected.insert(UserPN))
+        if (PHIsInspected.insert(UserPN).second)
           PHIsToSlice.push_back(UserPN);
         continue;
       }
@@ -788,7 +788,7 @@
 // PHINode simplification
 //
 Instruction *InstCombiner::visitPHINode(PHINode &PN) {
-  if (Value *V = SimplifyInstruction(&PN, DL, TLI))
+  if (Value *V = SimplifyInstruction(&PN, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(PN, V);
 
   // If all PHI operands are the same operation, pull them through the PHI,

diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 06c9e29..079ae34 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp

@@ -313,7 +313,9 @@
 /// replaced with RepOp.
 static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
                                      const DataLayout *TD,
-                                     const TargetLibraryInfo *TLI) {
+                                     const TargetLibraryInfo *TLI,
+                                     DominatorTree *DT,
+                                     AssumptionTracker *AT) {
   // Trivial replacement.
   if (V == Op)
     return RepOp;
@@ -334,10 +336,10 @@
   if (CmpInst *C = dyn_cast<CmpInst>(I)) {
     if (C->getOperand(0) == Op)
       return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD,
-                             TLI);
+                             TLI, DT, AT);
     if (C->getOperand(1) == Op)
       return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD,
-                             TLI);
+                             TLI, DT, AT);
   }
 
   // TODO: We could hand off more cases to instsimplify here.
@@ -390,11 +392,11 @@
 ///
 /// This also tries to turn
 /// --- Single bit tests:
-/// if ((x & C) == 0) x |= C	to  x |= C
-/// if ((x & C) != 0) x ^= C	to  x &= ~C
-/// if ((x & C) == 0) x ^= C	to  x |= C
-/// if ((x & C) != 0) x &= ~C	to  x &= ~C
-/// if ((x & C) == 0) x &= ~C	to  nothing
+/// if ((x & C) == 0) x |= C    to  x |= C
+/// if ((x & C) != 0) x ^= C    to  x &= ~C
+/// if ((x & C) == 0) x ^= C    to  x |= C
+/// if ((x & C) != 0) x &= ~C   to  x &= ~C
+/// if ((x & C) == 0) x &= ~C   to  nothing
 static Value *foldSelectICmpAndOr(SelectInst &SI, Value *TrueVal,
                                   Value *FalseVal,
                                   InstCombiner::BuilderTy *Builder) {
@@ -605,18 +607,26 @@
   // arms of the select. See if substituting this value into the arm and
   // simplifying the result yields the same value as the other arm.
   if (Pred == ICmpInst::ICMP_EQ) {
-    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI) == TrueVal ||
-        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI) == TrueVal)
+    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI,
+                               DT, AT) == TrueVal ||
+        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI,
+                               DT, AT) == TrueVal)
       return ReplaceInstUsesWith(SI, FalseVal);
-    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI) == FalseVal ||
-        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI) == FalseVal)
+    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI,
+                               DT, AT) == FalseVal ||
+        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI,
+                               DT, AT) == FalseVal)
       return ReplaceInstUsesWith(SI, FalseVal);
   } else if (Pred == ICmpInst::ICMP_NE) {
-    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI) == FalseVal ||
-        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI) == FalseVal)
+    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, DL, TLI,
+                               DT, AT) == FalseVal ||
+        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, DL, TLI,
+                               DT, AT) == FalseVal)
       return ReplaceInstUsesWith(SI, TrueVal);
-    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI) == TrueVal ||
-        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI) == TrueVal)
+    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, DL, TLI,
+                               DT, AT) == TrueVal ||
+        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, DL, TLI,
+                               DT, AT) == TrueVal)
       return ReplaceInstUsesWith(SI, TrueVal);
   }
 
@@ -825,7 +835,8 @@
   Value *TrueVal = SI.getTrueValue();
   Value *FalseVal = SI.getFalseValue();
 
-  if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal, DL))
+  if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal, DL, TLI,
+                                    DT, AT))
     return ReplaceInstUsesWith(SI, V);
 
   if (SI.getType()->isIntegerTy(1)) {

diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 2495747..afa907a 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp

@@ -68,7 +68,7 @@
 /// this succeeds, the GetShiftedValue function will be called to produce the
 /// value.
 static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
-                               InstCombiner &IC) {
+                               InstCombiner &IC, Instruction *CxtI) {
   // We can always evaluate constants shifted.
   if (isa<Constant>(V))
     return true;
@@ -111,8 +111,8 @@
   case Instruction::Or:
   case Instruction::Xor:
     // Bitwise operators can all arbitrarily be arbitrarily evaluated shifted.
-    return CanEvaluateShifted(I->getOperand(0), NumBits, isLeftShift, IC) &&
-           CanEvaluateShifted(I->getOperand(1), NumBits, isLeftShift, IC);
+    return CanEvaluateShifted(I->getOperand(0), NumBits, isLeftShift, IC, I) &&
+           CanEvaluateShifted(I->getOperand(1), NumBits, isLeftShift, IC, I);
 
   case Instruction::Shl: {
     // We can often fold the shift into shifts-by-a-constant.
@@ -131,8 +131,9 @@
     // profitable unless we know the and'd out bits are already zero.
     if (CI->getZExtValue() > NumBits) {
       unsigned LowBits = TypeWidth - CI->getZExtValue();
-      if (MaskedValueIsZero(I->getOperand(0),
-                       APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
+      if (IC.MaskedValueIsZero(I->getOperand(0),
+                       APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits,
+                       0, CxtI))
         return true;
     }
 
@@ -155,8 +156,9 @@
     // profitable unless we know the and'd out bits are already zero.
     if (CI->getValue().ult(TypeWidth) && CI->getZExtValue() > NumBits) {
       unsigned LowBits = CI->getZExtValue() - NumBits;
-      if (MaskedValueIsZero(I->getOperand(0),
-                          APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
+      if (IC.MaskedValueIsZero(I->getOperand(0),
+                          APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits,
+                          0, CxtI))
         return true;
     }
 
@@ -164,8 +166,9 @@
   }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
-    return CanEvaluateShifted(SI->getTrueValue(), NumBits, isLeftShift, IC) &&
-           CanEvaluateShifted(SI->getFalseValue(), NumBits, isLeftShift, IC);
+    return CanEvaluateShifted(SI->getTrueValue(), NumBits, isLeftShift,
+                              IC, SI) &&
+           CanEvaluateShifted(SI->getFalseValue(), NumBits, isLeftShift, IC, SI);
   }
   case Instruction::PHI: {
     // We can change a phi if we can change all operands.  Note that we never
@@ -173,7 +176,8 @@
     // instructions with a single use.
     PHINode *PN = cast<PHINode>(I);
     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-      if (!CanEvaluateShifted(PN->getIncomingValue(i), NumBits, isLeftShift,IC))
+      if (!CanEvaluateShifted(PN->getIncomingValue(i), NumBits, isLeftShift,
+                              IC, PN))
         return false;
     return true;
   }
@@ -329,7 +333,7 @@
   // See if we can propagate this shift into the input, this covers the trivial
   // cast of lshr(shl(x,c1),c2) as well as other more complex cases.
   if (I.getOpcode() != Instruction::AShr &&
-      CanEvaluateShifted(Op0, COp1->getZExtValue(), isLeftShift, *this)) {
+      CanEvaluateShifted(Op0, COp1->getZExtValue(), isLeftShift, *this, &I)) {
     DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
               " to eliminate shift:\n  IN: " << *Op0 << "\n  SH: " << I <<"\n");
 
@@ -488,7 +492,7 @@
       }
 
 
-      // If the operand is an bitwise operator with a constant RHS, and the
+      // If the operand is a bitwise operator with a constant RHS, and the
       // shift is the only use, we can pull it out of the shift.
       if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) {
         bool isValid = true;     // Valid only for And, Or, Xor
@@ -691,7 +695,7 @@
 
   if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
                                  I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
-                                 DL))
+                                 DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   if (Instruction *V = commonShiftTransforms(I))
@@ -703,14 +707,15 @@
     // If the shifted-out value is known-zero, then this is a NUW shift.
     if (!I.hasNoUnsignedWrap() &&
         MaskedValueIsZero(I.getOperand(0),
-                          APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt))) {
+                          APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt),
+                          0, &I)) {
           I.setHasNoUnsignedWrap();
           return &I;
         }
 
     // If the shifted out value is all signbits, this is a NSW shift.
     if (!I.hasNoSignedWrap() &&
-        ComputeNumSignBits(I.getOperand(0)) > ShAmt) {
+        ComputeNumSignBits(I.getOperand(0), 0, &I) > ShAmt) {
       I.setHasNoSignedWrap();
       return &I;
     }
@@ -731,7 +736,7 @@
     return ReplaceInstUsesWith(I, V);
 
   if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1),
-                                  I.isExact(), DL))
+                                  I.isExact(), DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   if (Instruction *R = commonShiftTransforms(I))
@@ -760,7 +765,8 @@
 
     // If the shifted-out value is known-zero, then this is an exact shift.
     if (!I.isExact() &&
-        MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
+        MaskedValueIsZero(Op0, APInt::getLowBitsSet(Op1C->getBitWidth(), ShAmt),
+                          0, &I)){
       I.setIsExact();
       return &I;
     }
@@ -774,7 +780,7 @@
     return ReplaceInstUsesWith(I, V);
 
   if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1),
-                                  I.isExact(), DL))
+                                  I.isExact(), DL, TLI, DT, AT))
     return ReplaceInstUsesWith(I, V);
 
   if (Instruction *R = commonShiftTransforms(I))
@@ -804,7 +810,8 @@
 
     // If the shifted-out value is known-zero, then this is an exact shift.
     if (!I.isExact() &&
-        MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
+        MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt),
+                          0, &I)){
       I.setIsExact();
       return &I;
     }
@@ -812,13 +819,9 @@
 
   // See if we can turn a signed shr into an unsigned shr.
   if (MaskedValueIsZero(Op0,
-                        APInt::getSignBit(I.getType()->getScalarSizeInBits())))
+                        APInt::getSignBit(I.getType()->getScalarSizeInBits()),
+                        0, &I))
     return BinaryOperator::CreateLShr(Op0, Op1);
 
-  // Arithmetic shifting an all-sign-bit value is a no-op.
-  unsigned NumSignBits = ComputeNumSignBits(Op0);
-  if (NumSignBits == Op0->getType()->getScalarSizeInBits())
-    return ReplaceInstUsesWith(I, Op0);
-
   return nullptr;
 }

diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 1b42d3d..ad6983a 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

@@ -43,6 +43,20 @@
   // This instruction is producing bits that are not demanded. Shrink the RHS.
   Demanded &= OpC->getValue();
   I->setOperand(OpNo, ConstantInt::get(OpC->getType(), Demanded));
+
+  // If either 'nsw' or 'nuw' is set and the constant is negative,
+  // removing *any* bits from the constant could make overflow occur.
+  // Remove 'nsw' and 'nuw' from the instruction in this case.
+  if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(I)) {
+    assert(OBO->getOpcode() == Instruction::Add);
+    if (OBO->hasNoSignedWrap() || OBO->hasNoUnsignedWrap()) {
+      if (OpC->getValue().isNegative()) {
+        cast<BinaryOperator>(OBO)->setHasNoSignedWrap(false);
+        cast<BinaryOperator>(OBO)->setHasNoUnsignedWrap(false);
+      }
+    }
+  }
+
   return true;
 }
 
@@ -57,7 +71,7 @@
   APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
 
   Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask,
-                                     KnownZero, KnownOne, 0);
+                                     KnownZero, KnownOne, 0, &Inst);
   if (!V) return false;
   if (V == &Inst) return true;
   ReplaceInstUsesWith(Inst, V);
@@ -71,7 +85,8 @@
                                         APInt &KnownZero, APInt &KnownOne,
                                         unsigned Depth) {
   Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask,
-                                          KnownZero, KnownOne, Depth);
+                                          KnownZero, KnownOne, Depth,
+                                          dyn_cast<Instruction>(U.getUser()));
   if (!NewVal) return false;
   U = NewVal;
   return true;
@@ -101,7 +116,8 @@
 /// in the context where the specified bits are demanded, but not for all users.
 Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
                                              APInt &KnownZero, APInt &KnownOne,
-                                             unsigned Depth) {
+                                             unsigned Depth,
+                                             Instruction *CxtI) {
   assert(V != nullptr && "Null pointer of Value???");
   assert(Depth <= 6 && "Limit Search Depth");
   uint32_t BitWidth = DemandedMask.getBitWidth();
@@ -144,7 +160,7 @@
 
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I) {
-    computeKnownBits(V, KnownZero, KnownOne, Depth);
+    computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI);
     return nullptr;        // Only analyze instructions.
   }
 
@@ -158,8 +174,10 @@
     // this instruction has a simpler value in that context.
     if (I->getOpcode() == Instruction::And) {
       // If either the LHS or the RHS are Zero, the result is zero.
-      computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
-      computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+      computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1,
+                       CxtI);
+      computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1,
+                       CxtI);
 
       // If all of the demanded bits are known 1 on one side, return the other.
       // These bits cannot contribute to the result of the 'and' in this
@@ -180,8 +198,10 @@
       // only bits from X or Y are demanded.
 
       // If either the LHS or the RHS are One, the result is One.
-      computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
-      computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+      computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1,
+                       CxtI);
+      computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1,
+                       CxtI);
 
       // If all of the demanded bits are known zero on one side, return the
       // other.  These bits cannot contribute to the result of the 'or' in this
@@ -205,8 +225,10 @@
       // We can simplify (X^Y) -> X or Y in the user's context if we know that
       // only bits from X or Y are demanded.
 
-      computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
-      computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+      computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1,
+                       CxtI);
+      computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1,
+                       CxtI);
 
       // If all of the demanded bits are known zero on one side, return the
       // other.
@@ -217,7 +239,7 @@
     }
 
     // Compute the KnownZero/KnownOne bits to simplify things downstream.
-    computeKnownBits(I, KnownZero, KnownOne, Depth);
+    computeKnownBits(I, KnownZero, KnownOne, Depth, CxtI);
     return nullptr;
   }
 
@@ -230,7 +252,7 @@
 
   switch (I->getOpcode()) {
   default:
-    computeKnownBits(I, KnownZero, KnownOne, Depth);
+    computeKnownBits(I, KnownZero, KnownOne, Depth, CxtI);
     break;
   case Instruction::And:
     // If either the LHS or the RHS are Zero, the result is zero.
@@ -242,6 +264,12 @@
     assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
     assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
 
+    // If the client is only demanding bits that we know, return the known
+    // constant.
+    if ((DemandedMask & ((RHSKnownZero | LHSKnownZero)|
+                         (RHSKnownOne & LHSKnownOne))) == DemandedMask)
+      return Constant::getIntegerValue(VTy, RHSKnownOne & LHSKnownOne);
+
     // If all of the demanded bits are known 1 on one side, return the other.
     // These bits cannot contribute to the result of the 'and'.
     if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
@@ -274,6 +302,12 @@
     assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
     assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
 
+    // If the client is only demanding bits that we know, return the known
+    // constant.
+    if ((DemandedMask & ((RHSKnownZero & LHSKnownZero)|
+                         (RHSKnownOne | LHSKnownOne))) == DemandedMask)
+      return Constant::getIntegerValue(VTy, RHSKnownOne | LHSKnownOne);
+
     // If all of the demanded bits are known zero on one side, return the other.
     // These bits cannot contribute to the result of the 'or'.
     if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
@@ -310,6 +344,18 @@
     assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
     assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
 
+    // Output known-0 bits are known if clear or set in both the LHS & RHS.
+    APInt IKnownZero = (RHSKnownZero & LHSKnownZero) |
+                       (RHSKnownOne & LHSKnownOne);
+    // Output known-1 are known to be set if set in only one of the LHS, RHS.
+    APInt IKnownOne =  (RHSKnownZero & LHSKnownOne) |
+                       (RHSKnownOne & LHSKnownZero);
+
+    // If the client is only demanding bits that we know, return the known
+    // constant.
+    if ((DemandedMask & (IKnownZero|IKnownOne)) == DemandedMask)
+      return Constant::getIntegerValue(VTy, IKnownOne);
+
     // If all of the demanded bits are known zero on one side, return the other.
     // These bits cannot contribute to the result of the 'xor'.
     if ((DemandedMask & RHSKnownZero) == DemandedMask)
@@ -581,7 +627,7 @@
 
     // Otherwise just hand the sub off to computeKnownBits to fill in
     // the known zeros and ones.
-    computeKnownBits(V, KnownZero, KnownOne, Depth);
+    computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI);
 
     // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
     // zero.
@@ -752,7 +798,8 @@
     // remainder is zero.
     if (DemandedMask.isNegative() && KnownZero.isNonNegative()) {
       APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
-      computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+      computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1,
+                       CxtI);
       // If it's known zero, our sign bit is also zero.
       if (LHSKnownZero.isNegative())
         KnownZero.setBit(KnownZero.getBitWidth() - 1);
@@ -814,7 +861,7 @@
         return nullptr;
       }
     }
-    computeKnownBits(V, KnownZero, KnownOne, Depth);
+    computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI);
     break;
   }
 

diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h
index 1ab7db3..8d857d0 100644
--- a/lib/Transforms/InstCombine/InstCombineWorklist.h
+++ b/lib/Transforms/InstCombine/InstCombineWorklist.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef INSTCOMBINE_WORKLIST_H
-#define INSTCOMBINE_WORKLIST_H
+#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H
+#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEWORKLIST_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"

diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 08e2446..e4a4fef 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp

@@ -39,12 +39,16 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
@@ -68,11 +72,6 @@
 STATISTIC(NumFactor   , "Number of factorizations");
 STATISTIC(NumReassoc  , "Number of reassociations");
 
-static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
-                                   cl::init(false),
-                                   cl::desc("Enable unsafe double to float "
-                                            "shrinking for math lib calls"));
-
 // Initialization Routines
 void llvm::initializeInstCombine(PassRegistry &Registry) {
   initializeInstCombinerPass(Registry);
@@ -85,12 +84,14 @@
 char InstCombiner::ID = 0;
 INITIALIZE_PASS_BEGIN(InstCombiner, "instcombine",
                 "Combine redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(InstCombiner, "instcombine",
                 "Combine redundant instructions", false, false)
 
 void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
+  AU.addRequired<AssumptionTracker>();
   AU.addRequired<TargetLibraryInfo>();
 }
 
@@ -390,6 +391,25 @@
                                      Instruction::BinaryOps ROp) {
   if (Instruction::isCommutative(ROp))
     return LeftDistributesOverRight(ROp, LOp);
+
+  switch (LOp) {
+  default:
+    return false;
+  // (X >> Z) & (Y >> Z)  -> (X&Y) >> Z  for all shifts.
+  // (X >> Z) | (Y >> Z)  -> (X|Y) >> Z  for all shifts.
+  // (X >> Z) ^ (Y >> Z)  -> (X^Y) >> Z  for all shifts.
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    switch (ROp) {
+    default:
+      return false;
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+      return true;
+    }
+  }
   // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
   // but this requires knowing that the addition does not overflow and other
   // such subtleties.
@@ -411,26 +431,37 @@
 }
 
 /// This function factors binary ops which can be combined using distributive
-/// laws. This also factor SHL as MUL e.g. SHL(X, 2) ==> MUL(X, 4).
+/// laws. This function tries to transform 'Op' based on TopLevelOpcode to
+/// enable factorization, e.g. for ADD(SHL(X, 2), MUL(X, 5)): when this
+/// function is called with TopLevelOpcode == Instruction::Add and
+/// Op = SHL(X, 2), it transforms SHL(X, 2) to MUL(X, 4), i.e. returns
+/// Instruction::Mul with LHS set to 'X' and RHS to 4.
 static Instruction::BinaryOps
-getBinOpsForFactorization(BinaryOperator *Op, Value *&LHS, Value *&RHS) {
+getBinOpsForFactorization(Instruction::BinaryOps TopLevelOpcode,
+                          BinaryOperator *Op, Value *&LHS, Value *&RHS) {
   if (!Op)
     return Instruction::BinaryOpsEnd;
 
-  if (Op->getOpcode() == Instruction::Shl) {
-    if (Constant *CST = dyn_cast<Constant>(Op->getOperand(1))) {
-      // The multiplier is really 1 << CST.
-      RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), CST);
-      LHS = Op->getOperand(0);
-      return Instruction::Mul;
+  LHS = Op->getOperand(0);
+  RHS = Op->getOperand(1);
+
+  switch (TopLevelOpcode) {
+  default:
+    return Op->getOpcode();
+
+  case Instruction::Add:
+  case Instruction::Sub:
+    if (Op->getOpcode() == Instruction::Shl) {
+      if (Constant *CST = dyn_cast<Constant>(Op->getOperand(1))) {
+        // The multiplier is really 1 << CST.
+        RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), CST);
+        return Instruction::Mul;
+      }
     }
+    return Op->getOpcode();
   }
 
   // TODO: We can add other conversions e.g. shr => div etc.
-
-  LHS = Op->getOperand(0);
-  RHS = Op->getOperand(1);
-  return Op->getOpcode();
 }
 
 /// This tries to simplify binary operations by factorizing out common terms
@@ -529,8 +560,9 @@
 
   // Factorization.
   Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
-  Instruction::BinaryOps LHSOpcode = getBinOpsForFactorization(Op0, A, B);
-  Instruction::BinaryOps RHSOpcode = getBinOpsForFactorization(Op1, C, D);
+  auto TopLevelOpcode = I.getOpcode();
+  auto LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
+  auto RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
 
   // The instruction has the form "(A op' B) op (C op' D)".  Try to factorize
   // a common term.
@@ -552,7 +584,6 @@
     return V;
 
   // Expansion.
-  Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
   if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
     // The instruction has the form "(A op' B) op C".  See if expanding it out
     // to "(A op C) op' (B op C)" results in simplifications.
@@ -765,13 +796,14 @@
     // If the incoming non-constant value is in I's block, we will remove one
     // instruction, but insert another equivalent one, leading to infinite
     // instcombine.
-    if (NonConstBB == I.getParent())
+    if (isPotentiallyReachable(I.getParent(), NonConstBB, DT,
+                               getAnalysisIfAvailable<LoopInfo>()))
       return nullptr;
   }
 
   // If there is exactly one non-constant value, we can insert a copy of the
   // operation in that block.  However, if this is a critical edge, we would be
-  // inserting the computation one some other paths (e.g. inside a loop).  Only
+  // inserting the computation on some other paths (e.g. inside a loop).  Only
   // do this if the pred block is unconditionally branching into the phi block.
   if (NonConstBB != nullptr) {
     BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
@@ -1284,7 +1316,7 @@
 Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
   SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
 
-  if (Value *V = SimplifyGEPInst(Ops, DL))
+  if (Value *V = SimplifyGEPInst(Ops, DL, TLI, DT, AT))
     return ReplaceInstUsesWith(GEP, V);
 
   Value *PtrOp = GEP.getOperand(0);
@@ -1478,19 +1510,50 @@
         GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName());
   }
 
-  // Canonicalize (gep i8* X, -(ptrtoint Y)) to (sub (ptrtoint X), (ptrtoint Y))
-  // The GEP pattern is emitted by the SCEV expander for certain kinds of
-  // pointer arithmetic.
-  if (DL && GEP.getNumIndices() == 1 &&
-      match(GEP.getOperand(1), m_Neg(m_PtrToInt(m_Value())))) {
+  if (DL && GEP.getNumIndices() == 1) {
     unsigned AS = GEP.getPointerAddressSpace();
-    if (GEP.getType() == Builder->getInt8PtrTy(AS) &&
-        GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
+    if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
         DL->getPointerSizeInBits(AS)) {
-      Operator *Index = cast<Operator>(GEP.getOperand(1));
-      Value *PtrToInt = Builder->CreatePtrToInt(PtrOp, Index->getType());
-      Value *NewSub = Builder->CreateSub(PtrToInt, Index->getOperand(1));
-      return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType());
+      Type *PtrTy = GEP.getPointerOperandType();
+      Type *Ty = PtrTy->getPointerElementType();
+      uint64_t TyAllocSize = DL->getTypeAllocSize(Ty);
+
+      bool Matched = false;
+      uint64_t C;
+      Value *V = nullptr;
+      if (TyAllocSize == 1) {
+        V = GEP.getOperand(1);
+        Matched = true;
+      } else if (match(GEP.getOperand(1),
+                       m_AShr(m_Value(V), m_ConstantInt(C)))) {
+        if (TyAllocSize == 1ULL << C)
+          Matched = true;
+      } else if (match(GEP.getOperand(1),
+                       m_SDiv(m_Value(V), m_ConstantInt(C)))) {
+        if (TyAllocSize == C)
+          Matched = true;
+      }
+
+      if (Matched) {
+        // Canonicalize (gep i8* X, -(ptrtoint Y))
+        // to (inttoptr (sub (ptrtoint X), (ptrtoint Y)))
+        // The GEP pattern is emitted by the SCEV expander for certain kinds of
+        // pointer arithmetic.
+        if (match(V, m_Neg(m_PtrToInt(m_Value())))) {
+          Operator *Index = cast<Operator>(V);
+          Value *PtrToInt = Builder->CreatePtrToInt(PtrOp, Index->getType());
+          Value *NewSub = Builder->CreateSub(PtrToInt, Index->getOperand(1));
+          return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType());
+        }
+        // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
+        // to (bitcast Y)
+        Value *Y;
+        if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
+                           m_PtrToInt(m_Specific(GEP.getOperand(0)))))) {
+          return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y,
+                                                               GEP.getType());
+        }
+      }
     }
   }
 
@@ -1582,9 +1645,8 @@
           Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
 
         // V and GEP are both pointer types --> BitCast
-        if (StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace())
-          return new BitCastInst(NewGEP, GEP.getType());
-        return new AddrSpaceCastInst(NewGEP, GEP.getType());
+        return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+                                                             GEP.getType());
       }
 
       // Transform things like:
@@ -1616,9 +1678,8 @@
               Builder->CreateGEP(StrippedPtr, NewIdx, GEP.getName());
 
             // The NewGEP must be pointer typed, so must the old one -> BitCast
-            if (StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace())
-              return new BitCastInst(NewGEP, GEP.getType());
-            return new AddrSpaceCastInst(NewGEP, GEP.getType());
+            return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+                                                                 GEP.getType());
           }
         }
       }
@@ -1658,9 +1719,8 @@
               Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) :
               Builder->CreateGEP(StrippedPtr, Off, GEP.getName());
             // The NewGEP must be pointer typed, so must the old one -> BitCast
-            if (StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace())
-              return new BitCastInst(NewGEP, GEP.getType());
-            return new AddrSpaceCastInst(NewGEP, GEP.getType());
+            return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+                                                                 GEP.getType());
           }
         }
       }
@@ -1670,6 +1730,18 @@
   if (!DL)
     return nullptr;
 
+  // addrspacecast between types is canonicalized as a bitcast, then an
+  // addrspacecast. To take advantage of the below bitcast + struct GEP, look
+  // through the addrspacecast.
+  if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
+    //   X = bitcast A addrspace(1)* to B addrspace(1)*
+    //   Y = addrspacecast A addrspace(1)* to B addrspace(2)*
+    //   Z = gep Y, <...constant indices...>
+    // Into an addrspacecasted GEP of the struct.
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
+      PtrOp = BC;
+  }
+
   /// See if we can simplify:
   ///   X = bitcast A* to B*
   ///   Y = gep X, <...constant indices...>
@@ -1678,11 +1750,10 @@
   if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
     Value *Operand = BCI->getOperand(0);
     PointerType *OpType = cast<PointerType>(Operand->getType());
-    unsigned OffsetBits = DL->getPointerTypeSizeInBits(OpType);
+    unsigned OffsetBits = DL->getPointerTypeSizeInBits(GEP.getType());
     APInt Offset(OffsetBits, 0);
     if (!isa<BitCastInst>(Operand) &&
-        GEP.accumulateConstantOffset(*DL, Offset) &&
-        StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) {
+        GEP.accumulateConstantOffset(*DL, Offset)) {
 
       // If this GEP instruction doesn't move the pointer, just replace the GEP
       // with a bitcast of the real input to the dest type.
@@ -1700,6 +1771,9 @@
             return &GEP;
           }
         }
+
+        if (Operand->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+          return new AddrSpaceCastInst(Operand, GEP.getType());
         return new BitCastInst(Operand, GEP.getType());
       }
 
@@ -1715,6 +1789,9 @@
         if (NGEP->getType() == GEP.getType())
           return ReplaceInstUsesWith(GEP, NGEP);
         NGEP->takeName(&GEP);
+
+        if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+          return new AddrSpaceCastInst(NGEP, GEP.getType());
         return new BitCastInst(NGEP, GEP.getType());
       }
     }
@@ -1925,7 +2002,25 @@
   return nullptr;
 }
 
+Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) {
+  if (RI.getNumOperands() == 0) // ret void
+    return nullptr;
 
+  Value *ResultOp = RI.getOperand(0);
+  Type *VTy = ResultOp->getType();
+  if (!VTy->isIntegerTy())
+    return nullptr;
+
+  // There might be assume intrinsics dominating this return that completely
+  // determine the value. If so, constant fold it.
+  unsigned BitWidth = VTy->getPrimitiveSizeInBits();
+  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+  computeKnownBits(ResultOp, KnownZero, KnownOne, 0, &RI);
+  if ((KnownZero|KnownOne).isAllOnesValue())
+    RI.setOperand(0, Constant::getIntegerValue(VTy, KnownOne));
+
+  return nullptr;
+}
 
 Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
   // Change br (not X), label True, label False to: br X, label False, True
@@ -1977,6 +2072,37 @@
 
 Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
   Value *Cond = SI.getCondition();
+  unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth();
+  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+  computeKnownBits(Cond, KnownZero, KnownOne);
+  unsigned LeadingKnownZeros = KnownZero.countLeadingOnes();
+  unsigned LeadingKnownOnes = KnownOne.countLeadingOnes();
+
+  // Compute the number of leading bits we can ignore.
+  for (auto &C : SI.cases()) {
+    LeadingKnownZeros = std::min(
+        LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros());
+    LeadingKnownOnes = std::min(
+        LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes());
+  }
+
+  unsigned NewWidth = BitWidth - std::max(LeadingKnownZeros, LeadingKnownOnes);
+
+  // Only truncate the condition when DataLayout is available and the new type
+  // is at least as wide as the largest legal integer type: x86 generates
+  // redundant zero-extensions if the operand is truncated to i8 or i16.
+  if (DL && BitWidth > NewWidth &&
+      NewWidth >= DL->getLargestLegalIntTypeSize()) {
+    IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
+    Builder->SetInsertPoint(&SI);
+    Value *NewCond = Builder->CreateTrunc(SI.getCondition(), Ty, "trunc");
+    SI.setCondition(NewCond);
+
+    for (auto &C : SI.cases())
+      static_cast<SwitchInst::CaseIt *>(&C)->setValue(ConstantInt::get(
+          SI.getContext(), C.getCaseValue()->getValue().trunc(NewWidth)));
+  }
+
   if (Instruction *I = dyn_cast<Instruction>(Cond)) {
     if (I->getOpcode() == Instruction::Add)
       if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
@@ -2215,7 +2341,7 @@
 
       // If we already saw this clause, there is no point in having a second
       // copy of it.
-      if (AlreadyCaught.insert(TypeInfo)) {
+      if (AlreadyCaught.insert(TypeInfo).second) {
         // This catch clause was not already seen.
         NewClauses.push_back(CatchClause);
       } else {
@@ -2297,7 +2423,7 @@
             continue;
           // There is no point in having multiple copies of the same typeinfo in
           // a filter, so only add it if we didn't already.
-          if (SeenInFilter.insert(TypeInfo))
+          if (SeenInFilter.insert(TypeInfo).second)
             NewFilterElts.push_back(cast<Constant>(Elt));
         }
         // A filter containing a catch-all cannot match anything by definition.
@@ -2534,7 +2660,7 @@
 /// whose condition is a known constant, we only visit the reachable successors.
 ///
 static bool AddReachableCodeToWorklist(BasicBlock *BB,
-                                       SmallPtrSet<BasicBlock*, 64> &Visited,
+                                       SmallPtrSetImpl<BasicBlock*> &Visited,
                                        InstCombiner &IC,
                                        const DataLayout *DL,
                                        const TargetLibraryInfo *TLI) {
@@ -2549,7 +2675,8 @@
     BB = Worklist.pop_back_val();
 
     // We have now visited this block!  If we've already been here, ignore it.
-    if (!Visited.insert(BB)) continue;
+    if (!Visited.insert(BB).second)
+      continue;
 
     for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
       Instruction *Inst = BBI++;
@@ -2730,9 +2857,18 @@
         // If the user is one of our immediate successors, and if that successor
         // only has us as a predecessors (we'd have to split the critical edge
         // otherwise), we can keep going.
-        if (UserIsSuccessor && UserParent->getSinglePredecessor())
+        if (UserIsSuccessor && UserParent->getSinglePredecessor()) {
           // Okay, the CFG is simple enough, try to sink this instruction.
-          MadeIRChange |= TryToSinkInstruction(I, UserParent);
+          if (TryToSinkInstruction(I, UserParent)) {
+            MadeIRChange = true;
+            // We'll add uses of the sunk instruction below, but since sinking
+            // can expose opportunities for its *operands*, add them to the
+            // worklist.
+            for (Use &U : I->operands())
+              if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
+                Worklist.Add(OpI);
+          }
+        }
       }
     }
 
@@ -2801,13 +2937,13 @@
 }
 
 namespace {
-class InstCombinerLibCallSimplifier : public LibCallSimplifier {
+class InstCombinerLibCallSimplifier final : public LibCallSimplifier {
   InstCombiner *IC;
 public:
   InstCombinerLibCallSimplifier(const DataLayout *DL,
                                 const TargetLibraryInfo *TLI,
                                 InstCombiner *IC)
-    : LibCallSimplifier(DL, TLI, UnsafeFPShrink) {
+    : LibCallSimplifier(DL, TLI) {
     this->IC = IC;
   }
 
@@ -2823,9 +2959,15 @@
   if (skipOptnoneFunction(F))
     return false;
 
+  AT = &getAnalysis<AssumptionTracker>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
   TLI = &getAnalysis<TargetLibraryInfo>();
+
+  DominatorTreeWrapperPass *DTWP =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
   // Minimizing size?
   MinimizeSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                                 Attribute::MinSize);
@@ -2834,7 +2976,7 @@
   /// instructions into the worklist when they are created.
   IRBuilder<true, TargetFolder, InstCombineIRInserter>
     TheBuilder(F.getContext(), TargetFolder(DL),
-               InstCombineIRInserter(Worklist));
+               InstCombineIRInserter(Worklist, AT));
   Builder = &TheBuilder;
 
   InstCombinerLibCallSimplifier TheSimplifier(DL, TLI, this);

diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 5e5ddc1..38f587f 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp

@@ -40,6 +40,7 @@
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -59,7 +60,8 @@
 static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
 static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000;  // < 2G.
 static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
-static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa8000;
+static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
+static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 36;
 static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
 static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
 
@@ -70,7 +72,7 @@
 
 static const char *const kAsanModuleCtorName = "asan.module_ctor";
 static const char *const kAsanModuleDtorName = "asan.module_dtor";
-static const int         kAsanCtorAndDtorPriority = 1;
+static const uint64_t    kAsanCtorAndDtorPriority = 1;
 static const char *const kAsanReportErrorTemplate = "__asan_report_";
 static const char *const kAsanReportLoadN = "__asan_report_load_n";
 static const char *const kAsanReportStoreN = "__asan_report_store_n";
@@ -80,8 +82,6 @@
 static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
 static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
 static const char *const kAsanInitName = "__asan_init_v4";
-static const char *const kAsanCovModuleInitName = "__sanitizer_cov_module_init";
-static const char *const kAsanCovName = "__sanitizer_cov";
 static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp";
 static const char *const kAsanPtrSub = "__sanitizer_ptr_sub";
 static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
@@ -89,6 +89,7 @@
 static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
 static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
 static const char *const kAsanGenPrefix = "__asan_gen_";
+static const char *const kSanCovGenPrefix = "__sancov_gen_";
 static const char *const kAsanPoisonStackMemoryName =
     "__asan_poison_stack_memory";
 static const char *const kAsanUnpoisonStackMemoryName =
@@ -133,13 +134,6 @@
 // This flag may need to be replaced with -f[no]asan-globals.
 static cl::opt<bool> ClGlobals("asan-globals",
        cl::desc("Handle global objects"), cl::Hidden, cl::init(true));
-static cl::opt<int> ClCoverage("asan-coverage",
-       cl::desc("ASan coverage. 0: none, 1: entry block, 2: all blocks"),
-       cl::Hidden, cl::init(false));
-static cl::opt<int> ClCoverageBlockThreshold("asan-coverage-block-threshold",
-       cl::desc("Add coverage instrumentation only to the entry block if there "
-                "are more than this number of blocks."),
-       cl::Hidden, cl::init(1500));
 static cl::opt<bool> ClInitializers("asan-initialization-order",
        cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(true));
 static cl::opt<bool> ClInvalidPointerPairs("asan-detect-invalid-pointer-pair",
@@ -212,10 +206,40 @@
           "Number of optimized accesses to global vars");
 
 namespace {
+/// Frontend-provided metadata for source location.
+struct LocationMetadata {
+  StringRef Filename;
+  int LineNo;
+  int ColumnNo;
+
+  LocationMetadata() : Filename(), LineNo(0), ColumnNo(0) {}
+
+  bool empty() const { return Filename.empty(); }
+
+  void parse(MDNode *MDN) {
+    assert(MDN->getNumOperands() == 3);
+    MDString *MDFilename = cast<MDString>(MDN->getOperand(0));
+    Filename = MDFilename->getString();
+    LineNo = cast<ConstantInt>(MDN->getOperand(1))->getLimitedValue();
+    ColumnNo = cast<ConstantInt>(MDN->getOperand(2))->getLimitedValue();
+  }
+};
+
 /// Frontend-provided metadata for global variables.
 class GlobalsMetadata {
  public:
+  struct Entry {
+    Entry()
+        : SourceLoc(), Name(), IsDynInit(false),
+          IsBlacklisted(false) {}
+    LocationMetadata SourceLoc;
+    StringRef Name;
+    bool IsDynInit;
+    bool IsBlacklisted;
+  };
+
   GlobalsMetadata() : inited_(false) {}
+
   void init(Module& M) {
     assert(!inited_);
     inited_ = true;
@@ -223,76 +247,38 @@
     if (!Globals)
       return;
     for (auto MDN : Globals->operands()) {
-      // Format of the metadata node for the global:
-      // {
-      //   global,
-      //   source_location,
-      //   i1 is_dynamically_initialized,
-      //   i1 is_blacklisted
-      // }
-      assert(MDN->getNumOperands() == 4);
+      // Metadata node contains the global and the fields of "Entry".
+      assert(MDN->getNumOperands() == 5);
       Value *V = MDN->getOperand(0);
       // The optimizer may optimize away a global entirely.
       if (!V)
         continue;
       GlobalVariable *GV = cast<GlobalVariable>(V);
-      if (Value *Loc = MDN->getOperand(1)) {
-        GlobalVariable *GVLoc = cast<GlobalVariable>(Loc);
-        // We may already know the source location for GV, if it was merged
-        // with another global.
-        if (SourceLocation.insert(std::make_pair(GV, GVLoc)).second)
-          addSourceLocationGlobal(GVLoc);
+      // We can already have an entry for GV if it was merged with another
+      // global.
+      Entry &E = Entries[GV];
+      if (Value *Loc = MDN->getOperand(1))
+        E.SourceLoc.parse(cast<MDNode>(Loc));
+      if (Value *Name = MDN->getOperand(2)) {
+        MDString *MDName = cast<MDString>(Name);
+        E.Name = MDName->getString();
       }
-      ConstantInt *IsDynInit = cast<ConstantInt>(MDN->getOperand(2));
-      if (IsDynInit->isOne())
-        DynInitGlobals.insert(GV);
-      ConstantInt *IsBlacklisted = cast<ConstantInt>(MDN->getOperand(3));
-      if (IsBlacklisted->isOne())
-        BlacklistedGlobals.insert(GV);
+      ConstantInt *IsDynInit = cast<ConstantInt>(MDN->getOperand(3));
+      E.IsDynInit |= IsDynInit->isOne();
+      ConstantInt *IsBlacklisted = cast<ConstantInt>(MDN->getOperand(4));
+      E.IsBlacklisted |= IsBlacklisted->isOne();
     }
   }
 
-  GlobalVariable *getSourceLocation(GlobalVariable *G) const {
-    auto Pos = SourceLocation.find(G);
-    return (Pos != SourceLocation.end()) ? Pos->second : nullptr;
-  }
-
-  /// Check if the global is dynamically initialized.
-  bool isDynInit(GlobalVariable *G) const {
-    return DynInitGlobals.count(G);
-  }
-
-  /// Check if the global was blacklisted.
-  bool isBlacklisted(GlobalVariable *G) const {
-    return BlacklistedGlobals.count(G);
-  }
-
-  /// Check if the global was generated to describe source location of another
-  /// global (we don't want to instrument them).
-  bool isSourceLocationGlobal(GlobalVariable *G) const {
-    return LocationGlobals.count(G);
+  /// Returns metadata entry for a given global.
+  Entry get(GlobalVariable *G) const {
+    auto Pos = Entries.find(G);
+    return (Pos != Entries.end()) ? Pos->second : Entry();
   }
 
  private:
   bool inited_;
-  DenseMap<GlobalVariable*, GlobalVariable*> SourceLocation;
-  DenseSet<GlobalVariable*> DynInitGlobals;
-  DenseSet<GlobalVariable*> BlacklistedGlobals;
-  DenseSet<GlobalVariable*> LocationGlobals;
-
-  void addSourceLocationGlobal(GlobalVariable *SourceLocGV) {
-    // Source location global is a struct with layout:
-    // {
-    //    filename,
-    //    i32 line_number,
-    //    i32 column_number,
-    // }
-    LocationGlobals.insert(SourceLocGV);
-    ConstantStruct *Contents =
-        cast<ConstantStruct>(SourceLocGV->getInitializer());
-    GlobalVariable *FilenameGV = cast<GlobalVariable>(Contents->getOperand(0));
-    LocationGlobals.insert(FilenameGV);
-  }
+  DenseMap<GlobalVariable*, Entry> Entries;
 };
 
 /// This struct defines the shadow mapping using the rule:
@@ -306,7 +292,7 @@
 static ShadowMapping getShadowMapping(const Module &M, int LongSize) {
   llvm::Triple TargetTriple(M.getTargetTriple());
   bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android;
-  bool IsIOS = TargetTriple.getOS() == llvm::Triple::IOS;
+  bool IsIOS = TargetTriple.isiOS();
   bool IsFreeBSD = TargetTriple.getOS() == llvm::Triple::FreeBSD;
   bool IsLinux = TargetTriple.getOS() == llvm::Triple::Linux;
   bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 ||
@@ -314,6 +300,8 @@
   bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
   bool IsMIPS32 = TargetTriple.getArch() == llvm::Triple::mips ||
                   TargetTriple.getArch() == llvm::Triple::mipsel;
+  bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 ||
+                  TargetTriple.getArch() == llvm::Triple::mips64el;
 
   ShadowMapping Mapping;
 
@@ -335,6 +323,8 @@
       Mapping.Offset = kFreeBSD_ShadowOffset64;
     else if (IsLinux && IsX86_64)
       Mapping.Offset = kSmallX86_64ShadowOffset;
+    else if (IsMIPS64)
+      Mapping.Offset = kMIPS64_ShadowOffset64;
     else
       Mapping.Offset = kDefaultShadowOffset64;
   }
@@ -386,8 +376,6 @@
 
   bool LooksLikeCodeInBug11395(Instruction *I);
   bool GlobalIsLinkerInitialized(GlobalVariable *G);
-  bool InjectCoverage(Function &F, const ArrayRef<BasicBlock*> AllBlocks);
-  void InjectCoverageAtBlock(Function &F, BasicBlock &BB);
 
   LLVMContext *C;
   const DataLayout *DL;
@@ -397,7 +385,6 @@
   Function *AsanCtorFunction;
   Function *AsanInitFunction;
   Function *AsanHandleNoReturnFunc;
-  Function *AsanCovFunction;
   Function *AsanPtrCmpFunction, *AsanPtrSubFunction;
   // This array is indexed by AccessIsWrite and log2(AccessSize).
   Function *AsanErrorCallback[2][kNumberOfAccessSizes];
@@ -441,7 +428,6 @@
   Function *AsanUnpoisonGlobals;
   Function *AsanRegisterGlobals;
   Function *AsanUnregisterGlobals;
-  Function *AsanCovModuleInit;
 };
 
 // Stack poisoning does not play well with exception handling.
@@ -570,7 +556,7 @@
   }
   /// Finds alloca where the value comes from.
   AllocaInst *findAllocaForValue(Value *V);
-  void poisonRedZones(const ArrayRef<uint8_t> ShadowBytes, IRBuilder<> &IRB,
+  void poisonRedZones(ArrayRef<uint8_t> ShadowBytes, IRBuilder<> &IRB,
                       Value *ShadowBase, bool DoPoison);
   void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
 
@@ -617,8 +603,25 @@
   return GV;
 }
 
+/// \brief Create a global describing a source location.
+static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
+                                                       LocationMetadata MD) {
+  Constant *LocData[] = {
+      createPrivateGlobalForString(M, MD.Filename, true),
+      ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo),
+      ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo),
+  };
+  auto LocStruct = ConstantStruct::getAnon(LocData);
+  auto GV = new GlobalVariable(M, LocStruct->getType(), true,
+                               GlobalValue::PrivateLinkage, LocStruct,
+                               kAsanGenPrefix);
+  GV->setUnnamedAddr(true);
+  return GV;
+}
+
 static bool GlobalWasGeneratedByAsan(GlobalVariable *G) {
-  return G->getName().find(kAsanGenPrefix) == 0;
+  return G->getName().find(kAsanGenPrefix) == 0 ||
+         G->getName().find(kSanCovGenPrefix) == 0;
 }
 
 Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
@@ -653,9 +656,12 @@
 }
 
 // If I is an interesting memory access, return the PointerOperand
-// and set IsWrite/Alignment. Otherwise return NULL.
+// and set IsWrite/Alignment. Otherwise return nullptr.
 static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
                                         unsigned *Alignment) {
+  // Skip memory accesses inserted by another instrumentation.
+  if (I->getMetadata("nosanitize"))
+    return nullptr;
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
     if (!ClInstrumentReads) return nullptr;
     *IsWrite = false;
@@ -710,7 +716,7 @@
   // If a global variable does not have dynamic initialization we don't
   // have to instrument it.  However, if a global does not have initializer
   // at all, we assume it has dynamic initializer (in other TU).
-  return G->hasInitializer() && !GlobalsMD.isDynInit(G);
+  return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit;
 }
 
 void
@@ -859,8 +865,11 @@
   TerminatorInst *CrashTerm = nullptr;
 
   if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
+    // We use branch weights for the slow path check, to indicate that the slow
+    // path is rarely taken. This seems to be the case for SPEC benchmarks.
     TerminatorInst *CheckTerm =
-        SplitBlockAndInsertIfThen(Cmp, InsertBefore, false);
+        SplitBlockAndInsertIfThen(Cmp, InsertBefore, false,
+            MDBuilder(*C).createBranchWeights(1, 100000));
     assert(dyn_cast<BranchInst>(CheckTerm)->isUnconditional());
     BasicBlock *NextBB = CheckTerm->getSuccessor(0);
     IRB.SetInsertPoint(CheckTerm);
@@ -905,10 +914,12 @@
     ConstantStruct *CS = cast<ConstantStruct>(OP);
 
     // Must have a function or null ptr.
-    // (CS->getOperand(0) is the init priority.)
     if (Function* F = dyn_cast<Function>(CS->getOperand(1))) {
-      if (F->getName() != kAsanModuleCtorName)
-        poisonOneInitializer(*F, ModuleName);
+      if (F->getName() == kAsanModuleCtorName) continue;
+      ConstantInt *Priority = cast<ConstantInt>(CS->getOperand(0));
+      // Don't instrument CTORs that will run before asan.module_ctor.
+      if (Priority->getLimitedValue() <= kAsanCtorAndDtorPriority) continue;
+      poisonOneInitializer(*F, ModuleName);
     }
   }
 }
@@ -917,8 +928,7 @@
   Type *Ty = cast<PointerType>(G->getType())->getElementType();
   DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
 
-  if (GlobalsMD.isBlacklisted(G)) return false;
-  if (GlobalsMD.isSourceLocationGlobal(G)) return false;
+  if (GlobalsMD.get(G).IsBlacklisted) return false;
   if (!Ty->isSized()) return false;
   if (!G->hasInitializer()) return false;
   if (GlobalWasGeneratedByAsan(G)) return false;  // Our own global.
@@ -939,16 +949,6 @@
   // For now, just ignore this Global if the alignment is large.
   if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false;
 
-  // Ignore all the globals with the names starting with "\01L_OBJC_".
-  // Many of those are put into the .cstring section. The linker compresses
-  // that section by removing the spare \0s after the string terminator, so
-  // our redzones get broken.
-  if ((G->getName().find("\01L_OBJC_") == 0) ||
-      (G->getName().find("\01l_OBJC_") == 0)) {
-    DEBUG(dbgs() << "Ignoring \\01L_OBJC_* global: " << *G << "\n");
-    return false;
-  }
-
   if (G->hasSection()) {
     StringRef Section(G->getSection());
     // Ignore the globals from the __OBJC section. The ObjC runtime assumes
@@ -977,6 +977,11 @@
       DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
       return false;
     }
+    if (Section.startswith("__TEXT,__objc_methname,cstring_literals")) {
+      DEBUG(dbgs() << "Ignoring objc_methname cstring global: " << *G << "\n");
+      return false;
+    }
+
 
     // Callbacks put into the CRT initializer/terminator sections
     // should not be instrumented.
@@ -998,24 +1003,20 @@
   IRBuilder<> IRB(*C);
   // Declare our poisoning and unpoisoning functions.
   AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, NULL));
+      kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
   AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
   AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanUnpoisonGlobalsName, IRB.getVoidTy(), NULL));
+      kAsanUnpoisonGlobalsName, IRB.getVoidTy(), nullptr));
   AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
   // Declare functions that register/unregister globals.
   AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
       kAsanRegisterGlobalsName, IRB.getVoidTy(),
-      IntptrTy, IntptrTy, NULL));
+      IntptrTy, IntptrTy, nullptr));
   AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
   AsanUnregisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
       kAsanUnregisterGlobalsName,
-      IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+      IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
   AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
-  AsanCovModuleInit = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanCovModuleInitName,
-      IRB.getVoidTy(), IntptrTy, NULL));
-  AsanCovModuleInit->setLinkage(Function::ExternalLinkage);
 }
 
 // This function replaces all global variables with new variables that have
@@ -1045,7 +1046,7 @@
   // We initialize an array of such structures and pass it to a run-time call.
   StructType *GlobalStructTy =
       StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
-                      IntptrTy, IntptrTy, NULL);
+                      IntptrTy, IntptrTy, nullptr);
   SmallVector<Constant *, 16> Initializers(n);
 
   bool HasDynamicallyInitializedGlobals = false;
@@ -1058,6 +1059,14 @@
   for (size_t i = 0; i < n; i++) {
     static const uint64_t kMaxGlobalRedzone = 1 << 18;
     GlobalVariable *G = GlobalsToChange[i];
+
+    auto MD = GlobalsMD.get(G);
+    // Create string holding the global name (use global name from metadata
+    // if it's available, otherwise just write the name of global variable).
+    GlobalVariable *Name = createPrivateGlobalForString(
+        M, MD.Name.empty() ? G->getName() : MD.Name,
+        /*AllowMerging*/ true);
+
     PointerType *PtrTy = cast<PointerType>(G->getType());
     Type *Ty = PtrTy->getElementType();
     uint64_t SizeInBytes = DL->getTypeAllocSize(Ty);
@@ -1074,13 +1083,10 @@
     assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
     Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
 
-    StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL);
+    StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr);
     Constant *NewInitializer = ConstantStruct::get(
         NewTy, G->getInitializer(),
-        Constant::getNullValue(RightRedZoneTy), NULL);
-
-    GlobalVariable *Name =
-        createPrivateGlobalForString(M, G->getName(), /*AllowMerging*/true);
+        Constant::getNullValue(RightRedZoneTy), nullptr);
 
     // Create a new global variable with enough space for a redzone.
     GlobalValue::LinkageTypes Linkage = G->getLinkage();
@@ -1101,8 +1107,13 @@
     NewGlobal->takeName(G);
     G->eraseFromParent();
 
-    bool GlobalHasDynamicInitializer = GlobalsMD.isDynInit(G);
-    GlobalVariable *SourceLoc = GlobalsMD.getSourceLocation(G);
+    Constant *SourceLoc;
+    if (!MD.SourceLoc.empty()) {
+      auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc);
+      SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy);
+    } else {
+      SourceLoc = ConstantInt::get(IntptrTy, 0);
+    }
 
     Initializers[i] = ConstantStruct::get(
         GlobalStructTy, ConstantExpr::getPointerCast(NewGlobal, IntptrTy),
@@ -1110,12 +1121,9 @@
         ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
         ConstantExpr::getPointerCast(Name, IntptrTy),
         ConstantExpr::getPointerCast(ModuleName, IntptrTy),
-        ConstantInt::get(IntptrTy, GlobalHasDynamicInitializer),
-        SourceLoc ? ConstantExpr::getPointerCast(SourceLoc, IntptrTy)
-                  : ConstantInt::get(IntptrTy, 0),
-        NULL);
+        ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, nullptr);
 
-    if (ClInitializers && GlobalHasDynamicInitializer)
+    if (ClInitializers && MD.IsDynInit)
       HasDynamicallyInitializedGlobals = true;
 
     DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
@@ -1166,13 +1174,6 @@
   assert(CtorFunc);
   IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
 
-  if (ClCoverage > 0) {
-    Function *CovFunc = M.getFunction(kAsanCovName);
-    int nCov = CovFunc ? CovFunc->getNumUses() : 0;
-    IRB.CreateCall(AsanCovModuleInit, ConstantInt::get(IntptrTy, nCov));
-    Changed = true;
-  }
-
   if (ClGlobals)
     Changed |= InstrumentGlobals(IRB, M);
 
@@ -1191,43 +1192,42 @@
       AsanErrorCallback[AccessIsWrite][AccessSizeIndex] =
           checkInterfaceFunction(
               M.getOrInsertFunction(kAsanReportErrorTemplate + Suffix,
-                                    IRB.getVoidTy(), IntptrTy, NULL));
+                                    IRB.getVoidTy(), IntptrTy, nullptr));
       AsanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =
           checkInterfaceFunction(
               M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + Suffix,
-                                    IRB.getVoidTy(), IntptrTy, NULL));
+                                    IRB.getVoidTy(), IntptrTy, nullptr));
     }
   }
   AsanErrorCallbackSized[0] = checkInterfaceFunction(M.getOrInsertFunction(
-              kAsanReportLoadN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+              kAsanReportLoadN, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
   AsanErrorCallbackSized[1] = checkInterfaceFunction(M.getOrInsertFunction(
-              kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+              kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
 
   AsanMemoryAccessCallbackSized[0] = checkInterfaceFunction(
       M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "loadN",
-                            IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+                            IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
   AsanMemoryAccessCallbackSized[1] = checkInterfaceFunction(
       M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "storeN",
-                            IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+                            IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
 
   AsanMemmove = checkInterfaceFunction(M.getOrInsertFunction(
       ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
-      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL));
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr));
   AsanMemcpy = checkInterfaceFunction(M.getOrInsertFunction(
       ClMemoryAccessCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
-      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL));
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr));
   AsanMemset = checkInterfaceFunction(M.getOrInsertFunction(
       ClMemoryAccessCallbackPrefix + "memset", IRB.getInt8PtrTy(),
-      IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy, NULL));
+      IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy, nullptr));
 
   AsanHandleNoReturnFunc = checkInterfaceFunction(
-      M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
-  AsanCovFunction = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanCovName, IRB.getVoidTy(), NULL));
+      M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy(), nullptr));
+
   AsanPtrCmpFunction = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+      kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
   AsanPtrSubFunction = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+      kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
   // We insert an empty inline asm after __asan_report* to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
                             StringRef(""), StringRef(""),
@@ -1255,7 +1255,7 @@
   // call __asan_init in the module ctor.
   IRBuilder<> IRB(ReturnInst::Create(*C, AsanCtorBB));
   AsanInitFunction = checkInterfaceFunction(
-      M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL));
+      M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), nullptr));
   AsanInitFunction->setLinkage(Function::ExternalLinkage);
   IRB.CreateCall(AsanInitFunction);
 
@@ -1281,74 +1281,6 @@
   return false;
 }
 
-void AddressSanitizer::InjectCoverageAtBlock(Function &F, BasicBlock &BB) {
-  BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end();
-  // Skip static allocas at the top of the entry block so they don't become
-  // dynamic when we split the block.  If we used our optimized stack layout,
-  // then there will only be one alloca and it will come first.
-  for (; IP != BE; ++IP) {
-    AllocaInst *AI = dyn_cast<AllocaInst>(IP);
-    if (!AI || !AI->isStaticAlloca())
-      break;
-  }
-
-  DebugLoc EntryLoc = IP->getDebugLoc().getFnDebugLoc(*C);
-  IRBuilder<> IRB(IP);
-  IRB.SetCurrentDebugLocation(EntryLoc);
-  Type *Int8Ty = IRB.getInt8Ty();
-  GlobalVariable *Guard = new GlobalVariable(
-      *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage,
-      Constant::getNullValue(Int8Ty), "__asan_gen_cov_" + F.getName());
-  LoadInst *Load = IRB.CreateLoad(Guard);
-  Load->setAtomic(Monotonic);
-  Load->setAlignment(1);
-  Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load);
-  Instruction *Ins = SplitBlockAndInsertIfThen(
-      Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
-  IRB.SetInsertPoint(Ins);
-  IRB.SetCurrentDebugLocation(EntryLoc);
-  // We pass &F to __sanitizer_cov. We could avoid this and rely on
-  // GET_CALLER_PC, but having the PC of the first instruction is just nice.
-  IRB.CreateCall(AsanCovFunction);
-  StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard);
-  Store->setAtomic(Monotonic);
-  Store->setAlignment(1);
-}
-
-// Poor man's coverage that works with ASan.
-// We create a Guard boolean variable with the same linkage
-// as the function and inject this code into the entry block (-asan-coverage=1)
-// or all blocks (-asan-coverage=2):
-// if (*Guard) {
-//    __sanitizer_cov(&F);
-//    *Guard = 1;
-// }
-// The accesses to Guard are atomic. The rest of the logic is
-// in __sanitizer_cov (it's fine to call it more than once).
-//
-// This coverage implementation provides very limited data:
-// it only tells if a given function (block) was ever executed.
-// No counters, no per-edge data.
-// But for many use cases this is what we need and the added slowdown
-// is negligible. This simple implementation will probably be obsoleted
-// by the upcoming Clang-based coverage implementation.
-// By having it here and now we hope to
-//  a) get the functionality to users earlier and
-//  b) collect usage statistics to help improve Clang coverage design.
-bool AddressSanitizer::InjectCoverage(Function &F,
-                                      const ArrayRef<BasicBlock *> AllBlocks) {
-  if (!ClCoverage) return false;
-
-  if (ClCoverage == 1 ||
-      (unsigned)ClCoverageBlockThreshold < AllBlocks.size()) {
-    InjectCoverageAtBlock(F, F.getEntryBlock());
-  } else {
-    for (auto BB : AllBlocks)
-      InjectCoverageAtBlock(F, *BB);
-  }
-  return true;
-}
-
 bool AddressSanitizer::runOnFunction(Function &F) {
   if (&F == AsanCtorFunction) return false;
   if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
@@ -1385,7 +1317,7 @@
       if (Value *Addr =
               isInterestingMemoryAccess(&Inst, &IsWrite, &Alignment)) {
         if (ClOpt && ClOptSameTemp) {
-          if (!TempsToInstrument.insert(Addr))
+          if (!TempsToInstrument.insert(Addr).second)
             continue;  // We've seen this temp in the current BB.
         }
       } else if (ClInvalidPointerPairs &&
@@ -1459,9 +1391,6 @@
 
   bool res = NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty();
 
-  if (InjectCoverage(F, AllBlocks))
-    res = true;
-
   DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n");
 
   if (ClKeepUninstrumented) {
@@ -1499,19 +1428,21 @@
     std::string Suffix = itostr(i);
     AsanStackMallocFunc[i] = checkInterfaceFunction(
         M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
-                              IntptrTy, IntptrTy, NULL));
+                              IntptrTy, IntptrTy, nullptr));
     AsanStackFreeFunc[i] = checkInterfaceFunction(M.getOrInsertFunction(
         kAsanStackFreeNameTemplate + Suffix, IRB.getVoidTy(), IntptrTy,
-        IntptrTy, IntptrTy, NULL));
+        IntptrTy, IntptrTy, nullptr));
   }
-  AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
-  AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+  AsanPoisonStackMemoryFunc = checkInterfaceFunction(
+      M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(),
+                            IntptrTy, IntptrTy, nullptr));
+  AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(
+      M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(),
+                            IntptrTy, IntptrTy, nullptr));
 }
 
 void
-FunctionStackPoisoner::poisonRedZones(const ArrayRef<uint8_t> ShadowBytes,
+FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes,
                                       IRBuilder<> &IRB, Value *ShadowBase,
                                       bool DoPoison) {
   size_t n = ShadowBytes.size();

diff --git a/lib/Transforms/Instrumentation/Android.mk b/lib/Transforms/Instrumentation/Android.mk
index f9a55c7..1f21028 100644
--- a/lib/Transforms/Instrumentation/Android.mk
+++ b/lib/Transforms/Instrumentation/Android.mk

@@ -8,6 +8,7 @@
   GCOVProfiling.cpp \
   Instrumentation.cpp \
   MemorySanitizer.cpp \
+  SanitizerCoverage.cpp \
   ThreadSanitizer.cpp
 
 # For the host

diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 3563593..139e514 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt

@@ -6,6 +6,7 @@
   GCOVProfiling.cpp
   MemorySanitizer.cpp
   Instrumentation.cpp
+  SanitizerCoverage.cpp
   ThreadSanitizer.cpp
   )
 

diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 799e14b..c5a4860 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp

@@ -50,6 +50,8 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/InstVisitor.h"
@@ -62,7 +64,10 @@
 #include "llvm/Support/SpecialCaseList.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
 #include <iterator>
+#include <set>
+#include <utility>
 
 using namespace llvm;
 
@@ -135,11 +140,11 @@
   std::unique_ptr<SpecialCaseList> SCL;
 
  public:
-  DFSanABIList(SpecialCaseList *SCL) : SCL(SCL) {}
+  DFSanABIList(std::unique_ptr<SpecialCaseList> SCL) : SCL(std::move(SCL)) {}
 
   /// Returns whether either this function or its source file are listed in the
   /// given category.
-  bool isIn(const Function &F, const StringRef Category) const {
+  bool isIn(const Function &F, StringRef Category) const {
     return isIn(*F.getParent(), Category) ||
            SCL->inSection("fun", F.getName(), Category);
   }
@@ -148,7 +153,7 @@
   ///
   /// If GA aliases a function, the alias's name is matched as a function name
   /// would be.  Similarly, aliases of globals are matched like globals.
-  bool isIn(const GlobalAlias &GA, const StringRef Category) const {
+  bool isIn(const GlobalAlias &GA, StringRef Category) const {
     if (isIn(*GA.getParent(), Category))
       return true;
 
@@ -160,7 +165,7 @@
   }
 
   /// Returns whether this module is listed in the given category.
-  bool isIn(const Module &M, const StringRef Category) const {
+  bool isIn(const Module &M, StringRef Category) const {
     return SCL->inSection("src", M.getModuleIdentifier(), Category);
   }
 };
@@ -229,18 +234,21 @@
   FunctionType *DFSanUnimplementedFnTy;
   FunctionType *DFSanSetLabelFnTy;
   FunctionType *DFSanNonzeroLabelFnTy;
+  FunctionType *DFSanVarargWrapperFnTy;
   Constant *DFSanUnionFn;
+  Constant *DFSanCheckedUnionFn;
   Constant *DFSanUnionLoadFn;
   Constant *DFSanUnimplementedFn;
   Constant *DFSanSetLabelFn;
   Constant *DFSanNonzeroLabelFn;
+  Constant *DFSanVarargWrapperFn;
   MDNode *ColdCallWeights;
   DFSanABIList ABIList;
   DenseMap<Value *, Function *> UnwrappedFnMap;
   AttributeSet ReadOnlyNoneAttrs;
+  DenseMap<const Function *, DISubprogram> FunctionDIs;
 
   Value *getShadowAddress(Value *Addr, Instruction *Pos);
-  Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
   bool isInstrumented(const Function *F);
   bool isInstrumented(const GlobalAlias *GA);
   FunctionType *getArgsFunctionType(FunctionType *T);
@@ -266,6 +274,7 @@
 struct DFSanFunction {
   DataFlowSanitizer &DFS;
   Function *F;
+  DominatorTree DT;
   DataFlowSanitizer::InstrumentedABI IA;
   bool IsNativeABI;
   Value *ArgTLSPtr;
@@ -275,17 +284,32 @@
   DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
   std::vector<std::pair<PHINode *, PHINode *> > PHIFixups;
   DenseSet<Instruction *> SkipInsts;
-  DenseSet<Value *> NonZeroChecks;
+  std::vector<Value *> NonZeroChecks;
+  bool AvoidNewBlocks;
+
+  struct CachedCombinedShadow {
+    BasicBlock *Block;
+    Value *Shadow;
+  };
+  DenseMap<std::pair<Value *, Value *>, CachedCombinedShadow>
+      CachedCombinedShadows;
+  DenseMap<Value *, std::set<Value *>> ShadowElements;
 
   DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
       : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()),
         IsNativeABI(IsNativeABI), ArgTLSPtr(nullptr), RetvalTLSPtr(nullptr),
-        LabelReturnAlloca(nullptr) {}
+        LabelReturnAlloca(nullptr) {
+    DT.recalculate(*F);
+    // FIXME: Need to track down the register allocator issue which causes poor
+    // performance in pathological cases with large numbers of basic blocks.
+    AvoidNewBlocks = F->size() > 1000;
+  }
   Value *getArgTLSPtr();
   Value *getArgTLS(unsigned Index, Instruction *Pos);
   Value *getRetvalTLS();
   Value *getShadow(Value *V);
   void setShadow(Instruction *I, Value *Shadow);
+  Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
   Value *combineOperandShadows(Instruction *Inst);
   Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
                     Instruction *Pos);
@@ -367,7 +391,6 @@
 }
 
 FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
-  assert(!T->isVarArg());
   llvm::SmallVector<Type *, 4> ArgTypes;
   for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end();
        i != e; ++i) {
@@ -382,10 +405,12 @@
   }
   for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
     ArgTypes.push_back(ShadowTy);
+  if (T->isVarArg())
+    ArgTypes.push_back(ShadowPtrTy);
   Type *RetType = T->getReturnType();
   if (!RetType->isVoidTy())
     ArgTypes.push_back(ShadowPtrTy);
-  return FunctionType::get(T->getReturnType(), ArgTypes, false);
+  return FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg());
 }
 
 bool DataFlowSanitizer::doInitialization(Module &M) {
@@ -415,7 +440,9 @@
   DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
                                         DFSanSetLabelArgs, /*isVarArg=*/false);
   DFSanNonzeroLabelFnTy = FunctionType::get(
-      Type::getVoidTy(*Ctx), ArrayRef<Type *>(), /*isVarArg=*/false);
+      Type::getVoidTy(*Ctx), None, /*isVarArg=*/false);
+  DFSanVarargWrapperFnTy = FunctionType::get(
+      Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
 
   if (GetArgTLSPtr) {
     Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
@@ -495,15 +522,26 @@
                                        AttributeSet::ReturnIndex));
 
   BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
-  std::vector<Value *> Args;
-  unsigned n = FT->getNumParams();
-  for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
-    Args.push_back(&*ai);
-  CallInst *CI = CallInst::Create(F, Args, "", BB);
-  if (FT->getReturnType()->isVoidTy())
-    ReturnInst::Create(*Ctx, BB);
-  else
-    ReturnInst::Create(*Ctx, CI, BB);
+  if (F->isVarArg()) {
+    NewF->removeAttributes(
+        AttributeSet::FunctionIndex,
+        AttributeSet().addAttribute(*Ctx, AttributeSet::FunctionIndex,
+                                    "split-stack"));
+    CallInst::Create(DFSanVarargWrapperFn,
+                     IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
+                     BB);
+    new UnreachableInst(*Ctx, BB);
+  } else {
+    std::vector<Value *> Args;
+    unsigned n = FT->getNumParams();
+    for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
+      Args.push_back(&*ai);
+    CallInst *CI = CallInst::Create(F, Args, "", BB);
+    if (FT->getReturnType()->isVoidTy())
+      ReturnInst::Create(*Ctx, BB);
+    else
+      ReturnInst::Create(*Ctx, CI, BB);
+  }
 
   return NewF;
 }
@@ -548,6 +586,8 @@
   if (ABIList.isIn(M, "skip"))
     return false;
 
+  FunctionDIs = makeSubprogramMap(M);
+
   if (!GetArgTLSPtr) {
     Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
     ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
@@ -562,6 +602,15 @@
 
   DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);
   if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {
+    F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
+    F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+    F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    F->addAttribute(1, Attribute::ZExt);
+    F->addAttribute(2, Attribute::ZExt);
+  }
+  DFSanCheckedUnionFn = Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy);
+  if (Function *F = dyn_cast<Function>(DFSanCheckedUnionFn)) {
+    F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
     F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
     F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
     F->addAttribute(1, Attribute::ZExt);
@@ -570,6 +619,7 @@
   DFSanUnionLoadFn =
       Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy);
   if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) {
+    F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
     F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly);
     F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
   }
@@ -582,16 +632,20 @@
   }
   DFSanNonzeroLabelFn =
       Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
+  DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper",
+                                                  DFSanVarargWrapperFnTy);
 
   std::vector<Function *> FnsToInstrument;
   llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI;
   for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
     if (!i->isIntrinsic() &&
         i != DFSanUnionFn &&
+        i != DFSanCheckedUnionFn &&
         i != DFSanUnionLoadFn &&
         i != DFSanUnimplementedFn &&
         i != DFSanSetLabelFn &&
-        i != DFSanNonzeroLabelFn)
+        i != DFSanNonzeroLabelFn &&
+        i != DFSanVarargWrapperFn)
       FnsToInstrument.push_back(&*i);
   }
 
@@ -673,11 +727,6 @@
       } else {
         addGlobalNamePrefix(&F);
       }
-               // Hopefully, nobody will try to indirectly call a vararg
-               // function... yet.
-    } else if (FT->isVarArg()) {
-      UnwrappedFnMap[&F] = &F;
-      *i = nullptr;
     } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
       // Build a wrapper function for F.  The wrapper simply calls F, and is
       // added to FnsToInstrument so that any instrumentation according to its
@@ -694,6 +743,12 @@
       Value *WrappedFnCst =
           ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
       F.replaceAllUsesWith(WrappedFnCst);
+
+      // Patch the pointer to LLVM function in debug info descriptor.
+      auto DI = FunctionDIs.find(&F);
+      if (DI != FunctionDIs.end())
+        DI->second.replaceFunction(&F);
+
       UnwrappedFnMap[WrappedFnCst] = &F;
       *i = NewF;
 
@@ -713,6 +768,11 @@
         i = FnsToInstrument.begin() + N;
         e = FnsToInstrument.begin() + Count;
       }
+               // Hopefully, nobody will try to indirectly call a vararg
+               // function... yet.
+    } else if (FT->isVarArg()) {
+      UnwrappedFnMap[&F] = &F;
+      *i = nullptr;
     }
   }
 
@@ -771,18 +831,16 @@
     // yet).  To make our life easier, do this work in a pass after the main
     // instrumentation.
     if (ClDebugNonzeroLabels) {
-      for (DenseSet<Value *>::iterator i = DFSF.NonZeroChecks.begin(),
-                                       e = DFSF.NonZeroChecks.end();
-           i != e; ++i) {
+      for (Value *V : DFSF.NonZeroChecks) {
         Instruction *Pos;
-        if (Instruction *I = dyn_cast<Instruction>(*i))
+        if (Instruction *I = dyn_cast<Instruction>(V))
           Pos = I->getNextNode();
         else
           Pos = DFSF.F->getEntryBlock().begin();
         while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
           Pos = Pos->getNextNode();
         IRBuilder<> IRB(Pos);
-        Value *Ne = IRB.CreateICmpNE(*i, DFSF.DFS.ZeroShadow);
+        Value *Ne = IRB.CreateICmpNE(V, DFSF.DFS.ZeroShadow);
         BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
             Ne, Pos, /*Unreachable=*/false, ColdCallWeights));
         IRBuilder<> ThenIRB(BI);
@@ -847,7 +905,7 @@
         break;
       }
       }
-      NonZeroChecks.insert(Shadow);
+      NonZeroChecks.push_back(Shadow);
     } else {
       Shadow = DFS.ZeroShadow;
     }
@@ -873,30 +931,82 @@
 
 // Generates IR to compute the union of the two given shadows, inserting it
 // before Pos.  Returns the computed union Value.
-Value *DataFlowSanitizer::combineShadows(Value *V1, Value *V2,
-                                         Instruction *Pos) {
-  if (V1 == ZeroShadow)
+Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
+  if (V1 == DFS.ZeroShadow)
     return V2;
-  if (V2 == ZeroShadow)
+  if (V2 == DFS.ZeroShadow)
     return V1;
   if (V1 == V2)
     return V1;
-  IRBuilder<> IRB(Pos);
-  BasicBlock *Head = Pos->getParent();
-  Value *Ne = IRB.CreateICmpNE(V1, V2);
-  BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
-      Ne, Pos, /*Unreachable=*/false, ColdCallWeights));
-  IRBuilder<> ThenIRB(BI);
-  CallInst *Call = ThenIRB.CreateCall2(DFSanUnionFn, V1, V2);
-  Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
-  Call->addAttribute(1, Attribute::ZExt);
-  Call->addAttribute(2, Attribute::ZExt);
 
-  BasicBlock *Tail = BI->getSuccessor(0);
-  PHINode *Phi = PHINode::Create(ShadowTy, 2, "", Tail->begin());
-  Phi->addIncoming(Call, Call->getParent());
-  Phi->addIncoming(V1, Head);
-  return Phi;
+  auto V1Elems = ShadowElements.find(V1);
+  auto V2Elems = ShadowElements.find(V2);
+  if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) {
+    if (std::includes(V1Elems->second.begin(), V1Elems->second.end(),
+                      V2Elems->second.begin(), V2Elems->second.end())) {
+      return V1;
+    } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(),
+                             V1Elems->second.begin(), V1Elems->second.end())) {
+      return V2;
+    }
+  } else if (V1Elems != ShadowElements.end()) {
+    if (V1Elems->second.count(V2))
+      return V1;
+  } else if (V2Elems != ShadowElements.end()) {
+    if (V2Elems->second.count(V1))
+      return V2;
+  }
+
+  auto Key = std::make_pair(V1, V2);
+  if (V1 > V2)
+    std::swap(Key.first, Key.second);
+  CachedCombinedShadow &CCS = CachedCombinedShadows[Key];
+  if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent()))
+    return CCS.Shadow;
+
+  IRBuilder<> IRB(Pos);
+  if (AvoidNewBlocks) {
+    CallInst *Call = IRB.CreateCall2(DFS.DFSanCheckedUnionFn, V1, V2);
+    Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    Call->addAttribute(1, Attribute::ZExt);
+    Call->addAttribute(2, Attribute::ZExt);
+
+    CCS.Block = Pos->getParent();
+    CCS.Shadow = Call;
+  } else {
+    BasicBlock *Head = Pos->getParent();
+    Value *Ne = IRB.CreateICmpNE(V1, V2);
+    BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+        Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
+    IRBuilder<> ThenIRB(BI);
+    CallInst *Call = ThenIRB.CreateCall2(DFS.DFSanUnionFn, V1, V2);
+    Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    Call->addAttribute(1, Attribute::ZExt);
+    Call->addAttribute(2, Attribute::ZExt);
+
+    BasicBlock *Tail = BI->getSuccessor(0);
+    PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", Tail->begin());
+    Phi->addIncoming(Call, Call->getParent());
+    Phi->addIncoming(V1, Head);
+
+    CCS.Block = Tail;
+    CCS.Shadow = Phi;
+  }
+
+  std::set<Value *> UnionElems;
+  if (V1Elems != ShadowElements.end()) {
+    UnionElems = V1Elems->second;
+  } else {
+    UnionElems.insert(V1);
+  }
+  if (V2Elems != ShadowElements.end()) {
+    UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end());
+  } else {
+    UnionElems.insert(V2);
+  }
+  ShadowElements[CCS.Shadow] = std::move(UnionElems);
+
+  return CCS.Shadow;
 }
 
 // A convenience function which folds the shadows of each of the operands
@@ -908,7 +1018,7 @@
 
   Value *Shadow = getShadow(Inst->getOperand(0));
   for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
-    Shadow = DFS.combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
+    Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
   }
   return Shadow;
 }
@@ -961,12 +1071,11 @@
     IRBuilder<> IRB(Pos);
     Value *ShadowAddr1 =
         IRB.CreateGEP(ShadowAddr, ConstantInt::get(DFS.IntptrTy, 1));
-    return DFS.combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign),
-                              IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign),
-                              Pos);
+    return combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign),
+                          IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), Pos);
   }
   }
-  if (Size % (64 / DFS.ShadowWidth) == 0) {
+  if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidth) == 0) {
     // Fast path for the common case where each byte has identical shadow: load
     // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
     // shadow is non-equal.
@@ -990,16 +1099,27 @@
 
     BasicBlock *Head = Pos->getParent();
     BasicBlock *Tail = Head->splitBasicBlock(Pos);
+
+    if (DomTreeNode *OldNode = DT.getNode(Head)) {
+      std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+      DomTreeNode *NewNode = DT.addNewBlock(Tail, Head);
+      for (auto Child : Children)
+        DT.changeImmediateDominator(Child, NewNode);
+    }
+
     // In the following code LastBr will refer to the previous basic block's
     // conditional branch instruction, whose true successor is fixed up to point
     // to the next block during the loop below or to the tail after the final
     // iteration.
     BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
     ReplaceInstWithInst(Head->getTerminator(), LastBr);
+    DT.addNewBlock(FallbackBB, Head);
 
     for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size;
          Ofs += 64 / DFS.ShadowWidth) {
       BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
+      DT.addNewBlock(NextBB, LastBr->getParent());
       IRBuilder<> NextIRB(NextBB);
       WideAddr = NextIRB.CreateGEP(WideAddr, ConstantInt::get(DFS.IntptrTy, 1));
       Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign);
@@ -1025,6 +1145,11 @@
 
 void DFSanVisitor::visitLoadInst(LoadInst &LI) {
   uint64_t Size = DFSF.DFS.DL->getTypeStoreSize(LI.getType());
+  if (Size == 0) {
+    DFSF.setShadow(&LI, DFSF.DFS.ZeroShadow);
+    return;
+  }
+
   uint64_t Align;
   if (ClPreserveAlignment) {
     Align = LI.getAlignment();
@@ -1037,10 +1162,10 @@
   Value *Shadow = DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI);
   if (ClCombinePointerLabelsOnLoad) {
     Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
-    Shadow = DFSF.DFS.combineShadows(Shadow, PtrShadow, &LI);
+    Shadow = DFSF.combineShadows(Shadow, PtrShadow, &LI);
   }
   if (Shadow != DFSF.DFS.ZeroShadow)
-    DFSF.NonZeroChecks.insert(Shadow);
+    DFSF.NonZeroChecks.push_back(Shadow);
 
   DFSF.setShadow(&LI, Shadow);
 }
@@ -1099,6 +1224,9 @@
 void DFSanVisitor::visitStoreInst(StoreInst &SI) {
   uint64_t Size =
       DFSF.DFS.DL->getTypeStoreSize(SI.getValueOperand()->getType());
+  if (Size == 0)
+    return;
+
   uint64_t Align;
   if (ClPreserveAlignment) {
     Align = SI.getAlignment();
@@ -1111,7 +1239,7 @@
   Value* Shadow = DFSF.getShadow(SI.getValueOperand());
   if (ClCombinePointerLabelsOnStore) {
     Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand());
-    Shadow = DFSF.DFS.combineShadows(Shadow, PtrShadow, &SI);
+    Shadow = DFSF.combineShadows(Shadow, PtrShadow, &SI);
   }
   DFSF.storeShadow(SI.getPointerOperand(), Size, Align, Shadow, &SI);
 }
@@ -1176,9 +1304,9 @@
 
   if (isa<VectorType>(I.getCondition()->getType())) {
     DFSF.setShadow(
-        &I, DFSF.DFS.combineShadows(
-                CondShadow,
-                DFSF.DFS.combineShadows(TrueShadow, FalseShadow, &I), &I));
+        &I,
+        DFSF.combineShadows(
+            CondShadow, DFSF.combineShadows(TrueShadow, FalseShadow, &I), &I));
   } else {
     Value *ShadowSel;
     if (TrueShadow == FalseShadow) {
@@ -1187,7 +1315,7 @@
       ShadowSel =
           SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
     }
-    DFSF.setShadow(&I, DFSF.DFS.combineShadows(CondShadow, ShadowSel, &I));
+    DFSF.setShadow(&I, DFSF.combineShadows(CondShadow, ShadowSel, &I));
   }
 }
 
@@ -1253,6 +1381,15 @@
     return;
   }
 
+  // Calls to this function are synthesized in wrappers, and we shouldn't
+  // instrument them.
+  if (F == DFSF.DFS.DFSanVarargWrapperFn)
+    return;
+
+  assert(!(cast<FunctionType>(
+      CS.getCalledValue()->getType()->getPointerElementType())->isVarArg() &&
+           dyn_cast<InvokeInst>(CS.getInstruction())));
+
   IRBuilder<> IRB(CS.getInstruction());
 
   DenseMap<Value *, Function *>::iterator i =
@@ -1324,6 +1461,20 @@
         for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
           Args.push_back(DFSF.getShadow(*i));
 
+        if (FT->isVarArg()) {
+          auto LabelVAAlloca =
+              new AllocaInst(ArrayType::get(DFSF.DFS.ShadowTy,
+                                            CS.arg_size() - FT->getNumParams()),
+                             "labelva", DFSF.F->getEntryBlock().begin());
+
+          for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) {
+            auto LabelVAPtr = IRB.CreateStructGEP(LabelVAAlloca, n);
+            IRB.CreateStore(DFSF.getShadow(*i), LabelVAPtr);
+          }
+
+          Args.push_back(IRB.CreateStructGEP(LabelVAAlloca, 0));
+        }
+
         if (!FT->getReturnType()->isVoidTy()) {
           if (!DFSF.LabelReturnAlloca) {
             DFSF.LabelReturnAlloca =
@@ -1333,6 +1484,9 @@
           Args.push_back(DFSF.LabelReturnAlloca);
         }
 
+        for (i = CS.arg_begin() + FT->getNumParams(); i != CS.arg_end(); ++i)
+          Args.push_back(*i);
+
         CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
         CustomCI->setCallingConv(CI->getCallingConv());
         CustomCI->setAttributes(CI->getAttributes());
@@ -1379,7 +1533,7 @@
       LoadInst *LI = NextIRB.CreateLoad(DFSF.getRetvalTLS());
       DFSF.SkipInsts.insert(LI);
       DFSF.setShadow(CS.getInstruction(), LI);
-      DFSF.NonZeroChecks.insert(LI);
+      DFSF.NonZeroChecks.push_back(LI);
     }
   }
 
@@ -1433,7 +1587,7 @@
           ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next);
       DFSF.SkipInsts.insert(ExShadow);
       DFSF.setShadow(ExVal, ExShadow);
-      DFSF.NonZeroChecks.insert(ExShadow);
+      DFSF.NonZeroChecks.push_back(ExShadow);
 
       CS.getInstruction()->replaceAllUsesWith(ExVal);
     }

diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp
index f2f1738..5234341 100644
--- a/lib/Transforms/Instrumentation/DebugIR.cpp
+++ b/lib/Transforms/Instrumentation/DebugIR.cpp

@@ -396,7 +396,7 @@
         Elements.push_back(getOrCreateType(T->getStructElementType(i)));
 
       // set struct elements
-      StructDescriptor.setTypeArray(Builder.getOrCreateArray(Elements));
+      StructDescriptor.setArrays(Builder.getOrCreateArray(Elements));
     } else if (T->isPointerTy()) {
       Type *PointeeTy = T->getPointerElementType();
       if (!(N = getType(PointeeTy)))
@@ -440,7 +440,7 @@
       Params.push_back(getOrCreateType(T));
     }
 
-    DIArray ParamArray = Builder.getOrCreateArray(Params);
+    DITypeArray ParamArray = Builder.getOrCreateTypeArray(Params);
     return Builder.createSubroutineType(DIFile(FileNode), ParamArray);
   }
 
@@ -525,11 +525,11 @@
 
 void DebugIR::writeDebugBitcode(const Module *M, int *fd) {
   std::unique_ptr<raw_fd_ostream> Out;
-  std::string error;
+  std::error_code EC;
 
   if (!fd) {
     std::string Path = getPath();
-    Out.reset(new raw_fd_ostream(Path.c_str(), error, sys::fs::F_Text));
+    Out.reset(new raw_fd_ostream(Path, EC, sys::fs::F_Text));
     DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to file "
                  << Path << "\n");
   } else {

diff --git a/lib/Transforms/Instrumentation/DebugIR.h b/lib/Transforms/Instrumentation/DebugIR.h
index 02831ed..8d74a4d 100644
--- a/lib/Transforms/Instrumentation/DebugIR.h
+++ b/lib/Transforms/Instrumentation/DebugIR.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
-#define LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
+#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
+#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
 
 #include "llvm/Pass.h"
 
@@ -95,4 +95,4 @@
 
 } // llvm namespace
 
-#endif // LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
+#endif

diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index cfeb62e..220d7f8 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp

@@ -480,12 +480,12 @@
     // LTO, we'll generate the same .gcno files.
 
     DICompileUnit CU(CU_Nodes->getOperand(i));
-    std::string ErrorInfo;
-    raw_fd_ostream out(mangleName(CU, "gcno").c_str(), ErrorInfo,
-                       sys::fs::F_None);
+    std::error_code EC;
+    raw_fd_ostream out(mangleName(CU, "gcno"), EC, sys::fs::F_None);
     std::string EdgeDestinations;
 
     DIArray SPs = CU.getSubprograms();
+    unsigned FunctionIdent = 0;
     for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
       DISubprogram SP(SPs.getElement(i));
       assert((!SP || SP.isSubprogram()) &&
@@ -505,8 +505,8 @@
         ++It;
       EntryBlock.splitBasicBlock(It);
 
-      Funcs.push_back(
-          make_unique<GCOVFunction>(SP, &out, i, Options.UseCfgChecksum));
+      Funcs.push_back(make_unique<GCOVFunction>(SP, &out, FunctionIdent++,
+                                                Options.UseCfgChecksum));
       GCOVFunction &Func = *Funcs.back();
 
       for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
@@ -738,11 +738,11 @@
     Edge += Successors;
   }
 
-  ArrayRef<Constant*> V(&EdgeTable[0], TableSize);
   GlobalVariable *EdgeTableGV =
       new GlobalVariable(
           *M, EdgeTableTy, true, GlobalValue::InternalLinkage,
-          ConstantArray::get(EdgeTableTy, V),
+          ConstantArray::get(EdgeTableTy,
+                             makeArrayRef(&EdgeTable[0],TableSize)),
           "__llvm_gcda_edge_table");
   EdgeTableGV->setUnnamedAddr(true);
   return EdgeTableGV;

diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index ac1dd43..8e95367 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp

@@ -27,6 +27,7 @@
   initializeGCOVProfilerPass(Registry);
   initializeMemorySanitizerPass(Registry);
   initializeThreadSanitizerPass(Registry);
+  initializeSanitizerCoverageModulePass(Registry);
   initializeDataFlowSanitizerPass(Registry);
 }
 

diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 496ab48..1261259 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp

@@ -127,6 +127,10 @@
 static const unsigned kMinOriginAlignment = 4;
 static const unsigned kShadowTLSAlignment = 8;
 
+// These constants must be kept in sync with the ones in msan.h.
+static const unsigned kParamTLSSize = 800;
+static const unsigned kRetvalTLSSize = 800;
+
 // Accesses sizes are powers of two: 1, 2, 4, 8.
 static const size_t kNumberOfAccessSizes = 4;
 
@@ -195,6 +199,13 @@
        cl::desc("Do not wrap indirect calls with target in the same module"),
        cl::Hidden, cl::init(true));
 
+// This is an experiment to enable handling of cases where shadow is a non-zero
+// compile-time constant. For some unexplainable reason they were silently
+// ignored in the instrumentation.
+static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow",
+       cl::desc("Insert checks for constant shadow values"),
+       cl::Hidden, cl::init(false));
+
 namespace {
 
 /// \brief An instrumentation pass implementing detection of uninitialized
@@ -321,7 +332,7 @@
   // which is not yet implemented.
   StringRef WarningFnName = ClKeepGoing ? "__msan_warning"
                                         : "__msan_warning_noreturn";
-  WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), NULL);
+  WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), nullptr);
 
   for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
        AccessSizeIndex++) {
@@ -329,34 +340,35 @@
     std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
     MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
         FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
-        IRB.getInt32Ty(), NULL);
+        IRB.getInt32Ty(), nullptr);
 
     FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
     MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
         FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
-        IRB.getInt8PtrTy(), IRB.getInt32Ty(), NULL);
+        IRB.getInt8PtrTy(), IRB.getInt32Ty(), nullptr);
   }
 
   MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
     "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
-    IRB.getInt8PtrTy(), IntptrTy, NULL);
-  MsanPoisonStackFn = M.getOrInsertFunction(
-    "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL);
+    IRB.getInt8PtrTy(), IntptrTy, nullptr);
+  MsanPoisonStackFn =
+      M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
+                            IRB.getInt8PtrTy(), IntptrTy, nullptr);
   MsanChainOriginFn = M.getOrInsertFunction(
-    "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty(), NULL);
+    "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty(), nullptr);
   MemmoveFn = M.getOrInsertFunction(
     "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-    IRB.getInt8PtrTy(), IntptrTy, NULL);
+    IRB.getInt8PtrTy(), IntptrTy, nullptr);
   MemcpyFn = M.getOrInsertFunction(
     "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-    IntptrTy, NULL);
+    IntptrTy, nullptr);
   MemsetFn = M.getOrInsertFunction(
     "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
-    IntptrTy, NULL);
+    IntptrTy, nullptr);
 
   // Create globals.
   RetvalTLS = new GlobalVariable(
-    M, ArrayType::get(IRB.getInt64Ty(), 8), false,
+    M, ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8), false,
     GlobalVariable::ExternalLinkage, nullptr, "__msan_retval_tls", nullptr,
     GlobalVariable::InitialExecTLSModel);
   RetvalOriginTLS = new GlobalVariable(
@@ -364,16 +376,16 @@
     "__msan_retval_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel);
 
   ParamTLS = new GlobalVariable(
-    M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
+    M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
     GlobalVariable::ExternalLinkage, nullptr, "__msan_param_tls", nullptr,
     GlobalVariable::InitialExecTLSModel);
   ParamOriginTLS = new GlobalVariable(
-    M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage,
-    nullptr, "__msan_param_origin_tls", nullptr,
-    GlobalVariable::InitialExecTLSModel);
+    M, ArrayType::get(OriginTy, kParamTLSSize / 4), false,
+    GlobalVariable::ExternalLinkage, nullptr, "__msan_param_origin_tls",
+    nullptr, GlobalVariable::InitialExecTLSModel);
 
   VAArgTLS = new GlobalVariable(
-    M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
+    M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false,
     GlobalVariable::ExternalLinkage, nullptr, "__msan_va_arg_tls", nullptr,
     GlobalVariable::InitialExecTLSModel);
   VAArgOverflowSizeTLS = new GlobalVariable(
@@ -393,7 +405,7 @@
     AnyFunctionPtrTy =
         PointerType::getUnqual(FunctionType::get(IRB.getVoidTy(), false));
     IndirectCallWrapperFn = M.getOrInsertFunction(
-        ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, NULL);
+        ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, nullptr);
   }
 
   if (WrapIndirectCalls && ClWrapIndirectCallsFast) {
@@ -442,7 +454,7 @@
 
   // Insert a call to __msan_init/__msan_track_origins into the module's CTORs.
   appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction(
-                      "__msan_init", IRB.getVoidTy(), NULL)), 0);
+                      "__msan_init", IRB.getVoidTy(), nullptr)), 0);
 
   if (TrackOrigins)
     new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
@@ -559,7 +571,8 @@
       // TODO(eugenis): handle non-zero constant shadow by inserting an
       // unconditional check (can not simply fail compilation as this could
       // be in the dead code).
-      if (isa<Constant>(ConvertedShadow)) return;
+      if (!ClCheckConstantShadow)
+        if (isa<Constant>(ConvertedShadow)) return;
       unsigned TypeSizeInBits =
           MS.DL->getTypeSizeInBits(ConvertedShadow->getType());
       unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
@@ -615,8 +628,9 @@
     DEBUG(dbgs() << "  SHAD0 : " << *Shadow << "\n");
     Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
     DEBUG(dbgs() << "  SHAD1 : " << *ConvertedShadow << "\n");
-    // See the comment in materializeStores().
-    if (isa<Constant>(ConvertedShadow)) return;
+    // See the comment in storeOrigin().
+    if (!ClCheckConstantShadow)
+      if (isa<Constant>(ConvertedShadow)) return;
     unsigned TypeSizeInBits =
         MS.DL->getTypeSizeInBits(ConvertedShadow->getType());
     unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
@@ -763,6 +777,10 @@
       return VectorType::get(IntegerType::get(*MS.C, EltSize),
                              VT->getNumElements());
     }
+    if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
+      return ArrayType::get(getShadowTy(AT->getElementType()),
+                            AT->getNumElements());
+    }
     if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
       SmallVector<Type*, 4> Elements;
       for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
@@ -882,11 +900,18 @@
     assert(ShadowTy);
     if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
       return Constant::getAllOnesValue(ShadowTy);
-    StructType *ST = cast<StructType>(ShadowTy);
-    SmallVector<Constant *, 4> Vals;
-    for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
-      Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
-    return ConstantStruct::get(ST, Vals);
+    if (ArrayType *AT = dyn_cast<ArrayType>(ShadowTy)) {
+      SmallVector<Constant *, 4> Vals(AT->getNumElements(),
+                                      getPoisonedShadow(AT->getElementType()));
+      return ConstantArray::get(AT, Vals);
+    }
+    if (StructType *ST = dyn_cast<StructType>(ShadowTy)) {
+      SmallVector<Constant *, 4> Vals;
+      for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+        Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
+      return ConstantStruct::get(ST, Vals);
+    }
+    llvm_unreachable("Unexpected shadow type");
   }
 
   /// \brief Create a dirty shadow for a given value.
@@ -941,6 +966,7 @@
           ? MS.DL->getTypeAllocSize(FArg.getType()->getPointerElementType())
           : MS.DL->getTypeAllocSize(FArg.getType());
         if (A == &FArg) {
+          bool Overflow = ArgOffset + Size > kParamTLSSize;
           Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
           if (FArg.hasByValAttr()) {
             // ByVal pointer itself has clean shadow. We copy the actual
@@ -951,25 +977,38 @@
               Type *EltType = A->getType()->getPointerElementType();
               ArgAlign = MS.DL->getABITypeAlignment(EltType);
             }
-            unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
-            Value *Cpy = EntryIRB.CreateMemCpy(
-                getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), Base, Size,
-                CopyAlign);
-            DEBUG(dbgs() << "  ByValCpy: " << *Cpy << "\n");
-            (void)Cpy;
+            if (Overflow) {
+              // ParamTLS overflow.
+              EntryIRB.CreateMemSet(
+                  getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB),
+                  Constant::getNullValue(EntryIRB.getInt8Ty()), Size, ArgAlign);
+            } else {
+              unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
+              Value *Cpy = EntryIRB.CreateMemCpy(
+                  getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), Base, Size,
+                  CopyAlign);
+              DEBUG(dbgs() << "  ByValCpy: " << *Cpy << "\n");
+              (void)Cpy;
+            }
             *ShadowPtr = getCleanShadow(V);
           } else {
-            *ShadowPtr = EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment);
+            if (Overflow) {
+              // ParamTLS overflow.
+              *ShadowPtr = getCleanShadow(V);
+            } else {
+              *ShadowPtr =
+                  EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment);
+            }
           }
           DEBUG(dbgs() << "  ARG:    "  << FArg << " ==> " <<
                 **ShadowPtr << "\n");
-          if (MS.TrackOrigins) {
+          if (MS.TrackOrigins && !Overflow) {
             Value *OriginPtr =
                 getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
             setOrigin(A, EntryIRB.CreateLoad(OriginPtr));
           }
         }
-        ArgOffset += DataLayout::RoundUpAlignment(Size, kShadowTLSAlignment);
+        ArgOffset += RoundUpToAlignment(Size, kShadowTLSAlignment);
       }
       assert(*ShadowPtr && "Could not find shadow for an argument");
       return *ShadowPtr;
@@ -1024,9 +1063,16 @@
   /// UMR warning in runtime if the value is not fully defined.
   void insertShadowCheck(Value *Val, Instruction *OrigIns) {
     assert(Val);
-    Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
-    if (!Shadow) return;
-    Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+    Value *Shadow, *Origin;
+    if (ClCheckConstantShadow) {
+      Shadow = getShadow(Val);
+      if (!Shadow) return;
+      Origin = getOrigin(Val);
+    } else {
+      Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+      if (!Shadow) return;
+      Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+    }
     insertShadowCheck(Shadow, Origin, OrigIns);
   }
 
@@ -1859,7 +1905,7 @@
     Value *Op = I.getArgOperand(0);
     Type *OpType = Op->getType();
     Function *BswapFunc = Intrinsic::getDeclaration(
-      F.getParent(), Intrinsic::bswap, ArrayRef<Type*>(&OpType, 1));
+      F.getParent(), Intrinsic::bswap, makeArrayRef(&OpType, 1));
     setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
     setOrigin(&I, getOrigin(Op));
   }
@@ -2313,26 +2359,32 @@
       Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
       DEBUG(dbgs() << "  Arg#" << i << ": " << *A <<
             " Shadow: " << *ArgShadow << "\n");
+      bool ArgIsInitialized = false;
       if (CS.paramHasAttr(i + 1, Attribute::ByVal)) {
         assert(A->getType()->isPointerTy() &&
                "ByVal argument is not a pointer!");
         Size = MS.DL->getTypeAllocSize(A->getType()->getPointerElementType());
-        unsigned Alignment = CS.getParamAlignment(i + 1);
+        if (ArgOffset + Size > kParamTLSSize) break;
+        unsigned ParamAlignment = CS.getParamAlignment(i + 1);
+        unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment);
         Store = IRB.CreateMemCpy(ArgShadowBase,
                                  getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB),
                                  Size, Alignment);
       } else {
         Size = MS.DL->getTypeAllocSize(A->getType());
+        if (ArgOffset + Size > kParamTLSSize) break;
         Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
                                        kShadowTLSAlignment);
+        Constant *Cst = dyn_cast<Constant>(ArgShadow);
+        if (Cst && Cst->isNullValue()) ArgIsInitialized = true;
       }
-      if (MS.TrackOrigins)
+      if (MS.TrackOrigins && !ArgIsInitialized)
         IRB.CreateStore(getOrigin(A),
                         getOriginPtrForArgument(A, IRB, ArgOffset));
       (void)Store;
       assert(Size != 0 && Store != nullptr);
       DEBUG(dbgs() << "  Param:" << *Store << "\n");
-      ArgOffset += DataLayout::RoundUpAlignment(Size, 8);
+      ArgOffset += RoundUpToAlignment(Size, 8);
     }
     DEBUG(dbgs() << "  done with call args\n");
 
@@ -2613,7 +2665,7 @@
         Type *RealTy = A->getType()->getPointerElementType();
         uint64_t ArgSize = MS.DL->getTypeAllocSize(RealTy);
         Value *Base = getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset);
-        OverflowOffset += DataLayout::RoundUpAlignment(ArgSize, 8);
+        OverflowOffset += RoundUpToAlignment(ArgSize, 8);
         IRB.CreateMemCpy(Base, MSV.getShadowPtr(A, IRB.getInt8Ty(), IRB),
                          ArgSize, kShadowTLSAlignment);
       } else {
@@ -2635,7 +2687,7 @@
           case AK_Memory:
             uint64_t ArgSize = MS.DL->getTypeAllocSize(A->getType());
             Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset);
-            OverflowOffset += DataLayout::RoundUpAlignment(ArgSize, 8);
+            OverflowOffset += RoundUpToAlignment(ArgSize, 8);
         }
         IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
       }

diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
new file mode 100644
index 0000000..f882072
--- /dev/null
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp

@@ -0,0 +1,294 @@
+//===-- SanitizerCoverage.cpp - coverage instrumentation for sanitizers ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Coverage instrumentation that works with AddressSanitizer
+// and potentially with other Sanitizers.
+//
+// We create a Guard boolean variable with the same linkage
+// as the function and inject this code into the entry block (CoverageLevel=1)
+// or all blocks (CoverageLevel>=2):
+// if (!*Guard) {
+//    __sanitizer_cov();
+//    *Guard = 1;
+// }
+// The accesses to Guard are atomic. The rest of the logic is
+// in __sanitizer_cov (it's fine to call it more than once).
+//
+// With CoverageLevel>=3 we also split critical edges, thus effectively
+// instrumenting all edges.
+//
+// CoverageLevel>=4 adds indirect call profiling implemented as a function call.
+//
+// This coverage implementation provides very limited data:
+// it only tells if a given function (block) was ever executed. No counters.
+// But for many use cases this is what we need and the added slowdown is small.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sancov"
+
+static const char *const kSanCovModuleInitName = "__sanitizer_cov_module_init";
+static const char *const kSanCovName = "__sanitizer_cov";
+static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16";
+static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter";
+static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block";
+static const char *const kSanCovModuleCtorName = "sancov.module_ctor";
+static const uint64_t    kSanCtorAndDtorPriority = 1;
+
+static cl::opt<int> ClCoverageLevel("sanitizer-coverage-level",
+       cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
+                "3: all blocks and critical edges, "
+                "4: above plus indirect calls"),
+       cl::Hidden, cl::init(0));
+
+static cl::opt<int> ClCoverageBlockThreshold(
+    "sanitizer-coverage-block-threshold",
+    cl::desc("Add coverage instrumentation only to the entry block if there "
+             "are more than this number of blocks."),
+    cl::Hidden, cl::init(1500));
+
+static cl::opt<bool>
+    ClExperimentalTracing("sanitizer-coverage-experimental-tracing",
+                          cl::desc("Experimental basic-block tracing: insert "
+                                   "callbacks at every basic block"),
+                          cl::Hidden, cl::init(false));
+
+namespace {
+
+class SanitizerCoverageModule : public ModulePass {
+ public:
+   SanitizerCoverageModule(int CoverageLevel = 0)
+       : ModulePass(ID),
+         CoverageLevel(std::max(CoverageLevel, (int)ClCoverageLevel)) {}
+  bool runOnModule(Module &M) override;
+  bool runOnFunction(Function &F);
+  static char ID;  // Pass identification, replacement for typeid
+  const char *getPassName() const override {
+    return "SanitizerCoverageModule";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DataLayoutPass>();
+  }
+
+ private:
+  void InjectCoverageForIndirectCalls(Function &F,
+                                      ArrayRef<Instruction *> IndirCalls);
+  bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks,
+                      ArrayRef<Instruction *> IndirCalls);
+  bool InjectTracing(Function &F, ArrayRef<BasicBlock *> AllBlocks);
+  void InjectCoverageAtBlock(Function &F, BasicBlock &BB);
+  Function *SanCovFunction;
+  Function *SanCovIndirCallFunction;
+  Function *SanCovModuleInit;
+  Function *SanCovTraceEnter, *SanCovTraceBB;
+  Type *IntptrTy;
+  LLVMContext *C;
+
+  int CoverageLevel;
+};
+
+}  // namespace
+
+static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
+  if (Function *F = dyn_cast<Function>(FuncOrBitcast))
+     return F;
+  std::string Err;
+  raw_string_ostream Stream(Err);
+  Stream << "SanitizerCoverage interface function redefined: "
+         << *FuncOrBitcast;
+  report_fatal_error(Err);
+}
+
+bool SanitizerCoverageModule::runOnModule(Module &M) {
+  if (!CoverageLevel) return false;
+  C = &(M.getContext());
+  DataLayoutPass *DLP = &getAnalysis<DataLayoutPass>();
+  IntptrTy = Type::getIntNTy(*C, DLP->getDataLayout().getPointerSizeInBits());
+  Type *VoidTy = Type::getVoidTy(*C);
+
+  Function *CtorFunc =
+      Function::Create(FunctionType::get(VoidTy, false),
+                       GlobalValue::InternalLinkage, kSanCovModuleCtorName, &M);
+  ReturnInst::Create(*C, BasicBlock::Create(*C, "", CtorFunc));
+  appendToGlobalCtors(M, CtorFunc, kSanCtorAndDtorPriority);
+
+  SanCovFunction =
+      checkInterfaceFunction(M.getOrInsertFunction(kSanCovName, VoidTy, nullptr));
+  SanCovIndirCallFunction = checkInterfaceFunction(M.getOrInsertFunction(
+      kSanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr));
+  SanCovModuleInit = checkInterfaceFunction(M.getOrInsertFunction(
+      kSanCovModuleInitName, Type::getVoidTy(*C), IntptrTy, nullptr));
+  SanCovModuleInit->setLinkage(Function::ExternalLinkage);
+
+  if (ClExperimentalTracing) {
+    SanCovTraceEnter = checkInterfaceFunction(
+        M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, IntptrTy, nullptr));
+    SanCovTraceBB = checkInterfaceFunction(
+        M.getOrInsertFunction(kSanCovTraceBB, VoidTy, IntptrTy, nullptr));
+  }
+
+  for (auto &F : M)
+    runOnFunction(F);
+
+  IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
+  IRB.CreateCall(SanCovModuleInit,
+                 ConstantInt::get(IntptrTy, SanCovFunction->getNumUses()));
+  return true;
+}
+
+bool SanitizerCoverageModule::runOnFunction(Function &F) {
+  if (F.empty()) return false;
+  // For now instrument only functions that will also be asan-instrumented.
+  if (!F.hasFnAttribute(Attribute::SanitizeAddress))
+    return false;
+  if (CoverageLevel >= 3)
+    SplitAllCriticalEdges(F, this);
+  SmallVector<Instruction*, 8> IndirCalls;
+  SmallVector<BasicBlock*, 16> AllBlocks;
+  for (auto &BB : F) {
+    AllBlocks.push_back(&BB);
+    if (CoverageLevel >= 4)
+      for (auto &Inst : BB) {
+        CallSite CS(&Inst);
+        if (CS && !CS.getCalledFunction())
+          IndirCalls.push_back(&Inst);
+      }
+  }
+  InjectCoverage(F, AllBlocks, IndirCalls);
+  InjectTracing(F, AllBlocks);
+  return true;
+}
+
+// Experimental support for tracing.
+// Basically, insert a callback at the beginning of every basic block.
+// Every callback gets a pointer to a unique global for internal storage.
+bool SanitizerCoverageModule::InjectTracing(Function &F,
+                                            ArrayRef<BasicBlock *> AllBlocks) {
+  if (!ClExperimentalTracing) return false;
+  Type *Ty = ArrayType::get(IntptrTy, 1);  // May need to use more words later.
+  for (auto BB : AllBlocks) {
+    IRBuilder<> IRB(BB->getFirstInsertionPt());
+    GlobalVariable *TraceCache = new GlobalVariable(
+        *F.getParent(), Ty, false, GlobalValue::PrivateLinkage,
+        Constant::getNullValue(Ty), "__sancov_gen_trace_cache");
+    IRB.CreateCall(&F.getEntryBlock() == BB ? SanCovTraceEnter : SanCovTraceBB,
+                   IRB.CreatePointerCast(TraceCache, IntptrTy));
+  }
+  return true;
+}
+
+bool
+SanitizerCoverageModule::InjectCoverage(Function &F,
+                                        ArrayRef<BasicBlock *> AllBlocks,
+                                        ArrayRef<Instruction *> IndirCalls) {
+  if (!CoverageLevel) return false;
+
+  if (CoverageLevel == 1 ||
+      (unsigned)ClCoverageBlockThreshold < AllBlocks.size()) {
+    InjectCoverageAtBlock(F, F.getEntryBlock());
+  } else {
+    for (auto BB : AllBlocks)
+      InjectCoverageAtBlock(F, *BB);
+  }
+  InjectCoverageForIndirectCalls(F, IndirCalls);
+  return true;
+}
+
+// On every indirect call we call a run-time function
+// __sanitizer_cov_indir_call* with two parameters:
+//   - callee address,
+//   - global cache array that contains kCacheSize pointers (zero-initialized).
+//     The cache is used to speed up recording the caller-callee pairs.
+// The address of the caller is passed implicitly via caller PC.
+// kCacheSize is encoded in the name of the run-time function.
+void SanitizerCoverageModule::InjectCoverageForIndirectCalls(
+    Function &F, ArrayRef<Instruction *> IndirCalls) {
+  if (IndirCalls.empty()) return;
+  const int kCacheSize = 16;
+  const int kCacheAlignment = 64;  // Align for better performance.
+  Type *Ty = ArrayType::get(IntptrTy, kCacheSize);
+  for (auto I : IndirCalls) {
+    IRBuilder<> IRB(I);
+    CallSite CS(I);
+    Value *Callee = CS.getCalledValue();
+    if (dyn_cast<InlineAsm>(Callee)) continue;
+    GlobalVariable *CalleeCache = new GlobalVariable(
+        *F.getParent(), Ty, false, GlobalValue::PrivateLinkage,
+        Constant::getNullValue(Ty), "__sancov_gen_callee_cache");
+    CalleeCache->setAlignment(kCacheAlignment);
+    IRB.CreateCall2(SanCovIndirCallFunction,
+                    IRB.CreatePointerCast(Callee, IntptrTy),
+                    IRB.CreatePointerCast(CalleeCache, IntptrTy));
+  }
+}
+
+void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F,
+                                                    BasicBlock &BB) {
+  BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end();
+  // Skip static allocas at the top of the entry block so they don't become
+  // dynamic when we split the block.  If we used our optimized stack layout,
+  // then there will only be one alloca and it will come first.
+  for (; IP != BE; ++IP) {
+    AllocaInst *AI = dyn_cast<AllocaInst>(IP);
+    if (!AI || !AI->isStaticAlloca())
+      break;
+  }
+
+  DebugLoc EntryLoc = &BB == &F.getEntryBlock()
+                          ? IP->getDebugLoc().getFnDebugLoc(*C)
+                          : IP->getDebugLoc();
+  IRBuilder<> IRB(IP);
+  IRB.SetCurrentDebugLocation(EntryLoc);
+  Type *Int8Ty = IRB.getInt8Ty();
+  GlobalVariable *Guard = new GlobalVariable(
+      *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage,
+      Constant::getNullValue(Int8Ty), "__sancov_gen_cov_" + F.getName());
+  LoadInst *Load = IRB.CreateLoad(Guard);
+  Load->setAtomic(Monotonic);
+  Load->setAlignment(1);
+  Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load);
+  Instruction *Ins = SplitBlockAndInsertIfThen(
+      Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
+  IRB.SetInsertPoint(Ins);
+  IRB.SetCurrentDebugLocation(EntryLoc);
+  // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC.
+  IRB.CreateCall(SanCovFunction);
+  StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard);
+  Store->setAtomic(Monotonic);
+  Store->setAlignment(1);
+}
+
+char SanitizerCoverageModule::ID = 0;
+INITIALIZE_PASS(SanitizerCoverageModule, "sancov",
+    "SanitizerCoverage: TODO."
+    "ModulePass", false, false)
+ModulePass *llvm::createSanitizerCoverageModulePass(int CoverageLevel) {
+  return new SanitizerCoverageModule(CoverageLevel);
+}

diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 89386a6..8a56a1f 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp

@@ -135,33 +135,33 @@
   IRBuilder<> IRB(M.getContext());
   // Initialize the callbacks.
   TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+      "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
   TsanFuncExit = checkInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_func_exit", IRB.getVoidTy(), NULL));
+      "__tsan_func_exit", IRB.getVoidTy(), nullptr));
   OrdTy = IRB.getInt32Ty();
   for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
     const size_t ByteSize = 1 << i;
     const size_t BitSize = ByteSize * 8;
     SmallString<32> ReadName("__tsan_read" + itostr(ByteSize));
     TsanRead[i] = checkInterfaceFunction(M.getOrInsertFunction(
-        ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+        ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
 
     SmallString<32> WriteName("__tsan_write" + itostr(ByteSize));
     TsanWrite[i] = checkInterfaceFunction(M.getOrInsertFunction(
-        WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+        WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
 
     Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
     Type *PtrTy = Ty->getPointerTo();
     SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) +
                                    "_load");
     TsanAtomicLoad[i] = checkInterfaceFunction(M.getOrInsertFunction(
-        AtomicLoadName, Ty, PtrTy, OrdTy, NULL));
+        AtomicLoadName, Ty, PtrTy, OrdTy, nullptr));
 
     SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) +
                                     "_store");
     TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction(
         AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy,
-        NULL));
+        nullptr));
 
     for (int op = AtomicRMWInst::FIRST_BINOP;
         op <= AtomicRMWInst::LAST_BINOP; ++op) {
@@ -185,33 +185,33 @@
         continue;
       SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
       TsanAtomicRMW[op][i] = checkInterfaceFunction(M.getOrInsertFunction(
-          RMWName, Ty, PtrTy, Ty, OrdTy, NULL));
+          RMWName, Ty, PtrTy, Ty, OrdTy, nullptr));
     }
 
     SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) +
                                   "_compare_exchange_val");
     TsanAtomicCAS[i] = checkInterfaceFunction(M.getOrInsertFunction(
-        AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, NULL));
+        AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr));
   }
   TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction(
       "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(),
-      IRB.getInt8PtrTy(), NULL));
+      IRB.getInt8PtrTy(), nullptr));
   TsanVptrLoad = checkInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_vptr_read", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+      "__tsan_vptr_read", IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
   TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL));
+      "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, nullptr));
   TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL));
+      "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, nullptr));
 
   MemmoveFn = checkInterfaceFunction(M.getOrInsertFunction(
     "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-    IRB.getInt8PtrTy(), IntptrTy, NULL));
+    IRB.getInt8PtrTy(), IntptrTy, nullptr));
   MemcpyFn = checkInterfaceFunction(M.getOrInsertFunction(
     "memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-    IntptrTy, NULL));
+    IntptrTy, nullptr));
   MemsetFn = checkInterfaceFunction(M.getOrInsertFunction(
     "memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
-    IntptrTy, NULL));
+    IntptrTy, nullptr));
 }
 
 bool ThreadSanitizer::doInitialization(Module &M) {
@@ -224,7 +224,7 @@
   IRBuilder<> IRB(M.getContext());
   IntptrTy = IRB.getIntPtrTy(DL);
   Value *TsanInit = M.getOrInsertFunction("__tsan_init",
-                                          IRB.getVoidTy(), NULL);
+                                          IRB.getVoidTy(), nullptr);
   appendToGlobalCtors(M, cast<Function>(TsanInit), 0);
 
   return true;
@@ -481,8 +481,7 @@
     Type *PtrTy = Ty->getPointerTo();
     Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
                      createOrdering(&IRB, LI->getOrdering())};
-    CallInst *C = CallInst::Create(TsanAtomicLoad[Idx],
-                                   ArrayRef<Value*>(Args));
+    CallInst *C = CallInst::Create(TsanAtomicLoad[Idx], Args);
     ReplaceInstWithInst(I, C);
 
   } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
@@ -497,8 +496,7 @@
     Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
                      IRB.CreateIntCast(SI->getValueOperand(), Ty, false),
                      createOrdering(&IRB, SI->getOrdering())};
-    CallInst *C = CallInst::Create(TsanAtomicStore[Idx],
-                                   ArrayRef<Value*>(Args));
+    CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
     ReplaceInstWithInst(I, C);
   } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
     Value *Addr = RMWI->getPointerOperand();
@@ -515,7 +513,7 @@
     Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
                      IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
                      createOrdering(&IRB, RMWI->getOrdering())};
-    CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+    CallInst *C = CallInst::Create(F, Args);
     ReplaceInstWithInst(I, C);
   } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
     Value *Addr = CASI->getPointerOperand();
@@ -543,7 +541,7 @@
     Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
     Function *F = FI->getSynchScope() == SingleThread ?
         TsanAtomicSignalFence : TsanAtomicThreadFence;
-    CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+    CallInst *C = CallInst::Create(F, Args);
     ReplaceInstWithInst(I, C);
   }
   return true;

diff --git a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index 4098428..e286dbc 100644
--- a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h

@@ -19,8 +19,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_SCALAR_ARCRUNTIMEENTRYPOINTS_H
-#define LLVM_TRANSFORMS_SCALAR_ARCRUNTIMEENTRYPOINTS_H
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
 
 #include "ObjCARC.h"
 
@@ -183,4 +183,4 @@
 } // namespace objcarc
 } // namespace llvm
 
-#endif // LLVM_TRANSFORMS_SCALAR_ARCRUNTIMEENTRYPOINTS_H
+#endif

diff --git a/lib/Transforms/ObjCARC/Android.mk b/lib/Transforms/ObjCARC/Android.mk
index 226e9e1..cf45a95 100644
--- a/lib/Transforms/ObjCARC/Android.mk
+++ b/lib/Transforms/ObjCARC/Android.mk

@@ -9,7 +9,8 @@
   ObjCARCExpand.cpp \
   ObjCARCOpts.cpp \
   ObjCARCUtil.cpp \
-  ProvenanceAnalysis.cpp
+  ProvenanceAnalysis.cpp \
+  ProvenanceAnalysisEvaluator.cpp
 
 # For the host
 # =====================================================

diff --git a/lib/Transforms/ObjCARC/CMakeLists.txt b/lib/Transforms/ObjCARC/CMakeLists.txt
index 233deb3..b449fac 100644
--- a/lib/Transforms/ObjCARC/CMakeLists.txt
+++ b/lib/Transforms/ObjCARC/CMakeLists.txt

@@ -8,6 +8,7 @@
   ObjCARCContract.cpp
   DependencyAnalysis.cpp
   ProvenanceAnalysis.cpp
+  ProvenanceAnalysisEvaluator.cpp
   )
 
 add_dependencies(LLVMObjCARCOpts intrinsics_gen)

diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index 08c8842..f6c236c 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp

@@ -206,8 +206,8 @@
 llvm::objcarc::FindDependencies(DependenceKind Flavor,
                                 const Value *Arg,
                                 BasicBlock *StartBB, Instruction *StartInst,
-                                SmallPtrSet<Instruction *, 4> &DependingInsts,
-                                SmallPtrSet<const BasicBlock *, 4> &Visited,
+                                SmallPtrSetImpl<Instruction *> &DependingInsts,
+                                SmallPtrSetImpl<const BasicBlock *> &Visited,
                                 ProvenanceAnalysis &PA) {
   BasicBlock::iterator StartPos = StartInst;
 
@@ -229,7 +229,7 @@
           // Add the predecessors to the worklist.
           do {
             BasicBlock *PredBB = *PI;
-            if (Visited.insert(PredBB))
+            if (Visited.insert(PredBB).second)
               Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
           } while (++PI != PE);
         break;
@@ -246,9 +246,7 @@
   // Determine whether the original StartBB post-dominates all of the blocks we
   // visited. If not, insert a sentinal indicating that most optimizations are
   // not safe.
-  for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(),
-       E = Visited.end(); I != E; ++I) {
-    const BasicBlock *BB = *I;
+  for (const BasicBlock *BB : Visited) {
     if (BB == StartBB)
       continue;
     const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());

diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.h b/lib/Transforms/ObjCARC/DependencyAnalysis.h
index 617cdf3..7b5601a 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.h

@@ -20,8 +20,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H
-#define LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
 
 #include "llvm/ADT/SmallPtrSet.h"
 
@@ -53,8 +53,8 @@
 void FindDependencies(DependenceKind Flavor,
                       const Value *Arg,
                       BasicBlock *StartBB, Instruction *StartInst,
-                      SmallPtrSet<Instruction *, 4> &DependingInstructions,
-                      SmallPtrSet<const BasicBlock *, 4> &Visited,
+                      SmallPtrSetImpl<Instruction *> &DependingInstructions,
+                      SmallPtrSetImpl<const BasicBlock *> &Visited,
                       ProvenanceAnalysis &PA);
 
 bool
@@ -76,4 +76,4 @@
 } // namespace objcarc
 } // namespace llvm
 
-#endif // LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H
+#endif

diff --git a/lib/Transforms/ObjCARC/ObjCARC.cpp b/lib/Transforms/ObjCARC/ObjCARC.cpp
index 373168e..6ea038b 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARC.cpp

@@ -42,6 +42,7 @@
   initializeObjCARCExpandPass(Registry);
   initializeObjCARCContractPass(Registry);
   initializeObjCARCOptPass(Registry);
+  initializePAEvalPass(Registry);
 }
 
 void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) {

diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h
index f71cf2b..7a7eae8 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/lib/Transforms/ObjCARC/ObjCARC.h

@@ -20,8 +20,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_SCALAR_OBJCARC_H
-#define LLVM_TRANSFORMS_SCALAR_OBJCARC_H
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
 
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -380,11 +380,15 @@
       StringRef Name = GV->getName();
       // These special variables are known to hold values which are not
       // reference-counted pointers.
-      if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") ||
-          Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") ||
-          Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") ||
-          Name.startswith("\01L_OBJC_METH_VAR_NAME_") ||
-          Name.startswith("\01l_objc_msgSend_fixup_"))
+      if (Name.startswith("\01l_objc_msgSend_fixup_"))
+        return true;
+
+      StringRef Section = GV->getSection();
+      if (Section.find("__message_refs") != StringRef::npos ||
+          Section.find("__objc_classrefs") != StringRef::npos ||
+          Section.find("__objc_superrefs") != StringRef::npos ||
+          Section.find("__objc_methname") != StringRef::npos ||
+          Section.find("__cstring") != StringRef::npos)
         return true;
     }
   }
@@ -395,4 +399,4 @@
 } // end namespace objcarc
 } // end namespace llvm
 
-#endif // LLVM_TRANSFORMS_SCALAR_OBJCARC_H
+#endif

diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
index 2c09e70..c61b6b0 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp

@@ -62,8 +62,8 @@
   const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr);
   const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr);
   AliasResult Result =
-    AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag),
-                         Location(SB, LocB.Size, LocB.TBAATag));
+    AliasAnalysis::alias(Location(SA, LocA.Size, LocA.AATags),
+                         Location(SB, LocB.Size, LocB.AATags));
   if (Result != MayAlias)
     return Result;
 
@@ -93,7 +93,7 @@
   // First, strip off no-ops, including ObjC-specific no-ops, and try making
   // a precise alias query.
   const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr);
-  if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag),
+  if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.AATags),
                                             OrLocal))
     return true;
 

diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
index 97b565b..3fcea4e 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
+++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h

@@ -20,8 +20,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
-#define LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
 
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Pass.h"
@@ -71,4 +71,4 @@
 } // namespace objcarc
 } // namespace llvm
 
-#endif // LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
+#endif

diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index f48d53d..eb325eb 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp

@@ -72,9 +72,9 @@
 
     bool ContractAutorelease(Function &F, Instruction *Autorelease,
                              InstructionClass Class,
-                             SmallPtrSet<Instruction *, 4>
+                             SmallPtrSetImpl<Instruction *>
                                &DependingInstructions,
-                             SmallPtrSet<const BasicBlock *, 4>
+                             SmallPtrSetImpl<const BasicBlock *>
                                &Visited);
 
     void ContractRelease(Instruction *Release,
@@ -150,9 +150,9 @@
 bool
 ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
                                      InstructionClass Class,
-                                     SmallPtrSet<Instruction *, 4>
+                                     SmallPtrSetImpl<Instruction *>
                                        &DependingInstructions,
-                                     SmallPtrSet<const BasicBlock *, 4>
+                                     SmallPtrSetImpl<const BasicBlock *>
                                        &Visited) {
   const Value *Arg = GetObjCArg(Autorelease);
 
@@ -508,9 +508,8 @@
   // If this function has no escaping allocas or suspicious vararg usage,
   // objc_storeStrong calls can be marked with the "tail" keyword.
   if (TailOkForStoreStrongs)
-    for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(),
-         E = StoreStrongCalls.end(); I != E; ++I)
-      (*I)->setTailCall();
+    for (CallInst *CI : StoreStrongCalls)
+      CI->setTailCall();
   StoreStrongCalls.clear();
 
   return Changed;

diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index dd4dd50..95c6674 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp

@@ -188,7 +188,7 @@
     if (isa<AllocaInst>(P))
       return true;
 
-    if (!Visited.insert(P))
+    if (!Visited.insert(P).second)
       continue;
 
     if (const SelectInst *SI = dyn_cast<const SelectInst>(P)) {
@@ -411,10 +411,8 @@
     // Merge the insert point sets. If there are any differences,
     // that makes this a partial merge.
     bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
-    for (SmallPtrSet<Instruction *, 2>::const_iterator
-         I = Other.ReverseInsertPts.begin(),
-         E = Other.ReverseInsertPts.end(); I != E; ++I)
-      Partial |= ReverseInsertPts.insert(*I);
+    for (Instruction *Inst : Other.ReverseInsertPts)
+      Partial |= ReverseInsertPts.insert(Inst).second;
     return Partial;
 }
 
@@ -887,8 +885,7 @@
                                       OldSeq),
                    SequenceToMDString(Inst->getContext(),
                                       NewSeq)};
-  Node = MDNode::get(Inst->getContext(),
-                     ArrayRef<Value*>(tmp, 3));
+  Node = MDNode::get(Inst->getContext(), tmp);
 
   Inst->setMetadata(NodeId, Node);
 }
@@ -908,8 +905,7 @@
   Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
   Type *I8XX = PointerType::getUnqual(I8X);
   Type *Params[] = {I8XX, I8XX};
-  FunctionType *FTy = FunctionType::get(Type::getVoidTy(C),
-                                        ArrayRef<Type*>(Params, 2),
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), Params,
                                         /*isVarArg=*/false);
   Constant *Callee = M->getOrInsertFunction(Name, FTy);
 
@@ -951,8 +947,7 @@
   Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
   Type *I8XX = PointerType::getUnqual(I8X);
   Type *Params[] = {I8XX, I8XX};
-  FunctionType *FTy = FunctionType::get(Type::getVoidTy(C),
-                                        ArrayRef<Type*>(Params, 2),
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), Params,
                                         /*isVarArg=*/false);
   Constant *Callee = M->getOrInsertFunction(Name, FTy);
 
@@ -2199,7 +2194,7 @@
 
     while (SuccStack.back().second != SE) {
       BasicBlock *SuccBB = *SuccStack.back().second++;
-      if (Visited.insert(SuccBB)) {
+      if (Visited.insert(SuccBB).second) {
         TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
         SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
         BBStates[CurrBB].addSucc(SuccBB);
@@ -2240,7 +2235,7 @@
       BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
       while (PredStack.back().second != PE) {
         BasicBlock *BB = *PredStack.back().second++;
-        if (Visited.insert(BB)) {
+        if (Visited.insert(BB).second) {
           PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
           goto reverse_dfs_next_succ;
         }
@@ -2299,10 +2294,7 @@
   DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
 
   // Insert the new retain and release calls.
-  for (SmallPtrSet<Instruction *, 2>::const_iterator
-       PI = ReleasesToMove.ReverseInsertPts.begin(),
-       PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
-    Instruction *InsertPt = *PI;
+  for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) {
     Value *MyArg = ArgTy == ParamTy ? Arg :
                    new BitCastInst(Arg, ParamTy, "", InsertPt);
     Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
@@ -2313,10 +2305,7 @@
     DEBUG(dbgs() << "Inserting new Retain: " << *Call << "\n"
                     "At insertion point: " << *InsertPt << "\n");
   }
-  for (SmallPtrSet<Instruction *, 2>::const_iterator
-       PI = RetainsToMove.ReverseInsertPts.begin(),
-       PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
-    Instruction *InsertPt = *PI;
+  for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) {
     Value *MyArg = ArgTy == ParamTy ? Arg :
                    new BitCastInst(Arg, ParamTy, "", InsertPt);
     Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Release);
@@ -2333,18 +2322,12 @@
   }
 
   // Delete the original retain and release calls.
-  for (SmallPtrSet<Instruction *, 2>::const_iterator
-       AI = RetainsToMove.Calls.begin(),
-       AE = RetainsToMove.Calls.end(); AI != AE; ++AI) {
-    Instruction *OrigRetain = *AI;
+  for (Instruction *OrigRetain : RetainsToMove.Calls) {
     Retains.blot(OrigRetain);
     DeadInsts.push_back(OrigRetain);
     DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
   }
-  for (SmallPtrSet<Instruction *, 2>::const_iterator
-       AI = ReleasesToMove.Calls.begin(),
-       AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) {
-    Instruction *OrigRelease = *AI;
+  for (Instruction *OrigRelease : ReleasesToMove.Calls) {
     Releases.erase(OrigRelease);
     DeadInsts.push_back(OrigRelease);
     DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
@@ -2392,10 +2375,7 @@
       KnownSafeTD &= NewRetainRRI.KnownSafe;
       MultipleOwners =
         MultipleOwners || MultiOwnersSet.count(GetObjCArg(NewRetain));
-      for (SmallPtrSet<Instruction *, 2>::const_iterator
-             LI = NewRetainRRI.Calls.begin(),
-             LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
-        Instruction *NewRetainRelease = *LI;
+      for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
         DenseMap<Value *, RRInfo>::const_iterator Jt =
           Releases.find(NewRetainRelease);
         if (Jt == Releases.end())
@@ -2410,7 +2390,7 @@
         if (!NewRetainReleaseRRI.Calls.count(NewRetain))
           return false;
 
-        if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
+        if (ReleasesToMove.Calls.insert(NewRetainRelease).second) {
 
           // If we overflow when we compute the path count, don't remove/move
           // anything.
@@ -2441,12 +2421,8 @@
 
           // Collect the optimal insertion points.
           if (!KnownSafe)
-            for (SmallPtrSet<Instruction *, 2>::const_iterator
-                   RI = NewRetainReleaseRRI.ReverseInsertPts.begin(),
-                   RE = NewRetainReleaseRRI.ReverseInsertPts.end();
-                 RI != RE; ++RI) {
-              Instruction *RIP = *RI;
-              if (ReleasesToMove.ReverseInsertPts.insert(RIP)) {
+            for (Instruction *RIP : NewRetainReleaseRRI.ReverseInsertPts) {
+              if (ReleasesToMove.ReverseInsertPts.insert(RIP).second) {
                 // If we overflow when we compute the path count, don't
                 // remove/move anything.
                 const BBState &RIPBBState = BBStates[RIP->getParent()];
@@ -2476,10 +2452,7 @@
       const RRInfo &NewReleaseRRI = It->second;
       KnownSafeBU &= NewReleaseRRI.KnownSafe;
       CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted;
-      for (SmallPtrSet<Instruction *, 2>::const_iterator
-             LI = NewReleaseRRI.Calls.begin(),
-             LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
-        Instruction *NewReleaseRetain = *LI;
+      for (Instruction *NewReleaseRetain : NewReleaseRRI.Calls) {
         MapVector<Value *, RRInfo>::const_iterator Jt =
           Retains.find(NewReleaseRetain);
         if (Jt == Retains.end())
@@ -2494,7 +2467,7 @@
         if (!NewReleaseRetainRRI.Calls.count(NewRelease))
           return false;
 
-        if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
+        if (RetainsToMove.Calls.insert(NewReleaseRetain).second) {
           // If we overflow when we compute the path count, don't remove/move
           // anything.
           const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()];
@@ -2509,12 +2482,8 @@
 
           // Collect the optimal insertion points.
           if (!KnownSafe)
-            for (SmallPtrSet<Instruction *, 2>::const_iterator
-                   RI = NewReleaseRetainRRI.ReverseInsertPts.begin(),
-                   RE = NewReleaseRetainRRI.ReverseInsertPts.end();
-                 RI != RE; ++RI) {
-              Instruction *RIP = *RI;
-              if (RetainsToMove.ReverseInsertPts.insert(RIP)) {
+            for (Instruction *RIP : NewReleaseRetainRRI.ReverseInsertPts) {
+              if (RetainsToMove.ReverseInsertPts.insert(RIP).second) {
                 // If we overflow when we compute the path count, don't
                 // remove/move anything.
                 const BBState &RIPBBState = BBStates[RIP->getParent()];
@@ -2850,8 +2819,8 @@
 /// shared pointer argument. Note that Retain need not be in BB.
 static bool
 HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain,
-                             SmallPtrSet<Instruction *, 4> &DepInsts,
-                             SmallPtrSet<const BasicBlock *, 4> &Visited,
+                             SmallPtrSetImpl<Instruction *> &DepInsts,
+                             SmallPtrSetImpl<const BasicBlock *> &Visited,
                              ProvenanceAnalysis &PA) {
   FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
                    DepInsts, Visited, PA);
@@ -2879,8 +2848,8 @@
 static CallInst *
 FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
                                   Instruction *Autorelease,
-                                  SmallPtrSet<Instruction *, 4> &DepInsts,
-                                  SmallPtrSet<const BasicBlock *, 4> &Visited,
+                                  SmallPtrSetImpl<Instruction *> &DepInsts,
+                                  SmallPtrSetImpl<const BasicBlock *> &Visited,
                                   ProvenanceAnalysis &PA) {
   FindDependencies(CanChangeRetainCount, Arg,
                    BB, Autorelease, DepInsts, Visited, PA);
@@ -2906,8 +2875,8 @@
 static CallInst *
 FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
                                        ReturnInst *Ret,
-                                       SmallPtrSet<Instruction *, 4> &DepInsts,
-                                       SmallPtrSet<const BasicBlock *, 4> &V,
+                                       SmallPtrSetImpl<Instruction *> &DepInsts,
+                                       SmallPtrSetImpl<const BasicBlock *> &V,
                                        ProvenanceAnalysis &PA) {
   FindDependencies(NeedsPositiveRetainCount, Arg,
                    BB, Ret, DepInsts, V, PA);

diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
index 22be6fd..410abfc 100644
--- a/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp

@@ -62,7 +62,7 @@
   SmallPtrSet<const Value *, 4> UniqueSrc;
   for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) {
     const Value *PV1 = A->getIncomingValue(i);
-    if (UniqueSrc.insert(PV1) && related(PV1, B))
+    if (UniqueSrc.insert(PV1).second && related(PV1, B))
       return true;
   }
 
@@ -94,7 +94,7 @@
       if (isa<PtrToIntInst>(P))
         // Assume the worst.
         return true;
-      if (Visited.insert(Ur))
+      if (Visited.insert(Ur).second)
         Worklist.push_back(Ur);
     }
   } while (!Worklist.empty());

diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index a13fb9e..7820468 100644
--- a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h

@@ -22,8 +22,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
-#define LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
 
 #include "llvm/ADT/DenseMap.h"
 
@@ -77,4 +77,4 @@
 } // end namespace objcarc
 } // end namespace llvm
 
-#endif // LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+#endif

diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
new file mode 100644
index 0000000..d836632
--- /dev/null
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp

@@ -0,0 +1,92 @@
+//===- ProvenanceAnalysisEvaluator.cpp - ObjC ARC Optimization ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProvenanceAnalysis.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
+class PAEval : public FunctionPass {
+
+public:
+  static char ID;
+  PAEval();
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+};
+}
+
+char PAEval::ID = 0;
+PAEval::PAEval() : FunctionPass(ID) {}
+
+void PAEval::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AliasAnalysis>();
+}
+
+// Returns V's name with a single leading '\1' character removed, if present.
+static StringRef getName(Value *V) {
+  StringRef Raw = V->getName();
+  const bool HasMarker = Raw.startswith("\1");
+  return HasMarker ? Raw.substr(1) : Raw;
+}
+
+// Adds V to Values; values without a name are ignored.
+static void insertIfNamed(SetVector<Value *> &Values, Value *V) {
+  if (V->hasName())
+    Values.insert(V);
+}
+
+bool PAEval::runOnFunction(Function &F) {
+  // Collect every named argument, instruction and operand of F. SetVector
+  // drops duplicates while keeping first-insertion order.
+  SetVector<Value *> Named;
+  for (auto &Arg : F.args())
+    insertIfNamed(Named, &Arg);
+
+  for (auto It = inst_begin(F), End = inst_end(F); It != End; ++It) {
+    insertIfNamed(Named, &*It);
+    for (auto &Operand : It->operands())
+      insertIfNamed(Named, Operand);
+  }
+
+  ProvenanceAnalysis PA;
+  PA.setAA(&getAnalysis<AliasAnalysis>());
+
+  // Report on each pair whose first (stripped) name sorts strictly before
+  // the second, so a pair is printed in one orientation only.
+  for (Value *First : Named) {
+    StringRef FirstName = getName(First);
+    for (Value *Second : Named) {
+      StringRef SecondName = getName(Second);
+      if (!(FirstName < SecondName))
+        continue;
+      errs() << FirstName << " and " << SecondName;
+      errs() << (PA.related(First, Second) ? " are related.\n"
+                                           : " are not related.\n");
+    }
+  }
+
+  return false;
+}
+
+FunctionPass *llvm::createPAEvalPass() { return new PAEval(); }
+
+INITIALIZE_PASS_BEGIN(PAEval, "pa-eval",
+                      "Evaluate ProvenanceAnalysis on all pairs", false, true)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(PAEval, "pa-eval",
+                    "Evaluate ProvenanceAnalysis on all pairs", false, true)

diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 1a3a4aa..3d91984 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp

@@ -73,7 +73,7 @@
     for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end();
          OI != OE; ++OI)
       if (Instruction* Inst = dyn_cast<Instruction>(OI))
-        if (alive.insert(Inst))
+        if (alive.insert(Inst).second)
           worklist.push_back(Inst);
   }
 

diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
new file mode 100644
index 0000000..06c3dfd
--- /dev/null
+++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp

@@ -0,0 +1,428 @@
+//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
+//                  Set Load/Store Alignments From Assumptions
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a ScalarEvolution-based transformation to set
+// the alignments of loads, stores and memory intrinsics based on the truth
+// expressions of assume intrinsics. The primary motivation is to handle
+// complex alignment assumptions that apply to vector loads and stores that
+// appear after vectorization and unrolling.
+//
+//===----------------------------------------------------------------------===//
+
+#define AA_NAME "alignment-from-assumptions"
+#define DEBUG_TYPE AA_NAME
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumLoadAlignChanged,
+  "Number of loads changed by alignment assumptions");
+STATISTIC(NumStoreAlignChanged,
+  "Number of stores changed by alignment assumptions");
+STATISTIC(NumMemIntAlignChanged,
+  "Number of memory intrinsics changed by alignment assumptions");
+
+namespace {
+struct AlignmentFromAssumptions : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  AlignmentFromAssumptions() : FunctionPass(ID) {
+    initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionTracker>();
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+
+    AU.setPreservesCFG();
+    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<ScalarEvolution>();
+  }
+
+  // For memory transfers, we need a common alignment for both the source and
+  // destination. If we have a new alignment for only one operand of a transfer
+  // instruction, save it in these maps.  If we reach the other operand through
+  // another assumption later, then we may change the alignment at that point.
+  DenseMap<MemTransferInst *, unsigned> NewDestAlignments, NewSrcAlignments;
+  // Cached by runOnFunction for the helpers below; DL may be null.
+  AssumptionTracker *AT;
+  ScalarEvolution *SE;
+  DominatorTree *DT;
+  const DataLayout *DL;
+  // Per-assumption helpers; both return false when the assume is not usable.
+  bool extractAlignmentInfo(CallInst *I, Value *&AAPtr, const SCEV *&AlignSCEV,
+                            const SCEV *&OffSCEV);
+  bool processAssumption(CallInst *I);
+};
+}
+
+char AlignmentFromAssumptions::ID = 0;
+static const char aip_name[] = "Alignment from assumptions";
+INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
+                      aip_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
+                    aip_name, false, false)
+
+FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
+  return new AlignmentFromAssumptions();
+}
+
+// Given an expression for the (constant) alignment, AlignSCEV, and an
+// expression for the displacement between a pointer and the aligned address,
+// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
+// to a constant. Using SCEV to compute alignment handles the case where
+// DiffSCEV is a recurrence with constant start such that the aligned offset
+// is constant. e.g. {16,+,32} % 32 -> 16.
+static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
+                                    const SCEV *AlignSCEV,
+                                    ScalarEvolution *SE) {
+  // DiffUnits = (Diff /u Alignment) * Alignment - Diff = -(Diff %u Alignment)
+  const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV);
+  const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV);
+  const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV);
+
+  DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " <<
+                  *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+
+  if (const SCEVConstant *ConstDUSCEV =
+      dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
+    int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
+
+    // If the displacement is an exact multiple of the alignment, then the
+    // displaced pointer has the same alignment as the aligned pointer, so
+    // return the alignment value.
+    if (!DiffUnits)
+      return (unsigned)
+        cast<SCEVConstant>(AlignSCEV)->getValue()->getSExtValue();
+
+    // If the displacement is not an exact multiple, but the remainder is a
+    // constant, then return this remainder (but only if it is a power of 2).
+    uint64_t DiffUnitsAbs = abs64(DiffUnits);
+    if (isPowerOf2_64(DiffUnitsAbs))
+      return (unsigned) DiffUnitsAbs;
+  }
+
+  return 0;
+}
+
+// There is an address given by an offset OffSCEV from AASCEV which has an
+// alignment AlignSCEV. Use that information, if possible, to compute a new
+// alignment for Ptr.
+static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
+                                const SCEV *OffSCEV, Value *Ptr,
+                                ScalarEvolution *SE) {
+  const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+  const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
+
+  // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
+  // sign-extended OffSCEV to i64, so make sure they agree again.
+  DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
+
+  // What we really want to know is the overall offset to the aligned
+  // address. This address is displaced by the provided offset.
+  DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
+
+  DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to " <<
+                  *AlignSCEV << " and offset " << *OffSCEV <<
+                  " using diff " << *DiffSCEV << "\n");
+
+  unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
+  DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+
+  if (NewAlignment) {
+    return NewAlignment;
+  } else if (const SCEVAddRecExpr *DiffARSCEV =
+             dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
+    // The relative offset to the alignment assumption did not yield a constant,
+    // but we should try harder: if we assume that a is 32-byte aligned, then in
+    // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are
+    // 32-byte aligned, but instead alternate between 32 and 16-byte alignment.
+    // As a result, the new alignment will not be a constant, but can still
+    // be improved over the default (of 4) to 16.
+
+    const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
+    const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
+
+    DEBUG(dbgs() << "\ttrying start/inc alignment using start " <<
+                    *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+
+    // Now compute the new alignment using the displacement to the value in the
+    // first iteration, and also the alignment using the per-iteration delta.
+    // If these are the same, then use that answer. Otherwise, use the smaller
+    // one, but only if it divides the larger one.
+    NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
+    unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
+
+    DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
+    DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
+
+    if (!NewAlignment || !NewIncAlignment) {
+      return 0;
+    } else if (NewAlignment > NewIncAlignment) {
+      if (NewAlignment % NewIncAlignment == 0) {
+        DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+                        NewIncAlignment << "\n");
+        return NewIncAlignment;
+      }
+    } else if (NewIncAlignment > NewAlignment) {
+      if (NewIncAlignment % NewAlignment == 0) {
+        DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+                        NewAlignment << "\n");
+        return NewAlignment;
+      }
+    } else if (NewIncAlignment == NewAlignment) {
+      DEBUG(dbgs() << "\tnew start/inc alignment: " <<
+                      NewAlignment << "\n");
+      return NewAlignment;
+    }
+  }
+
+  return 0;
+}
+
+bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I,
+                                 Value *&AAPtr, const SCEV *&AlignSCEV,
+                                 const SCEV *&OffSCEV) {
+  // An alignment assume must be a statement about the least-significant
+  // bits of the pointer being zero, possibly with some offset.
+  ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0));
+  if (!ICI)
+    return false;
+
+  // This must be an expression of the form: x & m == 0.
+  if (ICI->getPredicate() != ICmpInst::ICMP_EQ)
+    return false;
+
+  // Swap things around so that the RHS is 0.
+  Value *CmpLHS = ICI->getOperand(0);
+  Value *CmpRHS = ICI->getOperand(1);
+  const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS);
+  const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS);
+  if (CmpLHSSCEV->isZero())
+    std::swap(CmpLHS, CmpRHS);
+  else if (!CmpRHSSCEV->isZero())
+    return false;
+
+  BinaryOperator *CmpBO = dyn_cast<BinaryOperator>(CmpLHS);
+  if (!CmpBO || CmpBO->getOpcode() != Instruction::And)
+    return false;
+
+  // Swap things around so that the right operand of the and is a constant
+  // (the mask); we cannot deal with variable masks.
+  Value *AndLHS = CmpBO->getOperand(0);
+  Value *AndRHS = CmpBO->getOperand(1);
+  const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS);
+  const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS);
+  if (isa<SCEVConstant>(AndLHSSCEV)) {
+    std::swap(AndLHS, AndRHS);
+    std::swap(AndLHSSCEV, AndRHSSCEV);
+  }
+
+  const SCEVConstant *MaskSCEV = dyn_cast<SCEVConstant>(AndRHSSCEV);
+  if (!MaskSCEV)
+    return false;
+
+  // The mask must have some trailing ones (otherwise the condition is
+  // trivial and tells us nothing about the alignment of the left operand).
+  unsigned TrailingOnes =
+    MaskSCEV->getValue()->getValue().countTrailingOnes();
+  if (!TrailingOnes)
+    return false;
+
+  // Cap the alignment at the maximum with which LLVM can deal (and make sure
+  // we don't overflow the shift).
+  uint64_t Alignment;
+  TrailingOnes = std::min(TrailingOnes,
+    unsigned(sizeof(unsigned) * CHAR_BIT - 1));
+  Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment);
+
+  Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext());
+  AlignSCEV = SE->getConstant(Int64Ty, Alignment);
+
+  // The LHS might be a ptrtoint instruction, or it might be the pointer
+  // with an offset.
+  AAPtr = nullptr;
+  OffSCEV = nullptr;
+  if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) {
+    AAPtr = PToI->getPointerOperand();
+    OffSCEV = SE->getConstant(Int64Ty, 0);
+  } else if (const SCEVAddExpr* AndLHSAddSCEV =
+             dyn_cast<SCEVAddExpr>(AndLHSSCEV)) {
+    // Try to find the ptrtoint; subtract it and the rest is the offset.
+    for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(),
+         JE = AndLHSAddSCEV->op_end(); J != JE; ++J)
+      if (const SCEVUnknown *OpUnk = dyn_cast<SCEVUnknown>(*J))
+        if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(OpUnk->getValue())) {
+          AAPtr = PToI->getPointerOperand();
+          OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J);
+          break;
+        }
+  }
+
+  if (!AAPtr)
+    return false;
+
+  // Sign extend the offset to 64 bits (so that it is like all of the other
+  // expressions).
+  unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits();
+  if (OffSCEVBits < 64)
+    OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty);
+  else if (OffSCEVBits > 64)
+    return false;
+
+  AAPtr = AAPtr->stripPointerCasts();
+  return true;
+}
+
+bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) {
+  Value *AAPtr;
+  const SCEV *AlignSCEV, *OffSCEV;
+  if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV))
+    return false;
+
+  const SCEV *AASCEV = SE->getSCEV(AAPtr);
+
+  // Apply the assumption to all other users of the specified pointer.
+  SmallPtrSet<Instruction *, 32> Visited;
+  SmallVector<Instruction*, 16> WorkList;
+  for (User *J : AAPtr->users()) {
+    if (J == ACall)
+      continue;
+
+    if (Instruction *K = dyn_cast<Instruction>(J))
+      if (isValidAssumeForContext(ACall, K, DL, DT))
+        WorkList.push_back(K);
+  }
+
+  while (!WorkList.empty()) {
+    Instruction *J = WorkList.pop_back_val();
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
+      unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+        LI->getPointerOperand(), SE);
+
+      if (NewAlignment > LI->getAlignment()) {
+        LI->setAlignment(NewAlignment);
+        ++NumLoadAlignChanged;
+      }
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
+      unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+        SI->getPointerOperand(), SE);
+
+      if (NewAlignment > SI->getAlignment()) {
+        SI->setAlignment(NewAlignment);
+        ++NumStoreAlignChanged;
+      }
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
+      unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+        MI->getDest(), SE);
+
+      // For memory transfers, we need a common alignment for both the
+      // source and destination. If we have a new alignment for this
+      // instruction, but only for one operand, save it. If we reach the
+      // other operand through another assumption later, then we may
+      // change the alignment at that point.
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+        unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+          MTI->getSource(), SE);
+
+        DenseMap<MemTransferInst *, unsigned>::iterator DI =
+          NewDestAlignments.find(MTI);
+        unsigned AltDestAlignment = (DI == NewDestAlignments.end()) ?
+                                    0 : DI->second;
+
+        DenseMap<MemTransferInst *, unsigned>::iterator SI =
+          NewSrcAlignments.find(MTI);
+        unsigned AltSrcAlignment = (SI == NewSrcAlignments.end()) ?
+                                   0 : SI->second;
+
+        DEBUG(dbgs() << "\tmem trans: " << NewDestAlignment << " " <<
+                        AltDestAlignment << " " << NewSrcAlignment <<
+                        " " << AltSrcAlignment << "\n");
+
+        // Of these four alignments, pick the largest possible...
+        unsigned NewAlignment = 0;
+        if (NewDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+          NewAlignment = std::max(NewAlignment, NewDestAlignment);
+        if (AltDestAlignment <= std::max(NewSrcAlignment, AltSrcAlignment))
+          NewAlignment = std::max(NewAlignment, AltDestAlignment);
+        if (NewSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+          NewAlignment = std::max(NewAlignment, NewSrcAlignment);
+        if (AltSrcAlignment <= std::max(NewDestAlignment, AltDestAlignment))
+          NewAlignment = std::max(NewAlignment, AltSrcAlignment);
+
+        if (NewAlignment > MI->getAlignment()) {
+          MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+            MI->getParent()->getContext()), NewAlignment));
+          ++NumMemIntAlignChanged;
+        }
+
+        NewDestAlignments.insert(std::make_pair(MTI, NewDestAlignment));
+        NewSrcAlignments.insert(std::make_pair(MTI, NewSrcAlignment));
+      } else if (NewDestAlignment > MI->getAlignment()) {
+        assert((!isa<MemIntrinsic>(MI) || isa<MemSetInst>(MI)) &&
+               "Unknown memory intrinsic");
+
+        MI->setAlignment(ConstantInt::get(Type::getInt32Ty(
+          MI->getParent()->getContext()), NewDestAlignment));
+        ++NumMemIntAlignChanged;
+      }
+    }
+
+    // Now that we've updated that use of the pointer, look for other uses of
+    // the pointer to update.
+    Visited.insert(J);
+    for (User *UJ : J->users()) {
+      Instruction *K = cast<Instruction>(UJ);
+      if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DL, DT))
+        WorkList.push_back(K);
+    }
+  }
+
+  return true;
+}
+
+bool AlignmentFromAssumptions::runOnFunction(Function &F) {
+  bool Changed = false;
+  AT = &getAnalysis<AssumptionTracker>();
+  SE = &getAnalysis<ScalarEvolution>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  DL = DLP ? &DLP->getDataLayout() : nullptr; // Stays null without the pass.
+  // Forget the per-transfer alignments recorded during any earlier run.
+  NewDestAlignments.clear();
+  NewSrcAlignments.clear();
+  // Process each assumption the tracker reports for F.
+  for (auto &I : AT->assumptions(&F))
+    Changed |= processAssumption(I);
+
+  return Changed;
+}
+

diff --git a/lib/Transforms/Scalar/Android.mk b/lib/Transforms/Scalar/Android.mk
index 5e22de6..9028b42 100644
--- a/lib/Transforms/Scalar/Android.mk
+++ b/lib/Transforms/Scalar/Android.mk

@@ -2,12 +2,14 @@
 
 transforms_scalar_SRC_FILES := \
   ADCE.cpp \
+  AlignmentFromAssumptions.cpp \
   ConstantProp.cpp \
   ConstantHoisting.cpp \
   CorrelatedValuePropagation.cpp \
   DCE.cpp \
   DeadStoreElimination.cpp \
   EarlyCSE.cpp \
+  FlattenCFGPass.cpp \
   GVN.cpp \
   IndVarSimplify.cpp \
   JumpThreading.cpp \
@@ -23,6 +25,7 @@
   LoopUnswitch.cpp \
   LowerAtomic.cpp \
   MemCpyOptimizer.cpp \
+  MergedLoadStoreMotion.cpp \
   PartiallyInlineLibCalls.cpp \
   Reassociate.cpp \
   Reg2Mem.cpp \

diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 2dcfa23..b3ee11e 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt

@@ -1,5 +1,6 @@
 add_llvm_library(LLVMScalarOpts
   ADCE.cpp
+  AlignmentFromAssumptions.cpp
   ConstantHoisting.cpp
   ConstantProp.cpp
   CorrelatedValuePropagation.cpp
@@ -22,6 +23,7 @@
   LoopUnswitch.cpp
   LowerAtomic.cpp
   MemCpyOptimizer.cpp
+  MergedLoadStoreMotion.cpp
   PartiallyInlineLibCalls.cpp
   Reassociate.cpp
   Reg2Mem.cpp

diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index 763d02b..27c177a 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp

@@ -91,7 +91,7 @@
   Constant *Offset;
 
   RebasedConstantInfo(ConstantUseListType &&Uses, Constant *Offset)
-    : Uses(Uses), Offset(Offset) { }
+    : Uses(std::move(Uses)), Offset(Offset) { }
 };
 
 /// \brief A base constant and all its rebased constants.
@@ -395,7 +395,7 @@
     ConstInfo.RebasedConstants.push_back(
       RebasedConstantInfo(std::move(ConstCand->Uses), Offset));
   }
-  ConstantVec.push_back(ConstInfo);
+  ConstantVec.push_back(std::move(ConstInfo));
 }
 
 /// \brief Finds and combines constant candidates that can be easily

diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 0829462..5a3b5cf 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp

@@ -73,7 +73,7 @@
   if (S->getType()->isVectorTy()) return false;
   if (isa<Constant>(S->getOperand(0))) return false;
 
-  Constant *C = LVI->getConstant(S->getOperand(0), S->getParent());
+  Constant *C = LVI->getConstant(S->getOperand(0), S->getParent(), S);
   if (!C) return false;
 
   ConstantInt *CI = dyn_cast<ConstantInt>(C);
@@ -100,7 +100,7 @@
     Value *Incoming = P->getIncomingValue(i);
     if (isa<Constant>(Incoming)) continue;
 
-    Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB);
+    Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
 
     // Look if the incoming value is a select with a constant but LVI tells us
     // that the incoming value can never be that constant. In that case replace
@@ -114,7 +114,7 @@
       if (!C) continue;
 
       if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
-                                  P->getIncomingBlock(i), BB) !=
+                                  P->getIncomingBlock(i), BB, P) !=
           LazyValueInfo::False)
         continue;
 
@@ -126,6 +126,7 @@
     Changed = true;
   }
 
+  // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction.
   if (Value *V = SimplifyInstruction(P)) {
     P->replaceAllUsesWith(V);
     P->eraseFromParent();
@@ -147,7 +148,7 @@
 
   if (isa<Constant>(Pointer)) return false;
 
-  Constant *C = LVI->getConstant(Pointer, I->getParent());
+  Constant *C = LVI->getConstant(Pointer, I->getParent(), I);
   if (!C) return false;
 
   ++NumMemAccess;
@@ -173,13 +174,15 @@
   if (PI == PE) return false;
 
   LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(),
-                                    C->getOperand(0), Op1, *PI, C->getParent());
+                                    C->getOperand(0), Op1, *PI,
+                                    C->getParent(), C);
   if (Result == LazyValueInfo::Unknown) return false;
 
   ++PI;
   while (PI != PE) {
     LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(),
-                                    C->getOperand(0), Op1, *PI, C->getParent());
+                                    C->getOperand(0), Op1, *PI,
+                                    C->getParent(), C);
     if (Res != Result) return false;
     ++PI;
   }
@@ -229,7 +232,8 @@
     for (pred_iterator PI = PB; PI != PE; ++PI) {
       // Is the switch condition equal to the case value?
       LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
-                                                              Cond, Case, *PI, BB);
+                                                              Cond, Case, *PI,
+                                                              BB, SI);
       // Give up on this case if nothing is known.
       if (Value == LazyValueInfo::Unknown) {
         State = LazyValueInfo::Unknown;

diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 3af8ee7..a1ddc00 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp

@@ -356,15 +356,8 @@
     // If we don't know the sizes of either access, then we can't do a
     // comparison.
     if (Later.Size == AliasAnalysis::UnknownSize ||
-        Earlier.Size == AliasAnalysis::UnknownSize) {
-      // If we have no DataLayout information around, then the size of the store
-      // is inferrable from the pointee type.  If they are the same type, then
-      // we know that the store is safe.
-      if (DL == nullptr && Later.Ptr->getType() == Earlier.Ptr->getType())
-        return OverwriteComplete;
-
+        Earlier.Size == AliasAnalysis::UnknownSize)
       return OverwriteUnknown;
-    }
 
     // Make sure that the Later size is >= the Earlier size.
     if (Later.Size >= Earlier.Size)

diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 735f5c1..cd2ecad 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp

@@ -16,17 +16,21 @@
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/ScopedHashTable.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/RecyclingAllocator.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include <vector>
+#include <deque>
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "early-cse"
 
@@ -266,6 +270,7 @@
   const DataLayout *DL;
   const TargetLibraryInfo *TLI;
   DominatorTree *DT;
+  AssumptionTracker *AT;
   typedef RecyclingAllocator<BumpPtrAllocator,
                       ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy;
   typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>,
@@ -378,6 +383,7 @@
 
   // This transformation requires dominator postdominator info
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionTracker>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetLibraryInfo>();
     AU.setPreservesCFG();
@@ -393,6 +399,7 @@
 }
 
 INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false)
@@ -431,9 +438,18 @@
       continue;
     }
 
+    // Skip assume intrinsics, they don't really have side effects (although
+    // they're marked as such to ensure preservation of control dependencies),
+    // and this pass will not disturb any of the assumption's control
+    // dependencies.
+    if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
+      DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+      continue;
+    }
+
     // If the instruction can be simplified (e.g. X+0 = X) then replace it with
     // its simpler value.
-    if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT)) {
+    if (Value *V = SimplifyInstruction(Inst, DL, TLI, DT, AT)) {
       DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << "  to: " << *V << '\n');
       Inst->replaceAllUsesWith(V);
       Inst->eraseFromParent();
@@ -530,7 +546,7 @@
           Changed = true;
           ++NumDSE;
           LastStore = nullptr;
-          continue;
+          // fallthrough - we can exploit information about this store
         }
 
         // Okay, we just invalidated anything we knew about loaded values.  Try
@@ -556,12 +572,17 @@
   if (skipOptnoneFunction(F))
     return false;
 
-  std::vector<StackNode *> nodesToProcess;
+  // Note, deque is being used here because there are significant performance gains
+  // over vector when the container becomes very large due to the specific access
+  // patterns. For more information see the mailing list discussion on this:
+  // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
+  std::deque<StackNode *> nodesToProcess;
 
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
   TLI = &getAnalysis<TargetLibraryInfo>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  AT = &getAnalysis<AssumptionTracker>();
 
   // Tables that the pass uses when walking the domtree.
   ScopedHTType AVTable;

diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 106eba0..7dba4e2 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp

@@ -24,6 +24,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -45,6 +46,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <vector>
 using namespace llvm;
@@ -590,6 +592,7 @@
     DominatorTree *DT;
     const DataLayout *DL;
     const TargetLibraryInfo *TLI;
+    AssumptionTracker *AT;
     SetVector<BasicBlock *> DeadBlocks;
 
     ValueTable VN;
@@ -679,6 +682,7 @@
 
     // This transformation requires dominator postdominator info
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionTracker>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<TargetLibraryInfo>();
       if (!NoLoads)
@@ -727,6 +731,7 @@
 }
 
 INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
@@ -1616,7 +1621,7 @@
     // If all preds have a single successor, then we know it is safe to insert
     // the load on the pred (?!?), so we can insert code to materialize the
     // pointer if it is not available.
-    PHITransAddr Address(LI->getPointerOperand(), DL);
+    PHITransAddr Address(LI->getPointerOperand(), DL, AT);
     Value *LoadPtr = nullptr;
     LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
                                                 *DT, NewInsts);
@@ -1669,9 +1674,11 @@
                                         LI->getAlignment(),
                                         UnavailablePred->getTerminator());
 
-    // Transfer the old load's TBAA tag to the new load.
-    if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa))
-      NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
+    // Transfer the old load's AA tags to the new load.
+    AAMDNodes Tags;
+    LI->getAAMetadata(Tags);
+    if (Tags)
+      NewLoad->setAAMetadata(Tags);
 
     // Transfer DebugLoc.
     NewLoad->setDebugLoc(LI->getDebugLoc());
@@ -1774,36 +1781,24 @@
       ReplOp->setHasNoUnsignedWrap(false);
   }
   if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) {
-    SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
-    ReplInst->getAllMetadataOtherThanDebugLoc(Metadata);
-    for (int i = 0, n = Metadata.size(); i < n; ++i) {
-      unsigned Kind = Metadata[i].first;
-      MDNode *IMD = I->getMetadata(Kind);
-      MDNode *ReplMD = Metadata[i].second;
-      switch(Kind) {
-      default:
-        ReplInst->setMetadata(Kind, nullptr); // Remove unknown metadata
-        break;
-      case LLVMContext::MD_dbg:
-        llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
-      case LLVMContext::MD_tbaa:
-        ReplInst->setMetadata(Kind, MDNode::getMostGenericTBAA(IMD, ReplMD));
-        break;
-      case LLVMContext::MD_range:
-        ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD));
-        break;
-      case LLVMContext::MD_prof:
-        llvm_unreachable("MD_prof in a non-terminator instruction");
-        break;
-      case LLVMContext::MD_fpmath:
-        ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD));
-        break;
-      case LLVMContext::MD_invariant_load:
-        // Only set the !invariant.load if it is present in both instructions.
-        ReplInst->setMetadata(Kind, IMD);
-        break;
-      }
-    }
+    // FIXME: If both the original and replacement value are part of the
+    // same control-flow region (meaning that the execution of one
+    // guarantees the execution of the other), then we can combine the
+    // noalias scopes here and do better than the general conservative
+    // answer used in combineMetadata().
+
+    // In general, GVN unifies expressions over different control-flow
+    // regions, and so we need a conservative combination of the noalias
+    // scopes.
+    unsigned KnownIDs[] = {
+      LLVMContext::MD_tbaa,
+      LLVMContext::MD_alias_scope,
+      LLVMContext::MD_noalias,
+      LLVMContext::MD_range,
+      LLVMContext::MD_fpmath,
+      LLVMContext::MD_invariant_load,
+    };
+    combineMetadata(ReplInst, I, KnownIDs);
   }
 }
 
@@ -2219,7 +2214,7 @@
   // to value numbering it.  Value numbering often exposes redundancies, for
   // example if it determines that %y is equal to %x then the instruction
   // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
-  if (Value *V = SimplifyInstruction(I, DL, TLI, DT)) {
+  if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AT)) {
     I->replaceAllUsesWith(V);
     if (MD && V->getType()->getScalarType()->isPointerTy())
       MD->invalidateCachedPointerInfo(V);
@@ -2339,6 +2334,7 @@
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
+  AT = &getAnalysis<AssumptionTracker>();
   TLI = &getAnalysis<TargetLibraryInfo>();
   VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
   VN.setMemDep(MD);
@@ -2653,8 +2649,8 @@
   //
   std::vector<BasicBlock *> BBVect;
   BBVect.reserve(256);
-  for (DomTreeNode *x : depth_first(DT->getRootNode()))
-    BBVect.push_back(x->getBlock());
+  for (DomTreeNode *X : depth_first(DT->getRootNode()))
+    BBVect.push_back(X->getBlock());
 
   for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
        I != E; I++)
@@ -2802,7 +2798,7 @@
   return true;
 }
 
-// performPRE() will trigger assert if it come across an instruciton without
+// performPRE() will trigger assert if it comes across an instruction without
 // associated val-num. As it normally has far more live instructions than dead
 // instructions, it makes more sense just to "fabricate" a val-number for the
 // dead code than checking if instruction involved is dead or not.

diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index e83a5c4..c01f57f 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp

@@ -31,6 +31,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
@@ -69,11 +70,12 @@
 
 namespace {
   class IndVarSimplify : public LoopPass {
-    LoopInfo        *LI;
-    ScalarEvolution *SE;
-    DominatorTree   *DT;
-    const DataLayout *DL;
-    TargetLibraryInfo *TLI;
+    LoopInfo                  *LI;
+    ScalarEvolution           *SE;
+    DominatorTree             *DT;
+    const DataLayout          *DL;
+    TargetLibraryInfo         *TLI;
+    const TargetTransformInfo *TTI;
 
     SmallVector<WeakVH, 16> DeadInsts;
     bool Changed;
@@ -650,7 +652,7 @@
   struct WideIVInfo {
     PHINode *NarrowIV;
     Type *WidestNativeType; // Widest integer type created [sz]ext
-    bool IsSigned;          // Was an sext user seen before a zext?
+    bool IsSigned;          // Was a sext user seen before a zext?
 
     WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr),
                    IsSigned(false) {}
@@ -661,7 +663,7 @@
 /// extended by this sign or zero extend operation. This is used to determine
 /// the final width of the IV before actually widening it.
 static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE,
-                        const DataLayout *DL) {
+                        const DataLayout *DL, const TargetTransformInfo *TTI) {
   bool IsSigned = Cast->getOpcode() == Instruction::SExt;
   if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
     return;
@@ -671,6 +673,19 @@
   if (DL && !DL->isLegalInteger(Width))
     return;
 
+  // Cast is either an sext or zext up to this point.
+  // We should not widen an indvar if arithmetic on the wider indvar is more
+  // expensive than on the narrower indvar. We check only the cost of ADD
+  // because at least an ADD is required to increment the induction variable. We
+  // could compute more comprehensively the cost of all instructions on the
+  // induction variable when necessary.
+  if (TTI &&
+      TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
+          TTI->getArithmeticInstrCost(Instruction::Add,
+                                      Cast->getOperand(0)->getType())) {
+    return;
+  }
+
   if (!WI.WidestNativeType) {
     WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
     WI.IsSigned = IsSigned;
@@ -757,8 +772,13 @@
 
   const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU);
 
+  const SCEV *GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+                              unsigned OpCode) const;
+
   Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
 
+  bool WidenLoopCompare(NarrowIVDefUse DU);
+
   void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
 };
 } // anonymous namespace
@@ -833,18 +853,35 @@
   }
 }
 
+/// Build the SCEV for "LHS <OpCode> RHS"; only Add, Sub and Mul are handled.
+const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS,
+                                     unsigned OpCode) const {
+  switch (OpCode) {
+  case Instruction::Add: return SE->getAddExpr(LHS, RHS);
+  case Instruction::Sub: return SE->getMinusSCEV(LHS, RHS);
+  case Instruction::Mul: return SE->getMulExpr(LHS, RHS);
+  default: break;
+  }
+  llvm_unreachable("Unsupported opcode.");
+}
+
 /// No-wrap operations can transfer sign extension of their result to their
 /// operands. Generate the SCEV value for the widened operation without
 /// actually modifying the IR yet. If the expression after extending the
 /// operands is an AddRec for this loop, return it.
 const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) {
+
   // Handle the common case of add<nsw/nuw>
-  if (DU.NarrowUse->getOpcode() != Instruction::Add)
+  const unsigned OpCode = DU.NarrowUse->getOpcode();
+  // Only Add/Sub/Mul instructions are supported so far.
+  if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
+      OpCode != Instruction::Mul)
     return nullptr;
 
   // One operand (NarrowDef) has already been extended to WideDef. Now determine
   // if extending the other will lead to a recurrence.
-  unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
+  const unsigned ExtendOperIdx =
+      DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
   assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
 
   const SCEV *ExtendOperExpr = nullptr;
@@ -859,13 +896,20 @@
   else
     return nullptr;
 
-  // When creating this AddExpr, don't apply the current operations NSW or NUW
+  // When creating this SCEV expr, don't apply the current operations NSW or NUW
   // flags. This instruction may be guarded by control flow that the no-wrap
   // behavior depends on. Non-control-equivalent instructions can be mapped to
   // the same SCEV expression, and it would be incorrect to transfer NSW/NUW
   // semantics to those operations.
-  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(
-    SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr));
+  const SCEV *lhs = SE->getSCEV(DU.WideDef);
+  const SCEV *rhs = ExtendOperExpr;
+
+  // Let's swap operands to the initial order for the case of non-commutative
+  // operations, like SUB. See PR21014.
+  if (ExtendOperIdx == 0)
+    std::swap(lhs, rhs);
+  const SCEVAddRecExpr *AddRec =
+      dyn_cast<SCEVAddRecExpr>(GetSCEVByOpCode(lhs, rhs, OpCode));
 
   if (!AddRec || AddRec->getLoop() != L)
     return nullptr;
@@ -908,6 +952,35 @@
   DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
 }
 
+/// If the narrow use is a compare instruction, widen the compare (and possibly
+/// its other operand; that extend is hoisted into the loop preheader as far as
+/// possible). Returns true if the compare was widened, false otherwise.
+bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) {
+  ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse);
+  if (!Cmp)
+    return false;
+
+  // Sign of IV user and compare must match.
+  if (IsSigned != CmpInst::isSigned(Cmp->getPredicate()))
+    return false;
+
+  // Op is whichever compare operand is not the narrow IV definition.
+  Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0);
+  unsigned CastWidth = SE->getTypeSizeInBits(Op->getType());
+  unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+  assert(CastWidth <= IVWidth && "Unexpected width while widening compare.");
+
+  // Widen the compare instruction by substituting the wide IV for the narrow.
+  DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+
+  // Widen the other operand of the compare, if necessary.
+  if (CastWidth < IVWidth) {
+    Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp);
+    DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
+  }
+  return true;
+}
+
 /// WidenIVUse - Determine whether an individual user of the narrow IV can be
 /// widened. If so, return the wide clone of the user.
 Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
@@ -975,10 +1048,15 @@
 
   // Does this user itself evaluate to a recurrence after widening?
   const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse);
+  if (!WideAddRec)
+    WideAddRec = GetExtendedOperandRecurrence(DU);
+
   if (!WideAddRec) {
-      WideAddRec = GetExtendedOperandRecurrence(DU);
-  }
-  if (!WideAddRec) {
+    // If use is a loop condition, try to promote the condition instead of
+    // truncating the IV first.
+    if (WidenLoopCompare(DU))
+      return nullptr;
+
     // This user does not evaluate to a recurence after widening, so don't
     // follow it. Instead insert a Trunc to kill off the original use,
     // eventually isolating the original narrow IV so it can be removed.
@@ -1024,7 +1102,7 @@
     Instruction *NarrowUser = cast<Instruction>(U);
 
     // Handle data flow merges and bizarre phi cycles.
-    if (!Widened.insert(NarrowUser))
+    if (!Widened.insert(NarrowUser).second)
       continue;
 
     NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef));
@@ -1124,14 +1202,16 @@
   class IndVarSimplifyVisitor : public IVVisitor {
     ScalarEvolution *SE;
     const DataLayout *DL;
+    const TargetTransformInfo *TTI;
     PHINode *IVPhi;
 
   public:
     WideIVInfo WI;
 
     IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
-                          const DataLayout *DL, const DominatorTree *DTree):
-      SE(SCEV), DL(DL), IVPhi(IV) {
+                          const DataLayout *DL, const TargetTransformInfo *TTI,
+                          const DominatorTree *DTree)
+        : SE(SCEV), DL(DL), TTI(TTI), IVPhi(IV) {
       DT = DTree;
       WI.NarrowIV = IVPhi;
       if (ReduceLiveIVs)
@@ -1139,7 +1219,9 @@
     }
 
     // Implement the interface used by simplifyUsersOfIV.
-    void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, DL); }
+    void visitCast(CastInst *Cast) override {
+      visitIVCast(Cast, WI, SE, DL, TTI);
+    }
   };
 }
 
@@ -1173,7 +1255,7 @@
       PHINode *CurrIV = LoopPhis.pop_back_val();
 
       // Information about sign/zero extensions of CurrIV.
-      IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, DT);
+      IndVarSimplifyVisitor Visitor(CurrIV, SE, DL, TTI, DT);
 
       Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor);
 
@@ -1200,9 +1282,9 @@
 /// BackedgeTakenInfo. If these expressions have not been reduced, then
 /// expanding them may incur additional cost (albeit in the loop preheader).
 static bool isHighCostExpansion(const SCEV *S, BranchInst *BI,
-                                SmallPtrSet<const SCEV*, 8> &Processed,
+                                SmallPtrSetImpl<const SCEV*> &Processed,
                                 ScalarEvolution *SE) {
-  if (!Processed.insert(S))
+  if (!Processed.insert(S).second)
     return false;
 
   // If the backedge-taken count is a UDiv, it's very likely a UDiv that
@@ -1373,7 +1455,7 @@
 /// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
 /// down to checking that all operands are constant and listing instructions
 /// that may hide undef.
-static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited,
+static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
                                unsigned Depth) {
   if (isa<Constant>(V))
     return !isa<UndefValue>(V);
@@ -1393,7 +1475,7 @@
 
   // Optimistically handle other instructions.
   for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
-    if (!Visited.insert(*OI))
+    if (!Visited.insert(*OI).second)
       continue;
     if (!hasConcreteDefImpl(*OI, Visited, Depth+1))
       return false;
@@ -1623,15 +1705,51 @@
   // compare against the post-incremented value, otherwise we must compare
   // against the preincremented value.
   if (L->getExitingBlock() == L->getLoopLatch()) {
-    // Add one to the "backedge-taken" count to get the trip count.
-    // This addition may overflow, which is valid as long as the comparison is
-    // truncated to BackedgeTakenCount->getType().
-    IVCount = SE->getAddExpr(BackedgeTakenCount,
-                             SE->getConstant(BackedgeTakenCount->getType(), 1));
     // The BackedgeTaken expression contains the number of times that the
     // backedge branches to the loop header.  This is one less than the
     // number of times the loop executes, so use the incremented indvar.
-    CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
+    llvm::Value *IncrementedIndvar =
+        IndVar->getIncomingValueForBlock(L->getExitingBlock());
+    const auto *IncrementedIndvarSCEV =
+        cast<SCEVAddRecExpr>(SE->getSCEV(IncrementedIndvar));
+    // It is unsafe to use the incremented indvar if it has a wrapping flag, we
+    // don't want to compare against a poison value.  Check the SCEV that
+    // corresponds to the incremented indvar, the SCEVExpander will only insert
+    // flags in the IR if the SCEV originally had wrapping flags.
+    // FIXME: In theory, SCEV could drop flags even though they exist in IR.
+    // A more robust solution would involve getting a new expression for
+    // CmpIndVar by applying non-NSW/NUW AddExprs.
+    auto WrappingFlags =
+        ScalarEvolution::setFlags(SCEV::FlagNUW, SCEV::FlagNSW);
+    const SCEV *IVInit = IncrementedIndvarSCEV->getStart();
+    if (SE->getTypeSizeInBits(IVInit->getType()) >
+        SE->getTypeSizeInBits(IVCount->getType()))
+      IVInit = SE->getTruncateExpr(IVInit, IVCount->getType());
+    unsigned BitWidth = SE->getTypeSizeInBits(IVCount->getType());
+    Type *WideTy = IntegerType::get(SE->getContext(), BitWidth + 1);
+    // Check if InitIV + BECount+1 requires sign/zero extension.
+    // If not, clear the corresponding flag from WrappingFlags because it is not
+    // necessary for those flags in the IncrementedIndvarSCEV expression.
+    if (SE->getSignExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount),
+                              WideTy) ==
+        SE->getAddExpr(SE->getSignExtendExpr(IVInit, WideTy),
+                       SE->getSignExtendExpr(BackedgeTakenCount, WideTy)))
+      WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNSW);
+    if (SE->getZeroExtendExpr(SE->getAddExpr(IVInit, BackedgeTakenCount),
+                              WideTy) ==
+        SE->getAddExpr(SE->getZeroExtendExpr(IVInit, WideTy),
+                       SE->getZeroExtendExpr(BackedgeTakenCount, WideTy)))
+      WrappingFlags = ScalarEvolution::clearFlags(WrappingFlags, SCEV::FlagNUW);
+    if (!ScalarEvolution::maskFlags(IncrementedIndvarSCEV->getNoWrapFlags(),
+                                    WrappingFlags)) {
+      // Add one to the "backedge-taken" count to get the trip count.
+      // This addition may overflow, which is valid as long as the comparison is
+      // truncated to BackedgeTakenCount->getType().
+      IVCount =
+          SE->getAddExpr(BackedgeTakenCount,
+                         SE->getConstant(BackedgeTakenCount->getType(), 1));
+      CmpIndVar = IncrementedIndvar;
+    }
   }
 
   Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE);
@@ -1817,6 +1935,7 @@
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
   TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+  TTI = getAnalysisIfAvailable<TargetTransformInfo>();
 
   DeadInsts.clear();
   Changed = false;

diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 6e50d33..60a4925 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp

@@ -26,6 +26,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -44,7 +45,7 @@
 STATISTIC(NumDupes,   "Number of branch blocks duplicated to eliminate phi");
 
 static cl::opt<unsigned>
-Threshold("jump-threading-threshold",
+BBDuplicateThreshold("jump-threading-threshold",
           cl::desc("Max block size to duplicate for jump threading"),
           cl::init(6), cl::Hidden);
 
@@ -87,6 +88,8 @@
 #endif
     DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet;
 
+    unsigned BBDupThreshold;
+
     // RAII helper for updating the recursion stack.
     struct RecursionSetRemover {
       DenseSet<std::pair<Value*, BasicBlock*> > &TheSet;
@@ -102,7 +105,8 @@
     };
   public:
     static char ID; // Pass identification
-    JumpThreading() : FunctionPass(ID) {
+    JumpThreading(int T = -1) : FunctionPass(ID) {
+      BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
       initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
     }
 
@@ -123,9 +127,11 @@
 
     bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,
                                          PredValueInfo &Result,
-                                         ConstantPreference Preference);
+                                         ConstantPreference Preference,
+                                         Instruction *CxtI = nullptr);
     bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
-                                ConstantPreference Preference);
+                                ConstantPreference Preference,
+                                Instruction *CxtI = nullptr);
 
     bool ProcessBranchOnPHI(PHINode *PN);
     bool ProcessBranchOnXOR(BinaryOperator *BO);
@@ -144,7 +150,7 @@
                 "Jump Threading", false, false)
 
 // Public interface to the Jump Threading pass
-FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
+FunctionPass *llvm::createJumpThreadingPass(int Threshold) { return new JumpThreading(Threshold); }
 
 /// runOnFunction - Top level algorithm.
 ///
@@ -339,7 +345,8 @@
 ///
 bool JumpThreading::
 ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
-                                ConstantPreference Preference) {
+                                ConstantPreference Preference,
+                                Instruction *CxtI) {
   // This method walks up use-def chains recursively.  Because of this, we could
   // get into an infinite loop going around loops in the use-def chain.  To
   // prevent this, keep track of what (value, block) pairs we've already visited
@@ -381,7 +388,7 @@
       BasicBlock *P = *PI;
       // If the value is known by LazyValueInfo to be a constant in a
       // predecessor, use that information to try to thread this block.
-      Constant *PredCst = LVI->getConstantOnEdge(V, P, BB);
+      Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
       if (Constant *KC = getKnownConstant(PredCst, Preference))
         Result.push_back(std::make_pair(KC, P));
     }
@@ -397,7 +404,8 @@
         Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
       } else {
         Constant *CI = LVI->getConstantOnEdge(InVal,
-                                              PN->getIncomingBlock(i), BB);
+                                              PN->getIncomingBlock(i),
+                                              BB, CxtI);
         if (Constant *KC = getKnownConstant(CI, Preference))
           Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
       }
@@ -416,9 +424,9 @@
     if (I->getOpcode() == Instruction::Or ||
         I->getOpcode() == Instruction::And) {
       ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
-                                      WantInteger);
+                                      WantInteger, CxtI);
       ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
-                                      WantInteger);
+                                      WantInteger, CxtI);
 
       if (LHSVals.empty() && RHSVals.empty())
         return false;
@@ -459,7 +467,7 @@
         isa<ConstantInt>(I->getOperand(1)) &&
         cast<ConstantInt>(I->getOperand(1))->isOne()) {
       ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result,
-                                      WantInteger);
+                                      WantInteger, CxtI);
       if (Result.empty())
         return false;
 
@@ -477,7 +485,7 @@
     if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
       PredValueInfoTy LHSVals;
       ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals,
-                                      WantInteger);
+                                      WantInteger, CxtI);
 
       // Try to use constant folding to simplify the binary operator.
       for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) {
@@ -511,7 +519,8 @@
 
           LazyValueInfo::Tristate
             ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
-                                           cast<Constant>(RHS), PredBB, BB);
+                                           cast<Constant>(RHS), PredBB, BB,
+                                           CxtI ? CxtI : Cmp);
           if (ResT == LazyValueInfo::Unknown)
             continue;
           Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
@@ -524,7 +533,6 @@
       return !Result.empty();
     }
 
-
     // If comparing a live-in value against a constant, see if we know the
     // live-in value on any predecessors.
     if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) {
@@ -538,7 +546,7 @@
           // predecessor, use that information to try to thread this block.
           LazyValueInfo::Tristate Res =
             LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
-                                    RHSCst, P, BB);
+                                    RHSCst, P, BB, CxtI ? CxtI : Cmp);
           if (Res == LazyValueInfo::Unknown)
             continue;
 
@@ -554,7 +562,7 @@
       if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) {
         PredValueInfoTy LHSVals;
         ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
-                                        WantInteger);
+                                        WantInteger, CxtI);
 
         for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) {
           Constant *V = LHSVals[i].first;
@@ -577,7 +585,7 @@
     PredValueInfoTy Conds;
     if ((TrueVal || FalseVal) &&
         ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds,
-                                        WantInteger)) {
+                                        WantInteger, CxtI)) {
       for (unsigned i = 0, e = Conds.size(); i != e; ++i) {
         Constant *Cond = Conds[i].first;
 
@@ -604,7 +612,7 @@
   }
 
   // If all else fails, see if LVI can figure out a constant value for us.
-  Constant *CI = LVI->getConstant(V, BB);
+  Constant *CI = LVI->getConstant(V, BB, CxtI);
   if (Constant *KC = getKnownConstant(CI, Preference)) {
     for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
       Result.push_back(std::make_pair(KC, *PI));
@@ -669,14 +677,9 @@
       if (LoopHeaders.erase(SinglePred))
         LoopHeaders.insert(BB);
 
-      // Remember if SinglePred was the entry block of the function.  If so, we
-      // will need to move BB back to the entry position.
-      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
       LVI->eraseBlock(SinglePred);
       MergeBasicBlockIntoOnlyPred(BB);
 
-      if (isEntry && BB != &BB->getParent()->getEntryBlock())
-        BB->moveBefore(&BB->getParent()->getEntryBlock());
       return true;
     }
   }
@@ -749,7 +752,7 @@
   // All the rest of our checks depend on the condition being an instruction.
   if (!CondInst) {
     // FIXME: Unify this with code below.
-    if (ProcessThreadableEdges(Condition, BB, Preference))
+    if (ProcessThreadableEdges(Condition, BB, Preference, Terminator))
       return true;
     return false;
   }
@@ -771,13 +774,14 @@
       // FIXME: We could handle mixed true/false by duplicating code.
       LazyValueInfo::Tristate Baseline =
         LVI->getPredicateOnEdge(CondCmp->getPredicate(), CondCmp->getOperand(0),
-                                CondConst, *PI, BB);
+                                CondConst, *PI, BB, CondCmp);
       if (Baseline != LazyValueInfo::Unknown) {
         // Check that all remaining incoming values match the first one.
         while (++PI != PE) {
           LazyValueInfo::Tristate Ret =
             LVI->getPredicateOnEdge(CondCmp->getPredicate(),
-                                    CondCmp->getOperand(0), CondConst, *PI, BB);
+                                    CondCmp->getOperand(0), CondConst, *PI, BB,
+                                    CondCmp);
           if (Ret != Baseline) break;
         }
 
@@ -792,6 +796,21 @@
         }
       }
 
+    } else if (CondBr && CondConst && CondBr->isConditional()) {
+      // There might be an invariant in the same block with the conditional
+      // that can determine the predicate.
+
+      LazyValueInfo::Tristate Ret =
+        LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
+                            CondConst, CondCmp);
+      if (Ret != LazyValueInfo::Unknown) {
+        unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
+        unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
+        CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+        BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
+        CondBr->eraseFromParent();
+        return true;
+      }
     }
 
     if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
@@ -819,7 +838,7 @@
   // a PHI node in the current block.  If we can prove that any predecessors
   // compute a predictable value based on a PHI node, thread those predecessors.
   //
-  if (ProcessThreadableEdges(CondInst, BB, Preference))
+  if (ProcessThreadableEdges(CondInst, BB, Preference, Terminator))
     return true;
 
   // If this is an otherwise-unfoldable branch on a phi node in the current
@@ -882,6 +901,9 @@
     // If the returned value is the load itself, replace with an undef. This can
     // only happen in dead loops.
     if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+    if (AvailableVal->getType() != LI->getType())
+      AvailableVal = CastInst::Create(CastInst::BitCast, AvailableVal,
+                                      LI->getType(), "", LI);
     LI->replaceAllUsesWith(AvailableVal);
     LI->eraseFromParent();
     return true;
@@ -893,9 +915,10 @@
   if (BBIt != LoadBB->begin())
     return false;
 
-  // If all of the loads and stores that feed the value have the same TBAA tag,
-  // then we can propagate it onto any newly inserted loads.
-  MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa);
+  // If all of the loads and stores that feed the value have the same AA tags,
+  // then we can propagate them onto any newly inserted loads.
+  AAMDNodes AATags;
+  LI->getAAMetadata(AATags);
 
   SmallPtrSet<BasicBlock*, 8> PredsScanned;
   typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
@@ -909,21 +932,21 @@
     BasicBlock *PredBB = *PI;
 
     // If we already scanned this predecessor, skip it.
-    if (!PredsScanned.insert(PredBB))
+    if (!PredsScanned.insert(PredBB).second)
       continue;
 
     // Scan the predecessor to see if the value is available in the pred.
     BBIt = PredBB->end();
-    MDNode *ThisTBAATag = nullptr;
+    AAMDNodes ThisAATags;
     Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6,
-                                                    nullptr, &ThisTBAATag);
+                                                    nullptr, &ThisAATags);
     if (!PredAvailable) {
       OneUnavailablePred = PredBB;
       continue;
     }
 
-    // If tbaa tags disagree or are not present, forget about them.
-    if (TBAATag != ThisTBAATag) TBAATag = nullptr;
+    // If AA tags disagree or are not present, forget about them.
+    if (AATags != ThisAATags) AATags = AAMDNodes();
 
     // If so, this load is partially redundant.  Remember this info so that we
     // can create a PHI node.
@@ -983,8 +1006,8 @@
                                  LI->getAlignment(),
                                  UnavailablePred->getTerminator());
     NewVal->setDebugLoc(LI->getDebugLoc());
-    if (TBAATag)
-      NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+    if (AATags)
+      NewVal->setAAMetadata(AATags);
 
     AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
   }
@@ -1011,7 +1034,16 @@
     assert(I != AvailablePreds.end() && I->first == P &&
            "Didn't find entry for predecessor!");
 
-    PN->addIncoming(I->second, I->first);
+    // If we have an available predecessor but it requires casting, insert the
+    // cast in the predecessor and use the cast. Note that we have to update the
+    // AvailablePreds vector as we go so that all of the PHI entries for this
+    // predecessor use the same bitcast.
+    Value *&PredV = I->second;
+    if (PredV->getType() != LI->getType())
+      PredV = CastInst::Create(CastInst::BitCast, PredV, LI->getType(), "",
+                               P->getTerminator());
+
+    PN->addIncoming(PredV, I->first);
   }
 
   //cerr << "PRE: " << *LI << *PN << "\n";
@@ -1086,14 +1118,15 @@
 }
 
 bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
-                                           ConstantPreference Preference) {
+                                           ConstantPreference Preference,
+                                           Instruction *CxtI) {
   // If threading this would thread across a loop header, don't even try to
   // thread the edge.
   if (LoopHeaders.count(BB))
     return false;
 
   PredValueInfoTy PredValues;
-  if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference))
+  if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI))
     return false;
 
   assert(!PredValues.empty() &&
@@ -1118,7 +1151,7 @@
 
   for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
     BasicBlock *Pred = PredValues[i].second;
-    if (!SeenPreds.insert(Pred))
+    if (!SeenPreds.insert(Pred).second)
       continue;  // Duplicate predecessor entry.
 
     // If the predecessor ends with an indirect goto, we can't change its
@@ -1258,10 +1291,10 @@
   PredValueInfoTy XorOpValues;
   bool isLHS = true;
   if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
-                                       WantInteger)) {
+                                       WantInteger, BO)) {
     assert(XorOpValues.empty());
     if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
-                                         WantInteger))
+                                         WantInteger, BO))
       return false;
     isLHS = false;
   }
@@ -1371,8 +1404,8 @@
     return false;
   }
 
-  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, Threshold);
-  if (JumpThreadCost > Threshold) {
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  if (JumpThreadCost > BBDupThreshold) {
     DEBUG(dbgs() << "  Not threading BB '" << BB->getName()
           << "' - Cost is too high: " << JumpThreadCost << "\n");
     return false;
@@ -1514,8 +1547,8 @@
     return false;
   }
 
-  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, Threshold);
-  if (DuplicationCost > Threshold) {
+  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  if (DuplicationCost > BBDupThreshold) {
     DEBUG(dbgs() << "  Not duplicating BB '" << BB->getName()
           << "' - Cost is too high: " << DuplicationCost << "\n");
     return false;
@@ -1677,10 +1710,10 @@
     // cases will be threaded in any case.
     LazyValueInfo::Tristate LHSFolds =
         LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
-                                CondRHS, Pred, BB);
+                                CondRHS, Pred, BB, CondCmp);
     LazyValueInfo::Tristate RHSFolds =
         LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
-                                CondRHS, Pred, BB);
+                                CondRHS, Pred, BB, CondCmp);
     if ((LHSFolds != LazyValueInfo::Unknown ||
          RHSFolds != LazyValueInfo::Unknown) &&
         LHSFolds != RHSFolds) {

diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index abcceb2..5f00bb9 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp

@@ -130,6 +130,9 @@
     /// set.
     void deleteAnalysisValue(Value *V, Loop *L) override;
 
+    /// Simple Analysis hook. Delete loop L from alias set map.
+    void deleteAnalysisLoop(Loop *L) override;
+
     /// SinkRegion - Walk the specified region of the CFG (defined by all blocks
     /// dominated by the specified block, and that are in the current loop) in
     /// reverse depth first order w.r.t the DominatorTree.  This allows us to
@@ -180,9 +183,9 @@
     /// store into the memory location pointed to by V.
     ///
     bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
-                                  const MDNode *TBAAInfo) {
+                                  const AAMDNodes &AAInfo) {
       // Check to see if any of the basic blocks in CurLoop invalidate *V.
-      return CurAST->getAliasSetForPointer(V, Size, TBAAInfo).isMod();
+      return CurAST->getAliasSetForPointer(V, Size, AAInfo).isMod();
     }
 
     bool canSinkOrHoistInst(Instruction &I);
@@ -441,15 +444,18 @@
     // in the same alias set as something that ends up being modified.
     if (AA->pointsToConstantMemory(LI->getOperand(0)))
       return true;
-    if (LI->getMetadata("invariant.load"))
+    if (LI->getMetadata(LLVMContext::MD_invariant_load))
       return true;
 
     // Don't hoist loads which have may-aliased stores in loop.
     uint64_t Size = 0;
     if (LI->getType()->isSized())
       Size = AA->getTypeStoreSize(LI->getType());
-    return !pointerInvalidatedByLoop(LI->getOperand(0), Size,
-                                     LI->getMetadata(LLVMContext::MD_tbaa));
+
+    AAMDNodes AAInfo;
+    LI->getAAMetadata(AAInfo);
+
+    return !pointerInvalidatedByLoop(LI->getOperand(0), Size, AAInfo);
   } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
     // Don't sink or hoist dbg info; it's legal, but not useful.
     if (isa<DbgInfoIntrinsic>(I))
@@ -594,8 +600,13 @@
   // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
   // the instruction.
   while (!I.use_empty()) {
+    Instruction *User = I.user_back();
+    if (!DT->isReachableFromEntry(User->getParent())) {
+      User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
+      continue;
+    }
     // The user must be a PHI node.
-    PHINode *PN = cast<PHINode>(I.user_back());
+    PHINode *PN = cast<PHINode>(User);
 
     BasicBlock *ExitBlock = PN->getParent();
     assert(ExitBlockSet.count(ExitBlock) &&
@@ -682,7 +693,7 @@
 namespace {
   class LoopPromoter : public LoadAndStorePromoter {
     Value *SomePtr;  // Designated pointer to store to.
-    SmallPtrSet<Value*, 4> &PointerMustAliases;
+    SmallPtrSetImpl<Value*> &PointerMustAliases;
     SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
     SmallVectorImpl<Instruction*> &LoopInsertPts;
     PredIteratorCache &PredCache;
@@ -690,7 +701,7 @@
     LoopInfo &LI;
     DebugLoc DL;
     int Alignment;
-    MDNode *TBAATag;
+    AAMDNodes AATags;
 
     Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
       if (Instruction *I = dyn_cast<Instruction>(V))
@@ -710,14 +721,14 @@
 
   public:
     LoopPromoter(Value *SP, const SmallVectorImpl<Instruction *> &Insts,
-                 SSAUpdater &S, SmallPtrSet<Value *, 4> &PMA,
+                 SSAUpdater &S, SmallPtrSetImpl<Value *> &PMA,
                  SmallVectorImpl<BasicBlock *> &LEB,
                  SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
                  AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
-                 MDNode *TBAATag)
+                 const AAMDNodes &AATags)
         : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
           LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
-          LI(li), DL(dl), Alignment(alignment), TBAATag(TBAATag) {}
+          LI(li), DL(dl), Alignment(alignment), AATags(AATags) {}
 
     bool isInstInList(Instruction *I,
                       const SmallVectorImpl<Instruction*> &) const override {
@@ -743,7 +754,7 @@
         StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
         NewSI->setAlignment(Alignment);
         NewSI->setDebugLoc(DL);
-        if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+        if (AATags) NewSI->setAAMetadata(AATags);
       }
     }
 
@@ -798,11 +809,11 @@
   // We start with an alignment of one and try to find instructions that allow
   // us to prove better alignment.
   unsigned Alignment = 1;
-  MDNode *TBAATag = nullptr;
+  AAMDNodes AATags;
 
   // Check that all of the pointers in the alias set have the same type.  We
   // cannot (yet) promote a memory location that is loaded and stored in
-  // different sizes.  While we are at it, collect alignment and TBAA info.
+  // different sizes.  While we are at it, collect alignment and AA info.
   for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
     Value *ASIV = ASI->getValue();
     PointerMustAliases.insert(ASIV);
@@ -855,13 +866,12 @@
       } else
         return; // Not a load or store.
 
-      // Merge the TBAA tags.
+      // Merge the AA tags.
       if (LoopUses.empty()) {
-        // On the first load/store, just take its TBAA tag.
-        TBAATag = UI->getMetadata(LLVMContext::MD_tbaa);
-      } else if (TBAATag) {
-        TBAATag = MDNode::getMostGenericTBAA(TBAATag,
-                                       UI->getMetadata(LLVMContext::MD_tbaa));
+        // On the first load/store, just take its AA tags.
+        UI->getAAMetadata(AATags);
+      } else if (AATags) {
+        UI->getAAMetadata(AATags, /* Merge = */ true);
       }
 
       LoopUses.push_back(UI);
@@ -896,7 +906,7 @@
   SmallVector<PHINode*, 16> NewPHIs;
   SSAUpdater SSA(&NewPHIs);
   LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
-                        InsertPts, PIC, *CurAST, *LI, DL, Alignment, TBAATag);
+                        InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
 
   // Set up the preheader to have a definition of the value.  It is the live-out
   // value from the preheader that uses in the loop will use.
@@ -905,7 +915,7 @@
                  Preheader->getTerminator());
   PreheaderLoad->setAlignment(Alignment);
   PreheaderLoad->setDebugLoc(DL);
-  if (TBAATag) PreheaderLoad->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+  if (AATags) PreheaderLoad->setAAMetadata(AATags);
   SSA.AddAvailableValue(Preheader, PreheaderLoad);
 
   // Rewrite all the loads in the loop and remember all the definitions from
@@ -936,3 +946,13 @@
 
   AST->deleteValue(V);
 }
+
+/// Simple Analysis hook. Delete loop L's AliasSetTracker and its map entry.
+void LICM::deleteAnalysisLoop(Loop *L) {
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+  if (!AST)
+    return; // No tracker to free for this loop.
+
+  delete AST;
+  LoopToAliasSetMap.erase(L);
+}

diff --git a/lib/Transforms/Scalar/LLVMBuild.txt b/lib/Transforms/Scalar/LLVMBuild.txt
index 1f6df7d..2bb49a3 100644
--- a/lib/Transforms/Scalar/LLVMBuild.txt
+++ b/lib/Transforms/Scalar/LLVMBuild.txt

@@ -20,4 +20,4 @@
 name = Scalar
 parent = Transforms
 library_name = ScalarOpts
-required_libraries = Analysis Core IPA InstCombine Support Target TransformUtils
+required_libraries = Analysis Core InstCombine ProfileData Support Target TransformUtils

diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
index 846aa70..11e4d76 100644
--- a/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/lib/Transforms/Scalar/LoadCombine.cpp

@@ -15,6 +15,8 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/TargetFolder.h"
 #include "llvm/Pass.h"
 #include "llvm/IR/DataLayout.h"
@@ -51,13 +53,16 @@
 class LoadCombine : public BasicBlockPass {
   LLVMContext *C;
   const DataLayout *DL;
+  AliasAnalysis *AA;
 
 public:
   LoadCombine()
       : BasicBlockPass(ID),
-        C(nullptr), DL(nullptr) {
+        C(nullptr), DL(nullptr), AA(nullptr) {
     initializeSROAPass(*PassRegistry::getPassRegistry());
   }
+  
+  using llvm::Pass::doInitialization;
   bool doInitialization(Function &) override;
   bool runOnBasicBlock(BasicBlock &BB) override;
   void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -223,19 +228,23 @@
   if (skipOptnoneFunction(BB) || !DL)
     return false;
 
+  AA = &getAnalysis<AliasAnalysis>();
+
   IRBuilder<true, TargetFolder>
   TheBuilder(BB.getContext(), TargetFolder(DL));
   Builder = &TheBuilder;
 
   DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap;
+  AliasSetTracker AST(*AA);
 
   bool Combined = false;
   unsigned Index = 0;
   for (auto &I : BB) {
-    if (I.mayWriteToMemory() || I.mayThrow()) {
+    if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
       if (combineLoads(LoadMap))
         Combined = true;
       LoadMap.clear();
+      AST.clear();
       continue;
     }
     LoadInst *LI = dyn_cast<LoadInst>(&I);
@@ -248,6 +257,7 @@
     if (!POP.Pointer)
       continue;
     LoadMap[POP.Pointer].push_back(LoadPOPPair(LI, POP, Index++));
+    AST.add(LI);
   }
   if (combineLoads(LoadMap))
     Combined = true;
@@ -256,6 +266,9 @@
 
 void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
+
+  AU.addRequired<AliasAnalysis>();
+  AU.addPreserved<AliasAnalysis>();
 }
 
 char LoadCombine::ID = 0;
@@ -264,5 +277,9 @@
   return new LoadCombine();
 }
 
-INITIALIZE_PASS(LoadCombine, "load-combine", "Combine Adjacent Loads", false,
-                false)
+INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads",
+                      false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads",
+                    false, false)
+

diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 5ab686a..1d1f33a 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp

@@ -239,9 +239,8 @@
   LoopInfo &loopInfo = getAnalysis<LoopInfo>();
   SmallPtrSet<BasicBlock*, 8> blocks;
   blocks.insert(L->block_begin(), L->block_end());
-  for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(),
-       E = blocks.end(); I != E; ++I)
-    loopInfo.removeBlock(*I);
+  for (BasicBlock *BB : blocks)
+    loopInfo.removeBlock(BB);
 
   // The last step is to inform the loop pass manager that we've
   // eliminated this loop.

diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index ab1a939..8fd7c8f 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp

@@ -14,6 +14,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -41,6 +42,7 @@
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
+      AU.addRequired<AssumptionTracker>();
       AU.addRequired<LoopInfo>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
@@ -54,6 +56,7 @@
 char LoopInstSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
                 "Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
@@ -76,6 +79,7 @@
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
   const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
 
   SmallVector<BasicBlock*, 8> ExitBlocks;
   L->getUniqueExitBlocks(ExitBlocks);
@@ -116,7 +120,7 @@
 
         // Don't bother simplifying unused instructions.
         if (!I->use_empty()) {
-          Value *V = SimplifyInstruction(I, DL, TLI, DT);
+          Value *V = SimplifyInstruction(I, DL, TLI, DT, AT);
           if (V && LI->replacementPreservesLCSSAForm(I, V)) {
             // Mark all uses for resimplification next time round the loop.
             for (User *U : I->users())
@@ -148,7 +152,7 @@
       for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
            ++SI) {
         BasicBlock *SuccBB = *SI;
-        if (!Visited.insert(SuccBB))
+        if (!Visited.insert(SuccBB).second)
           continue;
 
         const Loop *SuccLoop = LI->getLoopFor(SuccBB);
@@ -161,7 +165,7 @@
 
           for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
             BasicBlock *ExitBB = SubLoopExitBlocks[i];
-            if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB))
+            if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB).second)
               VisitStack.push_back(WorklistItem(ExitBB, false));
           }
 

diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
index b6fbb16..8f12204 100644
--- a/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp

@@ -215,9 +215,7 @@
       typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
 
       // Add a new possible reduction.
-      void addSLR(SimpleLoopReduction &SLR) {
-        PossibleReds.push_back(SLR);
-      }
+      void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
 
       // Setup to track possible reductions corresponding to the provided
       // rerolling scale. Only reductions with a number of non-PHI instructions
@@ -225,7 +223,8 @@
       // are filled in:
       //   - A set of all possible instructions in eligible reductions.
       //   - A set of all PHIs in eligible reductions
-      //   - A set of all reduced values (last instructions) in eligible reductions.
+      //   - A set of all reduced values (last instructions) in eligible
+      //     reductions.
       void restrictToScale(uint64_t Scale,
                            SmallInstructionSet &PossibleRedSet,
                            SmallInstructionSet &PossibleRedPHISet,
@@ -238,13 +237,12 @@
           if (PossibleReds[i].size() % Scale == 0) {
             PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
             PossibleRedPHISet.insert(PossibleReds[i].getPHI());
-      
+
             PossibleRedSet.insert(PossibleReds[i].getPHI());
             PossibleRedIdx[PossibleReds[i].getPHI()] = i;
-            for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
-                 JE = PossibleReds[i].end(); J != JE; ++J) {
-              PossibleRedSet.insert(*J);
-              PossibleRedIdx[*J] = i;
+            for (Instruction *J : PossibleReds[i]) {
+              PossibleRedSet.insert(J);
+              PossibleRedIdx[J] = i;
             }
           }
       }
@@ -487,7 +485,7 @@
           if (PN->getIncomingBlock(U) == L->getHeader())
             continue;
         }
-  
+
         if (L->contains(User) && !Exclude.count(User)) {
           Queue.push_back(User);
         }
@@ -659,16 +657,15 @@
        RI != RIE; ++RI) {
     int i = *RI;
     int PrevIter = 0, BaseCount = 0, Count = 0;
-    for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
-         JE = PossibleReds[i].end(); J != JE; ++J) {
-	// Note that all instructions in the chain must have been found because
-	// all instructions in the function must have been assigned to some
-	// iteration.
-      int Iter = PossibleRedIter[*J];
+    for (Instruction *J : PossibleReds[i]) {
+      // Note that all instructions in the chain must have been found because
+      // all instructions in the function must have been assigned to some
+      // iteration.
+      int Iter = PossibleRedIter[J];
       if (Iter != PrevIter && Iter != PrevIter + 1 &&
           !PossibleReds[i].getReducedValue()->isAssociative()) {
         DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
-                        *J << "\n");
+                        J << "\n");
         return false;
       }
 
@@ -881,7 +878,7 @@
           // needed because otherwise isSafeToSpeculativelyExecute returns
           // false on PHI nodes.
           if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
-            FutureSideEffects = true; 
+            FutureSideEffects = true;
         }
 
         ++J2;
@@ -952,9 +949,9 @@
         for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
           Value *Op2 = J2->getOperand(j);
 
-	  // If this is part of a reduction (and the operation is not
-	  // associatve), then we match all operands, but not those that are
-	  // part of the reduction.
+          // If this is part of a reduction (and the operation is not
+          // associative), then we match all operands, but not those that are
+          // part of the reduction.
           if (InReduction)
             if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
               if (Reductions.isPairInSame(J2, Op2I))
@@ -968,11 +965,11 @@
             Op2 = IV;
 
           if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
-	    // If we've not already decided to swap the matched operands, and
-	    // we've not already matched our first operand (note that we could
-	    // have skipped matching the first operand because it is part of a
-	    // reduction above), and the instruction is commutative, then try
-	    // the swapped match.
+            // If we've not already decided to swap the matched operands, and
+            // we've not already matched our first operand (note that we could
+            // have skipped matching the first operand because it is part of a
+            // reduction above), and the instruction is commutative, then try
+            // the swapped match.
             if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
                 J1->getOperand(!j) == Op2) {
               Swapped = true;
@@ -1069,7 +1066,7 @@
       continue;
     }
 
-    ++J; 
+    ++J;
   }
 
   // Insert the new induction variable.
@@ -1110,9 +1107,9 @@
           ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
                                             Preheader->getTerminator());
         }
- 
-        Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1,
-                                   "exitcond");
+
+        Value *Cond =
+            new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond");
         BI->setCondition(Cond);
 
         if (BI->getSuccessor(1) != Header)
@@ -1182,4 +1179,3 @@
 
   return Changed;
 }
-

diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 2ce5831..afd2eca 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp

@@ -13,6 +13,7 @@
 
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -53,6 +54,7 @@
 
     // LCSSA form makes instruction renaming easier.
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionTracker>();
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<LoopInfo>();
       AU.addPreserved<LoopInfo>();
@@ -72,12 +74,14 @@
     unsigned MaxHeaderSize;
     LoopInfo *LI;
     const TargetTransformInfo *TTI;
+    AssumptionTracker *AT;
   };
 }
 
 char LoopRotate::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -98,6 +102,7 @@
 
   LI = &getAnalysis<LoopInfo>();
   TTI = &getAnalysis<TargetTransformInfo>();
+  AT = &getAnalysis<AssumptionTracker>();
 
   // Simplify the loop latch before attempting to rotate the header
   // upward. Rotation may not be needed if the loop tail can be folded into the
@@ -184,13 +189,18 @@
   }
 }
 
-/// Determine whether the instructions in this range my be safely and cheaply
+/// Determine whether the instructions in this range may be safely and cheaply
 /// speculated. This is not an important enough situation to develop complex
 /// heuristics. We handle a single arithmetic instruction along with any type
 /// conversions.
 static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
-                                  BasicBlock::iterator End) {
+                                  BasicBlock::iterator End, Loop *L) {
   bool seenIncrement = false;
+  bool MultiExitLoop = false;
+
+  if (!L->getExitingBlock())
+    MultiExitLoop = true;
+
   for (BasicBlock::iterator I = Begin; I != End; ++I) {
 
     if (!isSafeToSpeculativelyExecute(I))
@@ -214,11 +224,33 @@
     case Instruction::Xor:
     case Instruction::Shl:
     case Instruction::LShr:
-    case Instruction::AShr:
+    case Instruction::AShr: {
+      Value *IVOpnd = nullptr;
+      if (isa<ConstantInt>(I->getOperand(0)))
+        IVOpnd = I->getOperand(1);
+
+      if (isa<ConstantInt>(I->getOperand(1))) {
+        if (IVOpnd)
+          return false;
+
+        IVOpnd = I->getOperand(0);
+      }
+
+      // If the increment operand is used outside of the loop, this
+      // speculation could cause extra live range interference.
+      if (MultiExitLoop && IVOpnd) {
+        for (User *UseI : IVOpnd->users()) {
+          auto *UserInst = cast<Instruction>(UseI);
+          if (!L->contains(UserInst))
+            return false;
+        }
+      }
+
       if (seenIncrement)
         return false;
       seenIncrement = true;
       break;
+    }
     case Instruction::Trunc:
     case Instruction::ZExt:
     case Instruction::SExt:
@@ -232,7 +264,7 @@
 /// Fold the loop tail into the loop exit by speculating the loop tail
 /// instructions. Typically, this is a single post-increment. In the case of a
 /// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the cast of loops with early exits,
+/// duplicating the entire loop header. In the case of loops with early exits,
 /// rotation will not work anyway, but simplifyLoopLatch will put the loop in
 /// canonical form so downstream passes can handle it.
 ///
@@ -254,7 +286,7 @@
   if (!BI)
     return false;
 
-  if (!shouldSpeculateInstrs(Latch->begin(), Jmp))
+  if (!shouldSpeculateInstrs(Latch->begin(), Jmp, L))
     return false;
 
   DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
@@ -323,8 +355,11 @@
   // Check size of original header and reject loop if it is very big or we can't
   // duplicate blocks inside it.
   {
+    SmallPtrSet<const Value *, 32> EphValues;
+    CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+
     CodeMetrics Metrics;
-    Metrics.analyzeBasicBlock(OrigHeader, *TTI);
+    Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
     if (Metrics.notDuplicatable) {
       DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
             << " instructions: "; L->dump());
@@ -406,6 +441,7 @@
     // With the operands remapped, see if the instruction constant folds or is
     // otherwise simplifyable.  This commonly occurs because the entry from PHI
     // nodes allows icmps and other instructions to fold.
+    // FIXME: Provide DL, TLI, DT, AT to SimplifyInstruction.
     Value *V = SimplifyInstruction(C);
     if (V && LI->replacementPreservesLCSSAForm(C, V)) {
       // If so, then delete the temporary instruction and stick the folded value

diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 914b56a..7b60373 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp

@@ -744,7 +744,7 @@
 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an
 /// obvious multiple of the UDivExpr.
 static bool isHighCostExpansion(const SCEV *S,
-                                SmallPtrSet<const SCEV*, 8> &Processed,
+                                SmallPtrSetImpl<const SCEV*> &Processed,
                                 ScalarEvolution &SE) {
   // Zero/One operand expressions
   switch (S->getSCEVType()) {
@@ -762,7 +762,7 @@
                                Processed, SE);
   }
 
-  if (!Processed.insert(S))
+  if (!Processed.insert(S).second)
     return false;
 
   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
@@ -892,34 +892,34 @@
 
   void RateFormula(const TargetTransformInfo &TTI,
                    const Formula &F,
-                   SmallPtrSet<const SCEV *, 16> &Regs,
+                   SmallPtrSetImpl<const SCEV *> &Regs,
                    const DenseSet<const SCEV *> &VisitedRegs,
                    const Loop *L,
                    const SmallVectorImpl<int64_t> &Offsets,
                    ScalarEvolution &SE, DominatorTree &DT,
                    const LSRUse &LU,
-                   SmallPtrSet<const SCEV *, 16> *LoserRegs = nullptr);
+                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
 
   void print(raw_ostream &OS) const;
   void dump() const;
 
 private:
   void RateRegister(const SCEV *Reg,
-                    SmallPtrSet<const SCEV *, 16> &Regs,
+                    SmallPtrSetImpl<const SCEV *> &Regs,
                     const Loop *L,
                     ScalarEvolution &SE, DominatorTree &DT);
   void RatePrimaryRegister(const SCEV *Reg,
-                           SmallPtrSet<const SCEV *, 16> &Regs,
+                           SmallPtrSetImpl<const SCEV *> &Regs,
                            const Loop *L,
                            ScalarEvolution &SE, DominatorTree &DT,
-                           SmallPtrSet<const SCEV *, 16> *LoserRegs);
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
 };
 
 }
 
 /// RateRegister - Tally up interesting quantities from the given register.
 void Cost::RateRegister(const SCEV *Reg,
-                        SmallPtrSet<const SCEV *, 16> &Regs,
+                        SmallPtrSetImpl<const SCEV *> &Regs,
                         const Loop *L,
                         ScalarEvolution &SE, DominatorTree &DT) {
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
@@ -967,15 +967,15 @@
 /// before, rate it. Optional LoserRegs provides a way to declare any formula
 /// that refers to one of those regs an instant loser.
 void Cost::RatePrimaryRegister(const SCEV *Reg,
-                               SmallPtrSet<const SCEV *, 16> &Regs,
+                               SmallPtrSetImpl<const SCEV *> &Regs,
                                const Loop *L,
                                ScalarEvolution &SE, DominatorTree &DT,
-                               SmallPtrSet<const SCEV *, 16> *LoserRegs) {
+                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   if (LoserRegs && LoserRegs->count(Reg)) {
     Lose();
     return;
   }
-  if (Regs.insert(Reg)) {
+  if (Regs.insert(Reg).second) {
     RateRegister(Reg, Regs, L, SE, DT);
     if (LoserRegs && isLoser())
       LoserRegs->insert(Reg);
@@ -984,13 +984,13 @@
 
 void Cost::RateFormula(const TargetTransformInfo &TTI,
                        const Formula &F,
-                       SmallPtrSet<const SCEV *, 16> &Regs,
+                       SmallPtrSetImpl<const SCEV *> &Regs,
                        const DenseSet<const SCEV *> &VisitedRegs,
                        const Loop *L,
                        const SmallVectorImpl<int64_t> &Offsets,
                        ScalarEvolution &SE, DominatorTree &DT,
                        const LSRUse &LU,
-                       SmallPtrSet<const SCEV *, 16> *LoserRegs) {
+                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
   // Tally up the registers.
   if (const SCEV *ScaledReg = F.ScaledReg) {
@@ -1337,10 +1337,9 @@
   }
 
   // Update the RegTracker.
-  for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
-       E = OldRegs.end(); I != E; ++I)
-    if (!Regs.count(*I))
-      RegUses.DropRegister(*I, LUIdx);
+  for (const SCEV *S : OldRegs)
+    if (!Regs.count(S))
+      RegUses.DropRegister(S, LUIdx);
 }
 
 void LSRUse::print(raw_ostream &OS) const {
@@ -2226,13 +2225,12 @@
   // must dominate all the post-inc comparisons we just set up, and it must
   // dominate the loop latch edge.
   IVIncInsertPos = L->getLoopLatch()->getTerminator();
-  for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
-       E = PostIncs.end(); I != E; ++I) {
+  for (Instruction *Inst : PostIncs) {
     BasicBlock *BB =
       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
-                                    (*I)->getParent());
-    if (BB == (*I)->getParent())
-      IVIncInsertPos = *I;
+                                    Inst->getParent());
+    if (BB == Inst->getParent())
+      IVIncInsertPos = Inst;
     else if (BB != IVIncInsertPos->getParent())
       IVIncInsertPos = BB->getTerminator();
   }
@@ -2557,7 +2555,7 @@
 ///
 /// TODO: Consider IVInc free if it's already used in another chains.
 static bool
-isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
+isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
                   ScalarEvolution &SE, const TargetTransformInfo &TTI) {
   if (StressIVChain)
     return true;
@@ -2567,9 +2565,8 @@
 
   if (!Users.empty()) {
     DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
-          for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
-                 E = Users.end(); I != E; ++I) {
-            dbgs() << "  " << **I << "\n";
+          for (Instruction *Inst : Users) {
+            dbgs() << "  " << *Inst << "\n";
           });
     return false;
   }
@@ -2805,7 +2802,7 @@
       User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
       while (IVOpIter != IVOpEnd) {
         Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
-        if (UniqueOperands.insert(IVOpInst))
+        if (UniqueOperands.insert(IVOpInst).second)
           ChainInstruction(I, IVOpInst, ChainUsersVec);
         IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
       }
@@ -3119,11 +3116,15 @@
 void
 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
-  SmallPtrSet<const SCEV *, 8> Inserted;
+  SmallPtrSet<const SCEV *, 32> Visited;
 
   while (!Worklist.empty()) {
     const SCEV *S = Worklist.pop_back_val();
 
+    // Don't process the same SCEV twice.
+    if (!Visited.insert(S).second)
+      continue;
+
     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
       Worklist.append(N->op_begin(), N->op_end());
     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
@@ -3132,7 +3133,6 @@
       Worklist.push_back(D->getLHS());
       Worklist.push_back(D->getRHS());
     } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
-      if (!Inserted.insert(US)) continue;
       const Value *V = US->getValue();
       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
         // Look for instructions defined outside the loop.
@@ -3774,7 +3774,7 @@
         for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
              LUIdx = UsedByIndices.find_next(LUIdx))
           // Make a memo of this use, offset, and register tuple.
-          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
+          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
       }
     }
@@ -4302,10 +4302,9 @@
   // reference that register in order to be considered. This prunes out
   // unprofitable searching.
   SmallSetVector<const SCEV *, 4> ReqRegs;
-  for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
-       E = CurRegs.end(); I != E; ++I)
-    if (LU.Regs.count(*I))
-      ReqRegs.insert(*I);
+  for (const SCEV *S : CurRegs)
+    if (LU.Regs.count(S))
+      ReqRegs.insert(S);
 
   SmallPtrSet<const SCEV *, 16> NewRegs;
   Cost NewCost;
@@ -4350,9 +4349,8 @@
       } else {
         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
               dbgs() << ".\n Regs:";
-              for (SmallPtrSet<const SCEV *, 16>::const_iterator
-                   I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
-                dbgs() << ' ' << **I;
+              for (const SCEV *S : NewRegs)
+                dbgs() << ' ' << *S;
               dbgs() << '\n');
 
         SolutionCost = NewCost;

diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 00c0f88..f60d990 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp

@@ -13,7 +13,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/FunctionTargetTransformInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -52,7 +54,7 @@
 
 static cl::opt<unsigned>
 PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
-  cl::desc("Unrolled size limit for loops with an unroll(enable) or "
+  cl::desc("Unrolled size limit for loops with an unroll(full) or "
            "unroll_count pragma."));
 
 namespace {
@@ -102,6 +104,7 @@
     /// loop preheaders be inserted into the CFG...
     ///
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionTracker>();
       AU.addRequired<LoopInfo>();
       AU.addPreserved<LoopInfo>();
       AU.addRequiredID(LoopSimplifyID);
@@ -111,6 +114,7 @@
       AU.addRequired<ScalarEvolution>();
       AU.addPreserved<ScalarEvolution>();
       AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<FunctionTargetTransformInfo>();
       // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
       // If loop unroll does not preserve dom info then LCSSA pass on next
       // loop will receive invalid dom info.
@@ -120,7 +124,7 @@
 
     // Fill in the UnrollingPreferences parameter with values from the
     // TargetTransformationInfo.
-    void getUnrollingPreferences(Loop *L, const TargetTransformInfo &TTI,
+    void getUnrollingPreferences(Loop *L, const FunctionTargetTransformInfo &FTTI,
                                  TargetTransformInfo::UnrollingPreferences &UP) {
       UP.Threshold = CurrentThreshold;
       UP.OptSizeThreshold = OptSizeUnrollThreshold;
@@ -130,7 +134,7 @@
       UP.MaxCount = UINT_MAX;
       UP.Partial = CurrentAllowPartial;
       UP.Runtime = CurrentRuntime;
-      TTI.getUnrollingPreferences(L, UP);
+      FTTI.getUnrollingPreferences(L, UP);
     }
 
     // Select and return an unroll count based on parameters from
@@ -138,12 +142,11 @@
     // SetExplicitly is set to true if the unroll count is is set by
     // the user or a pragma rather than selected heuristically.
     unsigned
-    selectUnrollCount(const Loop *L, unsigned TripCount, bool HasEnablePragma,
+    selectUnrollCount(const Loop *L, unsigned TripCount, bool PragmaFullUnroll,
                       unsigned PragmaCount,
                       const TargetTransformInfo::UnrollingPreferences &UP,
                       bool &SetExplicitly);
 
-
     // Select threshold values used to limit unrolling based on a
     // total unrolled size.  Parameters Threshold and PartialThreshold
     // are set to the maximum unrolled size for fully and partially
@@ -183,6 +186,8 @@
 char LoopUnroll::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(FunctionTargetTransformInfo)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -201,11 +206,15 @@
 /// ApproximateLoopSize - Approximate the size of the loop.
 static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
                                     bool &NotDuplicatable,
-                                    const TargetTransformInfo &TTI) {
+                                    const TargetTransformInfo &TTI,
+                                    AssumptionTracker *AT) {
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+
   CodeMetrics Metrics;
   for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
        I != E; ++I)
-    Metrics.analyzeBasicBlock(*I, TTI);
+    Metrics.analyzeBasicBlock(*I, TTI, EphValues);
   NumCalls = Metrics.NumInlineCandidates;
   NotDuplicatable = Metrics.notDuplicatable;
 
@@ -219,13 +228,13 @@
   return LoopSize;
 }
 
-// Returns the value associated with the given metadata node name (for
-// example, "llvm.loop.unroll.count").  If no such named metadata node
-// exists, then nullptr is returned.
-static const ConstantInt *GetUnrollMetadataValue(const Loop *L,
-                                                 StringRef Name) {
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count").  If no such metadata node exists, then nullptr is
+// returned.
+static const MDNode *GetUnrollMetadata(const Loop *L, StringRef Name) {
   MDNode *LoopID = L->getLoopID();
-  if (!LoopID) return nullptr;
+  if (!LoopID)
+    return nullptr;
 
   // First operand should refer to the loop id itself.
   assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
@@ -233,49 +242,80 @@
 
   for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
     const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
-    if (!MD) continue;
+    if (!MD)
+      continue;
 
     const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
-    if (!S) continue;
+    if (!S)
+      continue;
 
-    if (Name.equals(S->getString())) {
-      assert(MD->getNumOperands() == 2 &&
-             "Unroll hint metadata should have two operands.");
-      return cast<ConstantInt>(MD->getOperand(1));
-    }
+    if (Name.equals(S->getString()))
+      return MD;
   }
   return nullptr;
 }
 
-// Returns true if the loop has an unroll(enable) pragma.
-static bool HasUnrollEnablePragma(const Loop *L) {
-  const ConstantInt *EnableValue =
-      GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
-  return (EnableValue && EnableValue->getZExtValue());
+// Returns true if the loop has an unroll(full) pragma.
+static bool HasUnrollFullPragma(const Loop *L) {
+  return GetUnrollMetadata(L, "llvm.loop.unroll.full");
 }
 
 // Returns true if the loop has an unroll(disable) pragma.
 static bool HasUnrollDisablePragma(const Loop *L) {
-  const ConstantInt *EnableValue =
-      GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
-  return (EnableValue && !EnableValue->getZExtValue());
+  return GetUnrollMetadata(L, "llvm.loop.unroll.disable");
 }
 
 // If loop has an unroll_count pragma return the (necessarily
 // positive) value from the pragma.  Otherwise return 0.
 static unsigned UnrollCountPragmaValue(const Loop *L) {
-  const ConstantInt *CountValue =
-      GetUnrollMetadataValue(L, "llvm.loop.unroll.count");
-  if (CountValue) {
-    unsigned Count = CountValue->getZExtValue();
+  const MDNode *MD = GetUnrollMetadata(L, "llvm.loop.unroll.count");
+  if (MD) {
+    assert(MD->getNumOperands() == 2 &&
+           "Unroll count hint metadata should have two operands.");
+    unsigned Count = cast<ConstantInt>(MD->getOperand(1))->getZExtValue();
     assert(Count >= 1 && "Unroll count must be positive.");
     return Count;
   }
   return 0;
 }
 
+// Remove existing unroll metadata and add unroll disable metadata to
+// indicate the loop has already been unrolled.  This prevents a loop
+// from being unrolled more than is directed by a pragma if the loop
+// unrolling pass is run more than once (which it generally is).
+static void SetLoopAlreadyUnrolled(Loop *L) {
+  MDNode *LoopID = L->getLoopID();
+  if (!LoopID) return;
+
+  // First remove any existing loop unrolling metadata.
+  SmallVector<Value *, 4> Vals;
+  // Reserve first location for self reference to the LoopID metadata node.
+  Vals.push_back(nullptr);
+  for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+    bool IsUnrollMetadata = false;
+    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+    if (MD) {
+      const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+      IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
+    }
+    if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i));
+  }
+
+  // Add unroll(disable) metadata to disable future unrolling.
+  LLVMContext &Context = L->getHeader()->getContext();
+  SmallVector<Value *, 1> DisableOperands;
+  DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
+  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+  Vals.push_back(DisableNode);
+
+  MDNode *NewLoopID = MDNode::get(Context, Vals);
+  // Set operand 0 to refer to the loop id itself.
+  NewLoopID->replaceOperandWith(0, NewLoopID);
+  L->setLoopID(NewLoopID);
+}
+
 unsigned LoopUnroll::selectUnrollCount(
-    const Loop *L, unsigned TripCount, bool HasEnablePragma,
+    const Loop *L, unsigned TripCount, bool PragmaFullUnroll,
     unsigned PragmaCount, const TargetTransformInfo::UnrollingPreferences &UP,
     bool &SetExplicitly) {
   SetExplicitly = true;
@@ -289,9 +329,7 @@
   if (Count == 0) {
     if (PragmaCount) {
       Count = PragmaCount;
-    } else if (HasEnablePragma) {
-      // unroll(enable) pragma without an unroll_count pragma
-      // indicates to unroll loop fully.
+    } else if (PragmaFullUnroll) {
       Count = TripCount;
     }
   }
@@ -323,6 +361,9 @@
   LoopInfo *LI = &getAnalysis<LoopInfo>();
   ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
   const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+  const FunctionTargetTransformInfo &FTTI =
+      getAnalysis<FunctionTargetTransformInfo>();
+  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
 
   BasicBlock *Header = L->getHeader();
   DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
@@ -331,35 +372,37 @@
   if (HasUnrollDisablePragma(L)) {
     return false;
   }
-  bool HasEnablePragma = HasUnrollEnablePragma(L);
+  bool PragmaFullUnroll = HasUnrollFullPragma(L);
   unsigned PragmaCount = UnrollCountPragmaValue(L);
-  bool HasPragma = HasEnablePragma || PragmaCount > 0;
+  bool HasPragma = PragmaFullUnroll || PragmaCount > 0;
 
   TargetTransformInfo::UnrollingPreferences UP;
-  getUnrollingPreferences(L, TTI, UP);
+  getUnrollingPreferences(L, FTTI, UP);
 
   // Find trip count and trip multiple if count is not available
   unsigned TripCount = 0;
   unsigned TripMultiple = 1;
-  // Find "latch trip count". UnrollLoop assumes that control cannot exit
-  // via the loop latch on any iteration prior to TripCount. The loop may exit
-  // early via an earlier branch.
-  BasicBlock *LatchBlock = L->getLoopLatch();
-  if (LatchBlock) {
-    TripCount = SE->getSmallConstantTripCount(L, LatchBlock);
-    TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock);
+  // If there are multiple exiting blocks but one of them is the latch, use the
+  // latch for the trip count estimation. Otherwise insist on a single exiting
+  // block for the trip count estimation.
+  BasicBlock *ExitingBlock = L->getLoopLatch();
+  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
+    ExitingBlock = L->getExitingBlock();
+  if (ExitingBlock) {
+    TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
+    TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
   }
 
   // Select an initial unroll count.  This may be reduced later based
   // on size thresholds.
   bool CountSetExplicitly;
-  unsigned Count = selectUnrollCount(L, TripCount, HasEnablePragma, PragmaCount,
-                                     UP, CountSetExplicitly);
+  unsigned Count = selectUnrollCount(L, TripCount, PragmaFullUnroll,
+                                     PragmaCount, UP, CountSetExplicitly);
 
   unsigned NumInlineCandidates;
   bool notDuplicatable;
   unsigned LoopSize =
-      ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI);
+      ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, AT);
   DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
   uint64_t UnrolledSize = (uint64_t)LoopSize * Count;
   if (notDuplicatable) {
@@ -428,21 +471,26 @@
   }
 
   if (HasPragma) {
+    if (PragmaCount != 0)
+      // If loop has an unroll count pragma mark loop as unrolled to prevent
+      // unrolling beyond that requested by the pragma.
+      SetLoopAlreadyUnrolled(L);
+
     // Emit optimization remarks if we are unable to unroll the loop
     // as directed by a pragma.
     DebugLoc LoopLoc = L->getStartLoc();
     Function *F = Header->getParent();
     LLVMContext &Ctx = F->getContext();
-    if (HasEnablePragma && PragmaCount == 0) {
+    if (PragmaFullUnroll && PragmaCount == 0) {
       if (TripCount && Count != TripCount) {
         emitOptimizationRemarkMissed(
             Ctx, DEBUG_TYPE, *F, LoopLoc,
-            "Unable to fully unroll loop as directed by unroll(enable) pragma "
+            "Unable to fully unroll loop as directed by unroll(full) pragma "
             "because unrolled size is too large.");
       } else if (!TripCount) {
         emitOptimizationRemarkMissed(
             Ctx, DEBUG_TYPE, *F, LoopLoc,
-            "Unable to fully unroll loop as directed by unroll(enable) pragma "
+            "Unable to fully unroll loop as directed by unroll(full) pragma "
             "because loop has a runtime trip count.");
       }
     } else if (PragmaCount > 0 && Count != OriginalCount) {
@@ -460,7 +508,8 @@
   }
 
   // Unroll the loop.
-  if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this, &LPM))
+  if (!UnrollLoop(L, Count, TripCount, AllowRuntime, TripMultiple, LI, this,
+                  &LPM, AT))
     return false;
 
   return true;

diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 977c53a..ef43483 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp

@@ -30,6 +30,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -103,7 +104,8 @@
 
       // Analyze loop. Check its size, calculate is it possible to unswitch
       // it. Returns true if we can unswitch this loop.
-      bool countLoop(const Loop *L, const TargetTransformInfo &TTI);
+      bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
+                     AssumptionTracker *AT);
 
       // Clean all data related to given loop.
       void forgetLoop(const Loop *L);
@@ -126,6 +128,7 @@
   class LoopUnswitch : public LoopPass {
     LoopInfo *LI;  // Loop information
     LPPassManager *LPM;
+    AssumptionTracker *AT;
 
     // LoopProcessWorklist - Used to check if second loop needs processing
     // after RewriteLoopBodyWithConditionConstant rewrites first loop.
@@ -164,6 +167,7 @@
     /// loop preheaders be inserted into the CFG.
     ///
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionTracker>();
       AU.addRequiredID(LoopSimplifyID);
       AU.addPreservedID(LoopSimplifyID);
       AU.addRequired<LoopInfo>();
@@ -212,7 +216,8 @@
 
 // Analyze loop. Check its size, calculate is it possible to unswitch
 // it. Returns true if we can unswitch this loop.
-bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
+                                AssumptionTracker *AT) {
 
   LoopPropsMapIt PropsIt;
   bool Inserted;
@@ -229,13 +234,16 @@
     // large numbers of branches which cause loop unswitching to go crazy.
     // This is a very ad-hoc heuristic.
 
+    SmallPtrSet<const Value *, 32> EphValues;
+    CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+
     // FIXME: This is overly conservative because it does not take into
     // consideration code simplification opportunities and code that can
     // be shared by the resultant unswitched loops.
     CodeMetrics Metrics;
     for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
          I != E; ++I)
-      Metrics.analyzeBasicBlock(*I, TTI);
+      Metrics.analyzeBasicBlock(*I, TTI, EphValues);
 
     Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5);
     Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
@@ -326,6 +334,7 @@
 INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                       false, false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -376,6 +385,7 @@
   if (skipOptnoneFunction(L))
     return false;
 
+  AT = &getAnalysis<AssumptionTracker>();
   LI = &getAnalysis<LoopInfo>();
   LPM = &LPM_Ref;
   DominatorTreeWrapperPass *DTWP =
@@ -421,7 +431,8 @@
 
   // Probably we reach the quota of branches for this loop. If so
   // stop unswitching.
-  if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>()))
+  if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>(),
+                              AT))
     return false;
 
   // Loop over all of the basic blocks in the loop.  If we find an interior
@@ -823,6 +834,10 @@
   F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(),
                                 NewBlocks[0], F->end());
 
+  // FIXME: We could register any cloned assumptions instead of clearing the
+  // whole function's cache.
+  AT->forgetCachedAssumptions(F);
+
   // Now we create the new Loop object for the versioned loop.
   Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
 

diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index b6bc792..be524be 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp

@@ -16,6 +16,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
@@ -329,6 +330,7 @@
     // This transformation requires dominator postdominator info
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
+      AU.addRequired<AssumptionTracker>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<MemoryDependenceAnalysis>();
       AU.addRequired<AliasAnalysis>();
@@ -361,6 +363,7 @@
 
 INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
                       false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
@@ -631,22 +634,24 @@
     if (destSize < srcSize)
       return false;
   } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
-    // If the destination is an sret parameter then only accesses that are
-    // outside of the returned struct type can trap.
-    if (!A->hasStructRetAttr())
-      return false;
+    if (A->getDereferenceableBytes() < srcSize) {
+      // If the destination is an sret parameter then only accesses that are
+      // outside of the returned struct type can trap.
+      if (!A->hasStructRetAttr())
+        return false;
 
-    Type *StructTy = cast<PointerType>(A->getType())->getElementType();
-    if (!StructTy->isSized()) {
-      // The call may never return and hence the copy-instruction may never
-      // be executed, and therefore it's not safe to say "the destination
-      // has at least <cpyLen> bytes, as implied by the copy-instruction",
-      return false;
+      Type *StructTy = cast<PointerType>(A->getType())->getElementType();
+      if (!StructTy->isSized()) {
+        // The call may never return and hence the copy-instruction may never
+        // be executed, and therefore it's not safe to say "the destination
+        // has at least <cpyLen> bytes, as implied by the copy-instruction",
+        return false;
+      }
+
+      uint64_t destSize = DL->getTypeAllocSize(StructTy);
+      if (destSize < srcSize)
+        return false;
     }
-
-    uint64_t destSize = DL->getTypeAllocSize(StructTy);
-    if (destSize < srcSize)
-      return false;
   } else {
     return false;
   }
@@ -673,17 +678,31 @@
     if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
       for (User *UU : U->users())
         srcUseList.push_back(UU);
-    } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
-      if (G->hasAllZeroIndices())
-        for (User *UU : U->users())
-          srcUseList.push_back(UU);
-      else
-        return false;
-    } else if (U != C && U != cpy) {
-      return false;
+      continue;
     }
+    if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+      if (!G->hasAllZeroIndices())
+        return false;
+
+      for (User *UU : U->users())
+        srcUseList.push_back(UU);
+      continue;
+    }
+    if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+      if (IT->getIntrinsicID() == Intrinsic::lifetime_start ||
+          IT->getIntrinsicID() == Intrinsic::lifetime_end)
+        continue;
+
+    if (U != C && U != cpy)
+      return false;
   }
 
+  // Check that src isn't captured by the called function since the
+  // transformation can cause aliasing issues in that case.
+  for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+    if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i))
+      return false;
+
   // Since we're changing the parameter to the callsite, we need to make sure
   // that what would be the new parameter dominates the callsite.
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -963,8 +982,11 @@
 
   // If it is greater than the memcpy, then we check to see if we can force the
   // source of the memcpy to the alignment we need.  If we fail, we bail out.
+  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   if (MDep->getAlignment() < ByValAlign &&
-      getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, DL) < ByValAlign)
+      getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign,
+                                 DL, AT, CS.getInstruction(), &DT) < ByValAlign)
     return false;
 
   // Verify that the copied-from memory doesn't change in between the memcpy and

diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
new file mode 100644
index 0000000..8281c59
--- /dev/null
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp

@@ -0,0 +1,604 @@
+//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//! \file
+//! \brief This pass performs merges of loads and stores on both sides of a
+//  diamond (hammock). It hoists the loads and sinks the stores.
+//
+// The algorithm iteratively hoists two loads to the same address out of a
+// diamond (hammock) and merges them into a single load in the header.
+// Similarly, it sinks and merges two stores to the tail block (footer). The
+// algorithm iterates over the instructions of one side of the diamond and
+// attempts to find a matching load/store on the other side. It hoists / sinks
+// when it thinks it is safe to do so. This optimization helps with e.g. hiding
+// load latencies, triggering if-conversion, and reducing static code size.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Example:
+// Diamond shaped code before merge:
+//
+//            header:
+//                     br %cond, label %if.then, label %if.else
+//                        +                    +
+//                       +                      +
+//                      +                        +
+//            if.then:                         if.else:
+//               %lt = load %addr_l               %le = load %addr_l
+//               <use %lt>                        <use %le>
+//               <...>                            <...>
+//               store %st, %addr_s               store %se, %addr_s
+//               br label %if.end                 br label %if.end
+//                     +                         +
+//                      +                       +
+//                       +                     +
+//            if.end ("footer"):
+//                     <...>
+//
+// Diamond shaped code after merge:
+//
+//            header:
+//                     %l = load %addr_l
+//                     br %cond, label %if.then, label %if.else
+//                        +                    +
+//                       +                      +
+//                      +                        +
+//            if.then:                         if.else:
+//               <use %l>                         <use %l>
+//               <...>                            <...>
+//               br label %if.end                 br label %if.end
+//                      +                        +
+//                       +                      +
+//                        +                    +
+//            if.end ("footer"):
+//                     %s.sink = phi [%st, if.then], [%se, if.else]
+//                     <...>
+//                     store %s.sink, %addr_s
+//                     <...>
+//
+//
+//===----------------------- TODO -----------------------------------------===//
+//
+// 1) Generalize to regions other than diamonds
+// 2) Be more aggressive merging memory operations
+// Note that both changes require register pressure control
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <vector>
+using namespace llvm;
+
+#define DEBUG_TYPE "mldst-motion"
+
+//===----------------------------------------------------------------------===//
+//                         MergedLoadStoreMotion Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class MergedLoadStoreMotion : public FunctionPass {
+  AliasAnalysis *AA;
+  MemoryDependenceAnalysis *MD;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit MergedLoadStoreMotion(void)
+      : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) {
+    initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  // Requires TargetLibraryInfo, MemoryDependenceAnalysis and AliasAnalysis.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfo>();
+    AU.addRequired<MemoryDependenceAnalysis>();
+    AU.addRequired<AliasAnalysis>();
+    AU.addPreserved<AliasAnalysis>();
+  }
+
+  // Helper routines
+
+  ///
+  /// \brief Remove instruction from parent and update memory dependence
+  /// analysis.
+  ///
+  void removeInstruction(Instruction *Inst);
+  BasicBlock *getDiamondTail(BasicBlock *BB);
+  bool isDiamondHead(BasicBlock *BB);
+  // Routines for hoisting loads
+  bool isLoadHoistBarrierInRange(const Instruction& Start,
+                                 const Instruction& End,
+                                 LoadInst* LI);
+  LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
+  void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
+                        Instruction *ElseInst);
+  bool isSafeToHoist(Instruction *I) const;
+  bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst);
+  bool mergeLoads(BasicBlock *BB);
+  // Routines for sinking stores
+  StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
+  PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
+  bool isStoreSinkBarrier(Instruction *Inst);
+  bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
+  bool mergeStores(BasicBlock *BB);
+  // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
+  // where Size0 and Size1 are the #instructions on the two sides of
+  // the diamond. The constant chosen here is arbitrary. Compiler Time
+  // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
+  const int MagicCompileTimeControl;
+};
+
+char MergedLoadStoreMotion::ID = 0;
+}
+
+///
+/// \brief createMergedLoadStoreMotionPass - The public interface to this file.
+///
+FunctionPass *llvm::createMergedLoadStoreMotionPass() {
+  return new MergedLoadStoreMotion();
+}
+
+INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion",
+                      "MergedLoadStoreMotion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion",
+                    "MergedLoadStoreMotion", false, false)
+
+///
+/// \brief Remove instruction from parent and update memory dependence analysis.
+///
+void MergedLoadStoreMotion::removeInstruction(Instruction *Inst) {
+  // Notify the memory dependence analysis.
+  if (MD) {
+    MD->removeInstruction(Inst);
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      MD->invalidateCachedPointerInfo(LI->getPointerOperand());
+    if (Inst->getType()->getScalarType()->isPointerTy()) {
+      MD->invalidateCachedPointerInfo(Inst);
+    }
+  }
+  Inst->eraseFromParent();
+}
+
+///
+/// \brief Return tail block of a diamond.
+///
+BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
+  assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
+  BranchInst *BI = (BranchInst *)(BB->getTerminator());
+  BasicBlock *Succ0 = BI->getSuccessor(0);
+  BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0);
+  return Tail;
+}
+
+///
+/// \brief True when BB is the head of a diamond (hammock)
+///
+bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
+  if (!BB)
+    return false;
+  if (!isa<BranchInst>(BB->getTerminator()))
+    return false;
+  if (BB->getTerminator()->getNumSuccessors() != 2)
+    return false;
+
+  BranchInst *BI = (BranchInst *)(BB->getTerminator());
+  BasicBlock *Succ0 = BI->getSuccessor(0);
+  BasicBlock *Succ1 = BI->getSuccessor(1);
+
+  if (!Succ0->getSinglePredecessor() ||
+      Succ0->getTerminator()->getNumSuccessors() != 1)
+    return false;
+  if (!Succ1->getSinglePredecessor() ||
+      Succ1->getTerminator()->getNumSuccessors() != 1)
+    return false;
+
+  BasicBlock *Tail = Succ0->getTerminator()->getSuccessor(0);
+  // Ignore triangles.
+  if (Succ1->getTerminator()->getSuccessor(0) != Tail)
+    return false;
+  return true;
+}
+
+///
+/// \brief True when instruction is a hoist barrier for a load
+///
+/// Whenever an instruction could possibly modify the value
+/// being loaded or protect against the load from happening
+/// it is considered a hoist barrier.
+///
+
+bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, 
+                                                      const Instruction& End,
+                                                      LoadInst* LI) {
+  AliasAnalysis::Location Loc = AA->getLocation(LI);
+  return AA->canInstructionRangeModify(Start, End, Loc);
+}
+
+///
+/// \brief Decide if a load can be hoisted
+///
+/// When there is a load in \p BB1 to the same address as \p Load0
+/// and it can be hoisted from \p BB1, return that load.
+/// Otherwise return nullptr.
+///
+LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,
+                                                   LoadInst *Load0) {
+
+  for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE;
+       ++BBI) {
+    Instruction *Inst = BBI;
+
+    // Only merge and hoist loads when their result is used only in BB1.
+    if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1))
+      continue;
+
+    LoadInst *Load1 = dyn_cast<LoadInst>(Inst);
+    BasicBlock *BB0 = Load0->getParent();
+
+    AliasAnalysis::Location Loc0 = AA->getLocation(Load0);
+    AliasAnalysis::Location Loc1 = AA->getLocation(Load1);
+    if (AA->isMustAlias(Loc0, Loc1) && Load0->isSameOperationAs(Load1) &&
+        !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1) &&
+        !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0)) {
+      return Load1;
+    }
+  }
+  return nullptr;
+}
+
+///
+/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into
+/// \p BB
+///
+/// BB is the head of a diamond
+///
+void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
+                                             Instruction *HoistCand,
+                                             Instruction *ElseInst) {
+  DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump();
+        dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n";
+        dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n");
+  // Hoist the instruction.
+  assert(HoistCand->getParent() != BB);
+
+  // Intersect optional metadata.
+  HoistCand->intersectOptionalDataWith(ElseInst);
+  HoistCand->dropUnknownMetadata();
+
+  // Prepend point for instruction insert
+  Instruction *HoistPt = BB->getTerminator();
+
+  // Merged instruction
+  Instruction *HoistedInst = HoistCand->clone();
+
+  // Notify AA of the new value.
+  if (isa<LoadInst>(HoistCand))
+    AA->copyValue(HoistCand, HoistedInst);
+
+  // Hoist instruction.
+  HoistedInst->insertBefore(HoistPt);
+
+  HoistCand->replaceAllUsesWith(HoistedInst);
+  removeInstruction(HoistCand);
+  // Replace the else block instruction.
+  ElseInst->replaceAllUsesWith(HoistedInst);
+  removeInstruction(ElseInst);
+}
+
+///
+/// \brief Return true if no operand of \p I is defined in I's parent block
+///
+bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
+  BasicBlock *Parent = I->getParent();
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    Instruction *Instr = dyn_cast<Instruction>(I->getOperand(i));
+    if (Instr && Instr->getParent() == Parent)
+      return false;
+  }
+  return true;
+}
+
+///
+/// \brief Merge two equivalent loads and GEPs and hoist into diamond head
+///
+bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
+                                      LoadInst *L1) {
+  // Only one definition?
+  Instruction *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
+  Instruction *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
+  if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) &&
+      A0->hasOneUse() && (A0->getParent() == L0->getParent()) &&
+      A1->hasOneUse() && (A1->getParent() == L1->getParent()) &&
+      isa<GetElementPtrInst>(A0)) {
+    DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump();
+          dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n";
+          dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n");
+    hoistInstruction(BB, A0, A1);
+    hoistInstruction(BB, L0, L1);
+    return true;
+  } else
+    return false;
+}
+
+///
+/// \brief Try to hoist two loads to same address into diamond header
+///
+/// Starting from a diamond head block, iterate over the instructions in one
+/// successor block and try to match a load in the second successor.
+///
+bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
+  bool MergedLoads = false;
+  assert(isDiamondHead(BB));
+  BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  BasicBlock *Succ0 = BI->getSuccessor(0);
+  BasicBlock *Succ1 = BI->getSuccessor(1);
+  // #Instructions in Succ1 for Compile Time Control
+  int Size1 = Succ1->size();
+  int NLoads = 0;
+  for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end();
+       BBI != BBE;) {
+
+    Instruction *I = BBI;
+    ++BBI;
+
+    // Only move simple (non-atomic, non-volatile) loads used only in Succ0.
+    LoadInst *L0 = dyn_cast<LoadInst>(I);
+    if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0))
+      continue;
+
+    ++NLoads;
+    if (NLoads * Size1 >= MagicCompileTimeControl)
+      break;
+    if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) {
+      bool Res = hoistLoad(BB, L0, L1);
+      MergedLoads |= Res;
+      // Don't attempt to hoist above loads that had not been hoisted.
+      if (!Res)
+        break;
+    }
+  }
+  return MergedLoads;
+}
+
+///
+/// \brief True when instruction is sink barrier for a store
+/// 
+bool MergedLoadStoreMotion::isStoreSinkBarrier(Instruction *Inst) {
+  // FIXME: Conservatively let a load instruction block the store.
+  // Use alias analysis instead.
+  if (isa<LoadInst>(Inst))
+    return true;
+  if (isa<CallInst>(Inst))
+    return true;
+  if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst))
+    return true;
+  // Note: mayHaveSideEffects covers all instructions that could
+  // trigger a change to state. Eg. in-flight stores have to be executed
+  // before ordered loads or fences, calls could invoke functions that store
+  // data to memory etc.
+  if (!isa<StoreInst>(Inst) && Inst->mayHaveSideEffects()) {
+    return true;
+  }
+  DEBUG(dbgs() << "No Sink Barrier\n");
+  return false;
+}
+
+///
+/// \brief Check if \p BB contains a store to the same address as \p SI
+///
+/// \return The store in \p BB when it is safe to sink. Otherwise return null.
+///
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB,
+                                                   StoreInst *SI) {
+  StoreInst *I = 0;
+  DEBUG(dbgs() << "can Sink? : "; SI->dump(); dbgs() << "\n");
+  for (BasicBlock::reverse_iterator RBI = BB->rbegin(), RBE = BB->rend();
+       RBI != RBE; ++RBI) {
+    Instruction *Inst = &*RBI;
+
+    // Stop scanning at the first instruction that blocks sinking a store.
+    if (isStoreSinkBarrier(Inst))
+      break;
+    if (isa<StoreInst>(Inst)) {
+      AliasAnalysis::Location LocSI = AA->getLocation(SI);
+      AliasAnalysis::Location LocInst = AA->getLocation((StoreInst *)Inst);
+      if (AA->isMustAlias(LocSI, LocInst)) {
+        I = (StoreInst *)Inst;
+        break;
+      }
+    }
+  }
+  return I;
+}
+
+///
+/// \brief Create a PHI node in BB for the operands of S0 and S1
+///
+PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
+                                              StoreInst *S1) {
+  // Create a phi if the values mismatch.
+  PHINode *NewPN = 0;
+  Value *Opd1 = S0->getValueOperand();
+  Value *Opd2 = S1->getValueOperand();
+  if (Opd1 != Opd2) {
+    NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
+                            BB->begin());
+    NewPN->addIncoming(Opd1, S0->getParent());
+    NewPN->addIncoming(Opd2, S1->getParent());
+    if (NewPN->getType()->getScalarType()->isPointerTy()) {
+      // Notify AA of the new value.
+      AA->copyValue(Opd1, NewPN);
+      AA->copyValue(Opd2, NewPN);
+      // AA needs to be informed when a PHI-use of the pointer value is added
+      for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) {
+        unsigned J = PHINode::getOperandNumForIncomingValue(I);
+        AA->addEscapingUse(NewPN->getOperandUse(J));
+      }
+      if (MD)
+        MD->invalidateCachedPointerInfo(NewPN);
+    }
+  }
+  return NewPN;
+}
+
+///
+/// \brief Merge two stores to same address and sink into \p BB
+///
+/// Also sinks GEP instruction computing the store address
+///
+bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
+                                      StoreInst *S1) {
+  // Only one definition?
+  Instruction *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+  Instruction *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+  if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+      (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+      (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
+    DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+          dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+          dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+    // Sink the instruction: insert at the first insertion point of BB.
+    BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
+    // Intersect optional metadata.
+    S0->intersectOptionalDataWith(S1);
+    S0->dropUnknownMetadata();
+
+    // Create the new store to be inserted at the join point.
+    StoreInst *SNew = (StoreInst *)(S0->clone());
+    Instruction *ANew = A0->clone();
+    AA->copyValue(S0, SNew);
+    SNew->insertBefore(InsertPt);
+    ANew->insertBefore(SNew);
+
+    assert(S0->getParent() == A0->getParent());
+    assert(S1->getParent() == A1->getParent());
+
+    PHINode *NewPN = getPHIOperand(BB, S0, S1);
+    // New PHI operand? Use it.
+    if (NewPN)
+      SNew->setOperand(0, NewPN);
+    removeInstruction(S0);
+    removeInstruction(S1);
+    A0->replaceAllUsesWith(ANew);
+    removeInstruction(A0);
+    A1->replaceAllUsesWith(ANew);
+    removeInstruction(A1);
+    return true;
+  }
+  return false;
+}
+
+///
+/// \brief Try to merge equivalent stores and sink them into the footer
+///
+/// Starting from a diamond tail block, iterate over the instructions in one
+/// predecessor block and try to match a store in the second predecessor.
+///
+bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) {
+
+  bool MergedStores = false;
+  assert(T && "Footer of a diamond cannot be empty");
+
+  pred_iterator PI = pred_begin(T), E = pred_end(T);
+  assert(PI != E);
+  BasicBlock *Pred0 = *PI;
+  ++PI;
+  BasicBlock *Pred1 = *PI;
+  ++PI;
+  // tail block  of a diamond/hammock?
+  if (Pred0 == Pred1)
+    return false; // No.
+  if (PI != E)
+    return false; // No. More than 2 predecessors.
+
+  // #Instructions in Pred1 for Compile Time Control
+  int Size1 = Pred1->size();
+  int NStores = 0;
+
+  for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
+       RBI != RBE;) {
+
+    Instruction *I = &*RBI;
+    ++RBI;
+    if (isStoreSinkBarrier(I))
+      break;
+    // Only sink simple (non-atomic, non-volatile) stores.
+    if (!isa<StoreInst>(I))
+      continue;
+    StoreInst *S0 = (StoreInst *)I;
+    if (!S0->isSimple())
+      continue;
+
+    ++NStores;
+    if (NStores * Size1 >= MagicCompileTimeControl)
+      break;
+    if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
+      bool Res = sinkStore(T, S0, S1);
+      MergedStores |= Res;
+      // Don't attempt to sink below stores that had to stick around.
+      // But after removal of a store and some of its feeding
+      // instructions, search again from the beginning since the iterator
+      // is likely stale at this point.
+      if (!Res)
+        break;
+      else {
+        RBI = Pred0->rbegin();
+        RBE = Pred0->rend();
+        DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+      }
+    }
+  }
+  return MergedStores;
+}
+///
+/// \brief Run the transformation for each function
+///
+bool MergedLoadStoreMotion::runOnFunction(Function &F) {
+  MD = &getAnalysis<MemoryDependenceAnalysis>();
+  AA = &getAnalysis<AliasAnalysis>();
+
+  bool Changed = false;
+  DEBUG(dbgs() << "Instruction Merger\n");
+
+  // Walk every basic block of the function looking for diamond heads; the
+  // iterator is advanced before the block is processed.
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
+    BasicBlock *BB = FI++;
+
+    // Hoist equivalent loads and sink stores
+    // outside diamonds when possible
+    if (isDiamondHead(BB)) {
+      Changed |= mergeLoads(BB);
+      Changed |= mergeStores(getDiamondTail(BB));
+    }
+  }
+  return Changed;
+}

diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 7cce89e..5c8bed5 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp

@@ -108,6 +108,10 @@
   if (Call->onlyReadsMemory())
     return false;
 
+  // The call must have the expected result type.
+  if (!Call->getType()->isFloatingPointTy())
+    return false;
+
   // Do the following transformation:
   //
   // (before)

diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index ea2cf7c..1bbaaf3 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp

@@ -176,6 +176,7 @@
   private:
     void BuildRankMap(Function &F);
     unsigned getRank(Value *V);
+    void canonicalizeOperands(Instruction *I);
     void ReassociateExpression(BinaryOperator *I);
     void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
     Value *OptimizeExpression(BinaryOperator *I,
@@ -194,6 +195,7 @@
     Value *RemoveFactorFromExpression(Value *V, Value *Factor);
     void EraseInst(Instruction *I);
     void OptimizeInst(Instruction *I);
+    Instruction *canonicalizeNegConstExpr(Instruction *I);
   };
 }
 
@@ -235,7 +237,20 @@
 /// opcode and if it only has one use.
 static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
   if (V->hasOneUse() && isa<Instruction>(V) &&
-      cast<Instruction>(V)->getOpcode() == Opcode)
+      cast<Instruction>(V)->getOpcode() == Opcode &&
+      (!isa<FPMathOperator>(V) ||
+       cast<Instruction>(V)->hasUnsafeAlgebra()))
+    return cast<BinaryOperator>(V);
+  return nullptr;
+}
+
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
+                                        unsigned Opcode2) {
+  if (V->hasOneUse() && isa<Instruction>(V) &&
+      (cast<Instruction>(V)->getOpcode() == Opcode1 ||
+       cast<Instruction>(V)->getOpcode() == Opcode2) &&
+      (!isa<FPMathOperator>(V) ||
+       cast<Instruction>(V)->hasUnsafeAlgebra()))
     return cast<BinaryOperator>(V);
   return nullptr;
 }
@@ -264,9 +279,11 @@
 void Reassociate::BuildRankMap(Function &F) {
   unsigned i = 2;
 
-  // Assign distinct ranks to function arguments
-  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+  // Assign distinct ranks to function arguments.
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
     ValueRankMap[&*I] = ++i;
+    DEBUG(dbgs() << "Calculated Rank[" << I->getName() << "] = " << i << "\n");
+  }
 
   ReversePostOrderTraversal<Function*> RPOT(&F);
   for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
@@ -304,24 +321,78 @@
 
   // If this is a not or neg instruction, do not count it for rank.  This
   // assures us that X and ~X will have the same rank.
-  if (!I->getType()->isIntegerTy() ||
-      (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I)))
+  Type *Ty = V->getType();
+  if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) ||
+      (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
+       !BinaryOperator::isFNeg(I)))
     ++Rank;
 
-  //DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = "
-  //     << Rank << "\n");
+  DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank << "\n");
 
   return ValueRankMap[I] = Rank;
 }
 
+// Put a constant operand on the RHS; otherwise order the two operands by rank
+// so that the lower-ranked one ends up on the LHS.
+void Reassociate::canonicalizeOperands(Instruction *I) {
+  assert(isa<BinaryOperator>(I) && "Expected binary operator.");
+  assert(I->isCommutative() && "Expected commutative operator.");
+
+  Value *Op0 = I->getOperand(0);
+  Value *Op1 = I->getOperand(1);
+  unsigned Rank0 = getRank(Op0);
+  unsigned Rank1 = getRank(Op1);
+
+  // A constant that already sits on the RHS is in canonical position.
+  bool NeedSwap = isa<Constant>(Op0) || Rank1 < Rank0;
+  if (!isa<Constant>(Op1) && NeedSwap)
+    cast<BinaryOperator>(I)->swapOperands();
+}
+
+/// Build S1 + S2, passing InsertBefore as the insertion point: an integer add
+/// for integer operands, otherwise an fadd with FlagsOp's fast-math flags.
+static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
+                                 Instruction *InsertBefore, Value *FlagsOp) {
+  if (S1->getType()->isIntegerTy())
+    return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
+
+  BinaryOperator *Sum = BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
+  Sum->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+  return Sum;
+}
+
+/// Build S1 * S2, passing InsertBefore as the insertion point: an integer mul
+/// for integer operands, otherwise an fmul with FlagsOp's fast-math flags.
+static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
+                                 Instruction *InsertBefore, Value *FlagsOp) {
+  if (S1->getType()->isIntegerTy())
+    return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
+
+  BinaryOperator *Mul = BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
+  Mul->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+  return Mul;
+}
+
+/// Negate S1: an integer neg, or an fneg with FlagsOp's fast-math flags.
+static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
+                                 Instruction *InsertBefore, Value *FlagsOp) {
+  if (S1->getType()->isIntegerTy())
+    return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
+
+  BinaryOperator *Neg = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
+  Neg->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+  return Neg;
+}
+
 /// LowerNegateToMultiply - Replace 0-X with X*-1.
 ///
 static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
-  Constant *Cst = Constant::getAllOnesValue(Neg->getType());
+  Type *Ty = Neg->getType();
+  Constant *NegOne = Ty->isIntegerTy() ? ConstantInt::getAllOnesValue(Ty)
+                                       : ConstantFP::get(Ty, -1.0);
 
-  BinaryOperator *Res =
-    BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg);
-  Neg->setOperand(1, Constant::getNullValue(Neg->getType())); // Drop use of op.
+  BinaryOperator *Res = CreateMul(Neg->getOperand(1), NegOne, "", Neg, Neg);
+  Neg->setOperand(1, Constant::getNullValue(Ty)); // Drop use of op.
   Res->takeName(Neg);
   Neg->replaceAllUsesWith(Res);
   Res->setDebugLoc(Neg->getDebugLoc());
@@ -377,13 +448,14 @@
     LHS = 0; // 1 + 1 === 0 modulo 2.
     return;
   }
-  if (Opcode == Instruction::Add) {
+  if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
     // TODO: Reduce the weight by exploiting nsw/nuw?
     LHS += RHS;
     return;
   }
 
-  assert(Opcode == Instruction::Mul && "Unknown associative operation!");
+  assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
+         "Unknown associative operation!");
   unsigned Bitwidth = LHS.getBitWidth();
   // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
   // can be replaced with W-CM.  That's because x^W=x^(W-CM) for every Bitwidth
@@ -499,8 +571,7 @@
   DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
   unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
   unsigned Opcode = I->getOpcode();
-  assert(Instruction::isAssociative(Opcode) &&
-         Instruction::isCommutative(Opcode) &&
+  assert(I->isAssociative() && I->isCommutative() &&
          "Expected an associative and commutative operation!");
 
   // Visit all operands of the expression, keeping track of their weight (the
@@ -515,7 +586,7 @@
   // ways to get to it.
   SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
   Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
-  bool MadeChange = false;
+  bool Changed = false;
 
   // Leaves of the expression are values that either aren't the right kind of
   // operation (eg: a constant, or a multiply in an add tree), or are, but have
@@ -552,7 +623,7 @@
       // If this is a binary operation of the right kind with only one use then
       // add its operands to the expression.
       if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
-        assert(Visited.insert(Op) && "Not first visit!");
+        assert(Visited.insert(Op).second && "Not first visit!");
         DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
         Worklist.push_back(std::make_pair(BO, Weight));
         continue;
@@ -562,7 +633,7 @@
       LeafMap::iterator It = Leaves.find(Op);
       if (It == Leaves.end()) {
         // Not in the leaf map.  Must be the first time we saw this operand.
-        assert(Visited.insert(Op) && "Not first visit!");
+        assert(Visited.insert(Op).second && "Not first visit!");
         if (!Op->hasOneUse()) {
           // This value has uses not accounted for by the expression, so it is
           // not safe to modify.  Mark it as being a leaf.
@@ -584,7 +655,7 @@
         // exactly one such use, drop this new use of the leaf.
         assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
         I->setOperand(OpIdx, UndefValue::get(I->getType()));
-        MadeChange = true;
+        Changed = true;
 
         // If the leaf is a binary operation of the right kind and we now see
         // that its multiple original uses were in fact all by nodes belonging
@@ -613,21 +684,24 @@
       // expression.  This means that it can safely be modified.  See if we
       // can usefully morph it into an expression of the right kind.
       assert((!isa<Instruction>(Op) ||
-              cast<Instruction>(Op)->getOpcode() != Opcode) &&
+              cast<Instruction>(Op)->getOpcode() != Opcode
+              || (isa<FPMathOperator>(Op) &&
+                  !cast<Instruction>(Op)->hasUnsafeAlgebra())) &&
              "Should have been handled above!");
       assert(Op->hasOneUse() && "Has uses outside the expression tree!");
 
       // If this is a multiply expression, turn any internal negations into
       // multiplies by -1 so they can be reassociated.
-      BinaryOperator *BO = dyn_cast<BinaryOperator>(Op);
-      if (Opcode == Instruction::Mul && BO && BinaryOperator::isNeg(BO)) {
-        DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
-        BO = LowerNegateToMultiply(BO);
-        DEBUG(dbgs() << *BO << 'n');
-        Worklist.push_back(std::make_pair(BO, Weight));
-        MadeChange = true;
-        continue;
-      }
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
+        if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
+            (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
+          DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+          BO = LowerNegateToMultiply(BO);
+          DEBUG(dbgs() << *BO << '\n');
+          Worklist.push_back(std::make_pair(BO, Weight));
+          Changed = true;
+          continue;
+        }
 
       // Failed to morph into an expression of the right type.  This really is
       // a leaf.
@@ -665,7 +739,7 @@
     Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1)));
   }
 
-  return MadeChange;
+  return Changed;
 }
 
 // RewriteExprTree - Now that the operands for this expression tree are
@@ -798,6 +872,8 @@
       Constant *Undef = UndefValue::get(I->getType());
       NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
                                      Undef, Undef, "", I);
+      if (NewOp->getType()->isFloatingPointTy())
+        NewOp->setFastMathFlags(I->getFastMathFlags());
     } else {
       NewOp = NodesToRewrite.pop_back_val();
     }
@@ -817,7 +893,14 @@
   // expression tree is dominated by all of Ops.
   if (ExpressionChanged)
     do {
-      ExpressionChanged->clearSubclassOptionalData();
+      // Preserve FastMathFlags.
+      if (isa<FPMathOperator>(I)) {
+        FastMathFlags Flags = I->getFastMathFlags();
+        ExpressionChanged->clearSubclassOptionalData();
+        ExpressionChanged->setFastMathFlags(Flags);
+      } else
+        ExpressionChanged->clearSubclassOptionalData();
+
       if (ExpressionChanged == I)
         break;
       ExpressionChanged->moveBefore(I);
@@ -834,6 +917,8 @@
 /// version of the value is returned, and BI is left pointing at the instruction
 /// that should be processed next by the reassociation pass.
 static Value *NegateValue(Value *V, Instruction *BI) {
+  if (ConstantFP *C = dyn_cast<ConstantFP>(V))
+    return ConstantExpr::getFNeg(C);
   if (Constant *C = dyn_cast<Constant>(V))
     return ConstantExpr::getNeg(C);
 
@@ -846,7 +931,8 @@
   // the constants.  We assume that instcombine will clean up the mess later if
   // we introduce tons of unnecessary negation instructions.
   //
-  if (BinaryOperator *I = isReassociableOp(V, Instruction::Add)) {
+  if (BinaryOperator *I =
+          isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
     // Push the negates through the add.
     I->setOperand(0, NegateValue(I->getOperand(0), BI));
     I->setOperand(1, NegateValue(I->getOperand(1), BI));
@@ -864,7 +950,8 @@
   // Okay, we need to materialize a negated version of V with an instruction.
   // Scan the use lists of V to see if we have one already.
   for (User *U : V->users()) {
-    if (!BinaryOperator::isNeg(U)) continue;
+    if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
+      continue;
 
     // We found one!  Now we have to make sure that the definition dominates
     // this use.  We do this by moving it to the entry block (if it is a
@@ -894,27 +981,34 @@
 
   // Insert a 'neg' instruction that subtracts the value from zero to get the
   // negation.
-  return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI);
+  return CreateNeg(V, V->getName() + ".neg", BI, BI);
 }
 
 /// ShouldBreakUpSubtract - Return true if we should break up this subtract of
 /// X-Y into (X + -Y).
 static bool ShouldBreakUpSubtract(Instruction *Sub) {
   // If this is a negation, we can't split it up!
-  if (BinaryOperator::isNeg(Sub))
+  if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
+    return false;
+
+  // Don't break up X - undef.
+  if (isa<UndefValue>(Sub->getOperand(1)))
     return false;
 
   // Don't bother to break this up unless either the LHS is an associable add or
   // subtract or if this is only used by one.
-  if (isReassociableOp(Sub->getOperand(0), Instruction::Add) ||
-      isReassociableOp(Sub->getOperand(0), Instruction::Sub))
+  Value *V0 = Sub->getOperand(0);
+  if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
+      isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
     return true;
-  if (isReassociableOp(Sub->getOperand(1), Instruction::Add) ||
-      isReassociableOp(Sub->getOperand(1), Instruction::Sub))
+  Value *V1 = Sub->getOperand(1);
+  if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
+      isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
     return true;
+  Value *VB = Sub->hasOneUse() ? Sub->user_back() : nullptr;
   if (Sub->hasOneUse() &&
-      (isReassociableOp(Sub->user_back(), Instruction::Add) ||
-       isReassociableOp(Sub->user_back(), Instruction::Sub)))
+      (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
+       isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
     return true;
 
   return false;
@@ -931,8 +1025,7 @@
   // and set it as the RHS of the add instruction we just made.
   //
   Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
-  BinaryOperator *New =
-    BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub);
+  BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
   Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
   Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
   New->takeName(Sub);
@@ -956,8 +1049,19 @@
     BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
   Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
   Mul->takeName(Shl);
+
+  // Everyone now refers to the mul instruction.
   Shl->replaceAllUsesWith(Mul);
   Mul->setDebugLoc(Shl->getDebugLoc());
+
+  // We can safely preserve the nuw flag in all cases.  It's also safe to turn a
+  // nuw nsw shl into a nuw nsw mul.  However, nsw in isolation requires special
+  // handling.
+  bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
+  bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
+  if (NSW && NUW)
+    Mul->setHasNoSignedWrap(true);
+  Mul->setHasNoUnsignedWrap(NUW);
   return Mul;
 }
 
@@ -969,13 +1073,23 @@
                                   Value *X) {
   unsigned XRank = Ops[i].Rank;
   unsigned e = Ops.size();
-  for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j)
+  for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
     if (Ops[j].Op == X)
       return j;
+    if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+      if (Instruction *I2 = dyn_cast<Instruction>(X))
+        if (I1->isIdenticalTo(I2))
+          return j;
+  }
   // Scan backwards.
-  for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j)
+  for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
     if (Ops[j].Op == X)
       return j;
+    if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+      if (Instruction *I2 = dyn_cast<Instruction>(X))
+        if (I1->isIdenticalTo(I2))
+          return j;
+  }
   return i;
 }
 
@@ -988,15 +1102,16 @@
   Value *V1 = Ops.back();
   Ops.pop_back();
   Value *V2 = EmitAddTreeOfValues(I, Ops);
-  return BinaryOperator::CreateAdd(V2, V1, "tmp", I);
+  return CreateAdd(V2, V1, "tmp", I, I);
 }
 
 /// RemoveFactorFromExpression - If V is an expression tree that is a
 /// multiplication sequence, and if this sequence contains a multiply by Factor,
 /// remove Factor from the tree and return the new tree.
 Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
-  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
-  if (!BO) return nullptr;
+  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+  if (!BO)
+    return nullptr;
 
   SmallVector<RepeatedValue, 8> Tree;
   MadeChange |= LinearizeExprTree(BO, Tree);
@@ -1018,13 +1133,25 @@
     }
 
     // If this is a negative version of this factor, remove it.
-    if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor))
+    if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
       if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
         if (FC1->getValue() == -FC2->getValue()) {
           FoundFactor = NeedsNegate = true;
           Factors.erase(Factors.begin()+i);
           break;
         }
+    } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
+      if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
+        APFloat F1(FC1->getValueAPF());
+        APFloat F2(FC2->getValueAPF());
+        F2.changeSign();
+        if (F1.compare(F2) == APFloat::cmpEqual) {
+          FoundFactor = NeedsNegate = true;
+          Factors.erase(Factors.begin() + i);
+          break;
+        }
+      }
+    }
   }
 
   if (!FoundFactor) {
@@ -1046,7 +1173,7 @@
   }
 
   if (NeedsNegate)
-    V = BinaryOperator::CreateNeg(V, "neg", InsertPt);
+    V = CreateNeg(V, "neg", InsertPt, BO);
 
   return V;
 }
@@ -1058,7 +1185,7 @@
 static void FindSingleUseMultiplyFactors(Value *V,
                                          SmallVectorImpl<Value*> &Factors,
                                        const SmallVectorImpl<ValueEntry> &Ops) {
-  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
+  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
   if (!BO) {
     Factors.push_back(V);
     return;
@@ -1389,13 +1516,15 @@
       ++NumFactor;
 
       // Insert a new multiply.
-      Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound);
-      Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I);
+      Type *Ty = TheOp->getType();
+      Constant *C = Ty->isIntegerTy() ? ConstantInt::get(Ty, NumFound)
+                                      : ConstantFP::get(Ty, NumFound);
+      Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
 
       // Now that we have inserted a multiply, optimize it. This allows us to
       // handle cases that require multiple factoring steps, such as this:
       // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
-      RedoInsts.insert(cast<Instruction>(Mul));
+      RedoInsts.insert(Mul);
 
       // If every add operand was a duplicate, return the multiply.
       if (Ops.empty())
@@ -1412,11 +1541,12 @@
     }
 
     // Check for X and -X or X and ~X in the operand list.
-    if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isNot(TheOp))
+    if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
+        !BinaryOperator::isNot(TheOp))
       continue;
 
     Value *X = nullptr;
-    if (BinaryOperator::isNeg(TheOp))
+    if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
       X = BinaryOperator::getNegArgument(TheOp);
     else if (BinaryOperator::isNot(TheOp))
       X = BinaryOperator::getNotArgument(TheOp);
@@ -1426,7 +1556,8 @@
       continue;
 
     // Remove X and -X from the operand list.
-    if (Ops.size() == 2 && BinaryOperator::isNeg(TheOp))
+    if (Ops.size() == 2 &&
+        (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
       return Constant::getNullValue(X->getType());
 
     // Remove X and ~X from the operand list.
@@ -1463,7 +1594,8 @@
   unsigned MaxOcc = 0;
   Value *MaxOccVal = nullptr;
   for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
-    BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul);
+    BinaryOperator *BOp =
+        isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
     if (!BOp)
       continue;
 
@@ -1476,23 +1608,43 @@
     SmallPtrSet<Value*, 8> Duplicates;
     for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
       Value *Factor = Factors[i];
-      if (!Duplicates.insert(Factor)) continue;
+      if (!Duplicates.insert(Factor).second)
+        continue;
 
       unsigned Occ = ++FactorOccurrences[Factor];
-      if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
+      if (Occ > MaxOcc) {
+        MaxOcc = Occ;
+        MaxOccVal = Factor;
+      }
 
       // If Factor is a negative constant, add the negated value as a factor
       // because we can percolate the negate out.  Watch for minint, which
       // cannot be positivified.
-      if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor))
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
         if (CI->isNegative() && !CI->isMinValue(true)) {
           Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
           assert(!Duplicates.count(Factor) &&
                  "Shouldn't have two constant factors, missed a canonicalize");
-
           unsigned Occ = ++FactorOccurrences[Factor];
-          if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
+          if (Occ > MaxOcc) {
+            MaxOcc = Occ;
+            MaxOccVal = Factor;
+          }
         }
+      } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
+        if (CF->isNegative()) {
+          APFloat F(CF->getValueAPF());
+          F.changeSign();
+          Factor = ConstantFP::get(CF->getContext(), F);
+          assert(!Duplicates.count(Factor) &&
+                 "Shouldn't have two constant factors, missed a canonicalize");
+          unsigned Occ = ++FactorOccurrences[Factor];
+          if (Occ > MaxOcc) {
+            MaxOcc = Occ;
+            MaxOccVal = Factor;
+          }
+        }
+      }
     }
   }
 
@@ -1505,11 +1657,16 @@
     // this, we could otherwise run into situations where removing a factor
     // from an expression will drop a use of maxocc, and this can cause
     // RemoveFactorFromExpression on successive values to behave differently.
-    Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal);
+    Instruction *DummyInst =
+        I->getType()->isIntegerTy()
+            ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
+            : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
+
     SmallVector<WeakVH, 4> NewMulOps;
     for (unsigned i = 0; i != Ops.size(); ++i) {
       // Only try to remove factors from expressions we're allowed to.
-      BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul);
+      BinaryOperator *BOp =
+          isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
       if (!BOp)
         continue;
 
@@ -1542,7 +1699,7 @@
       RedoInsts.insert(VI);
 
     // Create the multiply.
-    Instruction *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I);
+    Instruction *V2 = CreateMul(V, MaxOccVal, "tmp", I, I);
 
     // Rerun associate on the multiply in case the inner expression turned into
     // a multiply.  We want to make sure that we keep things in canonical form.
@@ -1632,7 +1789,10 @@
 
   Value *LHS = Ops.pop_back_val();
   do {
-    LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+    if (LHS->getType()->isIntegerTy())
+      LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+    else
+      LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
   } while (!Ops.empty());
 
   return LHS;
@@ -1765,11 +1925,13 @@
     break;
 
   case Instruction::Add:
+  case Instruction::FAdd:
     if (Value *Result = OptimizeAdd(I, Ops))
       return Result;
     break;
 
   case Instruction::Mul:
+  case Instruction::FMul:
     if (Value *Result = OptimizeMul(I, Ops))
       return Result;
     break;
@@ -1797,12 +1959,104 @@
       // and add that since that's where optimization actually happens.
       unsigned Opcode = Op->getOpcode();
       while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
-             Visited.insert(Op))
+             Visited.insert(Op).second)
         Op = Op->user_back();
       RedoInsts.insert(Op);
     }
 }
 
+// Canonicalize expressions of the following form:
+//  x + (-Constant * y) -> x - (Constant * y)
+//  x - (-Constant * y) -> x + (Constant * y)
+Instruction *Reassociate::canonicalizeNegConstExpr(Instruction *I) {
+  if (!I->hasOneUse() || I->getType()->isVectorTy())
+    return nullptr;
+
+  // Must be a mul, fmul, or fdiv instruction.
+  unsigned Opcode = I->getOpcode();
+  if (Opcode != Instruction::Mul && Opcode != Instruction::FMul &&
+      Opcode != Instruction::FDiv)
+    return nullptr;
+
+  // Must have at least one constant operand.
+  Constant *C0 = dyn_cast<Constant>(I->getOperand(0));
+  Constant *C1 = dyn_cast<Constant>(I->getOperand(1));
+  if (!C0 && !C1)
+    return nullptr;
+
+  // Must be a negative ConstantFP, or a negative ConstantInt other than minint
+  Constant *C = C0 ? C0 : C1;
+  unsigned ConstIdx = C0 ? 0 : 1;
+  if (auto *CI = dyn_cast<ConstantInt>(C)) {
+    if (!CI->isNegative() || CI->isMinValue(/*isSigned=*/true))
+      return nullptr;
+  } else if (auto *CF = dyn_cast<ConstantFP>(C)) {
+    if (!CF->isNegative())
+      return nullptr;
+  } else
+    return nullptr;
+
+  // User must be a binary operator with one or more uses.
+  Instruction *User = I->user_back();
+  if (!isa<BinaryOperator>(User) || !User->getNumUses())
+    return nullptr;
+
+  unsigned UserOpcode = User->getOpcode();
+  if (UserOpcode != Instruction::Add && UserOpcode != Instruction::FAdd &&
+      UserOpcode != Instruction::Sub && UserOpcode != Instruction::FSub)
+    return nullptr;
+
+  // Subtraction is not commutative. Explicitly, the following transform is
+  // not valid: (-Constant * y) - x  -> x + (Constant * y)
+  if (!User->isCommutative() && User->getOperand(1) != I)
+    return nullptr;
+
+  // Change the sign of the constant.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(C))
+    I->setOperand(ConstIdx, ConstantInt::get(CI->getContext(), -CI->getValue()));
+  else {
+    ConstantFP *CF = cast<ConstantFP>(C);
+    APFloat Val = CF->getValueAPF();
+    Val.changeSign();
+    I->setOperand(ConstIdx, ConstantFP::get(CF->getContext(), Val));
+  }
+
+  // Canonicalize I to RHS to simplify the next bit of logic. E.g.,
+  // ((-Const*y) + x) -> (x + (-Const*y)).
+  if (User->getOperand(0) == I && User->isCommutative())
+    cast<BinaryOperator>(User)->swapOperands();
+
+  Value *Op0 = User->getOperand(0);
+  Value *Op1 = User->getOperand(1);
+  BinaryOperator *NI;
+  switch (UserOpcode) {
+  default:
+    llvm_unreachable("Unexpected Opcode!");
+  case Instruction::Add:
+    NI = BinaryOperator::CreateSub(Op0, Op1);
+    break;
+  case Instruction::Sub:
+    NI = BinaryOperator::CreateAdd(Op0, Op1);
+    break;
+  case Instruction::FAdd:
+    NI = BinaryOperator::CreateFSub(Op0, Op1);
+    NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+    break;
+  case Instruction::FSub:
+    NI = BinaryOperator::CreateFAdd(Op0, Op1);
+    NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags());
+    break;
+  }
+
+  NI->insertBefore(User);
+  NI->setName(User->getName());
+  User->replaceAllUsesWith(NI);
+  NI->setDebugLoc(I->getDebugLoc());
+  RedoInsts.insert(I);
+  MadeChange = true;
+  return NI;
+}
+
 /// OptimizeInst - Inspect and optimize the given instruction. Note that erasing
 /// instructions is not allowed.
 void Reassociate::OptimizeInst(Instruction *I) {
@@ -1810,8 +2064,7 @@
   if (!isa<BinaryOperator>(I))
     return;
 
-  if (I->getOpcode() == Instruction::Shl &&
-      isa<ConstantInt>(I->getOperand(1)))
+  if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
     // If an operand of this shift is a reassociable multiply, or if the shift
     // is used by a reassociable multiply or add, turn into a multiply.
     if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
@@ -1824,29 +2077,23 @@
       I = NI;
     }
 
-  // Floating point binary operators are not associative, but we can still
-  // commute (some) of them, to canonicalize the order of their operands.
-  // This can potentially expose more CSE opportunities, and makes writing
-  // other transformations simpler.
-  if ((I->getType()->isFloatingPointTy() || I->getType()->isVectorTy())) {
-    // FAdd and FMul can be commuted.
-    if (I->getOpcode() != Instruction::FMul &&
-        I->getOpcode() != Instruction::FAdd)
-      return;
+  // Canonicalize negative constants out of expressions.
+  if (Instruction *Res = canonicalizeNegConstExpr(I))
+    I = Res;
 
-    Value *LHS = I->getOperand(0);
-    Value *RHS = I->getOperand(1);
-    unsigned LHSRank = getRank(LHS);
-    unsigned RHSRank = getRank(RHS);
+  // Commute binary operators, to canonicalize the order of their operands.
+  // This can potentially expose more CSE opportunities, and makes writing other
+  // transformations simpler.
+  if (I->isCommutative())
+    canonicalizeOperands(I);
 
-    // Sort the operands by rank.
-    if (RHSRank < LHSRank) {
-      I->setOperand(0, RHS);
-      I->setOperand(1, LHS);
-    }
-
+  // Don't optimize vector instructions.
+  if (I->getType()->isVectorTy())
     return;
-  }
+
+  // Don't optimize floating point instructions that don't have unsafe algebra.
+  if (I->getType()->isFloatingPointTy() && !I->hasUnsafeAlgebra())
+    return;
 
   // Do not reassociate boolean (i1) expressions.  We want to preserve the
   // original order of evaluation for short-circuited comparisons that
@@ -1877,6 +2124,24 @@
         I = NI;
       }
     }
+  } else if (I->getOpcode() == Instruction::FSub) {
+    if (ShouldBreakUpSubtract(I)) {
+      Instruction *NI = BreakUpSubtract(I);
+      RedoInsts.insert(I);
+      MadeChange = true;
+      I = NI;
+    } else if (BinaryOperator::isFNeg(I)) {
+      // Otherwise, this is a negation.  See if the operand is a multiply tree
+      // and if this is not an inner node of a multiply tree.
+      if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
+          (!I->hasOneUse() ||
+           !isReassociableOp(I->user_back(), Instruction::FMul))) {
+        Instruction *NI = LowerNegateToMultiply(I);
+        RedoInsts.insert(I);
+        MadeChange = true;
+        I = NI;
+      }
+    }
   }
 
   // If this instruction is an associative binary operator, process it.
@@ -1894,11 +2159,16 @@
   if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
       cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
     return;
+  if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
+      cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
+    return;
 
   ReassociateExpression(BO);
 }
 
 void Reassociate::ReassociateExpression(BinaryOperator *I) {
+  assert(!I->getType()->isVectorTy() &&
+         "Reassociation of vector instructions is not supported.");
 
   // First, walk the expression tree, linearizing the tree, collecting the
   // operand information.
@@ -1943,12 +2213,21 @@
   // this is a multiply tree used only by an add, and the immediate is a -1.
   // In this case we reassociate to put the negation on the outside so that we
   // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
-  if (I->getOpcode() == Instruction::Mul && I->hasOneUse() &&
-      cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
-      isa<ConstantInt>(Ops.back().Op) &&
-      cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
-    ValueEntry Tmp = Ops.pop_back_val();
-    Ops.insert(Ops.begin(), Tmp);
+  if (I->hasOneUse()) {
+    if (I->getOpcode() == Instruction::Mul &&
+        cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
+        isa<ConstantInt>(Ops.back().Op) &&
+        cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+      ValueEntry Tmp = Ops.pop_back_val();
+      Ops.insert(Ops.begin(), Tmp);
+    } else if (I->getOpcode() == Instruction::FMul &&
+               cast<Instruction>(I->user_back())->getOpcode() ==
+                   Instruction::FAdd &&
+               isa<ConstantFP>(Ops.back().Op) &&
+               cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
+      ValueEntry Tmp = Ops.pop_back_val();
+      Ops.insert(Ops.begin(), Tmp);
+    }
   }
 
   DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');

diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 90c3520..cfc9a8e 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp

@@ -214,7 +214,8 @@
   ///
   /// This returns true if the block was not considered live before.
   bool MarkBlockExecutable(BasicBlock *BB) {
-    if (!BBExecutable.insert(BB)) return false;
+    if (!BBExecutable.insert(BB).second)
+      return false;
     DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
     BBWorkList.push_back(BB);  // Add the block to the work list!
     return true;
@@ -1010,7 +1011,7 @@
   }
 
   Constant *Ptr = Operands[0];
-  ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end());
+  auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
   markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices));
 }
 
@@ -1107,6 +1108,9 @@
         Operands.push_back(State.getConstant());
       }
 
+      if (getValueState(I).isOverdefined())
+        return;
+
       // If we can constant fold this, mark the result of the call as a
       // constant.
       if (Constant *C = ConstantFoldCall(F, Operands, TLI))

diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 8c7f253..6135114 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp

@@ -28,6 +28,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/PtrUseVisitor.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -224,36 +225,26 @@
   /// \brief Support for iterating over the slices.
   /// @{
   typedef SmallVectorImpl<Slice>::iterator iterator;
+  typedef iterator_range<iterator> range;
   iterator begin() { return Slices.begin(); }
   iterator end() { return Slices.end(); }
 
   typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
+  typedef iterator_range<const_iterator> const_range;
   const_iterator begin() const { return Slices.begin(); }
   const_iterator end() const { return Slices.end(); }
   /// @}
 
-  /// \brief Allow iterating the dead users for this alloca.
-  ///
-  /// These are instructions which will never actually use the alloca as they
-  /// are outside the allocated range. They are safe to replace with undef and
-  /// delete.
-  /// @{
-  typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
-  dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
-  dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
-  /// @}
+  /// \brief Access the dead users for this alloca.
+  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
 
-  /// \brief Allow iterating the dead expressions referring to this alloca.
+  /// \brief Access the dead operands referring to this alloca.
   ///
   /// These are operands which have cannot actually be used to refer to the
   /// alloca as they are outside its range and the user doesn't correct for
   /// that. These mostly consist of PHI node inputs and the like which we just
   /// need to replace with undef.
-  /// @{
-  typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
-  dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
-  dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
-  /// @}
+  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
@@ -324,6 +315,15 @@
   return nullptr;
 }
 
+/// \brief A helper that folds a PHI node or a select.
+static Value *foldPHINodeOrSelectInst(Instruction &I) {
+  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+    // If PN merges together the same value, return that value.
+    return PN->hasConstantValue();
+  }
+  return foldSelectInst(cast<SelectInst>(I));
+}
+
 /// \brief Builder for the alloca slices.
 ///
 /// This class builds a set of alloca slices by recursively visiting the uses
@@ -334,7 +334,7 @@
   typedef PtrUseVisitor<SliceBuilder> Base;
 
   const uint64_t AllocSize;
-  AllocaSlices &S;
+  AllocaSlices &AS;
 
   SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
   SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
@@ -343,14 +343,14 @@
   SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
 
 public:
-  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &S)
+  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
       : PtrUseVisitor<SliceBuilder>(DL),
-        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), S(S) {}
+        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}
 
 private:
   void markAsDead(Instruction &I) {
-    if (VisitedDeadInsts.insert(&I))
-      S.DeadUsers.push_back(&I);
+    if (VisitedDeadInsts.insert(&I).second)
+      AS.DeadUsers.push_back(&I);
   }
 
   void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
@@ -361,7 +361,7 @@
       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
                    << " which has zero size or starts outside of the "
                    << AllocSize << " byte alloca:\n"
-                   << "    alloca: " << S.AI << "\n"
+                   << "    alloca: " << AS.AI << "\n"
                    << "       use: " << I << "\n");
       return markAsDead(I);
     }
@@ -379,12 +379,12 @@
     if (Size > AllocSize - BeginOffset) {
       DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
                    << " to remain within the " << AllocSize << " byte alloca:\n"
-                   << "    alloca: " << S.AI << "\n"
+                   << "    alloca: " << AS.AI << "\n"
                    << "       use: " << I << "\n");
       EndOffset = AllocSize;
     }
 
-    S.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
+    AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
   }
 
   void visitBitCastInst(BitCastInst &BC) {
@@ -485,7 +485,7 @@
       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
                    << " which extends past the end of the " << AllocSize
                    << " byte alloca:\n"
-                   << "    alloca: " << S.AI << "\n"
+                   << "    alloca: " << AS.AI << "\n"
                    << "       use: " << SI << "\n");
       return markAsDead(SI);
     }
@@ -535,7 +535,7 @@
     if (Offset.uge(AllocSize)) {
       SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II);
       if (MTPI != MemTransferSliceMap.end())
-        S.Slices[MTPI->second].kill();
+        AS.Slices[MTPI->second].kill();
       return markAsDead(II);
     }
 
@@ -558,10 +558,10 @@
     bool Inserted;
     SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
     std::tie(MTPI, Inserted) =
-        MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size()));
+        MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
     unsigned PrevIdx = MTPI->second;
     if (!Inserted) {
-      Slice &PrevP = S.Slices[PrevIdx];
+      Slice &PrevP = AS.Slices[PrevIdx];
 
       // Check if the begin offsets match and this is a non-volatile transfer.
       // In that case, we can completely elide the transfer.
@@ -579,7 +579,7 @@
     insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
 
     // Check that we ended up with a valid index in the map.
-    assert(S.Slices[PrevIdx].getUse()->getUser() == &II &&
+    assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
            "Map index doesn't point back to a slice with this user.");
   }
 
@@ -639,24 +639,47 @@
       }
 
       for (User *U : I->users())
-        if (Visited.insert(cast<Instruction>(U)))
+        if (Visited.insert(cast<Instruction>(U)).second)
           Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
     } while (!Uses.empty());
 
     return nullptr;
   }
 
-  void visitPHINode(PHINode &PN) {
-    if (PN.use_empty())
-      return markAsDead(PN);
+  void visitPHINodeOrSelectInst(Instruction &I) {
+    assert(isa<PHINode>(I) || isa<SelectInst>(I));
+    if (I.use_empty())
+      return markAsDead(I);
+
+    // TODO: We could use SimplifyInstruction here to fold PHINodes and
+    // SelectInsts. However, doing so requires to change the current
+    // dead-operand-tracking mechanism. For instance, suppose neither loading
+    // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
+    // trap either.  However, if we simply replace %U with undef using the
+    // current dead-operand-tracking mechanism, "load (select undef, undef,
+    // %other)" may trap because the select may return the first operand
+    // "undef".
+    if (Value *Result = foldPHINodeOrSelectInst(I)) {
+      if (Result == *U)
+        // If the result of the constant fold will be the pointer, recurse
+        // through the PHI/select as if we had RAUW'ed it.
+        enqueueUsers(I);
+      else
+        // Otherwise the operand to the PHI/select is dead, and we can replace
+        // it with undef.
+        AS.DeadOperands.push_back(U);
+
+      return;
+    }
+
     if (!IsOffsetKnown)
-      return PI.setAborted(&PN);
+      return PI.setAborted(&I);
 
     // See if we already have computed info on this node.
-    uint64_t &PHISize = PHIOrSelectSizes[&PN];
-    if (!PHISize) {
-      // This is a new PHI node, check for an unsafe use of the PHI node.
-      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHISize))
+    uint64_t &Size = PHIOrSelectSizes[&I];
+    if (!Size) {
+      // This is a new PHI/Select, check for an unsafe use of it.
+      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
         return PI.setAborted(UnsafeI);
     }
 
@@ -667,51 +690,19 @@
     // FIXME: This should instead be escaped in the event we're instrumenting
     // for address sanitization.
     if (Offset.uge(AllocSize)) {
-      S.DeadOperands.push_back(U);
+      AS.DeadOperands.push_back(U);
       return;
     }
 
-    insertUse(PN, Offset, PHISize);
+    insertUse(I, Offset, Size);
+  }
+
+  void visitPHINode(PHINode &PN) {
+    visitPHINodeOrSelectInst(PN);
   }
 
   void visitSelectInst(SelectInst &SI) {
-    if (SI.use_empty())
-      return markAsDead(SI);
-    if (Value *Result = foldSelectInst(SI)) {
-      if (Result == *U)
-        // If the result of the constant fold will be the pointer, recurse
-        // through the select as if we had RAUW'ed it.
-        enqueueUsers(SI);
-      else
-        // Otherwise the operand to the select is dead, and we can replace it
-        // with undef.
-        S.DeadOperands.push_back(U);
-
-      return;
-    }
-    if (!IsOffsetKnown)
-      return PI.setAborted(&SI);
-
-    // See if we already have computed info on this node.
-    uint64_t &SelectSize = PHIOrSelectSizes[&SI];
-    if (!SelectSize) {
-      // This is a new Select, check for an unsafe use of it.
-      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectSize))
-        return PI.setAborted(UnsafeI);
-    }
-
-    // For PHI and select operands outside the alloca, we can't nuke the entire
-    // phi or select -- the other side might still be relevant, so we special
-    // case them here and use a separate structure to track the operands
-    // themselves which should be replaced with undef.
-    // FIXME: This should instead be escaped in the event we're instrumenting
-    // for address sanitization.
-    if (Offset.uge(AllocSize)) {
-      S.DeadOperands.push_back(U);
-      return;
-    }
-
-    insertUse(SI, Offset, SelectSize);
+    visitPHINodeOrSelectInst(SI);
   }
 
   /// \brief Disable SROA entirely if there are unhandled users of the alloca.
@@ -857,23 +848,18 @@
       else
         return false;
 
-    } while (Visited.insert(Ptr));
+    } while (Visited.insert(Ptr).second);
 
     return false;
   }
 
   void updateDebugInfo(Instruction *Inst) const override {
-    for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(),
-           E = DDIs.end(); I != E; ++I) {
-      DbgDeclareInst *DDI = *I;
+    for (DbgDeclareInst *DDI : DDIs)
       if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
         ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
       else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
         ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
-    }
-    for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(),
-           E = DVIs.end(); I != E; ++I) {
-      DbgValueInst *DVI = *I;
+    for (DbgValueInst *DVI : DVIs) {
       Value *Arg = nullptr;
       if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
         // If an argument is zero extended then use argument directly. The ZExt
@@ -890,8 +876,8 @@
         continue;
       }
       Instruction *DbgVal =
-        DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
-                                     Inst);
+          DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
+                                      DIExpression(DVI->getExpression()), Inst);
       DbgVal->setDebugLoc(DVI->getDebugLoc());
     }
   }
@@ -924,6 +910,7 @@
   LLVMContext *C;
   const DataLayout *DL;
   DominatorTree *DT;
+  AssumptionTracker *AT;
 
   /// \brief Worklist of alloca instructions to simplify.
   ///
@@ -983,14 +970,14 @@
   friend class PHIOrSelectSpeculator;
   friend class AllocaSliceRewriter;
 
-  bool rewritePartition(AllocaInst &AI, AllocaSlices &S,
+  bool rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                         AllocaSlices::iterator B, AllocaSlices::iterator E,
                         int64_t BeginOffset, int64_t EndOffset,
                         ArrayRef<AllocaSlices::iterator> SplitUses);
-  bool splitAlloca(AllocaInst &AI, AllocaSlices &S);
+  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
   bool runOnAlloca(AllocaInst &AI);
   void clobberUse(Use &U);
-  void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
+  void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
   bool promoteAllocas(Function &F);
 };
 }
@@ -1003,6 +990,7 @@
 
 INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
                       false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
                     false, false)
@@ -1148,10 +1136,12 @@
   PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
                                         PN.getName() + ".sroa.speculated");
 
-  // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
+  // Get the AA tags and alignment to use from one of the loads.  It doesn't
   // matter which one we get and if any differ.
   LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
-  MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+
+  AAMDNodes AATags;
+  SomeLoad->getAAMetadata(AATags);
   unsigned Align = SomeLoad->getAlignment();
 
   // Rewrite all loads of the PN to use the new PHI.
@@ -1172,8 +1162,8 @@
         InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
     ++NumLoadsSpeculated;
     Load->setAlignment(Align);
-    if (TBAATag)
-      Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+    if (AATags)
+      Load->setAAMetadata(AATags);
     NewPN->addIncoming(Load, Pred);
   }
 
@@ -1238,12 +1228,15 @@
         IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
     NumLoadsSpeculated += 2;
 
-    // Transfer alignment and TBAA info if present.
+    // Transfer alignment and AA info if present.
     TL->setAlignment(LI->getAlignment());
     FL->setAlignment(LI->getAlignment());
-    if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
-      TL->setMetadata(LLVMContext::MD_tbaa, Tag);
-      FL->setMetadata(LLVMContext::MD_tbaa, Tag);
+
+    AAMDNodes Tags;
+    LI->getAAMetadata(Tags);
+    if (Tags) {
+      TL->setAAMetadata(Tags);
+      FL->setAAMetadata(Tags);
     }
 
     Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
@@ -1468,7 +1461,7 @@
         break;
       Offset += GEPOffset;
       Ptr = GEP->getPointerOperand();
-      if (!Visited.insert(Ptr))
+      if (!Visited.insert(Ptr).second)
         break;
     }
 
@@ -1505,7 +1498,7 @@
       break;
     }
     assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
-  } while (Visited.insert(Ptr));
+  } while (Visited.insert(Ptr).second);
 
   if (!OffsetPtr) {
     if (!Int8Ptr) {
@@ -1621,39 +1614,43 @@
 ///
 /// This function is called to test each entry in a partioning which is slated
 /// for a single slice.
-static bool isVectorPromotionViableForSlice(
-    const DataLayout &DL, AllocaSlices &S, uint64_t SliceBeginOffset,
-    uint64_t SliceEndOffset, VectorType *Ty, uint64_t ElementSize,
-    AllocaSlices::const_iterator I) {
+static bool
+isVectorPromotionViableForSlice(const DataLayout &DL, uint64_t SliceBeginOffset,
+                                uint64_t SliceEndOffset, VectorType *Ty,
+                                uint64_t ElementSize, const Slice &S) {
   // First validate the slice offsets.
   uint64_t BeginOffset =
-      std::max(I->beginOffset(), SliceBeginOffset) - SliceBeginOffset;
+      std::max(S.beginOffset(), SliceBeginOffset) - SliceBeginOffset;
   uint64_t BeginIndex = BeginOffset / ElementSize;
   if (BeginIndex * ElementSize != BeginOffset ||
       BeginIndex >= Ty->getNumElements())
     return false;
   uint64_t EndOffset =
-      std::min(I->endOffset(), SliceEndOffset) - SliceBeginOffset;
+      std::min(S.endOffset(), SliceEndOffset) - SliceBeginOffset;
   uint64_t EndIndex = EndOffset / ElementSize;
   if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
     return false;
 
   assert(EndIndex > BeginIndex && "Empty vector!");
   uint64_t NumElements = EndIndex - BeginIndex;
-  Type *SliceTy =
-      (NumElements == 1) ? Ty->getElementType()
-                         : VectorType::get(Ty->getElementType(), NumElements);
+  Type *SliceTy = (NumElements == 1)
+                      ? Ty->getElementType()
+                      : VectorType::get(Ty->getElementType(), NumElements);
 
   Type *SplitIntTy =
       Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
 
-  Use *U = I->getUse();
+  Use *U = S.getUse();
 
   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
     if (MI->isVolatile())
       return false;
-    if (!I->isSplittable())
+    if (!S.isSplittable())
       return false; // Skip any unsplittable intrinsics.
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+        II->getIntrinsicID() != Intrinsic::lifetime_end)
+      return false;
   } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
     // Disable vector promotion when there are loads or stores of an FCA.
     return false;
@@ -1661,8 +1658,7 @@
     if (LI->isVolatile())
       return false;
     Type *LTy = LI->getType();
-    if (SliceBeginOffset > I->beginOffset() ||
-        SliceEndOffset < I->endOffset()) {
+    if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) {
       assert(LTy->isIntegerTy());
       LTy = SplitIntTy;
     }
@@ -1672,8 +1668,7 @@
     if (SI->isVolatile())
       return false;
     Type *STy = SI->getValueOperand()->getType();
-    if (SliceBeginOffset > I->beginOffset() ||
-        SliceEndOffset < I->endOffset()) {
+    if (SliceBeginOffset > S.beginOffset() || SliceEndOffset < S.endOffset()) {
       assert(STy->isIntegerTy());
       STy = SplitIntTy;
     }
@@ -1695,39 +1690,113 @@
 /// SSA value. We only can ensure this for a limited set of operations, and we
 /// don't want to do the rewrites unless we are confident that the result will
 /// be promotable, so we have an early test here.
-static bool
-isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, AllocaSlices &S,
+static VectorType *
+isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy,
                         uint64_t SliceBeginOffset, uint64_t SliceEndOffset,
-                        AllocaSlices::const_iterator I,
-                        AllocaSlices::const_iterator E,
+                        AllocaSlices::const_range Slices,
                         ArrayRef<AllocaSlices::iterator> SplitUses) {
-  VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
-  if (!Ty)
-    return false;
+  // Collect the candidate types for vector-based promotion. Also track whether
+  // we have different element types.
+  SmallVector<VectorType *, 4> CandidateTys;
+  Type *CommonEltTy = nullptr;
+  bool HaveCommonEltTy = true;
+  auto CheckCandidateType = [&](Type *Ty) {
+    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+      CandidateTys.push_back(VTy);
+      if (!CommonEltTy)
+        CommonEltTy = VTy->getElementType();
+      else if (CommonEltTy != VTy->getElementType())
+        HaveCommonEltTy = false;
+    }
+  };
+  CheckCandidateType(AllocaTy);
+  // Consider any loads or stores that are the exact size of the slice.
+  for (const auto &S : Slices)
+    if (S.beginOffset() == SliceBeginOffset &&
+        S.endOffset() == SliceEndOffset) {
+      if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+        CheckCandidateType(LI->getType());
+      else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+        CheckCandidateType(SI->getValueOperand()->getType());
+    }
 
-  uint64_t ElementSize = DL.getTypeSizeInBits(Ty->getScalarType());
+  // If we didn't find a vector type, nothing to do here.
+  if (CandidateTys.empty())
+    return nullptr;
 
-  // While the definition of LLVM vectors is bitpacked, we don't support sizes
-  // that aren't byte sized.
-  if (ElementSize % 8)
-    return false;
-  assert((DL.getTypeSizeInBits(Ty) % 8) == 0 &&
-         "vector size not a multiple of element size?");
-  ElementSize /= 8;
+  // Remove non-integer vector types if we had multiple common element types.
+  // FIXME: It'd be nice to replace them with integer vector types, but we can't
+  // do that until all the backends are known to produce good code for all
+  // integer vector types.
+  if (!HaveCommonEltTy) {
+    CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(),
+                                      [](VectorType *VTy) {
+                         return !VTy->getElementType()->isIntegerTy();
+                       }),
+                       CandidateTys.end());
 
-  for (; I != E; ++I)
-    if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset,
-                                         SliceEndOffset, Ty, ElementSize, I))
+    // If there were no integer vector types, give up.
+    if (CandidateTys.empty())
+      return nullptr;
+
+    // Rank the remaining candidate vector types. This is easy because we know
+    // they're all integer vectors. We sort by ascending number of elements.
+    auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+      assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
+             "Cannot have vector types of different sizes!");
+      assert(RHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      assert(LHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      return RHSTy->getNumElements() < LHSTy->getNumElements();
+    };
+    std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
+    CandidateTys.erase(
+        std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
+        CandidateTys.end());
+  } else {
+// The only way to have the same element type in every vector type is to
+// have the same vector type. Check that and remove all but one.
+#ifndef NDEBUG
+    for (VectorType *VTy : CandidateTys) {
+      assert(VTy->getElementType() == CommonEltTy &&
+             "Unaccounted for element type!");
+      assert(VTy == CandidateTys[0] &&
+             "Different vector types with the same element type!");
+    }
+#endif
+    CandidateTys.resize(1);
+  }
+
+  // Try each vector type, and return the one which works.
+  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
+    uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
+
+    // While the definition of LLVM vectors is bitpacked, we don't support sizes
+    // that aren't byte sized.
+    if (ElementSize % 8)
       return false;
+    assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
+           "vector size not a multiple of element size?");
+    ElementSize /= 8;
 
-  for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
-                                                        SUE = SplitUses.end();
-       SUI != SUE; ++SUI)
-    if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset,
-                                         SliceEndOffset, Ty, ElementSize, *SUI))
-      return false;
+    for (const auto &S : Slices)
+      if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset,
+                                           VTy, ElementSize, S))
+        return false;
 
-  return true;
+    for (const auto &SI : SplitUses)
+      if (!isVectorPromotionViableForSlice(DL, SliceBeginOffset, SliceEndOffset,
+                                           VTy, ElementSize, *SI))
+        return false;
+
+    return true;
+  };
+  for (VectorType *VTy : CandidateTys)
+    if (CheckVectorTypeForPromotion(VTy))
+      return VTy;
+
+  return nullptr;
 }
 
 /// \brief Test whether a slice of an alloca is valid for integer widening.
@@ -1737,23 +1806,26 @@
 static bool isIntegerWideningViableForSlice(const DataLayout &DL,
                                             Type *AllocaTy,
                                             uint64_t AllocBeginOffset,
-                                            uint64_t Size, AllocaSlices &S,
-                                            AllocaSlices::const_iterator I,
+                                            uint64_t Size,
+                                            const Slice &S,
                                             bool &WholeAllocaOp) {
-  uint64_t RelBegin = I->beginOffset() - AllocBeginOffset;
-  uint64_t RelEnd = I->endOffset() - AllocBeginOffset;
+  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
+  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
 
   // We can't reasonably handle cases where the load or store extends past
   // the end of the aloca's type and into its padding.
   if (RelEnd > Size)
     return false;
 
-  Use *U = I->getUse();
+  Use *U = S.getUse();
 
   if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
     if (LI->isVolatile())
       return false;
-    if (RelBegin == 0 && RelEnd == Size)
+    // Note that we don't count vector loads or stores as whole-alloca
+    // operations which enable integer widening because we would prefer to use
+    // vector widening instead.
+    if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
       WholeAllocaOp = true;
     if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
@@ -1768,7 +1840,10 @@
     Type *ValueTy = SI->getValueOperand()->getType();
     if (SI->isVolatile())
       return false;
-    if (RelBegin == 0 && RelEnd == Size)
+    // Note that we don't count vector loads or stores as whole-alloca
+    // operations which enable integer widening because we would prefer to use
+    // vector widening instead.
+    if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
       WholeAllocaOp = true;
     if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
@@ -1782,7 +1857,7 @@
   } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
     if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
       return false;
-    if (!I->isSplittable())
+    if (!S.isSplittable())
       return false; // Skip any unsplittable intrinsics.
   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
     if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
@@ -1803,9 +1878,8 @@
 /// promote the resulting alloca.
 static bool
 isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy,
-                        uint64_t AllocBeginOffset, AllocaSlices &S,
-                        AllocaSlices::const_iterator I,
-                        AllocaSlices::const_iterator E,
+                        uint64_t AllocBeginOffset,
+                        AllocaSlices::const_range Slices,
                         ArrayRef<AllocaSlices::iterator> SplitUses) {
   uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
   // Don't create integer types larger than the maximum bitwidth.
@@ -1831,18 +1905,17 @@
   // promote due to some other unsplittable entry (which we may make splittable
   // later). However, if there are only splittable uses, go ahead and assume
   // that we cover the alloca.
-  bool WholeAllocaOp = (I != E) ? false : DL.isLegalInteger(SizeInBits);
+  bool WholeAllocaOp =
+      Slices.begin() != Slices.end() ? false : DL.isLegalInteger(SizeInBits);
 
-  for (; I != E; ++I)
+  for (const auto &S : Slices)
     if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
-                                         S, I, WholeAllocaOp))
+                                         S, WholeAllocaOp))
       return false;
 
-  for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
-                                                        SUE = SplitUses.end();
-       SUI != SUE; ++SUI)
+  for (const auto &SI : SplitUses)
     if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
-                                         S, *SUI, WholeAllocaOp))
+                                         *SI, WholeAllocaOp))
       return false;
 
   return WholeAllocaOp;
@@ -1991,12 +2064,18 @@
   typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
 
   const DataLayout &DL;
-  AllocaSlices &S;
+  AllocaSlices &AS;
   SROA &Pass;
   AllocaInst &OldAI, &NewAI;
   const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
   Type *NewAllocaTy;
 
+  // This is a convenience and flag variable that will be null unless the new
+  // alloca's integer operations should be widened to this integer type due to
+  // passing isIntegerWideningViable above. If it is non-null, the desired
+  // integer type will be stored here for easy access during rewriting.
+  IntegerType *IntTy;
+
   // If we are rewriting an alloca partition which can be written as pure
   // vector operations, we stash extra information here. When VecTy is
   // non-null, we have some strict guarantees about the rewritten alloca:
@@ -2010,12 +2089,6 @@
   Type *ElementTy;
   uint64_t ElementSize;
 
-  // This is a convenience and flag variable that will be null unless the new
-  // alloca's integer operations should be widened to this integer type due to
-  // passing isIntegerWideningViable above. If it is non-null, the desired
-  // integer type will be stored here for easy access during rewriting.
-  IntegerType *IntTy;
-
   // The original offset of the slice currently being rewritten relative to
   // the original alloca.
   uint64_t BeginOffset, EndOffset;
@@ -2038,25 +2111,25 @@
   IRBuilderTy IRB;
 
 public:
-  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass,
+  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
                       AllocaInst &OldAI, AllocaInst &NewAI,
                       uint64_t NewAllocaBeginOffset,
-                      uint64_t NewAllocaEndOffset, bool IsVectorPromotable,
-                      bool IsIntegerPromotable,
+                      uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
+                      VectorType *PromotableVecTy,
                       SmallPtrSetImpl<PHINode *> &PHIUsers,
                       SmallPtrSetImpl<SelectInst *> &SelectUsers)
-      : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+      : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
         NewAllocaBeginOffset(NewAllocaBeginOffset),
         NewAllocaEndOffset(NewAllocaEndOffset),
         NewAllocaTy(NewAI.getAllocatedType()),
-        VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : nullptr),
-        ElementTy(VecTy ? VecTy->getElementType() : nullptr),
-        ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
         IntTy(IsIntegerPromotable
                   ? Type::getIntNTy(
                         NewAI.getContext(),
                         DL.getTypeSizeInBits(NewAI.getAllocatedType()))
                   : nullptr),
+        VecTy(PromotableVecTy),
+        ElementTy(VecTy ? VecTy->getElementType() : nullptr),
+        ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
         BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
         OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
         IRB(NewAI.getContext(), ConstantFolder()) {
@@ -2065,8 +2138,7 @@
              "Only multiple-of-8 sized vector elements are viable");
       ++NumVectorized;
     }
-    assert((!IsVectorPromotable && !IsIntegerPromotable) ||
-           IsVectorPromotable != IsIntegerPromotable);
+    assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
   }
 
   bool visit(AllocaSlices::const_iterator I) {
@@ -2413,6 +2485,7 @@
     if (!VecTy && !IntTy &&
         (BeginOffset > NewAllocaBeginOffset ||
          EndOffset < NewAllocaEndOffset ||
+         SliceSize != DL.getTypeStoreSize(AllocaTy) ||
          !AllocaTy->isSingleValueType() ||
          !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
          DL.getTypeSizeInBits(ScalarTy)%8 != 0)) {
@@ -2535,10 +2608,11 @@
 
     // If this doesn't map cleanly onto the alloca type, and that type isn't
     // a single value type, just emit a memcpy.
-    bool EmitMemCpy
-      = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset ||
-                             EndOffset < NewAllocaEndOffset ||
-                             !NewAI.getAllocatedType()->isSingleValueType());
+    bool EmitMemCpy =
+        !VecTy && !IntTy &&
+        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+         SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
+         !NewAI.getAllocatedType()->isSingleValueType());
 
     // If we're just going to emit a memcpy, the alloca hasn't changed, and the
     // size hasn't been shrunk based on analysis of the viable range, this is
@@ -2697,7 +2771,10 @@
     // the old pointer, which necessarily must be in the right position to
     // dominate the PHI.
     IRBuilderTy PtrBuilder(IRB);
-    PtrBuilder.SetInsertPoint(OldPtr);
+    if (isa<PHINode>(OldPtr))
+      PtrBuilder.SetInsertPoint(OldPtr->getParent()->getFirstInsertionPt());
+    else
+      PtrBuilder.SetInsertPoint(OldPtr);
     PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());
 
     Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
@@ -2784,7 +2861,7 @@
   /// This uses a set to de-duplicate users.
   void enqueueUsers(Instruction &I) {
     for (Use &U : I.uses())
-      if (Visited.insert(U.getUser()))
+      if (Visited.insert(U.getUser()).second)
         Queue.push_back(&U);
   }
 
@@ -3104,7 +3181,7 @@
 /// appropriate new offsets. It also evaluates how successful the rewrite was
 /// at enabling promotion and if it was successful queues the alloca to be
 /// promoted.
-bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
+bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                             AllocaSlices::iterator B, AllocaSlices::iterator E,
                             int64_t BeginOffset, int64_t EndOffset,
                             ArrayRef<AllocaSlices::iterator> SplitUses) {
@@ -3130,12 +3207,16 @@
     SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize);
   assert(DL->getTypeAllocSize(SliceTy) >= SliceSize);
 
-  bool IsVectorPromotable = isVectorPromotionViable(
-      *DL, SliceTy, S, BeginOffset, EndOffset, B, E, SplitUses);
+  bool IsIntegerPromotable = isIntegerWideningViable(
+      *DL, SliceTy, BeginOffset, AllocaSlices::const_range(B, E), SplitUses);
 
-  bool IsIntegerPromotable =
-      !IsVectorPromotable &&
-      isIntegerWideningViable(*DL, SliceTy, BeginOffset, S, B, E, SplitUses);
+  VectorType *VecTy =
+      IsIntegerPromotable
+          ? nullptr
+          : isVectorPromotionViable(*DL, SliceTy, BeginOffset, EndOffset,
+                                    AllocaSlices::const_range(B, E), SplitUses);
+  if (VecTy)
+    SliceTy = VecTy;
 
   // Check for the case where we're going to rewrite to a new alloca of the
   // exact same type as the original, and with the same access offsets. In that
@@ -3161,8 +3242,9 @@
     // the alloca's alignment unconstrained.
     if (Alignment <= DL->getABITypeAlignment(SliceTy))
       Alignment = 0;
-    NewAI = new AllocaInst(SliceTy, nullptr, Alignment,
-                           AI.getName() + ".sroa." + Twine(B - S.begin()), &AI);
+    NewAI =
+        new AllocaInst(SliceTy, nullptr, Alignment,
+                       AI.getName() + ".sroa." + Twine(B - AS.begin()), &AI);
     ++NumNewAllocas;
   }
 
@@ -3178,21 +3260,19 @@
   SmallPtrSet<PHINode *, 8> PHIUsers;
   SmallPtrSet<SelectInst *, 8> SelectUsers;
 
-  AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset,
-                               EndOffset, IsVectorPromotable,
-                               IsIntegerPromotable, PHIUsers, SelectUsers);
+  AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, BeginOffset,
+                               EndOffset, IsIntegerPromotable, VecTy, PHIUsers,
+                               SelectUsers);
   bool Promotable = true;
-  for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
-                                                        SUE = SplitUses.end();
-       SUI != SUE; ++SUI) {
+  for (auto & SplitUse : SplitUses) {
     DEBUG(dbgs() << "  rewriting split ");
-    DEBUG(S.printSlice(dbgs(), *SUI, ""));
-    Promotable &= Rewriter.visit(*SUI);
+    DEBUG(AS.printSlice(dbgs(), SplitUse, ""));
+    Promotable &= Rewriter.visit(SplitUse);
     ++NumUses;
   }
   for (AllocaSlices::iterator I = B; I != E; ++I) {
     DEBUG(dbgs() << "  rewriting ");
-    DEBUG(S.printSlice(dbgs(), I, ""));
+    DEBUG(AS.printSlice(dbgs(), I, ""));
     Promotable &= Rewriter.visit(I);
     ++NumUses;
   }
@@ -3230,14 +3310,10 @@
       // If we have either PHIs or Selects to speculate, add them to those
       // worklists and re-queue the new alloca so that we promote in on the
       // next iteration.
-      for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
-                                                E = PHIUsers.end();
-           I != E; ++I)
-        SpeculatablePHIs.insert(*I);
-      for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
-                                                   E = SelectUsers.end();
-           I != E; ++I)
-        SpeculatableSelects.insert(*I);
+      for (PHINode *PHIUser : PHIUsers)
+        SpeculatablePHIs.insert(PHIUser);
+      for (SelectInst *SelectUser : SelectUsers)
+        SpeculatableSelects.insert(SelectUser);
       Worklist.insert(NewAI);
     }
   } else {
@@ -3275,17 +3351,15 @@
 
   // Recompute the max. While this is linear, so is remove_if.
   MaxSplitUseEndOffset = 0;
-  for (SmallVectorImpl<AllocaSlices::iterator>::iterator
-           SUI = SplitUses.begin(),
-           SUE = SplitUses.end();
-       SUI != SUE; ++SUI)
-    MaxSplitUseEndOffset = std::max((*SUI)->endOffset(), MaxSplitUseEndOffset);
+  for (AllocaSlices::iterator SplitUse : SplitUses)
+    MaxSplitUseEndOffset =
+        std::max(SplitUse->endOffset(), MaxSplitUseEndOffset);
 }
 
 /// \brief Walks the slices of an alloca and form partitions based on them,
 /// rewriting each of their uses.
-bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) {
-  if (S.begin() == S.end())
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
+  if (AS.begin() == AS.end())
     return false;
 
   unsigned NumPartitions = 0;
@@ -3293,9 +3367,10 @@
   SmallVector<AllocaSlices::iterator, 4> SplitUses;
   uint64_t MaxSplitUseEndOffset = 0;
 
-  uint64_t BeginOffset = S.begin()->beginOffset();
+  uint64_t BeginOffset = AS.begin()->beginOffset();
 
-  for (AllocaSlices::iterator SI = S.begin(), SJ = std::next(SI), SE = S.end();
+  for (AllocaSlices::iterator SI = AS.begin(), SJ = std::next(SI),
+                              SE = AS.end();
        SI != SE; SI = SJ) {
     uint64_t MaxEndOffset = SI->endOffset();
 
@@ -3333,8 +3408,8 @@
     // we'll have to rewrite uses and erase old split uses.
     if (BeginOffset < MaxEndOffset) {
       // Rewrite a sequence of overlapping slices.
-      Changed |=
-          rewritePartition(AI, S, SI, SJ, BeginOffset, MaxEndOffset, SplitUses);
+      Changed |= rewritePartition(AI, AS, SI, SJ, BeginOffset, MaxEndOffset,
+                                  SplitUses);
       ++NumPartitions;
 
       removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset);
@@ -3373,8 +3448,8 @@
     uint64_t PostSplitEndOffset =
         SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset();
 
-    Changed |= rewritePartition(AI, S, SJ, SJ, MaxEndOffset, PostSplitEndOffset,
-                                SplitUses);
+    Changed |= rewritePartition(AI, AS, SJ, SJ, MaxEndOffset,
+                                PostSplitEndOffset, SplitUses);
     ++NumPartitions;
 
     if (SJ == SE)
@@ -3437,38 +3512,34 @@
   Changed |= AggRewriter.rewrite(AI);
 
   // Build the slices using a recursive instruction-visiting builder.
-  AllocaSlices S(*DL, AI);
-  DEBUG(S.print(dbgs()));
-  if (S.isEscaped())
+  AllocaSlices AS(*DL, AI);
+  DEBUG(AS.print(dbgs()));
+  if (AS.isEscaped())
     return Changed;
 
   // Delete all the dead users of this alloca before splitting and rewriting it.
-  for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(),
-                                        DE = S.dead_user_end();
-       DI != DE; ++DI) {
+  for (Instruction *DeadUser : AS.getDeadUsers()) {
     // Free up everything used by this instruction.
-    for (Use &DeadOp : (*DI)->operands())
+    for (Use &DeadOp : DeadUser->operands())
       clobberUse(DeadOp);
 
     // Now replace the uses of this instruction.
-    (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
+    DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
 
     // And mark it for deletion.
-    DeadInsts.insert(*DI);
+    DeadInsts.insert(DeadUser);
     Changed = true;
   }
-  for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(),
-                                      DE = S.dead_op_end();
-       DO != DE; ++DO) {
-    clobberUse(**DO);
+  for (Use *DeadOp : AS.getDeadOperands()) {
+    clobberUse(*DeadOp);
     Changed = true;
   }
 
   // No slices to split. Leave the dead alloca for a later pass to clean up.
-  if (S.begin() == S.end())
+  if (AS.begin() == AS.end())
     return Changed;
 
-  Changed |= splitAlloca(AI, S);
+  Changed |= splitAlloca(AI, AS);
 
   DEBUG(dbgs() << "  Speculating PHIs\n");
   while (!SpeculatablePHIs.empty())
@@ -3490,7 +3561,7 @@
 ///
 /// We also record the alloca instructions deleted here so that they aren't
 /// subsequently handed to mem2reg to promote.
-void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
+void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas) {
   while (!DeadInsts.empty()) {
     Instruction *I = DeadInsts.pop_back_val();
     DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
@@ -3515,9 +3586,9 @@
 
 static void enqueueUsersInWorklist(Instruction &I,
                                    SmallVectorImpl<Instruction *> &Worklist,
-                                   SmallPtrSet<Instruction *, 8> &Visited) {
+                                   SmallPtrSetImpl<Instruction *> &Visited) {
   for (User *U : I.users())
-    if (Visited.insert(cast<Instruction>(U)))
+    if (Visited.insert(cast<Instruction>(U)).second)
       Worklist.push_back(cast<Instruction>(U));
 }
 
@@ -3537,7 +3608,7 @@
 
   if (DT && !ForceSSAUpdater) {
     DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
-    PromoteMemToReg(PromotableAllocas, *DT);
+    PromoteMemToReg(PromotableAllocas, *DT, nullptr, AT);
     PromotableAllocas.clear();
     return true;
   }
@@ -3619,6 +3690,7 @@
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  AT = &getAnalysis<AssumptionTracker>();
 
   BasicBlock &EntryBB = F.getEntryBlock();
   for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
@@ -3662,6 +3734,7 @@
 }
 
 void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AssumptionTracker>();
   if (RequiresDomTree)
     AU.addRequired<DominatorTreeWrapperPass>();
   AU.setPreservesCFG();

diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
index 73c97ff..179bbf7 100644
--- a/lib/Transforms/Scalar/SampleProfile.cpp
+++ b/lib/Transforms/Scalar/SampleProfile.cpp

@@ -26,7 +26,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -42,15 +41,14 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/ProfileData/SampleProfReader.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cctype>
 
 using namespace llvm;
+using namespace sampleprof;
 
 #define DEBUG_TYPE "sample-profile"
 
@@ -65,76 +63,48 @@
              "sample block/edge weights through the CFG."));
 
 namespace {
-/// \brief Represents the relative location of an instruction.
-///
-/// Instruction locations are specified by the line offset from the
-/// beginning of the function (marked by the line where the function
-/// header is) and the discriminator value within that line.
-///
-/// The discriminator value is useful to distinguish instructions
-/// that are on the same line but belong to different basic blocks
-/// (e.g., the two post-increment instructions in "if (p) x++; else y++;").
-struct InstructionLocation {
-  InstructionLocation(int L, unsigned D) : LineOffset(L), Discriminator(D) {}
-  int LineOffset;
-  unsigned Discriminator;
-};
-}
-
-namespace llvm {
-template <> struct DenseMapInfo<InstructionLocation> {
-  typedef DenseMapInfo<int> OffsetInfo;
-  typedef DenseMapInfo<unsigned> DiscriminatorInfo;
-  static inline InstructionLocation getEmptyKey() {
-    return InstructionLocation(OffsetInfo::getEmptyKey(),
-                               DiscriminatorInfo::getEmptyKey());
-  }
-  static inline InstructionLocation getTombstoneKey() {
-    return InstructionLocation(OffsetInfo::getTombstoneKey(),
-                               DiscriminatorInfo::getTombstoneKey());
-  }
-  static inline unsigned getHashValue(InstructionLocation Val) {
-    return DenseMapInfo<std::pair<int, unsigned>>::getHashValue(
-        std::pair<int, unsigned>(Val.LineOffset, Val.Discriminator));
-  }
-  static inline bool isEqual(InstructionLocation LHS, InstructionLocation RHS) {
-    return LHS.LineOffset == RHS.LineOffset &&
-           LHS.Discriminator == RHS.Discriminator;
-  }
-};
-}
-
-namespace {
-typedef DenseMap<InstructionLocation, unsigned> BodySampleMap;
 typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap;
 typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap;
 typedef std::pair<BasicBlock *, BasicBlock *> Edge;
 typedef DenseMap<Edge, unsigned> EdgeWeightMap;
 typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap;
 
-/// \brief Representation of the runtime profile for a function.
+/// \brief Sample profile pass.
 ///
-/// This data structure contains the runtime profile for a given
-/// function. It contains the total number of samples collected
-/// in the function and a map of samples collected in every statement.
-class SampleFunctionProfile {
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader : public FunctionPass {
 public:
-  SampleFunctionProfile()
-      : TotalSamples(0), TotalHeadSamples(0), HeaderLineno(0), DT(nullptr),
-        PDT(nullptr), LI(nullptr), Ctx(nullptr) {}
+  // Class identification, replacement for typeinfo
+  static char ID;
 
-  unsigned getFunctionLoc(Function &F);
-  bool emitAnnotations(Function &F, DominatorTree *DomTree,
-                       PostDominatorTree *PostDomTree, LoopInfo *Loops);
-  unsigned getInstWeight(Instruction &I);
-  unsigned getBlockWeight(BasicBlock *B);
-  void addTotalSamples(unsigned Num) { TotalSamples += Num; }
-  void addHeadSamples(unsigned Num) { TotalHeadSamples += Num; }
-  void addBodySamples(int LineOffset, unsigned Discriminator, unsigned Num) {
-    assert(LineOffset >= 0);
-    BodySamples[InstructionLocation(LineOffset, Discriminator)] += Num;
+  SampleProfileLoader(StringRef Name = SampleProfileFile)
+      : FunctionPass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Ctx(nullptr),
+        Reader(), Samples(nullptr), Filename(Name), ProfileIsValid(false) {
+    initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
   }
-  void print(raw_ostream &OS);
+
+  bool doInitialization(Module &M) override;
+
+  void dump() { Reader->dump(); }
+
+  const char *getPassName() const override { return "Sample profile pass"; }
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<LoopInfo>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTree>();
+  }
+
+protected:
+  unsigned getFunctionLoc(Function &F);
+  bool emitAnnotations(Function &F);
+  unsigned getInstWeight(Instruction &I);
+  unsigned getBlockWeight(BasicBlock *BB);
   void printEdgeWeight(raw_ostream &OS, Edge E);
   void printBlockWeight(raw_ostream &OS, BasicBlock *BB);
   void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB);
@@ -147,32 +117,11 @@
   unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
   void buildEdges(Function &F);
   bool propagateThroughEdges(Function &F);
-  bool empty() { return BodySamples.empty(); }
 
-protected:
-  /// \brief Total number of samples collected inside this function.
-  ///
-  /// Samples are cumulative, they include all the samples collected
-  /// inside this function and all its inlined callees.
-  unsigned TotalSamples;
-
-  /// \brief Total number of samples collected at the head of the function.
-  /// FIXME: Use head samples to estimate a cold/hot attribute for the function.
-  unsigned TotalHeadSamples;
-
-  /// \brief Line number for the function header. Used to compute relative
-  /// line numbers from the absolute line LOCs found in instruction locations.
-  /// The relative line numbers are needed to address the samples from the
-  /// profile file.
+  /// \brief Line number for the function header. Used to compute relative
+  /// line offsets from the absolute line numbers of instruction locations.
   unsigned HeaderLineno;
 
-  /// \brief Map line offsets to collected samples.
-  ///
-  /// Each entry in this map contains the number of samples
-  /// collected at the corresponding line offset. All line locations
-  /// are an offset from the start of the function.
-  BodySampleMap BodySamples;
-
   /// \brief Map basic blocks to their computed weights.
   ///
   /// The weight of a basic block is defined to be the maximum
@@ -212,105 +161,12 @@
 
   /// \brief LLVM context holding the debug data we need.
   LLVMContext *Ctx;
-};
 
-/// \brief Sample-based profile reader.
-///
-/// Each profile contains sample counts for all the functions
-/// executed. Inside each function, statements are annotated with the
-/// collected samples on all the instructions associated with that
-/// statement.
-///
-/// For this to produce meaningful data, the program needs to be
-/// compiled with some debug information (at minimum, line numbers:
-/// -gline-tables-only). Otherwise, it will be impossible to match IR
-/// instructions to the line numbers collected by the profiler.
-///
-/// From the profile file, we are interested in collecting the
-/// following information:
-///
-/// * A list of functions included in the profile (mangled names).
-///
-/// * For each function F:
-///   1. The total number of samples collected in F.
-///
-///   2. The samples collected at each line in F. To provide some
-///      protection against source code shuffling, line numbers should
-///      be relative to the start of the function.
-class SampleModuleProfile {
-public:
-  SampleModuleProfile(const Module &M, StringRef F)
-      : Profiles(0), Filename(F), M(M) {}
-
-  void dump();
-  bool loadText();
-  void loadNative() { llvm_unreachable("not implemented"); }
-  void printFunctionProfile(raw_ostream &OS, StringRef FName);
-  void dumpFunctionProfile(StringRef FName);
-  SampleFunctionProfile &getProfile(const Function &F) {
-    return Profiles[F.getName()];
-  }
-
-  /// \brief Report a parse error message.
-  void reportParseError(int64_t LineNumber, Twine Msg) const {
-    DiagnosticInfoSampleProfile Diag(Filename.data(), LineNumber, Msg);
-    M.getContext().diagnose(Diag);
-  }
-
-protected:
-  /// \brief Map every function to its associated profile.
-  ///
-  /// The profile of every function executed at runtime is collected
-  /// in the structure SampleFunctionProfile. This maps function objects
-  /// to their corresponding profiles.
-  StringMap<SampleFunctionProfile> Profiles;
-
-  /// \brief Path name to the file holding the profile data.
-  ///
-  /// The format of this file is defined by each profiler
-  /// independently. If possible, the profiler should have a text
-  /// version of the profile format to be used in constructing test
-  /// cases and debugging.
-  StringRef Filename;
-
-  /// \brief Module being compiled. Used mainly to access the current
-  /// LLVM context for diagnostics.
-  const Module &M;
-};
-
-/// \brief Sample profile pass.
-///
-/// This pass reads profile data from the file specified by
-/// -sample-profile-file and annotates every affected function with the
-/// profile information found in that file.
-class SampleProfileLoader : public FunctionPass {
-public:
-  // Class identification, replacement for typeinfo
-  static char ID;
-
-  SampleProfileLoader(StringRef Name = SampleProfileFile)
-      : FunctionPass(ID), Profiler(), Filename(Name), ProfileIsValid(false) {
-    initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool doInitialization(Module &M) override;
-
-  void dump() { Profiler->dump(); }
-
-  const char *getPassName() const override { return "Sample profile pass"; }
-
-  bool runOnFunction(Function &F) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    AU.addRequired<LoopInfo>();
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<PostDominatorTree>();
-  }
-
-protected:
   /// \brief Profile reader object.
-  std::unique_ptr<SampleModuleProfile> Profiler;
+  std::unique_ptr<SampleProfileReader> Reader;
+
+  /// \brief Samples collected for the body of this function.
+  FunctionSamples *Samples;
 
   /// \brief Name of the profile file to load.
   StringRef Filename;
@@ -320,26 +176,11 @@
 };
 }
 
-/// \brief Print this function profile on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-void SampleFunctionProfile::print(raw_ostream &OS) {
-  OS << TotalSamples << ", " << TotalHeadSamples << ", " << BodySamples.size()
-     << " sampled lines\n";
-  for (BodySampleMap::const_iterator SI = BodySamples.begin(),
-                                     SE = BodySamples.end();
-       SI != SE; ++SI)
-    OS << "\tline offset: " << SI->first.LineOffset
-       << ", discriminator: " << SI->first.Discriminator
-       << ", number of samples: " << SI->second << "\n";
-  OS << "\n";
-}
-
 /// \brief Print the weight of edge \p E on stream \p OS.
 ///
 /// \param OS  Stream to emit the output to.
 /// \param E  Edge to print.
-void SampleFunctionProfile::printEdgeWeight(raw_ostream &OS, Edge E) {
+void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
   OS << "weight[" << E.first->getName() << "->" << E.second->getName()
      << "]: " << EdgeWeights[E] << "\n";
 }
@@ -348,8 +189,8 @@
 ///
 /// \param OS  Stream to emit the output to.
 /// \param BB  Block to print.
-void SampleFunctionProfile::printBlockEquivalence(raw_ostream &OS,
-                                                  BasicBlock *BB) {
+void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
+                                                BasicBlock *BB) {
   BasicBlock *Equiv = EquivalenceClass[BB];
   OS << "equivalence[" << BB->getName()
      << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
@@ -359,174 +200,10 @@
 ///
 /// \param OS  Stream to emit the output to.
 /// \param BB  Block to print.
-void SampleFunctionProfile::printBlockWeight(raw_ostream &OS, BasicBlock *BB) {
+void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) {
   OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n";
 }
 
-/// \brief Print the function profile for \p FName on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-/// \param FName Name of the function to print.
-void SampleModuleProfile::printFunctionProfile(raw_ostream &OS,
-                                               StringRef FName) {
-  OS << "Function: " << FName << ":\n";
-  Profiles[FName].print(OS);
-}
-
-/// \brief Dump the function profile for \p FName.
-///
-/// \param FName Name of the function to print.
-void SampleModuleProfile::dumpFunctionProfile(StringRef FName) {
-  printFunctionProfile(dbgs(), FName);
-}
-
-/// \brief Dump all the function profiles found.
-void SampleModuleProfile::dump() {
-  for (StringMap<SampleFunctionProfile>::const_iterator I = Profiles.begin(),
-                                                        E = Profiles.end();
-       I != E; ++I)
-    dumpFunctionProfile(I->getKey());
-}
-
-/// \brief Load samples from a text file.
-///
-/// The file contains a list of samples for every function executed at
-/// runtime. Each function profile has the following format:
-///
-///    function1:total_samples:total_head_samples
-///    offset1[.discriminator]: number_of_samples [fn1:num fn2:num ... ]
-///    offset2[.discriminator]: number_of_samples [fn3:num fn4:num ... ]
-///    ...
-///    offsetN[.discriminator]: number_of_samples [fn5:num fn6:num ... ]
-///
-/// Function names must be mangled in order for the profile loader to
-/// match them in the current translation unit. The two numbers in the
-/// function header specify how many total samples were accumulated in
-/// the function (first number), and the total number of samples accumulated
-/// at the prologue of the function (second number). This head sample
-/// count provides an indicator of how frequent is the function invoked.
-///
-/// Each sampled line may contain several items. Some are optional
-/// (marked below):
-///
-/// a- Source line offset. This number represents the line number
-///    in the function where the sample was collected. The line number
-///    is always relative to the line where symbol of the function
-///    is defined. So, if the function has its header at line 280,
-///    the offset 13 is at line 293 in the file.
-///
-/// b- [OPTIONAL] Discriminator. This is used if the sampled program
-///    was compiled with DWARF discriminator support
-///    (http://wiki.dwarfstd.org/index.php?title=Path_Discriminators)
-///
-/// c- Number of samples. This is the number of samples collected by
-///    the profiler at this source location.
-///
-/// d- [OPTIONAL] Potential call targets and samples. If present, this
-///    line contains a call instruction. This models both direct and
-///    indirect calls. Each called target is listed together with the
-///    number of samples. For example,
-///
-///    130: 7  foo:3  bar:2  baz:7
-///
-///    The above means that at relative line offset 130 there is a
-///    call instruction that calls one of foo(), bar() and baz(). With
-///    baz() being the relatively more frequent call target.
-///
-///    FIXME: This is currently unhandled, but it has a lot of
-///           potential for aiding the inliner.
-///
-///
-/// Since this is a flat profile, a function that shows up more than
-/// once gets all its samples aggregated across all its instances.
-///
-/// FIXME: flat profiles are too imprecise to provide good optimization
-///        opportunities. Convert them to context-sensitive profile.
-///
-/// This textual representation is useful to generate unit tests and
-/// for debugging purposes, but it should not be used to generate
-/// profiles for large programs, as the representation is extremely
-/// inefficient.
-///
-/// \returns true if the file was loaded successfully, false otherwise.
-bool SampleModuleProfile::loadText() {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
-      MemoryBuffer::getFile(Filename);
-  if (std::error_code EC = BufferOrErr.getError()) {
-    std::string Msg(EC.message());
-    M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg));
-    return false;
-  }
-  std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
-  line_iterator LineIt(*Buffer, '#');
-
-  // Read the profile of each function. Since each function may be
-  // mentioned more than once, and we are collecting flat profiles,
-  // accumulate samples as we parse them.
-  Regex HeadRE("^([^0-9].*):([0-9]+):([0-9]+)$");
-  Regex LineSample("^([0-9]+)\\.?([0-9]+)?: ([0-9]+)(.*)$");
-  while (!LineIt.is_at_eof()) {
-    // Read the header of each function.
-    //
-    // Note that for function identifiers we are actually expecting
-    // mangled names, but we may not always get them. This happens when
-    // the compiler decides not to emit the function (e.g., it was inlined
-    // and removed). In this case, the binary will not have the linkage
-    // name for the function, so the profiler will emit the function's
-    // unmangled name, which may contain characters like ':' and '>' in its
-    // name (member functions, templates, etc).
-    //
-    // The only requirement we place on the identifier, then, is that it
-    // should not begin with a number.
-    SmallVector<StringRef, 3> Matches;
-    if (!HeadRE.match(*LineIt, &Matches)) {
-      reportParseError(LineIt.line_number(),
-                       "Expected 'mangled_name:NUM:NUM', found " + *LineIt);
-      return false;
-    }
-    assert(Matches.size() == 4);
-    StringRef FName = Matches[1];
-    unsigned NumSamples, NumHeadSamples;
-    Matches[2].getAsInteger(10, NumSamples);
-    Matches[3].getAsInteger(10, NumHeadSamples);
-    Profiles[FName] = SampleFunctionProfile();
-    SampleFunctionProfile &FProfile = Profiles[FName];
-    FProfile.addTotalSamples(NumSamples);
-    FProfile.addHeadSamples(NumHeadSamples);
-    ++LineIt;
-
-    // Now read the body. The body of the function ends when we reach
-    // EOF or when we see the start of the next function.
-    while (!LineIt.is_at_eof() && isdigit((*LineIt)[0])) {
-      if (!LineSample.match(*LineIt, &Matches)) {
-        reportParseError(
-            LineIt.line_number(),
-            "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt);
-        return false;
-      }
-      assert(Matches.size() == 5);
-      unsigned LineOffset, NumSamples, Discriminator = 0;
-      Matches[1].getAsInteger(10, LineOffset);
-      if (Matches[2] != "")
-        Matches[2].getAsInteger(10, Discriminator);
-      Matches[3].getAsInteger(10, NumSamples);
-
-      // FIXME: Handle called targets (in Matches[4]).
-
-      // When dealing with instruction weights, we use the value
-      // zero to indicate the absence of a sample. If we read an
-      // actual zero from the profile file, return it as 1 to
-      // avoid the confusion later on.
-      if (NumSamples == 0)
-        NumSamples = 1;
-      FProfile.addBodySamples(LineOffset, Discriminator, NumSamples);
-      ++LineIt;
-    }
-  }
-
-  return true;
-}
-
 /// \brief Get the weight for an instruction.
 ///
 /// The "weight" of an instruction \p Inst is the number of samples
@@ -538,7 +215,7 @@
 /// \param Inst Instruction to query.
 ///
 /// \returns The profiled weight of I.
-unsigned SampleFunctionProfile::getInstWeight(Instruction &Inst) {
+unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) {
   DebugLoc DLoc = Inst.getDebugLoc();
   unsigned Lineno = DLoc.getLine();
   if (Lineno < HeaderLineno)
@@ -547,8 +224,7 @@
   DILocation DIL(DLoc.getAsMDNode(*Ctx));
   int LOffset = Lineno - HeaderLineno;
   unsigned Discriminator = DIL.getDiscriminator();
-  unsigned Weight =
-      BodySamples.lookup(InstructionLocation(LOffset, Discriminator));
+  unsigned Weight = Samples->samplesAt(LOffset, Discriminator);
   DEBUG(dbgs() << "    " << Lineno << "." << Discriminator << ":" << Inst
                << " (line offset: " << LOffset << "." << Discriminator
                << " - weight: " << Weight << ")\n");
@@ -557,24 +233,24 @@
 
 /// \brief Compute the weight of a basic block.
 ///
-/// The weight of basic block \p B is the maximum weight of all the
-/// instructions in B. The weight of \p B is computed and cached in
+/// The weight of basic block \p BB is the maximum weight of all the
+/// instructions in BB. The weight of \p BB is computed and cached in
 /// the BlockWeights map.
 ///
-/// \param B The basic block to query.
+/// \param BB The basic block to query.
 ///
-/// \returns The computed weight of B.
-unsigned SampleFunctionProfile::getBlockWeight(BasicBlock *B) {
-  // If we've computed B's weight before, return it.
+/// \returns The computed weight of BB.
+unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) {
+  // If we've computed BB's weight before, return it.
   std::pair<BlockWeightMap::iterator, bool> Entry =
-      BlockWeights.insert(std::make_pair(B, 0));
+      BlockWeights.insert(std::make_pair(BB, 0));
   if (!Entry.second)
     return Entry.first->second;
 
-  // Otherwise, compute and cache B's weight.
+  // Otherwise, compute and cache BB's weight.
   unsigned Weight = 0;
-  for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) {
-    unsigned InstWeight = getInstWeight(*I);
+  for (auto &I : BB->getInstList()) {
+    unsigned InstWeight = getInstWeight(I);
     if (InstWeight > Weight)
       Weight = InstWeight;
   }
@@ -588,13 +264,13 @@
 /// the weights of every basic block in the CFG.
 ///
 /// \param F The function to query.
-bool SampleFunctionProfile::computeBlockWeights(Function &F) {
+bool SampleProfileLoader::computeBlockWeights(Function &F) {
   bool Changed = false;
   DEBUG(dbgs() << "Block weights\n");
-  for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) {
-    unsigned Weight = getBlockWeight(B);
+  for (auto &BB : F) {
+    unsigned Weight = getBlockWeight(&BB);
     Changed |= (Weight > 0);
-    DEBUG(printBlockWeight(dbgs(), B));
+    DEBUG(printBlockWeight(dbgs(), &BB));
   }
 
   return Changed;
@@ -623,16 +299,13 @@
 /// \param DomTree  Opposite dominator tree. If \p Descendants is filled
 ///                 with blocks from \p BB1's dominator tree, then
 ///                 this is the post-dominator tree, and vice versa.
-void SampleFunctionProfile::findEquivalencesFor(
+void SampleProfileLoader::findEquivalencesFor(
     BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants,
     DominatorTreeBase<BasicBlock> *DomTree) {
-  for (SmallVectorImpl<BasicBlock *>::iterator I = Descendants.begin(),
-                                               E = Descendants.end();
-       I != E; ++I) {
-    BasicBlock *BB2 = *I;
+  for (auto *BB2 : Descendants) {
     bool IsDomParent = DomTree->dominates(BB2, BB1);
     bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
-    if (BB1 != BB2 && VisitedBlocks.insert(BB2) && IsDomParent &&
+    if (BB1 != BB2 && VisitedBlocks.insert(BB2).second && IsDomParent &&
         IsInSameLoop) {
       EquivalenceClass[BB2] = BB1;
 
@@ -660,12 +333,12 @@
 /// dominates B2, B2 post-dominates B1 and both are in the same loop.
 ///
 /// \param F The function to query.
-void SampleFunctionProfile::findEquivalenceClasses(Function &F) {
+void SampleProfileLoader::findEquivalenceClasses(Function &F) {
   SmallVector<BasicBlock *, 8> DominatedBBs;
   DEBUG(dbgs() << "\nBlock equivalence classes\n");
   // Find equivalence sets based on dominance and post-dominance information.
-  for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) {
-    BasicBlock *BB1 = B;
+  for (auto &BB : F) {
+    BasicBlock *BB1 = &BB;
 
     // Compute BB1's equivalence class once.
     if (EquivalenceClass.count(BB1)) {
@@ -712,8 +385,8 @@
   // each equivalence class has the largest weight, assign that weight
   // to all the blocks in that equivalence class.
   DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n");
-  for (Function::iterator B = F.begin(), E = F.end(); B != E; ++B) {
-    BasicBlock *BB = B;
+  for (auto &BI : F) {
+    BasicBlock *BB = &BI;
     BasicBlock *EquivBB = EquivalenceClass[BB];
     if (BB != EquivBB)
       BlockWeights[BB] = BlockWeights[EquivBB];
@@ -731,8 +404,8 @@
 /// \param UnknownEdge  Set if E has not been visited before.
 ///
 /// \returns E's weight, if known. Otherwise, return 0.
-unsigned SampleFunctionProfile::visitEdge(Edge E, unsigned *NumUnknownEdges,
-                                          Edge *UnknownEdge) {
+unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
+                                        Edge *UnknownEdge) {
   if (!VisitedEdges.count(E)) {
     (*NumUnknownEdges)++;
     *UnknownEdge = E;
@@ -753,11 +426,11 @@
 /// \param F  Function to process.
 ///
 /// \returns  True if new weights were assigned to edges or blocks.
-bool SampleFunctionProfile::propagateThroughEdges(Function &F) {
+bool SampleProfileLoader::propagateThroughEdges(Function &F) {
   bool Changed = false;
   DEBUG(dbgs() << "\nPropagation through edges\n");
-  for (Function::iterator BI = F.begin(), EI = F.end(); BI != EI; ++BI) {
-    BasicBlock *BB = BI;
+  for (auto &BI : F) {
+    BasicBlock *BB = &BI;
 
     // Visit all the predecessor and successor edges to determine
     // which ones have a weight assigned already. Note that it doesn't
@@ -771,16 +444,16 @@
 
       if (i == 0) {
         // First, visit all predecessor edges.
-        for (size_t I = 0; I < Predecessors[BB].size(); I++) {
-          Edge E = std::make_pair(Predecessors[BB][I], BB);
+        for (auto *Pred : Predecessors[BB]) {
+          Edge E = std::make_pair(Pred, BB);
           TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
           if (E.first == E.second)
             SelfReferentialEdge = E;
         }
       } else {
         // On the second round, visit all successor edges.
-        for (size_t I = 0; I < Successors[BB].size(); I++) {
-          Edge E = std::make_pair(BB, Successors[BB][I]);
+        for (auto *Succ : Successors[BB]) {
+          Edge E = std::make_pair(BB, Succ);
           TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
         }
       }
@@ -821,7 +494,7 @@
                          << " known. Set weight for block: ";
                   printBlockWeight(dbgs(), BB););
           }
-          if (VisitedBlocks.insert(BB))
+          if (VisitedBlocks.insert(BB).second)
             Changed = true;
         } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) {
           // If there is a single unknown edge and the block has been
@@ -857,9 +530,9 @@
 ///
 /// We are interested in unique edges. If a block B1 has multiple
 /// edges to another block B2, we only add a single B1->B2 edge.
-void SampleFunctionProfile::buildEdges(Function &F) {
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
-    BasicBlock *B1 = I;
+void SampleProfileLoader::buildEdges(Function &F) {
+  for (auto &BI : F) {
+    BasicBlock *B1 = &BI;
 
     // Add predecessors for B1.
     SmallPtrSet<BasicBlock *, 16> Visited;
@@ -867,7 +540,7 @@
       llvm_unreachable("Found a stale predecessors list in a basic block.");
     for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
       BasicBlock *B2 = *PI;
-      if (Visited.insert(B2))
+      if (Visited.insert(B2).second)
         Predecessors[B1].push_back(B2);
     }
 
@@ -877,7 +550,7 @@
       llvm_unreachable("Found a stale successors list in a basic block.");
     for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
       BasicBlock *B2 = *SI;
-      if (Visited.insert(B2))
+      if (Visited.insert(B2).second)
         Successors[B1].push_back(B2);
     }
   }
@@ -885,22 +558,22 @@
 
 /// \brief Propagate weights into edges
 ///
-/// The following rules are applied to every block B in the CFG:
+/// The following rules are applied to every block BB in the CFG:
 ///
-/// - If B has a single predecessor/successor, then the weight
+/// - If BB has a single predecessor/successor, then the weight
 ///   of that edge is the weight of the block.
 ///
 /// - If all incoming or outgoing edges are known except one, and the
 ///   weight of the block is already known, the weight of the unknown
 ///   edge will be the weight of the block minus the sum of all the known
-///   edges. If the sum of all the known edges is larger than B's weight,
+///   edges. If the sum of all the known edges is larger than BB's weight,
 ///   we set the unknown edge weight to zero.
 ///
 /// - If there is a self-referential edge, and the weight of the block is
 ///   known, the weight for that edge is set to the weight of the block
 ///   minus the weight of the other incoming edges to that block (if
 ///   known).
-void SampleFunctionProfile::propagateWeights(Function &F) {
+void SampleProfileLoader::propagateWeights(Function &F) {
   bool Changed = true;
   unsigned i = 0;
 
@@ -920,9 +593,9 @@
   // edge weights computed during propagation.
   DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
   MDBuilder MDB(F.getContext());
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
-    BasicBlock *B = I;
-    TerminatorInst *TI = B->getTerminator();
+  for (auto &BI : F) {
+    BasicBlock *BB = &BI;
+    TerminatorInst *TI = BB->getTerminator();
     if (TI->getNumSuccessors() == 1)
       continue;
     if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
@@ -934,7 +607,7 @@
     bool AllWeightsZero = true;
     for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
       BasicBlock *Succ = TI->getSuccessor(I);
-      Edge E = std::make_pair(B, Succ);
+      Edge E = std::make_pair(BB, Succ);
       unsigned Weight = EdgeWeights[E];
       DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
       Weights.push_back(Weight);
@@ -965,22 +638,17 @@
 ///
 /// \returns the line number where \p F is defined. If it returns 0,
 ///          it means that there is no debug information available for \p F.
-unsigned SampleFunctionProfile::getFunctionLoc(Function &F) {
-  NamedMDNode *CUNodes = F.getParent()->getNamedMetadata("llvm.dbg.cu");
-  if (CUNodes) {
-    for (unsigned I = 0, E1 = CUNodes->getNumOperands(); I != E1; ++I) {
-      DICompileUnit CU(CUNodes->getOperand(I));
-      DIArray Subprograms = CU.getSubprograms();
-      for (unsigned J = 0, E2 = Subprograms.getNumElements(); J != E2; ++J) {
-        DISubprogram Subprogram(Subprograms.getElement(J));
-        if (Subprogram.describes(&F))
-          return Subprogram.getLineNumber();
-      }
-    }
-  }
+unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
+  DISubprogram S = getDISubprogram(&F);
+  if (S.isSubprogram())
+    return S.getLineNumber();
 
+  // If the start of \p F could not be found, emit a diagnostic to inform the
+  // user about the missed opportunity.
   F.getContext().diagnose(DiagnosticInfoSampleProfile(
-      "No debug information found in function " + F.getName()));
+      "No debug information found in function " + F.getName() +
+          ": Function profile not used",
+      DS_Warning));
   return 0;
 }
 
@@ -1002,15 +670,15 @@
 ///
 /// 3- Propagation of block weights into edges. This uses a simple
 ///    propagation heuristic. The following rules are applied to every
-///    block B in the CFG:
+///    block BB in the CFG:
 ///
-///    - If B has a single predecessor/successor, then the weight
+///    - If BB has a single predecessor/successor, then the weight
 ///      of that edge is the weight of the block.
 ///
 ///    - If all the edges are known except one, and the weight of the
 ///      block is already known, the weight of the unknown edge will
 ///      be the weight of the block minus the sum of all the known
-///      edges. If the sum of all the known edges is larger than B's weight,
+///      edges. If the sum of all the known edges is larger than BB's weight,
 ///      we set the unknown edge weight to zero.
 ///
 ///    - If there is a self-referential edge, and the weight of the block is
@@ -1028,14 +696,12 @@
 /// work here.
 ///
 /// Once all the branch weights are computed, we emit the MD_prof
-/// metadata on B using the computed values for each of its branches.
+/// metadata on BB using the computed values for each of its branches.
 ///
 /// \param F The function to query.
 ///
 /// \returns true if \p F was modified. Returns false, otherwise.
-bool SampleFunctionProfile::emitAnnotations(Function &F, DominatorTree *DomTree,
-                                            PostDominatorTree *PostDomTree,
-                                            LoopInfo *Loops) {
+bool SampleProfileLoader::emitAnnotations(Function &F) {
   bool Changed = false;
 
   // Initialize invariants used during computation and propagation.
@@ -1045,10 +711,6 @@
 
   DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
                << ": " << HeaderLineno << "\n");
-  DT = DomTree;
-  PDT = PostDomTree;
-  LI = Loops;
-  Ctx = &F.getParent()->getContext();
 
   // Compute basic block weights.
   Changed |= computeBlockWeights(F);
@@ -1075,8 +737,14 @@
                     "Sample Profile loader", false, false)
 
 bool SampleProfileLoader::doInitialization(Module &M) {
-  Profiler.reset(new SampleModuleProfile(M, Filename));
-  ProfileIsValid = Profiler->loadText();
+  auto ReaderOrErr = SampleProfileReader::create(Filename, M.getContext());
+  if (std::error_code EC = ReaderOrErr.getError()) {
+    std::string Msg = "Could not open profile: " + EC.message();
+    M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg));
+    return false;
+  }
+  Reader = std::move(ReaderOrErr.get());
+  ProfileIsValid = (Reader->read() == sampleprof_error::success);
   return true;
 }
 
@@ -1091,11 +759,13 @@
 bool SampleProfileLoader::runOnFunction(Function &F) {
   if (!ProfileIsValid)
     return false;
-  DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>();
-  LoopInfo *LI = &getAnalysis<LoopInfo>();
-  SampleFunctionProfile &FunctionProfile = Profiler->getProfile(F);
-  if (!FunctionProfile.empty())
-    return FunctionProfile.emitAnnotations(F, DT, PDT, LI);
+
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  PDT = &getAnalysis<PostDominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+  Ctx = &F.getParent()->getContext();
+  Samples = Reader->getSamplesFor(F);
+  if (!Samples->empty())
+    return emitAnnotations(F);
   return false;
 }

diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index edf012d..a16e9e2 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp

@@ -28,6 +28,7 @@
 /// ScalarOpts library.
 void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCEPass(Registry);
+  initializeAlignmentFromAssumptionsPass(Registry);
   initializeSampleProfileLoaderPass(Registry);
   initializeConstantHoistingPass(Registry);
   initializeConstantPropagationPass(Registry);
@@ -38,6 +39,7 @@
   initializeDSEPass(Registry);
   initializeGVNPass(Registry);
   initializeEarlyCSEPass(Registry);
+  initializeFlattenCFGPassPass(Registry);
   initializeIndVarSimplifyPass(Registry);
   initializeJumpThreadingPass(Registry);
   initializeLICMPass(Registry);
@@ -52,6 +54,7 @@
   initializeLowerAtomicPass(Registry);
   initializeLowerExpectIntrinsicPass(Registry);
   initializeMemCpyOptPass(Registry);
+  initializeMergedLoadStoreMotionPass(Registry);
   initializePartiallyInlineLibCallsPass(Registry);
   initializeReassociatePass(Registry);
   initializeRegToMemPass(Registry);
@@ -76,6 +79,10 @@
   unwrap(PM)->add(createAggressiveDCEPass());
 }
 
+void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createAlignmentFromAssumptionsPass());
+}
+
 void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createCFGSimplificationPass());
 }
@@ -92,6 +99,10 @@
   unwrap(PM)->add(createGVNPass());
 }
 
+void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createMergedLoadStoreMotionPass());
+}
+
 void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createIndVarSimplifyPass());
 }
@@ -140,6 +151,10 @@
   unwrap(PM)->add(createPartiallyInlineLibCallsPass());
 }
 
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLowerSwitchPass());
+}
+
 void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createPromoteMemoryToRegisterPass());
 }
@@ -198,6 +213,10 @@
   unwrap(PM)->add(createTypeBasedAliasAnalysisPass());
 }
 
+void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createScopedNoAliasAAPass());
+}
+
 void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createBasicAliasAnalysisPass());
 }

diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index e2a24a7..f7fa917 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp

@@ -23,6 +23,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/CallSite.h"
@@ -197,6 +198,7 @@
     // getAnalysisUsage - This pass does not require any passes, but we know it
     // will not alter the CFG, so say so.
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionTracker>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.setPreservesCFG();
     }
@@ -214,6 +216,7 @@
     // getAnalysisUsage - This pass does not require any passes, but we know it
     // will not alter the CFG, so say so.
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionTracker>();
       AU.setPreservesCFG();
     }
   };
@@ -225,12 +228,14 @@
 
 INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
                 "Scalar Replacement of Aggregates (DT)", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
                 "Scalar Replacement of Aggregates (DT)", false, false)
 
 INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
                       "Scalar Replacement of Aggregates (SSAUp)", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
                     "Scalar Replacement of Aggregates (SSAUp)", false, false)
 
@@ -1119,9 +1124,9 @@
       } else {
         continue;
       }
-      Instruction *DbgVal =
-        DIB->insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
-                                     Inst);
+      Instruction *DbgVal = DIB->insertDbgValueIntrinsic(
+          Arg, 0, DIVariable(DVI->getVariable()),
+          DIExpression(DVI->getExpression()), Inst);
       DbgVal->setDebugLoc(DVI->getDebugLoc());
     }
   }
@@ -1333,12 +1338,15 @@
         LoadInst *FalseLoad =
           Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f");
 
-        // Transfer alignment and TBAA info if present.
+        // Transfer alignment and AA info if present.
         TrueLoad->setAlignment(LI->getAlignment());
         FalseLoad->setAlignment(LI->getAlignment());
-        if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
-          TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
-          FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
+
+        AAMDNodes Tags;
+        LI->getAAMetadata(Tags);
+        if (Tags) {
+          TrueLoad->setAAMetadata(Tags);
+          FalseLoad->setAAMetadata(Tags);
         }
 
         Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad);
@@ -1364,10 +1372,12 @@
     PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
                                      PN->getName()+".ld", PN);
 
-    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
+    // Get the AA tags and alignment to use from one of the loads.  It doesn't
     // matter which one we get and if any differ, it doesn't matter.
     LoadInst *SomeLoad = cast<LoadInst>(PN->user_back());
-    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+
+    AAMDNodes AATags;
+    SomeLoad->getAAMetadata(AATags);
     unsigned Align = SomeLoad->getAlignment();
 
     // Rewrite all loads of the PN to use the new PHI.
@@ -1389,7 +1399,7 @@
                             PN->getName() + "." + Pred->getName(),
                             Pred->getTerminator());
         Load->setAlignment(Align);
-        if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+        if (AATags) Load->setAAMetadata(AATags);
       }
 
       NewPN->addIncoming(Load, Pred);
@@ -1407,6 +1417,7 @@
   DominatorTree *DT = nullptr;
   if (HasDomTree)
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
 
   BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
   DIBuilder DIB(*F.getParent());
@@ -1425,7 +1436,7 @@
     if (Allocas.empty()) break;
 
     if (HasDomTree)
-      PromoteMemToReg(Allocas, *DT);
+      PromoteMemToReg(Allocas, *DT, nullptr, AT);
     else {
       SSAUpdater SSA;
       for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
@@ -1658,7 +1669,7 @@
                                            AllocaInfo &Info) {
   // If we've already checked this PHI, don't do it again.
   if (PHINode *PN = dyn_cast<PHINode>(I))
-    if (!Info.CheckedPHIs.insert(PN))
+    if (!Info.CheckedPHIs.insert(PN).second)
       return;
 
   for (User *U : I->users()) {

diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp
index 7a73f11..6036c09 100644
--- a/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/lib/Transforms/Scalar/Scalarizer.cpp

@@ -150,6 +150,16 @@
   bool visitLoadInst(LoadInst &);
   bool visitStoreInst(StoreInst &);
 
+  static void registerOptions() {
+    // This is disabled by default because having separate loads and stores
+    // makes it more likely that the -combiner-alias-analysis limits will be
+    // reached.
+    OptionRegistry::registerOption<bool, Scalarizer,
+                                 &Scalarizer::ScalarizeLoadStore>(
+        "scalarize-load-store",
+        "Allow the scalarizer pass to scalarize loads and store", false);
+  }
+
 private:
   Scatterer scatter(Instruction *, Value *);
   void gather(Instruction *, const ValueVector &);
@@ -164,19 +174,14 @@
   GatherList Gathered;
   unsigned ParallelLoopAccessMDKind;
   const DataLayout *DL;
+  bool ScalarizeLoadStore;
 };
 
 char Scalarizer::ID = 0;
 } // end anonymous namespace
 
-// This is disabled by default because having separate loads and stores makes
-// it more likely that the -combiner-alias-analysis limits will be reached.
-static cl::opt<bool> ScalarizeLoadStore
-  ("scalarize-load-store", cl::Hidden, cl::init(false),
-   cl::desc("Allow the scalarizer pass to scalarize loads and store"));
-
-INITIALIZE_PASS(Scalarizer, "scalarizer", "Scalarize vector operations",
-                false, false)
+INITIALIZE_PASS_WITH_OPTIONS(Scalarizer, "scalarizer",
+                             "Scalarize vector operations", false, false)
 
 Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
                      ValueVector *cachePtr)
@@ -236,7 +241,9 @@
 
 bool Scalarizer::doInitialization(Module &M) {
   ParallelLoopAccessMDKind =
-    M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+      M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+  ScalarizeLoadStore =
+      M.getContext().getOption<bool, Scalarizer, &Scalarizer::ScalarizeLoadStore>();
   return false;
 }
 
@@ -312,6 +319,8 @@
           || Tag == LLVMContext::MD_fpmath
           || Tag == LLVMContext::MD_tbaa_struct
           || Tag == LLVMContext::MD_invariant_load
+          || Tag == LLVMContext::MD_alias_scope
+          || Tag == LLVMContext::MD_noalias
           || Tag == ParallelLoopAccessMDKind);
 }
 
@@ -322,8 +331,10 @@
   Op->getAllMetadataOtherThanDebugLoc(MDs);
   for (unsigned I = 0, E = CV.size(); I != E; ++I) {
     if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
-      for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator
-             MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI)
+      for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator
+               MI = MDs.begin(),
+               ME = MDs.end();
+           MI != ME; ++MI)
         if (canTransferMetadata(MI->first))
           New->setMetadata(MI->first, MI->second);
       New->setDebugLoc(Op->getDebugLoc());

diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 62f2026..6157746 100644
--- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

@@ -79,6 +79,81 @@
 // ld.global.f32   %f3, [%rl6+128]; // much better
 // ld.global.f32   %f4, [%rl6+132]; // much better
 //
+// Another improvement enabled by the LowerGEP flag is to lower a GEP with
+// multiple indices to either multiple GEPs with a single index or arithmetic
+// operations (depending on whether the target uses alias analysis in codegen).
+// Such a transformation can have the following benefits:
+// (1) It can always extract constants in the indices of structure type.
+// (2) After such lowering, there are more optimization opportunities such as
+//     CSE, LICM and CGP.
+//
+// E.g. The following GEPs have multiple indices:
+//  BB1:
+//    %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
+//    load %p
+//    ...
+//  BB2:
+//    %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j2, i32 2
+//    load %p2
+//    ...
+//
+// We can not do CSE to the common part related to index "i64 %i". Lowering
+// GEPs can achieve such goals.
+// If the target does not use alias analysis in codegen, this pass will
+// lower a GEP with multiple indices into arithmetic operations:
+//  BB1:
+//    %1 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
+//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
+//    %3 = add i64 %1, %2                          ; CSE opportunity
+//    %4 = mul i64 %j1, length_of_struct
+//    %5 = add i64 %3, %4
+//    %6 = add i64 %5, struct_field_3              ; Constant offset
+//    %p = inttoptr i64 %6 to i32*
+//    load %p
+//    ...
+//  BB2:
+//    %7 = ptrtoint [10 x %struct]* %ptr to i64    ; CSE opportunity
+//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
+//    %9 = add i64 %7, %8                          ; CSE opportunity
+//    %10 = mul i64 %j2, length_of_struct
+//    %11 = add i64 %9, %10
+//    %12 = add i64 %11, struct_field_2            ; Constant offset
+//    %p2 = inttoptr i64 %12 to i32*
+//    load %p2
+//    ...
+//
+// If the target uses alias analysis in codegen, this pass will lower a GEP
+// with multiple indices into multiple GEPs with a single index:
+//  BB1:
+//    %1 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
+//    %2 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
+//    %3 = getelementptr i8* %1, i64 %2            ; CSE opportunity
+//    %4 = mul i64 %j1, length_of_struct
+//    %5 = getelementptr i8* %3, i64 %4
+//    %6 = getelementptr i8* %5, struct_field_3    ; Constant offset
+//    %p = bitcast i8* %6 to i32*
+//    load %p
+//    ...
+//  BB2:
+//    %7 = bitcast [10 x %struct]* %ptr to i8*     ; CSE opportunity
+//    %8 = mul i64 %i, length_of_10xstruct         ; CSE opportunity
+//    %9 = getelementptr i8* %7, i64 %8            ; CSE opportunity
+//    %10 = mul i64 %j2, length_of_struct
+//    %11 = getelementptr i8* %9, i64 %10
+//    %12 = getelementptr i8* %11, struct_field_2  ; Constant offset
+//    %p2 = bitcast i8* %12 to i32*
+//    load %p2
+//    ...
+//
+// Lowering GEPs can also benefit other passes such as LICM and CGP.
+// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple
+// indices if one of the indices is variant. If we lower such GEP into invariant
+// parts and variant parts, LICM can hoist/sink those invariant parts.
+// CGP (CodeGen Prepare) tries to sink address calculations that match the
+// target's addressing modes. A GEP with multiple indices may not match and will
+// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
+// them. So we end up with a better addressing mode.
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -92,6 +167,9 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
 
 using namespace llvm;
 
@@ -117,18 +195,17 @@
 /// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
 class ConstantOffsetExtractor {
  public:
-  /// Extracts a constant offset from the given GEP index. It outputs the
-  /// numeric value of the extracted constant offset (0 if failed), and a
+  /// Extracts a constant offset from the given GEP index. It returns the
   /// new index representing the remainder (equal to the original index minus
-  /// the constant offset).
+  /// the constant offset), or nullptr if we cannot extract a constant offset.
   /// \p Idx    The given GEP index
-  /// \p NewIdx The new index to replace (output)
   /// \p DL     The datalayout of the module
   /// \p GEP    The given GEP
-  static int64_t Extract(Value *Idx, Value *&NewIdx, const DataLayout *DL,
-                         GetElementPtrInst *GEP);
-  /// Looks for a constant offset without extracting it. The meaning of the
-  /// arguments and the return value are the same as Extract.
+  static Value *Extract(Value *Idx, const DataLayout *DL,
+                        GetElementPtrInst *GEP);
+  /// Looks for a constant offset from the given GEP index without extracting
+  /// it. It returns the numeric value of the extracted constant offset (0 if
+  /// failed). The meaning of the arguments is the same as in Extract.
   static int64_t Find(Value *Idx, const DataLayout *DL, GetElementPtrInst *GEP);
 
  private:
@@ -228,7 +305,9 @@
 class SeparateConstOffsetFromGEP : public FunctionPass {
  public:
   static char ID;
-  SeparateConstOffsetFromGEP() : FunctionPass(ID) {
+  SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr,
+                             bool LowerGEP = false)
+      : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) {
     initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry());
   }
 
@@ -251,10 +330,29 @@
   /// Tries to split the given GEP into a variadic base and a constant offset,
   /// and returns true if the splitting succeeds.
   bool splitGEP(GetElementPtrInst *GEP);
-  /// Finds the constant offset within each index, and accumulates them. This
-  /// function only inspects the GEP without changing it. The output
-  /// NeedsExtraction indicates whether we can extract a non-zero constant
-  /// offset from any index.
+  /// Lower a GEP with multiple indices into multiple GEPs with a single index.
+  /// Function splitGEP already split the original GEP into a variadic part and
+  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+  /// variadic part into a set of GEPs with a single index and applies
+  /// AccumulativeByteOffset to it.
+  /// \p Variadic                  The variadic part of the original GEP.
+  /// \p AccumulativeByteOffset    The constant offset.
+  void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
+                              int64_t AccumulativeByteOffset);
+  /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
+  /// Function splitGEP already split the original GEP into a variadic part and
+  /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+  /// variadic part into a set of arithmetic operations and applies
+  /// AccumulativeByteOffset to it.
+  /// \p Variadic                  The variadic part of the original GEP.
+  /// \p AccumulativeByteOffset    The constant offset.
+  void lowerToArithmetics(GetElementPtrInst *Variadic,
+                          int64_t AccumulativeByteOffset);
+  /// Finds the constant offset within each index and accumulates them. If
+  /// LowerGEP is true, it searches the indices of both sequential and structure
+  /// types; otherwise it only searches sequential indices. The output
+  /// NeedsExtraction indicates whether we successfully find a non-zero constant
+  /// offset.
   int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
   /// Canonicalize array indices to pointer-size integers. This helps to
   /// simplify the logic of splitting a GEP. For example, if a + b is a
@@ -272,25 +370,12 @@
   ///
   /// Verified in @i32_add in split-gep.ll
   bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
-  /// For each array index that is in the form of zext(a), convert it to sext(a)
-  /// if we can prove zext(a) <= max signed value of typeof(a). We prefer
-  /// sext(a) to zext(a), because in the special case where x + y >= 0 and
-  /// (x >= 0 or y >= 0), function CanTraceInto can split sext(x + y),
-  /// while no such case exists for zext(x + y).
-  ///
-  /// Note that
-  ///   zext(x + y) = zext(x) + zext(y)
-  /// is wrong, e.g.,
-  ///   zext i32(UINT_MAX + 1) to i64 !=
-  ///   (zext i32 UINT_MAX to i64) + (zext i32 1 to i64)
-  ///
-  /// Returns true if the module changes.
-  ///
-  /// Verified in @inbounds_zext_add in split-gep.ll and @sum_of_array3 in
-  /// split-gep-and-gvn.ll
-  bool convertInBoundsZExtToSExt(GetElementPtrInst *GEP);
 
   const DataLayout *DL;
+  const TargetMachine *TM;
+  /// Whether to lower a GEP with multiple indices into arithmetic operations or
+  /// multiple GEPs with a single index.
+  bool LowerGEP;
 };
 }  // anonymous namespace
 
@@ -306,8 +391,10 @@
     "Split GEPs to a variadic base and a constant offset for better CSE", false,
     false)
 
-FunctionPass *llvm::createSeparateConstOffsetFromGEPPass() {
-  return new SeparateConstOffsetFromGEP();
+FunctionPass *
+llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM,
+                                           bool LowerGEP) {
+  return new SeparateConstOffsetFromGEP(TM, LowerGEP);
 }
 
 bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
@@ -536,8 +623,13 @@
     //
     // Replacing the "or" with "add" is fine, because
     //   a | (b + 5) = a + (b + 5) = (a + b) + 5
-    return BinaryOperator::CreateAdd(BO->getOperand(0), BO->getOperand(1),
-                                     BO->getName(), IP);
+    if (OpNo == 0) {
+      return BinaryOperator::CreateAdd(NextInChain, TheOther, BO->getName(),
+                                       IP);
+    } else {
+      return BinaryOperator::CreateAdd(TheOther, NextInChain, BO->getName(),
+                                       IP);
+    }
   }
 
   // We can reuse BO in this case, because the new expression shares the same
@@ -554,19 +646,17 @@
   return BO;
 }
 
-int64_t ConstantOffsetExtractor::Extract(Value *Idx, Value *&NewIdx,
-                                         const DataLayout *DL,
-                                         GetElementPtrInst *GEP) {
+Value *ConstantOffsetExtractor::Extract(Value *Idx, const DataLayout *DL,
+                                        GetElementPtrInst *GEP) {
   ConstantOffsetExtractor Extractor(DL, GEP);
   // Find a non-zero constant offset first.
   APInt ConstantOffset =
       Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
                      GEP->isInBounds());
-  if (ConstantOffset != 0) {
-    // Separates the constant offset from the GEP index.
-    NewIdx = Extractor.rebuildWithoutConstOffset();
-  }
-  return ConstantOffset.getSExtValue();
+  if (ConstantOffset == 0)
+    return nullptr;
+  // Separates the constant offset from the GEP index.
+  return Extractor.rebuildWithoutConstOffset();
 }
 
 int64_t ConstantOffsetExtractor::Find(Value *Idx, const DataLayout *DL,
@@ -613,43 +703,6 @@
   return Changed;
 }
 
-bool
-SeparateConstOffsetFromGEP::convertInBoundsZExtToSExt(GetElementPtrInst *GEP) {
-  if (!GEP->isInBounds())
-    return false;
-
-  // TODO: consider alloca
-  GlobalVariable *UnderlyingObject =
-      dyn_cast<GlobalVariable>(GEP->getPointerOperand());
-  if (UnderlyingObject == nullptr)
-    return false;
-
-  uint64_t ObjectSize =
-      DL->getTypeAllocSize(UnderlyingObject->getType()->getElementType());
-  gep_type_iterator GTI = gep_type_begin(*GEP);
-  bool Changed = false;
-  for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); I != E;
-       ++I, ++GTI) {
-    if (isa<SequentialType>(*GTI)) {
-      if (ZExtInst *Extended = dyn_cast<ZExtInst>(*I)) {
-        unsigned SrcBitWidth =
-            cast<IntegerType>(Extended->getSrcTy())->getBitWidth();
-        // For GEP operand zext(a), if a <= max signed value of typeof(a), then
-        // the sign bit of a is zero and sext(a) = zext(a). Because the GEP is
-        // in bounds, we know a <= ObjectSize, so the condition can be reduced
-        // to ObjectSize <= max signed value of typeof(a).
-        if (ObjectSize <=
-            APInt::getSignedMaxValue(SrcBitWidth).getZExtValue()) {
-          *I = new SExtInst(Extended->getOperand(0), Extended->getType(),
-                            Extended->getName(), GEP);
-          Changed = true;
-        }
-      }
-    }
-  }
-  return Changed;
-}
-
 int64_t
 SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
                                                  bool &NeedsExtraction) {
@@ -669,11 +722,116 @@
         AccumulativeByteOffset +=
             ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
       }
+    } else if (LowerGEP) {
+      StructType *StTy = cast<StructType>(*GTI);
+      uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+      // Skip field 0 as the offset is always 0.
+      if (Field != 0) {
+        NeedsExtraction = true;
+        AccumulativeByteOffset +=
+            DL->getStructLayout(StTy)->getElementOffset(Field);
+      }
     }
   }
   return AccumulativeByteOffset;
 }
 
+void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
+    GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
+  IRBuilder<> Builder(Variadic);
+  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+  Type *I8PtrTy =
+      Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
+  Value *ResultPtr = Variadic->getOperand(0);
+  if (ResultPtr->getType() != I8PtrTy)
+    ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+
+  gep_type_iterator GTI = gep_type_begin(*Variadic);
+  // Create an ugly GEP for each sequential index. We don't create GEPs for
+  // structure indices, as they are accumulated in the constant offset index.
+  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      Value *Idx = Variadic->getOperand(I);
+      // Skip zero indices.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+        if (CI->isZero())
+          continue;
+
+      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+                                DL->getTypeAllocSize(GTI.getIndexedType()));
+      // Scale the index by element size.
+      if (ElementSize != 1) {
+        if (ElementSize.isPowerOf2()) {
+          Idx = Builder.CreateShl(
+              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+        } else {
+          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+        }
+      }
+      // Create an ugly GEP with a single index for each index.
+      ResultPtr = Builder.CreateGEP(ResultPtr, Idx, "uglygep");
+    }
+  }
+
+  // Create a GEP with the constant offset index.
+  if (AccumulativeByteOffset != 0) {
+    Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
+    ResultPtr = Builder.CreateGEP(ResultPtr, Offset, "uglygep");
+  }
+  if (ResultPtr->getType() != Variadic->getType())
+    ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
+
+  Variadic->replaceAllUsesWith(ResultPtr);
+  Variadic->eraseFromParent();
+}
+
+void
+SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
+                                               int64_t AccumulativeByteOffset) {
+  IRBuilder<> Builder(Variadic);
+  Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+  Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
+  gep_type_iterator GTI = gep_type_begin(*Variadic);
+  // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
+  // don't create arithmetics for structure indices, as they are accumulated
+  // in the constant offset index.
+  for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      Value *Idx = Variadic->getOperand(I);
+      // Skip zero indices.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+        if (CI->isZero())
+          continue;
+
+      APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+                                DL->getTypeAllocSize(GTI.getIndexedType()));
+      // Scale the index by element size.
+      if (ElementSize != 1) {
+        if (ElementSize.isPowerOf2()) {
+          Idx = Builder.CreateShl(
+              Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+        } else {
+          Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+        }
+      }
+      // Create an ADD for each index.
+      ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
+    }
+  }
+
+  // Create an ADD for the constant offset index.
+  if (AccumulativeByteOffset != 0) {
+    ResultPtr = Builder.CreateAdd(
+        ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
+  }
+
+  ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
+  Variadic->replaceAllUsesWith(ResultPtr);
+  Variadic->eraseFromParent();
+}
+
 bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   // Skip vector GEPs.
   if (GEP->getType()->isVectorTy())
@@ -684,41 +842,49 @@
   if (GEP->hasAllConstantIndices())
     return false;
 
-  bool Changed = false;
-  Changed |= canonicalizeArrayIndicesToPointerSize(GEP);
-  Changed |= convertInBoundsZExtToSExt(GEP);
+  bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
 
   bool NeedsExtraction;
   int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
 
   if (!NeedsExtraction)
     return Changed;
-  // Before really splitting the GEP, check whether the backend supports the
-  // addressing mode we are about to produce. If no, this splitting probably
-  // won't be beneficial.
-  TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
-  if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
-                                 /*BaseGV=*/nullptr, AccumulativeByteOffset,
-                                 /*HasBaseReg=*/true, /*Scale=*/0)) {
-    return Changed;
+  // If LowerGEP is disabled, before really splitting the GEP, check whether the
+  // backend supports the addressing mode we are about to produce. If no, this
+  // splitting probably won't be beneficial.
+  // If LowerGEP is enabled, even if the extracted constant offset cannot match
+  // the addressing mode, we can still do optimizations to other lowered parts
+  // of variable indices. Therefore, we don't check for addressing modes in that
+  // case.
+  if (!LowerGEP) {
+    TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+    if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(),
+                                   /*BaseGV=*/nullptr, AccumulativeByteOffset,
+                                   /*HasBaseReg=*/true, /*Scale=*/0)) {
+      return Changed;
+    }
   }
 
-  // Remove the constant offset in each GEP index. The resultant GEP computes
-  // the variadic base.
+  // Remove the constant offset in each sequential index. The resultant GEP
+  // computes the variadic base.
+  // Notice that we don't remove struct field indices here. If LowerGEP is
+  // disabled, a structure index is not accumulated and we still use the old
+  // one. If LowerGEP is enabled, a structure index is accumulated in the
+  // constant offset. lowerToSingleIndexGEPs or lowerToArithmetics will later
+  // handle the constant offset and won't need a new structure index.
   gep_type_iterator GTI = gep_type_begin(*GEP);
   for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
     if (isa<SequentialType>(*GTI)) {
-      Value *NewIdx = nullptr;
-      // Tries to extract a constant offset from this GEP index.
-      int64_t ConstantOffset =
-          ConstantOffsetExtractor::Extract(GEP->getOperand(I), NewIdx, DL, GEP);
-      if (ConstantOffset != 0) {
-        assert(NewIdx != nullptr &&
-               "ConstantOffset != 0 implies NewIdx is set");
+      // Splits this GEP index into a variadic part and a constant offset, and
+      // uses the variadic part as the new index.
+      Value *NewIdx =
+          ConstantOffsetExtractor::Extract(GEP->getOperand(I), DL, GEP);
+      if (NewIdx != nullptr) {
         GEP->setOperand(I, NewIdx);
       }
     }
   }
+
   // Clear the inbounds attribute because the new index may be off-bound.
   // e.g.,
   //
@@ -740,6 +906,21 @@
   // possible. GEPs with inbounds are more friendly to alias analysis.
   GEP->setIsInBounds(false);
 
+  // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+  if (LowerGEP) {
+    // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
+    // arithmetic operations if the target uses alias analysis in codegen.
+    if (TM && TM->getSubtarget<TargetSubtargetInfo>().useAA())
+      lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
+    else
+      lowerToArithmetics(GEP, AccumulativeByteOffset);
+    return true;
+  }
+
+  // No need to create another GEP if the accumulative byte offset is 0.
+  if (AccumulativeByteOffset == 0)
+    return true;
+
   // Offsets the base with the accumulative byte offset.
   //
   //   %gep                        ; the base
@@ -771,16 +952,16 @@
   Instruction *NewGEP = GEP->clone();
   NewGEP->insertBefore(GEP);
 
-  uint64_t ElementTypeSizeOfGEP =
-      DL->getTypeAllocSize(GEP->getType()->getElementType());
+  // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
+  // unsigned. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
+  // used with the signed AccumulativeByteOffset later.
+  int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
+      DL->getTypeAllocSize(GEP->getType()->getElementType()));
   Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
   if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
     // Very likely. As long as %gep is natually aligned, the byte offset we
     // extracted should be a multiple of sizeof(*%gep).
-    // Per ANSI C standard, signed / unsigned = unsigned. Therefore, we
-    // cast ElementTypeSizeOfGEP to signed.
-    int64_t Index =
-        AccumulativeByteOffset / static_cast<int64_t>(ElementTypeSizeOfGEP);
+    int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
     NewGEP = GetElementPtrInst::Create(
         NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP);
   } else {

diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 5d5606b..046a7cb 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp

@@ -25,6 +25,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CFG.h"
@@ -34,22 +35,30 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "simplifycfg"
 
+static cl::opt<unsigned>
+UserBonusInstThreshold("bonus-inst-threshold", cl::Hidden, cl::init(1),
+   cl::desc("Control the number of bonus instructions (default = 1)"));
+
 STATISTIC(NumSimpl, "Number of blocks simplified");
 
 namespace {
 struct CFGSimplifyPass : public FunctionPass {
   static char ID; // Pass identification, replacement for typeid
-  CFGSimplifyPass() : FunctionPass(ID) {
+  unsigned BonusInstThreshold;
+  CFGSimplifyPass(int T = -1) : FunctionPass(ID) {
+    BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
     initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
   }
   bool runOnFunction(Function &F) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionTracker>();
     AU.addRequired<TargetTransformInfo>();
   }
 };
@@ -59,12 +68,13 @@
 INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
                       false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
                     false)
 
 // Public interface to the CFGSimplification pass
-FunctionPass *llvm::createCFGSimplificationPass() {
-  return new CFGSimplifyPass();
+FunctionPass *llvm::createCFGSimplificationPass(int Threshold) {
+  return new CFGSimplifyPass(Threshold);
 }
 
 /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
@@ -146,7 +156,9 @@
 /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
 /// iterating until no more changes are made.
 static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
-                                   const DataLayout *DL) {
+                                   const DataLayout *DL,
+                                   AssumptionTracker *AT,
+                                   unsigned BonusInstThreshold) {
   bool Changed = false;
   bool LocalChange = true;
   while (LocalChange) {
@@ -155,7 +167,7 @@
     // Loop over all of the basic blocks and remove them if they are unneeded...
     //
     for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
-      if (SimplifyCFG(BBIt++, TTI, DL)) {
+      if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, DL, AT)) {
         LocalChange = true;
         ++NumSimpl;
       }
@@ -172,12 +184,13 @@
   if (skipOptnoneFunction(F))
     return false;
 
+  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
   const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
   bool EverChanged = removeUnreachableBlocks(F);
   EverChanged |= mergeEmptyReturnBlocks(F);
-  EverChanged |= iterativelySimplifyCFG(F, TTI, DL);
+  EverChanged |= iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold);
 
   // If neither pass changed anything, we're done.
   if (!EverChanged) return false;
@@ -191,7 +204,7 @@
     return true;
 
   do {
-    EverChanged = iterativelySimplifyCFG(F, TTI, DL);
+    EverChanged = iterativelySimplifyCFG(F, TTI, DL, AT, BonusInstThreshold);
     EverChanged |= removeUnreachableBlocks(F);
   } while (EverChanged);
 

diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 7348c45..903b675 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp

@@ -56,7 +56,7 @@
     }
   private:
     bool ProcessBlock(BasicBlock &BB);
-    bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores);
+    bool SinkInstruction(Instruction *I, SmallPtrSetImpl<Instruction*> &Stores);
     bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const;
     bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const;
   };
@@ -157,7 +157,7 @@
 }
 
 static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
-                         SmallPtrSet<Instruction *, 8> &Stores) {
+                         SmallPtrSetImpl<Instruction *> &Stores) {
 
   if (Inst->mayWriteToMemory()) {
     Stores.insert(Inst);
@@ -166,9 +166,8 @@
 
   if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
     AliasAnalysis::Location Loc = AA->getLocation(L);
-    for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(),
-         E = Stores.end(); I != E; ++I)
-      if (AA->getModRefInfo(*I, Loc) & AliasAnalysis::Mod)
+    for (Instruction *S : Stores)
+      if (AA->getModRefInfo(S, Loc) & AliasAnalysis::Mod)
         return false;
   }
 
@@ -220,7 +219,7 @@
 /// SinkInstruction - Determine whether it is safe to sink the specified machine
 /// instruction out of its current block into a successor.
 bool Sinking::SinkInstruction(Instruction *Inst,
-                              SmallPtrSet<Instruction *, 8> &Stores) {
+                              SmallPtrSetImpl<Instruction *> &Stores) {
 
   // Don't sink static alloca instructions.  CodeGen assumes allocas outside the
   // entry block are dynamically sized stack objects.

diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index 7b77ae1..b9673ed 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp

@@ -260,7 +260,7 @@
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(RegionInfo)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
 INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
                     false, false)
 
@@ -406,11 +406,11 @@
     } else {
 
       // It's an exit from a sub region
-      while(R->getParent() != ParentRegion)
+      while (R->getParent() != ParentRegion)
         R = R->getParent();
 
       // Edge from inside a subregion to its entry, ignore it
-      if (R == N)
+      if (*R == *N)
         continue;
 
       BasicBlock *Entry = R->getEntry();

diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 05b9892..f3c3e30 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp

@@ -63,6 +63,7 @@
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
@@ -86,6 +87,7 @@
 namespace {
   struct TailCallElim : public FunctionPass {
     const TargetTransformInfo *TTI;
+    const DataLayout *DL;
 
     static char ID; // Pass identification, replacement for typeid
     TailCallElim() : FunctionPass(ID) {
@@ -157,6 +159,8 @@
   if (skipOptnoneFunction(F))
     return false;
 
+  DL = F.getParent()->getDataLayout();
+
   bool AllCallsAreTailCalls = false;
   bool Modified = markTails(F, AllCallsAreTailCalls);
   if (AllCallsAreTailCalls)
@@ -175,7 +179,7 @@
 
     auto AddUsesToWorklist = [&](Value *V) {
       for (auto &U : V->uses()) {
-        if (!Visited.insert(&U))
+        if (!Visited.insert(&U).second)
           continue;
         Worklist.push_back(&U);
       }
@@ -227,12 +231,10 @@
   }
 
   void callUsesLocalStack(CallSite CS, bool IsNocapture) {
-    // Add it to the list of alloca users. If it's already there, skip further
-    // processing.
-    if (!AllocaUsers.insert(CS.getInstruction()))
-      return;
+    // Add it to the list of alloca users.
+    AllocaUsers.insert(CS.getInstruction());
 
-    // If it's nocapture then it can't capture the alloca.
+    // If it's nocapture then it can't capture this alloca.
     if (IsNocapture)
       return;
 
@@ -402,18 +404,28 @@
   // alloca' is changed from being a static alloca to being a dynamic alloca.
   // Until this is resolved, disable this transformation if that would ever
   // happen.  This bug is PR962.
+  SmallVector<BasicBlock*, 8> BBToErase;
   for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
     if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
       bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
                                           ArgumentPHIs, !CanTRETailMarkedCall);
-      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+      if (!Change && BB->getFirstNonPHIOrDbg() == Ret) {
         Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
                                           TailCallsAreMarkedTail, ArgumentPHIs,
                                           !CanTRETailMarkedCall);
+        // FoldReturnAndProcessPred may have emptied some BB. Remember to
+        // erase them.
+        if (Change && BB->empty())
+          BBToErase.push_back(BB);
+
+      }
       MadeChange |= Change;
     }
   }
 
+  for (auto BB: BBToErase)
+    BB->eraseFromParent();
+
   // If we eliminated any tail recursions, it's possible that we inserted some
   // silly PHI nodes which just merge an initial value (the incoming operand)
   // with themselves.  Check to see if we did and clean up our mess if so.  This
@@ -452,7 +464,7 @@
       // being loaded from.
       if (CI->mayWriteToMemory() ||
           !isSafeToLoadUnconditionally(L->getPointerOperand(), L,
-                                       L->getAlignment()))
+                                       L->getAlignment(), DL))
         return false;
     }
   }
@@ -821,8 +833,20 @@
     if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){
       DEBUG(dbgs() << "FOLDING: " << *BB
             << "INTO UNCOND BRANCH PRED: " << *Pred);
-      EliminateRecursiveTailCall(CI, FoldReturnIntoUncondBranch(Ret, BB, Pred),
-                                 OldEntry, TailCallsAreMarkedTail, ArgumentPHIs,
+      ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+
+      // Cleanup: if all predecessors of BB have been eliminated by
+      // FoldReturnIntoUncondBranch, we would like to delete it, but we
+      // can not just nuke it as it is being used as an iterator by our caller.
+      // Just empty it, and the caller will erase it when it is safe to do so.
+      // It is important to empty it, because the ret instruction in there is
+      // still using a value which EliminateRecursiveTailCall will attempt
+      // to remove.
+      if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+        BB->getInstList().clear();
+
+      EliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
+                                 ArgumentPHIs,
                                  CannotTailCallElimCallsMarkedTail);
       ++NumRetDuped;
       Change = true;

diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp
index 196ac79..f8e5af5 100644
--- a/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/lib/Transforms/Utils/AddDiscriminators.cpp

@@ -193,13 +193,11 @@
         // Create a new lexical scope and compute a new discriminator
         // number for it.
         StringRef Filename = FirstDIL.getFilename();
-        unsigned LineNumber = FirstDIL.getLineNumber();
-        unsigned ColumnNumber = FirstDIL.getColumnNumber();
         DIScope Scope = FirstDIL.getScope();
         DIFile File = Builder.createFile(Filename, Scope.getDirectory());
         unsigned Discriminator = FirstDIL.computeNewDiscriminator(Ctx);
-        DILexicalBlock NewScope = Builder.createLexicalBlock(
-            Scope, File, LineNumber, ColumnNumber, Discriminator);
+        DILexicalBlockFile NewScope =
+            Builder.createLexicalBlockFile(Scope, File, Discriminator);
         DILocation NewDIL = FirstDIL.copyWithNewScope(Ctx, NewScope);
         DebugLoc newDebugLoc = DebugLoc::getFromDILocation(NewDIL);
 

diff --git a/lib/Transforms/Utils/Android.mk b/lib/Transforms/Utils/Android.mk
index 2390027..e20dc0a 100644
--- a/lib/Transforms/Utils/Android.mk
+++ b/lib/Transforms/Utils/Android.mk

@@ -13,6 +13,7 @@
   CodeExtractor.cpp \
   CtorUtils.cpp \
   DemoteRegToStack.cpp \
+  FlattenCFG.cpp \
   GlobalStatus.cpp \
   InlineFunction.cpp \
   InstructionNamer.cpp \
@@ -33,6 +34,7 @@
   SimplifyIndVar.cpp \
   SimplifyInstructions.cpp \
   SimplifyLibCalls.cpp \
+  SymbolRewriter.cpp \
   UnifyFunctionExitNodes.cpp \
   Utils.cpp \
   ValueMapper.cpp

diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 80b7e22..983f025 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp

@@ -265,6 +265,18 @@
   return SplitBlock(BB, BB->getTerminator(), P);
 }
 
+unsigned llvm::SplitAllCriticalEdges(Function &F, Pass *P) {
+  unsigned NumBroken = 0;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    TerminatorInst *TI = I->getTerminator();
+    if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
+      for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+        if (SplitCriticalEdge(TI, i, P))
+          ++NumBroken;
+  }
+  return NumBroken;
+}
+
 /// SplitBlock - Split the specified block at the specified instruction - every
 /// thing before SplitPt stays in Old and everything starting with SplitPt moves
 /// to a new block.  The two blocks are joined by an unconditional branch and
@@ -673,7 +685,8 @@
 TerminatorInst *llvm::SplitBlockAndInsertIfThen(Value *Cond,
                                                 Instruction *SplitBefore,
                                                 bool Unreachable,
-                                                MDNode *BranchWeights) {
+                                                MDNode *BranchWeights,
+                                                DominatorTree *DT) {
   BasicBlock *Head = SplitBefore->getParent();
   BasicBlock *Tail = Head->splitBasicBlock(SplitBefore);
   TerminatorInst *HeadOldTerm = Head->getTerminator();
@@ -690,6 +703,20 @@
   HeadNewTerm->setDebugLoc(SplitBefore->getDebugLoc());
   HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
   ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+
+  if (DT) {
+    if (DomTreeNode *OldNode = DT->getNode(Head)) {
+      std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+      DomTreeNode *NewNode = DT->addNewBlock(Tail, Head);
+      for (auto Child : Children)
+        DT->changeImmediateDominator(Child, NewNode);
+
+      // Head dominates ThenBlock.
+      DT->addNewBlock(ThenBlock, Head);
+    }
+  }
+
   return CheckTerm;
 }
 

diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 80bd516..eda22cf 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp

@@ -40,7 +40,11 @@
       initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnFunction(Function &F) override;
+    bool runOnFunction(Function &F) override {
+      unsigned N = SplitAllCriticalEdges(F, this);
+      NumBroken += N;
+      return N > 0;
+    }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addPreserved<DominatorTreeWrapperPass>();
@@ -62,24 +66,6 @@
   return new BreakCriticalEdges();
 }
 
-// runOnFunction - Loop over all of the edges in the CFG, breaking critical
-// edges as they are found.
-//
-bool BreakCriticalEdges::runOnFunction(Function &F) {
-  bool Changed = false;
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
-    TerminatorInst *TI = I->getTerminator();
-    if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
-      for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
-        if (SplitCriticalEdge(TI, i, this)) {
-          ++NumBroken;
-          Changed = true;
-        }
-  }
-
-  return Changed;
-}
-
 //===----------------------------------------------------------------------===//
 //    Implementation of the external critical edge manipulation functions
 //===----------------------------------------------------------------------===//

diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index be00b69..112d26c 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp

@@ -42,8 +42,7 @@
   AttributeSet AS[2];
   AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
   Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
-                            ArrayRef<Attribute::AttrKind>(AVs, 2));
+  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Constant *StrLen = M->getOrInsertFunction("strlen",
@@ -51,7 +50,7 @@
                                                               AS),
                                             TD->getIntPtrType(Context),
                                             B.getInt8PtrTy(),
-                                            NULL);
+                                            nullptr);
   CallInst *CI = B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen");
   if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -71,8 +70,7 @@
   AttributeSet AS[2];
   AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
   Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
-                            ArrayRef<Attribute::AttrKind>(AVs, 2));
+  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Constant *StrNLen = M->getOrInsertFunction("strnlen",
@@ -81,7 +79,7 @@
                                              TD->getIntPtrType(Context),
                                              B.getInt8PtrTy(),
                                              TD->getIntPtrType(Context),
-                                             NULL);
+                                             nullptr);
   CallInst *CI = B.CreateCall2(StrNLen, CastToCStr(Ptr, B), MaxLen, "strnlen");
   if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -100,15 +98,14 @@
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
   AttributeSet AS =
-    AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
-                      ArrayRef<Attribute::AttrKind>(AVs, 2));
+    AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
 
   Type *I8Ptr = B.getInt8PtrTy();
   Type *I32Ty = B.getInt32Ty();
   Constant *StrChr = M->getOrInsertFunction("strchr",
                                             AttributeSet::get(M->getContext(),
                                                              AS),
-                                            I8Ptr, I8Ptr, I32Ty, NULL);
+                                            I8Ptr, I8Ptr, I32Ty, nullptr);
   CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B),
                                ConstantInt::get(I32Ty, C), "strchr");
   if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts()))
@@ -128,8 +125,7 @@
   AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
   AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
   Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
-                            ArrayRef<Attribute::AttrKind>(AVs, 2));
+  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *StrNCmp = M->getOrInsertFunction("strncmp",
@@ -138,7 +134,7 @@
                                           B.getInt32Ty(),
                                           B.getInt8PtrTy(),
                                           B.getInt8PtrTy(),
-                                          TD->getIntPtrType(Context), NULL);
+                                          TD->getIntPtrType(Context), nullptr);
   CallInst *CI = B.CreateCall3(StrNCmp, CastToCStr(Ptr1, B),
                                CastToCStr(Ptr2, B), Len, "strncmp");
 
@@ -164,7 +160,7 @@
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrCpy = M->getOrInsertFunction(Name,
                                          AttributeSet::get(M->getContext(), AS),
-                                         I8Ptr, I8Ptr, I8Ptr, NULL);
+                                         I8Ptr, I8Ptr, I8Ptr, nullptr);
   CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
                                Name);
   if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts()))
@@ -190,7 +186,7 @@
                                           AttributeSet::get(M->getContext(),
                                                             AS),
                                           I8Ptr, I8Ptr, I8Ptr,
-                                          Len->getType(), NULL);
+                                          Len->getType(), nullptr);
   CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
                                Len, "strncpy");
   if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts()))
@@ -218,7 +214,7 @@
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
                                          TD->getIntPtrType(Context),
-                                         TD->getIntPtrType(Context), NULL);
+                                         TD->getIntPtrType(Context), nullptr);
   Dst = CastToCStr(Dst, B);
   Src = CastToCStr(Src, B);
   CallInst *CI = B.CreateCall4(MemCpy, Dst, Src, Len, ObjSize);
@@ -238,8 +234,7 @@
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeSet AS;
   Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
-                         ArrayRef<Attribute::AttrKind>(AVs, 2));
+  AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *MemChr = M->getOrInsertFunction("memchr",
                                          AttributeSet::get(M->getContext(), AS),
@@ -247,7 +242,7 @@
                                          B.getInt8PtrTy(),
                                          B.getInt32Ty(),
                                          TD->getIntPtrType(Context),
-                                         NULL);
+                                         nullptr);
   CallInst *CI = B.CreateCall3(MemChr, CastToCStr(Ptr, B), Val, Len, "memchr");
 
   if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts()))
@@ -268,8 +263,7 @@
   AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
   AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
   Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
-  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
-                            ArrayRef<Attribute::AttrKind>(AVs, 2));
+  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, AVs);
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *MemCmp = M->getOrInsertFunction("memcmp",
@@ -277,7 +271,7 @@
                                          B.getInt32Ty(),
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
-                                         TD->getIntPtrType(Context), NULL);
+                                         TD->getIntPtrType(Context), nullptr);
   CallInst *CI = B.CreateCall3(MemCmp, CastToCStr(Ptr1, B), CastToCStr(Ptr2, B),
                                Len, "memcmp");
 
@@ -313,7 +307,7 @@
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
-                                         Op->getType(), NULL);
+                                         Op->getType(), nullptr);
   CallInst *CI = B.CreateCall(Callee, Op, Name);
   CI->setAttributes(Attrs);
   if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
@@ -334,7 +328,7 @@
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   Value *Callee = M->getOrInsertFunction(Name, Op1->getType(),
-                                         Op1->getType(), Op2->getType(), NULL);
+                                         Op1->getType(), Op2->getType(), nullptr);
   CallInst *CI = B.CreateCall2(Callee, Op1, Op2, Name);
   CI->setAttributes(Attrs);
   if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
@@ -352,7 +346,7 @@
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(),
-                                          B.getInt32Ty(), NULL);
+                                          B.getInt32Ty(), nullptr);
   CallInst *CI = B.CreateCall(PutChar,
                               B.CreateIntCast(Char,
                               B.getInt32Ty(),
@@ -382,7 +376,7 @@
                                        AttributeSet::get(M->getContext(), AS),
                                        B.getInt32Ty(),
                                        B.getInt8PtrTy(),
-                                       NULL);
+                                       nullptr);
   CallInst *CI = B.CreateCall(PutS, CastToCStr(Str, B), "puts");
   if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -407,12 +401,12 @@
                                AttributeSet::get(M->getContext(), AS),
                                B.getInt32Ty(),
                                B.getInt32Ty(), File->getType(),
-                               NULL);
+                               nullptr);
   else
     F = M->getOrInsertFunction("fputc",
                                B.getInt32Ty(),
                                B.getInt32Ty(),
-                               File->getType(), NULL);
+                               File->getType(), nullptr);
   Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
                          "chari");
   CallInst *CI = B.CreateCall2(F, Char, File, "fputc");
@@ -442,11 +436,11 @@
                                AttributeSet::get(M->getContext(), AS),
                                B.getInt32Ty(),
                                B.getInt8PtrTy(),
-                               File->getType(), NULL);
+                               File->getType(), nullptr);
   else
     F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(),
                                B.getInt8PtrTy(),
-                               File->getType(), NULL);
+                               File->getType(), nullptr);
   CallInst *CI = B.CreateCall2(F, CastToCStr(Str, B), File, "fputs");
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
@@ -478,13 +472,13 @@
                                B.getInt8PtrTy(),
                                TD->getIntPtrType(Context),
                                TD->getIntPtrType(Context),
-                               File->getType(), NULL);
+                               File->getType(), nullptr);
   else
     F = M->getOrInsertFunction(FWriteName, TD->getIntPtrType(Context),
                                B.getInt8PtrTy(),
                                TD->getIntPtrType(Context),
                                TD->getIntPtrType(Context),
-                               File->getType(), NULL);
+                               File->getType(), nullptr);
   CallInst *CI = B.CreateCall4(F, CastToCStr(Ptr, B), Size,
                         ConstantInt::get(TD->getIntPtrType(Context), 1), File);
 

diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index fcf548f..6ce22b1 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt

@@ -1,16 +1,17 @@
 add_llvm_library(LLVMTransformUtils
-  AddDiscriminators.cpp
   ASanStackFrameLayout.cpp
+  AddDiscriminators.cpp
   BasicBlockUtils.cpp
   BreakCriticalEdges.cpp
   BuildLibCalls.cpp
   BypassSlowDivision.cpp
-  CtorUtils.cpp
   CloneFunction.cpp
   CloneModule.cpp
   CmpInstAnalysis.cpp
   CodeExtractor.cpp
+  CtorUtils.cpp
   DemoteRegToStack.cpp
+  FlattenCFG.cpp
   GlobalStatus.cpp
   InlineFunction.cpp
   InstructionNamer.cpp
@@ -29,10 +30,10 @@
   PromoteMemoryToRegister.cpp
   SSAUpdater.cpp
   SimplifyCFG.cpp
-  FlattenCFG.cpp
   SimplifyIndVar.cpp
   SimplifyInstructions.cpp
   SimplifyLibCalls.cpp
+  SymbolRewriter.cpp
   UnifyFunctionExitNodes.cpp
   Utils.cpp
   ValueMapper.cpp

diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 3f75b3e..d078c96 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp

@@ -17,6 +17,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm-c/Core.h"
 using namespace llvm;
 
 /// CloneModule - Return an exact copy of the specified module.  This is not as
@@ -122,3 +123,11 @@
 
   return New;
 }
+
+extern "C" {
+
+LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) {
+  return wrap(CloneModule(unwrap(M)));
+}
+
+}

diff --git a/lib/Transforms/Utils/CtorUtils.cpp b/lib/Transforms/Utils/CtorUtils.cpp
index a359424..26875e8 100644
--- a/lib/Transforms/Utils/CtorUtils.cpp
+++ b/lib/Transforms/Utils/CtorUtils.cpp

@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/Transforms/Utils/CtorUtils.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@@ -24,41 +25,22 @@
 namespace llvm {
 
 namespace {
-/// Given a specified llvm.global_ctors list, install the
-/// specified array.
-void installGlobalCtors(GlobalVariable *GCL,
-                        const std::vector<Function *> &Ctors) {
-  // If we made a change, reassemble the initializer list.
-  Constant *CSVals[3];
+/// Given a specified llvm.global_ctors list, remove the listed elements.
+void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {
+  // Filter out the initializer elements to remove.
+  ConstantArray *OldCA = cast<ConstantArray>(GCL->getInitializer());
+  SmallVector<Constant *, 10> CAList;
+  for (unsigned I = 0, E = OldCA->getNumOperands(); I < E; ++I)
+    if (!CtorsToRemove.test(I))
+      CAList.push_back(OldCA->getOperand(I));
 
-  StructType *StructTy =
-      cast<StructType>(GCL->getType()->getElementType()->getArrayElementType());
-
-  // Create the new init list.
-  std::vector<Constant *> CAList;
-  for (Function *F : Ctors) {
-    Type *Int32Ty = Type::getInt32Ty(GCL->getContext());
-    if (F) {
-      CSVals[0] = ConstantInt::get(Int32Ty, 65535);
-      CSVals[1] = F;
-    } else {
-      CSVals[0] = ConstantInt::get(Int32Ty, 0x7fffffff);
-      CSVals[1] = Constant::getNullValue(StructTy->getElementType(1));
-    }
-    // FIXME: Only allow the 3-field form in LLVM 4.0.
-    size_t NumElts = StructTy->getNumElements();
-    if (NumElts > 2)
-      CSVals[2] = Constant::getNullValue(StructTy->getElementType(2));
-    CAList.push_back(
-        ConstantStruct::get(StructTy, makeArrayRef(CSVals, NumElts)));
-  }
-
-  // Create the array initializer.
-  Constant *CA =
-      ConstantArray::get(ArrayType::get(StructTy, CAList.size()), CAList);
+  // Create the new array initializer.
+  ArrayType *ATy =
+      ArrayType::get(OldCA->getType()->getElementType(), CAList.size());
+  Constant *CA = ConstantArray::get(ATy, CAList);
 
   // If we didn't change the number of elements, don't create a new GV.
-  if (CA->getType() == GCL->getInitializer()->getType()) {
+  if (CA->getType() == OldCA->getType()) {
     GCL->setInitializer(CA);
     return;
   }
@@ -82,7 +64,7 @@
 
 /// Given a llvm.global_ctors list that we can understand,
 /// return a list of the functions and null terminator as a vector.
-std::vector<Function*> parseGlobalCtors(GlobalVariable *GV) {
+std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
   if (GV->getInitializer()->isNullValue())
     return std::vector<Function *>();
   ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
@@ -147,17 +129,15 @@
   bool MadeChange = false;
 
   // Loop over global ctors, optimizing them when we can.
-  for (unsigned i = 0; i != Ctors.size(); ++i) {
+  unsigned NumCtors = Ctors.size();
+  BitVector CtorsToRemove(NumCtors);
+  for (unsigned i = 0; i != Ctors.size() && NumCtors > 0; ++i) {
     Function *F = Ctors[i];
     // Found a null terminator in the middle of the list, prune off the rest of
     // the list.
-    if (!F) {
-      if (i != Ctors.size() - 1) {
-        Ctors.resize(i + 1);
-        MadeChange = true;
-      }
-      break;
-    }
+    if (!F)
+      continue;
+
     DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
 
     // We cannot simplify external ctor functions.
@@ -166,9 +146,10 @@
 
     // If we can evaluate the ctor at compile time, do.
     if (ShouldRemove(F)) {
-      Ctors.erase(Ctors.begin() + i);
+      Ctors[i] = nullptr;
+      CtorsToRemove.set(i);
+      NumCtors--;
       MadeChange = true;
-      --i;
       continue;
     }
   }
@@ -176,7 +157,7 @@
   if (!MadeChange)
     return false;
 
-  installGlobalCtors(GlobalCtors, Ctors);
+  removeGlobalCtors(GlobalCtors, CtorsToRemove);
   return true;
 }
 

diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp
index 51ead40..4eb3e3d 100644
--- a/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/lib/Transforms/Utils/FlattenCFG.cpp

@@ -238,9 +238,13 @@
     // Do branch inversion.
     BasicBlock *CurrBlock = LastCondBlock;
     bool EverChanged = false;
-    while (1) {
+    for (;CurrBlock != FirstCondBlock;
+          CurrBlock = CurrBlock->getSinglePredecessor()) {
       BranchInst *BI = dyn_cast<BranchInst>(CurrBlock->getTerminator());
       CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
+      if (!CI)
+        continue;
+
       CmpInst::Predicate Predicate = CI->getPredicate();
       // Canonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq
       if ((Predicate == CmpInst::ICMP_NE) || (Predicate == CmpInst::FCMP_ONE)) {
@@ -248,9 +252,6 @@
         BI->swapSuccessors();
         EverChanged = true;
       }
-      if (CurrBlock == FirstCondBlock)
-        break;
-      CurrBlock = CurrBlock->getSinglePredecessor();
     }
     return EverChanged;
   }

diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp
index 12057e4..52e2d59 100644
--- a/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/lib/Transforms/Utils/GlobalStatus.cpp

@@ -35,6 +35,9 @@
   if (isa<GlobalValue>(C))
     return false;
 
+  if (isa<ConstantInt>(C) || isa<ConstantFP>(C))
+    return false;
+
   for (const User *U : C->users())
     if (const Constant *CU = dyn_cast<Constant>(U)) {
       if (!isSafeToDestroyConstant(CU))
@@ -45,7 +48,7 @@
 }
 
 static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
-                             SmallPtrSet<const PHINode *, 16> &PhiUsers) {
+                             SmallPtrSetImpl<const PHINode *> &PhiUsers) {
   for (const Use &U : V->uses()) {
     const User *UR = U.getUser();
     if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) {
@@ -130,7 +133,7 @@
       } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
         // PHI nodes we can check just like select or GEP instructions, but we
         // have to be careful about infinite recursion.
-        if (PhiUsers.insert(PN)) // Not already visited.
+        if (PhiUsers.insert(PN).second) // Not already visited.
           if (analyzeGlobalAux(I, GS, PhiUsers))
             return true;
       } else if (isa<CmpInst>(I)) {

diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index f0a9f2b..2d0b7dc 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp

@@ -13,10 +13,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CFG.h"
@@ -24,14 +30,28 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CommandLine.h"
+#include <algorithm>
 using namespace llvm;
 
+static cl::opt<bool>
+EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true),
+  cl::Hidden,
+  cl::desc("Convert noalias attributes to metadata during inlining."));
+
+static cl::opt<bool>
+PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining",
+  cl::init(true), cl::Hidden,
+  cl::desc("Convert align attributes to assumptions during inlining."));
+
 bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI,
                           bool InsertLifetime) {
   return InlineFunction(CallSite(CI), IFI, InsertLifetime);
@@ -84,7 +104,7 @@
     /// split the landing pad block after the landingpad instruction and jump
     /// to there.
     void forwardResume(ResumeInst *RI,
-                       SmallPtrSet<LandingPadInst*, 16> &InlinedLPads);
+                       SmallPtrSetImpl<LandingPadInst*> &InlinedLPads);
 
     /// addIncomingPHIValuesFor - Add incoming-PHI values to the unwind
     /// destination block for the given basic block, using the values for the
@@ -143,7 +163,7 @@
 /// branch. When there is more than one predecessor, we need to split the
 /// landing pad block after the landingpad instruction and jump to there.
 void InvokeInliningInfo::forwardResume(ResumeInst *RI,
-                               SmallPtrSet<LandingPadInst*, 16> &InlinedLPads) {
+                               SmallPtrSetImpl<LandingPadInst*> &InlinedLPads) {
   BasicBlock *Dest = getInnerResumeDest();
   BasicBlock *Src = RI->getParent();
 
@@ -233,9 +253,7 @@
   // Append the clauses from the outer landing pad instruction into the inlined
   // landing pad instructions.
   LandingPadInst *OuterLPad = Invoke.getLandingPadInst();
-  for (SmallPtrSet<LandingPadInst*, 16>::iterator I = InlinedLPads.begin(),
-         E = InlinedLPads.end(); I != E; ++I) {
-    LandingPadInst *InlinedLPad = *I;
+  for (LandingPadInst *InlinedLPad : InlinedLPads) {
     unsigned OuterNum = OuterLPad->getNumClauses();
     InlinedLPad->reserveClauses(OuterNum);
     for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx)
@@ -260,6 +278,385 @@
   InvokeDest->removePredecessor(II->getParent());
 }
 
+/// CloneAliasScopeMetadata - When inlining a function that contains noalias
+/// scope metadata, this metadata needs to be cloned so that the inlined blocks
+/// have different "unique scopes" at every call site. Were this not done, then
+/// aliasing scopes from a function inlined into a caller multiple times could
+/// not be differentiated (and this would lead to miscompiles because the
+/// non-aliasing property communicated by the metadata could have
+/// call-site-specific control dependencies).
+static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
+  const Function *CalledFunc = CS.getCalledFunction();
+  SetVector<const MDNode *> MD;
+
+  // Note: We could only clone the metadata if it is already used in the
+  // caller. I'm omitting that check here because it might confuse
+  // inter-procedural alias analysis passes. We can revisit this if it becomes
+  // an efficiency or overhead problem.
+
+  for (Function::const_iterator I = CalledFunc->begin(), IE = CalledFunc->end();
+       I != IE; ++I)
+    for (BasicBlock::const_iterator J = I->begin(), JE = I->end(); J != JE; ++J) {
+      if (const MDNode *M = J->getMetadata(LLVMContext::MD_alias_scope))
+        MD.insert(M);
+      if (const MDNode *M = J->getMetadata(LLVMContext::MD_noalias))
+        MD.insert(M);
+    }
+
+  if (MD.empty())
+    return;
+
+  // Walk the existing metadata, adding the complete (perhaps cyclic) chain to
+  // the set.
+  SmallVector<const Value *, 16> Queue(MD.begin(), MD.end());
+  while (!Queue.empty()) {
+    const MDNode *M = cast<MDNode>(Queue.pop_back_val());
+    for (unsigned i = 0, ie = M->getNumOperands(); i != ie; ++i)
+      if (const MDNode *M1 = dyn_cast<MDNode>(M->getOperand(i)))
+        if (MD.insert(M1))
+          Queue.push_back(M1);
+  }
+
+  // Now we have a complete set of all metadata in the chains used to specify
+  // the noalias scopes and the lists of those scopes.
+  SmallVector<MDNode *, 16> DummyNodes;
+  DenseMap<const MDNode *, TrackingVH<MDNode> > MDMap;
+  for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end();
+       I != IE; ++I) {
+    MDNode *Dummy = MDNode::getTemporary(CalledFunc->getContext(), None);
+    DummyNodes.push_back(Dummy);
+    MDMap[*I] = Dummy;
+  }
+
+  // Create new metadata nodes to replace the dummy nodes, replacing old
+  // metadata references with either a dummy node or an already-created new
+  // node.
+  for (SetVector<const MDNode *>::iterator I = MD.begin(), IE = MD.end();
+       I != IE; ++I) {
+    SmallVector<Value *, 4> NewOps;
+    for (unsigned i = 0, ie = (*I)->getNumOperands(); i != ie; ++i) {
+      const Value *V = (*I)->getOperand(i);
+      if (const MDNode *M = dyn_cast<MDNode>(V))
+        NewOps.push_back(MDMap[M]);
+      else
+        NewOps.push_back(const_cast<Value *>(V));
+    }
+
+    MDNode *NewM = MDNode::get(CalledFunc->getContext(), NewOps),
+           *TempM = MDMap[*I];
+
+    TempM->replaceAllUsesWith(NewM);
+  }
+
+  // Now replace the metadata in the new inlined instructions with the
+  // replacements from the map.
+  for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
+       VMI != VMIE; ++VMI) {
+    if (!VMI->second)
+      continue;
+
+    Instruction *NI = dyn_cast<Instruction>(VMI->second);
+    if (!NI)
+      continue;
+
+    if (MDNode *M = NI->getMetadata(LLVMContext::MD_alias_scope)) {
+      MDNode *NewMD = MDMap[M];
+      // If the call site also had alias scope metadata (a list of scopes to
+      // which instructions inside it might belong), propagate those scopes to
+      // the inlined instructions.
+      if (MDNode *CSM =
+              CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope))
+        NewMD = MDNode::concatenate(NewMD, CSM);
+      NI->setMetadata(LLVMContext::MD_alias_scope, NewMD);
+    } else if (NI->mayReadOrWriteMemory()) {
+      if (MDNode *M =
+              CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope))
+        NI->setMetadata(LLVMContext::MD_alias_scope, M);
+    }
+
+    if (MDNode *M = NI->getMetadata(LLVMContext::MD_noalias)) {
+      MDNode *NewMD = MDMap[M];
+      // If the call site also had noalias metadata (a list of scopes with
+      // which instructions inside it don't alias), propagate those scopes to
+      // the inlined instructions.
+      if (MDNode *CSM =
+              CS.getInstruction()->getMetadata(LLVMContext::MD_noalias))
+        NewMD = MDNode::concatenate(NewMD, CSM);
+      NI->setMetadata(LLVMContext::MD_noalias, NewMD);
+    } else if (NI->mayReadOrWriteMemory()) {
+      if (MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_noalias))
+        NI->setMetadata(LLVMContext::MD_noalias, M);
+    }
+  }
+
+  // Now that everything has been replaced, delete the dummy nodes.
+  for (unsigned i = 0, ie = DummyNodes.size(); i != ie; ++i)
+    MDNode::deleteTemporary(DummyNodes[i]);
+}
+
+/// AddAliasScopeMetadata - If the inlined function has noalias arguments, then
+/// add new alias scopes for each noalias argument, tag the mapped noalias
+/// parameters with noalias metadata specifying the new scope, and tag all
+/// non-derived loads, stores and memory intrinsics with the new alias scopes.
+static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
+                                  const DataLayout *DL, AliasAnalysis *AA) {
+  if (!EnableNoAliasConversion)
+    return;
+
+  const Function *CalledFunc = CS.getCalledFunction();
+  SmallVector<const Argument *, 4> NoAliasArgs;
+
+  for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
+       E = CalledFunc->arg_end(); I != E; ++I) {
+    if (I->hasNoAliasAttr() && !I->hasNUses(0))
+      NoAliasArgs.push_back(I);
+  }
+
+  if (NoAliasArgs.empty())
+    return;
+
+  // To do a good job, if a noalias variable is captured, we need to know if
+  // the capture point dominates the particular use we're considering.
+  DominatorTree DT;
+  DT.recalculate(const_cast<Function&>(*CalledFunc));
+
+  // noalias indicates that pointer values based on the argument do not alias
+  // pointer values which are not based on it. So we add a new "scope" for each
+  // noalias function argument. Accesses using pointers based on that argument
+  // become part of that alias scope, accesses using pointers not based on that
+  // argument are tagged as noalias with that scope.
+
+  DenseMap<const Argument *, MDNode *> NewScopes;
+  MDBuilder MDB(CalledFunc->getContext());
+
+  // Create a new scope domain for this function.
+  MDNode *NewDomain =
+    MDB.createAnonymousAliasScopeDomain(CalledFunc->getName());
+  for (unsigned i = 0, e = NoAliasArgs.size(); i != e; ++i) {
+    const Argument *A = NoAliasArgs[i];
+
+    std::string Name = CalledFunc->getName();
+    if (A->hasName()) {
+      Name += ": %";
+      Name += A->getName();
+    } else {
+      Name += ": argument ";
+      Name += utostr(i);
+    }
+
+    // Note: We always create a new anonymous root here. This is true regardless
+    // of the linkage of the callee because the aliasing "scope" is not just a
+    // property of the callee, but also all control dependencies in the caller.
+    MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+    NewScopes.insert(std::make_pair(A, NewScope));
+  }
+
+  // Iterate over all new instructions in the map; for all memory-access
+  // instructions, add the alias scope metadata.
+  for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
+       VMI != VMIE; ++VMI) {
+    if (const Instruction *I = dyn_cast<Instruction>(VMI->first)) {
+      if (!VMI->second)
+        continue;
+
+      Instruction *NI = dyn_cast<Instruction>(VMI->second);
+      if (!NI)
+        continue;
+
+      bool IsArgMemOnlyCall = false, IsFuncCall = false;
+      SmallVector<const Value *, 2> PtrArgs;
+
+      if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+        PtrArgs.push_back(LI->getPointerOperand());
+      else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+        PtrArgs.push_back(SI->getPointerOperand());
+      else if (const VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
+        PtrArgs.push_back(VAAI->getPointerOperand());
+      else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
+        PtrArgs.push_back(CXI->getPointerOperand());
+      else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
+        PtrArgs.push_back(RMWI->getPointerOperand());
+      else if (ImmutableCallSite ICS = ImmutableCallSite(I)) {
+        // If we know that the call does not access memory, then we'll still
+        // know that about the inlined clone of this call site, and we don't
+        // need to add metadata.
+        if (ICS.doesNotAccessMemory())
+          continue;
+
+        IsFuncCall = true;
+        if (AA) {
+          AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(ICS);
+          if (MRB == AliasAnalysis::OnlyAccessesArgumentPointees ||
+              MRB == AliasAnalysis::OnlyReadsArgumentPointees)
+            IsArgMemOnlyCall = true;
+        }
+
+        for (ImmutableCallSite::arg_iterator AI = ICS.arg_begin(),
+             AE = ICS.arg_end(); AI != AE; ++AI) {
+          // We need to check the underlying objects of all arguments, not just
+          // the pointer arguments, because we might be passing pointers as
+          // integers, etc.
+          // However, if we know that the call only accesses pointer arguments,
+          // then we only need to check the pointer arguments.
+          if (IsArgMemOnlyCall && !(*AI)->getType()->isPointerTy())
+            continue;
+
+          PtrArgs.push_back(*AI);
+        }
+      }
+
+      // If we found no pointers, then this instruction is not suitable for
+      // pairing with an instruction to receive aliasing metadata.
+      // However, if this is a call, then we might just alias with none of the
+      // noalias arguments.
+      if (PtrArgs.empty() && !IsFuncCall)
+        continue;
+
+      // It is possible that there is only one underlying object, but you
+      // need to go through several PHIs to see it, and thus could be
+      // repeated in the Objects list.
+      SmallPtrSet<const Value *, 4> ObjSet;
+      SmallVector<Value *, 4> Scopes, NoAliases;
+
+      SmallSetVector<const Argument *, 4> NAPtrArgs;
+      for (unsigned i = 0, ie = PtrArgs.size(); i != ie; ++i) {
+        SmallVector<Value *, 4> Objects;
+        GetUnderlyingObjects(const_cast<Value*>(PtrArgs[i]),
+                             Objects, DL, /* MaxLookup = */ 0);
+
+        for (Value *O : Objects)
+          ObjSet.insert(O);
+      }
+
+      // Figure out if we're derived from anything that is not a noalias
+      // argument.
+      bool CanDeriveViaCapture = false, UsesAliasingPtr = false;
+      for (const Value *V : ObjSet) {
+        // Is this value a constant that cannot be derived from any pointer
+        // value (we need to exclude constant expressions, for example, that
+        // are formed from arithmetic on global symbols).
+        bool IsNonPtrConst = isa<ConstantInt>(V) || isa<ConstantFP>(V) ||
+                             isa<ConstantPointerNull>(V) ||
+                             isa<ConstantDataVector>(V) || isa<UndefValue>(V);
+        if (IsNonPtrConst)
+          continue;
+
+        // If this is anything other than a noalias argument, then we cannot
+        // completely describe the aliasing properties using alias.scope
+        // metadata (and, thus, won't add any).
+        if (const Argument *A = dyn_cast<Argument>(V)) {
+          if (!A->hasNoAliasAttr())
+            UsesAliasingPtr = true;
+        } else {
+          UsesAliasingPtr = true;
+        }
+
+        // If this is not some identified function-local object (which cannot
+        // directly alias a noalias argument), or some other argument (which,
+        // by definition, also cannot alias a noalias argument), then we could
+        // alias a noalias argument that has been captured.
+        if (!isa<Argument>(V) &&
+            !isIdentifiedFunctionLocal(const_cast<Value*>(V)))
+          CanDeriveViaCapture = true;
+      }
+
+      // A function call can always get captured noalias pointers (via other
+      // parameters, globals, etc.).
+      if (IsFuncCall && !IsArgMemOnlyCall)
+        CanDeriveViaCapture = true;
+
+      // First, we want to figure out all of the sets with which we definitely
+      // don't alias. Iterate over all noalias sets, and add those for which:
+      //   1. The noalias argument is not in the set of objects from which we
+      //      definitely derive.
+      //   2. The noalias argument has not yet been captured.
+      // An arbitrary function that might load pointers could see captured
+      // noalias arguments via other noalias arguments or globals, and so we
+      // must always check for prior capture.
+      for (const Argument *A : NoAliasArgs) {
+        if (!ObjSet.count(A) && (!CanDeriveViaCapture ||
+                                 // It might be tempting to skip the
+                                 // PointerMayBeCapturedBefore check if
+                                 // A->hasNoCaptureAttr() is true, but this is
+                                 // incorrect because nocapture only guarantees
+                                 // that no copies outlive the function, not
+                                 // that the value cannot be locally captured.
+                                 !PointerMayBeCapturedBefore(A,
+                                   /* ReturnCaptures */ false,
+                                   /* StoreCaptures */ false, I, &DT)))
+          NoAliases.push_back(NewScopes[A]);
+      }
+
+      if (!NoAliases.empty())
+        NI->setMetadata(LLVMContext::MD_noalias,
+                        MDNode::concatenate(
+                            NI->getMetadata(LLVMContext::MD_noalias),
+                            MDNode::get(CalledFunc->getContext(), NoAliases)));
+
+      // Next, we want to figure out all of the sets to which we might belong.
+      // We might belong to a set if the noalias argument is in the set of
+      // underlying objects. If there is some non-noalias argument in our list
+      // of underlying objects, then we cannot add a scope because the fact
+      // that some access does not alias with any set of our noalias arguments
+      // cannot itself guarantee that it does not alias with this access
+      // (because there is some pointer of unknown origin involved and the
+      // other access might also depend on this pointer). We also cannot add
+      // scopes to arbitrary functions unless we know they don't access any
+      // non-parameter pointer-values.
+      bool CanAddScopes = !UsesAliasingPtr;
+      if (CanAddScopes && IsFuncCall)
+        CanAddScopes = IsArgMemOnlyCall;
+
+      if (CanAddScopes)
+        for (const Argument *A : NoAliasArgs) {
+          if (ObjSet.count(A))
+            Scopes.push_back(NewScopes[A]);
+        }
+
+      if (!Scopes.empty())
+        NI->setMetadata(
+            LLVMContext::MD_alias_scope,
+            MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope),
+                                MDNode::get(CalledFunc->getContext(), Scopes)));
+    }
+  }
+}
+
+/// If the inlined function has non-byval align arguments, then
+/// add @llvm.assume-based alignment assumptions to preserve this information.
+static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
+  if (!PreserveAlignmentAssumptions || !IFI.DL)
+    return;
+
+  // To avoid inserting redundant assumptions, we should check for assumptions
+  // already in the caller. To do this, we might need a DT of the caller.
+  DominatorTree DT;
+  bool DTCalculated = false;
+
+  const Function *CalledFunc = CS.getCalledFunction();
+  for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
+       E = CalledFunc->arg_end(); I != E; ++I) {
+    unsigned Align = I->getType()->isPointerTy() ? I->getParamAlignment() : 0;
+    if (Align && !I->hasByValOrInAllocaAttr() && !I->hasNUses(0)) {
+      if (!DTCalculated) {
+        DT.recalculate(const_cast<Function&>(*CS.getInstruction()->getParent()
+                                               ->getParent()));
+        DTCalculated = true;
+      }
+
+      // If we can already prove the asserted alignment in the context of the
+      // caller, then don't bother inserting the assumption.
+      Value *Arg = CS.getArgument(I->getArgNo());
+      if (getKnownAlignment(Arg, IFI.DL, IFI.AT, CS.getInstruction(),
+                            &DT) >= Align)
+        continue;
+
+      IRBuilder<>(CS.getInstruction()).CreateAlignmentAssumption(*IFI.DL, Arg,
+                                                                 Align);
+    }
+  }
+}
+
 /// UpdateCallGraphAfterInlining - Once we have cloned code over from a callee
 /// into the caller, update the specified callgraph to reflect the changes we
 /// made.  Note that it's possible that not all code was copied over, so only
@@ -327,31 +724,19 @@
 static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
                                     BasicBlock *InsertBlock,
                                     InlineFunctionInfo &IFI) {
-  LLVMContext &Context = Src->getContext();
-  Type *VoidPtrTy = Type::getInt8PtrTy(Context);
   Type *AggTy = cast<PointerType>(Src->getType())->getElementType();
-  Type *Tys[3] = { VoidPtrTy, VoidPtrTy, Type::getInt64Ty(Context) };
-  Function *MemCpyFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys);
-  IRBuilder<> builder(InsertBlock->begin());
-  Value *DstCast = builder.CreateBitCast(Dst, VoidPtrTy, "tmp");
-  Value *SrcCast = builder.CreateBitCast(Src, VoidPtrTy, "tmp");
+  IRBuilder<> Builder(InsertBlock->begin());
 
   Value *Size;
   if (IFI.DL == nullptr)
     Size = ConstantExpr::getSizeOf(AggTy);
   else
-    Size = ConstantInt::get(Type::getInt64Ty(Context),
-                            IFI.DL->getTypeStoreSize(AggTy));
+    Size = Builder.getInt64(IFI.DL->getTypeStoreSize(AggTy));
 
   // Always generate a memcpy of alignment 1 here because we don't know
   // the alignment of the src pointer.  Other optimizations can infer
   // better alignment.
-  Value *CallArgs[] = {
-    DstCast, SrcCast, Size,
-    ConstantInt::get(Type::getInt32Ty(Context), 1),
-    ConstantInt::getFalse(Context) // isVolatile
-  };
-  builder.CreateCall(MemCpyFn, CallArgs);
+  Builder.CreateMemCpy(Dst, Src, Size, /*Align=*/1);
 }
 
 /// HandleByValArgument - When inlining a call site that has a byval argument,
@@ -376,7 +761,7 @@
     // If the pointer is already known to be sufficiently aligned, or if we can
     // round it up to a larger alignment, then we don't need a temporary.
     if (getOrEnforceKnownAlignment(Arg, ByValAlignment,
-                                   IFI.DL) >= ByValAlignment)
+                                   IFI.DL, IFI.AT, TheCall) >= ByValAlignment)
       return Arg;
     
     // Otherwise, we have to make a memcpy to get a safe alignment.  This is bad
@@ -472,6 +857,12 @@
         // originates from the call location. This is important for
         // ((__always_inline__, __nodebug__)) functions which must use caller
         // location for all instructions in their function body.
+
+        // Don't update static allocas, as they may get moved later.
+        if (auto *AI = dyn_cast<AllocaInst>(BI))
+          if (isa<Constant>(AI->getArraySize()))
+            continue;
+
         BI->setDebugLoc(TheCallDL);
       } else {
         BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext()));
@@ -486,33 +877,6 @@
   }
 }
 
-/// Returns a musttail call instruction if one immediately precedes the given
-/// return instruction with an optional bitcast instruction between them.
-static CallInst *getPrecedingMustTailCall(ReturnInst *RI) {
-  Instruction *Prev = RI->getPrevNode();
-  if (!Prev)
-    return nullptr;
-
-  if (Value *RV = RI->getReturnValue()) {
-    if (RV != Prev)
-      return nullptr;
-
-    // Look through the optional bitcast.
-    if (auto *BI = dyn_cast<BitCastInst>(Prev)) {
-      RV = BI->getOperand(0);
-      Prev = BI->getPrevNode();
-      if (!Prev || RV != Prev)
-        return nullptr;
-    }
-  }
-
-  if (auto *CI = dyn_cast<CallInst>(Prev)) {
-    if (CI->isMustTailCall())
-      return CI;
-  }
-  return nullptr;
-}
-
 /// InlineFunction - This function inlines the called function into the basic
 /// block of the caller.  This returns false if it is not possible to inline
 /// this call.  The program is still in a well defined state if this occurs
@@ -626,6 +990,11 @@
       VMap[I] = ActualArg;
     }
 
+    // Add alignment assumptions if necessary. We do this before the inlined
+    // instructions are actually cloned into the caller so that we can easily
+    // check what will be known at the start of the inlined code.
+    AddAlignmentAssumptions(CS, IFI);
+
     // We want the inliner to prune the code as it copies.  We would LOVE to
     // have no dead or constant instructions leftover after inlining occurs
     // (which can happen, e.g., because an argument was constant), but we'll be
@@ -648,6 +1017,17 @@
 
     // Update inlined instructions' line number information.
     fixupLineNumbers(Caller, FirstNewBlock, TheCall);
+
+    // Clone existing noalias metadata if necessary.
+    CloneAliasScopeMetadata(CS, VMap);
+
+    // Add noalias metadata if necessary.
+    AddAliasScopeMetadata(CS, VMap, IFI.DL, IFI.AA);
+
+    // FIXME: We could register any cloned assumptions instead of clearing the
+    // whole function's cache.
+    if (IFI.AT)
+      IFI.AT->forgetCachedAssumptions(Caller);
   }
 
   // If there are any alloca instructions in the block that used to be the entry
@@ -765,7 +1145,8 @@
       for (ReturnInst *RI : Returns) {
         // Don't insert llvm.lifetime.end calls between a musttail call and a
         // return.  The return kills all local allocas.
-        if (InlinedMustTailCalls && getPrecedingMustTailCall(RI))
+        if (InlinedMustTailCalls &&
+            RI->getParent()->getTerminatingMustTailCall())
           continue;
         IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize);
       }
@@ -789,7 +1170,7 @@
     for (ReturnInst *RI : Returns) {
       // Don't insert llvm.stackrestore calls between a musttail call and a
       // return.  The return will restore the stack pointer.
-      if (InlinedMustTailCalls && getPrecedingMustTailCall(RI))
+      if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall())
         continue;
       IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr);
     }
@@ -812,7 +1193,8 @@
     // Handle the returns preceded by musttail calls separately.
     SmallVector<ReturnInst *, 8> NormalReturns;
     for (ReturnInst *RI : Returns) {
-      CallInst *ReturnedMustTail = getPrecedingMustTailCall(RI);
+      CallInst *ReturnedMustTail =
+          RI->getParent()->getTerminatingMustTailCall();
       if (!ReturnedMustTail) {
         NormalReturns.push_back(RI);
         continue;
@@ -1016,7 +1398,7 @@
   // the entries are the same or undef).  If so, remove the PHI so it doesn't
   // block other optimizations.
   if (PHI) {
-    if (Value *V = SimplifyInstruction(PHI, IFI.DL)) {
+    if (Value *V = SimplifyInstruction(PHI, IFI.DL, nullptr, nullptr, IFI.AT)) {
       PHI->replaceAllUsesWith(V);
       PHI->eraseFromParent();
     }

diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp
index 9f91eeb..0ae746c 100644
--- a/lib/Transforms/Utils/IntegerDivision.cpp
+++ b/lib/Transforms/Utils/IntegerDivision.cpp

@@ -398,11 +398,13 @@
     Rem->dropAllReferences();
     Rem->eraseFromParent();
 
-    // If we didn't actually generate a udiv instruction, we're done
-    BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
-    if (!BO || BO->getOpcode() != Instruction::URem)
+    // If we didn't actually generate a urem instruction, we're done.
+    // This happens, for example, if the input were constant. In this case the
+    // Builder insertion point was unchanged.
+    if (Rem == Builder.GetInsertPoint())
       return true;
 
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
     Rem = BO;
   }
 
@@ -456,11 +458,13 @@
     Div->dropAllReferences();
     Div->eraseFromParent();
 
-    // If we didn't actually generate a udiv instruction, we're done
-    BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
-    if (!BO || BO->getOpcode() != Instruction::UDiv)
+    // If we didn't actually generate a udiv instruction, we're done.
+    // This happens, for example, if the input were constant. In this case the
+    // Builder insertion point was unchanged.
+    if (Div == Builder.GetInsertPoint())
       return true;
 
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
     Div = BO;
   }
 

diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index aedd787..c963c51 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp

@@ -128,7 +128,7 @@
       // Check to see if this branch is going to the same place as the default
       // dest.  If so, eliminate it as an explicit compare.
       if (i.getCaseSuccessor() == DefaultDest) {
-        MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+        MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
         unsigned NCases = SI->getNumCases();
         // Fold the case metadata into the default if there will be any branches
         // left, unless the metadata doesn't match the switch.
@@ -206,7 +206,7 @@
       BranchInst *NewBr = Builder.CreateCondBr(Cond,
                                                FirstCase.getCaseSuccessor(),
                                                SI->getDefaultDest());
-      MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+      MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
       if (MD && MD->getNumOperands() == 3) {
         ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
         ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
@@ -301,6 +301,14 @@
     if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
         II->getIntrinsicID() == Intrinsic::lifetime_end)
       return isa<UndefValue>(II->getArgOperand(1));
+
+    // Assumptions are dead if their condition is trivially true.
+    if (II->getIntrinsicID() == Intrinsic::assume) {
+      if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+        return !Cond->isZero();
+
+      return false;
+    }
   }
 
   if (isAllocLikeFn(I, TLI)) return true;
@@ -384,7 +392,7 @@
 
     // If we find an instruction more than once, we're on a cycle that
     // won't prove fruitful.
-    if (!Visited.insert(I)) {
+    if (!Visited.insert(I).second) {
       // Break the cycle and delete the instruction and its operands.
       I->replaceAllUsesWith(UndefValue::get(I->getType()));
       (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
@@ -509,6 +517,11 @@
   PredBB->getTerminator()->eraseFromParent();
   DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
 
+  // If the PredBB is the entry block of the function, move DestBB up to
+  // become the entry block after we erase PredBB.
+  if (PredBB == &DestBB->getParent()->getEntryBlock())
+    DestBB->moveAfter(PredBB);
+
   if (P) {
     if (DominatorTreeWrapperPass *DTWP =
             P->getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
@@ -926,13 +939,16 @@
 /// and it is more than the alignment of the ultimate object, see if we can
 /// increase the alignment of the ultimate object, making this check succeed.
 unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
-                                          const DataLayout *DL) {
+                                          const DataLayout *DL,
+                                          AssumptionTracker *AT,
+                                          const Instruction *CxtI,
+                                          const DominatorTree *DT) {
   assert(V->getType()->isPointerTy() &&
          "getOrEnforceKnownAlignment expects a pointer!");
   unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(V->getType()) : 64;
 
   APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-  computeKnownBits(V, KnownZero, KnownOne, DL);
+  computeKnownBits(V, KnownZero, KnownOne, DL, 0, AT, CxtI, DT);
   unsigned TrailZ = KnownZero.countTrailingOnes();
 
   // Avoid trouble with ridiculously large TrailZ values, such as
@@ -977,6 +993,7 @@
 bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
                                            StoreInst *SI, DIBuilder &Builder) {
   DIVariable DIVar(DDI->getVariable());
+  DIExpression DIExpr(DDI->getExpression());
   assert((!DIVar || DIVar.isVariable()) &&
          "Variable in DbgDeclareInst should be either null or a DIVariable.");
   if (!DIVar)
@@ -994,9 +1011,10 @@
   if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
     ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
   if (ExtendedArg)
-    DbgVal = Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, SI);
+    DbgVal = Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, DIExpr, SI);
   else
-    DbgVal = Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, SI);
+    DbgVal = Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar,
+                                             DIExpr, SI);
   DbgVal->setDebugLoc(DDI->getDebugLoc());
   return true;
 }
@@ -1006,6 +1024,7 @@
 bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
                                            LoadInst *LI, DIBuilder &Builder) {
   DIVariable DIVar(DDI->getVariable());
+  DIExpression DIExpr(DDI->getExpression());
   assert((!DIVar || DIVar.isVariable()) &&
          "Variable in DbgDeclareInst should be either null or a DIVariable.");
   if (!DIVar)
@@ -1015,8 +1034,7 @@
     return true;
 
   Instruction *DbgVal =
-    Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0,
-                                    DIVar, LI);
+      Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, DIExpr, LI);
   DbgVal->setDebugLoc(DDI->getDebugLoc());
   return true;
 }
@@ -1056,14 +1074,14 @@
         else if (LoadInst *LI = dyn_cast<LoadInst>(U))
           ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
         else if (CallInst *CI = dyn_cast<CallInst>(U)) {
-	  // This is a call by-value or some other instruction that
-	  // takes a pointer to the variable. Insert a *value*
-	  // intrinsic that describes the alloca.
-	  auto DbgVal =
-	    DIB.insertDbgValueIntrinsic(AI, 0,
-					DIVariable(DDI->getVariable()), CI);
-	  DbgVal->setDebugLoc(DDI->getDebugLoc());
-	}
+          // This is a call by-value or some other instruction that
+          // takes a pointer to the variable. Insert a *value*
+          // intrinsic that describes the alloca.
+          auto DbgVal = DIB.insertDbgValueIntrinsic(
+              AI, 0, DIVariable(DDI->getVariable()),
+              DIExpression(DDI->getExpression()), CI);
+          DbgVal->setDebugLoc(DDI->getDebugLoc());
+        }
       DDI->eraseFromParent();
     }
   }
@@ -1087,6 +1105,7 @@
   if (!DDI)
     return false;
   DIVariable DIVar(DDI->getVariable());
+  DIExpression DIExpr(DDI->getExpression());
   assert((!DIVar || DIVar.isVariable()) &&
          "Variable in DbgDeclareInst should be either null or a DIVariable.");
   if (!DIVar)
@@ -1096,24 +1115,19 @@
   // "deref" operation to a list of address elements, as new llvm.dbg.declare
   // will take a value storing address of the memory for variable, not
   // alloca itself.
-  Type *Int64Ty = Type::getInt64Ty(AI->getContext());
-  SmallVector<Value*, 4> NewDIVarAddress;
-  if (DIVar.hasComplexAddress()) {
-    for (unsigned i = 0, n = DIVar.getNumAddrElements(); i < n; ++i) {
-      NewDIVarAddress.push_back(
-          ConstantInt::get(Int64Ty, DIVar.getAddrElement(i)));
+  SmallVector<int64_t, 4> NewDIExpr;
+  if (DIExpr) {
+    for (unsigned i = 0, n = DIExpr.getNumElements(); i < n; ++i) {
+      NewDIExpr.push_back(DIExpr.getElement(i));
     }
   }
-  NewDIVarAddress.push_back(ConstantInt::get(Int64Ty, DIBuilder::OpDeref));
-  DIVariable NewDIVar = Builder.createComplexVariable(
-      DIVar.getTag(), DIVar.getContext(), DIVar.getName(),
-      DIVar.getFile(), DIVar.getLineNumber(), DIVar.getType(),
-      NewDIVarAddress, DIVar.getArgNumber());
+  NewDIExpr.push_back(dwarf::DW_OP_deref);
 
   // Insert llvm.dbg.declare in the same basic block as the original alloca,
   // and remove old llvm.dbg.declare.
   BasicBlock *BB = AI->getParent();
-  Builder.insertDeclare(NewAllocaAddress, NewDIVar, BB);
+  Builder.insertDeclare(NewAllocaAddress, DIVar,
+                        Builder.createExpression(NewDIExpr), BB);
   DDI->eraseFromParent();
   return true;
 }
@@ -1165,7 +1179,7 @@
 }
 
 static bool markAliveBlocks(BasicBlock *BB,
-                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
+                            SmallPtrSetImpl<BasicBlock*> &Reachable) {
 
   SmallVector<BasicBlock*, 128> Worklist;
   Worklist.push_back(BB);
@@ -1178,6 +1192,26 @@
     // instructions into LLVM unreachable insts.  The instruction combining pass
     // canonicalizes unreachable insts into stores to null or undef.
     for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
+      // Assumptions that are known to be false are equivalent to unreachable.
+      // Also, if the condition is undefined, then we make the choice most
+      // beneficial to the optimizer, and choose that to also be unreachable.
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BBI))
+        if (II->getIntrinsicID() == Intrinsic::assume) {
+          bool MakeUnreachable = false;
+          if (isa<UndefValue>(II->getArgOperand(0)))
+            MakeUnreachable = true;
+          else if (ConstantInt *Cond =
+                   dyn_cast<ConstantInt>(II->getArgOperand(0)))
+            MakeUnreachable = Cond->isZero();
+
+          if (MakeUnreachable) {
+            // Don't insert a call to llvm.trap right before the unreachable.
+            changeToUnreachable(BBI, false);
+            Changed = true;
+            break;
+          }
+        }
+
       if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
         if (CI->doesNotReturn()) {
           // If we found a call to a no-return function, insert an unreachable
@@ -1232,7 +1266,7 @@
 
     Changed |= ConstantFoldTerminator(BB, true);
     for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-      if (Reachable.insert(*SI))
+      if (Reachable.insert(*SI).second)
         Worklist.push_back(*SI);
   } while (!Worklist.empty());
   return Changed;
@@ -1272,3 +1306,43 @@
 
   return true;
 }
+
+void llvm::combineMetadata(Instruction *K, const Instruction *J, ArrayRef<unsigned> KnownIDs) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+  K->dropUnknownMetadata(KnownIDs);
+  K->getAllMetadataOtherThanDebugLoc(Metadata);
+  for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
+    unsigned Kind = Metadata[i].first;
+    MDNode *JMD = J->getMetadata(Kind);
+    MDNode *KMD = Metadata[i].second;
+
+    switch (Kind) {
+      default:
+        K->setMetadata(Kind, nullptr); // Remove unknown metadata
+        break;
+      case LLVMContext::MD_dbg:
+        llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
+      case LLVMContext::MD_tbaa:
+        K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
+        break;
+      case LLVMContext::MD_alias_scope:
+      case LLVMContext::MD_noalias:
+        K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
+        break;
+      case LLVMContext::MD_range:
+        K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
+        break;
+      case LLVMContext::MD_fpmath:
+        K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
+        break;
+      case LLVMContext::MD_invariant_load:
+        // Only set the !invariant.load if it is present in both instructions.
+        K->setMetadata(Kind, JMD);
+        break;
+      case LLVMContext::MD_nonnull:
+        // Only set the !nonnull if it is present in both instructions.
+        K->setMetadata(Kind, JMD);
+        break;
+    }
+  }
+}

diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index ef42291..af0501f 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp

@@ -44,6 +44,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -173,8 +174,7 @@
 
   if (Exit->isLandingPad()) {
     SmallVector<BasicBlock*, 2> NewBBs;
-    SplitLandingPadPredecessors(Exit, ArrayRef<BasicBlock*>(&LoopBlocks[0],
-                                                            LoopBlocks.size()),
+    SplitLandingPadPredecessors(Exit, LoopBlocks,
                                 ".loopexit", ".nonloopexit",
                                 PP, NewBBs);
     NewExitBB = NewBBs[0];
@@ -209,11 +209,12 @@
 /// \brief The first part of loop-nestification is to find a PHI node that tells
 /// us how to partition the loops.
 static PHINode *findPHIToPartitionLoops(Loop *L, AliasAnalysis *AA,
-                                        DominatorTree *DT) {
+                                        DominatorTree *DT,
+                                        AssumptionTracker *AT) {
   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
     PHINode *PN = cast<PHINode>(I);
     ++I;
-    if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT)) {
+    if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AT)) {
       // This is a degenerate PHI already, don't modify it!
       PN->replaceAllUsesWith(V);
       if (AA) AA->deleteValue(PN);
@@ -252,7 +253,8 @@
 ///
 static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
                                 AliasAnalysis *AA, DominatorTree *DT,
-                                LoopInfo *LI, ScalarEvolution *SE, Pass *PP) {
+                                LoopInfo *LI, ScalarEvolution *SE, Pass *PP,
+                                AssumptionTracker *AT) {
   // Don't try to separate loops without a preheader.
   if (!Preheader)
     return nullptr;
@@ -261,7 +263,7 @@
   assert(!L->getHeader()->isLandingPad() &&
          "Can't insert backedge to landing pad");
 
-  PHINode *PN = findPHIToPartitionLoops(L, AA, DT);
+  PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AT);
   if (!PN) return nullptr;  // No known way to partition.
 
   // Pull out all predecessors that have varying values in the loop.  This
@@ -475,7 +477,7 @@
 static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
                             AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
                             ScalarEvolution *SE, Pass *PP,
-                            const DataLayout *DL) {
+                            const DataLayout *DL, AssumptionTracker *AT) {
   bool Changed = false;
 ReprocessLoop:
 
@@ -496,20 +498,19 @@
     }
 
     // Delete each unique out-of-loop (and thus dead) predecessor.
-    for (SmallPtrSet<BasicBlock*, 4>::iterator I = BadPreds.begin(),
-         E = BadPreds.end(); I != E; ++I) {
+    for (BasicBlock *P : BadPreds) {
 
       DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
-                   << (*I)->getName() << "\n");
+                   << P->getName() << "\n");
 
       // Inform each successor of each dead pred.
-      for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI)
-        (*SI)->removePredecessor(*I);
+      for (succ_iterator SI = succ_begin(P), SE = succ_end(P); SI != SE; ++SI)
+        (*SI)->removePredecessor(P);
       // Zap the dead pred's terminator and replace it with unreachable.
-      TerminatorInst *TI = (*I)->getTerminator();
+      TerminatorInst *TI = P->getTerminator();
        TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
-      (*I)->getTerminator()->eraseFromParent();
-      new UnreachableInst((*I)->getContext(), *I);
+      P->getTerminator()->eraseFromParent();
+      new UnreachableInst(P->getContext(), P);
       Changed = true;
     }
   }
@@ -582,7 +583,8 @@
     // this for loops with a giant number of backedges, just factor them into a
     // common backedge instead.
     if (L->getNumBackEdges() < 8) {
-      if (Loop *OuterL = separateNestedLoop(L, Preheader, AA, DT, LI, SE, PP)) {
+      if (Loop *OuterL = separateNestedLoop(L, Preheader, AA, DT, LI, SE,
+                                            PP, AT)) {
         ++NumNested;
         // Enqueue the outer loop as it should be processed next in our
         // depth-first nest walk.
@@ -612,7 +614,7 @@
   PHINode *PN;
   for (BasicBlock::iterator I = L->getHeader()->begin();
        (PN = dyn_cast<PHINode>(I++)); )
-    if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT)) {
+    if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, DT, AT)) {
       if (AA) AA->deleteValue(PN);
       if (SE) SE->forgetValue(PN);
       PN->replaceAllUsesWith(V);
@@ -712,7 +714,7 @@
 
 bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Pass *PP,
                         AliasAnalysis *AA, ScalarEvolution *SE,
-                        const DataLayout *DL) {
+                        const DataLayout *DL, AssumptionTracker *AT) {
   bool Changed = false;
 
   // Worklist maintains our depth-first queue of loops in this nest to process.
@@ -730,7 +732,7 @@
 
   while (!Worklist.empty())
     Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, AA, DT, LI,
-                               SE, PP, DL);
+                               SE, PP, DL, AT);
 
   return Changed;
 }
@@ -749,10 +751,13 @@
     LoopInfo *LI;
     ScalarEvolution *SE;
     const DataLayout *DL;
+    AssumptionTracker *AT;
 
     bool runOnFunction(Function &F) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionTracker>();
+
       // We need loop information to identify the loops...
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addPreserved<DominatorTreeWrapperPass>();
@@ -773,11 +778,12 @@
 
 char LoopSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
-                "Canonicalize natural loops", true, false)
+                "Canonicalize natural loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
-                "Canonicalize natural loops", true, false)
+                "Canonicalize natural loops", false, false)
 
 // Publicly exposed interface to pass...
 char &llvm::LoopSimplifyID = LoopSimplify::ID;
@@ -794,10 +800,11 @@
   SE = getAnalysisIfAvailable<ScalarEvolution>();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
   DL = DLP ? &DLP->getDataLayout() : nullptr;
+  AT = &getAnalysis<AssumptionTracker>();
 
   // Simplify each loop nest in the function.
   for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
-    Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL);
+    Changed |= simplifyLoop(*I, DT, LI, this, AA, SE, DL, AT);
 
   return Changed;
 }

diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index c86b82c..0e1baa1 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp

@@ -17,7 +17,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -64,10 +66,15 @@
 
 /// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it
 /// only has one predecessor, and that predecessor only has one successor.
-/// The LoopInfo Analysis that is passed will be kept consistent.
-/// Returns the new combined block.
-static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
-                                            LPPassManager *LPM) {
+/// The LoopInfo Analysis that is passed will be kept consistent.  If folding is
+/// successful, references to the containing loop must be removed from
+/// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have
+/// references to the eliminated BB.  The argument ForgottenLoops contains a set
+/// of loops that have already been forgotten to prevent redundant, expensive
+/// calls to ScalarEvolution::forgetLoop.  Returns the new combined block.
+static BasicBlock *
+FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, LPPassManager *LPM,
+                         SmallPtrSetImpl<Loop *> &ForgottenLoops) {
   // Merge basic blocks into their predecessor if there is only one distinct
   // pred, and if there is only one distinct successor of the predecessor, and
   // if there are no PHI nodes.
@@ -104,8 +111,10 @@
   // ScalarEvolution holds references to loop exit blocks.
   if (LPM) {
     if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) {
-      if (Loop *L = LI->getLoopFor(BB))
-        SE->forgetLoop(L);
+      if (Loop *L = LI->getLoopFor(BB)) {
+        if (ForgottenLoops.insert(L).second)
+          SE->forgetLoop(L);
+      }
     }
   }
   LI->removeBlock(BB);
@@ -146,7 +155,8 @@
 /// available from the Pass it must also preserve those analyses.
 bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
                       bool AllowRuntime, unsigned TripMultiple,
-                      LoopInfo *LI, Pass *PP, LPPassManager *LPM) {
+                      LoopInfo *LI, Pass *PP, LPPassManager *LPM,
+                      AssumptionTracker *AT) {
   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader) {
     DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n");
@@ -214,11 +224,10 @@
 
   // Notify ScalarEvolution that the loop will be substantially changed,
   // if not outright eliminated.
-  if (PP) {
-    ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
-    if (SE)
-      SE->forgetLoop(L);
-  }
+  ScalarEvolution *SE =
+      PP ? PP->getAnalysisIfAvailable<ScalarEvolution>() : nullptr;
+  if (SE)
+    SE->forgetLoop(L);
 
   // If we know the trip count, we know the multiple...
   unsigned BreakoutTrip = 0;
@@ -292,15 +301,45 @@
 
   for (unsigned It = 1; It != Count; ++It) {
     std::vector<BasicBlock*> NewBlocks;
+    SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+    NewLoops[L] = L;
 
     for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
       ValueToValueMapTy VMap;
       BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
       Header->getParent()->getBasicBlockList().push_back(New);
 
-      // Loop over all of the PHI nodes in the block, changing them to use the
-      // incoming values from the previous block.
+      // Tell LI about New.
+      if (*BB == Header) {
+        assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop");
+        L->addBasicBlockToLoop(New, LI->getBase());
+      } else {
+        // Figure out which loop New is in.
+        const Loop *OldLoop = LI->getLoopFor(*BB);
+        assert(OldLoop && "Should (at least) be in the loop being unrolled!");
+
+        Loop *&NewLoop = NewLoops[OldLoop];
+        if (!NewLoop) {
+          // Found a new sub-loop.
+          assert(*BB == OldLoop->getHeader() &&
+                 "Header should be first in RPO");
+
+          Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
+          assert(NewLoopParent &&
+                 "Expected parent loop before sub-loop in RPO");
+          NewLoop = new Loop;
+          NewLoopParent->addChildLoop(NewLoop);
+
+          // Forget the old loop, since its inputs may have changed.
+          if (SE)
+            SE->forgetLoop(OldLoop);
+        }
+        NewLoop->addBasicBlockToLoop(New, LI->getBase());
+      }
+
       if (*BB == Header)
+        // Loop over all of the PHI nodes in the block, changing them to use
+        // the incoming values from the previous block.
         for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
           PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]);
           Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
@@ -317,8 +356,6 @@
            VI != VE; ++VI)
         LastValueMap[VI->first] = VI->second;
 
-      L->addBasicBlockToLoop(New, LI->getBase());
-
       // Add phi entries for newly created values to all exit blocks.
       for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB);
            SI != SE; ++SI) {
@@ -423,15 +460,21 @@
   }
 
   // Merge adjacent basic blocks, if possible.
+  SmallPtrSet<Loop *, 4> ForgottenLoops;
   for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
     BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
     if (Term->isUnconditional()) {
       BasicBlock *Dest = Term->getSuccessor(0);
-      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM))
+      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM,
+                                                      ForgottenLoops))
         std::replace(Latches.begin(), Latches.end(), Dest, Fold);
     }
   }
 
+  // FIXME: We could register any cloned assumptions instead of clearing the
+  // whole function's cache.
+  AT->forgetCachedAssumptions(F);
+
   DominatorTree *DT = nullptr;
   if (PP) {
     // FIXME: Reconstruct dom info, because it is not preserved properly.
@@ -443,7 +486,6 @@
     }
 
     // Simplify any new induction variables in the partially unrolled loop.
-    ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
     if (SE && !CompletelyUnroll) {
       SmallVector<WeakVH, 16> DeadInsts;
       simplifyLoopIVs(L, SE, LPM, DeadInsts);
@@ -492,8 +534,7 @@
     if (OuterL) {
       DataLayoutPass *DLP = PP->getAnalysisIfAvailable<DataLayoutPass>();
       const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
-      ScalarEvolution *SE = PP->getAnalysisIfAvailable<ScalarEvolution>();
-      simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL);
+      simplifyLoop(OuterL, DT, LI, PP, /*AliasAnalysis*/ nullptr, SE, DL, AT);
 
       // LCSSA must be performed on the outermost affected loop. The unrolled
       // loop's last loop latch is guaranteed to be in the outermost loop after

diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index a96c46a..3d91336 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp

@@ -28,6 +28,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -57,7 +58,7 @@
 static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
                           BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
                           BasicBlock *OrigPH, BasicBlock *NewPH,
-                          ValueToValueMapTy &LVMap, Pass *P) {
+                          ValueToValueMapTy &VMap, Pass *P) {
   BasicBlock *Latch = L->getLoopLatch();
   assert(Latch && "Loop must have a latch");
 
@@ -86,7 +87,7 @@
       Value *V = PN->getIncomingValueForBlock(Latch);
       if (Instruction *I = dyn_cast<Instruction>(V)) {
         if (L->contains(I)) {
-          V = LVMap[I];
+          V = VMap[I];
         }
       }
       // Adding a value to the new PHI node from the last prolog block
@@ -127,76 +128,122 @@
 }
 
 /// Create a clone of the blocks in a loop and connect them together.
-/// This function doesn't create a clone of the loop structure.
+/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new
+/// loop will be created including all cloned blocks, and a new counter in it
+/// counts NewIter down to 0.
 ///
-/// There are two value maps that are defined and used.  VMap is
-/// for the values in the current loop instance.  LVMap contains
-/// the values from the last loop instance.  We need the LVMap values
-/// to update the initial values for the current loop instance.
-///
-static void CloneLoopBlocks(Loop *L,
-                            bool FirstCopy,
-                            BasicBlock *InsertTop,
-                            BasicBlock *InsertBot,
+static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
+                            BasicBlock *InsertTop, BasicBlock *InsertBot,
                             std::vector<BasicBlock *> &NewBlocks,
-                            LoopBlocksDFS &LoopBlocks,
-                            ValueToValueMapTy &VMap,
-                            ValueToValueMapTy &LVMap,
+                            LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
                             LoopInfo *LI) {
-
   BasicBlock *Preheader = L->getLoopPreheader();
   BasicBlock *Header = L->getHeader();
   BasicBlock *Latch = L->getLoopLatch();
   Function *F = Header->getParent();
   LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
   LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
+  Loop *NewLoop = 0;
+  Loop *ParentLoop = L->getParentLoop();
+  if (!UnrollProlog) {
+    NewLoop = new Loop();
+    if (ParentLoop)
+      ParentLoop->addChildLoop(NewLoop);
+    else
+      LI->addTopLevelLoop(NewLoop);
+  }
+
   // For each block in the original loop, create a new copy,
   // and update the value map with the newly created values.
   for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
-    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".unr", F);
+    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F);
     NewBlocks.push_back(NewBB);
 
-    if (Loop *ParentLoop = L->getParentLoop())
+    if (NewLoop)
+      NewLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+    else if (ParentLoop)
       ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase());
 
     VMap[*BB] = NewBB;
     if (Header == *BB) {
       // For the first block, add a CFG connection to this newly
-      // created block
+      // created block.
       InsertTop->getTerminator()->setSuccessor(0, NewBB);
 
-      // Change the incoming values to the ones defined in the
-      // previously cloned loop.
-      for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
-        PHINode *NewPHI = cast<PHINode>(VMap[I]);
-        if (FirstCopy) {
-          // We replace the first phi node with the value from the preheader
-          VMap[I] = NewPHI->getIncomingValueForBlock(Preheader);
-          NewBB->getInstList().erase(NewPHI);
-        } else {
-          // Update VMap with values from the previous block
-          unsigned idx = NewPHI->getBasicBlockIndex(Latch);
-          Value *InVal = NewPHI->getIncomingValue(idx);
-          if (Instruction *I = dyn_cast<Instruction>(InVal))
-            if (L->contains(I))
-              InVal = LVMap[InVal];
-          NewPHI->setIncomingValue(idx, InVal);
-          NewPHI->setIncomingBlock(idx, InsertTop);
+    }
+    if (Latch == *BB) {
+      // For the last block, if UnrollProlog is true, create a direct jump to
+      // InsertBot. If not, create a loop back to cloned head.
+      VMap.erase((*BB)->getTerminator());
+      BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
+      BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
+      if (UnrollProlog) {
+        LatchBR->eraseFromParent();
+        BranchInst::Create(InsertBot, NewBB);
+      } else {
+        PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter",
+                                          FirstLoopBB->getFirstNonPHI());
+        IRBuilder<> Builder(LatchBR);
+        Value *IdxSub =
+            Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+                              NewIdx->getName() + ".sub");
+        Value *IdxCmp =
+            Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
+        BranchInst::Create(FirstLoopBB, InsertBot, IdxCmp, NewBB);
+        NewIdx->addIncoming(NewIter, InsertTop);
+        NewIdx->addIncoming(IdxSub, NewBB);
+        LatchBR->eraseFromParent();
+      }
+    }
+  }
+
+  // Change the incoming values to the ones defined in the preheader or
+  // cloned loop.
+  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+    PHINode *NewPHI = cast<PHINode>(VMap[I]);
+    if (UnrollProlog) {
+      VMap[I] = NewPHI->getIncomingValueForBlock(Preheader);
+      cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+    } else {
+      unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+      NewPHI->setIncomingBlock(idx, InsertTop);
+      BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
+      idx = NewPHI->getBasicBlockIndex(Latch);
+      Value *InVal = NewPHI->getIncomingValue(idx);
+      NewPHI->setIncomingBlock(idx, NewLatch);
+      if (VMap[InVal])
+        NewPHI->setIncomingValue(idx, VMap[InVal]);
+    }
+  }
+  if (NewLoop) {
+    // Add unroll disable metadata to disable future unrolling for this loop.
+    SmallVector<Value *, 4> Vals;
+    // Reserve first location for self reference to the LoopID metadata node.
+    Vals.push_back(nullptr);
+    MDNode *LoopID = NewLoop->getLoopID();
+    if (LoopID) {
+      // First remove any existing loop unrolling metadata.
+      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+        bool IsUnrollMetadata = false;
+        MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+        if (MD) {
+          const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+          IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
         }
+        if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i));
       }
     }
 
-    if (Latch == *BB) {
-      VMap.erase((*BB)->getTerminator());
-      NewBB->getTerminator()->eraseFromParent();
-      BranchInst::Create(InsertBot, NewBB);
-    }
-  }
-  // LastValueMap is updated with the values for the current loop
-  // which are used the next time this function is called.
-  for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
-       VI != VE; ++VI) {
-    LVMap[VI->first] = VI->second;
+    LLVMContext &Context = NewLoop->getHeader()->getContext();
+    SmallVector<Value *, 1> DisableOperands;
+    DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
+    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+    Vals.push_back(DisableNode);
+
+    MDNode *NewLoopID = MDNode::get(Context, Vals);
+    // Set operand 0 to refer to the loop id itself.
+    NewLoopID->replaceOperandWith(0, NewLoopID);
+    NewLoop->setLoopID(NewLoopID);
   }
 }
 
@@ -212,18 +259,16 @@
 /// instruction in SimplifyCFG.cpp.  Then, the backend decides how code for
 /// the switch instruction is generated.
 ///
-///    extraiters = tripcount % loopfactor
-///    if (extraiters == 0) jump Loop:
-///    if (extraiters == loopfactor) jump L1
-///    if (extraiters == loopfactor-1) jump L2
-///    ...
-///    L1:  LoopBody;
-///    L2:  LoopBody;
-///    ...
-///    if tripcount < loopfactor jump End
-///    Loop:
-///    ...
-///    End:
+///        extraiters = tripcount % loopfactor
+///        if (extraiters == 0) jump Loop:
+///        else jump Prol
+/// Prol:  LoopBody;
+///        extraiters -= 1                 // Omitted if unroll factor is 2.
+///        if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
+///        if (tripcount < loopfactor) jump End
+/// Loop:
+/// ...
+/// End:
 ///
 bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
                                    LPPassManager *LPM) {
@@ -250,6 +295,10 @@
   if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy())
     return false;
 
+  // If BECount is all-ones (unsigned max), BECount + 1 would wrap around to 0.
+  if (BECount->isAllOnesValue())
+    return false;
+
   // Add 1 since the backedge count doesn't include the first loop iteration
   const SCEV *TripCountSC =
     SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1));
@@ -284,26 +333,21 @@
   IRBuilder<> B(PreHeaderBR);
   Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
 
-  // Check if for no extra iterations, then jump to unrolled loop.  We have to
-  // check that the trip count computation didn't overflow when adding one to
-  // the backedge taken count.
+  // Check whether there are extra iterations; if not, skip the cloned prolog.
+  // We have to check that the trip count computation didn't overflow when
+  // adding one to the backedge taken count.
   Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod");
   Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow");
   Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or");
 
-  // Branch to either the extra iterations or the unrolled loop
+  // Branch to either the extra iterations or the cloned/unrolled loop
   // We will fix up the true branch label when adding loop body copies
   BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR);
   assert(PreHeaderBR->isUnconditional() &&
          PreHeaderBR->getSuccessor(0) == PEnd &&
          "CFG edges in Preheader are not correct");
   PreHeaderBR->eraseFromParent();
-
-  ValueToValueMapTy LVMap;
   Function *F = Header->getParent();
-  // These variables are used to update the CFG links in each iteration
-  BasicBlock *CompareBB = nullptr;
-  BasicBlock *LastLoopBB = PH;
   // Get an ordered list of blocks in the loop to help with the ordering of the
   // cloned blocks in the prolog code
   LoopBlocksDFS LoopBlocks(L);
@@ -314,62 +358,39 @@
   // and generate a condition that branches to the copy depending on the
   // number of 'left over' iterations.
   //
-  for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) {
-    std::vector<BasicBlock*> NewBlocks;
-    ValueToValueMapTy VMap;
+  std::vector<BasicBlock *> NewBlocks;
+  ValueToValueMapTy VMap;
 
-    // Clone all the basic blocks in the loop, but we don't clone the loop
-    // This function adds the appropriate CFG connections.
-    CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks,
-                    LoopBlocks, VMap, LVMap, LI);
-    LastLoopBB = cast<BasicBlock>(VMap[Latch]);
+  // If unroll count is 2 and we can't overflow in tripcount computation (which
+  // is BECount + 1), then we don't need a loop for prologue, and we can unroll
+  // it. We can be sure that we don't overflow only if tripcount is a constant.
+  bool UnrollPrologue = (Count == 2 && isa<ConstantInt>(TripCount));
 
-    // Insert the cloned blocks into function just before the original loop
-    F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(),
-                                  NewBlocks[0], F->end());
+  // Clone all the basic blocks in the loop. If Count is 2, we don't clone
+  // the loop, otherwise we create a cloned loop to execute the extra
+  // iterations. This function adds the appropriate CFG connections.
+  CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks,
+                  VMap, LI);
 
-    // Generate the code for the comparison which determines if the loop
-    // prolog code needs to be executed.
-    if (leftOverIters == Count-1) {
-      // There is no compare block for the fall-thru case when for the last
-      // left over iteration
-      CompareBB = NewBlocks[0];
-    } else {
-      // Create a new block for the comparison
-      BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp",
-                                             F, CompareBB);
-      if (Loop *ParentLoop = L->getParentLoop()) {
-        // Add the new block to the parent loop, if needed
-        ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase());
-      }
+  // Insert the cloned blocks into function just before the original loop
+  F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0],
+                                F->end());
 
-      // The comparison w/ the extra iteration value and branch
-      Type *CountTy = TripCount->getType();
-      Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal,
-                                      ConstantInt::get(CountTy, leftOverIters),
-                                      "un.tmp");
-      // Branch to either the extra iterations or the unrolled loop
-      BranchInst::Create(NewBlocks[0], CompareBB,
-                         BranchVal, NewBB);
-      CompareBB = NewBB;
-      PH->getTerminator()->setSuccessor(0, NewBB);
-      VMap[NewPH] = CompareBB;
-    }
-
-    // Rewrite the cloned instruction operands to use the values
-    // created when the clone is created.
-    for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
-      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
-             E = NewBlocks[i]->end(); I != E; ++I) {
-        RemapInstruction(I, VMap,
-                         RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
-      }
+  // Rewrite the cloned instruction operands to use the values
+  // created when the clone is created.
+  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
+    for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+                              E = NewBlocks[i]->end();
+         I != E; ++I) {
+      RemapInstruction(I, VMap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
     }
   }
 
   // Connect the prolog code to the original loop and update the
   // PHI functions.
-  ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap,
+  BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
+  ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
                 LPM->getAsPass());
   NumRuntimeUnrolled++;
   return true;

diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index eac693b..a0105c2 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp

@@ -67,8 +67,8 @@
 
     BasicBlock *switchConvert(CaseItr Begin, CaseItr End,
                               ConstantInt *LowerBound, ConstantInt *UpperBound,
-                              Value *Val, BasicBlock *OrigBlock,
-                              BasicBlock *Default);
+                              Value *Val, BasicBlock *Predecessor,
+                              BasicBlock *OrigBlock, BasicBlock *Default);
     BasicBlock *newLeafBlock(CaseRange &Leaf, Value *Val, BasicBlock *OrigBlock,
                              BasicBlock *Default);
     unsigned Clusterify(CaseVector &Cases, SwitchInst *SI);
@@ -131,6 +131,28 @@
   return O << "]";
 }
 
+/// \brief Retarget one PHI edge per PHI node in \p SuccBB from \p OrigBB to
+/// \p NewBB. Only the first incoming entry naming \p OrigBB is rewritten in
+/// each PHI; later calls to this function take care of the remaining entries.
+///
+/// A switch may have several edges into the same successor, so that block's
+/// PHIs list the switch block more than once (with the same value). After the
+/// switch is converted, each of those edges comes from a different block.
+static void fixPhis(BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB) {
+  BasicBlock::iterator PhiEnd = SuccBB->getFirstNonPHI();
+  for (BasicBlock::iterator It = SuccBB->begin(); It != PhiEnd; ++It) {
+    PHINode *Phi = cast<PHINode>(It);
+    unsigned NumIncoming = Phi->getNumIncomingValues();
+    // Rewrite only the first occurrence of OrigBB, then move to the next PHI.
+    for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
+      if (Phi->getIncomingBlock(Idx) != OrigBB)
+        continue;
+      Phi->setIncomingBlock(Idx, NewBB);
+      break;
+    }
+  }
+}
+
 // switchConvert - Convert the switch statement into a binary lookup of
 // the case values. The function recursively builds this tree.
 // LowerBound and UpperBound are used to keep track of the bounds for Val
@@ -139,6 +161,7 @@
 BasicBlock *LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
                                        ConstantInt *LowerBound,
                                        ConstantInt *UpperBound, Value *Val,
+                                       BasicBlock *Predecessor,
                                        BasicBlock *OrigBlock,
                                        BasicBlock *Default) {
   unsigned Size = End - Begin;
@@ -149,6 +172,7 @@
     // emitting the code that checks if the value actually falls in the range
     // because the bounds already tell us so.
     if (Begin->Low == LowerBound && Begin->High == UpperBound) {
+      fixPhis(Begin->BB, OrigBlock, Predecessor);
       return Begin->BB;
     }
     return newLeafBlock(*Begin, Val, OrigBlock, Default);
@@ -200,21 +224,25 @@
           dbgs() << "NONE\n";
         });
 
-  BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound,
-                                      NewUpperBound, Val, OrigBlock, Default);
-  BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound,
-                                      UpperBound, Val, OrigBlock, Default);
-
   // Create a new node that checks if the value is < pivot. Go to the
   // left branch if it is and right branch if not.
   Function* F = OrigBlock->getParent();
   BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock");
-  Function::iterator FI = OrigBlock;
-  F->getBasicBlockList().insert(++FI, NewNode);
 
   ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
                                 Val, Pivot.Low, "Pivot");
+
+  BasicBlock *LBranch = switchConvert(LHS.begin(), LHS.end(), LowerBound,
+                                      NewUpperBound, Val, NewNode, OrigBlock,
+                                      Default);
+  BasicBlock *RBranch = switchConvert(RHS.begin(), RHS.end(), NewLowerBound,
+                                      UpperBound, Val, NewNode, OrigBlock,
+                                      Default);
+
+  Function::iterator FI = OrigBlock;
+  F->getBasicBlockList().insert(++FI, NewNode);
   NewNode->getInstList().push_back(Comp);
+
   BranchInst::Create(LBranch, RBranch, Comp, NewNode);
   return NewNode;
 }
@@ -386,7 +414,7 @@
   }
   BasicBlock *SwitchBlock =
       switchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
-                    OrigBlock, NewDefault);
+                    OrigBlock, OrigBlock, NewDefault);
 
   // Branch to our shiny new if-then stuff...
   BranchInst::Create(SwitchBlock, OrigBlock);

diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index 189caa7..477ee7a 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp

@@ -14,6 +14,7 @@
 
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
@@ -38,6 +39,7 @@
     bool runOnFunction(Function &F) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AssumptionTracker>();
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.setPreservesCFG();
       // This is a cluster of orthogonal Transforms
@@ -51,6 +53,7 @@
 char PromotePass::ID = 0;
 INITIALIZE_PASS_BEGIN(PromotePass, "mem2reg", "Promote Memory to Register",
                 false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_END(PromotePass, "mem2reg", "Promote Memory to Register",
                 false, false)
@@ -63,6 +66,7 @@
   bool Changed  = false;
 
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
 
   while (1) {
     Allocas.clear();
@@ -76,7 +80,7 @@
 
     if (Allocas.empty()) break;
 
-    PromoteMemToReg(Allocas, DT);
+    PromoteMemToReg(Allocas, DT, nullptr, AT);
     NumPromoted += Allocas.size();
     Changed = true;
   }

diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index d9dbbca..35c701e 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp

@@ -78,7 +78,7 @@
 }
 
 GlobalVariable *
-llvm::collectUsedGlobalVariables(Module &M, SmallPtrSet<GlobalValue *, 8> &Set,
+llvm::collectUsedGlobalVariables(Module &M, SmallPtrSetImpl<GlobalValue *> &Set,
                                  bool CompilerUsed) {
   const char *Name = CompilerUsed ? "llvm.compiler.used" : "llvm.used";
   GlobalVariable *GV = M.getGlobalVariable(Name);

diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 06d73fe..1fd7071 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp

@@ -238,6 +238,9 @@
   /// An AliasSetTracker object to update.  If null, don't update it.
   AliasSetTracker *AST;
 
+  /// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
+  AssumptionTracker *AT;
+
   /// Reverse mapping of Allocas.
   DenseMap<AllocaInst *, unsigned> AllocaLookup;
 
@@ -279,9 +282,9 @@
 
 public:
   PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                 AliasSetTracker *AST)
+                 AliasSetTracker *AST, AssumptionTracker *AT)
       : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
-        DIB(*DT.getRoot()->getParent()->getParent()), AST(AST) {}
+        DIB(*DT.getRoot()->getParent()->getParent()), AST(AST), AT(AT) {}
 
   void run();
 
@@ -302,8 +305,8 @@
   void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
                                AllocaInfo &Info);
   void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
-                           const SmallPtrSet<BasicBlock *, 32> &DefBlocks,
-                           SmallPtrSet<BasicBlock *, 32> &LiveInBlocks);
+                           const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+                           SmallPtrSetImpl<BasicBlock *> &LiveInBlocks);
   void RenamePass(BasicBlock *BB, BasicBlock *Pred,
                   RenamePassData::ValVector &IncVals,
                   std::vector<RenamePassData> &Worklist);
@@ -685,7 +688,7 @@
       PHINode *PN = I->second;
 
       // If this PHI node merges one value and/or undefs, get the value.
-      if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT)) {
+      if (Value *V = SimplifyInstruction(PN, nullptr, nullptr, &DT, AT)) {
         if (AST && PN->getType()->isPointerTy())
           AST->deleteValue(PN);
         PN->replaceAllUsesWith(V);
@@ -766,8 +769,8 @@
 /// inserted phi nodes would be dead).
 void PromoteMem2Reg::ComputeLiveInBlocks(
     AllocaInst *AI, AllocaInfo &Info,
-    const SmallPtrSet<BasicBlock *, 32> &DefBlocks,
-    SmallPtrSet<BasicBlock *, 32> &LiveInBlocks) {
+    const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+    SmallPtrSetImpl<BasicBlock *> &LiveInBlocks) {
 
   // To determine liveness, we must iterate through the predecessors of blocks
   // where the def is live.  Blocks are added to the worklist if we need to
@@ -816,7 +819,7 @@
 
     // The block really is live in here, insert it into the set.  If already in
     // the set, then it has already been processed.
-    if (!LiveInBlocks.insert(BB))
+    if (!LiveInBlocks.insert(BB).second)
       continue;
 
     // Since the value is live into BB, it is either defined in a predecessor or
@@ -857,10 +860,8 @@
                               less_second> IDFPriorityQueue;
   IDFPriorityQueue PQ;
 
-  for (SmallPtrSet<BasicBlock *, 32>::const_iterator I = DefBlocks.begin(),
-                                                     E = DefBlocks.end();
-       I != E; ++I) {
-    if (DomTreeNode *Node = DT.getNode(*I))
+  for (BasicBlock *BB : DefBlocks) {
+    if (DomTreeNode *Node = DT.getNode(BB))
       PQ.push(std::make_pair(Node, DomLevels[Node]));
   }
 
@@ -898,7 +899,7 @@
         if (SuccLevel > RootLevel)
           continue;
 
-        if (!Visited.insert(SuccNode))
+        if (!Visited.insert(SuccNode).second)
           continue;
 
         BasicBlock *SuccBB = SuccNode->getBlock();
@@ -1003,7 +1004,7 @@
   }
 
   // Don't revisit blocks.
-  if (!Visited.insert(BB))
+  if (!Visited.insert(BB).second)
     return;
 
   for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II);) {
@@ -1060,17 +1061,17 @@
   ++I;
 
   for (; I != E; ++I)
-    if (VisitedSuccs.insert(*I))
+    if (VisitedSuccs.insert(*I).second)
       Worklist.push_back(RenamePassData(*I, Pred, IncomingVals));
 
   goto NextIteration;
 }
 
 void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                           AliasSetTracker *AST) {
+                           AliasSetTracker *AST, AssumptionTracker *AT) {
   // If there is nothing to do, bail out...
   if (Allocas.empty())
     return;
 
-  PromoteMem2Reg(Allocas, DT, AST).run();
+  PromoteMem2Reg(Allocas, DT, AST, AT).run();
 }

diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 960b198..92fd56a 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp

@@ -43,6 +43,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 #include <algorithm>
 #include <map>
 #include <set>
@@ -68,12 +70,23 @@
     cl::desc("Hoist conditional stores if an unconditional store precedes"));
 
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
+STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping");
 STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
 STATISTIC(NumLookupTablesHoles, "Number of switch instructions turned into lookup tables (holes checked)");
 STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block");
 STATISTIC(NumSpeculations, "Number of speculative executed instructions");
 
 namespace {
+  // The first field contains the value that the switch produces when a certain
+  // case group is selected, and the second field is a vector containing the cases
+  // composing the case group.
+  typedef SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2>
+    SwitchCaseResultVectorTy;
+  // The first field contains the phi node that generates a result of the switch
+  // and the second field contains the value generated for a certain case in the switch
+  // for that PHI.
+  typedef SmallVector<std::pair<PHINode *, Constant *>, 4> SwitchCaseResultsTy;
+
   /// ValueEqualityComparisonCase - Represents a case of a switch.
   struct ValueEqualityComparisonCase {
     ConstantInt *Value;
@@ -92,7 +105,9 @@
 
 class SimplifyCFGOpt {
   const TargetTransformInfo &TTI;
+  unsigned BonusInstThreshold;
   const DataLayout *const DL;
+  AssumptionTracker *AT;
   Value *isValueEqualityComparison(TerminatorInst *TI);
   BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
                                std::vector<ValueEqualityComparisonCase> &Cases);
@@ -111,8 +126,9 @@
   bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder);
 
 public:
-  SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout *DL)
-      : TTI(TTI), DL(DL) {}
+  SimplifyCFGOpt(const TargetTransformInfo &TTI, unsigned BonusInstThreshold,
+                 const DataLayout *DL, AssumptionTracker *AT)
+      : TTI(TTI), BonusInstThreshold(BonusInstThreshold), DL(DL), AT(AT) {}
   bool run(BasicBlock *BB);
 };
 }
@@ -256,7 +272,7 @@
 /// V plus its non-dominating operands.  If that cost is greater than
 /// CostRemaining, false is returned and CostRemaining is undefined.
 static bool DominatesMergePoint(Value *V, BasicBlock *BB,
-                                SmallPtrSet<Instruction*, 4> *AggressiveInsts,
+                                SmallPtrSetImpl<Instruction*> *AggressiveInsts,
                                 unsigned &CostRemaining,
                                 const DataLayout *DL) {
   Instruction *I = dyn_cast<Instruction>(V);
@@ -341,114 +357,177 @@
   return nullptr;
 }
 
-/// GatherConstantCompares - Given a potentially 'or'd or 'and'd together
-/// collection of icmp eq/ne instructions that compare a value against a
-/// constant, return the value being compared, and stick the constant into the
-/// Values vector.
-static Value *
-GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
-                       const DataLayout *DL, bool isEQ, unsigned &UsedICmps) {
-  Instruction *I = dyn_cast<Instruction>(V);
-  if (!I) return nullptr;
+namespace {
 
-  // If this is an icmp against a constant, handle this as one of the cases.
-  if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
-    if (ConstantInt *C = GetConstantInt(I->getOperand(1), DL)) {
-      Value *RHSVal;
-      ConstantInt *RHSC;
+/// Given a chain of or (||) or and (&&) comparison of a value against a
+/// constant, this will try to recover the information required for a switch
+/// structure.
+/// It will depth-first traverse the chain of comparison, seeking for patterns
+/// like %a == 12 or %a < 4 and combine them to produce a set of integer
+/// representing the different cases for the switch.
+/// Note that if the chain is composed of '||' it will build the set of elements
+/// that matches the comparisons (i.e. any of this value validate the chain)
+/// while for a chain of '&&' it will build the set elements that make the test
+/// fail.
+struct ConstantComparesGatherer {
 
-      if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ:ICmpInst::ICMP_NE)) {
-        // (x & ~2^x) == y --> x == y || x == y|2^x
-        // This undoes a transformation done by instcombine to fuse 2 compares.
-        if (match(ICI->getOperand(0),
-                  m_And(m_Value(RHSVal), m_ConstantInt(RHSC)))) {
-          APInt Not = ~RHSC->getValue();
-          if (Not.isPowerOf2()) {
-            Vals.push_back(C);
-            Vals.push_back(
-                ConstantInt::get(C->getContext(), C->getValue() | Not));
-            UsedICmps++;
-            return RHSVal;
-          }
+  Value *CompValue; /// Value found for the switch comparison
+  Value *Extra;     /// Extra clause to be checked before the switch
+  SmallVector<ConstantInt *, 8> Vals; /// Set of integers to match in switch
+  unsigned UsedICmps; /// Number of comparisons matched in the and/or chain
+
+  /// Construct and compute the result for the comparison instruction Cond
+  ConstantComparesGatherer(Instruction *Cond, const DataLayout *DL)
+      : CompValue(nullptr), Extra(nullptr), UsedICmps(0) {
+    gather(Cond, DL);
+  }
+
+  /// Prevent copy
+  ConstantComparesGatherer(const ConstantComparesGatherer &)
+      LLVM_DELETED_FUNCTION;
+  ConstantComparesGatherer &
+  operator=(const ConstantComparesGatherer &) LLVM_DELETED_FUNCTION;
+
+private:
+
+  /// Record \p NewVal as the value being switched on. Fails (returns false) if
+  /// a different value was recorded earlier or if the stored value is null.
+  bool setValueOnce(Value *NewVal) {
+    if (CompValue && CompValue != NewVal)
+      return false;
+    return (CompValue = NewVal) != nullptr;
+  }
+
+  /// Try to match instruction \p I as a comparison against a constant and
+  /// append to Vals the set of values that match (or, when \p isEQ is false,
+  /// the values that do not match).
+  /// Returns false on failure. On success, the value the comparison is
+  /// performed on is stored in CompValue and UsedICmps is incremented.
+  /// If CompValue is already set and the comparison is on a different value,
+  /// the match fails and nothing is added to Vals.
+  bool matchInstruction(Instruction *I, const DataLayout *DL, bool isEQ) {
+    // If this is an icmp against a constant, handle this as one of the cases.
+    ICmpInst *ICI;
+    ConstantInt *C;
+    if (!((ICI = dyn_cast<ICmpInst>(I)) &&
+             (C = GetConstantInt(I->getOperand(1), DL)))) {
+      return false;
+    }
+
+    Value *RHSVal;
+    ConstantInt *RHSC;
+
+    // Pattern match a special case
+    // (x & ~2^x) == y --> x == y || x == y|2^x
+    // This undoes a transformation done by instcombine to fuse 2 compares.
+    if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE)) {
+      if (match(ICI->getOperand(0),
+                m_And(m_Value(RHSVal), m_ConstantInt(RHSC)))) {
+        APInt Not = ~RHSC->getValue();
+        if (Not.isPowerOf2()) {
+          // If we already have a value for the switch, it has to match!
+          if (!setValueOnce(RHSVal))
+            return false;
+
+          Vals.push_back(C);
+          Vals.push_back(ConstantInt::get(C->getContext(),
+                                          C->getValue() | Not));
+          UsedICmps++;
+          return true;
         }
-
-        UsedICmps++;
-        Vals.push_back(C);
-        return I->getOperand(0);
       }
 
-      // If we have "x ult 3" comparison, for example, then we can add 0,1,2 to
-      // the set.
-      ConstantRange Span =
-        ConstantRange::makeICmpRegion(ICI->getPredicate(), C->getValue());
+      // If we already have a value for the switch, it has to match!
+      if (!setValueOnce(ICI->getOperand(0)))
+        return false;
 
-      // Shift the range if the compare is fed by an add. This is the range
-      // compare idiom as emitted by instcombine.
-      bool hasAdd =
-          match(I->getOperand(0), m_Add(m_Value(RHSVal), m_ConstantInt(RHSC)));
-      if (hasAdd)
-        Span = Span.subtract(RHSC->getValue());
-
-      // If this is an and/!= check then we want to optimize "x ugt 2" into
-      // x != 0 && x != 1.
-      if (!isEQ)
-        Span = Span.inverse();
-
-      // If there are a ton of values, we don't want to make a ginormous switch.
-      if (Span.getSetSize().ugt(8) || Span.isEmptySet())
-        return nullptr;
-
-      for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
-        Vals.push_back(ConstantInt::get(V->getContext(), Tmp));
       UsedICmps++;
-      return hasAdd ? RHSVal : I->getOperand(0);
-    }
-    return nullptr;
-  }
-
-  // Otherwise, we can only handle an | or &, depending on isEQ.
-  if (I->getOpcode() != (isEQ ? Instruction::Or : Instruction::And))
-    return nullptr;
-
-  unsigned NumValsBeforeLHS = Vals.size();
-  unsigned UsedICmpsBeforeLHS = UsedICmps;
-  if (Value *LHS = GatherConstantCompares(I->getOperand(0), Vals, Extra, DL,
-                                          isEQ, UsedICmps)) {
-    unsigned NumVals = Vals.size();
-    unsigned UsedICmpsBeforeRHS = UsedICmps;
-    if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, DL,
-                                            isEQ, UsedICmps)) {
-      if (LHS == RHS)
-        return LHS;
-      Vals.resize(NumVals);
-      UsedICmps = UsedICmpsBeforeRHS;
+      Vals.push_back(C);
+      return true;
     }
 
-    // The RHS of the or/and can't be folded in and we haven't used "Extra" yet,
-    // set it and return success.
-    if (Extra == nullptr || Extra == I->getOperand(1)) {
-      Extra = I->getOperand(1);
-      return LHS;
+    // If we have "x ult 3", for example, then we can add 0,1,2 to the set.
+    ConstantRange Span = ConstantRange::makeICmpRegion(ICI->getPredicate(),
+                                                       C->getValue());
+
+    // Shift the range if the compare is fed by an add. This is the range
+    // compare idiom as emitted by instcombine.
+    Value *CandidateVal = I->getOperand(0);
+    if (match(I->getOperand(0), m_Add(m_Value(RHSVal), m_ConstantInt(RHSC)))) {
+      Span = Span.subtract(RHSC->getValue());
+      CandidateVal = RHSVal;
     }
 
-    Vals.resize(NumValsBeforeLHS);
-    UsedICmps = UsedICmpsBeforeLHS;
-    return nullptr;
+    // If this is an and/!= check, then we are looking to build the set of
+    // values that *don't* pass the and chain. I.e. to turn "x ugt 2" into
+    // x != 0 && x != 1.
+    if (!isEQ)
+      Span = Span.inverse();
+
+    // If there are a ton of values, we don't want to make a ginormous switch.
+    if (Span.getSetSize().ugt(8) || Span.isEmptySet()) {
+      return false;
+    }
+
+    // If we already have a value for the switch, it has to match!
+    if (!setValueOnce(CandidateVal))
+      return false;
+
+    // Add all values from the range to the set
+    for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+      Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+
+    UsedICmps++;
+    return true;
+
   }
 
-  // If the LHS can't be folded in, but Extra is available and RHS can, try to
-  // use LHS as Extra.
-  if (Extra == nullptr || Extra == I->getOperand(0)) {
-    Value *OldExtra = Extra;
-    Extra = I->getOperand(0);
-    if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, DL,
-                                            isEQ, UsedICmps))
-      return RHS;
-    assert(Vals.size() == NumValsBeforeLHS);
-    Extra = OldExtra;
-  }
+  /// gather - Given a potentially 'or'd or 'and'd together collection of icmp
+  /// eq/ne/lt/gt instructions that compare a value against a constant, extract
+  /// the value being compared into CompValue and append the constants to Vals.
+  /// At most one operand that does not fit the pattern is tolerated; it is
+  /// stored in Extra. On failure CompValue is reset to null.
+  void gather(Value *V, const DataLayout *DL) {
+    // The root opcode picks the chain kind: 'or' => isEQ, anything else 'and'.
+    bool isEQ = (cast<Instruction>(V)->getOpcode() == Instruction::Or);
 
-  return nullptr;
+    // Keep a stack (SmallVector for efficiency) for depth-first traversal
+    SmallVector<Value *, 8> DFT;
+
+    // Initialize
+    DFT.push_back(V);
+
+    while (!DFT.empty()) {
+      V = DFT.pop_back_val();
+
+      if (Instruction *I = dyn_cast<Instruction>(V)) {
+        // If it is a || (or && depending on isEQ), process the operands.
+        if (I->getOpcode() == (isEQ ? Instruction::Or : Instruction::And)) {
+          DFT.push_back(I->getOperand(1));
+          DFT.push_back(I->getOperand(0));
+          continue;
+        }
+
+        // Try to match the current instruction
+        if (matchInstruction(I, DL, isEQ))
+          // Match succeeded, continue the loop
+          continue;
+      }
+
+      // One element of the sequence of || (or &&) could not be matched as a
+      // comparison against the same value as the others.
+      // We allow only one "Extra" case to be checked before the switch
+      if (!Extra) {
+        Extra = V;
+        continue;
+      }
+      // Failed to parse a proper sequence, abort now
+      CompValue = nullptr;
+      break;
+    }
+  }
+};
+
 }
 
 static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
@@ -628,7 +707,7 @@
 
     // Collect branch weights into a vector.
     SmallVector<uint32_t, 8> Weights;
-    MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+    MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
     bool HasWeight = MD && (MD->getNumOperands() == 2 + SI->getNumCases());
     if (HasWeight)
       for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
@@ -723,7 +802,7 @@
 }
 
 static inline bool HasBranchWeights(const Instruction* I) {
-  MDNode* ProfMD = I->getMetadata(LLVMContext::MD_prof);
+  MDNode *ProfMD = I->getMetadata(LLVMContext::MD_prof);
   if (ProfMD && ProfMD->getOperand(0))
     if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
       return MDS->getString().equals("branch_weights");
@@ -736,7 +815,7 @@
 /// metadata.
 static void GetBranchWeights(TerminatorInst *TI,
                              SmallVectorImpl<uint64_t> &Weights) {
-  MDNode* MD = TI->getMetadata(LLVMContext::MD_prof);
+  MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
   assert(MD);
   for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
     ConstantInt *CI = cast<ConstantInt>(MD->getOperand(i));
@@ -995,6 +1074,8 @@
   return true;
 }
 
+static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I);
+
 /// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and
 /// BB2, hoist any common code in the two blocks up into the branch block.  The
 /// caller of this function guarantees that BI's block dominates BB1 and BB2.
@@ -1040,6 +1121,14 @@
     if (!I2->use_empty())
       I2->replaceAllUsesWith(I1);
     I1->intersectOptionalDataWith(I2);
+    unsigned KnownIDs[] = {
+      LLVMContext::MD_tbaa,
+      LLVMContext::MD_range,
+      LLVMContext::MD_fpmath,
+      LLVMContext::MD_invariant_load,
+      LLVMContext::MD_nonnull
+    };
+    combineMetadata(I1, I2, KnownIDs);
     I2->eraseFromParent();
     Changed = true;
 
@@ -1072,6 +1161,12 @@
       if (BB1V == BB2V)
         continue;
 
+      // Check for passingValueIsAlwaysUndefined here because we would rather
+      // eliminate undefined control flow than convert it to a select.
+      if (passingValueIsAlwaysUndefined(BB1V, PN) ||
+          passingValueIsAlwaysUndefined(BB2V, PN))
+       return Changed;
+
       if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V, DL))
         return Changed;
       if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V, DL))
@@ -1281,6 +1376,8 @@
     if (!I2->use_empty())
       I2->replaceAllUsesWith(I1);
     I1->intersectOptionalDataWith(I2);
+    // TODO: Use combineMetadata here to preserve what metadata we can
+    // (analogous to the hoisting case above).
     I2->eraseFromParent();
 
     if (UpdateRE1)
@@ -1486,6 +1583,11 @@
     if (ThenV == OrigV)
       continue;
 
+    // Don't convert to selects if we could remove undefined behavior instead.
+    if (passingValueIsAlwaysUndefined(OrigV, PN) ||
+        passingValueIsAlwaysUndefined(ThenV, PN))
+      return false;
+
     HaveRewritablePHIs = true;
     ConstantExpr *OrigCE = dyn_cast<ConstantExpr>(OrigV);
     ConstantExpr *ThenCE = dyn_cast<ConstantExpr>(ThenV);
@@ -1963,7 +2065,8 @@
 /// FoldBranchToCommonDest - If this basic block is simple enough, and if a
 /// predecessor branches to us and one of our successors, fold the block into
 /// the predecessor and use logical operations to pick the right destination.
-bool llvm::FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL) {
+bool llvm::FoldBranchToCommonDest(BranchInst *BI, const DataLayout *DL,
+                                  unsigned BonusInstThreshold) {
   BasicBlock *BB = BI->getParent();
 
   Instruction *Cond = nullptr;
@@ -2000,33 +2103,6 @@
       Cond->getParent() != BB || !Cond->hasOneUse())
   return false;
 
-  // Only allow this if the condition is a simple instruction that can be
-  // executed unconditionally.  It must be in the same block as the branch, and
-  // must be at the front of the block.
-  BasicBlock::iterator FrontIt = BB->front();
-
-  // Ignore dbg intrinsics.
-  while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
-
-  // Allow a single instruction to be hoisted in addition to the compare
-  // that feeds the branch.  We later ensure that any values that _it_ uses
-  // were also live in the predecessor, so that we don't unnecessarily create
-  // register pressure or inhibit out-of-order execution.
-  Instruction *BonusInst = nullptr;
-  if (&*FrontIt != Cond &&
-      FrontIt->hasOneUse() && FrontIt->user_back() == Cond &&
-      isSafeToSpeculativelyExecute(FrontIt, DL)) {
-    BonusInst = &*FrontIt;
-    ++FrontIt;
-
-    // Ignore dbg intrinsics.
-    while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
-  }
-
-  // Only a single bonus inst is allowed.
-  if (&*FrontIt != Cond)
-    return false;
-
   // Make sure the instruction after the condition is the cond branch.
   BasicBlock::iterator CondIt = Cond; ++CondIt;
 
@@ -2036,6 +2112,31 @@
   if (&*CondIt != BI)
     return false;
 
+  // Only allow this transformation if computing the condition doesn't involve
+  // too many instructions and these involved instructions can be executed
+  // unconditionally. We denote all involved instructions except the condition
+  // as "bonus instructions", and only allow this transformation when the
+  // number of the bonus instructions does not exceed a certain threshold.
+  unsigned NumBonusInsts = 0;
+  for (auto I = BB->begin(); Cond != I; ++I) {
+    // Ignore dbg intrinsics.
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+    if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(I, DL))
+      return false;
+    // I has only one use and can be executed unconditionally.
+    Instruction *User = dyn_cast<Instruction>(I->user_back());
+    if (User == nullptr || User->getParent() != BB)
+      return false;
+    // I is used in the same BB. Since BI uses Cond and doesn't have more slots
+    // to use any other instruction, User must be an instruction between next(I)
+    // and Cond.
+    ++NumBonusInsts;
+    // Exit early once we exceed the limit.
+    if (NumBonusInsts > BonusInstThreshold)
+      return false;
+  }
+
   // Cond is known to be a compare or binary operator.  Check to make sure that
   // neither operand is a potentially-trapping constant expression.
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
@@ -2086,49 +2187,6 @@
         continue;
     }
 
-    // Ensure that any values used in the bonus instruction are also used
-    // by the terminator of the predecessor.  This means that those values
-    // must already have been resolved, so we won't be inhibiting the
-    // out-of-order core by speculating them earlier. We also allow
-    // instructions that are used by the terminator's condition because it
-    // exposes more merging opportunities.
-    bool UsedByBranch = (BonusInst && BonusInst->hasOneUse() &&
-                         BonusInst->user_back() == Cond);
-
-    if (BonusInst && !UsedByBranch) {
-      // Collect the values used by the bonus inst
-      SmallPtrSet<Value*, 4> UsedValues;
-      for (Instruction::op_iterator OI = BonusInst->op_begin(),
-           OE = BonusInst->op_end(); OI != OE; ++OI) {
-        Value *V = *OI;
-        if (!isa<Constant>(V) && !isa<Argument>(V))
-          UsedValues.insert(V);
-      }
-
-      SmallVector<std::pair<Value*, unsigned>, 4> Worklist;
-      Worklist.push_back(std::make_pair(PBI->getOperand(0), 0));
-
-      // Walk up to four levels back up the use-def chain of the predecessor's
-      // terminator to see if all those values were used.  The choice of four
-      // levels is arbitrary, to provide a compile-time-cost bound.
-      while (!Worklist.empty()) {
-        std::pair<Value*, unsigned> Pair = Worklist.back();
-        Worklist.pop_back();
-
-        if (Pair.second >= 4) continue;
-        UsedValues.erase(Pair.first);
-        if (UsedValues.empty()) break;
-
-        if (Instruction *I = dyn_cast<Instruction>(Pair.first)) {
-          for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
-               OI != OE; ++OI)
-            Worklist.push_back(std::make_pair(OI->get(), Pair.second+1));
-        }
-      }
-
-      if (!UsedValues.empty()) return false;
-    }
-
     DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
     IRBuilder<> Builder(PBI);
 
@@ -2148,30 +2206,41 @@
       PBI->swapSuccessors();
     }
 
-    // If we have a bonus inst, clone it into the predecessor block.
-    Instruction *NewBonus = nullptr;
-    if (BonusInst) {
-      NewBonus = BonusInst->clone();
+    // If we have bonus instructions, clone them into the predecessor block.
+    // Note that there may be multiple predecessor blocks, so we cannot move
+    // bonus instructions to a predecessor block.
+    ValueToValueMapTy VMap; // maps original values to cloned values
+    // We already made sure Cond is the last instruction before BI. Therefore,
+    // every instruction before Cond other than a DbgInfoIntrinsic is a bonus
+    // instruction.
+    for (auto BonusInst = BB->begin(); Cond != BonusInst; ++BonusInst) {
+      if (isa<DbgInfoIntrinsic>(BonusInst))
+        continue;
+      Instruction *NewBonusInst = BonusInst->clone();
+      RemapInstruction(NewBonusInst, VMap,
+                       RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
+      VMap[BonusInst] = NewBonusInst;
 
       // If we moved a load, we cannot any longer claim any knowledge about
       // its potential value. The previous information might have been valid
       // only given the branch precondition.
       // For an analogous reason, we must also drop all the metadata whose
       // semantics we don't understand.
-      NewBonus->dropUnknownMetadata(LLVMContext::MD_dbg);
+      NewBonusInst->dropUnknownMetadata(LLVMContext::MD_dbg);
 
-      PredBlock->getInstList().insert(PBI, NewBonus);
-      NewBonus->takeName(BonusInst);
-      BonusInst->setName(BonusInst->getName()+".old");
+      PredBlock->getInstList().insert(PBI, NewBonusInst);
+      NewBonusInst->takeName(BonusInst);
+      BonusInst->setName(BonusInst->getName() + ".old");
     }
 
     // Clone Cond into the predecessor basic block, and or/and the
     // two conditions together.
     Instruction *New = Cond->clone();
-    if (BonusInst) New->replaceUsesOfWith(BonusInst, NewBonus);
+    RemapInstruction(New, VMap,
+                     RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
     PredBlock->getInstList().insert(PBI, New);
     New->takeName(Cond);
-    Cond->setName(New->getName()+".old");
+    Cond->setName(New->getName() + ".old");
 
     if (BI->isConditional()) {
       Instruction *NewCond =
@@ -2649,7 +2718,7 @@
 /// the PHI, merging the third icmp into the switch.
 static bool TryToSimplifyUncondBranchWithICmpInIt(
     ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI,
-    const DataLayout *DL) {
+    unsigned BonusInstThreshold, const DataLayout *DL, AssumptionTracker *AT) {
   BasicBlock *BB = ICI->getParent();
 
   // If the block has any PHIs in it or the icmp has multiple uses, it is too
@@ -2682,7 +2751,7 @@
       ICI->eraseFromParent();
     }
     // BB is now empty, so it is likely to simplify away.
-    return SimplifyCFG(BB, TTI, DL) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
   }
 
   // Ok, the block is reachable from the default dest.  If the constant we're
@@ -2698,7 +2767,7 @@
     ICI->replaceAllUsesWith(V);
     ICI->eraseFromParent();
     // BB is now empty, so it is likely to simplify away.
-    return SimplifyCFG(BB, TTI, DL) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
   }
 
   // The use of the icmp has to be in the 'end' block, by the only PHI node in
@@ -2759,24 +2828,17 @@
   Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
   if (!Cond) return false;
 
-
   // Change br (X == 0 | X == 1), T, F into a switch instruction.
   // If this is a bunch of seteq's or'd together, or if it's a bunch of
   // 'setne's and'ed together, collect them.
-  Value *CompVal = nullptr;
-  std::vector<ConstantInt*> Values;
-  bool TrueWhenEqual = true;
-  Value *ExtraCase = nullptr;
-  unsigned UsedICmps = 0;
 
-  if (Cond->getOpcode() == Instruction::Or) {
-    CompVal = GatherConstantCompares(Cond, Values, ExtraCase, DL, true,
-                                     UsedICmps);
-  } else if (Cond->getOpcode() == Instruction::And) {
-    CompVal = GatherConstantCompares(Cond, Values, ExtraCase, DL, false,
-                                     UsedICmps);
-    TrueWhenEqual = false;
-  }
+  // Try to gather values from a chain of and/or to be turned into a switch
+  ConstantComparesGatherer ConstantCompare(Cond, DL);
+  // Unpack the result
+  SmallVectorImpl<ConstantInt*> &Values = ConstantCompare.Vals;
+  Value *CompVal = ConstantCompare.CompValue;
+  unsigned UsedICmps = ConstantCompare.UsedICmps;
+  Value *ExtraCase = ConstantCompare.Extra;
 
   // If we didn't have a multiply compared value, fail.
   if (!CompVal) return false;
@@ -2785,6 +2847,8 @@
   if (UsedICmps <= 1)
     return false;
 
+  bool TrueWhenEqual = (Cond->getOpcode() == Instruction::Or);
+
   // There might be duplicate constants in the list, which the switch
   // instruction can't handle, remove them now.
   array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate);
@@ -3208,11 +3272,12 @@
 
 /// EliminateDeadSwitchCases - Compute masked bits for the condition of a switch
 /// and use it to remove dead cases.
-static bool EliminateDeadSwitchCases(SwitchInst *SI) {
+static bool EliminateDeadSwitchCases(SwitchInst *SI, const DataLayout *DL,
+                                     AssumptionTracker *AT) {
   Value *Cond = SI->getCondition();
   unsigned Bits = Cond->getType()->getIntegerBitWidth();
   APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
-  computeKnownBits(Cond, KnownZero, KnownOne);
+  computeKnownBits(Cond, KnownZero, KnownOne, DL, 0, AT, SI);
 
   // Gather dead cases.
   SmallVector<ConstantInt*, 8> DeadCases;
@@ -3460,6 +3525,163 @@
   return Res.size() > 0;
 }
 
+// MapCaseToResult - Append CaseVal to the UniqueResults entry whose key is
+// Result (compared by pointer), creating a new entry if none exists yet.
+static void MapCaseToResult(ConstantInt *CaseVal,
+    SwitchCaseResultVectorTy &UniqueResults,
+    Constant *Result) {
+  for (auto &I : UniqueResults) {
+    if (I.first == Result) {
+      I.second.push_back(CaseVal);
+      return;
+    }
+  }
+  UniqueResults.push_back(std::make_pair(Result,
+        SmallVector<ConstantInt*, 4>(1, CaseVal)));
+}
+
+// InitializeUniqueCases - Group the case values of SI by result constant in
+// UniqueResults and set PHI and DefaultResult. Returns false if GetCaseResults
+// fails or yields several results for a case, if cases use different PHI
+// nodes, or if there is no usable default result (see the check below).
+static bool InitializeUniqueCases(
+    SwitchInst *SI, const DataLayout *DL, PHINode *&PHI,
+    BasicBlock *&CommonDest,
+    SwitchCaseResultVectorTy &UniqueResults,
+    Constant *&DefaultResult) {
+  for (auto &I : SI->cases()) {
+    ConstantInt *CaseVal = I.getCaseValue();
+
+    // (PHI, value) pairs that this case value produces.
+    SwitchCaseResultsTy Results;
+    if (!GetCaseResults(SI, CaseVal, I.getCaseSuccessor(), &CommonDest, Results,
+                        DL))
+      return false;
+
+    // Only one (PHI, value) pair per case is permitted.
+    if (Results.size() > 1)
+      return false;
+    MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
+
+    // All cases must feed the same PHI node.
+    if (!PHI)
+      PHI = Results[0].first;
+    else if (PHI != Results[0].first)
+      return false;
+  }
+  // Find the default result value; the return value of the call is ignored.
+  SmallVector<std::pair<PHINode *, Constant *>, 1> DefaultResults;
+  BasicBlock *DefaultDest = SI->getDefaultDest();
+  GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResults,
+                 DL);
+  // Abort if there is no single default value, unless the default destination
+  // starts (after PHIs and debug intrinsics) with an unreachable instruction.
+  DefaultResult =
+      DefaultResults.size() == 1 ? DefaultResults.begin()->second : nullptr;
+  if ((!DefaultResult &&
+        !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())))
+    return false;
+
+  return true;
+}
+
+// ConvertTwoCaseSwitch - Try to turn a switch with only two cases (or two
+// cases + default) that produces a result into a value select. Returns the
+// select built with Builder, or nullptr if a result has several case values.
+// Example:
+// switch (a) {
+//   case 10:                %0 = icmp eq i32 %a, 10
+//     return 10;            %1 = select i1 %0, i32 10, i32 4
+//   case 20:        ---->   %2 = icmp eq i32 %a, 20
+//     return 2;             %3 = select i1 %2, i32 2, i32 %1
+//   default:
+//     return 4;
+// }
+static Value *
+ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
+                     Constant *DefaultResult, Value *Condition,
+                     IRBuilder<> &Builder) {
+  assert(ResultVector.size() == 2 &&
+      "We should have exactly two unique results at this point");
+  // If each of the two results is produced by exactly one case value, emit a
+  // single select, or a chain of two selects when a default result exists.
+  if (ResultVector[0].second.size() == 1 &&
+      ResultVector[1].second.size() == 1) {
+    ConstantInt *const FirstCase = ResultVector[0].second[0];
+    ConstantInt *const SecondCase = ResultVector[1].second[0];
+
+    bool DefaultCanTrigger = DefaultResult;
+    Value *SelectValue = ResultVector[1].first;
+    if (DefaultCanTrigger) {
+      Value *const ValueCompare =
+          Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
+      SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
+                                         DefaultResult, "switch.select");
+    }
+    Value *const ValueCompare =
+        Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
+    return Builder.CreateSelect(ValueCompare, ResultVector[0].first, SelectValue,
+                                "switch.select");
+  }
+
+  return nullptr;
+}
+
+// RemoveSwitchAfterSelectConversion - Clean up a switch that was converted
+// into a select: SelectValue becomes PHI's only incoming value from SI's
+// block, a branch to PHI's block is emitted via Builder, and SI is erased.
+static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
+                                              Value *SelectValue,
+                                              IRBuilder<> &Builder) {
+  BasicBlock *SelectBB = SI->getParent();
+  while (PHI->getBasicBlockIndex(SelectBB) >= 0)
+    PHI->removeIncomingValue(SelectBB);
+  PHI->addIncoming(SelectValue, SelectBB);
+
+  Builder.CreateBr(PHI->getParent());
+
+  // Detach SelectBB from all successors except PHI's block, then erase SI.
+  for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
+    BasicBlock *Succ = SI->getSuccessor(i);
+
+    if (Succ == PHI->getParent())
+      continue;
+    Succ->removePredecessor(SelectBB);
+  }
+  SI->eraseFromParent();
+}
+
+/// SwitchToSelect - If the switch is only used to initialize a single phi node
+/// in a common successor block with only two different constant values,
+/// replace the switch with select. Returns true on change; AT is unused here.
+static bool SwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
+                           const DataLayout *DL, AssumptionTracker *AT) {
+  Value *const Cond = SI->getCondition();
+  PHINode *PHI = nullptr;
+  BasicBlock *CommonDest = nullptr;
+  Constant *DefaultResult;
+  SwitchCaseResultVectorTy UniqueResults;
+  // Collect all the cases that will deliver the same value from the switch.
+  if (!InitializeUniqueCases(SI, DL, PHI, CommonDest, UniqueResults,
+                             DefaultResult))
+    return false;
+  // Only switches with exactly two unique results are handled.
+  if (UniqueResults.size() != 2)
+    return false;
+  assert(PHI != nullptr && "PHI for value select not found");
+
+  Builder.SetInsertPoint(SI);
+  Value *SelectValue = ConvertTwoCaseSwitch(
+      UniqueResults,
+      DefaultResult, Cond, Builder);
+  if (SelectValue) {
+    RemoveSwitchAfterSelectConversion(SI, PHI, SelectValue, Builder);
+    return true;
+  }
+  // The switch couldn't be converted into a select.
+  return false;
+}
+
 namespace {
   /// SwitchLookupTable - This class represents a lookup table that can be used
   /// to replace a switch.
@@ -3493,6 +3715,11 @@
       // store that single value and return it for each lookup.
       SingleValueKind,
 
+      // For tables where there is a linear relationship between table index
+      // and values. We calculate the result with a simple multiplication
+      // and addition instead of a table lookup.
+      LinearMapKind,
+
       // For small tables with integer elements, we can pack them into a bitmap
       // that fits into a target-legal register. Values are retrieved by
       // shift and mask operations.
@@ -3510,6 +3737,10 @@
     ConstantInt *BitMap;
     IntegerType *BitMapElementTy;
 
+    // For LinearMapKind, these are the constants used to derive the value.
+    ConstantInt *LinearOffset;
+    ConstantInt *LinearMultiplier;
+
     // For ArrayKind, this is the array.
     GlobalVariable *Array;
   };
@@ -3522,7 +3753,7 @@
                                      Constant *DefaultValue,
                                      const DataLayout *DL)
     : SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr),
-      Array(nullptr) {
+      LinearOffset(nullptr), LinearMultiplier(nullptr), Array(nullptr) {
   assert(Values.size() && "Can't build lookup table without values!");
   assert(TableSize >= Values.size() && "Can't fit values in table!");
 
@@ -3567,6 +3798,43 @@
     return;
   }
 
+  // Check if we can derive the value with a linear transformation from the
+  // table index.
+  if (isa<IntegerType>(ValueType)) {
+    bool LinearMappingPossible = true;
+    APInt PrevVal;
+    APInt DistToPrev;
+    assert(TableSize >= 2 && "Should be a SingleValue table.");
+    // Check if there is the same distance between two consecutive values.
+    for (uint64_t I = 0; I < TableSize; ++I) {
+      ConstantInt *ConstVal = dyn_cast<ConstantInt>(TableContents[I]);
+      if (!ConstVal) {
+        // This is an undef. We could deal with it, but undefs in lookup tables
+        // are very rare. It's probably not worth the additional complexity.
+        LinearMappingPossible = false;
+        break;
+      }
+      APInt Val = ConstVal->getValue();
+      if (I != 0) {
+        APInt Dist = Val - PrevVal;
+        if (I == 1) {
+          DistToPrev = Dist;
+        } else if (Dist != DistToPrev) {
+          LinearMappingPossible = false;
+          break;
+        }
+      }
+      PrevVal = Val;
+    }
+    if (LinearMappingPossible) {
+      LinearOffset = cast<ConstantInt>(TableContents[0]);
+      LinearMultiplier = ConstantInt::get(M.getContext(), DistToPrev);
+      Kind = LinearMapKind;
+      ++NumLinearMaps;
+      return;
+    }
+  }
+
   // If the type is integer and the table fits in a register, build a bitmap.
   if (WouldFitInRegister(DL, TableSize, ValueType)) {
     IntegerType *IT = cast<IntegerType>(ValueType);
@@ -3602,6 +3870,16 @@
   switch (Kind) {
     case SingleValueKind:
       return SingleValue;
+    case LinearMapKind: {
+      // Derive the result value from the input value.
+      Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
+                                            false, "switch.idx.cast");
+      if (!LinearMultiplier->isOne())
+        Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult");
+      if (!LinearOffset->isZero())
+        Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset");
+      return Result;
+    }
     case BitMapKind: {
       // Type of the bitmap (e.g. i59).
       IntegerType *MapTy = BitMap->getType();
@@ -3624,6 +3902,16 @@
                                  "switch.masked");
     }
     case ArrayKind: {
+      // Make sure the table index will not overflow when treated as signed.
+      IntegerType *IT = cast<IntegerType>(Index->getType());
+      uint64_t TableSize = Array->getInitializer()->getType()
+                                ->getArrayNumElements();
+      if (TableSize > (1ULL << (IT->getBitWidth() - 1)))
+        Index = Builder.CreateZExt(Index,
+                                   IntegerType::get(IT->getContext(),
+                                                    IT->getBitWidth() + 1),
+                                   "switch.tableidx.zext");
+
       Value *GEPIndices[] = { Builder.getInt32(0), Index };
       Value *GEP = Builder.CreateInBoundsGEP(Array, GEPIndices,
                                              "switch.gep");
@@ -3663,9 +3951,8 @@
 
   bool AllTablesFitInRegister = true;
   bool HasIllegalType = false;
-  for (SmallDenseMap<PHINode*, Type*>::const_iterator I = ResultTypes.begin(),
-       E = ResultTypes.end(); I != E; ++I) {
-    Type *Ty = I->second;
+  for (const auto &I : ResultTypes) {
+    Type *Ty = I.second;
 
     // Saturate this flag to true.
     HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
@@ -3749,16 +4036,17 @@
       return false;
 
     // Append the result from this case to the list for each phi.
-    for (ResultsTy::iterator I = Results.begin(), E = Results.end(); I!=E; ++I) {
-      if (!ResultLists.count(I->first))
-        PHIs.push_back(I->first);
-      ResultLists[I->first].push_back(std::make_pair(CaseVal, I->second));
+    for (const auto &I : Results) {
+      PHINode *PHI = I.first;
+      Constant *Value = I.second;
+      if (!ResultLists.count(PHI))
+        PHIs.push_back(PHI);
+      ResultLists[PHI].push_back(std::make_pair(CaseVal, Value));
     }
   }
 
   // Keep track of the result types.
-  for (size_t I = 0, E = PHIs.size(); I != E; ++I) {
-    PHINode *PHI = PHIs[I];
+  for (PHINode *PHI : PHIs) {
     ResultTypes[PHI] = ResultLists[PHI][0].second->getType();
   }
 
@@ -3775,6 +4063,7 @@
     HasDefaultResults = GetCaseResults(SI, nullptr, SI->getDefaultDest(),
                                        &CommonDest, DefaultResultsList, DL);
   }
+
   bool NeedMask = (TableHasHoles && !HasDefaultResults);
   if (NeedMask) {
     // As an extra penalty for the validity test we require more cases.
@@ -3784,9 +4073,9 @@
       return false;
   }
 
-  for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) {
-    PHINode *PHI = DefaultResultsList[I].first;
-    Constant *Result = DefaultResultsList[I].second;
+  for (const auto &I : DefaultResultsList) {
+    PHINode *PHI = I.first;
+    Constant *Result = I.second;
     DefaultResults[PHI] = Result;
   }
 
@@ -3820,10 +4109,13 @@
   const bool GeneratingCoveredLookupTable = MaxTableSize == TableSize;
   if (GeneratingCoveredLookupTable) {
     Builder.CreateBr(LookupBB);
-    SI->getDefaultDest()->removePredecessor(SI->getParent());
+    // We cached PHINodes in PHIs. To avoid accessing deleted PHINodes later,
+    // do not delete PHINodes here.
+    SI->getDefaultDest()->removePredecessor(SI->getParent(),
+                                            true/*DontDeleteUselessPHIs*/);
   } else {
     Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
-                                         MinCaseVal->getType(), TableSize));
+                                       MinCaseVal->getType(), TableSize));
     Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
   }
 
@@ -3841,9 +4133,12 @@
                                   CommonDest->getParent(),
                                   CommonDest);
 
+    // Make the mask's bitwidth at least 8 bits and a power of 2 to avoid
+    // unnecessary illegal types.
+    uint64_t TableSizePowOf2 = NextPowerOf2(std::max(7ULL, TableSize - 1ULL));
+    APInt MaskInt(TableSizePowOf2, 0);
+    APInt One(TableSizePowOf2, 1);
     // Build bitmask; fill in a 1 bit for every case.
-    APInt MaskInt(TableSize, 0);
-    APInt One(TableSize, 1);
     const ResultListTy &ResultList = ResultLists[PHIs[0]];
     for (size_t I = 0, E = ResultList.size(); I != E; ++I) {
       uint64_t Idx = (ResultList[I].first->getValue() -
@@ -3919,12 +4214,12 @@
     // see if that predecessor totally determines the outcome of this switch.
     if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
       if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
-        return SimplifyCFG(BB, TTI, DL) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
     Value *Cond = SI->getCondition();
     if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
       if (SimplifySwitchOnSelect(SI, Select))
-        return SimplifyCFG(BB, TTI, DL) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
     // If the block only contains the switch, see if we can fold the block
     // away into any preds.
@@ -3934,22 +4229,25 @@
       ++BBI;
     if (SI == &*BBI)
       if (FoldValueComparisonIntoPredecessors(SI, Builder))
-        return SimplifyCFG(BB, TTI, DL) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
   }
 
   // Try to transform the switch into an icmp and a branch.
   if (TurnSwitchRangeIntoICmp(SI, Builder))
-    return SimplifyCFG(BB, TTI, DL) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
   // Remove unreachable cases.
-  if (EliminateDeadSwitchCases(SI))
-    return SimplifyCFG(BB, TTI, DL) | true;
+  if (EliminateDeadSwitchCases(SI, DL, AT))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
+
+  if (SwitchToSelect(SI, Builder, DL, AT))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
   if (ForwardSwitchConditionToPHI(SI))
-    return SimplifyCFG(BB, TTI, DL) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
   if (SwitchToLookupTable(SI, Builder, TTI, DL))
-    return SimplifyCFG(BB, TTI, DL) | true;
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
   return false;
 }
@@ -3962,7 +4260,7 @@
   SmallPtrSet<Value *, 8> Succs;
   for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
     BasicBlock *Dest = IBI->getDestination(i);
-    if (!Dest->hasAddressTaken() || !Succs.insert(Dest)) {
+    if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
       Dest->removePredecessor(BB);
       IBI->removeDestination(i);
       --i; --e;
@@ -3986,7 +4284,7 @@
 
   if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
     if (SimplifyIndirectBrOnSelect(IBI, SI))
-      return SimplifyCFG(BB, TTI, DL) | true;
+      return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
   }
   return Changed;
 }
@@ -3998,7 +4296,7 @@
     return true;
 
   // If the Terminator is the only non-phi instruction, simplify the block.
-  BasicBlock::iterator I = BB->getFirstNonPHIOrDbgOrLifetime();
+  BasicBlock::iterator I = BB->getFirstNonPHIOrDbg();
   if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
       TryToSimplifyUncondBranchFromEmptyBlock(BB))
     return true;
@@ -4010,7 +4308,8 @@
       for (++I; isa<DbgInfoIntrinsic>(I); ++I)
         ;
       if (I->isTerminator() &&
-          TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI, DL))
+          TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI,
+                                                BonusInstThreshold, DL, AT))
         return true;
     }
 
@@ -4018,8 +4317,8 @@
   // branches to us and our successor, fold the comparison into the
   // predecessor and use logical operations to update the incoming value
   // for PHI nodes in common successor.
-  if (FoldBranchToCommonDest(BI, DL))
-    return SimplifyCFG(BB, TTI, DL) | true;
+  if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
   return false;
 }
 
@@ -4034,7 +4333,7 @@
     // switch.
     if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
       if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
-        return SimplifyCFG(BB, TTI, DL) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
     // This block must be empty, except for the setcond inst, if it exists.
     // Ignore dbg intrinsics.
@@ -4044,14 +4343,14 @@
       ++I;
     if (&*I == BI) {
       if (FoldValueComparisonIntoPredecessors(BI, Builder))
-        return SimplifyCFG(BB, TTI, DL) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
     } else if (&*I == cast<Instruction>(BI->getCondition())){
       ++I;
       // Ignore dbg intrinsics.
       while (isa<DbgInfoIntrinsic>(I))
         ++I;
       if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
-        return SimplifyCFG(BB, TTI, DL) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
     }
   }
 
@@ -4062,8 +4361,8 @@
   // If this basic block is ONLY a compare and a branch, and if a predecessor
   // branches to us and one of our successors, fold the comparison into the
   // predecessor and use logical operations to pick the right destination.
-  if (FoldBranchToCommonDest(BI, DL))
-    return SimplifyCFG(BB, TTI, DL) | true;
+  if (FoldBranchToCommonDest(BI, DL, BonusInstThreshold))
+    return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
   // We have a conditional branch to two blocks that are only reachable
   // from BI.  We know that the condbr dominates the two blocks, so see if
@@ -4072,7 +4371,7 @@
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
     if (BI->getSuccessor(1)->getSinglePredecessor()) {
       if (HoistThenElseCodeToIf(BI, DL))
-        return SimplifyCFG(BB, TTI, DL) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
@@ -4080,7 +4379,7 @@
       if (Succ0TI->getNumSuccessors() == 1 &&
           Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
         if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), DL))
-          return SimplifyCFG(BB, TTI, DL) | true;
+          return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
     }
   } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
     // If Successor #0 has multiple preds, we may be able to conditionally
@@ -4089,7 +4388,7 @@
     if (Succ1TI->getNumSuccessors() == 1 &&
         Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
       if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), DL))
-        return SimplifyCFG(BB, TTI, DL) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
   }
 
   // If this is a branch on a phi node in the current block, thread control
@@ -4097,14 +4396,14 @@
   if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
     if (PN->getParent() == BI->getParent())
       if (FoldCondBranchOnPHI(BI, DL))
-        return SimplifyCFG(BB, TTI, DL) | true;
+        return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
   // Scan predecessor blocks for conditional branches.
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
     if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
       if (PBI != BI && PBI->isConditional())
         if (SimplifyCondBranchToCondBranch(PBI, BI))
-          return SimplifyCFG(BB, TTI, DL) | true;
+          return SimplifyCFG(BB, TTI, BonusInstThreshold, DL, AT) | true;
 
   return false;
 }
@@ -4248,6 +4547,7 @@
 /// of the CFG.  It returns true if a modification was made.
 ///
 bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
-                       const DataLayout *DL) {
-  return SimplifyCFGOpt(TTI, DL).run(BB);
+                       unsigned BonusInstThreshold,
+                       const DataLayout *DL, AssumptionTracker *AT) {
+  return SimplifyCFGOpt(TTI, BonusInstThreshold, DL, AT).run(BB);
 }

diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index b284e6f..a4fdd55 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp

@@ -40,7 +40,7 @@
 STATISTIC(NumElimCmp     , "Number of IV comparisons eliminated");
 
 namespace {
-  /// SimplifyIndvar - This is a utility for simplifying induction variables
+  /// This is a utility for simplifying induction variables
   /// based on ScalarEvolution. It is the primary instrument of the
   /// IndvarSimplify pass, but it may also be directly invoked to cleanup after
   /// other loop passes that preserve SCEV.
@@ -86,7 +86,7 @@
   };
 }
 
-/// foldIVUser - Fold an IV operand into its use.  This removes increments of an
+/// Fold an IV operand into its use.  This removes increments of an
 /// aligned IV when used by a instruction that ignores the low bits.
 ///
 /// IVOperand is guaranteed SCEVable, but UseInst may not be.
@@ -152,7 +152,7 @@
   return IVSrc;
 }
 
-/// eliminateIVComparison - SimplifyIVUsers helper for eliminating useless
+/// SimplifyIVUsers helper for eliminating useless
 /// comparisons against an induction variable.
 void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
   unsigned IVOperIdx = 0;
@@ -188,7 +188,7 @@
   DeadInsts.push_back(ICmp);
 }
 
-/// eliminateIVRemainder - SimplifyIVUsers helper for eliminating useless
+/// SimplifyIVUsers helper for eliminating useless
 /// remainder operations operating on an induction variable.
 void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
                                       Value *IVOperand,
@@ -239,7 +239,7 @@
   DeadInsts.push_back(Rem);
 }
 
-/// eliminateIVUser - Eliminate an operation that consumes a simple IV and has
+/// Eliminate an operation that consumes a simple IV and has
 /// no observable side-effect given the range of IV values.
 /// IVOperand is guaranteed SCEVable, but UseInst may not be.
 bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
@@ -334,8 +334,7 @@
   return AddInst;
 }
 
-/// pushIVUsers - Add all uses of Def to the current IV's worklist.
-///
+/// Add all uses of Def to the current IV's worklist.
 static void pushIVUsers(
   Instruction *Def,
   SmallPtrSet<Instruction*,16> &Simplified,
@@ -348,12 +347,12 @@
     // Also ensure unique worklist users.
     // If Def is a LoopPhi, it may not be in the Simplified set, so check for
     // self edges first.
-    if (UI != Def && Simplified.insert(UI))
+    if (UI != Def && Simplified.insert(UI).second)
       SimpleIVUsers.push_back(std::make_pair(UI, Def));
   }
 }
 
-/// isSimpleIVUser - Return true if this instruction generates a simple SCEV
+/// Return true if this instruction generates a simple SCEV
 /// expression in terms of that IV.
 ///
 /// This is similar to IVUsers' isInteresting() but processes each instruction
@@ -374,7 +373,7 @@
   return false;
 }
 
-/// simplifyUsers - Iteratively perform simplification on a worklist of users
+/// Iteratively perform simplification on a worklist of users
 /// of the specified induction variable. Each successive simplification may push
 /// more users which may themselves be candidates for simplification.
 ///
@@ -446,7 +445,7 @@
 
 void IVVisitor::anchor() { }
 
-/// simplifyUsersOfIV - Simplify instructions that use this induction variable
+/// Simplify instructions that use this induction variable
 /// by using ScalarEvolution to analyze the IV's recurrence.
 bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM,
                        SmallVectorImpl<WeakVH> &Dead, IVVisitor *V)
@@ -457,7 +456,7 @@
   return SIV.hasChanged();
 }
 
-/// simplifyLoopIVs - Simplify users of induction variables within this
+/// Simplify users of induction variables within this
 /// loop. This does not actually change or add IVs.
 bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM,
                      SmallVectorImpl<WeakVH> &Dead) {

diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index 33b3637..5632095 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp

@@ -18,6 +18,7 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
@@ -41,6 +42,7 @@
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
+      AU.addRequired<AssumptionTracker>();
       AU.addRequired<TargetLibraryInfo>();
     }
 
@@ -52,6 +54,7 @@
       DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
       const DataLayout *DL = DLP ? &DLP->getDataLayout() : nullptr;
       const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+      AssumptionTracker *AT = &getAnalysis<AssumptionTracker>();
       SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
       bool Changed = false;
 
@@ -68,7 +71,7 @@
               continue;
             // Don't waste time simplifying unused instructions.
             if (!I->use_empty())
-              if (Value *V = SimplifyInstruction(I, DL, TLI, DT)) {
+              if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AT)) {
                 // Mark all uses for resimplification next time round the loop.
                 for (User *U : I->users())
                   Next->insert(cast<Instruction>(U));
@@ -101,6 +104,7 @@
 char InstSimplifier::ID = 0;
 INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
                       "Remove redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
                     "Remove redundant instructions", false, false)

diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 3b61bb5..a39f128 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp

@@ -27,65 +27,43 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 
 using namespace llvm;
+using namespace PatternMatch;
 
 static cl::opt<bool>
-ColdErrorCalls("error-reporting-is-cold",  cl::init(true),
-  cl::Hidden, cl::desc("Treat error-reporting calls as cold"));
+    ColdErrorCalls("error-reporting-is-cold", cl::init(true), cl::Hidden,
+                   cl::desc("Treat error-reporting calls as cold"));
 
-/// This class is the abstract base class for the set of optimizations that
-/// corresponds to one library call.
-namespace {
-class LibCallOptimization {
-protected:
-  Function *Caller;
-  const DataLayout *DL;
-  const TargetLibraryInfo *TLI;
-  const LibCallSimplifier *LCS;
-  LLVMContext* Context;
-public:
-  LibCallOptimization() { }
-  virtual ~LibCallOptimization() {}
+static cl::opt<bool>
+    EnableUnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
+                         cl::init(false),
+                         cl::desc("Enable unsafe double to float "
+                                  "shrinking for math lib calls"));
 
-  /// callOptimizer - This pure virtual method is implemented by base classes to
-  /// do various optimizations.  If this returns null then no transformation was
-  /// performed.  If it returns CI, then it transformed the call and CI is to be
-  /// deleted.  If it returns something else, replace CI with the new value and
-  /// delete CI.
-  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B)
-    =0;
-
-  /// ignoreCallingConv - Returns false if this transformation could possibly
-  /// change the calling convention.
-  virtual bool ignoreCallingConv() { return false; }
-
-  Value *optimizeCall(CallInst *CI, const DataLayout *DL,
-                      const TargetLibraryInfo *TLI,
-                      const LibCallSimplifier *LCS, IRBuilder<> &B) {
-    Caller = CI->getParent()->getParent();
-    this->DL = DL;
-    this->TLI = TLI;
-    this->LCS = LCS;
-    if (CI->getCalledFunction())
-      Context = &CI->getCalledFunction()->getContext();
-
-    // We never change the calling convention.
-    if (!ignoreCallingConv() && CI->getCallingConv() != llvm::CallingConv::C)
-      return nullptr;
-
-    return callOptimizer(CI->getCalledFunction(), CI, B);
-  }
-};
 
 //===----------------------------------------------------------------------===//
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
+static bool ignoreCallingConv(LibFunc::Func Func) {
+  switch (Func) {
+  case LibFunc::abs:
+  case LibFunc::labs:
+  case LibFunc::llabs:
+  case LibFunc::strlen:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("All cases should be covered in the switch.");
+}
+
 /// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
 /// value is equal or not-equal to zero.
 static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
@@ -142,967 +120,912 @@
 // Fortified Library Call Optimizations
 //===----------------------------------------------------------------------===//
 
-struct FortifiedLibCallOptimization : public LibCallOptimization {
-protected:
-  virtual bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp,
-			  bool isString) const = 0;
-};
-
-struct InstFortifiedLibCallOptimization : public FortifiedLibCallOptimization {
-  CallInst *CI;
-
-  bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp,
-                  bool isString) const override {
-    if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp))
+static bool isFortifiedCallFoldable(CallInst *CI, unsigned SizeCIOp, unsigned SizeArgOp,
+                       bool isString) {
+  if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp))
+    return true;
+  if (ConstantInt *SizeCI =
+          dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) {
+    if (SizeCI->isAllOnesValue())
       return true;
-    if (ConstantInt *SizeCI =
-                           dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) {
-      if (SizeCI->isAllOnesValue())
-        return true;
-      if (isString) {
-        uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp));
-        // If the length is 0 we don't know how long it is and so we can't
-        // remove the check.
-        if (Len == 0) return false;
-        return SizeCI->getZExtValue() >= Len;
-      }
-      if (ConstantInt *Arg = dyn_cast<ConstantInt>(
-                                                  CI->getArgOperand(SizeArgOp)))
-        return SizeCI->getZExtValue() >= Arg->getZExtValue();
+    if (isString) {
+      uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp));
+      // If the length is 0 we don't know how long it is and so we can't
+      // remove the check.
+      if (Len == 0)
+        return false;
+      return SizeCI->getZExtValue() >= Len;
     }
-    return false;
+    if (ConstantInt *Arg = dyn_cast<ConstantInt>(CI->getArgOperand(SizeArgOp)))
+      return SizeCI->getZExtValue() >= Arg->getZExtValue();
   }
-};
+  return false;
+}
 
-struct MemCpyChkOpt : public InstFortifiedLibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    this->CI = CI;
-    FunctionType *FT = Callee->getFunctionType();
-    LLVMContext &Context = CI->getParent()->getContext();
+Value *LibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  LLVMContext &Context = CI->getContext();
 
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        FT->getParamType(2) != DL->getIntPtrType(Context) ||
-        FT->getParamType(3) != DL->getIntPtrType(Context))
+  // Check if this has the right signature.
+  if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy() ||
+      FT->getParamType(2) != DL->getIntPtrType(Context) ||
+      FT->getParamType(3) != DL->getIntPtrType(Context))
+    return nullptr;
+
+  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                   CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  LLVMContext &Context = CI->getContext();
+
+  // Check if this has the right signature.
+  if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy() ||
+      FT->getParamType(2) != DL->getIntPtrType(Context) ||
+      FT->getParamType(3) != DL->getIntPtrType(Context))
+    return nullptr;
+
+  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+    B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+                    CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeMemSetChk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  LLVMContext &Context = CI->getContext();
+
+  // Check if this has the right signature.
+  if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isIntegerTy() ||
+      FT->getParamType(2) != DL->getIntPtrType(Context) ||
+      FT->getParamType(3) != DL->getIntPtrType(Context))
+    return nullptr;
+
+  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+    Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+    B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrCpyChk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  StringRef Name = Callee->getName();
+  FunctionType *FT = Callee->getFunctionType();
+  LLVMContext &Context = CI->getContext();
+
+  // Check if this has the right signature.
+  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+      FT->getParamType(2) != DL->getIntPtrType(Context))
+    return nullptr;
+
+  Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+  if (Dst == Src) // __strcpy_chk(x,x)  -> x
+    return Src;
+
+  // If a) we don't have any length information, or b) we know this will
+  // fit then just lower to a plain strcpy. Otherwise we'll keep our
+  // strcpy_chk call which may fail at runtime if the size is too long.
+  // TODO: It might be nice to get a maximum length out of the possible
+  // string lengths for varying.
+  if (isFortifiedCallFoldable(CI, 2, 1, true)) {
+    Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6));
+    return Ret;
+  } else {
+    // Maybe we can still fold __strcpy_chk to __memcpy_chk.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0)
       return nullptr;
 
-    if (isFoldable(3, 2, false)) {
-      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                     CI->getArgOperand(2), 1);
-      return CI->getArgOperand(0);
-    }
-    return nullptr;
-  }
-};
-
-struct MemMoveChkOpt : public InstFortifiedLibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    this->CI = CI;
-    FunctionType *FT = Callee->getFunctionType();
-    LLVMContext &Context = CI->getParent()->getContext();
-
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        FT->getParamType(2) != DL->getIntPtrType(Context) ||
-        FT->getParamType(3) != DL->getIntPtrType(Context))
+    // This optimization requires DataLayout.
+    if (!DL)
       return nullptr;
 
-    if (isFoldable(3, 2, false)) {
-      B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
-                      CI->getArgOperand(2), 1);
-      return CI->getArgOperand(0);
-    }
-    return nullptr;
+    Value *Ret = EmitMemCpyChk(
+        Dst, Src, ConstantInt::get(DL->getIntPtrType(Context), Len),
+        CI->getArgOperand(2), B, DL, TLI);
+    return Ret;
   }
-};
+  return nullptr;
+}
 
-struct MemSetChkOpt : public InstFortifiedLibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    this->CI = CI;
-    FunctionType *FT = Callee->getFunctionType();
-    LLVMContext &Context = CI->getParent()->getContext();
+Value *LibCallSimplifier::optimizeStpCpyChk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  StringRef Name = Callee->getName();
+  FunctionType *FT = Callee->getFunctionType();
+  LLVMContext &Context = CI->getContext();
 
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isIntegerTy() ||
-        FT->getParamType(2) != DL->getIntPtrType(Context) ||
-        FT->getParamType(3) != DL->getIntPtrType(Context))
+  // Check if this has the right signature.
+  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+      FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0)))
+    return nullptr;
+
+  Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+  if (Dst == Src) { // stpcpy(x,x)  -> x+strlen(x)
+    Value *StrLen = EmitStrLen(Src, B, DL, TLI);
+    return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr;
+  }
+
+  // If a) we don't have any length information, or b) we know this will
+  // fit then just lower to a plain stpcpy. Otherwise we'll keep our
+  // stpcpy_chk call which may fail at runtime if the size is too long.
+  // TODO: It might be nice to get a maximum length out of the possible
+  // string lengths for varying.
+  if (isFortifiedCallFoldable(CI, 2, 1, true)) {
+    Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6));
+    return Ret;
+  } else {
+    // Maybe we can still fold __stpcpy_chk to __memcpy_chk.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0)
       return nullptr;
 
-    if (isFoldable(3, 2, false)) {
-      Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(),
-                                   false);
-      B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
-      return CI->getArgOperand(0);
-    }
-    return nullptr;
-  }
-};
-
-struct StrCpyChkOpt : public InstFortifiedLibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    this->CI = CI;
-    StringRef Name = Callee->getName();
-    FunctionType *FT = Callee->getFunctionType();
-    LLVMContext &Context = CI->getParent()->getContext();
-
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 3 ||
-        FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
-        FT->getParamType(2) != DL->getIntPtrType(Context))
+    // This optimization requires DataLayout.
+    if (!DL)
       return nullptr;
 
-    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
-    if (Dst == Src)      // __strcpy_chk(x,x)  -> x
-      return Src;
-
-    // If a) we don't have any length information, or b) we know this will
-    // fit then just lower to a plain strcpy. Otherwise we'll keep our
-    // strcpy_chk call which may fail at runtime if the size is too long.
-    // TODO: It might be nice to get a maximum length out of the possible
-    // string lengths for varying.
-    if (isFoldable(2, 1, true)) {
-      Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6));
-      return Ret;
-    } else {
-      // Maybe we can stil fold __strcpy_chk to __memcpy_chk.
-      uint64_t Len = GetStringLength(Src);
-      if (Len == 0) return nullptr;
-
-      // This optimization require DataLayout.
-      if (!DL) return nullptr;
-
-      Value *Ret =
-	EmitMemCpyChk(Dst, Src,
-                      ConstantInt::get(DL->getIntPtrType(Context), Len),
-                      CI->getArgOperand(2), B, DL, TLI);
-      return Ret;
-    }
-    return nullptr;
-  }
-};
-
-struct StpCpyChkOpt : public InstFortifiedLibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    this->CI = CI;
-    StringRef Name = Callee->getName();
-    FunctionType *FT = Callee->getFunctionType();
-    LLVMContext &Context = CI->getParent()->getContext();
-
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 3 ||
-        FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
-        FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0)))
+    Type *PT = FT->getParamType(0);
+    Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len);
+    Value *DstEnd =
+        B.CreateGEP(Dst, ConstantInt::get(DL->getIntPtrType(PT), Len - 1));
+    if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, DL, TLI))
       return nullptr;
-
-    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
-    if (Dst == Src) {  // stpcpy(x,x)  -> x+strlen(x)
-      Value *StrLen = EmitStrLen(Src, B, DL, TLI);
-      return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr;
-    }
-
-    // If a) we don't have any length information, or b) we know this will
-    // fit then just lower to a plain stpcpy. Otherwise we'll keep our
-    // stpcpy_chk call which may fail at runtime if the size is too long.
-    // TODO: It might be nice to get a maximum length out of the possible
-    // string lengths for varying.
-    if (isFoldable(2, 1, true)) {
-      Value *Ret = EmitStrCpy(Dst, Src, B, DL, TLI, Name.substr(2, 6));
-      return Ret;
-    } else {
-      // Maybe we can stil fold __stpcpy_chk to __memcpy_chk.
-      uint64_t Len = GetStringLength(Src);
-      if (Len == 0) return nullptr;
-
-      // This optimization require DataLayout.
-      if (!DL) return nullptr;
-
-      Type *PT = FT->getParamType(0);
-      Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len);
-      Value *DstEnd = B.CreateGEP(Dst,
-                                  ConstantInt::get(DL->getIntPtrType(PT),
-                                                   Len - 1));
-      if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, DL, TLI))
-        return nullptr;
-      return DstEnd;
-    }
-    return nullptr;
+    return DstEnd;
   }
-};
+  return nullptr;
+}
 
-struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    this->CI = CI;
-    StringRef Name = Callee->getName();
-    FunctionType *FT = Callee->getFunctionType();
-    LLVMContext &Context = CI->getParent()->getContext();
+Value *LibCallSimplifier::optimizeStrNCpyChk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  StringRef Name = Callee->getName();
+  FunctionType *FT = Callee->getFunctionType();
+  LLVMContext &Context = CI->getContext();
 
-    // Check if this has the right signature.
-    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
-        !FT->getParamType(2)->isIntegerTy() ||
-        FT->getParamType(3) != DL->getIntPtrType(Context))
-      return nullptr;
-
-    if (isFoldable(3, 2, false)) {
-      Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                               CI->getArgOperand(2), B, DL, TLI,
-                               Name.substr(2, 7));
-      return Ret;
-    }
+  // Check if this has the right signature.
+  if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+      !FT->getParamType(2)->isIntegerTy() ||
+      FT->getParamType(3) != DL->getIntPtrType(Context))
     return nullptr;
+
+  if (isFortifiedCallFoldable(CI, 3, 2, false)) {
+    Value *Ret =
+        EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                    CI->getArgOperand(2), B, DL, TLI, Name.substr(2, 7));
+    return Ret;
   }
-};
+  return nullptr;
+}
 
 //===----------------------------------------------------------------------===//
 // String and Memory Library Call Optimizations
 //===----------------------------------------------------------------------===//
 
-struct StrCatOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Verify the "strcat" function prototype.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        FT->getReturnType() != B.getInt8PtrTy() ||
-        FT->getParamType(0) != FT->getReturnType() ||
-        FT->getParamType(1) != FT->getReturnType())
+Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Verify the "strcat" function prototype.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2||
+      FT->getReturnType() != B.getInt8PtrTy() ||
+      FT->getParamType(0) != FT->getReturnType() ||
+      FT->getParamType(1) != FT->getReturnType())
+    return nullptr;
+
+  // Extract some information from the instruction
+  Value *Dst = CI->getArgOperand(0);
+  Value *Src = CI->getArgOperand(1);
+
+  // See if we can get the length of the input string.
+  uint64_t Len = GetStringLength(Src);
+  if (Len == 0)
+    return nullptr;
+  --Len; // Unbias length.
+
+  // Handle the simple, do-nothing case: strcat(x, "") -> x
+  if (Len == 0)
+    return Dst;
+
+  // These optimizations require DataLayout.
+  if (!DL)
+    return nullptr;
+
+  return emitStrLenMemCpy(Src, Dst, Len, B);
+}
+
+Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
+                                           IRBuilder<> &B) {
+  // We need to find the end of the destination string.  That's where the
+  // memory is to be moved to. We just generate a call to strlen.
+  Value *DstLen = EmitStrLen(Dst, B, DL, TLI);
+  if (!DstLen)
+    return nullptr;
+
+  // Now that we have the destination's length, we must index into the
+  // destination's pointer to get the actual memcpy destination (end of
+  // the string .. we're concatenating).
+  Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr");
+
+  // We have enough information to now generate the memcpy call to do the
+  // concatenation for us.  Make a memcpy to copy the nul byte with align = 1.
+  B.CreateMemCpy(
+      CpyDst, Src,
+      ConstantInt::get(DL->getIntPtrType(Src->getContext()), Len + 1), 1);
+  return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Verify the "strncat" function prototype.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 3 || FT->getReturnType() != B.getInt8PtrTy() ||
+      FT->getParamType(0) != FT->getReturnType() ||
+      FT->getParamType(1) != FT->getReturnType() ||
+      !FT->getParamType(2)->isIntegerTy())
+    return nullptr;
+
+  // Extract some information from the instruction
+  Value *Dst = CI->getArgOperand(0);
+  Value *Src = CI->getArgOperand(1);
+  uint64_t Len;
+
+  // We don't do anything if length is not constant
+  if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+    Len = LengthArg->getZExtValue();
+  else
+    return nullptr;
+
+  // See if we can get the length of the input string.
+  uint64_t SrcLen = GetStringLength(Src);
+  if (SrcLen == 0)
+    return nullptr;
+  --SrcLen; // Unbias length.
+
+  // Handle the simple, do-nothing cases:
+  // strncat(x, "", c) -> x
+  // strncat(x,  c, 0) -> x
+  if (SrcLen == 0 || Len == 0)
+    return Dst;
+
+  // These optimizations require DataLayout.
+  if (!DL)
+    return nullptr;
+
+  // We don't optimize this case
+  if (Len < SrcLen)
+    return nullptr;
+
+  // strncat(x, s, c) -> strcat(x, s)
+  // s is constant so the strcat can be optimized further
+  return emitStrLenMemCpy(Src, Dst, SrcLen, B);
+}
+
+Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Verify the "strchr" function prototype.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() ||
+      FT->getParamType(0) != FT->getReturnType() ||
+      !FT->getParamType(1)->isIntegerTy(32))
+    return nullptr;
+
+  Value *SrcStr = CI->getArgOperand(0);
+
+  // If the second operand is non-constant, see if we can compute the length
+  // of the input string and turn this into memchr.
+  ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+  if (!CharC) {
+    // These optimizations require DataLayout.
+    if (!DL)
       return nullptr;
 
-    // Extract some information from the instruction
-    Value *Dst = CI->getArgOperand(0);
-    Value *Src = CI->getArgOperand(1);
+    uint64_t Len = GetStringLength(SrcStr);
+    if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32.
+      return nullptr;
 
-    // See if we can get the length of the input string.
-    uint64_t Len = GetStringLength(Src);
-    if (Len == 0) return nullptr;
-    --Len;  // Unbias length.
-
-    // Handle the simple, do-nothing case: strcat(x, "") -> x
-    if (Len == 0)
-      return Dst;
-
-    // These optimizations require DataLayout.
-    if (!DL) return nullptr;
-
-    return emitStrLenMemCpy(Src, Dst, Len, B);
+    return EmitMemChr(
+        SrcStr, CI->getArgOperand(1), // include nul.
+        ConstantInt::get(DL->getIntPtrType(CI->getContext()), Len), B, DL, TLI);
   }
 
-  Value *emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
-                          IRBuilder<> &B) {
-    // We need to find the end of the destination string.  That's where the
-    // memory is to be moved to. We just generate a call to strlen.
-    Value *DstLen = EmitStrLen(Dst, B, DL, TLI);
-    if (!DstLen)
+  // Otherwise, the character is a constant, see if the first argument is
+  // a string literal.  If so, we can constant fold.
+  StringRef Str;
+  if (!getConstantStringInfo(SrcStr, Str)) {
+    if (DL && CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
+      return B.CreateGEP(SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr");
+    return nullptr;
+  }
+
+  // Compute the offset, make sure to handle the case when we're searching for
+  // zero (a weird way to spell strlen).
+  size_t I = (0xFF & CharC->getSExtValue()) == 0
+                 ? Str.size()
+                 : Str.find(CharC->getSExtValue());
+  if (I == StringRef::npos) // Didn't find the char.  strchr returns null.
+    return Constant::getNullValue(CI->getType());
+
+  // strchr(s+n,c)  -> gep(s+n+i,c)
+  return B.CreateGEP(SrcStr, B.getInt64(I), "strchr");
+}
+
+Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Verify the "strrchr" function prototype.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() ||
+      FT->getParamType(0) != FT->getReturnType() ||
+      !FT->getParamType(1)->isIntegerTy(32))
+    return nullptr;
+
+  Value *SrcStr = CI->getArgOperand(0);
+  ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+
+  // Cannot fold anything if we're not looking for a constant.
+  if (!CharC)
+    return nullptr;
+
+  StringRef Str;
+  if (!getConstantStringInfo(SrcStr, Str)) {
+    // strrchr(s, 0) -> strchr(s, 0)
+    if (DL && CharC->isZero())
+      return EmitStrChr(SrcStr, '\0', B, DL, TLI);
+    return nullptr;
+  }
+
+  // Compute the offset.
+  size_t I = (0xFF & CharC->getSExtValue()) == 0
+                 ? Str.size()
+                 : Str.rfind(CharC->getSExtValue());
+  if (I == StringRef::npos) // Didn't find the char. Return null.
+    return Constant::getNullValue(CI->getType());
+
+  // strrchr(s+n,c) -> gep(s+n+i,c)
+  return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr");
+}
+
+Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Verify the "strcmp" function prototype.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || !FT->getReturnType()->isIntegerTy(32) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      FT->getParamType(0) != B.getInt8PtrTy())
+    return nullptr;
+
+  Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+  if (Str1P == Str2P) // strcmp(x,x)  -> 0
+    return ConstantInt::get(CI->getType(), 0);
+
+  StringRef Str1, Str2;
+  bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+  bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+  // strcmp(x, y)  -> cnst  (if both x and y are constant strings)
+  if (HasStr1 && HasStr2)
+    return ConstantInt::get(CI->getType(), Str1.compare(Str2));
+
+  if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
+    return B.CreateNeg(
+        B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()));
+
+  if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+    return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+  // strcmp(P, "x") -> memcmp(P, "x", 2)
+  uint64_t Len1 = GetStringLength(Str1P);
+  uint64_t Len2 = GetStringLength(Str2P);
+  if (Len1 && Len2) {
+    // These optimizations require DataLayout.
+    if (!DL)
       return nullptr;
 
-    // Now that we have the destination's length, we must index into the
-    // destination's pointer to get the actual memcpy destination (end of
-    // the string .. we're concatenating).
-    Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr");
+    return EmitMemCmp(Str1P, Str2P,
+                      ConstantInt::get(DL->getIntPtrType(CI->getContext()),
+                                       std::min(Len1, Len2)),
+                      B, DL, TLI);
+  }
 
-    // We have enough information to now generate the memcpy call to do the
-    // concatenation for us.  Make a memcpy to copy the nul byte with align = 1.
-    B.CreateMemCpy(CpyDst, Src,
-                   ConstantInt::get(DL->getIntPtrType(*Context), Len + 1), 1);
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Verify the "strncmp" function prototype.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 3 || !FT->getReturnType()->isIntegerTy(32) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      FT->getParamType(0) != B.getInt8PtrTy() ||
+      !FT->getParamType(2)->isIntegerTy())
+    return nullptr;
+
+  Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+  if (Str1P == Str2P) // strncmp(x,x,n)  -> 0
+    return ConstantInt::get(CI->getType(), 0);
+
+  // Get the length argument if it is constant.
+  uint64_t Length;
+  if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+    Length = LengthArg->getZExtValue();
+  else
+    return nullptr;
+
+  if (Length == 0) // strncmp(x,y,0)   -> 0
+    return ConstantInt::get(CI->getType(), 0);
+
+  if (DL && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
+    return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI);
+
+  StringRef Str1, Str2;
+  bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+  bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+  // strncmp(x, y)  -> cnst  (if both x and y are constant strings)
+  if (HasStr1 && HasStr2) {
+    StringRef SubStr1 = Str1.substr(0, Length);
+    StringRef SubStr2 = Str2.substr(0, Length);
+    return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
+  }
+
+  if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x
+    return B.CreateNeg(
+        B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()));
+
+  if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
+    return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Verify the "strcpy" function prototype.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      FT->getParamType(0) != B.getInt8PtrTy())
+    return nullptr;
+
+  Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+  if (Dst == Src) // strcpy(x,x)  -> x
+    return Src;
+
+  // These optimizations require DataLayout.
+  if (!DL)
+    return nullptr;
+
+  // See if we can get the length of the input string.
+  uint64_t Len = GetStringLength(Src);
+  if (Len == 0)
+    return nullptr;
+
+  // We have enough information to now generate the memcpy call to do the
+  // copy for us.  Make a memcpy to copy the nul byte with align = 1.
+  B.CreateMemCpy(Dst, Src,
+                 ConstantInt::get(DL->getIntPtrType(CI->getContext()), Len), 1);
+  return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Verify the "stpcpy" function prototype.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      FT->getParamType(0) != B.getInt8PtrTy())
+    return nullptr;
+
+  // These optimizations require DataLayout.
+  if (!DL)
+    return nullptr;
+
+  Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+  if (Dst == Src) { // stpcpy(x,x)  -> x+strlen(x)
+    Value *StrLen = EmitStrLen(Src, B, DL, TLI);
+    return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr;
+  }
+
+  // See if we can get the length of the input string.
+  uint64_t Len = GetStringLength(Src);
+  if (Len == 0)
+    return nullptr;
+
+  Type *PT = FT->getParamType(0);
+  Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len);
+  Value *DstEnd =
+      B.CreateGEP(Dst, ConstantInt::get(DL->getIntPtrType(PT), Len - 1));
+
+  // We have enough information to now generate the memcpy call to do the
+  // copy for us.  Make a memcpy to copy the nul byte with align = 1.
+  B.CreateMemCpy(Dst, Src, LenV, 1);
+  return DstEnd;
+}
+
+Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      FT->getParamType(0) != B.getInt8PtrTy() ||
+      !FT->getParamType(2)->isIntegerTy())
+    return nullptr;
+
+  Value *Dst = CI->getArgOperand(0);
+  Value *Src = CI->getArgOperand(1);
+  Value *LenOp = CI->getArgOperand(2);
+
+  // See if we can get the length of the input string.
+  uint64_t SrcLen = GetStringLength(Src);
+  if (SrcLen == 0)
+    return nullptr;
+  --SrcLen;
+
+  if (SrcLen == 0) {
+    // strncpy(x, "", y) -> memset(x, '\0', y, 1)
+    B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);
     return Dst;
   }
-};
 
-struct StrNCatOpt : public StrCatOpt {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Verify the "strncat" function prototype.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 ||
-        FT->getReturnType() != B.getInt8PtrTy() ||
-        FT->getParamType(0) != FT->getReturnType() ||
-        FT->getParamType(1) != FT->getReturnType() ||
-        !FT->getParamType(2)->isIntegerTy())
-      return nullptr;
+  uint64_t Len;
+  if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp))
+    Len = LengthArg->getZExtValue();
+  else
+    return nullptr;
 
-    // Extract some information from the instruction
-    Value *Dst = CI->getArgOperand(0);
-    Value *Src = CI->getArgOperand(1);
-    uint64_t Len;
+  if (Len == 0)
+    return Dst; // strncpy(x, y, 0) -> x
 
-    // We don't do anything if length is not constant
-    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
-      Len = LengthArg->getZExtValue();
-    else
-      return nullptr;
+  // These optimizations require DataLayout.
+  if (!DL)
+    return nullptr;
 
-    // See if we can get the length of the input string.
-    uint64_t SrcLen = GetStringLength(Src);
-    if (SrcLen == 0) return nullptr;
-    --SrcLen;  // Unbias length.
+  // Let strncpy handle the zero padding
+  if (Len > SrcLen + 1)
+    return nullptr;
 
-    // Handle the simple, do-nothing cases:
-    // strncat(x, "", c) -> x
-    // strncat(x,  c, 0) -> x
-    if (SrcLen == 0 || Len == 0) return Dst;
+  Type *PT = FT->getParamType(0);
+  // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
+  B.CreateMemCpy(Dst, Src, ConstantInt::get(DL->getIntPtrType(PT), Len), 1);
 
-    // These optimizations require DataLayout.
-    if (!DL) return nullptr;
+  return Dst;
+}
 
-    // We don't optimize this case
-    if (Len < SrcLen) return nullptr;
+Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 1 || FT->getParamType(0) != B.getInt8PtrTy() ||
+      !FT->getReturnType()->isIntegerTy())
+    return nullptr;
 
-    // strncat(x, s, c) -> strcat(x, s)
-    // s is constant so the strcat can be optimized further
-    return emitStrLenMemCpy(Src, Dst, SrcLen, B);
+  Value *Src = CI->getArgOperand(0);
+
+  // Constant folding: strlen("xyz") -> 3
+  if (uint64_t Len = GetStringLength(Src))
+    return ConstantInt::get(CI->getType(), Len - 1);
+
+  // strlen(x?"foo":"bars") --> x ? 3 : 4
+  if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
+    uint64_t LenTrue = GetStringLength(SI->getTrueValue());
+    uint64_t LenFalse = GetStringLength(SI->getFalseValue());
+    if (LenTrue && LenFalse) {
+      Function *Caller = CI->getParent()->getParent();
+      emitOptimizationRemark(CI->getContext(), "simplify-libcalls", *Caller,
+                             SI->getDebugLoc(),
+                             "folded strlen(select) to select of constants");
+      return B.CreateSelect(SI->getCondition(),
+                            ConstantInt::get(CI->getType(), LenTrue - 1),
+                            ConstantInt::get(CI->getType(), LenFalse - 1));
+    }
   }
-};
 
-struct StrChrOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Verify the "strchr" function prototype.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        FT->getReturnType() != B.getInt8PtrTy() ||
-        FT->getParamType(0) != FT->getReturnType() ||
-        !FT->getParamType(1)->isIntegerTy(32))
-      return nullptr;
+  // strlen(x) != 0 --> *x != 0
+  // strlen(x) == 0 --> *x == 0
+  if (isOnlyUsedInZeroEqualityComparison(CI))
+    return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
 
-    Value *SrcStr = CI->getArgOperand(0);
+  return nullptr;
+}
 
-    // If the second operand is non-constant, see if we can compute the length
-    // of the input string and turn this into memchr.
-    ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
-    if (!CharC) {
-      // These optimizations require DataLayout.
-      if (!DL) return nullptr;
+Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() ||
+      FT->getParamType(1) != FT->getParamType(0) ||
+      FT->getReturnType() != FT->getParamType(0))
+    return nullptr;
 
-      uint64_t Len = GetStringLength(SrcStr);
-      if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32.
-        return nullptr;
+  StringRef S1, S2;
+  bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+  bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
 
-      return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
-                        ConstantInt::get(DL->getIntPtrType(*Context), Len),
-                        B, DL, TLI);
-    }
+  // strpbrk(s, "") -> nullptr
+  // strpbrk("", s) -> nullptr
+  if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+    return Constant::getNullValue(CI->getType());
 
-    // Otherwise, the character is a constant, see if the first argument is
-    // a string literal.  If so, we can constant fold.
-    StringRef Str;
-    if (!getConstantStringInfo(SrcStr, Str)) {
-      if (DL && CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
-        return B.CreateGEP(SrcStr, EmitStrLen(SrcStr, B, DL, TLI), "strchr");
-      return nullptr;
-    }
-
-    // Compute the offset, make sure to handle the case when we're searching for
-    // zero (a weird way to spell strlen).
-    size_t I = (0xFF & CharC->getSExtValue()) == 0 ?
-        Str.size() : Str.find(CharC->getSExtValue());
-    if (I == StringRef::npos) // Didn't find the char.  strchr returns null.
+  // Constant folding.
+  if (HasS1 && HasS2) {
+    size_t I = S1.find_first_of(S2);
+    if (I == StringRef::npos) // No match.
       return Constant::getNullValue(CI->getType());
 
-    // strchr(s+n,c)  -> gep(s+n+i,c)
-    return B.CreateGEP(SrcStr, B.getInt64(I), "strchr");
+    return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk");
   }
-};
 
-struct StrRChrOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Verify the "strrchr" function prototype.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        FT->getReturnType() != B.getInt8PtrTy() ||
-        FT->getParamType(0) != FT->getReturnType() ||
-        !FT->getParamType(1)->isIntegerTy(32))
+  // strpbrk(s, "a") -> strchr(s, 'a')
+  if (DL && HasS2 && S2.size() == 1)
+    return EmitStrChr(CI->getArgOperand(0), S2[0], B, DL, TLI);
+
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
+      !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy())
+    return nullptr;
+
+  Value *EndPtr = CI->getArgOperand(1);
+  if (isa<ConstantPointerNull>(EndPtr)) {
+    // With a null EndPtr, this function won't capture the main argument.
+    // It would be readonly too, except that it still may write to errno.
+    CI->addAttribute(1, Attribute::NoCapture);
+  }
+
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() ||
+      FT->getParamType(1) != FT->getParamType(0) ||
+      !FT->getReturnType()->isIntegerTy())
+    return nullptr;
+
+  StringRef S1, S2;
+  bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+  bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+  // strspn(s, "") -> 0
+  // strspn("", s) -> 0
+  if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+    return Constant::getNullValue(CI->getType());
+
+  // Constant folding.
+  if (HasS1 && HasS2) {
+    size_t Pos = S1.find_first_not_of(S2);
+    if (Pos == StringRef::npos)
+      Pos = S1.size();
+    return ConstantInt::get(CI->getType(), Pos);
+  }
+
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() ||
+      FT->getParamType(1) != FT->getParamType(0) ||
+      !FT->getReturnType()->isIntegerTy())
+    return nullptr;
+
+  StringRef S1, S2;
+  bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+  bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+  // strcspn("", s) -> 0
+  if (HasS1 && S1.empty())
+    return Constant::getNullValue(CI->getType());
+
+  // Constant folding.
+  if (HasS1 && HasS2) {
+    size_t Pos = S1.find_first_of(S2);
+    if (Pos == StringRef::npos)
+      Pos = S1.size();
+    return ConstantInt::get(CI->getType(), Pos);
+  }
+
+  // strcspn(s, "") -> strlen(s)
+  if (DL && HasS2 && S2.empty())
+    return EmitStrLen(CI->getArgOperand(0), B, DL, TLI);
+
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy() ||
+      !FT->getReturnType()->isPointerTy())
+    return nullptr;
+
+  // fold strstr(x, x) -> x.
+  if (CI->getArgOperand(0) == CI->getArgOperand(1))
+    return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+  // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
+  if (DL && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+    Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, DL, TLI);
+    if (!StrLen)
       return nullptr;
-
-    Value *SrcStr = CI->getArgOperand(0);
-    ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
-
-    // Cannot fold anything if we're not looking for a constant.
-    if (!CharC)
+    Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+                                 StrLen, B, DL, TLI);
+    if (!StrNCmp)
       return nullptr;
-
-    StringRef Str;
-    if (!getConstantStringInfo(SrcStr, Str)) {
-      // strrchr(s, 0) -> strchr(s, 0)
-      if (DL && CharC->isZero())
-        return EmitStrChr(SrcStr, '\0', B, DL, TLI);
-      return nullptr;
+    for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) {
+      ICmpInst *Old = cast<ICmpInst>(*UI++);
+      Value *Cmp =
+          B.CreateICmp(Old->getPredicate(), StrNCmp,
+                       ConstantInt::getNullValue(StrNCmp->getType()), "cmp");
+      replaceAllUsesWith(Old, Cmp);
     }
+    return CI;
+  }
 
-    // Compute the offset.
-    size_t I = (0xFF & CharC->getSExtValue()) == 0 ?
-        Str.size() : Str.rfind(CharC->getSExtValue());
-    if (I == StringRef::npos) // Didn't find the char. Return null.
+  // See if either input string is a constant string.
+  StringRef SearchStr, ToFindStr;
+  bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
+  bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
+
+  // fold strstr(x, "") -> x.
+  if (HasStr2 && ToFindStr.empty())
+    return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+  // If both strings are known, constant fold it.
+  if (HasStr1 && HasStr2) {
+    size_t Offset = SearchStr.find(ToFindStr);
+
+    if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
       return Constant::getNullValue(CI->getType());
 
-    // strrchr(s+n,c) -> gep(s+n+i,c)
-    return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr");
+    // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
+    Value *Result = CastToCStr(CI->getArgOperand(0), B);
+    Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
+    return B.CreateBitCast(Result, CI->getType());
   }
-};
 
-struct StrCmpOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Verify the "strcmp" function prototype.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        !FT->getReturnType()->isIntegerTy(32) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != B.getInt8PtrTy())
-      return nullptr;
+  // fold strstr(x, "y") -> strchr(x, 'y').
+  if (HasStr2 && ToFindStr.size() == 1) {
+    Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, DL, TLI);
+    return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr;
+  }
+  return nullptr;
+}
 
-    Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
-    if (Str1P == Str2P)      // strcmp(x,x)  -> 0
-      return ConstantInt::get(CI->getType(), 0);
-
-    StringRef Str1, Str2;
-    bool HasStr1 = getConstantStringInfo(Str1P, Str1);
-    bool HasStr2 = getConstantStringInfo(Str2P, Str2);
-
-    // strcmp(x, y)  -> cnst  (if both x and y are constant strings)
-    if (HasStr1 && HasStr2)
-      return ConstantInt::get(CI->getType(), Str1.compare(Str2));
-
-    if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
-      return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
-                                      CI->getType()));
-
-    if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
-      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
-
-    // strcmp(P, "x") -> memcmp(P, "x", 2)
-    uint64_t Len1 = GetStringLength(Str1P);
-    uint64_t Len2 = GetStringLength(Str2P);
-    if (Len1 && Len2) {
-      // These optimizations require DataLayout.
-      if (!DL) return nullptr;
-
-      return EmitMemCmp(Str1P, Str2P,
-                        ConstantInt::get(DL->getIntPtrType(*Context),
-                        std::min(Len1, Len2)), B, DL, TLI);
-    }
-
+Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy() ||
+      !FT->getReturnType()->isIntegerTy(32))
     return nullptr;
-  }
-};
 
-struct StrNCmpOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Verify the "strncmp" function prototype.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 ||
-        !FT->getReturnType()->isIntegerTy(32) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != B.getInt8PtrTy() ||
-        !FT->getParamType(2)->isIntegerTy())
-      return nullptr;
+  Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
 
-    Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
-    if (Str1P == Str2P)      // strncmp(x,x,n)  -> 0
-      return ConstantInt::get(CI->getType(), 0);
+  if (LHS == RHS) // memcmp(s,s,x) -> 0
+    return Constant::getNullValue(CI->getType());
 
-    // Get the length argument if it is constant.
-    uint64_t Length;
-    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
-      Length = LengthArg->getZExtValue();
-    else
-      return nullptr;
-
-    if (Length == 0) // strncmp(x,y,0)   -> 0
-      return ConstantInt::get(CI->getType(), 0);
-
-    if (DL && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
-      return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI);
-
-    StringRef Str1, Str2;
-    bool HasStr1 = getConstantStringInfo(Str1P, Str1);
-    bool HasStr2 = getConstantStringInfo(Str2P, Str2);
-
-    // strncmp(x, y)  -> cnst  (if both x and y are constant strings)
-    if (HasStr1 && HasStr2) {
-      StringRef SubStr1 = Str1.substr(0, Length);
-      StringRef SubStr2 = Str2.substr(0, Length);
-      return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
-    }
-
-    if (HasStr1 && Str1.empty())  // strncmp("", x, n) -> -*x
-      return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
-                                      CI->getType()));
-
-    if (HasStr2 && Str2.empty())  // strncmp(x, "", n) -> *x
-      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
-
+  // Make sure we have a constant length.
+  ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!LenC)
     return nullptr;
+  uint64_t Len = LenC->getZExtValue();
+
+  if (Len == 0) // memcmp(s1,s2,0) -> 0
+    return Constant::getNullValue(CI->getType());
+
+  // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
+  if (Len == 1) {
+    Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
+                               CI->getType(), "lhsv");
+    Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
+                               CI->getType(), "rhsv");
+    return B.CreateSub(LHSV, RHSV, "chardiff");
   }
-};
 
-struct StrCpyOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Verify the "strcpy" function prototype.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != B.getInt8PtrTy())
+  // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
+  StringRef LHSStr, RHSStr;
+  if (getConstantStringInfo(LHS, LHSStr) &&
+      getConstantStringInfo(RHS, RHSStr)) {
+    // Make sure we're not reading out-of-bounds memory.
+    if (Len > LHSStr.size() || Len > RHSStr.size())
       return nullptr;
-
-    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
-    if (Dst == Src)      // strcpy(x,x)  -> x
-      return Src;
-
-    // These optimizations require DataLayout.
-    if (!DL) return nullptr;
-
-    // See if we can get the length of the input string.
-    uint64_t Len = GetStringLength(Src);
-    if (Len == 0) return nullptr;
-
-    // We have enough information to now generate the memcpy call to do the
-    // copy for us.  Make a memcpy to copy the nul byte with align = 1.
-    B.CreateMemCpy(Dst, Src,
-		   ConstantInt::get(DL->getIntPtrType(*Context), Len), 1);
-    return Dst;
+    // Fold the memcmp and normalize the result.  This way we get consistent
+    // results across multiple platforms.
+    uint64_t Ret = 0;
+    int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
+    if (Cmp < 0)
+      Ret = -1;
+    else if (Cmp > 0)
+      Ret = 1;
+    return ConstantInt::get(CI->getType(), Ret);
   }
-};
 
-struct StpCpyOpt: public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Verify the "stpcpy" function prototype.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != B.getInt8PtrTy())
-      return nullptr;
+  return nullptr;
+}
 
-    // These optimizations require DataLayout.
-    if (!DL) return nullptr;
-
-    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
-    if (Dst == Src) {  // stpcpy(x,x)  -> x+strlen(x)
-      Value *StrLen = EmitStrLen(Src, B, DL, TLI);
-      return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : nullptr;
-    }
-
-    // See if we can get the length of the input string.
-    uint64_t Len = GetStringLength(Src);
-    if (Len == 0) return nullptr;
-
-    Type *PT = FT->getParamType(0);
-    Value *LenV = ConstantInt::get(DL->getIntPtrType(PT), Len);
-    Value *DstEnd = B.CreateGEP(Dst,
-                                ConstantInt::get(DL->getIntPtrType(PT),
-                                                 Len - 1));
-
-    // We have enough information to now generate the memcpy call to do the
-    // copy for us.  Make a memcpy to copy the nul byte with align = 1.
-    B.CreateMemCpy(Dst, Src, LenV, 1);
-    return DstEnd;
-  }
-};
-
-struct StrNCpyOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        FT->getParamType(0) != B.getInt8PtrTy() ||
-        !FT->getParamType(2)->isIntegerTy())
-      return nullptr;
-
-    Value *Dst = CI->getArgOperand(0);
-    Value *Src = CI->getArgOperand(1);
-    Value *LenOp = CI->getArgOperand(2);
-
-    // See if we can get the length of the input string.
-    uint64_t SrcLen = GetStringLength(Src);
-    if (SrcLen == 0) return nullptr;
-    --SrcLen;
-
-    if (SrcLen == 0) {
-      // strncpy(x, "", y) -> memset(x, '\0', y, 1)
-      B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);
-      return Dst;
-    }
-
-    uint64_t Len;
-    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp))
-      Len = LengthArg->getZExtValue();
-    else
-      return nullptr;
-
-    if (Len == 0) return Dst; // strncpy(x, y, 0) -> x
-
-    // These optimizations require DataLayout.
-    if (!DL) return nullptr;
-
-    // Let strncpy handle the zero padding
-    if (Len > SrcLen+1) return nullptr;
-
-    Type *PT = FT->getParamType(0);
-    // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
-    B.CreateMemCpy(Dst, Src,
-                   ConstantInt::get(DL->getIntPtrType(PT), Len), 1);
-
-    return Dst;
-  }
-};
-
-struct StrLenOpt : public LibCallOptimization {
-  bool ignoreCallingConv() override { return true; }
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 1 ||
-        FT->getParamType(0) != B.getInt8PtrTy() ||
-        !FT->getReturnType()->isIntegerTy())
-      return nullptr;
-
-    Value *Src = CI->getArgOperand(0);
-
-    // Constant folding: strlen("xyz") -> 3
-    if (uint64_t Len = GetStringLength(Src))
-      return ConstantInt::get(CI->getType(), Len-1);
-
-    // strlen(x?"foo":"bars") --> x ? 3 : 4
-    if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
-      uint64_t LenTrue = GetStringLength(SI->getTrueValue());
-      uint64_t LenFalse = GetStringLength(SI->getFalseValue());
-      if (LenTrue && LenFalse) {
-        emitOptimizationRemark(*Context, "simplify-libcalls", *Caller,
-                               SI->getDebugLoc(),
-                               "folded strlen(select) to select of constants");
-        return B.CreateSelect(SI->getCondition(),
-                              ConstantInt::get(CI->getType(), LenTrue-1),
-                              ConstantInt::get(CI->getType(), LenFalse-1));
-      }
-    }
-
-    // strlen(x) != 0 --> *x != 0
-    // strlen(x) == 0 --> *x == 0
-    if (isOnlyUsedInZeroEqualityComparison(CI))
-      return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
-
+Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // These optimizations require DataLayout.
+  if (!DL)
     return nullptr;
-  }
-};
 
-struct StrPBrkOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        FT->getParamType(0) != B.getInt8PtrTy() ||
-        FT->getParamType(1) != FT->getParamType(0) ||
-        FT->getReturnType() != FT->getParamType(0))
-      return nullptr;
-
-    StringRef S1, S2;
-    bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
-    bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
-    // strpbrk(s, "") -> NULL
-    // strpbrk("", s) -> NULL
-    if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
-      return Constant::getNullValue(CI->getType());
-
-    // Constant folding.
-    if (HasS1 && HasS2) {
-      size_t I = S1.find_first_of(S2);
-      if (I == StringRef::npos) // No match.
-        return Constant::getNullValue(CI->getType());
-
-      return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk");
-    }
-
-    // strpbrk(s, "a") -> strchr(s, 'a')
-    if (DL && HasS2 && S2.size() == 1)
-      return EmitStrChr(CI->getArgOperand(0), S2[0], B, DL, TLI);
-
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy() ||
+      FT->getParamType(2) != DL->getIntPtrType(CI->getContext()))
     return nullptr;
-  }
-};
 
-struct StrToOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy())
-      return nullptr;
+  // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+  B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                 CI->getArgOperand(2), 1);
+  return CI->getArgOperand(0);
+}
 
-    Value *EndPtr = CI->getArgOperand(1);
-    if (isa<ConstantPointerNull>(EndPtr)) {
-      // With a null EndPtr, this function won't capture the main argument.
-      // It would be readonly too, except that it still may write to errno.
-      CI->addAttribute(1, Attribute::NoCapture);
-    }
-
+Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // These optimizations require DataLayout.
+  if (!DL)
     return nullptr;
-  }
-};
 
-struct StrSpnOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        FT->getParamType(0) != B.getInt8PtrTy() ||
-        FT->getParamType(1) != FT->getParamType(0) ||
-        !FT->getReturnType()->isIntegerTy())
-      return nullptr;
-
-    StringRef S1, S2;
-    bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
-    bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
-    // strspn(s, "") -> 0
-    // strspn("", s) -> 0
-    if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
-      return Constant::getNullValue(CI->getType());
-
-    // Constant folding.
-    if (HasS1 && HasS2) {
-      size_t Pos = S1.find_first_not_of(S2);
-      if (Pos == StringRef::npos) Pos = S1.size();
-      return ConstantInt::get(CI->getType(), Pos);
-    }
-
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy() ||
+      FT->getParamType(2) != DL->getIntPtrType(CI->getContext()))
     return nullptr;
-  }
-};
 
-struct StrCSpnOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        FT->getParamType(0) != B.getInt8PtrTy() ||
-        FT->getParamType(1) != FT->getParamType(0) ||
-        !FT->getReturnType()->isIntegerTy())
-      return nullptr;
+  // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
+  B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+                  CI->getArgOperand(2), 1);
+  return CI->getArgOperand(0);
+}
 
-    StringRef S1, S2;
-    bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
-    bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
-    // strcspn("", s) -> 0
-    if (HasS1 && S1.empty())
-      return Constant::getNullValue(CI->getType());
-
-    // Constant folding.
-    if (HasS1 && HasS2) {
-      size_t Pos = S1.find_first_of(S2);
-      if (Pos == StringRef::npos) Pos = S1.size();
-      return ConstantInt::get(CI->getType(), Pos);
-    }
-
-    // strcspn(s, "") -> strlen(s)
-    if (DL && HasS2 && S2.empty())
-      return EmitStrLen(CI->getArgOperand(0), B, DL, TLI);
-
+Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // These optimizations require DataLayout.
+  if (!DL)
     return nullptr;
-  }
-};
 
-struct StrStrOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        !FT->getReturnType()->isPointerTy())
-      return nullptr;
-
-    // fold strstr(x, x) -> x.
-    if (CI->getArgOperand(0) == CI->getArgOperand(1))
-      return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
-    // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
-    if (DL && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
-      Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, DL, TLI);
-      if (!StrLen)
-        return nullptr;
-      Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
-                                   StrLen, B, DL, TLI);
-      if (!StrNCmp)
-        return nullptr;
-      for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) {
-        ICmpInst *Old = cast<ICmpInst>(*UI++);
-        Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp,
-                                  ConstantInt::getNullValue(StrNCmp->getType()),
-                                  "cmp");
-        LCS->replaceAllUsesWith(Old, Cmp);
-      }
-      return CI;
-    }
-
-    // See if either input string is a constant string.
-    StringRef SearchStr, ToFindStr;
-    bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
-    bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
-
-    // fold strstr(x, "") -> x.
-    if (HasStr2 && ToFindStr.empty())
-      return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
-    // If both strings are known, constant fold it.
-    if (HasStr1 && HasStr2) {
-      size_t Offset = SearchStr.find(ToFindStr);
-
-      if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
-        return Constant::getNullValue(CI->getType());
-
-      // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
-      Value *Result = CastToCStr(CI->getArgOperand(0), B);
-      Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
-      return B.CreateBitCast(Result, CI->getType());
-    }
-
-    // fold strstr(x, "y") -> strchr(x, 'y').
-    if (HasStr2 && ToFindStr.size() == 1) {
-      Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, DL, TLI);
-      return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr;
-    }
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isIntegerTy() ||
+      FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0)))
     return nullptr;
-  }
-};
 
-struct MemCmpOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        !FT->getReturnType()->isIntegerTy(32))
-      return nullptr;
-
-    Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
-
-    if (LHS == RHS)  // memcmp(s,s,x) -> 0
-      return Constant::getNullValue(CI->getType());
-
-    // Make sure we have a constant length.
-    ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-    if (!LenC) return nullptr;
-    uint64_t Len = LenC->getZExtValue();
-
-    if (Len == 0) // memcmp(s1,s2,0) -> 0
-      return Constant::getNullValue(CI->getType());
-
-    // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
-    if (Len == 1) {
-      Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
-                                 CI->getType(), "lhsv");
-      Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
-                                 CI->getType(), "rhsv");
-      return B.CreateSub(LHSV, RHSV, "chardiff");
-    }
-
-    // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
-    StringRef LHSStr, RHSStr;
-    if (getConstantStringInfo(LHS, LHSStr) &&
-        getConstantStringInfo(RHS, RHSStr)) {
-      // Make sure we're not reading out-of-bounds memory.
-      if (Len > LHSStr.size() || Len > RHSStr.size())
-        return nullptr;
-      // Fold the memcmp and normalize the result.  This way we get consistent
-      // results across multiple platforms.
-      uint64_t Ret = 0;
-      int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
-      if (Cmp < 0)
-        Ret = -1;
-      else if (Cmp > 0)
-        Ret = 1;
-      return ConstantInt::get(CI->getType(), Ret);
-    }
-
-    return nullptr;
-  }
-};
-
-struct MemCpyOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // These optimizations require DataLayout.
-    if (!DL) return nullptr;
-
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        FT->getParamType(2) != DL->getIntPtrType(*Context))
-      return nullptr;
-
-    // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
-    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                   CI->getArgOperand(2), 1);
-    return CI->getArgOperand(0);
-  }
-};
-
-struct MemMoveOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // These optimizations require DataLayout.
-    if (!DL) return nullptr;
-
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        FT->getParamType(2) != DL->getIntPtrType(*Context))
-      return nullptr;
-
-    // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
-    B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
-                    CI->getArgOperand(2), 1);
-    return CI->getArgOperand(0);
-  }
-};
-
-struct MemSetOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // These optimizations require DataLayout.
-    if (!DL) return nullptr;
-
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isIntegerTy() ||
-        FT->getParamType(2) != DL->getIntPtrType(FT->getParamType(0)))
-      return nullptr;
-
-    // memset(p, v, n) -> llvm.memset(p, v, n, 1)
-    Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
-    B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
-    return CI->getArgOperand(0);
-  }
-};
+  // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+  Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+  B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+  return CI->getArgOperand(0);
+}
 
 //===----------------------------------------------------------------------===//
 // Math Library Optimizations
@@ -1111,935 +1034,959 @@
 //===----------------------------------------------------------------------===//
 // Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
 
-struct UnaryDoubleFPOpt : public LibCallOptimization {
-  bool CheckRetType;
-  UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {}
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
-        !FT->getParamType(0)->isDoubleTy())
-      return nullptr;
+Value *LibCallSimplifier::optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
+                                                bool CheckRetType) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
+      !FT->getParamType(0)->isDoubleTy())
+    return nullptr;
 
-    if (CheckRetType) {
-      // Check if all the uses for function like 'sin' are converted to float.
-      for (User *U : CI->users()) {
-        FPTruncInst *Cast = dyn_cast<FPTruncInst>(U);
-        if (!Cast || !Cast->getType()->isFloatTy())
-          return nullptr;
-      }
+  if (CheckRetType) {
+    // Check if all the uses for function like 'sin' are converted to float.
+    for (User *U : CI->users()) {
+      FPTruncInst *Cast = dyn_cast<FPTruncInst>(U);
+      if (!Cast || !Cast->getType()->isFloatTy())
+        return nullptr;
     }
-
-    // If this is something like 'floor((double)floatval)', convert to floorf.
-    FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
-    if (!Cast || !Cast->getOperand(0)->getType()->isFloatTy())
-      return nullptr;
-
-    // floor((double)floatval) -> (double)floorf(floatval)
-    Value *V = Cast->getOperand(0);
-    V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
-    return B.CreateFPExt(V, B.getDoubleTy());
   }
-};
+
+  // If this is something like 'floor((double)floatval)', convert to floorf.
+  FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+  if (!Cast || !Cast->getOperand(0)->getType()->isFloatTy())
+    return nullptr;
+
+  // floor((double)floatval) -> (double)floorf(floatval)
+  Value *V = Cast->getOperand(0);
+  if (Callee->isIntrinsic()) {
+    Module *M = CI->getParent()->getParent()->getParent();
+    Intrinsic::ID IID = (Intrinsic::ID) Callee->getIntrinsicID();
+    Function *F = Intrinsic::getDeclaration(M, IID, B.getFloatTy());
+    V = B.CreateCall(F, V);
+  } else {
+    // The call is a library call rather than an intrinsic.
+    V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+  }
+
+  return B.CreateFPExt(V, B.getDoubleTy());
+}
 
 // Double -> Float Shrinking Optimizations for Binary Functions like 'fmin/fmax'
-struct BinaryDoubleFPOpt : public LibCallOptimization {
-  bool CheckRetType;
-  BinaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {}
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    // Just make sure this has 2 arguments of the same FP type, which match the
-    // result type.
-    if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        !FT->getParamType(0)->isFloatingPointTy())
-      return nullptr;
-
-    if (CheckRetType) {
-      // Check if all the uses for function like 'fmin/fmax' are converted to
-      // float.
-      for (User *U : CI->users()) {
-        FPTruncInst *Cast = dyn_cast<FPTruncInst>(U);
-        if (!Cast || !Cast->getType()->isFloatTy())
-          return nullptr;
-      }
-    }
-
-    // If this is something like 'fmin((double)floatval1, (double)floatval2)',
-    // we convert it to fminf.
-    FPExtInst *Cast1 = dyn_cast<FPExtInst>(CI->getArgOperand(0));
-    FPExtInst *Cast2 = dyn_cast<FPExtInst>(CI->getArgOperand(1));
-    if (!Cast1 || !Cast1->getOperand(0)->getType()->isFloatTy() ||
-        !Cast2 || !Cast2->getOperand(0)->getType()->isFloatTy())
-      return nullptr;
-
-    // fmin((double)floatval1, (double)floatval2)
-    //                      -> (double)fmin(floatval1, floatval2)
-    Value *V = nullptr;
-    Value *V1 = Cast1->getOperand(0);
-    Value *V2 = Cast2->getOperand(0);
-    V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B,
-                              Callee->getAttributes());
-    return B.CreateFPExt(V, B.getDoubleTy());
-  }
-};
-
-struct UnsafeFPLibCallOptimization : public LibCallOptimization {
-  bool UnsafeFPShrink;
-  UnsafeFPLibCallOptimization(bool UnsafeFPShrink) {
-    this->UnsafeFPShrink = UnsafeFPShrink;
-  }
-};
-
-struct CosOpt : public UnsafeFPLibCallOptimization {
-  CosOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    Value *Ret = nullptr;
-    if (UnsafeFPShrink && Callee->getName() == "cos" &&
-        TLI->has(LibFunc::cosf)) {
-      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
-      Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
-    }
-
-    FunctionType *FT = Callee->getFunctionType();
-    // Just make sure this has 1 argument of FP type, which matches the
-    // result type.
-    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isFloatingPointTy())
-      return Ret;
-
-    // cos(-x) -> cos(x)
-    Value *Op1 = CI->getArgOperand(0);
-    if (BinaryOperator::isFNeg(Op1)) {
-      BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
-      return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
-    }
-    return Ret;
-  }
-};
-
-struct PowOpt : public UnsafeFPLibCallOptimization {
-  PowOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    Value *Ret = nullptr;
-    if (UnsafeFPShrink && Callee->getName() == "pow" &&
-        TLI->has(LibFunc::powf)) {
-      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
-      Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
-    }
-
-    FunctionType *FT = Callee->getFunctionType();
-    // Just make sure this has 2 arguments of the same FP type, which match the
-    // result type.
-    if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
-        FT->getParamType(0) != FT->getParamType(1) ||
-        !FT->getParamType(0)->isFloatingPointTy())
-      return Ret;
-
-    Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
-    if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
-      // pow(1.0, x) -> 1.0
-      if (Op1C->isExactlyValue(1.0))
-        return Op1C;
-      // pow(2.0, x) -> exp2(x)
-      if (Op1C->isExactlyValue(2.0) &&
-          hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f,
-                          LibFunc::exp2l))
-        return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes());
-      // pow(10.0, x) -> exp10(x)
-      if (Op1C->isExactlyValue(10.0) &&
-          hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f,
-                          LibFunc::exp10l))
-        return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B,
-                                    Callee->getAttributes());
-    }
-
-    ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
-    if (!Op2C) return Ret;
-
-    if (Op2C->getValueAPF().isZero())  // pow(x, 0.0) -> 1.0
-      return ConstantFP::get(CI->getType(), 1.0);
-
-    if (Op2C->isExactlyValue(0.5) &&
-        hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
-                        LibFunc::sqrtl) &&
-        hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf,
-                        LibFunc::fabsl)) {
-      // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
-      // This is faster than calling pow, and still handles negative zero
-      // and negative infinity correctly.
-      // TODO: In fast-math mode, this could be just sqrt(x).
-      // TODO: In finite-only mode, this could be just fabs(sqrt(x)).
-      Value *Inf = ConstantFP::getInfinity(CI->getType());
-      Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
-      Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B,
-                                         Callee->getAttributes());
-      Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B,
-                                         Callee->getAttributes());
-      Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
-      Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
-      return Sel;
-    }
-
-    if (Op2C->isExactlyValue(1.0))  // pow(x, 1.0) -> x
-      return Op1;
-    if (Op2C->isExactlyValue(2.0))  // pow(x, 2.0) -> x*x
-      return B.CreateFMul(Op1, Op1, "pow2");
-    if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
-      return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0),
-                          Op1, "powrecip");
+Value *LibCallSimplifier::optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  // Just make sure this has 2 arguments of the same FP type, which match the
+  // result type.
+  if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      !FT->getParamType(0)->isFloatingPointTy())
     return nullptr;
-  }
-};
 
-struct Exp2Opt : public UnsafeFPLibCallOptimization {
-  Exp2Opt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    Value *Ret = nullptr;
-    if (UnsafeFPShrink && Callee->getName() == "exp2" &&
-        TLI->has(LibFunc::exp2f)) {
-      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
-      Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
-    }
-
-    FunctionType *FT = Callee->getFunctionType();
-    // Just make sure this has 1 argument of FP type, which matches the
-    // result type.
-    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isFloatingPointTy())
-      return Ret;
-
-    Value *Op = CI->getArgOperand(0);
-    // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x))  if sizeof(x) <= 32
-    // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x))  if sizeof(x) < 32
-    LibFunc::Func LdExp = LibFunc::ldexpl;
-    if (Op->getType()->isFloatTy())
-      LdExp = LibFunc::ldexpf;
-    else if (Op->getType()->isDoubleTy())
-      LdExp = LibFunc::ldexp;
-
-    if (TLI->has(LdExp)) {
-      Value *LdExpArg = nullptr;
-      if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
-        if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
-          LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty());
-      } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
-        if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
-          LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty());
-      }
-
-      if (LdExpArg) {
-        Constant *One = ConstantFP::get(*Context, APFloat(1.0f));
-        if (!Op->getType()->isFloatTy())
-          One = ConstantExpr::getFPExtend(One, Op->getType());
-
-        Module *M = Caller->getParent();
-        Value *Callee =
-            M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(),
-                                   Op->getType(), B.getInt32Ty(), NULL);
-        CallInst *CI = B.CreateCall2(Callee, One, LdExpArg);
-        if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
-          CI->setCallingConv(F->getCallingConv());
-
-        return CI;
-      }
-    }
-    return Ret;
-  }
-};
-
-struct SinCosPiOpt : public LibCallOptimization {
-  SinCosPiOpt() {}
-
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Make sure the prototype is as expected, otherwise the rest of the
-    // function is probably invalid and likely to abort.
-    if (!isTrigLibCall(CI))
-      return nullptr;
-
-    Value *Arg = CI->getArgOperand(0);
-    SmallVector<CallInst *, 1> SinCalls;
-    SmallVector<CallInst *, 1> CosCalls;
-    SmallVector<CallInst *, 1> SinCosCalls;
-
-    bool IsFloat = Arg->getType()->isFloatTy();
-
-    // Look for all compatible sinpi, cospi and sincospi calls with the same
-    // argument. If there are enough (in some sense) we can make the
-    // substitution.
-    for (User *U : Arg->users())
-      classifyArgUse(U, CI->getParent(), IsFloat, SinCalls, CosCalls,
-                     SinCosCalls);
-
-    // It's only worthwhile if both sinpi and cospi are actually used.
-    if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty()))
-      return nullptr;
-
-    Value *Sin, *Cos, *SinCos;
-    insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos,
-                     SinCos);
-
-    replaceTrigInsts(SinCalls, Sin);
-    replaceTrigInsts(CosCalls, Cos);
-    replaceTrigInsts(SinCosCalls, SinCos);
-
+  // If this is something like 'fmin((double)floatval1, (double)floatval2)',
+  // we convert it to fminf.
+  FPExtInst *Cast1 = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+  FPExtInst *Cast2 = dyn_cast<FPExtInst>(CI->getArgOperand(1));
+  if (!Cast1 || !Cast1->getOperand(0)->getType()->isFloatTy() || !Cast2 ||
+      !Cast2->getOperand(0)->getType()->isFloatTy())
     return nullptr;
+
+  // fmin((double)floatval1, (double)floatval2)
+  //                      -> (double)fmin(floatval1, floatval2)
+  Value *V = nullptr;
+  Value *V1 = Cast1->getOperand(0);
+  Value *V2 = Cast2->getOperand(0);
+  // TODO: Handle intrinsics in the same way as in optimizeUnaryDoubleFP().
+  V = EmitBinaryFloatFnCall(V1, V2, Callee->getName(), B,
+                            Callee->getAttributes());
+  return B.CreateFPExt(V, B.getDoubleTy());
+}
+
+Value *LibCallSimplifier::optimizeCos(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  Value *Ret = nullptr;
+  if (UnsafeFPShrink && Callee->getName() == "cos" && TLI->has(LibFunc::cosf)) {
+    Ret = optimizeUnaryDoubleFP(CI, B, true);
   }
 
-  bool isTrigLibCall(CallInst *CI) {
-    Function *Callee = CI->getCalledFunction();
-    FunctionType *FT = Callee->getFunctionType();
+  FunctionType *FT = Callee->getFunctionType();
+  // Just make sure this has 1 argument of FP type, which matches the
+  // result type.
+  if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isFloatingPointTy())
+    return Ret;
 
-    // We can only hope to do anything useful if we can ignore things like errno
-    // and floating-point exceptions.
-    bool AttributesSafe = CI->hasFnAttr(Attribute::NoUnwind) &&
-                          CI->hasFnAttr(Attribute::ReadNone);
+  // cos(-x) -> cos(x)
+  Value *Op1 = CI->getArgOperand(0);
+  if (BinaryOperator::isFNeg(Op1)) {
+    BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
+    return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
+  }
+  return Ret;
+}
 
-    // Other than that we need float(float) or double(double)
-    return AttributesSafe && FT->getNumParams() == 1 &&
-           FT->getReturnType() == FT->getParamType(0) &&
-           (FT->getParamType(0)->isFloatTy() ||
-            FT->getParamType(0)->isDoubleTy());
+Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+
+  Value *Ret = nullptr;
+  if (UnsafeFPShrink && Callee->getName() == "pow" && TLI->has(LibFunc::powf)) {
+    Ret = optimizeUnaryDoubleFP(CI, B, true);
   }
 
-  void classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat,
-                      SmallVectorImpl<CallInst *> &SinCalls,
-                      SmallVectorImpl<CallInst *> &CosCalls,
-                      SmallVectorImpl<CallInst *> &SinCosCalls) {
-    CallInst *CI = dyn_cast<CallInst>(Val);
+  FunctionType *FT = Callee->getFunctionType();
+  // Just make sure this has 2 arguments of the same FP type, which match the
+  // result type.
+  if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+      FT->getParamType(0) != FT->getParamType(1) ||
+      !FT->getParamType(0)->isFloatingPointTy())
+    return Ret;
 
-    if (!CI)
-      return;
-
-    Function *Callee = CI->getCalledFunction();
-    StringRef FuncName = Callee->getName();
-    LibFunc::Func Func;
-    if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) ||
-        !isTrigLibCall(CI))
-      return;
-
-    if (IsFloat) {
-      if (Func == LibFunc::sinpif)
-        SinCalls.push_back(CI);
-      else if (Func == LibFunc::cospif)
-        CosCalls.push_back(CI);
-      else if (Func == LibFunc::sincospif_stret)
-        SinCosCalls.push_back(CI);
-    } else {
-      if (Func == LibFunc::sinpi)
-        SinCalls.push_back(CI);
-      else if (Func == LibFunc::cospi)
-        CosCalls.push_back(CI);
-      else if (Func == LibFunc::sincospi_stret)
-        SinCosCalls.push_back(CI);
-    }
+  Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
+  if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+    // pow(1.0, x) -> 1.0
+    if (Op1C->isExactlyValue(1.0))
+      return Op1C;
+    // pow(2.0, x) -> exp2(x)
+    if (Op1C->isExactlyValue(2.0) &&
+        hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f,
+                        LibFunc::exp2l))
+      return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes());
+    // pow(10.0, x) -> exp10(x)
+    if (Op1C->isExactlyValue(10.0) &&
+        hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f,
+                        LibFunc::exp10l))
+      return EmitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B,
+                                  Callee->getAttributes());
   }
 
-  void replaceTrigInsts(SmallVectorImpl<CallInst*> &Calls, Value *Res) {
-    for (SmallVectorImpl<CallInst*>::iterator I = Calls.begin(),
-           E = Calls.end();
-         I != E; ++I) {
-      LCS->replaceAllUsesWith(*I, Res);
-    }
+  ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
+  if (!Op2C)
+    return Ret;
+
+  if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
+    return ConstantFP::get(CI->getType(), 1.0);
+
+  if (Op2C->isExactlyValue(0.5) &&
+      hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
+                      LibFunc::sqrtl) &&
+      hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf,
+                      LibFunc::fabsl)) {
+    // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
+    // This is faster than calling pow, and still handles negative zero
+    // and negative infinity correctly.
+    // TODO: In fast-math mode, this could be just sqrt(x).
+    // TODO: In finite-only mode, this could be just fabs(sqrt(x)).
+    Value *Inf = ConstantFP::getInfinity(CI->getType());
+    Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
+    Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B, Callee->getAttributes());
+    Value *FAbs =
+        EmitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes());
+    Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
+    Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
+    return Sel;
   }
 
-  void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
-                        bool UseFloat, Value *&Sin, Value *&Cos,
-                        Value *&SinCos) {
-    Type *ArgTy = Arg->getType();
-    Type *ResTy;
-    StringRef Name;
+  if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x
+    return Op1;
+  if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x
+    return B.CreateFMul(Op1, Op1, "pow2");
+  if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
+    return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Op1, "powrecip");
+  return nullptr;
+}
 
-    Triple T(OrigCallee->getParent()->getTargetTriple());
-    if (UseFloat) {
-      Name = "__sincospif_stret";
+Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  Function *Caller = CI->getParent()->getParent();
 
-      assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
-      // x86_64 can't use {float, float} since that would be returned in both
-      // xmm0 and xmm1, which isn't what a real struct would do.
-      ResTy = T.getArch() == Triple::x86_64
-                  ? static_cast<Type *>(VectorType::get(ArgTy, 2))
-                  : static_cast<Type *>(StructType::get(ArgTy, ArgTy, NULL));
-    } else {
-      Name = "__sincospi_stret";
-      ResTy = StructType::get(ArgTy, ArgTy, NULL);
-    }
-
-    Module *M = OrigCallee->getParent();
-    Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
-                                           ResTy, ArgTy, NULL);
-
-    if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
-      // If the argument is an instruction, it must dominate all uses so put our
-      // sincos call there.
-      BasicBlock::iterator Loc = ArgInst;
-      B.SetInsertPoint(ArgInst->getParent(), ++Loc);
-    } else {
-      // Otherwise (e.g. for a constant) the beginning of the function is as
-      // good a place as any.
-      BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
-      B.SetInsertPoint(&EntryBB, EntryBB.begin());
-    }
-
-    SinCos = B.CreateCall(Callee, Arg, "sincospi");
-
-    if (SinCos->getType()->isStructTy()) {
-      Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
-      Cos = B.CreateExtractValue(SinCos, 1, "cospi");
-    } else {
-      Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
-                                   "sinpi");
-      Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
-                                   "cospi");
-    }
+  Value *Ret = nullptr;
+  if (UnsafeFPShrink && Callee->getName() == "exp2" &&
+      TLI->has(LibFunc::exp2f)) {
+    Ret = optimizeUnaryDoubleFP(CI, B, true);
   }
 
-};
+  FunctionType *FT = Callee->getFunctionType();
+  // Just make sure this has 1 argument of FP type, which matches the
+  // result type.
+  if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isFloatingPointTy())
+    return Ret;
+
+  Value *Op = CI->getArgOperand(0);
+  // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x))  if sizeof(x) <= 32
+  // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x))  if sizeof(x) < 32
+  LibFunc::Func LdExp = LibFunc::ldexpl;
+  if (Op->getType()->isFloatTy())
+    LdExp = LibFunc::ldexpf;
+  else if (Op->getType()->isDoubleTy())
+    LdExp = LibFunc::ldexp;
+
+  if (TLI->has(LdExp)) {
+    Value *LdExpArg = nullptr;
+    if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
+        LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty());
+    } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
+        LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty());
+    }
+
+    if (LdExpArg) {
+      Constant *One = ConstantFP::get(CI->getContext(), APFloat(1.0f));
+      if (!Op->getType()->isFloatTy())
+        One = ConstantExpr::getFPExtend(One, Op->getType());
+
+      Module *M = Caller->getParent();
+      Value *Callee =
+          M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(),
+                                 Op->getType(), B.getInt32Ty(), nullptr);
+      CallInst *CI = B.CreateCall2(Callee, One, LdExpArg);
+      if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
+        CI->setCallingConv(F->getCallingConv());
+
+      return CI;
+    }
+  }
+  return Ret;
+}
+
+Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+
+  Value *Ret = nullptr;
+  if (Callee->getName() == "fabs" && TLI->has(LibFunc::fabsf)) {
+    Ret = optimizeUnaryDoubleFP(CI, B, false);
+  }
+
+  FunctionType *FT = Callee->getFunctionType();
+  // Make sure this has 1 argument of FP type which matches the result type.
+  if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isFloatingPointTy())
+    return Ret;
+
+  Value *Op = CI->getArgOperand(0);
+  if (Instruction *I = dyn_cast<Instruction>(Op)) {
+    // Fold fabs(x * x) -> x * x; a squared FP value is never negative.
+    if (I->getOpcode() == Instruction::FMul)
+      if (I->getOperand(0) == I->getOperand(1))
+        return Op;
+  }
+  return Ret;
+}
+
+Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  
+  Value *Ret = nullptr;
+  if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" ||
+                                   Callee->getIntrinsicID() == Intrinsic::sqrt))
+    Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+  // FIXME: For finer-grain optimization, we need intrinsics to have the same
+  // fast-math flag decorations that are applied to FP instructions. For now,
+  // we have to rely on the function-level unsafe-fp-math attribute to do this
+  // optimization because there's no other way to express that the sqrt can be
+  // reassociated.
+  Function *F = CI->getParent()->getParent();
+  if (F->hasFnAttribute("unsafe-fp-math")) {
+    // Check for unsafe-fp-math = true.
+    Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+    if (Attr.getValueAsString() != "true")
+      return Ret;
+  }
+  Value *Op = CI->getArgOperand(0);
+  if (Instruction *I = dyn_cast<Instruction>(Op)) {
+    if (I->getOpcode() == Instruction::FMul && I->hasUnsafeAlgebra()) {
+      // We're looking for a repeated factor in a multiplication tree,
+      // so we can do this fold: sqrt(x * x) -> fabs(x);
+      // or this fold: sqrt(x * x * y) -> fabs(x) * sqrt(y).
+      Value *Op0 = I->getOperand(0);
+      Value *Op1 = I->getOperand(1);
+      Value *RepeatOp = nullptr;
+      Value *OtherOp = nullptr;
+      if (Op0 == Op1) {
+        // Simple match: the operands of the multiply are identical.
+        RepeatOp = Op0;
+      } else {
+        // Look for a more complicated pattern: one of the operands is itself
+        // a multiply, so search for a common factor in that multiply.
+        // Note: We don't bother looking any deeper than this first level or for
+        // variations of this pattern because instcombine's visitFMUL and/or the
+        // reassociation pass should give us this form.
+        Value *OtherMul0, *OtherMul1;
+        if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) {
+          // Pattern: sqrt((x * y) * z)
+          if (OtherMul0 == OtherMul1) {
+            // Matched: sqrt((x * x) * z)
+            RepeatOp = OtherMul0;
+            OtherOp = Op1;
+          }
+        }
+      }
+      if (RepeatOp) {
+        // Fast math flags for any created instructions should match the sqrt
+        // and multiply.
+        // FIXME: We're not checking the sqrt because it doesn't have
+        // fast-math-flags (see earlier comment).
+        IRBuilder<true, ConstantFolder,
+          IRBuilderDefaultInserter<true> >::FastMathFlagGuard Guard(B);
+        B.SetFastMathFlags(I->getFastMathFlags());
+        // If we found a repeated factor, hoist it out of the square root and
+        // replace it with the fabs of that factor.
+        Module *M = Callee->getParent();
+        Type *ArgType = Op->getType();
+        Value *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType);
+        Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs");
+        if (OtherOp) {
+          // If we found a non-repeated factor, we still need to get its square
+          // root. We then multiply that by the value that was simplified out
+          // of the square root calculation.
+          Value *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType);
+          Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt");
+          return B.CreateFMul(FabsCall, SqrtCall);
+        }
+        return FabsCall;
+      }
+    }
+  }
+  return Ret;
+}
+
+static bool isTrigLibCall(CallInst *CI);
+static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
+                             bool UseFloat, Value *&Sin, Value *&Cos,
+                             Value *&SinCos);
+
+Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) {
+
+  // Make sure the prototype is as expected, otherwise the rest of the
+  // function is probably invalid and likely to abort.
+  if (!isTrigLibCall(CI))
+    return nullptr;
+
+  Value *Arg = CI->getArgOperand(0);
+  SmallVector<CallInst *, 1> SinCalls;
+  SmallVector<CallInst *, 1> CosCalls;
+  SmallVector<CallInst *, 1> SinCosCalls;
+
+  bool IsFloat = Arg->getType()->isFloatTy();
+
+  // Look for all compatible sinpi, cospi and sincospi calls with the same
+  // argument. If there are enough (in some sense) we can make the
+  // substitution.
+  for (User *U : Arg->users())
+    classifyArgUse(U, CI->getParent(), IsFloat, SinCalls, CosCalls,
+                   SinCosCalls);
+
+  // Only worthwhile if both sinpi and cospi are used, or sincospi already is.
+  if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty()))
+    return nullptr;
+
+  Value *Sin, *Cos, *SinCos;
+  insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos);
+
+  replaceTrigInsts(SinCalls, Sin);
+  replaceTrigInsts(CosCalls, Cos);
+  replaceTrigInsts(SinCosCalls, SinCos);
+
+  return nullptr;
+}
+
+static bool isTrigLibCall(CallInst *CI) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+
+  // We can only hope to do anything useful if we can ignore things like errno
+  // and floating-point exceptions.
+  bool AttributesSafe =
+      CI->hasFnAttr(Attribute::NoUnwind) && CI->hasFnAttr(Attribute::ReadNone);
+
+  // Other than that we need float(float) or double(double)
+  return AttributesSafe && FT->getNumParams() == 1 &&
+         FT->getReturnType() == FT->getParamType(0) &&
+         (FT->getParamType(0)->isFloatTy() ||
+          FT->getParamType(0)->isDoubleTy());
+}
+
+void
+LibCallSimplifier::classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat,
+                                  SmallVectorImpl<CallInst *> &SinCalls,
+                                  SmallVectorImpl<CallInst *> &CosCalls,
+                                  SmallVectorImpl<CallInst *> &SinCosCalls) {
+  CallInst *CI = dyn_cast<CallInst>(Val);
+
+  if (!CI)
+    return;
+
+  Function *Callee = CI->getCalledFunction();
+  StringRef FuncName = Callee->getName();
+  LibFunc::Func Func;
+  if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) || !isTrigLibCall(CI))
+    return;
+
+  if (IsFloat) {
+    if (Func == LibFunc::sinpif)
+      SinCalls.push_back(CI);
+    else if (Func == LibFunc::cospif)
+      CosCalls.push_back(CI);
+    else if (Func == LibFunc::sincospif_stret)
+      SinCosCalls.push_back(CI);
+  } else {
+    if (Func == LibFunc::sinpi)
+      SinCalls.push_back(CI);
+    else if (Func == LibFunc::cospi)
+      CosCalls.push_back(CI);
+    else if (Func == LibFunc::sincospi_stret)
+      SinCosCalls.push_back(CI);
+  }
+}
+
+void LibCallSimplifier::replaceTrigInsts(SmallVectorImpl<CallInst *> &Calls,
+                                         Value *Res) {
+  for (SmallVectorImpl<CallInst *>::iterator I = Calls.begin(), E = Calls.end();
+       I != E; ++I) {
+    replaceAllUsesWith(*I, Res);
+  }
+}
+
+void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
+                      bool UseFloat, Value *&Sin, Value *&Cos, Value *&SinCos) {
+  Type *ArgTy = Arg->getType();
+  Type *ResTy;
+  StringRef Name;
+
+  Triple T(OrigCallee->getParent()->getTargetTriple());
+  if (UseFloat) {
+    Name = "__sincospif_stret";
+
+    assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
+    // x86_64 can't use {float, float} since that would be returned in both
+    // xmm0 and xmm1, which isn't what a real struct would do.
+    ResTy = T.getArch() == Triple::x86_64
+                ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+                : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr));
+  } else {
+    Name = "__sincospi_stret";
+    ResTy = StructType::get(ArgTy, ArgTy, nullptr);
+  }
+
+  Module *M = OrigCallee->getParent();
+  Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
+                                         ResTy, ArgTy, nullptr);
+
+  if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+    // If the argument is an instruction, it must dominate all uses so put our
+    // sincos call there.
+    BasicBlock::iterator Loc = ArgInst;
+    B.SetInsertPoint(ArgInst->getParent(), ++Loc);
+  } else {
+    // Otherwise (e.g. for a constant) the beginning of the function is as
+    // good a place as any.
+    BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
+    B.SetInsertPoint(&EntryBB, EntryBB.begin());
+  }
+
+  SinCos = B.CreateCall(Callee, Arg, "sincospi");
+
+  if (SinCos->getType()->isStructTy()) {
+    Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
+    Cos = B.CreateExtractValue(SinCos, 1, "cospi");
+  } else {
+    Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
+                                 "sinpi");
+    Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
+                                 "cospi");
+  }
+}
 
 //===----------------------------------------------------------------------===//
 // Integer Library Call Optimizations
 //===----------------------------------------------------------------------===//
 
-struct FFSOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    // Just make sure this has 2 arguments of the same FP type, which match the
-    // result type.
-    if (FT->getNumParams() != 1 ||
-        !FT->getReturnType()->isIntegerTy(32) ||
-        !FT->getParamType(0)->isIntegerTy())
-      return nullptr;
+Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  // Just make sure this has 1 argument of integer type and an i32 result
+  // type.
+  if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy(32) ||
+      !FT->getParamType(0)->isIntegerTy())
+    return nullptr;
 
-    Value *Op = CI->getArgOperand(0);
+  Value *Op = CI->getArgOperand(0);
 
-    // Constant fold.
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
-      if (CI->isZero()) // ffs(0) -> 0.
-        return B.getInt32(0);
-      // ffs(c) -> cttz(c)+1
-      return B.getInt32(CI->getValue().countTrailingZeros() + 1);
-    }
-
-    // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
-    Type *ArgType = Op->getType();
-    Value *F = Intrinsic::getDeclaration(Callee->getParent(),
-                                         Intrinsic::cttz, ArgType);
-    Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz");
-    V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
-    V = B.CreateIntCast(V, B.getInt32Ty(), false);
-
-    Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
-    return B.CreateSelect(Cond, V, B.getInt32(0));
+  // Constant fold.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+    if (CI->isZero()) // ffs(0) -> 0.
+      return B.getInt32(0);
+    // ffs(c) -> cttz(c)+1
+    return B.getInt32(CI->getValue().countTrailingZeros() + 1);
   }
-};
 
-struct AbsOpt : public LibCallOptimization {
-  bool ignoreCallingConv() override { return true; }
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    // We require integer(integer) where the types agree.
-    if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
-        FT->getParamType(0) != FT->getReturnType())
-      return nullptr;
+  // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+  Type *ArgType = Op->getType();
+  Value *F =
+      Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType);
+  Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz");
+  V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
+  V = B.CreateIntCast(V, B.getInt32Ty(), false);
 
-    // abs(x) -> x >s -1 ? x : -x
-    Value *Op = CI->getArgOperand(0);
-    Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()),
-                                 "ispos");
-    Value *Neg = B.CreateNeg(Op, "neg");
-    return B.CreateSelect(Pos, Op, Neg);
-  }
-};
+  Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
+  return B.CreateSelect(Cond, V, B.getInt32(0));
+}
 
-struct IsDigitOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    // We require integer(i32)
-    if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
-        !FT->getParamType(0)->isIntegerTy(32))
-      return nullptr;
+Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  // We require integer(integer) where the types agree.
+  if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+      FT->getParamType(0) != FT->getReturnType())
+    return nullptr;
 
-    // isdigit(c) -> (c-'0') <u 10
-    Value *Op = CI->getArgOperand(0);
-    Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
-    Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
-    return B.CreateZExt(Op, CI->getType());
-  }
-};
+  // abs(x) -> x >s -1 ? x : -x
+  Value *Op = CI->getArgOperand(0);
+  Value *Pos =
+      B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), "ispos");
+  Value *Neg = B.CreateNeg(Op, "neg");
+  return B.CreateSelect(Pos, Op, Neg);
+}
 
-struct IsAsciiOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    // We require integer(i32)
-    if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
-        !FT->getParamType(0)->isIntegerTy(32))
-      return nullptr;
+Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  // We require integer(i32)
+  if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+      !FT->getParamType(0)->isIntegerTy(32))
+    return nullptr;
 
-    // isascii(c) -> c <u 128
-    Value *Op = CI->getArgOperand(0);
-    Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
-    return B.CreateZExt(Op, CI->getType());
-  }
-};
+  // isdigit(c) -> (c-'0') <u 10
+  Value *Op = CI->getArgOperand(0);
+  Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
+  Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
+  return B.CreateZExt(Op, CI->getType());
+}
 
-struct ToAsciiOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    FunctionType *FT = Callee->getFunctionType();
-    // We require i32(i32)
-    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isIntegerTy(32))
-      return nullptr;
+Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  // We require integer(i32)
+  if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+      !FT->getParamType(0)->isIntegerTy(32))
+    return nullptr;
 
-    // toascii(c) -> c & 0x7f
-    return B.CreateAnd(CI->getArgOperand(0),
-                       ConstantInt::get(CI->getType(),0x7F));
-  }
-};
+  // isascii(c) -> c <u 128
+  Value *Op = CI->getArgOperand(0);
+  Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
+  return B.CreateZExt(Op, CI->getType());
+}
+
+Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  FunctionType *FT = Callee->getFunctionType();
+  // We require i32(i32)
+  if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+      !FT->getParamType(0)->isIntegerTy(32))
+    return nullptr;
+
+  // toascii(c) -> c & 0x7f
+  return B.CreateAnd(CI->getArgOperand(0),
+                     ConstantInt::get(CI->getType(), 0x7F));
+}
 
 //===----------------------------------------------------------------------===//
 // Formatting and IO Library Call Optimizations
 //===----------------------------------------------------------------------===//
 
-struct ErrorReportingOpt : public LibCallOptimization {
-  ErrorReportingOpt(int S = -1) : StreamArg(S) {}
+static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg);
 
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &) override {
-    // Error reporting calls should be cold, mark them as such.
-    // This applies even to non-builtin calls: it is only a hint and applies to
-    // functions that the frontend might not understand as builtins.
+Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
+                                                 int StreamArg) {
+  // Error reporting calls should be cold, mark them as such.
+  // This applies even to non-builtin calls: it is only a hint and applies to
+  // functions that the frontend might not understand as builtins.
 
-    // This heuristic was suggested in:
-    // Improving Static Branch Prediction in a Compiler
-    // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
-    // Proceedings of PACT'98, Oct. 1998, IEEE
+  // This heuristic was suggested in:
+  // Improving Static Branch Prediction in a Compiler
+  // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
+  // Proceedings of PACT'98, Oct. 1998, IEEE
+  Function *Callee = CI->getCalledFunction();
 
-    if (!CI->hasFnAttr(Attribute::Cold) && isReportingError(Callee, CI)) {
-      CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold);
-    }
+  if (!CI->hasFnAttr(Attribute::Cold) &&
+      isReportingError(Callee, CI, StreamArg)) {
+    CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold);
+  }
 
+  return nullptr;
+}
+
+static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) {
+  if (!ColdErrorCalls)
+    return false;
+
+  if (!Callee || !Callee->isDeclaration())
+    return false;
+
+  if (StreamArg < 0)
+    return true;
+
+  // These functions might be considered cold, but only if their stream
+  // argument is stderr.
+
+  if (StreamArg >= (int)CI->getNumArgOperands())
+    return false;
+  LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
+  if (!LI)
+    return false;
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+  if (!GV || !GV->isDeclaration())
+    return false;
+  return GV->getName() == "stderr";
+}
+
+Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
+  // Check for a fixed format string.
+  StringRef FormatStr;
+  if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
     return nullptr;
-  }
 
-protected:
-  bool isReportingError(Function *Callee, CallInst *CI) {
-    if (!ColdErrorCalls)
-      return false;
- 
-    if (!Callee || !Callee->isDeclaration())
-      return false;
+  // Empty format string -> noop.
+  if (FormatStr.empty()) // Tolerate printf's declared void.
+    return CI->use_empty() ? (Value *)CI : ConstantInt::get(CI->getType(), 0);
 
-    if (StreamArg < 0)
-      return true;
-
-    // These functions might be considered cold, but only if their stream
-    // argument is stderr.
-
-    if (StreamArg >= (int) CI->getNumArgOperands())
-      return false;
-    LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
-    if (!LI)
-      return false;
-    GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
-    if (!GV || !GV->isDeclaration())
-      return false;
-    return GV->getName() == "stderr";
-  }
-
-  int StreamArg;
-};
-
-struct PrintFOpt : public LibCallOptimization {
-  Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
-                                   IRBuilder<> &B) {
-    // Check for a fixed format string.
-    StringRef FormatStr;
-    if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
-      return nullptr;
-
-    // Empty format string -> noop.
-    if (FormatStr.empty())  // Tolerate printf's declared void.
-      return CI->use_empty() ? (Value*)CI :
-                               ConstantInt::get(CI->getType(), 0);
-
-    // Do not do any of the following transformations if the printf return value
-    // is used, in general the printf return value is not compatible with either
-    // putchar() or puts().
-    if (!CI->use_empty())
-      return nullptr;
-
-    // printf("x") -> putchar('x'), even for '%'.
-    if (FormatStr.size() == 1) {
-      Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, DL, TLI);
-      if (CI->use_empty() || !Res) return Res;
-      return B.CreateIntCast(Res, CI->getType(), true);
-    }
-
-    // printf("foo\n") --> puts("foo")
-    if (FormatStr[FormatStr.size()-1] == '\n' &&
-        FormatStr.find('%') == StringRef::npos) { // No format characters.
-      // Create a string literal with no \n on it.  We expect the constant merge
-      // pass to be run after this pass, to merge duplicate strings.
-      FormatStr = FormatStr.drop_back();
-      Value *GV = B.CreateGlobalString(FormatStr, "str");
-      Value *NewCI = EmitPutS(GV, B, DL, TLI);
-      return (CI->use_empty() || !NewCI) ?
-              NewCI :
-              ConstantInt::get(CI->getType(), FormatStr.size()+1);
-    }
-
-    // Optimize specific format strings.
-    // printf("%c", chr) --> putchar(chr)
-    if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
-        CI->getArgOperand(1)->getType()->isIntegerTy()) {
-      Value *Res = EmitPutChar(CI->getArgOperand(1), B, DL, TLI);
-
-      if (CI->use_empty() || !Res) return Res;
-      return B.CreateIntCast(Res, CI->getType(), true);
-    }
-
-    // printf("%s\n", str) --> puts(str)
-    if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
-        CI->getArgOperand(1)->getType()->isPointerTy()) {
-      return EmitPutS(CI->getArgOperand(1), B, DL, TLI);
-    }
+  // Do not do any of the following transformations if the printf return value
+  // is used, in general the printf return value is not compatible with either
+  // putchar() or puts().
+  if (!CI->use_empty())
     return nullptr;
+
+  // printf("x") -> putchar('x'), even for '%'.
+  if (FormatStr.size() == 1) {
+    Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, DL, TLI);
+    if (CI->use_empty() || !Res)
+      return Res;
+    return B.CreateIntCast(Res, CI->getType(), true);
   }
 
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Require one fixed pointer argument and an integer/void result.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
-        !(FT->getReturnType()->isIntegerTy() ||
-          FT->getReturnType()->isVoidTy()))
-      return nullptr;
+  // printf("foo\n") --> puts("foo")
+  if (FormatStr[FormatStr.size() - 1] == '\n' &&
+      FormatStr.find('%') == StringRef::npos) { // No format characters.
+    // Create a string literal with no \n on it.  We expect the constant merge
+    // pass to be run after this pass, to merge duplicate strings.
+    FormatStr = FormatStr.drop_back();
+    Value *GV = B.CreateGlobalString(FormatStr, "str");
+    Value *NewCI = EmitPutS(GV, B, DL, TLI);
+    return (CI->use_empty() || !NewCI)
+               ? NewCI
+               : ConstantInt::get(CI->getType(), FormatStr.size() + 1);
+  }
 
-    if (Value *V = optimizeFixedFormatString(Callee, CI, B)) {
-      return V;
-    }
+  // Optimize specific format strings.
+  // printf("%c", chr) --> putchar(chr)
+  if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
+      CI->getArgOperand(1)->getType()->isIntegerTy()) {
+    Value *Res = EmitPutChar(CI->getArgOperand(1), B, DL, TLI);
 
-    // printf(format, ...) -> iprintf(format, ...) if no floating point
-    // arguments.
-    if (TLI->has(LibFunc::iprintf) && !callHasFloatingPointArgument(CI)) {
-      Module *M = B.GetInsertBlock()->getParent()->getParent();
-      Constant *IPrintFFn =
+    if (CI->use_empty() || !Res)
+      return Res;
+    return B.CreateIntCast(Res, CI->getType(), true);
+  }
+
+  // printf("%s\n", str) --> puts(str)
+  if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
+      CI->getArgOperand(1)->getType()->isPointerTy()) {
+    return EmitPutS(CI->getArgOperand(1), B, DL, TLI);
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) {
+
+  Function *Callee = CI->getCalledFunction();
+  // Require one fixed pointer argument and an integer/void result.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
+      !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy()))
+    return nullptr;
+
+  if (Value *V = optimizePrintFString(CI, B)) {
+    return V;
+  }
+
+  // printf(format, ...) -> iprintf(format, ...) if no floating point
+  // arguments.
+  if (TLI->has(LibFunc::iprintf) && !callHasFloatingPointArgument(CI)) {
+    Module *M = B.GetInsertBlock()->getParent()->getParent();
+    Constant *IPrintFFn =
         M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
-      CallInst *New = cast<CallInst>(CI->clone());
-      New->setCalledFunction(IPrintFFn);
-      B.Insert(New);
-      return New;
-    }
-    return nullptr;
+    CallInst *New = cast<CallInst>(CI->clone());
+    New->setCalledFunction(IPrintFFn);
+    B.Insert(New);
+    return New;
   }
-};
+  return nullptr;
+}
 
-struct SPrintFOpt : public LibCallOptimization {
-  Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
-                                   IRBuilder<> &B) {
-    // Check for a fixed format string.
-    StringRef FormatStr;
-    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
-      return nullptr;
-
-    // If we just have a format string (nothing else crazy) transform it.
-    if (CI->getNumArgOperands() == 2) {
-      // Make sure there's no % in the constant array.  We could try to handle
-      // %% -> % in the future if we cared.
-      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
-        if (FormatStr[i] == '%')
-          return nullptr; // we found a format specifier, bail out.
-
-      // These optimizations require DataLayout.
-      if (!DL) return nullptr;
-
-      // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
-      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                     ConstantInt::get(DL->getIntPtrType(*Context), // Copy the
-                                      FormatStr.size() + 1), 1);   // nul byte.
-      return ConstantInt::get(CI->getType(), FormatStr.size());
-    }
-
-    // The remaining optimizations require the format string to be "%s" or "%c"
-    // and have an extra operand.
-    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
-        CI->getNumArgOperands() < 3)
-      return nullptr;
-
-    // Decode the second character of the format string.
-    if (FormatStr[1] == 'c') {
-      // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
-      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr;
-      Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
-      Value *Ptr = CastToCStr(CI->getArgOperand(0), B);
-      B.CreateStore(V, Ptr);
-      Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul");
-      B.CreateStore(B.getInt8(0), Ptr);
-
-      return ConstantInt::get(CI->getType(), 1);
-    }
-
-    if (FormatStr[1] == 's') {
-      // These optimizations require DataLayout.
-      if (!DL) return nullptr;
-
-      // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
-      if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr;
-
-      Value *Len = EmitStrLen(CI->getArgOperand(2), B, DL, TLI);
-      if (!Len)
-        return nullptr;
-      Value *IncLen = B.CreateAdd(Len,
-                                  ConstantInt::get(Len->getType(), 1),
-                                  "leninc");
-      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
-
-      // The sprintf result is the unincremented number of bytes in the string.
-      return B.CreateIntCast(Len, CI->getType(), false);
-    }
+Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
+  // Check for a fixed format string.
+  StringRef FormatStr;
+  if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
     return nullptr;
-  }
 
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Require two fixed pointer arguments and an integer result.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        !FT->getReturnType()->isIntegerTy())
-      return nullptr;
-
-    if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
-      return V;
-    }
-
-    // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
-    // point arguments.
-    if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) {
-      Module *M = B.GetInsertBlock()->getParent()->getParent();
-      Constant *SIPrintFFn =
-        M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
-      CallInst *New = cast<CallInst>(CI->clone());
-      New->setCalledFunction(SIPrintFFn);
-      B.Insert(New);
-      return New;
-    }
-    return nullptr;
-  }
-};
-
-struct FPrintFOpt : public LibCallOptimization {
-  Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
-                                   IRBuilder<> &B) {
-    ErrorReportingOpt ER(/* StreamArg = */ 0);
-    (void) ER.callOptimizer(Callee, CI, B);
-
-    // All the optimizations depend on the format string.
-    StringRef FormatStr;
-    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
-      return nullptr;
-
-    // Do not do any of the following transformations if the fprintf return
-    // value is used, in general the fprintf return value is not compatible
-    // with fwrite(), fputc() or fputs().
-    if (!CI->use_empty())
-      return nullptr;
-
-    // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
-    if (CI->getNumArgOperands() == 2) {
-      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
-        if (FormatStr[i] == '%')  // Could handle %% -> % if we cared.
-          return nullptr; // We found a format specifier.
-
-      // These optimizations require DataLayout.
-      if (!DL) return nullptr;
-
-      return EmitFWrite(CI->getArgOperand(1),
-                        ConstantInt::get(DL->getIntPtrType(*Context),
-                                         FormatStr.size()),
-                        CI->getArgOperand(0), B, DL, TLI);
-    }
-
-    // The remaining optimizations require the format string to be "%s" or "%c"
-    // and have an extra operand.
-    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
-        CI->getNumArgOperands() < 3)
-      return nullptr;
-
-    // Decode the second character of the format string.
-    if (FormatStr[1] == 'c') {
-      // fprintf(F, "%c", chr) --> fputc(chr, F)
-      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr;
-      return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI);
-    }
-
-    if (FormatStr[1] == 's') {
-      // fprintf(F, "%s", str) --> fputs(str, F)
-      if (!CI->getArgOperand(2)->getType()->isPointerTy())
-        return nullptr;
-      return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI);
-    }
-    return nullptr;
-  }
-
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Require two fixed paramters as pointers and integer result.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        !FT->getReturnType()->isIntegerTy())
-      return nullptr;
-
-    if (Value *V = optimizeFixedFormatString(Callee, CI, B)) {
-      return V;
-    }
-
-    // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
-    // floating point arguments.
-    if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) {
-      Module *M = B.GetInsertBlock()->getParent()->getParent();
-      Constant *FIPrintFFn =
-        M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
-      CallInst *New = cast<CallInst>(CI->clone());
-      New->setCalledFunction(FIPrintFFn);
-      B.Insert(New);
-      return New;
-    }
-    return nullptr;
-  }
-};
-
-struct FWriteOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    ErrorReportingOpt ER(/* StreamArg = */ 3);
-    (void) ER.callOptimizer(Callee, CI, B);
-
-    // Require a pointer, an integer, an integer, a pointer, returning integer.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isIntegerTy() ||
-        !FT->getParamType(2)->isIntegerTy() ||
-        !FT->getParamType(3)->isPointerTy() ||
-        !FT->getReturnType()->isIntegerTy())
-      return nullptr;
-
-    // Get the element size and count.
-    ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
-    ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-    if (!SizeC || !CountC) return nullptr;
-    uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
-
-    // If this is writing zero records, remove the call (it's a noop).
-    if (Bytes == 0)
-      return ConstantInt::get(CI->getType(), 0);
-
-    // If this is writing one byte, turn it into fputc.
-    // This optimisation is only valid, if the return value is unused.
-    if (Bytes == 1 && CI->use_empty()) {  // fwrite(S,1,1,F) -> fputc(S[0],F)
-      Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
-      Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, DL, TLI);
-      return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
-    }
-
-    return nullptr;
-  }
-};
-
-struct FPutsOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    ErrorReportingOpt ER(/* StreamArg = */ 1);
-    (void) ER.callOptimizer(Callee, CI, B);
+  // If we just have a format string (nothing else crazy) transform it.
+  if (CI->getNumArgOperands() == 2) {
+    // Make sure there's no % in the constant array.  We could try to handle
+    // %% -> % in the future if we cared.
+    for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+      if (FormatStr[i] == '%')
+        return nullptr; // we found a format specifier, bail out.
 
     // These optimizations require DataLayout.
-    if (!DL) return nullptr;
-
-    // Require two pointers.  Also, we can't optimize if return value is used.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        !CI->use_empty())
+    if (!DL)
       return nullptr;
 
-    // fputs(s,F) --> fwrite(s,1,strlen(s),F)
-    uint64_t Len = GetStringLength(CI->getArgOperand(0));
-    if (!Len) return nullptr;
-    // Known to have no uses (see above).
-    return EmitFWrite(CI->getArgOperand(0),
-                      ConstantInt::get(DL->getIntPtrType(*Context), Len-1),
-                      CI->getArgOperand(1), B, DL, TLI);
+    // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
+    B.CreateMemCpy(
+        CI->getArgOperand(0), CI->getArgOperand(1),
+        ConstantInt::get(DL->getIntPtrType(CI->getContext()),
+                         FormatStr.size() + 1),
+        1); // Copy the null byte.
+    return ConstantInt::get(CI->getType(), FormatStr.size());
   }
-};
 
-struct PutsOpt : public LibCallOptimization {
-  Value *callOptimizer(Function *Callee, CallInst *CI,
-                       IRBuilder<> &B) override {
-    // Require one fixed pointer argument and an integer/void result.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
-        !(FT->getReturnType()->isIntegerTy() ||
-          FT->getReturnType()->isVoidTy()))
-      return nullptr;
-
-    // Check for a constant string.
-    StringRef Str;
-    if (!getConstantStringInfo(CI->getArgOperand(0), Str))
-      return nullptr;
-
-    if (Str.empty() && CI->use_empty()) {
-      // puts("") -> putchar('\n')
-      Value *Res = EmitPutChar(B.getInt32('\n'), B, DL, TLI);
-      if (CI->use_empty() || !Res) return Res;
-      return B.CreateIntCast(Res, CI->getType(), true);
-    }
-
+  // The remaining optimizations require the format string to be "%s" or "%c"
+  // and have an extra operand.
+  if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+      CI->getNumArgOperands() < 3)
     return nullptr;
-  }
-};
 
-} // End anonymous namespace.
+  // Decode the second character of the format string.
+  if (FormatStr[1] == 'c') {
+    // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+    if (!CI->getArgOperand(2)->getType()->isIntegerTy())
+      return nullptr;
+    Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
+    Value *Ptr = CastToCStr(CI->getArgOperand(0), B);
+    B.CreateStore(V, Ptr);
+    Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul");
+    B.CreateStore(B.getInt8(0), Ptr);
 
-namespace llvm {
-
-class LibCallSimplifierImpl {
-  const DataLayout *DL;
-  const TargetLibraryInfo *TLI;
-  const LibCallSimplifier *LCS;
-  bool UnsafeFPShrink;
-
-  // Math library call optimizations.
-  CosOpt Cos;
-  PowOpt Pow;
-  Exp2Opt Exp2;
-public:
-  LibCallSimplifierImpl(const DataLayout *DL, const TargetLibraryInfo *TLI,
-                        const LibCallSimplifier *LCS,
-                        bool UnsafeFPShrink = false)
-    : Cos(UnsafeFPShrink), Pow(UnsafeFPShrink), Exp2(UnsafeFPShrink) {
-    this->DL = DL;
-    this->TLI = TLI;
-    this->LCS = LCS;
-    this->UnsafeFPShrink = UnsafeFPShrink;
+    return ConstantInt::get(CI->getType(), 1);
   }
 
-  Value *optimizeCall(CallInst *CI);
-  LibCallOptimization *lookupOptimization(CallInst *CI);
-  bool hasFloatVersion(StringRef FuncName);
-};
+  if (FormatStr[1] == 's') {
+    // These optimizations require DataLayout.
+    if (!DL)
+      return nullptr;
 
-bool LibCallSimplifierImpl::hasFloatVersion(StringRef FuncName) {
+    // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
+    if (!CI->getArgOperand(2)->getType()->isPointerTy())
+      return nullptr;
+
+    Value *Len = EmitStrLen(CI->getArgOperand(2), B, DL, TLI);
+    if (!Len)
+      return nullptr;
+    Value *IncLen =
+        B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc");
+    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
+
+    // The sprintf result is the unincremented number of bytes in the string.
+    return B.CreateIntCast(Len, CI->getType(), false);
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Require two fixed pointer arguments and an integer result.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy() ||
+      !FT->getReturnType()->isIntegerTy())
+    return nullptr;
+
+  if (Value *V = optimizeSPrintFString(CI, B)) {
+    return V;
+  }
+
+  // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
+  // point arguments.
+  if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) {
+    Module *M = B.GetInsertBlock()->getParent()->getParent();
+    Constant *SIPrintFFn =
+        M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
+    CallInst *New = cast<CallInst>(CI->clone());
+    New->setCalledFunction(SIPrintFFn);
+    B.Insert(New);
+    return New;
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
+  optimizeErrorReporting(CI, B, 0);
+
+  // All the optimizations depend on the format string.
+  StringRef FormatStr;
+  if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+    return nullptr;
+
+  // Do not do any of the following transformations if the fprintf return
+  // value is used, in general the fprintf return value is not compatible
+  // with fwrite(), fputc() or fputs().
+  if (!CI->use_empty())
+    return nullptr;
+
+  // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+  if (CI->getNumArgOperands() == 2) {
+    for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+      if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
+        return nullptr;        // We found a format specifier.
+
+    // These optimizations require DataLayout.
+    if (!DL)
+      return nullptr;
+
+    return EmitFWrite(
+        CI->getArgOperand(1),
+        ConstantInt::get(DL->getIntPtrType(CI->getContext()), FormatStr.size()),
+        CI->getArgOperand(0), B, DL, TLI);
+  }
+
+  // The remaining optimizations require the format string to be "%s" or "%c"
+  // and have an extra operand.
+  if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+      CI->getNumArgOperands() < 3)
+    return nullptr;
+
+  // Decode the second character of the format string.
+  if (FormatStr[1] == 'c') {
+    // fprintf(F, "%c", chr) --> fputc(chr, F)
+    if (!CI->getArgOperand(2)->getType()->isIntegerTy())
+      return nullptr;
+    return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI);
+  }
+
+  if (FormatStr[1] == 's') {
+    // fprintf(F, "%s", str) --> fputs(str, F)
+    if (!CI->getArgOperand(2)->getType()->isPointerTy())
+      return nullptr;
+    return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, DL, TLI);
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Require two fixed paramters as pointers and integer result.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy() ||
+      !FT->getReturnType()->isIntegerTy())
+    return nullptr;
+
+  if (Value *V = optimizeFPrintFString(CI, B)) {
+    return V;
+  }
+
+  // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
+  // floating point arguments.
+  if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) {
+    Module *M = B.GetInsertBlock()->getParent()->getParent();
+    Constant *FIPrintFFn =
+        M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
+    CallInst *New = cast<CallInst>(CI->clone());
+    New->setCalledFunction(FIPrintFFn);
+    B.Insert(New);
+    return New;
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
+  optimizeErrorReporting(CI, B, 3);
+
+  Function *Callee = CI->getCalledFunction();
+  // Require a pointer, an integer, an integer, a pointer, returning integer.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isIntegerTy() ||
+      !FT->getParamType(2)->isIntegerTy() ||
+      !FT->getParamType(3)->isPointerTy() ||
+      !FT->getReturnType()->isIntegerTy())
+    return nullptr;
+
+  // Get the element size and count.
+  ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+  ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!SizeC || !CountC)
+    return nullptr;
+  uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
+
+  // If this is writing zero records, remove the call (it's a noop).
+  if (Bytes == 0)
+    return ConstantInt::get(CI->getType(), 0);
+
+  // If this is writing one byte, turn it into fputc.
+  // This optimisation is only valid, if the return value is unused.
+  if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
+    Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
+    Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, DL, TLI);
+    return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
+  }
+
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
+  optimizeErrorReporting(CI, B, 1);
+
+  Function *Callee = CI->getCalledFunction();
+
+  // These optimizations require DataLayout.
+  if (!DL)
+    return nullptr;
+
+  // Require two pointers.  Also, we can't optimize if return value is used.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+      !FT->getParamType(1)->isPointerTy() || !CI->use_empty())
+    return nullptr;
+
+  // fputs(s,F) --> fwrite(s,1,strlen(s),F)
+  uint64_t Len = GetStringLength(CI->getArgOperand(0));
+  if (!Len)
+    return nullptr;
+
+  // Known to have no uses (see above).
+  return EmitFWrite(
+      CI->getArgOperand(0),
+      ConstantInt::get(DL->getIntPtrType(CI->getContext()), Len - 1),
+      CI->getArgOperand(1), B, DL, TLI);
+}
+
+Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
+  Function *Callee = CI->getCalledFunction();
+  // Require one fixed pointer argument and an integer/void result.
+  FunctionType *FT = Callee->getFunctionType();
+  if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
+      !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy()))
+    return nullptr;
+
+  // Check for a constant string.
+  StringRef Str;
+  if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+    return nullptr;
+
+  if (Str.empty() && CI->use_empty()) {
+    // puts("") -> putchar('\n')
+    Value *Res = EmitPutChar(B.getInt32('\n'), B, DL, TLI);
+    if (CI->use_empty() || !Res)
+      return Res;
+    return B.CreateIntCast(Res, CI->getType(), true);
+  }
+
+  return nullptr;
+}
+
+bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
   LibFunc::Func Func;
   SmallString<20> FloatFuncName = FuncName;
   FloatFuncName += 'f';
@@ -2048,263 +1995,219 @@
   return false;
 }
 
-// Fortified library call optimizations.
-static MemCpyChkOpt MemCpyChk;
-static MemMoveChkOpt MemMoveChk;
-static MemSetChkOpt MemSetChk;
-static StrCpyChkOpt StrCpyChk;
-static StpCpyChkOpt StpCpyChk;
-static StrNCpyChkOpt StrNCpyChk;
+Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
+  if (CI->isNoBuiltin())
+    return nullptr;
 
-// String library call optimizations.
-static StrCatOpt StrCat;
-static StrNCatOpt StrNCat;
-static StrChrOpt StrChr;
-static StrRChrOpt StrRChr;
-static StrCmpOpt StrCmp;
-static StrNCmpOpt StrNCmp;
-static StrCpyOpt StrCpy;
-static StpCpyOpt StpCpy;
-static StrNCpyOpt StrNCpy;
-static StrLenOpt StrLen;
-static StrPBrkOpt StrPBrk;
-static StrToOpt StrTo;
-static StrSpnOpt StrSpn;
-static StrCSpnOpt StrCSpn;
-static StrStrOpt StrStr;
-
-// Memory library call optimizations.
-static MemCmpOpt MemCmp;
-static MemCpyOpt MemCpy;
-static MemMoveOpt MemMove;
-static MemSetOpt MemSet;
-
-// Math library call optimizations.
-static UnaryDoubleFPOpt UnaryDoubleFP(false);
-static BinaryDoubleFPOpt BinaryDoubleFP(false);
-static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
-static SinCosPiOpt SinCosPi;
-
-  // Integer library call optimizations.
-static FFSOpt FFS;
-static AbsOpt Abs;
-static IsDigitOpt IsDigit;
-static IsAsciiOpt IsAscii;
-static ToAsciiOpt ToAscii;
-
-// Formatting and IO library call optimizations.
-static ErrorReportingOpt ErrorReporting;
-static ErrorReportingOpt ErrorReporting0(0);
-static ErrorReportingOpt ErrorReporting1(1);
-static PrintFOpt PrintF;
-static SPrintFOpt SPrintF;
-static FPrintFOpt FPrintF;
-static FWriteOpt FWrite;
-static FPutsOpt FPuts;
-static PutsOpt Puts;
-
-LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) {
   LibFunc::Func Func;
   Function *Callee = CI->getCalledFunction();
   StringRef FuncName = Callee->getName();
+  IRBuilder<> Builder(CI);
+  bool isCallingConvC = CI->getCallingConv() == llvm::CallingConv::C;
 
-  // Next check for intrinsics.
+  // Command-line parameter overrides function attribute.
+  if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
+    UnsafeFPShrink = EnableUnsafeFPShrink;
+  else if (Callee->hasFnAttribute("unsafe-fp-math")) {
+    // FIXME: This is the same problem as described in optimizeSqrt().
+    // If calls gain access to IR-level FMF, then use that instead of a
+    // function attribute.
+
+    // Check for unsafe-fp-math = true.
+    Attribute Attr = Callee->getFnAttribute("unsafe-fp-math");
+    if (Attr.getValueAsString() == "true")
+      UnsafeFPShrink = true;
+  }
+
+  // First, check for intrinsics.
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+    if (!isCallingConvC)
+      return nullptr;
     switch (II->getIntrinsicID()) {
     case Intrinsic::pow:
-       return &Pow;
+      return optimizePow(CI, Builder);
     case Intrinsic::exp2:
-       return &Exp2;
+      return optimizeExp2(CI, Builder);
+    case Intrinsic::fabs:
+      return optimizeFabs(CI, Builder);
+    case Intrinsic::sqrt:
+      return optimizeSqrt(CI, Builder);
     default:
-       return nullptr;
+      return nullptr;
     }
   }
 
   // Then check for known library functions.
   if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) {
+    // We never change the calling convention.
+    if (!ignoreCallingConv(Func) && !isCallingConvC)
+      return nullptr;
     switch (Func) {
-      case LibFunc::strcat:
-        return &StrCat;
-      case LibFunc::strncat:
-        return &StrNCat;
-      case LibFunc::strchr:
-        return &StrChr;
-      case LibFunc::strrchr:
-        return &StrRChr;
-      case LibFunc::strcmp:
-        return &StrCmp;
-      case LibFunc::strncmp:
-        return &StrNCmp;
-      case LibFunc::strcpy:
-        return &StrCpy;
-      case LibFunc::stpcpy:
-        return &StpCpy;
-      case LibFunc::strncpy:
-        return &StrNCpy;
-      case LibFunc::strlen:
-        return &StrLen;
-      case LibFunc::strpbrk:
-        return &StrPBrk;
-      case LibFunc::strtol:
-      case LibFunc::strtod:
-      case LibFunc::strtof:
-      case LibFunc::strtoul:
-      case LibFunc::strtoll:
-      case LibFunc::strtold:
-      case LibFunc::strtoull:
-        return &StrTo;
-      case LibFunc::strspn:
-        return &StrSpn;
-      case LibFunc::strcspn:
-        return &StrCSpn;
-      case LibFunc::strstr:
-        return &StrStr;
-      case LibFunc::memcmp:
-        return &MemCmp;
-      case LibFunc::memcpy:
-        return &MemCpy;
-      case LibFunc::memmove:
-        return &MemMove;
-      case LibFunc::memset:
-        return &MemSet;
-      case LibFunc::cosf:
-      case LibFunc::cos:
-      case LibFunc::cosl:
-        return &Cos;
-      case LibFunc::sinpif:
-      case LibFunc::sinpi:
-      case LibFunc::cospif:
-      case LibFunc::cospi:
-        return &SinCosPi;
-      case LibFunc::powf:
-      case LibFunc::pow:
-      case LibFunc::powl:
-        return &Pow;
-      case LibFunc::exp2l:
-      case LibFunc::exp2:
-      case LibFunc::exp2f:
-        return &Exp2;
-      case LibFunc::ffs:
-      case LibFunc::ffsl:
-      case LibFunc::ffsll:
-        return &FFS;
-      case LibFunc::abs:
-      case LibFunc::labs:
-      case LibFunc::llabs:
-        return &Abs;
-      case LibFunc::isdigit:
-        return &IsDigit;
-      case LibFunc::isascii:
-        return &IsAscii;
-      case LibFunc::toascii:
-        return &ToAscii;
-      case LibFunc::printf:
-        return &PrintF;
-      case LibFunc::sprintf:
-        return &SPrintF;
-      case LibFunc::fprintf:
-        return &FPrintF;
-      case LibFunc::fwrite:
-        return &FWrite;
-      case LibFunc::fputs:
-        return &FPuts;
-      case LibFunc::puts:
-        return &Puts;
-      case LibFunc::perror:
-        return &ErrorReporting;
-      case LibFunc::vfprintf:
-      case LibFunc::fiprintf:
-        return &ErrorReporting0;
-      case LibFunc::fputc:
-        return &ErrorReporting1;
-      case LibFunc::ceil:
-      case LibFunc::fabs:
-      case LibFunc::floor:
-      case LibFunc::rint:
-      case LibFunc::round:
-      case LibFunc::nearbyint:
-      case LibFunc::trunc:
-        if (hasFloatVersion(FuncName))
-          return &UnaryDoubleFP;
-        return nullptr;
-      case LibFunc::acos:
-      case LibFunc::acosh:
-      case LibFunc::asin:
-      case LibFunc::asinh:
-      case LibFunc::atan:
-      case LibFunc::atanh:
-      case LibFunc::cbrt:
-      case LibFunc::cosh:
-      case LibFunc::exp:
-      case LibFunc::exp10:
-      case LibFunc::expm1:
-      case LibFunc::log:
-      case LibFunc::log10:
-      case LibFunc::log1p:
-      case LibFunc::log2:
-      case LibFunc::logb:
-      case LibFunc::sin:
-      case LibFunc::sinh:
-      case LibFunc::sqrt:
-      case LibFunc::tan:
-      case LibFunc::tanh:
-        if (UnsafeFPShrink && hasFloatVersion(FuncName))
-         return &UnsafeUnaryDoubleFP;
-        return nullptr;
-      case LibFunc::fmin:
-      case LibFunc::fmax:
-        if (hasFloatVersion(FuncName))
-          return &BinaryDoubleFP;
-        return nullptr;
-      case LibFunc::memcpy_chk:
-        return &MemCpyChk;
-      default:
-        return nullptr;
-      }
+    case LibFunc::strcat:
+      return optimizeStrCat(CI, Builder);
+    case LibFunc::strncat:
+      return optimizeStrNCat(CI, Builder);
+    case LibFunc::strchr:
+      return optimizeStrChr(CI, Builder);
+    case LibFunc::strrchr:
+      return optimizeStrRChr(CI, Builder);
+    case LibFunc::strcmp:
+      return optimizeStrCmp(CI, Builder);
+    case LibFunc::strncmp:
+      return optimizeStrNCmp(CI, Builder);
+    case LibFunc::strcpy:
+      return optimizeStrCpy(CI, Builder);
+    case LibFunc::stpcpy:
+      return optimizeStpCpy(CI, Builder);
+    case LibFunc::strncpy:
+      return optimizeStrNCpy(CI, Builder);
+    case LibFunc::strlen:
+      return optimizeStrLen(CI, Builder);
+    case LibFunc::strpbrk:
+      return optimizeStrPBrk(CI, Builder);
+    case LibFunc::strtol:
+    case LibFunc::strtod:
+    case LibFunc::strtof:
+    case LibFunc::strtoul:
+    case LibFunc::strtoll:
+    case LibFunc::strtold:
+    case LibFunc::strtoull:
+      return optimizeStrTo(CI, Builder);
+    case LibFunc::strspn:
+      return optimizeStrSpn(CI, Builder);
+    case LibFunc::strcspn:
+      return optimizeStrCSpn(CI, Builder);
+    case LibFunc::strstr:
+      return optimizeStrStr(CI, Builder);
+    case LibFunc::memcmp:
+      return optimizeMemCmp(CI, Builder);
+    case LibFunc::memcpy:
+      return optimizeMemCpy(CI, Builder);
+    case LibFunc::memmove:
+      return optimizeMemMove(CI, Builder);
+    case LibFunc::memset:
+      return optimizeMemSet(CI, Builder);
+    case LibFunc::cosf:
+    case LibFunc::cos:
+    case LibFunc::cosl:
+      return optimizeCos(CI, Builder);
+    case LibFunc::sinpif:
+    case LibFunc::sinpi:
+    case LibFunc::cospif:
+    case LibFunc::cospi:
+      return optimizeSinCosPi(CI, Builder);
+    case LibFunc::powf:
+    case LibFunc::pow:
+    case LibFunc::powl:
+      return optimizePow(CI, Builder);
+    case LibFunc::exp2l:
+    case LibFunc::exp2:
+    case LibFunc::exp2f:
+      return optimizeExp2(CI, Builder);
+    case LibFunc::fabsf:
+    case LibFunc::fabs:
+    case LibFunc::fabsl:
+      return optimizeFabs(CI, Builder);
+    case LibFunc::sqrtf:
+    case LibFunc::sqrt:
+    case LibFunc::sqrtl:
+      return optimizeSqrt(CI, Builder);
+    case LibFunc::ffs:
+    case LibFunc::ffsl:
+    case LibFunc::ffsll:
+      return optimizeFFS(CI, Builder);
+    case LibFunc::abs:
+    case LibFunc::labs:
+    case LibFunc::llabs:
+      return optimizeAbs(CI, Builder);
+    case LibFunc::isdigit:
+      return optimizeIsDigit(CI, Builder);
+    case LibFunc::isascii:
+      return optimizeIsAscii(CI, Builder);
+    case LibFunc::toascii:
+      return optimizeToAscii(CI, Builder);
+    case LibFunc::printf:
+      return optimizePrintF(CI, Builder);
+    case LibFunc::sprintf:
+      return optimizeSPrintF(CI, Builder);
+    case LibFunc::fprintf:
+      return optimizeFPrintF(CI, Builder);
+    case LibFunc::fwrite:
+      return optimizeFWrite(CI, Builder);
+    case LibFunc::fputs:
+      return optimizeFPuts(CI, Builder);
+    case LibFunc::puts:
+      return optimizePuts(CI, Builder);
+    case LibFunc::perror:
+      return optimizeErrorReporting(CI, Builder);
+    case LibFunc::vfprintf:
+    case LibFunc::fiprintf:
+      return optimizeErrorReporting(CI, Builder, 0);
+    case LibFunc::fputc:
+      return optimizeErrorReporting(CI, Builder, 1);
+    case LibFunc::ceil:
+    case LibFunc::floor:
+    case LibFunc::rint:
+    case LibFunc::round:
+    case LibFunc::nearbyint:
+    case LibFunc::trunc:
+      if (hasFloatVersion(FuncName))
+        return optimizeUnaryDoubleFP(CI, Builder, false);
+      return nullptr;
+    case LibFunc::acos:
+    case LibFunc::acosh:
+    case LibFunc::asin:
+    case LibFunc::asinh:
+    case LibFunc::atan:
+    case LibFunc::atanh:
+    case LibFunc::cbrt:
+    case LibFunc::cosh:
+    case LibFunc::exp:
+    case LibFunc::exp10:
+    case LibFunc::expm1:
+    case LibFunc::log:
+    case LibFunc::log10:
+    case LibFunc::log1p:
+    case LibFunc::log2:
+    case LibFunc::logb:
+    case LibFunc::sin:
+    case LibFunc::sinh:
+    case LibFunc::tan:
+    case LibFunc::tanh:
+      if (UnsafeFPShrink && hasFloatVersion(FuncName))
+        return optimizeUnaryDoubleFP(CI, Builder, true);
+      return nullptr;
+    case LibFunc::fmin:
+    case LibFunc::fmax:
+      if (hasFloatVersion(FuncName))
+        return optimizeBinaryDoubleFP(CI, Builder);
+      return nullptr;
+    case LibFunc::memcpy_chk:
+      return optimizeMemCpyChk(CI, Builder);
+    case LibFunc::memmove_chk:
+      return optimizeMemMoveChk(CI, Builder);
+    case LibFunc::memset_chk:
+      return optimizeMemSetChk(CI, Builder);
+    case LibFunc::strcpy_chk:
+      return optimizeStrCpyChk(CI, Builder);
+    case LibFunc::stpcpy_chk:
+      return optimizeStpCpyChk(CI, Builder);
+    case LibFunc::stpncpy_chk:
+    case LibFunc::strncpy_chk:
+      return optimizeStrNCpyChk(CI, Builder);
+    default:
+      return nullptr;
+    }
   }
 
-  // Finally check for fortified library calls.
-  if (FuncName.endswith("_chk")) {
-    if (FuncName == "__memmove_chk")
-      return &MemMoveChk;
-    else if (FuncName == "__memset_chk")
-      return &MemSetChk;
-    else if (FuncName == "__strcpy_chk")
-      return &StrCpyChk;
-    else if (FuncName == "__stpcpy_chk")
-      return &StpCpyChk;
-    else if (FuncName == "__strncpy_chk")
-      return &StrNCpyChk;
-    else if (FuncName == "__stpncpy_chk")
-      return &StrNCpyChk;
-  }
-
-  return nullptr;
-
-}
-
-Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
-  LibCallOptimization *LCO = lookupOptimization(CI);
-  if (LCO) {
-    IRBuilder<> Builder(CI);
-    return LCO->optimizeCall(CI, DL, TLI, LCS, Builder);
-  }
   return nullptr;
 }
 
 LibCallSimplifier::LibCallSimplifier(const DataLayout *DL,
-                                     const TargetLibraryInfo *TLI,
-                                     bool UnsafeFPShrink) {
-  Impl = new LibCallSimplifierImpl(DL, TLI, this, UnsafeFPShrink);
-}
-
-LibCallSimplifier::~LibCallSimplifier() {
-  delete Impl;
-}
-
-Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
-  if (CI->isNoBuiltin()) return nullptr;
-  return Impl->optimizeCall(CI);
+                                     const TargetLibraryInfo *TLI) :
+                                     DL(DL),
+                                     TLI(TLI),
+                                     UnsafeFPShrink(false) {
 }
 
 void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const {
@@ -2312,8 +2215,6 @@
   I->eraseFromParent();
 }
 
-}
-
 // TODO:
 //   Additional cases that we need to add to this file:
 //

diff --git a/lib/Transforms/Utils/SymbolRewriter.cpp b/lib/Transforms/Utils/SymbolRewriter.cpp
new file mode 100644
index 0000000..aacc945
--- /dev/null
+++ b/lib/Transforms/Utils/SymbolRewriter.cpp

@@ -0,0 +1,525 @@
+//===- SymbolRewriter.cpp - Symbol Rewriter ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SymbolRewriter is a LLVM pass which can rewrite symbols transparently within
+// existing code.  It is implemented as a compiler pass and is configured via a
+// YAML configuration file.
+//
+// The YAML configuration file format is as follows:
+//
+// RewriteMapFile := RewriteDescriptors
+// RewriteDescriptors := RewriteDescriptor | RewriteDescriptors
+// RewriteDescriptor := RewriteDescriptorType ':' '{' RewriteDescriptorFields '}'
+// RewriteDescriptorFields := RewriteDescriptorField | RewriteDescriptorFields
+// RewriteDescriptorField := FieldIdentifier ':' FieldValue ','
+// RewriteDescriptorType := Identifier
+// FieldIdentifier := Identifier
+// FieldValue := Identifier
+// Identifier := [0-9a-zA-Z]+
+//
+// Currently, the following descriptor types are supported:
+//
+// - function:          (function rewriting)
+//      + Source        (original name of the function)
+//      + Target        (explicit transformation)
+//      + Transform     (pattern transformation)
+//      + Naked         (boolean, whether the function is undecorated)
+// - global variable:   (external linkage global variable rewriting)
+//      + Source        (original name of externally visible variable)
+//      + Target        (explicit transformation)
+//      + Transform     (pattern transformation)
+// - global alias:      (global alias rewriting)
+//      + Source        (original name of the aliased name)
+//      + Target        (explicit transformation)
+//      + Transform     (pattern transformation)
+//
+// Note that source and exactly one of [Target, Transform] must be provided
+//
+// New rewrite descriptors can be created.  Adding a new rewrite descriptor
+// involves:
+//
+//  a) extending the rewrite descriptor kind enumeration
+//     (<anonymous>::RewriteDescriptor::RewriteDescriptorType)
+//  b) implementing the new descriptor
+//     (c.f. <anonymous>::ExplicitRewriteFunctionDescriptor)
+//  c) extending the rewrite map parser
+//     (<anonymous>::RewriteMapParser::parseEntry)
+//
+//  Specify to rewrite the symbols using the `-rewrite-symbols` option, and
+//  specify the map file to use for the rewriting via the `-rewrite-map-file`
+//  option.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "symbol-rewriter"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/SymbolRewriter.h"
+
+using namespace llvm;
+
+static cl::list<std::string> RewriteMapFiles("rewrite-map-file",
+                                             cl::desc("Symbol Rewrite Map"),
+                                             cl::value_desc("filename"));
+
+namespace llvm {
+namespace SymbolRewriter {
+/// Rewrite descriptor that renames a single, explicitly named symbol.
+///
+/// \tparam DT         descriptor kind handed to RewriteDescriptor and tested
+///                    by classof.
+/// \tparam ValueType  type of the module-level value that is looked up.
+/// \tparam Get        Module member function used to find a value by name.
+template <RewriteDescriptor::Type DT, typename ValueType,
+          ValueType *(llvm::Module::*Get)(StringRef) const>
+class ExplicitRewriteDescriptor : public RewriteDescriptor {
+public:
+  // Name that is looked up in the module.  It is prefixed with a '\01' byte
+  // when the descriptor is constructed with Naked set.
+  const std::string Source;
+  // Name the source symbol is renamed to.
+  const std::string Target;
+
+  ExplicitRewriteDescriptor(StringRef S, StringRef T, const bool Naked)
+      : RewriteDescriptor(DT), Source(Naked ? StringRef("\01" + S.str()) : S),
+        Target(T) {}
+
+  bool performOnModule(Module &M) override;
+
+  // Supports isa<>/dyn_cast<> by comparing the stored descriptor kind.
+  static bool classof(const RewriteDescriptor *RD) {
+    return RD->getType() == DT;
+  }
+};
+
+/// Renames the value called Source, if \p M has one, to Target.  If a value
+/// called Target already exists, the source value takes over that value's name
+/// entry; otherwise the source value is simply given the new name.
+///
+/// \returns true if a value named Source was found in \p M.
+template <RewriteDescriptor::Type DT, typename ValueType,
+          ValueType *(llvm::Module::*Get)(StringRef) const>
+bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) {
+  ValueType *Original = (M.*Get)(Source);
+  if (!Original)
+    return false;
+
+  Value *Existing = (M.*Get)(Target);
+  if (Existing)
+    Original->setValueName(Existing->getValueName());
+  else
+    Original->setName(Target);
+
+  return true;
+}
+
+/// Rewrite descriptor that renames symbols through a regular-expression
+/// substitution.
+///
+/// \tparam DT         descriptor kind handed to RewriteDescriptor and tested
+///                    by classof.
+/// \tparam ValueType  type of the module-level values that are visited.
+/// \tparam Get        Module member function used to find a value by name.
+/// \tparam Iterator   Module member function returning the range of values to
+///                    visit.
+template <RewriteDescriptor::Type DT, typename ValueType,
+          ValueType *(llvm::Module::*Get)(StringRef) const,
+          iterator_range<typename iplist<ValueType>::iterator> (llvm::Module::*Iterator)()>
+class PatternRewriteDescriptor : public RewriteDescriptor {
+public:
+  // Regular expression applied to each symbol name.
+  const std::string Pattern;
+  // Replacement text passed to Regex::sub together with Pattern.
+  const std::string Transform;
+
+  PatternRewriteDescriptor(StringRef P, StringRef T)
+    : RewriteDescriptor(DT), Pattern(P), Transform(T) { }
+
+  bool performOnModule(Module &M) override;
+
+  // Supports isa<>/dyn_cast<> by comparing the stored descriptor kind.
+  static bool classof(const RewriteDescriptor *RD) {
+    return RD->getType() == DT;
+  }
+};
+
+/// Applies the Pattern/Transform substitution to the name of every value in
+/// the range returned by Iterator for \p M.
+///
+/// A value whose name comes back unchanged from the substitution is left
+/// alone.  Otherwise, if a value with the new name already exists, the visited
+/// value takes over that value's name entry; if not, it is simply renamed.
+/// A substitution error is reported through report_fatal_error.
+///
+/// \returns true if at least one value was actually given a different name.
+template <RewriteDescriptor::Type DT, typename ValueType,
+          ValueType *(llvm::Module::*Get)(StringRef) const,
+          iterator_range<typename iplist<ValueType>::iterator> (llvm::Module::*Iterator)()>
+bool PatternRewriteDescriptor<DT, ValueType, Get, Iterator>::
+performOnModule(Module &M) {
+  bool Changed = false;
+
+  // The pattern is identical for every symbol, so compile it only once.
+  Regex Matcher(Pattern);
+
+  for (auto &C : (M.*Iterator)()) {
+    std::string Error;
+
+    std::string Name = Matcher.sub(Transform, C.getName(), &Error);
+    if (!Error.empty())
+      report_fatal_error("unable to transform " + C.getName() + " in " +
+                         M.getModuleIdentifier() + ": " + Error);
+
+    // Nothing to rewrite when the substitution leaves the name as it was
+    // (for instance because the pattern did not match); do not report such
+    // symbols as a modification of the module.
+    if (C.getName() == Name)
+      continue;
+
+    if (Value *V = (M.*Get)(Name))
+      C.setValueName(V->getValueName());
+    else
+      C.setName(Name);
+
+    Changed = true;
+  }
+  return Changed;
+}
+
+/// Represents a rewrite for an explicitly named (function) symbol.  Both the
+/// source function name and target function name of the transformation are
+/// explicitly spelt out.
+typedef ExplicitRewriteDescriptor<RewriteDescriptor::Type::Function,
+                                  llvm::Function, &llvm::Module::getFunction>
+    ExplicitRewriteFunctionDescriptor;
+
+/// Represents a rewrite for an explicitly named (global variable) symbol.  Both
+/// the source variable name and target variable name are spelt out.  This
+/// applies only to module level variables.
+typedef ExplicitRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
+                                  llvm::GlobalVariable,
+                                  &llvm::Module::getGlobalVariable>
+    ExplicitRewriteGlobalVariableDescriptor;
+
+/// Represents a rewrite for an explicitly named global alias.  Both the source
+/// and target name are explicitly spelt out.
+typedef ExplicitRewriteDescriptor<RewriteDescriptor::Type::NamedAlias,
+                                  llvm::GlobalAlias,
+                                  &llvm::Module::getNamedAlias>
+    ExplicitRewriteNamedAliasDescriptor;
+
+/// Represents a rewrite for functions based upon a regular expression.  A
+/// pattern for the function name, together with a transformation applied to
+/// the names matching that pattern, makes up the rewrite rule.
+typedef PatternRewriteDescriptor<RewriteDescriptor::Type::Function,
+                                 llvm::Function, &llvm::Module::getFunction,
+                                 &llvm::Module::functions>
+    PatternRewriteFunctionDescriptor;
+
+/// Represents a rewrite for a global variable based upon a matching pattern.
+/// Each global variable matching the provided pattern will be transformed as
+/// described in the transformation pattern for the target.  Applies only to
+/// module level variables.
+typedef PatternRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
+                                 llvm::GlobalVariable,
+                                 &llvm::Module::getGlobalVariable,
+                                 &llvm::Module::globals>
+    PatternRewriteGlobalVariableDescriptor;
+
+/// Represents a rewrite for global aliases which match a given pattern.  The
+/// provided transformation will be applied to each of the matching names.
+typedef PatternRewriteDescriptor<RewriteDescriptor::Type::NamedAlias,
+                                 llvm::GlobalAlias,
+                                 &llvm::Module::getNamedAlias,
+                                 &llvm::Module::aliases>
+    PatternRewriteNamedAliasDescriptor;
+
+/// Loads the rewrite map stored in the file \p MapFile and appends the
+/// descriptors it defines to \p DL.
+///
+/// A file that cannot be read or parsed is reported through
+/// report_fatal_error; the only value this function returns is true.
+bool RewriteMapParser::parse(const std::string &MapFile,
+                             RewriteDescriptorList *DL) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buffer =
+      MemoryBuffer::getFile(MapFile);
+  if (!Buffer)
+    report_fatal_error("unable to read rewrite map '" + MapFile + "': " +
+                       Buffer.getError().message());
+
+  const bool Parsed = parse(*Buffer, DL);
+  if (!Parsed)
+    report_fatal_error("unable to parse rewrite map '" + MapFile + "'");
+
+  return true;
+}
+
+/// Parses every YAML document held in \p MapFile and appends the descriptors
+/// found to \p DL.  Empty documents are skipped; any other document has to be
+/// a map whose entries are handed to parseEntry one by one.
+///
+/// \returns false as soon as a document root is not a map or an entry fails
+/// to parse, true otherwise.
+bool RewriteMapParser::parse(std::unique_ptr<MemoryBuffer> &MapFile,
+                             RewriteDescriptorList *DL) {
+  SourceMgr SM;
+  yaml::Stream YS(MapFile->getBuffer(), SM);
+
+  for (auto &Document : YS) {
+    yaml::Node *Root = Document.getRoot();
+
+    // ignore empty documents
+    if (isa<yaml::NullNode>(Root))
+      continue;
+
+    auto *Entries = dyn_cast<yaml::MappingNode>(Root);
+    if (!Entries) {
+      YS.printError(Root, "DescriptorList node must be a map");
+      return false;
+    }
+
+    for (auto &Entry : *Entries) {
+      if (!parseEntry(YS, Entry, DL))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// Parses one top-level entry of a rewrite map.  The entry key names the
+/// rewrite type ("function", "global variable" or "global alias") and selects
+/// the parser that receives the entry value, which has to be a map.
+///
+/// \returns the result of the selected parser, or false after printing an
+/// error when the key is not a scalar, the value is not a map, or the rewrite
+/// type is unknown.
+bool RewriteMapParser::parseEntry(yaml::Stream &YS, yaml::KeyValueNode &Entry,
+                                  RewriteDescriptorList *DL) {
+  auto *TypeNode = dyn_cast<yaml::ScalarNode>(Entry.getKey());
+  if (!TypeNode) {
+    YS.printError(Entry.getKey(), "rewrite type must be a scalar");
+    return false;
+  }
+
+  auto *Fields = dyn_cast<yaml::MappingNode>(Entry.getValue());
+  if (!Fields) {
+    YS.printError(Entry.getValue(), "rewrite descriptor must be a map");
+    return false;
+  }
+
+  SmallString<32> TypeStorage;
+  const StringRef RewriteType = TypeNode->getValue(TypeStorage);
+
+  if (RewriteType == "function")
+    return parseRewriteFunctionDescriptor(YS, TypeNode, Fields, DL);
+  if (RewriteType == "global variable")
+    return parseRewriteGlobalVariableDescriptor(YS, TypeNode, Fields, DL);
+  if (RewriteType == "global alias")
+    return parseRewriteGlobalAliasDescriptor(YS, TypeNode, Fields, DL);
+
+  YS.printError(Entry.getKey(), "unknown rewrite type");
+  return false;
+}
+
+/// Parses the fields of a "function" rewrite descriptor and appends the
+/// resulting descriptor to \p DL.
+///
+/// Recognised keys are "source", "target", "transform" and "naked".  The
+/// source has to be present and has to be a valid regular expression, and
+/// exactly one of target and transform has to be given.  A target yields an
+/// explicit rewrite, a transform yields a pattern rewrite.
+///
+/// \param YS          stream used to print parse errors.
+/// \param K           key node of the descriptor entry; not used here.
+/// \param Descriptor  map holding the descriptor fields.
+/// \param DL          list that receives the new descriptor.
+/// \returns false after printing an error for a malformed descriptor, true
+/// once a descriptor has been appended.
+bool RewriteMapParser::
+parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+                               yaml::MappingNode *Descriptor,
+                               RewriteDescriptorList *DL) {
+  bool Naked = false;
+  std::string Source;
+  std::string Target;
+  std::string Transform;
+
+  for (auto &Field : *Descriptor) {
+    SmallString<32> KeyStorage;
+    SmallString<32> ValueStorage;
+
+    auto *Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+    if (!Key) {
+      YS.printError(Field.getKey(), "descriptor key must be a scalar");
+      return false;
+    }
+
+    auto *Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+    if (!Value) {
+      YS.printError(Field.getValue(), "descriptor value must be a scalar");
+      return false;
+    }
+
+    StringRef KeyValue = Key->getValue(KeyStorage);
+    if (KeyValue.equals("source")) {
+      std::string Error;
+
+      Source = Value->getValue(ValueStorage);
+      if (!Regex(Source).isValid(Error)) {
+        YS.printError(Field.getKey(), "invalid regex: " + Error);
+        return false;
+      }
+    } else if (KeyValue.equals("target")) {
+      Target = Value->getValue(ValueStorage);
+    } else if (KeyValue.equals("transform")) {
+      Transform = Value->getValue(ValueStorage);
+    } else if (KeyValue.equals("naked")) {
+      std::string Undecorated;
+
+      Undecorated = Value->getValue(ValueStorage);
+      Naked = StringRef(Undecorated).lower() == "true" || Undecorated == "1";
+    } else {
+      YS.printError(Field.getKey(), "unknown key for function");
+      return false;
+    }
+  }
+
+  // The map format described at the top of this file requires a source;
+  // reject the descriptor instead of building one with an empty name/pattern.
+  if (Source.empty()) {
+    YS.printError(Descriptor, "source must be specified");
+    return false;
+  }
+
+  if (Transform.empty() == Target.empty()) {
+    YS.printError(Descriptor,
+                  "exactly one of transform or target must be specified");
+    return false;
+  }
+
+  // TODO see if there is a more elegant solution to selecting the rewrite
+  // descriptor type
+  if (!Target.empty())
+    DL->push_back(new ExplicitRewriteFunctionDescriptor(Source, Target, Naked));
+  else
+    DL->push_back(new PatternRewriteFunctionDescriptor(Source, Transform));
+
+  return true;
+}
+
+/// Parses the fields of a "global variable" rewrite descriptor and appends
+/// the resulting descriptor to \p DL.
+///
+/// Recognised keys are "source", "target" and "transform".  The source has to
+/// be present and has to be a valid regular expression, and exactly one of
+/// target and transform has to be given.  A target yields an explicit rewrite,
+/// a transform yields a pattern rewrite.
+///
+/// \param YS          stream used to print parse errors.
+/// \param K           key node of the descriptor entry; not used here.
+/// \param Descriptor  map holding the descriptor fields.
+/// \param DL          list that receives the new descriptor.
+/// \returns false after printing an error for a malformed descriptor, true
+/// once a descriptor has been appended.
+bool RewriteMapParser::
+parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+                                     yaml::MappingNode *Descriptor,
+                                     RewriteDescriptorList *DL) {
+  std::string Source;
+  std::string Target;
+  std::string Transform;
+
+  for (auto &Field : *Descriptor) {
+    SmallString<32> KeyStorage;
+    SmallString<32> ValueStorage;
+
+    auto *Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+    if (!Key) {
+      YS.printError(Field.getKey(), "descriptor Key must be a scalar");
+      return false;
+    }
+
+    auto *Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+    if (!Value) {
+      YS.printError(Field.getValue(), "descriptor value must be a scalar");
+      return false;
+    }
+
+    StringRef KeyValue = Key->getValue(KeyStorage);
+    if (KeyValue.equals("source")) {
+      std::string Error;
+
+      Source = Value->getValue(ValueStorage);
+      if (!Regex(Source).isValid(Error)) {
+        YS.printError(Field.getKey(), "invalid regex: " + Error);
+        return false;
+      }
+    } else if (KeyValue.equals("target")) {
+      Target = Value->getValue(ValueStorage);
+    } else if (KeyValue.equals("transform")) {
+      Transform = Value->getValue(ValueStorage);
+    } else {
+      YS.printError(Field.getKey(), "unknown Key for Global Variable");
+      return false;
+    }
+  }
+
+  // The map format described at the top of this file requires a source;
+  // reject the descriptor instead of building one with an empty name/pattern.
+  if (Source.empty()) {
+    YS.printError(Descriptor, "source must be specified");
+    return false;
+  }
+
+  if (Transform.empty() == Target.empty()) {
+    YS.printError(Descriptor,
+                  "exactly one of transform or target must be specified");
+    return false;
+  }
+
+  if (!Target.empty())
+    DL->push_back(new ExplicitRewriteGlobalVariableDescriptor(Source, Target,
+                                                              /*Naked*/false));
+  else
+    DL->push_back(new PatternRewriteGlobalVariableDescriptor(Source,
+                                                             Transform));
+
+  return true;
+}
+
+/// Parses the fields of a "global alias" rewrite descriptor and appends the
+/// resulting descriptor to \p DL.
+///
+/// Recognised keys are "source", "target" and "transform".  The source has to
+/// be present and has to be a valid regular expression, and exactly one of
+/// target and transform has to be given.  A target yields an explicit rewrite,
+/// a transform yields a pattern rewrite.
+///
+/// \param YS          stream used to print parse errors.
+/// \param K           key node of the descriptor entry; not used here.
+/// \param Descriptor  map holding the descriptor fields.
+/// \param DL          list that receives the new descriptor.
+/// \returns false after printing an error for a malformed descriptor, true
+/// once a descriptor has been appended.
+bool RewriteMapParser::
+parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+                                  yaml::MappingNode *Descriptor,
+                                  RewriteDescriptorList *DL) {
+  std::string Source;
+  std::string Target;
+  std::string Transform;
+
+  for (auto &Field : *Descriptor) {
+    SmallString<32> KeyStorage;
+    SmallString<32> ValueStorage;
+
+    auto *Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+    if (!Key) {
+      YS.printError(Field.getKey(), "descriptor key must be a scalar");
+      return false;
+    }
+
+    auto *Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+    if (!Value) {
+      YS.printError(Field.getValue(), "descriptor value must be a scalar");
+      return false;
+    }
+
+    StringRef KeyValue = Key->getValue(KeyStorage);
+    if (KeyValue.equals("source")) {
+      std::string Error;
+
+      Source = Value->getValue(ValueStorage);
+      if (!Regex(Source).isValid(Error)) {
+        YS.printError(Field.getKey(), "invalid regex: " + Error);
+        return false;
+      }
+    } else if (KeyValue.equals("target")) {
+      Target = Value->getValue(ValueStorage);
+    } else if (KeyValue.equals("transform")) {
+      Transform = Value->getValue(ValueStorage);
+    } else {
+      YS.printError(Field.getKey(), "unknown key for Global Alias");
+      return false;
+    }
+  }
+
+  // The map format described at the top of this file requires a source;
+  // reject the descriptor instead of building one with an empty name/pattern.
+  if (Source.empty()) {
+    YS.printError(Descriptor, "source must be specified");
+    return false;
+  }
+
+  if (Transform.empty() == Target.empty()) {
+    YS.printError(Descriptor,
+                  "exactly one of transform or target must be specified");
+    return false;
+  }
+
+  if (!Target.empty())
+    DL->push_back(new ExplicitRewriteNamedAliasDescriptor(Source, Target,
+                                                          /*Naked*/false));
+  else
+    DL->push_back(new PatternRewriteNamedAliasDescriptor(Source, Transform));
+
+  return true;
+}
+}
+}
+
+namespace {
+/// Module pass that runs a list of symbol rewrite descriptors over a module.
+/// The descriptors either come from the map files named on the command line
+/// (default constructor) or are handed in by the caller.
+class RewriteSymbols : public ModulePass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  // Builds the descriptor list from the -rewrite-map-file files.
+  RewriteSymbols();
+  // Takes the descriptors out of DL instead of reading any map file.
+  RewriteSymbols(SymbolRewriter::RewriteDescriptorList &DL);
+
+  // Runs every descriptor on M; returns whether any of them reported a change.
+  bool runOnModule(Module &M) override;
+
+private:
+  // Parses each file listed in RewriteMapFiles into Descriptors.
+  void loadAndParseMapFiles();
+
+  // Rewrites to perform, in the order in which they are run.
+  SymbolRewriter::RewriteDescriptorList Descriptors;
+};
+
+char RewriteSymbols::ID = 0;
+
+/// Creates the pass, initializes it with the global pass registry and loads
+/// the descriptors from the map files given on the command line.
+RewriteSymbols::RewriteSymbols() : ModulePass(ID) {
+  initializeRewriteSymbolsPass(*PassRegistry::getPassRegistry());
+  loadAndParseMapFiles();
+}
+
+/// Creates the pass from an already built descriptor list.  All descriptors
+/// are moved out of \p DL, which is left empty; no map file is read.
+RewriteSymbols::RewriteSymbols(SymbolRewriter::RewriteDescriptorList &DL)
+    : ModulePass(ID) {
+  // Transfer the nodes with splice rather than std::swap.
+  // NOTE(review): swap on the intrusive list type appears to be an asserting
+  // stub in this LLVM revision - confirm against llvm/ADT/ilist.h.
+  Descriptors.splice(Descriptors.begin(), DL);
+}
+
+/// Runs every stored descriptor on \p M, in list order.
+///
+/// \returns true if at least one descriptor reported a change.
+bool RewriteSymbols::runOnModule(Module &M) {
+  bool AnyRewritten = false;
+
+  for (auto &Rewrite : Descriptors) {
+    if (Rewrite.performOnModule(M))
+      AnyRewritten = true;
+  }
+
+  return AnyRewritten;
+}
+
+/// Parses every file named by the rewrite-map-file option, in the order
+/// given, and collects the resulting descriptors in Descriptors.
+void RewriteSymbols::loadAndParseMapFiles() {
+  SymbolRewriter::RewriteMapParser Parser;
+  const std::vector<std::string> &Files = RewriteMapFiles;
+
+  for (const std::string &File : Files)
+    Parser.parse(File, &Descriptors);
+}
+}
+
+// Defines the pass initialization for RewriteSymbols with the argument string
+// "rewrite-symbols" and the display name "Rewrite Symbols".
+// NOTE(review): the two trailing flags are presumably "CFG only" and
+// "is analysis" - confirm against the INITIALIZE_PASS macro.
+INITIALIZE_PASS(RewriteSymbols, "rewrite-symbols", "Rewrite Symbols", false,
+                false)
+
+/// Creates a RewriteSymbols pass that takes its descriptors from the map files
+/// named on the command line.
+ModulePass *llvm::createRewriteSymbolsPass() { return new RewriteSymbols(); }
+
+/// Creates a RewriteSymbols pass from the caller-supplied descriptor list
+/// \p DL, which is passed on to the pass constructor.
+ModulePass *
+llvm::createRewriteSymbolsPass(SymbolRewriter::RewriteDescriptorList &DL) {
+  return new RewriteSymbols(DL);
+}

diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 0f20e6d..a2f69d1 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp

@@ -210,8 +210,10 @@
   // Remap attached metadata.
   SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
   I->getAllMetadata(MDs);
-  for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator
-       MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) {
+  for (SmallVectorImpl<std::pair<unsigned, MDNode *>>::iterator
+           MI = MDs.begin(),
+           ME = MDs.end();
+       MI != ME; ++MI) {
     MDNode *Old = MI->second;
     MDNode *New = MapValue(Old, VMap, Flags, TypeMapper, Materializer);
     if (New != Old)

diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 28ec83b..b4991bc 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp

@@ -391,8 +391,6 @@
                      Instruction *&InsertionPt,
                      Instruction *I, Instruction *J);
 
-    void combineMetadata(Instruction *K, const Instruction *J);
-
     bool vectorizeBB(BasicBlock &BB) {
       if (skipOptnoneFunction(BB))
         return false;
@@ -687,6 +685,8 @@
       case Intrinsic::trunc:
       case Intrinsic::floor:
       case Intrinsic::fabs:
+      case Intrinsic::minnum:
+      case Intrinsic::maxnum:
         return Config.VectorizeMath;
       case Intrinsic::bswap:
       case Intrinsic::ctpop:
@@ -2964,31 +2964,6 @@
     }
   }
 
-  // When the first instruction in each pair is cloned, it will inherit its
-  // parent's metadata. This metadata must be combined with that of the other
-  // instruction in a safe way.
-  void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
-    SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
-    K->getAllMetadataOtherThanDebugLoc(Metadata);
-    for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
-      unsigned Kind = Metadata[i].first;
-      MDNode *JMD = J->getMetadata(Kind);
-      MDNode *KMD = Metadata[i].second;
-
-      switch (Kind) {
-      default:
-        K->setMetadata(Kind, nullptr); // Remove unknown metadata
-        break;
-      case LLVMContext::MD_tbaa:
-        K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
-        break;
-      case LLVMContext::MD_fpmath:
-        K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
-        break;
-      }
-    }
-  }
-
   // This function fuses the chosen instruction pairs into vector instructions,
   // taking care preserve any needed scalar outputs and, then, it reorders the
   // remaining instructions as needed (users of the first member of the pair
@@ -3138,7 +3113,13 @@
       if (!isa<StoreInst>(K))
         K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
 
-      combineMetadata(K, H);
+      unsigned KnownIDs[] = {
+        LLVMContext::MD_tbaa,
+        LLVMContext::MD_alias_scope,
+        LLVMContext::MD_noalias,
+        LLVMContext::MD_fpmath
+      };
+      combineMetadata(K, H, KnownIDs);
       K->intersectOptionalDataWith(H);
 
       for (unsigned o = 0; o < NumOperands; ++o)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index cb8a41d..35b2ecf 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -54,7 +54,10 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -107,8 +110,8 @@
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
 
 static cl::opt<unsigned>
-VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
-                    cl::desc("Sets the vectorization unroll count. "
+VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden,
+                    cl::desc("Sets the vectorization interleave count. "
                              "Zero is autoselect."));
 
 static cl::opt<bool>
@@ -156,17 +159,17 @@
     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's number of vector registers."));
 
-/// Maximum vectorization unroll count.
-static const unsigned MaxUnrollFactor = 16;
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
 
-static cl::opt<unsigned> ForceTargetMaxScalarUnrollFactor(
-    "force-target-max-scalar-unroll", cl::init(0), cl::Hidden,
-    cl::desc("A flag that overrides the target's max unroll factor for scalar "
-             "loops."));
+static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
+    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's max interleave factor for "
+             "scalar loops."));
 
-static cl::opt<unsigned> ForceTargetMaxVectorUnrollFactor(
-    "force-target-max-vector-unroll", cl::init(0), cl::Hidden,
-    cl::desc("A flag that overrides the target's max unroll factor for "
+static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
+    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's max interleave factor for "
              "vectorized loops."));
 
 static cl::opt<unsigned> ForceTargetInstructionCost(
@@ -203,11 +206,17 @@
     "enable-cond-stores-vec", cl::init(false), cl::Hidden,
     cl::desc("Enable if predication of stores during vectorization."));
 
+static cl::opt<unsigned> MaxNestedScalarReductionUF(
+    "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
+    cl::desc("The maximum unroll factor to use when unrolling a scalar "
+             "reduction in a nested loop."));
+
 namespace {
 
 // Forward declarations.
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
+class LoopVectorizeHints;
 
 /// Optimization analysis message produced during vectorization. Messages inform
 /// the user why vectorization did not occur.
@@ -409,6 +418,8 @@
   LoopInfo *LI;
   /// Dominator Tree.
   DominatorTree *DT;
+  /// Alias Analysis.
+  AliasAnalysis *AA;
   /// Data Layout.
   const DataLayout *DL;
   /// Target Library Info.
@@ -518,6 +529,36 @@
 }
 #endif
 
+/// \brief Copy the metadata kinds known to be safe from \p From onto \p To.
+static void propagateMetadata(Instruction *To, const Instruction *From) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> Attached;
+  From->getAllMetadataOtherThanDebugLoc(Attached);
+
+  for (const auto &Entry : Attached) {
+    // Only these kinds are safe to transfer (this holds for TBAA even when we
+    // if-convert: had the metadata depended on the condition, and thus aliased
+    // with some other non-speculated memory access when the condition was
+    // false, the runtime overlap checks would catch it).
+    switch (Entry.first) {
+    case LLVMContext::MD_tbaa:
+    case LLVMContext::MD_alias_scope:
+    case LLVMContext::MD_noalias:
+    case LLVMContext::MD_fpmath:
+      To->setMetadata(Entry.first, Entry.second);
+      break;
+    default:
+      break;
+    }
+  }
+}
+
+/// \brief Propagate known metadata from \p From to each instruction in \p To.
+static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
+  for (Value *Candidate : To)
+    if (auto *Inst = dyn_cast<Instruction>(Candidate))
+      propagateMetadata(Inst, From);
+}
+
 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 /// to what vectorization factor.
 /// This class does not look at the profitability of vectorization, only the
@@ -539,9 +580,9 @@
 
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                             DominatorTree *DT, TargetLibraryInfo *TLI,
-                            Function *F)
+                            AliasAnalysis *AA, Function *F)
       : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
-        DT(DT), TLI(TLI), TheFunction(F), Induction(nullptr),
+        DT(DT), TLI(TLI), AA(AA), TheFunction(F), Induction(nullptr),
         WidestIndTy(nullptr), HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {
   }
 
@@ -629,11 +670,12 @@
       Ends.clear();
       IsWritePtr.clear();
       DependencySetId.clear();
+      AliasSetId.clear();
     }
 
     /// Insert a pointer and calculate the start and end SCEVs.
     void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
-                unsigned DepSetId, ValueToValueMap &Strides);
+                unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);
 
     /// This flag indicates if we need to add the runtime check.
     bool Need;
@@ -648,6 +690,8 @@
     /// Holds the id of the set of pointers that could be dependent because of a
     /// shared underlying object.
     SmallVector<unsigned, 2> DependencySetId;
+    /// Holds the id of the disjoint alias set to which this pointer belongs.
+    SmallVector<unsigned, 2> AliasSetId;
   };
 
   /// A struct for saving information about induction variables.
@@ -746,7 +790,7 @@
   /// Return true if all of the instructions in the block can be speculatively
   /// executed. \p SafePtrs is a list of addresses that are known to be legal
   /// and we know that we can read from them without segfault.
-  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSet<Value *, 8>& SafePtrs);
+  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
 
   /// Returns True, if 'Phi' is the kind of reduction variable for type
   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
@@ -792,6 +836,8 @@
   DominatorTree *DT;
   /// Target Library Info.
   TargetLibraryInfo *TLI;
+  /// Alias analysis.
+  AliasAnalysis *AA;
   /// Parent function
   Function *TheFunction;
 
@@ -839,8 +885,13 @@
   LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                              LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
-                             const DataLayout *DL, const TargetLibraryInfo *TLI)
-      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
+                             const DataLayout *DL, const TargetLibraryInfo *TLI,
+                             AssumptionTracker *AT, const Function *F,
+                             const LoopVectorizeHints *Hints)
+      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
+        TheFunction(F), Hints(Hints) {
+    CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+  }
 
   /// Information about vectorization costs
   struct VectorizationFactor {
@@ -851,9 +902,7 @@
   /// This method checks every power of two up to VF. If UserVF is not ZERO
   /// then this vectorization factor will be selected if vectorization is
   /// possible.
-  VectorizationFactor selectVectorizationFactor(bool OptForSize,
-                                                unsigned UserVF,
-                                                bool ForceVectorization);
+  VectorizationFactor selectVectorizationFactor(bool OptForSize);
 
   /// \return The size (in bits) of the widest type in the code that
   /// needs to be vectorized. We ignore values that remain scalar such as
@@ -865,8 +914,7 @@
   /// based on register pressure and other parameters.
   /// VF and LoopCost are the selected vectorization factor and the cost of the
   /// selected VF.
-  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
-                              unsigned LoopCost);
+  unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);
 
   /// \brief A struct that represents some properties of the register usage
   /// of a loop.
@@ -902,6 +950,19 @@
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
 
+  /// Report an analysis message to assist the user in diagnosing loops that are
+  /// not vectorized.
+  void emitAnalysis(Report &Message) {
+    DebugLoc DL = TheLoop->getStartLoc();
+    if (Instruction *I = Message.getInstr())
+      DL = I->getDebugLoc();
+    emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
+                                   *TheFunction, DL, Message.str());
+  }
+
+  /// Values used only by @llvm.assume calls.
+  SmallPtrSet<const Value *, 32> EphValues;
+
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
@@ -916,11 +977,59 @@
   const DataLayout *DL;
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
+  const Function *TheFunction;
+  // Loop Vectorize Hint.
+  const LoopVectorizeHints *Hints;
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
 /// of loop metadata.
+/// This class keeps a number of loop annotations locally (as member variables)
+/// and can, upon request, write them back as metadata on the loop. It will
+/// initially scan the loop for existing metadata, and will update the local
+/// values based on information in the loop.
+/// We cannot write all values to metadata, as the mere presence of some info,
+/// for example 'force', means a decision has been made. So, we need to be
+/// careful NOT to add them if the user hasn't specifically asked so.
 class LoopVectorizeHints {
+  enum HintKind {
+    HK_WIDTH,
+    HK_UNROLL,
+    HK_FORCE
+  };
+
+  /// Hint - associates name and validation with the hint value.
+  struct Hint {
+    const char * Name;
+    unsigned Value; // This may have to change for non-numeric values.
+    HintKind Kind;
+
+    Hint(const char * Name, unsigned Value, HintKind Kind)
+      : Name(Name), Value(Value), Kind(Kind) { }
+
+    bool validate(unsigned Val) {
+      switch (Kind) {
+      case HK_WIDTH:
+        return isPowerOf2_32(Val) && Val <= MaxVectorWidth;
+      case HK_UNROLL:
+        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+      case HK_FORCE:
+        return (Val <= 1);
+      }
+      return false;
+    }
+  };
+
+  /// Vectorization width.
+  Hint Width;
+  /// Vectorization interleave factor.
+  Hint Interleave;
+  /// Vectorization forced
+  Hint Force;
+
+  /// Return the loop metadata prefix.
+  static StringRef Prefix() { return "llvm.loop."; }
+
 public:
   enum ForceKind {
     FK_Undefined = -1, ///< Not selected.
@@ -928,88 +1037,57 @@
     FK_Enabled = 1,    ///< Forcing enabled.
   };
 
-  LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
-      : Width(VectorizationFactor),
-        Unroll(DisableUnrolling),
-        Force(FK_Undefined),
-        LoopID(L->getLoopID()) {
-    getHints(L);
-    // force-vector-unroll overrides DisableUnrolling.
-    if (VectorizationUnroll.getNumOccurrences() > 0)
-      Unroll = VectorizationUnroll;
+  LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
+      : Width("vectorize.width", VectorizationFactor, HK_WIDTH),
+        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
+        Force("vectorize.enable", FK_Undefined, HK_FORCE),
+        TheLoop(L) {
+    // Populate values with existing loop metadata.
+    getHintsFromMetadata();
 
-    DEBUG(if (DisableUnrolling && Unroll == 1) dbgs()
-          << "LV: Unrolling disabled by the pass manager\n");
-  }
+    // force-vector-interleave overrides DisableInterleaving.
+    if (VectorizationInterleave.getNumOccurrences() > 0)
+      Interleave.Value = VectorizationInterleave;
 
-  /// Return the loop vectorizer metadata prefix.
-  static StringRef Prefix() { return "llvm.loop.vectorize."; }
-
-  MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const {
-    SmallVector<Value*, 2> Vals;
-    Vals.push_back(MDString::get(Context, Name));
-    Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V));
-    return MDNode::get(Context, Vals);
+    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+          << "LV: Interleaving disabled by the pass manager\n");
   }
 
   /// Mark the loop L as already vectorized by setting the width to 1.
-  void setAlreadyVectorized(Loop *L) {
-    LLVMContext &Context = L->getHeader()->getContext();
-
-    Width = 1;
-
-    // Create a new loop id with one more operand for the already_vectorized
-    // hint. If the loop already has a loop id then copy the existing operands.
-    SmallVector<Value*, 4> Vals(1);
-    if (LoopID)
-      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i)
-        Vals.push_back(LoopID->getOperand(i));
-
-    Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
-    Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1));
-
-    MDNode *NewLoopID = MDNode::get(Context, Vals);
-    // Set operand 0 to refer to the loop id itself.
-    NewLoopID->replaceOperandWith(0, NewLoopID);
-
-    L->setLoopID(NewLoopID);
-    if (LoopID)
-      LoopID->replaceAllUsesWith(NewLoopID);
-
-    LoopID = NewLoopID;
+  void setAlreadyVectorized() {
+    Width.Value = Interleave.Value = 1;
+    Hint Hints[] = {Width, Interleave};
+    writeHintsToMetadata(Hints);
   }
 
+  /// Dumps all the hint information.
   std::string emitRemark() const {
     Report R;
-    R << "vectorization ";
-    switch (Force) {
-    case LoopVectorizeHints::FK_Disabled:
-      R << "is explicitly disabled";
-      break;
-    case LoopVectorizeHints::FK_Enabled:
-      R << "is explicitly enabled";
-      if (Width != 0 && Unroll != 0)
-        R << " with width " << Width << " and interleave count " << Unroll;
-      else if (Width != 0)
-        R << " with width " << Width;
-      else if (Unroll != 0)
-        R << " with interleave count " << Unroll;
-      break;
-    case LoopVectorizeHints::FK_Undefined:
-      R << "was not specified";
-      break;
+    if (Force.Value == LoopVectorizeHints::FK_Disabled)
+      R << "vectorization is explicitly disabled";
+    else {
+      R << "use -Rpass-analysis=loop-vectorize for more info";
+      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+        R << " (Force=true";
+        if (Width.Value != 0)
+          R << ", Vector Width=" << Width.Value;
+        if (Interleave.Value != 0)
+          R << ", Interleave Count=" << Interleave.Value;
+        R << ")";
+      }
     }
+
     return R.str();
   }
 
-  unsigned getWidth() const { return Width; }
-  unsigned getUnroll() const { return Unroll; }
-  enum ForceKind getForce() const { return Force; }
-  MDNode *getLoopID() const { return LoopID; }
+  unsigned getWidth() const { return Width.Value; }
+  unsigned getInterleave() const { return Interleave.Value; }
+  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
 
 private:
-  /// Find hints specified in the loop metadata.
-  void getHints(const Loop *L) {
+  /// Find hints specified in the loop metadata and update local values.
+  void getHintsFromMetadata() {
+    MDNode *LoopID = TheLoop->getLoopID();
     if (!LoopID)
       return;
 
@@ -1037,55 +1115,111 @@
       if (!S)
         continue;
 
-      // Check if the hint starts with the vectorizer prefix.
-      StringRef Hint = S->getString();
-      if (!Hint.startswith(Prefix()))
-        continue;
-      // Remove the prefix.
-      Hint = Hint.substr(Prefix().size(), StringRef::npos);
-
+      // Fetch the hint name; setHint() checks for the loop metadata prefix.
+      StringRef Name = S->getString();
       if (Args.size() == 1)
-        getHint(Hint, Args[0]);
+        setHint(Name, Args[0]);
     }
   }
 
-  // Check string hint with one operand.
-  void getHint(StringRef Hint, Value *Arg) {
+  /// Checks string hint with one operand and set value if valid.
+  void setHint(StringRef Name, Value *Arg) {
+    if (!Name.startswith(Prefix()))
+      return;
+    Name = Name.substr(Prefix().size(), StringRef::npos);
+
     const ConstantInt *C = dyn_cast<ConstantInt>(Arg);
     if (!C) return;
     unsigned Val = C->getZExtValue();
 
-    if (Hint == "width") {
-      if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
-        Width = Val;
-      else
-        DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
-    } else if (Hint == "unroll") {
-      if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
-        Unroll = Val;
-      else
-        DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
-    } else if (Hint == "enable") {
-      if (C->getBitWidth() == 1)
-        Force = Val == 1 ? LoopVectorizeHints::FK_Enabled
-                         : LoopVectorizeHints::FK_Disabled;
-      else
-        DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n");
-    } else {
-      DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
+    Hint *Hints[] = {&Width, &Interleave, &Force};
+    for (auto H : Hints) {
+      if (Name == H->Name) {
+        if (H->validate(Val))
+          H->Value = Val;
+        else
+          DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+        break;
+      }
     }
   }
 
-  /// Vectorization width.
-  unsigned Width;
-  /// Vectorization unroll factor.
-  unsigned Unroll;
-  /// Vectorization forced
-  enum ForceKind Force;
+  /// Create a new hint from name / value pair.
+  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
+    LLVMContext &Context = TheLoop->getHeader()->getContext();
+    Value *Vals[] = {MDString::get(Context, Name),
+                     ConstantInt::get(Type::getInt32Ty(Context), V)};
+    return MDNode::get(Context, Vals);
+  }
 
-  MDNode *LoopID;
+  /// Matches metadata with hint name.
+  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
+    MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
+    if (!Name)
+      return false;
+
+    for (auto H : HintTypes)
+      if (Name->getString().endswith(H.Name))
+        return true;
+    return false;
+  }
+
+  /// Sets current hints into loop metadata, keeping other values intact.
+  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+    if (HintTypes.size() == 0)
+      return;
+
+    // Reserve the first element to LoopID (see below).
+    SmallVector<Value*, 4> Vals(1);
+    // Keep existing loop metadata operands, except the hints being rewritten.
+    MDNode *LoopID = TheLoop->getLoopID();
+    if (LoopID) {
+      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+        MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+        // If node in update list, ignore old value.
+        if (!matchesHintMetadataName(Node, HintTypes))
+          Vals.push_back(Node);
+      }
+    }
+
+    // Now, add the missing hints.
+    for (auto H : HintTypes)
+      Vals.push_back(
+          createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+    // Replace current metadata node with new one.
+    LLVMContext &Context = TheLoop->getHeader()->getContext();
+    MDNode *NewLoopID = MDNode::get(Context, Vals);
+    // Set operand 0 to refer to the loop id itself.
+    NewLoopID->replaceOperandWith(0, NewLoopID);
+
+    TheLoop->setLoopID(NewLoopID);
+    if (LoopID)
+      LoopID->replaceAllUsesWith(NewLoopID);
+    LoopID = NewLoopID;
+  }
+
+  /// The loop these hints belong to.
+  const Loop *TheLoop;
 };
 
+static void emitMissedWarning(Function *F, Loop *L,
+                              const LoopVectorizeHints &LH) {
+  emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
+                               L->getStartLoc(), LH.emitRemark());
+
+  if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
+    if (LH.getWidth() != 1)
+      emitLoopVectorizeWarning(
+          F->getContext(), *F, L->getStartLoc(),
+          "failed explicitly specified loop vectorization");
+    else if (LH.getInterleave() != 1)
+      emitLoopInterleaveWarning(
+          F->getContext(), *F, L->getStartLoc(),
+          "failed explicitly specified loop interleaving");
+  }
+}
+
 static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
   if (L.empty())
     return V.push_back(&L);
@@ -1113,6 +1247,8 @@
   DominatorTree *DT;
   BlockFrequencyInfo *BFI;
   TargetLibraryInfo *TLI;
+  AliasAnalysis *AA;
+  AssumptionTracker *AT;
   bool DisableUnrolling;
   bool AlwaysVectorize;
 
@@ -1127,6 +1263,8 @@
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     BFI = &getAnalysis<BlockFrequencyInfo>();
     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+    AA = &getAnalysis<AliasAnalysis>();
+    AT = &getAnalysis<AssumptionTracker>();
 
     // Compute some weights outside of the loop over the loops. Compute this
     // using a BranchProbability to re-use its scaling math.
@@ -1183,7 +1321,7 @@
                          : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                                 ? "enabled"
                                 : "?")) << " width=" << Hints.getWidth()
-                 << " unroll=" << Hints.getUnroll() << "\n");
+                 << " unroll=" << Hints.getInterleave() << "\n");
 
     // Function containing loop
     Function *F = L->getHeader()->getParent();
@@ -1210,7 +1348,7 @@
       return false;
     }
 
-    if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) {
+    if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
       emitOptimizationRemarkAnalysis(
           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
@@ -1221,8 +1359,7 @@
 
     // Check the loop for a trip count threshold:
     // do not vectorize loops with a tiny trip count.
-    BasicBlock *Latch = L->getLoopLatch();
-    const unsigned TC = SE->getSmallConstantTripCount(L, Latch);
+    const unsigned TC = SE->getSmallConstantTripCount(L);
     if (TC > 0u && TC < TinyTripCountVectorThreshold) {
       DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                    << "This loop is not worth vectorizing.");
@@ -1238,16 +1375,16 @@
     }
 
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, F);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
-      emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
-                                   L->getStartLoc(), Hints.emitRemark());
+      emitMissedWarning(F, L, Hints);
       return false;
     }
 
     // Use the cost model.
-    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AT, F,
+                                  &Hints);
 
     // Check the function attributes to find out if this function should be
     // optimized for size.
@@ -1276,20 +1413,17 @@
       emitOptimizationRemarkAnalysis(
           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
           "loop not vectorized due to NoImplicitFloat attribute");
-      emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
-                                   L->getStartLoc(), Hints.emitRemark());
+      emitMissedWarning(F, L, Hints);
       return false;
     }
 
     // Select the optimal vectorization factor.
     const LoopVectorizationCostModel::VectorizationFactor VF =
-        CM.selectVectorizationFactor(OptForSize, Hints.getWidth(),
-                                     Hints.getForce() ==
-                                         LoopVectorizeHints::FK_Enabled);
+        CM.selectVectorizationFactor(OptForSize);
 
     // Select the unroll factor.
     const unsigned UF =
-        CM.selectUnrollFactor(OptForSize, Hints.getUnroll(), VF.Width, VF.Cost);
+        CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
 
     DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
                  << DebugLocStr << '\n');
@@ -1330,13 +1464,14 @@
     }
 
     // Mark the loop as already vectorized to avoid vectorizing again.
-    Hints.setAlreadyVectorized(L);
+    Hints.setAlreadyVectorized();
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
     return true;
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionTracker>();
     AU.addRequiredID(LoopSimplifyID);
     AU.addRequiredID(LCSSAID);
     AU.addRequired<BlockFrequencyInfo>();
@@ -1344,8 +1479,10 @@
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
     AU.addRequired<TargetTransformInfo>();
+    AU.addRequired<AliasAnalysis>();
     AU.addPreserved<LoopInfo>();
     AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<AliasAnalysis>();
   }
 
 };
@@ -1401,7 +1538,7 @@
 
 void LoopVectorizationLegality::RuntimePointerCheck::insert(
     ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
-    ValueToValueMap &Strides) {
+    unsigned ASId, ValueToValueMap &Strides) {
   // Get the stride replaced scev.
   const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
@@ -1413,6 +1550,7 @@
   Ends.push_back(ScEnd);
   IsWritePtr.push_back(WritePtr);
   DependencySetId.push_back(DepSetId);
+  AliasSetId.push_back(ASId);
 }
 
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -1719,7 +1857,9 @@
 
       Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                             DataTy->getPointerTo(AddressSpace));
-      Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
+      StoreInst *NewSI =
+        Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
+      propagateMetadata(NewSI, SI);
     }
     return;
   }
@@ -1740,9 +1880,9 @@
 
     Value *VecPtr = Builder.CreateBitCast(PartPtr,
                                           DataTy->getPointerTo(AddressSpace));
-    Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
-    cast<LoadInst>(LI)->setAlignment(Alignment);
-    Entry[Part] = Reverse ? reverseVector(LI) :  LI;
+    LoadInst *NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
+    propagateMetadata(NewLI, LI);
+    Entry[Part] = Reverse ? reverseVector(NewLI) :  NewLI;
   }
 }
 
@@ -1956,6 +2096,9 @@
       // Only need to check pointers between two different dependency sets.
       if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
        continue;
+      // Only need to check pointers in the same alias set.
+      if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j])
+        continue;
 
       unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
       unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
@@ -2424,7 +2567,7 @@
   LoopScalarBody = OldBasicBlock;
 
   LoopVectorizeHints Hints(Lp, true);
-  Hints.setAlreadyVectorized(Lp);
+  Hints.setAlreadyVectorized();
 }
 
 /// This function returns the identity element (or neutral element) for
@@ -2838,7 +2981,7 @@
       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                             LoopMiddleBlock);
   }
-} 
+}
 
 InnerLoopVectorizer::VectorParts
 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
@@ -3105,21 +3248,13 @@
       for (unsigned Part = 0; Part < UF; ++Part) {
         Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
 
-        // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
-        BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
-        if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
-          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
-          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
-        }
-        if (VecOp && isa<PossiblyExactOperator>(VecOp))
-          VecOp->setIsExact(BinOp->isExact());
-
-        // Copy the fast-math flags.
-        if (VecOp && isa<FPMathOperator>(V))
-          VecOp->setFastMathFlags(it->getFastMathFlags());
+        if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+          VecOp->copyIRFlags(BinOp);
 
         Entry[Part] = V;
       }
+
+      propagateMetadata(Entry, it);
       break;
     }
     case Instruction::Select: {
@@ -3147,6 +3282,8 @@
           Op0[Part],
           Op1[Part]);
       }
+
+      propagateMetadata(Entry, it);
       break;
     }
 
@@ -3166,6 +3303,8 @@
           C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
         Entry[Part] = C;
       }
+
+      propagateMetadata(Entry, it);
       break;
     }
 
@@ -3198,6 +3337,7 @@
         Value *Broadcasted = getBroadcastInstrs(ScalarCast);
         for (unsigned Part = 0; Part < UF; ++Part)
           Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
+        propagateMetadata(Entry, it);
         break;
       }
       /// Vectorize casts.
@@ -3207,6 +3347,7 @@
       VectorParts &A = getVectorValue(it->getOperand(0));
       for (unsigned Part = 0; Part < UF; ++Part)
         Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
+      propagateMetadata(Entry, it);
       break;
     }
 
@@ -3221,6 +3362,7 @@
       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
       assert(ID && "Not an intrinsic call!");
       switch (ID) {
+      case Intrinsic::assume:
       case Intrinsic::lifetime_end:
       case Intrinsic::lifetime_start:
         scalarizeInstruction(it);
@@ -3244,6 +3386,8 @@
           Function *F = Intrinsic::getDeclaration(M, ID, Tys);
           Entry[Part] = Builder.CreateCall(F, Args);
         }
+
+        propagateMetadata(Entry, it);
         break;
       }
       break;
@@ -3284,7 +3428,7 @@
   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
-  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
 
   DEBUG(DT->verifyDomTree());
 }
@@ -3460,7 +3604,7 @@
 /// \brief Check that the instruction has outside loop users and is not an
 /// identified reduction variable.
 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
-                               SmallPtrSet<Value *, 4> &Reductions) {
+                               SmallPtrSetImpl<Value *> &Reductions) {
   // Reduction instructions are allowed to have exit users. All other
   // instructions must not have external users.
   if (!Reductions.count(Inst))
@@ -3515,8 +3659,8 @@
           // identified reduction value with an outside user.
           if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
             continue;
-          emitAnalysis(Report(it) << "value that could not be identified as "
-                                     "reduction is used outside the loop");
+          emitAnalysis(Report(it) << "value could not be identified as "
+                                     "an induction or reduction variable");
           return false;
         }
 
@@ -3601,7 +3745,8 @@
           continue;
         }
 
-        emitAnalysis(Report(it) << "unvectorizable operation");
+        emitAnalysis(Report(it) << "value that could not be identified as "
+                                   "reduction is used outside the loop");
         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
         return false;
       }// end of PHI handling
@@ -3858,19 +4003,22 @@
   /// \brief Set of potential dependent memory accesses.
   typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
 
-  AccessAnalysis(const DataLayout *Dl, DepCandidates &DA) :
-    DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
-    AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
+  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
+    DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
 
   /// \brief Register a load  and whether it is only read from.
-  void addLoad(Value *Ptr, bool IsReadOnly) {
+  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
+    Value *Ptr = const_cast<Value*>(Loc.Ptr);
+    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
     Accesses.insert(MemAccessInfo(Ptr, false));
     if (IsReadOnly)
       ReadOnlyPtr.insert(Ptr);
   }
 
   /// \brief Register a store.
-  void addStore(Value *Ptr) {
+  void addStore(AliasAnalysis::Location &Loc) {
+    Value *Ptr = const_cast<Value*>(Loc.Ptr);
+    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
     Accesses.insert(MemAccessInfo(Ptr, true));
   }
 
@@ -3884,10 +4032,7 @@
   /// \brief Goes over all memory accesses, checks whether a RT check is needed
   /// and builds sets of dependent accesses.
   void buildDependenceSets() {
-    // Process read-write pointers first.
-    processMemAccesses(false);
-    // Next, process read pointers.
-    processMemAccesses(true);
+    processMemAccesses();
   }
 
   bool isRTCheckNeeded() { return IsRTCheckNeeded; }
@@ -3899,40 +4044,31 @@
 
 private:
   typedef SetVector<MemAccessInfo> PtrAccessSet;
-  typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
 
-  /// \brief Go over all memory access or only the deferred ones if
-  /// \p UseDeferred is true and check whether runtime pointer checks are needed
-  /// and build sets of dependency check candidates.
-  void processMemAccesses(bool UseDeferred);
+  /// \brief Go over all memory accesses and check whether runtime pointer
+  /// checks are needed, and build sets of dependency check candidates.
+  void processMemAccesses();
 
   /// Set of all accesses.
   PtrAccessSet Accesses;
 
-  /// Set of access to check after all writes have been processed.
-  PtrAccessSet DeferredAccesses;
-
-  /// Map of pointers to last access encountered.
-  UnderlyingObjToAccessMap ObjToLastAccess;
-
   /// Set of accesses that need a further dependence check.
   MemAccessInfoSet CheckDeps;
 
   /// Set of pointers that are read only.
   SmallPtrSet<Value*, 16> ReadOnlyPtr;
 
-  /// Set of underlying objects already written to.
-  SmallPtrSet<Value*, 16> WriteObjects;
-
   const DataLayout *DL;
 
+  /// An alias set tracker to partition the access set by underlying object and
+  /// intrinsic property (such as TBAA metadata).
+  AliasSetTracker AST;
+
   /// Sets of potentially dependent accesses - members of one set share an
   /// underlying pointer. The set "CheckDeps" identfies which sets really need a
   /// dependence check.
   DepCandidates &DepCands;
 
-  bool AreAllWritesIdentified;
-  bool AreAllReadsIdentified;
   bool IsRTCheckNeeded;
 };
 
@@ -3960,62 +4096,67 @@
     ValueToValueMap &StridesMap, bool ShouldCheckStride) {
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
-  unsigned NumReadPtrChecks = 0;
-  unsigned NumWritePtrChecks = 0;
   bool CanDoRT = true;
 
   bool IsDepCheckNeeded = isDependencyCheckNeeded();
-  // We assign consecutive id to access from different dependence sets.
-  // Accesses within the same set don't need a runtime check.
-  unsigned RunningDepId = 1;
-  DenseMap<Value *, unsigned> DepSetId;
+  NumComparisons = 0;
 
-  for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end();
-       AI != AE; ++AI) {
-    const MemAccessInfo &Access = *AI;
-    Value *Ptr = Access.getPointer();
-    bool IsWrite = Access.getInt();
+  // We assign a consecutive id to the accesses of each alias set. Accesses in
+  // different alias sets don't need to be checked against each other.
+  unsigned ASId = 1;
+  for (auto &AS : AST) {
+    unsigned NumReadPtrChecks = 0;
+    unsigned NumWritePtrChecks = 0;
 
-    // Just add write checks if we have both.
-    if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true)))
-      continue;
+    // We assign consecutive id to access from different dependence sets.
+    // Accesses within the same set don't need a runtime check.
+    unsigned RunningDepId = 1;
+    DenseMap<Value *, unsigned> DepSetId;
 
-    if (IsWrite)
-      ++NumWritePtrChecks;
-    else
-      ++NumReadPtrChecks;
+    for (auto A : AS) {
+      Value *Ptr = A.getValue();
+      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
+      MemAccessInfo Access(Ptr, IsWrite);
 
-    if (hasComputableBounds(SE, StridesMap, Ptr) &&
-        // When we run after a failing dependency check we have to make sure we
-        // don't have wrapping pointers.
-        (!ShouldCheckStride ||
-         isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
-      // The id of the dependence set.
-      unsigned DepId;
+      if (IsWrite)
+        ++NumWritePtrChecks;
+      else
+        ++NumReadPtrChecks;
 
-      if (IsDepCheckNeeded) {
-        Value *Leader = DepCands.getLeaderValue(Access).getPointer();
-        unsigned &LeaderId = DepSetId[Leader];
-        if (!LeaderId)
-          LeaderId = RunningDepId++;
-        DepId = LeaderId;
-      } else
-        // Each access has its own dependence set.
-        DepId = RunningDepId++;
+      if (hasComputableBounds(SE, StridesMap, Ptr) &&
+          // When we run after a failing dependency check we have to make sure we
+          // don't have wrapping pointers.
+          (!ShouldCheckStride ||
+           isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
+        // The id of the dependence set.
+        unsigned DepId;
 
-      RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, StridesMap);
+        if (IsDepCheckNeeded) {
+          Value *Leader = DepCands.getLeaderValue(Access).getPointer();
+          unsigned &LeaderId = DepSetId[Leader];
+          if (!LeaderId)
+            LeaderId = RunningDepId++;
+          DepId = LeaderId;
+        } else
+          // Each access has its own dependence set.
+          DepId = RunningDepId++;
 
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
-    } else {
-      CanDoRT = false;
+        RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
+
+        DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
+      } else {
+        CanDoRT = false;
+      }
     }
-  }
 
-  if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
-    NumComparisons = 0; // Only one dependence set.
-  else {
-    NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
-                                           NumWritePtrChecks - 1));
+    if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
+      NumComparisons += 0; // Only one dependence set.
+    else {
+      NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
+                                              NumWritePtrChecks - 1));
+    }
+
+    ++ASId;
   }
 
   // If the pointers that we would use for the bounds comparison have different
@@ -4029,6 +4170,9 @@
       // Only need to check pointers between two different dependency sets.
       if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
        continue;
+      // Only need to check pointers in the same alias set.
+      if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
+        continue;
 
       Value *PtrI = RtCheck.Pointers[i];
       Value *PtrJ = RtCheck.Pointers[j];
@@ -4046,90 +4190,99 @@
   return CanDoRT;
 }
 
-static bool isFunctionScopeIdentifiedObject(Value *Ptr) {
-  return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr);
-}
-
-void AccessAnalysis::processMemAccesses(bool UseDeferred) {
+void AccessAnalysis::processMemAccesses() {
   // We process the set twice: first we process read-write pointers, last we
   // process read-only pointers. This allows us to skip dependence tests for
   // read-only pointers.
 
-  PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
-  for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) {
-    const MemAccessInfo &Access = *AI;
-    Value *Ptr = Access.getPointer();
-    bool IsWrite = Access.getInt();
+  DEBUG(dbgs() << "LV: Processing memory accesses...\n");
+  DEBUG(dbgs() << "  AST: "; AST.dump());
+  DEBUG(dbgs() << "LV:   Accesses:\n");
+  DEBUG({
+    for (auto A : Accesses)
+      dbgs() << "\t" << *A.getPointer() << " (" <<
+                (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
+                                         "read-only" : "read")) << ")\n";
+  });
 
-    DepCands.insert(Access);
+  // The AliasSetTracker has nicely partitioned our pointers by metadata
+  // compatibility and potential for underlying-object overlap. As a result, we
+  // only need to check for potential pointer dependencies within each alias
+  // set.
+  for (auto &AS : AST) {
+    // Note that both the alias-set tracker and the alias sets themselves use
+    // linked lists internally, so the iteration order here is deterministic
+    // (matching the original instruction order within each set).
 
-    // Memorize read-only pointers for later processing and skip them in the
-    // first round (they need to be checked after we have seen all write
-    // pointers). Note: we also mark pointer that are not consecutive as
-    // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the
-    // second check for "!IsWrite".
-    bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
-    if (!UseDeferred && IsReadOnlyPtr) {
-      DeferredAccesses.insert(Access);
-      continue;
-    }
+    bool SetHasWrite = false;
 
-    bool NeedDepCheck = false;
-    // Check whether there is the possibility of dependency because of
-    // underlying objects being the same.
-    typedef SmallVector<Value*, 16> ValueVector;
-    ValueVector TempObjects;
-    GetUnderlyingObjects(Ptr, TempObjects, DL);
-    for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end();
-         UI != UE; ++UI) {
-      Value *UnderlyingObj = *UI;
+    // Map of pointers to last access encountered.
+    typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
+    UnderlyingObjToAccessMap ObjToLastAccess;
 
-      // If this is a write then it needs to be an identified object.  If this a
-      // read and all writes (so far) are identified function scope objects we
-      // don't need an identified underlying object but only an Argument (the
-      // next write is going to invalidate this assumption if it is
-      // unidentified).
-      // This is a micro-optimization for the case where all writes are
-      // identified and we have one argument pointer.
-      // Otherwise, we do need a runtime check.
-      if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) ||
-          (!IsWrite && (!AreAllWritesIdentified ||
-                        !isa<Argument>(UnderlyingObj)) &&
-           !isIdentifiedObject(UnderlyingObj))) {
-        DEBUG(dbgs() << "LV: Found an unidentified " <<
-              (IsWrite ?  "write" : "read" ) << " ptr: " << *UnderlyingObj <<
-              "\n");
-        IsRTCheckNeeded = (IsRTCheckNeeded ||
-                           !isIdentifiedObject(UnderlyingObj) ||
-                           !AreAllReadsIdentified);
+    // Set of access to check after all writes have been processed.
+    PtrAccessSet DeferredAccesses;
+
+    // Iterate over each alias set twice, once to process read/write pointers,
+    // and then to process read-only pointers.
+    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
+      bool UseDeferred = SetIteration > 0;
+      PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
+
+      for (auto A : AS) {
+        Value *Ptr = A.getValue();
+        bool IsWrite = S.count(MemAccessInfo(Ptr, true));
+
+        // If we're using the deferred access set, then it contains only reads.
+        bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
+        if (UseDeferred && !IsReadOnlyPtr)
+          continue;
+        // Otherwise, the pointer must be in the PtrAccessSet, either as a read
+        // or a write.
+        assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
+                 S.count(MemAccessInfo(Ptr, false))) &&
+               "Alias-set pointer not in the access set?");
+
+        MemAccessInfo Access(Ptr, IsWrite);
+        DepCands.insert(Access);
+
+        // Memorize read-only pointers for later processing and skip them in the
+        // first round (they need to be checked after we have seen all write
+        // pointers). Note: we also mark pointer that are not consecutive as
+        // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need
+        // the second check for "!IsWrite".
+        if (!UseDeferred && IsReadOnlyPtr) {
+          DeferredAccesses.insert(Access);
+          continue;
+        }
+
+        // If this is a write - check other reads and writes for conflicts.  If
+        // this is a read only check other writes for conflicts (but only if
+        // there is no other write to the ptr - this is an optimization to
+        // catch "a[i] = a[i] + " without having to do a dependence check).
+        if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
+          CheckDeps.insert(Access);
+          IsRTCheckNeeded = true;
+        }
 
         if (IsWrite)
-          AreAllWritesIdentified = false;
-        if (!IsWrite)
-          AreAllReadsIdentified = false;
+          SetHasWrite = true;
+
+        // Create sets of pointers connected by a shared alias set and
+        // underlying object.
+        typedef SmallVector<Value *, 16> ValueVector;
+        ValueVector TempObjects;
+        GetUnderlyingObjects(Ptr, TempObjects, DL);
+        for (Value *UnderlyingObj : TempObjects) {
+          UnderlyingObjToAccessMap::iterator Prev =
+            ObjToLastAccess.find(UnderlyingObj);
+          if (Prev != ObjToLastAccess.end())
+            DepCands.unionSets(Access, Prev->second);
+
+          ObjToLastAccess[UnderlyingObj] = Access;
+        }
       }
-
-      // If this is a write - check other reads and writes for conflicts.  If
-      // this is a read only check other writes for conflicts (but only if there
-      // is no other write to the ptr - this is an optimization to catch "a[i] =
-      // a[i] + " without having to do a dependence check).
-      if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj))
-        NeedDepCheck = true;
-
-      if (IsWrite)
-        WriteObjects.insert(UnderlyingObj);
-
-      // Create sets of pointers connected by shared underlying objects.
-      UnderlyingObjToAccessMap::iterator Prev =
-        ObjToLastAccess.find(UnderlyingObj);
-      if (Prev != ObjToLastAccess.end())
-        DepCands.unionSets(Access, Prev->second);
-
-      ObjToLastAccess[UnderlyingObj] = Access;
     }
-
-    if (NeedDepCheck)
-      CheckDeps.insert(Access);
   }
 }
 
@@ -4389,6 +4542,11 @@
   if (!AIsWrite && !BIsWrite)
     return false;
 
+  // We cannot check pointers in different address spaces.
+  if (APtr->getType()->getPointerAddressSpace() !=
+      BPtr->getType()->getPointerAddressSpace())
+    return true;
+
   const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
   const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
 
@@ -4471,7 +4629,7 @@
 
   // Bail out early if passed-in parameters make vectorization not feasible.
   unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
-  unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
+  unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1;
 
   // The distance must be bigger than the size needed for a vectorized version
   // of the operation and the size of the vectorized operation must not be
@@ -4619,7 +4777,7 @@
   }
 
   AccessAnalysis::DepCandidates DependentAccesses;
-  AccessAnalysis Accesses(DL, DependentAccesses);
+  AccessAnalysis Accesses(DL, AA, DependentAccesses);
 
   // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
   // multiple times on the same object. If the ptr is accessed twice, once
@@ -4643,9 +4801,17 @@
 
     // If we did *not* see this pointer before, insert it to  the read-write
     // list. At this phase it is only a 'write' list.
-    if (Seen.insert(Ptr)) {
+    if (Seen.insert(Ptr).second) {
       ++NumReadWrites;
-      Accesses.addStore(Ptr);
+
+      AliasAnalysis::Location Loc = AA->getLocation(ST);
+      // The TBAA metadata could have a control dependency on the predication
+      // condition, so we cannot rely on it when determining whether or not we
+      // need runtime pointer checks.
+      if (blockNeedsPredication(ST->getParent()))
+        Loc.AATags.TBAA = nullptr;
+
+      Accesses.addStore(Loc);
     }
   }
 
@@ -4668,11 +4834,20 @@
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
     bool IsReadOnlyPtr = false;
-    if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
+    if (Seen.insert(Ptr).second ||
+        !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
       ++NumReads;
       IsReadOnlyPtr = true;
     }
-    Accesses.addLoad(Ptr, IsReadOnlyPtr);
+
+    AliasAnalysis::Location Loc = AA->getLocation(LD);
+    // The TBAA metadata could have a control dependency on the predication
+    // condition, so we cannot rely on it when determining whether or not we
+    // need runtime pointer checks.
+    if (blockNeedsPredication(LD->getParent()))
+      Loc.AATags.TBAA = nullptr;
+
+    Accesses.addLoad(Loc, IsReadOnlyPtr);
   }
 
   // If we write (or read-write) to a single destination and there are no
@@ -4773,7 +4948,7 @@
 }
 
 static bool hasMultipleUsesOf(Instruction *I,
-                              SmallPtrSet<Instruction *, 8> &Insts) {
+                              SmallPtrSetImpl<Instruction *> &Insts) {
   unsigned NumUses = 0;
   for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
     if (Insts.count(dyn_cast<Instruction>(*Use)))
@@ -4785,7 +4960,7 @@
   return false;
 }
 
-static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
+static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) {
   for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
     if (!Set.count(dyn_cast<Instruction>(*Use)))
       return false;
@@ -4923,7 +5098,7 @@
       // value must only be used once, except by phi nodes and min/max
       // reductions which are represented as a cmp followed by a select.
       ReductionInstDesc IgnoredVal(false, nullptr);
-      if (VisitedInsts.insert(UI)) {
+      if (VisitedInsts.insert(UI).second) {
         if (isa<PHINode>(UI))
           PHIs.push_back(UI);
         else
@@ -5025,7 +5200,7 @@
                                             ReductionKind Kind,
                                             ReductionInstDesc &Prev) {
   bool FP = I->getType()->isFloatingPointTy();
-  bool FastMath = (FP && I->isCommutative() && I->isAssociative());
+  bool FastMath = FP && I->hasUnsafeAlgebra();
   switch (I->getOpcode()) {
   default:
     return ReductionInstDesc(false, I);
@@ -5047,6 +5222,7 @@
     return ReductionInstDesc(Kind == RK_IntegerXor, I);
   case Instruction::FMul:
     return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
+  case Instruction::FSub:
   case Instruction::FAdd:
     return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
   case Instruction::FCmp:
@@ -5090,7 +5266,13 @@
     return IK_NoInduction;
 
   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
-  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+  Type *PointerElementType = PhiTy->getPointerElementType();
+  // The pointer stride cannot be determined if the pointer element type is not
+  // sized.
+  if (!PointerElementType->isSized())
+    return IK_NoInduction;
+
+  uint64_t Size = DL->getTypeAllocSize(PointerElementType);
   if (C->getValue()->equalsInt(Size))
     return IK_PtrInduction;
   else if (C->getValue()->equalsInt(0 - Size))
@@ -5117,7 +5299,7 @@
 }
 
 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
-                                            SmallPtrSet<Value *, 8>& SafePtrs) {
+                                           SmallPtrSetImpl<Value *> &SafePtrs) {
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     // We might be able to hoist the load.
     if (it->mayReadFromMemory()) {
@@ -5162,23 +5344,23 @@
 }
 
 LoopVectorizationCostModel::VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
-                                                      unsigned UserVF,
-                                                      bool ForceVectorization) {
+LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
   // Width 1 means no vectorize
   VectorizationFactor Factor = { 1U, 0U };
   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
+    emitAnalysis(Report() << "runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os");
     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
     return Factor;
   }
 
   if (!EnableCondStoresVectorization && Legal->NumPredStores) {
+    emitAnalysis(Report() << "store that is conditionally executed prevents vectorization");
     DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
     return Factor;
   }
 
   // Find the trip count.
-  unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
   unsigned WidestType = getWidestType();
@@ -5207,6 +5389,7 @@
   if (OptForSize) {
     // If we are unable to calculate the trip count then don't try to vectorize.
     if (TC < 2) {
+      emitAnalysis(Report() << "unable to calculate the loop count due to complex control flow");
       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
       return Factor;
     }
@@ -5220,11 +5403,16 @@
     // If the trip count that we found modulo the vectorization factor is not
     // zero then we require a tail.
     if (VF < 2) {
+      emitAnalysis(Report() << "cannot optimize for size and vectorize at the "
+                               "same time. Enable vectorization of this loop "
+                               "with '#pragma clang loop vectorize(enable)' "
+                               "when compiling with -Os");
       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
       return Factor;
     }
   }
 
+  int UserVF = Hints->getWidth();
   if (UserVF != 0) {
     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
@@ -5240,6 +5428,7 @@
   unsigned Width = 1;
   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
 
+  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
   // Ignore scalar width, because the user explicitly wants vectorization.
   if (ForceVectorization && VF > 1) {
     Width = 2;
@@ -5280,6 +5469,10 @@
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
       Type *T = it->getType();
 
+      // Ignore ephemeral values.
+      if (EphValues.count(it))
+        continue;
+
       // Only examine Loads, Stores and PHINodes.
       if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
         continue;
@@ -5309,29 +5502,29 @@
 
 unsigned
 LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
-                                               unsigned UserUF,
                                                unsigned VF,
                                                unsigned LoopCost) {
 
   // -- The unroll heuristics --
   // We unroll the loop in order to expose ILP and reduce the loop overhead.
   // There are many micro-architectural considerations that we can't predict
-  // at this level. For example frontend pressure (on decode or fetch) due to
+  // at this level. For example, frontend pressure (on decode or fetch) due to
   // code size, or the number and capabilities of the execution ports.
   //
   // We use the following heuristics to select the unroll factor:
-  // 1. If the code has reductions the we unroll in order to break the cross
+  // 1. If the code has reductions, then we unroll in order to break the cross
   // iteration dependency.
-  // 2. If the loop is really small then we unroll in order to reduce the loop
+  // 2. If the loop is really small, then we unroll in order to reduce the loop
   // overhead.
   // 3. We don't unroll if we think that we will spill registers to memory due
   // to the increased register pressure.
 
   // Use the user preference, unless 'auto' is selected.
+  int UserUF = Hints->getInterleave();
   if (UserUF != 0)
     return UserUF;
 
-  // When we optimize for size we don't unroll.
+  // When we optimize for size, we don't unroll.
   if (OptForSize)
     return 1;
 
@@ -5340,8 +5533,7 @@
     return 1;
 
   // Do not unroll loops with a relatively small trip count.
-  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
-                                              TheLoop->getLoopLatch());
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
   if (TC > 1 && TC < TinyTripCountUnrollThreshold)
     return 1;
 
@@ -5380,15 +5572,15 @@
                        std::max(1U, (R.MaxLocalUsers - 1)));
 
   // Clamp the unroll factor ranges to reasonable factors.
-  unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
+  unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor();
 
   // Check if the user has overridden the unroll max.
   if (VF == 1) {
-    if (ForceTargetMaxScalarUnrollFactor.getNumOccurrences() > 0)
-      MaxUnrollSize = ForceTargetMaxScalarUnrollFactor;
+    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
+      MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
   } else {
-    if (ForceTargetMaxVectorUnrollFactor.getNumOccurrences() > 0)
-      MaxUnrollSize = ForceTargetMaxVectorUnrollFactor;
+    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
+      MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
   }
 
   // If we did not calculate the cost for VF (because the user selected the VF)
@@ -5398,8 +5590,8 @@
 
   // Clamp the calculated UF to be between the 1 and the max unroll factor
   // that the target allows.
-  if (UF > MaxUnrollSize)
-    UF = MaxUnrollSize;
+  if (UF > MaxInterleaveSize)
+    UF = MaxInterleaveSize;
   else if (UF < 1)
     UF = 1;
 
@@ -5430,6 +5622,18 @@
     unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
     unsigned LoadsUF = UF /  (Legal->NumLoads ? Legal->NumLoads : 1);
 
+    // If we have a scalar reduction (vector reductions are already dealt with
+    // by this point), we can increase the critical path length if the loop
+    // we're unrolling is inside another loop. Limit, by default to 2, so the
+    // critical path only gets increased by one reduction operation.
+    if (Legal->getReductionVars()->size() &&
+        TheLoop->getLoopDepth() > 1) {
+      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
+      SmallUF = std::min(SmallUF, F);
+      StoresUF = std::min(StoresUF, F);
+      LoadsUF = std::min(LoadsUF, F);
+    }
+
     if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
       DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
       return std::max(StoresUF, LoadsUF);
@@ -5531,6 +5735,10 @@
     // Ignore instructions that are never used within the loop.
     if (!Ends.count(I)) continue;
 
+    // Ignore ephemeral values.
+    if (EphValues.count(I))
+      continue;
+
     // Remove all of the instructions that end at this location.
     InstrList &List = TransposeEnds[i];
     for (unsigned int j=0, e = List.size(); j < e; ++j)
@@ -5571,6 +5779,10 @@
       if (isa<DbgInfoIntrinsic>(it))
         continue;
 
+      // Ignore ephemeral values.
+      if (EphValues.count(it))
+        continue;
+
       unsigned C = getInstructionCost(it, VF);
 
       // Check if we should override the cost.
@@ -5704,18 +5916,31 @@
       TargetTransformInfo::OK_AnyValue;
     TargetTransformInfo::OperandValueKind Op2VK =
       TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueProperties Op1VP =
+        TargetTransformInfo::OP_None;
+    TargetTransformInfo::OperandValueProperties Op2VP =
+        TargetTransformInfo::OP_None;
     Value *Op2 = I->getOperand(1);
 
     // Check for a splat of a constant or for a non uniform vector of constants.
-    if (isa<ConstantInt>(Op2))
+    if (isa<ConstantInt>(Op2)) {
+      ConstantInt *CInt = cast<ConstantInt>(Op2);
+      if (CInt && CInt->getValue().isPowerOf2())
+        Op2VP = TargetTransformInfo::OP_PowerOf2;
       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
-    else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
+    } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
       Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-      if (cast<Constant>(Op2)->getSplatValue() != nullptr)
+      Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
+      if (SplatValue) {
+        ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
+        if (CInt && CInt->getValue().isPowerOf2())
+          Op2VP = TargetTransformInfo::OP_PowerOf2;
         Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+      }
     }
 
-    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
+    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
+                                      Op1VP, Op2VP);
   }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
@@ -5857,6 +6082,8 @@
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 53a43d9..44bfea1 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp

@@ -19,7 +19,10 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -42,12 +45,15 @@
 #include "llvm/Transforms/Utils/VectorUtils.h"
 #include <algorithm>
 #include <map>
+#include <memory>
 
 using namespace llvm;
 
 #define SV_NAME "slp-vectorizer"
 #define DEBUG_TYPE "SLP"
 
+STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+
 static cl::opt<int>
     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                      cl::desc("Only vectorize if you gain more than this "
@@ -68,53 +74,6 @@
 
 static const unsigned RecursionMaxDepth = 12;
 
-/// A helper class for numbering instructions in multiple blocks.
-/// Numbers start at zero for each basic block.
-struct BlockNumbering {
-
-  BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
-
-  void numberInstructions() {
-    unsigned Loc = 0;
-    InstrIdx.clear();
-    InstrVec.clear();
-    // Number the instructions in the block.
-    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-      InstrIdx[it] = Loc++;
-      InstrVec.push_back(it);
-      assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
-    }
-    Valid = true;
-  }
-
-  int getIndex(Instruction *I) {
-    assert(I->getParent() == BB && "Invalid instruction");
-    if (!Valid)
-      numberInstructions();
-    assert(InstrIdx.count(I) && "Unknown instruction");
-    return InstrIdx[I];
-  }
-
-  Instruction *getInstruction(unsigned loc) {
-    if (!Valid)
-      numberInstructions();
-    assert(InstrVec.size() > loc && "Invalid Index");
-    return InstrVec[loc];
-  }
-
-  void forget() { Valid = false; }
-
-private:
-  /// The block we are numbering.
-  BasicBlock *BB;
-  /// Is the block numbered.
-  bool Valid;
-  /// Maps instructions to numbers and back.
-  SmallDenseMap<Instruction *, int> InstrIdx;
-  /// Maps integers to Instructions.
-  SmallVector<Instruction *, 32> InstrVec;
-};
-
 /// \returns the parent basic block if all of the instructions in \p VL
 /// are in the same block or null otherwise.
 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
@@ -209,6 +168,23 @@
   return Opcode;
 }
 
+/// Set the IR flags of the vector operation \p I to the intersection (logical
+/// and) of the flags of the scalar operations in \p VL that it replaces.
+/// Flag set: NSW, NUW, exact, and all of fast-math. Does nothing unless both
+/// \p I and VL[0] are binary operators. The scalars in \p VL are not modified.
+static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+  auto *VecOp = dyn_cast<BinaryOperator>(I);
+  auto *First = VecOp ? dyn_cast<BinaryOperator>(VL[0]) : nullptr;
+  if (!First)
+    return;
+  // Seed the vector op with the 0th scalar's flags, then narrow the vector
+  // op itself with the remaining scalars so that no scalar is modified.
+  VecOp->copyIRFlags(First);
+  for (int i = 1, e = VL.size(); i < e; ++i)
+    if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
+      VecOp->andIRFlags(Scalar);
+}
+  
 /// \returns \p I after propagating metadata from \p VL.
 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
   Instruction *I0 = cast<Instruction>(VL[0]);
@@ -230,6 +206,10 @@
       case LLVMContext::MD_tbaa:
         MD = MDNode::getMostGenericTBAA(MD, IMD);
         break;
+      case LLVMContext::MD_alias_scope:
+      case LLVMContext::MD_noalias:
+        MD = MDNode::intersect(MD, IMD);
+        break;
       case LLVMContext::MD_fpmath:
         MD = MDNode::getMostGenericFPMath(MD, IMD);
         break;
@@ -381,6 +361,33 @@
   }
 }
 
+/// \returns true if an in-tree user also needs \p Scalar to be extracted, i.e.
+/// if \p Scalar is an operand that stays scalar in the vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+                                    TargetLibraryInfo *TLI) {
+
+  unsigned Opcode = UserInst->getOpcode();
+  switch (Opcode) {
+  case Instruction::Load: {
+    LoadInst *LI = cast<LoadInst>(UserInst);
+    return (LI->getPointerOperand() == Scalar);
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(UserInst);
+    return (SI->getPointerOperand() == Scalar);
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(UserInst);
+    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+      return (CI->getArgOperand(1) == Scalar);
+    }
+  }
+  default:
+    return false;
+  }
+}
+
 /// Bottom Up SLP Vectorizer.
 class BoUpSLP {
 public:
@@ -391,14 +398,21 @@
 
   BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
           TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
-          LoopInfo *Li, DominatorTree *Dt)
-      : F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
-        Builder(Se->getContext()) {}
+          LoopInfo *Li, DominatorTree *Dt, AssumptionTracker *AT)
+      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0),
+        F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+        Builder(Se->getContext()) {
+    CodeMetrics::collectEphemeralValues(F, AT, EphValues);
+  }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
   /// Returns the vectorized root.
   Value *vectorizeTree();
 
+  /// \returns the cost incurred by unwanted spills and fills, caused by
+  /// holding live values over call sites.
+  int getSpillCost();
+
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
   int getTreeCost();
@@ -414,7 +428,12 @@
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
-    MemBarrierIgnoreList.clear();
+    NumLoadsWantToKeepOrder = 0;
+    NumLoadsWantToChangeOrder = 0;
+    for (auto &Iter : BlocksSchedules) {
+      BlockScheduling *BS = Iter.second.get();
+      BS->clear();
+    }
   }
 
   /// \returns true if the memory operations A and B are consecutive.
@@ -423,6 +442,11 @@
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
+  /// \returns true if it is beneficial to reverse the vector order.
+  bool shouldReorder() const {
+    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+  }
+
 private:
   struct TreeEntry;
 
@@ -459,20 +483,6 @@
   /// roots. This method calculates the cost of extracting the values.
   int getGatherCost(ArrayRef<Value *> VL);
 
-  /// \returns the AA location that is being access by the instruction.
-  AliasAnalysis::Location getLocation(Instruction *I);
-
-  /// \brief Checks if it is possible to sink an instruction from
-  /// \p Src to \p Dst.
-  /// \returns the pointer to the barrier instruction if we can't sink.
-  Value *getSinkBarrier(Instruction *Src, Instruction *Dst);
-
-  /// \returns the index of the last instruction in the BB from \p VL.
-  int getLastIndex(ArrayRef<Value *> VL);
-
-  /// \returns the Instruction in the bundle \p VL.
-  Instruction *getLastInstruction(ArrayRef<Value *> VL);
-
   /// \brief Set the Builder insert point to one after the last instruction in
   /// the bundle
   void setInsertPointAfterBundle(ArrayRef<Value *> VL);
@@ -485,7 +495,7 @@
   bool isFullyVectorizableTinyTree();
 
   struct TreeEntry {
-    TreeEntry() : Scalars(), VectorizedValue(nullptr), LastScalarIndex(0),
+    TreeEntry() : Scalars(), VectorizedValue(nullptr),
     NeedToGather(0) {}
 
     /// \returns true if the scalars in VL are equal to this entry.
@@ -500,9 +510,6 @@
     /// The Scalars are vectorized into this value. It is initialized to Null.
     Value *VectorizedValue;
 
-    /// The index in the basic block of the last scalar.
-    int LastScalarIndex;
-
     /// Do we need to gather this sequence ?
     bool NeedToGather;
   };
@@ -515,18 +522,16 @@
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
     if (Vectorized) {
-      Last->LastScalarIndex = getLastIndex(VL);
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
         ScalarToTreeEntry[VL[i]] = idx;
       }
     } else {
-      Last->LastScalarIndex = 0;
       MustGather.insert(VL.begin(), VL.end());
     }
     return Last;
   }
-
+  
   /// -- Vectorization State --
   /// Holds all of the tree entries.
   std::vector<TreeEntry> VectorizableTree;
@@ -554,28 +559,319 @@
   /// This list holds pairs of (Internal Scalar : External User).
   UserList ExternalUses;
 
-  /// A list of instructions to ignore while sinking
-  /// memory instructions. This map must be reset between runs of getCost.
-  ValueSet MemBarrierIgnoreList;
+  /// Values used only by @llvm.assume calls.
+  SmallPtrSet<const Value *, 32> EphValues;
 
   /// Holds all of the instructions that we gathered.
   SetVector<Instruction *> GatherSeq;
   /// A list of blocks that we are going to CSE.
   SetVector<BasicBlock *> CSEBlocks;
 
-  /// Numbers instructions in different blocks.
-  DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
+  /// Contains all scheduling relevant data for an instruction.
+  /// A ScheduleData either represents a single instruction or a member of an
+  /// instruction bundle (= a group of instructions which is combined into a
+  /// vector instruction).
+  struct ScheduleData {
 
-  /// \brief Get the corresponding instruction numbering list for a given
-  /// BasicBlock. The list is allocated lazily.
-  BlockNumbering &getBlockNumbering(BasicBlock *BB) {
-    auto I = BlocksNumbers.insert(std::make_pair(BB, BlockNumbering(BB)));
-    return I.first->second;
-  }
+    // The initial value for the dependency counters. It means that the
+    // dependencies are not calculated yet.
+    enum { InvalidDeps = -1 };
+
+    ScheduleData()
+        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
+          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
+          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
+          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
+
+    void init(int BlockSchedulingRegionID) {
+      FirstInBundle = this;
+      NextInBundle = nullptr;
+      NextLoadStore = nullptr;
+      IsScheduled = false;
+      SchedulingRegionID = BlockSchedulingRegionID;
+      UnscheduledDepsInBundle = UnscheduledDeps;
+      clearDependencies();
+    }
+
+    /// Returns true if the dependency information has been calculated.
+    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
+
+    /// Returns true for single instructions and for bundle representatives
+    /// (= the head of a bundle).
+    bool isSchedulingEntity() const { return FirstInBundle == this; }
+
+    /// Returns true if it represents an instruction bundle and not only a
+    /// single instruction.
+    bool isPartOfBundle() const {
+      return NextInBundle != nullptr || FirstInBundle != this;
+    }
+
+    /// Returns true if it is ready for scheduling, i.e. it has no more
+    /// unscheduled depending instructions/bundles.
+    bool isReady() const {
+      assert(isSchedulingEntity() &&
+             "can't consider non-scheduling entity for ready list");
+      return UnscheduledDepsInBundle == 0 && !IsScheduled;
+    }
+
+    /// Modifies the number of unscheduled dependencies, also updating it for
+    /// the whole bundle.
+    int incrementUnscheduledDeps(int Incr) {
+      UnscheduledDeps += Incr;
+      return FirstInBundle->UnscheduledDepsInBundle += Incr;
+    }
+
+    /// Sets the number of unscheduled dependencies to the number of
+    /// dependencies.
+    void resetUnscheduledDeps() {
+      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
+    }
+
+    /// Clears all dependency information.
+    void clearDependencies() {
+      Dependencies = InvalidDeps;
+      resetUnscheduledDeps();
+      MemoryDependencies.clear();
+    }
+
+    void dump(raw_ostream &os) const {
+      if (!isSchedulingEntity()) {
+        os << "/ " << *Inst;
+      } else if (NextInBundle) {
+        os << '[' << *Inst;
+        ScheduleData *SD = NextInBundle;
+        while (SD) {
+          os << ';' << *SD->Inst;
+          SD = SD->NextInBundle;
+        }
+        os << ']';
+      } else {
+        os << *Inst;
+      }
+    }
+
+    Instruction *Inst;
+
+    /// Points to the head in an instruction bundle (and always to this for
+    /// single instructions).
+    ScheduleData *FirstInBundle;
+
+    /// Single linked list of all instructions in a bundle. Null if it is a
+    /// single instruction.
+    ScheduleData *NextInBundle;
+
+    /// Single linked list of all memory instructions (e.g. load, store, call)
+    /// in the block - until the end of the scheduling region.
+    ScheduleData *NextLoadStore;
+
+    /// The dependent memory instructions.
+    /// This list is derived on demand in calculateDependencies().
+    SmallVector<ScheduleData *, 4> MemoryDependencies;
+
+    /// This ScheduleData is in the current scheduling region if this matches
+    /// the current SchedulingRegionID of BlockScheduling.
+    int SchedulingRegionID;
+
+    /// Used for getting a "good" final ordering of instructions.
+    int SchedulingPriority;
+
+    /// The number of dependencies. Consists of the number of users of the
+    /// instruction plus the number of dependent memory instructions (if any).
+    /// This value is calculated on demand.
+    /// If InvalidDeps, the number of dependencies is not calculated yet.
+    ///
+    int Dependencies;
+
+    /// The number of dependencies minus the number of dependencies of scheduled
+    /// instructions. As soon as this is zero, the instruction/bundle gets ready
+    /// for scheduling.
+    /// Note that this is negative as long as Dependencies is not calculated.
+    int UnscheduledDeps;
+
+    /// The sum of UnscheduledDeps in a bundle. Equal to UnscheduledDeps for
+    /// single instructions.
+    int UnscheduledDepsInBundle;
+
+    /// True if this instruction is scheduled (or considered as scheduled in the
+    /// dry-run).
+    bool IsScheduled;
+  };
+
+#ifndef NDEBUG
+  friend raw_ostream &operator<<(raw_ostream &os,
+                                 const BoUpSLP::ScheduleData &SD);
+#endif
+
+  /// Contains all scheduling data for a basic block.
+  ///
+  struct BlockScheduling {
+
+    BlockScheduling(BasicBlock *BB)
+        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
+          ScheduleStart(nullptr), ScheduleEnd(nullptr),
+          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
+          // Make sure that the initial SchedulingRegionID is greater than the
+          // initial SchedulingRegionID in ScheduleData (which is 0).
+          SchedulingRegionID(1) {}
+
+    void clear() {
+      ReadyInsts.clear();
+      ScheduleStart = nullptr;
+      ScheduleEnd = nullptr;
+      FirstLoadStoreInRegion = nullptr;
+      LastLoadStoreInRegion = nullptr;
+
+      // Make a new scheduling region, i.e. all existing ScheduleData is not
+      // in the new region yet.
+      ++SchedulingRegionID;
+    }
+
+    ScheduleData *getScheduleData(Value *V) {
+      ScheduleData *SD = ScheduleDataMap[V];
+      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+        return SD;
+      return nullptr;
+    }
+
+    bool isInSchedulingRegion(ScheduleData *SD) {
+      return SD->SchedulingRegionID == SchedulingRegionID;
+    }
+
+    /// Marks an instruction as scheduled and puts all dependent ready
+    /// instructions into the ready-list.
+    template <typename ReadyListType>
+    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
+      SD->IsScheduled = true;
+      DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
+
+      ScheduleData *BundleMember = SD;
+      while (BundleMember) {
+        // Handle the def-use chain dependencies.
+        for (Use &U : BundleMember->Inst->operands()) {
+          ScheduleData *OpDef = getScheduleData(U.get());
+          if (OpDef && OpDef->hasValidDependencies() &&
+              OpDef->incrementUnscheduledDeps(-1) == 0) {
+            // There are no more unscheduled dependencies after decrementing,
+            // so we can put the dependent instruction into the ready list.
+            ScheduleData *DepBundle = OpDef->FirstInBundle;
+            assert(!DepBundle->IsScheduled &&
+                   "already scheduled bundle gets ready");
+            ReadyList.insert(DepBundle);
+            DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n");
+          }
+        }
+        // Handle the memory dependencies.
+        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
+          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+            // There are no more unscheduled dependencies after decrementing,
+            // so we can put the dependent instruction into the ready list.
+            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
+            assert(!DepBundle->IsScheduled &&
+                   "already scheduled bundle gets ready");
+            ReadyList.insert(DepBundle);
+            DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n");
+          }
+        }
+        BundleMember = BundleMember->NextInBundle;
+      }
+    }
+
+    /// Put all instructions into the ReadyList which are ready for scheduling.
+    template <typename ReadyListType>
+    void initialFillReadyList(ReadyListType &ReadyList) {
+      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+        ScheduleData *SD = getScheduleData(I);
+        if (SD->isSchedulingEntity() && SD->isReady()) {
+          ReadyList.insert(SD);
+          DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
+        }
+      }
+    }
+
+    /// Checks if a bundle of instructions can be scheduled, i.e. has no
+    /// cyclic dependencies. This is only a dry-run, no instructions are
+    /// actually moved at this stage.
+    bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA);
+
+    /// Un-bundles a group of instructions.
+    void cancelScheduling(ArrayRef<Value *> VL);
+
+    /// Extends the scheduling region so that V is inside the region.
+    void extendSchedulingRegion(Value *V);
+
+    /// Initialize the ScheduleData structures for new instructions in the
+    /// scheduling region.
+    void initScheduleData(Instruction *FromI, Instruction *ToI,
+                          ScheduleData *PrevLoadStore,
+                          ScheduleData *NextLoadStore);
+
+    /// Updates the dependency information of a bundle and of all instructions/
+    /// bundles which depend on the original bundle.
+    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
+                               AliasAnalysis *AA);
+
+    /// Sets all instructions in the scheduling region to un-scheduled.
+    void resetSchedule();
+
+    BasicBlock *BB;
+
+    /// Simple memory allocation for ScheduleData.
+    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
+
+    /// The size of a ScheduleData array in ScheduleDataChunks.
+    int ChunkSize;
+
+    /// The allocator position in the current chunk, which is the last entry
+    /// of ScheduleDataChunks.
+    int ChunkPos;
+
+    /// Attaches ScheduleData to Instruction.
+    /// Note that the mapping survives during all vectorization iterations, i.e.
+    /// ScheduleData structures are recycled.
+    DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+
+    struct ReadyList : SmallVector<ScheduleData *, 8> {
+      void insert(ScheduleData *SD) { push_back(SD); }
+    };
+
+    /// The ready-list for scheduling (only used for the dry-run).
+    ReadyList ReadyInsts;
+
+    /// The first instruction of the scheduling region.
+    Instruction *ScheduleStart;
+
+    /// The first instruction _after_ the scheduling region.
+    Instruction *ScheduleEnd;
+
+    /// The first memory accessing instruction in the scheduling region
+    /// (can be null).
+    ScheduleData *FirstLoadStoreInRegion;
+
+    /// The last memory accessing instruction in the scheduling region
+    /// (can be null).
+    ScheduleData *LastLoadStoreInRegion;
+
+    /// The ID of the scheduling region. For a new vectorization iteration this
+    /// is incremented which "removes" all ScheduleData from the region.
+    int SchedulingRegionID;
+  };
+
+  /// Attaches the BlockScheduling structures to basic blocks.
+  DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+  /// Performs the "real" scheduling. Done before vectorization is actually
+  /// performed in a basic block.
+  void scheduleBlock(BlockScheduling *BS);
 
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef<Value *> UserIgnoreList;
 
+  // Number of load-bundles, which contain consecutive loads.
+  int NumLoadsWantToKeepOrder;
+
+  // Number of load-bundles of size 2, which are consecutive loads if reversed.
+  int NumLoadsWantToChangeOrder;
+
   // Analysis and block reference.
   Function *F;
   ScalarEvolution *SE;
@@ -589,6 +885,13 @@
   IRBuilder<> Builder;
 };
 
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
+  SD.dump(os);
+  return os;
+}
+#endif
+
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ArrayRef<Value *> UserIgnoreLst) {
   deleteTree();
@@ -612,18 +915,27 @@
       for (User *U : Scalar->users()) {
         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
 
-        // Skip in-tree scalars that become vectors.
-        if (ScalarToTreeEntry.count(U)) {
-          DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
-                *U << ".\n");
-          int Idx = ScalarToTreeEntry[U]; (void) Idx;
-          assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
-          continue;
-        }
         Instruction *UserInst = dyn_cast<Instruction>(U);
         if (!UserInst)
           continue;
 
+        // Skip in-tree scalars that become vectors
+        if (ScalarToTreeEntry.count(U)) {
+          int Idx = ScalarToTreeEntry[U];
+          TreeEntry *UseEntry = &VectorizableTree[Idx];
+          Value *UseScalar = UseEntry->Scalars[0];
+          // Some in-tree scalars will remain as scalar in vectorized
+          // instructions. If that is the case, the one in Lane 0 will
+          // be used.
+          if (UseScalar != U ||
+              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+                         << ".\n");
+            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+            continue;
+          }
+        }
+
         // Ignore users in the user ignore list.
         if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
             UserIgnoreList.end())
@@ -683,6 +995,16 @@
   // We now know that this is a vector of instructions of the same type from
   // the same block.
 
+  // Don't vectorize ephemeral values.
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    if (EphValues.count(VL[i])) {
+      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+            ") is ephemeral.\n");
+      newTreeEntry(VL, false);
+      return;
+    }
+  }
+
   // Check if this is a duplicate of another entry.
   if (ScalarToTreeEntry.count(VL[0])) {
     int Idx = ScalarToTreeEntry[VL[0]];
@@ -722,69 +1044,16 @@
   // Check that all of the users of the scalars that we want to vectorize are
   // schedulable.
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  int MyLastIndex = getLastIndex(VL);
   BasicBlock *BB = cast<Instruction>(VL0)->getParent();
 
-  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
-    Instruction *Scalar = cast<Instruction>(VL[i]);
-    DEBUG(dbgs() << "SLP: Checking users of  " << *Scalar << ". \n");
-    for (User *U : Scalar->users()) {
-      DEBUG(dbgs() << "SLP: \tUser " << *U << ". \n");
-      Instruction *UI = dyn_cast<Instruction>(U);
-      if (!UI) {
-        DEBUG(dbgs() << "SLP: Gathering due unknown user. \n");
-        newTreeEntry(VL, false);
-        return;
-      }
-
-      // We don't care if the user is in a different basic block.
-      BasicBlock *UserBlock = UI->getParent();
-      if (UserBlock != BB) {
-        DEBUG(dbgs() << "SLP: User from a different basic block "
-              << *UI << ". \n");
-        continue;
-      }
-
-      // If this is a PHINode within this basic block then we can place the
-      // extract wherever we want.
-      if (isa<PHINode>(*UI)) {
-        DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *UI << ". \n");
-        continue;
-      }
-
-      // Check if this is a safe in-tree user.
-      if (ScalarToTreeEntry.count(UI)) {
-        int Idx = ScalarToTreeEntry[UI];
-        int VecLocation = VectorizableTree[Idx].LastScalarIndex;
-        if (VecLocation <= MyLastIndex) {
-          DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n");
-          newTreeEntry(VL, false);
-          return;
-        }
-        DEBUG(dbgs() << "SLP: In-tree user (" << *UI << ") at #" <<
-              VecLocation << " vector value (" << *Scalar << ") at #"
-              << MyLastIndex << ".\n");
-        continue;
-      }
-
-      // Ignore users in the user ignore list.
-      if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UI) !=
-          UserIgnoreList.end())
-        continue;
-
-      // Make sure that we can schedule this unknown user.
-      BlockNumbering &BN = getBlockNumbering(BB);
-      int UserIndex = BN.getIndex(UI);
-      if (UserIndex < MyLastIndex) {
-
-        DEBUG(dbgs() << "SLP: Can't schedule extractelement for "
-              << *UI << ". \n");
-        newTreeEntry(VL, false);
-        return;
-      }
-    }
+  if (!DT->isReachableFromEntry(BB)) {
+    // Don't go into unreachable blocks. They may contain instructions with
+    // dependency cycles which confuse the final scheduling.
+    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+    newTreeEntry(VL, false);
+    return;
   }
-
+  
   // Check that every instructions appears once in this bundle.
   for (unsigned i = 0, e = VL.size(); i < e; ++i)
     for (unsigned j = i+1; j < e; ++j)
@@ -794,39 +1063,20 @@
         return;
       }
 
-  // Check that instructions in this bundle don't reference other instructions.
-  // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4.
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    for (User *U : VL[i]->users()) {
-      for (unsigned j = 0; j < e; ++j) {
-        if (i != j && U == VL[j]) {
-          DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << *U << ". \n");
-          newTreeEntry(VL, false);
-          return;
-        }
-      }
-    }
+  auto &BSRef = BlocksSchedules[BB];
+  if (!BSRef) {
+    BSRef = llvm::make_unique<BlockScheduling>(BB);
   }
+  BlockScheduling &BS = *BSRef.get();
 
+  if (!BS.tryScheduleBundle(VL, AA)) {
+    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+    BS.cancelScheduling(VL);
+    newTreeEntry(VL, false);
+    return;
+  }
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
-  // Check if it is safe to sink the loads or the stores.
-  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
-    Instruction *Last = getLastInstruction(VL);
-
-    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-      if (VL[i] == Last)
-        continue;
-      Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
-      if (Barrier) {
-        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
-              << "\n because of " << *Barrier << ".  Gathering.\n");
-        newTreeEntry(VL, false);
-        return;
-      }
-    }
-  }
-
   switch (Opcode) {
     case Instruction::PHI: {
       PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -838,6 +1088,7 @@
               cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
           if (Term) {
             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+            BS.cancelScheduling(VL);
             newTreeEntry(VL, false);
             return;
           }
@@ -861,6 +1112,8 @@
       bool Reuse = CanReuseExtract(VL);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+      } else {
+        BS.cancelScheduling(VL);
       }
       newTreeEntry(VL, Reuse);
       return;
@@ -869,12 +1122,23 @@
       // Check if the loads are consecutive or of we need to swizzle them.
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
         LoadInst *L = cast<LoadInst>(VL[i]);
-        if (!L->isSimple() || !isConsecutiveAccess(VL[i], VL[i + 1])) {
+        if (!L->isSimple()) {
+          BS.cancelScheduling(VL);
           newTreeEntry(VL, false);
-          DEBUG(dbgs() << "SLP: Need to swizzle loads.\n");
+          DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+          return;
+        }
+        if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+          if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
+            ++NumLoadsWantToChangeOrder;
+          }
+          BS.cancelScheduling(VL);
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
           return;
         }
       }
+      ++NumLoadsWantToKeepOrder;
       newTreeEntry(VL, true);
       DEBUG(dbgs() << "SLP: added a vector of loads.\n");
       return;
@@ -895,6 +1159,7 @@
       for (unsigned i = 0; i < VL.size(); ++i) {
         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
         if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+          BS.cancelScheduling(VL);
           newTreeEntry(VL, false);
           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
           return;
@@ -922,6 +1187,7 @@
         CmpInst *Cmp = cast<CmpInst>(VL[i]);
         if (Cmp->getPredicate() != P0 ||
             Cmp->getOperand(0)->getType() != ComparedTy) {
+          BS.cancelScheduling(VL);
           newTreeEntry(VL, false);
           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
           return;
@@ -968,20 +1234,8 @@
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
         reorderInputsAccordingToOpcode(VL, Left, Right);
-        BasicBlock *LeftBB = getSameBlock(Left);
-        BasicBlock *RightBB = getSameBlock(Right);
-        // If we have common uses on separate paths in the tree make sure we
-        // process the one with greater common depth first.
-        // We can use block numbering to determine the subtree traversal as
-        // earler user has to come in between the common use and the later user.
-        if (LeftBB && RightBB && LeftBB == RightBB &&
-            getLastIndex(Right) > getLastIndex(Left)) {
-          buildTree_rec(Right, Depth + 1);
-          buildTree_rec(Left, Depth + 1);
-        } else {
-          buildTree_rec(Left, Depth + 1);
-          buildTree_rec(Right, Depth + 1);
-        }
+        buildTree_rec(Left, Depth + 1);
+        buildTree_rec(Right, Depth + 1);
         return;
       }
 
@@ -1000,6 +1254,7 @@
       for (unsigned j = 0; j < VL.size(); ++j) {
         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+          BS.cancelScheduling(VL);
           newTreeEntry(VL, false);
           return;
         }
@@ -1012,6 +1267,7 @@
         Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
         if (Ty0 != CurTy) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+          BS.cancelScheduling(VL);
           newTreeEntry(VL, false);
           return;
         }
@@ -1023,6 +1279,7 @@
         if (!isa<ConstantInt>(Op)) {
           DEBUG(
               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+          BS.cancelScheduling(VL);
           newTreeEntry(VL, false);
           return;
         }
@@ -1044,6 +1301,7 @@
       // Check if the stores are consecutive or of we need to swizzle them.
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
         if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+          BS.cancelScheduling(VL);
           newTreeEntry(VL, false);
           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
@@ -1056,8 +1314,6 @@
       for (unsigned j = 0; j < VL.size(); ++j)
         Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
 
-      // We can ignore these values because we are sinking them down.
-      MemBarrierIgnoreList.insert(VL.begin(), VL.end());
       buildTree_rec(Operands, Depth + 1);
       return;
     }
@@ -1068,6 +1324,7 @@
       // represented by an intrinsic call
       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
       if (!isTriviallyVectorizable(ID)) {
+        BS.cancelScheduling(VL);
         newTreeEntry(VL, false);
         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
@@ -1080,6 +1337,7 @@
         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
         if (!CI2 || CI2->getCalledFunction() != Int ||
             getIntrinsicIDForCall(CI2, TLI) != ID) {
+          BS.cancelScheduling(VL);
           newTreeEntry(VL, false);
           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                        << "\n");
@@ -1090,6 +1348,7 @@
         if (hasVectorInstrinsicScalarOpd(ID, 1)) {
           Value *A1J = CI2->getArgOperand(1);
           if (A1I != A1J) {
+            BS.cancelScheduling(VL);
             newTreeEntry(VL, false);
             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                          << " argument "<< A1I<<"!=" << A1J
@@ -1115,6 +1374,7 @@
       // If this is not an alternate sequence of opcode like add-sub
       // then do not vectorize this instruction.
       if (!isAltShuffle) {
+        BS.cancelScheduling(VL);
         newTreeEntry(VL, false);
         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
         return;
@@ -1132,6 +1392,7 @@
       return;
     }
     default:
+      BS.cancelScheduling(VL);
       newTreeEntry(VL, false);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
       return;
@@ -1234,6 +1495,10 @@
             TargetTransformInfo::OK_AnyValue;
         TargetTransformInfo::OperandValueKind Op2VK =
             TargetTransformInfo::OK_UniformConstantValue;
+        TargetTransformInfo::OperandValueProperties Op1VP =
+            TargetTransformInfo::OP_None;
+        TargetTransformInfo::OperandValueProperties Op2VP =
+            TargetTransformInfo::OP_None;
 
         // If all operands are exactly the same ConstantInt then set the
         // operand kind to OK_UniformConstantValue.
@@ -1255,11 +1520,17 @@
               CInt != cast<ConstantInt>(I->getOperand(1)))
             Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
         }
+        // FIXME: Currently cost of model modification for division by
+        // power of 2 is handled only for X86. Add support for other targets.
+        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+            CInt->getValue().isPowerOf2())
+          Op2VP = TargetTransformInfo::OP_PowerOf2;
 
-        ScalarCost =
-            VecTy->getNumElements() *
-            TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
-        VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
+        ScalarCost = VecTy->getNumElements() *
+                     TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
+                                                 Op1VP, Op2VP);
+        VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
+                                              Op1VP, Op2VP);
       }
       return VecCost - ScalarCost;
     }
@@ -1364,6 +1635,68 @@
   return true;
 }
 
+int BoUpSLP::getSpillCost() {
+  // Walk from the bottom of the tree to the top, tracking which in-tree values
+  // are live. For each call instruction seen between consecutive entries,
+  // query TTI for the cost of keeping those values live over it (for example,
+  // if spills and fills are required), and return the sum of these costs.
+  unsigned BundleWidth = VectorizableTree.front().Scalars.size();
+  int Cost = 0;
+
+  SmallPtrSet<Instruction*, 4> LiveValues;
+  Instruction *PrevInst = nullptr;
+
+  for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
+    Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
+    if (!Inst) // First scalar is not an instruction: skip the entry.
+      continue;
+
+    if (!PrevInst) { // First instruction entry: only seeds the walk.
+      PrevInst = Inst;
+      continue;
+    }
+
+    DEBUG(
+      dbgs() << "SLP: #LV: " << LiveValues.size();
+      for (auto *X : LiveValues)
+        dbgs() << " " << X->getName();
+      dbgs() << ", Looking at ";
+      Inst->dump();
+      );
+
+    // Update LiveValues: drop PrevInst, add its in-tree instruction operands.
+    LiveValues.erase(PrevInst);
+    for (auto &J : PrevInst->operands()) {
+      if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
+        LiveValues.insert(cast<Instruction>(&*J));
+    }
+
+    // Now find the sequence of instructions between PrevInst and Inst.
+    BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
+    --PrevInstIt;
+    while (InstIt != PrevInstIt) {
+      if (PrevInstIt == PrevInst->getParent()->rend()) {
+        PrevInstIt = Inst->getParent()->rbegin();
+        continue;
+      }
+      // Every call except PrevInst itself is charged (no in-tree check here).
+      if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
+        SmallVector<Type*, 4> V;
+        for (auto *II : LiveValues)
+          V.push_back(VectorType::get(II->getType(), BundleWidth));
+        Cost += TTI->getCostOfKeepingLiveOverCall(V);
+      }
+
+      ++PrevInstIt;
+    }
+
+    PrevInst = Inst;
+  }
+
+  DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
+  return Cost;
+}
+
 int BoUpSLP::getTreeCost() {
   int Cost = 0;
   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
@@ -1391,7 +1724,13 @@
   for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
        I != E; ++I) {
     // We only add extract cost once for the same scalar.
-    if (!ExtractCostCalculated.insert(I->Scalar))
+    if (!ExtractCostCalculated.insert(I->Scalar).second)
+      continue;
+
+    // Uses by ephemeral values are free (because the ephemeral value will be
+    // removed prior to code generation, and so the extraction will be
+    // removed as well).
+    if (EphValues.count(I->User))
       continue;
 
     VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
@@ -1399,6 +1738,8 @@
                                            I->Lane);
   }
 
+  Cost += getSpillCost();
+
   DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
   return  Cost + ExtractCost;
 }
@@ -1420,14 +1761,6 @@
   return getGatherCost(VecTy);
 }
 
-AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
-  if (StoreInst *SI = dyn_cast<StoreInst>(I))
-    return AA->getLocation(SI);
-  if (LoadInst *LI = dyn_cast<LoadInst>(I))
-    return AA->getLocation(LI);
-  return AliasAnalysis::Location();
-}
-
 Value *BoUpSLP::getPointerOperand(Value *I) {
   if (LoadInst *LI = dyn_cast<LoadInst>(I))
     return LI->getPointerOperand();
@@ -1485,59 +1818,9 @@
   return X == PtrSCEVB;
 }
 
-Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
-  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
-  BasicBlock::iterator I = Src, E = Dst;
-  /// Scan all of the instruction from SRC to DST and check if
-  /// the source may alias.
-  for (++I; I != E; ++I) {
-    // Ignore store instructions that are marked as 'ignore'.
-    if (MemBarrierIgnoreList.count(I))
-      continue;
-    if (Src->mayWriteToMemory()) /* Write */ {
-      if (!I->mayReadOrWriteMemory())
-        continue;
-    } else /* Read */ {
-      if (!I->mayWriteToMemory())
-        continue;
-    }
-    AliasAnalysis::Location A = getLocation(&*I);
-    AliasAnalysis::Location B = getLocation(Src);
-
-    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
-      return I;
-  }
-  return nullptr;
-}
-
-int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) {
-  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
-  assert(BB == getSameBlock(VL) && "Invalid block");
-  BlockNumbering &BN = getBlockNumbering(BB);
-
-  int MaxIdx = BN.getIndex(BB->getFirstNonPHI());
-  for (unsigned i = 0, e = VL.size(); i < e; ++i)
-    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
-  return MaxIdx;
-}
-
-Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) {
-  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
-  assert(BB == getSameBlock(VL) && "Invalid block");
-  BlockNumbering &BN = getBlockNumbering(BB);
-
-  int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
-  for (unsigned i = 1, e = VL.size(); i < e; ++i)
-    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
-  Instruction *I = BN.getInstruction(MaxIdx);
-  assert(I && "bad location");
-  return I;
-}
-
 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  Instruction *LastInst = getLastInstruction(VL);
-  BasicBlock::iterator NextInst = LastInst;
+  BasicBlock::iterator NextInst = VL0;
   ++NextInst;
   Builder.SetInsertPoint(VL0->getParent(), NextInst);
   Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
@@ -1620,6 +1903,7 @@
     setInsertPointAfterBundle(E->Scalars);
     return Gather(E->Scalars, VecTy);
   }
+
   unsigned Opcode = getSameOpcode(E->Scalars);
 
   switch (Opcode) {
@@ -1638,7 +1922,7 @@
         ValueList Operands;
         BasicBlock *IBB = PH->getIncomingBlock(i);
 
-        if (!VisitedBBs.insert(IBB)) {
+        if (!VisitedBBs.insert(IBB).second) {
           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
           continue;
         }
@@ -1693,6 +1977,7 @@
       CastInst *CI = dyn_cast<CastInst>(VL0);
       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
       E->VectorizedValue = V;
+      ++NumVectorInstructions;
       return V;
     }
     case Instruction::FCmp:
@@ -1719,6 +2004,7 @@
         V = Builder.CreateICmp(P0, L, R);
 
       E->VectorizedValue = V;
+      ++NumVectorInstructions;
       return V;
     }
     case Instruction::Select: {
@@ -1740,6 +2026,7 @@
 
       Value *V = Builder.CreateSelect(Cond, True, False);
       E->VectorizedValue = V;
+      ++NumVectorInstructions;
       return V;
     }
     case Instruction::Add:
@@ -1784,6 +2071,8 @@
       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
       E->VectorizedValue = V;
+      propagateIRFlags(E->VectorizedValue, E->Scalars);
+      ++NumVectorInstructions;
 
       if (Instruction *I = dyn_cast<Instruction>(V))
         return propagateMetadata(I, E->Scalars);
@@ -1796,16 +2085,26 @@
       setInsertPointAfterBundle(E->Scalars);
 
       LoadInst *LI = cast<LoadInst>(VL0);
+      Type *ScalarLoadTy = LI->getType();
       unsigned AS = LI->getPointerAddressSpace();
 
       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
                                             VecTy->getPointerTo(AS));
+
+      // The pointer operand uses an in-tree scalar so we add the new BitCast to
+      // ExternalUses list to make sure that an extract will be generated in the
+      // future.
+      if (ScalarToTreeEntry.count(LI->getPointerOperand()))
+        ExternalUses.push_back(
+            ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
+
       unsigned Alignment = LI->getAlignment();
       LI = Builder.CreateLoad(VecPtr);
       if (!Alignment)
-        Alignment = DL->getABITypeAlignment(LI->getPointerOperand()->getType());
+        Alignment = DL->getABITypeAlignment(ScalarLoadTy);
       LI->setAlignment(Alignment);
       E->VectorizedValue = LI;
+      ++NumVectorInstructions;
       return propagateMetadata(LI, E->Scalars);
     }
     case Instruction::Store: {
@@ -1823,10 +2122,19 @@
       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
                                             VecTy->getPointerTo(AS));
       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+
+      // The pointer operand uses an in-tree scalar so we add the new BitCast to
+      // ExternalUses list to make sure that an extract will be generated in the
+      // future.
+      if (ScalarToTreeEntry.count(SI->getPointerOperand()))
+        ExternalUses.push_back(
+            ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
+
       if (!Alignment)
-        Alignment = DL->getABITypeAlignment(SI->getPointerOperand()->getType());
+        Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
       S->setAlignment(Alignment);
       E->VectorizedValue = S;
+      ++NumVectorInstructions;
       return propagateMetadata(S, E->Scalars);
     }
     case Instruction::GetElementPtr: {
@@ -1851,6 +2159,7 @@
 
       Value *V = Builder.CreateGEP(Op0, OpVecs);
       E->VectorizedValue = V;
+      ++NumVectorInstructions;
 
       if (Instruction *I = dyn_cast<Instruction>(V))
         return propagateMetadata(I, E->Scalars);
@@ -1862,6 +2171,7 @@
       setInsertPointAfterBundle(E->Scalars);
       Function *FI;
       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
+      Value *ScalarArg = nullptr;
       if (CI && (FI = CI->getCalledFunction())) {
         IID = (Intrinsic::ID) FI->getIntrinsicID();
       }
@@ -1872,6 +2182,7 @@
         // a scalar. This argument should not be vectorized.
         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+          ScalarArg = CEI->getArgOperand(j);
           OpVecs.push_back(CEI->getArgOperand(j));
           continue;
         }
@@ -1890,7 +2201,15 @@
       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
       Value *V = Builder.CreateCall(CF, OpVecs);
+
+      // The scalar argument uses an in-tree scalar so we add the new vectorized
+      // call to ExternalUses list to make sure that an extract will be
+      // generated in the future.
+      if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
+        ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
       E->VectorizedValue = V;
+      ++NumVectorInstructions;
       return V;
     }
     case Instruction::ShuffleVector: {
@@ -1916,21 +2235,29 @@
       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
 
-      // Create appropriate shuffle to take alternative operations from
-      // the vector.
-      std::vector<Constant *> Mask(E->Scalars.size());
+      // Create shuffle to take alternate operations from the vector.
+      // Also, gather up odd and even scalar ops to propagate IR flags to
+      // each vector operation.
+      ValueList OddScalars, EvenScalars;
       unsigned e = E->Scalars.size();
+      SmallVector<Constant *, 8> Mask(e);
       for (unsigned i = 0; i < e; ++i) {
-        if (i & 1)
+        if (i & 1) {
           Mask[i] = Builder.getInt32(e + i);
-        else
+          OddScalars.push_back(E->Scalars[i]);
+        } else {
           Mask[i] = Builder.getInt32(i);
+          EvenScalars.push_back(E->Scalars[i]);
+        }
       }
 
       Value *ShuffleMask = ConstantVector::get(Mask);
+      propagateIRFlags(V0, EvenScalars);
+      propagateIRFlags(V1, OddScalars);
 
       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
       E->VectorizedValue = V;
+      ++NumVectorInstructions;
       if (Instruction *I = dyn_cast<Instruction>(V))
         return propagateMetadata(I, E->Scalars);
 
@@ -1943,6 +2270,12 @@
 }
 
 Value *BoUpSLP::vectorizeTree() {
+  
+  // All blocks must be scheduled before any instructions are inserted.
+  for (auto &BSIter : BlocksSchedules) {
+    scheduleBlock(BSIter.second.get());
+  }
+
   Builder.SetInsertPoint(F->getEntryBlock().begin());
   vectorizeTree(&VectorizableTree[0]);
 
@@ -2031,9 +2364,6 @@
     }
   }
 
-  for (auto &BN : BlocksNumbers)
-    BN.second.forget();
-
   Builder.ClearInsertionPoint();
 
   return VectorizableTree[0].VectorizedValue;
@@ -2127,6 +2457,363 @@
   GatherSeq.clear();
 }
 
+// Groups the instructions of VL into a bundle (a single scheduling entity) and
+// schedules other instructions until it gets ready. \returns true if it does.
+bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
+                                                 AliasAnalysis *AA) {
+  if (isa<PHINode>(VL[0])) // PHI nodes don't need to be scheduled.
+    return true;
+
+  // Initialize the instruction bundle.
+  Instruction *OldScheduleEnd = ScheduleEnd;
+  ScheduleData *PrevInBundle = nullptr;
+  ScheduleData *Bundle = nullptr;
+  bool ReSchedule = false;
+  DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
+  for (Value *V : VL) {
+    extendSchedulingRegion(V);
+    ScheduleData *BundleMember = getScheduleData(V);
+    assert(BundleMember &&
+           "no ScheduleData for bundle member (maybe not in same basic block)");
+    if (BundleMember->IsScheduled) {
+      // A bundle member was scheduled as single instruction before and now
+      // needs to be scheduled as part of the bundle. We just get rid of the
+      // existing schedule.
+      DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
+                   << " was already scheduled\n");
+      ReSchedule = true;
+    }
+    assert(BundleMember->isSchedulingEntity() &&
+           "bundle member already part of other bundle");
+    if (PrevInBundle) {
+      PrevInBundle->NextInBundle = BundleMember;
+    } else {
+      Bundle = BundleMember;
+    }
+    BundleMember->UnscheduledDepsInBundle = 0; // Only the head keeps a count.
+    Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
+
+    // Group the instructions to a bundle.
+    BundleMember->FirstInBundle = Bundle;
+    PrevInBundle = BundleMember;
+  }
+  if (ScheduleEnd != OldScheduleEnd) {
+    // The scheduling region got new instructions at the lower end (or it is a
+    // new region for the first bundle). This makes it necessary to
+    // recalculate all dependencies.
+    // It is seldom that this needs to be done a second time after adding the
+    // initial bundle to the region.
+    for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+      ScheduleData *SD = getScheduleData(I);
+      SD->clearDependencies();
+    }
+    ReSchedule = true;
+  }
+  if (ReSchedule) {
+    resetSchedule();
+    initialFillReadyList(ReadyInsts);
+  }
+
+  DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+               << BB->getName() << "\n");
+
+  calculateDependencies(Bundle, true, AA);
+
+  // Now try to schedule the new bundle. As soon as the bundle is "ready" it
+  // means that there are no cyclic dependencies and we can schedule it.
+  // Note that it's important that we don't "schedule" the bundle yet (see
+  // cancelScheduling).
+  while (!Bundle->isReady() && !ReadyInsts.empty()) {
+
+    ScheduleData *pickedSD = ReadyInsts.back();
+    ReadyInsts.pop_back();
+
+    if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
+      schedule(pickedSD, ReadyInsts);
+    }
+  }
+  return Bundle->isReady();
+}
+
+void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
+  if (isa<PHINode>(VL[0])) // PHI bundles are never scheduled (nothing to undo).
+    return;
+
+  ScheduleData *Bundle = getScheduleData(VL[0]);
+  DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
+  assert(!Bundle->IsScheduled &&
+         "Can't cancel bundle which is already scheduled");
+  assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+         "tried to unbundle something which is not a bundle");
+
+  // Un-bundle: make single instructions out of the bundle by resetting the
+  ScheduleData *BundleMember = Bundle; // bundle links of every member.
+  while (BundleMember) {
+    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
+    BundleMember->FirstInBundle = BundleMember;
+    ScheduleData *Next = BundleMember->NextInBundle;
+    BundleMember->NextInBundle = nullptr;
+    BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
+    if (BundleMember->UnscheduledDepsInBundle == 0) {
+      ReadyInsts.insert(BundleMember); // No pending deps: ready on its own.
+    }
+    BundleMember = Next;
+  }
+}
+
+void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
+  if (getScheduleData(V)) // V already has schedule data: nothing to do.
+    return;
+  Instruction *I = dyn_cast<Instruction>(V);
+  assert(I && "bundle member must be an instruction");
+  assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+  if (!ScheduleStart) {
+    // It's the first instruction in the new region: [I, I->getNextNode()).
+    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
+    ScheduleStart = I;
+    ScheduleEnd = I->getNextNode();
+    assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+    DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
+    return;
+  }
+  // Search up and down at the same time, because we don't know if the new
+  // instruction is above or below the existing scheduling region.
+  BasicBlock::reverse_iterator UpIter(ScheduleStart);
+  BasicBlock::reverse_iterator UpperEnd = BB->rend();
+  BasicBlock::iterator DownIter(ScheduleEnd);
+  BasicBlock::iterator LowerEnd = BB->end();
+  for (;;) { // Left only through the returns below.
+    if (UpIter != UpperEnd) {
+      if (&*UpIter == I) { // Found above: the region now starts at I.
+        initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
+        ScheduleStart = I;
+        DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
+        return;
+      }
+      UpIter++;
+    }
+    if (DownIter != LowerEnd) {
+      if (&*DownIter == I) { // Found below: the region now ends after I.
+        initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
+                         nullptr);
+        ScheduleEnd = I->getNextNode();
+        assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+        DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
+        return;
+      }
+      DownIter++;
+    }
+    assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
+           "instruction not found in block");
+  }
+}
+
+void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
+                                                Instruction *ToI,
+                                                ScheduleData *PrevLoadStore,
+                                                ScheduleData *NextLoadStore) {
+  ScheduleData *CurrentLoadStore = PrevLoadStore;
+  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
+    ScheduleData *SD = ScheduleDataMap[I];
+    if (!SD) {
+      // Allocate a new ScheduleData for the instruction from the chunk pool.
+      if (ChunkPos >= ChunkSize) { // Current chunk exhausted: start a new one.
+        ScheduleDataChunks.push_back(
+            llvm::make_unique<ScheduleData[]>(ChunkSize));
+        ChunkPos = 0;
+      }
+      SD = &(ScheduleDataChunks.back()[ChunkPos++]);
+      ScheduleDataMap[I] = SD;
+      SD->Inst = I;
+    }
+    assert(!isInSchedulingRegion(SD) &&
+           "new ScheduleData already in scheduling region");
+    SD->init(SchedulingRegionID);
+
+    if (I->mayReadOrWriteMemory()) {
+      // Update the linked list of memory accessing instructions.
+      if (CurrentLoadStore) {
+        CurrentLoadStore->NextLoadStore = SD;
+      } else {
+        FirstLoadStoreInRegion = SD;
+      }
+      CurrentLoadStore = SD;
+    }
+  }
+  if (NextLoadStore) { // Link the new run to the memory access after it.
+    if (CurrentLoadStore)
+      CurrentLoadStore->NextLoadStore = NextLoadStore;
+  } else { // Nothing follows: record where the list ends.
+    LastLoadStoreInRegion = CurrentLoadStore;
+  }
+}
+
+/// \returns the AA location accessed by \p I, or an empty location otherwise.
+static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
+  if (auto *Load = dyn_cast<LoadInst>(I))
+    return AA->getLocation(Load);
+  if (auto *Store = dyn_cast<StoreInst>(I))
+    return AA->getLocation(Store);
+  return AliasAnalysis::Location(); // Neither a load nor a store.
+}
+
+void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
+                                                     bool InsertInReadyList,
+                                                     AliasAnalysis *AA) {
+  assert(SD->isSchedulingEntity());
+
+  SmallVector<ScheduleData *, 10> WorkList;
+  WorkList.push_back(SD);
+
+  while (!WorkList.empty()) {
+    ScheduleData *SD = WorkList.back(); // Shadows the SD parameter.
+    WorkList.pop_back();
+
+    ScheduleData *BundleMember = SD;
+    while (BundleMember) {
+      assert(isInSchedulingRegion(BundleMember));
+      if (!BundleMember->hasValidDependencies()) {
+
+        DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
+        BundleMember->Dependencies = 0;
+        BundleMember->resetUnscheduledDeps();
+
+        // Handle def-use chain dependencies.
+        for (User *U : BundleMember->Inst->users()) {
+          if (isa<Instruction>(U)) {
+            ScheduleData *UseSD = getScheduleData(U);
+            if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+              BundleMember->Dependencies++;
+              ScheduleData *DestBundle = UseSD->FirstInBundle;
+              if (!DestBundle->IsScheduled) {
+                BundleMember->incrementUnscheduledDeps(1);
+              }
+              if (!DestBundle->hasValidDependencies()) {
+                WorkList.push_back(DestBundle);
+              }
+            }
+          } else {
+            // I'm not sure if this can ever happen. But we need to be safe.
+            // This lets the instruction/bundle never be scheduled and
+            // eventually disables vectorization.
+            BundleMember->Dependencies++;
+            BundleMember->incrementUnscheduledDeps(1);
+          }
+        }
+
+        // Handle the memory dependencies.
+        ScheduleData *DepDest = BundleMember->NextLoadStore;
+        if (DepDest) {
+          AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA);
+          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+
+          while (DepDest) {
+            assert(isInSchedulingRegion(DepDest));
+            if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
+              AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA);
+              if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) {
+                DepDest->MemoryDependencies.push_back(BundleMember);
+                BundleMember->Dependencies++;
+                ScheduleData *DestBundle = DepDest->FirstInBundle;
+                if (!DestBundle->IsScheduled) {
+                  BundleMember->incrementUnscheduledDeps(1);
+                }
+                if (!DestBundle->hasValidDependencies()) {
+                  WorkList.push_back(DestBundle);
+                }
+              }
+            }
+            DepDest = DepDest->NextLoadStore;
+          }
+        }
+      }
+      BundleMember = BundleMember->NextInBundle;
+    }
+    if (InsertInReadyList && SD->isReady()) {
+      ReadyInsts.push_back(SD);
+      DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
+    }
+  }
+}
+
+void BoUpSLP::BlockScheduling::resetSchedule() {
+  assert(ScheduleStart &&
+         "tried to reset schedule on block which has not been scheduled");
+  for (auto *It = ScheduleStart; It != ScheduleEnd; It = It->getNextNode()) {
+    ScheduleData *Data = getScheduleData(It);
+    assert(isInSchedulingRegion(Data));
+    Data->IsScheduled = false; // Back to the unscheduled state.
+    Data->resetUnscheduledDeps();
+  }
+  ReadyInsts.clear(); // Forget any candidates queued so far.
+}
+
+void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+  // Reorder the instructions of BS's scheduling region; no region, no work.
+  if (!BS->ScheduleStart)
+    return;
+
+  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+
+  BS->resetSchedule();
+
+  // For the real scheduling we use a more sophisticated ready-list: it is
+  // sorted by the original instruction location (highest index first). This
+  // lets the final schedule be as close as possible to the original order.
+  struct ScheduleDataCompare {
+    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
+      return SD2->SchedulingPriority < SD1->SchedulingPriority;
+    }
+  };
+  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+
+  // Ensure that all dependency data is updated and fill the ready-list with
+  // initial instructions.
+  int Idx = 0;
+  int NumToSchedule = 0;
+  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+       I = I->getNextNode()) {
+    ScheduleData *SD = BS->getScheduleData(I);
+    assert(
+        SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
+        "scheduler and vectorizer have different opinion on what is a bundle");
+    SD->FirstInBundle->SchedulingPriority = Idx++;
+    if (SD->isSchedulingEntity()) {
+      BS->calculateDependencies(SD, false, AA);
+      NumToSchedule++;
+    }
+  }
+  BS->initialFillReadyList(ReadyInsts);
+
+  Instruction *LastScheduledInst = BS->ScheduleEnd;
+
+  // Do the "real" scheduling, bottom-up: each pick goes right above the last.
+  while (!ReadyInsts.empty()) {
+    ScheduleData *picked = *ReadyInsts.begin();
+    ReadyInsts.erase(ReadyInsts.begin());
+
+    // Move the scheduled instruction(s) directly above the previously placed
+    // one, unless they already sit there.
+    ScheduleData *BundleMember = picked;
+    while (BundleMember) {
+      Instruction *pickedInst = BundleMember->Inst;
+      if (pickedInst->getNextNode() != LastScheduledInst) {
+        BS->BB->getInstList().remove(pickedInst);
+        BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
+      }
+      LastScheduledInst = pickedInst;
+      BundleMember = BundleMember->NextInBundle;
+    }
+
+    BS->schedule(picked, ReadyInsts);
+    NumToSchedule--;
+  }
+  assert(NumToSchedule == 0 && "could not schedule all instructions");
+
+  // Avoid duplicate scheduling of the block.
+  BS->ScheduleStart = nullptr;
+}
+
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   typedef SmallVector<StoreInst *, 8> StoreList;
@@ -2146,6 +2833,7 @@
   AliasAnalysis *AA;
   LoopInfo *LI;
   DominatorTree *DT;
+  AssumptionTracker *AT;
 
   bool runOnFunction(Function &F) override {
     if (skipOptnoneFunction(F))
@@ -2159,6 +2847,7 @@
     AA = &getAnalysis<AliasAnalysis>();
     LI = &getAnalysis<LoopInfo>();
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    AT = &getAnalysis<AssumptionTracker>();
 
     StoreRefs.clear();
     bool Changed = false;
@@ -2181,7 +2870,7 @@
 
     // Use the bottom up slp vectorizer to construct chains that start with
     // store instructions.
-    BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT);
+    BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AT);
 
     // Scan the blocks in the function in post order.
     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
@@ -2208,6 +2897,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     FunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<AssumptionTracker>();
     AU.addRequired<ScalarEvolution>();
     AU.addRequired<AliasAnalysis>();
     AU.addRequired<TargetTransformInfo>();
@@ -2234,7 +2924,8 @@
   ///                     scheduling and that don't need extracting.
   /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                          ArrayRef<Value *> BuildVector = None);
+                          ArrayRef<Value *> BuildVector = None,
+                          bool allowReorder = false);
 
   /// \brief Try to vectorize a chain that may start at the operands of \V;
   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
@@ -2404,11 +3095,12 @@
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R);
+  return tryToVectorizeList(VL, R, None, true);
 }
 
 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
-                                       ArrayRef<Value *> BuildVector) {
+                                       ArrayRef<Value *> BuildVector,
+                                       bool allowReorder) {
   if (VL.size() < 2)
     return false;
 
@@ -2463,6 +3155,14 @@
       BuildVectorSlice = BuildVector.slice(i, OpsWidth);
 
     R.buildTree(Ops, BuildVectorSlice);
+    // TODO: check if we can allow reordering also for other cases than
+    // tryToVectorizePair()
+    if (allowReorder && R.shouldReorder()) {
+      assert(Ops.size() == 2);
+      assert(BuildVectorSlice.empty());
+      Value *ReorderedOps[] = { Ops[1], Ops[0] };
+      R.buildTree(ReorderedOps, None);
+    }
     int Cost = R.getTreeCost();
 
     if (Cost < -SLPCostThreshold) {
@@ -2514,11 +3214,9 @@
     BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
     BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
     if (tryToVectorizePair(A, B0, R)) {
-      B->moveBefore(V);
       return true;
     }
     if (tryToVectorizePair(A, B1, R)) {
-      B->moveBefore(V);
       return true;
     }
   }
@@ -2528,11 +3226,9 @@
     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
     if (tryToVectorizePair(A0, B, R)) {
-      A->moveBefore(V);
       return true;
     }
     if (tryToVectorizePair(A1, B, R)) {
-      A->moveBefore(V);
       return true;
     }
   }
@@ -2728,8 +3424,7 @@
     unsigned i = 0;
 
     for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
-      ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
-      V.buildTree(ValsToReduce, ReductionOps);
+      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
 
       // Estimate cost.
       int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
@@ -2921,8 +3616,7 @@
       // Try to vectorize them.
       unsigned NumElts = (SameTypeIt - IncIt);
       DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
-      if (NumElts > 1 &&
-          tryToVectorizeList(ArrayRef<Value *>(IncIt, NumElts), R)) {
+      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
@@ -2938,7 +3632,7 @@
 
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
     // We may go through BB multiple times so skip the one we have checked.
-    if (!VisitedInstrs.insert(it))
+    if (!VisitedInstrs.insert(it).second)
       continue;
 
     if (isa<DbgInfoIntrinsic>(it))
@@ -3002,6 +3696,21 @@
           }
         }
 
+    // Try to vectorize horizontal reductions feeding into a return.
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
+      if (RI->getNumOperands() != 0)
+        if (BinaryOperator *BinOp =
+                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
+          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
+          if (tryToVectorizePair(BinOp->getOperand(0),
+                                 BinOp->getOperand(1), R)) {
+            Changed = true;
+            it = BB->begin();
+            e = BB->end();
+            continue;
+          }
+        }
+
     // Try to vectorize trees that start at compare instructions.
     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
@@ -3014,15 +3723,15 @@
       }
 
       for (int i = 0; i < 2; ++i) {
-         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
-            if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
-              Changed = true;
-              // We would like to start over since some instructions are deleted
-              // and the iterator may become invalid value.
-              it = BB->begin();
-              e = BB->end();
-            }
-         }
+        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+            Changed = true;
+            // We would like to start over since some instructions are deleted
+            // and the iterator may become invalid value.
+            it = BB->begin();
+            e = BB->end();
+          }
+        }
       }
       continue;
     }
@@ -3064,8 +3773,8 @@
     // Process the stores in chunks of 16.
     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
       unsigned Len = std::min<unsigned>(CE - CI, 16);
-      ArrayRef<StoreInst *> Chunk(&it->second[CI], Len);
-      Changed |= vectorizeStores(Chunk, -SLPCostThreshold, R);
+      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
+                                 -SLPCostThreshold, R);
     }
   }
   return Changed;
@@ -3078,6 +3787,7 @@
 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

diff --git a/llvm-tblgen-rules.mk b/llvm-tblgen-rules.mk
index 57be1a7..ba59471 100644
--- a/llvm-tblgen-rules.mk
+++ b/llvm-tblgen-rules.mk

@@ -41,51 +41,51 @@
 ifeq ($(tblgen_source_dir),$(LLVM_ROOT_PATH)/lib/Target/ARM/MCTargetDesc)
 $(generated_sources)/%GenRegisterInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenRegisterInfo.inc: $(tblgen_source_dir)/../%.td \
-                                       $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                       $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out, register-info)
 
 $(generated_sources)/%GenInstrInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenInstrInfo.inc: $(tblgen_source_dir)/../%.td \
-                                    $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                    $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,instr-info)
 
 $(generated_sources)/%GenSubtargetInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenSubtargetInfo.inc: $(tblgen_source_dir)/../%.td \
-                                        $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                        $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,subtarget)
 endif
 
 ifeq ($(tblgen_source_dir),$(LLVM_ROOT_PATH)/lib/Target/X86/MCTargetDesc)
 $(generated_sources)/%GenRegisterInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenRegisterInfo.inc: $(tblgen_source_dir)/../%.td \
-                                       $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                       $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out, register-info)
 
 $(generated_sources)/%GenInstrInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenInstrInfo.inc: $(tblgen_source_dir)/../%.td \
-                                    $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                    $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,instr-info)
 
 $(generated_sources)/%GenSubtargetInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenSubtargetInfo.inc: $(tblgen_source_dir)/../%.td \
-                                        $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                        $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,subtarget)
 endif
 
 ifeq ($(tblgen_source_dir),$(LLVM_ROOT_PATH)/lib/Target/Mips/MCTargetDesc)
 $(generated_sources)/%GenRegisterInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenRegisterInfo.inc: $(tblgen_source_dir)/../%.td \
-                                       $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                       $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out, register-info)
 
 $(generated_sources)/%GenInstrInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenInstrInfo.inc: $(tblgen_source_dir)/../%.td \
-                                    $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                    $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,instr-info)
 
 $(generated_sources)/%GenSubtargetInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenSubtargetInfo.inc: $(tblgen_source_dir)/../%.td \
-                                        $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                        $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,subtarget)
 endif
 
@@ -93,112 +93,113 @@
 ifneq ($(filter %GenRegisterInfo.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenRegisterInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenRegisterInfo.inc: $(tblgen_source_dir)/%.td \
-                                       $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                       $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,register-info)
 endif
 
 ifneq ($(filter %GenInstrInfo.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenInstrInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenInstrInfo.inc: $(tblgen_source_dir)/%.td \
-                                    $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                    $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,instr-info)
 endif
 
 ifneq ($(filter %GenAsmWriter.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenAsmWriter.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenAsmWriter.inc: $(tblgen_source_dir)/%.td \
-                                    $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                    $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,asm-writer)
 endif
 
 ifneq ($(filter %GenAsmWriter1.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenAsmWriter1.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenAsmWriter1.inc: $(tblgen_source_dir)/%.td \
-                                     $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                     $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,asm-writer -asmwriternum=1)
 endif
 
 ifneq ($(filter %GenAsmMatcher.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenAsmMatcher.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenAsmMatcher.inc: $(tblgen_source_dir)/%.td \
-                                     $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                     $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,asm-matcher)
 endif
 
+# TODO(srhines): Is this needed?
 ifneq ($(filter %GenCodeEmitter.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenCodeEmitter.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenCodeEmitter.inc: $(tblgen_source_dir)/%.td \
-                                      $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                      $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,emitter)
 endif
 
 ifneq ($(filter %GenMCCodeEmitter.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenMCCodeEmitter.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenMCCodeEmitter.inc: $(tblgen_source_dir)/%.td \
-                                        $(tblgen_td_deps) | $(LLVM_TBLGEN)
-	$(call transform-td-to-out,emitter -mc-emitter)
+                                        $(tblgen_td_deps) $(LLVM_TBLGEN)
+	$(call transform-td-to-out,emitter)
 endif
 
 ifneq ($(filter %GenMCPseudoLowering.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenMCPseudoLowering.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenMCPseudoLowering.inc: $(tblgen_source_dir)/%.td \
-                                           $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                           $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,pseudo-lowering)
 endif
 
 ifneq ($(filter %GenDAGISel.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenDAGISel.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenDAGISel.inc: $(tblgen_source_dir)/%.td \
-                                  $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                  $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,dag-isel)
 endif
 
 ifneq ($(filter %GenDisassemblerTables.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenDisassemblerTables.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenDisassemblerTables.inc: $(tblgen_source_dir)/%.td \
-                                             $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                             $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,disassembler)
 endif
 
 ifneq ($(filter %GenEDInfo.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenEDInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenEDInfo.inc: $(tblgen_source_dir)/%.td \
-                                 $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                 $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,enhanced-disassembly-info)
 endif
 
 ifneq ($(filter %GenFastISel.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenFastISel.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenFastISel.inc: $(tblgen_source_dir)/%.td \
-                                   $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                   $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,fast-isel)
 endif
 
 ifneq ($(filter %GenSubtargetInfo.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenSubtargetInfo.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenSubtargetInfo.inc: $(tblgen_source_dir)/%.td \
-                                        $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                        $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,subtarget)
 endif
 
 ifneq ($(filter %GenCallingConv.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenCallingConv.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenCallingConv.inc: $(tblgen_source_dir)/%.td \
-                                      $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                      $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,callingconv)
 endif
 
 ifneq ($(filter %GenIntrinsics.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/%GenIntrinsics.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/%GenIntrinsics.inc: $(tblgen_source_dir)/%.td \
-                                     $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                     $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,tgt_intrinsics)
 endif
 
 ifneq ($(findstring ARMGenDecoderTables.inc,$(tblgen_gen_tables)),)
 $(generated_sources)/ARMGenDecoderTables.inc: TBLGEN_LOCAL_MODULE := $(LOCAL_MODULE)
 $(generated_sources)/ARMGenDecoderTables.inc: $(tblgen_source_dir)/ARM.td \
-                                          $(tblgen_td_deps) | $(LLVM_TBLGEN)
+                                          $(tblgen_td_deps) $(LLVM_TBLGEN)
 	$(call transform-td-to-out,arm-decoder)
 endif
 

diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index e23a277..8d244fd 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt

@@ -6,7 +6,8 @@
   if(IS_DIRECTORY ${entry} AND EXISTS ${entry}/CMakeLists.txt)
     if((NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/dragonegg) AND
-       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libcxx))
+       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libcxx) AND
+       (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libcxxabi))
       add_subdirectory(${entry})
     endif()
   endif()
@@ -19,6 +20,7 @@
   # fixed.
   if(NOT MSVC)
     add_llvm_external_project(libcxx)
+    add_llvm_external_project(libcxxabi)
   endif()
   if(NOT LLVM_BUILD_EXTERNAL_COMPILER_RT)
     add_llvm_external_project(compiler-rt)

diff --git a/shared_llvm.mk b/shared_llvm.mk
index 11a1cfe..e9bedb5 100644
--- a/shared_llvm.mk
+++ b/shared_llvm.mk

@@ -55,6 +55,7 @@
   libLLVMipa \
   libLLVMAnalysis \
   libLLVMTarget \
+  libLLVMMCDisassembler \
   libLLVMMC \
   libLLVMMCParser \
   libLLVMCore \
@@ -66,9 +67,7 @@
 
 llvm_host_static_libraries := \
   libLLVMExecutionEngine \
-  libLLVMMCDisassembler \
   libLLVMRuntimeDyld \
-  libLLVMJIT \
   libLLVMMCJIT
 
 ifeq (true,$(FORCE_BUILD_LLVM_COMPONENTS))

diff --git a/test/Analysis/BasicAA/2008-04-15-Byval.ll b/test/Analysis/BasicAA/2008-04-15-Byval.ll
index 428189a..2ea0314 100644
--- a/test/Analysis/BasicAA/2008-04-15-Byval.ll
+++ b/test/Analysis/BasicAA/2008-04-15-Byval.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts -S | FileCheck %s
+; RUN: opt < %s -O3 -S | FileCheck %s
 ; ModuleID = 'small2.c'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"

diff --git a/test/Analysis/BasicAA/assume.ll b/test/Analysis/BasicAA/assume.ll
new file mode 100644
index 0000000..e163b5a
--- /dev/null
+++ b/test/Analysis/BasicAA/assume.ll

@@ -0,0 +1,23 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #0
+declare void @llvm.assume(i1) #0
+
+define void @test1(i8* %P, i8* %Q) nounwind ssp {
+  tail call void @llvm.assume(i1 true)
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+  ret void
+
+; CHECK-LABEL: Function: test1:
+
+; CHECK: MayAlias:	i8* %P, i8* %Q
+; CHECK: NoModRef:  Ptr: i8* %P	<->  tail call void @llvm.assume(i1 true)
+; CHECK: NoModRef:  Ptr: i8* %Q	<->  tail call void @llvm.assume(i1 true)
+; CHECK: Both ModRef:  Ptr: i8* %P	<->  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: Both ModRef:  Ptr: i8* %Q	<->  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef:   tail call void @llvm.assume(i1 true) <->   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
+; CHECK: NoModRef:   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <->   tail call void @llvm.assume(i1 true)
+}
+
+attributes #0 = { nounwind }

diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll
index 682e4b6..693634c 100644
--- a/test/Analysis/BasicAA/cs-cs.ll
+++ b/test/Analysis/BasicAA/cs-cs.ll

@@ -8,6 +8,8 @@
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
 
+declare void @a_readonly_func(i8 *) noinline nounwind readonly
+
 define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
 entry:
   %q = getelementptr i8* %p, i64 16
@@ -218,4 +220,17 @@
 ; CHECK: Both ModRef:   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <->   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false)
 }
 
+define void @test6(i8* %P) nounwind ssp {
+  call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
+  call void @a_readonly_func(i8* %P)
+  ret void
+
+; CHECK-LABEL: Function: test6:
+
+; CHECK: Just Mod:  Ptr: i8* %P        <->  call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
+; CHECK: Just Ref:  Ptr: i8* %P        <->  call void @a_readonly_func(i8* %P)
+; CHECK: Just Mod:   call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) <->   call void @a_readonly_func(i8* %P)
+; CHECK: Just Ref:   call void @a_readonly_func(i8* %P) <->   call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false)
+}
+
 attributes #0 = { nounwind }

diff --git a/test/Analysis/BasicAA/gcsetest.ll b/test/Analysis/BasicAA/gcsetest.ll
index db557b7..64792eb 100644
--- a/test/Analysis/BasicAA/gcsetest.ll
+++ b/test/Analysis/BasicAA/gcsetest.ll

@@ -1,5 +1,5 @@
-; Test that GCSE uses basicaa to do alias analysis, which is capable of 
-; disambiguating some obvious cases.  All loads should be removable in 
+; Test that GCSE uses basicaa to do alias analysis, which is capable of
+; disambiguating some obvious cases.  All loads should be removable in
 ; this testcase.
 
 ; RUN: opt < %s -basicaa -gvn -instcombine -dce -S | FileCheck %s

diff --git a/test/Analysis/BasicAA/modref.ll b/test/Analysis/BasicAA/modref.ll
index 8421faf..0d8bf71 100644
--- a/test/Analysis/BasicAA/modref.ll
+++ b/test/Analysis/BasicAA/modref.ll

@@ -8,20 +8,20 @@
 define i32 @test0(i8* %P) {
   %A = alloca i32
   call void @external(i32* %A)
-  
+
   store i32 0, i32* %A
-  
+
   call void @llvm.memset.p0i8.i32(i8* %P, i8 0, i32 42, i32 1, i1 false)
-  
+
   %B = load i32* %A
   ret i32 %B
-  
-; CHECK: @test0
+
+; CHECK-LABEL: @test0
 ; CHECK: ret i32 0
 }
 
 define i8 @test1() {
-; CHECK: @test1
+; CHECK-LABEL: @test1
   %A = alloca i8
   %B = alloca i8
 
@@ -35,7 +35,7 @@
 }
 
 define i8 @test2(i8* %P) {
-; CHECK: @test2
+; CHECK-LABEL: @test2
   %P2 = getelementptr i8* %P, i32 127
   store i8 1, i8* %P2  ;; Not dead across memset
   call void @llvm.memset.p0i8.i8(i8* %P, i8 2, i8 127, i32 0, i1 false)
@@ -45,12 +45,12 @@
 }
 
 define i8 @test2a(i8* %P) {
-; CHECK: @test2
+; CHECK-LABEL: @test2
   %P2 = getelementptr i8* %P, i32 126
-  
+
   ;; FIXME: DSE isn't zapping this dead store.
   store i8 1, i8* %P2  ;; Dead, clobbered by memset.
-  
+
   call void @llvm.memset.p0i8.i8(i8* %P, i8 2, i8 127, i32 0, i1 false)
   %A = load i8* %P2
   ret i8 %A
@@ -59,11 +59,11 @@
 }
 
 define void @test3(i8* %P, i8 %X) {
-; CHECK: @test3
+; CHECK-LABEL: @test3
 ; CHECK-NOT: store
 ; CHECK-NOT: %Y
   %Y = add i8 %X, 1     ;; Dead, because the only use (the store) is dead.
-  
+
   %P2 = getelementptr i8* %P, i32 2
   store i8 %Y, i8* %P2  ;; Not read by lifetime.end, should be removed.
 ; CHECK: store i8 2, i8* %P2
@@ -75,9 +75,9 @@
 }
 
 define void @test3a(i8* %P, i8 %X) {
-; CHECK: @test3a
+; CHECK-LABEL: @test3a
   %Y = add i8 %X, 1     ;; Dead, because the only use (the store) is dead.
-  
+
   %P2 = getelementptr i8* %P, i32 2
   store i8 %Y, i8* %P2
 ; CHECK-NEXT: call void @llvm.lifetime.end
@@ -95,7 +95,7 @@
   %tmp2 = load i32* @G1
   %sub = sub i32 %tmp2, %tmp
   ret i32 %sub
-; CHECK: @test4
+; CHECK-LABEL: @test4
 ; CHECK-NOT: load
 ; CHECK: memset.p0i8.i32
 ; CHECK-NOT: load
@@ -123,7 +123,7 @@
   %y = load i8* %a
   %z = add i8 %x, %y
   ret i8 %z
-; CHECK: @test6
+; CHECK-LABEL: @test6
 ; CHECK: load i8* %a
 ; CHECK-NOT: load
 ; CHECK: ret
@@ -139,7 +139,7 @@
   call void @test7decl(i32* %add.ptr)
   %tmp = load i32* %x, align 4
   ret i32 %tmp
-; CHECK: @test7(
+; CHECK-LABEL: @test7(
 ; CHECK: store i32 0
 ; CHECK: call void @test7decl
 ; CHECK: load i32*

diff --git a/test/Analysis/BasicAA/phi-aa.ll b/test/Analysis/BasicAA/phi-aa.ll
index 74279e1..c1100f1 100644
--- a/test/Analysis/BasicAA/phi-aa.ll
+++ b/test/Analysis/BasicAA/phi-aa.ll

@@ -39,6 +39,7 @@
 
 ; CHECK-LABEL: pr18068
 ; CHECK: MayAlias: i32* %0, i32* %arrayidx5
+; CHECK: NoAlias: i32* %arrayidx13, i32* %arrayidx5
 
 define i32 @pr18068(i32* %jj7, i32* %j) {
 entry:

diff --git a/test/Analysis/BasicAA/zext.ll b/test/Analysis/BasicAA/zext.ll
new file mode 100644
index 0000000..b59d16c
--- /dev/null
+++ b/test/Analysis/BasicAA/zext.ll

@@ -0,0 +1,209 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: test_with_zext
+; CHECK:  NoAlias: i8* %a, i8* %b
+
+define void @test_with_zext() {
+  %1 = tail call i8* @malloc(i64 120)
+  %a = getelementptr inbounds i8* %1, i64 8
+  %2 = getelementptr inbounds i8* %1, i64 16
+  %3 = zext i32 3 to i64
+  %b = getelementptr inbounds i8* %2, i64 %3
+  ret void
+}
+
+; CHECK-LABEL: test_with_lshr
+; CHECK:  NoAlias: i8* %a, i8* %b
+
+define void @test_with_lshr(i64 %i) {
+  %1 = tail call i8* @malloc(i64 120)
+  %a = getelementptr inbounds i8* %1, i64 8
+  %2 = getelementptr inbounds i8* %1, i64 16
+  %3 = lshr i64 %i, 2
+  %b = getelementptr inbounds i8* %2, i64 %3
+  ret void
+}
+
+; CHECK-LABEL: test_with_a_loop
+; CHECK:  NoAlias: i8* %a, i8* %b
+
+define void @test_with_a_loop(i8* %mem) {
+  br label %for.loop
+
+for.loop:
+  %i = phi i32 [ 0, %0 ], [ %i.plus1, %for.loop ]
+  %a = getelementptr inbounds i8* %mem, i64 8
+  %a.plus1 = getelementptr inbounds i8* %mem, i64 16
+  %i.64 = zext i32 %i to i64
+  %b = getelementptr inbounds i8* %a.plus1, i64 %i.64
+  %i.plus1 = add nuw nsw i32 %i, 1
+  %cmp = icmp eq i32 %i.plus1, 10
+  br i1 %cmp, label %for.loop.exit, label %for.loop
+
+for.loop.exit:
+  ret void
+}
+
+; CHECK-LABEL: test_with_varying_base_pointer_in_loop
+; CHECK:  NoAlias: i8* %a, i8* %b
+
+define void @test_with_varying_base_pointer_in_loop(i8* %mem.orig) {
+  br label %for.loop
+
+for.loop:
+  %mem = phi i8* [ %mem.orig, %0 ], [ %mem.plus1, %for.loop ]
+  %i = phi i32 [ 0, %0 ], [ %i.plus1, %for.loop ]
+  %a = getelementptr inbounds i8* %mem, i64 8
+  %a.plus1 = getelementptr inbounds i8* %mem, i64 16
+  %i.64 = zext i32 %i to i64
+  %b = getelementptr inbounds i8* %a.plus1, i64 %i.64
+  %i.plus1 = add nuw nsw i32 %i, 1
+  %mem.plus1 = getelementptr inbounds i8* %mem, i64 8
+  %cmp = icmp eq i32 %i.plus1, 10
+  br i1 %cmp, label %for.loop.exit, label %for.loop
+
+for.loop.exit:
+  ret void
+}
+
+; CHECK-LABEL: test_sign_extension
+; CHECK:  PartialAlias: i64* %b.i64, i8* %a
+
+define void @test_sign_extension(i32 %p) {
+  %1 = tail call i8* @malloc(i64 120)
+  %p.64 = zext i32 %p to i64
+  %a = getelementptr inbounds i8* %1, i64 %p.64
+  %p.minus1 = add i32 %p, -1
+  %p.minus1.64 = zext i32 %p.minus1 to i64
+  %b.i8 = getelementptr inbounds i8* %1, i64 %p.minus1.64
+  %b.i64 = bitcast i8* %b.i8 to i64*
+  ret void
+}
+
+; CHECK-LABEL: test_fe_tools
+; CHECK:  PartialAlias: i32* %a, i32* %b
+
+define void @test_fe_tools([8 x i32]* %values) {
+  br label %reorder
+
+for.loop:
+  %i = phi i32 [ 0, %reorder ], [ %i.next, %for.loop ]
+  %idxprom = zext i32 %i to i64
+  %b = getelementptr inbounds [8 x i32]* %values, i64 0, i64 %idxprom
+  %i.next = add nuw nsw i32 %i, 1
+  %1 = icmp eq i32 %i.next, 10
+  br i1 %1, label %for.loop.exit, label %for.loop
+
+reorder:
+  %a = getelementptr inbounds [8 x i32]* %values, i64 0, i64 1
+  br label %for.loop
+
+for.loop.exit:
+  ret void
+}
+
+@b = global i32 0, align 4
+@d = global i32 0, align 4
+
+; CHECK-LABEL: test_spec2006
+; CHECK:  PartialAlias: i32** %x, i32** %y
+
+define void @test_spec2006() {
+  %h = alloca [1 x [2 x i32*]], align 16
+  %d.val = load i32* @d, align 4
+  %d.promoted = sext i32 %d.val to i64
+  %1 = icmp slt i32 %d.val, 2
+  br i1 %1, label %.lr.ph, label %3
+
+.lr.ph:                                           ; preds = %0
+  br label %2
+
+; <label>:2                                       ; preds = %.lr.ph, %2
+  %i = phi i32 [ %d.val, %.lr.ph ], [ %i.plus1, %2 ]
+  %i.promoted = sext i32 %i to i64
+  %x = getelementptr inbounds [1 x [2 x i32*]]* %h, i64 0, i64 %d.promoted, i64 %i.promoted
+  %i.plus1 = add nsw i32 %i, 1
+  %cmp = icmp slt i32 %i.plus1, 2
+  br i1 %cmp, label %2, label %3
+
+; <label>:3                                      ; preds = %._crit_edge, %0
+  %y = getelementptr inbounds [1 x [2 x i32*]]* %h, i64 0, i64 0, i64 1
+  ret void
+}
+
+; CHECK-LABEL: test_modulo_analysis_easy_case
+; CHECK:  NoAlias: i32** %x, i32** %y
+
+define void @test_modulo_analysis_easy_case(i64 %i) {
+  %h = alloca [1 x [2 x i32*]], align 16
+  %x = getelementptr inbounds [1 x [2 x i32*]]* %h, i64 0, i64 %i, i64 0
+  %y = getelementptr inbounds [1 x [2 x i32*]]* %h, i64 0, i64 0, i64 1
+  ret void
+}
+
+; CHECK-LABEL: test_modulo_analysis_in_loop
+; CHECK:  NoAlias: i32** %x, i32** %y
+
+define void @test_modulo_analysis_in_loop() {
+  %h = alloca [1 x [2 x i32*]], align 16
+  br label %for.loop
+
+for.loop:
+  %i = phi i32 [ 0, %0 ], [ %i.plus1, %for.loop ]
+  %i.promoted = sext i32 %i to i64
+  %x = getelementptr inbounds [1 x [2 x i32*]]* %h, i64 0, i64 %i.promoted, i64 0
+  %y = getelementptr inbounds [1 x [2 x i32*]]* %h, i64 0, i64 0, i64 1
+  %i.plus1 = add nsw i32 %i, 1
+  %cmp = icmp slt i32 %i.plus1, 2
+  br i1 %cmp, label %for.loop, label %for.loop.exit
+
+for.loop.exit:
+  ret void
+}
+
+; CHECK-LABEL: test_modulo_analysis_with_global
+; CHECK:  PartialAlias: i32** %x, i32** %y
+
+define void @test_modulo_analysis_with_global() {
+  %h = alloca [1 x [2 x i32*]], align 16
+  %b = load i32* @b, align 4
+  %b.promoted = sext i32 %b to i64
+  br label %for.loop
+
+for.loop:
+  %i = phi i32 [ 0, %0 ], [ %i.plus1, %for.loop ]
+  %i.promoted = sext i32 %i to i64
+  %x = getelementptr inbounds [1 x [2 x i32*]]* %h, i64 0, i64 %i.promoted, i64 %b.promoted
+  %y = getelementptr inbounds [1 x [2 x i32*]]* %h, i64 0, i64 0, i64 1
+  %i.plus1 = add nsw i32 %i, 1
+  %cmp = icmp slt i32 %i.plus1, 2
+  br i1 %cmp, label %for.loop, label %for.loop.exit
+
+for.loop.exit:
+  ret void
+}
+
+; CHECK-LABEL: test_const_eval
+; CHECK: NoAlias: i8* %a, i8* %b
+define void @test_const_eval(i8* %ptr, i64 %offset) {
+  %a = getelementptr inbounds i8* %ptr, i64 %offset
+  %a.dup = getelementptr inbounds i8* %ptr, i64 %offset
+  %three = zext i32 3 to i64
+  %b = getelementptr inbounds i8* %a.dup, i64 %three
+  ret void
+}
+
+; CHECK-LABEL: test_const_eval_scaled
+; CHECK: MustAlias: i8* %a, i8* %b
+define void @test_const_eval_scaled(i8* %ptr) {
+  %three = zext i32 3 to i64
+  %six = mul i64 %three, 2
+  %a = getelementptr inbounds i8* %ptr, i64 %six
+  %b = getelementptr inbounds i8* %ptr, i64 6
+  ret void
+}
+
+; Function Attrs: nounwind
+declare noalias i8* @malloc(i64)

diff --git a/test/Analysis/CFLAliasAnalysis/arguments-globals.ll b/test/Analysis/CFLAliasAnalysis/arguments-globals.ll
new file mode 100644
index 0000000..18bbe8b
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/arguments-globals.ll

@@ -0,0 +1,20 @@
+; This testcase ensures that CFL AA gives conservative answers on variables
+; that involve arguments.
+; (Everything should alias everything, because args can alias globals, so the
+; aliasing sets of args+alloca+global should be combined)
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+; CHECK:     Function: test
+
+@g = external global i32
+
+define void @test(i1 %c, i32* %arg1, i32* %arg2) {
+  ; CHECK: 15 Total Alias Queries Performed
+  ; CHECK: 0 no alias responses
+  %A = alloca i32, align 4
+  %B = select i1 %c, i32* %arg1, i32* %arg2
+  %C = select i1 %c, i32* @g, i32* %A
+
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/arguments.ll b/test/Analysis/CFLAliasAnalysis/arguments.ll
new file mode 100644
index 0000000..f3e6679
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/arguments.ll

@@ -0,0 +1,15 @@
+; This testcase ensures that CFL AA gives conservative answers on variables
+; that involve arguments.
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+; CHECK:     Function: test
+
+define void @test(i1 %c, i32* %arg1, i32* %arg2) {
+  ; CHECK: 6 Total Alias Queries Performed
+  ; CHECK: 3 no alias responses
+  %a = alloca i32, align 4
+  %b = select i1 %c, i32* %arg1, i32* %arg2
+
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/basic-interproc-ret.ll b/test/Analysis/CFLAliasAnalysis/basic-interproc-ret.ll
new file mode 100644
index 0000000..d56a455
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/basic-interproc-ret.ll

@@ -0,0 +1,26 @@
+; This testcase ensures that CFL AA gives conservative answers on variables
+; that involve arguments.
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+; CHECK:     Function: test
+; CHECK: 4 Total Alias Queries Performed
+; CHECK: 3 no alias responses
+; ^ The 1 MayAlias is due to %arg1. Sadly, we don't currently have machinery
+; in place to check whether %arg1 aliases %a, because BasicAA takes care of 
+; that for us.
+
+define i32* @test2(i32* %arg1) {
+  store i32 0, i32* %arg1
+
+  %a = alloca i32, align 4
+  ret i32* %a
+}
+
+define void @test() {
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = call i32* @test2(i32* %a)
+
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/basic-interproc.ll b/test/Analysis/CFLAliasAnalysis/basic-interproc.ll
new file mode 100644
index 0000000..c0a5404
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/basic-interproc.ll

@@ -0,0 +1,24 @@
+; This testcase ensures that CFL AA gives conservative answers on variables
+; that involve arguments.
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+; CHECK:     Function: test
+; CHECK: 2 Total Alias Queries Performed
+; CHECK: 1 no alias responses
+; ^^ In @test2, %arg1 and %arg2 may alias
+
+define void @test2(i32* %arg1, i32* %arg2) {
+  store i32 0, i32* %arg1
+  store i32 0, i32* %arg2
+
+  ret void
+}
+
+define void @test() {
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  call void @test2(i32* %a, i32* %b)
+
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/const-expr-gep.ll b/test/Analysis/CFLAliasAnalysis/const-expr-gep.ll
new file mode 100644
index 0000000..9ae200b
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/const-expr-gep.ll

@@ -0,0 +1,21 @@
+; This testcase consists of alias relations which should be completely
+; resolvable by cfl-aa, but require analysis of getelementptr constant exprs.
+; Derived from BasicAA/2003-12-11-ConstExprGEP.ll
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+%T = type { i32, [10 x i8] }
+
+@G = external global %T
+
+; CHECK:     Function: test
+; CHECK-NOT:   May:
+
+define void @test() {
+  %D = getelementptr %T* @G, i64 0, i32 0
+  %E = getelementptr %T* @G, i64 0, i32 1, i64 5
+  %F = getelementptr i32* getelementptr (%T* @G, i64 0, i32 0), i64 0
+  %X = getelementptr [10 x i8]* getelementptr (%T* @G, i64 0, i32 1), i64 0, i64 5
+
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/constant-over-index.ll b/test/Analysis/CFLAliasAnalysis/constant-over-index.ll
new file mode 100644
index 0000000..fb44b95
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/constant-over-index.ll

@@ -0,0 +1,30 @@
+; RUN: opt < %s -cfl-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; CFL AA currently returns PartialAlias, BasicAA returns MayAlias, both seem
+; acceptable (although we might decide that we don't want PartialAlias, and if
+; so, we should update this test case accordingly).
+; CHECK: {{PartialAlias|MayAlias}}: double* %p.0.i.0, double* %p3
+
+; %p3 is equal to %p.0.i.0 on the second iteration of the loop,
+; so MayAlias is needed.
+
+define void @foo([3 x [3 x double]]* noalias %p) {
+entry:
+  %p3 = getelementptr [3 x [3 x double]]* %p, i64 0, i64 0, i64 3 ; row 0, column index 3: one past the last element of a 3-element row
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+
+  %p.0.i.0 = getelementptr [3 x [3 x double]]* %p, i64 0, i64 %i, i64 0 ; row %i, column 0
+
+  store volatile double 0.0, double* %p3
+  store volatile double 0.1, double* %p.0.i.0
+
+  %i.next = add i64 %i, 1
+  %cmp = icmp slt i64 %i.next, 3 ; the loop body runs for %i = 0, 1, 2
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/empty.ll b/test/Analysis/CFLAliasAnalysis/empty.ll
new file mode 100644
index 0000000..907fa48
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/empty.ll

@@ -0,0 +1,12 @@
+; RUN: opt < %s -cfl-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+; Two pointer arguments to the empty struct type {} are expected to be reported as NoAlias.
+; CHECK:      Function: foo:
+; CHECK-NEXT:   NoAlias: {}* %p, {}* %q
+
+define void @foo({}* %p, {}* %q) { ; stores an empty struct value through each argument
+  store {} {}, {}* %p
+  store {} {}, {}* %q
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/full-store-partial-alias.ll b/test/Analysis/CFLAliasAnalysis/full-store-partial-alias.ll
new file mode 100644
index 0000000..155fe13
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/full-store-partial-alias.ll

@@ -0,0 +1,37 @@
+; RUN: opt -S -tbaa -cfl-aa -gvn < %s | FileCheck -check-prefix=CFLAA %s
+; RUN: opt -S -tbaa -gvn < %s | FileCheck %s
+; Adapted from the BasicAA full-store-partial-alias.ll test.
+
+; CFL AA should notice that the store stores to the entire %u object,
+; so the %tmp5 load is PartialAlias with the store and suppress TBAA.
+; Without CFL AA, TBAA should say that %tmp5 is NoAlias with the store.
+
+target datalayout = "e-p:64:64:64"
+
+%union.anon = type { double }
+
+@u = global %union.anon { double -2.500000e-01 }, align 8
+@endianness_test = global i64 1, align 8
+
+define i32 @signbit(double %x) nounwind {
+; CFLAA: ret i32 %tmp5.lobit
+; CHECK:   ret i32 0
+entry:
+  %u = alloca %union.anon, align 8 ; local object; a different value from the global @u
+  %tmp9 = getelementptr inbounds %union.anon* %u, i64 0, i32 0 ; address of the double member
+  store double %x, double* %tmp9, align 8, !tbaa !0 ; writes the whole { double } object, tagged as double
+  %tmp2 = load i32* bitcast (i64* @endianness_test to i32*), align 8, !tbaa !3 ; first four bytes of @endianness_test, used as the index below
+  %idxprom = sext i32 %tmp2 to i64
+  %tmp4 = bitcast %union.anon* %u to [2 x i32]* ; view %u as two i32 halves
+  %arrayidx = getelementptr inbounds [2 x i32]* %tmp4, i64 0, i64 %idxprom
+  %tmp5 = load i32* %arrayidx, align 4, !tbaa !3 ; i32 read from inside %u, tagged as int
+  %tmp5.lobit = lshr i32 %tmp5, 31 ; keep only bit 31 of the loaded word
+  ret i32 %tmp5.lobit
+}
+
+!0 = metadata !{metadata !4, metadata !4, i64 0} ; access tag built from the "double" type node
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !5, metadata !5, i64 0} ; access tag built from the "int" type node
+!4 = metadata !{metadata !"double", metadata !1}
+!5 = metadata !{metadata !"int", metadata !1}

diff --git a/test/Analysis/CFLAliasAnalysis/gep-signed-arithmetic.ll b/test/Analysis/CFLAliasAnalysis/gep-signed-arithmetic.ll
new file mode 100644
index 0000000..a0195d7
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/gep-signed-arithmetic.ll

@@ -0,0 +1,17 @@
+; RUN: opt < %s -cfl-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+; Derived from BasicAA/2010-09-15-GEP-SignedArithmetic.ll
+
+target datalayout = "e-p:32:32:32"
+
+; CHECK: 1 partial alias response
+
+define i32 @test(i32* %tab, i32 %indvar) nounwind { ; the two pointers queried are %tab and %t.5
+  %tmp31 = mul i32 %indvar, -2
+  %tmp32 = add i32 %tmp31, 30 ; index = 30 - 2 * %indvar
+  %t.5 = getelementptr i32* %tab, i32 %tmp32 ; element of %tab at that computed index
+  %loada = load i32* %tab
+  store i32 0, i32* %t.5 ; sits between the two loads of %tab
+  %loadb = load i32* %tab
+  %rval = add i32 %loada, %loadb
+  ret i32 %rval
+}

diff --git a/test/Analysis/CFLAliasAnalysis/multilevel-combine.ll b/test/Analysis/CFLAliasAnalysis/multilevel-combine.ll
new file mode 100644
index 0000000..9bbc721
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/multilevel-combine.ll

@@ -0,0 +1,31 @@
+; This testcase ensures that CFL AA responds conservatively when we union
+; groups of pointers together through ternary/conditional operations.
+; Derived from:
+; void foo(bool c) {
+;   char a, b;
+;   char *m = c ? &a : &b;
+;   *m;
+; }
+;
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+%T = type { i32, [10 x i8] }
+
+; CHECK:     Function: test
+; Five pointer values (%M, %A, %B, %MS, %AP) presumably account for the 10 pairwise queries.
+define void @test(i1 %C) {
+; CHECK: 10 Total Alias Queries Performed
+; CHECK: 4 no alias responses
+  %M = alloca %T*, align 8 ; NoAlias with %A, %B, %MS, %AP
+  %A = alloca %T, align 8
+  %B = alloca %T, align 8
+
+  %MS = select i1 %C, %T* %B, %T* %A ; %B when %C is true, otherwise %A
+
+  store %T* %MS, %T** %M ; spill the selected pointer into %M
+
+  %AP = load %T** %M ; PartialAlias with %A, %B
+
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/multilevel.ll b/test/Analysis/CFLAliasAnalysis/multilevel.ll
new file mode 100644
index 0000000..9c9eb9a
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/multilevel.ll

@@ -0,0 +1,30 @@
+; This testcase ensures that CFL AA handles trivial cases with storing
+; pointers in pointers appropriately.
+; Derived from:
+; char a, b;
+; char *m = &a, *n = &b;
+; *m;
+; *n;
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+%T = type { i32, [10 x i8] }
+
+; CHECK:     Function: test
+; Six pointer values (%M, %N, %A, %B, %AP, %BP) presumably account for the 15 pairwise queries.
+define void @test() {
+; CHECK: 15 Total Alias Queries Performed
+; CHECK: 13 no alias responses
+  %M = alloca %T*, align 8
+  %N = alloca %T*, align 8
+  %A = alloca %T, align 8
+  %B = alloca %T, align 8
+
+  store %T* %A, %T** %M ; %M now holds the address of %A
+  store %T* %B, %T** %N ; %N now holds the address of %B
+
+  %AP = load %T** %M ; PartialAlias with %A
+  %BP = load %T** %N ; PartialAlias with %B
+
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/must-and-partial.ll b/test/Analysis/CFLAliasAnalysis/must-and-partial.ll
new file mode 100644
index 0000000..df7de38
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/must-and-partial.ll

@@ -0,0 +1,39 @@
+; RUN: opt < %s -cfl-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; When merging MustAlias and PartialAlias, merge to PartialAlias
+; instead of MayAlias.
+
+; Both functions pick between %base and %base + 1, then compare that pointer with an i16 view of %base.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; CHECK: PartialAlias:  i16* %bigbase0, i8* %phi
+define i8 @test0(i8* %base, i1 %x) {
+entry:
+  %baseplusone = getelementptr i8* %base, i64 1 ; one byte past %base
+  br i1 %x, label %red, label %green
+red:
+  br label %green
+green:
+  %phi = phi i8* [ %baseplusone, %red ], [ %base, %entry ] ; %base + 1 when arriving from %red, %base otherwise
+  store i8 0, i8* %phi
+
+  %bigbase0 = bitcast i8* %base to i16* ; two-byte view starting at %base
+  store i16 -1, i16* %bigbase0
+
+  %loaded = load i8* %phi
+  ret i8 %loaded
+}
+
+; CHECK: PartialAlias:  i16* %bigbase1, i8* %sel
+define i8 @test1(i8* %base, i1 %x) {
+entry:
+  %baseplusone = getelementptr i8* %base, i64 1 ; one byte past %base
+  %sel = select i1 %x, i8* %baseplusone, i8* %base ; same choice as in @test0, expressed as a select
+  store i8 0, i8* %sel
+
+  %bigbase1 = bitcast i8* %base to i16* ; two-byte view starting at %base
+  store i16 -1, i16* %bigbase1
+
+  %loaded = load i8* %sel
+  ret i8 %loaded
+}

diff --git a/test/Analysis/CFLAliasAnalysis/phi-and-select.ll b/test/Analysis/CFLAliasAnalysis/phi-and-select.ll
new file mode 100644
index 0000000..a0e71a7
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/phi-and-select.ll

@@ -0,0 +1,36 @@
+; RUN: opt < %s -cfl-aa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+; Derived from (a subset of) BasicAA/phi-and-select.ll
+
+; CHECK: Function: qux
+; CHECK:  NoAlias: double* %a, double* %b
+; CHECK: ===== Alias Analysis Evaluator Report =====
+
+; Two PHIs with disjoint sets of inputs.
+define void @qux(i1 %m, double* noalias %x, double* noalias %y,
+                 i1 %n, double* noalias %v, double* noalias %w) {
+entry:
+  br i1 %m, label %true, label %false
+
+true:
+  br label %exit
+
+false:
+  br label %exit
+
+exit:
+  %a = phi double* [ %x, %true ], [ %y, %false ] ; %x or %y, depending on %m
+  br i1 %n, label %ntrue, label %nfalse
+
+ntrue:
+  br label %nexit
+
+nfalse:
+  br label %nexit
+
+nexit:
+  %b = phi double* [ %v, %ntrue ], [ %w, %nfalse ] ; %v or %w, depending on %n
+  store volatile double 0.0, double* %a
+  store volatile double 1.0, double* %b
+  ret void
+}
+

diff --git a/test/Analysis/CFLAliasAnalysis/simple.ll b/test/Analysis/CFLAliasAnalysis/simple.ll
new file mode 100644
index 0000000..7bc455a
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/simple.ll

@@ -0,0 +1,18 @@
+; This testcase consists of alias relations which should be completely
+; resolvable by cfl-aa (derived from BasicAA/2003-11-04-SimpleCases.ll).
+
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+%T = type { i32, [10 x i8] }
+
+; CHECK:     Function: test
+; CHECK-NOT:   May:
+; NOTE(review): sibling tests match result labels spelled "MayAlias:", so the pattern above may never fire - confirm.
+define void @test(%T* %P) {
+  %A = getelementptr %T* %P, i64 0 ; %P itself
+  %B = getelementptr %T* %P, i64 0, i32 0 ; field 0 (the i32)
+  %C = getelementptr %T* %P, i64 0, i32 1 ; field 1 (the i8 array)
+  %D = getelementptr %T* %P, i64 0, i32 1, i64 0 ; element 0 of field 1
+  %E = getelementptr %T* %P, i64 0, i32 1, i64 5 ; element 5 of field 1
+  ret void
+}

diff --git a/test/Analysis/CFLAliasAnalysis/va.ll b/test/Analysis/CFLAliasAnalysis/va.ll
new file mode 100644
index 0000000..3094cb0
--- /dev/null
+++ b/test/Analysis/CFLAliasAnalysis/va.ll

@@ -0,0 +1,29 @@
+; RUN: opt < %s -cfl-aa -aa-eval -print-may-aliases -disable-output 2>&1 | FileCheck %s
+
+; CHECK-LABEL: Function: test1
+; CHECK: 0 no alias responses
+; The expectation above is that no pointer pair in this varargs function is answered NoAlias.
+define i32 @test1(i32 %X, ...) {
+  ; Initialize variable argument processing
+  %ap = alloca i8*
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  ; Read a single integer argument
+  %tmp = va_arg i8** %ap, i32
+
+  ; Demonstrate usage of llvm.va_copy and llvm.va_end
+  %aq = alloca i8*
+  %aq2 = bitcast i8** %aq to i8*
+  call void @llvm.va_copy(i8* %aq2, i8* %ap2) ; destination %aq2 first, source %ap2 second
+  call void @llvm.va_end(i8* %aq2)
+
+  ; Stop processing of arguments.
+  call void @llvm.va_end(i8* %ap2)
+  ret i32 %tmp
+}
+
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_copy(i8*, i8*)
+declare void @llvm.va_end(i8*)
+

diff --git a/test/Analysis/CostModel/ARM/cast.ll b/test/Analysis/CostModel/ARM/cast.ll
index 662110f..18d6e84 100644
--- a/test/Analysis/CostModel/ARM/cast.ll
+++ b/test/Analysis/CostModel/ARM/cast.ll

@@ -221,35 +221,35 @@
   %r96 = fptoui <2 x float> undef to <2 x i32>
   ; CHECK: cost of 1 {{.*}} fptosi
   %r97 = fptosi <2 x float> undef to <2 x i32>
-  ; CHECK: cost of 28 {{.*}} fptoui
+  ; CHECK: cost of 32 {{.*}} fptoui
   %r98 = fptoui <2 x float> undef to <2 x i64>
-  ; CHECK: cost of 28 {{.*}} fptosi
+  ; CHECK: cost of 32 {{.*}} fptosi
   %r99 = fptosi <2 x float> undef to <2 x i64>
 
-  ; CHECK: cost of 8 {{.*}} fptoui
+  ; CHECK: cost of 16 {{.*}} fptoui
   %r100 = fptoui <2 x double> undef to <2 x i1>
-  ; CHECK: cost of 8 {{.*}} fptosi
+  ; CHECK: cost of 16 {{.*}} fptosi
   %r101 = fptosi <2 x double> undef to <2 x i1>
-  ; CHECK: cost of 8 {{.*}} fptoui
+  ; CHECK: cost of 16 {{.*}} fptoui
   %r102 = fptoui <2 x double> undef to <2 x i8>
-  ; CHECK: cost of 8 {{.*}} fptosi
+  ; CHECK: cost of 16 {{.*}} fptosi
   %r103 = fptosi <2 x double> undef to <2 x i8>
-  ; CHECK: cost of 8 {{.*}} fptoui
+  ; CHECK: cost of 16 {{.*}} fptoui
   %r104 = fptoui <2 x double> undef to <2 x i16>
-  ; CHECK: cost of 8 {{.*}} fptosi
+  ; CHECK: cost of 16 {{.*}} fptosi
   %r105 = fptosi <2 x double> undef to <2 x i16>
   ; CHECK: cost of 2 {{.*}} fptoui
   %r106 = fptoui <2 x double> undef to <2 x i32>
   ; CHECK: cost of 2 {{.*}} fptosi
   %r107 = fptosi <2 x double> undef to <2 x i32>
-  ; CHECK: cost of 28 {{.*}} fptoui
+  ; CHECK: cost of 32 {{.*}} fptoui
   %r108 = fptoui <2 x double> undef to <2 x i64>
-  ; CHECK: cost of 28 {{.*}} fptosi
+  ; CHECK: cost of 32 {{.*}} fptosi
   %r109 = fptosi <2 x double> undef to <2 x i64>
 
-  ; CHECK: cost of 16 {{.*}} fptoui
+  ; CHECK: cost of 32 {{.*}} fptoui
   %r110 = fptoui <4 x float> undef to <4 x i1>
-  ; CHECK: cost of 16 {{.*}} fptosi
+  ; CHECK: cost of 32 {{.*}} fptosi
   %r111 = fptosi <4 x float> undef to <4 x i1>
   ; CHECK: cost of 3 {{.*}} fptoui
   %r112 = fptoui <4 x float> undef to <4 x i8>
@@ -263,39 +263,39 @@
   %r116 = fptoui <4 x float> undef to <4 x i32>
   ; CHECK: cost of 1 {{.*}} fptosi
   %r117 = fptosi <4 x float> undef to <4 x i32>
-  ; CHECK: cost of 56 {{.*}} fptoui
+  ; CHECK: cost of 64 {{.*}} fptoui
   %r118 = fptoui <4 x float> undef to <4 x i64>
-  ; CHECK: cost of 56 {{.*}} fptosi
+  ; CHECK: cost of 64 {{.*}} fptosi
   %r119 = fptosi <4 x float> undef to <4 x i64>
 
-  ; CHECK: cost of 16 {{.*}} fptoui
+  ; CHECK: cost of 32 {{.*}} fptoui
   %r120 = fptoui <4 x double> undef to <4 x i1>
-  ; CHECK: cost of 16 {{.*}} fptosi
+  ; CHECK: cost of 32 {{.*}} fptosi
   %r121 = fptosi <4 x double> undef to <4 x i1>
-  ; CHECK: cost of 16 {{.*}} fptoui
+  ; CHECK: cost of 32 {{.*}} fptoui
   %r122 = fptoui <4 x double> undef to <4 x i8>
-  ; CHECK: cost of 16 {{.*}} fptosi
+  ; CHECK: cost of 32 {{.*}} fptosi
   %r123 = fptosi <4 x double> undef to <4 x i8>
-  ; CHECK: cost of 16 {{.*}} fptoui
+  ; CHECK: cost of 32 {{.*}} fptoui
   %r124 = fptoui <4 x double> undef to <4 x i16>
-  ; CHECK: cost of 16 {{.*}} fptosi
+  ; CHECK: cost of 32 {{.*}} fptosi
   %r125 = fptosi <4 x double> undef to <4 x i16>
-  ; CHECK: cost of 16 {{.*}} fptoui
+  ; CHECK: cost of 32 {{.*}} fptoui
   %r126 = fptoui <4 x double> undef to <4 x i32>
-  ; CHECK: cost of 16 {{.*}} fptosi
+  ; CHECK: cost of 32 {{.*}} fptosi
   %r127 = fptosi <4 x double> undef to <4 x i32>
-  ; CHECK: cost of 56 {{.*}} fptoui
+  ; CHECK: cost of 64 {{.*}} fptoui
   %r128 = fptoui <4 x double> undef to <4 x i64>
-  ; CHECK: cost of 56 {{.*}} fptosi
+  ; CHECK: cost of 64 {{.*}} fptosi
   %r129 = fptosi <4 x double> undef to <4 x i64>
 
-  ; CHECK: cost of 32 {{.*}} fptoui
+  ; CHECK: cost of 64 {{.*}} fptoui
   %r130 = fptoui <8 x float> undef to <8 x i1>
-  ; CHECK: cost of 32 {{.*}} fptosi
+  ; CHECK: cost of 64 {{.*}} fptosi
   %r131 = fptosi <8 x float> undef to <8 x i1>
-  ; CHECK: cost of 32 {{.*}} fptoui
+  ; CHECK: cost of 64 {{.*}} fptoui
   %r132 = fptoui <8 x float> undef to <8 x i8>
-  ; CHECK: cost of 32 {{.*}} fptosi
+  ; CHECK: cost of 64 {{.*}} fptosi
   %r133 = fptosi <8 x float> undef to <8 x i8>
   ; CHECK: cost of 4 {{.*}} fptoui
   %r134 = fptoui <8 x float> undef to <8 x i16>
@@ -305,39 +305,39 @@
   %r136 = fptoui <8 x float> undef to <8 x i32>
   ; CHECK: cost of 2 {{.*}} fptosi
   %r137 = fptosi <8 x float> undef to <8 x i32>
-  ; CHECK: cost of 112 {{.*}} fptoui
+  ; CHECK: cost of 128 {{.*}} fptoui
   %r138 = fptoui <8 x float> undef to <8 x i64>
-  ; CHECK: cost of 112 {{.*}} fptosi
+  ; CHECK: cost of 128 {{.*}} fptosi
   %r139 = fptosi <8 x float> undef to <8 x i64>
 
-  ; CHECK: cost of 32 {{.*}} fptoui
+  ; CHECK: cost of 64 {{.*}} fptoui
   %r140 = fptoui <8 x double> undef to <8 x i1>
-  ; CHECK: cost of 32 {{.*}} fptosi
+  ; CHECK: cost of 64 {{.*}} fptosi
   %r141 = fptosi <8 x double> undef to <8 x i1>
-  ; CHECK: cost of 32 {{.*}} fptoui
+  ; CHECK: cost of 64 {{.*}} fptoui
   %r142 = fptoui <8 x double> undef to <8 x i8>
-  ; CHECK: cost of 32 {{.*}} fptosi
+  ; CHECK: cost of 64 {{.*}} fptosi
   %r143 = fptosi <8 x double> undef to <8 x i8>
-  ; CHECK: cost of 32 {{.*}} fptoui
+  ; CHECK: cost of 64 {{.*}} fptoui
   %r144 = fptoui <8 x double> undef to <8 x i16>
-  ; CHECK: cost of 32 {{.*}} fptosi
+  ; CHECK: cost of 64 {{.*}} fptosi
   %r145 = fptosi <8 x double> undef to <8 x i16>
-  ; CHECK: cost of 32 {{.*}} fptoui
+  ; CHECK: cost of 64 {{.*}} fptoui
   %r146 = fptoui <8 x double> undef to <8 x i32>
-  ; CHECK: cost of 32 {{.*}} fptosi
+  ; CHECK: cost of 64 {{.*}} fptosi
   %r147 = fptosi <8 x double> undef to <8 x i32>
-  ; CHECK: cost of 112 {{.*}} fptoui
+  ; CHECK: cost of 128 {{.*}} fptoui
   %r148 = fptoui <8 x double> undef to <8 x i64>
-  ; CHECK: cost of 112 {{.*}} fptosi
+  ; CHECK: cost of 128 {{.*}} fptosi
   %r149 = fptosi <8 x double> undef to <8 x i64>
 
-  ; CHECK: cost of 64 {{.*}} fptoui
+  ; CHECK: cost of 128 {{.*}} fptoui
   %r150 = fptoui <16 x float> undef to <16 x i1>
-  ; CHECK: cost of 64 {{.*}} fptosi
+  ; CHECK: cost of 128 {{.*}} fptosi
   %r151 = fptosi <16 x float> undef to <16 x i1>
-  ; CHECK: cost of 64 {{.*}} fptoui
+  ; CHECK: cost of 128 {{.*}} fptoui
   %r152 = fptoui <16 x float> undef to <16 x i8>
-  ; CHECK: cost of 64 {{.*}} fptosi
+  ; CHECK: cost of 128 {{.*}} fptosi
   %r153 = fptosi <16 x float> undef to <16 x i8>
   ; CHECK: cost of 8 {{.*}} fptoui
   %r154 = fptoui <16 x float> undef to <16 x i16>
@@ -347,30 +347,30 @@
   %r156 = fptoui <16 x float> undef to <16 x i32>
   ; CHECK: cost of 4 {{.*}} fptosi
   %r157 = fptosi <16 x float> undef to <16 x i32>
-  ; CHECK: cost of 224 {{.*}} fptoui
+  ; CHECK: cost of 256 {{.*}} fptoui
   %r158 = fptoui <16 x float> undef to <16 x i64>
-  ; CHECK: cost of 224 {{.*}} fptosi
+  ; CHECK: cost of 256 {{.*}} fptosi
   %r159 = fptosi <16 x float> undef to <16 x i64>
 
-  ; CHECK: cost of 64 {{.*}} fptoui
+  ; CHECK: cost of 128 {{.*}} fptoui
   %r160 = fptoui <16 x double> undef to <16 x i1>
-  ; CHECK: cost of 64 {{.*}} fptosi
+  ; CHECK: cost of 128 {{.*}} fptosi
   %r161 = fptosi <16 x double> undef to <16 x i1>
-  ; CHECK: cost of 64 {{.*}} fptoui
+  ; CHECK: cost of 128 {{.*}} fptoui
   %r162 = fptoui <16 x double> undef to <16 x i8>
-  ; CHECK: cost of 64 {{.*}} fptosi
+  ; CHECK: cost of 128 {{.*}} fptosi
   %r163 = fptosi <16 x double> undef to <16 x i8>
-  ; CHECK: cost of 64 {{.*}} fptoui
+  ; CHECK: cost of 128 {{.*}} fptoui
   %r164 = fptoui <16 x double> undef to <16 x i16>
-  ; CHECK: cost of 64 {{.*}} fptosi
+  ; CHECK: cost of 128 {{.*}} fptosi
   %r165 = fptosi <16 x double> undef to <16 x i16>
-  ; CHECK: cost of 64 {{.*}} fptoui
+  ; CHECK: cost of 128 {{.*}} fptoui
   %r166 = fptoui <16 x double> undef to <16 x i32>
-  ; CHECK: cost of 64 {{.*}} fptosi
+  ; CHECK: cost of 128 {{.*}} fptosi
   %r167 = fptosi <16 x double> undef to <16 x i32>
-  ; CHECK: cost of 224 {{.*}} fptoui
+  ; CHECK: cost of 256 {{.*}} fptoui
   %r168 = fptoui <16 x double> undef to <16 x i64>
-  ; CHECK: cost of 224 {{.*}} fptosi
+  ; CHECK: cost of 256 {{.*}} fptosi
   %r169 = fptosi <16 x double> undef to <16 x i64>
 
   ; CHECK: cost of 8 {{.*}} uitofp

diff --git a/test/Analysis/CostModel/PowerPC/cmp-expanded.ll b/test/Analysis/CostModel/PowerPC/cmp-expanded.ll
new file mode 100644
index 0000000..38c8439
--- /dev/null
+++ b/test/Analysis/CostModel/PowerPC/cmp-expanded.ll

@@ -0,0 +1,14 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @exts() { ; NOTE(review): the name does not reflect the fcmp under test - possibly inherited from another test
+
+   ; VSX is disabled, so this cost needs to include scalarization (because
+   ; <4 x double> is legalized to scalars).
+   ; CHECK: cost of 44 {{.*}} fcmp
+   %v1 = fcmp ugt <4 x double> undef, undef
+
+  ret void
+}
+

diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll
index 7f97b17..fb16af6 100644
--- a/test/Analysis/CostModel/X86/cast.ll
+++ b/test/Analysis/CostModel/X86/cast.ll

@@ -1,3 +1,4 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512
 ; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX2
 ; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX
 
@@ -83,6 +84,19 @@
   ;CHECK-AVX: cost of 4 {{.*}} zext
   %D = zext <4 x i32> undef to <4 x i64>
 
+  ;CHECK-AVX512: cost of 3 {{.*}} %D1 = zext
+  %D1 = zext <16 x i32> undef to <16 x i64>
+
+  ;CHECK-AVX512: cost of 3 {{.*}} %D2 = sext
+  %D2 = sext <16 x i32> undef to <16 x i64>
+
+  ;CHECK-AVX512: cost of 1 {{.*}} %D3 = zext
+  %D3 = zext <16 x i16> undef to <16 x i32>
+  ;CHECK-AVX512: cost of 1 {{.*}} %D4 = zext
+  %D4 = zext <16 x i8> undef to <16 x i32>
+  ;CHECK-AVX512: cost of 2 {{.*}} %D5 = zext
+  %D5 = zext <16 x i1> undef to <16 x i32>
+
   ;CHECK-AVX2: cost of 2 {{.*}} trunc
   ;CHECK-AVX: cost of 4 {{.*}} trunc
   %E = trunc <4 x i64> undef to <4 x i32>
@@ -101,8 +115,12 @@
 
   ;CHECK-AVX2: cost of 4 {{.*}} trunc
   ;CHECK-AVX: cost of 9 {{.*}} trunc
+  ;CHECK-AVX512: cost of 1 {{.*}} %G = trunc
   %G = trunc <8 x i64> undef to <8 x i32>
 
+  ;CHECK-AVX512: cost of 4 {{.*}} %G1 = trunc
+  %G1 = trunc <16 x i64> undef to <16 x i32>
+
   ret i32 undef
 }
 
@@ -207,7 +225,30 @@
   ; CHECK: cost of 5 {{.*}} uitofp
   %C1 = uitofp <8 x i16> %c to <8 x float>
 
-  ; CHECK: cost of 9 {{.*}} uitofp
+  ; CHECK-AVX2: cost of 8 {{.*}} uitofp
+  ; CHECK-AVX512: cost of 8 {{.*}} uitofp
+  ; CHECK-AVX: cost of 9 {{.*}} uitofp
   %D1 = uitofp <8 x i32> %d to <8 x float>
   ret void
 }
+
+define void @fp_conv(<8 x float> %a, <16 x float>%b) { ; cost checks for vector fpext / fptrunc
+;CHECK-LABEL: for function 'fp_conv'
+  ; CHECK-AVX512: cost of 1 {{.*}} fpext
+  %A1 = fpext <8 x float> %a to <8 x double>
+
+  ; CHECK-AVX512: cost of 3 {{.*}} fpext
+  %A2 = fpext <16 x float> %b to <16 x double>
+
+  ; CHECK-AVX2:   cost of 3 {{.*}} %A3 = fpext
+  ; CHECK-AVX512: cost of 1 {{.*}} %A3 = fpext
+  %A3 = fpext <8 x float> %a to <8 x double> ; same conversion as %A1, this time matched by value name
+
+  ; CHECK-AVX2:   cost of 3 {{.*}} %A4 = fptrunc
+  ; CHECK-AVX512: cost of 1 {{.*}} %A4 = fptrunc
+  %A4 = fptrunc <8 x double> undef to <8 x float> ; operand is undef rather than a function argument
+
+  ; CHECK-AVX512: cost of 3 {{.*}} %A5 = fptrunc
+  %A5 = fptrunc <16 x double> undef to <16 x float> ; operand is undef rather than a function argument
+  ret void
+}

diff --git a/test/Analysis/CostModel/X86/cmp.ll b/test/Analysis/CostModel/X86/cmp.ll
index 9f2bdb3..469cd73 100644
--- a/test/Analysis/CostModel/X86/cmp.ll
+++ b/test/Analysis/CostModel/X86/cmp.ll

@@ -1,5 +1,6 @@
 ; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck -check-prefix=CHECK -check-prefix=AVX1 %s
 ; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core-avx2 | FileCheck -check-prefix=CHECK -check-prefix=AVX2 %s
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=knl | FileCheck -check-prefix=CHECK -check-prefix=AVX512 %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -22,6 +23,11 @@
   ;AVX2: cost of 1 {{.*}} fcmp
   %E = fcmp olt <4 x double> undef, undef
 
+  ; AVX512: cost of 1 {{.*}} %E1 = fcmp
+  %E1 = fcmp olt <16 x float> undef, undef
+  ; AVX512: cost of 2 {{.*}} %E2 = fcmp
+  %E2 = fcmp olt <16 x double> undef, undef
+
   ;  -- integers --
 
   ;AVX1: cost of 1 {{.*}} icmp
@@ -49,6 +55,11 @@
   ;AVX2: cost of 1 {{.*}} icmp
   %M = icmp eq <32 x i8> undef, undef
 
+  ; AVX512: cost of 1 {{.*}} %M1 = icmp
+  %M1 = icmp eq <16 x i32> undef, undef
+  ; AVX512: cost of 2 {{.*}} %M2 = icmp
+  %M2 = icmp eq <16 x i64> undef, undef
+
   ;CHECK: cost of 0 {{.*}} ret
   ret i32 undef
 }

diff --git a/test/Analysis/CostModel/X86/sitofp.ll b/test/Analysis/CostModel/X86/sitofp.ll
index 338d974..edc937e 100644
--- a/test/Analysis/CostModel/X86/sitofp.ll
+++ b/test/Analysis/CostModel/X86/sitofp.ll

@@ -1,4 +1,5 @@
 ; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
+; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s | FileCheck --check-prefix=AVX512F %s
 
 define <2 x double> @sitofpv2i8v2double(<2 x i8> %a) {
   ; SSE2: sitofpv2i8v2double
@@ -279,3 +280,47 @@
   %1 = sitofp <32 x i64> %a to <32 x float>
   ret <32 x float> %1
 }
+
+; AVX512F-LABEL: sitofp_16i8_float
+; AVX512F: cost of 2 {{.*}} sitofp
+define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
+  %1 = sitofp <16 x i8> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+define <16 x float> @sitofp_16i16_float(<16 x i16> %a) { ; NOTE(review): directives sit inside this body, unlike the sibling functions in this hunk
+  ; AVX512F-LABEL: sitofp_16i16_float
+  ; AVX512F: cost of 2 {{.*}} sitofp
+  %1 = sitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+; AVX512F-LABEL: sitofp_8i8_double
+; AVX512F: cost of 2 {{.*}} sitofp
+define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
+  %1 = sitofp <8 x i8> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+; AVX512F-LABEL: sitofp_8i16_double
+; AVX512F: cost of 2 {{.*}} sitofp
+define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
+  %1 = sitofp <8 x i16> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+; AVX512F-LABEL: sitofp_8i1_double
+; AVX512F: cost of 4 {{.*}} sitofp
+define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
+  %cmpres = fcmp ogt <8 x double> %a, zeroinitializer ; yields the <8 x i1> value that is converted below
+  %1 = sitofp <8 x i1> %cmpres to <8 x double>
+  ret <8 x double> %1
+}
+
+; AVX512F-LABEL: sitofp_16i1_float
+; AVX512F: cost of 3 {{.*}} sitofp
+define <16 x float> @sitofp_16i1_float(<16 x float> %a) {
+  %cmpres = fcmp ogt <16 x float> %a, zeroinitializer ; yields the <16 x i1> value that is converted below
+  %1 = sitofp <16 x i1> %cmpres to <16 x float>
+  ret <16 x float> %1
+}

diff --git a/test/Analysis/CostModel/X86/uitofp.ll b/test/Analysis/CostModel/X86/uitofp.ll
index a41a04d..27ec268 100644
--- a/test/Analysis/CostModel/X86/uitofp.ll
+++ b/test/Analysis/CostModel/X86/uitofp.ll

@@ -235,7 +235,7 @@
 
 define <4 x float> @uitofpv4i8v4float(<4 x i8> %a) {
   ; SSE2: uitofpv4i8v4float
-  ; SSE2: cost of 15 {{.*}} uitofp
+  ; SSE2: cost of 8 {{.*}} uitofp
   %1 = uitofp <4 x i8> %a to <4 x float>
   ret <4 x float> %1
 }
@@ -270,7 +270,7 @@
 
 define <4 x float> @uitofpv4i16v4float(<4 x i16> %a) {
   ; SSE2: uitofpv4i16v4float
-  ; SSE2: cost of 15 {{.*}} uitofp
+  ; SSE2: cost of 8 {{.*}} uitofp
   %1 = uitofp <4 x i16> %a to <4 x float>
   ret <4 x float> %1
 }
@@ -305,28 +305,28 @@
 
 define <4 x float> @uitofpv4i32v4float(<4 x i32> %a) {
   ; SSE2: uitofpv4i32v4float
-  ; SSE2: cost of 15 {{.*}} uitofp
+  ; SSE2: cost of 8 {{.*}} uitofp
   %1 = uitofp <4 x i32> %a to <4 x float>
   ret <4 x float> %1
 }
 
 define <8 x float> @uitofpv8i32v8float(<8 x i32> %a) {
   ; SSE2: uitofpv8i32v8float
-  ; SSE2: cost of 30 {{.*}} uitofp
+  ; SSE2: cost of 16 {{.*}} uitofp
   %1 = uitofp <8 x i32> %a to <8 x float>
   ret <8 x float> %1
 }
 
 define <16 x float> @uitofpv16i32v16float(<16 x i32> %a) {
   ; SSE2: uitofpv16i32v16float
-  ; SSE2: cost of 60 {{.*}} uitofp
+  ; SSE2: cost of 32 {{.*}} uitofp
   %1 = uitofp <16 x i32> %a to <16 x float>
   ret <16 x float> %1
 }
 
 define <32 x float> @uitofpv32i32v32float(<32 x i32> %a) {
   ; SSE2: uitofpv32i32v32float
-  ; SSE2: cost of 120 {{.*}} uitofp
+  ; SSE2: cost of 64 {{.*}} uitofp
   %1 = uitofp <32 x i32> %a to <32 x float>
   ret <32 x float> %1
 }

diff --git a/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll b/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll
new file mode 100644
index 0000000..95e5e52
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/NonCanonicalizedSubscript.ll

@@ -0,0 +1,40 @@
+; RUN: opt < %s -analyze -basicaa -da -da-delinearize=false | FileCheck %s
+; RUN: opt < %s -analyze -basicaa -da -da-delinearize | FileCheck %s -check-prefix=DELIN
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+; for (int i = 0; i < 100; ++i) {
+;   int t0 = a[i][i];
+;   int t1 = t0 + 1;
+;   a[i][5] = t1;
+; }
+; The subscript 5 in a[i][5] is deliberately an i32, mismatching the type of
+; the other subscripts (i64). Before the fix, DependenceAnalysis crashed on
+; this mismatch.
+define void @i32_subscript([100 x [100 x i32]]* %a, i32* %b) {
+; CHECK-LABEL: 'Dependence Analysis' for function 'i32_subscript'
+; DELIN-LABEL: 'Dependence Analysis' for function 'i32_subscript'
+entry:
+  br label %for.body
+
+for.body:
+; CHECK: da analyze - none!
+; CHECK: da analyze - anti [=|<]!
+; CHECK: da analyze - none!
+; DELIN: da analyze - none!
+; DELIN: da analyze - anti [=|<]!
+; DELIN: da analyze - none!
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %for.body ]
+  %a.addr = getelementptr [100 x [100 x i32]]* %a, i64 0, i64 %i, i64 %i ; &a[i][i], all subscripts i64
+  %a.addr.2 = getelementptr [100 x [100 x i32]]* %a, i64 0, i64 %i, i32 5 ; &a[i][5], last subscript is an i32
+  %0 = load i32* %a.addr, align 4
+  %1 = add i32 %0, 1
+  store i32 %1, i32* %a.addr.2, align 4
+  %i.inc = add nsw i64 %i, 1
+  %exitcond = icmp ne i64 %i.inc, 100
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

diff --git a/test/Analysis/GlobalsModRef/pr12351.ll b/test/Analysis/GlobalsModRef/pr12351.ll
index 1c5ac43..c221f4c 100644
--- a/test/Analysis/GlobalsModRef/pr12351.ll
+++ b/test/Analysis/GlobalsModRef/pr12351.ll

@@ -26,8 +26,8 @@
 
 define void @bar2(i32* %foo)  {
   store i32 0, i32* %foo, align 4
-  tail call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{})
+  tail call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{}, metadata !{})
   ret void
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone

diff --git a/test/Analysis/ScalarEvolution/load-with-range-metadata.ll b/test/Analysis/ScalarEvolution/load-with-range-metadata.ll
new file mode 100644
index 0000000..2f6dcd0
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/load-with-range-metadata.ll

@@ -0,0 +1,37 @@
+; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s
+
+define i32 @slt_trip_count_with_range(i32 *%ptr0, i32 *%ptr1) { ; NOTE(review): %ptr1 is never used
+; CHECK-LABEL: slt_trip_count_with_range
+ entry:
+  %limit = load i32* %ptr0, !range !0 ; loop bound, annotated with the bounds in !0
+  br label %loop
+
+ loop:
+; CHECK: Loop %loop: max backedge-taken count is 98
+  %index = phi i32 [ 0, %entry ], [ %index.inc, %loop ]
+  %index.inc = add i32 %index, 1
+  %continue = icmp slt i32 %index.inc, %limit ; signed comparison against the loaded bound
+  br i1 %continue, label %loop, label %loop.exit
+
+ loop.exit:
+  ret i32 0
+}
+
+define i32 @ult_trip_count_with_range(i32 *%ptr0, i32 *%ptr1) { ; NOTE(review): %ptr1 is never used
+; CHECK-LABEL: ult_trip_count_with_range
+ entry:
+  %limit = load i32* %ptr0, !range !0 ; loop bound, annotated with the bounds in !0
+  br label %loop
+
+ loop:
+; CHECK: Loop %loop: max backedge-taken count is 98
+  %index = phi i32 [ 0, %entry ], [ %index.inc, %loop ]
+  %index.inc = add i32 %index, 1
+  %continue = icmp ult i32 %index.inc, %limit ; unsigned variant of the comparison above
+  br i1 %continue, label %loop, label %loop.exit
+
+ loop.exit:
+  ret i32 0
+}
+
+!0 = metadata !{i32 1, i32 100} ; bounds 1 and 100; presumably the half-open interval [1, 100) - confirm against the LangRef

diff --git a/test/Analysis/ScalarEvolution/nsw-offset-assume.ll b/test/Analysis/ScalarEvolution/nsw-offset-assume.ll
new file mode 100644
index 0000000..29cf658
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/nsw-offset-assume.ll

@@ -0,0 +1,83 @@
+; RUN: opt < %s -S -analyze -scalar-evolution | FileCheck %s
+
+; ScalarEvolution should be able to fold away the sign-extensions
+; on this loop with a primary induction variable incremented with
+; a nsw add of 2 (this test is derived from the nsw-offset.ll test, but uses an
+; assume instead of a preheader conditional branch to guard the loop).
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+
+define void @foo(i32 %no, double* nocapture %d, double* nocapture %q) nounwind {
+entry:
+  %n = and i32 %no, 4294967294
+  %0 = icmp sgt i32 %n, 0                         ; <i1> [#uses=1]
+  tail call void @llvm.assume(i1 %0)
+  br label %bb.nph
+
+bb.nph:                                           ; preds = %entry
+  br label %bb
+
+bb:                                               ; preds = %bb.nph, %bb1
+  %i.01 = phi i32 [ %16, %bb1 ], [ 0, %bb.nph ]   ; <i32> [#uses=5]
+
+; CHECK: %1 = sext i32 %i.01 to i64
+; CHECK: -->  {0,+,2}<nuw><nsw><%bb>
+  %1 = sext i32 %i.01 to i64                      ; <i64> [#uses=1]
+
+; CHECK: %2 = getelementptr inbounds double* %d, i64 %1
+; CHECK: -->  {%d,+,16}<nsw><%bb>
+  %2 = getelementptr inbounds double* %d, i64 %1  ; <double*> [#uses=1]
+
+  %3 = load double* %2, align 8                   ; <double> [#uses=1]
+  %4 = sext i32 %i.01 to i64                      ; <i64> [#uses=1]
+  %5 = getelementptr inbounds double* %q, i64 %4  ; <double*> [#uses=1]
+  %6 = load double* %5, align 8                   ; <double> [#uses=1]
+  %7 = or i32 %i.01, 1                            ; <i32> [#uses=1]
+
+; CHECK: %8 = sext i32 %7 to i64
+; CHECK: -->  {1,+,2}<nuw><nsw><%bb>
+  %8 = sext i32 %7 to i64                         ; <i64> [#uses=1]
+
+; CHECK: %9 = getelementptr inbounds double* %q, i64 %8
+; CHECK: {(8 + %q),+,16}<nsw><%bb>
+  %9 = getelementptr inbounds double* %q, i64 %8  ; <double*> [#uses=1]
+
+; Artificially repeat the above three instructions, this time using
+; add nsw instead of or.
+  %t7 = add nsw i32 %i.01, 1                            ; <i32> [#uses=1]
+
+; CHECK: %t8 = sext i32 %t7 to i64
+; CHECK: -->  {1,+,2}<nuw><nsw><%bb>
+  %t8 = sext i32 %t7 to i64                         ; <i64> [#uses=1]
+
+; CHECK: %t9 = getelementptr inbounds double* %q, i64 %t8
+; CHECK: {(8 + %q),+,16}<nsw><%bb>
+  %t9 = getelementptr inbounds double* %q, i64 %t8  ; <double*> [#uses=1]
+
+  %10 = load double* %9, align 8                  ; <double> [#uses=1]
+  %11 = fadd double %6, %10                       ; <double> [#uses=1]
+  %12 = fadd double %11, 3.200000e+00             ; <double> [#uses=1]
+  %13 = fmul double %3, %12                       ; <double> [#uses=1]
+  %14 = sext i32 %i.01 to i64                     ; <i64> [#uses=1]
+  %15 = getelementptr inbounds double* %d, i64 %14 ; <double*> [#uses=1]
+  store double %13, double* %15, align 8
+  %16 = add nsw i32 %i.01, 2                      ; <i32> [#uses=2]
+  br label %bb1
+
+bb1:                                              ; preds = %bb
+  %17 = icmp slt i32 %16, %n                      ; <i1> [#uses=1]
+  br i1 %17, label %bb, label %bb1.return_crit_edge
+
+bb1.return_crit_edge:                             ; preds = %bb1
+  br label %return
+
+return:                                           ; preds = %bb1.return_crit_edge, %entry
+  ret void
+}
+
+declare void @llvm.assume(i1) nounwind
+
+; Note: Without the preheader assume, there is an 'smax' in the
+; backedge-taken count expression:
+; CHECK: Loop %bb: backedge-taken count is ((-1 + (2 * (%no /u 2))) /u 2)
+; CHECK: Loop %bb: max backedge-taken count is 1073741822

diff --git a/test/Analysis/ScalarEvolution/nsw.ll b/test/Analysis/ScalarEvolution/nsw.ll
index 05992ea..d776a5a 100644
--- a/test/Analysis/ScalarEvolution/nsw.ll
+++ b/test/Analysis/ScalarEvolution/nsw.ll

@@ -123,9 +123,8 @@
   ret i32 %result
 }
 
-; TODO: This could fold down to '1'
 ; CHECK-LABEL: PR12375
-; CHECK: -->  {(4 + %arg),+,4}<nuw><%bb1>		Exits: (4 + (4 * ((-1 + (-1 * %arg) + ((4 + %arg) umax (8 + %arg)<nsw>)) /u 4)) + %arg)
+; CHECK: -->  {(4 + %arg),+,4}<nuw><%bb1>		Exits: (8 + %arg)<nsw>
 define i32 @PR12375(i32* readnone %arg) {
 bb:
   %tmp = getelementptr inbounds i32* %arg, i64 2
@@ -158,3 +157,23 @@
 bb5:                                              ; preds = %bb2
   ret void
 }
+
+declare void @f(i32)
+
+; CHECK-LABEL: nswnowrap
+; CHECK: --> {(1 + %v),+,1}<nsw><%for.body>   Exits: (2 + %v)
+define void @nswnowrap(i32 %v) {
+entry:
+  %add = add nsw i32 %v, 1
+  br label %for.body
+
+for.body:
+  %i.04 = phi i32 [ %v, %entry ], [ %inc, %for.body ]
+  %inc = add nsw i32 %i.04, 1
+  tail call void @f(i32 %i.04)
+  %cmp = icmp slt i32 %i.04, %add
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}

diff --git a/test/Analysis/ScalarEvolution/sext-iv-1.ll b/test/Analysis/ScalarEvolution/sext-iv-1.ll
index c34596d..a6f70db 100644
--- a/test/Analysis/ScalarEvolution/sext-iv-1.ll
+++ b/test/Analysis/ScalarEvolution/sext-iv-1.ll

@@ -1,5 +1,12 @@
 ; RUN: opt < %s -scalar-evolution -analyze \
-; RUN:  | grep " -->  (sext i. {.*,+,.*}<%bb1> to i64)" | count 5
+; RUN:  | FileCheck %s
+
+; CHECK: -->  (sext i{{.}} {{{.*}},+,{{.*}}}<%bb1> to i64)
+; CHECK: -->  (sext i{{.}} {{{.*}},+,{{.*}}}<%bb1> to i64)
+; CHECK: -->  (sext i{{.}} {{{.*}},+,{{.*}}}<%bb1> to i64)
+; CHECK: -->  (sext i{{.}} {{{.*}},+,{{.*}}}<%bb1> to i64)
+; CHECK: -->  (sext i{{.}} {{{.*}},+,{{.*}}}<%bb1> to i64)
+; CHECK-NOT: -->  (sext
 
 ; Don't convert (sext {...,+,...}) to {sext(...),+,sext(...)} in cases
 ; where the trip count is not within range.

diff --git a/test/Analysis/ScopedNoAliasAA/basic-domains.ll b/test/Analysis/ScopedNoAliasAA/basic-domains.ll
new file mode 100644
index 0000000..d88a496
--- /dev/null
+++ b/test/Analysis/ScopedNoAliasAA/basic-domains.ll

@@ -0,0 +1,57 @@
+; RUN: opt < %s -basicaa -scoped-noalias -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo1(float* nocapture %a, float* nocapture readonly %c) #0 {
+entry:
+; CHECK-LABEL: Function: foo1
+  %0 = load float* %c, align 4, !alias.scope !9
+  %arrayidx.i = getelementptr inbounds float* %a, i64 5
+  store float %0, float* %arrayidx.i, align 4, !noalias !6
+
+  %1 = load float* %c, align 4, !alias.scope !5
+  %arrayidx.i2 = getelementptr inbounds float* %a, i64 15
+  store float %1, float* %arrayidx.i2, align 4, !noalias !6
+
+  %2 = load float* %c, align 4, !alias.scope !6
+  %arrayidx.i3 = getelementptr inbounds float* %a, i64 16
+  store float %2, float* %arrayidx.i3, align 4, !noalias !5
+
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+
+!0 = metadata !{metadata !0, metadata !"some domain"}
+!1 = metadata !{metadata !1, metadata !"some other domain"}
+
+; Two scopes (which must be self-referential to avoid being "uniqued"):
+!2 = metadata !{metadata !2, metadata !0, metadata !"a scope in dom0"}
+!3 = metadata !{metadata !2}
+
+!4 = metadata !{metadata !4, metadata !0, metadata !"another scope in dom0"}
+!5 = metadata !{metadata !4}
+
+; A list of the two scopes.
+!6 = metadata !{metadata !2, metadata !4}
+
+; Another scope in the second domain
+!7 = metadata !{metadata !7, metadata !1, metadata !"another scope in dom1"}
+!8 = metadata !{metadata !7}
+
+; A list of scopes from both domains.
+!9 = metadata !{metadata !2, metadata !4, metadata !7}
+
+; CHECK: NoAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %0, float* %arrayidx.i, align 4, !noalias !6
+; CHECK: NoAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %1, float* %arrayidx.i2, align 4, !noalias !6
+; CHECK: MayAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %2, float* %arrayidx.i3, align 4, !noalias !7
+; CHECK: NoAlias:   %1 = load float* %c, align 4, !alias.scope !7 <->   store float %0, float* %arrayidx.i, align 4, !noalias !6
+; CHECK: NoAlias:   %1 = load float* %c, align 4, !alias.scope !7 <->   store float %1, float* %arrayidx.i2, align 4, !noalias !6
+; CHECK: NoAlias:   %1 = load float* %c, align 4, !alias.scope !7 <->   store float %2, float* %arrayidx.i3, align 4, !noalias !7
+; CHECK: NoAlias:   %2 = load float* %c, align 4, !alias.scope !6 <->   store float %0, float* %arrayidx.i, align 4, !noalias !6
+; CHECK: NoAlias:   %2 = load float* %c, align 4, !alias.scope !6 <->   store float %1, float* %arrayidx.i2, align 4, !noalias !6
+; CHECK: MayAlias:   %2 = load float* %c, align 4, !alias.scope !6 <->   store float %2, float* %arrayidx.i3, align 4, !noalias !7
+; CHECK: NoAlias:   store float %1, float* %arrayidx.i2, align 4, !noalias !6 <->   store float %0, float* %arrayidx.i, align 4, !noalias !6
+; CHECK: NoAlias:   store float %2, float* %arrayidx.i3, align 4, !noalias !7 <->   store float %0, float* %arrayidx.i, align 4, !noalias !6
+; CHECK: NoAlias:   store float %2, float* %arrayidx.i3, align 4, !noalias !7 <->   store float %1, float* %arrayidx.i2, align 4, !noalias !6
+

diff --git a/test/Analysis/ScopedNoAliasAA/basic.ll b/test/Analysis/ScopedNoAliasAA/basic.ll
new file mode 100644
index 0000000..73fe333
--- /dev/null
+++ b/test/Analysis/ScopedNoAliasAA/basic.ll

@@ -0,0 +1,27 @@
+; RUN: opt < %s -basicaa -scoped-noalias -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo1(float* nocapture %a, float* nocapture readonly %c) #0 {
+entry:
+; CHECK-LABEL: Function: foo1
+  %0 = load float* %c, align 4, !alias.scope !1
+  %arrayidx.i = getelementptr inbounds float* %a, i64 5
+  store float %0, float* %arrayidx.i, align 4, !noalias !1
+  %1 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %1, float* %arrayidx, align 4
+  ret void
+
+; CHECK: NoAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %0, float* %arrayidx.i, align 4, !noalias !0
+; CHECK: MayAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %1, float* %arrayidx, align 4
+; CHECK: MayAlias:   %1 = load float* %c, align 4 <->   store float %0, float* %arrayidx.i, align 4, !noalias !0
+; CHECK: MayAlias:   %1 = load float* %c, align 4 <->   store float %1, float* %arrayidx, align 4
+; CHECK: NoAlias:   store float %1, float* %arrayidx, align 4 <->   store float %0, float* %arrayidx.i, align 4, !noalias !0
+}
+
+attributes #0 = { nounwind uwtable }
+
+!0 = metadata !{metadata !0, metadata !"some domain"}
+!1 = metadata !{metadata !1, metadata !0, metadata !"some scope"}
+

diff --git a/test/Analysis/ScopedNoAliasAA/basic2.ll b/test/Analysis/ScopedNoAliasAA/basic2.ll
new file mode 100644
index 0000000..37b0add
--- /dev/null
+++ b/test/Analysis/ScopedNoAliasAA/basic2.ll

@@ -0,0 +1,41 @@
+; RUN: opt < %s -basicaa -scoped-noalias -aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+entry:
+; CHECK-LABEL: Function: foo2
+  %0 = load float* %c, align 4, !alias.scope !0
+  %arrayidx.i = getelementptr inbounds float* %a, i64 5
+  store float %0, float* %arrayidx.i, align 4, !alias.scope !5, !noalias !4
+  %arrayidx1.i = getelementptr inbounds float* %b, i64 8
+  store float %0, float* %arrayidx1.i, align 4, !alias.scope !0, !noalias !5
+  %1 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %1, float* %arrayidx, align 4
+  ret void
+
+; CHECK: MayAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %0, float* %arrayidx.i, align 4, !alias.scope !4, !noalia
+; CHECK: s !5
+; CHECK: MayAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %0, float* %arrayidx1.i, align 4, !alias.scope !0, !noali
+; CHECK: as !4
+; CHECK: MayAlias:   %0 = load float* %c, align 4, !alias.scope !0 <->   store float %1, float* %arrayidx, align 4
+; CHECK: MayAlias:   %1 = load float* %c, align 4 <->   store float %0, float* %arrayidx.i, align 4, !alias.scope !4, !noalias !5
+; CHECK: MayAlias:   %1 = load float* %c, align 4 <->   store float %0, float* %arrayidx1.i, align 4, !alias.scope !0, !noalias !4
+; CHECK: MayAlias:   %1 = load float* %c, align 4 <->   store float %1, float* %arrayidx, align 4
+; CHECK: NoAlias:   store float %0, float* %arrayidx1.i, align 4, !alias.scope !0, !noalias !4 <->   store float %0, float* %arrayidx.i, align
+; CHECK: 4, !alias.scope !4, !noalias !5
+; CHECK: NoAlias:   store float %1, float* %arrayidx, align 4 <->   store float %0, float* %arrayidx.i, align 4, !alias.scope !4, !noalias !5
+; CHECK: MayAlias:   store float %1, float* %arrayidx, align 4 <->   store float %0, float* %arrayidx1.i, align 4, !alias.scope !0, !noalias !
+; CHECK: 4
+}
+
+attributes #0 = { nounwind uwtable }
+
+!0 = metadata !{metadata !1, metadata !3}
+!1 = metadata !{metadata !1, metadata !2, metadata !"some scope"}
+!2 = metadata !{metadata !2, metadata !"some domain"}
+!3 = metadata !{metadata !3, metadata !2, metadata !"some other scope"}
+!4 = metadata !{metadata !1}
+!5 = metadata !{metadata !3}
+

diff --git a/test/Analysis/TypeBasedAliasAnalysis/dse.ll b/test/Analysis/TypeBasedAliasAnalysis/dse.ll
index bcf1f2c..9032fad 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/dse.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/dse.ll

@@ -1,4 +1,5 @@
 ; RUN: opt < %s -tbaa -basicaa -dse -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; DSE should make use of TBAA.
 

diff --git a/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll b/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll
index 609e87c..a027841 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -tbaa -basicaa -aa-eval -evaluate-tbaa -print-no-aliases -print-may-aliases -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -tbaa -basicaa -aa-eval -evaluate-aa-metadata -print-no-aliases -print-may-aliases -disable-output 2>&1 | FileCheck %s
 
 ; Generated with "clang -cc1 -disable-llvm-optzns -O1 -emit-llvm"
 ; #include <new>

diff --git a/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll b/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
index e1c5d45..38bece7 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -tbaa -basicaa -aa-eval -evaluate-tbaa -print-no-aliases -print-may-aliases -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -tbaa -basicaa -aa-eval -evaluate-aa-metadata -print-no-aliases -print-may-aliases -disable-output 2>&1 | FileCheck %s
 ; RUN: opt < %s -tbaa -basicaa -gvn -S | FileCheck %s --check-prefix=OPT
 ; Generated from clang/test/CodeGen/tbaa.cpp with "-O1 -struct-path-tbaa -disable-llvm-optzns".
 

diff --git a/test/Assembler/2002-03-08-NameCollision.ll b/test/Assembler/2002-03-08-NameCollision.ll
index b49789b..089d3fb 100644
--- a/test/Assembler/2002-03-08-NameCollision.ll
+++ b/test/Assembler/2002-03-08-NameCollision.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 ; Method arguments were being checked for collisions at the global scope before
 ; the method object was created by the parser.  Because of this, false

diff --git a/test/Assembler/2002-03-08-NameCollision2.ll b/test/Assembler/2002-03-08-NameCollision2.ll
index 1f7a4e1..dc98a36 100644
--- a/test/Assembler/2002-03-08-NameCollision2.ll
+++ b/test/Assembler/2002-03-08-NameCollision2.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 ; Another name collision problem.  Here the problem was that if a forward
 ; declaration for a method was found, that this would cause spurious conflicts

diff --git a/test/Assembler/2002-04-07-HexFloatConstants.ll b/test/Assembler/2002-04-07-HexFloatConstants.ll
index b0d7cc0..90ee85a 100644
--- a/test/Assembler/2002-04-07-HexFloatConstants.ll
+++ b/test/Assembler/2002-04-07-HexFloatConstants.ll

@@ -9,6 +9,7 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | opt -constprop | \
 ; RUN: llvm-dis > %t.2
 ; RUN: diff %t.1 %t.2
+; RUN: verify-uselistorder %s
 
 define double @test() {
         %tmp = fmul double 7.200000e+101, 0x427F4000             ; <double> [#uses=1]

diff --git a/test/Assembler/2002-04-07-InfConstant.ll b/test/Assembler/2002-04-07-InfConstant.ll
index 71837c9..6cd5447 100644
--- a/test/Assembler/2002-04-07-InfConstant.ll
+++ b/test/Assembler/2002-04-07-InfConstant.ll

@@ -1,6 +1,7 @@
 ; The output formater prints out 1.0e100 as Inf!
 ;
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | grep 0x7FF0000000000000
+; RUN: verify-uselistorder %s
 
 define float @test() {
         %tmp = fmul float 0x7FF0000000000000, 1.000000e+01               ; <float> [#uses=1]

diff --git a/test/Assembler/2002-04-29-NameBinding.ll b/test/Assembler/2002-04-29-NameBinding.ll
index 7960c20..960209b 100644
--- a/test/Assembler/2002-04-29-NameBinding.ll
+++ b/test/Assembler/2002-04-29-NameBinding.ll

@@ -7,6 +7,7 @@
 ; RUN: opt < %s -globaldce -S | \
 ; RUN:   not grep constant
 ;
+; RUN: verify-uselistorder %s
 
 @v1 = internal constant i32 5           
 

diff --git a/test/Assembler/2002-05-02-InvalidForwardRef.ll b/test/Assembler/2002-05-02-InvalidForwardRef.ll
index 234545c..38c42b3 100644
--- a/test/Assembler/2002-05-02-InvalidForwardRef.ll
+++ b/test/Assembler/2002-05-02-InvalidForwardRef.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 ; It looks like the assembler is not forward resolving the function declaraion
 ; correctly.
 

diff --git a/test/Assembler/2002-07-14-OpaqueType.ll b/test/Assembler/2002-07-14-OpaqueType.ll
index 662fb0f..6256aab 100644
--- a/test/Assembler/2002-07-14-OpaqueType.ll
+++ b/test/Assembler/2002-07-14-OpaqueType.ll

@@ -1,6 +1,7 @@
 ; Test that opaque types are preserved correctly
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis
 ;
+; RUN: verify-uselistorder %s
 
 %Ty = type opaque
 

diff --git a/test/Assembler/2002-07-25-QuoteInString.ll b/test/Assembler/2002-07-25-QuoteInString.ll
index facc5bd..1545680 100644
--- a/test/Assembler/2002-07-25-QuoteInString.ll
+++ b/test/Assembler/2002-07-25-QuoteInString.ll

@@ -1,5 +1,6 @@
 ; Test double quotes in strings work correctly!
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis
 ;
+; RUN: verify-uselistorder %s
 @str = internal global [6 x i8] c"\22foo\22\00"         ; <[6 x i8]*> [#uses=0]
 

diff --git a/test/Assembler/2002-07-25-ReturnPtrFunction.ll b/test/Assembler/2002-07-25-ReturnPtrFunction.ll
index 6988fad..fdee93c 100644
--- a/test/Assembler/2002-07-25-ReturnPtrFunction.ll
+++ b/test/Assembler/2002-07-25-ReturnPtrFunction.ll

@@ -2,6 +2,7 @@
 ; the right thing.
 ;
 ; RUN: llvm-as < %s | llvm-dis | llvm-as
+; RUN: verify-uselistorder %s
 
 declare void (i32)* @foo()
 

diff --git a/test/Assembler/2002-07-31-SlashInString.ll b/test/Assembler/2002-07-31-SlashInString.ll
index ff48258..879a965 100644
--- a/test/Assembler/2002-07-31-SlashInString.ll
+++ b/test/Assembler/2002-07-31-SlashInString.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as 
+; RUN: verify-uselistorder %s
 
 ; Make sure that \\ works in a string initializer
 @Slashtest = internal global [8 x i8] c"\5Cbegin{\00"

diff --git a/test/Assembler/2002-08-15-CastAmbiguity.ll b/test/Assembler/2002-08-15-CastAmbiguity.ll
index c716524..5f952b4 100644
--- a/test/Assembler/2002-08-15-CastAmbiguity.ll
+++ b/test/Assembler/2002-08-15-CastAmbiguity.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 define void @test(i32 %X) {
         call void @test( i32 6 )

diff --git a/test/Assembler/2002-08-15-ConstantExprProblem.ll b/test/Assembler/2002-08-15-ConstantExprProblem.ll
index 02b9ea9..343a104 100644
--- a/test/Assembler/2002-08-15-ConstantExprProblem.ll
+++ b/test/Assembler/2002-08-15-ConstantExprProblem.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 @.LC0 = internal global [12 x i8] c"hello world\00"             ; <[12 x i8]*> [#uses=1]
 

diff --git a/test/Assembler/2002-08-15-UnresolvedGlobalReference.ll b/test/Assembler/2002-08-15-UnresolvedGlobalReference.ll
index 2ba3f14..6bbe0cf 100644
--- a/test/Assembler/2002-08-15-UnresolvedGlobalReference.ll
+++ b/test/Assembler/2002-08-15-UnresolvedGlobalReference.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 @.LC0 = internal global [12 x i8] c"hello world\00"             ; <[12 x i8]*> [#uses=1]
 

diff --git a/test/Assembler/2002-08-16-ConstExprInlined.ll b/test/Assembler/2002-08-16-ConstExprInlined.ll
index f233bac..b78d858 100644
--- a/test/Assembler/2002-08-16-ConstExprInlined.ll
+++ b/test/Assembler/2002-08-16-ConstExprInlined.ll

@@ -9,6 +9,7 @@
 ; around!
 ;
 ; RUN: llvm-as < %s | llvm-dis | llvm-as
+; RUN: verify-uselistorder %s
 
 @.LC0 = internal global [4 x i8] c"foo\00"		; <[4 x i8]*> [#uses=1]
 @X = global i8* null		; <i8**> [#uses=0]

diff --git a/test/Assembler/2002-08-19-BytecodeReader.ll b/test/Assembler/2002-08-19-BytecodeReader.ll
index e211014..0722885 100644
--- a/test/Assembler/2002-08-19-BytecodeReader.ll
+++ b/test/Assembler/2002-08-19-BytecodeReader.ll

@@ -2,6 +2,7 @@
 ; "crafty" spec benchmark.
 ;
 ; RUN: opt < %s -instcombine | llvm-dis
+; RUN: verify-uselistorder %s
 	
 %CHESS_POSITION = type { i32, i32 }
 @pawn_probes = external global i32		; <i32*> [#uses=0]

diff --git a/test/Assembler/2002-08-22-DominanceProblem.ll b/test/Assembler/2002-08-22-DominanceProblem.ll
index 0dc192d..5048610 100644
--- a/test/Assembler/2002-08-22-DominanceProblem.ll
+++ b/test/Assembler/2002-08-22-DominanceProblem.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 ; Dominance relationships is not calculated correctly for unreachable blocks,
 ; which causes the verifier to barf on this input.

diff --git a/test/Assembler/2002-10-08-LargeArrayPerformance.ll b/test/Assembler/2002-10-08-LargeArrayPerformance.ll
index 34a9932..acd9280 100644
--- a/test/Assembler/2002-10-08-LargeArrayPerformance.ll
+++ b/test/Assembler/2002-10-08-LargeArrayPerformance.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 ; This testcase comes from the following really simple c file:
 ;; int foo[30000]
 ;;; We should not be soo slow for such a simple case!

diff --git a/test/Assembler/2002-10-13-ConstantEncodingProblem.ll b/test/Assembler/2002-10-13-ConstantEncodingProblem.ll
index bf3a521..a0f7b3d 100644
--- a/test/Assembler/2002-10-13-ConstantEncodingProblem.ll
+++ b/test/Assembler/2002-10-13-ConstantEncodingProblem.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 
 %Domain = type { %Domain**, %Domain* }
 @D = global %Domain zeroinitializer             ; <%Domain*> [#uses=0]

diff --git a/test/Assembler/2002-12-15-GlobalResolve.ll b/test/Assembler/2002-12-15-GlobalResolve.ll
index a873a61..87608cc 100644
--- a/test/Assembler/2002-12-15-GlobalResolve.ll
+++ b/test/Assembler/2002-12-15-GlobalResolve.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 @X = external global i32*
 @X1 = external global %T* 

diff --git a/test/Assembler/2003-01-30-UnsignedString.ll b/test/Assembler/2003-01-30-UnsignedString.ll
index 3c14d71..27550ad 100644
--- a/test/Assembler/2003-01-30-UnsignedString.ll
+++ b/test/Assembler/2003-01-30-UnsignedString.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 @spell_order = global [4 x i8] c"\FF\00\F7\00"
 

diff --git a/test/Assembler/2003-04-25-UnresolvedGlobalReference.ll b/test/Assembler/2003-04-25-UnresolvedGlobalReference.ll
index f1a5ed7..61fd911 100644
--- a/test/Assembler/2003-04-25-UnresolvedGlobalReference.ll
+++ b/test/Assembler/2003-04-25-UnresolvedGlobalReference.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 ; There should be absolutely no problem with this testcase.
 
 define i32 @test(i32 %arg1, i32 %arg2) {

diff --git a/test/Assembler/2003-05-03-BytecodeReaderProblem.ll b/test/Assembler/2003-05-03-BytecodeReaderProblem.ll
index f4a6911..5cd57ea 100644
--- a/test/Assembler/2003-05-03-BytecodeReaderProblem.ll
+++ b/test/Assembler/2003-05-03-BytecodeReaderProblem.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 
 define void @test() {
         %tmp.123 = trunc i64 0 to i32           ; <i32> [#uses=0]

diff --git a/test/Assembler/2003-05-12-MinIntProblem.ll b/test/Assembler/2003-05-12-MinIntProblem.ll
index ebe1690..1064a76 100644
--- a/test/Assembler/2003-05-12-MinIntProblem.ll
+++ b/test/Assembler/2003-05-12-MinIntProblem.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | grep -- -2147483648
+; RUN: verify-uselistorder %s
 
 define i32 @foo() {
         ret i32 -2147483648

diff --git a/test/Assembler/2003-05-15-AssemblerProblem.ll b/test/Assembler/2003-05-15-AssemblerProblem.ll
index 146ce65..eba26a2 100644
--- a/test/Assembler/2003-05-15-AssemblerProblem.ll
+++ b/test/Assembler/2003-05-15-AssemblerProblem.ll

@@ -1,6 +1,7 @@
 ; This bug was caused by two CPR's existing for the same global variable, 
 ; colliding in the Module level CPR map.
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 define void @test() {
         call void (...)* bitcast (void (i16*, i32)* @AddString to void (...)*)( i16* null, i32 0 )

diff --git a/test/Assembler/2003-05-15-SwitchBug.ll b/test/Assembler/2003-05-15-SwitchBug.ll
index 3768d9c..432be81 100644
--- a/test/Assembler/2003-05-15-SwitchBug.ll
+++ b/test/Assembler/2003-05-15-SwitchBug.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 ; Check minimal switch statement
 

diff --git a/test/Assembler/2003-05-21-ConstantShiftExpr.ll b/test/Assembler/2003-05-21-ConstantShiftExpr.ll
index 40b9651..5b8e5d2 100644
--- a/test/Assembler/2003-05-21-ConstantShiftExpr.ll
+++ b/test/Assembler/2003-05-21-ConstantShiftExpr.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 ; Test that shift instructions can be used in constant expressions.
 
 global i32 3670016

diff --git a/test/Assembler/2003-05-21-EmptyStructTest.ll b/test/Assembler/2003-05-21-EmptyStructTest.ll
index 26e83d9..934e32a 100644
--- a/test/Assembler/2003-05-21-EmptyStructTest.ll
+++ b/test/Assembler/2003-05-21-EmptyStructTest.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 ; The old C front-end never generated empty structures, now the new one
 ; can.  For some reason we never handled them in the parser. Weird.

diff --git a/test/Assembler/2003-08-20-ConstantExprGEP-Fold.ll b/test/Assembler/2003-08-20-ConstantExprGEP-Fold.ll
index 50cdeed..911f0ff 100644
--- a/test/Assembler/2003-08-20-ConstantExprGEP-Fold.ll
+++ b/test/Assembler/2003-08-20-ConstantExprGEP-Fold.ll

@@ -1,4 +1,5 @@
 ; RUN: opt < %s -instcombine -simplifycfg -S | not grep br
+; RUN: verify-uselistorder %s
 
 @.str_1 = internal constant [6 x i8] c"_Bool\00"                ; <[6 x i8]*> [#uses=2]
 

diff --git a/test/Assembler/2003-08-21-ConstantExprCast-Fold.ll b/test/Assembler/2003-08-21-ConstantExprCast-Fold.ll
index b76f774..926d4ed 100644
--- a/test/Assembler/2003-08-21-ConstantExprCast-Fold.ll
+++ b/test/Assembler/2003-08-21-ConstantExprCast-Fold.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | not grep getelementptr
+; RUN: verify-uselistorder %s
 
 @A = external global { float }          ; <{ float }*> [#uses=2]
 global i32* bitcast ({ float }* @A to i32*)             ; <i32**>:0 [#uses=0]

diff --git a/test/Assembler/2003-11-05-ConstantExprShift.ll b/test/Assembler/2003-11-05-ConstantExprShift.ll
index 86b093e..ddfceca 100644
--- a/test/Assembler/2003-11-05-ConstantExprShift.ll
+++ b/test/Assembler/2003-11-05-ConstantExprShift.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 
 define i32 @test() {
         ret i32 ashr (i32 ptrtoint (i32 ()* @test to i32), i32 2)

diff --git a/test/Assembler/2003-11-12-ConstantExprCast.ll b/test/Assembler/2003-11-12-ConstantExprCast.ll
index 47a5353..c9ad266 100644
--- a/test/Assembler/2003-11-12-ConstantExprCast.ll
+++ b/test/Assembler/2003-11-12-ConstantExprCast.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | not grep " bitcast ("
+; RUN: verify-uselistorder %s
 
 @.Base64_1 = external constant [4 x i8]         ; <[4 x i8]*> [#uses=1]
 

diff --git a/test/Assembler/2004-01-11-getelementptrfolding.ll b/test/Assembler/2004-01-11-getelementptrfolding.ll
index 5249d0e..188a95f 100644
--- a/test/Assembler/2004-01-11-getelementptrfolding.ll
+++ b/test/Assembler/2004-01-11-getelementptrfolding.ll

@@ -1,5 +1,6 @@
 ; RUN: llvm-as < %s | llvm-dis | \
 ; RUN:   not grep "getelementptr.*getelementptr"
+; RUN: verify-uselistorder %s
 
 %struct.TTriangleItem = type { i8*, i8*, [3 x %struct.TUVVertex] }
 %struct.TUVVertex = type { i16, i16, i16, i16 }

diff --git a/test/Assembler/2004-01-20-MaxLongLong.ll b/test/Assembler/2004-01-20-MaxLongLong.ll
index 8af5332..23eb402 100644
--- a/test/Assembler/2004-01-20-MaxLongLong.ll
+++ b/test/Assembler/2004-01-20-MaxLongLong.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | grep 9223372036854775808
+; RUN: verify-uselistorder %s
 
 global i64 -9223372036854775808
 

diff --git a/test/Assembler/2004-02-01-NegativeZero.ll b/test/Assembler/2004-02-01-NegativeZero.ll
index b28930f..98bd4cb 100644
--- a/test/Assembler/2004-02-01-NegativeZero.ll
+++ b/test/Assembler/2004-02-01-NegativeZero.ll

@@ -1,5 +1,9 @@
-; RUN: llvm-as < %s | llvm-dis | grep -- -0.0
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
+; CHECK: global double -0.000000e+00
 global double 0x8000000000000000
+
+; CHECK: global float -0.000000e+00
 global float -0.0
 

diff --git a/test/Assembler/2004-02-27-SelfUseAssertError.ll b/test/Assembler/2004-02-27-SelfUseAssertError.ll
index 7052eac..252a1b2 100644
--- a/test/Assembler/2004-02-27-SelfUseAssertError.ll
+++ b/test/Assembler/2004-02-27-SelfUseAssertError.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 ; %inc2 uses it's own value, but that's ok, as it's unreachable!
 

diff --git a/test/Assembler/2004-03-07-FunctionAddressAlignment.ll b/test/Assembler/2004-03-07-FunctionAddressAlignment.ll
index e3bf0bb..7fa0802 100644
--- a/test/Assembler/2004-03-07-FunctionAddressAlignment.ll
+++ b/test/Assembler/2004-03-07-FunctionAddressAlignment.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | not grep ptrtoint
+; RUN: verify-uselistorder %s
 ; All of these should be eliminable
 
 

diff --git a/test/Assembler/2004-04-04-GetElementPtrIndexTypes.ll b/test/Assembler/2004-04-04-GetElementPtrIndexTypes.ll
index ab46f88..a86fe63 100644
--- a/test/Assembler/2004-04-04-GetElementPtrIndexTypes.ll
+++ b/test/Assembler/2004-04-04-GetElementPtrIndexTypes.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 define i32* @t1({ float, i32 }* %X) {
         %W = getelementptr { float, i32 }* %X, i32 20, i32 1            ; <i32*> [#uses=0]

diff --git a/test/Assembler/2004-06-07-VerifierBug.ll b/test/Assembler/2004-06-07-VerifierBug.ll
index 07d2383..090599a 100644
--- a/test/Assembler/2004-06-07-VerifierBug.ll
+++ b/test/Assembler/2004-06-07-VerifierBug.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s > /dev/null
+; RUN: verify-uselistorder %s
 
 define void @t() {
 entry:

diff --git a/test/Assembler/2004-10-22-BCWriterUndefBug.ll b/test/Assembler/2004-10-22-BCWriterUndefBug.ll
index 694b80b..b934131 100644
--- a/test/Assembler/2004-10-22-BCWriterUndefBug.ll
+++ b/test/Assembler/2004-10-22-BCWriterUndefBug.ll

@@ -1,5 +1,6 @@
 ;; The bytecode writer was trying to treat undef values as ConstantArray's when
 ;; they looked like strings.
 ;; RUN: llvm-as %s -o /dev/null
+;; RUN: verify-uselistorder %s
 @G = internal global [8 x i8] undef
 

diff --git a/test/Assembler/2004-11-28-InvalidTypeCrash.ll b/test/Assembler/2004-11-28-InvalidTypeCrash.ll
index 4db5b74..7260f19 100644
--- a/test/Assembler/2004-11-28-InvalidTypeCrash.ll
+++ b/test/Assembler/2004-11-28-InvalidTypeCrash.ll

@@ -1,4 +1,5 @@
 ; Test for PR463.  This program is erroneous, but should not crash llvm-as.
-; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "use of undefined type named 'struct.none'"
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+; CHECK: use of undefined type named 'struct.none'
 
 @.FOO  = internal global %struct.none zeroinitializer

diff --git a/test/Assembler/2005-01-03-FPConstantDisassembly.ll b/test/Assembler/2005-01-03-FPConstantDisassembly.ll
index aaa776f..643d04c 100644
--- a/test/Assembler/2005-01-03-FPConstantDisassembly.ll
+++ b/test/Assembler/2005-01-03-FPConstantDisassembly.ll

@@ -1,6 +1,8 @@
-; RUN: llvm-as < %s | llvm-dis | grep 1.0
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 define double @test() {
+; CHECK: ret double 1.0
         ret double 1.0   ;; This should not require hex notation
 }
 

diff --git a/test/Assembler/2005-01-31-CallingAggregateFunction.ll b/test/Assembler/2005-01-31-CallingAggregateFunction.ll
index ce769a2..a5a917d 100644
--- a/test/Assembler/2005-01-31-CallingAggregateFunction.ll
+++ b/test/Assembler/2005-01-31-CallingAggregateFunction.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 define void @test() {
 	call {i32} @foo()

diff --git a/test/Assembler/2005-05-05-OpaqueUndefValues.ll b/test/Assembler/2005-05-05-OpaqueUndefValues.ll
index 8cd1419..01456f1 100644
--- a/test/Assembler/2005-05-05-OpaqueUndefValues.ll
+++ b/test/Assembler/2005-05-05-OpaqueUndefValues.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as > /dev/null
+; RUN: verify-uselistorder %s
 
 %t = type opaque
 @x = global %t undef

diff --git a/test/Assembler/2005-12-21-ZeroInitVector.ll b/test/Assembler/2005-12-21-ZeroInitVector.ll
index d3a692c..edcf605 100644
--- a/test/Assembler/2005-12-21-ZeroInitVector.ll
+++ b/test/Assembler/2005-12-21-ZeroInitVector.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s > /dev/null
+; RUN: verify-uselistorder %s
 
 define <4 x i32> @foo() {
         ret <4 x i32> zeroinitializer

diff --git a/test/Assembler/2006-12-09-Cast-To-Bool.ll b/test/Assembler/2006-12-09-Cast-To-Bool.ll
index a70262c..91abe77 100644
--- a/test/Assembler/2006-12-09-Cast-To-Bool.ll
+++ b/test/Assembler/2006-12-09-Cast-To-Bool.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | grep bitcast
+; RUN: verify-uselistorder %s
 
 define i1 @main(i32 %X) {
   %res = bitcast i1 true to i1

diff --git a/test/Assembler/2007-01-02-Undefined-Arg-Type.ll b/test/Assembler/2007-01-02-Undefined-Arg-Type.ll
index 184e543..a0542ee 100644
--- a/test/Assembler/2007-01-02-Undefined-Arg-Type.ll
+++ b/test/Assembler/2007-01-02-Undefined-Arg-Type.ll

@@ -1,5 +1,7 @@
 ; The assembler should catch an undefined argument type .
-; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "use of undefined type named 'typedef.bc_struct'"
+; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: use of undefined type named 'typedef.bc_struct'
 
 ; %typedef.bc_struct = type opaque
 

diff --git a/test/Assembler/2007-01-05-Cmp-ConstExpr.ll b/test/Assembler/2007-01-05-Cmp-ConstExpr.ll
index e3f67ba..54a4372 100644
--- a/test/Assembler/2007-01-05-Cmp-ConstExpr.ll
+++ b/test/Assembler/2007-01-05-Cmp-ConstExpr.ll

@@ -1,5 +1,6 @@
 ; Test Case for PR1080
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 @str = internal constant [4 x i8] c"-ga\00"             ; <[4 x i8]*> [#uses=2]
 

diff --git a/test/Assembler/2007-03-19-NegValue.ll b/test/Assembler/2007-03-19-NegValue.ll
index 64eb3cb..a2deac2 100644
--- a/test/Assembler/2007-03-19-NegValue.ll
+++ b/test/Assembler/2007-03-19-NegValue.ll

@@ -1,7 +1,9 @@
 ; Test whether negative values > 64 bits retain their negativeness.
-; RUN: llvm-as < %s | llvm-dis | grep "add i65.*, -1"
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 define i65 @testConsts(i65 %N) {
+; CHECK: add i65 %N, -1
   %a = add i65 %N, -1
   ret i65 %a
 }

diff --git a/test/Assembler/2007-04-20-AlignedLoad.ll b/test/Assembler/2007-04-20-AlignedLoad.ll
index 98a5428..bcf65fd 100644
--- a/test/Assembler/2007-04-20-AlignedLoad.ll
+++ b/test/Assembler/2007-04-20-AlignedLoad.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | grep "align 1024"
+; RUN: verify-uselistorder %s
 
 define i32 @test(i32* %arg) {
 entry:

diff --git a/test/Assembler/2007-04-20-AlignedStore.ll b/test/Assembler/2007-04-20-AlignedStore.ll
index 9e4dd9f..9605af2 100644
--- a/test/Assembler/2007-04-20-AlignedStore.ll
+++ b/test/Assembler/2007-04-20-AlignedStore.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | grep "align 1024"
+; RUN: verify-uselistorder %s
 
 define void @test(i32* %arg) {
 entry:

diff --git a/test/Assembler/2007-04-25-AssemblerFoldExternWeak.ll b/test/Assembler/2007-04-25-AssemblerFoldExternWeak.ll
index b0ca1aa..7c73abc 100644
--- a/test/Assembler/2007-04-25-AssemblerFoldExternWeak.ll
+++ b/test/Assembler/2007-04-25-AssemblerFoldExternWeak.ll

@@ -1,5 +1,8 @@
-; RUN: llvm-as < %s | llvm-dis | grep "icmp.*test_weak.*null"
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 ; PR1358
+
+; CHECK: icmp ne (i32 (...)* @test_weak, i32 (...)* null)
 @G = global i1 icmp ne (i32 (...)* @test_weak, i32 (...)* null)
 
 declare extern_weak i32 @test_weak(...)

diff --git a/test/Assembler/2007-05-21-Escape.ll b/test/Assembler/2007-05-21-Escape.ll
index 0868133..9716244 100644
--- a/test/Assembler/2007-05-21-Escape.ll
+++ b/test/Assembler/2007-05-21-Escape.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis
+; RUN: verify-uselistorder %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "x86_64-apple-darwin8"
 	%struct.bar = type { i32 }

diff --git a/test/Assembler/2007-07-19-ParamAttrAmbiguity.ll b/test/Assembler/2007-07-19-ParamAttrAmbiguity.ll
index 9c7daa8..68aeef2 100644
--- a/test/Assembler/2007-07-19-ParamAttrAmbiguity.ll
+++ b/test/Assembler/2007-07-19-ParamAttrAmbiguity.ll

@@ -1,5 +1,6 @@
 ; PR1553
 ; RUN: llvm-as < %s > /dev/null
+; RUN: verify-uselistorder %s
 define void @bar() {
         %t = call i8 @foo( i8 10 )
         zext i8 %t to i32

diff --git a/test/Assembler/2007-09-10-AliasFwdRef.ll b/test/Assembler/2007-09-10-AliasFwdRef.ll
index 2ebfc27..8e0a571 100644
--- a/test/Assembler/2007-09-10-AliasFwdRef.ll
+++ b/test/Assembler/2007-09-10-AliasFwdRef.ll

@@ -1,8 +1,9 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 ; PR1645
 
 @__gthread_active_ptr.5335 = internal constant i8* bitcast (i32 (i32)* @__gthrw_pthread_cancel to i8*)    
-@__gthrw_pthread_cancel = alias weak i32 (i32)* @pthread_cancel   
+@__gthrw_pthread_cancel = weak alias i32 (i32)* @pthread_cancel
 
 
 

diff --git a/test/Assembler/2007-09-29-GC.ll b/test/Assembler/2007-09-29-GC.ll
index 9aefd0b..f2cafbc 100644
--- a/test/Assembler/2007-09-29-GC.ll
+++ b/test/Assembler/2007-09-29-GC.ll

@@ -1,5 +1,9 @@
-; RUN: llvm-as < %s | llvm-dis | grep "@f.*gc.*shadowstack"
-; RUN: llvm-as < %s | llvm-dis | grep "@g.*gc.*java"
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
+
+
+; CHECK: define void @f() gc "shadowstack"
+; CHECK: define void @g() gc "java"
 
 define void @f() gc "shadowstack" {
 entry:

diff --git a/test/Assembler/2007-12-11-AddressSpaces.ll b/test/Assembler/2007-12-11-AddressSpaces.ll
index 7c9b5b5..f860f57 100644
--- a/test/Assembler/2007-12-11-AddressSpaces.ll
+++ b/test/Assembler/2007-12-11-AddressSpaces.ll

@@ -3,6 +3,7 @@
 ; RUN: llvm-as < %s | llvm-dis | grep "addrspace(66)" | count 2
 ; RUN: llvm-as < %s | llvm-dis | grep "addrspace(11)" | count 6
 ; RUN: llvm-as < %s | llvm-dis | grep "addrspace(22)" | count 5
+; RUN: verify-uselistorder %s
 
 	%struct.mystruct = type { i32, i32 addrspace(33)*, i32, i32 addrspace(33)* }
 @input = weak addrspace(42) global %struct.mystruct zeroinitializer  		; <%struct.mystruct addrspace(42)*> [#uses=1]

diff --git a/test/Assembler/2008-01-11-VarargAttrs.ll b/test/Assembler/2008-01-11-VarargAttrs.ll
index c0aedc8..0b6592c 100644
--- a/test/Assembler/2008-01-11-VarargAttrs.ll
+++ b/test/Assembler/2008-01-11-VarargAttrs.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | grep byval
+; RUN: verify-uselistorder %s
 
 	%struct = type {  }
 

diff --git a/test/Assembler/2008-07-10-APInt.ll b/test/Assembler/2008-07-10-APInt.ll
index 99347e9..fe3608d 100644
--- a/test/Assembler/2008-07-10-APInt.ll
+++ b/test/Assembler/2008-07-10-APInt.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 ; PR2538
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
 target triple = "i686-pc-linux-gnu"

diff --git a/test/Assembler/2008-09-02-FunctionNotes.ll b/test/Assembler/2008-09-02-FunctionNotes.ll
index 11a0411..a629c93 100644
--- a/test/Assembler/2008-09-02-FunctionNotes.ll
+++ b/test/Assembler/2008-09-02-FunctionNotes.ll

@@ -1,5 +1,6 @@
 ; Test function attributes
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; CHECK: define void @fn1() #0
 define void @fn1() alwaysinline {

diff --git a/test/Assembler/2008-09-29-RetAttr.ll b/test/Assembler/2008-09-29-RetAttr.ll
index f7db96d..5eb608d 100644
--- a/test/Assembler/2008-09-29-RetAttr.ll
+++ b/test/Assembler/2008-09-29-RetAttr.ll

@@ -1,6 +1,7 @@
 ; Test return attributes
 ; RUN: llvm-as < %s | llvm-dis | grep "define inreg i32"
 ; RUN: llvm-as < %s | llvm-dis | grep "call inreg i32"
+; RUN: verify-uselistorder %s
 
 define inreg i32 @fn1() {
   ret i32 0

diff --git a/test/Assembler/2008-10-14-QuoteInName.ll b/test/Assembler/2008-10-14-QuoteInName.ll
index ccd7779..aa95e79 100644
--- a/test/Assembler/2008-10-14-QuoteInName.ll
+++ b/test/Assembler/2008-10-14-QuoteInName.ll

@@ -1,3 +1,4 @@
 ; RUN: llvm-as < %s | llvm-dis | grep "quote"
+; RUN: verify-uselistorder %s
 
 @"a\22quote" = global i32 0

diff --git a/test/Assembler/2009-02-01-UnnamedForwardRef.ll b/test/Assembler/2009-02-01-UnnamedForwardRef.ll
index 9c6e20d..5b1d9ee 100644
--- a/test/Assembler/2009-02-01-UnnamedForwardRef.ll
+++ b/test/Assembler/2009-02-01-UnnamedForwardRef.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 ; PR3372
 
 @X = global i32* @0

diff --git a/test/Assembler/2009-02-28-CastOpc.ll b/test/Assembler/2009-02-28-CastOpc.ll
index 6035643..e9d2308 100644
--- a/test/Assembler/2009-02-28-CastOpc.ll
+++ b/test/Assembler/2009-02-28-CastOpc.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 
 
 define void @foo() {

diff --git a/test/Assembler/2009-02-28-StripOpaqueName.ll b/test/Assembler/2009-02-28-StripOpaqueName.ll
index f61a44c..614cc57 100644
--- a/test/Assembler/2009-02-28-StripOpaqueName.ll
+++ b/test/Assembler/2009-02-28-StripOpaqueName.ll

@@ -1,4 +1,5 @@
 ; RUN: opt < %s -strip -S | llvm-as | llvm-dis
+; RUN: verify-uselistorder %s
 
 ; Stripping the name from A should not break references to it.
 %A = type opaque

diff --git a/test/Assembler/2009-03-24-ZextConstantExpr.ll b/test/Assembler/2009-03-24-ZextConstantExpr.ll
index daedb95..98bab4b 100644
--- a/test/Assembler/2009-03-24-ZextConstantExpr.ll
+++ b/test/Assembler/2009-03-24-ZextConstantExpr.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 ; PR3876
 @gdtr = external global [0 x i8]
 

diff --git a/test/Assembler/2009-07-24-ZeroArgGEP.ll b/test/Assembler/2009-07-24-ZeroArgGEP.ll
index 2a3d114..92f4d59 100644
--- a/test/Assembler/2009-07-24-ZeroArgGEP.ll
+++ b/test/Assembler/2009-07-24-ZeroArgGEP.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 @foo = global i32 0
 @bar = constant i32* getelementptr(i32* @foo)

diff --git a/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll b/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll
index 17dd745..5cb869d 100644
--- a/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll
+++ b/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll

@@ -1,4 +1,5 @@
-; RUN: opt -std-compile-opts < %s | llvm-dis | not grep badref
+; RUN: opt -O3 < %s | llvm-dis | not grep badref
+; RUN: verify-uselistorder %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.2"
@@ -11,7 +12,7 @@
 
 define i32 @main() nounwind readonly {
   %diff1 = alloca i64                             ; <i64*> [#uses=2]
-  call void @llvm.dbg.declare(metadata !{i64* %diff1}, metadata !0)
+  call void @llvm.dbg.declare(metadata !{i64* %diff1}, metadata !0, metadata !{metadata !"0x102"})
   store i64 72, i64* %diff1, align 8
   %v1 = load %struct.test** @TestArrayPtr, align 8 ; <%struct.test*> [#uses=1]
   %v2 = ptrtoint %struct.test* %v1 to i64 ; <i64> [#uses=1]
@@ -20,15 +21,15 @@
   ret i32 4
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !7 = metadata !{metadata !1}
-!6 = metadata !{i32 786449, metadata !8, i32 12, metadata !"clang version 3.0 (trunk 131941)", i1 true, metadata !"", i32 0, metadata !9, metadata !9, metadata !7, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!0 = metadata !{i32 786688, metadata !1, metadata !"c", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{i32 786478, metadata !8, metadata !2, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !8, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 131941)\001\00\000\00\000", metadata !8, metadata !9, metadata !9, metadata !7, null, null} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x100\00c\002\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!1 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\00256\000\001", metadata !8, metadata !2, metadata !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !6, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !6} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b"}
 !9 = metadata !{i32 0}

diff --git a/test/Assembler/ConstantExprFold.ll b/test/Assembler/ConstantExprFold.ll
index fc18ce7..dc41331 100644
--- a/test/Assembler/ConstantExprFold.ll
+++ b/test/Assembler/ConstantExprFold.ll

@@ -2,6 +2,7 @@
 ; situations
 
 ; RUN: llvm-as < %s | llvm-dis | not grep "("
+; RUN: verify-uselistorder %s
 
 @A = global i64 0
 

diff --git a/test/Assembler/ConstantExprFoldCast.ll b/test/Assembler/ConstantExprFoldCast.ll
index 161a4ca..094f87b 100644
--- a/test/Assembler/ConstantExprFoldCast.ll
+++ b/test/Assembler/ConstantExprFoldCast.ll

@@ -1,6 +1,7 @@
 ; This test checks to make sure that constant exprs fold in some simple situations
 
 ; RUN: llvm-as < %s | llvm-dis | not grep cast
+; RUN: verify-uselistorder %s
 
 @A = global i32* bitcast (i8* null to i32*)  ; Cast null -> fold
 @B = global i32** bitcast (i32** @A to i32**)   ; Cast to same type -> fold

diff --git a/test/Assembler/ConstantExprFoldSelect.ll b/test/Assembler/ConstantExprFoldSelect.ll
index b000e02..5d218a9 100644
--- a/test/Assembler/ConstantExprFoldSelect.ll
+++ b/test/Assembler/ConstantExprFoldSelect.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 ; PR18319
 
 define void @function() {

diff --git a/test/Assembler/ConstantExprNoFold.ll b/test/Assembler/ConstantExprNoFold.ll
index b41959f..8d03e7a 100644
--- a/test/Assembler/ConstantExprNoFold.ll
+++ b/test/Assembler/ConstantExprNoFold.ll

@@ -2,6 +2,7 @@
 ; situations
 
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; Even give it a datalayout, to tempt folding as much as possible.
 target datalayout = "p:32:32"
@@ -24,3 +25,15 @@
 
 ; CHECK: @E = global i64 addrspace(1)* addrspacecast (i64* @A to i64 addrspace(1)*)
 @E = global i64 addrspace(1)* addrspacecast(i64* @A to i64 addrspace(1)*)
+
+; Don't add an inbounds on @weak.gep, since @weak may be null.
+; CHECK: @weak.gep = global i32* getelementptr (i32* @weak, i32 1)
+@weak.gep = global i32* getelementptr (i32* @weak, i32 1)
+@weak = extern_weak global i32
+
+; Don't add an inbounds on @glob.a3, since it's not inbounds.
+; CHECK: @glob.a3 = alias getelementptr (i32* @glob.a2, i32 1)
+@glob = global i32 0
+@glob.a3 = alias getelementptr (i32* @glob.a2, i32 1)
+@glob.a2 = alias getelementptr (i32* @glob.a1, i32 1)
+@glob.a1 = alias i32* @glob

diff --git a/test/Assembler/MultipleReturnValueType.ll b/test/Assembler/MultipleReturnValueType.ll
index 6177143..5812632 100644
--- a/test/Assembler/MultipleReturnValueType.ll
+++ b/test/Assembler/MultipleReturnValueType.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s
+; RUN: verify-uselistorder %s
 
         %struct.S_102 = type { float, float }
 

diff --git a/test/Assembler/addrspacecast-alias.ll b/test/Assembler/addrspacecast-alias.ll
index d751659..745e525 100644
--- a/test/Assembler/addrspacecast-alias.ll
+++ b/test/Assembler/addrspacecast-alias.ll

@@ -1,7 +1,8 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; Test that global aliases are allowed to be constant addrspacecast
 
 @i = internal addrspace(1) global i8 42
-@ia = alias internal addrspacecast (i8 addrspace(1)* @i to i8 addrspace(2)* addrspace(3)*)
-; CHECK: @ia = alias internal addrspacecast (i8 addrspace(2)* addrspace(1)* bitcast (i8 addrspace(1)* @i to i8 addrspace(2)* addrspace(1)*) to i8 addrspace(2)* addrspace(3)*)
+@ia = internal alias addrspacecast (i8 addrspace(1)* @i to i8 addrspace(2)* addrspace(3)*)
+; CHECK: @ia = internal alias addrspacecast (i8 addrspace(2)* addrspace(1)* bitcast (i8 addrspace(1)* @i to i8 addrspace(2)* addrspace(1)*) to i8 addrspace(2)* addrspace(3)*)

diff --git a/test/Assembler/aggregate-constant-values.ll b/test/Assembler/aggregate-constant-values.ll
index d0aab81..9e68e06 100644
--- a/test/Assembler/aggregate-constant-values.ll
+++ b/test/Assembler/aggregate-constant-values.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; CHECK: @foo
 ; CHECK: store { i32, i32 } { i32 7, i32 9 }, { i32, i32 }* %x

diff --git a/test/Assembler/aggregate-return-single-value.ll b/test/Assembler/aggregate-return-single-value.ll
index 04540b5..a77c250 100644
--- a/test/Assembler/aggregate-return-single-value.ll
+++ b/test/Assembler/aggregate-return-single-value.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 
 define { i32 } @foob() nounwind {
   ret {i32}{ i32 0 }

diff --git a/test/Assembler/alias-use-list-order.ll b/test/Assembler/alias-use-list-order.ll
new file mode 100644
index 0000000..d29fd6e
--- /dev/null
+++ b/test/Assembler/alias-use-list-order.ll

@@ -0,0 +1,11 @@
+; RUN: verify-uselistorder < %s
+
+; Globals.
+@global = global i32 0
+@alias.ref1 = global i32* getelementptr inbounds (i32* @alias, i64 1)
+@alias.ref2 = global i32* getelementptr inbounds (i32* @alias, i64 1)
+
+; Aliases.
+@alias = alias i32* @global
+@alias.ref3 = alias i32* getelementptr inbounds (i32* @alias, i64 1)
+@alias.ref4 = alias i32* getelementptr inbounds (i32* @alias, i64 1)

diff --git a/test/Assembler/align-inst.ll b/test/Assembler/align-inst.ll
index 6f7100e..1952fbc 100644
--- a/test/Assembler/align-inst.ll
+++ b/test/Assembler/align-inst.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 @A = global i1 0, align 536870912
 

diff --git a/test/Assembler/alignstack.ll b/test/Assembler/alignstack.ll
index 9f2059f..784f44a 100644
--- a/test/Assembler/alignstack.ll
+++ b/test/Assembler/alignstack.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin10.0"
 

diff --git a/test/Assembler/anon-functions.ll b/test/Assembler/anon-functions.ll
index ac06e8c..42eea83 100644
--- a/test/Assembler/anon-functions.ll
+++ b/test/Assembler/anon-functions.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis
+; RUN: verify-uselistorder %s
 ; PR3611
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

diff --git a/test/Assembler/atomic.ll b/test/Assembler/atomic.ll
index d7ccd99..0356f5f 100644
--- a/test/Assembler/atomic.ll
+++ b/test/Assembler/atomic.ll

@@ -1,4 +1,5 @@
 ; RUN: opt < %s | opt -S | FileCheck %s
+; RUN: verify-uselistorder %s
 ; Basic smoke test for atomic operations.
 
 define void @f(i32* %x) {

diff --git a/test/Assembler/auto_upgrade_intrinsics.ll b/test/Assembler/auto_upgrade_intrinsics.ll
index 8f655ce..f16e5fe 100644
--- a/test/Assembler/auto_upgrade_intrinsics.ll
+++ b/test/Assembler/auto_upgrade_intrinsics.ll

@@ -1,5 +1,6 @@
 ; Test to make sure intrinsics are automatically upgraded.
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 declare i8 @llvm.ctlz.i8(i8)
 declare i16 @llvm.ctlz.i16(i16)

diff --git a/test/Assembler/bcwrap.ll b/test/Assembler/bcwrap.ll
index 4bec48c..bc260ab 100644
--- a/test/Assembler/bcwrap.ll
+++ b/test/Assembler/bcwrap.ll

@@ -1,5 +1,6 @@
 ; RUN: llvm-as < %s > %t
 ; RUN: llvm-nm %t | FileCheck %s
+; RUN: verify-uselistorder %s
 ; Test for isBitcodeFile, llvm-nm must read from a file for this test.
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin9.2.2"

diff --git a/test/Assembler/comment.ll b/test/Assembler/comment.ll
index 16362ab..edf0989 100644
--- a/test/Assembler/comment.ll
+++ b/test/Assembler/comment.ll

@@ -1,5 +1,6 @@
 ; RUN: llvm-as < %s | llvm-dis -show-annotations | FileCheck -check-prefix=ANNOT %s
 ; RUN: llvm-as < %s | llvm-dis | FileCheck -check-prefix=BARE %s
+; RUN: verify-uselistorder %s
 
 ; The bare version of this file should not have any #uses lines.
 ; BARE: @B =

diff --git a/test/Assembler/externally-initialized.ll b/test/Assembler/externally-initialized.ll
index 4be6e62..ea93367 100644
--- a/test/Assembler/externally-initialized.ll
+++ b/test/Assembler/externally-initialized.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; CHECK: @G = externally_initialized global i32 0
 

diff --git a/test/Assembler/fast-math-flags.ll b/test/Assembler/fast-math-flags.ll
index 3a116c5..8e75bdf 100644
--- a/test/Assembler/fast-math-flags.ll
+++ b/test/Assembler/fast-math-flags.ll

@@ -1,5 +1,6 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 ; RUN: opt -S < %s | FileCheck %s
+; RUN: verify-uselistorder %s
 
 @addr   = external global i64
 @select = external global i1

diff --git a/test/Assembler/flags.ll b/test/Assembler/flags.ll
index 310b807..e74311a 100644
--- a/test/Assembler/flags.ll
+++ b/test/Assembler/flags.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 @addr = external global i64
 

diff --git a/test/Assembler/functionlocal-metadata.ll b/test/Assembler/functionlocal-metadata.ll
index f9b1d74..c46233a 100644
--- a/test/Assembler/functionlocal-metadata.ll
+++ b/test/Assembler/functionlocal-metadata.ll

@@ -1,34 +1,35 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 define void @Foo(i32 %a, i32 %b) {
 entry:
-  call void @llvm.dbg.value(metadata !{ i32* %1 }, i64 16, metadata !2)
-; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata ![[ID2:[0-9]+]])
+  call void @llvm.dbg.value(metadata !{ i32* %1 }, i64 16, metadata !2, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata ![[ID2:[0-9]+]], metadata {{.*}})
   %0 = add i32 %a, 1                              ; <i32> [#uses=1]
   %two = add i32 %b, %0                           ; <i32> [#uses=0]
   %1 = alloca i32                                 ; <i32*> [#uses=1]
 
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32* %1})
-; CHECK: metadata !{i32* %1}, metadata !{i32* %1}
-  call void @llvm.dbg.declare(metadata !{i32 %two}, metadata !{i32 %0})
-; CHECK: metadata !{i32 %two}, metadata !{i32 %0}
-  call void @llvm.dbg.declare(metadata !{i32 %0}, metadata !{i32* %1, i32 %0})
-; CHECK: metadata !{i32 %0}, metadata !{i32* %1, i32 %0}
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32 %b, i32 %0})
-; CHECK: metadata !{i32* %1}, metadata !{i32 %b, i32 %0}
-  call void @llvm.dbg.declare(metadata !{i32 %a}, metadata !{i32 %a, metadata !"foo"})
-; CHECK: metadata !{i32 %a}, metadata !{i32 %a, metadata !"foo"}
-  call void @llvm.dbg.declare(metadata !{i32 %b}, metadata !{metadata !0, i32 %two})
-; CHECK: metadata !{i32 %b}, metadata !{metadata ![[ID0:[0-9]+]], i32 %two}
+  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32* %1}, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32* %1}, metadata {{.*}})
+  call void @llvm.dbg.declare(metadata !{i32 %two}, metadata !{i32 %0}, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata !{i32 %two}, metadata !{i32 %0}, metadata {{.*}})
+  call void @llvm.dbg.declare(metadata !{i32 %0}, metadata !{i32* %1, i32 %0}, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata !{i32 %0}, metadata !{i32* %1, i32 %0}, metadata {{.*}})
+  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32 %b, i32 %0}, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !{i32 %b, i32 %0}, metadata {{.*}})
+  call void @llvm.dbg.declare(metadata !{i32 %a}, metadata !{i32 %a, metadata !"foo"}, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata !{i32 %a}, metadata !{i32 %a, metadata !"foo"}, metadata {{.*}})
+  call void @llvm.dbg.declare(metadata !{i32 %b}, metadata !{metadata !0, i32 %two}, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.declare(metadata !{i32 %b}, metadata !{metadata ![[ID0:[0-9]+]], i32 %two}, metadata {{.*}})
 
-  call void @llvm.dbg.value(metadata !{ i32 %a }, i64 0, metadata !1)
-; CHECK: metadata !{i32 %a}, i64 0, metadata ![[ID1:[0-9]+]]
-  call void @llvm.dbg.value(metadata !{ i32 %0 }, i64 25, metadata !0)
-; CHECK: metadata !{i32 %0}, i64 25, metadata ![[ID0]]
-  call void @llvm.dbg.value(metadata !{ i32* %1 }, i64 16, metadata !3)
-; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata ![[ID3:[0-9]+]])
-  call void @llvm.dbg.value(metadata !3, i64 12, metadata !2)
-; CHECK: metadata ![[ID3]], i64 12, metadata ![[ID2]]
+  call void @llvm.dbg.value(metadata !{ i32 %a }, i64 0, metadata !1, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata ![[ID1:[0-9]+]], metadata {{.*}})
+  call void @llvm.dbg.value(metadata !{ i32 %0 }, i64 25, metadata !0, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata !{i32 %0}, i64 25, metadata ![[ID0]], metadata {{.*}})
+  call void @llvm.dbg.value(metadata !{ i32* %1 }, i64 16, metadata !3, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata ![[ID3:[0-9]+]], metadata {{.*}})
+  call void @llvm.dbg.value(metadata !3, i64 12, metadata !2, metadata !{metadata !"0x102"})
+; CHECK: call void @llvm.dbg.value(metadata ![[ID3]], i64 12, metadata ![[ID2]], metadata {{.*}})
 
   ret void, !foo !0, !bar !1
 ; CHECK: ret void, !foo ![[FOO:[0-9]+]], !bar ![[BAR:[0-9]+]]
@@ -40,10 +41,10 @@
 !1 = metadata !{i32 4, metadata !"foo"}
 !2 = metadata !{metadata !"bar"}
 !3 = metadata !{metadata !"foo"}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !foo = !{ !0 }
 !bar = !{ !1 }

diff --git a/test/Assembler/getelementptr.ll b/test/Assembler/getelementptr.ll
index af03fca..e938ff4 100644
--- a/test/Assembler/getelementptr.ll
+++ b/test/Assembler/getelementptr.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; Verify that over-indexed getelementptrs are folded.
 @A = external global [2 x [3 x [5 x [7 x i32]]]]

diff --git a/test/Assembler/global-addrspace-forwardref.ll b/test/Assembler/global-addrspace-forwardref.ll
index f0f094a..4a036e0 100644
--- a/test/Assembler/global-addrspace-forwardref.ll
+++ b/test/Assembler/global-addrspace-forwardref.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; Make sure the address space of forward decls is preserved
 

diff --git a/test/Assembler/half-constprop.ll b/test/Assembler/half-constprop.ll
index 9e24f72..c5ae3bf 100644
--- a/test/Assembler/half-constprop.ll
+++ b/test/Assembler/half-constprop.ll

@@ -1,4 +1,5 @@
 ; RUN: opt < %s -O3 -S | FileCheck %s
+; RUN: verify-uselistorder %s
 ; Testing half constant propagation.
 
 define half @abc() nounwind {

diff --git a/test/Assembler/half-conv.ll b/test/Assembler/half-conv.ll
index 70a6b86..e6f73cf 100644
--- a/test/Assembler/half-conv.ll
+++ b/test/Assembler/half-conv.ll

@@ -1,4 +1,5 @@
 ; RUN: opt < %s -O3 -S | FileCheck %s
+; RUN: verify-uselistorder %s
 ; Testing half to float conversion.
 
 define float @abc() nounwind {

diff --git a/test/Assembler/half.ll b/test/Assembler/half.ll
index 63ad392..cbd03cb 100644
--- a/test/Assembler/half.ll
+++ b/test/Assembler/half.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 ; Basic smoke test for half type.
 
 ; CHECK: define half @halftest

diff --git a/test/Assembler/huge-array.ll b/test/Assembler/huge-array.ll
index a1abf87..6f89e83 100644
--- a/test/Assembler/huge-array.ll
+++ b/test/Assembler/huge-array.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; CHECK: define [18446744073709551615 x i8]* @foo() {
 ; CHECK: ret [18446744073709551615 x i8]* null

diff --git a/test/Assembler/inalloca.ll b/test/Assembler/inalloca.ll
index ff7a87e..a8c47b4 100644
--- a/test/Assembler/inalloca.ll
+++ b/test/Assembler/inalloca.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 define void @a() {
 entry:

diff --git a/test/Assembler/inline-asm-clobber.ll b/test/Assembler/inline-asm-clobber.ll
new file mode 100644
index 0000000..65c8e44
--- /dev/null
+++ b/test/Assembler/inline-asm-clobber.ll

@@ -0,0 +1,10 @@
+; RUN: not llvm-as <%s 2>&1  | FileCheck %s
+
+; "~x{21}" is not a valid clobber constraint.
+
+; CHECK: invalid type for inline asm constraint string
+
+define void @foo() nounwind {
+  call void asm sideeffect "mov x0, #42", "~{x0},~{x19},~x{21}"() nounwind
+  ret void
+}

diff --git a/test/Assembler/insertextractvalue.ll b/test/Assembler/insertextractvalue.ll
index 6c00b13..692843e 100644
--- a/test/Assembler/insertextractvalue.ll
+++ b/test/Assembler/insertextractvalue.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; CHECK:      @foo
 ; CHECK-NEXT: load

diff --git a/test/Assembler/internal-hidden-alias.ll b/test/Assembler/internal-hidden-alias.ll
index 660514b..df547c0 100644
--- a/test/Assembler/internal-hidden-alias.ll
+++ b/test/Assembler/internal-hidden-alias.ll

@@ -2,5 +2,5 @@
 
 @global = global i32 0
 
-@alias = hidden alias internal i32* @global
+@alias = internal hidden alias i32* @global
 ; CHECK: symbol with local linkage must have default visibility

diff --git a/test/Assembler/internal-protected-alias.ll b/test/Assembler/internal-protected-alias.ll
index d785826..46a05ec 100644
--- a/test/Assembler/internal-protected-alias.ll
+++ b/test/Assembler/internal-protected-alias.ll

@@ -2,5 +2,5 @@
 
 @global = global i32 0
 
-@alias = protected alias internal i32* @global
+@alias = internal protected alias i32* @global
 ; CHECK: symbol with local linkage must have default visibility

diff --git a/test/Assembler/invalid-uselistorder-function-between-blocks.ll b/test/Assembler/invalid-uselistorder-function-between-blocks.ll
new file mode 100644
index 0000000..8f771e8
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-function-between-blocks.ll

@@ -0,0 +1,37 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: expected uselistorder directive
+
+define i32 @f32(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  br label %first
+
+; <label 0>:
+  %eh = mul i32 %e, %1
+  %sum = add i32 %eh, %ef
+  br label %preexit
+
+preexit:
+  %product = phi i32 [%ef, %first], [%sum, %0]
+  %backto0 = icmp slt i32 %product, -9
+  br i1 %backto0, label %0, label %exit
+
+first:
+  %e = add i32 %a, 7
+  %f = add i32 %b, 7
+  %g = add i32 %c, 8
+  %1 = add i32 %d, 8
+  %ef = mul i32 %e, %f
+  %g1 = mul i32 %g, %1
+  %goto0 = icmp slt i32 %g1, -9
+  br i1 %goto0, label %0, label %preexit
+
+; uselistorder directives
+  uselistorder i32 7, { 1, 0 }
+  uselistorder i32 %1, { 1, 0 }
+  uselistorder i32 %e, { 1, 0 }
+  uselistorder label %0, { 1, 0 }
+  uselistorder label %preexit, { 1, 0 }
+
+exit:
+  ret i32 %product
+}

diff --git a/test/Assembler/invalid-uselistorder-function-missing-named.ll b/test/Assembler/invalid-uselistorder-function-missing-named.ll
new file mode 100644
index 0000000..c682fbe
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-function-missing-named.ll

@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: value has no uses
+define void @foo() {
+  unreachable
+  uselistorder i32 %val, { 1, 0 }
+}

diff --git a/test/Assembler/invalid-uselistorder-function-missing-numbered.ll b/test/Assembler/invalid-uselistorder-function-missing-numbered.ll
new file mode 100644
index 0000000..e3bf0e1
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-function-missing-numbered.ll

@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: value has no uses
+define void @foo() {
+  unreachable
+  uselistorder i32 %1, { 1, 0 }
+}

diff --git a/test/Assembler/invalid-uselistorder-global-missing.ll b/test/Assembler/invalid-uselistorder-global-missing.ll
new file mode 100644
index 0000000..92f9350
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-global-missing.ll

@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: value has no uses
+uselistorder i32* @global, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder-indexes-duplicated.ll b/test/Assembler/invalid-uselistorder-indexes-duplicated.ll
new file mode 100644
index 0000000..e4affc5
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-indexes-duplicated.ll

@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: expected distinct uselistorder indexes in range [0, size)
+@global = global i32 0
+@alias1 = alias i32* @global
+@alias2 = alias i32* @global
+@alias3 = alias i32* @global
+uselistorder i32* @global, { 0, 0, 2 }

diff --git a/test/Assembler/invalid-uselistorder-indexes-empty.ll b/test/Assembler/invalid-uselistorder-indexes-empty.ll
new file mode 100644
index 0000000..82bbc97
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-indexes-empty.ll

@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: value has no uses
+@global = global i32 0
+uselistorder i32* @global, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder-indexes-one.ll b/test/Assembler/invalid-uselistorder-indexes-one.ll
new file mode 100644
index 0000000..f5eac80
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-indexes-one.ll

@@ -0,0 +1,5 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: value only has one use
+@global = global i32 0
+@alias = alias i32* @global
+uselistorder i32* @global, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder-indexes-ordered.ll b/test/Assembler/invalid-uselistorder-indexes-ordered.ll
new file mode 100644
index 0000000..7bdc400
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-indexes-ordered.ll

@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: expected uselistorder indexes to change the order
+@global = global i32 0
+@alias1 = alias i32* @global
+@alias2 = alias i32* @global
+@alias3 = alias i32* @global
+uselistorder i32* @global, { 0, 1, 2 }

diff --git a/test/Assembler/invalid-uselistorder-indexes-range.ll b/test/Assembler/invalid-uselistorder-indexes-range.ll
new file mode 100644
index 0000000..fc97aca
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-indexes-range.ll

@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: expected distinct uselistorder indexes in range [0, size)
+@global = global i32 0
+@alias1 = alias i32* @global
+@alias2 = alias i32* @global
+@alias3 = alias i32* @global
+uselistorder i32* @global, { 0, 3, 1 }

diff --git a/test/Assembler/invalid-uselistorder-indexes-toofew.ll b/test/Assembler/invalid-uselistorder-indexes-toofew.ll
new file mode 100644
index 0000000..88a76fc
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-indexes-toofew.ll

@@ -0,0 +1,7 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: wrong number of indexes, expected 3
+@global = global i32 0
+@alias1 = alias i32* @global
+@alias2 = alias i32* @global
+@alias3 = alias i32* @global
+uselistorder i32* @global, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder-indexes-toomany.ll b/test/Assembler/invalid-uselistorder-indexes-toomany.ll
new file mode 100644
index 0000000..a2cf3da
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-indexes-toomany.ll

@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: wrong number of indexes, expected 2
+@global = global i32 0
+@alias1 = alias i32* @global
+@alias2 = alias i32* @global
+uselistorder i32* @global, { 1, 0, 2 }

diff --git a/test/Assembler/invalid-uselistorder-type.ll b/test/Assembler/invalid-uselistorder-type.ll
new file mode 100644
index 0000000..e426a7d
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder-type.ll

@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: '@global' defined with type 'i32*'
+@global = global i32 0
+uselistorder i31* @global, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder_bb-missing-bb.ll b/test/Assembler/invalid-uselistorder_bb-missing-bb.ll
new file mode 100644
index 0000000..bd12faa
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder_bb-missing-bb.ll

@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: invalid basic block in uselistorder_bb
+define void @foo() {
+  unreachable
+}
+uselistorder_bb @foo, %bb, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder_bb-missing-body.ll b/test/Assembler/invalid-uselistorder_bb-missing-body.ll
new file mode 100644
index 0000000..0fbc3a8
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder_bb-missing-body.ll

@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: invalid declaration in uselistorder_bb
+declare void @foo()
+uselistorder_bb @foo, %bb, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder_bb-missing-func.ll b/test/Assembler/invalid-uselistorder_bb-missing-func.ll
new file mode 100644
index 0000000..5a1466f
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder_bb-missing-func.ll

@@ -0,0 +1,3 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: invalid function forward reference in uselistorder_bb
+uselistorder_bb @foo, %bb, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder_bb-not-bb.ll b/test/Assembler/invalid-uselistorder_bb-not-bb.ll
new file mode 100644
index 0000000..e59e754
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder_bb-not-bb.ll

@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: expected basic block in uselistorder_bb
+define i32 @foo(i32 %arg) {
+  ret i32 %arg
+}
+uselistorder_bb @foo, %arg, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder_bb-not-func.ll b/test/Assembler/invalid-uselistorder_bb-not-func.ll
new file mode 100644
index 0000000..080ddc1
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder_bb-not-func.ll

@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: expected function name in uselistorder_bb
+@global = global i1 0
+uselistorder_bb @global, %bb, { 1, 0 }

diff --git a/test/Assembler/invalid-uselistorder_bb-numbered.ll b/test/Assembler/invalid-uselistorder_bb-numbered.ll
new file mode 100644
index 0000000..d7d170f
--- /dev/null
+++ b/test/Assembler/invalid-uselistorder_bb-numbered.ll

@@ -0,0 +1,11 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+; CHECK: error: invalid numeric label in uselistorder_bb
+
+@ba1 = constant i8* blockaddress (@foo, %1)
+
+define void @foo() {
+  br label %1
+  unreachable
+}
+
+uselistorder_bb @foo, %1, { 1, 0 }

diff --git a/test/Assembler/metadata.ll b/test/Assembler/metadata.ll
index 56888fd..f6e619d 100644
--- a/test/Assembler/metadata.ll
+++ b/test/Assembler/metadata.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; CHECK: @test
 ; CHECK: ret void, !bar !1, !foo !0

diff --git a/test/Assembler/musttail-invalid-1.ll b/test/Assembler/musttail-invalid-1.ll
new file mode 100644
index 0000000..b123a91
--- /dev/null
+++ b/test/Assembler/musttail-invalid-1.ll

@@ -0,0 +1,14 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+; Check the error message on using ", ..." when we can't actually forward
+; varargs.
+
+%struct.A = type { i32 }
+
+declare i8* @f(i8*, ...)
+
+define i8* @f_thunk(i8* %this) {
+  %rv = musttail call i8* (i8*, ...)* @f(i8* %this, ...)
+; CHECK: error: unexpected ellipsis in argument list for musttail call in non-varargs function
+  ret i8* %rv
+}

diff --git a/test/Assembler/musttail-invalid-2.ll b/test/Assembler/musttail-invalid-2.ll
new file mode 100644
index 0000000..3bcb51f
--- /dev/null
+++ b/test/Assembler/musttail-invalid-2.ll

@@ -0,0 +1,13 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+; Check the error message on skipping ", ..." at the end of a musttail call argument list.
+
+%struct.A = type { i32 }
+
+declare i8* @f(i8*, ...)
+
+define i8* @f_thunk(i8* %this, ...) {
+  %rv = musttail call i8* (i8*, ...)* @f(i8* %this)
+; CHECK: error: expected '...' at end of argument list for musttail call in varargs function
+  ret i8* %rv
+}

diff --git a/test/Assembler/musttail.ll b/test/Assembler/musttail.ll
new file mode 100644
index 0000000..6e2a9b2
--- /dev/null
+++ b/test/Assembler/musttail.ll

@@ -0,0 +1,14 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+; Check that the ellipsis round trips.
+
+%struct.A = type { i32 }
+
+declare i8* @f(i8*, ...)
+
+define i8* @f_thunk(i8* %this, ...) {
+  %rv = musttail call i8* (i8*, ...)* @f(i8* %this, ...)
+  ret i8* %rv
+}
+; CHECK-LABEL: define i8* @f_thunk(i8* %this, ...)
+; CHECK: %rv = musttail call i8* (i8*, ...)* @f(i8* %this, ...)

diff --git a/test/Assembler/named-metadata.ll b/test/Assembler/named-metadata.ll
index db72810..954c189 100644
--- a/test/Assembler/named-metadata.ll
+++ b/test/Assembler/named-metadata.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 !0 = metadata !{metadata !"zero"}
 !1 = metadata !{metadata !"one"}

diff --git a/test/Assembler/numbered-values.ll b/test/Assembler/numbered-values.ll
index 2439c83..70b6377 100644
--- a/test/Assembler/numbered-values.ll
+++ b/test/Assembler/numbered-values.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis
+; RUN: verify-uselistorder %s
 ; PR2480
 
 define i32 @test(i32 %X) nounwind {

diff --git a/test/Assembler/private-hidden-alias.ll b/test/Assembler/private-hidden-alias.ll
index 58be92a..2e770e5 100644
--- a/test/Assembler/private-hidden-alias.ll
+++ b/test/Assembler/private-hidden-alias.ll

@@ -2,5 +2,5 @@
 
 @global = global i32 0
 
-@alias = hidden alias private i32* @global
+@alias = private hidden alias i32* @global
 ; CHECK: symbol with local linkage must have default visibility

diff --git a/test/Assembler/private-protected-alias.ll b/test/Assembler/private-protected-alias.ll
index a72c248..f1824a2 100644
--- a/test/Assembler/private-protected-alias.ll
+++ b/test/Assembler/private-protected-alias.ll

@@ -2,5 +2,5 @@
 
 @global = global i32 0
 
-@alias = protected alias private i32* @global
+@alias = private protected alias i32* @global
 ; CHECK: symbol with local linkage must have default visibility

diff --git a/test/Assembler/select.ll b/test/Assembler/select.ll
index 2d3f412..fe4677a 100644
--- a/test/Assembler/select.ll
+++ b/test/Assembler/select.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o /dev/null
+; RUN: verify-uselistorder %s
 
 
 define i32 @test(i1 %C, i32 %V1, i32 %V2) {

diff --git a/test/Assembler/tls-models.ll b/test/Assembler/tls-models.ll
index 42f2496..fbc0777 100644
--- a/test/Assembler/tls-models.ll
+++ b/test/Assembler/tls-models.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; CHECK: @a = thread_local global i32 0
 ; CHECK: @b = thread_local(localdynamic) global i32 0

diff --git a/test/Assembler/unnamed-addr.ll b/test/Assembler/unnamed-addr.ll
index 35b3b39..304e544 100644
--- a/test/Assembler/unnamed-addr.ll
+++ b/test/Assembler/unnamed-addr.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 %struct.foobar = type { i32 }
 

diff --git a/test/Assembler/unnamed.ll b/test/Assembler/unnamed.ll
index fb4fa62..099a15a 100644
--- a/test/Assembler/unnamed.ll
+++ b/test/Assembler/unnamed.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis
+; RUN: verify-uselistorder %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 

diff --git a/test/Assembler/upgrade-loop-metadata.ll b/test/Assembler/upgrade-loop-metadata.ll
index f664bdf..7c5a580 100644
--- a/test/Assembler/upgrade-loop-metadata.ll
+++ b/test/Assembler/upgrade-loop-metadata.ll

@@ -5,6 +5,7 @@
 ;
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 ; RUN: opt -S < %s | FileCheck %s
+; RUN: verify-uselistorder %s
 
 define void @_Z28loop_with_vectorize_metadatav() {
 entry:
@@ -30,7 +31,7 @@
   ret void
 }
 
-; CHECK: !{metadata !"llvm.loop.vectorize.unroll", i32 4}
+; CHECK: !{metadata !"llvm.loop.interleave.count", i32 4}
 ; CHECK: !{metadata !"llvm.loop.vectorize.width", i32 8}
 ; CHECK: !{metadata !"llvm.loop.vectorize.enable", i1 true}
 

diff --git a/test/Assembler/uselistorder.ll b/test/Assembler/uselistorder.ll
new file mode 100644
index 0000000..be5ee70
--- /dev/null
+++ b/test/Assembler/uselistorder.ll

@@ -0,0 +1,56 @@
+; RUN: llvm-as < %s -disable-output 2>&1 | FileCheck %s -allow-empty
+; CHECK-NOT: error
+; CHECK-NOT: warning
+; RUN: verify-uselistorder < %s
+
+@a = global [4 x i1] [i1 0, i1 1, i1 0, i1 1]
+@b = alias i1* getelementptr ([4 x i1]* @a, i64 0, i64 2)
+
+; Check use-list order of constants used by globals.
+@glob1 = global i5 7
+@glob2 = global i5 7
+@glob3 = global i5 7
+
+define i32 @f32(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  br label %first
+
+; <label 0>:
+  %eh = mul i32 %e, %1
+  %sum = add i32 %eh, %ef
+  br label %preexit
+
+preexit:
+  %product = phi i32 [%ef, %first], [%sum, %0]
+  %backto0 = icmp slt i32 %product, -9
+  br i1 %backto0, label %0, label %exit
+
+exit:
+  ret i32 %product
+
+first:
+  %e = add i32 %a, 7
+  %f = add i32 %b, 7
+  %g = add i32 %c, 8
+  %1 = add i32 %d, 8
+  %ef = mul i32 %e, %f
+  %g1 = mul i32 %g, %1
+  %goto0 = icmp slt i32 %g1, -9
+  br i1 %goto0, label %0, label %preexit
+
+; uselistorder directives
+  uselistorder i32 7, { 1, 0 }
+  uselistorder i32 %1, { 1, 0 }
+  uselistorder i32 %e, { 1, 0 }
+  uselistorder label %0, { 1, 0 }
+  uselistorder label %preexit, { 1, 0 }
+}
+
+define i1 @loada() {
+entry:
+  %a = load i1* getelementptr ([4 x i1]* @a, i64 0, i64 2)
+  ret i1 %a
+}
+
+uselistorder i5 7, { 1, 0, 2 }
+uselistorder i1* getelementptr ([4 x i1]* @a, i64 0, i64 2), { 1, 0 }

diff --git a/test/Assembler/uselistorder_bb.ll b/test/Assembler/uselistorder_bb.ll
new file mode 100644
index 0000000..11ae57b
--- /dev/null
+++ b/test/Assembler/uselistorder_bb.ll

@@ -0,0 +1,42 @@
+; RUN: llvm-as < %s -disable-output 2>&1 | FileCheck %s -allow-empty
+; CHECK-NOT: error
+; CHECK-NOT: warning
+; RUN: verify-uselistorder < %s
+
+@ba1 = constant i8* blockaddress (@bafunc1, %bb)
+@ba2 = constant i8* getelementptr (i8* blockaddress (@bafunc2, %bb), i61 0)
+@ba3 = constant i8* getelementptr (i8* blockaddress (@bafunc2, %bb), i61 0)
+
+define i8* @babefore() {
+  ret i8* getelementptr (i8* blockaddress (@bafunc2, %bb), i61 0)
+bb1:
+  ret i8* blockaddress (@bafunc1, %bb)
+bb2:
+  ret i8* blockaddress (@bafunc3, %bb)
+}
+define void @bafunc1() {
+  br label %bb
+bb:
+  unreachable
+}
+define void @bafunc2() {
+  br label %bb
+bb:
+  unreachable
+}
+define void @bafunc3() {
+  br label %bb
+bb:
+  unreachable
+}
+define i8* @baafter() {
+  ret i8* blockaddress (@bafunc2, %bb)
+bb1:
+  ret i8* blockaddress (@bafunc1, %bb)
+bb2:
+  ret i8* blockaddress (@bafunc3, %bb)
+}
+
+uselistorder_bb @bafunc1, %bb, { 1, 0 }
+uselistorder_bb @bafunc2, %bb, { 1, 0 }
+uselistorder_bb @bafunc3, %bb, { 1, 0 }

diff --git a/test/Assembler/vbool-cmp.ll b/test/Assembler/vbool-cmp.ll
index e652d2f..6bbd5c8 100644
--- a/test/Assembler/vbool-cmp.ll
+++ b/test/Assembler/vbool-cmp.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 ; Rudimentary test of fcmp/icmp on vectors returning vector of bool
 
 ; CHECK: @ffoo

diff --git a/test/Assembler/vector-cmp.ll b/test/Assembler/vector-cmp.ll
index 6e3894c..dc55494 100644
--- a/test/Assembler/vector-cmp.ll
+++ b/test/Assembler/vector-cmp.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 ; PR2317
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin9.2.2"

diff --git a/test/Assembler/vector-select.ll b/test/Assembler/vector-select.ll
index ae8358a..59692d6 100644
--- a/test/Assembler/vector-select.ll
+++ b/test/Assembler/vector-select.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 ; Rudimentary test of select on vectors returning vector of bool
 
 ; CHECK: @foo

diff --git a/test/Assembler/vector-shift.ll b/test/Assembler/vector-shift.ll
index 6a6531b..d4351a8 100644
--- a/test/Assembler/vector-shift.ll
+++ b/test/Assembler/vector-shift.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 
 ; CHECK: @foo
 ; CHECK: shl

diff --git a/test/Assembler/x86mmx.ll b/test/Assembler/x86mmx.ll
index 732d3be..608347e 100644
--- a/test/Assembler/x86mmx.ll
+++ b/test/Assembler/x86mmx.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder %s
 ; Basic smoke test for x86_mmx type.
 
 ; CHECK: define x86_mmx @sh16

diff --git a/test/Bindings/Go/go.test b/test/Bindings/Go/go.test
new file mode 100644
index 0000000..3951483
--- /dev/null
+++ b/test/Bindings/Go/go.test

@@ -0,0 +1,3 @@
+; RUN: llvm-go test llvm.org/llvm/bindings/go/llvm
+
+; REQUIRES: shell

diff --git a/test/Bindings/Go/lit.local.cfg b/test/Bindings/Go/lit.local.cfg
new file mode 100644
index 0000000..e86595b
--- /dev/null
+++ b/test/Bindings/Go/lit.local.cfg

@@ -0,0 +1,57 @@
+import os
+import pipes
+import shlex
+import sys
+
+if not 'go' in config.root.llvm_bindings:
+    config.unsupported = True
+
+def find_executable(executable, path=None):
+    if path is None:
+        path = os.environ['PATH']
+    paths = path.split(os.pathsep)
+    base, ext = os.path.splitext(executable)
+
+    if (sys.platform == 'win32' or os.name == 'os2') and (ext != '.exe'):
+        executable = executable + '.exe'
+
+    if not os.path.isfile(executable):
+        for p in paths:
+            f = os.path.join(p, executable)
+            if os.path.isfile(f):
+                return f
+        return None
+    else:
+        return executable
+
+# Resolve certain symlinks in the first word of compiler.
+#
+# This is a Go-specific hack. cgo and other Go tools check $CC and $CXX for the
+# substring 'clang' to determine if the compiler is Clang. This won't work if
+# $CC is cc and cc is a symlink pointing to clang, as it is on Darwin.
+#
+# Go tools also have problems with ccache, so we disable it.
+def fixup_compiler_path(compiler):
+    args = shlex.split(compiler)
+    if args[0].endswith('ccache'):
+        args = args[1:]
+
+    path = find_executable(args[0])
+
+    try:
+        if path.endswith('/cc') and os.readlink(path) == 'clang':
+            args[0] = path[:len(path)-2] + 'clang'
+    except (AttributeError, OSError):
+        pass
+
+    try:
+        if path.endswith('/c++') and os.readlink(path) == 'clang++':
+            args[0] = path[:len(path)-3] + 'clang++'
+    except (AttributeError, OSError):
+        pass
+
+    return ' '.join([pipes.quote(arg) for arg in args])
+
+config.environment['CC'] = fixup_compiler_path(config.host_cc)
+config.environment['CXX'] = fixup_compiler_path(config.host_cxx)
+config.environment['CGO_LDFLAGS'] = config.host_ldflags

diff --git a/test/Bindings/OCaml/analysis.ml b/test/Bindings/OCaml/analysis.ml
new file mode 100644
index 0000000..e935ee8
--- /dev/null
+++ b/test/Bindings/OCaml/analysis.ml

@@ -0,0 +1,54 @@
+(* RUN: cp %s %T/analysis.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.analysis -linkpkg %T/analysis.ml -o %t
+ * RUN: %t
+ * RUN: %ocamlopt -g -warn-error A -package llvm.analysis -linkpkg %T/analysis.ml -o %t
+ * RUN: %t
+ * XFAIL: vg_leak
+ *)
+
+open Llvm
+open Llvm_analysis
+
+(* Note that this takes a moment to link, so it's best to keep the number of
+   individual tests low. *)
+
+let context = global_context ()
+
+let test x = if not x then exit 1 else ()
+
+let bomb msg =
+  prerr_endline msg;
+  exit 2
+
+let _ =
+  let fty = function_type (void_type context) [| |] in
+  let m = create_module context "valid_m" in
+  let fn = define_function "valid_fn" fty m in
+  let at_entry = builder_at_end context (entry_block fn) in
+  ignore (build_ret_void at_entry);
+
+
+  (* Test that valid constructs verify. *)
+  begin match verify_module m with
+    Some msg -> bomb "valid module failed verification!"
+  | None -> ()
+  end;
+
+  if not (verify_function fn) then bomb "valid function failed verification!";
+
+
+  (* Test that invalid constructs do not verify.
+     A basic block can contain only one terminator instruction. *)
+  ignore (build_ret_void at_entry);
+
+  begin match verify_module m with
+    Some msg -> ()
+  | None -> bomb "invalid module passed verification!"
+  end;
+
+  if verify_function fn then bomb "invalid function passed verification!";
+
+
+  dispose_module m
+
+  (* Don't bother to test assert_valid_{module,function}. *)

diff --git a/test/Bindings/OCaml/bitreader.ml b/test/Bindings/OCaml/bitreader.ml
new file mode 100644
index 0000000..57cfd04
--- /dev/null
+++ b/test/Bindings/OCaml/bitreader.ml

@@ -0,0 +1,79 @@
+(* RUN: cp %s %T/bitreader.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.bitreader -package llvm.bitwriter -linkpkg %T/bitreader.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: %ocamlopt -g -warn-error A -package llvm.bitreader -package llvm.bitwriter -linkpkg %T/bitreader.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: llvm-dis < %t.bc
+ * XFAIL: vg_leak
+ *)
+
+(* Note that this takes a moment to link, so it's best to keep the number of
+   individual tests low. *)
+
+let context = Llvm.global_context ()
+
+let test x = if not x then exit 1 else ()
+
+let _ =
+  let fn = Sys.argv.(1) in
+  let m = Llvm.create_module context "ocaml_test_module" in
+
+  test (Llvm_bitwriter.write_bitcode_file m fn);
+
+  Llvm.dispose_module m;
+
+  (* parse_bitcode *)
+  begin
+    let mb = Llvm.MemoryBuffer.of_file fn in
+    begin try
+      let m = Llvm_bitreader.parse_bitcode context mb in
+      Llvm.dispose_module m
+    with x ->
+      Llvm.MemoryBuffer.dispose mb;
+      raise x
+    end
+  end;
+
+  (* MemoryBuffer.of_file *)
+  test begin try
+    let mb = Llvm.MemoryBuffer.of_file (fn ^ ".bogus") in
+    Llvm.MemoryBuffer.dispose mb;
+    false
+  with Llvm.IoError _ ->
+    true
+  end;
+
+  (* get_module *)
+  begin
+    let mb = Llvm.MemoryBuffer.of_file fn in
+    let m = begin try
+      Llvm_bitreader.get_module context mb
+    with x ->
+      Llvm.MemoryBuffer.dispose mb;
+      raise x
+    end in
+    Llvm.dispose_module m
+  end;
+
+  (* corrupt the bitcode *)
+  let fn = fn ^ ".txt" in
+  begin let oc = open_out fn in
+    output_string oc "not a bitcode file\n";
+    close_out oc
+  end;
+
+  (* test get_module exceptions *)
+  test begin
+    try
+      let mb = Llvm.MemoryBuffer.of_file fn in
+      let m = begin try
+        Llvm_bitreader.get_module context mb
+      with x ->
+        Llvm.MemoryBuffer.dispose mb;
+        raise x
+      end in
+      Llvm.dispose_module m;
+      false
+    with Llvm_bitreader.Error _ ->
+      true
+  end

diff --git a/test/Bindings/OCaml/bitwriter.ml b/test/Bindings/OCaml/bitwriter.ml
new file mode 100644
index 0000000..7c803f6
--- /dev/null
+++ b/test/Bindings/OCaml/bitwriter.ml

@@ -0,0 +1,49 @@
+(* RUN: cp %s %T/bitwriter.ml
+ * RUN: %ocamlc -g -w -3 -warn-error A -package llvm.bitreader -package llvm.bitwriter -linkpkg %T/bitwriter.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: %ocamlopt -g -w -3 -warn-error A -package llvm.bitreader -package llvm.bitwriter -linkpkg %T/bitwriter.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: llvm-dis < %t.bc
+ * XFAIL: vg_leak
+ *)
+
+(* Note that this takes a moment to link, so it's best to keep the number of
+   individual tests low. *)
+
+let context = Llvm.global_context ()
+
+let test x = if not x then exit 1 else ()
+
+let read_file name =
+  let ic = open_in_bin name in
+  let len = in_channel_length ic in
+  let buf = String.create len in
+
+  test ((input ic buf 0 len) = len);
+
+  close_in ic;
+
+  buf
+
+let temp_bitcode ?unbuffered m =
+  let temp_name, temp_oc = Filename.open_temp_file ~mode:[Open_binary] "" "" in
+
+  test (Llvm_bitwriter.output_bitcode ?unbuffered temp_oc m);
+  flush temp_oc;
+
+  let temp_buf = read_file temp_name in
+
+  close_out temp_oc;
+
+  temp_buf
+
+let _ =
+  let m = Llvm.create_module context "ocaml_test_module" in
+
+  test (Llvm_bitwriter.write_bitcode_file m Sys.argv.(1));
+  let file_buf = read_file Sys.argv.(1) in
+
+  test (file_buf = temp_bitcode m);
+  test (file_buf = temp_bitcode ~unbuffered:false m);
+  test (file_buf = temp_bitcode ~unbuffered:true m);
+  test (file_buf = Llvm.MemoryBuffer.as_string (Llvm_bitwriter.write_bitcode_to_memory_buffer m))

diff --git a/test/Bindings/OCaml/core.ml b/test/Bindings/OCaml/core.ml
new file mode 100644
index 0000000..c08351e
--- /dev/null
+++ b/test/Bindings/OCaml/core.ml

@@ -0,0 +1,1507 @@
+(* RUN: cp %s %T/core.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.analysis -package llvm.bitwriter -linkpkg %T/core.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: %ocamlopt -g -warn-error A -package llvm.analysis -package llvm.bitwriter -linkpkg %T/core.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: llvm-dis < %t.bc > %t.ll
+ * RUN: FileCheck %s < %t.ll
+ * Do a second pass for things that shouldn't be anywhere.
+ * RUN: FileCheck -check-prefix=CHECK-NOWHERE %s < %t.ll
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_bitwriter
+
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let exit_status = ref 0  (* insist stores 10 here when a check fails *)
+let suite_name = ref ""  (* name given to the latest suite call *)
+let group_name = ref ""  (* set by group: suite name, a slash, group name *)
+let case_num = ref 0  (* checks counted by insist since the last group call *)
+let print_checkpoints = false  (* true would trace suites, groups and checks on stderr *)
+let context = global_context ()
+let i1_type = Llvm.i1_type context  (* the type shorthands below are all bound to context *)
+let i8_type = Llvm.i8_type context
+let i16_type = Llvm.i16_type context
+let i32_type = Llvm.i32_type context
+let i64_type = Llvm.i64_type context
+let void_type = Llvm.void_type context
+let float_type = Llvm.float_type context
+let double_type = Llvm.double_type context
+let fp128_type = Llvm.fp128_type context
+
+let group name =  (* begin a named group of checks inside the current suite *)
+  case_num := 0;
+  group_name := !suite_name ^ "/" ^ name;
+  if not print_checkpoints then () else
+    prerr_endline ("  " ^ name ^ "...")
+
+let insist cond =  (* count one check; a false cond sets exit_status to 10 and is reported *)
+  incr case_num;
+  if not cond then
+    exit_status := 10;
+  match print_checkpoints, cond with
+  | false, true -> ()
+  | false, false ->  (* group_name already starts with the suite name *)
+      prerr_endline ("FAILED: " ^ !group_name ^ " #" ^ (string_of_int !case_num))
+  | true, true ->
+      prerr_endline ("    " ^ (string_of_int !case_num))
+  | true, false ->
+      prerr_endline ("    " ^ (string_of_int !case_num) ^ " FAIL")
+
+let suite name f =  (* make name the current suite, then run its body f *)
+  let announce () = prerr_endline (name ^ ":") in
+  suite_name := name;
+  (if print_checkpoints then announce ());
+  f ()
+
+
+(*===-- Fixture -----------------------------------------------------------===*)
+
+let filename = Sys.argv.(1)  (* first command-line argument *)
+let m = create_module context filename  (* module the tests below add to; its name is filename *)
+
+
+(*===-- Conversion --------------------------------------------------------===*)
+
+let test_conversion () =  (* a type and a constant rendered as text *)
+  let printed_type = string_of_lltype i32_type in
+  insist (printed_type = "i32");
+  insist (string_of_llvalue (const_int i32_type 42) = "i32 42")
+
+
+(*===-- Target ------------------------------------------------------------===*)
+
+let test_target () =  (* the triple and data layout set on m must read back unchanged *)
+  group "triple"; begin
+    let expected = "i686-apple-darwin8" in
+    set_target_triple expected m;
+    insist (target_triple m = expected)
+  end;
+
+  group "layout"; begin
+    let expected = "e" in
+    set_data_layout expected m;
+    insist (data_layout m = expected)
+  end
+  (* CHECK: target datalayout = "e"
+   * CHECK: target triple = "i686-apple-darwin8"
+   *)
+
+
+(*===-- Constants ---------------------------------------------------------===*)
+
+let test_constants () =  (* one group per kind of constant; most are also emitted as globals of m *)
+  (* CHECK: const_int{{.*}}i32{{.*}}-1
+   *)
+  group "int";
+  let c = const_int i32_type (-1) in
+  ignore (define_global "const_int" c m);
+  insist (i32_type = type_of c);
+  insist (is_constant c);
+  insist (Some (-1L) = int64_of_const c);
+
+  (* CHECK: const_sext_int{{.*}}i64{{.*}}-1
+   *)
+  group "sext int";
+  let c = const_int i64_type (-1) in
+  ignore (define_global "const_sext_int" c m);
+  insist (i64_type = type_of c);
+  insist (Some (-1L) = int64_of_const c);
+
+  (* CHECK: const_zext_int64{{.*}}i64{{.*}}4294967295
+   *)
+  group "zext int64";
+  let c = const_of_int64 i64_type (Int64.of_string "4294967295") false in
+  ignore (define_global "const_zext_int64" c m);
+  insist (i64_type = type_of c);
+  insist (Some 4294967295L = int64_of_const c);
+
+  (* CHECK: const_int_string{{.*}}i32{{.*}}-1
+   *)
+  group "int string";
+  let c = const_int_of_string i32_type "-1" 10 in
+  ignore (define_global "const_int_string" c m);
+  insist (i32_type = type_of c);
+  insist (None = (string_of_const c));
+  insist (None = float_of_const c);
+  insist (Some (-1L) = int64_of_const c);
+
+  (* CHECK: const_int64{{.*}}i64{{.*}}9223372036854775807
+   *)
+  group "max int64";
+  let c = const_of_int64 i64_type 9223372036854775807L true in
+  ignore (define_global "const_int64" c m) ;
+  insist (i64_type = type_of c);
+  insist (Some 9223372036854775807L = int64_of_const c);
+
+  if Sys.word_size = 64 then begin  (* 1 lsl 61 does not fit a 32-bit native int *)
+    group "long int";
+    let c = const_int i64_type (1 lsl 61) in
+    insist (c = const_of_int64 i64_type (Int64.of_int (1 lsl 61)) false)
+  end;
+
+  (* CHECK: @const_string = global {{.*}}c"cruel\00world"
+   *)
+  group "string";
+  let c = const_string context "cruel\000world" in
+  ignore (define_global "const_string" c m);
+  insist ((array_type i8_type 11) = type_of c);
+  insist ((Some "cruel\000world") = (string_of_const c));
+
+  (* CHECK: const_stringz{{.*}}"hi\00again\00"
+   *)
+  group "stringz";
+  let c = const_stringz context "hi\000again" in
+  ignore (define_global "const_stringz" c m);
+  insist ((array_type i8_type 9) = type_of c);  (* 8 characters plus the terminating NUL *)
+
+  (* CHECK: const_single{{.*}}2.75
+   * CHECK: const_double{{.*}}3.1459
+   * CHECK: const_double_string{{.*}}2
+   * CHECK: const_fake_fp128{{.*}}0xL00000000000000004000000000000000
+   * CHECK: const_fp128_string{{.*}}0xLF3CB1CCF26FBC178452FB4EC7F91973F
+   *)
+  begin group "real";
+    let cs = const_float float_type 2.75 in
+    ignore (define_global "const_single" cs m);
+    insist (float_type = type_of cs);
+    insist (float_of_const cs = Some 2.75);
+
+    let cd = const_float double_type 3.1459 in
+    ignore (define_global "const_double" cd m);
+    insist (double_type = type_of cd);
+    insist (float_of_const cd = Some 3.1459);
+
+    let cd = const_float_of_string double_type "2" in
+    ignore (define_global "const_double_string" cd m);
+    insist (double_type = type_of cd);
+    insist (float_of_const cd = Some 2.);
+
+    let cd = const_float fp128_type 2. in
+    ignore (define_global "const_fake_fp128" cd m);
+    insist (fp128_type = type_of cd);
+    insist (float_of_const cd = Some 2.);
+
+    let cd = const_float_of_string fp128_type "1e400" in
+    ignore (define_global "const_fp128_string" cd m);
+    insist (fp128_type = type_of cd);
+    insist (float_of_const cd = None);  (* 1e400 is beyond the range of an OCaml float *)
+  end;
+
+  let one = const_int i16_type 1 in
+  let two = const_int i16_type 2 in
+  let three = const_int i32_type 3 in
+  let four = const_int i32_type 4 in
+
+  (* CHECK: const_array{{.*}}[i32 3, i32 4]
+   *)
+  group "array";
+  let c = const_array i32_type [| three; four |] in
+  ignore (define_global "const_array" c m);
+  insist ((array_type i32_type 2) = (type_of c));
+  insist (three = (const_element c 0));
+  insist (four = (const_element c 1));
+
+  (* CHECK: const_vector{{.*}}<i16 1, i16 2{{.*}}>
+   *)
+  group "vector";
+  let c = const_vector [| one; two; one; two;
+                          one; two; one; two |] in
+  ignore (define_global "const_vector" c m);
+  insist ((vector_type i16_type 8) = (type_of c));
+
+  (* CHECK: const_structure{{.*.}}i16 1, i16 2, i32 3, i32 4
+   *)
+  group "structure";
+  let c = const_struct context [| one; two; three; four |] in
+  ignore (define_global "const_structure" c m);
+  insist ((struct_type context [| i16_type; i16_type; i32_type; i32_type |])
+        = (type_of c));
+
+  (* CHECK: const_null{{.*}}zeroinit
+   *)
+  group "null";
+  let c = const_null (packed_struct_type context [| i1_type; i8_type; i64_type;
+                                                    double_type |]) in
+  ignore (define_global "const_null" c m);
+
+  (* CHECK: const_all_ones{{.*}}-1
+   *)
+  group "all ones";
+  let c = const_all_ones i64_type in
+  ignore (define_global "const_all_ones" c m);
+
+  group "pointer null"; begin
+    (* CHECK: const_pointer_null = global i64* null
+     *)
+    let c = const_pointer_null (pointer_type i64_type) in
+    ignore (define_global "const_pointer_null" c m);
+  end;
+
+  (* CHECK: const_undef{{.*}}undef
+   *)
+  group "undef";
+  let c = undef i1_type in
+  ignore (define_global "const_undef" c m);
+  insist (i1_type = type_of c);
+  insist (is_undef c);
+
+  group "constant arithmetic";
+  (* CHECK: @const_neg = global i64 sub
+   * CHECK: @const_nsw_neg = global i64 sub nsw
+   * CHECK: @const_nuw_neg = global i64 sub nuw
+   * CHECK: @const_fneg = global double fsub
+   * CHECK: @const_not = global i64 xor
+   * CHECK: @const_add = global i64 add
+   * CHECK: @const_nsw_add = global i64 add nsw
+   * CHECK: @const_nuw_add = global i64 add nuw
+   * CHECK: @const_fadd = global double fadd
+   * CHECK: @const_sub = global i64 sub
+   * CHECK: @const_nsw_sub = global i64 sub nsw
+   * CHECK: @const_nuw_sub = global i64 sub nuw
+   * CHECK: @const_fsub = global double fsub
+   * CHECK: @const_mul = global i64 mul
+   * CHECK: @const_nsw_mul = global i64 mul nsw
+   * CHECK: @const_nuw_mul = global i64 mul nuw
+   * CHECK: @const_fmul = global double fmul
+   * CHECK: @const_udiv = global i64 udiv
+   * CHECK: @const_sdiv = global i64 sdiv
+   * CHECK: @const_exact_sdiv = global i64 sdiv exact
+   * CHECK: @const_fdiv = global double fdiv
+   * CHECK: @const_urem = global i64 urem
+   * CHECK: @const_srem = global i64 srem
+   * CHECK: @const_frem = global double frem
+   * CHECK: @const_and = global i64 and
+   * CHECK: @const_or = global i64 or
+   * CHECK: @const_xor = global i64 xor
+   * CHECK: @const_icmp = global i1 icmp sle
+   * CHECK: @const_fcmp = global i1 fcmp ole
+   *)
+  let void_ptr = pointer_type i8_type in
+  let five = const_int i64_type 5 in
+  let ffive = const_uitofp five double_type in
+  let foldbomb_gv = define_global "FoldBomb" (const_null i8_type) m in
+  let foldbomb = const_ptrtoint foldbomb_gv i64_type in  (* presumably not foldable, hence the name - TODO confirm *)
+  let ffoldbomb = const_uitofp foldbomb double_type in
+  ignore (define_global "const_neg" (const_neg foldbomb) m);
+  ignore (define_global "const_nsw_neg" (const_nsw_neg foldbomb) m);
+  ignore (define_global "const_nuw_neg" (const_nuw_neg foldbomb) m);
+  ignore (define_global "const_fneg" (const_fneg ffoldbomb) m);
+  ignore (define_global "const_not" (const_not foldbomb) m);
+  ignore (define_global "const_add" (const_add foldbomb five) m);
+  ignore (define_global "const_nsw_add" (const_nsw_add foldbomb five) m);
+  ignore (define_global "const_nuw_add" (const_nuw_add foldbomb five) m);
+  ignore (define_global "const_fadd" (const_fadd ffoldbomb ffive) m);
+  ignore (define_global "const_sub" (const_sub foldbomb five) m);
+  ignore (define_global "const_nsw_sub" (const_nsw_sub foldbomb five) m);
+  ignore (define_global "const_nuw_sub" (const_nuw_sub foldbomb five) m);
+  ignore (define_global "const_fsub" (const_fsub ffoldbomb ffive) m);
+  ignore (define_global "const_mul" (const_mul foldbomb five) m);
+  ignore (define_global "const_nsw_mul" (const_nsw_mul foldbomb five) m);
+  ignore (define_global "const_nuw_mul" (const_nuw_mul foldbomb five) m);
+  ignore (define_global "const_fmul" (const_fmul ffoldbomb ffive) m);
+  ignore (define_global "const_udiv" (const_udiv foldbomb five) m);
+  ignore (define_global "const_sdiv" (const_sdiv foldbomb five) m);
+  ignore (define_global "const_exact_sdiv" (const_exact_sdiv foldbomb five) m);
+  ignore (define_global "const_fdiv" (const_fdiv ffoldbomb ffive) m);
+  ignore (define_global "const_urem" (const_urem foldbomb five) m);
+  ignore (define_global "const_srem" (const_srem foldbomb five) m);
+  ignore (define_global "const_frem" (const_frem ffoldbomb ffive) m);
+  ignore (define_global "const_and" (const_and foldbomb five) m);
+  ignore (define_global "const_or" (const_or foldbomb five) m);
+  ignore (define_global "const_xor" (const_xor foldbomb five) m);
+  ignore (define_global "const_icmp" (const_icmp Icmp.Sle foldbomb five) m);
+  ignore (define_global "const_fcmp" (const_fcmp Fcmp.Ole ffoldbomb ffive) m);
+
+  group "constant casts";
+  (* CHECK: const_trunc{{.*}}trunc
+   * CHECK: const_sext{{.*}}sext
+   * CHECK: const_zext{{.*}}zext
+   * CHECK: const_fptrunc{{.*}}fptrunc
+   * CHECK: const_fpext{{.*}}fpext
+   * CHECK: const_uitofp{{.*}}uitofp
+   * CHECK: const_sitofp{{.*}}sitofp
+   * CHECK: const_fptoui{{.*}}fptoui
+   * CHECK: const_fptosi{{.*}}fptosi
+   * CHECK: const_ptrtoint{{.*}}ptrtoint
+   * CHECK: const_inttoptr{{.*}}inttoptr
+   * CHECK: const_bitcast{{.*}}bitcast
+   * CHECK: const_intcast{{.*}}zext
+   *)
+  let i128_type = integer_type context 128 in
+  ignore (define_global "const_trunc" (const_trunc (const_add foldbomb five)
+                                               i8_type) m);
+  ignore (define_global "const_sext" (const_sext foldbomb i128_type) m);
+  ignore (define_global "const_zext" (const_zext foldbomb i128_type) m);
+  ignore (define_global "const_fptrunc" (const_fptrunc ffoldbomb float_type) m);
+  ignore (define_global "const_fpext" (const_fpext ffoldbomb fp128_type) m);
+  ignore (define_global "const_uitofp" (const_uitofp foldbomb double_type) m);
+  ignore (define_global "const_sitofp" (const_sitofp foldbomb double_type) m);
+  ignore (define_global "const_fptoui" (const_fptoui ffoldbomb i32_type) m);
+  ignore (define_global "const_fptosi" (const_fptosi ffoldbomb i32_type) m);
+  ignore (define_global "const_ptrtoint" (const_ptrtoint
+    (const_gep (const_null (pointer_type i8_type))
+               [| const_int i32_type 1 |])
+    i32_type) m);
+  ignore (define_global "const_inttoptr" (const_inttoptr (const_add foldbomb five)
+                                                  void_ptr) m);
+  ignore (define_global "const_bitcast" (const_bitcast ffoldbomb i64_type) m);
+  ignore (define_global "const_intcast"
+          (const_intcast foldbomb i128_type ~is_signed:false) m);
+
+  group "misc constants";
+  (* CHECK: const_size_of{{.*}}getelementptr{{.*}}null
+   * CHECK: const_gep{{.*}}getelementptr
+   * CHECK: const_select{{.*}}select
+   * CHECK: const_extractelement{{.*}}extractelement
+   * CHECK: const_insertelement{{.*}}insertelement
+   * CHECK: const_shufflevector = global <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+   *)
+  ignore (define_global "const_size_of" (size_of (pointer_type i8_type)) m);
+  ignore (define_global "const_gep" (const_gep foldbomb_gv [| five |]) m);
+  ignore (define_global "const_select" (const_select
+    (const_icmp Icmp.Sle foldbomb five)
+    (const_int i8_type (-1))
+    (const_int i8_type 0)) m);
+  let zero = const_int i32_type 0 in
+  let one  = const_int i32_type 1 in
+  ignore (define_global "const_extractelement" (const_extractelement
+    (const_vector [| zero; one; zero; one |])
+    (const_trunc foldbomb i32_type)) m);
+  ignore (define_global "const_insertelement" (const_insertelement
+    (const_vector [| zero; one; zero; one |])
+    zero (const_trunc foldbomb i32_type)) m);
+  ignore (define_global "const_shufflevector" (const_shufflevector
+    (const_vector [| zero; one |])
+    (const_vector [| one; zero |])
+    (const_vector [| const_int i32_type 0; const_int i32_type 1;
+                     const_int i32_type 2; const_int i32_type 3 |])) m);
+
+  group "asm"; begin
+    let ft = function_type void_type [| i32_type; i32_type; i32_type |] in
+    ignore (const_inline_asm
+      ft
+      ""
+      "{cx},{ax},{di},~{dirflag},~{fpsr},~{flags},~{edi},~{ecx}"
+      true
+      false)
+  end;
+
+  group "recursive struct"; begin
+      let nsty = named_struct_type context "rec" in
+      let pty = pointer_type nsty in
+      struct_set_body nsty [| i32_type; pty |] false;  (* the second field points back at the struct itself *)
+      let elts = [| const_int i32_type 4; const_pointer_null pty |] in
+      let grec_init = const_named_struct nsty elts in
+      ignore (define_global "grec" grec_init m);
+      ignore (string_of_lltype nsty);
+  end
+
+
+(*===-- Global Values -----------------------------------------------------===*)
+
+let test_global_values () =  (* set one property per global and read it back *)
+  let (++) x f = f x; x in  (* apply f to x for its side effect, then yield x *)
+  let zero32 = const_null i32_type in
+
+  (* CHECK: GVal01
+   *)
+  group "naming";
+  let g = define_global "TEMPORARY" zero32 m in
+  insist ("TEMPORARY" = value_name g);
+  set_value_name "GVal01" g;
+  insist ("GVal01" = value_name g);
+
+  (* CHECK: GVal02{{.*}}linkonce
+   *)
+  group "linkage";
+  let g = define_global "GVal02" zero32 m ++
+          set_linkage Linkage.Link_once in
+  insist (Linkage.Link_once = linkage g);
+
+  (* CHECK: GVal03{{.*}}Hanalei
+   *)
+  group "section";
+  let g = define_global "GVal03" zero32 m ++
+          set_section "Hanalei" in
+  insist ("Hanalei" = section g);
+
+  (* CHECK: GVal04{{.*}}hidden
+   *)
+  group "visibility";
+  let g = define_global "GVal04" zero32 m ++
+          set_visibility Visibility.Hidden in
+  insist (Visibility.Hidden = visibility g);
+
+  (* CHECK: GVal05{{.*}}align 128
+   *)
+  group "alignment";
+  let g = define_global "GVal05" zero32 m ++
+          set_alignment 128 in
+  insist (128 = alignment g);
+
+  (* CHECK: GVal06{{.*}}dllexport
+   *)
+  group "dll_storage_class";
+  let g = define_global "GVal06" zero32 m ++
+          set_dll_storage_class DLLStorageClass.DLLExport in
+  insist (DLLStorageClass.DLLExport = dll_storage_class g)
+
+
+(*===-- Global Variables --------------------------------------------------===*)
+
+let test_global_variables () =  (* declaring, defining, flagging, deleting and iterating over globals *)
+  let (++) x f = f x; x in  (* apply f to x for its side effect, then yield x *)
+  let forty_two32 = const_int i32_type 42 in
+
+  group "declarations"; begin
+    (* CHECK: @GVar01 = external global i32
+     * CHECK: @QGVar01 = external addrspace(3) global i32
+     *)
+    insist (None == lookup_global "GVar01" m);
+    let g = declare_global i32_type "GVar01" m in
+    insist (is_declaration g);
+    insist (pointer_type float_type ==
+              type_of (declare_global float_type "GVar01" m));
+    insist (g == declare_global i32_type "GVar01" m);  (* same name and type: expected to be the same value *)
+    insist (match lookup_global "GVar01" m with Some x -> x = g
+                                              | None -> false);
+
+    insist (None == lookup_global "QGVar01" m);
+    let g = declare_qualified_global i32_type "QGVar01" 3 m in
+    insist (is_declaration g);
+    insist (qualified_pointer_type float_type 3 ==
+              type_of (declare_qualified_global float_type "QGVar01" 3 m));
+    insist (g == declare_qualified_global i32_type "QGVar01" 3 m);
+    insist (match lookup_global "QGVar01" m with Some x -> x = g
+                                              | None -> false);
+  end;
+
+  group "definitions"; begin
+    (* CHECK: @GVar02 = global i32 42
+     * CHECK: @GVar03 = global i32 42
+     * CHECK: @QGVar02 = addrspace(3) global i32 42
+     * CHECK: @QGVar03 = addrspace(3) global i32 42
+     *)
+    let g = define_global "GVar02" forty_two32 m in
+    let g2 = declare_global i32_type "GVar03" m ++
+           set_initializer forty_two32 in  (* a declaration becomes a definition once initialized *)
+    insist (not (is_declaration g));
+    insist (not (is_declaration g2));
+    insist ((global_initializer g) == (global_initializer g2));
+
+    let g = define_qualified_global "QGVar02" forty_two32 3 m in
+    let g2 = declare_qualified_global i32_type "QGVar03" 3 m ++
+           set_initializer forty_two32 in
+    insist (not (is_declaration g));
+    insist (not (is_declaration g2));
+    insist ((global_initializer g) == (global_initializer g2));
+  end;
+
+  (* CHECK: GVar04{{.*}}thread_local
+   *)
+  group "threadlocal";
+  let g = define_global "GVar04" forty_two32 m ++
+          set_thread_local true in
+  insist (is_thread_local g);
+
+  (* CHECK: GVar05{{.*}}thread_local(initialexec)
+   *)
+  group "threadlocal_mode";
+  let g = define_global "GVar05" forty_two32 m ++
+          set_thread_local_mode ThreadLocalMode.InitialExec in
+  insist ((thread_local_mode g) = ThreadLocalMode.InitialExec);
+
+  (* CHECK: GVar06{{.*}}externally_initialized
+   *)
+  group "externally_initialized";
+  let g = define_global "GVar06" forty_two32 m ++
+          set_externally_initialized true in
+  insist (is_externally_initialized g);
+
+  (* CHECK-NOWHERE-NOT: GVar07
+   *)
+  group "delete";
+  let g = define_global "GVar07" forty_two32 m in
+  delete_global g;
+
+  (* CHECK: ConstGlobalVar{{.*}}constant
+   *)
+  group "constant";
+  let g = define_global "ConstGlobalVar" forty_two32 m in
+  insist (not (is_global_constant g));
+  set_global_constant true g;
+  insist (is_global_constant g);
+
+  begin group "iteration";
+    let m = create_module context "temp" in  (* scratch module; shadows the shared m until disposed *)
+
+    insist (At_end m = global_begin m);
+    insist (At_start m = global_end m);
+
+    let g1 = declare_global i32_type "One" m in
+    let g2 = declare_global i32_type "Two" m in
+
+    insist (Before g1 = global_begin m);
+    insist (Before g2 = global_succ g1);
+    insist (At_end m = global_succ g2);
+
+    insist (After g2 = global_end m);
+    insist (After g1 = global_pred g2);
+    insist (At_start m = global_pred g1);
+
+    let lf s x = s ^ "->" ^ value_name x in
+    insist ("->One->Two" = fold_left_globals lf "" m);
+
+    let rf x s = value_name x ^ "<-" ^ s in
+    insist ("One<-Two<-" = fold_right_globals rf m "");
+
+    dispose_module m
+  end
+
+(* String globals built below are emitted here.
+ * CHECK: build_global_string{{.*}}stringval
+ *)
+
+
+(*===-- Uses --------------------------------------------------------------===*)
+
+let test_uses () =  (* folds over the uses of v1, which feeds both v2 and v3 *)
+  let ty = function_type i32_type [| i32_type; i32_type |] in
+  let fn = define_function "use_function" ty m in
+  let b = builder_at_end context (entry_block fn) in
+
+  let p1 = param fn 0 in
+  let p2 = param fn 1 in
+  let v1 = build_add p1 p2 "v1" b in
+  let v2 = build_add p1 v1 "v2" b in
+  let _ = build_add v1 v2 "v3" b in
+
+  let lf s u = value_name (user u) ^ "->" ^ s in  (* prepends, so the expected text lists the first-visited user last *)
+  insist ("v2->v3->" = fold_left_uses lf "" v1);
+  let rf u s = value_name (user u) ^ "<-" ^ s in
+  insist ("v3<-v2<-" = fold_right_uses rf v1 "");
+
+  let lf s u = value_name (used_value u) ^ "->" ^ s in  (* every use of v1 must name v1 as the used value *)
+  insist ("v1->v1->" = fold_left_uses lf "" v1);
+
+  let rf u s = value_name (used_value u) ^ "<-" ^ s in
+  insist ("v1<-v1<-" = fold_right_uses rf v1 "");
+
+  ignore (build_unreachable b)
+
+
+(*===-- Users -------------------------------------------------------------===*)
+
+let test_users () =  (* reading and replacing the operands of an add instruction *)
+  let ty = function_type i32_type [| i32_type; i32_type |] in
+  let fn = define_function "user_function" ty m in
+  let b = builder_at_end context (entry_block fn) in
+
+  let p1 = param fn 0 in
+  let p2 = param fn 1 in
+  let a3 = build_alloca i32_type "user_alloca" b in
+  let p3 = build_load a3 "user_load" b in
+  let i = build_add p1 p2 "sum" b in
+
+  insist ((num_operands i) = 2);
+  insist ((operand i 0) = p1);
+  insist ((operand i 1) = p2);
+
+  set_operand i 1 p3;  (* swap the second operand for the loaded value *)
+  insist ((operand i 1) != p2);
+  insist ((operand i 1) = p3);
+
+  ignore (build_unreachable b)
+
+
+(*===-- Aliases -----------------------------------------------------------===*)
+
+let test_aliases () =  (* define a global, then add an alias that refers to it *)
+  (* CHECK: @alias = alias i32* @aliasee
+   *)
+  let init = const_int i32_type 42 in
+  let aliasee = define_global "aliasee" init m in
+  let _ = add_alias m (pointer_type i32_type) aliasee "alias" in ()
+
+
+(*===-- Functions ---------------------------------------------------------===*)
+
+let test_functions () =  (* declaring, deleting, defining, configuring and iterating over functions *)
+  let ty = function_type i32_type [| i32_type; i64_type |] in
+  let ty2 = function_type i8_type [| i8_type; i64_type |] in
+
+  (* CHECK: declare i32 @Fn1(i32, i64)
+   *)
+  begin group "declare";
+    insist (None = lookup_function "Fn1" m);
+    let fn = declare_function "Fn1" ty m in
+    insist (pointer_type ty = type_of fn);
+    insist (is_declaration fn);
+    insist (0 = Array.length (basic_blocks fn));
+    insist (pointer_type ty2 == type_of (declare_function "Fn1" ty2 m));
+    insist (fn == declare_function "Fn1" ty m);  (* same name and type: expected to be the same value *)
+    insist (None <> lookup_function "Fn1" m);
+    insist (match lookup_function "Fn1" m with Some x -> x = fn
+                                             | None -> false);
+    insist (m == global_parent fn)
+  end;
+
+  (* CHECK-NOWHERE-NOT: Fn2
+   *)
+  group "delete";
+  let fn = declare_function "Fn2" ty m in
+  delete_function fn;
+
+  (* CHECK: define{{.*}}Fn3
+   *)
+  group "define";
+  let fn = define_function "Fn3" ty m in
+  insist (not (is_declaration fn));
+  insist (1 = Array.length (basic_blocks fn));  (* a defined function starts out with one block *)
+  ignore (build_unreachable (builder_at_end context (entry_block fn)));
+
+  (* CHECK: define{{.*}}Fn4{{.*}}Param1{{.*}}Param2
+   *)
+  group "params";
+  let fn = define_function "Fn4" ty m in
+  let params = params fn in
+  insist (2 = Array.length params);
+  insist (params.(0) = param fn 0);
+  insist (params.(1) = param fn 1);
+  insist (i32_type = type_of params.(0));
+  insist (i64_type = type_of params.(1));
+  set_value_name "Param1" params.(0);
+  set_value_name "Param2" params.(1);
+  ignore (build_unreachable (builder_at_end context (entry_block fn)));
+
+  (* CHECK: fastcc{{.*}}Fn5
+   *)
+  group "callconv";
+  let fn = define_function "Fn5" ty m in
+  insist (CallConv.c = function_call_conv fn);
+  set_function_call_conv CallConv.fast fn;
+  insist (CallConv.fast = function_call_conv fn);
+  ignore (build_unreachable (builder_at_end context (entry_block fn)));
+
+  begin group "gc";
+    (* CHECK: Fn6{{.*}}gc{{.*}}shadowstack
+     *)
+    let fn = define_function "Fn6" ty m in
+    insist (None = gc fn);
+    set_gc (Some "ocaml") fn;
+    insist (Some "ocaml" = gc fn);
+    set_gc None fn;
+    insist (None = gc fn);
+    set_gc (Some "shadowstack") fn;  (* the collector name left in place for the printed module *)
+    ignore (build_unreachable (builder_at_end context (entry_block fn)));
+  end;
+
+  begin group "iteration";
+    let m = create_module context "temp" in  (* scratch module; shadows the shared m until disposed *)
+
+    insist (At_end m = function_begin m);
+    insist (At_start m = function_end m);
+
+    let f1 = define_function "One" ty m in
+    let f2 = define_function "Two" ty m in
+
+    insist (Before f1 = function_begin m);
+    insist (Before f2 = function_succ f1);
+    insist (At_end m = function_succ f2);
+
+    insist (After f2 = function_end m);
+    insist (After f1 = function_pred f2);
+    insist (At_start m = function_pred f1);
+
+    let lf s x = s ^ "->" ^ value_name x in
+    insist ("->One->Two" = fold_left_functions lf "" m);
+
+    let rf x s = value_name x ^ "<-" ^ s in
+    insist ("One<-Two<-" = fold_right_functions rf m "");
+
+    dispose_module m
+  end
+
+
+(*===-- Params ------------------------------------------------------------===*)
+
+let test_params () =  (* parameter iteration and attribute calls, all inside a scratch module *)
+  begin group "iteration";
+    let m = create_module context "temp" in
+
+    let vf = define_function "void" (function_type void_type [| |]) m in
+
+    insist (At_end vf = param_begin vf);  (* vf takes no parameters, so iteration is empty *)
+    insist (At_start vf = param_end vf);
+
+    let ty = function_type void_type [| i32_type; i32_type |] in
+    let f = define_function "f" ty m in
+    let p1 = param f 0 in
+    let p2 = param f 1 in
+    set_value_name "One" p1;
+    set_value_name "Two" p2;
+    add_param_attr p1 Attribute.Sext;  (* the attribute calls below are exercised only; nothing reads them back *)
+    add_param_attr p2 Attribute.Noalias;
+    remove_param_attr p2 Attribute.Noalias;
+    add_function_attr f Attribute.Nounwind;
+    add_function_attr f Attribute.Noreturn;
+    remove_function_attr f Attribute.Noreturn;
+
+    insist (Before p1 = param_begin f);
+    insist (Before p2 = param_succ p1);
+    insist (At_end f = param_succ p2);
+
+    insist (After p2 = param_end f);
+    insist (After p1 = param_pred p2);
+    insist (At_start f = param_pred p1);
+
+    let lf s x = s ^ "->" ^ value_name x in
+    insist ("->One->Two" = fold_left_params lf "" f);
+
+    let rf x s = value_name x ^ "<-" ^ s in
+    insist ("One<-Two<-" = fold_right_params rf f "");
+
+    dispose_module m
+  end
+
+
+(*===-- Basic Blocks ------------------------------------------------------===*)
+
+let test_basic_blocks () =  (* appending, deleting, inserting, naming, casting and iterating over blocks *)
+  let ty = function_type void_type [| |] in
+
+  (* CHECK: Bb1
+   *)
+  group "entry";
+  let fn = declare_function "X" ty m in
+  let bb = append_block context "Bb1" fn in
+  insist (bb = entry_block fn);
+  ignore (build_unreachable (builder_at_end context bb));
+
+  (* CHECK-NOWHERE-NOT: Bb2
+   *)
+  group "delete";
+  let fn = declare_function "X2" ty m in
+  let bb = append_block context "Bb2" fn in
+  delete_block bb;
+
+  group "insert";
+  let fn = declare_function "X3" ty m in
+  let bbb = append_block context "b" fn in
+  let bba = insert_block context "a" bbb in
+  insist ([| bba; bbb |] = basic_blocks fn);  (* the inserted block is expected ahead of bbb *)
+  ignore (build_unreachable (builder_at_end context bba));
+  ignore (build_unreachable (builder_at_end context bbb));
+
+  (* CHECK: Bb3
+   *)
+  group "name/value";
+  let fn = define_function "X4" ty m in
+  let bb = entry_block fn in
+  ignore (build_unreachable (builder_at_end context bb));
+  let bbv = value_of_block bb in
+  set_value_name "Bb3" bbv;
+  insist ("Bb3" = value_name bbv);
+
+  group "casts";
+  let fn = define_function "X5" ty m in
+  let bb = entry_block fn in
+  ignore (build_unreachable (builder_at_end context bb));
+  insist (bb = block_of_value (value_of_block bb));  (* block to value and back must round-trip *)
+  insist (value_is_block (value_of_block bb));
+  insist (not (value_is_block (const_null i32_type)));
+
+  begin group "iteration";
+    let m = create_module context "temp" in  (* scratch module; shadows the shared m until disposed *)
+    let f = declare_function "Temp" (function_type i32_type [| |]) m in
+
+    insist (At_end f = block_begin f);
+    insist (At_start f = block_end f);
+
+    let b1 = append_block context "One" f in
+    let b2 = append_block context "Two" f in
+
+    insist (Before b1 = block_begin f);
+    insist (Before b2 = block_succ b1);
+    insist (At_end f = block_succ b2);
+
+    insist (After b2 = block_end f);
+    insist (After b1 = block_pred b2);
+    insist (At_start f = block_pred b1);
+
+    let lf s x = s ^ "->" ^ value_name (value_of_block x) in
+    insist ("->One->Two" = fold_left_blocks lf "" f);
+
+    let rf x s = value_name (value_of_block x) ^ "<-" ^ s in
+    insist ("One<-Two<-" = fold_right_blocks rf f "");
+
+    dispose_module m
+  end
+
+
+(*===-- Instructions ------------------------------------------------------===*)
+
+let test_instructions () =  (* instruction iteration in a scratch module, then cloning into the shared m *)
+  begin group "iteration";
+    let m = create_module context "temp" in
+    let fty = function_type void_type [| i32_type; i32_type |] in
+    let f = define_function "f" fty m in
+    let bb = entry_block f in
+    let b = builder_at context (At_end bb) in
+
+    insist (At_end bb = instr_begin bb);  (* the entry block holds no instructions yet *)
+    insist (At_start bb = instr_end bb);
+
+    let i1 = build_add (param f 0) (param f 1) "One" b in
+    let i2 = build_sub (param f 0) (param f 1) "Two" b in
+
+    insist (Before i1 = instr_begin bb);
+    insist (Before i2 = instr_succ i1);
+    insist (At_end bb = instr_succ i2);
+
+    insist (After i2 = instr_end bb);
+    insist (After i1 = instr_pred i2);
+    insist (At_start bb = instr_pred i1);
+
+    let lf s x = s ^ "->" ^ value_name x in
+    insist ("->One->Two" = fold_left_instrs lf "" bb);
+
+    let rf x s = value_name x ^ "<-" ^ s in
+    insist ("One<-Two<-" = fold_right_instrs rf bb "");
+
+    dispose_module m
+  end;
+
+  group "clone instr";
+  begin
+    (* CHECK: %clone = add i32 %0, 2
+     *)
+    let fty = function_type void_type [| i32_type |] in
+    let fn = define_function "BuilderParent" fty m in
+    let bb = entry_block fn in
+    let b = builder_at_end context bb in
+    let p = param fn 0 in
+    let sum = build_add p p "sum" b in
+    let y = const_int i32_type 2 in
+    let clone = instr_clone sum in  (* the clone is not in any block until insert_into_builder below *)
+    set_operand clone 0 p;
+    set_operand clone 1 y;
+    insert_into_builder clone "clone" b;
+    ignore (build_ret_void b)
+  end
+
+
+(*===-- Builder -----------------------------------------------------------===*)
+
+(* Exercises the instruction builder bindings against the shared module m.
+   The first groups build small standalone functions; the remaining groups all
+   emit into one function named X7, so the order of the groups determines the
+   order of the emitted IR. The pattern comments inside each group describe the
+   IR expected in the printed module (presumably matched by FileCheck - confirm
+   against the lit header of this file). *)
+let test_builder () =
+  (* x ++ f applies f to x for its side effect and evaluates to x. *)
+  let (++) x f = f x; x in
+
+  begin group "parent";
+    (* A builder that has not been positioned has no insertion block. *)
+    insist (try
+              ignore (insertion_block (builder context));
+              false
+            with Not_found ->
+              true);
+
+    let fty = function_type void_type [| i32_type |] in
+    let fn = define_function "BuilderParent" fty m in
+    let bb = entry_block fn in
+    let b = builder_at_end context bb in
+    let p = param fn 0 in
+    let sum = build_add p p "sum" b in
+    ignore (build_ret_void b);
+
+    insist (fn = block_parent bb);
+    insist (fn = param_parent p);
+    insist (bb = instr_parent sum);
+    insist (bb = insertion_block b)
+  end;
+
+  group "ret void";
+  begin
+    (* CHECK: ret void
+     *)
+    let fty = function_type void_type [| |] in
+    let fn = declare_function "X6" fty m in
+    let b = builder_at_end context (append_block context "Bb01" fn) in
+    ignore (build_ret_void b)
+  end;
+
+  group "ret aggregate";
+  begin
+      (* CHECK: ret { i8, i64 } { i8 4, i64 5 }
+       *)
+      let sty = struct_type context [| i8_type; i64_type |] in
+      let fty = function_type sty [| |] in
+      let fn = declare_function "XA6" fty m in
+      let b = builder_at_end context (append_block context "Bb01" fn) in
+      let agg = [| const_int i8_type 4; const_int i64_type 5 |] in
+      ignore (build_aggregate_ret agg b)
+  end;
+
+  (* The rest of the tests will use one big function. *)
+  let fty = function_type i32_type [| i32_type; i32_type |] in
+  let fn = define_function "X7" fty m in
+  let atentry = builder_at_end context (entry_block fn) in
+  let p1 = param fn 0 ++ set_value_name "P1" in
+  let p2 = param fn 1 ++ set_value_name "P2" in
+  let f1 = build_uitofp p1 float_type "F1" atentry in
+  let f2 = build_uitofp p2 float_type "F2" atentry in
+
+  (* Bb00 only holds an unreachable; it is reused below as a branch target. *)
+  let bb00 = append_block context "Bb00" fn in
+  ignore (build_unreachable (builder_at_end context bb00));
+
+  group "function attribute";
+  begin
+      ignore (add_function_attr fn Attribute.UWTable);
+      (* CHECK: X7{{.*}}#0
+       * #0 is uwtable, defined at EOF.
+       *)
+      insist ([Attribute.UWTable] = function_attr fn);
+  end;
+
+  group "casts"; begin
+    let void_ptr = pointer_type i8_type in
+
+    (* CHECK-DAG: %build_trunc = trunc i32 %P1 to i8
+     * CHECK-DAG: %build_trunc2 = trunc i32 %P1 to i8
+     * CHECK-DAG: %build_trunc3 = trunc i32 %P1 to i8
+     * CHECK-DAG: %build_zext = zext i8 %build_trunc to i32
+     * CHECK-DAG: %build_zext2 = zext i8 %build_trunc to i32
+     * CHECK-DAG: %build_sext = sext i32 %build_zext to i64
+     * CHECK-DAG: %build_sext2 = sext i32 %build_zext to i64
+     * CHECK-DAG: %build_sext3 = sext i32 %build_zext to i64
+     * CHECK-DAG: %build_uitofp = uitofp i64 %build_sext to float
+     * CHECK-DAG: %build_sitofp = sitofp i32 %build_zext to double
+     * CHECK-DAG: %build_fptoui = fptoui float %build_uitofp to i32
+     * CHECK-DAG: %build_fptosi = fptosi double %build_sitofp to i64
+     * CHECK-DAG: %build_fptrunc = fptrunc double %build_sitofp to float
+     * CHECK-DAG: %build_fptrunc2 = fptrunc double %build_sitofp to float
+     * CHECK-DAG: %build_fpext = fpext float %build_fptrunc to double
+     * CHECK-DAG: %build_fpext2 = fpext float %build_fptrunc to double
+     * CHECK-DAG: %build_inttoptr = inttoptr i32 %P1 to i8*
+     * CHECK-DAG: %build_ptrtoint = ptrtoint i8* %build_inttoptr to i64
+     * CHECK-DAG: %build_ptrtoint2 = ptrtoint i8* %build_inttoptr to i64
+     * CHECK-DAG: %build_bitcast = bitcast i64 %build_ptrtoint to double
+     * CHECK-DAG: %build_bitcast2 = bitcast i64 %build_ptrtoint to double
+     * CHECK-DAG: %build_bitcast3 = bitcast i64 %build_ptrtoint to double
+     * CHECK-DAG: %build_bitcast4 = bitcast i64 %build_ptrtoint to double
+     * CHECK-DAG: %build_pointercast = bitcast i8* %build_inttoptr to i16*
+     *)
+    let inst28 = build_trunc p1 i8_type "build_trunc" atentry in
+    let inst29 = build_zext inst28 i32_type "build_zext" atentry in
+    let inst30 = build_sext inst29 i64_type "build_sext" atentry in
+    let inst31 = build_uitofp inst30 float_type "build_uitofp" atentry in
+    let inst32 = build_sitofp inst29 double_type "build_sitofp" atentry in
+    ignore(build_fptoui inst31 i32_type "build_fptoui" atentry);
+    ignore(build_fptosi inst32 i64_type "build_fptosi" atentry);
+    let inst35 = build_fptrunc inst32 float_type "build_fptrunc" atentry in
+    ignore(build_fpext inst35 double_type "build_fpext" atentry);
+    let inst37 = build_inttoptr p1 void_ptr "build_inttoptr" atentry in
+    let inst38 = build_ptrtoint inst37 i64_type "build_ptrtoint" atentry in
+    ignore(build_bitcast inst38 double_type "build_bitcast" atentry);
+    ignore(build_zext_or_bitcast inst38 double_type "build_bitcast2" atentry);
+    ignore(build_sext_or_bitcast inst38 double_type "build_bitcast3" atentry);
+    ignore(build_trunc_or_bitcast inst38 double_type "build_bitcast4" atentry);
+    ignore(build_pointercast inst37 (pointer_type i16_type) "build_pointercast" atentry);
+
+    (* The generic cast helpers below are expected to pick the concrete cast
+       spelled out in the value name. *)
+    ignore(build_zext_or_bitcast inst28 i32_type "build_zext2" atentry);
+    ignore(build_sext_or_bitcast inst29 i64_type "build_sext2" atentry);
+    ignore(build_trunc_or_bitcast p1 i8_type "build_trunc2" atentry);
+    ignore(build_pointercast inst37 i64_type "build_ptrtoint2" atentry);
+    ignore(build_intcast inst29 i64_type "build_sext3" atentry);
+    ignore(build_intcast p1 i8_type "build_trunc3" atentry);
+    ignore(build_fpcast inst35 double_type "build_fpext2" atentry);
+    ignore(build_fpcast inst32 float_type "build_fptrunc2" atentry);
+  end;
+
+  group "comparisons"; begin
+    (* CHECK: %build_icmp_ne = icmp ne i32 %P1, %P2
+     * CHECK: %build_icmp_sle = icmp sle i32 %P2, %P1
+     * CHECK: %build_fcmp_false = fcmp false float %F1, %F2
+     * CHECK: %build_fcmp_true = fcmp true float %F2, %F1
+     * CHECK: %build_is_null{{.*}}= icmp eq{{.*}}%X0,{{.*}}null
+     * CHECK: %build_is_not_null = icmp ne i8* %X1, null
+     * CHECK: %build_ptrdiff
+     *)
+    let c = build_icmp Icmp.Ne    p1 p2 "build_icmp_ne" atentry in
+    insist (Some Icmp.Ne = icmp_predicate c);
+    insist (None = fcmp_predicate c);
+
+    let c = build_icmp Icmp.Sle   p2 p1 "build_icmp_sle" atentry in
+    insist (Some Icmp.Sle = icmp_predicate c);
+    insist (None = fcmp_predicate c);
+
+    let c = build_fcmp Fcmp.False f1 f2 "build_fcmp_false" atentry in
+    (* insist (Some Fcmp.False = fcmp_predicate c); *)
+    insist (None = icmp_predicate c);
+
+    let c = build_fcmp Fcmp.True  f2 f1 "build_fcmp_true" atentry in
+    (* insist (Some Fcmp.True = fcmp_predicate c); *)
+    insist (None = icmp_predicate c);
+
+    let g0 = declare_global (pointer_type i8_type) "g0" m in
+    let g1 = declare_global (pointer_type i8_type) "g1" m in
+    let p0 = build_load g0 "X0" atentry in
+    (* This p1 shadows the function parameter P1 until the end of the group. *)
+    let p1 = build_load g1 "X1" atentry in
+    ignore (build_is_null p0 "build_is_null" atentry);
+    ignore (build_is_not_null p1 "build_is_not_null" atentry);
+    ignore (build_ptrdiff p1 p0 "build_ptrdiff" atentry);
+  end;
+
+  group "miscellaneous"; begin
+    (* CHECK: %build_call = tail call cc63 i32 @{{.*}}(i32 signext %P2, i32 %P1)
+     * CHECK: %build_select = select i1 %build_icmp, i32 %P1, i32 %P2
+     * CHECK: %build_va_arg = va_arg i8** null, i32
+     * CHECK: %build_extractelement = extractelement <4 x i32> %Vec1, i32 %P2
+     * CHECK: %build_insertelement = insertelement <4 x i32> %Vec1, i32 %P1, i32 %P2
+     * CHECK: %build_shufflevector = shufflevector <4 x i32> %Vec1, <4 x i32> %Vec2, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
+     * CHECK: %build_insertvalue0 = insertvalue{{.*}}%bl, i32 1, 0
+     * CHECK: %build_extractvalue = extractvalue{{.*}}%build_insertvalue1, 1
+     *)
+    let ci = build_call fn [| p2; p1 |] "build_call" atentry in
+    insist (CallConv.c = instruction_call_conv ci);
+    set_instruction_call_conv 63 ci;
+    insist (63 = instruction_call_conv ci);
+    insist (not (is_tail_call ci));
+    set_tail_call true ci;
+    insist (is_tail_call ci);
+    add_instruction_param_attr ci 1 Attribute.Sext;
+    (* Noalias is added and removed again, so only signext remains on the call. *)
+    add_instruction_param_attr ci 2 Attribute.Noalias;
+    remove_instruction_param_attr ci 2 Attribute.Noalias;
+
+    let inst46 = build_icmp Icmp.Eq p1 p2 "build_icmp" atentry in
+    ignore (build_select inst46 p1 p2 "build_select" atentry);
+    ignore (build_va_arg
+      (const_null (pointer_type (pointer_type i8_type)))
+      i32_type "build_va_arg" atentry);
+
+    (* Set up some vector vregs. *)
+    let one  = const_int i32_type 1 in
+    let zero = const_int i32_type 0 in
+    let t1 = const_vector [| one; zero; one; zero |] in
+    let t2 = const_vector [| zero; one; zero; one |] in
+    let t3 = const_vector [| one; one; zero; zero |] in
+    let vec1 = build_insertelement t1 p1 p2 "Vec1" atentry in
+    let vec2 = build_insertelement t2 p1 p2 "Vec2" atentry in
+    let sty = struct_type context [| i32_type; i8_type |] in
+
+    ignore (build_extractelement vec1 p2 "build_extractelement" atentry);
+    ignore (build_insertelement vec1 p1 p2 "build_insertelement" atentry);
+    ignore (build_shufflevector vec1 vec2 t3 "build_shufflevector" atentry);
+
+    let p = build_alloca sty "ba" atentry in
+    let agg = build_load p "bl" atentry in
+    let agg0 = build_insertvalue agg (const_int i32_type 1) 0
+                 "build_insertvalue0" atentry in
+    let agg1 = build_insertvalue agg0 (const_int i8_type 2) 1
+                 "build_insertvalue1" atentry in
+    ignore (build_extractvalue agg1 1 "build_extractvalue" atentry)
+  end;
+
+  group "metadata"; begin
+    (* CHECK: %metadata = add i32 %P1, %P2, !test !1
+     * !1 is metadata emitted at EOF.
+     *)
+    let i = build_add p1 p2 "metadata" atentry in
+    insist ((has_metadata i) = false);
+
+    let m1 = const_int i32_type 1 in
+    let m2 = mdstring context "metadata test" in
+    let md = mdnode context [| m1; m2 |] in
+
+    let kind = mdkind_id context "test" in
+    set_metadata i kind md;
+
+    insist ((has_metadata i) = true);
+    insist ((metadata i kind) = Some md);
+
+    clear_metadata i kind;
+
+    insist ((has_metadata i) = false);
+    insist ((metadata i kind) = None);
+
+    (* Re-attach the node so that it appears in the emitted instruction. *)
+    set_metadata i kind md
+  end;
+
+  group "named metadata"; begin
+    (* !llvm.module.flags is emitted at EOF. *)
+    let n1 = const_int i32_type 1 in
+    let n2 = mdstring context "Debug Info Version" in
+    let n3 = const_int i32_type 2 in
+    let md = mdnode context [| n1; n2; n3 |] in
+    add_named_metadata_operand m "llvm.module.flags" md;
+
+    insist ((get_named_metadata m "llvm.module.flags") = [| md |])
+  end;
+
+  group "dbg"; begin
+    (* CHECK: %dbg = add i32 %P1, %P2, !dbg !2
+     * !2 is metadata emitted at EOF.
+     *)
+    insist ((current_debug_location atentry) = None);
+
+    let m_line = const_int i32_type 2 in
+    let m_col = const_int i32_type 3 in
+    let m_scope = mdnode context [| |] in
+    let m_inlined = mdnode context [| |] in
+    let md = mdnode context [| m_line; m_col; m_scope; m_inlined |] in
+    set_current_debug_location atentry md;
+
+    insist ((current_debug_location atentry) = Some md);
+
+    let i = build_add p1 p2 "dbg" atentry in
+    insist ((has_metadata i) = true);
+
+    clear_current_debug_location atentry
+  end;
+
+  group "ret"; begin
+    (* CHECK: ret{{.*}}P1
+     *)
+    let ret = build_ret p1 atentry in
+    (* Reposition atentry in front of the terminator just built. *)
+    position_before ret atentry
+  end;
+
+  (* see test/Feature/exception.ll *)
+  let bblpad = append_block context "Bblpad" fn in
+  let rt = struct_type context [| pointer_type i8_type; i32_type |] in
+  let ft = var_arg_function_type i32_type  [||] in
+  let personality = declare_function "__gxx_personality_v0" ft m in
+  let ztic = declare_global (pointer_type i8_type) "_ZTIc" m in
+  let ztid = declare_global (pointer_type i8_type) "_ZTId" m in
+  let ztipkc = declare_global (pointer_type i8_type) "_ZTIPKc" m in
+  begin
+      set_global_constant true ztic;
+      set_global_constant true ztid;
+      set_global_constant true ztipkc;
+      let lp = build_landingpad rt personality 0 "lpad"
+       (builder_at_end context bblpad) in begin
+           set_cleanup lp true;
+           add_clause lp ztic;
+           insist((pointer_type (pointer_type i8_type)) = type_of ztid);
+           let ety = pointer_type (pointer_type i8_type) in
+           add_clause lp (const_array ety [| ztipkc; ztid |]);
+           ignore (build_resume lp (builder_at_end context bblpad));
+      end;
+      (* CHECK: landingpad{{.*}}personality{{.*}}__gxx_personality_v0
+       * CHECK: cleanup
+       * CHECK: catch{{.*}}i8**{{.*}}@_ZTIc
+       * CHECK: filter{{.*}}@_ZTIPKc{{.*}}@_ZTId
+       * CHECK: resume
+       * *)
+  end;
+
+  group "br"; begin
+    (* CHECK: br{{.*}}Bb02
+     *)
+    let bb02 = append_block context "Bb02" fn in
+    let b = builder_at_end context bb02 in
+    let br = build_br bb02 b in
+    insist (successors br = [| bb02 |]) ;
+    insist (is_conditional br = false) ;
+    insist (get_branch br = Some (`Unconditional bb02)) ;
+  end;
+
+  group "cond_br"; begin
+    (* CHECK: br{{.*}}build_br{{.*}}Bb03{{.*}}Bb00
+     *)
+    let bb03 = append_block context "Bb03" fn in
+    let b = builder_at_end context bb03 in
+    let cond = build_trunc p1 i1_type "build_br" b in
+    let br = build_cond_br cond bb03 bb00 b in
+    insist (num_successors br = 2) ;
+    insist (successor br 0 = bb03) ;
+    insist (successor br 1 = bb00) ;
+    insist (is_conditional br = true) ;
+    insist (get_branch br = Some (`Conditional (cond, bb03, bb00))) ;
+  end;
+
+  group "switch"; begin
+    (* CHECK: switch{{.*}}P1{{.*}}SwiBlock3
+     * CHECK: 2,{{.*}}SwiBlock2
+     *)
+    let bb1 = append_block context "SwiBlock1" fn in
+    let bb2 = append_block context "SwiBlock2" fn in
+    ignore (build_unreachable (builder_at_end context bb2));
+    let bb3 = append_block context "SwiBlock3" fn in
+    ignore (build_unreachable (builder_at_end context bb3));
+    let si = build_switch p1 bb3 1 (builder_at_end context bb1) in begin
+        ignore (add_case si (const_int i32_type 2) bb2);
+        insist (switch_default_dest si = bb3);
+    end;
+    insist (num_successors si = 2) ;
+    insist (get_branch si = None) ;
+  end;
+
+  group "malloc/free"; begin
+      (* CHECK: call{{.*}}@malloc(i32 ptrtoint
+       * CHECK: call{{.*}}@free(i8*
+       * CHECK: call{{.*}}@malloc(i32 %
+       *)
+      let bb1 = append_block context "MallocBlock1" fn in
+      let m1 = (build_malloc (pointer_type i32_type) "m1"
+      (builder_at_end context bb1)) in
+      ignore (build_free m1 (builder_at_end context bb1));
+      ignore (build_array_malloc i32_type p1 "m2" (builder_at_end context bb1));
+      ignore (build_unreachable (builder_at_end context bb1));
+  end;
+
+  group "indirectbr"; begin
+    (* CHECK: indirectbr i8* blockaddress(@X7, %IBRBlock2), [label %IBRBlock2, label %IBRBlock3]
+     *)
+    let bb1 = append_block context "IBRBlock1" fn in
+
+    let bb2 = append_block context "IBRBlock2" fn in
+    ignore (build_unreachable (builder_at_end context bb2));
+
+    let bb3 = append_block context "IBRBlock3" fn in
+    ignore (build_unreachable (builder_at_end context bb3));
+
+    let addr = block_address fn bb2 in
+    let ibr = build_indirect_br addr 2 (builder_at_end context bb1) in
+    ignore (add_destination ibr bb2);
+    ignore (add_destination ibr bb3)
+  end;
+
+  group "invoke"; begin
+    (* CHECK: build_invoke{{.*}}invoke{{.*}}P1{{.*}}P2
+     * CHECK: to{{.*}}Bb04{{.*}}unwind{{.*}}Bblpad
+     *)
+    let bb04 = append_block context "Bb04" fn in
+    let b = builder_at_end context bb04 in
+    ignore (build_invoke fn [| p1; p2 |] bb04 bblpad "build_invoke" b)
+  end;
+
+  group "unreachable"; begin
+    (* CHECK: unreachable
+     *)
+    let bb06 = append_block context "Bb06" fn in
+    let b = builder_at_end context bb06 in
+    ignore (build_unreachable b)
+  end;
+
+  group "arithmetic"; begin
+    let bb07 = append_block context "Bb07" fn in
+    let b = builder_at_end context bb07 in
+
+    (* CHECK: %build_add = add i32 %P1, %P2
+     * CHECK: %build_nsw_add = add nsw i32 %P1, %P2
+     * CHECK: %build_nuw_add = add nuw i32 %P1, %P2
+     * CHECK: %build_fadd = fadd float %F1, %F2
+     * CHECK: %build_sub = sub i32 %P1, %P2
+     * CHECK: %build_nsw_sub = sub nsw i32 %P1, %P2
+     * CHECK: %build_nuw_sub = sub nuw i32 %P1, %P2
+     * CHECK: %build_fsub = fsub float %F1, %F2
+     * CHECK: %build_mul = mul i32 %P1, %P2
+     * CHECK: %build_nsw_mul = mul nsw i32 %P1, %P2
+     * CHECK: %build_nuw_mul = mul nuw i32 %P1, %P2
+     * CHECK: %build_fmul = fmul float %F1, %F2
+     * CHECK: %build_udiv = udiv i32 %P1, %P2
+     * CHECK: %build_sdiv = sdiv i32 %P1, %P2
+     * CHECK: %build_exact_sdiv = sdiv exact i32 %P1, %P2
+     * CHECK: %build_fdiv = fdiv float %F1, %F2
+     * CHECK: %build_urem = urem i32 %P1, %P2
+     * CHECK: %build_srem = srem i32 %P1, %P2
+     * CHECK: %build_frem = frem float %F1, %F2
+     * CHECK: %build_shl = shl i32 %P1, %P2
+     * CHECK: %build_lshl = lshr i32 %P1, %P2
+     * CHECK: %build_ashl = ashr i32 %P1, %P2
+     * CHECK: %build_and = and i32 %P1, %P2
+     * CHECK: %build_or = or i32 %P1, %P2
+     * CHECK: %build_xor = xor i32 %P1, %P2
+     * CHECK: %build_neg = sub i32 0, %P1
+     * CHECK: %build_nsw_neg = sub nsw i32 0, %P1
+     * CHECK: %build_nuw_neg = sub nuw i32 0, %P1
+     * CHECK: %build_fneg = fsub float {{.*}}0{{.*}}, %F1
+     * CHECK: %build_not = xor i32 %P1, -1
+     *)
+    ignore (build_add p1 p2 "build_add" b);
+    ignore (build_nsw_add p1 p2 "build_nsw_add" b);
+    ignore (build_nuw_add p1 p2 "build_nuw_add" b);
+    ignore (build_fadd f1 f2 "build_fadd" b);
+    ignore (build_sub p1 p2 "build_sub" b);
+    ignore (build_nsw_sub p1 p2 "build_nsw_sub" b);
+    ignore (build_nuw_sub p1 p2 "build_nuw_sub" b);
+    ignore (build_fsub f1 f2 "build_fsub" b);
+    ignore (build_mul p1 p2 "build_mul" b);
+    ignore (build_nsw_mul p1 p2 "build_nsw_mul" b);
+    ignore (build_nuw_mul p1 p2 "build_nuw_mul" b);
+    ignore (build_fmul f1 f2 "build_fmul" b);
+    ignore (build_udiv p1 p2 "build_udiv" b);
+    ignore (build_sdiv p1 p2 "build_sdiv" b);
+    ignore (build_exact_sdiv p1 p2 "build_exact_sdiv" b);
+    ignore (build_fdiv f1 f2 "build_fdiv" b);
+    ignore (build_urem p1 p2 "build_urem" b);
+    ignore (build_srem p1 p2 "build_srem" b);
+    ignore (build_frem f1 f2 "build_frem" b);
+    ignore (build_shl p1 p2 "build_shl" b);
+    (* The value names of the next two say lshl and ashl although the
+       instructions are lshr and ashr; the expected patterns use the same
+       names, so they must stay in sync. *)
+    ignore (build_lshr p1 p2 "build_lshl" b);
+    ignore (build_ashr p1 p2 "build_ashl" b);
+    ignore (build_and p1 p2 "build_and" b);
+    ignore (build_or p1 p2 "build_or" b);
+    ignore (build_xor p1 p2 "build_xor" b);
+    ignore (build_neg p1 "build_neg" b);
+    ignore (build_nsw_neg p1 "build_nsw_neg" b);
+    ignore (build_nuw_neg p1 "build_nuw_neg" b);
+    ignore (build_fneg f1 "build_fneg" b);
+    ignore (build_not p1 "build_not" b);
+    ignore (build_unreachable b)
+  end;
+
+  group "memory"; begin
+    let bb08 = append_block context "Bb08" fn in
+    let b = builder_at_end context bb08 in
+
+    (* CHECK: %build_alloca = alloca i32
+     * CHECK: %build_array_alloca = alloca i32, i32 %P2
+     * CHECK: %build_load = load volatile i32* %build_array_alloca, align 4
+     * CHECK: store volatile i32 %P2, i32* %build_alloca, align 4
+     * CHECK: %build_gep = getelementptr i32* %build_array_alloca, i32 %P2
+     * CHECK: %build_in_bounds_gep = getelementptr inbounds i32* %build_array_alloca, i32 %P2
+     * CHECK: %build_struct_gep = getelementptr inbounds{{.*}}%build_alloca2, i32 0, i32 1
+     * CHECK: %build_atomicrmw = atomicrmw xchg i8* %p, i8 42 seq_cst
+     *)
+    let alloca = build_alloca i32_type "build_alloca" b in
+    let array_alloca = build_array_alloca i32_type p2 "build_array_alloca" b in
+
+    let load = build_load array_alloca "build_load" b in
+    ignore(set_alignment 4 load);
+    ignore(set_volatile true load);
+    insist(true = is_volatile load);
+    insist(4 = alignment load);
+
+    let store = build_store p2 alloca b in
+    ignore(set_volatile true store);
+    ignore(set_alignment 4 store);
+    insist(true = is_volatile store);
+    insist(4 = alignment store);
+    ignore(build_gep array_alloca [| p2 |] "build_gep" b);
+    ignore(build_in_bounds_gep array_alloca [| p2 |] "build_in_bounds_gep" b);
+
+    let sty = struct_type context [| i32_type; i8_type |] in
+    let alloca2 = build_alloca sty "build_alloca2" b in
+    ignore(build_struct_gep alloca2 1 "build_struct_gep" b);
+
+    let p = build_alloca i8_type "p" b in
+    ignore(build_atomicrmw AtomicRMWBinOp.Xchg p (const_int i8_type 42)
+              AtomicOrdering.SequentiallyConsistent false "build_atomicrmw"
+              b);
+
+    ignore(build_unreachable b)
+  end;
+
+  group "string"; begin
+    let bb09 = append_block context "Bb09" fn in
+    let b = builder_at_end context bb09 in
+    let p = build_alloca (pointer_type i8_type) "p" b in
+    (* build_global_string is emitted above.
+     * CHECK: store{{.*}}build_global_string1{{.*}}p
+     * *)
+    ignore (build_global_string "stringval" "build_global_string" b);
+    let g = build_global_stringptr "stringval" "build_global_string1" b in
+    ignore (build_store g p b);
+    ignore(build_unreachable b);
+  end;
+
+  group "phi"; begin
+    (* CHECK: PhiNode{{.*}}P1{{.*}}PhiBlock1{{.*}}P2{{.*}}PhiBlock2
+     *)
+    let b1 = append_block context "PhiBlock1" fn in
+    let b2 = append_block context "PhiBlock2" fn in
+
+    let jb = append_block context "PhiJoinBlock" fn in
+    ignore (build_br jb (builder_at_end context b1));
+    ignore (build_br jb (builder_at_end context b2));
+    let at_jb = builder_at_end context jb in
+
+    (* The phi starts with one incoming edge; the second is added afterwards. *)
+    let phi = build_phi [(p1, b1)] "PhiNode" at_jb in
+    insist ([(p1, b1)] = incoming phi);
+
+    add_incoming (p2, b2) phi;
+    insist ([(p1, b1); (p2, b2)] = incoming phi);
+
+    ignore (build_unreachable at_jb);
+  end
+
+(* End-of-file checks for things like metadata and attributes.
+ * CHECK: attributes #0 = {{.*}}uwtable{{.*}}
+ * CHECK: !llvm.module.flags = !{!0}
+ * CHECK: !0 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+ * CHECK: !1 = metadata !{i32 1, metadata !"metadata test"}
+ * CHECK: !2 = metadata !{i32 2, i32 3, metadata !3, metadata !3}
+ *)
+
+(*===-- Pass Managers -----------------------------------------------------===*)
+
+(* Creates, runs and disposes a module pass manager over m, then does the same
+   with a function pass manager over a freshly defined empty function. The
+   results of the individual pass manager calls are discarded. *)
+let test_pass_manager () =
+  group "module pass manager";
+  begin
+    let pm = PassManager.create () in
+    ignore (PassManager.run_module m pm);
+    ignore (PassManager.dispose pm)
+  end;
+
+  group "function pass manager";
+  begin
+    let fty = function_type void_type [| |] in
+    let fn = define_function "FunctionPassManager" fty m in
+    let at_entry = builder_at_end context (entry_block fn) in
+    ignore (build_ret_void at_entry);
+
+    (* Same lifecycle as above, bracketed by initialize and finalize. *)
+    let fpm = PassManager.create_function m in
+    ignore (PassManager.initialize fpm);
+    ignore (PassManager.run_function fn fpm);
+    ignore (PassManager.finalize fpm);
+    ignore (PassManager.dispose fpm)
+  end
+
+
+(*===-- Memory Buffer -----------------------------------------------------===*)
+
+(* Round-trips a string through a memory buffer and insists that the text read
+   back is unchanged. *)
+let test_memory_buffer () =
+  group "memory buffer";
+  let round_tripped =
+    MemoryBuffer.as_string (MemoryBuffer.of_string "foobar") in
+  insist (round_tripped = "foobar")
+
+
+(*===-- Writer ------------------------------------------------------------===*)
+
+(* Verifies the shared module m, writes it out as bitcode to filename, and
+   finally disposes m. A verifier message, if any, is printed to stderr before
+   the failing insist. *)
+let test_writer () =
+  group "valid";
+  let verified =
+    match Llvm_analysis.verify_module m with
+    | Some msg -> prerr_string msg; false
+    | None -> true
+  in
+  insist verified;
+
+  group "writer";
+  let written = write_bitcode_file m filename in
+  insist written;
+
+  dispose_module m
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+
+(* Entry point: runs every suite in the order listed, then terminates the
+   process with the value currently stored in exit_status. The order matters
+   because all suites share the module m and the last one disposes it. *)
+let _ =
+  suite "conversion"       test_conversion;
+  suite "target"           test_target;
+  suite "constants"        test_constants;
+  suite "global values"    test_global_values;
+  suite "global variables" test_global_variables;
+  suite "uses"             test_uses;
+  suite "users"            test_users;
+  suite "aliases"          test_aliases;
+  suite "functions"        test_functions;
+  suite "params"           test_params;
+  suite "basic blocks"     test_basic_blocks;
+  suite "instructions"     test_instructions;
+  suite "builder"          test_builder;
+  suite "pass manager"     test_pass_manager;
+  suite "memory buffer"    test_memory_buffer;
+  suite "writer"           test_writer; (* Keep this last; it disposes m. *)
+  exit !exit_status

diff --git a/test/Bindings/OCaml/executionengine.ml b/test/Bindings/OCaml/executionengine.ml
new file mode 100644
index 0000000..893f988
--- /dev/null
+++ b/test/Bindings/OCaml/executionengine.ml

@@ -0,0 +1,105 @@
+(* RUN: cp %s %T/executionengine.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.executionengine -linkpkg %T/executionengine.ml -o %t
+ * RUN: %t
+ * RUN: %ocamlopt -g -warn-error A -package llvm.executionengine -linkpkg %T/executionengine.ml -o %t
+ * RUN: %t
+ * REQUIRES: native, object-emission
+ * XFAIL: vg_leak
+ *)
+
+open Llvm
+open Llvm_executionengine
+open Llvm_target
+
+(* Note that this takes a moment to link, so it's best to keep the number of
+   individual tests low. *)
+
+(* Shared fixture: the global LLVM context and a few types created in it. *)
+let context = global_context ()
+let i8_type = Llvm.i8_type context
+let i32_type = Llvm.i32_type context
+let i64_type = Llvm.i64_type context
+let double_type = Llvm.double_type context
+
+(* Initialize the execution engine bindings at module load time and fail if
+   initialization reports false.
+   NOTE(review): the call sits inside assert, so a build with -noassert would
+   drop the initialization altogether - confirm this file is never compiled
+   that way. *)
+let () =
+  assert (Llvm_executionengine.initialize ())
+
+(* Report msg on stderr and terminate the process with exit code 2. *)
+let bomb msg =
+  let () = prerr_endline msg in
+  exit 2
+
+(* Defines in module m a function named getglobal, taking no arguments and
+   returning i32, whose body calls pg with no arguments and returns the
+   result. Returns the new function. *)
+let define_getglobal m pg =
+  let getter_ty = function_type i32_type [||] in
+  let getter = define_function "getglobal" getter_ty m in
+  let at_entry = builder_at_end (global_context ()) (entry_block getter) in
+  let fetched = build_call pg [||] "" at_entry in
+  let _ = build_ret fetched at_entry in
+  getter
+
+(* Defines in module m a function named plus that takes two i32 parameters and
+   returns their sum. Returns the new function. *)
+let define_plus m =
+  let plus_ty = function_type i32_type [| i32_type; i32_type |] in
+  let plus = define_function "plus" plus_ty m in
+  let at_entry = builder_at_end (global_context ()) (entry_block plus) in
+  let total = build_add (param plus 0) (param plus 1) "sum" at_entry in
+  let _ = build_ret total at_entry in
+  plus
+
+(* End-to-end exercise of the execution engine bindings: creates an engine for
+   one module, adds and later removes a second module, runs static
+   constructors and destructors, calls the generated plus function through a
+   Ctypes function pointer, queries the data layout, and disposes the engine.
+   Any failed expectation goes through bomb, which exits the process. The
+   global-mapping part is commented out below and therefore not exercised. *)
+let test_executionengine () =
+  let open Ctypes in
+
+  (* create *)
+  let m = create_module (global_context ()) "test_module" in
+  let ee = create m in
+
+  (* add plus *)
+  let plus = define_plus m in
+
+  (* add module *)
+  let m2 = create_module (global_context ()) "test_module2" in
+  add_module m2 ee;
+
+  (* add global mapping *)
+  (* BROKEN: see PR20656 *)
+  (* let g = declare_function "g" (function_type i32_type [||]) m2 in
+  let cg = coerce (Foreign.funptr (void @-> returning int32_t)) (ptr void)
+                                  (fun () -> 42l) in
+  add_global_mapping g cg ee;
+
+  (* check g *)
+  let cg' = get_pointer_to_global g (ptr void) ee in
+  if 0 <> ptr_compare cg cg' then bomb "int pointers to g differ";
+
+  (* add getglobal *)
+  let getglobal = define_getglobal m2 g in*)
+
+  (* run_static_ctors *)
+  run_static_ctors ee;
+
+  (* call plus *)
+  let cplusty = Foreign.funptr (int32_t @-> int32_t @-> returning int32_t) in
+  let cplus   = get_pointer_to_global plus cplusty ee in
+  if 4l <> cplus 2l 2l then bomb "plus didn't work";
+
+  (* call getglobal *)
+  (* let cgetglobalty = Foreign.funptr (void @-> returning int32_t) in
+  let cgetglobal   = get_pointer_to_global getglobal cgetglobalty ee in
+  if 42l <> cgetglobal () then bomb "getglobal didn't work"; *)
+
+  (* remove_module *)
+  (* m2 is disposed by hand once it has been removed from the engine. *)
+  remove_module m2 ee;
+  dispose_module m2;
+
+  (* run_static_dtors *)
+  run_static_dtors ee;
+
+  (* Show that the data layout binding links and runs.*)
+  let dl = data_layout ee in
+
+  (* Demonstrate that a garbage pointer wasn't returned. *)
+  (* The comparison below uses physical inequality on the type handles. *)
+  let ty = DataLayout.intptr_type context dl in
+  if ty != i32_type && ty != i64_type then bomb "target_data did not work";
+
+  (* dispose *)
+  dispose ee
+
+(* Entry point: run the single test, then force a heap compaction.
+   NOTE(review): the compaction presumably flushes out finalizer or dangling
+   pointer problems in the bindings - confirm the intent. *)
+let () =
+  test_executionengine ();
+  Gc.compact ()

diff --git a/test/Bindings/OCaml/ext_exc.ml b/test/Bindings/OCaml/ext_exc.ml
new file mode 100644
index 0000000..2b44803
--- /dev/null
+++ b/test/Bindings/OCaml/ext_exc.ml

@@ -0,0 +1,22 @@
+(* RUN: cp %s %T/ext_exc.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.bitreader -linkpkg %T/ext_exc.ml -o %t
+ * RUN: %t
+ * RUN: %ocamlopt -g -warn-error A -package llvm.bitreader -linkpkg %T/ext_exc.ml -o %t
+ * RUN: %t
+ * XFAIL: vg_leak
+ *)
+
+let context = Llvm.global_context ()
+
+(* this used to crash, we must not use 'external' in .mli files, but 'val' if we
+ * want the let _ bindings executed, see http://caml.inria.fr/mantis/view.php?id=4166 *)
+(* Try to read a bitcode module from standard input; a reader error is
+   swallowed, so only an unexpected exception or a crash fails the test. *)
+let _ =
+    try
+        ignore (Llvm_bitreader.get_module context (Llvm.MemoryBuffer.of_stdin ()))
+    with
+    Llvm_bitreader.Error _ -> ();;
+(* Try to open a file at a path that is not expected to exist; the resulting
+   I/O error is swallowed in the same way. *)
+let _ =
+    try
+        ignore (Llvm.MemoryBuffer.of_file "/path/to/nonexistent/file")
+    with
+    Llvm.IoError _ -> ();;

diff --git a/test/Bindings/OCaml/ipo.ml b/test/Bindings/OCaml/ipo.ml
new file mode 100644
index 0000000..fc728b9
--- /dev/null
+++ b/test/Bindings/OCaml/ipo.ml

@@ -0,0 +1,72 @@
+(* RUN: cp %s %T/ipo_opts.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.ipo -linkpkg %T/ipo_opts.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: %ocamlopt -g -warn-error A -package llvm.ipo -linkpkg %T/ipo_opts.ml -o %t
+ * RUN: %t %t.bc
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_ipo
+open Llvm_target
+
+let context = global_context ()
+let void_type = Llvm.void_type context
+let i8_type = Llvm.i8_type context
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false
+
+(* Runs the test function f. When print_checkpoints is set, the suite name is
+   first written to stderr followed by a colon. *)
+let suite name f =
+  let () = if print_checkpoints then prerr_endline (name ^ ":") in
+  f ()
+
+
+(*===-- Fixture -----------------------------------------------------------===*)
+
+let filename = Sys.argv.(1)
+let m = create_module context filename
+
+
+(*===-- Transforms --------------------------------------------------------===*)
+
+(* Builds two tiny functions in m (fn returns the constant 4, fn2 returns the
+   result of calling fn), then registers the listed interprocedural passes on a
+   fresh module pass manager, runs it over m once, and disposes it. *)
+let test_transforms () =
+  (* pm ++ add applies add to pm and hands pm back so calls can be chained. *)
+  let (++) pm add = add pm; pm in
+
+  let fty = function_type i8_type [| |] in
+  let fn = define_function "fn" fty m in
+  let fn2 = define_function "fn2" fty m in
+
+  let fn_entry = builder_at_end context (entry_block fn) in
+  ignore (build_ret (const_int i8_type 4) fn_entry);
+
+  let fn2_entry = builder_at_end context  (entry_block fn2) in
+  let call = build_call fn [| |] "" fn2_entry in
+  ignore (build_ret call fn2_entry);
+
+  let pipeline =
+    PassManager.create ()
+      ++ add_argument_promotion
+      ++ add_constant_merge
+      ++ add_dead_arg_elimination
+      ++ add_function_attrs
+      ++ add_function_inlining
+      ++ add_always_inliner
+      ++ add_global_dce
+      ++ add_global_optimizer
+      ++ add_ipc_propagation
+      ++ add_prune_eh
+      ++ add_ipsccp
+      ++ add_internalize ~all_but_main:true
+      ++ add_strip_dead_prototypes
+      ++ add_strip_symbols
+      ++ PassManager.run_module m
+      ++ PassManager.dispose
+  in
+  ignore pipeline
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+
+let _ =
+  suite "transforms" test_transforms;
+  dispose_module m

diff --git a/test/Bindings/OCaml/irreader.ml b/test/Bindings/OCaml/irreader.ml
new file mode 100644
index 0000000..e1771e7
--- /dev/null
+++ b/test/Bindings/OCaml/irreader.ml

@@ -0,0 +1,59 @@
+(* RUN: cp %s %T/irreader.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.irreader -linkpkg %T/irreader.ml -o %t
+ * RUN: %t
+ * RUN: %ocamlopt -g -warn-error A -package llvm.irreader -linkpkg %T/irreader.ml -o %t
+ * RUN: %t
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_irreader
+
+let context = global_context ()
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false
+
+(* Runs the test function f, announcing the suite name on stderr beforehand
+   when print_checkpoints is enabled. *)
+let suite name f =
+  begin
+    if print_checkpoints then prerr_endline (name ^ ":")
+  end;
+  f ()
+
+let _ =
+  Printexc.record_backtrace true
+
+(* Test assertion: does nothing when the condition holds and raises Failure
+   carrying the text insist otherwise. *)
+let insist = function
+  | true -> ()
+  | false -> failwith "insist"
+
+
+(*===-- IR Reader ---------------------------------------------------------===*)
+
+(* Parses a one-line IR module and insists that the global foo is initialized
+   to the i32 constant 42, then feeds the parser malformed IR and expects the
+   reader to raise its Error exception. *)
+let test_irreader () =
+  let good = MemoryBuffer.of_string "@foo = global i32 42" in
+  let parsed = parse_ir context good in
+  begin match lookup_global "foo" parsed with
+  | None ->
+      failwith "global"
+  | Some foo ->
+      let expected = const_int (i32_type context) 42 in
+      insist (global_initializer foo = expected)
+  end;
+
+  (* A successful parse of the garbled input is itself a test failure. *)
+  let bad = MemoryBuffer.of_string "@foo = global garble" in
+  try
+    ignore (parse_ir context bad);
+    failwith "parsed"
+  with Llvm_irreader.Error _ -> ()
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+
+let _ =
+  suite "irreader" test_irreader

diff --git a/test/Bindings/OCaml/linker.ml b/test/Bindings/OCaml/linker.ml
new file mode 100644
index 0000000..00064b0
--- /dev/null
+++ b/test/Bindings/OCaml/linker.ml

@@ -0,0 +1,63 @@
+(* RUN: cp %s %T/linker.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.linker -linkpkg %T/linker.ml -o %t
+ * RUN: %t
+ * RUN: %ocamlopt -g -warn-error A -package llvm.linker -linkpkg %T/linker.ml -o %t
+ * RUN: %t
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_linker
+
+let context = global_context ()
+let void_type = Llvm.void_type context  (* void type in [context] *)
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false  (* flip to true to trace suite names on stderr *)
+(* [suite name f] runs [f ()], first printing "name:" on stderr if enabled. *)
+let suite name f =
+  if print_checkpoints then
+    prerr_endline (name ^ ":");
+  f ()  (* not part of the [if]: the suite body always runs *)
+
+
+(*===-- Linker -----------------------------------------------------------===*)
+(* Links module pairs in both linker modes, then checks the failure path. *)
+let test_linker () =
+  let fty = function_type void_type [| |] in
+
+  let make_module name =  (* module [name] holding one void function fn_<name> *)
+    let m = create_module context name in
+    let fn = define_function ("fn_" ^ name) fty m in
+    ignore (build_ret_void (builder_at_end context (entry_block fn)));
+    m
+  in
+
+  let m1 = make_module "one"
+  and m2 = make_module "two" in
+  link_modules m1 m2 Mode.PreserveSource;
+  dispose_module m1;
+  dispose_module m2;
+
+  let m1 = make_module "one"
+  and m2 = make_module "two" in
+  link_modules m1 m2 Mode.DestroySource;
+  dispose_module m1;  (* unlike the case above, m2 is not disposed here *)
+
+  let m1 = make_module "one"
+  and m2 = make_module "one" in  (* both modules define fn_one *)
+  try
+    link_modules m1 m2 Mode.PreserveSource;
+    failwith "must raise"  (* reached only if the link unexpectedly succeeded *)
+  with Error _ ->
+    dispose_module m1;
+    dispose_module m2
+
+(*===-- Driver ------------------------------------------------------------===*)
+(* Entry point: runs the single "linker" suite. *)
+let _ =
+  suite "linker" test_linker

diff --git a/test/Bindings/OCaml/lit.local.cfg b/test/Bindings/OCaml/lit.local.cfg
new file mode 100644
index 0000000..bca5d39
--- /dev/null
+++ b/test/Bindings/OCaml/lit.local.cfg

@@ -0,0 +1,7 @@
+config.suffixes = ['.ml']
+
+if not 'ocaml' in config.root.llvm_bindings:
+    config.unsupported = True
+
+if config.root.have_ocaml_ounit != '1':
+    config.unsupported = True

diff --git a/test/Bindings/OCaml/passmgr_builder.ml b/test/Bindings/OCaml/passmgr_builder.ml
new file mode 100644
index 0000000..5dd9d4e
--- /dev/null
+++ b/test/Bindings/OCaml/passmgr_builder.ml

@@ -0,0 +1,64 @@
+(* RUN: cp %s %T/passmgr_builder.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.passmgr_builder -linkpkg %T/passmgr_builder.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: %ocamlopt -g -warn-error A -package llvm.passmgr_builder -linkpkg %T/passmgr_builder.ml -o %t
+ * RUN: %t %t.bc
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_passmgr_builder
+
+let context = global_context ()
+let void_type = Llvm.void_type context  (* void type in [context] *)
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false  (* flip to true to trace suite names on stderr *)
+(* [suite name f] runs [f ()], first printing "name:" on stderr if enabled. *)
+let suite name f =
+  if print_checkpoints then
+    prerr_endline (name ^ ":");
+  f ()  (* not part of the [if]: the suite body always runs *)
+
+
+(*===-- Fixture -----------------------------------------------------------===*)
+(* [filename] is argv.(1) (%t.bc on the RUN lines); it also names module [m]. *)
+let filename = Sys.argv.(1)
+let m = create_module context filename
+
+
+(*===-- Pass Manager Builder ----------------------------------------------===*)
+(* Configures a pass manager builder and populates three pass managers from it. *)
+let test_pmbuilder () =
+  let (++) x f = ignore (f x); x in  (* apply [f] to [x] for effect, return [x] *)
+
+  let module_passmgr = PassManager.create () in
+  let func_passmgr   = PassManager.create_function m in
+  let lto_passmgr    = PassManager.create () in
+
+  ignore (Llvm_passmgr_builder.create ()
+           ++ set_opt_level 3
+           ++ set_size_level 1
+           ++ set_disable_unit_at_a_time false
+           ++ set_disable_unroll_loops false
+           ++ use_inliner_with_threshold 10
+           ++ populate_function_pass_manager func_passmgr
+           ++ populate_module_pass_manager module_passmgr
+           ++ populate_lto_pass_manager lto_passmgr
+                  ~internalize:false ~run_inliner:false);
+  Gc.compact ();  (* NOTE(review): presumably collects the discarded builder *)
+
+  PassManager.dispose module_passmgr;
+  PassManager.dispose func_passmgr;
+  PassManager.dispose lto_passmgr
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+(* Entry point: runs the builder suite, then disposes of fixture module [m]. *)
+let _ =
+  suite "pass manager builder" test_pmbuilder;
+  dispose_module m

diff --git a/test/Bindings/OCaml/scalar_opts.ml b/test/Bindings/OCaml/scalar_opts.ml
new file mode 100644
index 0000000..3017fb1
--- /dev/null
+++ b/test/Bindings/OCaml/scalar_opts.ml

@@ -0,0 +1,92 @@
+(* RUN: cp %s %T/scalar_opts.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.scalar_opts -linkpkg %T/scalar_opts.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: %ocamlopt -g -warn-error A -package llvm.scalar_opts -linkpkg %T/scalar_opts.ml -o %t
+ * RUN: %t %t.bc
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_scalar_opts
+open Llvm_target
+
+let context = global_context ()
+let void_type = Llvm.void_type context  (* void type in [context] *)
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false  (* flip to true to trace suite names on stderr *)
+(* [suite name f] runs [f ()], first printing "name:" on stderr if enabled. *)
+let suite name f =
+  if print_checkpoints then
+    prerr_endline (name ^ ":");
+  f ()  (* not part of the [if]: the suite body always runs *)
+
+
+(*===-- Fixture -----------------------------------------------------------===*)
+(* [filename] is argv.(1) (%t.bc on the RUN lines); it also names module [m]. *)
+let filename = Sys.argv.(1)
+let m = create_module context filename
+
+
+(*===-- Transforms --------------------------------------------------------===*)
+(* Adds the scalar passes listed below to a function pass manager and runs it. *)
+let test_transforms () =
+  let (++) x f = f x; x in  (* apply [f] to [x] for effect, return [x] to chain *)
+
+  let fty = function_type void_type [| |] in
+  let fn = define_function "fn" fty m in
+  ignore (build_ret_void (builder_at_end context (entry_block fn)));
+
+  ignore (PassManager.create_function m
+           ++ add_aggressive_dce
+           ++ add_alignment_from_assumptions
+           ++ add_cfg_simplification
+           ++ add_dead_store_elimination
+           ++ add_scalarizer
+           ++ add_merged_load_store_motion
+           ++ add_gvn
+           ++ add_ind_var_simplification
+           ++ add_instruction_combination
+           ++ add_jump_threading
+           ++ add_licm
+           ++ add_loop_deletion
+           ++ add_loop_idiom
+           ++ add_loop_rotation
+           ++ add_loop_reroll
+           ++ add_loop_unroll
+           ++ add_loop_unswitch
+           ++ add_memcpy_opt
+           ++ add_partially_inline_lib_calls
+           ++ add_lower_switch
+           ++ add_memory_to_register_promotion
+           ++ add_reassociation
+           ++ add_sccp
+           ++ add_scalar_repl_aggregation
+           ++ add_scalar_repl_aggregation_ssa
+           ++ add_scalar_repl_aggregation_with_threshold 4
+           ++ add_lib_call_simplification
+           ++ add_tail_call_elimination
+           ++ add_constant_propagation
+           ++ add_memory_to_register_demotion
+           ++ add_verifier
+           ++ add_correlated_value_propagation
+           ++ add_early_cse
+           ++ add_lower_expect_intrinsic
+           ++ add_type_based_alias_analysis
+           ++ add_scoped_no_alias_alias_analysis
+           ++ add_basic_alias_analysis
+           ++ PassManager.initialize
+           ++ PassManager.run_function fn
+           ++ PassManager.finalize
+           ++ PassManager.dispose)  (* the pass manager is disposed as the last step *)
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+(* Entry point: runs the transforms suite, then disposes of fixture module [m]. *)
+let _ =
+  suite "transforms" test_transforms;
+  dispose_module m

diff --git a/test/Bindings/OCaml/target.ml b/test/Bindings/OCaml/target.ml
new file mode 100644
index 0000000..41faefa
--- /dev/null
+++ b/test/Bindings/OCaml/target.ml

@@ -0,0 +1,116 @@
+(* RUN: cp %s %T/target.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.target -package llvm.all_backends -linkpkg %T/target.ml -o %t
+ * RUN: %ocamlopt -g -warn-error A -package llvm.target -package llvm.all_backends -linkpkg %T/target.ml -o %t
+ * RUN: %t %t.bc
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_target
+
+let () = Llvm_all_backends.initialize ()  (* presumably registers all backends *)
+
+let context = global_context ()
+let i32_type = Llvm.i32_type context
+let i64_type = Llvm.i64_type context
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false  (* not referenced elsewhere in this file *)
+
+let _ =  (* turn on backtrace recording for exceptions *)
+  Printexc.record_backtrace true
+(* Structural comparison; raises Failure "assert_equal" on mismatch. *)
+let assert_equal a b =
+  if a <> b then failwith "assert_equal"
+
+
+(*===-- Fixture -----------------------------------------------------------===*)
+(* Shared by the tests below; [filename] is argv.(1) (%t.bc on the RUN line). *)
+let filename = Sys.argv.(1)
+let m = create_module context filename
+(* Target looked up from the default triple. *)
+let target = Target.by_triple (Target.default_triple ())
+(* Target machine for that same triple and target. *)
+let machine = TargetMachine.create (Target.default_triple ()) target
+
+(*===-- Data Layout -------------------------------------------------------===*)
+(* Parses a fixed layout string and checks DataLayout queries against it. *)
+let test_target_data () =
+  let module DL = DataLayout in
+  let layout = "e-p:32:32-f64:32:64-v64:32:64-v128:32:128-n32-S32" in
+  let dl     = DL.of_string layout in
+  let sty    = struct_type context [| i32_type; i64_type |] in  (* { i32, i64 } *)
+
+  assert_equal (DL.as_string dl) layout;  (* string round-trips unchanged *)
+  assert_equal (DL.byte_order dl) Endian.Little;
+  assert_equal (DL.pointer_size dl) 4;
+  assert_equal (DL.intptr_type context dl) i32_type;
+  assert_equal (DL.qualified_pointer_size 0 dl) 4;
+  assert_equal (DL.qualified_intptr_type context 0 dl) i32_type;
+  assert_equal (DL.size_in_bits sty dl) (Int64.of_int 96);
+  assert_equal (DL.store_size sty dl) (Int64.of_int 12);
+  assert_equal (DL.abi_size sty dl) (Int64.of_int 12);
+  assert_equal (DL.stack_align sty dl) 4;
+  assert_equal (DL.preferred_align sty dl) 8;
+  assert_equal (DL.preferred_align_of_global (declare_global sty "g" m) dl) 8;
+  assert_equal (DL.element_at_offset sty (Int64.of_int 1) dl) 0;
+  assert_equal (DL.offset_of_element sty 1 dl) (Int64.of_int 4);
+
+  let pm = PassManager.create () in  (* [pm] is not disposed in this test *)
+  ignore (DL.add_to_pass_manager pm dl)
+
+
+(*===-- Target ------------------------------------------------------------===*)
+(* Smoke test: calls each Target accessor on [target] and discards the result. *)
+let test_target () =
+  let module T = Target in
+  ignore (T.succ target);
+  ignore (T.name target);
+  ignore (T.description target);
+  ignore (T.has_jit target);
+  ignore (T.has_target_machine target);
+  ignore (T.has_asm_backend target)
+
+
+(*===-- Target Machine ----------------------------------------------------===*)
+(* Checks the accessors of the fixture [machine] and adds its analysis passes. *)
+let test_target_machine () =
+  let module TM = TargetMachine in
+  assert_equal (TM.target machine) target;
+  assert_equal (TM.triple machine) (Target.default_triple ());
+  assert_equal (TM.cpu machine) "";  (* fixture passed no cpu to TargetMachine.create *)
+  assert_equal (TM.features machine) "";  (* ... and no features *)
+  ignore (TM.data_layout machine);
+  TM.set_verbose_asm true machine;
+  let pm = PassManager.create () in  (* [pm] is not disposed in this test *)
+  TM.add_analysis_passes pm machine
+
+
+(*===-- Code Emission -----------------------------------------------------===*)
+(* Emits [m] to [filename] and to a memory buffer; a bad path must raise Error. *)
+let test_code_emission () =
+  TargetMachine.emit_to_file m CodeGenFileType.ObjectFile filename machine;
+  begin try  (* delimited so the handler does not absorb the statements below *)
+    TargetMachine.emit_to_file m CodeGenFileType.ObjectFile
+                               "/nonexistent/file" machine;
+    failwith "must raise"  (* reached only if the bad path was accepted *)
+  with Llvm_target.Error _ ->
+    ()
+  end;
+  let buf = TargetMachine.emit_to_memory_buffer m CodeGenFileType.ObjectFile
+                                                machine in
+  Llvm.MemoryBuffer.dispose buf
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+(* Entry point: runs the four tests in order, then disposes of module [m]. *)
+let _ =
+  test_target_data ();
+  test_target ();
+  test_target_machine ();
+  test_code_emission ();
+  dispose_module m

diff --git a/test/Bindings/OCaml/vectorize.ml b/test/Bindings/OCaml/vectorize.ml
new file mode 100644
index 0000000..c5b03b5
--- /dev/null
+++ b/test/Bindings/OCaml/vectorize.ml

@@ -0,0 +1,56 @@
+(* RUN: cp %s %T/vectorize_opts.ml
+ * RUN: %ocamlc -g -warn-error A -package llvm.vectorize -linkpkg %T/vectorize_opts.ml -o %t
+ * RUN: %t %t.bc
+ * RUN: %ocamlopt -g -warn-error A -package llvm.vectorize -linkpkg %T/vectorize_opts.ml -o %t
+ * RUN: %t %t.bc
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_vectorize
+open Llvm_target
+
+let context = global_context ()
+let void_type = Llvm.void_type context  (* void type in [context] *)
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false  (* flip to true to trace suite names on stderr *)
+(* [suite name f] runs [f ()], first printing "name:" on stderr if enabled. *)
+let suite name f =
+  if print_checkpoints then
+    prerr_endline (name ^ ":");
+  f ()  (* not part of the [if]: the suite body always runs *)
+
+
+(*===-- Fixture -----------------------------------------------------------===*)
+(* [filename] is argv.(1) (%t.bc on the RUN lines); it also names module [m]. *)
+let filename = Sys.argv.(1)
+let m = create_module context filename
+
+
+(*===-- Transforms --------------------------------------------------------===*)
+(* Adds the three vectorizer passes to a pass manager and runs it over [m]. *)
+let test_transforms () =
+  let (++) x f = f x; x in  (* apply [f] to [x] for effect, return [x] to chain *)
+
+  let fty = function_type void_type [| |] in
+  let fn = define_function "fn" fty m in
+  ignore (build_ret_void (builder_at_end context (entry_block fn)));
+
+  ignore (PassManager.create ()
+           ++ add_bb_vectorize
+           ++ add_loop_vectorize
+           ++ add_slp_vectorize
+           ++ PassManager.run_module m
+           ++ PassManager.dispose)  (* the pass manager is disposed as the last step *)
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+(* Entry point: runs the transforms suite, then disposes of fixture module [m]. *)
+let _ =
+  suite "transforms" test_transforms;
+  dispose_module m

diff --git a/test/Bindings/Ocaml/analysis.ml b/test/Bindings/Ocaml/analysis.ml
deleted file mode 100644
index c02645c..0000000
--- a/test/Bindings/Ocaml/analysis.ml
+++ /dev/null

@@ -1,54 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_analysis.cmxa %t.builddir/analysis.ml -o %t
- * RUN: %t
- * XFAIL: vg_leak
- *)
-
-open Llvm
-open Llvm_analysis
-
-(* Note that this takes a moment to link, so it's best to keep the number of
-   individual tests low. *)
-
-let context = global_context ()
-
-let test x = if not x then exit 1 else ()
-
-let bomb msg =
-  prerr_endline msg;
-  exit 2
-
-let _ =
-  let fty = function_type (void_type context) [| |] in
-  let m = create_module context "valid_m" in
-  let fn = define_function "valid_fn" fty m in
-  let at_entry = builder_at_end context (entry_block fn) in
-  ignore (build_ret_void at_entry);
-  
-  
-  (* Test that valid constructs verify. *)
-  begin match verify_module m with
-    Some msg -> bomb "valid module failed verification!"
-  | None -> ()
-  end;
-  
-  if not (verify_function fn) then bomb "valid function failed verification!";
-  
-  
-  (* Test that invalid constructs do not verify.
-     A basic block can contain only one terminator instruction. *)
-  ignore (build_ret_void at_entry);
-  
-  begin match verify_module m with
-    Some msg -> ()
-  | None -> bomb "invalid module passed verification!"
-  end;
-  
-  if verify_function fn then bomb "invalid function passed verification!";
-  
-  
-  dispose_module m
-  
-  (* Don't bother to test assert_valid_{module,function}. *)

diff --git a/test/Bindings/Ocaml/bitreader.ml b/test/Bindings/Ocaml/bitreader.ml
deleted file mode 100644
index f1d202a..0000000
--- a/test/Bindings/Ocaml/bitreader.ml
+++ /dev/null

@@ -1,79 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_bitreader.cmxa llvm_bitwriter.cmxa %t.builddir/bitreader.ml -o %t
- * RUN: %t %t.bc
- * RUN: llvm-dis < %t.bc
- * XFAIL: vg_leak
- *)
-
-(* Note that this takes a moment to link, so it's best to keep the number of
-   individual tests low. *)
-
-let context = Llvm.global_context ()
-
-let test x = if not x then exit 1 else ()
-
-let _ =
-  let fn = Sys.argv.(1) in
-  let m = Llvm.create_module context "ocaml_test_module" in
-  
-  test (Llvm_bitwriter.write_bitcode_file m fn);
-  
-  Llvm.dispose_module m;
-  
-  (* parse_bitcode *)
-  begin
-    let mb = Llvm.MemoryBuffer.of_file fn in
-    begin try
-      let m = Llvm_bitreader.parse_bitcode context mb in
-      Llvm.dispose_module m
-    with x ->
-      Llvm.MemoryBuffer.dispose mb;
-      raise x
-    end
-  end;
-  
-  (* MemoryBuffer.of_file *)
-  test begin try
-    let mb = Llvm.MemoryBuffer.of_file (fn ^ ".bogus") in
-    Llvm.MemoryBuffer.dispose mb;
-    false
-  with Llvm.IoError _ ->
-    true
-  end;
-  
-  (* get_module *)
-  begin
-    let mb = Llvm.MemoryBuffer.of_file fn in
-    let m = begin try
-      Llvm_bitreader.get_module context mb
-    with x ->
-      Llvm.MemoryBuffer.dispose mb;
-      raise x
-    end in
-    Llvm.dispose_module m
-  end;
-  
-  (* corrupt the bitcode *)
-  let fn = fn ^ ".txt" in
-  begin let oc = open_out fn in
-    output_string oc "not a bitcode file\n";
-    close_out oc
-  end;
-  
-  (* test get_module exceptions *)
-  test begin
-    try
-      let mb = Llvm.MemoryBuffer.of_file fn in
-      let m = begin try
-        Llvm_bitreader.get_module context mb
-      with x ->
-        Llvm.MemoryBuffer.dispose mb;
-        raise x
-      end in
-      Llvm.dispose_module m;
-      false
-    with Llvm_bitreader.Error _ ->
-      true
-  end

diff --git a/test/Bindings/Ocaml/bitwriter.ml b/test/Bindings/Ocaml/bitwriter.ml
deleted file mode 100644
index ae456cf..0000000
--- a/test/Bindings/Ocaml/bitwriter.ml
+++ /dev/null

@@ -1,48 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A unix.cmxa llvm.cmxa llvm_bitwriter.cmxa %t.builddir/bitwriter.ml -o %t
- * RUN: %t %t.bc
- * RUN: llvm-dis < %t.bc
- * XFAIL: vg_leak
- *)
-
-(* Note that this takes a moment to link, so it's best to keep the number of
-   individual tests low. *)
-
-let context = Llvm.global_context ()
-
-let test x = if not x then exit 1 else ()
-
-let read_file name =
-  let ic = open_in_bin name in
-  let len = in_channel_length ic in
-  let buf = String.create len in
-
-  test ((input ic buf 0 len) = len);
-
-  close_in ic;
-
-  buf
-
-let temp_bitcode ?unbuffered m =
-  let temp_name, temp_oc = Filename.open_temp_file ~mode:[Open_binary] "" "" in
-
-  test (Llvm_bitwriter.output_bitcode ?unbuffered temp_oc m);
-  flush temp_oc;
-
-  let temp_buf = read_file temp_name in
-
-  close_out temp_oc;
-
-  temp_buf
-
-let _ =
-  let m = Llvm.create_module context "ocaml_test_module" in
-  
-  test (Llvm_bitwriter.write_bitcode_file m Sys.argv.(1));
-  let file_buf = read_file Sys.argv.(1) in
-
-  test (file_buf = temp_bitcode m);
-  test (file_buf = temp_bitcode ~unbuffered:false m);
-  test (file_buf = temp_bitcode ~unbuffered:true m)

diff --git a/test/Bindings/Ocaml/executionengine.ml b/test/Bindings/Ocaml/executionengine.ml
deleted file mode 100644
index 8e24949..0000000
--- a/test/Bindings/Ocaml/executionengine.ml
+++ /dev/null

@@ -1,118 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_target.cmxa llvm_executionengine.cmxa %t.builddir/executionengine.ml -o %t
- * RUN: %t
- * XFAIL: vg_leak
- *)
-
-open Llvm
-open Llvm_executionengine
-open Llvm_target
-
-(* Note that this takes a moment to link, so it's best to keep the number of
-   individual tests low. *)
-
-let context = global_context ()
-let i8_type = Llvm.i8_type context
-let i32_type = Llvm.i32_type context
-let i64_type = Llvm.i64_type context
-let double_type = Llvm.double_type context
-
-let bomb msg =
-  prerr_endline msg;
-  exit 2
-
-let define_main_fn m retval =
-  let fn =
-    let str_arr_type = pointer_type (pointer_type i8_type) in
-    define_function "main" (function_type i32_type [| i32_type;
-                                                      str_arr_type;
-                                                      str_arr_type |]) m in
-  let b = builder_at_end (global_context ()) (entry_block fn) in
-  ignore (build_ret (const_int i32_type retval) b);
-  fn
-
-let define_plus m =
-  let fn = define_function "plus" (function_type i32_type [| i32_type;
-                                                             i32_type |]) m in
-  let b = builder_at_end (global_context ()) (entry_block fn) in
-  let add = build_add (param fn 0) (param fn 1) "sum" b in
-  ignore (build_ret add b)
-
-let test_genericvalue () =
-  let tu = (1, 2) in
-  let ptrgv = GenericValue.of_pointer tu in
-  assert (tu = GenericValue.as_pointer ptrgv);
-  
-  let fpgv = GenericValue.of_float double_type 2. in
-  assert (2. = GenericValue.as_float double_type fpgv);
-  
-  let intgv = GenericValue.of_int i32_type 3 in
-  assert (3  = GenericValue.as_int intgv);
-  
-  let i32gv = GenericValue.of_int32 i32_type (Int32.of_int 4) in
-  assert ((Int32.of_int 4) = GenericValue.as_int32 i32gv);
-  
-  let nigv = GenericValue.of_nativeint i32_type (Nativeint.of_int 5) in
-  assert ((Nativeint.of_int 5) = GenericValue.as_nativeint nigv);
-  
-  let i64gv = GenericValue.of_int64 i64_type (Int64.of_int 6) in
-  assert ((Int64.of_int 6) = GenericValue.as_int64 i64gv)
-
-let test_executionengine () =
-  (* create *)
-  let m = create_module (global_context ()) "test_module" in
-  let main = define_main_fn m 42 in
-  
-  let m2 = create_module (global_context ()) "test_module2" in
-  define_plus m2;
-  
-  let ee = ExecutionEngine.create m in
-  ExecutionEngine.add_module m2 ee;
-  
-  (* run_static_ctors *)
-  ExecutionEngine.run_static_ctors ee;
-  
-  (* run_function_as_main *)
-  let res = ExecutionEngine.run_function_as_main main [|"test"|] [||] ee in
-  if 42 != res then bomb "main did not return 42";
-  
-  (* free_machine_code *)
-  ExecutionEngine.free_machine_code main ee;
-  
-  (* find_function *)
-  match ExecutionEngine.find_function "dne" ee with
-  | Some _ -> raise (Failure "find_function 'dne' failed")
-  | None ->
-  
-  match ExecutionEngine.find_function "plus" ee with
-  | None -> raise (Failure "find_function 'plus' failed")
-  | Some plus ->
-  
-  (* run_function *)
-  let res = ExecutionEngine.run_function plus
-                                         [| GenericValue.of_int i32_type 2;
-                                            GenericValue.of_int i32_type 2 |]
-                                         ee in
-  if 4 != GenericValue.as_int res then bomb "plus did not work";
-  
-  (* remove_module *)
-  Llvm.dispose_module (ExecutionEngine.remove_module m2 ee);
-  
-  (* run_static_dtors *)
-  ExecutionEngine.run_static_dtors ee;
-
-  (* Show that the data layout binding links and runs.*)
-  let dl = ExecutionEngine.data_layout ee in
-
-  (* Demonstrate that a garbage pointer wasn't returned. *)
-  let ty = DataLayout.intptr_type context dl in
-  if ty != i32_type && ty != i64_type then bomb "target_data did not work";
-  
-  (* dispose *)
-  ExecutionEngine.dispose ee
-
-let _ =
-  test_genericvalue ();
-  test_executionengine ()

diff --git a/test/Bindings/Ocaml/ext_exc.ml b/test/Bindings/Ocaml/ext_exc.ml
deleted file mode 100644
index 9afc3c3..0000000
--- a/test/Bindings/Ocaml/ext_exc.ml
+++ /dev/null

@@ -1,20 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_bitreader.cmxa llvm_executionengine.cmxa %t.builddir/ext_exc.ml -o %t
- * RUN: %t </dev/null
- * XFAIL: vg_leak
- *)
-let context = Llvm.global_context ()
-(* this used to crash, we must not use 'external' in .mli files, but 'val' if we
- * want the let _ bindings executed, see http://caml.inria.fr/mantis/view.php?id=4166 *)
-let _ =
-    try
-        ignore (Llvm_bitreader.get_module context (Llvm.MemoryBuffer.of_stdin ()))
-    with
-    Llvm_bitreader.Error _ -> ();;
-let _ =
-    try
-        ignore (Llvm.MemoryBuffer.of_file "/path/to/nonexistent/file")
-    with
-    Llvm.IoError _ -> ();;

diff --git a/test/Bindings/Ocaml/ipo_opts.ml b/test/Bindings/Ocaml/ipo_opts.ml
deleted file mode 100644
index e0bcbe5..0000000
--- a/test/Bindings/Ocaml/ipo_opts.ml
+++ /dev/null

@@ -1,72 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_ipo.cmxa llvm_target.cmxa %t.builddir/ipo_opts.ml -o %t
- * RUN: %t %t.bc
- * XFAIL: vg_leak
- *)
-
-(* Note: It takes several seconds for ocamlopt to link an executable with
-         libLLVMCore.a, so it's better to write a big test than a bunch of
-         little ones. *)
-
-open Llvm
-open Llvm_ipo
-open Llvm_target
-
-let context = global_context ()
-let void_type = Llvm.void_type context
-let i8_type = Llvm.i8_type context
-
-(* Tiny unit test framework - really just to help find which line is busted *)
-let print_checkpoints = false
-
-let suite name f =
-  if print_checkpoints then
-    prerr_endline (name ^ ":");
-  f ()
-
-
-(*===-- Fixture -----------------------------------------------------------===*)
-
-let filename = Sys.argv.(1)
-let m = create_module context filename
-
-
-(*===-- Transforms --------------------------------------------------------===*)
-
-let test_transforms () =
-  let (++) x f = ignore (f x); x in
-
-  let fty = function_type i8_type [| |] in
-  let fn = define_function "fn" fty m in
-  let fn2 = define_function "fn2" fty m in begin
-      ignore (build_ret (const_int i8_type 4) (builder_at_end context (entry_block fn)));
-      let b = builder_at_end context  (entry_block fn2) in
-      ignore (build_ret (build_call fn [| |] "" b) b);
-  end;
-
-  ignore (PassManager.create ()
-           ++ add_argument_promotion
-           ++ add_constant_merge
-           ++ add_dead_arg_elimination
-           ++ add_function_attrs
-           ++ add_function_inlining
-           ++ add_always_inliner
-           ++ add_global_dce
-           ++ add_global_optimizer
-           ++ add_ipc_propagation
-           ++ add_prune_eh
-           ++ add_ipsccp
-           ++ add_internalize
-           ++ add_strip_dead_prototypes
-           ++ add_strip_symbols
-           ++ PassManager.run_module m
-           ++ PassManager.dispose)
-
-
-(*===-- Driver ------------------------------------------------------------===*)
-
-let _ =
-  suite "transforms" test_transforms;
-  dispose_module m

diff --git a/test/Bindings/Ocaml/irreader.ml b/test/Bindings/Ocaml/irreader.ml
deleted file mode 100644
index 3511c2b..0000000
--- a/test/Bindings/Ocaml/irreader.ml
+++ /dev/null

@@ -1,59 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -g -warn-error A llvm.cmxa llvm_irreader.cmxa %t.builddir/irreader.ml -o %t
- * RUN: %t
- * XFAIL: vg_leak
- *)
-
-(* Note: It takes several seconds for ocamlopt to link an executable with
-         libLLVMCore.a, so it's better to write a big test than a bunch of
-         little ones. *)
-
-open Llvm
-open Llvm_irreader
-
-let context = global_context ()
-
-(* Tiny unit test framework - really just to help find which line is busted *)
-let print_checkpoints = false
-
-let suite name f =
-  if print_checkpoints then
-    prerr_endline (name ^ ":");
-  f ()
-
-let _ =
-  Printexc.record_backtrace true
-
-let insist cond =
-  if not cond then failwith "insist"
-
-
-(*===-- IR Reader ---------------------------------------------------------===*)
-
-let test_irreader () =
-  begin
-    let buf = MemoryBuffer.of_string "@foo = global i32 42" in
-    let m   = parse_ir context buf in
-    match lookup_global "foo" m with
-    | Some foo ->
-        insist ((global_initializer foo) = (const_int (i32_type context) 42))
-    | None ->
-        failwith "global"
-  end;
-
-  begin
-    let buf = MemoryBuffer.of_string "@foo = global garble" in
-    try
-      ignore (parse_ir context buf);
-      failwith "parsed"
-    with Llvm_irreader.Error _ ->
-      ()
-  end
-
-
-(*===-- Driver ------------------------------------------------------------===*)
-
-let _ =
-  suite "irreader" test_irreader

diff --git a/test/Bindings/Ocaml/linker.ml b/test/Bindings/Ocaml/linker.ml
deleted file mode 100644
index 9359ae9..0000000
--- a/test/Bindings/Ocaml/linker.ml
+++ /dev/null

@@ -1,63 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_linker.cmxa %t.builddir/linker.ml -o %t
- * RUN: %t
- * XFAIL: vg_leak
- *)
-
-(* Note: It takes several seconds for ocamlopt to link an executable with
-         libLLVMCore.a, so it's better to write a big test than a bunch of
-         little ones. *)
-
-open Llvm
-open Llvm_linker
-
-let context = global_context ()
-let void_type = Llvm.void_type context
-
-(* Tiny unit test framework - really just to help find which line is busted *)
-let print_checkpoints = false
-
-let suite name f =
-  if print_checkpoints then
-    prerr_endline (name ^ ":");
-  f ()
-
-
-(*===-- Linker -----------------------------------------------------------===*)
-
-let test_linker () =
-  let fty = function_type void_type [| |] in
-
-  let make_module name =
-    let m = create_module context name in
-    let fn = define_function ("fn_" ^ name) fty m in
-    ignore (build_ret_void (builder_at_end context (entry_block fn)));
-    m
-  in
-
-  let m1 = make_module "one"
-  and m2 = make_module "two" in
-  link_modules m1 m2 Mode.PreserveSource;
-  dispose_module m1;
-  dispose_module m2;
-
-  let m1 = make_module "one"
-  and m2 = make_module "two" in
-  link_modules m1 m2 Mode.DestroySource;
-  dispose_module m1;
-
-  let m1 = make_module "one"
-  and m2 = make_module "one" in
-  try
-    link_modules m1 m2 Mode.PreserveSource;
-    failwith "must raise"
-  with Error _ ->
-    dispose_module m1;
-    dispose_module m2
-
-(*===-- Driver ------------------------------------------------------------===*)
-
-let _ =
-  suite "linker" test_linker

diff --git a/test/Bindings/Ocaml/lit.local.cfg b/test/Bindings/Ocaml/lit.local.cfg
deleted file mode 100644
index c38d89a..0000000
--- a/test/Bindings/Ocaml/lit.local.cfg
+++ /dev/null

@@ -1,5 +0,0 @@
-config.suffixes = ['.ml']
-
-bindings = set([s.strip() for s in config.root.llvm_bindings.split(',')])
-if not 'ocaml' in bindings:
-    config.unsupported = True

diff --git a/test/Bindings/Ocaml/passmgr_builder.ml b/test/Bindings/Ocaml/passmgr_builder.ml
deleted file mode 100644
index 1a3102f..0000000
--- a/test/Bindings/Ocaml/passmgr_builder.ml
+++ /dev/null

@@ -1,64 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_passmgr_builder.cmxa %t.builddir/passmgr_builder.ml -o %t
- * RUN: %t %t.bc
- * XFAIL: vg_leak
- *)
-
-(* Note: It takes several seconds for ocamlopt to link an executable with
-         libLLVMCore.a, so it's better to write a big test than a bunch of
-         little ones. *)
-
-open Llvm
-open Llvm_passmgr_builder
-
-let context = global_context ()
-let void_type = Llvm.void_type context
-
-(* Tiny unit test framework - really just to help find which line is busted *)
-let print_checkpoints = false
-
-let suite name f =
-  if print_checkpoints then
-    prerr_endline (name ^ ":");
-  f ()
-
-
-(*===-- Fixture -----------------------------------------------------------===*)
-
-let filename = Sys.argv.(1)
-let m = create_module context filename
-
-
-(*===-- Pass Manager Builder ----------------------------------------------===*)
-
-let test_pmbuilder () =
-  let (++) x f = ignore (f x); x in
-
-  let module_passmgr = PassManager.create () in
-  let func_passmgr   = PassManager.create_function m in
-  let lto_passmgr    = PassManager.create () in
-
-  ignore (Llvm_passmgr_builder.create ()
-           ++ set_opt_level 3
-           ++ set_size_level 1
-           ++ set_disable_unit_at_a_time false
-           ++ set_disable_unroll_loops false
-           ++ use_inliner_with_threshold 10
-           ++ populate_function_pass_manager func_passmgr
-           ++ populate_module_pass_manager module_passmgr
-           ++ populate_lto_pass_manager lto_passmgr
-                  ~internalize:false ~run_inliner:false);
-  Gc.compact ();
-
-  PassManager.dispose module_passmgr;
-  PassManager.dispose func_passmgr;
-  PassManager.dispose lto_passmgr
-
-
-(*===-- Driver ------------------------------------------------------------===*)
-
-let _ =
-  suite "pass manager builder" test_pmbuilder;
-  dispose_module m

diff --git a/test/Bindings/Ocaml/scalar_opts.ml b/test/Bindings/Ocaml/scalar_opts.ml
deleted file mode 100644
index 39913e4..0000000
--- a/test/Bindings/Ocaml/scalar_opts.ml
+++ /dev/null

@@ -1,87 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_scalar_opts.cmxa llvm_target.cmxa %t.builddir/scalar_opts.ml -o %t
- * RUN: %t %t.bc
- * XFAIL: vg_leak
- *)
-
-(* Note: It takes several seconds for ocamlopt to link an executable with
-         libLLVMCore.a, so it's better to write a big test than a bunch of
-         little ones. *)
-
-open Llvm
-open Llvm_scalar_opts
-open Llvm_target
-
-let context = global_context ()
-let void_type = Llvm.void_type context
-
-(* Tiny unit test framework - really just to help find which line is busted *)
-let print_checkpoints = false
-
-let suite name f =
-  if print_checkpoints then
-    prerr_endline (name ^ ":");
-  f ()
-
-
-(*===-- Fixture -----------------------------------------------------------===*)
-
-let filename = Sys.argv.(1)
-let m = create_module context filename
-
-
-(*===-- Transforms --------------------------------------------------------===*)
-
-let test_transforms () =
-  let (++) x f = ignore (f x); x in
-
-  let fty = function_type void_type [| |] in
-  let fn = define_function "fn" fty m in
-  ignore (build_ret_void (builder_at_end context (entry_block fn)));
-  
-  ignore (PassManager.create_function m
-           ++ add_verifier
-           ++ add_constant_propagation
-           ++ add_sccp
-           ++ add_dead_store_elimination
-           ++ add_aggressive_dce
-           ++ add_scalar_repl_aggregation
-           ++ add_scalar_repl_aggregation_ssa
-           ++ add_scalar_repl_aggregation_with_threshold 4
-           ++ add_ind_var_simplification
-           ++ add_instruction_combination
-           ++ add_licm
-           ++ add_loop_unswitch
-           ++ add_loop_unroll
-           ++ add_loop_rotation
-           ++ add_memory_to_register_promotion
-           ++ add_memory_to_register_demotion
-           ++ add_reassociation
-           ++ add_jump_threading
-           ++ add_cfg_simplification
-           ++ add_tail_call_elimination
-           ++ add_gvn
-           ++ add_memcpy_opt
-           ++ add_loop_deletion
-           ++ add_loop_idiom
-           ++ add_lib_call_simplification
-           ++ add_correlated_value_propagation
-           ++ add_early_cse
-           ++ add_lower_expect_intrinsic
-           ++ add_type_based_alias_analysis
-           ++ add_basic_alias_analysis
-           ++ add_partially_inline_lib_calls
-           ++ add_verifier
-           ++ PassManager.initialize
-           ++ PassManager.run_function fn
-           ++ PassManager.finalize
-           ++ PassManager.dispose)
-
-
-(*===-- Driver ------------------------------------------------------------===*)
-
-let _ =
-  suite "transforms" test_transforms;
-  dispose_module m

diff --git a/test/Bindings/Ocaml/target.ml b/test/Bindings/Ocaml/target.ml
deleted file mode 100644
index 0a2283a..0000000
--- a/test/Bindings/Ocaml/target.ml
+++ /dev/null

@@ -1,116 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -g -warn-error A llvm.cmxa llvm_target.cmxa llvm_executionengine.cmxa %t.builddir/target.ml -o %t
- * RUN: %t %t.bc
- * REQUIRES: native, object-emission
- * XFAIL: vg_leak
- *)
-
-(* Note: It takes several seconds for ocamlopt to link an executable with
-         libLLVMCore.a, so it's better to write a big test than a bunch of
-         little ones. *)
-
-open Llvm
-open Llvm_target
-
-let _ = Llvm_executionengine.initialize_native_target ()
-
-let context = global_context ()
-let i32_type = Llvm.i32_type context
-let i64_type = Llvm.i64_type context
-
-(* Tiny unit test framework - really just to help find which line is busted *)
-let print_checkpoints = false
-
-let _ =
-  Printexc.record_backtrace true
-
-let assert_equal a b =
-  if a <> b then failwith "assert_equal"
-
-
-(*===-- Fixture -----------------------------------------------------------===*)
-
-let filename = Sys.argv.(1)
-let m = create_module context filename
-
-let target = Target.by_triple (Target.default_triple ())
-
-let machine = TargetMachine.create (Target.default_triple ()) target
-
-(*===-- Data Layout -------------------------------------------------------===*)
-
-let test_target_data () =
-  let module DL = DataLayout in
-  let layout = "e-p:32:32-f64:32:64-v64:32:64-v128:32:128-n32-S32" in
-  let dl     = DL.of_string layout in
-  let sty    = struct_type context [| i32_type; i64_type |] in
-
-  assert_equal (DL.as_string dl) layout;
-  assert_equal (DL.byte_order dl) Endian.Little;
-  assert_equal (DL.pointer_size dl) 4;
-  assert_equal (DL.intptr_type context dl) i32_type;
-  assert_equal (DL.qualified_pointer_size 0 dl) 4;
-  assert_equal (DL.qualified_intptr_type context 0 dl) i32_type;
-  assert_equal (DL.size_in_bits sty dl) (Int64.of_int 96);
-  assert_equal (DL.store_size sty dl) (Int64.of_int 12);
-  assert_equal (DL.abi_size sty dl) (Int64.of_int 12);
-  assert_equal (DL.stack_align sty dl) 4;
-  assert_equal (DL.preferred_align sty dl) 8;
-  assert_equal (DL.preferred_align_of_global (declare_global sty "g" m) dl) 8;
-  assert_equal (DL.element_at_offset sty (Int64.of_int 1) dl) 0;
-  assert_equal (DL.offset_of_element sty 1 dl) (Int64.of_int 4);
-
-  let pm = PassManager.create () in
-  ignore (DL.add_to_pass_manager pm dl)
-
-
-(*===-- Target ------------------------------------------------------------===*)
-
-let test_target () =
-  let module T = Target in
-  ignore (T.succ target);
-  ignore (T.name target);
-  ignore (T.description target);
-  ignore (T.has_jit target);
-  ignore (T.has_target_machine target);
-  ignore (T.has_asm_backend target)
-
-
-(*===-- Target Machine ----------------------------------------------------===*)
-
-let test_target_machine () =
-  let module TM = TargetMachine in
-  assert_equal (TM.target machine) target;
-  assert_equal (TM.triple machine) (Target.default_triple ());
-  assert_equal (TM.cpu machine) "";
-  assert_equal (TM.features machine) "";
-  ignore (TM.data_layout machine);
-  TM.set_verbose_asm true machine
-
-
-(*===-- Code Emission -----------------------------------------------------===*)
-
-let test_code_emission () =
-  TargetMachine.emit_to_file m CodeGenFileType.ObjectFile filename machine;
-  try
-    TargetMachine.emit_to_file m CodeGenFileType.ObjectFile
-                               "/nonexistent/file" machine;
-    failwith "must raise"
-  with Llvm_target.Error _ ->
-    ();
-
-  let buf = TargetMachine.emit_to_memory_buffer m CodeGenFileType.ObjectFile
-                                                machine in
-  Llvm.MemoryBuffer.dispose buf
-
-
-(*===-- Driver ------------------------------------------------------------===*)
-
-let _ =
-  test_target_data ();
-  test_target ();
-  test_target_machine ();
-  (* test_code_emission (); *) (* broken without AsmParser support *)
-  dispose_module m

diff --git a/test/Bindings/Ocaml/vectorize_opts.ml b/test/Bindings/Ocaml/vectorize_opts.ml
deleted file mode 100644
index 5ef985d..0000000
--- a/test/Bindings/Ocaml/vectorize_opts.ml
+++ /dev/null

@@ -1,56 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_vectorize.cmxa llvm_target.cmxa %t.builddir/vectorize_opts.ml -o %t
- * RUN: %t %t.bc
- * XFAIL: vg_leak
- *)
-
-(* Note: It takes several seconds for ocamlopt to link an executable with
-         libLLVMCore.a, so it's better to write a big test than a bunch of
-         little ones. *)
-
-open Llvm
-open Llvm_vectorize
-open Llvm_target
-
-let context = global_context ()
-let void_type = Llvm.void_type context
-
-(* Tiny unit test framework - really just to help find which line is busted *)
-let print_checkpoints = false
-
-let suite name f =
-  if print_checkpoints then
-    prerr_endline (name ^ ":");
-  f ()
-
-
-(*===-- Fixture -----------------------------------------------------------===*)
-
-let filename = Sys.argv.(1)
-let m = create_module context filename
-
-
-(*===-- Transforms --------------------------------------------------------===*)
-
-let test_transforms () =
-  let (++) x f = ignore (f x); x in
-
-  let fty = function_type void_type [| |] in
-  let fn = define_function "fn" fty m in
-  ignore (build_ret_void (builder_at_end context (entry_block fn)));
-
-  ignore (PassManager.create ()
-           ++ add_bb_vectorize
-           ++ add_loop_vectorize
-           ++ add_slp_vectorize
-           ++ PassManager.run_module m
-           ++ PassManager.dispose)
-
-
-(*===-- Driver ------------------------------------------------------------===*)
-
-let _ =
-  suite "transforms" test_transforms;
-  dispose_module m

diff --git a/test/Bindings/Ocaml/vmcore.ml b/test/Bindings/Ocaml/vmcore.ml
deleted file mode 100644
index f014116..0000000
--- a/test/Bindings/Ocaml/vmcore.ml
+++ /dev/null

@@ -1,1421 +0,0 @@
-(* RUN: rm -rf %t.builddir
- * RUN: mkdir -p %t.builddir
- * RUN: cp %s %t.builddir
- * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_analysis.cmxa llvm_bitwriter.cmxa %t.builddir/vmcore.ml -o %t
- * RUN: %t %t.bc
- * RUN: llvm-dis < %t.bc > %t.ll
- * RUN: FileCheck %s < %t.ll
- * Do a second pass for things that shouldn't be anywhere.
- * RUN: FileCheck -check-prefix=CHECK-NOWHERE %s < %t.ll
- * XFAIL: vg_leak
- *)
-
-(* Note: It takes several seconds for ocamlopt to link an executable with
-         libLLVMCore.a, so it's better to write a big test than a bunch of
-         little ones. *)
-
-open Llvm
-open Llvm_bitwriter
-
-
-(* Tiny unit test framework - really just to help find which line is busted *)
-let exit_status = ref 0
-let suite_name = ref ""
-let group_name = ref ""
-let case_num = ref 0
-let print_checkpoints = false
-let context = global_context ()
-let i1_type = Llvm.i1_type context
-let i8_type = Llvm.i8_type context
-let i16_type = Llvm.i16_type context
-let i32_type = Llvm.i32_type context
-let i64_type = Llvm.i64_type context
-let void_type = Llvm.void_type context
-let float_type = Llvm.float_type context
-let double_type = Llvm.double_type context
-let fp128_type = Llvm.fp128_type context
-
-let group name =
-  group_name := !suite_name ^ "/" ^ name;
-  case_num := 0;
-  if print_checkpoints then
-    prerr_endline ("  " ^ name ^ "...")
-
-let insist cond =
-  incr case_num;
-  if not cond then
-    exit_status := 10;
-  match print_checkpoints, cond with
-  | false, true -> ()
-  | false, false ->
-      prerr_endline ("FAILED: " ^ !suite_name ^ "/" ^ !group_name ^ " #" ^ (string_of_int !case_num))
-  | true, true ->
-      prerr_endline ("    " ^ (string_of_int !case_num))
-  | true, false ->
-      prerr_endline ("    " ^ (string_of_int !case_num) ^ " FAIL")
-
-let suite name f =
-  suite_name := name;
-  if print_checkpoints then
-    prerr_endline (name ^ ":");
-  f ()
-
-
-(*===-- Fixture -----------------------------------------------------------===*)
-
-let filename = Sys.argv.(1)
-let m = create_module context filename
-
-
-(*===-- Conversion --------------------------------------------------------===*)
-
-let test_conversion () =
-  insist ("i32" = (string_of_lltype i32_type));
-  let c = const_int i32_type 42 in
-  insist ("i32 42" = (string_of_llvalue c))
-
-
-(*===-- Target ------------------------------------------------------------===*)
-
-let test_target () =
-  begin group "triple";
-    let trip = "i686-apple-darwin8" in
-    set_target_triple trip m;
-    insist (trip = target_triple m)
-  end;
-  
-  begin group "layout";
-    let layout = "e" in
-    set_data_layout layout m;
-    insist (layout = data_layout m)
-  end
-  (* CHECK: target datalayout = "e"
-   * CHECK: target triple = "i686-apple-darwin8"
-   *)
-
-
-(*===-- Constants ---------------------------------------------------------===*)
-
-let test_constants () =
-  (* CHECK: const_int{{.*}}i32{{.*}}-1
-   *)
-  group "int";
-  let c = const_int i32_type (-1) in
-  ignore (define_global "const_int" c m);
-  insist (i32_type = type_of c);
-  insist (is_constant c);
-
-  (* CHECK: const_sext_int{{.*}}i64{{.*}}-1
-   *)
-  group "sext int";
-  let c = const_int i64_type (-1) in
-  ignore (define_global "const_sext_int" c m);
-  insist (i64_type = type_of c);
-
-  (* CHECK: const_zext_int64{{.*}}i64{{.*}}4294967295
-   *)
-  group "zext int64";
-  let c = const_of_int64 i64_type (Int64.of_string "4294967295") false in
-  ignore (define_global "const_zext_int64" c m);
-  insist (i64_type = type_of c);
-
-  (* CHECK: const_int_string{{.*}}i32{{.*}}-1
-   *)
-  group "int string";
-  let c = const_int_of_string i32_type "-1" 10 in
-  ignore (define_global "const_int_string" c m);
-  insist (i32_type = type_of c);
-
-  (* CHECK: @const_string = global {{.*}}c"cruel\00world"
-   *)
-  group "string";
-  let c = const_string context "cruel\000world" in
-  ignore (define_global "const_string" c m);
-  insist ((array_type i8_type 11) = type_of c);
-
-  (* CHECK: const_stringz{{.*}}"hi\00again\00"
-   *)
-  group "stringz";
-  let c = const_stringz context "hi\000again" in
-  ignore (define_global "const_stringz" c m);
-  insist ((array_type i8_type 9) = type_of c);
-
-  (* CHECK: const_single{{.*}}2.75
-   * CHECK: const_double{{.*}}3.1459
-   * CHECK: const_double_string{{.*}}1.25
-   *)
-  begin group "real";
-    let cs = const_float float_type 2.75 in
-    ignore (define_global "const_single" cs m);
-    insist (float_type = type_of cs);
-    
-    let cd = const_float double_type 3.1459 in
-    ignore (define_global "const_double" cd m);
-    insist (double_type = type_of cd);
-
-    let cd = const_float_of_string double_type "1.25" in
-    ignore (define_global "const_double_string" cd m);
-    insist (double_type = type_of cd)
-  end;
-  
-  let one = const_int i16_type 1 in
-  let two = const_int i16_type 2 in
-  let three = const_int i32_type 3 in
-  let four = const_int i32_type 4 in
-  
-  (* CHECK: const_array{{.*}}[i32 3, i32 4]
-   *)
-  group "array";
-  let c = const_array i32_type [| three; four |] in
-  ignore (define_global "const_array" c m);
-  insist ((array_type i32_type 2) = (type_of c));
-  
-  (* CHECK: const_vector{{.*}}<i16 1, i16 2{{.*}}>
-   *)
-  group "vector";
-  let c = const_vector [| one; two; one; two;
-                          one; two; one; two |] in
-  ignore (define_global "const_vector" c m);
-  insist ((vector_type i16_type 8) = (type_of c));
-
-  (* CHECK: const_structure{{.*.}}i16 1, i16 2, i32 3, i32 4
-   *)
-  group "structure";
-  let c = const_struct context [| one; two; three; four |] in
-  ignore (define_global "const_structure" c m);
-  insist ((struct_type context [| i16_type; i16_type; i32_type; i32_type |])
-        = (type_of c));
-
-  (* CHECK: const_null{{.*}}zeroinit
-   *)
-  group "null";
-  let c = const_null (packed_struct_type context [| i1_type; i8_type; i64_type;
-                                                    double_type |]) in
-  ignore (define_global "const_null" c m);
-  
-  (* CHECK: const_all_ones{{.*}}-1
-   *)
-  group "all ones";
-  let c = const_all_ones i64_type in
-  ignore (define_global "const_all_ones" c m);
-
-  group "pointer null"; begin
-    (* CHECK: const_pointer_null = global i64* null
-     *)
-    let c = const_pointer_null (pointer_type i64_type) in
-    ignore (define_global "const_pointer_null" c m);
-  end;
-  
-  (* CHECK: const_undef{{.*}}undef
-   *)
-  group "undef";
-  let c = undef i1_type in
-  ignore (define_global "const_undef" c m);
-  insist (i1_type = type_of c);
-  insist (is_undef c);
-  
-  group "constant arithmetic";
-  (* CHECK: @const_neg = global i64 sub
-   * CHECK: @const_nsw_neg = global i64 sub nsw
-   * CHECK: @const_nuw_neg = global i64 sub nuw
-   * CHECK: @const_fneg = global double fsub
-   * CHECK: @const_not = global i64 xor
-   * CHECK: @const_add = global i64 add
-   * CHECK: @const_nsw_add = global i64 add nsw
-   * CHECK: @const_nuw_add = global i64 add nuw
-   * CHECK: @const_fadd = global double fadd
-   * CHECK: @const_sub = global i64 sub
-   * CHECK: @const_nsw_sub = global i64 sub nsw
-   * CHECK: @const_nuw_sub = global i64 sub nuw
-   * CHECK: @const_fsub = global double fsub
-   * CHECK: @const_mul = global i64 mul
-   * CHECK: @const_nsw_mul = global i64 mul nsw
-   * CHECK: @const_nuw_mul = global i64 mul nuw
-   * CHECK: @const_fmul = global double fmul
-   * CHECK: @const_udiv = global i64 udiv
-   * CHECK: @const_sdiv = global i64 sdiv
-   * CHECK: @const_exact_sdiv = global i64 sdiv exact
-   * CHECK: @const_fdiv = global double fdiv
-   * CHECK: @const_urem = global i64 urem
-   * CHECK: @const_srem = global i64 srem
-   * CHECK: @const_frem = global double frem
-   * CHECK: @const_and = global i64 and
-   * CHECK: @const_or = global i64 or
-   * CHECK: @const_xor = global i64 xor
-   * CHECK: @const_icmp = global i1 icmp sle
-   * CHECK: @const_fcmp = global i1 fcmp ole
-   *)
-  let void_ptr = pointer_type i8_type in
-  let five = const_int i64_type 5 in
-  let ffive = const_uitofp five double_type in
-  let foldbomb_gv = define_global "FoldBomb" (const_null i8_type) m in
-  let foldbomb = const_ptrtoint foldbomb_gv i64_type in
-  let ffoldbomb = const_uitofp foldbomb double_type in
-  ignore (define_global "const_neg" (const_neg foldbomb) m);
-  ignore (define_global "const_nsw_neg" (const_nsw_neg foldbomb) m);
-  ignore (define_global "const_nuw_neg" (const_nuw_neg foldbomb) m);
-  ignore (define_global "const_fneg" (const_fneg ffoldbomb) m);
-  ignore (define_global "const_not" (const_not foldbomb) m);
-  ignore (define_global "const_add" (const_add foldbomb five) m);
-  ignore (define_global "const_nsw_add" (const_nsw_add foldbomb five) m);
-  ignore (define_global "const_nuw_add" (const_nuw_add foldbomb five) m);
-  ignore (define_global "const_fadd" (const_fadd ffoldbomb ffive) m);
-  ignore (define_global "const_sub" (const_sub foldbomb five) m);
-  ignore (define_global "const_nsw_sub" (const_nsw_sub foldbomb five) m);
-  ignore (define_global "const_nuw_sub" (const_nuw_sub foldbomb five) m);
-  ignore (define_global "const_fsub" (const_fsub ffoldbomb ffive) m);
-  ignore (define_global "const_mul" (const_mul foldbomb five) m);
-  ignore (define_global "const_nsw_mul" (const_nsw_mul foldbomb five) m);
-  ignore (define_global "const_nuw_mul" (const_nuw_mul foldbomb five) m);
-  ignore (define_global "const_fmul" (const_fmul ffoldbomb ffive) m);
-  ignore (define_global "const_udiv" (const_udiv foldbomb five) m);
-  ignore (define_global "const_sdiv" (const_sdiv foldbomb five) m);
-  ignore (define_global "const_exact_sdiv" (const_exact_sdiv foldbomb five) m);
-  ignore (define_global "const_fdiv" (const_fdiv ffoldbomb ffive) m);
-  ignore (define_global "const_urem" (const_urem foldbomb five) m);
-  ignore (define_global "const_srem" (const_srem foldbomb five) m);
-  ignore (define_global "const_frem" (const_frem ffoldbomb ffive) m);
-  ignore (define_global "const_and" (const_and foldbomb five) m);
-  ignore (define_global "const_or" (const_or foldbomb five) m);
-  ignore (define_global "const_xor" (const_xor foldbomb five) m);
-  ignore (define_global "const_icmp" (const_icmp Icmp.Sle foldbomb five) m);
-  ignore (define_global "const_fcmp" (const_fcmp Fcmp.Ole ffoldbomb ffive) m);
-  
-  group "constant casts";
-  (* CHECK: const_trunc{{.*}}trunc
-   * CHECK: const_sext{{.*}}sext
-   * CHECK: const_zext{{.*}}zext
-   * CHECK: const_fptrunc{{.*}}fptrunc
-   * CHECK: const_fpext{{.*}}fpext
-   * CHECK: const_uitofp{{.*}}uitofp
-   * CHECK: const_sitofp{{.*}}sitofp
-   * CHECK: const_fptoui{{.*}}fptoui
-   * CHECK: const_fptosi{{.*}}fptosi
-   * CHECK: const_ptrtoint{{.*}}ptrtoint
-   * CHECK: const_inttoptr{{.*}}inttoptr
-   * CHECK: const_bitcast{{.*}}bitcast
-   * CHECK: const_intcast{{.*}}zext
-   *)
-  let i128_type = integer_type context 128 in
-  ignore (define_global "const_trunc" (const_trunc (const_add foldbomb five)
-                                               i8_type) m);
-  ignore (define_global "const_sext" (const_sext foldbomb i128_type) m);
-  ignore (define_global "const_zext" (const_zext foldbomb i128_type) m);
-  ignore (define_global "const_fptrunc" (const_fptrunc ffoldbomb float_type) m);
-  ignore (define_global "const_fpext" (const_fpext ffoldbomb fp128_type) m);
-  ignore (define_global "const_uitofp" (const_uitofp foldbomb double_type) m);
-  ignore (define_global "const_sitofp" (const_sitofp foldbomb double_type) m);
-  ignore (define_global "const_fptoui" (const_fptoui ffoldbomb i32_type) m);
-  ignore (define_global "const_fptosi" (const_fptosi ffoldbomb i32_type) m);
-  ignore (define_global "const_ptrtoint" (const_ptrtoint 
-    (const_gep (const_null (pointer_type i8_type))
-               [| const_int i32_type 1 |])
-    i32_type) m);
-  ignore (define_global "const_inttoptr" (const_inttoptr (const_add foldbomb five)
-                                                  void_ptr) m);
-  ignore (define_global "const_bitcast" (const_bitcast ffoldbomb i64_type) m);
-  ignore (define_global "const_intcast"
-          (const_intcast foldbomb i128_type ~is_signed:false) m);
-  
-  group "misc constants";
-  (* CHECK: const_size_of{{.*}}getelementptr{{.*}}null
-   * CHECK: const_gep{{.*}}getelementptr
-   * CHECK: const_select{{.*}}select
-   * CHECK: const_extractelement{{.*}}extractelement
-   * CHECK: const_insertelement{{.*}}insertelement
-   * CHECK: const_shufflevector = global <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-   *)
-  ignore (define_global "const_size_of" (size_of (pointer_type i8_type)) m);
-  ignore (define_global "const_gep" (const_gep foldbomb_gv [| five |]) m);
-  ignore (define_global "const_select" (const_select
-    (const_icmp Icmp.Sle foldbomb five)
-    (const_int i8_type (-1))
-    (const_int i8_type 0)) m);
-  let zero = const_int i32_type 0 in
-  let one  = const_int i32_type 1 in
-  ignore (define_global "const_extractelement" (const_extractelement
-    (const_vector [| zero; one; zero; one |])
-    (const_trunc foldbomb i32_type)) m);
-  ignore (define_global "const_insertelement" (const_insertelement
-    (const_vector [| zero; one; zero; one |])
-    zero (const_trunc foldbomb i32_type)) m);
-  ignore (define_global "const_shufflevector" (const_shufflevector
-    (const_vector [| zero; one |])
-    (const_vector [| one; zero |])
-    (const_vector [| const_int i32_type 0; const_int i32_type 1;
-                     const_int i32_type 2; const_int i32_type 3 |])) m);
-
-  group "asm"; begin
-    let ft = function_type void_type [| i32_type; i32_type; i32_type |] in
-    ignore (const_inline_asm
-      ft
-      ""
-      "{cx},{ax},{di},~{dirflag},~{fpsr},~{flags},~{edi},~{ecx}"
-      true
-      false)
-  end;
-
-  group "recursive struct"; begin
-      let nsty = named_struct_type context "rec" in
-      let pty = pointer_type nsty in
-      struct_set_body nsty [| i32_type; pty |] false;
-      let elts = [| const_int i32_type 4; const_pointer_null pty |] in
-      let grec_init = const_named_struct nsty elts in
-      ignore (define_global "grec" grec_init m);
-      ignore (string_of_lltype nsty);
-  end
-
-
-(*===-- Global Values -----------------------------------------------------===*)
-
-let test_global_values () =
-  let (++) x f = f x; x in
-  let zero32 = const_null i32_type in
-
-  (* CHECK: GVal01
-   *)
-  group "naming";
-  let g = define_global "TEMPORARY" zero32 m in
-  insist ("TEMPORARY" = value_name g);
-  set_value_name "GVal01" g;
-  insist ("GVal01" = value_name g);
-
-  (* CHECK: GVal02{{.*}}linkonce
-   *)
-  group "linkage";
-  let g = define_global "GVal02" zero32 m ++
-          set_linkage Linkage.Link_once in
-  insist (Linkage.Link_once = linkage g);
-
-  (* CHECK: GVal03{{.*}}Hanalei
-   *)
-  group "section";
-  let g = define_global "GVal03" zero32 m ++
-          set_section "Hanalei" in
-  insist ("Hanalei" = section g);
-  
-  (* CHECK: GVal04{{.*}}hidden
-   *)
-  group "visibility";
-  let g = define_global "GVal04" zero32 m ++
-          set_visibility Visibility.Hidden in
-  insist (Visibility.Hidden = visibility g);
-  
-  (* CHECK: GVal05{{.*}}align 128
-   *)
-  group "alignment";
-  let g = define_global "GVal05" zero32 m ++
-          set_alignment 128 in
-  insist (128 = alignment g)
-
-
-(*===-- Global Variables --------------------------------------------------===*)
-
-let test_global_variables () =
-  let (++) x f = f x; x in
-  let forty_two32 = const_int i32_type 42 in
-
-  group "declarations"; begin
-    (* CHECK: @GVar01 = external global i32
-     * CHECK: @QGVar01 = external addrspace(3) global i32
-     *)
-    insist (None == lookup_global "GVar01" m);
-    let g = declare_global i32_type "GVar01" m in
-    insist (is_declaration g);
-    insist (pointer_type float_type ==
-              type_of (declare_global float_type "GVar01" m));
-    insist (g == declare_global i32_type "GVar01" m);
-    insist (match lookup_global "GVar01" m with Some x -> x = g
-                                              | None -> false);
-
-    insist (None == lookup_global "QGVar01" m);
-    let g = declare_qualified_global i32_type "QGVar01" 3 m in
-    insist (is_declaration g);
-    insist (qualified_pointer_type float_type 3 ==
-              type_of (declare_qualified_global float_type "QGVar01" 3 m));
-    insist (g == declare_qualified_global i32_type "QGVar01" 3 m);
-    insist (match lookup_global "QGVar01" m with Some x -> x = g
-                                              | None -> false);
-  end;
-  
-  group "definitions"; begin
-    (* CHECK: @GVar02 = global i32 42
-     * CHECK: @GVar03 = global i32 42
-     * CHECK: @QGVar02 = addrspace(3) global i32 42
-     * CHECK: @QGVar03 = addrspace(3) global i32 42
-     *)
-    let g = define_global "GVar02" forty_two32 m in
-    let g2 = declare_global i32_type "GVar03" m ++
-           set_initializer forty_two32 in
-    insist (not (is_declaration g));
-    insist (not (is_declaration g2));
-    insist ((global_initializer g) == (global_initializer g2));
-
-    let g = define_qualified_global "QGVar02" forty_two32 3 m in
-    let g2 = declare_qualified_global i32_type "QGVar03" 3 m ++
-           set_initializer forty_two32 in
-    insist (not (is_declaration g));
-    insist (not (is_declaration g2));
-    insist ((global_initializer g) == (global_initializer g2));
-  end;
-
-  (* CHECK: GVar04{{.*}}thread_local
-   *)
-  group "threadlocal";
-  let g = define_global "GVar04" forty_two32 m ++
-          set_thread_local true in
-  insist (is_thread_local g);
-
-  (* CHECK: GVar05{{.*}}thread_local(initialexec)
-   *)
-  group "threadlocal_mode";
-  let g = define_global "GVar05" forty_two32 m ++
-          set_thread_local_mode ThreadLocalMode.InitialExec in
-  insist ((thread_local_mode g) = ThreadLocalMode.InitialExec);
-
-  (* CHECK: GVar06{{.*}}externally_initialized
-   *)
-  group "externally_initialized";
-  let g = define_global "GVar06" forty_two32 m ++
-          set_externally_initialized true in
-  insist (is_externally_initialized g);
-
-  (* CHECK-NOWHERE-NOT: GVar07
-   *)
-  group "delete";
-  let g = define_global "GVar07" forty_two32 m in
-  delete_global g;
-
-  (* CHECK: ConstGlobalVar{{.*}}constant
-   *)
-  group "constant";
-  let g = define_global "ConstGlobalVar" forty_two32 m in
-  insist (not (is_global_constant g));
-  set_global_constant true g;
-  insist (is_global_constant g);
-  
-  begin group "iteration";
-    let m = create_module context "temp" in
-    
-    insist (At_end m = global_begin m);
-    insist (At_start m = global_end m);
-    
-    let g1 = declare_global i32_type "One" m in
-    let g2 = declare_global i32_type "Two" m in
-    
-    insist (Before g1 = global_begin m);
-    insist (Before g2 = global_succ g1);
-    insist (At_end m = global_succ g2);
-    
-    insist (After g2 = global_end m);
-    insist (After g1 = global_pred g2);
-    insist (At_start m = global_pred g1);
-    
-    let lf s x = s ^ "->" ^ value_name x in
-    insist ("->One->Two" = fold_left_globals lf "" m);
-    
-    let rf x s = value_name x ^ "<-" ^ s in
-    insist ("One<-Two<-" = fold_right_globals rf m "");
-    
-    dispose_module m
-  end
-
-(* String globals built below are emitted here.
- * CHECK: build_global_string{{.*}}stringval
- *)
-
-
-(*===-- Uses --------------------------------------------------------------===*)
-
-let test_uses () =
-  let ty = function_type i32_type [| i32_type; i32_type |] in
-  let fn = define_function "use_function" ty m in
-  let b = builder_at_end context (entry_block fn) in
-
-  let p1 = param fn 0 in
-  let p2 = param fn 1 in
-  let v1 = build_add p1 p2 "v1" b in
-  let v2 = build_add p1 v1 "v2" b in
-  let _ = build_add v1 v2 "v3" b in
-
-  let lf s u = value_name (user u) ^ "->" ^ s in
-  insist ("v2->v3->" = fold_left_uses lf "" v1);
-  let rf u s = value_name (user u) ^ "<-" ^ s in
-  insist ("v3<-v2<-" = fold_right_uses rf v1 "");
-
-  let lf s u = value_name (used_value u) ^ "->" ^ s in
-  insist ("v1->v1->" = fold_left_uses lf "" v1);
-
-  let rf u s = value_name (used_value u) ^ "<-" ^ s in
-  insist ("v1<-v1<-" = fold_right_uses rf v1 "");
-
-  ignore (build_unreachable b)
-
-
-(*===-- Users -------------------------------------------------------------===*)
-
-let test_users () =
-  let ty = function_type i32_type [| i32_type; i32_type |] in
-  let fn = define_function "user_function" ty m in
-  let b = builder_at_end context (entry_block fn) in
-
-  let p1 = param fn 0 in
-  let p2 = param fn 1 in
-  let a3 = build_alloca i32_type "user_alloca" b in
-  let p3 = build_load a3 "user_load" b in
-  let i = build_add p1 p2 "sum" b in
-
-  insist ((num_operands i) = 2);
-  insist ((operand i 0) = p1);
-  insist ((operand i 1) = p2);
-
-  set_operand i 1 p3;
-  insist ((operand i 1) != p2);
-  insist ((operand i 1) = p3);
-
-  ignore (build_unreachable b)
-
-
-(*===-- Aliases -----------------------------------------------------------===*)
-
-let test_aliases () =
-  (* CHECK: @alias = alias i32* @aliasee
-   *)
-  let forty_two32 = const_int i32_type 42 in
-  let v = define_global "aliasee" forty_two32 m in
-  ignore (add_alias m (pointer_type i32_type) v "alias")
-
-
-(*===-- Functions ---------------------------------------------------------===*)
-
-let test_functions () =
-  let ty = function_type i32_type [| i32_type; i64_type |] in
-  let ty2 = function_type i8_type [| i8_type; i64_type |] in
-  
-  (* CHECK: declare i32 @Fn1(i32, i64)
-   *)
-  begin group "declare";
-    insist (None = lookup_function "Fn1" m);
-    let fn = declare_function "Fn1" ty m in
-    insist (pointer_type ty = type_of fn);
-    insist (is_declaration fn);
-    insist (0 = Array.length (basic_blocks fn));
-    insist (pointer_type ty2 == type_of (declare_function "Fn1" ty2 m));
-    insist (fn == declare_function "Fn1" ty m);
-    insist (None <> lookup_function "Fn1" m);
-    insist (match lookup_function "Fn1" m with Some x -> x = fn
-                                             | None -> false);
-    insist (m == global_parent fn)
-  end;
-  
-  (* CHECK-NOWHERE-NOT: Fn2
-   *)
-  group "delete";
-  let fn = declare_function "Fn2" ty m in
-  delete_function fn;
-  
-  (* CHECK: define{{.*}}Fn3
-   *)
-  group "define";
-  let fn = define_function "Fn3" ty m in
-  insist (not (is_declaration fn));
-  insist (1 = Array.length (basic_blocks fn));
-  ignore (build_unreachable (builder_at_end context (entry_block fn)));
-  
-  (* CHECK: define{{.*}}Fn4{{.*}}Param1{{.*}}Param2
-   *)
-  group "params";
-  let fn = define_function "Fn4" ty m in
-  let params = params fn in
-  insist (2 = Array.length params);
-  insist (params.(0) = param fn 0);
-  insist (params.(1) = param fn 1);
-  insist (i32_type = type_of params.(0));
-  insist (i64_type = type_of params.(1));
-  set_value_name "Param1" params.(0);
-  set_value_name "Param2" params.(1);
-  ignore (build_unreachable (builder_at_end context (entry_block fn)));
-  
-  (* CHECK: fastcc{{.*}}Fn5
-   *)
-  group "callconv";
-  let fn = define_function "Fn5" ty m in
-  insist (CallConv.c = function_call_conv fn);
-  set_function_call_conv CallConv.fast fn;
-  insist (CallConv.fast = function_call_conv fn);
-  ignore (build_unreachable (builder_at_end context (entry_block fn)));
-  
-  begin group "gc";
-    (* CHECK: Fn6{{.*}}gc{{.*}}shadowstack
-     *)
-    let fn = define_function "Fn6" ty m in
-    insist (None = gc fn);
-    set_gc (Some "ocaml") fn;
-    insist (Some "ocaml" = gc fn);
-    set_gc None fn;
-    insist (None = gc fn);
-    set_gc (Some "shadowstack") fn;
-    ignore (build_unreachable (builder_at_end context (entry_block fn)));
-  end;
-  
-  begin group "iteration";
-    let m = create_module context "temp" in
-    
-    insist (At_end m = function_begin m);
-    insist (At_start m = function_end m);
-    
-    let f1 = define_function "One" ty m in
-    let f2 = define_function "Two" ty m in
-    
-    insist (Before f1 = function_begin m);
-    insist (Before f2 = function_succ f1);
-    insist (At_end m = function_succ f2);
-    
-    insist (After f2 = function_end m);
-    insist (After f1 = function_pred f2);
-    insist (At_start m = function_pred f1);
-    
-    let lf s x = s ^ "->" ^ value_name x in
-    insist ("->One->Two" = fold_left_functions lf "" m);
-    
-    let rf x s = value_name x ^ "<-" ^ s in
-    insist ("One<-Two<-" = fold_right_functions rf m "");
-    
-    dispose_module m
-  end
-
-
-(*===-- Params ------------------------------------------------------------===*)
-
-let test_params () =
-  begin group "iteration";
-    let m = create_module context "temp" in
-    
-    let vf = define_function "void" (function_type void_type [| |]) m in
-    
-    insist (At_end vf = param_begin vf);
-    insist (At_start vf = param_end vf);
-    
-    let ty = function_type void_type [| i32_type; i32_type |] in
-    let f = define_function "f" ty m in
-    let p1 = param f 0 in
-    let p2 = param f 1 in
-    set_value_name "One" p1;
-    set_value_name "Two" p2;
-    add_param_attr p1 Attribute.Sext;
-    add_param_attr p2 Attribute.Noalias;
-    remove_param_attr p2 Attribute.Noalias;
-    add_function_attr f Attribute.Nounwind;
-    add_function_attr f Attribute.Noreturn;
-    remove_function_attr f Attribute.Noreturn;
-
-    insist (Before p1 = param_begin f);
-    insist (Before p2 = param_succ p1);
-    insist (At_end f = param_succ p2);
-    
-    insist (After p2 = param_end f);
-    insist (After p1 = param_pred p2);
-    insist (At_start f = param_pred p1);
-    
-    let lf s x = s ^ "->" ^ value_name x in
-    insist ("->One->Two" = fold_left_params lf "" f);
-    
-    let rf x s = value_name x ^ "<-" ^ s in
-    insist ("One<-Two<-" = fold_right_params rf f "");
-    
-    dispose_module m
-  end
-
-
-(*===-- Basic Blocks ------------------------------------------------------===*)
-
-let test_basic_blocks () =
-  let ty = function_type void_type [| |] in
-  
-  (* CHECK: Bb1
-   *)
-  group "entry";
-  let fn = declare_function "X" ty m in
-  let bb = append_block context "Bb1" fn in
-  insist (bb = entry_block fn);
-  ignore (build_unreachable (builder_at_end context bb));
-  
-  (* CHECK-NOWHERE-NOT: Bb2
-   *)
-  group "delete";
-  let fn = declare_function "X2" ty m in
-  let bb = append_block context "Bb2" fn in
-  delete_block bb;
-  
-  group "insert";
-  let fn = declare_function "X3" ty m in
-  let bbb = append_block context "b" fn in
-  let bba = insert_block context "a" bbb in
-  insist ([| bba; bbb |] = basic_blocks fn);
-  ignore (build_unreachable (builder_at_end context bba));
-  ignore (build_unreachable (builder_at_end context bbb));
-  
-  (* CHECK: Bb3
-   *)
-  group "name/value";
-  let fn = define_function "X4" ty m in
-  let bb = entry_block fn in
-  ignore (build_unreachable (builder_at_end context bb));
-  let bbv = value_of_block bb in
-  set_value_name "Bb3" bbv;
-  insist ("Bb3" = value_name bbv);
-  
-  group "casts";
-  let fn = define_function "X5" ty m in
-  let bb = entry_block fn in
-  ignore (build_unreachable (builder_at_end context bb));
-  insist (bb = block_of_value (value_of_block bb));
-  insist (value_is_block (value_of_block bb));
-  insist (not (value_is_block (const_null i32_type)));
-  
-  begin group "iteration";
-    let m = create_module context "temp" in
-    let f = declare_function "Temp" (function_type i32_type [| |]) m in
-    
-    insist (At_end f = block_begin f);
-    insist (At_start f = block_end f);
-    
-    let b1 = append_block context "One" f in
-    let b2 = append_block context "Two" f in
-    
-    insist (Before b1 = block_begin f);
-    insist (Before b2 = block_succ b1);
-    insist (At_end f = block_succ b2);
-    
-    insist (After b2 = block_end f);
-    insist (After b1 = block_pred b2);
-    insist (At_start f = block_pred b1);
-    
-    let lf s x = s ^ "->" ^ value_name (value_of_block x) in
-    insist ("->One->Two" = fold_left_blocks lf "" f);
-    
-    let rf x s = value_name (value_of_block x) ^ "<-" ^ s in
-    insist ("One<-Two<-" = fold_right_blocks rf f "");
-    
-    dispose_module m
-  end
-
-
-(*===-- Instructions ------------------------------------------------------===*)
-
-let test_instructions () =
-  begin group "iteration";
-    let m = create_module context "temp" in
-    let fty = function_type void_type [| i32_type; i32_type |] in
-    let f = define_function "f" fty m in
-    let bb = entry_block f in
-    let b = builder_at context (At_end bb) in
-    
-    insist (At_end bb = instr_begin bb);
-    insist (At_start bb = instr_end bb);
-    
-    let i1 = build_add (param f 0) (param f 1) "One" b in
-    let i2 = build_sub (param f 0) (param f 1) "Two" b in
-    
-    insist (Before i1 = instr_begin bb);
-    insist (Before i2 = instr_succ i1);
-    insist (At_end bb = instr_succ i2);
-    
-    insist (After i2 = instr_end bb);
-    insist (After i1 = instr_pred i2);
-    insist (At_start bb = instr_pred i1);
-    
-    let lf s x = s ^ "->" ^ value_name x in
-    insist ("->One->Two" = fold_left_instrs lf "" bb);
-    
-    let rf x s = value_name x ^ "<-" ^ s in
-    insist ("One<-Two<-" = fold_right_instrs rf bb "");
-    
-    dispose_module m
-  end
-
-
-(*===-- Builder -----------------------------------------------------------===*)
-
-let test_builder () =
-  let (++) x f = f x; x in
-  
-  begin group "parent";
-    insist (try
-              ignore (insertion_block (builder context));
-              false
-            with Not_found ->
-              true);
-    
-    let fty = function_type void_type [| i32_type |] in
-    let fn = define_function "BuilderParent" fty m in
-    let bb = entry_block fn in
-    let b = builder_at_end context bb in
-    let p = param fn 0 in
-    let sum = build_add p p "sum" b in
-    ignore (build_ret_void b);
-    
-    insist (fn = block_parent bb);
-    insist (fn = param_parent p);
-    insist (bb = instr_parent sum);
-    insist (bb = insertion_block b)
-  end;
-  
-  group "ret void";
-  begin
-    (* CHECK: ret void
-     *)
-    let fty = function_type void_type [| |] in
-    let fn = declare_function "X6" fty m in
-    let b = builder_at_end context (append_block context "Bb01" fn) in
-    ignore (build_ret_void b)
-  end;
-
-  group "ret aggregate";
-  begin
-      (* CHECK: ret { i8, i64 } { i8 4, i64 5 }
-       *)
-      let sty = struct_type context [| i8_type; i64_type |] in
-      let fty = function_type sty [| |] in
-      let fn = declare_function "XA6" fty m in
-      let b = builder_at_end context (append_block context "Bb01" fn) in
-      let agg = [| const_int i8_type 4; const_int i64_type 5 |] in
-      ignore (build_aggregate_ret agg b)
-  end;
-  
-  (* The rest of the tests will use one big function. *)
-  let fty = function_type i32_type [| i32_type; i32_type |] in
-  let fn = define_function "X7" fty m in
-  let atentry = builder_at_end context (entry_block fn) in
-  let p1 = param fn 0 ++ set_value_name "P1" in
-  let p2 = param fn 1 ++ set_value_name "P2" in
-  let f1 = build_uitofp p1 float_type "F1" atentry in
-  let f2 = build_uitofp p2 float_type "F2" atentry in
-  
-  let bb00 = append_block context "Bb00" fn in
-  ignore (build_unreachable (builder_at_end context bb00));
-
-  group "function attribute";
-  begin
-      ignore (add_function_attr fn Attribute.UWTable);
-      (* CHECK: X7{{.*}}#0
-       * #0 is uwtable, defined at EOF.
-       *)
-      insist ([Attribute.UWTable] = function_attr fn);
-  end;
-
-  group "casts"; begin
-    let void_ptr = pointer_type i8_type in
-
-    (* CHECK-DAG: %build_trunc = trunc i32 %P1 to i8
-     * CHECK-DAG: %build_trunc2 = trunc i32 %P1 to i8
-     * CHECK-DAG: %build_trunc3 = trunc i32 %P1 to i8
-     * CHECK-DAG: %build_zext = zext i8 %build_trunc to i32
-     * CHECK-DAG: %build_zext2 = zext i8 %build_trunc to i32
-     * CHECK-DAG: %build_sext = sext i32 %build_zext to i64
-     * CHECK-DAG: %build_sext2 = sext i32 %build_zext to i64
-     * CHECK-DAG: %build_sext3 = sext i32 %build_zext to i64
-     * CHECK-DAG: %build_uitofp = uitofp i64 %build_sext to float
-     * CHECK-DAG: %build_sitofp = sitofp i32 %build_zext to double
-     * CHECK-DAG: %build_fptoui = fptoui float %build_uitofp to i32
-     * CHECK-DAG: %build_fptosi = fptosi double %build_sitofp to i64
-     * CHECK-DAG: %build_fptrunc = fptrunc double %build_sitofp to float
-     * CHECK-DAG: %build_fptrunc2 = fptrunc double %build_sitofp to float
-     * CHECK-DAG: %build_fpext = fpext float %build_fptrunc to double
-     * CHECK-DAG: %build_fpext2 = fpext float %build_fptrunc to double
-     * CHECK-DAG: %build_inttoptr = inttoptr i32 %P1 to i8*
-     * CHECK-DAG: %build_ptrtoint = ptrtoint i8* %build_inttoptr to i64
-     * CHECK-DAG: %build_ptrtoint2 = ptrtoint i8* %build_inttoptr to i64
-     * CHECK-DAG: %build_bitcast = bitcast i64 %build_ptrtoint to double
-     * CHECK-DAG: %build_bitcast2 = bitcast i64 %build_ptrtoint to double
-     * CHECK-DAG: %build_bitcast3 = bitcast i64 %build_ptrtoint to double
-     * CHECK-DAG: %build_bitcast4 = bitcast i64 %build_ptrtoint to double
-     * CHECK-DAG: %build_pointercast = bitcast i8* %build_inttoptr to i16*
-     *)
-    let inst28 = build_trunc p1 i8_type "build_trunc" atentry in
-    let inst29 = build_zext inst28 i32_type "build_zext" atentry in
-    let inst30 = build_sext inst29 i64_type "build_sext" atentry in
-    let inst31 = build_uitofp inst30 float_type "build_uitofp" atentry in
-    let inst32 = build_sitofp inst29 double_type "build_sitofp" atentry in
-    ignore(build_fptoui inst31 i32_type "build_fptoui" atentry);
-    ignore(build_fptosi inst32 i64_type "build_fptosi" atentry);
-    let inst35 = build_fptrunc inst32 float_type "build_fptrunc" atentry in
-    ignore(build_fpext inst35 double_type "build_fpext" atentry);
-    let inst37 = build_inttoptr p1 void_ptr "build_inttoptr" atentry in
-    let inst38 = build_ptrtoint inst37 i64_type "build_ptrtoint" atentry in
-    ignore(build_bitcast inst38 double_type "build_bitcast" atentry);
-    ignore(build_zext_or_bitcast inst38 double_type "build_bitcast2" atentry);
-    ignore(build_sext_or_bitcast inst38 double_type "build_bitcast3" atentry);
-    ignore(build_trunc_or_bitcast inst38 double_type "build_bitcast4" atentry);
-    ignore(build_pointercast inst37 (pointer_type i16_type) "build_pointercast" atentry);
-
-    ignore(build_zext_or_bitcast inst28 i32_type "build_zext2" atentry);
-    ignore(build_sext_or_bitcast inst29 i64_type "build_sext2" atentry);
-    ignore(build_trunc_or_bitcast p1 i8_type "build_trunc2" atentry);
-    ignore(build_pointercast inst37 i64_type "build_ptrtoint2" atentry);
-    ignore(build_intcast inst29 i64_type "build_sext3" atentry);
-    ignore(build_intcast p1 i8_type "build_trunc3" atentry);
-    ignore(build_fpcast inst35 double_type "build_fpext2" atentry);
-    ignore(build_fpcast inst32 float_type "build_fptrunc2" atentry);
-  end;
-
-  group "comparisons"; begin
-    (* CHECK: %build_icmp_ne = icmp ne i32 %P1, %P2
-     * CHECK: %build_icmp_sle = icmp sle i32 %P2, %P1
-     * CHECK: %build_fcmp_false = fcmp false float %F1, %F2
-     * CHECK: %build_fcmp_true = fcmp true float %F2, %F1
-     * CHECK: %build_is_null{{.*}}= icmp eq{{.*}}%X0,{{.*}}null
-     * CHECK: %build_is_not_null = icmp ne i8* %X1, null
-     * CHECK: %build_ptrdiff
-     *)
-    ignore (build_icmp Icmp.Ne    p1 p2 "build_icmp_ne" atentry);
-    ignore (build_icmp Icmp.Sle   p2 p1 "build_icmp_sle" atentry);
-    ignore (build_fcmp Fcmp.False f1 f2 "build_fcmp_false" atentry);
-    ignore (build_fcmp Fcmp.True  f2 f1 "build_fcmp_true" atentry);
-    let g0 = declare_global (pointer_type i8_type) "g0" m in
-    let g1 = declare_global (pointer_type i8_type) "g1" m in
-    let p0 = build_load g0 "X0" atentry in
-    let p1 = build_load g1 "X1" atentry in
-    ignore (build_is_null p0 "build_is_null" atentry);
-    ignore (build_is_not_null p1 "build_is_not_null" atentry);
-    ignore (build_ptrdiff p1 p0 "build_ptrdiff" atentry);
-  end;
-
-  group "miscellaneous"; begin
-    (* CHECK: %build_call = tail call cc63 i32 @{{.*}}(i32 signext %P2, i32 %P1)
-     * CHECK: %build_select = select i1 %build_icmp, i32 %P1, i32 %P2
-     * CHECK: %build_va_arg = va_arg i8** null, i32
-     * CHECK: %build_extractelement = extractelement <4 x i32> %Vec1, i32 %P2
-     * CHECK: %build_insertelement = insertelement <4 x i32> %Vec1, i32 %P1, i32 %P2
-     * CHECK: %build_shufflevector = shufflevector <4 x i32> %Vec1, <4 x i32> %Vec2, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
-     * CHECK: %build_insertvalue0 = insertvalue{{.*}}%bl, i32 1, 0
-     * CHECK: %build_extractvalue = extractvalue{{.*}}%build_insertvalue1, 1
-     *)
-    let ci = build_call fn [| p2; p1 |] "build_call" atentry in
-    insist (CallConv.c = instruction_call_conv ci);
-    set_instruction_call_conv 63 ci;
-    insist (63 = instruction_call_conv ci);
-    insist (not (is_tail_call ci));
-    set_tail_call true ci;
-    insist (is_tail_call ci);
-    add_instruction_param_attr ci 1 Attribute.Sext;
-    add_instruction_param_attr ci 2 Attribute.Noalias;
-    remove_instruction_param_attr ci 2 Attribute.Noalias;
-
-    let inst46 = build_icmp Icmp.Eq p1 p2 "build_icmp" atentry in
-    ignore (build_select inst46 p1 p2 "build_select" atentry);
-    ignore (build_va_arg
-      (const_null (pointer_type (pointer_type i8_type)))
-      i32_type "build_va_arg" atentry);
-
-    (* Set up some vector vregs. *)
-    let one  = const_int i32_type 1 in
-    let zero = const_int i32_type 0 in
-    let t1 = const_vector [| one; zero; one; zero |] in
-    let t2 = const_vector [| zero; one; zero; one |] in
-    let t3 = const_vector [| one; one; zero; zero |] in
-    let vec1 = build_insertelement t1 p1 p2 "Vec1" atentry in
-    let vec2 = build_insertelement t2 p1 p2 "Vec2" atentry in
-    let sty = struct_type context [| i32_type; i8_type |] in
-
-    ignore (build_extractelement vec1 p2 "build_extractelement" atentry);
-    ignore (build_insertelement vec1 p1 p2 "build_insertelement" atentry);
-    ignore (build_shufflevector vec1 vec2 t3 "build_shufflevector" atentry);
-
-    let p = build_alloca sty "ba" atentry in
-    let agg = build_load p "bl" atentry in
-    let agg0 = build_insertvalue agg (const_int i32_type 1) 0
-                 "build_insertvalue0" atentry in
-    let agg1 = build_insertvalue agg0 (const_int i8_type 2) 1
-                 "build_insertvalue1" atentry in
-    ignore (build_extractvalue agg1 1 "build_extractvalue" atentry)
-  end;
-
-  group "metadata"; begin
-    (* CHECK: %metadata = add i32 %P1, %P2, !test !1
-     * !1 is metadata emitted at EOF.
-     *)
-    let i = build_add p1 p2 "metadata" atentry in
-    insist ((has_metadata i) = false);
-
-    let m1 = const_int i32_type 1 in
-    let m2 = mdstring context "metadata test" in
-    let md = mdnode context [| m1; m2 |] in
-
-    let kind = mdkind_id context "test" in
-    set_metadata i kind md;
-
-    insist ((has_metadata i) = true);
-    insist ((metadata i kind) = Some md);
-
-    clear_metadata i kind;
-
-    insist ((has_metadata i) = false);
-    insist ((metadata i kind) = None);
-
-    set_metadata i kind md
-  end;
-
-  group "named metadata"; begin
-    (* !llvm.module.flags is emitted at EOF. *)
-    let n1 = const_int i32_type 1 in
-    let n2 = mdstring context "Debug Info Version" in
-    let md = mdnode context [| n1; n2; n1 |] in
-    add_named_metadata_operand m "llvm.module.flags" md;
-
-    insist ((get_named_metadata m "llvm.module.flags") = [| md |])
-  end;
-
-  group "dbg"; begin
-    (* CHECK: %dbg = add i32 %P1, %P2, !dbg !2
-     * !2 is metadata emitted at EOF.
-     *)
-    insist ((current_debug_location atentry) = None);
-
-    let m_line = const_int i32_type 2 in
-    let m_col = const_int i32_type 3 in
-    let m_scope = mdnode context [| |] in
-    let m_inlined = mdnode context [| |] in
-    let md = mdnode context [| m_line; m_col; m_scope; m_inlined |] in
-    set_current_debug_location atentry md;
-
-    insist ((current_debug_location atentry) = Some md);
-
-    let i = build_add p1 p2 "dbg" atentry in
-    insist ((has_metadata i) = true);
-
-    clear_current_debug_location atentry
-  end;
-
-  group "ret"; begin
-    (* CHECK: ret{{.*}}P1
-     *)
-    let ret = build_ret p1 atentry in
-    position_before ret atentry
-  end;
-
-  (* see test/Feature/exception.ll *)
-  let bblpad = append_block context "Bblpad" fn in
-  let rt = struct_type context [| pointer_type i8_type; i32_type |] in
-  let ft = var_arg_function_type i32_type  [||] in
-  let personality = declare_function "__gxx_personality_v0" ft m in
-  let ztic = declare_global (pointer_type i8_type) "_ZTIc" m in
-  let ztid = declare_global (pointer_type i8_type) "_ZTId" m in
-  let ztipkc = declare_global (pointer_type i8_type) "_ZTIPKc" m in
-  begin
-      set_global_constant true ztic;
-      set_global_constant true ztid;
-      set_global_constant true ztipkc;
-      let lp = build_landingpad rt personality 0 "lpad"
-       (builder_at_end context bblpad) in begin
-           set_cleanup lp true;
-           add_clause lp ztic;
-           insist((pointer_type (pointer_type i8_type)) = type_of ztid);
-           let ety = pointer_type (pointer_type i8_type) in
-           add_clause lp (const_array ety [| ztipkc; ztid |]);
-           ignore (build_resume lp (builder_at_end context bblpad));
-      end;
-      (* CHECK: landingpad{{.*}}personality{{.*}}__gxx_personality_v0
-       * CHECK: cleanup
-       * CHECK: catch{{.*}}i8**{{.*}}@_ZTIc
-       * CHECK: filter{{.*}}@_ZTIPKc{{.*}}@_ZTId
-       * CHECK: resume
-       * *)
-  end;
-
-  group "br"; begin
-    (* CHECK: br{{.*}}Bb02
-     *)
-    let bb02 = append_block context "Bb02" fn in
-    let b = builder_at_end context bb02 in
-    ignore (build_br bb02 b)
-  end;
-  
-  group "cond_br"; begin
-    (* CHECK: br{{.*}}build_br{{.*}}Bb03{{.*}}Bb00
-     *)
-    let bb03 = append_block context "Bb03" fn in
-    let b = builder_at_end context bb03 in
-    let cond = build_trunc p1 i1_type "build_br" b in
-    ignore (build_cond_br cond bb03 bb00 b)
-  end;
-  
-  group "switch"; begin
-    (* CHECK: switch{{.*}}P1{{.*}}SwiBlock3
-     * CHECK: 2,{{.*}}SwiBlock2
-     *)
-    let bb1 = append_block context "SwiBlock1" fn in
-    let bb2 = append_block context "SwiBlock2" fn in
-    ignore (build_unreachable (builder_at_end context bb2));
-    let bb3 = append_block context "SwiBlock3" fn in
-    ignore (build_unreachable (builder_at_end context bb3));
-    let si = build_switch p1 bb3 1 (builder_at_end context bb1) in begin
-        ignore (add_case si (const_int i32_type 2) bb2);
-        insist (switch_default_dest si = bb3);
-    end;
-  end;
-
-  group "malloc/free"; begin
-      (* CHECK: call{{.*}}@malloc(i32 ptrtoint
-       * CHECK: call{{.*}}@free(i8*
-       * CHECK: call{{.*}}@malloc(i32 %
-       *)
-      let bb1 = append_block context "MallocBlock1" fn in
-      let m1 = (build_malloc (pointer_type i32_type) "m1"
-      (builder_at_end context bb1)) in
-      ignore (build_free m1 (builder_at_end context bb1));
-      ignore (build_array_malloc i32_type p1 "m2" (builder_at_end context bb1));
-      ignore (build_unreachable (builder_at_end context bb1));
-  end;
-
-  group "indirectbr"; begin
-    (* CHECK: indirectbr i8* blockaddress(@X7, %IBRBlock2), [label %IBRBlock2, label %IBRBlock3]
-     *)
-    let bb1 = append_block context "IBRBlock1" fn in
-
-    let bb2 = append_block context "IBRBlock2" fn in
-    ignore (build_unreachable (builder_at_end context bb2));
-
-    let bb3 = append_block context "IBRBlock3" fn in
-    ignore (build_unreachable (builder_at_end context bb3));
-
-    let addr = block_address fn bb2 in
-    let ibr = build_indirect_br addr 2 (builder_at_end context bb1) in
-    ignore (add_destination ibr bb2);
-    ignore (add_destination ibr bb3)
-  end;
-  
-  group "invoke"; begin
-    (* CHECK: build_invoke{{.*}}invoke{{.*}}P1{{.*}}P2
-     * CHECK: to{{.*}}Bb04{{.*}}unwind{{.*}}Bblpad
-     *)
-    let bb04 = append_block context "Bb04" fn in
-    let b = builder_at_end context bb04 in
-    ignore (build_invoke fn [| p1; p2 |] bb04 bblpad "build_invoke" b)
-  end;
-  
-  group "unreachable"; begin
-    (* CHECK: unreachable
-     *)
-    let bb06 = append_block context "Bb06" fn in
-    let b = builder_at_end context bb06 in
-    ignore (build_unreachable b)
-  end;
-  
-  group "arithmetic"; begin
-    let bb07 = append_block context "Bb07" fn in
-    let b = builder_at_end context bb07 in
-    
-    (* CHECK: %build_add = add i32 %P1, %P2
-     * CHECK: %build_nsw_add = add nsw i32 %P1, %P2
-     * CHECK: %build_nuw_add = add nuw i32 %P1, %P2
-     * CHECK: %build_fadd = fadd float %F1, %F2
-     * CHECK: %build_sub = sub i32 %P1, %P2
-     * CHECK: %build_nsw_sub = sub nsw i32 %P1, %P2
-     * CHECK: %build_nuw_sub = sub nuw i32 %P1, %P2
-     * CHECK: %build_fsub = fsub float %F1, %F2
-     * CHECK: %build_mul = mul i32 %P1, %P2
-     * CHECK: %build_nsw_mul = mul nsw i32 %P1, %P2
-     * CHECK: %build_nuw_mul = mul nuw i32 %P1, %P2
-     * CHECK: %build_fmul = fmul float %F1, %F2
-     * CHECK: %build_udiv = udiv i32 %P1, %P2
-     * CHECK: %build_sdiv = sdiv i32 %P1, %P2
-     * CHECK: %build_exact_sdiv = sdiv exact i32 %P1, %P2
-     * CHECK: %build_fdiv = fdiv float %F1, %F2
-     * CHECK: %build_urem = urem i32 %P1, %P2
-     * CHECK: %build_srem = srem i32 %P1, %P2
-     * CHECK: %build_frem = frem float %F1, %F2
-     * CHECK: %build_shl = shl i32 %P1, %P2
-     * CHECK: %build_lshl = lshr i32 %P1, %P2
-     * CHECK: %build_ashl = ashr i32 %P1, %P2
-     * CHECK: %build_and = and i32 %P1, %P2
-     * CHECK: %build_or = or i32 %P1, %P2
-     * CHECK: %build_xor = xor i32 %P1, %P2
-     * CHECK: %build_neg = sub i32 0, %P1
-     * CHECK: %build_nsw_neg = sub nsw i32 0, %P1
-     * CHECK: %build_nuw_neg = sub nuw i32 0, %P1
-     * CHECK: %build_fneg = fsub float {{.*}}0{{.*}}, %F1
-     * CHECK: %build_not = xor i32 %P1, -1
-     *)
-    ignore (build_add p1 p2 "build_add" b);
-    ignore (build_nsw_add p1 p2 "build_nsw_add" b);
-    ignore (build_nuw_add p1 p2 "build_nuw_add" b);
-    ignore (build_fadd f1 f2 "build_fadd" b);
-    ignore (build_sub p1 p2 "build_sub" b);
-    ignore (build_nsw_sub p1 p2 "build_nsw_sub" b);
-    ignore (build_nuw_sub p1 p2 "build_nuw_sub" b);
-    ignore (build_fsub f1 f2 "build_fsub" b);
-    ignore (build_mul p1 p2 "build_mul" b);
-    ignore (build_nsw_mul p1 p2 "build_nsw_mul" b);
-    ignore (build_nuw_mul p1 p2 "build_nuw_mul" b);
-    ignore (build_fmul f1 f2 "build_fmul" b);
-    ignore (build_udiv p1 p2 "build_udiv" b);
-    ignore (build_sdiv p1 p2 "build_sdiv" b);
-    ignore (build_exact_sdiv p1 p2 "build_exact_sdiv" b);
-    ignore (build_fdiv f1 f2 "build_fdiv" b);
-    ignore (build_urem p1 p2 "build_urem" b);
-    ignore (build_srem p1 p2 "build_srem" b);
-    ignore (build_frem f1 f2 "build_frem" b);
-    ignore (build_shl p1 p2 "build_shl" b);
-    ignore (build_lshr p1 p2 "build_lshl" b);
-    ignore (build_ashr p1 p2 "build_ashl" b);
-    ignore (build_and p1 p2 "build_and" b);
-    ignore (build_or p1 p2 "build_or" b);
-    ignore (build_xor p1 p2 "build_xor" b);
-    ignore (build_neg p1 "build_neg" b);
-    ignore (build_nsw_neg p1 "build_nsw_neg" b);
-    ignore (build_nuw_neg p1 "build_nuw_neg" b);
-    ignore (build_fneg f1 "build_fneg" b);
-    ignore (build_not p1 "build_not" b);
-    ignore (build_unreachable b)
-  end;
-  
-  group "memory"; begin
-    let bb08 = append_block context "Bb08" fn in
-    let b = builder_at_end context bb08 in
-
-    (* CHECK: %build_alloca = alloca i32
-     * CHECK: %build_array_alloca = alloca i32, i32 %P2
-     * CHECK: %build_load = load volatile i32* %build_array_alloca, align 4
-     * CHECK: store volatile i32 %P2, i32* %build_alloca, align 4
-     * CHECK: %build_gep = getelementptr i32* %build_array_alloca, i32 %P2
-     * CHECK: %build_in_bounds_gep = getelementptr inbounds i32* %build_array_alloca, i32 %P2
-     * CHECK: %build_struct_gep = getelementptr inbounds{{.*}}%build_alloca2, i32 0, i32 1
-     * CHECK: %build_atomicrmw = atomicrmw xchg i8* %p, i8 42 seq_cst
-     *)
-    let alloca = build_alloca i32_type "build_alloca" b in
-    let array_alloca = build_array_alloca i32_type p2 "build_array_alloca" b in
-
-    let load = build_load array_alloca "build_load" b in
-    ignore(set_alignment 4 load);
-    ignore(set_volatile true load);
-    insist(true = is_volatile load);
-    insist(4 = alignment load);
-
-    let store = build_store p2 alloca b in
-    ignore(set_volatile true store);
-    ignore(set_alignment 4 store);
-    insist(true = is_volatile store);
-    insist(4 = alignment store);
-    ignore(build_gep array_alloca [| p2 |] "build_gep" b);
-    ignore(build_in_bounds_gep array_alloca [| p2 |] "build_in_bounds_gep" b);
-
-    let sty = struct_type context [| i32_type; i8_type |] in
-    let alloca2 = build_alloca sty "build_alloca2" b in
-    ignore(build_struct_gep alloca2 1 "build_struct_gep" b);
-
-    let p = build_alloca i8_type "p" b in
-    ignore(build_atomicrmw AtomicRMWBinOp.Xchg p (const_int i8_type 42)
-              AtomicOrdering.SequentiallyConsistent false "build_atomicrmw"
-              b);
-
-    ignore(build_unreachable b)
-  end;
-
-  group "string"; begin
-    let bb09 = append_block context "Bb09" fn in
-    let b = builder_at_end context bb09 in
-    let p = build_alloca (pointer_type i8_type) "p" b in
-    (* build_global_string is emitted above.
-     * CHECK: store{{.*}}build_global_string1{{.*}}p
-     * *)
-    ignore (build_global_string "stringval" "build_global_string" b);
-    let g = build_global_stringptr "stringval" "build_global_string1" b in
-    ignore (build_store g p b);
-    ignore(build_unreachable b);
-  end;
-
-  group "phi"; begin
-    (* CHECK: PhiNode{{.*}}P1{{.*}}PhiBlock1{{.*}}P2{{.*}}PhiBlock2
-     *)
-    let b1 = append_block context "PhiBlock1" fn in
-    let b2 = append_block context "PhiBlock2" fn in
-    
-    let jb = append_block context "PhiJoinBlock" fn in
-    ignore (build_br jb (builder_at_end context b1));
-    ignore (build_br jb (builder_at_end context b2));
-    let at_jb = builder_at_end context jb in
-    
-    let phi = build_phi [(p1, b1)] "PhiNode" at_jb in
-    insist ([(p1, b1)] = incoming phi);
-    
-    add_incoming (p2, b2) phi;
-    insist ([(p1, b1); (p2, b2)] = incoming phi);
-    
-    ignore (build_unreachable at_jb);
-  end
-
-(* End-of-file checks for things like metdata and attributes.
- * CHECK: attributes #0 = {{.*}}uwtable{{.*}}
- * CHECK: !llvm.module.flags = !{!0}
- * CHECK: !0 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
- * CHECK: !1 = metadata !{i32 1, metadata !"metadata test"}
- * CHECK: !2 = metadata !{i32 2, i32 3, metadata !3, metadata !3}
- *)
-
-(*===-- Pass Managers -----------------------------------------------------===*)
-
-let test_pass_manager () =
-  let (++) x f = ignore (f x); x in
-
-  begin group "module pass manager";
-    ignore (PassManager.create ()
-             ++ PassManager.run_module m
-             ++ PassManager.dispose)
-  end;
-  
-  begin group "function pass manager";
-    let fty = function_type void_type [| |] in
-    let fn = define_function "FunctionPassManager" fty m in
-    ignore (build_ret_void (builder_at_end context (entry_block fn)));
-    
-    ignore (PassManager.create_function m
-             ++ PassManager.initialize
-             ++ PassManager.run_function fn
-             ++ PassManager.finalize
-             ++ PassManager.dispose)
-  end
-
-
-(*===-- Memory Buffer -----------------------------------------------------===*)
-
-let test_memory_buffer () =
-  group "memory buffer";
-  let buf = MemoryBuffer.of_string "foobar" in
-  insist ((MemoryBuffer.as_string buf) = "foobar")
-
-
-(*===-- Writer ------------------------------------------------------------===*)
-
-let test_writer () =
-  group "valid";
-  insist (match Llvm_analysis.verify_module m with
-          | None -> true
-          | Some msg -> prerr_string msg; false);
-
-  group "writer";
-  insist (write_bitcode_file m filename);
-  
-  dispose_module m
-
-
-(*===-- Driver ------------------------------------------------------------===*)
-
-let _ =
-  suite "conversion"       test_conversion;
-  suite "target"           test_target;
-  suite "constants"        test_constants;
-  suite "global values"    test_global_values;
-  suite "global variables" test_global_variables;
-  suite "uses"             test_uses;
-  suite "users"            test_users;
-  suite "aliases"          test_aliases;
-  suite "functions"        test_functions;
-  suite "params"           test_params;
-  suite "basic blocks"     test_basic_blocks;
-  suite "instructions"     test_instructions;
-  suite "builder"          test_builder;
-  suite "pass manager"     test_pass_manager;
-  suite "memory buffer"    test_memory_buffer;
-  suite "writer"           test_writer; (* Keep this last; it disposes m. *)
-  exit !exit_status

diff --git a/test/Bindings/llvm-c/disassemble.test b/test/Bindings/llvm-c/disassemble.test
index 201e914..bb7a9a0 100644
--- a/test/Bindings/llvm-c/disassemble.test
+++ b/test/Bindings/llvm-c/disassemble.test

@@ -1,15 +1,27 @@
 ; RUN: llvm-c-test --disassemble < %s | FileCheck %s
 
+armv8-linux-gnu     +crypto 02 00 81 e0 02 03 b0 f3
+;CHECK: triple: armv8-linux-gnu, features: +crypto
+;CHECK: 02 00 81 e0                  add r0, r1, r2
+;CHECK: 02 03 b0 f3                  aese.8 q0, q1
 
-arm-linux-android    44 26 1f e5 0c 10 4b e2 02 20 81 e0
-;CHECK: triple: arm-linux-android
+armv8-linux-gnu     -crypto 02 00 81 e0 02 03 b0 f3
+;CHECK: triple: armv8-linux-gnu, features: -crypto
+;CHECK: 02 00 81 e0                  add r0, r1, r2
+;CHECK: 02                           ???
+;CHECK: 03                           ???
+;CHECK: b0                           ???
+;CHECK: f3                           ???
+
+arm-linux-android     NULL  44 26 1f e5 0c 10 4b e2 02 20 81 e0
+;CHECK: triple: arm-linux-android, features: NULL
 ;CHECK: ldr	r2, [pc, #-1604]
 ;CHECK: sub	r1, r11, #12
 ;CHECK: 02 20 81 e0
 ;CHECK: add	r2, r1, r2
 
-x86_64-linux-unknown 48 83 c4 38 5b 5d 41 5c 41 5d 41 5e 41 5f c3
-;CHECK: triple: x86_64-linux-unknown
+x86_64-linux-unknown  NULL  48 83 c4 38 5b 5d 41 5c 41 5d 41 5e 41 5f c3
+;CHECK: triple: x86_64-linux-unknown, features: NULL
 ;CHECK: addq	$56, %rsp
 ;CHECK: popq	%rbx
 ;CHECK: popq	%rbp
@@ -19,11 +31,13 @@
 ;CHECK: popq	%r15
 ;CHECK: ret
 
-i686-apple-darwin    0f b7 4c 24 0a e8 29 ce ff ff
+i686-apple-darwin     NULL  0f b7 4c 24 0a e8 29 ce ff ff
+;CHECK: triple: i686-apple-darwin, features: NULL
 ;CHECK: movzwl	10(%esp), %ecx
 ;CHECK: calll	-12759
 
-i686-linux-unknown   dd 44 24 04 d9 e1 c3
+i686-linux-unknown    NULL  dd 44 24 04 d9 e1 c3
+;CHECK: triple: i686-linux-unknown, features: NULL
 ;CHECK: fldl	4(%esp)
 ;CHECK: fabs
 ;CHECK: ret

diff --git a/test/Bindings/llvm-c/objectfile.ll b/test/Bindings/llvm-c/objectfile.ll
new file mode 100644
index 0000000..b6cb4a0
--- /dev/null
+++ b/test/Bindings/llvm-c/objectfile.ll

@@ -0,0 +1,2 @@
+; RUN: not llvm-c-test --object-list-sections < /dev/null
+; This used to cause a segfault

diff --git a/test/Bitcode/2006-12-11-Cast-ConstExpr.ll b/test/Bitcode/2006-12-11-Cast-ConstExpr.ll
index e704627..35bf7ab 100644
--- a/test/Bitcode/2006-12-11-Cast-ConstExpr.ll
+++ b/test/Bitcode/2006-12-11-Cast-ConstExpr.ll

@@ -1,6 +1,7 @@
 ; This test ensures that we get a bitcast constant expression in and out,
 ; not a sitofp constant expression. 
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 ; CHECK: bitcast (
 
 @G = external global i32

diff --git a/test/Bitcode/2009-06-11-FirstClassAggregateConstant.ll b/test/Bitcode/2009-06-11-FirstClassAggregateConstant.ll
index 415f88e..9405fbb 100644
--- a/test/Bitcode/2009-06-11-FirstClassAggregateConstant.ll
+++ b/test/Bitcode/2009-06-11-FirstClassAggregateConstant.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis -disable-output
+; RUN: verify-uselistorder < %s
 ; PR4373
 
 @foo = weak global { i32 } zeroinitializer              

diff --git a/test/Bitcode/aggregateInstructions.3.2.ll b/test/Bitcode/aggregateInstructions.3.2.ll
index 9352390..59aafd1 100644
--- a/test/Bitcode/aggregateInstructions.3.2.ll
+++ b/test/Bitcode/aggregateInstructions.3.2.ll

@@ -1,33 +1,34 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; aggregateOperations.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread instructions with aggregate operands

-; in older bitcode files.

-

-define void @extractvalue([4 x i8] %x1, [4 x [4 x i8]] %x2, {{i32, float}} %x3){

-entry:

-; CHECK: %res1 = extractvalue [4 x i8] %x1, 0

-  %res1 = extractvalue [4 x i8] %x1, 0

-  

-; CHECK-NEXT: %res2 = extractvalue [4 x [4 x i8]] %x2, 1

-  %res2 = extractvalue [4 x [4 x i8 ]] %x2, 1

-

-; CHECK-NEXT: %res3 = extractvalue [4 x [4 x i8]] %x2, 0, 1

-  %res3 = extractvalue [4 x [4 x i8 ]] %x2, 0, 1

-  

-; CHECK-NEXT: %res4 = extractvalue { { i32, float } } %x3, 0, 1

-  %res4 = extractvalue {{i32, float}} %x3, 0, 1

-

-  ret void

-}

-

-define void @insertvalue([4 x [4 x i8 ]] %x1){

-entry:

-; CHECK: %res1 = insertvalue [4 x [4 x i8]] %x1, i8 0, 0, 0

-  %res1 = insertvalue [4 x [4 x i8 ]] %x1, i8 0, 0, 0

-  

-; CHECK-NEXT: %res2 = insertvalue [4 x [4 x i8]] undef, i8 0, 0, 0

-  %res2 = insertvalue [4 x [4 x i8 ]] undef, i8 0, 0, 0

-

-  ret void

-}
\ No newline at end of file
+; RUN: llvm-dis < %s.bc| FileCheck %s
+; RUN: verify-uselistorder < %s.bc
+
+; aggregateInstructions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread instructions with aggregate operands
+; in older bitcode files.
+
+define void @extractvalue([4 x i8] %x1, [4 x [4 x i8]] %x2, {{i32, float}} %x3){
+entry:
+; CHECK: %res1 = extractvalue [4 x i8] %x1, 0
+  %res1 = extractvalue [4 x i8] %x1, 0
+
+; CHECK-NEXT: %res2 = extractvalue [4 x [4 x i8]] %x2, 1
+  %res2 = extractvalue [4 x [4 x i8 ]] %x2, 1
+
+; CHECK-NEXT: %res3 = extractvalue [4 x [4 x i8]] %x2, 0, 1
+  %res3 = extractvalue [4 x [4 x i8 ]] %x2, 0, 1
+
+; CHECK-NEXT: %res4 = extractvalue { { i32, float } } %x3, 0, 1
+  %res4 = extractvalue {{i32, float}} %x3, 0, 1
+
+  ret void
+}
+
+define void @insertvalue([4 x [4 x i8 ]] %x1){
+entry:
+; CHECK: %res1 = insertvalue [4 x [4 x i8]] %x1, i8 0, 0, 0
+  %res1 = insertvalue [4 x [4 x i8 ]] %x1, i8 0, 0, 0
+
+; CHECK-NEXT: %res2 = insertvalue [4 x [4 x i8]] undef, i8 0, 0, 0
+  %res2 = insertvalue [4 x [4 x i8 ]] undef, i8 0, 0, 0
+
+  ret void
+}

diff --git a/test/Bitcode/arm32_neon_vcnt_upgrade.ll b/test/Bitcode/arm32_neon_vcnt_upgrade.ll
index 10b9284..ed3981b 100644
--- a/test/Bitcode/arm32_neon_vcnt_upgrade.ll
+++ b/test/Bitcode/arm32_neon_vcnt_upgrade.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 ; Tests vclz and vcnt
 
 define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {

diff --git a/test/Bitcode/atomic.ll b/test/Bitcode/atomic.ll
index 37815a7..c09e74c 100644
--- a/test/Bitcode/atomic.ll
+++ b/test/Bitcode/atomic.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 
 define void @test_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
   cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
@@ -14,4 +15,4 @@
   ; CHECK: cmpxchg weak volatile i32* %addr, i32 %desired, i32 %new singlethread release monotonic
 
   ret void
-}
\ No newline at end of file
+}

diff --git a/test/Bitcode/attributes-3.3.ll b/test/Bitcode/attributes-3.3.ll
index cd70ba1..b564425 100644
--- a/test/Bitcode/attributes-3.3.ll
+++ b/test/Bitcode/attributes-3.3.ll

@@ -1,4 +1,5 @@
 ; RUN:  llvm-dis < %s.bc| FileCheck %s
+; RUN:  verify-uselistorder < %s.bc
 
 ; attributes-3.3.ll.bc was generated by passing this file to llvm-as-3.3.
 ; The test checks that LLVM does not silently misread attributes of

diff --git a/test/Bitcode/attributes.ll b/test/Bitcode/attributes.ll
index 49366de..c75ee80 100644
--- a/test/Bitcode/attributes.ll
+++ b/test/Bitcode/attributes.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 ; PR12696
 
 define void @f1(i8 zeroext)
@@ -229,6 +230,21 @@
     unreachable
 }
 
+define dereferenceable(2) i8* @f39(i8* dereferenceable(1) %a) {
+; CHECK: define dereferenceable(2) i8* @f39(i8* dereferenceable(1) %a) {
+        ret i8* %a
+}
+
+define dereferenceable(18446744073709551606) i8* @f40(i8* dereferenceable(18446744073709551615) %a) {
+; CHECK: define dereferenceable(18446744073709551606) i8* @f40(i8* dereferenceable(18446744073709551615) %a) {
+        ret i8* %a
+}
+
+define void @f41(i8* align 32, double* align 64) {
+; CHECK: define void @f41(i8* align 32, double* align 64) {
+        ret void
+}
+
 ; CHECK: attributes #0 = { noreturn }
 ; CHECK: attributes #1 = { nounwind }
 ; CHECK: attributes #2 = { readnone }

diff --git a/test/Bitcode/binaryFloatInstructions.3.2.ll b/test/Bitcode/binaryFloatInstructions.3.2.ll
index f94d82d..cec1683 100644
--- a/test/Bitcode/binaryFloatInstructions.3.2.ll
+++ b/test/Bitcode/binaryFloatInstructions.3.2.ll

@@ -1,120 +1,121 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; BinaryFloatOperation.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread binary float instructions from

-; older bitcode files.

-

-define void @fadd(float %x1, double %x2 ,half %x3, fp128 %x4, x86_fp80 %x5, ppc_fp128 %x6){

-entry:

-; CHECK: %res1 = fadd float %x1, %x1

-  %res1 = fadd float %x1, %x1

-

-; CHECK-NEXT: %res2 = fadd double %x2, %x2

-  %res2 = fadd double %x2, %x2

-

-; CHECK-NEXT: %res3 = fadd half %x3, %x3

-  %res3 = fadd half %x3, %x3

-

-; CHECK-NEXT: %res4 = fadd fp128 %x4, %x4

-  %res4 = fadd fp128 %x4, %x4

-

-; CHECK-NEXT: %res5 = fadd x86_fp80 %x5, %x5

-  %res5 = fadd x86_fp80 %x5, %x5

-  

-; CHECK-NEXT: %res6 = fadd ppc_fp128 %x6, %x6

-  %res6 = fadd ppc_fp128 %x6, %x6

-  

-  ret void

-}

-

-define void @faddFloatVec(<2 x float> %x1, <3 x float> %x2 ,<4 x float> %x3, <8 x float> %x4, <16 x float> %x5){

-entry:

-; CHECK: %res1 = fadd <2 x float> %x1, %x1

-  %res1 = fadd <2 x float> %x1, %x1

-

-; CHECK-NEXT: %res2 = fadd <3 x float> %x2, %x2

-  %res2 = fadd <3 x float> %x2, %x2

-

-; CHECK-NEXT: %res3 = fadd <4 x float> %x3, %x3

-  %res3 = fadd <4 x float> %x3, %x3

-

-; CHECK-NEXT: %res4 = fadd <8 x float> %x4, %x4

-  %res4 = fadd <8 x float> %x4, %x4

-

-; CHECK-NEXT: %res5 = fadd <16 x float> %x5, %x5

-  %res5 = fadd <16 x float> %x5, %x5

-  

-  ret void

-}

-

-define void @faddDoubleVec(<2 x double> %x1, <3 x double> %x2 ,<4 x double> %x3, <8 x double> %x4, <16 x double> %x5){

-entry:

-; CHECK: %res1 = fadd <2 x double> %x1, %x1

-  %res1 = fadd <2 x double> %x1, %x1

-

-; CHECK-NEXT: %res2 = fadd <3 x double> %x2, %x2

-  %res2 = fadd <3 x double> %x2, %x2

-

-; CHECK-NEXT: %res3 = fadd <4 x double> %x3, %x3

-  %res3 = fadd <4 x double> %x3, %x3

-

-; CHECK-NEXT: %res4 = fadd <8 x double> %x4, %x4

-  %res4 = fadd <8 x double> %x4, %x4

-

-; CHECK-NEXT: %res5 = fadd <16 x double> %x5, %x5

-  %res5 = fadd <16 x double> %x5, %x5

-  

-  ret void

-}

-

-define void @faddHalfVec(<2 x half> %x1, <3 x half> %x2 ,<4 x half> %x3, <8 x half> %x4, <16 x half> %x5){

-entry:

-; CHECK: %res1 = fadd <2 x half> %x1, %x1

-  %res1 = fadd <2 x half> %x1, %x1

-

-; CHECK-NEXT: %res2 = fadd <3 x half> %x2, %x2

-  %res2 = fadd <3 x half> %x2, %x2

-

-; CHECK-NEXT: %res3 = fadd <4 x half> %x3, %x3

-  %res3 = fadd <4 x half> %x3, %x3

-

-; CHECK-NEXT: %res4 = fadd <8 x half> %x4, %x4

-  %res4 = fadd <8 x half> %x4, %x4

-

-; CHECK-NEXT: %res5 = fadd <16 x half> %x5, %x5

-  %res5 = fadd <16 x half> %x5, %x5

-  

-  ret void

-}

-

-define void @fsub(float %x1){

-entry:

-; CHECK: %res1 = fsub float %x1, %x1

-  %res1 = fsub float %x1, %x1

-

-  ret void

-}

-

-define void @fmul(float %x1){

-entry:

-; CHECK: %res1 = fmul float %x1, %x1

-  %res1 = fmul float %x1, %x1

-  

-  ret void

-}

-

-define void @fdiv(float %x1){

-entry:

-; CHECK: %res1 = fdiv float %x1, %x1

-  %res1 = fdiv float %x1, %x1

-  

-  ret void

-}

-

-define void @frem(float %x1){

-entry:

-; CHECK: %res1 = frem float %x1, %x1

-  %res1 = frem float %x1, %x1

-

-  ret void

-}

+; RUN: llvm-dis < %s.bc| FileCheck %s
+; RUN: verify-uselistorder < %s.bc
+
+; binaryFloatInstructions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread binary float instructions from
+; older bitcode files.
+
+define void @fadd(float %x1, double %x2 ,half %x3, fp128 %x4, x86_fp80 %x5, ppc_fp128 %x6){
+entry:
+; CHECK: %res1 = fadd float %x1, %x1
+  %res1 = fadd float %x1, %x1
+
+; CHECK-NEXT: %res2 = fadd double %x2, %x2
+  %res2 = fadd double %x2, %x2
+
+; CHECK-NEXT: %res3 = fadd half %x3, %x3
+  %res3 = fadd half %x3, %x3
+
+; CHECK-NEXT: %res4 = fadd fp128 %x4, %x4
+  %res4 = fadd fp128 %x4, %x4
+
+; CHECK-NEXT: %res5 = fadd x86_fp80 %x5, %x5
+  %res5 = fadd x86_fp80 %x5, %x5
+
+; CHECK-NEXT: %res6 = fadd ppc_fp128 %x6, %x6
+  %res6 = fadd ppc_fp128 %x6, %x6
+
+  ret void
+}
+
+define void @faddFloatVec(<2 x float> %x1, <3 x float> %x2 ,<4 x float> %x3, <8 x float> %x4, <16 x float> %x5){
+entry:
+; CHECK: %res1 = fadd <2 x float> %x1, %x1
+  %res1 = fadd <2 x float> %x1, %x1
+
+; CHECK-NEXT: %res2 = fadd <3 x float> %x2, %x2
+  %res2 = fadd <3 x float> %x2, %x2
+
+; CHECK-NEXT: %res3 = fadd <4 x float> %x3, %x3
+  %res3 = fadd <4 x float> %x3, %x3
+
+; CHECK-NEXT: %res4 = fadd <8 x float> %x4, %x4
+  %res4 = fadd <8 x float> %x4, %x4
+
+; CHECK-NEXT: %res5 = fadd <16 x float> %x5, %x5
+  %res5 = fadd <16 x float> %x5, %x5
+
+  ret void
+}
+
+define void @faddDoubleVec(<2 x double> %x1, <3 x double> %x2 ,<4 x double> %x3, <8 x double> %x4, <16 x double> %x5){
+entry:
+; CHECK: %res1 = fadd <2 x double> %x1, %x1
+  %res1 = fadd <2 x double> %x1, %x1
+
+; CHECK-NEXT: %res2 = fadd <3 x double> %x2, %x2
+  %res2 = fadd <3 x double> %x2, %x2
+
+; CHECK-NEXT: %res3 = fadd <4 x double> %x3, %x3
+  %res3 = fadd <4 x double> %x3, %x3
+
+; CHECK-NEXT: %res4 = fadd <8 x double> %x4, %x4
+  %res4 = fadd <8 x double> %x4, %x4
+
+; CHECK-NEXT: %res5 = fadd <16 x double> %x5, %x5
+  %res5 = fadd <16 x double> %x5, %x5
+
+  ret void
+}
+
+define void @faddHalfVec(<2 x half> %x1, <3 x half> %x2 ,<4 x half> %x3, <8 x half> %x4, <16 x half> %x5){
+entry:
+; CHECK: %res1 = fadd <2 x half> %x1, %x1
+  %res1 = fadd <2 x half> %x1, %x1
+
+; CHECK-NEXT: %res2 = fadd <3 x half> %x2, %x2
+  %res2 = fadd <3 x half> %x2, %x2
+
+; CHECK-NEXT: %res3 = fadd <4 x half> %x3, %x3
+  %res3 = fadd <4 x half> %x3, %x3
+
+; CHECK-NEXT: %res4 = fadd <8 x half> %x4, %x4
+  %res4 = fadd <8 x half> %x4, %x4
+
+; CHECK-NEXT: %res5 = fadd <16 x half> %x5, %x5
+  %res5 = fadd <16 x half> %x5, %x5
+
+  ret void
+}
+
+define void @fsub(float %x1){
+entry:
+; CHECK: %res1 = fsub float %x1, %x1
+  %res1 = fsub float %x1, %x1
+
+  ret void
+}
+
+define void @fmul(float %x1){
+entry:
+; CHECK: %res1 = fmul float %x1, %x1
+  %res1 = fmul float %x1, %x1
+
+  ret void
+}
+
+define void @fdiv(float %x1){
+entry:
+; CHECK: %res1 = fdiv float %x1, %x1
+  %res1 = fdiv float %x1, %x1
+
+  ret void
+}
+
+define void @frem(float %x1){
+entry:
+; CHECK: %res1 = frem float %x1, %x1
+  %res1 = frem float %x1, %x1
+
+  ret void
+}

diff --git a/test/Bitcode/binaryIntInstructions.3.2.ll b/test/Bitcode/binaryIntInstructions.3.2.ll
index b08501c..e484ff1 100644
--- a/test/Bitcode/binaryIntInstructions.3.2.ll
+++ b/test/Bitcode/binaryIntInstructions.3.2.ll

@@ -1,177 +1,178 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; BinaryIntOperation.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread binary integer instructions from

-; older bitcode files.

-

-define void @add(i1 %x1, i8 %x2 ,i16 %x3, i32 %x4, i64 %x5){

-entry:

-; CHECK: %res1 = add i1 %x1, %x1

-  %res1 = add i1 %x1, %x1

-

-; CHECK-NEXT: %res2 = add i8 %x2, %x2

-  %res2 = add i8 %x2, %x2

-

-; CHECK-NEXT: %res3 = add i16 %x3, %x3

-  %res3 = add i16 %x3, %x3

-

-; CHECK-NEXT: %res4 = add i32 %x4, %x4

-  %res4 = add i32 %x4, %x4

-

-; CHECK-NEXT: %res5 = add i64 %x5, %x5

-  %res5 = add i64 %x5, %x5

-  

-; CHECK: %res6 = add nuw i1 %x1, %x1

-  %res6 = add nuw i1 %x1, %x1

-  

-; CHECK: %res7 = add nsw i1 %x1, %x1

-  %res7 = add nsw i1 %x1, %x1

-  

-; CHECK: %res8 = add nuw nsw i1 %x1, %x1

-  %res8 = add nuw nsw i1 %x1, %x1

-  

-  ret void

-}

-

-define void @addvec8NuwNsw(<2 x i8> %x1, <3 x i8> %x2 ,<4 x i8> %x3, <8 x i8> %x4, <16 x i8> %x5){

-entry:

-; CHECK: %res1 = add nuw nsw <2 x i8> %x1, %x1

-  %res1 = add nuw nsw <2 x i8> %x1, %x1

-

-; CHECK-NEXT: %res2 = add nuw nsw <3 x i8> %x2, %x2

-  %res2 = add nuw nsw <3 x i8> %x2, %x2

-

-; CHECK-NEXT: %res3 = add nuw nsw <4 x i8> %x3, %x3

-  %res3 = add nuw nsw <4 x i8> %x3, %x3

-

-; CHECK-NEXT: %res4 = add nuw nsw <8 x i8> %x4, %x4

-  %res4 = add nuw nsw <8 x i8> %x4, %x4

-  

-; CHECK-NEXT: %res5 = add nuw nsw <16 x i8> %x5, %x5

-  %res5 = add nuw nsw <16 x i8> %x5, %x5

-  

-  ret void

-}

-

-define void @addvec16NuwNsw(<2 x i16> %x1, <3 x i16> %x2 ,<4 x i16> %x3, <8 x i16> %x4, <16 x i16> %x5){

-entry:

-; CHECK: %res1 = add nuw nsw <2 x i16> %x1, %x1

-  %res1 = add nuw nsw <2 x i16> %x1, %x1

-

-; CHECK-NEXT: %res2 = add nuw nsw <3 x i16> %x2, %x2

-  %res2 = add nuw nsw <3 x i16> %x2, %x2

-

-; CHECK-NEXT: %res3 = add nuw nsw <4 x i16> %x3, %x3

-  %res3 = add nuw nsw <4 x i16> %x3, %x3

-

-; CHECK-NEXT: %res4 = add nuw nsw <8 x i16> %x4, %x4

-  %res4 = add nuw nsw <8 x i16> %x4, %x4

-  

-; CHECK-NEXT: %res5 = add nuw nsw <16 x i16> %x5, %x5

-  %res5 = add nuw nsw <16 x i16> %x5, %x5

-  

-  ret void

-}

-

-define void @addvec32NuwNsw(<2 x i32> %x1, <3 x i32> %x2 ,<4 x i32> %x3, <8 x i32> %x4, <16 x i32> %x5){

-entry:

-; CHECK: %res1 = add nuw nsw <2 x i32> %x1, %x1

-  %res1 = add nuw nsw <2 x i32> %x1, %x1

-

-; CHECK-NEXT: %res2 = add nuw nsw <3 x i32> %x2, %x2

-  %res2 = add nuw nsw <3 x i32> %x2, %x2

-

-; CHECK-NEXT: %res3 = add nuw nsw <4 x i32> %x3, %x3

-  %res3 = add nuw nsw <4 x i32> %x3, %x3

-

-; CHECK-NEXT: %res4 = add nuw nsw <8 x i32> %x4, %x4

-  %res4 = add nuw nsw <8 x i32> %x4, %x4

-  

-; CHECK-NEXT: %res5 = add nuw nsw <16 x i32> %x5, %x5

-  %res5 = add nuw nsw <16 x i32> %x5, %x5

-  

-  ret void

-}

-

-define void @addvec64NuwNsw(<2 x i64> %x1, <3 x i64> %x2 ,<4 x i64> %x3, <8 x i64> %x4, <16 x i64> %x5){

-entry:

-; CHECK: %res1 = add nuw nsw <2 x i64> %x1, %x1

-  %res1 = add nuw nsw <2 x i64> %x1, %x1

-

-; CHECK-NEXT: %res2 = add nuw nsw <3 x i64> %x2, %x2

-  %res2 = add nuw nsw <3 x i64> %x2, %x2

-

-; CHECK-NEXT: %res3 = add nuw nsw <4 x i64> %x3, %x3

-  %res3 = add nuw nsw <4 x i64> %x3, %x3

-

-; CHECK-NEXT: %res4 = add nuw nsw <8 x i64> %x4, %x4

-  %res4 = add nuw nsw <8 x i64> %x4, %x4

-  

-; CHECK-NEXT: %res5 = add nuw nsw <16 x i64> %x5, %x5

-  %res5 = add nuw nsw <16 x i64> %x5, %x5

-  

-  ret void

-}

-

-define void @sub(i8 %x1){

-entry:

-; CHECK: %res1 = sub i8 %x1, %x1

-  %res1 = sub i8 %x1, %x1

-  

-; CHECK: %res2 = sub nuw i8 %x1, %x1

-  %res2 = sub nuw i8 %x1, %x1

-  

-; CHECK: %res3 = sub nsw i8 %x1, %x1

-  %res3 = sub nsw i8 %x1, %x1

-  

-; CHECK: %res4 = sub nuw nsw i8 %x1, %x1

-  %res4 = sub nuw nsw i8 %x1, %x1

-  

-  ret void

-}

-

-define void @mul(i8 %x1){

-entry:

-; CHECK: %res1 = mul i8 %x1, %x1

-  %res1 = mul i8 %x1, %x1

-  

-  ret void

-}

-

-define void @udiv(i8 %x1){

-entry:

-; CHECK: %res1 = udiv i8 %x1, %x1

-  %res1 = udiv i8 %x1, %x1

-  

-; CHECK-NEXT: %res2 = udiv exact i8 %x1, %x1

-  %res2 = udiv exact i8 %x1, %x1

-

-  ret void

-}

-

-define void @sdiv(i8 %x1){

-entry:

-; CHECK: %res1 = sdiv i8 %x1, %x1

-  %res1 = sdiv i8 %x1, %x1

-  

-; CHECK-NEXT: %res2 = sdiv exact i8 %x1, %x1

-  %res2 = sdiv exact i8 %x1, %x1

-

-  ret void

-}

-

-define void @urem(i32 %x1){

-entry:

-; CHECK: %res1 = urem i32 %x1, %x1

-  %res1 = urem i32 %x1, %x1

-  

-  ret void

-}

-

-define void @srem(i32 %x1){

-entry:

-; CHECK: %res1 = srem i32 %x1, %x1

-  %res1 = srem i32 %x1, %x1

-  

-  ret void

-}

+; RUN: llvm-dis < %s.bc| FileCheck %s
+; RUN: verify-uselistorder < %s.bc
+
+; binaryIntInstructions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread binary integer instructions from
+; older bitcode files.
+
+define void @add(i1 %x1, i8 %x2 ,i16 %x3, i32 %x4, i64 %x5){
+entry:
+; CHECK: %res1 = add i1 %x1, %x1
+  %res1 = add i1 %x1, %x1
+
+; CHECK-NEXT: %res2 = add i8 %x2, %x2
+  %res2 = add i8 %x2, %x2
+
+; CHECK-NEXT: %res3 = add i16 %x3, %x3
+  %res3 = add i16 %x3, %x3
+
+; CHECK-NEXT: %res4 = add i32 %x4, %x4
+  %res4 = add i32 %x4, %x4
+
+; CHECK-NEXT: %res5 = add i64 %x5, %x5
+  %res5 = add i64 %x5, %x5
+
+; CHECK: %res6 = add nuw i1 %x1, %x1
+  %res6 = add nuw i1 %x1, %x1
+
+; CHECK: %res7 = add nsw i1 %x1, %x1
+  %res7 = add nsw i1 %x1, %x1
+
+; CHECK: %res8 = add nuw nsw i1 %x1, %x1
+  %res8 = add nuw nsw i1 %x1, %x1
+
+  ret void
+}
+
+define void @addvec8NuwNsw(<2 x i8> %x1, <3 x i8> %x2 ,<4 x i8> %x3, <8 x i8> %x4, <16 x i8> %x5){
+entry:
+; CHECK: %res1 = add nuw nsw <2 x i8> %x1, %x1
+  %res1 = add nuw nsw <2 x i8> %x1, %x1
+
+; CHECK-NEXT: %res2 = add nuw nsw <3 x i8> %x2, %x2
+  %res2 = add nuw nsw <3 x i8> %x2, %x2
+
+; CHECK-NEXT: %res3 = add nuw nsw <4 x i8> %x3, %x3
+  %res3 = add nuw nsw <4 x i8> %x3, %x3
+
+; CHECK-NEXT: %res4 = add nuw nsw <8 x i8> %x4, %x4
+  %res4 = add nuw nsw <8 x i8> %x4, %x4
+
+; CHECK-NEXT: %res5 = add nuw nsw <16 x i8> %x5, %x5
+  %res5 = add nuw nsw <16 x i8> %x5, %x5
+
+  ret void
+}
+
+define void @addvec16NuwNsw(<2 x i16> %x1, <3 x i16> %x2 ,<4 x i16> %x3, <8 x i16> %x4, <16 x i16> %x5){
+entry:
+; CHECK: %res1 = add nuw nsw <2 x i16> %x1, %x1
+  %res1 = add nuw nsw <2 x i16> %x1, %x1
+
+; CHECK-NEXT: %res2 = add nuw nsw <3 x i16> %x2, %x2
+  %res2 = add nuw nsw <3 x i16> %x2, %x2
+
+; CHECK-NEXT: %res3 = add nuw nsw <4 x i16> %x3, %x3
+  %res3 = add nuw nsw <4 x i16> %x3, %x3
+
+; CHECK-NEXT: %res4 = add nuw nsw <8 x i16> %x4, %x4
+  %res4 = add nuw nsw <8 x i16> %x4, %x4
+
+; CHECK-NEXT: %res5 = add nuw nsw <16 x i16> %x5, %x5
+  %res5 = add nuw nsw <16 x i16> %x5, %x5
+
+  ret void
+}
+
+define void @addvec32NuwNsw(<2 x i32> %x1, <3 x i32> %x2 ,<4 x i32> %x3, <8 x i32> %x4, <16 x i32> %x5){
+entry:
+; CHECK: %res1 = add nuw nsw <2 x i32> %x1, %x1
+  %res1 = add nuw nsw <2 x i32> %x1, %x1
+
+; CHECK-NEXT: %res2 = add nuw nsw <3 x i32> %x2, %x2
+  %res2 = add nuw nsw <3 x i32> %x2, %x2
+
+; CHECK-NEXT: %res3 = add nuw nsw <4 x i32> %x3, %x3
+  %res3 = add nuw nsw <4 x i32> %x3, %x3
+
+; CHECK-NEXT: %res4 = add nuw nsw <8 x i32> %x4, %x4
+  %res4 = add nuw nsw <8 x i32> %x4, %x4
+
+; CHECK-NEXT: %res5 = add nuw nsw <16 x i32> %x5, %x5
+  %res5 = add nuw nsw <16 x i32> %x5, %x5
+
+  ret void
+}
+
+define void @addvec64NuwNsw(<2 x i64> %x1, <3 x i64> %x2 ,<4 x i64> %x3, <8 x i64> %x4, <16 x i64> %x5){
+entry:
+; CHECK: %res1 = add nuw nsw <2 x i64> %x1, %x1
+  %res1 = add nuw nsw <2 x i64> %x1, %x1
+
+; CHECK-NEXT: %res2 = add nuw nsw <3 x i64> %x2, %x2
+  %res2 = add nuw nsw <3 x i64> %x2, %x2
+
+; CHECK-NEXT: %res3 = add nuw nsw <4 x i64> %x3, %x3
+  %res3 = add nuw nsw <4 x i64> %x3, %x3
+
+; CHECK-NEXT: %res4 = add nuw nsw <8 x i64> %x4, %x4
+  %res4 = add nuw nsw <8 x i64> %x4, %x4
+
+; CHECK-NEXT: %res5 = add nuw nsw <16 x i64> %x5, %x5
+  %res5 = add nuw nsw <16 x i64> %x5, %x5
+
+  ret void
+}
+
+define void @sub(i8 %x1){
+entry:
+; CHECK: %res1 = sub i8 %x1, %x1
+  %res1 = sub i8 %x1, %x1
+
+; CHECK: %res2 = sub nuw i8 %x1, %x1
+  %res2 = sub nuw i8 %x1, %x1
+
+; CHECK: %res3 = sub nsw i8 %x1, %x1
+  %res3 = sub nsw i8 %x1, %x1
+
+; CHECK: %res4 = sub nuw nsw i8 %x1, %x1
+  %res4 = sub nuw nsw i8 %x1, %x1
+
+  ret void
+}
+
+define void @mul(i8 %x1){
+entry:
+; CHECK: %res1 = mul i8 %x1, %x1
+  %res1 = mul i8 %x1, %x1
+
+  ret void
+}
+
+define void @udiv(i8 %x1){
+entry:
+; CHECK: %res1 = udiv i8 %x1, %x1
+  %res1 = udiv i8 %x1, %x1
+
+; CHECK-NEXT: %res2 = udiv exact i8 %x1, %x1
+  %res2 = udiv exact i8 %x1, %x1
+
+  ret void
+}
+
+define void @sdiv(i8 %x1){
+entry:
+; CHECK: %res1 = sdiv i8 %x1, %x1
+  %res1 = sdiv i8 %x1, %x1
+
+; CHECK-NEXT: %res2 = sdiv exact i8 %x1, %x1
+  %res2 = sdiv exact i8 %x1, %x1
+
+  ret void
+}
+
+define void @urem(i32 %x1){
+entry:
+; CHECK: %res1 = urem i32 %x1, %x1
+  %res1 = urem i32 %x1, %x1
+
+  ret void
+}
+
+define void @srem(i32 %x1){
+entry:
+; CHECK: %res1 = srem i32 %x1, %x1
+  %res1 = srem i32 %x1, %x1
+
+  ret void
+}

diff --git a/test/Bitcode/bitwiseInstructions.3.2.ll b/test/Bitcode/bitwiseInstructions.3.2.ll
index 6225a08..aaaf4f5 100644
--- a/test/Bitcode/bitwiseInstructions.3.2.ll
+++ b/test/Bitcode/bitwiseInstructions.3.2.ll

@@ -1,68 +1,69 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; bitwiseOperations.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread bitwise instructions from

-; older bitcode files.

-

-define void @shl(i8 %x1){

-entry:

-; CHECK: %res1 = shl i8 %x1, %x1

-  %res1 = shl i8 %x1, %x1

-  

-; CHECK: %res2 = shl nuw i8 %x1, %x1

-  %res2 = shl nuw i8 %x1, %x1

-  

-; CHECK: %res3 = shl nsw i8 %x1, %x1

-  %res3 = shl nsw i8 %x1, %x1

-

-; CHECK: %res4 = shl nuw nsw i8 %x1, %x1

-  %res4 = shl nuw nsw i8 %x1, %x1

-  

-  ret void

-}

-

-define void @lshr(i8 %x1){

-entry:

-; CHECK: %res1 = lshr i8 %x1, %x1

-  %res1 = lshr i8 %x1, %x1

-  

-; CHECK: %res2 = lshr exact i8 %x1, %x1

-  %res2 = lshr exact i8 %x1, %x1

-  

-  ret void

-}

-

-define void @ashr(i8 %x1){

-entry:

-; CHECK: %res1 = ashr i8 %x1, %x1

-  %res1 = ashr i8 %x1, %x1

-

-; CHECK-NEXT: %res2 = ashr exact i8 %x1, %x1

-  %res2 = ashr exact i8 %x1, %x1

-

-  ret void

-}

-

-define void @and(i8 %x1){

-entry:

-; CHECK: %res1 = and i8 %x1, %x1

-  %res1 = and i8 %x1, %x1

-  

-  ret void

-}

-

-define void @or(i8 %x1){

-entry:

-; CHECK: %res1 = or i8 %x1, %x1

-  %res1 = or i8 %x1, %x1

-  

-  ret void

-}

-

-define void @xor(i8 %x1){

-entry:

-; CHECK: %res1 = xor i8 %x1, %x1

-  %res1 = xor i8 %x1, %x1

-  

-  ret void

-}
\ No newline at end of file
+; RUN: llvm-dis < %s.bc| FileCheck %s
+; RUN: verify-uselistorder < %s.bc
+
+; bitwiseInstructions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread bitwise instructions from
+; older bitcode files.
+
+define void @shl(i8 %x1){
+entry:
+; CHECK: %res1 = shl i8 %x1, %x1
+  %res1 = shl i8 %x1, %x1
+
+; CHECK: %res2 = shl nuw i8 %x1, %x1
+  %res2 = shl nuw i8 %x1, %x1
+
+; CHECK: %res3 = shl nsw i8 %x1, %x1
+  %res3 = shl nsw i8 %x1, %x1
+
+; CHECK: %res4 = shl nuw nsw i8 %x1, %x1
+  %res4 = shl nuw nsw i8 %x1, %x1
+
+  ret void
+}
+
+define void @lshr(i8 %x1){
+entry:
+; CHECK: %res1 = lshr i8 %x1, %x1
+  %res1 = lshr i8 %x1, %x1
+
+; CHECK: %res2 = lshr exact i8 %x1, %x1
+  %res2 = lshr exact i8 %x1, %x1
+
+  ret void
+}
+
+define void @ashr(i8 %x1){
+entry:
+; CHECK: %res1 = ashr i8 %x1, %x1
+  %res1 = ashr i8 %x1, %x1
+
+; CHECK-NEXT: %res2 = ashr exact i8 %x1, %x1
+  %res2 = ashr exact i8 %x1, %x1
+
+  ret void
+}
+
+define void @and(i8 %x1){
+entry:
+; CHECK: %res1 = and i8 %x1, %x1
+  %res1 = and i8 %x1, %x1
+
+  ret void
+}
+
+define void @or(i8 %x1){
+entry:
+; CHECK: %res1 = or i8 %x1, %x1
+  %res1 = or i8 %x1, %x1
+
+  ret void
+}
+
+define void @xor(i8 %x1){
+entry:
+; CHECK: %res1 = xor i8 %x1, %x1
+  %res1 = xor i8 %x1, %x1
+
+  ret void
+}

diff --git a/test/Bitcode/blockaddress.ll b/test/Bitcode/blockaddress.ll
index 8ac54be..db109df 100644
--- a/test/Bitcode/blockaddress.ll
+++ b/test/Bitcode/blockaddress.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 ; PR9857
 
 define void @f(i8** nocapture %ptr1) {
@@ -43,3 +44,17 @@
 end:
   ret void
 }
+
+; Check a blockaddress taken in two separate functions before the referenced
+; function.
+define i8* @take1() {
+  ret i8* blockaddress(@taken, %bb)
+}
+define i8* @take2() {
+  ret i8* blockaddress(@taken, %bb)
+}
+define void @taken() {
+  unreachable
+bb:
+  unreachable
+}

diff --git a/test/Bitcode/calling-conventions.3.2.ll b/test/Bitcode/calling-conventions.3.2.ll
index aca9efd..f36e9f8 100644
--- a/test/Bitcode/calling-conventions.3.2.ll
+++ b/test/Bitcode/calling-conventions.3.2.ll

@@ -1,4 +1,5 @@
 ; RUN:  llvm-dis < %s.bc| FileCheck %s
+; RUN:  verify-uselistorder < %s.bc
 
 ; calling-conventions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
 ; The test checks that LLVM does not silently misread calling conventions of

diff --git a/test/Bitcode/case-ranges-3.3.ll b/test/Bitcode/case-ranges-3.3.ll
index 6e1d0a6..020b37f 100644
--- a/test/Bitcode/case-ranges-3.3.ll
+++ b/test/Bitcode/case-ranges-3.3.ll

@@ -1,4 +1,5 @@
 ; RUN:  llvm-dis < %s.bc| FileCheck %s
+; RUN:  verify-uselistorder < %s.bc
 
 ; case-ranges.ll.bc was generated by passing this file to llvm-as from the 3.3
 ; release of LLVM. This tests that the bitcode for switches from that release

diff --git a/test/Bitcode/cmpxchg-upgrade.ll b/test/Bitcode/cmpxchg-upgrade.ll
index d36ac1c..125729e 100644
--- a/test/Bitcode/cmpxchg-upgrade.ll
+++ b/test/Bitcode/cmpxchg-upgrade.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: verify-uselistorder < %s.bc
 
 ; cmpxchg-upgrade.ll.bc was produced by running a version of llvm-as from just
 ; before the IR change on this file.
@@ -20,4 +21,4 @@
 ; CHECK: cmpxchg i32* %addr, i32 42, i32 0 seq_cst seq_cst
 
    ret void
-}
\ No newline at end of file
+}

diff --git a/test/Bitcode/constantsTest.3.2.ll b/test/Bitcode/constantsTest.3.2.ll
new file mode 100644
index 0000000..b4973cf
--- /dev/null
+++ b/test/Bitcode/constantsTest.3.2.ll

@@ -0,0 +1,124 @@
+; RUN:  llvm-dis < %s.bc| FileCheck %s
+
+; constantsTest.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread constants and constant expressions
+; from older bitcode files.
+
+;global variable address
+; CHECK: @X = global i32 0
+@X = global i32 0
+; CHECK: @Y = global i32 1
+@Y = global i32 1
+; CHECK: @Z = global [2 x i32*] [i32* @X, i32* @Y]
+@Z = global [2 x i32*] [i32* @X, i32* @Y]
+
+
+define void @SimpleConstants(i32 %x) {
+entry:
+; null
+; CHECK: store i32 %x, i32* null
+  store i32 %x, i32* null
+ 
+; boolean
+; CHECK-NEXT: %res1 = fcmp true float 1.000000e+00, 1.000000e+00 
+  %res1 = fcmp true float 1.0, 1.0
+; CHECK-NEXT: %res2 = fcmp false float 1.000000e+00, 1.000000e+00
+  %res2 = fcmp false float 1.0, 1.0
+
+;integer
+; CHECK-NEXT: %res3 = add i32 0, 0
+  %res3 = add i32 0, 0
+
+;float
+; CHECK-NEXT: %res4 = fadd float 0.000000e+00, 0.000000e+00
+  %res4 = fadd float 0.0, 0.0
+
+  ret void
+}
+
+define void @ComplexConstants(<2 x i32> %x){
+entry:
+;constant structure
+; CHECK: %res1 = extractvalue { i32, float } { i32 1, float 2.000000e+00 }, 0
+  %res1 = extractvalue {i32, float} {i32 1, float 2.0}, 0
+  
+;const array
+; CHECK-NEXT: %res2 = extractvalue [2 x i32] [i32 1, i32 2], 0
+  %res2 = extractvalue [2 x i32] [i32 1, i32 2], 0
+  
+;const vector
+; CHECK-NEXT: %res3 = add <2 x i32> <i32 1, i32 1>, <i32 1, i32 1>
+  %res3 = add <2 x i32> <i32 1, i32 1>, <i32 1, i32 1>
+  
+;zeroinitializer
+; CHECK-NEXT: %res4 = add <2 x i32> %x, zeroinitializer
+  %res4 = add <2 x i32> %x, zeroinitializer
+  
+  ret void
+}
+
+define void @OtherConstants(i32 %x, i8* %Addr){
+entry:
+  ;undef
+  ; CHECK: %res1 = add i32 %x, undef 
+  %res1 = add i32 %x, undef
+  
+  ;poison
+  ; CHECK-NEXT: %poison = sub nuw i32 0, 1
+  %poison = sub nuw i32 0, 1
+  
+  ;address of basic block
+  ; CHECK-NEXT: %res2 = icmp eq i8* blockaddress(@OtherConstants, %Next), null
+  %res2 = icmp eq i8* blockaddress(@OtherConstants, %Next), null
+  br label %Next
+  Next: 
+  ret void
+}
+
+define void @OtherConstants2(){
+entry:
+  ; CHECK: trunc i32 1 to i8
+  trunc i32 1 to i8
+  ; CHECK-NEXT: zext i8 1 to i32
+  zext i8 1 to i32
+  ; CHECK-NEXT: sext i8 1 to i32
+  sext i8 1 to i32
+  ; CHECK-NEXT: fptrunc double 1.000000e+00 to float
+  fptrunc double 1.0 to float
+  ; CHECK-NEXT: fpext float 1.000000e+00 to double
+  fpext float 1.0 to double
+  ; CHECK-NEXT: fptosi float 1.000000e+00 to i32
+  fptosi float 1.0 to i32
+  ; CHECK-NEXT: uitofp i32 1 to float
+  uitofp i32 1 to float
+  ; CHECK-NEXT: sitofp i32 -1 to float
+  sitofp i32 -1 to float
+  ; CHECK-NEXT: ptrtoint i32* @X to i32
+  ptrtoint i32* @X to i32
+  ; CHECK-NEXT: inttoptr i8 1 to i8*
+  inttoptr i8 1 to i8*
+  ; CHECK-NEXT: bitcast i32 1 to <2 x i16>
+  bitcast i32 1 to <2 x i16>
+  ; CHECK-NEXT: getelementptr i32* @X, i32 0
+  getelementptr i32* @X, i32 0
+  ; CHECK-NEXT: getelementptr inbounds i32* @X, i32 0
+  getelementptr inbounds i32* @X, i32 0
+  ; CHECK: select i1 true, i32 1, i32 0
+  select i1 true ,i32 1, i32 0
+  ; CHECK-NEXT: icmp eq i32 1, 0
+  icmp eq i32 1, 0
+  ; CHECK-NEXT: fcmp oeq float 1.000000e+00, 0.000000e+00
+  fcmp oeq float 1.0, 0.0
+  ; CHECK-NEXT: extractelement <2 x i32> <i32 1, i32 1>, i32 1
+  extractelement <2 x i32> <i32 1, i32 1>, i32 1
+  ; CHECK-NEXT: insertelement <2 x i32> <i32 1, i32 1>, i32 0, i32 1
+  insertelement <2 x i32> <i32 1, i32 1>, i32 0, i32 1
+  ; CHECK-NEXT: shufflevector <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  shufflevector <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ; CHECK-NEXT: extractvalue { i32, float } { i32 1, float 2.000000e+00 }, 0
+  extractvalue { i32, float } { i32 1, float 2.0 }, 0
+  ; CHECK-NEXT: insertvalue { i32, float } { i32 1, float 2.000000e+00 }, i32 0, 0
+  insertvalue { i32, float } { i32 1, float 2.0 }, i32 0, 0
+  
+  ret void
+}
\ No newline at end of file

diff --git a/test/Bitcode/constantsTest.3.2.ll.bc b/test/Bitcode/constantsTest.3.2.ll.bc
new file mode 100644
index 0000000..8454bef
--- /dev/null
+++ b/test/Bitcode/constantsTest.3.2.ll.bc
Binary files differ

diff --git a/test/Bitcode/conversionInstructions.3.2.ll b/test/Bitcode/conversionInstructions.3.2.ll
index 4b3f273..ae2d65e 100644
--- a/test/Bitcode/conversionInstructions.3.2.ll
+++ b/test/Bitcode/conversionInstructions.3.2.ll

@@ -1,104 +1,124 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; conversionOperations.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread conversion instructions from

-; older bitcode files.

-

-define void @trunc(i32 %src){

-entry:

-; CHECK: %res1 = trunc i32 %src to i8

-  %res1 = trunc i32 %src to i8

-    

-  ret void

-}

-

-define void @zext(i32 %src){

-entry:

-; CHECK: %res1 = zext i32 %src to i64

-  %res1 = zext i32 %src to i64

-    

-  ret void

-}

-

-define void @sext(i32 %src){

-entry:

-; CHECK: %res1 = sext i32 %src to i64

-  %res1 = sext i32 %src to i64

-    

-  ret void

-}

-

-define void @fptrunc(double %src){

-entry:

-; CHECK: %res1 = fptrunc double %src to float

-  %res1 = fptrunc double %src to float

-  

-  ret void

-}

-

-define void @fpext(float %src){

-entry:

-; CHECK: %res1 = fpext float %src to double

-  %res1 = fpext float %src to double

-  

-  ret void

-}

-

-define void @fptoui(float %src){

-entry:

-; CHECK: %res1 = fptoui float %src to i32

-  %res1 = fptoui float %src to i32

-  

-  ret void

-}

-

-define void @fptosi(float %src){

-entry:

-; CHECK: %res1 = fptosi float %src to i32

-  %res1 = fptosi float %src to i32

-  

-  ret void

-}

-

-define void @uitofp(i32 %src){

-entry:

-; CHECK: %res1 = uitofp i32 %src to float

-  %res1 = uitofp i32 %src to float

-  

-  ret void

-}

-

-define void @sitofp(i32 %src){

-entry:

-; CHECK: %res1 = sitofp i32 %src to float

-  %res1 = sitofp i32 %src to float

-  

-  ret void

-}

-

-define void @ptrtoint(i32* %src){

-entry:

-; CHECK: %res1 = ptrtoint i32* %src to i8

-  %res1 = ptrtoint i32* %src to i8

-  

-  ret void

-}

-

-define void @inttoptr(i32 %src){

-entry:

-; CHECK: %res1 = inttoptr i32 %src to i32*

-  %res1 = inttoptr i32 %src to i32*

-  

-  ret void

-}

-

-define void @bitcast(i32 %src1, i32* %src2){

-entry:

-; CHECK: %res1 = bitcast i32 %src1 to i32

-  %res1 = bitcast i32 %src1 to i32

-  

-; CHECK: %res2 = bitcast i32* %src2 to i64*

-  %res2 = bitcast i32* %src2 to i64*

-  

-  ret void

-}
\ No newline at end of file
+; RUN:  llvm-dis < %s.bc| FileCheck %s
+
+; conversionInstructions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread conversion instructions from
+; older bitcode files.
+
+define void @trunc(i32 %src){
+entry:
+; CHECK: %res1 = trunc i32 %src to i8
+  %res1 = trunc i32 %src to i8
+    
+  ret void
+}
+
+define void @zext(i32 %src){
+entry:
+; CHECK: %res1 = zext i32 %src to i64
+  %res1 = zext i32 %src to i64
+    
+  ret void
+}
+
+define void @sext(i32 %src){
+entry:
+; CHECK: %res1 = sext i32 %src to i64
+  %res1 = sext i32 %src to i64
+    
+  ret void
+}
+
+define void @fptrunc(double %src){
+entry:
+; CHECK: %res1 = fptrunc double %src to float
+  %res1 = fptrunc double %src to float
+  
+  ret void
+}
+
+define void @fpext(float %src){
+entry:
+; CHECK: %res1 = fpext float %src to double
+  %res1 = fpext float %src to double
+  
+  ret void
+}
+
+define void @fptoui(float %src){
+entry:
+; CHECK: %res1 = fptoui float %src to i32
+  %res1 = fptoui float %src to i32
+  
+  ret void
+}
+
+define void @fptosi(float %src){
+entry:
+; CHECK: %res1 = fptosi float %src to i32
+  %res1 = fptosi float %src to i32
+  
+  ret void
+}
+
+define void @uitofp(i32 %src){
+entry:
+; CHECK: %res1 = uitofp i32 %src to float
+  %res1 = uitofp i32 %src to float
+  
+  ret void
+}
+
+define void @sitofp(i32 %src){
+entry:
+; CHECK: %res1 = sitofp i32 %src to float
+  %res1 = sitofp i32 %src to float
+  
+  ret void
+}
+
+define void @ptrtoint(i32* %src){
+entry:
+; CHECK: %res1 = ptrtoint i32* %src to i8
+  %res1 = ptrtoint i32* %src to i8
+  
+  ret void
+}
+
+define void @inttoptr(i32 %src){
+entry:
+; CHECK: %res1 = inttoptr i32 %src to i32*
+  %res1 = inttoptr i32 %src to i32*
+  
+  ret void
+}
+
+define void @bitcast(i32 %src1, i32* %src2){
+entry:
+; CHECK: %res1 = bitcast i32 %src1 to i32
+  %res1 = bitcast i32 %src1 to i32
+  
+; CHECK: %res2 = bitcast i32* %src2 to i64*
+  %res2 = bitcast i32* %src2 to i64*
+  
+  ret void
+}
+
+define void @ptrtointInstr(i32* %ptr, <4 x i32*> %vecPtr){
+entry:
+; CHECK: %res1 = ptrtoint i32* %ptr to i8
+  %res1 = ptrtoint i32* %ptr to i8  
+; CHECK-NEXT: %res2 = ptrtoint <4 x i32*> %vecPtr to <4 x i64>
+  %res2 = ptrtoint <4 x i32*> %vecPtr to <4 x i64>
+  
+  ret void
+}
+
+define void @inttoptrInstr(i32 %x, <4 x i32> %vec){
+entry:
+; CHECK: %res1 = inttoptr i32 %x to i64*
+  %res1 = inttoptr i32 %x to i64*
+; CHECK-NEXT: %res2 = inttoptr <4 x i32> %vec to <4 x i8*>
+  %res2 = inttoptr <4 x i32> %vec to <4 x i8*>
+  
+  ret void
+}

diff --git a/test/Bitcode/conversionInstructions.3.2.ll.bc b/test/Bitcode/conversionInstructions.3.2.ll.bc
index fabf7da..a6f8a47 100644
--- a/test/Bitcode/conversionInstructions.3.2.ll.bc
+++ b/test/Bitcode/conversionInstructions.3.2.ll.bc
Binary files differ

diff --git a/test/Bitcode/deprecated-linker_private-linker_private_weak.ll b/test/Bitcode/deprecated-linker_private-linker_private_weak.ll
deleted file mode 100644
index 12a527c..0000000
--- a/test/Bitcode/deprecated-linker_private-linker_private_weak.ll
+++ /dev/null

@@ -1,17 +0,0 @@
-; RUN: llvm-as -o - %s | llvm-dis | FileCheck %s
-; RUN: llvm-as -o /dev/null %s 2>&1 | FileCheck %s -check-prefix CHECK-WARNINGS
-
-@.linker_private = linker_private unnamed_addr constant [15 x i8] c"linker_private\00", align 64
-@.linker_private_weak = linker_private_weak unnamed_addr constant [20 x i8] c"linker_private_weak\00", align 64
-
-; CHECK: @.linker_private = private unnamed_addr constant [15 x i8] c"linker_private\00", align 64
-; CHECK: @.linker_private_weak = private unnamed_addr constant [20 x i8] c"linker_private_weak\00", align 64
-
-; CHECK-WARNINGS: warning: '.linker_private' is deprecated, treating as PrivateLinkage
-; CHECK-WARNINGS: @.linker_private = linker_private unnamed_addr constant [15 x i8] c"linker_private\00", align 64
-; CHECK-WARNINGS:                    ^
-
-; CHECK-WARNINGS: warning: '.linker_private_weak' is deprecated, treating as PrivateLinkage
-; CHECK-WARNINGS: @.linker_private_weak = linker_private_weak unnamed_addr constant [20 x i8] c"linker_private_weak\00", align 64
-; CHECK-WARNINGS:                         ^
-

diff --git a/test/Bitcode/drop-debug-info.ll b/test/Bitcode/drop-debug-info.ll
index 5123018..a2f5694 100644
--- a/test/Bitcode/drop-debug-info.ll
+++ b/test/Bitcode/drop-debug-info.ll

@@ -1,5 +1,6 @@
 ; RUN: llvm-as < %s -o %t.bc 2>&1 >/dev/null | FileCheck -check-prefix=WARN %s
 ; RUN: llvm-dis < %t.bc | FileCheck %s
+; RUN: verify-uselistorder < %t.bc
 
 define i32 @main() {
 entry:
@@ -11,15 +12,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 (trunk 195495) (llvm/trunk 195495:195504M)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 (trunk 195495) (llvm/trunk 195495:195504M)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"../llvm/tools/clang/test/CodeGen/debug-info-version.c", metadata !"/Users/manmanren/llvm_gmail/release"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
 !12 = metadata !{i32 4, i32 0, metadata !4, null}
 

diff --git a/test/Bitcode/extractelement.ll b/test/Bitcode/extractelement.ll
index 8999c65..90a883d 100644
--- a/test/Bitcode/extractelement.ll
+++ b/test/Bitcode/extractelement.ll

@@ -1,4 +1,5 @@
 ; RUN: opt < %s -constprop | llvm-dis -disable-output
+; RUN: verify-uselistorder < %s
 ; PR3465
 
 define double @test() {

diff --git a/test/Bitcode/flags.ll b/test/Bitcode/flags.ll
index 7b0c5b5..6febaa6 100644
--- a/test/Bitcode/flags.ll
+++ b/test/Bitcode/flags.ll

@@ -1,6 +1,7 @@
 ; RUN: llvm-as < %s | llvm-dis > %t0
 ; RUN: opt -S < %s > %t1
 ; RUN: diff %t0 %t1
+; RUN: verify-uselistorder < %s
 ; PR6140
 
 ; Make sure the flags are serialized/deserialized properly for both

diff --git a/test/Bitcode/function-encoding-rel-operands.ll b/test/Bitcode/function-encoding-rel-operands.ll
index aedb0c3..24d6d80 100644
--- a/test/Bitcode/function-encoding-rel-operands.ll
+++ b/test/Bitcode/function-encoding-rel-operands.ll

@@ -1,6 +1,7 @@
 ; Basic sanity test to check that instruction operands are encoded with
 ; relative IDs.
 ; RUN: llvm-as < %s | llvm-bcanalyzer -dump | FileCheck %s
+; RUN: verify-uselistorder < %s
 
 ; CHECK: FUNCTION_BLOCK
 ; CHECK: INST_BINOP {{.*}}op0=1 op1=1
@@ -47,3 +48,5 @@
   %2 = icmp eq i32 %1, %a
   ret i1 %2
 }
+
+; CHECK: Stream type: LLVM IR

diff --git a/test/Bitcode/global-variables.3.2.ll b/test/Bitcode/global-variables.3.2.ll
index 549d025..afd9cb1 100644
--- a/test/Bitcode/global-variables.3.2.ll
+++ b/test/Bitcode/global-variables.3.2.ll

@@ -1,4 +1,5 @@
 ; RUN:  llvm-dis < %s.bc| FileCheck %s
+; RUN:  verify-uselistorder < %s.bc
 
 ; global-variables.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
 ; The test checks that LLVM does not silently misread global variables attributes of

diff --git a/test/Bitcode/highLevelStructure.3.2.ll b/test/Bitcode/highLevelStructure.3.2.ll
new file mode 100644
index 0000000..f9509eb
--- /dev/null
+++ b/test/Bitcode/highLevelStructure.3.2.ll

@@ -0,0 +1,86 @@
+; RUN:  llvm-dis < %s.bc| FileCheck %s
+
+; highLevelStructure.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread the high-level module structure of
+; older bitcode files (data layout, module asm, named types, aliases, attributes, gc, named metadata).
+
+; Data Layout Test
+; CHECK: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-f80:32-n8:16:32-S32"
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-a0:0:64-f80:32:32-n8:16:32-S32"
+
+; Module-Level Inline Assembly Test
+; CHECK: module asm "some assembly"
+module asm "some assembly"
+
+; Named Types Test
+; CHECK: %mytype = type { %mytype*, i32 }
+%mytype = type { %mytype*, i32 }
+
+; Aliases Test
+; CHECK: @glob1 = global i32 1
+@glob1 = global i32 1
+; CHECK: @aliased1 = alias i32* @glob1
+@aliased1 = alias i32* @glob1
+; CHECK-NEXT: @aliased2 = internal alias i32* @glob1
+@aliased2 = internal alias i32* @glob1
+; CHECK-NEXT: @aliased3 = alias i32* @glob1
+@aliased3 = external alias i32* @glob1
+; CHECK-NEXT: @aliased4 = weak alias i32* @glob1
+@aliased4 = weak alias i32* @glob1
+; CHECK-NEXT: @aliased5 = weak_odr alias i32* @glob1
+@aliased5 = weak_odr alias i32* @glob1
+
+;Parameter Attribute Test
+; CHECK: declare void @ParamAttr1(i8 zeroext)
+declare void @ParamAttr1(i8 zeroext)
+; CHECK: declare void @ParamAttr2(i8* nest)
+declare void @ParamAttr2(i8* nest)
+; CHECK: declare void @ParamAttr3(i8* sret)
+declare void @ParamAttr3(i8* sret)
+; CHECK: declare void @ParamAttr4(i8 signext)
+declare void @ParamAttr4(i8 signext)
+; CHECK: declare void @ParamAttr5(i8* inreg)
+declare void @ParamAttr5(i8* inreg)
+; CHECK: declare void @ParamAttr6(i8* byval)
+declare void @ParamAttr6(i8* byval)
+; CHECK: declare void @ParamAttr7(i8* noalias)
+declare void @ParamAttr7(i8* noalias)
+; CHECK: declare void @ParamAttr8(i8* nocapture)
+declare void @ParamAttr8(i8* nocapture)
+; CHECK: declare void @ParamAttr9({{(i8\* nest noalias nocapture|i8\* noalias nocapture nest)}})
+declare void @ParamAttr9(i8* nest noalias nocapture)
+; CHECK: declare void @ParamAttr10({{(i8\* sret noalias nocapture|i8\* noalias nocapture sret)}})
+declare void @ParamAttr10(i8* sret noalias nocapture)
+; CHECK: declare void @ParamAttr11({{(i8\* byval noalias nocapture|i8\* noalias nocapture byval)}})
+declare void @ParamAttr11(i8* byval noalias nocapture)
+; CHECK: declare void @ParamAttr12({{(i8\* inreg noalias nocapture|i8\* noalias nocapture inreg)}})
+declare void @ParamAttr12(i8* inreg noalias nocapture)
+
+
+; NamedTypesTest
+define void @NamedTypes() {
+entry:
+; CHECK: %res = alloca %mytype
+  %res = alloca %mytype
+  ret void
+}
+
+; Garbage Collector Name Test
+; CHECK: define void @gcTest() gc "gc"
+define void @gcTest() gc "gc" {
+entry:
+  ret void
+}
+
+; Named metadata Test
+; CHECK: !name = !{!0, !1, !2}
+!name = !{!0, !1, !2}
+; CHECK: !0 = metadata !{metadata !"zero"}
+!0 = metadata !{metadata !"zero"}
+; CHECK: !1 = metadata !{metadata !"one"}
+!1 = metadata !{metadata !"one"}
+; CHECK: !2 = metadata !{metadata !"two"}
+!2 = metadata !{metadata !"two"}
+
+
+

diff --git a/test/Bitcode/highLevelStructure.3.2.ll.bc b/test/Bitcode/highLevelStructure.3.2.ll.bc
new file mode 100644
index 0000000..591c5c3
--- /dev/null
+++ b/test/Bitcode/highLevelStructure.3.2.ll.bc
Binary files differ

diff --git a/test/Bitcode/inalloca.ll b/test/Bitcode/inalloca.ll
new file mode 100644
index 0000000..84abe17
--- /dev/null
+++ b/test/Bitcode/inalloca.ll

@@ -0,0 +1,19 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
+
+; inalloca should roundtrip.
+
+define void @foo(i32* inalloca %args) {
+  ret void
+}
+; CHECK-LABEL: define void @foo(i32* inalloca %args)
+
+define void @bar() {
+  ; Use the maximum alignment, since we stuff our bit with alignment.
+  %args = alloca inalloca i32, align 536870912
+  call void @foo(i32* inalloca %args)
+  ret void
+}
+; CHECK-LABEL: define void @bar() {
+; CHECK: %args = alloca inalloca i32, align 536870912
+; CHECK: call void @foo(i32* inalloca %args)

diff --git a/test/Bitcode/linkage-types-3.2.ll b/test/Bitcode/linkage-types-3.2.ll
index fd070ef..dc6c90c 100644
--- a/test/Bitcode/linkage-types-3.2.ll
+++ b/test/Bitcode/linkage-types-3.2.ll

@@ -1,4 +1,5 @@
 ; RUN:  llvm-dis < %s.bc| FileCheck %s
+; RUN:  verify-uselistorder < %s.bc
 
 ; linkage-types-3.2.ll.bc was generated by passing this file to llvm-as-3.2
 ; The test checks that LLVM does not silently misread linkage types of

diff --git a/test/Bitcode/local-linkage-default-visibility.3.4.ll b/test/Bitcode/local-linkage-default-visibility.3.4.ll
index 45a7b12..df0cf76 100644
--- a/test/Bitcode/local-linkage-default-visibility.3.4.ll
+++ b/test/Bitcode/local-linkage-default-visibility.3.4.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: verify-uselistorder < %s.bc
 
 ; local-linkage-default-visibility.3.4.ll.bc was generated by passing this file
 ; to llvm-as-3.4.  The test checks that LLVM upgrades visibility of symbols
@@ -25,22 +26,22 @@
 @global = global i32 0
 
 @default.internal.alias = alias internal i32* @global
-; CHECK: @default.internal.alias = alias internal i32* @global
+; CHECK: @default.internal.alias = internal alias i32* @global
 
 @hidden.internal.alias = hidden alias internal i32* @global
-; CHECK: @hidden.internal.alias = alias internal i32* @global
+; CHECK: @hidden.internal.alias = internal alias i32* @global
 
 @protected.internal.alias = protected alias internal i32* @global
-; CHECK: @protected.internal.alias = alias internal i32* @global
+; CHECK: @protected.internal.alias = internal alias i32* @global
 
 @default.private.alias = alias private i32* @global
-; CHECK: @default.private.alias = alias private i32* @global
+; CHECK: @default.private.alias = private alias i32* @global
 
 @hidden.private.alias = hidden alias private i32* @global
-; CHECK: @hidden.private.alias = alias private i32* @global
+; CHECK: @hidden.private.alias = private alias i32* @global
 
 @protected.private.alias = protected alias private i32* @global
-; CHECK: @protected.private.alias = alias private i32* @global
+; CHECK: @protected.private.alias = private alias i32* @global
 
 define internal void @default.internal() {
 ; CHECK: define internal void @default.internal

diff --git a/test/Bitcode/memInstructions.3.2.ll b/test/Bitcode/memInstructions.3.2.ll
index e4cb6bd..d826dd1 100644
--- a/test/Bitcode/memInstructions.3.2.ll
+++ b/test/Bitcode/memInstructions.3.2.ll

@@ -1,328 +1,329 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; memOperations.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread memory related instructions of

-; older bitcode files.

-

-define void @alloca(){

-entry:

-; CHECK: %res1 = alloca i8

-  %res1 = alloca i8

-  

-; CHECK-NEXT: %res2 = alloca i8, i32 2

-  %res2 = alloca i8, i32 2

-

-; CHECK-NEXT: %res3 = alloca i8, i32 2, align 4

-  %res3 = alloca i8, i32 2, align 4

-  

-; CHECK-NEXT: %res4 = alloca i8, align 4

-  %res4 = alloca i8, align 4

-  

-  ret void

-}

-

-define void @load(){

-entry:

-  %ptr1 = alloca i8

-  store i8 2, i8* %ptr1

-  

-; CHECK: %res1 = load i8* %ptr1

-  %res1 = load i8* %ptr1

-  

-; CHECK-NEXT: %res2 = load volatile i8* %ptr1

-  %res2 = load volatile i8* %ptr1

-  

-; CHECK-NEXT: %res3 = load i8* %ptr1, align 1

-  %res3 = load i8* %ptr1, align 1

-  

-; CHECK-NEXT: %res4 = load volatile i8* %ptr1, align 1

-  %res4 = load volatile i8* %ptr1, align 1

-  

-; CHECK-NEXT: %res5 = load i8* %ptr1, !nontemporal !0

-  %res5 = load i8* %ptr1, !nontemporal !0

-  

-; CHECK-NEXT: %res6 = load volatile i8* %ptr1, !nontemporal !0

-  %res6 = load volatile i8* %ptr1, !nontemporal !0

-  

-; CHECK-NEXT: %res7 = load i8* %ptr1, align 1, !nontemporal !0

-  %res7 = load i8* %ptr1, align 1, !nontemporal !0

-  

-; CHECK-NEXT: %res8 = load volatile i8* %ptr1, align 1, !nontemporal !0

-  %res8 = load volatile i8* %ptr1, align 1, !nontemporal !0

-  

-; CHECK-NEXT: %res9 = load i8* %ptr1, !invariant.load !1

-  %res9 = load i8* %ptr1, !invariant.load !1

-  

-; CHECK-NEXT: %res10 = load volatile i8* %ptr1, !invariant.load !1

-  %res10 = load volatile i8* %ptr1, !invariant.load !1

-  

-; CHECK-NEXT: %res11 = load i8* %ptr1, align 1, !invariant.load !1

-  %res11 = load i8* %ptr1, align 1, !invariant.load !1

-  

-; CHECK-NEXT: %res12 = load volatile i8* %ptr1, align 1, !invariant.load !1

-  %res12 = load volatile i8* %ptr1, align 1, !invariant.load !1

-  

-; CHECK-NEXT: %res13 = load i8* %ptr1, {{[(!nontemporal !0, !invariant.load !1) | (!invariant.load !1, !nontemporal !0)]}}

-  %res13 = load i8* %ptr1, !nontemporal !0, !invariant.load !1

-  

-; CHECK-NEXT: %res14 = load volatile i8* %ptr1, {{[(!nontemporal !0, !invariant.load !1) | (!invariant.load !1, !nontemporal !0)]}}

-  %res14 = load volatile i8* %ptr1, !nontemporal !0, !invariant.load !1

-  

-; CHECK-NEXT: %res15 = load i8* %ptr1, align 1, {{[(!nontemporal !0, !invariant.load !1) | (!invariant.load !1, !nontemporal !0)]}}

-  %res15 = load i8* %ptr1, align 1, !nontemporal !0, !invariant.load !1

-  

-; CHECK-NEXT: %res16 = load volatile i8* %ptr1, align 1, {{[(!nontemporal !0, !invariant.load !1) | (!invariant.load !1, !nontemporal !0)]}}

-  %res16 = load volatile i8* %ptr1, align 1, !nontemporal !0, !invariant.load !1

-  

-  ret void

-}

-

-define void @loadAtomic(){

-entry:

-  %ptr1 = alloca i8

-  store i8 2, i8* %ptr1

-  

-; CHECK: %res1 = load atomic i8* %ptr1 unordered, align 1

-  %res1 = load atomic i8* %ptr1 unordered, align 1

-  

-; CHECK-NEXT: %res2 = load atomic i8* %ptr1 monotonic, align 1

-  %res2 = load atomic i8* %ptr1 monotonic, align 1

-  

-; CHECK-NEXT: %res3 = load atomic i8* %ptr1 acquire, align 1

-  %res3 = load atomic i8* %ptr1 acquire, align 1

-  

-; CHECK-NEXT: %res4 = load atomic i8* %ptr1 seq_cst, align 1

-  %res4 = load atomic i8* %ptr1 seq_cst, align 1

-  

-; CHECK-NEXT: %res5 = load atomic volatile i8* %ptr1 unordered, align 1

-  %res5 = load atomic volatile i8* %ptr1 unordered, align 1

-  

-; CHECK-NEXT: %res6 = load atomic volatile i8* %ptr1 monotonic, align 1

-  %res6 = load atomic volatile i8* %ptr1 monotonic, align 1

-  

-; CHECK-NEXT: %res7 = load atomic volatile i8* %ptr1 acquire, align 1

-  %res7 = load atomic volatile i8* %ptr1 acquire, align 1

-  

-; CHECK-NEXT: %res8 = load atomic volatile i8* %ptr1 seq_cst, align 1

-  %res8 = load atomic volatile i8* %ptr1 seq_cst, align 1

-  

-; CHECK-NEXT: %res9 = load atomic i8* %ptr1 singlethread unordered, align 1

-  %res9 = load atomic i8* %ptr1 singlethread unordered, align 1

-  

-; CHECK-NEXT: %res10 = load atomic i8* %ptr1 singlethread monotonic, align 1

-  %res10 = load atomic i8* %ptr1 singlethread monotonic, align 1

-  

-; CHECK-NEXT: %res11 = load atomic i8* %ptr1 singlethread acquire, align 1

-  %res11 = load atomic i8* %ptr1 singlethread acquire, align 1

-  

-; CHECK-NEXT: %res12 = load atomic i8* %ptr1 singlethread seq_cst, align 1

-  %res12 = load atomic i8* %ptr1 singlethread seq_cst, align 1

-  

-; CHECK-NEXT: %res13 = load atomic volatile i8* %ptr1 singlethread unordered, align 1

-  %res13 = load atomic volatile i8* %ptr1 singlethread unordered, align 1

-  

-; CHECK-NEXT: %res14 = load atomic volatile i8* %ptr1 singlethread monotonic, align 1

-  %res14 = load atomic volatile i8* %ptr1 singlethread monotonic, align 1

-  

-; CHECK-NEXT: %res15 = load atomic volatile i8* %ptr1 singlethread acquire, align 1

-  %res15 = load atomic volatile i8* %ptr1 singlethread acquire, align 1

-  

-; CHECK-NEXT: %res16 = load atomic volatile i8* %ptr1 singlethread seq_cst, align 1

-  %res16 = load atomic volatile i8* %ptr1 singlethread seq_cst, align 1

-  

-  ret void

-}

-

-define void @store(){

-entry:

-  %ptr1 = alloca i8

-  

-; CHECK: store i8 2, i8* %ptr1

-  store i8 2, i8* %ptr1

-  

-; CHECK-NEXT: store volatile i8 2, i8* %ptr1

-  store volatile i8 2, i8* %ptr1

-  

-; CHECK-NEXT: store i8 2, i8* %ptr1, align 1

-  store i8 2, i8* %ptr1, align 1

-  

-; CHECK-NEXT: store volatile i8 2, i8* %ptr1, align 1

-  store volatile i8 2, i8* %ptr1, align 1

-  

-; CHECK-NEXT: store i8 2, i8* %ptr1, !nontemporal !0

-  store i8 2, i8* %ptr1, !nontemporal !0

-  

-; CHECK-NEXT: store volatile i8 2, i8* %ptr1, !nontemporal !0

-  store volatile i8 2, i8* %ptr1, !nontemporal !0

-  

-; CHECK-NEXT: store i8 2, i8* %ptr1, align 1, !nontemporal !0

-  store i8 2, i8* %ptr1, align 1, !nontemporal !0

-  

-; CHECK-NEXT: store volatile i8 2, i8* %ptr1, align 1, !nontemporal !0

-  store volatile i8 2, i8* %ptr1, align 1, !nontemporal !0

-  

-  ret void

-}

-

-define void @storeAtomic(){

-entry:

-  %ptr1 = alloca i8

-  

-; CHECK: store atomic i8 2, i8* %ptr1 unordered, align 1

-  store atomic i8 2, i8* %ptr1 unordered, align 1

-  

-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 monotonic, align 1

-  store atomic i8 2, i8* %ptr1 monotonic, align 1

-  

-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 release, align 1

-  store atomic i8 2, i8* %ptr1 release, align 1

-  

-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 seq_cst, align 1

-  store atomic i8 2, i8* %ptr1 seq_cst, align 1

-  

-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 unordered, align 1

-  store atomic volatile i8 2, i8* %ptr1 unordered, align 1

-  

-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 monotonic, align 1

-  store atomic volatile i8 2, i8* %ptr1 monotonic, align 1

-  

-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 release, align 1

-  store atomic volatile i8 2, i8* %ptr1 release, align 1

-  

-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 seq_cst, align 1

-  store atomic volatile i8 2, i8* %ptr1 seq_cst, align 1

-  

-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread unordered, align 1

-  store atomic i8 2, i8* %ptr1 singlethread unordered, align 1

-  

-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread monotonic, align 1

-  store atomic i8 2, i8* %ptr1 singlethread monotonic, align 1

-  

-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread release, align 1

-  store atomic i8 2, i8* %ptr1 singlethread release, align 1

-  

-; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread seq_cst, align 1

-  store atomic i8 2, i8* %ptr1 singlethread seq_cst, align 1

-  

-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread unordered, align 1

-  store atomic volatile i8 2, i8* %ptr1 singlethread unordered, align 1

-  

-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread monotonic, align 1

-  store atomic volatile i8 2, i8* %ptr1 singlethread monotonic, align 1

-  

-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread release, align 1

-  store atomic volatile i8 2, i8* %ptr1 singlethread release, align 1

-  

-; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread seq_cst, align 1

-  store atomic volatile i8 2, i8* %ptr1 singlethread seq_cst, align 1

-  

-  ret void

-}

-

-define void @cmpxchg(i32* %ptr,i32 %cmp,i32 %new){

-entry:

-  ;cmpxchg [volatile] <ty>* <pointer>, <ty> <cmp>, <ty> <new> [singlethread] <ordering>

-

-; CHECK: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic

-; CHECK-NEXT: %res1 = extractvalue { i32, i1 } [[TMP]], 0

-  %res1 = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic

-; CHECK-NEXT: %res2 = extractvalue { i32, i1 } [[TMP]], 0

-  %res2 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic

-; CHECK-NEXT: %res3 = extractvalue { i32, i1 } [[TMP]], 0

-  %res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic

-; CHECK-NEXT: %res4 = extractvalue { i32, i1 } [[TMP]], 0

-  %res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic

-  

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire

-; CHECK-NEXT: %res5 = extractvalue { i32, i1 } [[TMP]], 0

-  %res5 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire

-; CHECK-NEXT: %res6 = extractvalue { i32, i1 } [[TMP]], 0

-  %res6 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire

-; CHECK-NEXT: %res7 = extractvalue { i32, i1 } [[TMP]], 0

-  %res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire

-; CHECK-NEXT: %res8 = extractvalue { i32, i1 } [[TMP]], 0

-  %res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire

-  

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic

-; CHECK-NEXT: %res9 = extractvalue { i32, i1 } [[TMP]], 0

-  %res9 = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic

-; CHECK-NEXT: %res10 = extractvalue { i32, i1 } [[TMP]], 0

-  %res10 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic

-; CHECK-NEXT: %res11 = extractvalue { i32, i1 } [[TMP]], 0

-  %res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic

-; CHECK-NEXT: %res12 = extractvalue { i32, i1 } [[TMP]], 0

-  %res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic

-  

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire

-; CHECK-NEXT: %res13 = extractvalue { i32, i1 } [[TMP]], 0

-  %res13 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire

-; CHECK-NEXT: %res14 = extractvalue { i32, i1 } [[TMP]], 0

-  %res14 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire

-; CHECK-NEXT: %res15 = extractvalue { i32, i1 } [[TMP]], 0

-  %res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire

-; CHECK-NEXT: %res16 = extractvalue { i32, i1 } [[TMP]], 0

-  %res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire

-  

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst

-; CHECK-NEXT: %res17 = extractvalue { i32, i1 } [[TMP]], 0

-  %res17 = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst

-; CHECK-NEXT: %res18 = extractvalue { i32, i1 } [[TMP]], 0

-  %res18 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst

-; CHECK-NEXT: %res19 = extractvalue { i32, i1 } [[TMP]], 0

-  %res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst

-  

-; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst

-; CHECK-NEXT: %res20 = extractvalue { i32, i1 } [[TMP]], 0

-  %res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst

-

-  ret void

-}

-

-define void @getelementptr({i8, i8}* %s, <4 x i8*> %ptrs, <4 x i64> %offsets ){

-entry:

-; CHECK: %res1 = getelementptr { i8, i8 }* %s, i32 1, i32 1

-  %res1 = getelementptr {i8, i8}* %s, i32 1, i32 1

-  

-; CHECK-NEXT: %res2 = getelementptr inbounds { i8, i8 }* %s, i32 1, i32 1

-  %res2 = getelementptr inbounds {i8, i8}* %s, i32 1, i32 1

-  

-; CHECK-NEXT: %res3 = getelementptr <4 x i8*> %ptrs, <4 x i64> %offsets

-  %res3 = getelementptr <4 x i8*> %ptrs, <4 x i64> %offsets

-  

-  ret void

-}

-

-!0 = metadata !{ i32 1 }

-!1 = metadata !{}
\ No newline at end of file
+; RUN: llvm-dis < %s.bc| FileCheck %s
+; RUN: verify-uselistorder < %s.bc
+
+; memOperations.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread memory related instructions of
+; older bitcode files.
+
+define void @alloca(){
+entry:
+; CHECK: %res1 = alloca i8
+  %res1 = alloca i8
+
+; CHECK-NEXT: %res2 = alloca i8, i32 2
+  %res2 = alloca i8, i32 2
+
+; CHECK-NEXT: %res3 = alloca i8, i32 2, align 4
+  %res3 = alloca i8, i32 2, align 4
+
+; CHECK-NEXT: %res4 = alloca i8, align 4
+  %res4 = alloca i8, align 4
+
+  ret void
+}
+
+define void @load(){
+entry:
+  %ptr1 = alloca i8
+  store i8 2, i8* %ptr1
+
+; CHECK: %res1 = load i8* %ptr1
+  %res1 = load i8* %ptr1
+
+; CHECK-NEXT: %res2 = load volatile i8* %ptr1
+  %res2 = load volatile i8* %ptr1
+
+; CHECK-NEXT: %res3 = load i8* %ptr1, align 1
+  %res3 = load i8* %ptr1, align 1
+
+; CHECK-NEXT: %res4 = load volatile i8* %ptr1, align 1
+  %res4 = load volatile i8* %ptr1, align 1
+
+; CHECK-NEXT: %res5 = load i8* %ptr1, !nontemporal !0
+  %res5 = load i8* %ptr1, !nontemporal !0
+
+; CHECK-NEXT: %res6 = load volatile i8* %ptr1, !nontemporal !0
+  %res6 = load volatile i8* %ptr1, !nontemporal !0
+
+; CHECK-NEXT: %res7 = load i8* %ptr1, align 1, !nontemporal !0
+  %res7 = load i8* %ptr1, align 1, !nontemporal !0
+
+; CHECK-NEXT: %res8 = load volatile i8* %ptr1, align 1, !nontemporal !0
+  %res8 = load volatile i8* %ptr1, align 1, !nontemporal !0
+
+; CHECK-NEXT: %res9 = load i8* %ptr1, !invariant.load !1
+  %res9 = load i8* %ptr1, !invariant.load !1
+
+; CHECK-NEXT: %res10 = load volatile i8* %ptr1, !invariant.load !1
+  %res10 = load volatile i8* %ptr1, !invariant.load !1
+
+; CHECK-NEXT: %res11 = load i8* %ptr1, align 1, !invariant.load !1
+  %res11 = load i8* %ptr1, align 1, !invariant.load !1
+
+; CHECK-NEXT: %res12 = load volatile i8* %ptr1, align 1, !invariant.load !1
+  %res12 = load volatile i8* %ptr1, align 1, !invariant.load !1
+
+; CHECK-NEXT: %res13 = load i8* %ptr1, {{[(!nontemporal !0, !invariant.load !1) | (!invariant.load !1, !nontemporal !0)]}}
+  %res13 = load i8* %ptr1, !nontemporal !0, !invariant.load !1
+
+; CHECK-NEXT: %res14 = load volatile i8* %ptr1, {{[(!nontemporal !0, !invariant.load !1) | (!invariant.load !1, !nontemporal !0)]}}
+  %res14 = load volatile i8* %ptr1, !nontemporal !0, !invariant.load !1
+
+; CHECK-NEXT: %res15 = load i8* %ptr1, align 1, {{[(!nontemporal !0, !invariant.load !1) | (!invariant.load !1, !nontemporal !0)]}}
+  %res15 = load i8* %ptr1, align 1, !nontemporal !0, !invariant.load !1
+
+; CHECK-NEXT: %res16 = load volatile i8* %ptr1, align 1, {{[(!nontemporal !0, !invariant.load !1) | (!invariant.load !1, !nontemporal !0)]}}
+  %res16 = load volatile i8* %ptr1, align 1, !nontemporal !0, !invariant.load !1
+
+  ret void
+}
+
+define void @loadAtomic(){
+entry:
+  %ptr1 = alloca i8
+  store i8 2, i8* %ptr1
+
+; CHECK: %res1 = load atomic i8* %ptr1 unordered, align 1
+  %res1 = load atomic i8* %ptr1 unordered, align 1
+
+; CHECK-NEXT: %res2 = load atomic i8* %ptr1 monotonic, align 1
+  %res2 = load atomic i8* %ptr1 monotonic, align 1
+
+; CHECK-NEXT: %res3 = load atomic i8* %ptr1 acquire, align 1
+  %res3 = load atomic i8* %ptr1 acquire, align 1
+
+; CHECK-NEXT: %res4 = load atomic i8* %ptr1 seq_cst, align 1
+  %res4 = load atomic i8* %ptr1 seq_cst, align 1
+
+; CHECK-NEXT: %res5 = load atomic volatile i8* %ptr1 unordered, align 1
+  %res5 = load atomic volatile i8* %ptr1 unordered, align 1
+
+; CHECK-NEXT: %res6 = load atomic volatile i8* %ptr1 monotonic, align 1
+  %res6 = load atomic volatile i8* %ptr1 monotonic, align 1
+
+; CHECK-NEXT: %res7 = load atomic volatile i8* %ptr1 acquire, align 1
+  %res7 = load atomic volatile i8* %ptr1 acquire, align 1
+
+; CHECK-NEXT: %res8 = load atomic volatile i8* %ptr1 seq_cst, align 1
+  %res8 = load atomic volatile i8* %ptr1 seq_cst, align 1
+
+; CHECK-NEXT: %res9 = load atomic i8* %ptr1 singlethread unordered, align 1
+  %res9 = load atomic i8* %ptr1 singlethread unordered, align 1
+
+; CHECK-NEXT: %res10 = load atomic i8* %ptr1 singlethread monotonic, align 1
+  %res10 = load atomic i8* %ptr1 singlethread monotonic, align 1
+
+; CHECK-NEXT: %res11 = load atomic i8* %ptr1 singlethread acquire, align 1
+  %res11 = load atomic i8* %ptr1 singlethread acquire, align 1
+
+; CHECK-NEXT: %res12 = load atomic i8* %ptr1 singlethread seq_cst, align 1
+  %res12 = load atomic i8* %ptr1 singlethread seq_cst, align 1
+
+; CHECK-NEXT: %res13 = load atomic volatile i8* %ptr1 singlethread unordered, align 1
+  %res13 = load atomic volatile i8* %ptr1 singlethread unordered, align 1
+
+; CHECK-NEXT: %res14 = load atomic volatile i8* %ptr1 singlethread monotonic, align 1
+  %res14 = load atomic volatile i8* %ptr1 singlethread monotonic, align 1
+
+; CHECK-NEXT: %res15 = load atomic volatile i8* %ptr1 singlethread acquire, align 1
+  %res15 = load atomic volatile i8* %ptr1 singlethread acquire, align 1
+
+; CHECK-NEXT: %res16 = load atomic volatile i8* %ptr1 singlethread seq_cst, align 1
+  %res16 = load atomic volatile i8* %ptr1 singlethread seq_cst, align 1
+
+  ret void
+}
+
+define void @store(){
+entry:
+  %ptr1 = alloca i8
+
+; CHECK: store i8 2, i8* %ptr1
+  store i8 2, i8* %ptr1
+
+; CHECK-NEXT: store volatile i8 2, i8* %ptr1
+  store volatile i8 2, i8* %ptr1
+
+; CHECK-NEXT: store i8 2, i8* %ptr1, align 1
+  store i8 2, i8* %ptr1, align 1
+
+; CHECK-NEXT: store volatile i8 2, i8* %ptr1, align 1
+  store volatile i8 2, i8* %ptr1, align 1
+
+; CHECK-NEXT: store i8 2, i8* %ptr1, !nontemporal !0
+  store i8 2, i8* %ptr1, !nontemporal !0
+
+; CHECK-NEXT: store volatile i8 2, i8* %ptr1, !nontemporal !0
+  store volatile i8 2, i8* %ptr1, !nontemporal !0
+
+; CHECK-NEXT: store i8 2, i8* %ptr1, align 1, !nontemporal !0
+  store i8 2, i8* %ptr1, align 1, !nontemporal !0
+
+; CHECK-NEXT: store volatile i8 2, i8* %ptr1, align 1, !nontemporal !0
+  store volatile i8 2, i8* %ptr1, align 1, !nontemporal !0
+
+  ret void
+}
+
+define void @storeAtomic(){
+entry:
+  %ptr1 = alloca i8
+
+; CHECK: store atomic i8 2, i8* %ptr1 unordered, align 1
+  store atomic i8 2, i8* %ptr1 unordered, align 1
+
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 monotonic, align 1
+  store atomic i8 2, i8* %ptr1 monotonic, align 1
+
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 release, align 1
+  store atomic i8 2, i8* %ptr1 release, align 1
+
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 seq_cst, align 1
+  store atomic i8 2, i8* %ptr1 seq_cst, align 1
+
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 unordered, align 1
+  store atomic volatile i8 2, i8* %ptr1 unordered, align 1
+
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 monotonic, align 1
+  store atomic volatile i8 2, i8* %ptr1 monotonic, align 1
+
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 release, align 1
+  store atomic volatile i8 2, i8* %ptr1 release, align 1
+
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 seq_cst, align 1
+  store atomic volatile i8 2, i8* %ptr1 seq_cst, align 1
+
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread unordered, align 1
+  store atomic i8 2, i8* %ptr1 singlethread unordered, align 1
+
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread monotonic, align 1
+  store atomic i8 2, i8* %ptr1 singlethread monotonic, align 1
+
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread release, align 1
+  store atomic i8 2, i8* %ptr1 singlethread release, align 1
+
+; CHECK-NEXT: store atomic i8 2, i8* %ptr1 singlethread seq_cst, align 1
+  store atomic i8 2, i8* %ptr1 singlethread seq_cst, align 1
+
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread unordered, align 1
+  store atomic volatile i8 2, i8* %ptr1 singlethread unordered, align 1
+
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread monotonic, align 1
+  store atomic volatile i8 2, i8* %ptr1 singlethread monotonic, align 1
+
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread release, align 1
+  store atomic volatile i8 2, i8* %ptr1 singlethread release, align 1
+
+; CHECK-NEXT: store atomic volatile i8 2, i8* %ptr1 singlethread seq_cst, align 1
+  store atomic volatile i8 2, i8* %ptr1 singlethread seq_cst, align 1
+
+  ret void
+}
+
+define void @cmpxchg(i32* %ptr,i32 %cmp,i32 %new){
+entry:
+  ;cmpxchg [volatile] <ty>* <pointer>, <ty> <cmp>, <ty> <new> [singlethread] <ordering>
+
+; CHECK: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
+; CHECK-NEXT: %res1 = extractvalue { i32, i1 } [[TMP]], 0
+  %res1 = cmpxchg i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
+; CHECK-NEXT: %res2 = extractvalue { i32, i1 } [[TMP]], 0
+  %res2 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new monotonic monotonic
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: %res3 = extractvalue { i32, i1 } [[TMP]], 0
+  %res3 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+; CHECK-NEXT: %res4 = extractvalue { i32, i1 } [[TMP]], 0
+  %res4 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread monotonic monotonic
+
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire
+; CHECK-NEXT: %res5 = extractvalue { i32, i1 } [[TMP]], 0
+  %res5 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acquire acquire
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire
+; CHECK-NEXT: %res6 = extractvalue { i32, i1 } [[TMP]], 0
+  %res6 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acquire acquire
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: %res7 = extractvalue { i32, i1 } [[TMP]], 0
+  %res7 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+; CHECK-NEXT: %res8 = extractvalue { i32, i1 } [[TMP]], 0
+  %res8 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acquire acquire
+
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic
+; CHECK-NEXT: %res9 = extractvalue { i32, i1 } [[TMP]], 0
+  %res9 = cmpxchg i32* %ptr, i32 %cmp, i32 %new release monotonic
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic
+; CHECK-NEXT: %res10 = extractvalue { i32, i1 } [[TMP]], 0
+  %res10 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new release monotonic
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: %res11 = extractvalue { i32, i1 } [[TMP]], 0
+  %res11 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+; CHECK-NEXT: %res12 = extractvalue { i32, i1 } [[TMP]], 0
+  %res12 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread release monotonic
+
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
+; CHECK-NEXT: %res13 = extractvalue { i32, i1 } [[TMP]], 0
+  %res13 = cmpxchg i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
+; CHECK-NEXT: %res14 = extractvalue { i32, i1 } [[TMP]], 0
+  %res14 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new acq_rel acquire
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: %res15 = extractvalue { i32, i1 } [[TMP]], 0
+  %res15 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+; CHECK-NEXT: %res16 = extractvalue { i32, i1 } [[TMP]], 0
+  %res16 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread acq_rel acquire
+
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+; CHECK-NEXT: %res17 = extractvalue { i32, i1 } [[TMP]], 0
+  %res17 = cmpxchg i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+; CHECK-NEXT: %res18 = extractvalue { i32, i1 } [[TMP]], 0
+  %res18 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new seq_cst seq_cst
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: %res19 = extractvalue { i32, i1 } [[TMP]], 0
+  %res19 = cmpxchg i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+
+; CHECK-NEXT: [[TMP:%[a-z0-9]+]] = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+; CHECK-NEXT: %res20 = extractvalue { i32, i1 } [[TMP]], 0
+  %res20 = cmpxchg volatile i32* %ptr, i32 %cmp, i32 %new singlethread seq_cst seq_cst
+
+  ret void
+}
+
+define void @getelementptr({i8, i8}* %s, <4 x i8*> %ptrs, <4 x i64> %offsets ){
+entry:
+; CHECK: %res1 = getelementptr { i8, i8 }* %s, i32 1, i32 1
+  %res1 = getelementptr {i8, i8}* %s, i32 1, i32 1
+
+; CHECK-NEXT: %res2 = getelementptr inbounds { i8, i8 }* %s, i32 1, i32 1
+  %res2 = getelementptr inbounds {i8, i8}* %s, i32 1, i32 1
+
+; CHECK-NEXT: %res3 = getelementptr <4 x i8*> %ptrs, <4 x i64> %offsets
+  %res3 = getelementptr <4 x i8*> %ptrs, <4 x i64> %offsets
+
+  ret void
+}
+
+!0 = metadata !{ i32 1 }
+!1 = metadata !{}

diff --git a/test/Bitcode/metadata-2.ll b/test/Bitcode/metadata-2.ll
index 4055f92..bb957a7 100644
--- a/test/Bitcode/metadata-2.ll
+++ b/test/Bitcode/metadata-2.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis -disable-output
+; RUN: verify-uselistorder < %s
 	%0 = type { %object.ModuleInfo.__vtbl*, i8*, %"byte[]", %1, %"ClassInfo[]", i32, void ()*, void ()*, void ()*, i8*, void ()* }		; type %0
 	%1 = type { i64, %object.ModuleInfo* }		; type %1
 	%2 = type { i32, void ()* }		; type %2

diff --git a/test/Bitcode/metadata.ll b/test/Bitcode/metadata.ll
index fc8a622..955b48b 100644
--- a/test/Bitcode/metadata.ll
+++ b/test/Bitcode/metadata.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis -disable-output
+; RUN: verify-uselistorder < %s
 
 !llvm.foo = !{!0}
 !0 = metadata !{i32 42}

diff --git a/test/Bitcode/miscInstructions.3.2.ll b/test/Bitcode/miscInstructions.3.2.ll
index bceae20..6a077d5 100644
--- a/test/Bitcode/miscInstructions.3.2.ll
+++ b/test/Bitcode/miscInstructions.3.2.ll

@@ -1,126 +1,186 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; miscInstructions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread miscellaneous instructions of

-; older bitcode files.

-

-define void @icmp(i32 %x1, i32 %x2, i32* %ptr1, i32* %ptr2, <2 x i32> %vec1, <2 x i32> %vec2){

-entry:

-; CHECK: %res1 = icmp eq i32 %x1, %x2

-  %res1 = icmp eq i32 %x1, %x2

-  

-; CHECK-NEXT: %res2 = icmp ne i32 %x1, %x2

-  %res2 = icmp ne i32 %x1, %x2

-  

-; CHECK-NEXT: %res3 = icmp ugt i32 %x1, %x2

-  %res3 = icmp ugt i32 %x1, %x2

-  

-; CHECK-NEXT: %res4 = icmp uge i32 %x1, %x2

-  %res4 = icmp uge i32 %x1, %x2

-  

-; CHECK-NEXT: %res5 = icmp ult i32 %x1, %x2

-  %res5 = icmp ult i32 %x1, %x2

-  

-; CHECK-NEXT: %res6 = icmp ule i32 %x1, %x2

-  %res6 = icmp ule i32 %x1, %x2

-  

-; CHECK-NEXT: %res7 = icmp sgt i32 %x1, %x2

-  %res7 = icmp sgt i32 %x1, %x2

-  

-; CHECK-NEXT: %res8 = icmp sge i32 %x1, %x2

-  %res8 = icmp sge i32 %x1, %x2

-  

-; CHECK-NEXT: %res9 = icmp slt i32 %x1, %x2

-  %res9 = icmp slt i32 %x1, %x2

-  

-; CHECK-NEXT: %res10 = icmp sle i32 %x1, %x2

-  %res10 = icmp sle i32 %x1, %x2

-  

-; CHECK-NEXT: %res11 = icmp eq i32* %ptr1, %ptr2

-  %res11 = icmp eq i32* %ptr1, %ptr2

-  

-; CHECK-NEXT: %res12 = icmp eq <2 x i32> %vec1, %vec2

-  %res12 = icmp eq <2 x i32> %vec1, %vec2

-  

-  ret void

-}

-

-

-define void @fcmp(float %x1, float %x2, <2 x float> %vec1, <2 x float> %vec2){

-entry:

-; CHECK: %res1 = fcmp oeq float %x1, %x2

-  %res1 = fcmp oeq float %x1, %x2

-  

-; CHECK-NEXT: %res2 = fcmp one float %x1, %x2

-  %res2 = fcmp one float %x1, %x2

-  

-; CHECK-NEXT: %res3 = fcmp ugt float %x1, %x2

-  %res3 = fcmp ugt float %x1, %x2

-  

-; CHECK-NEXT: %res4 = fcmp uge float %x1, %x2

-  %res4 = fcmp uge float %x1, %x2

-  

-; CHECK-NEXT: %res5 = fcmp ult float %x1, %x2

-  %res5 = fcmp ult float %x1, %x2

-  

-; CHECK-NEXT: %res6 = fcmp ule float %x1, %x2

-  %res6 = fcmp ule float %x1, %x2

-  

-; CHECK-NEXT: %res7 = fcmp ogt float %x1, %x2

-  %res7 = fcmp ogt float %x1, %x2

-  

-; CHECK-NEXT: %res8 = fcmp oge float %x1, %x2

-  %res8 = fcmp oge float %x1, %x2

-  

-; CHECK-NEXT: %res9 = fcmp olt float %x1, %x2

-  %res9 = fcmp olt float %x1, %x2

-  

-; CHECK-NEXT: %res10 = fcmp ole float %x1, %x2

-  %res10 = fcmp ole float %x1, %x2

-  

-; CHECK-NEXT: %res11 = fcmp ord float %x1, %x2

-  %res11 = fcmp ord float %x1, %x2

-  

-; CHECK-NEXT: %res12 = fcmp ueq float %x1, %x2

-  %res12 = fcmp ueq float %x1, %x2

-  

-; CHECK-NEXT: %res13 = fcmp une float %x1, %x2

-  %res13 = fcmp une float %x1, %x2

-  

-; CHECK-NEXT: %res14 = fcmp uno float %x1, %x2

-  %res14 = fcmp uno float %x1, %x2

-  

-; CHECK-NEXT: %res15 = fcmp true float %x1, %x2

-  %res15 = fcmp true float %x1, %x2

-  

-; CHECK-NEXT: %res16 = fcmp false float %x1, %x2

-  %res16 = fcmp false float %x1, %x2

-  

-; CHECK-NEXT: %res17 = fcmp oeq <2 x float> %vec1, %vec2

-  %res17 = fcmp oeq <2 x float> %vec1, %vec2

-  

-  ret void

-}

-

-declare i32 @printf(i8* noalias nocapture, ...)

-

-define void @call(i32 %x, i8* %msg ){

-entry:

-

-; CHECK: %res1 = call i32 @test(i32 %x)

-  %res1 = call i32 @test(i32 %x)

-  

-; CHECK-NEXT: %res2 = tail call i32 @test(i32 %x)

-  %res2 = tail call i32 @test(i32 %x)

-  

-; CHECK-NEXT: %res3 = call i32 (i8*, ...)* @printf(i8* %msg, i32 12, i8 42)

-  %res3 = call i32 (i8*, ...)* @printf(i8* %msg, i32 12, i8 42)

-  

-  ret void

-}

-

-define i32 @test(i32 %x){

-entry:

-

-  ret i32 %x

-}

+; RUN:  llvm-dis < %s.bc| FileCheck %s
+
+; miscInstructions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread miscellaneous instructions of
+; older bitcode files.
+
+@X = global i8 1
+@_ZTIi = global i8* @X 
+@_ZTId = global i8* @X 
+
+define i32 @__gxx_personality_v0(...){
+entry:
+  ret i32 0
+}
+
+define void @landingpadInstr1(i1 %cond1, <2 x i1> %cond2, <2 x i8> %x1, <2 x i8> %x2){
+entry:
+; CHECK: %res = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+  %res = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0 
+; CHECK: catch i8** @_ZTIi
+  catch i8** @_ZTIi
+  ret void
+}
+
+define void @landingpadInstr2(i1 %cond1, <2 x i1> %cond2, <2 x i8> %x1, <2 x i8> %x2){
+entry:
+; CHECK: %res = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+  %res = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+; CHECK: cleanup
+  cleanup
+  ret void
+}
+
+define void @landingpadInstr3(i1 %cond1, <2 x i1> %cond2, <2 x i8> %x1, <2 x i8> %x2){
+entry:
+; CHECK: %res = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0 
+  %res = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+; CHECK: catch i8** @_ZTIi
+  catch i8** @_ZTIi
+; CHECK: filter [1 x i8**] [i8** @_ZTId]
+  filter [1 x i8**] [i8** @_ZTId]
+  ret void
+}
+
+define void @phiInstr(){
+LoopHeader: 
+  %x = add i32 0, 0
+  br label %Loop
+Loop:
+; CHECK:  %indvar = phi i32 [ 0, %LoopHeader ], [ %nextindvar, %Loop ]
+  %indvar = phi i32 [ 0, %LoopHeader ], [ %nextindvar, %Loop ]
+  %nextindvar = add i32 %indvar, 1
+  br label %Loop
+  ret void
+}
+
+define void @selectInstr(i1 %cond1, <2 x i1> %cond2, <2 x i8> %x1, <2 x i8> %x2){
+entry:
+; CHECK: %res1 = select i1 %cond1, i8 1, i8 0 
+  %res1 = select i1 %cond1, i8 1, i8 0
+; CHECK-NEXT: %res2 = select <2 x i1> %cond2, <2 x i8> %x1, <2 x i8> %x2
+  %res2 = select <2 x i1> %cond2, <2 x i8> %x1, <2 x i8> %x2
+
+  ret void
+}
+
+define void @icmp(i32 %x1, i32 %x2, i32* %ptr1, i32* %ptr2, <2 x i32> %vec1, <2 x i32> %vec2){
+entry:
+; CHECK: %res1 = icmp eq i32 %x1, %x2
+  %res1 = icmp eq i32 %x1, %x2
+  
+; CHECK-NEXT: %res2 = icmp ne i32 %x1, %x2
+  %res2 = icmp ne i32 %x1, %x2
+  
+; CHECK-NEXT: %res3 = icmp ugt i32 %x1, %x2
+  %res3 = icmp ugt i32 %x1, %x2
+  
+; CHECK-NEXT: %res4 = icmp uge i32 %x1, %x2
+  %res4 = icmp uge i32 %x1, %x2
+  
+; CHECK-NEXT: %res5 = icmp ult i32 %x1, %x2
+  %res5 = icmp ult i32 %x1, %x2
+  
+; CHECK-NEXT: %res6 = icmp ule i32 %x1, %x2
+  %res6 = icmp ule i32 %x1, %x2
+  
+; CHECK-NEXT: %res7 = icmp sgt i32 %x1, %x2
+  %res7 = icmp sgt i32 %x1, %x2
+  
+; CHECK-NEXT: %res8 = icmp sge i32 %x1, %x2
+  %res8 = icmp sge i32 %x1, %x2
+  
+; CHECK-NEXT: %res9 = icmp slt i32 %x1, %x2
+  %res9 = icmp slt i32 %x1, %x2
+  
+; CHECK-NEXT: %res10 = icmp sle i32 %x1, %x2
+  %res10 = icmp sle i32 %x1, %x2
+  
+; CHECK-NEXT: %res11 = icmp eq i32* %ptr1, %ptr2
+  %res11 = icmp eq i32* %ptr1, %ptr2
+  
+; CHECK-NEXT: %res12 = icmp eq <2 x i32> %vec1, %vec2
+  %res12 = icmp eq <2 x i32> %vec1, %vec2
+  
+  ret void
+}
+
+
+define void @fcmp(float %x1, float %x2, <2 x float> %vec1, <2 x float> %vec2){
+entry:
+; CHECK: %res1 = fcmp oeq float %x1, %x2
+  %res1 = fcmp oeq float %x1, %x2
+  
+; CHECK-NEXT: %res2 = fcmp one float %x1, %x2
+  %res2 = fcmp one float %x1, %x2
+  
+; CHECK-NEXT: %res3 = fcmp ugt float %x1, %x2
+  %res3 = fcmp ugt float %x1, %x2
+  
+; CHECK-NEXT: %res4 = fcmp uge float %x1, %x2
+  %res4 = fcmp uge float %x1, %x2
+  
+; CHECK-NEXT: %res5 = fcmp ult float %x1, %x2
+  %res5 = fcmp ult float %x1, %x2
+  
+; CHECK-NEXT: %res6 = fcmp ule float %x1, %x2
+  %res6 = fcmp ule float %x1, %x2
+  
+; CHECK-NEXT: %res7 = fcmp ogt float %x1, %x2
+  %res7 = fcmp ogt float %x1, %x2
+  
+; CHECK-NEXT: %res8 = fcmp oge float %x1, %x2
+  %res8 = fcmp oge float %x1, %x2
+  
+; CHECK-NEXT: %res9 = fcmp olt float %x1, %x2
+  %res9 = fcmp olt float %x1, %x2
+  
+; CHECK-NEXT: %res10 = fcmp ole float %x1, %x2
+  %res10 = fcmp ole float %x1, %x2
+  
+; CHECK-NEXT: %res11 = fcmp ord float %x1, %x2
+  %res11 = fcmp ord float %x1, %x2
+  
+; CHECK-NEXT: %res12 = fcmp ueq float %x1, %x2
+  %res12 = fcmp ueq float %x1, %x2
+  
+; CHECK-NEXT: %res13 = fcmp une float %x1, %x2
+  %res13 = fcmp une float %x1, %x2
+  
+; CHECK-NEXT: %res14 = fcmp uno float %x1, %x2
+  %res14 = fcmp uno float %x1, %x2
+  
+; CHECK-NEXT: %res15 = fcmp true float %x1, %x2
+  %res15 = fcmp true float %x1, %x2
+  
+; CHECK-NEXT: %res16 = fcmp false float %x1, %x2
+  %res16 = fcmp false float %x1, %x2
+  
+; CHECK-NEXT: %res17 = fcmp oeq <2 x float> %vec1, %vec2
+  %res17 = fcmp oeq <2 x float> %vec1, %vec2
+  
+  ret void
+}
+
+declare i32 @printf(i8* noalias nocapture, ...)
+
+define void @call(i32 %x, i8* %msg ){
+entry:
+
+; CHECK: %res1 = call i32 @test(i32 %x)
+  %res1 = call i32 @test(i32 %x)
+  
+; CHECK-NEXT: %res2 = tail call i32 @test(i32 %x)
+  %res2 = tail call i32 @test(i32 %x)
+  
+; CHECK-NEXT: %res3 = call i32 (i8*, ...)* @printf(i8* %msg, i32 12, i8 42)
+  %res3 = call i32 (i8*, ...)* @printf(i8* %msg, i32 12, i8 42)
+  
+  ret void
+}
+
+define i32 @test(i32 %x){
+entry:
+
+  ret i32 %x
+}

diff --git a/test/Bitcode/miscInstructions.3.2.ll.bc b/test/Bitcode/miscInstructions.3.2.ll.bc
index 9d479b5..ed63d70 100644
--- a/test/Bitcode/miscInstructions.3.2.ll.bc
+++ b/test/Bitcode/miscInstructions.3.2.ll.bc
Binary files differ

diff --git a/test/Bitcode/old-aliases.ll b/test/Bitcode/old-aliases.ll
index 7a0eea2..b73b1a9 100644
--- a/test/Bitcode/old-aliases.ll
+++ b/test/Bitcode/old-aliases.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: verify-uselistorder < %s.bc
 
 ; old-aliases.bc consist of this file assembled with an old llvm-as (3.5 trunk)
 ; from when aliases contained a ConstantExpr.

diff --git a/test/Bitcode/ptest-new.ll b/test/Bitcode/ptest-new.ll
index 735cc9c..c17ddc9 100644
--- a/test/Bitcode/ptest-new.ll
+++ b/test/Bitcode/ptest-new.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 
 define i32 @foo(<2 x i64> %bar) nounwind {
 entry:

diff --git a/test/Bitcode/ptest-old.ll b/test/Bitcode/ptest-old.ll
index fbe962f..c1e1cae 100644
--- a/test/Bitcode/ptest-old.ll
+++ b/test/Bitcode/ptest-old.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 
 define i32 @foo(<4 x float> %bar) nounwind {
 entry:

diff --git a/test/Bitcode/select.ll b/test/Bitcode/select.ll
index 08a3061..3ad0679 100644
--- a/test/Bitcode/select.ll
+++ b/test/Bitcode/select.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 
 define <2 x i32> @main() {
   ret <2 x i32> select (<2 x i1> <i1 false, i1 undef>, <2 x i32> zeroinitializer, <2 x i32> <i32 0, i32 undef>)

diff --git a/test/Bitcode/shuffle.ll b/test/Bitcode/shuffle.ll
index 1495d8e..b84641c 100644
--- a/test/Bitcode/shuffle.ll
+++ b/test/Bitcode/shuffle.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis -disable-output
+; RUN: verify-uselistorder < %s
 
 ; <rdar://problem/8622574>
 ; tests the bitcodereader can handle the case where the reader will initially

diff --git a/test/Bitcode/ssse3_palignr.ll b/test/Bitcode/ssse3_palignr.ll
index 90b4394..8254513 100644
--- a/test/Bitcode/ssse3_palignr.ll
+++ b/test/Bitcode/ssse3_palignr.ll

@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S | FileCheck %s
+; RUN: verify-uselistorder < %s
 ; CHECK-NOT: {@llvm\\.palign}
 
 define <4 x i32> @align1(<4 x i32> %a, <4 x i32> %b) nounwind readnone ssp {

diff --git a/test/Bitcode/standardCIntrinsic.3.2.ll b/test/Bitcode/standardCIntrinsic.3.2.ll
new file mode 100644
index 0000000..09f2378
--- /dev/null
+++ b/test/Bitcode/standardCIntrinsic.3.2.ll

@@ -0,0 +1,16 @@
+; RUN:  llvm-dis < %s.bc| FileCheck %s
+
+; standardCIntrinsic.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread standard C library intrinsic functions
+; of older bitcode files.
+
+define void @memcpyintrinsic(i8* %dest, i8* %src, i32 %len) {
+entry:
+
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i32 1, i1 true)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i32 1, i1 true)
+  
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i32 %align, i1 %isvolatile)
\ No newline at end of file

diff --git a/test/Bitcode/standardCIntrinsic.3.2.ll.bc b/test/Bitcode/standardCIntrinsic.3.2.ll.bc
new file mode 100644
index 0000000..3ffb1af
--- /dev/null
+++ b/test/Bitcode/standardCIntrinsic.3.2.ll.bc
Binary files differ

diff --git a/test/Bitcode/tailcall.ll b/test/Bitcode/tailcall.ll
index 765b470..01190d7 100644
--- a/test/Bitcode/tailcall.ll
+++ b/test/Bitcode/tailcall.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 
 ; Check that musttail and tail roundtrip.
 

diff --git a/test/Bitcode/terminatorInstructions.3.2.ll b/test/Bitcode/terminatorInstructions.3.2.ll
index 31e7896..ba0f5ad 100644
--- a/test/Bitcode/terminatorInstructions.3.2.ll
+++ b/test/Bitcode/terminatorInstructions.3.2.ll

@@ -1,47 +1,76 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; TerminatorOperations.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread terminator instructions from

-; older bitcode files.

-

-define i32 @condbr(i1 %cond){

-entry:

-; CHECK: br i1 %cond, label %TrueLabel, label %FalseLabel

-  br i1 %cond, label %TrueLabel, label %FalseLabel

-  

-  TrueLabel:

-  ret i32 1

-  

-  FalseLabel:

-  ret i32 0

-}

-

-define i32 @uncondbr(){

-entry:

-; CHECK: br label %uncondLabel

-  br label %uncondLabel

-  

-  uncondLabel:

-  ret i32 1

-}

-

-define i32 @indirectbr(i8* %Addr){

-entry:

-; CHECK: indirectbr i8* %Addr, [label %bb1, label %bb2]

-  indirectbr i8* %Addr, [ label %bb1, label %bb2 ]

-  

-  bb1:

-  ret i32 1

-  

-  bb2:

-  ret i32 0

-}

-

-define void @unreachable(){

-entry:

-; CHECK: unreachable

-  unreachable

-  

-  ret void

-}

-

+; RUN:  llvm-dis < %s.bc| FileCheck %s
+
+; terminatorInstructions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread terminator instructions from
+; older bitcode files.
+
+define i32 @condbr(i1 %cond){
+entry:
+; CHECK: br i1 %cond, label %TrueLabel, label %FalseLabel
+  br i1 %cond, label %TrueLabel, label %FalseLabel
+  
+  TrueLabel:
+  ret i32 1
+  
+  FalseLabel:
+  ret i32 0
+}
+
+define i32 @uncondbr(){
+entry:
+; CHECK: br label %uncondLabel
+  br label %uncondLabel
+  
+  uncondLabel:
+  ret i32 1
+}
+
+define i32 @indirectbr(i8* %Addr){
+entry:
+; CHECK: indirectbr i8* %Addr, [label %bb1, label %bb2]
+  indirectbr i8* %Addr, [ label %bb1, label %bb2 ]
+  
+  bb1:
+  ret i32 1
+  
+  bb2:
+  ret i32 0
+}
+
+define void @unreachable(){
+entry:
+; CHECK: unreachable
+  unreachable
+  
+  ret void
+}
+
+define i32 @retInstr(){
+entry:
+; CHECK: ret i32 1 
+  ret i32 1 
+}
+
+define void @retInstr2(){
+entry:
+; CHECK: ret void 
+  ret void
+}
+
+define i32 @switchInstr(i32 %x){
+entry:
+; CHECK: switch i32 %x, label %label3 [
+  switch i32 %x, label %label3 [
+; CHECK-NEXT: i32 1, label %label1   
+  i32 1, label %label1
+; CHECK-NEXT: i32 2, label %label2  
+  i32 2, label %label2
+  ]
+label1:
+  ret i32 1
+label2:
+  ret i32 2
+label3:
+  ret i32 0
+}
+

diff --git a/test/Bitcode/terminatorInstructions.3.2.ll.bc b/test/Bitcode/terminatorInstructions.3.2.ll.bc
index 9d92ead..0fbc319 100644
--- a/test/Bitcode/terminatorInstructions.3.2.ll.bc
+++ b/test/Bitcode/terminatorInstructions.3.2.ll.bc
Binary files differ

diff --git a/test/Bitcode/upgrade-global-ctors.ll b/test/Bitcode/upgrade-global-ctors.ll
index bd253a8..d7afcdd 100644
--- a/test/Bitcode/upgrade-global-ctors.ll
+++ b/test/Bitcode/upgrade-global-ctors.ll

@@ -1,3 +1,5 @@
 ; RUN:  llvm-dis < %s.bc| FileCheck %s
+; RUN:  verify-uselistorder < %s.bc
 
-; CHECK: @llvm.global_ctors = appending global [0 x { i32, void ()*, i8* }] zeroinitializer
+; Global constructors should no longer be upgraded when reading bitcode.
+; CHECK: @llvm.global_ctors = appending global [0 x { i32, void ()* }] zeroinitializer

diff --git a/test/Bitcode/upgrade-loop-metadata.ll b/test/Bitcode/upgrade-loop-metadata.ll
index 1a45056..cebc583 100644
--- a/test/Bitcode/upgrade-loop-metadata.ll
+++ b/test/Bitcode/upgrade-loop-metadata.ll

@@ -1,6 +1,7 @@
 ; Test to make sure loop vectorizer metadata is automatically upgraded.
 ;
 ; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: verify-uselistorder < %s.bc
 
 define void @_Z28loop_with_vectorize_metadatav() {
 entry:
@@ -26,7 +27,7 @@
   ret void
 }
 
-; CHECK: !{metadata !"llvm.loop.vectorize.unroll", i32 4}
+; CHECK: !{metadata !"llvm.loop.interleave.count", i32 4}
 ; CHECK: !{metadata !"llvm.loop.vectorize.width", i32 8}
 ; CHECK: !{metadata !"llvm.loop.vectorize.enable", i1 true}
 

diff --git a/test/Bitcode/upgrade-tbaa.ll b/test/Bitcode/upgrade-tbaa.ll
index e738909..23b4d7d 100644
--- a/test/Bitcode/upgrade-tbaa.ll
+++ b/test/Bitcode/upgrade-tbaa.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; RUN: verify-uselistorder < %s
 
 ; Function Attrs: nounwind
 define void @_Z4testPiPf(i32* nocapture %pI, float* nocapture %pF) #0 {

diff --git a/test/Bitcode/use-list-order.ll b/test/Bitcode/use-list-order.ll
new file mode 100644
index 0000000..6617b9c5
--- /dev/null
+++ b/test/Bitcode/use-list-order.ll

@@ -0,0 +1,168 @@
+; RUN: verify-uselistorder < %s
+
+@a = global [4 x i1] [i1 0, i1 1, i1 0, i1 1]
+@b = alias i1* getelementptr ([4 x i1]* @a, i64 0, i64 2)
+
+; Check use-list order of constants used by globals.
+@glob1 = global i5 7
+@glob2 = global i5 7
+@glob3 = global i5 7
+
+; Check use-list order between variables and aliases.
+@target = global i3 zeroinitializer
+@alias1 = alias i3* @target
+@alias2 = alias i3* @target
+@alias3 = alias i3* @target
+@var1 = global i3* @target
+@var2 = global i3* @target
+@var3 = global i3* @target
+
+; Check use-list order for a global when used both by a global and in a
+; function.
+@globalAndFunction = global i4 4
+@globalAndFunctionGlobalUser = global i4* @globalAndFunction
+
+; Check use-list order for constants used by globals that are themselves used
+; as aliases.  This confirms that these globals are recognized as GlobalValues
+; (not general constants).
+@const.global = global i63 0
+@const.global.ptr = global i63* @const.global
+@const.global.2 = global i63 0
+
+; Same as above, but for aliases.
+@const.target = global i62 1
+@const.alias = alias i62* @const.target
+@const.alias.ptr = alias i62* @const.alias
+@const.alias.2 = alias i62* @const.target
+
+define i64 @f(i64 %f) {
+entry:
+  %sum = add i64 %f, 0
+  ret i64 %sum
+}
+
+define i64 @g(i64 %g) {
+entry:
+  %sum = add i64 %g, 0
+  ret i64 %sum
+}
+
+define i64 @h(i64 %h) {
+entry:
+  %sum = add i64 %h, 0
+  ret i64 %sum
+}
+
+define i64 @i(i64 %i) {
+entry:
+  %sum = add i64 %i, 1
+  ret i64 %sum
+}
+
+define i64 @j(i64 %j) {
+entry:
+  %sum = add i64 %j, 1
+  ret i64 %sum
+}
+
+define i64 @k(i64 %k) {
+entry:
+  %sum = add i64 %k, 1
+  ret i64 %sum
+}
+
+define i64 @l(i64 %l) {
+entry:
+  %sum = add i64 %l, 1
+  ret i64 %sum
+}
+
+define i1 @loadb() {
+entry:
+  %b = load i1* @b
+  ret i1 %b
+}
+
+define i1 @loada() {
+entry:
+  %a = load i1* getelementptr ([4 x i1]* @a, i64 0, i64 2)
+  ret i1 %a
+}
+
+define i32 @f32(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  br label %first
+
+second:
+  %eh = mul i32 %e, %h
+  %sum = add i32 %eh, %ef
+  br label %exit
+
+exit:
+  %product = phi i32 [%ef, %first], [%sum, %second]
+  ret i32 %product
+
+first:
+  %e = add i32 %a, 7
+  %f = add i32 %b, 7
+  %g = add i32 %c, 8
+  %h = add i32 %d, 8
+  %ef = mul i32 %e, %f
+  %gh = mul i32 %g, %h
+  %gotosecond = icmp slt i32 %gh, -9
+  br i1 %gotosecond, label %second, label %exit
+}
+
+define i4 @globalAndFunctionFunctionUser() {
+entry:
+  %local = load i4* @globalAndFunction
+  ret i4 %local
+}
+
+; Check for when an instruction is its own user.
+define void @selfUser(i1 %a) {
+entry:
+  ret void
+
+loop1:
+  br label %loop2
+
+loop2:
+  %var = phi i32 [ %var, %loop1 ], [ %var, %loop2 ]
+  br label %loop2
+}
+
+; Check that block addresses work.
+@ba1 = constant i8* blockaddress (@bafunc1, %bb)
+@ba2 = constant i8* getelementptr (i8* blockaddress (@bafunc2, %bb), i61 0)
+@ba3 = constant i8* getelementptr (i8* blockaddress (@bafunc2, %bb), i61 0)
+
+define i8* @babefore() {
+  ret i8* getelementptr (i8* blockaddress (@bafunc2, %bb), i61 0)
+bb1:
+  ret i8* blockaddress (@bafunc1, %bb)
+bb2:
+  ret i8* blockaddress (@bafunc3, %bb)
+}
+define void @bafunc1() {
+  unreachable
+bb:
+  unreachable
+}
+define void @bafunc2() {
+  unreachable
+bb:
+  unreachable
+}
+define void @bafunc3() {
+  unreachable
+bb:
+  unreachable
+}
+define i8* @baafter() {
+  ret i8* blockaddress (@bafunc2, %bb)
+bb1:
+  ret i8* blockaddress (@bafunc1, %bb)
+bb2:
+  ret i8* blockaddress (@bafunc3, %bb)
+}

diff --git a/test/Bitcode/variableArgumentIntrinsic.3.2.ll b/test/Bitcode/variableArgumentIntrinsic.3.2.ll
index 35fe0e2..ad70f05 100644
--- a/test/Bitcode/variableArgumentIntrinsic.3.2.ll
+++ b/test/Bitcode/variableArgumentIntrinsic.3.2.ll

@@ -1,33 +1,34 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; vaArgIntrinsic.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread variable argument intrinsic instructions

-; of older bitcode files.

-

-define i32 @varArgIntrinsic(i32 %X, ...) {

-

-  %ap = alloca i8*

-  %ap2 = bitcast i8** %ap to i8*

-  

-; CHECK: call void @llvm.va_start(i8* %ap2)

-  call void @llvm.va_start(i8* %ap2)

-

-; CHECK-NEXT: %tmp = va_arg i8** %ap, i32

-  %tmp = va_arg i8** %ap, i32

-

-  %aq = alloca i8*

-  %aq2 = bitcast i8** %aq to i8*

-  

-; CHECK: call void @llvm.va_copy(i8* %aq2, i8* %ap2)

-  call void @llvm.va_copy(i8* %aq2, i8* %ap2)

-; CHECK-NEXT: call void @llvm.va_end(i8* %aq2)

-  call void @llvm.va_end(i8* %aq2)

-

-; CHECK-NEXT:  call void @llvm.va_end(i8* %ap2)

-  call void @llvm.va_end(i8* %ap2)

-  ret i32 %tmp

-}

-

-declare void @llvm.va_start(i8*)

-declare void @llvm.va_copy(i8*, i8*)

-declare void @llvm.va_end(i8*)
\ No newline at end of file
+; RUN: llvm-dis < %s.bc| FileCheck %s
+; RUN: verify-uselistorder < %s.bc
+
+; variableArgumentIntrinsic.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread variable argument intrinsic instructions
+; of older bitcode files.
+
+define i32 @varArgIntrinsic(i32 %X, ...) {
+
+  %ap = alloca i8*
+  %ap2 = bitcast i8** %ap to i8*
+
+; CHECK: call void @llvm.va_start(i8* %ap2)
+  call void @llvm.va_start(i8* %ap2)
+
+; CHECK-NEXT: %tmp = va_arg i8** %ap, i32
+  %tmp = va_arg i8** %ap, i32
+
+  %aq = alloca i8*
+  %aq2 = bitcast i8** %aq to i8*
+
+; CHECK: call void @llvm.va_copy(i8* %aq2, i8* %ap2)
+  call void @llvm.va_copy(i8* %aq2, i8* %ap2)
+; CHECK-NEXT: call void @llvm.va_end(i8* %aq2)
+  call void @llvm.va_end(i8* %aq2)
+
+; CHECK-NEXT:  call void @llvm.va_end(i8* %ap2)
+  call void @llvm.va_end(i8* %ap2)
+  ret i32 %tmp
+}
+
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_copy(i8*, i8*)
+declare void @llvm.va_end(i8*)

diff --git a/test/Bitcode/vectorInstructions.3.2.ll b/test/Bitcode/vectorInstructions.3.2.ll
index b24ef75..94c193a 100644
--- a/test/Bitcode/vectorInstructions.3.2.ll
+++ b/test/Bitcode/vectorInstructions.3.2.ll

@@ -1,34 +1,33 @@
-; RUN:  llvm-dis < %s.bc| FileCheck %s

-

-; vectorOperations.3.2.ll.bc was generated by passing this file to llvm-as-3.2.

-; The test checks that LLVM does not misread vector operations of

-; older bitcode files.

-

-define void @extractelement(<2 x i8> %x1){

-entry:

-; CHECK: %res1 = extractelement <2 x i8> %x1, i32 0

-  %res1 = extractelement <2 x i8> %x1, i32 0

-  

-  ret void

-}

-

-define void @insertelement(<2 x i8> %x1){

-entry:

-; CHECK: %res1 = insertelement <2 x i8> %x1, i8 0, i32 0

-  %res1 = insertelement <2 x i8> %x1, i8 0, i32 0

-  

-  ret void

-}

-

-define void @shufflevector(<2 x i8> %x1){

-entry:

-; CHECK: %res1 = shufflevector <2 x i8> %x1, <2 x i8> %x1, <2 x i32> <i32 0, i32 1>

-  %res1 = shufflevector <2 x i8> %x1, <2 x i8> %x1, <2 x i32> <i32 0, i32 1>

-

-; CHECK-NEXT: %res2 = shufflevector <2 x i8> %x1, <2 x i8> undef, <2 x i32> <i32 0, i32 1>

-  %res2 = shufflevector <2 x i8> %x1, <2 x i8> undef, <2 x i32> <i32 0, i32 1>

-  

-  ret void

-}

-

-

+; RUN: llvm-dis < %s.bc| FileCheck %s
+; RUN: verify-uselistorder < %s.bc
+
+; vectorInstructions.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
+; The test checks that LLVM does not misread vector operations of
+; older bitcode files.
+
+define void @extractelement(<2 x i8> %x1){
+entry:
+; CHECK: %res1 = extractelement <2 x i8> %x1, i32 0
+  %res1 = extractelement <2 x i8> %x1, i32 0
+
+  ret void
+}
+
+define void @insertelement(<2 x i8> %x1){
+entry:
+; CHECK: %res1 = insertelement <2 x i8> %x1, i8 0, i32 0
+  %res1 = insertelement <2 x i8> %x1, i8 0, i32 0
+
+  ret void
+}
+
+define void @shufflevector(<2 x i8> %x1){
+entry:
+; CHECK: %res1 = shufflevector <2 x i8> %x1, <2 x i8> %x1, <2 x i32> <i32 0, i32 1>
+  %res1 = shufflevector <2 x i8> %x1, <2 x i8> %x1, <2 x i32> <i32 0, i32 1>
+
+; CHECK-NEXT: %res2 = shufflevector <2 x i8> %x1, <2 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %res2 = shufflevector <2 x i8> %x1, <2 x i8> undef, <2 x i32> <i32 0, i32 1>
+
+  ret void
+}

diff --git a/test/Bitcode/visibility-styles.3.2.ll b/test/Bitcode/visibility-styles.3.2.ll
index ec2ee68..e36c0e0 100644
--- a/test/Bitcode/visibility-styles.3.2.ll
+++ b/test/Bitcode/visibility-styles.3.2.ll

@@ -1,4 +1,5 @@
 ; RUN:  llvm-dis < %s.bc| FileCheck %s
+; RUN:  verify-uselistorder < %s.bc
 
 ; visibility-styles.3.2.ll.bc was generated by passing this file to llvm-as-3.2.
 ; The test checks that LLVM does not silently misread visibility styles of

diff --git a/test/Bitcode/weak-cmpxchg-upgrade.ll b/test/Bitcode/weak-cmpxchg-upgrade.ll
index dbcd150..76b857b 100644
--- a/test/Bitcode/weak-cmpxchg-upgrade.ll
+++ b/test/Bitcode/weak-cmpxchg-upgrade.ll

@@ -1,4 +1,5 @@
 ; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: verify-uselistorder < %s.bc
 
 ; cmpxchg-upgrade.ll.bc was produced by running a version of llvm-as from just
 ; before the IR change on this file.

diff --git a/test/BugPoint/compile-custom.ll b/test/BugPoint/compile-custom.ll
index e9016ff..d152f08 100755
--- a/test/BugPoint/compile-custom.ll
+++ b/test/BugPoint/compile-custom.ll

@@ -1,4 +1,4 @@
-; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%s.py arg1 arg2" --output-prefix %t %s | FileCheck %s
+; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext --compile-custom --compile-command="%python %s.py arg1 arg2" --output-prefix %t %s | FileCheck %s
 ; REQUIRES: loadable_module
 
 ; Test that arguments are correctly passed in --compile-command.  The output

diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll
index cc043f0..1c27a49 100644
--- a/test/BugPoint/metadata.ll
+++ b/test/BugPoint/metadata.ll

@@ -6,7 +6,7 @@
 
 ; CHECK: call void @foo(), !dbg ![[LOC:[0-9]+]], !attach ![[CALL:[0-9]+]]
 ; CHECK: ![[LOC]] = metadata !{i32 104, i32 105, metadata ![[SCOPE:[0-9]+]], metadata ![[SCOPE]]}
-; CHECK: ![[SCOPE]] = metadata !{i32 458769, metadata ![[FILE:[0-9]+]], i32 0, metadata !"me", i1 true, metadata !"", i32 0, metadata ![[LIST:[0-9]+]], metadata ![[LIST]], null, null, null, metadata !""}
+; CHECK: ![[SCOPE]] = metadata !{metadata !"0x11\000\00me\001\00\000\00\000", metadata ![[FILE:[0-9]+]], metadata ![[LIST:[0-9]+]], metadata ![[LIST]], null, null, null}
 ; CHECK: ![[FILE]] = metadata !{metadata !"source.c", metadata !"/dir"}
 ; CHECK: ![[LIST]] = metadata !{i32 0}
 ; CHECK: ![[CALL]] = metadata !{metadata !"the call to foo"}
@@ -31,7 +31,7 @@
 !3 = metadata !{metadata !"noise"}
 !4 = metadata !{metadata !"filler"}
 
-!9 = metadata !{i32 458769, metadata !15, i32 0, metadata !"me", i1 true, metadata !"", i32 0, metadata !16, metadata !16, null, null, null, metadata !""}
+!9 = metadata !{metadata !"0x11\000\00me\001\00\000\00\000", metadata !15, metadata !16, metadata !16, null, null, null} ; [ DW_TAG_compile_unit ]
 !10 = metadata !{i32 100, i32 101, metadata !9, metadata !9}
 !11 = metadata !{i32 102, i32 103, metadata !9, metadata !9}
 !12 = metadata !{i32 104, i32 105, metadata !9, metadata !9}
@@ -39,4 +39,4 @@
 !14 = metadata !{i32 108, i32 109, metadata !9, metadata !9}
 !15 = metadata !{metadata !"source.c", metadata !"/dir"}
 !16 = metadata !{i32 0}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 3e08a16..bdb5d79 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt

@@ -30,8 +30,8 @@
           llvm-cov
           llvm-diff
           llvm-dis
-          llvm-extract
           llvm-dwarfdump
+          llvm-extract
           llvm-link
           llvm-lto
           llvm-mc
@@ -44,6 +44,7 @@
           llvm-rtdyld
           llvm-symbolizer
           llvm-tblgen
+          llvm-vtabledump
           macho-dump
           opt
           FileCheck
@@ -51,6 +52,7 @@
           not
           yaml2obj
           obj2yaml
+          verify-uselistorder
         )
 
 # If Intel JIT events are supported, depend on a tool that tests the listener.
@@ -58,6 +60,14 @@
   set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} llvm-jitlistener)
 endif( LLVM_USE_INTEL_JITEVENTS )
 
+if(TARGET LLVMgold)
+  set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} LLVMgold)
+endif()
+
+if(TARGET llvm-go)
+  set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} llvm-go)
+endif()
+
 add_lit_testsuite(check-llvm "Running the LLVM regression tests"
   ${CMAKE_CURRENT_BINARY_DIR}
   PARAMS llvm_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg

diff --git a/test/CodeGen/AArch64/PBQP-chain.ll b/test/CodeGen/AArch64/PBQP-chain.ll
new file mode 100644
index 0000000..c4ba026
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP-chain.ll

@@ -0,0 +1,104 @@
+; RUN: llc < %s -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+;
+; Test PBQP is able to fulfill the accumulator chaining constraint.
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: fir
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-EVEN: fmadd {{d[0-9]*[02468]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[02468]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+; CHECK-ODD: fmadd {{d[0-9]*[13579]}}, {{d[0-9]*}}, {{d[0-9]*}}, {{d[0-9]*[13579]}}
+define void @fir(double* nocapture %rx, double* nocapture %ry, double* nocapture %c, double* nocapture %x, double* nocapture %y) {
+entry:
+  %0 = load double* %c, align 8
+  %1 = load double* %x, align 8
+  %mul = fmul fast double %1, %0
+  %2 = load double* %y, align 8
+  %mul7 = fmul fast double %2, %0
+  %arrayidx.1 = getelementptr inbounds double* %c, i64 1
+  %3 = load double* %arrayidx.1, align 8
+  %arrayidx2.1 = getelementptr inbounds double* %x, i64 1
+  %4 = load double* %arrayidx2.1, align 8
+  %mul.1 = fmul fast double %4, %3
+  %add.1 = fadd fast double %mul.1, %mul
+  %arrayidx6.1 = getelementptr inbounds double* %y, i64 1
+  %5 = load double* %arrayidx6.1, align 8
+  %mul7.1 = fmul fast double %5, %3
+  %add8.1 = fadd fast double %mul7.1, %mul7
+  %arrayidx.2 = getelementptr inbounds double* %c, i64 2
+  %6 = load double* %arrayidx.2, align 8
+  %arrayidx2.2 = getelementptr inbounds double* %x, i64 2
+  %7 = load double* %arrayidx2.2, align 8
+  %mul.2 = fmul fast double %7, %6
+  %add.2 = fadd fast double %mul.2, %add.1
+  %arrayidx6.2 = getelementptr inbounds double* %y, i64 2
+  %8 = load double* %arrayidx6.2, align 8
+  %mul7.2 = fmul fast double %8, %6
+  %add8.2 = fadd fast double %mul7.2, %add8.1
+  %arrayidx.3 = getelementptr inbounds double* %c, i64 3
+  %9 = load double* %arrayidx.3, align 8
+  %arrayidx2.3 = getelementptr inbounds double* %x, i64 3
+  %10 = load double* %arrayidx2.3, align 8
+  %mul.3 = fmul fast double %10, %9
+  %add.3 = fadd fast double %mul.3, %add.2
+  %arrayidx6.3 = getelementptr inbounds double* %y, i64 3
+  %11 = load double* %arrayidx6.3, align 8
+  %mul7.3 = fmul fast double %11, %9
+  %add8.3 = fadd fast double %mul7.3, %add8.2
+  %arrayidx.4 = getelementptr inbounds double* %c, i64 4
+  %12 = load double* %arrayidx.4, align 8
+  %arrayidx2.4 = getelementptr inbounds double* %x, i64 4
+  %13 = load double* %arrayidx2.4, align 8
+  %mul.4 = fmul fast double %13, %12
+  %add.4 = fadd fast double %mul.4, %add.3
+  %arrayidx6.4 = getelementptr inbounds double* %y, i64 4
+  %14 = load double* %arrayidx6.4, align 8
+  %mul7.4 = fmul fast double %14, %12
+  %add8.4 = fadd fast double %mul7.4, %add8.3
+  %arrayidx.5 = getelementptr inbounds double* %c, i64 5
+  %15 = load double* %arrayidx.5, align 8
+  %arrayidx2.5 = getelementptr inbounds double* %x, i64 5
+  %16 = load double* %arrayidx2.5, align 8
+  %mul.5 = fmul fast double %16, %15
+  %add.5 = fadd fast double %mul.5, %add.4
+  %arrayidx6.5 = getelementptr inbounds double* %y, i64 5
+  %17 = load double* %arrayidx6.5, align 8
+  %mul7.5 = fmul fast double %17, %15
+  %add8.5 = fadd fast double %mul7.5, %add8.4
+  %arrayidx.6 = getelementptr inbounds double* %c, i64 6
+  %18 = load double* %arrayidx.6, align 8
+  %arrayidx2.6 = getelementptr inbounds double* %x, i64 6
+  %19 = load double* %arrayidx2.6, align 8
+  %mul.6 = fmul fast double %19, %18
+  %add.6 = fadd fast double %mul.6, %add.5
+  %arrayidx6.6 = getelementptr inbounds double* %y, i64 6
+  %20 = load double* %arrayidx6.6, align 8
+  %mul7.6 = fmul fast double %20, %18
+  %add8.6 = fadd fast double %mul7.6, %add8.5
+  %arrayidx.7 = getelementptr inbounds double* %c, i64 7
+  %21 = load double* %arrayidx.7, align 8
+  %arrayidx2.7 = getelementptr inbounds double* %x, i64 7
+  %22 = load double* %arrayidx2.7, align 8
+  %mul.7 = fmul fast double %22, %21
+  %add.7 = fadd fast double %mul.7, %add.6
+  %arrayidx6.7 = getelementptr inbounds double* %y, i64 7
+  %23 = load double* %arrayidx6.7, align 8
+  %mul7.7 = fmul fast double %23, %21
+  %add8.7 = fadd fast double %mul7.7, %add8.6
+  store double %add.7, double* %rx, align 8
+  store double %add8.7, double* %ry, align 8
+  ret void
+}
+

diff --git a/test/CodeGen/AArch64/PBQP-coalesce-benefit.ll b/test/CodeGen/AArch64/PBQP-coalesce-benefit.ll
new file mode 100644
index 0000000..45ac5e6
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP-coalesce-benefit.ll

@@ -0,0 +1,14 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s
+
+; CHECK-LABEL: test:
+define i32 @test(i32 %acc, i32* nocapture readonly %c) {
+entry:
+  %0 = load i32* %c, align 4
+; CHECK-NOT: mov	 w{{[0-9]*}}, w0
+  %add = add nsw i32 %0, %acc
+  %arrayidx1 = getelementptr inbounds i32* %c, i64 1
+  %1 = load i32* %arrayidx1, align 4
+  %add2 = add nsw i32 %add, %1
+  ret i32 %add2
+}
+

diff --git a/test/CodeGen/AArch64/PBQP-csr.ll b/test/CodeGen/AArch64/PBQP-csr.ll
new file mode 100644
index 0000000..64335ae
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP-csr.ll

@@ -0,0 +1,91 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -mattr=+neon -fp-contract=fast -regalloc=pbqp -pbqp-coalescing | FileCheck %s
+
+%pl = type { i32, i32, i32, i32, %p*, %l*, double* }
+%p = type { i32, %ca*, [27 x %ca*], %v*, %v*, %v*, i32 }
+%ca = type { %v, float, i32 }
+%v = type { double, double, double }
+%l = type opaque
+%rs = type { i32, i32, i32, i32, %v*, %v*, [21 x double], %v, %v, %v, double, double, double }
+
+;CHECK-LABEL: test_csr
+define void @test_csr(%pl* nocapture readnone %this, %rs* nocapture %r) align 2 {
+;CHECK-NOT: stp {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %x.i = getelementptr inbounds %rs* %r, i64 0, i32 7, i32 0
+  %y.i = getelementptr inbounds %rs* %r, i64 0, i32 7, i32 1
+  %z.i = getelementptr inbounds %rs* %r, i64 0, i32 7, i32 2
+  %x.i61 = getelementptr inbounds %rs* %r, i64 0, i32 8, i32 0
+  %y.i62 = getelementptr inbounds %rs* %r, i64 0, i32 8, i32 1
+  %z.i63 = getelementptr inbounds %rs* %r, i64 0, i32 8, i32 2
+  %x.i58 = getelementptr inbounds %rs* %r, i64 0, i32 9, i32 0
+  %y.i59 = getelementptr inbounds %rs* %r, i64 0, i32 9, i32 1
+  %z.i60 = getelementptr inbounds %rs* %r, i64 0, i32 9, i32 2
+  %na = getelementptr inbounds %rs* %r, i64 0, i32 0
+  %0 = bitcast double* %x.i to i8*
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 72, i32 8, i1 false)
+  %1 = load i32* %na, align 4
+  %cmp70 = icmp sgt i32 %1, 0
+  br i1 %cmp70, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %fn = getelementptr inbounds %rs* %r, i64 0, i32 4
+  %2 = load %v** %fn, align 8
+  %fs = getelementptr inbounds %rs* %r, i64 0, i32 5
+  %3 = load %v** %fs, align 8
+  %4 = sext i32 %1 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %5 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add6.i, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %6 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %17, %for.body ]
+  %7 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %22, %for.body ]
+  %8 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %26, %for.body ]
+  %9 = phi <2 x double> [ zeroinitializer, %for.body.lr.ph ], [ %28, %for.body ]
+  %x.i54 = getelementptr inbounds %v* %2, i64 %indvars.iv, i32 0
+  %x1.i = getelementptr inbounds %v* %3, i64 %indvars.iv, i32 0
+  %y.i56 = getelementptr inbounds %v* %2, i64 %indvars.iv, i32 1
+  %10 = bitcast double* %x.i54 to <2 x double>*
+  %11 = load <2 x double>* %10, align 8
+  %y2.i = getelementptr inbounds %v* %3, i64 %indvars.iv, i32 1
+  %12 = bitcast double* %x1.i to <2 x double>*
+  %13 = load <2 x double>* %12, align 8
+  %14 = fadd fast <2 x double> %13, %11
+  %z.i57 = getelementptr inbounds %v* %2, i64 %indvars.iv, i32 2
+  %15 = load double* %z.i57, align 8
+  %z4.i = getelementptr inbounds %v* %3, i64 %indvars.iv, i32 2
+  %16 = load double* %z4.i, align 8
+  %add5.i = fadd fast double %16, %15
+  %17 = fadd fast <2 x double> %6, %11
+  %18 = bitcast double* %x.i to <2 x double>*
+  store <2 x double> %17, <2 x double>* %18, align 8
+  %19 = load double* %x1.i, align 8
+  %20 = insertelement <2 x double> undef, double %15, i32 0
+  %21 = insertelement <2 x double> %20, double %19, i32 1
+  %22 = fadd fast <2 x double> %7, %21
+  %23 = bitcast double* %z.i to <2 x double>*
+  store <2 x double> %22, <2 x double>* %23, align 8
+  %24 = bitcast double* %y2.i to <2 x double>*
+  %25 = load <2 x double>* %24, align 8
+  %26 = fadd fast <2 x double> %8, %25
+  %27 = bitcast double* %y.i62 to <2 x double>*
+  store <2 x double> %26, <2 x double>* %27, align 8
+  %28 = fadd fast <2 x double> %14, %9
+  %29 = bitcast double* %x.i58 to <2 x double>*
+  store <2 x double> %28, <2 x double>* %29, align 8
+  %add6.i = fadd fast double %add5.i, %5
+  store double %add6.i, double* %z.i60, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp slt i64 %indvars.iv.next, %4
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+

diff --git a/test/CodeGen/AArch64/PBQP.ll b/test/CodeGen/AArch64/PBQP.ll
new file mode 100644
index 0000000..675a2ca
--- /dev/null
+++ b/test/CodeGen/AArch64/PBQP.ll

@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -regalloc=pbqp -pbqp-coalescing -o - %s | FileCheck %s
+
+define i32 @foo(i32 %a) {
+; CHECK-LABEL: foo:
+; CHECK: bl bar
+; CHECK: bl baz
+  %call = call i32 @bar(i32 %a)
+  %call1 = call i32 @baz(i32 %call)
+  ret i32 %call1
+}
+
+declare i32 @bar(i32)
+declare i32 @baz(i32)
+

diff --git a/test/CodeGen/AArch64/Redundantstore.ll b/test/CodeGen/AArch64/Redundantstore.ll
new file mode 100644
index 0000000..72f7f46
--- /dev/null
+++ b/test/CodeGen/AArch64/Redundantstore.ll

@@ -0,0 +1,25 @@
+; RUN: llc -O3 -march=aarch64 < %s | FileCheck %s 
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+@end_of_array = common global i8* null, align 8
+
+; CHECK-LABEL: @test
+; CHECK: stur
+; CHECK-NOT: stur
+define i8* @test(i32 %size) {
+entry:
+  %0 = load i8** @end_of_array, align 8
+  %conv = sext i32 %size to i64
+  %and = and i64 %conv, -8
+  %conv2 = trunc i64 %and to i32
+  %add.ptr.sum = add nsw i64 %and, -4
+  %add.ptr3 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+  %size4 = bitcast i8* %add.ptr3 to i32*
+  store i32 %conv2, i32* %size4, align 4
+  %add.ptr.sum9 = add nsw i64 %and, -4
+  %add.ptr5 = getelementptr inbounds i8* %0, i64 %add.ptr.sum9
+  %size6 = bitcast i8* %add.ptr5 to i32*
+  store i32 %conv2, i32* %size6, align 4
+  ret i8* %0
+}
+

diff --git a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
new file mode 100644
index 0000000..4da33a0
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll

@@ -0,0 +1,106 @@
+; RUN: llc < %s -O2 -mtriple=aarch64-none-linux-gnu 
+
+; Bug 20598
+
+
+define void @test() #0 {
+entry:
+  br label %for.body, !dbg !39
+
+for.body:                                         ; preds = %for.body, %entry
+  %arrayidx5 = getelementptr inbounds i32* null, i64 1, !dbg !43
+  %0 = load i32* null, align 4, !dbg !45, !tbaa !46
+  %s1 = sub nsw i32 0, %0, !dbg !50
+  %n1 = sext i32 %s1 to i64, !dbg !50
+  %arrayidx21 = getelementptr inbounds i32* null, i64 3, !dbg !51
+  %add53 = add nsw i64 %n1, 0, !dbg !52
+  %add55 = add nsw i64 %n1, 0, !dbg !53
+  %mul63 = mul nsw i64 %add53, -20995, !dbg !54
+  tail call void @llvm.dbg.value(metadata !{i64 %mul63}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !55
+  %mul65 = mul nsw i64 %add55, -3196, !dbg !56
+  %add67 = add nsw i64 0, %mul65, !dbg !57
+  %add80 = add i64 0, 1024, !dbg !58
+  %add81 = add i64 %add80, %mul63, !dbg !58
+  %add82 = add i64 %add81, 0, !dbg !58
+  %shr83351 = lshr i64 %add82, 11, !dbg !58
+  %conv84 = trunc i64 %shr83351 to i32, !dbg !58
+  store i32 %conv84, i32* %arrayidx21, align 4, !dbg !58, !tbaa !46
+  %add86 = add i64 0, 1024, !dbg !59
+  %add87 = add i64 %add86, 0, !dbg !59
+  %add88 = add i64 %add87, %add67, !dbg !59
+  %shr89352 = lshr i64 %add88, 11, !dbg !59
+  %n2 = trunc i64 %shr89352 to i32, !dbg !59
+  store i32 %n2, i32* %arrayidx5, align 4, !dbg !59, !tbaa !46
+  br label %for.body, !dbg !39
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!36, !37}
+!llvm.ident = !{!38}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [] [] []
+!1 = metadata !{metadata !"test.c", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00\00\00\00140\000\001\000\006\00256\001\00141", metadata !1, metadata !5, metadata !6, null, void ()* @test, null, null, metadata !12} ; [ DW_TAG_subprogram ] [] [] [def] [scope 141] []
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ] [] []
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [] [] [from ]
+!7 = metadata !{null, metadata !8}
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [] [] []
+!9 = metadata !{metadata !"0x16\00\0030\000\000\000\000", metadata !10, null, metadata !11} ; [ DW_TAG_typedef ] [] [] [] [from int]
+!10 = metadata !{metadata !"", metadata !""}
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [int] []
+!12 = metadata !{metadata !13, metadata !14, metadata !18, metadata !19, metadata !20, metadata !21, metadata !22, metadata !23, metadata !24, metadata !25, metadata !26, metadata !27, metadata !28, metadata !29, metadata !30, metadata !31, metadata !32, metadata !33, metadata !34, metadata !35}
+!13 = metadata !{metadata !"0x101\00\0016777356\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [] [data] []
+!14 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!15 = metadata !{metadata !"0x16\00\00183\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [] [INT32] [] [from long int]
+!16 = metadata !{metadata !"", metadata !""}
+!17 = metadata !{metadata !"0x24\00\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [] [long int] []
+!18 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!19 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!20 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!21 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!22 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!23 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [] [] []
+!24 = metadata !{metadata !"0x100\00\00142\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!25 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!26 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!27 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!28 = metadata !{metadata !"0x100\00\00143\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!29 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!30 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!31 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!32 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [ ] [] []
+!33 = metadata !{metadata !"0x100\00\00144\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [  ] [] []
+!34 = metadata !{metadata !"0x100\00\00145\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [  ] [] []
+!35 = metadata !{metadata !"0x100\00\00146\000", metadata !4, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [  ] [] []
+!36 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!37 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!38 = metadata !{metadata !"clang version 3.6.0 "}
+!39 = metadata !{i32 154, i32 8, metadata !40, null}
+!40 = metadata !{metadata !"0xb\00154\008\002", metadata !1, metadata !41} ; [ DW_TAG_lexical_block ] [  ] []
+!41 = metadata !{metadata !"0xb\00154\008\001", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ] [  ] []
+!42 = metadata !{metadata !"0xb\00154\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [  ] []
+!43 = metadata !{i32 157, i32 5, metadata !44, null}
+!44 = metadata !{metadata !"0xb\00154\0042\000", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ] [  ] []
+!45 = metadata !{i32 159, i32 5, metadata !44, null}
+!46 = metadata !{metadata !47, metadata !47, i64 0}
+!47 = metadata !{metadata !"int", metadata !48, i64 0}
+!48 = metadata !{metadata !"omnipotent char", metadata !49, i64 0}
+!49 = metadata !{metadata !"Simple C/C++ TBAA"}
+!50 = metadata !{i32 160, i32 5, metadata !44, null}
+!51 = metadata !{i32 161, i32 5, metadata !44, null}
+!52 = metadata !{i32 188, i32 5, metadata !44, null}
+!53 = metadata !{i32 190, i32 5, metadata !44, null}
+!54 = metadata !{i32 198, i32 5, metadata !44, null}
+!55 = metadata !{i32 144, i32 13, metadata !4, null}
+!56 = metadata !{i32 200, i32 5, metadata !44, null}
+!57 = metadata !{i32 203, i32 5, metadata !44, null}
+!58 = metadata !{i32 207, i32 5, metadata !44, null}
+!59 = metadata !{i32 208, i32 5, metadata !44, null}

diff --git a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
index fb229fc..7108bc0 100644
--- a/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ b/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll

@@ -1,5 +1,7 @@
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-ODD
 
 ; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
 ; our test strategy is to:
@@ -73,7 +75,9 @@
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK: stp [[x]], [[y]]
+; CHECK-A57: stp [[x]], [[y]]
+; CHECK-A53-DAG: str [[x]]
+; CHECK-A53-DAG: str [[y]]
 
 define void @f2(double* nocapture readonly %p, double* nocapture %q) #0 {
 entry:
@@ -166,7 +170,9 @@
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK: stp [[x]], [[y]]
+; CHECK-A57: stp [[x]], [[y]]
+; CHECK-A53-DAG: str [[x]]
+; CHECK-A53-DAG: str [[y]]
 
 define void @f4(float* nocapture readonly %p, float* nocapture %q) #0 {
 entry:

diff --git a/test/CodeGen/AArch64/aarch64-be-bv.ll b/test/CodeGen/AArch64/aarch64-be-bv.ll
new file mode 100644
index 0000000..01642a4
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-be-bv.ll

@@ -0,0 +1,831 @@
+; RUN: llc -mtriple=aarch64_be--linux-gnu < %s | FileCheck %s
+
+@vec_v8i16 = global <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+
+; CHECK-LABEL: movi_modimm_t1:
+define i16 @movi_modimm_t1() nounwind { ; add constant expected as one "movi .4s, #0x1"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0> ; repeating i16 pair <0x0001, 0x0000>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t2:
+define i16 @movi_modimm_t2() nounwind { ; add constant expected as one "movi .4s, #0x1, lsl #8"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0> ; repeating i16 pair <0x0100, 0x0000>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t3:
+define i16 @movi_modimm_t3() nounwind { ; add constant expected as one "movi .4s, #0x1, lsl #16"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1> ; repeating i16 pair <0x0000, 0x0001>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t4:
+define i16 @movi_modimm_t4() nounwind { ; add constant expected as one "movi .4s, #0x1, lsl #24"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256> ; repeating i16 pair <0x0000, 0x0100>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t5:
+define i16 @movi_modimm_t5() nounwind { ; add constant expected as one "movi .8h, #0x1"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].8h, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> ; every i16 lane is 0x0001
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t6:
+define i16 @movi_modimm_t6() nounwind { ; add constant expected as one "movi .8h, #0x1, lsl #8"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256> ; every i16 lane is 0x0100
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t7:
+define i16 @movi_modimm_t7() nounwind { ; add constant expected as one "movi .4s, #0x1, msl #8"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, msl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 511, i16 0, i16 511, i16 0, i16 511, i16 0, i16 511, i16 0> ; repeating i16 pair <0x01FF, 0x0000>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t8:
+define i16 @movi_modimm_t8() nounwind { ; add constant expected as one "movi .4s, #0x1, msl #16"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].4s, #0x1, msl #16
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 65535, i16 1, i16 65535, i16 1, i16 65535, i16 1, i16 65535, i16 1> ; repeating i16 pair <0xFFFF, 0x0001>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t9:
+define i16 @movi_modimm_t9() nounwind { ; add constant expected as one "movi .16b, #0x1"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].16b, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257> ; every i16 lane is 0x0101
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: movi_modimm_t10:
+define i16 @movi_modimm_t10() nounwind { ; add constant expected as one 64-bit byte-mask "movi .2d"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    movi	   v[[REG2:[0-9]+]].2d, #0x00ffff0000ffff
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 0> ; repeating i16 pair <0xFFFF, 0x0000>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: fmov_modimm_t11:
+define i16 @fmov_modimm_t11() nounwind { ; integer add constant expected as one "fmov .4s, #3.0"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    fmov    v[[REG2:[0-9]+]].4s, #3.00000000
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448> ; repeating i16 pair <0x0000, 0x4040>; 0x40400000 is the float 3.0 bit pattern
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: fmov_modimm_t12:
+define i16 @fmov_modimm_t12() nounwind { ; integer add constant expected as one "fmov .2d, #0.1796875"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    fmov    v[[REG2:[0-9]+]].2d, #0.17968750
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 0, i16 0, i16 0, i16 16327, i16 0, i16 0, i16 0, i16 16327> ; repeating i16 quad <0, 0, 0, 0x3FC7>; 0x3FC7000000000000 is the double 0.1796875 bit pattern
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t1:
+define i16 @mvni_modimm_t1() nounwind { ; add constant expected as one "mvni .4s, #0x1"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535> ; repeating i16 pair <0xFFFE, 0xFFFF>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t2:
+define i16 @mvni_modimm_t2() nounwind { ; add constant expected as one "mvni .4s, #0x1, lsl #8"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, lsl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535> ; repeating i16 pair <0xFEFF, 0xFFFF>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t3:
+define i16 @mvni_modimm_t3() nounwind { ; add constant expected as one "mvni .4s, #0x1, lsl #16"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, lsl #16
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534> ; repeating i16 pair <0xFFFF, 0xFFFE>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t4:
+define i16 @mvni_modimm_t4() nounwind { ; add constant expected as one "mvni .4s, #0x1, lsl #24"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, lsl #24
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279> ; repeating i16 pair <0xFFFF, 0xFEFF>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t5:
+define i16 @mvni_modimm_t5() nounwind { ; add constant expected as one "mvni .8h, #0x1"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].8h, #0x1
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534> ; every i16 lane is 0xFFFE
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t6:
+define i16 @mvni_modimm_t6() nounwind { ; add constant expected as one "mvni .8h, #0x1, lsl #8"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].8h, #0x1, lsl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279> ; every i16 lane is 0xFEFF
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t7:
+define i16 @mvni_modimm_t7() nounwind { ; add constant expected as one "mvni .4s, #0x1, msl #8"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, msl #8
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 65024, i16 65535, i16 65024, i16 65535, i16 65024, i16 65535, i16 65024, i16 65535> ; repeating i16 pair <0xFE00, 0xFFFF>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: mvni_modimm_t8:
+define i16 @mvni_modimm_t8() nounwind { ; add constant expected as one "mvni .4s, #0x1, msl #16"
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    mvni	   v[[REG2:[0-9]+]].4s, #0x1, msl #16
+  ; CHECK-NEXT:    add	   v[[REG1]].8h, v[[REG1]].8h, v[[REG2]].8h
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = add <8 x i16> %in, <i16 0, i16 65534, i16 0, i16 65534, i16 0, i16 65534, i16 0, i16 65534> ; repeating i16 pair <0x0000, 0xFFFE>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t1:
+define i16 @bic_modimm_t1() nounwind { ; and-mask expected as an in-place "bic .4s, #0x1" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG1]].4s, #0x1
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = and <8 x i16> %in, <i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535> ; repeating i16 pair <0xFFFE, 0xFFFF>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t2:
+define i16 @bic_modimm_t2() nounwind { ; and-mask expected as an in-place "bic .4s, #0x1, lsl #8" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG1]].4s, #0x1, lsl #8
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = and <8 x i16> %in, <i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535> ; repeating i16 pair <0xFEFF, 0xFFFF>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t3:
+define i16 @bic_modimm_t3() nounwind { ; and-mask expected as an in-place "bic .4s, #0x1, lsl #16" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG1]].4s, #0x1, lsl #16
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = and <8 x i16> %in, <i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534, i16 65535, i16 65534> ; repeating i16 pair <0xFFFF, 0xFFFE>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t4:
+define i16 @bic_modimm_t4() nounwind { ; and-mask expected as an in-place "bic .4s, #0x1, lsl #24" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG1]].4s, #0x1, lsl #24
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = and <8 x i16> %in, <i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279, i16 65535, i16 65279> ; repeating i16 pair <0xFFFF, 0xFEFF>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t5:
+define i16 @bic_modimm_t5() nounwind { ; and-mask expected as an in-place "bic .8h, #0x1" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG1]].8h, #0x1
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = and <8 x i16> %in, <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534> ; every i16 lane is 0xFFFE
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: bic_modimm_t6:
+define i16 @bic_modimm_t6() nounwind { ; and-mask expected as an in-place "bic .8h, #0x1, lsl #8" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    bic	   v[[REG1]].8h, #0x1, lsl #8
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = and <8 x i16> %in, <i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279, i16 65279> ; every i16 lane is 0xFEFF
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t1:
+define i16 @orr_modimm_t1() nounwind { ; or-mask expected as an in-place "orr .4s, #0x1" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG1]].4s, #0x1
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = or <8 x i16> %in, <i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0> ; repeating i16 pair <0x0001, 0x0000>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t2:
+define i16 @orr_modimm_t2() nounwind { ; or-mask expected as an in-place "orr .4s, #0x1, lsl #8" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr     v[[REG1]].4s, #0x1, lsl #8
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = or <8 x i16> %in, <i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0> ; repeating i16 pair <0x0100, 0x0000>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t3:
+define i16 @orr_modimm_t3() nounwind { ; or-mask expected as an in-place "orr .4s, #0x1, lsl #16" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG1]].4s, #0x1, lsl #16
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = or <8 x i16> %in, <i16 0, i16 1, i16 0, i16 1, i16 0, i16 1, i16 0, i16 1> ; repeating i16 pair <0x0000, 0x0001>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t4:
+define i16 @orr_modimm_t4() nounwind { ; or-mask expected as an in-place "orr .4s, #0x1, lsl #24" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG1]].4s, #0x1, lsl #24
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = or <8 x i16> %in, <i16 0, i16 256, i16 0, i16 256, i16 0, i16 256, i16 0, i16 256> ; repeating i16 pair <0x0000, 0x0100>
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t5:
+define i16 @orr_modimm_t5() nounwind { ; or-mask expected as an in-place "orr .8h, #0x1" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG1]].8h, #0x1
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = or <8 x i16> %in, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> ; every i16 lane is 0x0001
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+; CHECK-LABEL: orr_modimm_t6:
+define i16 @orr_modimm_t6() nounwind { ; or-mask expected as an in-place "orr .8h, #0x1, lsl #8" on the loaded register
+  ; CHECK:         ld1     { v[[REG1:[0-9]+]].8h }, [x{{[0-9]+}}]
+  ; CHECK-NEXT:    orr	   v[[REG1]].8h, #0x1, lsl #8
+  ; CHECK-NEXT:    umov	   w{{[0-9]+}}, v[[REG1]].h[0]
+  %in = load <8 x i16>* @vec_v8i16 ; operand comes from the global <8 x i16>
+  %rv = or <8 x i16> %in, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256> ; every i16 lane is 0x0100
+  %el = extractelement <8 x i16> %rv, i32 0 ; lane 0 is the returned value
+  ret i16 %el
+}
+
+declare i8 @f_v8i8(<8 x i8> %arg)
+declare i16 @f_v4i16(<4 x i16> %arg)
+declare i32 @f_v2i32(<2 x i32> %arg)
+declare i64 @f_v1i64(<1 x i64> %arg)
+declare i8 @f_v16i8(<16 x i8> %arg)
+declare i16 @f_v8i16(<8 x i16> %arg)
+declare i32 @f_v4i32(<4 x i32> %arg)
+declare i64 @f_v2i64(<2 x i64> %arg)
+
+; CHECK-LABEL: modimm_t1_call:
+define void @modimm_t1_call() { ; passes "movi #imm" constants of every 64/128-bit vector type to the f_* callees
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 8, i8 0, i8 0, i8 0, i8 8, i8 0, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 7, i16 0, i16 7, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 6, i32 6>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 21474836485>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 4, i16 0, i16 4, i16 0, i16 4, i16 0, i16 4, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2
+  ; CHECK-NEXT:    ext     v{{[0-9]+}}.16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 8589934594, i64 8589934594>) ; no rev64 is expected here, so the ext must read the movi result directly
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t2_call:
+define void @modimm_t2_call() { ; passes "movi #imm, lsl #8" constants of every 64/128-bit vector type to the f_* callees
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 8, i8 0, i8 0, i8 0, i8 8, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 1792, i16 0, i16 1792, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 1536, i32 1536>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, lsl #8
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 5497558140160>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 768, i32 768, i32 768, i32 768>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, lsl #8
+  ; CHECK-NEXT:    ext     v{{[0-9]+}}.16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 2199023256064, i64 2199023256064>) ; no rev64 is expected here, so the ext must read the movi result directly
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t3_call:
+define void @modimm_t3_call() { ; passes "movi #imm, lsl #16" constants of every 64/128-bit vector type to the f_* callees
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, lsl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 8, i8 0, i8 0, i8 0, i8 8, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, lsl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 0, i16 7, i16 0, i16 7>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, lsl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 393216, i32 393216>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, lsl #16
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 1407374883880960>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, lsl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, lsl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 4, i16 0, i16 4, i16 0, i16 4, i16 0, i16 4>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, lsl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 196608, i32 196608, i32 196608, i32 196608>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, lsl #16
+  ; CHECK-NEXT:    ext     v{{[0-9]+}}.16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 562949953552384, i64 562949953552384>) ; no rev64 is expected here, so the ext must read the movi result directly
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t4_call:
+define void @modimm_t4_call() { ; passes "movi #imm, lsl #24" constants of every 64/128-bit vector type to the f_* callees
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, lsl #24
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 0, i8 8, i8 0, i8 0, i8 0, i8 8>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, lsl #24
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 0, i16 1792, i16 0, i16 1792>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, lsl #24
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 100663296, i32 100663296>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, lsl #24
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 360287970273525760>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, lsl #24
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0, i8 5>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, lsl #24
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024, i16 0, i16 1024>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, lsl #24
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 50331648, i32 50331648, i32 50331648, i32 50331648>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, lsl #24
+  ; CHECK-NEXT:    ext     v{{[0-9]+}}.16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 144115188109410304, i64 144115188109410304>) ; no rev64 is expected here, so the ext must read the movi result directly
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t5_call:
+define void @modimm_t5_call() { ; passes 16-bit-element "movi #imm" constants of every 64/128-bit vector type to the f_* callees
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 8, i8 0, i8 8, i8 0, i8 8, i8 0, i8 8, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x7
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 7, i16 7, i16 7, i16 7>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x6
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 393222, i32 393222>)
+  ; CHECK:         movi    v{{[0-9]+}}.4h, #0x5
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 1407396358717445>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x5
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x4
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x3
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 196611, i32 196611, i32 196611, i32 196611>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].8h, #0x2
+  ; CHECK-NEXT:    ext     v{{[0-9]+}}.16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 562958543486978, i64 562958543486978>) ; no rev64 is expected here, so the ext must read the movi result directly
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t6_call:
+define void @modimm_t6_call() { ; passes 16-bit-element "movi #imm, lsl #8" constants of every 64/128-bit vector type to the f_* callees
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x8, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 8, i8 0, i8 8, i8 0, i8 8, i8 0, i8 8>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x7, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 1792, i16 1792, i16 1792, i16 1792>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4h, #0x6, lsl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 100664832, i32 100664832>)
+  ; CHECK:         movi    v{{[0-9]+}}.4h, #0x5, lsl #8
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 360293467831665920>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x5, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5, i8 0, i8 5>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x4, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024, i16 1024>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8h, #0x3, lsl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 50332416, i32 50332416, i32 50332416, i32 50332416>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].8h, #0x2, lsl #8
+  ; CHECK-NEXT:    ext     v{{[0-9]+}}.16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 144117387132666368, i64 144117387132666368>) ; no rev64 is expected here, so the ext must read the movi result directly
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t7_call:
+define void @modimm_t7_call() { ; passes "movi #imm, msl #8" constants of every 64/128-bit vector type to the f_* callees
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, msl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 255, i8 8, i8 0, i8 0, i8 255, i8 8, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, msl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 2047, i16 0, i16 2047, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, msl #8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 1791, i32 1791>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, msl #8
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 6592774800895>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, msl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0, i8 255, i8 5, i8 0, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, msl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 1279, i16 0, i16 1279, i16 0, i16 1279, i16 0, i16 1279, i16 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, msl #8
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 1023, i32 1023, i32 1023, i32 1023>)
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, msl #8
+  ; CHECK-NEXT:    ext     v{{[0-9]+}}.16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 3294239916799, i64 3294239916799>) ; no rev64 is expected here, so the ext must read the movi result directly
+
+  ret void
+}
+
+; Each call below passes a constant vector. The expectations require it to be
+; materialised by one "movi ..., msl #16", followed by a rev64 (and an ext for
+; 128-bit vectors) directly before the call.
+; CHECK-LABEL: modimm_t8_call:
+define void @modimm_t8_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x8, msl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 255, i8 255, i8 8, i8 0, i8 255, i8 255, i8 8, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x7, msl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 65535, i16 7, i16 65535, i16 7>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2s, #0x6, msl #16
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 458751, i32 458751>)
+  ; CHECK:         movi    v{{[0-9]+}}.2s, #0x5, msl #16
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 1688845565689855>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x5, msl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0, i8 255, i8 255, i8 5, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x4, msl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 65535, i16 4, i16 65535, i16 4, i16 65535, i16 4, i16 65535, i16 4>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].4s, #0x3, msl #16
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 262143, i32 262143, i32 262143, i32 262143>)
+  ; The v2i64 case has no rev64, so the ext operands are tied to the register
+  ; captured from the movi on the line before it (not to a capture left over
+  ; from the v4i32 group above).
+  ; CHECK:         movi    v[[REG:[0-9]+]].4s, #0x2, msl #16
+  ; CHECK-NEXT:    ext     v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 844420635361279, i64 844420635361279>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t9_call:
+define void @modimm_t9_call() {
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8b, #0x8
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8b, #0x7
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 1799, i16 1799, i16 1799, i16 1799>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].8b, #0x6
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 101058054, i32 101058054>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].16b, #0x5
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].16b, #0x4
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].16b, #0x3
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 50529027, i32 50529027, i32 50529027, i32 50529027>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t10_call:
+define void @modimm_t10_call() {
+  ; CHECK:         movi    d[[REG1:[0-9]+]], #0x0000ff000000ff
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0>)
+  ; CHECK:         movi    d[[REG1:[0-9]+]], #0x00ffff0000ffff
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 -1, i16 0, i16 -1, i16 0>)
+  ; CHECK:         movi    d[[REG1:[0-9]+]], #0xffffffffffffffff
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 -1, i32 -1>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2d, #0xffffff00ffffff
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2d, #0xffffffffffff0000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 -1, i16 -1, i16 -1, i16 0, i16 -1, i16 -1, i16 -1>)
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2d, #0xffffffff00000000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 0, i32 -1, i32 0, i32 -1>)
+
+  ret void
+}
+
+; Each call below passes a constant vector whose bit pattern the expectations
+; require to be materialised by one vector "fmov ..., #imm", followed by a
+; rev64 (and an ext for 128-bit vectors) directly before the call.
+; CHECK-LABEL: modimm_t11_call:
+define void @modimm_t11_call() {
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2s, #4.00000000
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.8b, v[[REG1]].8b
+  ; CHECK-NEXT:    bl      f_v8i8
+  call i8 @f_v8i8(<8 x i8> <i8 0, i8 0, i8 128, i8 64, i8 0, i8 0, i8 128, i8 64>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2s, #3.75000000
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
+  ; CHECK-NEXT:    bl      f_v4i16
+  call i16 @f_v4i16(<4 x i16> <i16 0, i16 16496, i16 0, i16 16496>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2s, #3.50000000
+  ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
+  ; CHECK-NEXT:    bl      f_v2i32
+  call i32 @f_v2i32(<2 x i32> <i32 1080033280, i32 1080033280>)
+  ; CHECK:         fmov    v{{[0-9]+}}.2s, #0.39062500
+  ; CHECK-NEXT:    bl      f_v1i64
+  call i64 @f_v1i64(<1 x i64> <i64 4523865826746957824>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].4s, #3.25000000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 80, i8 64, i8 0, i8 0, i8 80, i8 64, i8 0, i8 0, i8 80, i8 64, i8 0, i8 0, i8 80, i8 64>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].4s, #3.00000000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448, i16 0, i16 16448>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].4s, #2.75000000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 1076887552, i32 1076887552, i32 1076887552, i32 1076887552>)
+  ; The v2i64 case has no rev64, so the ext operands are tied to the register
+  ; captured from the fmov on the line before it (not to a capture left over
+  ; from the v4i32 group above). The immediate is spelled with the same eight
+  ; fractional digits as the other groups.
+  ; CHECK:         fmov    v[[REG:[0-9]+]].4s, #2.50000000
+  ; CHECK-NEXT:    ext     v[[REG]].16b, v[[REG]].16b, v[[REG]].16b, #8
+  ; CHECK-NEXT:    bl      f_v2i64
+  call i64 @f_v2i64(<2 x i64> <i64 4620693218757967872, i64 4620693218757967872>)
+
+  ret void
+}
+
+; CHECK-LABEL: modimm_t12_call:
+define void @modimm_t12_call() {
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2d, #0.18750000
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].16b, v[[REG1]].16b
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v16i8
+  call i8 @f_v16i8(<16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 200, i8 63, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 200, i8 63>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2d, #0.17968750
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].8h, v[[REG1]].8h
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v8i16
+  call i16 @f_v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 16327, i16 0, i16 0, i16 0, i16 16327>)
+  ; CHECK:         fmov    v[[REG1:[0-9]+]].2d, #0.17187500
+  ; CHECK-NEXT:    rev64   v[[REG2:[0-9]+]].4s, v[[REG1]].4s
+  ; CHECK-NEXT:    ext     v[[REG2]].16b, v[[REG2]].16b, v[[REG2]].16b, #8
+  ; CHECK-NEXT:    bl      f_v4i32
+  call i32 @f_v4i32(<4 x i32> <i32 0, i32 1069940736, i32 0, i32 1069940736>)
+
+  ret void
+}

diff --git a/test/CodeGen/AArch64/aarch64-gep-opt.ll b/test/CodeGen/AArch64/aarch64-gep-opt.ll
new file mode 100644
index 0000000..811eed9
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-gep-opt.ll

@@ -0,0 +1,163 @@
+; RUN: llc -O3 -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=cyclone < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-NoAA <%t %s
+; RUN: llc -O3 -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-UseAA <%t %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnueabi"
+
+; Following test cases test enabling SeparateConstOffsetFromGEP pass in AArch64
+; backend. If useAA() returns true, it will lower a GEP with multiple indices
+; into GEPs with a single index, otherwise it will lower it into a
+; "ptrtoint+arithmetics+inttoptr" form.
+
+%struct = type { i32, i32, i32, i32, [20 x i32] }
+
+; Check that when two complex GEPs are used in two basic blocks, LLVM can
+; eliminate the common subexpression for the second use.
+define void @test_GEP_CSE([240 x %struct]* %string, i32* %adj, i32 %lib, i64 %idxprom) {
+  %liberties = getelementptr [240 x %struct]* %string, i64 1, i64 %idxprom, i32 3
+  %1 = load i32* %liberties, align 4
+  %cmp = icmp eq i32 %1, %lib
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %0
+  %origin = getelementptr [240 x %struct]* %string, i64 1, i64 %idxprom, i32 2
+  %2 = load i32* %origin, align 4
+  store i32 %2, i32* %adj, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %0
+  ret void
+}
+
+; CHECK-LABEL: test_GEP_CSE:
+; CHECK: madd
+; CHECK: ldr
+; CHECK-NOT: madd
+; CHECK:ldr
+
+; CHECK-NoAA-LABEL: @test_GEP_CSE(
+; CHECK-NoAA: [[PTR0:%[a-zA-Z0-9]+]] = ptrtoint [240 x %struct]* %string to i64
+; CHECK-NoAA: [[PTR1:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
+; CHECK-NoAA: [[PTR2:%[a-zA-Z0-9]+]] = add i64 [[PTR0]], [[PTR1]]
+; CHECK-NoAA: add i64 [[PTR2]], 23052
+; CHECK-NoAA: inttoptr
+; CHECK-NoAA: if.then:
+; CHECK-NoAA-NOT: ptrtoint
+; CHECK-NoAA-NOT: mul
+; CHECK-NoAA: add i64 [[PTR2]], 23048
+; CHECK-NoAA: inttoptr
+
+; CHECK-UseAA-LABEL: @test_GEP_CSE(
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = bitcast [240 x %struct]* %string to i8*
+; CHECK-UseAA: [[IDX:%[a-zA-Z0-9]+]] = mul i64 %idxprom, 96
+; CHECK-UseAA: [[PTR1:%[a-zA-Z0-9]+]] = getelementptr i8* [[PTR0]], i64 [[IDX]]
+; CHECK-UseAA: getelementptr i8* [[PTR1]], i64 23052
+; CHECK-UseAA: bitcast
+; CHECK-UseAA: if.then:
+; CHECK-UseAA: getelementptr i8* [[PTR1]], i64 23048
+; CHECK-UseAA: bitcast
+
+%class.my = type { i32, [128 x i32], i32, [256 x %struct.pt]}
+%struct.pt = type { %struct.point*, i32, i32 }
+%struct.point = type { i32, i32 }
+
+; Check that when a GEP is used across two basic blocks, LLVM can sink the
+; address calculation and code gen can generate a better addressing mode for
+; the second use.
+define void @test_GEP_across_BB(%class.my* %this, i64 %idx) {
+  %1 = getelementptr %class.my* %this, i64 0, i32 3, i64 %idx, i32 1
+  %2 = load i32* %1, align 4
+  %3 = getelementptr %class.my* %this, i64 0, i32 3, i64 %idx, i32 2
+  %4 = load i32* %3, align 4
+  %5 = icmp eq i32 %2, %4
+  br i1 %5, label %if.true, label %exit
+
+if.true:
+  %6 = shl i32 %4, 1
+  store i32 %6, i32* %3, align 4
+  br label %exit
+
+exit:
+  %7 = add nsw i32 %4, 1
+  store i32 %7, i32* %1, align 4
+  ret void
+}
+; CHECK-LABEL: test_GEP_across_BB:
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK-NOT: add
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #532]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #528]
+
+; CHECK-NoAA-LABEL: test_GEP_across_BB(
+; CHECK-NoAA: add i64 [[TMP:%[a-zA-Z0-9]+]], 528
+; CHECK-NoAA: add i64 [[TMP]], 532
+; CHECK-NoAA: if.true:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 532
+; CHECK-NoAA: exit:
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 528
+
+; CHECK-UseAA-LABEL: test_GEP_across_BB(
+; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = getelementptr
+; CHECK-UseAA: getelementptr i8* [[PTR0]], i64 528
+; CHECK-UseAA: getelementptr i8* [[PTR0]], i64 532
+; CHECK-UseAA: if.true:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 532
+; CHECK-UseAA: exit:
+; CHECK-UseAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8* [[PTR0]], i64 528
+
+%struct.S = type { float, double }
+@struct_array = global [1024 x %struct.S] zeroinitializer, align 16
+
+; The following two test cases check we can extract constant from indices of
+; struct type.
+; The constant offsets are from indices "i64 %idxprom" and "i32 1". As the
+; alloc size of %struct.S is 16, and "i32 1" is the 2nd element whose field
+; offset is 8, the total constant offset is (5 * 16 + 8) = 88.
+define double* @test-struct_1(i32 %i) {
+entry:
+  %add = add nsw i32 %i, 5
+  %idxprom = sext i32 %add to i64
+  %p = getelementptr [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1
+  ret double* %p
+}
+; CHECK-NoAA-LABEL: @test-struct_1(
+; CHECK-NoAA-NOT: getelementptr
+; CHECK-NoAA: add i64 %{{[a-zA-Z0-9]+}}, 88
+
+; CHECK-UseAA-LABEL: @test-struct_1(
+; CHECK-UseAA: getelementptr i8* %{{[a-zA-Z0-9]+}}, i64 88
+
+%struct3 = type { i64, i32 }
+%struct2 = type { %struct3, i32 }
+%struct1 = type { i64, %struct2 }
+%struct0 = type { i32, i32, i64*, [100 x %struct1] }
+
+; The constant offsets are from indices "i32 3", "i64 %arrayidx" and "i32 1".
+; "i32 3" is the 4th element whose field offset is 16. The alloc size of
+; %struct1 is 32. "i32 1" is the 2nd element whose field offset is 8. So the
+; total constant offset is 16 + (-2 * 32) + 8 = -40.
+define %struct2* @test-struct_2(%struct0* %ptr, i64 %idx) {
+entry:
+  %arrayidx = add nsw i64 %idx, -2
+  %ptr2 = getelementptr %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1
+  ret %struct2* %ptr2
+}
+; CHECK-NoAA-LABEL: @test-struct_2(
+; CHECK-NoAA-NOT: = getelementptr
+; CHECK-NoAA: add i64 %{{[a-zA-Z0-9]+}}, -40
+
+; CHECK-UseAA-LABEL: @test-struct_2(
+; CHECK-UseAA: getelementptr i8* %{{[a-zA-Z0-9]+}}, i64 -40
+
+; Test that when an index is the sum of two constants, the
+; SeparateConstOffsetFromGEP pass does not generate an incorrect result.
+define void @test_const_add([3 x i32]* %in) {
+  %inc = add nsw i32 2, 1
+  %idxprom = sext i32 %inc to i64
+  %arrayidx = getelementptr [3 x i32]* %in, i64 %idxprom, i64 2
+  store i32 0, i32* %arrayidx, align 4
+  ret void
+}
+; CHECK-LABEL: test_const_add:
+; CHECK: str wzr, [x0, #44]

diff --git a/test/CodeGen/AArch64/aarch64-smull.ll b/test/CodeGen/AArch64/aarch64-smull.ll
new file mode 100644
index 0000000..92582d7
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-smull.ll

@@ -0,0 +1,332 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o -| FileCheck %s
+
+define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: smull_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = mul <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: smull_v4i16_v4i32:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = mul <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: smull_v2i32_v2i64:
+; CHECK:  smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = mul <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+; CHECK-LABEL: umull_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = mul <8 x i16> %tmp3, %tmp4
+  ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+; CHECK-LABEL: umull_v4i16_v4i32:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = mul <4 x i32> %tmp3, %tmp4
+  ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+; CHECK-LABEL: umull_v2i32_v2i64:
+; CHECK:  umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = mul <2 x i64> %tmp3, %tmp4
+  ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlal_v8i8_v8i16:
+; CHECK:  smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = add <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlal_v4i16_v4i32:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = add <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlal_v2i32_v2i64:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = add <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlal_v8i8_v8i16:
+; CHECK:  umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = add <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlal_v4i16_v4i32:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = add <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlal_v2i32_v2i64:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = add <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: smlsl_v8i8_v8i16:
+; CHECK:  smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = sub <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: smlsl_v4i16_v4i32:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = sub <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: smlsl_v2i32_v2i64:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = sub <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+; CHECK-LABEL: umlsl_v8i8_v8i16:
+; CHECK:  umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp1 = load <8 x i16>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = load <8 x i8>* %C
+  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
+  %tmp6 = mul <8 x i16> %tmp4, %tmp5
+  %tmp7 = sub <8 x i16> %tmp1, %tmp6
+  ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+; CHECK-LABEL: umlsl_v4i16_v4i32:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp1 = load <4 x i32>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = load <4 x i16>* %C
+  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
+  %tmp6 = mul <4 x i32> %tmp4, %tmp5
+  %tmp7 = sub <4 x i32> %tmp1, %tmp6
+  ret <4 x i32> %tmp7
+}
+
+define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+; CHECK-LABEL: umlsl_v2i32_v2i64:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp1 = load <2 x i64>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = load <2 x i32>* %C
+  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
+  %tmp6 = mul <2 x i64> %tmp4, %tmp5
+  %tmp7 = sub <2 x i64> %tmp1, %tmp6
+  ret <2 x i64> %tmp7
+}
+
+; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
+define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp3 = sext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
+  ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use SMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %tmp3 = sext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK:  smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp3 = sext <4 x i16> %arg to <4 x i32>
+  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; The splat constant -1234 is representable as a sign-extended i32, so the
+; widening multiply is expected to be selected as smull.
+; CHECK-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp3 = sext <2 x i32> %arg to <2 x i64>
+  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
+  ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+  %tmp3 = zext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
+  ret <8 x i16> %tmp4
+}
+
+define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
+; Do not use UMULL if the BUILD_VECTOR element values are too big.
+; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK: movz
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %tmp3 = zext <8 x i8> %arg to <8 x i16>
+  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK:  umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+  %tmp3 = zext <4 x i16> %arg to <4 x i32>
+  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
+; CHECK-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  %tmp3 = zext <2 x i32> %arg to <2 x i64>
+  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
+  ret <2 x i64> %tmp4
+}
+
+define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
+; If one operand has a zero-extend and the other a sign-extend, smull
+; cannot be used.
+; CHECK-LABEL: smullWithInconsistentExtensions:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+  %1 = sext <8 x i8> %vec to <8 x i16>
+  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %3 = extractelement <8 x i16> %2, i32 0
+  ret i16 %3
+}
+
+define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK-LABEL: distribute:
+; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
+; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
+  %0 = trunc i32 %mul to i8
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
+  %4 = bitcast <16 x i8> %3 to <2 x double>
+  %5 = extractelement <2 x double> %4, i32 1
+  %6 = bitcast double %5 to <8 x i8>
+  %7 = zext <8 x i8> %6 to <8 x i16>
+  %8 = zext <8 x i8> %2 to <8 x i16>
+  %9 = extractelement <2 x double> %4, i32 0
+  %10 = bitcast double %9 to <8 x i8>
+  %11 = zext <8 x i8> %10 to <8 x i16>
+  %12 = add <8 x i16> %7, %11
+  %13 = mul <8 x i16> %12, %8
+  %14 = bitcast i16* %dst to i8*
+  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
+  ret void
+}
+
+declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+

diff --git a/test/CodeGen/AArch64/aarch64-wide-shuffle.ll b/test/CodeGen/AArch64/aarch64-wide-shuffle.ll
new file mode 100644
index 0000000..d06df7a
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-wide-shuffle.ll

@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define <4 x i16> @f(<4 x i32> %vqdmlal_v3.i, <8 x i16> %x5) {
+entry:
+  ; Check that we don't just dup the input vector. The code emitted is ext, dup, ext, ext
+  ; but only match the last three instructions as the first two could be combined to
+  ; a dup2 at some stage.
+  ; CHECK: dup
+  ; CHECK: ext
+  ; CHECK: ext
+  %x4 = extractelement <4 x i32> %vqdmlal_v3.i, i32 2
+  %vgetq_lane = trunc i32 %x4 to i16
+  %vecinit.i = insertelement <4 x i16> undef, i16 %vgetq_lane, i32 0
+  %vecinit2.i = insertelement <4 x i16> %vecinit.i, i16 %vgetq_lane, i32 2
+  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vgetq_lane, i32 3
+  %vgetq_lane261 = extractelement <8 x i16> %x5, i32 0
+  %vset_lane267 = insertelement <4 x i16> %vecinit3.i, i16 %vgetq_lane261, i32 1
+  ret <4 x i16> %vset_lane267
+}

diff --git a/test/CodeGen/AArch64/aarch64_f16_be.ll b/test/CodeGen/AArch64/aarch64_f16_be.ll
new file mode 100644
index 0000000..7504439
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64_f16_be.ll

@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=aarch64-linux-gnuabi -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnuabi -O0 < %s | FileCheck %s --check-prefix=CHECK-BE
+
+define void @test_bitcast_v8f16_to_v4f32(<8 x half> %a) {
+; CHECK-LABEL: test_bitcast_v8f16_to_v4f32:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v8f16_to_v4f32:
+; CHECK-BE: st1
+
+  %x = alloca <4 x float>, align 16
+  %y = bitcast <8 x half> %a to <4 x float>
+  store <4 x float> %y, <4 x float>* %x, align 16
+  ret void
+}
+
+define void @test_bitcast_v8f16_to_v2f64(<8 x half> %a) {
+; CHECK-LABEL: test_bitcast_v8f16_to_v2f64:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v8f16_to_v2f64:
+; CHECK-BE: st1
+
+  %x = alloca <2 x double>, align 16
+  %y = bitcast <8 x half> %a to <2 x double>
+  store <2 x double> %y, <2 x double>* %x, align 16
+  ret void
+}
+
+define void @test_bitcast_v8f16_to_fp128(<8 x half> %a) {
+; CHECK-LABEL: test_bitcast_v8f16_to_fp128:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v8f16_to_fp128:
+; CHECK-BE: st1
+
+  %x = alloca fp128, align 16
+  %y = bitcast <8 x half> %a to fp128
+  store fp128 %y, fp128* %x, align 16
+  ret void
+}
+
+define void @test_bitcast_v4f16_to_v2f32(<4 x half> %a) {
+; CHECK-LABEL: test_bitcast_v4f16_to_v2f32:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v4f16_to_v2f32:
+; CHECK-BE: st1
+
+  %x = alloca <2 x float>, align 8
+  %y = bitcast <4 x half> %a to <2 x float>
+  store <2 x float> %y, <2 x float>* %x, align 8
+  ret void
+}
+
+define void @test_bitcast_v4f16_to_v1f64(<4 x half> %a) {
+; CHECK-LABEL: test_bitcast_v4f16_to_v1f64:
+; CHECK-NOT: st1
+
+; CHECK-BE-LABEL: test_bitcast_v4f16_to_v1f64:
+; CHECK-BE: st1
+
+  %x = alloca <1 x double>, align 8
+  %y = bitcast <4 x half> %a to <1 x double>
+  store <1 x double> %y, <1 x double>* %x, align 8
+  ret void
+}

diff --git a/test/CodeGen/AArch64/aarch64_tree_tests.ll b/test/CodeGen/AArch64/aarch64_tree_tests.ll
new file mode 100644
index 0000000..08e506a
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64_tree_tests.ll

@@ -0,0 +1,42 @@
+; RUN: llc < %s | FileCheck %s 
+
+; ModuleID = 'aarch64_tree_tests.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnu"
+
+; CHECK-LABLE: @aarch64_tree_tests_and
+; CHECK: .hword	32768                   
+; CHECK: .hword	32767                   
+; CHECK: .hword	4664                    
+; CHECK: .hword	32767                   
+; CHECK: .hword	32768                   
+; CHECK: .hword	32768                   
+; CHECK: .hword	0                       
+; CHECK: .hword	0                      
+
+; Function Attrs: nounwind readnone
+define <8 x i16> @aarch64_tree_tests_and(<8 x i16> %a) {
+entry:
+  %and = and <8 x i16> <i16 0, i16 undef, i16 undef, i16 0, i16 0, i16 undef, i16 undef, i16 0>, %a
+  %ret = add <8 x i16> %and, <i16 -32768, i16 32767, i16 4664, i16 32767, i16 -32768, i16 -32768, i16 0, i16 0>
+  ret <8 x i16> %ret
+}
+
+; CHECK-LABLE: @aarch64_tree_tests_or
+; CHECK: .hword	32768                 
+; CHECK: .hword	32766
+; CHECK: .hword	4664     
+; CHECK: .hword	32766                
+; CHECK: .hword	32768 
+; CHECK: .hword	32768
+; CHECK: .hword	65535            
+; CHECK: .hword	65535
+
+; Function Attrs: nounwind readnone
+define <8 x i16> @aarch64_tree_tests_or(<8 x i16> %a) {
+entry:
+  %or = or <8 x i16> <i16 -1, i16 undef, i16 undef, i16 -1, i16 -1, i16 undef, i16 undef, i16 -1>, %a
+  %ret = add <8 x i16> %or, <i16 -32767, i16 32767, i16 4665, i16 32767, i16 -32767, i16 -32767, i16 0, i16 0>
+  ret <8 x i16> %ret
+}
+

diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll
index 892573b..0488ee2 100644
--- a/test/CodeGen/AArch64/adc.ll
+++ b/test/CodeGen/AArch64/adc.ll

@@ -1,5 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
 
 define i128 @test_simple(i128 %a, i128 %b, i128 %c) {
 ; CHECK-LABEL: test_simple:

diff --git a/test/CodeGen/AArch64/analyzecmp.ll b/test/CodeGen/AArch64/analyzecmp.ll
new file mode 100644
index 0000000..8962505
--- /dev/null
+++ b/test/CodeGen/AArch64/analyzecmp.ll

@@ -0,0 +1,32 @@
+; RUN: llc -O3 -mcpu=cortex-a57 < %s | FileCheck %s 
+
+; CHECK-LABEL: @test
+; CHECK: tst [[CMP:x[0-9]+]], #0x8000000000000000
+; CHECK: csel [[R0:x[0-9]+]], [[S0:x[0-9]+]], [[S1:x[0-9]+]], eq
+; CHECK: csel [[R1:x[0-9]+]], [[S2:x[0-9]+]], [[S3:x[0-9]+]], eq
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnueabi"
+
+define void @test(i64 %a, i64* %ptr1, i64* %ptr2) #0 align 2 {
+entry:
+  %conv = and i64 %a, 4294967295
+  %add = add nsw i64 %conv, -1
+  %div = sdiv i64 %add, 64
+  %rem = srem i64 %add, 64
+  %cmp = icmp slt i64 %rem, 0
+  br i1 %cmp, label %if.then, label %exit
+
+if.then:                                
+  %add2 = add nsw i64 %rem, 64
+  %add3 = add i64 %div, -1
+  br label %exit
+
+exit:                 
+  %__n = phi i64 [ %add3, %if.then ], [ %div, %entry ]
+  %__n.0 = phi i64 [ %add2, %if.then ], [ %rem, %entry ]
+  store i64 %__n, i64* %ptr1
+  store i64 %__n.0, i64* %ptr2
+  ret void 
+}
+
+

diff --git a/test/CodeGen/AArch64/and-mask-removal.ll b/test/CodeGen/AArch64/and-mask-removal.ll
new file mode 100644
index 0000000..f803b85
--- /dev/null
+++ b/test/CodeGen/AArch64/and-mask-removal.ll

@@ -0,0 +1,269 @@
+; RUN: llc -O0 -fast-isel=false -mtriple=arm64-apple-darwin  < %s  | FileCheck %s
+
+@board = common global [400 x i8] zeroinitializer, align 1
+@next_string = common global i32 0, align 4
+@string_number = common global [400 x i32] zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp
+define void @new_position(i32 %pos) {
+entry:
+  %idxprom = sext i32 %pos to i64
+  %arrayidx = getelementptr inbounds [400 x i8]* @board, i64 0, i64 %idxprom
+  %tmp = load i8* %arrayidx, align 1
+  %.off = add i8 %tmp, -1
+  %switch = icmp ult i8 %.off, 2
+  br i1 %switch, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %tmp1 = load i32* @next_string, align 4
+  %arrayidx8 = getelementptr inbounds [400 x i32]* @string_number, i64 0, i64 %idxprom
+  store i32 %tmp1, i32* %arrayidx8, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+; CHECK-LABEL: new_position
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_0(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 74
+  %1 = icmp ult i8 %0, -20
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_0
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_1(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 246
+  %1 = icmp uge i8 %0, 90
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_1
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_2(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 227
+  %1 = icmp ne i8 %0, 179
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_2
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_3(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 201
+  %1 = icmp eq i8 %0, 154
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_3
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_4(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, -79
+  %1 = icmp ne i8 %0, -40
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_4
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_5(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 133
+  %1 = icmp uge i8 %0, -105
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_5
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_6(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, -58
+  %1 = icmp uge i8 %0, 155
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_6
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test8_7(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 225
+  %1 = icmp ult i8 %0, 124
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_7
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+
+
+define zeroext i1 @test8_8(i8 zeroext %x)  align 2 {
+entry:
+  %0 = add i8 %x, 190
+  %1 = icmp uge i8 %0, 1
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test8_8
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_0(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -46989
+  %1 = icmp ne i16 %0, -41903
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_0
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_2(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, 16882
+  %1 = icmp ule i16 %0, -24837
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_2
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_3(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, 29283
+  %1 = icmp ne i16 %0, 16947
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_3
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_4(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -35551
+  %1 = icmp uge i16 %0, 15677
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_4
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_5(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -25214
+  %1 = icmp ne i16 %0, -1932
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_5
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_6(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -32194
+  %1 = icmp uge i16 %0, -41215
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_6
+; CHECK-NOT: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_7(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, 9272
+  %1 = icmp uge i16 %0, -42916
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_7
+; CHECK: and
+; CHECK: ret
+}
+
+define zeroext i1 @test16_8(i16 zeroext %x)  align 2 {
+entry:
+  %0 = add i16 %x, -63749
+  %1 = icmp ne i16 %0, 6706
+  br i1 %1, label %ret_true, label %ret_false
+ret_false:
+  ret i1 false
+ret_true:
+  ret i1 true
+; CHECK-LABEL: test16_8
+; CHECK-NOT: and
+; CHECK: ret
+}
+

diff --git a/test/CodeGen/AArch64/andandshift.ll b/test/CodeGen/AArch64/andandshift.ll
new file mode 100644
index 0000000..e2c7a09
--- /dev/null
+++ b/test/CodeGen/AArch64/andandshift.ll

@@ -0,0 +1,28 @@
+; RUN: llc -O3 < %s | FileCheck %s 
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64--linux-gnu"
+
+; Function Attrs: nounwind readnone
+define i32 @test1(i8 %a) {
+; CHECK-LABEL: @test1
+; CHECK: ubfx {{w[0-9]+}}, w0, #3, #5
+entry:
+  %conv = zext i8 %a to i32
+  %shr1 = lshr i32 %conv, 3
+  ret i32 %shr1
+}
+
+; Function Attrs: nounwind readnone
+define i32 @test2(i8 %a) {
+; CHECK-LABEL: @test2
+; CHECK: and {{w[0-9]+}}, w0, #0xff
+; CHECK: ubfx {{w[0-9]+}}, w0, #3, #5
+entry:
+  %conv = zext i8 %a to i32
+  %cmp = icmp ugt i8 %a, 47
+  %shr5 = lshr i32 %conv, 3
+  %retval.0 = select i1 %cmp, i32 %shr5, i32 %conv
+  ret i32 %retval.0
+}
+
+

diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
index 2b083d8..e57a8c9 100644
--- a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
+++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll

@@ -11,34 +11,34 @@
   unreachable
 
 if.else295:                                       ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18
+  call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
   store i32 0, i32* %do_tab_convert, align 4, !dbg !19
   unreachable
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.gv = !{!0}
 !llvm.dbg.sp = !{!1, !7, !10, !11, !12}
 
-!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ]
-!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!0 = metadata !{metadata !"0x34\00vsplive\00vsplive\00\00617\001\001", metadata !1, metadata !2, metadata !6, null, null} ; [ DW_TAG_variable ]
+!1 = metadata !{metadata !"0x2e\00drt_vsprintf\00drt_vsprintf\00\00616\000\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !5, i32 0} ; [ DW_TAG_subroutine_type ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!8 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x2e\00putc_mem\00putc_mem\00\0030\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !9, i32 0} ; [ DW_TAG_subroutine_type ]
 !9 = metadata !{null}
-!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x2e\00print_double\00print_double\00\00203\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x2e\00print_number\00print_number\00\0075\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !4, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!12 = metadata !{metadata !"0x2e\00get_flags\00get_flags\00\00508\001\001\000\006\00256\000\000", metadata !20, metadata !2, metadata !8, null, null, null, null, null} ; [ DW_TAG_subprogram ]
 !13 = metadata !{i32 653, i32 5, metadata !14, null}
-!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\00652\0035\002", metadata !20, metadata !15} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0xb\00616\001\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x100\00do_tab_convert\00853\000", metadata !17, metadata !2, metadata !6} ; [ DW_TAG_auto_variable ]
+!17 = metadata !{metadata !"0xb\00850\0012\0033", metadata !20, metadata !14} ; [ DW_TAG_lexical_block ]
 !18 = metadata !{i32 853, i32 11, metadata !17, null}
 !19 = metadata !{i32 853, i32 29, metadata !17, null}
 !20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"}

diff --git a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
index 8f99bc3..a83f164 100644
--- a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
+++ b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll

@@ -12,7 +12,7 @@
 
 for.body:
 ; CHECK: for.body
-; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
 ; CHECK: add x[[REG:[0-9]+]],
 ; CHECK:                      x[[REG]], #1, lsl  #12
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]

diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
index 168e921..7d880f3 100644
--- a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
+++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll

@@ -1,5 +1,5 @@
-; RUN: llc -march=arm64 -O0 < %s | FileCheck %s
-; RUN: llc -march=arm64 -O3 < %s | FileCheck %s
+; RUN: llc -march=arm64 -O0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=arm64 -O3 -verify-machineinstrs < %s | FileCheck %s
 
 @.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1
 @.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1

diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index c4597d5..6266d1c 100644
--- a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll

@@ -1,15 +1,36 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=true | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-NOOPT
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false -disable-adv-copy-opt=false | FileCheck %s -check-prefix=GENERIC -check-prefix=GENERIC-OPT
 
 define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 ; CHECK-LABEL: bar:
 ; CHECK: add.2d	v[[REG:[0-9]+]], v0, v1
 ; CHECK: add	d[[REG3:[0-9]+]], d[[REG]], d1
+; Without advanced copy optimization, we end up with cross-register-bank
+; copies that cannot be coalesced.
+; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; With advanced copy optimization, we end up with just one copy
+; to insert the computed high part into the V register. 
+; CHECK-OPT-NOT: fmov
 ; CHECK: sub	d[[REG2:[0-9]+]], d[[REG]], d1
+; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
+; CHECK-OPT-NOT: fmov
+; CHECK: ins.d v0[1], [[COPY_REG2]]
+; CHECK-NEXT: ret
+;
 ; GENERIC-LABEL: bar:
 ; GENERIC: add	v[[REG:[0-9]+]].2d, v0.2d, v1.2d
 ; GENERIC: add	d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
+; GENERIC-OPT-NOT: fmov
 ; GENERIC: sub	d[[REG2:[0-9]+]], d[[REG]], d1
+; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
+; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
+; GENERIC-OPT-NOT: fmov
+; GENERIC: ins v0.d[1], [[COPY_REG2]]
+; GENERIC-NEXT: ret
   %add = add <2 x i64> %a, %b
   %vgetq_lane = extractelement <2 x i64> %add, i32 0
   %vgetq_lane2 = extractelement <2 x i64> %b, i32 0
@@ -65,3 +86,44 @@
   %retval = bitcast i64 %sub.i to double
   ret double %retval
 }
+define double @and_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: and_su64:
+; CHECK: and.8b v0, v1, v0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: and_su64:
+; GENERIC: and v0.8b, v1.8b, v0.8b
+; GENERIC-NEXT: ret
+  %vecext = extractelement <2 x i64> %a, i32 0
+  %vecext1 = extractelement <2 x i64> %b, i32 0
+  %or.i = and i64 %vecext1, %vecext
+  %retval = bitcast i64 %or.i to double
+  ret double %retval
+}
+
+define double @orr_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: orr_su64:
+; CHECK: orr.8b v0, v1, v0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: orr_su64:
+; GENERIC: orr v0.8b, v1.8b, v0.8b
+; GENERIC-NEXT: ret
+  %vecext = extractelement <2 x i64> %a, i32 0
+  %vecext1 = extractelement <2 x i64> %b, i32 0
+  %or.i = or i64 %vecext1, %vecext
+  %retval = bitcast i64 %or.i to double
+  ret double %retval
+}
+
+define double @xorr_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: xorr_su64:
+; CHECK: eor.8b v0, v1, v0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: xorr_su64:
+; GENERIC: eor v0.8b, v1.8b, v0.8b
+; GENERIC-NEXT: ret
+  %vecext = extractelement <2 x i64> %a, i32 0
+  %vecext1 = extractelement <2 x i64> %b, i32 0
+  %xor.i = xor i64 %vecext1, %vecext
+  %retval = bitcast i64 %xor.i to double
+  ret double %retval
+}

diff --git a/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
index 1b2d543..1bb47fc 100644
--- a/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll
+++ b/test/CodeGen/AArch64/arm64-EXT-undef-mask.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc -O0 -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs < %s | FileCheck %s
 
 ; The following 2 test cases test shufflevector with beginning UNDEF mask.
 define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) {

diff --git a/test/CodeGen/AArch64/arm64-aapcs-be.ll b/test/CodeGen/AArch64/arm64-aapcs-be.ll
new file mode 100644
index 0000000..77e2b0f
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-aapcs-be.ll

@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=aarch64_be-none-eabi -fast-isel=false < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-eabi -fast-isel=true < %s | FileCheck %s
+
+; Check narrow argument passing via stack - callee end
+define i32 @test_narrow_args_callee(i64 %x0, i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, i8 %c, i16 %s) #0 {
+entry:
+  %conv = zext i8 %c to i32
+  %conv1 = sext i16 %s to i32
+  %add = add nsw i32 %conv1, %conv
+; CHECK-LABEL: test_narrow_args_callee:
+; CHECK-DAG: ldrb w{{[0-9]}}, [sp, #7]
+; CHECK-DAG: ldr{{s?}}h w{{[0-9]}}, [sp, #14]
+  ret i32 %add
+}
+
+; Check narrow argument passing via stack - caller end
+define i32 @test_narrow_args_caller() #0 {
+entry:
+  %call = tail call i32 @test_narrow_args_callee(i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i8 8, i16 9)
+; CHECK-LABEL: test_narrow_args_caller:
+; CHECK-DAG: strh w{{[0-9]}}, [sp, #14]
+; CHECK-DAG: strb w{{[0-9]}}, [sp, #7]
+  ret i32 %call
+}
\ No newline at end of file

diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index ccf1371..41c3ad5 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll

@@ -109,3 +109,45 @@
 ; CHECK: ldr {{q[0-9]+}}, [sp]
   ret <2 x double> %varg_stack;
 }
+
+; Check that f16 can be passed and returned (ACLE 2.0 extension)
+define half @test_half(float, half %arg) {
+; CHECK-LABEL: test_half:
+; CHECK: mov v0.16b, v1.16b
+  ret half %arg;
+}
+
+; Check that f16 constants are materialized correctly
+define half @test_half_const() {
+; CHECK-LABEL: test_half_const:
+; CHECK: ldr h0, [x{{[0-9]+}}, :lo12:{{.*}}]
+  ret half 0xH4248
+}
+
+; Check that v4f16 can be passed and returned in registers
+define <4 x half> @test_v4_half_register(float, <4 x half> %arg) {
+; CHECK-LABEL: test_v4_half_register:
+; CHECK: mov v0.16b, v1.16b
+  ret <4 x half> %arg;
+}
+
+; Check that v8f16 can be passed and returned in registers
+define <8 x half> @test_v8_half_register(float, <8 x half> %arg) {
+; CHECK-LABEL: test_v8_half_register:
+; CHECK: mov v0.16b, v1.16b
+  ret <8 x half> %arg;
+}
+
+; Check that v4f16 can be passed and returned on the stack
+define <4 x half> @test_v4_half_stack([8 x <2 x double>], <4 x half> %arg) {
+; CHECK-LABEL: test_v4_half_stack:
+; CHECK: ldr d0, [sp]
+  ret <4 x half> %arg;
+}
+
+; Check that v8f16 can be passed and returned on the stack
+define <8 x half> @test_v8_half_stack([8 x <2 x double>], <8 x half> %arg) {
+; CHECK-LABEL: test_v8_half_stack:
+; CHECK: ldr q0, [sp]
+  ret <8 x half> %arg;
+}

diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index a955029..8a6b64d 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll

@@ -1,7 +1,5 @@
-; RUN: llc < %s -debug -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
-; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
-; REQUIRES: asserts
-target triple = "arm64-apple-darwin"
+; RUN: llc     -mtriple=arm64-apple-darwin -mcpu=cyclone -enable-misched=false < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=arm64-apple-darwin                                     < %s | FileCheck --check-prefix=FAST %s
 
 ; rdar://9932559
 define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline {
@@ -42,7 +40,7 @@
 
 define i32 @i8i16caller() nounwind readnone {
 entry:
-; CHECK: i8i16caller
+; CHECK-LABEL: i8i16caller
 ; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
 ; They are i8, i16, i8 and i8.
 ; CHECK-DAG: strb {{w[0-9]+}}, [sp, #5]
@@ -50,7 +48,7 @@
 ; CHECK-DAG: strh {{w[0-9]+}}, [sp, #2]
 ; CHECK-DAG: strb {{w[0-9]+}}, [sp]
 ; CHECK: bl
-; FAST: i8i16caller
+; FAST-LABEL: i8i16caller
 ; FAST: strb {{w[0-9]+}}, [sp]
 ; FAST: strh {{w[0-9]+}}, [sp, #2]
 ; FAST: strb {{w[0-9]+}}, [sp, #4]
@@ -64,7 +62,7 @@
 ; rdar://12651543
 define double @circle_center([2 x float] %a) nounwind ssp {
   %call = tail call double @ext([2 x float] %a) nounwind
-; CHECK: circle_center
+; CHECK-LABEL: circle_center
 ; CHECK: bl
   ret double %call
 }
@@ -75,10 +73,10 @@
 ; A double argument will be passed on stack, so vecotr should be at sp+16.
 define double @fixed_4i(<4 x i32>* nocapture %in) nounwind {
 entry:
-; CHECK: fixed_4i
+; CHECK-LABEL: fixed_4i
 ; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
-; FAST: fixed_4i
-; FAST: sub sp, sp, #64
+; FAST-LABEL: fixed_4i
+; FAST: sub sp, sp
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
   %0 = load <4 x i32>* %in, align 16
@@ -93,7 +91,7 @@
 define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4,
        double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp {
 entry:
-; CHECK: test1
+; CHECK-LABEL: test1
 ; CHECK: ldr [[REG_1:d[0-9]+]], [sp]
 ; CHECK: scvtf [[REG_2:s[0-9]+]], w0
 ; CHECK: fadd s0, [[REG_2]], s0
@@ -110,7 +108,7 @@
 define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
             i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp {
 entry:
-; CHECK: test2
+; CHECK-LABEL: test2
 ; CHECK: scvtf [[REG_2:s[0-9]+]], w0
 ; CHECK: fadd s0, [[REG_2]], s0
 ; CHECK: ldr [[REG_1:s[0-9]+]], [sp]
@@ -129,9 +127,9 @@
 ; Check alignment on stack for v64, f64, i64, f32, i32.
 define double @test3(<2 x i32>* nocapture %in) nounwind {
 entry:
-; CHECK: test3
+; CHECK-LABEL: test3
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
-; FAST: test3
+; FAST-LABEL: test3
 ; FAST: sub sp, sp, #32
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
@@ -146,7 +144,7 @@
 
 define double @test4(double* nocapture %in) nounwind {
 entry:
-; CHECK: test4
+; CHECK-LABEL: test4
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
 ; CHECK: str [[REG_2:w[0-9]+]], [sp]
 ; CHECK: orr w0, wzr, #0x3
@@ -161,7 +159,7 @@
 
 define i64 @test5(i64* nocapture %in) nounwind {
 entry:
-; CHECK: test5
+; CHECK-LABEL: test5
 ; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16]
 ; CHECK: str [[REG_1:x[0-9]+]], [sp, #8]
 ; CHECK: str [[REG_2:w[0-9]+]], [sp]
@@ -175,7 +173,7 @@
 
 define i32 @test6(float* nocapture %in) nounwind {
 entry:
-; CHECK: test6
+; CHECK-LABEL: test6
 ; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
 ; CHECK: str [[REG_1:s[0-9]+]], [sp, #4]
 ; CHECK: strh [[REG_3:w[0-9]+]], [sp]
@@ -192,7 +190,7 @@
 
 define i32 @test7(i32* nocapture %in) nounwind {
 entry:
-; CHECK: test7
+; CHECK-LABEL: test7
 ; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
 ; CHECK: str [[REG_1:w[0-9]+]], [sp, #4]
 ; CHECK: strh [[REG_3:w[0-9]+]], [sp]
@@ -206,13 +204,13 @@
 
 define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
-; CHECK: test8
+; CHECK-LABEL: test8
 ; CHECK: strb {{w[0-9]+}}, [sp, #3]
 ; CHECK: strb wzr, [sp, #2]
 ; CHECK: strb {{w[0-9]+}}, [sp, #1]
 ; CHECK: strb wzr, [sp]
 ; CHECK: bl
-; FAST: test8
+; FAST-LABEL: test8
 ; FAST: strb {{w[0-9]+}}, [sp]
 ; FAST: strb {{w[0-9]+}}, [sp, #1]
 ; FAST: strb {{w[0-9]+}}, [sp, #2]

diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll
index 44c5a07..deb740e 100644
--- a/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/test/CodeGen/AArch64/arm64-abi_align.ll

@@ -34,7 +34,7 @@
 ; structs with size < 8 bytes, passed via i64 in x1 and x2
 define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 {
 entry:
-; CHECK: f38
+; CHECK-LABEL: f38
 ; CHECK: add w[[A:[0-9]+]], w1, w0
 ; CHECK: add {{w[0-9]+}}, w[[A]], w2
   %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32
@@ -56,7 +56,7 @@
 
 define i32 @caller38() #1 {
 entry:
-; CHECK: caller38
+; CHECK-LABEL: caller38
 ; CHECK: ldr x1,
 ; CHECK: ldr x2,
   %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
@@ -72,7 +72,7 @@
 ; i9 at [sp]
 define i32 @caller38_stack() #1 {
 entry:
-; CHECK: caller38_stack
+; CHECK-LABEL: caller38_stack
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
 ; CHECK: movz w[[C:[0-9]+]], #0x9
 ; CHECK: str w[[C]], [sp]
@@ -87,7 +87,7 @@
 ; passed via i128 in x1 and x3
 define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
 entry:
-; CHECK: f39
+; CHECK-LABEL: f39
 ; CHECK: add w[[A:[0-9]+]], w1, w0
 ; CHECK: add {{w[0-9]+}}, w[[A]], w3
   %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
@@ -109,7 +109,7 @@
 
 define i32 @caller39() #1 {
 entry:
-; CHECK: caller39
+; CHECK-LABEL: caller39
 ; CHECK: ldp x1, x2,
 ; CHECK: ldp x3, x4,
   %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
@@ -125,7 +125,7 @@
 ; passed on stack at [sp+16] and [sp+32]
 define i32 @caller39_stack() #1 {
 entry:
-; CHECK: caller39_stack
+; CHECK-LABEL: caller39_stack
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
 ; CHECK: movz w[[C:[0-9]+]], #0x9
@@ -141,7 +141,7 @@
 ; passed via i128 in x1 and x3
 define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 {
 entry:
-; CHECK: f40
+; CHECK-LABEL: f40
 ; CHECK: add w[[A:[0-9]+]], w1, w0
 ; CHECK: add {{w[0-9]+}}, w[[A]], w3
   %s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0
@@ -165,7 +165,7 @@
 
 define i32 @caller40() #1 {
 entry:
-; CHECK: caller40
+; CHECK-LABEL: caller40
 ; CHECK: ldp x1, x2,
 ; CHECK: ldp x3, x4,
   %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
@@ -181,7 +181,7 @@
 ; passed on stack at [sp+8] and [sp+24]
 define i32 @caller40_stack() #1 {
 entry:
-; CHECK: caller40_stack
+; CHECK-LABEL: caller40_stack
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24]
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
 ; CHECK: movz w[[C:[0-9]+]], #0x9
@@ -197,7 +197,7 @@
 ; passed via i128 in x1 and x3
 define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
 entry:
-; CHECK: f41
+; CHECK-LABEL: f41
 ; CHECK: add w[[A:[0-9]+]], w1, w0
 ; CHECK: add {{w[0-9]+}}, w[[A]], w3
   %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
@@ -219,7 +219,7 @@
 
 define i32 @caller41() #1 {
 entry:
-; CHECK: caller41
+; CHECK-LABEL: caller41
 ; CHECK: ldp x1, x2,
 ; CHECK: ldp x3, x4,
   %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
@@ -235,7 +235,7 @@
 ; passed on stack at [sp+16] and [sp+32]
 define i32 @caller41_stack() #1 {
 entry:
-; CHECK: caller41_stack
+; CHECK-LABEL: caller41_stack
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
 ; CHECK: movz w[[C:[0-9]+]], #0x9
@@ -250,7 +250,7 @@
 ; structs with size of 22 bytes, passed indirectly in x1 and x2
 define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 {
 entry:
-; CHECK: f42
+; CHECK-LABEL: f42
 ; CHECK: ldr w[[A:[0-9]+]], [x1]
 ; CHECK: ldr w[[B:[0-9]+]], [x2]
 ; CHECK: add w[[C:[0-9]+]], w[[A]], w0
@@ -280,7 +280,7 @@
 ; For s1, we allocate a 22-byte space, pass its address via x1
 define i32 @caller42() #3 {
 entry:
-; CHECK: caller42
+; CHECK-LABEL: caller42
 ; CHECK: str {{x[0-9]+}}, [sp, #48]
 ; CHECK: str {{q[0-9]+}}, [sp, #32]
 ; CHECK: str {{x[0-9]+}}, [sp, #16]
@@ -290,7 +290,7 @@
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
 
-; FAST: caller42
+; FAST-LABEL: caller42
 ; FAST: sub sp, sp, #96
 ; Space for s1 is allocated at fp-24 = sp+72
 ; Space for s2 is allocated at sp+48
@@ -316,7 +316,7 @@
 
 define i32 @caller42_stack() #3 {
 entry:
-; CHECK: caller42_stack
+; CHECK-LABEL: caller42_stack
 ; CHECK: mov x29, sp
 ; CHECK: sub sp, sp, #96
 ; CHECK: stur {{x[0-9]+}}, [x29, #-16]
@@ -333,7 +333,7 @@
 ; CHECK: movz w[[C:[0-9]+]], #0x9
 ; CHECK: str w[[C]], [sp]
 
-; FAST: caller42_stack
+; FAST-LABEL: caller42_stack
 ; Space for s1 is allocated at fp-24
 ; Space for s2 is allocated at fp-48
 ; FAST: sub x[[A:[0-9]+]], x29, #24
@@ -359,12 +359,12 @@
 ; passed indirectly in x1 and x2
 define i32 @f43(i32 %i, %struct.s43* nocapture %s1, %struct.s43* nocapture %s2) #2 {
 entry:
-; CHECK: f43
+; CHECK-LABEL: f43
 ; CHECK: ldr w[[A:[0-9]+]], [x1]
 ; CHECK: ldr w[[B:[0-9]+]], [x2]
 ; CHECK: add w[[C:[0-9]+]], w[[A]], w0
 ; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]]
-; FAST: f43
+; FAST-LABEL: f43
 ; FAST: ldr w[[A:[0-9]+]], [x1]
 ; FAST: ldr w[[B:[0-9]+]], [x2]
 ; FAST: add w[[C:[0-9]+]], w[[A]], w0
@@ -388,7 +388,7 @@
 
 define i32 @caller43() #3 {
 entry:
-; CHECK: caller43
+; CHECK-LABEL: caller43
 ; CHECK: str {{q[0-9]+}}, [sp, #48]
 ; CHECK: str {{q[0-9]+}}, [sp, #32]
 ; CHECK: str {{q[0-9]+}}, [sp, #16]
@@ -398,7 +398,7 @@
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
 
-; FAST: caller43
+; FAST-LABEL: caller43
 ; FAST: mov x29, sp
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
@@ -428,7 +428,7 @@
 
 define i32 @caller43_stack() #3 {
 entry:
-; CHECK: caller43_stack
+; CHECK-LABEL: caller43_stack
 ; CHECK: mov x29, sp
 ; CHECK: sub sp, sp, #96
 ; CHECK: stur {{q[0-9]+}}, [x29, #-16]
@@ -445,7 +445,7 @@
 ; CHECK: movz w[[C:[0-9]+]], #0x9
 ; CHECK: str w[[C]], [sp]
 
-; FAST: caller43_stack
+; FAST-LABEL: caller43_stack
 ; FAST: sub sp, sp, #96
 ; Space for s1 is allocated at fp-32 = sp+64
 ; Space for s2 is allocated at sp+32
@@ -481,13 +481,13 @@
 
 define i32 @i128_split() {
 entry:
-; CHECK: i128_split
+; CHECK-LABEL: i128_split
 ; "i128 %0" should be on stack at [sp].
 ; "i32 8" should be on stack at [sp, #16].
 ; CHECK: str {{w[0-9]+}}, [sp, #16]
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp]
-; FAST: i128_split
-; FAST: sub sp, sp, #48
+; FAST-LABEL: i128_split
+; FAST: sub sp, sp
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16]
 ; Load/Store opt is disabled with -O0, so the i128 is split.
@@ -504,14 +504,16 @@
 
 define i32 @i64_split() {
 entry:
-; CHECK: i64_split
+; CHECK-LABEL: i64_split
 ; "i64 %0" should be in register x7.
 ; "i32 8" should be on stack at [sp].
 ; CHECK: ldr x7, [{{x[0-9]+}}]
 ; CHECK: str {{w[0-9]+}}, [sp]
-; FAST: i64_split
+; FAST-LABEL: i64_split
 ; FAST: ldr x7, [{{x[0-9]+}}]
-; FAST: str {{w[0-9]+}}, [sp]
+; FAST: mov x[[R0:[0-9]+]], sp
+; FAST: orr w[[R1:[0-9]+]], wzr, #0x8
+; FAST: str w[[R1]], {{\[}}x[[R0]]{{\]}}
   %0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16
   %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5,
                                     i32 6, i32 7, i64 %0, i32 8) #5

diff --git a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
index 08fb8c9..74bb398 100644
--- a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
+++ b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll

@@ -1,4 +1,4 @@
-; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple arm64-apple-ios3 -aarch64-gep-opt=false %s -o - | FileCheck %s
 ; <rdar://problem/13621857>
 
 @block = common global i8* null, align 8

diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll
index 700fba8..5433a8c 100644
--- a/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/test/CodeGen/AArch64/arm64-addrmode.ll

@@ -37,9 +37,8 @@
 
 ; base + unsigned offset (> imm12 * size of type in bytes)
 ; CHECK: @t4
-; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #8, lsl #12
-; CHECK: ldr xzr, [
-; CHECK: [[ADDREG]]]
+; CHECK: orr w[[NUM:[0-9]+]], wzr, #0x8000
+; CHECK: ldr xzr, [x{{[0-9]+}}, x[[NUM]]]
 ; CHECK: ret
 define void @t4() {
   %incdec.ptr = getelementptr inbounds i64* @object, i64 4096
@@ -60,9 +59,8 @@
 ; base + reg + imm
 ; CHECK: @t6
 ; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3
-; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #8, lsl #12
-; CHECK: ldr xzr, [
-; CHECK: [[ADDREG]]]
+; CHECK-NEXT: orr w[[NUM:[0-9]+]], wzr, #0x8000
+; CHECK: ldr xzr, [x{{[0-9]+}}, x[[NUM]]]
 ; CHECK: ret
 define void @t6(i64 %a) {
   %tmp1 = getelementptr inbounds i64* @object, i64 %a
@@ -70,3 +68,114 @@
   %tmp = load volatile i64* %incdec.ptr, align 8
   ret void
 }
+
+; Test base + wide immediate
+define void @t7(i64 %a) {
+; CHECK-LABEL: t7:
+; CHECK: orr w[[NUM:[0-9]+]], wzr, #0xffff
+; CHECK-NEXT: ldr xzr, [x0, x[[NUM]]]
+  %1 = add i64 %a, 65535   ;0xffff
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t8(i64 %a) {
+; CHECK-LABEL: t8:
+; CHECK: movn [[REG:x[0-9]+]], #0x1235
+; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
+  %1 = sub i64 %a, 4662   ;-4662 is 0xffffffffffffedca
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t9(i64 %a) {
+; CHECK-LABEL: t9:
+; CHECK: movn [[REG:x[0-9]+]], #0x1235, lsl #16
+; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
+  %1 = add i64 -305463297, %a   ;-305463297 is 0xffffffffedcaffff
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t10(i64 %a) {
+; CHECK-LABEL: t10:
+; CHECK: movz [[REG:x[0-9]+]], #0x123, lsl #48
+; CHECK-NEXT: ldr xzr, [x0, [[REG]]]
+  %1 = add i64 %a, 81909218222800896   ;0x123000000000000
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t11(i64 %a) {
+; CHECK-LABEL: t11:
+; CHECK: movz w[[NUM:[0-9]+]], #0x123, lsl #16
+; CHECK: movk w[[NUM:[0-9]+]], #0x4567
+; CHECK-NEXT: ldr xzr, [x0, x[[NUM]]]
+  %1 = add i64 %a, 19088743   ;0x1234567
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+; Test some boundaries that should not use movz/movn/orr
+define void @t12(i64 %a) {
+; CHECK-LABEL: t12:
+; CHECK: add [[REG:x[0-9]+]], x0, #4095
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+  %1 = add i64 %a, 4095   ;0xfff
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t13(i64 %a) {
+; CHECK-LABEL: t13:
+; CHECK: sub [[REG:x[0-9]+]], x0, #4095
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+  %1 = add i64 %a, -4095   ;-0xfff
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t14(i64 %a) {
+; CHECK-LABEL: t14:
+; CHECK: add [[REG:x[0-9]+]], x0, #291, lsl #12
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+  %1 = add i64 %a, 1191936   ;0x123000
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t15(i64 %a) {
+; CHECK-LABEL: t15:
+; CHECK: sub [[REG:x[0-9]+]], x0, #291, lsl #12
+; CHECK-NEXT: ldr xzr, {{\[}}[[REG]]]
+  %1 = add i64 %a, -1191936   ;0xFFFFFFFFFFEDD000
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t16(i64 %a) {
+; CHECK-LABEL: t16:
+; CHECK: ldr xzr, [x0, #28672]
+  %1 = add i64 %a, 28672   ;0x7000
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}
+
+define void @t17(i64 %a) {
+; CHECK-LABEL: t17:
+; CHECK: ldur xzr, [x0, #-256]
+  %1 = add i64 %a, -256   ;-0x100
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load volatile i64* %2, align 8
+  ret void
+}

diff --git a/test/CodeGen/AArch64/arm64-bcc.ll b/test/CodeGen/AArch64/arm64-bcc.ll
new file mode 100644
index 0000000..138ae90
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-bcc.ll

@@ -0,0 +1,60 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin  | FileCheck %s
+; Checks for conditional branch b.vs
+
+; Function Attrs: nounwind
+define i32 @add(i32, i32) {
+entry:
+  %2 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %0, i32 %1)
+  %3 = extractvalue { i32, i1 } %2, 1
+  br i1 %3, label %6, label %4
+
+; <label>:4                                       ; preds = %entry
+  %5 = extractvalue { i32, i1 } %2, 0
+  ret i32 %5
+
+; <label>:6                                       ; preds = %entry
+  tail call void @llvm.trap()
+  unreachable
+; CHECK: b.vs
+}
+
+%S64 = type <{ i64 }>
+%S32 = type <{ i32 }>
+%Sstruct = type <{ %S64, %S32 }>
+
+; Checks for a compile failure when optimizing csincr-cbz sequence
+
+define { i64, i1 } @foo(i64* , %Sstruct* , i1, i64) {
+entry:
+  %.sroa.0 = alloca i72, align 16
+  %.count.value = getelementptr inbounds %Sstruct* %1, i64 0, i32 0, i32 0
+  %4 = load i64* %.count.value, align 8
+  %.repeatedValue.value = getelementptr inbounds %Sstruct* %1, i64 0, i32 1, i32 0
+  %5 = load i32* %.repeatedValue.value, align 8
+  %6 = icmp eq i64 %4, 0
+  br label %7
+
+; <label>:7                                      ; preds = %entry
+  %.mask58 = and i32 %5, -2048
+  %8 = icmp eq i32 %.mask58, 55296
+  %.not134 = xor i1 %8, true
+  %9 = icmp eq i32 %5, 1114112
+  %or.cond135 = and i1 %9, %.not134
+  br i1 %or.cond135, label %10, label %.loopexit
+
+; <label>:10                                      ; preds = %10, %7
+  %11 = and i32 %5, -2048
+  %12 = icmp eq i32 %11, 55296
+  br i1 %12, label %.loopexit, label %10
+
+
+.loopexit:                                        ; preds = %10, %7
+  tail call void @llvm.trap()
+  unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
+
+; Function Attrs: noreturn nounwind
+declare void @llvm.trap()

diff --git a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
index f0e968b..d2985f4 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll

@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
 
 ; CHECK-LABEL: test_i64_f64:
 define void @test_i64_f64(double* %p, i64* %q) {

diff --git a/test/CodeGen/AArch64/arm64-big-endian-eh.ll b/test/CodeGen/AArch64/arm64-big-endian-eh.ll
index 93e7da9..a51703a 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-eh.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-eh.ll

@@ -1,4 +1,4 @@
-; RUN: llc -mtriple arm64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s
+; RUN: llc -mtriple aarch64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s
 
 ; ARM EHABI for big endian
 ; This test case checks whether CIE length record is laid out in big endian format.

diff --git a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
index d7b26b9..db1f48c 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll

@@ -3,7 +3,7 @@
 ; Vararg saving must save Q registers using the equivalent of STR/STP.
 
 target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "arm64_be-arm-none-eabi"
+target triple = "aarch64_be-arm-none-eabi"
 
 %struct.__va_list = type { i8*, i8*, i8*, i32, i32 }
 

diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
index 1dcccf1..cc9badc 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll

@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s
 
 ; CHECK-LABEL: test_i64_f64:
 define i64 @test_i64_f64(double %p) {

diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
index 9a12b7a..d72d0a5 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll

@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
 
 ; CHECK-LABEL: test_i64_f64:
 declare i64 @test_i64_f64_helper(double %p)

diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll
index 5d62cfe..b74ece8 100644
--- a/test/CodeGen/AArch64/arm64-cse.ll
+++ b/test/CodeGen/AArch64/arm64-cse.ll

@@ -1,4 +1,4 @@
-; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -O3 < %s -aarch64-atomic-cfg-tidy=0 -aarch64-gep-opt=false | FileCheck %s
 target triple = "arm64-apple-ios"
 
 ; rdar://12462006

diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
index 6eed48b..2eb6307 100644
--- a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll

@@ -1,8 +1,4 @@
 ; RUN: llc -mcpu=cyclone < %s | FileCheck %s
-
-; r208640 broke ppc64/Linux self-hosting; xfailing while this is worked on.
-; XFAIL: *
-
 target datalayout = "e-i64:64-n32:64-S128"
 target triple = "arm64-apple-ios"
 

diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
deleted file mode 100644
index ce132c6..0000000
--- a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
+++ /dev/null

@@ -1,46 +0,0 @@
-; RUN: llc -O3 < %s | FileCheck %s
-; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s
-; Test case for a DAG combiner bug where we combined an indexed load
-; with an extension (sext, zext, or any) into a regular extended load,
-; i.e., dropping the indexed value.
-; <rdar://problem/16389332>
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "arm64-apple-ios"
-
-%class.A = type { i64, i64 }
-%class.C = type { i64 }
-
-; CHECK-LABEL: XX:
-; CHECK: ldr
-define i32 @XX(%class.A* %K, i1 %tst, i32* %addr, %class.C** %ppC, %class.C* %pC) {
-entry:
-  br i1 %tst, label %if.then, label %lor.rhs.i
-
-lor.rhs.i:                                        ; preds = %entry
-  %tmp = load i32* %addr, align 4
-  %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1
-  %tmp1 = load i64* %y.i.i.i, align 8
-  %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32
-  %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17
-  %add12.i = add nsw i32 0, %div11.i
-  %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32
-  %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32
-  %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13
-  %add16.i = add nsw i32 %add12.i, %div15.i
-  %rem.i.i = srem i32 %add16.i, %tmp
-  %idxprom = sext i32 %rem.i.i to i64
-  %arrayidx = getelementptr inbounds %class.C** %ppC, i64 %idxprom
-  %tobool533 = icmp eq %class.C* %pC, null
-  br i1 %tobool533, label %while.end, label %while.body
-
-if.then:                                          ; preds = %entry
-  ret i32 42
-
-while.body:                                       ; preds = %lor.rhs.i
-  ret i32 5
-
-while.end:                                        ; preds = %lor.rhs.i
-  %tmp3 = load %class.C** %arrayidx, align 8
-  ret i32 50
-}

diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
index a239403..06bd927 100644
--- a/test/CodeGen/AArch64/arm64-extern-weak.ll
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll

@@ -1,16 +1,23 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK-STATIC %s
 ; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
 
 declare extern_weak i32 @var()
 
 define i32()* @foo() {
 ; The usual ADRP/ADD pair can't be used for a weak reference because it must
-; evaluate to 0 if the symbol is undefined. We use a litpool entry.
+; evaluate to 0 if the symbol is undefined. We use a GOT entry for PIC,
+; otherwise a litpool entry.
   ret i32()* @var
 
 ; CHECK: adrp x[[VAR:[0-9]+]], :got:var
 ; CHECK: ldr x0, [x[[VAR]], :got_lo12:var]
 
+; CHECK-STATIC: .LCPI0_0:
+; CHECK-STATIC-NEXT: .xword  var
+; CHECK-STATIC: adrp x[[VAR:[0-9]+]], .LCPI0_0
+; CHECK-STATIC: ldr x0, [x[[VAR]], :lo12:.LCPI0_0]
+
   ; In the large model, the usual relocations are absolute and can
   ; materialise 0.
 ; CHECK-LARGE: movz x0, #:abs_g3:var
@@ -29,6 +36,11 @@
 ; CHECK: add x0, [[ARR_VAR]], #20
   ret i32* %addr
 
+; CHECK-STATIC: .LCPI1_0:
+; CHECK-STATIC-NEXT: .xword arr_var
+; CHECK-STATIC: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, :lo12:.LCPI1_0]
+; CHECK-STATIC: add x0, [[BASE]], #20
+
   ; In the large model, the usual relocations are absolute and can
   ; materialise 0.
 ; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var
@@ -44,6 +56,9 @@
 ; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
 ; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
 
+; CHECK-STATIC: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK-STATIC: add x0, [[BASE]], :lo12:defined_weak_var
+
 ; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
 ; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
 ; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
index ebd847e..d81bc7c 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 @sortlist = common global [5001 x i32] zeroinitializer, align 16
 @sortlist2 = common global [5001 x i64] zeroinitializer, align 16

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
index 1706e9e..a841702 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll

@@ -1,5 +1,5 @@
 ; This test should cause the TargetMaterializeAlloca to be invoked
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 %struct.S1Ty = type { i64 }
 %struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
@@ -15,9 +15,8 @@
 entry:
 ; CHECK: main
 ; CHECK: mov x29, sp
-; CHECK: mov x[[REG:[0-9]+]], sp
-; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
-; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
+; CHECK: mov [[REG:x[0-9]+]], sp
+; CHECK-NEXT: add x0, [[REG]], #8
   %E = alloca %struct.S2Ty, align 4
   %B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1
   call void @takeS1(%struct.S1Ty* %B)

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
index 37a8295..f896d85 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-br.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone -verify-machineinstrs < %s | FileCheck %s
 
 define void @branch1() nounwind uwtable ssp {
   %x = alloca i32, align 4
@@ -95,7 +95,7 @@
   store i64 %d, i64* %d.addr, align 8
   %0 = load i16* %b.addr, align 2
 ; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
+; CHECK: cmp w0, #0
 ; CHECK: b.eq LBB4_2
   %conv = trunc i16 %0 to i1
   br i1 %conv, label %if.then, label %if.end
@@ -107,7 +107,7 @@
 if.end:                                           ; preds = %if.then, %entry
   %1 = load i32* %c.addr, align 4
 ; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1
-; CHECK: subs w{{[0-9]+}}, w[[REG]], #0
+; CHECK: cmp w[[REG]], #0
 ; CHECK: b.eq LBB4_4
   %conv1 = trunc i32 %1 to i1
   br i1 %conv1, label %if.then3, label %if.end4
@@ -118,7 +118,7 @@
 
 if.end4:                                          ; preds = %if.then3, %if.end
   %2 = load i64* %d.addr, align 8
-; CHECK: subs w{{[0-9]+}}, w{{[0-9]+}}, #0
+; CHECK: cmp w{{[0-9]+}}, #0
 ; CHECK: b.eq LBB4_6
   %conv5 = trunc i64 %2 to i1
   br i1 %conv5, label %if.then7, label %if.end8
@@ -137,11 +137,10 @@
 ; rdar://15174028
 define i32 @trunc64(i64 %foo) nounwind {
 ; CHECK: trunc64
-; CHECK: orr  [[REG:x[0-9]+]], xzr, #0x1
-; CHECK: and  [[REG2:x[0-9]+]], x0, [[REG]]
-; CHECK: mov  x[[REG3:[0-9]+]], [[REG2]]
-; CHECK: and  [[REG4:w[0-9]+]], w[[REG3]], #0x1
-; CHECK: subs {{w[0-9]+}}, [[REG4]], #0
+; CHECK: and  [[REG1:x[0-9]+]], x0, #0x1
+; CHECK: mov  x[[REG2:[0-9]+]], [[REG1]]
+; CHECK: and  [[REG3:w[0-9]+]], w[[REG2]], #0x1
+; CHECK: cmp  [[REG3]], #0
 ; CHECK: b.eq LBB5_2
   %a = and i64 %foo, 1
   %b = trunc i64 %a to i1

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-call.ll b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
index 8d756ae..f1e2c40 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-call.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-call.ll

@@ -1,5 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64_be-linux-gnu | FileCheck %s --check-prefix=CHECK-BE
+; RUN: llc -O0 -fast-isel-abort -fast-isel-abort-args -code-model=small -verify-machineinstrs -mtriple=arm64-apple-darwin   < %s | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -fast-isel-abort-args -code-model=large -verify-machineinstrs -mtriple=arm64-apple-darwin   < %s | FileCheck %s --check-prefix=LARGE
+; RUN: llc -O0 -fast-isel-abort -fast-isel-abort-args -code-model=small -verify-machineinstrs -mtriple=aarch64_be-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-BE
 
 define void @call0() nounwind {
 entry:
@@ -8,8 +9,12 @@
 
 define void @foo0() nounwind {
 entry:
-; CHECK: foo0
-; CHECK: bl _call0
+; CHECK-LABEL: foo0
+; CHECK:       bl _call0
+; LARGE-LABEL: foo0
+; LARGE:       adrp [[REG0:x[0-9]+]], _call0@GOTPAGE
+; LARGE:       ldr  [[REG1:x[0-9]+]], {{\[}}[[REG0]], _call0@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr  [[REG1]]
   call void @call0()
   ret void
 }
@@ -24,10 +29,10 @@
 
 define i32 @foo1(i32 %a) nounwind {
 entry:
-; CHECK: foo1
-; CHECK: stur w0, [x29, #-4]
-; CHECK-NEXT: ldur w0, [x29, #-4]
-; CHECK-NEXT: bl _call1
+; CHECK-LABEL: foo1
+; CHECK:       stur w0, [x29, #-4]
+; CHECK-NEXT:  ldur w0, [x29, #-4]
+; CHECK-NEXT:  bl _call1
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
   %tmp = load i32* %a.addr, align 4
@@ -37,10 +42,10 @@
 
 define i32 @sext_(i8 %a, i16 %b) nounwind {
 entry:
-; CHECK: @sext_
-; CHECK: sxtb w0, w0
-; CHECK: sxth w1, w1
-; CHECK: bl _foo_sext_
+; CHECK-LABEL: sext_
+; CHECK:       sxtb w0, w0
+; CHECK:       sxth w1, w1
+; CHECK:       bl _foo_sext_
   call void @foo_sext_(i8 signext %a, i16 signext %b)
   ret i32 0
 }
@@ -49,9 +54,9 @@
 
 define i32 @zext_(i8 %a, i16 %b) nounwind {
 entry:
-; CHECK: @zext_
-; CHECK: uxtb w0, w0
-; CHECK: uxth w1, w1
+; CHECK-LABEL: zext_
+; CHECK:       uxtb w0, w0
+; CHECK:       uxth w1, w1
   call void @foo_zext_(i8 zeroext %a, i16 zeroext %b)
   ret i32 0
 }
@@ -60,10 +65,10 @@
 
 define i32 @t1(i32 %argc, i8** nocapture %argv) {
 entry:
-; CHECK: @t1
+; CHECK-LABEL: @t1
 ; The last parameter will be passed on stack via i8.
-; CHECK: strb w{{[0-9]+}}, [sp]
-; CHECK-NEXT: bl _bar
+; CHECK:       strb w{{[0-9]+}}, [sp]
+; CHECK:       bl _bar
   %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70, i8 zeroext 28, i8 zeroext 39, i8 zeroext -41)
   ret i32 0
 }
@@ -73,18 +78,19 @@
 ; Test materialization of integers.  Target-independent selector handles this.
 define i32 @t2() {
 entry:
-; CHECK: @t2
-; CHECK: movz x0, #0
-; CHECK: orr w1, wzr, #0xfffffff8
-; CHECK: orr w[[REG:[0-9]+]], wzr, #0x3ff
-; CHECK: orr w[[REG2:[0-9]+]], wzr, #0x2
-; CHECK: movz w[[REG3:[0-9]+]], #0
-; CHECK: orr w[[REG4:[0-9]+]], wzr, #0x1
-; CHECK: uxth w2, w[[REG]]
-; CHECK: sxtb w3, w[[REG2]]
-; CHECK: and w4, w[[REG3]], #0x1
-; CHECK: and w5, w[[REG4]], #0x1
-; CHECK: bl	_func2
+; CHECK-LABEL: t2
+; CHECK:       mov [[REG1:x[0-9]+]], xzr
+; CHECK:       orr w1, wzr, #0xfffffff8
+; CHECK:       orr [[REG2:w[0-9]+]], wzr, #0x3ff
+; CHECK:       orr [[REG3:w[0-9]+]], wzr, #0x2
+; CHECK:       mov [[REG4:w[0-9]+]], wzr
+; CHECK:       orr [[REG5:w[0-9]+]], wzr, #0x1
+; CHECK:       mov x0, [[REG1]]
+; CHECK:       uxth w2, [[REG2]]
+; CHECK:       sxtb w3, [[REG3]]
+; CHECK:       and w4, [[REG4]], #0x1
+; CHECK:       and w5, [[REG5]], #0x1
+; CHECK:       bl _func2
   %call = call i32 @func2(i64 zeroext 0, i32 signext -8, i16 zeroext 1023, i8 signext -254, i1 zeroext 0, i1 zeroext 1)
   ret i32 0
 }
@@ -94,7 +100,170 @@
 declare void @callee_b0f(i8 %bp10, i8 %bp11, i8 %bp12, i8 %bp13, i8 %bp14, i8 %bp15, i8 %bp17, i8 %bp18, i8 %bp19)
 define void @caller_b1f() {
 entry:
-  ; CHECK-BE: strb w{{.*}}, [sp, #7]
+; CHECK-BE-LABEL: caller_b1f
+; CHECK-BE:       strb w{{.*}}, [sp, #7]
   call void @callee_b0f(i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 42)
   ret void
 }
+
+define zeroext i1 @call_arguments1(i1 %a1, i1 %a2, i1 %a3, i1 %a4, i1 %a5, i1 %a6, i1 %a7, i1 %a8) {
+; CHECK-LABEL: call_arguments1
+; CHECK:       and {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, w2, w3
+; CHECK-NEXT:  and {{w[0-9]+}}, w4, w5
+; CHECK-NEXT:  and {{w[0-9]+}}, w6, w7
+  %1 = and i1 %a1, %a2
+  %2 = and i1 %a3, %a4
+  %3 = and i1 %a5, %a6
+  %4 = and i1 %a7, %a8
+  %5 = and i1 %1, %2
+  %6 = and i1 %3, %4
+  %7 = and i1 %5, %6
+  ret i1 %7
+}
+
+define i32 @call_arguments2(i8 zeroext %a1, i8 zeroext %a2, i8 zeroext %a3, i8 zeroext %a4, i8 signext %a5, i8 signext %a6, i8 signext %a7, i8 signext %a8) {
+; CHECK-LABEL: call_arguments2
+; CHECK:       add {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:  add {{w[0-9]+}}, w2, w3
+; CHECK-NEXT:  add {{w[0-9]+}}, w4, w5
+; CHECK-NEXT:  add {{w[0-9]+}}, w6, w7
+  %a1z = zext i8 %a1 to i32
+  %a2z = zext i8 %a2 to i32
+  %a3z = zext i8 %a3 to i32
+  %a4z = zext i8 %a4 to i32
+  %a5s = sext i8 %a5 to i32
+  %a6s = sext i8 %a6 to i32
+  %a7s = sext i8 %a7 to i32
+  %a8s = sext i8 %a8 to i32
+  %1 = add i32 %a1z, %a2z
+  %2 = add i32 %a3z, %a4z
+  %3 = add i32 %a5s, %a6s
+  %4 = add i32 %a7s, %a8s
+  %5 = add i32 %1, %2
+  %6 = add i32 %3, %4
+  %7 = add i32 %5, %6
+  ret i32 %7
+}
+
+define i32 @call_arguments3(i16 zeroext %a1, i16 zeroext %a2, i16 zeroext %a3, i16 zeroext %a4, i16 signext %a5, i16 signext %a6, i16 signext %a7, i16 signext %a8) {
+; CHECK-LABEL: call_arguments3
+; CHECK:       add {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:  add {{w[0-9]+}}, w2, w3
+; CHECK-NEXT:  add {{w[0-9]+}}, w4, w5
+; CHECK-NEXT:  add {{w[0-9]+}}, w6, w7
+  %a1z = zext i16 %a1 to i32
+  %a2z = zext i16 %a2 to i32
+  %a3z = zext i16 %a3 to i32
+  %a4z = zext i16 %a4 to i32
+  %a5s = sext i16 %a5 to i32
+  %a6s = sext i16 %a6 to i32
+  %a7s = sext i16 %a7 to i32
+  %a8s = sext i16 %a8 to i32
+  %1 = add i32 %a1z, %a2z
+  %2 = add i32 %a3z, %a4z
+  %3 = add i32 %a5s, %a6s
+  %4 = add i32 %a7s, %a8s
+  %5 = add i32 %1, %2
+  %6 = add i32 %3, %4
+  %7 = add i32 %5, %6
+  ret i32 %7
+}
+
+define i32 @call_arguments4(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
+; CHECK-LABEL: call_arguments4
+; CHECK:       add {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:  add {{w[0-9]+}}, w2, w3
+; CHECK-NEXT:  add {{w[0-9]+}}, w4, w5
+; CHECK-NEXT:  add {{w[0-9]+}}, w6, w7
+  %1 = add i32 %a1, %a2
+  %2 = add i32 %a3, %a4
+  %3 = add i32 %a5, %a6
+  %4 = add i32 %a7, %a8
+  %5 = add i32 %1, %2
+  %6 = add i32 %3, %4
+  %7 = add i32 %5, %6
+  ret i32 %7
+}
+
+define i64 @call_arguments5(i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8) {
+; CHECK-LABEL: call_arguments5
+; CHECK:       add {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:  add {{x[0-9]+}}, x2, x3
+; CHECK-NEXT:  add {{x[0-9]+}}, x4, x5
+; CHECK-NEXT:  add {{x[0-9]+}}, x6, x7
+  %1 = add i64 %a1, %a2
+  %2 = add i64 %a3, %a4
+  %3 = add i64 %a5, %a6
+  %4 = add i64 %a7, %a8
+  %5 = add i64 %1, %2
+  %6 = add i64 %3, %4
+  %7 = add i64 %5, %6
+  ret i64 %7
+}
+
+define float @call_arguments6(float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, float %a8) {
+; CHECK-LABEL: call_arguments6
+; CHECK:       fadd {{s[0-9]+}}, s0, s1
+; CHECK-NEXT:  fadd {{s[0-9]+}}, s2, s3
+; CHECK-NEXT:  fadd {{s[0-9]+}}, s4, s5
+; CHECK-NEXT:  fadd {{s[0-9]+}}, s6, s7
+  %1 = fadd float %a1, %a2
+  %2 = fadd float %a3, %a4
+  %3 = fadd float %a5, %a6
+  %4 = fadd float %a7, %a8
+  %5 = fadd float %1, %2
+  %6 = fadd float %3, %4
+  %7 = fadd float %5, %6
+  ret float %7
+}
+
+define double @call_arguments7(double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7, double %a8) {
+; CHECK-LABEL: call_arguments7
+; CHECK:       fadd {{d[0-9]+}}, d0, d1
+; CHECK-NEXT:  fadd {{d[0-9]+}}, d2, d3
+; CHECK-NEXT:  fadd {{d[0-9]+}}, d4, d5
+; CHECK-NEXT:  fadd {{d[0-9]+}}, d6, d7
+  %1 = fadd double %a1, %a2
+  %2 = fadd double %a3, %a4
+  %3 = fadd double %a5, %a6
+  %4 = fadd double %a7, %a8
+  %5 = fadd double %1, %2
+  %6 = fadd double %3, %4
+  %7 = fadd double %5, %6
+  ret double %7
+}
+
+define i64 @call_arguments8(i32 %a1, i64 %a2, i32 %a3, i64 %a4) {
+; CHECK-LABEL: call_arguments8
+; CHECK:       ubfx  [[REG1:x[0-9]+]], {{x[0-9]+}}, #0, #32
+; CHECK:       ubfx  [[REG2:x[0-9]+]], {{x[0-9]+}}, #0, #32
+; CHECK:       add {{x[0-9]+}}, [[REG1]], x1
+; CHECK-NEXT:  add {{x[0-9]+}}, [[REG2]], x3
+  %aa1 = zext i32 %a1 to i64
+  %aa3 = zext i32 %a3 to i64
+  %1 = add i64 %aa1, %a2
+  %2 = add i64 %aa3, %a4
+  %3 = add i64 %1, %2
+  ret i64 %3
+}
+
+define void @call_arguments9(i8 %a1, i16 %a2, i32 %a3, i64 %a4, float %a5, double %a6, i64 %a7, double %a8) {
+; CHECK-LABEL: call_arguments9
+  ret void
+}
+
+; Test that we use the correct register class for the branch.
+define void @call_blr(i64 %Fn, i1 %c) {
+; CHECK-LABEL: call_blr
+; CHECK:       blr
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  %1 = inttoptr i64 %Fn to void (i64)*
+  br label %bb2
+bb2:
+  %2 = phi void (i64)* [ %1, %bb1 ], [ undef, %0 ]
+  call void %2(i64 1)
+  ret void
+}
+

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
index c5417de..e515184 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll

@@ -1,9 +1,9 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin -mcpu=cyclone < %s | FileCheck %s
 
 ;; Test various conversions.
 define zeroext i32 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
 entry:
-; CHECK: trunc_
+; CHECK-LABEL: trunc_
 ; CHECK: sub sp, sp, #16
 ; CHECK: strb w0, [sp, #15]
 ; CHECK: strh w1, [sp, #12]
@@ -17,7 +17,6 @@
 ; CHECK: ldrh w0, [sp, #12]
 ; CHECK: strb w0, [sp, #15]
 ; CHECK: ldrb w0, [sp, #15]
-; CHECK: uxtb w0, w0
 ; CHECK: add sp, sp, #16
 ; CHECK: ret
   %a.addr = alloca i8, align 1
@@ -44,21 +43,18 @@
 
 define i64 @zext_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
 entry:
-; CHECK: zext_
+; CHECK-LABEL: zext_
 ; CHECK: sub sp, sp, #16
 ; CHECK: strb w0, [sp, #15]
 ; CHECK: strh w1, [sp, #12]
 ; CHECK: str w2, [sp, #8]
 ; CHECK: str x3, [sp]
 ; CHECK: ldrb w0, [sp, #15]
-; CHECK: uxtb w0, w0
 ; CHECK: strh w0, [sp, #12]
 ; CHECK: ldrh w0, [sp, #12]
-; CHECK: uxth w0, w0
 ; CHECK: str w0, [sp, #8]
 ; CHECK: ldr w0, [sp, #8]
 ; CHECK: mov x3, x0
-; CHECK: ubfx x3, x3, #0, #32
 ; CHECK: str x3, [sp]
 ; CHECK: ldr x0, [sp]
 ; CHECK: ret
@@ -85,37 +81,35 @@
 
 define i32 @zext_i1_i32(i1 zeroext %a) nounwind ssp {
 entry:
-; CHECK: @zext_i1_i32
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: zext_i1_i32
+; CHECK-NOT:   and w0, w0, #0x1
+; CHECK:       ret
   %conv = zext i1 %a to i32
   ret i32 %conv;
 }
 
 define i64 @zext_i1_i64(i1 zeroext %a) nounwind ssp {
 entry:
-; CHECK: @zext_i1_i64
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: zext_i1_i64
+; CHECK-NOT:   and w0, w0, #0x1
+; CHECK:       ret
   %conv = zext i1 %a to i64
   ret i64 %conv;
 }
 
 define i64 @sext_(i8 signext %a, i16 signext %b, i32 %c, i64 %d) nounwind ssp {
 entry:
-; CHECK: sext_
+; CHECK-LABEL: sext_
 ; CHECK: sub sp, sp, #16
 ; CHECK: strb w0, [sp, #15]
 ; CHECK: strh w1, [sp, #12]
 ; CHECK: str w2, [sp, #8]
 ; CHECK: str x3, [sp]
-; CHECK: ldrb w0, [sp, #15]
-; CHECK: sxtb w0, w0
+; CHECK: ldrsb w0, [sp, #15]
 ; CHECK: strh w0, [sp, #12]
-; CHECK: ldrh w0, [sp, #12]
-; CHECK: sxth w0, w0
+; CHECK: ldrsh w0, [sp, #12]
 ; CHECK: str w0, [sp, #8]
-; CHECK: ldr w0, [sp, #8]
-; CHECK: mov x3, x0
-; CHECK: sxtw x3, w3
+; CHECK: ldrsw x3, [sp, #8]
 ; CHECK: str x3, [sp]
 ; CHECK: ldr x0, [sp]
 ; CHECK: ret
@@ -161,8 +155,9 @@
 ; Test sext i1 to i32
 define i32 @sext_i1_i32(i1 signext %a) nounwind ssp {
 entry:
-; CHECK: sext_i1_i32
-; CHECK: sbfx w0, w0, #0, #1
+; CHECK-LABEL: sext_i1_i32
+; CHECK-NOT:   sbfx w0, w0, #0, #1
+; CHECK:       ret
   %conv = sext i1 %a to i32
   ret i32 %conv
 }
@@ -170,7 +165,7 @@
 ; Test sext i1 to i16
 define signext i16 @sext_i1_i16(i1 %a) nounwind ssp {
 entry:
-; CHECK: sext_i1_i16
+; CHECK-LABEL: sext_i1_i16
 ; CHECK: sbfx w0, w0, #0, #1
   %conv = sext i1 %a to i16
   ret i16 %conv
@@ -179,7 +174,7 @@
 ; Test sext i1 to i8
 define signext i8 @sext_i1_i8(i1 %a) nounwind ssp {
 entry:
-; CHECK: sext_i1_i8
+; CHECK-LABEL: sext_i1_i8
 ; CHECK: sbfx w0, w0, #0, #1
   %conv = sext i1 %a to i8
   ret i8 %conv
@@ -188,7 +183,7 @@
 ; Test fpext
 define double @fpext_(float %a) nounwind ssp {
 entry:
-; CHECK: fpext_
+; CHECK-LABEL: fpext_
 ; CHECK: fcvt d0, s0
   %conv = fpext float %a to double
   ret double %conv
@@ -197,7 +192,7 @@
 ; Test fptrunc
 define float @fptrunc_(double %a) nounwind ssp {
 entry:
-; CHECK: fptrunc_
+; CHECK-LABEL: fptrunc_
 ; CHECK: fcvt s0, d0
   %conv = fptrunc double %a to float
   ret float %conv
@@ -206,7 +201,7 @@
 ; Test fptosi
 define i32 @fptosi_ws(float %a) nounwind ssp {
 entry:
-; CHECK: fptosi_ws
+; CHECK-LABEL: fptosi_ws
 ; CHECK: fcvtzs w0, s0
   %conv = fptosi float %a to i32
   ret i32 %conv
@@ -215,7 +210,7 @@
 ; Test fptosi
 define i32 @fptosi_wd(double %a) nounwind ssp {
 entry:
-; CHECK: fptosi_wd
+; CHECK-LABEL: fptosi_wd
 ; CHECK: fcvtzs w0, d0
   %conv = fptosi double %a to i32
   ret i32 %conv
@@ -224,7 +219,7 @@
 ; Test fptoui
 define i32 @fptoui_ws(float %a) nounwind ssp {
 entry:
-; CHECK: fptoui_ws
+; CHECK-LABEL: fptoui_ws
 ; CHECK: fcvtzu w0, s0
   %conv = fptoui float %a to i32
   ret i32 %conv
@@ -233,7 +228,7 @@
 ; Test fptoui
 define i32 @fptoui_wd(double %a) nounwind ssp {
 entry:
-; CHECK: fptoui_wd
+; CHECK-LABEL: fptoui_wd
 ; CHECK: fcvtzu w0, d0
   %conv = fptoui double %a to i32
   ret i32 %conv
@@ -242,7 +237,7 @@
 ; Test sitofp
 define float @sitofp_sw_i1(i1 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sw_i1
+; CHECK-LABEL: sitofp_sw_i1
 ; CHECK: sbfx w0, w0, #0, #1
 ; CHECK: scvtf s0, w0
   %conv = sitofp i1 %a to float
@@ -252,7 +247,7 @@
 ; Test sitofp
 define float @sitofp_sw_i8(i8 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sw_i8
+; CHECK-LABEL: sitofp_sw_i8
 ; CHECK: sxtb w0, w0
 ; CHECK: scvtf s0, w0
   %conv = sitofp i8 %a to float
@@ -262,9 +257,7 @@
 ; Test sitofp
 define float @sitofp_sw_i16(i16 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sw_i16
-; CHECK: sxth w0, w0
-; CHECK: scvtf s0, w0
+; CHECK-LABEL: sitofp_sw_i16
   %conv = sitofp i16 %a to float
   ret float %conv
 }
@@ -272,7 +265,7 @@
 ; Test sitofp
 define float @sitofp_sw(i32 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sw
+; CHECK-LABEL: sitofp_sw
 ; CHECK: scvtf s0, w0
   %conv = sitofp i32 %a to float
   ret float %conv
@@ -281,7 +274,7 @@
 ; Test sitofp
 define float @sitofp_sx(i64 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_sx
+; CHECK-LABEL: sitofp_sx
 ; CHECK: scvtf s0, x0
   %conv = sitofp i64 %a to float
   ret float %conv
@@ -290,7 +283,7 @@
 ; Test sitofp
 define double @sitofp_dw(i32 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_dw
+; CHECK-LABEL: sitofp_dw
 ; CHECK: scvtf d0, w0
   %conv = sitofp i32 %a to double
   ret double %conv
@@ -299,7 +292,7 @@
 ; Test sitofp
 define double @sitofp_dx(i64 %a) nounwind ssp {
 entry:
-; CHECK: sitofp_dx
+; CHECK-LABEL: sitofp_dx
 ; CHECK: scvtf d0, x0
   %conv = sitofp i64 %a to double
   ret double %conv
@@ -308,7 +301,7 @@
 ; Test uitofp
 define float @uitofp_sw_i1(i1 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sw_i1
+; CHECK-LABEL: uitofp_sw_i1
 ; CHECK: and w0, w0, #0x1
 ; CHECK: ucvtf s0, w0
   %conv = uitofp i1 %a to float
@@ -318,9 +311,7 @@
 ; Test uitofp
 define float @uitofp_sw_i8(i8 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sw_i8
-; CHECK: uxtb w0, w0
-; CHECK: ucvtf s0, w0
+; CHECK-LABEL: uitofp_sw_i8
   %conv = uitofp i8 %a to float
   ret float %conv
 }
@@ -328,9 +319,7 @@
 ; Test uitofp
 define float @uitofp_sw_i16(i16 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sw_i16
-; CHECK: uxth w0, w0
-; CHECK: ucvtf s0, w0
+; CHECK-LABEL: uitofp_sw_i16
   %conv = uitofp i16 %a to float
   ret float %conv
 }
@@ -338,7 +327,7 @@
 ; Test uitofp
 define float @uitofp_sw(i32 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sw
+; CHECK-LABEL: uitofp_sw
 ; CHECK: ucvtf s0, w0
   %conv = uitofp i32 %a to float
   ret float %conv
@@ -347,7 +336,7 @@
 ; Test uitofp
 define float @uitofp_sx(i64 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_sx
+; CHECK-LABEL: uitofp_sx
 ; CHECK: ucvtf s0, x0
   %conv = uitofp i64 %a to float
   ret float %conv
@@ -356,7 +345,7 @@
 ; Test uitofp
 define double @uitofp_dw(i32 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_dw
+; CHECK-LABEL: uitofp_dw
 ; CHECK: ucvtf d0, w0
   %conv = uitofp i32 %a to double
   ret double %conv
@@ -365,7 +354,7 @@
 ; Test uitofp
 define double @uitofp_dx(i64 %a) nounwind ssp {
 entry:
-; CHECK: uitofp_dx
+; CHECK-LABEL: uitofp_dx
 ; CHECK: ucvtf d0, x0
   %conv = uitofp i64 %a to double
   ret double %conv
@@ -373,7 +362,7 @@
 
 define i32 @i64_trunc_i32(i64 %a) nounwind ssp {
 entry:
-; CHECK: i64_trunc_i32
+; CHECK-LABEL: i64_trunc_i32
 ; CHECK: mov x1, x0
   %conv = trunc i64 %a to i32
   ret i32 %conv
@@ -381,7 +370,7 @@
 
 define zeroext i16 @i64_trunc_i16(i64 %a) nounwind ssp {
 entry:
-; CHECK: i64_trunc_i16
+; CHECK-LABEL: i64_trunc_i16
 ; CHECK: mov x[[REG:[0-9]+]], x0
 ; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xffff
 ; CHECK: uxth w0, [[REG2]]
@@ -391,7 +380,7 @@
 
 define zeroext i8 @i64_trunc_i8(i64 %a) nounwind ssp {
 entry:
-; CHECK: i64_trunc_i8
+; CHECK-LABEL: i64_trunc_i8
 ; CHECK: mov x[[REG:[0-9]+]], x0
 ; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xff
 ; CHECK: uxtb w0, [[REG2]]
@@ -401,7 +390,7 @@
 
 define zeroext i1 @i64_trunc_i1(i64 %a) nounwind ssp {
 entry:
-; CHECK: i64_trunc_i1
+; CHECK-LABEL: i64_trunc_i1
 ; CHECK: mov x[[REG:[0-9]+]], x0
 ; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0x1
 ; CHECK: and w0, [[REG2]], #0x1
@@ -411,7 +400,7 @@
 
 ; rdar://15101939
 define void @stack_trunc() nounwind {
-; CHECK: stack_trunc
+; CHECK-LABEL: stack_trunc
 ; CHECK: sub  sp, sp, #16
 ; CHECK: ldr  [[REG:x[0-9]+]], [sp]
 ; CHECK: mov  x[[REG2:[0-9]+]], [[REG]]
@@ -428,15 +417,36 @@
 
 define zeroext i64 @zext_i8_i64(i8 zeroext %in) {
 ; CHECK-LABEL: zext_i8_i64:
-; CHECK: mov x[[TMP:[0-9]+]], x0
-; CHECK: ubfx x0, x[[TMP]], #0, #8
+; CHECK-NOT:   ubfx x0, {{x[0-9]+}}, #0, #8
+; CHECK:       ret
   %big = zext i8 %in to i64
   ret i64 %big
 }
 define zeroext i64 @zext_i16_i64(i16 zeroext %in) {
 ; CHECK-LABEL: zext_i16_i64:
-; CHECK: mov x[[TMP:[0-9]+]], x0
-; CHECK: ubfx x0, x[[TMP]], #0, #16
+; CHECK-NOT:   ubfx x0, {{x[0-9]+}}, #0, #16
+; CHECK:       ret
   %big = zext i16 %in to i64
   ret i64 %big
 }
+
+define float @bitcast_i32_to_float(i32 %a) {
+  %1 = bitcast i32 %a to float
+  ret float %1
+}
+
+define double @bitcast_i64_to_double(i64 %a) {
+  %1 = bitcast i64 %a to double
+  ret double %1
+}
+
+define i32 @bitcast_float_to_i32(float %a) {
+  %1 = bitcast float %a to i32
+  ret i32 %1
+}
+
+define i64 @bitcast_double_to_i64(double %a) {
+  %1 = bitcast double %a to i64
+  ret i64 %1
+}
+

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
index f030596..111b6bd 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll

@@ -1,146 +1,162 @@
-; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
-define zeroext i1 @fcmp_float1(float %a) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_float1
-; CHECK: fcmp s0, #0.0
-; CHECK: cset w{{[0-9]+}}, ne
-  %cmp = fcmp une float %a, 0.000000e+00
-  ret i1 %cmp
+define zeroext i1 @fcmp_float1(float %a) {
+; CHECK-LABEL: fcmp_float1
+; CHECK:       fcmp s0, #0.0
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une float %a, 0.000000e+00
+  ret i1 %1
 }
 
-define zeroext i1 @fcmp_float2(float %a, float %b) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_float2
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, ne
-  %cmp = fcmp une float %a, %b
-  ret i1 %cmp
+define zeroext i1 @fcmp_float2(float %a, float %b) {
+; CHECK-LABEL: fcmp_float2
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une float %a, %b
+  ret i1 %1
 }
 
-define zeroext i1 @fcmp_double1(double %a) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_double1
-; CHECK: fcmp d0, #0.0
-; CHECK: cset w{{[0-9]+}}, ne
-  %cmp = fcmp une double %a, 0.000000e+00
-  ret i1 %cmp
+define zeroext i1 @fcmp_double1(double %a) {
+; CHECK-LABEL: fcmp_double1
+; CHECK:       fcmp d0, #0.0
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une double %a, 0.000000e+00
+  ret i1 %1
 }
 
-define zeroext i1 @fcmp_double2(double %a, double %b) nounwind ssp {
-entry:
-; CHECK-LABEL: @fcmp_double2
-; CHECK: fcmp d0, d1
-; CHECK: cset w{{[0-9]+}}, ne
-  %cmp = fcmp une double %a, %b
-  ret i1 %cmp
+define zeroext i1 @fcmp_double2(double %a, double %b) {
+; CHECK-LABEL: fcmp_double2
+; CHECK:       fcmp d0, d1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une double %a, %b
+  ret i1 %1
 }
 
 ; Check each fcmp condition
-define float @fcmp_oeq(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_oeq
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, eq
-  %cmp = fcmp oeq float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_false(float %a) {
+; CHECK-LABEL: fcmp_false
+; CHECK:       mov {{w[0-9]+}}, wzr
+  %1 = fcmp ogt float %a, %a
+  ret i1 %1
 }
 
-define float @fcmp_ogt(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ogt
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, gt
-  %cmp = fcmp ogt float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_oeq(float %a, float %b) {
+; CHECK-LABEL: fcmp_oeq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, eq
+  %1 = fcmp oeq float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_oge(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_oge
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, ge
-  %cmp = fcmp oge float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_ogt(float %a, float %b) {
+; CHECK-LABEL: fcmp_ogt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, gt
+  %1 = fcmp ogt float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_olt(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_olt
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, mi
-  %cmp = fcmp olt float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_oge(float %a, float %b) {
+; CHECK-LABEL: fcmp_oge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ge
+  %1 = fcmp oge float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_ole(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ole
-; CHECK: fcmp s0, s1
-; CHECK: cset w{{[0-9]+}}, ls
-  %cmp = fcmp ole float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_olt(float %a, float %b) {
+; CHECK-LABEL: fcmp_olt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, mi
+  %1 = fcmp olt float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_ord(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ord
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, vc
-  %cmp = fcmp ord float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_ole(float %a, float %b) {
+; CHECK-LABEL: fcmp_ole
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ls
+  %1 = fcmp ole float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_uno(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_uno
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, vs
-  %cmp = fcmp uno float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_one(float %a, float %b) {
+; CHECK-LABEL: fcmp_one
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], mi
+; CHECK-NEXT:  csinc {{w[0-9]+}}, [[REG]], wzr, le
+  %1 = fcmp one float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_ugt(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ugt
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, hi
-  %cmp = fcmp ugt float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_ord(float %a, float %b) {
+; CHECK-LABEL: fcmp_ord
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, vc
+  %1 = fcmp ord float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_uge(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_uge
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, pl
-  %cmp = fcmp uge float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_uno(float %a, float %b) {
+; CHECK-LABEL: fcmp_uno
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, vs
+  %1 = fcmp uno float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_ult(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ult
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, lt
-  %cmp = fcmp ult float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_ueq(float %a, float %b) {
+; CHECK-LABEL: fcmp_ueq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  csinc {{w[0-9]+}}, [[REG]], wzr, vc
+  %1 = fcmp ueq float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_ule(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_ule
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, le
-  %cmp = fcmp ule float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_ugt(float %a, float %b) {
+; CHECK-LABEL: fcmp_ugt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, hi
+  %1 = fcmp ugt float %a, %b
+  ret i1 %1
 }
 
-define float @fcmp_une(float %a, float %b) nounwind ssp {
-; CHECK-LABEL: @fcmp_une
-; CHECK: fcmp s0, s1
-; CHECK: cset {{w[0-9]+}}, ne
-  %cmp = fcmp une float %a, %b
-  %conv = uitofp i1 %cmp to float
-  ret float %conv
+define zeroext i1 @fcmp_uge(float %a, float %b) {
+; CHECK-LABEL: fcmp_uge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, pl
+  %1 = fcmp uge float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_ult(float %a, float %b) {
+; CHECK-LABEL: fcmp_ult
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, lt
+  %1 = fcmp ult float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_ule(float %a, float %b) {
+; CHECK-LABEL: fcmp_ule
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, le
+  %1 = fcmp ule float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_une(float %a, float %b) {
+; CHECK-LABEL: fcmp_une
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  cset {{w[0-9]+}}, ne
+  %1 = fcmp une float %a, %b
+  ret i1 %1
+}
+
+define zeroext i1 @fcmp_true(float %a) {
+; CHECK-LABEL: fcmp_true
+; CHECK:       orr {{w[0-9]+}}, wzr, #0x1
+  %1 = fcmp ueq float %a, %a
+  ret i1 %1
 }

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
index dc4d895..1a4e8ea 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 ; Test load/store of global value from global offset table.
 @seed = common global i64 0, align 8
@@ -6,9 +6,9 @@
 define void @Initrand() nounwind {
 entry:
 ; CHECK: @Initrand
-; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
-; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
-; CHECK: str x{{[0-9]+}}, [x[[REG2]]]
+; CHECK: adrp [[REG:x[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr  [[REG2:x[0-9]+]], {{\[}}[[REG]], _seed@GOTPAGEOFF{{\]}}
+; CHECK: str  {{x[0-9]+}}, {{\[}}[[REG2]]{{\]}}
   store i64 74755, i64* @seed, align 8
   ret void
 }
@@ -16,17 +16,16 @@
 define i32 @Rand() nounwind {
 entry:
 ; CHECK: @Rand
-; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
-; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
-; CHECK: movz x[[REG3:[0-9]+]], #0x51d
-; CHECK: ldr x[[REG4:[0-9]+]], [x[[REG2]]]
-; CHECK: mul x[[REG5:[0-9]+]], x[[REG4]], x[[REG3]]
-; CHECK: movz x[[REG6:[0-9]+]], #0x3619
-; CHECK: add x[[REG7:[0-9]+]], x[[REG5]], x[[REG6]]
-; CHECK: orr x[[REG8:[0-9]+]], xzr, #0xffff
-; CHECK: and x[[REG9:[0-9]+]], x[[REG7]], x[[REG8]]
-; CHECK: str x[[REG9]], [x[[REG]]]
-; CHECK: ldr x{{[0-9]+}}, [x[[REG]]]
+; CHECK: adrp [[REG1:x[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr  [[REG2:x[0-9]+]], {{\[}}[[REG1]], _seed@GOTPAGEOFF{{\]}}
+; CHECK: movz [[REG3:x[0-9]+]], #0x3619
+; CHECK: movz [[REG4:x[0-9]+]], #0x51d
+; CHECK: ldr  [[REG5:x[0-9]+]], {{\[}}[[REG2]]{{\]}}
+; CHECK: mul  [[REG6:x[0-9]+]], [[REG5]], [[REG4]]
+; CHECK: add  [[REG7:x[0-9]+]], [[REG6]], [[REG3]]
+; CHECK: and  [[REG8:x[0-9]+]], [[REG7]], #0xffff
+; CHECK: str  [[REG8]], {{\[}}[[REG1]]{{\]}}
+; CHECK: ldr  {{x[0-9]+}}, {{\[}}[[REG1]]{{\]}}
   %0 = load i64* @seed, align 8
   %mul = mul nsw i64 %0, 1309
   %add = add nsw i64 %mul, 13849

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
index 971be5c..245c70e 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll

@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 define i32 @icmp_eq_imm(i32 %a) nounwind ssp {
 entry:
-; CHECK: icmp_eq_imm
-; CHECK: cmp  w0, #31
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_imm
+; CHECK:       cmp w0, #31
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i32 %a, 31
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -12,19 +12,19 @@
 
 define i32 @icmp_eq_neg_imm(i32 %a) nounwind ssp {
 entry:
-; CHECK: icmp_eq_neg_imm
-; CHECK: cmn  w0, #7
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_neg_imm
+; CHECK:       cmn w0, #7
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i32 %a, -7
   %conv = zext i1 %cmp to i32
   ret i32 %conv
 }
 
-define i32 @icmp_eq(i32 %a, i32 %b) nounwind ssp {
+define i32 @icmp_eq_i32(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_eq
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_i32
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -32,19 +32,39 @@
 
 define i32 @icmp_ne(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_ne
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, ne
+; CHECK-LABEL: icmp_ne
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, ne
   %cmp = icmp ne i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
 }
 
+define i32 @icmp_eq_ptr(i8* %a) {
+entry:
+; CHECK-LABEL: icmp_eq_ptr
+; CHECK:       cmp x0, #0
+; CHECK-NEXT:  cset {{.+}}, eq
+  %cmp = icmp eq i8* %a, null
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i32 @icmp_ne_ptr(i8* %a) {
+entry:
+; CHECK-LABEL: icmp_ne_ptr
+; CHECK:       cmp x0, #0
+; CHECK-NEXT:  cset {{.+}}, ne
+  %cmp = icmp ne i8* %a, null
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
 define i32 @icmp_ugt(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_ugt
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, hi
+; CHECK-LABEL: icmp_ugt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, hi
   %cmp = icmp ugt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -52,9 +72,9 @@
 
 define i32 @icmp_uge(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_uge
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, hs
+; CHECK-LABEL: icmp_uge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, hs
   %cmp = icmp uge i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -62,9 +82,9 @@
 
 define i32 @icmp_ult(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_ult
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, lo
+; CHECK-LABEL: icmp_ult
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, lo
   %cmp = icmp ult i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -72,9 +92,9 @@
 
 define i32 @icmp_ule(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_ule
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, ls
+; CHECK-LABEL: icmp_ule
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, ls
   %cmp = icmp ule i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -82,9 +102,9 @@
 
 define i32 @icmp_sgt(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_sgt
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, gt
+; CHECK-LABEL: icmp_sgt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, gt
   %cmp = icmp sgt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -92,9 +112,9 @@
 
 define i32 @icmp_sge(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_sge
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, ge
+; CHECK-LABEL: icmp_sge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, ge
   %cmp = icmp sge i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -102,9 +122,9 @@
 
 define i32 @icmp_slt(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_slt
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, lt
+; CHECK-LABEL: icmp_slt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, lt
   %cmp = icmp slt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -112,9 +132,9 @@
 
 define i32 @icmp_sle(i32 %a, i32 %b) nounwind ssp {
 entry:
-; CHECK: icmp_sle
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, le
+; CHECK-LABEL: icmp_sle
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  cset w0, le
   %cmp = icmp sle i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -122,9 +142,9 @@
 
 define i32 @icmp_i64(i64 %a, i64 %b) nounwind ssp {
 entry:
-; CHECK: icmp_i64
-; CHECK: cmp  x0, x1
-; CHECK: cset w{{[0-9]+}}, le
+; CHECK-LABEL: icmp_i64
+; CHECK:       cmp  x0, x1
+; CHECK-NEXT:  cset w{{[0-9]+}}, le
   %cmp = icmp sle i64 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -132,33 +152,30 @@
 
 define zeroext i1 @icmp_eq_i16(i16 %a, i16 %b) nounwind ssp {
 entry:
-; CHECK: icmp_eq_i16
-; CHECK: sxth w0, w0
-; CHECK: sxth w1, w1
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_i16
+; CHECK:       sxth w0, w0
+; CHECK:       cmp w0, w1, sxth
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i16 %a, %b
   ret i1 %cmp
 }
 
 define zeroext i1 @icmp_eq_i8(i8 %a, i8 %b) nounwind ssp {
 entry:
-; CHECK: icmp_eq_i8
-; CHECK: sxtb w0, w0
-; CHECK: sxtb w1, w1
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, eq
+; CHECK-LABEL: icmp_eq_i8
+; CHECK:       sxtb w0, w0
+; CHECK-NEXT:  cmp w0, w1, sxtb
+; CHECK-NEXT:  cset w0, eq
   %cmp = icmp eq i8 %a, %b
   ret i1 %cmp
 }
 
 define i32 @icmp_i16_unsigned(i16 %a, i16 %b) nounwind {
 entry:
-; CHECK: icmp_i16_unsigned
-; CHECK: uxth w0, w0
-; CHECK: uxth w1, w1
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, lo
+; CHECK-LABEL: icmp_i16_unsigned
+; CHECK:       uxth w0, w0
+; CHECK-NEXT:  cmp w0, w1, uxth
+; CHECK-NEXT:  cset w0, lo
   %cmp = icmp ult i16 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -166,24 +183,34 @@
 
 define i32 @icmp_i8_signed(i8 %a, i8 %b) nounwind {
 entry:
-; CHECK: @icmp_i8_signed
-; CHECK: sxtb w0, w0
-; CHECK: sxtb w1, w1
-; CHECK: cmp  w0, w1
-; CHECK: cset w0, gt
+; CHECK-LABEL: icmp_i8_signed
+; CHECK:       sxtb w0, w0
+; CHECK-NEXT:  cmp w0, w1, sxtb
+; CHECK-NEXT:  cset w0, gt
   %cmp = icmp sgt i8 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
 }
 
+define i32 @icmp_i1_signed(i1 %a, i1 %b) nounwind {
+entry:
+; CHECK-LABEL: icmp_i1_signed
+; CHECK:       sbfx [[REG1:w[0-9]+]], w0, #0, #1
+; CHECK-NEXT:  sbfx [[REG2:w[0-9]+]], w1, #0, #1
+; CHECK-NEXT:  cmp  [[REG1]], [[REG2]]
+; CHECK-NEXT:  cset w0, gt
+  %cmp = icmp sgt i1 %a, %b
+  %conv2 = zext i1 %cmp to i32
+  ret i32 %conv2
+}
 
 define i32 @icmp_i16_signed_const(i16 %a) nounwind {
 entry:
-; CHECK: icmp_i16_signed_const
-; CHECK: sxth w0, w0
-; CHECK: cmn  w0, #233
-; CHECK: cset w0, lt
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: icmp_i16_signed_const
+; CHECK:       sxth w0, w0
+; CHECK-NEXT:  cmn w0, #233
+; CHECK-NEXT:  cset w0, lt
+; CHECK-NEXT:  and w0, w0, #0x1
   %cmp = icmp slt i16 %a, -233
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -191,11 +218,11 @@
 
 define i32 @icmp_i8_signed_const(i8 %a) nounwind {
 entry:
-; CHECK: icmp_i8_signed_const
-; CHECK: sxtb w0, w0
-; CHECK: cmp  w0, #124
-; CHECK: cset w0, gt
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: icmp_i8_signed_const
+; CHECK:       sxtb w0, w0
+; CHECK-NEXT:  cmp w0, #124
+; CHECK-NEXT:  cset w0, gt
+; CHECK-NEXT:  and w0, w0, #0x1
   %cmp = icmp sgt i8 %a, 124
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -203,11 +230,11 @@
 
 define i32 @icmp_i1_unsigned_const(i1 %a) nounwind {
 entry:
-; CHECK: icmp_i1_unsigned_const
-; CHECK: and w0, w0, #0x1
-; CHECK: cmp  w0, #0
-; CHECK: cset w0, lo
-; CHECK: and w0, w0, #0x1
+; CHECK-LABEL: icmp_i1_unsigned_const
+; CHECK:       and w0, w0, #0x1
+; CHECK-NEXT:  cmp w0, #0
+; CHECK-NEXT:  cset w0, lo
+; CHECK-NEXT:  and w0, w0, #0x1
   %cmp = icmp ult i1 %a, 0
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
index 70335ac..a5f4524 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll

@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 @fn.table = internal global [2 x i8*] [i8* blockaddress(@fn, %ZERO), i8* blockaddress(@fn, %ONE)], align 8
 
 define i32 @fn(i32 %target) nounwind {
 entry:
-; CHECK: @fn
+; CHECK-LABEL: fn
   %retval = alloca i32, align 4
   %target.addr = alloca i32, align 4
   store i32 %target, i32* %target.addr, align 4
@@ -29,8 +29,8 @@
   ret i32 %2
 
 indirectgoto:                                     ; preds = %entry
-; CHECK: ldr x0, [sp]
-; CHECK: br x0
+; CHECK:      ldr [[REG:x[0-9]+]], [sp]
+; CHECK-NEXT: br [[REG]]
   %indirect.goto.dest = phi i8* [ %1, %entry ]
   indirectbr i8* %indirect.goto.dest, [label %ZERO, label %ONE]
 }

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
index 1152988..9ac3e44 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios | FileCheck %s --check-prefix=ARM64
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios < %s | FileCheck %s --check-prefix=ARM64
 
 @message = global [80 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 16
 @temp = common global [80 x i8] zeroinitializer, align 16
@@ -7,7 +7,7 @@
 ; ARM64-LABEL: t1
 ; ARM64: adrp x8, _message@PAGE
 ; ARM64: add x0, x8, _message@PAGEOFF
-; ARM64: movz w9, #0
+; ARM64: mov w9, wzr
 ; ARM64: movz x2, #0x50
 ; ARM64: uxtb w1, w9
 ; ARM64: bl _memset

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
index ffac131..1dea5d9 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll

@@ -1,27 +1,41 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 ; Materialize using fmov
-define void @float_(float* %value) {
-; CHECK: @float_
-; CHECK: fmov s0, #1.25000000
-  store float 1.250000e+00, float* %value, align 4
-  ret void
+define float @fmov_float1() {
+; CHECK-LABEL: fmov_float1
+; CHECK:       fmov s0, #1.25000000
+  ret float 1.250000e+00
 }
 
-define void @double_(double* %value) {
-; CHECK: @double_
-; CHECK: fmov d0, #1.25000000
-  store double 1.250000e+00, double* %value, align 8
-  ret void
+define float @fmov_float2() {
+; CHECK-LABEL: fmov_float2
+; CHECK:       fmov s0, wzr
+  ret float 0.0e+00
+}
+
+define double @fmov_double1() {
+; CHECK-LABEL: fmov_double1
+; CHECK:       fmov d0, #1.25000000
+  ret double 1.250000e+00
+}
+
+define double @fmov_double2() {
+; CHECK-LABEL: fmov_double2
+; CHECK:       fmov d0, xzr
+  ret double 0.0e+00
 }
 
 ; Materialize from constant pool
-define float @float_cp() {
-; CHECK: @float_cp
+define float @cp_float() {
+; CHECK-LABEL: cp_float
+; CHECK:       adrp [[REG:x[0-9]+]], {{lCPI[0-9]+_0}}@PAGE
+; CHECK-NEXT:  ldr s0, {{\[}}[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF{{\]}}
   ret float 0x400921FB60000000
 }
 
-define double @double_cp() {
-; CHECK: @double_cp
+define double @cp_double() {
+; CHECK-LABEL: cp_double
+; CHECK:       adrp [[REG:x[0-9]+]], {{lCPI[0-9]+_0}}@PAGE
+; CHECK-NEXT:  ldr d0, {{\[}}[[REG]], {{lCPI[0-9]+_0}}@PAGEOFF{{\]}}
   ret double 0x400921FB54442D18
 }

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
index 483d179..81daa7c 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll

@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-apple-ios -O0 %s -o - | FileCheck %s
+; RUN: llc -O0 -verify-machineinstrs -mtriple=aarch64-apple-ios < %s | FileCheck %s
 
 ; Fast-isel can't do vector conversions yet, but it was emitting some highly
 ; suspect UCVTFUWDri MachineInstrs.

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
index d5bdbaa..26f9afa 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll

@@ -1,7 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 ; RUN: llc %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -print-machineinstrs=expand-isel-pseudos -o /dev/null 2> %t
 ; RUN: FileCheck %s < %t --check-prefix=CHECK-SSA
-; REQUIRES: asserts
 
 ; CHECK-SSA-LABEL: Machine code for function t1
 

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
index d91fd28..f84c755 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 ;; Test returns.
 define void @t0() nounwind ssp {

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-select.ll b/test/CodeGen/AArch64/arm64-fast-isel-select.ll
deleted file mode 100644
index 1cc207f..0000000
--- a/test/CodeGen/AArch64/arm64-fast-isel-select.ll
+++ /dev/null

@@ -1,63 +0,0 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
-
-define i32 @t1(i32 %c) nounwind readnone {
-entry:
-; CHECK: @t1
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
-  %0 = icmp sgt i32 %c, 1
-  %1 = select i1 %0, i32 123, i32 357
-  ret i32 %1
-}
-
-define i64 @t2(i32 %c) nounwind readnone {
-entry:
-; CHECK: @t2
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
-  %0 = icmp sgt i32 %c, 1
-  %1 = select i1 %0, i64 123, i64 357
-  ret i64 %1
-}
-
-define i32 @t3(i1 %c, i32 %a, i32 %b) nounwind readnone {
-entry:
-; CHECK: @t3
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
-  %0 = select i1 %c, i32 %a, i32 %b
-  ret i32 %0
-}
-
-define i64 @t4(i1 %c, i64 %a, i64 %b) nounwind readnone {
-entry:
-; CHECK: @t4
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
-  %0 = select i1 %c, i64 %a, i64 %b
-  ret i64 %0
-}
-
-define float @t5(i1 %c, float %a, float %b) nounwind readnone {
-entry:
-; CHECK: @t5
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: fcsel s0, s0, s1, ne
-  %0 = select i1 %c, float %a, float %b
-  ret float %0
-}
-
-define double @t6(i1 %c, double %a, double %b) nounwind readnone {
-entry:
-; CHECK: @t6
-; CHECK: and w0, w0, #0x1
-; CHECK: subs w0, w0, #0
-; CHECK: fcsel d0, d0, d1, ne
-  %0 = select i1 %c, double %a, double %b
-  ret double %0
-}

diff --git a/test/CodeGen/AArch64/arm64-fast-isel-store.ll b/test/CodeGen/AArch64/arm64-fast-isel-store.ll
new file mode 100644
index 0000000..9494d55
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-store.ll

@@ -0,0 +1,30 @@
+; RUN: llc -mtriple=aarch64-unknown-unknown                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-unknown -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define void @store_i8(i8* %a) {
+; CHECK-LABEL: store_i8
+; CHECK: strb  wzr, [x0]
+  store i8 0, i8* %a
+  ret void
+}
+
+define void @store_i16(i16* %a) {
+; CHECK-LABEL: store_i16
+; CHECK: strh  wzr, [x0]
+  store i16 0, i16* %a
+  ret void
+}
+
+define void @store_i32(i32* %a) {
+; CHECK-LABEL: store_i32
+; CHECK: str  wzr, [x0]
+  store i32 0, i32* %a
+  ret void
+}
+
+define void @store_i64(i64* %a) {
+; CHECK-LABEL: store_i64
+; CHECK: str  xzr, [x0]
+  store i64 0, i64* %a
+  ret void
+}

diff --git a/test/CodeGen/AArch64/arm64-fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll
index 0194b3a..4349946 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin < %s | FileCheck %s
 
 define void @t0(i32 %a) nounwind {
 entry:
@@ -66,8 +66,7 @@
 define void @t4(i32 *%ptr) nounwind {
 entry:
 ; CHECK-LABEL: t4:
-; CHECK: movz w8, #0
-; CHECK: stur w8, [x0, #-4]
+; CHECK: stur wzr, [x0, #-4]
 ; CHECK: ret
   %0 = getelementptr i32 *%ptr, i32 -1
   store i32 0, i32* %0, align 4
@@ -77,8 +76,7 @@
 define void @t5(i32 *%ptr) nounwind {
 entry:
 ; CHECK-LABEL: t5:
-; CHECK: movz w8, #0
-; CHECK: stur w8, [x0, #-256]
+; CHECK: stur wzr, [x0, #-256]
 ; CHECK: ret
   %0 = getelementptr i32 *%ptr, i32 -64
   store i32 0, i32* %0, align 4

diff --git a/test/CodeGen/AArch64/arm64-frameaddr.ll b/test/CodeGen/AArch64/arm64-frameaddr.ll
deleted file mode 100644
index 469078c..0000000
--- a/test/CodeGen/AArch64/arm64-frameaddr.ll
+++ /dev/null

@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=arm64 | FileCheck %s
-
-define i8* @t() nounwind {
-entry:
-; CHECK-LABEL: t:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK: mov x29, sp
-; CHECK: mov x0, x29
-; CHECK: ldp x29, x30, [sp], #16
-; CHECK: ret
-	%0 = call i8* @llvm.frameaddress(i32 0)
-        ret i8* %0
-}
-
-declare i8* @llvm.frameaddress(i32) nounwind readnone

diff --git a/test/CodeGen/AArch64/arm64-indexed-memory.ll b/test/CodeGen/AArch64/arm64-indexed-memory.ll
index e501c6e..a8620f4 100644
--- a/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-memory.ll

@@ -349,3 +349,15 @@
   store i64 %ext, i64* %out, align 4
   ret i8* %ptr
 }
+
+; Check that no post-indexed store is formed when the stored value is also the written-back base register.
+
+define i64* @postidx_clobber(i64* %addr) nounwind noinline ssp {
+; CHECK-LABEL: postidx_clobber:
+; CHECK-NOT: str     x0, [x0], #8
+; CHECK: ret
+ %paddr = bitcast i64* %addr to i64**
+ store i64* %addr, i64** %paddr
+ %newaddr = getelementptr i64* %addr, i32 1
+ ret i64* %newaddr
+}

diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
index d76cca3..9c8bcaa 100644
--- a/test/CodeGen/AArch64/arm64-inline-asm.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll

@@ -87,13 +87,17 @@
   ret i32 %1
 }
 
-define i32 @constraint_J(i32 %i, i32 %j) nounwind {
+define i32 @constraint_J(i32 %i, i32 %j, i64 %k) nounwind {
 entry:
   ; CHECK-LABEL: constraint_J:
   %0 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -16773120) nounwind
-  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #4278194176
+  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #-16773120
   %1 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -1) nounwind
-  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #4294967295
+  ; CHECK: sub   {{w[0-9]+}}, {{w[0-9]+}}, #-1
+  %2 = tail call i64 asm sideeffect "sub ${0:x}, ${1:x}, $2", "=r,r,J"(i64 %k, i32 -1) nounwind
+  ; CHECK: sub   {{x[0-9]+}}, {{x[0-9]+}}, #-1
+  %3 = tail call i64 asm sideeffect "sub ${0:x}, ${1:x}, $2", "=r,r,J"(i64 %k, i64 -1) nounwind
+  ; CHECK: sub   {{x[0-9]+}}, {{x[0-9]+}}, #-1
   ret i32 %1
 }
 

diff --git a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
new file mode 100644
index 0000000..d39722b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll

@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone < %s | FileCheck %s
+
+; Test that scratch registers are spilled around patchpoints
+; CHECK: InlineAsm End
+; CHECK-NEXT: mov x{{[0-9]+}}, x16
+; CHECK-NEXT: mov x{{[0-9]+}}, x17
+; CHECK-NEXT: Ltmp
+; CHECK-NEXT: nop
+define void @clobberScratch(i32* %p) {
+  %v = load i32* %p
+  tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
+  store i32 %v, i32* %p
+  ret void
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+

diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
new file mode 100644
index 0000000..8f79f80
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll

@@ -0,0 +1,118 @@
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone            < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone -fast-isel < %s | FileCheck %s --check-prefix=FAST
+
+; One argument will be passed in register, the other will be pushed on the stack.
+; Return value in x0.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK:       Ltmp
+; CHECK:       str x{{.+}}, [sp]
+; CHECK-NEXT:  mov  x0, x{{.+}}
+; CHECK:       Ltmp
+; CHECK-NEXT:  movz  x16, #0xffff, lsl #32
+; CHECK-NEXT:  movk  x16, #0xdead, lsl #16
+; CHECK-NEXT:  movk  x16, #0xbeef
+; CHECK-NEXT:  blr x16
+; FAST-LABEL:  jscall_patchpoint_codegen:
+; FAST:        Ltmp
+; FAST:        str x{{.+}}, [sp]
+; FAST:        Ltmp
+; FAST-NEXT:   movz  x16, #0xffff, lsl #32
+; FAST-NEXT:   movk  x16, #0xdead, lsl #16
+; FAST-NEXT:   movk  x16, #0xbeef
+; FAST-NEXT:   blr x16
+  %resolveCall2 = inttoptr i64 281474417671919 to i8*
+  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+  %resolveCall3 = inttoptr i64 244837814038255 to i8*
+  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+  ret void
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK:       Ltmp
+; CHECK:       orr w[[REG:[0-9]+]], wzr, #0x6
+; CHECK-NEXT:  str x[[REG]], [sp, #24]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK-NEXT:  str w[[REG]], [sp, #16]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x2
+; CHECK-NEXT:  str x[[REG]], [sp]
+; CHECK:       Ltmp
+; CHECK-NEXT:  movz  x16, #0xffff, lsl #32
+; CHECK-NEXT:  movk  x16, #0xdead, lsl #16
+; CHECK-NEXT:  movk  x16, #0xbeef
+; CHECK-NEXT:  blr x16
+; FAST-LABEL:  jscall_patchpoint_codegen2:
+; FAST:        Ltmp
+; FAST:        orr [[REG1:x[0-9]+]], xzr, #0x2
+; FAST-NEXT:   orr [[REG2:w[0-9]+]], wzr, #0x4
+; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
+; FAST-NEXT:   str [[REG1]], [sp]
+; FAST-NEXT:   str [[REG2]], [sp, #16]
+; FAST-NEXT:   str [[REG3]], [sp, #24]
+; FAST:        Ltmp
+; FAST-NEXT:   movz  x16, #0xffff, lsl #32
+; FAST-NEXT:   movk  x16, #0xdead, lsl #16
+; FAST-NEXT:   movk  x16, #0xbeef
+; FAST-NEXT:   blr x16
+  %call = inttoptr i64 281474417671919 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+  ret i64 %result
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK:       Ltmp
+; CHECK:       movz  w[[REG:[0-9]+]], #0xa
+; CHECK-NEXT:  str x[[REG]], [sp, #48]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x8
+; CHECK-NEXT:  str w[[REG]], [sp, #36]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x6
+; CHECK-NEXT:  str x[[REG]], [sp, #24]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK-NEXT:  str w[[REG]], [sp, #16]
+; CHECK-NEXT:  orr w[[REG:[0-9]+]], wzr, #0x2
+; CHECK-NEXT:  str x[[REG]], [sp]
+; CHECK:       Ltmp
+; CHECK-NEXT:  movz  x16, #0xffff, lsl #32
+; CHECK-NEXT:  movk  x16, #0xdead, lsl #16
+; CHECK-NEXT:  movk  x16, #0xbeef
+; CHECK-NEXT:  blr x16
+; FAST-LABEL:  jscall_patchpoint_codegen3:
+; FAST:        Ltmp
+; FAST:        orr [[REG1:x[0-9]+]], xzr, #0x2
+; FAST-NEXT:   orr [[REG2:w[0-9]+]], wzr, #0x4
+; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
+; FAST-NEXT:   orr [[REG4:w[0-9]+]], wzr, #0x8
+; FAST-NEXT:   movz [[REG5:x[0-9]+]], #0xa
+; FAST-NEXT:   str [[REG1]], [sp]
+; FAST-NEXT:   str [[REG2]], [sp, #16]
+; FAST-NEXT:   str [[REG3]], [sp, #24]
+; FAST-NEXT:   str [[REG4]], [sp, #36]
+; FAST-NEXT:   str [[REG5]], [sp, #48]
+; FAST:        Ltmp
+; FAST-NEXT:   movz  x16, #0xffff, lsl #32
+; FAST-NEXT:   movk  x16, #0xdead, lsl #16
+; FAST-NEXT:   movk  x16, #0xbeef
+; FAST-NEXT:   blr x16
+  %call = inttoptr i64 281474417671919 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+  ret i64 %result
+}
+
+; CHECK-LABEL: test_i16:
+; CHECK: ldrh [[BREG:w[0-9]+]], [sp]
+; CHECK: add {{w[0-9]+}}, w0, [[BREG]]
+define webkit_jscc zeroext i16 @test_i16(i16 zeroext %a, i16 zeroext %b) {
+  %sum = add i16 %a, %b
+  ret i16 %sum
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+

diff --git a/test/CodeGen/AArch64/arm64-patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll
index 039cdfc..278cba5 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone                             < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone -fast-isel -fast-isel-abort < %s | FileCheck %s
 
 ; Trivial patchpoint codegen
 ;
@@ -41,73 +42,6 @@
   ret void
 }
 
-; Test the webkit_jscc calling convention.
-; One argument will be passed in register, the other will be pushed on the stack.
-; Return value in x0.
-define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen:
-; CHECK:      Ltmp
-; CHECK:      str x{{.+}}, [sp]
-; CHECK-NEXT: mov  x0, x{{.+}}
-; CHECK:      Ltmp
-; CHECK-NEXT: movz  x16, #0xffff, lsl #32
-; CHECK-NEXT: movk  x16, #0xdead, lsl #16
-; CHECK-NEXT: movk  x16, #0xbeef
-; CHECK-NEXT: blr x16
-  %resolveCall2 = inttoptr i64 281474417671919 to i8*
-  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
-  %resolveCall3 = inttoptr i64 244837814038255 to i8*
-  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
-  ret void
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen2(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen2:
-; CHECK:      Ltmp
-; CHECK:      orr w{{.+}}, wzr, #0x6
-; CHECK-NEXT: str x{{.+}}, [sp, #24]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
-; CHECK-NEXT: str w{{.+}}, [sp, #16]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
-; CHECK-NEXT: str x{{.+}}, [sp]
-; CHECK:      Ltmp
-; CHECK-NEXT: movz  x16, #0xffff, lsl #32
-; CHECK-NEXT: movk  x16, #0xdead, lsl #16
-; CHECK-NEXT: movk  x16, #0xbeef
-; CHECK-NEXT: blr x16
-  %call = inttoptr i64 281474417671919 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
-  ret i64 %result
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen3(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen3:
-; CHECK:      Ltmp
-; CHECK:      movz  w{{.+}}, #0xa
-; CHECK-NEXT: str x{{.+}}, [sp, #48]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x8
-; CHECK-NEXT: str w{{.+}}, [sp, #36]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x6
-; CHECK-NEXT: str x{{.+}}, [sp, #24]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
-; CHECK-NEXT: str w{{.+}}, [sp, #16]
-; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
-; CHECK-NEXT: str x{{.+}}, [sp]
-; CHECK:      Ltmp
-; CHECK-NEXT: movz  x16, #0xffff, lsl #32
-; CHECK-NEXT: movk  x16, #0xdead, lsl #16
-; CHECK-NEXT: movk  x16, #0xbeef
-; CHECK-NEXT: blr x16
-  %call = inttoptr i64 281474417671919 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
-  ret i64 %result
-}
-
 ; Test patchpoints reusing the same TargetConstant.
 ; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
 ; There is no way to verify this, since it depends on memory allocation.
@@ -144,28 +78,7 @@
   ret void
 }
 
-; Test that scratch registers are spilled around patchpoints
-; CHECK: InlineAsm End
-; CHECK-NEXT: mov x{{[0-9]+}}, x16
-; CHECK-NEXT: mov x{{[0-9]+}}, x17
-; CHECK-NEXT: Ltmp
-; CHECK-NEXT: nop
-define void @clobberScratch(i32* %p) {
-  %v = load i32* %p
-  tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
-  tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
-  store i32 %v, i32* %p
-  ret void
-}
-
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
 declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
 
-; CHECK-LABEL: test_i16:
-; CHECK: ldrh [[BREG:w[0-9]+]], [sp]
-; CHECK: add w0, w0, [[BREG]]
-define webkit_jscc i16 @test_i16(i16 zeroext %a, i16 zeroext %b) {
-  %sum = add i16 %a, %b
-  ret i16 %sum
-}

diff --git a/test/CodeGen/AArch64/arm64-popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll
index 2afade2..117ab3a 100644
--- a/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/test/CodeGen/AArch64/arm64-popcnt.ll

@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=aarch64 -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
 
 define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -8,6 +9,13 @@
 ; CHECK: uaddlv.8b	h0, v0
 ; CHECK: fmov w0, s0
 ; CHECK: ret
+; CHECK-NONEON-LABEL: cnt32_advsimd
+; CHECK-NONEON-NOT: 8b
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0x55555555
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0x33333333
+; CHECK-NONEON: and w{{[0-9]+}}, w{{[0-9]+}}, #0xf0f0f0f
+; CHECK-NONEON: mul
+
 }
 
 define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
@@ -18,6 +26,12 @@
 ; CHECK: uaddlv.8b	h0, v0
 ; CHECK: fmov	w0, s0
 ; CHECK: ret
+; CHECK-NONEON-LABEL: cnt64_advsimd
+; CHECK-NONEON-NOT: 8b
+; CHECK-NONEON: and x{{[0-9]+}}, x{{[0-9]+}}, #0x5555555555555555
+; CHECK-NONEON: and x{{[0-9]+}}, x{{[0-9]+}}, #0x3333333333333333
+; CHECK-NONEON: and x{{[0-9]+}}, x{{[0-9]+}}, #0xf0f0f0f0f0f0f0f
+; CHECK-NONEON: mul
 }
 
 ; Do not use AdvSIMD when -mno-implicit-float is specified.

diff --git a/test/CodeGen/AArch64/arm64-prefetch.ll b/test/CodeGen/AArch64/arm64-prefetch.ll
index b2e06ed..9dc6301 100644
--- a/test/CodeGen/AArch64/arm64-prefetch.ll
+++ b/test/CodeGen/AArch64/arm64-prefetch.ll

@@ -17,6 +17,15 @@
   ; CHECK: prfum pldl1keep
   call void @llvm.prefetch(i8* %tmp, i32 0, i32 3, i32 1)
 
+  ; CHECK: prfum plil1strm
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 0, i32 0)
+  ; CHECK: prfum plil3keep
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 1, i32 0)
+  ; CHECK: prfum plil2keep
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 2, i32 0)
+  ; CHECK: prfum plil1keep
+  call void @llvm.prefetch(i8* %tmp, i32 0, i32 3, i32 0)
+
   ; CHECK: prfum pstl1strm
   call void @llvm.prefetch(i8* %tmp, i32 1, i32 0, i32 1)
   ; CHECK: prfum pstl3keep
@@ -57,26 +66,52 @@
   %arrayidx12 = getelementptr inbounds i32* %tmp10, i64 %idxprom
   %tmp11 = bitcast i32* %arrayidx12 to i8*
 
-  ; CHECK: prfm pstl1strm
-  call void @llvm.prefetch(i8* %tmp11, i32 1, i32 0, i32 1)
+
+  ; CHECK: prfm plil1strm
+  call void @llvm.prefetch(i8* %tmp11, i32 0, i32 0, i32 0)
   %tmp12 = load i32** @a, align 8, !tbaa !3
   %arrayidx15 = getelementptr inbounds i32* %tmp12, i64 %idxprom
-  %tmp13 = bitcast i32* %arrayidx15 to i8*
+  %tmp13 = bitcast i32* %arrayidx3 to i8*
 
-  ; CHECK: prfm pstl3keep
-  call void @llvm.prefetch(i8* %tmp13, i32 1, i32 1, i32 1)
+  ; CHECK: prfm plil3keep
+  call void @llvm.prefetch(i8* %tmp13, i32 0, i32 1, i32 0)
   %tmp14 = load i32** @a, align 8, !tbaa !3
   %arrayidx18 = getelementptr inbounds i32* %tmp14, i64 %idxprom
-  %tmp15 = bitcast i32* %arrayidx18 to i8*
+  %tmp15 = bitcast i32* %arrayidx6 to i8*
 
-  ; CHECK: prfm pstl2keep
-  call void @llvm.prefetch(i8* %tmp15, i32 1, i32 2, i32 1)
+  ; CHECK: prfm plil2keep
+  call void @llvm.prefetch(i8* %tmp15, i32 0, i32 2, i32 0)
   %tmp16 = load i32** @a, align 8, !tbaa !3
   %arrayidx21 = getelementptr inbounds i32* %tmp16, i64 %idxprom
-  %tmp17 = bitcast i32* %arrayidx21 to i8*
+  %tmp17 = bitcast i32* %arrayidx9 to i8*
+
+  ; CHECK: prfm plil1keep
+  call void @llvm.prefetch(i8* %tmp17, i32 0, i32 3, i32 0)
+  %tmp18 = load i32** @a, align 8, !tbaa !3
+  %arrayidx24 = getelementptr inbounds i32* %tmp18, i64 %idxprom
+  %tmp19 = bitcast i32* %arrayidx12 to i8*
+
+
+  ; CHECK: prfm pstl1strm
+  call void @llvm.prefetch(i8* %tmp19, i32 1, i32 0, i32 1)
+  %tmp20 = load i32** @a, align 8, !tbaa !3
+  %arrayidx27 = getelementptr inbounds i32* %tmp20, i64 %idxprom
+  %tmp21 = bitcast i32* %arrayidx15 to i8*
+
+  ; CHECK: prfm pstl3keep
+  call void @llvm.prefetch(i8* %tmp21, i32 1, i32 1, i32 1)
+  %tmp22 = load i32** @a, align 8, !tbaa !3
+  %arrayidx30 = getelementptr inbounds i32* %tmp22, i64 %idxprom
+  %tmp23 = bitcast i32* %arrayidx18 to i8*
+
+  ; CHECK: prfm pstl2keep
+  call void @llvm.prefetch(i8* %tmp23, i32 1, i32 2, i32 1)
+  %tmp24 = load i32** @a, align 8, !tbaa !3
+  %arrayidx33 = getelementptr inbounds i32* %tmp24, i64 %idxprom
+  %tmp25 = bitcast i32* %arrayidx21 to i8*
 
   ; CHECK: prfm pstl1keep
-  call void @llvm.prefetch(i8* %tmp17, i32 1, i32 3, i32 1)
+  call void @llvm.prefetch(i8* %tmp25, i32 1, i32 3, i32 1)
   ret void
 }
 

diff --git a/test/CodeGen/AArch64/arm64-scvt.ll b/test/CodeGen/AArch64/arm64-scvt.ll
index 2e006cf..8baaf22 100644
--- a/test/CodeGen/AArch64/arm64-scvt.ll
+++ b/test/CodeGen/AArch64/arm64-scvt.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -mcpu=cortex-a57 | FileCheck --check-prefix=CHECK-A57 %s
 ; rdar://13082402
 
 define float @t1(i32* nocapture %src) nounwind ssp {
@@ -409,6 +410,10 @@
 ; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
 ; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct1:
+; CHECK-A57: ldrsb w[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-A57-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul s0, [[REG]], [[REG]]
 entry:
   %addr = getelementptr i8* %sp0, i64 1
   %pix_sp0.0.copyload = load i8* %addr, align 1
@@ -466,6 +471,10 @@
 ; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
 ; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct5:
+; CHECK-A57: ldrsb w[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-A57-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul s0, [[REG]], [[REG]]
 entry:
   %addr = getelementptr i8* %sp0, i64 %offset
   %pix_sp0.0.copyload = load i8* %addr, align 1
@@ -536,6 +545,10 @@
 ; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
 ; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct10:
+; CHECK-A57: ldrsh w[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-A57-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul d0, [[REG]], [[REG]]
 entry:
   %addr = getelementptr i16* %sp0, i64 1
   %pix_sp0.0.copyload = load i16* %addr, align 1
@@ -592,6 +605,10 @@
 ; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
 ; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct14:
+; CHECK-A57: ldrsh w[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-A57-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul d0, [[REG]], [[REG]]
 entry:
   %addr = getelementptr i16* %sp0, i64 %offset
   %pix_sp0.0.copyload = load i16* %addr, align 1
@@ -636,6 +653,10 @@
 ; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
 ; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct17:
+; CHECK-A57: ldursb w[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-A57-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul s0, [[REG]], [[REG]]
   %bitcast = ptrtoint i8* %sp0 to i64
   %add = add i64 %bitcast, -1
   %addr = inttoptr i64 %add to i8*
@@ -713,6 +734,10 @@
 ; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
 ; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
 ; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+; CHECK-A57-LABEL: sfct22:
+; CHECK-A57: ldursh w[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-A57-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-A57-NEXT: fmul d0, [[REG]], [[REG]]
   %bitcast = ptrtoint i16* %sp0 to i64
   %add = add i64 %bitcast, 1
   %addr = inttoptr i64 %add to i16*

diff --git a/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll b/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
new file mode 100644
index 0000000..67283b6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll

@@ -0,0 +1,47 @@
+; RUN: llc < %s -asm-verbose=false -mtriple=arm64-apple-ios | FileCheck %s
+
+define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: foo:
+; CHECK-NEXT: fcmeq.4s  v0, v0, v1
+; CHECK-NEXT: fmov.4s v1, #1.00000000
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: ret
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %result = sitofp <4 x i32> %ext to <4 x float>
+  ret <4 x float> %result
+}
+; Make sure the operation doesn't try to get folded when the sizes don't match,
+; as that ends up crashing later when trying to form a bitcast operation for
+; the folded nodes.
+define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: movi.4s
+; CHECK: scvtf.2d
+; CHECK: scvtf.2d
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %result = sitofp <4 x i32> %ext to <4 x double>
+  store <4 x double> %result, <4 x double>* %p
+  ret void
+}
+
+; Fold explicit AND operations when the constant isn't a splat of a single
+; scalar value like what the zext creates.
+define <4 x float> @foo2(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: lCPI2_0:
+; CHECK-NEXT: .long 1065353216
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 1065353216
+; CHECK-NEXT: .long 0
+; CHECK-LABEL: foo2:
+; CHECK: adrp  x8, lCPI2_0@PAGE
+; CHECK: ldr q2, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT:  fcmeq.4s  v0, v0, v1
+; CHECK-NEXT:  and.16b v0, v0, v2
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258>
+  %result = sitofp <4 x i32> %and to <4 x float>
+  ret <4 x float> %result
+}

diff --git a/test/CodeGen/AArch64/arm64-shifted-sext.ll b/test/CodeGen/AArch64/arm64-shifted-sext.ll
index b7b4e5d..71f15b1 100644
--- a/test/CodeGen/AArch64/arm64-shifted-sext.ll
+++ b/test/CodeGen/AArch64/arm64-shifted-sext.ll

@@ -166,8 +166,8 @@
 define i32 @extendedLeftShiftshortTointBy16(i16 signext %a) nounwind readnone ssp {
 entry:
 ; CHECK-LABEL: extendedLeftShiftshortTointBy16:
-; CHECK: add [[REG:w[0-9]+]], w0, #1
-; CHECK: lsl w0, [[REG]], #16
+; CHECK: lsl [[REG:w[0-9]+]], w0, #16
+; CHECK: add w0, [[REG]], #16, lsl #12
   %inc = add i16 %a, 1
   %conv2 = zext i16 %inc to i32
   %shl = shl nuw i32 %conv2, 16

diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll
index 2c7c6ae..144c2fd 100644
--- a/test/CodeGen/AArch64/arm64-stackmap.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin                             < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -fast-isel -fast-isel-abort < %s | FileCheck %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
 

diff --git a/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll b/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
new file mode 100644
index 0000000..a7f5215
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll

@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -enable-aa-sched-mi | FileCheck %s
+; Check that the scheduler moves the load from a[1] past the store into a[2].
+@a = common global i32* null, align 8
+@m = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define i32 @func(i32 %i, i32 %j, i32 %k) #0 {
+entry:
+; CHECK: ldr {{w[0-9]+}}, [x[[REG:[0-9]+]], #4]
+; CHECK: str {{w[0-9]+}}, [x[[REG]], #8]
+  %0 = load i32** @a, align 8, !tbaa !1
+  %arrayidx = getelementptr inbounds i32* %0, i64 2
+  store i32 %i, i32* %arrayidx, align 4, !tbaa !5
+  %arrayidx1 = getelementptr inbounds i32* %0, i64 1
+  %1 = load i32* %arrayidx1, align 4, !tbaa !5
+  %add = add nsw i32 %k, %i
+  store i32 %add, i32* @m, align 4, !tbaa !5
+  ret i32 %1
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"any pointer", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !6, i64 0}
+!6 = metadata !{metadata !"int", metadata !3, i64 0}

diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index 5afc8d9..fae2b90 100644
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll

@@ -802,3 +802,73 @@
   %res1 = zext <2 x i32> %res to <2 x i64>
   ret <2 x i64> %res1
 }
+
+define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
+; CHECK-LABEL: abspattern1:
+; CHECK: abs.2s
+; CHECK-NEXT: ret
+        %tmp1neg = sub <2 x i32> zeroinitializer, %a
+        %b = icmp sge <2 x i32> %a, zeroinitializer
+        %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
+        ret <2 x i32> %abs
+}
+
+define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
+; CHECK-LABEL: abspattern2:
+; CHECK: abs.4h
+; CHECK-NEXT: ret
+        %tmp1neg = sub <4 x i16> zeroinitializer, %a
+        %b = icmp sgt <4 x i16> %a, zeroinitializer
+        %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
+        ret <4 x i16> %abs
+}
+
+define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
+; CHECK-LABEL: abspattern3:
+; CHECK: abs.8b
+; CHECK-NEXT: ret
+        %tmp1neg = sub <8 x i8> zeroinitializer, %a
+        %b = icmp slt <8 x i8> %a, zeroinitializer
+        %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
+        ret <8 x i8> %abs
+}
+
+define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
+; CHECK-LABEL: abspattern4:
+; CHECK: abs.4s
+; CHECK-NEXT: ret
+        %tmp1neg = sub <4 x i32> zeroinitializer, %a
+        %b = icmp sge <4 x i32> %a, zeroinitializer
+        %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
+        ret <4 x i32> %abs
+}
+
+define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
+; CHECK-LABEL: abspattern5:
+; CHECK: abs.8h
+; CHECK-NEXT: ret
+        %tmp1neg = sub <8 x i16> zeroinitializer, %a
+        %b = icmp sgt <8 x i16> %a, zeroinitializer
+        %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
+        ret <8 x i16> %abs
+}
+
+define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
+; CHECK-LABEL: abspattern6:
+; CHECK: abs.16b
+; CHECK-NEXT: ret
+        %tmp1neg = sub <16 x i8> zeroinitializer, %a
+        %b = icmp slt <16 x i8> %a, zeroinitializer
+        %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
+        ret <16 x i8> %abs
+}
+
+define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
+; CHECK-LABEL: abspattern7:
+; CHECK: abs.2d
+; CHECK-NEXT: ret
+        %tmp1neg = sub <2 x i64> zeroinitializer, %a
+        %b = icmp sle <2 x i64> %a, zeroinitializer
+        %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a
+        ret <2 x i64> %abs
+}

diff --git a/test/CodeGen/AArch64/arm64-vcvt_f.ll b/test/CodeGen/AArch64/arm64-vcvt_f.ll
index d244958..1f393c2 100644
--- a/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/test/CodeGen/AArch64/arm64-vcvt_f.ll

@@ -66,17 +66,17 @@
 ; CHECK-LABEL: to_half:
 ; CHECK: fcvt h[[HALFVAL:[0-9]+]], s0
 ; CHECK: fmov {{w[0-9]+}}, {{s[0-9]+}}
-  %res = call i16 @llvm.convert.to.fp16(float %in)
+  %res = call i16 @llvm.convert.to.fp16.f32(float %in)
   ret i16 %res
 }
 
 define float @from_half(i16 %in) {
 ; CHECK-LABEL: from_half:
-; CHECK: fmov s[[HALFVAL:[0-9]+]], {{w[0-9]+}}
-; CHECK: fcvt s0, h[[HALFVAL]]
-  %res = call float @llvm.convert.from.fp16(i16 %in)
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
+; CHECK: fcvt s0, {{h[0-9]+}}
+  %res = call float @llvm.convert.from.fp16.f32(i16 %in)
   ret float %res
 }
 
-declare float @llvm.convert.from.fp16(i16) #1
-declare i16 @llvm.convert.to.fp16(float) #1
+declare float @llvm.convert.from.fp16.f32(i16) #1
+declare i16 @llvm.convert.to.fp16.f32(float) #1

diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 650ff1e..5bee161 100644
--- a/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll

@@ -14,3 +14,14 @@
   store %T1_30 %r, %T1_30* %p1
   ret void
 }
+
+; Extend from v1i1 was crashing things (PR20791). Make sure we do something
+; sensible instead.
+define <1 x i32> @autogen_SD7918() {
+; CHECK-LABEL: autogen_SD7918
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+  %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
+  %ZE = zext <1 x i1> %I29 to <1 x i32>
+  ret <1 x i32> %ZE
+}

diff --git a/test/CodeGen/AArch64/arm64-xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll
index 0c300de..59ce684 100644
--- a/test/CodeGen/AArch64/arm64-xaluo.ll
+++ b/test/CodeGen/AArch64/arm64-xaluo.ll

@@ -1,13 +1,14 @@
-; RUN: llc < %s -march=arm64 -aarch64-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
 
 ;
 ; Get the actual value of the overflow bit.
 ;
-define i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @saddo1.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; CHECK-LABEL:  saddo.i32
-; CHECK:        adds w8, w0, w1
-; CHECK-NEXT:   cset w0, vs
+; CHECK-LABEL:  saddo1.i32
+; CHECK:        adds {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -15,11 +16,64 @@
   ret i1 %obit
 }
 
-define i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
+; Test the immediate version.
+define zeroext i1 @saddo2.i32(i32 %v1, i32* %res) {
 entry:
-; CHECK-LABEL:  saddo.i64
-; CHECK:        adds x8, x0, x1
-; CHECK-NEXT:   cset w0, vs
+; CHECK-LABEL:  saddo2.i32
+; CHECK:        adds {{w[0-9]+}}, w0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 4)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+; Test negative immediates.
+define zeroext i1 @saddo3.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL:  saddo3.i32
+; CHECK:        subs {{w[0-9]+}}, w0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 -4)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+; Test immediates that are too large to be encoded.
+define zeroext i1 @saddo4.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL:  saddo4.i32
+; CHECK:        adds {{w[0-9]+}}, w0, {{w[0-9]+}}
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 16777215)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+; Test shift folding.
+define zeroext i1 @saddo5.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL:  saddo5.i32
+; CHECK:        adds {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %lsl = shl i32 %v2, 16
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %lsl)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL:  saddo1.i64
+; CHECK:        adds {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -27,11 +81,35 @@
   ret i1 %obit
 }
 
-define i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @saddo2.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL:  saddo2.i64
+; CHECK:        adds {{x[0-9]+}}, x0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 4)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo3.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL:  saddo3.i64
+; CHECK:        subs {{x[0-9]+}}, x0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -4)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
 ; CHECK-LABEL:  uaddo.i32
-; CHECK:        adds w8, w0, w1
-; CHECK-NEXT:   cset w0, hs
+; CHECK:        adds {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, hs
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -39,11 +117,11 @@
   ret i1 %obit
 }
 
-define i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  uaddo.i64
-; CHECK:        adds x8, x0, x1
-; CHECK-NEXT:   cset w0, hs
+; CHECK:        adds {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:   cset {{w[0-9]+}}, hs
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -51,11 +129,11 @@
   ret i1 %obit
 }
 
-define i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @ssubo1.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; CHECK-LABEL:  ssubo.i32
-; CHECK:        subs w8, w0, w1
-; CHECK-NEXT:   cset w0, vs
+; CHECK-LABEL:  ssubo1.i32
+; CHECK:        subs {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -63,11 +141,23 @@
   ret i1 %obit
 }
 
-define i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @ssubo2.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL:  ssubo2.i32
+; CHECK:        adds {{w[0-9]+}}, w0, #4
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 -4)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  ssubo.i64
-; CHECK:        subs x8, x0, x1
-; CHECK-NEXT:   cset w0, vs
+; CHECK:        subs {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -75,11 +165,11 @@
   ret i1 %obit
 }
 
-define i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
 ; CHECK-LABEL:  usubo.i32
-; CHECK:        subs w8, w0, w1
-; CHECK-NEXT:   cset w0, lo
+; CHECK:        subs {{w[0-9]+}}, w0, w1
+; CHECK-NEXT:   cset {{w[0-9]+}}, lo
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -87,11 +177,11 @@
   ret i1 %obit
 }
 
-define i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  usubo.i64
-; CHECK:        subs x8, x0, x1
-; CHECK-NEXT:   cset w0, lo
+; CHECK:        subs {{x[0-9]+}}, x0, x1
+; CHECK-NEXT:   cset {{w[0-9]+}}, lo
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -99,13 +189,13 @@
   ret i1 %obit
 }
 
-define i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
 ; CHECK-LABEL:  smulo.i32
-; CHECK:        smull x8, w0, w1
-; CHECK-NEXT:   lsr x9, x8, #32
-; CHECK-NEXT:   cmp w9, w8, asr #31
-; CHECK-NEXT:   cset w0, ne
+; CHECK:        smull x[[MREG:[0-9]+]], w0, w1
+; CHECK-NEXT:   lsr x[[SREG:[0-9]+]], x[[MREG]], #32
+; CHECK-NEXT:   cmp w[[SREG]], w[[MREG]], asr #31
+; CHECK-NEXT:   cset {{w[0-9]+}}, ne
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -113,13 +203,13 @@
   ret i1 %obit
 }
 
-define i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  smulo.i64
-; CHECK:        mul x8, x0, x1
-; CHECK-NEXT:   smulh x9, x0, x1
-; CHECK-NEXT:   cmp x9, x8, asr #63
-; CHECK-NEXT:   cset w0, ne
+; CHECK:        mul [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   smulh [[HREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp [[HREG]], [[MREG]], asr #63
+; CHECK-NEXT:   cset {{w[0-9]+}}, ne
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -127,12 +217,24 @@
   ret i1 %obit
 }
 
-define i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
+define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL:  smulo2.i64
+; CHECK:        adds [[MREG:x[0-9]+]], x0, x0
+; CHECK-NEXT:   cset {{w[0-9]+}}, vs
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
 ; CHECK-LABEL:  umulo.i32
-; CHECK:        umull x8, w0, w1
-; CHECK-NEXT:   cmp xzr, x8, lsr #32
-; CHECK-NEXT:   cset w0, ne
+; CHECK:        umull [[MREG:x[0-9]+]], w0, w1
+; CHECK-NEXT:   cmp xzr, [[MREG]], lsr #32
+; CHECK-NEXT:   cset {{w[0-9]+}}, ne
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -140,13 +242,12 @@
   ret i1 %obit
 }
 
-define i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
+define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
 ; CHECK-LABEL:  umulo.i64
-; CHECK:        umulh x8, x0, x1
-; CHECK-NEXT:   cmp xzr, x8
-; CHECK-NEXT:   cset w8, ne
-; CHECK-NEXT:   mul x9, x0, x1
+; CHECK:        umulh [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp xzr, [[MREG]]
+; CHECK-NEXT:   cset {{w[0-9]+}}, ne
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -154,6 +255,18 @@
   ret i1 %obit
 }
 
+define zeroext i1 @umulo2.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL:  umulo2.i64
+; CHECK:        adds [[MREG:x[0-9]+]], x0, x0
+; CHECK-NEXT:   cset {{w[0-9]+}}, hs
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
 
 ;
 ; Check the use of the overflow bit in combination with a select instruction.
@@ -249,9 +362,9 @@
 define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  smulo.select.i32
-; CHECK:        smull    x8, w0, w1
-; CHECK-NEXT:   lsr     x9, x8, #32
-; CHECK-NEXT:   cmp     w9, w8, asr #31
+; CHECK:        smull   x[[MREG:[0-9]+]], w0, w1
+; CHECK-NEXT:   lsr     x[[SREG:[0-9]+]], x[[MREG]], #32
+; CHECK-NEXT:   cmp     w[[SREG]], w[[MREG]], asr #31
 ; CHECK-NEXT:   csel    w0, w0, w1, ne
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
@@ -262,9 +375,9 @@
 define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  smulo.select.i64
-; CHECK:        mul      x8, x0, x1
-; CHECK-NEXT:   smulh   x9, x0, x1
-; CHECK-NEXT:   cmp     x9, x8, asr #63
+; CHECK:        mul     [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   smulh   [[HREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp     [[HREG]], [[MREG]], asr #63
 ; CHECK-NEXT:   csel    x0, x0, x1, ne
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -275,8 +388,8 @@
 define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  umulo.select.i32
-; CHECK:        umull    x8, w0, w1
-; CHECK-NEXT:   cmp     xzr, x8, lsr #32
+; CHECK:        umull   [[MREG:x[0-9]+]], w0, w1
+; CHECK-NEXT:   cmp     xzr, [[MREG]], lsr #32
 ; CHECK-NEXT:   csel    w0, w0, w1, ne
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
@@ -287,8 +400,8 @@
 define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  umulo.select.i64
-; CHECK:        umulh   x8, x0, x1
-; CHECK-NEXT:   cmp     xzr, x8
+; CHECK:        umulh   [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp     xzr, [[MREG]]
 ; CHECK-NEXT:   csel    x0, x0, x1, ne
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
@@ -300,7 +413,7 @@
 ;
 ; Check the use of the overflow bit in combination with a branch instruction.
 ;
-define i1 @saddo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  saddo.br.i32
 ; CHECK:        cmn w0, w1
@@ -317,7 +430,7 @@
   ret i1 true
 }
 
-define i1 @saddo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  saddo.br.i64
 ; CHECK:        cmn x0, x1
@@ -334,7 +447,7 @@
   ret i1 true
 }
 
-define i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  uaddo.br.i32
 ; CHECK:        cmn w0, w1
@@ -351,7 +464,7 @@
   ret i1 true
 }
 
-define i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  uaddo.br.i64
 ; CHECK:        cmn x0, x1
@@ -368,7 +481,7 @@
   ret i1 true
 }
 
-define i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  ssubo.br.i32
 ; CHECK:        cmp w0, w1
@@ -385,7 +498,7 @@
   ret i1 true
 }
 
-define i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  ssubo.br.i64
 ; CHECK:        cmp x0, x1
@@ -402,7 +515,7 @@
   ret i1 true
 }
 
-define i1 @usubo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  usubo.br.i32
 ; CHECK:        cmp w0, w1
@@ -419,7 +532,7 @@
   ret i1 true
 }
 
-define i1 @usubo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  usubo.br.i64
 ; CHECK:        cmp x0, x1
@@ -436,12 +549,12 @@
   ret i1 true
 }
 
-define i1 @smulo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  smulo.br.i32
-; CHECK:        smull    x8, w0, w1
-; CHECK-NEXT:   lsr     x9, x8, #32
-; CHECK-NEXT:   cmp     w9, w8, asr #31
+; CHECK:        smull   x[[MREG:[0-9]+]], w0, w1
+; CHECK-NEXT:   lsr     x[[SREG:[0-9]+]], x[[MREG]], #32
+; CHECK-NEXT:   cmp     w[[SREG]], w[[MREG]], asr #31
 ; CHECK-NEXT:   b.eq
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -455,12 +568,12 @@
   ret i1 true
 }
 
-define i1 @smulo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  smulo.br.i64
-; CHECK:        mul      x8, x0, x1
-; CHECK-NEXT:   smulh   x9, x0, x1
-; CHECK-NEXT:   cmp     x9, x8, asr #63
+; CHECK:        mul     [[MREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   smulh   [[HREG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   cmp     [[HREG]], [[MREG]], asr #63
 ; CHECK-NEXT:   b.eq
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
@@ -474,11 +587,28 @@
   ret i1 true
 }
 
-define i1 @umulo.br.i32(i32 %v1, i32 %v2) {
+define zeroext i1 @smulo2.br.i64(i64 %v1) {
+entry:
+; CHECK-LABEL:  smulo2.br.i64
+; CHECK:        cmn  x0, x0
+; CHECK-NEXT:   b.vc
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
 entry:
 ; CHECK-LABEL:  umulo.br.i32
-; CHECK:        umull    x8, w0, w1
-; CHECK-NEXT:   cmp     xzr, x8, lsr #32
+; CHECK:        umull   [[MREG:x[0-9]+]], w0, w1
+; CHECK-NEXT:   cmp     xzr, [[MREG]], lsr #32
 ; CHECK-NEXT:   b.eq
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
@@ -492,11 +622,11 @@
   ret i1 true
 }
 
-define i1 @umulo.br.i64(i64 %v1, i64 %v2) {
+define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 entry:
 ; CHECK-LABEL:  umulo.br.i64
-; CHECK:        umulh   x8, x0, x1
-; CHECK-NEXT:   cbz
+; CHECK:        umulh   [[REG:x[0-9]+]], x0, x1
+; CHECK-NEXT:   {{cbz|cmp}}
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -509,6 +639,23 @@
   ret i1 true
 }
 
+define zeroext i1 @umulo2.br.i64(i64 %v1) {
+entry:
+; CHECK-LABEL:  umulo2.br.i64
+; CHECK:        cmn  x0, x0
+; CHECK-NEXT:   b.lo
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
 declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone

diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index 26301b9..ef209e9 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll

@@ -509,7 +509,7 @@
 ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
 ; CHECK-NOT: dmb
 
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i8 %old
 }
 
@@ -534,7 +534,7 @@
 ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
 ; CHECK-NOT: dmb
 
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i16 %old
 }
 
@@ -607,7 +607,7 @@
 ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
 ; CHECK-NOT: dmb
 
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i8 %old
 }
 
@@ -632,7 +632,7 @@
 ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
 ; CHECK-NOT: dmb
 
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD_EXT]]
    ret i16 %old
 }
 

diff --git a/test/CodeGen/AArch64/br-undef-cond.ll b/test/CodeGen/AArch64/br-undef-cond.ll
new file mode 100644
index 0000000..12d0da2
--- /dev/null
+++ b/test/CodeGen/AArch64/br-undef-cond.ll

@@ -0,0 +1,26 @@
+; RUN: llc < %s -verify-machineinstrs
+
+; Make sure we don't end up with a CBNZ of an undef v-/phys-reg.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+declare void @bar(i8*)
+
+define void @foo(i8* %m, i32 %off0) {
+.thread1653:
+  br i1 undef, label %0, label %.thread1880
+
+  %1 = icmp eq i32 undef, 0
+  %.not = xor i1 %1, true
+  %brmerge = or i1 %.not, undef
+  br i1 %brmerge, label %.thread1880, label %.thread1705
+
+.thread1705:
+  ret void
+
+.thread1880:
+  %m1652.ph = phi i8* [ %m, %0 ], [ null, %.thread1653 ]
+  call void @bar(i8* %m1652.ph)
+  ret void
+}

diff --git a/test/CodeGen/AArch64/cmp-const-max.ll b/test/CodeGen/AArch64/cmp-const-max.ll
new file mode 100644
index 0000000..0431e39
--- /dev/null
+++ b/test/CodeGen/AArch64/cmp-const-max.ll

@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -aarch64-atomic-cfg-tidy=0 < %s -mtriple=aarch64-none-eabihf -fast-isel=false | FileCheck %s
+
+
+define i32 @ule_64_max(i64 %p) {
+entry:
+; CHECK-LABEL: ule_64_max:
+; CHECK: cmn x0, #1
+; CHECK: b.hi [[RET_ZERO:.LBB[0-9]+_[0-9]+]]
+  %cmp = icmp ule i64 %p, 18446744073709551615 ; 0xffffffffffffffff
+  br i1 %cmp, label %ret_one, label %ret_zero
+
+ret_one:
+  ret i32 1
+
+ret_zero:
+; CHECK: [[RET_ZERO]]:
+; CHECK-NEXT: mov w0, wzr
+  ret i32 0
+}
+
+define i32 @ugt_64_max(i64 %p) {
+entry:
+; CHECK-LABEL: ugt_64_max:
+; CHECK: cmn x0, #1
+; CHECK: b.ls [[RET_ZERO:.LBB[0-9]+_[0-9]+]]
+  %cmp = icmp ugt i64 %p, 18446744073709551615 ; 0xffffffffffffffff
+  br i1 %cmp, label %ret_one, label %ret_zero
+
+ret_one:
+  ret i32 1
+
+ret_zero:
+; CHECK: [[RET_ZERO]]:
+; CHECK-NEXT: mov w0, wzr
+  ret i32 0
+}

diff --git a/test/CodeGen/AArch64/cmpwithshort.ll b/test/CodeGen/AArch64/cmpwithshort.ll
new file mode 100644
index 0000000..14efdcc
--- /dev/null
+++ b/test/CodeGen/AArch64/cmpwithshort.ll

@@ -0,0 +1,46 @@
+; RUN: llc -O3 -march=aarch64 < %s | FileCheck %s 
+
+define i16 @test_1cmp_signed_1(i16* %ptr1) {
+; CHECK-LABEL: test_1cmp_signed_1:
+; CHECK: ldrsh
+; CHECK-NEXT: cmn
+entry:
+  %addr = getelementptr inbounds i16* %ptr1, i16 0
+  %val = load i16* %addr, align 2
+  %cmp = icmp eq i16 %val, -1 ; i16 equality against -1; CHECKs expect ldrsh then cmn
+  br i1 %cmp, label %if, label %if.then
+if:
+  ret i16 1
+if.then:
+  ret i16 0
+}
+
+define i16 @test_1cmp_signed_2(i16* %ptr1) {
+; CHECK-LABEL: test_1cmp_signed_2:
+; CHECK: ldrsh
+; CHECK-NEXT: cmn
+entry:
+  %addr = getelementptr inbounds i16* %ptr1, i16 0
+  %val = load i16* %addr, align 2
+  %cmp = icmp sge i16 %val, -1 ; signed i16 >= -1; CHECKs expect ldrsh then cmn
+  br i1 %cmp, label %if, label %if.then
+if:
+  ret i16 1
+if.then:
+  ret i16 0
+}
+
+define i16 @test_1cmp_unsigned_1(i16* %ptr1) {
+; CHECK-LABEL: test_1cmp_unsigned_1:
+; CHECK: ldrsh
+; CHECK-NEXT: cmn
+entry:
+  %addr = getelementptr inbounds i16* %ptr1, i16 0
+  %val = load i16* %addr, align 2
+  %cmp = icmp uge i16 %val, -1 ; unsigned i16 >= -1 (all ones); CHECKs expect ldrsh then cmn
+  br i1 %cmp, label %if, label %if.then
+if:
+  ret i16 1
+if.then:
+  ret i16 0
+}

diff --git a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
new file mode 100644
index 0000000..df8dc87
--- /dev/null
+++ b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll

@@ -0,0 +1,413 @@
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+
+; marked as external to prevent possible optimizations
+@a = external global i32
+@b = external global i32
+@c = external global i32
+@d = external global i32
+
+; (a > 10 && b == c) || (a >= 10 && b == d)
+define i32 @combine_gt_ge_10() #0 {
+; CHECK-LABEL: combine_gt_ge_10
+; CHECK: cmp
+; CHECK: b.le
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.lt
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp sgt i32 %0, 10
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %land.lhs.true3
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp sgt i32 %0, 9
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false, %land.lhs.true
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a > 5 && b == c) || (a < 5 && b == d)
+define i32 @combine_gt_lt_5() #0 {
+; CHECK-LABEL: combine_gt_lt_5
+; CHECK: cmp
+; CHECK: b.le
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.ge
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp sgt i32 %0, 5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp slt i32 %0, 5
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a < 5 && b == c) || (a <= 5 && b == d)
+define i32 @combine_lt_ge_5() #0 {
+; CHECK-LABEL: combine_lt_ge_5
+; CHECK: cmp
+; CHECK: b.ge
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.gt
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp slt i32 %0, 5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %land.lhs.true3
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp slt i32 %0, 6
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false, %land.lhs.true
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a < 5 && b == c) || (a > 5 && b == d)
+define i32 @combine_lt_gt_5() #0 {
+; CHECK-LABEL: combine_lt_gt_5
+; CHECK: cmp
+; CHECK: b.ge
+; CHECK: ret
+; CHECK-NOT: cmp
+; CHECK: b.le
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp slt i32 %0, 5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp sgt i32 %0, 5
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a > -5 && b == c) || (a < -5 && b == d)
+define i32 @combine_gt_lt_n5() #0 {
+; CHECK-LABEL: combine_gt_lt_n5
+; CHECK: cmn
+; CHECK: b.le
+; CHECK: ret
+; CHECK-NOT: cmn
+; CHECK: b.ge
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp sgt i32 %0, -5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp slt i32 %0, -5
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; (a < -5 && b == c) || (a > -5 && b == d)
+define i32 @combine_lt_gt_n5() #0 {
+; CHECK-LABEL: combine_lt_gt_n5
+; CHECK: cmn
+; CHECK: b.ge
+; CHECK: ret
+; CHECK-NOT: cmn
+; CHECK: b.le
+entry:
+  %0 = load i32* @a, align 4
+  %cmp = icmp slt i32 %0, -5
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:                                    ; preds = %entry
+  %1 = load i32* @b, align 4
+  %2 = load i32* @c, align 4
+  %cmp1 = icmp eq i32 %1, %2
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:                                    ; preds = %entry
+  %cmp2 = icmp sgt i32 %0, -5
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:                                   ; preds = %lor.lhs.false
+  %3 = load i32* @b, align 4
+  %4 = load i32* @d, align 4
+  %cmp4 = icmp eq i32 %3, %4
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true3, %lor.lhs.false, %land.lhs.true
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true3, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+%struct.Struct = type { i64, i64 }
+
+@glob = internal unnamed_addr global %struct.Struct* null, align 8
+
+declare %struct.Struct* @Update(%struct.Struct*) #1
+
+; no checks for this case, it just should be processed without errors
+define void @combine_non_adjacent_cmp_br(%struct.Struct* nocapture readonly %hdCall) #0 {
+entry:
+  %size = getelementptr inbounds %struct.Struct* %hdCall, i64 0, i32 0
+  %0 = load i64* %size, align 8
+  br label %land.rhs
+
+land.rhs:
+  %rp.06 = phi i64 [ %0, %entry ], [ %sub, %while.body ]
+  %1 = load i64* inttoptr (i64 24 to i64*), align 8
+  %cmp2 = icmp sgt i64 %1, 0
+  br i1 %cmp2, label %while.body, label %while.end
+
+while.body:
+  %2 = load %struct.Struct** @glob, align 8
+  %call = tail call %struct.Struct* @Update(%struct.Struct* %2) #2
+  %sub = add nsw i64 %rp.06, -2
+  %cmp = icmp slt i64 %0, %rp.06
+  br i1 %cmp, label %land.rhs, label %while.end
+
+while.end:
+  ret void
+}
+
+; undefined external to prevent possible optimizations
+declare void @do_something() #1
+
+define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
+; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
+; CHECK: cmn
+; CHECK: b.gt
+; CHECK: cmp
+; CHECK: b.gt
+entry:
+  %0 = load i32* @a, align 4
+  %cmp4 = icmp slt i32 %0, -1
+  br i1 %cmp4, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %while.body.preheader
+  %i.05 = phi i32 [ %inc, %while.body ], [ %0, %while.body.preheader ]
+  tail call void @do_something() #2
+  %inc = add nsw i32 %i.05, 1
+  %cmp = icmp slt i32 %i.05, 0
+  br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge
+
+while.cond.while.end_crit_edge:                   ; preds = %while.body
+  %.pre = load i32* @a, align 4
+  br label %while.end
+
+while.end:                                        ; preds = %while.cond.while.end_crit_edge, %entry
+  %1 = phi i32 [ %.pre, %while.cond.while.end_crit_edge ], [ %0, %entry ]
+  %cmp1 = icmp slt i32 %1, 2
+  br i1 %cmp1, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %while.end
+  %2 = load i32* @b, align 4
+  %3 = load i32* @d, align 4
+  %cmp2 = icmp eq i32 %2, %3
+  br i1 %cmp2, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true, %while.end
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 123, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 {
+; CHECK-LABEL: do_nothing_if_compares_can_not_be_adjusted_to_each_other
+; CHECK: cmp
+; CHECK: b.gt
+; CHECK: cmn
+; CHECK: b.lt
+entry:
+  %0 = load i32* @a, align 4
+  %cmp4 = icmp slt i32 %0, 1
+  br i1 %cmp4, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %while.body.preheader
+  %i.05 = phi i32 [ %inc, %while.body ], [ %0, %while.body.preheader ]
+  tail call void @do_something() #2
+  %inc = add nsw i32 %i.05, 1
+  %cmp = icmp slt i32 %i.05, 0
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %1 = load i32* @c, align 4
+  %cmp1 = icmp sgt i32 %1, -3
+  br i1 %cmp1, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %while.end
+  %2 = load i32* @b, align 4
+  %3 = load i32* @d, align 4
+  %cmp2 = icmp eq i32 %2, %3
+  br i1 %cmp2, label %return, label %if.end
+
+if.end:                                           ; preds = %land.lhs.true, %while.end
+  br label %return
+
+return:                                           ; preds = %if.end, %land.lhs.true
+  %retval.0 = phi i32 [ 0, %if.end ], [ 123, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+; Test that in the following case we don't hit 'cmp' and trigger a false positive:
+; cmp  w19, #0
+; cinc w0, w19, gt
+; ...
+; fcmp d8, #0.0
+; b.gt .LBB0_5
+
+define i32 @fcmpri(i32 %argc, i8** nocapture readonly %argv) {
+; In if.end the integer compare only feeds a zext/add; the branch there is driven by the fcmp.
+; CHECK-LABEL: fcmpri:
+; CHECK: cmp w0, #2
+; CHECK: b.lt .LBB9_3
+; CHECK-NOT: cmp w0, #1
+; CHECK-NOT: b.le .LBB9_3
+; Anchor on the block targeted by the b.lt above before checking the second compare.
+; CHECK: .LBB9_3:
+; CHECK: cmp w19, #0
+; CHECK: fcmp d8, #0.0
+; CHECK: b.gt .LBB9_5
+; CHECK-NOT: cmp w19, #1
+; CHECK-NOT: b.ge .LBB9_5
+
+entry:
+  %cmp = icmp sgt i32 %argc, 1
+  br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:                                    ; preds = %entry
+  %arrayidx = getelementptr inbounds i8** %argv, i64 1
+  %0 = load i8** %arrayidx, align 8
+  %cmp1 = icmp eq i8* %0, null
+  br i1 %cmp1, label %if.end, label %return
+
+if.end:                                           ; preds = %land.lhs.true, %entry
+  %call = call i32 @zoo(i32 1)
+  %call2 = call double @yoo(i32 -1)
+  %cmp4 = icmp sgt i32 %call, 0
+  %add = zext i1 %cmp4 to i32
+  %cond = add nsw i32 %add, %call
+  %call7 = call i32 @xoo(i32 %cond, i32 2)
+  %cmp9 = fcmp ogt double %call2, 0.000000e+00
+  br i1 %cmp9, label %cond.end14, label %cond.false12
+
+cond.false12:                                     ; preds = %if.end
+  %sub = fadd fast double %call2, -1.000000e+00
+  br label %cond.end14
+
+cond.end14:                                       ; preds = %if.end, %cond.false12
+  %cond15 = phi double [ %sub, %cond.false12 ], [ %call2, %if.end ]
+  %call16 = call i32 @woo(double %cond15, double -2.000000e+00)
+  br label %return
+
+return:                                           ; preds = %land.lhs.true, %cond.end14
+  %retval.0 = phi i32 [ 4, %cond.end14 ], [ 3, %land.lhs.true ]
+  ret i32 %retval.0
+}
+
+declare i32 @zoo(i32)
+
+declare double @yoo(i32)
+
+declare i32 @xoo(i32, i32)
+
+declare i32 @woo(double, double)

diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll
index 5f81cba..dfc83aa 100644
--- a/test/CodeGen/AArch64/cond-sel.ll
+++ b/test/CodeGen/AArch64/cond-sel.ll

@@ -214,3 +214,20 @@
   ret void
 ; CHECK: ret
 }
+
+define <1 x i1> @test_wide_comparison(i32 %in) {
+; CHECK-LABEL: test_wide_comparison:
+; CHECK: cmp w0, #1234
+; CHECK: cset
+
+  %tmp = icmp sgt i32 %in, 1234
+  %res = select i1 %tmp, <1 x i1> <i1 1>, <1 x i1> zeroinitializer
+  ret <1 x i1> %res
+}
+
+define i32 @test_select_undef() {
+; CHECK-LABEL: test_select_undef:
+; CHECK: ret
+  %res = select i1 undef, i32 0, i32 42
+  ret i32 %res
+}

diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll
new file mode 100644
index 0000000..115fc64
--- /dev/null
+++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll

@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=arm64-apple-darwin8.0 -relocation-model=pic -O1 < %s | FileCheck %s
+
+@.str2 = private unnamed_addr constant [9 x i8] c"_%d____\0A\00", align 1
+
+; Function Attrs: nounwind ssp
+define i32 @main(i32 %argc, i8** %argv) #0 {
+main_:
+  %tmp = alloca i32, align 4
+  %i32T = alloca i32, align 4
+  %i32F = alloca i32, align 4
+  %i32X = alloca i32, align 4
+  store i32 0, i32* %tmp
+  store i32 15, i32* %i32T, align 4
+  store i32 5, i32* %i32F, align 4
+  %tmp6 = load i32* %tmp, align 4
+  %tmp7 = icmp ne i32 %tmp6, 0
+  %tmp8 = xor i1 %tmp7, true
+  %tmp9 = load i32* %i32T, align 4
+  %tmp10 = load i32* %i32F, align 4
+  %DHSelect = select i1 %tmp8, i32 %tmp9, i32 %tmp10
+  store i32 %DHSelect, i32* %i32X, align 4
+  %tmp15 = load i32* %i32X, align 4
+  %tmp17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15)
+  ret i32 0
+
+; CHECK: main:
+; CHECK-DAG: movz
+; CHECK-DAG: orr
+; CHECK: csel
+}
+
+
+declare i32 @printf(i8*, ...) #1
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/AArch64/dont-take-over-the-world.ll b/test/CodeGen/AArch64/dont-take-over-the-world.ll
new file mode 100644
index 0000000..d9e13b7
--- /dev/null
+++ b/test/CodeGen/AArch64/dont-take-over-the-world.ll

@@ -0,0 +1,7 @@
+; RUN: not llc -mtriple=x86-64 2>&1 | FileCheck %s
+
+; To support "arm64" as a -march option, we need to register a second AArch64
+; target, but we have to be careful how we do that so that it doesn't become the
+; target of last resort when the specified triple is completely wrong.
+
+; CHECK: unable to get target for 'x86-64', see --version and --triple.

diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll
index ce5c0f6..f647c4b 100644
--- a/test/CodeGen/AArch64/extern-weak.ll
+++ b/test/CodeGen/AArch64/extern-weak.ll

@@ -1,17 +1,24 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=static -o - < %s | FileCheck --check-prefix=CHECK-STATIC %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s
 
 declare extern_weak i32 @var()
 
 define i32()* @foo() {
 ; The usual ADRP/ADD pair can't be used for a weak reference because it must
-; evaluate to 0 if the symbol is undefined. We use a litpool entry.
+; evaluate to 0 if the symbol is undefined. We use a GOT entry for PIC code
+; and a litpool entry otherwise.
   ret i32()* @var
 
 
 ; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:var
 ; CHECK: ldr x0, [x[[ADDRHI]], :got_lo12:var]
 
+; CHECK-STATIC: .LCPI0_0:
+; CHECK-STATIC-NEXT: .xword  var
+; CHECK-STATIC: adrp x[[VAR:[0-9]+]], .LCPI0_0
+; CHECK-STATIC: ldr x0, [x[[VAR]], :lo12:.LCPI0_0]
+
   ; In the large model, the usual relocations are absolute and can
   ; materialise 0.
 ; CHECK-LARGE: movz x0, #:abs_g3:var
@@ -31,6 +38,11 @@
 ; CHECK: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var]
 ; CHECK: add x0, [[BASE]], #20
 
+; CHECK-STATIC: .LCPI1_0:
+; CHECK-STATIC-NEXT: .xword arr_var
+; CHECK-STATIC: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, :lo12:.LCPI1_0]
+; CHECK-STATIC: add x0, [[BASE]], #20
+
   ret i32* %addr
 
   ; In the large model, the usual relocations are absolute and can
@@ -49,6 +61,9 @@
 ; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
 ; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
 
+; CHECK-STATIC: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK-STATIC: add x0, [[BASE]], :lo12:defined_weak_var
+
 ; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
 ; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
 ; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var

diff --git a/test/CodeGen/AArch64/f16-convert.ll b/test/CodeGen/AArch64/f16-convert.ll
index 6fabdc5..12412d4 100644
--- a/test/CodeGen/AArch64/f16-convert.ll
+++ b/test/CodeGen/AArch64/f16-convert.ll

@@ -7,7 +7,7 @@
 ; CHECK-NEXT: ret
 
   %tmp = load i16* %a, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -18,8 +18,7 @@
 ; CHECK-NEXT: ret
 
   %tmp = load i16* %a, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -32,7 +31,7 @@
   %idxprom = sext i32 %i to i64
   %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -45,8 +44,7 @@
   %idxprom = sext i32 %i to i64
   %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -58,7 +56,7 @@
 
   %arrayidx = getelementptr inbounds i16* %a, i64 %i
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -70,8 +68,7 @@
 
   %arrayidx = getelementptr inbounds i16* %a, i64 %i
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -83,7 +80,7 @@
 
   %arrayidx = getelementptr inbounds i16* %a, i64 10
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -95,8 +92,7 @@
 
   %arrayidx = getelementptr inbounds i16* %a, i64 10
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -108,7 +104,7 @@
 
   %arrayidx = getelementptr inbounds i16* %a, i64 -10
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
+  %tmp1 = tail call float @llvm.convert.from.fp16.f32(i16 %tmp)
   ret float %tmp1
 }
 
@@ -120,8 +116,7 @@
 
   %arrayidx = getelementptr inbounds i16* %a, i64 -10
   %tmp = load i16* %arrayidx, align 2
-  %tmp1 = tail call float @llvm.convert.from.fp16(i16 %tmp)
-  %conv = fpext float %tmp1 to double
+  %conv = tail call double @llvm.convert.from.fp16.f64(i16 %tmp)
   ret double %conv
 }
 
@@ -131,7 +126,7 @@
 ; CHECK-NEXT: str  h0, [x0]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   store i16 %tmp, i16* %a, align 2
   ret void
 }
@@ -143,7 +138,7 @@
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   store i16 %tmp, i16* %a, align 2
   ret void
 }
@@ -154,7 +149,7 @@
 ; CHECK-NEXT: str h0, [x0, w1, sxtw #1]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   %idxprom = sext i32 %i to i64
   %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
   store i16 %tmp, i16* %arrayidx, align 2
@@ -168,7 +163,7 @@
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   %idxprom = sext i32 %i to i64
   %arrayidx = getelementptr inbounds i16* %a, i64 %idxprom
   store i16 %tmp, i16* %arrayidx, align 2
@@ -181,7 +176,7 @@
 ; CHECK-NEXT: str h0, [x0, x1, lsl #1]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   %arrayidx = getelementptr inbounds i16* %a, i64 %i
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -194,7 +189,7 @@
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   %arrayidx = getelementptr inbounds i16* %a, i64 %i
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -206,7 +201,7 @@
 ; CHECK-NEXT: str h0, [x0, #20]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   %arrayidx = getelementptr inbounds i16* %a, i64 10
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -219,7 +214,7 @@
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   %arrayidx = getelementptr inbounds i16* %a, i64 10
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -231,7 +226,7 @@
 ; CHECK-NEXT: stur h0, [x0, #-20]
 ; CHECK-NEXT: ret
 
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %val)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %val)
   %arrayidx = getelementptr inbounds i16* %a, i64 -10
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
@@ -244,11 +239,13 @@
 ; CHECK-NEXT: ret
 
   %conv = fptrunc double %val to float
-  %tmp = tail call i16 @llvm.convert.to.fp16(float %conv)
+  %tmp = tail call i16 @llvm.convert.to.fp16.f32(float %conv)
   %arrayidx = getelementptr inbounds i16* %a, i64 -10
   store i16 %tmp, i16* %arrayidx, align 2
   ret void
 }
 
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone

diff --git a/test/CodeGen/AArch64/fast-isel-addressing-modes.ll b/test/CodeGen/AArch64/fast-isel-addressing-modes.ll
new file mode 100644
index 0000000..d86f00d
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-addressing-modes.ll

@@ -0,0 +1,627 @@
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
+
+; Load / Store Base Register only
+define zeroext i1 @load_breg_i1(i1* %a) {
+; CHECK-LABEL: load_breg_i1
+; CHECK:       ldrb {{w[0-9]+}}, [x0]
+  %1 = load i1* %a
+  ret i1 %1
+}
+
+define zeroext i8 @load_breg_i8(i8* %a) {
+; CHECK-LABEL: load_breg_i8
+; CHECK:       ldrb {{w[0-9]+}}, [x0]
+  %1 = load i8* %a
+  ret i8 %1
+}
+
+define zeroext i16 @load_breg_i16(i16* %a) {
+; CHECK-LABEL: load_breg_i16
+; CHECK:       ldrh {{w[0-9]+}}, [x0]
+  %1 = load i16* %a
+  ret i16 %1
+}
+
+define i32 @load_breg_i32(i32* %a) {
+; CHECK-LABEL: load_breg_i32
+; CHECK:       ldr {{w[0-9]+}}, [x0]
+  %1 = load i32* %a
+  ret i32 %1
+}
+
+define i64 @load_breg_i64(i64* %a) {
+; CHECK-LABEL: load_breg_i64
+; CHECK:       ldr {{x[0-9]+}}, [x0]
+  %1 = load i64* %a
+  ret i64 %1
+}
+
+define float @load_breg_f32(float* %a) {
+; CHECK-LABEL: load_breg_f32
+; CHECK:       ldr {{s[0-9]+}}, [x0]
+  %1 = load float* %a
+  ret float %1
+}
+
+define double @load_breg_f64(double* %a) {
+; CHECK-LABEL: load_breg_f64
+; CHECK:       ldr {{d[0-9]+}}, [x0]
+  %1 = load double* %a
+  ret double %1
+}
+
+define void @store_breg_i1(i1* %a) {
+; CHECK-LABEL: store_breg_i1
+; CHECK:       strb wzr, [x0]
+  store i1 0, i1* %a
+  ret void
+}
+
+define void @store_breg_i1_2(i1* %a) {
+; CHECK-LABEL: store_breg_i1_2
+; CHECK:       strb {{w[0-9]+}}, [x0]
+  store i1 true, i1* %a
+  ret void
+}
+
+define void @store_breg_i8(i8* %a) {
+; CHECK-LABEL: store_breg_i8
+; CHECK:       strb wzr, [x0]
+  store i8 0, i8* %a
+  ret void
+}
+
+define void @store_breg_i16(i16* %a) {
+; CHECK-LABEL: store_breg_i16
+; CHECK:       strh wzr, [x0]
+  store i16 0, i16* %a
+  ret void
+}
+
+define void @store_breg_i32(i32* %a) {
+; CHECK-LABEL: store_breg_i32
+; CHECK:       str wzr, [x0]
+  store i32 0, i32* %a
+  ret void
+}
+
+define void @store_breg_i64(i64* %a) {
+; CHECK-LABEL: store_breg_i64
+; CHECK:       str xzr, [x0]
+  store i64 0, i64* %a
+  ret void
+}
+
+define void @store_breg_f32(float* %a) {
+; CHECK-LABEL: store_breg_f32
+; CHECK:       str wzr, [x0]
+  store float 0.0, float* %a
+  ret void
+}
+
+define void @store_breg_f64(double* %a) {
+; CHECK-LABEL: store_breg_f64
+; CHECK:       str xzr, [x0]
+  store double 0.0, double* %a
+  ret void
+}
+
+; Load Immediate
+define i32 @load_immoff_1() {
+; CHECK-LABEL: load_immoff_1
+; CHECK:       orr {{w|x}}[[REG:[0-9]+]], {{wzr|xzr}}, #0x80
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}x[[REG]]{{\]}}
+  %1 = inttoptr i64 128 to i32*
+  %2 = load i32* %1
+  ret i32 %2
+}
+
+; Load / Store Base Register + Immediate Offset
+; Max supported negative offset
+define i32 @load_breg_immoff_1(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_1
+; CHECK:       ldur {{w[0-9]+}}, [x0, #-256]
+  %1 = add i64 %a, -256
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Min not-supported negative offset
+define i32 @load_breg_immoff_2(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_2
+; CHECK:       sub [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, -257
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Max supported unscaled offset
+define i32 @load_breg_immoff_3(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_3
+; CHECK:       ldur {{w[0-9]+}}, [x0, #255]
+  %1 = add i64 %a, 255
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Min un-supported unscaled offset
+define i32 @load_breg_immoff_4(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_4
+; CHECK:       add [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, 257
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Max supported scaled offset
+define i32 @load_breg_immoff_5(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_5
+; CHECK:       ldr {{w[0-9]+}}, [x0, #16380]
+  %1 = add i64 %a, 16380
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Min un-supported scaled offset
+define i32 @load_breg_immoff_6(i64 %a) {
+; SDAG-LABEL: load_breg_immoff_6
+; SDAG:       orr	w[[NUM:[0-9]+]], wzr, #0x4000
+; SDAG-NEXT:  ldr {{w[0-9]+}}, [x0, x[[NUM]]]
+; FAST-LABEL: load_breg_immoff_6
+; FAST:       add [[REG:x[0-9]+]], x0, #4, lsl #12
+; FAST-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, 16384
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Max supported negative offset
+define void @store_breg_immoff_1(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_1
+; CHECK:       stur wzr, [x0, #-256]
+  %1 = add i64 %a, -256
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Min not-supported negative offset
+define void @store_breg_immoff_2(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_2
+; CHECK:       sub [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT:  str wzr, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, -257
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Max supported unscaled offset
+define void @store_breg_immoff_3(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_3
+; CHECK:       stur wzr, [x0, #255]
+  %1 = add i64 %a, 255
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Min un-supported unscaled offset
+define void @store_breg_immoff_4(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_4
+; CHECK:       add [[REG:x[0-9]+]], x0, #257
+; CHECK-NEXT:  str wzr, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, 257
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Max supported scaled offset
+define void @store_breg_immoff_5(i64 %a) {
+; CHECK-LABEL: store_breg_immoff_5
+; CHECK:       str wzr, [x0, #16380]
+  %1 = add i64 %a, 16380
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+; Min un-supported scaled offset
+define void @store_breg_immoff_6(i64 %a) {
+; SDAG-LABEL: store_breg_immoff_6
+; SDAG:       orr	w[[NUM:[0-9]+]], wzr, #0x4000
+; SDAG-NEXT:  str wzr, [x0, x[[NUM]]]
+; FAST-LABEL: store_breg_immoff_6
+; FAST:       add [[REG:x[0-9]+]], x0, #4, lsl #12
+; FAST-NEXT:  str wzr, {{\[}}[[REG]]{{\]}}
+  %1 = add i64 %a, 16384
+  %2 = inttoptr i64 %1 to i32*
+  store i32 0, i32* %2
+  ret void
+}
+
+define i64 @load_breg_immoff_7(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_7
+; CHECK:       ldr {{x[0-9]+}}, [x0, #48]
+  %1 = add i64 %a, 48
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load i64* %2
+  ret i64 %3
+}
+
+; Flip add operands
+define i64 @load_breg_immoff_8(i64 %a) {
+; CHECK-LABEL: load_breg_immoff_8
+; CHECK:       ldr {{x[0-9]+}}, [x0, #48]
+  %1 = add i64 48, %a
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load i64* %2
+  ret i64 %3
+}
+
+; Load Base Register + Register Offset
+define i64 @load_breg_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_offreg_1
+; CHECK:       ldr {{x[0-9]+}}, [x0, x1]
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load i64* %2
+  ret i64 %3
+}
+
+; Flip add operands
+define i64 @load_breg_offreg_2(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_offreg_2
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0]
+  %1 = add i64 %b, %a
+  %2 = inttoptr i64 %1 to i64*
+  %3 = load i64* %2
+  ret i64 %3
+}
+
+; Load Base Register + Register Offset + Immediate Offset
+define i64 @load_breg_offreg_immoff_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_offreg_immoff_1
+; CHECK:       add [[REG:x[0-9]+]], x0, x1
+; CHECK-NEXT:  ldr x0, {{\[}}[[REG]], #48{{\]}}
+  %1 = add i64 %a, %b
+  %2 = add i64 %1, 48
+  %3 = inttoptr i64 %2 to i64*
+  %4 = load i64* %3
+  ret i64 %4
+}
+
+define i64 @load_breg_offreg_immoff_2(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_offreg_immoff_2
+; SDAG:       add [[REG1:x[0-9]+]], x0, x1
+; SDAG-NEXT:  orr w[[NUM:[0-9]+]], wzr, #0xf000
+; SDAG-NEXT:  ldr x0, {{\[}}[[REG1]], x[[NUM]]]
+; FAST-LABEL: load_breg_offreg_immoff_2
+; FAST:       add [[REG:x[0-9]+]], x0, #15, lsl #12
+; FAST-NEXT:  ldr x0, {{\[}}[[REG]], x1{{\]}}
+  %1 = add i64 %a, %b
+  %2 = add i64 %1, 61440
+  %3 = inttoptr i64 %2 to i64*
+  %4 = load i64* %3
+  ret i64 %4
+}
+
+; Load Scaled Register Offset
+define i32 @load_shift_offreg_1(i64 %a) {
+; CHECK-LABEL: load_shift_offreg_1
+; CHECK:       lsl [[REG:x[0-9]+]], x0, #2
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = shl i64 %a, 2
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+define i32 @load_mul_offreg_1(i64 %a) {
+; CHECK-LABEL: load_mul_offreg_1
+; CHECK:       lsl [[REG:x[0-9]+]], x0, #2
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}[[REG]]{{\]}}
+  %1 = mul i64 %a, 4
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  ret i32 %3
+}
+
+; Load Base Register + Scaled Register Offset
+define i32 @load_breg_shift_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_shift_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, x0, lsl #2]
+  %1 = shl i64 %a, 2
+  %2 = add i64 %1, %b
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  ret i32 %4
+}
+
+define i32 @load_breg_shift_offreg_2(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_shift_offreg_2
+; CHECK:       ldr {{w[0-9]+}}, [x1, x0, lsl #2]
+  %1 = shl i64 %a, 2
+  %2 = add i64 %b, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  ret i32 %4
+}
+
+define i32 @load_breg_shift_offreg_3(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_shift_offreg_3
+; SDAG:       lsl [[REG:x[0-9]+]], x0, #2
+; SDAG-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x1, lsl #2{{\]}}
+; FAST-LABEL: load_breg_shift_offreg_3
+; FAST:       lsl [[REG:x[0-9]+]], x1, #2
+; FAST-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+  %1 = shl i64 %a, 2
+  %2 = shl i64 %b, 2
+  %3 = add i64 %1, %2
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_shift_offreg_4(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_shift_offreg_4
+; SDAG:       lsl [[REG:x[0-9]+]], x1, #2
+; SDAG-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+; FAST-LABEL: load_breg_shift_offreg_4
+; FAST:       lsl [[REG:x[0-9]+]], x0, #2
+; FAST-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x1, lsl #2{{\]}}
+  %1 = shl i64 %a, 2
+  %2 = shl i64 %b, 2
+  %3 = add i64 %2, %1
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_shift_offreg_5(i64 %a, i64 %b) {
+; SDAG-LABEL: load_breg_shift_offreg_5
+; SDAG:       lsl [[REG:x[0-9]+]], x1, #3
+; SDAG-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+; FAST-LABEL: load_breg_shift_offreg_5
+; FAST:       lsl [[REG:x[0-9]+]], x1, #3
+; FAST-NEXT:  ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+  %1 = shl i64 %a, 2
+  %2 = shl i64 %b, 3
+  %3 = add i64 %1, %2
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_mul_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_mul_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, x0, lsl #2]
+  %1 = mul i64 %a, 4
+  %2 = add i64 %1, %b
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  ret i32 %4
+}
+
+define zeroext i8 @load_breg_and_offreg_1(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_1
+; CHECK:       ldrb {{w[0-9]+}}, [x1, w0, uxtw]
+  %1 = and i64 %a, 4294967295
+  %2 = add i64 %1, %b
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  ret i8 %4
+}
+
+define zeroext i16 @load_breg_and_offreg_2(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_2
+; CHECK:       ldrh {{w[0-9]+}}, [x1, w0, uxtw #1]
+  %1 = and i64 %a, 4294967295
+  %2 = shl i64 %1, 1
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i16*
+  %5 = load i16* %4
+  ret i16 %5
+}
+
+define i32 @load_breg_and_offreg_3(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_3
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+  %1 = and i64 %a, 4294967295
+  %2 = shl i64 %1, 2
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i64 @load_breg_and_offreg_4(i64 %a, i64 %b) {
+; CHECK-LABEL: load_breg_and_offreg_4
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = and i64 %a, 4294967295
+  %2 = shl i64 %1, 3
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+; Not all 'and' instructions have immediates.
+define i64 @load_breg_and_offreg_5(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: load_breg_and_offreg_5
+; CHECK:       and [[REG:x[0-9]+]], x0, x2
+; CHECK-NEXT:  ldr {{x[0-9]+}}, {{\[}}[[REG]], x1{{\]}}
+  %1 = and i64 %a, %c
+  %2 = add i64 %1, %b
+  %3 = inttoptr i64 %2 to i64*
+  %4 = load i64* %3
+  ret i64 %4
+}
+
+define i64 @load_breg_and_offreg_6(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: load_breg_and_offreg_6
+; CHECK:       and [[REG:x[0-9]+]], x0, x2
+; CHECK-NEXT:  ldr {{x[0-9]+}}, {{\[}}x1, [[REG]], lsl #3{{\]}}
+  %1 = and i64 %a, %c
+  %2 = shl i64 %1, 3
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+; Load Base Register + Scaled Register Offset + Sign/Zero extension
+define i32 @load_breg_zext_shift_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_zext_shift_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 2
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_zext_shift_offreg_2(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_zext_shift_offreg_2
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 2
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_zext_mul_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_zext_mul_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, uxtw #2]
+  %1 = zext i32 %a to i64
+  %2 = mul i64 %1, 4
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_sext_shift_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, sxtw #2]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 2
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+define i32 @load_breg_sext_shift_offreg_2(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_2
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, sxtw #2]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 2
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+; Make sure that we don't drop the first 'add' instruction.
+define i32 @load_breg_sext_shift_offreg_3(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_3
+; CHECK:       add [[REG:w[0-9]+]], w0, #4
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}x1, [[REG]], sxtw #2{{\]}}
+  %1 = add i32 %a, 4
+  %2 = sext i32 %1 to i64
+  %3 = shl i64 %2, 2
+  %4 = add i64 %b, %3
+  %5 = inttoptr i64 %4 to i32*
+  %6 = load i32* %5
+  ret i32 %6
+}
+
+
+define i32 @load_breg_sext_mul_offreg_1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_mul_offreg_1
+; CHECK:       ldr {{w[0-9]+}}, [x1, w0, sxtw #2]
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 4
+  %3 = add i64 %2, %b
+  %4 = inttoptr i64 %3 to i32*
+  %5 = load i32* %4
+  ret i32 %5
+}
+
+; Load Scaled Register Offset + Immediate Offset + Sign/Zero extension
+define i64 @load_sext_shift_offreg_imm1(i32 %a) {
+; CHECK-LABEL: load_sext_shift_offreg_imm1
+; CHECK:       sbfiz [[REG:x[0-9]+]], {{x[0-9]+}}, #3, #32
+; CHECK-NEXT:  ldr {{x[0-9]+}}, {{\[}}[[REG]], #8{{\]}}
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %2, 8
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+; Load Base Register + Scaled Register Offset + Immediate Offset + Sign/Zero extension
+define i64 @load_breg_sext_shift_offreg_imm1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_breg_sext_shift_offreg_imm1
+; CHECK:       add [[REG:x[0-9]+]], x1, w0, sxtw #3
+; CHECK-NEXT:  ldr {{x[0-9]+}}, {{\[}}[[REG]], #8{{\]}}
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = add i64 %3, 8
+  %5 = inttoptr i64 %4 to i64*
+  %6 = load i64* %5
+  ret i64 %6
+}
+
+; Test that the kill flag is not set - the machine instruction verifier checks that for us.
+define i64 @kill_reg(i64 %a) {
+  %1 = sub i64 %a, 8
+  %2 = add i64 %1, 96
+  %3 = inttoptr i64 %2 to i64*
+  %4 = load i64* %3
+  %5 = add i64 %2, %4
+  ret i64 %5
+}
+
+define void @store_fi(i64 %i) {
+; CHECK-LABEL: store_fi
+; CHECK:       mov [[REG:x[0-9]+]], sp
+; CHECK:       str {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+  %1 = alloca [8 x i32]
+  %2 = ptrtoint [8 x i32]* %1 to i64
+  %3 = mul i64 %i, 4
+  %4 = add i64 %2, %3
+  %5 = inttoptr i64 %4 to i32*
+  store i32 47, i32* %5, align 4
+  ret void
+}
+
+define i32 @load_fi(i64 %i) {
+; CHECK-LABEL: load_fi
+; CHECK:       mov [[REG:x[0-9]+]], sp
+; CHECK:       ldr {{w[0-9]+}}, {{\[}}[[REG]], x0, lsl #2{{\]}}
+  %1 = alloca [8 x i32]
+  %2 = ptrtoint [8 x i32]* %1 to i64
+  %3 = mul i64 %i, 4
+  %4 = add i64 %2, %3
+  %5 = inttoptr i64 %4 to i32*
+  %6 = load i32* %5, align 4
+  ret i32 %6
+}
+

diff --git a/test/CodeGen/AArch64/fast-isel-branch_weights.ll b/test/CodeGen/AArch64/fast-isel-branch_weights.ll
new file mode 100644
index 0000000..5b22476
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-branch_weights.ll

@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=arm64-apple-darwin -aarch64-atomic-cfg-tidy=0                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; Test if the BBs are reordered according to their branch weights.
+define i64 @branch_weights_test(i64 %a, i64 %b) {
+; CHECK-LABEL: branch_weights_test
+; CHECK-LABEL: success
+; CHECK-LABEL: fail
+  %1 = icmp ult i64 %a, %b
+  br i1 %1, label %fail, label %success, !prof !0
+
+fail:
+  ret i64 -1
+
+success:
+  ret i64 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}

diff --git a/test/CodeGen/AArch64/fast-isel-call-return.ll b/test/CodeGen/AArch64/fast-isel-call-return.ll
new file mode 100644
index 0000000..9b10969
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-call-return.ll

@@ -0,0 +1,12 @@
+; RUN: llc -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnu"
+
+define i8* @test_call_return_type(i64 %size) {
+entry:
+; CHECK: bl xmalloc
+  %0 = call noalias i8* @xmalloc(i64 undef)
+  ret i8* %0
+}
+
+declare noalias i8* @xmalloc(i64)

diff --git a/test/CodeGen/AArch64/fast-isel-cbz.ll b/test/CodeGen/AArch64/fast-isel-cbz.ll
new file mode 100644
index 0000000..6e31a04
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-cbz.ll

@@ -0,0 +1,70 @@
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
+define i32 @icmp_eq_i1(i1 %a) {
+; CHECK-LABEL: icmp_eq_i1
+; CHECK:       tbz w0, #0, {{LBB.+_2}}
+  %1 = icmp eq i1 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i8(i8 %a) {
+; CHECK-LABEL: icmp_eq_i8
+; CHECK:       uxtb [[REG:w[0-9]+]], w0
+; CHECK:       cbz [[REG]], {{LBB.+_2}}
+  %1 = icmp eq i8 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i16(i16 %a) {
+; CHECK-LABEL: icmp_eq_i16
+; CHECK:       uxth [[REG:w[0-9]+]], w0
+; CHECK:       cbz [[REG]], {{LBB.+_2}}
+  %1 = icmp eq i16 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i32(i32 %a) {
+; CHECK-LABEL: icmp_eq_i32
+; CHECK:       cbz w0, {{LBB.+_2}}
+  %1 = icmp eq i32 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i64(i64 %a) {
+; CHECK-LABEL: icmp_eq_i64
+; CHECK:       cbz x0, {{LBB.+_2}}
+  %1 = icmp eq i64 %a, 0
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq_ptr(i8* %a) {
+; CHECK-LABEL: icmp_eq_ptr
+; CHECK:       cbz x0, {{LBB.+_2}}
+  %1 = icmp eq i8* %a, null
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+

diff --git a/test/CodeGen/AArch64/fast-isel-cmp-branch.ll b/test/CodeGen/AArch64/fast-isel-cmp-branch.ll
new file mode 100644
index 0000000..3651f19
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-cmp-branch.ll

@@ -0,0 +1,293 @@
+; RUN: llc                             -aarch64-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
+define i32 @fcmp_oeq(float %x, float %y) {
+; CHECK-LABEL: fcmp_oeq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.ne {{LBB.+_2}}
+  %1 = fcmp oeq float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ogt(float %x, float %y) {
+; CHECK-LABEL: fcmp_ogt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.le {{LBB.+_2}}
+  %1 = fcmp ogt float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_oge(float %x, float %y) {
+; CHECK-LABEL: fcmp_oge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.lt {{LBB.+_2}}
+  %1 = fcmp oge float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_olt(float %x, float %y) {
+; CHECK-LABEL: fcmp_olt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.pl {{LBB.+_2}}
+  %1 = fcmp olt float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ole(float %x, float %y) {
+; CHECK-LABEL: fcmp_ole
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.hi {{LBB.+_2}}
+  %1 = fcmp ole float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_one(float %x, float %y) {
+; CHECK-LABEL: fcmp_one
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.mi
+; CHECK-NEXT:  b.gt
+  %1 = fcmp one float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ord(float %x, float %y) {
+; CHECK-LABEL: fcmp_ord
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.vs {{LBB.+_2}}
+  %1 = fcmp ord float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_uno(float %x, float %y) {
+; CHECK-LABEL: fcmp_uno
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.vs {{LBB.+_2}}
+  %1 = fcmp uno float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ueq(float %x, float %y) {
+; CHECK-LABEL: fcmp_ueq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.eq {{LBB.+_2}}
+; CHECK-NEXT:  b.vs {{LBB.+_2}}
+  %1 = fcmp ueq float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ugt(float %x, float %y) {
+; CHECK-LABEL: fcmp_ugt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.ls {{LBB.+_2}}
+  %1 = fcmp ugt float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_uge(float %x, float %y) {
+; CHECK-LABEL: fcmp_uge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.mi {{LBB.+_2}}
+  %1 = fcmp uge float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ult(float %x, float %y) {
+; CHECK-LABEL: fcmp_ult
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.ge {{LBB.+_2}}
+  %1 = fcmp ult float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_ule(float %x, float %y) {
+; CHECK-LABEL: fcmp_ule
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.gt {{LBB.+_2}}
+  %1 = fcmp ule float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @fcmp_une(float %x, float %y) {
+; CHECK-LABEL: fcmp_une
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  b.eq {{LBB.+_2}}
+  %1 = fcmp une float %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_eq(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_eq
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.ne {{LBB.+_2}}
+  %1 = icmp eq i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_ne(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ne
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.eq {{LBB.+_2}}
+  %1 = icmp ne i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_ugt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ugt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.ls {{LBB.+_2}}
+  %1 = icmp ugt i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_uge(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_uge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.lo {{LBB.+_2}}
+  %1 = icmp uge i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_ult(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ult
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.hs {{LBB.+_2}}
+  %1 = icmp ult i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_ule(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_ule
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.hi {{LBB.+_2}}
+  %1 = icmp ule i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_sgt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sgt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.le {{LBB.+_2}}
+  %1 = icmp sgt i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_sge(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.lt {{LBB.+_2}}
+  %1 = icmp sge i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_slt(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_slt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.ge {{LBB.+_2}}
+  %1 = icmp slt i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+
+define i32 @icmp_sle(i32 %x, i32 %y) {
+; CHECK-LABEL: icmp_sle
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  b.gt {{LBB.+_2}}
+  %1 = icmp sle i32 %x, %y
+  br i1 %1, label %bb1, label %bb2
+bb2:
+  ret i32 1
+bb1:
+  ret i32 0
+}
+

diff --git a/test/CodeGen/AArch64/fast-isel-folding.ll b/test/CodeGen/AArch64/fast-isel-folding.ll
new file mode 100644
index 0000000..6b524ff
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-folding.ll

@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel-abort -verify-machineinstrs < %s
+
+; Test that we don't fold the shift.
+define i64 @fold_shift_test(i64 %a, i1 %c) {
+  %1 = sub i64 %a, 8
+  %2 = ashr i64 %1, 3
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  %3 = icmp ult i64 0, %2
+  br i1 %3, label %bb2, label %bb3
+bb2:
+  ret i64 1
+bb3:
+  ret i64 2
+}
+
+; Test that we don't fold the sign-extend.
+define i64 @fold_sext_test1(i32 %a, i1 %c) {
+  %1 = sub i32 %a, 8
+  %2 = sext i32 %1 to i64
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  %3 = icmp ult i64 0, %2
+  br i1 %3, label %bb2, label %bb3
+bb2:
+  ret i64 1
+bb3:
+  ret i64 2
+}
+
+; Test that we don't fold the sign-extend.
+define i64 @fold_sext_test2(i32 %a, i1 %c) {
+  %1 = sub i32 %a, 8
+  %2 = sext i32 %1 to i64
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  %3 = shl i64 %2, 4
+  ret i64 %3
+bb2:
+  ret i64 %2
+}
+
+; Test that we clear the kill flag.
+define i32 @fold_kill_test(i32 %a) {
+  %1 = sub i32 %a, 8
+  %2 = shl i32 %1, 3
+  %3 = icmp ult i32 0, %2
+  br i1 %3, label %bb1, label %bb2
+bb1:
+  ret i32 %2
+bb2:
+  %4 = add i32 %2, 4
+  ret i32 %4
+}

diff --git a/test/CodeGen/AArch64/fast-isel-gep.ll b/test/CodeGen/AArch64/fast-isel-gep.ll
new file mode 100644
index 0000000..4dc0a05
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-gep.ll

@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+%struct.foo = type { i32, i64, float, double }
+
+define double* @test_struct(%struct.foo* %f) {
+; CHECK-LABEL: test_struct
+; CHECK:       add x0, x0, #24
+  %1 = getelementptr inbounds %struct.foo* %f, i64 0, i32 3
+  ret double* %1
+}
+
+define i32* @test_array1(i32* %a, i64 %i) {
+; CHECK-LABEL: test_array1
+; CHECK:       orr [[REG:x[0-9]+]], xzr, #0x4
+; CHECK-NEXT:  madd  x0, x1, [[REG]], x0
+  %1 = getelementptr inbounds i32* %a, i64 %i
+  ret i32* %1
+}
+
+define i32* @test_array2(i32* %a) {
+; CHECK-LABEL: test_array2
+; CHECK:       add  x0, x0, #16
+  %1 = getelementptr inbounds i32* %a, i64 4
+  ret i32* %1
+}
+
+define i32* @test_array3(i32* %a) {
+; CHECK-LABEL: test_array3
+; CHECK:       add x0, x0, #1, lsl #12
+  %1 = getelementptr inbounds i32* %a, i64 1024
+  ret i32* %1
+}
+
+define i32* @test_array4(i32* %a) {
+; CHECK-LABEL: test_array4
+; CHECK:       movz [[REG:x[0-9]+]], #0x1008
+; CHECK-NEXT:  add x0, x0, [[REG]]
+  %1 = getelementptr inbounds i32* %a, i64 1026
+  ret i32* %1
+}
+
+define i32* @test_array5(i32* %a, i32 %i) {
+; CHECK-LABEL: test_array5
+; CHECK:       sxtw [[REG1:x[0-9]+]], w1
+; CHECK-NEXT:  orr  [[REG2:x[0-9]+]], xzr, #0x4
+; CHECK-NEXT:  madd  {{x[0-9]+}}, [[REG1]], [[REG2]], x0
+  %1 = getelementptr inbounds i32* %a, i32 %i
+  ret i32* %1
+}

diff --git a/test/CodeGen/AArch64/fast-isel-int-ext.ll b/test/CodeGen/AArch64/fast-isel-int-ext.ll
new file mode 100644
index 0000000..866feba
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext.ll

@@ -0,0 +1,491 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+;
+; Test that we only use the sign/zero extend in the address calculation when
+; necessary.
+;
+; SHIFT
+;
+define i64 @load_addr_shift_zext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext1
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_zext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext2
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_zext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_sext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext1
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_sext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext2
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_sext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+;
+; MUL
+;
+define i64 @load_addr_mul_zext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext1
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = zext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_zext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext2
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = zext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_zext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = zext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_sext1(i32 %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext1
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_sext2(i32 zeroext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext2
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_sext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+
+;
+; Test folding of the sign-/zero-extend into the load instruction.
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK:       ldurb w0, [x0, #-8]
+; CHECK-NOT:   uxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK:       ldurh w0, [x0, #-8]
+; CHECK-NOT:   uxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK:       ldurb w0, [x0, #-8]
+; CHECK-NOT:   uxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK:       ldurh w0, [x0, #-8]
+; CHECK-NOT:   uxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK:       ldur w0, [x0, #-8]
+; CHECK-NOT:   uxtw
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK:       ldursb w0, [x0, #-8]
+; CHECK-NOT:   sxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK:       ldursh w0, [x0, #-8]
+; CHECK-NOT:   sxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK:       ldursb x0, [x0, #-8]
+; CHECK-NOT:   sxtb
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK:       ldursh x0, [x0, #-8]
+; CHECK-NOT:   sxth
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK:       ldursw x0, [x0, #-8]
+; CHECK-NOT:   sxtw
+  %1 = sub i64 %a, 8
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = sext i32 %3 to i64
+  ret i64 %4
+}
+
+; Register
+define i32 @load_register_zext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i32
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_register_zext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i32
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_register_zext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i64
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i64
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i32_to_i64
+; CHECK:       ldr w0, [x0, x1]
+; CHECK-NOT:   uxtw
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+define i32 @load_register_sext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i32
+; CHECK:       ldrsb w0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_register_sext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i32
+; CHECK:       ldrsh w0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_register_sext_i8_to_i64(i64 %a, i64 %b) { ; load i8 at %a + %b and sign-extend it to i64
+; CHECK-LABEL: load_register_sext_i8_to_i64
+; CHECK:       ldrsb x0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i64 ; check lines expect ldrsb writing x0 directly, with no sxtb after it
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i16_to_i64(i64 %a, i64 %b) { ; load i16 at %a + %b and sign-extend it to i64
+; CHECK-LABEL: load_register_sext_i16_to_i64
+; CHECK:       ldrsh x0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i64 ; check lines expect ldrsh writing x0 directly, with no sxth after it
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i32_to_i64(i64 %a, i64 %b) { ; load i32 at %a + %b and sign-extend it to i64
+; CHECK-LABEL: load_register_sext_i32_to_i64
+; CHECK:       ldrsw x0, [x0, x1]
+; CHECK-NOT:   sxtw
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = sext i32 %3 to i64 ; check lines expect ldrsw writing x0 directly, with no sxtw after it
+  ret i64 %4
+}
+
+; Extend
+define i32 @load_extend_zext_i8_to_i32(i64 %a, i32 %b) { ; load i8 at %a + sext(%b) and zero-extend it to i32
+; CHECK-LABEL: load_extend_zext_i8_to_i32
+; CHECK:       ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = zext i8 %4 to i32 ; check lines expect ldrb alone, with no uxtb after it
+  ret i32 %5
+}
+
+define i32 @load_extend_zext_i16_to_i32(i64 %a, i32 %b) { ; load i16 at %a + sext(%b) and zero-extend it to i32
+; CHECK-LABEL: load_extend_zext_i16_to_i32
+; CHECK:       ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = zext i16 %4 to i32 ; check lines expect ldrh alone, with no uxth after it
+  ret i32 %5
+}
+
+define i64 @load_extend_zext_i8_to_i64(i64 %a, i32 %b) { ; load i8 at %a + sext(%b) and zero-extend it to i64
+; CHECK-LABEL: load_extend_zext_i8_to_i64
+; CHECK:       ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = zext i8 %4 to i64 ; check lines expect an ldrb into w0 and no uxtb, even though the result is i64
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i16_to_i64(i64 %a, i32 %b) { ; load i16 at %a + sext(%b) and zero-extend it to i64
+; CHECK-LABEL: load_extend_zext_i16_to_i64
+; CHECK:       ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = zext i16 %4 to i64 ; check lines expect an ldrh into w0 and no uxth, even though the result is i64
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i32_to_i64(i64 %a, i32 %b) { ; load i32 at %a + sext(%b) and zero-extend it to i64
+; CHECK-LABEL: load_extend_zext_i32_to_i64
+; CHECK:       ldr w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtw
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  %5 = zext i32 %4 to i64 ; check lines expect a 32-bit ldr into w0 and no uxtw
+  ret i64 %5
+}
+
+define i32 @load_extend_sext_i8_to_i32(i64 %a, i32 %b) { ; load i8 at %a + sext(%b) and sign-extend it to i32
+; CHECK-LABEL: load_extend_sext_i8_to_i32
+; CHECK:       ldrsb w0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = sext i8 %4 to i32 ; check lines expect the sign-extending ldrsb, with no sxtb after it
+  ret i32 %5
+}
+
+define i32 @load_extend_sext_i16_to_i32(i64 %a, i32 %b) { ; load i16 at %a + sext(%b) and sign-extend it to i32
+; CHECK-LABEL: load_extend_sext_i16_to_i32
+; CHECK:       ldrsh w0, [x0, w1, sxtw]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = sext i16 %4 to i32 ; check lines expect the sign-extending ldrsh, with no sxth after it
+  ret i32 %5
+}
+
+define i64 @load_extend_sext_i8_to_i64(i64 %a, i32 %b) { ; load i8 at %a + sext(%b) and sign-extend it to i64
+; CHECK-LABEL: load_extend_sext_i8_to_i64
+; CHECK:       ldrsb x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = sext i8 %4 to i64 ; check lines expect ldrsb writing x0 directly, with no sxtb after it
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i16_to_i64(i64 %a, i32 %b) { ; load i16 at %a + sext(%b) and sign-extend it to i64
+; CHECK-LABEL: load_extend_sext_i16_to_i64
+; CHECK:       ldrsh x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = sext i16 %4 to i64 ; check lines expect ldrsh writing x0 directly, with no sxth after it
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i32_to_i64(i64 %a, i32 %b) { ; load i32 at %a + sext(%b) and sign-extend it to i64
+; CHECK-LABEL: load_extend_sext_i32_to_i64
+; CHECK:       ldrsw x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtw
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  %5 = sext i32 %4 to i64 ; check lines expect ldrsw writing x0 directly, with no sxtw after it
+  ret i64 %5
+}
+

diff --git a/test/CodeGen/AArch64/fast-isel-int-ext2.ll b/test/CodeGen/AArch64/fast-isel-int-ext2.ll
new file mode 100644
index 0000000..8df26b2
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext2.ll

@@ -0,0 +1,439 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=false -disable-cgp-branch-opts -verify-machineinstrs < %s | FileCheck %s
+
+;
+; Test folding of the sign-/zero-extend into the load instruction.
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) { ; load i8 at %a - 8; the zext to i32 sits in a separate block
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK:       ldurb w0, [x0, #-8]
+; CHECK-NOT:   uxtb
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i8 %3 to i32 ; check lines still expect no uxtb
+  ret i32 %4
+}
+
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) { ; load i16 at %a - 8; the zext to i32 sits in a separate block
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK:       ldurh w0, [x0, #-8]
+; CHECK-NOT:   uxth
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i16 %3 to i32 ; check lines still expect no uxth
+  ret i32 %4
+}
+
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) { ; load i8 at %a - 8; the zext to i64 sits in a separate block
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK:       ldurb w0, [x0, #-8]
+; CHECK-NOT:   uxtb
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i8 %3 to i64 ; check lines still expect no uxtb
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) { ; load i16 at %a - 8; the zext to i64 sits in a separate block
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK:       ldurh w0, [x0, #-8]
+; CHECK-NOT:   uxth
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i16 %3 to i64 ; check lines still expect no uxth
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) { ; load i32 at %a - 8; the zext to i64 sits in a separate block
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK:       ldur w0, [x0, #-8]
+; CHECK-NOT:   uxtw
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i32 %3 to i64 ; check lines still expect no uxtw
+  ret i64 %4
+}
+
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) { ; load i8 at %a - 8; the sext to i32 sits in a separate block
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK:       ldursb w0, [x0, #-8]
+; CHECK-NOT:   sxtb
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i8 %3 to i32 ; check lines still expect no sxtb
+  ret i32 %4
+}
+
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) { ; load i16 at %a - 8; the sext to i32 sits in a separate block
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK:       ldursh w0, [x0, #-8]
+; CHECK-NOT:   sxth
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i16 %3 to i32 ; check lines still expect no sxth
+  ret i32 %4
+}
+
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) { ; load i8 at %a - 8; the sext to i64 sits in a separate block
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK:       ldursb x0, [x0, #-8]
+; CHECK-NOT:   sxtb
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i8 %3 to i64 ; check lines still expect no sxtb
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) { ; load i16 at %a - 8; the sext to i64 sits in a separate block
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK:       ldursh x0, [x0, #-8]
+; CHECK-NOT:   sxth
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i16 %3 to i64 ; check lines still expect no sxth
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) { ; load i32 at %a - 8; the sext to i64 sits in a separate block
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK:       ldursw x0, [x0, #-8]
+; CHECK-NOT:   sxtw
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i32 %3 to i64 ; check lines still expect no sxtw
+  ret i64 %4
+}
+
+; Register
+define i32 @load_register_zext_i8_to_i32(i64 %a, i64 %b) { ; load i8 at %a + %b; the zext to i32 sits in a separate block
+; CHECK-LABEL: load_register_zext_i8_to_i32
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i8 %3 to i32 ; check lines still expect no uxtb
+  ret i32 %4
+}
+
+define i32 @load_register_zext_i16_to_i32(i64 %a, i64 %b) { ; load i16 at %a + %b; the zext to i32 sits in a separate block
+; CHECK-LABEL: load_register_zext_i16_to_i32
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i16 %3 to i32 ; check lines still expect no uxth
+  ret i32 %4
+}
+
+define i64 @load_register_zext_i8_to_i64(i64 %a, i64 %b) { ; load i8 at %a + %b; the zext to i64 sits in a separate block
+; CHECK-LABEL: load_register_zext_i8_to_i64
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i8 %3 to i64 ; check lines still expect no uxtb
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i16_to_i64(i64 %a, i64 %b) { ; load i16 at %a + %b; the zext to i64 sits in a separate block
+; CHECK-LABEL: load_register_zext_i16_to_i64
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i16 %3 to i64 ; check lines still expect no uxth
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i32_to_i64(i64 %a, i64 %b) { ; load i32 at %a + %b; the zext to i64 sits in a separate block
+; CHECK-LABEL: load_register_zext_i32_to_i64
+; CHECK:       ldr w0, [x0, x1]
+; CHECK-NOT:   uxtw
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = zext i32 %3 to i64 ; check lines still expect no uxtw
+  ret i64 %4
+}
+
+define i32 @load_register_sext_i8_to_i32(i64 %a, i64 %b) { ; load i8 at %a + %b; the sext to i32 sits in a separate block
+; CHECK-LABEL: load_register_sext_i8_to_i32
+; CHECK:       ldrsb w0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i8 %3 to i32 ; check lines still expect no sxtb
+  ret i32 %4
+}
+
+define i32 @load_register_sext_i16_to_i32(i64 %a, i64 %b) { ; load i16 at %a + %b; the sext to i32 sits in a separate block
+; CHECK-LABEL: load_register_sext_i16_to_i32
+; CHECK:       ldrsh w0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i16 %3 to i32 ; check lines still expect no sxth
+  ret i32 %4
+}
+
+define i64 @load_register_sext_i8_to_i64(i64 %a, i64 %b) { ; load i8 at %a + %b; the sext to i64 sits in a separate block
+; CHECK-LABEL: load_register_sext_i8_to_i64
+; CHECK:       ldrsb x0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i8 %3 to i64 ; check lines still expect no sxtb
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i16_to_i64(i64 %a, i64 %b) { ; load i16 at %a + %b; the sext to i64 sits in a separate block
+; CHECK-LABEL: load_register_sext_i16_to_i64
+; CHECK:       ldrsh x0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i16 %3 to i64 ; check lines still expect no sxth
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i32_to_i64(i64 %a, i64 %b) { ; load i32 at %a + %b; the sext to i64 sits in a separate block
+; CHECK-LABEL: load_register_sext_i32_to_i64
+; CHECK:       ldrsw x0, [x0, x1]
+; CHECK-NOT:   sxtw
+  %1 = add i64 %a, %b ; address = base plus 64-bit register offset
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  br label %bb2 ; places the extend in a different basic block than the load
+
+bb2:
+  %4 = sext i32 %3 to i64 ; check lines still expect no sxtw
+  ret i64 %4
+}
+
+; Extend
+define i32 @load_extend_zext_i8_to_i32(i64 %a, i32 %b) { ; load i8 at %a + sext(%b); the zext to i32 sits in a separate block
+; CHECK-LABEL: load_extend_zext_i8_to_i32
+; CHECK:       ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = zext i8 %4 to i32 ; check lines still expect no uxtb
+  ret i32 %5
+}
+
+define i32 @load_extend_zext_i16_to_i32(i64 %a, i32 %b) { ; load i16 at %a + sext(%b); the zext to i32 sits in a separate block
+; CHECK-LABEL: load_extend_zext_i16_to_i32
+; CHECK:       ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = zext i16 %4 to i32 ; check lines still expect no uxth
+  ret i32 %5
+}
+
+define i64 @load_extend_zext_i8_to_i64(i64 %a, i32 %b) { ; load i8 at %a + sext(%b); the zext to i64 sits in a separate block
+; CHECK-LABEL: load_extend_zext_i8_to_i64
+; CHECK:       ldrb w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = zext i8 %4 to i64 ; check lines still expect no uxtb
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i16_to_i64(i64 %a, i32 %b) { ; load i16 at %a + sext(%b); the zext to i64 sits in a separate block
+; CHECK-LABEL: load_extend_zext_i16_to_i64
+; CHECK:       ldrh w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = zext i16 %4 to i64 ; check lines still expect no uxth
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i32_to_i64(i64 %a, i32 %b) { ; load i32 at %a + sext(%b); the zext to i64 sits in a separate block
+; CHECK-LABEL: load_extend_zext_i32_to_i64
+; CHECK:       ldr w0, [x0, w1, sxtw]
+; CHECK-NOT:   uxtw
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = zext i32 %4 to i64 ; check lines still expect no uxtw
+  ret i64 %5
+}
+
+define i32 @load_extend_sext_i8_to_i32(i64 %a, i32 %b) { ; load i8 at %a + sext(%b); the sext to i32 sits in a separate block
+; CHECK-LABEL: load_extend_sext_i8_to_i32
+; CHECK:       ldrsb w0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = sext i8 %4 to i32 ; check lines still expect no sxtb
+  ret i32 %5
+}
+
+define i32 @load_extend_sext_i16_to_i32(i64 %a, i32 %b) { ; load i16 at %a + sext(%b); the sext to i32 sits in a separate block
+; CHECK-LABEL: load_extend_sext_i16_to_i32
+; CHECK:       ldrsh w0, [x0, w1, sxtw]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = sext i16 %4 to i32 ; check lines still expect no sxth
+  ret i32 %5
+}
+
+define i64 @load_extend_sext_i8_to_i64(i64 %a, i32 %b) { ; load i8 at %a + sext(%b); the sext to i64 sits in a separate block
+; CHECK-LABEL: load_extend_sext_i8_to_i64
+; CHECK:       ldrsb x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = sext i8 %4 to i64 ; check lines still expect no sxtb
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i16_to_i64(i64 %a, i32 %b) { ; load i16 at %a + sext(%b); the sext to i64 sits in a separate block
+; CHECK-LABEL: load_extend_sext_i16_to_i64
+; CHECK:       ldrsh x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = sext i16 %4 to i64 ; check lines still expect no sxth
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i32_to_i64(i64 %a, i32 %b) { ; load i32 at %a + sext(%b); the sext to i64 sits in a separate block
+; CHECK-LABEL: load_extend_sext_i32_to_i64
+; CHECK:       ldrsw x0, [x0, w1, sxtw]
+; CHECK-NOT:   sxtw
+  %1 = sext i32 %b to i64 ; 32-bit offset widened to 64 bits; the checks expect it as the sxtw operand of the load
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  br label %bb2 ; places the result extend in a different basic block than the load
+
+bb2:
+  %5 = sext i32 %4 to i64 ; check lines still expect no sxtw as a separate instruction
+  ret i64 %5
+}
+

diff --git a/test/CodeGen/AArch64/fast-isel-int-ext3.ll b/test/CodeGen/AArch64/fast-isel-int-ext3.ll
new file mode 100644
index 0000000..5d55a6b
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext3.ll

@@ -0,0 +1,117 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+
+;
+; Test folding of the sign-/zero-extend into the load instruction.
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) { ; load i8 at %a - 8 through an addrspace(256) pointer, zero-extended to i32
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK:       ldurb [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       uxtb w0, [[REG]]
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i8 addrspace(256)*
+  %3 = load i8 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = zext i8 %3 to i32 ; here the check lines expect an explicit uxtb after the load
+  ret i32 %4
+}
+
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) { ; load i16 at %a - 8 through an addrspace(256) pointer, zero-extended to i32
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK:       ldurh [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       uxth w0, [[REG]]
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i16 addrspace(256)*
+  %3 = load i16 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = zext i16 %3 to i32 ; here the check lines expect an explicit uxth after the load
+  ret i32 %4
+}
+
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) { ; load i8 at %a - 8 through an addrspace(256) pointer, zero-extended to i64
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK:       ldurb w[[REG:[0-9]+]], [x0, #-8]
+; CHECK:       ubfx x0, x[[REG]], #0, #8
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i8 addrspace(256)*
+  %3 = load i8 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = zext i8 %3 to i64 ; here the check lines expect an explicit ubfx (#0, #8) on the x view of the loaded register
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) { ; load i16 at %a - 8 through an addrspace(256) pointer, zero-extended to i64
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK:       ldurh w[[REG:[0-9]+]], [x0, #-8]
+; CHECK:       ubfx x0, x[[REG]], #0, #16
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i16 addrspace(256)*
+  %3 = load i16 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = zext i16 %3 to i64 ; here the check lines expect an explicit ubfx (#0, #16) on the x view of the loaded register
+  ret i64 %4
+}
+
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) { ; load i32 at %a - 8 through an addrspace(256) pointer, zero-extended to i64
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK:       ldur w[[REG:[0-9]+]], [x0, #-8]
+; CHECK:       ubfx x0, x[[REG]], #0, #32
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i32 addrspace(256)*
+  %3 = load i32 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = zext i32 %3 to i64 ; here the check lines expect an explicit ubfx (#0, #32) on the x view of the loaded register
+  ret i64 %4
+}
+
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) { ; load i8 at %a - 8 through an addrspace(256) pointer, sign-extended to i32
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK:       ldurb [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxtb w0, [[REG]]
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i8 addrspace(256)*
+  %3 = load i8 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = sext i8 %3 to i32 ; here the check lines expect a plain ldurb followed by an explicit sxtb
+  ret i32 %4
+}
+
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) { ; load i16 at %a - 8 through an addrspace(256) pointer, sign-extended to i32
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK:       ldurh [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxth w0, [[REG]]
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i16 addrspace(256)*
+  %3 = load i16 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = sext i16 %3 to i32 ; here the check lines expect a plain ldurh followed by an explicit sxth
+  ret i32 %4
+}
+
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) { ; load i8 at %a - 8 through an addrspace(256) pointer, sign-extended to i64
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK:       ldurb [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxtb x0, [[REG]]
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i8 addrspace(256)*
+  %3 = load i8 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = sext i8 %3 to i64 ; here the check lines expect a plain ldurb followed by an explicit sxtb into x0
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) { ; load i16 at %a - 8 through an addrspace(256) pointer, sign-extended to i64
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK:       ldurh [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxth x0, [[REG]]
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i16 addrspace(256)*
+  %3 = load i16 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = sext i16 %3 to i64 ; here the check lines expect a plain ldurh followed by an explicit sxth into x0
+  ret i64 %4
+}
+
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) { ; load i32 at %a - 8 through an addrspace(256) pointer, sign-extended to i64
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK:       ldur [[REG:w[0-9]+]], [x0, #-8]
+; CHECK:       sxtw x0, [[REG]]
+  %1 = sub i64 %a, 8 ; address = %a - 8, matching the #-8 offset in the check lines
+  %2 = inttoptr i64 %1 to i32 addrspace(256)*
+  %3 = load i32 addrspace(256)* %2 ; load uses a non-default address space
+  %4 = sext i32 %3 to i64 ; here the check lines expect a plain ldur followed by an explicit sxtw into x0
+  ret i64 %4
+}
+

diff --git a/test/CodeGen/AArch64/fast-isel-int-ext4.ll b/test/CodeGen/AArch64/fast-isel-int-ext4.ll
new file mode 100644
index 0000000..f25bb98
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext4.ll

@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @kill_flag(i16 signext %a) { ; sign-extends %a in two blocks of a loop and compares each result against undef
+; CHECK-LABEL: kill_flag
+entry:
+  %0 = sext i16 %a to i32 ; first extend of %a
+  br label %bb1
+
+bb1:
+  %1 = icmp slt i32 undef, %0
+  br i1 %1, label %loop, label %exit
+
+loop:
+  %2 = sext i16 %a to i32 ; same extend of %a again, in a block that can branch back to bb1
+  %3 = icmp slt i32 undef, %2
+  br i1 %3, label %bb1, label %exit
+
+exit:
+  ret i32 0 ; only the function label is checked; NOTE(review): presumably the test relies on -verify-machineinstrs in the llc line to catch a bad kill flag, confirm
+}

diff --git a/test/CodeGen/AArch64/fast-isel-intrinsic.ll b/test/CodeGen/AArch64/fast-isel-intrinsic.ll
new file mode 100644
index 0000000..fd1198a
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-intrinsic.ll

@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-apple-darwin            -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+
+define float @fabs_f32(float %a) { ; calls llvm.fabs.f32 on the argument and returns the result
+; CHECK-LABEL: fabs_f32
+; CHECK:       fabs s0, s0
+  %1 = call float @llvm.fabs.f32(float %a) ; check line expects a single fabs on s0, under both llc invocations of this file
+  ret float %1
+}
+
+define double @fabs_f64(double %a) { ; calls llvm.fabs.f64 on the argument and returns the result
+; CHECK-LABEL: fabs_f64
+; CHECK:       fabs d0, d0
+  %1 = call double @llvm.fabs.f64(double %a) ; check line expects a single fabs on d0, under both llc invocations of this file
+  ret double %1
+}
+
+declare double @llvm.fabs.f64(double)
+declare float @llvm.fabs.f32(float)

diff --git a/test/CodeGen/AArch64/fast-isel-logic-op.ll b/test/CodeGen/AArch64/fast-isel-logic-op.ll
new file mode 100644
index 0000000..2c7486e
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-logic-op.ll

@@ -0,0 +1,362 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=0                  -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=1 -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; AND
+define zeroext i1 @and_rr_i1(i1 signext %a, i1 signext %b) { ; i1 AND of two register arguments
+; CHECK-LABEL: and_rr_i1
+; CHECK:       and [[REG:w[0-9]+]], w0, w1
+  %1 = and i1 %a, %b ; check line expects one register-register and of w0 and w1
+  ret i1 %1
+}
+
+define zeroext i8 @and_rr_i8(i8 signext %a, i8 signext %b) { ; i8 AND of two register arguments, returned zeroext
+; CHECK-LABEL: and_rr_i8
+; CHECK:       and [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = and i8 %a, %b ; check lines expect the and, then a mask of its result with #0xff
+  ret i8 %1
+}
+
+define zeroext i16 @and_rr_i16(i16 signext %a, i16 signext %b) { ; i16 AND of two register arguments, returned zeroext
+; CHECK-LABEL: and_rr_i16
+; CHECK:       and [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = and i16 %a, %b ; check lines expect the and, then a mask of its result with #0xffff
+  ret i16 %1
+}
+
+define i32 @and_rr_i32(i32 %a, i32 %b) { ; i32 AND of two register arguments
+; CHECK-LABEL: and_rr_i32
+; CHECK:       and w0, w0, w1
+  %1 = and i32 %a, %b ; check line expects the result directly in w0
+  ret i32 %1
+}
+
+define i64 @and_rr_i64(i64 %a, i64 %b) { ; i64 AND of two register arguments
+; CHECK-LABEL: and_rr_i64
+; CHECK:       and x0, x0, x1
+  %1 = and i64 %a, %b ; check line expects the result directly in x0
+  ret i64 %1
+}
+
+define zeroext i1 @and_ri_i1(i1 signext %a) { ; i1 AND of a register argument with the constant 1
+; CHECK-LABEL: and_ri_i1
+; CHECK:       and {{w[0-9]+}}, w0, #0x1
+  %1 = and i1 %a, 1 ; check line expects the immediate form with #0x1
+  ret i1 %1
+}
+
+define zeroext i8 @and_ri_i8(i8 signext %a) { ; i8 AND of a register argument with the constant 15
+; CHECK-LABEL: and_ri_i8
+; CHECK:       and {{w[0-9]+}}, w0, #0xf
+  %1 = and i8 %a, 15 ; check line expects the immediate form with #0xf
+  ret i8 %1
+}
+
+define zeroext i16 @and_ri_i16(i16 signext %a) { ; i16 AND of a register argument with the constant 255
+; CHECK-LABEL: and_ri_i16
+; CHECK:       and {{w[0-9]+}}, w0, #0xff
+  %1 = and i16 %a, 255 ; check line expects the immediate form with #0xff
+  ret i16 %1
+}
+
+define i32 @and_ri_i32(i32 %a) { ; i32 AND of a register argument with the constant 255
+; CHECK-LABEL: and_ri_i32
+; CHECK:       and w0, w0, #0xff
+  %1 = and i32 %a, 255 ; check line expects the immediate form with #0xff, result in w0
+  ret i32 %1
+}
+
+define i64 @and_ri_i64(i64 %a) { ; i64 AND of a register argument with the constant 255
+; CHECK-LABEL: and_ri_i64
+; CHECK:       and x0, x0, #0xff
+  %1 = and i64 %a, 255 ; check line expects the immediate form with #0xff, result in x0
+  ret i64 %1
+}
+
+define zeroext i8 @and_rs_i8(i8 signext %a, i8 signext %b) { ; i8 AND of %a with (%b shifted left by 4), returned zeroext
+; CHECK-LABEL: and_rs_i8
+; CHECK:       and [[REG:w[0-9]+]], w0, w1, lsl #4
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xff|#0xf0}}
+  %1 = shl i8 %b, 4 ; check lines expect this shift as the lsl #4 operand of the and
+  %2 = and i8 %a, %1 ; the follow-up mask may be #0xff or #0xf0 per the check regex
+  ret i8 %2
+}
+
+define zeroext i16 @and_rs_i16(i16 signext %a, i16 signext %b) { ; i16 AND of %a with (%b shifted left by 8), returned zeroext
+; CHECK-LABEL: and_rs_i16
+; CHECK:       and [[REG:w[0-9]+]], w0, w1, lsl #8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xffff|#0xff00}}
+  %1 = shl i16 %b, 8 ; check lines expect this shift as the lsl #8 operand of the and
+  %2 = and i16 %a, %1 ; the follow-up mask may be #0xffff or #0xff00 per the check regex
+  ret i16 %2
+}
+
+define i32 @and_rs_i32(i32 %a, i32 %b) { ; i32 AND of %a with (%b shifted left by 8)
+; CHECK-LABEL: and_rs_i32
+; CHECK:       and w0, w0, w1, lsl #8
+  %1 = shl i32 %b, 8 ; check line expects this shift as the lsl #8 operand of the and
+  %2 = and i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @and_rs_i64(i64 %a, i64 %b) { ; i64 AND of %a with (%b shifted left by 8)
+; CHECK-LABEL: and_rs_i64
+; CHECK:       and x0, x0, x1, lsl #8
+  %1 = shl i64 %b, 8 ; check line expects this shift as the lsl #8 operand of the and
+  %2 = and i64 %a, %1
+  ret i64 %2
+}
+
+define i32 @and_mul_i32(i32 %a, i32 %b) { ; i32 AND of %a with (%b multiplied by 4)
+; CHECK-LABEL: and_mul_i32
+; CHECK:       and w0, w0, w1, lsl #2
+  %1 = mul i32 %b, 4 ; check line expects the multiply by 4 as the lsl #2 operand of the and
+  %2 = and i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @and_mul_i64(i64 %a, i64 %b) { ; i64 AND of %a with (%b multiplied by 4)
+; CHECK-LABEL: and_mul_i64
+; CHECK:       and x0, x0, x1, lsl #2
+  %1 = mul i64 %b, 4 ; check line expects the multiply by 4 as the lsl #2 operand of the and
+  %2 = and i64 %a, %1
+  ret i64 %2
+}
+
+; OR
+define zeroext i1 @or_rr_i1(i1 signext %a, i1 signext %b) { ; i1 OR of two register arguments
+; CHECK-LABEL: or_rr_i1
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1
+  %1 = or i1 %a, %b ; check line expects one register-register orr of w0 and w1
+  ret i1 %1
+}
+
+define zeroext i8 @or_rr_i8(i8 signext %a, i8 signext %b) { ; i8 OR of two register arguments, returned zeroext
+; CHECK-LABEL: or_rr_i8
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = or i8 %a, %b ; check lines expect the orr, then a mask of its result with #0xff
+  ret i8 %1
+}
+
+define zeroext i16 @or_rr_i16(i16 signext %a, i16 signext %b) { ; i16 OR of two register arguments, returned zeroext
+; CHECK-LABEL: or_rr_i16
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = or i16 %a, %b ; check lines expect the orr, then a mask of its result with #0xffff
+  ret i16 %1
+}
+
+define i32 @or_rr_i32(i32 %a, i32 %b) { ; i32 OR of two register arguments
+; CHECK-LABEL: or_rr_i32
+; CHECK:       orr w0, w0, w1
+  %1 = or i32 %a, %b ; check line expects the result directly in w0
+  ret i32 %1
+}
+
+define i64 @or_rr_i64(i64 %a, i64 %b) { ; i64 OR of two register arguments
+; CHECK-LABEL: or_rr_i64
+; CHECK:       orr x0, x0, x1
+  %1 = or i64 %a, %b ; check line expects the result directly in x0
+  ret i64 %1
+}
+
+define zeroext i8 @or_ri_i8(i8 %a) { ; i8 OR of a register argument with the constant 15, returned zeroext
+; CHECK-LABEL: or_ri_i8
+; CHECK:       orr [[REG:w[0-9]+]], w0, #0xf
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = or i8 %a, 15 ; check lines expect the immediate form with #0xf, then a mask with #0xff
+  ret i8 %1
+}
+
+define zeroext i16 @or_ri_i16(i16 %a) { ; i16 OR of a register argument with the constant 255, returned zeroext
+; CHECK-LABEL: or_ri_i16
+; CHECK:       orr [[REG:w[0-9]+]], w0, #0xff
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = or i16 %a, 255 ; check lines expect the immediate form with #0xff, then a mask with #0xffff
+  ret i16 %1
+}
+
+define i32 @or_ri_i32(i32 %a) { ; i32 OR of a register argument with the constant 255
+; CHECK-LABEL: or_ri_i32
+; CHECK:       orr w0, w0, #0xff
+  %1 = or i32 %a, 255 ; check line expects the immediate form with #0xff, result in w0
+  ret i32 %1
+}
+
+define i64 @or_ri_i64(i64 %a) { ; i64 OR of a register argument with the constant 255
+; CHECK-LABEL: or_ri_i64
+; CHECK:       orr x0, x0, #0xff
+  %1 = or i64 %a, 255 ; check line expects the immediate form with #0xff, result in x0
+  ret i64 %1
+}
+
+define zeroext i8 @or_rs_i8(i8 signext %a, i8 signext %b) { ; i8 OR of %a with (%b shifted left by 4), returned zeroext
+; CHECK-LABEL: or_rs_i8
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1, lsl #4
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xff|#0xf0}}
+  %1 = shl i8 %b, 4 ; check lines expect this shift as the lsl #4 operand of the orr
+  %2 = or i8 %a, %1 ; the follow-up mask may be #0xff or #0xf0 per the check regex
+  ret i8 %2
+}
+
+define zeroext i16 @or_rs_i16(i16 signext %a, i16 signext %b) { ; i16 OR of %a with (%b shifted left by 8), returned zeroext
+; CHECK-LABEL: or_rs_i16
+; CHECK:       orr [[REG:w[0-9]+]], w0, w1, lsl #8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xffff|#0xff00}}
+  %1 = shl i16 %b, 8 ; check lines expect this shift as the lsl #8 operand of the orr
+  %2 = or i16 %a, %1 ; the follow-up mask may be #0xffff or #0xff00 per the check regex
+  ret i16 %2
+}
+
+define i32 @or_rs_i32(i32 %a, i32 %b) { ; i32 OR of %a with (%b shifted left by 8)
+; CHECK-LABEL: or_rs_i32
+; CHECK:       orr w0, w0, w1, lsl #8
+  %1 = shl i32 %b, 8 ; check line expects this shift as the lsl #8 operand of the orr
+  %2 = or i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @or_rs_i64(i64 %a, i64 %b) { ; i64 OR of %a with (%b shifted left by 8)
+; CHECK-LABEL: or_rs_i64
+; CHECK:       orr x0, x0, x1, lsl #8
+  %1 = shl i64 %b, 8 ; check line expects this shift as the lsl #8 operand of the orr
+  %2 = or i64 %a, %1
+  ret i64 %2
+}
+
+define i32 @or_mul_i32(i32 %a, i32 %b) { ; i32 OR of %a with (%b multiplied by 4)
+; CHECK-LABEL: or_mul_i32
+; CHECK:       orr w0, w0, w1, lsl #2
+  %1 = mul i32 %b, 4 ; check line expects the multiply by 4 as the lsl #2 operand of the orr
+  %2 = or i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @or_mul_i64(i64 %a, i64 %b) { ; i64 OR of %a with (%b multiplied by 4)
+; CHECK-LABEL: or_mul_i64
+; CHECK:       orr x0, x0, x1, lsl #2
+  %1 = mul i64 %b, 4 ; check line expects the multiply by 4 as the lsl #2 operand of the orr
+  %2 = or i64 %a, %1
+  ret i64 %2
+}
+
+; XOR
+define zeroext i1 @xor_rr_i1(i1 signext %a, i1 signext %b) { ; i1 XOR of two register arguments
+; CHECK-LABEL: xor_rr_i1
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1
+  %1 = xor i1 %a, %b ; check line expects one register-register eor of w0 and w1
+  ret i1 %1
+}
+
+define zeroext i8 @xor_rr_i8(i8 signext %a, i8 signext %b) { ; i8 XOR of two register arguments, returned zeroext
+; CHECK-LABEL: xor_rr_i8
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = xor i8 %a, %b ; check lines expect the eor, then a mask of its result with #0xff
+  ret i8 %1
+}
+
+define zeroext i16 @xor_rr_i16(i16 signext %a, i16 signext %b) { ; i16 XOR of two register arguments, returned zeroext
+; CHECK-LABEL: xor_rr_i16
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = xor i16 %a, %b ; check lines expect the eor, then a mask of its result with #0xffff
+  ret i16 %1
+}
+
+define i32 @xor_rr_i32(i32 %a, i32 %b) { ; i32 XOR of two register arguments
+; CHECK-LABEL: xor_rr_i32
+; CHECK:       eor w0, w0, w1
+  %1 = xor i32 %a, %b ; check line expects the result directly in w0
+  ret i32 %1
+}
+
+define i64 @xor_rr_i64(i64 %a, i64 %b) { ; i64 XOR of two register arguments
+; CHECK-LABEL: xor_rr_i64
+; CHECK:       eor x0, x0, x1
+  %1 = xor i64 %a, %b ; check line expects the result directly in x0
+  ret i64 %1
+}
+
+define zeroext i8 @xor_ri_i8(i8 signext %a) { ; i8 XOR of a register argument with the constant 15, returned zeroext
+; CHECK-LABEL: xor_ri_i8
+; CHECK:       eor [[REG:w[0-9]+]], w0, #0xf
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = xor i8 %a, 15 ; check lines expect the immediate form with #0xf, then a mask with #0xff
+  ret i8 %1
+}
+
+define zeroext i16 @xor_ri_i16(i16 signext %a) { ; i16 XOR of a register argument with the constant 255, returned zeroext
+; CHECK-LABEL: xor_ri_i16
+; CHECK:       eor [[REG:w[0-9]+]], w0, #0xff
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = xor i16 %a, 255 ; check lines expect the immediate form with #0xff, then a mask with #0xffff
+  ret i16 %1
+}
+
+define i32 @xor_ri_i32(i32 %a) { ; i32 XOR of a register argument with the constant 255
+; CHECK-LABEL: xor_ri_i32
+; CHECK:       eor w0, w0, #0xff
+  %1 = xor i32 %a, 255 ; check line expects the immediate form with #0xff, result in w0
+  ret i32 %1
+}
+
+define i64 @xor_ri_i64(i64 %a) { ; i64 XOR of a register argument with the constant 255
+; CHECK-LABEL: xor_ri_i64
+; CHECK:       eor x0, x0, #0xff
+  %1 = xor i64 %a, 255 ; check line expects the immediate form with #0xff, result in x0
+  ret i64 %1
+}
+
+define zeroext i8 @xor_rs_i8(i8 %a, i8 %b) { ; i8 XOR of %a with (%b shifted left by 4), returned zeroext
+; CHECK-LABEL: xor_rs_i8
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1, lsl #4
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xff|#0xf0}}
+  %1 = shl i8 %b, 4 ; check lines expect this shift as the lsl #4 operand of the eor
+  %2 = xor i8 %a, %1 ; the follow-up mask may be #0xff or #0xf0 per the check regex
+  ret i8 %2
+}
+
+define zeroext i16 @xor_rs_i16(i16 %a, i16 %b) { ; i16 XOR of %a with (%b shifted left by 8), returned zeroext
+; CHECK-LABEL: xor_rs_i16
+; CHECK:       eor [[REG:w[0-9]+]], w0, w1, lsl #8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], {{#0xffff|#0xff00}}
+  %1 = shl i16 %b, 8 ; check lines expect this shift as the lsl #8 operand of the eor
+  %2 = xor i16 %a, %1 ; the follow-up mask may be #0xffff or #0xff00 per the check regex
+  ret i16 %2
+}
+
+define i32 @xor_rs_i32(i32 %a, i32 %b) { ; i32 XOR of %a with (%b shifted left by 8)
+; CHECK-LABEL: xor_rs_i32
+; CHECK:       eor w0, w0, w1, lsl #8
+  %1 = shl i32 %b, 8 ; check line expects this shift as the lsl #8 operand of the eor
+  %2 = xor i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @xor_rs_i64(i64 %a, i64 %b) { ; i64 XOR of %a with (%b shifted left by 8)
+; CHECK-LABEL: xor_rs_i64
+; CHECK:       eor x0, x0, x1, lsl #8
+  %1 = shl i64 %b, 8 ; check line expects this shift as the lsl #8 operand of the eor
+  %2 = xor i64 %a, %1
+  ret i64 %2
+}
+
+define i32 @xor_mul_i32(i32 %a, i32 %b) { ; i32 XOR of %a with (%b multiplied by 4)
+; CHECK-LABEL: xor_mul_i32
+; CHECK:       eor w0, w0, w1, lsl #2
+  %1 = mul i32 %b, 4 ; check line expects the multiply by 4 as the lsl #2 operand of the eor
+  %2 = xor i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @xor_mul_i64(i64 %a, i64 %b) { ; i64 XOR of %a with (%b multiplied by 4)
+; CHECK-LABEL: xor_mul_i64
+; CHECK:       eor x0, x0, x1, lsl #2
+  %1 = mul i64 %b, 4 ; check line expects the multiply by 4 as the lsl #2 operand of the eor
+  %2 = xor i64 %a, %1
+  ret i64 %2
+}
+

diff --git a/test/CodeGen/AArch64/fast-isel-mul.ll b/test/CodeGen/AArch64/fast-isel-mul.ll
index d02c67f..f2fda27 100644
--- a/test/CodeGen/AArch64/fast-isel-mul.ll
+++ b/test/CodeGen/AArch64/fast-isel-mul.ll

@@ -1,40 +1,44 @@
-; RUN: llc -fast-isel -fast-isel-abort -mtriple=aarch64 -o - %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
 
-@var8 = global i8 0
-@var16 = global i16 0
-@var32 = global i32 0
-@var64 = global i64 0
-
-define void @test_mul8(i8 %lhs, i8 %rhs) {
+define zeroext i8 @test_mul8(i8 %lhs, i8 %rhs) { ; i8 multiply of two register arguments, now returned zeroext instead of stored to a global
 ; CHECK-LABEL: test_mul8:
-; CHECK: mul w0, w0, w1
-;  %lhs = load i8* @var8
-;  %rhs = load i8* @var8
-  %prod = mul i8 %lhs, %rhs
-  store i8 %prod, i8* @var8
-  ret void
+; CHECK:       mul {{w[0-9]+}}, w0, w1
+  %1 = mul i8 %lhs, %rhs ; checked destination register is a wildcard; the operands must be w0 and w1
+  ret i8 %1
 }
 
-define void @test_mul16(i16 %lhs, i16 %rhs) {
+define zeroext i16 @test_mul16(i16 %lhs, i16 %rhs) { ; i16 multiply of two register arguments, now returned zeroext instead of stored to a global
 ; CHECK-LABEL: test_mul16:
-; CHECK: mul w0, w0, w1
-  %prod = mul i16 %lhs, %rhs
-  store i16 %prod, i16* @var16
-  ret void
+; CHECK:       mul {{w[0-9]+}}, w0, w1
+  %1 = mul i16 %lhs, %rhs ; checked destination register is a wildcard; the operands must be w0 and w1
+  ret i16 %1
 }
 
-define void @test_mul32(i32 %lhs, i32 %rhs) {
+define i32 @test_mul32(i32 %lhs, i32 %rhs) { ; i32 multiply of two register arguments, now returned instead of stored to a global
 ; CHECK-LABEL: test_mul32:
-; CHECK: mul w0, w0, w1
-  %prod = mul i32 %lhs, %rhs
-  store i32 %prod, i32* @var32
-  ret void
+; CHECK:       mul {{w[0-9]+}}, w0, w1
+  %1 = mul i32 %lhs, %rhs ; checked destination register is a wildcard; the operands must be w0 and w1
+  ret i32 %1
 }
 
-define void @test_mul64(i64 %lhs, i64 %rhs) {
+define i64 @test_mul64(i64 %lhs, i64 %rhs) { ; i64 multiply of two register arguments, now returned instead of stored to a global
 ; CHECK-LABEL: test_mul64:
-; CHECK: mul x0, x0, x1
-  %prod = mul i64 %lhs, %rhs
-  store i64 %prod, i64* @var64
-  ret void
+; CHECK:       mul {{x[0-9]+}}, x0, x1
+  %1 = mul i64 %lhs, %rhs ; checked destination register is a wildcard; the operands must be x0 and x1
+  ret i64 %1
 }
+
+define i32 @test_mul2shift_i32(i32 %a) { ; i32 multiply of the argument by the constant 4
+; CHECK-LABEL: test_mul2shift_i32:
+; CHECK:       lsl {{w[0-9]+}}, w0, #2
+  %1 = mul i32 %a, 4 ; check line expects a left shift by 2 of w0 in place of a mul
+  ret i32 %1
+}
+
+define i64 @test_mul2shift_i64(i64 %a) { ; i64 multiply of the argument by the constant 8
+; CHECK-LABEL: test_mul2shift_i64:
+; CHECK:       lsl {{x[0-9]+}}, x0, #3
+  %1 = mul i64 %a, 8 ; check line expects a left shift by 3 of x0 in place of a mul
+  ret i64 %1
+}
+

diff --git a/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll b/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll
new file mode 100644
index 0000000..8d2d39a
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-runtime-libcall.ll

@@ -0,0 +1,96 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -code-model=small -verify-machineinstrs < %s | FileCheck %s --check-prefix=SMALL
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix=LARGE
+
+define float @frem_f32(float %a, float %b) {
+; SMALL-LABEL: frem_f32
+; SMALL:       bl _fmodf
+; LARGE-LABEL: frem_f32
+; LARGE:       adrp  [[REG:x[0-9]+]], _fmodf@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _fmodf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = frem float %a, %b
+  ret float %1
+}
+
+define double @frem_f64(double %a, double %b) {
+; SMALL-LABEL: frem_f64
+; SMALL:       bl _fmod
+; LARGE-LABEL: frem_f64
+; LARGE:       adrp  [[REG:x[0-9]+]], _fmod@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _fmod@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = frem double %a, %b
+  ret double %1
+}
+
+define float @sin_f32(float %a) {
+; SMALL-LABEL: sin_f32
+; SMALL:       bl _sinf
+; LARGE-LABEL: sin_f32
+; LARGE:       adrp  [[REG:x[0-9]+]], _sinf@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _sinf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call float @llvm.sin.f32(float %a)
+  ret float %1
+}
+
+define double @sin_f64(double %a) {
+; SMALL-LABEL: sin_f64
+; SMALL:       bl _sin
+; LARGE-LABEL: sin_f64
+; LARGE:       adrp  [[REG:x[0-9]+]], _sin@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _sin@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call double @llvm.sin.f64(double %a)
+  ret double %1
+}
+
+define float @cos_f32(float %a) {
+; SMALL-LABEL: cos_f32
+; SMALL:       bl _cosf
+; LARGE-LABEL: cos_f32
+; LARGE:       adrp  [[REG:x[0-9]+]], _cosf@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _cosf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call float @llvm.cos.f32(float %a)
+  ret float %1
+}
+
+define double @cos_f64(double %a) {
+; SMALL-LABEL: cos_f64
+; SMALL:       bl _cos
+; LARGE-LABEL: cos_f64
+; LARGE:       adrp  [[REG:x[0-9]+]], _cos@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _cos@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call double @llvm.cos.f64(double %a)
+  ret double %1
+}
+
+define float @pow_f32(float %a, float %b) {
+; SMALL-LABEL: pow_f32
+; SMALL:       bl _powf
+; LARGE-LABEL: pow_f32
+; LARGE:       adrp  [[REG:x[0-9]+]], _powf@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _powf@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call float @llvm.pow.f32(float %a, float %b)
+  ret float %1
+}
+
+define double @pow_f64(double %a, double %b) {
+; SMALL-LABEL: pow_f64
+; SMALL:       bl _pow
+; LARGE-LABEL: pow_f64
+; LARGE:       adrp  [[REG:x[0-9]+]], _pow@GOTPAGE
+; LARGE:       ldr [[REG]], {{\[}}[[REG]], _pow@GOTPAGEOFF{{\]}}
+; LARGE-NEXT:  blr [[REG]]
+  %1 = call double @llvm.pow.f64(double %a, double %b)
+  ret double %1
+}
+declare float @llvm.sin.f32(float)
+declare double @llvm.sin.f64(double)
+declare float @llvm.cos.f32(float)
+declare double @llvm.cos.f64(double)
+declare float @llvm.pow.f32(float, float)
+declare double @llvm.pow.f64(double, double)

diff --git a/test/CodeGen/AArch64/fast-isel-sdiv.ll b/test/CodeGen/AArch64/fast-isel-sdiv.ll
new file mode 100644
index 0000000..3080776
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-sdiv.ll

@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @sdiv_i32_exact(i32 %a) {
+; CHECK-LABEL: sdiv_i32_exact
+; CHECK:       asr {{w[0-9]+}}, w0, #3
+  %1 = sdiv exact i32 %a, 8
+  ret i32 %1
+}
+
+define i32 @sdiv_i32_pos(i32 %a) {
+; CHECK-LABEL: sdiv_i32_pos
+; CHECK:       add [[REG1:w[0-9]+]], w0, #7
+; CHECK-NEXT:  cmp w0, #0
+; CHECK-NEXT:  csel [[REG2:w[0-9]+]], [[REG1]], w0, lt
+; CHECK-NEXT:  asr {{w[0-9]+}}, [[REG2]], #3
+  %1 = sdiv i32 %a, 8
+  ret i32 %1
+}
+
+define i32 @sdiv_i32_neg(i32 %a) {
+; CHECK-LABEL: sdiv_i32_neg
+; CHECK:       add [[REG1:w[0-9]+]], w0, #7
+; CHECK-NEXT:  cmp w0, #0
+; CHECK-NEXT:  csel [[REG2:w[0-9]+]], [[REG1]], w0, lt
+; CHECK-NEXT:  neg {{w[0-9]+}}, [[REG2]], asr #3
+  %1 = sdiv i32 %a, -8
+  ret i32 %1
+}
+
+define i64 @sdiv_i64_exact(i64 %a) {
+; CHECK-LABEL: sdiv_i64_exact
+; CHECK:       asr {{x[0-9]+}}, x0, #4
+  %1 = sdiv exact i64 %a, 16
+  ret i64 %1
+}
+
+define i64 @sdiv_i64_pos(i64 %a) {
+; CHECK-LABEL: sdiv_i64_pos
+; CHECK:       add [[REG1:x[0-9]+]], x0, #15
+; CHECK-NEXT:  cmp x0, #0
+; CHECK-NEXT:  csel [[REG2:x[0-9]+]], [[REG1]], x0, lt
+; CHECK-NEXT:  asr {{x[0-9]+}}, [[REG2]], #4
+  %1 = sdiv i64 %a, 16
+  ret i64 %1
+}
+
+define i64 @sdiv_i64_neg(i64 %a) {
+; CHECK-LABEL: sdiv_i64_neg
+; CHECK:       add [[REG1:x[0-9]+]], x0, #15
+; CHECK-NEXT:  cmp x0, #0
+; CHECK-NEXT:  csel [[REG2:x[0-9]+]], [[REG1]], x0, lt
+; CHECK-NEXT:  neg {{x[0-9]+}}, [[REG2]], asr #4
+  %1 = sdiv i64 %a, -16
+  ret i64 %1
+}

diff --git a/test/CodeGen/AArch64/fast-isel-select.ll b/test/CodeGen/AArch64/fast-isel-select.ll
new file mode 100644
index 0000000..928e9d4
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-select.ll

@@ -0,0 +1,316 @@
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; First test the different supported value types for select.
+define zeroext i1 @select_i1(i1 zeroext %c, i1 zeroext %a, i1 zeroext %b) {
+; CHECK-LABEL: select_i1
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+  %1 = select i1 %c, i1 %a, i1 %b
+  ret i1 %1
+}
+
+define zeroext i8 @select_i8(i1 zeroext %c, i8 zeroext %a, i8 zeroext %b) {
+; CHECK-LABEL: select_i8
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+  %1 = select i1 %c, i8 %a, i8 %b
+  ret i8 %1
+}
+
+define zeroext i16 @select_i16(i1 zeroext %c, i16 zeroext %a, i16 zeroext %b) {
+; CHECK-LABEL: select_i16
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+  %1 = select i1 %c, i16 %a, i16 %b
+  ret i16 %1
+}
+
+define i32 @select_i32(i1 zeroext %c, i32 %a, i32 %b) {
+; CHECK-LABEL: select_i32
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{w[0-9]+}}, w1, w2, ne
+  %1 = select i1 %c, i32 %a, i32 %b
+  ret i32 %1
+}
+
+define i64 @select_i64(i1 zeroext %c, i64 %a, i64 %b) {
+; CHECK-LABEL: select_i64
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  csel {{x[0-9]+}}, x1, x2, ne
+  %1 = select i1 %c, i64 %a, i64 %b
+  ret i64 %1
+}
+
+define float @select_f32(i1 zeroext %c, float %a, float %b) {
+; CHECK-LABEL: select_f32
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ne
+  %1 = select i1 %c, float %a, float %b
+  ret float %1
+}
+
+define double @select_f64(i1 zeroext %c, double %a, double %b) {
+; CHECK-LABEL: select_f64
+; CHECK:       {{cmp w0, #0|tst w0, #0x1}}
+; CHECK-NEXT:  fcsel {{d[0-9]+}}, d0, d1, ne
+  %1 = select i1 %c, double %a, double %b
+  ret double %1
+}
+
+; Now test the folding of all compares.
+define float @select_fcmp_false(float %x, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_false
+; CHECK:       mov.16b {{v[0-9]+}}, v2
+  %1 = fcmp ogt float %x, %x
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ogt(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ogt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, gt
+  %1 = fcmp ogt float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_oge(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_oge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, ge
+  %1 = fcmp oge float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_olt(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_olt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, mi
+  %1 = fcmp olt float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ole(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ole
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, ls
+  %1 = fcmp ole float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_one(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_one
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel [[REG:s[0-9]+]], s2, s3, mi
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, [[REG]], gt
+  %1 = fcmp one float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ord(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ord
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, vc
+  %1 = fcmp ord float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_uno(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_uno
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, vs
+  %1 = fcmp uno float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ueq(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ueq
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel [[REG:s[0-9]+]], s2, s3, eq
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, [[REG]], vs
+  %1 = fcmp ueq float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ugt(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ugt
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, hi
+  %1 = fcmp ugt float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_uge(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_uge
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, pl
+  %1 = fcmp uge float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_ult(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ult
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, lt
+  %1 = fcmp ult float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+
+define float @select_fcmp_ule(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_ule
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, le
+  %1 = fcmp ule float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_une(float %x, float %y, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_une
+; CHECK:       fcmp s0, s1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s2, s3, ne
+  %1 = fcmp une float %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_fcmp_true(float %x, float %a, float %b) {
+; CHECK-LABEL: select_fcmp_true
+; CHECK:       mov.16b {{v[0-9]+}}, v1
+  %1 = fcmp ueq float %x, %x
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_eq(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_eq
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, eq
+  %1 = icmp eq i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_ne(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ne
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ne
+  %1 = icmp ne i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_ugt(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ugt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, hi
+  %1 = icmp ugt i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_uge(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_uge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, hs
+  %1 = icmp uge i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_ult(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ult
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, lo
+  %1 = icmp ult i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_ule(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_ule
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ls
+  %1 = icmp ule i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_sgt(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_sgt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, gt
+  %1 = icmp sgt i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_sge(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_sge
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, ge
+  %1 = icmp sge i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_slt(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_slt
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, lt
+  %1 = icmp slt i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+define float @select_icmp_sle(i32 %x, i32 %y, float %a, float %b) {
+; CHECK-LABEL: select_icmp_sle
+; CHECK:       cmp w0, w1
+; CHECK-NEXT:  fcsel {{s[0-9]+}}, s0, s1, le
+  %1 = icmp sle i32 %x, %y
+  %2 = select i1 %1, float %a, float %b
+  ret float %2
+}
+
+; Test peephole optimizations for select.
+define zeroext i1 @select_opt1(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt1
+; CHECK:       orr {{w[0-9]+}}, w0, w1
+  %1 = select i1 %c, i1 true, i1 %a
+  ret i1 %1
+}
+
+define zeroext i1 @select_opt2(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt2
+; CHECK:       eor [[REG:w[0-9]+]], w0, #0x1
+; CHECK:       orr {{w[0-9]+}}, [[REG]], w1
+  %1 = select i1 %c, i1 %a, i1 true
+  ret i1 %1
+}
+
+define zeroext i1 @select_opt3(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt3
+; CHECK:       bic {{w[0-9]+}}, w1, w0
+  %1 = select i1 %c, i1 false, i1 %a
+  ret i1 %1
+}
+
+define zeroext i1 @select_opt4(i1 zeroext %c, i1 zeroext %a) {
+; CHECK-LABEL: select_opt4
+; CHECK:       and {{w[0-9]+}}, w0, w1
+  %1 = select i1 %c, i1 %a, i1 false
+  ret i1 %1
+}

diff --git a/test/CodeGen/AArch64/fast-isel-shift.ll b/test/CodeGen/AArch64/fast-isel-shift.ll
new file mode 100644
index 0000000..ce4ba49
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-shift.ll

@@ -0,0 +1,545 @@
+; RUN: llc -fast-isel -fast-isel-abort -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: asr_zext_i1_i16
+; CHECK:       uxth {{w[0-9]*}}, wzr
+define zeroext i16 @asr_zext_i1_i16(i1 %b) {
+  %1 = zext i1 %b to i16
+  %2 = ashr i16 %1, 1
+  ret i16 %2
+}
+
+; CHECK-LABEL: asr_sext_i1_i16
+; CHECK:       sbfx [[REG1:w[0-9]+]], {{w[0-9]*}}, #0, #1
+; CHECK-NEXT:  sxth {{w[0-9]*}}, [[REG1]]
+define signext i16 @asr_sext_i1_i16(i1 %b) {
+  %1 = sext i1 %b to i16
+  %2 = ashr i16 %1, 1
+  ret i16 %2
+}
+
+; CHECK-LABEL: asr_zext_i1_i32
+; CHECK:       mov {{w[0-9]*}}, wzr
+define i32 @asr_zext_i1_i32(i1 %b) {
+  %1 = zext i1 %b to i32
+  %2 = ashr i32 %1, 1
+  ret i32 %2
+}
+
+; CHECK-LABEL: asr_sext_i1_i32
+; CHECK:       sbfx  {{w[0-9]*}}, {{w[0-9]*}}, #0, #1
+define i32 @asr_sext_i1_i32(i1 %b) {
+  %1 = sext i1 %b to i32
+  %2 = ashr i32 %1, 1
+  ret i32 %2
+}
+
+; CHECK-LABEL: asr_zext_i1_i64
+; CHECK:       mov {{x[0-9]*}}, xzr
+define i64 @asr_zext_i1_i64(i1 %b) {
+  %1 = zext i1 %b to i64
+  %2 = ashr i64 %1, 1
+  ret i64 %2
+}
+
+; CHECK-LABEL: asr_sext_i1_i64
+; CHECK:       sbfx {{x[0-9]*}}, {{x[0-9]*}}, #0, #1
+define i64 @asr_sext_i1_i64(i1 %b) {
+  %1 = sext i1 %b to i64
+  %2 = ashr i64 %1, 1
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsr_zext_i1_i16
+; CHECK:       uxth {{w[0-9]*}}, wzr
+define zeroext i16 @lsr_zext_i1_i16(i1 %b) {
+  %1 = zext i1 %b to i16
+  %2 = lshr i16 %1, 1
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsr_sext_i1_i16
+; CHECK:       sbfx [[REG1:w[0-9]+]], {{w[0-9]*}}, #0, #1
+; CHECK-NEXT:  ubfx [[REG2:w[0-9]+]], [[REG1]], #1, #15
+; CHECK-NEXT:  sxth {{w[0-9]*}}, [[REG2]]
+define signext i16 @lsr_sext_i1_i16(i1 %b) {
+  %1 = sext i1 %b to i16
+  %2 = lshr i16 %1, 1
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsr_zext_i1_i32
+; CHECK:       mov {{w[0-9]*}}, wzr
+define i32 @lsr_zext_i1_i32(i1 %b) {
+  %1 = zext i1 %b to i32
+  %2 = lshr i32 %1, 1
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsr_sext_i1_i32
+; CHECK:       sbfx [[REG1:w[0-9]+]], {{w[0-9]*}}, #0, #1
+; CHECK-NEXT:  lsr {{w[0-9]*}}, [[REG1]], #1
+define i32 @lsr_sext_i1_i32(i1 %b) {
+  %1 = sext i1 %b to i32   ; expected as the sbfx whose result is captured in REG1
+  %2 = lshr i32 %1, 1      ; the lsr must take that same register as its source
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsr_zext_i1_i64
+; CHECK:       mov {{x[0-9]*}}, xzr
+define i64 @lsr_zext_i1_i64(i1 %b) {
+  %1 = zext i1 %b to i64
+  %2 = lshr i64 %1, 1
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_zext_i1_i16
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define zeroext i16 @lsl_zext_i1_i16(i1 %b) {
+  %1 = zext i1 %b to i16
+  %2 = shl i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsl_sext_i1_i16
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define signext i16 @lsl_sext_i1_i16(i1 %b) {
+  %1 = sext i1 %b to i16
+  %2 = shl i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsl_zext_i1_i32
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define i32 @lsl_zext_i1_i32(i1 %b) {
+  %1 = zext i1 %b to i32
+  %2 = shl i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_sext_i1_i32
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #1
+define i32 @lsl_sext_i1_i32(i1 %b) {
+  %1 = sext i1 %b to i32
+  %2 = shl i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_zext_i1_i64
+; CHECK:       ubfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #1
+define i64 @lsl_zext_i1_i64(i1 %b) {
+  %1 = zext i1 %b to i64
+  %2 = shl i64 %1, 4
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i1_i64
+; CHECK:       sbfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #1
+define i64 @lsl_sext_i1_i64(i1 %b) {
+  %1 = sext i1 %b to i64
+  %2 = shl i64 %1, 4
+  ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i8
+; CHECK:       and [[REG1:w[0-9]+]], w1, #0xff
+; CHECK-NEXT:  lsl [[REG2:w[0-9]+]], w0, [[REG1]]
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG2]], #0xff
+define zeroext i8 @lslv_i8(i8 %a, i8 %b) {
+  %1 = shl i8 %a, %b
+  ret i8 %1
+}
+
+; CHECK-LABEL: lsl_i8
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i8 @lsl_i8(i8 %a) {
+  %1 = shl i8 %a, 4
+  ret i8 %1
+}
+
+; CHECK-LABEL: lsl_zext_i8_i16
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define zeroext i16 @lsl_zext_i8_i16(i8 %b) {
+  %1 = zext i8 %b to i16
+  %2 = shl i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsl_sext_i8_i16
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define signext i16 @lsl_sext_i8_i16(i8 %b) {
+  %1 = sext i8 %b to i16
+  %2 = shl i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsl_zext_i8_i32
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define i32 @lsl_zext_i8_i32(i8 %b) {
+  %1 = zext i8 %b to i32
+  %2 = shl i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_sext_i8_i32
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #8
+define i32 @lsl_sext_i8_i32(i8 %b) {
+  %1 = sext i8 %b to i32
+  %2 = shl i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_zext_i8_i64
+; CHECK:       ubfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #8
+define i64 @lsl_zext_i8_i64(i8 %b) {
+  %1 = zext i8 %b to i64
+  %2 = shl i64 %1, 4
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i8_i64
+; CHECK:       sbfiz {{x[0-9]*}}, {{x[0-9]*}}, #4, #8
+define i64 @lsl_sext_i8_i64(i8 %b) {
+  %1 = sext i8 %b to i64
+  %2 = shl i64 %1, 4
+  ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i16
+; CHECK:       and [[REG1:w[0-9]+]], w1, #0xffff
+; CHECK-NEXT:  lsl [[REG2:w[0-9]+]], w0, [[REG1]]
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG2]], #0xffff
+define zeroext i16 @lslv_i16(i16 %a, i16 %b) {
+  %1 = shl i16 %a, %b
+  ret i16 %1
+}
+
+; CHECK-LABEL: lsl_i16
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #8, #8
+define zeroext i16 @lsl_i16(i16 %a) {
+  %1 = shl i16 %a, 8
+  ret i16 %1
+}
+
+; CHECK-LABEL: lsl_zext_i16_i32
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #8, #16
+define i32 @lsl_zext_i16_i32(i16 %b) {
+  %1 = zext i16 %b to i32
+  %2 = shl i32 %1, 8
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_sext_i16_i32
+; CHECK:       sbfiz {{w[0-9]*}}, {{w[0-9]*}}, #8, #16
+define i32 @lsl_sext_i16_i32(i16 %b) {
+  %1 = sext i16 %b to i32
+  %2 = shl i32 %1, 8
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsl_zext_i16_i64
+; CHECK:       ubfiz {{x[0-9]*}}, {{x[0-9]*}}, #8, #16
+define i64 @lsl_zext_i16_i64(i16 %b) {
+  %1 = zext i16 %b to i64
+  %2 = shl i64 %1, 8
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i16_i64
+; CHECK:       sbfiz {{x[0-9]*}}, {{x[0-9]*}}, #8, #16
+define i64 @lsl_sext_i16_i64(i16 %b) {
+  %1 = sext i16 %b to i64
+  %2 = shl i64 %1, 8
+  ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i32
+; CHECK:       lsl {{w[0-9]*}}, w0, w1
+define zeroext i32 @lslv_i32(i32 %a, i32 %b) {
+  %1 = shl i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK-LABEL: lsl_i32
+; CHECK:       lsl {{w[0-9]*}}, {{w[0-9]*}}, #16
+define zeroext i32 @lsl_i32(i32 %a) {
+  %1 = shl i32 %a, 16
+  ret i32 %1
+}
+
+; CHECK-LABEL: lsl_zext_i32_i64
+; CHECK:       ubfiz {{x[0-9]+}}, {{x[0-9]+}}, #16, #32
+define i64 @lsl_zext_i32_i64(i32 %b) {
+  %1 = zext i32 %b to i64
+  %2 = shl i64 %1, 16
+  ret i64 %2
+}
+
+; CHECK-LABEL: lsl_sext_i32_i64
+; CHECK:       sbfiz {{x[0-9]+}}, {{x[0-9]+}}, #16, #32
+define i64 @lsl_sext_i32_i64(i32 %b) {
+  %1 = sext i32 %b to i64
+  %2 = shl i64 %1, 16
+  ret i64 %2
+}
+
+; CHECK-LABEL: lslv_i64
+; CHECK:       lsl {{x[0-9]*}}, x0, x1
+define i64 @lslv_i64(i64 %a, i64 %b) {
+  %1 = shl i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK-LABEL: lsl_i64
+; CHECK:       lsl {{x[0-9]*}}, {{x[0-9]*}}, #32
+define i64 @lsl_i64(i64 %a) {
+  %1 = shl i64 %a, 32
+  ret i64 %1
+}
+
+; CHECK-LABEL: lsrv_i8
+; CHECK:       and [[REG1:w[0-9]+]], w0, #0xff
+; CHECK-NEXT:  and [[REG2:w[0-9]+]], w1, #0xff
+; CHECK-NEXT:  lsr [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG3]], #0xff
+define zeroext i8 @lsrv_i8(i8 %a, i8 %b) {
+  %1 = lshr i8 %a, %b
+  ret i8 %1
+}
+
+; CHECK-LABEL: lsr_i8
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i8 @lsr_i8(i8 %a) {
+  %1 = lshr i8 %a, 4
+  ret i8 %1
+}
+
+; CHECK-LABEL: lsr_zext_i8_i16
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i16 @lsr_zext_i8_i16(i8 %b) {
+  %1 = zext i8 %b to i16
+  %2 = lshr i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsr_sext_i8_i16
+; CHECK:       sxtb [[REG:w[0-9]+]], w0
+; CHECK-NEXT:  ubfx {{w[0-9]*}}, [[REG]], #4, #12
+define signext i16 @lsr_sext_i8_i16(i8 %b) {
+  %1 = sext i8 %b to i16
+  %2 = lshr i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: lsr_zext_i8_i32
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @lsr_zext_i8_i32(i8 %b) {
+  %1 = zext i8 %b to i32
+  %2 = lshr i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsr_sext_i8_i32
+; CHECK:       sxtb [[REG:w[0-9]+]], w0
+; CHECK-NEXT:  lsr {{w[0-9]*}}, [[REG]], #4
+define i32 @lsr_sext_i8_i32(i8 %b) {
+  %1 = sext i8 %b to i32
+  %2 = lshr i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: lsrv_i16
+; CHECK:       and [[REG1:w[0-9]+]], w0, #0xffff
+; CHECK-NEXT:  and [[REG2:w[0-9]+]], w1, #0xffff
+; CHECK-NEXT:  lsr [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG3]], #0xffff
+define zeroext i16 @lsrv_i16(i16 %a, i16 %b) {
+  %1 = lshr i16 %a, %b
+  ret i16 %1
+}
+
+; CHECK-LABEL: lsr_i16
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #8, #8
+define zeroext i16 @lsr_i16(i16 %a) {
+  %1 = lshr i16 %a, 8
+  ret i16 %1
+}
+
+; CHECK-LABEL: lsrv_i32
+; CHECK:       lsr {{w[0-9]*}}, w0, w1
+define zeroext i32 @lsrv_i32(i32 %a, i32 %b) {
+  %1 = lshr i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK-LABEL: lsr_i32
+; CHECK:       lsr {{w[0-9]*}}, {{w[0-9]*}}, #16
+define zeroext i32 @lsr_i32(i32 %a) {
+  %1 = lshr i32 %a, 16
+  ret i32 %1
+}
+
+; CHECK-LABEL: lsrv_i64
+; CHECK:       lsr {{x[0-9]*}}, x0, x1
+define i64 @lsrv_i64(i64 %a, i64 %b) {
+  %1 = lshr i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK-LABEL: lsr_i64
+; CHECK:       lsr {{x[0-9]*}}, {{x[0-9]*}}, #32
+define i64 @lsr_i64(i64 %a) {
+  %1 = lshr i64 %a, 32
+  ret i64 %1
+}
+
+; CHECK-LABEL: asrv_i8
+; CHECK:       sxtb [[REG1:w[0-9]+]], w0
+; CHECK-NEXT:  and  [[REG2:w[0-9]+]], w1, #0xff
+; CHECK-NEXT:  asr  [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT:  and  {{w[0-9]+}}, [[REG3]], #0xff
+define zeroext i8 @asrv_i8(i8 %a, i8 %b) {
+  %1 = ashr i8 %a, %b
+  ret i8 %1
+}
+
+; CHECK-LABEL: asr_i8
+; CHECK:       sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i8 @asr_i8(i8 %a) {
+  %1 = ashr i8 %a, 4
+  ret i8 %1
+}
+
+; CHECK-LABEL: asr_zext_i8_i16
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define zeroext i16 @asr_zext_i8_i16(i8 %b) {
+  %1 = zext i8 %b to i16
+  %2 = ashr i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: asr_sext_i8_i16
+; CHECK:       sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define signext i16 @asr_sext_i8_i16(i8 %b) {
+  %1 = sext i8 %b to i16
+  %2 = ashr i16 %1, 4
+  ret i16 %2
+}
+
+; CHECK-LABEL: asr_zext_i8_i32
+; CHECK:       ubfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @asr_zext_i8_i32(i8 %b) {
+  %1 = zext i8 %b to i32
+  %2 = ashr i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: asr_sext_i8_i32
+; CHECK:       sbfx {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @asr_sext_i8_i32(i8 %b) {
+  %1 = sext i8 %b to i32
+  %2 = ashr i32 %1, 4
+  ret i32 %2
+}
+
+; CHECK-LABEL: asrv_i16
+; CHECK:       sxth [[REG1:w[0-9]+]], w0
+; CHECK-NEXT:  and  [[REG2:w[0-9]+]], w1, #0xffff
+; CHECK-NEXT:  asr  [[REG3:w[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-NEXT:  and  {{w[0-9]+}}, [[REG3]], #0xffff
+define zeroext i16 @asrv_i16(i16 %a, i16 %b) {
+  %1 = ashr i16 %a, %b
+  ret i16 %1
+}
+
+; CHECK-LABEL: asr_i16
+; CHECK:       sbfx {{w[0-9]*}}, {{w[0-9]*}}, #8, #8
+define zeroext i16 @asr_i16(i16 %a) {
+  %1 = ashr i16 %a, 8
+  ret i16 %1
+}
+
+; CHECK-LABEL: asrv_i32
+; CHECK:       asr {{w[0-9]*}}, w0, w1
+define zeroext i32 @asrv_i32(i32 %a, i32 %b) {
+  %1 = ashr i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK-LABEL: asr_i32
+; CHECK:       asr {{w[0-9]*}}, {{w[0-9]*}}, #16
+define zeroext i32 @asr_i32(i32 %a) {
+  %1 = ashr i32 %a, 16
+  ret i32 %1
+}
+
+; CHECK-LABEL: asrv_i64
+; CHECK:       asr {{x[0-9]*}}, x0, x1
+define i64 @asrv_i64(i64 %a, i64 %b) {
+  %1 = ashr i64 %a, %b
+  ret i64 %1
+}
+
+; CHECK-LABEL: asr_i64
+; CHECK:       asr {{x[0-9]*}}, {{x[0-9]*}}, #32
+define i64 @asr_i64(i64 %a) {
+  %1 = ashr i64 %a, 32
+  ret i64 %1
+}
+
+; CHECK-LABEL: shift_test1
+; CHECK:       ubfiz {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+; CHECK-NEXT:  sbfx  {{w[0-9]*}}, {{w[0-9]*}}, #4, #4
+define i32 @shift_test1(i8 %a) {
+  %1 = shl i8 %a, 4
+  %2 = ashr i8 %1, 4
+  %3 = sext i8 %2 to i32
+  ret i32 %3
+}
+
+; Test zero shifts
+
+; CHECK-LABEL: shl_zero
+; CHECK-NOT:   lsl
+define i32 @shl_zero(i32 %a) {
+  %1 = shl i32 %a, 0
+  ret i32 %1
+}
+
+; CHECK-LABEL: lshr_zero
+; CHECK-NOT:   lsr
+define i32 @lshr_zero(i32 %a) {
+  %1 = lshr i32 %a, 0
+  ret i32 %1
+}
+
+; CHECK-LABEL: ashr_zero
+; CHECK-NOT:   asr
+define i32 @ashr_zero(i32 %a) {
+  %1 = ashr i32 %a, 0
+  ret i32 %1
+}
+
+; CHECK-LABEL: shl_zext_zero
+; CHECK:       ubfx x0, x0, #0, #32
+define i64 @shl_zext_zero(i32 %a) {
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 0
+  ret i64 %2
+}
+
+; CHECK-LABEL: lshr_zext_zero
+; CHECK:       ubfx x0, x0, #0, #32
+define i64 @lshr_zext_zero(i32 %a) {
+  %1 = zext i32 %a to i64
+  %2 = lshr i64 %1, 0
+  ret i64 %2
+}
+
+; CHECK-LABEL: ashr_zext_zero
+; CHECK:       ubfx x0, x0, #0, #32
+define i64 @ashr_zext_zero(i32 %a) {
+  %1 = zext i32 %a to i64
+  %2 = ashr i64 %1, 0
+  ret i64 %2
+}
+

diff --git a/test/CodeGen/AArch64/fast-isel-sqrt.ll b/test/CodeGen/AArch64/fast-isel-sqrt.ll
new file mode 100644
index 0000000..1331d5c
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-sqrt.ll

@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+define float @test_sqrt_f32(float %a) {
+; CHECK-LABEL: test_sqrt_f32
+; CHECK:       fsqrt s0, s0
+  %res = call float @llvm.sqrt.f32(float %a)
+  ret float %res
+}
+declare float @llvm.sqrt.f32(float) nounwind readnone
+
+define double @test_sqrt_f64(double %a) {
+; CHECK-LABEL: test_sqrt_f64
+; CHECK:       fsqrt d0, d0
+  %res = call double @llvm.sqrt.f64(double %a)
+  ret double %res
+}
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+

diff --git a/test/CodeGen/AArch64/fast-isel-switch-phi.ll b/test/CodeGen/AArch64/fast-isel-switch-phi.ll
new file mode 100644
index 0000000..c4f871c
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-switch-phi.ll

@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s
+
+; Test that the Machine Instruction PHI node doesn't have more than one operand
+; from the same predecessor.
+define i32 @foo(i32 %a, i32 %b, i1 %c) {
+entry:
+  br i1 %c, label %switch, label %direct
+
+switch:
+  switch i32 %a, label %exit [
+    i32 43, label %continue
+    i32 45, label %continue
+  ]
+
+direct:
+  %var = add i32 %b, 1
+  br label %continue
+
+continue:
+  %var.phi = phi i32 [ %var, %direct ], [ 0, %switch ], [ 0, %switch ]
+  ret i32 %var.phi
+
+exit:
+  ret i32 1
+}

diff --git a/test/CodeGen/AArch64/fast-isel-tbz.ll b/test/CodeGen/AArch64/fast-isel-tbz.ll
new file mode 100644
index 0000000..d7f46b2
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-tbz.ll

@@ -0,0 +1,141 @@
+; RUN: llc                             -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+; RUN: llc -fast-isel -fast-isel-abort -aarch64-atomic-cfg-tidy=0 -verify-machineinstrs -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
+define i32 @icmp_eq_i8(i8 zeroext %a) {
+; CHECK-LABEL: icmp_eq_i8
+; CHECK:       tbz {{w[0-9]+}}, #0, {{LBB.+_2}}
+  %1 = and i8 %a, 1
+  %2 = icmp eq i8 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i16(i16 zeroext %a) {
+; CHECK-LABEL: icmp_eq_i16
+; CHECK:       tbz w0, #1, {{LBB.+_2}}
+  %1 = and i16 %a, 2
+  %2 = icmp eq i16 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i32(i32 %a) {
+; CHECK-LABEL: icmp_eq_i32
+; CHECK:       tbz w0, #2, {{LBB.+_2}}
+  %1 = and i32 %a, 4
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i64_1(i64 %a) {
+; CHECK-LABEL: icmp_eq_i64_1
+; CHECK:       tbz w0, #3, {{LBB.+_2}}
+  %1 = and i64 %a, 8
+  %2 = icmp eq i64 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_eq_i64_2(i64 %a) {
+; CHECK-LABEL: icmp_eq_i64_2
+; CHECK:       tbz x0, #32, {{LBB.+_2}}
+  %1 = and i64 %a, 4294967296
+  %2 = icmp eq i64 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i8(i8 zeroext %a) {
+; CHECK-LABEL: icmp_ne_i8
+; CHECK:       tbnz w0, #0, {{LBB.+_2}}
+  %1 = and i8 %a, 1
+  %2 = icmp ne i8 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i16(i16 zeroext %a) {
+; CHECK-LABEL: icmp_ne_i16
+; CHECK:       tbnz w0, #1, {{LBB.+_2}}
+  %1 = and i16 %a, 2
+  %2 = icmp ne i16 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i32(i32 %a) {
+; CHECK-LABEL: icmp_ne_i32
+; CHECK:       tbnz w0, #2, {{LBB.+_2}}
+  %1 = and i32 %a, 4
+  %2 = icmp ne i32 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i64_1(i64 %a) {
+; CHECK-LABEL: icmp_ne_i64_1
+; CHECK:       tbnz w0, #3, {{LBB.+_2}}
+  %1 = and i64 %a, 8
+  %2 = icmp ne i64 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+define i32 @icmp_ne_i64_2(i64 %a) {
+; CHECK-LABEL: icmp_ne_i64_2
+; CHECK:       tbnz x0, #32, {{LBB.+_2}}
+  %1 = and i64 %a, 4294967296
+  %2 = icmp ne i64 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that we don't fold the 'and' instruction into the compare.
+define i32 @icmp_eq_and_i32(i32 %a, i1 %c) {
+; CHECK-LABEL: icmp_eq_and_i32
+; CHECK:       and  [[REG:w[0-9]+]], w0, #0x4
+; CHECK-NEXT:  cbz  [[REG]], {{LBB.+_3}}
+  %1 = and i32 %a, 4
+  br i1 %c, label %bb0, label %bb2
+bb0:
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
+!1 = metadata !{metadata !"branch_weights", i32 2147483647, i32 0}

diff --git a/test/CodeGen/AArch64/fast-isel-trunc.ll b/test/CodeGen/AArch64/fast-isel-trunc.ll
new file mode 100644
index 0000000..55937eb
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-trunc.ll

@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s
+
+; Test that %1 doesn't get the kill flag set before its last use.
+define i32 @test_trunc(i32 %a) {
+  %1 = add i32 %a, 1
+  %2 = trunc i32 %1 to i16
+  %3 = icmp ult i16 1, %2
+  %4 = add i32 %1, 1
+  %5 = sext i1 %3 to i32
+  %6 = and i32 %4, %5
+  ret i32 %6
+}

diff --git a/test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll b/test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll
new file mode 100644
index 0000000..eaa0db5
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-vector-arithmetic.ll

@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=aarch64-apple-darwin                                                   -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -fast-isel-abort-args -verify-machineinstrs < %s | FileCheck %s
+
+; Vector Integer Add
+define <8 x i8> @add_v8i8_rr(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: add_v8i8_rr
+; CHECK: add.8b v0, v0, v1
+  %1 = add <8 x i8> %a, %b
+  ret <8 x i8> %1
+}
+
+define <16 x i8> @add_v16i8_rr(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: add_v16i8_rr
+; CHECK: add.16b v0, v0, v1
+  %1 = add <16 x i8> %a, %b
+  ret <16 x i8> %1
+}
+
+define <4 x i16> @add_v4i16_rr(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: add_v4i16_rr
+; CHECK: add.4h v0, v0, v1
+  %1 = add <4 x i16> %a, %b
+  ret <4 x i16> %1
+}
+
+define <8 x i16> @add_v8i16_rr(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: add_v8i16_rr
+; CHECK: add.8h v0, v0, v1
+  %1 = add <8 x i16> %a, %b
+  ret <8 x i16> %1
+}
+
+define <2 x i32> @add_v2i32_rr(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: add_v2i32_rr
+; CHECK: add.2s v0, v0, v1
+  %1 = add <2 x i32> %a, %b
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @add_v4i32_rr(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: add_v4i32_rr
+; CHECK: add.4s v0, v0, v1
+  %1 = add <4 x i32> %a, %b
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @add_v2i64_rr(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: add_v2i64_rr
+; CHECK: add.2d v0, v0, v1
+  %1 = add <2 x i64> %a, %b
+  ret <2 x i64> %1
+}
+
+; Vector Floating-point Add
+define <2 x float> @add_v2f32_rr(<2 x float> %a, <2 x float> %b) {
+; CHECK: add_v2f32_rr
+; CHECK: fadd.2s v0, v0, v1
+  %1 = fadd <2 x float> %a, %b
+  ret <2 x float> %1
+}
+
+define <4 x float> @add_v4f32_rr(<4 x float> %a, <4 x float> %b) {
+; CHECK: add_v4f32_rr
+; CHECK: fadd.4s v0, v0, v1
+  %1 = fadd <4 x float> %a, %b
+  ret <4 x float> %1
+}
+
+define <2 x double> @add_v2f64_rr(<2 x double> %a, <2 x double> %b) {
+; CHECK: add_v2f64_rr
+; CHECK: fadd.2d v0, v0, v1
+  %1 = fadd <2 x double> %a, %b
+  ret <2 x double> %1
+}

diff --git a/test/CodeGen/AArch64/fast-isel-vret.ll b/test/CodeGen/AArch64/fast-isel-vret.ll
new file mode 100644
index 0000000..9ad9227
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-vret.ll

@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
+
+; Test that we don't abort fast-isel for ret
+define <8 x i8> @ret_v8i8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: ret_v8i8
+; CHECK:       add.8b v0, v0, v1
+  %1 = add <8 x i8> %a, %b
+  ret <8 x i8> %1
+}

diff --git a/test/CodeGen/AArch64/fp16-instructions.ll b/test/CodeGen/AArch64/fp16-instructions.ll
new file mode 100644
index 0000000..7a44cd1
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-instructions.ll

@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+define half @add_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: add_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fadd [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+  %0 = fadd half %a, %b
+  ret half %0
+}
+
+
+define half @sub_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: sub_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fsub [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+  %0 = fsub half %a, %b
+  ret half %0
+}
+
+
+define half @mul_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: mul_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fmul [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+  %0 = fmul half %a, %b
+  ret half %0
+}
+
+
+define half @div_h(half %a, half %b) {
+entry:
+; CHECK-LABEL: div_h:
+; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
+; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
+; CHECK: fdiv [[RES:s[0-9]+]], [[OP1]], [[OP2]]
+; CHECK: fcvt h0, [[RES]]
+  %0 = fdiv half %a, %b
+  ret half %0
+}
+
+
+define half @load_h(half* %a) {
+entry:
+; CHECK-LABEL: load_h:
+; CHECK: ldr h0, [x0]
+  %0 = load half* %a, align 4
+  ret half %0
+}
+
+
+define void @store_h(half* %a, half %b) {
+entry:
+; CHECK-LABEL: store_h:
+; CHECK: str h0, [x0]
+  store half %b, half* %a, align 4
+  ret void
+}
+
+define half @s_to_h(float %a) {
+; CHECK-LABEL: s_to_h:
+; CHECK: fcvt h0, s0
+  %1 = fptrunc float %a to half
+  ret half %1
+}
+
+define half @d_to_h(double %a) {
+; CHECK-LABEL: d_to_h:
+; CHECK: fcvt h0, d0
+  %1 = fptrunc double %a to half
+  ret half %1
+}
+
+define float @h_to_s(half %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: fcvt s0, h0
+  %1 = fpext half %a to float
+  ret float %1
+}
+
+define double @h_to_d(half %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK: fcvt d0, h0
+  %1 = fpext half %a to double
+  ret double %1
+}
+
+define half @bitcast_i_to_h(i16 %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: fmov s0, w0
+  %1 = bitcast i16 %a to half
+  ret half %1
+}
+
+
+define i16 @bitcast_h_to_i(half %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: fmov w0, s0
+  %1 = bitcast half %a to i16
+  ret i16 %1
+}

diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll
new file mode 100644
index 0000000..8e89681
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll

@@ -0,0 +1,122 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+define <4 x half> @add_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: add_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fadd [[RES:v[0-9]+\.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+  %0 = fadd <4 x half> %a, %b
+  ret <4 x half> %0
+}
+
+
+define <4 x half> @sub_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: sub_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fsub [[RES:v[0-9]+\.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+  %0 = fsub <4 x half> %a, %b
+  ret <4 x half> %0
+}
+
+
+define <4 x half> @mul_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: mul_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fmul [[RES:v[0-9]+\.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+  %0 = fmul <4 x half> %a, %b
+  ret <4 x half> %0
+}
+
+
+define <4 x half> @div_h(<4 x half> %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: div_h:
+; CHECK-DAG: fcvtl [[OP1:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl [[OP2:v[0-9]+\.4s]], v1.4h
+; CHECK: fdiv [[RES:v[0-9]+\.4s]], [[OP1]], [[OP2]]
+; CHECK: fcvtn v0.4h, [[RES]]
+  %0 = fdiv <4 x half> %a, %b
+  ret <4 x half> %0
+}
+
+
+define <4 x half> @load_h(<4 x half>* %a) {
+entry:
+; CHECK-LABEL: load_h:
+; CHECK: ldr d0, [x0]
+  %0 = load <4 x half>* %a, align 4
+  ret <4 x half> %0
+}
+
+
+define void @store_h(<4 x half>* %a, <4 x half> %b) {
+entry:
+; CHECK-LABEL: store_h:
+; CHECK: str d0, [x0]
+  store <4 x half> %b, <4 x half>* %a, align 4
+  ret void
+}
+
+define <4 x half> @s_to_h(<4 x float> %a) {
+; CHECK-LABEL: s_to_h:
+; CHECK: fcvtn v0.4h, v0.4s
+  %1 = fptrunc <4 x float> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @d_to_h(<4 x double> %a) {
+; CHECK-LABEL: d_to_h:
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+  %1 = fptrunc <4 x double> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x float> @h_to_s(<4 x half> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: fcvtl v0.4s, v0.4h
+  %1 = fpext <4 x half> %a to <4 x float>
+  ret <4 x float> %1
+}
+
+define <4 x double> @h_to_d(<4 x half> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+  %1 = fpext <4 x half> %a to <4 x double>
+  ret <4 x double> %1
+}
+
+define <4 x half> @bitcast_i_to_h(float, <4 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: mov v0.16b, v1.16b
+  %2 = bitcast <4 x i16> %a to <4 x half>
+  ret <4 x half> %2
+}
+
+define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: mov v0.16b, v1.16b
+  %2 = bitcast <4 x half> %a to <4 x i16>
+  ret <4 x i16> %2
+}

diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll
new file mode 100644
index 0000000..9ee2296
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-v8-instructions.ll

@@ -0,0 +1,255 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+define <8 x half> @add_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: add_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fadd
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+  %0 = fadd <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+
+define <8 x half> @sub_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: sub_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fsub
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+  %0 = fsub <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+
+define <8 x half> @mul_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: mul_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fmul
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+  %0 = fmul <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+
+define <8 x half> @div_h(<8 x half> %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: div_h:
+; CHECK: fcvt
+; CHECK: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fdiv
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK-DAG: fcvt
+; CHECK: fcvt
+  %0 = fdiv <8 x half> %a, %b
+  ret <8 x half> %0
+}
+
+
+define <8 x half> @load_h(<8 x half>* %a) {
+entry:
+; CHECK-LABEL: load_h:
+; CHECK: ldr q0, [x0]
+  %0 = load <8 x half>* %a, align 4
+  ret <8 x half> %0
+}
+
+
+define void @store_h(<8 x half>* %a, <8 x half> %b) {
+entry:
+; CHECK-LABEL: store_h:
+; CHECK: str q0, [x0]
+  store <8 x half> %b, <8 x half>* %a, align 4
+  ret void
+}
+
+define <8 x half> @s_to_h(<8 x float> %a) {
+; CHECK-LABEL: s_to_h:
+; CHECK-DAG: fcvtn v0.4h, v0.4s
+; CHECK-DAG: fcvtn [[REG:v[0-9]+]].4h, v1.4s
+; CHECK: ins v0.d[1], [[REG]].d[0]
+  %1 = fptrunc <8 x float> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @d_to_h(<8 x double> %a) {
+; CHECK-LABEL: d_to_h:
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: ins v{{[0-9]+}}.d
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: fcvt h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+; CHECK-DAG: ins v{{[0-9]+}}.h
+  %1 = fptrunc <8 x double> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x float> @h_to_s(<8 x half> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: fcvtl2 v1.4s, v0.8h
+; CHECK: fcvtl v0.4s, v0.4h
+  %1 = fpext <8 x half> %a to <8 x float>
+  ret <8 x float> %1
+}
+
+define <8 x double> @h_to_d(<8 x half> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: fcvt d
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+; CHECK-DAG: ins
+  %1 = fpext <8 x half> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+
+define <8 x half> @bitcast_i_to_h(float, <8 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: mov v0.16b, v1.16b
+  %2 = bitcast <8 x i16> %a to <8 x half>
+  ret <8 x half> %2
+}
+
+define <8 x i16> @bitcast_h_to_i(float, <8 x half> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: mov v0.16b, v1.16b
+  %2 = bitcast <8 x half> %a to <8 x i16>
+  ret <8 x i16> %2
+}
+

diff --git a/test/CodeGen/AArch64/fp16-vector-bitcast.ll b/test/CodeGen/AArch64/fp16-vector-bitcast.ll
new file mode 100644
index 0000000..421a4f5
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-vector-bitcast.ll

@@ -0,0 +1,203 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+define <4 x i16> @v4f16_to_v4i16(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v4i16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <2 x i32> @v4f16_to_v2i32(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v2i32:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <1 x i64> @v4f16_to_v1i64(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v1i64:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <1 x i64>
+  ret <1 x i64> %1
+}
+
+define i64 @v4f16_to_i64(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_i64:
+; CHECK: fmov x0, d1
+entry:
+  %1 = bitcast <4 x half> %a to i64
+  ret i64 %1
+}
+
+define <2 x float> @v4f16_to_v2float(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v2float:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <2 x float>
+  ret <2 x float> %1
+}
+
+define <1 x double> @v4f16_to_v1double(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_v1double:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to <1 x double>
+  ret <1 x double> %1
+}
+
+define double @v4f16_to_double(float, <4 x half> %a) #0 {
+; CHECK-LABEL: v4f16_to_double:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x half> %a to double
+  ret double %1
+}
+
+
+define <4 x half> @v4i16_to_v4f16(float, <4 x i16> %a) #0 {
+; CHECK-LABEL: v4i16_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x i16> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @v2i32_to_v4f16(float, <2 x i32> %a) #0 {
+; CHECK-LABEL: v2i32_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <2 x i32> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @v1i64_to_v4f16(float, <1 x i64> %a) #0 {
+; CHECK-LABEL: v1i64_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <1 x i64> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @i64_to_v4f16(float, i64 %a) #0 {
+; CHECK-LABEL: i64_to_v4f16:
+; CHECK: fmov d0, x0
+entry:
+  %1 = bitcast i64 %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @v2float_to_v4f16(float, <2 x float> %a) #0 {
+; CHECK-LABEL: v2float_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <2 x float> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @v1double_to_v4f16(float, <1 x double> %a) #0 {
+; CHECK-LABEL: v1double_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <1 x double> %a to <4 x half>
+  ret <4 x half> %1
+}
+
+define <4 x half> @double_to_v4f16(float, double %a) #0 {
+; CHECK-LABEL: double_to_v4f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast double %a to <4 x half>
+  ret <4 x half> %1
+}
+
+
+
+
+
+
+
+
+
+
+define <8 x i16> @v8f16_to_v8i16(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v8i16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @v8f16_to_v4i32(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v4i32:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @v8f16_to_v2i64(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v2i64:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <2 x i64>
+  ret <2 x i64> %1
+}
+
+define <4 x float> @v8f16_to_v4float(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v4float:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <4 x float>
+  ret <4 x float> %1
+}
+
+define <2 x double> @v8f16_to_v2double(float, <8 x half> %a) #0 {
+; CHECK-LABEL: v8f16_to_v2double:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x half> %a to <2 x double>
+  ret <2 x double> %1
+}
+
+define <8 x half> @v8i16_to_v8f16(float, <8 x i16> %a) #0 {
+; CHECK-LABEL: v8i16_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <8 x i16> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @v4i32_to_v8f16(float, <4 x i32> %a) #0 {
+; CHECK-LABEL: v4i32_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x i32> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @v2i64_to_v8f16(float, <2 x i64> %a) #0 {
+; CHECK-LABEL: v2i64_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <2 x i64> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @v4float_to_v8f16(float, <4 x float> %a) #0 {
+; CHECK-LABEL: v4float_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <4 x float> %a to <8 x half>
+  ret <8 x half> %1
+}
+
+define <8 x half> @v2double_to_v8f16(float, <2 x double> %a) #0 {
+; CHECK-LABEL: v2double_to_v8f16:
+; CHECK: mov v0.16b, v1.16b
+entry:
+  %1 = bitcast <2 x double> %a to <8 x half>
+  ret <8 x half> %1
+}

diff --git a/test/CodeGen/AArch64/fp16-vector-load-store.ll b/test/CodeGen/AArch64/fp16-vector-load-store.ll
new file mode 100644
index 0000000..edbbffe
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-vector-load-store.ll

@@ -0,0 +1,528 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+; Simple load of v4f16
+define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_64:
+; CHECK: ldr d0, [x0]
+entry:
+  %0 = load <4 x half>* %a, align 8
+  ret <4 x half> %0
+}
+
+; Simple load of v8f16
+define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_128:
+; CHECK: ldr q0, [x0]
+entry:
+  %0 = load <8 x half>* %a, align 16
+  ret <8 x half> %0
+}
+
+; Duplicating load to v4f16
+define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_dup_64:
+; CHECK: ld1r { v0.4h }, [x0]
+entry:
+  %0 = load half* %a, align 2
+  %1 = insertelement <4 x half> undef, half %0, i32 0
+  %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
+  ret <4 x half> %2
+}
+
+; Duplicating load to v8f16
+define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
+; CHECK-LABEL: load_dup_128:
+; CHECK: ld1r { v0.8h }, [x0]
+entry:
+  %0 = load half* %a, align 2
+  %1 = insertelement <8 x half> undef, half %0, i32 0
+  %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
+  ret <8 x half> %2
+}
+
+; Load to one lane of v4f16
+define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
+; CHECK-LABEL: load_lane_64:
+; CHECK: ld1 { v0.h }[2], [x0]
+entry:
+  %0 = load half* %a, align 2
+  %1 = insertelement <4 x half> %b, half %0, i32 2
+  ret <4 x half> %1
+}
+
+; Load to one lane of v8f16
+define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
+; CHECK-LABEL: load_lane_128:
+; CHECK: ld1 { v0.h }[5], [x0]
+entry:
+  %0 = load half* %a, align 2
+  %1 = insertelement <8 x half> %b, half %0, i32 5
+  ret <8 x half> %1
+}
+
+; Simple store of v4f16
+define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
+; CHECK-LABEL: store_64:
+; CHECK: str d0, [x0]
+entry:
+  store <4 x half> %b, <4 x half>* %a, align 8
+  ret void
+}
+
+; Simple store of v8f16
+define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
+; CHECK-LABEL: store_128:
+; CHECK: str q0, [x0]
+entry:
+  store <8 x half> %b, <8 x half>* %a, align 16
+  ret void
+}
+
+; Store from one lane of v4f16
+define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
+; CHECK-LABEL: store_lane_64:
+; CHECK: st1 { v0.h }[2], [x0]
+entry:
+  %0 = extractelement <4 x half> %b, i32 2
+  store half %0, half* %a, align 2
+  ret void
+}
+
+; Store from one lane of v8f16
+define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
+; CHECK-LABEL: store_lane_128:
+; CHECK: st1 { v0.h }[5], [x0]
+entry:
+  %0 = extractelement <8 x half> %b, i32 5
+  store half %0, half* %a, align 2
+  ret void
+}
+
+; NEON intrinsics - (de-)interleaving loads and stores
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
+declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
+declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+
+; Load 2 x v4f16 with de-interleaving
+define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_64_2:
+; CHECK: ld2 { v0.4h, v1.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half> } %0
+}
+
+; Load 3 x v4f16 with de-interleaving
+define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_64_3:
+; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 4 x v4f16 with de-interleaving
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_64_4:
+; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Store 2 x v4f16 with interleaving
+define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: store_interleave_64_2:
+; CHECK: st2 { v0.4h, v1.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
+  ret void
+}
+
+; Store 3 x v4f16 with interleaving
+define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: store_interleave_64_3:
+; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
+  ret void
+}
+
+; Store 4 x v4f16 with interleaving
+define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: store_interleave_64_4:
+; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
+  ret void
+}
+
+; Load 2 x v8f16 with de-interleaving
+define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_128_2:
+; CHECK: ld2 { v0.8h, v1.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half> } %0
+}
+
+; Load 3 x v8f16 with de-interleaving
+define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_128_3:
+; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load 4 x v8f16 with de-interleaving
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_interleave_128_4:
+; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Store 2 x v8f16 with interleaving
+define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: store_interleave_128_2:
+; CHECK: st2 { v0.8h, v1.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
+  ret void
+}
+
+; Store 3 x v8f16 with interleaving
+define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: store_interleave_128_3:
+; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
+  ret void
+}
+
+; Store 4 x v8f16 with interleaving
+define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: store_interleave_128_4:
+; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
+  ret void
+}
+
+; NEON intrinsics - duplicating loads
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)
+
+; Load 2 x v4f16 with duplication
+define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
+; CHECK-LABEL: load_dup_64_2:
+; CHECK: ld2r { v0.4h, v1.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
+  ret { <4 x half>, <4 x half> } %0
+}
+
+; Load 3 x v4f16 with duplication
+define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
+; CHECK-LABEL: load_dup_64_3:
+; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
+  ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 4 x v4f16 with duplication
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
+; CHECK-LABEL: load_dup_64_4:
+; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
+  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 2 x v8f16 with duplication
+define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
+; CHECK-LABEL: load_dup_128_2:
+; CHECK: ld2r { v0.8h, v1.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
+  ret { <8 x half>, <8 x half> } %0
+}
+
+; Load 3 x v8f16 with duplication
+define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
+; CHECK-LABEL: load_dup_128_3:
+; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
+  ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load 4 x v8f16 with duplication
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
+; CHECK-LABEL: load_dup_128_4:
+; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
+  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+
+; NEON intrinsics - loads and stores to/from one lane
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
+declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
+
+; Load one lane of 2 x v4f16
+define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: load_lane_64_2:
+; CHECK: ld2 { v0.h, v1.h }[2], [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
+  ret { <4 x half>, <4 x half> } %0
+}
+
+; Load one lane of 3 x v4f16
+define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: load_lane_64_3:
+; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
+  ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load one lane of 4 x v4f16
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: load_lane_64_4:
+; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
+  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Store one lane of 2 x v4f16
+define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: store_lane_64_2:
+; CHECK: st2 { v0.h, v1.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
+  ret void
+}
+
+; Store one lane of 3 x v4f16
+define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: store_lane_64_3:
+; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
+  ret void
+}
+
+; Store one lane of 4 x v4f16
+define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: store_lane_64_4:
+; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
+  ret void
+}
+
+; Load one lane of 2 x v8f16
+define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: load_lane_128_2:
+; CHECK: ld2 { v0.h, v1.h }[2], [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
+  ret { <8 x half>, <8 x half> } %0
+}
+
+; Load one lane of 3 x v8f16
+define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: load_lane_128_3:
+; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
+  ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load one lane of 4 x v8f16
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: load_lane_128_4:
+; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
+  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Store one lane of 2 x v8f16
+define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: store_lane_128_2:
+; CHECK: st2 { v0.h, v1.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
+  ret void
+}
+
+; Store one lane of 3 x v8f16
+define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: store_lane_128_3:
+; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
+  ret void
+}
+
+; Store one lane of 4 x v8f16
+define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: store_lane_128_4:
+; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
+  ret void
+}
+
+; NEON intrinsics - load/store without interleaving
+declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
+declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
+declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
+declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
+declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
+declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
+
+; Load 2 x v4f16 without de-interleaving
+define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_64_2:
+; CHECK: ld1 { v0.4h, v1.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half> } %0
+}
+
+; Load 3 x v4f16 without de-interleaving
+define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_64_3:
+; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Load 4 x v4f16 without de-interleaving
+define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
+; CHECK-LABEL: load_64_4:
+; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
+  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
+}
+
+; Store 2 x v4f16 without interleaving
+define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: store_64_2:
+; CHECK: st1 { v0.4h, v1.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
+  ret void
+}
+
+; Store 3 x v4f16 without interleaving
+define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
+; CHECK-LABEL: store_64_3:
+; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
+  ret void
+}
+
+; Store 4 x v4f16 without interleaving
+define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
+; CHECK-LABEL: store_64_4:
+; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
+  ret void
+}
+
+; Load 2 x v8f16 without de-interleaving
+define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_128_2:
+; CHECK: ld1 { v0.8h, v1.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half> } %0
+}
+
+; Load 3 x v8f16 without de-interleaving
+define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_128_3:
+; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Load 4 x v8f16 without de-interleaving
+define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
+; CHECK-LABEL: load_128_4:
+; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
+  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
+}
+
+; Store 2 x v8f16 without interleaving
+define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: store_128_2:
+; CHECK: st1 { v0.8h, v1.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
+  ret void
+}
+
+; Store 3 x v8f16 without interleaving
+define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
+; CHECK-LABEL: store_128_3:
+; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
+  ret void
+}
+
+; Store 4 x v8f16 without interleaving
+define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
+; CHECK-LABEL: store_128_4:
+; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+entry:
+  tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
+  ret void
+}

diff --git a/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/test/CodeGen/AArch64/fp16-vector-shuffle.ll
new file mode 100644
index 0000000..74d1b43
--- /dev/null
+++ b/test/CodeGen/AArch64/fp16-vector-shuffle.ll

@@ -0,0 +1,301 @@
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
+
+; float16x4_t select_64(float16x4_t a, float16x4_t b, uint16x4_t c) { return vbsl_u16(c, a, b); }
+define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 {
+; CHECK-LABEL: select_64:
+; CHECK: bsl
+entry:
+  %0 = bitcast <4 x half> %a to <4 x i16>
+  %1 = bitcast <4 x half> %b to <4 x i16>
+  %vbsl3.i = and <4 x i16> %0, %c
+  %2 = xor <4 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1>
+  %vbsl4.i = and <4 x i16> %1, %2
+  %vbsl5.i = or <4 x i16> %vbsl3.i, %vbsl4.i
+  %3 = bitcast <4 x i16> %vbsl5.i to <4 x half>
+  ret <4 x half> %3
+}
+
+; float16x8_t select_128(float16x8_t a, float16x8_t b, uint16x8_t c) { return vbslq_u16(c, a, b); }
+define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 {
+; CHECK-LABEL: select_128:
+; CHECK: bsl
+entry:
+  %0 = bitcast <8 x half> %a to <8 x i16>
+  %1 = bitcast <8 x half> %b to <8 x i16>
+  %vbsl3.i = and <8 x i16> %0, %c
+  %2 = xor <8 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %vbsl4.i = and <8 x i16> %1, %2
+  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
+  %3 = bitcast <8 x i16> %vbsl5.i to <8 x half>
+  ret <8 x half> %3
+}
+
+; float16x4_t lane_64_64(float16x4_t a, float16x4_t b) {
+;   return vcopy_lane_s16(a, 1, b, 2);
+; }
+define <4 x half> @lane_64_64(<4 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: lane_64_64:
+; CHECK: ins
+entry:
+  %0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x half> %0
+}
+
+; float16x8_t lane_128_64(float16x8_t a, float16x4_t b) {
+;   return vcopyq_lane_s16(a, 1, b, 2);
+; }
+define <8 x half> @lane_128_64(<8 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: lane_128_64:
+; CHECK: ins
+entry:
+  %0 = bitcast <4 x half> %b to <4 x i16>
+  %vget_lane = extractelement <4 x i16> %0, i32 2
+  %1 = bitcast <8 x half> %a to <8 x i16>
+  %vset_lane = insertelement <8 x i16> %1, i16 %vget_lane, i32 1
+  %2 = bitcast <8 x i16> %vset_lane to <8 x half>
+  ret <8 x half> %2
+}
+
+; float16x4_t lane_64_128(float16x4_t a, float16x8_t b) {
+;   return vcopy_laneq_s16(a, 3, b, 5);
+; }
+define <4 x half> @lane_64_128(<4 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: lane_64_128:
+; CHECK: ins
+entry:
+  %0 = bitcast <8 x half> %b to <8 x i16>
+  %vgetq_lane = extractelement <8 x i16> %0, i32 5
+  %1 = bitcast <4 x half> %a to <4 x i16>
+  %vset_lane = insertelement <4 x i16> %1, i16 %vgetq_lane, i32 3
+  %2 = bitcast <4 x i16> %vset_lane to <4 x half>
+  ret <4 x half> %2
+}
+
+; float16x8_t lane_128_128(float16x8_t a, float16x8_t b) {
+;   return vcopyq_laneq_s16(a, 3, b, 5);
+; }
+define <8 x half> @lane_128_128(<8 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: lane_128_128:
+; CHECK: ins
+entry:
+  %0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x half> %0
+}
+
+; float16x4_t ext_64(float16x4_t a, float16x4_t b) {
+;   return vext_s16(a, b, 3);
+; }
+define <4 x half> @ext_64(<4 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: ext_64:
+; CHECK: ext
+entry:
+  %0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x half> %0
+}
+
+; float16x8_t ext_128(float16x8_t a, float16x8_t b) {
+;   return vextq_s16(a, b, 3);
+; }
+define <8 x half> @ext_128(<8 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: ext_128:
+; CHECK: ext
+entry:
+  %0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x half> %0
+}
+
+; float16x4_t rev32_64(float16x4_t a) {
+;   return vrev32_s16(a);
+; }
+define <4 x half> @rev32_64(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev32_64:
+; CHECK: rev32
+  %0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x half> %0
+}
+
+; float16x4_t rev64_64(float16x4_t a) {
+;   return vrev64_s16(a);
+; }
+define <4 x half> @rev64_64(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev64_64:
+; CHECK: rev64
+  %0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x half> %0
+}
+
+; float16x8_t rev32_128(float16x8_t a) {
+;   return vrev32q_s16(a);
+; }
+define <8 x half> @rev32_128(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev32_128:
+; CHECK: rev32
+  %0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x half> %0
+}
+
+; float16x8_t rev64_128(float16x8_t a) {
+;   return vrev64q_s16(a);
+; }
+define <8 x half> @rev64_128(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: rev64_128:
+; CHECK: rev64
+  %0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x half> %0
+}
+
+; float16x4_t create_64(long long a) { return vcreate_f16(a); }
+define <4 x half> @create_64(i64 %a) #0 {
+; CHECK-LABEL: create_64:
+; CHECK: fmov
+entry:
+  %0 = bitcast i64 %a to <4 x half>
+  ret <4 x half> %0
+}
+
+; float16x4_t dup_64(__fp16 a) { return vdup_n_f16(a); }
+define <4 x half> @dup_64(half %a) #0 {
+; CHECK-LABEL: dup_64:
+; CHECK: dup
+entry:
+  %vecinit = insertelement <4 x half> undef, half %a, i32 0
+  %vecinit1 = insertelement <4 x half> %vecinit, half %a, i32 1
+  %vecinit2 = insertelement <4 x half> %vecinit1, half %a, i32 2
+  %vecinit3 = insertelement <4 x half> %vecinit2, half %a, i32 3
+  ret <4 x half> %vecinit3
+}
+
+; float16x8_t dup_128(__fp16 a) { return vdupq_n_f16(a); }
+define <8 x half> @dup_128(half %a) #0 {
+entry:
+; CHECK-LABEL: dup_128:
+; CHECK: dup
+  %vecinit = insertelement <8 x half> undef, half %a, i32 0
+  %vecinit1 = insertelement <8 x half> %vecinit, half %a, i32 1
+  %vecinit2 = insertelement <8 x half> %vecinit1, half %a, i32 2
+  %vecinit3 = insertelement <8 x half> %vecinit2, half %a, i32 3
+  %vecinit4 = insertelement <8 x half> %vecinit3, half %a, i32 4
+  %vecinit5 = insertelement <8 x half> %vecinit4, half %a, i32 5
+  %vecinit6 = insertelement <8 x half> %vecinit5, half %a, i32 6
+  %vecinit7 = insertelement <8 x half> %vecinit6, half %a, i32 7
+  ret <8 x half> %vecinit7
+}
+
+; float16x4_t dup_lane_64(float16x4_t a) { return vdup_lane_f16(a, 2); }
+define <4 x half> @dup_lane_64(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_lane_64:
+; CHECK: dup
+  %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x half> %shuffle
+}
+
+; float16x8_t dup_lane_128(float16x4_t a) { return vdupq_lane_f16(a, 2); }
+define <8 x half> @dup_lane_128(<4 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_lane_128:
+; CHECK: dup
+  %shuffle = shufflevector <4 x half> %a, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x half> %shuffle
+}
+
+; float16x4_t dup_laneq_64(float16x8_t a) { return vdup_laneq_f16(a, 2); }
+define <4 x half> @dup_laneq_64(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_laneq_64:
+; CHECK: dup
+  %shuffle = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x half> %shuffle
+}
+
+; float16x8_t dup_laneq_128(float16x8_t a) { return vdupq_laneq_f16(a, 2); }
+define <8 x half> @dup_laneq_128(<8 x half> %a) #0 {
+entry:
+; CHECK-LABEL: dup_laneq_128:
+; CHECK: dup
+  %shuffle = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x half> %shuffle
+}
+
+; float16x8_t vcombine(float16x4_t a, float16x4_t b) { return vcombine_f16(a, b); }
+define <8 x half> @vcombine(<4 x half> %a, <4 x half> %b) #0 {
+entry:
+; CHECK-LABEL: vcombine:
+; CHECK: ins
+  %shuffle.i = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x half> %shuffle.i
+}
+
+; float16x4_t get_high(float16x8_t a) { return vget_high_f16(a); }
+define <4 x half> @get_high(<8 x half> %a) #0 {
+; CHECK-LABEL: get_high:
+; CHECK: ext
+entry:
+  %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x half> %shuffle.i
+}
+
+
+; float16x4_t get_low(float16x8_t a) { return vget_low_f16(a); }
+define <4 x half> @get_low(<8 x half> %a) #0 {
+; CHECK-LABEL: get_low:
+; CHECK-NOT: ext
+entry:
+  %shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x half> %shuffle.i
+}
+
+; float16x4_t set_lane_64(float16x4_t a, __fp16 b) { return vset_lane_f16(b, a, 2); }
+define <4 x half> @set_lane_64(<4 x half> %a, half %b) #0 {
+; CHECK-LABEL: set_lane_64:
+; CHECK: fmov
+; CHECK: ins
+entry:
+  %0 = bitcast half %b to i16
+  %1 = bitcast <4 x half> %a to <4 x i16>
+  %vset_lane = insertelement <4 x i16> %1, i16 %0, i32 2
+  %2 = bitcast <4 x i16> %vset_lane to <4 x half>
+  ret <4 x half> %2
+}
+
+
+; float16x8_t set_lane_128(float16x8_t a, __fp16 b) { return vsetq_lane_f16(b, a, 2); }
+define <8 x half> @set_lane_128(<8 x half> %a, half %b) #0 {
+; CHECK-LABEL: set_lane_128:
+; CHECK: fmov
+; CHECK: ins
+entry:
+  %0 = bitcast half %b to i16
+  %1 = bitcast <8 x half> %a to <8 x i16>
+  %vset_lane = insertelement <8 x i16> %1, i16 %0, i32 2
+  %2 = bitcast <8 x i16> %vset_lane to <8 x half>
+  ret <8 x half> %2
+}
+
+; __fp16 get_lane_64(float16x4_t a) { return vget_lane_f16(a, 2); }
+define half @get_lane_64(<4 x half> %a) #0 {
+; CHECK-LABEL: get_lane_64:
+; CHECK: umov
+; CHECK: fmov
+entry:
+  %0 = bitcast <4 x half> %a to <4 x i16>
+  %vget_lane = extractelement <4 x i16> %0, i32 2
+  %1 = bitcast i16 %vget_lane to half
+  ret half %1
+}
+
+; __fp16 get_lane_128(float16x8_t a) { return vgetq_lane_f16(a, 2); }
+define half @get_lane_128(<8 x half> %a) #0 {
+; CHECK-LABEL: get_lane_128:
+; CHECK: umov
+; CHECK: fmov
+entry:
+  %0 = bitcast <8 x half> %a to <8 x i16>
+  %vgetq_lane = extractelement <8 x i16> %0, i32 2
+  %1 = bitcast i16 %vgetq_lane to half
+  ret half %1
+}

diff --git a/test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll b/test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll
new file mode 100644
index 0000000..56e0b4a
--- /dev/null
+++ b/test/CodeGen/AArch64/fpconv-vector-op-scalarize.ll

@@ -0,0 +1,44 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; PR20778
+; Check that the legalizer doesn't crash when scalarizing FP conversion
+; instructions' operands.  The operands are all illegal on AArch64,
+; ensuring they are legalized.  The results are all legal.
+
+define <1 x double> @test_sitofp(<1 x i1> %in) {
+; CHECK-LABEL: test_sitofp:
+; CHECK:       sbfx  [[GPR:w[0-9]+]], w0, #0, #1
+; CHECK-NEXT:  scvtf d0, [[GPR]]
+; CHECK-NEXT:  ret
+entry:
+  %0 = sitofp <1 x i1> %in to <1 x double>
+  ret <1 x double> %0
+}
+
+define <1 x double> @test_uitofp(<1 x i1> %in) {
+; CHECK-LABEL: test_uitofp:
+; CHECK:       and   [[GPR:w[0-9]+]], w0, #0x1
+; CHECK-NEXT:  ucvtf d0, [[GPR]]
+; CHECK-NEXT:  ret
+entry:
+  %0 = uitofp <1 x i1> %in to <1 x double>
+  ret <1 x double> %0
+}
+
+define <1 x i64> @test_fptosi(<1 x fp128> %in) {
+; CHECK-LABEL: test_fptosi:
+; CHECK:       bl    ___fixtfdi
+; CHECK-NEXT:  fmov  d0, x0
+entry:
+  %0 = fptosi <1 x fp128> %in to <1 x i64>
+  ret <1 x i64> %0
+}
+
+define <1 x i64> @test_fptoui(<1 x fp128> %in) {
+; CHECK-LABEL: test_fptoui:
+; CHECK:       bl    ___fixunstfdi
+; CHECK-NEXT:  fmov  d0, x0
+entry:
+  %0 = fptoui <1 x fp128> %in to <1 x i64>
+  ret <1 x i64> %0
+}

diff --git a/test/CodeGen/AArch64/frameaddr.ll b/test/CodeGen/AArch64/frameaddr.ll
index 85d95e2..d6bb50e 100644
--- a/test/CodeGen/AArch64/frameaddr.ll
+++ b/test/CodeGen/AArch64/frameaddr.ll

@@ -1,20 +1,29 @@
-; RUN: llc -o - %s -mtriple=arm64-apple-ios7.0  | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin                             -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort -verify-machineinstrs < %s | FileCheck %s
 
-define i8* @t() nounwind {
+define i8* @test_frameaddress0() nounwind {
 entry:
-; CHECK-LABEL: t:
+; CHECK-LABEL: test_frameaddress0:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
 ; CHECK: mov x0, x29
-	%0 = call i8* @llvm.frameaddress(i32 0)
-        ret i8* %0
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+  %0 = call i8* @llvm.frameaddress(i32 0)
+  ret i8* %0
 }
 
-define i8* @t2() nounwind {
+define i8* @test_frameaddress2() nounwind {
 entry:
-; CHECK-LABEL: t2:
+; CHECK-LABEL: test_frameaddress2:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
 ; CHECK: ldr x[[reg:[0-9]+]], [x29]
-; CHECK: ldr {{x[0-9]+}}, [x[[reg]]]
-	%0 = call i8* @llvm.frameaddress(i32 2)
-        ret i8* %0
+; CHECK: ldr x0, [x[[reg]]]
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+  %0 = call i8* @llvm.frameaddress(i32 2)
+  ret i8* %0
 }
 
 declare i8* @llvm.frameaddress(i32) nounwind readnone

diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index 422c576..51979f0 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll

@@ -1,7 +1,7 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
 
 %myStruct = type { i64 , i8, i32 }
 

diff --git a/test/CodeGen/AArch64/half.ll b/test/CodeGen/AArch64/half.ll
new file mode 100644
index 0000000..a46094b
--- /dev/null
+++ b/test/CodeGen/AArch64/half.ll

@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+define void @test_load_store(half* %in, half* %out) {
+; CHECK-LABEL: test_load_store:
+; CHECK: ldr [[TMP:h[0-9]+]], [x0]
+; CHECK: str [[TMP]], [x1]
+  %val = load half* %in
+  store half %val, half* %out
+  ret void
+}
+
+define i16 @test_bitcast_from_half(half* %addr) {
+; CHECK-LABEL: test_bitcast_from_half:
+; CHECK: ldrh w0, [x0]
+  %val = load half* %addr
+  %val_int = bitcast half %val to i16
+  ret i16 %val_int
+}
+
+define i16 @test_reg_bitcast_from_half(half %in) {
+; CHECK-LABEL: test_reg_bitcast_from_half:
+; CHECK-NOT: str
+; CHECK-NOT: ldr
+; CHECK-DAG: fmov w0, s0
+; CHECK: ret
+  %val = bitcast half %in to i16
+  ret i16 %val
+}
+
+define void @test_bitcast_to_half(half* %addr, i16 %in) {
+; CHECK-LABEL: test_bitcast_to_half:
+; CHECK: strh w1, [x0]
+  %val_fp = bitcast i16 %in to half
+  store half %val_fp, half* %addr
+  ret void
+}
+
+define half @test_reg_bitcast_to_half(i16 %in) {
+; CHECK-LABEL: test_reg_bitcast_to_half:
+; CHECK-NOT: str
+; CHECK-NOT: ldr
+; CHECK-DAG: fmov s0, w0
+; CHECK: ret
+
+  %val = bitcast i16 %in to half
+  ret half %val
+}
+
+define float @test_extend32(half* %addr) {
+; CHECK-LABEL: test_extend32:
+; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}}
+
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to float
+  ret float %val32
+}
+
+define double @test_extend64(half* %addr) {
+; CHECK-LABEL: test_extend64:
+; CHECK: fcvt {{d[0-9]+}}, {{h[0-9]+}}
+; Extending half to double: expect an fcvt from an h to a d register.
+  %val16 = load half* %addr
+  %val64 = fpext half %val16 to double
+  ret double %val64
+}
+
+define void @test_trunc32(float %in, half* %addr) {
+; CHECK-LABEL: test_trunc32:
+; CHECK: fcvt {{h[0-9]+}}, {{s[0-9]+}}
+
+  %val16 = fptrunc float %in to half
+  store half %val16, half* %addr
+  ret void
+}
+
+define void @test_trunc64(double %in, half* %addr) {
+; CHECK-LABEL: test_trunc64:
+; CHECK: fcvt {{h[0-9]+}}, {{d[0-9]+}}
+
+  %val16 = fptrunc double %in to half
+  store half %val16, half* %addr
+  ret void
+}

diff --git a/test/CodeGen/AArch64/hints.ll b/test/CodeGen/AArch64/hints.ll
new file mode 100644
index 0000000..d7d9e23
--- /dev/null
+++ b/test/CodeGen/AArch64/hints.ll

@@ -0,0 +1,67 @@
+; RUN: llc -mtriple aarch64-eabi -o - %s | FileCheck %s
+
+declare void @llvm.aarch64.hint(i32) nounwind
+
+define void @hint_nop() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 0) nounwind
+  ret void
+}
+; Hint 0 is expected to print as nop; match the full line, not the symbol name.
+; CHECK-LABEL: hint_nop:
+; CHECK: {{^[[:space:]]+}}nop{{$}}
+
+define void @hint_yield() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 1) nounwind
+  ret void
+}
+; Hint 1 is expected to print as yield; match the full line, not the symbol name.
+; CHECK-LABEL: hint_yield:
+; CHECK: {{^[[:space:]]+}}yield{{$}}
+
+define void @hint_wfe() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 2) nounwind
+  ret void
+}
+; Hint 2 is expected to print as wfe; match the full line, not the symbol name.
+; CHECK-LABEL: hint_wfe:
+; CHECK: {{^[[:space:]]+}}wfe{{$}}
+
+define void @hint_wfi() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 3) nounwind
+  ret void
+}
+; Hint 3 is expected to print as wfi; match the full line, not the symbol name.
+; CHECK-LABEL: hint_wfi:
+; CHECK: {{^[[:space:]]+}}wfi{{$}}
+
+define void @hint_sev() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 4) nounwind
+  ret void
+}
+; Hint 4 is expected to print as sev; the end anchor keeps sevl from matching.
+; CHECK-LABEL: hint_sev:
+; CHECK: {{^[[:space:]]+}}sev{{$}}
+
+define void @hint_sevl() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 5) nounwind
+  ret void
+}
+; Hint 5 is expected to print as sevl; match the full line, not the symbol name.
+; CHECK-LABEL: hint_sevl:
+; CHECK: {{^[[:space:]]+}}sevl{{$}}
+
+define void @hint_undefined() {
+entry:
+  tail call void @llvm.aarch64.hint(i32 8) nounwind
+  ret void
+}
+; Hint 8 is expected to print in the generic hint #imm form.
+; CHECK-LABEL: hint_undefined:
+; CHECK: hint #0x8
+

diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll
index f47b490..a275e7e 100644
--- a/test/CodeGen/AArch64/init-array.ll
+++ b/test/CodeGen/AArch64/init-array.ll

@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -o - %s | FileCheck %s
 
 define internal void @_GLOBAL__I_a() section ".text.startup" {
   ret void

diff --git a/test/CodeGen/AArch64/intrinsics-memory-barrier.ll b/test/CodeGen/AArch64/intrinsics-memory-barrier.ll
new file mode 100644
index 0000000..09e34ae
--- /dev/null
+++ b/test/CodeGen/AArch64/intrinsics-memory-barrier.ll

@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=aarch64-eabi -O=3 | FileCheck %s
+
+define void @test() {
+  ; CHECK: dmb sy
+  call void @llvm.aarch64.dmb(i32 15)
+  ; CHECK: dmb osh
+  call void @llvm.aarch64.dmb(i32 3)
+  ; CHECK: dsb sy
+  call void @llvm.aarch64.dsb(i32 15)
+  ; CHECK: dsb ishld
+  call void @llvm.aarch64.dsb(i32 9)
+  ; CHECK: isb
+  call void @llvm.aarch64.isb(i32 15)
+  ret void
+}
+
+; The important point is that the compiler must not reorder memory access
+; instructions across the DMB.
+; If it did, the two STRs would be merged into a single STP.
+define void @test_dmb_reordering(i32 %a, i32 %b, i32* %d) {
+  store i32 %a, i32* %d              ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+
+  call void @llvm.aarch64.dmb(i32 15); CHECK: dmb sy
+
+  %d1 = getelementptr i32* %d, i64 1
+  store i32 %b, i32* %d1             ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4]
+
+  ret void
+}
+
+; Similarly for DSB.
+define void @test_dsb_reordering(i32 %a, i32 %b, i32* %d) {
+  store i32 %a, i32* %d              ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+
+  call void @llvm.aarch64.dsb(i32 15); CHECK: dsb sy
+
+  %d1 = getelementptr i32* %d, i64 1
+  store i32 %b, i32* %d1             ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4]
+
+  ret void
+}
+
+; And ISB.
+define void @test_isb_reordering(i32 %a, i32 %b, i32* %d) {
+  store i32 %a, i32* %d              ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+
+  call void @llvm.aarch64.isb(i32 15); CHECK: isb
+
+  %d1 = getelementptr i32* %d, i64 1
+  store i32 %b, i32* %d1             ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #4]
+
+  ret void
+}
+
+declare void @llvm.aarch64.dmb(i32)
+declare void @llvm.aarch64.dsb(i32)
+declare void @llvm.aarch64.isb(i32)

diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 69fbd99..16682e9 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll

@@ -56,10 +56,11 @@
 ; CHECK-NEXT: .xword
 
 ; CHECK-PIC-NOT: .data_region
+; CHECK-PIC-NOT: .LJTI0_0
 ; CHECK-PIC: .LJTI0_0:
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
-; CHECK-PIC-NEXT: .word
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
 ; CHECK-PIC-NOT: .end_data_region

diff --git a/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll b/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll
new file mode 100644
index 0000000..b785a8f
--- /dev/null
+++ b/test/CodeGen/AArch64/legalize-bug-bogus-cpu.ll

@@ -0,0 +1,8 @@
+; RUN: llc -march=aarch64 -mcpu=bogus -o - %s
+
+; Regression test for PR20557: when -mcpu is set to a bogus name, llc used to
+; crash in type legalization.
+define <4 x float> @fneg4(<4 x float> %x) {
+  %sub = fsub <4 x float> zeroinitializer, %x
+  ret <4 x float> %sub
+}

diff --git a/test/CodeGen/AArch64/machine_cse.ll b/test/CodeGen/AArch64/machine_cse.ll
new file mode 100644
index 0000000..bc9ab10
--- /dev/null
+++ b/test/CodeGen/AArch64/machine_cse.ll

@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s
+
+; Declared external so that no optimization can assume the globals' values.
+@a = external global i32
+@b = external global i32
+@c = external global i32
+@d = external global i32
+@e = external global i32
+
+define void @combine-sign-comparisons-by-cse(i32 *%arg) {
+; CHECK: cmp
+; CHECK: b.ge
+; CHECK-NOT: cmp
+; CHECK: b.le
+
+entry:
+  %a = load i32* @a, align 4
+  %b = load i32* @b, align 4
+  %c = load i32* @c, align 4
+  %d = load i32* @d, align 4
+  %e = load i32* @e, align 4
+
+  %cmp = icmp slt i32 %a, %e  ; first of two signed compares of the same (%a, %e) pair
+  br i1 %cmp, label %land.lhs.true, label %lor.lhs.false
+
+land.lhs.true:
+  %cmp1 = icmp eq i32 %b, %c
+  br i1 %cmp1, label %return, label %if.end
+
+lor.lhs.false:
+  %cmp2 = icmp sgt i32 %a, %e  ; same operands as %cmp; no second cmp instruction is expected here
+  br i1 %cmp2, label %land.lhs.true3, label %if.end
+
+land.lhs.true3:
+  %cmp4 = icmp eq i32 %b, %d
+  br i1 %cmp4, label %return, label %if.end
+
+if.end:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 0, %if.end ], [ 1, %land.lhs.true3 ], [ 1, %land.lhs.true ]  ; NOTE(review): %retval.0 has no users in this function
+  store i32 %a, i32 *%arg
+  ret void
+}

diff --git a/test/CodeGen/AArch64/madd-combiner.ll b/test/CodeGen/AArch64/madd-combiner.ll
new file mode 100644
index 0000000..7c9787a
--- /dev/null
+++ b/test/CodeGen/AArch64/madd-combiner.ll

@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=aarch64-apple-darwin            -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+
+; Test that we use the correct register class.
+define i32 @mul_add_imm(i32 %a, i32 %b) {
+; CHECK-LABEL: mul_add_imm
+; CHECK:       orr [[REG:w[0-9]+]], wzr, #0x4
+; CHECK-NEXT:  madd  {{w[0-9]+}}, w0, w1, [[REG]]
+  %1 = mul i32 %a, %b
+  %2 = add i32 %1, 4  ; a*b + 4: the constant addend is expected in a register feeding madd
+  ret i32 %2
+}
+
+define i32 @mul_sub_imm1(i32 %a, i32 %b) {
+; CHECK-LABEL: mul_sub_imm1
+; CHECK:       orr [[REG:w[0-9]+]], wzr, #0x4
+; CHECK-NEXT:  msub  {{w[0-9]+}}, w0, w1, [[REG]]
+  %1 = mul i32 %a, %b
+  %2 = sub i32 4, %1  ; 4 - a*b: the constant minuend is expected in a register feeding msub
+  ret i32 %2
+}
+
+; Bugpoint-reduced test case. No output patterns: it only has to pass the MI verifier.
+define void @mul_add_imm2() {
+entry:
+  br label %for.body
+for.body:
+  br i1 undef, label %for.body, label %for.body8
+for.body8:
+  %0 = mul i64 undef, -3
+  %mul1971 = add i64 %0, -3  ; 64-bit mul feeding an add of a negative constant
+  %cmp7 = icmp slt i64 %mul1971, 1390451930000
+  br i1 %cmp7, label %for.body8, label %for.end20
+for.end20:
+  ret void
+}
+

diff --git a/test/CodeGen/AArch64/madd-lohi.ll b/test/CodeGen/AArch64/madd-lohi.ll
new file mode 100644
index 0000000..550a8cb
--- /dev/null
+++ b/test/CodeGen/AArch64/madd-lohi.ll

@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
+
+define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
+; CHECK-LABEL: test_128bitmul:
+; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
+; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]]
+; CHECK: madd x1, x1, x2, [[PART1]]
+; CHECK: mul x0, x0, x2
+
+; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
+; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]]
+; CHECK-BE: madd x0, x0, x3, [[PART1]]
+; CHECK-BE: mul x1, x1, x3
+
+  %prod = mul i128 %lhs, %rhs  ; expected as umulh + two madd + mul; the big-endian run uses x1/x3 where little-endian uses x0/x2
+  ret i128 %prod
+}

diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll
index 0689fbd..4515697 100644
--- a/test/CodeGen/AArch64/mul-lohi.ll
+++ b/test/CodeGen/AArch64/mul-lohi.ll

@@ -1,17 +1,16 @@
-; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
-; RUN: llc -mtriple=arm64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
-
+; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - | FileCheck --check-prefix=CHECK-BE %s
 define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: test_128bitmul:
+; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
 ; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
-; CHECK-DAG: madd [[PART1:x[0-9]+]], x0, x3, [[CARRY]]
-; CHECK: madd x1, x1, x2, [[PART1]]
+; CHECK: mul [[PART2:x[0-9]+]], x1, x2
 ; CHECK: mul x0, x0, x2
 
 ; CHECK-BE-LABEL: test_128bitmul:
+; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2
 ; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
-; CHECK-BE-DAG: madd [[PART1:x[0-9]+]], x1, x2, [[CARRY]]
-; CHECK-BE: madd x0, x0, x3, [[PART1]]
+; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3
 ; CHECK-BE: mul x1, x1, x3
 
   %prod = mul i128 %lhs, %rhs

diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
index 4f8571d..41e391d 100644
--- a/test/CodeGen/AArch64/neon-perm.ll
+++ b/test/CodeGen/AArch64/neon-perm.ll

@@ -1387,6 +1387,13 @@
   ret <8 x i16> %shuffle.i
 }
 
+define <4 x i8> @test_vzip1_v4i8(<8 x i8> %p) {
+; CHECK-LABEL: test_vzip1_v4i8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %lo = shufflevector <8 x i8> %p, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; keeps lanes 0-3 of %p as a <4 x i8>
+ ret <4 x i8> %lo
+}
+
 define <8 x i8> @test_same_vzip2_s8(<8 x i8> %a) {
 ; CHECK-LABEL: test_same_vzip2_s8:
 ; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b

diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
index a01df32..6afac31 100644
--- a/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll

@@ -101,3 +101,20 @@
   ret <1 x i64> %vset_lane
 }
 
+; Undefined behaviour, so we really don't care what actually gets emitted, just
+; as long as we don't crash (since it could be dynamically unreachable).
+define i32 @test_out_of_range_extract(<4 x i32> %vec) {
+; CHECK-LABEL: test_out_of_range_extract:
+; CHECK: ret
+  %elt = extractelement <4 x i32> %vec, i32 4  ; index 4 is one past the last lane (valid lanes are 0-3)
+  ret i32 %elt
+}
+
+; Undefined behaviour, so we really don't care what actually gets emitted, just
+; as long as we don't crash (since it could be dynamically unreachable).
+define void @test_out_of_range_insert(<4 x i32> %vec, i32 %elt) {
+; CHECK-LABEL: test_out_of_range_insert:
+; CHECK: ret
+  insertelement <4 x i32> %vec, i32 %elt, i32 4  ; index 4 is one past the last lane; the result is unnamed and unused
+  ret void
+}

diff --git a/test/CodeGen/AArch64/paired-load.ll b/test/CodeGen/AArch64/paired-load.ll
new file mode 100644
index 0000000..3dddb9e
--- /dev/null
+++ b/test/CodeGen/AArch64/paired-load.ll

@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnu"
+
+; Ensure we're generating ldp instructions instead of ldr Q.
+; CHECK: ldp
+; CHECK: stp
+define void @f(i64* %p, i64* %q) {
+  %addr2 = getelementptr i64* %q, i32 1
+  %addr = getelementptr i64* %p, i32 1
+  %x = load i64* %p  ; p[0]
+  %y = load i64* %addr  ; p[1], adjacent to %x, so the two loads can pair
+  store i64 %x, i64* %q  ; q[0]
+  store i64 %y, i64* %addr2  ; q[1], adjacent to the previous store
+  ret void
+}

diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll
index e8c7625..93ee0e6 100644
--- a/test/CodeGen/AArch64/pic-eh-stubs.ll
+++ b/test/CodeGen/AArch64/pic-eh-stubs.ll

@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
-; RUN: llc -mtriple=arm64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
 
 ; Make sure exception-handling PIC code can be linked correctly. An alternative
 ; to the sequence described below would have .gcc_except_table itself writable

diff --git a/test/CodeGen/AArch64/postra-mi-sched.ll b/test/CodeGen/AArch64/postra-mi-sched.ll
new file mode 100644
index 0000000..5a40724
--- /dev/null
+++ b/test/CodeGen/AArch64/postra-mi-sched.ll

@@ -0,0 +1,31 @@
+; RUN: llc < %s -O3 -march=aarch64 -mcpu=cortex-a53 | FileCheck %s
+
+; With cortex-a53, each of fmul and fcvt have latency of 6 cycles.  After the
+; pre-RA MI scheduler, fmul, fcvt and fdiv will be consecutive.  The top-down
+; post-RA MI scheduler will clean this up.
+
+@d1 = common global double 0.000000e+00, align 8
+
+define i32 @test1(float %s2, float %s3, double %d, i32 %i2, i32 %i3) {
+entry:
+; CHECK-LABEL: @test1
+; CHECK: fmul
+; CHECK-NEXT: add
+; CHECK: fcvt
+; CHECK-NEXT: mul
+  %mul = fmul float %s2, %s3  ; fmul -> fpext -> fdiv is one dependent floating-point chain
+  %conv = fpext float %mul to double
+  %div = fdiv double %d, %conv
+  store double %div, double* @d1, align 8
+  %factor = shl i32 %i3, 1  ; from here on: integer work that does not depend on the FP chain above
+  %add1 = add i32 %i2, 4
+  %add2 = add i32 %add1, %factor
+  %add3 = add nsw i32 %add2, %i2
+  %add4 = add nsw i32 %add3, %add2
+  %mul5 = mul i32 %add3, %add3
+  %mul6 = mul i32 %mul5, %add4
+  %mul7 = shl i32 %add4, 1
+  %factor18 = mul i32 %mul7, %mul6
+  %add9 = add i32 %factor18, %mul6
+  ret i32 %add9
+}

diff --git a/test/CodeGen/AArch64/rbit.ll b/test/CodeGen/AArch64/rbit.ll
new file mode 100644
index 0000000..3404ae4
--- /dev/null
+++ b/test/CodeGen/AArch64/rbit.ll

@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
+
+; CHECK-LABEL: rbit32
+; CHECK: rbit w0, w0
+define i32 @rbit32(i32 %t) {
+entry:
+  %rbit.i = call i32 @llvm.aarch64.rbit.i32(i32 %t)  ; expected to lower to "rbit w0, w0"
+  ret i32 %rbit.i
+}
+
+; CHECK-LABEL: rbit64
+; CHECK: rbit x0, x0
+define i64 @rbit64(i64 %t) {
+entry:
+  %rbit.i = call i64 @llvm.aarch64.rbit.i64(i64 %t)  ; expected to lower to "rbit x0, x0"
+  ret i64 %rbit.i
+}
+
+declare i64 @llvm.aarch64.rbit.i64(i64)
+declare i32 @llvm.aarch64.rbit.i32(i32)

diff --git a/test/CodeGen/AArch64/rm_redundant_cmp.ll b/test/CodeGen/AArch64/rm_redundant_cmp.ll
new file mode 100644
index 0000000..36dc118
--- /dev/null
+++ b/test/CodeGen/AArch64/rm_redundant_cmp.ll

@@ -0,0 +1,254 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s
+
+; The following cases are for i16
+
+%struct.s_signed_i16 = type { i16, i16, i16 }
+%struct.s_unsigned_i16 = type { i16, i16, i16 }
+
+@cost_s_i8_i16 = common global %struct.s_signed_i16 zeroinitializer, align 2
+@cost_u_i16 = common global %struct.s_unsigned_i16 zeroinitializer, align 2
+
+define void @test_i16_2cmp_signed_1() {
+; CHECK-LABEL: test_i16_2cmp_signed_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.gt
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+  %0 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2
+  %1 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2
+  %cmp = icmp sgt i16 %0, %1  ; signed compare of struct fields 1 and 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i16 %0, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp eq i16 %0, %1  ; same operands as %cmp: no second cmp instruction is expected
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i16 %0, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i16_2cmp_signed_2() {
+; CHECK-LABEL: test_i16_2cmp_signed_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.le
+; CHECK-NOT: cmp
+; CHECK: b.ge
+entry:
+  %0 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2
+  %1 = load i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2
+  %cmp = icmp sgt i16 %0, %1  ; signed compare of struct fields 1 and 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i16 %0, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp slt i16 %0, %1  ; same operands as %cmp: no second cmp instruction is expected
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i16 %1, i16* getelementptr inbounds (%struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i16_2cmp_unsigned_1() {
+; CHECK-LABEL: test_i16_2cmp_unsigned_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.hi
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+  %0 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2
+  %1 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2
+  %cmp = icmp ugt i16 %0, %1  ; unsigned compare of struct fields 1 and 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i16 %0, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp eq i16 %0, %1  ; same operands as %cmp: no second cmp instruction is expected
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i16 %0, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i16_2cmp_unsigned_2() {
+; CHECK-LABEL: test_i16_2cmp_unsigned_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.ls
+; CHECK-NOT: cmp
+; CHECK: b.hs
+entry:
+  %0 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2
+  %1 = load i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2
+  %cmp = icmp ugt i16 %0, %1  ; unsigned compare of struct fields 1 and 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i16 %0, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp ult i16 %0, %1  ; same operands as %cmp: no second cmp instruction is expected
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i16 %1, i16* getelementptr inbounds (%struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+; The following cases are for i8
+
+%struct.s_signed_i8 = type { i8, i8, i8 }
+%struct.s_unsigned_i8 = type { i8, i8, i8 }
+
+@cost_s = common global %struct.s_signed_i8 zeroinitializer, align 2
+@cost_u_i8 = common global %struct.s_unsigned_i8 zeroinitializer, align 2
+
+
+define void @test_i8_2cmp_signed_1() {
+; CHECK-LABEL: test_i8_2cmp_signed_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.gt
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+  %0 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2
+  %1 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2
+  %cmp = icmp sgt i8 %0, %1  ; signed compare of struct fields 1 and 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i8 %0, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp eq i8 %0, %1  ; same operands as %cmp: no second cmp instruction is expected
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i8 %0, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i8_2cmp_signed_2() {
+; CHECK-LABEL: test_i8_2cmp_signed_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.le
+; CHECK-NOT: cmp
+; CHECK: b.ge
+entry:
+  %0 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2
+  %1 = load i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2
+  %cmp = icmp sgt i8 %0, %1  ; signed compare of struct fields 1 and 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i8 %0, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp slt i8 %0, %1  ; same operands as %cmp: no second cmp instruction is expected
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i8 %1, i8* getelementptr inbounds (%struct.s_signed_i8* @cost_s, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i8_2cmp_unsigned_1() {
+; CHECK-LABEL: test_i8_2cmp_unsigned_1
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.hi
+; CHECK-NOT: cmp
+; CHECK: b.ne
+entry:
+  %0 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2
+  %1 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2
+  %cmp = icmp ugt i8 %0, %1  ; unsigned compare of struct fields 1 and 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i8 %0, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp eq i8 %0, %1  ; same operands as %cmp: no second cmp instruction is expected
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i8 %0, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+define void @test_i8_2cmp_unsigned_2() {
+; CHECK-LABEL: test_i8_2cmp_unsigned_2
+; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NEXT: b.ls
+; CHECK-NOT: cmp
+; CHECK: b.hs
+entry:
+  %0 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2
+  %1 = load i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2
+  %cmp = icmp ugt i8 %0, %1  ; unsigned compare of struct fields 1 and 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  store i8 %0, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.else:                                          ; preds = %entry
+  %cmp5 = icmp ult i8 %0, %1  ; same operands as %cmp: no second cmp instruction is expected
+  br i1 %cmp5, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.else
+  store i8 %1, i8* getelementptr inbounds (%struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 0), align 2
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.else, %if.then7, %if.then
+  ret void
+}
+
+; Make sure the case below does not crash (its output is not pattern-matched).
+
+; The ZERO_EXTEND/SIGN_EXTEND optimization in the type legalization stage must
+; not assert that the operand of a set_cc is always a TRUNCATE.
+
+define i1 @foo(float %inl, float %inr) {
+  %lval = fptosi float %inl to i8  ; i8 compare operands come from fptosi, not from a trunc
+  %rval = fptosi float %inr to i8
+  %sum = icmp eq i8 %lval, %rval
+  ret i1 %sum
+}

diff --git a/test/CodeGen/AArch64/sdivpow2.ll b/test/CodeGen/AArch64/sdivpow2.ll
new file mode 100644
index 0000000..6c02ea9
--- /dev/null
+++ b/test/CodeGen/AArch64/sdivpow2.ll

@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=arm64-linux-gnu -fast-isel=0 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -fast-isel=1 -verify-machineinstrs < %s | FileCheck %s
+
+define i32 @test1(i32 %x) {
+; CHECK-LABEL: test1
+; CHECK: add w8, w0, #7
+; CHECK: cmp w0, #0
+; CHECK: csel w8, w8, w0, lt
+; CHECK: asr w0, w8, #3
+  %div = sdiv i32 %x, 8  ; 8 = 2^3: bias by 7 when %x is negative, then shift right by 3
+  ret i32 %div
+}
+
+define i32 @test2(i32 %x) {
+; CHECK-LABEL: test2
+; CHECK: add w8, w0, #7
+; CHECK: cmp w0, #0
+; CHECK: csel w8, w8, w0, lt
+; CHECK: neg w0, w8, asr #3
+  %div = sdiv i32 %x, -8  ; negative power of two: same bias as /8, with the shift folded into a neg
+  ret i32 %div
+}
+
+define i32 @test3(i32 %x) {
+; CHECK-LABEL: test3
+; CHECK: add w8, w0, #31
+; CHECK: cmp w0, #0
+; CHECK: csel w8, w8, w0, lt
+; CHECK: asr w0, w8, #5
+  %div = sdiv i32 %x, 32  ; 32 = 2^5: bias by 31 when %x is negative, then shift right by 5
+  ret i32 %div
+}
+
+define i64 @test4(i64 %x) {
+; CHECK-LABEL: test4
+; CHECK: add x8, x0, #7
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: asr x0, x8, #3
+  %div = sdiv i64 %x, 8  ; 64-bit variant of the /8 case: bias by 7, shift right by 3
+  ret i64 %div
+}
+
+define i64 @test5(i64 %x) {
+; CHECK-LABEL: test5
+; CHECK: add x8, x0, #7
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: neg x0, x8, asr #3
+  %div = sdiv i64 %x, -8  ; 64-bit variant of the /-8 case: shift folded into a neg
+  ret i64 %div
+}
+
+define i64 @test6(i64 %x) {
+; CHECK-LABEL: test6
+; CHECK: add x8, x0, #63
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: asr x0, x8, #6
+  %div = sdiv i64 %x, 64  ; 64 = 2^6: bias by 63 when %x is negative, then shift right by 6
+  ret i64 %div
+}
+
+define i64 @test7(i64 %x) {
+; CHECK-LABEL: test7
+; CHECK: orr [[REG:x[0-9]+]], xzr, #0xffffffffffff
+; CHECK: add x8, x0, [[REG]]
+; CHECK: cmp x0, #0
+; CHECK: csel x8, x8, x0, lt
+; CHECK: asr x0, x8, #48
+  %div = sdiv i64 %x, 281474976710656  ; 2^48: the bias 2^48-1 is expected in a register (orr from xzr) before the add
+  ret i64 %div
+}
+

diff --git a/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll b/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll
new file mode 100644
index 0000000..bedbf5f
--- /dev/null
+++ b/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll

@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=pic | FileCheck %s
+
+@__stack_chk_guard = external global i64*
+
+; PR20558
+
+; CHECK: adrp [[R0:x[0-9]+]], ___stack_chk_guard@GOTPAGE
+; CHECK: ldr  [[R1:x[0-9]+]], {{\[}}[[R0]], ___stack_chk_guard@GOTPAGEOFF{{\]}}
+; CHECK: ldr  [[R2:x[0-9]+]], {{\[}}[[R1]]{{\]}}
+; CHECK: stur [[R2]], {{\[}}x29, [[SLOT0:[0-9#\-]+]]{{\]}}
+; CHECK: ldur [[R3:x[0-9]+]], {{\[}}x29, [[SLOT0]]{{\]}}
+; CHECK: sub  [[R4:x[0-9]+]], [[R2]], [[R3]]
+; CHECK: cbnz [[R4]], LBB
+
+define i32 @test_stack_guard_remat2() {
+entry:
+  %StackGuardSlot = alloca i8*
+  %StackGuard = load i8** bitcast (i64** @__stack_chk_guard to i8**)  ; guard is read through a bitcast constant expression
+  call void @llvm.stackprotector(i8* %StackGuard, i8** %StackGuardSlot)
+  %container = alloca [32 x i8], align 1
+  call void @llvm.stackprotectorcheck(i8** bitcast (i64** @__stack_chk_guard to i8**))  ; same bitcast form on the checking side
+  ret i32 -1
+}
+
+declare void @llvm.stackprotector(i8*, i8**)
+declare void @llvm.stackprotectorcheck(i8**)

diff --git a/test/CodeGen/AArch64/stack_guard_remat.ll b/test/CodeGen/AArch64/stack_guard_remat.ll
new file mode 100644
index 0000000..cee7266
--- /dev/null
+++ b/test/CodeGen/AArch64/stack_guard_remat.ll

@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=arm64-apple-ios -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC-LINUX
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -relocation-model=static -code-model=large -no-integrated-as | FileCheck %s -check-prefix=STATIC-LARGE
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -relocation-model=static -code-model=small -no-integrated-as | FileCheck %s -check-prefix=STATIC-SMALL
+
+; DARWIN: foo2
+; DARWIN: adrp [[R0:x[0-9]+]], ___stack_chk_guard@GOTPAGE
+; DARWIN: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], ___stack_chk_guard@GOTPAGEOFF{{\]}}
+; DARWIN: ldr {{x[0-9]+}}, {{\[}}[[R1]]{{\]}}
+
+; PIC-LINUX: foo2
+; PIC-LINUX: adrp [[R0:x[0-9]+]], :got:__stack_chk_guard
+; PIC-LINUX: ldr [[R1:x[0-9]+]], {{\[}}[[R0]], :got_lo12:__stack_chk_guard{{\]}}
+; PIC-LINUX: ldr {{x[0-9]+}}, {{\[}}[[R1]]{{\]}}
+
+; STATIC-LARGE: foo2
+; STATIC-LARGE: movz [[R0:x[0-9]+]], #:abs_g3:__stack_chk_guard
+; STATIC-LARGE: movk [[R0]], #:abs_g2_nc:__stack_chk_guard
+; STATIC-LARGE: movk [[R0]], #:abs_g1_nc:__stack_chk_guard
+; STATIC-LARGE: movk [[R0]], #:abs_g0_nc:__stack_chk_guard
+; STATIC-LARGE: ldr {{x[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+; STATIC-SMALL: foo2
+; STATIC-SMALL: adrp [[R0:x[0-9]+]], __stack_chk_guard
+; STATIC-SMALL: ldr {{x[0-9]+}}, {{\[}}[[R0]], :lo12:__stack_chk_guard{{\]}}
+
+define i32 @test_stack_guard_remat() #0 {
+entry:
+  %a1 = alloca [256 x i32], align 4
+  %0 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %0)
+  %arraydecay = getelementptr inbounds [256 x i32]* %a1, i64 0, i64 0
+  call void @foo3(i32* %arraydecay)
+  call void asm sideeffect "foo2", "~{w0},~{w1},~{w2},~{w3},~{w4},~{w5},~{w6},~{w7},~{w8},~{w9},~{w10},~{w11},~{w12},~{w13},~{w14},~{w15},~{w16},~{w17},~{w18},~{w19},~{w20},~{w21},~{w22},~{w23},~{w24},~{w25},~{w26},~{w27},~{w28},~{w29},~{w30}"()  ; clobbers w0-w30; "foo2" is the marker the expected guard-address sequence must follow (presumably forcing recomputation of the guard address - confirm)
+  call void @llvm.lifetime.end(i64 1024, i8* %0)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll
index 8aab842..7fb3954 100644
--- a/test/CodeGen/AArch64/tail-call.ll
+++ b/test/CodeGen/AArch64/tail-call.ll

@@ -3,6 +3,7 @@
 declare fastcc void @callee_stack0()
 declare fastcc void @callee_stack8([8 x i32], i64)
 declare fastcc void @callee_stack16([8 x i32], i64, i64)
+declare extern_weak fastcc void @callee_weak()
 
 define fastcc void @caller_to0_from0() nounwind {
 ; CHECK-LABEL: caller_to0_from0:
@@ -92,3 +93,13 @@
 ; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: b callee_stack16
 }
+
+
+; Weakly-referenced extern functions cannot be tail-called, as AAELF does
+; not define the behaviour of branch instructions to undefined weak symbols.
+define fastcc void @caller_weak() {
+; CHECK-LABEL: caller_weak:
+; CHECK: bl callee_weak
+  tail call void @callee_weak()  ; expected as bl, not b. NOTE(review): the call omits fastcc although @callee_weak is declared fastcc - confirm this is intended
+  ret void
+}

diff --git a/test/CodeGen/AArch64/tailcall-fastisel.ll b/test/CodeGen/AArch64/tailcall-fastisel.ll
new file mode 100644
index 0000000..3ba6391
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-fastisel.ll

@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -O0 | FileCheck %s
+
+; CHECK: b _foo0
+
+define i32 @foo1() {
+entry:
+  %call = tail call i32 @foo0()  ; even at -O0 this is expected to be emitted as "b _foo0"
+  ret i32 %call
+}
+
+declare i32 @foo0()

diff --git a/test/CodeGen/AArch64/tbz-tbnz.ll b/test/CodeGen/AArch64/tbz-tbnz.ll
new file mode 100644
index 0000000..c77043c
--- /dev/null
+++ b/test/CodeGen/AArch64/tbz-tbnz.ll

@@ -0,0 +1,258 @@
+; RUN: llc -O1 -march=aarch64 < %s | FileCheck %s
+
+declare void @t()
+
+define void @test1(i32 %a) {
+; CHECK-LABEL: @test1
+entry:
+  %sub = add nsw i32 %a, -12
+  %cmp = icmp slt i32 %sub, 0  ; true exactly when bit 31 of %sub is set
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbz [[CMP]], #31
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test2(i64 %a) {
+; CHECK-LABEL: @test2
+entry:
+  %sub = add nsw i64 %a, -12
+  %cmp = icmp slt i64 %sub, 0  ; true exactly when bit 63 of %sub is set
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:x[0-9]+]], x0, #12
+; CHECK: tbz [[CMP]], #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test3(i32 %a) {
+; CHECK-LABEL: @test3
+entry:
+  %sub = add nsw i32 %a, -12
+  %cmp = icmp sgt i32 %sub, -1  ; true exactly when bit 31 of %sub is clear
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbnz [[CMP]], #31
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test4(i64 %a) {
+; CHECK-LABEL: @test4
+entry:
+  %sub = add nsw i64 %a, -12
+  %cmp = icmp sgt i64 %sub, -1  ; true exactly when bit 63 of %sub is clear
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:x[0-9]+]], x0, #12
+; CHECK: tbnz [[CMP]], #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test5(i32 %a) {
+; CHECK-LABEL: @test5
+entry:
+  %sub = add nsw i32 %a, -12
+  %cmp = icmp sge i32 %sub, 0  ; same predicate as "sgt -1": bit 31 of %sub is clear
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbnz [[CMP]], #31
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test6(i64 %a) {
+; CHECK-LABEL: @test6
+entry:
+  %sub = add nsw i64 %a, -12
+  %cmp = icmp sge i64 %sub, 0  ; same predicate as "sgt -1": bit 63 of %sub is clear
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:x[0-9]+]], x0, #12
+; CHECK: tbnz [[CMP]], #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test7(i32 %a) {
+; CHECK-LABEL: @test7
+entry:
+  %sub = sub nsw i32 %a, 12  ; explicit sub form of %a - 12
+  %cmp = icmp slt i32 %sub, 0  ; true exactly when bit 31 of %sub is set
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK: sub [[CMP:w[0-9]+]], w0, #12
+; CHECK: tbz [[CMP]], #31
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test8(i64 %val1, i64 %val2, i64 %val3) {
+; CHECK-LABEL: @test8
+  %and1 = and i64 %val1, %val2
+  %tst1 = icmp slt i64 %and1, 0  ; sign test of an and: expected as tst + b.ge rather than a bit test
+  br i1 %tst1, label %if.then1, label %if.end
+
+; CHECK: tst x0, x1
+; CHECK-NEXT: b.ge
+
+if.then1:
+  %and2 = and i64 %val2, %val3
+  %tst2 = icmp sge i64 %and2, 0  ; expected as and + tbnz #63, with no cmp in between
+  br i1 %tst2, label %if.then2, label %if.end
+
+; CHECK: and [[CMP:x[0-9]+]], x1, x2
+; CHECK-NOT: cmp
+; CHECK: tbnz [[CMP]], #63
+
+if.then2:
+  %shifted_op1 = shl i64 %val2, 63
+  %shifted_and1 = and i64 %val1, %shifted_op1  ; the shift is expected to fold into the tst operand (lsl #63)
+  %tst3 = icmp slt i64 %shifted_and1, 0
+  br i1 %tst3, label %if.then3, label %if.end
+
+; CHECK: tst x0, x1, lsl #63
+; CHECK: b.ge
+
+if.then3:
+  %shifted_op2 = shl i64 %val2, 62
+  %shifted_and2 = and i64 %val1, %shifted_op2  ; the shift is expected to fold into the tst operand (lsl #62)
+  %tst4 = icmp sge i64 %shifted_and2, 0
+  br i1 %tst4, label %if.then4, label %if.end
+
+; CHECK: tst x0, x1, lsl #62
+; CHECK: b.lt
+
+if.then4:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test9(i64 %val1) {
+; CHECK-LABEL: @test9
+  %tst = icmp slt i64 %val1, 0  ; sign test applied directly to the argument (bit 63 of x0)
+  br i1 %tst, label %if.then, label %if.end
+
+; CHECK-NOT: cmp
+; CHECK: tbz x0, #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test10(i64 %val1) {
+; CHECK-LABEL: @test10
+  %tst = icmp slt i64 %val1, 0  ; NOTE(review): body is identical to @test9 - confirm a different pattern was not intended
+  br i1 %tst, label %if.then, label %if.end
+
+; CHECK-NOT: cmp
+; CHECK: tbz x0, #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test11(i64 %val1, i64* %ptr) {
+; CHECK-LABEL: @test11
+
+; CHECK: ldr [[CMP:x[0-9]+]], [x1]
+; CHECK-NOT: cmp
+; CHECK: tbz [[CMP]], #63
+
+  %val = load i64* %ptr  ; the tested value comes from memory; %val1 is not used in this function
+  %tst = icmp slt i64 %val, 0
+  br i1 %tst, label %if.then, label %if.end
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test12(i64 %val1) {
+; CHECK-LABEL: @test12
+  %tst = icmp slt i64 %val1, 0  ; NOTE(review): body is identical to @test9 - confirm a different pattern was not intended
+  br i1 %tst, label %if.then, label %if.end
+
+; CHECK-NOT: cmp
+; CHECK: tbz x0, #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test13(i64 %val1, i64 %val2) {
+; CHECK-LABEL: @test13
+  %or = or i64 %val1, %val2
+  %tst = icmp slt i64 %or, 0  ; sign bit of the or result drives the branch
+  br i1 %tst, label %if.then, label %if.end
+
+; CHECK: orr [[CMP:x[0-9]+]], x0, x1
+; CHECK-NOT: cmp
+; CHECK: tbz [[CMP]], #63
+
+if.then:
+  call void @t()
+  br label %if.end
+
+if.end:
+  ret void
+}

diff --git a/test/CodeGen/AArch64/trunc-v1i64.ll b/test/CodeGen/AArch64/trunc-v1i64.ll
index 159b8e0..19efd2f 100644
--- a/test/CodeGen/AArch64/trunc-v1i64.ll
+++ b/test/CodeGen/AArch64/trunc-v1i64.ll

@@ -60,4 +60,23 @@
   %1 = shufflevector <1 x i64> %in0, <1 x i64> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   %2 = trunc <8 x i64> %1 to <8 x i8>
   ret <8 x i8> %2
-}
\ No newline at end of file
+}
+
+; PR20777: v1i1 is also problematic, but we can't widen it, so we extract_elt
+; the i64 out of the v1i64 operand, and truncate that scalar instead.
+
+define <1 x i1> @test_v1i1_0(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i1_0:
+; CHECK: fmov w0, s0
+  %1 = trunc <1 x i64> %in0 to <1 x i1>
+  ret <1 x i1> %1
+}
+
+define i1 @test_v1i1_1(<1 x i64> %in0) {
+; CHECK-LABEL: test_v1i1_1:
+; CHECK: fmov [[REG:w[0-9]+]], s0
+  %1 = trunc <1 x i64> %in0 to <1 x i1>
+; CHECK: and w0, [[REG]], #0x1
+  %2 = extractelement <1 x i1> %1, i32 0
+  ret i1 %2
+}

diff --git a/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll b/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
index 55cea3a..90a3b37 100644
--- a/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
+++ b/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll

@@ -1,11 +1,14 @@
-; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*baz | count 1
-; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*quux | count 1
+; RUN: llc < %s -enable-tail-merge | FileCheck %s
 ; Check that calls to baz and quux are tail-merged.
 ; PR1628
 
+; CHECK: bl _baz
+; CHECK-NOT: bl _baz
+; CHECK: bl _quux
+; CHECK-NOT: bl _quux
+
 ; ModuleID = 'tail.c'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "i686-apple-darwin8"
+target triple = "arm-apple-darwin8"
 
 define i32 @f(i32 %i, i32 %q) {
 entry:

diff --git a/test/CodeGen/ARM/2009-10-16-Scope.ll b/test/CodeGen/ARM/2009-10-16-Scope.ll
index 570fcf9..b4e758d 100644
--- a/test/CodeGen/ARM/2009-10-16-Scope.ll
+++ b/test/CodeGen/ARM/2009-10-16-Scope.ll

@@ -9,7 +9,7 @@
   br label %do.body, !dbg !0
 
 do.body:                                          ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4)
+  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4, metadata !{metadata !"0x102"})
   %conv = ptrtoint i32* %count_ to i32, !dbg !0   ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0   ; <i32> [#uses=0]
   br label %do.end, !dbg !0
@@ -18,17 +18,17 @@
   ret void, !dbg !7
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @foo(i32) ssp
 
 !0 = metadata !{i32 5, i32 2, metadata !1, null}
-!1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0}; [DW_TAG_subprogram ]
-!3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
-!4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
-!5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!6 = metadata !{i32 458788, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}; [DW_TAG_base_type ]
+!1 = metadata !{metadata !"0xb\001\001\000", null, metadata !2}; [DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, metadata !3, null, null, null, null, null, null}; [DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !8, null, metadata !9, null, null, null}; [DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x100\00count_\005\000", metadata !5, metadata !3, metadata !6}; [ DW_TAG_auto_variable ]
+!5 = metadata !{metadata !"0xb\001\001\000", null, metadata !1}; [DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3}; [DW_TAG_base_type ]
 !7 = metadata !{i32 6, i32 1, metadata !2, null}
 !8 = metadata !{metadata !"genmodes.i", metadata !"/Users/yash/Downloads"}
 !9 = metadata !{i32 0}

diff --git a/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll b/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll
deleted file mode 100644
index 0f021d2..0000000
--- a/test/CodeGen/ARM/2009-10-21-InvalidFNeg.ll
+++ /dev/null

@@ -1,48 +0,0 @@
-; RUN: llc -mcpu=cortex-a8 -mattr=+neon < %s | grep vneg
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "armv7-eabi"
-
-%aaa = type { %fff, %fff }
-%bbb = type { [6 x %ddd] }
-%ccc = type { %eee, %fff }
-%ddd = type { %fff }
-%eee = type { %fff, %fff, %fff, %fff }
-%fff = type { %struct.vec_float4 }
-%struct.vec_float4 = type { <4 x float> }
-
-define linkonce_odr arm_aapcs_vfpcc void @foo(%eee* noalias sret %agg.result, i64 %tfrm.0.0, i64 %tfrm.0.1, i64 %tfrm.0.2, i64 %tfrm.0.3, i64 %tfrm.0.4, i64 %tfrm.0.5, i64 %tfrm.0.6, i64 %tfrm.0.7) nounwind noinline {
-entry:
-  %tmp104 = zext i64 %tfrm.0.2 to i512            ; <i512> [#uses=1]
-  %tmp105 = shl i512 %tmp104, 128                 ; <i512> [#uses=1]
-  %tmp118 = zext i64 %tfrm.0.3 to i512            ; <i512> [#uses=1]
-  %tmp119 = shl i512 %tmp118, 192                 ; <i512> [#uses=1]
-  %ins121 = or i512 %tmp119, %tmp105              ; <i512> [#uses=1]
-  %tmp99 = zext i64 %tfrm.0.4 to i512             ; <i512> [#uses=1]
-  %tmp100 = shl i512 %tmp99, 256                  ; <i512> [#uses=1]
-  %tmp123 = zext i64 %tfrm.0.5 to i512            ; <i512> [#uses=1]
-  %tmp124 = shl i512 %tmp123, 320                 ; <i512> [#uses=1]
-  %tmp96 = zext i64 %tfrm.0.6 to i512             ; <i512> [#uses=1]
-  %tmp97 = shl i512 %tmp96, 384                   ; <i512> [#uses=1]
-  %tmp128 = zext i64 %tfrm.0.7 to i512            ; <i512> [#uses=1]
-  %tmp129 = shl i512 %tmp128, 448                 ; <i512> [#uses=1]
-  %mask.masked = or i512 %tmp124, %tmp100         ; <i512> [#uses=1]
-  %ins131 = or i512 %tmp129, %tmp97               ; <i512> [#uses=1]
-  %tmp109132 = zext i64 %tfrm.0.0 to i128         ; <i128> [#uses=1]
-  %tmp113134 = zext i64 %tfrm.0.1 to i128         ; <i128> [#uses=1]
-  %tmp114133 = shl i128 %tmp113134, 64            ; <i128> [#uses=1]
-  %tmp94 = or i128 %tmp114133, %tmp109132         ; <i128> [#uses=1]
-  %tmp95 = bitcast i128 %tmp94 to <4 x float>     ; <<4 x float>> [#uses=0]
-  %tmp82 = lshr i512 %ins121, 128                 ; <i512> [#uses=1]
-  %tmp83 = trunc i512 %tmp82 to i128              ; <i128> [#uses=1]
-  %tmp84 = bitcast i128 %tmp83 to <4 x float>     ; <<4 x float>> [#uses=0]
-  %tmp86 = lshr i512 %mask.masked, 256            ; <i512> [#uses=1]
-  %tmp87 = trunc i512 %tmp86 to i128              ; <i128> [#uses=1]
-  %tmp88 = bitcast i128 %tmp87 to <4 x float>     ; <<4 x float>> [#uses=0]
-  %tmp90 = lshr i512 %ins131, 384                 ; <i512> [#uses=1]
-  %tmp91 = trunc i512 %tmp90 to i128              ; <i128> [#uses=1]
-  %tmp92 = bitcast i128 %tmp91 to <4 x float>     ; <<4 x float>> [#uses=1]
-  %tmp = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %tmp92 ; <<4 x float>> [#uses=1]
-  %tmp28 = getelementptr inbounds %eee* %agg.result, i32 0, i32 3, i32 0, i32 0 ; <<4 x float>*> [#uses=1]
-  store <4 x float> %tmp, <4 x float>* %tmp28, align 16
-  ret void
-}

diff --git a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
index 35739d7..bce3120 100644
--- a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
+++ b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll

@@ -5,28 +5,28 @@
 
 define hidden i32 @__addvsi3(i32 %a, i32 %b) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %b}, i64 0, metadata !0)
+  tail call void @llvm.dbg.value(metadata !{i32 %b}, i64 0, metadata !0, metadata !{metadata !"0x102"})
   %0 = add nsw i32 %b, %a, !dbg !9                ; <i32> [#uses=1]
   ret i32 %0, !dbg !11
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!15}
-!0 = metadata !{i32 524545, metadata !1, metadata !"b", metadata !2, i32 93, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 524334, metadata !12, null, metadata !"__addvsi3", metadata !"__addvsi3", metadata !"__addvsi3", i32 94, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 524329, metadata !12} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x101\00b\0093\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00__addvsi3\00__addvsi3\00__addvsi3\0094\000\001\000\006\000\000\000", metadata !12, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
 !12 = metadata !{metadata !"libgcc2.c", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc"}
-!3 = metadata !{i32 524305, metadata !12, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", i1 true, metadata !"", i32 0, metadata !13, metadata !13, metadata !14, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !12, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)\001\00\000\00\000", metadata !12, metadata !13, metadata !13, metadata !14, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !6, metadata !6}
-!6 = metadata !{i32 524310, metadata !12, null, metadata !"SItype", i32 152, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 524329, metadata !"libgcc2.h", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc", metadata !3} ; [ DW_TAG_file_type ]
-!8 = metadata !{i32 524324, metadata !12, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x16\00SItype\00152\000\000\000\000", metadata !12, null, metadata !8} ; [ DW_TAG_typedef ]
+!7 = metadata !{metadata !"0x29", metadata !"libgcc2.h", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc", metadata !3} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
 !9 = metadata !{i32 95, i32 0, metadata !10, null}
-!10 = metadata !{i32 524299, metadata !12, metadata !1, i32 94, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0xb\0094\000\000", metadata !12, metadata !1} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{i32 100, i32 0, metadata !10, null}
 !13 = metadata !{i32 0}
 !14 = metadata !{metadata !1}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
index a53200e..efe1ab5 100644
--- a/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll
+++ b/test/CodeGen/ARM/2010-06-25-Thumb2ITInvalidIterator.ll

@@ -7,16 +7,16 @@
 
 define void @x0(i8* nocapture %buf, i32 %nbytes) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %buf}, i64 0, metadata !0), !dbg !15
-  tail call void @llvm.dbg.value(metadata !{i32 %nbytes}, i64 0, metadata !8), !dbg !16
+  tail call void @llvm.dbg.value(metadata !{i8* %buf}, i64 0, metadata !0, metadata !{metadata !"0x102"}), !dbg !15
+  tail call void @llvm.dbg.value(metadata !{i32 %nbytes}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !16
   %tmp = load i32* @length, !dbg !17              ; <i32> [#uses=3]
   %cmp = icmp eq i32 %tmp, -1, !dbg !17           ; <i1> [#uses=1]
   %cmp.not = xor i1 %cmp, true                    ; <i1> [#uses=1]
   %cmp3 = icmp ult i32 %tmp, %nbytes, !dbg !17    ; <i1> [#uses=1]
   %or.cond = and i1 %cmp.not, %cmp3               ; <i1> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{i32 %tmp}, i64 0, metadata !8), !dbg !17
+  tail call void @llvm.dbg.value(metadata !{i32 %tmp}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !17
   %nbytes.addr.0 = select i1 %or.cond, i32 %tmp, i32 %nbytes ; <i32> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !18, i64 0, metadata !10), !dbg !19
+  tail call void @llvm.dbg.value(metadata !18, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !19
   br label %while.cond, !dbg !20
 
 while.cond:                                       ; preds = %while.body, %entry
@@ -42,26 +42,26 @@
 
 declare i32 @x1() optsize
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.lv.fn = !{!0, !8, !10, !12}
 !llvm.dbg.gv = !{!14}
 
-!0 = metadata !{i32 524545, metadata !1, metadata !"buf", metadata !2, i32 4, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 524334, metadata !26, null, metadata !"x0", metadata !"x0", metadata !"x0", i32 5, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 524329, metadata !26} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 524305, i32 0, i32 12, metadata !"t.c", metadata !".", metadata !"clang 2.0", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !26, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!0 = metadata !{metadata !"0x101\00buf\004\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00x0\00x0\00x0\005\000\001\000\006\000\000\000", metadata !26, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00clang 2.0\001\00\00\00\00", metadata !26, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !26, metadata !2, null, metadata !5, null} ; [ DW_TAG_subroutine_type ]
 !5 = metadata !{null}
-!6 = metadata !{i32 524303, metadata !26, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 524324, metadata !26, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 524545, metadata !1, metadata !"nbytes", metadata !2, i32 4, metadata !9} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 524324, metadata !26, metadata !2, metadata !"unsigned long", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 524544, metadata !11, metadata !"nread", metadata !2, i32 6, metadata !9} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 524299, metadata !26, metadata !1, i32 5, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 524544, metadata !11, metadata !"c", metadata !2, i32 7, metadata !13} ; [ DW_TAG_auto_variable ]
-!13 = metadata !{i32 524324, metadata !26, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 524340, i32 0, metadata !2, metadata !"length", metadata !"length", metadata !"length", metadata !2, i32 1, metadata !13, i1 false, i1 true, i32* @length} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !26, metadata !2, metadata !7} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !26, metadata !2} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x101\00nbytes\004\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x24\00unsigned long\000\0032\0032\000\000\007", metadata !26, metadata !2} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x100\00nread\006\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\005\001\000", metadata !26, metadata !1} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{metadata !"0x100\00c\007\000", metadata !11, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !26, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x34\00length\00length\00length\001\000\001", metadata !2, metadata !2, metadata !13, i32* @length} ; [ DW_TAG_variable ]
 !15 = metadata !{i32 4, i32 24, metadata !1, null}
 !16 = metadata !{i32 4, i32 43, metadata !1, null}
 !17 = metadata !{i32 9, i32 2, metadata !11, null}
@@ -69,7 +69,7 @@
 !19 = metadata !{i32 10, i32 2, metadata !11, null}
 !20 = metadata !{i32 11, i32 2, metadata !11, null}
 !21 = metadata !{i32 12, i32 3, metadata !22, null}
-!22 = metadata !{i32 524299, metadata !26, metadata !11, i32 11, i32 45, i32 0} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\0011\0045\000", metadata !26, metadata !11} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{i32 13, i32 3, metadata !22, null}
 !24 = metadata !{i32 14, i32 2, metadata !22, null}
 !25 = metadata !{i32 15, i32 1, metadata !11, null}

diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
index 48de244..f10408c 100644
--- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll

@@ -6,8 +6,8 @@
 define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23), !dbg !24
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25), !dbg !24
+  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !24
   %0 = icmp ne i32 %i, 0, !dbg !27                ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !27
 
@@ -34,7 +34,7 @@
 define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2  {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31), !dbg !34
+  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !34
   %0 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 0, !dbg !34 ; <i8**> [#uses=1]
   store i8* null, i8** %0, align 8, !dbg !34
   %1 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 1, !dbg !34 ; <i32*> [#uses=1]
@@ -45,14 +45,14 @@
   ret void, !dbg !35
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @main() nounwind ssp {
 entry:
   %0 = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=3]
   %v = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=4]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38), !dbg !41
+  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38, metadata !{metadata !"0x102"}), !dbg !41
   call void @_ZN4SValC1Ev(%struct.SVal* %v) nounwind, !dbg !41
   %1 = getelementptr inbounds %struct.SVal* %v, i32 0, i32 1, !dbg !42 ; <i32*> [#uses=1]
   store i32 1, i32* %1, align 8, !dbg !42
@@ -65,65 +65,65 @@
   %7 = load i32* %6, align 8, !dbg !43            ; <i32> [#uses=1]
   store i32 %7, i32* %5, align 8, !dbg !43
   %8 = call i32 @_Z3fooi4SVal(i32 2, %struct.SVal* noalias %0) nounwind, !dbg !43 ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44), !dbg !43
+  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44, metadata !{metadata !"0x102"}), !dbg !43
   br label %return, !dbg !45
 
 return:                                           ; preds = %entry
   ret i32 0, !dbg !45
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!49}
 
-!0 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786451, metadata !48, null, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
-!2 = metadata !{i32 786473, metadata !48} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !48, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !47, metadata !47, metadata !46, metadata !47,  metadata !47, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\000", metadata !48, metadata !1, metadata !14, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x13\00SVal\001\00128\0064\000\000\000", metadata !48, null, null, metadata !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
+!2 = metadata !{metadata !"0x29", metadata !48} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\001", metadata !48, metadata !47, metadata !47, metadata !46, metadata !47,  metadata !47} ; [ DW_TAG_compile_unit ]
 !4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
-!5 = metadata !{i32 786445, metadata !48, metadata !1, metadata !"Data", i32 7, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{i32 786447, metadata !48, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786445, metadata !48, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ]
-!8 = metadata !{i32 786468, metadata !48, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0xd\00Data\007\0064\0064\000\000", metadata !48, metadata !1, metadata !6} ; [ DW_TAG_member ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !48, null, null} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0xd\00Kind\008\0032\0032\0064\000", metadata !48, metadata !1, metadata !8} ; [ DW_TAG_member ]
+!8 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !48, null} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\000", metadata !48, metadata !1, metadata !10, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{null, metadata !12, metadata !13}
-!12 = metadata !{i32 786447, metadata !48, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 786468, metadata !48, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !48, null, metadata !1} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !48, null} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !12}
-!16 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 786478, metadata !48, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!18 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\000", metadata !48, metadata !1, metadata !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\000", metadata !48, metadata !2, metadata !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{metadata !13, metadata !13, metadata !1}
-!20 = metadata !{i32 786478, metadata !48, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!21 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\000", metadata !48, metadata !2, metadata !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !48, null, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{metadata !13}
-!23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{metadata !"0x101\00i\0016\000", metadata !17, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ]
 !24 = metadata !{i32 16, i32 0, metadata !17, null}
-!25 = metadata !{i32 786689, metadata !17, metadata !"location", metadata !2, i32 16, metadata !26, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{i32 786448, metadata !48, metadata !2, metadata !"SVal", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
+!25 = metadata !{metadata !"0x101\00location\0016\000", metadata !17, metadata !2, metadata !26} ; [ DW_TAG_arg_variable ]
+!26 = metadata !{metadata !"0x10\00SVal\000\0064\0064\000\000", metadata !48, metadata !2, metadata !1} ; [ DW_TAG_reference_type ]
 !27 = metadata !{i32 17, i32 0, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !2, metadata !17, i32 16, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0xb\0016\000\002", metadata !2, metadata !17} ; [ DW_TAG_lexical_block ]
 !29 = metadata !{i32 18, i32 0, metadata !28, null}
 !30 = metadata !{i32 20, i32 0, metadata !28, null}
-!31 = metadata !{i32 786689, metadata !16, metadata !"this", metadata !2, i32 11, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!32 = metadata !{i32 786470, metadata !48, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !33} ; [ DW_TAG_const_type ]
-!33 = metadata !{i32 786447, metadata !48, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_pointer_type ]
+!31 = metadata !{metadata !"0x101\00this\0011\000", metadata !16, metadata !2, metadata !32} ; [ DW_TAG_arg_variable ]
+!32 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !48, metadata !2, metadata !33} ; [ DW_TAG_const_type ]
+!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !48, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
 !34 = metadata !{i32 11, i32 0, metadata !16, null}
 !35 = metadata !{i32 11, i32 0, metadata !36, null}
-!36 = metadata !{i32 786443, metadata !48, metadata !37, i32 11, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
-!37 = metadata !{i32 786443, metadata !48, metadata !16, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!38 = metadata !{i32 786688, metadata !39, metadata !"v", metadata !2, i32 24, metadata !1, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!39 = metadata !{i32 786443, metadata !48, metadata !40, i32 23, i32 0, i32 4} ; [ DW_TAG_lexical_block ]
-!40 = metadata !{i32 786443, metadata !48, metadata !20, i32 23, i32 0, i32 3} ; [ DW_TAG_lexical_block ]
+!36 = metadata !{metadata !"0xb\0011\000\001", metadata !48, metadata !37} ; [ DW_TAG_lexical_block ]
+!37 = metadata !{metadata !"0xb\0011\000\000", metadata !48, metadata !16} ; [ DW_TAG_lexical_block ]
+!38 = metadata !{metadata !"0x100\00v\0024\000", metadata !39, metadata !2, metadata !1} ; [ DW_TAG_auto_variable ]
+!39 = metadata !{metadata !"0xb\0023\000\004", metadata !48, metadata !40} ; [ DW_TAG_lexical_block ]
+!40 = metadata !{metadata !"0xb\0023\000\003", metadata !48, metadata !20} ; [ DW_TAG_lexical_block ]
 !41 = metadata !{i32 24, i32 0, metadata !39, null}
 !42 = metadata !{i32 25, i32 0, metadata !39, null}
 !43 = metadata !{i32 26, i32 0, metadata !39, null}
-!44 = metadata !{i32 786688, metadata !39, metadata !"k", metadata !2, i32 26, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
+!44 = metadata !{metadata !"0x100\00k\0026\000", metadata !39, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
 !45 = metadata !{i32 27, i32 0, metadata !39, null}
 !46 = metadata !{metadata !16, metadata !17, metadata !20}
 !47 = metadata !{}
 !48 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/2010-11-15-SpillEarlyClobber.ll b/test/CodeGen/ARM/2010-11-15-SpillEarlyClobber.ll
index ec74880..80a1964 100644
--- a/test/CodeGen/ARM/2010-11-15-SpillEarlyClobber.ll
+++ b/test/CodeGen/ARM/2010-11-15-SpillEarlyClobber.ll

@@ -1,5 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -spiller=trivial
-; RUN: llc < %s -verify-machineinstrs -spiller=inline
+; RUN: llc < %s -verify-machineinstrs
 ; PR8612
 ;
 ; This test has an inline asm with early-clobber arguments.

diff --git a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
index b1d59aa..7fbd3ba 100644
--- a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-darwin10"
@@ -11,66 +11,66 @@
 
 ; Check debug info output for merged global.
 ; DW_AT_location
-; DW_OP_addr
-; DW_OP_plus
-; .long __MergedGlobals
-; DW_OP_constu
-; offset
+; 0x03 DW_OP_addr
+; 0x.. .long __MergedGlobals
+; 0x10 DW_OP_constu
+; 0x.. offset
+; 0x22 DW_OP_plus
 
-;CHECK: .long Lset7
-;CHECK-NEXT:        @ DW_AT_type
-;CHECK-NEXT:        @ DW_AT_decl_file
-;CHECK-NEXT:        @ DW_AT_decl_line
-;CHECK-NEXT:        @ DW_AT_location
-;CHECK-NEXT:        .byte   3
-;CHECK-NEXT:        .long   __MergedGlobals
-;CHECK-NEXT:        .byte   16
-;CHECK-NEXT:        .byte   1
-;CHECK-NEXT:        .byte   34
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK:    DW_AT_name {{.*}} "x1"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:    DW_AT_location [DW_FORM_exprloc]        (<0x8> 03 [[ADDR:.. .. .. ..]] 10 00 22  )
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK:    DW_AT_name {{.*}} "x2"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:    DW_AT_location [DW_FORM_exprloc]        (<0x8> 03 [[ADDR]] 10 01 22  )
 
 define zeroext i8 @get1(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !10), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !30
   %0 = load i8* @x1, align 4, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !11), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !30
   store i8 %a, i8* @x1, align 4, !dbg !30
   ret i8 %0, !dbg !31
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define zeroext i8 @get2(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !18), !dbg !32
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !32
   %0 = load i8* @x2, align 4, !dbg !32
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !19), !dbg !32
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !32
   store i8 %a, i8* @x2, align 4, !dbg !32
   ret i8 %0, !dbg !33
 }
 
 define zeroext i8 @get3(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !21), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !34
   %0 = load i8* @x3, align 4, !dbg !34
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !22), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !34
   store i8 %a, i8* @x3, align 4, !dbg !34
   ret i8 %0, !dbg !35
 }
 
 define zeroext i8 @get4(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !24), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !36
   %0 = load i8* @x4, align 4, !dbg !36
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !25), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !36
   store i8 %a, i8* @x4, align 4, !dbg !36
   ret i8 %0, !dbg !37
 }
 
 define zeroext i8 @get5(i8 zeroext %a) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !27), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i8 %a}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !38
   %0 = load i8* @x5, align 4, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !28), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i8 %0}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !38
   store i8 %a, i8* @x5, align 4, !dbg !38
   ret i8 %0, !dbg !39
 }
@@ -78,36 +78,36 @@
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!49}
 
-!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get1", metadata !"get1", metadata !"get1", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get1, null, null, metadata !42, i32 4} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !47, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !47, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00get1\00get1\00get1\004\000\001\000\006\00256\001\004", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get1, null, null, metadata !42} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)\001\00\000\00\000", metadata !47, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !5}
-!5 = metadata !{i32 786468, metadata !47, metadata !1, metadata !"_Bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get2", metadata !"get2", metadata !"get2", i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get2, null, null, metadata !43, i32 7} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get3", metadata !"get3", metadata !"get3", i32 10, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get3, null, null, metadata !44, i32 10} ; [ DW_TAG_subprogram ]
-!8 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get4", metadata !"get4", metadata !"get4", i32 13, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get4, null, null, metadata !45, i32 13} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get5", metadata !"get5", metadata !"get5", i32 16, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get5, null, null, metadata !46, i32 16} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 4, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 786688, metadata !12, metadata !"b", metadata !1, i32 4, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!12 = metadata !{i32 786443, metadata !47, metadata !0, i32 4, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x1", metadata !"x1", metadata !"", metadata !1, i32 3, metadata !5, i1 true, i1 true, i8* @x1, null} ; [ DW_TAG_variable ]
-!14 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x2", metadata !"x2", metadata !"", metadata !1, i32 6, metadata !5, i1 true, i1 true, i8* @x2, null} ; [ DW_TAG_variable ]
-!15 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x3", metadata !"x3", metadata !"", metadata !1, i32 9, metadata !5, i1 true, i1 true, i8* @x3, null} ; [ DW_TAG_variable ]
-!16 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x4", metadata !"x4", metadata !"", metadata !1, i32 12, metadata !5, i1 true, i1 true, i8* @x4, null} ; [ DW_TAG_variable ]
-!17 = metadata !{i32 786484, i32 0, metadata !1, metadata !"x5", metadata !"x5", metadata !"", metadata !1, i32 15, metadata !5, i1 false, i1 true, i8* @x5, null} ; [ DW_TAG_variable ]
-!18 = metadata !{i32 786689, metadata !6, metadata !"a", metadata !1, i32 7, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786688, metadata !20, metadata !"b", metadata !1, i32 7, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!20 = metadata !{i32 786443, metadata !47, metadata !6, i32 7, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
-!21 = metadata !{i32 786689, metadata !7, metadata !"a", metadata !1, i32 10, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{i32 786688, metadata !23, metadata !"b", metadata !1, i32 10, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!23 = metadata !{i32 786443, metadata !47, metadata !7, i32 10, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{i32 786689, metadata !8, metadata !"a", metadata !1, i32 13, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{i32 786688, metadata !26, metadata !"b", metadata !1, i32 13, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{i32 786443, metadata !47, metadata !8, i32 13, i32 0, i32 3} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{i32 786689, metadata !9, metadata !"a", metadata !1, i32 16, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!28 = metadata !{i32 786688, metadata !29, metadata !"b", metadata !1, i32 16, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786443, metadata !47, metadata !9, i32 16, i32 0, i32 4} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00_Bool\000\008\008\000\000\002", metadata !47, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00get2\00get2\00get2\007\000\001\000\006\00256\001\007", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get2, null, null, metadata !43} ; [ DW_TAG_subprogram ]
+!7 = metadata !{metadata !"0x2e\00get3\00get3\00get3\0010\000\001\000\006\00256\001\0010", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get3, null, null, metadata !44} ; [ DW_TAG_subprogram ]
+!8 = metadata !{metadata !"0x2e\00get4\00get4\00get4\0013\000\001\000\006\00256\001\0013", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get4, null, null, metadata !45} ; [ DW_TAG_subprogram ]
+!9 = metadata !{metadata !"0x2e\00get5\00get5\00get5\0016\000\001\000\006\00256\001\0016", metadata !47, metadata !1, metadata !3, null, i8 (i8)* @get5, null, null, metadata !46} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x101\00a\004\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x100\00b\004\000", metadata !12, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!12 = metadata !{metadata !"0xb\004\000\000", metadata !47, metadata !0} ; [ DW_TAG_lexical_block ]
+!13 = metadata !{metadata !"0x34\00x1\00x1\00\003\001\001", metadata !1, metadata !1, metadata !5, i8* @x1, null} ; [ DW_TAG_variable ]
+!14 = metadata !{metadata !"0x34\00x2\00x2\00\006\001\001", metadata !1, metadata !1, metadata !5, i8* @x2, null} ; [ DW_TAG_variable ]
+!15 = metadata !{metadata !"0x34\00x3\00x3\00\009\001\001", metadata !1, metadata !1, metadata !5, i8* @x3, null} ; [ DW_TAG_variable ]
+!16 = metadata !{metadata !"0x34\00x4\00x4\00\0012\001\001", metadata !1, metadata !1, metadata !5, i8* @x4, null} ; [ DW_TAG_variable ]
+!17 = metadata !{metadata !"0x34\00x5\00x5\00\0015\000\001", metadata !1, metadata !1, metadata !5, i8* @x5, null} ; [ DW_TAG_variable ]
+!18 = metadata !{metadata !"0x101\00a\007\000", metadata !6, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x100\00b\007\000", metadata !20, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!20 = metadata !{metadata !"0xb\007\000\001", metadata !47, metadata !6} ; [ DW_TAG_lexical_block ]
+!21 = metadata !{metadata !"0x101\00a\0010\000", metadata !7, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{metadata !"0x100\00b\0010\000", metadata !23, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!23 = metadata !{metadata !"0xb\0010\000\002", metadata !47, metadata !7} ; [ DW_TAG_lexical_block ]
+!24 = metadata !{metadata !"0x101\00a\0013\000", metadata !8, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{metadata !"0x100\00b\0013\000", metadata !26, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!26 = metadata !{metadata !"0xb\0013\000\003", metadata !47, metadata !8} ; [ DW_TAG_lexical_block ]
+!27 = metadata !{metadata !"0x101\00a\0016\000", metadata !9, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!28 = metadata !{metadata !"0x100\00b\0016\000", metadata !29, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{metadata !"0xb\0016\000\004", metadata !47, metadata !9} ; [ DW_TAG_lexical_block ]
 !30 = metadata !{i32 4, i32 0, metadata !0, null}
 !31 = metadata !{i32 4, i32 0, metadata !12, null}
 !32 = metadata !{i32 7, i32 0, metadata !6, null}
@@ -127,4 +127,4 @@
 !46 = metadata !{metadata !27, metadata !28}
 !47 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
 !48 = metadata !{}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/2011-04-12-AlignBug.ll b/test/CodeGen/ARM/2011-04-12-AlignBug.ll
index 97297f7..1a6879e 100644
--- a/test/CodeGen/ARM/2011-04-12-AlignBug.ll
+++ b/test/CodeGen/ARM/2011-04-12-AlignBug.ll

@@ -1,11 +1,10 @@
 ; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-darwin10.0.0"
 
 ; CHECK: align 3
 @.v = private unnamed_addr constant <4 x i32> <i32 1, i32 2, i32 3, i32 4>, align 8
-; CHECK: align 2
-@.strA = private unnamed_addr constant [4 x i8] c"bar\00"
+; CHECK: align 4
+@.strA = private unnamed_addr constant [4 x i64] zeroinitializer
 ; CHECK-NOT: align
 @.strB = private unnamed_addr constant [4 x i8] c"foo\00", align 1
 @.strC = private unnamed_addr constant [4 x i8] c"baz\00", section "__TEXT,__cstring,cstring_literals", align 1

diff --git a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
index ed2840b..ede936c 100644
--- a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll

@@ -1,25 +1,23 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
 
 ; Check debug info output for merged global.
 ; DW_AT_location
-; DW_OP_addr
-; DW_OP_plus
-; .long __MergedGlobals
-; DW_OP_constu
-; offset
+; 0x03 DW_OP_addr
+; 0x.. .long __MergedGlobals
+; 0x10 DW_OP_constu
+; 0x.. offset
+; 0x22 DW_OP_plus
 
-;CHECK: .long Lset9
-;CHECK-NEXT:        @ DW_AT_type
-;CHECK-NEXT:        @ DW_AT_decl_file
-;CHECK-NEXT:        @ DW_AT_decl_line
-;CHECK-NEXT:        @ DW_AT_location
-;CHECK-NEXT:        .byte   3
-;CHECK-NEXT:        .long   __MergedGlobals
-;CHECK-NEXT:        .byte   16
-; 4 is byte offset of x2 in __MergedGobals
-;CHECK-NEXT:        .byte   4
-;CHECK-NEXT:        .byte   34
-
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK:    DW_AT_name {{.*}} "x1"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:    DW_AT_location [DW_FORM_exprloc]        (<0x8> 03 [[ADDR:.. .. .. ..]] 10 00 22  )
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK:    DW_AT_name {{.*}} "x2"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:    DW_AT_location [DW_FORM_exprloc]        (<0x8> 03 [[ADDR]] 10 04 22  )
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-macosx10.7.0"
@@ -31,80 +29,77 @@
 @x5 = global i32 0, align 4
 
 define i32 @get1(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !10), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !30
   %1 = load i32* @x1, align 4, !dbg !31
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !11), !dbg !31
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !31
   store i32 %a, i32* @x1, align 4, !dbg !31
   ret i32 %1, !dbg !31
 }
 
 define i32 @get2(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !13), !dbg !32
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !32
   %1 = load i32* @x2, align 4, !dbg !33
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !14), !dbg !33
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !33
   store i32 %a, i32* @x2, align 4, !dbg !33
   ret i32 %1, !dbg !33
 }
 
 define i32 @get3(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !16), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !34
   %1 = load i32* @x3, align 4, !dbg !35
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !17), !dbg !35
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !35
   store i32 %a, i32* @x3, align 4, !dbg !35
   ret i32 %1, !dbg !35
 }
 
 define i32 @get4(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !19), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !36
   %1 = load i32* @x4, align 4, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !20), !dbg !37
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !37
   store i32 %a, i32* @x4, align 4, !dbg !37
   ret i32 %1, !dbg !37
 }
 
 define i32 @get5(i32 %a) nounwind optsize ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !27), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !38
   %1 = load i32* @x5, align 4, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !28), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !39
   store i32 %a, i32* @x5, align 4, !dbg !39
   ret i32 %1, !dbg !39
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!49}
 
-!0 = metadata !{i32 786449, metadata !47, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get1", metadata !"get1", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get1, null, null, metadata !42, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [get1]
-!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00clang\001\00\000\00\001", metadata !47, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !48} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00get1\00get1\00\005\000\001\000\006\00256\001\005", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get1, null, null, metadata !42} ; [ DW_TAG_subprogram ] [line 5] [def] [get1]
+!2 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get2", metadata !"get2", metadata !"", i32 8, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get2, null, null, metadata !43, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [get2]
-!7 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get3", metadata !"get3", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get3, null, null, metadata !44, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [get3]
-!8 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get4", metadata !"get4", metadata !"", i32 14, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get4, null, null, metadata !45, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [get4]
-!9 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get5", metadata !"get5", metadata !"", i32 17, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get5, null, null, metadata !46, i32 17} ; [ DW_TAG_subprogram ] [line 17] [def] [get5]
-!10 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 16777221, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 786688, metadata !12, metadata !"b", metadata !2, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!12 = metadata !{i32 786443, metadata !47, metadata !1, i32 5, i32 19, i32 0} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{i32 786689, metadata !6, metadata !"a", metadata !2, i32 16777224, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{i32 786688, metadata !15, metadata !"b", metadata !2, i32 8, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !47, metadata !6, i32 8, i32 17, i32 1} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786689, metadata !7, metadata !"a", metadata !2, i32 16777227, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786688, metadata !18, metadata !"b", metadata !2, i32 11, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786443, metadata !47, metadata !7, i32 11, i32 19, i32 2} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 786689, metadata !8, metadata !"a", metadata !2, i32 16777230, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 786688, metadata !21, metadata !"b", metadata !2, i32 14, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!21 = metadata !{i32 786443, metadata !47, metadata !8, i32 14, i32 19, i32 3} ; [ DW_TAG_lexical_block ]
-!22 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x5", metadata !"x5", metadata !"", metadata !2, i32 16, metadata !5, i32 0, i32 1, i32* @x5, null} ; [ DW_TAG_variable ]
-!23 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x4", metadata !"x4", metadata !"", metadata !2, i32 13, metadata !5, i32 1, i32 1, i32* @x4, null} ; [ DW_TAG_variable ]
-!24 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x3", metadata !"x3", metadata !"", metadata !2, i32 10, metadata !5, i32 1, i32 1, i32* @x3, null} ; [ DW_TAG_variable ]
-!25 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x2", metadata !"x2", metadata !"", metadata !2, i32 7, metadata !5, i32 1, i32 1, i32* @x2, null} ; [ DW_TAG_variable ]
-!26 = metadata !{i32 786484, i32 0, metadata !0, metadata !"x1", metadata !"x1", metadata !"", metadata !2, i32 4, metadata !5, i32 1, i32 1, i32* @x1, null} ; [ DW_TAG_variable ]
-!27 = metadata !{i32 786689, metadata !9, metadata !"a", metadata !2, i32 16777233, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!28 = metadata !{i32 786688, metadata !29, metadata !"b", metadata !2, i32 17, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786443, metadata !47, metadata !9, i32 17, i32 19, i32 4} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00get2\00get2\00\008\000\001\000\006\00256\001\008", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get2, null, null, metadata !43} ; [ DW_TAG_subprogram ] [line 8] [def] [get2]
+!7 = metadata !{metadata !"0x2e\00get3\00get3\00\0011\000\001\000\006\00256\001\0011", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get3, null, null, metadata !44} ; [ DW_TAG_subprogram ] [line 11] [def] [get3]
+!8 = metadata !{metadata !"0x2e\00get4\00get4\00\0014\000\001\000\006\00256\001\0014", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get4, null, null, metadata !45} ; [ DW_TAG_subprogram ] [line 14] [def] [get4]
+!9 = metadata !{metadata !"0x2e\00get5\00get5\00\0017\000\001\000\006\00256\001\0017", metadata !47, metadata !2, metadata !3, null, i32 (i32)* @get5, null, null, metadata !46} ; [ DW_TAG_subprogram ] [line 17] [def] [get5]
+!10 = metadata !{metadata !"0x101\00a\0016777221\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x100\00b\005\000", metadata !12, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!12 = metadata !{metadata !"0xb\005\0019\000", metadata !47, metadata !1} ; [ DW_TAG_lexical_block ]
+!13 = metadata !{metadata !"0x101\00a\0016777224\000", metadata !6, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!14 = metadata !{metadata !"0x100\00b\008\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0xb\008\0017\001", metadata !47, metadata !6} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x101\00a\0016777227\000", metadata !7, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x100\00b\0011\000", metadata !18, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!18 = metadata !{metadata !"0xb\0011\0019\002", metadata !47, metadata !7} ; [ DW_TAG_lexical_block ]
+!19 = metadata !{metadata !"0x101\00a\0016777230\000", metadata !8, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{metadata !"0x100\00b\0014\000", metadata !21, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!21 = metadata !{metadata !"0xb\0014\0019\003", metadata !47, metadata !8} ; [ DW_TAG_lexical_block ]
+!25 = metadata !{metadata !"0x34\00x1\00x1\00\004\001\001", metadata !0, metadata !2, metadata !5, i32* @x1, null} ; [ DW_TAG_variable ]
+!26 = metadata !{metadata !"0x34\00x2\00x2\00\007\001\001", metadata !0, metadata !2, metadata !5, i32* @x2, null} ; [ DW_TAG_variable ]
+!27 = metadata !{metadata !"0x101\00a\0016777233\000", metadata !9, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!28 = metadata !{metadata !"0x100\00b\0017\000", metadata !29, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{metadata !"0xb\0017\0019\004", metadata !47, metadata !9} ; [ DW_TAG_lexical_block ]
 !30 = metadata !{i32 5, i32 16, metadata !1, null}
 !31 = metadata !{i32 5, i32 32, metadata !12, null}
 !32 = metadata !{i32 8, i32 14, metadata !6, null}
@@ -116,7 +111,7 @@
 !38 = metadata !{i32 17, i32 16, metadata !9, null}
 !39 = metadata !{i32 17, i32 32, metadata !29, null}
 !40 = metadata !{metadata !1, metadata !6, metadata !7, metadata !8, metadata !9}
-!41 = metadata !{metadata !22, metadata !23, metadata !24, metadata !25, metadata !26}
+!41 = metadata !{metadata !25, metadata !26}
 !42 = metadata !{metadata !10, metadata !11}
 !43 = metadata !{metadata !13, metadata !14}
 !44 = metadata !{metadata !16, metadata !17}
@@ -124,4 +119,4 @@
 !46 = metadata !{metadata !27, metadata !28}
 !47 = metadata !{metadata !"ss3.c", metadata !"/private/tmp"}
 !48 = metadata !{}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/2014-07-18-earlyclobber-str-post.ll b/test/CodeGen/ARM/2014-07-18-earlyclobber-str-post.ll
new file mode 100644
index 0000000..df7d245
--- /dev/null
+++ b/test/CodeGen/ARM/2014-07-18-earlyclobber-str-post.ll

@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=armv7-linux-gnueabihf %s -o - | FileCheck %s
+
+; Check that we don't create an unpredictable STR instruction,
+; e.g. str r0, [r0], #4
+
+define i32* @earlyclobber-str-post(i32* %addr) nounwind {
+; CHECK-LABEL: earlyclobber-str-post
+; CHECK-NOT: str r[[REG:[0-9]+]], [r[[REG]]], #4
+  %val = ptrtoint i32* %addr to i32
+  store i32 %val, i32* %addr
+  %new = getelementptr i32* %addr, i32 1
+  ret i32* %new
+}
+
+define i16* @earlyclobber-strh-post(i16* %addr) nounwind {
+; CHECK-LABEL: earlyclobber-strh-post
+; CHECK-NOT: strh r[[REG:[0-9]+]], [r[[REG]]], #2
+  %val = ptrtoint i16* %addr to i32
+  %tr = trunc i32 %val to i16
+  store i16 %tr, i16* %addr
+  %new = getelementptr i16* %addr, i32 1
+  ret i16* %new
+}
+
+define i8* @earlyclobber-strb-post(i8* %addr) nounwind {
+; CHECK-LABEL: earlyclobber-strb-post
+; CHECK-NOT: strb r[[REG:[0-9]+]], [r[[REG]]], #1
+  %val = ptrtoint i8* %addr to i32
+  %tr = trunc i32 %val to i8
+  store i8 %tr, i8* %addr
+  %new = getelementptr i8* %addr, i32 1
+  ret i8* %new
+}

diff --git a/test/CodeGen/ARM/2014-08-04-muls-it.ll b/test/CodeGen/ARM/2014-08-04-muls-it.ll
new file mode 100644
index 0000000..4636bff
--- /dev/null
+++ b/test/CodeGen/ARM/2014-08-04-muls-it.ll

@@ -0,0 +1,25 @@
+; RUN: llc -mtriple thumbv7-eabi -arm-restrict-it -filetype asm -o - %s \
+; RUN:    | FileCheck %s
+
+define arm_aapcscc i32 @function(i32 %i, i32 %j) {
+entry:
+  %cmp = icmp eq i32 %i, %j
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %mul = mul nsw i32 %i, %i
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %i.addr.0 = phi i32 [ %mul, %if.then ], [ %i, %entry ]
+  ret i32 %i.addr.0
+}
+
+; CHECK-LABEL: function
+; CHECK: cmp r0, r1
+; CHECK: bne [[LABEL:[.*]]]
+; CHECK-NOT: mulseq r0, r0, r0
+; CHECK: [[LABEL]]
+; CHECK: muls r0, r0, r0
+; CHECK: bx lr
+

diff --git a/test/CodeGen/ARM/Windows/alloca.ll b/test/CodeGen/ARM/Windows/alloca.ll
new file mode 100644
index 0000000..6a3d002
--- /dev/null
+++ b/test/CodeGen/ARM/Windows/alloca.ll

@@ -0,0 +1,22 @@
+; RUN: llc -O0 -mtriple thumbv7-windows-itanium -filetype asm -o - %s | FileCheck %s
+
+declare arm_aapcs_vfpcc i32 @num_entries()
+
+define arm_aapcs_vfpcc void @test___builtin_alloca() {
+entry:
+  %array = alloca i8*, align 4
+  %call = call arm_aapcs_vfpcc i32 @num_entries()
+  %mul = mul i32 4, %call
+  %0 = alloca i8, i32 %mul
+  store i8* %0, i8** %array, align 4
+  ret void
+}
+
+; CHECK: bl num_entries
+; CHECK: movs [[R1:r[0-9]+]], #7
+; CHECK: add.w [[R0:r[0-9]+]], [[R1]], [[R0]], lsl #2
+; CHECK: bic [[R0]], [[R0]], #7
+; CHECK: lsrs r4, [[R0]], #2
+; CHECK: bl __chkstk
+; CHECK: sub.w sp, sp, r4
+

diff --git a/test/CodeGen/ARM/aapcs-hfa-code.ll b/test/CodeGen/ARM/aapcs-hfa-code.ll
index 396e838..5545dfd 100644
--- a/test/CodeGen/ARM/aapcs-hfa-code.ll
+++ b/test/CodeGen/ARM/aapcs-hfa-code.ll

@@ -54,12 +54,11 @@
 ; CHECK: bl test_1double
 
 ; CHECK-M4F-LABEL: test_1double:
-; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
-; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
-; CHECK-M4F: movt [[ONEHI]], #16368
-; CHECK-M4F-DAG: vmov s0, [[ONELO]]
-; CHECK-M4F-DAG: vmov s1, [[ONEHI]]
+; CHECK-M4F: vldr d0, [[CP_LABEL:.*]]
 ; CHECK-M4F: bl test_1double
+; CHECK-M4F: [[CP_LABEL]]
+; CHECK-M4F-NEXT: .long 0
+; CHECK-M4F-NEXT: .long 1072693248
 
   call arm_aapcs_vfpcc void @test_1double({ double } { double 1.0 })
   ret void
@@ -76,11 +75,10 @@
 ; CHECK: bl test_1double_nosplit
 
 ; CHECK-M4F-LABEL: test_1double_nosplit:
-; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
 ; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
+; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
 ; CHECK-M4F: movt [[ONEHI]], #16368
-; CHECK-M4F-DAG: str [[ONELO]], [sp]
-; CHECK-M4F-DAG: str [[ONEHI]], [sp, #4]
+; CHECK-M4F: strd [[ONELO]], [[ONEHI]], [sp]
 ; CHECK-M4F: bl test_1double_nosplit
   call arm_aapcs_vfpcc void @test_1double_nosplit([4 x float] undef, [4 x double] undef, [3 x float] undef, double 1.0)
   ret void
@@ -92,19 +90,16 @@
   call arm_aapcs_vfpcc void @test_1double_misaligned([4 x double] undef, [4 x double] undef, float undef, double 1.0)
 
 ; CHECK-LABEL: test_1double_misaligned:
-; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0
-; CHECK-DAG: mov r[[BASE:[0-9]+]], sp
 ; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0
+; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0
 ; CHECK-DAG: movt [[ONEHI]], #16368
-; CHECK-DAG: str [[ONELO]], [r[[BASE]], #8]!
-; CHECK-DAG: str [[ONEHI]], [r[[BASE]], #4]
+; CHECK-DAG: strd [[ONELO]], [[ONEHI]], [sp, #8]
 
 ; CHECK-M4F-LABEL: test_1double_misaligned:
-; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
 ; CHECK-M4F: movs [[ONEHI:r[0-9]+]], #0
+; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
 ; CHECK-M4F: movt [[ONEHI]], #16368
-; CHECK-M4F-DAG: str [[ONELO]], [sp, #8]
-; CHECK-M4F-DAG: str [[ONEHI]], [sp, #12]
+; CHECK-M4F: strd [[ONELO]], [[ONEHI]], [sp, #8]
 ; CHECK-M4F: bl test_1double_misaligned
 
   ret void

diff --git a/test/CodeGen/ARM/adv-copy-opt.ll b/test/CodeGen/ARM/adv-copy-opt.ll
new file mode 100644
index 0000000..f71bf78
--- /dev/null
+++ b/test/CodeGen/ARM/adv-copy-opt.ll

@@ -0,0 +1,38 @@
+; RUN: llc -O1 -mtriple=armv7s-apple-ios -mcpu=swift < %s -disable-adv-copy-opt=true | FileCheck -check-prefix=NOOPT --check-prefix=CHECK %s
+; RUN: llc -O1 -mtriple=armv7s-apple-ios -mcpu=swift < %s -disable-adv-copy-opt=false | FileCheck -check-prefix=OPT --check-prefix=CHECK %s
+; RUN: llc -O1 -mtriple=thumbv7s-apple-ios -mcpu=swift < %s -disable-adv-copy-opt=true | FileCheck -check-prefix=NOOPT --check-prefix=CHECK %s
+; RUN: llc -O1 -mtriple=thumbv7s-apple-ios -mcpu=swift < %s -disable-adv-copy-opt=false | FileCheck -check-prefix=OPT --check-prefix=CHECK %s
+
+; CHECK-LABEL: simpleVectorDiv
+; ABI: %A => r0, r1.
+;      %B => r2, r3
+;      ret => r0, r1
+; We want to compute:
+; r0 = r0 / r2
+; r1 = r1 / r3
+;
+; NOOPT: vmov	[[B:d[0-9]+]], r2, r3
+; NOOPT-NEXT: vmov	[[A:d[0-9]+]], r0, r1
+; Move the low part of B into a register.
+; Unfortunately, we cannot express that the 's' register is the low
+; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
+; NOOPT-NEXT: vmov	[[B_LOW:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: vmov	[[A_LOW:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: udiv	[[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
+; NOOPT-NEXT: vmov	[[B_HIGH:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: vmov	[[A_HIGH:r[0-9]+]], s{{[0-9]+}}
+; NOOPT-NEXT: udiv	[[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
+; NOOPT-NEXT: vmov.32	[[RES:d[0-9]+]][0], [[RES_LOW]]
+; NOOPT-NEXT: vmov.32	[[RES]][1], [[RES_HIGH]]
+; NOOPT-NEXT: vmov	r0, r1, [[RES]]
+; NOOPT-NEXT: bx	lr
+;
+; OPT-NOT: vmov
+; OPT: 	udiv	r0, r0, r2
+; OPT-NEXT: udiv	r1, r1, r3
+; OPT-NEXT: bx	lr
+define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
+entry:
+  %div = udiv <2 x i32> %A, %B
+  ret <2 x i32> %div
+}

diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index f55ae10..5a737ad 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll

@@ -25,9 +25,9 @@
 define i32 @foo_f() {
   ret i32 0
 }
-@bar_f = alias weak %FunTy* @foo_f
+@bar_f = weak alias %FunTy* @foo_f
 
-@bar_i = alias internal i32* @bar
+@bar_i = internal alias i32* @bar
 
 @A = alias bitcast (i32* @bar to i64*)
 

diff --git a/test/CodeGen/ARM/arm32-round-conv.ll b/test/CodeGen/ARM/arm32-round-conv.ll
new file mode 100644
index 0000000..88fb891
--- /dev/null
+++ b/test/CodeGen/ARM/arm32-round-conv.ll

@@ -0,0 +1,117 @@
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=+fp-armv8 | FileCheck %s
+; RUN: llc < %s -mtriple=armv8-linux-gnueabihf -mattr=+fp-armv8 | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vcvtm.s32.f32
+define i32 @test1(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test2
+; CHECK: vcvtm.u32.f32
+define i32 @test2(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test3
+; CHECK: vcvtm.s32.f64
+define i32 @test3(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test4
+; CHECK: vcvtm.u32.f64
+define i32 @test4(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test5
+; CHECK: vcvtp.s32.f32
+define i32 @test5(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test6
+; CHECK: vcvtp.u32.f32
+define i32 @test6(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test7
+; CHECK: vcvtp.s32.f64
+define i32 @test7(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test8
+; CHECK: vcvtp.u32.f64
+define i32 @test8(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test9
+; CHECK: vcvta.s32.f32
+define i32 @test9(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test10
+; CHECK: vcvta.u32.f32
+define i32 @test10(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test11
+; CHECK: vcvta.s32.f64
+define i32 @test11(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: test12
+; CHECK: vcvta.u32.f64
+define i32 @test12(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+declare float @floorf(float) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare float @ceilf(float) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare float @roundf(float) nounwind readnone
+declare double @round(double) nounwind readnone

diff --git a/test/CodeGen/ARM/arm32-rounding.ll b/test/CodeGen/ARM/arm32-rounding.ll
new file mode 100644
index 0000000..f247648
--- /dev/null
+++ b/test/CodeGen/ARM/arm32-rounding.ll

@@ -0,0 +1,118 @@
+; RUN: llc < %s -mtriple=armv8-linux-gnueabihf -mattr=+fp-armv8 | FileCheck --check-prefix=CHECK --check-prefix=DP %s
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabihf -mattr=+fp-armv8,+d16,+fp-only-sp | FileCheck --check-prefix=SP %s
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabihf -mattr=+fp-armv8,+d16 | FileCheck --check-prefix=DP %s
+
+; CHECK-LABEL: test1
+; CHECK: vrintm.f32
+define float @test1(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test2
+; SP: b floor
+; DP: vrintm.f64
+define double @test2(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test3
+; CHECK: vrintp.f32
+define float @test3(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test4
+; SP: b ceil
+; DP: vrintp.f64
+define double @test4(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test5
+; CHECK: vrinta.f32
+define float @test5(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test6
+; SP: b round
+; DP: vrinta.f64
+define double @test6(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test7
+; CHECK: vrintz.f32
+define float @test7(float %a) {
+entry:
+  %call = call float @truncf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test8
+; SP: b trunc
+; DP: vrintz.f64
+define double @test8(double %a) {
+entry:
+  %call = call double @trunc(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test9
+; CHECK: vrintr.f32
+define float @test9(float %a) {
+entry:
+  %call = call float @nearbyintf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test10
+; SP: b nearbyint
+; DP: vrintr.f64
+define double @test10(double %a) {
+entry:
+  %call = call double @nearbyint(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: test11
+; CHECK: vrintx.f32
+define float @test11(float %a) {
+entry:
+  %call = call float @rintf(float %a) nounwind readnone
+  ret float %call
+}
+
+; CHECK-LABEL: test12
+; SP: b rint
+; DP: vrintx.f64
+define double @test12(double %a) {
+entry:
+  %call = call double @rint(double %a) nounwind readnone
+  ret double %call
+}
+
+declare float @floorf(float) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare float @ceilf(float) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare float @roundf(float) nounwind readnone
+declare double @round(double) nounwind readnone
+declare float @truncf(float) nounwind readnone
+declare double @trunc(double) nounwind readnone
+declare float @nearbyintf(float) nounwind readnone
+declare double @nearbyint(double) nounwind readnone
+declare float @rintf(float) nounwind readnone
+declare double @rint(double) nounwind readnone

diff --git a/test/CodeGen/ARM/atomic-cmpxchg.ll b/test/CodeGen/ARM/atomic-cmpxchg.ll
new file mode 100644
index 0000000..84790be
--- /dev/null
+++ b/test/CodeGen/ARM/atomic-cmpxchg.ll

@@ -0,0 +1,53 @@
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=thumb-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMB
+
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-ARMV7
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-THUMBV7
+
+define zeroext i1 @test_cmpxchg_res_i8(i8* %addr, i8 %desired, i8 zeroext %new) {
+entry:
+  %0 = cmpxchg i8* %addr, i8 %desired, i8 %new monotonic monotonic
+  %1 = extractvalue { i8, i1 } %0, 1
+  ret i1 %1
+}
+
+; CHECK-ARM-LABEL: test_cmpxchg_res_i8
+; CHECK-ARM: bl __sync_val_compare_and_swap_1
+; CHECK-ARM: mov [[REG:r[0-9]+]], #0
+; CHECK-ARM: cmp r0, {{r[0-9]+}}
+; CHECK-ARM: moveq [[REG]], #1
+; CHECK-ARM: mov r0, [[REG]]
+
+; CHECK-THUMB-LABEL: test_cmpxchg_res_i8
+; CHECK-THUMB: bl __sync_val_compare_and_swap_1
+; CHECK-THUMB-NOT: mov [[R1:r[0-7]]], r0
+; CHECK-THUMB: push  {r0}
+; CHECK-THUMB: pop {[[R1:r[0-7]]]}
+; CHECK-THUMB: movs r0, #1
+; CHECK-THUMB: movs [[R2:r[0-9]+]], #0
+; CHECK-THUMB: cmp [[R1]], {{r[0-9]+}}
+; CHECK-THUMB: beq
+; CHECK-THUMB: push  {[[R2]]}
+; CHECK-THUMB: pop {r0}
+
+; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8
+; CHECK-ARMV7: ldrexb [[R3:r[0-9]+]], [r0]
+; CHECK-ARMV7: mov [[R1:r[0-9]+]], #0
+; CHECK-ARMV7: cmp [[R3]], {{r[0-9]+}}
+; CHECK-ARMV7: bne
+; CHECK-ARMV7: strexb [[R3]], {{r[0-9]+}}, [{{r[0-9]+}}]
+; CHECK-ARMV7: mov [[R1]], #1
+; CHECK-ARMV7: cmp [[R3]], #0
+; CHECK-ARMV7: bne
+; CHECK-ARMV7: mov r0, [[R1]]
+
+; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8
+; CHECK-THUMBV7: ldrexb [[R3:r[0-9]+]], [r0]
+; CHECK-THUMBV7: cmp [[R3]], {{r[0-9]+}}
+; CHECK-THUMBV7: movne r0, #0
+; CHECK-THUMBV7: bxne lr
+; CHECK-THUMBV7: strexb [[R3]], {{r[0-9]+}}, [{{r[0-9]+}}]
+; CHECK-THUMBV7: cmp [[R3]], #0
+; CHECK-THUMBV7: itt eq
+; CHECK-THUMBV7: moveq r0, #1
+; CHECK-THUMBV7: bxeq lr

diff --git a/test/CodeGen/ARM/atomic-load-store.ll b/test/CodeGen/ARM/atomic-load-store.ll
index 49342d2..af13dfc 100644
--- a/test/CodeGen/ARM/atomic-load-store.ll
+++ b/test/CodeGen/ARM/atomic-load-store.ll

@@ -3,6 +3,8 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefix=THUMBTWO
 ; RUN: llc < %s -mtriple=thumbv6-apple-ios | FileCheck %s -check-prefix=THUMBONE
 ; RUN: llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefix=ARMV4
+; RUN: llc < %s -mtriple=armv6-apple-ios | FileCheck %s -check-prefix=ARMV6
+; RUN: llc < %s -mtriple=thumbv7m-apple-ios | FileCheck %s -check-prefix=THUMBM
 
 define void @test1(i32* %ptr, i32 %val1) {
 ; ARM-LABEL: test1
@@ -15,6 +17,14 @@
 ; THUMBTWO: dmb {{ish$}}
 ; THUMBTWO-NEXT: str
 ; THUMBTWO-NEXT: dmb {{ish$}}
+; ARMV6-LABEL: test1
+; ARMV6: mcr p15, #0, {{r[0-9]*}}, c7, c10, #5
+; ARMV6: str
+; ARMV6: mcr p15, #0, {{r[0-9]*}}, c7, c10, #5
+; THUMBM-LABEL: test1
+; THUMBM: dmb sy
+; THUMBM: str
+; THUMBM: dmb sy
   store atomic i32 %val1, i32* %ptr seq_cst, align 4
   ret void
 }
@@ -28,6 +38,12 @@
 ; THUMBTWO-LABEL: test2
 ; THUMBTWO: ldr
 ; THUMBTWO-NEXT: dmb {{ish$}}
+; ARMV6-LABEL: test2
+; ARMV6: ldr
+; ARMV6: mcr p15, #0, {{r[0-9]*}}, c7, c10, #5
+; THUMBM-LABEL: test2
+; THUMBM: ldr
+; THUMBM: dmb sy
   %val = load atomic i32* %ptr seq_cst, align 4
   ret i32 %val
 }
@@ -55,6 +71,11 @@
 ; THUMBONE-NOT: dmb
 ; THUMBONE: strb
 ; THUMBONE-NOT: dmb
+
+; ARMV6-LABEL: test3
+; ARMV6-NOT: mcr
+; THUMBM-LABEL: test3
+; THUMBM-NOT: dmb sy
   %val = load atomic i8* %ptr1 unordered, align 1
   store atomic i8 %val, i8* %ptr2 unordered, align 1
   ret void
@@ -64,6 +85,8 @@
 ; THUMBONE-LABEL: test4
 ; THUMBONE: ___sync_val_compare_and_swap_1
 ; THUMBONE: ___sync_lock_test_and_set_1
+; ARMV6-LABEL: test4
+; THUMBM-LABEL: test4
   %val = load atomic i8* %ptr1 seq_cst, align 1
   store atomic i8 %val, i8* %ptr2 seq_cst, align 1
   ret void

diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index b988242..1ac8648 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll

@@ -1,7 +1,10 @@
 ; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-T1
-; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs -mcpu=cortex-m0 | FileCheck %s --check-prefix=CHECK-T1
+; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs -mcpu=cortex-m0 | FileCheck %s --check-prefix=CHECK-M0
+; RUN: llc < %s -mtriple=thumbv7--none-eabi -thread-model single -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-BAREMETAL
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @func(i32 %argc, i8** %argv) nounwind {
 entry:
@@ -27,48 +30,72 @@
   ; CHECK: add
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_add_4
+  ; CHECK-M0: bl ___sync_fetch_and_add_4
+  ; CHECK-BAREMETAL: add
+  ; CHECK-BAREMETAL-NOT: __sync
   %0 = atomicrmw add i32* %val1, i32 %tmp monotonic
 	store i32 %0, i32* %old
   ; CHECK: ldrex
   ; CHECK: sub
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_sub_4
+  ; CHECK-M0: bl ___sync_fetch_and_sub_4
+  ; CHECK-BAREMETAL: sub
+  ; CHECK-BAREMETAL-NOT: __sync
   %1 = atomicrmw sub i32* %val2, i32 30 monotonic
 	store i32 %1, i32* %old
   ; CHECK: ldrex
   ; CHECK: add
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_add_4
+  ; CHECK-M0: bl ___sync_fetch_and_add_4
+  ; CHECK-BAREMETAL: add
+  ; CHECK-BAREMETAL-NOT: __sync
   %2 = atomicrmw add i32* %val2, i32 1 monotonic
 	store i32 %2, i32* %old
   ; CHECK: ldrex
   ; CHECK: sub
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_sub_4
+  ; CHECK-M0: bl ___sync_fetch_and_sub_4
+  ; CHECK-BAREMETAL: sub
+  ; CHECK-BAREMETAL-NOT: __sync
   %3 = atomicrmw sub i32* %val2, i32 1 monotonic
 	store i32 %3, i32* %old
   ; CHECK: ldrex
   ; CHECK: and
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_and_4
+  ; CHECK-M0: bl ___sync_fetch_and_and_4
+  ; CHECK-BAREMETAL: and
+  ; CHECK-BAREMETAL-NOT: __sync
   %4 = atomicrmw and i32* %andt, i32 4080 monotonic
 	store i32 %4, i32* %old
   ; CHECK: ldrex
   ; CHECK: or
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_or_4
+  ; CHECK-M0: bl ___sync_fetch_and_or_4
+  ; CHECK-BAREMETAL: or
+  ; CHECK-BAREMETAL-NOT: __sync
   %5 = atomicrmw or i32* %ort, i32 4080 monotonic
 	store i32 %5, i32* %old
   ; CHECK: ldrex
   ; CHECK: eor
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_xor_4
+  ; CHECK-M0: bl ___sync_fetch_and_xor_4
+  ; CHECK-BAREMETAL: eor
+  ; CHECK-BAREMETAL-NOT: __sync
   %6 = atomicrmw xor i32* %xort, i32 4080 monotonic
 	store i32 %6, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_min_4
+  ; CHECK-M0: bl ___sync_fetch_and_min_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %7 = atomicrmw min i32* %val2, i32 16 monotonic
 	store i32 %7, i32* %old
 	%neg = sub i32 0, 1
@@ -76,24 +103,36 @@
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_min_4
+  ; CHECK-M0: bl ___sync_fetch_and_min_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %8 = atomicrmw min i32* %val2, i32 %neg monotonic
 	store i32 %8, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_max_4
+  ; CHECK-M0: bl ___sync_fetch_and_max_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %9 = atomicrmw max i32* %val2, i32 1 monotonic
 	store i32 %9, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_max_4
+  ; CHECK-M0: bl ___sync_fetch_and_max_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %10 = atomicrmw max i32* %val2, i32 0 monotonic
 	store i32 %10, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_4
+  ; CHECK-M0: bl ___sync_fetch_and_umin_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %11 = atomicrmw umin i32* %val2, i32 16 monotonic
 	store i32 %11, i32* %old
 	%uneg = sub i32 0, 1
@@ -101,18 +140,27 @@
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_4
+  ; CHECK-M0: bl ___sync_fetch_and_umin_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %12 = atomicrmw umin i32* %val2, i32 %uneg monotonic
 	store i32 %12, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_4
+  ; CHECK-M0: bl ___sync_fetch_and_umax_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %13 = atomicrmw umax i32* %val2, i32 1 monotonic
 	store i32 %13, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_4
+  ; CHECK-M0: bl ___sync_fetch_and_umax_4
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %14 = atomicrmw umax i32* %val2, i32 0 monotonic
 	store i32 %14, i32* %old
 
@@ -128,6 +176,9 @@
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_2
+  ; CHECK-M0: bl ___sync_fetch_and_umin_2
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %0 = atomicrmw umin i16* %val, i16 16 monotonic
   store i16 %0, i16* %old
   %uneg = sub i16 0, 1
@@ -135,18 +186,27 @@
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_2
+  ; CHECK-M0: bl ___sync_fetch_and_umin_2
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %1 = atomicrmw umin i16* %val, i16 %uneg monotonic
   store i16 %1, i16* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_2
+  ; CHECK-M0: bl ___sync_fetch_and_umax_2
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %2 = atomicrmw umax i16* %val, i16 1 monotonic
   store i16 %2, i16* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_2
+  ; CHECK-M0: bl ___sync_fetch_and_umax_2
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %3 = atomicrmw umax i16* %val, i16 0 monotonic
   store i16 %3, i16* %old
   ret void
@@ -161,12 +221,18 @@
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_1
+  ; CHECK-M0: bl ___sync_fetch_and_umin_1
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %0 = atomicrmw umin i8* %val, i8 16 monotonic
   store i8 %0, i8* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umin_1
+  ; CHECK-M0: bl ___sync_fetch_and_umin_1
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %uneg = sub i8 0, 1
   %1 = atomicrmw umin i8* %val, i8 %uneg monotonic
   store i8 %1, i8* %old
@@ -174,12 +240,18 @@
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_1
+  ; CHECK-M0: bl ___sync_fetch_and_umax_1
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %2 = atomicrmw umax i8* %val, i8 1 monotonic
   store i8 %2, i8* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
   ; CHECK-T1: blx ___sync_fetch_and_umax_1
+  ; CHECK-M0: bl ___sync_fetch_and_umax_1
+  ; CHECK-BAREMETAL: cmp
+  ; CHECK-BAREMETAL-NOT: __sync
   %3 = atomicrmw umax i8* %val, i8 0 monotonic
   store i8 %3, i8* %old
   ret void
@@ -233,3 +305,69 @@
 
   ret i32 %oldval
 }
+
+define i32 @load_load_add_acquire(i32* %mem1, i32* %mem2) nounwind {
+; CHECK-LABEL: load_load_add_acquire
+  %val1 = load atomic i32* %mem1 acquire, align 4
+  %val2 = load atomic i32* %mem2 acquire, align 4
+  %tmp = add i32 %val1, %val2
+
+; CHECK: ldr {{r[0-9]}}, [r0]
+; CHECK: dmb
+; CHECK: ldr {{r[0-9]}}, [r1]
+; CHECK: dmb
+; CHECK: add r0,
+
+; CHECK-M0: ___sync_val_compare_and_swap_4
+; CHECK-M0: ___sync_val_compare_and_swap_4
+
+; CHECK-BAREMETAL: ldr {{r[0-9]}}, [r0]
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMETAL: ldr {{r[0-9]}}, [r1]
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMETAL: add r0,
+
+  ret i32 %tmp
+}
+
+define void @store_store_release(i32* %mem1, i32 %val1, i32* %mem2, i32 %val2) {
+; CHECK-LABEL: store_store_release
+  store atomic i32 %val1, i32* %mem1 release, align 4
+  store atomic i32 %val2, i32* %mem2 release, align 4
+
+; CHECK: dmb
+; CHECK: str r1, [r0]
+; CHECK: dmb
+; CHECK: str r3, [r2]
+
+; CHECK-M0: ___sync_lock_test_and_set
+; CHECK-M0: ___sync_lock_test_and_set
+
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMETAL: str r1, [r0]
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMETAL: str r3, [r2]
+
+  ret void
+}
+
+define void @load_fence_store_monotonic(i32* %mem1, i32* %mem2) {
+; CHECK-LABEL: load_fence_store_monotonic
+  %val = load atomic i32* %mem1 monotonic, align 4
+  fence seq_cst
+  store atomic i32 %val, i32* %mem2 monotonic, align 4
+
+; CHECK: ldr [[R0:r[0-9]]], [r0]
+; CHECK: dmb
+; CHECK: str [[R0]], [r1]
+
+; CHECK-M0: ldr [[R0:r[0-9]]], [r0]
+; CHECK-M0: dmb
+; CHECK-M0: str [[R0]], [r1]
+
+; CHECK-BAREMETAL: ldr [[R0:r[0-9]]], [r0]
+; CHECK-BAREMETAL-NOT: dmb
+; CHECK-BAREMETAL: str [[R0]], [r1]
+
+  ret void
+}

diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index d75d55d..99c2445 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll

@@ -20,12 +20,16 @@
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A12-NOFPU
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9-mp | FileCheck %s --check-prefix=CORTEX-A9-MP
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 | FileCheck %s --check-prefix=CORTEX-A15
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 | FileCheck %s --check-prefix=CORTEX-A17-DEFAULT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-A17-NOFPU
 ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=CORTEX-M0
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=CORTEX-M3
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-M4-SOFT
 ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-M4-HARD
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2 | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SOFT
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SINGLE
+; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-DOUBLE
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=CORTEX-A57
@@ -38,6 +42,41 @@
 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=default | FileCheck %s --check-prefix=RELOC-OTHER
 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -relocation-model=dynamic-no-pic | FileCheck %s --check-prefix=RELOC-OTHER
 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi | FileCheck %s --check-prefix=RELOC-OTHER
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi | FileCheck %s --check-prefix=PCS-R9-USE
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -arm-reserve-r9 | FileCheck %s --check-prefix=PCS-R9-RESERVE
+
+; ARMv8a (AArch32)
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; ARMv7a
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; ARMv7r
+; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; ARMv7m
+; RUN: llc < %s -mtriple=thumbv7m-none-linux-gnueabi -mcpu=cortex-m3 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv7m-none-linux-gnueabi -mcpu=cortex-m3 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv7m-none-linux-gnueabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; ARMv6
+; RUN: llc < %s -mtriple=armv6-none-netbsd-gnueabi -mcpu=arm1136j-s | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv6-none-linux-gnueabi -mcpu=arm1136j-s | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv6-none-linux-gnueabi -mcpu=arm1136j-s -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv6-none-linux-gnueabi -mcpu=arm1136j-s -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; ARMv6m
+; RUN: llc < %s -mtriple=thumb-none-linux-gnueabi -arm-no-strict-align -mcpu=cortex-m0 | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumb-none-linux-gnueabi -arm-strict-align -mcpu=cortex-m0 | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv6m-none-linux-gnueabi -arm-no-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv6m-none-linux-gnueabi -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumb-none-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumb-none-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=STRICT-ALIGN
+; ARMv5
+; RUN: llc < %s -mtriple=armv5-none-linux-gnueabi -mcpu=arm1022e -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv5-none-linux-gnueabi -mcpu=arm1022e -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=armv5-none-linux-gnueabi -mcpu=arm1022e | FileCheck %s --check-prefix=STRICT-ALIGN
 
 ; XSCALE:      .eabi_attribute 6, 5
 ; XSCALE:      .eabi_attribute 8, 1
@@ -132,6 +171,10 @@
 ; V8-FPARMv8-NEON-CRYPTO: .fpu crypto-neon-fp-armv8
 ; V8-FPARMv8-NEON-CRYPTO: .eabi_attribute 12, 3
 
+; Tag_CPU_unaligned_access
+; NO-STRICT-ALIGN: .eabi_attribute 34, 1
+; STRICT-ALIGN: .eabi_attribute 34, 0
+
 ; Tag_CPU_arch	'ARMv7'
 ; CORTEX-A7-CHECK: .eabi_attribute	6, 10
 ; CORTEX-A7-NOFPU: .eabi_attribute	6, 10
@@ -257,7 +300,7 @@
 ; CORTEX-A9-SOFT-NOT:  .eabi_attribute 27
 ; CORTEX-A9-SOFT-NOT:  .eabi_attribute 28
 ; CORTEX-A9-SOFT:  .eabi_attribute 36, 1
-; CORTEX-A9-SOFT-NOT:  .eabi_attribute 42
+; CORTEX-A9-SOFT:  .eabi_attribute 42, 1
 ; CORTEX-A9-SOFT:  .eabi_attribute 68, 1
 
 ; CORTEX-A9-HARD:  .cpu cortex-a9
@@ -274,26 +317,9 @@
 ; CORTEX-A9-HARD-NOT:  .eabi_attribute 27
 ; CORTEX-A9-HARD:  .eabi_attribute 28, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 36, 1
-; CORTEX-A9-HARD-NOT:  .eabi_attribute 42
+; CORTEX-A9-HARD:  .eabi_attribute 42, 1
 ; CORTEX-A9-HARD:  .eabi_attribute 68, 1
 
-; CORTEX-A9-MP:  .cpu cortex-a9-mp
-; CORTEX-A9-MP:  .eabi_attribute 6, 10
-; CORTEX-A9-MP:  .eabi_attribute 7, 65
-; CORTEX-A9-MP:  .eabi_attribute 8, 1
-; CORTEX-A9-MP:  .eabi_attribute 9, 2
-; CORTEX-A9-MP:  .fpu neon
-; CORTEX-A9-MP:  .eabi_attribute 20, 1
-; CORTEX-A9-MP:  .eabi_attribute 21, 1
-; CORTEX-A9-MP:  .eabi_attribute 23, 3
-; CORTEX-A9-MP:  .eabi_attribute 24, 1
-; CORTEX-A9-MP:  .eabi_attribute 25, 1
-; CORTEX-A9-MP-NOT:  .eabi_attribute 27
-; CORTEX-A9-MP-NOT:  .eabi_attribute 28
-; CORTEX-A9-MP:  .eabi_attribute 36, 1
-; CORTEX-A9-MP:  .eabi_attribute 42, 1
-; CORTEX-A9-MP:  .eabi_attribute 68, 1
-
 ; CORTEX-A12-DEFAULT:  .cpu cortex-a12
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 6, 10
 ; CORTEX-A12-DEFAULT:  .eabi_attribute 7, 65
@@ -342,6 +368,36 @@
 ; CORTEX-A15: .eabi_attribute 44, 2
 ; CORTEX-A15: .eabi_attribute 68, 3
 
+; CORTEX-A17-DEFAULT:  .cpu cortex-a17
+; CORTEX-A17-DEFAULT:  .eabi_attribute 6, 10
+; CORTEX-A17-DEFAULT:  .eabi_attribute 7, 65
+; CORTEX-A17-DEFAULT:  .eabi_attribute 8, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 9, 2
+; CORTEX-A17-DEFAULT:  .fpu neon-vfpv4
+; CORTEX-A17-DEFAULT:  .eabi_attribute 20, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 21, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 23, 3
+; CORTEX-A17-DEFAULT:  .eabi_attribute 24, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 25, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 42, 1
+; CORTEX-A17-DEFAULT:  .eabi_attribute 44, 2
+; CORTEX-A17-DEFAULT:  .eabi_attribute 68, 3
+
+; CORTEX-A17-NOFPU:  .cpu cortex-a17
+; CORTEX-A17-NOFPU:  .eabi_attribute 6, 10
+; CORTEX-A17-NOFPU:  .eabi_attribute 7, 65
+; CORTEX-A17-NOFPU:  .eabi_attribute 8, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 9, 2
+; CORTEX-A17-NOFPU-NOT:  .fpu
+; CORTEX-A17-NOFPU:  .eabi_attribute 20, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 21, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 23, 3
+; CORTEX-A17-NOFPU:  .eabi_attribute 24, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 25, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 42, 1
+; CORTEX-A17-NOFPU:  .eabi_attribute 44, 2
+; CORTEX-A17-NOFPU:  .eabi_attribute 68, 3
+
 ; CORTEX-M0:  .cpu cortex-m0
 ; CORTEX-M0:  .eabi_attribute 6, 12
 ; CORTEX-M0-NOT:  .eabi_attribute 7
@@ -408,6 +464,26 @@
 ; CORTEX-M4-HARD-NOT:  .eabi_attribute 44
 ; CORTEX-M4-HARD-NOT:  .eabi_attribute 68
 
+; CORTEX-M7:  .cpu    cortex-m7
+; CORTEX-M7:  .eabi_attribute 6, 13
+; CORTEX-M7:  .eabi_attribute 7, 77
+; CORTEX-M7:  .eabi_attribute 8, 0
+; CORTEX-M7:  .eabi_attribute 9, 2
+; CORTEX-M7-SOFT-NOT: .fpu
+; CORTEX-M7-SINGLE:  .fpu fpv5-d16
+; CORTEX-M7-DOUBLE:  .fpu fpv5-d16
+; CORTEX-M7:  .eabi_attribute 17, 1
+; CORTEX-M7:  .eabi_attribute 20, 1
+; CORTEX-M7:  .eabi_attribute 21, 1
+; CORTEX-M7:  .eabi_attribute 23, 3
+; CORTEX-M7:  .eabi_attribute 24, 1
+; CORTEX-M7:  .eabi_attribute 25, 1
+; CORTEX-M7-SOFT-NOT: .eabi_attribute 27
+; CORTEX-M7-SINGLE:  .eabi_attribute 27, 1
+; CORTEX-M7-DOUBLE-NOT: .eabi_attribute 27
+; CORTEX-M7:  .eabi_attribute 36, 1
+; CORTEX-M7:  .eabi_attribute 14, 0
+
 ; CORTEX-R5:  .cpu cortex-r5
 ; CORTEX-R5:  .eabi_attribute 6, 10
 ; CORTEX-R5:  .eabi_attribute 7, 82
@@ -463,6 +539,9 @@
 ; RELOC-PIC:  .eabi_attribute 17, 2
 ; RELOC-OTHER:  .eabi_attribute 17, 1
 
+; PCS-R9-USE:  .eabi_attribute 14, 0
+; PCS-R9-RESERVE:  .eabi_attribute 14, 3
+
 define i32 @f(i64 %z) {
 	ret i32 0
 }

diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll
index e344b08..7ea9be2 100644
--- a/test/CodeGen/ARM/carry.ll
+++ b/test/CodeGen/ARM/carry.ll

@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s
 
 define i64 @f1(i64 %a, i64 %b) {
 ; CHECK-LABEL: f1:

diff --git a/test/CodeGen/ARM/coalesce-dbgvalue.ll b/test/CodeGen/ARM/coalesce-dbgvalue.ll
index 606c9bc..47d81a6 100644
--- a/test/CodeGen/ARM/coalesce-dbgvalue.ll
+++ b/test/CodeGen/ARM/coalesce-dbgvalue.ll

@@ -27,11 +27,11 @@
 
 for.body2:                                        ; preds = %for.cond1
   store i32 %storemerge11, i32* @b, align 4, !dbg !26
-  tail call void @llvm.dbg.value(metadata !27, i64 0, metadata !11), !dbg !28
+  tail call void @llvm.dbg.value(metadata !27, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !28
   %0 = load i64* @a, align 8, !dbg !29
   %xor = xor i64 %0, %e.1.ph, !dbg !29
   %conv3 = trunc i64 %xor to i32, !dbg !29
-  tail call void @llvm.dbg.value(metadata !{i32 %conv3}, i64 0, metadata !10), !dbg !29
+  tail call void @llvm.dbg.value(metadata !{i32 %conv3}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !29
   %tobool4 = icmp eq i32 %conv3, 0, !dbg !29
   br i1 %tobool4, label %land.end, label %land.rhs, !dbg !29
 
@@ -69,7 +69,7 @@
 declare i32 @fn3(...) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -79,33 +79,33 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 182024) (llvm/trunk 182023)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/d/b/pr16110.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 182024) (llvm/trunk 182023)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2} ; [ DW_TAG_compile_unit ] [/d/b/pr16110.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"pr16110.c", metadata !"/d/b"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"pr16110", metadata !"pr16110", metadata !"", i32 7, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @pr16110, null, null, metadata !9, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [pr16110]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/d/b/pr16110.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00pr16110\00pr16110\00\007\000\001\000\006\000\001\007", metadata !1, metadata !5, metadata !6, null, i32 ()* @pr16110, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 7] [def] [pr16110]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/d/b/pr16110.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10, metadata !11}
-!10 = metadata !{i32 786688, metadata !4, metadata !"e", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [e] [line 8]
-!11 = metadata !{i32 786688, metadata !12, metadata !"f", metadata !5, i32 13, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [f] [line 13]
-!12 = metadata !{i32 786443, metadata !1, metadata !13, i32 12, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
-!13 = metadata !{i32 786443, metadata !1, metadata !4, i32 12, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
-!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
+!10 = metadata !{metadata !"0x100\00e\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [e] [line 8]
+!11 = metadata !{metadata !"0x100\00f\0013\000", metadata !12, metadata !5, metadata !14} ; [ DW_TAG_auto_variable ] [f] [line 13]
+!12 = metadata !{metadata !"0xb\0012\000\002", metadata !1, metadata !13} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
+!13 = metadata !{metadata !"0xb\0012\000\001", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
+!14 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
 !15 = metadata !{metadata !16, metadata !18, metadata !19, metadata !20}
-!16 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !5, i32 1, metadata !17, i32 0, i32 1, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!17 = metadata !{i32 786468, null, null, metadata !"long long int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long long int] [line 0, size 64, align 32, offset 0, enc DW_ATE_signed]
-!18 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !5, i32 2, metadata !8, i32 0, i32 1, i32* @b, null} ; [ DW_TAG_variable ] [b] [line 2] [def]
-!19 = metadata !{i32 786484, i32 0, null, metadata !"c", metadata !"c", metadata !"", metadata !5, i32 3, metadata !8, i32 0, i32 1, i32* @c, null} ; [ DW_TAG_variable ] [c] [line 3] [def]
-!20 = metadata !{i32 786484, i32 0, null, metadata !"d", metadata !"d", metadata !"", metadata !5, i32 4, metadata !8, i32 0, i32 1, i32* @d, null} ; [ DW_TAG_variable ] [d] [line 4] [def]
+!16 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !5, metadata !17, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!17 = metadata !{metadata !"0x24\00long long int\000\0064\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [long long int] [line 0, size 64, align 32, offset 0, enc DW_ATE_signed]
+!18 = metadata !{metadata !"0x34\00b\00b\00\002\000\001", null, metadata !5, metadata !8, i32* @b, null} ; [ DW_TAG_variable ] [b] [line 2] [def]
+!19 = metadata !{metadata !"0x34\00c\00c\00\003\000\001", null, metadata !5, metadata !8, i32* @c, null} ; [ DW_TAG_variable ] [c] [line 3] [def]
+!20 = metadata !{metadata !"0x34\00d\00d\00\004\000\001", null, metadata !5, metadata !8, i32* @d, null} ; [ DW_TAG_variable ] [d] [line 4] [def]
 !21 = metadata !{i32 10, i32 0, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !1, metadata !4, i32 10, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
+!22 = metadata !{metadata !"0xb\0010\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
 !26 = metadata !{i32 12, i32 0, metadata !13, null}
 !27 = metadata !{i32* null}
 !28 = metadata !{i32 13, i32 0, metadata !12, null}
 !29 = metadata !{i32 14, i32 0, metadata !12, null}
 !31 = metadata !{i32 16, i32 0, metadata !4, null}
 !32 = metadata !{i32 18, i32 0, metadata !4, null}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/constant-islands.ll b/test/CodeGen/ARM/constant-islands.ll
new file mode 100644
index 0000000..afa4b85
--- /dev/null
+++ b/test/CodeGen/ARM/constant-islands.ll

@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf -O0 -fast-isel=0 -o - %s | FileCheck %s
+
+define void @test_no_duplicate_branches(float %in) {
+; CHECK-LABEL: test_no_duplicate_branches:
+; CHECK: vldr {{s[0-9]+}}, [[CONST:\.LCPI[0-9]+_[0-9]+]]
+; CHECK: b .LBB
+; CHECK-NOT: b .LBB
+; CHECK: [[CONST]]:
+; CHECK-NEXT: .long 1150963712
+
+  %tst = fcmp oeq float %in, 1234.5
+
+  %chain = zext i1 %tst to i32
+
+  br i1 %tst, label %true, label %false
+
+true:
+  call i32 @llvm.arm.space(i32 2000, i32 undef)
+  ret void
+
+false:
+  ret void
+}
+
+declare i32 @llvm.arm.space(i32, i32)

diff --git a/test/CodeGen/ARM/copy-cpsr.ll b/test/CodeGen/ARM/copy-cpsr.ll
new file mode 100644
index 0000000..8b7dc03
--- /dev/null
+++ b/test/CodeGen/ARM/copy-cpsr.ll

@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=armv7s-apple-ios7.0 -show-mc-encoding %s -o - | FileCheck %s --check-prefix=CHECK-ARM
+; RUN: llc -mtriple=thumbv7s-apple-ios7.0 -show-mc-encoding %s -o - | FileCheck %s --check-prefix=CHECK-THUMB
+; RUN: llc -mtriple=thumbv7m-none-eabi -show-mc-encoding %s -o - | FileCheck %s --check-prefix=CHECK-THUMB
+
+; In the ARM backend, most compares are glued to their uses so CPSR can't
+; escape. However, for long ADCS chains (and last ditch fallback) the dependency
+; is carried in the DAG because duplicating them can be more expensive than
+; copying CPSR.
+
+; Crafting a test for this was a little tricky, in case it breaks here are some
+; notes on what I was trying to achieve:
+;   + We want 2 long ADCS chains
+;   + We want them to split after an initial common prefix (so that a single
+;     CPSR is used twice).
+;   + We want both chains to write CPSR post-split (so that the copy can't be
+;     elided).
+;   + We want the chains to be long enough that duplicating them is expensive.
+
+define void @test_copy_cpsr(i128 %lhs, i128 %rhs, i128* %addr) {
+; CHECK-ARM: test_copy_cpsr:
+; CHECK-THUMB: test_copy_cpsr:
+
+; CHECK-ARM: mrs [[TMP:r[0-9]+]], apsr @ encoding: [0x00,0x{{[0-9a-f]}}0,0x0f,0xe1]
+; CHECK-ARM: msr APSR_nzcvq, [[TMP]] @ encoding: [0x0{{[0-9a-f]}},0xf0,0x28,0xe1]
+
+  ; In Thumb mode v7M and v7AR have different MRS/MSR instructions that happen
+  ; to overlap for the apsr case, so it's definitely worth checking both.
+; CHECK-THUMB: mrs [[TMP:r[0-9]+]], apsr @ encoding: [0xef,0xf3,0x00,0x8{{[0-9a-f]}}]
+; CHECK-THUMB: msr {{APSR|apsr}}_nzcvq, [[TMP]] @ encoding: [0x8{{[0-9a-f]}},0xf3,0x00,0x88]
+
+  %sum = add i128 %lhs, %rhs
+  store volatile i128 %sum, i128* %addr
+
+  %rhs2.tmp1 = trunc i128 %rhs to i64
+  %rhs2 = zext i64 %rhs2.tmp1 to i128
+
+  %sum2 = add i128 %lhs, %rhs2
+  store volatile i128 %sum2, i128* %addr
+
+  ret void
+}

diff --git a/test/CodeGen/ARM/darwin-eabi.ll b/test/CodeGen/ARM/darwin-eabi.ll
index f2cde71..5301c0b 100644
--- a/test/CodeGen/ARM/darwin-eabi.ll
+++ b/test/CodeGen/ARM/darwin-eabi.ll

@@ -7,7 +7,7 @@
   %sum = fadd float %lhs, %rhs
   ret float %sum
 ; CHECK-M3-LABEL: float_op:
-; CHECK-M3: blx ___addsf3
+; CHECK-M3: bl ___addsf3
 
 ; CHECK-M4-LABEL: float_op:
 ; CHECK-M4: vadd.f32
@@ -17,8 +17,8 @@
   %sum = fadd double %lhs, %rhs
   ret double %sum
 ; CHECK-M3-LABEL: double_op:
-; CHECK-M3: blx ___adddf3
+; CHECK-M3: bl ___adddf3
 
 ; CHECK-M4-LABEL: double_op:
-; CHECK-M4: blx ___adddf3
+; CHECK-M4: {{(blx|b.w)}} ___adddf3
 }

diff --git a/test/CodeGen/ARM/dbg.ll b/test/CodeGen/ARM/dbg.ll
new file mode 100644
index 0000000..8bce1a6
--- /dev/null
+++ b/test/CodeGen/ARM/dbg.ll

@@ -0,0 +1,13 @@
+; RUN: llc -mtriple armv8-eabi -mcpu=cortex-a57 -o - %s | FileCheck %s
+; RUN: llc -mtriple thumbv8-eabi -mcpu=cortex-a57 -o - %s | FileCheck %s
+
+define void @hint_dbg() {
+entry:
+  call void @llvm.arm.dbg(i32 0)
+  ret void
+}
+
+declare void @llvm.arm.dbg(i32)
+
+; CHECK: dbg #0
+

diff --git a/test/CodeGen/ARM/debug-frame-large-stack.ll b/test/CodeGen/ARM/debug-frame-large-stack.ll
index 5bafce9..1addf63 100644
--- a/test/CodeGen/ARM/debug-frame-large-stack.ll
+++ b/test/CodeGen/ARM/debug-frame-large-stack.ll

@@ -1,28 +1,11 @@
-; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-none-eabi -disable-fp-elim| FileCheck %s --check-prefix=CHECK-ARM
-; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-none-eabi | FileCheck %s --check-prefix=CHECK-ARM-FP-ELIM
+; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-netbsd-eabi -disable-fp-elim| FileCheck %s --check-prefix=CHECK-ARM
+; RUN: llc -filetype=asm -o - < %s -mtriple arm-arm-netbsd-eabi | FileCheck %s --check-prefix=CHECK-ARM-FP-ELIM
 
 define void @test1() {
     %tmp = alloca [ 64 x i32 ] , align 4
     ret void
 }
 
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!8, !9}
-!llvm.ident = !{!10}
-
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/large.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"large.c", metadata !"/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test1", metadata !"test1", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @test1, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [test1]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/large.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!10 = metadata !{metadata !"clang version 3.5 "}
-!11 = metadata !{i32 2, i32 0, metadata !4, null}
-
 ; CHECK-ARM-LABEL: test1:
 ; CHECK-ARM: .cfi_startproc
 ; CHECK-ARM: sub    sp, sp, #256

diff --git a/test/CodeGen/ARM/debug-frame-vararg.ll b/test/CodeGen/ARM/debug-frame-vararg.ll
index 42ff82d..ffc1a6a 100644
--- a/test/CodeGen/ARM/debug-frame-vararg.ll
+++ b/test/CodeGen/ARM/debug-frame-vararg.ll

@@ -25,37 +25,37 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"var.c", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"sum", metadata !"sum", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, ...)* @sum, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00sum\00sum\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, i32 (i32, ...)* @sum, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5 "}
-!12 = metadata !{i32 786689, metadata !4, metadata !"count", metadata !5, i32 16777221, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [count] [line 5]
+!12 = metadata !{metadata !"0x101\00count\0016777221\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [count] [line 5]
 !13 = metadata !{i32 5, i32 0, metadata !4, null}
-!14 = metadata !{i32 786688, metadata !4, metadata !"vl", metadata !5, i32 6, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vl] [line 6]
-!15 = metadata !{i32 786454, metadata !16, null, metadata !"va_list", i32 30, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
+!14 = metadata !{metadata !"0x100\00vl\006\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [vl] [line 6]
+!15 = metadata !{metadata !"0x16\00va_list\0030\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
 !16 = metadata !{metadata !"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", metadata !"/tmp"}
-!17 = metadata !{i32 786454, metadata !1, null, metadata !"__builtin_va_list", i32 6, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
-!18 = metadata !{i32 786451, metadata !1, null, metadata !"__va_list", i32 6, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
+!17 = metadata !{metadata !"0x16\00__builtin_va_list\006\000\000\000\000", metadata !1, null, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
+!18 = metadata !{metadata !"0x13\00__va_list\006\0032\0032\000\000\000", metadata !1, null, null, metadata !19, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
 !19 = metadata !{metadata !20}
-!20 = metadata !{i32 786445, metadata !1, metadata !18, metadata !"__ap", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
-!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
+!20 = metadata !{metadata !"0xd\00__ap\006\0032\0032\000\000", metadata !1, metadata !18, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
+!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
 !22 = metadata !{i32 6, i32 0, metadata !4, null}
 !23 = metadata !{i32 7, i32 0, metadata !4, null}
-!24 = metadata !{i32 786688, metadata !4, metadata !"sum", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 8]
-!25 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
-!26 = metadata !{i32 786688, metadata !27, metadata !"i", metadata !5, i32 9, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 9]
-!27 = metadata !{i32 786443, metadata !1, metadata !4, i32 9, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!24 = metadata !{metadata !"0x100\00sum\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [sum] [line 8]
+!25 = metadata !{i32 8, i32 0, metadata !4, null}
+!26 = metadata !{metadata !"0x100\00i\009\000", metadata !27, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 9]
+!27 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
 !28 = metadata !{i32 9, i32 0, metadata !27, null}
 !29 = metadata !{i32 10, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !1, metadata !27, i32 9, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!30 = metadata !{metadata !"0xb\009\000\001", metadata !1, metadata !27} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
 !31 = metadata !{i32 11, i32 0, metadata !30, null}
 !32 = metadata !{i32 12, i32 0, metadata !4, null}
 !33 = metadata !{i32 13, i32 0, metadata !4, null}

diff --git a/test/CodeGen/ARM/debug-frame.ll b/test/CodeGen/ARM/debug-frame.ll
index cb54aa8..c6243ec 100644
--- a/test/CodeGen/ARM/debug-frame.ll
+++ b/test/CodeGen/ARM/debug-frame.ll

@@ -128,37 +128,37 @@
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/exp.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/exp.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"exp.cpp", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"_Z4testiiiiiddddd", i32 4, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32, i32, i32, i32, i32, double, double, double, double, double)* @_Z4testiiiiiddddd, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 5] [test]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/exp.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00test\00test\00_Z4testiiiiiddddd\004\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, void (i32, i32, i32, i32, i32, double, double, double, double, double)* @_Z4testiiiiiddddd, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 5] [test]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/exp.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8, metadata !8, metadata !8, metadata !8, metadata !8, metadata !9, metadata !9, metadata !9, metadata !9, metadata !9}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
 !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !12 = metadata !{metadata !"clang version 3.5 "}
-!13 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!13 = metadata !{metadata !"0x101\00a\0016777220\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
 !14 = metadata !{i32 4, i32 0, metadata !4, null}
-!15 = metadata !{i32 786689, metadata !4, metadata !"b", metadata !5, i32 33554436, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 4]
-!16 = metadata !{i32 786689, metadata !4, metadata !"c", metadata !5, i32 50331652, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 4]
-!17 = metadata !{i32 786689, metadata !4, metadata !"d", metadata !5, i32 67108868, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [d] [line 4]
-!18 = metadata !{i32 786689, metadata !4, metadata !"e", metadata !5, i32 83886084, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [e] [line 4]
-!19 = metadata !{i32 786689, metadata !4, metadata !"m", metadata !5, i32 100663301, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [m] [line 5]
+!15 = metadata !{metadata !"0x101\00b\0033554436\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 4]
+!16 = metadata !{metadata !"0x101\00c\0050331652\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [c] [line 4]
+!17 = metadata !{metadata !"0x101\00d\0067108868\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [d] [line 4]
+!18 = metadata !{metadata !"0x101\00e\0083886084\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [e] [line 4]
+!19 = metadata !{metadata !"0x101\00m\00100663301\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [m] [line 5]
 !20 = metadata !{i32 5, i32 0, metadata !4, null}
-!21 = metadata !{i32 786689, metadata !4, metadata !"n", metadata !5, i32 117440517, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [n] [line 5]
-!22 = metadata !{i32 786689, metadata !4, metadata !"p", metadata !5, i32 134217733, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 5]
-!23 = metadata !{i32 786689, metadata !4, metadata !"q", metadata !5, i32 150994949, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [q] [line 5]
-!24 = metadata !{i32 786689, metadata !4, metadata !"r", metadata !5, i32 167772165, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 5]
+!21 = metadata !{metadata !"0x101\00n\00117440517\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [n] [line 5]
+!22 = metadata !{metadata !"0x101\00p\00134217733\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [p] [line 5]
+!23 = metadata !{metadata !"0x101\00q\00150994949\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [q] [line 5]
+!24 = metadata !{metadata !"0x101\00r\00167772165\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [r] [line 5]
 !25 = metadata !{i32 7, i32 0, metadata !26, null}
-!26 = metadata !{i32 786443, metadata !1, metadata !4, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
-!27 = metadata !{i32 8, i32 0, metadata !26, null} ; [ DW_TAG_imported_declaration ]
+!26 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
+!27 = metadata !{i32 8, i32 0, metadata !26, null}
 !28 = metadata !{i32 11, i32 0, metadata !26, null}
 !29 = metadata !{i32 9, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !1, metadata !4, i32 8, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
+!30 = metadata !{metadata !"0xb\008\000\001", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/exp.cpp]
 !31 = metadata !{i32 10, i32 0, metadata !30, null}
 !32 = metadata !{i32 10, i32 0, metadata !4, null}
 !33 = metadata !{i32 11, i32 0, metadata !4, null}

diff --git a/test/CodeGen/ARM/debug-info-arg.ll b/test/CodeGen/ARM/debug-info-arg.ll
index 31d0324..34e9938 100644
--- a/test/CodeGen/ARM/debug-info-arg.ll
+++ b/test/CodeGen/ARM/debug-info-arg.ll

@@ -7,13 +7,13 @@
 %struct.tag_s = type { i32, i32, i32 }
 
 define void @foo(%struct.tag_s* nocapture %this, %struct.tag_s* %c, i64 %x, i64 %y, %struct.tag_s* nocapture %ptr1, %struct.tag_s* nocapture %ptr2) nounwind ssp {
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %this}, i64 0, metadata !5), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %c}, i64 0, metadata !13), !dbg !21
-  tail call void @llvm.dbg.value(metadata !{i64 %x}, i64 0, metadata !14), !dbg !22
-  tail call void @llvm.dbg.value(metadata !{i64 %y}, i64 0, metadata !17), !dbg !23
+  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %this}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %c}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i64 %x}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !22
+  tail call void @llvm.dbg.value(metadata !{i64 %y}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !23
 ;CHECK:	@DEBUG_VALUE: foo:y <- [R7+8]
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr1}, i64 0, metadata !18), !dbg !24
-  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr2}, i64 0, metadata !19), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr1}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !24
+  tail call void @llvm.dbg.value(metadata !{%struct.tag_s* %ptr2}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !25
   %1 = icmp eq %struct.tag_s* %c, null, !dbg !26
   br i1 %1, label %3, label %2, !dbg !26
 
@@ -27,31 +27,31 @@
 
 declare void @foobar(i64, i64)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786449, metadata !32, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !30, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !2, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, metadata !31, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [foo]
-!2 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !32, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", metadata !32, metadata !4, metadata !4, metadata !30, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00\0011\000\001\000\006\00256\001\0011", metadata !2, metadata !2, metadata !3, null, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, metadata !31} ; [ DW_TAG_subprogram ] [line 11] [def] [foo]
+!2 = metadata !{metadata !"0x29", metadata !32} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !32, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !2, i32 16777227, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!6 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786451, metadata !32, metadata !0, metadata !"tag_s", i32 5, i64 96, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [tag_s] [line 5, size 96, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x101\00this\0016777227\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!6 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !7} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0x13\00tag_s\005\0096\0032\000\000\000", metadata !32, metadata !0, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [tag_s] [line 5, size 96, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !11, metadata !12}
-!9 = metadata !{i32 786445, metadata !32, metadata !7, metadata !"x", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786445, metadata !32, metadata !7, metadata !"y", i32 7, i64 32, i64 32, i64 32, i32 0, metadata !10} ; [ DW_TAG_member ]
-!12 = metadata !{i32 786445, metadata !32, metadata !7, metadata !"z", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554443, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 50331659, metadata !15, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 786454, metadata !32, metadata !0, metadata !"UInt64", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !16} ; [ DW_TAG_typedef ]
-!16 = metadata !{i32 786468, null, metadata !0, metadata !"long long unsigned int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!17 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 67108875, metadata !15, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786689, metadata !1, metadata !"ptr1", metadata !2, i32 83886091, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786689, metadata !1, metadata !"ptr2", metadata !2, i32 100663307, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0xd\00x\006\0032\0032\000\000", metadata !32, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0xd\00y\007\0032\0032\0032\000", metadata !32, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0xd\00z\008\0032\0032\0064\000", metadata !32, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!13 = metadata !{metadata !"0x101\00c\0033554443\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!14 = metadata !{metadata !"0x101\00x\0050331659\000", metadata !1, metadata !2, metadata !15} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x16\00UInt64\001\000\000\000\000", metadata !32, metadata !0, metadata !16} ; [ DW_TAG_typedef ]
+!16 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0x101\00y\0067108875\000", metadata !1, metadata !2, metadata !15} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x101\00ptr1\0083886091\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x101\00ptr2\00100663307\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
 !20 = metadata !{i32 11, i32 24, metadata !1, null}
 !21 = metadata !{i32 11, i32 44, metadata !1, null}
 !22 = metadata !{i32 11, i32 54, metadata !1, null}
@@ -59,10 +59,10 @@
 !24 = metadata !{i32 11, i32 81, metadata !1, null}
 !25 = metadata !{i32 11, i32 101, metadata !1, null}
 !26 = metadata !{i32 12, i32 3, metadata !27, null}
-!27 = metadata !{i32 786443, metadata !2, metadata !1, i32 11, i32 107, i32 0} ; [ DW_TAG_lexical_block ]
+!27 = metadata !{metadata !"0xb\0011\00107\000", metadata !2, metadata !1} ; [ DW_TAG_lexical_block ]
 !28 = metadata !{i32 13, i32 5, metadata !27, null}
 !29 = metadata !{i32 14, i32 1, metadata !27, null}
 !30 = metadata !{metadata !1}
 !31 = metadata !{metadata !5, metadata !13, metadata !14, metadata !17, metadata !18, metadata!19}
 !32 = metadata !{metadata !"one.c", metadata !"/Volumes/Athwagate/R10048772"}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll
index 5ad5e59..3623927 100644
--- a/test/CodeGen/ARM/debug-info-blocks.ll
+++ b/test/CodeGen/ARM/debug-info-blocks.ll

@@ -19,11 +19,11 @@
 @"OBJC_IVAR_$_MyWork._data" = external hidden global i32, section "__DATA, __objc_const", align 4
 @"\01L_OBJC_SELECTOR_REFERENCES_222" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i8* @objc_msgSend(i8*, i8*, ...)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
 
@@ -31,22 +31,22 @@
   %1 = alloca %0*, align 4
   %bounds = alloca %struct.CR, align 4
   %data = alloca %struct.CR, align 4
-  call void @llvm.dbg.value(metadata !{i8* %.block_descriptor}, i64 0, metadata !27), !dbg !129
+  call void @llvm.dbg.value(metadata !{i8* %.block_descriptor}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !129
   store %0* %loadedMydata, %0** %1, align 4
-  call void @llvm.dbg.declare(metadata !{%0** %1}, metadata !130), !dbg !131
+  call void @llvm.dbg.declare(metadata !{%0** %1}, metadata !130, metadata !{metadata !"0x102"}), !dbg !131
   %2 = bitcast %struct.CR* %bounds to %1*
   %3 = getelementptr %1* %2, i32 0, i32 0
   store [4 x i32] %bounds.coerce0, [4 x i32]* %3
-  call void @llvm.dbg.declare(metadata !{%struct.CR* %bounds}, metadata !132), !dbg !133
+  call void @llvm.dbg.declare(metadata !{%struct.CR* %bounds}, metadata !132, metadata !{metadata !"0x102"}), !dbg !133
   %4 = bitcast %struct.CR* %data to %1*
   %5 = getelementptr %1* %4, i32 0, i32 0
   store [4 x i32] %data.coerce0, [4 x i32]* %5
-  call void @llvm.dbg.declare(metadata !{%struct.CR* %data}, metadata !134), !dbg !135
+  call void @llvm.dbg.declare(metadata !{%struct.CR* %data}, metadata !134, metadata !{metadata !"0x102"}), !dbg !135
   %6 = bitcast i8* %.block_descriptor to %2*
   %7 = getelementptr inbounds %2* %6, i32 0, i32 6
-  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !136), !dbg !137
-  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !138), !dbg !137
-  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !139), !dbg !140
+  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !136, metadata !163), !dbg !137
+  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !138, metadata !164), !dbg !137
+  call void @llvm.dbg.declare(metadata !{%2* %6}, metadata !139, metadata !165), !dbg !140
   %8 = load %0** %1, align 4, !dbg !141
   %9 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_13", !dbg !141
   %10 = bitcast %0* %8 to i8*, !dbg !141
@@ -95,149 +95,149 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!162}
 
-!0 = metadata !{i32 786449, metadata !153, i32 16, metadata !"Apple clang version 2.1", i1 false, metadata !"", i32 2, metadata !147, metadata !26, metadata !148, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786436, metadata !160, metadata !0, metadata !"", i32 248, i64 32, i64 32, i32 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 248, size 32, align 32, offset 0] [def] [from ]
-!2 = metadata !{i32 786473, metadata !160} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x11\0016\00Apple clang version 2.1\000\00\002\00\001", metadata !153, metadata !147, metadata !26, metadata !148, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x4\00\00248\0032\0032\000\000\000", metadata !160, metadata !0, null, metadata !3, null, null, null} ; [ DW_TAG_enumeration_type ] [line 248, size 32, align 32, offset 0] [def] [from ]
+!2 = metadata !{metadata !"0x29", metadata !160} ; [ DW_TAG_file_type ]
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786472, metadata !"Ver1", i64 0} ; [ DW_TAG_enumerator ]
-!5 = metadata !{i32 786436, metadata !160, metadata !0, metadata !"Mode", i32 79, i64 32, i64 32, i32 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [Mode] [line 79, size 32, align 32, offset 0] [def] [from ]
-!6 = metadata !{i32 786473, metadata !161} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !"0x28\00Ver1\000"} ; [ DW_TAG_enumerator ]
+!5 = metadata !{metadata !"0x4\00Mode\0079\0032\0032\000\000\000", metadata !160, metadata !0, null, metadata !7, null, null, null} ; [ DW_TAG_enumeration_type ] [Mode] [line 79, size 32, align 32, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x29", metadata !161} ; [ DW_TAG_file_type ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786472, metadata !"One", i64 0} ; [ DW_TAG_enumerator ]
-!9 = metadata !{i32 786436, metadata !149, metadata !0, metadata !"", i32 15, i64 32, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 15, size 32, align 32, offset 0] [def] [from ]
-!10 = metadata !{i32 786473, metadata !149} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x28\00One\000"} ; [ DW_TAG_enumerator ]
+!9 = metadata !{metadata !"0x4\00\0015\0032\0032\000\000\000", metadata !149, metadata !0, null, metadata !11, null, null, null} ; [ DW_TAG_enumeration_type ] [line 15, size 32, align 32, offset 0] [def] [from ]
+!10 = metadata !{metadata !"0x29", metadata !149} ; [ DW_TAG_file_type ]
 !11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786472, metadata !"Unknown", i64 0} ; [ DW_TAG_enumerator ]
-!13 = metadata !{i32 786472, metadata !"Known", i64 1} ; [ DW_TAG_enumerator ]
-!14 = metadata !{i32 786436, metadata !150, metadata !0, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
-!15 = metadata !{i32 786473, metadata !150} ; [ DW_TAG_file_type ]
+!12 = metadata !{metadata !"0x28\00Unknown\000"} ; [ DW_TAG_enumerator ]
+!13 = metadata !{metadata !"0x28\00Known\001"} ; [ DW_TAG_enumerator ]
+!14 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !150, metadata !0, null, metadata !16, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!15 = metadata !{metadata !"0x29", metadata !150} ; [ DW_TAG_file_type ]
 !16 = metadata !{metadata !17, metadata !18}
-!17 = metadata !{i32 786472, metadata !"Single", i64 0} ; [ DW_TAG_enumerator ]
-!18 = metadata !{i32 786472, metadata !"Double", i64 1} ; [ DW_TAG_enumerator ]
-!19 = metadata !{i32 786436, metadata !151, metadata !0, metadata !"", i32 14, i64 32, i64 32, i32 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 14, size 32, align 32, offset 0] [def] [from ]
-!20 = metadata !{i32 786473, metadata !151} ; [ DW_TAG_file_type ]
+!17 = metadata !{metadata !"0x28\00Single\000"} ; [ DW_TAG_enumerator ]
+!18 = metadata !{metadata !"0x28\00Double\001"} ; [ DW_TAG_enumerator ]
+!19 = metadata !{metadata !"0x4\00\0014\0032\0032\000\000\000", metadata !151, metadata !0, null, metadata !21, null, null, null} ; [ DW_TAG_enumeration_type ] [line 14, size 32, align 32, offset 0] [def] [from ]
+!20 = metadata !{metadata !"0x29", metadata !151} ; [ DW_TAG_file_type ]
 !21 = metadata !{metadata !22}
-!22 = metadata !{i32 786472, metadata !"Eleven", i64 0} ; [ DW_TAG_enumerator ]
-!23 = metadata !{i32 786478, metadata !152, metadata !24, metadata !"foobar_func_block_invoke_0", metadata !"foobar_func_block_invoke_0", metadata !"", i32 609, metadata !25, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null, i32 609} ; [ DW_TAG_subprogram ] [line 609] [local] [def] [foobar_func_block_invoke_0]
-!24 = metadata !{i32 786473, metadata !152} ; [ DW_TAG_file_type ]
-!25 = metadata !{i32 786453, metadata !152, metadata !24, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x28\00Eleven\000"} ; [ DW_TAG_enumerator ]
+!23 = metadata !{metadata !"0x2e\00foobar_func_block_invoke_0\00foobar_func_block_invoke_0\00\00609\001\001\000\006\00256\000\00609", metadata !152, metadata !24, metadata !25, null, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null} ; [ DW_TAG_subprogram ] [line 609] [local] [def] [foobar_func_block_invoke_0]
+!24 = metadata !{metadata !"0x29", metadata !152} ; [ DW_TAG_file_type ]
+!25 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !152, metadata !24, null, metadata !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !26 = metadata !{null}
-!27 = metadata !{i32 786689, metadata !23, metadata !".block_descriptor", metadata !24, i32 16777825, metadata !28, i32 64, null} ; [ DW_TAG_arg_variable ]
-!28 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 0, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ]
-!29 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"__block_literal_14", i32 609, i64 256, i64 32, i32 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_14] [line 609, size 256, align 32, offset 0] [def] [from ]
+!27 = metadata !{metadata !"0x101\00.block_descriptor\0016777825\0064", metadata !23, metadata !24, metadata !28} ; [ DW_TAG_arg_variable ]
+!28 = metadata !{metadata !"0xf\00\000\0032\000\000\000", null, metadata !0, metadata !29} ; [ DW_TAG_pointer_type ]
+!29 = metadata !{metadata !"0x13\00__block_literal_14\00609\00256\0032\000\000\000", metadata !152, metadata !24, null, metadata !30, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_14] [line 609, size 256, align 32, offset 0] [def] [from ]
 !30 = metadata !{metadata !31, metadata !33, metadata !35, metadata !36, metadata !37, metadata !48, metadata !89, metadata !124}
-!31 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__isa", i32 609, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ]
-!32 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!33 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__flags", i32 609, i64 32, i64 32, i64 32, i32 0, metadata !34} ; [ DW_TAG_member ]
-!34 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!35 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__reserved", i32 609, i64 32, i64 32, i64 64, i32 0, metadata !34} ; [ DW_TAG_member ]
-!36 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__FuncPtr", i32 609, i64 32, i64 32, i64 96, i32 0, metadata !32} ; [ DW_TAG_member ]
-!37 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__descriptor", i32 609, i64 32, i64 32, i64 128, i32 0, metadata !38} ; [ DW_TAG_member ]
-!38 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !39} ; [ DW_TAG_pointer_type ]
-!39 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"__block_descriptor_withcopydispose", i32 307, i64 128, i64 32, i32 0, i32 0, null, metadata !41, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 307, size 128, align 32, offset 0] [def] [from ]
-!40 = metadata !{i32 786473, metadata !153} ; [ DW_TAG_file_type ]
+!31 = metadata !{metadata !"0xd\00__isa\00609\0032\0032\000\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!32 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, null} ; [ DW_TAG_pointer_type ]
+!33 = metadata !{metadata !"0xd\00__flags\00609\0032\0032\0032\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
+!34 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!35 = metadata !{metadata !"0xd\00__reserved\00609\0032\0032\0064\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
+!36 = metadata !{metadata !"0xd\00__FuncPtr\00609\0032\0032\0096\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!37 = metadata !{metadata !"0xd\00__descriptor\00609\0032\0032\00128\000", metadata !152, metadata !24, metadata !38} ; [ DW_TAG_member ]
+!38 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !39} ; [ DW_TAG_pointer_type ]
+!39 = metadata !{metadata !"0x13\00__block_descriptor_withcopydispose\00307\00128\0032\000\000\000", metadata !153, metadata !0, null, metadata !41, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 307, size 128, align 32, offset 0] [def] [from ]
+!40 = metadata !{metadata !"0x29", metadata !153} ; [ DW_TAG_file_type ]
 !41 = metadata !{metadata !42, metadata !44, metadata !45, metadata !47}
-!42 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"reserved", i32 307, i64 32, i64 32, i64 0, i32 0, metadata !43} ; [ DW_TAG_member ]
-!43 = metadata !{i32 786468, null, metadata !0, metadata !"long unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!44 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"Size", i32 307, i64 32, i64 32, i64 32, i32 0, metadata !43} ; [ DW_TAG_member ]
-!45 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"CopyFuncPtr", i32 307, i64 32, i64 32, i64 64, i32 0, metadata !46} ; [ DW_TAG_member ]
-!46 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
-!47 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"DestroyFuncPtr", i32 307, i64 32, i64 32, i64 96, i32 0, metadata !46} ; [ DW_TAG_member ]
-!48 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"mydata", i32 609, i64 32, i64 32, i64 160, i32 0, metadata !49} ; [ DW_TAG_member ]
-!49 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 0, i64 0, i32 0, metadata !50} ; [ DW_TAG_pointer_type ]
-!50 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"", i32 0, i64 224, i64 0, i32 0, i32 16, null, metadata !51, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 0, size 224, align 0, offset 0] [def] [from ]
+!42 = metadata !{metadata !"0xd\00reserved\00307\0032\0032\000\000", metadata !153, metadata !40, metadata !43} ; [ DW_TAG_member ]
+!43 = metadata !{metadata !"0x24\00long unsigned int\000\0032\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
+!44 = metadata !{metadata !"0xd\00Size\00307\0032\0032\0032\000", metadata !153, metadata !40, metadata !43} ; [ DW_TAG_member ]
+!45 = metadata !{metadata !"0xd\00CopyFuncPtr\00307\0032\0032\0064\000", metadata !153, metadata !40, metadata !46} ; [ DW_TAG_member ]
+!46 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !32} ; [ DW_TAG_pointer_type ]
+!47 = metadata !{metadata !"0xd\00DestroyFuncPtr\00307\0032\0032\0096\000", metadata !153, metadata !40, metadata !46} ; [ DW_TAG_member ]
+!48 = metadata !{metadata !"0xd\00mydata\00609\0032\0032\00160\000", metadata !152, metadata !24, metadata !49} ; [ DW_TAG_member ]
+!49 = metadata !{metadata !"0xf\00\000\0032\000\000\000", null, metadata !0, metadata !50} ; [ DW_TAG_pointer_type ]
+!50 = metadata !{metadata !"0x13\00\000\00224\000\000\0016\000", metadata !152, metadata !24, null, metadata !51, null, null, null} ; [ DW_TAG_structure_type ] [line 0, size 224, align 0, offset 0] [def] [from ]
 !51 = metadata !{metadata !52, metadata !53, metadata !54, metadata !55, metadata !56, metadata !57, metadata !58}
-!52 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__isa", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ]
-!53 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__forwarding", i32 0, i64 32, i64 32, i64 32, i32 0, metadata !32} ; [ DW_TAG_member ]
-!54 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__flags", i32 0, i64 32, i64 32, i64 64, i32 0, metadata !34} ; [ DW_TAG_member ]
-!55 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__size", i32 0, i64 32, i64 32, i64 96, i32 0, metadata !34} ; [ DW_TAG_member ]
-!56 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__copy_helper", i32 0, i64 32, i64 32, i64 128, i32 0, metadata !32} ; [ DW_TAG_member ]
-!57 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__destroy_helper", i32 0, i64 32, i64 32, i64 160, i32 0, metadata !32} ; [ DW_TAG_member ]
-!58 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"mydata", i32 0, i64 32, i64 32, i64 192, i32 0, metadata !59} ; [ DW_TAG_member ]
-!59 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !60} ; [ DW_TAG_pointer_type ]
-!60 = metadata !{i32 786451, metadata !154, metadata !24, metadata !"UIMydata", i32 26, i64 128, i64 32, i32 0, i32 0, null, metadata !62, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [UIMydata] [line 26, size 128, align 32, offset 0] [def] [from ]
-!61 = metadata !{i32 786473, metadata !154} ; [ DW_TAG_file_type ]
+!52 = metadata !{metadata !"0xd\00__isa\000\0032\0032\000\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!53 = metadata !{metadata !"0xd\00__forwarding\000\0032\0032\0032\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!54 = metadata !{metadata !"0xd\00__flags\000\0032\0032\0064\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
+!55 = metadata !{metadata !"0xd\00__size\000\0032\0032\0096\000", metadata !152, metadata !24, metadata !34} ; [ DW_TAG_member ]
+!56 = metadata !{metadata !"0xd\00__copy_helper\000\0032\0032\00128\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!57 = metadata !{metadata !"0xd\00__destroy_helper\000\0032\0032\00160\000", metadata !152, metadata !24, metadata !32} ; [ DW_TAG_member ]
+!58 = metadata !{metadata !"0xd\00mydata\000\0032\0032\00192\000", metadata !152, metadata !24, metadata !59} ; [ DW_TAG_member ]
+!59 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !60} ; [ DW_TAG_pointer_type ]
+!60 = metadata !{metadata !"0x13\00UIMydata\0026\00128\0032\000\000\0016", metadata !154, metadata !24, null, metadata !62, null, null, null} ; [ DW_TAG_structure_type ] [UIMydata] [line 26, size 128, align 32, offset 0] [def] [from ]
+!61 = metadata !{metadata !"0x29", metadata !154} ; [ DW_TAG_file_type ]
 !62 = metadata !{metadata !63, metadata !71, metadata !75, metadata !79}
-!63 = metadata !{i32 786460, metadata !60, null, metadata !61, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
-!64 = metadata !{i32 786451, metadata !155, metadata !40, metadata !"NSO", i32 66, i64 32, i64 32, i32 0, i32 0, null, metadata !66, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSO] [line 66, size 32, align 32, offset 0] [def] [from ]
-!65 = metadata !{i32 786473, metadata !155} ; [ DW_TAG_file_type ]
+!63 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !60, null, metadata !64} ; [ DW_TAG_inheritance ]
+!64 = metadata !{metadata !"0x13\00NSO\0066\0032\0032\000\000\0016", metadata !155, metadata !40, null, metadata !66, null, null, null} ; [ DW_TAG_structure_type ] [NSO] [line 66, size 32, align 32, offset 0] [def] [from ]
+!65 = metadata !{metadata !"0x29", metadata !155} ; [ DW_TAG_file_type ]
 !66 = metadata !{metadata !67}
-!67 = metadata !{i32 786445, metadata !155, metadata !65, metadata !"isa", i32 67, i64 32, i64 32, i64 0, i32 2, metadata !68, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!68 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"Class", i32 197, i64 0, i64 0, i64 0, i32 0, metadata !69} ; [ DW_TAG_typedef ]
-!69 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !70} ; [ DW_TAG_pointer_type ]
-!70 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!71 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_mydataRef", i32 28, i64 32, i64 32, i64 32, i32 0, metadata !72, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!72 = metadata !{i32 786454, metadata !152, metadata !0, metadata !"CFTypeRef", i32 313, i64 0, i64 0, i64 0, i32 0, metadata !73} ; [ DW_TAG_typedef ]
-!73 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !74} ; [ DW_TAG_pointer_type ]
-!74 = metadata !{i32 786470, null, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_const_type ]
-!75 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_scale", i32 29, i64 32, i64 32, i64 64, i32 0, metadata !76, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!76 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"Float", i32 89, i64 0, i64 0, i64 0, i32 0, metadata !78} ; [ DW_TAG_typedef ]
-!77 = metadata !{i32 786473, metadata !156} ; [ DW_TAG_file_type ]
-!78 = metadata !{i32 786468, null, metadata !0, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!79 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_mydataFlags", i32 37, i64 8, i64 8, i64 96, i32 0, metadata !80, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!80 = metadata !{i32 786451, metadata !154, metadata !0, metadata !"", i32 30, i64 8, i64 8, i32 0, i32 0, null, metadata !81, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 30, size 8, align 8, offset 0] [def] [from ]
+!67 = metadata !{metadata !"0xd\00isa\0067\0032\0032\000\002", metadata !155, metadata !65, metadata !68, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!68 = metadata !{metadata !"0x16\00Class\00197\000\000\000\000", metadata !153, metadata !0, metadata !69} ; [ DW_TAG_typedef ]
+!69 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !70} ; [ DW_TAG_pointer_type ]
+!70 = metadata !{metadata !"0x13\00objc_class\000\000\000\000\004\000", metadata !153, metadata !0, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!71 = metadata !{metadata !"0xd\00_mydataRef\0028\0032\0032\0032\000", metadata !154, metadata !61, metadata !72, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!72 = metadata !{metadata !"0x16\00CFTypeRef\00313\000\000\000\000", metadata !152, metadata !0, metadata !73} ; [ DW_TAG_typedef ]
+!73 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !74} ; [ DW_TAG_pointer_type ]
+!74 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, metadata !0, null} ; [ DW_TAG_const_type ]
+!75 = metadata !{metadata !"0xd\00_scale\0029\0032\0032\0064\000", metadata !154, metadata !61, metadata !76, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!76 = metadata !{metadata !"0x16\00Float\0089\000\000\000\000", metadata !156, metadata !0, metadata !78} ; [ DW_TAG_typedef ]
+!77 = metadata !{metadata !"0x29", metadata !156} ; [ DW_TAG_file_type ]
+!78 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !0} ; [ DW_TAG_base_type ]
+!79 = metadata !{metadata !"0xd\00_mydataFlags\0037\008\008\0096\000", metadata !154, metadata !61, metadata !80, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!80 = metadata !{metadata !"0x13\00\0030\008\008\000\000\000", metadata !154, metadata !0, null, metadata !81, null, null, null} ; [ DW_TAG_structure_type ] [line 30, size 8, align 8, offset 0] [def] [from ]
 !81 = metadata !{metadata !82, metadata !84, metadata !85, metadata !86, metadata !87, metadata !88}
-!82 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"named", i32 31, i64 1, i64 32, i64 0, i32 0, metadata !83} ; [ DW_TAG_member ]
-!83 = metadata !{i32 786468, null, metadata !0, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!84 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"mydataO", i32 32, i64 3, i64 32, i64 1, i32 0, metadata !83} ; [ DW_TAG_member ]
-!85 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"cached", i32 33, i64 1, i64 32, i64 4, i32 0, metadata !83} ; [ DW_TAG_member ]
-!86 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"hasBeenCached", i32 34, i64 1, i64 32, i64 5, i32 0, metadata !83} ; [ DW_TAG_member ]
-!87 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"hasPattern", i32 35, i64 1, i64 32, i64 6, i32 0, metadata !83} ; [ DW_TAG_member ]
-!88 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"isCIMydata", i32 36, i64 1, i64 32, i64 7, i32 0, metadata !83} ; [ DW_TAG_member ]
-!89 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"self", i32 609, i64 32, i64 32, i64 192, i32 0, metadata !90} ; [ DW_TAG_member ]
-!90 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !91} ; [ DW_TAG_pointer_type ]
-!91 = metadata !{i32 786451, metadata !152, metadata !40, metadata !"MyWork", i32 36, i64 384, i64 32, i32 0, i32 0, null, metadata !92, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [MyWork] [line 36, size 384, align 32, offset 0] [def] [from ]
+!82 = metadata !{metadata !"0xd\00named\0031\001\0032\000\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!83 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
+!84 = metadata !{metadata !"0xd\00mydataO\0032\003\0032\001\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!85 = metadata !{metadata !"0xd\00cached\0033\001\0032\004\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!86 = metadata !{metadata !"0xd\00hasBeenCached\0034\001\0032\005\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!87 = metadata !{metadata !"0xd\00hasPattern\0035\001\0032\006\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!88 = metadata !{metadata !"0xd\00isCIMydata\0036\001\0032\007\000", metadata !154, metadata !61, metadata !83} ; [ DW_TAG_member ]
+!89 = metadata !{metadata !"0xd\00self\00609\0032\0032\00192\000", metadata !152, metadata !24, metadata !90} ; [ DW_TAG_member ]
+!90 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !91} ; [ DW_TAG_pointer_type ]
+!91 = metadata !{metadata !"0x13\00MyWork\0036\00384\0032\000\000\0016", metadata !152, metadata !40, null, metadata !92, null, null, null} ; [ DW_TAG_structure_type ] [MyWork] [line 36, size 384, align 32, offset 0] [def] [from ]
 !92 = metadata !{metadata !93, metadata !98, metadata !101, metadata !107, metadata !123}
-!93 = metadata !{i32 786460, metadata !152, metadata !91, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !94} ; [ DW_TAG_inheritance ]
-!94 = metadata !{i32 786451, metadata !157, metadata !40, metadata !"twork", i32 43, i64 32, i64 32, i32 0, i32 0, null, metadata !96, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [twork] [line 43, size 32, align 32, offset 0] [def] [from ]
-!95 = metadata !{i32 786473, metadata !157} ; [ DW_TAG_file_type ]
+!93 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !152, metadata !91, metadata !94} ; [ DW_TAG_inheritance ]
+!94 = metadata !{metadata !"0x13\00twork\0043\0032\0032\000\000\0016", metadata !157, metadata !40, null, metadata !96, null, null, null} ; [ DW_TAG_structure_type ] [twork] [line 43, size 32, align 32, offset 0] [def] [from ]
+!95 = metadata !{metadata !"0x29", metadata !157} ; [ DW_TAG_file_type ]
 !96 = metadata !{metadata !97}
-!97 = metadata !{i32 786460, metadata !94, null, metadata !95, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
-!98 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_itemID", i32 38, i64 64, i64 32, i64 32, i32 1, metadata !99, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!99 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"uint64_t", i32 55, i64 0, i64 0, i64 0, i32 0, metadata !100} ; [ DW_TAG_typedef ]
-!100 = metadata !{i32 786468, null, metadata !0, metadata !"long long unsigned int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!101 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_library", i32 39, i64 32, i64 32, i64 96, i32 1, metadata !102, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!102 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !103} ; [ DW_TAG_pointer_type ]
-!103 = metadata !{i32 786451, metadata !158, metadata !40, metadata !"MyLibrary2", i32 22, i64 32, i64 32, i32 0, i32 0, null, metadata !105, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [MyLibrary2] [line 22, size 32, align 32, offset 0] [def] [from ]
-!104 = metadata !{i32 786473, metadata !158} ; [ DW_TAG_file_type ]
+!97 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !94, null, metadata !64} ; [ DW_TAG_inheritance ]
+!98 = metadata !{metadata !"0xd\00_itemID\0038\0064\0032\0032\001", metadata !152, metadata !24, metadata !99, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!99 = metadata !{metadata !"0x16\00uint64_t\0055\000\000\000\000", metadata !153, metadata !0, metadata !100} ; [ DW_TAG_typedef ]
+!100 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0032\000\000\007", null, metadata !0} ; [ DW_TAG_base_type ]
+!101 = metadata !{metadata !"0xd\00_library\0039\0032\0032\0096\001", metadata !152, metadata !24, metadata !102, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!102 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !103} ; [ DW_TAG_pointer_type ]
+!103 = metadata !{metadata !"0x13\00MyLibrary2\0022\0032\0032\000\000\0016", metadata !158, metadata !40, null, metadata !105, null, null, null} ; [ DW_TAG_structure_type ] [MyLibrary2] [line 22, size 32, align 32, offset 0] [def] [from ]
+!104 = metadata !{metadata !"0x29", metadata !158} ; [ DW_TAG_file_type ]
 !105 = metadata !{metadata !106}
-!106 = metadata !{i32 786460, metadata !103, null, metadata !104, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
-!107 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_bounds", i32 40, i64 128, i64 32, i64 128, i32 1, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!108 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"CR", i32 33, i64 0, i64 0, i64 0, i32 0, metadata !109} ; [ DW_TAG_typedef ]
-!109 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CR", i32 29, i64 128, i64 32, i32 0, i32 0, null, metadata !110, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [CR] [line 29, size 128, align 32, offset 0] [def] [from ]
+!106 = metadata !{metadata !"0x1c\00\000\000\000\000\000", metadata !103, null, metadata !64} ; [ DW_TAG_inheritance ]
+!107 = metadata !{metadata !"0xd\00_bounds\0040\00128\0032\00128\001", metadata !152, metadata !24, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!108 = metadata !{metadata !"0x16\00CR\0033\000\000\000\000", metadata !153, metadata !0, metadata !109} ; [ DW_TAG_typedef ]
+!109 = metadata !{metadata !"0x13\00CR\0029\00128\0032\000\000\000", metadata !156, metadata !0, null, metadata !110, null, null, null} ; [ DW_TAG_structure_type ] [CR] [line 29, size 128, align 32, offset 0] [def] [from ]
 !110 = metadata !{metadata !111, metadata !117}
-!111 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"origin", i32 30, i64 64, i64 32, i64 0, i32 0, metadata !112} ; [ DW_TAG_member ]
-!112 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"CP", i32 17, i64 0, i64 0, i64 0, i32 0, metadata !113} ; [ DW_TAG_typedef ]
-!113 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CP", i32 13, i64 64, i64 32, i32 0, i32 0, null, metadata !114, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [CP] [line 13, size 64, align 32, offset 0] [def] [from ]
+!111 = metadata !{metadata !"0xd\00origin\0030\0064\0032\000\000", metadata !156, metadata !77, metadata !112} ; [ DW_TAG_member ]
+!112 = metadata !{metadata !"0x16\00CP\0017\000\000\000\000", metadata !156, metadata !0, metadata !113} ; [ DW_TAG_typedef ]
+!113 = metadata !{metadata !"0x13\00CP\0013\0064\0032\000\000\000", metadata !156, metadata !0, null, metadata !114, null, null, null} ; [ DW_TAG_structure_type ] [CP] [line 13, size 64, align 32, offset 0] [def] [from ]
 !114 = metadata !{metadata !115, metadata !116}
-!115 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"x", i32 14, i64 32, i64 32, i64 0, i32 0, metadata !76} ; [ DW_TAG_member ]
-!116 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"y", i32 15, i64 32, i64 32, i64 32, i32 0, metadata !76} ; [ DW_TAG_member ]
-!117 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"size", i32 31, i64 64, i64 32, i64 64, i32 0, metadata !118} ; [ DW_TAG_member ]
-!118 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"Size", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !119} ; [ DW_TAG_typedef ]
-!119 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"Size", i32 21, i64 64, i64 32, i32 0, i32 0, null, metadata !120, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Size] [line 21, size 64, align 32, offset 0] [def] [from ]
+!115 = metadata !{metadata !"0xd\00x\0014\0032\0032\000\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
+!116 = metadata !{metadata !"0xd\00y\0015\0032\0032\0032\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
+!117 = metadata !{metadata !"0xd\00size\0031\0064\0032\0064\000", metadata !156, metadata !77, metadata !118} ; [ DW_TAG_member ]
+!118 = metadata !{metadata !"0x16\00Size\0025\000\000\000\000", metadata !156, metadata !0, metadata !119} ; [ DW_TAG_typedef ]
+!119 = metadata !{metadata !"0x13\00Size\0021\0064\0032\000\000\000", metadata !156, metadata !0, null, metadata !120, null, null, null} ; [ DW_TAG_structure_type ] [Size] [line 21, size 64, align 32, offset 0] [def] [from ]
 !120 = metadata !{metadata !121, metadata !122}
-!121 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"width", i32 22, i64 32, i64 32, i64 0, i32 0, metadata !76} ; [ DW_TAG_member ]
-!122 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"height", i32 23, i64 32, i64 32, i64 32, i32 0, metadata !76} ; [ DW_TAG_member ]
-!123 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_data", i32 40, i64 128, i64 32, i64 256, i32 1, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!124 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"semi", i32 609, i64 32, i64 32, i64 224, i32 0, metadata !125} ; [ DW_TAG_member ]
-!125 = metadata !{i32 786454, metadata !152, metadata !0, metadata !"d_t", i32 35, i64 0, i64 0, i64 0, i32 0, metadata !126} ; [ DW_TAG_typedef ]
-!126 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !127} ; [ DW_TAG_pointer_type ]
-!127 = metadata !{i32 786451, metadata !159, metadata !0, metadata !"my_struct", i32 49, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [my_struct] [line 49, size 0, align 0, offset 0] [decl] [from ]
-!128 = metadata !{i32 786473, metadata !159} ; [ DW_TAG_file_type ]
+!121 = metadata !{metadata !"0xd\00width\0022\0032\0032\000\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
+!122 = metadata !{metadata !"0xd\00height\0023\0032\0032\0032\000", metadata !156, metadata !77, metadata !76} ; [ DW_TAG_member ]
+!123 = metadata !{metadata !"0xd\00_data\0040\00128\0032\00256\001", metadata !152, metadata !24, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
+!124 = metadata !{metadata !"0xd\00semi\00609\0032\0032\00224\000", metadata !152, metadata !24, metadata !125} ; [ DW_TAG_member ]
+!125 = metadata !{metadata !"0x16\00d_t\0035\000\000\000\000", metadata !152, metadata !0, metadata !126} ; [ DW_TAG_typedef ]
+!126 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !0, metadata !127} ; [ DW_TAG_pointer_type ]
+!127 = metadata !{metadata !"0x13\00my_struct\0049\000\000\000\004\000", metadata !159, metadata !0, null, null, null, null, null} ; [ DW_TAG_structure_type ] [my_struct] [line 49, size 0, align 0, offset 0] [decl] [from ]
+!128 = metadata !{metadata !"0x29", metadata !159} ; [ DW_TAG_file_type ]
 !129 = metadata !{i32 609, i32 144, metadata !23, null}
-!130 = metadata !{i32 786689, metadata !23, metadata !"loadedMydata", metadata !24, i32 33555041, metadata !59, i32 0, null} ; [ DW_TAG_arg_variable ]
+!130 = metadata !{metadata !"0x101\00loadedMydata\0033555041\000", metadata !23, metadata !24, metadata !59} ; [ DW_TAG_arg_variable ]
 !131 = metadata !{i32 609, i32 155, metadata !23, null}
-!132 = metadata !{i32 786689, metadata !23, metadata !"bounds", metadata !24, i32 50332257, metadata !108, i32 0, null} ; [ DW_TAG_arg_variable ]
+!132 = metadata !{metadata !"0x101\00bounds\0050332257\000", metadata !23, metadata !24, metadata !108} ; [ DW_TAG_arg_variable ]
 !133 = metadata !{i32 609, i32 175, metadata !23, null}
-!134 = metadata !{i32 786689, metadata !23, metadata !"data", metadata !24, i32 67109473, metadata !108, i32 0, null} ; [ DW_TAG_arg_variable ]
+!134 = metadata !{metadata !"0x101\00data\0067109473\000", metadata !23, metadata !24, metadata !108} ; [ DW_TAG_arg_variable ]
 !135 = metadata !{i32 609, i32 190, metadata !23, null}
-!136 = metadata !{i32 786688, metadata !23, metadata !"mydata", metadata !24, i32 604, metadata !50, i32 0, null, metadata !163} ; [ DW_TAG_auto_variable ]
+!136 = metadata !{metadata !"0x100\00mydata\00604\000", metadata !23, metadata !24, metadata !50} ; [ DW_TAG_auto_variable ]
 !137 = metadata !{i32 604, i32 49, metadata !23, null}
-!138 = metadata !{i32 786688, metadata !23, metadata !"self", metadata !40, i32 604, metadata !90, i32 0, null, metadata !164} ; [ DW_TAG_auto_variable ]
-!139 = metadata !{i32 786688, metadata !23, metadata !"semi", metadata !24, i32 607, metadata !125, i32 0, null, metadata !165} ; [ DW_TAG_auto_variable ]
+!138 = metadata !{metadata !"0x100\00self\00604\000", metadata !23, metadata !40, metadata !90} ; [ DW_TAG_auto_variable ]
+!139 = metadata !{metadata !"0x100\00semi\00607\000", metadata !23, metadata !24, metadata !125} ; [ DW_TAG_auto_variable ]
 !140 = metadata !{i32 607, i32 30, metadata !23, null}
 !141 = metadata !{i32 610, i32 17, metadata !142, null}
-!142 = metadata !{i32 786443, metadata !152, metadata !23, i32 609, i32 200, i32 94} ; [ DW_TAG_lexical_block ]
+!142 = metadata !{metadata !"0xb\00609\00200\0094", metadata !152, metadata !23} ; [ DW_TAG_lexical_block ]
 !143 = metadata !{i32 611, i32 17, metadata !142, null}
 !144 = metadata !{i32 612, i32 17, metadata !142, null}
 !145 = metadata !{i32 613, i32 17, metadata !142, null}
@@ -257,7 +257,7 @@
 !159 = metadata !{metadata !"header15.h", metadata !"/Volumes/Sandbox/llvm"}
 !160 = metadata !{metadata !"header.h", metadata !"/Volumes/Sandbox/llvm"}
 !161 = metadata !{metadata !"header2.h", metadata !"/Volumes/Sandbox/llvm"}
-!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!163 = metadata !{i64 1, i64 20, i64 2, i64 1, i64 4, i64 2, i64 1, i64 24}
-!164 = metadata !{i64 1, i64 24}
-!165 = metadata !{i64 1, i64 28}
+!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!163 = metadata !{metadata !"0x102\0034\0020\006\0034\004\006\0034\0024"} ; [ DW_TAG_expression ] [DW_OP_plus 20 DW_OP_deref DW_OP_plus 4 DW_OP_deref DW_OP_plus 24]
+!164 = metadata !{metadata !"0x102\0034\0024"} ; [ DW_TAG_expression ] [DW_OP_plus 24]
+!165 = metadata !{metadata !"0x102\0034\0028"} ; [ DW_TAG_expression ] [DW_OP_plus 28]

diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index 8505f53..db96b49 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll

@@ -3,6 +3,7 @@
 target triple = "thumbv7-apple-macosx10.6.7"
 
 ;CHECK: 	vadd.f32	q4, q8, q8
+;CHECK-NEXT: Ltmp1
 ;CHECK-NEXT: LBB0_1
 
 ;CHECK:@DEBUG_VALUE: x <- Q4{{$}}
@@ -19,9 +20,9 @@
 
 for.body9:                                        ; preds = %for.body9, %entry
   %add19 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !39
   %add20 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{<4 x float> %add20}, i64 0, metadata !28), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add20}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !39
   br i1 %cond, label %for.end54, label %for.body9, !dbg !44
 
 for.end54:                                        ; preds = %for.body9
@@ -36,58 +37,59 @@
 
 declare i32 @printf(i8* nocapture, ...) nounwind
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.module.flags = !{!56}
+!llvm.dbg.cu = !{!2}
 
-!0 = metadata !{i32 786478, metadata !54, null, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !54} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !54, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !50, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!0 = metadata !{metadata !"0x2e\00test0001\00test0001\00\003\000\001\000\006\00256\001\000", metadata !54, null, metadata !3, i32 0, <4 x float> (float)* @test0001, null, null, metadata !51} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !54} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", metadata !54, metadata !17, metadata !17, metadata !50, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, i32 0, metadata !4, i32 0} ; [ DW_TAG_subroutine_type ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786454, metadata !54, metadata !2, metadata !"v4f32", i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786433, metadata !54, metadata !2, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
-!7 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x16\00v4f32\0014\000\000\000\000", metadata !54, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
+!6 = metadata !{metadata !"0x1\00\000\00128\00128\000\000", metadata !54, metadata !2, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
+!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786465, i64 0, i64 4}         ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 786478, metadata !54, null, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**, i1)* @main, null, null, metadata !52, i32 0} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 0] [main]
-!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x21\000\004"}         ; [ DW_TAG_subrange_type ]
+!10 = metadata !{metadata !"0x2e\00main\00main\00\0059\000\001\000\006\00256\001\000", metadata !54, null, metadata !11, null, i32 (i32, i8**, i1)* @main, null, null, metadata !52} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 0] [main]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786478, metadata !55, null, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !53, i32 0} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [scope 0] [printFV]
-!15 = metadata !{i32 786473, metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x2e\00printFV\00printFV\00\0041\001\001\000\006\00256\001\000", metadata !55, null, metadata !16, null, null, null, null, metadata !53} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [scope 0] [printFV]
+!15 = metadata !{metadata !"0x29", metadata !55} ; [ DW_TAG_file_type ]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !55, metadata !15, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null}
-!18 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 786689, metadata !10, metadata !"argv", metadata !1, i32 33554491, metadata !21, i32 0, null} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786468, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!24 = metadata !{i32 786688, metadata !25, metadata !"i", metadata !1, i32 60, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
-!25 = metadata !{i32 786443, metadata !1, metadata !10, i32 59, i32 33, i32 14} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{i32 786688, metadata !25, metadata !"j", metadata !1, i32 60, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{i32 786688, metadata !25, metadata !"x", metadata !1, i32 61, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!28 = metadata !{i32 786688, metadata !25, metadata !"y", metadata !1, i32 62, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786688, metadata !25, metadata !"z", metadata !1, i32 63, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!30 = metadata !{i32 786689, metadata !14, metadata !"F", metadata !15, i32 16777257, metadata !31, i32 0, null} ; [ DW_TAG_arg_variable ]
-!31 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
-!32 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"FV", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_typedef ]
-!33 = metadata !{i32 786455, metadata !55, metadata !2, metadata !"", i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, i32 0} ; [ DW_TAG_union_type ]
+!18 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x101\00argc\0016777275\000", metadata !10, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{metadata !"0x101\00argv\0033554491\000", metadata !10, metadata !1, metadata !21} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
+!24 = metadata !{metadata !"0x100\00i\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{metadata !"0xb\0059\0033\0014", metadata !1, metadata !10} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{metadata !"0x100\00j\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{metadata !"0x100\00x\0061\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!28 = metadata !{metadata !"0x100\00y\0062\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{metadata !"0x100\00z\0063\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!30 = metadata !{metadata !"0x101\00F\0016777257\000", metadata !14, metadata !15, metadata !31} ; [ DW_TAG_arg_variable ]
+!31 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{metadata !"0x16\00FV\0025\000\000\000\000", metadata !55, metadata !2, metadata !33} ; [ DW_TAG_typedef ]
+!33 = metadata !{metadata !"0x17\00\0022\00128\00128\000\000\000", metadata !55, metadata !2, i32 0, metadata !34, null} ; [ DW_TAG_union_type ]
 !34 = metadata !{metadata !35, metadata !37}
-!35 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"V", i32 23, i64 128, i64 128, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ]
-!36 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"v4sf", i32 3, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!37 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"A", i32 24, i64 128, i64 32, i64 0, i32 0, metadata !38} ; [ DW_TAG_member ]
-!38 = metadata !{i32 786433, null, metadata !2, metadata !"", i32 0, i64 128, i64 32, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!35 = metadata !{metadata !"0xd\00V\0023\00128\00128\000\000", metadata !55, metadata !15, metadata !36} ; [ DW_TAG_member ]
+!36 = metadata !{metadata !"0x16\00v4sf\003\000\000\000\000", metadata !55, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
+!37 = metadata !{metadata !"0xd\00A\0024\00128\0032\000\000", metadata !55, metadata !15, metadata !38} ; [ DW_TAG_member ]
+!38 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, metadata !2, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !39 = metadata !{i32 79, i32 7, metadata !40, null}
-!40 = metadata !{i32 786443, metadata !1, metadata !41, i32 75, i32 35, i32 18} ; [ DW_TAG_lexical_block ]
-!41 = metadata !{i32 786443, metadata !1, metadata !42, i32 75, i32 5, i32 17} ; [ DW_TAG_lexical_block ]
-!42 = metadata !{i32 786443, metadata !1, metadata !43, i32 71, i32 32, i32 16} ; [ DW_TAG_lexical_block ]
-!43 = metadata !{i32 786443, metadata !1, metadata !25, i32 71, i32 3, i32 15} ; [ DW_TAG_lexical_block ]
+!40 = metadata !{metadata !"0xb\0075\0035\0018", metadata !1, metadata !41} ; [ DW_TAG_lexical_block ]
+!41 = metadata !{metadata !"0xb\0075\005\0017", metadata !1, metadata !42} ; [ DW_TAG_lexical_block ]
+!42 = metadata !{metadata !"0xb\0071\0032\0016", metadata !1, metadata !43} ; [ DW_TAG_lexical_block ]
+!43 = metadata !{metadata !"0xb\0071\003\0015", metadata !1, metadata !25} ; [ DW_TAG_lexical_block ]
 !44 = metadata !{i32 75, i32 5, metadata !42, null}
 !45 = metadata !{i32 42, i32 2, metadata !46, metadata !48}
-!46 = metadata !{i32 786443, metadata !15, metadata !47, i32 42, i32 2, i32 20} ; [ DW_TAG_lexical_block ]
-!47 = metadata !{i32 786443, metadata !15, metadata !14, i32 41, i32 28, i32 19} ; [ DW_TAG_lexical_block ]
+!46 = metadata !{metadata !"0xb\0042\002\0020", metadata !15, metadata !47} ; [ DW_TAG_lexical_block ]
+!47 = metadata !{metadata !"0xb\0041\0028\0019", metadata !15, metadata !14} ; [ DW_TAG_lexical_block ]
 !48 = metadata !{i32 95, i32 3, metadata !25, null}
 !49 = metadata !{i32 99, i32 3, metadata !25, null}
 !50 = metadata !{metadata !0, metadata !10, metadata !14}
@@ -96,4 +98,4 @@
 !53 = metadata !{metadata !30}
 !54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
 !55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
-!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/debug-info-d16-reg.ll b/test/CodeGen/ARM/debug-info-d16-reg.ll
index 30a3e2d..9791987 100644
--- a/test/CodeGen/ARM/debug-info-d16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-d16-reg.ll

@@ -12,9 +12,9 @@
 
 define i32 @inlineprinter(i8* %ptr, double %val, i8 zeroext %c) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !19), !dbg !26
-  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !20), !dbg !26
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !21), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !26
   %0 = zext i8 %c to i32, !dbg !27
   %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !27
   ret i32 0, !dbg !29
@@ -22,9 +22,9 @@
 
 define i32 @printer(i8* %ptr, double %val, i8 zeroext %c) nounwind optsize noinline {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !16), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !17), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !18), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{double %val}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !30
   %0 = zext i8 %c to i32, !dbg !31
   %1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %val, i32 %0) nounwind, !dbg !31
   ret i32 0, !dbg !33
@@ -32,22 +32,22 @@
 
 declare i32 @printf(i8* nocapture, ...) nounwind
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !22), !dbg !34
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !23), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !34
   %0 = sitofp i32 %argc to double, !dbg !35
   %1 = fadd double %0, 5.555552e+05, !dbg !35
-  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !24), !dbg !35
+  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !35
   %2 = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0)) nounwind, !dbg !36
   %3 = getelementptr inbounds i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !37
   %4 = trunc i32 %argc to i8, !dbg !37
   %5 = add i8 %4, 97, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{i8* %3}, i64 0, metadata !19) nounwind, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !20) nounwind, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i8 %5}, i64 0, metadata !21) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i8* %3}, i64 0, metadata !19, metadata !{metadata !"0x102"}) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{double %1}, i64 0, metadata !20, metadata !{metadata !"0x102"}) nounwind, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i8 %5}, i64 0, metadata !21, metadata !{metadata !"0x102"}) nounwind, !dbg !38
   %6 = zext i8 %5 to i32, !dbg !39
   %7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %3, double %1, i32 %6) nounwind, !dbg !39
   %8 = tail call i32 @printer(i8* %3, double %1, i8 zeroext %5) nounwind, !dbg !40
@@ -59,39 +59,39 @@
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!48}
 
-!0 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"printer", metadata !"printer", metadata !"printer", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @printer, null, null, metadata !43, i32 12} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !46, i32 1, metadata !"(LLVM build 00)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !42, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00printer\00printer\00printer\0012\000\001\000\006\00256\001\0012", metadata !46, metadata !1, metadata !3, null, i32 (i8*, double, i8)* @printer, null, null, metadata !43} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !46} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\00(LLVM build 00)\001\00\000\00\001", metadata !46, metadata !47, metadata !47, metadata !42, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !46, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8}
-!5 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"double", i32 0, i64 64, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"inlineprinter", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @inlineprinter, null, null, metadata !44, i32 5} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 18, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !45, i32 18} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !46, metadata !1, null} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0x24\00double\000\0064\0032\000\000\004", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x2e\00inlineprinter\00inlineprinter\00inlineprinter\005\000\001\000\006\00256\001\005", metadata !46, metadata !1, metadata !3, null, i32 (i8*, double, i8)* @inlineprinter, null, null, metadata !44} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x2e\00main\00main\00main\0018\000\001\000\006\00256\001\0018", metadata !46, metadata !1, metadata !11, null, i32 (i32, i8**)* @main, null, null, metadata !45} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !46, metadata !1, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !5, metadata !5, metadata !13}
-!13 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ]
-!15 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!16 = metadata !{i32 786689, metadata !0, metadata !"ptr", metadata !1, i32 11, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786689, metadata !0, metadata !"val", metadata !1, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786689, metadata !0, metadata !"c", metadata !1, i32 11, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786689, metadata !9, metadata !"ptr", metadata !1, i32 4, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 786689, metadata !9, metadata !"val", metadata !1, i32 4, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{i32 786689, metadata !9, metadata !"c", metadata !1, i32 4, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 17, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{i32 786689, metadata !10, metadata !"argv", metadata !1, i32 17, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!24 = metadata !{i32 786688, metadata !25, metadata !"dval", metadata !1, i32 19, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
-!25 = metadata !{i32 786443, metadata !46, metadata !10, i32 18, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
+!13 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !46, metadata !1, metadata !14} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !46, metadata !1, metadata !15} ; [ DW_TAG_pointer_type ]
+!15 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !46, metadata !1} ; [ DW_TAG_base_type ]
+!16 = metadata !{metadata !"0x101\00ptr\0011\000", metadata !0, metadata !1, metadata !6} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x101\00val\0011\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x101\00c\0011\000", metadata !0, metadata !1, metadata !8} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x101\00ptr\004\000", metadata !9, metadata !1, metadata !6} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{metadata !"0x101\00val\004\000", metadata !9, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{metadata !"0x101\00c\004\000", metadata !9, metadata !1, metadata !8} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{metadata !"0x101\00argc\0017\000", metadata !10, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{metadata !"0x101\00argv\0017\000", metadata !10, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!24 = metadata !{metadata !"0x100\00dval\0019\000", metadata !25, metadata !1, metadata !7} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{metadata !"0xb\0018\000\002", metadata !46, metadata !10} ; [ DW_TAG_lexical_block ]
 !26 = metadata !{i32 4, i32 0, metadata !9, null}
 !27 = metadata !{i32 6, i32 0, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !46, metadata !9, i32 5, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0xb\005\000\001", metadata !46, metadata !9} ; [ DW_TAG_lexical_block ]
 !29 = metadata !{i32 7, i32 0, metadata !28, null}
 !30 = metadata !{i32 11, i32 0, metadata !0, null}
 !31 = metadata !{i32 13, i32 0, metadata !32, null}
-!32 = metadata !{i32 786443, metadata !46, metadata !0, i32 12, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!32 = metadata !{metadata !"0xb\0012\000\000", metadata !46, metadata !0} ; [ DW_TAG_lexical_block ]
 !33 = metadata !{i32 14, i32 0, metadata !32, null}
 !34 = metadata !{i32 17, i32 0, metadata !10, null}
 !35 = metadata !{i32 19, i32 0, metadata !25, null}
@@ -107,4 +107,4 @@
 !45 = metadata !{metadata !22, metadata !23, metadata !24}
 !46 = metadata !{metadata !"a.c", metadata !"/tmp/"}
 !47 = metadata !{i32 0}
-!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index 03ce312..cfcefb8 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll

@@ -26,7 +26,7 @@
   br i1 undef, label %for.end54, label %for.body9, !dbg !44
 
 for.end54:                                        ; preds = %for.body9
-  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !39
   %tmp115 = extractelement <4 x float> %add19, i32 1
   %conv6.i75 = fpext float %tmp115 to double, !dbg !45
   %call.i82 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
@@ -35,59 +35,59 @@
 
 declare i32 @printf(i8* nocapture, ...) nounwind
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!56}
 
-!0 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [test0001]
-!1 = metadata !{i32 786473, metadata !54} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !54, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !50, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00test0001\00test0001\00\003\000\001\000\006\00256\001\003", metadata !54, metadata !1, metadata !3, null, <4 x float> (float)* @test0001, null, null, metadata !51} ; [ DW_TAG_subprogram ] [line 3] [def] [test0001]
+!1 = metadata !{metadata !"0x29", metadata !54} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", metadata !54, metadata !17, metadata !17, metadata !50, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786454, metadata !54, metadata !2, metadata !"v4f32", i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786433, metadata !2, null, metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
-!7 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x16\00v4f32\0014\000\000\000\000", metadata !54, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
+!6 = metadata !{metadata !"0x1\00\000\00128\00128\000\000", metadata !2, null, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
+!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786465, i64 0, i64 4}         ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !52, i32 59} ; [ DW_TAG_subprogram ] [line 59] [def] [main]
-!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x21\000\004"}         ; [ DW_TAG_subrange_type ]
+!10 = metadata !{metadata !"0x2e\00main\00main\00\0059\000\001\000\006\00256\001\0059", metadata !54, metadata !1, metadata !11, null, i32 (i32, i8**)* @main, null, null, metadata !52} ; [ DW_TAG_subprogram ] [line 59] [def] [main]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !54, metadata !1, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786478, metadata !55, metadata !15, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !53, i32 41} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [printFV]
-!15 = metadata !{i32 786473, metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x2e\00printFV\00printFV\00\0041\001\001\000\006\00256\001\0041", metadata !55, metadata !15, metadata !16, null, null, null, null, metadata !53} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [printFV]
+!15 = metadata !{metadata !"0x29", metadata !55} ; [ DW_TAG_file_type ]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !55, metadata !15, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null}
-!18 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 786689, metadata !10, metadata !"argv", metadata !1, i32 33554491, metadata !21, i32 0, null} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786468, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!24 = metadata !{i32 786688, metadata !25, metadata !"i", metadata !1, i32 60, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
-!25 = metadata !{i32 786443, metadata !54, metadata !10, i32 59, i32 33, i32 14} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{i32 786688, metadata !25, metadata !"j", metadata !1, i32 60, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{i32 786688, metadata !25, metadata !"x", metadata !1, i32 61, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!28 = metadata !{i32 786688, metadata !25, metadata !"y", metadata !1, i32 62, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786688, metadata !25, metadata !"z", metadata !1, i32 63, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!30 = metadata !{i32 786689, metadata !14, metadata !"F", metadata !15, i32 16777257, metadata !31, i32 0, null} ; [ DW_TAG_arg_variable ]
-!31 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
-!32 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"FV", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_typedef ]
-!33 = metadata !{i32 786455, metadata !55, metadata !2, metadata !"", i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, null} ; [ DW_TAG_union_type ]
+!18 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x101\00argc\0016777275\000", metadata !10, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{metadata !"0x101\00argv\0033554491\000", metadata !10, metadata !1, metadata !21} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
+!24 = metadata !{metadata !"0x100\00i\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{metadata !"0xb\0059\0033\0014", metadata !54, metadata !10} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{metadata !"0x100\00j\0060\000", metadata !25, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{metadata !"0x100\00x\0061\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!28 = metadata !{metadata !"0x100\00y\0062\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{metadata !"0x100\00z\0063\000", metadata !25, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!30 = metadata !{metadata !"0x101\00F\0016777257\000", metadata !14, metadata !15, metadata !31} ; [ DW_TAG_arg_variable ]
+!31 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{metadata !"0x16\00FV\0025\000\000\000\000", metadata !55, metadata !2, metadata !33} ; [ DW_TAG_typedef ]
+!33 = metadata !{metadata !"0x17\00\0022\00128\00128\000\000\000", metadata !55, metadata !2, i32 0, metadata !34, null} ; [ DW_TAG_union_type ]
 !34 = metadata !{metadata !35, metadata !37}
-!35 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"V", i32 23, i64 128, i64 128, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ]
-!36 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"v4sf", i32 3, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!37 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"A", i32 24, i64 128, i64 32, i64 0, i32 0, metadata !38} ; [ DW_TAG_member ]
-!38 = metadata !{i32 786433, null, metadata !2, metadata !"", i32 0, i64 128, i64 32, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!35 = metadata !{metadata !"0xd\00V\0023\00128\00128\000\000", metadata !55, metadata !15, metadata !36} ; [ DW_TAG_member ]
+!36 = metadata !{metadata !"0x16\00v4sf\003\000\000\000\000", metadata !55, metadata !2, metadata !6} ; [ DW_TAG_typedef ]
+!37 = metadata !{metadata !"0xd\00A\0024\00128\0032\000\000", metadata !55, metadata !15, metadata !38} ; [ DW_TAG_member ]
+!38 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, metadata !2, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
 !39 = metadata !{i32 79, i32 7, metadata !40, null}
-!40 = metadata !{i32 786443, metadata !54, metadata !41, i32 75, i32 35, i32 18} ; [ DW_TAG_lexical_block ]
-!41 = metadata !{i32 786443, metadata !54, metadata !42, i32 75, i32 5, i32 17} ; [ DW_TAG_lexical_block ]
-!42 = metadata !{i32 786443, metadata !54, metadata !43, i32 71, i32 32, i32 16} ; [ DW_TAG_lexical_block ]
-!43 = metadata !{i32 786443, metadata !54, metadata !25, i32 71, i32 3, i32 15} ; [ DW_TAG_lexical_block ]
+!40 = metadata !{metadata !"0xb\0075\0035\0018", metadata !54, metadata !41} ; [ DW_TAG_lexical_block ]
+!41 = metadata !{metadata !"0xb\0075\005\0017", metadata !54, metadata !42} ; [ DW_TAG_lexical_block ]
+!42 = metadata !{metadata !"0xb\0071\0032\0016", metadata !54, metadata !43} ; [ DW_TAG_lexical_block ]
+!43 = metadata !{metadata !"0xb\0071\003\0015", metadata !54, metadata !25} ; [ DW_TAG_lexical_block ]
 !44 = metadata !{i32 75, i32 5, metadata !42, null}
 !45 = metadata !{i32 42, i32 2, metadata !46, metadata !48}
-!46 = metadata !{i32 786443, metadata !55, metadata !47, i32 42, i32 2, i32 20} ; [ DW_TAG_lexical_block ]
-!47 = metadata !{i32 786443, metadata !55, metadata !14, i32 41, i32 28, i32 19} ; [ DW_TAG_lexical_block ]
+!46 = metadata !{metadata !"0xb\0042\002\0020", metadata !55, metadata !47} ; [ DW_TAG_lexical_block ]
+!47 = metadata !{metadata !"0xb\0041\0028\0019", metadata !55, metadata !14} ; [ DW_TAG_lexical_block ]
 !48 = metadata !{i32 95, i32 3, metadata !25, null}
 !49 = metadata !{i32 99, i32 3, metadata !25, null}
 !50 = metadata !{metadata !0, metadata !10, metadata !14}
@@ -96,4 +96,4 @@
 !53 = metadata !{metadata !30}
 !54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
 !55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
-!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
index ee9faf8..6bd7172 100644
--- a/test/CodeGen/ARM/debug-info-s16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll

@@ -15,9 +15,9 @@
 
 define i32 @inlineprinter(i8* %ptr, float %val, i8 zeroext %c) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !8), !dbg !24
-  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !10), !dbg !25
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !12), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !24
+  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !26
   %conv = fpext float %val to double, !dbg !27
   %conv3 = zext i8 %c to i32, !dbg !27
   %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !27
@@ -28,9 +28,9 @@
 
 define i32 @printer(i8* %ptr, float %val, i8 zeroext %c) nounwind optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !14), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !15), !dbg !31
-  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !16), !dbg !32
+  tail call void @llvm.dbg.value(metadata !{i8* %ptr}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{float %val}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !31
+  tail call void @llvm.dbg.value(metadata !{i8 %c}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !32
   %conv = fpext float %val to double, !dbg !33
   %conv3 = zext i8 %c to i32, !dbg !33
   %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %ptr, double %conv, i32 %conv3) nounwind optsize, !dbg !33
@@ -39,19 +39,19 @@
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !17), !dbg !36
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !18), !dbg !37
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !37
   %conv = sitofp i32 %argc to double, !dbg !38
   %add = fadd double %conv, 5.555552e+05, !dbg !38
   %conv1 = fptrunc double %add to float, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !22), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !38
   %call = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0)) nounwind optsize, !dbg !39
   %add.ptr = getelementptr i8* bitcast (i32 (i32, i8**)* @main to i8*), i32 %argc, !dbg !40
   %add5 = add nsw i32 %argc, 97, !dbg !40
   %conv6 = trunc i32 %add5 to i8, !dbg !40
-  tail call void @llvm.dbg.value(metadata !{i8* %add.ptr}, i64 0, metadata !8) nounwind, !dbg !41
-  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !10) nounwind, !dbg !42
-  tail call void @llvm.dbg.value(metadata !{i8 %conv6}, i64 0, metadata !12) nounwind, !dbg !43
+  tail call void @llvm.dbg.value(metadata !{i8* %add.ptr}, i64 0, metadata !8, metadata !{metadata !"0x102"}) nounwind, !dbg !41
+  tail call void @llvm.dbg.value(metadata !{float %conv1}, i64 0, metadata !10, metadata !{metadata !"0x102"}) nounwind, !dbg !42
+  tail call void @llvm.dbg.value(metadata !{i8 %conv6}, i64 0, metadata !12, metadata !{metadata !"0x102"}) nounwind, !dbg !43
   %conv.i = fpext float %conv1 to double, !dbg !44
   %conv3.i = and i32 %add5, 255, !dbg !44
   %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i8* %add.ptr, double %conv.i, i32 %conv3.i) nounwind optsize, !dbg !44
@@ -61,46 +61,46 @@
 
 declare i32 @puts(i8* nocapture) nounwind optsize
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!53}
 
-!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, float, i8)* @inlineprinter, null, null, metadata !48, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [inlineprinter]
-!1 = metadata !{i32 786473, metadata !51} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !51, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !52, metadata !52, metadata !47, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !51, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00inlineprinter\00inlineprinter\00\005\000\001\000\006\00256\001\005", metadata !51, metadata !1, metadata !3, null, i32 (i8*, float, i8)* @inlineprinter, null, null, metadata !48} ; [ DW_TAG_subprogram ] [line 5] [def] [inlineprinter]
+!1 = metadata !{metadata !"0x29", metadata !51} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129915)\001\00\000\00\001", metadata !51, metadata !52, metadata !52, metadata !47, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"printer", metadata !"printer", metadata !"", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, float, i8)* @printer, null, null, metadata !49, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [printer]
-!7 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !50, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
-!8 = metadata !{i32 786689, metadata !0, metadata !"ptr", metadata !1, i32 16777220, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 786689, metadata !0, metadata !"val", metadata !1, i32 33554436, metadata !11, i32 0, null} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!12 = metadata !{i32 786689, metadata !0, metadata !"c", metadata !1, i32 50331652, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786689, metadata !6, metadata !"ptr", metadata !1, i32 16777227, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 786689, metadata !6, metadata !"val", metadata !1, i32 33554443, metadata !11, i32 0, null} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 786689, metadata !6, metadata !"c", metadata !1, i32 50331659, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786689, metadata !7, metadata !"argc", metadata !1, i32 16777233, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786689, metadata !7, metadata !"argv", metadata !1, i32 33554449, metadata !19, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ]
-!20 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !21} ; [ DW_TAG_pointer_type ]
-!21 = metadata !{i32 786468, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!22 = metadata !{i32 786688, metadata !23, metadata !"dval", metadata !1, i32 19, metadata !11, i32 0, null} ; [ DW_TAG_auto_variable ]
-!23 = metadata !{i32 786443, metadata !51, metadata !7, i32 18, i32 1, i32 2} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00printer\00printer\00\0012\000\001\000\006\00256\001\0012", metadata !51, metadata !1, metadata !3, null, i32 (i8*, float, i8)* @printer, null, null, metadata !49} ; [ DW_TAG_subprogram ] [line 12] [def] [printer]
+!7 = metadata !{metadata !"0x2e\00main\00main\00\0018\000\001\000\006\00256\001\0018", metadata !51, metadata !1, metadata !3, null, i32 (i32, i8**)* @main, null, null, metadata !50} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
+!8 = metadata !{metadata !"0x101\00ptr\0016777220\000", metadata !0, metadata !1, metadata !9} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, null} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x101\00val\0033554436\000", metadata !0, metadata !1, metadata !11} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
+!12 = metadata !{metadata !"0x101\00c\0050331652\000", metadata !0, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!13 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", null, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x101\00ptr\0016777227\000", metadata !6, metadata !1, metadata !9} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x101\00val\0033554443\000", metadata !6, metadata !1, metadata !11} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x101\00c\0050331659\000", metadata !6, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x101\00argc\0016777233\000", metadata !7, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x101\00argv\0033554449\000", metadata !7, metadata !1, metadata !19} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !20} ; [ DW_TAG_pointer_type ]
+!20 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !21} ; [ DW_TAG_pointer_type ]
+!21 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
+!22 = metadata !{metadata !"0x100\00dval\0019\000", metadata !23, metadata !1, metadata !11} ; [ DW_TAG_auto_variable ]
+!23 = metadata !{metadata !"0xb\0018\001\002", metadata !51, metadata !7} ; [ DW_TAG_lexical_block ]
 !24 = metadata !{i32 4, i32 22, metadata !0, null}
 !25 = metadata !{i32 4, i32 33, metadata !0, null}
 !26 = metadata !{i32 4, i32 52, metadata !0, null}
 !27 = metadata !{i32 6, i32 3, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !51, metadata !0, i32 5, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0xb\005\001\000", metadata !51, metadata !0} ; [ DW_TAG_lexical_block ]
 !29 = metadata !{i32 7, i32 3, metadata !28, null}
 !30 = metadata !{i32 11, i32 42, metadata !6, null}
 !31 = metadata !{i32 11, i32 53, metadata !6, null}
 !32 = metadata !{i32 11, i32 72, metadata !6, null}
 !33 = metadata !{i32 13, i32 3, metadata !34, null}
-!34 = metadata !{i32 786443, metadata !51, metadata !6, i32 12, i32 1, i32 1} ; [ DW_TAG_lexical_block ]
+!34 = metadata !{metadata !"0xb\0012\001\001", metadata !51, metadata !6} ; [ DW_TAG_lexical_block ]
 !35 = metadata !{i32 14, i32 3, metadata !34, null}
 !36 = metadata !{i32 17, i32 15, metadata !7, null}
 !37 = metadata !{i32 17, i32 28, metadata !7, null}
@@ -119,4 +119,4 @@
 !50 = metadata !{metadata !17, metadata !18, metadata !22}
 !51 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !52 = metadata !{i32 0}
-!53 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!53 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
index 71a696a..4374b9e 100644
--- a/test/CodeGen/ARM/debug-info-sreg2.ll
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll

@@ -1,26 +1,21 @@
-; RUN: llc < %s - | FileCheck %s
+; RUN: llc < %s - -filetype=obj | llvm-dwarfdump -debug-dump=loc - | FileCheck %s
 ; Radar 9376013
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-macosx10.6.7"
 
-;CHECK-LABEL: Lfunc_begin0:
-;CHECK: Ltmp[[K:[0-9]+]]:
-;CHECK: Ltmp[[L:[0-9]+]]:
-;CHECK-LABEL: Ldebug_loc0:
-;CHECK-NEXT:        .long   Ltmp[[K]]
-;CHECK-NEXT:        .long   Ltmp[[L]]
-;CHECK-NEXT: Lset[[N:[0-9]+]] = Ltmp{{[0-9]+}}-Ltmp[[M:[0-9]+]]        @ Loc expr size
-;CHECK-NEXT:        .short  Lset[[N]]
-;CHECK-NEXT: Ltmp[[M]]:
-;CHECK-NEXT:        .byte   144                     @ super-register
-;CHECK-NEXT:                                        @ DW_OP_regx
-;CHECK-NEXT:        .ascii
-;CHECK-NEXT:        .byte   {{[0-9]+}}              @ DW_OP_{{.*}}piece
+; Just making sure the first part of the location isn't a repetition
+; of the size of the location description.
+;
+; 0x90   DW_OP_regx of super-register
+
+; CHECK: 0x00000000: Beginning address offset:
+; CHECK-NEXT:           Ending address offset:
+; CHECK-NEXT:            Location description: 90 {{.. .. .. .. $}}
 
 define void @_Z3foov() optsize ssp {
 entry:
   %call = tail call float @_Z3barv() optsize, !dbg !11
-  tail call void @llvm.dbg.value(metadata !{float %call}, i64 0, metadata !5), !dbg !11
+  tail call void @llvm.dbg.value(metadata !{float %call}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !11
   %call16 = tail call float @_Z2f2v() optsize, !dbg !12
   %cmp7 = fcmp olt float %call, %call16, !dbg !12
   br i1 %cmp7, label %for.body, label %for.end, !dbg !12
@@ -43,22 +38,22 @@
 
 declare float @_Z2f3f(float) optsize
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{i32 786449, metadata !18, i32 4, metadata !"clang version 3.0 (trunk 130845)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !16, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z3foov, null, null, metadata !17, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 130845)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !16, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\001\005", metadata !18, metadata !2, metadata !3, null, void ()* @_Z3foov, null, null, metadata !17} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 786688, metadata !6, metadata !"k", metadata !2, i32 6, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
-!6 = metadata !{i32 786443, metadata !18, metadata !1, i32 5, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{i32 786468, null, metadata !0, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786688, metadata !9, metadata !"y", metadata !2, i32 8, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 786443, metadata !18, metadata !10, i32 7, i32 25, i32 2} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{i32 786443, metadata !18, metadata !6, i32 7, i32 3, i32 1} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x100\00k\006\000", metadata !6, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{metadata !"0xb\005\0012\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !0} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x100\00y\008\000", metadata !9, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0xb\007\0025\002", metadata !18, metadata !10} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0xb\007\003\001", metadata !18, metadata !6} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{i32 6, i32 18, metadata !6, null}
 !12 = metadata !{i32 7, i32 3, metadata !6, null}
 !13 = metadata !{i32 8, i32 20, metadata !9, null}
@@ -68,4 +63,4 @@
 !17 = metadata !{metadata !5, metadata !8}
 !18 = metadata !{metadata !"k.cc", metadata !"/private/tmp"}
 !19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/ARM/debug-segmented-stacks.ll b/test/CodeGen/ARM/debug-segmented-stacks.ll
index e866b4e..2123fa7 100644
--- a/test/CodeGen/ARM/debug-segmented-stacks.ll
+++ b/test/CodeGen/ARM/debug-segmented-stacks.ll

@@ -39,39 +39,37 @@
 ; ARM-linux       .cfi_same_value r5
 }
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/var.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"var.c", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_basic",
-  metadata !"test_basic", metadata !"", i32 5, metadata !6, i1 false, i1 true,
-  i32 0, i32 0, null, i32 256, i1 false, void ()* @test_basic, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00test_basic\00test_basic\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, void ()* @test_basic, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [sum]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/var.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5 "}
-!12 = metadata !{i32 786689, metadata !4, metadata !"count", metadata !5, i32 16777221, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [count] [line 5]
+!12 = metadata !{metadata !"0x101\00count\0016777221\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [count] [line 5]
 !13 = metadata !{i32 5, i32 0, metadata !4, null}
-!14 = metadata !{i32 786688, metadata !4, metadata !"vl", metadata !5, i32 6, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vl] [line 6]
-!15 = metadata !{i32 786454, metadata !16, null, metadata !"va_list", i32 30, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
+!14 = metadata !{metadata !"0x100\00vl\006\000", metadata !4, metadata !5, metadata !15} ; [ DW_TAG_auto_variable ] [vl] [line 6]
+!15 = metadata !{metadata !"0x16\00va_list\0030\000\000\000\000", metadata !16, null, metadata !17} ; [ DW_TAG_typedef ] [va_list] [line 30, size 0, align 0, offset 0] [from __builtin_va_list]
 !16 = metadata !{metadata !"/linux-x86_64-high/gcc_4.7.2/dbg/llvm/bin/../lib/clang/3.5/include/stdarg.h", metadata !"/tmp"}
-!17 = metadata !{i32 786454, metadata !1, null, metadata !"__builtin_va_list", i32 6, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
-!18 = metadata !{i32 786451, metadata !1, null, metadata !"__va_list", i32 6, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
+!17 = metadata !{metadata !"0x16\00__builtin_va_list\006\000\000\000\000", metadata !1, null, metadata !18} ; [ DW_TAG_typedef ] [__builtin_va_list] [line 6, size 0, align 0, offset 0] [from __va_list]
+!18 = metadata !{metadata !"0x13\00__va_list\006\0032\0032\000\000\000", metadata !1, null, null, metadata !19, null, null, null} ; [ DW_TAG_structure_type ] [__va_list] [line 6, size 32, align 32, offset 0] [def] [from ]
 !19 = metadata !{metadata !20}
-!20 = metadata !{i32 786445, metadata !1, metadata !18, metadata !"__ap", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
-!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
+!20 = metadata !{metadata !"0xd\00__ap\006\0032\0032\000\000", metadata !1, metadata !18, metadata !21} ; [ DW_TAG_member ] [__ap] [line 6, size 32, align 32, offset 0] [from ]
+!21 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from ]
 !22 = metadata !{i32 6, i32 0, metadata !4, null}
 !23 = metadata !{i32 7, i32 0, metadata !4, null}
-!24 = metadata !{i32 786688, metadata !4, metadata !"test_basic", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 8]
-!25 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
-!26 = metadata !{i32 786688, metadata !27, metadata !"i", metadata !5, i32 9, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 9]
-!27 = metadata !{i32 786443, metadata !1, metadata !4, i32 9, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!24 = metadata !{metadata !"0x100\00test_basic\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [sum] [line 8]
+!25 = metadata !{i32 8, i32 0, metadata !4, null}
+!26 = metadata !{metadata !"0x100\00i\009\000", metadata !27, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 9]
+!27 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
 !28 = metadata !{i32 9, i32 0, metadata !27, null}
 !29 = metadata !{i32 10, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !1, metadata !27, i32 9, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
+!30 = metadata !{metadata !"0xb\009\000\001", metadata !1, metadata !27} ; [ DW_TAG_lexical_block ] [/tmp/var.c]
 !31 = metadata !{i32 11, i32 0, metadata !30, null}
 !32 = metadata !{i32 12, i32 0, metadata !4, null}
 !33 = metadata !{i32 13, i32 0, metadata !4, null}

diff --git a/test/CodeGen/ARM/dwarf-unwind.ll b/test/CodeGen/ARM/dwarf-unwind.ll
new file mode 100644
index 0000000..5256db8
--- /dev/null
+++ b/test/CodeGen/ARM/dwarf-unwind.ll

@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=thumbv7-netbsd-eabi -o - %s | FileCheck %s
+declare void @bar()
+
+; ARM's frame lowering attempts to tack another callee-saved register onto the
+; list when it detects a potential misaligned VFP store. However, if there are
+; none available it used to just vpush anyway and misreport the location of the
+; registers in unwind info. Since there are benefits to aligned stores, it's
+; better to correct the code than the .cfi_offset directive.
+
+define void @test_dpr_align(i8 %l, i8 %r) {
+; CHECK-LABEL: test_dpr_align:
+; CHECK: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK: .cfi_def_cfa_offset 36
+; CHECK: sub sp, #4
+; CHECK: .cfi_def_cfa_offset 40
+; CHECK: vpush {d8}
+; CHECK: .cfi_offset d8, -48
+; CHECK-NOT: sub sp
+; [...]
+; CHECK: bl bar
+; CHECK-NOT: add sp
+; CHECK: vpop {d8}
+; CHECK: add sp, #4
+; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{d8}"()
+  call void @bar()
+  ret void
+}
+
+; The prologue (but not the epilogue) can be made more space efficient by
+; chucking an argument register into the list. Not worth it in general though,
+; "sub sp, #4" is likely faster.
+define void @test_dpr_align_tiny(i8 %l, i8 %r) minsize {
+; CHECK-LABEL: test_dpr_align_tiny:
+; CHECK: push.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NOT: sub sp
+; CHECK: vpush {d8}
+; CHECK: .cfi_offset d8, -48
+; CHECK-NOT: sub sp
+; [...]
+; CHECK: bl bar
+; CHECK-NOT: add sp
+; CHECK: vpop {d8}
+; CHECK: add sp, #4
+; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{d8}"()
+  call void @bar()
+  ret void
+}
+
+
+; However, we shouldn't do a 2-step align/adjust if there are no DPRs to be
+; saved.
+define void @test_nodpr_noalign(i8 %l, i8 %r) {
+; CHECK-LABEL: test_nodpr_noalign:
+; CHECK: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NOT: sub sp
+; CHECK: sub sp, #12
+; CHECK-NOT: sub sp
+; [...]
+; CHECK: bl bar
+; CHECK-NOT: add sp
+; CHECK: add sp, #12
+; CHECK-NOT: add sp
+; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+  alloca i64
+  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11}"()
+  call void @bar()
+  ret void
+}
+
+define void @test_frame_pointer_offset() minsize "no-frame-pointer-elim"="true" {
+; CHECK-LABEL: test_frame_pointer_offset:
+; CHECK: push.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK: .cfi_def_cfa_offset 40
+; CHECK: add r7, sp, #16
+; CHECK: .cfi_def_cfa r7, 24
+; CHECK-NOT: .cfi_def_cfa_offset
+  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{d8}"()
+  call void @bar()
+  ret void
+}
\ No newline at end of file

diff --git a/test/CodeGen/ARM/fabs-neon.ll b/test/CodeGen/ARM/fabs-neon.ll
index e3094aa..dc1dc32 100644
--- a/test/CodeGen/ARM/fabs-neon.ll
+++ b/test/CodeGen/ARM/fabs-neon.ll

@@ -15,3 +15,42 @@
     ret <2 x float> %foo
 }
 declare <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+
+; No constant pool loads or vector ops are needed for the fabs of a
+; bitcasted integer constant; we should just return integer constants
+; that have the sign bits turned off.
+;
+; So instead of something like this:
+; 	mvn	r0, #0
+; 	mov	r1, #0
+; 	vmov	d16, r1, r0
+; 	vabs.f32	d16, d16
+; 	vmov	r0, r1, d16
+; 	bx	lr
+;
+; We should generate:
+;	mov	r0, #0
+;	mvn	r1, #-2147483648
+;	bx	lr
+
+define i64 @fabs_v2f32_1() {
+; CHECK-LABEL: fabs_v2f32_1:
+; CHECK: mvn r1, #-2147483648
+; CHECK: bx lr
+; CHECK-NOT: vabs
+ %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}
+
+define i64 @fabs_v2f32_2() {
+; CHECK-LABEL: fabs_v2f32_2:
+; CHECK: mvn r0, #-2147483648
+; CHECK: bx lr
+; CHECK-NOT: vabs
+ %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}

diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index 2d7378e..74b31bd 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll

@@ -117,17 +117,11 @@
 ; ARM-LONG: blx [[R]]
 ; THUMB: @t10
 ; THUMB: movs [[R0:l?r[0-9]*]], #0
-; THUMB: movt [[R0]], #0
 ; THUMB: movs [[R1:l?r[0-9]*]], #248
-; THUMB: movt [[R1]], #0
 ; THUMB: movs [[R2:l?r[0-9]*]], #187
-; THUMB: movt [[R2]], #0
 ; THUMB: movs [[R3:l?r[0-9]*]], #28
-; THUMB: movt [[R3]], #0
 ; THUMB: movw [[R4:l?r[0-9]*]], #40
-; THUMB: movt [[R4]], #0
 ; THUMB: movw [[R5:l?r[0-9]*]], #186
-; THUMB: movt [[R5]], #0
 ; THUMB: and [[R0]], [[R0]], #255
 ; THUMB: and [[R1]], [[R1]], #255
 ; THUMB: and [[R2]], [[R2]], #255
@@ -250,4 +244,19 @@
   ret void
 }
 
+declare void @bar2(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6)
+
+define void @call_undef_args() {
+; ARM-LABEL: call_undef_args
+; ARM:       movw  r0, #1
+; ARM-NEXT:  movw  r1, #2
+; ARM-NEXT:  movw  r2, #3
+; ARM-NEXT:  movw  r3, #4
+; ARM-NOT:   str {{r[0-9]+}}, [sp]
+; ARM:       movw  [[REG:l?r[0-9]*]], #6
+; ARM-NEXT:  str [[REG]], [sp, #4]
+  call void @bar2(i32 1, i32 2, i32 3, i32 4, i32 undef, i32 6)
+  ret void
+}
+
 declare void @print(float)

diff --git a/test/CodeGen/ARM/fast-isel-deadcode.ll b/test/CodeGen/ARM/fast-isel-deadcode.ll
index 5e6666c..c3eed30 100644
--- a/test/CodeGen/ARM/fast-isel-deadcode.ll
+++ b/test/CodeGen/ARM/fast-isel-deadcode.ll

@@ -14,7 +14,6 @@
 ; THUMB-NOT: ldr
 ; THUMB-NOT: sxtb
 ; THUMB: movs r0, #0
-; THUMB: movt r0, #0
 ; THUMB: pop
   ret i32 0
 }

diff --git a/test/CodeGen/ARM/fast-isel-inline-asm.ll b/test/CodeGen/ARM/fast-isel-inline-asm.ll
new file mode 100644
index 0000000..2eb25ec
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-inline-asm.ll

@@ -0,0 +1,18 @@
+; RUN: llc -fast-isel < %s | FileCheck %s
+target datalayout = "e-m:o-p:32:32-i1:8:32-i8:8:32-i16:16:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.0.0"
+
+%0 = type opaque
+
+; Make sure that the inline asm starts right after the call to bar.
+define void @test_inline_asm_sideeffect(%0* %call) {
+; CHECK:      bl _bar
+; CHECK-NEXT: InlineAsm Start
+  call void @bar()
+  call void asm sideeffect "mov\09r7, r7\09\09@ marker", ""()
+  %1 = call %0* bitcast (i8* (i8*)* @foo to %0* (%0*)*)(%0* %call)
+  ret void
+}
+
+declare i8* @foo(i8*)
+declare void @bar()

diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index 089209e..b09931d 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll

@@ -31,9 +31,7 @@
 ; THUMB: {{(movt r0, :upper16:_?message1)|(ldr r0, \[r0\])}}
 ; THUMB: adds r0, #5
 ; THUMB: movs r1, #64
-; THUMB: movt r1, #0
 ; THUMB: movs r2, #10
-; THUMB: movt r2, #0
 ; THUMB: and r1, r1, #255
 ; THUMB: bl {{_?}}memset
 ; THUMB-LONG-LABEL: t1:
@@ -71,7 +69,6 @@
 ; THUMB: adds r1, r0, #4
 ; THUMB: adds r0, #16
 ; THUMB: movs r2, #17
-; THUMB: movt r2, #0
 ; THUMB: str r0, [sp[[SLOT:[, #0-9]*]]] @ 4-byte Spill
 ; THUMB: mov r0, r1
 ; THUMB: ldr r1,  [sp[[SLOT]]] @ 4-byte Reload
@@ -109,7 +106,6 @@
 ; THUMB: adds r1, r0, #4
 ; THUMB: adds r0, #16
 ; THUMB: movs r2, #10
-; THUMB: movt r2, #0
 ; THUMB: str r0, [sp[[SLOT:[, #0-9]*]]] @ 4-byte Spill
 ; THUMB: mov r0, r1
 ; THUMB: ldr r1,  [sp[[SLOT]]] @ 4-byte Reload

diff --git a/test/CodeGen/ARM/fast-isel-mvn.ll b/test/CodeGen/ARM/fast-isel-mvn.ll
index 0bc9395..886f2da 100644
--- a/test/CodeGen/ARM/fast-isel-mvn.ll
+++ b/test/CodeGen/ARM/fast-isel-mvn.ll

@@ -1,17 +1,14 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=false -mtriple=armv7-apple-ios     < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=false -mtriple=armv7-linux-gnueabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=false -mtriple=thumbv7-apple-ios   < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=true  -mtriple=thumbv7-apple-ios   < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -arm-use-movt=true  -mtriple=armv7-apple-ios     < %s | FileCheck %s --check-prefix=MOVT
 ; rdar://10412592
 
-; Note: The Thumb code is being generated by the target-independent selector.
-
 define void @t1() nounwind {
 entry:
-; ARM: t1
-; THUMB: t1
-; ARM: mvn r0, #0
-; THUMB: movw r0, #65535
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t1
+; CHECK:       mvn r0, #0
   call void @foo(i32 -1)
   ret void
 }
@@ -20,22 +17,16 @@
 
 define void @t2() nounwind {
 entry:
-; ARM: t2
-; THUMB: t2
-; ARM: mvn r0, #233
-; THUMB: movw r0, #65302
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t2
+; CHECK:       mvn r0, #233
   call void @foo(i32 -234)
   ret void
 }
 
 define void @t3() nounwind {
 entry:
-; ARM: t3
-; THUMB: t3
-; ARM: mvn	r0, #256
-; THUMB: movw r0, #65279
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t3
+; CHECK:       mvn r0, #256
   call void @foo(i32 -257)
   ret void
 }
@@ -43,66 +34,60 @@
 ; Load from constant pool
 define void @t4() nounwind {
 entry:
-; ARM: t4
-; THUMB: t4
-; ARM: ldr	r0
-; THUMB: movw r0, #65278
-; THUMB: movt r0, #65535
+; ARM-LABEL:   t4
+; ARM:         ldr r0
+; THUMB-LABEL: t4
+; THUMB:       movw r0, #65278
+; THUMB:       movt r0, #65535
   call void @foo(i32 -258)
   ret void
 }
 
 define void @t5() nounwind {
 entry:
-; ARM: t5
-; THUMB: t5
-; ARM: mvn r0, #65280
-; THUMB: movs r0, #255
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t5
+; CHECK:       mvn r0, #65280
   call void @foo(i32 -65281)
   ret void
 }
 
 define void @t6() nounwind {
 entry:
-; ARM: t6
-; THUMB: t6
-; ARM: mvn r0, #978944
-; THUMB: movw r0, #4095
-; THUMB: movt r0, #65521
+; CHECK-LABEL: t6
+; CHECK:       mvn r0, #978944
   call void @foo(i32 -978945)
   ret void
 }
 
 define void @t7() nounwind {
 entry:
-; ARM: t7
-; THUMB: t7
-; ARM: mvn r0, #267386880
-; THUMB: movw r0, #65535
-; THUMB: movt r0, #61455
+; CHECK-LABEL: t7
+; CHECK:       mvn r0, #267386880
   call void @foo(i32 -267386881)
   ret void
 }
 
 define void @t8() nounwind {
 entry:
-; ARM: t8
-; THUMB: t8
-; ARM: mvn r0, #65280
-; THUMB: movs r0, #255
-; THUMB: movt r0, #65535
+; CHECK-LABEL: t8
+; CHECK:       mvn r0, #65280
   call void @foo(i32 -65281)
   ret void
 }
 
 define void @t9() nounwind {
 entry:
-; ARM: t9
-; THUMB: t9
-; ARM: mvn r0, #2130706432
-; THUMB: movw r0, #65535
-; THUMB: movt r0, #33023
+; CHECK-LABEL: t9
+; CHECK:       mvn r0, #2130706432
   call void @foo(i32 -2130706433)
   ret void
 }
+
+; Load from constant pool.
+define i32 @t10(i32 %a) {
+; MOVT-LABEL: t10
+; MOVT:       ldr
+  %1 = xor i32 -1998730207, %a
+  ret i32 %1
+}
+

diff --git a/test/CodeGen/ARM/fast-isel-select.ll b/test/CodeGen/ARM/fast-isel-select.ll
index 40f8807..549c97e 100644
--- a/test/CodeGen/ARM/fast-isel-select.ll
+++ b/test/CodeGen/ARM/fast-isel-select.ll

@@ -12,7 +12,6 @@
 ; ARM: mov r0, r{{[1-9]}}
 ; THUMB: t1
 ; THUMB: movs r{{[1-9]}}, #10
-; THUMB: movt r{{[1-9]}}, #0
 ; THUMB: cmp r0, #0
 ; THUMB: it eq
 ; THUMB: moveq r{{[1-9]}}, #20
@@ -59,13 +58,12 @@
 ; ARM: cmp r0, #0
 ; ARM: mvneq r{{[1-9]}}, #0
 ; ARM: mov r0, r{{[1-9]}}
-; THUMB: t4
-; THUMB: movw r{{[1-9]}}, #65526
-; THUMB: movt r{{[1-9]}}, #65535
+; THUMB-LABEL: t4
+; THUMB: mvn [[REG:r[1-9]+]], #9
 ; THUMB: cmp r0, #0
 ; THUMB: it eq
-; THUMB: mvneq r{{[1-9]}}, #0
-; THUMB: mov r0, r{{[1-9]}}
+; THUMB: mvneq [[REG]], #0
+; THUMB: mov r0, [[REG]]
   %0 = select i1 %c, i32 -10, i32 -1
   ret i32 %0
 }

diff --git a/test/CodeGen/ARM/fast-isel-vararg.ll b/test/CodeGen/ARM/fast-isel-vararg.ll
index 0b7b0bd..3ff2b15 100644
--- a/test/CodeGen/ARM/fast-isel-vararg.ll
+++ b/test/CodeGen/ARM/fast-isel-vararg.ll

@@ -29,7 +29,6 @@
 ; ARM: bl {{_?CallVariadic}}
 ; THUMB: sub sp, #32
 ; THUMB: movs r0, #5
-; THUMB: movt r0, #0
 ; THUMB: ldr r1, [sp, #28]
 ; THUMB: ldr r2, [sp, #24]
 ; THUMB: ldr r3, [sp, #20]

diff --git a/test/CodeGen/ARM/fnegs.ll b/test/CodeGen/ARM/fnegs.ll
index 36af835..65fe9e3 100644
--- a/test/CodeGen/ARM/fnegs.ll
+++ b/test/CodeGen/ARM/fnegs.ll

@@ -1,9 +1,12 @@
 ; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=VFP2
 
-; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \
+; RUN: llc -mtriple=arm-eabi -mattr=+neon,-neonfp %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=NFP0
 
+; RUN: llc -mtriple=arm-eabi -mattr=+neon,+neonfp %s -o - \
+; RUN:  | FileCheck %s -check-prefix=NFP1
+
 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \
 ; RUN:  | FileCheck %s -check-prefix=CORTEXA8
 
@@ -70,3 +73,49 @@
 ; CORTEXA9-LABEL: test2:
 ; CORTEXA9: 	vneg.f32	s{{.*}}, s{{.*}}
 
+; If we're bitcasting an integer to an FP vector, we should avoid the FP/vector unit entirely.
+; Make sure that we're flipping the sign bit and only the sign bit of each float (PR20354).
+; So instead of something like this:
+;    vmov     d16, r0, r1
+;    vneg.f32 d16, d16
+;    vmov     r0, r1, d16
+;
+; We should generate:
+;    eor     r0, r0, #-2147483648
+;    eor     r1, r1, #-2147483648
+
+define <2 x float> @fneg_bitcast(i64 %i) {
+  %bitcast = bitcast i64 %i to <2 x float>
+  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
+  ret <2 x float> %fneg
+}
+; VFP2-LABEL: fneg_bitcast:
+; VFP2-DAG: eor r0, r0, #-2147483648
+; VFP2-DAG: eor r1, r1, #-2147483648
+; VFP2-NOT:  vneg.f32
+
+; NFP1-LABEL: fneg_bitcast:
+; NFP1-DAG: eor r0, r0, #-2147483648
+; NFP1-DAG: eor r1, r1, #-2147483648
+; NFP1-NOT: vneg.f32
+
+; NFP0-LABEL: fneg_bitcast:
+; NFP0-DAG: eor r0, r0, #-2147483648
+; NFP0-DAG: eor r1, r1, #-2147483648
+; NFP0-NOT: vneg.f32
+
+; CORTEXA8-LABEL: fneg_bitcast:
+; CORTEXA8-DAG: eor r0, r0, #-2147483648
+; CORTEXA8-DAG: eor r1, r1, #-2147483648
+; CORTEXA8-NOT:         vneg.f32
+
+; CORTEXA8U-LABEL: fneg_bitcast:
+; CORTEXA8U-DAG: eor r0, r0, #-2147483648
+; CORTEXA8U-DAG: eor r1, r1, #-2147483648
+; CORTEXA8U-NOT:        vneg.f32
+
+; CORTEXA9-LABEL: fneg_bitcast:
+; CORTEXA9-DAG: eor r0, r0, #-2147483648
+; CORTEXA9-DAG: eor r1, r1, #-2147483648
+; CORTEXA9-NOT:         vneg.f32
+

diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll
index eb0120f..514d4a9 100644
--- a/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll

@@ -167,9 +167,9 @@
 define void @test_varsize(...) minsize {
 ; CHECK-T1-LABEL: test_varsize:
 ; CHECK-T1: sub	sp, #16
-; CHECK-T1: push	{r2, r3, r4, r5, r7, lr}
+; CHECK-T1: push	{r5, r6, r7, lr}
 ; ...
-; CHECK-T1: pop	{r2, r3, r4, r5, r7}
+; CHECK-T1: pop	{r2, r3, r7}
 ; CHECK-T1: pop	{r3}
 ; CHECK-T1: add	sp, #16
 ; CHECK-T1: bx	r3
@@ -183,6 +183,7 @@
 ; CHECK: bx	lr
 
   %var = alloca i8, i32 8
+  call void @llvm.va_start(i8* %var)
   call void @bar(i8* %var)
   ret void
 }
@@ -216,3 +217,5 @@
 exit:                                             ; preds = %if.then, %entry
   ret float %call1
 }
+
+declare void @llvm.va_start(i8*) nounwind

diff --git a/test/CodeGen/ARM/fp16.ll b/test/CodeGen/ARM/fp16.ll
index fba7946..5a926ac 100644
--- a/test/CodeGen/ARM/fp16.ll
+++ b/test/CodeGen/ARM/fp16.ll

@@ -1,32 +1,84 @@
 ; RUN: llc < %s | FileCheck %s
 ; RUN: llc -mattr=+vfp3,+fp16 < %s | FileCheck --check-prefix=CHECK-FP16 %s
+; RUN: llc -mtriple=armv8-eabihf < %s | FileCheck --check-prefix=CHECK-ARMV8 %s
+; RUN: llc -mtriple=thumbv7m-eabi < %s | FileCheck --check-prefix=CHECK-SOFTFLOAT %s
+
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32"
-target triple = "armv7-eabi"
+target triple = "armv7---eabihf"
 
 @x = global i16 12902
 @y = global i16 0
 @z = common global i16 0
 
-define arm_aapcs_vfpcc void @foo() nounwind {
+define void @foo() nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK-FP16-LABEL: foo:
+; CHECK-ARMV8-LABEL: foo:
+; CHECK-SOFTFLOAT-LABEL: foo:
 entry:
   %0 = load i16* @x, align 2
   %1 = load i16* @y, align 2
-  %2 = tail call float @llvm.convert.from.fp16(i16 %0)
+  %2 = tail call float @llvm.convert.from.fp16.f32(i16 %0)
 ; CHECK: __gnu_h2f_ieee
 ; CHECK-FP16: vcvtb.f32.f16
-  %3 = tail call float @llvm.convert.from.fp16(i16 %1)
+; CHECK-ARMV8: vcvtb.f32.f16
+; CHECK-SOFTFLOAT: __gnu_h2f_ieee
+  %3 = tail call float @llvm.convert.from.fp16.f32(i16 %1)
 ; CHECK: __gnu_h2f_ieee
 ; CHECK-FP16: vcvtb.f32.f16
+; CHECK-ARMV8: vcvtb.f32.f16
+; CHECK-SOFTFLOAT: __gnu_h2f_ieee
   %4 = fadd float %2, %3
-  %5 = tail call i16 @llvm.convert.to.fp16(float %4)
+  %5 = tail call i16 @llvm.convert.to.fp16.f32(float %4)
 ; CHECK: __gnu_f2h_ieee
 ; CHECK-FP16: vcvtb.f16.f32
+; CHECK-ARMV8: vcvtb.f16.f32
+; CHECK-SOFTFLOAT: __gnu_f2h_ieee
   store i16 %5, i16* @x, align 2
   ret void
 }
 
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
+define double @test_from_fp16(i16 %in) {
+; CHECK-LABEL: test_from_fp16:
+; CHECK-FP16-LABEL: test_from_fp16:
+; CHECK-ARMV8-LABEL: test_from_fp16:
+; CHECK-SOFTFLOAT-LABEL: test_from_fp16:
+  %val = call double @llvm.convert.from.fp16.f64(i16 %in)
+; CHECK: bl __gnu_h2f_ieee
+; CHECK: vmov [[TMP:s[0-9]+]], r0
+; CHECK: vcvt.f64.f32 d0, [[TMP]]
 
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
+; CHECK-FP16: vmov [[TMP16:s[0-9]+]], r0
+; CHECK-FP16: vcvtb.f32.f16 [[TMP32:s[0-9]+]], [[TMP16]]
+; CHECK-FP16: vcvt.f64.f32 d0, [[TMP32]]
+
+; CHECK-ARMV8: vmov [[TMP:s[0-9]+]], r0
+; CHECK-ARMV8: vcvtb.f64.f16 d0, [[TMP]]
+
+; CHECK-SOFTFLOAT: bl __gnu_h2f_ieee
+; CHECK-SOFTFLOAT: bl __aeabi_f2d
+  ret double %val
+}
+
+define i16 @test_to_fp16(double %in) {
+; CHECK-LABEL: test_to_fp16:
+; CHECK-FP16-LABEL: test_to_fp16:
+; CHECK-ARMV8-LABEL: test_to_fp16:
+; CHECK-SOFTFLOAT-LABEL: test_to_fp16:
+  %val = call i16 @llvm.convert.to.fp16.f64(double %in)
+; CHECK: bl __aeabi_d2h
+
+; CHECK-FP16: bl __aeabi_d2h
+
+; CHECK-ARMV8: vcvtb.f16.f64 [[TMP:s[0-9]+]], d0
+; CHECK-ARMV8: vmov r0, [[TMP]]
+
+; CHECK-SOFTFLOAT: bl __aeabi_d2h
+  ret i16 %val
+}
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone

diff --git a/test/CodeGen/ARM/fpcmp-f64-neon-opt.ll b/test/CodeGen/ARM/fpcmp-f64-neon-opt.ll
new file mode 100644
index 0000000..7444a68
--- /dev/null
+++ b/test/CodeGen/ARM/fpcmp-f64-neon-opt.ll

@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=linux-arm-gnueabihf -mattr=+neon %s -o - | FileCheck %s
+
+; Check that no intermediate integer register is used.
+define i32 @no-intermediate-register-for-zero-imm(double %x) #0 {
+entry:
+; CHECK-LABEL: no-intermediate-register-for-zero-imm
+; CHECK-NOT: vmov
+; CHECK: vcmp
+  %cmp = fcmp une double %x, 0.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}

diff --git a/test/CodeGen/ARM/half.ll b/test/CodeGen/ARM/half.ll
new file mode 100644
index 0000000..10cebb3
--- /dev/null
+++ b/test/CodeGen/ARM/half.ll

@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OLD
+; RUN: llc < %s -mtriple=thumbv7s-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-F16
+; RUN: llc < %s -mtriple=thumbv8-apple-ios7.0 | FileCheck %s --check-prefix=CHECK  --check-prefix=CHECK-V8
+
+define void @test_load_store(half* %in, half* %out) {
+; CHECK-LABEL: test_load_store:
+; CHECK: ldrh [[TMP:r[0-9]+]], [r0]
+; CHECK: strh [[TMP]], [r1]
+  %val = load half* %in
+  store half %val, half* %out
+  ret void
+}
+
+define i16 @test_bitcast_from_half(half* %addr) {
+; CHECK-LABEL: test_bitcast_from_half:
+; CHECK: ldrh r0, [r0]
+  %val = load half* %addr
+  %val_int = bitcast half %val to i16
+  ret i16 %val_int
+}
+
+define void @test_bitcast_to_half(half* %addr, i16 %in) {
+; CHECK-LABEL: test_bitcast_to_half:
+; CHECK: strh r1, [r0]
+  %val_fp = bitcast i16 %in to half
+  store half %val_fp, half* %addr
+  ret void
+}
+
+define float @test_extend32(half* %addr) {
+; CHECK-LABEL: test_extend32:
+
+; CHECK-OLD: b.w ___gnu_h2f_ieee
+; CHECK-F16: vcvtb.f32.f16
+; CHECK-V8: vcvtb.f32.f16
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to float
+  ret float %val32
+}
+
+define double @test_extend64(half* %addr) {
+; CHECK-LABEL: test_extend64:
+
+; CHECK-OLD: blx ___gnu_h2f_ieee
+; CHECK-OLD: vcvt.f64.f32
+; CHECK-F16: vcvtb.f32.f16
+; CHECK-F16: vcvt.f64.f32
+; CHECK-V8: vcvtb.f64.f16
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to double
+  ret double %val32
+}
+
+define void @test_trunc32(float %in, half* %addr) {
+; CHECK-LABEL: test_trunc32:
+
+; CHECK-OLD: blx ___gnu_f2h_ieee
+; CHECK-F16: vcvtb.f16.f32
+; CHECK-V8: vcvtb.f16.f32
+  %val16 = fptrunc float %in to half
+  store half %val16, half* %addr
+  ret void
+}
+
+define void @test_trunc64(double %in, half* %addr) {
+; CHECK-LABEL: test_trunc64:
+
+; CHECK-OLD: blx ___truncdfhf2
+; CHECK-F16: blx ___truncdfhf2
+; CHECK-V8: vcvtb.f16.f64
+  %val16 = fptrunc double %in to half
+  store half %val16, half* %addr
+  ret void
+}

diff --git a/test/CodeGen/ARM/inlineasm-global.ll b/test/CodeGen/ARM/inlineasm-global.ll
new file mode 100644
index 0000000..fd210f4
--- /dev/null
+++ b/test/CodeGen/ARM/inlineasm-global.ll

@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=thumb-unknown-unknown -no-integrated-as < %s | FileCheck %s --check-prefix=THUMB
+; RUN: llc -mtriple=arm-unknown-unknown -no-integrated-as < %s | FileCheck %s --check-prefix=ARM
+
+; In thumb mode, emit ".code 16" before global inline-asm instructions.
+
+; THUMB: .code 16
+; THUMB: stmib
+; THUMB: .code 16
+
+; ARM-NOT: .code 16
+; ARM:     stmib
+
+module asm "stmib sp, {r0-r14};"

diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll
index cb67dd9..96d1ee2 100644
--- a/test/CodeGen/ARM/interrupt-attr.ll
+++ b/test/CodeGen/ARM/interrupt-attr.ll

@@ -40,7 +40,7 @@
 ; CHECK-M: mov r4, sp
 ; CHECK-M: bic r4, r4, #7
 ; CHECK-M: mov sp, r4
-; CHECK-M: blx _bar
+; CHECK-M: bl _bar
 ; CHECK-M: sub.w r4, r11, #8
 ; CHECK-M: mov sp, r4
 ; CHECK-M: pop.w {r4, r10, r11, pc}

diff --git a/test/CodeGen/ARM/invalid-target.ll b/test/CodeGen/ARM/invalid-target.ll
new file mode 100644
index 0000000..bb0ada4
--- /dev/null
+++ b/test/CodeGen/ARM/invalid-target.ll

@@ -0,0 +1,32 @@
+; RUN: not llc -mtriple armvinvalid-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=ARMVINVALID
+
+; RUN: not llc -mtriple armebvinvalid-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=ARMEBVINVALID
+
+; RUN: not llc -mtriple thumbvinvalid-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=THUMBVINVALID
+
+; RUN: not llc -mtriple thumbebvinvalid-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=THUMBEBVINVALID
+
+; RUN: not llc -mtriple thumbv2-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=THUMBV2
+
+; RUN: not llc -mtriple thumbv3-linux-gnueabi %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=THUMBV3
+
+; RUN: not llc -mtriple arm64invalid-linux-gnu %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=ARM64INVALID
+
+; RUN: not llc -mtriple aarch64invalid-linux-gnu %s -o - 2>&1 | \
+; RUN: FileCheck %s --check-prefix=AARCH64INVALID
+
+; ARMVINVALID: error: unable to get target for 'armvinvalid--linux-gnueabi'
+; ARMEBVINVALID: error: unable to get target for 'armebvinvalid--linux-gnueabi'
+; THUMBVINVALID: error: unable to get target for 'thumbvinvalid--linux-gnueabi'
+; THUMBEBVINVALID: error: unable to get target for 'thumbebvinvalid--linux-gnueabi'
+; THUMBV2: error: unable to get target for 'thumbv2--linux-gnueabi'
+; THUMBV3: error: unable to get target for 'thumbv3--linux-gnueabi'
+; ARM64INVALID: error: unable to get target for 'arm64invalid--linux-gnu'
+; AARCH64INVALID: error: unable to get target for 'aarch64invalid--linux-gnu'

diff --git a/test/CodeGen/ARM/jump_tables.ll b/test/CodeGen/ARM/jump_tables.ll
deleted file mode 100644
index 907a86c..0000000
--- a/test/CodeGen/ARM/jump_tables.ll
+++ /dev/null

@@ -1,32 +0,0 @@
-; RUN: llc <%s -mtriple=arm-unknown-linux-gnueabi -jump-table-type=single | FileCheck --check-prefix=ARM %s
-; RUN: llc <%s -mtriple=thumb-unknown-linux-gnueabi -jump-table-type=single | FileCheck --check-prefix=THUMB %s
-
-define void @indirect_fun() unnamed_addr jumptable {
-  ret void
-}
-define void ()* @get_fun() {
-  ret void ()* @indirect_fun
-
-; ARM:         ldr     r0, [[LABEL:.*]]
-; ARM:         mov     pc, lr
-; ARM: [[LABEL]]:
-; ARM:         .long   __llvm_jump_instr_table_0_1
-
-; THUMB:         ldr     r0, [[LABEL:.*]]
-; THUMB:         bx      lr
-; THUMB: [[LABEL]]:
-; THUMB:         .long   __llvm_jump_instr_table_0_1
-}
-
-; ARM:         .globl  __llvm_jump_instr_table_0_1
-; ARM:         .align  3
-; ARM:         .type   __llvm_jump_instr_table_0_1,%function
-; ARM: __llvm_jump_instr_table_0_1:
-; ARM:         b     indirect_fun(PLT)
-
-; THUMB:         .globl  __llvm_jump_instr_table_0_1
-; THUMB:         .align  3
-; THUMB:         .thumb_func
-; THUMB:         .type   __llvm_jump_instr_table_0_1,%function
-; THUMB: __llvm_jump_instr_table_0_1:
-; THUMB:         b     indirect_fun(PLT)

diff --git a/test/CodeGen/ARM/negative-offset.ll b/test/CodeGen/ARM/negative-offset.ll
new file mode 100644
index 0000000..7b949fd
--- /dev/null
+++ b/test/CodeGen/ARM/negative-offset.ll

@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=arm-eabi -O3 %s -o - | FileCheck %s
+
+; Function Attrs: nounwind readonly
+define arm_aapcscc i32 @sum(i32* nocapture readonly %p) #0 {
+entry:
+;CHECK-LABEL: sum:
+;CHECK-NOT: sub
+;CHECK: ldr r{{.*}}, [r0, #-16]
+;CHECK: ldr r{{.*}}, [r0, #-8]
+  %arrayidx = getelementptr inbounds i32* %p, i32 -4
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %p, i32 -2
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  ret i32 %add
+}
+

diff --git a/test/CodeGen/ARM/no-tail-call.ll b/test/CodeGen/ARM/no-tail-call.ll
new file mode 100644
index 0000000..3a8cb21
--- /dev/null
+++ b/test/CodeGen/ARM/no-tail-call.ll

@@ -0,0 +1,84 @@
+; RUN: llc < %s -O0 -o - | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "armv7s-apple-ios7"
+
+%foo = type <{ %Sf }>
+%Sf = type <{ float }>
+
+declare float @llvm.ceil.f32(float) 
+
+; Check that we are not emitting a tail call for the last call to ceil.
+; This function returns three different results.
+; CHECK-LABEL: func1:
+; CHECK-NOT: b _ceilf
+; CHECK: pop
+define { float, float, float } @func1() {
+entry:
+  %0 = alloca %foo, align 4
+  %1 = alloca %foo, align 4
+  %2 = alloca %foo, align 4
+  %.native = getelementptr inbounds %foo* %0, i32 0, i32 0
+  %.native.value = getelementptr inbounds %Sf* %.native, i32 0, i32 0
+  store float 0.000000e+00, float* %.native.value, align 4
+  %.native1 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native1.value = getelementptr inbounds %Sf* %.native1, i32 0, i32 0
+  store float 1.000000e+00, float* %.native1.value, align 4
+  %.native2 = getelementptr inbounds %foo* %2, i32 0, i32 0
+  %.native2.value = getelementptr inbounds %Sf* %.native2, i32 0, i32 0
+  store float 5.000000e+00, float* %.native2.value, align 4
+  br i1 true, label %3, label %4
+
+; <label>:3                                       ; preds = %entry
+  %.native4 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native4.value = getelementptr inbounds %Sf* %.native4, i32 0, i32 0
+  store float 2.000000e+00, float* %.native4.value, align 4
+  br label %4
+
+; <label>:4                                       ; preds = %3, %entry
+  %5 = call float @llvm.ceil.f32(float 5.000000e+00)
+  %.native3 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native3.value = getelementptr inbounds %Sf* %.native3, i32 0, i32 0
+  %6 = load float* %.native3.value, align 4
+  %7 = call float @llvm.ceil.f32(float %6)
+  %8 = insertvalue { float, float, float } { float 0.000000e+00, float undef, float undef }, float %5, 1
+  %9 = insertvalue { float, float, float } %8, float %7, 2
+  ret { float, float, float } %9
+}
+
+; Check that we are not emitting a tail call for the last call to ceil.
+; This function returns two different results.
+; CHECK-LABEL: func2:
+; CHECK-NOT: b _ceilf
+; CHECK: pop
+define { float, float } @func2() {
+entry:
+  %0 = alloca %foo, align 4
+  %1 = alloca %foo, align 4
+  %2 = alloca %foo, align 4
+  %.native = getelementptr inbounds %foo* %0, i32 0, i32 0
+  %.native.value = getelementptr inbounds %Sf* %.native, i32 0, i32 0
+  store float 0.000000e+00, float* %.native.value, align 4
+  %.native1 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native1.value = getelementptr inbounds %Sf* %.native1, i32 0, i32 0
+  store float 1.000000e+00, float* %.native1.value, align 4
+  %.native2 = getelementptr inbounds %foo* %2, i32 0, i32 0
+  %.native2.value = getelementptr inbounds %Sf* %.native2, i32 0, i32 0
+  store float 5.000000e+00, float* %.native2.value, align 4
+  br i1 true, label %3, label %4
+
+; <label>:3                                       ; preds = %entry
+  %.native4 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native4.value = getelementptr inbounds %Sf* %.native4, i32 0, i32 0
+  store float 2.000000e+00, float* %.native4.value, align 4
+  br label %4
+
+; <label>:4                                       ; preds = %3, %entry
+  %5 = call float @llvm.ceil.f32(float 5.000000e+00)
+  %.native3 = getelementptr inbounds %foo* %1, i32 0, i32 0
+  %.native3.value = getelementptr inbounds %Sf* %.native3, i32 0, i32 0
+  %6 = load float* %.native3.value, align 4
+  %7 = call float @llvm.ceil.f32(float %6)
+  %8 = insertvalue { float, float } { float 0.000000e+00, float undef }, float %7, 1
+  ret { float, float } %8
+}
+

diff --git a/test/CodeGen/ARM/none-macho-v4t.ll b/test/CodeGen/ARM/none-macho-v4t.ll
new file mode 100644
index 0000000..4c6e68e
--- /dev/null
+++ b/test/CodeGen/ARM/none-macho-v4t.ll

@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=thumb-none-macho -mcpu=arm7tdmi %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-none-macho -mcpu=arm7tdmi %s -filetype=obj -o /dev/null
+
+declare void @callee()
+
+define void @test_call() {
+  ; BX can only take a register before v5t came along, so we must materialise
+  ; the address properly.
+; CHECK-LABEL: test_call:
+; CHECK: ldr r[[CALLEE_STUB:[0-9]+]], [[LITPOOL:LCPI[0-9]+_[0-9]+]]
+; CHECK: [[PC_LABEL:LPC[0-9]+_[0-9]+]]:
+; CHECK-NEXT: add r[[CALLEE_STUB]], pc
+; CHECK: ldr [[CALLEE:r[0-9]+]], [r[[CALLEE_STUB]]]
+; CHECK: mov lr, pc
+; CHECK: bx [[CALLEE]]
+
+; CHECK: [[LITPOOL]]:
+; CHECK-NEXT: .long L_callee$non_lazy_ptr-([[PC_LABEL]]+4)
+  call void @callee()
+  ret void
+}

diff --git a/test/CodeGen/ARM/none-macho.ll b/test/CodeGen/ARM/none-macho.ll
index 60c2171..2a7878f 100644
--- a/test/CodeGen/ARM/none-macho.ll
+++ b/test/CodeGen/ARM/none-macho.ll

@@ -84,7 +84,7 @@
 
   ; Soft-float calls should be GNU-style rather than RTABI and should not be the
   ; *vfp variants used for ARMv6 iOS.
-; CHECK: blx ___addsf3{{$}}
+; CHECK: bl ___addsf3{{$}}
   ret float %sum
 }
 

diff --git a/test/CodeGen/ARM/out-of-registers.ll b/test/CodeGen/ARM/out-of-registers.ll
new file mode 100644
index 0000000..790e416
--- /dev/null
+++ b/test/CodeGen/ARM/out-of-registers.ll

@@ -0,0 +1,42 @@
+; RUN: llc -O3 %s -o - | FileCheck %s
+; ModuleID = 'fo.c'
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n8:16:32-S64"
+target triple = "thumbv7-none-linux-gnueabi"
+
+; CHECK: vpush
+; CHECK: vpop
+
+define void @foo(float* nocapture %A) #0 {
+  %1= bitcast float* %A to i8*
+  %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
+  %divp_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %3
+  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
+  %div3p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %4
+  %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
+  %div8p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %5
+  %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
+  %div13p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %6
+  tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1
+
+; Function Attrs: nounwind readonly
+
+; Function Attrs: nounwind
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) #2
+
+; Function Attrs: nounwind
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"Snapdragon LLVM ARM Compiler 3.4"}
+!1 = metadata !{metadata !1}

diff --git a/test/CodeGen/ARM/pr18364-movw.ll b/test/CodeGen/ARM/pr18364-movw.ll
new file mode 100644
index 0000000..fdcf154
--- /dev/null
+++ b/test/CodeGen/ARM/pr18364-movw.ll

@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=armv5te | FileCheck %s --check-prefix=V5
+; RUN: llc < %s -mtriple=armv6   | FileCheck %s --check-prefix=V6
+; RUN: llc < %s -mtriple=armv6t2 | FileCheck %s --check-prefix=V6T2
+; RUN: llc < %s -mtriple=armv7   | FileCheck %s --check-prefix=V7
+; PR18364
+
+define i64 @f() #0 {
+entry:
+; V5-NOT: movw
+; V6-NOT: movw
+; V6T2: movw
+; V7: movw
+  %y = alloca i64, align 8
+  %z = alloca i64, align 8
+  store i64 1, i64* %y, align 8
+  store i64 11579764786944, i64* %z, align 8
+  %0 = load i64* %y, align 8
+  %1 = load i64* %z, align 8
+  %sub = sub i64 %0, %1
+  ret i64 %sub
+}
+
+define i64 @g(i64 %a, i32 %b) #0 {
+entry:
+; V5-NOT: movw
+; V6-NOT: movw
+; V6T2: movw
+; V7: movw
+  %0 = mul i64 %a, 86400000
+  %mul = add i64 %0, -210866803200000
+  %conv = sext i32 %b to i64
+  %add = add nsw i64 %mul, %conv
+  ret i64 %add
+}

diff --git a/test/CodeGen/ARM/preferred-align.ll b/test/CodeGen/ARM/preferred-align.ll
new file mode 100644
index 0000000..8cd4ef6
--- /dev/null
+++ b/test/CodeGen/ARM/preferred-align.ll

@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=armv7-linux-gnueabi %s -o - | FileCheck %s
+
+@var_agg = global {i8, i8} zeroinitializer
+
+; CHECK: .globl var_agg
+; CHECK-NEXT: .align 2
+
+@var1 = global i1 zeroinitializer
+
+; CHECK: .globl var1
+; CHECK-NOT: .align
+
+@var8 = global i8 zeroinitializer
+
+; CHECK: .globl var8
+; CHECK-NOT: .align
+
+@var16 = global i16 zeroinitializer
+
+; CHECK: .globl var16
+; CHECK-NEXT: .align 1
\ No newline at end of file

diff --git a/test/CodeGen/ARM/prefetch.ll b/test/CodeGen/ARM/prefetch.ll
index 7350e0a..7fdc5b6 100644
--- a/test/CodeGen/ARM/prefetch.ll
+++ b/test/CodeGen/ARM/prefetch.ll

@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=thumb-eabi -mattr=-thumb2 %s -o - | FileCheck %s -check-prefix CHECK-T1
 ; RUN: llc -mtriple=thumb-eabi -mattr=+v7 %s -o - | FileCheck %s -check-prefix=THUMB2
 ; RUN: llc -mtriple=arm-eabi -mattr=+v7 %s -o - | FileCheck %s -check-prefix=ARM
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9-mp %s -o - | FileCheck %s -check-prefix=ARM-MP
+; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=ARM-MP
 ; rdar://8601536
 
 ; CHECK-T1-NOT: pld

diff --git a/test/CodeGen/ARM/rbit.ll b/test/CodeGen/ARM/rbit.ll
new file mode 100644
index 0000000..41f866f
--- /dev/null
+++ b/test/CodeGen/ARM/rbit.ll

@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=armv8-eabi %s -o - | FileCheck %s
+
+; CHECK-LABEL: rbit
+; CHECK: rbit r0, r0
+define i32 @rbit(i32 %t) {
+entry:
+  %rbit = call i32 @llvm.arm.rbit(i32 %t)
+  ret i32 %rbit
+}
+
+; CHECK-LABEL: rbit_constant
+; CHECK: mov r0, #0
+; CHECK: rbit r0, r0
+define i32 @rbit_constant() {
+entry:
+  %rbit.i = call i32 @llvm.arm.rbit(i32 0)
+  ret i32 %rbit.i
+}
+
+declare i32 @llvm.arm.rbit(i32)

diff --git a/test/CodeGen/ARM/sbfx.ll b/test/CodeGen/ARM/sbfx.ll
index 3c25edc..5b77c59 100644
--- a/test/CodeGen/ARM/sbfx.ll
+++ b/test/CodeGen/ARM/sbfx.ll

@@ -45,3 +45,21 @@
     %tmp2 = ashr i32 %tmp, 1
     ret i32 %tmp2
 }
+
+define signext i8 @f6(i32 %a) {
+; CHECK-LABEL: f6:
+; CHECK: sbfx r0, r0, #23, #8
+
+  %tmp = lshr i32 %a, 23
+  %res = trunc i32 %tmp to i8
+  ret i8 %res
+}
+
+define signext i8 @f7(i32 %a) {
+; CHECK-LABEL: f7:
+; CHECK-NOT: sbfx
+
+  %tmp = lshr i32 %a, 25
+  %res = trunc i32 %tmp to i8
+  ret i8 %res
+}

diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll
index e13504a..326eb51 100644
--- a/test/CodeGen/ARM/select_xform.ll
+++ b/test/CodeGen/ARM/select_xform.ll

@@ -222,3 +222,110 @@
   %add = add i32 %conv, %c
   ret i32 %add
 }
+
+; Do not fold the xor into the select
+define i32 @t15(i32 %p) {
+entry:
+; ARM-LABEL: t15:
+; ARM: mov     [[REG:r[0-9]+]], #2
+; ARM: cmp     r0, #8
+; ARM: movwgt  [[REG:r[0-9]+]], #1
+; ARM: eor     r0, [[REG:r[0-9]+]], #1
+
+; T2-LABEL: t15:
+; T2: movs    [[REG:r[0-9]+]], #2
+; T2: cmp     [[REG:r[0-9]+]], #8
+; T2: it      gt
+; T2: movgt   [[REG:r[0-9]+]], #1
+; T2: eor     r0, [[REG:r[0-9]+]], #1
+  %cmp = icmp sgt i32 %p, 8
+  %a = select i1 %cmp, i32 1, i32 2
+  %xor = xor i32 %a, 1
+  ret i32 %xor
+}
+
+define i32 @t16(i32 %x, i32 %y) {
+entry:
+; ARM-LABEL: t16:
+; ARM: and r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t16:
+; T2: ands r0, {{r[0-9]+}}
+  %cmp = icmp eq i32 %x, 0
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp eq i32 %y, 0
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %and = and i32 %cond2, %cond
+  ret i32 %and
+}
+
+define i32 @t17(i32 %x, i32 %y) #0 {
+entry:
+; ARM-LABEL: t17:
+; ARM: and r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t17:
+; T2: ands r0, {{r[0-9]+}}
+  %cmp = icmp eq i32 %x, -1
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp eq i32 %y, -1
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %and = and i32 %cond2, %cond
+  ret i32 %and
+}
+
+define i32 @t18(i32 %x, i32 %y) #0 {
+entry:
+; ARM-LABEL: t18:
+; ARM: and r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t18:
+; T2: and.w r0, {{r[0-9]+}}
+  %cmp = icmp ne i32 %x, 0
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp ne i32 %x, -1
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %and = and i32 %cond2, %cond
+  ret i32 %and
+}
+
+define i32 @t19(i32 %x, i32 %y) #0 {
+entry:
+; ARM-LABEL: t19:
+; ARM: orr r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t19:
+; T2: orrs r0, {{r[0-9]+}}
+  %cmp = icmp ne i32 %x, 0
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp ne i32 %y, 0
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %or = or i32 %cond2, %cond
+  ret i32 %or
+}
+
+define i32 @t20(i32 %x, i32 %y) #0 {
+entry:
+; ARM-LABEL: t20:
+; ARM: orr r0, {{r[0-9]+}}, {{r[0-9]+}}
+
+; T2-LABEL: t20:
+; T2: orrs r0, {{r[0-9]+}}
+  %cmp = icmp ne i32 %x, -1
+  %cond = select i1 %cmp, i32 5, i32 2
+  %cmp1 = icmp ne i32 %y, -1
+  %cond2 = select i1 %cmp1, i32 3, i32 4
+  %or = or i32 %cond2, %cond
+  ret i32 %or
+}
+
+define  <2 x i32> @t21(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK-LABEL: t21:
+; CHECK-NOT: eor
+; CHECK: mvn
+; CHECK-NOT: eor
+  %tst = icmp eq <2 x i32> %lhs, %rhs
+  %ntst = xor <2 x i1> %tst, <i1 1 , i1 undef>
+  %btst = sext <2 x i1> %ntst to <2 x i32>
+  ret <2 x i32> %btst
+}

diff --git a/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll b/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll
new file mode 100644
index 0000000..3cf2a08
--- /dev/null
+++ b/test/CodeGen/ARM/sjljehprepare-lower-empty-struct.ll

@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=armv7-apple-ios -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-ios -O1 < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-ios -O2 < %s | FileCheck %s
+; RUN: llc -mtriple=armv7-apple-ios -O3 < %s | FileCheck %s
+
+; SjLjEHPrepare shouldn't crash when lowering empty structs.
+;
+; Checks that when empty structs are used as arguments
+; nothing happens, i.e. there are no instructions between
+; __Unwind_SjLj_Register and the actual @bar invocation.
+
+
+define i8* @foo(i8 %a, {} %c) {
+entry:
+; CHECK: bl __Unwind_SjLj_Register
+; CHECK-NEXT: {{[A-Z][a-zA-Z0-9]*}}:
+; CHECK-NEXT: bl _bar
+  invoke void @bar ()
+    to label %unreachable unwind label %handler
+
+unreachable:
+  unreachable
+
+handler:
+  %tmp = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @baz to i8*)
+  cleanup
+  resume { i8*, i32 } undef
+}
+
+declare void @bar()
+declare i32 @baz(...)

diff --git a/test/CodeGen/ARM/smulw.ll b/test/CodeGen/ARM/smulw.ll
new file mode 100644
index 0000000..8653903
--- /dev/null
+++ b/test/CodeGen/ARM/smulw.ll

@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=arm--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb--none-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s
+
+; We cannot codegen the smulw[bt] or smlaw[bt] instructions for these functions,
+; as the top 16 bits of the result would differ
+
+define i32 @f1(i32 %a, i16 %b) {
+; CHECK-LABEL: f1:
+; CHECK: mul
+; CHECK: asr
+  %tmp1 = sext i16 %b to i32
+  %tmp2 = mul i32 %a, %tmp1
+  %tmp3 = ashr i32 %tmp2, 16
+  ret i32 %tmp3
+}
+
+define i32 @f2(i32 %a, i16 %b, i32 %c) {
+; CHECK-LABEL: f2:
+; CHECK: mul
+; CHECK: add{{.*}}, asr #16
+  %tmp1 = sext i16 %b to i32
+  %tmp2 = mul i32 %a, %tmp1
+  %tmp3 = ashr i32 %tmp2, 16
+  %tmp4 = add i32 %tmp3, %c
+  ret i32 %tmp4
+}

diff --git a/test/CodeGen/ARM/space-directive.ll b/test/CodeGen/ARM/space-directive.ll
new file mode 100644
index 0000000..55be199
--- /dev/null
+++ b/test/CodeGen/ARM/space-directive.ll

@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=armv7 -o - %s | FileCheck %s
+
+define i32 @test_space() minsize {
+; CHECK-LABEL: test_space:
+; CHECK: ldr {{r[0-9]+}}, [[CPENTRY:.?LCPI[0-9]+_[0-9]+]]
+; CHECK: b [[PAST_CP:.?LBB[0-9]+_[0-9]+]]
+
+; CHECK: [[CPENTRY]]:
+; CHECK-NEXT: 12345678
+
+; CHECK: [[PAST_CP]]:
+; CHECK: .zero 10000
+  %addr = inttoptr i32 12345678 to i32*
+  %val = load i32* %addr
+  call i32 @llvm.arm.space(i32 10000, i32 undef)
+  ret i32 %val
+}
+
+declare i32 @llvm.arm.space(i32, i32)

diff --git a/test/CodeGen/ARM/stack_guard_remat.ll b/test/CodeGen/ARM/stack_guard_remat.ll
new file mode 100644
index 0000000..b11ea92
--- /dev/null
+++ b/test/CodeGen/ARM/stack_guard_remat.ll

@@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=arm-apple-ios -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -mtriple=arm-apple-ios -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=NO-PIC -check-prefix=STATIC
+; RUN: llc < %s -mtriple=arm-apple-ios -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s  -check-prefix=NO-PIC -check-prefix=DYNAMIC-NO-PIC
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC-V7
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=STATIC-V7
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s -check-prefix=DYNAMIC-NO-PIC-V7
+
+;PIC:   foo2
+;PIC:   ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]]
+;PIC: [[LABEL1:LPC0_1]]:
+;PIC:   ldr [[R1:r[0-9]+]], [pc, [[R0]]]
+;PIC:   ldr [[R2:r[0-9]+]], {{\[}}[[R1]]{{\]}}
+;PIC:   ldr {{r[0-9]+}}, {{\[}}[[R2]]{{\]}}
+
+;PIC:      [[LABEL0]]:
+;PIC-NEXT:   .long L___stack_chk_guard$non_lazy_ptr-([[LABEL1]]+8)
+
+;NO-PIC: foo2
+;NO-PIC: ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]]
+;NO-PIC-NOT: LPC
+;NO-PIC: ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+;STATIC:      [[LABEL0]]:
+;STATIC-NEXT:   .long ___stack_chk_guard
+
+;DYNAMIC-NO-PIC:      [[LABEL0]]:
+;DYNAMIC-NO-PIC-NEXT:   .long L___stack_chk_guard$non_lazy_ptr
+
+;PIC-V7:   movw [[R0:r[0-9]+]], :lower16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0:LPC[0-9_]+]]+8))
+;PIC-V7:   movt [[R0]], :upper16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0]]+8))
+;PIC-V7: [[LABEL0]]:
+;PIC-V7:   ldr [[R0]], {{\[}}pc, [[R0]]{{\]}}
+;PIC-V7:   ldr [[R0]], {{\[}}[[R0]]{{\]}}
+
+;PIC-V7: L___stack_chk_guard$non_lazy_ptr:
+;PIC-V7:   .indirect_symbol        ___stack_chk_guard
+
+;STATIC-V7: movw [[R0:r[0-9]+]], :lower16:___stack_chk_guard
+;STATIC-V7: movt [[R0]], :upper16:___stack_chk_guard
+;STATIC-V7: ldr  [[R0]], {{\[}}[[R0]]{{\]}}
+
+;DYNAMIC-NO-PIC-V7: movw [[R0:r[0-9]+]], :lower16:L___stack_chk_guard$non_lazy_ptr
+;DYNAMIC-NO-PIC-V7: movt [[R0]], :upper16:L___stack_chk_guard$non_lazy_ptr
+;DYNAMIC-NO-PIC-V7: ldr  [[R0]], {{\[}}[[R0]]{{\]}}
+;DYNAMIC-NO-PIC-V7: ldr  [[R0]], {{\[}}[[R0]]{{\]}}
+
+;DYNAMIC-NO-PIC-V7: L___stack_chk_guard$non_lazy_ptr:
+;DYNAMIC-NO-PIC-V7:   .indirect_symbol        ___stack_chk_guard
+
+; Function Attrs: nounwind ssp
+define i32 @test_stack_guard_remat() #0 {
+  %a1 = alloca [256 x i32], align 4
+  %1 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %1)
+  %2 = getelementptr inbounds [256 x i32]* %a1, i32 0, i32 0
+  call void @foo3(i32* %2) #3
+  call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"()
+  call void @llvm.lifetime.end(i64 1024, i8* %1)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/ARM/swift-atomics.ll b/test/CodeGen/ARM/swift-atomics.ll
index 1d71815..8b100f1 100644
--- a/test/CodeGen/ARM/swift-atomics.ll
+++ b/test/CodeGen/ARM/swift-atomics.ll

@@ -8,6 +8,7 @@
 ; CHECK: dmb ishst
 ; CHECK: str
 
+; CHECK-STRICT-ATOMIC-LABEL: test_store_release:
 ; CHECK-STRICT-ATOMIC: dmb {{ish$}}
   store atomic i32 %v, i32* %p release, align 4
   ret void
@@ -24,7 +25,11 @@
 ; CHECK: ldr
 ; CHECK: dmb {{ish$}}
 
+; CHECK-STRICT-ATOMIC-LABEL: test_seq_cst:
 ; CHECK-STRICT-ATOMIC: dmb {{ish$}}
+; CHECK-STRICT-ATOMIC: str
+; CHECK-STRICT-ATOMIC: dmb {{ish$}}
+; CHECK-STRICT-ATOMIC: ldr
 ; CHECK-STRICT-ATOMIC: dmb {{ish$}}
 
   store atomic i32 %v, i32* %p seq_cst, align 4
@@ -39,6 +44,7 @@
 ; CHECK: ldr
 ; CHECK: dmb {{ish$}}
 
+; CHECK-STRICT-ATOMIC-LABEL: test_acq:
 ; CHECK-STRICT-ATOMIC: dmb {{ish$}}
   %val = load atomic i32* %addr acquire, align 4
   ret i32 %val

diff --git a/test/CodeGen/ARM/sxt_rot.ll b/test/CodeGen/ARM/sxt_rot.ll
index 5ddea2e..4162691 100644
--- a/test/CodeGen/ARM/sxt_rot.ll
+++ b/test/CodeGen/ARM/sxt_rot.ll

@@ -9,7 +9,8 @@
 
 define signext i8 @test1(i32 %A) {
 ; CHECK: test1
-; CHECK: sxtb r0, r0, ror #8
+; CHECK: lsr r0, r0, #8
+; CHECK: sxtb r0, r0
   %B = lshr i32 %A, 8
   %C = shl i32 %A, 24
   %D = or i32 %B, %C

diff --git a/test/CodeGen/ARM/tail-call.ll b/test/CodeGen/ARM/tail-call.ll
index 7711586..c3e7965 100644
--- a/test/CodeGen/ARM/tail-call.ll
+++ b/test/CodeGen/ARM/tail-call.ll

@@ -3,6 +3,7 @@
 ; RUN:   | FileCheck %s -check-prefix CHECK-NO-TAIL
 
 declare i32 @callee(i32 %i)
+declare extern_weak fastcc void @callee_weak()
 
 define i32 @caller(i32 %i) {
 entry:
@@ -19,3 +20,12 @@
 ; CHECK-NO-TAIL: pop {lr}
 ; CHECK-NO-TAIL: bx lr
 
+
+; Weakly-referenced extern functions cannot be tail-called, as AAELF does
+; not define the behaviour of branch instructions to undefined weak symbols.
+define fastcc void @caller_weak() {
+; CHECK-LABEL: caller_weak:
+; CHECK: bl callee_weak
+  tail call void @callee_weak()
+  ret void
+}

diff --git a/test/CodeGen/ARM/tail-merge-branch-weight.ll b/test/CodeGen/ARM/tail-merge-branch-weight.ll
new file mode 100644
index 0000000..9b5d566
--- /dev/null
+++ b/test/CodeGen/ARM/tail-merge-branch-weight.ll

@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=arm-apple-ios -print-machineinstrs=branch-folder \
+; RUN: %s -o /dev/null 2>&1 | FileCheck %s
+
+; Branch probability of tail-merged block:
+;
+; p(L0_L1 -> L2) = p(entry -> L0) * p(L0 -> L2) + p(entry -> L1) * p(L1 -> L2)
+;                = 0.2 * 0.6 + 0.8 * 0.3 = 0.36
+; p(L0_L1 -> L3) = p(entry -> L0) * p(L0 -> L3) + p(entry -> L1) * p(L1 -> L3)
+;                = 0.2 * 0.4 + 0.8 * 0.7 = 0.64
+
+; CHECK: # Machine code for function test0:
+; CHECK: Successors according to CFG: BB#{{[0-9]+}}(13) BB#{{[0-9]+}}(24)
+; CHECK: BB#{{[0-9]+}}:
+; CHECK: BB#{{[0-9]+}}:
+; CHECK: # End machine code for function test0.
+
+define i32 @test0(i32 %n, i32 %m, i32* nocapture %a, i32* nocapture %b) {
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %L0, label %L1, !prof !0
+
+L0:                                          ; preds = %entry
+  store i32 12, i32* %a, align 4
+  store i32 18, i32* %b, align 4
+  %cmp1 = icmp eq i32 %m, 8
+  br i1 %cmp1, label %L2, label %L3, !prof !1
+
+L1:                                          ; preds = %entry
+  store i32 14, i32* %a, align 4
+  store i32 18, i32* %b, align 4
+  %cmp3 = icmp eq i32 %m, 8
+  br i1 %cmp3, label %L2, label %L3, !prof !2
+
+L2:                                               ; preds = %L1, %L0
+  br label %L3
+
+L3:                                           ; preds = %L0, %L1, %L2
+  %retval.0 = phi i32 [ 100, %L2 ], [ 6, %L1 ], [ 6, %L0 ]
+  ret i32 %retval.0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 200, i32 800}
+!1 = metadata !{metadata !"branch_weights", i32 600, i32 400}
+!2 = metadata !{metadata !"branch_weights", i32 300, i32 700}

diff --git a/test/CodeGen/ARM/thumb1-varalloc.ll b/test/CodeGen/ARM/thumb1-varalloc.ll
index e07e8aa..8d5888d 100644
--- a/test/CodeGen/ARM/thumb1-varalloc.ll
+++ b/test/CodeGen/ARM/thumb1-varalloc.ll

@@ -1,13 +1,15 @@
 ; RUN: llc < %s -mtriple=thumbv6-apple-darwin | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv6-apple-darwin -regalloc=basic | FileCheck %s
-; rdar://8819685
+; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-apple-darwin
+; RUN: llvm-objdump -triple=thumbv6-apple-darwin -d %t | FileCheck %s
 
 @__bar = external hidden global i8*
 @__baz = external hidden global i8*
 
+; rdar://8819685
 define i8* @_foo() {
 entry:
-; CHECK: foo:
+; CHECK-LABEL: foo:
 
 	%size = alloca i32, align 4
 	%0 = load i8** @__bar, align 4
@@ -40,3 +42,102 @@
 
 declare noalias i8* @strdup(i8* nocapture) nounwind
 declare i32 @_called_func(i8*, i32*) nounwind
+
+; Variable ending up at unaligned offset from sp (i.e. not a multiple of 4)
+define void @test_local_var_addr() {
+; CHECK-LABEL: test_local_var_addr:
+
+  %addr1 = alloca i8
+  %addr2 = alloca i8
+
+; CHECK: mov r0, sp
+; CHECK: adds r0, #{{[0-9]+}}
+; CHECK: blx
+  call void @take_ptr(i8* %addr1)
+
+; CHECK: mov r0, sp
+; CHECK: adds r0, #{{[0-9]+}}
+; CHECK: blx
+  call void @take_ptr(i8* %addr2)
+
+  ret void
+}
+
+; Simple variable ending up *at* sp.
+define void @test_simple_var() {
+; CHECK-LABEL: test_simple_var:
+
+  %addr32 = alloca i32
+  %addr8 = bitcast i32* %addr32 to i8*
+
+; CHECK: mov r0, sp
+; CHECK-NOT: adds r0
+; CHECK: blx
+  call void @take_ptr(i8* %addr8)
+  ret void
+}
+
+; Simple variable ending up at aligned offset from sp.
+define void @test_local_var_addr_aligned() {
+; CHECK-LABEL: test_local_var_addr_aligned:
+
+  %addr1.32 = alloca i32
+  %addr1 = bitcast i32* %addr1.32 to i8*
+  %addr2.32 = alloca i32
+  %addr2 = bitcast i32* %addr2.32 to i8*
+
+; CHECK: add r0, sp, #{{[0-9]+}}
+; CHECK: blx
+  call void @take_ptr(i8* %addr1)
+
+; CHECK: mov r0, sp
+; CHECK-NOT: add r0
+; CHECK: blx
+  call void @take_ptr(i8* %addr2)
+
+  ret void
+}
+
+; Variable ending up at an offset from sp beyond the maximum tADDrSPi offset (1020), so an extra adds is needed.
+define void @test_local_var_big_offset() {
+; CHECK-LABEL: test_local_var_big_offset:
+  %addr1.32 = alloca i32, i32 257
+  %addr1 = bitcast i32* %addr1.32 to i8*
+  %addr2.32 = alloca i32, i32 257
+
+; CHECK: add [[RTMP:r[0-9]+]], sp, #1020
+; CHECK: adds [[RTMP]], #8
+; CHECK: blx
+  call void @take_ptr(i8* %addr1)
+
+  ret void
+}
+
+; Max range addressable with tADDrSPi
+define void @test_local_var_offset_1020() {
+; CHECK-LABEL: test_local_var_offset_1020:
+  %addr1 = alloca i8, i32 4
+  %addr2 = alloca i8, i32 1020
+
+; CHECK: add r0, sp, #1020
+; CHECK-NEXT: blx
+  call void @take_ptr(i8* %addr1)
+
+  ret void
+}
+
+; Max range addressable with tADDrSPi + tADDi8
+define void @test_local_var_offset_1275() {
+; CHECK-LABEL: test_local_var_offset_1275:
+  %addr1 = alloca i8, i32 1
+  %addr2 = alloca i8, i32 1275
+
+; CHECK: add r0, sp, #1020
+; CHECK: adds r0, #255
+; CHECK-NEXT: blx
+  call void @take_ptr(i8* %addr1)
+
+  ret void
+}
+
+declare void @take_ptr(i8*)

diff --git a/test/CodeGen/ARM/thumb1_return_sequence.ll b/test/CodeGen/ARM/thumb1_return_sequence.ll
new file mode 100644
index 0000000..318e6e4
--- /dev/null
+++ b/test/CodeGen/ARM/thumb1_return_sequence.ll

@@ -0,0 +1,217 @@
+; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-V4T
+; RUN: llc -mtriple=thumbv5t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-V5T
+
+; CHECK-V4T-LABEL: clobberframe
+; CHECK-V5T-LABEL: clobberframe
+define <4 x i32> @clobberframe() #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V4T:    sub sp,
+; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+
+  %b = alloca <4 x i32>, align 16
+  %a = alloca <4 x i32>, align 16
+  store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %b, align 16
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a, align 16
+  %0 = load <4 x i32>* %a, align 16
+  ret <4 x i32> %0
+
+; Epilogue
+; --------
+; CHECK-V4T:         add sp,
+; CHECK-V4T-NEXT:    pop {[[SAVED]]}
+; CHECK-V4T-NEXT:    mov r12, r3
+; CHECK-V4T-NEXT:    pop {r3}
+; CHECK-V4T-NEXT:    mov lr, r3
+; CHECK-V4T-NEXT:    mov r3, r12
+; CHECK-V4T:         bx  lr
+; CHECK-V5T:         pop {[[SAVED]], pc}
+}
+
+; CHECK-V4T-LABEL: clobbervariadicframe
+; CHECK-V5T-LABEL: clobbervariadicframe
+define <4 x i32> @clobbervariadicframe(i32 %i, ...) #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    sub sp,
+; CHECK-V4T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V5T:    sub sp,
+; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+
+  %b = alloca <4 x i32>, align 16
+  %a = alloca <4 x i32>, align 16
+  store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %b, align 16
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %a, align 16
+  %0 = load <4 x i32>* %a, align 16
+  call void @llvm.va_start(i8* null)
+  ret <4 x i32> %0
+
+; Epilogue
+; --------
+; CHECK-V4T:         pop {[[SAVED]]}
+; CHECK-V4T-NEXT:    mov r12, r3
+; CHECK-V4T-NEXT:    pop {r3}
+; CHECK-V4T-NEXT:    add sp,
+; CHECK-V4T-NEXT:    mov lr, r3
+; CHECK-V4T-NEXT:    mov r3, r12
+; CHECK-V4T:         bx  lr
+; CHECK-V5T:         add sp,
+; CHECK-V5T-NEXT:    pop {[[SAVED]]}
+; CHECK-V5T-NEXT:    mov r12, r3
+; CHECK-V5T-NEXT:    pop {r3}
+; CHECK-V5T-NEXT:    add sp,
+; CHECK-V5T-NEXT:    mov lr, r3
+; CHECK-V5T-NEXT:    mov r3, r12
+; CHECK-V5T-NEXT:    bx lr
+}
+
+; CHECK-V4T-LABEL: simpleframe
+; CHECK-V5T-LABEL: simpleframe
+define i32 @simpleframe() #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    push    {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V5T:    push    {[[SAVED:(r[4567](, )?)+]], lr}
+
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  %d = alloca i32, align 4
+  store i32 1, i32* %a, align 4
+  store i32 2, i32* %b, align 4
+  store i32 3, i32* %c, align 4
+  store i32 4, i32* %d, align 4
+  %0 = load i32* %a, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %a, align 4
+  %1 = load i32* %b, align 4
+  %inc1 = add nsw i32 %1, 1
+  store i32 %inc1, i32* %b, align 4
+  %2 = load i32* %c, align 4
+  %inc2 = add nsw i32 %2, 1
+  store i32 %inc2, i32* %c, align 4
+  %3 = load i32* %d, align 4
+  %inc3 = add nsw i32 %3, 1
+  store i32 %inc3, i32* %d, align 4
+  %4 = load i32* %a, align 4
+  %5 = load i32* %b, align 4
+  %add = add nsw i32 %4, %5
+  %6 = load i32* %c, align 4
+  %add4 = add nsw i32 %add, %6
+  %7 = load i32* %d, align 4
+  %add5 = add nsw i32 %add4, %7
+  ret i32 %add5
+
+; Epilogue
+; --------
+; CHECK-V4T:    pop {[[SAVED]]}
+; CHECK-V4T:    pop {r3}
+; CHECK-V4T:    bx r3
+; CHECK-V5T:    pop {[[SAVED]], pc}
+}
+
+; CHECK-V4T-LABEL: simplevariadicframe
+; CHECK-V5T-LABEL: simplevariadicframe
+define i32 @simplevariadicframe(i32 %i, ...) #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    sub sp,
+; CHECK-V4T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V4T:    sub sp,
+; CHECK-V5T:    sub sp,
+; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V5T:    sub sp,
+
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  %d = alloca i32, align 4
+  store i32 1, i32* %a, align 4
+  store i32 2, i32* %b, align 4
+  store i32 3, i32* %c, align 4
+  store i32 4, i32* %d, align 4
+  %0 = load i32* %a, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %a, align 4
+  %1 = load i32* %b, align 4
+  %inc1 = add nsw i32 %1, 1
+  store i32 %inc1, i32* %b, align 4
+  %2 = load i32* %c, align 4
+  %inc2 = add nsw i32 %2, 1
+  store i32 %inc2, i32* %c, align 4
+  %3 = load i32* %d, align 4
+  %inc3 = add nsw i32 %3, 1
+  store i32 %inc3, i32* %d, align 4
+  %4 = load i32* %a, align 4
+  %5 = load i32* %b, align 4
+  %add = add nsw i32 %4, %5
+  %6 = load i32* %c, align 4
+  %add4 = add nsw i32 %add, %6
+  %7 = load i32* %d, align 4
+  %add5 = add nsw i32 %add4, %7
+  %add6 = add nsw i32 %add5, %i
+  call void @llvm.va_start(i8* null)
+  ret i32 %add6
+
+; Epilogue
+; --------
+; CHECK-V4T:         add sp,
+; CHECK-V4T-NEXT:    pop {[[SAVED]]}
+; CHECK-V4T-NEXT:    pop {r3}
+; CHECK-V4T-NEXT:    add sp,
+; CHECK-V4T-NEXT:    bx r3
+; CHECK-V5T:         add sp,
+; CHECK-V5T-NEXT:    pop {[[SAVED]]}
+; CHECK-V5T-NEXT:    pop {r3}
+; CHECK-V5T-NEXT:    add sp,
+; CHECK-V5T-NEXT:    bx r3
+}
+
+; CHECK-V4T-LABEL: noframe
+; CHECK-V5T-LABEL: noframe
+define i32 @noframe() #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T-NOT: push
+; CHECK-V5T-NOT: push
+    ret i32 0;
+; Epilogue
+; --------
+; CHECK-V4T-NOT: pop
+; CHECK-V5T-NOT: pop
+; CHECK-V4T:    bx  lr
+; CHECK-V5T:    bx  lr
+}
+
+; CHECK-V4T-LABEL: novariadicframe
+; CHECK-V5T-LABEL: novariadicframe
+define i32 @novariadicframe(i32 %i, ...) #0 {
+entry:
+; Prologue
+; --------
+; CHECK-V4T:    sub sp,
+; CHECK-V4T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+; CHECK-V5T:    sub sp,
+; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
+
+  call void @llvm.va_start(i8* null)
+  ret i32 %i;
+; Epilogue
+; --------
+; CHECK-V4T:         pop {[[SAVED]]}
+; CHECK-V4T-NEXT:    pop {r3}
+; CHECK-V4T-NEXT:    add sp,
+; CHECK-V4T-NEXT:    bx r3
+; CHECK-V5T:         pop {[[SAVED]]}
+; CHECK-V5T-NEXT:    pop {r3}
+; CHECK-V5T-NEXT:    add sp,
+; CHECK-V5T-NEXT:    bx r3
+}
+
+declare void @llvm.va_start(i8*) nounwind

diff --git a/test/CodeGen/ARM/thumb2-it-block.ll b/test/CodeGen/ARM/thumb2-it-block.ll
index c5e699c..2675a73 100644
--- a/test/CodeGen/ARM/thumb2-it-block.ll
+++ b/test/CodeGen/ARM/thumb2-it-block.ll

@@ -1,15 +1,9 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck -check-prefix CHECK-V7 %s
+; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s -check-prefix CHECK-V8
 ; PR11107
 
 define i32 @test(i32 %a, i32 %b) {
 entry:
-; CHECK:        cmp
-; CHECK-NEXT:   it    mi
-; CHECK-NEXT:   rsb{{s?}}mi
-; CHECK-NEXT:   cmp
-; CHECK-NEXT:   it    mi
-; CHECK-NEXT:   rsb{{s?}}mi
  %cmp1 = icmp slt i32 %a, 0
  %sub1 = sub nsw i32 0, %a
  %abs1 = select i1 %cmp1, i32 %sub1, i32 %a
@@ -19,3 +13,18 @@
  %add = add nsw i32 %abs1, %abs2
  ret i32 %add
 }
+
+; CHECK-V7:        cmp
+; CHECK-V7-NEXT:   it    mi
+; CHECK-V7-NEXT:   rsbmi
+; CHECK-V7-NEXT:   cmp
+; CHECK-V7-NEXT:   it    mi
+; CHECK-V7-NEXT:   rsbmi
+
+; CHECK-V8:        cmp
+; CHECK-V8-NEXT:   bpl
+; CHECK-V8:        rsbs
+; CHECK-V8:        cmp
+; CHECK-V8-NEXT:   bpl
+; CHECK-V8:        rsbs
+

diff --git a/test/CodeGen/ARM/thumb2-size-opt.ll b/test/CodeGen/ARM/thumb2-size-opt.ll
new file mode 100644
index 0000000..0084a45
--- /dev/null
+++ b/test/CodeGen/ARM/thumb2-size-opt.ll

@@ -0,0 +1,84 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf -o - -show-mc-encoding -t2-reduce-limit=0 -t2-reduce-limit2=0 %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf -o - -show-mc-encoding %s | FileCheck %s --check-prefix=CHECK-OPT
+
+define i32 @and(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: and:
+; CHECK: and.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: ands r{{[0-7]}}, r{{[0-7]}} @ encoding: [{{0x..,0x..}}]
+entry:
+  %and = and i32 %b, %a
+  ret i32 %and
+}
+
+define i32 @asr-imm(i32 %a) nounwind readnone {
+; CHECK-LABEL: "asr-imm":
+; CHECK: asr.w r{{[0-9]+}}, r{{[0-9]+}}, #13 @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: asrs r{{[0-7]}}, r{{[0-7]}}, #13 @ encoding: [{{0x..,0x..}}]
+entry:
+  %shr = ashr i32 %a, 13
+  ret i32 %shr
+}
+
+define i32 @asr-reg(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: "asr-reg":
+; CHECK: asr.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: asrs r{{[0-7]}}, r{{[0-7]}} @ encoding: [{{0x..,0x..}}]
+entry:
+  %shr = ashr i32 %a, %b
+  ret i32 %shr
+}
+
+define i32 @bic(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: bic:
+; CHECK: bic.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: bics r{{[0-7]}}, r{{[0-7]}} @ encoding: [{{0x..,0x..}}]
+entry:
+  %neg = xor i32 %b, -1
+  %and = and i32 %neg, %a
+  ret i32 %and
+}
+
+define i32 @eor(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: eor:
+; CHECK: eor.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: eors r{{[0-7]}}, r{{[0-7]}} @ encoding: [{{0x..,0x..}}]
+entry:
+  %eor = xor i32 %a, %b
+  ret i32 %eor
+}
+
+define i32 @lsl-imm(i32 %a) nounwind readnone {
+; CHECK-LABEL: "lsl-imm":
+; CHECK: lsl.w r{{[0-9]+}}, r{{[0-9]+}}, #13 @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: lsls r{{[0-7]}}, r{{[0-7]}}, #13  @ encoding: [{{0x..,0x..}}]
+entry:
+  %shl = shl i32 %a, 13
+  ret i32 %shl
+}
+
+define i32 @lsl-reg(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: "lsl-reg":
+; CHECK: lsl.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: lsls r{{[0-7]}}, r{{[0-7]}}  @ encoding: [{{0x..,0x..}}]
+entry:
+  %shl = shl i32 %a, %b
+  ret i32 %shl
+}
+
+define i32 @lsr-imm(i32 %a) nounwind readnone {
+; CHECK-LABEL: "lsr-imm":
+; CHECK: lsr.w r{{[0-9]+}}, r{{[0-9]+}}, #13 @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: lsrs r{{[0-7]}}, r{{[0-7]}}, #13  @ encoding: [{{0x..,0x..}}]
+entry:
+  %shr = lshr i32 %a, 13
+  ret i32 %shr
+}
+
+define i32 @lsr-reg(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: "lsr-reg":
+; CHECK: lsr.w r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}} @ encoding: [{{0x..,0x..,0x..,0x..}}]
+; CHECK-OPT: lsrs r{{[0-7]}}, r{{[0-7]}}  @ encoding: [{{0x..,0x..}}]
+entry:
+  %shr = lshr i32 %a, %b
+  ret i32 %shr
+}

diff --git a/test/CodeGen/ARM/vararg_no_start.ll b/test/CodeGen/ARM/vararg_no_start.ll
new file mode 100644
index 0000000..f9c8c1b
--- /dev/null
+++ b/test/CodeGen/ARM/vararg_no_start.ll

@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=arm-darwin < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=arm-darwin < %s | FileCheck %s
+
+define void @foo(i8*, ...) {
+  ret void
+}
+; CHECK-LABEL: {{^_?}}foo:
+; CHECK-NOT: str
+; CHECK: {{bx lr|mov pc, lr}}
+declare void @llvm.va_start(i8*) nounwind

diff --git a/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll b/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
index 19d6cbe..148a79d 100644
--- a/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll
+++ b/test/CodeGen/ARM/varargs-spill-stack-align-nacl.ll

@@ -22,9 +22,9 @@
 ; Reserve space for the varargs save area.  This currently reserves
 ; more than enough (16 bytes rather than the 12 bytes needed).
 ; CHECK: sub sp, sp, #16
-; CHECK: push {lr}
+; CHECK: push {r11, lr}
 ; Align the stack pointer to a multiple of 16.
-; CHECK: sub sp, sp, #12
+; CHECK: sub sp, sp, #8
 ; Calculate the address of the varargs save area and save varargs
 ; arguments into it.
 ; CHECK-NEXT: add r0, sp, #20

diff --git a/test/CodeGen/ARM/vargs_align.ll b/test/CodeGen/ARM/vargs_align.ll
index e390cf0..3abb57e 100644
--- a/test/CodeGen/ARM/vargs_align.ll
+++ b/test/CodeGen/ARM/vargs_align.ll

@@ -10,6 +10,7 @@
 	store i32 0, i32* %tmp
 	%tmp1 = load i32* %tmp		; <i32> [#uses=1]
 	store i32 %tmp1, i32* %retval
+	call void @llvm.va_start(i8* null)
 	br label %return
 
 return:		; preds = %entry
@@ -20,3 +21,5 @@
 ; OABI: add sp, sp, #12
 ; OABI: add sp, sp, #12
 }
+
+declare void @llvm.va_start(i8*) nounwind

diff --git a/test/CodeGen/ARM/vector-promotion.ll b/test/CodeGen/ARM/vector-promotion.ll
new file mode 100644
index 0000000..42ceb60
--- /dev/null
+++ b/test/CodeGen/ARM/vector-promotion.ll

@@ -0,0 +1,403 @@
+; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
+; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
+; RUN: llc -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s
+
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 undef, i32 1>
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
+; IR-BOTH-NEXT: ret
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM-NEXT: vorr.i32 [[LOAD]], #0x1
+; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1:32]
+; ASM-NEXT: bx
+define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @unsupportedInstructionForPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; IR-BOTH-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2
+; IR-BOTH-NEXT: store i1 [[CMP]], i1* %dest
+; IR-BOTH-NEXT: ret
+;
+; ASM-LABEL: unsupportedInstructionForPromotion:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out = icmp eq i32 %extract, %in2
+  store i1 %out, i1* %dest, align 4
+  ret void
+}
+
+
+; IR-BOTH-LABEL: @unsupportedChainInDifferentBBs
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; IR-BOTH-NEXT: br i1 %bool, label %bb2, label %end
+; BB2
+; IR-BOTH: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest, align 4
+; IR-BOTH: ret
+;
+; ASM-LABEL: unsupportedChainInDifferentBBs:
+; ASM: vldrne [[LOAD:d[0-9]+]], [r0]
+; ASM: vmovne.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) {
+bb1:
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  br i1 %bool, label %bb2, label %end
+bb2: 
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  br label %end
+end:
+  ret void
+}
+
+; IR-LABEL: @chainOfInstructionsToPromote
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR4:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR3]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR5:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR4]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR6:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR5]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[VECTOR_OR7:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR6]], <i32 1, i32 undef>
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR7]], i32 0
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
+; IR-BOTH-NEXT: ret
+;
+; ASM-LABEL: chainOfInstructionsToPromote:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM-NOT: vmov.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out1 = or i32 %extract, 1
+  %out2 = or i32 %out1, 1
+  %out3 = or i32 %out2, 1
+  %out4 = or i32 %out3, 1
+  %out5 = or i32 %out4, 1
+  %out6 = or i32 %out5, 1
+  %out7 = or i32 %out6, 1
+  store i32 %out7, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @unsupportedMultiUses
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-BOTH-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest
+; IR-BOTH-NEXT: ret i32 [[OR]]
+;
+; ASM-LABEL: unsupportedMultiUses:
+; ASM: vldr [[LOAD:d[0-9]+]], [r0]
+; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
+; ASM: bx
+define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret i32 %out
+}
+
+; Check that we promote we a splat constant when this is a division.
+; The NORMAL mode does not promote anything as divisions are not legal.
+; IR-BOTH-LABEL: @udivCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 [[EXTRACT]], 7
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <2 x i32> [[LOAD]], <i32 7, i32 7>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret
+define void @udivCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = udiv i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @uremCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = urem i32 [[EXTRACT]], 7
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = urem <2 x i32> [[LOAD]], <i32 7, i32 7>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret 
+define void @uremCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = urem i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @sdivCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sdiv i32 [[EXTRACT]], 7
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <2 x i32> [[LOAD]], <i32 7, i32 7>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret 
+define void @sdivCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = sdiv i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @sremCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 [[EXTRACT]], 7
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = srem <2 x i32> [[LOAD]], <i32 7, i32 7>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret 
+define void @sremCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = srem i32 %extract, 7
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @fdivCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fdiv <2 x float> [[LOAD]], <float 7.000000e+00, float 7.000000e+00>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @fdivCase(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8   
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = fdiv float %extract, 7.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @fremCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem <2 x float> [[LOAD]], <float 7.000000e+00, float 7.000000e+00>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @fremCase(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8   
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = frem float %extract, 7.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+; IR-BOTH-LABEL: @undefDivCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 7, [[EXTRACT]]
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret
+define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = udiv i32 7, %extract
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+
+; Check that we do not promote when we may introduce undefined behavior
+; like division by zero.
+; IR-BOTH-LABEL: @undefRemCase
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
+; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 7, [[EXTRACT]]
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret
+define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = srem i32 7, %extract
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; IR-BOTH-LABEL: @undefConstantFRemCaseWithFastMath
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> [[LOAD]], <float undef, float 7.000000e+00>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8   
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = frem nnan float %extract, 7.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we use an undef mask for undefined behavior if the fast-math
+; flag is set.
+; IR-BOTH-LABEL: @undefVectorFRemCaseWithFastMath
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]]
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> <float undef, float 7.000000e+00>, [[LOAD]]
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8   
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = frem nnan float 7.0, %extract
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we are able to promote floating point values.
+; This requires the STRESS mode, as floating point values are
+; not promoted on armv7.
+; IR-BOTH-LABEL: @simpleOneInstructionPromotionFloat
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1
+; Scalar version: 
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0
+; Vector version:
+; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <2 x float> [[LOAD]], <float undef, float 1.000000e+00>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
+;
+; IR-BOTH-NEXT: store float [[RES]], float* %dest
+; IR-BOTH-NEXT: ret
+define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) {
+  %in1 = load <2 x float>* %addr1, align 8
+  %extract = extractelement <2 x float> %in1, i32 1
+  %out = fadd float %extract, 1.0
+  store float %out, float* %dest, align 4
+  ret void
+}
+
+; Check that we correctly use a splat constant when we cannot
+; determine at compile time the index of the extract.
+; This requires the STRESS mode, as variable indices are expensive
+; to lower.
+; IR-BOTH-LABEL: @simpleOneInstructionPromotionVariableIdx
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 %idx
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; Vector version:
+; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 1>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[OR]], i32 %idx
+;
+; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
+; IR-BOTH-NEXT: ret
+define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) {
+  %in1 = load <2 x i32>* %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 %idx
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 4
+  ret void
+}
+
+; Check a vector with more than 2 elements.
+; This requires the STRESS mode because currently 'or v8i8' is not marked
+; as legal or custom, although the actual assembly would be better if we
+; were promoting it.
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion8x8
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <8 x i8>* %addr1
+; Scalar version:  
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[LOAD]], i32 1
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i8 [[EXTRACT]], 1
+; Vector version:  
+; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], <i8 undef, i8 1, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[OR]], i32 1
+;
+; IR-BOTH-NEXT: store i8 [[RES]], i8* %dest
+; IR-BOTH-NEXT: ret
+define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) {
+  %in1 = load <8 x i8>* %addr1, align 8
+  %extract = extractelement <8 x i8> %in1, i32 1
+  %out = or i8 %extract, 1
+  store i8 %out, i8* %dest, align 4
+  ret void
+}
+
+; Check that we optimized the sequence correctly when it can be
+; lowered on a Q register.
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion4x32
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>* %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], <i32 undef, i32 1, i32 undef, i32 undef>
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR]], i32 1
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
+; IR-BOTH-NEXT: ret
+;
+; Make sure we got rid of any expensive vmov.32 instructions.
+; ASM-LABEL: simpleOneInstructionPromotion4x32:
+; ASM: vld1.64 {[[LOAD:d[0-9]+]], d{{[0-9]+}}}, [r0]
+; The Q register used here must be [[LOAD]] / 2, but we cannot express that.
+; ASM-NEXT: vorr.i32 q{{[0-9]+}}, #0x1
+; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1]
+; ASM-NEXT: bx
+define void @simpleOneInstructionPromotion4x32(<4 x i32>* %addr1, i32* %dest) {
+  %in1 = load <4 x i32>* %addr1, align 8
+  %extract = extractelement <4 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, i32* %dest, align 1
+  ret void
+}

diff --git a/test/CodeGen/ARM/vector-spilling.ll b/test/CodeGen/ARM/vector-spilling.ll
new file mode 100644
index 0000000..746c6df
--- /dev/null
+++ b/test/CodeGen/ARM/vector-spilling.ll

@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
+
+; This test will generate spills/fills using vldmia instructions that access 24 bytes of memory.
+; Check that we don't crash when we generate these instructions on Cortex-A9.
+
+; CHECK: test:
+; CHECK: vstmia
+; CHECK: vldmia
+define void @test(<8 x i64>* %src) #0 {
+entry:
+  %0 = getelementptr inbounds <8 x i64>* %src, i32 0
+  %1 = load <8 x i64>* %0, align 8
+
+  %2 = getelementptr inbounds <8 x i64>* %src, i32 1
+  %3 = load <8 x i64>* %2, align 8
+
+  %4 = getelementptr inbounds <8 x i64>* %src, i32 2
+  %5 = load <8 x i64>* %4, align 8
+
+  %6 = getelementptr inbounds <8 x i64>* %src, i32 3
+  %7 = load <8 x i64>* %6, align 8
+
+  %8 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %9 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+
+  tail call void(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)* @foo(<8 x i64> %1, <8 x i64> %3, <8 x i64> %5, <8 x i64> %7, <8 x i64> %8, <8 x i64> %9)
+  ret void
+}
+
+declare void @foo(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)
+
+attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/ARM/vfp-regs-dwarf.ll b/test/CodeGen/ARM/vfp-regs-dwarf.ll
index 4976729..f83adf9 100644
--- a/test/CodeGen/ARM/vfp-regs-dwarf.ll
+++ b/test/CodeGen/ARM/vfp-regs-dwarf.ll

@@ -31,14 +31,14 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/Users/tim/llvm/build/tmp.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/tim/llvm/build/tmp.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"tmp.c", metadata !"/Users/tim/llvm/build"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @stack_offsets, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/tim/llvm/build/tmp.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @stack_offsets, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/Users/tim/llvm/build/tmp.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 

diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll
index f2e5eb9..e5e7bc0 100644
--- a/test/CodeGen/ARM/vldm-sched-a9.ll
+++ b/test/CodeGen/ARM/vldm-sched-a9.ll

@@ -2,8 +2,8 @@
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
 
-; This test will generate spills/fills using vldmia instructions that access 64 bytes of memory.
-; Check that we don't crash when we generate these instructions on Cortex-A9.
+; This test used to test vector spilling using vstmia/vldmia instructions, but
+; the changes for PR18825 prevent that spilling.
 
 ; CHECK: test:
 ; CHECK: vstmia

diff --git a/test/CodeGen/ARM/vminmaxnm.ll b/test/CodeGen/ARM/vminmaxnm.ll
index f6ce64c..39289a0 100644
--- a/test/CodeGen/ARM/vminmaxnm.ll
+++ b/test/CodeGen/ARM/vminmaxnm.ll

@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-FAST
 
 define <4 x float> @vmaxnmq(<4 x float>* %A, <4 x float>* %B) nounwind {
-; CHECK: vmaxnmq
+; CHECK-LABEL: vmaxnmq:
 ; CHECK: vmaxnm.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
   %tmp1 = load <4 x float>* %A
   %tmp2 = load <4 x float>* %B
@@ -11,7 +11,7 @@
 }
 
 define <2 x float> @vmaxnmd(<2 x float>* %A, <2 x float>* %B) nounwind {
-; CHECK: vmaxnmd
+; CHECK-LABEL: vmaxnmd:
 ; CHECK: vmaxnm.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
   %tmp1 = load <2 x float>* %A
   %tmp2 = load <2 x float>* %B
@@ -20,7 +20,7 @@
 }
 
 define <4 x float> @vminnmq(<4 x float>* %A, <4 x float>* %B) nounwind {
-; CHECK: vminnmq
+; CHECK-LABEL: vminnmq:
 ; CHECK: vminnm.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
   %tmp1 = load <4 x float>* %A
   %tmp2 = load <4 x float>* %B
@@ -29,7 +29,7 @@
 }
 
 define <2 x float> @vminnmd(<2 x float>* %A, <2 x float>* %B) nounwind {
-; CHECK: vminnmd
+; CHECK-LABEL: vminnmd:
 ; CHECK: vminnm.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
   %tmp1 = load <2 x float>* %A
   %tmp2 = load <2 x float>* %B
@@ -38,49 +38,93 @@
 }
 
 define float @fp-armv8_vminnm_o(float %a, float %b) {
-; CHECK-FAST: fp-armv8_vminnm_o
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_o":
 ; CHECK-FAST-NOT: vcmp
 ; CHECK-FAST: vminnm.f32
-; CHECK: fp-armv8_vminnm_o
+; CHECK-LABEL: "fp-armv8_vminnm_o":
 ; CHECK-NOT: vminnm.f32
   %cmp = fcmp olt float %a, %b
   %cond = select i1 %cmp, float %a, float %b
   ret float %cond
 }
 
-define float @fp-armv8_vminnm_u(float %a, float %b) {
-; CHECK-FAST: fp-armv8_vminnm_u
+define float @fp-armv8_vminnm_o_rev(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_o_rev":
 ; CHECK-FAST-NOT: vcmp
 ; CHECK-FAST: vminnm.f32
-; CHECK: fp-armv8_vminnm_u
+; CHECK-LABEL: "fp-armv8_vminnm_o_rev":
+; CHECK-NOT: vminnm.f32
+  %cmp = fcmp ogt float %a, %b
+  %cond = select i1 %cmp, float %b, float %a
+  ret float %cond
+}
+
+define float @fp-armv8_vminnm_u(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_u":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f32
+; CHECK-LABEL: "fp-armv8_vminnm_u":
 ; CHECK-NOT: vminnm.f32
   %cmp = fcmp ult float %a, %b
   %cond = select i1 %cmp, float %a, float %b
   ret float %cond
 }
 
+define float @fp-armv8_vminnm_u_rev(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vminnm_u_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f32
+; CHECK-LABEL: "fp-armv8_vminnm_u_rev":
+; CHECK-NOT: vminnm.f32
+  %cmp = fcmp ugt float %a, %b
+  %cond = select i1 %cmp, float %b, float %a
+  ret float %cond
+}
+
 define float @fp-armv8_vmaxnm_o(float %a, float %b) {
-; CHECK-FAST: fp-armv8_vmaxnm_o
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_o":
 ; CHECK-FAST-NOT: vcmp
 ; CHECK-FAST: vmaxnm.f32
-; CHECK: fp-armv8_vmaxnm_o
+; CHECK-LABEL: "fp-armv8_vmaxnm_o":
 ; CHECK-NOT: vmaxnm.f32
   %cmp = fcmp ogt float %a, %b
   %cond = select i1 %cmp, float %a, float %b
   ret float %cond
 }
 
-define float @fp-armv8_vmaxnm_u(float %a, float %b) {
-; CHECK-FAST: fp-armv8_vmaxnm_u
+define float @fp-armv8_vmaxnm_o_rev(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_o_rev":
 ; CHECK-FAST-NOT: vcmp
 ; CHECK-FAST: vmaxnm.f32
-; CHECK: fp-armv8_vmaxnm_u
+; CHECK-LABEL: "fp-armv8_vmaxnm_o_rev":
+; CHECK-NOT: vmaxnm.f32
+  %cmp = fcmp olt float %a, %b
+  %cond = select i1 %cmp, float %b, float %a
+  ret float %cond
+}
+
+define float @fp-armv8_vmaxnm_u(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_u":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK-LABEL: "fp-armv8_vmaxnm_u":
 ; CHECK-NOT: vmaxnm.f32
   %cmp = fcmp ugt float %a, %b
   %cond = select i1 %cmp, float %a, float %b
   ret float %cond
 }
 
+define float @fp-armv8_vmaxnm_u_rev(float %a, float %b) {
+; CHECK-FAST-LABEL: "fp-armv8_vmaxnm_u_rev":
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK-LABEL: "fp-armv8_vmaxnm_u_rev":
+; CHECK-NOT: vmaxnm.f32
+  %cmp = fcmp ult float %a, %b
+  %cond = select i1 %cmp, float %b, float %a
+  ret float %cond
+}
+
 
 declare <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
 declare <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone

diff --git a/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll
new file mode 100644
index 0000000..7ecd252
--- /dev/null
+++ b/test/CodeGen/ARM/wrong-t2stmia-size-opt.ll

@@ -0,0 +1,20 @@
+; RUN: llc -mcpu=cortex-a9 -O1 -filetype=obj %s -o - | llvm-objdump -arch thumb -mcpu=cortex-a9 -d - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7--linux-gnueabi"
+
+declare i8* @llvm.returnaddress(i32)
+
+define i32* @wrong-t2stmia-size-reduction(i32* %addr, i32 %val0) minsize {
+  store i32 %val0, i32* %addr
+  %addr1 = getelementptr i32* %addr, i32 1
+  %lr = call i8* @llvm.returnaddress(i32 0)
+  %lr32 = ptrtoint i8* %lr to i32
+  store i32 %lr32, i32* %addr1
+  %addr2 = getelementptr i32* %addr1, i32 1
+  ret i32* %addr2
+}
+
+; Check that stm writes two registers.  The bug caused one of the registers (LR,
+; which is invalid for the Thumb1 form of the STMIA instruction) to be dropped.
+; CHECK: stm{{[^,]*}}, {{{.*,.*}}}

diff --git a/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll b/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
index 3f17ce1..eaaeb37 100644
--- a/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
+++ b/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll

@@ -30,20 +30,20 @@
 	%"struct.qdesigner_internal::GridLayout" = type { %"struct.qdesigner_internal::Layout", %"struct.QPair<int,int>", %"struct.qdesigner_internal::Grid"* }
 	%"struct.qdesigner_internal::Layout" = type { %struct.QObject, %"struct.QList<QAbstractExtensionFactory*>", %struct.QWidget*, %"struct.QHash<QString,QList<QAbstractExtensionFactory*> >", %struct.QWidget*, %struct.QDesignerFormWindowInterface*, i8, %"struct.QPair<int,int>", %struct.QRect, i8 }
 
-@_ZL20__gthrw_pthread_oncePiPFvvE = alias weak i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
-@_ZL27__gthrw_pthread_getspecificj = alias weak i8* (i32)* @pthread_getspecific		; <i8* (i32)*> [#uses=0]
-@_ZL27__gthrw_pthread_setspecificjPKv = alias weak i32 (i32, i8*)* @pthread_setspecific		; <i32 (i32, i8*)*> [#uses=0]
-@_ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_ = alias weak i32 (i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create		; <i32 (i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)*> [#uses=0]
-@_ZL22__gthrw_pthread_cancelm = alias weak i32 (i64)* @pthread_cancel		; <i32 (i64)*> [#uses=0]
-@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = alias weak i32 (%struct.pthread_mutex_t*)* @pthread_mutex_lock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
-@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = alias weak i32 (%struct.pthread_mutex_t*)* @pthread_mutex_trylock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
-@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = alias weak i32 (%struct.pthread_mutex_t*)* @pthread_mutex_unlock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
-@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = alias weak i32 (%struct.pthread_mutex_t*, %struct.Alignment*)* @pthread_mutex_init		; <i32 (%struct.pthread_mutex_t*, %struct.Alignment*)*> [#uses=0]
-@_ZL26__gthrw_pthread_key_createPjPFvPvE = alias weak i32 (i32*, void (i8*)*)* @pthread_key_create		; <i32 (i32*, void (i8*)*)*> [#uses=0]
-@_ZL26__gthrw_pthread_key_deletej = alias weak i32 (i32)* @pthread_key_delete		; <i32 (i32)*> [#uses=0]
-@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = alias weak i32 (%struct.Alignment*)* @pthread_mutexattr_init		; <i32 (%struct.Alignment*)*> [#uses=0]
-@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = alias weak i32 (%struct.Alignment*, i32)* @pthread_mutexattr_settype		; <i32 (%struct.Alignment*, i32)*> [#uses=0]
-@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = alias weak i32 (%struct.Alignment*)* @pthread_mutexattr_destroy		; <i32 (%struct.Alignment*)*> [#uses=0]
+@_ZL20__gthrw_pthread_oncePiPFvvE = weak alias i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
+@_ZL27__gthrw_pthread_getspecificj = weak alias i8* (i32)* @pthread_getspecific		; <i8* (i32)*> [#uses=0]
+@_ZL27__gthrw_pthread_setspecificjPKv = weak alias i32 (i32, i8*)* @pthread_setspecific		; <i32 (i32, i8*)*> [#uses=0]
+@_ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_ = weak alias i32 (i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create		; <i32 (i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)*> [#uses=0]
+@_ZL22__gthrw_pthread_cancelm = weak alias i32 (i64)* @pthread_cancel		; <i32 (i64)*> [#uses=0]
+@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = weak alias i32 (%struct.pthread_mutex_t*)* @pthread_mutex_lock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
+@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = weak alias i32 (%struct.pthread_mutex_t*)* @pthread_mutex_trylock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
+@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = weak alias i32 (%struct.pthread_mutex_t*)* @pthread_mutex_unlock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
+@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = weak alias i32 (%struct.pthread_mutex_t*, %struct.Alignment*)* @pthread_mutex_init		; <i32 (%struct.pthread_mutex_t*, %struct.Alignment*)*> [#uses=0]
+@_ZL26__gthrw_pthread_key_createPjPFvPvE = weak alias i32 (i32*, void (i8*)*)* @pthread_key_create		; <i32 (i32*, void (i8*)*)*> [#uses=0]
+@_ZL26__gthrw_pthread_key_deletej = weak alias i32 (i32)* @pthread_key_delete		; <i32 (i32)*> [#uses=0]
+@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = weak alias i32 (%struct.Alignment*)* @pthread_mutexattr_init		; <i32 (%struct.Alignment*)*> [#uses=0]
+@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = weak alias i32 (%struct.Alignment*, i32)* @pthread_mutexattr_settype		; <i32 (%struct.Alignment*, i32)*> [#uses=0]
+@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = weak alias i32 (%struct.Alignment*)* @pthread_mutexattr_destroy		; <i32 (%struct.Alignment*)*> [#uses=0]
 
 define void @_ZN18qdesigner_internal10GridLayout9buildGridEv(%"struct.qdesigner_internal::GridLayout"* %this) nounwind {
 entry:

diff --git a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll
index da26504..cd446d5 100644
--- a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll
+++ b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll

@@ -3,9 +3,6 @@
 ; the uses of a copy to a physical register without ignoring non-data
 ; dependence, PR10220.
 
-; The ARM backend can't handle i256 math at the moment.
-; XFAIL: arm
-
 define void @f(i256* nocapture %a, i256* nocapture %b, i256* nocapture %cc, i256* nocapture %dd) nounwind uwtable noinline ssp {
 entry:
   %c = load i256* %cc

diff --git a/test/CodeGen/Generic/PBQP.ll b/test/CodeGen/Generic/PBQP.ll
new file mode 100644
index 0000000..91fcfba
--- /dev/null
+++ b/test/CodeGen/Generic/PBQP.ll

@@ -0,0 +1,29 @@
+; RUN: llc -regalloc=pbqp < %s
+
+define i32 @foo() {
+entry:
+  %call = tail call i32 (...)* @baz()
+  %call1 = tail call i32 (...)* @baz()
+  %call2 = tail call i32 (...)* @baz()
+  %call3 = tail call i32 (...)* @baz()
+  %call4 = tail call i32 (...)* @baz()
+  %call5 = tail call i32 (...)* @baz()
+  %call6 = tail call i32 (...)* @baz()
+  %call7 = tail call i32 (...)* @baz()
+  %call8 = tail call i32 (...)* @baz()
+  %call9 = tail call i32 (...)* @baz()
+  %call10 = tail call i32 (...)* @baz()
+  %call11 = tail call i32 (...)* @baz()
+  %call12 = tail call i32 (...)* @baz()
+  %call13 = tail call i32 (...)* @baz()
+  %call14 = tail call i32 (...)* @baz()
+  %call15 = tail call i32 (...)* @baz()
+  %call16 = tail call i32 (...)* @baz()
+  %call17 = tail call i32 @bar(i32 %call, i32 %call1, i32 %call2, i32 %call3, i32 %call4, i32 %call5, i32 %call6, i32 %call7, i32 %call8, i32 %call9, i32 %call10, i32 %call11, i32 %call12, i32 %call13, i32 %call14, i32 %call15, i32 %call16)
+  ret i32 %call17
+}
+
+declare i32 @baz(...)
+
+declare i32 @bar(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+

diff --git a/test/CodeGen/Generic/assume.ll b/test/CodeGen/Generic/assume.ll
new file mode 100644
index 0000000..bb045b3
--- /dev/null
+++ b/test/CodeGen/Generic/assume.ll

@@ -0,0 +1,9 @@
+; RUN: llc < %s
+
+define void @main() {
+        call void @llvm.assume(i1 1)
+        ret void
+}
+
+declare void @llvm.assume(i1) nounwind
+

diff --git a/test/CodeGen/Generic/dbg_value.ll b/test/CodeGen/Generic/dbg_value.ll
index 840eeb0..73e41c7 100644
--- a/test/CodeGen/Generic/dbg_value.ll
+++ b/test/CodeGen/Generic/dbg_value.ll

@@ -4,11 +4,11 @@
 %0 = type { i32, i32 }
 
 define void @t(%0*, i32, i32, i32, i32) nounwind {
-  tail call void @llvm.dbg.value(metadata !{%0* %0}, i64 0, metadata !0)
+  tail call void @llvm.dbg.value(metadata !{%0* %0}, i64 0, metadata !0, metadata !{metadata !"0x102"})
   unreachable
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 ; !0 should conform to the format of DIVariable.
-!0 = metadata !{i32 786689, null, metadata !"a", null, i32 0, null, i32 0, i32 0} ;
+!0 = metadata !{metadata !"0x101\00a\000\000", null, null, null} ; [ DW_TAG_arg_variable ]

diff --git a/test/CodeGen/Generic/empty-insertvalue.ll b/test/CodeGen/Generic/empty-insertvalue.ll
new file mode 100644
index 0000000..e4cc27c
--- /dev/null
+++ b/test/CodeGen/Generic/empty-insertvalue.ll

@@ -0,0 +1,7 @@
+; RUN: llc < %s
+
+define void @f() {
+entry:
+  %0 = insertvalue { [0 x { i8*, i8* }], [0 x { i8*, i64 }] } undef, [0 x { i8*, i8* }] undef, 0
+  ret void
+}

diff --git a/test/CodeGen/Hexagon/cmp-not.ll b/test/CodeGen/Hexagon/cmp-not.ll
new file mode 100644
index 0000000..abcddc38
--- /dev/null
+++ b/test/CodeGen/Hexagon/cmp-not.ll

@@ -0,0 +1,50 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; Check that we generate matching compare insn.
+
+; Function Attrs: nounwind
+define i32 @neqi(i32 %argc) #0 {
+entry:
+  %p = alloca i8, align 1
+  %0 = tail call i1 @llvm.hexagon.C4.cmpneqi(i32 %argc, i32 512)
+  %conv = zext i1 %0 to i8
+  store volatile i8 %conv, i8* %p, align 1
+  %p.0.p.0. = load volatile i8* %p, align 1
+  %conv1 = zext i8 %p.0.p.0. to i32
+  ret i32 %conv1
+}
+; CHECK:	p{{[0-3]}}{{ *}} = !cmp.eq(r{{[0-9]+}}, ##512)
+
+; Function Attrs: nounwind readnone
+declare i1 @llvm.hexagon.C4.cmpneqi(i32, i32) #1
+
+; Function Attrs: nounwind
+define i32 @ngti(i32 %argc) #0 {
+entry:
+  %p = alloca i8, align 1
+  %0 = tail call i1 @llvm.hexagon.C4.cmpltei(i32 %argc, i32 4)
+  %conv = zext i1 %0 to i8
+  store volatile i8 %conv, i8* %p, align 1
+  %p.0.p.0. = load volatile i8* %p, align 1
+  %conv1 = zext i8 %p.0.p.0. to i32
+  ret i32 %conv1
+}
+; CHECK:	p{{[0-3]}}{{ *}} = !cmp.gt(r{{[0-9]+}}, #4)
+
+; Function Attrs: nounwind readnone
+declare i1 @llvm.hexagon.C4.cmpltei(i32, i32) #1
+
+; Function Attrs: nounwind
+define i32 @ngtui(i32 %argc) #0 {
+entry:
+  %p = alloca i8, align 1
+  %0 = tail call i1 @llvm.hexagon.C4.cmplteui(i32 %argc, i32 4)
+  %conv = zext i1 %0 to i8
+  store volatile i8 %conv, i8* %p, align 1
+  %p.0.p.0. = load volatile i8* %p, align 1
+  %conv1 = zext i8 %p.0.p.0. to i32
+  ret i32 %conv1
+}
+; CHECK: 	p{{[0-3]}}{{ *}} = !cmp.gtu(r{{[0-9]+}}, #4)
+
+; Function Attrs: nounwind readnone
+declare i1 @llvm.hexagon.C4.cmplteui(i32, i32) #1

diff --git a/test/CodeGen/Hexagon/ctor.ll b/test/CodeGen/Hexagon/ctor.ll
new file mode 100644
index 0000000..2e2fc51
--- /dev/null
+++ b/test/CodeGen/Hexagon/ctor.ll

@@ -0,0 +1,14 @@
+; RUN: llc -march=hexagon < %s  | FileCheck -check-prefix=INITARRAY %s
+; RUN: llc -march=hexagon < %s  -use-ctors | FileCheck -check-prefix=CTOR %s
+
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_P10066.ii, i8* null }]
+define internal void @_GLOBAL__sub_I_P10066.ii() {
+entry:
+  ret void
+}
+
+;CTOR: .section	.ctors
+;CTOR-NOT:  section	.init_array
+
+;INITARRAY: section	.init_array
+;INITARRAY-NOT: .section	.ctors

diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index 9537489..f093dae 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll

@@ -5,9 +5,9 @@
 
 define void @foo(i32* nocapture %a, i32* nocapture %b) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13), !dbg !17
-  tail call void @llvm.dbg.value(metadata !{i32* %b}, i64 0, metadata !14), !dbg !18
-  tail call void @llvm.dbg.value(metadata !30, i64 0, metadata !15), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !17
+  tail call void @llvm.dbg.value(metadata !{i32* %b}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata !30, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !19
   br label %for.body, !dbg !19
 
 for.body:                                         ; preds = %for.body, %entry
@@ -18,11 +18,11 @@
   %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   %b.addr.01 = phi i32* [ %b, %entry ], [ %incdec.ptr, %for.body ]
   %incdec.ptr = getelementptr inbounds i32* %b.addr.01, i32 1, !dbg !21
-  tail call void @llvm.dbg.value(metadata !{i32* %incdec.ptr}, i64 0, metadata !14), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i32* %incdec.ptr}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !21
   %0 = load i32* %b.addr.01, align 4, !dbg !21
   store i32 %0, i32* %arrayidx.phi, align 4, !dbg !21
   %inc = add nsw i32 %i.02, 1, !dbg !26
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !15), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !26
   %exitcond = icmp eq i32 %inc, 10, !dbg !19
   %arrayidx.inc = getelementptr i32* %arrayidx.phi, i32 1
   br i1 %exitcond, label %for.end, label %for.body, !dbg !19
@@ -31,34 +31,34 @@
   ret void, !dbg !27
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{i32 786449, metadata !28, i32 12, metadata !"QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)\001\00\000\00\001", metadata !28, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99]
 !2 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !28, null, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*)* @foo, null, null, metadata !11, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\001\001", metadata !28, null, metadata !7, null, void (i32*, i32*)* @foo, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!6 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !9}
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
-!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !11 = metadata !{metadata !13, metadata !14, metadata !15}
-!13 = metadata !{i32 786689, metadata !5, metadata !"a", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 1]
-!14 = metadata !{i32 786689, metadata !5, metadata !"b", metadata !6, i32 33554433, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 1]
-!15 = metadata !{i32 786688, metadata !16, metadata !"i", metadata !6, i32 2, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2]
-!16 = metadata !{i32 786443, metadata !28, metadata !5, i32 1, i32 26, i32 0} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
+!13 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!14 = metadata !{metadata !"0x101\00b\0033554433\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [b] [line 1]
+!15 = metadata !{metadata !"0x100\00i\002\000", metadata !16, metadata !6, metadata !10} ; [ DW_TAG_auto_variable ] [i] [line 2]
+!16 = metadata !{metadata !"0xb\001\0026\000", metadata !28, metadata !5} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
 !17 = metadata !{i32 1, i32 15, metadata !5, null}
 !18 = metadata !{i32 1, i32 23, metadata !5, null}
 !19 = metadata !{i32 3, i32 8, metadata !20, null}
-!20 = metadata !{i32 786443, metadata !28, metadata !16, i32 3, i32 3, i32 1} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
+!20 = metadata !{metadata !"0xb\003\003\001", metadata !28, metadata !16} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
 !21 = metadata !{i32 4, i32 5, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !28, metadata !20, i32 3, i32 28, i32 2} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
+!22 = metadata !{metadata !"0xb\003\0028\002", metadata !28, metadata !20} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c]
 !26 = metadata !{i32 3, i32 23, metadata !20, null}
 !27 = metadata !{i32 6, i32 1, metadata !16, null}
 !28 = metadata !{metadata !"hwloop-dbg.c", metadata !"/usr2/kparzysz/s.hex/t"}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !30 = metadata !{i32 0}

diff --git a/test/CodeGen/Inputs/DbgValueOtherTargets.ll b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
index 953e576..2d05b45 100644
--- a/test/CodeGen/Inputs/DbgValueOtherTargets.ll
+++ b/test/CodeGen/Inputs/DbgValueOtherTargets.ll

@@ -3,28 +3,28 @@
 define i32 @main() nounwind ssp {
 entry:
 ; CHECK: DEBUG_VALUE
-  call void @llvm.dbg.value(metadata !6, i64 0, metadata !7), !dbg !9
+  call void @llvm.dbg.value(metadata !6, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !9
   ret i32 0, !dbg !10
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{i32 786478, metadata !12, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang version 2.9 (trunk 120996)", i1 false, metadata !"", i32 0, metadata !6, metadata !6, metadata !11, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\000\000\000", metadata !12, metadata !1, metadata !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 120996)\000\00\000\00\000", metadata !12, metadata !6, metadata !6, metadata !11, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !12, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 0}
-!7 = metadata !{i32 786688, metadata !8, metadata !"i", metadata !1, i32 3, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!8 = metadata !{i32 786443, metadata !12, metadata !0, i32 2, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{metadata !"0x100\00i\003\000", metadata !8, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!8 = metadata !{metadata !"0xb\002\0012\000", metadata !12, metadata !0} ; [ DW_TAG_lexical_block ]
 !9 = metadata !{i32 3, i32 11, metadata !8, null}
 !10 = metadata !{i32 4, i32 2, metadata !8, null}
 !11 = metadata !{metadata !0}
 !12 = metadata !{metadata !"/tmp/x.c", metadata !"/Users/manav"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/MSP430/asm-clobbers.ll b/test/CodeGen/MSP430/asm-clobbers.ll
new file mode 100644
index 0000000..216a3fe
--- /dev/null
+++ b/test/CodeGen/MSP430/asm-clobbers.ll

@@ -0,0 +1,13 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:16:16-i32:16:32-a:16-n8:16"
+target triple = "msp430---elf"
+
+define void @test() { ; verifies that r10, clobbered by the inline asm below, is pushed and popped
+entry:
+; CHECK-LABEL: test:
+; CHECK: push.w r10
+  call void asm sideeffect "", "~{r10}"() ; empty asm string; its only declared effect is the r10 clobber
+; CHECK: pop.w r10
+  ret void
+}

diff --git a/test/CodeGen/MSP430/memset.ll b/test/CodeGen/MSP430/memset.ll
new file mode 100644
index 0000000..bf10544
--- /dev/null
+++ b/test/CodeGen/MSP430/memset.ll

@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
+target triple = "msp430---elf"
+
+@buf = external global i8*
+
+; Function Attrs: nounwind
+define void @test() nounwind { ; verifies that the memset intrinsic becomes a call to memset
+entry:
+; CHECK-LABEL: test:
+  %0 = load i8** @buf, align 2 ; destination pointer comes from the external global @buf
+; CHECK: mov.w &buf, r15
+; CHECK-NEXT: mov.w #5, r14
+; CHECK-NEXT: mov.w #128, r13
+; CHECK-NEXT: call #memset
+  call void @llvm.memset.p0i8.i16(i8* %0, i8 5, i16 128, i32 1, i1 false) ; fill value 5, length 128, matching the immediates expected above
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i16(i8* nocapture, i8, i16, i32, i1) nounwind
+

diff --git a/test/CodeGen/Mips/Fast-ISel/br1.ll b/test/CodeGen/Mips/Fast-ISel/br1.ll
new file mode 100644
index 0000000..579a77f
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/br1.ll

@@ -0,0 +1,34 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@b = global i32 1, align 4
+@i = global i32 0, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+; Function Attrs: nounwind
+define void @br() #0 { ; conditional branch around a store, compiled with fast-isel
+entry:
+  %0 = load i32* @b, align 4
+  %tobool = icmp eq i32 %0, 0 ; true when @b is zero
+  br i1 %tobool, label %if.end, label %if.then ; skip the store when @b is zero
+
+if.then:                                          ; preds = %entry
+  store i32 6754, i32* @i, align 4 ; 6754 is the immediate the addiu pattern below looks for
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+; FIXME: This instruction is redundant.
+; CHECK:  xor  $[[REG1:[0-9]+]], ${{[0-9]+}}, $zero
+; CHECK:  sltiu  $[[REG2:[0-9]+]], $[[REG1]], 1
+; CHECK:  bgtz  $[[REG2]], $BB[[BL:[0-9]+_[0-9]+]]
+; CHECK:  nop
+; CHECK:  addiu  ${{[0-9]+}}, $zero, 6754
+; CHECK:  sw  ${{[0-9]+}}, 0(${{[0-9]+}})
+; CHECK: $BB[[BL]]:
+
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/Mips/Fast-ISel/callabi.ll b/test/CodeGen/Mips/Fast-ISel/callabi.ll
new file mode 100644
index 0000000..44b94bb
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/callabi.ll

@@ -0,0 +1,477 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32r2
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s -check-prefix=CHECK2
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s -check-prefix=CHECK2
+
+
+@c1 = global i8 -45, align 1
+@uc1 = global i8 27, align 1
+@s1 = global i16 -1789, align 2
+@us1 = global i16 1256, align 2
+
+; Function Attrs: nounwind
+define void @cxi() #0 { ; call with a single i32 argument
+entry:
+; CHECK-LABEL:  cxi
+  call void @xi(i32 10) ; the constant is expected in $4 and the callee address in $25
+; CHECK-DAG:    addiu   $4, $zero, 10
+; CHECK-DAG:    lw      $25, %got(xi)(${{[0-9]+}})
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xi(i32) #1
+
+; Function Attrs: nounwind
+define void @cxii() #0 { ; call with two i32 arguments
+entry:
+; CHECK-LABEL:  cxii
+  call void @xii(i32 746, i32 892) ; the constants are expected in $4 and $5
+; CHECK-DAG:    addiu   $4, $zero, 746
+; CHECK-DAG:    addiu   $5, $zero, 892
+; CHECK-DAG:    lw      $25, %got(xii)(${{[0-9]+}})
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xii(i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxiii() #0 { ; call with three i32 arguments
+entry:
+; CHECK-LABEL:  cxiii
+  call void @xiii(i32 88, i32 44, i32 11) ; the constants are expected in $4, $5 and $6
+; CHECK-DAG:    addiu   $4, $zero, 88
+; CHECK-DAG:    addiu   $5, $zero, 44
+; CHECK-DAG:    addiu   $6, $zero, 11
+; CHECK-DAG:    lw      $25, %got(xiii)(${{[0-9]+}})
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xiii(i32, i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxiiii() #0 { ; call with four i32 arguments
+entry:
+; CHECK-LABEL:  cxiiii
+  call void @xiiii(i32 167, i32 320, i32 97, i32 14) ; the constants are expected in $4 through $7
+; CHECK-DAG:    addiu   $4, $zero, 167
+; CHECK-DAG:    addiu   $5, $zero, 320
+; CHECK-DAG:    addiu   $6, $zero, 97
+; CHECK-DAG:    addiu   $7, $zero, 14
+; CHECK-DAG:    lw      $25, %got(xiiii)(${{[0-9]+}})
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xiiii(i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxiiiiconv() #0 { ; four i32 arguments produced by sign/zero extension of narrower globals
+entry:
+; CHECK-LABEL: cxiiiiconv
+; mips32r2-LABEL:  cxiiiiconv
+; mips32-LABEL:  cxiiiiconv
+  %0 = load i8* @c1, align 1
+  %conv = sext i8 %0 to i32 ; i8 @c1 sign-extended; first argument
+  %1 = load i8* @uc1, align 1
+  %conv1 = zext i8 %1 to i32 ; i8 @uc1 zero-extended; second argument
+  %2 = load i16* @s1, align 2
+  %conv2 = sext i16 %2 to i32 ; i16 @s1 sign-extended; third argument
+  %3 = load i16* @us1, align 2
+  %conv3 = zext i16 %3 to i32 ; i16 @us1 zero-extended; fourth argument
+  call void @xiiii(i32 %conv, i32 %conv1, i32 %conv2, i32 %conv3)
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; mips32r2:     addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; mips32:       addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; mips32r2-DAG:         lw      $[[REG_C1_ADDR:[0-9]+]], %got(c1)($[[REG_GP]])
+; mips32r2-DAG: lbu     $[[REG_C1:[0-9]+]], 0($[[REG_C1_ADDR]])
+; mips32r2-DAG: seb     $4, $[[REG_C1]]
+; mips32-DAG:   lw      $[[REG_C1_ADDR:[0-9]+]], %got(c1)($[[REG_GP]])
+; mips32-DAG:   lbu     $[[REG_C1:[0-9]+]], 0($[[REG_C1_ADDR]])
+; mips32-DAG:   sll     $[[REG_C1_1:[0-9]+]], $[[REG_C1]], 24
+; mips32-DAG:   sra     $4, $[[REG_C1_1]], 24
+; CHECK-DAG:    lw      $[[REG_UC1_ADDR:[0-9]+]], %got(uc1)($[[REG_GP]])
+; CHECK-DAG:    lbu     $[[REG_UC1:[0-9]+]], 0($[[REG_UC1_ADDR]])
+; FIXME: andi is superfluous
+; CHECK-DAG:    andi    $5, $[[REG_UC1]], 255
+; mips32r2-DAG:         lw      $[[REG_S1_ADDR:[0-9]+]], %got(s1)($[[REG_GP]])
+; mips32r2-DAG: lhu     $[[REG_S1:[0-9]+]], 0($[[REG_S1_ADDR]])
+; mips32r2-DAG: seh     $6, $[[REG_S1]]
+; mips32-DAG:   lw      $[[REG_S1_ADDR:[0-9]+]], %got(s1)($[[REG_GP]])
+; mips32-DAG:   lhu     $[[REG_S1:[0-9]+]], 0($[[REG_S1_ADDR]])
+; mips32-DAG:   sll     $[[REG_S1_1:[0-9]+]], $[[REG_S1]], 16
+; mips32-DAG:   sra     $6, $[[REG_S1_1]], 16
+; CHECK-DAG:    lw      $[[REG_US1_ADDR:[0-9]+]], %got(us1)($[[REG_GP]])
+; CHECK-DAG:    lhu     $[[REG_US1:[0-9]+]], 0($[[REG_US1_ADDR]])
+; FIXME: andi is superfluous
+; CHECK-DAG:    andi    $7, $[[REG_US1]], 65535
+; mips32r2:     jalr    $25
+; mips32:       jalr    $25
+; CHECK:        jalr    $25
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @cxf() #0 { ; call with a single float argument
+entry:
+; CHECK-LABEL:  cxf
+  call void @xf(float 0x40BBC85560000000) ; the constant is built with lui/ori and moved to $f12
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK:        lui     $[[REG_FPCONST_1:[0-9]+]], 17886
+; CHECK:        ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 17067
+; CHECK: mtc1   $[[REG_FPCONST]], $f12
+; CHECK:        lw      $25, %got(xf)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xf(float) #1
+
+; Function Attrs: nounwind
+define void @cxff() #0 { ; call with two float arguments
+entry:
+; CHECK-LABEL:  cxff
+  call void @xff(float 0x3FF74A6CA0000000, float 0x401A2C0840000000) ; expected in $f12 and $f14
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK-DAG:    lui     $[[REG_FPCONST_1:[0-9]+]], 16314
+; CHECK-DAG:    ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 21349
+; CHECK-DAG: mtc1       $[[REG_FPCONST]], $f12
+; CHECK-DAG:    lui     $[[REG_FPCONST_2:[0-9]+]], 16593
+; CHECK-DAG:    ori     $[[REG_FPCONST_3:[0-9]+]], $[[REG_FPCONST_2]], 24642
+; CHECK-DAG: mtc1       $[[REG_FPCONST_3]], $f14
+; CHECK:        lw      $25, %got(xff)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xff(float, float) #1
+
+; Function Attrs: nounwind
+define void @cxfi() #0 { ; call with a float followed by an i32
+entry:
+; CHECK-LABEL: cxfi
+  call void @xfi(float 0x4013906240000000, i32 102) ; float expected in $f12, integer in $5
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK-DAG:    lui     $[[REG_FPCONST_1:[0-9]+]], 16540
+; CHECK-DAG:    ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 33554
+; CHECK-DAG: mtc1       $[[REG_FPCONST]], $f12
+; CHECK-DAG:    addiu   $5, $zero, 102
+; CHECK:        lw      $25, %got(xfi)($[[REG_GP]])
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xfi(float, i32) #1
+
+; Function Attrs: nounwind
+define void @cxfii() #0 { ; call with a float followed by two i32 values
+entry:
+; CHECK-LABEL: cxfii
+  call void @xfii(float 0x405EC7EE00000000, i32 9993, i32 10922) ; float expected in $f12, integers in $5 and $6
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK-DAG:    lui     $[[REG_FPCONST_1:[0-9]+]], 17142
+; CHECK-DAG:    ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 16240
+; CHECK-DAG: mtc1       $[[REG_FPCONST]], $f12
+; CHECK-DAG:    addiu   $5, $zero, 9993
+; CHECK-DAG:    addiu   $6, $zero, 10922
+; CHECK:        lw      $25, %got(xfii)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xfii(float, i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxfiii() #0 { ; call with a float followed by three i32 values
+entry:
+; CHECK-LABEL: cxfiii
+  call void @xfiii(float 0x405C072B20000000, i32 3948, i32 89011, i32 111222) ; the two larger integers need a lui/ori pair
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK-DAG:    lui     $[[REG_FPCONST_1:[0-9]+]], 17120
+; CHECK-DAG:    ori     $[[REG_FPCONST:[0-9]+]], $[[REG_FPCONST_1]], 14681
+; CHECK-DAG: mtc1       $[[REG_FPCONST]], $f12
+; CHECK-DAG:    addiu   $5, $zero, 3948
+; CHECK-DAG:    lui     $[[REG_I_1:[0-9]+]], 1
+; CHECK-DAG:    ori     $6, $[[REG_I_1]], 23475
+; CHECK-DAG:    lui     $[[REG_I_2:[0-9]+]], 1
+; CHECK-DAG:    ori     $7, $[[REG_I_2]], 45686
+; CHECK:        lw      $25, %got(xfiii)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xfiii(float, i32, i32, i32) #1
+
+; Function Attrs: nounwind
+define void @cxd() #0 { ; call with a single double argument
+entry:
+; mips32r2-LABEL: cxd:
+; mips32-LABEL: cxd:
+  call void @xd(double 5.994560e+02) ; expected via mtc1 into $f12/$f13 without R2, or mtc1 + mthc1 into $f12 with R2
+; mips32:       addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; mips32-DAG:   lui     $[[REG_FPCONST_1:[0-9]+]], 16514
+; mips32-DAG:   ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 48037
+; mips32-DAG:   lui     $[[REG_FPCONST_3:[0-9]+]], 58195
+; mips32-DAG:   ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 63439
+; mips32-DAG:    mtc1   $[[REG_FPCONST_4]], $f12
+; mips32-DAG:       mtc1        $[[REG_FPCONST_2]], $f13
+; mips32-DAG:   lw      $25, %got(xd)($[[REG_GP]])
+; mips32:       jalr    $25
+; mips32r2:     addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; mips32r2-DAG: lui     $[[REG_FPCONST_1:[0-9]+]], 16514
+; mips32r2-DAG: ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 48037
+; mips32r2-DAG: lui     $[[REG_FPCONST_3:[0-9]+]], 58195
+; mips32r2-DAG: ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 63439
+; mips32r2-DAG: mtc1    $[[REG_FPCONST_4]], $f12
+; mips32r2-DAG: mthc1   $[[REG_FPCONST_2]], $f12
+; mips32r2-DAG: lw      $25, %got(xd)($[[REG_GP]])
+; mips32r2:     jalr    $25
+  ret void
+}
+
+declare void @xd(double) #1
+
+; Function Attrs: nounwind
+define void @cxdd() #0 { ; call with two double arguments
+; mips32r2-LABEL: cxdd:
+; mips32-LABEL: cxdd:
+entry:
+  call void @xdd(double 1.234980e+03, double 0x40F5B331F7CED917) ; expected in the $f12 and $f14 register pairs
+; mips32:       addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; mips32-DAG:   lui     $[[REG_FPCONST_1:[0-9]+]], 16531
+; mips32-DAG:   ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 19435
+; mips32-DAG:   lui     $[[REG_FPCONST_3:[0-9]+]], 34078
+; mips32-DAG:   ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 47186
+; mips32-DAG:   mtc1    $[[REG_FPCONST_4]], $f12
+; mips32-DAG:   mtc1    $[[REG_FPCONST_2]], $f13
+; mips32-DAG:   lui     $[[REG_FPCONST_1:[0-9]+]], 16629
+; mips32-DAG:   ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 45873
+; mips32-DAG:   lui     $[[REG_FPCONST_3:[0-9]+]], 63438
+; mips32-DAG:   ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 55575
+; mips32-DAG:   mtc1    $[[REG_FPCONST_4]], $f14
+; mips32-DAG:   mtc1    $[[REG_FPCONST_2]], $f15
+; mips32-DAG:   lw      $25, %got(xdd)($[[REG_GP]])
+; mips32:       jalr    $25
+; mips32r2:     addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; mips32r2-DAG: lui     $[[REG_FPCONST_1:[0-9]+]], 16531
+; mips32r2-DAG: ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 19435
+; mips32r2-DAG: lui     $[[REG_FPCONST_3:[0-9]+]], 34078
+; mips32r2-DAG: ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 47186
+; mips32r2-DAG: mtc1    $[[REG_FPCONST_4]], $f12
+; mips32r2-DAG: mthc1   $[[REG_FPCONST_2]], $f12
+; mips32r2-DAG: lui     $[[REG_FPCONST_1:[0-9]+]], 16629
+; mips32r2-DAG: ori     $[[REG_FPCONST_2:[0-9]+]], $[[REG_FPCONST_1]], 45873
+; mips32r2-DAG: lui     $[[REG_FPCONST_3:[0-9]+]], 63438
+; mips32r2-DAG: ori     $[[REG_FPCONST_4:[0-9]+]], $[[REG_FPCONST_3]], 55575
+; mips32r2-DAG: mtc1    $[[REG_FPCONST_4]], $f14
+; mips32r2-DAG: mthc1   $[[REG_FPCONST_2]], $f14
+; mips32r2-DAG: lw      $25, %got(xdd)($[[REG_GP]])
+; mips32r2:     jalr    $25
+  ret void
+}
+
+declare void @xdd(double, double) #1
+
+; Function Attrs: nounwind
+define void @cxif() #0 { ; call with an i32 followed by a float
+entry:
+; CHECK-LABEL: cxif:
+  call void @xif(i32 345, float 0x407BCE5A20000000) ; the float is expected to be moved back to integer register $5
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK-DAG:    addiu   $4, $zero, 345
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 17374
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 29393
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECK-DAG:    lw      $25, %got(xif)($[[REG_GP]])
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xif(i32, float) #1
+
+; Function Attrs: nounwind
+define void @cxiff() #0 { ; call with an i32 followed by two floats
+entry:
+; CHECK-LABEL: cxiff:
+; CHECK2-LABEL: cxiff:
+  call void @xiff(i32 12239, float 0x408EDB3340000000, float 0x4013FFE5C0000000) ; floats are expected in integer registers $5 and $6
+; We need to do the two floating point parameters in a separate
+; check because we can't control the ordering of parts of the sequence
+;
+; CHECK:        addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK:        addiu   $4, $zero, 12239
+; CHECK2:       addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK2:       addiu   $4, $zero, 12239
+; CHECK:        lui     $[[REGF_1:[0-9]+]], 17526
+; CHECK:        ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 55706
+; CHECK:        mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK:        mfc1    $5, $f[[REGF_3]]
+; CHECK2:       lui     $[[REGF2_1:[0-9]+]], 16543
+; CHECK2:       ori     $[[REGF2_2:[0-9]+]], $[[REGF2_1]], 65326
+; CHECK2:       mtc1    $[[REGF2_2]], $f[[REGF2_3:[0-9]+]]
+; CHECK2:       mfc1    $6, $f[[REGF2_3]]
+; CHECK:        lw      $25, %got(xiff)($[[REG_GP]])
+; CHECK2:       lw      $25, %got(xiff)($[[REG_GP]])
+; CHECK:        jalr    $25
+; CHECK2:       jalr    $25
+  ret void
+}
+
+declare void @xiff(i32, float, float) #1
+
+; Function Attrs: nounwind
+define void @cxifi() #0 { ; call with i32, float, i32
+entry:
+; CHECK: cxifi:
+  call void @xifi(i32 887, float 0x402277CEE0000000, i32 888) ; expected in $4, $5 (float via mfc1) and $6
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK-DAG:    addiu   $4, $zero, 887
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 16659
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 48759
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECK-DAG:    addiu   $6, $zero, 888
+; CHECK-DAG:    lw      $25, %got(xifi)($[[REG_GP]])
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xifi(i32, float, i32) #1
+
+; Function Attrs: nounwind
+define void @cxifif() #0 { ; call with i32, float, i32, float
+entry:
+; CHECK: cxifif:
+; CHECK2: cxifif:
+  call void @xifif(i32 67774, float 0x408EE0FBE0000000, i32 9991, float 0x40B15C8CC0000000) ; expected in $4, $5, $6 and $7; the second float is verified by the second prefix
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK2-DAG:   addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK-DAG:    lui     $[[REGI:[0-9]+]], 1
+; CHECK-DAG:    ori     $4, $[[REGI]], 2238
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 17527
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 2015
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECK-DAG:    addiu   $6, $zero, 9991
+; CHECK2:       lui     $[[REGF2_1:[0-9]+]], 17802
+; CHECK2:       ori     $[[REGF2_2:[0-9]+]], $[[REGF2_1]], 58470
+; CHECK2:       mtc1    $[[REGF2_2]], $f[[REGF2_3:[0-9]+]]
+; CHECK2:       mfc1    $7, $f[[REGF2_3]]
+; CHECK:        lw      $25, %got(xifif)($[[REG_GP]])
+; CHECK2:       lw      $25, %got(xifif)($[[REG_GP]])
+; CHECK2:       jalr    $25
+; CHECK:        jalr    $25
+
+  ret void
+}
+
+declare void @xifif(i32, float, i32, float) #1
+
+; Function Attrs: nounwind
+define void @cxiffi() #0 { ; call with i32, float, float, i32
+entry:
+; CHECK-LABEL: cxiffi:
+; CHECK2-LABEL: cxiffi:
+  call void @xiffi(i32 45, float 0x3FF6666660000000, float 0x408F333340000000, i32 234) ; expected in $4, $5, $6 and $7; the second float is verified by the second prefix
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK-DAG:    addiu   $4, $zero, 45
+; CHECK2-DAG:   addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK2-DAG:   addiu   $4, $zero, 45
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 16307
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 13107
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECK2:       lui     $[[REGF2_1:[0-9]+]], 17529
+; CHECK2:       ori     $[[REGF2_2:[0-9]+]], $[[REGF2_1]], 39322
+; CHECK2:       mtc1    $[[REGF2_2]], $f[[REGF2_3:[0-9]+]]
+; CHECK2:       mfc1    $6, $f[[REGF2_3]]
+; CHECK-DAG:    lw      $25, %got(xiffi)($[[REG_GP]])
+; CHECK-DAG:    addiu   $7, $zero, 234
+; CHECK2-DAG:   lw      $25, %got(xiffi)($[[REG_GP]])
+; CHECK:        jalr    $25
+; CHECK2:       jalr    $25
+
+  ret void
+}
+
+declare void @xiffi(i32, float, float, i32) #1
+
+; Function Attrs: nounwind
+define void @cxifii() #0 { ; call with i32, float, i32, i32
+entry:
+; CHECK-DAG:    cxifii:
+  call void @xifii(i32 12239, float 0x408EDB3340000000, i32 998877, i32 1234) ; expected in $4, $5 (float via mfc1), $6 and $7
+; CHECK-DAG:    addu    $[[REG_GP:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; CHECK-DAG:    addiu   $4, $zero, 12239
+; CHECK-DAG:    lui     $[[REGF_1:[0-9]+]], 17526
+; CHECK-DAG:    ori     $[[REGF_2:[0-9]+]], $[[REGF_1]], 55706
+; CHECK-DAG:    mtc1    $[[REGF_2]], $f[[REGF_3:[0-9]+]]
+; CHECK-DAG:    mfc1    $5, $f[[REGF_3]]
+; CHECK-DAG:    lui     $[[REGI2:[0-9]+]], 15
+; CHECK-DAG:    ori     $6, $[[REGI2]], 15837
+; CHECK-DAG:    addiu   $7, $zero, 1234
+; CHECK-DAG:    lw      $25, %got(xifii)($[[REG_GP]])
+; CHECK:        jalr    $25
+  ret void
+}
+
+declare void @xifii(i32, float, i32, i32) #1
+
+; FIXME: this function will not pass yet, so it is left commented out.
+; Function Attrs: nounwind
+; define void @cxfid() #0 {
+;entry:
+;  call void @xfid(float 0x4013B851E0000000, i32 811123, double 0x40934BFF487FCB92)
+;  ret void
+;}
+
+declare void @xfid(float, i32, double) #1
+
+; Function Attrs: nounwind
+define void @g() #0 { ; driver that calls every cx* function defined in this file exactly once
+entry:
+  call void @cxi()
+  call void @cxii()
+  call void @cxiii()
+  call void @cxiiii()
+  call void @cxiiiiconv()
+  call void @cxf()
+  call void @cxff()
+  call void @cxd()
+  call void @cxfi()
+  call void @cxfii()
+  call void @cxfiii()
+  call void @cxdd()
+  call void @cxif()
+  call void @cxiff()
+  call void @cxifi()
+  call void @cxifii()
+  call void @cxifif()
+  call void @cxiffi()
+  ret void
+}
+
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 (gitosis@dmz-portal.mips.com:clang 43992fe7b17de5553ac06d323cb80cc6723a9ae3) (gitosis@dmz-portal.mips.com:llvm.git 0834e6839eb170197c81bb02e916258d1527e312)"}

diff --git a/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll b/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll
new file mode 100644
index 0000000..c72b1e7
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/fpcmpa.ll

@@ -0,0 +1,254 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@f1 = common global float 0.000000e+00, align 4
+@f2 = common global float 0.000000e+00, align 4
+@b1 = common global i32 0, align 4
+@d1 = common global double 0.000000e+00, align 8
+@d2 = common global double 0.000000e+00, align 8
+
+; feq1: stores zext(fcmp oeq float @f1, @f2) to @b1; expects c.eq.s followed by movt on $fcc0.
+define void @feq1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp oeq float %0, %1
+; CHECK-LABEL:  feq1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.eq.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; fne1: stores zext(fcmp une float @f1, @f2) to @b1; expects c.eq.s followed by movf on $fcc0.
+define void @fne1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp une float %0, %1
+; CHECK-LABEL:  fne1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.eq.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; flt1: stores zext(fcmp olt float @f1, @f2) to @b1; expects c.olt.s followed by movt on $fcc0.
+define void @flt1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp olt float %0, %1
+; CHECK-LABEL:  flt1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.olt.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; fgt1: stores zext(fcmp ogt float @f1, @f2) to @b1; expects the inverse test c.ule.s followed by movf on $fcc0.
+define void @fgt1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp ogt float %0, %1
+; CHECK-LABEL: fgt1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ule.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; fle1: stores zext(fcmp ole float @f1, @f2) to @b1; expects c.ole.s followed by movt on $fcc0.
+define void @fle1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp ole float %0, %1
+; CHECK-LABEL:  fle1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ole.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; fge1: stores zext(fcmp oge float @f1, @f2) to @b1; expects the inverse test c.ult.s followed by movf on $fcc0.
+define void @fge1()  {
+entry:
+  %0 = load float* @f1, align 4
+  %1 = load float* @f2, align 4
+  %cmp = fcmp oge float %0, %1
+; CHECK-LABEL:  fge1:
+; CHECK-DAG:    lw      $[[REG_F2_GOT:[0-9]+]], %got(f2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_F1_GOT:[0-9]+]], %got(f1)(${{[0-9]+}})
+; CHECK-DAG:    lwc1    $f[[REG_F2:[0-9]+]], 0($[[REG_F2_GOT]])
+; CHECK-DAG:    lwc1    $f[[REG_F1:[0-9]+]], 0($[[REG_F1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ult.s  $f[[REG_F1]], $f[[REG_F2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; deq1: stores zext(fcmp oeq double @d1, @d2) to @b1; expects c.eq.d followed by movt on $fcc0.
+define void @deq1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp oeq double %0, %1
+; CHECK-LABEL:  deq1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.eq.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; dne1: stores zext(fcmp une double @d1, @d2) to @b1; expects c.eq.d followed by movf on $fcc0.
+define void @dne1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp une double %0, %1
+; CHECK-LABEL:  dne1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.eq.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; dlt1: stores zext(fcmp olt double @d1, @d2) to @b1; expects c.olt.d followed by movt on $fcc0.
+define void @dlt1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp olt double %0, %1
+; CHECK-LABEL:  dlt1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.olt.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; dgt1: stores zext(fcmp ogt double @d1, @d2) to @b1; expects the inverse test c.ule.d followed by movf on $fcc0.
+define void @dgt1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp ogt double %0, %1
+; CHECK-LABEL:  dgt1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ule.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; dle1: stores zext(fcmp ole double @d1, @d2) to @b1; expects c.ole.d followed by movt on $fcc0.
+define void @dle1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp ole double %0, %1
+; CHECK-LABEL:  dle1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ole.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movt  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; dge1: stores zext(fcmp oge double @d1, @d2) to @b1; expects the inverse test c.ult.d followed by movf on $fcc0.
+define void @dge1()  {
+entry:
+  %0 = load double* @d1, align 8
+  %1 = load double* @d2, align 8
+  %cmp = fcmp oge double %0, %1
+; CHECK-LABEL:  dge1:
+; CHECK-DAG:    lw      $[[REG_D2_GOT:[0-9]+]], %got(d2)(${{[0-9]+}})
+; CHECK-DAG:    lw      $[[REG_D1_GOT:[0-9]+]], %got(d1)(${{[0-9]+}})
+; CHECK-DAG:    ldc1    $f[[REG_D2:[0-9]+]], 0($[[REG_D2_GOT]])
+; CHECK-DAG:    ldc1    $f[[REG_D1:[0-9]+]], 0($[[REG_D1_GOT]])
+; CHECK-DAG:    addiu   $[[REG_ZERO:[0-9]+]], $zero, 0
+; CHECK-DAG:    addiu   $[[REG_ONE:[0-9]+]], $zero, 1
+; CHECK:        c.ult.d  $f[[REG_D1]], $f[[REG_D2]]
+; CHECK:        movf  $[[REG_ZERO]], $[[REG_ONE]], $fcc0
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+

diff --git a/test/CodeGen/Mips/Fast-ISel/fpext.ll b/test/CodeGen/Mips/Fast-ISel/fpext.ll
new file mode 100644
index 0000000..98aca75
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/fpext.ll

@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@f = global float 0x40147E6B80000000, align 4
+@d_f = common global double 0.000000e+00, align 8
+@.str = private unnamed_addr constant [6 x i8] c"%f  \0A\00", align 1
+
+; Function Attrs: nounwind. Stores fpext(float @f) as a double in @d_f; expects a cvt.d.s.
+define void @dv() #0 {
+entry:
+  %0 = load float* @f, align 4
+  %conv = fpext float %0 to double
+; CHECK: cvt.d.s  $f{{[0-9]+}}, $f{{[0-9]+}}
+  store double %conv, double* @d_f, align 8
+  ret void
+}
+
+
+attributes #0 = { nounwind }

diff --git a/test/CodeGen/Mips/Fast-ISel/fpintconv.ll b/test/CodeGen/Mips/Fast-ISel/fpintconv.ll
new file mode 100644
index 0000000..846726a
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/fpintconv.ll

@@ -0,0 +1,35 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+
+@f = global float 0x40D6E83280000000, align 4
+@d = global double 0x4132D68780000000, align 8
+@i_f = common global i32 0, align 4
+@i_d = common global i32 0, align 4
+@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1
+
+; ifv: stores fptosi(float @f) to @i_f; expects trunc.w.s followed by mfc1 from the same FPU register.
+define void @ifv() {
+entry:
+; CHECK-LABEL:   .ent  ifv
+  %0 = load float* @f, align 4
+  %conv = fptosi float %0 to i32
+; CHECK:   trunc.w.s  $f[[REG:[0-9]+]], $f{{[0-9]+}}
+; CHECK:   mfc1	${{[0-9]+}}, $f[[REG]]
+  store i32 %conv, i32* @i_f, align 4
+  ret void
+}
+
+; idv: stores fptosi(double @d) to @i_d; expects trunc.w.d followed by mfc1 from the same FPU register.
+define void @idv() {
+entry:
+; CHECK-LABEL:   .ent  idv
+  %0 = load double* @d, align 8
+  %conv = fptosi double %0 to i32
+; CHECK:   trunc.w.d  $f[[REG:[0-9]+]], $f{{[0-9]+}}
+; CHECK:   mfc1	${{[0-9]+}}, $f[[REG]]
+  store i32 %conv, i32* @i_d, align 4
+  ret void
+}

diff --git a/test/CodeGen/Mips/Fast-ISel/fptrunc.ll b/test/CodeGen/Mips/Fast-ISel/fptrunc.ll
new file mode 100644
index 0000000..d843dee
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/fptrunc.ll

@@ -0,0 +1,20 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@d = global double 0x40147E6B74DF0446, align 8
+@f = common global float 0.000000e+00, align 4
+@.str = private unnamed_addr constant [6 x i8] c"%f  \0A\00", align 1
+
+; Function Attrs: nounwind. Stores fptrunc(double @d) as a float in @f; expects a cvt.s.d.
+define void @fv() #0 {
+entry:
+  %0 = load double* @d, align 8
+  %conv = fptrunc double %0 to float
+; CHECK: cvt.s.d  $f{{[0-9]+}}, $f{{[0-9]+}}
+  store float %conv, float* @f, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }

diff --git a/test/CodeGen/Mips/Fast-ISel/icmpa.ll b/test/CodeGen/Mips/Fast-ISel/icmpa.ll
new file mode 100644
index 0000000..bd41a29
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/icmpa.ll

@@ -0,0 +1,210 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@c = global i32 4, align 4
+@d = global i32 9, align 4
+@uc = global i32 4, align 4
+@ud = global i32 9, align 4
+@b1 = common global i32 0, align 4
+
+; eq: stores zext(icmp eq i32 @c, @d) to @b1; expects xor, then sltiu with 1, then andi with 1.
+define void @eq()  {
+entry:
+; CHECK-LABEL:  .ent  eq
+
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp eq i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9]+]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9]+]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  xor  $[[REG1:[0-9]+]], $[[REG_C]], $[[REG_D]]
+; CHECK:  sltiu  $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The sltiu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; ne: stores zext(icmp ne i32 @c, @d) to @b1; expects xor, then sltu against $zero, then andi with 1.
+define void @ne()  {
+entry:
+; CHECK-LABEL:  .ent  ne
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp ne i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9]+]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9]+]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  xor  $[[REG1:[0-9]+]], $[[REG_C]], $[[REG_D]]
+; CHECK:  sltu  $[[REG2:[0-9]+]], $zero, $[[REG1]]
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; ugt: stores zext(icmp ugt i32 @uc, @ud) to @b1; expects sltu with swapped operands (ud, uc), then andi with 1.
+define void @ugt()  {
+entry:
+; CHECK-LABEL:  .ent  ugt
+  %0 = load i32* @uc, align 4
+  %1 = load i32* @ud, align 4
+  %cmp = icmp ugt i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_UD_GOT:[0-9]+]], %got(ud)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UC_GOT:[0-9]+]], %got(uc)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UD:[0-9]+]], 0($[[REG_UD_GOT]])
+; CHECK-DAG:  lw	$[[REG_UC:[0-9]+]], 0($[[REG_UC_GOT]])
+; CHECK:  sltu  $[[REG1:[0-9]+]], $[[REG_UD]], $[[REG_UC]]
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; ult: stores zext(icmp ult i32 @uc, @ud) to @b1; expects sltu on (uc, ud), then andi with 1.
+define void @ult()  {
+entry:
+; CHECK-LABEL:  .ent  ult
+  %0 = load i32* @uc, align 4
+  %1 = load i32* @ud, align 4
+  %cmp = icmp ult i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_UD_GOT:[0-9]+]], %got(ud)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UC_GOT:[0-9]+]], %got(uc)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UD:[0-9]+]], 0($[[REG_UD_GOT]])
+; CHECK-DAG:  lw	$[[REG_UC:[0-9]+]], 0($[[REG_UC_GOT]])
+; CHECK:  sltu  $[[REG1:[0-9]+]], $[[REG_UC]], $[[REG_UD]]
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; uge: stores zext(icmp uge i32 @uc, @ud) to @b1; expects sltu on (uc, ud), xori with 1, then andi with 1.
+define void @uge()  {
+entry:
+; CHECK-LABEL:  .ent  uge
+  %0 = load i32* @uc, align 4
+  %1 = load i32* @ud, align 4
+  %cmp = icmp uge i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_UD_GOT:[0-9]+]], %got(ud)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UC_GOT:[0-9]+]], %got(uc)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UD:[0-9]+]], 0($[[REG_UD_GOT]])
+; CHECK-DAG:  lw	$[[REG_UC:[0-9]+]], 0($[[REG_UC_GOT]])
+; CHECK:  sltu  $[[REG1:[0-9]+]], $[[REG_UC]], $[[REG_UD]]
+; CHECK:  xori  $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; ule: stores zext(icmp ule i32 @uc, @ud) to @b1; expects sltu with swapped operands (ud, uc), xori with 1, then andi with 1.
+define void @ule()  {
+entry:
+; CHECK-LABEL:  .ent  ule
+  %0 = load i32* @uc, align 4
+  %1 = load i32* @ud, align 4
+  %cmp = icmp ule i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_UD_GOT:[0-9]+]], %got(ud)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UC_GOT:[0-9]+]], %got(uc)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_UD:[0-9]+]], 0($[[REG_UD_GOT]])
+; CHECK-DAG:  lw	$[[REG_UC:[0-9]+]], 0($[[REG_UC_GOT]])
+; CHECK:  sltu  $[[REG1:[0-9]+]], $[[REG_UD]], $[[REG_UC]]
+; CHECK:  xori  $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The sltu can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; sgt: stores zext(icmp sgt i32 @c, @d) to @b1; expects slt with swapped operands (d, c), then andi with 1.
+define void @sgt()  {
+entry:
+; CHECK-LABEL:  .ent sgt
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp sgt i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9]+]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9]+]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  slt  $[[REG1:[0-9]+]], $[[REG_D]], $[[REG_C]]
+; FIXME: This instruction is redundant. The slt can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; slt: stores zext(icmp slt i32 @c, @d) to @b1; expects slt on (c, d), then andi with 1.
+define void @slt()  {
+entry:
+; CHECK-LABEL:  .ent slt
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp slt i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9]+]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9]+]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  slt  $[[REG1:[0-9]+]], $[[REG_C]], $[[REG_D]]
+; FIXME: This instruction is redundant. The slt can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}
+
+; sge: stores zext(icmp sge i32 @c, @d) to @b1; expects slt on (c, d), xori with 1, then andi with 1.
+define void @sge()  {
+entry:
+; CHECK-LABEL:  .ent sge
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp sge i32 %0, %1
+  %conv = zext i1 %cmp to i32
+  store i32 %conv, i32* @b1, align 4
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9]+]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9]+]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:  slt  $[[REG1:[0-9]+]], $[[REG_C]], $[[REG_D]]
+; CHECK:  xori  $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The slt can only produce 0 and 1.
+; CHECK:  andi  ${{[0-9]+}}, $[[REG2]], 1
+  ret void
+}
+
+; sle: stores zext(icmp sle i32 @c, @d) to @b1; expects slt with swapped operands (d, c), xori with 1, then andi with 1.
+define void @sle()  {
+entry:
+; CHECK-LABEL:  .ent sle
+  %0 = load i32* @c, align 4
+  %1 = load i32* @d, align 4
+  %cmp = icmp sle i32 %0, %1
+  %conv = zext i1 %cmp to i32
+; CHECK-DAG:  lw	$[[REG_D_GOT:[0-9]+]], %got(d)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_C_GOT:[0-9]+]], %got(c)(${{[0-9]+}})
+; CHECK-DAG:  lw	$[[REG_D:[0-9]+]], 0($[[REG_D_GOT]])
+; CHECK-DAG:  lw	$[[REG_C:[0-9]+]], 0($[[REG_C_GOT]])
+; CHECK:        slt     $[[REG1:[0-9]+]], $[[REG_D]], $[[REG_C]]
+; CHECK:        xori    $[[REG2:[0-9]+]], $[[REG1]], 1
+; FIXME: This instruction is redundant. The slt can only produce 0 and 1.
+; CHECK:        andi    ${{[0-9]+}}, $[[REG2]], 1
+  store i32 %conv, i32* @b1, align 4
+  ret void
+}

diff --git a/test/CodeGen/Mips/Fast-ISel/loadstore2.ll b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
index f113a0e..d84478b 100644
--- a/test/CodeGen/Mips/Fast-ISel/loadstore2.ll
+++ b/test/CodeGen/Mips/Fast-ISel/loadstore2.ll

@@ -6,6 +6,8 @@
 @c1 = common global i8 0, align 1
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
 
 @s2 = common global i16 0, align 2
 @s1 = common global i16 0, align 2

diff --git a/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll b/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll
new file mode 100644
index 0000000..f7f2c64
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/loadstoreconv.ll

@@ -0,0 +1,179 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32r2
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32
+
+@b2 = global i8 0, align 1
+@b1 = global i8 1, align 1
+@uc1 = global i8 0, align 1
+@uc2 = global i8 -1, align 1
+@sc1 = global i8 -128, align 1
+@sc2 = global i8 127, align 1
+@ss1 = global i16 -32768, align 2
+@ss2 = global i16 32767, align 2
+@us1 = global i16 0, align 2
+@us2 = global i16 -1, align 2
+@ssi = global i16 0, align 2
+@ssj = global i16 0, align 2
+@i = global i32 0, align 4
+@j = global i32 0, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
+@.str1 = private unnamed_addr constant [7 x i8] c"%i %i\0A\00", align 1
+
+; _Z3b_iv: copies @b1 to @b2 through an i1, then widens @b2 to i32 in @i; expects lbu, andi with 1, then sb.
+define void @_Z3b_iv()  {
+entry:
+; CHECK-LABEL:   .ent  _Z3b_iv
+  %0 = load i8* @b1, align 1
+  %tobool = trunc i8 %0 to i1
+  %frombool = zext i1 %tobool to i8
+  store i8 %frombool, i8* @b2, align 1
+  %1 = load i8* @b2, align 1
+  %tobool1 = trunc i8 %1 to i1
+  %conv = zext i1 %tobool1 to i32
+  store i32 %conv, i32* @i, align 4
+; CHECK:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  $[[REG2:[0-9]+]], $[[REG1]], 1
+; CHECK:  sb  $[[REG2]], 0(${{[0-9]+}})
+
+
+
+  ret void
+; CHECK:   .end  _Z3b_iv
+}
+
+; _Z4uc_iv: zero-extends the i8 globals @uc1 and @uc2 into @i and @j; expects lbu followed by andi with 255.
+define void @_Z4uc_iv()  {
+entry:
+; CHECK-LABEL:  .ent  _Z4uc_iv
+
+  %0 = load i8* @uc1, align 1
+  %conv = zext i8 %0 to i32
+  store i32 %conv, i32* @i, align 4
+  %1 = load i8* @uc2, align 1
+  %conv1 = zext i8 %1 to i32
+; CHECK:   lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 255
+
+  store i32 %conv1, i32* @j, align 4
+  ret void
+; CHECK:  .end  _Z4uc_iv
+
+}
+
+; _Z4sc_iv: sign-extends the i8 globals @sc1 and @sc2 into @i and @j; expects lbu + seb on r2, lbu + sll/sra by 24 otherwise.
+define void @_Z4sc_iv()  {
+entry:
+; mips32r2-LABEL:  .ent  _Z4sc_iv
+; mips32-LABEL:  .ent  _Z4sc_iv
+
+  %0 = load i8* @sc1, align 1
+  %conv = sext i8 %0 to i32
+  store i32 %conv, i32* @i, align 4
+  %1 = load i8* @sc2, align 1
+  %conv1 = sext i8 %1 to i32
+  store i32 %conv1, i32* @j, align 4
+; mips32r2:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32r2:  seb  ${{[0-9]+}}, $[[REG1]]
+; mips32:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32:    sll  $[[REG2:[0-9]+]], $[[REG1]], 24
+; mips32:    sra  ${{[0-9]+}}, $[[REG2]], 24
+
+  ret void
+; CHECK:  .end  _Z4sc_iv
+}
+
+; _Z4us_iv: zero-extends the i16 globals @us1 and @us2 into @i and @j; expects lhu followed by andi with 65535.
+define void @_Z4us_iv()  {
+entry:
+; CHECK-LABEL:  .ent  _Z4us_iv
+  %0 = load i16* @us1, align 2
+  %conv = zext i16 %0 to i32
+  store i32 %conv, i32* @i, align 4
+  %1 = load i16* @us2, align 2
+  %conv1 = zext i16 %1 to i32
+  store i32 %conv1, i32* @j, align 4
+  ret void
+; CHECK:  lhu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 65535
+; CHECK:  .end  _Z4us_iv
+}
+
+; _Z4ss_iv: sign-extends the i16 globals @ss1 and @ss2 into @i and @j; expects lhu + seh on r2, lhu + sll/sra by 16 otherwise.
+define void @_Z4ss_iv()  {
+entry:
+; mips32r2-LABEL:  .ent  _Z4ss_iv
+; mips32-LABEL:  .ent  _Z4ss_iv
+
+  %0 = load i16* @ss1, align 2
+  %conv = sext i16 %0 to i32
+  store i32 %conv, i32* @i, align 4
+  %1 = load i16* @ss2, align 2
+  %conv1 = sext i16 %1 to i32
+  store i32 %conv1, i32* @j, align 4
+; mips32r2:  lhu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32r2:  seh  ${{[0-9]+}}, $[[REG1]]
+; mips32:    lhu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32:    sll  $[[REG2:[0-9]+]], $[[REG1]], 16
+; mips32:    sra  ${{[0-9]+}}, $[[REG2]], 16
+
+  ret void
+; CHECK:  .end  _Z4ss_iv
+}
+
+; _Z4b_ssv: truncates @b2 to i1 and zero-extends it to i16 in @ssi; expects lbu followed by andi with 1.
+define void @_Z4b_ssv()  {
+entry:
+; CHECK-LABEL:  .ent  _Z4b_ssv
+  %0 = load i8* @b2, align 1
+  %tobool = trunc i8 %0 to i1
+  %conv = zext i1 %tobool to i16
+  store i16 %conv, i16* @ssi, align 2
+  ret void
+; CHECK:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 1
+; CHECK:  .end  _Z4b_ssv
+}
+
+; _Z5uc_ssv: zero-extends the i8 globals @uc1 and @uc2 to i16 in @ssi and @ssj; expects lbu followed by andi with 255.
+define void @_Z5uc_ssv()  {
+entry:
+; CHECK-LABEL:  .ent  _Z5uc_ssv
+  %0 = load i8* @uc1, align 1
+  %conv = zext i8 %0 to i16
+  store i16 %conv, i16* @ssi, align 2
+  %1 = load i8* @uc2, align 1
+  %conv1 = zext i8 %1 to i16
+; CHECK:   lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; CHECK:  andi  ${{[0-9]+}}, $[[REG1]], 255
+
+  store i16 %conv1, i16* @ssj, align 2
+  ret void
+; CHECK:  .end  _Z5uc_ssv
+}
+
+; _Z5sc_ssv: sign-extends the i8 globals @sc1 and @sc2 to i16 in @ssi and @ssj; expects lbu + seb on r2, lbu + sll/sra by 24 otherwise.
+define void @_Z5sc_ssv()  {
+entry:
+; mips32r2-LABEL:  .ent  _Z5sc_ssv
+; mips32-LABEL:  .ent  _Z5sc_ssv
+  %0 = load i8* @sc1, align 1
+  %conv = sext i8 %0 to i16
+  store i16 %conv, i16* @ssi, align 2
+  %1 = load i8* @sc2, align 1
+  %conv1 = sext i8 %1 to i16
+  store i16 %conv1, i16* @ssj, align 2
+; mips32r2:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32r2:  seb  ${{[0-9]+}}, $[[REG1]]
+; mips32:  lbu  $[[REG1:[0-9]+]], 0(${{[0-9]+}})
+; mips32:    sll  $[[REG2:[0-9]+]], $[[REG1]], 24
+; mips32:    sra  ${{[0-9]+}}, $[[REG2]], 24
+
+  ret void
+; CHECK:  .end  _Z5sc_ssv
+}
+

diff --git a/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll b/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll
new file mode 100644
index 0000000..93cf4c1
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/loadstrconst.ll

@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
+
+@.str = private unnamed_addr constant [6 x i8] c"hello\00", align 1
+@s = common global i8* null, align 4
+
+; Function Attrs: nounwind. Stores the address of @.str into @s; expects a %got load of $.str followed by addiu with %lo($.str).
+define void @foo() #0 {
+entry:
+  store i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i8** @s, align 4
+  ret void
+; CHECK:        .ent    foo
+; CHECK:        lw      $[[REG1:[0-9]+]], %got($.str)(${{[0-9]+}})
+; CHECK:        addiu   ${{[0-9]+}}, $[[REG1]], %lo($.str)
+
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+

diff --git a/test/CodeGen/Mips/Fast-ISel/nullvoid.ll b/test/CodeGen/Mips/Fast-ISel/nullvoid.ll
index eeaff87..c847561 100644
--- a/test/CodeGen/Mips/Fast-ISel/nullvoid.ll
+++ b/test/CodeGen/Mips/Fast-ISel/nullvoid.ll

@@ -1,5 +1,7 @@
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
 
 ; Function Attrs: nounwind
 define void @foo() {

diff --git a/test/CodeGen/Mips/Fast-ISel/shift.ll b/test/CodeGen/Mips/Fast-ISel/shift.ll
new file mode 100644
index 0000000..18fd5ac
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/shift.ll

@@ -0,0 +1,24 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -O1 -fast-isel=true -mips-fast-isel -filetype=obj %s -o - \
+; RUN:   | llvm-objdump -arch mipsel -mcpu=mips32r2 -d - | FileCheck %s
+
+; This test checks that the srl emitted for the lshr below is encoded correctly when fast-isel for mips32r2 is used; the object file is disassembled with llvm-objdump and the srl line is matched.
+
+%struct.s = type { [4 x i8], i32 }
+
+define i32 @main() nounwind uwtable {
+entry:
+  %foo = alloca %struct.s, align 4
+  %0 = bitcast %struct.s* %foo to i32*
+  %bf.load = load i32* %0, align 4
+  %bf.lshr = lshr i32 %bf.load, 2
+  %cmp = icmp ne i32 %bf.lshr, 2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  unreachable
+
+if.end:
+  ret i32 0
+}
+
+; CHECK: srl    ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}

diff --git a/test/CodeGen/Mips/Fast-ISel/simplestore.ll b/test/CodeGen/Mips/Fast-ISel/simplestore.ll
index 5d52481..83e3f3f 100644
--- a/test/CodeGen/Mips/Fast-ISel/simplestore.ll
+++ b/test/CodeGen/Mips/Fast-ISel/simplestore.ll

@@ -1,5 +1,7 @@
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
 
 @abcd = external global i32
 

diff --git a/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
index 6759c01..74723ae 100644
--- a/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll
+++ b/test/CodeGen/Mips/Fast-ISel/simplestorefp1.ll

@@ -1,5 +1,11 @@
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s 
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32r2 
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s -check-prefix=mips32
 
 @f = common global float 0.000000e+00, align 4
 @de = common global double 0.000000e+00, align 8
@@ -23,15 +29,25 @@
 define void @d1() #0 {
 entry:
   store double 1.234567e+00, double* @de, align 8
-; CHECK:  .ent  d1
-; CHECK:  lui  $[[REG1a:[0-9]+]], 16371
-; CHECK:  ori  $[[REG2a:[0-9]+]], $[[REG1a]], 49353
-; CHECK:  lui  $[[REG1b:[0-9]+]], 21403
-; CHECK:  ori  $[[REG2b:[0-9]+]], $[[REG1b]], 34951
-; CHECK:  mtc1  $[[REG2b]], $f[[REG3:[0-9]+]]
-; CHECK:  mthc1  $[[REG2a]], $f[[REG3]]
-; CHECK:  sdc1  $f[[REG3]], 0(${{[0-9]+}})
-; CHECK:  .end  d1
+; mips32r2:  .ent  d1
+; mips32r2:  lui  $[[REG1a:[0-9]+]], 16371
+; mips32r2:  ori  $[[REG2a:[0-9]+]], $[[REG1a]], 49353
+; mips32r2:  lui  $[[REG1b:[0-9]+]], 21403
+; mips32r2:  ori  $[[REG2b:[0-9]+]], $[[REG1b]], 34951
+; mips32r2:  mtc1  $[[REG2b]], $f[[REG3:[0-9]+]]
+; mips32r2:  mthc1  $[[REG2a]], $f[[REG3]]
+; mips32r2:  sdc1  $f[[REG3]], 0(${{[0-9]+}})
+; mips32r2:  .end  d1
+; mips32:  .ent  d1
+; mips32:  lui  $[[REG1a:[0-9]+]], 16371
+; mips32:  ori  $[[REG2a:[0-9]+]], $[[REG1a]], 49353
+; mips32:  lui  $[[REG1b:[0-9]+]], 21403
+; mips32:  ori  $[[REG2b:[0-9]+]], $[[REG1b]], 34951
+; mips32:  mtc1  $[[REG2b]], $f[[REG3:[0-9]+]]
+; mips32:  mtc1  $[[REG2a]], $f{{[0-9]+}}
+; mips32:  sdc1  $f[[REG3]], 0(${{[0-9]+}})
+; mips32:  .end  d1
+
   ret void
 }
 

diff --git a/test/CodeGen/Mips/Fast-ISel/simplestorei.ll b/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
index 7d2c8e7..128e1de 100644
--- a/test/CodeGen/Mips/Fast-ISel/simplestorei.ll
+++ b/test/CodeGen/Mips/Fast-ISel/simplestorei.ll

@@ -1,5 +1,7 @@
 ; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32r2 \
 ; RUN:     < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -O0 -mips-fast-isel -fast-isel-abort -mcpu=mips32 \
+; RUN:     < %s | FileCheck %s
 
 @ijk = external global i32
 

diff --git a/test/CodeGen/Mips/abicalls.ll b/test/CodeGen/Mips/abicalls.ll
index 6fa33aa..7edc3e2 100644
--- a/test/CodeGen/Mips/abicalls.ll
+++ b/test/CodeGen/Mips/abicalls.ll

@@ -1,16 +1,11 @@
-; 
-; When the assembler is ready a .s file for it will
-; be created.
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=STATIC %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=PIC %s
+; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips4 -relocation-model=static %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=PIC %s
+; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=ABICALLS -check-prefix=PIC %s
 
-; Note that EF_MIPS_CPIC is set by -mabicalls which is the default on Linux
-; TODO need to support -mno-abicalls
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr noabicalls -relocation-model=static %s -o - | FileCheck -implicit-check-not='.abicalls' -implicit-check-not='pic0' %s
 
-; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-STATIC %s
-; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck -check-prefix=CHECK-PIC %s
-; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips4 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-PIC %s
-; RUN: llc -filetype=asm -mtriple mips64el-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | FileCheck -check-prefix=CHECK-PIC %s
+; ABICALLS: .abicalls
 
-; CHECK-STATIC: .abicalls
-; CHECK-STATIC-NEXT: pic0
-; CHECK-PIC: .abicalls
-; CHECK-PIC-NOT: pic0
+; STATIC: pic0
+; PIC-NOT: pic0

diff --git a/test/CodeGen/Mips/abiflags-xx.ll b/test/CodeGen/Mips/abiflags-xx.ll
index b8aa071..c461012 100644
--- a/test/CodeGen/Mips/abiflags-xx.ll
+++ b/test/CodeGen/Mips/abiflags-xx.ll

@@ -1,5 +1,4 @@
 ; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr=fpxx %s -o - | FileCheck %s
-; XFAIL: *
 
 ; CHECK: .nan    legacy
 ; CHECK: .module fp=xx

diff --git a/test/CodeGen/Mips/abiflags32.ll b/test/CodeGen/Mips/abiflags32.ll
index 093964f..e32d4a5 100644
--- a/test/CodeGen/Mips/abiflags32.ll
+++ b/test/CodeGen/Mips/abiflags32.ll

@@ -3,10 +3,15 @@
 ; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips64 -mattr=-n64,n32 %s -o - | FileCheck  -check-prefix=CHECK-64n %s
 
 ; CHECK: .nan    legacy
-; CHECK: .module fp=32
+; We don't emit '.module fp=32' for compatibility with binutils 2.24 which
+; doesn't accept .module.
+; CHECK-NOT: .module fp=32
 
 ; CHECK-64: .nan    legacy
+; We do emit '.module fp=64' though since it contradicts the default value.
 ; CHECK-64: .module fp=64
 
 ; CHECK-64n: .nan    legacy
-; CHECK-64n: .module fp=64
+; We don't emit '.module fp=64' for compatibility with binutils 2.24 which
+; doesn't accept .module.
+; CHECK-64n-NOT: .module fp=64

diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 066d42c..78fd829 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll

@@ -8,11 +8,11 @@
 
 ; Keep one big-endian check so that we don't reduce testing, but don't add more
 ; since endianness doesn't affect the body of the atomic operations.
-; RUN: llc -march=mips   --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=CHECK-EB
+; RUN: llc -march=mips   --disable-machine-licm -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=MIPS32-ANY -check-prefix=NO-SEB-SEH -check-prefix=CHECK-EB
 
 @x = common global i32 0, align 4
 
-define i32 @AtomicLoadAdd32(i32 %incr) nounwind {
+define i32 @AtomicLoadAdd32(i32 signext %incr) nounwind {
 entry:
   %0 = atomicrmw add i32* @x, i32 %incr monotonic
   ret i32 %0
@@ -29,7 +29,7 @@
 ; ALL:           beqz    $[[R2]], $[[BB0]]
 }
 
-define i32 @AtomicLoadNand32(i32 %incr) nounwind {
+define i32 @AtomicLoadNand32(i32 signext %incr) nounwind {
 entry:
   %0 = atomicrmw nand i32* @x, i32 %incr monotonic
   ret i32 %0
@@ -47,7 +47,7 @@
 ; ALL:           beqz    $[[R2]], $[[BB0]]
 }
 
-define i32 @AtomicSwap32(i32 %newval) nounwind {
+define i32 @AtomicSwap32(i32 signext %newval) nounwind {
 entry:
   %newval.addr = alloca i32, align 4
   store i32 %newval, i32* %newval.addr, align 4
@@ -66,7 +66,7 @@
 ; ALL:           beqz    $[[R2]], $[[BB0]]
 }
 
-define i32 @AtomicCmpSwap32(i32 %oldval, i32 %newval) nounwind {
+define i32 @AtomicCmpSwap32(i32 signext %oldval, i32 signext %newval) nounwind {
 entry:
   %newval.addr = alloca i32, align 4
   store i32 %newval, i32* %newval.addr, align 4
@@ -246,6 +246,7 @@
 ; NO-SEB-SEH:    sra     $2, $[[R17]], 24
 
 ; HAS-SEB-SEH:   seb     $2, $[[R16]]
+
 }
 
 define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwind {
@@ -292,6 +293,49 @@
 ; HAS-SEB-SEH:   seb     $2, $[[R17]]
 }
 
+define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) nounwind {
+entry:
+  %0 = cmpxchg i8* %ptr, i8 %oldval, i8 %newval monotonic monotonic
+  %1 = extractvalue { i8, i1 } %0, 1
+  ret i1 %1
+; ALL-LABEL: AtomicCmpSwapRes8
+
+; ALL:           addiu   $[[R1:[0-9]+]], $zero, -4
+; ALL:           and     $[[R2:[0-9]+]], $4, $[[R1]]
+; ALL:           andi    $[[R3:[0-9]+]], $4, 3
+; CHECK-EL:      sll     $[[R5:[0-9]+]], $[[R3]], 3
+; CHECK-EB:      xori    $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK-EB:      sll     $[[R5:[0-9]+]], $[[R4]], 3
+; ALL:           ori     $[[R6:[0-9]+]], $zero, 255
+; ALL:           sllv    $[[R7:[0-9]+]], $[[R6]], $[[R5]]
+; ALL:           nor     $[[R8:[0-9]+]], $zero, $[[R7]]
+; ALL:           andi    $[[R9:[0-9]+]], $5, 255
+; ALL:           sllv    $[[R10:[0-9]+]], $[[R9]], $[[R5]]
+; ALL:           andi    $[[R11:[0-9]+]], $6, 255
+; ALL:           sllv    $[[R12:[0-9]+]], $[[R11]], $[[R5]]
+
+; ALL:       $[[BB0:[A-Z_0-9]+]]:
+; ALL:           ll      $[[R13:[0-9]+]], 0($[[R2]])
+; ALL:           and     $[[R14:[0-9]+]], $[[R13]], $[[R7]]
+; ALL:           bne     $[[R14]], $[[R10]], $[[BB1:[A-Z_0-9]+]]
+
+; ALL:           and     $[[R15:[0-9]+]], $[[R13]], $[[R8]]
+; ALL:           or      $[[R16:[0-9]+]], $[[R15]], $[[R12]]
+; ALL:           sc      $[[R16]], 0($[[R2]])
+; ALL:           beqz    $[[R16]], $[[BB0]]
+
+; ALL:       $[[BB1]]:
+; ALL:           srlv    $[[R17:[0-9]+]], $[[R14]], $[[R5]]
+
+; NO-SEB-SEH:    sll     $[[R18:[0-9]+]], $[[R17]], 24
+; NO-SEB-SEH:    sra     $[[R19:[0-9]+]], $[[R18]], 24
+
+; HAS-SEB-SEH:   seb     $[[R19:[0-9]+]], $[[R17]]
+
+; ALL:           xor     $[[R20:[0-9]+]], $[[R19]], $5
+; ALL:           sltiu   $2, $[[R20]], 1
+}
+
 ; Check one i16 so that we cover the seh sign extend
 @z = common global i16 0, align 1
 
@@ -337,7 +381,7 @@
 
 @countsint = common global i32 0, align 4
 
-define i32 @CheckSync(i32 %v) nounwind noinline {
+define i32 @CheckSync(i32 signext %v) nounwind noinline {
 entry:
   %0 = atomicrmw add i32* @countsint, i32 %v seq_cst
   ret i32 %0 
@@ -371,7 +415,7 @@
 
 ; Check that MIPS32R6 has the correct offset range.
 ; FIXME: At the moment, we don't seem to do addr+offset for any atomic load/store.
-define i32 @AtomicLoadAdd32_OffGt9Bit(i32 %incr) nounwind {
+define i32 @AtomicLoadAdd32_OffGt9Bit(i32 signext %incr) nounwind {
 entry:
   %0 = atomicrmw add i32* getelementptr(i32* @x, i32 256), i32 %incr monotonic
   ret i32 %0

diff --git a/test/CodeGen/Mips/bswap.ll b/test/CodeGen/Mips/bswap.ll
index 812eef1..f182e65 100644
--- a/test/CodeGen/Mips/bswap.ll
+++ b/test/CodeGen/Mips/bswap.ll

@@ -2,7 +2,7 @@
 ; RUN: llc  < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=MIPS64
 ; RUN: llc  < %s -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 | FileCheck %s -check-prefix=MIPS16
 
-define i32 @bswap32(i32 %x) nounwind readnone {
+define i32 @bswap32(i32 signext %x) nounwind readnone {
 entry:
 ; MIPS32-LABEL: bswap32:
 ; MIPS32: wsbh $[[R0:[0-9]+]]
@@ -29,7 +29,7 @@
   ret i32 %or.3
 }
 
-define i64 @bswap64(i64 %x) nounwind readnone {
+define i64 @bswap64(i64 signext %x) nounwind readnone {
 entry:
 ; MIPS32-LABEL: bswap64:
 ; MIPS32: wsbh $[[R0:[0-9]+]]
@@ -72,24 +72,24 @@
 define <4 x i32> @bswapv4i32(<4 x i32> %x) nounwind readnone {
 entry:
 ; MIPS32-LABEL: bswapv4i32:
-; MIPS32: wsbh $[[R0:[0-9]+]]
-; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS32: wsbh $[[R0:[0-9]+]]
-; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS32: wsbh $[[R0:[0-9]+]]
-; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS32: wsbh $[[R0:[0-9]+]]
-; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS32-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS32-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS32-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS32-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS32-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
 
 ; MIPS64-LABEL: bswapv4i32:
-; MIPS64: wsbh $[[R0:[0-9]+]]
-; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS64: wsbh $[[R0:[0-9]+]]
-; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS64: wsbh $[[R0:[0-9]+]]
-; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
-; MIPS64: wsbh $[[R0:[0-9]+]]
-; MIPS64: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS64-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS64-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS64-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
+; MIPS64-DAG: wsbh $[[R0:[0-9]+]]
+; MIPS64-DAG: rotr ${{[0-9]+}}, $[[R0]], 16
 
 ; Don't bother with a MIPS16 version. It's just bswap32 repeated four times and
 ; would be very long

diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll
index 88d1d07..7682a98 100644
--- a/test/CodeGen/Mips/buildpairextractelementf64.ll
+++ b/test/CodeGen/Mips/buildpairextractelementf64.ll

@@ -1,15 +1,19 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK
-; RUN: llc -march=mips  < %s | FileCheck %s -check-prefix=FP32 -check-prefix=CHECK
-; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
-; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64 -check-prefix=CHECK
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=NO-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mips  < %s | FileCheck %s -check-prefix=NO-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck %s -check-prefix=HAS-MFHC1 -check-prefix=ALL
 
 @a = external global i32
 
-; CHECK-LABEL: f:
-; FP32: mtc1
-; FP32: mtc1
-; FP64-DAG: mtc1
-; FP64-DAG: mthc1
+; ALL-LABEL: f:
+
+; NO-MFHC1: mtc1
+; NO-MFHC1: mtc1
+
+; HAS-MFHC1-DAG: mtc1
+; HAS-MFHC1-DAG: mthc1
 
 define double @f(i32 %a1, double %d) nounwind {
 entry:
@@ -18,11 +22,13 @@
   ret double %add
 }
 
-; CHECK-LABEL: f3:
-; FP32: mfc1
-; FP32: mfc1
-; FP64-DAG: mfc1
-; FP64-DAG: mfhc1
+; ALL-LABEL: f3:
+
+; NO-MFHC1: mfc1
+; NO-MFHC1: mfc1
+
+; HAS-MFHC1-DAG: mfc1
+; HAS-MFHC1-DAG: mfhc1
 
 define void @f3(double %d, i32 %a1) nounwind {
 entry:

diff --git a/test/CodeGen/Mips/cconv/arguments-float.ll b/test/CodeGen/Mips/cconv/arguments-float.ll
index e2119ec..14a3baa 100644
--- a/test/CodeGen/Mips/cconv/arguments-float.ll
+++ b/test/CodeGen/Mips/cconv/arguments-float.ll

@@ -69,26 +69,26 @@
 ; O32-DAG:           sw [[R4]], 28([[R2]])
 ; NEW-DAG:           sd $6, 24([[R2]])
 
-; O32-DAG:           lw [[R3:\$[0-9]+]], 32($sp)
-; O32-DAG:           lw [[R4:\$[0-9]+]], 36($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 32($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 36($sp)
 ; O32-DAG:           sw [[R3]], 32([[R2]])
 ; O32-DAG:           sw [[R4]], 36([[R2]])
 ; NEW-DAG:           sd $7, 32([[R2]])
 
-; O32-DAG:           lw [[R3:\$[0-9]+]], 40($sp)
-; O32-DAG:           lw [[R4:\$[0-9]+]], 44($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 40($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 44($sp)
 ; O32-DAG:           sw [[R3]], 40([[R2]])
 ; O32-DAG:           sw [[R4]], 44([[R2]])
 ; NEW-DAG:           sd $8, 40([[R2]])
 
-; O32-DAG:           lw [[R3:\$[0-9]+]], 48($sp)
-; O32-DAG:           lw [[R4:\$[0-9]+]], 52($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 48($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 52($sp)
 ; O32-DAG:           sw [[R3]], 48([[R2]])
 ; O32-DAG:           sw [[R4]], 52([[R2]])
 ; NEW-DAG:           sd $9, 48([[R2]])
 
-; O32-DAG:           lw [[R3:\$[0-9]+]], 56($sp)
-; O32-DAG:           lw [[R4:\$[0-9]+]], 60($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 56($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 60($sp)
 ; O32-DAG:           sw [[R3]], 56([[R2]])
 ; O32-DAG:           sw [[R4]], 60([[R2]])
 ; NEW-DAG:           sd $10, 56([[R2]])
@@ -135,8 +135,8 @@
 ; SYM64-DAG:           ld [[R2:\$[0-9]]], %got_disp(floats)(
 
 ; The first four arguments are the same in O32/N32/N64.
-; The first argument isn't floating point so floating point registers are not
-; used.
+; The first argument is floating point but soft-float is enabled so floating
+; point registers are not used.
 ; MD00305 and GCC disagree on this one. MD00305 says that floats are treated
 ; as 8-byte aligned and occupy two slots on O32. GCC is treating them as 4-byte
 ; aligned and occupying one slot. We'll use GCC's definition.
@@ -195,7 +195,7 @@
 ; O32-DAG:           sw $7, 12([[R2]])
 ; NEW-DAG:           sd $5, 8([[R2]])
 
-define void @float_arg2(i8 %a, float %b) nounwind {
+define void @float_arg2(i8 signext %a, float %b) nounwind {
 entry:
         %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
         store volatile i8 %a, i8* %0

diff --git a/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
index aadf7d1..70ccf14 100644
--- a/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll
+++ b/test/CodeGen/Mips/cconv/arguments-hard-float-varargs.ll

@@ -4,11 +4,11 @@
 ; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 ; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWBE %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=N32 --check-prefix=NEW --check-prefix=NEWLE %s
 
-; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW %s
-; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW %s
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWBE %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM64 --check-prefix=N64 --check-prefix=NEW --check-prefix=NEWLE %s
 
 ; Test the effect of varargs on floating point types in the non-variable part
 ; of the argument list as specified by section 2 of the MIPSpro N32 Handbook.
@@ -34,6 +34,7 @@
         %b = va_arg i8** %ap, double
         %1 = getelementptr [11 x double]* @doubles, i32 0, i32 2
         store volatile double %b, double* %1
+        call void @llvm.va_end(i8* %ap2)
         ret void
 }
 
@@ -98,6 +99,7 @@
         %b = va_arg i8** %ap, float
         %1 = getelementptr [11 x float]* @floats, i32 0, i32 2
         store volatile float %b, float* %1
+        call void @llvm.va_end(i8* %ap2)
         ret void
 }
 
@@ -140,16 +142,18 @@
 ; Increment the pointer then get the varargs arg
 ; LLVM will rebind the load to the stack pointer instead of the varargs pointer
 ; during lowering. This is fine and doesn't change the behaviour.
-; N32/N64 is using ori instead of addiu/daddiu but (although odd) this is fine
-; since the stack is always aligned.
+; Also, in big-endian mode the offset must be increased by 4 to retrieve the
+; correct half of the argument slot.
+;
 ; O32-DAG:           addiu [[VAPTR]], [[VAPTR]], 4
 ; O32-DAG:           sw [[VAPTR]], 4($sp)
-; N32-DAG:           ori [[VAPTR]], [[VAPTR]], 4
+; N32-DAG:           addiu [[VAPTR]], [[VAPTR]], 8
 ; N32-DAG:           sw [[VAPTR]], 4($sp)
-; N64-DAG:           ori [[VAPTR]], [[VAPTR]], 4
+; N64-DAG:           daddiu [[VAPTR]], [[VAPTR]], 8
 ; N64-DAG:           sd [[VAPTR]], 0($sp)
 ; O32-DAG:           lwc1 [[FTMP1:\$f[0-9]+]], 12($sp)
-; NEW-DAG:           lwc1 [[FTMP1:\$f[0-9]+]], 8($sp)
+; NEWLE-DAG:         lwc1 [[FTMP1:\$f[0-9]+]], 8($sp)
+; NEWBE-DAG:         lwc1 [[FTMP1:\$f[0-9]+]], 12($sp)
 ; ALL-DAG:           swc1 [[FTMP1]], 8([[R2]])
 
 declare void @llvm.va_start(i8*)

diff --git a/test/CodeGen/Mips/cconv/arguments-varargs.ll b/test/CodeGen/Mips/cconv/arguments-varargs.ll
new file mode 100644
index 0000000..adacda5
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/arguments-varargs.ll

@@ -0,0 +1,1104 @@
+; RUN: llc -mtriple=mips-linux -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-BE %s
+; RUN: llc -mtriple=mipsel-linux -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-LE %s
+
+; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -mtriple=mips64-linux -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-BE %s
+; RUN: llc -mtriple=mips64el-linux -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N32 --check-prefix=NEW-LE %s
+
+; RUN: llc -march=mips64 -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-BE %s
+; RUN: llc -march=mips64el -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=NEW --check-prefix=N64 --check-prefix=NEW-LE %s
+
+@hwords = global [3 x i16] zeroinitializer, align 1
+@words  = global [3 x i32] zeroinitializer, align 1
+@dwords = global [3 x i64] zeroinitializer, align 1
+
+define void @fn_i16_dotdotdot_i16(i16 %a, ...) {
+entry:
+; ALL-LABEL: fn_i16_dotdotdot_i16:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(hwords)(
+
+; ALL-DAG:       sh [[ARG1]], 2([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sh [[ARG2]], 4([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i16
+  %e1 = getelementptr [3 x i16]* @hwords, i32 0, i32 1
+  store volatile i16 %arg1, i16* %e1, align 2
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i16
+  %e2 = getelementptr [3 x i16]* @hwords, i32 0, i32 2
+  store volatile i16 %arg2, i16* %e2, align 2
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i16_dotdotdot_i32(i16 %a, ...) {
+entry:
+; ALL-LABEL: fn_i16_dotdotdot_i32:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(words)(
+
+; ALL-DAG:       sw [[ARG1]], 4([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sw [[ARG2]], 8([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i32
+  %e1 = getelementptr [3 x i32]* @words, i32 0, i32 1
+  store volatile i32 %arg1, i32* %e1, align 4
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i32
+  %e2 = getelementptr [3 x i32]* @words, i32 0, i32 2
+  store volatile i32 %arg2, i32* %e2, align 4
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i16_dotdotdot_i64(i16 %a, ...) {
+entry:
+; ALL-LABEL: fn_i16_dotdotdot_i64:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]] (and realign pointer for O32)
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion and copy it to the global.
+; This has used the stack pointer directly rather than the [[VA]] we just set
+; up.
+; Unlike the i16/i32 cases, N32/N64 need no big-endian offset adjustment here:
+; the i64 argument occupies the whole 8-byte slot and is loaded with ld.
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 8([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 12([[GV]])
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(dwords)(
+; NEW-DAG:       ld [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-DAG:       sd [[ARG1]], 8([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion and copy it to the global.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 16([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 20([[GV]])
+
+; NEW-DAG:       ld [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-DAG:       sd [[ARG2]], 16([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i64
+  %e1 = getelementptr [3 x i64]* @dwords, i32 0, i32 1
+  store volatile i64 %arg1, i64* %e1, align 8
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i64
+  %e2 = getelementptr [3 x i64]* @dwords, i32 0, i32 2
+  store volatile i64 %arg2, i64* %e2, align 8
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i32_dotdotdot_i16(i32 %a, ...) {
+entry:
+; ALL-LABEL: fn_i32_dotdotdot_i16:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; The load goes through the [[VA]] pointer read back from the stack slot,
+; not through the incremented [[VA2]].
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(hwords)(
+
+; ALL-DAG:       sh [[ARG1]], 2([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sh [[ARG2]], 4([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i16
+  %e1 = getelementptr [3 x i16]* @hwords, i32 0, i32 1
+  store volatile i16 %arg1, i16* %e1, align 2
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i16
+  %e2 = getelementptr [3 x i16]* @hwords, i32 0, i32 2
+  store volatile i16 %arg2, i16* %e2, align 2
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i32_dotdotdot_i32(i32 %a, ...) {
+entry:
+; ALL-LABEL: fn_i32_dotdotdot_i32:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; The load goes through the [[VA]] pointer read back from the stack slot,
+; not through the incremented [[VA2]].
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(words)(
+
+; ALL-DAG:       sw [[ARG1]], 4([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sw [[ARG2]], 8([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i32
+  %e1 = getelementptr [3 x i32]* @words, i32 0, i32 1
+  store volatile i32 %arg1, i32* %e1, align 4
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i32
+  %e2 = getelementptr [3 x i32]* @words, i32 0, i32 2
+  store volatile i32 %arg2, i32* %e2, align 4
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i32_dotdotdot_i64(i32 %a, ...) {
+entry:
+; ALL-LABEL: fn_i32_dotdotdot_i64:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+; O32-DAG:       sw $5, 12([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 12 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
+; fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 12
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]] (and realign pointer for O32)
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion and copy it to the global.
+; The loads go through the [[VA]] pointer read back from the stack slot.
+; O32 copies the value as two 32-bit words, re-reading and bumping [[VA]] in
+; between; N32/N64 use a single ld at offset 0, so no big-endian offset
+; adjustment is involved.
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 8([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 12([[GV]])
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(dwords)(
+; NEW-DAG:       ld [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-DAG:       sd [[ARG1]], 8([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion and copy it to the global.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 16([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 20([[GV]])
+
+; NEW-DAG:       ld [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-DAG:       sd [[ARG2]], 16([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i64
+  %e1 = getelementptr [3 x i64]* @dwords, i32 0, i32 1
+  store volatile i64 %arg1, i64* %e1, align 8
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i64
+  %e2 = getelementptr [3 x i64]* @dwords, i32 0, i32 2
+  store volatile i64 %arg2, i64* %e2, align 8
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i64_dotdotdot_i16(i64 %a, ...) {
+entry:
+; ALL-LABEL: fn_i64_dotdotdot_i16:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 16 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
+; first fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 16
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; The load goes through the [[VA]] pointer read back from the stack slot,
+; not through the incremented [[VA2]].
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(hwords)(
+
+; ALL-DAG:       sh [[ARG1]], 2([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sh [[ARG2]], 4([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i16
+  %e1 = getelementptr [3 x i16]* @hwords, i32 0, i32 1
+  store volatile i16 %arg1, i16* %e1, align 2
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i16
+  %e2 = getelementptr [3 x i16]* @hwords, i32 0, i32 2
+  store volatile i16 %arg2, i16* %e2, align 2
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i64_dotdotdot_i32(i64 %a, ...) {
+entry:
+; ALL-LABEL: fn_i64_dotdotdot_i32:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 16 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
+; first fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 16
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]]
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion.
+; The load goes through the [[VA]] pointer read back from the stack slot,
+; not through the incremented [[VA2]].
+; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
+; order.
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-BE-DAG:    lw [[ARG1:\$[0-9]+]], 4([[VA]])
+
+; Copy the arg to the global
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
+
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(words)(
+
+; ALL-DAG:       sw [[ARG1]], 4([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+
+; NEW-LE-DAG:    lw [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-BE-DAG:    lw [[ARG2:\$[0-9]+]], 4([[VA2]])
+
+; Copy the arg to the global
+; ALL-DAG:       sw [[ARG2]], 8([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i32
+  %e1 = getelementptr [3 x i32]* @words, i32 0, i32 1
+  store volatile i32 %arg1, i32* %e1, align 4
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i32
+  %e2 = getelementptr [3 x i32]* @words, i32 0, i32 2
+  store volatile i32 %arg2, i32* %e2, align 4
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+define void @fn_i64_dotdotdot_i64(i64 %a, ...) {
+entry:
+; ALL-LABEL: fn_i64_dotdotdot_i64:
+
+; Set up the stack with an 8-byte local area. N32/N64 must also make room for
+; the argument save area (56 bytes).
+; O32:           addiu  [[SP:\$sp]], $sp, -8
+; N32:           addiu  [[SP:\$sp]], $sp, -64
+; N64:           daddiu  [[SP:\$sp]], $sp, -64
+
+; Save variable argument portion on the stack
+; O32-DAG:       sw $7, 20([[SP]])
+; O32-DAG:       sw $6, 16([[SP]])
+
+; NEW-DAG:       sd $11, 56([[SP]])
+; NEW-DAG:       sd $10, 48([[SP]])
+; NEW-DAG:       sd $9, 40([[SP]])
+; NEW-DAG:       sd $8, 32([[SP]])
+; NEW-DAG:       sd $7, 24([[SP]])
+; NEW-DAG:       sd $6, 16([[SP]])
+; NEW-DAG:       sd $5, 8([[SP]])
+
+; Initialize variable argument pointer.
+; For O32, the offset is 16 due to the 4 bytes used to store local variables,
+; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
+; first fixed argument.
+; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
+; space.
+; O32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 16
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; N32-DAG:       addiu [[VA:\$[0-9]+]], [[SP]], 8
+; N32-DAG:       sw [[VA]], 0([[SP]])
+
+; N64-DAG:       daddiu [[VA:\$[0-9]+]], [[SP]], 8
+; N64-DAG:       sd [[VA]], 0([[SP]])
+
+; Store [[VA]]
+; O32-DAG:       sw [[VA]], 0([[SP]])
+
+; ALL: # ANCHOR1
+
+; Increment [[VA]] (and realign pointer for O32)
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N64-DAG:       ld [[VA:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA2:\$[0-9]+]], [[VA]], 8
+; N64-DAG:       sd [[VA2]], 0([[SP]])
+
+; Load the first argument from the variable portion and copy it to the global.
+; The loads go through the [[VA]] pointer read back from the stack slot.
+; O32 copies the value as two 32-bit words, re-reading and bumping [[VA]] in
+; between; N32/N64 use a single ld at offset 0, so no big-endian offset
+; adjustment is involved.
+; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 8([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG1]], 12([[GV]])
+
+; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
+; N64-DAG:       ld [[GV:\$[0-9]+]], %got_disp(dwords)(
+; NEW-DAG:       ld [[ARG1:\$[0-9]+]], 0([[VA]])
+; NEW-DAG:       sd [[ARG1]], 8([[GV]])
+
+; ALL: # ANCHOR2
+
+; Increment [[VA]] again.
+; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
+; O32:           lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
+; O32-DAG:       and   [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
+; O32-DAG:       ori   [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+
+; N32-DAG:       lw [[VA2:\$[0-9]+]], 0([[SP]])
+; N32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N32-DAG:       sw [[VA3]], 0([[SP]])
+
+; N64-DAG:       ld [[VA2:\$[0-9]+]], 0([[SP]])
+; N64-DAG:       daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
+; N64-DAG:       sd [[VA3]], 0([[SP]])
+
+; Load the second argument from the variable portion and copy it to the global.
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 16([[GV]])
+; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
+; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       sw [[ARG2]], 20([[GV]])
+
+; NEW-DAG:       ld [[ARG2:\$[0-9]+]], 0([[VA2]])
+; NEW-DAG:       sd [[ARG2]], 16([[GV]])
+
+  %ap = alloca i8*, align 8
+  %ap2 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap2)
+
+  call void asm sideeffect "# ANCHOR1", ""()
+  %arg1 = va_arg i8** %ap, i64
+  %e1 = getelementptr [3 x i64]* @dwords, i32 0, i32 1
+  store volatile i64 %arg1, i64* %e1, align 8
+
+  call void asm sideeffect "# ANCHOR2", ""()
+  %arg2 = va_arg i8** %ap, i64
+  %e2 = getelementptr [3 x i64]* @dwords, i32 0, i32 2
+  store volatile i64 %arg2, i64* %e2, align 8
+
+  call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_end(i8*)

diff --git a/test/CodeGen/Mips/cconv/arguments.ll b/test/CodeGen/Mips/cconv/arguments.ll
index 8fe29f3..43da604 100644
--- a/test/CodeGen/Mips/cconv/arguments.ll
+++ b/test/CodeGen/Mips/cconv/arguments.ll

@@ -1,5 +1,5 @@
-; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32BE %s
-; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 --check-prefix=O32LE %s
+; RUN: llc -march=mips -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
+; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 
 ; RUN-TODO: llc -march=mips64 -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
 ; RUN-TODO: llc -march=mips64el -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=SYM32 --check-prefix=O32 %s
@@ -23,8 +23,10 @@
 @floats = global [11 x float] zeroinitializer
 @doubles = global [11 x double] zeroinitializer
 
-define void @align_to_arg_slots(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g,
-                                i8 %h, i8 %i, i8 %j) nounwind {
+define void @align_to_arg_slots(i8 signext %a, i8 signext %b, i8 signext %c,
+                                i8 signext %d, i8 signext %e, i8 signext %f,
+                                i8 signext %g, i8 signext %h, i8 signext %i,
+                                i8 signext %j) nounwind {
 entry:
         %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
         store volatile i8 %a, i8* %0
@@ -53,7 +55,7 @@
 ; We won't test the way the global address is calculated in this test. This is
 ; just to get the register number for the other checks.
 ; SYM32-DAG:           addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
-; SYM64-DAG:           ld [[R1:\$[0-9]]], %got_disp(bytes)(
+; SYM64-DAG:           ld [[R1:\$[0-9]+]], %got_disp(bytes)(
 
 ; The first four arguments are the same in O32/N32/N64
 ; ALL-DAG:           sb $4, 1([[R1]])
@@ -82,15 +84,16 @@
 ; increase by 4 for O32 and 8 for N32/N64.
 ; O32-DAG:           lw [[R3:\$[0-9]+]], 32($sp)
 ; O32-DAG:           sb [[R3]], 9([[R1]])
-; NEW-DAG:           lw [[R3:\$[0-9]+]], 0($sp)
+; NEW-DAG:           ld [[R3:\$[0-9]+]], 0($sp)
 ; NEW-DAG:           sb [[R3]], 9([[R1]])
 ; O32-DAG:           lw [[R3:\$[0-9]+]], 36($sp)
 ; O32-DAG:           sb [[R3]], 10([[R1]])
-; NEW-DAG:           lw [[R3:\$[0-9]+]], 8($sp)
+; NEW-DAG:           ld [[R3:\$[0-9]+]], 8($sp)
 ; NEW-DAG:           sb [[R3]], 10([[R1]])
 
-define void @slot_skipping(i8 %a, i64 %b, i8 %c, i8 %d,
-                           i8 %e, i8 %f, i8 %g, i64 %i, i8 %j) nounwind {
+define void @slot_skipping(i8 signext %a, i64 signext %b, i8 signext %c,
+                           i8 signext %d, i8 signext %e, i8 signext %f,
+                           i8 signext %g, i64 signext %i, i8 signext %j) nounwind {
 entry:
         %0 = getelementptr [11 x i8]* @bytes, i32 0, i32 1
         store volatile i8 %a, i8* %0
@@ -117,9 +120,9 @@
 ; We won't test the way the global address is calculated in this test. This is
 ; just to get the register number for the other checks.
 ; SYM32-DAG:           addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes)
-; SYM64-DAG:           ld [[R1:\$[0-9]]], %got_disp(bytes)(
+; SYM64-DAG:           ld [[R1:\$[0-9]+]], %got_disp(bytes)(
 ; SYM32-DAG:           addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
-; SYM64-DAG:           ld [[R2:\$[0-9]]], %got_disp(dwords)(
+; SYM64-DAG:           ld [[R2:\$[0-9]+]], %got_disp(dwords)(
 
 ; The first argument is the same in O32/N32/N64.
 ; ALL-DAG:           sb $4, 1([[R1]])
@@ -137,8 +140,7 @@
 ; It's not clear why O32 uses lbu for this argument, but it's not wrong so we'll
 ; accept it for now. The only IR difference is that this argument has
 ; anyext from i8 and align 8 on it.
-; O32LE-DAG:           lbu [[R3:\$[0-9]+]], 16($sp)
-; O32BE-DAG:           lbu [[R3:\$[0-9]+]], 19($sp)
+; O32-DAG:           lw [[R3:\$[0-9]+]], 16($sp)
 ; O32-DAG:           sb [[R3]], 2([[R1]])
 ; NEW-DAG:           sb $6, 2([[R1]])
 ; O32-DAG:           lw [[R3:\$[0-9]+]], 20($sp)
@@ -166,5 +168,5 @@
 ; increase by 4 for O32 and 8 for N32/N64.
 ; O32-DAG:           lw [[R3:\$[0-9]+]], 48($sp)
 ; O32-DAG:           sb [[R3]], 7([[R1]])
-; NEW-DAG:           lw [[R3:\$[0-9]+]], 0($sp)
+; NEW-DAG:           ld [[R3:\$[0-9]+]], 0($sp)
 ; NEW-DAG:           sb [[R3]], 7([[R1]])

diff --git a/test/CodeGen/Mips/cconv/return-float.ll b/test/CodeGen/Mips/cconv/return-float.ll
index 28cf83d..d1a5e4f 100644
--- a/test/CodeGen/Mips/cconv/return-float.ll
+++ b/test/CodeGen/Mips/cconv/return-float.ll

@@ -30,7 +30,7 @@
 ; O32-DAG:           lw $2, %lo(float)([[R1]])
 ; N32-DAG:           lui [[R1:\$[0-9]+]], %hi(float)
 ; N32-DAG:           lw $2, %lo(float)([[R1]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(float)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(float)(
 ; N64-DAG:           lw $2, 0([[R1]])
 
 define double @retdouble() nounwind {
@@ -44,5 +44,5 @@
 ; O32-DAG:           addiu [[R2:\$[0-9]+]], [[R1]], %lo(double)
 ; O32-DAG:           lw $3, 4([[R2]])
 ; N32-DAG:           ld $2, %lo(double)([[R1:\$[0-9]+]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(double)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(double)(
 ; N64-DAG:           ld $2, 0([[R1]])

diff --git a/test/CodeGen/Mips/cconv/return-hard-float.ll b/test/CodeGen/Mips/cconv/return-hard-float.ll
index 371b3a5..123b499 100644
--- a/test/CodeGen/Mips/cconv/return-hard-float.ll
+++ b/test/CodeGen/Mips/cconv/return-hard-float.ll

@@ -10,6 +10,9 @@
 ; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 ; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
 
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=O32FP64 %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static -mattr=+o32,+fp64 < %s | FileCheck --check-prefix=ALL --check-prefix=O32FP64 %s
+
 ; Test the float returns for all ABI's and byte orders as specified by
 ; section 5 of MD00305 (MIPS ABIs Described).
 
@@ -30,7 +33,7 @@
 ; O32-DAG:           lwc1 $f0, %lo(float)([[R1]])
 ; N32-DAG:           lui [[R1:\$[0-9]+]], %hi(float)
 ; N32-DAG:           lwc1 $f0, %lo(float)([[R1]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(float)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(float)(
 ; N64-DAG:           lwc1 $f0, 0([[R1]])
 
 define double @retdouble() nounwind {
@@ -42,5 +45,15 @@
 ; ALL-LABEL: retdouble:
 ; O32-DAG:           ldc1 $f0, %lo(double)([[R1:\$[0-9]+]])
 ; N32-DAG:           ldc1 $f0, %lo(double)([[R1:\$[0-9]+]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(double)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(double)(
 ; N64-DAG:           ldc1 $f0, 0([[R1]])
+
+define { double, double } @retComplexDouble() #0 {
+  %retval = alloca { double, double }, align 8
+  %1 = load { double, double }* %retval
+  ret { double, double } %1
+}
+
+; ALL-LABEL: retComplexDouble:
+; O32FP64-DAG:      ldc1     $f0, 0($sp)
+; O32FP64-DAG:      ldc1     $f2, 8($sp)

diff --git a/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll b/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll
new file mode 100644
index 0000000..2e84477
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/return-hard-struct-f128.ll

@@ -0,0 +1,36 @@
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 %s
+
+; Test return of {fp128} agrees with de-facto N32/N64 ABI.
+
+@struct_fp128 = global {fp128} zeroinitializer
+
+define inreg {fp128} @ret_struct_fp128() nounwind {
+entry:
+        %0 = load volatile {fp128}* @struct_fp128
+        ret {fp128} %0
+}
+
+; ALL-LABEL: ret_struct_fp128:
+
+; O32 generates different IR so we don't test it here. It returns the struct
+; indirectly.
+
+; Contrary to the N32/N64 ABI documentation, a struct containing a long double
+; is returned in $f0, and $f1 instead of the usual $f0, and $f2. This is to
+; match the de facto ABI as implemented by GCC.
+; N32-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_fp128)
+; N32-DAG:        ld  [[R2:\$[0-9]+]], %lo(struct_fp128)([[R1]])
+; N32-DAG:        dmtc1 [[R2]], $f0
+; N32-DAG:        addiu [[R3:\$[0-9]+]], [[R1]], %lo(struct_fp128)
+; N32-DAG:        ld  [[R4:\$[0-9]+]], 8([[R3]])
+; N32-DAG:        dmtc1 [[R4]], $f1
+
+; N64-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_fp128)($1)
+; N64-DAG:        ld  [[R2:\$[0-9]+]], 0([[R1]])
+; N64-DAG:        dmtc1 [[R2]], $f0
+; N64-DAG:        ld  [[R4:\$[0-9]+]], 8([[R1]])
+; N64-DAG:        dmtc1 [[R4]], $f1

diff --git a/test/CodeGen/Mips/cconv/return-struct.ll b/test/CodeGen/Mips/cconv/return-struct.ll
new file mode 100644
index 0000000..11a8cf0
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/return-struct.ll

@@ -0,0 +1,232 @@
+; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-BE %s
+; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static < %s | FileCheck --check-prefix=ALL --check-prefix=O32 --check-prefix=O32-LE %s
+
+; RUN-TODO: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+; RUN-TODO: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+o32 < %s | FileCheck --check-prefix=ALL --check-prefix=O32 %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-BE %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n32 < %s | FileCheck --check-prefix=ALL --check-prefix=N32 --check-prefix=N32-LE %s
+
+; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-BE %s
+; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -mattr=-n64,+n64 < %s | FileCheck --check-prefix=ALL --check-prefix=N64 --check-prefix=N64-LE %s
+
+; Test struct returns for all ABI's and byte orders.
+
+@struct_byte = global {i8} zeroinitializer
+@struct_2byte = global {i8,i8} zeroinitializer
+@struct_3xi16 = global {[3 x i16]} zeroinitializer
+@struct_6xi32 = global {[6 x i32]} zeroinitializer
+@struct_128xi16 = global {[128 x i16]} zeroinitializer
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+
+define inreg {i8} @ret_struct_i8() nounwind {
+entry:
+        %0 = load volatile {i8}* @struct_byte
+        ret {i8} %0
+}
+
+; ALL-LABEL: ret_struct_i8:
+; O32-DAG:           lui [[R1:\$[0-9]+]], %hi(struct_byte)
+; O32-DAG:           lbu $2, %lo(struct_byte)([[R1]])
+
+; N32-LE-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_byte)
+; N32-LE-DAG:        lb $2, %lo(struct_byte)([[R1]])
+
+; N32-BE-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_byte)
+; N32-BE-DAG:        lb [[R2:\$[0-9]+]], %lo(struct_byte)([[R1]])
+; N32-BE-DAG:        dsll $2, [[R2]], 56
+
+; N64-LE-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_byte)($1)
+; N64-LE-DAG:        lb $2, 0([[R1]])
+
+; N64-BE-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_byte)($1)
+; N64-BE-DAG:        lb [[R2:\$[0-9]+]], 0([[R1]])
+; N64-BE-DAG:        dsll $2, [[R2]], 56
+
+; This test is based on the way clang currently lowers {i8,i8} to {i16}.
+; FIXME: It should probably work without any lowering too, but this doesn't
+;        work as expected. Each member gets mapped to a register rather than
+;        being packed into a single register.
+define inreg {i16} @ret_struct_i16() nounwind {
+entry:
+        %retval = alloca {i8,i8}, align 1
+        %0 = bitcast {i8,i8}* %retval to i8*
+        call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds ({i8,i8}* @struct_2byte, i32 0, i32 0), i64 2, i32 1, i1 false)
+        %1 = bitcast {i8,i8}* %retval to {i16}*
+        %2 = load volatile {i16}* %1
+        ret {i16} %2
+}
+
+; ALL-LABEL: ret_struct_i16:
+; O32-DAG:           lui [[R1:\$[0-9]+]], %hi(struct_2byte)
+; O32-DAG:           lhu [[R2:\$[0-9]+]], %lo(struct_2byte)([[R1]])
+; O32-DAG:           sh  [[R2]], 0([[SP:\$sp]])
+; O32-DAG:           lhu $2, 0([[SP:\$sp]])
+
+; N32-LE-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_2byte)
+; N32-LE-DAG:        lhu [[R2:\$[0-9]+]], %lo(struct_2byte)([[R1]])
+; N32-LE-DAG:        sh  [[R2]], 8([[SP:\$sp]])
+; N32-LE-DAG:        lh  $2, 8([[SP:\$sp]])
+
+; N32-BE-DAG:        lui [[R1:\$[0-9]+]], %hi(struct_2byte)
+; N32-BE-DAG:        lhu [[R2:\$[0-9]+]], %lo(struct_2byte)([[R1]])
+; N32-BE-DAG:        sh  [[R2]], 8([[SP:\$sp]])
+; N32-BE-DAG:        lh  [[R3:\$[0-9]+]], 8([[SP:\$sp]])
+; N32-BE-DAG:        dsll $2, [[R3]], 48
+
+; N64-LE-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_2byte)($1)
+; N64-LE-DAG:        lhu [[R2:\$[0-9]+]], 0([[R1]])
+; N64-LE-DAG:        sh  [[R2]], 8([[SP:\$sp]])
+; N64-LE-DAG:        lh  $2, 8([[SP:\$sp]])
+
+; N64-BE-DAG:        ld  [[R1:\$[0-9]+]], %got_disp(struct_2byte)($1)
+; N64-BE-DAG:        lhu [[R2:\$[0-9]+]], 0([[R1]])
+; N64-BE-DAG:        sh  [[R2]], 8([[SP:\$sp]])
+; N64-BE-DAG:        lh  [[R3:\$[0-9]+]], 8([[SP:\$sp]])
+; N64-BE-DAG:        dsll $2, [[R3]], 48
+
+; Ensure that structures bigger than 32-bits but smaller than 64-bits are
+; also returned in the upper bits on big endian targets. Previously, these were
+; missed by the CCPromoteToType and the shift didn't happen.
+define inreg {i48} @ret_struct_3xi16() nounwind {
+entry:
+        %0 = load volatile i48* bitcast ({[3 x i16]}* @struct_3xi16 to i48*), align 2
+        %1 = insertvalue {i48} undef, i48 %0, 0
+        ret {i48} %1
+}
+
+; ALL-LABEL: ret_struct_3xi16:
+
+; O32-BE-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_3xi16)
+; O32-BE-DAG:        addiu [[PTR_LO:\$[0-9]+]], [[PTR_HI]], %lo(struct_3xi16)
+; O32-BE-DAG:        lhu [[R1:\$[0-9]+]], 4([[PTR_LO]])
+; O32-BE-DAG:        lw [[R2:\$[0-9]+]], %lo(struct_3xi16)([[PTR_HI]])
+; O32-BE-DAG:        sll [[R3:\$[0-9]+]], [[R2]], 16
+; O32-BE-DAG:        or  $3, [[R1]], [[R3]]
+; O32-BE-DAG:        srl $2, [[R2]], 16
+
+; O32-LE-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_3xi16)
+; O32-LE-DAG:        addiu [[PTR_LO:\$[0-9]+]], [[PTR_HI]], %lo(struct_3xi16)
+; O32-LE-DAG:        lhu $3, 4([[PTR_LO]])
+; O32-LE-DAG:        lw $2, %lo(struct_3xi16)([[PTR_HI]])
+
+; N32-LE-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_3xi16)
+; N32-LE-DAG:        addiu [[PTR_LO:\$[0-9]+]], [[PTR_HI]], %lo(struct_3xi16)
+; N32-LE-DAG:        lh [[R1:\$[0-9]+]], 4([[PTR_LO]])
+; N32-LE-DAG:        lwu [[R2:\$[0-9]+]], %lo(struct_3xi16)([[PTR_HI]])
+; N32-LE-DAG:        dsll [[R3:\$[0-9]+]], [[R1]], 32
+; N32-LE-DAG:        or $2, [[R2]], [[R3]]
+
+; N32-BE-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_3xi16)
+; N32-BE-DAG:        addiu [[PTR_LO:\$[0-9]+]], [[PTR_HI]], %lo(struct_3xi16)
+; N32-BE-DAG:        lw [[R1:\$[0-9]+]], %lo(struct_3xi16)([[PTR_HI]])
+; N32-BE-DAG:        dsll [[R2:\$[0-9]+]], [[R1]], 16
+; N32-BE-DAG:        lhu [[R3:\$[0-9]+]], 4([[PTR_LO]])
+; N32-BE-DAG:        or [[R4:\$[0-9]+]], [[R3]], [[R2]]
+; N32-BE-DAG:        dsll $2, [[R4]], 16
+
+; N64-LE-DAG:        ld  [[PTR:\$[0-9]+]], %got_disp(struct_3xi16)($1)
+; N64-LE-DAG:        lh [[R1:\$[0-9]+]], 4([[PTR]])
+; N64-LE-DAG:        lwu [[R2:\$[0-9]+]], 0([[PTR]])
+; N64-LE-DAG:        dsll [[R3:\$[0-9]+]], [[R1]], 32
+; N64-LE-DAG:        or $2, [[R2]], [[R3]]
+
+; N64-BE-DAG:        ld  [[PTR:\$[0-9]+]], %got_disp(struct_3xi16)($1)
+; N64-BE-DAG:        lw [[R1:\$[0-9]+]], 0([[PTR]])
+; N64-BE-DAG:        dsll [[R2:\$[0-9]+]], [[R1]], 16
+; N64-BE-DAG:        lhu [[R3:\$[0-9]+]], 4([[PTR]])
+; N64-BE-DAG:        or [[R4:\$[0-9]+]], [[R3]], [[R2]]
+; N64-BE-DAG:        dsll $2, [[R4]], 16
+
+; Ensure that large structures (>128-bit) are returned indirectly.
+; We pick an extremely large structure so we don't have to match inlined memcpy's.
+define void @ret_struct_128xi16({[128 x i16]}* sret %returnval) {
+entry:
+        %0 = bitcast {[128 x i16]}* %returnval to i8*
+        call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ({[128 x i16]}* @struct_128xi16 to i8*), i64 256, i32 2, i1 false)
+        ret void
+}
+
+; ALL-LABEL: ret_struct_128xi16:
+
+; sret pointer is already in $4
+; O32-DAG:        lui [[PTR:\$[0-9]+]], %hi(struct_128xi16)
+; O32-DAG:        addiu $5, [[PTR]], %lo(struct_128xi16)
+; O32:            jal memcpy
+
+; sret pointer is already in $4
+; N32-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_128xi16)
+; N32-DAG:        addiu [[PTR:\$[0-9]+]], [[PTR_HI]], %lo(struct_128xi16)
+; FIXME: This signext isn't necessary. Like integers, pointers are sign-extended,
+;        but unlike integers, pointers cannot have the signext attribute.
+; N32-DAG:        sll $5, [[PTR]], 0
+; N32:            jal memcpy
+
+; sret pointer is already in $4
+; N64-DAG:        ld $5, %got_disp(struct_128xi16)(
+; N64-DAG:        ld $25, %call16(memcpy)(
+; N64:            jalr $25
+
+; Ensure that large structures (>128-bit) are returned indirectly.
+; This will generate inlined memcpy's anyway so pick the smallest large
+; structure
+; This time we let the backend lower the sret argument.
+define {[6 x i32]} @ret_struct_6xi32() {
+entry:
+        %0 = load volatile {[6 x i32]}* @struct_6xi32, align 2
+        ret {[6 x i32]} %0
+}
+
+; ALL-LABEL: ret_struct_6xi32:
+
+; sret pointer is already in $4
+; O32-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_6xi32)
+; O32-DAG:        addiu [[PTR:\$[0-9]+]], [[PTR_HI]], %lo(struct_6xi32)
+; O32-DAG:        lw [[T0:\$[0-9]+]], %lo(struct_6xi32)([[PTR]])
+; O32-DAG:        lw [[T1:\$[0-9]+]], 4([[PTR]])
+; O32-DAG:        lw [[T2:\$[0-9]+]], 8([[PTR]])
+; O32-DAG:        lw [[T3:\$[0-9]+]], 12([[PTR]])
+; O32-DAG:        lw [[T4:\$[0-9]+]], 16([[PTR]])
+; O32-DAG:        lw [[T5:\$[0-9]+]], 20([[PTR]])
+; O32-DAG:        sw [[T0]], 0($4)
+; O32-DAG:        sw [[T1]], 4($4)
+; O32-DAG:        sw [[T2]], 8($4)
+; O32-DAG:        sw [[T3]], 12($4)
+; O32-DAG:        sw [[T4]], 16($4)
+; O32-DAG:        sw [[T5]], 20($4)
+
+; FIXME: This signext isn't necessary. Like integers, pointers are sign-extended,
+;        but unlike integers, pointers cannot have the signext attribute.
+;        In this case we don't have anywhere to put the signext either since
+;        the sret argument is invented by the backend.
+; N32-DAG:        sll [[RET_PTR:\$[0-9]+]], $4, 0
+; N32-DAG:        lui [[PTR_HI:\$[0-9]+]], %hi(struct_6xi32)
+; N32-DAG:        addiu [[PTR:\$[0-9]+]], [[PTR_HI]], %lo(struct_6xi32)
+; N32-DAG:        lw [[T0:\$[0-9]+]], %lo(struct_6xi32)([[PTR]])
+; N32-DAG:        lw [[T1:\$[0-9]+]], 4([[PTR]])
+; N32-DAG:        lw [[T2:\$[0-9]+]], 8([[PTR]])
+; N32-DAG:        lw [[T3:\$[0-9]+]], 12([[PTR]])
+; N32-DAG:        lw [[T4:\$[0-9]+]], 16([[PTR]])
+; N32-DAG:        lw [[T5:\$[0-9]+]], 20([[PTR]])
+; N32-DAG:        sw [[T0]], 0([[RET_PTR]])
+; N32-DAG:        sw [[T1]], 4([[RET_PTR]])
+; N32-DAG:        sw [[T2]], 8([[RET_PTR]])
+; N32-DAG:        sw [[T3]], 12([[RET_PTR]])
+; N32-DAG:        sw [[T4]], 16([[RET_PTR]])
+; N32-DAG:        sw [[T5]], 20([[RET_PTR]])
+
+; sret pointer is already in $4
+; N64-DAG:        ld [[PTR:\$[0-9]+]], %got_disp(struct_6xi32)(
+; N64-DAG:        lw [[T0:\$[0-9]+]], 0([[PTR]])
+; N64-DAG:        lw [[T1:\$[0-9]+]], 4([[PTR]])
+; N64-DAG:        lw [[T2:\$[0-9]+]], 8([[PTR]])
+; N64-DAG:        lw [[T3:\$[0-9]+]], 12([[PTR]])
+; N64-DAG:        lw [[T4:\$[0-9]+]], 16([[PTR]])
+; N64-DAG:        lw [[T5:\$[0-9]+]], 20([[PTR]])
+; N64-DAG:        sw [[T0]], 0($4)
+; N64-DAG:        sw [[T1]], 4($4)
+; N64-DAG:        sw [[T2]], 8($4)
+; N64-DAG:        sw [[T3]], 12($4)
+; N64-DAG:        sw [[T4]], 16($4)
+; N64-DAG:        sw [[T5]], 20($4)

diff --git a/test/CodeGen/Mips/cconv/return.ll b/test/CodeGen/Mips/cconv/return.ll
index 76ce5e4..63f9b5f 100644
--- a/test/CodeGen/Mips/cconv/return.ll
+++ b/test/CodeGen/Mips/cconv/return.ll

@@ -33,7 +33,7 @@
 ; O32-DAG:           lbu $2, %lo(byte)([[R1]])
 ; N32-DAG:           lui [[R1:\$[0-9]+]], %hi(byte)
 ; N32-DAG:           lbu $2, %lo(byte)([[R1]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(byte)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(byte)(
 ; N64-DAG:           lbu $2, 0([[R1]])
 
 define i32 @reti32() nounwind {
@@ -47,7 +47,7 @@
 ; O32-DAG:           lw $2, %lo(word)([[R1]])
 ; N32-DAG:           lui [[R1:\$[0-9]+]], %hi(word)
 ; N32-DAG:           lw $2, %lo(word)([[R1]])
-; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(word)($1)
+; N64-DAG:           ld  [[R1:\$[0-9]+]], %got_disp(word)(
 ; N64-DAG:           lw $2, 0([[R1]])
 
 define i64 @reti64() nounwind {

diff --git a/test/CodeGen/Mips/cfi_offset.ll b/test/CodeGen/Mips/cfi_offset.ll
new file mode 100644
index 0000000..e23855b
--- /dev/null
+++ b/test/CodeGen/Mips/cfi_offset.ll

@@ -0,0 +1,41 @@
+; RUN: llc -march=mips -mattr=+o32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB
+; RUN: llc -march=mipsel -mattr=+o32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL
+; RUN: llc -march=mips -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB
+; RUN: llc -march=mipsel -mattr=+o32,+fpxx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL
+; RUN: llc -march=mips -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EB
+; RUN: llc -march=mipsel -mattr=+o32,+fp64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-EL
+
+@var = global double 0.0
+
+declare void @foo(...)
+
+define void @bar() {
+
+; CHECK-LABEL:  bar:
+
+; CHECK:  .cfi_def_cfa_offset 40
+; CHECK:  sdc1  $f22, 32($sp)
+; CHECK:  sdc1  $f20, 24($sp)
+; CHECK:  sw  $ra, 20($sp)
+; CHECK:  sw  $16, 16($sp)
+
+; CHECK-EB:  .cfi_offset 55, -8
+; CHECK-EB:  .cfi_offset 54, -4
+; CHECK-EB:  .cfi_offset 53, -16
+; CHECK-EB:  .cfi_offset 52, -12
+
+; CHECK-EL:  .cfi_offset 54, -8
+; CHECK-EL:  .cfi_offset 55, -4
+; CHECK-EL:  .cfi_offset 52, -16
+; CHECK-EL:  .cfi_offset 53, -12
+
+; CHECK:  .cfi_offset 31, -20
+; CHECK:  .cfi_offset 16, -24
+
+    %val1 = load volatile double* @var
+    %val2 = load volatile double* @var
+    call void (...)* @foo() nounwind
+    store volatile double %val1, double* @var
+    store volatile double %val2, double* @var
+    ret void
+}

diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index 999bdb4..b12c2df 100644
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll

@@ -38,7 +38,7 @@
 ; 64-CMP-DAG:   or $[[T2:[0-9]+]], $[[T0]], $[[T1]]
 ; 64-CMP-DAG:   ld $2, 0($[[T2]])
 
-define i32* @cmov1(i32 %s) nounwind readonly {
+define i32* @cmov1(i32 signext %s) nounwind readonly {
 entry:
   %tobool = icmp ne i32 %s, 0
   %tmp1 = load i32** @i3, align 4
@@ -78,7 +78,7 @@
 ; 64-CMP-DAG:   or $[[T2:[0-9]+]], $[[T0]], $[[T1]]
 ; 64-CMP-DAG:   lw $2, 0($[[T2]])
 
-define i32 @cmov2(i32 %s) nounwind readonly {
+define i32 @cmov2(i32 signext %s) nounwind readonly {
 entry:
   %tobool = icmp ne i32 %s, 0
   %tmp1 = load i32* @c, align 4
@@ -109,13 +109,46 @@
 ; 64-CMP-DAG:   selnez $[[T1:[0-9]+]], $6, $[[CC]]
 ; 64-CMP-DAG:   or $2, $[[T0]], $[[T1]]
 
-define i32 @cmov3(i32 %a, i32 %b, i32 %c) nounwind readnone {
+define i32 @cmov3(i32 signext %a, i32 signext %b, i32 signext %c) nounwind readnone {
 entry:
   %cmp = icmp eq i32 %a, 234
   %cond = select i1 %cmp, i32 %b, i32 %c
   ret i32 %cond
 }
 
+; ALL-LABEL: cmov3_ne:
+
+; We won't check the result register since we can't know if the move is first
+; or last. We do know it will be either one of two registers so we can at least
+; check that.
+
+; FIXME: Use xori instead of addiu+xor.
+; 32-CMOV:      addiu $[[R0:[0-9]+]], $zero, 234
+; 32-CMOV:      xor $[[R1:[0-9]+]], $4, $[[R0]]
+; 32-CMOV:      movn ${{[26]}}, $5, $[[R1]]
+
+; 32-CMP-DAG:   xori $[[CC:[0-9]+]], $4, 234
+; 32-CMP-DAG:   selnez $[[T0:[0-9]+]], $5, $[[CC]]
+; 32-CMP-DAG:   seleqz $[[T1:[0-9]+]], $6, $[[CC]]
+; 32-CMP-DAG:   or $2, $[[T0]], $[[T1]]
+
+; FIXME: Use xori instead of addiu+xor.
+; 64-CMOV:      addiu $[[R0:[0-9]+]], $zero, 234
+; 64-CMOV:      xor $[[R1:[0-9]+]], $4, $[[R0]]
+; 64-CMOV:      movn ${{[26]}}, $5, $[[R1]]
+
+; 64-CMP-DAG:   xori $[[CC:[0-9]+]], $4, 234
+; 64-CMP-DAG:   selnez $[[T0:[0-9]+]], $5, $[[CC]]
+; 64-CMP-DAG:   seleqz $[[T1:[0-9]+]], $6, $[[CC]]
+; 64-CMP-DAG:   or $2, $[[T0]], $[[T1]]
+
+define i32 @cmov3_ne(i32 signext %a, i32 signext %b, i32 signext %c) nounwind readnone {
+entry:
+  %cmp = icmp ne i32 %a, 234
+  %cond = select i1 %cmp, i32 %b, i32 %c
+  ret i32 %cond
+}
+
 ; ALL-LABEL: cmov4:
 
 ; We won't check the result register since we can't know if the move is first
@@ -146,13 +179,54 @@
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $6, $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i64 @cmov4(i32 %a, i64 %b, i64 %c) nounwind readnone {
+define i64 @cmov4(i32 signext %a, i64 %b, i64 %c) nounwind readnone {
 entry:
   %cmp = icmp eq i32 %a, 234
   %cond = select i1 %cmp, i64 %b, i64 %c
   ret i64 %cond
 }
 
+; ALL-LABEL: cmov4_ne:
+
+; We won't check the result register since we can't know if the move is first
+; or last. We do know it will be one of two registers so we can at least check
+; that.
+
+; FIXME: Use xori instead of addiu+xor.
+; 32-CMOV-DAG: addiu $[[R0:[0-9]+]], $zero, 234
+; 32-CMOV-DAG: xor $[[R1:[0-9]+]], $4, $[[R0]]
+; 32-CMOV-DAG: lw $[[R2:2]], 16($sp)
+; 32-CMOV-DAG: lw $[[R3:3]], 20($sp)
+; 32-CMOV-DAG: movn $[[R2]], $6, $[[R1]]
+; 32-CMOV-DAG: movn $[[R3]], $7, $[[R1]]
+
+; 32-CMP-DAG:  xori $[[R0:[0-9]+]], $4, 234
+; 32-CMP-DAG:  lw $[[R1:[0-9]+]], 16($sp)
+; 32-CMP-DAG:  lw $[[R2:[0-9]+]], 20($sp)
+; 32-CMP-DAG:  selnez $[[T0:[0-9]+]], $6, $[[R0]]
+; 32-CMP-DAG:  selnez $[[T1:[0-9]+]], $7, $[[R0]]
+; 32-CMP-DAG:  seleqz $[[T2:[0-9]+]], $[[R1]], $[[R0]]
+; 32-CMP-DAG:  seleqz $[[T3:[0-9]+]], $[[R2]], $[[R0]]
+; 32-CMP-DAG:  or $2, $[[T0]], $[[T2]]
+; 32-CMP-DAG:  or $3, $[[T1]], $[[T3]]
+
+; FIXME: Use xori instead of addiu+xor.
+; 64-CMOV: addiu $[[R0:[0-9]+]], $zero, 234
+; 64-CMOV: xor $[[R1:[0-9]+]], $4, $[[R0]]
+; 64-CMOV: movn ${{[26]}}, $5, $[[R1]]
+
+; 64-CMP-DAG:  xori $[[R0:[0-9]+]], $4, 234
+; 64-CMP-DAG:  selnez $[[T0:[0-9]+]], $5, $[[R0]]
+; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $6, $[[R0]]
+; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
+
+define i64 @cmov4_ne(i32 signext %a, i64 %b, i64 %c) nounwind readnone {
+entry:
+  %cmp = icmp ne i32 %a, 234
+  %cond = select i1 %cmp, i64 %b, i64 %c
+  ret i64 %cond
+}
+
 ; slti and conditional move.
 ;
 ; Check that, pattern
@@ -189,7 +263,7 @@
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @slti0(i32 %a) {
+define i32 @slti0(i32 signext %a) {
 entry:
   %cmp = icmp sgt i32 %a, 32766
   %cond = select i1 %cmp, i32 3, i32 5
@@ -228,7 +302,7 @@
 ; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @slti1(i32 %a) {
+define i32 @slti1(i32 signext %a) {
 entry:
   %cmp = icmp sgt i32 %a, 32767
   %cond = select i1 %cmp, i32 7, i32 5
@@ -263,7 +337,7 @@
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @slti2(i32 %a) {
+define i32 @slti2(i32 signext %a) {
 entry:
   %cmp = icmp sgt i32 %a, -32769
   %cond = select i1 %cmp, i32 3, i32 5
@@ -306,7 +380,7 @@
 ; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @slti3(i32 %a) {
+define i32 @slti3(i32 signext %a) {
 entry:
   %cmp = icmp sgt i32 %a, -32770
   %cond = select i1 %cmp, i32 3, i32 5
@@ -493,7 +567,7 @@
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @sltiu0(i32 %a) {
+define i32 @sltiu0(i32 signext %a) {
 entry:
   %cmp = icmp ugt i32 %a, 32766
   %cond = select i1 %cmp, i32 3, i32 5
@@ -532,7 +606,7 @@
 ; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @sltiu1(i32 %a) {
+define i32 @sltiu1(i32 signext %a) {
 entry:
   %cmp = icmp ugt i32 %a, 32767
   %cond = select i1 %cmp, i32 7, i32 5
@@ -567,7 +641,7 @@
 ; 64-CMP-DAG:  selnez $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @sltiu2(i32 %a) {
+define i32 @sltiu2(i32 signext %a) {
 entry:
   %cmp = icmp ugt i32 %a, -32769
   %cond = select i1 %cmp, i32 3, i32 5
@@ -610,7 +684,7 @@
 ; 64-CMP-DAG:  seleqz $[[T1:[0-9]+]], $[[I5]], $[[R0]]
 ; 64-CMP-DAG:  or $2, $[[T0]], $[[T1]]
 
-define i32 @sltiu3(i32 %a) {
+define i32 @sltiu3(i32 signext %a) {
 entry:
   %cmp = icmp ugt i32 %a, -32770
   %cond = select i1 %cmp, i32 3, i32 5
@@ -623,7 +697,7 @@
 ; doesn't generate conditional moves
 ; for constant operands whose difference is |1|
 
-define i32 @slti4(i32 %a) nounwind readnone {
+define i32 @slti4(i32 signext %a) nounwind readnone {
   %1 = icmp slt i32 %a, 7
   %2 = select i1 %1, i32 4, i32 3
   ret i32 %2
@@ -649,7 +723,7 @@
 ; 64-CMP-NOT:  seleqz
 ; 64-CMP-NOT:  selnez
 
-define i32 @slti5(i32 %a) nounwind readnone {
+define i32 @slti5(i32 signext %a) nounwind readnone {
   %1 = icmp slt i32 %a, 7
   %2 = select i1 %1, i32 -3, i32 -4
   ret i32 %2
@@ -675,7 +749,7 @@
 ; 64-CMP-NOT:  seleqz
 ; 64-CMP-NOT:  selnez
 
-define i32 @slti6(i32 %a) nounwind readnone {
+define i32 @slti6(i32 signext %a) nounwind readnone {
   %1 = icmp slt i32 %a, 7
   %2 = select i1 %1, i32 3, i32 4
   ret i32 %2
@@ -683,24 +757,9 @@
 
 ; ALL-LABEL: slti6:
 
-; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
-; 32-CMOV-DAG: xori [[R1]], [[R1]], 1
-; 32-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 32-CMOV-NOT: movn
-
-; 32-CMP-DAG:  slti [[R1:\$[0-9]+]], $4, 7
-; 32-CMP-DAG:  xori [[R1]], [[R1]], 1
-; 32-CMP-DAG:  addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 32-CMP-NOT:  seleqz
-; 32-CMP-NOT:  selnez
-
-; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
-; 64-CMOV-DAG: xori [[R1]], [[R1]], 1
-; 64-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 64-CMOV-NOT: movn
-
-; 64-CMP-DAG:  slti [[R1:\$[0-9]+]], $4, 7
-; 64-CMP-DAG:  xori [[R1]], [[R1]], 1
-; 64-CMP-DAG:  addiu [[R2:\$[0-9]+]], [[R1]], 3
-; 64-CMP-NOT:  seleqz
-; 64-CMP-NOT:  selnez
+; ALL-DAG: addiu [[R1:\$[0-9]+]], $zero, 6
+; ALL-DAG: slt [[R1]], [[R1]], $4
+; ALL-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
+; ALL-NOT: movn
+; ALL-NOT:  seleqz
+; ALL-NOT:  selnez

diff --git a/test/CodeGen/Mips/const-mult.ll b/test/CodeGen/Mips/const-mult.ll
index 1862021..60b2a88 100644
--- a/test/CodeGen/Mips/const-mult.ll
+++ b/test/CodeGen/Mips/const-mult.ll

@@ -5,7 +5,7 @@
 ; CHECK: sll $[[R0:[0-9]+]], $4, 2
 ; CHECK: addu ${{[0-9]+}}, $[[R0]], $4
 
-define i32 @mul5_32(i32 %a) {
+define i32 @mul5_32(i32 signext %a) {
 entry:
   %mul = mul nsw i32 %a, 5
   ret i32 %mul
@@ -17,7 +17,7 @@
 ; CHECK-DAG: sll $[[R2:[0-9]+]], $4, 5
 ; CHECK:     subu ${{[0-9]+}}, $[[R2]], $[[R1]]
 
-define i32 @mul27_32(i32 %a) {
+define i32 @mul27_32(i32 signext %a) {
 entry:
   %mul = mul nsw i32 %a, 27
   ret i32 %mul
@@ -29,7 +29,7 @@
 ; CHECK-DAG: sll $[[R2:[0-9]+]], $4, 31
 ; CHECK:     addu ${{[0-9]+}}, $[[R2]], $[[R1]]
 
-define i32 @muln2147483643_32(i32 %a) {
+define i32 @muln2147483643_32(i32 signext %a) {
 entry:
   %mul = mul nsw i32 %a, -2147483643
   ret i32 %mul
@@ -41,7 +41,7 @@
 ; CHECK64-DAG: dsll $[[R2:[0-9]+]], $4, 63
 ; CHECK64:     daddu ${{[0-9]+}}, $[[R2]], $[[R1]]
 
-define i64 @muln9223372036854775805_64(i64 %a) {
+define i64 @muln9223372036854775805_64(i64 signext %a) {
 entry:
   %mul = mul nsw i64 %a, -9223372036854775805
   ret i64 %mul

diff --git a/test/CodeGen/Mips/countleading.ll b/test/CodeGen/Mips/countleading.ll
index 6e63cff..b7aad04 100644
--- a/test/CodeGen/Mips/countleading.ll
+++ b/test/CodeGen/Mips/countleading.ll

@@ -11,7 +11,7 @@
 ;   MIPS32-GT-R1 - MIPS64r1 and above (does not include MIPS64's)
 ;   MIPS64-GT-R1 - MIPS64r1 and above
 
-define i32 @ctlz_i32(i32 %X) nounwind readnone {
+define i32 @ctlz_i32(i32 signext %X) nounwind readnone {
 entry:
 ; ALL-LABEL: ctlz_i32:
 
@@ -27,7 +27,7 @@
 
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 
-define i32 @ctlo_i32(i32 %X) nounwind readnone {
+define i32 @ctlo_i32(i32 signext %X) nounwind readnone {
 entry:
 ; ALL-LABEL: ctlo_i32:
 

diff --git a/test/CodeGen/Mips/ctlz-v.ll b/test/CodeGen/Mips/ctlz-v.ll
index 270f404..3d580e5 100644
--- a/test/CodeGen/Mips/ctlz-v.ll
+++ b/test/CodeGen/Mips/ctlz-v.ll

@@ -6,12 +6,12 @@
 define <2 x i32> @ctlzv2i32(<2 x i32> %x) {
 entry:
 ; MIPS32: clz     $2, $4
-; MIPS32: jr      $ra
 ; MIPS32: clz     $3, $5
 
-; MIPS64: clz     $2, $4
-; MIPS64: jr      $ra
-; MIPS64: clz     $3, $5
+; MIPS64-DAG: sll $[[A0:[0-9]+]], $4, 0
+; MIPS64-DAG: clz $2, $[[A0]]
+; MIPS64-DAG: sll $[[A1:[0-9]+]], $5, 0
+; MIPS64-DAG: clz $3, $[[A1]]
 
   %ret = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %x, i1 true)
   ret <2 x i32> %ret

diff --git a/test/CodeGen/Mips/cttz-v.ll b/test/CodeGen/Mips/cttz-v.ll
index 9470441..85f69f9 100644
--- a/test/CodeGen/Mips/cttz-v.ll
+++ b/test/CodeGen/Mips/cttz-v.ll

@@ -18,14 +18,16 @@
 ; MIPS32-DAG: jr      $ra
 ; MIPS32-DAG: subu    $3, $[[R4]], $[[R8]]
 
-; MIPS64-DAG: addiu   $[[R0:[0-9]+]], $4, -1
-; MIPS64-DAG: not     $[[R1:[0-9]+]], $4
+; MIPS64-DAG: sll     $[[A0:[0-9]+]], $4, 0
+; MIPS64-DAG: addiu   $[[R0:[0-9]+]], $[[A0]], -1
+; MIPS64-DAG: not     $[[R1:[0-9]+]], $[[A0]]
 ; MIPS64-DAG: and     $[[R2:[0-9]+]], $[[R1]], $[[R0]]
 ; MIPS64-DAG: clz     $[[R3:[0-9]+]], $[[R2]]
 ; MIPS64-DAG: addiu   $[[R4:[0-9]+]], $zero, 32
 ; MIPS64-DAG: subu    $2, $[[R4]], $[[R3]]
-; MIPS64-DAG: addiu   $[[R5:[0-9]+]], $5, -1
-; MIPS64-DAG: not     $[[R6:[0-9]+]], $5
+; MIPS64-DAG: sll     $[[A1:[0-9]+]], $5, 0
+; MIPS64-DAG: addiu   $[[R5:[0-9]+]], $[[A1]], -1
+; MIPS64-DAG: not     $[[R6:[0-9]+]], $[[A1]]
 ; MIPS64-DAG: and     $[[R7:[0-9]+]], $[[R6]], $[[R5]]
 ; MIPS64-DAG: clz     $[[R8:[0-9]+]], $[[R7]]
 ; MIPS64-DAG: jr      $ra

diff --git a/test/CodeGen/Mips/divrem.ll b/test/CodeGen/Mips/divrem.ll
index 97f8360..a9cfe0f 100644
--- a/test/CodeGen/Mips/divrem.ll
+++ b/test/CodeGen/Mips/divrem.ll

@@ -27,7 +27,7 @@
 @g0 = common global i32 0, align 4
 @g1 = common global i32 0, align 4
 
-define i32 @sdiv1(i32 %a0, i32 %a1) nounwind readnone {
+define i32 @sdiv1(i32 signext %a0, i32 signext %a1) nounwind readnone {
 entry:
 ; ALL-LABEL: sdiv1:
 
@@ -54,7 +54,7 @@
   ret i32 %div
 }
 
-define i32 @srem1(i32 %a0, i32 %a1) nounwind readnone {
+define i32 @srem1(i32 signext %a0, i32 signext %a1) nounwind readnone {
 entry:
 ; ALL-LABEL: srem1:
 
@@ -81,7 +81,7 @@
   ret i32 %rem
 }
 
-define i32 @udiv1(i32 %a0, i32 %a1) nounwind readnone {
+define i32 @udiv1(i32 zeroext %a0, i32 zeroext %a1) nounwind readnone {
 entry:
 ; ALL-LABEL: udiv1:
 
@@ -107,7 +107,7 @@
   ret i32 %div
 }
 
-define i32 @urem1(i32 %a0, i32 %a1) nounwind readnone {
+define i32 @urem1(i32 zeroext %a0, i32 zeroext %a1) nounwind readnone {
 entry:
 ; ALL-LABEL: urem1:
 
@@ -134,7 +134,7 @@
   ret i32 %rem
 }
 
-define i32 @sdivrem1(i32 %a0, i32 %a1, i32* nocapture %r) nounwind {
+define i32 @sdivrem1(i32 signext %a0, i32 signext %a1, i32* nocapture %r) nounwind {
 entry:
 ; ALL-LABEL: sdivrem1:
 
@@ -175,7 +175,7 @@
   ret i32 %div
 }
 
-define i32 @udivrem1(i32 %a0, i32 %a1, i32* nocapture %r) nounwind {
+define i32 @udivrem1(i32 zeroext %a0, i32 zeroext %a1, i32* nocapture %r) nounwind {
 entry:
 ; ALL-LABEL: udivrem1:
 

diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll
index e78497a..b4efb40 100644
--- a/test/CodeGen/Mips/ehframe-indirect.ll
+++ b/test/CodeGen/Mips/ehframe-indirect.ll

@@ -1,5 +1,7 @@
-; RUN: llc -mtriple=mipsel-linux-gnu < %s | FileCheck %s
-; RUN: llc -mtriple=mipsel-linux-android < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel-linux-gnu < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK32 %s
+; RUN: llc -mtriple=mipsel-linux-android < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK32 %s
+; RUN: llc -mtriple=mips64el-linux-gnu < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK64 %s
+; RUN: llc -mtriple=mips64el-linux-android < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK64 %s
 
 define i32 @main() {
 ; CHECK: .cfi_startproc
@@ -27,8 +29,11 @@
 ; CHECK: .hidden DW.ref.__gxx_personality_v0
 ; CHECK: .weak DW.ref.__gxx_personality_v0
 ; CHECK: .section .data.DW.ref.__gxx_personality_v0,"aGw",@progbits,DW.ref.__gxx_personality_v0,comdat
-; CHECK: .align 2
+; CHECK32: .align 2
+; CHECK64: .align 3
 ; CHECK: .type DW.ref.__gxx_personality_v0,@object
-; CHECK: .size DW.ref.__gxx_personality_v0, 4
+; CHECK32: .size DW.ref.__gxx_personality_v0, 4
+; CHECK64: .size DW.ref.__gxx_personality_v0, 8
 ; CHECK: DW.ref.__gxx_personality_v0:
-; CHECK: .4byte __gxx_personality_v0
+; CHECK32: .4byte __gxx_personality_v0
+; CHECK64: .8byte __gxx_personality_v0

diff --git a/test/CodeGen/Mips/fastcc.ll b/test/CodeGen/Mips/fastcc.ll
index 8ee7af8..6b022c5 100644
--- a/test/CodeGen/Mips/fastcc.ll
+++ b/test/CodeGen/Mips/fastcc.ll

@@ -1,6 +1,8 @@
 ; RUN: llc  < %s -march=mipsel | FileCheck %s 
 ; RUN: llc  < %s -mtriple=mipsel-none-nacl-gnu \
 ; RUN:  | FileCheck %s -check-prefix=CHECK-NACL
+; RUN: llc  < %s -march=mipsel -mcpu=mips32 -mattr=+nooddspreg | FileCheck %s -check-prefix=NOODDSPREG
+; RUN: llc  < %s -march=mipsel -mcpu=mips32r2 -mattr=+fp64,+nooddspreg | FileCheck %s -check-prefix=FP64-NOODDSPREG
 
 
 @gi0 = external global i32
@@ -80,6 +82,9 @@
 @g15 = external global i32
 @g16 = external global i32
 
+@fa = common global [11 x float] zeroinitializer, align 4
+@da = common global [11 x double] zeroinitializer, align 8
+
 define void @caller0() nounwind {
 entry:
 ; CHECK: caller0
@@ -264,3 +269,164 @@
   ret void
 }
 
+define void @caller2() {
+entry:
+
+; NOODDSPREG-LABEL:  caller2:
+
+; Check that first 10 arguments are passed in even float registers
+; f0, f2, ... , f18. Check that 11th argument is passed on stack.
+
+; NOODDSPREG-DAG:    lw      $[[R0:[0-9]+]], %got(fa)(${{[0-9]+|gp}})
+; NOODDSPREG-DAG:    lwc1    $f0, 0($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f2, 4($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f4, 8($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f6, 12($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f8, 16($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f10, 20($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f12, 24($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f14, 28($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f16, 32($[[R0]])
+; NOODDSPREG-DAG:    lwc1    $f18, 36($[[R0]])
+
+; NOODDSPREG-DAG:    lwc1    $[[F0:f[0-9]*[02468]]], 40($[[R0]])
+; NOODDSPREG-DAG:    swc1    $[[F0]], 0($sp)
+
+  %0 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 0), align 4
+  %1 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 1), align 4
+  %2 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 2), align 4
+  %3 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 3), align 4
+  %4 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 4), align 4
+  %5 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 5), align 4
+  %6 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 6), align 4
+  %7 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 7), align 4
+  %8 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 8), align 4
+  %9 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 9), align 4
+  %10 = load float* getelementptr ([11 x float]* @fa, i32 0, i32 10), align 4
+  tail call fastcc void @callee2(float %0, float %1, float %2, float %3,
+                                 float %4, float %5, float %6, float %7,
+                                 float %8, float %9, float %10)
+  ret void
+}
+
+define fastcc void @callee2(float %a0, float %a1, float %a2, float %a3,
+                            float %a4, float %a5, float %a6, float %a7,
+                            float %a8, float %a9, float %a10) {
+entry:
+
+; NOODDSPREG-LABEL:  callee2:
+
+; NOODDSPREG:        addiu   $sp, $sp, -[[OFFSET:[0-9]+]]
+
+; Check that first 10 arguments are received in even float registers
+; f0, f2, ... , f18. Check that 11th argument is received on stack.
+
+; NOODDSPREG-DAG:    lw      $[[R0:[0-9]+]], %got(fa)(${{[0-9]+|gp}})
+; NOODDSPREG-DAG:    swc1    $f0, 0($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f2, 4($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f4, 8($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f6, 12($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f8, 16($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f10, 20($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f12, 24($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f14, 28($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f16, 32($[[R0]])
+; NOODDSPREG-DAG:    swc1    $f18, 36($[[R0]])
+
+; NOODDSPREG-DAG:    lwc1    $[[F0:f[0-9]*[02468]]], [[OFFSET]]($sp)
+; NOODDSPREG-DAG:    swc1    $[[F0]], 40($[[R0]])
+
+  store float %a0, float* getelementptr ([11 x float]* @fa, i32 0, i32 0), align 4
+  store float %a1, float* getelementptr ([11 x float]* @fa, i32 0, i32 1), align 4
+  store float %a2, float* getelementptr ([11 x float]* @fa, i32 0, i32 2), align 4
+  store float %a3, float* getelementptr ([11 x float]* @fa, i32 0, i32 3), align 4
+  store float %a4, float* getelementptr ([11 x float]* @fa, i32 0, i32 4), align 4
+  store float %a5, float* getelementptr ([11 x float]* @fa, i32 0, i32 5), align 4
+  store float %a6, float* getelementptr ([11 x float]* @fa, i32 0, i32 6), align 4
+  store float %a7, float* getelementptr ([11 x float]* @fa, i32 0, i32 7), align 4
+  store float %a8, float* getelementptr ([11 x float]* @fa, i32 0, i32 8), align 4
+  store float %a9, float* getelementptr ([11 x float]* @fa, i32 0, i32 9), align 4
+  store float %a10, float* getelementptr ([11 x float]* @fa, i32 0, i32 10), align 4
+  ret void
+}
+
+define void @caller3() {
+entry:
+
+; FP64-NOODDSPREG-LABEL:  caller3:
+
+; Check that first 10 arguments are passed in even float registers
+; f0, f2, ... , f18. Check that 11th argument is passed on stack.
+
+; FP64-NOODDSPREG-DAG:    lw      $[[R0:[0-9]+]], %got(da)(${{[0-9]+|gp}})
+; FP64-NOODDSPREG-DAG:    ldc1    $f0, 0($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f2, 8($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f4, 16($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f6, 24($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f8, 32($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f10, 40($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f12, 48($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f14, 56($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f16, 64($[[R0]])
+; FP64-NOODDSPREG-DAG:    ldc1    $f18, 72($[[R0]])
+
+; FP64-NOODDSPREG-DAG:    ldc1    $[[F0:f[0-9]*[02468]]], 80($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $[[F0]], 0($sp)
+
+  %0 = load double* getelementptr ([11 x double]* @da, i32 0, i32 0), align 8
+  %1 = load double* getelementptr ([11 x double]* @da, i32 0, i32 1), align 8
+  %2 = load double* getelementptr ([11 x double]* @da, i32 0, i32 2), align 8
+  %3 = load double* getelementptr ([11 x double]* @da, i32 0, i32 3), align 8
+  %4 = load double* getelementptr ([11 x double]* @da, i32 0, i32 4), align 8
+  %5 = load double* getelementptr ([11 x double]* @da, i32 0, i32 5), align 8
+  %6 = load double* getelementptr ([11 x double]* @da, i32 0, i32 6), align 8
+  %7 = load double* getelementptr ([11 x double]* @da, i32 0, i32 7), align 8
+  %8 = load double* getelementptr ([11 x double]* @da, i32 0, i32 8), align 8
+  %9 = load double* getelementptr ([11 x double]* @da, i32 0, i32 9), align 8
+  %10 = load double* getelementptr ([11 x double]* @da, i32 0, i32 10), align 8
+  tail call fastcc void @callee3(double %0, double %1, double %2, double %3,
+                                 double %4, double %5, double %6, double %7,
+                                 double %8, double %9, double %10)
+  ret void
+}
+
+define fastcc void @callee3(double %a0, double %a1, double %a2, double %a3,
+                            double %a4, double %a5, double %a6, double %a7,
+                            double %a8, double %a9, double %a10) {
+entry:
+
+; FP64-NOODDSPREG-LABEL:  callee3:
+
+; FP64-NOODDSPREG:        addiu   $sp, $sp, -[[OFFSET:[0-9]+]]
+
+; Check that first 10 arguments are received in even float registers
+; f0, f2, ... , f18. Check that 11th argument is received on stack.
+
+; FP64-NOODDSPREG-DAG:    lw      $[[R0:[0-9]+]], %got(da)(${{[0-9]+|gp}})
+; FP64-NOODDSPREG-DAG:    sdc1    $f0, 0($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f2, 8($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f4, 16($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f6, 24($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f8, 32($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f10, 40($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f12, 48($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f14, 56($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f16, 64($[[R0]])
+; FP64-NOODDSPREG-DAG:    sdc1    $f18, 72($[[R0]])
+
+; FP64-NOODDSPREG-DAG:    ldc1    $[[F0:f[0-9]*[02468]]], [[OFFSET]]($sp)
+; FP64-NOODDSPREG-DAG:    sdc1    $[[F0]], 80($[[R0]])
+
+  store double %a0, double* getelementptr ([11 x double]* @da, i32 0, i32 0), align 8
+  store double %a1, double* getelementptr ([11 x double]* @da, i32 0, i32 1), align 8
+  store double %a2, double* getelementptr ([11 x double]* @da, i32 0, i32 2), align 8
+  store double %a3, double* getelementptr ([11 x double]* @da, i32 0, i32 3), align 8
+  store double %a4, double* getelementptr ([11 x double]* @da, i32 0, i32 4), align 8
+  store double %a5, double* getelementptr ([11 x double]* @da, i32 0, i32 5), align 8
+  store double %a6, double* getelementptr ([11 x double]* @da, i32 0, i32 6), align 8
+  store double %a7, double* getelementptr ([11 x double]* @da, i32 0, i32 7), align 8
+  store double %a8, double* getelementptr ([11 x double]* @da, i32 0, i32 8), align 8
+  store double %a9, double* getelementptr ([11 x double]* @da, i32 0, i32 9), align 8
+  store double %a10, double* getelementptr ([11 x double]* @da, i32 0, i32 10), align 8
+  ret void
+}

diff --git a/test/CodeGen/Mips/fp16instrinsmc.ll b/test/CodeGen/Mips/fp16instrinsmc.ll
index 7ced36c..84d3814 100644
--- a/test/CodeGen/Mips/fp16instrinsmc.ll
+++ b/test/CodeGen/Mips/fp16instrinsmc.ll

@@ -385,7 +385,7 @@
 ; Function Attrs: nounwind
 declare double @exp2(double) #0
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }

diff --git a/test/CodeGen/Mips/fp64a.ll b/test/CodeGen/Mips/fp64a.ll
new file mode 100644
index 0000000..fadce5c
--- /dev/null
+++ b/test/CodeGen/Mips/fp64a.ll

@@ -0,0 +1,161 @@
+; Test that the FP64A ABI performs double precision moves via a spill/reload.
+; The requirement is really that odd-numbered double precision registers do not
+; use mfc1/mtc1 to move the bottom 32-bits (because the hardware will redirect
+; this to the top 32-bits of the even register) but we have to make the decision
+; before register allocation so we do this for all double-precision values.
+
+; We don't test MIPS32r1 since support for 64-bit coprocessors (such as a 64-bit
+; FPU) on a 32-bit architecture was added in MIPS32r2.
+; FIXME: We currently don't test that attempting to use FP64 on MIPS32r1 is an
+;        error either. This is because a large number of CodeGen tests are
+;        incorrectly using this case. We should fix those test cases then add
+;        this check here.
+
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NO-FP64A-BE
+; RUN: llc -march=mips -mcpu=mips32r2 -mattr=fp64,nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FP64A
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NO-FP64A-LE
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fp64,nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FP64A
+
+; RUN: llc -march=mips64 -mcpu=mips64 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NO-FP64A
+; RUN: not llc -march=mips64 -mcpu=mips64 -mattr=fp64,nooddspreg < %s 2>&1 | FileCheck %s -check-prefix=64-FP64A
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NO-FP64A
+; RUN: not llc -march=mips64el -mcpu=mips64 -mattr=fp64,nooddspreg < %s 2>&1 | FileCheck %s -check-prefix=64-FP64A
+
+; 64-FP64A: LLVM ERROR: -mattr=+nooddspreg requires the O32 ABI.
+
+declare double @dbl();
+
+define double @call1(double %d, ...) {
+  ret double %d
+
+; ALL-LABEL:            call1:
+
+; 32R2-NO-FP64A-LE-NOT:     addiu   $sp, $sp
+; 32R2-NO-FP64A-LE:         mtc1    $4, $f0
+; 32R2-NO-FP64A-LE:         mthc1   $5, $f0
+
+; 32R2-NO-FP64A-BE-NOT:     addiu   $sp, $sp
+; 32R2-NO-FP64A-BE:         mtc1    $5, $f0
+; 32R2-NO-FP64A-BE:         mthc1   $4, $f0
+
+; 32R2-FP64A:               addiu   $sp, $sp, -8
+; 32R2-FP64A:               sw      $4, 0($sp)
+; 32R2-FP64A:               sw      $5, 4($sp)
+; 32R2-FP64A:               ldc1    $f0, 0($sp)
+
+; 64-NO-FP64A:              daddiu  $sp, $sp, -64
+; 64-NO-FP64A:              mov.d   $f0, $f12
+}
+
+define double @call2(i32 %i, double %d) {
+  ret double %d
+
+; ALL-LABEL:        call2:
+
+; 32R2-NO-FP64A-LE:     mtc1    $6, $f0
+; 32R2-NO-FP64A-LE:     mthc1   $7, $f0
+
+; 32R2-NO-FP64A-BE:     mtc1    $7, $f0
+; 32R2-NO-FP64A-BE:     mthc1   $6, $f0
+
+; 32R2-FP64A:           addiu   $sp, $sp, -8
+; 32R2-FP64A:           sw      $6, 0($sp)
+; 32R2-FP64A:           sw      $7, 4($sp)
+; 32R2-FP64A:           ldc1    $f0, 0($sp)
+
+; 64-NO-FP64A-NOT:      daddiu  $sp, $sp
+; 64-NO-FP64A:          mov.d   $f0, $f13
+}
+
+define double @call3(float %f1, float %f2, double %d) {
+  ret double %d
+
+; ALL-LABEL:        call3:
+
+; 32R2-NO-FP64A-LE:     mtc1    $6, $f0
+; 32R2-NO-FP64A-LE:     mthc1   $7, $f0
+
+; 32R2-NO-FP64A-BE:     mtc1    $7, $f0
+; 32R2-NO-FP64A-BE:     mthc1   $6, $f0
+
+; 32R2-FP64A:           addiu   $sp, $sp, -8
+; 32R2-FP64A:           sw      $6, 0($sp)
+; 32R2-FP64A:           sw      $7, 4($sp)
+; 32R2-FP64A:           ldc1    $f0, 0($sp)
+
+; 64-NO-FP64A-NOT:      daddiu  $sp, $sp
+; 64-NO-FP64A:          mov.d   $f0, $f14
+}
+
+define double @call4(float %f, double %d, ...) {
+  ret double %d
+
+; ALL-LABEL:        call4:
+
+; 32R2-NO-FP64A-LE:     mtc1    $6, $f0
+; 32R2-NO-FP64A-LE:     mthc1   $7, $f0
+
+; 32R2-NO-FP64A-BE:     mtc1    $7, $f0
+; 32R2-NO-FP64A-BE:     mthc1   $6, $f0
+
+; 32R2-FP64A:           addiu   $sp, $sp, -8
+; 32R2-FP64A:           sw      $6, 0($sp)
+; 32R2-FP64A:           sw      $7, 4($sp)
+; 32R2-FP64A:           ldc1    $f0, 0($sp)
+
+; 64-NO-FP64A:          daddiu  $sp, $sp, -48
+; 64-NO-FP64A:          mov.d   $f0, $f13
+}
+
+define double @call5(double %a, double %b, ...) {
+  %1 = fsub double %a, %b
+  ret double %1
+
+; ALL-LABEL:            call5:
+
+; 32R2-NO-FP64A-LE-DAG:     mtc1    $4, $[[T0:f[0-9]+]]
+; 32R2-NO-FP64A-LE-DAG:     mthc1   $5, $[[T0]]
+; 32R2-NO-FP64A-LE-DAG:     mtc1    $6, $[[T1:f[0-9]+]]
+; 32R2-NO-FP64A-LE-DAG:     mthc1   $7, $[[T1]]
+; 32R2-NO-FP64A-LE:         sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32R2-NO-FP64A-BE-DAG:     mtc1    $5, $[[T0:f[0-9]+]]
+; 32R2-NO-FP64A-BE-DAG:     mthc1   $4, $[[T0]]
+; 32R2-NO-FP64A-BE-DAG:     mtc1    $7, $[[T1:f[0-9]+]]
+; 32R2-NO-FP64A-BE-DAG:     mthc1   $6, $[[T1]]
+; 32R2-NO-FP64A-BE:         sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32R2-FP64A:               addiu   $sp, $sp, -8
+; 32R2-FP64A:               sw      $6, 0($sp)
+; 32R2-FP64A:               sw      $7, 4($sp)
+; 32R2-FP64A:               ldc1    $[[T1:f[0-9]+]], 0($sp)
+; 32R2-FP64A:               sw      $4, 0($sp)
+; 32R2-FP64A:               sw      $5, 4($sp)
+; 32R2-FP64A:               ldc1    $[[T0:f[0-9]+]], 0($sp)
+; 32R2-FP64A:               sub.d   $f0, $[[T0]], $[[T1]]
+
+; 64-NO-FP64A:              sub.d   $f0, $f12, $f13
+}
+
+define double @move_from(double %d) {
+  %1 = call double @dbl()
+  %2 = call double @call2(i32 0, double %1)
+  ret double %2
+
+; ALL-LABEL:        move_from:
+
+; 32R2-NO-FP64A-LE-DAG: mfc1    $6, $f0
+; 32R2-NO-FP64A-LE-DAG: mfhc1   $7, $f0
+
+; 32R2-NO-FP64A-BE-DAG: mfc1    $7, $f0
+; 32R2-NO-FP64A-BE-DAG: mfhc1   $6, $f0
+
+; 32R2-FP64A:           addiu   $sp, $sp, -32
+; 32R2-FP64A:           sdc1    $f0, 16($sp)
+; 32R2-FP64A:           lw      $6, 16($sp)
+; FIXME: This store is redundant
+; 32R2-FP64A:           sdc1    $f0, 16($sp)
+; 32R2-FP64A:           lw      $7, 20($sp)
+
+; 64-NO-FP64A:          mov.d   $f13, $f0
+}

diff --git a/test/CodeGen/Mips/fpxx.ll b/test/CodeGen/Mips/fpxx.ll
new file mode 100644
index 0000000..7e2ed22
--- /dev/null
+++ b/test/CodeGen/Mips/fpxx.ll

@@ -0,0 +1,221 @@
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-NOFPXX
+; RUN: llc -march=mipsel -mcpu=mips32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=32-FPXX
+
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-NOFPXX
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=32R2-FPXX
+
+; RUN: llc -march=mips64 -mcpu=mips4 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-NOFPXX
+; RUN: not llc -march=mips64 -mcpu=mips4 -mattr=fpxx < %s 2>&1 | FileCheck %s -check-prefix=4-FPXX
+
+; RUN: llc -march=mips64 -mcpu=mips64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-NOFPXX
+; RUN: not llc -march=mips64 -mcpu=mips64 -mattr=fpxx < %s 2>&1 | FileCheck %s -check-prefix=64-FPXX
+
+; RUN-TODO: llc -march=mips64 -mcpu=mips4 -mattr=-n64,+o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-NOFPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips4 -mattr=-n64,+o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=4-O32-FPXX
+
+; RUN-TODO: llc -march=mips64 -mcpu=mips64 -mattr=-n64,+o32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-NOFPXX
+; RUN-TODO: llc -march=mips64 -mcpu=mips64 -mattr=-n64,+o32 -mattr=fpxx < %s | FileCheck %s -check-prefix=ALL -check-prefix=64-O32-FPXX
+
+declare double @dbl();
+
+; 4-FPXX:  LLVM ERROR: FPXX is not permitted for the N32/N64 ABI's.
+; 64-FPXX: LLVM ERROR: FPXX is not permitted for the N32/N64 ABI's.
+
+define double @test1(double %d, ...) {
+  ret double %d
+
+; ALL-LABEL: test1:
+
+; 32-NOFPXX:     mtc1    $4, $f0
+; 32-NOFPXX:     mtc1    $5, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $4, 0($sp)
+; 32-FPXX:       sw      $5, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $4, $f0
+; 32R2-NOFPXX:   mthc1   $5, $f0
+
+; 32R2-FPXX:     mtc1    $4, $f0
+; 32R2-FPXX:     mthc1   $5, $f0
+
+; floats/doubles are not passed in integer registers for n64, so dmtc1 is not used.
+; 4-NOFPXX:      mov.d   $f0, $f12
+
+; 64-NOFPXX:     mov.d   $f0, $f12
+}
+
+define double @test2(i32 %i, double %d) {
+  ret double %d
+
+; ALL-LABEL: test2:
+
+; 32-NOFPXX:     mtc1    $6, $f0
+; 32-NOFPXX:     mtc1    $7, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $6, 0($sp)
+; 32-FPXX:       sw      $7, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $6, $f0
+; 32R2-NOFPXX:   mthc1   $7, $f0
+
+; 32R2-FPXX:     mtc1    $6, $f0
+; 32R2-FPXX:     mthc1   $7, $f0
+
+; 4-NOFPXX:      mov.d   $f0, $f13
+
+; 64-NOFPXX:     mov.d   $f0, $f13
+}
+
+define double @test3(float %f1, float %f2, double %d) {
+  ret double %d
+
+; ALL-LABEL: test3:
+
+; 32-NOFPXX:     mtc1    $6, $f0
+; 32-NOFPXX:     mtc1    $7, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $6, 0($sp)
+; 32-FPXX:       sw      $7, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $6, $f0
+; 32R2-NOFPXX:   mthc1   $7, $f0
+
+; 32R2-FPXX:     mtc1    $6, $f0
+; 32R2-FPXX:     mthc1   $7, $f0
+
+; 4-NOFPXX:      mov.d   $f0, $f14
+
+; 64-NOFPXX:     mov.d   $f0, $f14
+}
+
+define double @test4(float %f, double %d, ...) {
+  ret double %d
+
+; ALL-LABEL: test4:
+
+; 32-NOFPXX:     mtc1    $6, $f0
+; 32-NOFPXX:     mtc1    $7, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $6, 0($sp)
+; 32-FPXX:       sw      $7, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $6, $f0
+; 32R2-NOFPXX:   mthc1   $7, $f0
+
+; 32R2-FPXX:     mtc1    $6, $f0
+; 32R2-FPXX:     mthc1   $7, $f0
+
+; 4-NOFPXX:      mov.d   $f0, $f13
+
+; 64-NOFPXX:     mov.d   $f0, $f13
+}
+
+define double @test5() {
+  ret double 0.000000e+00
+
+; ALL-LABEL: test5:
+
+; 32-NOFPXX:     mtc1    $zero, $f0
+; 32-NOFPXX:     mtc1    $zero, $f1
+
+; 32-FPXX:       addiu   $sp, $sp, -8
+; 32-FPXX:       sw      $zero, 0($sp)
+; 32-FPXX:       sw      $zero, 4($sp)
+; 32-FPXX:       ldc1    $f0, 0($sp)
+
+; 32R2-NOFPXX:   mtc1    $zero, $f0
+; 32R2-NOFPXX:   mthc1   $zero, $f0
+
+; 32R2-FPXX:     mtc1    $zero, $f0
+; 32R2-FPXX:     mthc1   $zero, $f0
+
+; 4-NOFPXX:      dmtc1 $zero, $f0
+
+; 64-NOFPXX:     dmtc1 $zero, $f0
+}
+
+define double @test6(double %a, double %b, ...) {
+  %1 = fsub double %a, %b
+  ret double %1
+
+; ALL-LABEL:     test6:
+
+; 32-NOFPXX-DAG:     mtc1    $4, $[[T0:f[0-9]+]]
+; 32-NOFPXX-DAG:     mtc1    $5, ${{f[0-9]*[13579]}}
+; 32-NOFPXX-DAG:     mtc1    $6, $[[T1:f[0-9]+]]
+; 32-NOFPXX-DAG:     mtc1    $7, ${{f[0-9]*[13579]}}
+; 32-NOFPXX:         sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32-FPXX:           addiu   $sp, $sp, -8
+; 32-FPXX:           sw      $6, 0($sp)
+; 32-FPXX:           sw      $7, 4($sp)
+; 32-FPXX:           ldc1    $[[T1:f[0-9]+]], 0($sp)
+; 32-FPXX:           sw      $4, 0($sp)
+; 32-FPXX:           sw      $5, 4($sp)
+; 32-FPXX:           ldc1    $[[T0:f[0-9]+]], 0($sp)
+; 32-FPXX:           sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32R2-NOFPXX-DAG:   mtc1    $4, $[[T0:f[0-9]+]]
+; 32R2-NOFPXX-DAG:   mthc1   $5, $[[T0]]
+; 32R2-NOFPXX-DAG:   mtc1    $6, $[[T1:f[0-9]+]]
+; 32R2-NOFPXX-DAG:   mthc1   $7, $[[T1]]
+; 32R2-NOFPXX:       sub.d   $f0, $[[T0]], $[[T1]]
+
+; 32R2-FPXX-DAG:     mtc1    $4, $[[T0:f[0-9]+]]
+; 32R2-FPXX-DAG:     mthc1   $5, $[[T0]]
+; 32R2-FPXX-DAG:     mtc1    $6, $[[T1:f[0-9]+]]
+; 32R2-FPXX-DAG:     mthc1   $7, $[[T1]]
+; 32R2-FPXX:         sub.d   $f0, $[[T0]], $[[T1]]
+
+; floats/doubles are not passed in integer registers for n64, so dmtc1 is not used.
+; 4-NOFPXX:          sub.d   $f0, $f12, $f13
+
+; floats/doubles are not passed in integer registers for n64, so dmtc1 is not used.
+; 64-NOFPXX:         sub.d   $f0, $f12, $f13
+}
+
+define double @move_from1(double %d) {
+  %1 = call double @dbl()
+  %2 = call double @test2(i32 0, double %1)
+  ret double %2
+
+; ALL-LABEL:   move_from1:
+
+; 32-NOFPXX-DAG:   mfc1    $6, $f0
+; 32-NOFPXX-DAG:   mfc1    $7, $f1
+
+; 32-FPXX:         addiu   $sp, $sp, -32
+; 32-FPXX:         sdc1    $f0, 16($sp)
+; 32-FPXX:         lw      $6, 16($sp)
+; FIXME: This store is redundant
+; 32-FPXX:         sdc1    $f0, 16($sp)
+; 32-FPXX:         lw      $7, 20($sp)
+
+; 32R2-NOFPXX-DAG: mfc1    $6, $f0
+; 32R2-NOFPXX-DAG: mfhc1   $7, $f0
+
+; 32R2-FPXX-DAG:   mfc1    $6, $f0
+; 32R2-FPXX-DAG:   mfhc1   $7, $f0
+
+; floats/doubles are not passed in integer registers for n64, so dmfc1 is not used.
+; We can't use inline assembly to force a copy either because trying to force
+; a copy to a GPR this way fails with "couldn't allocate input reg for
+; constraint 'r'". It therefore seems impossible to test the generation of dmfc1
+; in a simple test.
+; 4-NOFPXX:        mov.d   $f13, $f0
+
+; floats/doubles are not passed in integer registers for n64, so dmfc1 is not used.
+; We can't use inline assembly to force a copy either because trying to force
+; a copy to a GPR this way fails with "couldn't allocate input reg for
+; constraint 'r'". It therefore seems impossible to test the generation of dmfc1
+; in a simple test.
+; 64-NOFPXX:       mov.d   $f13, $f0
+}

diff --git a/test/CodeGen/Mips/gpreg-lazy-binding.ll b/test/CodeGen/Mips/gpreg-lazy-binding.ll
index 88e596b..3a636d8 100644
--- a/test/CodeGen/Mips/gpreg-lazy-binding.ll
+++ b/test/CodeGen/Mips/gpreg-lazy-binding.ll

@@ -25,3 +25,11 @@
   ret void
 }
 
+define void @no_lazy(void (i32)* %pf) {
+
+; CHECK-LABEL:  no_lazy
+; CHECK-NOT:    gp_disp
+
+  tail call void %pf(i32 1)
+  ret void
+}

diff --git a/test/CodeGen/Mips/hfptrcall.ll b/test/CodeGen/Mips/hfptrcall.ll
index 9df8d90..683952d 100644
--- a/test/CodeGen/Mips/hfptrcall.ll
+++ b/test/CodeGen/Mips/hfptrcall.ll

@@ -118,8 +118,8 @@
 
 declare i32 @printf(i8*, ...) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 
 

diff --git a/test/CodeGen/Mips/init-array.ll b/test/CodeGen/Mips/init-array.ll
index f96ce26..1ca182d 100644
--- a/test/CodeGen/Mips/init-array.ll
+++ b/test/CodeGen/Mips/init-array.ll

@@ -1,4 +1,4 @@
-; RUN: llc -mtriple mipsel-unknown-linux -use-init-array < %s | FileCheck  %s
+; RUN: llc -mtriple mipsel-unknown-linux < %s | FileCheck  %s
 
 target triple = "mipsel-unknown-linux"
 

diff --git a/test/CodeGen/Mips/inlineasm-operand-code.ll b/test/CodeGen/Mips/inlineasm-operand-code.ll
index 6512851..3d9dec7 100644
--- a/test/CodeGen/Mips/inlineasm-operand-code.ll
+++ b/test/CodeGen/Mips/inlineasm-operand-code.ll

@@ -65,6 +65,33 @@
 ;CHECK_LITTLE_32:    addiu ${{[0-9]+}},${{[0-9]+}},$0
 ;CHECK_LITTLE_32:    #NO_APP
   tail call i32 asm sideeffect "addiu $0,$1,${2:z}", "=r,r,I"(i32 7, i32 0) nounwind
+
+; z with non-zero and the "r"(register) and "J"(integer zero) constraints
+;CHECK_LITTLE_32:    #APP
+;CHECK_LITTLE_32:    mtc0 ${{[1-9][0-9]?}}, ${{[0-9]+}}
+;CHECK_LITTLE_32:    #NO_APP
+  call void asm sideeffect "mtc0 ${0:z}, $$12", "Jr"(i32 7) nounwind
+
+; z with zero and the "r"(register) and "J"(integer zero) constraints
+;CHECK_LITTLE_32:    #APP
+;CHECK_LITTLE_32:    mtc0 $0, ${{[0-9]+}}
+;CHECK_LITTLE_32:    #NO_APP
+  call void asm sideeffect "mtc0 ${0:z}, $$12", "Jr"(i32 0) nounwind
+
+; z with non-zero and just the "r"(register) constraint
+;CHECK_LITTLE_32:    #APP
+;CHECK_LITTLE_32:    mtc0 ${{[1-9][0-9]?}}, ${{[0-9]+}}
+;CHECK_LITTLE_32:    #NO_APP
+  call void asm sideeffect "mtc0 ${0:z}, $$12", "r"(i32 7) nounwind
+
+; z with zero and just the "r"(register) constraint
+; FIXME: Check for $0, instead of other registers.
+;        We should be using $0 directly in this case, not real registers.
+;        When the materialization of 0 gets fixed, this test will fail.
+;CHECK_LITTLE_32:    #APP
+;CHECK_LITTLE_32:    mtc0 ${{[1-9][0-9]?}}, ${{[0-9]+}}
+;CHECK_LITTLE_32:    #NO_APP
+  call void asm sideeffect "mtc0 ${0:z}, $$12", "r"(i32 0) nounwind
   ret i32 0
 }
 

diff --git a/test/CodeGen/Mips/llvm-ir/mul.ll b/test/CodeGen/Mips/llvm-ir/mul.ll
new file mode 100644
index 0000000..1674124
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/mul.ll

@@ -0,0 +1,181 @@
+; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=M2
+; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=32R1-R2 -check-prefix=32R1
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=32R1-R2 -check-prefix=32R2
+; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=32R6
+; RUN: llc < %s -march=mips64 -mcpu=mips4 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=M4
+; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=64R1-R2 -check-prefix=64R1
+; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s \
+; RUN:    -check-prefix=ALL -check-prefix=64R1-R2 -check-prefix=64R2
+; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \
+; RUN:     -check-prefix=ALL -check-prefix=64R6
+
+define signext i1 @mul_i1(i1 signext %a, i1 signext %b) {
+entry:
+; ALL-LABEL: mul_i1:
+
+  ; M2:         mult    $4, $5
+  ; M2:         mflo    $[[T0:[0-9]+]]
+  ; M2:         sll     $[[T0]], $[[T0]], 31
+  ; M2:         sra     $2, $[[T0]], 31
+
+  ; 32R1-R2:    mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R1-R2:    sll     $[[T0]], $[[T0]], 31
+  ; 32R1-R2:    sra     $2, $[[T0]], 31
+
+  ; 32R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R6:       sll     $[[T0]], $[[T0]], 31
+  ; 32R6:       sra     $2, $[[T0]], 31
+
+  ; M4:         mult    $4, $5
+  ; M4:         mflo    $[[T0:[0-9]+]]
+  ; M4:         sll     $[[T0]], $[[T0]], 31
+  ; M4:         sra     $2, $[[T0]], 31
+
+  ; 64R1-R2:    mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R1-R2:    sll     $[[T0]], $[[T0]], 31
+  ; 64R1-R2:    sra     $2, $[[T0]], 31
+
+  ; 64R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R6:       sll     $[[T0]], $[[T0]], 31
+  ; 64R6:       sra     $2, $[[T0]], 31
+
+  %r = mul i1 %a, %b
+  ret i1 %r
+}
+
+define signext i8 @mul_i8(i8 signext %a, i8 signext %b) {
+entry:
+; ALL-LABEL: mul_i8:
+
+  ; M2:         mult    $4, $5
+  ; M2:         mflo    $[[T0:[0-9]+]]
+  ; M2:         sll     $[[T0]], $[[T0]], 24
+  ; M2:         sra     $2, $[[T0]], 24
+
+  ; 32R1:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R1:       sll     $[[T0]], $[[T0]], 24
+  ; 32R1:       sra     $2, $[[T0]], 24
+
+  ; 32R2:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R2:       seb     $2, $[[T0]]
+
+  ; 32R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R6:       seb     $2, $[[T0]]
+
+  ; M4:         mult    $4, $5
+  ; M4:         mflo    $[[T0:[0-9]+]]
+  ; M4:         sll     $[[T0]], $[[T0]], 24
+  ; M4:         sra     $2, $[[T0]], 24
+
+  ; 64R1:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R1:       sll     $[[T0]], $[[T0]], 24
+  ; 64R1:       sra     $2, $[[T0]], 24
+
+  ; 64R2:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R2:       seb     $2, $[[T0]]
+
+  ; 64R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R6:       seb     $2, $[[T0]]
+  %r = mul i8 %a, %b
+  ret i8 %r
+}
+
+define signext i16 @mul_i16(i16 signext %a, i16 signext %b) {
+entry:
+; ALL-LABEL: mul_i16:
+
+  ; M2:         mult    $4, $5
+  ; M2:         mflo    $[[T0:[0-9]+]]
+  ; M2:         sll     $[[T0]], $[[T0]], 16
+  ; M2:         sra     $2, $[[T0]], 16
+
+  ; 32R1:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R1:       sll     $[[T0]], $[[T0]], 16
+  ; 32R1:       sra     $2, $[[T0]], 16
+
+  ; 32R2:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R2:       seh     $2, $[[T0]]
+
+  ; 32R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 32R6:       seh     $2, $[[T0]]
+
+  ; M4:         mult    $4, $5
+  ; M4:         mflo    $[[T0:[0-9]+]]
+  ; M4:         sll     $[[T0]], $[[T0]], 16
+  ; M4:         sra     $2, $[[T0]], 16
+
+  ; 64R1:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R1:       sll     $[[T0]], $[[T0]], 16
+  ; 64R1:       sra     $2, $[[T0]], 16
+
+  ; 64R2:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R2:       seh     $2, $[[T0]]
+
+  ; 64R6:       mul     $[[T0:[0-9]+]], $4, $5
+  ; 64R6:       seh     $2, $[[T0]]
+  %r = mul i16 %a, %b
+  ret i16 %r
+}
+
+define signext i32 @mul_i32(i32 signext %a, i32 signext %b) {
+entry:
+; ALL-LABEL: mul_i32:
+
+  ; M2:         mult    $4, $5
+  ; M2:         mflo    $2
+
+  ; 32R1-R2:    mul     $2, $4, $5
+  ; 32R6:       mul     $2, $4, $5
+
+  ; 64R1-R2:    mul     $2, $4, $5
+  ; 64R6:       mul     $2, $4, $5
+  %r = mul i32 %a, %b
+  ret i32 %r
+}
+
+define signext i64 @mul_i64(i64 signext %a, i64 signext %b) {
+entry:
+; ALL-LABEL: mul_i64:
+
+  ; M2:         mult    $4, $7
+  ; M2:         mflo    $[[T0:[0-9]+]]
+  ; M2:         mult    $5, $6
+  ; M2:         mflo    $[[T1:[0-9]+]]
+  ; M2:         multu   $5, $7
+  ; M2:         mflo    $3
+  ; M2:         mfhi    $4
+  ; M2:         addu    $[[T2:[0-9]+]], $4, $[[T1]]
+  ; M2:         addu    $2, $[[T2]], $[[T0]]
+
+  ; 32R1-R2:    multu   $5, $7
+  ; 32R1-R2:    mflo    $3
+  ; 32R1-R2:    mfhi    $[[T0:[0-9]+]]
+  ; 32R1-R2:    mul     $[[T1:[0-9]+]], $4, $7
+  ; 32R1-R2:    mul     $[[T2:[0-9]+]], $5, $6
+  ; 32R1-R2:    addu    $[[T0]], $[[T0]], $[[T2]]
+  ; 32R1-R2:    addu    $2, $[[T0]], $[[T1]]
+
+  ; 32R6:       mul     $[[T0:[0-9]+]], $5, $6
+  ; 32R6:       muhu    $[[T1:[0-9]+]], $5, $7
+  ; 32R6:       addu    $[[T0]], $[[T1]], $[[T0]]
+  ; 32R6:       mul     $[[T2:[0-9]+]], $4, $7
+  ; 32R6:       addu    $2, $[[T0]], $[[T2]]
+  ; 32R6:       mul     $3, $5, $7
+
+  ; M4:         dmult   $4, $5
+  ; M4:         mflo    $2
+
+  ; 64R1-R2:    dmult   $4, $5
+  ; 64R1-R2:    mflo    $2
+
+  ; 64R6:       dmul    $2, $4, $5
+
+  %r = mul i64 %a, %b
+  ret i64 %r
+}

diff --git a/test/CodeGen/Mips/load-store-left-right.ll b/test/CodeGen/Mips/load-store-left-right.ll
index a3f5ebf..f6d0e8d 100644
--- a/test/CodeGen/Mips/load-store-left-right.ll
+++ b/test/CodeGen/Mips/load-store-left-right.ll

@@ -47,7 +47,7 @@
   ret i32 %0
 }
 
-define void @store_SI(i32 %a) nounwind {
+define void @store_SI(i32 signext %a) nounwind {
 entry:
 ; ALL-LABEL: store_SI:
 
@@ -201,7 +201,7 @@
   ret void
 }
 
-define void @store_SI_trunc_from_i64(i32 %a) nounwind {
+define void @store_SI_trunc_from_i64(i32 signext %a) nounwind {
 entry:
 ; ALL-LABEL: store_SI_trunc_from_i64:
 

diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index a403744..b9b52be 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll

@@ -13,7 +13,7 @@
 
 @x = external global i32
 
-define void @test1(i32 %s) {
+define void @test1(i32 signext %s) {
 entry:
   %cmp = icmp eq i32 %s, 0
   br i1 %cmp, label %end, label %then

diff --git a/test/CodeGen/Mips/madd-msub.ll b/test/CodeGen/Mips/madd-msub.ll
index 8222967..b0c3ff6 100644
--- a/test/CodeGen/Mips/madd-msub.ll
+++ b/test/CodeGen/Mips/madd-msub.ll

@@ -76,26 +76,14 @@
 ; 32R6-DAG:      muhu $[[T3:[0-9]+]], ${{[45]}}, ${{[45]}}
 ; 32R6-DAG:      addu $2, $[[T3]], $[[T2]]
 
-; 64-DAG:        dsll $[[T0:[0-9]+]], $4, 32
-; 64-DAG:        dsrl $[[T1:[0-9]+]], $[[T0]], 32
-; 64-DAG:        dsll $[[T2:[0-9]+]], $5, 32
-; 64-DAG:        dsrl $[[T3:[0-9]+]], $[[T2]], 32
-; 64-DAG:        d[[m:m]]ult $[[T3]], $[[T1]]
-; 64-DAG:        [[m]]flo $[[T4:[0-9]+]]
-; 64-DAG:        dsll $[[T5:[0-9]+]], $6, 32
-; 64-DAG:        dsrl $[[T6:[0-9]+]], $[[T5]], 32
-; 64-DAG:        daddu $2, $[[T4]], $[[T6]]
+; 64-DAG:        d[[m:m]]ult $5, $4
+; 64-DAG:        [[m]]flo $[[T0:[0-9]+]]
+; 64-DAG:        daddu $2, $[[T0]], $6
 
-; 64R6-DAG:      dsll $[[T0:[0-9]+]], $4, 32
-; 64R6-DAG:      dsrl $[[T1:[0-9]+]], $[[T0]], 32
-; 64R6-DAG:      dsll $[[T2:[0-9]+]], $5, 32
-; 64R6-DAG:      dsrl $[[T3:[0-9]+]], $[[T2]], 32
-; 64R6-DAG:      dmul $[[T4:[0-9]+]], $[[T3]], $[[T1]]
-; 64R6-DAG:      dsll $[[T5:[0-9]+]], $6, 32
-; 64R6-DAG:      dsrl $[[T6:[0-9]+]], $[[T5]], 32
-; 64R6-DAG:      daddu $2, $[[T4]], $[[T6]]
+; 64R6-DAG:      dmul $[[T0:[0-9]+]], $5, $4
+; 64R6-DAG:      daddu $2, $[[T0]], $6
 
-define i64 @madd2(i32 %a, i32 %b, i32 %c) nounwind readnone {
+define i64 @madd2(i32 zeroext %a, i32 zeroext %b, i32 zeroext %c) nounwind readnone {
 entry:
   %conv = zext i32 %a to i64
   %conv2 = zext i32 %b to i64
@@ -214,26 +202,14 @@
 ; 32R6-DAG:      negu $2, $[[T3]]
 ; 32R6-DAG:      subu $3, $6, $[[T1]]
 
-; 64-DAG:        dsll $[[T0:[0-9]+]], $4, 32
-; 64-DAG:        dsrl $[[T1:[0-9]+]], $[[T0]], 32
-; 64-DAG:        dsll $[[T2:[0-9]+]], $5, 32
-; 64-DAG:        dsrl $[[T3:[0-9]+]], $[[T2]], 32
-; 64-DAG:        d[[m:m]]ult $[[T3]], $[[T1]]
-; 64-DAG:        [[m]]flo $[[T4:[0-9]+]]
-; 64-DAG:        dsll $[[T5:[0-9]+]], $6, 32
-; 64-DAG:        dsrl $[[T6:[0-9]+]], $[[T5]], 32
-; 64-DAG:        dsubu $2, $[[T6]], $[[T4]]
+; 64-DAG:        d[[m:m]]ult $5, $4
+; 64-DAG:        [[m]]flo $[[T0:[0-9]+]]
+; 64-DAG:        dsubu $2, $6, $[[T0]]
 
-; 64R6-DAG:      dsll $[[T0:[0-9]+]], $4, 32
-; 64R6-DAG:      dsrl $[[T1:[0-9]+]], $[[T0]], 32
-; 64R6-DAG:      dsll $[[T2:[0-9]+]], $5, 32
-; 64R6-DAG:      dsrl $[[T3:[0-9]+]], $[[T2]], 32
-; 64R6-DAG:      dmul $[[T4:[0-9]+]], $[[T3]], $[[T1]]
-; 64R6-DAG:      dsll $[[T5:[0-9]+]], $6, 32
-; 64R6-DAG:      dsrl $[[T6:[0-9]+]], $[[T5]], 32
-; 64R6-DAG:      dsubu $2, $[[T6]], $[[T4]]
+; 64R6-DAG:      dmul $[[T0:[0-9]+]], $5, $4
+; 64R6-DAG:      dsubu $2, $6, $[[T0]]
 
-define i64 @msub2(i32 %a, i32 %b, i32 %c) nounwind readnone {
+define i64 @msub2(i32 zeroext %a, i32 zeroext %b, i32 zeroext %c) nounwind readnone {
 entry:
   %conv = zext i32 %c to i64
   %conv2 = zext i32 %a to i64

diff --git a/test/CodeGen/Mips/micromips-addiu.ll b/test/CodeGen/Mips/micromips-addiu.ll
new file mode 100644
index 0000000..c5bee34
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-addiu.ll

@@ -0,0 +1,32 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+@x = global i32 65504, align 4
+@y = global i32 60929, align 4
+@z = global i32 60929, align 4
+@.str = private unnamed_addr constant [7 x i8] c"%08x \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+  %0 = load i32* @x, align 4
+  %addiu1 = add i32 %0, -7
+  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %addiu1)
+
+  %1 = load i32* @y, align 4
+  %addiu2 = add i32 %1, 55
+  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %addiu2)
+
+  %2 = load i32* @z, align 4
+  %addiu3 = add i32 %2, 24
+  %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %addiu3)
+  ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
+
+; CHECK: addius5  ${{[0-9]+}}, -7
+; CHECK: addiu    ${{[0-9]+}}, ${{[0-9]+}}, 55
+; CHECK: addiur2  ${{[2-7]|16|17}}, ${{[2-7]|16|17}}, 24

diff --git a/test/CodeGen/Mips/micromips-andi.ll b/test/CodeGen/Mips/micromips-andi.ll
new file mode 100644
index 0000000..b82d2b0
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-andi.ll

@@ -0,0 +1,25 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+@x = global i32 65504, align 4
+@y = global i32 60929, align 4
+@.str = private unnamed_addr constant [7 x i8] c"%08x \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+  %0 = load i32* @x, align 4
+  %and1 = and i32 %0, 4
+  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %and1)
+
+  %1 = load i32* @y, align 4
+  %and2 = and i32 %1, 5
+  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds
+                                  ([7 x i8]* @.str, i32 0, i32 0), i32 %and2)
+  ret i32 0
+}
+
+declare i32 @printf(i8*, ...)
+
+; CHECK: andi16 ${{[2-7]|16|17}}, ${{[2-7]|16|17}}
+; CHECK: andi   ${{[0-9]+}}, ${{[0-9]+}}

diff --git a/test/CodeGen/Mips/micromips-delay-slot.ll b/test/CodeGen/Mips/micromips-delay-slot.ll
new file mode 100644
index 0000000..4bab97a
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-delay-slot.ll

@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+; Function Attrs: nounwind uwtable (stale: groups #0/#1 are not defined in this file)
+define i32 @foo(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  %0 = load i32* %a.addr, align 4
+  %shl = shl i32 %0, 2
+  %call = call i32 @bar(i32 %shl)
+  ret i32 %call
+}
+
+declare i32 @bar(i32) #1
+
+; CHECK: nop
+

diff --git a/test/CodeGen/Mips/micromips-rdhwr-directives.ll b/test/CodeGen/Mips/micromips-rdhwr-directives.ll
new file mode 100644
index 0000000..af40a87
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-rdhwr-directives.ll

@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=static < %s \
+; RUN:   -mattr=+micromips | FileCheck %s
+
+@a = external thread_local global i32
+
+define i32 @foo() nounwind readonly {
+entry:
+; CHECK: .set  push
+; CHECK: .set  mips32r2
+; CHECK: rdhwr
+; CHECK: .set  pop
+
+  %0 = load i32* @a, align 4
+  ret i32 %0
+}

diff --git a/test/CodeGen/Mips/micromips-shift.ll b/test/CodeGen/Mips/micromips-shift.ll
new file mode 100644
index 0000000..8215010
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-shift.ll

@@ -0,0 +1,44 @@
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+micromips \
+; RUN:   -relocation-model=pic -O3 < %s | FileCheck %s
+
+@a = global i32 10, align 4
+@b = global i32 0, align 4
+@c = global i32 10, align 4
+@d = global i32 0, align 4
+
+define i32 @shift_left() nounwind {
+entry:
+  %0 = load i32* @a, align 4
+  %shl = shl i32 %0, 4
+  store i32 %shl, i32* @b, align 4
+
+  %1 = load i32* @c, align 4
+  %shl1 = shl i32 %1, 10
+  store i32 %shl1, i32* @d, align 4
+
+  ret i32 0
+}
+
+; CHECK: sll16  ${{[2-7]|16|17}}, ${{[2-7]|16|17}}, {{[0-7]}}
+; CHECK: sll    ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}
+
+@i = global i32 10654, align 4
+@j = global i32 0, align 4
+@m = global i32 10, align 4
+@n = global i32 0, align 4
+
+define i32 @shift_right() nounwind {
+entry:
+  %0 = load i32* @i, align 4
+  %shr = lshr i32 %0, 4
+  store i32 %shr, i32* @j, align 4
+
+  %1 = load i32* @m, align 4
+  %shr1 = lshr i32 %1, 10
+  store i32 %shr1, i32* @n, align 4
+
+  ret i32 0
+}
+
+; CHECK: srl16  ${{[2-7]|16|17}}, ${{[2-7]|16|17}}, {{[0-7]}}
+; CHECK: srl    ${{[0-9]+}}, ${{[0-9]+}}, {{[0-9]+}}

diff --git a/test/CodeGen/Mips/mips16-hf-attr-2.ll b/test/CodeGen/Mips/mips16-hf-attr-2.ll
new file mode 100644
index 0000000..60c6eaa
--- /dev/null
+++ b/test/CodeGen/Mips/mips16-hf-attr-2.ll

@@ -0,0 +1,45 @@
+; Check that stub generation for mips16 hard-float mode does not depend
+; on the value of the callee's 'use-soft-float' attribute.
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel \
+; RUN:     -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s
+
+define void @bar_sf() #1 {
+; CHECK: bar_sf:
+entry:
+  %call1 = call float @foo(float 1.000000e+00)
+; CHECK: lw $3, %call16(foo)($2)
+; CHECK-NOT: lw $5, %got(__mips16_call_stub_sf_1)($3)
+  ret void
+}
+
+define void @bar_hf() #0 {
+; CHECK: bar_hf:
+entry:
+  %call1 = call float @foo(float 1.000000e+00)
+; CHECK: lw $2, %call16(foo)($3)
+; CHECK: lw $5, %got(__mips16_call_stub_sf_1)($3)
+  ret void
+}
+
+declare float @foo(float) #2
+
+attributes #0 = {
+  nounwind
+  "less-precise-fpmad"="false" "no-frame-pointer-elim"="true"
+  "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false"
+  "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
+  "unsafe-fp-math"="false" "use-soft-float"="false"
+}
+attributes #1 = {
+  nounwind
+  "less-precise-fpmad"="false" "no-frame-pointer-elim"="true"
+  "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false"
+  "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
+  "unsafe-fp-math"="false" "use-soft-float"="true"
+}
+attributes #2 = {
+  "less-precise-fpmad"="false" "no-frame-pointer-elim"="true"
+  "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false"
+  "no-nans-fp-math"="false" "stack-protector-buffer-size"="8"
+  "unsafe-fp-math"="false" "use-soft-float"="true"
+}

diff --git a/test/CodeGen/Mips/mips16-hf-attr.ll b/test/CodeGen/Mips/mips16-hf-attr.ll
index d9ad629..c6ad442 100644
--- a/test/CodeGen/Mips/mips16-hf-attr.ll
+++ b/test/CodeGen/Mips/mips16-hf-attr.ll

@@ -3,8 +3,8 @@
 ; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel \
 ; RUN:     -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s
 
-define void @bar_sf() #0 {
-; CHECK: bar_sf:
+define void @bar_hf() #0 {
+; CHECK: bar_hf:
 entry:
   %call1 = call float @foo(float 1.000000e+00)
 ; CHECK: lw $2, %call16(foo)($3)
@@ -12,12 +12,12 @@
   ret void
 }
 
-define void @bar_hf() #1 {
-; CHECK: bar_hf:
+define void @bar_sf() #1 {
+; CHECK: bar_sf:
 entry:
   %call1 = call float @foo(float 1.000000e+00)
-; CHECK: lw $2, %call16(foo)($3)
-; CHECK: lw $5, %got(__mips16_call_stub_sf_1)($3)
+; CHECK: lw $3, %call16(foo)($2)
+; CHECK-NOT: lw $5, %got(__mips16_call_stub_sf_1)($3)
   ret void
 }
 

diff --git a/test/CodeGen/Mips/mips64-f128.ll b/test/CodeGen/Mips/mips64-f128.ll
index 7f7d515..6987d4a 100644
--- a/test/CodeGen/Mips/mips64-f128.ll
+++ b/test/CodeGen/Mips/mips64-f128.ll

@@ -114,7 +114,7 @@
 ; ALL-LABEL: conv_LD_UInt:
 ; ALL: ld $25, %call16(__floatunsitf)
 
-define fp128 @conv_LD_UInt(i32 %a) {
+define fp128 @conv_LD_UInt(i32 signext %a) {
 entry:
   %conv = uitofp i32 %a to fp128
   ret fp128 %conv
@@ -545,7 +545,7 @@
 
 ; ALL-LABEL: load_LD_float:
 ; ALL: ld   $[[R0:[0-9]+]], %got_disp(gf1)
-; ALL: lw   $4, 0($[[R0]])
+; ALL: lwu  $4, 0($[[R0]])
 ; ALL: ld   $25, %call16(__extendsftf2)
 ; ALL: jalr $25
 
@@ -635,7 +635,7 @@
 ; CMP_CC_FMT-DAG: selnez $[[NE2:[0-9]+]], $7, $[[CC]]
 ; CMP_CC_FMT-DAG: or $4, $[[NE2]], $[[EQ2]]
 
-define fp128 @select_LD(i32 %a, i64, fp128 %b, fp128 %c) {
+define fp128 @select_LD(i32 signext %a, i64, fp128 %b, fp128 %c) {
 entry:
   %tobool = icmp ne i32 %a, 0
   %cond = select i1 %tobool, fp128 %b, fp128 %c

diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll
index 7a52c3d..ed494e9 100644
--- a/test/CodeGen/Mips/mips64-sret.ll
+++ b/test/CodeGen/Mips/mips64-sret.ll

@@ -11,7 +11,7 @@
   ret void
 }
 
-define void @bar(i32 %v, i32* noalias sret %agg.result) nounwind {
+define void @bar(i32 signext %v, i32* noalias sret %agg.result) nounwind {
 entry:
 ; CHECK-LABEL: bar:
 ; CHECK: sw $4, 0($5)

diff --git a/test/CodeGen/Mips/mno-ldc1-sdc1.ll b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
index 244b03d..db653ea 100644
--- a/test/CodeGen/Mips/mno-ldc1-sdc1.ll
+++ b/test/CodeGen/Mips/mno-ldc1-sdc1.ll

@@ -123,7 +123,7 @@
 ; 32R1-LE-PIC-DAG:    sw $[[R1]], 4(${{[0-9]+}})
 
 ; 32R2-LE-PIC-DAG:    mfc1 $[[R0:[0-9]+]], $f12
-; 32R2-LE-PIC-DAG:    mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-LE-PIC-DAG:    mfhc1 $[[R1:[0-9]+]], $f12
 ; 32R2-LE-PIC-DAG:    sw $[[R0]], 0(${{[0-9]+}})
 ; 32R2-LE-PIC-DAG:    sw $[[R1]], 4(${{[0-9]+}})
 
@@ -140,7 +140,7 @@
 ; 32R1-LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
 
 ; 32R2-LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
-; 32R2-LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-LE-STATIC-DAG: mfhc1 $[[R1:[0-9]+]], $f12
 ; 32R2-LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
 ; 32R2-LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
 ; 32R2-LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
@@ -159,7 +159,7 @@
 ; 32R1-BE-PIC-DAG:    sw $[[R0]], 4(${{[0-9]+}})
 
 ; 32R2-BE-PIC-DAG:    mfc1 $[[R0:[0-9]+]], $f12
-; 32R2-BE-PIC-DAG:    mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-BE-PIC-DAG:    mfhc1 $[[R1:[0-9]+]], $f12
 ; 32R2-BE-PIC-DAG:    sw $[[R1]], 0(${{[0-9]+}})
 ; 32R2-BE-PIC-DAG:    sw $[[R0]], 4(${{[0-9]+}})
 
@@ -225,7 +225,7 @@
 ; 32R1-DAG:      sw $[[R1]], 4(${{[0-9]+}})
 
 ; 32R2-DAG:      mfc1 $[[R0:[0-9]+]], $f12
-; 32R2-DAG:      mfc1 $[[R1:[0-9]+]], $f13
+; 32R2-DAG:      mfhc1 $[[R1:[0-9]+]], $f12
 ; 32R2-DAG:      sw $[[R0]], 0(${{[0-9]+}})
 ; 32R2-DAG:      sw $[[R1]], 4(${{[0-9]+}})
 

diff --git a/test/CodeGen/Mips/msa/arithmetic_float.ll b/test/CodeGen/Mips/msa/arithmetic_float.ll
index 86e57ac..9aae284 100644
--- a/test/CodeGen/Mips/msa/arithmetic_float.ll
+++ b/test/CodeGen/Mips/msa/arithmetic_float.ll

@@ -276,8 +276,8 @@
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = tail call <4 x float> @llvm.exp2.v4f32 (<4 x float> %1)
   %3 = fmul <4 x float> <float 2.0, float 2.0, float 2.0, float 2.0>, %2
-  ; CHECK-DAG: lui [[R3:\$[0-9]+]], 16384
-  ; CHECK-DAG: fill.w [[R4:\$w[0-9]+]], [[R3]]
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: ffint_u.w [[R4:\$w[0-9]+]], [[R3]]
   ; CHECK-DAG: fexp2.w [[R5:\$w[0-9]+]], [[R4]], [[R1]]
   store <4 x float> %3, <4 x float>* %c
   ; CHECK-DAG: st.w [[R5]], 0($4)
@@ -287,16 +287,14 @@
 }
 
 define void @fexp2_v2f64_2(<2 x double>* %c, <2 x double>* %a) nounwind {
-  ; CHECK:      .8byte 4611686018427387904
-  ; CHECK-NEXT: .8byte 4611686018427387904
   ; CHECK: fexp2_v2f64_2:
 
   %1 = load <2 x double>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
   %2 = tail call <2 x double> @llvm.exp2.v2f64 (<2 x double> %1)
   %3 = fmul <2 x double> <double 2.0, double 2.0>, %2
-  ; CHECK-DAG: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[G_PTR]])
+  ; CHECK-DAG: ldi.d [[R2:\$w[0-9]+]], 1
+  ; CHECK-DAG: ffint_u.d [[R3:\$w[0-9]+]], [[R2]]
   ; CHECK-DAG: fexp2.d [[R4:\$w[0-9]+]], [[R3]], [[R1]]
   store <2 x double> %3, <2 x double>* %c
   ; CHECK-DAG: st.d [[R4]], 0($4)

diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
index 07e67bf..ebec465 100644
--- a/test/CodeGen/Mips/msa/frameindex.ll
+++ b/test/CodeGen/Mips/msa/frameindex.ll

@@ -36,10 +36,10 @@
   %2 = alloca [497 x i8] ; Push the frame just over 512 bytes
 
   %3 = load volatile <16 x i8>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 512
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512
   ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <16 x i8> %3, <16 x i8>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 512
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512
   ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
 
   ret void
@@ -53,12 +53,12 @@
   %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
 
   %3 = load volatile <16 x i8>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <16 x i8> %3, <16 x i8>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
 
   ret void
@@ -72,12 +72,12 @@
   %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
 
   %3 = load volatile <16 x i8>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <16 x i8> %3, <16 x i8>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
 
   ret void
@@ -107,10 +107,10 @@
   %5 = getelementptr [2 x <8 x i16>]* %4, i32 0, i32 0
 
   %6 = load volatile <8 x i16>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <8 x i16> %6, <8 x i16>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
 
   ret void
@@ -139,10 +139,10 @@
   %2 = alloca [1009 x i8] ; Push the frame just over 1024 bytes
 
   %3 = load volatile <8 x i16>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1024
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <8 x i16> %3, <8 x i16>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1024
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024
   ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
 
   ret void
@@ -156,12 +156,12 @@
   %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
 
   %3 = load volatile <8 x i16>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <8 x i16> %3, <8 x i16>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
 
   ret void
@@ -175,12 +175,12 @@
   %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
 
   %3 = load volatile <8 x i16>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <8 x i16> %3, <8 x i16>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.h [[R1]], 0([[BASE]])
 
   ret void
@@ -210,10 +210,10 @@
   %5 = getelementptr [2 x <4 x i32>]* %4, i32 0, i32 0
 
   %6 = load volatile <4 x i32>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <4 x i32> %6, <4 x i32>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
 
   ret void
@@ -242,10 +242,10 @@
   %2 = alloca [2033 x i8] ; Push the frame just over 2048 bytes
 
   %3 = load volatile <4 x i32>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 2048
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <4 x i32> %3, <4 x i32>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 2048
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048
   ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
 
   ret void
@@ -259,12 +259,12 @@
   %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
 
   %3 = load volatile <4 x i32>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <4 x i32> %3, <4 x i32>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
 
   ret void
@@ -278,12 +278,12 @@
   %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
 
   %3 = load volatile <4 x i32>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <4 x i32> %3, <4 x i32>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.w [[R1]], 0([[BASE]])
 
   ret void
@@ -313,10 +313,10 @@
   %5 = getelementptr [2 x <2 x i64>]* %4, i32 0, i32 0
 
   %6 = load volatile <2 x i64>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <2 x i64> %6, <2 x i64>* %5
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 1
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1
   ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
 
   ret void
@@ -345,10 +345,10 @@
   %2 = alloca [4081 x i8] ; Push the frame just over 4096 bytes
 
   %3 = load volatile <2 x i64>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 4096
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <2 x i64> %3, <2 x i64>* %1
-  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 4096
+  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096
   ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
 
   ret void
@@ -362,12 +362,12 @@
   %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
 
   %3 = load volatile <2 x i64>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <2 x i64> %3, <2 x i64>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
 
   ret void
@@ -381,12 +381,12 @@
   %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
 
   %3 = load volatile <2 x i64>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 0([[BASE]])
   store volatile <2 x i64> %3, <2 x i64>* %1
-  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
-  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$([0-9]+|gp)]], $sp, [[R2]]
   ; MIPS32-AE: st.d [[R1]], 0([[BASE]])
 
   ret void

diff --git a/test/CodeGen/Mips/no-odd-spreg.ll b/test/CodeGen/Mips/no-odd-spreg.ll
index b42ed6a..572e940 100644
--- a/test/CodeGen/Mips/no-odd-spreg.ll
+++ b/test/CodeGen/Mips/no-odd-spreg.ll

@@ -1,10 +1,14 @@
-; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG -check-prefix=ODDSPREG-NO-EMIT
 ; RUN: llc -march=mipsel -mcpu=mips32 -mattr=+nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOODDSPREG
-; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64 < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG -check-prefix=ODDSPREG-NO-EMIT
 ; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fp64,+nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=NOODDSPREG
+; RUN: llc -march=mipsel -mcpu=mips32r6 -mattr=fpxx,-nooddspreg < %s | FileCheck %s -check-prefix=ALL -check-prefix=ODDSPREG -check-prefix=ODDSPREG-EMIT
 
-; ODDSPREG:       .module oddspreg
-; NOODDSPREG:     .module nooddspreg
+; We don't emit a directive unless we need to. This is to support versions of
+; GAS which do not support the directive.
+; ODDSPREG-EMIT:        .module oddspreg
+; ODDSPREG-NO-EMIT-NOT: .module oddspreg
+; NOODDSPREG:           .module nooddspreg
 
 define float @two_floats(float %a) {
 entry:

diff --git a/test/CodeGen/Mips/nomips16.ll b/test/CodeGen/Mips/nomips16.ll
index 0affb16..5f7d74e 100644
--- a/test/CodeGen/Mips/nomips16.ll
+++ b/test/CodeGen/Mips/nomips16.ll

@@ -33,6 +33,6 @@
 ; CHECK: 	.end	nofoo
 
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 

diff --git a/test/CodeGen/Mips/o32_cc.ll b/test/CodeGen/Mips/o32_cc.ll
index 08e5aab..c28f9ab 100644
--- a/test/CodeGen/Mips/o32_cc.ll
+++ b/test/CodeGen/Mips/o32_cc.ll

@@ -1,12 +1,13 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
-; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s
-; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=FP32EL %s
-; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck -check-prefix=FP64EL %s
+; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=ALL %s
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck -check-prefix=ALL %s
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck -check-prefix=ALL -check-prefix=NO-MFHC1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r2              < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-MFHC1 %s
+; RUN: llc -march=mipsel -mcpu=mips32r2 -mattr=+fp64 < %s | FileCheck -check-prefix=ALL -check-prefix=HAS-MFHC1 %s
 
 ; $f12, $f14
-; CHECK-LABEL: testlowercall0:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: ldc1 $f14, %lo
+; ALL-LABEL: testlowercall0:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       ldc1 $f14, %lo
 define void @testlowercall0() nounwind {
 entry:
   tail call void @f0(double 5.000000e+00, double 6.000000e+00) nounwind
@@ -16,9 +17,9 @@
 declare void @f0(double, double)
 
 ; $f12, $f14
-; CHECK-LABEL: testlowercall1:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
+; ALL-LABEL: testlowercall1:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
 define void @testlowercall1() nounwind {
 entry:
   tail call void @f1(float 8.000000e+00, float 9.000000e+00) nounwind
@@ -28,9 +29,9 @@
 declare void @f1(float, float)
 
 ; $f12, $f14
-; CHECK-LABEL: testlowercall2:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: ldc1 $f14, %lo
+; ALL-LABEL: testlowercall2:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       ldc1 $f14, %lo
 define void @testlowercall2() nounwind {
 entry:
   tail call void @f2(float 8.000000e+00, double 6.000000e+00) nounwind
@@ -40,9 +41,9 @@
 declare void @f2(float, double)
 
 ; $f12, $f14
-; CHECK-LABEL: testlowercall3:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
+; ALL-LABEL: testlowercall3:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
 define void @testlowercall3() nounwind {
 entry:
   tail call void @f3(double 5.000000e+00, float 9.000000e+00) nounwind
@@ -52,11 +53,11 @@
 declare void @f3(double, float)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall4:
-; CHECK-DAG: addiu $4, $zero, 12
-; CHECK-DAG: addiu $5, $zero, 13
-; CHECK-DAG: addiu $6, $zero, 14
-; CHECK-DAG: addiu $7, $zero, 15
+; ALL-LABEL: testlowercall4:
+; ALL-DAG:       addiu $4, $zero, 12
+; ALL-DAG:       addiu $5, $zero, 13
+; ALL-DAG:       addiu $6, $zero, 14
+; ALL-DAG:       addiu $7, $zero, 15
 define void @testlowercall4() nounwind {
 entry:
   tail call void @f4(i32 12, i32 13, i32 14, i32 15) nounwind
@@ -66,11 +67,11 @@
 declare void @f4(i32, i32, i32, i32)
 
 ; $f12, $6, stack
-; CHECK-LABEL: testlowercall5:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: addiu $6, $zero, 23
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
+; ALL-LABEL: testlowercall5:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       addiu $6, $zero, 23
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 16($sp)
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 20($sp)
 define void @testlowercall5() nounwind {
 entry:
   tail call void @f5(double 1.500000e+01, i32 23, double 1.700000e+01) nounwind
@@ -80,10 +81,10 @@
 declare void @f5(double, i32, double)
 
 ; $f12, $6, $7
-; CHECK-LABEL: testlowercall6:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: addiu $6, $zero, 33
-; CHECK-DAG: addiu $7, $zero, 24
+; ALL-LABEL: testlowercall6:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       addiu $6, $zero, 33
+; ALL-DAG:       addiu $7, $zero, 24
 define void @testlowercall6() nounwind {
 entry:
   tail call void @f6(double 2.500000e+01, i32 33, i32 24) nounwind
@@ -93,10 +94,10 @@
 declare void @f6(double, i32, i32)
 
 ; $f12, $5, $6
-; CHECK-LABEL: testlowercall7:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 43
-; CHECK-DAG: addiu $6, $zero, 34
+; ALL-LABEL: testlowercall7:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       addiu $5, $zero, 43
+; ALL-DAG:       addiu $6, $zero, 34
 define void @testlowercall7() nounwind {
 entry:
   tail call void @f7(float 1.800000e+01, i32 43, i32 34) nounwind
@@ -106,12 +107,12 @@
 declare void @f7(float, i32, i32)
 
 ; $4, $5, $6, stack
-; CHECK-LABEL: testlowercall8:
-; CHECK-DAG: addiu $4, $zero, 22
-; CHECK-DAG: addiu $5, $zero, 53
-; CHECK-DAG: addiu $6, $zero, 44
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
+; ALL-LABEL: testlowercall8:
+; ALL-DAG:       addiu $4, $zero, 22
+; ALL-DAG:       addiu $5, $zero, 53
+; ALL-DAG:       addiu $6, $zero, 44
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 16($sp)
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 20($sp)
 define void @testlowercall8() nounwind {
 entry:
   tail call void @f8(i32 22, i32 53, i32 44, double 4.000000e+00) nounwind
@@ -121,11 +122,11 @@
 declare void @f8(i32, i32, i32, double)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall9:
-; CHECK-DAG: addiu $4, $zero, 32
-; CHECK-DAG: addiu $5, $zero, 63
-; CHECK-DAG: addiu $6, $zero, 54
-; CHECK-DAG: lui $7, 16688
+; ALL-LABEL: testlowercall9:
+; ALL-DAG:       addiu $4, $zero, 32
+; ALL-DAG:       addiu $5, $zero, 63
+; ALL-DAG:       addiu $6, $zero, 54
+; ALL-DAG:       lui $7, 16688
 define void @testlowercall9() nounwind {
 entry:
   tail call void @f9(i32 32, i32 63, i32 54, float 1.100000e+01) nounwind
@@ -135,15 +136,16 @@
 declare void @f9(i32, i32, i32, float)
 
 ; $4, $5, ($6, $7)
-; CHECK-LABEL: testlowercall10:
-; CHECK-DAG: addiu $4, $zero, 42
-; CHECK-DAG: addiu $5, $zero, 73
-; FP32EL-LABEL: testlowercall10:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall10:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall10:
+
+; ALL-DAG:       addiu $4, $zero, 42
+; ALL-DAG:       addiu $5, $zero, 73
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall10() nounwind {
 entry:
   tail call void @f10(i32 42, i32 73, double 2.700000e+01) nounwind
@@ -153,14 +155,14 @@
 declare void @f10(i32, i32, double)
 
 ; $4, ($6, $7)
-; CHECK-LABEL: testlowercall11:
-; CHECK-DAG: addiu $4, $zero, 52
-; FP32EL-LABEL: testlowercall11:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall11:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall11:
+; ALL-DAG:       addiu $4, $zero, 52
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall11() nounwind {
 entry:
   tail call void @f11(i32 52, double 1.600000e+01) nounwind
@@ -170,11 +172,11 @@
 declare void @f11(i32, double)
 
 ; $f12, $f14, $6, $7
-; CHECK-LABEL: testlowercall12:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
-; CHECK-DAG: lui $6, 16672
-; CHECK-DAG: lui $7, 16808
+; ALL-LABEL: testlowercall12:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
+; ALL-DAG:       lui $6, 16672
+; ALL-DAG:       lui $7, 16808
 define void @testlowercall12() nounwind {
 entry:
   tail call void @f12(float 2.800000e+01, float 1.900000e+01, float 1.000000e+01, float 2.100000e+01) nounwind
@@ -184,11 +186,11 @@
 declare void @f12(float, float, float, float)
 
 ; $f12, $5, $6, $7
-; CHECK-LABEL: testlowercall13:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 83
-; CHECK-DAG: lui $6, 16800
-; CHECK-DAG: addiu $7, $zero, 25
+; ALL-LABEL: testlowercall13:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       addiu $5, $zero, 83
+; ALL-DAG:       lui $6, 16800
+; ALL-DAG:       addiu $7, $zero, 25
 define void @testlowercall13() nounwind {
 entry:
   tail call void @f13(float 3.800000e+01, i32 83, float 2.000000e+01, i32 25) nounwind
@@ -199,10 +201,10 @@
 declare void @f13(float, i32, float, i32)
 
 ; $f12, $f14, $7
-; CHECK-LABEL: testlowercall14:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
-; CHECK-DAG: lui $7, 16880
+; ALL-LABEL: testlowercall14:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
+; ALL-DAG:       lui $7, 16880
 define void @testlowercall14() nounwind {
 entry:
   tail call void @f14(double 3.500000e+01, float 2.900000e+01, float 3.000000e+01) nounwind
@@ -212,15 +214,15 @@
 declare void @f14(double, float, float)
 
 ; $f12, $f14, ($6, $7)
-; CHECK-LABEL: testlowercall15:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
-; FP32EL-LABEL: testlowercall15:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall15:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall15:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       lwc1 $f14, %lo
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall15() nounwind {
 entry:
   tail call void @f15(float 4.800000e+01, float 3.900000e+01, double 3.700000e+01) nounwind
@@ -230,11 +232,11 @@
 declare void @f15(float, float, double)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall16:
-; CHECK-DAG: addiu $4, $zero, 62
-; CHECK-DAG: lui $5, 16964
-; CHECK-DAG: addiu $6, $zero, 64
-; CHECK-DAG: lui $7, 16888
+; ALL-LABEL: testlowercall16:
+; ALL-DAG:       addiu $4, $zero, 62
+; ALL-DAG:       lui $5, 16964
+; ALL-DAG:       addiu $6, $zero, 64
+; ALL-DAG:       lui $7, 16888
 define void @testlowercall16() nounwind {
 entry:
   tail call void @f16(i32 62, float 4.900000e+01, i32 64, float 3.100000e+01) nounwind
@@ -244,11 +246,11 @@
 declare void @f16(i32, float, i32, float)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall17:
-; CHECK-DAG: addiu $4, $zero, 72
-; CHECK-DAG: lui $5, 17004
-; CHECK-DAG: addiu $6, $zero, 74
-; CHECK-DAG: addiu $7, $zero, 35
+; ALL-LABEL: testlowercall17:
+; ALL-DAG:       addiu $4, $zero, 72
+; ALL-DAG:       lui $5, 17004
+; ALL-DAG:       addiu $6, $zero, 74
+; ALL-DAG:       addiu $7, $zero, 35
 define void @testlowercall17() nounwind {
 entry:
   tail call void @f17(i32 72, float 5.900000e+01, i32 74, i32 35) nounwind
@@ -258,11 +260,11 @@
 declare void @f17(i32, float, i32, i32)
 
 ; $4, $5, $6, $7
-; CHECK-LABEL: testlowercall18:
-; CHECK-DAG: addiu $4, $zero, 82
-; CHECK-DAG: addiu $5, $zero, 93
-; CHECK-DAG: lui $6, 16928
-; CHECK-DAG: addiu $7, $zero, 45
+; ALL-LABEL: testlowercall18:
+; ALL-DAG:       addiu $4, $zero, 82
+; ALL-DAG:       addiu $5, $zero, 93
+; ALL-DAG:       lui $6, 16928
+; ALL-DAG:       addiu $7, $zero, 45
 define void @testlowercall18() nounwind {
 entry:
   tail call void @f18(i32 82, i32 93, float 4.000000e+01, i32 45) nounwind
@@ -273,16 +275,16 @@
 
 
 ; $4, ($6, $7), stack
-; CHECK-LABEL: testlowercall20:
-; CHECK-DAG: addiu $4, $zero, 92
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
-; FP32EL-LABEL: testlowercall20:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall20:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall20:
+; ALL-DAG:       addiu $4, $zero, 92
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 16($sp)
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 20($sp)
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall20() nounwind {
 entry:
   tail call void @f20(i32 92, double 2.600000e+01, double 4.700000e+01) nounwind
@@ -292,9 +294,9 @@
 declare void @f20(i32, double, double)
 
 ; $f12, $5
-; CHECK-LABEL: testlowercall21:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 103
+; ALL-LABEL: testlowercall21:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       addiu $5, $zero, 103
 define void @testlowercall21() nounwind {
 entry:
   tail call void @f21(float 5.800000e+01, i32 103) nounwind
@@ -304,15 +306,15 @@
 declare void @f21(float, i32)
 
 ; $f12, $5, ($6, $7)
-; CHECK-LABEL: testlowercall22:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 113
-; FP32EL-LABEL: testlowercall22:
-; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
-; FP64EL-LABEL: testlowercall22:
-; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
-; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
+; ALL-LABEL: testlowercall22:
+; ALL-DAG:       lwc1 $f12, %lo
+; ALL-DAG:       addiu $5, $zero, 113
+
+; NO-MFHC1-DAG:  mfc1 $6, $f{{[0-9]+}}
+; NO-MFHC1-DAG:  mfc1 $7, $f{{[0-9]+}}
+
+; HAS-MFHC1-DAG: mfc1 $6, $f{{[0-9]+}}
+; HAS-MFHC1-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall22() nounwind {
 entry:
   tail call void @f22(float 6.800000e+01, i32 113, double 5.700000e+01) nounwind
@@ -322,9 +324,9 @@
 declare void @f22(float, i32, double)
 
 ; $f12, $6
-; CHECK-LABEL: testlowercall23:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: addiu $6, $zero, 123
+; ALL-LABEL: testlowercall23:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       addiu $6, $zero, 123
 define void @testlowercall23() nounwind {
 entry:
   tail call void @f23(double 4.500000e+01, i32 123) nounwind
@@ -334,11 +336,11 @@
 declare void @f23(double, i32)
 
 ; $f12,$6, stack
-; CHECK-LABEL: testlowercall24:
-; CHECK-DAG: ldc1 $f12, %lo
-; CHECK-DAG: addiu $6, $zero, 133
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
-; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
+; ALL-LABEL: testlowercall24:
+; ALL-DAG:       ldc1 $f12, %lo
+; ALL-DAG:       addiu $6, $zero, 133
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 16($sp)
+; ALL-DAG:       sw ${{[a-z0-9]+}}, 20($sp)
 define void @testlowercall24() nounwind {
 entry:
   tail call void @f24(double 5.500000e+01, i32 133, double 6.700000e+01) nounwind
@@ -347,19 +349,19 @@
 
 declare void @f24(double, i32, double)
 
-; CHECK-LABEL: testlowercall25:
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: lwc1 $f14, %lo
-; CHECK-DAG: lui $6
-; CHECK-DAG: lui $7
-; CHECK-DAG: lwc1 $f12, %lo
-; CHECK-DAG: addiu $5, $zero, 83
-; CHECK-DAG: lui $6
-; CHECK-DAG: addiu $7, $zero, 25
-; CHECK-DAG: addiu $4, $zero, 82
-; CHECK-DAG: addiu $5, $zero, 93
-; CHECK-DAG: lui $6
-; CHECK-DAG: addiu $7, $zero, 45
+; ALL-LABEL: testlowercall25:
+; ALL-DAG:      lwc1 $f12, %lo
+; ALL-DAG:      lwc1 $f14, %lo
+; ALL-DAG:      lui $6
+; ALL-DAG:      lui $7
+; ALL-DAG:      lwc1 $f12, %lo
+; ALL-DAG:      addiu $5, $zero, 83
+; ALL-DAG:      lui $6
+; ALL-DAG:      addiu $7, $zero, 25
+; ALL-DAG:      addiu $4, $zero, 82
+; ALL-DAG:      addiu $5, $zero, 93
+; ALL-DAG:      lui $6
+; ALL-DAG:      addiu $7, $zero, 45
 define void @testlowercall25() nounwind {
 entry:
   tail call void @f12(float 2.800000e+01, float 1.900000e+01, float 1.000000e+01, float 2.100000e+01) nounwind

diff --git a/test/CodeGen/Mips/octeon_popcnt.ll b/test/CodeGen/Mips/octeon_popcnt.ll
index 52c37f6..3432b39 100644
--- a/test/CodeGen/Mips/octeon_popcnt.ll
+++ b/test/CodeGen/Mips/octeon_popcnt.ll

@@ -6,7 +6,7 @@
   ret i8 %cnt
 ; OCTEON-LABEL: cnt8:
 ; OCTEON: jr   $ra
-; OCTEON: pop  $2, $1
+; OCTEON: pop  $2, [[R1:\$[0-9]+]]
 ; MIPS64-LABEL: cnt8:
 ; MIPS64-NOT: pop
 }
@@ -16,12 +16,12 @@
   ret i16 %cnt
 ; OCTEON-LABEL: cnt16:
 ; OCTEON: jr   $ra
-; OCTEON: pop  $2, $1
+; OCTEON: pop  $2, [[R1:\$[0-9]+]]
 ; MIPS64-LABEL: cnt16:
 ; MIPS64-NOT: pop
 }
 
-define i32 @cnt32(i32 %x) nounwind readnone {
+define i32 @cnt32(i32 zeroext %x) nounwind readnone {
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 ; OCTEON-LABEL: cnt32:

diff --git a/test/CodeGen/Mips/prevent-hoisting.ll b/test/CodeGen/Mips/prevent-hoisting.ll
index da665c2..210fe3b 100644
--- a/test/CodeGen/Mips/prevent-hoisting.ll
+++ b/test/CodeGen/Mips/prevent-hoisting.ll

@@ -10,16 +10,19 @@
 
 ; CHECK-LABEL: readLumaCoeff8x8_CABAC
 
-; The check for "addiu" instruction is added so that we can match the correct "b" instruction.
+; The check for first "addiu" instruction is added so that we can match the correct "b" instruction.
 ; CHECK:           addiu ${{[0-9]+}}, $zero, -1
 ; CHECK:           b $[[BB0:BB[0-9_]+]]
+; CHECK-NEXT:      addiu ${{[0-9]+}}, $zero, 0
 
-; Check that sll instruction that writes to $1 starts basic block.
-; CHECK:       {{BB[0-9_#]+}}: 
+; Check that at the start of a fallthrough block there is an instruction that writes to $1.
+; CHECK-NEXT:  {{BB[0-9_#]+}}: 
+; CHECK-NEXT:      lw      $[[R1:[0-9]+]], %got(assignSE2partition)($[[R2:[0-9]+]])
 ; CHECK-NEXT:      sll $1, $[[R0:[0-9]+]], 4
 
-; Check that identical sll instruction starts another basic block.
+; Check that identical instructions are at the start of a target block.
 ; CHECK:       [[BB0]]:
+; CHECK-NEXT:      lw      $[[R1]], %got(assignSE2partition)($[[R2]])
 ; CHECK-NEXT:      sll $1, $[[R0]], 4
 
 

diff --git a/test/CodeGen/Mips/select.ll b/test/CodeGen/Mips/select.ll
index eb2198b..d6e1826 100644
--- a/test/CodeGen/Mips/select.ll
+++ b/test/CodeGen/Mips/select.ll

@@ -8,7 +8,7 @@
 @d2 = external global double
 @d3 = external global double
 
-define i32 @i32_icmp_ne_i32_val(i32 %s, i32 %f0, i32 %f1) nounwind readnone {
+define i32 @i32_icmp_ne_i32_val(i32 signext %s, i32 signext %f0, i32 signext %f1) nounwind readnone {
 entry:
 ; ALL-LABEL: i32_icmp_ne_i32_val:
 
@@ -37,7 +37,7 @@
   ret i32 %cond
 }
 
-define i64 @i32_icmp_ne_i64_val(i32 %s, i64 %f0, i64 %f1) nounwind readnone {
+define i64 @i32_icmp_ne_i64_val(i32 signext %s, i64 %f0, i64 %f1) nounwind readnone {
 entry:
 ; ALL-LABEL: i32_icmp_ne_i64_val:
 
@@ -128,7 +128,7 @@
   ret i64 %cond
 }
 
-define float @i32_icmp_ne_f32_val(i32 %s, float %f0, float %f1) nounwind readnone {
+define float @i32_icmp_ne_f32_val(i32 signext %s, float %f0, float %f1) nounwind readnone {
 entry:
 ; ALL-LABEL: i32_icmp_ne_f32_val:
 
@@ -161,7 +161,7 @@
   ret float %cond
 }
 
-define double @i32_icmp_ne_f64_val(i32 %s, double %f0, double %f1) nounwind readnone {
+define double @i32_icmp_ne_f64_val(i32 signext %s, double %f0, double %f1) nounwind readnone {
 entry:
 ; ALL-LABEL: i32_icmp_ne_f64_val:
 
@@ -496,7 +496,7 @@
   ret float %cond
 }
 
-define i32 @f32_fcmp_oeq_i32_val(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
+define i32 @f32_fcmp_oeq_i32_val(i32 signext %f0, i32 signext %f1, float %f2, float %f3) nounwind readnone {
 entry:
 ; ALL-LABEL: f32_fcmp_oeq_i32_val:
 
@@ -541,7 +541,7 @@
   ret i32 %cond
 }
 
-define i32 @f32_fcmp_olt_i32_val(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
+define i32 @f32_fcmp_olt_i32_val(i32 signext %f0, i32 signext %f1, float %f2, float %f3) nounwind readnone {
 entry:
 ; ALL-LABEL: f32_fcmp_olt_i32_val:
 
@@ -585,7 +585,7 @@
   ret i32 %cond
 }
 
-define i32 @f32_fcmp_ogt_i32_val(i32 %f0, i32 %f1, float %f2, float %f3) nounwind readnone {
+define i32 @f32_fcmp_ogt_i32_val(i32 signext %f0, i32 signext %f1, float %f2, float %f3) nounwind readnone {
 entry:
 ; ALL-LABEL: f32_fcmp_ogt_i32_val:
 
@@ -630,7 +630,7 @@
   ret i32 %cond
 }
 
-define i32 @f64_fcmp_oeq_i32_val(i32 %f0, i32 %f1) nounwind readonly {
+define i32 @f64_fcmp_oeq_i32_val(i32 signext %f0, i32 signext %f1) nounwind readonly {
 entry:
 ; ALL-LABEL: f64_fcmp_oeq_i32_val:
 
@@ -707,7 +707,7 @@
   ret i32 %cond
 }
 
-define i32 @f64_fcmp_olt_i32_val(i32 %f0, i32 %f1) nounwind readonly {
+define i32 @f64_fcmp_olt_i32_val(i32 signext %f0, i32 signext %f1) nounwind readonly {
 entry:
 ; ALL-LABEL: f64_fcmp_olt_i32_val:
 
@@ -784,7 +784,7 @@
   ret i32 %cond
 }
 
-define i32 @f64_fcmp_ogt_i32_val(i32 %f0, i32 %f1) nounwind readonly {
+define i32 @f64_fcmp_ogt_i32_val(i32 signext %f0, i32 signext %f1) nounwind readonly {
 entry:
 ; ALL-LABEL: f64_fcmp_ogt_i32_val:
 

diff --git a/test/CodeGen/Mips/seleq.ll b/test/CodeGen/Mips/seleq.ll
index 190baad..9af422f 100644
--- a/test/CodeGen/Mips/seleq.ll
+++ b/test/CodeGen/Mips/seleq.ll

@@ -10,7 +10,7 @@
 @z3 = common global i32 0, align 4
 @z4 = common global i32 0, align 4
 
-define void @calc_seleq() nounwind "target-cpu"="mips32" "target-features"="+o32,+mips32" {
+define void @calc_seleq() nounwind {
 entry:
   %0 = load i32* @a, align 4
   %1 = load i32* @b, align 4

diff --git a/test/CodeGen/Mips/small-section-reserve-gp.ll b/test/CodeGen/Mips/small-section-reserve-gp.ll
index 03503fb..cbf0681 100644
--- a/test/CodeGen/Mips/small-section-reserve-gp.ll
+++ b/test/CodeGen/Mips/small-section-reserve-gp.ll

@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=mipsel-sde-elf -march=mipsel -relocation-model=static < %s \
+; RUN: llc -mtriple=mipsel-sde-elf -march=mipsel -relocation-model=static -mattr=+noabicalls -mgpopt < %s \
 ; RUN: | FileCheck %s
 
 @i = internal unnamed_addr global i32 0, align 4

diff --git a/test/CodeGen/Mips/start-asm-file.ll b/test/CodeGen/Mips/start-asm-file.ll
index 8872464..9dc501c 100644
--- a/test/CodeGen/Mips/start-asm-file.ll
+++ b/test/CodeGen/Mips/start-asm-file.ll

@@ -1,7 +1,4 @@
 ; Check the emission of directives at the start of an asm file.
-; This test is XFAILED until we fix the emission of '.option pic0' on
-; N32. At the moment we check if subtarget is Mips64 when we should be
-; checking the Subtarget's ABI.
 
 ; ### O32 ABI ###
 ; RUN: llc -filetype=asm -mtriple mips-unknown-linux -mcpu=mips32 \

diff --git a/test/CodeGen/Mips/zeroreg.ll b/test/CodeGen/Mips/zeroreg.ll
index a1b6cb0..c766d3b 100644
--- a/test/CodeGen/Mips/zeroreg.ll
+++ b/test/CodeGen/Mips/zeroreg.ll

@@ -8,7 +8,7 @@
 
 @g1 = external global i32
 
-define i32 @sel_icmp_nez_i32_z0(i32 %s) nounwind readonly {
+define i32 @sel_icmp_nez_i32_z0(i32 signext %s) nounwind readonly {
 entry:
 ; ALL-LABEL: sel_icmp_nez_i32_z0:
 
@@ -30,7 +30,7 @@
   ret i32 %cond
 }
 
-define i32 @sel_icmp_nez_i32_z1(i32 %s) nounwind readonly {
+define i32 @sel_icmp_nez_i32_z1(i32 signext %s) nounwind readonly {
 entry:
 ; ALL-LABEL: sel_icmp_nez_i32_z1:
 

diff --git a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
index e474fa4..c167db4 100644
--- a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
+++ b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll

@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s
 
 ;; These tests should run for all targets
 
@@ -9,28 +9,28 @@
 ;;; f64
 
 define double @fadd_f64(double %a, double %b) {
-; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
 ; CHECK: ret
   %ret = fadd double %a, %b
   ret double %ret
 }
 
 define double @fsub_f64(double %a, double %b) {
-; CHECK: sub.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: sub.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
 ; CHECK: ret
   %ret = fsub double %a, %b
   ret double %ret
 }
 
 define double @fmul_f64(double %a, double %b) {
-; CHECK: mul.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: mul.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
 ; CHECK: ret
   %ret = fmul double %a, %b
   ret double %ret
 }
 
 define double @fdiv_f64(double %a, double %b) {
-; CHECK: div.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}
+; CHECK: div.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}
 ; CHECK: ret
   %ret = fdiv double %a, %b
   ret double %ret

diff --git a/test/CodeGen/NVPTX/arithmetic-int.ll b/test/CodeGen/NVPTX/arithmetic-int.ll
index 8d73b7e..b5a2872 100644
--- a/test/CodeGen/NVPTX/arithmetic-int.ll
+++ b/test/CodeGen/NVPTX/arithmetic-int.ll

@@ -9,70 +9,70 @@
 ;;; i64
 
 define i64 @add_i64(i64 %a, i64 %b) {
-; CHECK: add.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = add i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @sub_i64(i64 %a, i64 %b) {
-; CHECK: sub.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: sub.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = sub i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @mul_i64(i64 %a, i64 %b) {
-; CHECK: mul.lo.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: mul.lo.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = mul i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @sdiv_i64(i64 %a, i64 %b) {
-; CHECK: div.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: div.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = sdiv i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @udiv_i64(i64 %a, i64 %b) {
-; CHECK: div.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: div.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = udiv i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @srem_i64(i64 %a, i64 %b) {
-; CHECK: rem.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: rem.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = srem i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @urem_i64(i64 %a, i64 %b) {
-; CHECK: rem.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: rem.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = urem i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @and_i64(i64 %a, i64 %b) {
-; CHECK: and.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: and.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = and i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @or_i64(i64 %a, i64 %b) {
-; CHECK: or.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: or.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = or i64 %a, %b
   ret i64 %ret
 }
 
 define i64 @xor_i64(i64 %a, i64 %b) {
-; CHECK: xor.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: xor.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %ret = xor i64 %a, %b
   ret i64 %ret
@@ -80,7 +80,7 @@
 
 define i64 @shl_i64(i64 %a, i64 %b) {
 ; PTX requires 32-bit shift amount
-; CHECK: shl.b64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: shl.b64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = shl i64 %a, %b
   ret i64 %ret
@@ -88,7 +88,7 @@
 
 define i64 @ashr_i64(i64 %a, i64 %b) {
 ; PTX requires 32-bit shift amount
-; CHECK: shr.s64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: shr.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = ashr i64 %a, %b
   ret i64 %ret
@@ -96,7 +96,7 @@
 
 define i64 @lshr_i64(i64 %a, i64 %b) {
 ; PTX requires 32-bit shift amount
-; CHECK: shr.u64 %rl{{[0-9]+}}, %rl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: shr.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %ret = lshr i64 %a, %b
   ret i64 %ret

diff --git a/test/CodeGen/NVPTX/atomics.ll b/test/CodeGen/NVPTX/atomics.ll
index 10ab73d..daadb6e 100644
--- a/test/CodeGen/NVPTX/atomics.ll
+++ b/test/CodeGen/NVPTX/atomics.ll

@@ -1,21 +1,21 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 
 
-; CHECK: atom0
+; CHECK-LABEL: atom0
 define i32 @atom0(i32* %addr, i32 %val) {
 ; CHECK: atom.add.u32
   %ret = atomicrmw add i32* %addr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom1
+; CHECK-LABEL: atom1
 define i64 @atom1(i64* %addr, i64 %val) {
 ; CHECK: atom.add.u64
   %ret = atomicrmw add i64* %addr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom2
+; CHECK-LABEL: atom2
 define i32 @atom2(i32* %subr, i32 %val) {
 ; CHECK: neg.s32
 ; CHECK: atom.add.u32
@@ -23,7 +23,7 @@
   ret i32 %ret
 }
 
-; CHECK: atom3
+; CHECK-LABEL: atom3
 define i64 @atom3(i64* %subr, i64 %val) {
 ; CHECK: neg.s64
 ; CHECK: atom.add.u64
@@ -31,14 +31,14 @@
   ret i64 %ret
 }
 
-; CHECK: atom4
+; CHECK-LABEL: atom4
 define i32 @atom4(i32* %subr, i32 %val) {
 ; CHECK: atom.and.b32
   %ret = atomicrmw and i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom5
+; CHECK-LABEL: atom5
 define i64 @atom5(i64* %subr, i64 %val) {
 ; CHECK: atom.and.b64
   %ret = atomicrmw and i64* %subr, i64 %val seq_cst
@@ -56,86 +56,127 @@
 ;  ret i64 %ret
 ;}
 
-; CHECK: atom8
+; CHECK-LABEL: atom8
 define i32 @atom8(i32* %subr, i32 %val) {
 ; CHECK: atom.or.b32
   %ret = atomicrmw or i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom9
+; CHECK-LABEL: atom9
 define i64 @atom9(i64* %subr, i64 %val) {
 ; CHECK: atom.or.b64
   %ret = atomicrmw or i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom10
+; CHECK-LABEL: atom10
 define i32 @atom10(i32* %subr, i32 %val) {
 ; CHECK: atom.xor.b32
   %ret = atomicrmw xor i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom11
+; CHECK-LABEL: atom11
 define i64 @atom11(i64* %subr, i64 %val) {
 ; CHECK: atom.xor.b64
   %ret = atomicrmw xor i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom12
+; CHECK-LABEL: atom12
 define i32 @atom12(i32* %subr, i32 %val) {
 ; CHECK: atom.max.s32
   %ret = atomicrmw max i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom13
+; CHECK-LABEL: atom13
 define i64 @atom13(i64* %subr, i64 %val) {
 ; CHECK: atom.max.s64
   %ret = atomicrmw max i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom14
+; CHECK-LABEL: atom14
 define i32 @atom14(i32* %subr, i32 %val) {
 ; CHECK: atom.min.s32
   %ret = atomicrmw min i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom15
+; CHECK-LABEL: atom15
 define i64 @atom15(i64* %subr, i64 %val) {
 ; CHECK: atom.min.s64
   %ret = atomicrmw min i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom16
+; CHECK-LABEL: atom16
 define i32 @atom16(i32* %subr, i32 %val) {
 ; CHECK: atom.max.u32
   %ret = atomicrmw umax i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom17
+; CHECK-LABEL: atom17
 define i64 @atom17(i64* %subr, i64 %val) {
 ; CHECK: atom.max.u64
   %ret = atomicrmw umax i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
 
-; CHECK: atom18
+; CHECK-LABEL: atom18
 define i32 @atom18(i32* %subr, i32 %val) {
 ; CHECK: atom.min.u32
   %ret = atomicrmw umin i32* %subr, i32 %val seq_cst
   ret i32 %ret
 }
 
-; CHECK: atom19
+; CHECK-LABEL: atom19
 define i64 @atom19(i64* %subr, i64 %val) {
 ; CHECK: atom.min.u64
   %ret = atomicrmw umin i64* %subr, i64 %val seq_cst
   ret i64 %ret
 }
+
+declare float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val)
+
+; CHECK-LABEL: atomic_add_f32_generic
+define float @atomic_add_f32_generic(float* %addr, float %val) {
+; CHECK: atom.add.f32
+  %ret = call float @llvm.nvvm.atomic.load.add.f32.p0f32(float* %addr, float %val)
+  ret float %ret
+}
+
+declare float @llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val)
+
+; CHECK-LABEL: atomic_add_f32_addrspace1
+define float @atomic_add_f32_addrspace1(float addrspace(1)* %addr, float %val) {
+; CHECK: atom.global.add.f32
+  %ret = call float @llvm.nvvm.atomic.load.add.f32.p1f32(float addrspace(1)* %addr, float %val)
+  ret float %ret
+}
+
+declare float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val)
+
+; CHECK-LABEL: atomic_add_f32_addrspace3
+define float @atomic_add_f32_addrspace3(float addrspace(3)* %addr, float %val) {
+; CHECK: atom.shared.add.f32
+  %ret = call float @llvm.nvvm.atomic.load.add.f32.p3f32(float addrspace(3)* %addr, float %val)
+  ret float %ret
+}
+
+; CHECK-LABEL: atomic_cmpxchg_i32
+define i32 @atomic_cmpxchg_i32(i32* %addr, i32 %cmp, i32 %new) {
+; CHECK: atom.cas.b32
+  %pairold = cmpxchg i32* %addr, i32 %cmp, i32 %new seq_cst seq_cst
+  ret i32 %new
+}
+
+; CHECK-LABEL: atomic_cmpxchg_i64
+define i64 @atomic_cmpxchg_i64(i64* %addr, i64 %cmp, i64 %new) {
+; CHECK: atom.cas.b64
+  %pairold = cmpxchg i64* %addr, i64 %cmp, i64 %new seq_cst seq_cst
+  ret i64 %new
+}

diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll
new file mode 100644
index 0000000..157b28c
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug21465.ll

@@ -0,0 +1,24 @@
+; RUN: opt < %s -nvptx-lower-struct-args -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+%struct.S = type { i32, i32 }
+
+; Function Attrs: nounwind
+define void @_Z11TakesStruct1SPi(%struct.S* byval nocapture readonly %input, i32* nocapture %output) #0 {
+entry:
+; CHECK-LABEL: @_Z11TakesStruct1SPi
+; CHECK:   bitcast %struct.S* %input to i8*
+; CHECK:   call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8
+  %b = getelementptr inbounds %struct.S* %input, i64 0, i32 1
+  %0 = load i32* %b, align 4
+  store i32 %0, i32* %output, align 4
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!nvvm.annotations = !{!0}
+
+!0 = metadata !{void (%struct.S*, i32*)* @_Z11TakesStruct1SPi, metadata !"kernel", i32 1}

diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
index 28dfa46..83d4916 100644
--- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll

@@ -20,11 +20,11 @@
   %buf = alloca [16 x i8], align 4
 
 ; CHECK: .local .align 4 .b8 	__local_depot0[16]
-; CHECK: mov.u64 %rl[[BUF_REG:[0-9]+]]
-; CHECK: cvta.local.u64 %SP, %rl[[BUF_REG]]
+; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]]
+; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]]
 
-; CHECK: ld.param.u64 %rl[[A_REG:[0-9]+]], [kernel_func_param_0]
-; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rl[[A_REG]]]
+; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
+; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]]
 ; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
 
   %0 = load float* %a, align 4
@@ -46,11 +46,11 @@
   %7 = bitcast i8* %arrayidx7 to float*
   store float %6, float* %7, align 4
 
-; CHECK: add.u64 %rl[[SP_REG:[0-9]+]], %SP, 0
+; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
 ; CHECK:        .param .b64 param0;
-; CHECK-NEXT:   st.param.b64  [param0+0], %rl[[A_REG]]
+; CHECK-NEXT:   st.param.b64  [param0+0], %rd[[A_REG]]
 ; CHECK-NEXT:   .param .b64 param1;
-; CHECK-NEXT:   st.param.b64  [param1+0], %rl[[SP_REG]]
+; CHECK-NEXT:   st.param.b64  [param1+0], %rd[[SP_REG]]
 ; CHECK-NEXT:   call.uni
 ; CHECK-NEXT:   callee,
 

diff --git a/test/CodeGen/NVPTX/compare-int.ll b/test/CodeGen/NVPTX/compare-int.ll
index c595f21..e4e0601 100644
--- a/test/CodeGen/NVPTX/compare-int.ll
+++ b/test/CodeGen/NVPTX/compare-int.ll

@@ -9,8 +9,8 @@
 ;;; i64
 
 define i64 @icmp_eq_i64(i64 %a, i64 %b) {
-; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.eq.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp eq i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -18,8 +18,8 @@
 }
 
 define i64 @icmp_ne_i64(i64 %a, i64 %b) {
-; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.ne.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp ne i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -27,8 +27,8 @@
 }
 
 define i64 @icmp_ugt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp ugt i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -36,8 +36,8 @@
 }
 
 define i64 @icmp_uge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp uge i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -45,8 +45,8 @@
 }
 
 define i64 @icmp_ult_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp ult i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -54,8 +54,8 @@
 }
 
 define i64 @icmp_ule_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp ule i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -63,8 +63,8 @@
 }
 
 define i64 @icmp_sgt_i64(i64 %a, i64 %b) {
-; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp sgt i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -72,8 +72,8 @@
 }
 
 define i64 @icmp_sge_i64(i64 %a, i64 %b) {
-; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp sge i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -81,8 +81,8 @@
 }
 
 define i64 @icmp_slt_i64(i64 %a, i64 %b) {
-; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp slt i64 %a, %b
   %ret = zext i1 %cmp to i64
@@ -90,8 +90,8 @@
 }
 
 define i64 @icmp_sle_i64(i64 %a, i64 %b) {
-; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rl{{[0-9]+}}, %rl{{[0-9]+}}
-; CHECK: selp.u64 %rl{{[0-9]+}}, 1, 0, %p[[P0]]
+; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; CHECK: selp.u64 %rd{{[0-9]+}}, 1, 0, %p[[P0]]
 ; CHECK: ret
   %cmp = icmp sle i64 %a, %b
   %ret = zext i1 %cmp to i64

diff --git a/test/CodeGen/NVPTX/convert-fp.ll b/test/CodeGen/NVPTX/convert-fp.ll
index 1882121..4b5446e 100644
--- a/test/CodeGen/NVPTX/convert-fp.ll
+++ b/test/CodeGen/NVPTX/convert-fp.ll

@@ -10,7 +10,7 @@
 }
 
 define i16 @cvt_i16_f64(double %x) {
-; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rzi.u16.f64 %rs{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui double %x to i16
   ret i16 %a
@@ -24,7 +24,7 @@
 }
 
 define i32 @cvt_i32_f64(double %x) {
-; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rzi.u32.f64 %r{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui double %x to i32
   ret i32 %a
@@ -32,14 +32,14 @@
 
 
 define i64 @cvt_i64_f32(float %x) {
-; CHECK: cvt.rzi.u64.f32 %rl{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.rzi.u64.f32 %rd{{[0-9]+}}, %f{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui float %x to i64
   ret i64 %a
 }
 
 define i64 @cvt_i64_f64(double %x) {
-; CHECK: cvt.rzi.u64.f64 %rl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rzi.u64.f64 %rd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptoui double %x to i64
   ret i64 %a
@@ -60,14 +60,14 @@
 }
 
 define float @cvt_f32_i64(i64 %x) {
-; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rl{{[0-9]+}};
+; CHECK: cvt.rn.f32.u64 %f{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i64 %x to float
   ret float %a
 }
 
 define float @cvt_f32_f64(double %x) {
-; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: cvt.rn.f32.f64 %f{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fptrunc double %x to float
   ret float %a
@@ -88,56 +88,56 @@
 }
 
 define float @cvt_f32_s64(i64 %x) {
-; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: cvt.rn.f32.s64 %f{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i64 %x to float
   ret float %a
 }
 
 define double @cvt_f64_i16(i16 %x) {
-; CHECK: cvt.rn.f64.u16 %fl{{[0-9]+}}, %rs{{[0-9]+}};
+; CHECK: cvt.rn.f64.u16 %fd{{[0-9]+}}, %rs{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i16 %x to double
   ret double %a
 }
 
 define double @cvt_f64_i32(i32 %x) {
-; CHECK: cvt.rn.f64.u32 %fl{{[0-9]+}}, %r{{[0-9]+}};
+; CHECK: cvt.rn.f64.u32 %fd{{[0-9]+}}, %r{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i32 %x to double
   ret double %a
 }
 
 define double @cvt_f64_i64(i64 %x) {
-; CHECK: cvt.rn.f64.u64 %fl{{[0-9]+}}, %rl{{[0-9]+}};
+; CHECK: cvt.rn.f64.u64 %fd{{[0-9]+}}, %rd{{[0-9]+}};
 ; CHECK: ret;
   %a = uitofp i64 %x to double
   ret double %a
 }
 
 define double @cvt_f64_f32(float %x) {
-; CHECK: cvt.f64.f32 %fl{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK: cvt.f64.f32 %fd{{[0-9]+}}, %f{{[0-9]+}};
 ; CHECK: ret;
   %a = fpext float %x to double
   ret double %a
 }
 
 define double @cvt_f64_s16(i16 %x) {
-; CHECK: cvt.rn.f64.s16 %fl{{[0-9]+}}, %rs{{[0-9]+}}
+; CHECK: cvt.rn.f64.s16 %fd{{[0-9]+}}, %rs{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i16 %x to double
   ret double %a
 }
 
 define double @cvt_f64_s32(i32 %x) {
-; CHECK: cvt.rn.f64.s32 %fl{{[0-9]+}}, %r{{[0-9]+}}
+; CHECK: cvt.rn.f64.s32 %fd{{[0-9]+}}, %r{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i32 %x to double
   ret double %a
 }
 
 define double @cvt_f64_s64(i64 %x) {
-; CHECK: cvt.rn.f64.s64 %fl{{[0-9]+}}, %rl{{[0-9]+}}
+; CHECK: cvt.rn.f64.s64 %fd{{[0-9]+}}, %rd{{[0-9]+}}
 ; CHECK: ret
   %a = sitofp i64 %x to double
   ret double %a

diff --git a/test/CodeGen/NVPTX/convert-int-sm20.ll b/test/CodeGen/NVPTX/convert-int-sm20.ll
index 227cd31..57a2316 100644
--- a/test/CodeGen/NVPTX/convert-int-sm20.ll
+++ b/test/CodeGen/NVPTX/convert-int-sm20.ll

@@ -48,16 +48,16 @@
 ; i64
 
 define i64 @cvt_i64_i16(i16 %x) {
-; CHECK: ld.param.u16 %rl[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
-; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
+; CHECK: ld.param.u16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
+; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]]
 ; CHECK: ret
   %a = zext i16 %x to i64
   ret i64 %a
 }
 
 define i64 @cvt_i64_i32(i32 %x) {
-; CHECK: ld.param.u32 %rl[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
-; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rl[[R0]]
+; CHECK: ld.param.u32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
+; CHECK: st.param.b64 [func_retval{{[0-9]+}}+0], %rd[[R0]]
 ; CHECK: ret
   %a = zext i32 %x to i64
   ret i64 %a

diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll
index 4ef1a9a..14b5c45 100644
--- a/test/CodeGen/NVPTX/fma.ll
+++ b/test/CodeGen/NVPTX/fma.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
 
 define ptx_device float @t1_f32(float %x, float %y, float %z) {
 ; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
@@ -9,7 +9,7 @@
 }
 
 define ptx_device double @t1_f64(double %x, double %y, double %z) {
-; CHECK: fma.rn.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fmul double %x, %y
   %b = fadd double %a, %z

diff --git a/test/CodeGen/NVPTX/fp-contract.ll b/test/CodeGen/NVPTX/fp-contract.ll
new file mode 100644
index 0000000..3f68b18
--- /dev/null
+++ b/test/CodeGen/NVPTX/fp-contract.ll

@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
+
+target triple = "nvptx64-unknown-cuda"
+
+;; Make sure we are generating proper instruction sequences for fused ops
+;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
+;; add.f32 otherwise.  Without an explicit rounding mode on add.f32, ptxas
+;; is free to fuse with a multiply if it is able.  If fusion is not allowed,
+;; we do not form fma.rn at the PTX level and explicitly generate add.rn
+;; for all adds to prevent ptxas from fusing the ops.
+
+;; FAST-LABEL: @t0
+;; DEFAULT-LABEL: @t0
+define float @t0(float %a, float %b, float %c) {
+;; FAST: fma.rn.f32
+;; DEFAULT: mul.rn.f32
+;; DEFAULT: add.rn.f32
+  %v0 = fmul float %a, %b
+  %v1 = fadd float %v0, %c
+  ret float %v1
+}
+
+;; FAST-LABEL: @t1
+;; DEFAULT-LABEL: @t1
+define float @t1(float %a, float %b) {
+;; We cannot form an fma here, but make sure that, when fusion is not allowed,
+;; we explicitly emit add.rn.f32 to prevent ptxas from fusing this with anything else.
+;; FAST: add.f32
+;; DEFAULT: add.rn.f32
+  %v1 = fadd float %a, %b
+  ret float %v1
+}

diff --git a/test/CodeGen/NVPTX/fp-literals.ll b/test/CodeGen/NVPTX/fp-literals.ll
index 0cc2413..755e0f9 100644
--- a/test/CodeGen/NVPTX/fp-literals.ll
+++ b/test/CodeGen/NVPTX/fp-literals.ll

@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+
+target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
 ; Make sure we can properly differentiate between single-precision and
 ; double-precision FP literals.
@@ -11,7 +14,7 @@
 }
 
 ; CHECK: myaddd
-; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, 0d3FF0000000000000
+; CHECK: add.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, 0d3FF0000000000000
 define double @myaddd(double %a) {
   %ret = fadd double %a, 1.0
   ret double %ret

diff --git a/test/CodeGen/NVPTX/fp16.ll b/test/CodeGen/NVPTX/fp16.ll
new file mode 100644
index 0000000..8770399
--- /dev/null
+++ b/test/CodeGen/NVPTX/fp16.ll

@@ -0,0 +1,45 @@
+; RUN: llc -march=nvptx -verify-machineinstrs < %s | FileCheck %s
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone
+
+; CHECK-LABEL: @test_convert_fp16_to_fp32
+; CHECK: cvt.f32.f16
+define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+  %val = load i16 addrspace(1)* %in, align 2
+  %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
+  store float %cvt, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; CHECK-LABEL: @test_convert_fp16_to_fp64
+; CHECK: cvt.f64.f16
+define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+  %val = load i16 addrspace(1)* %in, align 2
+  %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
+  store double %cvt, double addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; CHECK-LABEL: @test_convert_fp32_to_fp16
+; CHECK: cvt.rn.f16.f32
+define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %val = load float addrspace(1)* %in, align 2
+  %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
+  store i16 %cvt, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; CHECK-LABEL: @test_convert_fp64_to_fp16
+; CHECK: cvt.rn.f16.f64
+define void @test_convert_fp64_to_fp16(i16 addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+  %val = load double addrspace(1)* %in, align 2
+  %cvt = call i16 @llvm.convert.to.fp16.f64(double %val) nounwind readnone
+  store i16 %cvt, i16 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/NVPTX/half.ll b/test/CodeGen/NVPTX/half.ll
new file mode 100644
index 0000000..aa08cc7
--- /dev/null
+++ b/test/CodeGen/NVPTX/half.ll

@@ -0,0 +1,70 @@
+; RUN: llc < %s -march=nvptx | FileCheck %s
+
+define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: @test_load_store
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+  %val = load half addrspace(1)* %in
+  store half %val, half addrspace(1) * %out
+  ret void
+}
+
+define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) {
+; CHECK-LABEL: @test_bitcast_from_half
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+  %val = load half addrspace(1) * %in
+  %val_int = bitcast half %val to i16
+  store i16 %val_int, i16 addrspace(1)* %out
+  ret void
+}
+
+define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+; CHECK-LABEL: @test_bitcast_to_half
+; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
+; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
+  %val = load i16 addrspace(1)* %in
+  %val_fp = bitcast i16 %val to half
+  store half %val_fp, half addrspace(1)* %out
+  ret void
+}
+
+define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) {
+; CHECK-LABEL: @test_extend32
+; CHECK: cvt.f32.f16
+
+  %val16 = load half addrspace(1)* %in
+  %val32 = fpext half %val16 to float
+  store float %val32, float addrspace(1)* %out
+  ret void
+}
+
+define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) {
+; CHECK-LABEL: @test_extend64
+; CHECK: cvt.f64.f16
+
+  %val16 = load half addrspace(1)* %in
+  %val64 = fpext half %val16 to double
+  store double %val64, double addrspace(1)* %out
+  ret void
+}
+
+define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: test_trunc32
+; CHECK: cvt.rn.f16.f32
+
+  %val32 = load float addrspace(1)* %in
+  %val16 = fptrunc float %val32 to half
+  store half %val16, half addrspace(1)* %out
+  ret void
+}
+
+define void @test_trunc64(double addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: @test_trunc64
+; CHECK: cvt.rn.f16.f64
+
+  %val32 = load double addrspace(1)* %in
+  %val16 = fptrunc double %val32 to half
+  store half %val16, half addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/NVPTX/implicit-def.ll b/test/CodeGen/NVPTX/implicit-def.ll
index 06d3d56..2d2c6e5 100644
--- a/test/CodeGen/NVPTX/implicit-def.ll
+++ b/test/CodeGen/NVPTX/implicit-def.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s
 
 ; CHECK: // implicit-def: %f[[F0:[0-9]+]]
-; CHECK: add.f32         %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
+; CHECK: add.rn.f32         %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
 define float @foo(float %a) {
   %ret = fadd float %a, undef
   ret float %ret

diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll
index af91bb4..3c51776 100644
--- a/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/test/CodeGen/NVPTX/intrinsic-old.ll

@@ -198,7 +198,7 @@
 }
 
 define ptx_device i64 @test_clock64() {
-; CHECK: mov.u64 %rl{{[0-9]+}}, %clock64;
+; CHECK: mov.u64 %rd{{[0-9]+}}, %clock64;
 ; CHECK: ret;
 	%x = call i64 @llvm.ptx.read.clock64()
 	ret i64 %x

diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll
index 78e1e77..34b671d 100644
--- a/test/CodeGen/NVPTX/intrinsics.ll
+++ b/test/CodeGen/NVPTX/intrinsics.ll

@@ -9,7 +9,7 @@
 }
 
 define ptx_device double @test_fabs(double %d) {
-; CHECK: abs.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}};
+; CHECK: abs.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
 	%x = call double @llvm.fabs.f64(double %d)
 	ret double %x

diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll
index 133ef09..f33659c 100644
--- a/test/CodeGen/NVPTX/ld-addrspace.ll
+++ b/test/CodeGen/NVPTX/ld-addrspace.ll

@@ -6,7 +6,7 @@
 define i8 @ld_global_i8(i8 addrspace(1)* %ptr) {
 ; PTX32: ld.global.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i8 addrspace(1)* %ptr
   ret i8 %a
@@ -15,7 +15,7 @@
 define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) {
 ; PTX32: ld.shared.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i8 addrspace(3)* %ptr
   ret i8 %a
@@ -24,7 +24,7 @@
 define i8 @ld_local_i8(i8 addrspace(5)* %ptr) {
 ; PTX32: ld.local.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i8 addrspace(5)* %ptr
   ret i8 %a
@@ -34,7 +34,7 @@
 define i16 @ld_global_i16(i16 addrspace(1)* %ptr) {
 ; PTX32: ld.global.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i16 addrspace(1)* %ptr
   ret i16 %a
@@ -43,7 +43,7 @@
 define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) {
 ; PTX32: ld.shared.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i16 addrspace(3)* %ptr
   ret i16 %a
@@ -52,7 +52,7 @@
 define i16 @ld_local_i16(i16 addrspace(5)* %ptr) {
 ; PTX32: ld.local.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i16 addrspace(5)* %ptr
   ret i16 %a
@@ -62,7 +62,7 @@
 define i32 @ld_global_i32(i32 addrspace(1)* %ptr) {
 ; PTX32: ld.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32 addrspace(1)* %ptr
   ret i32 %a
@@ -71,7 +71,7 @@
 define i32 @ld_shared_i32(i32 addrspace(3)* %ptr) {
 ; PTX32: ld.shared.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32 addrspace(3)* %ptr
   ret i32 %a
@@ -80,7 +80,7 @@
 define i32 @ld_local_i32(i32 addrspace(5)* %ptr) {
 ; PTX32: ld.local.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32 addrspace(5)* %ptr
   ret i32 %a
@@ -88,27 +88,27 @@
 
 ;; i64
 define i64 @ld_global_i64(i64 addrspace(1)* %ptr) {
-; PTX32: ld.global.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i64 addrspace(1)* %ptr
   ret i64 %a
 }
 
 define i64 @ld_shared_i64(i64 addrspace(3)* %ptr) {
-; PTX32: ld.shared.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.shared.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i64 addrspace(3)* %ptr
   ret i64 %a
 }
 
 define i64 @ld_local_i64(i64 addrspace(5)* %ptr) {
-; PTX32: ld.local.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.local.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i64 addrspace(5)* %ptr
   ret i64 %a
@@ -118,7 +118,7 @@
 define float @ld_global_f32(float addrspace(1)* %ptr) {
 ; PTX32: ld.global.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load float addrspace(1)* %ptr
   ret float %a
@@ -127,7 +127,7 @@
 define float @ld_shared_f32(float addrspace(3)* %ptr) {
 ; PTX32: ld.shared.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load float addrspace(3)* %ptr
   ret float %a
@@ -136,7 +136,7 @@
 define float @ld_local_f32(float addrspace(5)* %ptr) {
 ; PTX32: ld.local.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load float addrspace(5)* %ptr
   ret float %a
@@ -144,27 +144,27 @@
 
 ;; f64
 define double @ld_global_f64(double addrspace(1)* %ptr) {
-; PTX32: ld.global.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.global.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.global.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load double addrspace(1)* %ptr
   ret double %a
 }
 
 define double @ld_shared_f64(double addrspace(3)* %ptr) {
-; PTX32: ld.shared.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.shared.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.shared.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load double addrspace(3)* %ptr
   ret double %a
 }
 
 define double @ld_local_f64(double addrspace(5)* %ptr) {
-; PTX32: ld.local.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.local.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.local.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load double addrspace(5)* %ptr
   ret double %a

diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll
index 3728268..d629e0e 100644
--- a/test/CodeGen/NVPTX/ld-generic.ll
+++ b/test/CodeGen/NVPTX/ld-generic.ll

@@ -6,7 +6,7 @@
 define i8 @ld_global_i8(i8 addrspace(0)* %ptr) {
 ; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i8 addrspace(0)* %ptr
   ret i8 %a
@@ -16,7 +16,7 @@
 define i16 @ld_global_i16(i16 addrspace(0)* %ptr) {
 ; PTX32: ld.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i16 addrspace(0)* %ptr
   ret i16 %a
@@ -26,7 +26,7 @@
 define i32 @ld_global_i32(i32 addrspace(0)* %ptr) {
 ; PTX32: ld.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u32 %r{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i32 addrspace(0)* %ptr
   ret i32 %a
@@ -34,9 +34,9 @@
 
 ;; i64
 define i64 @ld_global_i64(i64 addrspace(0)* %ptr) {
-; PTX32: ld.u64 %rl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.u64 %rl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load i64 addrspace(0)* %ptr
   ret i64 %a
@@ -46,7 +46,7 @@
 define float @ld_global_f32(float addrspace(0)* %ptr) {
 ; PTX32: ld.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.f32 %f{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load float addrspace(0)* %ptr
   ret float %a
@@ -54,9 +54,9 @@
 
 ;; f64
 define double @ld_global_f64(double addrspace(0)* %ptr) {
-; PTX32: ld.f64 %fl{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: ld.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: ret
-; PTX64: ld.f64 %fl{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: ret
   %a = load double addrspace(0)* %ptr
   ret double %a

diff --git a/test/CodeGen/NVPTX/ldu-i8.ll b/test/CodeGen/NVPTX/ldu-i8.ll
index 9cc6675..36c99b3 100644
--- a/test/CodeGen/NVPTX/ldu-i8.ll
+++ b/test/CodeGen/NVPTX/ldu-i8.ll

@@ -2,15 +2,13 @@
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
-declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*)
+declare i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8*, i32)
 
 define i8 @foo(i8* %a) {
 ; Ensure we properly truncate off the high-order 24 bits
 ; CHECK:        ldu.global.u8
 ; CHECK:        cvt.u32.u16
 ; CHECK:        and.b32         %r{{[0-9]+}}, %r{{[0-9]+}}, 255
-  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a), !align !0
+  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0i8(i8* %a, i32 4)
   ret i8 %val
 }
-
-!0 = metadata !{i32 4}

diff --git a/test/CodeGen/NVPTX/ldu-ldg.ll b/test/CodeGen/NVPTX/ldu-ldg.ll
index 3b0619f..4bfd68c 100644
--- a/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/test/CodeGen/NVPTX/ldu-ldg.ll

@@ -1,40 +1,36 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 
 
-declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
-declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
-declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr)
-declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr)
+declare i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
+declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
+declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
 
 
 ; CHECK: func0
 define i8 @func0(i8 addrspace(1)* %ptr) {
 ; ldu.global.u8
-  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+  %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
   ret i8 %val
 }
 
 ; CHECK: func1
 define i32 @func1(i32 addrspace(1)* %ptr) {
 ; ldu.global.u32
-  %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+  %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
   ret i32 %val
 }
 
 ; CHECK: func2
 define i8 @func2(i8 addrspace(1)* %ptr) {
 ; ld.global.nc.u8
-  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr), !align !0
+  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
   ret i8 %val
 }
 
 ; CHECK: func3
 define i32 @func3(i32 addrspace(1)* %ptr) {
 ; ld.global.nc.u32
-  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr), !align !0
+  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
   ret i32 %val
 }
-
-
-
-!0 = metadata !{i32 4}

diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
index 55707ea..fd35a75 100644
--- a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll
+++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll

@@ -7,15 +7,13 @@
 ; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%r{{[0-9]+}}+32];
 ; CHECK:        ldu.global.u32  %r{{[0-9]+}}, [%r{{[0-9]+}}+36];
   %p2 = getelementptr i32* %a, i32 8
-  %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2), !align !1
+  %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p2, i32 4)
   %p3 = getelementptr i32* %a, i32 9
-  %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3), !align !1
+  %t2 = call i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32* %p3, i32 4)
   %t3 = mul i32 %t1, %t2
   store i32 %t3, i32* %a
   ret void
 }
 
-!1 = metadata !{ i32 4 }
-
-declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*)
+declare i32 @llvm.nvvm.ldu.global.i.i32.p0i32(i32*, i32)
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()

diff --git a/test/CodeGen/NVPTX/local-stack-frame.ll b/test/CodeGen/NVPTX/local-stack-frame.ll
index c0d7d1c..377eee9 100644
--- a/test/CodeGen/NVPTX/local-stack-frame.ll
+++ b/test/CodeGen/NVPTX/local-stack-frame.ll

@@ -7,8 +7,8 @@
 ; PTX32:        cvta.local.u32   %SP, %r{{[0-9]+}};
 ; PTX32:        ld.param.u32     %r{{[0-9]+}}, [foo_param_0];
 ; PTX32:        st.volatile.u32  [%SP+0], %r{{[0-9]+}};
-; PTX64:        mov.u64          %rl{{[0-9]+}}, __local_depot{{[0-9]+}};
-; PTX64:        cvta.local.u64   %SP, %rl{{[0-9]+}};
+; PTX64:        mov.u64          %rd{{[0-9]+}}, __local_depot{{[0-9]+}};
+; PTX64:        cvta.local.u64   %SP, %rd{{[0-9]+}};
 ; PTX64:        ld.param.u32     %r{{[0-9]+}}, [foo_param_0];
 ; PTX64:        st.volatile.u32  [%SP+0], %r{{[0-9]+}};
 define void @foo(i32 %a) {

diff --git a/test/CodeGen/NVPTX/machine-sink.ll b/test/CodeGen/NVPTX/machine-sink.ll
new file mode 100644
index 0000000..3614bea
--- /dev/null
+++ b/test/CodeGen/NVPTX/machine-sink.ll

@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+@scalar1 = internal addrspace(3) global float 0.000000e+00, align 4
+@scalar2 = internal addrspace(3) global float 0.000000e+00, align 4
+
+; We shouldn't sink mul.rn.f32 to BB %merge because BB %merge post-dominates
+; BB %entry. Over-sinking created more register pressure on this example. The
+; backend would sink the fmuls to BB %merge, but not the loads, because it is
+; conservative about sinking memory accesses. As a result, the loads and
+; the two fmuls would be separated to two basic blocks, causing two
+; cross-BB live ranges.
+define float @post_dominate(float %x, i1 %cond) {
+; CHECK-LABEL: post_dominate(
+entry:
+  %0 = load float* addrspacecast (float addrspace(3)* @scalar1 to float*), align 4
+  %1 = load float* addrspacecast (float addrspace(3)* @scalar2 to float*), align 4
+; CHECK: ld.shared.f32
+; CHECK: ld.shared.f32
+  %2 = fmul float %0, %0
+  %3 = fmul float %1, %2
+; CHECK-NOT: bra
+; CHECK: mul.rn.f32
+; CHECK: mul.rn.f32
+  br i1 %cond, label %then, label %merge
+
+then:
+  %z = fadd float %x, %x
+  br label %then2
+
+then2:
+  %z2 = fadd float %z, %z
+  br label %merge
+
+merge:
+  %y = phi float [ 0.0, %entry ], [ %z2, %then2 ]
+  %w = fadd float %y, %3
+  ret float %w
+}

diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
new file mode 100644
index 0000000..90c9c43
--- /dev/null
+++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll

@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: t1
+define <4 x float> @t1(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK-NOT: ld.f32
+; CHECK: ld.u8
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 1
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t2
+define <4 x float> @t2(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK: ld.f32
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 4
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t3
+define <4 x float> @t3(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK: ld.v2
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 8
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t4
+define <4 x float> @t4(i8* %p1) {
+; CHECK: ld.v4
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 16
+  ret <4 x float> %r
+}
+
+
+; CHECK-LABEL: s1
+define void @s1(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK-NOT: st.f32
+; CHECK: st.u8
+  store <4 x float> %v, <4 x float>* %p1, align 1
+  ret void
+}
+
+; CHECK-LABEL: s2
+define void @s2(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK: st.f32
+  store <4 x float> %v, <4 x float>* %p1, align 4
+  ret void
+}
+
+; CHECK-LABEL: s3
+define void @s3(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+  store <4 x float> %v, <4 x float>* %p1, align 8
+  ret void
+}
+
+; CHECK-LABEL: s4
+define void @s4(<4 x float>* %p1, <4 x float> %v) {
+; CHECK: st.v4
+  store <4 x float> %v, <4 x float>* %p1, align 16
+  ret void
+}
+

diff --git a/test/CodeGen/NVPTX/mulwide.ll b/test/CodeGen/NVPTX/mulwide.ll
index 927946c..1ddf973 100644
--- a/test/CodeGen/NVPTX/mulwide.ll
+++ b/test/CodeGen/NVPTX/mulwide.ll

@@ -1,37 +1,90 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT
 
-; CHECK: mulwide16
+; OPT-LABEL: @mulwide16
+; NOOPT-LABEL: @mulwide16
 define i32 @mulwide16(i16 %a, i16 %b) {
-; CHECK: mul.wide.s16
+; OPT: mul.wide.s16
+; NOOPT: mul.lo.s32
   %val0 = sext i16 %a to i32
   %val1 = sext i16 %b to i32
   %val2 = mul i32 %val0, %val1
   ret i32 %val2
 }
 
-; CHECK: mulwideu16
+; OPT-LABEL: @mulwideu16
+; NOOPT-LABEL: @mulwideu16
 define i32 @mulwideu16(i16 %a, i16 %b) {
-; CHECK: mul.wide.u16
+; OPT: mul.wide.u16
+; NOOPT: mul.lo.s32
   %val0 = zext i16 %a to i32
   %val1 = zext i16 %b to i32
   %val2 = mul i32 %val0, %val1
   ret i32 %val2
 }
 
-; CHECK: mulwide32
+; OPT-LABEL: @mulwide8
+; NOOPT-LABEL: @mulwide8
+define i32 @mulwide8(i8 %a, i8 %b) {
+; OPT: mul.wide.s16
+; NOOPT: mul.lo.s32
+  %val0 = sext i8 %a to i32
+  %val1 = sext i8 %b to i32
+  %val2 = mul i32 %val0, %val1
+  ret i32 %val2
+}
+
+; OPT-LABEL: @mulwideu8
+; NOOPT-LABEL: @mulwideu8
+define i32 @mulwideu8(i8 %a, i8 %b) {
+; OPT: mul.wide.u16
+; NOOPT: mul.lo.s32
+  %val0 = zext i8 %a to i32
+  %val1 = zext i8 %b to i32
+  %val2 = mul i32 %val0, %val1
+  ret i32 %val2
+}
+
+; OPT-LABEL: @mulwide32
+; NOOPT-LABEL: @mulwide32
 define i64 @mulwide32(i32 %a, i32 %b) {
-; CHECK: mul.wide.s32
+; OPT: mul.wide.s32
+; NOOPT: mul.lo.s64
   %val0 = sext i32 %a to i64
   %val1 = sext i32 %b to i64
   %val2 = mul i64 %val0, %val1
   ret i64 %val2
 }
 
-; CHECK: mulwideu32
+; OPT-LABEL: @mulwideu32
+; NOOPT-LABEL: @mulwideu32
 define i64 @mulwideu32(i32 %a, i32 %b) {
-; CHECK: mul.wide.u32
+; OPT: mul.wide.u32
+; NOOPT: mul.lo.s64
   %val0 = zext i32 %a to i64
   %val1 = zext i32 %b to i64
   %val2 = mul i64 %val0, %val1
   ret i64 %val2
 }
+
+; OPT-LABEL: @mulwideu7
+; NOOPT-LABEL: @mulwideu7
+define i64 @mulwideu7(i7 %a, i7 %b) {
+; OPT: mul.wide.u32
+; NOOPT: mul.lo.s64
+  %val0 = zext i7 %a to i64
+  %val1 = zext i7 %b to i64
+  %val2 = mul i64 %val0, %val1
+  ret i64 %val2
+}
+
+; OPT-LABEL: @mulwides7
+; NOOPT-LABEL: @mulwides7
+define i64 @mulwides7(i7 %a, i7 %b) {
+; OPT: mul.wide.s32
+; NOOPT: mul.lo.s64
+  %val0 = sext i7 %a to i64
+  %val1 = sext i7 %b to i64
+  %val2 = mul i64 %val0, %val1
+  ret i64 %val2
+}

diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll
index e7a81be..cc67a6f 100644
--- a/test/CodeGen/NVPTX/pr13291-i1-store.ll
+++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll

@@ -5,7 +5,7 @@
 ; PTX32:      mov.u16 %rs{{[0-9]+}}, 0;
 ; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}};
 ; PTX64:      mov.u16 %rs{{[0-9]+}}, 0;
-; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}};
+; PTX64-NEXT: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}};
   store i1 false, i1* %a
   ret void
 }
@@ -15,7 +15,7 @@
 ; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}]
 ; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX32: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
-; PTX64: ld.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
 ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 ; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1;
 

diff --git a/test/CodeGen/NVPTX/st-addrspace.ll b/test/CodeGen/NVPTX/st-addrspace.ll
index 68c09fe..34a83f3 100644
--- a/test/CodeGen/NVPTX/st-addrspace.ll
+++ b/test/CodeGen/NVPTX/st-addrspace.ll

@@ -7,7 +7,7 @@
 define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) {
 ; PTX32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i8 %a, i8 addrspace(1)* %ptr
   ret void
@@ -16,7 +16,7 @@
 define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) {
 ; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i8 %a, i8 addrspace(3)* %ptr
   ret void
@@ -25,7 +25,7 @@
 define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) {
 ; PTX32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i8 %a, i8 addrspace(5)* %ptr
   ret void
@@ -36,7 +36,7 @@
 define void @st_global_i16(i16 addrspace(1)* %ptr, i16 %a) {
 ; PTX32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i16 %a, i16 addrspace(1)* %ptr
   ret void
@@ -45,7 +45,7 @@
 define void @st_shared_i16(i16 addrspace(3)* %ptr, i16 %a) {
 ; PTX32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i16 %a, i16 addrspace(3)* %ptr
   ret void
@@ -54,7 +54,7 @@
 define void @st_local_i16(i16 addrspace(5)* %ptr, i16 %a) {
 ; PTX32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i16 %a, i16 addrspace(5)* %ptr
   ret void
@@ -65,7 +65,7 @@
 define void @st_global_i32(i32 addrspace(1)* %ptr, i32 %a) {
 ; PTX32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, i32 addrspace(1)* %ptr
   ret void
@@ -74,7 +74,7 @@
 define void @st_shared_i32(i32 addrspace(3)* %ptr, i32 %a) {
 ; PTX32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, i32 addrspace(3)* %ptr
   ret void
@@ -83,7 +83,7 @@
 define void @st_local_i32(i32 addrspace(5)* %ptr, i32 %a) {
 ; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, i32 addrspace(5)* %ptr
   ret void
@@ -92,27 +92,27 @@
 ;; i64
 
 define void @st_global_i64(i64 addrspace(1)* %ptr, i64 %a) {
-; PTX32: st.global.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store i64 %a, i64 addrspace(1)* %ptr
   ret void
 }
 
 define void @st_shared_i64(i64 addrspace(3)* %ptr, i64 %a) {
-; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.shared.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store i64 %a, i64 addrspace(3)* %ptr
   ret void
 }
 
 define void @st_local_i64(i64 addrspace(5)* %ptr, i64 %a) {
-; PTX32: st.local.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.local.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store i64 %a, i64 addrspace(5)* %ptr
   ret void
@@ -123,7 +123,7 @@
 define void @st_global_f32(float addrspace(1)* %ptr, float %a) {
 ; PTX32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX64: ret
   store float %a, float addrspace(1)* %ptr
   ret void
@@ -132,7 +132,7 @@
 define void @st_shared_f32(float addrspace(3)* %ptr, float %a) {
 ; PTX32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX64: ret
   store float %a, float addrspace(3)* %ptr
   ret void
@@ -141,7 +141,7 @@
 define void @st_local_f32(float addrspace(5)* %ptr, float %a) {
 ; PTX32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX64: ret
   store float %a, float addrspace(5)* %ptr
   ret void
@@ -150,27 +150,27 @@
 ;; f64
 
 define void @st_global_f64(double addrspace(1)* %ptr, double %a) {
-; PTX32: st.global.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.global.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX64: ret
   store double %a, double addrspace(1)* %ptr
   ret void
 }
 
 define void @st_shared_f64(double addrspace(3)* %ptr, double %a) {
-; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.shared.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.shared.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX64: ret
   store double %a, double addrspace(3)* %ptr
   ret void
 }
 
 define void @st_local_f64(double addrspace(5)* %ptr, double %a) {
-; PTX32: st.local.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.local.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.local.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX64: ret
   store double %a, double addrspace(5)* %ptr
   ret void

diff --git a/test/CodeGen/NVPTX/st-generic.ll b/test/CodeGen/NVPTX/st-generic.ll
index b9c616f..022f7ab 100644
--- a/test/CodeGen/NVPTX/st-generic.ll
+++ b/test/CodeGen/NVPTX/st-generic.ll

@@ -7,7 +7,7 @@
 define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) {
 ; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i8 %a, i8 addrspace(0)* %ptr
   ret void
@@ -18,7 +18,7 @@
 define void @st_global_i16(i16 addrspace(0)* %ptr, i16 %a) {
 ; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u16 [%rl{{[0-9]+}}], %rs{{[0-9]+}}
+; PTX64: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
 ; PTX64: ret
   store i16 %a, i16 addrspace(0)* %ptr
   ret void
@@ -29,7 +29,7 @@
 define void @st_global_i32(i32 addrspace(0)* %ptr, i32 %a) {
 ; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u32 [%rl{{[0-9]+}}], %r{{[0-9]+}}
+; PTX64: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
 ; PTX64: ret
   store i32 %a, i32 addrspace(0)* %ptr
   ret void
@@ -38,9 +38,9 @@
 ;; i64
 
 define void @st_global_i64(i64 addrspace(0)* %ptr, i64 %a) {
-; PTX32: st.u64 [%r{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX32: st.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.u64 [%rl{{[0-9]+}}], %rl{{[0-9]+}}
+; PTX64: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
 ; PTX64: ret
   store i64 %a, i64 addrspace(0)* %ptr
   ret void
@@ -51,7 +51,7 @@
 define void @st_global_f32(float addrspace(0)* %ptr, float %a) {
 ; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.f32 [%rl{{[0-9]+}}], %f{{[0-9]+}}
+; PTX64: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 ; PTX64: ret
   store float %a, float addrspace(0)* %ptr
   ret void
@@ -60,9 +60,9 @@
 ;; f64
 
 define void @st_global_f64(double addrspace(0)* %ptr, double %a) {
-; PTX32: st.f64 [%r{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX32: st.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX32: ret
-; PTX64: st.f64 [%rl{{[0-9]+}}], %fl{{[0-9]+}}
+; PTX64: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 ; PTX64: ret
   store double %a, double addrspace(0)* %ptr
   ret void

diff --git a/test/CodeGen/NVPTX/surf-read-cuda.ll b/test/CodeGen/NVPTX/surf-read-cuda.ll
new file mode 100644
index 0000000..10a1ecc
--- /dev/null
+++ b/test/CodeGen/NVPTX/surf-read-cuda.ll

@@ -0,0 +1,53 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+declare i32 @llvm.nvvm.suld.1d.i32.trap(i64, i32)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+
+; SM20-LABEL: .entry foo
+; SM30-LABEL: .entry foo
+define void @foo(i64 %img, float* %red, i32 %idx) {
+; SM20: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}]
+; SM30: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFREG]], {%r{{[0-9]+}}}]
+  %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx)
+; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+  %ret = sitofp i32 %val to float
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+  store float %ret, float* %red
+  ret void
+}
+
+@surf0 = internal addrspace(1) global i64 0, align 8
+
+; SM20-LABEL: .entry bar
+; SM30-LABEL: .entry bar
+define void @bar(float* %red, i32 %idx) {
+; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0
+  %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [surf0, {%r{{[0-9]+}}}]
+; SM30: suld.b.1d.b32.trap {%r[[RED:[0-9]+]]}, [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}]
+  %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx)
+; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]]
+  %ret = sitofp i32 %val to float
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]]
+  store float %ret, float* %red
+  ret void
+}
+
+
+
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
+!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+

diff --git a/test/CodeGen/NVPTX/surf-write-cuda.ll b/test/CodeGen/NVPTX/surf-write-cuda.ll
new file mode 100644
index 0000000..654c47f
--- /dev/null
+++ b/test/CodeGen/NVPTX/surf-write-cuda.ll

@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+declare void @llvm.nvvm.sust.b.1d.i32.trap(i64, i32, i32)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+
+; SM20-LABEL: .entry foo
+; SM30-LABEL: .entry foo
+define void @foo(i64 %img, i32 %val, i32 %idx) {
+; SM20: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM20: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+; SM30: ld.param.u64    %rd[[SURFREG:[0-9]+]], [foo_param_0];
+; SM30: sust.b.1d.b32.trap [%rd[[SURFREG]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+  tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val)
+  ret void
+}
+
+
+@surf0 = internal addrspace(1) global i64 0, align 8
+
+
+
+; SM20-LABEL: .entry bar
+; SM30-LABEL: .entry bar
+define void @bar(i32 %val, i32 %idx) {
+; SM30: mov.u64 %rd[[SURFHANDLE:[0-9]+]], surf0
+  %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: sust.b.1d.b32.trap [surf0, {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+; SM30: sust.b.1d.b32.trap [%rd[[SURFHANDLE]], {%r{{[0-9]+}}}], {%r{{[0-9]+}}}
+  tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %surfHandle, i32 %idx, i32 %val)
+  ret void
+}
+
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, i32, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (i32, i32)* @bar, metadata !"kernel", i32 1}
+!3 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}
+

diff --git a/test/CodeGen/NVPTX/tex-read-cuda.ll b/test/CodeGen/NVPTX/tex-read-cuda.ll
new file mode 100644
index 0000000..ee0cefa
--- /dev/null
+++ b/test/CodeGen/NVPTX/tex-read-cuda.ll

@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+
+target triple = "nvptx-unknown-cuda"
+
+declare { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64, i32)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+; SM20-LABEL: .entry foo
+; SM30-LABEL: .entry foo
+define void @foo(i64 %img, float* %red, i32 %idx) {
+; SM20: ld.param.u64    %rd[[TEXREG:[0-9]+]], [foo_param_0];
+; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}]
+; SM30: ld.param.u64    %rd[[TEXREG:[0-9]+]], [foo_param_0];
+; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}]
+  %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx)
+  %ret = extractvalue { float, float, float, float } %val, 0
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+  store float %ret, float* %red
+  ret void
+}
+
+
+@tex0 = internal addrspace(1) global i64 0, align 8
+
+; SM20-LABEL: .entry bar
+; SM30-LABEL: .entry bar
+define void @bar(float* %red, i32 %idx) {
+; SM30: mov.u64 %rd[[TEXHANDLE:[0-9]+]], tex0
+  %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
+; SM20: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [tex0, {%r{{[0-9]+}}}]
+; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}]
+  %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx)
+  %ret = extractvalue { float, float, float, float } %val, 0
+; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]]
+  store float %ret, float* %red
+  ret void
+}
+
+!nvvm.annotations = !{!1, !2, !3}
+!1 = metadata !{void (i64, float*, i32)* @foo, metadata !"kernel", i32 1}
+!2 = metadata !{void (float*, i32)* @bar, metadata !"kernel", i32 1}
+!3 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}

diff --git a/test/CodeGen/NVPTX/tex-read.ll b/test/CodeGen/NVPTX/tex-read.ll
index 291060b..55e4bfc 100644
--- a/test/CodeGen/NVPTX/tex-read.ll
+++ b/test/CodeGen/NVPTX/tex-read.ll

@@ -2,12 +2,12 @@
 
 target triple = "nvptx-unknown-nvcl"
 
-declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64, i64, i32)
+declare { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64, i64, i32)
 
 ; CHECK: .entry foo
 define void @foo(i64 %img, i64 %sampler, float* %red, i32 %idx) {
 ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}]
-  %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.i32(i64 %img, i64 %sampler, i32 %idx)
+  %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx)
   %ret = extractvalue { float, float, float, float } %val, 0
 ; CHECK: st.f32 [%r{{[0-9]+}}], %f[[RED]]
   store float %ret, float* %red

diff --git a/test/CodeGen/NVPTX/texsurf-queries.ll b/test/CodeGen/NVPTX/texsurf-queries.ll
new file mode 100644
index 0000000..c7637cc
--- /dev/null
+++ b/test/CodeGen/NVPTX/texsurf-queries.ll

@@ -0,0 +1,103 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=SM20
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s --check-prefix=SM30
+
+target triple = "nvptx-unknown-cuda"
+
+@tex0 = internal addrspace(1) global i64 0, align 8
+@surf0 = internal addrspace(1) global i64 0, align 8
+
+declare i32 @llvm.nvvm.txq.width(i64)
+declare i32 @llvm.nvvm.txq.height(i64)
+declare i32 @llvm.nvvm.suq.width(i64)
+declare i32 @llvm.nvvm.suq.height(i64)
+declare i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)*)
+
+
+; SM20-LABEL: @t0
+; SM30-LABEL: @t0
+define i32 @t0(i64 %texHandle) {
+; SM20: txq.width.b32
+; SM30: txq.width.b32
+  %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle)
+  ret i32 %width
+}
+
+; SM20-LABEL: @t1
+; SM30-LABEL: @t1
+define i32 @t1() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0
+  %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
+; SM20: txq.width.b32 %r{{[0-9]+}}, [tex0]
+; SM30: txq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]]
+  %width = tail call i32 @llvm.nvvm.txq.width(i64 %texHandle)
+  ret i32 %width
+}
+
+
+; SM20-LABEL: @t2
+; SM30-LABEL: @t2
+define i32 @t2(i64 %texHandle) {
+; SM20: txq.height.b32
+; SM30: txq.height.b32
+  %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle)
+  ret i32 %height
+}
+
+; SM20-LABEL: @t3
+; SM30-LABEL: @t3
+define i32 @t3() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], tex0
+  %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex0)
+; SM20: txq.height.b32 %r{{[0-9]+}}, [tex0]
+; SM30: txq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]]
+  %height = tail call i32 @llvm.nvvm.txq.height(i64 %texHandle)
+  ret i32 %height
+}
+
+
+; SM20-LABEL: @s0
+; SM30-LABEL: @s0
+define i32 @s0(i64 %surfHandle) {
+; SM20: suq.width.b32
+; SM30: suq.width.b32
+  %width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle)
+  ret i32 %width
+}
+
+; SM20-LABEL: @s1
+; SM30-LABEL: @s1
+define i32 @s1() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0
+  %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: suq.width.b32 %r{{[0-9]+}}, [surf0]
+; SM30: suq.width.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]]
+  %width = tail call i32 @llvm.nvvm.suq.width(i64 %surfHandle)
+  ret i32 %width
+}
+
+
+; SM20-LABEL: @s2
+; SM30-LABEL: @s2
+define i32 @s2(i64 %surfHandle) {
+; SM20: suq.height.b32
+; SM30: suq.height.b32
+  %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle)
+  ret i32 %height
+}
+
+; SM20-LABEL: @s3
+; SM30-LABEL: @s3
+define i32 @s3() {
+; SM30: mov.u64 %rd[[HANDLE:[0-9]+]], surf0
+  %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf0)
+; SM20: suq.height.b32 %r{{[0-9]+}}, [surf0]
+; SM30: suq.height.b32 %r{{[0-9]+}}, [%rd[[HANDLE]]]
+  %height = tail call i32 @llvm.nvvm.suq.height(i64 %surfHandle)
+  ret i32 %height
+}
+
+
+
+!nvvm.annotations = !{!1, !2}
+!1 = metadata !{i64 addrspace(1)* @tex0, metadata !"texture", i32 1}
+!2 = metadata !{i64 addrspace(1)* @surf0, metadata !"surface", i32 1}

diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll
new file mode 100644
index 0000000..a03d7fd
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-call.ll

@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-cuda"
+
+declare void @bar(<4 x i32>)
+
+; CHECK-LABEL: @foo
+define void @foo(<4 x i32> %a) {
+; CHECK: st.param.v4.b32
+  tail call void @bar(<4 x i32> %a)
+  ret void
+}

diff --git a/test/CodeGen/NVPTX/vector-return.ll b/test/CodeGen/NVPTX/vector-return.ll
new file mode 100644
index 0000000..15e50f8
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-return.ll

@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+declare <2 x float> @bar(<2 x float> %input)
+
+define void @foo(<2 x float> %input, <2 x float>* %output) {
+; CHECK-LABEL: @foo
+entry:
+  %call = tail call <2 x float> @bar(<2 x float> %input)
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: ld.param.v2.f32 {[[ELEM1:%f[0-9]+]], [[ELEM2:%f[0-9]+]]}, [retval0+0];
+  store <2 x float> %call, <2 x float>* %output, align 8
+; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEM1]], [[ELEM2]]}
+  ret void
+}

diff --git a/test/CodeGen/PowerPC/2007-09-08-unaligned.ll b/test/CodeGen/PowerPC/2007-09-08-unaligned.ll
index 898c470..bdd91f3 100644
--- a/test/CodeGen/PowerPC/2007-09-08-unaligned.ll
+++ b/test/CodeGen/PowerPC/2007-09-08-unaligned.ll

@@ -1,7 +1,7 @@
-; RUN: llc < %s | grep stfd | count 3
-; RUN: llc < %s | grep stfs | count 1
-; RUN: llc < %s | grep lfd | count 2
-; RUN: llc < %s | grep lfs | count 2
+; RUN: llc -mattr=-vsx < %s | grep stfd | count 3
+; RUN: llc -mattr=-vsx < %s | grep stfs | count 1
+; RUN: llc -mattr=-vsx < %s | grep lfd | count 2
+; RUN: llc -mattr=-vsx < %s | grep lfs | count 2
 ; ModuleID = 'foo.c'
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin8"

diff --git a/test/CodeGen/PowerPC/2012-10-12-bitcast.ll b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll
index f841c5f..fdacef2 100644
--- a/test/CodeGen/PowerPC/2012-10-12-bitcast.ll
+++ b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll

@@ -1,4 +1,5 @@
-; RUN: llc -mattr=+altivec < %s | FileCheck %s
+; RUN: llc -mattr=-vsx -mattr=+altivec -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mattr=+vsx -mattr=+altivec -mcpu=pwr7 < %s | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -18,3 +19,7 @@
 ; CHECK: lwz 3, -16(1)
 ; CHECK: blr
 
+; CHECK-VSX: addi [[REGISTER:[0-9]+]], 1, -16
+; CHECK-VSX: stxvd2x 34, 0, [[REGISTER]]
+; CHECK-VSX: lwz 3, -16(1)
+; CHECK-VSX: blr

diff --git a/test/CodeGen/PowerPC/Atomics-32.ll b/test/CodeGen/PowerPC/Atomics-32.ll
deleted file mode 100644
index b7f23b1..0000000
--- a/test/CodeGen/PowerPC/Atomics-32.ll
+++ /dev/null

@@ -1,715 +0,0 @@
-; RUN: llc < %s -march=ppc32
-target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
-target triple = "powerpc-apple-darwin9"
-
-@sc = common global i8 0
-@uc = common global i8 0
-@ss = common global i16 0
-@us = common global i16 0
-@si = common global i32 0
-@ui = common global i32 0
-@sl = common global i32 0
-@ul = common global i32 0
-@sll = common global i64 0, align 8
-@ull = common global i64 0, align 8
-
-define void @test_op_ignore() nounwind {
-entry:
-  %0 = atomicrmw add i8* @sc, i8 1 monotonic
-  %1 = atomicrmw add i8* @uc, i8 1 monotonic
-  %2 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %3 = atomicrmw add i16* %2, i16 1 monotonic
-  %4 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %5 = atomicrmw add i16* %4, i16 1 monotonic
-  %6 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %7 = atomicrmw add i32* %6, i32 1 monotonic
-  %8 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %9 = atomicrmw add i32* %8, i32 1 monotonic
-  %10 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %11 = atomicrmw add i32* %10, i32 1 monotonic
-  %12 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %13 = atomicrmw add i32* %12, i32 1 monotonic
-  %14 = atomicrmw sub i8* @sc, i8 1 monotonic
-  %15 = atomicrmw sub i8* @uc, i8 1 monotonic
-  %16 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %17 = atomicrmw sub i16* %16, i16 1 monotonic
-  %18 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %19 = atomicrmw sub i16* %18, i16 1 monotonic
-  %20 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %21 = atomicrmw sub i32* %20, i32 1 monotonic
-  %22 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %23 = atomicrmw sub i32* %22, i32 1 monotonic
-  %24 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %25 = atomicrmw sub i32* %24, i32 1 monotonic
-  %26 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %27 = atomicrmw sub i32* %26, i32 1 monotonic
-  %28 = atomicrmw or i8* @sc, i8 1 monotonic
-  %29 = atomicrmw or i8* @uc, i8 1 monotonic
-  %30 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %31 = atomicrmw or i16* %30, i16 1 monotonic
-  %32 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %33 = atomicrmw or i16* %32, i16 1 monotonic
-  %34 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %35 = atomicrmw or i32* %34, i32 1 monotonic
-  %36 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %37 = atomicrmw or i32* %36, i32 1 monotonic
-  %38 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %39 = atomicrmw or i32* %38, i32 1 monotonic
-  %40 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %41 = atomicrmw or i32* %40, i32 1 monotonic
-  %42 = atomicrmw xor i8* @sc, i8 1 monotonic
-  %43 = atomicrmw xor i8* @uc, i8 1 monotonic
-  %44 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %45 = atomicrmw xor i16* %44, i16 1 monotonic
-  %46 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %47 = atomicrmw xor i16* %46, i16 1 monotonic
-  %48 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %49 = atomicrmw xor i32* %48, i32 1 monotonic
-  %50 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %51 = atomicrmw xor i32* %50, i32 1 monotonic
-  %52 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %53 = atomicrmw xor i32* %52, i32 1 monotonic
-  %54 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %55 = atomicrmw xor i32* %54, i32 1 monotonic
-  %56 = atomicrmw and i8* @sc, i8 1 monotonic
-  %57 = atomicrmw and i8* @uc, i8 1 monotonic
-  %58 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %59 = atomicrmw and i16* %58, i16 1 monotonic
-  %60 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %61 = atomicrmw and i16* %60, i16 1 monotonic
-  %62 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %63 = atomicrmw and i32* %62, i32 1 monotonic
-  %64 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %65 = atomicrmw and i32* %64, i32 1 monotonic
-  %66 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %67 = atomicrmw and i32* %66, i32 1 monotonic
-  %68 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %69 = atomicrmw and i32* %68, i32 1 monotonic
-  %70 = atomicrmw nand i8* @sc, i8 1 monotonic
-  %71 = atomicrmw nand i8* @uc, i8 1 monotonic
-  %72 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %73 = atomicrmw nand i16* %72, i16 1 monotonic
-  %74 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %75 = atomicrmw nand i16* %74, i16 1 monotonic
-  %76 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %77 = atomicrmw nand i32* %76, i32 1 monotonic
-  %78 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %79 = atomicrmw nand i32* %78, i32 1 monotonic
-  %80 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %81 = atomicrmw nand i32* %80, i32 1 monotonic
-  %82 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %83 = atomicrmw nand i32* %82, i32 1 monotonic
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-define void @test_fetch_and_op() nounwind {
-entry:
-  %0 = atomicrmw add i8* @sc, i8 11 monotonic
-  store i8 %0, i8* @sc, align 1
-  %1 = atomicrmw add i8* @uc, i8 11 monotonic
-  store i8 %1, i8* @uc, align 1
-  %2 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %3 = atomicrmw add i16* %2, i16 11 monotonic
-  store i16 %3, i16* @ss, align 2
-  %4 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %5 = atomicrmw add i16* %4, i16 11 monotonic
-  store i16 %5, i16* @us, align 2
-  %6 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %7 = atomicrmw add i32* %6, i32 11 monotonic
-  store i32 %7, i32* @si, align 4
-  %8 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %9 = atomicrmw add i32* %8, i32 11 monotonic
-  store i32 %9, i32* @ui, align 4
-  %10 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %11 = atomicrmw add i32* %10, i32 11 monotonic
-  store i32 %11, i32* @sl, align 4
-  %12 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %13 = atomicrmw add i32* %12, i32 11 monotonic
-  store i32 %13, i32* @ul, align 4
-  %14 = atomicrmw sub i8* @sc, i8 11 monotonic
-  store i8 %14, i8* @sc, align 1
-  %15 = atomicrmw sub i8* @uc, i8 11 monotonic
-  store i8 %15, i8* @uc, align 1
-  %16 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %17 = atomicrmw sub i16* %16, i16 11 monotonic
-  store i16 %17, i16* @ss, align 2
-  %18 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %19 = atomicrmw sub i16* %18, i16 11 monotonic
-  store i16 %19, i16* @us, align 2
-  %20 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %21 = atomicrmw sub i32* %20, i32 11 monotonic
-  store i32 %21, i32* @si, align 4
-  %22 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %23 = atomicrmw sub i32* %22, i32 11 monotonic
-  store i32 %23, i32* @ui, align 4
-  %24 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %25 = atomicrmw sub i32* %24, i32 11 monotonic
-  store i32 %25, i32* @sl, align 4
-  %26 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %27 = atomicrmw sub i32* %26, i32 11 monotonic
-  store i32 %27, i32* @ul, align 4
-  %28 = atomicrmw or i8* @sc, i8 11 monotonic
-  store i8 %28, i8* @sc, align 1
-  %29 = atomicrmw or i8* @uc, i8 11 monotonic
-  store i8 %29, i8* @uc, align 1
-  %30 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %31 = atomicrmw or i16* %30, i16 11 monotonic
-  store i16 %31, i16* @ss, align 2
-  %32 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %33 = atomicrmw or i16* %32, i16 11 monotonic
-  store i16 %33, i16* @us, align 2
-  %34 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %35 = atomicrmw or i32* %34, i32 11 monotonic
-  store i32 %35, i32* @si, align 4
-  %36 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %37 = atomicrmw or i32* %36, i32 11 monotonic
-  store i32 %37, i32* @ui, align 4
-  %38 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %39 = atomicrmw or i32* %38, i32 11 monotonic
-  store i32 %39, i32* @sl, align 4
-  %40 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %41 = atomicrmw or i32* %40, i32 11 monotonic
-  store i32 %41, i32* @ul, align 4
-  %42 = atomicrmw xor i8* @sc, i8 11 monotonic
-  store i8 %42, i8* @sc, align 1
-  %43 = atomicrmw xor i8* @uc, i8 11 monotonic
-  store i8 %43, i8* @uc, align 1
-  %44 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %45 = atomicrmw xor i16* %44, i16 11 monotonic
-  store i16 %45, i16* @ss, align 2
-  %46 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %47 = atomicrmw xor i16* %46, i16 11 monotonic
-  store i16 %47, i16* @us, align 2
-  %48 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %49 = atomicrmw xor i32* %48, i32 11 monotonic
-  store i32 %49, i32* @si, align 4
-  %50 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %51 = atomicrmw xor i32* %50, i32 11 monotonic
-  store i32 %51, i32* @ui, align 4
-  %52 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %53 = atomicrmw xor i32* %52, i32 11 monotonic
-  store i32 %53, i32* @sl, align 4
-  %54 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %55 = atomicrmw xor i32* %54, i32 11 monotonic
-  store i32 %55, i32* @ul, align 4
-  %56 = atomicrmw and i8* @sc, i8 11 monotonic
-  store i8 %56, i8* @sc, align 1
-  %57 = atomicrmw and i8* @uc, i8 11 monotonic
-  store i8 %57, i8* @uc, align 1
-  %58 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %59 = atomicrmw and i16* %58, i16 11 monotonic
-  store i16 %59, i16* @ss, align 2
-  %60 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %61 = atomicrmw and i16* %60, i16 11 monotonic
-  store i16 %61, i16* @us, align 2
-  %62 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %63 = atomicrmw and i32* %62, i32 11 monotonic
-  store i32 %63, i32* @si, align 4
-  %64 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %65 = atomicrmw and i32* %64, i32 11 monotonic
-  store i32 %65, i32* @ui, align 4
-  %66 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %67 = atomicrmw and i32* %66, i32 11 monotonic
-  store i32 %67, i32* @sl, align 4
-  %68 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %69 = atomicrmw and i32* %68, i32 11 monotonic
-  store i32 %69, i32* @ul, align 4
-  %70 = atomicrmw nand i8* @sc, i8 11 monotonic
-  store i8 %70, i8* @sc, align 1
-  %71 = atomicrmw nand i8* @uc, i8 11 monotonic
-  store i8 %71, i8* @uc, align 1
-  %72 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %73 = atomicrmw nand i16* %72, i16 11 monotonic
-  store i16 %73, i16* @ss, align 2
-  %74 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %75 = atomicrmw nand i16* %74, i16 11 monotonic
-  store i16 %75, i16* @us, align 2
-  %76 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %77 = atomicrmw nand i32* %76, i32 11 monotonic
-  store i32 %77, i32* @si, align 4
-  %78 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %79 = atomicrmw nand i32* %78, i32 11 monotonic
-  store i32 %79, i32* @ui, align 4
-  %80 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %81 = atomicrmw nand i32* %80, i32 11 monotonic
-  store i32 %81, i32* @sl, align 4
-  %82 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %83 = atomicrmw nand i32* %82, i32 11 monotonic
-  store i32 %83, i32* @ul, align 4
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-define void @test_op_and_fetch() nounwind {
-entry:
-  %0 = load i8* @uc, align 1
-  %1 = atomicrmw add i8* @sc, i8 %0 monotonic
-  %2 = add i8 %1, %0
-  store i8 %2, i8* @sc, align 1
-  %3 = load i8* @uc, align 1
-  %4 = atomicrmw add i8* @uc, i8 %3 monotonic
-  %5 = add i8 %4, %3
-  store i8 %5, i8* @uc, align 1
-  %6 = load i8* @uc, align 1
-  %7 = zext i8 %6 to i16
-  %8 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %9 = atomicrmw add i16* %8, i16 %7 monotonic
-  %10 = add i16 %9, %7
-  store i16 %10, i16* @ss, align 2
-  %11 = load i8* @uc, align 1
-  %12 = zext i8 %11 to i16
-  %13 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %14 = atomicrmw add i16* %13, i16 %12 monotonic
-  %15 = add i16 %14, %12
-  store i16 %15, i16* @us, align 2
-  %16 = load i8* @uc, align 1
-  %17 = zext i8 %16 to i32
-  %18 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %19 = atomicrmw add i32* %18, i32 %17 monotonic
-  %20 = add i32 %19, %17
-  store i32 %20, i32* @si, align 4
-  %21 = load i8* @uc, align 1
-  %22 = zext i8 %21 to i32
-  %23 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %24 = atomicrmw add i32* %23, i32 %22 monotonic
-  %25 = add i32 %24, %22
-  store i32 %25, i32* @ui, align 4
-  %26 = load i8* @uc, align 1
-  %27 = zext i8 %26 to i32
-  %28 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %29 = atomicrmw add i32* %28, i32 %27 monotonic
-  %30 = add i32 %29, %27
-  store i32 %30, i32* @sl, align 4
-  %31 = load i8* @uc, align 1
-  %32 = zext i8 %31 to i32
-  %33 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %34 = atomicrmw add i32* %33, i32 %32 monotonic
-  %35 = add i32 %34, %32
-  store i32 %35, i32* @ul, align 4
-  %36 = load i8* @uc, align 1
-  %37 = atomicrmw sub i8* @sc, i8 %36 monotonic
-  %38 = sub i8 %37, %36
-  store i8 %38, i8* @sc, align 1
-  %39 = load i8* @uc, align 1
-  %40 = atomicrmw sub i8* @uc, i8 %39 monotonic
-  %41 = sub i8 %40, %39
-  store i8 %41, i8* @uc, align 1
-  %42 = load i8* @uc, align 1
-  %43 = zext i8 %42 to i16
-  %44 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %45 = atomicrmw sub i16* %44, i16 %43 monotonic
-  %46 = sub i16 %45, %43
-  store i16 %46, i16* @ss, align 2
-  %47 = load i8* @uc, align 1
-  %48 = zext i8 %47 to i16
-  %49 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %50 = atomicrmw sub i16* %49, i16 %48 monotonic
-  %51 = sub i16 %50, %48
-  store i16 %51, i16* @us, align 2
-  %52 = load i8* @uc, align 1
-  %53 = zext i8 %52 to i32
-  %54 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %55 = atomicrmw sub i32* %54, i32 %53 monotonic
-  %56 = sub i32 %55, %53
-  store i32 %56, i32* @si, align 4
-  %57 = load i8* @uc, align 1
-  %58 = zext i8 %57 to i32
-  %59 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %60 = atomicrmw sub i32* %59, i32 %58 monotonic
-  %61 = sub i32 %60, %58
-  store i32 %61, i32* @ui, align 4
-  %62 = load i8* @uc, align 1
-  %63 = zext i8 %62 to i32
-  %64 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %65 = atomicrmw sub i32* %64, i32 %63 monotonic
-  %66 = sub i32 %65, %63
-  store i32 %66, i32* @sl, align 4
-  %67 = load i8* @uc, align 1
-  %68 = zext i8 %67 to i32
-  %69 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %70 = atomicrmw sub i32* %69, i32 %68 monotonic
-  %71 = sub i32 %70, %68
-  store i32 %71, i32* @ul, align 4
-  %72 = load i8* @uc, align 1
-  %73 = atomicrmw or i8* @sc, i8 %72 monotonic
-  %74 = or i8 %73, %72
-  store i8 %74, i8* @sc, align 1
-  %75 = load i8* @uc, align 1
-  %76 = atomicrmw or i8* @uc, i8 %75 monotonic
-  %77 = or i8 %76, %75
-  store i8 %77, i8* @uc, align 1
-  %78 = load i8* @uc, align 1
-  %79 = zext i8 %78 to i16
-  %80 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %81 = atomicrmw or i16* %80, i16 %79 monotonic
-  %82 = or i16 %81, %79
-  store i16 %82, i16* @ss, align 2
-  %83 = load i8* @uc, align 1
-  %84 = zext i8 %83 to i16
-  %85 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %86 = atomicrmw or i16* %85, i16 %84 monotonic
-  %87 = or i16 %86, %84
-  store i16 %87, i16* @us, align 2
-  %88 = load i8* @uc, align 1
-  %89 = zext i8 %88 to i32
-  %90 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %91 = atomicrmw or i32* %90, i32 %89 monotonic
-  %92 = or i32 %91, %89
-  store i32 %92, i32* @si, align 4
-  %93 = load i8* @uc, align 1
-  %94 = zext i8 %93 to i32
-  %95 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %96 = atomicrmw or i32* %95, i32 %94 monotonic
-  %97 = or i32 %96, %94
-  store i32 %97, i32* @ui, align 4
-  %98 = load i8* @uc, align 1
-  %99 = zext i8 %98 to i32
-  %100 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %101 = atomicrmw or i32* %100, i32 %99 monotonic
-  %102 = or i32 %101, %99
-  store i32 %102, i32* @sl, align 4
-  %103 = load i8* @uc, align 1
-  %104 = zext i8 %103 to i32
-  %105 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %106 = atomicrmw or i32* %105, i32 %104 monotonic
-  %107 = or i32 %106, %104
-  store i32 %107, i32* @ul, align 4
-  %108 = load i8* @uc, align 1
-  %109 = atomicrmw xor i8* @sc, i8 %108 monotonic
-  %110 = xor i8 %109, %108
-  store i8 %110, i8* @sc, align 1
-  %111 = load i8* @uc, align 1
-  %112 = atomicrmw xor i8* @uc, i8 %111 monotonic
-  %113 = xor i8 %112, %111
-  store i8 %113, i8* @uc, align 1
-  %114 = load i8* @uc, align 1
-  %115 = zext i8 %114 to i16
-  %116 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %117 = atomicrmw xor i16* %116, i16 %115 monotonic
-  %118 = xor i16 %117, %115
-  store i16 %118, i16* @ss, align 2
-  %119 = load i8* @uc, align 1
-  %120 = zext i8 %119 to i16
-  %121 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %122 = atomicrmw xor i16* %121, i16 %120 monotonic
-  %123 = xor i16 %122, %120
-  store i16 %123, i16* @us, align 2
-  %124 = load i8* @uc, align 1
-  %125 = zext i8 %124 to i32
-  %126 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %127 = atomicrmw xor i32* %126, i32 %125 monotonic
-  %128 = xor i32 %127, %125
-  store i32 %128, i32* @si, align 4
-  %129 = load i8* @uc, align 1
-  %130 = zext i8 %129 to i32
-  %131 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %132 = atomicrmw xor i32* %131, i32 %130 monotonic
-  %133 = xor i32 %132, %130
-  store i32 %133, i32* @ui, align 4
-  %134 = load i8* @uc, align 1
-  %135 = zext i8 %134 to i32
-  %136 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %137 = atomicrmw xor i32* %136, i32 %135 monotonic
-  %138 = xor i32 %137, %135
-  store i32 %138, i32* @sl, align 4
-  %139 = load i8* @uc, align 1
-  %140 = zext i8 %139 to i32
-  %141 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %142 = atomicrmw xor i32* %141, i32 %140 monotonic
-  %143 = xor i32 %142, %140
-  store i32 %143, i32* @ul, align 4
-  %144 = load i8* @uc, align 1
-  %145 = atomicrmw and i8* @sc, i8 %144 monotonic
-  %146 = and i8 %145, %144
-  store i8 %146, i8* @sc, align 1
-  %147 = load i8* @uc, align 1
-  %148 = atomicrmw and i8* @uc, i8 %147 monotonic
-  %149 = and i8 %148, %147
-  store i8 %149, i8* @uc, align 1
-  %150 = load i8* @uc, align 1
-  %151 = zext i8 %150 to i16
-  %152 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %153 = atomicrmw and i16* %152, i16 %151 monotonic
-  %154 = and i16 %153, %151
-  store i16 %154, i16* @ss, align 2
-  %155 = load i8* @uc, align 1
-  %156 = zext i8 %155 to i16
-  %157 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %158 = atomicrmw and i16* %157, i16 %156 monotonic
-  %159 = and i16 %158, %156
-  store i16 %159, i16* @us, align 2
-  %160 = load i8* @uc, align 1
-  %161 = zext i8 %160 to i32
-  %162 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %163 = atomicrmw and i32* %162, i32 %161 monotonic
-  %164 = and i32 %163, %161
-  store i32 %164, i32* @si, align 4
-  %165 = load i8* @uc, align 1
-  %166 = zext i8 %165 to i32
-  %167 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %168 = atomicrmw and i32* %167, i32 %166 monotonic
-  %169 = and i32 %168, %166
-  store i32 %169, i32* @ui, align 4
-  %170 = load i8* @uc, align 1
-  %171 = zext i8 %170 to i32
-  %172 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %173 = atomicrmw and i32* %172, i32 %171 monotonic
-  %174 = and i32 %173, %171
-  store i32 %174, i32* @sl, align 4
-  %175 = load i8* @uc, align 1
-  %176 = zext i8 %175 to i32
-  %177 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %178 = atomicrmw and i32* %177, i32 %176 monotonic
-  %179 = and i32 %178, %176
-  store i32 %179, i32* @ul, align 4
-  %180 = load i8* @uc, align 1
-  %181 = atomicrmw nand i8* @sc, i8 %180 monotonic
-  %182 = xor i8 %181, -1
-  %183 = and i8 %182, %180
-  store i8 %183, i8* @sc, align 1
-  %184 = load i8* @uc, align 1
-  %185 = atomicrmw nand i8* @uc, i8 %184 monotonic
-  %186 = xor i8 %185, -1
-  %187 = and i8 %186, %184
-  store i8 %187, i8* @uc, align 1
-  %188 = load i8* @uc, align 1
-  %189 = zext i8 %188 to i16
-  %190 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %191 = atomicrmw nand i16* %190, i16 %189 monotonic
-  %192 = xor i16 %191, -1
-  %193 = and i16 %192, %189
-  store i16 %193, i16* @ss, align 2
-  %194 = load i8* @uc, align 1
-  %195 = zext i8 %194 to i16
-  %196 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %197 = atomicrmw nand i16* %196, i16 %195 monotonic
-  %198 = xor i16 %197, -1
-  %199 = and i16 %198, %195
-  store i16 %199, i16* @us, align 2
-  %200 = load i8* @uc, align 1
-  %201 = zext i8 %200 to i32
-  %202 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %203 = atomicrmw nand i32* %202, i32 %201 monotonic
-  %204 = xor i32 %203, -1
-  %205 = and i32 %204, %201
-  store i32 %205, i32* @si, align 4
-  %206 = load i8* @uc, align 1
-  %207 = zext i8 %206 to i32
-  %208 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %209 = atomicrmw nand i32* %208, i32 %207 monotonic
-  %210 = xor i32 %209, -1
-  %211 = and i32 %210, %207
-  store i32 %211, i32* @ui, align 4
-  %212 = load i8* @uc, align 1
-  %213 = zext i8 %212 to i32
-  %214 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %215 = atomicrmw nand i32* %214, i32 %213 monotonic
-  %216 = xor i32 %215, -1
-  %217 = and i32 %216, %213
-  store i32 %217, i32* @sl, align 4
-  %218 = load i8* @uc, align 1
-  %219 = zext i8 %218 to i32
-  %220 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %221 = atomicrmw nand i32* %220, i32 %219 monotonic
-  %222 = xor i32 %221, -1
-  %223 = and i32 %222, %219
-  store i32 %223, i32* @ul, align 4
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-define void @test_compare_and_swap() nounwind {
-entry:
-  %0 = load i8* @uc, align 1
-  %1 = load i8* @sc, align 1
-  %pair2 = cmpxchg i8* @sc, i8 %0, i8 %1 monotonic monotonic
-  %2 = extractvalue { i8, i1 } %pair2, 0
-  store i8 %2, i8* @sc, align 1
-  %3 = load i8* @uc, align 1
-  %4 = load i8* @sc, align 1
-  %pair5 = cmpxchg i8* @uc, i8 %3, i8 %4 monotonic monotonic
-  %5 = extractvalue { i8, i1 } %pair5, 0
-  store i8 %5, i8* @uc, align 1
-  %6 = load i8* @uc, align 1
-  %7 = zext i8 %6 to i16
-  %8 = load i8* @sc, align 1
-  %9 = sext i8 %8 to i16
-  %10 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %pair11 = cmpxchg i16* %10, i16 %7, i16 %9 monotonic monotonic
-  %11 = extractvalue { i16, i1 } %pair11, 0
-  store i16 %11, i16* @ss, align 2
-  %12 = load i8* @uc, align 1
-  %13 = zext i8 %12 to i16
-  %14 = load i8* @sc, align 1
-  %15 = sext i8 %14 to i16
-  %16 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %pair17 = cmpxchg i16* %16, i16 %13, i16 %15 monotonic monotonic
-  %17 = extractvalue { i16, i1 } %pair17, 0
-  store i16 %17, i16* @us, align 2
-  %18 = load i8* @uc, align 1
-  %19 = zext i8 %18 to i32
-  %20 = load i8* @sc, align 1
-  %21 = sext i8 %20 to i32
-  %22 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %pair23 = cmpxchg i32* %22, i32 %19, i32 %21 monotonic monotonic
-  %23 = extractvalue { i32, i1 } %pair23, 0
-  store i32 %23, i32* @si, align 4
-  %24 = load i8* @uc, align 1
-  %25 = zext i8 %24 to i32
-  %26 = load i8* @sc, align 1
-  %27 = sext i8 %26 to i32
-  %28 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %pair29 = cmpxchg i32* %28, i32 %25, i32 %27 monotonic monotonic
-  %29 = extractvalue { i32, i1 } %pair29, 0
-  store i32 %29, i32* @ui, align 4
-  %30 = load i8* @uc, align 1
-  %31 = zext i8 %30 to i32
-  %32 = load i8* @sc, align 1
-  %33 = sext i8 %32 to i32
-  %34 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %pair35 = cmpxchg i32* %34, i32 %31, i32 %33 monotonic monotonic
-  %35 = extractvalue { i32, i1 } %pair35, 0
-  store i32 %35, i32* @sl, align 4
-  %36 = load i8* @uc, align 1
-  %37 = zext i8 %36 to i32
-  %38 = load i8* @sc, align 1
-  %39 = sext i8 %38 to i32
-  %40 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %pair41 = cmpxchg i32* %40, i32 %37, i32 %39 monotonic monotonic
-  %41 = extractvalue { i32, i1 } %pair41, 0
-  store i32 %41, i32* @ul, align 4
-  %42 = load i8* @uc, align 1
-  %43 = load i8* @sc, align 1
-  %pair44 = cmpxchg i8* @sc, i8 %42, i8 %43 monotonic monotonic
-  %44 = extractvalue { i8, i1 } %pair44, 0
-  %45 = icmp eq i8 %44, %42
-  %46 = zext i1 %45 to i32
-  store i32 %46, i32* @ui, align 4
-  %47 = load i8* @uc, align 1
-  %48 = load i8* @sc, align 1
-  %pair49 = cmpxchg i8* @uc, i8 %47, i8 %48 monotonic monotonic
-  %49 = extractvalue { i8, i1 } %pair49, 0
-  %50 = icmp eq i8 %49, %47
-  %51 = zext i1 %50 to i32
-  store i32 %51, i32* @ui, align 4
-  %52 = load i8* @uc, align 1
-  %53 = zext i8 %52 to i16
-  %54 = load i8* @sc, align 1
-  %55 = sext i8 %54 to i16
-  %56 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %pair57 = cmpxchg i16* %56, i16 %53, i16 %55 monotonic monotonic
-  %57 = extractvalue { i16, i1 } %pair57, 0
-  %58 = icmp eq i16 %57, %53
-  %59 = zext i1 %58 to i32
-  store i32 %59, i32* @ui, align 4
-  %60 = load i8* @uc, align 1
-  %61 = zext i8 %60 to i16
-  %62 = load i8* @sc, align 1
-  %63 = sext i8 %62 to i16
-  %64 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %pair65 = cmpxchg i16* %64, i16 %61, i16 %63 monotonic monotonic
-  %65 = extractvalue { i16, i1 } %pair65, 0
-  %66 = icmp eq i16 %65, %61
-  %67 = zext i1 %66 to i32
-  store i32 %67, i32* @ui, align 4
-  %68 = load i8* @uc, align 1
-  %69 = zext i8 %68 to i32
-  %70 = load i8* @sc, align 1
-  %71 = sext i8 %70 to i32
-  %72 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %pair73 = cmpxchg i32* %72, i32 %69, i32 %71 monotonic monotonic
-  %73 = extractvalue { i32, i1 } %pair73, 0
-  %74 = icmp eq i32 %73, %69
-  %75 = zext i1 %74 to i32
-  store i32 %75, i32* @ui, align 4
-  %76 = load i8* @uc, align 1
-  %77 = zext i8 %76 to i32
-  %78 = load i8* @sc, align 1
-  %79 = sext i8 %78 to i32
-  %80 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %pair81 = cmpxchg i32* %80, i32 %77, i32 %79 monotonic monotonic
-  %81 = extractvalue { i32, i1 } %pair81, 0
-  %82 = icmp eq i32 %81, %77
-  %83 = zext i1 %82 to i32
-  store i32 %83, i32* @ui, align 4
-  %84 = load i8* @uc, align 1
-  %85 = zext i8 %84 to i32
-  %86 = load i8* @sc, align 1
-  %87 = sext i8 %86 to i32
-  %88 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %pair89 = cmpxchg i32* %88, i32 %85, i32 %87 monotonic monotonic
-  %89 = extractvalue { i32, i1 } %pair89, 0
-  %90 = icmp eq i32 %89, %85
-  %91 = zext i1 %90 to i32
-  store i32 %91, i32* @ui, align 4
-  %92 = load i8* @uc, align 1
-  %93 = zext i8 %92 to i32
-  %94 = load i8* @sc, align 1
-  %95 = sext i8 %94 to i32
-  %96 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %pair97 = cmpxchg i32* %96, i32 %93, i32 %95 monotonic monotonic
-  %97 = extractvalue { i32, i1 } %pair97, 0
-  %98 = icmp eq i32 %97, %93
-  %99 = zext i1 %98 to i32
-  store i32 %99, i32* @ui, align 4
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}
-
-define void @test_lock() nounwind {
-entry:
-  %0 = atomicrmw xchg i8* @sc, i8 1 monotonic
-  store i8 %0, i8* @sc, align 1
-  %1 = atomicrmw xchg i8* @uc, i8 1 monotonic
-  store i8 %1, i8* @uc, align 1
-  %2 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  %3 = atomicrmw xchg i16* %2, i16 1 monotonic
-  store i16 %3, i16* @ss, align 2
-  %4 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  %5 = atomicrmw xchg i16* %4, i16 1 monotonic
-  store i16 %5, i16* @us, align 2
-  %6 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  %7 = atomicrmw xchg i32* %6, i32 1 monotonic
-  store i32 %7, i32* @si, align 4
-  %8 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  %9 = atomicrmw xchg i32* %8, i32 1 monotonic
-  store i32 %9, i32* @ui, align 4
-  %10 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  %11 = atomicrmw xchg i32* %10, i32 1 monotonic
-  store i32 %11, i32* @sl, align 4
-  %12 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  %13 = atomicrmw xchg i32* %12, i32 1 monotonic
-  store i32 %13, i32* @ul, align 4
-  fence seq_cst
-  store volatile i8 0, i8* @sc, align 1
-  store volatile i8 0, i8* @uc, align 1
-  %14 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
-  store volatile i16 0, i16* %14, align 2
-  %15 = bitcast i8* bitcast (i16* @us to i8*) to i16*
-  store volatile i16 0, i16* %15, align 2
-  %16 = bitcast i8* bitcast (i32* @si to i8*) to i32*
-  store volatile i32 0, i32* %16, align 4
-  %17 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
-  store volatile i32 0, i32* %17, align 4
-  %18 = bitcast i8* bitcast (i32* @sl to i8*) to i32*
-  store volatile i32 0, i32* %18, align 4
-  %19 = bitcast i8* bitcast (i32* @ul to i8*) to i32*
-  store volatile i32 0, i32* %19, align 4
-  %20 = bitcast i8* bitcast (i64* @sll to i8*) to i64*
-  store volatile i64 0, i64* %20, align 8
-  %21 = bitcast i8* bitcast (i64* @ull to i8*) to i64*
-  store volatile i64 0, i64* %21, align 8
-  br label %return
-
-return:                                           ; preds = %entry
-  ret void
-}

diff --git a/test/CodeGen/PowerPC/anon_aggr.ll b/test/CodeGen/PowerPC/anon_aggr.ll
index 3bae5c6..6c4f140 100644
--- a/test/CodeGen/PowerPC/anon_aggr.ll
+++ b/test/CodeGen/PowerPC/anon_aggr.ll

@@ -62,8 +62,7 @@
 }
 
 ; CHECK-LABEL: func2:
-; CHECK: addi [[REG1:[0-9]+]], 1, 64
-; CHECK: ld [[REG2:[0-9]+]], 8([[REG1]])
+; CHECK: ld [[REG2:[0-9]+]], 72(1)
 ; CHECK: cmpld {{[0-9]+}}, 4, [[REG2]]
 ; CHECK-DAG: std [[REG2]], -[[OFFSET1:[0-9]+]]
 ; CHECK-DAG: std 4, -[[OFFSET2:[0-9]+]]
@@ -82,8 +81,7 @@
 ; DARWIN32: lwz r3, -[[OFFSET2]]
 
 ; DARWIN64: _func2:
-; DARWIN64: addi r[[REG1:[0-9]+]], r1, 64
-; DARWIN64: ld r[[REG2:[0-9]+]], 8(r[[REG1]])
+; DARWIN64: ld r[[REG2:[0-9]+]], 72(r1)
 ; DARWIN64: mr
 ; DARWIN64: mr r[[REG3:[0-9]+]], r[[REGA:[0-9]+]]
 ; DARWIN64: cmpld cr{{[0-9]+}}, r[[REGA]], r[[REG2]]
@@ -108,10 +106,8 @@
 }
 
 ; CHECK-LABEL: func3:
-; CHECK: addi [[REG1:[0-9]+]], 1, 64
-; CHECK: addi [[REG2:[0-9]+]], 1, 48
-; CHECK: ld [[REG3:[0-9]+]], 8([[REG1]])
-; CHECK: ld [[REG4:[0-9]+]], 8([[REG2]])
+; CHECK: ld [[REG3:[0-9]+]], 72(1)
+; CHECK: ld [[REG4:[0-9]+]], 56(1)
 ; CHECK: cmpld {{[0-9]+}}, [[REG4]], [[REG3]]
 ; CHECK: std [[REG3]], -[[OFFSET1:[0-9]+]](1)
 ; CHECK: std [[REG4]], -[[OFFSET2:[0-9]+]](1)
@@ -130,10 +126,8 @@
 ; DARWIN32: lwz r3, -[[OFFSET1]]
 
 ; DARWIN64: _func3:
-; DARWIN64: addi r[[REG1:[0-9]+]], r1, 64
-; DARWIN64: addi r[[REG2:[0-9]+]], r1, 48
-; DARWIN64: ld r[[REG3:[0-9]+]], 8(r[[REG1]])
-; DARWIN64: ld r[[REG4:[0-9]+]], 8(r[[REG2]])
+; DARWIN64: ld r[[REG3:[0-9]+]], 72(r1)
+; DARWIN64: ld r[[REG4:[0-9]+]], 56(r1)
 ; DARWIN64: cmpld cr{{[0-9]+}}, r[[REG4]], r[[REG3]]
 ; DARWIN64: std r[[REG3]], -[[OFFSET1:[0-9]+]]
 ; DARWIN64: std r[[REG4]], -[[OFFSET2:[0-9]+]]
@@ -157,12 +151,11 @@
 }
 
 ; CHECK-LABEL: func4:
-; CHECK: addi [[REG1:[0-9]+]], 1, 128
+; CHECK: ld [[REG3:[0-9]+]], 136(1)
 ; CHECK: ld [[REG2:[0-9]+]], 120(1)
-; CHECK: ld [[REG3:[0-9]+]], 8([[REG1]])
 ; CHECK: cmpld {{[0-9]+}}, [[REG2]], [[REG3]]
-; CHECK: std [[REG2]], -[[OFFSET1:[0-9]+]](1)
 ; CHECK: std [[REG3]], -[[OFFSET2:[0-9]+]](1)
+; CHECK: std [[REG2]], -[[OFFSET1:[0-9]+]](1)
 ; CHECK: ld 3, -[[OFFSET1]](1)
 ; CHECK: ld 3, -[[OFFSET2]](1)
 
@@ -178,9 +171,8 @@
 ; DARWIN32: lwz r[[REG1]], -[[OFFSET2]]
 
 ; DARWIN64: _func4:
-; DARWIN64: addi r[[REG1:[0-9]+]], r1, 128
 ; DARWIN64: ld r[[REG2:[0-9]+]], 120(r1)
-; DARWIN64: ld r[[REG3:[0-9]+]], 8(r[[REG1]])
+; DARWIN64: ld r[[REG3:[0-9]+]], 136(r1)
 ; DARWIN64: mr r[[REG4:[0-9]+]], r[[REG2]]
 ; DARWIN64: cmpld cr{{[0-9]+}}, r[[REG2]], r[[REG3]]
 ; DARWIN64: std r[[REG4]], -[[OFFSET1:[0-9]+]]

diff --git a/test/CodeGen/PowerPC/asm-constraints.ll b/test/CodeGen/PowerPC/asm-constraints.ll
new file mode 100644
index 0000000..998b618
--- /dev/null
+++ b/test/CodeGen/PowerPC/asm-constraints.ll

@@ -0,0 +1,44 @@
+; RUN: llc < %s -mcpu=pwr8 | FileCheck %s
+
+; Generated from following C code:
+;
+; void foo (int result, char *addr) {
+;   __asm__ __volatile__ (
+;     "ld%U1%X1 %0,%1\n"
+;     "cmpw %0,%0\n"
+;     "bne- 1f\n"
+;     "1: isync\n"
+;     : "=r" (result)
+;     : "m"(*addr) : "memory", "cr0");
+; }
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+; Check that we accept 'U' and 'X' constraints.
+define void @foo(i32 signext %result, i8* %addr) #0 {
+entry:
+  %result.addr = alloca i32, align 4
+  %addr.addr = alloca i8*, align 8
+  store i32 %result, i32* %result.addr, align 4
+  store i8* %addr, i8** %addr.addr, align 8
+  %0 = load i8** %addr.addr, align 8
+  %1 = call i32 asm sideeffect "ld${1:U}${1:X} $0,$1\0Acmpw $0,$0\0Abne- 1f\0A1: isync\0A", "=r,*m,~{memory},~{cr0}"(i8* %0) #1, !srcloc !1
+  store i32 %1, i32* %result.addr, align 4
+  ret void
+}
+
+; CHECK-LABEL: @foo
+; CHECK: ld [[REG:[0-9]+]],0(4)
+; CHECK-NEXT: cmpw [[REG]],[[REG]]
+; CHECK-NEXT: bne- 1f
+; CHECK-NEXT: 1: isync
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 (trunk 217557)"}
+!1 = metadata !{i32 67, i32 91, i32 110, i32 126}

diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index 843250f..9cb0fa5 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll

@@ -30,8 +30,9 @@
 entry:
 ; CHECK: @atomic_store
   store atomic i64 %val, i64* %mem release, align 64
-; CHECK: ldarx
-; CHECK: stdcx.
+; CHECK: sync 1
+; CHECK-NOT: stdcx
+; CHECK: std
   ret void
 }
 
@@ -39,9 +40,9 @@
 entry:
 ; CHECK: @atomic_load
   %tmp = load atomic i64* %mem acquire, align 64
-; CHECK: ldarx
-; CHECK: stdcx.
-; CHECK: stdcx.
+; CHECK-NOT: ldarx
+; CHECK: ld
+; CHECK: sync 1
   ret i64 %tmp
 }
 

diff --git a/test/CodeGen/PowerPC/atomics-fences.ll b/test/CodeGen/PowerPC/atomics-fences.ll
new file mode 100644
index 0000000..862bd17
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics-fences.ll

@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -mcpu=440 | FileCheck %s --check-prefix=PPC440
+
+; Fences
+define void @fence_acquire() {
+; CHECK-LABEL: fence_acquire
+; CHECK: sync 1
+; PPC440-NOT: sync 1
+; PPC440: msync
+  fence acquire
+  ret void
+}
+define void @fence_release() {
+; CHECK-LABEL: fence_release
+; CHECK: sync 1
+; PPC440-NOT: sync 1
+; PPC440: msync
+  fence release
+  ret void
+}
+define void @fence_seq_cst() {
+; CHECK-LABEL: fence_seq_cst
+; CHECK: sync 0
+; PPC440-NOT: sync 0
+; PPC440: msync
+  fence seq_cst
+  ret void
+}

diff --git a/test/CodeGen/PowerPC/atomics-indexed.ll b/test/CodeGen/PowerPC/atomics-indexed.ll
new file mode 100644
index 0000000..bb9ca04
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics-indexed.ll

@@ -0,0 +1,81 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
+; FIXME: -verify-machineinstrs currently fails on ppc64 (mismatched register/instruction).
+; This is already checked for in Atomics-64.ll.
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64
+
+; In this file, we check that atomic load/store can make use of the indexed
+; versions of the instructions.
+
+; Indexed version of loads
+define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) {
+; CHECK-LABEL: load_x_i8_seq_cst
+; CHECK: sync 0
+; CHECK: lbzx
+; CHECK: sync 1
+  %ptr = getelementptr inbounds [100000 x i8]* %mem, i64 0, i64 90000
+  %val = load atomic i8* %ptr seq_cst, align 1
+  ret i8 %val
+}
+define i16 @load_x_i16_acquire([100000 x i16]* %mem) {
+; CHECK-LABEL: load_x_i16_acquire
+; CHECK: lhzx
+; CHECK: sync 1
+  %ptr = getelementptr inbounds [100000 x i16]* %mem, i64 0, i64 90000
+  %val = load atomic i16* %ptr acquire, align 2
+  ret i16 %val
+}
+define i32 @load_x_i32_monotonic([100000 x i32]* %mem) {
+; CHECK-LABEL: load_x_i32_monotonic
+; CHECK: lwzx
+; CHECK-NOT: sync
+  %ptr = getelementptr inbounds [100000 x i32]* %mem, i64 0, i64 90000
+  %val = load atomic i32* %ptr monotonic, align 4
+  ret i32 %val
+}
+define i64 @load_x_i64_unordered([100000 x i64]* %mem) {
+; CHECK-LABEL: load_x_i64_unordered
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: ldx
+; CHECK-NOT: sync
+  %ptr = getelementptr inbounds [100000 x i64]* %mem, i64 0, i64 90000
+  %val = load atomic i64* %ptr unordered, align 8
+  ret i64 %val
+}
+
+; Indexed version of stores
+define void @store_x_i8_seq_cst([100000 x i8]* %mem) {
+; CHECK-LABEL: store_x_i8_seq_cst
+; CHECK: sync 0
+; CHECK: stbx
+  %ptr = getelementptr inbounds [100000 x i8]* %mem, i64 0, i64 90000
+  store atomic i8 42, i8* %ptr seq_cst, align 1
+  ret void
+}
+define void @store_x_i16_release([100000 x i16]* %mem) {
+; CHECK-LABEL: store_x_i16_release
+; CHECK: sync 1
+; CHECK: sthx
+  %ptr = getelementptr inbounds [100000 x i16]* %mem, i64 0, i64 90000
+  store atomic i16 42, i16* %ptr release, align 2
+  ret void
+}
+define void @store_x_i32_monotonic([100000 x i32]* %mem) {
+; CHECK-LABEL: store_x_i32_monotonic
+; CHECK-NOT: sync
+; CHECK: stwx
+  %ptr = getelementptr inbounds [100000 x i32]* %mem, i64 0, i64 90000
+  store atomic i32 42, i32* %ptr monotonic, align 4
+  ret void
+}
+define void @store_x_i64_unordered([100000 x i64]* %mem) {
+; CHECK-LABEL: store_x_i64_unordered
+; CHECK-NOT: sync 0
+; CHECK-NOT: sync 1
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: stdx
+  %ptr = getelementptr inbounds [100000 x i64]* %mem, i64 0, i64 90000
+  store atomic i64 42, i64* %ptr unordered, align 8
+  ret void
+}

diff --git a/test/CodeGen/PowerPC/atomics.ll b/test/CodeGen/PowerPC/atomics.ll
new file mode 100644
index 0000000..5f6a6a4
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics.ll

@@ -0,0 +1,137 @@
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc32 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=PPC32
+; FIXME: -verify-machineinstrs currently fails on ppc64 (mismatched register/instruction).
+; This is already checked for in Atomics-64.ll.
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -march=ppc64 | FileCheck %s --check-prefix=CHECK --check-prefix=PPC64
+
+; FIXME: we don't currently check for the operations themselves with CHECK-NEXT,
+;   because they are implemented in a very messy way with lwarx/stwcx.
+;   It should be fixed soon in another patch.
+
+; We first check loads, for all sizes from i8 to i64.
+; We also vary orderings to check for barriers.
+define i8 @load_i8_unordered(i8* %mem) {
+; CHECK-LABEL: load_i8_unordered
+; CHECK: lbz
+; CHECK-NOT: sync
+  %val = load atomic i8* %mem unordered, align 1
+  ret i8 %val
+}
+define i16 @load_i16_monotonic(i16* %mem) {
+; CHECK-LABEL: load_i16_monotonic
+; CHECK: lhz
+; CHECK-NOT: sync
+  %val = load atomic i16* %mem monotonic, align 2
+  ret i16 %val
+}
+define i32 @load_i32_acquire(i32* %mem) {
+; CHECK-LABEL: load_i32_acquire
+; CHECK: lwz
+  %val = load atomic i32* %mem acquire, align 4
+; CHECK: sync 1
+  ret i32 %val
+}
+define i64 @load_i64_seq_cst(i64* %mem) {
+; CHECK-LABEL: load_i64_seq_cst
+; CHECK: sync 0
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: ld
+  %val = load atomic i64* %mem seq_cst, align 8
+; CHECK: sync 1
+  ret i64 %val
+}
+
+; Stores
+define void @store_i8_unordered(i8* %mem) {
+; CHECK-LABEL: store_i8_unordered
+; CHECK-NOT: sync
+; CHECK: stb
+  store atomic i8 42, i8* %mem unordered, align 1
+  ret void
+}
+define void @store_i16_monotonic(i16* %mem) {
+; CHECK-LABEL: store_i16_monotonic
+; CHECK-NOT: sync
+; CHECK: sth
+  store atomic i16 42, i16* %mem monotonic, align 2
+  ret void
+}
+define void @store_i32_release(i32* %mem) {
+; CHECK-LABEL: store_i32_release
+; CHECK: sync 1
+; CHECK: stw
+  store atomic i32 42, i32* %mem release, align 4
+  ret void
+}
+define void @store_i64_seq_cst(i64* %mem) {
+; CHECK-LABEL: store_i64_seq_cst
+; CHECK: sync 0
+; PPC32: __sync_
+; PPC64-NOT: __sync_
+; PPC64: std
+  store atomic i64 42, i64* %mem seq_cst, align 8
+  ret void
+}
+
+; Atomic CmpXchg
+define i8 @cas_strong_i8_sc_sc(i8* %mem) {
+; CHECK-LABEL: cas_strong_i8_sc_sc
+; CHECK: sync 0
+  %val = cmpxchg i8* %mem, i8 0, i8 1 seq_cst seq_cst
+; CHECK: sync 1
+  %loaded = extractvalue { i8, i1} %val, 0
+  ret i8 %loaded
+}
+define i16 @cas_weak_i16_acquire_acquire(i16* %mem) {
+; CHECK-LABEL: cas_weak_i16_acquire_acquire
+; CHECK-NOT: sync
+  %val = cmpxchg weak i16* %mem, i16 0, i16 1 acquire acquire
+; CHECK: sync 1
+  %loaded = extractvalue { i16, i1} %val, 0
+  ret i16 %loaded
+}
+define i32 @cas_strong_i32_acqrel_acquire(i32* %mem) {
+; CHECK-LABEL: cas_strong_i32_acqrel_acquire
+; CHECK: sync 1
+  %val = cmpxchg i32* %mem, i32 0, i32 1 acq_rel acquire
+; CHECK: sync 1
+  %loaded = extractvalue { i32, i1} %val, 0
+  ret i32 %loaded
+}
+define i64 @cas_weak_i64_release_monotonic(i64* %mem) {
+; CHECK-LABEL: cas_weak_i64_release_monotonic
+; CHECK: sync 1
+  %val = cmpxchg weak i64* %mem, i64 0, i64 1 release monotonic
+; CHECK-NOT: [sync ]
+  %loaded = extractvalue { i64, i1} %val, 0
+  ret i64 %loaded
+}
+
+; AtomicRMW
+define i8 @add_i8_monotonic(i8* %mem, i8 %operand) {
+; CHECK-LABEL: add_i8_monotonic
+; CHECK-NOT: sync
+  %val = atomicrmw add i8* %mem, i8 %operand monotonic
+  ret i8 %val
+}
+define i16 @xor_i16_seq_cst(i16* %mem, i16 %operand) {
+; CHECK-LABEL: xor_i16_seq_cst
+; CHECK: sync 0
+  %val = atomicrmw xor i16* %mem, i16 %operand seq_cst
+; CHECK: sync 1
+  ret i16 %val
+}
+define i32 @xchg_i32_acq_rel(i32* %mem, i32 %operand) {
+; CHECK-LABEL: xchg_i32_acq_rel
+; CHECK: sync 1
+  %val = atomicrmw xchg i32* %mem, i32 %operand acq_rel
+; CHECK: sync 1
+  ret i32 %val
+}
+define i64 @and_i64_release(i64* %mem, i64 %operand) {
+; CHECK-LABEL: and_i64_release
+; CHECK: sync 1
+  %val = atomicrmw and i64* %mem, i64 %operand release
+; CHECK-NOT: [sync ]
+  ret i64 %val
+}

diff --git a/test/CodeGen/PowerPC/available-externally.ll b/test/CodeGen/PowerPC/available-externally.ll
index abed0de..53c4359 100644
--- a/test/CodeGen/PowerPC/available-externally.ll
+++ b/test/CodeGen/PowerPC/available-externally.ll

@@ -1,7 +1,8 @@
 ; RUN: llc < %s -relocation-model=static | FileCheck %s -check-prefix=STATIC
-; RUN: llc < %s -relocation-model=pic | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -relocation-model=pic -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -relocation-model=pic -mtriple=powerpc-unknown-linux | FileCheck %s -check-prefix=PICELF
 ; RUN: llc < %s -relocation-model=pic -mtriple=powerpc64-apple-darwin8 | FileCheck %s -check-prefix=PIC64
-; RUN: llc < %s -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=DYNAMIC
+; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=DYNAMIC
 ; RUN: llc < %s -relocation-model=dynamic-no-pic -mtriple=powerpc64-apple-darwin8 | FileCheck %s -check-prefix=DYNAMIC64
 ; PR4482
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -18,6 +19,10 @@
 ; PIC: bl L_exact_log2$stub
 ; PIC: blr
 
+; PICELF: foo:
+; PICELF: bl exact_log2@PLT
+; PICELF: blr
+
 ; PIC64: _foo:
 ; PIC64: bl L_exact_log2$stub
 ; PIC64: blr

diff --git a/test/CodeGen/PowerPC/blockaddress.ll b/test/CodeGen/PowerPC/blockaddress.ll
new file mode 100644
index 0000000..c1981e2
--- /dev/null
+++ b/test/CodeGen/PowerPC/blockaddress.ll

@@ -0,0 +1,26 @@
+; RUN: llc < %s -code-model=small -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=SMALL
+; RUN: llc < %s -code-model=medium -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=MEDIUM
+; RUN: llc < %s -code-model=large -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=MEDIUM
+; RUN: llc < %s -code-model=small -march=ppc64 -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=SMALL
+; RUN: llc < %s -code-model=medium -march=ppc64 -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=MEDIUM
+; RUN: llc < %s -code-model=large -march=ppc64 -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=MEDIUM
+
+define i8* @test() {
+entry:
+  br label %here
+
+here:                                             ; preds = %entry
+; MEDIUM: .Ltmp[[TMP0:[0-9]+]]:
+; MEDIUM: addis [[R0:[0-9]+]], 2, .LC[[LC0:[0-9]+]]@toc@ha
+; MEDIUM: ld 3, .LC[[LC0]]@toc@l([[R0]])
+; MEDIUM: blr
+; MEDIUM: .LC[[LC0]]:
+; MEDIUM: .tc .Ltmp[[TMP0]][TC],.Ltmp[[TMP0]]
+; SMALL: .Ltmp[[TMP0:[0-9]+]]:
+; SMALL: ld 3, .LC[[LC0:[0-9]+]]@toc(2)
+; SMALL: blr
+; SMALL: .LC[[LC0]]:
+; SMALL: .tc .Ltmp[[TMP0]][TC],.Ltmp[[TMP0]]
+  ret i8* blockaddress(@test, %here)
+}
+

diff --git a/test/CodeGen/PowerPC/buildvec_canonicalize.ll b/test/CodeGen/PowerPC/buildvec_canonicalize.ll
index e155a35..b70671b 100644
--- a/test/CodeGen/PowerPC/buildvec_canonicalize.ll
+++ b/test/CodeGen/PowerPC/buildvec_canonicalize.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mattr=+altivec --enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=+altivec --enable-unsafe-fp-math | FileCheck %s
 
 define void @VXOR(<4 x float>* %P1, <4 x i32>* %P2, <4 x float>* %P3) {
         %tmp = load <4 x float>* %P3            ; <<4 x float>> [#uses=1]

diff --git a/test/CodeGen/PowerPC/byval-aliased.ll b/test/CodeGen/PowerPC/byval-aliased.ll
new file mode 100644
index 0000000..9ef2f02
--- /dev/null
+++ b/test/CodeGen/PowerPC/byval-aliased.ll

@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=ppc64 < %s | FileCheck %s
+target datalayout = "E-m:o-p:32:32-f64:32:64-n32"
+target triple = "powerpc-apple-macosx10.5.0"
+ 
+%struct.sm = type { i8, i8 }
+ 
+; Function Attrs: nounwind ssp
+define void @foo(%struct.sm* byval %s) #0 {
+entry:
+  %a = getelementptr inbounds %struct.sm* %s, i32 0, i32 0
+  %0 = load i8* %a, align 1
+  %conv2 = zext i8 %0 to i32
+  %add = add nuw nsw i32 %conv2, 3
+  %conv1 = trunc i32 %add to i8
+  store i8 %conv1, i8* %a, align 1
+  call void @bar(%struct.sm* byval %s, %struct.sm* byval %s) #1
+  ret void
+}
+
+; CHECK-LABEL: @foo
+; CHECK: stb {{r[0-9]+}}, [[OFF:[0-9]+]]({{r[3]?1}})
+; CHECK: lhz r3, [[OFF]]({{r[3]?1}})
+; CHECK: bl _bar
+; CHECK: blr
+ 
+declare void @bar(%struct.sm* byval, %struct.sm* byval)
+ 
+attributes #0 = { nounwind ssp }
+attributes #1 = { nounwind }
+ 

diff --git a/test/CodeGen/PowerPC/complex-return.ll b/test/CodeGen/PowerPC/complex-return.ll
index 5ac7524..9d25e61 100644
--- a/test/CodeGen/PowerPC/complex-return.ll
+++ b/test/CodeGen/PowerPC/complex-return.ll

@@ -24,10 +24,10 @@
 }
 
 ; CHECK-LABEL: foo:
+; CHECK: lfd 1
+; CHECK: lfd 2
 ; CHECK: lfd 3
 ; CHECK: lfd 4
-; CHECK: lfd 2
-; CHECK: lfd 1
 
 define { float, float } @oof() nounwind {
 entry:

diff --git a/test/CodeGen/PowerPC/copysignl.ll b/test/CodeGen/PowerPC/copysignl.ll
index 4b801b7..e280f83 100644
--- a/test/CodeGen/PowerPC/copysignl.ll
+++ b/test/CodeGen/PowerPC/copysignl.ll

@@ -1,4 +1,5 @@
-; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -mattr=+vsx < %s | FileCheck %s -check-prefix=CHECK-VSX
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -11,6 +12,9 @@
 ; CHECK-LABEL: @foo_d_ll
 ; CHECK: fcpsgn 1, 3, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_d_ll
+; CHECK-VSX: xscpsgndp 1, 3, 1
+; CHECK-VSX: blr
 }
 
 declare ppc_fp128 @copysignl(ppc_fp128, ppc_fp128) #0
@@ -24,6 +28,9 @@
 ; CHECK-LABEL: @foo_dl
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_dl
+; CHECK-VSX: xscpsgndp 1, 2, 1
+; CHECK-VSX: blr
 }
 
 declare double @copysign(double, double) #0
@@ -37,6 +44,9 @@
 ; CHECK-LABEL: @foo_ll
 ; CHECK: bl copysignl
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_ll
+; CHECK-VSX: bl copysignl
+; CHECK-VSX: blr
 }
 
 define ppc_fp128 @foo_ld(double %a, double %b) #0 {
@@ -49,6 +59,9 @@
 ; CHECK-LABEL: @foo_ld
 ; CHECK: bl copysignl
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_ld
+; CHECK-VSX: bl copysignl
+; CHECK-VSX: blr
 }
 
 define ppc_fp128 @foo_lf(double %a, float %b) #0 {
@@ -61,6 +74,9 @@
 ; CHECK-LABEL: @foo_lf
 ; CHECK: bl copysignl
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_lf
+; CHECK-VSX: bl copysignl
+; CHECK-VSX: blr
 }
 
 attributes #0 = { nounwind readnone }

diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll
index 6beea55..04338a6 100644
--- a/test/CodeGen/PowerPC/dbg.ll
+++ b/test/CodeGen/PowerPC/dbg.ll

@@ -6,34 +6,34 @@
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !15), !dbg !17
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !16), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !17
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
   %add = add nsw i32 %argc, 1, !dbg !19
   ret i32 %add, !dbg !19
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{i32 720913, metadata !21, i32 12, metadata !"clang version 3.1", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, metadata !"", metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1\001\00\000\00\000", metadata !21, metadata !1, metadata !1, metadata !3, metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !21, null, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !13, i32 0} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 720937, metadata !21} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\00256\001\000", metadata !21, null, metadata !7, null, i32 (i32, i8**)* @main, null, null, metadata !13} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !21} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9, metadata !10}
-!9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0x24\00char\000\008\008\000\000\008", null, null} ; [ DW_TAG_base_type ]
 !13 = metadata !{metadata !15, metadata !16}
-!15 = metadata !{i32 721153, metadata !5, metadata !"argc", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 721153, metadata !5, metadata !"argv", metadata !6, i32 33554433, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x101\00argc\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x101\00argv\0033554433\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
 !17 = metadata !{i32 1, i32 14, metadata !5, null}
 !18 = metadata !{i32 1, i32 26, metadata !5, null}
 !19 = metadata !{i32 2, i32 3, metadata !20, null}
-!20 = metadata !{i32 720907, metadata !21, metadata !5, i32 1, i32 34, i32 0} ; [ DW_TAG_lexical_block ]
+!20 = metadata !{metadata !"0xb\001\0034\000", metadata !21, metadata !5} ; [ DW_TAG_lexical_block ]
 !21 = metadata !{metadata !"dbg.c", metadata !"/src"}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/PowerPC/empty-functions.ll b/test/CodeGen/PowerPC/empty-functions.ll
index 3a2907d..e32a847 100644
--- a/test/CodeGen/PowerPC/empty-functions.ll
+++ b/test/CodeGen/PowerPC/empty-functions.ll

@@ -1,12 +1,43 @@
-; RUN: llc < %s -mtriple=powerpc-apple-darwin | FileCheck -check-prefix=CHECK-NO-FP %s
-; RUN: llc < %s -mtriple=powerpc-apple-darwin -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=powerpc-apple-darwin | FileCheck -check-prefix=CHECK-MACHO %s
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -disable-fp-elim | FileCheck -check-prefix=CHECK-MACHO %s
+; RUN: llc < %s -mtriple=powerpc-linux-gnu | FileCheck -check-prefix=LINUX-NO-FP %s
+; RUN: llc < %s -mtriple=powerpc-linux-gnu -disable-fp-elim | FileCheck -check-prefix=LINUX-FP %s
 
 define void @func() {
 entry:
   unreachable
 }
-; CHECK-NO-FP:     _func:
-; CHECK-NO-FP:     nop
 
-; CHECK-FP:      _func:
-; CHECK-FP:      nop
+; MachO cannot handle an empty function.
+; CHECK-MACHO:     _func:
+; CHECK-MACHO-NEXT: .cfi_startproc
+; CHECK-MACHO-NEXT: {{^}};
+; CHECK-MACHO-NEXT:     nop
+; CHECK-MACHO-NEXT: .cfi_endproc
+
+; An empty function is perfectly fine on ELF.
+; LINUX-NO-FP: func:
+; LINUX-NO-FP-NEXT: .cfi_startproc
+; LINUX-NO-FP-NEXT: {{^}}#
+; LINUX-NO-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-NO-FP-NEXT: .size   func, .L{{.*}}-func
+; LINUX-NO-FP-NEXT: .cfi_endproc
+
+; A cfi directive can point to the end of a function. It (and in fact the
+; entire body) could be optimized out because of the unreachable, but we
+; don't do it right now.
+; LINUX-FP: func:
+; LINUX-FP-NEXT: .cfi_startproc
+; LINUX-FP-NEXT: {{^}}#
+; LINUX-FP-NEXT: stw 31, -4(1)
+; LINUX-FP-NEXT: stwu 1, -16(1)
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT:  .cfi_def_cfa_offset 16
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_offset r31, -4
+; LINUX-FP-NEXT: mr 31, 1
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_def_cfa_register r31
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .size   func, .Ltmp3-func
+; LINUX-FP-NEXT: .cfi_endproc

diff --git a/test/CodeGen/PowerPC/fabs.ll b/test/CodeGen/PowerPC/fabs.ll
index ddcce74..36aa23d 100644
--- a/test/CodeGen/PowerPC/fabs.ll
+++ b/test/CodeGen/PowerPC/fabs.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin | grep "fabs f1, f1"
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin | grep "fabs f1, f1"
 
 define double @fabs(double %f) {
 entry:

diff --git a/test/CodeGen/PowerPC/fast-isel-call.ll b/test/CodeGen/PowerPC/fast-isel-call.ll
index 33a8ba9..b2cc75e 100644
--- a/test/CodeGen/PowerPC/fast-isel-call.ll
+++ b/test/CodeGen/PowerPC/fast-isel-call.ll

@@ -1,4 +1,8 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers, and with -fast-isel-abort turned on the test case will then fail.
+; When FastISel better supports VSX, fix up this test case.
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -mattr=-vsx -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
 
 define i32 @t1(i8 signext %a) nounwind {
   %1 = sext i8 %a to i32
@@ -57,11 +61,11 @@
 ; ELF64: t10
   %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70)
 ; ELF64: li 3, 0
-; ELF64: li 4, 248
-; ELF64: li 5, 187
+; ELF64: li 4, -8
+; ELF64: li 5, -69
 ; ELF64: li 6, 28
 ; ELF64: li 7, 40
-; ELF64: li 8, 186
+; ELF64: li 8, -70
 ; ELF64: rldicl 3, 3, 0, 56
 ; ELF64: rldicl 4, 4, 0, 56
 ; ELF64: rldicl 5, 5, 0, 56

diff --git a/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll b/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
index 33f7a79..c1f6b63 100644
--- a/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
+++ b/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll

@@ -1,5 +1,8 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
-
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers, and with -fast-isel-abort turned on the test case will then fail.
+; When FastISel better supports VSX, fix up this test case.
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s --check-prefix=ELF64
 define void @t1a(float %a) uwtable ssp {
 entry:
 ; ELF64: t1a

diff --git a/test/CodeGen/PowerPC/fast-isel-conversion.ll b/test/CodeGen/PowerPC/fast-isel-conversion.ll
index 5e00675..b0e29c1 100644
--- a/test/CodeGen/PowerPC/fast-isel-conversion.ll
+++ b/test/CodeGen/PowerPC/fast-isel-conversion.ll

@@ -1,5 +1,10 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
-; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=970 | FileCheck %s --check-prefix=PPC970
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers, and with -fast-isel-abort turned on the test case will then fail.
+; When FastISel better supports VSX, fix up this test case.
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s --check-prefix=ELF64
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=-vsx | FileCheck %s --check-prefix=ELF64LE
+; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=970 -mattr=-vsx | FileCheck %s --check-prefix=PPC970
 
 ;; Tests for 970 don't use -fast-isel-abort because we intentionally punt
 ;; to SelectionDAG in some cases.
@@ -9,12 +14,16 @@
 define void @sitofp_single_i64(i64 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_single_i64
+; ELF64LE: sitofp_single_i64
 ; PPC970: sitofp_single_i64
   %b.addr = alloca float, align 4
   %conv = sitofp i64 %a to float
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfids
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfids
 ; PPC970: std
 ; PPC970: lfd
 ; PPC970: fcfid
@@ -26,12 +35,20 @@
 define void @sitofp_single_i32(i32 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_single_i32
+; ELF64LE: sitofp_single_i32
 ; PPC970: sitofp_single_i32
   %b.addr = alloca float, align 4
   %conv = sitofp i32 %a to float
 ; ELF64: std
+; stack offset used to load the float: 65524 = -16 + 4 (mod 65536, as a 16-bit immediate)
+; ELF64: ori {{[0-9]+}}, {{[0-9]+}}, 65524
 ; ELF64: lfiwax
 ; ELF64: fcfids
+; ELF64LE: std
+; stack offset used to load the float: 65520 = -16 + 0 (mod 65536, as a 16-bit immediate)
+; ELF64LE: ori {{[0-9]+}}, {{[0-9]+}}, 65520
+; ELF64LE: lfiwax
+; ELF64LE: fcfids
 ; PPC970: std
 ; PPC970: lfd
 ; PPC970: fcfid
@@ -43,6 +60,7 @@
 define void @sitofp_single_i16(i16 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_single_i16
+; ELF64LE: sitofp_single_i16
 ; PPC970: sitofp_single_i16
   %b.addr = alloca float, align 4
   %conv = sitofp i16 %a to float
@@ -50,6 +68,10 @@
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfids
+; ELF64LE: extsh
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfids
 ; PPC970: extsh
 ; PPC970: std
 ; PPC970: lfd
@@ -62,6 +84,7 @@
 define void @sitofp_single_i8(i8 %a) nounwind ssp {
 entry:
 ; ELF64: sitofp_single_i8
+; ELF64LE: sitofp_single_i8
 ; PPC970: sitofp_single_i8
   %b.addr = alloca float, align 4
   %conv = sitofp i8 %a to float
@@ -69,6 +92,10 @@
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfids
+; ELF64LE: extsb
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfids
 ; PPC970: extsb
 ; PPC970: std
 ; PPC970: lfd
@@ -81,12 +108,20 @@
 define void @sitofp_double_i32(i32 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_double_i32
+; ELF64LE: sitofp_double_i32
 ; PPC970: sitofp_double_i32
   %b.addr = alloca double, align 8
   %conv = sitofp i32 %a to double
 ; ELF64: std
+; stack offset used to load the float: 65524 = -16 + 4 (mod 65536, as a 16-bit immediate)
+; ELF64: ori {{[0-9]+}}, {{[0-9]+}}, 65524
 ; ELF64: lfiwax
 ; ELF64: fcfid
+; ELF64LE: std
+; stack offset used to load the float: 65520 = -16 + 0 (mod 65536, as a 16-bit immediate)
+; ELF64LE: ori {{[0-9]+}}, {{[0-9]+}}, 65520
+; ELF64LE: lfiwax
+; ELF64LE: fcfid
 ; PPC970: std
 ; PPC970: lfd
 ; PPC970: fcfid
@@ -97,12 +132,16 @@
 define void @sitofp_double_i64(i64 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_double_i64
+; ELF64LE: sitofp_double_i64
 ; PPC970: sitofp_double_i64
   %b.addr = alloca double, align 8
   %conv = sitofp i64 %a to double
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfid
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfid
 ; PPC970: std
 ; PPC970: lfd
 ; PPC970: fcfid
@@ -113,6 +152,7 @@
 define void @sitofp_double_i16(i16 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_double_i16
+; ELF64LE: sitofp_double_i16
 ; PPC970: sitofp_double_i16
   %b.addr = alloca double, align 8
   %conv = sitofp i16 %a to double
@@ -120,6 +160,10 @@
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfid
+; ELF64LE: extsh
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfid
 ; PPC970: extsh
 ; PPC970: std
 ; PPC970: lfd
@@ -131,6 +175,7 @@
 define void @sitofp_double_i8(i8 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: sitofp_double_i8
+; ELF64LE: sitofp_double_i8
 ; PPC970: sitofp_double_i8
   %b.addr = alloca double, align 8
   %conv = sitofp i8 %a to double
@@ -138,6 +183,10 @@
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfid
+; ELF64LE: extsb
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfid
 ; PPC970: extsb
 ; PPC970: std
 ; PPC970: lfd
@@ -151,12 +200,16 @@
 define void @uitofp_single_i64(i64 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_single_i64
+; ELF64LE: uitofp_single_i64
 ; PPC970: uitofp_single_i64
   %b.addr = alloca float, align 4
   %conv = uitofp i64 %a to float
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidus
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidus
 ; PPC970-NOT: fcfidus
   store float %conv, float* %b.addr, align 4
   ret void
@@ -165,12 +218,20 @@
 define void @uitofp_single_i32(i32 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_single_i32
+; ELF64LE: uitofp_single_i32
 ; PPC970: uitofp_single_i32
   %b.addr = alloca float, align 4
   %conv = uitofp i32 %a to float
 ; ELF64: std
+; stack offset used to load the float: 65524 = -16 + 4 (mod 65536, as a 16-bit immediate)
+; ELF64: ori {{[0-9]+}}, {{[0-9]+}}, 65524
 ; ELF64: lfiwzx
 ; ELF64: fcfidus
+; ELF64LE: std
+; stack offset used to load the float: 65520 = -16 + 0 (mod 65536, as a 16-bit immediate)
+; ELF64LE: ori {{[0-9]+}}, {{[0-9]+}}, 65520
+; ELF64LE: lfiwzx
+; ELF64LE: fcfidus
 ; PPC970-NOT: lfiwzx
 ; PPC970-NOT: fcfidus
   store float %conv, float* %b.addr, align 4
@@ -180,6 +241,7 @@
 define void @uitofp_single_i16(i16 %a, float %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_single_i16
+; ELF64LE: uitofp_single_i16
 ; PPC970: uitofp_single_i16
   %b.addr = alloca float, align 4
   %conv = uitofp i16 %a to float
@@ -187,6 +249,10 @@
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidus
+; ELF64LE: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidus
 ; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31
 ; PPC970: std
 ; PPC970: lfd
@@ -199,6 +265,7 @@
 define void @uitofp_single_i8(i8 %a) nounwind ssp {
 entry:
 ; ELF64: uitofp_single_i8
+; ELF64LE: uitofp_single_i8
 ; PPC970: uitofp_single_i8
   %b.addr = alloca float, align 4
   %conv = uitofp i8 %a to float
@@ -206,6 +273,10 @@
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidus
+; ELF64LE: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidus
 ; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31
 ; PPC970: std
 ; PPC970: lfd
@@ -218,12 +289,16 @@
 define void @uitofp_double_i64(i64 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_double_i64
+; ELF64LE: uitofp_double_i64
 ; PPC970: uitofp_double_i64
   %b.addr = alloca double, align 8
   %conv = uitofp i64 %a to double
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidu
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidu
 ; PPC970-NOT: fcfidu
   store double %conv, double* %b.addr, align 8
   ret void
@@ -232,12 +307,20 @@
 define void @uitofp_double_i32(i32 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_double_i32
+; ELF64LE: uitofp_double_i32
 ; PPC970: uitofp_double_i32
   %b.addr = alloca double, align 8
   %conv = uitofp i32 %a to double
 ; ELF64: std
+; stack offset used to load the float: 65524 = (-16 + 4) as an unsigned 16-bit immediate
+; ELF64: ori {{[0-9]+}}, {{[0-9]+}}, 65524
 ; ELF64: lfiwzx
 ; ELF64: fcfidu
+; ELF64LE: std
+; stack offset used to load the float: 65520 = (-16 + 0) as an unsigned 16-bit immediate
+; ELF64LE: ori {{[0-9]+}}, {{[0-9]+}}, 65520
+; ELF64LE: lfiwzx
+; ELF64LE: fcfidu
 ; PPC970-NOT: lfiwzx
 ; PPC970-NOT: fcfidu
   store double %conv, double* %b.addr, align 8
@@ -247,6 +330,7 @@
 define void @uitofp_double_i16(i16 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_double_i16
+; ELF64LE: uitofp_double_i16
 ; PPC970: uitofp_double_i16
   %b.addr = alloca double, align 8
   %conv = uitofp i16 %a to double
@@ -254,6 +338,10 @@
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidu
+; ELF64LE: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidu
 ; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31
 ; PPC970: std
 ; PPC970: lfd
@@ -265,6 +353,7 @@
 define void @uitofp_double_i8(i8 %a, double %b) nounwind ssp {
 entry:
 ; ELF64: uitofp_double_i8
+; ELF64LE: uitofp_double_i8
 ; PPC970: uitofp_double_i8
   %b.addr = alloca double, align 8
   %conv = uitofp i8 %a to double
@@ -272,6 +361,10 @@
 ; ELF64: std
 ; ELF64: lfd
 ; ELF64: fcfidu
+; ELF64LE: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64LE: std
+; ELF64LE: lfd
+; ELF64LE: fcfidu
 ; PPC970: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31
 ; PPC970: std
 ; PPC970: lfd
@@ -285,12 +378,16 @@
 define void @fptosi_float_i32(float %a) nounwind ssp {
 entry:
 ; ELF64: fptosi_float_i32
+; ELF64LE: fptosi_float_i32
 ; PPC970: fptosi_float_i32
   %b.addr = alloca i32, align 4
   %conv = fptosi float %a to i32
 ; ELF64: fctiwz
 ; ELF64: stfd
 ; ELF64: lwa
+; ELF64LE: fctiwz
+; ELF64LE: stfd
+; ELF64LE: lwa
 ; PPC970: fctiwz
 ; PPC970: stfd
 ; PPC970: lwa
@@ -301,12 +398,16 @@
 define void @fptosi_float_i64(float %a) nounwind ssp {
 entry:
 ; ELF64: fptosi_float_i64
+; ELF64LE: fptosi_float_i64
 ; PPC970: fptosi_float_i64
   %b.addr = alloca i64, align 4
   %conv = fptosi float %a to i64
 ; ELF64: fctidz
 ; ELF64: stfd
 ; ELF64: ld
+; ELF64LE: fctidz
+; ELF64LE: stfd
+; ELF64LE: ld
 ; PPC970: fctidz
 ; PPC970: stfd
 ; PPC970: ld
@@ -317,12 +418,16 @@
 define void @fptosi_double_i32(double %a) nounwind ssp {
 entry:
 ; ELF64: fptosi_double_i32
+; ELF64LE: fptosi_double_i32
 ; PPC970: fptosi_double_i32
   %b.addr = alloca i32, align 8
   %conv = fptosi double %a to i32
 ; ELF64: fctiwz
 ; ELF64: stfd
 ; ELF64: lwa
+; ELF64LE: fctiwz
+; ELF64LE: stfd
+; ELF64LE: lwa
 ; PPC970: fctiwz
 ; PPC970: stfd
 ; PPC970: lwa
@@ -333,12 +438,16 @@
 define void @fptosi_double_i64(double %a) nounwind ssp {
 entry:
 ; ELF64: fptosi_double_i64
+; ELF64LE: fptosi_double_i64
 ; PPC970: fptosi_double_i64
   %b.addr = alloca i64, align 8
   %conv = fptosi double %a to i64
 ; ELF64: fctidz
 ; ELF64: stfd
 ; ELF64: ld
+; ELF64LE: fctidz
+; ELF64LE: stfd
+; ELF64LE: ld
 ; PPC970: fctidz
 ; PPC970: stfd
 ; PPC970: ld
@@ -351,12 +460,16 @@
 define void @fptoui_float_i32(float %a) nounwind ssp {
 entry:
 ; ELF64: fptoui_float_i32
+; ELF64LE: fptoui_float_i32
 ; PPC970: fptoui_float_i32
   %b.addr = alloca i32, align 4
   %conv = fptoui float %a to i32
 ; ELF64: fctiwuz
 ; ELF64: stfd
 ; ELF64: lwz
+; ELF64LE: fctiwuz
+; ELF64LE: stfd
+; ELF64LE: lwz
 ; PPC970: fctidz
 ; PPC970: stfd
 ; PPC970: lwz
@@ -367,12 +480,16 @@
 define void @fptoui_float_i64(float %a) nounwind ssp {
 entry:
 ; ELF64: fptoui_float_i64
+; ELF64LE: fptoui_float_i64
 ; PPC970: fptoui_float_i64
   %b.addr = alloca i64, align 4
   %conv = fptoui float %a to i64
 ; ELF64: fctiduz
 ; ELF64: stfd
 ; ELF64: ld
+; ELF64LE: fctiduz
+; ELF64LE: stfd
+; ELF64LE: ld
 ; PPC970-NOT: fctiduz
   store i64 %conv, i64* %b.addr, align 4
   ret void
@@ -381,12 +498,16 @@
 define void @fptoui_double_i32(double %a) nounwind ssp {
 entry:
 ; ELF64: fptoui_double_i32
+; ELF64LE: fptoui_double_i32
 ; PPC970: fptoui_double_i32
   %b.addr = alloca i32, align 8
   %conv = fptoui double %a to i32
 ; ELF64: fctiwuz
 ; ELF64: stfd
 ; ELF64: lwz
+; ELF64LE: fctiwuz
+; ELF64LE: stfd
+; ELF64LE: lwz
 ; PPC970: fctidz
 ; PPC970: stfd
 ; PPC970: lwz
@@ -397,12 +518,16 @@
 define void @fptoui_double_i64(double %a) nounwind ssp {
 entry:
 ; ELF64: fptoui_double_i64
+; ELF64LE: fptoui_double_i64
 ; PPC970: fptoui_double_i64
   %b.addr = alloca i64, align 8
   %conv = fptoui double %a to i64
 ; ELF64: fctiduz
 ; ELF64: stfd
 ; ELF64: ld
+; ELF64LE: fctiduz
+; ELF64LE: stfd
+; ELF64LE: ld
 ; PPC970-NOT: fctiduz
   store i64 %conv, i64* %b.addr, align 8
   ret void

diff --git a/test/CodeGen/PowerPC/fast-isel-load-store.ll b/test/CodeGen/PowerPC/fast-isel-load-store.ll
index 026b15f..ef702e2 100644
--- a/test/CodeGen/PowerPC/fast-isel-load-store.ll
+++ b/test/CodeGen/PowerPC/fast-isel-load-store.ll

@@ -1,4 +1,8 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers, and with -fast-isel-abort turned on the test case then fails.
+; When FastISel better supports VSX, fix up this test case (see -mattr=-vsx).
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel -fast-isel-abort -mattr=-vsx -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
 
 ; This test verifies that load/store instructions are properly generated,
 ; and that they pass MI verification.

diff --git a/test/CodeGen/PowerPC/fast-isel-ret.ll b/test/CodeGen/PowerPC/fast-isel-ret.ll
index fa19f8b..ae34fbf 100644
--- a/test/CodeGen/PowerPC/fast-isel-ret.ll
+++ b/test/CodeGen/PowerPC/fast-isel-ret.ll

@@ -1,8 +1,44 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+; FIXME: FastISel currently returns false if it hits code that uses VSX
+; registers, and with -fast-isel-abort turned on the test case then fails.
+; When FastISel better supports VSX, fix up this test case (see -mattr=-vsx).
+;
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s --check-prefix=ELF64
+
+define zeroext i1 @rettrue() nounwind uwtable ssp {
+entry:
+; ELF64-LABEL: rettrue
+; ELF64: li 3, 1
+; ELF64: blr
+  ret i1 true
+}
+
+define zeroext i1 @retfalse() nounwind uwtable ssp {
+entry:
+; ELF64-LABEL: retfalse
+; ELF64: li 3, 0
+; ELF64: blr
+  ret i1 false
+}
+
+define signext i1 @retstrue() nounwind uwtable ssp {
+entry:
+; ELF64-LABEL: retstrue
+; ELF64: li 3, -1
+; ELF64: blr
+  ret i1 true
+}
+
+define signext i1 @retsfalse() nounwind uwtable ssp {
+entry:
+; ELF64-LABEL: retsfalse
+; ELF64: li 3, 0
+; ELF64: blr
+  ret i1 false
+}
 
 define signext i8 @ret2(i8 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret2
+; ELF64-LABEL: ret2
 ; ELF64: extsb
 ; ELF64: blr
   ret i8 %a
@@ -10,7 +46,7 @@
 
 define zeroext i8 @ret3(i8 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret3
+; ELF64-LABEL: ret3
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
 ; ELF64: blr
   ret i8 %a
@@ -18,7 +54,7 @@
 
 define signext i16 @ret4(i16 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret4
+; ELF64-LABEL: ret4
 ; ELF64: extsh
 ; ELF64: blr
   ret i16 %a
@@ -26,7 +62,7 @@
 
 define zeroext i16 @ret5(i16 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret5
+; ELF64-LABEL: ret5
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
 ; ELF64: blr
   ret i16 %a
@@ -34,7 +70,7 @@
 
 define i16 @ret6(i16 %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret6
+; ELF64-LABEL: ret6
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
 ; ELF64: blr
   ret i16 %a
@@ -42,7 +78,7 @@
 
 define signext i32 @ret7(i32 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret7
+; ELF64-LABEL: ret7
 ; ELF64: extsw
 ; ELF64: blr
   ret i32 %a
@@ -50,7 +86,7 @@
 
 define zeroext i32 @ret8(i32 signext %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret8
+; ELF64-LABEL: ret8
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
 ; ELF64: blr
   ret i32 %a
@@ -58,7 +94,7 @@
 
 define i32 @ret9(i32 %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret9
+; ELF64-LABEL: ret9
 ; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
 ; ELF64: blr
   ret i32 %a
@@ -66,7 +102,7 @@
 
 define i64 @ret10(i64 %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret10
+; ELF64-LABEL: ret10
 ; ELF64-NOT: exts
 ; ELF64-NOT: rldicl
 ; ELF64: blr
@@ -75,21 +111,21 @@
 
 define float @ret11(float %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret11
+; ELF64-LABEL: ret11
 ; ELF64: blr
   ret float %a
 }
 
 define double @ret12(double %a) nounwind uwtable ssp {
 entry:
-; ELF64: ret12
+; ELF64-LABEL: ret12
 ; ELF64: blr
   ret double %a
 }
 
 define i8 @ret13() nounwind uwtable ssp {
 entry:
-; ELF64: ret13
+; ELF64-LABEL: ret13
 ; ELF64: li
 ; ELF64: blr
   ret i8 15;
@@ -97,7 +133,7 @@
 
 define i16 @ret14() nounwind uwtable ssp {
 entry:
-; ELF64: ret14
+; ELF64-LABEL: ret14
 ; ELF64: li
 ; ELF64: blr
   ret i16 -225;
@@ -105,7 +141,7 @@
 
 define i32 @ret15() nounwind uwtable ssp {
 entry:
-; ELF64: ret15
+; ELF64-LABEL: ret15
 ; ELF64: lis
 ; ELF64: ori
 ; ELF64: blr
@@ -114,7 +150,7 @@
 
 define i64 @ret16() nounwind uwtable ssp {
 entry:
-; ELF64: ret16
+; ELF64-LABEL: ret16
 ; ELF64: li
 ; ELF64: sldi
 ; ELF64: oris
@@ -125,7 +161,7 @@
 
 define float @ret17() nounwind uwtable ssp {
 entry:
-; ELF64: ret17
+; ELF64-LABEL: ret17
 ; ELF64: addis
 ; ELF64: lfs
 ; ELF64: blr
@@ -134,7 +170,7 @@
 
 define double @ret18() nounwind uwtable ssp {
 entry:
-; ELF64: ret18
+; ELF64-LABEL: ret18
 ; ELF64: addis
 ; ELF64: lfd
 ; ELF64: blr

diff --git a/test/CodeGen/PowerPC/fcpsgn.ll b/test/CodeGen/PowerPC/fcpsgn.ll
index f469981..4d4afc6 100644
--- a/test/CodeGen/PowerPC/fcpsgn.ll
+++ b/test/CodeGen/PowerPC/fcpsgn.ll

@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -10,6 +11,9 @@
 ; CHECK-LABEL: @foo_dd
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_dd
+; CHECK-VSX: xscpsgndp 1, 2, 1
+; CHECK-VSX: blr
 }
 
 declare double @copysign(double, double) #0
@@ -22,6 +26,9 @@
 ; CHECK-LABEL: @foo_ss
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_ss
+; CHECK-VSX: fcpsgn 1, 2, 1
+; CHECK-VSX: blr
 }
 
 declare float @copysignf(float, float) #0
@@ -35,6 +42,9 @@
 ; CHECK-LABEL: @foo_sd
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_sd
+; CHECK-VSX: fcpsgn 1, 2, 1
+; CHECK-VSX: blr
 }
 
 define double @foo_ds(double %a, float %b) #0 {
@@ -46,6 +56,9 @@
 ; CHECK-LABEL: @foo_ds
 ; CHECK: fcpsgn 1, 2, 1
 ; CHECK: blr
+; CHECK-VSX-LABEL: @foo_ds
+; CHECK-VSX: fcpsgn 1, 2, 1
+; CHECK-VSX: blr
 }
 
 attributes #0 = { nounwind readnone }

diff --git a/test/CodeGen/PowerPC/fma-mutate.ll b/test/CodeGen/PowerPC/fma-mutate.ll
new file mode 100644
index 0000000..1a391f4
--- /dev/null
+++ b/test/CodeGen/PowerPC/fma-mutate.ll

@@ -0,0 +1,21 @@
+; Test several VSX FMA mutation opportunities.  The first one isn't a
+; reasonable transformation because the killed product register is the
+; same as the FMA target register.  The second one is legal.  The third
+; one doesn't fit the feeding-copy pattern.
+
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=+vsx | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare double @llvm.sqrt.f64(double)
+
+define double @foo3(double %a) nounwind {
+  %r = call double @llvm.sqrt.f64(double %a)
+  ret double %r
+
+; CHECK: @foo3
+; CHECK: xsnmsubadp [[REG:[0-9]+]], {{[0-9]+}}, [[REG]]
+; CHECK: xsmaddmdp
+; CHECK: xsmaddadp
+}
+

diff --git a/test/CodeGen/PowerPC/fma.ll b/test/CodeGen/PowerPC/fma.ll
index db19761..ab5251b 100644
--- a/test/CodeGen/PowerPC/fma.ll
+++ b/test/CodeGen/PowerPC/fma.ll

@@ -1,12 +1,21 @@
-; RUN: llc < %s -march=ppc32 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s
+
+declare double @dummy1(double) #0
+declare double @dummy2(double, double) #0
+declare double @dummy3(double, double, double) #0
 
 define double @test_FMADD1(double %A, double %B, double %C) {
 	%D = fmul double %A, %B		; <double> [#uses=1]
-	%E = fadd double %D, %C		; <double> [#uses=1]
+	%E = fadd double %C, %D		; <double> [#uses=1]
 	ret double %E
 ; CHECK-LABEL: test_FMADD1:
 ; CHECK: fmadd
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMADD1:
+; CHECK-VSX: xsmaddmdp
+; CHECK-VSX-NEXT: blr
 }
 
 define double @test_FMADD2(double %A, double %B, double %C) {
@@ -16,15 +25,38 @@
 ; CHECK-LABEL: test_FMADD2:
 ; CHECK: fmadd
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMADD2:
+; CHECK-VSX: xsmaddmdp
+; CHECK-VSX-NEXT: blr
 }
 
-define double @test_FMSUB(double %A, double %B, double %C) {
+define double @test_FMSUB1(double %A, double %B, double %C) {
 	%D = fmul double %A, %B		; <double> [#uses=1]
 	%E = fsub double %D, %C		; <double> [#uses=1]
 	ret double %E
-; CHECK-LABEL: test_FMSUB:
+; CHECK-LABEL: test_FMSUB1:
 ; CHECK: fmsub
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FMSUB1:
+; CHECK-VSX: xsmsubmdp
+; CHECK-VSX-NEXT: blr
+}
+
+define double @test_FMSUB2(double %A, double %B, double %C, double %D) {
+	%E = fmul double %A, %B 	; <double> [#uses=2]
+	%F = fadd double %E, %C 	; <double> [#uses=1]
+	%G = fsub double %E, %D 	; <double> [#uses=1]
+	%H = call double @dummy2(double %F, double %G)      ; <double> [#uses=1]
+	ret double %H
+; CHECK-LABEL: test_FMSUB2:
+; CHECK: fmadd
+; CHECK-NEXT: fmsub
+
+; CHECK-VSX-LABEL: test_FMSUB2:
+; CHECK-VSX: xsmaddadp
+; CHECK-VSX-NEXT: xsmsubmdp
 }
 
 define double @test_FNMADD1(double %A, double %B, double %C) {
@@ -35,6 +67,10 @@
 ; CHECK-LABEL: test_FNMADD1:
 ; CHECK: fnmadd
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMADD1:
+; CHECK-VSX: xsnmaddmdp
+; CHECK-VSX-NEXT: blr
 }
 
 define double @test_FNMADD2(double %A, double %B, double %C) {
@@ -45,6 +81,10 @@
 ; CHECK-LABEL: test_FNMADD2:
 ; CHECK: fnmadd
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMADD2:
+; CHECK-VSX: xsnmaddmdp
+; CHECK-VSX-NEXT: blr
 }
 
 define double @test_FNMSUB1(double %A, double %B, double %C) {
@@ -54,6 +94,9 @@
 ; CHECK-LABEL: test_FNMSUB1:
 ; CHECK: fnmsub
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMSUB1:
+; CHECK-VSX: xsnmsubmdp
 }
 
 define double @test_FNMSUB2(double %A, double %B, double %C) {
@@ -64,6 +107,10 @@
 ; CHECK-LABEL: test_FNMSUB2:
 ; CHECK: fnmsub
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMSUB2:
+; CHECK-VSX: xsnmsubmdp
+; CHECK-VSX-NEXT: blr
 }
 
 define float @test_FNMSUBS(float %A, float %B, float %C) {
@@ -74,4 +121,8 @@
 ; CHECK-LABEL: test_FNMSUBS:
 ; CHECK: fnmsubs
 ; CHECK-NEXT: blr
+
+; CHECK-VSX-LABEL: test_FNMSUBS:
+; CHECK-VSX: fnmsubs
+; CHECK-VSX-NEXT: blr
 }

diff --git a/test/CodeGen/PowerPC/fmaxnum.ll b/test/CodeGen/PowerPC/fmaxnum.ll
new file mode 100644
index 0000000..1825850
--- /dev/null
+++ b/test/CodeGen/PowerPC/fmaxnum.ll

@@ -0,0 +1,86 @@
+; RUN: llc -march=ppc32 -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s
+
+declare float @fmaxf(float, float)
+declare double @fmax(double, double)
+declare ppc_fp128 @fmaxl(ppc_fp128, ppc_fp128)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
+declare ppc_fp128 @llvm.maxnum.ppcf128(ppc_fp128, ppc_fp128)
+
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
+
+; CHECK-LABEL: @test_fmaxf
+; CHECK: bl fmaxf
+define float @test_fmaxf(float %x, float %y) {
+  %z = call float @fmaxf(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_fmax
+; CHECK: bl fmax
+define double @test_fmax(double %x, double %y) {
+  %z = call double @fmax(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_fmaxl
+; CHECK: bl fmaxl
+define ppc_fp128 @test_fmaxl(ppc_fp128 %x, ppc_fp128 %y) {
+  %z = call ppc_fp128 @fmaxl(ppc_fp128 %x, ppc_fp128 %y) readnone
+  ret ppc_fp128 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf
+; CHECK: bl fmaxf
+define float @test_intrinsic_fmaxf(float %x, float %y) {
+  %z = call float @llvm.maxnum.f32(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmax
+; CHECK: bl fmax
+define double @test_intrinsic_fmax(double %x, double %y) {
+  %z = call double @llvm.maxnum.f64(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxl
+; CHECK: bl fmaxl
+define ppc_fp128 @test_intrinsic_fmaxl(ppc_fp128 %x, ppc_fp128 %y) {
+  %z = call ppc_fp128 @llvm.maxnum.ppcf128(ppc_fp128 %x, ppc_fp128 %y) readnone
+  ret ppc_fp128 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf_v2f32
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+define <2 x float> @test_intrinsic_fmaxf_v2f32(<2 x float> %x, <2 x float> %y) {
+  %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
+  ret <2 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf_v4f32
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+define <4 x float> @test_intrinsic_fmaxf_v4f32(<4 x float> %x, <4 x float> %y) {
+  %z = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
+  ret <4 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf_v8f32
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+; CHECK: bl fmaxf
+define <8 x float> @test_intrinsic_fmaxf_v8f32(<8 x float> %x, <8 x float> %y) {
+  %z = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %x, <8 x float> %y) readnone
+  ret <8 x float> %z
+}

diff --git a/test/CodeGen/PowerPC/fminnum.ll b/test/CodeGen/PowerPC/fminnum.ll
new file mode 100644
index 0000000..fe91284
--- /dev/null
+++ b/test/CodeGen/PowerPC/fminnum.ll

@@ -0,0 +1,86 @@
+; RUN: llc -march=ppc32 -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s
+
+declare float @fminf(float, float)
+declare double @fmin(double, double)
+declare ppc_fp128 @fminl(ppc_fp128, ppc_fp128)
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare ppc_fp128 @llvm.minnum.ppcf128(ppc_fp128, ppc_fp128)
+
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
+
+; CHECK-LABEL: @test_fminf
+; CHECK: bl fminf
+define float @test_fminf(float %x, float %y) {
+  %z = call float @fminf(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_fmin
+; CHECK: bl fmin
+define double @test_fmin(double %x, double %y) {
+  %z = call double @fmin(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_fminl
+; CHECK: bl fminl
+define ppc_fp128 @test_fminl(ppc_fp128 %x, ppc_fp128 %y) {
+  %z = call ppc_fp128 @fminl(ppc_fp128 %x, ppc_fp128 %y) readnone
+  ret ppc_fp128 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_f32
+; CHECK: bl fminf
+define float @test_intrinsic_fmin_f32(float %x, float %y) {
+  %z = call float @llvm.minnum.f32(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_f64
+; CHECK: bl fmin
+define double @test_intrinsic_fmin_f64(double %x, double %y) {
+  %z = call double @llvm.minnum.f64(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_f128
+; CHECK: bl fminl
+define ppc_fp128 @test_intrinsic_fmin_f128(ppc_fp128 %x, ppc_fp128 %y) {
+  %z = call ppc_fp128 @llvm.minnum.ppcf128(ppc_fp128 %x, ppc_fp128 %y) readnone
+  ret ppc_fp128 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fminf_v2f32
+; CHECK: bl fminf
+; CHECK: bl fminf
+define <2 x float> @test_intrinsic_fminf_v2f32(<2 x float> %x, <2 x float> %y) {
+  %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
+  ret <2 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v4f32
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+define <4 x float> @test_intrinsic_fmin_v4f32(<4 x float> %x, <4 x float> %y) {
+  %z = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
+  ret <4 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v8f32
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+; CHECK: bl fminf
+define <8 x float> @test_intrinsic_fmin_v8f32(<8 x float> %x, <8 x float> %y) {
+  %z = call <8 x float> @llvm.minnum.v8f32(<8 x float> %x, <8 x float> %y) readnone
+  ret <8 x float> %z
+}

diff --git a/test/CodeGen/PowerPC/fnabs.ll b/test/CodeGen/PowerPC/fnabs.ll
index 9fa2dcb..fc6a04e 100644
--- a/test/CodeGen/PowerPC/fnabs.ll
+++ b/test/CodeGen/PowerPC/fnabs.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 | grep fnabs
+; RUN: llc < %s -mattr=-vsx -march=ppc32 | grep fnabs
 
 declare double @fabs(double)
 

diff --git a/test/CodeGen/PowerPC/fp-branch.ll b/test/CodeGen/PowerPC/fp-branch.ll
index 673da02..f585756 100644
--- a/test/CodeGen/PowerPC/fp-branch.ll
+++ b/test/CodeGen/PowerPC/fp-branch.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 | grep fcmp | count 1
+; RUN: llc < %s -mattr=-vsx -march=ppc32 | grep fcmp | count 1
 
 declare i1 @llvm.isunordered.f64(double, double)
 

diff --git a/test/CodeGen/PowerPC/fp_to_uint.ll b/test/CodeGen/PowerPC/fp_to_uint.ll
index 1360b62..187d2d6 100644
--- a/test/CodeGen/PowerPC/fp_to_uint.ll
+++ b/test/CodeGen/PowerPC/fp_to_uint.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=ppc32 | grep fctiwz | count 1
+; RUN: llc < %s -mattr=-vsx -march=ppc32 | grep fctiwz | count 1
+
 
 define i16 @foo(float %a) {
 entry:

diff --git a/test/CodeGen/PowerPC/fsel.ll b/test/CodeGen/PowerPC/fsel.ll
index 8cd43e6..afceb63 100644
--- a/test/CodeGen/PowerPC/fsel.ll
+++ b/test/CodeGen/PowerPC/fsel.ll

@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-no-infs-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=CHECK-FM %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-no-infs-fp-math -enable-no-nans-fp-math -mattr=-vsx | FileCheck -check-prefix=CHECK-FM %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-no-infs-fp-math -enable-no-nans-fp-math -mattr=+vsx | FileCheck -check-prefix=CHECK-FM-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -16,6 +17,10 @@
 ; CHECK-FM: @zerocmp1
 ; CHECK-FM: fsel 1, 1, 2, 3
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @zerocmp1
+; CHECK-FM-VSX: fsel 1, 1, 2, 3
+; CHECK-FM-VSX: blr
 }
 
 define double @zerocmp2(double %a, double %y, double %z) #0 {
@@ -32,6 +37,11 @@
 ; CHECK-FM: fneg [[REG:[0-9]+]], 1
 ; CHECK-FM: fsel 1, [[REG]], 3, 2
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @zerocmp2
+; CHECK-FM-VSX: xsnegdp [[REG:[0-9]+]], 1
+; CHECK-FM-VSX: fsel 1, [[REG]], 3, 2
+; CHECK-FM-VSX: blr
 }
 
 define double @zerocmp3(double %a, double %y, double %z) #0 {
@@ -49,6 +59,12 @@
 ; CHECK-FM: fneg [[REG2:[0-9]+]], 1
 ; CHECK-FM: fsel 1, [[REG2]], [[REG]], 3
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @zerocmp3
+; CHECK-FM-VSX: xsnegdp [[REG2:[0-9]+]], 1
+; CHECK-FM-VSX: fsel [[REG:[0-9]+]], 1, 2, 3
+; CHECK-FM-VSX: fsel 1, [[REG2]], [[REG]], 3
+; CHECK-FM-VSX: blr
 }
 
 define double @min1(double %a, double %b) #0 {
@@ -65,6 +81,11 @@
 ; CHECK-FM: fsub [[REG:[0-9]+]], 2, 1
 ; CHECK-FM: fsel 1, [[REG]], 1, 2
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @min1
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 2, 1
+; CHECK-FM-VSX: fsel 1, [[REG]], 1, 2
+; CHECK-FM-VSX: blr
 }
 
 define double @max1(double %a, double %b) #0 {
@@ -81,6 +102,11 @@
 ; CHECK-FM: fsub [[REG:[0-9]+]], 1, 2
 ; CHECK-FM: fsel 1, [[REG]], 1, 2
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @max1
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 1, 2
+; CHECK-FM-VSX: fsel 1, [[REG]], 1, 2
+; CHECK-FM-VSX: blr
 }
 
 define double @cmp1(double %a, double %b, double %y, double %z) #0 {
@@ -97,6 +123,11 @@
 ; CHECK-FM: fsub [[REG:[0-9]+]], 1, 2
 ; CHECK-FM: fsel 1, [[REG]], 3, 4
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @cmp1
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 1, 2
+; CHECK-FM-VSX: fsel 1, [[REG]], 3, 4
+; CHECK-FM-VSX: blr
 }
 
 define double @cmp2(double %a, double %b, double %y, double %z) #0 {
@@ -113,6 +144,11 @@
 ; CHECK-FM: fsub [[REG:[0-9]+]], 2, 1
 ; CHECK-FM: fsel 1, [[REG]], 4, 3
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @cmp2
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 2, 1
+; CHECK-FM-VSX: fsel 1, [[REG]], 4, 3
+; CHECK-FM-VSX: blr
 }
 
 define double @cmp3(double %a, double %b, double %y, double %z) #0 {
@@ -131,6 +167,13 @@
 ; CHECK-FM: fneg [[REG3:[0-9]+]], [[REG]]
 ; CHECK-FM: fsel 1, [[REG3]], [[REG2]], 4
 ; CHECK-FM: blr
+
+; CHECK-FM-VSX: @cmp3
+; CHECK-FM-VSX: xssubdp [[REG:[0-9]+]], 1, 2
+; CHECK-FM-VSX: xsnegdp [[REG3:[0-9]+]], [[REG]]
+; CHECK-FM-VSX: fsel [[REG2:[0-9]+]], [[REG]], 3, 4
+; CHECK-FM-VSX: fsel 1, [[REG3]], [[REG2]], 4
+; CHECK-FM-VSX: blr
 }
 
 attributes #0 = { nounwind readnone }

diff --git a/test/CodeGen/PowerPC/fsqrt.ll b/test/CodeGen/PowerPC/fsqrt.ll
index bf8c4a2..019dfa4 100644
--- a/test/CodeGen/PowerPC/fsqrt.ll
+++ b/test/CodeGen/PowerPC/fsqrt.ll

@@ -1,13 +1,13 @@
 ; fsqrt should be generated when the fsqrt feature is enabled, but not 
 ; otherwise.
 
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=+fsqrt | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=+fsqrt | \
 ; RUN:   grep "fsqrt f1, f1"
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g5 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g5 | \
 ; RUN:   grep "fsqrt f1, f1"
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=-fsqrt | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin8 -mattr=-fsqrt | \
 ; RUN:   not grep "fsqrt f1, f1"
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g4 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mtriple=powerpc-apple-darwin8 -mcpu=g4 | \
 ; RUN:   not grep "fsqrt f1, f1"
 
 declare double @llvm.sqrt.f64(double)

diff --git a/test/CodeGen/PowerPC/i64_fp.ll b/test/CodeGen/PowerPC/i64_fp.ll
index d53c948..67f4e0b 100644
--- a/test/CodeGen/PowerPC/i64_fp.ll
+++ b/test/CodeGen/PowerPC/i64_fp.ll

@@ -1,21 +1,21 @@
 ; fcfid and fctid should be generated when the 64bit feature is enabled, but not
 ; otherwise.
 
-; RUN: llc < %s -march=ppc32 -mattr=+64bit | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=+64bit | \
 ; RUN:   grep fcfid
-; RUN: llc < %s -march=ppc32 -mattr=+64bit | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=+64bit | \
 ; RUN:   grep fctidz
-; RUN: llc < %s -march=ppc32 -mcpu=g5 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mcpu=g5 | \
 ; RUN:   grep fcfid
-; RUN: llc < %s -march=ppc32 -mcpu=g5 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mcpu=g5 | \
 ; RUN:   grep fctidz
-; RUN: llc < %s -march=ppc32 -mattr=-64bit | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=-64bit | \
 ; RUN:   not grep fcfid
-; RUN: llc < %s -march=ppc32 -mattr=-64bit | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mattr=-64bit | \
 ; RUN:   not grep fctidz
-; RUN: llc < %s -march=ppc32 -mcpu=g4 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mcpu=g4 | \
 ; RUN:   not grep fcfid
-; RUN: llc < %s -march=ppc32 -mcpu=g4 | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -mcpu=g4 | \
 ; RUN:   not grep fctidz
 
 define double @X(double %Y) {

diff --git a/test/CodeGen/PowerPC/mcm-10.ll b/test/CodeGen/PowerPC/mcm-10.ll
index c3ab747..9565ebc 100644
--- a/test/CodeGen/PowerPC/mcm-10.ll
+++ b/test/CodeGen/PowerPC/mcm-10.ll

@@ -22,5 +22,4 @@
 ; CHECK-NOT: extsw
 ; CHECK: stw {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
 ; CHECK: .type [[VAR]],@object
-; CHECK: .local [[VAR]]
-; CHECK: .comm [[VAR]],4,4
+; CHECK: .lcomm [[VAR]],4,4

diff --git a/test/CodeGen/PowerPC/mcm-12.ll b/test/CodeGen/PowerPC/mcm-12.ll
index b31b605..668b54f 100644
--- a/test/CodeGen/PowerPC/mcm-12.ll
+++ b/test/CodeGen/PowerPC/mcm-12.ll

@@ -1,4 +1,5 @@
-; RUN: llc -mcpu=pwr7 -O1 -code-model=medium <%s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -O1 -code-model=medium -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -O1 -code-model=medium -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 
 ; Test peephole optimization for medium code model (32-bit TOC offsets)
 ; for loading a value from the constant pool (TOC-relative).
@@ -16,3 +17,10 @@
 ; CHECK-LABEL: test_double_const:
 ; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
 ; CHECK: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
+
+; CHECK-VSX: [[VAR:[a-z0-9A-Z_.]+]]:
+; CHECK-VSX: .quad 4562098671269285104
+; CHECK-VSX-LABEL: test_double_const:
+; CHECK-VSX: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
+; CHECK-VSX: addi [[REG1]], {{[0-9]+}}, [[VAR]]@toc@l
+; CHECK-VSX: lxsdx {{[0-9]+}}, 0, [[REG1]]

diff --git a/test/CodeGen/PowerPC/mcm-2.ll b/test/CodeGen/PowerPC/mcm-2.ll
index fee98d8..811600e 100644
--- a/test/CodeGen/PowerPC/mcm-2.ll
+++ b/test/CodeGen/PowerPC/mcm-2.ll

@@ -23,8 +23,7 @@
 ; MEDIUM: lwz {{[0-9]+}}, 0([[REG2]])
 ; MEDIUM: stw {{[0-9]+}}, 0([[REG2]])
 ; MEDIUM: .type [[VAR]],@object
-; MEDIUM: .local [[VAR]]
-; MEDIUM: .comm [[VAR]],4,4
+; MEDIUM: .lcomm [[VAR]],4,4
 
 ; LARGE-LABEL: test_fn_static:
 ; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
@@ -34,6 +33,5 @@
 ; LARGE: [[VAR]]:
 ; LARGE: .tc [[VAR2:[a-z0-9A-Z_.]+]][TC],[[VAR2]]
 ; LARGE: .type [[VAR2]],@object
-; LARGE: .local [[VAR2]]
-; LARGE: .comm [[VAR2]],4,4
+; LARGE: .lcomm [[VAR2]],4,4
 

diff --git a/test/CodeGen/PowerPC/mcm-4.ll b/test/CodeGen/PowerPC/mcm-4.ll
index 73dd902..e4ceb3a 100644
--- a/test/CodeGen/PowerPC/mcm-4.ll
+++ b/test/CodeGen/PowerPC/mcm-4.ll

@@ -1,5 +1,7 @@
-; RUN: llc -mcpu=pwr7 -O0 -code-model=medium -fast-isel=false <%s | FileCheck -check-prefix=MEDIUM %s
-; RUN: llc -mcpu=pwr7 -O0 -code-model=large -fast-isel=false <%s | FileCheck -check-prefix=LARGE %s
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium -fast-isel=false -mattr=-vsx <%s | FileCheck -check-prefix=MEDIUM %s
+; RUN: llc -mcpu=pwr7 -O0 -code-model=medium -fast-isel=false -mattr=+vsx <%s | FileCheck -check-prefix=MEDIUM-VSX %s
+; RUN: llc -mcpu=pwr7 -O0 -code-model=large -fast-isel=false -mattr=-vsx <%s | FileCheck -check-prefix=LARGE %s
+; RUN: llc -mcpu=pwr7 -O0 -code-model=large -fast-isel=false -mattr=+vsx <%s | FileCheck -check-prefix=LARGE-VSX %s
 
 ; Test correct code generation for medium and large code model
 ; for loading a value from the constant pool (TOC-relative).
@@ -19,9 +21,23 @@
 ; MEDIUM: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
 ; MEDIUM: lfd {{[0-9]+}}, 0([[REG2]])
 
+; MEDIUM-VSX: [[VAR:[a-z0-9A-Z_.]+]]:
+; MEDIUM-VSX: .quad 4562098671269285104
+; MEDIUM-VSX-LABEL: test_double_const:
+; MEDIUM-VSX: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
+; MEDIUM-VSX: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l
+; MEDIUM-VSX: lxsdx {{[0-9]+}}, 0, [[REG2]]
+
 ; LARGE: [[VAR:[a-z0-9A-Z_.]+]]:
 ; LARGE: .quad 4562098671269285104
 ; LARGE-LABEL: test_double_const:
 ; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR2:[a-z0-9A-Z_.]+]]@toc@ha
 ; LARGE: ld [[REG2:[0-9]+]], [[VAR2]]@toc@l([[REG1]])
 ; LARGE: lfd {{[0-9]+}}, 0([[REG2]])
+
+; LARGE-VSX: [[VAR:[a-z0-9A-Z_.]+]]:
+; LARGE-VSX: .quad 4562098671269285104
+; LARGE-VSX-LABEL: test_double_const:
+; LARGE-VSX: addis [[REG1:[0-9]+]], 2, [[VAR2:[a-z0-9A-Z_.]+]]@toc@ha
+; LARGE-VSX: ld [[REG2:[0-9]+]], [[VAR2]]@toc@l([[REG1]])
+; LARGE-VSX: lxsdx {{[0-9]+}}, 0, [[REG2]]

diff --git a/test/CodeGen/PowerPC/ppc32-lshrti3.ll b/test/CodeGen/PowerPC/ppc32-lshrti3.ll
new file mode 100644
index 0000000..6e76fea
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-lshrti3.ll

@@ -0,0 +1,39 @@
+; RUN: llc -O=2 < %s -mtriple=powerpc-netbsd | FileCheck %s
+
+; CHECK-NOT: bl __lshrti3
+
+; ModuleID = 'lshrti3-ppc32.c'
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc--netbsd"
+
+; Function Attrs: nounwind uwtable
+define i32 @fn1() #0 {
+entry:
+  %.promoted = load i72* inttoptr (i32 1 to i72*), align 4
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %bf.set3 = phi i72 [ %bf.set, %while.cond ], [ %.promoted, %entry ]
+  %bf.lshr = lshr i72 %bf.set3, 40
+  %bf.lshr.tr = trunc i72 %bf.lshr to i32
+  %bf.cast = and i32 %bf.lshr.tr, 65535
+  %dec = add nsw i32 %bf.lshr.tr, 65535
+  %0 = zext i32 %dec to i72
+  %bf.value = shl nuw i72 %0, 40
+  %bf.shl = and i72 %bf.value, 72056494526300160
+  %bf.clear2 = and i72 %bf.set3, -72056494526300161
+  %bf.set = or i72 %bf.shl, %bf.clear2
+  %tobool = icmp eq i32 %bf.cast, 0
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  %bf.set.lcssa = phi i72 [ %bf.set, %while.cond ]
+  store i72 %bf.set.lcssa, i72* inttoptr (i32 1 to i72*), align 4
+  ret i32 undef
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 (213754)"}

diff --git a/test/CodeGen/PowerPC/ppc32-pic-large.ll b/test/CodeGen/PowerPC/ppc32-pic-large.ll
new file mode 100644
index 0000000..ecc4f10
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-pic-large.ll

@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -relocation-model=pic | FileCheck -check-prefix=LARGE-BSS %s
+@bar = common global i32 0, align 4
+
+define i32 @foo() {
+entry:
+  %0 = load i32* @bar, align 4
+  ret i32 %0
+}
+
+!llvm.module.flags = !{!0}
+!0 = metadata !{i32 1, metadata !"PIC Level", i32 2}
+; LARGE-BSS:       [[POFF:\.L[0-9]+\$poff]]:
+; LARGE-BSS-NEXT:    .long .LTOC-[[PB:\.L[0-9]+\$pb]]
+; LARGE-BSS-NEXT:  foo:
+; LARGE-BSS:         bl [[PB]]
+; LARGE-BSS-NEXT:  [[PB]]:
+; LARGE-BSS:         mflr 30
+; LARGE-BSS:         lwz [[REG:[0-9]+]], [[POFF]]-[[PB]](30)
+; LARGE-BSS-NEXT:    add 30, [[REG]], 30
+; LARGE-BSS:         lwz [[VREG:[0-9]+]], [[VREF:\.LC[0-9]+]]-.LTOC(30)
+; LARGE-BSS:         lwz {{[0-9]+}}, 0([[VREG]])
+; LARGE-BSS:       [[VREF]]:
+; LARGE-BSS-NEXT:    .long bar

diff --git a/test/CodeGen/PowerPC/ppc32-pic.ll b/test/CodeGen/PowerPC/ppc32-pic.ll
new file mode 100644
index 0000000..f9c3467
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc32-pic.ll

@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -relocation-model=pic | FileCheck -check-prefix=SMALL-BSS %s
+@bar = common global i32 0, align 4
+
+define i32 @foo() {
+entry:
+  %0 = load i32* @bar, align 4
+  ret i32 %0
+}
+
+!llvm.module.flags = !{!0}
+!0 = metadata !{i32 1, metadata !"PIC Level", i32 1}
+; SMALL-BSS-LABEL:foo:
+; SMALL-BSS:         bl _GLOBAL_OFFSET_TABLE_@local-4
+; SMALL-BSS:         mflr 30
+; SMALL-BSS:         lwz [[VREG:[0-9]+]], bar@GOT(30)
+; SMALL-BSS:         lwz {{[0-9]+}}, 0([[VREG]])

diff --git a/test/CodeGen/PowerPC/ppc440-msync.ll b/test/CodeGen/PowerPC/ppc440-msync.ll
index 1274173..3f4e7fd 100644
--- a/test/CodeGen/PowerPC/ppc440-msync.ll
+++ b/test/CodeGen/PowerPC/ppc440-msync.ll

@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=ppc32 | FileCheck %s
+; RUN: llc < %s -march=ppc64 -mcpu=a2 | FileCheck %s
 ; RUN: llc < %s -march=ppc32 -mcpu=440 | FileCheck %s -check-prefix=BE-CHK
 
 define i32 @has_a_fence(i32 %a, i32 %b) nounwind {

diff --git a/test/CodeGen/PowerPC/ppc64-align-long-double.ll b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
index 764d3ce..5ed029c 100644
--- a/test/CodeGen/PowerPC/ppc64-align-long-double.ll
+++ b/test/CodeGen/PowerPC/ppc64-align-long-double.ll

@@ -1,4 +1,5 @@
-; RUN: llc -mcpu=pwr7 -O0 -fast-isel=false < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -O0 -fast-isel=false -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mcpu=pwr7 -O0 -fast-isel=false -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 
 ; Verify internal alignment of long double in a struct.  The double
 ; argument comes in in GPR3; GPR4 is skipped; GPRs 5 and 6 contain
@@ -24,3 +25,12 @@
 ; CHECK: lfd 1, 64(1)
 ; CHECK: lfd 2, 72(1)
 
+; CHECK-VSX: std 6, 72(1)
+; CHECK-VSX: std 5, 64(1)
+; CHECK-VSX: std 4, 56(1)
+; CHECK-VSX: std 3, 48(1)
+; CHECK-VSX: li 3, 16
+; CHECK-VSX: addi 4, 1, 48
+; CHECK-VSX: lxsdx 1, 4, 3
+; CHECK-VSX: li 3, 24
+; CHECK-VSX: lxsdx 2, 4, 3

diff --git a/test/CodeGen/PowerPC/ppc64-elf-abi.ll b/test/CodeGen/PowerPC/ppc64-elf-abi.ll
new file mode 100644
index 0000000..d82122d
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-elf-abi.ll

@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK-ELFv1
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mattr=+elfv1 < %s | FileCheck %s -check-prefix=CHECK-ELFv1
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mattr=+elfv2 < %s | FileCheck %s -check-prefix=CHECK-ELFv2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK-ELFv2
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mattr=+elfv1 < %s | FileCheck %s -check-prefix=CHECK-ELFv1
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mattr=+elfv2 < %s | FileCheck %s -check-prefix=CHECK-ELFv2
+
+; CHECK-ELFv2: .abiversion 2
+; CHECK-ELFv1-NOT: .abiversion 2
+

diff --git a/test/CodeGen/PowerPC/ppc64-prefetch.ll b/test/CodeGen/PowerPC/ppc64-prefetch.ll
index b2f3709..b2f6e7d 100644
--- a/test/CodeGen/PowerPC/ppc64-prefetch.ll
+++ b/test/CodeGen/PowerPC/ppc64-prefetch.ll

@@ -1,15 +1,34 @@
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -mcpu=a2 < %s | FileCheck %s
 
 define void @test1(i8* %a, ...) nounwind {
 entry:
   call void @llvm.prefetch(i8* %a, i32 0, i32 3, i32 1)
   ret void
+
+; CHECK-LABEL: @test1
+; CHECK: dcbt
 }
 
 declare void @llvm.prefetch(i8*, i32, i32, i32)
 
-; CHECK: @test1
-; CHECK: dcbt
+define void @test2(i8* %a, ...) nounwind {
+entry:
+  call void @llvm.prefetch(i8* %a, i32 1, i32 3, i32 1)
+  ret void
+
+; CHECK-LABEL: @test2
+; CHECK: dcbtst
+}
+
+define void @test3(i8* %a, ...) nounwind {
+entry:
+  call void @llvm.prefetch(i8* %a, i32 0, i32 3, i32 0)
+  ret void
+
+; CHECK-LABEL: @test3
+; CHECK: icbt
+}
+
 

diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
new file mode 100644
index 0000000..9eed623
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll

@@ -0,0 +1,329 @@
+; RUN: llc < %s -march=ppc64le -mcpu=pwr8 -mattr=+altivec | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+;
+; Verify use of registers for float/vector aggregate return.
+;
+
+define [8 x float] @return_float([8 x float] %x) {
+entry:
+  ret [8 x float] %x
+}
+; CHECK-LABEL: @return_float
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define [8 x double] @return_double([8 x double] %x) {
+entry:
+  ret [8 x double] %x
+}
+; CHECK-LABEL: @return_double
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define [4 x ppc_fp128] @return_ppcf128([4 x ppc_fp128] %x) {
+entry:
+  ret [4 x ppc_fp128] %x
+}
+; CHECK-LABEL: @return_ppcf128
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define [8 x <4 x i32>] @return_v4i32([8 x <4 x i32>] %x) {
+entry:
+  ret [8 x <4 x i32>] %x
+}
+; CHECK-LABEL: @return_v4i32
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+
+;
+; Verify amount of space taken up by aggregates in the parameter save area.
+;
+
+define i64 @callee_float([7 x float] %a, [7 x float] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_float
+; CHECK: ld 3, 96(1)
+; CHECK: blr
+
+define void @caller_float(i64 %x, [7 x float] %y) {
+entry:
+  tail call void @test_float([7 x float] %y, [7 x float] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_float
+; CHECK: std 3, 96(1)
+; CHECK: bl test_float
+
+declare void @test_float([7 x float], [7 x float], i64)
+
+define i64 @callee_double(i64 %a, [7 x double] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_double
+; CHECK: ld 3, 96(1)
+; CHECK: blr
+
+define void @caller_double(i64 %x, [7 x double] %y) {
+entry:
+  tail call void @test_double(i64 %x, [7 x double] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_double
+; CHECK: std 3, 96(1)
+; CHECK: bl test_double
+
+declare void @test_double(i64, [7 x double], i64)
+
+define i64 @callee_ppcf128(i64 %a, [4 x ppc_fp128] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_ppcf128
+; CHECK: ld 3, 104(1)
+; CHECK: blr
+
+define void @caller_ppcf128(i64 %x, [4 x ppc_fp128] %y) {
+entry:
+  tail call void @test_ppcf128(i64 %x, [4 x ppc_fp128] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_ppcf128
+; CHECK: std 3, 104(1)
+; CHECK: bl test_ppcf128
+
+declare void @test_ppcf128(i64, [4 x ppc_fp128], i64)
+
+define i64 @callee_i64(i64 %a, [7 x i64] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_i64
+; CHECK: ld 3, 96(1)
+; CHECK: blr
+
+define void @caller_i64(i64 %x, [7 x i64] %y) {
+entry:
+  tail call void @test_i64(i64 %x, [7 x i64] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_i64
+; CHECK: std 3, 96(1)
+; CHECK: bl test_i64
+
+declare void @test_i64(i64, [7 x i64], i64)
+
+define i64 @callee_i128(i64 %a, [4 x i128] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_i128
+; CHECK: ld 3, 112(1)
+; CHECK: blr
+
+define void @caller_i128(i64 %x, [4 x i128] %y) {
+entry:
+  tail call void @test_i128(i64 %x, [4 x i128] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_i128
+; CHECK: std 3, 112(1)
+; CHECK: bl test_i128
+
+declare void @test_i128(i64, [4 x i128], i64)
+
+define i64 @callee_v4i32(i64 %a, [4 x <4 x i32>] %b, i64 %c) {
+entry:
+  ret i64 %c
+}
+; CHECK-LABEL: @callee_v4i32
+; CHECK: ld 3, 112(1)
+; CHECK: blr
+
+define void @caller_v4i32(i64 %x, [4 x <4 x i32>] %y) {
+entry:
+  tail call void @test_v4i32(i64 %x, [4 x <4 x i32>] %y, i64 %x)
+  ret void
+}
+; CHECK-LABEL: @caller_v4i32
+; CHECK: std 3, 112(1)
+; CHECK: bl test_v4i32
+
+declare void @test_v4i32(i64, [4 x <4 x i32>], i64)
+
+
+;
+; Verify handling of floating point arguments in GPRs
+;
+
+%struct.float8 = type { [8 x float] }
+%struct.float5 = type { [5 x float] }
+%struct.float2 = type { [2 x float] }
+
+@g8 = common global %struct.float8 zeroinitializer, align 4
+@g5 = common global %struct.float5 zeroinitializer, align 4
+@g2 = common global %struct.float2 zeroinitializer, align 4
+
+define float @callee0([7 x float] %a, [7 x float] %b) {
+entry:
+  %b.extract = extractvalue [7 x float] %b, 6
+  ret float %b.extract
+}
+; CHECK-LABEL: @callee0
+; CHECK: stw 10, [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller0([7 x float] %a) {
+entry:
+  tail call void @test0([7 x float] %a, [7 x float] %a)
+  ret void
+}
+; CHECK-LABEL: @caller0
+; CHECK-DAG: fmr 8, 1
+; CHECK-DAG: fmr 9, 2
+; CHECK-DAG: fmr 10, 3
+; CHECK-DAG: fmr 11, 4
+; CHECK-DAG: fmr 12, 5
+; CHECK-DAG: fmr 13, 6
+; CHECK-DAG: stfs 7, [[OFF:[0-9]+]](1)
+; CHECK-DAG: lwz 10, [[OFF]](1)
+; CHECK: bl test0
+
+declare void @test0([7 x float], [7 x float])
+
+define float @callee1([8 x float] %a, [8 x float] %b) {
+entry:
+  %b.extract = extractvalue [8 x float] %b, 7
+  ret float %b.extract
+}
+; CHECK-LABEL: @callee1
+; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32
+; CHECK: stw [[REG]], [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller1([8 x float] %a) {
+entry:
+  tail call void @test1([8 x float] %a, [8 x float] %a)
+  ret void
+}
+; CHECK-LABEL: @caller1
+; CHECK-DAG: fmr 9, 1
+; CHECK-DAG: fmr 10, 2
+; CHECK-DAG: fmr 11, 3
+; CHECK-DAG: fmr 12, 4
+; CHECK-DAG: fmr 13, 5
+; CHECK-DAG: stfs 5, [[OFF0:[0-9]+]](1)
+; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1)
+; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1)
+; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1)
+; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1)
+; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1)
+; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1)
+; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1)
+; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
+; CHECK-DAG: sldi [[REG3]], [[REG3]], 32
+; CHECK-DAG: or 9, [[REG0]], [[REG1]]
+; CHECK-DAG: or 10, [[REG2]], [[REG3]]
+; CHECK: bl test1
+
+declare void @test1([8 x float], [8 x float])
+
+define float @callee2([8 x float] %a, [5 x float] %b, [2 x float] %c) {
+entry:
+  %c.extract = extractvalue [2 x float] %c, 1
+  ret float %c.extract
+}
+; CHECK-LABEL: @callee2
+; CHECK: rldicl [[REG:[0-9]+]], 10, 32, 32
+; CHECK: stw [[REG]], [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller2() {
+entry:
+  %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4
+  %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4
+  %2 = load [2 x float]* getelementptr inbounds (%struct.float2* @g2, i64 0, i32 0), align 4
+  tail call void @test2([8 x float] %0, [5 x float] %1, [2 x float] %2)
+  ret void
+}
+; CHECK-LABEL: @caller2
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK-DAG: lfs 1, 0([[REG]])
+; CHECK-DAG: lfs 2, 4([[REG]])
+; CHECK-DAG: lfs 3, 8([[REG]])
+; CHECK-DAG: lfs 4, 12([[REG]])
+; CHECK-DAG: lfs 5, 16([[REG]])
+; CHECK-DAG: lfs 6, 20([[REG]])
+; CHECK-DAG: lfs 7, 24([[REG]])
+; CHECK-DAG: lfs 8, 28([[REG]])
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK-DAG: lfs 9, 0([[REG]])
+; CHECK-DAG: lfs 10, 4([[REG]])
+; CHECK-DAG: lfs 11, 8([[REG]])
+; CHECK-DAG: lfs 12, 12([[REG]])
+; CHECK-DAG: lfs 13, 16([[REG]])
+; CHECK: ld [[REG:[0-9]+]], .LC
+; CHECK-DAG: lwz [[REG0:[0-9]+]], 0([[REG]])
+; CHECK-DAG: lwz [[REG1:[0-9]+]], 4([[REG]])
+; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
+; CHECK-DAG: or 10, [[REG0]], [[REG1]]
+; CHECK: bl test2
+
+declare void @test2([8 x float], [5 x float], [2 x float])
+
+define double @callee3([8 x float] %a, [5 x float] %b, double %c) {
+entry:
+  ret double %c
+}
+; CHECK-LABEL: @callee3
+; CHECK: std 10, [[OFF:.*]](1)
+; CHECK: lfd 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller3(double %d) {
+entry:
+  %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4
+  %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4
+  tail call void @test3([8 x float] %0, [5 x float] %1, double %d)
+  ret void
+}
+; CHECK-LABEL: @caller3
+; CHECK: stfd 1, [[OFF:.*]](1)
+; CHECK: ld 10, [[OFF]](1)
+; CHECK: bl test3
+
+declare void @test3([8 x float], [5 x float], double)
+
+define float @callee4([8 x float] %a, [5 x float] %b, float %c) {
+entry:
+  ret float %c
+}
+; CHECK-LABEL: @callee4
+; CHECK: stw 10, [[OFF:.*]](1)
+; CHECK: lfs 1, [[OFF]](1)
+; CHECK: blr
+
+define void @caller4(float %f) {
+entry:
+  %0 = load [8 x float]* getelementptr inbounds (%struct.float8* @g8, i64 0, i32 0), align 4
+  %1 = load [5 x float]* getelementptr inbounds (%struct.float5* @g5, i64 0, i32 0), align 4
+  tail call void @test4([8 x float] %0, [5 x float] %1, float %f)
+  ret void
+}
+; CHECK-LABEL: @caller4
+; CHECK: stfs 1, [[OFF:.*]](1)
+; CHECK: lwz 10, [[OFF]](1)
+; CHECK: bl test4
+
+declare void @test4([8 x float], [5 x float], float)
+

diff --git a/test/CodeGen/PowerPC/ppc64le-calls.ll b/test/CodeGen/PowerPC/ppc64le-calls.ll
new file mode 100644
index 0000000..0d667dd
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-calls.ll

@@ -0,0 +1,17 @@
+; RUN: llc -march=ppc64le -mcpu=pwr8 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Indirect calls requires a full stub creation
+define void @test_indirect(void ()* nocapture %fp) {
+; CHECK-LABEL: @test_indirect
+  tail call void %fp()
+; CHECK-DAG: std 2, 24(1)
+; CHECK-DAG: mr 12, 3
+; CHECK-DAG: mtctr 3
+; CHECK: bctrl
+; CHECK-NEXT: ld 2, 24(1)
+  ret void
+}
+

diff --git a/test/CodeGen/PowerPC/ppc64le-crsave.ll b/test/CodeGen/PowerPC/ppc64le-crsave.ll
new file mode 100644
index 0000000..17174d7
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-crsave.ll

@@ -0,0 +1,28 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@_ZTIi = external constant i8*
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+define void @crsave() {
+entry:
+  call void asm sideeffect "", "~{cr2}"()
+  call void asm sideeffect "", "~{cr3}"()
+  call void asm sideeffect "", "~{cr4}"()
+
+  %exception = call i8* @__cxa_allocate_exception(i64 4)
+  %0 = bitcast i8* %exception to i32*
+  store i32 0, i32* %0
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+
+return:                                           ; No predecessors!
+  ret void
+}
+; CHECK-LABEL: @crsave
+; CHECK: .cfi_offset cr2, 8
+; CHECK: .cfi_offset cr3, 8
+; CHECK: .cfi_offset cr4, 8
+

diff --git a/test/CodeGen/PowerPC/ppc64le-localentry.ll b/test/CodeGen/PowerPC/ppc64le-localentry.ll
new file mode 100644
index 0000000..4676ce8
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64le-localentry.ll

@@ -0,0 +1,46 @@
+; RUN: llc -march=ppc64le -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -march=ppc64le -mcpu=pwr8 -O0 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@number64 = global i64 10, align 8
+
+; CHECK: .abiversion 2
+
+define i64 @use_toc(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: @use_toc
+; CHECK-NEXT: .Ltmp[[TMP1:[0-9]+]]:
+; CHECK-NEXT: addis 2, 12, .TOC.-.Ltmp[[TMP1]]@ha
+; CHECK-NEXT: addi 2, 2, .TOC.-.Ltmp[[TMP1]]@l
+; CHECK-NEXT: .Ltmp[[TMP2:[0-9]+]]:
+; CHECK-NEXT: .localentry use_toc, .Ltmp[[TMP2]]-.Ltmp[[TMP1]]
+; CHECK-NEXT: %entry
+  %0 = load i64* @number64, align 8
+  %cmp = icmp eq i64 %0, %a
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+declare void @callee()
+define void @use_toc_implicit() nounwind {
+entry:
+; CHECK-LABEL: @use_toc_implicit
+; CHECK-NEXT: .Ltmp[[TMP1:[0-9]+]]:
+; CHECK-NEXT: addis 2, 12, .TOC.-.Ltmp[[TMP1]]@ha
+; CHECK-NEXT: addi 2, 2, .TOC.-.Ltmp[[TMP1]]@l
+; CHECK-NEXT: .Ltmp[[TMP2:[0-9]+]]:
+; CHECK-NEXT: .localentry use_toc_implicit, .Ltmp[[TMP2]]-.Ltmp[[TMP1]]
+; CHECK-NEXT: %entry
+  call void @callee()
+  ret void
+}
+
+define i64 @no_toc(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: @no_toc
+; CHECK-NEXT: %entry
+  ret i64 %a
+}
+

diff --git a/test/CodeGen/PowerPC/ppc64le-smallarg.ll b/test/CodeGen/PowerPC/ppc64le-smallarg.ll
index fcb1e92..120c140 100644
--- a/test/CodeGen/PowerPC/ppc64le-smallarg.ll
+++ b/test/CodeGen/PowerPC/ppc64le-smallarg.ll

@@ -22,7 +22,7 @@
   ret void
 }
 ; CHECK: @callee1
-; CHECK: lwz {{[0-9]+}}, 120(1)
+; CHECK: lwz {{[0-9]+}}, 104(1)
 ; CHECK: blr
 
 define void @caller1() {
@@ -32,7 +32,7 @@
   ret void
 }
 ; CHECK: @caller1
-; CHECK: stw {{[0-9]+}}, 120(1)
+; CHECK: stw {{[0-9]+}}, 104(1)
 ; CHECK: bl test1
 
 declare void @test1(%struct.small_arg* sret, %struct.large_arg* byval, %struct.small_arg* byval)
@@ -42,7 +42,7 @@
   ret float %x
 }
 ; CHECK: @callee2
-; CHECK: lfs {{[0-9]+}}, 152(1)
+; CHECK: lfs {{[0-9]+}}, 136(1)
 ; CHECK: blr
 
 define void @caller2() {
@@ -52,7 +52,7 @@
   ret void
 }
 ; CHECK: @caller2
-; CHECK: stfs {{[0-9]+}}, 152(1)
+; CHECK: stfs {{[0-9]+}}, 136(1)
 ; CHECK: bl test2
 
 declare float @test2(float, float, float, float, float, float, float, float, float, float, float, float, float, float)

diff --git a/test/CodeGen/PowerPC/ppcf128-1.ll b/test/CodeGen/PowerPC/ppcf128-1.ll
index 1047fe5..2cec934 100644
--- a/test/CodeGen/PowerPC/ppcf128-1.ll
+++ b/test/CodeGen/PowerPC/ppcf128-1.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc > %t
+; RUN: opt < %s -O3 | llc > %t
 ; ModuleID = 'ld3.c'
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin8"

diff --git a/test/CodeGen/PowerPC/pr15630.ll b/test/CodeGen/PowerPC/pr15630.ll
index c5ba8a4..3c1b604 100644
--- a/test/CodeGen/PowerPC/pr15630.ll
+++ b/test/CodeGen/PowerPC/pr15630.ll

@@ -13,4 +13,5 @@
   ret void
 }
 
-; CHECK: stwcx.
+; CHECK: sync
+; CHECK: stb

diff --git a/test/CodeGen/PowerPC/pr17168.ll b/test/CodeGen/PowerPC/pr17168.ll
index 24bcda0..c3f0162 100644
--- a/test/CodeGen/PowerPC/pr17168.ll
+++ b/test/CodeGen/PowerPC/pr17168.ll

@@ -25,7 +25,7 @@
 for.end1042:                                      ; preds = %for.cond968.preheader, %for.cond964.preheader, %entry
   %0 = phi i32 [ undef, %for.cond964.preheader ], [ undef, %for.cond968.preheader ], [ undef, %entry ]
   %1 = load i32* getelementptr inbounds ([3 x i32]* @grid_points, i64 0, i64 0), align 4, !dbg !443, !tbaa !444
-  tail call void @llvm.dbg.value(metadata !447, i64 0, metadata !119), !dbg !448
+  tail call void @llvm.dbg.value(metadata !447, i64 0, metadata !119, metadata !{metadata !"0x102"}), !dbg !448
   %sub10454270 = add nsw i32 %0, -1, !dbg !448
   %cmp10464271 = icmp sgt i32 %sub10454270, 1, !dbg !448
   %sub11134263 = add nsw i32 %1, -1, !dbg !450
@@ -46,7 +46,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -54,468 +54,468 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!438, !464}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 190311)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !298, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 190311)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !298, metadata !2} ; [ DW_TAG_compile_unit ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"bt.c", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !82, metadata !102, metadata !114, metadata !132, metadata !145, metadata !154, metadata !155, metadata !162, metadata !183, metadata !200, metadata !201, metadata !207, metadata !208, metadata !215, metadata !221, metadata !230, metadata !238, metadata !246, metadata !255, metadata !260, metadata !261, metadata !268, metadata !274, metadata !279, metadata !280, metadata !287, metadata !293}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 74, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !12, i32 74} ; [ DW_TAG_subprogram ] [line 74] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\0074\000\001\000\006\00256\001\0074", metadata !1, metadata !5, metadata !6, null, null, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 74] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8, metadata !9}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!11 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_unsigned_char]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = metadata !{metadata !"0x24\00char\000\008\008\000\000\008", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_unsigned_char]
 !12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16, metadata !17, metadata !18, metadata !19, metadata !21, metadata !22, metadata !23, metadata !25, metadata !26}
-!13 = metadata !{i32 786689, metadata !4, metadata !"argc", metadata !5, i32 16777290, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 74]
-!14 = metadata !{i32 786689, metadata !4, metadata !"argv", metadata !5, i32 33554506, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 74]
-!15 = metadata !{i32 786688, metadata !4, metadata !"niter", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [niter] [line 76]
-!16 = metadata !{i32 786688, metadata !4, metadata !"step", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [step] [line 76]
-!17 = metadata !{i32 786688, metadata !4, metadata !"n3", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n3] [line 76]
-!18 = metadata !{i32 786688, metadata !4, metadata !"nthreads", metadata !5, i32 77, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [nthreads] [line 77]
-!19 = metadata !{i32 786688, metadata !4, metadata !"navg", metadata !5, i32 78, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [navg] [line 78]
-!20 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!21 = metadata !{i32 786688, metadata !4, metadata !"mflops", metadata !5, i32 78, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [mflops] [line 78]
-!22 = metadata !{i32 786688, metadata !4, metadata !"tmax", metadata !5, i32 80, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [tmax] [line 80]
-!23 = metadata !{i32 786688, metadata !4, metadata !"verified", metadata !5, i32 81, metadata !24, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [verified] [line 81]
-!24 = metadata !{i32 786454, metadata !1, null, metadata !"boolean", i32 12, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ] [boolean] [line 12, size 0, align 0, offset 0] [from int]
-!25 = metadata !{i32 786688, metadata !4, metadata !"class", metadata !5, i32 82, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [class] [line 82]
-!26 = metadata !{i32 786688, metadata !4, metadata !"fp", metadata !5, i32 83, metadata !27, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [fp] [line 83]
-!27 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from FILE]
-!28 = metadata !{i32 786454, metadata !1, null, metadata !"FILE", i32 49, i64 0, i64 0, i64 0, i32 0, metadata !29} ; [ DW_TAG_typedef ] [FILE] [line 49, size 0, align 0, offset 0] [from _IO_FILE]
-!29 = metadata !{i32 786451, metadata !30, null, metadata !"_IO_FILE", i32 271, i64 1728, i64 64, i32 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [_IO_FILE] [line 271, size 1728, align 64, offset 0] [def] [from ]
+!13 = metadata !{metadata !"0x101\00argc\0016777290\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [argc] [line 74]
+!14 = metadata !{metadata !"0x101\00argv\0033554506\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [argv] [line 74]
+!15 = metadata !{metadata !"0x100\00niter\0076\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [niter] [line 76]
+!16 = metadata !{metadata !"0x100\00step\0076\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [step] [line 76]
+!17 = metadata !{metadata !"0x100\00n3\0076\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n3] [line 76]
+!18 = metadata !{metadata !"0x100\00nthreads\0077\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [nthreads] [line 77]
+!19 = metadata !{metadata !"0x100\00navg\0078\000", metadata !4, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [navg] [line 78]
+!20 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!21 = metadata !{metadata !"0x100\00mflops\0078\000", metadata !4, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [mflops] [line 78]
+!22 = metadata !{metadata !"0x100\00tmax\0080\000", metadata !4, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [tmax] [line 80]
+!23 = metadata !{metadata !"0x100\00verified\0081\000", metadata !4, metadata !5, metadata !24} ; [ DW_TAG_auto_variable ] [verified] [line 81]
+!24 = metadata !{metadata !"0x16\00boolean\0012\000\000\000\000", metadata !1, null, metadata !8} ; [ DW_TAG_typedef ] [boolean] [line 12, size 0, align 0, offset 0] [from int]
+!25 = metadata !{metadata !"0x100\00class\0082\000", metadata !4, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [class] [line 82]
+!26 = metadata !{metadata !"0x100\00fp\0083\000", metadata !4, metadata !5, metadata !27} ; [ DW_TAG_auto_variable ] [fp] [line 83]
+!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from FILE]
+!28 = metadata !{metadata !"0x16\00FILE\0049\000\000\000\000", metadata !1, null, metadata !29} ; [ DW_TAG_typedef ] [FILE] [line 49, size 0, align 0, offset 0] [from _IO_FILE]
+!29 = metadata !{metadata !"0x13\00_IO_FILE\00271\001728\0064\000\000\000", metadata !30, null, null, metadata !31, null, null, null} ; [ DW_TAG_structure_type ] [_IO_FILE] [line 271, size 1728, align 64, offset 0] [def] [from ]
 !30 = metadata !{metadata !"/usr/include/libio.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
 !31 = metadata !{metadata !32, metadata !33, metadata !34, metadata !35, metadata !36, metadata !37, metadata !38, metadata !39, metadata !40, metadata !41, metadata !42, metadata !43, metadata !44, metadata !52, metadata !53, metadata !54, metadata !55, metadata !58, metadata !60, metadata !62, metadata !66, metadata !68, metadata !70, metadata !71, metadata !72, metadata !73, metadata !74, metadata !77, metadata !78}
-!32 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_flags", i32 272, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [_flags] [line 272, size 32, align 32, offset 0] [from int]
-!33 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_ptr", i32 277, i64 64, i64 64, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_ptr] [line 277, size 64, align 64, offset 64] [from ]
-!34 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_end", i32 278, i64 64, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_end] [line 278, size 64, align 64, offset 128] [from ]
-!35 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_base", i32 279, i64 64, i64 64, i64 192, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_base] [line 279, size 64, align 64, offset 192] [from ]
-!36 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_base", i32 280, i64 64, i64 64, i64 256, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_base] [line 280, size 64, align 64, offset 256] [from ]
-!37 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_ptr", i32 281, i64 64, i64 64, i64 320, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_ptr] [line 281, size 64, align 64, offset 320] [from ]
-!38 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_end", i32 282, i64 64, i64 64, i64 384, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_end] [line 282, size 64, align 64, offset 384] [from ]
-!39 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_buf_base", i32 283, i64 64, i64 64, i64 448, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_buf_base] [line 283, size 64, align 64, offset 448] [from ]
-!40 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_buf_end", i32 284, i64 64, i64 64, i64 512, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_buf_end] [line 284, size 64, align 64, offset 512] [from ]
-!41 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_save_base", i32 286, i64 64, i64 64, i64 576, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_save_base] [line 286, size 64, align 64, offset 576] [from ]
-!42 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_backup_base", i32 287, i64 64, i64 64, i64 640, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_backup_base] [line 287, size 64, align 64, offset 640] [from ]
-!43 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_save_end", i32 288, i64 64, i64 64, i64 704, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_save_end] [line 288, size 64, align 64, offset 704] [from ]
-!44 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_markers", i32 290, i64 64, i64 64, i64 768, i32 0, metadata !45} ; [ DW_TAG_member ] [_markers] [line 290, size 64, align 64, offset 768] [from ]
-!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !46} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_marker]
-!46 = metadata !{i32 786451, metadata !30, null, metadata !"_IO_marker", i32 186, i64 192, i64 64, i32 0, i32 0, null, metadata !47, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [_IO_marker] [line 186, size 192, align 64, offset 0] [def] [from ]
+!32 = metadata !{metadata !"0xd\00_flags\00272\0032\0032\000\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_flags] [line 272, size 32, align 32, offset 0] [from int]
+!33 = metadata !{metadata !"0xd\00_IO_read_ptr\00277\0064\0064\0064\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_read_ptr] [line 277, size 64, align 64, offset 64] [from ]
+!34 = metadata !{metadata !"0xd\00_IO_read_end\00278\0064\0064\00128\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_read_end] [line 278, size 64, align 64, offset 128] [from ]
+!35 = metadata !{metadata !"0xd\00_IO_read_base\00279\0064\0064\00192\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_read_base] [line 279, size 64, align 64, offset 192] [from ]
+!36 = metadata !{metadata !"0xd\00_IO_write_base\00280\0064\0064\00256\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_write_base] [line 280, size 64, align 64, offset 256] [from ]
+!37 = metadata !{metadata !"0xd\00_IO_write_ptr\00281\0064\0064\00320\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_write_ptr] [line 281, size 64, align 64, offset 320] [from ]
+!38 = metadata !{metadata !"0xd\00_IO_write_end\00282\0064\0064\00384\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_write_end] [line 282, size 64, align 64, offset 384] [from ]
+!39 = metadata !{metadata !"0xd\00_IO_buf_base\00283\0064\0064\00448\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_buf_base] [line 283, size 64, align 64, offset 448] [from ]
+!40 = metadata !{metadata !"0xd\00_IO_buf_end\00284\0064\0064\00512\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_buf_end] [line 284, size 64, align 64, offset 512] [from ]
+!41 = metadata !{metadata !"0xd\00_IO_save_base\00286\0064\0064\00576\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_save_base] [line 286, size 64, align 64, offset 576] [from ]
+!42 = metadata !{metadata !"0xd\00_IO_backup_base\00287\0064\0064\00640\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_backup_base] [line 287, size 64, align 64, offset 640] [from ]
+!43 = metadata !{metadata !"0xd\00_IO_save_end\00288\0064\0064\00704\000", metadata !30, metadata !29, metadata !10} ; [ DW_TAG_member ] [_IO_save_end] [line 288, size 64, align 64, offset 704] [from ]
+!44 = metadata !{metadata !"0xd\00_markers\00290\0064\0064\00768\000", metadata !30, metadata !29, metadata !45} ; [ DW_TAG_member ] [_markers] [line 290, size 64, align 64, offset 768] [from ]
+!45 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !46} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_marker]
+!46 = metadata !{metadata !"0x13\00_IO_marker\00186\00192\0064\000\000\000", metadata !30, null, null, metadata !47, null, null, null} ; [ DW_TAG_structure_type ] [_IO_marker] [line 186, size 192, align 64, offset 0] [def] [from ]
 !47 = metadata !{metadata !48, metadata !49, metadata !51}
-!48 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_next", i32 187, i64 64, i64 64, i64 0, i32 0, metadata !45} ; [ DW_TAG_member ] [_next] [line 187, size 64, align 64, offset 0] [from ]
-!49 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_sbuf", i32 188, i64 64, i64 64, i64 64, i32 0, metadata !50} ; [ DW_TAG_member ] [_sbuf] [line 188, size 64, align 64, offset 64] [from ]
-!50 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_FILE]
-!51 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_pos", i32 192, i64 32, i64 32, i64 128, i32 0, metadata !8} ; [ DW_TAG_member ] [_pos] [line 192, size 32, align 32, offset 128] [from int]
-!52 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_chain", i32 292, i64 64, i64 64, i64 832, i32 0, metadata !50} ; [ DW_TAG_member ] [_chain] [line 292, size 64, align 64, offset 832] [from ]
-!53 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_fileno", i32 294, i64 32, i64 32, i64 896, i32 0, metadata !8} ; [ DW_TAG_member ] [_fileno] [line 294, size 32, align 32, offset 896] [from int]
-!54 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_flags2", i32 298, i64 32, i64 32, i64 928, i32 0, metadata !8} ; [ DW_TAG_member ] [_flags2] [line 298, size 32, align 32, offset 928] [from int]
-!55 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_old_offset", i32 300, i64 64, i64 64, i64 960, i32 0, metadata !56} ; [ DW_TAG_member ] [_old_offset] [line 300, size 64, align 64, offset 960] [from __off_t]
-!56 = metadata !{i32 786454, metadata !30, null, metadata !"__off_t", i32 141, i64 0, i64 0, i64 0, i32 0, metadata !57} ; [ DW_TAG_typedef ] [__off_t] [line 141, size 0, align 0, offset 0] [from long int]
-!57 = metadata !{i32 786468, null, null, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!58 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_cur_column", i32 304, i64 16, i64 16, i64 1024, i32 0, metadata !59} ; [ DW_TAG_member ] [_cur_column] [line 304, size 16, align 16, offset 1024] [from unsigned short]
-!59 = metadata !{i32 786468, null, null, metadata !"unsigned short", i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
-!60 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_vtable_offset", i32 305, i64 8, i64 8, i64 1040, i32 0, metadata !61} ; [ DW_TAG_member ] [_vtable_offset] [line 305, size 8, align 8, offset 1040] [from signed char]
-!61 = metadata !{i32 786468, null, null, metadata !"signed char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!62 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_shortbuf", i32 306, i64 8, i64 8, i64 1048, i32 0, metadata !63} ; [ DW_TAG_member ] [_shortbuf] [line 306, size 8, align 8, offset 1048] [from ]
-!63 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 8, i64 8, i32 0, i32 0, metadata !11, metadata !64, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!48 = metadata !{metadata !"0xd\00_next\00187\0064\0064\000\000", metadata !30, metadata !46, metadata !45} ; [ DW_TAG_member ] [_next] [line 187, size 64, align 64, offset 0] [from ]
+!49 = metadata !{metadata !"0xd\00_sbuf\00188\0064\0064\0064\000", metadata !30, metadata !46, metadata !50} ; [ DW_TAG_member ] [_sbuf] [line 188, size 64, align 64, offset 64] [from ]
+!50 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_FILE]
+!51 = metadata !{metadata !"0xd\00_pos\00192\0032\0032\00128\000", metadata !30, metadata !46, metadata !8} ; [ DW_TAG_member ] [_pos] [line 192, size 32, align 32, offset 128] [from int]
+!52 = metadata !{metadata !"0xd\00_chain\00292\0064\0064\00832\000", metadata !30, metadata !29, metadata !50} ; [ DW_TAG_member ] [_chain] [line 292, size 64, align 64, offset 832] [from ]
+!53 = metadata !{metadata !"0xd\00_fileno\00294\0032\0032\00896\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_fileno] [line 294, size 32, align 32, offset 896] [from int]
+!54 = metadata !{metadata !"0xd\00_flags2\00298\0032\0032\00928\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_flags2] [line 298, size 32, align 32, offset 928] [from int]
+!55 = metadata !{metadata !"0xd\00_old_offset\00300\0064\0064\00960\000", metadata !30, metadata !29, metadata !56} ; [ DW_TAG_member ] [_old_offset] [line 300, size 64, align 64, offset 960] [from __off_t]
+!56 = metadata !{metadata !"0x16\00__off_t\00141\000\000\000\000", metadata !30, null, metadata !57} ; [ DW_TAG_typedef ] [__off_t] [line 141, size 0, align 0, offset 0] [from long int]
+!57 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!58 = metadata !{metadata !"0xd\00_cur_column\00304\0016\0016\001024\000", metadata !30, metadata !29, metadata !59} ; [ DW_TAG_member ] [_cur_column] [line 304, size 16, align 16, offset 1024] [from unsigned short]
+!59 = metadata !{metadata !"0x24\00unsigned short\000\0016\0016\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
+!60 = metadata !{metadata !"0xd\00_vtable_offset\00305\008\008\001040\000", metadata !30, metadata !29, metadata !61} ; [ DW_TAG_member ] [_vtable_offset] [line 305, size 8, align 8, offset 1040] [from signed char]
+!61 = metadata !{metadata !"0x24\00signed char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!62 = metadata !{metadata !"0xd\00_shortbuf\00306\008\008\001048\000", metadata !30, metadata !29, metadata !63} ; [ DW_TAG_member ] [_shortbuf] [line 306, size 8, align 8, offset 1048] [from ]
+!63 = metadata !{metadata !"0x1\00\000\008\008\000\000", null, null, metadata !11, metadata !64, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
 !64 = metadata !{metadata !65}
-!65 = metadata !{i32 786465, i64 0, i64 1}        ; [ DW_TAG_subrange_type ] [0, 0]
-!66 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_lock", i32 310, i64 64, i64 64, i64 1088, i32 0, metadata !67} ; [ DW_TAG_member ] [_lock] [line 310, size 64, align 64, offset 1088] [from ]
-!67 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!68 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_offset", i32 319, i64 64, i64 64, i64 1152, i32 0, metadata !69} ; [ DW_TAG_member ] [_offset] [line 319, size 64, align 64, offset 1152] [from __off64_t]
-!69 = metadata !{i32 786454, metadata !30, null, metadata !"__off64_t", i32 142, i64 0, i64 0, i64 0, i32 0, metadata !57} ; [ DW_TAG_typedef ] [__off64_t] [line 142, size 0, align 0, offset 0] [from long int]
-!70 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad1", i32 328, i64 64, i64 64, i64 1216, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad1] [line 328, size 64, align 64, offset 1216] [from ]
-!71 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad2", i32 329, i64 64, i64 64, i64 1280, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad2] [line 329, size 64, align 64, offset 1280] [from ]
-!72 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad3", i32 330, i64 64, i64 64, i64 1344, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad3] [line 330, size 64, align 64, offset 1344] [from ]
-!73 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad4", i32 331, i64 64, i64 64, i64 1408, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad4] [line 331, size 64, align 64, offset 1408] [from ]
-!74 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad5", i32 332, i64 64, i64 64, i64 1472, i32 0, metadata !75} ; [ DW_TAG_member ] [__pad5] [line 332, size 64, align 64, offset 1472] [from size_t]
-!75 = metadata !{i32 786454, metadata !30, null, metadata !"size_t", i32 42, i64 0, i64 0, i64 0, i32 0, metadata !76} ; [ DW_TAG_typedef ] [size_t] [line 42, size 0, align 0, offset 0] [from long unsigned int]
-!76 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!77 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_mode", i32 334, i64 32, i64 32, i64 1536, i32 0, metadata !8} ; [ DW_TAG_member ] [_mode] [line 334, size 32, align 32, offset 1536] [from int]
-!78 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_unused2", i32 336, i64 160, i64 8, i64 1568, i32 0, metadata !79} ; [ DW_TAG_member ] [_unused2] [line 336, size 160, align 8, offset 1568] [from ]
-!79 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !11, metadata !80, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!65 = metadata !{metadata !"0x21\000\001"}        ; [ DW_TAG_subrange_type ] [0, 0]
+!66 = metadata !{metadata !"0xd\00_lock\00310\0064\0064\001088\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [_lock] [line 310, size 64, align 64, offset 1088] [from ]
+!67 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!68 = metadata !{metadata !"0xd\00_offset\00319\0064\0064\001152\000", metadata !30, metadata !29, metadata !69} ; [ DW_TAG_member ] [_offset] [line 319, size 64, align 64, offset 1152] [from __off64_t]
+!69 = metadata !{metadata !"0x16\00__off64_t\00142\000\000\000\000", metadata !30, null, metadata !57} ; [ DW_TAG_typedef ] [__off64_t] [line 142, size 0, align 0, offset 0] [from long int]
+!70 = metadata !{metadata !"0xd\00__pad1\00328\0064\0064\001216\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad1] [line 328, size 64, align 64, offset 1216] [from ]
+!71 = metadata !{metadata !"0xd\00__pad2\00329\0064\0064\001280\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad2] [line 329, size 64, align 64, offset 1280] [from ]
+!72 = metadata !{metadata !"0xd\00__pad3\00330\0064\0064\001344\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad3] [line 330, size 64, align 64, offset 1344] [from ]
+!73 = metadata !{metadata !"0xd\00__pad4\00331\0064\0064\001408\000", metadata !30, metadata !29, metadata !67} ; [ DW_TAG_member ] [__pad4] [line 331, size 64, align 64, offset 1408] [from ]
+!74 = metadata !{metadata !"0xd\00__pad5\00332\0064\0064\001472\000", metadata !30, metadata !29, metadata !75} ; [ DW_TAG_member ] [__pad5] [line 332, size 64, align 64, offset 1472] [from size_t]
+!75 = metadata !{metadata !"0x16\00size_t\0042\000\000\000\000", metadata !30, null, metadata !76} ; [ DW_TAG_typedef ] [size_t] [line 42, size 0, align 0, offset 0] [from long unsigned int]
+!76 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!77 = metadata !{metadata !"0xd\00_mode\00334\0032\0032\001536\000", metadata !30, metadata !29, metadata !8} ; [ DW_TAG_member ] [_mode] [line 334, size 32, align 32, offset 1536] [from int]
+!78 = metadata !{metadata !"0xd\00_unused2\00336\00160\008\001568\000", metadata !30, metadata !29, metadata !79} ; [ DW_TAG_member ] [_unused2] [line 336, size 160, align 8, offset 1568] [from ]
+!79 = metadata !{metadata !"0x1\00\000\00160\008\000\000", null, null, metadata !11, metadata !80, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
 !80 = metadata !{metadata !81}
-!81 = metadata !{i32 786465, i64 0, i64 20}       ; [ DW_TAG_subrange_type ] [0, 19]
-!82 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"verify", metadata !"verify", metadata !"", i32 2388, metadata !83, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !86, i32 2388} ; [ DW_TAG_subprogram ] [line 2388] [local] [def] [verify]
-!83 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !84, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!81 = metadata !{metadata !"0x21\000\0020"}       ; [ DW_TAG_subrange_type ] [0, 19]
+!82 = metadata !{metadata !"0x2e\00verify\00verify\00\002388\001\001\000\006\00256\001\002388", metadata !1, metadata !5, metadata !83, null, null, null, null, metadata !86} ; [ DW_TAG_subprogram ] [line 2388] [local] [def] [verify]
+!83 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !84, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !84 = metadata !{null, metadata !8, metadata !10, metadata !85}
-!85 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from boolean]
+!85 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from boolean]
 !86 = metadata !{metadata !87, metadata !88, metadata !89, metadata !90, metadata !94, metadata !95, metadata !96, metadata !97, metadata !98, metadata !99, metadata !100, metadata !101}
-!87 = metadata !{i32 786689, metadata !82, metadata !"no_time_steps", metadata !5, i32 16779604, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [no_time_steps] [line 2388]
-!88 = metadata !{i32 786689, metadata !82, metadata !"class", metadata !5, i32 33556820, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [class] [line 2388]
-!89 = metadata !{i32 786689, metadata !82, metadata !"verified", metadata !5, i32 50334036, metadata !85, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [verified] [line 2388]
-!90 = metadata !{i32 786688, metadata !82, metadata !"xcrref", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcrref] [line 2397]
-!91 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 320, i64 64, i32 0, i32 0, metadata !20, metadata !92, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 64, offset 0] [from double]
+!87 = metadata !{metadata !"0x101\00no_time_steps\0016779604\000", metadata !82, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [no_time_steps] [line 2388]
+!88 = metadata !{metadata !"0x101\00class\0033556820\000", metadata !82, metadata !5, metadata !10} ; [ DW_TAG_arg_variable ] [class] [line 2388]
+!89 = metadata !{metadata !"0x101\00verified\0050334036\000", metadata !82, metadata !5, metadata !85} ; [ DW_TAG_arg_variable ] [verified] [line 2388]
+!90 = metadata !{metadata !"0x100\00xcrref\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcrref] [line 2397]
+!91 = metadata !{metadata !"0x1\00\000\00320\0064\000\000", null, null, metadata !20, metadata !92, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 64, offset 0] [from double]
 !92 = metadata !{metadata !93}
-!93 = metadata !{i32 786465, i64 0, i64 5}        ; [ DW_TAG_subrange_type ] [0, 4]
-!94 = metadata !{i32 786688, metadata !82, metadata !"xceref", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xceref] [line 2397]
-!95 = metadata !{i32 786688, metadata !82, metadata !"xcrdif", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcrdif] [line 2397]
-!96 = metadata !{i32 786688, metadata !82, metadata !"xcedif", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcedif] [line 2397]
-!97 = metadata !{i32 786688, metadata !82, metadata !"epsilon", metadata !5, i32 2398, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [epsilon] [line 2398]
-!98 = metadata !{i32 786688, metadata !82, metadata !"xce", metadata !5, i32 2398, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xce] [line 2398]
-!99 = metadata !{i32 786688, metadata !82, metadata !"xcr", metadata !5, i32 2398, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcr] [line 2398]
-!100 = metadata !{i32 786688, metadata !82, metadata !"dtref", metadata !5, i32 2398, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtref] [line 2398]
-!101 = metadata !{i32 786688, metadata !82, metadata !"m", metadata !5, i32 2399, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 2399]
-!102 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"rhs_norm", metadata !"rhs_norm", metadata !"", i32 266, metadata !103, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !106, i32 266} ; [ DW_TAG_subprogram ] [line 266] [local] [def] [rhs_norm]
-!103 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !104, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!93 = metadata !{metadata !"0x21\000\005"}        ; [ DW_TAG_subrange_type ] [0, 4]
+!94 = metadata !{metadata !"0x100\00xceref\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xceref] [line 2397]
+!95 = metadata !{metadata !"0x100\00xcrdif\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcrdif] [line 2397]
+!96 = metadata !{metadata !"0x100\00xcedif\002397\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcedif] [line 2397]
+!97 = metadata !{metadata !"0x100\00epsilon\002398\000", metadata !82, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [epsilon] [line 2398]
+!98 = metadata !{metadata !"0x100\00xce\002398\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xce] [line 2398]
+!99 = metadata !{metadata !"0x100\00xcr\002398\000", metadata !82, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [xcr] [line 2398]
+!100 = metadata !{metadata !"0x100\00dtref\002398\000", metadata !82, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [dtref] [line 2398]
+!101 = metadata !{metadata !"0x100\00m\002399\000", metadata !82, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 2399]
+!102 = metadata !{metadata !"0x2e\00rhs_norm\00rhs_norm\00\00266\001\001\000\006\00256\001\00266", metadata !1, metadata !5, metadata !103, null, null, null, null, metadata !106} ; [ DW_TAG_subprogram ] [line 266] [local] [def] [rhs_norm]
+!103 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !104, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !104 = metadata !{null, metadata !105}
-!105 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
+!105 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
 !106 = metadata !{metadata !107, metadata !108, metadata !109, metadata !110, metadata !111, metadata !112, metadata !113}
-!107 = metadata !{i32 786689, metadata !102, metadata !"rms", metadata !5, i32 16777482, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rms] [line 266]
-!108 = metadata !{i32 786688, metadata !102, metadata !"i", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 271]
-!109 = metadata !{i32 786688, metadata !102, metadata !"j", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 271]
-!110 = metadata !{i32 786688, metadata !102, metadata !"k", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 271]
-!111 = metadata !{i32 786688, metadata !102, metadata !"d", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 271]
-!112 = metadata !{i32 786688, metadata !102, metadata !"m", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 271]
-!113 = metadata !{i32 786688, metadata !102, metadata !"add", metadata !5, i32 272, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [add] [line 272]
-!114 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"compute_rhs", metadata !"compute_rhs", metadata !"", i32 1767, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @compute_rhs, null, null, metadata !117, i32 1767} ; [ DW_TAG_subprogram ] [line 1767] [local] [def] [compute_rhs]
-!115 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !116, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!107 = metadata !{metadata !"0x101\00rms\0016777482\000", metadata !102, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [rms] [line 266]
+!108 = metadata !{metadata !"0x100\00i\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 271]
+!109 = metadata !{metadata !"0x100\00j\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 271]
+!110 = metadata !{metadata !"0x100\00k\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 271]
+!111 = metadata !{metadata !"0x100\00d\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [d] [line 271]
+!112 = metadata !{metadata !"0x100\00m\00271\000", metadata !102, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 271]
+!113 = metadata !{metadata !"0x100\00add\00272\000", metadata !102, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [add] [line 272]
+!114 = metadata !{metadata !"0x2e\00compute_rhs\00compute_rhs\00\001767\001\001\000\006\00256\001\001767", metadata !1, metadata !5, metadata !115, null, void ()* @compute_rhs, null, null, metadata !117} ; [ DW_TAG_subprogram ] [line 1767] [local] [def] [compute_rhs]
+!115 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !116, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !116 = metadata !{null}
 !117 = metadata !{metadata !118, metadata !119, metadata !120, metadata !121, metadata !122, metadata !123, metadata !124, metadata !125, metadata !126, metadata !127, metadata !128, metadata !129, metadata !130, metadata !131}
-!118 = metadata !{i32 786688, metadata !114, metadata !"i", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1769]
-!119 = metadata !{i32 786688, metadata !114, metadata !"j", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1769]
-!120 = metadata !{i32 786688, metadata !114, metadata !"k", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1769]
-!121 = metadata !{i32 786688, metadata !114, metadata !"m", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 1769]
-!122 = metadata !{i32 786688, metadata !114, metadata !"rho_inv", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [rho_inv] [line 1770]
-!123 = metadata !{i32 786688, metadata !114, metadata !"uijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [uijk] [line 1770]
-!124 = metadata !{i32 786688, metadata !114, metadata !"up1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [up1] [line 1770]
-!125 = metadata !{i32 786688, metadata !114, metadata !"um1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [um1] [line 1770]
-!126 = metadata !{i32 786688, metadata !114, metadata !"vijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vijk] [line 1770]
-!127 = metadata !{i32 786688, metadata !114, metadata !"vp1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vp1] [line 1770]
-!128 = metadata !{i32 786688, metadata !114, metadata !"vm1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vm1] [line 1770]
-!129 = metadata !{i32 786688, metadata !114, metadata !"wijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wijk] [line 1770]
-!130 = metadata !{i32 786688, metadata !114, metadata !"wp1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wp1] [line 1770]
-!131 = metadata !{i32 786688, metadata !114, metadata !"wm1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wm1] [line 1770]
-!132 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"error_norm", metadata !"error_norm", metadata !"", i32 225, metadata !103, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !133, i32 225} ; [ DW_TAG_subprogram ] [line 225] [local] [def] [error_norm]
+!118 = metadata !{metadata !"0x100\00i\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 1769]
+!119 = metadata !{metadata !"0x100\00j\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 1769]
+!120 = metadata !{metadata !"0x100\00k\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 1769]
+!121 = metadata !{metadata !"0x100\00m\001769\000", metadata !114, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 1769]
+!122 = metadata !{metadata !"0x100\00rho_inv\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [rho_inv] [line 1770]
+!123 = metadata !{metadata !"0x100\00uijk\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [uijk] [line 1770]
+!124 = metadata !{metadata !"0x100\00up1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [up1] [line 1770]
+!125 = metadata !{metadata !"0x100\00um1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [um1] [line 1770]
+!126 = metadata !{metadata !"0x100\00vijk\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [vijk] [line 1770]
+!127 = metadata !{metadata !"0x100\00vp1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [vp1] [line 1770]
+!128 = metadata !{metadata !"0x100\00vm1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [vm1] [line 1770]
+!129 = metadata !{metadata !"0x100\00wijk\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [wijk] [line 1770]
+!130 = metadata !{metadata !"0x100\00wp1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [wp1] [line 1770]
+!131 = metadata !{metadata !"0x100\00wm1\001770\000", metadata !114, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [wm1] [line 1770]
+!132 = metadata !{metadata !"0x2e\00error_norm\00error_norm\00\00225\001\001\000\006\00256\001\00225", metadata !1, metadata !5, metadata !103, null, null, null, null, metadata !133} ; [ DW_TAG_subprogram ] [line 225] [local] [def] [error_norm]
 !133 = metadata !{metadata !134, metadata !135, metadata !136, metadata !137, metadata !138, metadata !139, metadata !140, metadata !141, metadata !142, metadata !143, metadata !144}
-!134 = metadata !{i32 786689, metadata !132, metadata !"rms", metadata !5, i32 16777441, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rms] [line 225]
-!135 = metadata !{i32 786688, metadata !132, metadata !"i", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 232]
-!136 = metadata !{i32 786688, metadata !132, metadata !"j", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 232]
-!137 = metadata !{i32 786688, metadata !132, metadata !"k", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 232]
-!138 = metadata !{i32 786688, metadata !132, metadata !"m", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 232]
-!139 = metadata !{i32 786688, metadata !132, metadata !"d", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 232]
-!140 = metadata !{i32 786688, metadata !132, metadata !"xi", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 233]
-!141 = metadata !{i32 786688, metadata !132, metadata !"eta", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 233]
-!142 = metadata !{i32 786688, metadata !132, metadata !"zeta", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 233]
-!143 = metadata !{i32 786688, metadata !132, metadata !"u_exact", metadata !5, i32 233, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [u_exact] [line 233]
-!144 = metadata !{i32 786688, metadata !132, metadata !"add", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [add] [line 233]
-!145 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"exact_solution", metadata !"exact_solution", metadata !"", i32 643, metadata !146, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !148, i32 644} ; [ DW_TAG_subprogram ] [line 643] [local] [def] [scope 644] [exact_solution]
-!146 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !147, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!134 = metadata !{metadata !"0x101\00rms\0016777441\000", metadata !132, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [rms] [line 225]
+!135 = metadata !{metadata !"0x100\00i\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 232]
+!136 = metadata !{metadata !"0x100\00j\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 232]
+!137 = metadata !{metadata !"0x100\00k\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 232]
+!138 = metadata !{metadata !"0x100\00m\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 232]
+!139 = metadata !{metadata !"0x100\00d\00232\000", metadata !132, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [d] [line 232]
+!140 = metadata !{metadata !"0x100\00xi\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [xi] [line 233]
+!141 = metadata !{metadata !"0x100\00eta\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [eta] [line 233]
+!142 = metadata !{metadata !"0x100\00zeta\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [zeta] [line 233]
+!143 = metadata !{metadata !"0x100\00u_exact\00233\000", metadata !132, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [u_exact] [line 233]
+!144 = metadata !{metadata !"0x100\00add\00233\000", metadata !132, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [add] [line 233]
+!145 = metadata !{metadata !"0x2e\00exact_solution\00exact_solution\00\00643\001\001\000\006\00256\001\00644", metadata !1, metadata !5, metadata !146, null, null, null, null, metadata !148} ; [ DW_TAG_subprogram ] [line 643] [local] [def] [scope 644] [exact_solution]
+!146 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !147, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !147 = metadata !{null, metadata !20, metadata !20, metadata !20, metadata !105}
 !148 = metadata !{metadata !149, metadata !150, metadata !151, metadata !152, metadata !153}
-!149 = metadata !{i32 786689, metadata !145, metadata !"xi", metadata !5, i32 16777859, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [xi] [line 643]
-!150 = metadata !{i32 786689, metadata !145, metadata !"eta", metadata !5, i32 33555075, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [eta] [line 643]
-!151 = metadata !{i32 786689, metadata !145, metadata !"zeta", metadata !5, i32 50332291, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [zeta] [line 643]
-!152 = metadata !{i32 786689, metadata !145, metadata !"dtemp", metadata !5, i32 67109508, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [dtemp] [line 644]
-!153 = metadata !{i32 786688, metadata !145, metadata !"m", metadata !5, i32 653, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 653]
-!154 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"set_constants", metadata !"set_constants", metadata !"", i32 2191, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 2191} ; [ DW_TAG_subprogram ] [line 2191] [local] [def] [set_constants]
-!155 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsinit", metadata !"lhsinit", metadata !"", i32 855, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !156, i32 855} ; [ DW_TAG_subprogram ] [line 855] [local] [def] [lhsinit]
+!149 = metadata !{metadata !"0x101\00xi\0016777859\000", metadata !145, metadata !5, metadata !20} ; [ DW_TAG_arg_variable ] [xi] [line 643]
+!150 = metadata !{metadata !"0x101\00eta\0033555075\000", metadata !145, metadata !5, metadata !20} ; [ DW_TAG_arg_variable ] [eta] [line 643]
+!151 = metadata !{metadata !"0x101\00zeta\0050332291\000", metadata !145, metadata !5, metadata !20} ; [ DW_TAG_arg_variable ] [zeta] [line 643]
+!152 = metadata !{metadata !"0x101\00dtemp\0067109508\000", metadata !145, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [dtemp] [line 644]
+!153 = metadata !{metadata !"0x100\00m\00653\000", metadata !145, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 653]
+!154 = metadata !{metadata !"0x2e\00set_constants\00set_constants\00\002191\001\001\000\006\00256\001\002191", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2191] [local] [def] [set_constants]
+!155 = metadata !{metadata !"0x2e\00lhsinit\00lhsinit\00\00855\001\001\000\006\00256\001\00855", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !156} ; [ DW_TAG_subprogram ] [line 855] [local] [def] [lhsinit]
 !156 = metadata !{metadata !157, metadata !158, metadata !159, metadata !160, metadata !161}
-!157 = metadata !{i32 786688, metadata !155, metadata !"i", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 857]
-!158 = metadata !{i32 786688, metadata !155, metadata !"j", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 857]
-!159 = metadata !{i32 786688, metadata !155, metadata !"k", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 857]
-!160 = metadata !{i32 786688, metadata !155, metadata !"m", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 857]
-!161 = metadata !{i32 786688, metadata !155, metadata !"n", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 857]
-!162 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"initialize", metadata !"initialize", metadata !"", i32 669, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !163, i32 669} ; [ DW_TAG_subprogram ] [line 669] [local] [def] [initialize]
+!157 = metadata !{metadata !"0x100\00i\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 857]
+!158 = metadata !{metadata !"0x100\00j\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 857]
+!159 = metadata !{metadata !"0x100\00k\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 857]
+!160 = metadata !{metadata !"0x100\00m\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 857]
+!161 = metadata !{metadata !"0x100\00n\00857\000", metadata !155, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 857]
+!162 = metadata !{metadata !"0x2e\00initialize\00initialize\00\00669\001\001\000\006\00256\001\00669", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !163} ; [ DW_TAG_subprogram ] [line 669] [local] [def] [initialize]
 !163 = metadata !{metadata !164, metadata !165, metadata !166, metadata !167, metadata !168, metadata !169, metadata !170, metadata !171, metadata !172, metadata !173, metadata !174, metadata !179, metadata !180, metadata !181, metadata !182}
-!164 = metadata !{i32 786688, metadata !162, metadata !"i", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 679]
-!165 = metadata !{i32 786688, metadata !162, metadata !"j", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 679]
-!166 = metadata !{i32 786688, metadata !162, metadata !"k", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 679]
-!167 = metadata !{i32 786688, metadata !162, metadata !"m", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 679]
-!168 = metadata !{i32 786688, metadata !162, metadata !"ix", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ix] [line 679]
-!169 = metadata !{i32 786688, metadata !162, metadata !"iy", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [iy] [line 679]
-!170 = metadata !{i32 786688, metadata !162, metadata !"iz", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [iz] [line 679]
-!171 = metadata !{i32 786688, metadata !162, metadata !"xi", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 680]
-!172 = metadata !{i32 786688, metadata !162, metadata !"eta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 680]
-!173 = metadata !{i32 786688, metadata !162, metadata !"zeta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 680]
-!174 = metadata !{i32 786688, metadata !162, metadata !"Pface", metadata !5, i32 680, metadata !175, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pface] [line 680]
-!175 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1920, i64 64, i32 0, i32 0, metadata !20, metadata !176, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1920, align 64, offset 0] [from double]
+!164 = metadata !{metadata !"0x100\00i\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 679]
+!165 = metadata !{metadata !"0x100\00j\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 679]
+!166 = metadata !{metadata !"0x100\00k\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 679]
+!167 = metadata !{metadata !"0x100\00m\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 679]
+!168 = metadata !{metadata !"0x100\00ix\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ix] [line 679]
+!169 = metadata !{metadata !"0x100\00iy\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [iy] [line 679]
+!170 = metadata !{metadata !"0x100\00iz\00679\000", metadata !162, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [iz] [line 679]
+!171 = metadata !{metadata !"0x100\00xi\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [xi] [line 680]
+!172 = metadata !{metadata !"0x100\00eta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [eta] [line 680]
+!173 = metadata !{metadata !"0x100\00zeta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [zeta] [line 680]
+!174 = metadata !{metadata !"0x100\00Pface\00680\000", metadata !162, metadata !5, metadata !175} ; [ DW_TAG_auto_variable ] [Pface] [line 680]
+!175 = metadata !{metadata !"0x1\00\000\001920\0064\000\000", null, null, metadata !20, metadata !176, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1920, align 64, offset 0] [from double]
 !176 = metadata !{metadata !177, metadata !178, metadata !93}
-!177 = metadata !{i32 786465, i64 0, i64 2}       ; [ DW_TAG_subrange_type ] [0, 1]
-!178 = metadata !{i32 786465, i64 0, i64 3}       ; [ DW_TAG_subrange_type ] [0, 2]
-!179 = metadata !{i32 786688, metadata !162, metadata !"Pxi", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pxi] [line 680]
-!180 = metadata !{i32 786688, metadata !162, metadata !"Peta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Peta] [line 680]
-!181 = metadata !{i32 786688, metadata !162, metadata !"Pzeta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pzeta] [line 680]
-!182 = metadata !{i32 786688, metadata !162, metadata !"temp", metadata !5, i32 680, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [temp] [line 680]
-!183 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"exact_rhs", metadata !"exact_rhs", metadata !"", i32 301, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !184, i32 301} ; [ DW_TAG_subprogram ] [line 301] [local] [def] [exact_rhs]
+!177 = metadata !{metadata !"0x21\000\002"}       ; [ DW_TAG_subrange_type ] [0, 1]
+!178 = metadata !{metadata !"0x21\000\003"}       ; [ DW_TAG_subrange_type ] [0, 2]
+!179 = metadata !{metadata !"0x100\00Pxi\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [Pxi] [line 680]
+!180 = metadata !{metadata !"0x100\00Peta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [Peta] [line 680]
+!181 = metadata !{metadata !"0x100\00Pzeta\00680\000", metadata !162, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [Pzeta] [line 680]
+!182 = metadata !{metadata !"0x100\00temp\00680\000", metadata !162, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [temp] [line 680]
+!183 = metadata !{metadata !"0x2e\00exact_rhs\00exact_rhs\00\00301\001\001\000\006\00256\001\00301", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !184} ; [ DW_TAG_subprogram ] [line 301] [local] [def] [exact_rhs]
 !184 = metadata !{metadata !185, metadata !186, metadata !187, metadata !188, metadata !189, metadata !190, metadata !191, metadata !192, metadata !193, metadata !194, metadata !195, metadata !196, metadata !197, metadata !198, metadata !199}
-!185 = metadata !{i32 786688, metadata !183, metadata !"dtemp", metadata !5, i32 310, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtemp] [line 310]
-!186 = metadata !{i32 786688, metadata !183, metadata !"xi", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 310]
-!187 = metadata !{i32 786688, metadata !183, metadata !"eta", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 310]
-!188 = metadata !{i32 786688, metadata !183, metadata !"zeta", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 310]
-!189 = metadata !{i32 786688, metadata !183, metadata !"dtpp", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtpp] [line 310]
-!190 = metadata !{i32 786688, metadata !183, metadata !"m", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 311]
-!191 = metadata !{i32 786688, metadata !183, metadata !"i", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 311]
-!192 = metadata !{i32 786688, metadata !183, metadata !"j", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 311]
-!193 = metadata !{i32 786688, metadata !183, metadata !"k", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 311]
-!194 = metadata !{i32 786688, metadata !183, metadata !"ip1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ip1] [line 311]
-!195 = metadata !{i32 786688, metadata !183, metadata !"im1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [im1] [line 311]
-!196 = metadata !{i32 786688, metadata !183, metadata !"jp1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jp1] [line 311]
-!197 = metadata !{i32 786688, metadata !183, metadata !"jm1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jm1] [line 311]
-!198 = metadata !{i32 786688, metadata !183, metadata !"km1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [km1] [line 311]
-!199 = metadata !{i32 786688, metadata !183, metadata !"kp1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [kp1] [line 311]
-!200 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"adi", metadata !"adi", metadata !"", i32 210, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 210} ; [ DW_TAG_subprogram ] [line 210] [local] [def] [adi]
-!201 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"add", metadata !"add", metadata !"", i32 187, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !202, i32 187} ; [ DW_TAG_subprogram ] [line 187] [local] [def] [add]
+!185 = metadata !{metadata !"0x100\00dtemp\00310\000", metadata !183, metadata !5, metadata !91} ; [ DW_TAG_auto_variable ] [dtemp] [line 310]
+!186 = metadata !{metadata !"0x100\00xi\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [xi] [line 310]
+!187 = metadata !{metadata !"0x100\00eta\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [eta] [line 310]
+!188 = metadata !{metadata !"0x100\00zeta\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [zeta] [line 310]
+!189 = metadata !{metadata !"0x100\00dtpp\00310\000", metadata !183, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [dtpp] [line 310]
+!190 = metadata !{metadata !"0x100\00m\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 311]
+!191 = metadata !{metadata !"0x100\00i\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 311]
+!192 = metadata !{metadata !"0x100\00j\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 311]
+!193 = metadata !{metadata !"0x100\00k\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 311]
+!194 = metadata !{metadata !"0x100\00ip1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ip1] [line 311]
+!195 = metadata !{metadata !"0x100\00im1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [im1] [line 311]
+!196 = metadata !{metadata !"0x100\00jp1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [jp1] [line 311]
+!197 = metadata !{metadata !"0x100\00jm1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [jm1] [line 311]
+!198 = metadata !{metadata !"0x100\00km1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [km1] [line 311]
+!199 = metadata !{metadata !"0x100\00kp1\00311\000", metadata !183, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [kp1] [line 311]
+!200 = metadata !{metadata !"0x2e\00adi\00adi\00\00210\001\001\000\006\00256\001\00210", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 210] [local] [def] [adi]
+!201 = metadata !{metadata !"0x2e\00add\00add\00\00187\001\001\000\006\00256\001\00187", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !202} ; [ DW_TAG_subprogram ] [line 187] [local] [def] [add]
 !202 = metadata !{metadata !203, metadata !204, metadata !205, metadata !206}
-!203 = metadata !{i32 786688, metadata !201, metadata !"i", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 193]
-!204 = metadata !{i32 786688, metadata !201, metadata !"j", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 193]
-!205 = metadata !{i32 786688, metadata !201, metadata !"k", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 193]
-!206 = metadata !{i32 786688, metadata !201, metadata !"m", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 193]
-!207 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_solve", metadata !"z_solve", metadata !"", i32 3457, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 3457} ; [ DW_TAG_subprogram ] [line 3457] [local] [def] [z_solve]
-!208 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_backsubstitute", metadata !"z_backsubstitute", metadata !"", i32 3480, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !209, i32 3480} ; [ DW_TAG_subprogram ] [line 3480] [local] [def] [z_backsubstitute]
+!203 = metadata !{metadata !"0x100\00i\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 193]
+!204 = metadata !{metadata !"0x100\00j\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 193]
+!205 = metadata !{metadata !"0x100\00k\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 193]
+!206 = metadata !{metadata !"0x100\00m\00193\000", metadata !201, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 193]
+!207 = metadata !{metadata !"0x2e\00z_solve\00z_solve\00\003457\001\001\000\006\00256\001\003457", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3457] [local] [def] [z_solve]
+!208 = metadata !{metadata !"0x2e\00z_backsubstitute\00z_backsubstitute\00\003480\001\001\000\006\00256\001\003480", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !209} ; [ DW_TAG_subprogram ] [line 3480] [local] [def] [z_backsubstitute]
 !209 = metadata !{metadata !210, metadata !211, metadata !212, metadata !213, metadata !214}
-!210 = metadata !{i32 786688, metadata !208, metadata !"i", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3492]
-!211 = metadata !{i32 786688, metadata !208, metadata !"j", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3492]
-!212 = metadata !{i32 786688, metadata !208, metadata !"k", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3492]
-!213 = metadata !{i32 786688, metadata !208, metadata !"m", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 3492]
-!214 = metadata !{i32 786688, metadata !208, metadata !"n", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 3492]
-!215 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_solve_cell", metadata !"z_solve_cell", metadata !"", i32 3512, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !216, i32 3512} ; [ DW_TAG_subprogram ] [line 3512] [local] [def] [z_solve_cell]
+!210 = metadata !{metadata !"0x100\00i\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3492]
+!211 = metadata !{metadata !"0x100\00j\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3492]
+!212 = metadata !{metadata !"0x100\00k\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3492]
+!213 = metadata !{metadata !"0x100\00m\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 3492]
+!214 = metadata !{metadata !"0x100\00n\003492\000", metadata !208, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 3492]
+!215 = metadata !{metadata !"0x2e\00z_solve_cell\00z_solve_cell\00\003512\001\001\000\006\00256\001\003512", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !216} ; [ DW_TAG_subprogram ] [line 3512] [local] [def] [z_solve_cell]
 !216 = metadata !{metadata !217, metadata !218, metadata !219, metadata !220}
-!217 = metadata !{i32 786688, metadata !215, metadata !"i", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3527]
-!218 = metadata !{i32 786688, metadata !215, metadata !"j", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3527]
-!219 = metadata !{i32 786688, metadata !215, metadata !"k", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3527]
-!220 = metadata !{i32 786688, metadata !215, metadata !"ksize", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ksize] [line 3527]
-!221 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"binvrhs", metadata !"binvrhs", metadata !"", i32 3154, metadata !222, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !225, i32 3154} ; [ DW_TAG_subprogram ] [line 3154] [local] [def] [binvrhs]
-!222 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !223, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!217 = metadata !{metadata !"0x100\00i\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3527]
+!218 = metadata !{metadata !"0x100\00j\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3527]
+!219 = metadata !{metadata !"0x100\00k\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3527]
+!220 = metadata !{metadata !"0x100\00ksize\003527\000", metadata !215, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [ksize] [line 3527]
+!221 = metadata !{metadata !"0x2e\00binvrhs\00binvrhs\00\003154\001\001\000\006\00256\001\003154", metadata !1, metadata !5, metadata !222, null, null, null, null, metadata !225} ; [ DW_TAG_subprogram ] [line 3154] [local] [def] [binvrhs]
+!222 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !223, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !223 = metadata !{null, metadata !224, metadata !105}
-!224 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !91} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!224 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !91} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
 !225 = metadata !{metadata !226, metadata !227, metadata !228, metadata !229}
-!226 = metadata !{i32 786689, metadata !221, metadata !"lhs", metadata !5, i32 16780370, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [lhs] [line 3154]
-!227 = metadata !{i32 786689, metadata !221, metadata !"r", metadata !5, i32 33557586, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 3154]
-!228 = metadata !{i32 786688, metadata !221, metadata !"pivot", metadata !5, i32 3159, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pivot] [line 3159]
-!229 = metadata !{i32 786688, metadata !221, metadata !"coeff", metadata !5, i32 3159, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [coeff] [line 3159]
-!230 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"matmul_sub", metadata !"matmul_sub", metadata !"", i32 2841, metadata !231, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !233, i32 2842} ; [ DW_TAG_subprogram ] [line 2841] [local] [def] [scope 2842] [matmul_sub]
-!231 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !232, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!226 = metadata !{metadata !"0x101\00lhs\0016780370\000", metadata !221, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [lhs] [line 3154]
+!227 = metadata !{metadata !"0x101\00r\0033557586\000", metadata !221, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [r] [line 3154]
+!228 = metadata !{metadata !"0x100\00pivot\003159\000", metadata !221, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [pivot] [line 3159]
+!229 = metadata !{metadata !"0x100\00coeff\003159\000", metadata !221, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [coeff] [line 3159]
+!230 = metadata !{metadata !"0x2e\00matmul_sub\00matmul_sub\00\002841\001\001\000\006\00256\001\002842", metadata !1, metadata !5, metadata !231, null, null, null, null, metadata !233} ; [ DW_TAG_subprogram ] [line 2841] [local] [def] [scope 2842] [matmul_sub]
+!231 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !232, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !232 = metadata !{null, metadata !224, metadata !224, metadata !224}
 !233 = metadata !{metadata !234, metadata !235, metadata !236, metadata !237}
-!234 = metadata !{i32 786689, metadata !230, metadata !"ablock", metadata !5, i32 16780057, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ablock] [line 2841]
-!235 = metadata !{i32 786689, metadata !230, metadata !"bblock", metadata !5, i32 33557273, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bblock] [line 2841]
-!236 = metadata !{i32 786689, metadata !230, metadata !"cblock", metadata !5, i32 50334490, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [cblock] [line 2842]
-!237 = metadata !{i32 786688, metadata !230, metadata !"j", metadata !5, i32 2851, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2851]
-!238 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"matvec_sub", metadata !"matvec_sub", metadata !"", i32 2814, metadata !239, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !241, i32 2814} ; [ DW_TAG_subprogram ] [line 2814] [local] [def] [matvec_sub]
-!239 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !240, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!234 = metadata !{metadata !"0x101\00ablock\0016780057\000", metadata !230, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [ablock] [line 2841]
+!235 = metadata !{metadata !"0x101\00bblock\0033557273\000", metadata !230, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [bblock] [line 2841]
+!236 = metadata !{metadata !"0x101\00cblock\0050334490\000", metadata !230, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [cblock] [line 2842]
+!237 = metadata !{metadata !"0x100\00j\002851\000", metadata !230, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 2851]
+!238 = metadata !{metadata !"0x2e\00matvec_sub\00matvec_sub\00\002814\001\001\000\006\00256\001\002814", metadata !1, metadata !5, metadata !239, null, null, null, null, metadata !241} ; [ DW_TAG_subprogram ] [line 2814] [local] [def] [matvec_sub]
+!239 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !240, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !240 = metadata !{null, metadata !224, metadata !105, metadata !105}
 !241 = metadata !{metadata !242, metadata !243, metadata !244, metadata !245}
-!242 = metadata !{i32 786689, metadata !238, metadata !"ablock", metadata !5, i32 16780030, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ablock] [line 2814]
-!243 = metadata !{i32 786689, metadata !238, metadata !"avec", metadata !5, i32 33557246, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [avec] [line 2814]
-!244 = metadata !{i32 786689, metadata !238, metadata !"bvec", metadata !5, i32 50334462, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bvec] [line 2814]
-!245 = metadata !{i32 786688, metadata !238, metadata !"i", metadata !5, i32 2823, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2823]
-!246 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"binvcrhs", metadata !"binvcrhs", metadata !"", i32 2885, metadata !247, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !249, i32 2885} ; [ DW_TAG_subprogram ] [line 2885] [local] [def] [binvcrhs]
-!247 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !248, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!242 = metadata !{metadata !"0x101\00ablock\0016780030\000", metadata !238, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [ablock] [line 2814]
+!243 = metadata !{metadata !"0x101\00avec\0033557246\000", metadata !238, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [avec] [line 2814]
+!244 = metadata !{metadata !"0x101\00bvec\0050334462\000", metadata !238, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [bvec] [line 2814]
+!245 = metadata !{metadata !"0x100\00i\002823\000", metadata !238, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 2823]
+!246 = metadata !{metadata !"0x2e\00binvcrhs\00binvcrhs\00\002885\001\001\000\006\00256\001\002885", metadata !1, metadata !5, metadata !247, null, null, null, null, metadata !249} ; [ DW_TAG_subprogram ] [line 2885] [local] [def] [binvcrhs]
+!247 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !248, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !248 = metadata !{null, metadata !224, metadata !224, metadata !105}
 !249 = metadata !{metadata !250, metadata !251, metadata !252, metadata !253, metadata !254}
-!250 = metadata !{i32 786689, metadata !246, metadata !"lhs", metadata !5, i32 16780101, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [lhs] [line 2885]
-!251 = metadata !{i32 786689, metadata !246, metadata !"c", metadata !5, i32 33557317, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 2885]
-!252 = metadata !{i32 786689, metadata !246, metadata !"r", metadata !5, i32 50334533, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 2885]
-!253 = metadata !{i32 786688, metadata !246, metadata !"pivot", metadata !5, i32 2890, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pivot] [line 2890]
-!254 = metadata !{i32 786688, metadata !246, metadata !"coeff", metadata !5, i32 2890, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [coeff] [line 2890]
-!255 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsz", metadata !"lhsz", metadata !"", i32 1475, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !256, i32 1475} ; [ DW_TAG_subprogram ] [line 1475] [local] [def] [lhsz]
+!250 = metadata !{metadata !"0x101\00lhs\0016780101\000", metadata !246, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [lhs] [line 2885]
+!251 = metadata !{metadata !"0x101\00c\0033557317\000", metadata !246, metadata !5, metadata !224} ; [ DW_TAG_arg_variable ] [c] [line 2885]
+!252 = metadata !{metadata !"0x101\00r\0050334533\000", metadata !246, metadata !5, metadata !105} ; [ DW_TAG_arg_variable ] [r] [line 2885]
+!253 = metadata !{metadata !"0x100\00pivot\002890\000", metadata !246, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [pivot] [line 2890]
+!254 = metadata !{metadata !"0x100\00coeff\002890\000", metadata !246, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [coeff] [line 2890]
+!255 = metadata !{metadata !"0x2e\00lhsz\00lhsz\00\001475\001\001\000\006\00256\001\001475", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !256} ; [ DW_TAG_subprogram ] [line 1475] [local] [def] [lhsz]
 !256 = metadata !{metadata !257, metadata !258, metadata !259}
-!257 = metadata !{i32 786688, metadata !255, metadata !"i", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1484]
-!258 = metadata !{i32 786688, metadata !255, metadata !"j", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1484]
-!259 = metadata !{i32 786688, metadata !255, metadata !"k", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1484]
-!260 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_solve", metadata !"y_solve", metadata !"", i32 3299, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 3299} ; [ DW_TAG_subprogram ] [line 3299] [local] [def] [y_solve]
-!261 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_backsubstitute", metadata !"y_backsubstitute", metadata !"", i32 3323, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !262, i32 3323} ; [ DW_TAG_subprogram ] [line 3323] [local] [def] [y_backsubstitute]
+!257 = metadata !{metadata !"0x100\00i\001484\000", metadata !255, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 1484]
+!258 = metadata !{metadata !"0x100\00j\001484\000", metadata !255, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 1484]
+!259 = metadata !{metadata !"0x100\00k\001484\000", metadata !255, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 1484]
+!260 = metadata !{metadata !"0x2e\00y_solve\00y_solve\00\003299\001\001\000\006\00256\001\003299", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3299] [local] [def] [y_solve]
+!261 = metadata !{metadata !"0x2e\00y_backsubstitute\00y_backsubstitute\00\003323\001\001\000\006\00256\001\003323", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !262} ; [ DW_TAG_subprogram ] [line 3323] [local] [def] [y_backsubstitute]
 !262 = metadata !{metadata !263, metadata !264, metadata !265, metadata !266, metadata !267}
-!263 = metadata !{i32 786688, metadata !261, metadata !"i", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3335]
-!264 = metadata !{i32 786688, metadata !261, metadata !"j", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3335]
-!265 = metadata !{i32 786688, metadata !261, metadata !"k", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3335]
-!266 = metadata !{i32 786688, metadata !261, metadata !"m", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 3335]
-!267 = metadata !{i32 786688, metadata !261, metadata !"n", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 3335]
-!268 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_solve_cell", metadata !"y_solve_cell", metadata !"", i32 3355, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !269, i32 3355} ; [ DW_TAG_subprogram ] [line 3355] [local] [def] [y_solve_cell]
+!263 = metadata !{metadata !"0x100\00i\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3335]
+!264 = metadata !{metadata !"0x100\00j\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3335]
+!265 = metadata !{metadata !"0x100\00k\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3335]
+!266 = metadata !{metadata !"0x100\00m\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 3335]
+!267 = metadata !{metadata !"0x100\00n\003335\000", metadata !261, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 3335]
+!268 = metadata !{metadata !"0x2e\00y_solve_cell\00y_solve_cell\00\003355\001\001\000\006\00256\001\003355", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !269} ; [ DW_TAG_subprogram ] [line 3355] [local] [def] [y_solve_cell]
 !269 = metadata !{metadata !270, metadata !271, metadata !272, metadata !273}
-!270 = metadata !{i32 786688, metadata !268, metadata !"i", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3370]
-!271 = metadata !{i32 786688, metadata !268, metadata !"j", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3370]
-!272 = metadata !{i32 786688, metadata !268, metadata !"k", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3370]
-!273 = metadata !{i32 786688, metadata !268, metadata !"jsize", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jsize] [line 3370]
-!274 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsy", metadata !"lhsy", metadata !"", i32 1181, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !275, i32 1181} ; [ DW_TAG_subprogram ] [line 1181] [local] [def] [lhsy]
+!270 = metadata !{metadata !"0x100\00i\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3370]
+!271 = metadata !{metadata !"0x100\00j\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 3370]
+!272 = metadata !{metadata !"0x100\00k\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 3370]
+!273 = metadata !{metadata !"0x100\00jsize\003370\000", metadata !268, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [jsize] [line 3370]
+!274 = metadata !{metadata !"0x2e\00lhsy\00lhsy\00\001181\001\001\000\006\00256\001\001181", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !275} ; [ DW_TAG_subprogram ] [line 1181] [local] [def] [lhsy]
 !275 = metadata !{metadata !276, metadata !277, metadata !278}
-!276 = metadata !{i32 786688, metadata !274, metadata !"i", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1190]
-!277 = metadata !{i32 786688, metadata !274, metadata !"j", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1190]
-!278 = metadata !{i32 786688, metadata !274, metadata !"k", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1190]
-!279 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_solve", metadata !"x_solve", metadata !"", i32 2658, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 2658} ; [ DW_TAG_subprogram ] [line 2658] [local] [def] [x_solve]
-!280 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_backsubstitute", metadata !"x_backsubstitute", metadata !"", i32 2684, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !281, i32 2684} ; [ DW_TAG_subprogram ] [line 2684] [local] [def] [x_backsubstitute]
+!276 = metadata !{metadata !"0x100\00i\001190\000", metadata !274, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 1190]
+!277 = metadata !{metadata !"0x100\00j\001190\000", metadata !274, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 1190]
+!278 = metadata !{metadata !"0x100\00k\001190\000", metadata !274, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 1190]
+!279 = metadata !{metadata !"0x2e\00x_solve\00x_solve\00\002658\001\001\000\006\00256\001\002658", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2658] [local] [def] [x_solve]
+!280 = metadata !{metadata !"0x2e\00x_backsubstitute\00x_backsubstitute\00\002684\001\001\000\006\00256\001\002684", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !281} ; [ DW_TAG_subprogram ] [line 2684] [local] [def] [x_backsubstitute]
 !281 = metadata !{metadata !282, metadata !283, metadata !284, metadata !285, metadata !286}
-!282 = metadata !{i32 786688, metadata !280, metadata !"i", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2696]
-!283 = metadata !{i32 786688, metadata !280, metadata !"j", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2696]
-!284 = metadata !{i32 786688, metadata !280, metadata !"k", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 2696]
-!285 = metadata !{i32 786688, metadata !280, metadata !"m", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 2696]
-!286 = metadata !{i32 786688, metadata !280, metadata !"n", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 2696]
-!287 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_solve_cell", metadata !"x_solve_cell", metadata !"", i32 2716, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !288, i32 2716} ; [ DW_TAG_subprogram ] [line 2716] [local] [def] [x_solve_cell]
+!282 = metadata !{metadata !"0x100\00i\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 2696]
+!283 = metadata !{metadata !"0x100\00j\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 2696]
+!284 = metadata !{metadata !"0x100\00k\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 2696]
+!285 = metadata !{metadata !"0x100\00m\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [m] [line 2696]
+!286 = metadata !{metadata !"0x100\00n\002696\000", metadata !280, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [n] [line 2696]
+!287 = metadata !{metadata !"0x2e\00x_solve_cell\00x_solve_cell\00\002716\001\001\000\006\00256\001\002716", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !288} ; [ DW_TAG_subprogram ] [line 2716] [local] [def] [x_solve_cell]
 !288 = metadata !{metadata !289, metadata !290, metadata !291, metadata !292}
-!289 = metadata !{i32 786688, metadata !287, metadata !"i", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2728]
-!290 = metadata !{i32 786688, metadata !287, metadata !"j", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2728]
-!291 = metadata !{i32 786688, metadata !287, metadata !"k", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 2728]
-!292 = metadata !{i32 786688, metadata !287, metadata !"isize", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [isize] [line 2728]
-!293 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsx", metadata !"lhsx", metadata !"", i32 898, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !294, i32 898} ; [ DW_TAG_subprogram ] [line 898] [local] [def] [lhsx]
+!289 = metadata !{metadata !"0x100\00i\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 2728]
+!290 = metadata !{metadata !"0x100\00j\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 2728]
+!291 = metadata !{metadata !"0x100\00k\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 2728]
+!292 = metadata !{metadata !"0x100\00isize\002728\000", metadata !287, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [isize] [line 2728]
+!293 = metadata !{metadata !"0x2e\00lhsx\00lhsx\00\00898\001\001\000\006\00256\001\00898", metadata !1, metadata !5, metadata !115, null, null, null, null, metadata !294} ; [ DW_TAG_subprogram ] [line 898] [local] [def] [lhsx]
 !294 = metadata !{metadata !295, metadata !296, metadata !297}
-!295 = metadata !{i32 786688, metadata !293, metadata !"i", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 907]
-!296 = metadata !{i32 786688, metadata !293, metadata !"j", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 907]
-!297 = metadata !{i32 786688, metadata !293, metadata !"k", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 907]
+!295 = metadata !{metadata !"0x100\00i\00907\000", metadata !293, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 907]
+!296 = metadata !{metadata !"0x100\00j\00907\000", metadata !293, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [j] [line 907]
+!297 = metadata !{metadata !"0x100\00k\00907\000", metadata !293, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [k] [line 907]
 !298 = metadata !{metadata !299, metadata !304, metadata !305, metadata !309, metadata !310, metadata !311, metadata !312, metadata !313, metadata !314, metadata !315, metadata !316, metadata !317, metadata !318, metadata !319, metadata !320, metadata !321, metadata !322, metadata !323, metadata !324, metadata !325, metadata !326, metadata !327, metadata !328, metadata !329, metadata !330, metadata !331, metadata !332, metadata !333, metadata !334, metadata !335, metadata !336, metadata !337, metadata !338, metadata !339, metadata !340, metadata !341, metadata !342, metadata !343, metadata !347, metadata !350, metadata !351, metadata !352, metadata !353, metadata !354, metadata !355, metadata !356, metadata !360, metadata !361, metadata !362, metadata !363, metadata !364, metadata !365, metadata !366, metadata !367, metadata !368, metadata !369, metadata !370, metadata !371, metadata !372, metadata !373, metadata !374, metadata !375, metadata !376, metadata !377, metadata !378, metadata !379, metadata !380, metadata !381, metadata !382, metadata !383, metadata !384, metadata !385, metadata !386, metadata !387, metadata !388, metadata !389, metadata !390, metadata !391, metadata !392, metadata !393, metadata !394, metadata !395, metadata !396, metadata !397, metadata !398, metadata !399, metadata !400, metadata !401, metadata !402, metadata !403, metadata !404, metadata !405, metadata !406, metadata !407, metadata !408, metadata !409, metadata !410, metadata !411, metadata !412, metadata !413, metadata !414, metadata !415, metadata !416, metadata !417, metadata !418, metadata !419, metadata !422, metadata !426, metadata !427, metadata !430, metadata !431, metadata !434, metadata !435, metadata !436, metadata !437}
-!299 = metadata !{i32 786484, i32 0, null, metadata !"grid_points", metadata !"grid_points", metadata !"", metadata !300, i32 28, metadata !302, i32 1, i32 1, [3 x i32]* @grid_points, null} ; [ DW_TAG_variable ] [grid_points] [line 28] [local] [def]
-!300 = metadata !{i32 786473, metadata !301}      ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/./header.h]
+!299 = metadata !{metadata !"0x34\00grid_points\00grid_points\00\0028\001\001", null, metadata !300, metadata !302, [3 x i32]* @grid_points, null} ; [ DW_TAG_variable ] [grid_points] [line 28] [local] [def]
+!300 = metadata !{metadata !"0x29", metadata !301}      ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/./header.h]
 !301 = metadata !{metadata !"./header.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
-!302 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 96, i64 32, i32 0, i32 0, metadata !8, metadata !303, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 96, align 32, offset 0] [from int]
+!302 = metadata !{metadata !"0x1\00\000\0096\0032\000\000", null, null, metadata !8, metadata !303, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 96, align 32, offset 0] [from int]
 !303 = metadata !{metadata !178}
-!304 = metadata !{i32 786484, i32 0, null, metadata !"dt", metadata !"dt", metadata !"", metadata !300, i32 35, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dt] [line 35] [local] [def]
-!305 = metadata !{i32 786484, i32 0, null, metadata !"rhs", metadata !"rhs", metadata !"", metadata !300, i32 68, metadata !306, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [rhs] [line 68] [local] [def]
-!306 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1385839040, i64 64, i32 0, i32 0, metadata !20, metadata !307, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1385839040, align 64, offset 0] [from double]
+!304 = metadata !{metadata !"0x34\00dt\00dt\00\0035\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dt] [line 35] [local] [def]
+!305 = metadata !{metadata !"0x34\00rhs\00rhs\00\0068\001\001", null, metadata !300, metadata !306, null, null} ; [ DW_TAG_variable ] [rhs] [line 68] [local] [def]
+!306 = metadata !{metadata !"0x1\00\000\001385839040\0064\000\000", null, null, metadata !20, metadata !307, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1385839040, align 64, offset 0] [from double]
 !307 = metadata !{metadata !308, metadata !308, metadata !308, metadata !93}
-!308 = metadata !{i32 786465, i64 0, i64 163}     ; [ DW_TAG_subrange_type ] [0, 162]
-!309 = metadata !{i32 786484, i32 0, null, metadata !"zzcon5", metadata !"zzcon5", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon5] [line 42] [local] [def]
-!310 = metadata !{i32 786484, i32 0, null, metadata !"zzcon4", metadata !"zzcon4", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon4] [line 42] [local] [def]
-!311 = metadata !{i32 786484, i32 0, null, metadata !"zzcon3", metadata !"zzcon3", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon3] [line 42] [local] [def]
-!312 = metadata !{i32 786484, i32 0, null, metadata !"dz5tz1", metadata !"dz5tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz5tz1] [line 43] [local] [def]
-!313 = metadata !{i32 786484, i32 0, null, metadata !"dz4tz1", metadata !"dz4tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz4tz1] [line 43] [local] [def]
-!314 = metadata !{i32 786484, i32 0, null, metadata !"dz3tz1", metadata !"dz3tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz3tz1] [line 43] [local] [def]
-!315 = metadata !{i32 786484, i32 0, null, metadata !"zzcon2", metadata !"zzcon2", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon2] [line 42] [local] [def]
-!316 = metadata !{i32 786484, i32 0, null, metadata !"dz2tz1", metadata !"dz2tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz2tz1] [line 43] [local] [def]
-!317 = metadata !{i32 786484, i32 0, null, metadata !"tz2", metadata !"tz2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz2] [line 31] [local] [def]
-!318 = metadata !{i32 786484, i32 0, null, metadata !"dz1tz1", metadata !"dz1tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz1tz1] [line 43] [local] [def]
-!319 = metadata !{i32 786484, i32 0, null, metadata !"yycon5", metadata !"yycon5", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon5] [line 40] [local] [def]
-!320 = metadata !{i32 786484, i32 0, null, metadata !"yycon4", metadata !"yycon4", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon4] [line 40] [local] [def]
-!321 = metadata !{i32 786484, i32 0, null, metadata !"yycon3", metadata !"yycon3", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon3] [line 40] [local] [def]
-!322 = metadata !{i32 786484, i32 0, null, metadata !"dy5ty1", metadata !"dy5ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy5ty1] [line 41] [local] [def]
-!323 = metadata !{i32 786484, i32 0, null, metadata !"dy4ty1", metadata !"dy4ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy4ty1] [line 41] [local] [def]
-!324 = metadata !{i32 786484, i32 0, null, metadata !"dy3ty1", metadata !"dy3ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy3ty1] [line 41] [local] [def]
-!325 = metadata !{i32 786484, i32 0, null, metadata !"yycon2", metadata !"yycon2", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon2] [line 40] [local] [def]
-!326 = metadata !{i32 786484, i32 0, null, metadata !"dy2ty1", metadata !"dy2ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy2ty1] [line 41] [local] [def]
-!327 = metadata !{i32 786484, i32 0, null, metadata !"ty2", metadata !"ty2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty2] [line 31] [local] [def]
-!328 = metadata !{i32 786484, i32 0, null, metadata !"dy1ty1", metadata !"dy1ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy1ty1] [line 41] [local] [def]
-!329 = metadata !{i32 786484, i32 0, null, metadata !"dssp", metadata !"dssp", metadata !"", metadata !300, i32 35, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dssp] [line 35] [local] [def]
-!330 = metadata !{i32 786484, i32 0, null, metadata !"c1", metadata !"c1", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1] [line 45] [local] [def]
-!331 = metadata !{i32 786484, i32 0, null, metadata !"xxcon5", metadata !"xxcon5", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon5] [line 38] [local] [def]
-!332 = metadata !{i32 786484, i32 0, null, metadata !"xxcon4", metadata !"xxcon4", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon4] [line 38] [local] [def]
-!333 = metadata !{i32 786484, i32 0, null, metadata !"xxcon3", metadata !"xxcon3", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon3] [line 38] [local] [def]
-!334 = metadata !{i32 786484, i32 0, null, metadata !"dx5tx1", metadata !"dx5tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx5tx1] [line 39] [local] [def]
-!335 = metadata !{i32 786484, i32 0, null, metadata !"dx4tx1", metadata !"dx4tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx4tx1] [line 39] [local] [def]
-!336 = metadata !{i32 786484, i32 0, null, metadata !"dx3tx1", metadata !"dx3tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx3tx1] [line 39] [local] [def]
-!337 = metadata !{i32 786484, i32 0, null, metadata !"c2", metadata !"c2", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2] [line 45] [local] [def]
-!338 = metadata !{i32 786484, i32 0, null, metadata !"con43", metadata !"con43", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [con43] [line 48] [local] [def]
-!339 = metadata !{i32 786484, i32 0, null, metadata !"xxcon2", metadata !"xxcon2", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon2] [line 38] [local] [def]
-!340 = metadata !{i32 786484, i32 0, null, metadata !"dx2tx1", metadata !"dx2tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx2tx1] [line 39] [local] [def]
-!341 = metadata !{i32 786484, i32 0, null, metadata !"tx2", metadata !"tx2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx2] [line 31] [local] [def]
-!342 = metadata !{i32 786484, i32 0, null, metadata !"dx1tx1", metadata !"dx1tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx1tx1] [line 39] [local] [def]
-!343 = metadata !{i32 786484, i32 0, null, metadata !"forcing", metadata !"forcing", metadata !"", metadata !300, i32 66, metadata !344, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [forcing] [line 66] [local] [def]
-!344 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1663006848, i64 64, i32 0, i32 0, metadata !20, metadata !345, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1663006848, align 64, offset 0] [from double]
+!308 = metadata !{metadata !"0x21\000\00163"}     ; [ DW_TAG_subrange_type ] [0, 162]
+!309 = metadata !{metadata !"0x34\00zzcon5\00zzcon5\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon5] [line 42] [local] [def]
+!310 = metadata !{metadata !"0x34\00zzcon4\00zzcon4\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon4] [line 42] [local] [def]
+!311 = metadata !{metadata !"0x34\00zzcon3\00zzcon3\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon3] [line 42] [local] [def]
+!312 = metadata !{metadata !"0x34\00dz5tz1\00dz5tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz5tz1] [line 43] [local] [def]
+!313 = metadata !{metadata !"0x34\00dz4tz1\00dz4tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz4tz1] [line 43] [local] [def]
+!314 = metadata !{metadata !"0x34\00dz3tz1\00dz3tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz3tz1] [line 43] [local] [def]
+!315 = metadata !{metadata !"0x34\00zzcon2\00zzcon2\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon2] [line 42] [local] [def]
+!316 = metadata !{metadata !"0x34\00dz2tz1\00dz2tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz2tz1] [line 43] [local] [def]
+!317 = metadata !{metadata !"0x34\00tz2\00tz2\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tz2] [line 31] [local] [def]
+!318 = metadata !{metadata !"0x34\00dz1tz1\00dz1tz1\00\0043\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz1tz1] [line 43] [local] [def]
+!319 = metadata !{metadata !"0x34\00yycon5\00yycon5\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon5] [line 40] [local] [def]
+!320 = metadata !{metadata !"0x34\00yycon4\00yycon4\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon4] [line 40] [local] [def]
+!321 = metadata !{metadata !"0x34\00yycon3\00yycon3\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon3] [line 40] [local] [def]
+!322 = metadata !{metadata !"0x34\00dy5ty1\00dy5ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy5ty1] [line 41] [local] [def]
+!323 = metadata !{metadata !"0x34\00dy4ty1\00dy4ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy4ty1] [line 41] [local] [def]
+!324 = metadata !{metadata !"0x34\00dy3ty1\00dy3ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy3ty1] [line 41] [local] [def]
+!325 = metadata !{metadata !"0x34\00yycon2\00yycon2\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon2] [line 40] [local] [def]
+!326 = metadata !{metadata !"0x34\00dy2ty1\00dy2ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy2ty1] [line 41] [local] [def]
+!327 = metadata !{metadata !"0x34\00ty2\00ty2\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [ty2] [line 31] [local] [def]
+!328 = metadata !{metadata !"0x34\00dy1ty1\00dy1ty1\00\0041\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy1ty1] [line 41] [local] [def]
+!329 = metadata !{metadata !"0x34\00dssp\00dssp\00\0035\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dssp] [line 35] [local] [def]
+!330 = metadata !{metadata !"0x34\00c1\00c1\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1] [line 45] [local] [def]
+!331 = metadata !{metadata !"0x34\00xxcon5\00xxcon5\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon5] [line 38] [local] [def]
+!332 = metadata !{metadata !"0x34\00xxcon4\00xxcon4\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon4] [line 38] [local] [def]
+!333 = metadata !{metadata !"0x34\00xxcon3\00xxcon3\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon3] [line 38] [local] [def]
+!334 = metadata !{metadata !"0x34\00dx5tx1\00dx5tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx5tx1] [line 39] [local] [def]
+!335 = metadata !{metadata !"0x34\00dx4tx1\00dx4tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx4tx1] [line 39] [local] [def]
+!336 = metadata !{metadata !"0x34\00dx3tx1\00dx3tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx3tx1] [line 39] [local] [def]
+!337 = metadata !{metadata !"0x34\00c2\00c2\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2] [line 45] [local] [def]
+!338 = metadata !{metadata !"0x34\00con43\00con43\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [con43] [line 48] [local] [def]
+!339 = metadata !{metadata !"0x34\00xxcon2\00xxcon2\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon2] [line 38] [local] [def]
+!340 = metadata !{metadata !"0x34\00dx2tx1\00dx2tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx2tx1] [line 39] [local] [def]
+!341 = metadata !{metadata !"0x34\00tx2\00tx2\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tx2] [line 31] [local] [def]
+!342 = metadata !{metadata !"0x34\00dx1tx1\00dx1tx1\00\0039\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx1tx1] [line 39] [local] [def]
+!343 = metadata !{metadata !"0x34\00forcing\00forcing\00\0066\001\001", null, metadata !300, metadata !344, null, null} ; [ DW_TAG_variable ] [forcing] [line 66] [local] [def]
+!344 = metadata !{metadata !"0x1\00\000\001663006848\0064\000\000", null, null, metadata !20, metadata !345, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1663006848, align 64, offset 0] [from double]
 !345 = metadata !{metadata !308, metadata !308, metadata !308, metadata !346}
-!346 = metadata !{i32 786465, i64 0, i64 6}       ; [ DW_TAG_subrange_type ] [0, 5]
-!347 = metadata !{i32 786484, i32 0, null, metadata !"qs", metadata !"qs", metadata !"", metadata !300, i32 63, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [qs] [line 63] [local] [def]
-!348 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 277167808, i64 64, i32 0, i32 0, metadata !20, metadata !349, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 277167808, align 64, offset 0] [from double]
+!346 = metadata !{metadata !"0x21\000\006"}       ; [ DW_TAG_subrange_type ] [0, 5]
+!347 = metadata !{metadata !"0x34\00qs\00qs\00\0063\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [qs] [line 63] [local] [def]
+!348 = metadata !{metadata !"0x1\00\000\00277167808\0064\000\000", null, null, metadata !20, metadata !349, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 277167808, align 64, offset 0] [from double]
 !349 = metadata !{metadata !308, metadata !308, metadata !308}
-!350 = metadata !{i32 786484, i32 0, null, metadata !"square", metadata !"square", metadata !"", metadata !300, i32 65, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [square] [line 65] [local] [def]
-!351 = metadata !{i32 786484, i32 0, null, metadata !"ws", metadata !"ws", metadata !"", metadata !300, i32 62, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ws] [line 62] [local] [def]
-!352 = metadata !{i32 786484, i32 0, null, metadata !"vs", metadata !"vs", metadata !"", metadata !300, i32 61, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [vs] [line 61] [local] [def]
-!353 = metadata !{i32 786484, i32 0, null, metadata !"us", metadata !"us", metadata !"", metadata !300, i32 60, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [us] [line 60] [local] [def]
-!354 = metadata !{i32 786484, i32 0, null, metadata !"rho_i", metadata !"rho_i", metadata !"", metadata !300, i32 64, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [rho_i] [line 64] [local] [def]
-!355 = metadata !{i32 786484, i32 0, null, metadata !"u", metadata !"u", metadata !"", metadata !300, i32 67, metadata !306, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [u] [line 67] [local] [def]
-!356 = metadata !{i32 786484, i32 0, null, metadata !"ce", metadata !"ce", metadata !"", metadata !300, i32 36, metadata !357, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ce] [line 36] [local] [def]
-!357 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 4160, i64 64, i32 0, i32 0, metadata !20, metadata !358, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 4160, align 64, offset 0] [from double]
+!350 = metadata !{metadata !"0x34\00square\00square\00\0065\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [square] [line 65] [local] [def]
+!351 = metadata !{metadata !"0x34\00ws\00ws\00\0062\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [ws] [line 62] [local] [def]
+!352 = metadata !{metadata !"0x34\00vs\00vs\00\0061\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [vs] [line 61] [local] [def]
+!353 = metadata !{metadata !"0x34\00us\00us\00\0060\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [us] [line 60] [local] [def]
+!354 = metadata !{metadata !"0x34\00rho_i\00rho_i\00\0064\001\001", null, metadata !300, metadata !348, null, null} ; [ DW_TAG_variable ] [rho_i] [line 64] [local] [def]
+!355 = metadata !{metadata !"0x34\00u\00u\00\0067\001\001", null, metadata !300, metadata !306, null, null} ; [ DW_TAG_variable ] [u] [line 67] [local] [def]
+!356 = metadata !{metadata !"0x34\00ce\00ce\00\0036\001\001", null, metadata !300, metadata !357, null, null} ; [ DW_TAG_variable ] [ce] [line 36] [local] [def]
+!357 = metadata !{metadata !"0x1\00\000\004160\0064\000\000", null, null, metadata !20, metadata !358, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 4160, align 64, offset 0] [from double]
 !358 = metadata !{metadata !93, metadata !359}
-!359 = metadata !{i32 786465, i64 0, i64 13}      ; [ DW_TAG_subrange_type ] [0, 12]
-!360 = metadata !{i32 786484, i32 0, null, metadata !"dnzm1", metadata !"dnzm1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnzm1] [line 44] [local] [def]
-!361 = metadata !{i32 786484, i32 0, null, metadata !"dnym1", metadata !"dnym1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnym1] [line 44] [local] [def]
-!362 = metadata !{i32 786484, i32 0, null, metadata !"dnxm1", metadata !"dnxm1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnxm1] [line 44] [local] [def]
-!363 = metadata !{i32 786484, i32 0, null, metadata !"zzcon1", metadata !"zzcon1", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon1] [line 42] [local] [def]
-!364 = metadata !{i32 786484, i32 0, null, metadata !"yycon1", metadata !"yycon1", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon1] [line 40] [local] [def]
-!365 = metadata !{i32 786484, i32 0, null, metadata !"xxcon1", metadata !"xxcon1", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon1] [line 38] [local] [def]
-!366 = metadata !{i32 786484, i32 0, null, metadata !"con16", metadata !"con16", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [con16] [line 48] [local] [def]
-!367 = metadata !{i32 786484, i32 0, null, metadata !"c2iv", metadata !"c2iv", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2iv] [line 48] [local] [def]
-!368 = metadata !{i32 786484, i32 0, null, metadata !"c3c4tz3", metadata !"c3c4tz3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4tz3] [line 48] [local] [def]
-!369 = metadata !{i32 786484, i32 0, null, metadata !"c3c4ty3", metadata !"c3c4ty3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4ty3] [line 48] [local] [def]
-!370 = metadata !{i32 786484, i32 0, null, metadata !"c3c4tx3", metadata !"c3c4tx3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4tx3] [line 48] [local] [def]
-!371 = metadata !{i32 786484, i32 0, null, metadata !"comz6", metadata !"comz6", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz6] [line 47] [local] [def]
-!372 = metadata !{i32 786484, i32 0, null, metadata !"comz5", metadata !"comz5", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz5] [line 47] [local] [def]
-!373 = metadata !{i32 786484, i32 0, null, metadata !"comz4", metadata !"comz4", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz4] [line 47] [local] [def]
-!374 = metadata !{i32 786484, i32 0, null, metadata !"comz1", metadata !"comz1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz1] [line 47] [local] [def]
-!375 = metadata !{i32 786484, i32 0, null, metadata !"dtdssp", metadata !"dtdssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtdssp] [line 45] [local] [def]
-!376 = metadata !{i32 786484, i32 0, null, metadata !"c2dttz1", metadata !"c2dttz1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dttz1] [line 47] [local] [def]
-!377 = metadata !{i32 786484, i32 0, null, metadata !"c2dtty1", metadata !"c2dtty1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dtty1] [line 47] [local] [def]
-!378 = metadata !{i32 786484, i32 0, null, metadata !"c2dttx1", metadata !"c2dttx1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dttx1] [line 47] [local] [def]
-!379 = metadata !{i32 786484, i32 0, null, metadata !"dttz2", metadata !"dttz2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttz2] [line 46] [local] [def]
-!380 = metadata !{i32 786484, i32 0, null, metadata !"dttz1", metadata !"dttz1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttz1] [line 46] [local] [def]
-!381 = metadata !{i32 786484, i32 0, null, metadata !"dtty2", metadata !"dtty2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtty2] [line 46] [local] [def]
-!382 = metadata !{i32 786484, i32 0, null, metadata !"dtty1", metadata !"dtty1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtty1] [line 46] [local] [def]
-!383 = metadata !{i32 786484, i32 0, null, metadata !"dttx2", metadata !"dttx2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttx2] [line 46] [local] [def]
-!384 = metadata !{i32 786484, i32 0, null, metadata !"dttx1", metadata !"dttx1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttx1] [line 46] [local] [def]
-!385 = metadata !{i32 786484, i32 0, null, metadata !"c5dssp", metadata !"c5dssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c5dssp] [line 45] [local] [def]
-!386 = metadata !{i32 786484, i32 0, null, metadata !"c4dssp", metadata !"c4dssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c4dssp] [line 45] [local] [def]
-!387 = metadata !{i32 786484, i32 0, null, metadata !"dzmax", metadata !"dzmax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dzmax] [line 37] [local] [def]
-!388 = metadata !{i32 786484, i32 0, null, metadata !"dymax", metadata !"dymax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dymax] [line 37] [local] [def]
-!389 = metadata !{i32 786484, i32 0, null, metadata !"dxmax", metadata !"dxmax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dxmax] [line 37] [local] [def]
-!390 = metadata !{i32 786484, i32 0, null, metadata !"dz5", metadata !"dz5", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz5] [line 34] [local] [def]
-!391 = metadata !{i32 786484, i32 0, null, metadata !"dz4", metadata !"dz4", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz4] [line 34] [local] [def]
-!392 = metadata !{i32 786484, i32 0, null, metadata !"dz3", metadata !"dz3", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz3] [line 34] [local] [def]
-!393 = metadata !{i32 786484, i32 0, null, metadata !"dz2", metadata !"dz2", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz2] [line 34] [local] [def]
-!394 = metadata !{i32 786484, i32 0, null, metadata !"dz1", metadata !"dz1", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz1] [line 34] [local] [def]
-!395 = metadata !{i32 786484, i32 0, null, metadata !"dy5", metadata !"dy5", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy5] [line 33] [local] [def]
-!396 = metadata !{i32 786484, i32 0, null, metadata !"dy4", metadata !"dy4", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy4] [line 33] [local] [def]
-!397 = metadata !{i32 786484, i32 0, null, metadata !"dy3", metadata !"dy3", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy3] [line 33] [local] [def]
-!398 = metadata !{i32 786484, i32 0, null, metadata !"dy2", metadata !"dy2", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy2] [line 33] [local] [def]
-!399 = metadata !{i32 786484, i32 0, null, metadata !"dy1", metadata !"dy1", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy1] [line 33] [local] [def]
-!400 = metadata !{i32 786484, i32 0, null, metadata !"dx5", metadata !"dx5", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx5] [line 32] [local] [def]
-!401 = metadata !{i32 786484, i32 0, null, metadata !"dx4", metadata !"dx4", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx4] [line 32] [local] [def]
-!402 = metadata !{i32 786484, i32 0, null, metadata !"dx3", metadata !"dx3", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx3] [line 32] [local] [def]
-!403 = metadata !{i32 786484, i32 0, null, metadata !"dx2", metadata !"dx2", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx2] [line 32] [local] [def]
-!404 = metadata !{i32 786484, i32 0, null, metadata !"dx1", metadata !"dx1", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx1] [line 32] [local] [def]
-!405 = metadata !{i32 786484, i32 0, null, metadata !"tz3", metadata !"tz3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz3] [line 31] [local] [def]
-!406 = metadata !{i32 786484, i32 0, null, metadata !"tz1", metadata !"tz1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz1] [line 31] [local] [def]
-!407 = metadata !{i32 786484, i32 0, null, metadata !"ty3", metadata !"ty3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty3] [line 31] [local] [def]
-!408 = metadata !{i32 786484, i32 0, null, metadata !"ty1", metadata !"ty1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty1] [line 31] [local] [def]
-!409 = metadata !{i32 786484, i32 0, null, metadata !"tx3", metadata !"tx3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx3] [line 31] [local] [def]
-!410 = metadata !{i32 786484, i32 0, null, metadata !"tx1", metadata !"tx1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx1] [line 31] [local] [def]
-!411 = metadata !{i32 786484, i32 0, null, metadata !"conz1", metadata !"conz1", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [conz1] [line 45] [local] [def]
-!412 = metadata !{i32 786484, i32 0, null, metadata !"c1345", metadata !"c1345", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1345] [line 44] [local] [def]
-!413 = metadata !{i32 786484, i32 0, null, metadata !"c3c4", metadata !"c3c4", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4] [line 44] [local] [def]
-!414 = metadata !{i32 786484, i32 0, null, metadata !"c1c5", metadata !"c1c5", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1c5] [line 44] [local] [def]
-!415 = metadata !{i32 786484, i32 0, null, metadata !"c1c2", metadata !"c1c2", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1c2] [line 44] [local] [def]
-!416 = metadata !{i32 786484, i32 0, null, metadata !"c5", metadata !"c5", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c5] [line 45] [local] [def]
-!417 = metadata !{i32 786484, i32 0, null, metadata !"c4", metadata !"c4", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c4] [line 45] [local] [def]
-!418 = metadata !{i32 786484, i32 0, null, metadata !"c3", metadata !"c3", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3] [line 45] [local] [def]
-!419 = metadata !{i32 786484, i32 0, null, metadata !"lhs", metadata !"lhs", metadata !"", metadata !300, i32 69, metadata !420, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [lhs] [line 69] [local] [def]
-!420 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 20787585600, i64 64, i32 0, i32 0, metadata !20, metadata !421, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 20787585600, align 64, offset 0] [from double]
+!359 = metadata !{metadata !"0x21\000\0013"}      ; [ DW_TAG_subrange_type ] [0, 12]
+!360 = metadata !{metadata !"0x34\00dnzm1\00dnzm1\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dnzm1] [line 44] [local] [def]
+!361 = metadata !{metadata !"0x34\00dnym1\00dnym1\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dnym1] [line 44] [local] [def]
+!362 = metadata !{metadata !"0x34\00dnxm1\00dnxm1\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dnxm1] [line 44] [local] [def]
+!363 = metadata !{metadata !"0x34\00zzcon1\00zzcon1\00\0042\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [zzcon1] [line 42] [local] [def]
+!364 = metadata !{metadata !"0x34\00yycon1\00yycon1\00\0040\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [yycon1] [line 40] [local] [def]
+!365 = metadata !{metadata !"0x34\00xxcon1\00xxcon1\00\0038\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [xxcon1] [line 38] [local] [def]
+!366 = metadata !{metadata !"0x34\00con16\00con16\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [con16] [line 48] [local] [def]
+!367 = metadata !{metadata !"0x34\00c2iv\00c2iv\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2iv] [line 48] [local] [def]
+!368 = metadata !{metadata !"0x34\00c3c4tz3\00c3c4tz3\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4tz3] [line 48] [local] [def]
+!369 = metadata !{metadata !"0x34\00c3c4ty3\00c3c4ty3\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4ty3] [line 48] [local] [def]
+!370 = metadata !{metadata !"0x34\00c3c4tx3\00c3c4tx3\00\0048\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4tx3] [line 48] [local] [def]
+!371 = metadata !{metadata !"0x34\00comz6\00comz6\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz6] [line 47] [local] [def]
+!372 = metadata !{metadata !"0x34\00comz5\00comz5\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz5] [line 47] [local] [def]
+!373 = metadata !{metadata !"0x34\00comz4\00comz4\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz4] [line 47] [local] [def]
+!374 = metadata !{metadata !"0x34\00comz1\00comz1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [comz1] [line 47] [local] [def]
+!375 = metadata !{metadata !"0x34\00dtdssp\00dtdssp\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dtdssp] [line 45] [local] [def]
+!376 = metadata !{metadata !"0x34\00c2dttz1\00c2dttz1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2dttz1] [line 47] [local] [def]
+!377 = metadata !{metadata !"0x34\00c2dtty1\00c2dtty1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2dtty1] [line 47] [local] [def]
+!378 = metadata !{metadata !"0x34\00c2dttx1\00c2dttx1\00\0047\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c2dttx1] [line 47] [local] [def]
+!379 = metadata !{metadata !"0x34\00dttz2\00dttz2\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttz2] [line 46] [local] [def]
+!380 = metadata !{metadata !"0x34\00dttz1\00dttz1\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttz1] [line 46] [local] [def]
+!381 = metadata !{metadata !"0x34\00dtty2\00dtty2\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dtty2] [line 46] [local] [def]
+!382 = metadata !{metadata !"0x34\00dtty1\00dtty1\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dtty1] [line 46] [local] [def]
+!383 = metadata !{metadata !"0x34\00dttx2\00dttx2\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttx2] [line 46] [local] [def]
+!384 = metadata !{metadata !"0x34\00dttx1\00dttx1\00\0046\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dttx1] [line 46] [local] [def]
+!385 = metadata !{metadata !"0x34\00c5dssp\00c5dssp\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c5dssp] [line 45] [local] [def]
+!386 = metadata !{metadata !"0x34\00c4dssp\00c4dssp\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c4dssp] [line 45] [local] [def]
+!387 = metadata !{metadata !"0x34\00dzmax\00dzmax\00\0037\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dzmax] [line 37] [local] [def]
+!388 = metadata !{metadata !"0x34\00dymax\00dymax\00\0037\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dymax] [line 37] [local] [def]
+!389 = metadata !{metadata !"0x34\00dxmax\00dxmax\00\0037\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dxmax] [line 37] [local] [def]
+!390 = metadata !{metadata !"0x34\00dz5\00dz5\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz5] [line 34] [local] [def]
+!391 = metadata !{metadata !"0x34\00dz4\00dz4\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz4] [line 34] [local] [def]
+!392 = metadata !{metadata !"0x34\00dz3\00dz3\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz3] [line 34] [local] [def]
+!393 = metadata !{metadata !"0x34\00dz2\00dz2\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz2] [line 34] [local] [def]
+!394 = metadata !{metadata !"0x34\00dz1\00dz1\00\0034\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dz1] [line 34] [local] [def]
+!395 = metadata !{metadata !"0x34\00dy5\00dy5\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy5] [line 33] [local] [def]
+!396 = metadata !{metadata !"0x34\00dy4\00dy4\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy4] [line 33] [local] [def]
+!397 = metadata !{metadata !"0x34\00dy3\00dy3\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy3] [line 33] [local] [def]
+!398 = metadata !{metadata !"0x34\00dy2\00dy2\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy2] [line 33] [local] [def]
+!399 = metadata !{metadata !"0x34\00dy1\00dy1\00\0033\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dy1] [line 33] [local] [def]
+!400 = metadata !{metadata !"0x34\00dx5\00dx5\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx5] [line 32] [local] [def]
+!401 = metadata !{metadata !"0x34\00dx4\00dx4\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx4] [line 32] [local] [def]
+!402 = metadata !{metadata !"0x34\00dx3\00dx3\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx3] [line 32] [local] [def]
+!403 = metadata !{metadata !"0x34\00dx2\00dx2\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx2] [line 32] [local] [def]
+!404 = metadata !{metadata !"0x34\00dx1\00dx1\00\0032\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [dx1] [line 32] [local] [def]
+!405 = metadata !{metadata !"0x34\00tz3\00tz3\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tz3] [line 31] [local] [def]
+!406 = metadata !{metadata !"0x34\00tz1\00tz1\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tz1] [line 31] [local] [def]
+!407 = metadata !{metadata !"0x34\00ty3\00ty3\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [ty3] [line 31] [local] [def]
+!408 = metadata !{metadata !"0x34\00ty1\00ty1\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [ty1] [line 31] [local] [def]
+!409 = metadata !{metadata !"0x34\00tx3\00tx3\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tx3] [line 31] [local] [def]
+!410 = metadata !{metadata !"0x34\00tx1\00tx1\00\0031\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tx1] [line 31] [local] [def]
+!411 = metadata !{metadata !"0x34\00conz1\00conz1\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [conz1] [line 45] [local] [def]
+!412 = metadata !{metadata !"0x34\00c1345\00c1345\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1345] [line 44] [local] [def]
+!413 = metadata !{metadata !"0x34\00c3c4\00c3c4\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3c4] [line 44] [local] [def]
+!414 = metadata !{metadata !"0x34\00c1c5\00c1c5\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1c5] [line 44] [local] [def]
+!415 = metadata !{metadata !"0x34\00c1c2\00c1c2\00\0044\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c1c2] [line 44] [local] [def]
+!416 = metadata !{metadata !"0x34\00c5\00c5\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c5] [line 45] [local] [def]
+!417 = metadata !{metadata !"0x34\00c4\00c4\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c4] [line 45] [local] [def]
+!418 = metadata !{metadata !"0x34\00c3\00c3\00\0045\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [c3] [line 45] [local] [def]
+!419 = metadata !{metadata !"0x34\00lhs\00lhs\00\0069\001\001", null, metadata !300, metadata !420, null, null} ; [ DW_TAG_variable ] [lhs] [line 69] [local] [def]
+!420 = metadata !{metadata !"0x1\00\000\0020787585600\0064\000\000", null, null, metadata !20, metadata !421, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 20787585600, align 64, offset 0] [from double]
 !421 = metadata !{metadata !308, metadata !308, metadata !308, metadata !178, metadata !93, metadata !93}
-!422 = metadata !{i32 786484, i32 0, null, metadata !"q", metadata !"q", metadata !"", metadata !300, i32 73, metadata !423, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [q] [line 73] [local] [def]
-!423 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 10368, i64 64, i32 0, i32 0, metadata !20, metadata !424, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 10368, align 64, offset 0] [from double]
+!422 = metadata !{metadata !"0x34\00q\00q\00\0073\001\001", null, metadata !300, metadata !423, null, null} ; [ DW_TAG_variable ] [q] [line 73] [local] [def]
+!423 = metadata !{metadata !"0x1\00\000\0010368\0064\000\000", null, null, metadata !20, metadata !424, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 10368, align 64, offset 0] [from double]
 !424 = metadata !{metadata !425}
-!425 = metadata !{i32 786465, i64 0, i64 162}     ; [ DW_TAG_subrange_type ] [0, 161]
-!426 = metadata !{i32 786484, i32 0, null, metadata !"cuf", metadata !"cuf", metadata !"", metadata !300, i32 72, metadata !423, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [cuf] [line 72] [local] [def]
-!427 = metadata !{i32 786484, i32 0, null, metadata !"buf", metadata !"buf", metadata !"", metadata !300, i32 75, metadata !428, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [buf] [line 75] [local] [def]
-!428 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 51840, i64 64, i32 0, i32 0, metadata !20, metadata !429, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 51840, align 64, offset 0] [from double]
+!425 = metadata !{metadata !"0x21\000\00162"}     ; [ DW_TAG_subrange_type ] [0, 161]
+!426 = metadata !{metadata !"0x34\00cuf\00cuf\00\0072\001\001", null, metadata !300, metadata !423, null, null} ; [ DW_TAG_variable ] [cuf] [line 72] [local] [def]
+!427 = metadata !{metadata !"0x34\00buf\00buf\00\0075\001\001", null, metadata !300, metadata !428, null, null} ; [ DW_TAG_variable ] [buf] [line 75] [local] [def]
+!428 = metadata !{metadata !"0x1\00\000\0051840\0064\000\000", null, null, metadata !20, metadata !429, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 51840, align 64, offset 0] [from double]
 !429 = metadata !{metadata !425, metadata !93}
-!430 = metadata !{i32 786484, i32 0, null, metadata !"ue", metadata !"ue", metadata !"", metadata !300, i32 74, metadata !428, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ue] [line 74] [local] [def]
-!431 = metadata !{i32 786484, i32 0, null, metadata !"njac", metadata !"njac", metadata !"", metadata !300, i32 86, metadata !432, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [njac] [line 86] [local] [def]
-!432 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 6886684800, i64 64, i32 0, i32 0, metadata !20, metadata !433, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 6886684800, align 64, offset 0] [from double]
+!430 = metadata !{metadata !"0x34\00ue\00ue\00\0074\001\001", null, metadata !300, metadata !428, null, null} ; [ DW_TAG_variable ] [ue] [line 74] [local] [def]
+!431 = metadata !{metadata !"0x34\00njac\00njac\00\0086\001\001", null, metadata !300, metadata !432, null, null} ; [ DW_TAG_variable ] [njac] [line 86] [local] [def]
+!432 = metadata !{metadata !"0x1\00\000\006886684800\0064\000\000", null, null, metadata !20, metadata !433, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 6886684800, align 64, offset 0] [from double]
 !433 = metadata !{metadata !308, metadata !308, metadata !425, metadata !93, metadata !93}
-!434 = metadata !{i32 786484, i32 0, null, metadata !"fjac", metadata !"fjac", metadata !"", metadata !300, i32 84, metadata !432, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [fjac] [line 84] [local] [def]
-!435 = metadata !{i32 786484, i32 0, null, metadata !"tmp3", metadata !"tmp3", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp3] [line 88] [local] [def]
-!436 = metadata !{i32 786484, i32 0, null, metadata !"tmp2", metadata !"tmp2", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp2] [line 88] [local] [def]
-!437 = metadata !{i32 786484, i32 0, null, metadata !"tmp1", metadata !"tmp1", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp1] [line 88] [local] [def]
+!434 = metadata !{metadata !"0x34\00fjac\00fjac\00\0084\001\001", null, metadata !300, metadata !432, null, null} ; [ DW_TAG_variable ] [fjac] [line 84] [local] [def]
+!435 = metadata !{metadata !"0x34\00tmp3\00tmp3\00\0088\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tmp3] [line 88] [local] [def]
+!436 = metadata !{metadata !"0x34\00tmp2\00tmp2\00\0088\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tmp2] [line 88] [local] [def]
+!437 = metadata !{metadata !"0x34\00tmp1\00tmp1\00\0088\001\001", null, metadata !300, metadata !20, null, null} ; [ DW_TAG_variable ] [tmp1] [line 88] [local] [def]
 !438 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !439 = metadata !{i32 1898, i32 0, metadata !440, null}
-!440 = metadata !{i32 786443, metadata !1, metadata !114, i32 1898, i32 0, i32 107} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!440 = metadata !{metadata !"0xb\001898\000\00107", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !441 = metadata !{i32 1913, i32 0, metadata !442, null}
-!442 = metadata !{i32 786443, metadata !1, metadata !114, i32 1913, i32 0, i32 115} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!442 = metadata !{metadata !"0xb\001913\000\00115", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !443 = metadata !{i32 1923, i32 0, metadata !114, null}
 !444 = metadata !{metadata !"int", metadata !445}
 !445 = metadata !{metadata !"omnipotent char", metadata !446}
 !446 = metadata !{metadata !"Simple C/C++ TBAA"}
 !447 = metadata !{i32 1}
 !448 = metadata !{i32 1925, i32 0, metadata !449, null}
-!449 = metadata !{i32 786443, metadata !1, metadata !114, i32 1925, i32 0, i32 121} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!449 = metadata !{metadata !"0xb\001925\000\00121", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !450 = metadata !{i32 1939, i32 0, metadata !451, null}
-!451 = metadata !{i32 786443, metadata !1, metadata !114, i32 1939, i32 0, i32 127} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!451 = metadata !{metadata !"0xb\001939\000\00127", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !452 = metadata !{i32 1940, i32 0, metadata !453, null}
-!453 = metadata !{i32 786443, metadata !1, metadata !454, i32 1940, i32 0, i32 129} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!454 = metadata !{i32 786443, metadata !1, metadata !451, i32 1939, i32 0, i32 128} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!453 = metadata !{metadata !"0xb\001940\000\00129", metadata !1, metadata !454} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!454 = metadata !{metadata !"0xb\001939\000\00128", metadata !1, metadata !451} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !455 = metadata !{i32 1941, i32 0, metadata !456, null}
-!456 = metadata !{i32 786443, metadata !1, metadata !457, i32 1941, i32 0, i32 131} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!457 = metadata !{i32 786443, metadata !1, metadata !453, i32 1940, i32 0, i32 130} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!456 = metadata !{metadata !"0xb\001941\000\00131", metadata !1, metadata !457} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!457 = metadata !{metadata !"0xb\001940\000\00130", metadata !1, metadata !453} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
 !458 = metadata !{i32 2020, i32 0, metadata !459, null}
-!459 = metadata !{i32 786443, metadata !1, metadata !460, i32 2020, i32 0, i32 149} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!460 = metadata !{i32 786443, metadata !1, metadata !461, i32 2019, i32 0, i32 148} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!461 = metadata !{i32 786443, metadata !1, metadata !462, i32 2019, i32 0, i32 147} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!462 = metadata !{i32 786443, metadata !1, metadata !463, i32 2018, i32 0, i32 146} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!463 = metadata !{i32 786443, metadata !1, metadata !114, i32 2018, i32 0, i32 145} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
-!464 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!459 = metadata !{metadata !"0xb\002020\000\00149", metadata !1, metadata !460} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!460 = metadata !{metadata !"0xb\002019\000\00148", metadata !1, metadata !461} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!461 = metadata !{metadata !"0xb\002019\000\00147", metadata !1, metadata !462} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!462 = metadata !{metadata !"0xb\002018\000\00146", metadata !1, metadata !463} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!463 = metadata !{metadata !"0xb\002018\000\00145", metadata !1, metadata !114} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!464 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/PowerPC/pr18663-2.ll b/test/CodeGen/PowerPC/pr18663-2.ll
new file mode 100644
index 0000000..6b54440
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr18663-2.ll

@@ -0,0 +1,153 @@
+; RUN: llc < %s -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu
+; RUN: llc < %s -march=ppc64le -mtriple=powerpc64le-unknown-linux-gnu
+
+%"class.std::__1::locale::id.1580.4307.4610.8491" = type { %"struct.std::__1::once_flag.1579.4306.4609.8490", i32 }
+%"struct.std::__1::once_flag.1579.4306.4609.8490" = type { i64 }
+%"class.Foam::IOerror.1581.4308.4611.8505" = type { %"class.Foam::error.1535.4262.4565.8504", %"class.Foam::string.1530.4257.4560.8499", i32, i32 }
+%"class.Foam::error.1535.4262.4565.8504" = type { %"class.std::exception.1523.4250.4553.8492", [36 x i8], %"class.Foam::string.1530.4257.4560.8499", %"class.Foam::string.1530.4257.4560.8499", i32, i8, i8, %"class.Foam::OStringStream.1534.4261.4564.8503"* }
+%"class.std::exception.1523.4250.4553.8492" = type { i32 (...)** }
+%"class.Foam::OStringStream.1534.4261.4564.8503" = type { %"class.Foam::OSstream.1533.4260.4563.8502" }
+%"class.Foam::OSstream.1533.4260.4563.8502" = type { [50 x i8], %"class.Foam::fileName.1531.4258.4561.8500", %"class.std::__1::basic_ostream.1532.4259.4562.8501"* }
+%"class.Foam::fileName.1531.4258.4561.8500" = type { %"class.Foam::string.1530.4257.4560.8499" }
+%"class.std::__1::basic_ostream.1532.4259.4562.8501" = type { i32 (...)**, [148 x i8] }
+%"class.Foam::string.1530.4257.4560.8499" = type { %"class.std::__1::basic_string.1529.4256.4559.8498" }
+%"class.std::__1::basic_string.1529.4256.4559.8498" = type { %"class.std::__1::__compressed_pair.1528.4255.4558.8497" }
+%"class.std::__1::__compressed_pair.1528.4255.4558.8497" = type { %"class.std::__1::__libcpp_compressed_pair_imp.1527.4254.4557.8496" }
+%"class.std::__1::__libcpp_compressed_pair_imp.1527.4254.4557.8496" = type { %"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__rep.1526.4253.4556.8495" }
+%"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__rep.1526.4253.4556.8495" = type { %union.anon.1525.4252.4555.8494 }
+%union.anon.1525.4252.4555.8494 = type { %"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__long.1524.4251.4554.8493" }
+%"struct.std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::__long.1524.4251.4554.8493" = type { i64, i64, i8* }
+
+@.str3 = external unnamed_addr constant [16 x i8], align 1
+@_ZNSt3__15ctypeIcE2idE = external global %"class.std::__1::locale::id.1580.4307.4610.8491"
+@_ZN4Foam12FatalIOErrorE = external global %"class.Foam::IOerror.1581.4308.4611.8505"
+@.str204 = external unnamed_addr constant [18 x i8], align 1
+@.str205 = external unnamed_addr constant [34 x i8], align 1
+
+declare void @_ZN4FoamlsERNS_7OstreamEPKc() #0
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZNKSt3__18ios_base6getlocEv() #0
+
+declare void @_ZNKSt3__16locale9use_facetERNS0_2idE() #0
+
+; Function Attrs: noreturn
+declare void @_ZNKSt3__121__basic_string_commonILb1EE20__throw_length_errorEv() #1 align 2
+
+declare void @_ZN4Foam6string6expandEb() #0
+
+declare void @_ZN4Foam8IFstreamC1ERKNS_8fileNameENS_8IOstream12streamFormatENS4_13versionNumberE() #0
+
+declare void @_ZN4Foam7IOerrorclEPKcS2_iRKNS_8IOstreamE() #0
+
+declare void @_ZN4Foam7IOerror4exitEi() #0
+
+; Function Attrs: inlinehint
+declare void @_ZN4Foam8fileName12stripInvalidEv() #2 align 2
+
+define void @_ZN4Foam3CSVINS_6VectorIdEEE4readEv() #0 align 2 {
+entry:
+  invoke void @_ZN4Foam6string6expandEb()
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  br i1 undef, label %if.then.i.i.i.i176, label %_ZN4Foam6stringC2ERKS0_.exit.i
+
+if.then.i.i.i.i176:                               ; preds = %invoke.cont
+  invoke void @_ZNKSt3__121__basic_string_commonILb1EE20__throw_length_errorEv()
+          to label %.noexc unwind label %lpad
+
+.noexc:                                           ; preds = %if.then.i.i.i.i176
+  unreachable
+
+_ZN4Foam6stringC2ERKS0_.exit.i:                   ; preds = %invoke.cont
+  invoke void @_ZN4Foam8fileName12stripInvalidEv()
+          to label %invoke.cont2 unwind label %lpad.i
+
+lpad.i:                                           ; preds = %_ZN4Foam6stringC2ERKS0_.exit.i
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+invoke.cont2:                                     ; preds = %_ZN4Foam6stringC2ERKS0_.exit.i
+  invoke void @_ZN4Foam8IFstreamC1ERKNS_8fileNameENS_8IOstream12streamFormatENS4_13versionNumberE()
+          to label %invoke.cont4 unwind label %lpad3
+
+invoke.cont4:                                     ; preds = %invoke.cont2
+  br i1 undef, label %for.body, label %if.then
+
+if.then:                                          ; preds = %invoke.cont4
+  invoke void @_ZN4Foam7IOerrorclEPKcS2_iRKNS_8IOstreamE()
+          to label %invoke.cont8 unwind label %lpad5
+
+invoke.cont8:                                     ; preds = %if.then
+  invoke void @_ZN4FoamlsERNS_7OstreamEPKc()
+          to label %memptr.end.i unwind label %lpad5
+
+memptr.end.i:                                     ; preds = %invoke.cont8
+  invoke void @_ZN4Foam7IOerror4exitEi()
+          to label %if.end unwind label %lpad5
+
+lpad:                                             ; preds = %if.then.i.i.i.i176, %entry
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+lpad3:                                            ; preds = %invoke.cont2
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+lpad5:                                            ; preds = %memptr.end.i, %invoke.cont8, %if.then
+  %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+if.end:                                           ; preds = %memptr.end.i
+  br i1 undef, label %for.body, label %vector.body
+
+for.body:                                         ; preds = %if.end, %invoke.cont4
+  invoke void @_ZNKSt3__18ios_base6getlocEv()
+          to label %.noexc205 unwind label %lpad19
+
+.noexc205:                                        ; preds = %for.body
+  invoke void @_ZNKSt3__16locale9use_facetERNS0_2idE()
+          to label %invoke.cont.i.i.i unwind label %lpad.i.i.i
+
+invoke.cont.i.i.i:                                ; preds = %.noexc205
+  unreachable
+
+lpad.i.i.i:                                       ; preds = %.noexc205
+  %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+lpad19:                                           ; preds = %for.body
+  %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %ehcleanup142
+
+vector.body:                                      ; preds = %vector.body, %if.end
+  %vec.phi = phi <8 x i32> [ %10, %vector.body ], [ undef, %if.end ]
+  %vec.phi1302 = phi <8 x i32> [ %11, %vector.body ], [ undef, %if.end ]
+  %vec.phi1303 = phi <8 x i32> [ %12, %vector.body ], [ undef, %if.end ]
+  %vec.phi1304 = phi <8 x i32> [ %13, %vector.body ], [ undef, %if.end ]
+  %6 = icmp sgt <8 x i32> undef, %vec.phi
+  %7 = icmp sgt <8 x i32> undef, %vec.phi1302
+  %8 = icmp sgt <8 x i32> undef, %vec.phi1303
+  %9 = icmp sgt <8 x i32> undef, %vec.phi1304
+  %10 = select <8 x i1> %6, <8 x i32> undef, <8 x i32> %vec.phi
+  %11 = select <8 x i1> %7, <8 x i32> undef, <8 x i32> %vec.phi1302
+  %12 = select <8 x i1> %8, <8 x i32> undef, <8 x i32> %vec.phi1303
+  %13 = select <8 x i1> %9, <8 x i32> undef, <8 x i32> %vec.phi1304
+  br label %vector.body
+
+ehcleanup142:                                     ; preds = %lpad19, %lpad.i.i.i, %lpad5, %lpad3, %lpad, %lpad.i
+  resume { i8*, i32 } undef
+}
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { inlinehint "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+

diff --git a/test/CodeGen/PowerPC/pr18663.ll b/test/CodeGen/PowerPC/pr18663.ll
new file mode 100644
index 0000000..1b85223
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr18663.ll

@@ -0,0 +1,298 @@
+; RUN: llc < %s -march=ppc64 -mtriple=powerpc64-unknown-linux-gnu
+; RUN: llc < %s -march=ppc64le -mtriple=powerpc64le-unknown-linux-gnu
+
+%class.Point.1 = type { %class.Tensor.0 }
+%class.Tensor.0 = type { [3 x double] }
+%class.TriaObjectAccessor.57 = type { %class.TriaAccessor.56 }
+%class.TriaAccessor.56 = type { i32, i32, %class.Triangulation.55* }
+%class.Triangulation.55 = type { %class.Subscriptor, %"class.std::vector.46", %"class.std::vector", %"class.std::vector.3.8", [255 x %class.Boundary.50*], i32, %struct.TriaNumberCache.54 }
+%class.Subscriptor = type { i32 (...)**, i32, %"class.std::type_info.2"* }
+%"class.std::type_info.2" = type { i32 (...)**, i8* }
+%"class.std::vector.46" = type { %"struct.std::_Vector_base.45" }
+%"struct.std::_Vector_base.45" = type { %"struct.std::_Vector_base<TriangulationLevel<3> *, std::allocator<TriangulationLevel<3> *> >::_Vector_impl.44" }
+%"struct.std::_Vector_base<TriangulationLevel<3> *, std::allocator<TriangulationLevel<3> *> >::_Vector_impl.44" = type { %class.TriangulationLevel.43**, %class.TriangulationLevel.43**, %class.TriangulationLevel.43** }
+%class.TriangulationLevel.43 = type { %class.TriangulationLevel.0.37, %"struct.TriangulationLevel<3>::HexesData.42" }
+%class.TriangulationLevel.0.37 = type { %class.TriangulationLevel.1.31, %"struct.TriangulationLevel<2>::QuadsData.36" }
+%class.TriangulationLevel.1.31 = type { %class.TriangulationLevel, %"struct.TriangulationLevel<1>::LinesData.30" }
+%class.TriangulationLevel = type { %"class.std::vector.3.8", %"class.std::vector.3.8", %"class.std::vector.7.12", %"class.std::vector.12.15" }
+%"class.std::vector.7.12" = type { %"struct.std::_Vector_base" }
+%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<std::pair<int, int>, std::allocator<std::pair<int, int> > >::_Vector_impl.10" }
+%"struct.std::_Vector_base<std::pair<int, int>, std::allocator<std::pair<int, int> > >::_Vector_impl.10" = type { %"struct.std::pair.9"*, %"struct.std::pair.9"*, %"struct.std::pair.9"* }
+%"struct.std::pair.9" = type opaque
+%"class.std::vector.12.15" = type { %"struct.std::_Vector_base.13.14" }
+%"struct.std::_Vector_base.13.14" = type { %"struct.std::_Vector_base<unsigned int, std::allocator<unsigned int> >::_Vector_impl.13" }
+%"struct.std::_Vector_base<unsigned int, std::allocator<unsigned int> >::_Vector_impl.13" = type { i32*, i32*, i32* }
+%"struct.TriangulationLevel<1>::LinesData.30" = type { %"class.std::vector.17.20", %"class.std::vector.22.23", %"class.std::vector.3.8", %"class.std::vector.3.8", %"class.std::vector.27.26", %"class.std::vector.32.29" }
+%"class.std::vector.17.20" = type { %"struct.std::_Vector_base.18.19" }
+%"struct.std::_Vector_base.18.19" = type { %"struct.std::_Vector_base<Line, std::allocator<Line> >::_Vector_impl.18" }
+%"struct.std::_Vector_base<Line, std::allocator<Line> >::_Vector_impl.18" = type { %class.Line.17*, %class.Line.17*, %class.Line.17* }
+%class.Line.17 = type { [2 x i32] }
+%"class.std::vector.22.23" = type { %"struct.std::_Vector_base.23.22" }
+%"struct.std::_Vector_base.23.22" = type { %"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl.21" }
+%"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl.21" = type { i32*, i32*, i32* }
+%"class.std::vector.27.26" = type { %"struct.std::_Vector_base.28.25" }
+%"struct.std::_Vector_base.28.25" = type { %"struct.std::_Vector_base<unsigned char, std::allocator<unsigned char> >::_Vector_impl.24" }
+%"struct.std::_Vector_base<unsigned char, std::allocator<unsigned char> >::_Vector_impl.24" = type { i8*, i8*, i8* }
+%"class.std::vector.32.29" = type { %"struct.std::_Vector_base.33.28" }
+%"struct.std::_Vector_base.33.28" = type { %"struct.std::_Vector_base<void *, std::allocator<void *> >::_Vector_impl.27" }
+%"struct.std::_Vector_base<void *, std::allocator<void *> >::_Vector_impl.27" = type { i8**, i8**, i8** }
+%"struct.TriangulationLevel<2>::QuadsData.36" = type { %"class.std::vector.37.35", %"class.std::vector.22.23", %"class.std::vector.3.8", %"class.std::vector.3.8", %"class.std::vector.27.26", %"class.std::vector.32.29" }
+%"class.std::vector.37.35" = type { %"struct.std::_Vector_base.38.34" }
+%"struct.std::_Vector_base.38.34" = type { %"struct.std::_Vector_base<Quad, std::allocator<Quad> >::_Vector_impl.33" }
+%"struct.std::_Vector_base<Quad, std::allocator<Quad> >::_Vector_impl.33" = type { %class.Quad.32*, %class.Quad.32*, %class.Quad.32* }
+%class.Quad.32 = type { [4 x i32] }
+%"struct.TriangulationLevel<3>::HexesData.42" = type { %"class.std::vector.42.41", %"class.std::vector.22.23", %"class.std::vector.3.8", %"class.std::vector.3.8", %"class.std::vector.27.26", %"class.std::vector.32.29", %"class.std::vector.3.8" }
+%"class.std::vector.42.41" = type { %"struct.std::_Vector_base.43.40" }
+%"struct.std::_Vector_base.43.40" = type { %"struct.std::_Vector_base<Hexahedron, std::allocator<Hexahedron> >::_Vector_impl.39" }
+%"struct.std::_Vector_base<Hexahedron, std::allocator<Hexahedron> >::_Vector_impl.39" = type { %class.Hexahedron.38*, %class.Hexahedron.38*, %class.Hexahedron.38* }
+%class.Hexahedron.38= type { [6 x i32] }
+%"class.std::vector" = type { %"struct.std::_Vector_base.48.48" }
+%"struct.std::_Vector_base.48.48" = type { %"struct.std::_Vector_base<Point<3>, std::allocator<Point<3> > >::_Vector_impl.47" }
+%"struct.std::_Vector_base<Point<3>, std::allocator<Point<3> > >::_Vector_impl.47" = type { %class.Point.1*, %class.Point.1*, %class.Point.1* }
+%"class.std::vector.3.8" = type { %"struct.std::_Bvector_base.7" }
+%"struct.std::_Bvector_base.7" = type { %"struct.std::_Bvector_base<std::allocator<bool> >::_Bvector_impl.6" }
+%"struct.std::_Bvector_base<std::allocator<bool> >::_Bvector_impl.6" = type { %"struct.std::_Bit_iterator.5", %"struct.std::_Bit_iterator.5", i64* }
+%"struct.std::_Bit_iterator.5" = type { %"struct.std::_Bit_iterator_base.base.4", [4 x i8] }
+%"struct.std::_Bit_iterator_base.base.4" = type <{ i64*, i32 }>
+%class.Boundary.50 = type opaque
+%struct.TriaNumberCache.54 = type { %struct.TriaNumberCache.52.52, i32, %"class.std::vector.12.15", i32, %"class.std::vector.12.15" }
+%struct.TriaNumberCache.52.52 = type { %struct.TriaNumberCache.53.51, i32, %"class.std::vector.12.15", i32, %"class.std::vector.12.15" }
+%struct.TriaNumberCache.53.51 = type { i32, %"class.std::vector.12.15", i32, %"class.std::vector.12.15" }
+
+define void @_ZNK18TriaObjectAccessorILi3ELi3EE10barycenterEv(%class.Point.1* noalias nocapture sret %agg.result, %class.TriaObjectAccessor.57* %this) #0 align 2 {
+entry:                                            ; reproducer for PR18663 (per the file name); the whole body is this single block
+  %0 = load double* null, align 8  ; the nine loads %0-%8 read from null/undef addresses; only their results are used below
+  %1 = load double* undef, align 8
+  %call18 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 6)  ; none of the six %call* results is referenced again
+  %2 = load double* undef, align 8
+  %call21 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 7)
+  %3 = load double* undef, align 8
+  %call33 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 3)
+  %4 = load double* null, align 8
+  %5 = load double* undef, align 8
+  %call45 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 7)
+  %6 = load double* undef, align 8
+  %call48 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 0)
+  %7 = load double* undef, align 8
+  %call66 = tail call dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57* %this, i32 zeroext 6)
+  %8 = load double* undef, align 8
+  %mul334 = fmul double undef, 2.000000e+00  ; shared product terms, consumed later by the accumulation chain
+  %mul579 = fmul double %2, %5
+  %mul597 = fmul double undef, %mul579
+  %mul679 = fmul double %2, %8
+  %mul1307 = fmul double undef, %1
+  %mul2092 = fmul double undef, %4
+  %mul2679 = fmul double undef, undef
+  %mul2931 = fmul double undef, %3
+  %mul3094 = fmul double undef, %3
+  %mul3096 = fmul double %mul3094, %8
+  %sub3097 = fsub double 0.000000e+00, %mul3096  ; head of a serial fadd/fsub accumulation: each add*/sub* consumes the previous one, ending at %sub4051
+  %add3105 = fadd double undef, %sub3097
+  %add3113 = fadd double 0.000000e+00, %add3105
+  %sub3121 = fsub double %add3113, undef
+  %sub3129 = fsub double %sub3121, undef
+  %add3137 = fadd double undef, %sub3129
+  %add3145 = fadd double undef, %add3137
+  %sub3153 = fsub double %add3145, undef
+  %sub3162 = fsub double %sub3153, 0.000000e+00
+  %add3171 = fadd double undef, %sub3162
+  %add3180 = fadd double undef, %add3171
+  %add3189 = fadd double 0.000000e+00, %add3180
+  %mul3197 = fmul double %4, %mul2679
+  %sub3198 = fsub double %add3189, %mul3197
+  %sub3207 = fsub double %sub3198, 0.000000e+00
+  %mul3212 = fmul double %2, undef
+  %mul3214 = fmul double %mul3212, undef
+  %sub3215 = fsub double %sub3207, %mul3214
+  %mul3222 = fmul double %5, 0.000000e+00
+  %sub3223 = fsub double %sub3215, %mul3222
+  %mul3228 = fmul double %2, undef
+  %mul3230 = fmul double %3, %mul3228
+  %add3231 = fadd double %mul3230, %sub3223
+  %mul3236 = fmul double undef, undef
+  %mul3238 = fmul double %mul3236, %8
+  %add3239 = fadd double %mul3238, %add3231
+  %mul3244 = fmul double %mul1307, %3
+  %mul3246 = fmul double %mul3244, %7
+  %sub3247 = fsub double %add3239, %mul3246
+  %mul3252 = fmul double undef, undef
+  %mul3254 = fmul double %mul3252, %7
+  %add3255 = fadd double %mul3254, %sub3247
+  %sub3263 = fsub double %add3255, undef
+  %add3271 = fadd double 0.000000e+00, %sub3263
+  %sub3279 = fsub double %add3271, undef
+  %sub3287 = fsub double %sub3279, undef
+  %mul3292 = fmul double %mul1307, %5
+  %mul3294 = fmul double %mul3292, undef
+  %add3295 = fadd double %mul3294, %sub3287
+  %add3303 = fadd double undef, %add3295
+  %add3311 = fadd double 0.000000e+00, %add3303
+  %mul3318 = fmul double undef, %7
+  %sub3319 = fsub double %add3311, %mul3318
+  %mul3326 = fmul double %4, %mul3228
+  %sub3327 = fsub double %sub3319, %mul3326
+  %mul3334 = fmul double undef, %8
+  %sub3335 = fsub double %sub3327, %mul3334
+  %add3343 = fadd double undef, %sub3335
+  %mul3350 = fmul double %mul3212, %7
+  %add3351 = fadd double %mul3350, %add3343
+  %mul3358 = fmul double %mul2092, undef
+  %sub3359 = fsub double %add3351, %mul3358
+  %mul3362 = fmul double undef, %1
+  %mul3366 = fmul double 0.000000e+00, %8
+  %add3367 = fadd double %mul3366, %sub3359
+  %mul3372 = fmul double %mul3362, %5
+  %sub3375 = fsub double %add3367, undef
+  %add3383 = fadd double undef, %sub3375
+  %mul3389 = fmul double %2, 0.000000e+00
+  %mul3391 = fmul double %4, %mul3389
+  %sub3392 = fsub double %add3383, %mul3391
+  %mul3396 = fmul double undef, 0.000000e+00
+  %mul3400 = fmul double undef, %7
+  %sub3401 = fsub double %sub3392, %mul3400
+  %mul3407 = fmul double %mul3396, %4
+  %mul3409 = fmul double %mul3407, %8
+  %add3410 = fadd double %mul3409, %sub3401
+  %add3419 = fadd double undef, %add3410
+  %mul3423 = fmul double undef, %mul334
+  %add3428 = fadd double undef, %add3419
+  %add3437 = fadd double undef, %add3428
+  %mul3443 = fmul double %mul3423, %3
+  %mul3445 = fmul double %mul3443, %8
+  %sub3446 = fsub double %add3437, %mul3445
+  %mul3453 = fmul double %mul3372, undef
+  %add3454 = fadd double %mul3453, %sub3446
+  %add3462 = fadd double 0.000000e+00, %add3454
+  %mul3467 = fmul double %mul3362, %3
+  %mul3469 = fmul double %mul3467, %8
+  %sub3470 = fsub double %add3462, %mul3469
+  %add3478 = fadd double 0.000000e+00, %sub3470
+  %sub3486 = fsub double %add3478, undef
+  %mul3490 = fmul double %mul334, 0.000000e+00
+  %mul3492 = fmul double %2, %mul3490
+  %mul3494 = fmul double %mul3492, undef
+  %sub3495 = fsub double %sub3486, %mul3494
+  %sub3503 = fsub double %sub3495, undef
+  %sub3512 = fsub double %sub3503, undef
+  %add3520 = fadd double undef, %sub3512
+  %sub3528 = fsub double %add3520, undef
+  %add3537 = fadd double undef, %sub3528
+  %add3545 = fadd double 0.000000e+00, %add3537
+  %sub3553 = fsub double %add3545, undef
+  %add3561 = fadd double undef, %sub3553
+  %sub3569 = fsub double %add3561, undef
+  %mul3574 = fmul double undef, undef
+  %mul3576 = fmul double %mul3574, %7
+  %add3577 = fadd double %mul3576, %sub3569
+  %sub3585 = fsub double %add3577, undef
+  %mul3592 = fmul double %4, undef
+  %sub3593 = fsub double %sub3585, %mul3592
+  %mul3598 = fmul double %2, undef
+  %mul3600 = fmul double %mul3598, %7
+  %add3601 = fadd double %mul3600, %sub3593
+  %mul3608 = fmul double %mul3598, undef
+  %sub3609 = fsub double %add3601, %mul3608
+  %sub3618 = fsub double %sub3609, undef
+  %add3627 = fadd double undef, %sub3618
+  %add3635 = fadd double undef, %add3627
+  %mul3638 = fmul double undef, %2
+  %mul3640 = fmul double %mul3638, %5
+  %mul3642 = fmul double %mul3640, %7
+  %sub3643 = fsub double %add3635, %mul3642
+  %mul3648 = fmul double %1, undef
+  %mul3650 = fmul double %mul3648, %8
+  %sub3651 = fsub double %sub3643, %mul3650
+  %mul3656 = fmul double %mul3638, %4
+  %mul3658 = fmul double %mul3656, %8
+  %add3659 = fadd double %mul3658, %sub3651
+  %mul3666 = fmul double %5, 0.000000e+00
+  %add3667 = fadd double %mul3666, %add3659
+  %sub3675 = fsub double %add3667, undef
+  %mul3680 = fmul double %mul3638, %3
+  %mul3682 = fmul double %mul3680, %8
+  %sub3683 = fsub double %sub3675, %mul3682
+  %add3692 = fadd double 0.000000e+00, %sub3683
+  %mul3696 = fmul double undef, undef
+  %mul3698 = fmul double %mul3696, %4
+  %mul3700 = fmul double %mul3698, %8
+  %add3701 = fadd double %mul3700, %add3692
+  %sub3710 = fsub double %add3701, undef
+  %mul3716 = fmul double undef, %3
+  %mul3718 = fmul double %mul3716, %8
+  %sub3719 = fsub double %sub3710, %mul3718
+  %add3727 = fadd double undef, %sub3719
+  %mul3734 = fmul double %mul3574, %8
+  %add3735 = fadd double %mul3734, %add3727
+  %sub3743 = fsub double %add3735, 0.000000e+00
+  %add3751 = fadd double 0.000000e+00, %sub3743
+  %mul3758 = fmul double %6, 0.000000e+00
+  %sub3759 = fsub double %add3751, %mul3758
+  %mul3764 = fmul double undef, %mul2931
+  %mul3766 = fmul double %mul3764, undef
+  %sub3767 = fsub double %sub3759, %mul3766
+  %add3775 = fadd double 0.000000e+00, %sub3767
+  %add3783 = fadd double undef, %add3775
+  %sub3791 = fsub double %add3783, 0.000000e+00
+  %add3799 = fadd double undef, %sub3791
+  %sub3807 = fsub double %add3799, undef
+  %mul3814 = fmul double 0.000000e+00, undef
+  %add3815 = fadd double %mul3814, %sub3807
+  %mul3822 = fmul double %mul597, undef
+  %sub3823 = fsub double %add3815, %mul3822
+  %add3831 = fadd double undef, %sub3823
+  %mul3836 = fmul double undef, %mul679
+  %mul3838 = fmul double %6, %mul3836
+  %sub3839 = fsub double %add3831, %mul3838
+  %add3847 = fadd double undef, %sub3839
+  %add3855 = fadd double undef, %add3847
+  %mul3858 = fmul double undef, %8
+  %mul3860 = fmul double undef, %mul3858
+  %mul3862 = fmul double %6, %mul3860
+  %sub3863 = fsub double %add3855, %mul3862
+  %add3872 = fadd double undef, %sub3863
+  %sub3880 = fsub double %add3872, undef
+  %sub3889 = fsub double %sub3880, undef
+  %sub3898 = fsub double %sub3889, undef
+  %add3907 = fadd double undef, %sub3898
+  %sub3915 = fsub double %add3907, 0.000000e+00
+  %add3923 = fadd double undef, %sub3915
+  %mul3930 = fmul double %3, undef
+  %add3931 = fadd double %mul3930, %add3923
+  %add3940 = fadd double undef, %add3931
+  %sub3949 = fsub double %add3940, undef
+  %mul3952 = fmul double %2, %3
+  %sub3957 = fsub double %sub3949, undef
+  %sub3966 = fsub double %sub3957, undef
+  %add3975 = fadd double undef, %sub3966
+  %add3983 = fadd double undef, %add3975
+  %sub3992 = fsub double %add3983, undef
+  %mul3997 = fmul double undef, %mul3952
+  %mul3999 = fmul double %mul3997, %8
+  %add4000 = fadd double %mul3999, %sub3992
+  %sub4008 = fsub double %add4000, undef
+  %add4017 = fadd double undef, %sub4008
+  %add4026 = fadd double 0.000000e+00, %add4017
+  %mul4034 = fmul double %6, undef
+  %sub4035 = fsub double %add4026, %mul4034
+  %add4043 = fadd double undef, %sub4035
+  %sub4051 = fsub double %add4043, 0.000000e+00
+  %mul4916 = fmul double 0.000000e+00, %sub4051  ; end of the first chain: its final value is scaled into the first stored result
+  %mul4917 = fmul double %mul4916, 0x3FC5555555555555  ; 0x3FC5555555555555 is the double nearest to 1/6
+  %mul7317 = fmul double 0.000000e+00, %3  ; second, much shorter chain; it uses %0 and %3 but nothing from the first chain
+  %mul7670 = fmul double %0, %mul7317
+  %mul8882 = fmul double %0, 0.000000e+00
+  %mul8884 = fmul double undef, %mul8882
+  %sub8885 = fsub double 0.000000e+00, %mul8884
+  %mul8892 = fmul double %mul7670, undef
+  %add8893 = fadd double %mul8892, %sub8885
+  %mul8900 = fmul double undef, undef
+  %add8901 = fadd double %mul8900, %add8893
+  %mul9767 = fmul double 0.000000e+00, %add8901
+  %mul9768 = fmul double %mul9767, 0x3FC5555555555555
+  store double %mul4917, double* undef, align 8  ; both chain results are stored through undef pointers; %agg.result is never written
+  store double %mul9768, double* undef, align 8
+  ret void  ; the test's llc invocations have no FileCheck stage, so it passes when codegen completes
+}
+
+declare dereferenceable(24) %class.Point.1* @_ZNK18TriaObjectAccessorILi3ELi3EE6vertexEj(%class.TriaObjectAccessor.57*, i32 zeroext) #0
+

diff --git a/test/CodeGen/PowerPC/pr20442.ll b/test/CodeGen/PowerPC/pr20442.ll
new file mode 100644
index 0000000..ad43a04
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr20442.ll

@@ -0,0 +1,79 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-linux-gnu"
+
+; This code would cause code generation like this after PPCCTRLoops ran:
+;  %indvar = phi i32 [ 0, %for.body ], [ %indvar.next, %if.then6 ]
+;  %j.1.ph13 = phi i32 [ %j.110, %if.then6 ], [ 0, %for.body ], [ 0, %for.body ]
+;  %c.0.ph12 = phi i32 [ %dec, %if.then6 ], [ %2, %for.body ], [ %2, %for.body ]
+; which would fail verification because the created induction variable does not
+; have as many predecessor entries as the other PHIs.
+; CHECK-LABEL: @fn1
+; CHECK: mtctr
+
+%struct.anon = type { i32 }
+%struct.anon.0 = type { i32 }
+
+@b = common global %struct.anon* null, align 4
+@a = common global %struct.anon.0* null, align 4
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @fn1() #0 {  ; loop nest over the contents of @a and @b; the test expects llc to emit mtctr for it
+entry:
+  %0 = load %struct.anon** @b, align 4
+  %1 = ptrtoint %struct.anon* %0 to i32  ; the pointer loaded from @b, as an integer, is the initial value of the bound %c.0.ph12
+  %cmp = icmp sgt %struct.anon* %0, null
+  %2 = load %struct.anon.0** @a, align 4
+  br i1 %cmp, label %for.bodythread-pre-split, label %if.end8
+
+for.bodythread-pre-split:                         ; preds = %entry
+  %aclass = getelementptr inbounds %struct.anon.0* %2, i32 0, i32 0
+  %.pr = load i32* %aclass, align 4  ; loaded once here; for.body switches on this same value on every trip
+  br label %for.body
+
+for.body:                                         ; preds = %for.bodythread-pre-split, %for.body
+  switch i32 %.pr, label %for.body [
+    i32 0, label %while.body.lr.ph.preheader
+    i32 2, label %while.body.lr.ph.preheader
+  ]
+
+while.body.lr.ph.preheader:                       ; preds = %for.body, %for.body
+  br label %while.body.lr.ph  ; reached by two switch edges from the same block: the duplicated-predecessor case the file header describes
+
+while.body.lr.ph:                                 ; preds = %while.body.lr.ph.preheader, %if.then6
+  %j.1.ph13 = phi i32 [ %j.110.lcssa, %if.then6 ], [ 0, %while.body.lr.ph.preheader ]
+  %c.0.ph12 = phi i32 [ %dec, %if.then6 ], [ %1, %while.body.lr.ph.preheader ]
+  br label %while.body
+
+while.cond:                                       ; preds = %while.body
+  %cmp2 = icmp slt i32 %inc7, %c.0.ph12  ; inner loop continues while the incremented index is below the current bound
+  br i1 %cmp2, label %while.body, label %if.end8.loopexit
+
+while.body:                                       ; preds = %while.body.lr.ph, %while.cond
+  %j.110 = phi i32 [ %j.1.ph13, %while.body.lr.ph ], [ %inc7, %while.cond ]
+  %aclass_index = getelementptr inbounds %struct.anon* %0, i32 %j.110, i32 0
+  %3 = load i32* %aclass_index, align 4  ; value read from the @b array is used as the index into the @a array
+  %aclass5 = getelementptr inbounds %struct.anon.0* %2, i32 %3, i32 0
+  %4 = load i32* %aclass5, align 4
+  %tobool = icmp eq i32 %4, 0
+  %inc7 = add nsw i32 %j.110, 1
+  br i1 %tobool, label %while.cond, label %if.then6
+
+if.then6:                                         ; preds = %while.body
+  %j.110.lcssa = phi i32 [ %j.110, %while.body ]
+  %dec = add nsw i32 %c.0.ph12, -1  ; shrink the bound by one before re-entering the outer loop
+  %cmp29 = icmp slt i32 %j.110.lcssa, %dec
+  br i1 %cmp29, label %while.body.lr.ph, label %if.end8.loopexit17
+
+if.end8.loopexit:                                 ; preds = %while.cond
+  br label %if.end8
+
+if.end8.loopexit17:                               ; preds = %if.then6
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.end8.loopexit17, %if.end8.loopexit, %entry
+  ret i32 undef  ; the function returns undef on every path
+}
+
+attributes #0 = { nounwind readonly uwtable }
+

diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll
index 891e801..cd77548 100644
--- a/test/CodeGen/PowerPC/recipest.ll
+++ b/test/CodeGen/PowerPC/recipest.ll

@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck -check-prefix=CHECK-SAFE %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck -check-prefix=CHECK-SAFE %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -8,7 +8,6 @@
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
 
 define double @foo(double %a, double %b) nounwind {
-entry:
   %x = call double @llvm.sqrt.f64(double %b)
   %r = fdiv double %a, %x
   ret double %r
@@ -17,12 +16,12 @@
 ; CHECK-DAG: frsqrte
 ; CHECK-DAG: fnmsub
 ; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fmul
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
 ; CHECK: blr
 
 ; CHECK-SAFE: @foo
@@ -32,7 +31,6 @@
 }
 
 define double @foof(double %a, float %b) nounwind {
-entry:
   %x = call float @llvm.sqrt.f32(float %b)
   %y = fpext float %x to double
   %r = fdiv double %a, %y
@@ -42,10 +40,10 @@
 ; CHECK-DAG: frsqrtes
 ; CHECK-DAG: fnmsubs
 ; CHECK: fmuls
-; CHECK: fmadds
-; CHECK: fmuls
-; CHECK: fmul
-; CHECK: blr
+; CHECK-NEXT: fmadds
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: fmul
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @foof
 ; CHECK-SAFE: fsqrts
@@ -54,7 +52,6 @@
 }
 
 define float @food(float %a, double %b) nounwind {
-entry:
   %x = call double @llvm.sqrt.f64(double %b)
   %y = fptrunc double %x to float
   %r = fdiv float %a, %y
@@ -64,14 +61,14 @@
 ; CHECK-DAG: frsqrte
 ; CHECK-DAG: fnmsub
 ; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: frsp
-; CHECK: fmuls
-; CHECK: blr
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: frsp
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @foo
 ; CHECK-SAFE: fsqrt
@@ -80,7 +77,6 @@
 }
 
 define float @goo(float %a, float %b) nounwind {
-entry:
   %x = call float @llvm.sqrt.f32(float %b)
   %r = fdiv float %a, %x
   ret float %r
@@ -89,10 +85,10 @@
 ; CHECK-DAG: frsqrtes
 ; CHECK-DAG: fnmsubs
 ; CHECK: fmuls
-; CHECK: fmadds
-; CHECK: fmuls
-; CHECK: fmuls
-; CHECK: blr
+; CHECK-NEXT: fmadds
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @goo
 ; CHECK-SAFE: fsqrts
@@ -100,8 +96,35 @@
 ; CHECK-SAFE: blr
 }
 
+; Recognize that this is rsqrt(a) * rcp(b) * c, 
+; not 1 / ( 1 / sqrt(a)) * rcp(b) * c.
+define float @rsqrt_fmul(float %a, float %b, float %c) {  ; computes c / (sqrt(a) * b)
+  %x = call float @llvm.sqrt.f32(float %a)  ; x = sqrt(a)
+  %y = fmul float %x, %b 
+  %z = fdiv float %c, %y  ; z = c / y; with unsafe math the directives below expect estimate instructions (frsqrtes, fres) instead of fsqrts/fdivs
+  ret float %z
+
+; CHECK: @rsqrt_fmul
+; CHECK-DAG: frsqrtes
+; CHECK-DAG: fres
+; CHECK-DAG: fnmsubs
+; CHECK-DAG: fmuls
+; CHECK-DAG: fnmsubs
+; CHECK-DAG: fmadds
+; CHECK-DAG: fmadds
+; CHECK: fmuls
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: blr
+
+; CHECK-SAFE: @rsqrt_fmul
+; CHECK-SAFE: fsqrts
+; CHECK-SAFE: fmuls
+; CHECK-SAFE: fdivs
+; CHECK-SAFE: blr
+}
+
 define <4 x float> @hoo(<4 x float> %a, <4 x float> %b) nounwind {
-entry:
   %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b)
   %r = fdiv <4 x float> %a, %x
   ret <4 x float> %r
@@ -115,7 +138,6 @@
 }
 
 define double @foo2(double %a, double %b) nounwind {
-entry:
   %r = fdiv double %a, %b
   ret double %r
 
@@ -123,10 +145,10 @@
 ; CHECK-DAG: fre
 ; CHECK-DAG: fnmsub
 ; CHECK: fmadd
-; CHECK: fnmsub
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: blr
+; CHECK-NEXT: fnmsub
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @foo2
 ; CHECK-SAFE: fdiv
@@ -134,7 +156,6 @@
 }
 
 define float @goo2(float %a, float %b) nounwind {
-entry:
   %r = fdiv float %a, %b
   ret float %r
 
@@ -142,8 +163,8 @@
 ; CHECK-DAG: fres
 ; CHECK-DAG: fnmsubs
 ; CHECK: fmadds
-; CHECK: fmuls
-; CHECK: blr
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: blr
 
 ; CHECK-SAFE: @goo2
 ; CHECK-SAFE: fdivs
@@ -151,7 +172,6 @@
 }
 
 define <4 x float> @hoo2(<4 x float> %a, <4 x float> %b) nounwind {
-entry:
   %r = fdiv <4 x float> %a, %b
   ret <4 x float> %r
 
@@ -164,7 +184,6 @@
 }
 
 define double @foo3(double %a) nounwind {
-entry:
   %r = call double @llvm.sqrt.f64(double %a)
   ret double %r
 
@@ -173,16 +192,12 @@
 ; CHECK-DAG: frsqrte
 ; CHECK-DAG: fnmsub
 ; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fmul
-; CHECK: fmadd
-; CHECK: fmul
-; CHECK: fre
-; CHECK: fnmsub
-; CHECK: fmadd
-; CHECK: fnmsub
-; CHECK: fmadd
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmadd
+; CHECK-NEXT: fmul
+; CHECK-NEXT: fmul
 ; CHECK: blr
 
 ; CHECK-SAFE: @foo3
@@ -191,7 +206,6 @@
 }
 
 define float @goo3(float %a) nounwind {
-entry:
   %r = call float @llvm.sqrt.f32(float %a)
   ret float %r
 
@@ -200,11 +214,9 @@
 ; CHECK-DAG: frsqrtes
 ; CHECK-DAG: fnmsubs
 ; CHECK: fmuls
-; CHECK: fmadds
-; CHECK: fmuls
-; CHECK: fres
-; CHECK: fnmsubs
-; CHECK: fmadds
+; CHECK-NEXT: fmadds
+; CHECK-NEXT: fmuls
+; CHECK-NEXT: fmuls
 ; CHECK: blr
 
 ; CHECK-SAFE: @goo3
@@ -213,13 +225,11 @@
 }
 
 define <4 x float> @hoo3(<4 x float> %a) nounwind {
-entry:
   %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
   ret <4 x float> %r
 
 ; CHECK: @hoo3
 ; CHECK: vrsqrtefp
-; CHECK-DAG: vrefp
 ; CHECK-DAG: vcmpeqfp
 
 ; CHECK-SAFE: @hoo3

diff --git a/test/CodeGen/PowerPC/resolvefi-disp.ll b/test/CodeGen/PowerPC/resolvefi-disp.ll
new file mode 100644
index 0000000..ca42bcd
--- /dev/null
+++ b/test/CodeGen/PowerPC/resolvefi-disp.ll

@@ -0,0 +1,71 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -print-after=localstackalloc <%s >%t 2>&1 && FileCheck <%t %s
+
+; Due to a bug in isFrameOffsetLegal we ended up with resolveFrameIndex creating
+; addresses with out-of-range displacements.  Verify that this no longer happens.
+; CHECK-NOT: LD {{3276[8-9]}}
+; CHECK-NOT: LD {{327[7-9][0-9]}}
+; CHECK-NOT: LD {{32[8-9][0-9][0-9]}}
+; CHECK-NOT: LD {{3[3-9][0-9][0-9][0-9]}}
+; CHECK-NOT: LD {{[4-9][0-9][0-9][0-9][0-9]}}
+; CHECK-NOT: LD {{[1-9][0-9][0-9][0-9][0-9][0-9]+}}
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+%struct.S2760 = type { <2 x float>, %struct.anon, i32, [28 x i8] }
+%struct.anon = type { [11 x %struct.anon.0], i64, [6 x { i64, i64 }], [24 x i8] }
+%struct.anon.0 = type { [30 x %union.U4DI], i8, [0 x i16], [30 x i8] }
+%union.U4DI = type { <4 x i64> }
+
+@s2760 = external global %struct.S2760
+@fails = external global i32
+
+define void @check2760(%struct.S2760* noalias sret %agg.result, %struct.S2760* byval align 16, %struct.S2760* %arg1, %struct.S2760* byval align 16) {
+entry:
+  %arg0 = alloca %struct.S2760, align 32  ; receives a copy of the first byval argument (%0)
+  %arg2 = alloca %struct.S2760, align 32  ; receives a copy of the second byval argument (%1)
+  %arg1.addr = alloca %struct.S2760*, align 8
+  %ret = alloca %struct.S2760, align 32  ; zero-filled, then copied out through %agg.result at the end
+  %b1 = alloca %struct.S2760, align 32  ; %b1 and %b2 are only zero-filled, never read
+  %b2 = alloca %struct.S2760, align 32  ; five %struct.S2760 locals in total, each initialised with an 11104-byte memcpy/memset
+  %2 = bitcast %struct.S2760* %arg0 to i8*
+  %3 = bitcast %struct.S2760* %0 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 11104, i32 16, i1 false)  ; 11104 is presumably sizeof(%struct.S2760) -- TODO confirm against the datalayout
+  %4 = bitcast %struct.S2760* %arg2 to i8*
+  %5 = bitcast %struct.S2760* %1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 11104, i32 16, i1 false)
+  store %struct.S2760* %arg1, %struct.S2760** %arg1.addr, align 8
+  %6 = bitcast %struct.S2760* %ret to i8*
+  call void @llvm.memset.p0i8.i64(i8* %6, i8 0, i64 11104, i32 32, i1 false)
+  %7 = bitcast %struct.S2760* %b1 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %7, i8 0, i64 11104, i32 32, i1 false)
+  %8 = bitcast %struct.S2760* %b2 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %8, i8 0, i64 11104, i32 32, i1 false)
+  %b = getelementptr inbounds %struct.S2760* %arg0, i32 0, i32 1  ; field 1 of %struct.S2760 is the nested %struct.anon
+  %g = getelementptr inbounds %struct.anon* %b, i32 0, i32 1  ; field 1 of %struct.anon is the i64 that follows the [11 x %struct.anon.0] array
+  %9 = load i64* %g, align 8
+  %10 = load i64* getelementptr inbounds (%struct.S2760* @s2760, i32 0, i32 1, i32 1), align 8  ; same field, read from the global @s2760
+  %cmp = icmp ne i64 %9, %10
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %11 = load i32* @fails, align 4
+  %inc = add nsw i32 %11, 1  ; count the mismatch in @fails
+  store i32 %inc, i32* @fails, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %12 = load i64* getelementptr inbounds (%struct.S2760* @s2760, i32 0, i32 1, i32 1), align 8
+  %b3 = getelementptr inbounds %struct.S2760* %ret, i32 0, i32 1
+  %g4 = getelementptr inbounds %struct.anon* %b3, i32 0, i32 1
+  store i64 %12, i64* %g4, align 8  ; copy that i64 field from @s2760 into %ret
+  %13 = bitcast %struct.S2760* %agg.result to i8*
+  %14 = bitcast %struct.S2760* %ret to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %13, i8* %14, i64 11104, i32 32, i1 false)  ; hand %ret back through the sret pointer
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+

diff --git a/test/CodeGen/PowerPC/rounding-ops.ll b/test/CodeGen/PowerPC/rounding-ops.ll
index bf0a641..42f1236 100644
--- a/test/CodeGen/PowerPC/rounding-ops.ll
+++ b/test/CodeGen/PowerPC/rounding-ops.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -8,6 +9,8 @@
 
 ; CHECK-LABEL: test1:
 ; CHECK: frim 1, 1
+; CHECK-VSX-LABEL: test1:
+; CHECK-VSX: frim 1, 1
 }
 
 declare float @floorf(float) nounwind readnone
@@ -18,6 +21,8 @@
 
 ; CHECK-LABEL: test2:
 ; CHECK: frim 1, 1
+; CHECK-VSX-LABEL: test2:
+; CHECK-VSX: xsrdpim 1, 1
 }
 
 declare double @floor(double) nounwind readnone
@@ -28,6 +33,8 @@
 
 ; CHECK-LABEL: test3:
 ; CHECK: frin 1, 1
+; CHECK-VSX-LABEL: test3:
+; CHECK-VSX: frin 1, 1
 }
 
 declare float @roundf(float) nounwind readnone
@@ -38,6 +45,8 @@
 
 ; CHECK-LABEL: test4:
 ; CHECK: frin 1, 1
+; CHECK-VSX-LABEL: test4:
+; CHECK-VSX: xsrdpi 1, 1
 }
 
 declare double @round(double) nounwind readnone
@@ -48,6 +57,8 @@
 
 ; CHECK-LABEL: test5:
 ; CHECK: frip 1, 1
+; CHECK-VSX-LABEL: test5:
+; CHECK-VSX: frip 1, 1
 }
 
 declare float @ceilf(float) nounwind readnone
@@ -58,6 +69,8 @@
 
 ; CHECK-LABEL: test6:
 ; CHECK: frip 1, 1
+; CHECK-VSX-LABEL: test6:
+; CHECK-VSX: xsrdpip 1, 1
 }
 
 declare double @ceil(double) nounwind readnone
@@ -68,6 +81,8 @@
 
 ; CHECK-LABEL: test9:
 ; CHECK: friz 1, 1
+; CHECK-VSX-LABEL: test9:
+; CHECK-VSX: friz 1, 1
 }
 
 declare float @truncf(float) nounwind readnone
@@ -78,6 +93,8 @@
 
 ; CHECK-LABEL: test10:
 ; CHECK: friz 1, 1
+; CHECK-VSX-LABEL: test10:
+; CHECK-VSX: xsrdpiz 1, 1
 }
 
 declare double @trunc(double) nounwind readnone

diff --git a/test/CodeGen/PowerPC/sections.ll b/test/CodeGen/PowerPC/sections.ll
index 0ff4a89..37a8d16 100644
--- a/test/CodeGen/PowerPC/sections.ll
+++ b/test/CodeGen/PowerPC/sections.ll

@@ -5,4 +5,3 @@
 
 ; CHECK:  .section  .bss,"aw",@nobits
 ; CHECK:  .globl A
-

diff --git a/test/CodeGen/PowerPC/split-index-tc.ll b/test/CodeGen/PowerPC/split-index-tc.ll
new file mode 100644
index 0000000..03aff24
--- /dev/null
+++ b/test/CodeGen/PowerPC/split-index-tc.ll

@@ -0,0 +1,82 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%"class.llvm::MachineOperand" = type { i8, [3 x i8], i64, i64*, i64 }
+
+; Function Attrs: nounwind
+define void @_ZN4llvm17ScheduleDAGInstrs14addPhysRegDepsEPNS_5SUnitEj() #0 align 2 {
+
+; If we were able to split out the indexing, the load with update should be
+; removed (resulting in a nearly-empty output).
+; CHECK-LABEL: @_ZN4llvm17ScheduleDAGInstrs14addPhysRegDepsEPNS_5SUnitEj
+; CHECK-NOT: lhzu
+
+entry:
+  %0 = load %"class.llvm::MachineOperand"** undef, align 8
+  br i1 undef, label %_ZNK4llvm14MachineOperand6getRegEv.exit, label %cond.false.i123
+
+cond.false.i123:                                  ; preds = %_ZN4llvm12MachineInstr10getOperandEj.exit
+  unreachable
+
+_ZNK4llvm14MachineOperand6getRegEv.exit:          ; preds = %_ZN4llvm12MachineInstr10getOperandEj.exit
+  %IsDef.i = getelementptr inbounds %"class.llvm::MachineOperand"* %0, i64 undef, i32 1
+  %1 = bitcast [3 x i8]* %IsDef.i to i24*
+  %bf.load.i = load i24* %1, align 1
+  %2 = and i24 %bf.load.i, 128
+  br i1 undef, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %_ZNK4llvm14MachineOperand6getRegEv.exit
+  %3 = zext i24 %2 to i32
+  br i1 undef, label %cond.false.i134, label %_ZNK4llvm18MCRegAliasIteratordeEv.exit
+
+for.cond.cleanup:                                 ; preds = %_ZNK4llvm14MachineOperand6getRegEv.exit
+  br i1 undef, label %_ZNK4llvm14MachineOperand5isDefEv.exit, label %cond.false.i129
+
+cond.false.i129:                                  ; preds = %for.cond.cleanup
+  unreachable
+
+_ZNK4llvm14MachineOperand5isDefEv.exit:           ; preds = %for.cond.cleanup
+  br i1 undef, label %_ZNK4llvm14MachineOperand6getRegEv.exit247, label %cond.false.i244
+
+cond.false.i134:                                  ; preds = %for.body.lr.ph
+  unreachable
+
+_ZNK4llvm18MCRegAliasIteratordeEv.exit:           ; preds = %for.body.lr.ph
+  unreachable
+
+cond.false.i244:                                  ; preds = %_ZNK4llvm14MachineOperand5isDefEv.exit
+  unreachable
+
+_ZNK4llvm14MachineOperand6getRegEv.exit247:       ; preds = %_ZNK4llvm14MachineOperand5isDefEv.exit
+  br i1 undef, label %if.then53, label %if.end55
+
+if.then53:                                        ; preds = %_ZNK4llvm14MachineOperand6getRegEv.exit247
+  unreachable
+
+if.end55:                                         ; preds = %_ZNK4llvm14MachineOperand6getRegEv.exit247
+  br i1 undef, label %_ZNK4llvm14MachineOperand6isDeadEv.exit262, label %cond.false.i257
+
+cond.false.i257:                                  ; preds = %if.end55
+  unreachable
+
+_ZNK4llvm14MachineOperand6isDeadEv.exit262:       ; preds = %if.end55
+  %bf.load.i259 = load i24* %1, align 1
+  br i1 undef, label %if.then57, label %if.else59
+
+if.then57:                                        ; preds = %_ZNK4llvm14MachineOperand6isDeadEv.exit262
+  unreachable
+
+if.else59:                                        ; preds = %_ZNK4llvm14MachineOperand6isDeadEv.exit262
+  br i1 undef, label %if.end89, label %if.then62
+
+if.then62:                                        ; preds = %if.else59
+  unreachable
+
+if.end89:                                         ; preds = %if.else59
+  unreachable
+}
+
+attributes #0 = { nounwind }
+
+

diff --git a/test/CodeGen/PowerPC/stack-realign.ll b/test/CodeGen/PowerPC/stack-realign.ll
index 1c7a36a..a59fceb 100644
--- a/test/CodeGen/PowerPC/stack-realign.ll
+++ b/test/CodeGen/PowerPC/stack-realign.ll

@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
 ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -disable-fp-elim < %s | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu -disable-fp-elim < %s | FileCheck -check-prefix=CHECK-32 %s
+; RUN: llc -mtriple=powerpc-unknown-linux-gnu -disable-fp-elim -relocation-model=pic < %s | FileCheck -check-prefix=CHECK-32-PIC %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -7,6 +9,8 @@
 
 declare void @bar(i32*)
 
+@barbaz = external global i32
+
 define void @goo(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32], align 32
@@ -16,8 +20,9 @@
   store i32 %0, i32* %arrayidx, align 32
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
   %1 = load i32* %b, align 4
+  %2 = load i32* @barbaz, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4
+  store i32 %2, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx)
   ret void
 }
@@ -69,6 +74,24 @@
 ; CHECK-FP-DAG: mtlr 0
 ; CHECK-FP: blr
 
+; CHECK-32-LABEL: @goo
+; CHECK-32-DAG: mflr 0
+; CHECK-32-DAG: rlwinm [[REG:[0-9]+]], 1, 0, 27, 31
+; CHECK-32-DAG: stw 30, -8(1)
+; CHECK-32-DAG: mr 30, 1
+; CHECK-32-DAG: stw 0, 4(1)
+; CHECK-32-DAG: subfic 0, [[REG]], -64
+; CHECK-32: stwux 1, 1, 0
+
+; CHECK-32-PIC-LABEL: @goo
+; CHECK-32-PIC-DAG: mflr 0
+; CHECK-32-PIC-DAG: rlwinm [[REG:[0-9]+]], 1, 0, 27, 31
+; CHECK-32-PIC-DAG: stw 29, -12(1)
+; CHECK-32-PIC-DAG: mr 29, 1
+; CHECK-32-PIC-DAG: stw 0, 4(1)
+; CHECK-32-PIC-DAG: subfic 0, [[REG]], -64
+; CHECK-32-PIC: stwux 1, 1, 0
+
 ; The large-frame-size case.
 define void @hoo(%struct.s* byval nocapture readonly %a) {
 entry:
@@ -99,6 +122,34 @@
 
 ; CHECK: blr
 
+; CHECK-32-LABEL: @hoo
+
+; CHECK-32-DAG: lis [[REG1:[0-9]+]], -13
+; CHECK-32-DAG: rlwinm [[REG3:[0-9]+]], 1, 0, 27, 31
+; CHECK-32-DAG: mflr 0
+; CHECK-32-DAG: ori [[REG2:[0-9]+]], [[REG1]], 51904
+; CHECK-32-DAG: stw 30, -8(1)
+; CHECK-32-DAG: mr 30, 1
+; CHECK-32-DAG: stw 0, 4(1)
+; CHECK-32-DAG: subfc 0, [[REG3]], [[REG2]]
+; CHECK-32: stwux 1, 1, 0
+
+; CHECK-32: blr
+
+; CHECK-32-PIC-LABEL: @hoo
+; PIC variant of the large-frame checks: r29 is saved at -12(1) here, where the non-PIC run uses r30 at -8(1).
+; CHECK-32-PIC-DAG: lis [[REG1:[0-9]+]], -13
+; CHECK-32-PIC-DAG: rlwinm [[REG3:[0-9]+]], 1, 0, 27, 31
+; CHECK-32-PIC-DAG: mflr 0
+; CHECK-32-PIC-DAG: ori [[REG2:[0-9]+]], [[REG1]], 51904
+; CHECK-32-PIC-DAG: stw 29, -12(1)
+; CHECK-32-PIC-DAG: mr 29, 1
+; CHECK-32-PIC-DAG: stw 0, 4(1)
+; CHECK-32-PIC-DAG: subfc 0, [[REG3]], [[REG2]]
+; CHECK-32-PIC: stwux 1, 1, 0
+
+; CHECK-32-PIC: blr
+
 ; Make sure that the FP save area is still allocated correctly relative to
 ; where r30 is saved.
 define void @loo(%struct.s* byval nocapture readonly %a) {

diff --git a/test/CodeGen/PowerPC/subsumes-pred-regs.ll b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
index da637cd..c510e36 100644
--- a/test/CodeGen/PowerPC/subsumes-pred-regs.ll
+++ b/test/CodeGen/PowerPC/subsumes-pred-regs.ll

@@ -35,7 +35,7 @@
   br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49
 
 ; CHECK: .LBB0_7:
-; CHECK:	beq 1, .LBB0_10
+; CHECK:	bne 1, .LBB0_10
 ; CHECK:	beq 0, .LBB0_10
 ; CHECK: .LBB0_9:
 

diff --git a/test/CodeGen/PowerPC/tls-pic.ll b/test/CodeGen/PowerPC/tls-pic.ll
index 9f3ab6e..9ba3725 100644
--- a/test/CodeGen/PowerPC/tls-pic.ll
+++ b/test/CodeGen/PowerPC/tls-pic.ll

@@ -1,5 +1,7 @@
 ; RUN: llc -march=ppc64 -mcpu=pwr7 -O0 -relocation-model=pic < %s | FileCheck -check-prefix=OPT0 %s
 ; RUN: llc -march=ppc64 -mcpu=pwr7 -O1 -relocation-model=pic < %s | FileCheck -check-prefix=OPT1 %s
+; RUN: llc -march=ppc32 -O0 -relocation-model=pic < %s | FileCheck -check-prefix=OPT0-32 %s
+; RUN: llc -march=ppc32 -O1 -relocation-model=pic < %s | FileCheck -check-prefix=OPT1-32 %s
 
 target triple = "powerpc64-unknown-linux-gnu"
 ; Test correct assembly code generation for thread-local storage using
@@ -22,6 +24,16 @@
 ; OPT0-NEXT: nop
 ; OPT0:      addis [[REG2:[0-9]+]], 3, a@dtprel@ha
 ; OPT0-NEXT: addi {{[0-9]+}}, [[REG2]], a@dtprel@l
+; OPT0-32-LABEL: main
+; OPT0-32:        addi {{[0-9]+}}, {{[0-9]+}}, a@got@tlsld
+; OPT0-32:        bl __tls_get_addr(a@tlsld)@PLT
+; OPT0-32:        addis [[REG:[0-9]+]], 3, a@dtprel@ha
+; OPT0-32-NEXT:   addi  {{[0-9]+}}, [[REG]], a@dtprel@l
+; OPT1-32-LABEL: main
+; OPT1-32:        addi 3, {{[0-9]+}}, a@got@tlsld
+; OPT1-32:        bl __tls_get_addr(a@tlsld)@PLT
+; OPT1-32:        addis [[REG:[0-9]+]], 3, a@dtprel@ha
+; OPT1-32-NEXT:   addi  {{[0-9]+}}, [[REG]], a@dtprel@l
 
 ; Test peephole optimization for thread-local storage using the
 ; local dynamic model.
@@ -52,4 +64,6 @@
 ; OPT1-NEXT: addi 3, [[REG]], a2@got@tlsgd@l
 ; OPT1:      bl __tls_get_addr(a2@tlsgd)
 ; OPT1-NEXT: nop
-
+; OPT1-32-LABEL: main2
+; OPT1-32:        addi 3, {{[0-9]+}}, a2@got@tlsgd
+; OPT1-32:        bl __tls_get_addr(a2@tlsgd)@PLT

diff --git a/test/CodeGen/PowerPC/tls-store2.ll b/test/CodeGen/PowerPC/tls-store2.ll
new file mode 100644
index 0000000..f884dd8
--- /dev/null
+++ b/test/CodeGen/PowerPC/tls-store2.ll

@@ -0,0 +1,33 @@
+; RUN: llc -march=ppc64 -mcpu=pwr7 -O2 -relocation-model=pic < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Test back-to-back stores of TLS variables to ensure call sequences no
+; longer overlap.
+
+@__once_callable = external thread_local global i8**
+@__once_call = external thread_local global void ()*
+
+define i64 @call_once(i64 %flag, i8* %ptr) {
+entry:
+  %var = alloca i8*, align 8
+  store i8* %ptr, i8** %var, align 8
+  store i8** %var, i8*** @__once_callable, align 8
+  store void ()* @__once_call_impl, void ()** @__once_call, align 8
+  ret i64 %flag
+}
+
+; CHECK-LABEL: call_once:
+; CHECK: addis 3, 2, __once_callable@got@tlsgd@ha
+; CHECK: addi 3, 3, __once_callable@got@tlsgd@l
+; CHECK: bl __tls_get_addr(__once_callable@tlsgd)
+; CHECK-NEXT: nop
+; CHECK: std {{[0-9]+}}, 0(3)
+; CHECK: addis 3, 2, __once_call@got@tlsgd@ha
+; CHECK: addi 3, 3, __once_call@got@tlsgd@l
+; CHECK: bl __tls_get_addr(__once_call@tlsgd)
+; CHECK-NEXT: nop
+; CHECK: std {{[0-9]+}}, 0(3)
+
+declare void @__once_call_impl()

diff --git a/test/CodeGen/PowerPC/toc-load-sched-bug.ll b/test/CodeGen/PowerPC/toc-load-sched-bug.ll
new file mode 100644
index 0000000..d437915
--- /dev/null
+++ b/test/CodeGen/PowerPC/toc-load-sched-bug.ll

@@ -0,0 +1,534 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; This test checks for misordering of a TOC restore instruction relative
+; to subsequent uses of the TOC register.  Previously this test broke
+; because there was no TOC register dependency between the instructions,
+; and the usual stack-adjust instructions that held the TOC restore in
+; place were optimized away.
+
+%"class.llvm::Module" = type { %"class.llvm::LLVMContext"*, %"class.llvm::iplist", %"class.llvm::iplist.0", %"class.llvm::iplist.9", %"struct.llvm::ilist", %"class.std::basic_string", %"class.llvm::ValueSymbolTable"*, %"class.llvm::StringMap", %"class.std::unique_ptr", %"class.std::basic_string", %"class.std::basic_string", i8*, %"class.llvm::RandomNumberGenerator"*, %"class.std::basic_string", %"class.llvm::DataLayout" }
+%"class.llvm::iplist" = type { %"struct.llvm::ilist_traits", %"class.llvm::GlobalVariable"* }
+%"struct.llvm::ilist_traits" = type { %"class.llvm::ilist_node" }
+%"class.llvm::ilist_node" = type { %"class.llvm::ilist_half_node", %"class.llvm::GlobalVariable"* }
+%"class.llvm::ilist_half_node" = type { %"class.llvm::GlobalVariable"* }
+%"class.llvm::GlobalVariable" = type { %"class.llvm::GlobalObject", %"class.llvm::ilist_node", i8 }
+%"class.llvm::GlobalObject" = type { %"class.llvm::GlobalValue", %"class.std::basic_string", %"class.llvm::Comdat"* }
+%"class.llvm::GlobalValue" = type { %"class.llvm::Constant", i32, %"class.llvm::Module"* }
+%"class.llvm::Constant" = type { %"class.llvm::User" }
+%"class.llvm::User" = type { %"class.llvm::Value.base", i32, %"class.llvm::Use"* }
+%"class.llvm::Value.base" = type <{ i32 (...)**, %"class.llvm::Type"*, %"class.llvm::Use"*, %"class.llvm::StringMapEntry"*, i8, i8, i16 }>
+%"class.llvm::Type" = type { %"class.llvm::LLVMContext"*, i32, i32, %"class.llvm::Type"** }
+%"class.llvm::StringMapEntry" = type opaque
+%"class.llvm::Use" = type { %"class.llvm::Value"*, %"class.llvm::Use"*, %"class.llvm::PointerIntPair" }
+%"class.llvm::Value" = type { i32 (...)**, %"class.llvm::Type"*, %"class.llvm::Use"*, %"class.llvm::StringMapEntry"*, i8, i8, i16 }
+%"class.llvm::PointerIntPair" = type { i64 }
+%"class.llvm::Comdat" = type { %"class.llvm::StringMapEntry.43"*, i32 }
+%"class.llvm::StringMapEntry.43" = type opaque
+%"class.llvm::iplist.0" = type { %"struct.llvm::ilist_traits.1", %"class.llvm::Function"* }
+%"struct.llvm::ilist_traits.1" = type { %"class.llvm::ilist_node.7" }
+%"class.llvm::ilist_node.7" = type { %"class.llvm::ilist_half_node.8", %"class.llvm::Function"* }
+%"class.llvm::ilist_half_node.8" = type { %"class.llvm::Function"* }
+%"class.llvm::Function" = type { %"class.llvm::GlobalObject", %"class.llvm::ilist_node.7", %"class.llvm::iplist.44", %"class.llvm::iplist.52", %"class.llvm::ValueSymbolTable"*, %"class.llvm::AttributeSet" }
+%"class.llvm::iplist.44" = type { %"struct.llvm::ilist_traits.45", %"class.llvm::BasicBlock"* }
+%"struct.llvm::ilist_traits.45" = type { %"class.llvm::ilist_half_node.51" }
+%"class.llvm::ilist_half_node.51" = type { %"class.llvm::BasicBlock"* }
+%"class.llvm::BasicBlock" = type { %"class.llvm::Value.base", %"class.llvm::ilist_node.61", %"class.llvm::iplist.62", %"class.llvm::Function"* }
+%"class.llvm::ilist_node.61" = type { %"class.llvm::ilist_half_node.51", %"class.llvm::BasicBlock"* }
+%"class.llvm::iplist.62" = type { %"struct.llvm::ilist_traits.63", %"class.llvm::Instruction"* }
+%"struct.llvm::ilist_traits.63" = type { %"class.llvm::ilist_half_node.69" }
+%"class.llvm::ilist_half_node.69" = type { %"class.llvm::Instruction"* }
+%"class.llvm::Instruction" = type { %"class.llvm::User", %"class.llvm::ilist_node.70", %"class.llvm::BasicBlock"*, %"class.llvm::DebugLoc" }
+%"class.llvm::ilist_node.70" = type { %"class.llvm::ilist_half_node.69", %"class.llvm::Instruction"* }
+%"class.llvm::DebugLoc" = type { i32, i32 }
+%"class.llvm::iplist.52" = type { %"struct.llvm::ilist_traits.53", %"class.llvm::Argument"* }
+%"struct.llvm::ilist_traits.53" = type { %"class.llvm::ilist_half_node.59" }
+%"class.llvm::ilist_half_node.59" = type { %"class.llvm::Argument"* }
+%"class.llvm::Argument" = type { %"class.llvm::Value.base", %"class.llvm::ilist_node.60", %"class.llvm::Function"* }
+%"class.llvm::ilist_node.60" = type { %"class.llvm::ilist_half_node.59", %"class.llvm::Argument"* }
+%"class.llvm::AttributeSet" = type { %"class.llvm::AttributeSetImpl"* }
+%"class.llvm::AttributeSetImpl" = type opaque
+%"class.llvm::iplist.9" = type { %"struct.llvm::ilist_traits.10", %"class.llvm::GlobalAlias"* }
+%"struct.llvm::ilist_traits.10" = type { %"class.llvm::ilist_node.16" }
+%"class.llvm::ilist_node.16" = type { %"class.llvm::ilist_half_node.17", %"class.llvm::GlobalAlias"* }
+%"class.llvm::ilist_half_node.17" = type { %"class.llvm::GlobalAlias"* }
+%"class.llvm::GlobalAlias" = type { %"class.llvm::GlobalValue", %"class.llvm::ilist_node.16" }
+%"struct.llvm::ilist" = type { %"class.llvm::iplist.18" }
+%"class.llvm::iplist.18" = type { %"struct.llvm::ilist_traits.19", %"class.llvm::NamedMDNode"* }
+%"struct.llvm::ilist_traits.19" = type { %"class.llvm::ilist_node.24" }
+%"class.llvm::ilist_node.24" = type { %"class.llvm::ilist_half_node.25", %"class.llvm::NamedMDNode"* }
+%"class.llvm::ilist_half_node.25" = type { %"class.llvm::NamedMDNode"* }
+%"class.llvm::NamedMDNode" = type { %"class.llvm::ilist_node.24", %"class.std::basic_string", %"class.llvm::Module"*, i8* }
+%"class.llvm::ValueSymbolTable" = type opaque
+%"class.llvm::StringMap" = type { %"class.llvm::StringMapImpl", %"class.llvm::MallocAllocator" }
+%"class.llvm::StringMapImpl" = type { %"class.llvm::StringMapEntryBase"**, i32, i32, i32, i32 }
+%"class.llvm::StringMapEntryBase" = type { i32 }
+%"class.llvm::MallocAllocator" = type { i8 }
+%"class.std::unique_ptr" = type { %"class.std::tuple" }
+%"class.std::tuple" = type { %"struct.std::_Tuple_impl" }
+%"struct.std::_Tuple_impl" = type { %"struct.std::_Head_base.28" }
+%"struct.std::_Head_base.28" = type { %"class.llvm::GVMaterializer"* }
+%"class.llvm::GVMaterializer" = type opaque
+%"class.llvm::RandomNumberGenerator" = type opaque
+%"class.std::basic_string" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* }
+%"class.llvm::DataLayout" = type { i8, i32, i32, [4 x i8], %"class.llvm::SmallVector", %"class.llvm::SmallVector.29", %"class.llvm::SmallVector.36", i8* }
+%"class.llvm::SmallVector" = type { %"class.llvm::SmallVectorImpl.base", %"struct.llvm::SmallVectorStorage" }
+%"class.llvm::SmallVectorImpl.base" = type { %"class.llvm::SmallVectorTemplateBase.base" }
+%"class.llvm::SmallVectorTemplateBase.base" = type { %"class.llvm::SmallVectorTemplateCommon.base" }
+%"class.llvm::SmallVectorTemplateCommon.base" = type <{ %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion" }>
+%"class.llvm::SmallVectorBase" = type { i8*, i8*, i8* }
+%"struct.llvm::AlignedCharArrayUnion" = type { %"struct.llvm::AlignedCharArray" }
+%"struct.llvm::AlignedCharArray" = type { [1 x i8] }
+%"struct.llvm::SmallVectorStorage" = type { [7 x %"struct.llvm::AlignedCharArrayUnion"] }
+%"class.llvm::SmallVector.29" = type { %"class.llvm::SmallVectorImpl.30", %"struct.llvm::SmallVectorStorage.35" }
+%"class.llvm::SmallVectorImpl.30" = type { %"class.llvm::SmallVectorTemplateBase.31" }
+%"class.llvm::SmallVectorTemplateBase.31" = type { %"class.llvm::SmallVectorTemplateCommon.32" }
+%"class.llvm::SmallVectorTemplateCommon.32" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.33" }
+%"struct.llvm::AlignedCharArrayUnion.33" = type { %"struct.llvm::AlignedCharArray.34" }
+%"struct.llvm::AlignedCharArray.34" = type { [8 x i8] }
+%"struct.llvm::SmallVectorStorage.35" = type { [15 x %"struct.llvm::AlignedCharArrayUnion.33"] }
+%"class.llvm::SmallVector.36" = type { %"class.llvm::SmallVectorImpl.37", %"struct.llvm::SmallVectorStorage.42" }
+%"class.llvm::SmallVectorImpl.37" = type { %"class.llvm::SmallVectorTemplateBase.38" }
+%"class.llvm::SmallVectorTemplateBase.38" = type { %"class.llvm::SmallVectorTemplateCommon.39" }
+%"class.llvm::SmallVectorTemplateCommon.39" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.40" }
+%"struct.llvm::AlignedCharArrayUnion.40" = type { %"struct.llvm::AlignedCharArray.41" }
+%"struct.llvm::AlignedCharArray.41" = type { [16 x i8] }
+%"struct.llvm::SmallVectorStorage.42" = type { [7 x %"struct.llvm::AlignedCharArrayUnion.40"] }
+%"class.llvm::SMDiagnostic" = type { %"class.llvm::SourceMgr"*, %"class.llvm::SMLoc", %"class.std::basic_string", i32, i32, i32, %"class.std::basic_string", %"class.std::basic_string", %"class.std::vector.79", %"class.llvm::SmallVector.84" }
+%"class.llvm::SourceMgr" = type { %"class.std::vector", %"class.std::vector.74", i8*, void (%"class.llvm::SMDiagnostic"*, i8*)*, i8* }
+%"class.std::vector" = type { %"struct.std::_Vector_base" }
+%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<llvm::SourceMgr::SrcBuffer, std::allocator<llvm::SourceMgr::SrcBuffer> >::_Vector_impl" }
+%"struct.std::_Vector_base<llvm::SourceMgr::SrcBuffer, std::allocator<llvm::SourceMgr::SrcBuffer> >::_Vector_impl" = type { %"struct.llvm::SourceMgr::SrcBuffer"*, %"struct.llvm::SourceMgr::SrcBuffer"*, %"struct.llvm::SourceMgr::SrcBuffer"* }
+%"struct.llvm::SourceMgr::SrcBuffer" = type { %"class.llvm::MemoryBuffer"*, %"class.llvm::SMLoc" }
+%"class.llvm::MemoryBuffer" = type { i32 (...)**, i8*, i8* }
+%"class.std::vector.74" = type { %"struct.std::_Vector_base.75" }
+%"struct.std::_Vector_base.75" = type { %"struct.std::_Vector_base<std::basic_string<char>, std::allocator<std::basic_string<char> > >::_Vector_impl" }
+%"struct.std::_Vector_base<std::basic_string<char>, std::allocator<std::basic_string<char> > >::_Vector_impl" = type { %"class.std::basic_string"*, %"class.std::basic_string"*, %"class.std::basic_string"* }
+%"class.llvm::SMLoc" = type { i8* }
+%"class.std::vector.79" = type { %"struct.std::_Vector_base.80" }
+%"struct.std::_Vector_base.80" = type { %"struct.std::_Vector_base<std::pair<unsigned int, unsigned int>, std::allocator<std::pair<unsigned int, unsigned int> > >::_Vector_impl" }
+%"struct.std::_Vector_base<std::pair<unsigned int, unsigned int>, std::allocator<std::pair<unsigned int, unsigned int> > >::_Vector_impl" = type { %"struct.std::pair"*, %"struct.std::pair"*, %"struct.std::pair"* }
+%"struct.std::pair" = type { i32, i32 }
+%"class.llvm::SmallVector.84" = type { %"class.llvm::SmallVectorImpl.85", %"struct.llvm::SmallVectorStorage.90" }
+%"class.llvm::SmallVectorImpl.85" = type { %"class.llvm::SmallVectorTemplateBase.86" }
+%"class.llvm::SmallVectorTemplateBase.86" = type { %"class.llvm::SmallVectorTemplateCommon.87" }
+%"class.llvm::SmallVectorTemplateCommon.87" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.88" }
+%"struct.llvm::AlignedCharArrayUnion.88" = type { %"struct.llvm::AlignedCharArray.89" }
+%"struct.llvm::AlignedCharArray.89" = type { [24 x i8] }
+%"struct.llvm::SmallVectorStorage.90" = type { [3 x %"struct.llvm::AlignedCharArrayUnion.88"] }
+%"class.llvm::LLVMContext" = type { %"class.llvm::LLVMContextImpl"* }
+%"class.llvm::LLVMContextImpl" = type opaque
+%"class.std::allocator" = type { i8 }
+%"class.llvm::ErrorOr.109" = type { %union.anon.110, i8, [7 x i8] }
+%union.anon.110 = type { %"struct.llvm::AlignedCharArrayUnion.93" }
+%"struct.llvm::AlignedCharArrayUnion.93" = type { %"struct.llvm::AlignedCharArray.94" }
+%"struct.llvm::AlignedCharArray.94" = type { [16 x i8] }
+%"class.llvm::ErrorOr" = type { %union.anon, i8, [7 x i8] }
+%union.anon = type { %"struct.llvm::AlignedCharArrayUnion.93" }
+%"class.std::error_category" = type { i32 (...)** }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep_base" }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep_base" = type { i64, i64, i32 }
+%"class.llvm::SMFixIt" = type { %"class.llvm::SMRange", %"class.std::basic_string" }
+%"class.llvm::SMRange" = type { %"class.llvm::SMLoc", %"class.llvm::SMLoc" }
+%"struct.llvm::NamedRegionTimer" = type { %"class.llvm::TimeRegion" }
+%"class.llvm::TimeRegion" = type { %"class.llvm::Timer"* }
+%"class.llvm::Timer" = type { %"class.llvm::TimeRecord", %"class.std::basic_string", i8, %"class.llvm::TimerGroup"*, %"class.llvm::Timer"**, %"class.llvm::Timer"* }
+%"class.llvm::TimeRecord" = type { double, double, double, i64 }
+%"class.llvm::TimerGroup" = type { %"class.std::basic_string", %"class.llvm::Timer"*, %"class.std::vector.103", %"class.llvm::TimerGroup"**, %"class.llvm::TimerGroup"* }
+%"class.std::vector.103" = type { %"struct.std::_Vector_base.104" }
+%"struct.std::_Vector_base.104" = type { %"struct.std::_Vector_base<std::pair<llvm::TimeRecord, std::basic_string<char> >, std::allocator<std::pair<llvm::TimeRecord, std::basic_string<char> > > >::_Vector_impl" }
+%"struct.std::_Vector_base<std::pair<llvm::TimeRecord, std::basic_string<char> >, std::allocator<std::pair<llvm::TimeRecord, std::basic_string<char> > > >::_Vector_impl" = type { %"struct.std::pair.108"*, %"struct.std::pair.108"*, %"struct.std::pair.108"* }
+%"struct.std::pair.108" = type opaque
+%struct.LLVMOpaqueContext = type opaque
+%struct.LLVMOpaqueMemoryBuffer = type opaque
+%struct.LLVMOpaqueModule = type opaque
+%"class.llvm::raw_string_ostream" = type { %"class.llvm::raw_ostream.base", %"class.std::basic_string"* }
+%"class.llvm::raw_ostream.base" = type <{ i32 (...)**, i8*, i8*, i8*, i32 }>
+%"class.llvm::raw_ostream" = type { i32 (...)**, i8*, i8*, i8*, i32 }
+
+@.str = private unnamed_addr constant [28 x i8] c"Could not open input file: \00", align 1
+@.str1 = private unnamed_addr constant [54 x i8] c"!HasError && \22Cannot get value when an error exists!\22\00", align 1
+@.str2 = private unnamed_addr constant [61 x i8] c"/home/wschmidt/llvm/llvm-test/include/llvm/Support/ErrorOr.h\00", align 1
+@__PRETTY_FUNCTION__._ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv = private unnamed_addr constant [206 x i8] c"storage_type *llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer> > >::getStorage() [T = std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer> >]\00", align 1
+@_ZNSs4_Rep20_S_empty_rep_storageE = external global [0 x i64]
+
+declare void @_ZN4llvm12MemoryBuffer14getFileOrSTDINENS_9StringRefEl(%"class.llvm::ErrorOr"* sret, [2 x i64], i64) #1
+
+declare void @_ZN4llvm16NamedRegionTimerC1ENS_9StringRefES1_b(%"struct.llvm::NamedRegionTimer"*, [2 x i64], [2 x i64], i1 zeroext) #1
+
+; Function Attrs: nounwind
+define %"class.llvm::Module"* @_ZN4llvm11ParseIRFileERKSsRNS_12SMDiagnosticERNS_11LLVMContextE(%"class.std::basic_string"* nocapture readonly dereferenceable(8) %Filename, %"class.llvm::SMDiagnostic"* dereferenceable(200) %Err, %"class.llvm::LLVMContext"* dereferenceable(8) %Context) #0 {
+entry:
+; CHECK: .globl	_ZN4llvm11ParseIRFileERKSsRNS_12SMDiagnosticERNS_11LLVMContextE
+; CHECK: bctrl
+; CHECK: ld 2, 24(1)
+; CHECK: addis [[REG:[0-9]+]], 2, .L.str@toc@ha
+; CHECK: addi {{[0-9]+}}, [[REG]], .L.str@toc@l
+; CHECK: bl _ZNSs6insertEmPKcm
+  %.atomicdst.i.i.i.i.i46 = alloca i32, align 4
+  %ref.tmp.i.i47 = alloca %"class.std::allocator", align 1
+  %.atomicdst.i.i.i.i.i = alloca i32, align 4
+  %ref.tmp.i.i = alloca %"class.std::allocator", align 1
+  %ref.tmp.i.i2.i = alloca %"class.std::allocator", align 1
+  %ref.tmp.i.i.i = alloca %"class.std::allocator", align 1
+  %FileOrErr = alloca %"class.llvm::ErrorOr", align 8
+  %ref.tmp = alloca %"class.llvm::SMDiagnostic", align 8
+  %ref.tmp5 = alloca %"class.std::basic_string", align 8
+  %_M_p.i.i.i = getelementptr inbounds %"class.std::basic_string"* %Filename, i64 0, i32 0, i32 0
+  %0 = load i8** %_M_p.i.i.i, align 8, !tbaa !1
+  %1 = ptrtoint i8* %0 to i64
+  %arrayidx.i.i.i = getelementptr inbounds i8* %0, i64 -24
+  %_M_length.i.i = bitcast i8* %arrayidx.i.i.i to i64*
+  %2 = load i64* %_M_length.i.i, align 8, !tbaa !7
+  %.fca.0.insert18 = insertvalue [2 x i64] undef, i64 %1, 0
+  %.fca.1.insert21 = insertvalue [2 x i64] %.fca.0.insert18, i64 %2, 1
+  call void @_ZN4llvm12MemoryBuffer14getFileOrSTDINENS_9StringRefEl(%"class.llvm::ErrorOr"* sret %FileOrErr, [2 x i64] %.fca.1.insert21, i64 -1) #3
+  %HasError.i24 = getelementptr inbounds %"class.llvm::ErrorOr"* %FileOrErr, i64 0, i32 1
+  %bf.load.i25 = load i8* %HasError.i24, align 8
+  %3 = and i8 %bf.load.i25, 1
+  %bf.cast.i26 = icmp eq i8 %3, 0
+  br i1 %bf.cast.i26, label %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE3getEv.exit, label %_ZNK4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE8getErrorEv.exit
+
+_ZNK4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE8getErrorEv.exit: ; preds = %entry
+  %retval.sroa.0.0..sroa_cast.i = bitcast %"class.llvm::ErrorOr"* %FileOrErr to i64*
+  %retval.sroa.0.0.copyload.i = load i64* %retval.sroa.0.0..sroa_cast.i, align 8
+  %retval.sroa.3.0..sroa_idx.i = getelementptr inbounds %"class.llvm::ErrorOr"* %FileOrErr, i64 0, i32 0, i32 0, i32 0, i32 0, i64 8
+  %retval.sroa.3.0..sroa_cast.i = bitcast i8* %retval.sroa.3.0..sroa_idx.i to i64*
+  %retval.sroa.3.0.copyload.i = load i64* %retval.sroa.3.0..sroa_cast.i, align 8
+  %phitmp = trunc i64 %retval.sroa.0.0.copyload.i to i32
+  %cmp.i = icmp eq i32 %phitmp, 0
+  br i1 %cmp.i, label %cond.false.i.i, label %if.then
+
+if.then:                                          ; preds = %_ZNK4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE8getErrorEv.exit
+  %.c = inttoptr i64 %retval.sroa.3.0.copyload.i to %"class.std::error_category"*
+  %4 = load i8** %_M_p.i.i.i, align 8, !tbaa !1
+  %arrayidx.i.i.i30 = getelementptr inbounds i8* %4, i64 -24
+  %_M_length.i.i31 = bitcast i8* %arrayidx.i.i.i30 to i64*
+  %5 = load i64* %_M_length.i.i31, align 8, !tbaa !7
+  %6 = inttoptr i64 %retval.sroa.3.0.copyload.i to void (%"class.std::basic_string"*, %"class.std::error_category"*, i32)***
+  %vtable.i = load void (%"class.std::basic_string"*, %"class.std::error_category"*, i32)*** %6, align 8, !tbaa !11
+  %vfn.i = getelementptr inbounds void (%"class.std::basic_string"*, %"class.std::error_category"*, i32)** %vtable.i, i64 3
+  %7 = load void (%"class.std::basic_string"*, %"class.std::error_category"*, i32)** %vfn.i, align 8
+  call void %7(%"class.std::basic_string"* sret %ref.tmp5, %"class.std::error_category"* %.c, i32 signext %phitmp) #3
+  %call2.i.i = call dereferenceable(8) %"class.std::basic_string"* @_ZNSs6insertEmPKcm(%"class.std::basic_string"* %ref.tmp5, i64 0, i8* getelementptr inbounds ([28 x i8]* @.str, i64 0, i64 0), i64 27) #3
+  %_M_p2.i.i.i.i = getelementptr inbounds %"class.std::basic_string"* %call2.i.i, i64 0, i32 0, i32 0
+  %8 = load i8** %_M_p2.i.i.i.i, align 8, !tbaa !13
+  store i8* bitcast (i64* getelementptr inbounds ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE, i64 0, i64 3) to i8*), i8** %_M_p2.i.i.i.i, align 8, !tbaa !1
+  %arrayidx.i.i.i36 = getelementptr inbounds i8* %8, i64 -24
+  %_M_length.i.i37 = bitcast i8* %arrayidx.i.i.i36 to i64*
+  %9 = load i64* %_M_length.i.i37, align 8, !tbaa !7
+  %Filename.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 2
+  %10 = getelementptr inbounds %"class.std::allocator"* %ref.tmp.i.i2.i, i64 0, i32 0
+  %11 = bitcast %"class.llvm::SMDiagnostic"* %ref.tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* %11, i8 0, i64 16, i32 8, i1 false) #3
+  call void @llvm.lifetime.start(i64 1, i8* %10) #3
+  %tobool.i.i4.i = icmp eq i8* %4, null
+  br i1 %tobool.i.i4.i, label %if.then.i.i6.i, label %if.end.i.i8.i
+
+if.then.i.i6.i:                                   ; preds = %if.then
+  %_M_p.i.i.i.i.i.i5.i = getelementptr inbounds %"class.std::basic_string"* %Filename.i, i64 0, i32 0, i32 0
+  store i8* bitcast (i64* getelementptr inbounds ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE, i64 0, i64 3) to i8*), i8** %_M_p.i.i.i.i.i.i5.i, align 8, !tbaa !13
+  br label %_ZNK4llvm9StringRefcvSsEv.exit9.i
+
+if.end.i.i8.i:                                    ; preds = %if.then
+  call void @_ZNSsC1EPKcmRKSaIcE(%"class.std::basic_string"* %Filename.i, i8* %4, i64 %5, %"class.std::allocator"* dereferenceable(1) %ref.tmp.i.i2.i) #3
+  br label %_ZNK4llvm9StringRefcvSsEv.exit9.i
+
+_ZNK4llvm9StringRefcvSsEv.exit9.i:                ; preds = %if.end.i.i8.i, %if.then.i.i6.i
+  call void @llvm.lifetime.end(i64 1, i8* %10) #3
+  %LineNo.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 3
+  store i32 -1, i32* %LineNo.i, align 8, !tbaa !14
+  %ColumnNo.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 4
+  store i32 -1, i32* %ColumnNo.i, align 4, !tbaa !21
+  %Kind.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 5
+  store i32 0, i32* %Kind.i, align 8, !tbaa !22
+  %Message.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 6
+  %12 = getelementptr inbounds %"class.std::allocator"* %ref.tmp.i.i.i, i64 0, i32 0
+  call void @llvm.lifetime.start(i64 1, i8* %12) #3
+  %tobool.i.i.i = icmp eq i8* %8, null
+  br i1 %tobool.i.i.i, label %if.then.i.i.i, label %if.end.i.i.i
+
+if.then.i.i.i:                                    ; preds = %_ZNK4llvm9StringRefcvSsEv.exit9.i
+  %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::basic_string"* %Message.i, i64 0, i32 0, i32 0
+  store i8* bitcast (i64* getelementptr inbounds ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE, i64 0, i64 3) to i8*), i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !13
+  br label %_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit
+
+if.end.i.i.i:                                     ; preds = %_ZNK4llvm9StringRefcvSsEv.exit9.i
+  call void @_ZNSsC1EPKcmRKSaIcE(%"class.std::basic_string"* %Message.i, i8* %8, i64 %9, %"class.std::allocator"* dereferenceable(1) %ref.tmp.i.i.i) #3
+  br label %_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit
+
+_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit: ; preds = %if.then.i.i.i, %if.end.i.i.i
+  call void @llvm.lifetime.end(i64 1, i8* %12) #3
+  %_M_p.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 7, i32 0, i32 0
+  store i8* bitcast (i64* getelementptr inbounds ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE, i64 0, i64 3) to i8*), i8** %_M_p.i.i.i.i.i, align 8, !tbaa !13
+  %Ranges.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 8
+  %13 = bitcast %"class.std::vector.79"* %Ranges.i to i8*
+  call void @llvm.memset.p0i8.i64(i8* %13, i8 0, i64 24, i32 8, i1 false) #3
+  %14 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i64 0
+  %BeginX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0
+  store i8* %14, i8** %BeginX.i.i.i.i.i.i, align 8, !tbaa !23
+  %EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 1
+  store i8* %14, i8** %EndX.i.i.i.i.i.i, align 8, !tbaa !25
+  %CapacityX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 2
+  %add.ptr.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i64 96
+  store i8* %add.ptr.i.i.i.i.i.i, i8** %CapacityX.i.i.i.i.i.i, align 8, !tbaa !26
+  %15 = bitcast %"class.llvm::SMDiagnostic"* %Err to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %15, i8* %11, i64 16, i32 8, i1 false) #3
+  %Filename.i38 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 2
+  call void @_ZNSs4swapERSs(%"class.std::basic_string"* %Filename.i38, %"class.std::basic_string"* dereferenceable(8) %Filename.i) #3
+  %LineNo.i39 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 3
+  %16 = bitcast i32* %LineNo.i39 to i8*
+  %17 = bitcast i32* %LineNo.i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %16, i8* %17, i64 12, i32 4, i1 false) #3
+  %Message.i40 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 6
+  call void @_ZNSs4swapERSs(%"class.std::basic_string"* %Message.i40, %"class.std::basic_string"* dereferenceable(8) %Message.i) #3
+  %LineContents.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 7
+  %LineContents7.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 7
+  call void @_ZNSs4swapERSs(%"class.std::basic_string"* %LineContents.i, %"class.std::basic_string"* dereferenceable(8) %LineContents7.i) #3
+  %Ranges.i41 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 8
+  %_M_start.i7.i.i.i = getelementptr inbounds %"class.std::vector.79"* %Ranges.i41, i64 0, i32 0, i32 0, i32 0
+  %18 = load %"struct.std::pair"** %_M_start.i7.i.i.i, align 8, !tbaa !27
+  %_M_finish.i9.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 8, i32 0, i32 0, i32 1
+  %_M_end_of_storage.i11.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 8, i32 0, i32 0, i32 2
+  %_M_start2.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 8, i32 0, i32 0, i32 0
+  %19 = bitcast %"class.std::vector.79"* %Ranges.i41 to i8*
+  call void @llvm.memset.p0i8.i64(i8* %19, i8 0, i64 16, i32 8, i1 false) #3
+  %20 = load %"struct.std::pair"** %_M_start2.i.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* %20, %"struct.std::pair"** %_M_start.i7.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* null, %"struct.std::pair"** %_M_start2.i.i.i.i, align 8, !tbaa !27
+  %_M_finish3.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 8, i32 0, i32 0, i32 1
+  %21 = load %"struct.std::pair"** %_M_finish3.i.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* %21, %"struct.std::pair"** %_M_finish.i9.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* null, %"struct.std::pair"** %_M_finish3.i.i.i.i, align 8, !tbaa !27
+  %_M_end_of_storage4.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 8, i32 0, i32 0, i32 2
+  %22 = load %"struct.std::pair"** %_M_end_of_storage4.i.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* %22, %"struct.std::pair"** %_M_end_of_storage.i11.i.i.i, align 8, !tbaa !27
+  store %"struct.std::pair"* null, %"struct.std::pair"** %_M_end_of_storage4.i.i.i.i, align 8, !tbaa !27
+  %tobool.i.i.i.i.i.i = icmp eq %"struct.std::pair"* %18, null
+  br i1 %tobool.i.i.i.i.i.i, label %_ZN4llvm12SMDiagnosticaSEOS0_.exit, label %if.then.i.i.i.i.i.i
+
+if.then.i.i.i.i.i.i:                              ; preds = %_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit
+  %23 = bitcast %"struct.std::pair"* %18 to i8*
+  call void @_ZdlPv(i8* %23) #3
+  br label %_ZN4llvm12SMDiagnosticaSEOS0_.exit
+
+_ZN4llvm12SMDiagnosticaSEOS0_.exit:               ; preds = %_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit, %if.then.i.i.i.i.i.i
+  %24 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %Err, i64 0, i32 9, i32 0
+  %25 = getelementptr inbounds %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 9, i32 0
+  %call2.i.i42 = call dereferenceable(48) %"class.llvm::SmallVectorImpl.85"* @_ZN4llvm15SmallVectorImplINS_7SMFixItEEaSEOS2_(%"class.llvm::SmallVectorImpl.85"* %24, %"class.llvm::SmallVectorImpl.85"* dereferenceable(48) %25) #3
+  call void @_ZN4llvm12SMDiagnosticD2Ev(%"class.llvm::SMDiagnostic"* %ref.tmp) #3
+  %26 = getelementptr inbounds %"class.std::allocator"* %ref.tmp.i.i, i64 0, i32 0
+  call void @llvm.lifetime.start(i64 1, i8* %26) #3
+  %27 = bitcast i8* %arrayidx.i.i.i36 to %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"*
+  %cmp.i.i.i = icmp eq i8* %arrayidx.i.i.i36, bitcast ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE to i8*)
+  br i1 %cmp.i.i.i, label %_ZNSsD1Ev.exit, label %if.then.i.i.i45, !prof !28
+
+if.then.i.i.i45:                                  ; preds = %_ZN4llvm12SMDiagnosticaSEOS0_.exit
+  %_M_refcount.i.i.i = getelementptr inbounds i8* %8, i64 -8
+  %28 = bitcast i8* %_M_refcount.i.i.i to i32*
+  br i1 icmp ne (i8* bitcast (i32 (i32*, void (i8*)*)* @__pthread_key_create to i8*), i8* null), label %if.then.i.i.i.i, label %if.else.i.i.i.i
+
+if.then.i.i.i.i:                                  ; preds = %if.then.i.i.i45
+  %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast = bitcast i32* %.atomicdst.i.i.i.i.i to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  %29 = atomicrmw volatile add i32* %28, i32 -1 acq_rel
+  store i32 %29, i32* %.atomicdst.i.i.i.i.i, align 4
+  %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i = load volatile i32* %.atomicdst.i.i.i.i.i, align 4
+  call void @llvm.lifetime.end(i64 4, i8* %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i
+
+if.else.i.i.i.i:                                  ; preds = %if.then.i.i.i45
+  %30 = load i32* %28, align 4, !tbaa !29
+  %add.i.i.i.i.i = add nsw i32 %30, -1
+  store i32 %add.i.i.i.i.i, i32* %28, align 4, !tbaa !29
+  br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i
+
+_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i: ; preds = %if.else.i.i.i.i, %if.then.i.i.i.i
+  %retval.0.i.i.i.i = phi i32 [ %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i, %if.then.i.i.i.i ], [ %30, %if.else.i.i.i.i ]
+  %cmp3.i.i.i = icmp slt i32 %retval.0.i.i.i.i, 1
+  br i1 %cmp3.i.i.i, label %if.then4.i.i.i, label %_ZNSsD1Ev.exit
+
+if.then4.i.i.i:                                   ; preds = %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i
+  call void @_ZNSs4_Rep10_M_destroyERKSaIcE(%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"* %27, %"class.std::allocator"* dereferenceable(1) %ref.tmp.i.i) #3
+  br label %_ZNSsD1Ev.exit
+
+_ZNSsD1Ev.exit:                                   ; preds = %_ZN4llvm12SMDiagnosticaSEOS0_.exit, %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i, %if.then4.i.i.i
+  call void @llvm.lifetime.end(i64 1, i8* %26) #3
+  %31 = getelementptr inbounds %"class.std::allocator"* %ref.tmp.i.i47, i64 0, i32 0
+  call void @llvm.lifetime.start(i64 1, i8* %31) #3
+  %_M_p.i.i.i.i48 = getelementptr inbounds %"class.std::basic_string"* %ref.tmp5, i64 0, i32 0, i32 0
+  %32 = load i8** %_M_p.i.i.i.i48, align 8, !tbaa !1
+  %arrayidx.i.i.i49 = getelementptr inbounds i8* %32, i64 -24
+  %33 = bitcast i8* %arrayidx.i.i.i49 to %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"*
+  %cmp.i.i.i50 = icmp eq i8* %arrayidx.i.i.i49, bitcast ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE to i8*)
+  br i1 %cmp.i.i.i50, label %_ZNSsD1Ev.exit62, label %if.then.i.i.i52, !prof !28
+
+if.then.i.i.i52:                                  ; preds = %_ZNSsD1Ev.exit
+  %_M_refcount.i.i.i51 = getelementptr inbounds i8* %32, i64 -8
+  %34 = bitcast i8* %_M_refcount.i.i.i51 to i32*
+  br i1 icmp ne (i8* bitcast (i32 (i32*, void (i8*)*)* @__pthread_key_create to i8*), i8* null), label %if.then.i.i.i.i55, label %if.else.i.i.i.i57
+
+if.then.i.i.i.i55:                                ; preds = %if.then.i.i.i52
+  %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast = bitcast i32* %.atomicdst.i.i.i.i.i46 to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  %35 = atomicrmw volatile add i32* %34, i32 -1 acq_rel
+  store i32 %35, i32* %.atomicdst.i.i.i.i.i46, align 4
+  %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i54 = load volatile i32* %.atomicdst.i.i.i.i.i46, align 4
+  call void @llvm.lifetime.end(i64 4, i8* %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60
+
+if.else.i.i.i.i57:                                ; preds = %if.then.i.i.i52
+  %36 = load i32* %34, align 4, !tbaa !29
+  %add.i.i.i.i.i56 = add nsw i32 %36, -1
+  store i32 %add.i.i.i.i.i56, i32* %34, align 4, !tbaa !29
+  br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60
+
+_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60: ; preds = %if.else.i.i.i.i57, %if.then.i.i.i.i55
+  %retval.0.i.i.i.i58 = phi i32 [ %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i54, %if.then.i.i.i.i55 ], [ %36, %if.else.i.i.i.i57 ]
+  %cmp3.i.i.i59 = icmp slt i32 %retval.0.i.i.i.i58, 1
+  br i1 %cmp3.i.i.i59, label %if.then4.i.i.i61, label %_ZNSsD1Ev.exit62
+
+if.then4.i.i.i61:                                 ; preds = %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60
+  call void @_ZNSs4_Rep10_M_destroyERKSaIcE(%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"* %33, %"class.std::allocator"* dereferenceable(1) %ref.tmp.i.i47) #3
+  br label %_ZNSsD1Ev.exit62
+
+_ZNSsD1Ev.exit62:                                 ; preds = %_ZNSsD1Ev.exit, %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60, %if.then4.i.i.i61
+  call void @llvm.lifetime.end(i64 1, i8* %31) #3
+  br label %cleanup
+
+cond.false.i.i:                                   ; preds = %_ZNK4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE8getErrorEv.exit
+  call void @__assert_fail(i8* getelementptr inbounds ([54 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([61 x i8]* @.str2, i64 0, i64 0), i32 zeroext 242, i8* getelementptr inbounds ([206 x i8]* @__PRETTY_FUNCTION__._ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv, i64 0, i64 0)) #7
+  unreachable
+
+_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE3getEv.exit: ; preds = %entry
+  %_M_head_impl.i.i.i.i.i = bitcast %"class.llvm::ErrorOr"* %FileOrErr to %"class.llvm::MemoryBuffer"**
+  %37 = load %"class.llvm::MemoryBuffer"** %_M_head_impl.i.i.i.i.i, align 8, !tbaa !27
+  %call9 = call %"class.llvm::Module"* @_ZN4llvm7ParseIREPNS_12MemoryBufferERNS_12SMDiagnosticERNS_11LLVMContextE(%"class.llvm::MemoryBuffer"* %37, %"class.llvm::SMDiagnostic"* dereferenceable(200) %Err, %"class.llvm::LLVMContext"* dereferenceable(8) %Context)
+  br label %cleanup
+
+cleanup:                                          ; preds = %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE3getEv.exit, %_ZNSsD1Ev.exit62
+  %retval.0 = phi %"class.llvm::Module"* [ null, %_ZNSsD1Ev.exit62 ], [ %call9, %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE3getEv.exit ]
+  %bf.load.i = load i8* %HasError.i24, align 8
+  %38 = and i8 %bf.load.i, 1
+  %bf.cast.i = icmp eq i8 %38, 0
+  br i1 %bf.cast.i, label %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv.exit.i, label %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEED2Ev.exit
+
+_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv.exit.i: ; preds = %cleanup
+  %_M_head_impl.i.i.i.i.i.i = bitcast %"class.llvm::ErrorOr"* %FileOrErr to %"class.llvm::MemoryBuffer"**
+  %39 = load %"class.llvm::MemoryBuffer"** %_M_head_impl.i.i.i.i.i.i, align 8, !tbaa !27
+  %cmp.i.i = icmp eq %"class.llvm::MemoryBuffer"* %39, null
+  br i1 %cmp.i.i, label %_ZNSt10unique_ptrIN4llvm12MemoryBufferESt14default_deleteIS1_EED2Ev.exit.i, label %_ZNKSt14default_deleteIN4llvm12MemoryBufferEEclEPS1_.exit.i.i
+
+_ZNKSt14default_deleteIN4llvm12MemoryBufferEEclEPS1_.exit.i.i: ; preds = %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv.exit.i
+  %40 = bitcast %"class.llvm::MemoryBuffer"* %39 to void (%"class.llvm::MemoryBuffer"*)***
+  %vtable.i.i.i = load void (%"class.llvm::MemoryBuffer"*)*** %40, align 8, !tbaa !11
+  %vfn.i.i.i = getelementptr inbounds void (%"class.llvm::MemoryBuffer"*)** %vtable.i.i.i, i64 1
+  %41 = load void (%"class.llvm::MemoryBuffer"*)** %vfn.i.i.i, align 8
+  call void %41(%"class.llvm::MemoryBuffer"* %39) #3
+  br label %_ZNSt10unique_ptrIN4llvm12MemoryBufferESt14default_deleteIS1_EED2Ev.exit.i
+
+_ZNSt10unique_ptrIN4llvm12MemoryBufferESt14default_deleteIS1_EED2Ev.exit.i: ; preds = %_ZNKSt14default_deleteIN4llvm12MemoryBufferEEclEPS1_.exit.i.i, %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE10getStorageEv.exit.i
+  store %"class.llvm::MemoryBuffer"* null, %"class.llvm::MemoryBuffer"** %_M_head_impl.i.i.i.i.i.i, align 8, !tbaa !27
+  br label %_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEED2Ev.exit
+
+_ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEED2Ev.exit: ; preds = %cleanup, %_ZNSt10unique_ptrIN4llvm12MemoryBufferESt14default_deleteIS1_EED2Ev.exit.i
+  ret %"class.llvm::Module"* %retval.0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #3
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #3
+
+; Function Attrs: noreturn nounwind
+declare void @__assert_fail(i8*, i8*, i32 zeroext, i8*) #4
+
+declare dereferenceable(8) %"class.std::basic_string"* @_ZNSs6insertEmPKcm(%"class.std::basic_string"*, i64, i8*, i64) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #3
+
+; Function Attrs: nounwind
+declare void @_ZNSs4_Rep10_M_destroyERKSaIcE(%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"*, %"class.std::allocator"* dereferenceable(1)) #0
+
+; Function Attrs: nounwind
+declare extern_weak signext i32 @__pthread_key_create(i32*, void (i8*)*) #0
+
+; Function Attrs: nobuiltin nounwind
+declare void @_ZdlPv(i8*) #6
+
+declare void @_ZNSsC1EPKcmRKSaIcE(%"class.std::basic_string"*, i8*, i64, %"class.std::allocator"* dereferenceable(1)) #1
+
+declare hidden void @_ZN4llvm12SMDiagnosticD2Ev(%"class.llvm::SMDiagnostic"* readonly %this) unnamed_addr #2 align 2
+
+declare dereferenceable(48) %"class.llvm::SmallVectorImpl.85"* @_ZN4llvm15SmallVectorImplINS_7SMFixItEEaSEOS2_(%"class.llvm::SmallVectorImpl.85"* %this, %"class.llvm::SmallVectorImpl.85"* dereferenceable(48) %RHS) #0 align 2
+
+declare %"class.llvm::Module"* @_ZN4llvm7ParseIREPNS_12MemoryBufferERNS_12SMDiagnosticERNS_11LLVMContextE(%"class.llvm::MemoryBuffer"* %Buffer, %"class.llvm::SMDiagnostic"* dereferenceable(200) %Err, %"class.llvm::LLVMContext"* dereferenceable(8) %Context) #0
+
+declare void @_ZNSs4swapERSs(%"class.std::basic_string"*, %"class.std::basic_string"* dereferenceable(8)) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { inlinehint nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { nobuiltin nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #7 = { noreturn nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 (trunk 215115) (llvm/trunk 215117)"}
+!1 = metadata !{metadata !2, metadata !4, i64 0}
+!2 = metadata !{metadata !"_ZTSSs", metadata !3, i64 0}
+!3 = metadata !{metadata !"_ZTSNSs12_Alloc_hiderE", metadata !4, i64 0}
+!4 = metadata !{metadata !"any pointer", metadata !5, i64 0}
+!5 = metadata !{metadata !"omnipotent char", metadata !6, i64 0}
+!6 = metadata !{metadata !"Simple C/C++ TBAA"}
+!7 = metadata !{metadata !8, metadata !9, i64 0}
+!8 = metadata !{metadata !"_ZTSNSs9_Rep_baseE", metadata !9, i64 0, metadata !9, i64 8, metadata !10, i64 16}
+!9 = metadata !{metadata !"long", metadata !5, i64 0}
+!10 = metadata !{metadata !"int", metadata !5, i64 0}
+!11 = metadata !{metadata !12, metadata !12, i64 0}
+!12 = metadata !{metadata !"vtable pointer", metadata !6, i64 0}
+!13 = metadata !{metadata !3, metadata !4, i64 0}
+!14 = metadata !{metadata !15, metadata !10, i64 24}
+!15 = metadata !{metadata !"_ZTSN4llvm12SMDiagnosticE", metadata !4, i64 0, metadata !16, i64 8, metadata !2, i64 16, metadata !10, i64 24, metadata !10, i64 28, metadata !17, i64 32, metadata !2, i64 40, metadata !2, i64 48, metadata !18, i64 56, metadata !19, i64 80}
+!16 = metadata !{metadata !"_ZTSN4llvm5SMLocE", metadata !4, i64 0}
+!17 = metadata !{metadata !"_ZTSN4llvm9SourceMgr8DiagKindE", metadata !5, i64 0}
+!18 = metadata !{metadata !"_ZTSSt6vectorISt4pairIjjESaIS1_EE"}
+!19 = metadata !{metadata !"_ZTSN4llvm11SmallVectorINS_7SMFixItELj4EEE", metadata !20, i64 48}
+!20 = metadata !{metadata !"_ZTSN4llvm18SmallVectorStorageINS_7SMFixItELj4EEE", metadata !5, i64 0}
+!21 = metadata !{metadata !15, metadata !10, i64 28}
+!22 = metadata !{metadata !15, metadata !17, i64 32}
+!23 = metadata !{metadata !24, metadata !4, i64 0}
+!24 = metadata !{metadata !"_ZTSN4llvm15SmallVectorBaseE", metadata !4, i64 0, metadata !4, i64 8, metadata !4, i64 16}
+!25 = metadata !{metadata !24, metadata !4, i64 8}
+!26 = metadata !{metadata !24, metadata !4, i64 16}
+!27 = metadata !{metadata !4, metadata !4, i64 0}
+!28 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!29 = metadata !{metadata !10, metadata !10, i64 0}
+!30 = metadata !{metadata !31, metadata !4, i64 8}
+!31 = metadata !{metadata !"_ZTSN4llvm12MemoryBufferE", metadata !4, i64 8, metadata !4, i64 16}
+!32 = metadata !{metadata !31, metadata !4, i64 16}
+!33 = metadata !{metadata !5, metadata !5, i64 0}
+!34 = metadata !{metadata !35, metadata !4, i64 0}
+!35 = metadata !{metadata !"_ZTSSt12_Vector_baseISt4pairIjjESaIS1_EE", metadata !36, i64 0}
+!36 = metadata !{metadata !"_ZTSNSt12_Vector_baseISt4pairIjjESaIS1_EE12_Vector_implE", metadata !4, i64 0, metadata !4, i64 8, metadata !4, i64 16}
+!37 = metadata !{metadata !38, metadata !38, i64 0}
+!38 = metadata !{metadata !"bool", metadata !5, i64 0}
+!39 = metadata !{i8 0, i8 2}
+!40 = metadata !{metadata !41, metadata !4, i64 0}
+!41 = metadata !{metadata !"_ZTSN4llvm10TimeRegionE", metadata !4, i64 0}
+!42 = metadata !{metadata !43, metadata !44, i64 32}
+!43 = metadata !{metadata !"_ZTSN4llvm11raw_ostreamE", metadata !4, i64 8, metadata !4, i64 16, metadata !4, i64 24, metadata !44, i64 32}
+!44 = metadata !{metadata !"_ZTSN4llvm11raw_ostream10BufferKindE", metadata !5, i64 0}
+!45 = metadata !{metadata !43, metadata !4, i64 24}
+!46 = metadata !{metadata !43, metadata !4, i64 8}
+!47 = metadata !{i64 0, i64 8, metadata !27, i64 8, i64 8, metadata !27}

diff --git a/test/CodeGen/PowerPC/unal-altivec-wint.ll b/test/CodeGen/PowerPC/unal-altivec-wint.ll
new file mode 100644
index 0000000..7e0963f
--- /dev/null
+++ b/test/CodeGen/PowerPC/unal-altivec-wint.ll

@@ -0,0 +1,48 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <4 x i32> @llvm.ppc.altivec.lvx(i8*) #1
+
+define <4 x i32> @test1(<4 x i32>* %h) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  %vl = call <4 x i32> @llvm.ppc.altivec.lvx(i8* %hv)
+
+  %v0 = load <4 x i32>* %h, align 8
+
+  %a = add <4 x i32> %v0, %vl
+  ret <4 x i32> %a
+
+; CHECK-LABEL: @test1
+; CHECK: li [[REG:[0-9]+]], 16
+; CHECK-NOT: li {{[0-9]+}}, 15
+; CHECK-DAG: lvx {{[0-9]+}}, 0, 3
+; CHECK-DAG: lvx {{[0-9]+}}, 3, [[REG]]
+; CHECK: blr
+}
+
+declare void @llvm.ppc.altivec.stvx(<4 x i32>, i8*) #0
+
+define <4 x i32> @test2(<4 x i32>* %h, <4 x i32> %d) #0 {
+entry:
+  %h1 = getelementptr <4 x i32>* %h, i64 1
+  %hv = bitcast <4 x i32>* %h1 to i8*
+  call void @llvm.ppc.altivec.stvx(<4 x i32> %d, i8* %hv)
+
+  %v0 = load <4 x i32>* %h, align 8
+
+  ret <4 x i32> %v0
+
+; CHECK-LABEL: @test2
+; CHECK: li [[REG:[0-9]+]], 16
+; CHECK-NOT: li {{[0-9]+}}, 15
+; CHECK-DAG: lvx {{[0-9]+}}, 0, 3
+; CHECK-DAG: lvx {{[0-9]+}}, 3, [[REG]]
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+

diff --git a/test/CodeGen/PowerPC/unal4-std.ll b/test/CodeGen/PowerPC/unal4-std.ll
index 9f29e31..e911099 100644
--- a/test/CodeGen/PowerPC/unal4-std.ll
+++ b/test/CodeGen/PowerPC/unal4-std.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mcpu=pwr7 -mattr=-vsx| FileCheck %s
+; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -22,6 +23,9 @@
 ; a multiple of 4).
 ; CHECK: @copy_to_conceal
 ; CHECK: stdx {{[0-9]+}}, 0,
+
+; CHECK-VSX: @copy_to_conceal
+; CHECK-VSX: stxvw4x {{[0-9]+}}, 0,
 }
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/PowerPC/unaligned.ll b/test/CodeGen/PowerPC/unaligned.ll
index d469c62..64c03cd 100644
--- a/test/CodeGen/PowerPC/unaligned.ll
+++ b/test/CodeGen/PowerPC/unaligned.ll

@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx | FileCheck %s
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128-n32"
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128-n32"
 
 define void @foo1(i16* %p, i16* %r) nounwind {
@@ -10,6 +12,10 @@
 ; CHECK: @foo1
 ; CHECK: lhz
 ; CHECK: sth
+
+; CHECK-VSX: @foo1
+; CHECK-VSX: lhz
+; CHECK-VSX: sth
 }
 
 define void @foo2(i32* %p, i32* %r) nounwind {
@@ -21,6 +27,10 @@
 ; CHECK: @foo2
 ; CHECK: lwz
 ; CHECK: stw
+
+; CHECK-VSX: @foo2
+; CHECK-VSX: lwz
+; CHECK-VSX: stw
 }
 
 define void @foo3(i64* %p, i64* %r) nounwind {
@@ -32,6 +42,10 @@
 ; CHECK: @foo3
 ; CHECK: ld
 ; CHECK: std
+
+; CHECK-VSX: @foo3
+; CHECK-VSX: ld
+; CHECK-VSX: std
 }
 
 define void @foo4(float* %p, float* %r) nounwind {
@@ -43,6 +57,10 @@
 ; CHECK: @foo4
 ; CHECK: lfs
 ; CHECK: stfs
+
+; CHECK-VSX: @foo4
+; CHECK-VSX: lfs
+; CHECK-VSX: stfs
 }
 
 define void @foo5(double* %p, double* %r) nounwind {
@@ -54,6 +72,10 @@
 ; CHECK: @foo5
 ; CHECK: lfd
 ; CHECK: stfd
+
+; CHECK-VSX: @foo5
+; CHECK-VSX: lxsdx
+; CHECK-VSX: stxsdx
 }
 
 define void @foo6(<4 x float>* %p, <4 x float>* %r) nounwind {
@@ -69,5 +91,15 @@
 ; CHECK-DAG: ld
 ; CHECK-DAG: stdx
 ; CHECK: stdx
+
+; For VSX on P7, unaligned loads and stores are preferable to aligned
+; stack slots, but lvsl/vperm is better still.  (On P8 lxvw4x is preferable.)
+; Using unaligned stxvw4x is preferable on both machines.
+; CHECK-VSX: @foo6
+; CHECK-VSX-DAG: lvsl
+; CHECK-VSX-DAG: lvx
+; CHECK-VSX-DAG: lvx
+; CHECK-VSX: vperm
+; CHECK-VSX: stxvw4x
 }
 

diff --git a/test/CodeGen/PowerPC/unsafe-math.ll b/test/CodeGen/PowerPC/unsafe-math.ll
index b0bdcc2..f643027 100644
--- a/test/CodeGen/PowerPC/unsafe-math.ll
+++ b/test/CodeGen/PowerPC/unsafe-math.ll

@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=ppc32 | grep fmul | count 2
-; RUN: llc < %s -march=ppc32 -enable-unsafe-fp-math | \
+; RUN: llc < %s -mattr=-vsx -march=ppc32 | grep fmul | count 2
+; RUN: llc < %s -mattr=-vsx -march=ppc32 -enable-unsafe-fp-math | \
 ; RUN:   grep fmul | count 1
 
 define double @foo(double %X) nounwind {

diff --git a/test/CodeGen/PowerPC/unwind-dw2-g.ll b/test/CodeGen/PowerPC/unwind-dw2-g.ll
index 24b5207..54d3189 100644
--- a/test/CodeGen/PowerPC/unwind-dw2-g.ll
+++ b/test/CodeGen/PowerPC/unwind-dw2-g.ll

@@ -21,15 +21,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/unwind-dw2.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/unwind-dw2.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"/tmp/unwind-dw2.c", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/unwind-dw2.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/unwind-dw2.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !9 = metadata !{i32 2, i32 0, metadata !4, null}
 !10 = metadata !{i32 3, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/PowerPC/varargs-struct-float.ll b/test/CodeGen/PowerPC/varargs-struct-float.ll
index fb1835f..0fd9fc5 100644
--- a/test/CodeGen/PowerPC/varargs-struct-float.ll
+++ b/test/CodeGen/PowerPC/varargs-struct-float.ll

@@ -16,8 +16,8 @@
   ret void
 }
 
-; CHECK: stfs {{[0-9]+}}, 60(1)
-; CHECK: ld 4, 56(1)
+; CHECK: stfs {{[0-9]+}}, 116(1)
+; CHECK: lwz 4, 116(1)
 ; CHECK: bl
 
 declare void @testvaSf1(i32, ...)

diff --git a/test/CodeGen/PowerPC/vec-abi-align.ll b/test/CodeGen/PowerPC/vec-abi-align.ll
index 3239cf6..5075ff2 100644
--- a/test/CodeGen/PowerPC/vec-abi-align.ll
+++ b/test/CodeGen/PowerPC/vec-abi-align.ll

@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=-vsx < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -16,6 +17,10 @@
 ; CHECK-LABEL: @test1
 ; CHECK: stvx 2,
 ; CHECK: blr
+
+; CHECK-VSX-LABEL: @test1
+; CHECK-VSX: stxvw4x 34,
+; CHECK-VSX: blr
 }
 
 ; Function Attrs: nounwind
@@ -35,6 +40,13 @@
 ; CHECK: addi [[REGB:[0-9]+]], 1, 112
 ; CHECK: lvx 2, [[REGB]], [[REG16]]
 ; CHECK: blr
+
+; CHECK-VSX-LABEL: @test2
+; CHECK-VSX: ld {{[0-9]+}}, 112(1)
+; CHECK-VSX: li [[REG16:[0-9]+]], 16
+; CHECK-VSX: addi [[REGB:[0-9]+]], 1, 112
+; CHECK-VSX: lxvw4x {{[0-9]+}}, [[REGB]], [[REG16]]
+; CHECK-VSX: blr
 }
 
 ; Function Attrs: nounwind
@@ -54,6 +66,13 @@
 ; CHECK: addi [[REGB:[0-9]+]], 1, 128
 ; CHECK: lvx 2, [[REGB]], [[REG16]]
 ; CHECK: blr
+
+; CHECK-VSX-LABEL: @test3
+; CHECK-VSX: ld {{[0-9]+}}, 128(1)
+; CHECK-VSX: li [[REG16:[0-9]+]], 16
+; CHECK-VSX: addi [[REGB:[0-9]+]], 1, 128
+; CHECK-VSX: lxvw4x {{[0-9]+}}, [[REGB]], [[REG16]]
+; CHECK-VSX: blr
 }
 
 attributes #0 = { nounwind }

diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll
index 2733089..516b2dd 100644
--- a/test/CodeGen/PowerPC/vec_cmp.ll
+++ b/test/CodeGen/PowerPC/vec_cmp.ll

@@ -63,9 +63,8 @@
   ret <16 x i8> %sext
 }
 ; CHECK-LABEL:      v16si8_cmp_le:
-; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsb [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtsb [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <16 x i8> @v16ui8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -74,9 +73,8 @@
   ret <16 x i8> %sext
 }
 ; CHECK-LABEL:      v16ui8_cmp_le:
-; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtub [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtub [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <16 x i8> @v16si8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -121,9 +119,8 @@
   ret <16 x i8> %sext
 }
 ; CHECK-LABEL:      v16si8_cmp_ge:
-; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsb [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtsb [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <16 x i8> @v16ui8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -132,9 +129,8 @@
   ret <16 x i8> %sext
 }
 ; CHECK-LABEL:      v16ui8_cmp_ge:
-; CHECK:      vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtub [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtub [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 
 define <32 x i8> @v32si8_cmp(<32 x i8> %x, <32 x i8> %y) nounwind readnone {
@@ -193,9 +189,8 @@
   ret <8 x i16> %sext
 }
 ; CHECK-LABEL:      v8si16_cmp_le:
-; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsh [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtsh [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <8 x i16> @v8ui16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -204,9 +199,8 @@
   ret <8 x i16> %sext
 }
 ; CHECK-LABEL:      v8ui16_cmp_le:
-; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtuh [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtuh [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <8 x i16> @v8si16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -251,9 +245,8 @@
   ret <8 x i16> %sext
 }
 ; CHECK-LABEL:      v8si16_cmp_ge:
-; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsh [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtsh [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <8 x i16> @v8ui16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -262,9 +255,8 @@
   ret <8 x i16> %sext
 }
 ; CHECK-LABEL:      v8ui16_cmp_ge:
-; CHECK:      vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtuh [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtuh [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 
 define <16 x i16> @v16si16_cmp(<16 x i16> %x, <16 x i16> %y) nounwind readnone {
@@ -326,9 +318,8 @@
   ret <4 x i32> %sext
 }
 ; CHECK-LABEL:      v4si32_cmp_le:
-; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsw [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtsw [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <4 x i32> @v4ui32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -337,9 +328,8 @@
   ret <4 x i32> %sext
 }
 ; CHECK-LABEL:      v4ui32_cmp_le:
-; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtuw [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK:      vcmpgtuw [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <4 x i32> @v4si32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -384,9 +374,8 @@
   ret <4 x i32> %sext
 }
 ; CHECK-LABEL:      v4si32_cmp_ge:
-; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtsw [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtsw [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 define <4 x i32> @v4ui32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -395,9 +384,8 @@
   ret <4 x i32> %sext
 }
 ; CHECK-LABEL:      v4ui32_cmp_ge:
-; CHECK:      vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtuw [[RCMPGT:[0-9]+]], 2, 3
-; CHECK-NEXT: vor      2, [[RCMPGT]], [[RCMPEQ]]
+; CHECK:      vcmpgtuw [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
 
 
 define <8 x i32> @v8si32_cmp(<8 x i32> %x, <8 x i32> %y) nounwind readnone {
@@ -480,9 +468,7 @@
   ret <4 x float> %0
 }
 ; CHECK-LABEL:      v4f32_cmp_le:
-; CHECK:      vcmpeqfp [[RCMPEQ:[0-9]+]], 2, 3
-; CHECK-NEXT: vcmpgtfp [[RCMPLE:[0-9]+]], 3, 2
-; CHECK-NEXT: vor      2, [[RCMPLE]], [[RCMPEQ]]
+; CHECK: vcmpgefp 2, 3, 2
 
 define <4 x float> @v4f32_cmp_lt(<4 x float> %x, <4 x float> %y) nounwind readnone {
 entry:
@@ -514,6 +500,50 @@
 ; CHECK-LABEL: v4f32_cmp_gt:
 ; CHECK: vcmpgtfp 2, 2, 3
 
+define <4 x float> @v4f32_cmp_ule(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ule <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK-LABEL: v4f32_cmp_ule:
+; CHECK:      vcmpgtfp [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <4 x float> @v4f32_cmp_ult(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ult <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK-LABEL: v4f32_cmp_ult:
+; CHECK:      vcmpgefp [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <4 x float> @v4f32_cmp_uge(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp uge <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK-LABEL: v4f32_cmp_uge:
+; CHECK:      vcmpgtfp [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
+define <4 x float> @v4f32_cmp_ugt(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+  %cmp = fcmp ugt <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %0 = bitcast <4 x i32> %sext to <4 x float>
+  ret <4 x float> %0
+}
+; CHECK-LABEL: v4f32_cmp_ugt:
+; CHECK:      vcmpgefp [[RET:[0-9]+]], 3, 2
+; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+
 
 define <8 x float> @v8f32_cmp(<8 x float> %x, <8 x float> %y) nounwind readnone {
 entry:

diff --git a/test/CodeGen/PowerPC/vec_misaligned.ll b/test/CodeGen/PowerPC/vec_misaligned.ll
index 304a84d..73a4a4d 100644
--- a/test/CodeGen/PowerPC/vec_misaligned.ll
+++ b/test/CodeGen/PowerPC/vec_misaligned.ll

@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -mattr=-power8-vector | FileCheck %s
 ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
 
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"

diff --git a/test/CodeGen/PowerPC/vec_mul.ll b/test/CodeGen/PowerPC/vec_mul.ll
index 8a44815..86596d4 100644
--- a/test/CodeGen/PowerPC/vec_mul.ll
+++ b/test/CodeGen/PowerPC/vec_mul.ll

@@ -1,6 +1,8 @@
-; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -march=ppc32 -mattr=+altivec | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -march=ppc64 -mattr=+altivec | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64 -mattr=+altivec | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -march=ppc32 -mattr=+altivec -mattr=-vsx | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -march=ppc64 -mattr=+altivec -mattr=-vsx -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64 -mattr=+altivec -mattr=-vsx -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -march=ppc64 -mattr=+altivec -mattr=+vsx -mcpu=pwr7 | FileCheck %s -check-prefix=CHECK-VSX
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -march=ppc64 -mattr=+altivec -mattr=+vsx -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-LE-VSX
 
 define <4 x i32> @test_v4i32(<4 x i32>* %X, <4 x i32>* %Y) {
 	%tmp = load <4 x i32>* %X		; <<4 x i32>> [#uses=1]
@@ -14,6 +16,12 @@
 ; CHECK-LE-LABEL: test_v4i32:
 ; CHECK-LE: vmsumuhm
 ; CHECK-LE-NOT: mullw
+; CHECK-VSX-LABEL: test_v4i32:
+; CHECK-VSX: vmsumuhm
+; CHECK-VSX-NOT: mullw
+; CHECK-LE-VSX-LABEL: test_v4i32:
+; CHECK-LE-VSX: vmsumuhm
+; CHECK-LE-VSX-NOT: mullw
 
 define <8 x i16> @test_v8i16(<8 x i16>* %X, <8 x i16>* %Y) {
 	%tmp = load <8 x i16>* %X		; <<8 x i16>> [#uses=1]
@@ -27,6 +35,12 @@
 ; CHECK-LE-LABEL: test_v8i16:
 ; CHECK-LE: vmladduhm
 ; CHECK-LE-NOT: mullw
+; CHECK-VSX-LABEL: test_v8i16:
+; CHECK-VSX: vmladduhm
+; CHECK-VSX-NOT: mullw
+; CHECK-LE-VSX-LABEL: test_v8i16:
+; CHECK-LE-VSX: vmladduhm
+; CHECK-LE-VSX-NOT: mullw
 
 define <16 x i8> @test_v16i8(<16 x i8>* %X, <16 x i8>* %Y) {
 	%tmp = load <16 x i8>* %X		; <<16 x i8>> [#uses=1]
@@ -43,6 +57,15 @@
 ; CHECK-LE: vmuleub [[REG2:[0-9]+]]
 ; CHECK-LE: vperm {{[0-9]+}}, [[REG2]], [[REG1]]
 ; CHECK-LE-NOT: mullw
+; CHECK-VSX-LABEL: test_v16i8:
+; CHECK-VSX: vmuloub
+; CHECK-VSX: vmuleub
+; CHECK-VSX-NOT: mullw
+; CHECK-LE-VSX-LABEL: test_v16i8:
+; CHECK-LE-VSX: vmuloub [[REG1:[0-9]+]]
+; CHECK-LE-VSX: vmuleub [[REG2:[0-9]+]]
+; CHECK-LE-VSX: vperm {{[0-9]+}}, [[REG2]], [[REG1]]
+; CHECK-LE-VSX-NOT: mullw
 
 define <4 x float> @test_float(<4 x float>* %X, <4 x float>* %Y) {
 	%tmp = load <4 x float>* %X
@@ -61,3 +84,7 @@
 ; CHECK-LE: vspltisw [[ZNEG:[0-9]+]], -1
 ; CHECK-LE: vslw     {{[0-9]+}}, [[ZNEG]], [[ZNEG]]
 ; CHECK-LE: vmaddfp
+; CHECK-VSX-LABEL: test_float:
+; CHECK-VSX: xvmulsp
+; CHECK-LE-VSX-LABEL: test_float:
+; CHECK-LE-VSX: xvmulsp

diff --git a/test/CodeGen/PowerPC/vec_shuffle_le.ll b/test/CodeGen/PowerPC/vec_shuffle_le.ll
index 635721c..a4b2119 100644
--- a/test/CodeGen/PowerPC/vec_shuffle_le.ll
+++ b/test/CodeGen/PowerPC/vec_shuffle_le.ll

@@ -6,7 +6,9 @@
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-; CHECK: vpkuhum
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vpkuhum [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -27,7 +29,9 @@
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29>
-; CHECK: vpkuwum
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vpkuwum [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -48,7 +52,9 @@
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-; CHECK: vmrglb
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrglb [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -69,7 +75,9 @@
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-; CHECK: vmrghb
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrghb [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -90,7 +98,9 @@
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 2, i32 3, i32 18, i32 19, i32 4, i32 5, i32 20, i32 21, i32 6, i32 7, i32 22, i32 23>
-; CHECK: vmrglh
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrglh [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -111,7 +121,9 @@
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 9, i32 24, i32 25, i32 10, i32 11, i32 26, i32 27, i32 12, i32 13, i32 28, i32 29, i32 14, i32 15, i32 30, i32 31>
-; CHECK: vmrghh
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrghh [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -132,7 +144,9 @@
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23>
-; CHECK: vmrglw
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrglw [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -153,7 +167,9 @@
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
         %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
-; CHECK: vmrghw
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vmrghw [[REG3:[0-9]+]], [[REG2]], [[REG1]]
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -173,8 +189,10 @@
 ; CHECK: VSLDOI_xy:
         %tmp = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
-; CHECK: vsldoi
+        %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK: lvx [[REG1:[0-9]+]]
+; CHECK: lvx [[REG2:[0-9]+]]
+; CHECK: vsldoi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 4
         store <16 x i8> %tmp3, <16 x i8>* %A
         ret void
 }
@@ -183,7 +201,7 @@
 entry:
 ; CHECK: VSLDOI_xx:
         %tmp = load <16 x i8>* %A
-        %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+        %tmp2 = shufflevector <16 x i8> %tmp, <16 x i8> %tmp, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ; CHECK: vsldoi
         store <16 x i8> %tmp2, <16 x i8>* %A
         ret void

diff --git a/test/CodeGen/PowerPC/vec_urem_const.ll b/test/CodeGen/PowerPC/vec_urem_const.ll
new file mode 100644
index 0000000..814a826
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_urem_const.ll

@@ -0,0 +1,13 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Common code used to replace the urem by a mulhu, and compilation would
+; then crash since mulhu isn't supported on vector types.
+
+define <4 x i32> @test(<4 x i32> %x) {
+entry:
+  %0 = urem <4 x i32> %x, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+  ret <4 x i32> %0
+}

diff --git a/test/CodeGen/PowerPC/vrspill.ll b/test/CodeGen/PowerPC/vrspill.ll
index c3d1bf8..b55e129 100644
--- a/test/CodeGen/PowerPC/vrspill.ll
+++ b/test/CodeGen/PowerPC/vrspill.ll

@@ -1,5 +1,6 @@
-; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs -fast-isel=false < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -mattr=-vsx -verify-machineinstrs -fast-isel=false -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -mattr=+vsx -verify-machineinstrs -fast-isel=false -mcpu=pwr7 < %s | FileCheck -check-prefix=CHECK-VSX %s
 
 ; This verifies that we generate correct spill/reload code for vector regs.
 
@@ -15,4 +16,9 @@
 
 ; CHECK: stvx 2,
 
+; We would prefer to test for "stxvw4x 34," but current -O0 code
+; needlessly generates "vor 3,2,2 / stxvw4x 35,0,3", so we'll settle for
+; the opcode.
+; CHECK-VSX: stxvw4x
+
 declare void @foo(i32*)

diff --git a/test/CodeGen/PowerPC/vsx-div.ll b/test/CodeGen/PowerPC/vsx-div.ll
new file mode 100644
index 0000000..8a9578e
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-div.ll

@@ -0,0 +1,29 @@
+; RUN: llc -mcpu=pwr7 -mattr=+vsx -O1 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@vf_res = common global <4 x float> zeroinitializer, align 16
+@vd_res = common global <2 x double> zeroinitializer, align 16
+
+define void @test1() {
+entry:
+  %0 = load <4 x float>* @vf, align 16
+  %1 = tail call <4 x float> @llvm.ppc.vsx.xvdivsp(<4 x float> %0, <4 x float> %0)
+  store <4 x float> %1, <4 x float>* @vf_res, align 16
+  ret void
+}
+; CHECK-LABEL: @test1
+; CHECK: xvdivsp
+
+define void @test2() {
+entry:
+  %0 = load <2 x double>* @vd, align 16
+  %1 = tail call <2 x double> @llvm.ppc.vsx.xvdivdp(<2 x double> %0, <2 x double> %0)
+  store <2 x double> %1, <2 x double>* @vd_res, align 16
+  ret void
+}
+; CHECK-LABEL: @test2
+; CHECK: xvdivdp
+
+declare <2 x double> @llvm.ppc.vsx.xvdivdp(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.ppc.vsx.xvdivsp(<4 x float>, <4 x float>)

diff --git a/test/CodeGen/PowerPC/vsx-fma-m.ll b/test/CodeGen/PowerPC/vsx-fma-m.ll
index da4a204..9dff9a7 100644
--- a/test/CodeGen/PowerPC/vsx-fma-m.ll
+++ b/test/CodeGen/PowerPC/vsx-fma-m.ll

@@ -177,21 +177,27 @@
   store <2 x double> %1, <2 x double>* %arrayidx3, align 8
   ret void
 
+; Note: There is some unavoidable changeability in this variant.  If the
+; FMAs are reordered differently, the algorithm can pick a different
+; multiplicand to destroy, changing the register assignment.  There isn't
+; a good way to express this possibility, so hopefully this doesn't change
+; too often.
+
 ; CHECK-LABEL: @testv3
 ; CHECK-DAG: xxlor [[V1:[0-9]+]], 34, 34
-; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C1:[0-9]+]], 48
 ; CHECK-DAG: li [[C2:[0-9]+]], 32
-; CHECK-DAG: xvmaddadp 34, 35, 38
+; CHECK-DAG: xvmaddmdp 37, 35, 34
 ; CHECK-DAG: li [[C3:[0-9]+]], 16
 
 ; Note: We could convert this next FMA to M-type as well, but it would require
 ; re-ordering the instructions.
 ; CHECK-DAG: xvmaddadp [[V1]], 35, 36
 
-; CHECK-DAG: xvmaddmdp 35, 36, 37
+; CHECK-DAG: xvmaddmdp 36, 35, 37
+; CHECK-DAG: xvmaddadp 34, 35, 38
 ; CHECK-DAG: stxvd2x 32, 0, 3
-; CHECK-DAG: stxvd2x 35, 3, [[C1]]
+; CHECK-DAG: stxvd2x 36, 3, [[C1]]
 ; CHECK-DAG: stxvd2x 34, 3, [[C2]]
 ; CHECK-DAG: stxvd2x 37, 3, [[C3]]
 ; CHECK: blr

diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll
new file mode 100644
index 0000000..0c9ebef
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-ldst.ll

@@ -0,0 +1,36 @@
+; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64-unknown-linux-gnu < %s > %t
+; RUN: grep lxvw4x < %t | count 3
+; RUN: grep lxvd2x < %t | count 3
+; RUN: grep stxvw4x < %t | count 3
+; RUN: grep stxvd2x < %t | count 3
+
+@vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
+@vui = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vsll = global <2 x i64> <i64 255, i64 -937>, align 16
+@vull = global <2 x i64> <i64 1447, i64 2894>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@res_vsi = common global <4 x i32> zeroinitializer, align 16
+@res_vui = common global <4 x i32> zeroinitializer, align 16
+@res_vf = common global <4 x float> zeroinitializer, align 16
+@res_vsll = common global <2 x i64> zeroinitializer, align 16
+@res_vull = common global <2 x i64> zeroinitializer, align 16
+@res_vd = common global <2 x double> zeroinitializer, align 16
+
+; Function Attrs: nounwind
+define void @test1() {
+entry:
+  %0 = load <4 x i32>* @vsi, align 16
+  %1 = load <4 x i32>* @vui, align 16
+  %2 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 16
+  %3 = load <2 x double>* bitcast (<2 x i64>* @vsll to <2 x double>*), align 16
+  %4 = load <2 x double>* bitcast (<2 x i64>* @vull to <2 x double>*), align 16
+  %5 = load <2 x double>* @vd, align 16
+  store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
+  store <4 x i32> %1, <4 x i32>* @res_vui, align 16
+  store <4 x i32> %2, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 16
+  store <2 x double> %3, <2 x double>* bitcast (<2 x i64>* @res_vsll to <2 x double>*), align 16
+  store <2 x double> %4, <2 x double>* bitcast (<2 x i64>* @res_vull to <2 x double>*), align 16
+  store <2 x double> %5, <2 x double>* @res_vd, align 16
+  ret void
+}

diff --git a/test/CodeGen/PowerPC/vsx-minmax.ll b/test/CodeGen/PowerPC/vsx-minmax.ll
new file mode 100644
index 0000000..47f50ab
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-minmax.ll

@@ -0,0 +1,98 @@
+; RUN: llc -mcpu=pwr7 -mattr=+vsx -O0 -fast-isel=0 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@vf = global <4 x float> <float -1.500000e+00, float 2.500000e+00, float -3.500000e+00, float 4.500000e+00>, align 16
+@vd = global <2 x double> <double 3.500000e+00, double -7.500000e+00>, align 16
+@d = global double 2.340000e+01, align 8
+@vf1 = common global <4 x float> zeroinitializer, align 16
+@vd1 = common global <2 x double> zeroinitializer, align 16
+@vf2 = common global <4 x float> zeroinitializer, align 16
+@vf3 = common global <4 x float> zeroinitializer, align 16
+@vd2 = common global <2 x double> zeroinitializer, align 16
+@vf4 = common global <4 x float> zeroinitializer, align 16
+@d1 = common global double 0.000000e+00, align 8
+@d2 = common global double 0.000000e+00, align 8
+
+; Function Attrs: nounwind
+define void @test1() #0 {
+; CHECK-LABEL: @test1
+entry:
+  %0 = load volatile <4 x float>* @vf, align 16
+  %1 = load volatile <4 x float>* @vf, align 16
+  %2 = tail call <4 x float> @llvm.ppc.vsx.xvmaxsp(<4 x float> %0, <4 x float> %1)
+; CHECK: xvmaxsp
+  store <4 x float> %2, <4 x float>* @vf1, align 16
+  %3 = load <2 x double>* @vd, align 16
+  %4 = tail call <2 x double> @llvm.ppc.vsx.xvmaxdp(<2 x double> %3, <2 x double> %3)
+; CHECK: xvmaxdp
+  store <2 x double> %4, <2 x double>* @vd1, align 16
+  %5 = load volatile <4 x float>* @vf, align 16
+  %6 = load volatile <4 x float>* @vf, align 16
+  %7 = tail call <4 x float> @llvm.ppc.vsx.xvmaxsp(<4 x float> %5, <4 x float> %6)
+; CHECK: xvmaxsp
+  store <4 x float> %7, <4 x float>* @vf2, align 16
+  %8 = load volatile <4 x float>* @vf, align 16
+  %9 = load volatile <4 x float>* @vf, align 16
+  %10 = tail call <4 x float> @llvm.ppc.vsx.xvminsp(<4 x float> %8, <4 x float> %9)
+; CHECK: xvminsp
+  store <4 x float> %10, <4 x float>* @vf3, align 16
+  %11 = load <2 x double>* @vd, align 16
+  %12 = tail call <2 x double> @llvm.ppc.vsx.xvmindp(<2 x double> %11, <2 x double> %11)
+; CHECK: xvmindp
+  store <2 x double> %12, <2 x double>* @vd2, align 16
+  %13 = load volatile <4 x float>* @vf, align 16
+  %14 = load volatile <4 x float>* @vf, align 16
+  %15 = tail call <4 x float> @llvm.ppc.vsx.xvminsp(<4 x float> %13, <4 x float> %14)
+; CHECK: xvminsp
+  store <4 x float> %15, <4 x float>* @vf4, align 16
+  %16 = load double* @d, align 8
+  %17 = tail call double @llvm.ppc.vsx.xsmaxdp(double %16, double %16)
+; CHECK: xsmaxdp
+  store double %17, double* @d1, align 8
+  %18 = tail call double @llvm.ppc.vsx.xsmindp(double %16, double %16)
+; CHECK: xsmindp
+  store double %18, double* @d2, align 8
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare double @llvm.ppc.vsx.xsmaxdp(double, double)
+
+; Function Attrs: nounwind readnone
+declare double @llvm.ppc.vsx.xsmindp(double, double)
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.ppc.vsx.xvminsp(<4 x float>, <4 x float>)
+
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.xvmindp(<2 x double>, <2 x double>)
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.ppc.vsx.xvmaxsp(<4 x float>, <4 x float>)
+
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.xvmaxdp(<2 x double>, <2 x double>)
+
+; Generated from C source:
+
+; % clang -O1 -maltivec -mvsx -S -emit-llvm vsx-minmax.c
+;
+;volatile vector float vf = { -1.5, 2.5, -3.5, 4.5 };
+;vector double vd = { 3.5, -7.5 };
+;double d = 23.4;
+;
+;vector float vf1, vf2, vf3, vf4;
+;vector double vd1, vd2;
+;double d1, d2;
+;
+;void test1() {
+;  vf1 = vec_max(vf, vf);
+;  vd1 = vec_max(vd, vd);
+;  vf2 = vec_vmaxfp(vf, vf);
+;  vf3 = vec_min(vf, vf);
+;  vd2 = vec_min(vd, vd);
+;  vf4 = vec_vminfp(vf, vf);
+;  d1 = __builtin_vsx_xsmaxdp(d, d);
+;  d2 = __builtin_vsx_xsmindp(d, d);
+;}

diff --git a/test/CodeGen/PowerPC/vsx-p8.ll b/test/CodeGen/PowerPC/vsx-p8.ll
new file mode 100644
index 0000000..81406b6
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx-p8.ll

@@ -0,0 +1,42 @@
+; RUN: llc -mcpu=pwr8 -mattr=+power8-vector < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Unaligned loads/stores on P8 and later should use VSX where possible.
+
+define <2 x double> @test28u(<2 x double>* %a) {
+  %v = load <2 x double>* %a, align 8
+  ret <2 x double> %v
+
+; CHECK-LABEL: @test28u
+; CHECK: lxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test29u(<2 x double>* %a, <2 x double> %b) {
+  store <2 x double> %b, <2 x double>* %a, align 8
+  ret void
+
+; CHECK-LABEL: @test29u
+; CHECK: stxvd2x 34, 0, 3
+; CHECK: blr
+}
+
+define <4 x float> @test32u(<4 x float>* %a) {
+  %v = load <4 x float>* %a, align 8
+  ret <4 x float> %v
+
+; CHECK-LABEL: @test32u
+; CHECK: lxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test33u(<4 x float>* %a, <4 x float> %b) {
+  store <4 x float> %b, <4 x float>* %a, align 8
+  ret void
+
+; CHECK-LABEL: @test33u
+; CHECK: stxvw4x 34, 0, 3
+; CHECK: blr
+}
+

diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll
index f5ac577..333b75a 100644
--- a/test/CodeGen/PowerPC/vsx.ll
+++ b/test/CodeGen/PowerPC/vsx.ll

@@ -356,6 +356,63 @@
 ; CHECK: blr
 }
 
+define <4 x float> @test32(<4 x float>* %a) {
+  %v = load <4 x float>* %a, align 16
+  ret <4 x float> %v
+
+; CHECK-LABEL: @test32
+; CHECK: lxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test33(<4 x float>* %a, <4 x float> %b) {
+  store <4 x float> %b, <4 x float>* %a, align 16
+  ret void
+
+; CHECK-LABEL: @test33
+; CHECK: stxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define <4 x float> @test32u(<4 x float>* %a) {
+  %v = load <4 x float>* %a, align 8
+  ret <4 x float> %v
+
+; CHECK-LABEL: @test32u
+; CHECK-DAG: lvsl
+; CHECK-DAG: lvx
+; CHECK-DAG: lvx
+; CHECK: vperm 2,
+; CHECK: blr
+}
+
+define void @test33u(<4 x float>* %a, <4 x float> %b) {
+  store <4 x float> %b, <4 x float>* %a, align 8
+  ret void
+
+; CHECK-LABEL: @test33u
+; CHECK: stxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define <4 x i32> @test34(<4 x i32>* %a) {
+  %v = load <4 x i32>* %a, align 16
+  ret <4 x i32> %v
+
+; CHECK-LABEL: @test34
+; CHECK: lxvw4x 34, 0, 3
+; CHECK: blr
+}
+
+define void @test35(<4 x i32>* %a, <4 x i32> %b) {
+  store <4 x i32> %b, <4 x i32>* %a, align 16
+  ret void
+
+; CHECK-LABEL: @test35
+; CHECK: stxvw4x 34, 0, 3
+; CHECK: blr
+}
+
 define <2 x double> @test40(<2 x i64> %a) {
   %v = uitofp <2 x i64> %a to <2 x double>
   ret <2 x double> %v
@@ -634,7 +691,7 @@
 ; CHECK-DAG: addi [[R1:[0-9]+]], 3, 3
 ; CHECK-DAG: addi [[R2:[0-9]+]], 1, -16
 ; CHECK-DAG: addi [[R3:[0-9]+]], 3, 2
-; CHECK: std [[R1]], 8([[R2]])
+; CHECK: std [[R1]], -8(1)
 ; CHECK: std [[R3]], -16(1)
 ; CHECK: lxvd2x 34, 0, [[R2]]
 ; CHECK-NOT: stxvd2x
@@ -649,3 +706,14 @@
 ; CHECK: blr
 }
 
+define double @test82(double %a, double %b, double %c, double %d) {
+entry:
+  %m = fcmp oeq double %c, %d
+  %v = select i1 %m, double %a, double %b
+  ret double %v
+
+; CHECK-LABEL: @test82
+; CHECK: xscmpudp [[REG:[0-9]+]], 3, 4
+; CHECK: beqlr [[REG]]
+}
+

diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll
index 3c4fcf7..d9b0ff2 100644
--- a/test/CodeGen/R600/128bit-kernel-args.ll
+++ b/test/CodeGen/R600/128bit-kernel-args.ll

@@ -1,27 +1,27 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK: @v4i32_kernel_arg
+; R600-CHECK: {{^}}v4i32_kernel_arg:
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
-; SI-CHECK: @v4i32_kernel_arg
-; SI-CHECK: BUFFER_STORE_DWORDX4
+; SI-CHECK: {{^}}v4i32_kernel_arg:
+; SI-CHECK: buffer_store_dwordx4
 define void @v4i32_kernel_arg(<4 x i32> addrspace(1)* %out, <4 x i32>  %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: @v4f32_kernel_arg
+; R600-CHECK: {{^}}v4f32_kernel_arg:
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Y, KC0[3].Z
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].Z, KC0[3].W
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR]].W, KC0[4].X
-; SI-CHECK: @v4f32_kernel_arg
-; SI-CHECK: BUFFER_STORE_DWORDX4
-define void @v4f32_kernel_args(<4 x float> addrspace(1)* %out, <4 x float>  %in) {
+; SI-CHECK: {{^}}v4f32_kernel_arg:
+; SI-CHECK: buffer_store_dwordx4
+define void @v4f32_kernel_arg(<4 x float> addrspace(1)* %out, <4 x float>  %in) {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out
   ret void

diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll
index 7dec426..4ff2762 100644
--- a/test/CodeGen/R600/32-bit-local-address-space.ll
+++ b/test/CodeGen/R600/32-bit-local-address-space.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
 ; the global address space(1) uses 64-bit pointers.  These tests check to make sure
@@ -9,9 +9,9 @@
 ; Instructions with B32, U32, and I32 in their name take 32-bit operands, while
 ; instructions with B64, U64, and I64 take 64-bit operands.
 
-; CHECK-LABEL: @local_address_load
-; CHECK: V_MOV_B32_e{{32|64}} [[PTR:v[0-9]]]
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[PTR]]
+; FUNC-LABEL: {{^}}local_address_load:
+; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
+; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
 define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = load i32 addrspace(3)* %in
@@ -19,10 +19,10 @@
   ret void
 }
 
-; CHECK-LABEL: @local_address_gep
-; CHECK: S_ADD_I32 [[SPTR:s[0-9]]]
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; CHECK: DS_READ_B32 [[VPTR]]
+; FUNC-LABEL: {{^}}local_address_gep:
+; SI: s_add_i32 [[SPTR:s[0-9]]]
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_read_b32 [[VPTR]]
 define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
 entry:
   %0 = getelementptr i32 addrspace(3)* %in, i32 %offset
@@ -31,9 +31,9 @@
   ret void
 }
 
-; CHECK-LABEL: @local_address_gep_const_offset
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VPTR]], 0x4,
+; FUNC-LABEL: {{^}}local_address_gep_const_offset:
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
 define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(3)* %in, i32 1
@@ -43,10 +43,10 @@
 }
 
 ; Offset too large, can't fold into 16-bit immediate offset.
-; CHECK-LABEL: @local_address_gep_large_const_offset
-; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; CHECK: DS_READ_B32 [[VPTR]]
+; FUNC-LABEL: {{^}}local_address_gep_large_const_offset:
+; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_read_b32 [[VPTR]]
 define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(3)* %in, i32 16385
@@ -55,10 +55,10 @@
   ret void
 }
 
-; CHECK-LABEL: @null_32bit_lds_ptr:
-; CHECK: V_CMP_NE_I32
-; CHECK-NOT: V_CMP_NE_I32
-; CHECK: V_CNDMASK_B32
+; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
+; SI: v_cmp_ne_i32
+; SI-NOT: v_cmp_ne_i32
+; SI: v_cndmask_b32
 define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
   %cmp = icmp ne i32 addrspace(3)* %lds, null
   %x = select i1 %cmp, i32 123, i32 456
@@ -66,10 +66,10 @@
   ret void
 }
 
-; CHECK-LABEL: @mul_32bit_ptr:
-; CHECK: V_MUL_LO_I32
-; CHECK-NEXT: V_ADD_I32_e32
-; CHECK-NEXT: DS_READ_B32
+; FUNC-LABEL: {{^}}mul_32bit_ptr:
+; SI: s_mul_i32
+; SI-NEXT: s_add_i32
+; SI: ds_read_b32
 define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
   %ptr = getelementptr [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
   %val = load float addrspace(3)* %ptr
@@ -77,11 +77,11 @@
   ret void
 }
 
-@g_lds = addrspace(3) global float zeroinitializer, align 4
+@g_lds = addrspace(3) global float undef, align 4
 
-; CHECK-LABEL: @infer_ptr_alignment_global_offset:
-; CHECK: V_MOV_B32_e32 [[REG:v[0-9]+]], 0
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[REG]]
+; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
 define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
   %val = load float addrspace(3)* @g_lds
   store float %val, float addrspace(1)* %out
@@ -89,37 +89,37 @@
 }
 
 
-@ptr = addrspace(3) global i32 addrspace(3)* null
-@dst = addrspace(3) global [16384 x i32] zeroinitializer
+@ptr = addrspace(3) global i32 addrspace(3)* undef
+@dst = addrspace(3) global [16384 x i32] undef
 
-; CHECK-LABEL: @global_ptr:
-; CHECK: DS_WRITE_B32
+; FUNC-LABEL: {{^}}global_ptr:
+; SI: ds_write_b32
 define void @global_ptr() nounwind {
   store i32 addrspace(3)* getelementptr ([16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
   ret void
 }
 
-; CHECK-LABEL: @local_address_store
-; CHECK: DS_WRITE_B32
+; FUNC-LABEL: {{^}}local_address_store:
+; SI: ds_write_b32
 define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
   store i32 %val, i32 addrspace(3)* %out
   ret void
 }
 
-; CHECK-LABEL: @local_address_gep_store
-; CHECK: S_ADD_I32 [[SADDR:s[0-9]+]],
-; CHECK: V_MOV_B32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
-; CHECK: DS_WRITE_B32 [[ADDR]], v{{[0-9]+}},
+; FUNC-LABEL: {{^}}local_address_gep_store:
+; SI: s_add_i32 [[SADDR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
+; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
 define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
   %gep = getelementptr i32 addrspace(3)* %out, i32 %offset
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
 }
 
-; CHECK-LABEL: @local_address_gep_const_offset_store
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
-; CHECK: V_MOV_B32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
-; CHECK: DS_WRITE_B32 [[VPTR]], [[VAL]], 0x4
+; FUNC-LABEL: {{^}}local_address_gep_const_offset_store:
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
+; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
+; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
 define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32 addrspace(3)* %out, i32 1
   store i32 %val, i32 addrspace(3)* %gep, align 4
@@ -127,10 +127,10 @@
 }
 
 ; Offset too large, can't fold into 16-bit immediate offset.
-; CHECK-LABEL: @local_address_gep_large_const_offset_store
-; CHECK: S_ADD_I32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
-; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; CHECK: DS_WRITE_B32 [[VPTR]], v{{[0-9]+}}, 0
+; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store:
+; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_write_b32 [[VPTR]], v{{[0-9]+}} [M0]{{$}}
 define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32 addrspace(3)* %out, i32 16385
   store i32 %val, i32 addrspace(3)* %gep, align 4

diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
index 2d82c1e..cf4e055 100644
--- a/test/CodeGen/R600/64bit-kernel-args.ll
+++ b/test/CodeGen/R600/64bit-kernel-args.ll

@@ -1,9 +1,9 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; SI-CHECK: @f64_kernel_arg
-; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
-; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
-; SI-CHECK: BUFFER_STORE_DWORDX2
+; SI-CHECK: {{^}}f64_kernel_arg:
+; SI-CHECK-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
+; SI-CHECK-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
+; SI-CHECK: buffer_store_dwordx2
 define void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
 entry:
   store double %in, double addrspace(1)* %out

diff --git a/test/CodeGen/R600/add-debug.ll b/test/CodeGen/R600/add-debug.ll
new file mode 100644
index 0000000..166e0f6
--- /dev/null
+++ b/test/CodeGen/R600/add-debug.ll

@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=tahiti -debug
+; REQUIRES: asserts
+
+; Check that SelectionDAGDumper does not crash on int_SI_if.
+define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+

diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll
index 711a2bc..767a642 100644
--- a/test/CodeGen/R600/add.ll
+++ b/test/CodeGen/R600/add.ll

@@ -1,12 +1,12 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK --check-prefix=FUNC %s
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=FUNC %s
 
-;FUNC-LABEL: @test1:
+;FUNC-LABEL: {{^}}test1:
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: V_ADD_I32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
 ;SI-CHECK-NOT: [[REG]]
-;SI-CHECK: BUFFER_STORE_DWORD [[REG]],
+;SI-CHECK: buffer_store_dword [[REG]],
 define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
   %a = load i32 addrspace(1)* %in
@@ -16,12 +16,12 @@
   ret void
 }
 
-;FUNC-LABEL: @test2:
+;FUNC-LABEL: {{^}}test2:
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -32,16 +32,16 @@
   ret void
 }
 
-;FUNC-LABEL: @test4:
+;FUNC-LABEL: {{^}}test4:
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_add_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -52,7 +52,7 @@
   ret void
 }
 
-; FUNC-LABEL: @test8
+; FUNC-LABEL: {{^}}test8:
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
@@ -61,14 +61,14 @@
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
 define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
 entry:
   %0 = add <8 x i32> %a, %b
@@ -76,7 +76,7 @@
   ret void
 }
 
-; FUNC-LABEL: @test16
+; FUNC-LABEL: {{^}}test16:
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
@@ -93,22 +93,22 @@
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
 ; EG-CHECK: ADD_INT
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADD_I32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
+; SI-CHECK: s_add_i32
 define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
 entry:
   %0 = add <16 x i32> %a, %b
@@ -116,9 +116,9 @@
   ret void
 }
 
-; FUNC-LABEL: @add64
-; SI-CHECK: S_ADD_I32
-; SI-CHECK: S_ADDC_U32
+; FUNC-LABEL: {{^}}add64:
+; SI-CHECK: s_add_u32
+; SI-CHECK: s_addc_u32
 define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = add i64 %a, %b
@@ -126,13 +126,13 @@
   ret void
 }
 
-; The V_ADDC_U32 and V_ADD_I32 instruction can't read SGPRs, because they
+; The v_addc_u32 and v_add_i32 instructions can't read SGPRs, because they
 ; use VCC.  The test is designed so that %a will be stored in an SGPR and
 ; %0 will be stored in a VGPR, so the comiler will be forced to copy %a
 ; to a VGPR before doing the add.
 
-; FUNC-LABEL: @add64_sgpr_vgpr
-; SI-CHECK-NOT: V_ADDC_U32_e32 s
+; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
+; SI-CHECK-NOT: v_addc_u32_e32 s
 define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64 addrspace(1)* %in
@@ -141,12 +141,10 @@
   ret void
 }
 
-; Test i64 add inside a branch.  We don't allow SALU instructions inside of
-; branches.
-; FIXME: We are being conservative here.  We could allow this in some cases.
-; FUNC-LABEL: @add64_in_branch
-; SI-CHECK-NOT: S_ADD_I32
-; SI-CHECK-NOT: S_ADDC_U32
+; Test i64 add inside a branch.
+; FUNC-LABEL: {{^}}add64_in_branch:
+; SI-CHECK: s_add_u32
+; SI-CHECK: s_addc_u32
 define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0

diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
index f733d90..47ecf6d 100644
--- a/test/CodeGen/R600/add_i64.ll
+++ b/test/CodeGen/R600/add_i64.ll

@@ -3,9 +3,9 @@
 
 declare i32 @llvm.r600.read.tidig.x() readnone
 
-; SI-LABEL: @test_i64_vreg:
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; SI-LABEL: {{^}}test_i64_vreg:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr i64 addrspace(1)* %inA, i32 %tid
@@ -18,9 +18,9 @@
 }
 
 ; Check that the SGPR add operand is correctly moved to a VGPR.
-; SI-LABEL: @sgpr_operand:
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; SI-LABEL: {{^}}sgpr_operand:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
   %foo = load i64 addrspace(1)* %in, align 8
   %result = add i64 %foo, %a
@@ -31,9 +31,9 @@
 ; Swap the arguments. Check that the SGPR -> VGPR copy works with the
 ; SGPR as other operand.
 ;
-; SI-LABEL: @sgpr_operand_reversed:
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; SI-LABEL: {{^}}sgpr_operand_reversed:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
   %foo = load i64 addrspace(1)* %in, align 8
   %result = add i64 %a, %foo
@@ -42,22 +42,22 @@
 }
 
 
-; SI-LABEL: @test_v2i64_sreg:
-; SI: S_ADD_I32
-; SI: S_ADDC_U32
-; SI: S_ADD_I32
-; SI: S_ADDC_U32
+; SI-LABEL: {{^}}test_v2i64_sreg:
+; SI: s_add_u32
+; SI: s_addc_u32
+; SI: s_add_u32
+; SI: s_addc_u32
 define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
   %result = add <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: @test_v2i64_vreg:
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; SI-LABEL: {{^}}test_v2i64_vreg:
+; SI: v_add_i32
+; SI: v_addc_u32
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr <2 x i64> addrspace(1)* %inA, i32 %tid
@@ -69,13 +69,13 @@
   ret void
 }
 
-; SI-LABEL: @trunc_i64_add_to_i32
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
-; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
-; SI-NOT: ADDC
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
+; SI-LABEL: {{^}}trunc_i64_add_to_i32:
+; SI: s_load_dword s[[SREG0:[0-9]+]]
+; SI: s_load_dword s[[SREG1:[0-9]+]]
+; SI: s_add_i32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
+; SI-NOT: addc
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
 define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = add i64 %b, %a
   %trunc = trunc i64 %add to i32

diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll
index f75a8ac..d04afe6 100644
--- a/test/CodeGen/R600/address-space.ll
+++ b/test/CodeGen/R600/address-space.ll

@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
 
 ; Test that codegenprepare understands address space sizes
 
@@ -7,11 +7,11 @@
 ; FIXME: Extra V_MOV from SGPR to VGPR for second read. The address is
 ; already in a VGPR after the first read.
 
-; CHECK-LABEL: @do_as_ptr_calcs:
-; CHECK: S_LOAD_DWORD [[SREG1:s[0-9]+]],
-; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
-; CHECK: DS_READ_B32 v{{[0-9]+}}, [[VREG1]], 0x14
-; CHECK: DS_READ_B32 v{{[0-9]+}}, v{{[0-9]+}}, 0xc
+; CHECK-LABEL: {{^}}do_as_ptr_calcs:
+; CHECK: s_load_dword [[SREG1:s[0-9]+]],
+; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
+; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
+; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:20
 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
   %x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0

diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll
index cf11481..9a76fce 100644
--- a/test/CodeGen/R600/and.ll
+++ b/test/CodeGen/R600/and.ll

@@ -1,12 +1,12 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @test2
+; FUNC-LABEL: {{^}}test2:
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -17,16 +17,16 @@
   ret void
 }
 
-; FUNC-LABEL: @test4
+; FUNC-LABEL: {{^}}test4:
 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -37,24 +37,24 @@
   ret void
 }
 
-; FUNC-LABEL: @s_and_i32
-; SI: S_AND_B32
+; FUNC-LABEL: {{^}}s_and_i32:
+; SI: s_and_b32
 define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, %b
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @s_and_constant_i32
-; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
+; FUNC-LABEL: {{^}}s_and_constant_i32:
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
 define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
   %and = and i32 %a, 1234567
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @v_and_i32
-; SI: V_AND_B32
+; FUNC-LABEL: {{^}}v_and_i32:
+; SI: v_and_b32
 define void @v_and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -63,8 +63,8 @@
   ret void
 }
 
-; FUNC-LABEL: @v_and_constant_i32
-; SI: V_AND_B32
+; FUNC-LABEL: {{^}}v_and_constant_i32:
+; SI: v_and_b32
 define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 1234567
@@ -72,25 +72,34 @@
   ret void
 }
 
-; FUNC-LABEL: @s_and_i64
-; SI: S_AND_B64
+; FUNC-LABEL: {{^}}s_and_i64:
+; SI: s_and_b64
 define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and = and i64 %a, %b
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @s_and_constant_i64
-; SI: S_AND_B64
+; FIXME: Should use SGPRs
+; FUNC-LABEL: {{^}}s_and_i1:
+; SI: v_and_b32
+define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
+  %and = and i1 %a, %b
+  store i1 %and, i1 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_constant_i64:
+; SI: s_and_b64
 define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 281474976710655
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @v_and_i64
-; SI: V_AND_B32
-; SI: V_AND_B32
+; FUNC-LABEL: {{^}}v_and_i64:
+; SI: v_and_b32
+; SI: v_and_b32
 define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
   %a = load i64 addrspace(1)* %aptr, align 8
   %b = load i64 addrspace(1)* %bptr, align 8
@@ -99,12 +108,51 @@
   ret void
 }
 
-; FUNC-LABEL: @v_and_constant_i64
-; SI: V_AND_B32
-; SI: V_AND_B32
+; FUNC-LABEL: {{^}}v_and_i64_br:
+; SI: v_and_b32
+; SI: v_and_b32
+define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i32 %cond) {
+entry:
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %if, label %endif
+
+if:
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
+  %and = and i64 %a, %b
+  br label %endif
+
+endif:
+  %tmp1 = phi i64 [%and, %if], [0, %entry]
+  store i64 %tmp1, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_and_constant_i64:
+; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_and_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FIXME: Replace and 0 with mov 0
+; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
+; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
+; SI: v_and_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
+define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %and = and i64 %a, 64
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_and_inline_imm_i64:
+; SI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 64
+define void @s_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+  %and = and i64 %a, 64
+  store i64 %and, i64 addrspace(1)* %out, align 8
+  ret void
+}

diff --git a/test/CodeGen/R600/anyext.ll b/test/CodeGen/R600/anyext.ll
index bbe5d0a..23fdcbb 100644
--- a/test/CodeGen/R600/anyext.ll
+++ b/test/CodeGen/R600/anyext.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-; CHECK-LABEL: @anyext_i1_i32
-; CHECK: V_CNDMASK_B32_e64
+; CHECK-LABEL: {{^}}anyext_i1_i32:
+; CHECK: v_cndmask_b32_e64
 define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp eq i32 %cond, 0

diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll
index 3230353..84d3540 100644
--- a/test/CodeGen/R600/array-ptr-calc-i32.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i32.ll

@@ -1,4 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() nounwind readnone
 declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
@@ -8,14 +9,21 @@
 ; 64-bit pointer add. This should work since private pointers should
 ; be 32-bits.
 
-; SI-LABEL: @test_private_array_ptr_calc:
-; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
+; SI-LABEL: {{^}}test_private_array_ptr_calc:
+
+; FIXME: We end up with zero argument for ADD, because
+; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
+; with the appropriate offset.  We should fold this into the store.
+; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], 0, v{{[0-9]+}}
+; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}]
 ;
 ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
 ; alloca to a vector.  It currently fails because it does not know how
 ; to interpret:
 ; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
-; SI: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
+
+; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], 16
+; SI-PROMOTE: ds_write_b32 [[PTRREG]]
 define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %alloca = alloca [4 x i32], i32 4, align 16
   %tid = call i32 @llvm.SI.tid() readnone

diff --git a/test/CodeGen/R600/array-ptr-calc-i64.ll b/test/CodeGen/R600/array-ptr-calc-i64.ll
index e254c5f..75f6394 100644
--- a/test/CodeGen/R600/array-ptr-calc-i64.ll
+++ b/test/CodeGen/R600/array-ptr-calc-i64.ll

@@ -1,13 +1,13 @@
-; XFAIL: *
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() readnone
 
-
-; SI-LABEL: @test_array_ptr_calc(
-define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [16 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+; SI-LABEL: {{^}}test_array_ptr_calc:
+; SI: v_mul_lo_i32
+; SI: v_mul_hi_i32
+define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.SI.tid() readnone
-  %a_ptr = getelementptr [16 x i32] addrspace(1)* %inA, i32 1, i32 %tid
+  %a_ptr = getelementptr [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
   %b_ptr = getelementptr i32 addrspace(1)* %inB, i32 %tid
   %a = load i32 addrspace(1)* %a_ptr
   %b = load i32 addrspace(1)* %b_ptr
@@ -15,4 +15,3 @@
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
-

diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll
index eb9539e..223f4d3 100644
--- a/test/CodeGen/R600/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll

@@ -1,13 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i32_offset:
-; SI: S_LOAD_DWORD [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: V_MOV_B32_e32 [[VCMP:v[0-9]+]], 7
-; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: V_MOV_B32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
-; SI: DS_CMPST_RTN_B32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]], 0x10, [M0]
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; SI: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
+; SI: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
@@ -16,18 +17,18 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_cmpxchg_ret_i64_offset:
-; SI: S_LOAD_DWORDX2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: S_MOV_B64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
-; SI-DAG: V_MOV_B32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
-; SI-DAG: V_MOV_B32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
-; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI-DAG: V_MOV_B32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
-; SI-DAG: V_MOV_B32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
-; SI: DS_CMPST_RTN_B64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}}, 0x20, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: s_mov_b64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
+; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
+; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; SI: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 [M0]
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
@@ -35,3 +36,50 @@
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_bad_si_offset
+; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 [M0]
+; SI: s_endpgm
+define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i32_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
+; SI-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
+; SI: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 [M0]
+; SI: s_endpgm
+define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_mov_b64  s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
+; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
+; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
+; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
+; SI: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 [M0]
+; SI: s_endpgm
+define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
+  %result = extractvalue { i64, i1 } %pair, 0
+  ret void
+}

diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll
index c26f9cd..f0eff21 100644
--- a/test/CodeGen/R600/atomic_load_add.ll
+++ b/test/CodeGen/R600/atomic_load_add.ll

@@ -1,35 +1,35 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; FUNC-LABEL: @atomic_add_local
+; FUNC-LABEL: {{^}}atomic_add_local:
 ; R600: LDS_ADD *
-; SI: DS_ADD_RTN_U32
+; SI: ds_add_u32
 define void @atomic_add_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
 
-; FUNC-LABEL: @atomic_add_local_const_offset
+; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
 ; R600: LDS_ADD *
-; SI: DS_ADD_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
 }
 
-; FUNC-LABEL: @atomic_add_ret_local
+; FUNC-LABEL: {{^}}atomic_add_ret_local:
 ; R600: LDS_ADD_RET *
-; SI: DS_ADD_RTN_U32
+; SI: ds_add_rtn_u32
 define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @atomic_add_ret_local_const_offset
+; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
 ; R600: LDS_ADD_RET *
-; SI: DS_ADD_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x14
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
 define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst

diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll
index 3569d91..61ff296 100644
--- a/test/CodeGen/R600/atomic_load_sub.ll
+++ b/test/CodeGen/R600/atomic_load_sub.ll

@@ -1,35 +1,35 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @atomic_sub_local
+; FUNC-LABEL: {{^}}atomic_sub_local:
 ; R600: LDS_SUB *
-; SI: DS_SUB_RTN_U32
+; SI: ds_sub_u32
 define void @atomic_sub_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
 
-; FUNC-LABEL: @atomic_sub_local_const_offset
+; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
 ; R600: LDS_SUB *
-; SI: DS_SUB_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
+; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
 }
 
-; FUNC-LABEL: @atomic_sub_ret_local
+; FUNC-LABEL: {{^}}atomic_sub_ret_local:
 ; R600: LDS_SUB_RET *
-; SI: DS_SUB_RTN_U32
+; SI: ds_sub_rtn_u32
 define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @atomic_sub_ret_local_const_offset
+; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
 ; R600: LDS_SUB_RET *
-; SI: DS_SUB_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x14
+; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
 define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst

diff --git a/test/CodeGen/R600/basic-branch.ll b/test/CodeGen/R600/basic-branch.ll
index d084132..073ab79 100644
--- a/test/CodeGen/R600/basic-branch.ll
+++ b/test/CodeGen/R600/basic-branch.ll

@@ -1,7 +1,7 @@
 ; XFAIL: *
 ; RUN: llc -O0 -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
 
-; CHECK-LABEL: @test_branch(
+; CHECK-LABEL: {{^}}test_branch(
 define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
   %cmp = icmp ne i32 %val, 0
   br i1 %cmp, label %store, label %end

diff --git a/test/CodeGen/R600/basic-loop.ll b/test/CodeGen/R600/basic-loop.ll
index 6d0ff07..3cd609135 100644
--- a/test/CodeGen/R600/basic-loop.ll
+++ b/test/CodeGen/R600/basic-loop.ll

@@ -1,7 +1,7 @@
 ; XFAIL: *
 ; RUN: llc -O0 -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s
 
-; CHECK-LABEL: @test_loop:
+; CHECK-LABEL: {{^}}test_loop:
 define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
 entry:
   br label %loop.body

diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll
index fe466e6..6fe23e9 100644
--- a/test/CodeGen/R600/bfe_uint.ll
+++ b/test/CodeGen/R600/bfe_uint.ll

@@ -2,7 +2,7 @@
 
 ; XFAIL: *
 
-; CHECK: @bfe_def
+; CHECK: {{^}}bfe_def:
 ; CHECK: BFE_UINT
 define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
 entry:
@@ -17,7 +17,7 @@
 ; implmented with a LSHR instruction, which is better, because LSHR has less
 ; operands and requires less constants.
 
-; CHECK: @bfe_shift
+; CHECK: {{^}}bfe_shift:
 ; CHECK-NOT: BFE_UINT
 define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
 entry:

diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
index bbfe856..2a0bb37 100644
--- a/test/CodeGen/R600/bfi_int.ll
+++ b/test/CodeGen/R600/bfi_int.ll

@@ -4,10 +4,10 @@
 ; BFI_INT Definition pattern from ISA docs
 ; (y & x) | (z & ~x)
 ;
-; R600-CHECK: @bfi_def
+; R600-CHECK: {{^}}bfi_def:
 ; R600-CHECK: BFI_INT
 ; SI-CHECK:   @bfi_def
-; SI-CHECK:   V_BFI_B32
+; SI-CHECK:   v_bfi_b32
 define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %x, -1
@@ -20,10 +20,10 @@
 
 ; SHA-256 Ch function
 ; z ^ (x & (y ^ z))
-; R600-CHECK: @bfi_sha256_ch
+; R600-CHECK: {{^}}bfi_sha256_ch:
 ; R600-CHECK: BFI_INT
 ; SI-CHECK:   @bfi_sha256_ch
-; SI-CHECK:   V_BFI_B32
+; SI-CHECK:   v_bfi_b32
 define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %y, %z
@@ -35,11 +35,11 @@
 
 ; SHA-256 Ma function
 ; ((x & z) | (y & (x | z)))
-; R600-CHECK: @bfi_sha256_ma
+; R600-CHECK: {{^}}bfi_sha256_ma:
 ; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
 ; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
-; SI-CHECK: V_XOR_B32_e64 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}}
-; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{[sv][0-9]+, [sv][0-9]+}}
+; SI-CHECK: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
+; SI-CHECK: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
 
 define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:

diff --git a/test/CodeGen/R600/big_alu.ll b/test/CodeGen/R600/big_alu.ll
index 511e8ef..28be216 100644
--- a/test/CodeGen/R600/big_alu.ll
+++ b/test/CodeGen/R600/big_alu.ll

@@ -1,5 +1,4 @@
 ;RUN: llc < %s -march=r600 -mcpu=cedar
-;REQUIRES: asserts
 
 ;This test ensures that R600 backend can handle ifcvt properly
 ;and do not generate ALU clauses with more than 128 instructions.

diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll
index 0be79e6..725d5ba 100644
--- a/test/CodeGen/R600/bitcast.ll
+++ b/test/CodeGen/R600/bitcast.ll

@@ -4,8 +4,8 @@
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
-; FUNC-LABEL: @v32i8_to_v8i32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v32i8_to_v8i32:
+; SI: s_endpgm
 define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
 entry:
   %1 = load <32 x i8> addrspace(2)* %0
@@ -17,8 +17,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i8ptr_v16i8ptr
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
+; SI: s_endpgm
 define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
@@ -55,8 +55,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bitcast_v2i32_to_f64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
+; SI: s_endpgm
 define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %val = load <2 x i32> addrspace(1)* %in, align 8
   %add = add <2 x i32> %val, <i32 4, i32 9>
@@ -65,8 +65,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bitcast_f64_to_v2i32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
+; SI: s_endpgm
 define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
   %val = load double addrspace(1)* %in, align 8
   %add = fadd double %val, 4.0

diff --git a/test/CodeGen/R600/bswap.ll b/test/CodeGen/R600/bswap.ll
index 6aebe85..1c5a0c6 100644
--- a/test/CodeGen/R600/bswap.ll
+++ b/test/CodeGen/R600/bswap.ll

@@ -1,12 +1,21 @@
-; RUN: llc -march=r600 -mcpu=SI < %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.bswap.i32(i32) nounwind readnone
 declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
 declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>) nounwind readnone
 declare i64 @llvm.bswap.i64(i64) nounwind readnone
 declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) nounwind readnone
 declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
 
+; FUNC-LABEL: @test_bswap_i32
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8
+; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24
+; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff
+; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
 define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
@@ -14,6 +23,14 @@
   ret void
 }
 
+; FUNC-LABEL: @test_bswap_v2i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI: s_endpgm
 define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
   %val = load <2 x i32> addrspace(1)* %in, align 8
   %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
@@ -21,6 +38,20 @@
   ret void
 }
 
+; FUNC-LABEL: @test_bswap_v4i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI: s_endpgm
 define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
   %val = load <4 x i32> addrspace(1)* %in, align 16
   %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
@@ -28,6 +59,39 @@
   ret void
 }
 
+; FUNC-LABEL: @test_bswap_v8i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_bfi_b32
+; SI: s_endpgm
+define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
+  %val = load <8 x i32> addrspace(1)* %in, align 32
+  %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
+  store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
 define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %val = load i64 addrspace(1)* %in, align 8
   %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone

diff --git a/test/CodeGen/R600/build_vector.ll b/test/CodeGen/R600/build_vector.ll
index 8179de1..9137eee 100644
--- a/test/CodeGen/R600/build_vector.ll
+++ b/test/CodeGen/R600/build_vector.ll

@@ -1,32 +1,32 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK: @build_vector2
+; R600-CHECK: {{^}}build_vector2:
 ; R600-CHECK: MOV
 ; R600-CHECK: MOV
 ; R600-CHECK-NOT: MOV
-; SI-CHECK: @build_vector2
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[X:[0-9]]], 5
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[Y:[0-9]]], 6
-; SI-CHECK: BUFFER_STORE_DWORDX2 v{{\[}}[[X]]:[[Y]]{{\]}}
+; SI-CHECK: {{^}}build_vector2:
+; SI-CHECK-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
+; SI-CHECK-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
+; SI-CHECK: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
 define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
 entry:
   store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: @build_vector4
+; R600-CHECK: {{^}}build_vector4:
 ; R600-CHECK: MOV
 ; R600-CHECK: MOV
 ; R600-CHECK: MOV
 ; R600-CHECK: MOV
 ; R600-CHECK-NOT: MOV
-; SI-CHECK: @build_vector4
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[X:[0-9]]], 5
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[Y:[0-9]]], 6
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[Z:[0-9]]], 7
-; SI-CHECK-DAG: V_MOV_B32_e32 v[[W:[0-9]]], 8
-; SI-CHECK: BUFFER_STORE_DWORDX4 v{{\[}}[[X]]:[[W]]{{\]}}
+; SI-CHECK: {{^}}build_vector4:
+; SI-CHECK-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
+; SI-CHECK-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
+; SI-CHECK-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
+; SI-CHECK-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
+; SI-CHECK: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
 define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
 entry:
   store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out

diff --git a/test/CodeGen/R600/call.ll b/test/CodeGen/R600/call.ll
index d803474..1448f04 100644
--- a/test/CodeGen/R600/call.ll
+++ b/test/CodeGen/R600/call.ll

@@ -1,7 +1,7 @@
 ; RUN: not llc -march=r600 -mcpu=SI -verify-machineinstrs< %s 2>&1 | FileCheck %s
 ; RUN: not llc -march=r600 -mcpu=cypress < %s 2>&1 | FileCheck %s
 
-; CHECK: error: unsupported call to function defined_function in test_call
+; CHECK: error: unsupported call to function external_function in test_call_external
 
 
 declare i32 @external_function(i32) nounwind

diff --git a/test/CodeGen/R600/call_fs.ll b/test/CodeGen/R600/call_fs.ll
index f7c4e5b..7df2240 100644
--- a/test/CodeGen/R600/call_fs.ll
+++ b/test/CodeGen/R600/call_fs.ll

@@ -2,10 +2,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood -show-mc-encoding -o - | FileCheck --check-prefix=EG-CHECK %s
 ; RUN: llc < %s -march=r600 -mcpu=rv710 -show-mc-encoding -o - | FileCheck --check-prefix=R600-CHECK %s
 
-; EG-CHECK: @call_fs
+; EG-CHECK: {{^}}call_fs:
 ; EG-CHECK: .long 257
 ; EG-CHECK: CALL_FS  ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0xc0,0x84]
-; R600-CHECK: @call_fs
+; R600-CHECK: {{^}}call_fs:
 ; R600-CHECK: .long 257
 ; R600-CHECK:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89]
 

diff --git a/test/CodeGen/R600/cayman-loop-bug.ll b/test/CodeGen/R600/cayman-loop-bug.ll
index a873528..c7b8c40 100644
--- a/test/CodeGen/R600/cayman-loop-bug.ll
+++ b/test/CodeGen/R600/cayman-loop-bug.ll

@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
 
-; CHECK-LABEL: @main
+; CHECK-LABEL: {{^}}main:
 ; CHECK: LOOP_START_DX10
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK: LOOP_START_DX10

diff --git a/test/CodeGen/R600/cf-stack-bug.ll b/test/CodeGen/R600/cf-stack-bug.ll
index c3a4612..02c87d7 100644
--- a/test/CodeGen/R600/cf-stack-bug.ll
+++ b/test/CodeGen/R600/cf-stack-bug.ll

@@ -17,7 +17,7 @@
 ; BUG64-NOT: Applying bug work-around
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
-; FUNC-LABEL: @nested3
+; FUNC-LABEL: {{^}}nested3:
 define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
@@ -50,7 +50,7 @@
 ; BUG64: Applying bug work-around
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
-; FUNC-LABEL: @nested4
+; FUNC-LABEL: {{^}}nested4:
 define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
@@ -91,7 +91,7 @@
 ; BUG64: Applying bug work-around
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
-; FUNC-LABEL: @nested7
+; FUNC-LABEL: {{^}}nested7:
 define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
@@ -156,7 +156,7 @@
 ; BUG64: Applying bug work-around
 ; BUG32: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
-; FUNC-LABEL: @nested8
+; FUNC-LABEL: {{^}}nested8:
 define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0

diff --git a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
index f8b4a61..b42b904 100644
--- a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll

@@ -1,14 +1,15 @@
-; RUN: opt -codegenprepare -S -o - %s | FileCheck --check-prefix=OPT --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-LLC --check-prefix=FUNC %s
+; RUN: opt -codegenprepare -S -o - %s | FileCheck --check-prefix=OPT %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-LLC %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 target triple = "r600--"
 
-; FUNC-LABEL: @test
+; OPT-LABEL: @test
 ; OPT: mul nsw i32
 ; OPT-NEXT: sext
-; SI-LLC: V_MUL_LO_I32
-; SI-LLC-NOT: V_MUL_HI
+; SI-LLC-LABEL: {{^}}test:
+; SI-LLC: s_mul_i32
+; SI-LLC-NOT: mul
 define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
 entry:
   %0 = mul nsw i32 %a, 3

diff --git a/test/CodeGen/R600/combine_vloads.ll b/test/CodeGen/R600/combine_vloads.ll
index f8ec712..38420b2 100644
--- a/test/CodeGen/R600/combine_vloads.ll
+++ b/test/CodeGen/R600/combine_vloads.ll

@@ -9,7 +9,7 @@
 
 
 ; 128-bit loads instead of many 8-bit
-; EG-LABEL: @combine_vloads:
+; EG-LABEL: {{^}}combine_vloads:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {

diff --git a/test/CodeGen/R600/commute_modifiers.ll b/test/CodeGen/R600/commute_modifiers.ll
new file mode 100644
index 0000000..30c8067
--- /dev/null
+++ b/test/CodeGen/R600/commute_modifiers.ll

@@ -0,0 +1,181 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+
+; FUNC-LABEL: @commute_add_imm_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %x = load float addrspace(1)* %gep.0
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %z = fadd float 2.0, %x.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %x = load float addrspace(1)* %gep.0
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %x.fneg.fabs = fsub float -0.000000e+00, %x.fabs
+  %z = fmul float 4.0, %x.fneg.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_imm_fneg_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %x = load float addrspace(1)* %gep.0
+  %x.fneg = fsub float -0.000000e+00, %x
+  %z = fmul float 4.0, %x.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should use SGPR for literal.
+; FUNC-LABEL: @commute_add_lit_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %x = load float addrspace(1)* %gep.0
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %z = fadd float 1024.0, %x.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_add_fabs_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %z = fadd float %x, %y.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %y.fneg = fsub float -0.000000e+00, %y
+  %z = fmul float %x, %y.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+  %z = fmul float %x, %y.fabs.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; There's no reason to commute this.
+; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %z = fmul float %x.fabs, %y.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+  %z = fmul float %x.fabs, %y.fabs.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; Make sure we commute the multiply part for the constant in src0 even
+; though we have a source modifier (fabs) on src2.
+
+; SI-LABEL: {{^}}fma_a_2.0_neg_b_f32
+; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], |[[R2]]|
+; SI: buffer_store_dword [[RESULT]]
+define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r2.fabs = call float @llvm.fabs.f32(float %r2)
+
+  %r3 = tail call float @llvm.fma.f32(float %r1, float 2.0, float %r2.fabs)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/complex-folding.ll b/test/CodeGen/R600/complex-folding.ll
index 99f0d99..a5399a7 100644
--- a/test/CodeGen/R600/complex-folding.ll
+++ b/test/CodeGen/R600/complex-folding.ll

@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @main
+; CHECK: {{^}}main:
 ; CHECK-NOT: MOV
 define void @main(<4 x float> inreg %reg0) #0 {
 entry:

diff --git a/test/CodeGen/R600/concat_vectors.ll b/test/CodeGen/R600/concat_vectors.ll
new file mode 100644
index 0000000..19992eb
--- /dev/null
+++ b/test/CodeGen/R600/concat_vectors.ll

@@ -0,0 +1,284 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_concat_v1i32:
+; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF
+; instructions that access scratch memory.  Bit 23, which is the add_tid_enable
+; bit, is only set for scratch access, so we can check for the absence of this
+; value (moved into any SGPR) to ensure scratch memory is not being used.
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+  %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i32:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+  %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i32:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+  %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i32:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+  %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i32:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
+  %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1f32:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
+  %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2f32:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+  %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4f32:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+  %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8f32:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+  %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16f32:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+  %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1i64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i64(<2 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind {
+  %concat = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x i64> %concat, <2 x i64> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i64(<4 x i64> addrspace(1)* %out, <2 x i64> %a, <2 x i64> %b) nounwind {
+  %concat = shufflevector <2 x i64> %a, <2 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i64> %concat, <4 x i64> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i64(<8 x i64> addrspace(1)* %out, <4 x i64> %a, <4 x i64> %b) nounwind {
+  %concat = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %concat, <8 x i64> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i64(<16 x i64> addrspace(1)* %out, <8 x i64> %a, <8 x i64> %b) nounwind {
+  %concat = shufflevector <8 x i64> %a, <8 x i64> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i64> %concat, <16 x i64> addrspace(1)* %out, align 128
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i64(<32 x i64> addrspace(1)* %out, <16 x i64> %a, <16 x i64> %b) nounwind {
+  %concat = shufflevector <16 x i64> %a, <16 x i64> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x i64> %concat, <32 x i64> addrspace(1)* %out, align 256
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1f64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+  %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2f64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+  %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4f64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+  %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8f64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+  %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16f64:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+  %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1i1:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
+  %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i1:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
+  %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i1:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
+  %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i1:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
+  %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i1:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
+  %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v32i1:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
+  %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v1i16:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
+  %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
+  store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v2i16:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
+  %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v4i16:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+  %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v8i16:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+  %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_concat_v16i16:
+; SI-NOT: s_mov_b32 s{{[0-9]+}}, 0x80f000
+; SI-NOT: movrel
+define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
+  %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
+  ret void
+}

diff --git a/test/CodeGen/R600/copy-illegal-type.ll b/test/CodeGen/R600/copy-illegal-type.ll
new file mode 100644
index 0000000..66ea88e
--- /dev/null
+++ b/test/CodeGen/R600/copy-illegal-type.ll

@@ -0,0 +1,166 @@
+; RUN: llc -march=r600 -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}test_copy_v4i8:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %packed = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+  %packed = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+  %packed = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out2, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
+; SI: buffer_load_dword [[REG:v[0-9]+]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: buffer_store_dword [[REG]]
+; SI: s_endpgm
+define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+  %packed = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out2, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out3, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+
+; Disabled (XSI) checks: expected output once scalarizing v4i8 loads is fixed.
+; XSI: buffer_load_dword
+; XSI: v_bfe
+; XSI: v_add
+; XSI: v_add
+; XSI: v_add
+; XSI: buffer_store_dword
+; XSI: buffer_store_dword
+
+; SI: s_endpgm
+define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: v_add
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_byte
+
+; XSI: buffer_load_dword
+; XSI: bfe
+; XSI: buffer_store_dword
+; XSI: v_add
+; XSI: buffer_store_dword
+; XSI-NEXT: buffer_store_dword
+
+; SI: s_endpgm
+define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v3i8:
+; SI-NOT: bfe
+; SI-NOT: bfi
+; SI: s_endpgm
+define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+  %triple = load <3 x i8> addrspace(1)* %in, align 4
+  store <3 x i8> %triple, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: s_endpgm
+define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %packed = load volatile <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %packed, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: s_endpgm
+define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %packed = load <4 x i8> addrspace(1)* %in, align 4
+  store volatile <4 x i8> %packed, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/copy-to-reg.ll b/test/CodeGen/R600/copy-to-reg.ll
new file mode 100644
index 0000000..f90ee78
--- /dev/null
+++ b/test/CodeGen/R600/copy-to-reg.ll

@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck %s
+
+; Test that CopyToReg instructions don't have non-register operands prior
+; to being emitted.
+
+; Make sure this doesn't crash and that the kernel is actually emitted.
+; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
+define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+  %alloca = alloca [16 x i32]
+  br label %loop
+
+loop:
+  %inc = phi i32 [0, %entry], [%inc.i, %loop]
+  %ptr = getelementptr [16 x i32]* %alloca, i32 0, i32 %inc
+  store i32 %inc, i32* %ptr
+  %inc.i = add i32 %inc, 1
+  %cnd = icmp uge i32 %inc.i, 16
+  br i1 %cnd, label %done, label %loop
+
+done:
+  %tmp0 = getelementptr [16 x i32]* %alloca, i32 0, i32 0
+  %tmp1 = load i32* %tmp0
+  store i32 %tmp1, i32 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/ctlz_zero_undef.ll b/test/CodeGen/R600/ctlz_zero_undef.ll
index 15b5188..f699127 100644
--- a/test/CodeGen/R600/ctlz_zero_undef.ll
+++ b/test/CodeGen/R600/ctlz_zero_undef.ll

@@ -1,26 +1,31 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
 declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
 
-; FUNC-LABEL: @s_ctlz_zero_undef_i32:
-; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
-; SI: S_FLBIT_I32_B32 [[SRESULT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_flbit_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @v_ctlz_zero_undef_i32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_FFBH_U32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@@ -28,12 +33,15 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctlz_zero_undef_v2i32:
-; SI: BUFFER_LOAD_DWORDX2
-; SI: V_FFBH_U32_e32
-; SI: V_FFBH_U32_e32
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32:
+; SI: buffer_load_dwordx2
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
@@ -41,14 +49,19 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctlz_zero_undef_v4i32:
-; SI: BUFFER_LOAD_DWORDX4
-; SI: V_FFBH_U32_e32
-; SI: V_FFBH_U32_e32
-; SI: V_FFBH_U32_e32
-; SI: V_FFBH_U32_e32
-; SI: BUFFER_STORE_DWORDX4
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32:
+; SI: buffer_load_dwordx4
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: v_ffbh_u32_e32
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
+; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone

diff --git a/test/CodeGen/R600/ctpop.ll b/test/CodeGen/R600/ctpop.ll
index 15be8e1..5cfdaef 100644
--- a/test/CodeGen/R600/ctpop.ll
+++ b/test/CodeGen/R600/ctpop.ll

@@ -7,12 +7,12 @@
 declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
 
-; FUNC-LABEL: @s_ctpop_i32:
-; SI: S_LOAD_DWORD [[SVAL:s[0-9]+]],
-; SI: S_BCNT1_I32_B32 [[SRESULT:s[0-9]+]], [[SVAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctpop_i32:
+; SI: s_load_dword [[SVAL:s[0-9]+]],
+; SI: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
@@ -22,12 +22,12 @@
 }
 
 ; XXX - Why 0 in register?
-; FUNC-LABEL: @v_ctpop_i32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
-; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VZERO]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VZERO]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -37,15 +37,14 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_add_chain_i32
-; SI: BUFFER_LOAD_DWORD [[VAL0:v[0-9]+]],
-; SI: BUFFER_LOAD_DWORD [[VAL1:v[0-9]+]],
-; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
-; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
-; SI-NOT: ADD
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
+; SI: buffer_load_dword [[VAL0:v[0-9]+]],
+; SI: buffer_load_dword [[VAL1:v[0-9]+]],
+; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
+; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], [[VAL1]], [[VZERO]]
+; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -59,10 +58,24 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v2i32:
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
+; SI: buffer_load_dword [[VAL0:v[0-9]+]],
+; SI-NEXT: s_waitcnt
+; SI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
+; SI-NEXT: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
+  %loaded = load i32 addrspace(1)* %in0, align 4
+  %popcnt = call i32 @llvm.ctpop.i32(i32 %loaded) nounwind readnone
+  %sum = add i32 %popcnt, %sval
+  store i32 %sum, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_ctpop_v2i32:
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -73,12 +86,12 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v4i32:
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v4i32:
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -91,16 +104,16 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v8i32:
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v8i32:
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -117,24 +130,24 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v16i32:
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: V_BCNT_U32_B32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v16i32:
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: v_bcnt_u32_b32_e32
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
@@ -159,11 +172,11 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_inline_constant:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -174,11 +187,11 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_inline_constant_inv:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
@@ -189,12 +202,12 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_literal:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_MOV_B32_e32 [[LIT:v[0-9]+]], 0x1869f
-; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
@@ -203,12 +216,12 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_var:
-; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
+; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
@@ -219,12 +232,12 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_var_inv:
-; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI-DAG: S_LOAD_DWORD [[VAR:s[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: s_load_dword [[VAR:s[0-9]+]],
+; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
@@ -235,12 +248,12 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i32_add_vvar_inv
-; SI-DAG: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]], {{.*}} + 0x0
-; SI-DAG: BUFFER_LOAD_DWORD [[VAR:v[0-9]+]], {{.*}} + 0x10
-; SI: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], {{0$}}
+; SI-DAG: buffer_load_dword [[VAR:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:0x10
+; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BCNT_INT
 define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
@@ -256,29 +269,29 @@
 ; FIXME: We currently disallow SALU instructions in all branches,
 ; but there are some cases when the should be allowed.
 
-; FUNC-LABEL: @ctpop_i32_in_br
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_BCNT_U32_B32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}ctpop_i32_in_br:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
+; SI: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 ; EG: BCNT_INT
-define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
 entry:
-  %0 = icmp eq i32 %cond, 0
-  br i1 %0, label %if, label %else
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %if, label %else
 
 if:
-  %1 = load i32 addrspace(1)* %in
-  %2 = call i32 @llvm.ctpop.i32(i32 %1)
+  %tmp2 = call i32 @llvm.ctpop.i32(i32 %ctpop_arg)
   br label %endif
 
 else:
-  %3 = getelementptr i32 addrspace(1)* %in, i32 1
-  %4 = load i32 addrspace(1)* %3
+  %tmp3 = getelementptr i32 addrspace(1)* %in, i32 1
+  %tmp4 = load i32 addrspace(1)* %tmp3
   br label %endif
 
 endif:
-  %5 = phi i32 [%2, %if], [%4, %else]
-  store i32 %5, i32 addrspace(1)* %out
+  %tmp5 = phi i32 [%tmp2, %if], [%tmp4, %else]
+  store i32 %tmp5, i32 addrspace(1)* %out
   ret void
 }

diff --git a/test/CodeGen/R600/ctpop64.ll b/test/CodeGen/R600/ctpop64.ll
index b36ecc6..2efac8f 100644
--- a/test/CodeGen/R600/ctpop64.ll
+++ b/test/CodeGen/R600/ctpop64.ll

@@ -6,12 +6,12 @@
 declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
 declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
 
-; FUNC-LABEL: @s_ctpop_i64:
-; SI: S_LOAD_DWORDX2 [[SVAL:s\[[0-9]+:[0-9]+\]]],
-; SI: S_BCNT1_I32_B64 [[SRESULT:s[0-9]+]], [[SVAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctpop_i64:
+; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
 define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
@@ -19,13 +19,13 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_i64:
-; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
-; SI: V_MOV_B32_e32 [[VZERO:v[0-9]+]], 0
-; SI: V_BCNT_U32_B32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]]
-; SI-NEXT: V_BCNT_U32_B32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_i64:
+; SI: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
+; SI: v_mov_b32_e32 [[VZERO:v[0-9]+]], 0
+; SI: v_bcnt_u32_b32_e32 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], [[VZERO]]
+; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %val = load i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
@@ -34,10 +34,10 @@
   ret void
 }
 
-; FUNC-LABEL: @s_ctpop_v2i64:
-; SI: S_BCNT1_I32_B64
-; SI: S_BCNT1_I32_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctpop_v2i64:
+; SI: s_bcnt1_i32_b64
+; SI: s_bcnt1_i32_b64
+; SI: s_endpgm
 define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -45,12 +45,12 @@
   ret void
 }
 
-; FUNC-LABEL: @s_ctpop_v4i64:
-; SI: S_BCNT1_I32_B64
-; SI: S_BCNT1_I32_B64
-; SI: S_BCNT1_I32_B64
-; SI: S_BCNT1_I32_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_ctpop_v4i64:
+; SI: s_bcnt1_i32_b64
+; SI: s_bcnt1_i32_b64
+; SI: s_bcnt1_i32_b64
+; SI: s_bcnt1_i32_b64
+; SI: s_endpgm
 define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -58,12 +58,12 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v2i64:
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v2i64:
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: s_endpgm
 define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <2 x i64> addrspace(1)* %in, align 16
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
@@ -72,16 +72,16 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ctpop_v4i64:
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: V_BCNT_U32_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_ctpop_v4i64:
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: v_bcnt_u32_b32
+; SI: s_endpgm
 define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <4 x i64> addrspace(1)* %in, align 32
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
@@ -93,30 +93,29 @@
 ; FIXME: We currently disallow SALU instructions in all branches,
 ; but there are some cases when the should be allowed.
 
-; FUNC-LABEL: @ctpop_i64_in_br
-; SI: V_BCNT_U32_B32_e64 [[BCNT_LO:v[0-9]+]], v{{[0-9]+}}, 0
-; SI: V_BCNT_U32_B32_e32 v[[BCNT:[0-9]+]], v{{[0-9]+}}, [[BCNT_LO]]
-; SI: V_MOV_B32_e32 v[[ZERO:[0-9]+]], 0
-; SI: BUFFER_STORE_DWORDX2 v[
-; SI: [[BCNT]]:[[ZERO]]]
-; SI: S_ENDPGM
-define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i32 %cond) {
+; FUNC-LABEL: {{^}}ctpop_i64_in_br:
+; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
+; SI: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
+; SI: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
+; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
+; SI: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
+; SI: s_endpgm
+define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
 entry:
-  %0 = icmp eq i32 %cond, 0
-  br i1 %0, label %if, label %else
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %if, label %else
 
 if:
-  %1 = load i64 addrspace(1)* %in
-  %2 = call i64 @llvm.ctpop.i64(i64 %1)
+  %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg)
   br label %endif
 
 else:
-  %3 = getelementptr i64 addrspace(1)* %in, i32 1
-  %4 = load i64 addrspace(1)* %3
+  %tmp3 = getelementptr i64 addrspace(1)* %in, i32 1
+  %tmp4 = load i64 addrspace(1)* %tmp3
   br label %endif
 
 endif:
-  %5 = phi i64 [%2, %if], [%4, %else]
-  store i64 %5, i64 addrspace(1)* %out
+  %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else]
+  store i64 %tmp5, i64 addrspace(1)* %out
   ret void
 }

diff --git a/test/CodeGen/R600/cttz_zero_undef.ll b/test/CodeGen/R600/cttz_zero_undef.ll
index cf44f8e..c4b1463 100644
--- a/test/CodeGen/R600/cttz_zero_undef.ll
+++ b/test/CodeGen/R600/cttz_zero_undef.ll

@@ -1,26 +1,31 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
 declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
 
-; FUNC-LABEL: @s_cttz_zero_undef_i32:
-; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
-; SI: S_FF1_I32_B32 [[SRESULT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @v_cttz_zero_undef_i32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_FFBL_B32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32 addrspace(1)* %valptr, align 4
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
@@ -28,12 +33,15 @@
   ret void
 }
 
-; FUNC-LABEL: @v_cttz_zero_undef_v2i32:
-; SI: BUFFER_LOAD_DWORDX2
-; SI: V_FFBL_B32_e32
-; SI: V_FFBL_B32_e32
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
+; SI: buffer_load_dwordx2
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32> addrspace(1)* %valptr, align 8
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
@@ -41,14 +49,19 @@
   ret void
 }
 
-; FUNC-LABEL: @v_cttz_zero_undef_v4i32:
-; SI: BUFFER_LOAD_DWORDX4
-; SI: V_FFBL_B32_e32
-; SI: V_FFBL_B32_e32
-; SI: V_FFBL_B32_e32
-; SI: V_FFBL_B32_e32
-; SI: BUFFER_STORE_DWORDX4
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
+; SI: buffer_load_dwordx4
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: v_ffbl_b32_e32
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
+; EG: FFBL_INT {{\*? *}}[[RESULT]]
 define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32> addrspace(1)* %valptr, align 16
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone

diff --git a/test/CodeGen/R600/cvt_f32_ubyte.ll b/test/CodeGen/R600/cvt_f32_ubyte.ll
index fe97a44..0d1db19 100644
--- a/test/CodeGen/R600/cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/cvt_f32_ubyte.ll

@@ -1,11 +1,11 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @load_i8_to_f32:
-; SI: BUFFER_LOAD_UBYTE [[LOADREG:v[0-9]+]],
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI: V_CVT_F32_UBYTE0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
-; SI: BUFFER_STORE_DWORD [[CONV]],
+; SI-LABEL: {{^}}load_i8_to_f32:
+; SI: buffer_load_ubyte [[LOADREG:v[0-9]+]],
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
+; SI: buffer_store_dword [[CONV]],
 define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
   %load = load i8 addrspace(1)* %in, align 1
   %cvt = uitofp i8 %load to float
@@ -13,14 +13,14 @@
   ret void
 }
 
-; SI-LABEL: @load_v2i8_to_v2f32:
-; SI: BUFFER_LOAD_USHORT [[LOADREG:v[0-9]+]],
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI-NOT: AND
-; SI-DAG: V_CVT_F32_UBYTE1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; SI-DAG: V_CVT_F32_UBYTE0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
-; SI: BUFFER_STORE_DWORDX2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI-LABEL: {{^}}load_v2i8_to_v2f32:
+; SI: buffer_load_ushort [[LOADREG:v[0-9]+]],
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI-NOT: and
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <2 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <2 x i8> %load to <2 x float>
@@ -28,13 +28,13 @@
   ret void
 }
 
-; SI-LABEL: @load_v3i8_to_v3f32:
-; SI-NOT: BFE
-; SI-NOT: V_CVT_F32_UBYTE3_e32
-; SI-DAG: V_CVT_F32_UBYTE2_e32
-; SI-DAG: V_CVT_F32_UBYTE1_e32
-; SI-DAG: V_CVT_F32_UBYTE0_e32
-; SI: BUFFER_STORE_DWORDX2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI-LABEL: {{^}}load_v3i8_to_v3f32:
+; SI-NOT: bfe
+; SI-NOT: v_cvt_f32_ubyte3_e32
+; SI-DAG: v_cvt_f32_ubyte2_e32
+; SI-DAG: v_cvt_f32_ubyte1_e32
+; SI-DAG: v_cvt_f32_ubyte0_e32
+; SI: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}},
 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <3 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <3 x i8> %load to <3 x float>
@@ -42,15 +42,19 @@
   ret void
 }
 
-; SI-LABEL: @load_v4i8_to_v4f32:
-; SI: BUFFER_LOAD_DWORD [[LOADREG:v[0-9]+]],
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI-DAG: V_CVT_F32_UBYTE3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; SI-DAG: V_CVT_F32_UBYTE2_e32 v{{[0-9]+}}, [[LOADREG]]
-; SI-DAG: V_CVT_F32_UBYTE1_e32 v{{[0-9]+}}, [[LOADREG]]
-; SI-DAG: V_CVT_F32_UBYTE0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
-; SI: BUFFER_STORE_DWORDX4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI-LABEL: {{^}}load_v4i8_to_v4f32:
+; We can't use buffer_load_dword here, because the load is byte aligned, and
+; buffer_load_dword requires dword alignment.
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: v_or_b32_e32 [[LOADREG:v[0-9]+]]
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
@@ -58,27 +62,27 @@
   ret void
 }
 
-; XXX - This should really still be able to use the V_CVT_F32_UBYTE0
+; XXX - This should really still be able to use the v_cvt_f32_ubyte0
 ; for each component, but computeKnownBits doesn't handle vectors very
 ; well.
 
-; SI-LABEL: @load_v4i8_to_v4f32_2_uses:
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_CVT_F32_UBYTE0_e32
+; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses:
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: v_cvt_f32_ubyte0_e32
+; SI: v_cvt_f32_ubyte0_e32
+; SI: v_cvt_f32_ubyte0_e32
+; SI: v_cvt_f32_ubyte0_e32
 
 ; XXX - replace with this when v4i8 loads aren't scalarized anymore.
-; XSI: BUFFER_LOAD_DWORD
-; XSI: V_CVT_F32_U32_e32
-; XSI: V_CVT_F32_U32_e32
-; XSI: V_CVT_F32_U32_e32
-; XSI: V_CVT_F32_U32_e32
-; SI: S_ENDPGM
+; XSI: buffer_load_dword
+; XSI: v_cvt_f32_u32_e32
+; XSI: v_cvt_f32_u32_e32
+; XSI: v_cvt_f32_u32_e32
+; XSI: v_cvt_f32_u32_e32
+; SI: s_endpgm
 define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <4 x i8> %load to <4 x float>
@@ -89,8 +93,8 @@
 }
 
 ; Make sure this doesn't crash.
-; SI-LABEL: @load_v7i8_to_v7f32:
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}load_v7i8_to_v7f32:
+; SI: s_endpgm
 define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <7 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <7 x i8> %load to <7 x float>
@@ -98,28 +102,28 @@
   ret void
 }
 
-; SI-LABEL: @load_v8i8_to_v8f32:
-; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI-DAG: V_CVT_F32_UBYTE3_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; SI-DAG: V_CVT_F32_UBYTE2_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; SI-DAG: V_CVT_F32_UBYTE1_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; SI-DAG: V_CVT_F32_UBYTE0_e32 v{{[0-9]+}}, v[[LOLOAD]]
-; SI-DAG: V_CVT_F32_UBYTE3_e32 v{{[0-9]+}}, v[[HILOAD]]
-; SI-DAG: V_CVT_F32_UBYTE2_e32 v{{[0-9]+}}, v[[HILOAD]]
-; SI-DAG: V_CVT_F32_UBYTE1_e32 v{{[0-9]+}}, v[[HILOAD]]
-; SI-DAG: V_CVT_F32_UBYTE0_e32 v{{[0-9]+}}, v[[HILOAD]]
-; SI-NOT: BFE
-; SI-NOT: LSHR
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
+; SI-LABEL: {{^}}load_v8i8_to_v8f32:
+; SI: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[LOLOAD]]
+; SI-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
+; SI-NOT: bfe
+; SI-NOT: lshr
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <8 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <8 x i8> %load to <8 x float>
@@ -127,11 +131,11 @@
   ret void
 }
 
-; SI-LABEL: @i8_zext_inreg_i32_to_f32:
-; SI: BUFFER_LOAD_DWORD [[LOADREG:v[0-9]+]],
-; SI: V_ADD_I32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]]
-; SI-NEXT: V_CVT_F32_UBYTE0_e32 [[CONV:v[0-9]+]], [[ADD]]
-; SI: BUFFER_STORE_DWORD [[CONV]],
+; SI-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
+; SI: buffer_load_dword [[LOADREG:v[0-9]+]],
+; SI: v_add_i32_e32 [[ADD:v[0-9]+]], 2, [[LOADREG]]
+; SI-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
+; SI: buffer_store_dword [[CONV]],
 define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 2
@@ -141,7 +145,7 @@
   ret void
 }
 
-; SI-LABEL: @i8_zext_inreg_hi1_to_f32:
+; SI-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
 define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %inreg = and i32 %load, 65280

diff --git a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
index 6607c12..1e47bfa 100644
--- a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll
+++ b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll

@@ -7,7 +7,7 @@
 ; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes.
 
 
-; CHECK: @sint
+; CHECK: {{^}}sint:
 ; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -21,7 +21,7 @@
   ret void
 }
 
-;CHECK: @uint
+;CHECK: {{^}}uint:
 ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {

diff --git a/test/CodeGen/R600/default-fp-mode.ll b/test/CodeGen/R600/default-fp-mode.ll
index 214b2c2..935bf97 100644
--- a/test/CodeGen/R600/default-fp-mode.ll
+++ b/test/CodeGen/R600/default-fp-mode.ll

@@ -1,8 +1,27 @@
-; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=FP64-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=FP32-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals,+fp64-denormals < %s | FileCheck -check-prefix=BOTH-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals,-fp64-denormals < %s | FileCheck -check-prefix=NO-DENORMAL -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=+fp64-denormals < %s | FileCheck -check-prefix=DEFAULT -check-prefix=FUNC %s
 
-; SI-LABEL: @test_kernel
-; SI: FloatMode: 240
-; SI: IeeeMode: 0
+; FUNC-LABEL: {{^}}test_kernel:
+
+; DEFAULT: FloatMode: 192
+; DEFAULT: IeeeMode: 0
+
+; FP64-DENORMAL: FloatMode: 192
+; FP64-DENORMAL: IeeeMode: 0
+
+; FP32-DENORMAL: FloatMode: 48
+; FP32-DENORMAL: IeeeMode: 0
+
+; BOTH-DENORMAL: FloatMode: 240
+; BOTH-DENORMAL: IeeeMode: 0
+
+; NO-DENORMAL: FloatMode: 0
+; NO-DENORMAL: IeeeMode: 0
 define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1

diff --git a/test/CodeGen/R600/disconnected-predset-break-bug.ll b/test/CodeGen/R600/disconnected-predset-break-bug.ll
index 012c17b..858e4b9 100644
--- a/test/CodeGen/R600/disconnected-predset-break-bug.ll
+++ b/test/CodeGen/R600/disconnected-predset-break-bug.ll

@@ -4,7 +4,7 @@
 ; result.  This tests that there are no instructions between the PRED_SET*
 ; and the PREDICATE_BREAK in this loop.
 
-; CHECK: @loop_ge
+; CHECK: {{^}}loop_ge:
 ; CHECK: LOOP_START_DX10
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK-NEXT: JUMP

diff --git a/test/CodeGen/R600/dot4-folding.ll b/test/CodeGen/R600/dot4-folding.ll
index 3e8330f..dca6a59 100644
--- a/test/CodeGen/R600/dot4-folding.ll
+++ b/test/CodeGen/R600/dot4-folding.ll

@@ -2,7 +2,7 @@
 
 ; Exactly one constant vector can be folded into dot4, which means exactly
 ; 4 MOV instructions
-; CHECK: @main
+; CHECK: {{^}}main:
 ; CHECK: MOV
 ; CHECK: MOV
 ; CHECK: MOV

diff --git a/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
new file mode 100644
index 0000000..f334062
--- /dev/null
+++ b/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll

@@ -0,0 +1,69 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local() #1
+
+; Function Attrs: nounwind
+; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
+; CHECK: BB0_1:
+; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], 4, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], 0x80, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x84:v[0-9]+]], 0x84, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x84]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], 0x100, [[VADDR]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
+
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:0 offset1:1
+; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:33
+; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
+; CHECK: s_endpgm
+define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
+entry:
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #0
+  %mul = shl nsw i32 %x.i, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ]
+  %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ]
+  %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  tail call void @llvm.AMDGPU.barrier.local() #1
+  %arrayidx = getelementptr inbounds float addrspace(3)* %lptr, i32 %offset.02
+  %tmp = load float addrspace(3)* %arrayidx, align 4
+  %add1 = add nsw i32 %offset.02, 1
+  %arrayidx2 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add1
+  %tmp1 = load float addrspace(3)* %arrayidx2, align 4
+  %add3 = add nsw i32 %offset.02, 32
+  %arrayidx4 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add3
+  %tmp2 = load float addrspace(3)* %arrayidx4, align 4
+  %add5 = add nsw i32 %offset.02, 33
+  %arrayidx6 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add5
+  %tmp3 = load float addrspace(3)* %arrayidx6, align 4
+  %add7 = add nsw i32 %offset.02, 64
+  %arrayidx8 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add7
+  %tmp4 = load float addrspace(3)* %arrayidx8, align 4
+  %add9 = fadd float %tmp, %tmp1
+  %add10 = fadd float %add9, %tmp2
+  %add11 = fadd float %add10, %tmp3
+  %add12 = fadd float %add11, %tmp4
+  %add13 = fadd float %sum.03, %add12
+  %inc = add nsw i32 %k.01, 1
+  %add14 = add nsw i32 %offset.02, 97
+  %exitcond = icmp eq i32 %inc, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %tmp5 = sext i32 %x.i to i64
+  %arrayidx15 = getelementptr inbounds float addrspace(1)* %out, i64 %tmp5
+  store float %add13, float addrspace(1)* %arrayidx15, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { noduplicate nounwind }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/R600/ds_read2.ll b/test/CodeGen/R600/ds_read2.ll
new file mode 100644
index 0000000..6e0c8be
--- /dev/null
+++ b/test/CodeGen/R600/ds_read2.ll

@@ -0,0 +1,515 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+; FIXME: We don't get cases where the address was an SGPR because we
+; get a copy to the address register for each one.
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
+
+; SI-LABEL: @simple_read2_f32
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:8
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2_f32(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_max_offset
+; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:255
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 255
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_too_far
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
+; SI: s_endpgm
+define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 257
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_x2
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:0 offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 0
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum.0 = fadd float %val0, %val1
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %val2 = load float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %val3 = load float addrspace(3)* %arrayidx3, align 4
+  %sum.1 = fadd float %val2, %val3
+
+  %sum = fadd float %sum.0, %sum.1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %idx.0
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Make sure there is an instruction between the two sets of reads.
+; SI-LABEL: @simple_read2_f32_x2_barrier
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:0 offset1:8
+; SI: s_barrier
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 0
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum.0 = fadd float %val0, %val1
+
+  call void @llvm.AMDGPU.barrier.local() #2
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %val2 = load float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %val3 = load float addrspace(3)* %arrayidx3, align 4
+  %sum.1 = fadd float %val2, %val3
+
+  %sum = fadd float %sum.0, %sum.1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %idx.0
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; For some reason adding something to the base address for the first
+; element results in only folding the inner pair.
+
+; SI-LABEL: @simple_read2_f32_x2_nonzero_base
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum.0 = fadd float %val0, %val1
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  %val2 = load float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  %val3 = load float addrspace(3)* %arrayidx3, align 4
+  %sum.1 = fadd float %val2, %val3
+
+  %sum = fadd float %sum.0, %sum.1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %idx.0
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Be careful of vectors of pointers. We don't know if the 2 pointers
+; in the vectors are really the same base, so this is not safe to
+; merge.
+; Base pointers come from different subregisters of the same super
+; register, so we can't safely merge this.
+
+; SI-LABEL: @read2_ptr_is_subreg_arg_f32
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32
+; SI: ds_read_b32
+; SI: s_endpgm
+define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
+  %gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+  %val0 = load float addrspace(3)* %gep.0, align 4
+  %val1 = load float addrspace(3)* %gep.1, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Apply a constant scalar offset after the pointer vector extract.  We
+; are rejecting merges that have the same, constant 0 offset, so make
+; sure we are really rejecting it because of the different
+; subregisters.
+
+; SI-LABEL: @read2_ptr_is_subreg_arg_offset_f32
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32
+; SI: ds_read_b32
+; SI: s_endpgm
+define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
+  %gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+
+  ; Apply an additional offset after the vector that will be more obviously folded.
+  %gep.1.offset = getelementptr float addrspace(3)* %gep.1, i32 8
+
+  %val0 = load float addrspace(3)* %gep.0, align 4
+  %val1 = load float addrspace(3)* %gep.1.offset, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; We should be able to merge in this case, but probably not worth the effort.
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32
+; SI: ds_read_b32
+; SI: s_endpgm
+define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
+  %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
+  %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+  %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
+  %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
+  %gep = getelementptr inbounds <2 x [512 x float] addrspace(3)*> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
+  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+  %val0 = load float addrspace(3)* %gep.0, align 4
+  %val1 = load float addrspace(3)* %gep.1, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_volatile_0
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load volatile float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f32_volatile_1
+; SI-NOT: ds_read2_b32
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load volatile float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Can't fold since not correctly aligned.
+; XXX: This isn't really testing anything useful now. I think CI
+; allows unaligned LDS accesses, which would be a problem here.
+; SI-LABEL: @unaligned_read2_f32
+; SI-NOT: ds_read2_b32
+; SI: s_endpgm
+define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 1
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 1
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @misaligned_2_simple_read2_f32
+; SI-NOT: ds_read2_b32
+; SI: s_endpgm
+define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 2
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 2
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f64
+; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
+; SI: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset0:0 offset1:8
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2_f64(double addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f64_max_offset
+; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:0 offset1:255
+; SI: s_endpgm
+define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 255
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @simple_read2_f64_too_far
+; SI-NOT: ds_read2_b64
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
+; SI: s_endpgm
+define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 257
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; Alignment is only 4, so each f64 is expected to be read as a dword pair with ds_read2_b32.
+; SI-LABEL: @misaligned_read2_f64
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
+; SI: s_endpgm
+define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 7
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 4
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+@foo = addrspace(3) global [4 x i32] undef, align 4
+
+; SI-LABEL: @load_constant_adjacent_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:1
+define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
+  %val0 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  %val1 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+  %sum = add i32 %val0, %val1
+  store i32 %sum, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @load_constant_disjoint_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:2
+define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
+  %val0 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  %val1 = load i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+  %sum = add i32 %val0, %val1
+  store i32 %sum, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+@bar = addrspace(3) global [4 x i64] undef, align 4
+
+; SI-LABEL: @load_misaligned64_constant_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
+  %val0 = load i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+  %val1 = load i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  %sum = add i64 %val0, %val1
+  store i64 %sum, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+@bar.large = addrspace(3) global [4096 x i64] undef, align 4
+
+; SI-LABEL: @load_misaligned64_constant_large_offsets
+; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset0:0 offset1:1
+; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset0:0 offset1:1
+; SI: s_endpgm
+define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
+  %val0 = load i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+  %val1 = load i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  %sum = add i64 %val0, %val1
+  store i64 %sum, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
+@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
+
+define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
+  %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+  %arrayidx44 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
+  %tmp16 = load float addrspace(3)* %arrayidx44, align 4
+  %add47 = add nsw i32 %x.i, 1
+  %arrayidx48 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
+  %tmp17 = load float addrspace(3)* %arrayidx48, align 4
+  %add51 = add nsw i32 %x.i, 16
+  %arrayidx52 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
+  %tmp18 = load float addrspace(3)* %arrayidx52, align 4
+  %add55 = add nsw i32 %x.i, 17
+  %arrayidx56 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
+  %tmp19 = load float addrspace(3)* %arrayidx56, align 4
+  %arrayidx60 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
+  %tmp20 = load float addrspace(3)* %arrayidx60, align 4
+  %add63 = add nsw i32 %y.i, 1
+  %arrayidx64 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
+  %tmp21 = load float addrspace(3)* %arrayidx64, align 4
+  %add67 = add nsw i32 %y.i, 32
+  %arrayidx68 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
+  %tmp22 = load float addrspace(3)* %arrayidx68, align 4
+  %add71 = add nsw i32 %y.i, 33
+  %arrayidx72 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
+  %tmp23 = load float addrspace(3)* %arrayidx72, align 4
+  %add75 = add nsw i32 %y.i, 64
+  %arrayidx76 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
+  %tmp24 = load float addrspace(3)* %arrayidx76, align 4
+  %add79 = add nsw i32 %y.i, 65
+  %arrayidx80 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
+  %tmp25 = load float addrspace(3)* %arrayidx80, align 4
+  %sum.0 = fadd float %tmp16, %tmp17
+  %sum.1 = fadd float %sum.0, %tmp18
+  %sum.2 = fadd float %sum.1, %tmp19
+  %sum.3 = fadd float %sum.2, %tmp20
+  %sum.4 = fadd float %sum.3, %tmp21
+  %sum.5 = fadd float %sum.4, %tmp22
+  %sum.6 = fadd float %sum.5, %tmp23
+  %sum.7 = fadd float %sum.6, %tmp24
+  %sum.8 = fadd float %sum.7, %tmp25
+  store float %sum.8, float addrspace(1)* %C, align 4
+  ret void
+}
+
+define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
+  %load = load <2 x i32> addrspace(3)* %in, align 4
+  store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
+  %load = load i64 addrspace(3)* %in, align 4
+  store i64 %load, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }

diff --git a/test/CodeGen/R600/ds_read2st64.ll b/test/CodeGen/R600/ds_read2st64.ll
new file mode 100644
index 0000000..3e98e59
--- /dev/null
+++ b/test/CodeGen/R600/ds_read2st64.ll

@@ -0,0 +1,272 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
+
+
+; SI-LABEL: @simple_read2st64_f32_0_1
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 64
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f32_1_2
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 128
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f32_max_offset
+; SI: ds_read2st64_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:255
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 16320
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f32_over_max_offset
+; SI-NOT: ds_read2st64_b32
+; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
+; SI: s_endpgm
+define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 16384
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @odd_invalid_read2st64_f32_0
+; SI-NOT: ds_read2st64_b32
+; SI: s_endpgm
+define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 63
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @odd_invalid_read2st64_f32_1
+; SI-NOT: ds_read2st64_b32
+; SI: s_endpgm
+define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 127
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  %out.gep = getelementptr inbounds float addrspace(1)* %out, i32 %x.i
+  store float %sum, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f64_0_1
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 64
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f64_1_2
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:2
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 128
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; Alignment only
+
+; SI-LABEL: @misaligned_read2st64_f64
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:0 offset1:1
+; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
+; SI: s_endpgm
+define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 64
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 4
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; The maximum is not the usual 0xff because 0xff * 8 * 64 > 0xffff
+; SI-LABEL: @simple_read2st64_f64_max_offset
+; SI: ds_read2st64_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4 offset1:127
+; SI: s_waitcnt lgkmcnt(0)
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 256
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 8128
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @simple_read2st64_f64_over_max_offset
+; SI-NOT: ds_read2st64_b64
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
+; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
+; SI: s_endpgm
+define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 8192
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; SI-LABEL: @invalid_read2st64_f64_odd_offset
+; SI-NOT: ds_read2st64_b64
+; SI: s_endpgm
+define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.x.0 = add nsw i32 %x.i, 64
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 8129
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; The stride here is 8 elements (8 * 8 = 64 bytes). The st64 form needs the
+; stride in elements, not bytes, to be a multiple of 64, so use ds_read2_b64.
+
+; SI-LABEL: @byte_size_only_divisible_64_read2_f64
+; SI-NOT: ds_read2st64_b64
+; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:0 offset1:8
+; SI: s_endpgm
+define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  %val0 = load double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  %val1 = load double addrspace(3)* %arrayidx1, align 8
+  %sum = fadd double %val0, %val1
+  %out.gep = getelementptr inbounds double addrspace(1)* %out, i32 %x.i
+  store double %sum, double addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }

diff --git a/test/CodeGen/R600/ds_write2.ll b/test/CodeGen/R600/ds_write2.ll
new file mode 100644
index 0000000..1807fb5
--- /dev/null
+++ b/test/CodeGen/R600/ds_write2.ll

@@ -0,0 +1,425 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+@lds.f64 = addrspace(3) global [512 x double] undef, align 8
+
+
+; SI-LABEL: @simple_write2_one_val_f32
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr float addrspace(1)* %in, i32 %x.i
+  %val = load float addrspace(1)* %in.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float addrspace(1)* %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_volatile_0
+; SI-NOT: ds_write2_b32
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store volatile float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_volatile_1
+; SI-NOT: ds_write2_b32
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
+; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store volatile float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; 2 data subregisters from different super registers.
+; SI-LABEL: @simple_write2_two_val_subreg2_mixed_f32
+; SI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
+; SI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
+; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr <2 x float> addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr <2 x float> addrspace(1)* %in.gep.0, i32 1
+  %val0 = load <2 x float> addrspace(1)* %in.gep.0, align 8
+  %val1 = load <2 x float> addrspace(1)* %in.gep.1, align 8
+  %val0.0 = extractelement <2 x float> %val0, i32 0
+  %val1.1 = extractelement <2 x float> %val1, i32 1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0.0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1.1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_subreg2_f32
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr <2 x float> addrspace(1)* %in, i32 %x.i
+  %val = load <2 x float> addrspace(1)* %in.gep, align 8
+  %val0 = extractelement <2 x float> %val, i32 0
+  %val1 = extractelement <2 x float> %val, i32 1
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_subreg4_f32
+; SI-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr <4 x float> addrspace(1)* %in, i32 %x.i
+  %val = load <4 x float> addrspace(1)* %in.gep, align 16
+  %val0 = extractelement <4 x float> %val, i32 0
+  %val1 = extractelement <4 x float> %val, i32 3
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_max_offset_f32
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float addrspace(1)* %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 255
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_too_far_f32
+; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
+; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
+; SI: s_endpgm
+define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 257
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_x2
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:0 offset1:8
+; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %tid.x
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %tid.x
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+
+  %idx.0 = add nsw i32 %tid.x, 0
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  store float %val0, float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  store float %val1, float addrspace(3)* %arrayidx3, align 4
+
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
+; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
+; SI: s_endpgm
+define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %tid.x
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %tid.x
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+
+  %idx.0 = add nsw i32 %tid.x, 3
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+
+  %idx.1 = add nsw i32 %tid.x, 8
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.1
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+
+  %idx.2 = add nsw i32 %tid.x, 11
+  %arrayidx2 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.2
+  store float %val0, float addrspace(3)* %arrayidx2, align 4
+
+  %idx.3 = add nsw i32 %tid.x, 27
+  %arrayidx3 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.3
+  store float %val1, float addrspace(3)* %arrayidx3, align 4
+
+  ret void
+}
+
+; SI-LABEL: @write2_ptr_subreg_arg_two_val_f32
+; SI-NOT: ds_write2_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: s_endpgm
+define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in0.gep = getelementptr float addrspace(1)* %in0, i32 %x.i
+  %in1.gep = getelementptr float addrspace(1)* %in1, i32 %x.i
+  %val0 = load float addrspace(1)* %in0.gep, align 4
+  %val1 = load float addrspace(1)* %in1.gep, align 4
+  ; NOTE(review): both insertelements below write lane 0, leaving lane 1 undef; lane 1 was presumably meant for the 8 - confirm.
+  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
+  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
+  %gep = getelementptr inbounds <2 x float addrspace(3)*> %lds.ptr, <2 x i32> %index.1
+  %gep.0 = extractelement <2 x float addrspace(3)*> %gep, i32 0
+  %gep.1 = extractelement <2 x float addrspace(3)*> %gep, i32 1
+
+  ; Apply an additional offset after the vector that will be more obviously folded.
+  %gep.1.offset = getelementptr float addrspace(3)* %gep.1, i32 8
+  store float %val0, float addrspace(3)* %gep.0, align 4
+  ; NOTE(review): %add.x is computed but never used in this function.
+  %add.x = add nsw i32 %x.i, 8
+  store float %val1, float addrspace(3)* %gep.1.offset, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_one_val_f64
+; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
+; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr double addrspace(1)* %in, i32 %x.i
+  %val = load double addrspace(1)* %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  store double %val, double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  store double %val, double addrspace(3)* %arrayidx1, align 8
+  ret void
+}
+
+; SI-LABEL: @misaligned_simple_write2_one_val_f64
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:0 offset1:1 [M0]
+; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 [M0]
+; SI: s_endpgm
+define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr double addrspace(1)* %in, i32 %x.i
+  %val = load double addrspace(1)* %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  store double %val, double addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 7
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  store double %val, double addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2_two_val_f64
+; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:8 [M0]
+; SI: s_endpgm
+define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr double addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr double addrspace(1)* %in.gep.0, i32 1
+  %val0 = load double addrspace(1)* %in.gep.0, align 8
+  %val1 = load double addrspace(1)* %in.gep.1, align 8
+  %arrayidx0 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
+  store double %val0, double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %add.x
+  store double %val1, double addrspace(3)* %arrayidx1, align 8
+  ret void
+}
+
+@foo = addrspace(3) global [4 x i32] undef, align 4
+
+; SI-LABEL: @store_constant_adjacent_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+define void @store_constant_adjacent_offsets() {
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
+  ret void
+}
+
+; SI-LABEL: @store_constant_disjoint_offsets
+; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
+; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset0:0 offset1:2
+define void @store_constant_disjoint_offsets() {
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
+  store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
+  ret void
+}
+
+@bar = addrspace(3) global [4 x i64] undef, align 4
+
+; SI-LABEL: @store_misaligned64_constant_offsets
+; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+define void @store_misaligned64_constant_offsets() {
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
+  ret void
+}
+
+@bar.large = addrspace(3) global [4096 x i64] undef, align 4
+
+; SI-LABEL: @store_misaligned64_constant_large_offsets
+; SI-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
+; SI-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
+; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: s_endpgm
+define void @store_misaligned64_constant_large_offsets() {
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
+  store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
+  ret void
+}
+
+@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
+@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
+; NOTE(review): @write2_sgemm_sequence below has no FileCheck directives, so only successful compilation is tested - confirm intended.
+define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tgid.x() #1
+  %y.i = tail call i32 @llvm.r600.read.tidig.y() #1
+  %val = load float addrspace(1)* %in
+  %arrayidx44 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
+  store float %val, float addrspace(3)* %arrayidx44, align 4
+  %add47 = add nsw i32 %x.i, 1
+  %arrayidx48 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add47
+  store float %val, float addrspace(3)* %arrayidx48, align 4
+  %add51 = add nsw i32 %x.i, 16
+  %arrayidx52 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add51
+  store float %val, float addrspace(3)* %arrayidx52, align 4
+  %add55 = add nsw i32 %x.i, 17
+  %arrayidx56 = getelementptr inbounds [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %add55
+  store float %val, float addrspace(3)* %arrayidx56, align 4
+  %arrayidx60 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %y.i
+  store float %val, float addrspace(3)* %arrayidx60, align 4
+  %add63 = add nsw i32 %y.i, 1
+  %arrayidx64 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add63
+  store float %val, float addrspace(3)* %arrayidx64, align 4
+  %add67 = add nsw i32 %y.i, 32
+  %arrayidx68 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add67
+  store float %val, float addrspace(3)* %arrayidx68, align 4
+  %add71 = add nsw i32 %y.i, 33
+  %arrayidx72 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add71
+  store float %val, float addrspace(3)* %arrayidx72, align 4
+  %add75 = add nsw i32 %y.i, 64
+  %arrayidx76 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add75
+  store float %val, float addrspace(3)* %arrayidx76, align 4
+  %add79 = add nsw i32 %y.i, 65
+  %arrayidx80 = getelementptr inbounds [776 x float] addrspace(3)* @sgemm.lB, i32 0, i32 %add79
+  store float %val, float addrspace(3)* %arrayidx80, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }

diff --git a/test/CodeGen/R600/ds_write2st64.ll b/test/CodeGen/R600/ds_write2st64.ll
new file mode 100644
index 0000000..4cafb7c
--- /dev/null
+++ b/test/CodeGen/R600/ds_write2st64.ll

@@ -0,0 +1,119 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+
+
+@lds = addrspace(3) global [512 x float] undef, align 4
+
+
+; SI-LABEL: @simple_write2st64_one_val_f32_0_1
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset0:0 offset1:1 [M0]
+; SI: s_endpgm
+define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr float addrspace(1)* %in, i32 %x.i
+  %val = load float addrspace(1)* %in.gep, align 4
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
+  store float %val, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 64
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
+  store float %val, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2st64_two_val_f32_2_5
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 [M0]
+; SI: s_endpgm
+define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float addrspace(1)* %in.gep.1, align 4
+  %add.x.0 = add nsw i32 %x.i, 128
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x.1 = add nsw i32 %x.i, 320
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.1
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2st64_two_val_max_offset_f32
+; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:0 offset1:255 [M0]
+; SI: s_endpgm
+define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr float addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr float addrspace(1)* %in.gep.0, i32 1
+  %val0 = load float addrspace(1)* %in.gep.0, align 4
+  %val1 = load float addrspace(1)* %in.gep.1, align 4
+  %arrayidx0 = getelementptr inbounds float addrspace(3)* %lds, i32 %x.i
+  store float %val0, float addrspace(3)* %arrayidx0, align 4
+  %add.x = add nsw i32 %x.i, 16320
+  %arrayidx1 = getelementptr inbounds float addrspace(3)* %lds, i32 %add.x
+  store float %val1, float addrspace(3)* %arrayidx1, align 4
+  ret void
+}
+
+; SI-LABEL: @simple_write2st64_two_val_max_offset_f64
+; SI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
+; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 [M0]
+; SI: s_endpgm
+define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep.0 = getelementptr double addrspace(1)* %in, i32 %x.i
+  %in.gep.1 = getelementptr double addrspace(1)* %in.gep.0, i32 1
+  %val0 = load double addrspace(1)* %in.gep.0, align 8
+  %val1 = load double addrspace(1)* %in.gep.1, align 8
+  %add.x.0 = add nsw i32 %x.i, 256
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.0
+  store double %val0, double addrspace(3)* %arrayidx0, align 8
+  %add.x.1 = add nsw i32 %x.i, 8128
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x.1
+  store double %val1, double addrspace(3)* %arrayidx1, align 8
+  ret void
+}
+
+; SI-LABEL: @byte_size_only_divisible_64_write2st64_f64
+; SI-NOT: ds_write2st64_b64
+; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:0 offset1:8
+; SI: s_endpgm
+define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %in.gep = getelementptr double addrspace(1)* %in, i32 %x.i
+  %val = load double addrspace(1)* %in.gep, align 8
+  %arrayidx0 = getelementptr inbounds double addrspace(3)* %lds, i32 %x.i
+  store double %val, double addrspace(3)* %arrayidx0, align 8
+  %add.x = add nsw i32 %x.i, 8
+  %arrayidx1 = getelementptr inbounds double addrspace(3)* %lds, i32 %add.x
+  store double %val, double addrspace(3)* %arrayidx1, align 8
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noduplicate nounwind }

diff --git a/test/CodeGen/R600/elf.ll b/test/CodeGen/R600/elf.ll
index 9385150..6c521d0 100644
--- a/test/CodeGen/R600/elf.ll
+++ b/test/CodeGen/R600/elf.ll

@@ -5,6 +5,8 @@
 ; ELF-CHECK: Name: .AMDGPU.config
 ; ELF-CHECK: Type: SHT_PROGBITS
 
+; CONFIG-CHECK: .align 256
+; CONFIG-CHECK: test:
 ; CONFIG-CHECK: .section .AMDGPU.config
 ; CONFIG-CHECK-NEXT: .long   45096
 ; CONFIG-CHECK-NEXT: .long   0

diff --git a/test/CodeGen/R600/empty-function.ll b/test/CodeGen/R600/empty-function.ll
new file mode 100644
index 0000000..d4ff803
--- /dev/null
+++ b/test/CodeGen/R600/empty-function.ll

@@ -0,0 +1,20 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; Make sure we don't assert on empty functions
+
+; SI-LABEL: {{^}}empty_function_ret:
+; SI: .text
+; SI: s_endpgm
+; SI: codeLenInByte = 4
+define void @empty_function_ret() #0 {
+  ret void
+}
+
+; SI-LABEL: {{^}}empty_function_unreachable:
+; SI: .text
+; SI: codeLenInByte = 0
+define void @empty_function_unreachable() #0 {
+  unreachable
+}
+
+attributes #0 = { nounwind }

diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
index dc056e0..5bda8f8 100644
--- a/test/CodeGen/R600/extload.ll
+++ b/test/CodeGen/R600/extload.ll

@@ -1,7 +1,7 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @anyext_load_i8:
+; FUNC-LABEL: {{^}}anyext_load_i8:
 ; EG: AND_INT
 ; EG: 255
 define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
@@ -13,7 +13,7 @@
   ret void
 }
 
-; FUNC-LABEL: @anyext_load_i16:
+; FUNC-LABEL: {{^}}anyext_load_i16:
 ; EG: AND_INT
 ; EG: AND_INT
 ; EG-DAG: 65535
@@ -27,7 +27,7 @@
   ret void
 }
 
-; FUNC-LABEL: @anyext_load_lds_i8:
+; FUNC-LABEL: {{^}}anyext_load_lds_i8:
 ; EG: AND_INT
 ; EG: 255
 define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
@@ -39,7 +39,7 @@
   ret void
 }
 
-; FUNC-LABEL: @anyext_load_lds_i16:
+; FUNC-LABEL: {{^}}anyext_load_lds_i16:
 ; EG: AND_INT
 ; EG: AND_INT
 ; EG-DAG: 65535
@@ -53,10 +53,10 @@
   ret void
 }
 
-; FUNC-LABEL: @sextload_global_i8_to_i64
-; SI: BUFFER_LOAD_SBYTE [[LOAD:v[0-9]+]],
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sextload_global_i8_to_i64:
+; SI: buffer_load_sbyte [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
 define void @sextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %a = load i8 addrspace(1)* %in, align 8
   %ext = sext i8 %a to i64
@@ -64,10 +64,10 @@
   ret void
 }
 
-; FUNC-LABEL: @sextload_global_i16_to_i64
-; SI: BUFFER_LOAD_SSHORT [[LOAD:v[0-9]+]],
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sextload_global_i16_to_i64:
+; SI: buffer_load_sshort [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
 define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16 addrspace(1)* %in, align 8
   %ext = sext i16 %a to i64
@@ -75,10 +75,10 @@
   ret void
 }
 
-; FUNC-LABEL: @sextload_global_i32_to_i64
-; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, [[LOAD]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sextload_global_i32_to_i64:
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
+; SI: buffer_store_dwordx2
 define void @sextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %a = load i32 addrspace(1)* %in, align 8
   %ext = sext i32 %a to i64
@@ -86,11 +86,11 @@
   ret void
 }
 
-; FUNC-LABEL: @zextload_global_i8_to_i64
-; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
-; SI: BUFFER_LOAD_UBYTE [[LOAD:v[0-9]+]],
-; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
+; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
+; SI-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
+; SI: buffer_store_dwordx2
 define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %a = load i8 addrspace(1)* %in, align 8
   %ext = zext i8 %a to i64
@@ -98,11 +98,11 @@
   ret void
 }
 
-; FUNC-LABEL: @zextload_global_i16_to_i64
-; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
-; SI: BUFFER_LOAD_USHORT [[LOAD:v[0-9]+]],
-; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
+; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
+; SI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
+; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
+; SI: buffer_store_dwordx2
 define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16 addrspace(1)* %in, align 8
   %ext = zext i16 %a to i64
@@ -110,11 +110,11 @@
   ret void
 }
 
-; FUNC-LABEL: @zextload_global_i32_to_i64
-; SI: S_MOV_B32 [[ZERO:s[0-9]+]], 0
-; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
-; SI: V_MOV_B32_e32 {{v[0-9]+}}, [[ZERO]]
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
+; SI-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
+; SI-DAG: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_mov_b32_e32 {{v[0-9]+}}, [[ZERO]]
+; SI: buffer_store_dwordx2
 define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %a = load i32 addrspace(1)* %in, align 8
   %ext = zext i32 %a to i64

diff --git a/test/CodeGen/R600/extract_vector_elt_i16.ll b/test/CodeGen/R600/extract_vector_elt_i16.ll
index 5cd1b04..efdc1c8 100644
--- a/test/CodeGen/R600/extract_vector_elt_i16.ll
+++ b/test/CodeGen/R600/extract_vector_elt_i16.ll

@@ -1,10 +1,10 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @extract_vector_elt_v2i16
-; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
-; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
+; FUNC-LABEL: {{^}}extract_vector_elt_v2i16:
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_store_short
+; SI: buffer_store_short
 define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> %foo) nounwind {
   %p0 = extractelement <2 x i16> %foo, i32 0
   %p1 = extractelement <2 x i16> %foo, i32 1
@@ -14,11 +14,11 @@
   ret void
 }
 
-; FUNC-LABEL: @extract_vector_elt_v4i16
-; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
-; SI: BUFFER_LOAD_USHORT
-; SI: BUFFER_STORE_SHORT
+; FUNC-LABEL: {{^}}extract_vector_elt_v4i16:
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_store_short
+; SI: buffer_store_short
 define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) nounwind {
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2

diff --git a/test/CodeGen/R600/fabs.f64.ll b/test/CodeGen/R600/fabs.f64.ll
new file mode 100644
index 0000000..d2ba320
--- /dev/null
+++ b/test/CodeGen/R600/fabs.f64.ll

@@ -0,0 +1,97 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+declare double @fabs(double) readnone
+declare double @llvm.fabs.f64(double) readnone
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
+declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
+
+; FUNC-LABEL: {{^}}v_fabs_f64:
+; SI: v_and_b32
+; SI: s_endpgm
+define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %tidext = sext i32 %tid to i64
+  %gep = getelementptr double addrspace(1)* %in, i64 %tidext
+  %val = load double addrspace(1)* %gep, align 8
+  %fabs = call double @llvm.fabs.f64(double %val)
+  store double %fabs, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_f64:
+; SI: v_and_b32
+; SI-NOT: v_and_b32
+; SI: s_endpgm
+define void @fabs_f64(double addrspace(1)* %out, double %in) {
+  %fabs = call double @llvm.fabs.f64(double %in)
+  store double %fabs, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_v2f64:
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+  %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+  store <2 x double> %fabs, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_v4f64:
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+  %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
+  store <4 x double> %fabs, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fabs_fold_f64:
+; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NOT: and
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
+; SI: s_endpgm
+define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+  %fabs = call double @llvm.fabs.f64(double %in0)
+  %fmul = fmul double %fabs, %in1
+  store double %fmul, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fabs_fn_fold_f64:
+; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NOT: and
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
+; SI: s_endpgm
+define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+  %fabs = call double @fabs(double %in0)
+  %fmul = fmul double %fabs, %in1
+  store double %fmul, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_free_f64:
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc= bitcast i64 %in to double
+  %fabs = call double @llvm.fabs.f64(double %bc)
+  store double %fabs, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_fn_free_f64:
+; SI: v_and_b32
+; SI: s_endpgm
+define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc= bitcast i64 %in to double
+  %fabs = call double @fabs(double %bc)
+  store double %fabs, double addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
index b87ce22..06cc97f 100644
--- a/test/CodeGen/R600/fabs.ll
+++ b/test/CodeGen/R600/fabs.ll

@@ -1,65 +1,98 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
 
 ; DAGCombiner will transform:
 ; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
 ; unless isFabsFree returns true
 
-; R600-CHECK-LABEL: @fabs_free
-; R600-CHECK-NOT: AND
-; R600-CHECK: |PV.{{[XYZW]}}|
-; SI-CHECK-LABEL: @fabs_free
-; SI-CHECK: V_AND_B32
+; FUNC-LABEL: {{^}}fabs_fn_free:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+
+; SI: v_and_b32
+
+define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
+  %bc= bitcast i32 %in to float
+  %fabs = call float @fabs(float %bc)
+  store float %fabs, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fabs_free:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+
+; SI: v_and_b32
 
 define void @fabs_free(float addrspace(1)* %out, i32 %in) {
-entry:
-  %0 = bitcast i32 %in to float
-  %1 = call float @fabs(float %0)
-  store float %1, float addrspace(1)* %out
+  %bc= bitcast i32 %in to float
+  %fabs = call float @llvm.fabs.f32(float %bc)
+  store float %fabs, float addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK-LABEL: @fabs_v2
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; SI-CHECK-LABEL: @fabs_v2
-; SI-CHECK: V_AND_B32
-; SI-CHECK: V_AND_B32
-define void @fabs_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
-  %0 = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+; FUNC-LABEL: {{^}}fabs_f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+
+; SI: v_and_b32
+define void @fabs_f32(float addrspace(1)* %out, float %in) {
+  %fabs = call float @llvm.fabs.f32(float %in)
+  store float %fabs, float addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK-LABEL: @fabs_v4
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; SI-CHECK-LABEL: @fabs_v4
-; SI-CHECK: V_AND_B32
-; SI-CHECK: V_AND_B32
-; SI-CHECK: V_AND_B32
-; SI-CHECK: V_AND_B32
-define void @fabs_v4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
-  %0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
+; FUNC-LABEL: {{^}}fabs_v2f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+
+; SI: v_and_b32
+; SI: v_and_b32
+define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+  store <2 x float> %fabs, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @fabs_fold
-; SI-CHECK-NOT: V_AND_B32_e32
-; SI-CHECK: V_MUL_F32_e64 v{{[0-9]+}}, s{{[0-9]+}}, |v{{[0-9]+}}|
+; FUNC-LABEL: {{^}}fabs_v4f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+; SI: v_and_b32
+define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+  %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+  store <4 x float> %fabs, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fabs_fn_fold:
+; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; SI-NOT: and
+; SI: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
+define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
+  %fabs = call float @fabs(float %in0)
+  %fmul = fmul float %fabs, %in1
+  store float %fmul, float addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fabs_fold:
+; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; SI-NOT: and
+; SI: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
 define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
-entry:
-  %0 = call float @fabs(float %in0)
-  %1 = fmul float %0, %in1
-  store float %1, float addrspace(1)* %out
+  %fabs = call float @llvm.fabs.f32(float %in0)
+  %fmul = fmul float %fabs, %in1
+  store float %fmul, float addrspace(1)* %out
   ret void
 }
 
-declare float @fabs(float ) readnone
-declare <2 x float> @llvm.fabs.v2f32(<2 x float> ) readnone
-declare <4 x float> @llvm.fabs.v4f32(<4 x float> ) readnone
+declare float @fabs(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone

diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 5d2b806..774dd0b 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll

@@ -1,66 +1,63 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 
-; FUNC-LABEL: @fadd_f32
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI-CHECK: V_ADD_F32
+; FUNC-LABEL: {{^}}fadd_f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI: v_add_f32
 define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
-entry:
-   %0 = fadd float %a, %b
-   store float %0, float addrspace(1)* %out
+   %add = fadd float %a, %b
+   store float %add, float addrspace(1)* %out, align 4
    ret void
 }
 
-; FUNC-LABEL: @fadd_v2f32
-; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
+; FUNC-LABEL: {{^}}fadd_v2f32:
+; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; SI: v_add_f32
+; SI: v_add_f32
 define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
-entry:
-  %0 = fadd <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %add = fadd <2 x float> %a, %b
+  store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @fadd_v4f32
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
+; FUNC-LABEL: {{^}}fadd_v4f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
 define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
-  %a = load <4 x float> addrspace(1) * %in
-  %b = load <4 x float> addrspace(1) * %b_ptr
+  %a = load <4 x float> addrspace(1)* %in, align 16
+  %b = load <4 x float> addrspace(1)* %b_ptr, align 16
   %result = fadd <4 x float> %a, %b
-  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: @fadd_v8f32
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; R600-CHECK: ADD
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
-; SI-CHECK: V_ADD_F32
+; FUNC-LABEL: {{^}}fadd_v8f32:
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; R600: ADD
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
+; SI: v_add_f32
 define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) {
-entry:
-  %0 = fadd <8 x float> %a, %b
-  store <8 x float> %0, <8 x float> addrspace(1)* %out
+  %add = fadd <8 x float> %a, %b
+  store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32
   ret void
 }

diff --git a/test/CodeGen/R600/fadd64.ll b/test/CodeGen/R600/fadd64.ll
index 48cd3cf..3ca8500 100644
--- a/test/CodeGen/R600/fadd64.ll
+++ b/test/CodeGen/R600/fadd64.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @fadd_f64
-; CHECK: V_ADD_F64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
+; CHECK: {{^}}fadd_f64:
+; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
 
 define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {

diff --git a/test/CodeGen/R600/fceil.ll b/test/CodeGen/R600/fceil.ll
index 458363a..56dc796 100644
--- a/test/CodeGen/R600/fceil.ll
+++ b/test/CodeGen/R600/fceil.ll

@@ -8,8 +8,8 @@
 declare <8 x float> @llvm.ceil.v8f32(<8 x float>) nounwind readnone
 declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone
 
-; FUNC-LABEL: @fceil_f32:
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_f32:
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
 define void @fceil_f32(float addrspace(1)* %out, float %x) {
@@ -18,9 +18,9 @@
   ret void
 }
 
-; FUNC-LABEL: @fceil_v2f32:
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v2f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
@@ -30,10 +30,10 @@
   ret void
 }
 
-; FUNC-LABEL: @fceil_v3f32:
-; FIXME-SI: V_CEIL_F32_e32
-; FIXME-SI: V_CEIL_F32_e32
-; FIXME-SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v3f32:
+; FIXME-SI: v_ceil_f32_e32
+; FIXME-SI: v_ceil_f32_e32
+; FIXME-SI: v_ceil_f32_e32
 ; FIXME-EG: v3 is treated as v2 and v1, hence 2 stores
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
@@ -46,11 +46,11 @@
   ret void
 }
 
-; FUNC-LABEL: @fceil_v4f32:
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v4f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
@@ -62,15 +62,15 @@
   ret void
 }
 
-; FUNC-LABEL: @fceil_v8f32:
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v8f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
 ; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
@@ -87,23 +87,23 @@
   ret void
 }
 
-; FUNC-LABEL: @fceil_v16f32:
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
-; SI: V_CEIL_F32_e32
+; FUNC-LABEL: {{^}}fceil_v16f32:
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
+; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT1:T[0-9]+]]{{\.[XYZW]}}
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT2:T[0-9]+]]{{\.[XYZW]}}
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT3:T[0-9]+]]{{\.[XYZW]}}

diff --git a/test/CodeGen/R600/fceil64.ll b/test/CodeGen/R600/fceil64.ll
index b42aefa..029f41d 100644
--- a/test/CodeGen/R600/fceil64.ll
+++ b/test/CodeGen/R600/fceil64.ll

@@ -8,94 +8,94 @@
 declare <8 x double> @llvm.ceil.v8f64(<8 x double>) nounwind readnone
 declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 
-; FUNC-LABEL: @fceil_f64:
-; CI: V_CEIL_F64_e32
-; SI: S_BFE_I32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: S_ADD_I32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: S_LSHR_B64
-; SI: S_NOT_B64
-; SI: S_AND_B64
-; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: CMP_LT_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: CMP_GT_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: CMP_GT_F64
-; SI: CNDMASK_B32
-; SI: CMP_NE_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: V_ADD_F64
+; FUNC-LABEL: {{^}}fceil_f64:
+; CI: v_ceil_f64_e32
+; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: s_lshr_b64
+; SI: s_not_b64
+; SI: s_and_b64
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI-DAG: cmp_lt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_gt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_gt_f64
+; SI: cndmask_b32
+; SI: cmp_ne_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: v_add_f64
 define void @fceil_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.ceil.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @fceil_v2f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
+; FUNC-LABEL: {{^}}fceil_v2f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
 define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
 }
 
-; FIXME-FUNC-LABEL: @fceil_v3f64:
-; FIXME-CI: V_CEIL_F64_e32
-; FIXME-CI: V_CEIL_F64_e32
-; FIXME-CI: V_CEIL_F64_e32
+; FIXME-FUNC-LABEL: {{^}}fceil_v3f64:
+; FIXME-CI: v_ceil_f64_e32
+; FIXME-CI: v_ceil_f64_e32
+; FIXME-CI: v_ceil_f64_e32
 ; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
 ;   store <3 x double> %y, <3 x double> addrspace(1)* %out
 ;   ret void
 ; }
 
-; FUNC-LABEL: @fceil_v4f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
+; FUNC-LABEL: {{^}}fceil_v4f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
 define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @fceil_v8f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
+; FUNC-LABEL: {{^}}fceil_v8f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
 define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @fceil_v16f64:
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
-; CI: V_CEIL_F64_e32
+; FUNC-LABEL: {{^}}fceil_v16f64:
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
+; CI: v_ceil_f64_e32
 define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out

diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll
index c76a758..3399218 100644
--- a/test/CodeGen/R600/fcmp.ll
+++ b/test/CodeGen/R600/fcmp.ll

@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @fcmp_sext
+; CHECK: {{^}}fcmp_sext:
 ; CHECK: SETE_DX10  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
@@ -18,7 +18,7 @@
 ; SET*_DX10 instruction.  Previously we were lowering this to:
 ; SET* + FP_TO_SINT
 
-; CHECK: @fcmp_br
+; CHECK: {{^}}fcmp_br:
 ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}}
 ; CHECK-NEXT {{[0-9]+(5.0}}
 

diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll
index bcc7a8c..dc24443 100644
--- a/test/CodeGen/R600/fcmp64.ll
+++ b/test/CodeGen/R600/fcmp64.ll

@@ -1,60 +1,55 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @flt_f64
-; CHECK: V_CMP_LT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
-define void @flt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+; CHECK-LABEL: {{^}}flt_f64:
+; CHECK: v_cmp_lt_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
    %r1 = load double addrspace(1)* %in2
    %r2 = fcmp ult double %r0, %r1
-   %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   %r3 = zext i1 %r2 to i32
+   store i32 %r3, i32 addrspace(1)* %out
    ret void
 }
 
-; CHECK: @fle_f64
-; CHECK: V_CMP_LE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
-define void @fle_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+; CHECK-LABEL: {{^}}fle_f64:
+; CHECK: v_cmp_le_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
    %r1 = load double addrspace(1)* %in2
    %r2 = fcmp ule double %r0, %r1
-   %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   %r3 = zext i1 %r2 to i32
+   store i32 %r3, i32 addrspace(1)* %out
    ret void
 }
 
-; CHECK: @fgt_f64
-; CHECK: V_CMP_GT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
-define void @fgt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+; CHECK-LABEL: {{^}}fgt_f64:
+; CHECK: v_cmp_gt_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
    %r1 = load double addrspace(1)* %in2
    %r2 = fcmp ugt double %r0, %r1
-   %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   %r3 = zext i1 %r2 to i32
+   store i32 %r3, i32 addrspace(1)* %out
    ret void
 }
 
-; CHECK: @fge_f64
-; CHECK: V_CMP_GE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
-define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+; CHECK-LABEL: {{^}}fge_f64:
+; CHECK: v_cmp_ge_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
+define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
    %r1 = load double addrspace(1)* %in2
    %r2 = fcmp uge double %r0, %r1
-   %r3 = select i1 %r2, double %r0, double %r1
-   store double %r3, double addrspace(1)* %out
+   %r3 = zext i1 %r2 to i32
+   store i32 %r3, i32 addrspace(1)* %out
    ret void
 }
 
-; CHECK: @fne_f64
-; CHECK: V_CMP_NEQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
+; CHECK-LABEL: {{^}}fne_f64:
+; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
@@ -65,9 +60,8 @@
    ret void
 }
 
-; CHECK: @feq_f64
-; CHECK: V_CMP_EQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-
+; CHECK-LABEL: {{^}}feq_f64:
+; CHECK: v_cmp_eq_f64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1

diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll
index 9c3a7e3..097c89f 100644
--- a/test/CodeGen/R600/fconst64.ll
+++ b/test/CodeGen/R600/fconst64.ll

@@ -1,8 +1,8 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @fconst_f64
-; CHECK-DAG: S_MOV_B32 {{s[0-9]+}}, 0x40140000
-; CHECK-DAG: S_MOV_B32 {{s[0-9]+}}, 0
+; CHECK: {{^}}fconst_f64:
+; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000
+; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
 
 define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
    %r1 = load double addrspace(1)* %in

diff --git a/test/CodeGen/R600/fcopysign.f32.ll b/test/CodeGen/R600/fcopysign.f32.ll
index 7b4425b..897830e 100644
--- a/test/CodeGen/R600/fcopysign.f32.ll
+++ b/test/CodeGen/R600/fcopysign.f32.ll

@@ -7,15 +7,15 @@
 declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind readnone
 
 ; Try to identify arg based on higher address.
-; FUNC-LABEL: @test_copysign_f32:
-; SI: S_LOAD_DWORD [[SSIGN:s[0-9]+]], {{.*}} 0xc
-; SI: V_MOV_B32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
-; SI-DAG: S_LOAD_DWORD [[SMAG:s[0-9]+]], {{.*}} 0xb
-; SI-DAG: V_MOV_B32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
-; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
-; SI: V_BFI_B32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_f32:
+; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb
+; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc
+; SI-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
+; SI-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
+; SI-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 
 ; EG: BFI_INT
 define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
@@ -24,8 +24,8 @@
   ret void
 }
 
-; FUNC-LABEL: @test_copysign_v2f32:
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_v2f32:
+; SI: s_endpgm
 
 ; EG: BFI_INT
 ; EG: BFI_INT
@@ -35,8 +35,8 @@
   ret void
 }
 
-; FUNC-LABEL: @test_copysign_v4f32:
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_v4f32:
+; SI: s_endpgm
 
 ; EG: BFI_INT
 ; EG: BFI_INT

diff --git a/test/CodeGen/R600/fcopysign.f64.ll b/test/CodeGen/R600/fcopysign.f64.ll
index ea7a6db..90f0ce3 100644
--- a/test/CodeGen/R600/fcopysign.f64.ll
+++ b/test/CodeGen/R600/fcopysign.f64.ll

@@ -4,32 +4,32 @@
 declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone
 declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
 
-; FUNC-LABEL: @test_copysign_f64:
-; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI: V_MOV_B32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
-; SI-DAG: S_LOAD_DWORDX2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: V_MOV_B32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
-; SI-DAG: S_MOV_B32 [[SCONST:s[0-9]+]], 0x7fffffff
-; SI: V_BFI_B32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
-; SI: V_MOV_B32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
-; SI: BUFFER_STORE_DWORDX2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_f64:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
+; SI-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
+; SI-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
+; SI: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
+; SI: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
+; SI: s_endpgm
 define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @test_copysign_v2f64:
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_v2f64:
+; SI: s_endpgm
 define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
   store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @test_copysign_v4f64:
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}test_copysign_v4f64:
+; SI: s_endpgm
 define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
   store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8

diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 3d21524..5321fdb 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll

@@ -1,20 +1,37 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; These tests check that fdiv is expanded correctly and also test that the
 ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
 ; instruction groups.
 
-; R600-CHECK: @fdiv_v2f32
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
-; SI-CHECK: @fdiv_v2f32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
+; FUNC-LABEL: {{^}}fdiv_f32:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fdiv float %a, %b
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+
+
+; FUNC-LABEL: {{^}}fdiv_v2f32:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fdiv <2 x float> %a, %b
@@ -22,24 +39,24 @@
   ret void
 }
 
-; R600-CHECK: @fdiv_v4f32
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; SI-CHECK: @fdiv_v4f32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
+; FUNC-LABEL: {{^}}fdiv_v4f32:
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
+; SI-DAG: v_rcp_f32
+; SI-DAG: v_mul_f32
 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in

diff --git a/test/CodeGen/R600/fdiv64.ll b/test/CodeGen/R600/fdiv64.ll
index 79b5c8b..d424898 100644
--- a/test/CodeGen/R600/fdiv64.ll
+++ b/test/CodeGen/R600/fdiv64.ll

@@ -1,8 +1,8 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @fdiv_f64
-; CHECK: V_RCP_F64_e32 {{v\[[0-9]+:[0-9]+\]}}
-; CHECK: V_MUL_F64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: {{^}}fdiv_f64:
+; CHECK: v_rcp_f64_e32 {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 
 define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {

diff --git a/test/CodeGen/R600/fetch-limits.r600.ll b/test/CodeGen/R600/fetch-limits.r600.ll
index f78d1d9..d35573e 100644
--- a/test/CodeGen/R600/fetch-limits.r600.ll
+++ b/test/CodeGen/R600/fetch-limits.r600.ll

@@ -3,7 +3,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=rv670 | FileCheck %s
 
 ; R600 supports 8 fetches in a clause
-; CHECK: @fetch_limits_r600
+; CHECK: {{^}}fetch_limits_r600:
 ; CHECK: Fetch clause
 ; CHECK: Fetch clause
 

diff --git a/test/CodeGen/R600/fetch-limits.r700+.ll b/test/CodeGen/R600/fetch-limits.r700+.ll
index 1a8a43f..17760a0 100644
--- a/test/CodeGen/R600/fetch-limits.r700+.ll
+++ b/test/CodeGen/R600/fetch-limits.r700+.ll

@@ -12,7 +12,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
 
 ; r700+ supports 16 fetches in a clause
-; CHECK: @fetch_limits_r700
+; CHECK: {{^}}fetch_limits_r700:
 ; CHECK: Fetch clause
 ; CHECK: Fetch clause
 

diff --git a/test/CodeGen/R600/ffloor.ll b/test/CodeGen/R600/ffloor.ll
index 31c6116..166f705 100644
--- a/test/CodeGen/R600/ffloor.ll
+++ b/test/CodeGen/R600/ffloor.ll

@@ -8,95 +8,95 @@
 declare <8 x double> @llvm.floor.v8f64(<8 x double>) nounwind readnone
 declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
 
-; FUNC-LABEL: @ffloor_f64:
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_f64:
+; CI: v_floor_f64_e32
 
-; SI: S_BFE_I32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
-; SI: S_ADD_I32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
-; SI: S_LSHR_B64
-; SI: S_NOT_B64
-; SI: S_AND_B64
-; SI: S_AND_B32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI: CMP_LT_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: CMP_GT_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: CMP_LT_F64
-; SI: CNDMASK_B32
-; SI: CMP_NE_I32
-; SI: CNDMASK_B32
-; SI: CNDMASK_B32
-; SI: V_ADD_F64
+; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: s_lshr_b64
+; SI: s_not_b64
+; SI: s_and_b64
+; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI-DAG: cmp_lt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_gt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_lt_f64
+; SI: cndmask_b32
+; SI: cmp_ne_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: v_add_f64
 define void @ffloor_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.floor.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ffloor_v2f64:
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_v2f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
 define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
 }
 
-; FIXME-FUNC-LABEL: @ffloor_v3f64:
-; FIXME-CI: V_FLOOR_F64_e32
-; FIXME-CI: V_FLOOR_F64_e32
-; FIXME-CI: V_FLOOR_F64_e32
+; FIXME-FUNC-LABEL: {{^}}ffloor_v3f64:
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
+; FIXME-CI: v_floor_f64_e32
 ; define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
 ;   store <3 x double> %y, <3 x double> addrspace(1)* %out
 ;   ret void
 ; }
 
-; FUNC-LABEL: @ffloor_v4f64:
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_v4f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
 define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ffloor_v8f64:
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_v8f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
 define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ffloor_v16f64:
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
-; CI: V_FLOOR_F64_e32
+; FUNC-LABEL: {{^}}ffloor_v16f64:
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
+; CI: v_floor_f64_e32
 define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out

diff --git a/test/CodeGen/R600/flat-address-space.ll b/test/CodeGen/R600/flat-address-space.ll
new file mode 100644
index 0000000..fc5af7c
--- /dev/null
+++ b/test/CodeGen/R600/flat-address-space.ll

@@ -0,0 +1,182 @@
+; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
+; RUN: llc -O0 -march=r600 -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
+
+; Disable optimizations in case there are optimizations added that
+; specialize away generic pointer accesses.
+
+
+; CHECK-LABEL: {{^}}branch_use_flat_i32:
+; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, [M0, FLAT_SCRATCH]
+; CHECK: s_endpgm
+define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+entry:
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %local, label %global
+
+local:
+  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
+  br label %end
+
+global:
+  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  br label %end
+
+end:
+  %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+;  %val = load i32 addrspace(4)* %fptr, align 4
+;  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+
+; These testcases might become useless when there are optimizations to
+; remove generic pointers.
+
+; CHECK-LABEL: {{^}}store_flat_i32:
+; CHECK: v_mov_b32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: flat_store_dword v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_i64:
+; CHECK: flat_store_dwordx2
+define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  store i64 %x, i64 addrspace(4)* %fptr, align 8
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_v4i32:
+; CHECK: flat_store_dwordx4
+define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_trunc_i16:
+; CHECK: flat_store_short
+define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %y = trunc i32 %x to i16
+  store i16 %y, i16 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_flat_trunc_i8:
+; CHECK: flat_store_byte
+define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %y = trunc i32 %x to i8
+  store i8 %y, i8 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+
+
+; CHECK-LABEL: {{^}}load_flat_i32:
+; CHECK: flat_load_dword
+define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  %fload = load i32 addrspace(4)* %fptr, align 4
+  store i32 %fload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}load_flat_i64:
+; CHECK: flat_load_dwordx2
+define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  %fload = load i64 addrspace(4)* %fptr, align 4
+  store i64 %fload, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL: {{^}}load_flat_v4i32:
+; CHECK: flat_load_dwordx4
+define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  %fload = load <4 x i32> addrspace(4)* %fptr, align 4
+  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL: {{^}}sextload_flat_i8:
+; CHECK: flat_load_sbyte
+define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8 addrspace(4)* %fptr, align 4
+  %ext = sext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}zextload_flat_i8:
+; CHECK: flat_load_ubyte
+define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8 addrspace(4)* %fptr, align 4
+  %ext = zext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}sextload_flat_i16:
+; CHECK: flat_load_sshort
+define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16 addrspace(4)* %fptr, align 4
+  %ext = sext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}zextload_flat_i16:
+; CHECK: flat_load_ushort
+define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16 addrspace(4)* %fptr, align 4
+  %ext = zext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+
+; TODO: This should not be zero when registers are used for small
+; scratch allocations again.
+
+; Check for prologue initializing special SGPRs pointing to scratch.
+; CHECK-LABEL: {{^}}store_flat_scratch:
+; CHECK: s_movk_i32 flat_scratch_lo, 0
+; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
+; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
+; CHECK: flat_store_dword
+; CHECK: s_barrier
+; CHECK: flat_load_dword
+define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+  %alloca = alloca i32, i32 9, align 4
+  %x = call i32 @llvm.r600.read.tidig.x() #3
+  %pptr = getelementptr i32* %alloca, i32 %x
+  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr
+  ; Dummy call
+  call void @llvm.AMDGPU.barrier.local() #1
+  %reload = load i32 addrspace(4)* %fptr, align 4
+  store i32 %reload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+declare i32 @llvm.r600.read.tidig.x() #3
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noduplicate }
+attributes #3 = { nounwind readnone }

diff --git a/test/CodeGen/R600/fma.f64.ll b/test/CodeGen/R600/fma.f64.ll
new file mode 100644
index 0000000..4b0ab76
--- /dev/null
+++ b/test/CodeGen/R600/fma.f64.ll

@@ -0,0 +1,46 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+
+
+; FUNC-LABEL: {{^}}fma_f64:
+; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                     double addrspace(1)* %in2, double addrspace(1)* %in3) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = load double addrspace(1)* %in3
+   %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2)
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
+
+; FUNC-LABEL: {{^}}fma_v2f64:
+; SI: v_fma_f64
+; SI: v_fma_f64
+define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+                       <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
+   %r0 = load <2 x double> addrspace(1)* %in1
+   %r1 = load <2 x double> addrspace(1)* %in2
+   %r2 = load <2 x double> addrspace(1)* %in3
+   %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2)
+   store <2 x double> %r3, <2 x double> addrspace(1)* %out
+   ret void
+}
+
+; FUNC-LABEL: {{^}}fma_v4f64:
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+; SI: v_fma_f64
+define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+                       <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
+   %r0 = load <4 x double> addrspace(1)* %in1
+   %r1 = load <4 x double> addrspace(1)* %in2
+   %r2 = load <4 x double> addrspace(1)* %in3
+   %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2)
+   store <4 x double> %r3, <4 x double> addrspace(1)* %out
+   ret void
+}

diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll
index d72ffec..637e799 100644
--- a/test/CodeGen/R600/fma.ll
+++ b/test/CodeGen/R600/fma.ll

@@ -1,89 +1,92 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.fma.f32(float, float, float) nounwind readnone
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
 
-declare double @llvm.fma.f64(double, double, double) nounwind readnone
-declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-; FUNC-LABEL: @fma_f32
-; SI: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+; FUNC-LABEL: {{^}}fma_f32:
+; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
+; EG: FMA {{\*? *}}[[RES]]
 define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                      float addrspace(1)* %in2, float addrspace(1)* %in3) {
-   %r0 = load float addrspace(1)* %in1
-   %r1 = load float addrspace(1)* %in2
-   %r2 = load float addrspace(1)* %in3
-   %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
-   store float %r3, float addrspace(1)* %out
-   ret void
+  %r0 = load float addrspace(1)* %in1
+  %r1 = load float addrspace(1)* %in2
+  %r2 = load float addrspace(1)* %in3
+  %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %out
+  ret void
 }
 
-; FUNC-LABEL: @fma_v2f32
-; SI: V_FMA_F32
-; SI: V_FMA_F32
+; FUNC-LABEL: {{^}}fma_v2f32:
+; SI: v_fma_f32
+; SI: v_fma_f32
+
+; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}},
+; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]]
+; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
 define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                        <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
-   %r0 = load <2 x float> addrspace(1)* %in1
-   %r1 = load <2 x float> addrspace(1)* %in2
-   %r2 = load <2 x float> addrspace(1)* %in3
-   %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
-   store <2 x float> %r3, <2 x float> addrspace(1)* %out
-   ret void
+  %r0 = load <2 x float> addrspace(1)* %in1
+  %r1 = load <2 x float> addrspace(1)* %in2
+  %r2 = load <2 x float> addrspace(1)* %in3
+  %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
+  store <2 x float> %r3, <2 x float> addrspace(1)* %out
+  ret void
 }
 
-; FUNC-LABEL: @fma_v4f32
-; SI: V_FMA_F32
-; SI: V_FMA_F32
-; SI: V_FMA_F32
-; SI: V_FMA_F32
+; FUNC-LABEL: {{^}}fma_v4f32:
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+
+; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
+; EG-DAG: FMA {{\*? *}}[[RES]].X
+; EG-DAG: FMA {{\*? *}}[[RES]].Y
+; EG-DAG: FMA {{\*? *}}[[RES]].Z
+; EG-DAG: FMA {{\*? *}}[[RES]].W
 define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                        <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
-   %r0 = load <4 x float> addrspace(1)* %in1
-   %r1 = load <4 x float> addrspace(1)* %in2
-   %r2 = load <4 x float> addrspace(1)* %in3
-   %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
-   store <4 x float> %r3, <4 x float> addrspace(1)* %out
-   ret void
+  %r0 = load <4 x float> addrspace(1)* %in1
+  %r1 = load <4 x float> addrspace(1)* %in2
+  %r2 = load <4 x float> addrspace(1)* %in3
+  %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
+  store <4 x float> %r3, <4 x float> addrspace(1)* %out
+  ret void
 }
 
-; FUNC-LABEL: @fma_f64
-; SI: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
-                     double addrspace(1)* %in2, double addrspace(1)* %in3) {
-   %r0 = load double addrspace(1)* %in1
-   %r1 = load double addrspace(1)* %in2
-   %r2 = load double addrspace(1)* %in3
-   %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2)
-   store double %r3, double addrspace(1)* %out
-   ret void
+; FUNC-LABEL: {{^}}fma_commute_mul_inline_imm_f32:
+; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
+define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %b = load float addrspace(1)* %in.b.gep, align 4
+
+  %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
+  store float %fma, float addrspace(1)* %out.gep, align 4
+  ret void
 }
 
-; FUNC-LABEL: @fma_v2f64
-; SI: V_FMA_F64
-; SI: V_FMA_F64
-define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
-                       <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
-   %r0 = load <2 x double> addrspace(1)* %in1
-   %r1 = load <2 x double> addrspace(1)* %in2
-   %r2 = load <2 x double> addrspace(1)* %in3
-   %r3 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2)
-   store <2 x double> %r3, <2 x double> addrspace(1)* %out
-   ret void
-}
+; FUNC-LABEL: {{^}}fma_commute_mul_s_f32:
+define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
 
-; FUNC-LABEL: @fma_v4f64
-; SI: V_FMA_F64
-; SI: V_FMA_F64
-; SI: V_FMA_F64
-; SI: V_FMA_F64
-define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
-                       <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
-   %r0 = load <4 x double> addrspace(1)* %in1
-   %r1 = load <4 x double> addrspace(1)* %in2
-   %r2 = load <4 x double> addrspace(1)* %in3
-   %r3 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %r0, <4 x double> %r1, <4 x double> %r2)
-   store <4 x double> %r3, <4 x double> addrspace(1)* %out
-   ret void
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %c = load float addrspace(1)* %in.b.gep, align 4
+
+  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+  store float %fma, float addrspace(1)* %out.gep, align 4
+  ret void
 }

diff --git a/test/CodeGen/R600/fmax3.ll b/test/CodeGen/R600/fmax3.ll
new file mode 100644
index 0000000..cf371b3
--- /dev/null
+++ b/test/CodeGen/R600/fmax3.ll

@@ -0,0 +1,38 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.maxnum.f32(float, float) nounwind readnone
+
+; SI-LABEL: {{^}}test_fmax3_olt_0:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %val.a = load float addrspace(1)* %aptr, align 4
+  %val.b = load float addrspace(1)* %bptr, align 4
+  %val.c = load float addrspace(1)* %cptr, align 4
+  %max.ab = call float @llvm.maxnum.f32(float %val.a, float %val.b) nounwind readnone
+  %max.abc = call float @llvm.maxnum.f32(float %max.ab, float %val.c) nounwind readnone
+  store float %max.abc, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; The inner maxnum result is the second operand of the outer maxnum (commuted form).
+; SI-LABEL: {{^}}test_fmax3_olt_1:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %val.a = load float addrspace(1)* %aptr, align 4
+  %val.b = load float addrspace(1)* %bptr, align 4
+  %val.c = load float addrspace(1)* %cptr, align 4
+  %max.ab = call float @llvm.maxnum.f32(float %val.a, float %val.b) nounwind readnone
+  %max.cab = call float @llvm.maxnum.f32(float %val.c, float %max.ab) nounwind readnone
+  store float %max.cab, float addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/fmax_legacy.ll b/test/CodeGen/R600/fmax_legacy.ll
new file mode 100644
index 0000000..e9d837b
--- /dev/null
+++ b/test/CodeGen/R600/fmax_legacy.ll

@@ -0,0 +1,83 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; EG: MAX
+define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp uge float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; EG: MAX
+define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp oge float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; EG: MAX
+define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ugt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; EG: MAX
+define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ogt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/fmaxnum.f64.ll b/test/CodeGen/R600/fmaxnum.f64.ll
new file mode 100644
index 0000000..51cbf4d
--- /dev/null
+++ b/test/CodeGen/R600/fmaxnum.f64.ll

@@ -0,0 +1,75 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.maxnum.f64(double, double) #0
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0
+declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) #0
+declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>) #0
+declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0
+
+; FUNC-LABEL: {{^}}test_fmax_f64:
+; SI: v_max_f64
+define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+  %val = call double @llvm.maxnum.f64(double %a, double %b) #0
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_v2f64:
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+  %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0
+  store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_v4f64:
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+  %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0
+  store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_v8f64:
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+  %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0
+  store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_v16f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+; SI: v_max_f64
+define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+  %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0
+  store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
+  ret void
+}
+
+attributes #0 = { nounwind readnone }

diff --git a/test/CodeGen/R600/fmaxnum.ll b/test/CodeGen/R600/fmaxnum.ll
new file mode 100644
index 0000000..01d30b0
--- /dev/null
+++ b/test/CodeGen/R600/fmaxnum.ll

@@ -0,0 +1,191 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.maxnum.f32(float, float) #0
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
+declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0
+declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0
+
+declare double @llvm.maxnum.f64(double, double)
+
+; FUNC-LABEL: {{^}}test_fmax_f32:
+; SI: v_max_f32_e32
+define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %val = call float @llvm.maxnum.f32(float %a, float %b) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_v2f32:
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+  %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
+  store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_v4f32:
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+  %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
+  store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_v8f32:
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+  %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
+  store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmax_v16f32:
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+  %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
+  store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmax_f32:
+; SI-NOT: v_max_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmax_f32_nan_nan:
+; SI-NOT: v_max_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmax_f32_val_nan:
+; SI-NOT: v_max_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmax_f32_nan_val:
+; SI-NOT: v_max_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmax_f32_p0_p0:
+; SI-NOT: v_max_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmax_f32_p0_n0:
+; SI-NOT: v_max_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmax_f32_n0_p0:
+; SI-NOT: v_max_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmax_f32_n0_n0:
+; SI-NOT: v_max_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fmax_var_immediate_f32:
+; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fmax_immediate_var_f32:
+; SI: v_max_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fmax_var_literal_f32:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fmax_literal_var_f32:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }

diff --git a/test/CodeGen/R600/fmin3.ll b/test/CodeGen/R600/fmin3.ll
new file mode 100644
index 0000000..7420368
--- /dev/null
+++ b/test/CodeGen/R600/fmin3.ll

@@ -0,0 +1,38 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.minnum.f32(float, float) nounwind readnone
+
+; SI-LABEL: {{^}}test_fmin3_olt_0:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %val.a = load float addrspace(1)* %aptr, align 4
+  %val.b = load float addrspace(1)* %bptr, align 4
+  %val.c = load float addrspace(1)* %cptr, align 4
+  %min.ab = call float @llvm.minnum.f32(float %val.a, float %val.b) nounwind readnone
+  %min.abc = call float @llvm.minnum.f32(float %min.ab, float %val.c) nounwind readnone
+  store float %min.abc, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; The inner minnum result is the second operand of the outer minnum (commuted form).
+; SI-LABEL: {{^}}test_fmin3_olt_1:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+  %val.a = load float addrspace(1)* %aptr, align 4
+  %val.b = load float addrspace(1)* %bptr, align 4
+  %val.c = load float addrspace(1)* %cptr, align 4
+  %min.ab = call float @llvm.minnum.f32(float %val.a, float %val.b) nounwind readnone
+  %min.cab = call float @llvm.minnum.f32(float %val.c, float %min.ab) nounwind readnone
+  store float %min.cab, float addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/fmin_legacy.ll b/test/CodeGen/R600/fmin_legacy.ll
new file mode 100644
index 0000000..2fbdb6b
--- /dev/null
+++ b/test/CodeGen/R600/fmin_legacy.ll

@@ -0,0 +1,92 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: {{^}}test_fmin_legacy_f32:
+; EG: MIN *
+; SI: v_min_legacy_f32_e32
+define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
+   %r1 = extractelement <4 x float> %reg0, i32 1
+   %r2 = fcmp uge float %r0, %r1
+   %r3 = select i1 %r2, float %r1, float %r0
+   %vec = insertelement <4 x float> undef, float %r3, i32 0
+   store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16
+   ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ule float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ole float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp olt float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32:
+; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
+  %cmp = fcmp ult float %a, %b
+  %val = select i1 %cmp, float %a, float %b
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/fminnum.f64.ll b/test/CodeGen/R600/fminnum.f64.ll
new file mode 100644
index 0000000..11b0c20
--- /dev/null
+++ b/test/CodeGen/R600/fminnum.f64.ll

@@ -0,0 +1,75 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.minnum.f64(double, double) #0
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0
+declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0
+declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0
+declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
+
+; FUNC-LABEL: {{^}}test_fmin_f64:
+; SI: v_min_f64
+define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+  %val = call double @llvm.minnum.f64(double %a, double %b) #0
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_v2f64:
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+  %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0
+  store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_v4f64:
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+  %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0
+  store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_v8f64:
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+  %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0
+  store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_v16f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+; SI: v_min_f64
+define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+  %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0
+  store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
+  ret void
+}
+
+attributes #0 = { nounwind readnone }

diff --git a/test/CodeGen/R600/fminnum.ll b/test/CodeGen/R600/fminnum.ll
new file mode 100644
index 0000000..65adab6
--- /dev/null
+++ b/test/CodeGen/R600/fminnum.ll

@@ -0,0 +1,189 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.minnum.f32(float, float) #0
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0
+declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0
+declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
+
+; FUNC-LABEL: {{^}}test_fmin_f32:
+; SI: v_min_f32_e32
+define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %val = call float @llvm.minnum.f32(float %a, float %b) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_v2f32:
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+  %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0
+  store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_v4f32:
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+  %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0
+  store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_v8f32:
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+  %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0
+  store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_v16f32:
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+  %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0
+  store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmin_f32:
+; SI-NOT: v_min_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmin_f32_nan_nan:
+; SI-NOT: v_min_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmin_f32_val_nan:
+; SI-NOT: v_min_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmin_f32_nan_val:
+; SI-NOT: v_min_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmin_f32_p0_p0:
+; SI-NOT: v_min_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmin_f32_p0_n0:
+; SI-NOT: v_min_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmin_f32_n0_p0:
+; SI-NOT: v_min_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}constant_fold_fmin_f32_n0_n0:
+; SI-NOT: v_min_f32
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
+; SI: buffer_store_dword [[REG]]
+define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
+  %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fmin_var_immediate_f32:
+; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.minnum.f32(float %a, float 2.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fmin_immediate_var_f32:
+; SI: v_min_f32_e64 {{v[0-9]+}}, 2.0, {{s[0-9]+}}
+define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.minnum.f32(float 2.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fmin_var_literal_f32:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.minnum.f32(float %a, float 99.0) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fmin_literal_var_f32:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
+; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
+define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+  %val = call float @llvm.minnum.f32(float 99.0, float %a) #0
+  store float %val, float addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }

diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index 2a7825f..eabb271 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll

@@ -1,10 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; R600-CHECK: @fmul_f32
-; R600-CHECK: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI-CHECK: @fmul_f32
-; SI-CHECK: V_MUL_F32
+
+; FUNC-LABEL: {{^}}fmul_f32:
+; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+
+; SI: v_mul_f32
 define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fmul float %a, %b
@@ -16,12 +17,12 @@
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
-; R600-CHECK: @fmul_v2f32
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-; SI-CHECK: @fmul_v2f32
-; SI-CHECK: V_MUL_F32
-; SI-CHECK: V_MUL_F32
+; FUNC-LABEL: {{^}}fmul_v2f32:
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+
+; SI: v_mul_f32
+; SI: v_mul_f32
 define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fmul <2 x float> %a, %b
@@ -29,16 +30,16 @@
   ret void
 }
 
-; R600-CHECK: @fmul_v4f32
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: @fmul_v4f32
-; SI-CHECK: V_MUL_F32
-; SI-CHECK: V_MUL_F32
-; SI-CHECK: V_MUL_F32
-; SI-CHECK: V_MUL_F32
+; FUNC-LABEL: {{^}}fmul_v4f32:
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_mul_f32
+; SI: v_mul_f32
+; SI: v_mul_f32
+; SI: v_mul_f32
 define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
@@ -47,3 +48,28 @@
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}test_mul_2_k:
+; SI: v_mul_f32
+; SI-NOT: v_mul_f32
+; SI: s_endpgm
+define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { ; #0 (end of file) sets unsafe-fp-math and related attributes
+  %y = fmul float %x, 2.0 ; first multiply by a constant
+  %z = fmul float %y, 3.0 ; second multiply by a constant; the patterns above allow only one v_mul_f32 before s_endpgm
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_mul_2_k_inv:
+; SI: v_mul_f32
+; SI-NOT: v_mul_f32
+; SI-NOT: v_mad_f32
+; SI: s_endpgm
+define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { ; same shape as test_mul_2_k with the constants swapped
+  %y = fmul float %x, 3.0 ; multiply by 3.0 first
+  %z = fmul float %y, 2.0 ; then by 2.0; the patterns above allow one v_mul_f32 and no v_mad_f32
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }

diff --git a/test/CodeGen/R600/fmul64.ll b/test/CodeGen/R600/fmul64.ll
index 7c7bf04..0a5f707 100644
--- a/test/CodeGen/R600/fmul64.ll
+++ b/test/CodeGen/R600/fmul64.ll

@@ -1,8 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
 
-; CHECK: @fmul_f64
-; CHECK: V_MUL_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-
+; FUNC-LABEL: {{^}}fmul_f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1
@@ -11,3 +10,29 @@
    store double %r2, double addrspace(1)* %out
    ret void
 }
+
+; FUNC-LABEL: {{^}}fmul_v2f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+                        <2 x double> addrspace(1)* %in2) {
+   %r0 = load <2 x double> addrspace(1)* %in1
+   %r1 = load <2 x double> addrspace(1)* %in2
+   %r2 = fmul <2 x double> %r0, %r1
+   store <2 x double> %r2, <2 x double> addrspace(1)* %out
+   ret void
+}
+
+; FUNC-LABEL: {{^}}fmul_v4f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+                        <4 x double> addrspace(1)* %in2) {
+   %r0 = load <4 x double> addrspace(1)* %in1
+   %r1 = load <4 x double> addrspace(1)* %in2
+   %r2 = fmul <4 x double> %r0, %r1
+   store <4 x double> %r2, <4 x double> addrspace(1)* %out
+   ret void
+}

diff --git a/test/CodeGen/R600/fmuladd.ll b/test/CodeGen/R600/fmuladd.ll
index 48944f6..16003a5 100644
--- a/test/CodeGen/R600/fmuladd.ll
+++ b/test/CodeGen/R600/fmuladd.ll

@@ -1,7 +1,12 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s
 
-; CHECK: @fmuladd_f32
-; CHECK: V_MAD_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+declare float @llvm.fmuladd.f32(float, float, float)
+declare double @llvm.fmuladd.f64(double, double, double)
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+; CHECK-LABEL: {{^}}fmuladd_f32:
+; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                          float addrspace(1)* %in2, float addrspace(1)* %in3) {
@@ -13,10 +18,8 @@
    ret void
 }
 
-declare float @llvm.fmuladd.f32(float, float, float)
-
-; CHECK: @fmuladd_f64
-; CHECK: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; CHECK-LABEL: {{^}}fmuladd_f64:
+; CHECK: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 
 define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                          double addrspace(1)* %in2, double addrspace(1)* %in3) {
@@ -28,4 +31,169 @@
    ret void
 }
 
-declare double @llvm.fmuladd.f64(double, double, double)
+; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float %r1, float 2.0, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fadd_a_a_b_f32:
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fadd_a_a_b_f32(float addrspace(1)* %out,
+                            float addrspace(1)* %in1,
+                            float addrspace(1)* %in2) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid ; element tid of %out; %in1 and %in2 are not referenced in this body
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 ; next float after %gep.0, matching the offset:0x4 load pattern above
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid ; NOTE(review): never used; the store below targets %out directly - confirm intended
+
+  %r0 = load float addrspace(1)* %gep.0
+  %r1 = load float addrspace(1)* %gep.1
+
+  %add.0 = fadd float %r0, %r0 ; a + a; the v_mad_f32 pattern above expects this as a 2.0 multiplier
+  %add.1 = fadd float %add.0, %r1 ; (a + a) + b
+  store float %add.1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fadd_b_a_a_f32:
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fadd_b_a_a_f32(float addrspace(1)* %out,
+                            float addrspace(1)* %in1,
+                            float addrspace(1)* %in2) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r0 = load float addrspace(1)* %gep.0
+  %r1 = load float addrspace(1)* %gep.1
+
+  %add.0 = fadd float %r0, %r0
+  %add.1 = fadd float %r1, %add.0
+  store float %add.1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r1.fneg = fsub float -0.000000e+00, %r1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float -2.0, float %r1.fneg, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r1.fneg = fsub float -0.000000e+00, %r1
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1.fneg, float %r2)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+
+; CHECK-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32
+; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
+; CHECK: buffer_store_dword [[RESULT]]
+define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %r2.fneg = fsub float -0.000000e+00, %r2
+
+  %r3 = tail call float @llvm.fmuladd.f32(float 2.0, float %r1, float %r2.fneg)
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}

diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll
new file mode 100644
index 0000000..555f4cc
--- /dev/null
+++ b/test/CodeGen/R600/fneg-fabs.f64.ll

@@ -0,0 +1,101 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FIXME: Check something here. Currently it seems fabs + fneg aren't
+; folded into 2 modifiers, although theoretically that should work.
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x7fffffff
+; SI: v_and_b32_e32 v[[FABS:[0-9]+]], {{s[0-9]+}}, [[IMMREG]]
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+}}:[[FABS]]{{\]}}
+define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
+  %fabs = call double @llvm.fabs.f64(double %x)
+  %fsub = fsub double -0.000000e+00, %fabs
+  %fadd = fadd double %y, %fsub
+  store double %fadd, double addrspace(1)* %out, align 8
+  ret void
+}
+
+define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) { ; variant of fneg_fabs_fadd_f64 whose operands are loaded from memory
+  %x = load double addrspace(1)* %xptr, align 8
+  %y = load double addrspace(1)* %yptr, align 8 ; second operand comes from %yptr, so x and y are distinct values
+  %fabs = call double @llvm.fabs.f64(double %x)
+  %fsub = fsub double -0.000000e+00, %fabs ; fneg(fabs(x)) spelled as -0.0 - |x|
+  %fadd = fadd double %y, %fsub ; y + (-|x|)
+  store double %fadd, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f64:
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|
+define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
+  %fabs = call double @llvm.fabs.f64(double %x)
+  %fsub = fsub double -0.000000e+00, %fabs
+  %fmul = fmul double %y, %fsub
+  store double %fmul, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_free_f64:
+define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc = bitcast i64 %in to double
+  %fabs = call double @llvm.fabs.f64(double %bc)
+  %fsub = fsub double -0.000000e+00, %fabs
+  store double %fsub, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc = bitcast i64 %in to double
+  %fabs = call double @fabs(double %bc)
+  %fsub = fsub double -0.000000e+00, %fabs
+  store double %fsub, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_f64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
+; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
+define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+  %fabs = call double @llvm.fabs.f64(double %in)
+  %fsub = fsub double -0.000000e+00, %fabs
+  store double %fsub, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+  %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+  %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
+  store <2 x double> %fsub, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+  %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
+  %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
+  store <4 x double> %fsub, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+declare double @fabs(double) readnone
+declare double @llvm.fabs.f64(double) readnone
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
+declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone

diff --git a/test/CodeGen/R600/fneg-fabs.ll b/test/CodeGen/R600/fneg-fabs.ll
index d95e131..3cc832f 100644
--- a/test/CodeGen/R600/fneg-fabs.ll
+++ b/test/CodeGen/R600/fneg-fabs.ll

@@ -1,55 +1,117 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
+; SI-NOT: and
+; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
+define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
+  %fabs = call float @llvm.fabs.f32(float %x)
+  %fsub = fsub float -0.000000e+00, %fabs
+  %fadd = fadd float %y, %fsub
+  store float %fadd, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
+; SI-NOT: and
+; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
+; SI-NOT: and
+define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
+  %fabs = call float @llvm.fabs.f32(float %x)
+  %fsub = fsub float -0.000000e+00, %fabs
+  %fmul = fmul float %y, %fsub
+  store float %fmul, float addrspace(1)* %out, align 4
+  ret void
+}
 
 ; DAGCombiner will transform:
 ; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
 ; unless isFabsFree returns true
 
-; R600-CHECK-LABEL: @fneg_fabs_free
-; R600-CHECK-NOT: AND
-; R600-CHECK: |PV.{{[XYZW]}}|
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg_fabs_free
-; SI-CHECK: V_OR_B32
+; FUNC-LABEL: {{^}}fneg_fabs_free_f32:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+; R600: -PV
 
-define void @fneg_fabs_free(float addrspace(1)* %out, i32 %in) {
-entry:
-  %0 = bitcast i32 %in to float
-  %1 = call float @fabs(float %0)
-  %2 = fsub float -0.000000e+00, %1
-  store float %2, float addrspace(1)* %out
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
+  %bc = bitcast i32 %in to float
+  %fabs = call float @llvm.fabs.f32(float %bc)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK-LABEL: @fneg_fabs_v2
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: -PV
-; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg_fabs_v2
-; SI-CHECK: V_OR_B32
-; SI-CHECK: V_OR_B32
-define void @fneg_fabs_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
-entry:
-  %0 = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
-  %1 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %0
-  store <2 x float> %1, <2 x float> addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f32:
+; R600-NOT: AND
+; R600: |PV.{{[XYZW]}}|
+; R600: -PV
+
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
+  %bc = bitcast i32 %in to float
+  %fabs = call float @fabs(float %bc)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @fneg_fabs_v4
-; SI-CHECK: V_OR_B32
-; SI-CHECK: V_OR_B32
-; SI-CHECK: V_OR_B32
-; SI-CHECK: V_OR_B32
-define void @fneg_fabs_v4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
-entry:
-  %0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
-  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %0
-  store <4 x float> %1, <4 x float> addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_fabs_f32:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
+  %fabs = call float @llvm.fabs.f32(float %in)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out, align 4
   ret void
 }
 
-declare float @fabs(float ) readnone
-declare <2 x float> @llvm.fabs.v2f32(<2 x float> ) readnone
-declare <4 x float> @llvm.fabs.v4f32(<4 x float> ) readnone
+; FUNC-LABEL: {{^}}v_fneg_fabs_f32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
+define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %val = load float addrspace(1)* %in, align 4
+  %fabs = call float @llvm.fabs.f32(float %val)
+  %fsub = fsub float -0.000000e+00, %fabs
+  store float %fsub, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_fabs_v2f32:
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: -PV
+; R600: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600: -PV
+
+; FIXME: SGPR should be used directly for first src operand.
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+  %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
+  store <2 x float> %fsub, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: SGPR should be used directly for first src operand.
+; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
+; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
+; SI-NOT: 0x80000000
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
+define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+  %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+  %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
+  store <4 x float> %fsub, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+declare float @fabs(float) readnone
+declare float @llvm.fabs.f32(float) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone

diff --git a/test/CodeGen/R600/fneg.f64.ll b/test/CodeGen/R600/fneg.f64.ll
new file mode 100644
index 0000000..7aa08a9
--- /dev/null
+++ b/test/CodeGen/R600/fneg.f64.ll

@@ -0,0 +1,59 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}fneg_f64:
+; SI: v_xor_b32
+define void @fneg_f64(double addrspace(1)* %out, double %in) {
+  %fneg = fsub double -0.000000e+00, %in
+  store double %fneg, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v2f64:
+; SI: v_xor_b32
+; SI: v_xor_b32
+define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
+  %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in
+  store <2 x double> %fneg, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fneg_v4f64:
+; R600: -PV
+; R600: -T
+; R600: -PV
+; R600: -PV
+
+; SI: v_xor_b32
+; SI: v_xor_b32
+; SI: v_xor_b32
+; SI: v_xor_b32
+define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) { ; NOTE(review): this file's only run line enables the SI and FUNC prefixes, so the R600-prefixed patterns above are never evaluated
+  %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in ; per-element negate spelled as -0.0 - x
+  store <4 x double> %fneg, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; DAGCombiner will transform:
+; (fneg (f64 bitcast (i64 a))) => (f64 bitcast (xor (i64 a), 0x8000000000000000))
+; unless the target returns true for isNegFree()
+
+; FUNC-LABEL: {{^}}fneg_free_f64:
+; FIXME: Unnecessary copy to VGPRs
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]$}}
+define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
+  %bc = bitcast i64 %in to double
+  %fsub = fsub double 0.0, %bc
+  store double %fsub, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}fneg_fold_f64:
+; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NOT: xor
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+define void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
+  %fsub = fsub double -0.0, %in ; negate the kernel argument
+  %fmul = fmul double %fsub, %in ; (-in) * in; the patterns above expect the negate as a '-' source modifier on v_mul_f64, with no xor
+  store double %fmul, double addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll
index 4cddc73..c20cf24 100644
--- a/test/CodeGen/R600/fneg.ll
+++ b/test/CodeGen/R600/fneg.ll

@@ -1,44 +1,41 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; R600-CHECK-LABEL: @fneg
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg
-; SI-CHECK: V_XOR_B32
-define void @fneg(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = fsub float -0.000000e+00, %in
-  store float %0, float addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_f32:
+; R600: -PV
+
+; SI: v_xor_b32
+define void @fneg_f32(float addrspace(1)* %out, float %in) {
+  %fneg = fsub float -0.000000e+00, %in
+  store float %fneg, float addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK-LABEL: @fneg_v2
-; R600-CHECK: -PV
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg_v2
-; SI-CHECK: V_XOR_B32
-; SI-CHECK: V_XOR_B32
-define void @fneg_v2(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
-entry:
-  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_v2f32:
+; R600: -PV
+; R600: -PV
+
+; SI: v_xor_b32
+; SI: v_xor_b32
+define void @fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
+  %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
+  store <2 x float> %fneg, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK-LABEL: @fneg_v4
-; R600-CHECK: -PV
-; R600-CHECK: -T
-; R600-CHECK: -PV
-; R600-CHECK: -PV
-; SI-CHECK-LABEL: @fneg_v4
-; SI-CHECK: V_XOR_B32
-; SI-CHECK: V_XOR_B32
-; SI-CHECK: V_XOR_B32
-; SI-CHECK: V_XOR_B32
-define void @fneg_v4(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
-entry:
-  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_v4f32:
+; R600: -PV
+; R600: -T
+; R600: -PV
+; R600: -PV
+
+; SI: v_xor_b32
+; SI: v_xor_b32
+; SI: v_xor_b32
+; SI: v_xor_b32
+define void @fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
+  %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
+  store <4 x float> %fneg, <4 x float> addrspace(1)* %out
   ret void
 }
 
@@ -46,27 +43,26 @@
 ; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000))
 ; unless the target returns true for isNegFree()
 
-; R600-CHECK-LABEL: @fneg_free
-; R600-CHECK-NOT: XOR
-; R600-CHECK: -KC0[2].Z
-; SI-CHECK-LABEL: @fneg_free
-; XXX: We could use V_ADD_F32_e64 with the negate bit here instead.
-; SI-CHECK: V_SUB_F32_e64 v{{[0-9]}}, 0.000000e+00, s{{[0-9]}}, 0, 0
-define void @fneg_free(float addrspace(1)* %out, i32 %in) {
-entry:
-  %0 = bitcast i32 %in to float
-  %1 = fsub float 0.0, %0
-  store float %1, float addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_free_f32:
+; R600-NOT: XOR
+; R600: -KC0[2].Z
+
+; XXX: We could use v_add_f32_e64 with the negate bit here instead.
+; SI: v_sub_f32_e64 v{{[0-9]+}}, 0.0, s{{[0-9]+$}}
+define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
+  %bc = bitcast i32 %in to float ; reinterpret the integer argument as a float
+  %fsub = fsub float 0.0, %bc ; 0.0 - x (note +0.0, unlike the -0.0 form used by the other tests in this file)
+  store float %fsub, float addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @fneg_fold
-; SI-CHECK-NOT: V_XOR_B32
-; SI-CHECK: V_MUL_F32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
-define void @fneg_fold(float addrspace(1)* %out, float %in) {
-entry:
-  %0 = fsub float -0.0, %in
-  %1 = fmul float %0, %in
-  store float %1, float addrspace(1)* %out
+; FUNC-LABEL: {{^}}fneg_fold_f32:
+; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+; SI-NOT: xor
+; SI: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
+define void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
+  %fsub = fsub float -0.0, %in
+  %fmul = fmul float %fsub, %in
+  store float %fmul, float addrspace(1)* %out
   ret void
 }

diff --git a/test/CodeGen/R600/fp16_to_fp.ll b/test/CodeGen/R600/fp16_to_fp.ll
new file mode 100644
index 0000000..ec3e051
--- /dev/null
+++ b/test/CodeGen/R600/fp16_to_fp.ll

@@ -0,0 +1,28 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+
+; SI-LABEL: {{^}}test_convert_fp16_to_fp32:
+; SI: buffer_load_ushort [[VAL:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+  %val = load i16 addrspace(1)* %in, align 2
+  %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
+  store float %cvt, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; SI-LABEL: {{^}}test_convert_fp16_to_fp64:
+; SI: buffer_load_ushort [[VAL:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
+; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+  %val = load i16 addrspace(1)* %in, align 2
+  %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
+  store double %cvt, double addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/fp16_to_fp32.ll b/test/CodeGen/R600/fp16_to_fp32.ll
deleted file mode 100644
index fa2e379..0000000
--- a/test/CodeGen/R600/fp16_to_fp32.ll
+++ /dev/null

@@ -1,14 +0,0 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
-
-; SI-LABEL: @test_convert_fp16_to_fp32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]]
-; SI: V_CVT_F16_F32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-define void @test_convert_fp16_to_fp32(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
-  %val = load float addrspace(1)* %in, align 4
-  %cvt = call i16 @llvm.convert.to.fp16(float %val) nounwind readnone
-  store i16 %cvt, i16 addrspace(1)* %out, align 2
-  ret void
-}

diff --git a/test/CodeGen/R600/fp32_to_fp16.ll b/test/CodeGen/R600/fp32_to_fp16.ll
index 9997cd3..e86ee62 100644
--- a/test/CodeGen/R600/fp32_to_fp16.ll
+++ b/test/CodeGen/R600/fp32_to_fp16.ll

@@ -1,14 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
 
-; SI-LABEL: @test_convert_fp16_to_fp32:
-; SI: BUFFER_LOAD_USHORT [[VAL:v[0-9]+]]
-; SI: V_CVT_F32_F16_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_DWORD [[RESULT]]
-define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
-  %val = load i16 addrspace(1)* %in, align 2
-  %cvt = call float @llvm.convert.from.fp16(i16 %val) nounwind readnone
-  store float %cvt, float addrspace(1)* %out, align 4
+; SI-LABEL: {{^}}test_convert_fp32_to_fp16:
+; SI: buffer_load_dword [[VAL:v[0-9]+]]
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_short [[RESULT]]
+define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+  %val = load float addrspace(1)* %in, align 4
+  %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
+  store i16 %cvt, i16 addrspace(1)* %out, align 2
   ret void
 }

diff --git a/test/CodeGen/R600/fp64_to_sint.ll b/test/CodeGen/R600/fp64_to_sint.ll
deleted file mode 100644
index 185e21c..0000000
--- a/test/CodeGen/R600/fp64_to_sint.ll
+++ /dev/null

@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
-
-; CHECK: @fp64_to_sint
-; CHECK: V_CVT_I32_F64_e32
-define void @fp64_to_sint(i32 addrspace(1)* %out, double %in) {
-  %result = fptosi double %in to i32
-  store i32 %result, i32 addrspace(1)* %out
-  ret void
-}

diff --git a/test/CodeGen/R600/fp_to_sint.f64.ll b/test/CodeGen/R600/fp_to_sint.f64.ll
new file mode 100644
index 0000000..09edb40
--- /dev/null
+++ b/test/CodeGen/R600/fp_to_sint.f64.ll

@@ -0,0 +1,56 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}fp_to_sint_f64_i32:
+; SI: v_cvt_i32_f64_e32
+define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
+  %result = fptosi double %in to i32
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_v2f64_v2i32:
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+  %result = fptosi <2 x double> %in to <2 x i32>
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_v4f64_v4i32:
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+; SI: v_cvt_i32_f64_e32
+define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+  %result = fptosi <4 x double> %in to <4 x i32>
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_i64_f64:
+; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]]
+; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}}
+; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000
+
+; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}}
+; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]]
+
+; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000
+
+; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]]
+; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
+; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
+; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %val = load double addrspace(1)* %gep, align 8
+  %cast = fptosi double %val to i64
+  store i64 %cast, i64 addrspace(1)* %out, align 8
+  ret void
+}

diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 8302b4f..c583ec3 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll

@@ -1,31 +1,216 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; R600-CHECK: @fp_to_sint_v2i32
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; SI-CHECK: @fp_to_sint_v2i32
-; SI-CHECK: V_CVT_I32_F32_e32
-; SI-CHECK: V_CVT_I32_F32_e32
+; FUNC-LABEL: {{^}}fp_to_sint_i32:
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_i32_f32_e32
+; SI: s_endpgm
+define void @fp_to_sint_i32 (i32 addrspace(1)* %out, float %in) {
+  %conv = fptosi float %in to i32
+  store i32 %conv, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_v2i32:
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
 define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptosi <2 x float> %in to <2 x i32>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: @fp_to_sint_v4i32
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; SI-CHECK: @fp_to_sint_v4i32
-; SI-CHECK: V_CVT_I32_F32_e32
-; SI-CHECK: V_CVT_I32_F32_e32
-; SI-CHECK: V_CVT_I32_F32_e32
-; SI-CHECK: V_CVT_I32_F32_e32
+; FUNC-LABEL: {{^}}fp_to_sint_v4i32:
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
+; SI: v_cvt_i32_f32_e32
 define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
   %result = fptosi <4 x float> %value to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}fp_to_sint_i64:
+
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; Check that the compiler doesn't crash with a "cannot select" error
+; SI: s_endpgm
+define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
+entry:
+  %0 = fptosi float %in to i64
+  store i64 %0, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_v2i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+  %conv = fptosi <2 x float> %x to <2 x i64>
+  store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_sint_v4i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+  %conv = fptosi <4 x float> %x to <4 x i64>
+  store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/fp_to_sint_i64.ll b/test/CodeGen/R600/fp_to_sint_i64.ll
deleted file mode 100644
index ec3e198..0000000
--- a/test/CodeGen/R600/fp_to_sint_i64.ll
+++ /dev/null

@@ -1,12 +0,0 @@
-; FIXME: Merge into fp_to_sint.ll when EG/NI supports 64-bit types
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
-
-; SI-LABEL: @fp_to_sint_i64
-; Check that the compiler doesn't crash with a "cannot select" error
-; SI: S_ENDPGM
-define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
-entry:
-  %0 = fptosi float %in to i64
-  store i64 %0, i64 addrspace(1)* %out
-  ret void
-}

diff --git a/test/CodeGen/R600/fp_to_uint.f64.ll b/test/CodeGen/R600/fp_to_uint.f64.ll
index bf607ce..25859bb 100644
--- a/test/CodeGen/R600/fp_to_uint.f64.ll
+++ b/test/CodeGen/R600/fp_to_uint.f64.ll

@@ -1,9 +1,70 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
-; SI-LABEL: @fp_to_uint_i32_f64
-; SI: V_CVT_U32_F64_e32
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: {{^}}fp_to_uint_i32_f64:
+; SI: v_cvt_u32_f64_e32
 define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
   %cast = fptoui double %in to i32
   store i32 %cast, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}fp_to_uint_v2i32_v2f64:
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+  %cast = fptoui <2 x double> %in to <2 x i32>
+  store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}fp_to_uint_v4i32_v4f64:
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+; SI: v_cvt_u32_f64_e32
+define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+  %cast = fptoui <4 x double> %in to <4 x i32>
+  store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_i64_f64:
+; CI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; CI-DAG: v_trunc_f64_e32 [[TRUNC:v\[[0-9]+:[0-9]+\]]], [[VAL]]
+; CI-DAG: s_mov_b32 s[[K0_LO:[0-9]+]], 0{{$}}
+; CI-DAG: s_mov_b32 s[[K0_HI:[0-9]+]], 0x3df00000
+
+; CI-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[VAL]], s{{\[}}[[K0_LO]]:[[K0_HI]]{{\]}}
+; CI-DAG: v_floor_f64_e32 [[FLOOR:v\[[0-9]+:[0-9]+\]]], [[MUL]]
+
+; CI-DAG: s_mov_b32 s[[K1_HI:[0-9]+]], 0xc1f00000
+
+; CI-DAG: v_fma_f64 [[FMA:v\[[0-9]+:[0-9]+\]]], [[FLOOR]], s{{\[[0-9]+}}:[[K1_HI]]{{\]}}, [[TRUNC]]
+; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
+; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
+; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+  %val = load double addrspace(1)* %gep, align 8
+  %cast = fptoui double %val to i64
+  store i64 %cast, i64 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}fp_to_uint_v2i64_v2f64:
+define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) {
+  %cast = fptoui <2 x double> %in to <2 x i64>
+  store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}fp_to_uint_v4i64_v4f64:
+define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) {
+  %cast = fptoui <4 x double> %in to <4 x i64>
+  store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32
+  ret void
+}

diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index 77db43b..91bf4b7 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll

@@ -1,12 +1,21 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; R600-CHECK: @fp_to_uint_v2i32
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: @fp_to_uint_v2i32
-; SI-CHECK: V_CVT_U32_F32_e32
-; SI-CHECK: V_CVT_U32_F32_e32
+; FUNC-LABEL: {{^}}fp_to_uint_i32:
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_u32_f32_e32
+; SI: s_endpgm
+define void @fp_to_uint_i32 (i32 addrspace(1)* %out, float %in) {
+  %conv = fptoui float %in to i32
+  store i32 %conv, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_v2i32:
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
 
 define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptoui <2 x float> %in to <2 x i32>
@@ -14,16 +23,15 @@
   ret void
 }
 
-; R600-CHECK: @fp_to_uint_v4i32
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; SI-CHECK: @fp_to_uint_v4i32
-; SI-CHECK: V_CVT_U32_F32_e32
-; SI-CHECK: V_CVT_U32_F32_e32
-; SI-CHECK: V_CVT_U32_F32_e32
-; SI-CHECK: V_CVT_U32_F32_e32
+; FUNC-LABEL: {{^}}fp_to_uint_v4i32:
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
+; SI: v_cvt_u32_f32_e32
 
 define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
@@ -31,3 +39,177 @@
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}fp_to_uint_i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_uint_i64(i64 addrspace(1)* %out, float %x) {
+  %conv = fptoui float %x to i64
+  store i64 %conv, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_v2i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_uint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+  %conv = fptoui <2 x float> %x to <2 x i64>
+  store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fp_to_uint_v4i64:
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: AND_INT
+; EG-DAG: LSHR
+; EG-DAG: SUB_INT
+; EG-DAG: AND_INT
+; EG-DAG: ASHR
+; EG-DAG: AND_INT
+; EG-DAG: OR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: LSHL
+; EG-DAG: LSHL
+; EG-DAG: SUB_INT
+; EG-DAG: LSHR
+; EG-DAG: LSHR
+; EG-DAG: SETGT_UINT
+; EG-DAG: SETGT_INT
+; EG-DAG: XOR_INT
+; EG-DAG: XOR_INT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT
+; EG-DAG: CNDE_INT
+; EG-DAG: CNDE_INT
+
+; SI: s_endpgm
+define void @fp_to_uint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+  %conv = fptoui <4 x float> %x to <4 x i64>
+  store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/fpext.ll b/test/CodeGen/R600/fpext.ll
index 143ee79..418395f 100644
--- a/test/CodeGen/R600/fpext.ll
+++ b/test/CodeGen/R600/fpext.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
 
-; CHECK: @fpext
-; CHECK: V_CVT_F64_F32_e32
+; CHECK: {{^}}fpext:
+; CHECK: v_cvt_f64_f32_e32
 define void @fpext(double addrspace(1)* %out, float %in) {
   %result = fpext float %in to double
   store double %result, double addrspace(1)* %out

diff --git a/test/CodeGen/R600/fptrunc.ll b/test/CodeGen/R600/fptrunc.ll
index 20a8c00..8ac8d3b 100644
--- a/test/CodeGen/R600/fptrunc.ll
+++ b/test/CodeGen/R600/fptrunc.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
 
-; CHECK: @fptrunc
-; CHECK: V_CVT_F32_F64_e32
+; CHECK: {{^}}fptrunc:
+; CHECK: v_cvt_f32_f64_e32
 define void @fptrunc(float addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to float
   store float %result, float addrspace(1)* %out

diff --git a/test/CodeGen/R600/frem.ll b/test/CodeGen/R600/frem.ll
new file mode 100644
index 0000000..c846a77
--- /dev/null
+++ b/test/CodeGen/R600/frem.ll

@@ -0,0 +1,103 @@
+; RUN: llc -march=r600 -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}frem_f32:
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:0x10
+; SI-DAG: v_cmp
+; SI-DAG: v_mul_f32
+; SI: v_rcp_f32_e32
+; SI: v_mul_f32_e32
+; SI: v_mul_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_mad_f32
+; SI: s_endpgm
+define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                      float addrspace(1)* %in2) #0 {
+   %gep2 = getelementptr float addrspace(1)* %in2, i32 4
+   %r0 = load float addrspace(1)* %in1, align 4
+   %r1 = load float addrspace(1)* %gep2, align 4
+   %r2 = frem float %r0, %r1
+   store float %r2, float addrspace(1)* %out, align 4
+   ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_frem_f32:
+; SI: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:0x10
+; SI: buffer_load_dword [[X:v[0-9]+]], {{.*}}
+; SI: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
+; SI: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
+; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                             float addrspace(1)* %in2) #1 {
+   %gep2 = getelementptr float addrspace(1)* %in2, i32 4
+   %r0 = load float addrspace(1)* %in1, align 4
+   %r1 = load float addrspace(1)* %gep2, align 4
+   %r2 = frem float %r0, %r1
+   store float %r2, float addrspace(1)* %out, align 4
+   ret void
+}
+
+; TODO: This should check something when f64 fdiv is implemented
+; correctly
+
+; FUNC-LABEL: {{^}}frem_f64:
+; SI: s_endpgm
+define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                      double addrspace(1)* %in2) #0 {
+   %r0 = load double addrspace(1)* %in1, align 8
+   %r1 = load double addrspace(1)* %in2, align 8
+   %r2 = frem double %r0, %r1
+   store double %r2, double addrspace(1)* %out, align 8
+   ret void
+}
+
+; FUNC-LABEL: {{^}}unsafe_frem_f64:
+; SI: v_rcp_f64_e32
+; SI: v_mul_f64
+; SI: v_bfe_u32
+; SI: v_fma_f64
+; SI: s_endpgm
+define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                             double addrspace(1)* %in2) #1 {
+   %r0 = load double addrspace(1)* %in1, align 8
+   %r1 = load double addrspace(1)* %in2, align 8
+   %r2 = frem double %r0, %r1
+   store double %r2, double addrspace(1)* %out, align 8
+   ret void
+}
+
+define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+                        <2 x float> addrspace(1)* %in2) #0 {
+   %gep2 = getelementptr <2 x float> addrspace(1)* %in2, i32 4
+   %r0 = load <2 x float> addrspace(1)* %in1, align 8
+   %r1 = load <2 x float> addrspace(1)* %gep2, align 8
+   %r2 = frem <2 x float> %r0, %r1
+   store <2 x float> %r2, <2 x float> addrspace(1)* %out, align 8
+   ret void
+}
+
+define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+                        <4 x float> addrspace(1)* %in2) #0 {
+   %gep2 = getelementptr <4 x float> addrspace(1)* %in2, i32 4
+   %r0 = load <4 x float> addrspace(1)* %in1, align 16
+   %r1 = load <4 x float> addrspace(1)* %gep2, align 16
+   %r2 = frem <4 x float> %r0, %r1
+   store <4 x float> %r2, <4 x float> addrspace(1)* %out, align 16
+   ret void
+}
+
+define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+                        <2 x double> addrspace(1)* %in2) #0 {
+   %gep2 = getelementptr <2 x double> addrspace(1)* %in2, i32 4
+   %r0 = load <2 x double> addrspace(1)* %in1, align 16
+   %r1 = load <2 x double> addrspace(1)* %gep2, align 16
+   %r2 = frem <2 x double> %r0, %r1
+   store <2 x double> %r2, <2 x double> addrspace(1)* %out, align 16
+   ret void
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="false" }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }

diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll
index ae50b17..1f91faf 100644
--- a/test/CodeGen/R600/fsqrt.ll
+++ b/test/CodeGen/R600/fsqrt.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
-; CHECK: @fsqrt_f32
-; CHECK: V_SQRT_F32_e32 {{v[0-9]+, v[0-9]+}}
+; CHECK: {{^}}fsqrt_f32:
+; CHECK: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
 
 define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
    %r0 = load float addrspace(1)* %in
@@ -10,8 +10,8 @@
    ret void
 }
 
-; CHECK: @fsqrt_f64
-; CHECK: V_SQRT_F64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+; CHECK: {{^}}fsqrt_f64:
+; CHECK: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 
 define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
    %r0 = load double addrspace(1)* %in

diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index 4f74efb..6e5ccf1 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll

@@ -1,14 +1,25 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; R600-CHECK: @fsub_f32
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
-; SI-CHECK: @fsub_f32
-; SI-CHECK: V_SUB_F32
-define void @fsub_f32(float addrspace(1)* %out, float %a, float %b) {
-entry:
-  %0 = fsub float %a, %b
-  store float %0, float addrspace(1)* %out
+
+; FUNC-LABEL: {{^}}v_fsub_f32:
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %b_ptr = getelementptr float addrspace(1)* %in, i32 1
+  %a = load float addrspace(1)* %in, align 4
+  %b = load float addrspace(1)* %b_ptr, align 4
+  %result = fsub float %a, %b
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_fsub_f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
+
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
+  %sub = fsub float %a, %b
+  store float %sub, float addrspace(1)* %out, align 4
   ret void
 }
 
@@ -16,34 +27,48 @@
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
-; R600-CHECK: @fsub_v2f32
-; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
-; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
-; SI-CHECK: @fsub_v2f32
-; SI-CHECK: V_SUB_F32
-; SI-CHECK: V_SUB_F32
+; FUNC-LABEL: {{^}}fsub_v2f32:
+; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
+; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
+
+; FIXME: Should be using SGPR directly for first operand
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
-entry:
-  %0 = fsub <2 x float> %a, %b
-  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  %sub = fsub <2 x float> %a, %b
+  store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
   ret void
 }
 
-; R600-CHECK: @fsub_v4f32
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; SI-CHECK: @fsub_v4f32
-; SI-CHECK: V_SUB_F32
-; SI-CHECK: V_SUB_F32
-; SI-CHECK: V_SUB_F32
-; SI-CHECK: V_SUB_F32
-define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+; FUNC-LABEL: {{^}}v_fsub_v4f32:
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
-  %a = load <4 x float> addrspace(1) * %in
-  %b = load <4 x float> addrspace(1) * %b_ptr
+  %a = load <4 x float> addrspace(1)* %in, align 16
+  %b = load <4 x float> addrspace(1)* %b_ptr, align 16
   %result = fsub <4 x float> %a, %b
-  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
+; FIXME: Should be using SGPR directly for first operand
+
+; FUNC-LABEL: {{^}}s_fsub_v4f32:
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI: s_endpgm
+define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
+  %result = fsub <4 x float> %a, %b
+  store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
 }

diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll
index f5e5708..eca1b62 100644
--- a/test/CodeGen/R600/fsub64.ll
+++ b/test/CodeGen/R600/fsub64.ll

@@ -1,7 +1,7 @@
 ; RUN: llc -march=r600 -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @fsub_f64:
-; SI: V_ADD_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
+; SI-LABEL: {{^}}fsub_f64:
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
    %r0 = load double addrspace(1)* %in1

diff --git a/test/CodeGen/R600/ftrunc.f64.ll b/test/CodeGen/R600/ftrunc.f64.ll
new file mode 100644
index 0000000..fba6154
--- /dev/null
+++ b/test/CodeGen/R600/ftrunc.f64.ll

@@ -0,0 +1,110 @@
+; RUN: llc -march=r600 -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.trunc.f64(double) nounwind readnone
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
+declare <3 x double> @llvm.trunc.v3f64(<3 x double>) nounwind readnone
+declare <4 x double> @llvm.trunc.v4f64(<4 x double>) nounwind readnone
+declare <8 x double> @llvm.trunc.v8f64(<8 x double>) nounwind readnone
+declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
+
+; FUNC-LABEL: {{^}}v_ftrunc_f64:
+; CI: v_trunc_f64
+; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11
+; SI: s_endpgm
+define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+  %x = load double addrspace(1)* %in, align 8
+  %y = call double @llvm.trunc.f64(double %x) nounwind readnone
+  store double %y, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_f64:
+; CI: v_trunc_f64_e32
+
+; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
+; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01
+; SI: s_lshr_b64
+; SI: s_not_b64
+; SI: s_and_b64
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: cmp_lt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: cmp_gt_i32
+; SI: cndmask_b32
+; SI: cndmask_b32
+; SI: s_endpgm
+define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
+  %y = call double @llvm.trunc.f64(double %x) nounwind readnone
+  store double %y, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v2f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+  %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone
+  store <2 x double> %y, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f64:
+; FIXME-CI: v_trunc_f64_e32
+; FIXME-CI: v_trunc_f64_e32
+; FIXME-CI: v_trunc_f64_e32
+; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+;   %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone
+;   store <3 x double> %y, <3 x double> addrspace(1)* %out
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}ftrunc_v4f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+  %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
+  store <4 x double> %y, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v8f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+  %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone
+  store <8 x double> %y, <8 x double> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ftrunc_v16f64:
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+; CI: v_trunc_f64_e32
+define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+  %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone
+  store <16 x double> %y, <16 x double> addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/ftrunc.ll b/test/CodeGen/R600/ftrunc.ll
index 0d7d467..0eb1d7d 100644
--- a/test/CodeGen/R600/ftrunc.ll
+++ b/test/CodeGen/R600/ftrunc.ll

@@ -8,55 +8,55 @@
 declare <8 x float> @llvm.trunc.v8f32(<8 x float>) nounwind readnone
 declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone
 
-; FUNC-LABEL: @ftrunc_f32:
+; FUNC-LABEL: {{^}}ftrunc_f32:
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_f32(float addrspace(1)* %out, float %x) {
   %y = call float @llvm.trunc.f32(float %x) nounwind readnone
   store float %y, float addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ftrunc_v2f32:
+; FUNC-LABEL: {{^}}ftrunc_v2f32:
 ; EG: TRUNC
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
   %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone
   store <2 x float> %y, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FIXME-FUNC-LABEL: @ftrunc_v3f32:
+; FIXME-FUNC-LABEL: {{^}}ftrunc_v3f32:
 ; FIXME-EG: TRUNC
 ; FIXME-EG: TRUNC
 ; FIXME-EG: TRUNC
-; FIXME-SI: V_TRUNC_F32_e32
-; FIXME-SI: V_TRUNC_F32_e32
-; FIXME-SI: V_TRUNC_F32_e32
+; FIXME-SI: v_trunc_f32_e32
+; FIXME-SI: v_trunc_f32_e32
+; FIXME-SI: v_trunc_f32_e32
 ; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
 ;   %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone
 ;   store <3 x float> %y, <3 x float> addrspace(1)* %out
 ;   ret void
 ; }
 
-; FUNC-LABEL: @ftrunc_v4f32:
+; FUNC-LABEL: {{^}}ftrunc_v4f32:
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
   %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone
   store <4 x float> %y, <4 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ftrunc_v8f32:
+; FUNC-LABEL: {{^}}ftrunc_v8f32:
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
@@ -65,21 +65,21 @@
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
   %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone
   store <8 x float> %y, <8 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @ftrunc_v16f32:
+; FUNC-LABEL: {{^}}ftrunc_v16f32:
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
@@ -96,22 +96,22 @@
 ; EG: TRUNC
 ; EG: TRUNC
 ; EG: TRUNC
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
-; SI: V_TRUNC_F32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
+; SI: v_trunc_f32_e32
 define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
   %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone
   store <16 x float> %y, <16 x float> addrspace(1)* %out

diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll
index ab2c0bf..036daaf 100644
--- a/test/CodeGen/R600/gep-address-space.ll
+++ b/test/CodeGen/R600/gep-address-space.ll

@@ -1,29 +1,33 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 
 define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
-; CHECK-LABEL: @use_gep_address_space:
-; CHECK: V_MOV_B32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
-; CHECK: DS_WRITE_B32 [[PTR]], v{{[0-9]+}}, 0x40
+; CHECK-LABEL: {{^}}use_gep_address_space:
+; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
+; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64
   %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16
   store i32 99, i32 addrspace(3)* %p
   ret void
 }
 
 define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
-; CHECK-LABEL: @use_gep_address_space_large_offset:
-; CHECK: S_ADD_I32
-; CHECK: DS_WRITE_B32
+; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
+; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
+; SI, which is why it is being OR'd with the base pointer.
+; SI: s_or_b32
+; CI: s_add_i32
+; CHECK: ds_write_b32
   %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
   store i32 99, i32 addrspace(3)* %p
   ret void
 }
 
 define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
-; CHECK-LABEL: @gep_as_vector_v4:
-; CHECK: S_ADD_I32
-; CHECK: S_ADD_I32
-; CHECK: S_ADD_I32
-; CHECK: S_ADD_I32
+; CHECK-LABEL: {{^}}gep_as_vector_v4:
+; CHECK: s_add_i32
+; CHECK: s_add_i32
+; CHECK: s_add_i32
+; CHECK: s_add_i32
   %p = getelementptr <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
@@ -37,9 +41,9 @@
 }
 
 define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
-; CHECK-LABEL: @gep_as_vector_v2:
-; CHECK: S_ADD_I32
-; CHECK: S_ADD_I32
+; CHECK-LABEL: {{^}}gep_as_vector_v2:
+; CHECK: s_add_i32
+; CHECK: s_add_i32
   %p = getelementptr <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
   %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1

diff --git a/test/CodeGen/R600/global-directive.ll b/test/CodeGen/R600/global-directive.ll
new file mode 100644
index 0000000..d1244b8
--- /dev/null
+++ b/test/CodeGen/R600/global-directive.ll

@@ -0,0 +1,14 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; Make sure the GlobalDirective isn't merged with the function name
+
+; SI:	.globl	foo
+; SI: {{^}}foo:
+define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %a = load i32 addrspace(1)* %in
+  %b = load i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/global-zero-initializer.ll b/test/CodeGen/R600/global-zero-initializer.ll
new file mode 100644
index 0000000..b69b061
--- /dev/null
+++ b/test/CodeGen/R600/global-zero-initializer.ll

@@ -0,0 +1,12 @@
+; RUN: not llc -march=r600 -mcpu=SI < %s 2>&1 | FileCheck %s
+
+; CHECK: error: unsupported initializer for address space in load_init_global_global
+
+@lds = addrspace(1) global [256 x i32] zeroinitializer
+
+define void @load_init_global_global(i32 addrspace(1)* %out, i1 %p) {
+ %gep = getelementptr [256 x i32] addrspace(1)* @lds, i32 0, i32 10
+  %ld = load i32 addrspace(1)* %gep
+  store i32 %ld, i32 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/global_atomics.ll b/test/CodeGen/R600/global_atomics.ll
new file mode 100644
index 0000000..533a964
--- /dev/null
+++ b/test/CodeGen/R600/global_atomics.ll

@@ -0,0 +1,801 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}atomic_add_i32_offset:
+; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
+; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32:
+; SI: buffer_atomic_add v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
+; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64:
+; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_offset:
+; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
+; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32:
+; SI: buffer_atomic_and v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
+; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64:
+; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
+; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
+; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32:
+; SI: buffer_atomic_sub v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
+; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64:
+; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_offset:
+; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
+; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32:
+; SI: buffer_atomic_smax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
+; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64:
+; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
+; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
+; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32:
+; SI: buffer_atomic_umax v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
+; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64:
+; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %0  = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_offset:
+; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4
+  %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
+; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4
+  %0  = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %0, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32:
+; SI: buffer_atomic_smin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst ; operates on %out directly; checked with glc because the result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
+; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst ; checked as buffer_atomic_smin; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64:
+; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst ; checked as buffer_atomic_smin with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
+; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_umin; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_umin with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
+; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_umin; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_umin with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32:
+; SI: buffer_atomic_umin v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst ; operates on %out directly: no offset, and no glc since the result is unused
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst ; operates on %out directly; checked with glc because the result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
+; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst ; checked as buffer_atomic_umin; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64:
+; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst ; checked as buffer_atomic_umin with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_offset:
+; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_or; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_or with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
+; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_or; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_or with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32:
+; SI: buffer_atomic_or v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst ; operates on %out directly: no offset, and no glc since the result is unused
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst ; operates on %out directly; checked with glc because the result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
+; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst ; checked as buffer_atomic_or; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64:
+; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst ; checked as buffer_atomic_or with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
+; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst ; xchg is checked as buffer_atomic_swap; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst ; xchg is checked as buffer_atomic_swap with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
+; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst ; xchg is checked as buffer_atomic_swap; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst ; xchg is checked as buffer_atomic_swap with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32:
+; SI: buffer_atomic_swap v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst ; operates on %out directly: no offset, and no glc since the result is unused
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst ; operates on %out directly; checked with glc because the result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
+; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst ; xchg is checked as buffer_atomic_swap; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
+; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst ; xchg is checked as buffer_atomic_swap with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
+; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10{{$}}
+define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_xor; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:0x10 glc {{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32 addrspace(1)* %out, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_xor with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
+; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10{{$}}
+define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_xor; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:0x10 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 4 ; 4 x i32 = 16 bytes, the 0x10 offset expected above
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst ; checked as buffer_atomic_xor with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32:
+; SI: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst ; operates on %out directly: no offset, and no glc since the result is unused
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst ; operates on %out directly; checked with glc because the result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
+; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst ; checked as buffer_atomic_xor; result unused, no glc expected
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64:
+; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; SI: buffer_store_dword [[RET]]
+define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32 addrspace(1)* %out, i64 %index ; runtime i64 index; the check line expects the addr64 form
+  %0  = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst ; checked as buffer_atomic_xor with glc; result is stored below
+  store i32 %0, i32 addrspace(1)* %out2 ; writes the value returned by the atomic to %out2
+  ret void
+}

diff --git a/test/CodeGen/R600/gv-const-addrspace-fail.ll b/test/CodeGen/R600/gv-const-addrspace-fail.ll
index ebd7811..905948f 100644
--- a/test/CodeGen/R600/gv-const-addrspace-fail.ll
+++ b/test/CodeGen/R600/gv-const-addrspace-fail.ll

@@ -1,14 +1,13 @@
-; XFAIL: *
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
 @a = internal addrspace(2) constant [1 x i8] [ i8 7 ], align 1
 
-; FUNC-LABEL: @test_i8
+; FUNC-LABEL: {{^}}test_i8:
 ; EG: CF_END
-; SI: BUFFER_STORE_BYTE
-; SI: S_ENDPGM
+; SI: buffer_store_byte
+; SI: s_endpgm
 define void @test_i8( i32 %s, i8 addrspace(1)* %out) #3 {
   %arrayidx = getelementptr inbounds [1 x i8] addrspace(2)* @a, i32 0, i32 %s
   %1 = load i8 addrspace(2)* %arrayidx, align 1
@@ -18,10 +17,10 @@
 
 @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
 
-; FUNC-LABEL: @test_i16
+; FUNC-LABEL: {{^}}test_i16:
 ; EG: CF_END
-; SI: BUFFER_STORE_SHORT
-; SI: S_ENDPGM
+; SI: buffer_store_short
+; SI: s_endpgm
 define void @test_i16( i32 %s, i16 addrspace(1)* %out) #3 {
   %arrayidx = getelementptr inbounds [1 x i16] addrspace(2)* @b, i32 0, i32 %s
   %1 = load i16 addrspace(2)* %arrayidx, align 2
@@ -32,9 +31,9 @@
 %struct.bar = type { float, [5 x i8] }
 
 ; The illegal i8s aren't handled
-@struct_bar_gv = internal addrspace(2) unnamed_addr constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ]
+@struct_bar_gv = internal addrspace(2) constant [1 x %struct.bar] [ %struct.bar { float 16.0, [5 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4] } ]
 
-; FUNC-LABEL: @struct_bar_gv_load
+; FUNC-LABEL: {{^}}struct_bar_gv_load:
 define void @struct_bar_gv_load(i8 addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [1 x %struct.bar] addrspace(2)* @struct_bar_gv, i32 0, i32 0, i32 1, i32 %index
   %load = load i8 addrspace(2)* %gep, align 1
@@ -49,7 +48,7 @@
                                                                     <4 x i32> <i32 9, i32 10, i32 11, i32 12>,
                                                                     <4 x i32> <i32 13, i32 14, i32 15, i32 16> ]
 
-; FUNC-LABEL: @array_vector_gv_load
+; FUNC-LABEL: {{^}}array_vector_gv_load:
 define void @array_vector_gv_load(<4 x i32> addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [4 x <4 x i32>] addrspace(2)* @array_vector_gv, i32 0, i32 %index
   %load = load <4 x i32> addrspace(2)* %gep, align 16

diff --git a/test/CodeGen/R600/gv-const-addrspace.ll b/test/CodeGen/R600/gv-const-addrspace.ll
index db64a6f..6aa20b8 100644
--- a/test/CodeGen/R600/gv-const-addrspace.ll
+++ b/test/CodeGen/R600/gv-const-addrspace.ll

@@ -4,11 +4,11 @@
 
 @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
 
-; XXX: Test on SI once 64-bit adds are supportes.
-
 @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
 
-; FUNC-LABEL: @float
+; FUNC-LABEL: {{^}}float:
+; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword
 
 ; EG-DAG: MOV {{\** *}}T2.X
 ; EG-DAG: MOV {{\** *}}T3.X
@@ -27,7 +27,10 @@
 
 @i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
 
-; FUNC-LABEL: @i32
+; FUNC-LABEL: {{^}}i32:
+
+; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword
 
 ; EG-DAG: MOV {{\** *}}T2.X
 ; EG-DAG: MOV {{\** *}}T3.X
@@ -49,7 +52,8 @@
 
 @struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
 
-; FUNC-LABEL: @struct_foo_gv_load
+; FUNC-LABEL: {{^}}struct_foo_gv_load:
+; SI: s_load_dword
 
 define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
@@ -63,10 +67,31 @@
                                                                 <1 x i32> <i32 3>,
                                                                 <1 x i32> <i32 4> ]
 
-; FUNC-LABEL: @array_v1_gv_load
+; FUNC-LABEL: {{^}}array_v1_gv_load:
+; FIXME: We should be using s_load_dword here.
+; SI: buffer_load_dword
 define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
   %load = load <1 x i32> addrspace(2)* %gep, align 4
   store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
   ret void
 }
+
+define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { ; NOTE(review): no check lines for this function appear in the visible hunk - presumably a compile-only regression test; confirm
+entry:
+  %0 = icmp eq i32 0, %a
+  br i1 %0, label %if, label %else ; take %if when %a is 0
+
+if:
+  %1 = getelementptr inbounds [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index ; address into @float_gv (addrspace(2)), computed only in this block
+  %2 = load float addrspace(2)* %1
+  store float %2, float addrspace(1)* %out
+  br label %endif
+
+else:
+  store float 1.0, float addrspace(1)* %out ; the other path stores the constant 1.0 instead of a loaded value
+  br label %endif
+
+endif:
+  ret void
+}

diff --git a/test/CodeGen/R600/half.ll b/test/CodeGen/R600/half.ll
new file mode 100644
index 0000000..6ad9b2f
--- /dev/null
+++ b/test/CodeGen/R600/half.ll

@@ -0,0 +1,61 @@
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+
+define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_load_store:
+; CHECK: buffer_load_ushort [[TMP:v[0-9]+]]
+; CHECK: buffer_store_short [[TMP]]
+  %val = load half addrspace(1)* %in ; half load, expected above as a buffer_load_ushort
+  store half %val, half addrspace(1) * %out ; stored back from the same register, with no conversion expected in between
+  ret void
+}
+
+define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_bitcast_from_half:
+; CHECK: buffer_load_ushort [[TMP:v[0-9]+]]
+; CHECK: buffer_store_short [[TMP]]
+  %val = load half addrspace(1) * %in ; half load, expected above as a buffer_load_ushort
+  %val_int = bitcast half %val to i16 ; reinterprets the bits; the loaded register is expected to be stored as-is
+  store i16 %val_int, i16 addrspace(1)* %out
+  ret void
+}
+
+define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) {
+; CHECK-LABEL: {{^}}test_bitcast_to_half:
+; CHECK: buffer_load_ushort [[TMP:v[0-9]+]]
+; CHECK: buffer_store_short [[TMP]]
+  %val = load i16 addrspace(1)* %in ; i16 load, expected above as a buffer_load_ushort
+  %val_fp = bitcast i16 %val to half ; reinterprets the bits; the loaded register is expected to be stored as-is
+  store half %val_fp, half addrspace(1)* %out
+  ret void
+}
+
+define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_extend32:
+; CHECK: v_cvt_f32_f16_e32
+
+  %val16 = load half addrspace(1)* %in
+  %val32 = fpext half %val16 to float ; half -> float extension, expected above as v_cvt_f32_f16_e32
+  store float %val32, float addrspace(1)* %out
+  ret void
+}
+
+define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_extend64:
+; CHECK: v_cvt_f32_f16_e32
+; CHECK: v_cvt_f64_f32_e32
+
+  %val16 = load half addrspace(1)* %in
+  %val64 = fpext half %val16 to double ; half -> double, expected above as two conversions going through f32
+  store double %val64, double addrspace(1)* %out
+  ret void
+}
+
+define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) {
+; CHECK-LABEL: {{^}}test_trunc32:
+; CHECK: v_cvt_f16_f32_e32
+
+  %val32 = load float addrspace(1)* %in
+  %val16 = fptrunc float %val32 to half ; float -> half truncation, expected above as v_cvt_f16_f32_e32
+  store half %val16, half addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/i1-copy-implicit-def.ll b/test/CodeGen/R600/i1-copy-implicit-def.ll
new file mode 100644
index 0000000..7c5bc04
--- /dev/null
+++ b/test/CodeGen/R600/i1-copy-implicit-def.ll

@@ -0,0 +1,21 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SILowerI1Copies was not handling IMPLICIT_DEF
+; SI-LABEL: {{^}}br_implicit_def:
+; SI: BB#0:
+; SI-NEXT: s_and_saveexec_b64
+; SI-NEXT: s_xor_b64
+; SI-NEXT: BB#1:
+define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
+bb:
+  br i1 undef, label %bb1, label %bb2 ; undef i1 condition - presumably what produces the IMPLICIT_DEF named above; confirm
+
+bb1:
+  store volatile i32 123, i32 addrspace(1)* %out ; volatile store of the constant 123 to %out
+  ret void
+
+bb2:
+  ret void
+}
+
+attributes #0 = { nounwind }

diff --git a/test/CodeGen/R600/i1-copy-phi.ll b/test/CodeGen/R600/i1-copy-phi.ll
new file mode 100644
index 0000000..bfa8672
--- /dev/null
+++ b/test/CodeGen/R600/i1-copy-phi.ll

@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}br_i1_phi:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; SI: s_and_saveexec_b64
+; SI: s_xor_b64
+; SI: v_mov_b32_e32 [[REG]], -1{{$}}
+; SI: v_cmp_ne_i32_e64 {{s\[[0-9]+:[0-9]+\]}}, [[REG]], 0
+; SI: s_and_saveexec_b64
+; SI: s_xor_b64
+; SI: s_endpgm
+define void @br_i1_phi(i32 %arg, i1 %arg1) #0 { ; NOTE(review): no "attributes #0" definition appears in this 29-line file - confirm that is intended
+bb:
+  br i1 %arg1, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  %tmp = phi i1 [ true, %bb2 ], [ false, %bb ] ; i1 phi of constants; the check lines expect 0 and -1 moved into one v register
+  br i1 %tmp, label %bb4, label %bb6 ; branch on the phi; the check lines expect a v_cmp_ne_i32 of that register against 0
+
+bb4:                                              ; preds = %bb3
+  %tmp5 = mul i32 undef, %arg
+  br label %bb6
+
+bb6:                                              ; preds = %bb4, %bb3
+  ret void
+}

diff --git a/test/CodeGen/R600/icmp64.ll b/test/CodeGen/R600/icmp64.ll
index c9e62ff..870bf7f 100644
--- a/test/CodeGen/R600/icmp64.ll
+++ b/test/CodeGen/R600/icmp64.ll

@@ -1,7 +1,7 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @test_i64_eq:
-; SI: V_CMP_EQ_I64
+; SI-LABEL: {{^}}test_i64_eq:
+; SI: v_cmp_eq_i64
 define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp eq i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -9,8 +9,8 @@
   ret void
 }
 
-; SI-LABEL: @test_i64_ne:
-; SI: V_CMP_NE_I64
+; SI-LABEL: {{^}}test_i64_ne:
+; SI: v_cmp_ne_i64
 define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ne i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -18,8 +18,8 @@
   ret void
 }
 
-; SI-LABEL: @test_i64_slt:
-; SI: V_CMP_LT_I64
+; SI-LABEL: {{^}}test_i64_slt:
+; SI: v_cmp_lt_i64
 define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp slt i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -27,8 +27,8 @@
   ret void
 }
 
-; SI-LABEL: @test_i64_ult:
-; SI: V_CMP_LT_U64
+; SI-LABEL: {{^}}test_i64_ult:
+; SI: v_cmp_lt_u64
 define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ult i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -36,8 +36,8 @@
   ret void
 }
 
-; SI-LABEL: @test_i64_sle:
-; SI: V_CMP_LE_I64
+; SI-LABEL: {{^}}test_i64_sle:
+; SI: v_cmp_le_i64
 define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sle i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -45,8 +45,8 @@
   ret void
 }
 
-; SI-LABEL: @test_i64_ule:
-; SI: V_CMP_LE_U64
+; SI-LABEL: {{^}}test_i64_ule:
+; SI: v_cmp_le_u64
 define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ule i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -54,8 +54,8 @@
   ret void
 }
 
-; SI-LABEL: @test_i64_sgt:
-; SI: V_CMP_GT_I64
+; SI-LABEL: {{^}}test_i64_sgt:
+; SI: v_cmp_gt_i64
 define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sgt i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -63,8 +63,8 @@
   ret void
 }
 
-; SI-LABEL: @test_i64_ugt:
-; SI: V_CMP_GT_U64
+; SI-LABEL: {{^}}test_i64_ugt:
+; SI: v_cmp_gt_u64
 define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -72,8 +72,8 @@
   ret void
 }
 
-; SI-LABEL: @test_i64_sge:
-; SI: V_CMP_GE_I64
+; SI-LABEL: {{^}}test_i64_sge:
+; SI: v_cmp_ge_i64
 define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sge i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -81,8 +81,8 @@
   ret void
 }
 
-; SI-LABEL: @test_i64_uge:
-; SI: V_CMP_GE_U64
+; SI-LABEL: {{^}}test_i64_uge:
+; SI: v_cmp_ge_u64
 define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp uge i64 %a, %b
   %result = sext i1 %cmp to i32

diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
index b047315..1fcaf29 100644
--- a/test/CodeGen/R600/imm.ll
+++ b/test/CodeGen/R600/imm.ll

@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
-; CHECK: @i64_imm_inline_lo
-; CHECK: S_MOV_B32 [[LO:s[0-9]+]], 5
-; CHECK: V_MOV_B32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
-; CHECK: BUFFER_STORE_DWORDX2 v{{\[}}[[LO_VGPR]]:
+; CHECK-LABEL: {{^}}i64_imm_inline_lo:
+; CHECK: s_mov_b32 [[LO:s[0-9]+]], 5
+; CHECK: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]:
 define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
 entry:
   store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
@@ -12,12 +12,204 @@
 }
 
 ; Use a 64-bit value with hi bits that can be represented as an inline constant
-; CHECK: @i64_imm_inline_hi
-; CHECK: S_MOV_B32 [[HI:s[0-9]+]], 5
-; CHECK: V_MOV_B32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
-; CHECK: BUFFER_STORE_DWORDX2 v{{\[[0-9]+:}}[[HI_VGPR]]
+; CHECK-LABEL: {{^}}i64_imm_inline_hi:
+; CHECK: s_mov_b32 [[HI:s[0-9]+]], 5
+; CHECK: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
+; CHECK: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]]
 define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
 entry:
   store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
   ret void
 }
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
+  store float 0.0, float addrspace(1)* %out ; 0.0 is expected to be moved as the bare operand 0, with no literal
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_0.5_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
+  store float 0.5, float addrspace(1)* %out ; 0.5 is expected directly as the v_mov_b32 operand, not as a hex literal
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_0.5_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
+  store float -0.5, float addrspace(1)* %out ; -0.5 is expected directly as the v_mov_b32 operand, not as a hex literal
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_1.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
+  store float 1.0, float addrspace(1)* %out ; 1.0 is expected directly as the v_mov_b32 operand, not as a hex literal
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_1.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
+  store float -1.0, float addrspace(1)* %out ; -1.0 is expected directly as the v_mov_b32 operand, not as a hex literal
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_2.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
+  store float 2.0, float addrspace(1)* %out ; 2.0 is expected directly as the v_mov_b32 operand, not as a hex literal
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_2.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
+  store float -2.0, float addrspace(1)* %out ; -2.0 is expected directly as the v_mov_b32 operand, not as a hex literal
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_4.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
+  store float 4.0, float addrspace(1)* %out ; 4.0 is expected directly as the v_mov_b32 operand, not as a hex literal
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_inline_imm_m_4.0_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
+  store float -4.0, float addrspace(1)* %out ; -4.0 is expected directly as the v_mov_b32 operand, not as a hex literal
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_literal_imm_f32:
+; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @store_literal_imm_f32(float addrspace(1)* %out) {
+  store float 4096.0, float addrspace(1)* %out ; 4096.0 is 0x45800000 as an IEEE-754 single, the hex literal expected above
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0.0 ; 0.0 is expected as a direct operand of v_add_f32_e64, next to the s register holding %x
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_0.5_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 0.5 ; 0.5 is expected as a direct operand of v_add_f32_e64, next to the s register holding %x
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_0.5_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, -0.5 ; -0.5 is expected as a direct operand of v_add_f32_e64, next to the s register holding %x
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_1.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 1.0 ; 1.0 is expected as a direct operand of v_add_f32_e64, next to the s register holding %x
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_1.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, -1.0 ; -1.0 is expected as a direct operand of v_add_f32_e64, next to the s register holding %x
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_2.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 2.0 ; 2.0 is expected as a direct operand of v_add_f32_e64, next to the s register holding %x
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_2.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, -2.0 ; -2.0 is expected as a direct operand of v_add_f32_e64, next to the s register holding %x
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_4.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, 4.0 ; 4.0 is expected as a direct operand of v_add_f32_e64, next to the s register holding %x
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}add_inline_imm_neg_4.0_f32:
+; CHECK: s_load_dword [[VAL:s[0-9]+]]
+; CHECK: v_add_f32_e64 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
+  %y = fadd float %x, -4.0 ; -4.0 is expected as a direct operand of v_add_f32_e64, next to the s register holding %x
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}commute_add_inline_imm_0.5_f32:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %x = load float addrspace(1)* %in ; %x is expected in a v register, loaded with buffer_load_dword
+  %y = fadd float %x, 0.5 ; the constant is written second here but expected as the first source operand of the e32 form
+  store float %y, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}commute_add_literal_f32:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]]
+; CHECK-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %x = load float addrspace(1)* %in ; %x is expected in a v register, loaded with buffer_load_dword
+  %y = fadd float %x, 1024.0 ; 1024.0 is 0x44800000 as an IEEE-754 single, expected as the first source operand
+  store float %y, float addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/indirect-addressing-si.ll b/test/CodeGen/R600/indirect-addressing-si.ll
index 169d69b..0ba1614 100644
--- a/test/CodeGen/R600/indirect-addressing-si.ll
+++ b/test/CodeGen/R600/indirect-addressing-si.ll

@@ -4,8 +4,8 @@
 ; indexing of vectors.
 
 ; CHECK: extract_w_offset
-; CHECK: S_MOV_B32 m0
-; CHECK-NEXT: V_MOVRELS_B32_e32
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
 define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = add i32 %in, 1
@@ -15,8 +15,8 @@
 }
 
 ; CHECK: extract_wo_offset
-; CHECK: S_MOV_B32 m0
-; CHECK-NEXT: V_MOVRELS_B32_e32
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movrels_b32_e32
 define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
@@ -25,8 +25,8 @@
 }
 
 ; CHECK: insert_w_offset
-; CHECK: S_MOV_B32 m0
-; CHECK-NEXT: V_MOVRELD_B32_e32
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
 define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = add i32 %in, 1
@@ -37,8 +37,8 @@
 }
 
 ; CHECK: insert_wo_offset
-; CHECK: S_MOV_B32 m0
-; CHECK-NEXT: V_MOVRELD_B32_e32
+; CHECK: s_mov_b32 m0
+; CHECK-NEXT: v_movreld_b32_e32
 define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in

diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll
index b127b7e..e0a6ce1 100644
--- a/test/CodeGen/R600/indirect-private-64.ll
+++ b/test/CodeGen/R600/indirect-private-64.ll

@@ -1,10 +1,16 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+
 
 declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
 
-; SI-LABEL: @private_access_f64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
+; SI-LABEL: {{^}}private_access_f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load double addrspace(1)* %in, align 8
   %array = alloca double, i32 16, align 8
@@ -16,11 +22,19 @@
   ret void
 }
 
-; SI-LABEL: @private_access_v2f64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
-; SI: DS_READ_B64
+; SI-LABEL: {{^}}private_access_v2f64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx4
+; SI-ALLOCA: buffer_load_dwordx4
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load <2 x double> addrspace(1)* %in, align 16
   %array = alloca <2 x double>, i32 16, align 16
@@ -32,9 +46,13 @@
   ret void
 }
 
-; SI-LABEL: @private_access_i64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
+; SI-LABEL: {{^}}private_access_i64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx2
+; SI-ALLOCA: buffer_load_dwordx2
+
+; SI-PROMOTE: ds_write_b64
+; SI-PROMOTE: ds_read_b64
 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load i64 addrspace(1)* %in, align 8
   %array = alloca i64, i32 16, align 8
@@ -46,11 +64,19 @@
   ret void
 }
 
-; SI-LABEL: @private_access_v2i64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
-; SI: DS_READ_B64
+; SI-LABEL: {{^}}private_access_v2i64_alloca:
+
+; SI-ALLOCA: buffer_store_dwordx4
+; SI-ALLOCA: buffer_load_dwordx4
+
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load <2 x i64> addrspace(1)* %in, align 16
   %array = alloca <2 x i64>, i32 16, align 16

diff --git a/test/CodeGen/R600/infinite-loop.ll b/test/CodeGen/R600/infinite-loop.ll
index 68ffaae..48edab0 100644
--- a/test/CodeGen/R600/infinite-loop.ll
+++ b/test/CodeGen/R600/infinite-loop.ll

@@ -1,11 +1,11 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @infinite_loop:
-; SI: V_MOV_B32_e32 [[REG:v[0-9]+]], 0x3e7
+; SI-LABEL: {{^}}infinite_loop:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
 ; SI: BB0_1:
-; SI: BUFFER_STORE_DWORD [[REG]]
-; SI: S_WAITCNT vmcnt(0) expcnt(0)
-; SI: S_BRANCH BB0_1
+; SI: buffer_store_dword [[REG]]
+; SI: s_waitcnt vmcnt(0) expcnt(0)
+; SI: s_branch BB0_1
 define void @infinite_loop(i32 addrspace(1)* %out) {
 entry:
   br label %for.body

diff --git a/test/CodeGen/R600/inline-calls.ll b/test/CodeGen/R600/inline-calls.ll
new file mode 100644
index 0000000..3bceeca
--- /dev/null
+++ b/test/CodeGen/R600/inline-calls.ll

@@ -0,0 +1,24 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck  %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-NOT: {{^}}func:
+define internal fastcc i32 @func(i32 %a) {
+entry:
+  %tmp0 = add i32 %a, 1
+  ret i32 %tmp0
+}
+
+; CHECK: {{^}}kernel:
+define void @kernel(i32 addrspace(1)* %out) {
+entry:
+  %tmp0 = call i32 @func(i32 1)
+  store i32 %tmp0, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK: {{^}}kernel2:
+define void @kernel2(i32 addrspace(1)* %out) {
+entry:
+  call void @kernel(i32 addrspace(1)* %out)
+  ret void
+}

diff --git a/test/CodeGen/R600/input-mods.ll b/test/CodeGen/R600/input-mods.ll
index 13bfbab..e3e9499 100644
--- a/test/CodeGen/R600/input-mods.ll
+++ b/test/CodeGen/R600/input-mods.ll

@@ -1,9 +1,9 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
 ;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
 
-;EG-CHECK-LABEL: @test
+;EG-CHECK-LABEL: {{^}}test:
 ;EG-CHECK: EXP_IEEE *
-;CM-CHECK-LABEL: @test
+;CM-CHECK-LABEL: {{^}}test:
 ;CM-CHECK: EXP_IEEE T{{[0-9]+}}.X, -|T{{[0-9]+}}.X|
 ;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Y (MASKED), -|T{{[0-9]+}}.X|
 ;CM-CHECK: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X|

diff --git a/test/CodeGen/R600/insert_subreg.ll b/test/CodeGen/R600/insert_subreg.ll
new file mode 100644
index 0000000..e311e19
--- /dev/null
+++ b/test/CodeGen/R600/insert_subreg.ll

@@ -0,0 +1,15 @@
+; RUN: llc -march=r600 -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck %s
+
+; Test that INSERT_SUBREG instructions don't have non-register operands after
+; instruction selection.
+
+; Make sure this doesn't crash and that the kernel label is still emitted.
+; CHECK-LABEL: {{^}}test:
+define void @test(i64 addrspace(1)* %out) {
+entry:
+  %tmp0 = alloca [16 x i32]
+  %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32
+  %tmp2 = sext i32 %tmp1 to i64
+  store i64 %tmp2, i64 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll
index 43b4efc..857c414 100644
--- a/test/CodeGen/R600/insert_vector_elt.ll
+++ b/test/CodeGen/R600/insert_vector_elt.ll

@@ -8,116 +8,116 @@
 ; FIXME: Why is the constant moved into the intermediate register and
 ; not just directly into the vector component?
 
-; SI-LABEL: @insertelement_v4f32_0:
-; S_LOAD_DWORDX4 s{{[}}[[LOW_REG:[0-9]+]]:
-; V_MOV_B32_e32
-; V_MOV_B32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
-; V_MOV_B32_e32 v[[LOW_REG]], [[CONSTREG]]
-; BUFFER_STORE_DWORDX4 v{{[}}[[LOW_REG]]:
+; SI-LABEL: {{^}}insertelement_v4f32_0:
+; s_load_dwordx4 s{{[}}[[LOW_REG:[0-9]+]]:
+; v_mov_b32_e32
+; v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 5.000000e+00
+; v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]]
+; buffer_store_dwordx4 v{{[}}[[LOW_REG]]:
 define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @insertelement_v4f32_1:
+; SI-LABEL: {{^}}insertelement_v4f32_1:
 define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @insertelement_v4f32_2:
+; SI-LABEL: {{^}}insertelement_v4f32_2:
 define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @insertelement_v4f32_3:
+; SI-LABEL: {{^}}insertelement_v4f32_3:
 define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @insertelement_v4i32_0:
+; SI-LABEL: {{^}}insertelement_v4i32_0:
 define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 999, i32 0
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v2f32:
-; SI: V_MOV_B32_e32 [[CONST:v[0-9]+]], 5.000000e+00
-; SI: V_MOVRELD_B32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; SI: BUFFER_STORE_DWORDX2 {{v\[}}[[LOW_RESULT_REG]]:
+; SI-LABEL: {{^}}dynamic_insertelement_v2f32:
+; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; SI: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
   store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v4f32:
-; SI: V_MOV_B32_e32 [[CONST:v[0-9]+]], 5.000000e+00
-; SI: V_MOVRELD_B32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; SI: BUFFER_STORE_DWORDX4 {{v\[}}[[LOW_RESULT_REG]]:
+; SI-LABEL: {{^}}dynamic_insertelement_v4f32:
+; SI: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; SI: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; SI: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
 define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v8f32:
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v8f32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
   store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v16f32:
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v16f32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
   store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v2i32:
-; SI: BUFFER_STORE_DWORDX2
+; SI-LABEL: {{^}}dynamic_insertelement_v2i32:
+; SI: buffer_store_dwordx2
 define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
   store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v4i32:
-; SI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v4i32:
+; SI: buffer_store_dwordx4
 define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 5, i32 %b
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v8i32:
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v8i32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
   store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v16i32:
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v16i32:
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
   store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
@@ -125,16 +125,16 @@
 }
 
 
-; SI-LABEL: @dynamic_insertelement_v2i16:
-; FIXMESI: BUFFER_STORE_DWORDX2
+; SI-LABEL: {{^}}dynamic_insertelement_v2i16:
+; FIXMESI: buffer_store_dwordx2
 define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v4i16:
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v4i16:
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
   store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 16
@@ -142,7 +142,7 @@
 }
 
 
-; SI-LABEL: @dynamic_insertelement_v2i8:
+; SI-LABEL: {{^}}dynamic_insertelement_v2i8:
 ; FIXMESI: BUFFER_STORE_USHORT
 define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
@@ -150,24 +150,24 @@
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v4i8:
-; FIXMESI: BUFFER_STORE_DWORD
+; SI-LABEL: {{^}}dynamic_insertelement_v4i8:
+; FIXMESI: buffer_store_dword
 define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
   store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v8i8:
-; FIXMESI: BUFFER_STORE_DWORDX2
+; SI-LABEL: {{^}}dynamic_insertelement_v8i8:
+; FIXMESI: buffer_store_dwordx2
 define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
   store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @dynamic_insertelement_v16i8:
-; FIXMESI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}dynamic_insertelement_v16i8:
+; FIXMESI: buffer_store_dwordx4
 define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
   store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
@@ -176,7 +176,7 @@
 
 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
 ; the compiler doesn't crash.
-; SI-LABEL: @insert_split_bb
+; SI-LABEL: {{^}}insert_split_bb:
 define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
 entry:
   %0 = insertelement <2 x i32> undef, i32 %a, i32 0
@@ -199,3 +199,53 @@
   store <2 x i32> %7, <2 x i32> addrspace(1)* %out
   ret void
 }
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2f64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
+  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
+  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v2i64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
+  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
+  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v4f64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
+  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
+  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}dynamic_insertelement_v8f64:
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
+  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
+  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
+  ret void
+}

diff --git a/test/CodeGen/R600/insert_vector_elt_f64.ll b/test/CodeGen/R600/insert_vector_elt_f64.ll
deleted file mode 100644
index 595bc59..0000000
--- a/test/CodeGen/R600/insert_vector_elt_f64.ll
+++ /dev/null

@@ -1,36 +0,0 @@
-; REQUIRES: asserts
-; XFAIL: *
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-
-
-; SI-LABEL: @dynamic_insertelement_v2f64:
-; SI: BUFFER_STORE_DWORDX4
-define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
-  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
-  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
-  ret void
-}
-
-; SI-LABEL: @dynamic_insertelement_v2f64:
-; SI: BUFFER_STORE_DWORDX4
-define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
-  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
-  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
-  ret void
-}
-
-; SI-LABEL: @dynamic_insertelement_v4f64:
-; SI: BUFFER_STORE_DWORDX4
-define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
-  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
-  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
-  ret void
-}
-
-; SI-LABEL: @dynamic_insertelement_v8f64:
-; SI: BUFFER_STORE_DWORDX4
-define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
-  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
-  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
-  ret void
-}

diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll
index 0baa3cd..27840b2 100644
--- a/test/CodeGen/R600/kcache-fold.ll
+++ b/test/CodeGen/R600/kcache-fold.ll

@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @main1
+; CHECK: {{^}}main1:
 ; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
 define void @main1() {
 main_body:
@@ -48,7 +48,7 @@
   ret void
 }
 
-; CHECK: @main2
+; CHECK: {{^}}main2:
 ; CHECK-NOT: MOV
 define void @main2() {
 main_body:

diff --git a/test/CodeGen/R600/kernel-args.ll b/test/CodeGen/R600/kernel-args.ll
index 6fc6979..9a7da90 100644
--- a/test/CodeGen/R600/kernel-args.ll
+++ b/test/CodeGen/R600/kernel-args.ll

@@ -2,10 +2,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; EG-CHECK-LABEL: @i8_arg
+; EG-CHECK-LABEL: {{^}}i8_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}i8_arg:
+; SI-CHECK: buffer_load_ubyte
 
 define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
 entry:
@@ -14,10 +14,10 @@
   ret void
 }
 
-; EG-CHECK-LABEL: @i8_zext_arg
+; EG-CHECK-LABEL: {{^}}i8_zext_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i8_zext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i8_zext_arg:
+; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 
 define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
 entry:
@@ -26,10 +26,10 @@
   ret void
 }
 
-; EG-CHECK-LABEL: @i8_sext_arg
+; EG-CHECK-LABEL: {{^}}i8_sext_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i8_sext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i8_sext_arg:
+; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 
 define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
 entry:
@@ -38,10 +38,10 @@
   ret void
 }
 
-; EG-CHECK-LABEL: @i16_arg
+; EG-CHECK-LABEL: {{^}}i16_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i16_arg
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}i16_arg:
+; SI-CHECK: buffer_load_ushort
 
 define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
 entry:
@@ -50,10 +50,10 @@
   ret void
 }
 
-; EG-CHECK-LABEL: @i16_zext_arg
+; EG-CHECK-LABEL: {{^}}i16_zext_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i16_zext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i16_zext_arg:
+; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 
 define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
 entry:
@@ -62,10 +62,10 @@
   ret void
 }
 
-; EG-CHECK-LABEL: @i16_sext_arg
+; EG-CHECK-LABEL: {{^}}i16_sext_arg:
 ; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i16_sext_arg
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i16_sext_arg:
+; SI-CHECK: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 
 define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
 entry:
@@ -74,176 +74,176 @@
   ret void
 }
 
-; EG-CHECK-LABEL: @i32_arg
+; EG-CHECK-LABEL: {{^}}i32_arg:
 ; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @i32_arg
-; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}i32_arg:
+; s_load_dword s{{[0-9]}}, s[0:1], 0xb
 define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
 entry:
   store i32 %in, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @f32_arg
+; EG-CHECK-LABEL: {{^}}f32_arg:
 ; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; SI-CHECK-LABEL: @f32_arg
-; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}f32_arg:
+; s_load_dword s{{[0-9]}}, s[0:1], 0xb
 define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
 entry:
   store float %in, float addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v2i8_arg
+; EG-CHECK-LABEL: {{^}}v2i8_arg:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @v2i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}v2i8_arg:
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v2i16_arg
+; EG-CHECK-LABEL: {{^}}v2i16_arg:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @v2i16_arg
-; SI-CHECK-DAG: BUFFER_LOAD_USHORT
-; SI-CHECK-DAG: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}v2i16_arg:
+; SI-CHECK-DAG: buffer_load_ushort
+; SI-CHECK-DAG: buffer_load_ushort
 define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v2i32_arg
+; EG-CHECK-LABEL: {{^}}v2i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
-; SI-CHECK-LABEL: @v2i32_arg
-; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}v2i32_arg:
+; SI-CHECK: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v2f32_arg
+; EG-CHECK-LABEL: {{^}}v2f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
-; SI-CHECK-LABEL: @v2f32_arg
-; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
+; SI-CHECK-LABEL: {{^}}v2f32_arg:
+; SI-CHECK: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
 entry:
   store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v3i8_arg
+; EG-CHECK-LABEL: {{^}}v3i8_arg:
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; SI-CHECK-LABEL: @v3i8_arg
+; SI-CHECK-LABEL: {{^}}v3i8_arg:
 define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
   store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v3i16_arg
+; EG-CHECK-LABEL: {{^}}v3i16_arg:
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; SI-CHECK-LABEL: @v3i16_arg
+; SI-CHECK-LABEL: {{^}}v3i16_arg:
 define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
   ret void
 }
-; EG-CHECK-LABEL: @v3i32_arg
+; EG-CHECK-LABEL: {{^}}v3i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; SI-CHECK-LABEL: @v3i32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; SI-CHECK-LABEL: {{^}}v3i32_arg:
+; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
 entry:
   store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v3f32_arg
+; EG-CHECK-LABEL: {{^}}v3f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
-; SI-CHECK-LABEL: @v3f32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
+; SI-CHECK-LABEL: {{^}}v3f32_arg:
+; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
 entry:
   store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v4i8_arg
+; EG-CHECK-LABEL: {{^}}v4i8_arg:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @v4i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}v4i8_arg:
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v4i16_arg
+; EG-CHECK-LABEL: {{^}}v4i16_arg:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @v4i16_arg
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}v4i16_arg:
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v4i32_arg
+; EG-CHECK-LABEL: {{^}}v4i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
-; SI-CHECK-LABEL: @v4i32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; SI-CHECK-LABEL: {{^}}v4i32_arg:
+; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v4f32_arg
+; EG-CHECK-LABEL: {{^}}v4f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
-; SI-CHECK-LABEL: @v4f32_arg
-; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
+; SI-CHECK-LABEL: {{^}}v4f32_arg:
+; SI-CHECK: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v8i8_arg
+; EG-CHECK-LABEL: {{^}}v8i8_arg:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
@@ -252,21 +252,21 @@
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @v8i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}v8i8_arg:
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v8i16_arg
+; EG-CHECK-LABEL: {{^}}v8i16_arg:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
@@ -275,22 +275,22 @@
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @v8i16_arg
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}v8i16_arg:
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
 entry:
   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v8i32_arg
+; EG-CHECK-LABEL: {{^}}v8i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -299,15 +299,15 @@
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI-CHECK-LABEL: @v8i32_arg
-; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; SI-CHECK-LABEL: {{^}}v8i32_arg:
+; SI-CHECK: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
 define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
 entry:
   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v8f32_arg
+; EG-CHECK-LABEL: {{^}}v8f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
@@ -316,15 +316,15 @@
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI-CHECK-LABEL: @v8f32_arg
-; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; SI-CHECK-LABEL: {{^}}v8f32_arg:
+; SI-CHECK: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
 define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
 entry:
   store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v16i8_arg
+; EG-CHECK-LABEL: {{^}}v16i8_arg:
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
@@ -341,30 +341,30 @@
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
 ; EG-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @v16i8_arg
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK-LABEL: {{^}}v16i8_arg:
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v16i16_arg
+; EG-CHECK-LABEL: {{^}}v16i16_arg:
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
@@ -381,30 +381,30 @@
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
 ; EG-CHECK: VTX_READ_16
-; SI-CHECK-LABEL: @v16i16_arg
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK-LABEL: {{^}}v16i16_arg:
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
 entry:
   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @v16i32_arg
+; EG-CHECK-LABEL: {{^}}v16i32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
@@ -421,15 +421,15 @@
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI-CHECK-LABEL: @v16i32_arg
-; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-CHECK-LABEL: {{^}}v16i32_arg:
+; SI-CHECK: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
 define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
 entry:
   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
   ret void
 }
 
-; EG-CHECK-LABEL: @v16f32_arg
+; EG-CHECK-LABEL: {{^}}v16f32_arg:
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
@@ -446,10 +446,28 @@
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
 ; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI-CHECK-LABEL: @v16f32_arg
-; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
+; SI-CHECK-LABEL: {{^}}v16f32_arg:
+; SI-CHECK: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
 define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
 entry:
   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: {{^}}kernel_arg_i64:
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: buffer_store_dwordx2
+define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+  store i64 %a, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
+; XSI: s_load_dwordx2
+; XSI: s_load_dwordx2
+; XSI: buffer_store_dwordx2
+; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
+;   ret void
+; }

diff --git a/test/CodeGen/R600/large-constant-initializer.ll b/test/CodeGen/R600/large-constant-initializer.ll
index 552cd05..5612dd3 100644
--- a/test/CodeGen/R600/large-constant-initializer.ll
+++ b/test/CodeGen/R600/large-constant-initializer.ll

@@ -1,6 +1,6 @@
-; XFAIL: *
-; REQUIRES: asserts
-; RUN: llc -march=r600 -mcpu=SI < %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+; llc's output is piped into FileCheck; without the pipe the directive below is never evaluated.
+; CHECK: s_endpgm
 
 @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
 

diff --git a/test/CodeGen/R600/lds-initializer.ll b/test/CodeGen/R600/lds-initializer.ll
new file mode 100644
index 0000000..91d5d12
--- /dev/null
+++ b/test/CodeGen/R600/lds-initializer.ll

@@ -0,0 +1,13 @@
+; RUN: not llc -march=r600 -mcpu=SI < %s 2>&1 | FileCheck %s
+
+; llc is expected to fail (`not llc`) with the diagnostic below for an addrspace(3) global that has a non-zero initializer.
+; CHECK: error: unsupported initializer for address space in load_init_lds_global
+
+@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
+
+define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
+  %gep = getelementptr [8 x i32] addrspace(3)* @lds, i32 0, i32 5
+  %ld = load i32 addrspace(3)* %gep
+  store i32 %ld, i32 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/lds-oqap-crash.ll b/test/CodeGen/R600/lds-oqap-crash.ll
index 7959150..fbcd778 100644
--- a/test/CodeGen/R600/lds-oqap-crash.ll
+++ b/test/CodeGen/R600/lds-oqap-crash.ll

@@ -9,7 +9,7 @@
 ; because the LDS instructions are pseudo instructions and the OQAP
 ; reads and writes are bundled together in the same instruction.
 
-; CHECK: @lds_crash
+; CHECK: {{^}}lds_crash:
 define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = load i32 addrspace(3)* %in

diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll
index d5dc061..cda75b0 100644
--- a/test/CodeGen/R600/lds-output-queue.ll
+++ b/test/CodeGen/R600/lds-output-queue.ll

@@ -3,12 +3,12 @@
-; This test checks that the lds input queue will is empty at the end of
+; This test checks that the lds input queue is empty at the end of
 ; the ALU clause.
 
-; CHECK-LABEL: @lds_input_queue
+; CHECK-LABEL: {{^}}lds_input_queue:
 ; CHECK: LDS_READ_RET * OQAP
 ; CHECK-NOT: ALU clause
 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
 
-@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] [i32 1, i32 2], align 4
+@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
 
 define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
 entry:
@@ -84,7 +84,7 @@
-; analysis, we should be able to keep these instructions sparate before
+; analysis, we should be able to keep these instructions separate before
 ; scheduling.
 ;
-; CHECK-LABEL: @local_global_alias
+; CHECK-LABEL: {{^}}local_global_alias:
 ; CHECK: LDS_READ_RET
 ; CHECK-NOT: ALU clause
 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP

diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll
index 9182e25..5287723 100644
--- a/test/CodeGen/R600/lds-size.ll
+++ b/test/CodeGen/R600/lds-size.ll

@@ -3,10 +3,10 @@
 ; This test makes sure we do not double count global values when they are
 ; used in different basic blocks.
 
-; CHECK-LABEL: @test
+; CHECK-LABEL: {{^}}test:
 ; CHECK: .long   166120
 ; CHECK-NEXT: .long   1
-@lds = internal unnamed_addr addrspace(3) global i32 zeroinitializer, align 4
+@lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
 
 define void @test(i32 addrspace(1)* %out, i32 %cond) {
 entry:

diff --git a/test/CodeGen/R600/lds-zero-initializer.ll b/test/CodeGen/R600/lds-zero-initializer.ll
new file mode 100644
index 0000000..23912a9
--- /dev/null
+++ b/test/CodeGen/R600/lds-zero-initializer.ll

@@ -0,0 +1,13 @@
+; RUN: not llc -march=r600 -mcpu=SI < %s 2>&1 | FileCheck %s
+
+; llc is expected to fail (`not llc`) with the diagnostic below even when the addrspace(3) global is only zero-initialized.
+; CHECK: error: unsupported initializer for address space in load_zeroinit_lds_global
+
+@lds = addrspace(3) global [256 x i32] zeroinitializer
+
+define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
+  %gep = getelementptr [256 x i32] addrspace(3)* @lds, i32 0, i32 10
+  %ld = load i32 addrspace(3)* %gep
+  store i32 %ld, i32 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
index 1aae7f9..b9fa8e9 100644
--- a/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll
+++ b/test/CodeGen/R600/legalizedag-bug-expand-setcc.ll

@@ -8,7 +8,7 @@
 ; instructions, when only one is needed.
 ;
 
-; CHECK: @setcc_expand
+; CHECK: {{^}}setcc_expand:
 ; CHECK: SET
 ; CHECK-NOT: CND
 define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {

diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
index 47191e0..cff1c24 100644
--- a/test/CodeGen/R600/literals.ll
+++ b/test/CodeGen/R600/literals.ll

@@ -6,7 +6,7 @@
 ; or
 ; ADD_INT literal.x KC0[2].Z, 5
 
-; CHECK: @i32_literal
+; CHECK: {{^}}i32_literal:
 ; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5
@@ -23,7 +23,7 @@
 ; or
 ; ADD literal.x KC0[2].Z, 5.0
 
-; CHECK: @float_literal
+; CHECK: {{^}}float_literal:
 ; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.0
@@ -35,7 +35,7 @@
 }
 
 ; Make sure inline literals are folded into REG_SEQUENCE instructions.
-; CHECK: @inline_literal_reg_sequence
+; CHECK: {{^}}inline_literal_reg_sequence:
 ; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0
@@ -47,7 +47,7 @@
   ret void
 }
 
-; CHECK: @inline_literal_dot4
+; CHECK: {{^}}inline_literal_dot4:
 ; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0
 ; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0
 ; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0

diff --git a/test/CodeGen/R600/llvm.AMDGPU.abs.ll b/test/CodeGen/R600/llvm.AMDGPU.abs.ll
index a0a47b7..b4aede8 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.abs.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.abs.ll

@@ -6,10 +6,10 @@
 ; Legacy name
 declare i32 @llvm.AMDIL.abs.i32(i32) nounwind readnone
 
-; FUNC-LABEL: @s_abs_i32
-; SI: S_SUB_I32
-; SI: S_MAX_I32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_abs_i32:
+; SI: s_sub_i32
+; SI: s_max_i32
+; SI: s_endpgm
 
 ; EG: SUB_INT
 ; EG: MAX_INT
@@ -19,10 +19,10 @@
   ret void
 }
 
-; FUNC-LABEL: @v_abs_i32
-; SI: V_SUB_I32_e32
-; SI: V_MAX_I32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_abs_i32:
+; SI: v_sub_i32_e32
+; SI: v_max_i32_e32
+; SI: s_endpgm
 
 ; EG: SUB_INT
 ; EG: MAX_INT
@@ -33,10 +33,10 @@
   ret void
 }
 
-; FUNC-LABEL: @abs_i32_legacy_amdil
-; SI: V_SUB_I32_e32
-; SI: V_MAX_I32_e32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}abs_i32_legacy_amdil:
+; SI: v_sub_i32_e32
+; SI: v_max_i32_e32
+; SI: s_endpgm
 
 ; EG: SUB_INT
 ; EG: MAX_INT

diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll
new file mode 100644
index 0000000..98f6695
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.global.ll

@@ -0,0 +1,29 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; Both targets must emit their barrier instruction for the call to llvm.AMDGPU.barrier.global.
+; FUNC-LABEL: {{^}}test_barrier_global:
+; EG: GROUP_BARRIER
+; SI: s_barrier
+
+define void @test_barrier_global(i32 addrspace(1)* %out) {
+entry:
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %own.ptr = getelementptr i32 addrspace(1)* %out, i32 %tid
+  store i32 %tid, i32 addrspace(1)* %own.ptr
+  call void @llvm.AMDGPU.barrier.global()
+  %size = call i32 @llvm.r600.read.local.size.x()
+  %last = sub i32 %size, 1
+  %mirror = sub i32 %last, %tid
+  %mirror.ptr = getelementptr i32 addrspace(1)* %out, i32 %mirror
+  %val = load i32 addrspace(1)* %mirror.ptr
+  store i32 %val, i32 addrspace(1)* %own.ptr
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.global()
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.r600.read.local.size.x() #0
+
+attributes #0 = { readnone }

diff --git a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
index 8d3c9ca..92fe9f2 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.barrier.local.ll

@@ -1,8 +1,11 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-; CHECK: GROUP_BARRIER
+; FUNC-LABEL: {{^}}test_barrier_local:
+; EG: GROUP_BARRIER
+; SI: s_barrier
 
-define void @test(i32 addrspace(1)* %out) {
+define void @test_barrier_local(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x()
   %1 = getelementptr i32 addrspace(1)* %out, i32 %0
@@ -17,8 +20,9 @@
   ret void
 }
 
-declare i32 @llvm.r600.read.tidig.x() #0
 declare void @llvm.AMDGPU.barrier.local()
+
+declare i32 @llvm.r600.read.tidig.x() #0
 declare i32 @llvm.r600.read.local.size.x() #0
 
 attributes #0 = { readnone }

diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
index eb50942..0b60d0d 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.i32.ll

@@ -3,8 +3,8 @@
 
 declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfe_i32_arg_arg_arg
-; SI: V_BFE_I32
+; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg:
+; SI: v_bfe_i32
 ; EG: BFE_INT
 ; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac
 define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
@@ -13,8 +13,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_arg_arg_imm
-; SI: V_BFE_I32
+; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm:
+; SI: v_bfe_i32
 ; EG: BFE_INT
 define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone
@@ -22,8 +22,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_arg_imm_arg
-; SI: V_BFE_I32
+; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg:
+; SI: v_bfe_i32
 ; EG: BFE_INT
 define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone
@@ -31,8 +31,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_imm_arg_arg
-; SI: V_BFE_I32
+; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg:
+; SI: v_bfe_i32
 ; EG: BFE_INT
 define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone
@@ -40,8 +40,8 @@
   ret void
 }
 
-; FUNC-LABEL: @v_bfe_print_arg
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
+; FUNC-LABEL: {{^}}v_bfe_print_arg:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
 define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind {
   %load = load i32 addrspace(1)* %src0, align 4
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone
@@ -49,9 +49,9 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_arg_0_width_reg_offset
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone
@@ -59,9 +59,9 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_arg_0_width_imm_offset
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone
@@ -69,10 +69,10 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_6
-; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_6:
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: s_endpgm
 define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -81,12 +81,12 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_7
-; SI-NOT: SHL
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_7:
+; SI-NOT: shl
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -96,10 +96,10 @@
 }
 
 ; FIXME: The shifts should be 1 BFE
-; FUNC-LABEL: @bfe_i32_test_8
-; SI: BUFFER_LOAD_DWORD
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_8:
+; SI: buffer_load_dword
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: s_endpgm
 define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -108,11 +108,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_9
-; SI-NOT: BFE
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1)
@@ -120,11 +120,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_10
-; SI-NOT: BFE
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31)
@@ -132,11 +132,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_11
-; SI-NOT: BFE
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24)
@@ -144,11 +144,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_12
-; SI-NOT: BFE
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8)
@@ -156,10 +156,10 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_13
-; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_13:
+; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = ashr i32 %x, 31
@@ -167,10 +167,10 @@
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
 }
 
-; FUNC-LABEL: @bfe_i32_test_14
-; SI-NOT: LSHR
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_test_14:
+; SI-NOT: lshr
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = lshr i32 %x, 31
@@ -178,11 +178,11 @@
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_0
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone
@@ -190,11 +190,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_1
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone
@@ -202,11 +202,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_2
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone
@@ -214,11 +214,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_3
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone
@@ -226,11 +226,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_4
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_4:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone
@@ -238,11 +238,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_5
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone
@@ -250,11 +250,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_6
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0xffffff80
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone
@@ -262,11 +262,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_7
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone
@@ -274,11 +274,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_8
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone
@@ -286,11 +286,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_9
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone
@@ -298,11 +298,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_10
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone
@@ -310,11 +310,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_11
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -6
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone
@@ -322,11 +322,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_12
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone
@@ -334,11 +334,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_13
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone
@@ -346,11 +346,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_14
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 40
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone
@@ -358,11 +358,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_15
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone
@@ -370,11 +370,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_16
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone
@@ -382,11 +382,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_17
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone
@@ -394,11 +394,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_i32_constant_fold_test_18
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
   %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone
@@ -408,14 +408,14 @@
 
 ; XXX - This should really be a single BFE, but the sext_inreg of the
 ; extended type i24 is never custom lowered.
-; FUNC-LABEL: @bfe_sext_in_reg_i24
-; SI: BUFFER_LOAD_DWORD [[LOAD:v[0-9]+]],
-; SI: V_LSHLREV_B32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
-; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
-; XSI: V_BFE_I32 [[BFE:v[0-9]+]], [[LOAD]], 0, 8
+; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24:
+; SI: buffer_load_dword [[LOAD:v[0-9]+]],
+; SI: v_lshlrev_b32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
+; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 8, {{v[0-9]+}}
+; XSI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 8
 ; XSI-NOT: SHL
 ; XSI-NOT: SHR
-; XSI: BUFFER_STORE_DWORD [[BFE]],
+; XSI: buffer_store_dword [[BFE]],
 define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24)
@@ -424,3 +424,18 @@
   store i32 %ashr, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: @simplify_demanded_bfe_sdiv
+; SI: buffer_load_dword [[LOAD:v[0-9]+]]
+; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16
+; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]]
+; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], [[TMP0]], [[BFE]]
+; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
+; SI: buffer_store_dword [[TMP2]]
+define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %src = load i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone
+  %div = sdiv i32 %bfe, 2
+  store i32 %div, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
index 1a62253..0794ac4 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfe.u32.ll

@@ -3,8 +3,8 @@
 
 declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfe_u32_arg_arg_arg
-; SI: V_BFE_U32
+; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg:
+; SI: v_bfe_u32
 ; EG: BFE_UINT
 define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
@@ -12,8 +12,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_arg_arg_imm
-; SI: V_BFE_U32
+; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm:
+; SI: v_bfe_u32
 ; EG: BFE_UINT
 define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone
@@ -21,8 +21,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_arg_imm_arg
-; SI: V_BFE_U32
+; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg:
+; SI: v_bfe_u32
 ; EG: BFE_UINT
 define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone
@@ -30,8 +30,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_imm_arg_arg
-; SI: V_BFE_U32
+; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg:
+; SI: v_bfe_u32
 ; EG: BFE_UINT
 define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone
@@ -39,9 +39,9 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_arg_0_width_reg_offset
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone
@@ -49,9 +49,9 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_arg_0_width_imm_offset
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone
@@ -59,10 +59,10 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zextload_i8
-; SI: BUFFER_LOAD_UBYTE
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zextload_i8:
+; SI: buffer_load_ubyte
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %load = load i8 addrspace(1)* %in
   %ext = zext i8 %load to i32
@@ -71,12 +71,12 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i8
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: V_AND_B32_e32
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -86,12 +86,12 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i16
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: V_AND_B32_e32
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -101,11 +101,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_1
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI: bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -115,12 +115,12 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_3
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: V_AND_B32_e32 {{v[0-9]+}}, 0xf8
-; SI-NEXT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8
+; SI-NEXT: bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -130,12 +130,12 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i8_offset_7
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: V_AND_B32_e32 {{v[0-9]+}}, 0x80
-; SI-NEXT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80
+; SI-NEXT: bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -145,11 +145,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_zext_in_reg_i16_offset_8
-; SI: BUFFER_LOAD_DWORD
-; SI: V_ADD_I32
-; SI-NEXT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8:
+; SI: buffer_load_dword
+; SI: v_add_i32
+; SI-NEXT: bfe
+; SI: s_endpgm
 define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 1
@@ -159,10 +159,10 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_1
-; SI: BUFFER_LOAD_DWORD
-; SI: V_AND_B32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_1:
+; SI: buffer_load_dword
+; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI: s_endpgm
 ; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1,
 define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
@@ -187,13 +187,13 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_4
-; SI-NOT: LSHL
-; SI-NOT: SHR
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_4:
+; SI-NOT: lshl
+; SI-NOT: shr
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -203,12 +203,12 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_5
-; SI: BUFFER_LOAD_DWORD
-; SI-NOT: LSHL
-; SI-NOT: SHR
-; SI: V_BFE_I32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_5:
+; SI: buffer_load_dword
+; SI-NOT: lshl
+; SI-NOT: shr
+; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
+; SI: s_endpgm
 define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -218,10 +218,10 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_6
-; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_6:
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: s_endpgm
 define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -230,10 +230,10 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_7
-; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_7:
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -242,11 +242,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_8
-; SI-NOT: BFE
-; SI: V_AND_B32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_8:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -255,11 +255,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_9
-; SI-NOT: BFE
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1)
@@ -267,11 +267,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_10
-; SI-NOT: BFE
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31)
@@ -279,11 +279,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_11
-; SI-NOT: BFE
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24)
@@ -291,11 +291,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_12
-; SI-NOT: BFE
-; SI: V_LSHRREV_B32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8)
@@ -303,10 +303,10 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_13
+; FUNC-LABEL: {{^}}bfe_u32_test_13:
 ; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = ashr i32 %x, 31
@@ -314,10 +314,10 @@
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
 }
 
-; FUNC-LABEL: @bfe_u32_test_14
-; SI-NOT: LSHR
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_test_14:
+; SI-NOT: lshr
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = lshr i32 %x, 31
@@ -325,11 +325,11 @@
   store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_0
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone
@@ -337,11 +337,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_1
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone
@@ -349,11 +349,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_2
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone
@@ -361,11 +361,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_3
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone
@@ -373,11 +373,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_4
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], -1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone
@@ -385,11 +385,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_5
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_5:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone
@@ -397,11 +397,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_6
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x80
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone
@@ -409,11 +409,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_7
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone
@@ -421,11 +421,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_8
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone
@@ -433,11 +433,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_9
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone
@@ -445,11 +445,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_10
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone
@@ -457,11 +457,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_11
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone
@@ -469,11 +469,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_12
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone
@@ -481,11 +481,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_13
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 1
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone
@@ -493,11 +493,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_14
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 40
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone
@@ -505,11 +505,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_15
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 10
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone
@@ -517,11 +517,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_16
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone
@@ -529,11 +529,11 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_17
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone
@@ -541,14 +541,36 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_u32_constant_fold_test_18
-; SI-NOT: BFE
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], 0
-; SI: BUFFER_STORE_DWORD [[VREG]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18:
+; SI-NOT: {{[^@]}}bfe
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; SI: buffer_store_dword [[VREG]],
+; SI: s_endpgm
 ; EG-NOT: BFE
 define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
   %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone
   store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; Make sure that SimplifyDemandedBits doesn't cause the and to be
+; reduced to the bits demanded by the bfe.
+
+; XXX: The operand to v_bfe_u32 could also just directly be the load register.
+; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg:
+; SI: buffer_load_dword [[ARG:v[0-9]+]]
+; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]]
+; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2
+; SI-DAG: buffer_store_dword [[AND]]
+; SI-DAG: buffer_store_dword [[BFE]]
+; SI: s_endpgm
+define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
+                                            i32 addrspace(1)* %out1,
+                                            i32 addrspace(1)* %in) nounwind {
+  %src = load i32 addrspace(1)* %in, align 4
+  %and = and i32 %src, 63
+  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone
+  store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
+  store i32 %and, i32 addrspace(1)* %out1, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
index e1de45b..df61b0b 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfi.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfi.ll

@@ -3,8 +3,8 @@
 
 declare i32 @llvm.AMDGPU.bfi(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfi_arg_arg_arg
-; SI: V_BFI_B32
+; FUNC-LABEL: {{^}}bfi_arg_arg_arg:
+; SI: v_bfi_b32
 ; EG: BFI_INT
 define void @bfi_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
   %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
@@ -12,8 +12,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfi_arg_arg_imm
-; SI: V_BFI_B32
+; FUNC-LABEL: {{^}}bfi_arg_arg_imm:
+; SI: v_bfi_b32
 ; EG: BFI_INT
 define void @bfi_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 %src1, i32 123) nounwind readnone
@@ -21,8 +21,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfi_arg_imm_arg
-; SI: V_BFI_B32
+; FUNC-LABEL: {{^}}bfi_arg_imm_arg:
+; SI: v_bfi_b32
 ; EG: BFI_INT
 define void @bfi_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
   %bfi = call i32 @llvm.AMDGPU.bfi(i32 %src0, i32 123, i32 %src2) nounwind readnone
@@ -30,8 +30,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfi_imm_arg_arg
-; SI: V_BFI_B32
+; FUNC-LABEL: {{^}}bfi_imm_arg_arg:
+; SI: v_bfi_b32
 ; EG: BFI_INT
 define void @bfi_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
   %bfi = call i32 @llvm.AMDGPU.bfi(i32 123, i32 %src1, i32 %src2) nounwind readnone

diff --git a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
index ef8721e..0ba4af5 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.bfm.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.bfm.ll

@@ -3,8 +3,8 @@
 
 declare i32 @llvm.AMDGPU.bfm(i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfm_arg_arg
-; SI: V_BFM
+; FUNC-LABEL: {{^}}bfm_arg_arg:
+; SI: v_bfm
 ; EG: BFM_INT
 define void @bfm_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 %src1) nounwind readnone
@@ -12,8 +12,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfm_arg_imm
-; SI: V_BFM
+; FUNC-LABEL: {{^}}bfm_arg_imm:
+; SI: v_bfm
 ; EG: BFM_INT
 define void @bfm_arg_imm(i32 addrspace(1)* %out, i32 %src0) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 %src0, i32 123) nounwind readnone
@@ -21,8 +21,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfm_imm_arg
-; SI: V_BFM
+; FUNC-LABEL: {{^}}bfm_imm_arg:
+; SI: v_bfm
 ; EG: BFM_INT
 define void @bfm_imm_arg(i32 addrspace(1)* %out, i32 %src1) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 %src1) nounwind readnone
@@ -30,8 +30,8 @@
   ret void
 }
 
-; FUNC-LABEL: @bfm_imm_imm
-; SI: V_BFM
+; FUNC-LABEL: {{^}}bfm_imm_imm:
+; SI: v_bfm
 ; EG: BFM_INT
 define void @bfm_imm_imm(i32 addrspace(1)* %out) nounwind {
   %bfm = call i32 @llvm.AMDGPU.bfm(i32 123, i32 456) nounwind readnone

diff --git a/test/CodeGen/R600/llvm.AMDGPU.brev.ll b/test/CodeGen/R600/llvm.AMDGPU.brev.ll
index 68a5ad0..647df34 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.brev.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.brev.ll

@@ -2,23 +2,23 @@
 
 declare i32 @llvm.AMDGPU.brev(i32) nounwind readnone
 
-; FUNC-LABEL: @s_brev_i32:
-; SI: S_LOAD_DWORD [[VAL:s[0-9]+]],
-; SI: S_BREV_B32 [[SRESULT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}s_brev_i32:
+; SI: s_load_dword [[VAL:s[0-9]+]],
+; SI: s_brev_b32 [[SRESULT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
 define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @v_brev_i32:
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_BFREV_B32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}v_brev_i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.AMDGPU.brev(i32 %val) nounwind readnone

diff --git a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
index d608953..c6efdb9 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.clamp.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.clamp.ll

@@ -1,14 +1,15 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
+declare float @llvm.fabs.f32(float) nounwind readnone
 declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
 declare float @llvm.AMDIL.clamp.f32(float, float, float) nounwind readnone
 
-; FUNC-LABEL: @clamp_0_1_f32
-; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_ADD_F32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0, 1, 0
-; SI: BUFFER_STORE_DWORD [[RESULT]]
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}clamp_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
 
 ; EG: MOV_SAT
 define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
@@ -17,10 +18,47 @@
   ret void
 }
 
-; FUNC-LABEL: @clamp_0_1_amdil_legacy_f32
-; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_ADD_F32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0, 1, 0
-; SI: BUFFER_STORE_DWORD [[RESULT]]
+; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, |[[ARG]]| clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone
+  store float %clamp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -[[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %src.fneg = fsub float -0.0, %src
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone
+  store float %clamp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, -|[[ARG]]| clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
+  %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
+  %src.fneg.fabs = fsub float -0.0, %src.fabs
+  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone
+  store float %clamp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}clamp_0_1_amdil_legacy_f32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], 0, [[ARG]] clamp{{$}}
+; SI: buffer_store_dword [[RESULT]]
 define void @clamp_0_1_amdil_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
   %clamp = call float @llvm.AMDIL.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
   store float %clamp, float addrspace(1)* %out, align 4

diff --git a/test/CodeGen/R600/llvm.AMDGPU.cube.ll b/test/CodeGen/R600/llvm.AMDGPU.cube.ll
index 110bbfd..aa07afd 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.cube.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.cube.ll

@@ -1,7 +1,7 @@
 
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @cube
+; CHECK: {{^}}cube:
 ; CHECK: CUBE T{{[0-9]}}.X
 ; CHECK: CUBE T{{[0-9]}}.Y
 ; CHECK: CUBE T{{[0-9]}}.Z

diff --git a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
index 6facb47..7aacbb9 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.cvt_f32_ubyte.ll

@@ -5,8 +5,8 @@
 declare float @llvm.AMDGPU.cvt.f32.ubyte2(i32) nounwind readnone
 declare float @llvm.AMDGPU.cvt.f32.ubyte3(i32) nounwind readnone
 
-; SI-LABEL: @test_unpack_byte0_to_float:
-; SI: V_CVT_F32_UBYTE0
+; SI-LABEL: {{^}}test_unpack_byte0_to_float:
+; SI: v_cvt_f32_ubyte0
 define void @test_unpack_byte0_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte0(i32 %val) nounwind readnone
@@ -14,8 +14,8 @@
   ret void
 }
 
-; SI-LABEL: @test_unpack_byte1_to_float:
-; SI: V_CVT_F32_UBYTE1
+; SI-LABEL: {{^}}test_unpack_byte1_to_float:
+; SI: v_cvt_f32_ubyte1
 define void @test_unpack_byte1_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte1(i32 %val) nounwind readnone
@@ -23,8 +23,8 @@
   ret void
 }
 
-; SI-LABEL: @test_unpack_byte2_to_float:
-; SI: V_CVT_F32_UBYTE2
+; SI-LABEL: {{^}}test_unpack_byte2_to_float:
+; SI: v_cvt_f32_ubyte2
 define void @test_unpack_byte2_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte2(i32 %val) nounwind readnone
@@ -32,8 +32,8 @@
   ret void
 }
 
-; SI-LABEL: @test_unpack_byte3_to_float:
-; SI: V_CVT_F32_UBYTE3
+; SI-LABEL: {{^}}test_unpack_byte3_to_float:
+; SI: v_cvt_f32_ubyte3
 define void @test_unpack_byte3_to_float(float addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %cvt = call float @llvm.AMDGPU.cvt.f32.ubyte3(i32 %val) nounwind readnone

diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
index c8c7357..009fd73 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fixup.ll

@@ -3,23 +3,23 @@
 declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
 declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
 
-; SI-LABEL: @test_div_fixup_f32:
-; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: V_DIV_FIXUP_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}test_div_fixup_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @test_div_fixup_f64:
-; SI: V_DIV_FIXUP_F64
+; SI-LABEL: {{^}}test_div_fixup_f64:
+; SI: v_div_fixup_f64
 define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
   %result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8

diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
index 4f1e827..dcca9e9 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_fmas.ll

@@ -1,27 +1,27 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-declare float @llvm.AMDGPU.div.fmas.f32(float, float, float) nounwind readnone
-declare double @llvm.AMDGPU.div.fmas.f64(double, double, double) nounwind readnone
+declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone
+declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
 
-; SI-LABEL: @test_div_fmas_f32:
-; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
-; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
-; SI: V_DIV_FMAS_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
-define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
-  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c) nounwind readnone
+; SI-LABEL: {{^}}test_div_fmas_f32:
+; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
+; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
+; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
+define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+  %result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @test_div_fmas_f64:
-; SI: V_DIV_FMAS_F64
-define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
-  %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c) nounwind readnone
+; SI-LABEL: {{^}}test_div_fmas_f64:
+; SI: v_div_fmas_f64
+define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
+  %result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }

diff --git a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
index 527c8da..641c8ca 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.div_scale.ll

@@ -1,13 +1,23 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
 declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
 
 ; SI-LABEL @test_div_scale_f32_1:
-; SI: V_DIV_SCALE_F32
-define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
-  %a = load float addrspace(1)* %aptr, align 4
-  %b = load float addrspace(1)* %bptr, align 4
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -15,10 +25,19 @@
 }
 
 ; SI-LABEL @test_div_scale_f32_2:
-; SI: V_DIV_SCALE_F32
-define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
-  %a = load float addrspace(1)* %aptr, align 4
-  %b = load float addrspace(1)* %bptr, align 4
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+
+  %a = load float addrspace(1)* %gep.0, align 4
+  %b = load float addrspace(1)* %gep.1, align 4
+
   %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -26,10 +45,19 @@
 }
 
 ; SI-LABEL @test_div_scale_f64_1:
-; SI: V_DIV_SCALE_F64
-define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
-  %a = load double addrspace(1)* %aptr, align 8
-  %b = load double addrspace(1)* %bptr, align 8
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8
@@ -37,10 +65,221 @@
 }
 
 ; SI-LABEL @test_div_scale_f64_1:
-; SI: V_DIV_SCALE_F64
-define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr, double addrspace(1)* %cptr) nounwind {
-  %a = load double addrspace(1)* %aptr, align 8
-  %b = load double addrspace(1)* %bptr, align 8
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x8
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_1:
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
+; SI-DAG: s_load_dword [[A:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+
+  %b = load float addrspace(1)* %gep, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_2:
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
+; SI-DAG: s_load_dword [[A:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+
+  %b = load float addrspace(1)* %gep, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_1:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
+; SI-DAG: s_load_dword [[B:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+
+  %a = load float addrspace(1)* %gep, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_2:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
+; SI-DAG: s_load_dword [[B:s[0-9]+]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr float addrspace(1)* %in, i32 %tid
+
+  %a = load float addrspace(1)* %gep, align 4
+
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_1:
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+
+  %b = load double addrspace(1)* %gep, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_2:
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+
+  %b = load double addrspace(1)* %gep, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_1:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+
+  %a = load double addrspace(1)* %gep, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_2:
+; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr double addrspace(1)* %in, i32 %tid
+
+  %a = load double addrspace(1)* %gep, align 8
+
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
+; SI: buffer_store_dword [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
+  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
+  %result0 = extractvalue { float, i1 } %result, 0
+  store float %result0, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1:
+; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
+; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]]
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
+  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
+  %result0 = extractvalue { double, i1 } %result, 0
+  store double %result0, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2:
+; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
+; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]]
+; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
+; SI: buffer_store_dwordx2 [[RESULT0]]
+; SI: s_endpgm
+define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
   %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8

diff --git a/test/CodeGen/R600/llvm.AMDGPU.fract.ll b/test/CodeGen/R600/llvm.AMDGPU.fract.ll
index 72ec1c5..235068c 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.fract.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.fract.ll

@@ -6,8 +6,8 @@
 ; Legacy name
 declare float @llvm.AMDIL.fraction.f32(float) nounwind readnone
 
-; FUNC-LABEL: @fract_f32
-; SI: V_FRACT_F32
+; FUNC-LABEL: {{^}}fract_f32:
+; SI: v_fract_f32
 ; EG: FRACT
 define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
   %val = load float addrspace(1)* %src, align 4
@@ -16,8 +16,8 @@
   ret void
 }
 
-; FUNC-LABEL: @fract_f32_legacy_amdil
-; SI: V_FRACT_F32
+; FUNC-LABEL: {{^}}fract_f32_legacy_amdil:
+; SI: v_fract_f32
 ; EG: FRACT
 define void @fract_f32_legacy_amdil(float addrspace(1)* %out, float addrspace(1)* %src) nounwind {
   %val = load float addrspace(1)* %src, align 4

diff --git a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
index 95795ea..8998840 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imad24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imad24.ll

@@ -8,8 +8,8 @@
 
 declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @test_imad24
-; SI: V_MAD_I32_I24
+; FUNC-LABEL: {{^}}test_imad24:
+; SI: v_mad_i32_i24
 ; CM: MULADD_INT24
 ; R600: MULLO_INT
 ; R600: ADD_INT

diff --git a/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
index 01c9f43..dac21a4 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imax.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @vector_imax
-; SI: V_MAX_I32_e32
+; SI-LABEL: {{^}}vector_imax:
+; SI: v_max_i32_e32
 define void @vector_imax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
   %load = load i32 addrspace(1)* %in, align 4
@@ -11,8 +11,8 @@
   ret void
 }
 
-; SI-LABEL: @scalar_imax
-; SI: S_MAX_I32
+; SI-LABEL: {{^}}scalar_imax:
+; SI: s_max_i32
 define void @scalar_imax(i32 %p0, i32 %p1) #0 {
 entry:
   %max = call i32 @llvm.AMDGPU.imax(i32 %p0, i32 %p1)

diff --git a/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
index 565bf34..462c497 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imin.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @vector_imin
-; SI: V_MIN_I32_e32
+; SI-LABEL: {{^}}vector_imin:
+; SI: v_min_i32_e32
 define void @vector_imin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
   %load = load i32 addrspace(1)* %in, align 4
@@ -11,8 +11,8 @@
   ret void
 }
 
-; SI-LABEL: @scalar_imin
-; SI: S_MIN_I32
+; SI-LABEL: {{^}}scalar_imin:
+; SI: s_min_i32
 define void @scalar_imin(i32 %p0, i32 %p1) #0 {
 entry:
   %min = call i32 @llvm.AMDGPU.imin(i32 %p0, i32 %p1)

diff --git a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
index 8ee3520..db563dd 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imul24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imul24.ll

@@ -4,8 +4,8 @@
 
 declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
 
-; FUNC-LABEL: @test_imul24
-; SI: V_MUL_I32_I24
+; FUNC-LABEL: {{^}}test_imul24:
+; SI: v_mul_i32_i24
 ; CM: MUL_INT24
 ; R600: MULLO_INT
 define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {

diff --git a/test/CodeGen/R600/llvm.AMDGPU.kill.ll b/test/CodeGen/R600/llvm.AMDGPU.kill.ll
index 4ab6a8a..988b43c 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.kill.ll

@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @kill_gs_const
-; SI-NOT: V_CMPX_LE_F32
-; SI: S_MOV_B64 exec, 0
+; SI-LABEL: {{^}}kill_gs_const:
+; SI-NOT: v_cmpx_le_f32
+; SI: s_mov_b64 exec, 0
 
 define void @kill_gs_const() #0 {
 main_body:

diff --git a/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll b/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll
new file mode 100644
index 0000000..72719fe
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.ldexp.ll

@@ -0,0 +1,22 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.AMDGPU.ldexp.f32(float, i32) nounwind readnone
+declare double @llvm.AMDGPU.ldexp.f64(double, i32) nounwind readnone
+
+; SI-LABEL: {{^}}test_ldexp_f32:
+; SI: v_ldexp_f32
+; SI: s_endpgm
+define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
+  %result = call float @llvm.AMDGPU.ldexp.f32(float %a, i32 %b) nounwind readnone
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_ldexp_f64:
+; SI: v_ldexp_f64
+; SI: s_endpgm
+define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
+  %result = call double @llvm.AMDGPU.ldexp.f64(double %a, i32 %b) nounwind readnone
+  store double %result, double addrspace(1)* %out, align 8
+  ret void
+}

diff --git a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
index 51964ee..6e3fa25 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.legacy.rsq.ll

@@ -3,8 +3,8 @@
 
 declare float @llvm.AMDGPU.legacy.rsq(float) nounwind readnone
 
-; FUNC-LABEL: @rsq_legacy_f32
-; SI: V_RSQ_LEGACY_F32_e32
+; FUNC-LABEL: {{^}}rsq_legacy_f32:
+; SI: v_rsq_legacy_f32_e32
 ; EG: RECIPSQRT_IEEE
 define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq = call float @llvm.AMDGPU.legacy.rsq(float %src) nounwind readnone

diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll
new file mode 100644
index 0000000..c4b04c5
--- /dev/null
+++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.f64.ll

@@ -0,0 +1,30 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+
+declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+; FUNC-LABEL: {{^}}rcp_f64:
+; SI: v_rcp_f64_e32
+define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
+  %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
+  store double %rcp, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_pat_f64:
+; SI: v_rcp_f64_e32
+define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
+  %rcp = fdiv double 1.0, %src
+  store double %rcp, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_rcp_pat_f64:
+; SI-UNSAFE: v_rsq_f64_e32
+; SI-SAFE-NOT: v_rsq_f64_e32
+define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
+  %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
+  %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
+  store double %rcp, double addrspace(1)* %out, align 8
+  ret void
+}

diff --git a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
index ca5260d..3ee3e6b 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rcp.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rcp.ll

@@ -1,58 +1,47 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=r600 -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s
+
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
 declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
 
-
 declare float @llvm.sqrt.f32(float) nounwind readnone
-declare double @llvm.sqrt.f64(double) nounwind readnone
 
-; FUNC-LABEL: @rcp_f32
-; SI: V_RCP_F32_e32
+; FUNC-LABEL: {{^}}rcp_f32:
+; SI: v_rcp_f32_e32
+; EG: RECIP_IEEE
 define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind {
   %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @rcp_f64
-; SI: V_RCP_F64_e32
-define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
-  %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
-  store double %rcp, double addrspace(1)* %out, align 8
-  ret void
-}
+; FIXME: Evergreen only ever does unsafe fp math.
+; FUNC-LABEL: {{^}}rcp_pat_f32:
 
-; FUNC-LABEL: @rcp_pat_f32
-; SI: V_RCP_F32_e32
+; SI-SAFE: v_rcp_f32_e32
+; XSI-SAFE-SPDENORM-NOT: v_rcp_f32_e32
+
+; EG: RECIP_IEEE
+
 define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @rcp_pat_f64
-; SI: V_RCP_F64_e32
-define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
-  %rcp = fdiv double 1.0, %src
-  store double %rcp, double addrspace(1)* %out, align 8
-  ret void
-}
+; FUNC-LABEL: {{^}}rsq_rcp_pat_f32:
+; SI-UNSAFE: v_rsq_f32_e32
+; SI-SAFE: v_sqrt_f32_e32
+; SI-SAFE: v_rcp_f32_e32
 
-; FUNC-LABEL: @rsq_rcp_pat_f32
-; SI: V_RSQ_F32_e32
+; EG: RECIPSQRT_IEEE
 define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
   %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone
   %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
-
-; FUNC-LABEL: @rsq_rcp_pat_f64
-; SI: V_RSQ_F64_e32
-define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
-  %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
-  %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
-  store double %rcp, double addrspace(1)* %out, align 8
-  ret void
-}

diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
index 100d6ff..18854be 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.f64.ll

@@ -2,8 +2,8 @@
 
 declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
 
-; FUNC-LABEL: @rsq_clamped_f64
-; SI: V_RSQ_CLAMP_F64_e32
+; FUNC-LABEL: {{^}}rsq_clamped_f64:
+; SI: v_rsq_clamp_f64_e32
 define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
   %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
   store double %rsq_clamped, double addrspace(1)* %out, align 8

diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
index 683df73..6bf9f0c 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.clamped.ll

@@ -4,8 +4,8 @@
 
 declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone
 
-; FUNC-LABEL: @rsq_clamped_f32
-; SI: V_RSQ_CLAMP_F32_e32
+; FUNC-LABEL: {{^}}rsq_clamped_f32:
+; SI: v_rsq_clamp_f32_e32
 ; EG: RECIPSQRT_CLAMPED
 define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone

diff --git a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
index 27cf6b2..d6299b8 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.rsq.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.rsq.ll

@@ -3,11 +3,30 @@
 
 declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone
 
-; FUNC-LABEL: @rsq_f32
-; SI: V_RSQ_F32_e32
+; FUNC-LABEL: {{^}}rsq_f32:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
 ; EG: RECIPSQRT_IEEE
 define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
 }
+
+; TODO: Really these should be constant folded
+; FUNC-LABEL: {{^}}rsq_f32_constant_4.0:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
+; EG: RECIPSQRT_IEEE
+define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind {
+  %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone
+  store float %rsq, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rsq_f32_constant_100.0:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
+; EG: RECIPSQRT_IEEE
+define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind {
+  %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone
+  store float %rsq, float addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
index 1c736d4..2e6bd5c 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trig_preop.ll

@@ -2,12 +2,12 @@
 
 declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
 
-; SI-LABEL: @test_trig_preop_f64:
-; SI-DAG: BUFFER_LOAD_DWORD [[SEG:v[0-9]+]]
-; SI-DAG: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
-; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}test_trig_preop_f64:
+; SI-DAG: buffer_load_dword [[SEG:v[0-9]+]]
+; SI-DAG: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]],
+; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load double addrspace(1)* %aptr, align 8
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -16,11 +16,11 @@
   ret void
 }
 
-; SI-LABEL: @test_trig_preop_f64_imm_segment:
-; SI: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
-; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}test_trig_preop_f64_imm_segment:
+; SI: buffer_load_dwordx2 [[SRC:v\[[0-9]+:[0-9]+\]]],
+; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
   %a = load double addrspace(1)* %aptr, align 8
   %result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone

diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
index e6bb2c4..fdd531d 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll

@@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-; R600-CHECK: @amdgpu_trunc
+; R600-CHECK: {{^}}amdgpu_trunc:
 ; R600-CHECK: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: @amdgpu_trunc
-; SI-CHECK: V_TRUNC_F32
+; SI-CHECK: {{^}}amdgpu_trunc:
+; SI-CHECK: v_trunc_f32
 
 define void @amdgpu_trunc(float addrspace(1)* %out, float %x) {
 entry:

diff --git a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
index afdfb18..59d6248 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll

@@ -5,9 +5,10 @@
 ; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-; FUNC-LABEL: @test_umad24
-; SI: V_MAD_U32_U24
+; FUNC-LABEL: {{^}}test_umad24:
+; SI: v_mad_u32_u24
 ; EG: MULADD_UINT24
 ; R600: MULLO_UINT
 ; R600: ADD_INT
@@ -17,3 +18,21 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}commute_umad24:
+; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
+; SI: buffer_store_dword [[RESULT]]
+define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %src0.gep = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %src2.gep = getelementptr i32 addrspace(1)* %src0.gep, i32 1
+  ; src0/src2 are adjacent i32 elements of %in; the constant 4 is the middle umad24 operand.
+  %src0 = load i32 addrspace(1)* %src0.gep, align 4
+  %src2 = load i32 addrspace(1)* %src2.gep, align 4
+  %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
+  store i32 %mad, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
+

diff --git a/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
index 1b8da2e..ee854ec 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umax.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @vector_umax
-; SI: V_MAX_U32_e32
+; SI-LABEL: {{^}}vector_umax:
+; SI: v_max_u32_e32
 define void @vector_umax(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
   %load = load i32 addrspace(1)* %in, align 4
@@ -11,8 +11,8 @@
   ret void
 }
 
-; SI-LABEL: @scalar_umax
-; SI: S_MAX_U32
+; SI-LABEL: {{^}}scalar_umax:
+; SI: s_max_u32
 define void @scalar_umax(i32 %p0, i32 %p1) #0 {
 entry:
   %max = call i32 @llvm.AMDGPU.umax(i32 %p0, i32 %p1)
@@ -21,11 +21,11 @@
   ret void
 }
 
-; SI-LABEL: @trunc_zext_umax
-; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
-; SI: V_MAX_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
-; SI-NOT: AND
-; SI: BUFFER_STORE_SHORT [[RESULT]],
+; SI-LABEL: {{^}}trunc_zext_umax:
+; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
+; SI: v_max_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
+; SI-NOT: and
+; SI: buffer_store_short [[RESULT]],
 define void @trunc_zext_umax(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
   %tmp5 = load i8 addrspace(1)* %src, align 1
   %tmp2 = zext i8 %tmp5 to i32

diff --git a/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
index 08397f8..2eaa372 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umin.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @vector_umin
-; SI: V_MIN_U32_e32
+; SI-LABEL: {{^}}vector_umin:
+; SI: v_min_u32_e32
 define void @vector_umin(i32 %p0, i32 %p1, i32 addrspace(1)* %in) #0 {
 main_body:
   %load = load i32 addrspace(1)* %in, align 4
@@ -11,8 +11,8 @@
   ret void
 }
 
-; SI-LABEL: @scalar_umin
-; SI: S_MIN_U32
+; SI-LABEL: {{^}}scalar_umin:
+; SI: s_min_u32
 define void @scalar_umin(i32 %p0, i32 %p1) #0 {
 entry:
   %min = call i32 @llvm.AMDGPU.umin(i32 %p0, i32 %p1)
@@ -21,11 +21,11 @@
   ret void
 }
 
-; SI-LABEL: @trunc_zext_umin
-; SI: BUFFER_LOAD_UBYTE [[VREG:v[0-9]+]],
-; SI: V_MIN_U32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
-; SI-NOT: AND
-; SI: BUFFER_STORE_SHORT [[RESULT]],
+; SI-LABEL: {{^}}trunc_zext_umin:
+; SI: buffer_load_ubyte [[VREG:v[0-9]+]],
+; SI: v_min_u32_e32 [[RESULT:v[0-9]+]], 0, [[VREG]]
+; SI-NOT: and
+; SI: buffer_store_short [[RESULT]],
 define void @trunc_zext_umin(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
   %tmp5 = load i8 addrspace(1)* %src, align 1
   %tmp2 = zext i8 %tmp5 to i32

diff --git a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
index 72a3602..567ac31 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umul24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umul24.ll

@@ -6,8 +6,8 @@
 
 declare i32 @llvm.AMDGPU.umul24(i32, i32) nounwind readnone
 
-; FUNC-LABEL: @test_umul24
-; SI: V_MUL_U32_U24
+; FUNC-LABEL: {{^}}test_umul24:
+; SI: v_mul_u32_u24
 ; R600: MUL_UINT24
 ; R600: MULLO_UINT
 define void @test_umul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {

diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
index 0438ecc..d26bc32 100644
--- a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
+++ b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll

@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: S_MOV_B32
-;CHECK-NEXT: V_INTERP_MOV_F32
+;CHECK: s_mov_b32
+;CHECK-NEXT: v_interp_mov_f32
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
 main_body:

diff --git a/test/CodeGen/R600/llvm.SI.gather4.ll b/test/CodeGen/R600/llvm.SI.gather4.ll
index 8402faa..91a2012 100644
--- a/test/CodeGen/R600/llvm.SI.gather4.ll
+++ b/test/CodeGen/R600/llvm.SI.gather4.ll

@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-LABEL: @gather4_v2
-;CHECK: IMAGE_GATHER4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_v2:
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_v2() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -13,8 +13,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4
-;CHECK: IMAGE_GATHER4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4:
+;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -26,8 +26,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_cl
-;CHECK: IMAGE_GATHER4_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_cl:
+;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_cl() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -39,8 +39,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_l
-;CHECK: IMAGE_GATHER4_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_l:
+;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_l() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -52,8 +52,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_b
-;CHECK: IMAGE_GATHER4_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b:
+;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -65,8 +65,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_cl
-;CHECK: IMAGE_GATHER4_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_cl:
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_cl() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -78,8 +78,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_cl_v8
-;CHECK: IMAGE_GATHER4_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_cl_v8:
+;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_cl_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -91,8 +91,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_lz_v2
-;CHECK: IMAGE_GATHER4_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_lz_v2:
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_lz_v2() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -104,8 +104,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_lz
-;CHECK: IMAGE_GATHER4_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_lz:
+;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_lz() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -119,8 +119,8 @@
 
 
 
-;CHECK-LABEL: @gather4_o
-;CHECK: IMAGE_GATHER4_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_o:
+;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -132,8 +132,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_cl_o
-;CHECK: IMAGE_GATHER4_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_cl_o:
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_cl_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -145,8 +145,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_cl_o_v8
-;CHECK: IMAGE_GATHER4_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_cl_o_v8:
+;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_cl_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -158,8 +158,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_l_o
-;CHECK: IMAGE_GATHER4_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_l_o:
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_l_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -171,8 +171,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_l_o_v8
-;CHECK: IMAGE_GATHER4_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_l_o_v8:
+;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_l_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -184,8 +184,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_o
-;CHECK: IMAGE_GATHER4_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_o:
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -197,8 +197,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_o_v8
-;CHECK: IMAGE_GATHER4_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_o_v8:
+;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -210,8 +210,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_b_cl_o
-;CHECK: IMAGE_GATHER4_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_b_cl_o:
+;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_b_cl_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -223,8 +223,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_lz_o
-;CHECK: IMAGE_GATHER4_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_lz_o:
+;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_lz_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -238,8 +238,8 @@
 
 
 
-;CHECK-LABEL: @gather4_c
-;CHECK: IMAGE_GATHER4_C {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c:
+;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -251,8 +251,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_cl
-;CHECK: IMAGE_GATHER4_C_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_cl:
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_cl() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -264,8 +264,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_cl_v8
-;CHECK: IMAGE_GATHER4_C_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_cl_v8:
+;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_cl_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -277,8 +277,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_l
-;CHECK: IMAGE_GATHER4_C_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_l:
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_l() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -290,8 +290,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_l_v8
-;CHECK: IMAGE_GATHER4_C_L {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_l_v8:
+;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_l_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -303,8 +303,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b
-;CHECK: IMAGE_GATHER4_C_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b:
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -316,8 +316,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b_v8
-;CHECK: IMAGE_GATHER4_C_B {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b_v8:
+;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -329,8 +329,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b_cl
-;CHECK: IMAGE_GATHER4_C_B_CL {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b_cl:
+;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b_cl() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -342,8 +342,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_lz
-;CHECK: IMAGE_GATHER4_C_LZ {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_lz:
+;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_lz() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -357,8 +357,8 @@
 
 
 
-;CHECK-LABEL: @gather4_c_o
-;CHECK: IMAGE_GATHER4_C_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_o:
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -370,8 +370,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_o_v8
-;CHECK: IMAGE_GATHER4_C_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_o_v8:
+;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -383,8 +383,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_cl_o
-;CHECK: IMAGE_GATHER4_C_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_cl_o:
+;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_cl_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -396,8 +396,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_l_o
-;CHECK: IMAGE_GATHER4_C_L_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_l_o:
+;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_l_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -409,8 +409,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b_o
-;CHECK: IMAGE_GATHER4_C_B_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b_o:
+;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -422,8 +422,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_b_cl_o
-;CHECK: IMAGE_GATHER4_C_B_CL_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_b_cl_o:
+;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_b_cl_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -435,8 +435,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_lz_o
-;CHECK: IMAGE_GATHER4_C_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_lz_o:
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_lz_o() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -448,8 +448,8 @@
   ret void
 }
 
-;CHECK-LABEL: @gather4_c_lz_o_v8
-;CHECK: IMAGE_GATHER4_C_LZ_O {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}gather4_c_lz_o_v8:
+;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 1, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @gather4_c_lz_o_v8() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)

diff --git a/test/CodeGen/R600/llvm.SI.getlod.ll b/test/CodeGen/R600/llvm.SI.getlod.ll
index a7a17ec..ec26fe5 100644
--- a/test/CodeGen/R600/llvm.SI.getlod.ll
+++ b/test/CodeGen/R600/llvm.SI.getlod.ll

@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-LABEL: @getlod
-;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}getlod:
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @getlod() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -11,8 +11,8 @@
   ret void
 }
 
-;CHECK-LABEL: @getlod_v2
-;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}getlod_v2:
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @getlod_v2() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
@@ -22,8 +22,8 @@
   ret void
 }
 
-;CHECK-LABEL: @getlod_v4
-;CHECK: IMAGE_GET_LOD {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+;CHECK-LABEL: {{^}}getlod_v4:
+;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, -1, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 define void @getlod_v4() #0 {
 main_body:
   %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <32 x i8> undef, <16 x i8> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)

diff --git a/test/CodeGen/R600/llvm.SI.image.ll b/test/CodeGen/R600/llvm.SI.image.ll
new file mode 100644
index 0000000..4eec543
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.image.ll

@@ -0,0 +1,49 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}image_load:
+;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @image_load() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}image_load_mip:
+;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @image_load_mip() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}getresinfo:
+;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @getresinfo() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/llvm.SI.image.sample.ll b/test/CodeGen/R600/llvm.SI.image.sample.ll
new file mode 100644
index 0000000..ebff391
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.image.sample.ll

@@ -0,0 +1,289 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}sample:
+;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cl:
+;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d:
+;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d_cl:
+;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_l:
+;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_l() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b:
+;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b_cl:
+;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_lz:
+;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_lz() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd:
+;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd_cl:
+;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c:
+;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cl:
+;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d:
+;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d_cl:
+;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_l:
+;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_l() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b:
+;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b_cl:
+;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_lz:
+;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_lz() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd:
+;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd_cl:
+;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/llvm.SI.image.sample.o.ll b/test/CodeGen/R600/llvm.SI.image.sample.o.ll
new file mode 100644
index 0000000..dbc1b2b
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.image.sample.o.ll

@@ -0,0 +1,289 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}sample:
+;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cl:
+;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d:
+;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_d_cl:
+;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_d_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_l:
+;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_l() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b:
+;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_b_cl:
+;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_b_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_lz:
+;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_lz() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd:
+;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_cd_cl:
+;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_cd_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c:
+;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cl:
+;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d:
+;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_d_cl:
+;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_d_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_l:
+;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_l() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b:
+;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_b_cl:
+;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_b_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_lz:
+;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_lz() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd:
+;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}sample_c_cd_cl:
+;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+define void @sample_c_cd_cl() #0 {
+main_body:
+  %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r0 = extractelement <4 x float> %r, i32 0
+  %r1 = extractelement <4 x float> %r, i32 1
+  %r2 = extractelement <4 x float> %r, i32 2
+  %r3 = extractelement <4 x float> %r, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+
+declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll
index 59e00f0..673d92d 100644
--- a/test/CodeGen/R600/llvm.SI.imageload.ll
+++ b/test/CodeGen/R600/llvm.SI.imageload.ll

@@ -1,15 +1,15 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-DAG: IMAGE_LOAD {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
-;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 2, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 1, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 4, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 8, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
-;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 8, 0, 0, -1
+;CHECK-DAG: image_load {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 2, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 1, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 4, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+;CHECK-DAG: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+;CHECK-DAG: image_load_mip {{v[0-9]+}}, 8, 0, 0, -1
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -84,7 +84,7 @@
 
 ; Test that ccordinates are stored in vgprs and not sgprs
 ; CHECK: vgpr_coords
-; CHECK: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
 define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr float addrspace(2)* addrspace(2)* %0, i32 0

diff --git a/test/CodeGen/R600/llvm.SI.load.dword.ll b/test/CodeGen/R600/llvm.SI.load.dword.ll
index a622775..e5c6201 100644
--- a/test/CodeGen/R600/llvm.SI.load.dword.ll
+++ b/test/CodeGen/R600/llvm.SI.load.dword.ll

@@ -3,11 +3,11 @@
 ; Example of a simple geometry shader loading vertex attributes from the
 ; ESGS ring buffer
 
-; CHECK-LABEL: @main
-; CHECK: BUFFER_LOAD_DWORD
-; CHECK: BUFFER_LOAD_DWORD
-; CHECK: BUFFER_LOAD_DWORD
-; CHECK: BUFFER_LOAD_DWORD
+; CHECK-LABEL: {{^}}main:
+; CHECK: buffer_load_dword
+; CHECK: buffer_load_dword
+; CHECK: buffer_load_dword
+; CHECK: buffer_load_dword
 
 define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, [2 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* inreg, [17 x <16 x i8>] addrspace(2)* inreg, i32, i32, i32, i32) #0 {
 main_body:

diff --git a/test/CodeGen/R600/llvm.SI.resinfo.ll b/test/CodeGen/R600/llvm.SI.resinfo.ll
index af3afc1..d8f3722 100644
--- a/test/CodeGen/R600/llvm.SI.resinfo.ll
+++ b/test/CodeGen/R600/llvm.SI.resinfo.ll

@@ -1,21 +1,21 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
 
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 2, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 1, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 4, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 8, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 8, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 2, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 1, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 4, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0
+; CHECK-DAG: image_get_resinfo {{v[0-9]+}}, 8, 0, 0, -1
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8,
 		  i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) {

diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll b/test/CodeGen/R600/llvm.SI.sample-masked.ll
index 445359a..9e86bec 100644
--- a/test/CodeGen/R600/llvm.SI.sample-masked.ll
+++ b/test/CodeGen/R600/llvm.SI.sample-masked.ll

@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
 
-; CHECK-LABEL: @v1
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 13
+; CHECK-LABEL: {{^}}v1:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 13
 define void @v1(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -13,8 +13,8 @@
   ret void
 }
 
-; CHECK-LABEL: @v2
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 11
+; CHECK-LABEL: {{^}}v2:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 11
 define void @v2(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -26,8 +26,8 @@
   ret void
 }
 
-; CHECK-LABEL: @v3
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
+; CHECK-LABEL: {{^}}v3:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
 define void @v3(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -39,8 +39,8 @@
   ret void
 }
 
-; CHECK-LABEL: @v4
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 7
+; CHECK-LABEL: {{^}}v4:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 7
 define void @v4(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -52,8 +52,8 @@
   ret void
 }
 
-; CHECK-LABEL: @v5
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 10
+; CHECK-LABEL: {{^}}v5:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
 define void @v5(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -64,8 +64,8 @@
   ret void
 }
 
-; CHECK-LABEL: @v6
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 6
+; CHECK-LABEL: {{^}}v6:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 6
 define void @v6(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
@@ -76,8 +76,8 @@
   ret void
 }
 
-; CHECK-LABEL: @v7
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 9
+; CHECK-LABEL: {{^}}v7:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 9
 define void @v7(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0

diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll
index 24e8f64..a1d2c02 100644
--- a/test/CodeGen/R600/llvm.SI.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.sample.ll

@@ -1,21 +1,21 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 3
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 2
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 1
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 4
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 8
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 5
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 9
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 6
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 10
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 12
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 7
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 11
-;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 13
-;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
-;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: image_sample {{v[0-9]+}}, 2
+;CHECK-DAG: image_sample {{v[0-9]+}}, 1
+;CHECK-DAG: image_sample {{v[0-9]+}}, 4
+;CHECK-DAG: image_sample {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: image_sample {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: image_sample {{v[0-9]+}}, 8
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -135,8 +135,8 @@
    ret void
 }
 
-; CHECK: @v1
-; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15
+; CHECK-LABEL: {{^}}v1:
+; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 15
 define void @v1(i32 %a1) #0 {
 entry:
   %0 = insertelement <1 x i32> undef, i32 %a1, i32 0

diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
index 366456f..91b71f3 100644
--- a/test/CodeGen/R600/llvm.SI.sampled.ll
+++ b/test/CodeGen/R600/llvm.SI.sampled.ll

@@ -1,21 +1,21 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 15
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 3
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 2
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 1
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 4
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 8
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 5
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 9
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 6
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 10
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 12
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 7
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 11
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 13
-;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 14
-;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 2
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 1
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 4
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: image_sample_d {{v[0-9]+}}, 8
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) #0 {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0

diff --git a/test/CodeGen/R600/llvm.SI.sendmsg.ll b/test/CodeGen/R600/llvm.SI.sendmsg.ll
index 581d422..042fc5b 100644
--- a/test/CodeGen/R600/llvm.SI.sendmsg.ll
+++ b/test/CodeGen/R600/llvm.SI.sendmsg.ll

@@ -1,10 +1,10 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-; CHECK-LABEL: @main
-; CHECK: S_SENDMSG Gs(emit stream 0)
-; CHECK: S_SENDMSG Gs(cut stream 1)
-; CHECK: S_SENDMSG Gs(emit-cut stream 2)
-; CHECK: S_SENDMSG Gs_done(nop)
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_sendmsg Gs(emit stream 0)
+; CHECK: s_sendmsg Gs(cut stream 1)
+; CHECK: s_sendmsg Gs(emit-cut stream 2)
+; CHECK: s_sendmsg Gs_done(nop)
 
 define void @main() {
 main_body:

diff --git a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
index 740581a..702daea 100644
--- a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
+++ b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll

@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-LABEL: @test1
-;CHECK: TBUFFER_STORE_FORMAT_XYZW {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK-LABEL: {{^}}test1:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
 define void @test1(i32 %a1, i32 %vaddr) #0 {
     %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
     call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -10,8 +10,8 @@
     ret void
 }
 
-;CHECK-LABEL: @test2
-;CHECK: TBUFFER_STORE_FORMAT_XYZ {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK-LABEL: {{^}}test2:
+;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
 define void @test2(i32 %a1, i32 %vaddr) #0 {
     %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
     call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -20,8 +20,8 @@
     ret void
 }
 
-;CHECK-LABEL: @test3
-;CHECK: TBUFFER_STORE_FORMAT_XY {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK-LABEL: {{^}}test3:
+;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
 define void @test3(i32 %a1, i32 %vaddr) #0 {
     %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
     call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
@@ -30,8 +30,8 @@
     ret void
 }
 
-;CHECK-LABEL: @test4
-;CHECK: TBUFFER_STORE_FORMAT_X {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK-LABEL: {{^}}test4:
+;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
 define void @test4(i32 %vdata, i32 %vaddr) #0 {
     call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
         i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,

diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll
index fe17304..ee96124 100644
--- a/test/CodeGen/R600/llvm.SI.tid.ll
+++ b/test/CodeGen/R600/llvm.SI.tid.ll

@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_MBCNT_LO_U32_B32_e64
-;CHECK: V_MBCNT_HI_U32_B32_e32
+;CHECK: v_mbcnt_lo_u32_b32_e64
+;CHECK: v_mbcnt_hi_u32_b32_e32
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
 main_body:

diff --git a/test/CodeGen/R600/llvm.amdgpu.dp4.ll b/test/CodeGen/R600/llvm.amdgpu.dp4.ll
new file mode 100644
index 0000000..812b6a4
--- /dev/null
+++ b/test/CodeGen/R600/llvm.amdgpu.dp4.ll

@@ -0,0 +1,11 @@
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) nounwind readnone
+
+define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
+  %src0 = load <4 x float> addrspace(1)* %a, align 16
+  %src1 = load <4 x float> addrspace(1)* %b, align 16
+  %dp4 = call float @llvm.AMDGPU.dp4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
+  store float %dp4, float addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/llvm.amdgpu.kilp.ll b/test/CodeGen/R600/llvm.amdgpu.kilp.ll
new file mode 100644
index 0000000..08bee38
--- /dev/null
+++ b/test/CodeGen/R600/llvm.amdgpu.kilp.ll

@@ -0,0 +1,20 @@
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}kilp_gs_const:
+; SI: s_mov_b64 exec, 0
+define void @kilp_gs_const() #0 {
+main_body:
+  %0 = icmp ule i32 0, 3
+  %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kilp(float %1)
+  %2 = icmp ule i32 3, 0
+  %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kilp(float %3)
+  ret void
+}
+
+declare void @llvm.AMDGPU.kilp(float)
+
+attributes #0 = { "ShaderType"="2" }
+
+!0 = metadata !{metadata !"const", null, i32 1}

diff --git a/test/CodeGen/R600/llvm.amdgpu.lrp.ll b/test/CodeGen/R600/llvm.amdgpu.lrp.ll
new file mode 100644
index 0000000..ee922fe
--- /dev/null
+++ b/test/CodeGen/R600/llvm.amdgpu.lrp.ll

@@ -0,0 +1,12 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone
+
+; FUNC-LABEL: {{^}}test_lrp:
+; SI: v_sub_f32
+; SI: v_mad_f32
+define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind {
+  %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone
+  store float %mad, float addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
index 9e7a4de..837340f 100644
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll

@@ -7,8 +7,8 @@
 ;EG: ADD *
 ;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ;EG-NOT: COS
-;SI: V_COS_F32
-;SI-NOT: V_COS_F32
+;SI: v_cos_f32
+;SI-NOT: v_cos_f32
 
 define void @test(float addrspace(1)* %out, float %x) #1 {
    %cos = call float @llvm.cos.f32(float %x)
@@ -22,11 +22,11 @@
 ;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ;EG: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ;EG-NOT: COS
-;SI: V_COS_F32
-;SI: V_COS_F32
-;SI: V_COS_F32
-;SI: V_COS_F32
-;SI-NOT: V_COS_F32
+;SI: v_cos_f32
+;SI: v_cos_f32
+;SI: v_cos_f32
+;SI: v_cos_f32
+;SI-NOT: v_cos_f32
 
 define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
    %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx)

diff --git a/test/CodeGen/R600/llvm.exp2.ll b/test/CodeGen/R600/llvm.exp2.ll
index 119d5ef..52dc67d 100644
--- a/test/CodeGen/R600/llvm.exp2.ll
+++ b/test/CodeGen/R600/llvm.exp2.ll

@@ -2,13 +2,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC
 ;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
 
-;FUNC-LABEL: @test
+;FUNC-LABEL: {{^}}test:
 ;EG-CHECK: EXP_IEEE
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_EXP_F32
+;SI-CHECK: v_exp_f32
 
 define void @test(float addrspace(1)* %out, float %in) {
 entry:
@@ -17,7 +17,7 @@
    ret void
 }
 
-;FUNC-LABEL: @testv2
+;FUNC-LABEL: {{^}}testv2:
 ;EG-CHECK: EXP_IEEE
 ;EG-CHECK: EXP_IEEE
 ; FIXME: We should be able to merge these packets together on Cayman so we
@@ -30,8 +30,8 @@
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_EXP_F32
-;SI-CHECK: V_EXP_F32
+;SI-CHECK: v_exp_f32
+;SI-CHECK: v_exp_f32
 
 define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
@@ -40,7 +40,7 @@
   ret void
 }
 
-;FUNC-LABEL: @testv4
+;FUNC-LABEL: {{^}}testv4:
 ;EG-CHECK: EXP_IEEE
 ;EG-CHECK: EXP_IEEE
 ;EG-CHECK: EXP_IEEE
@@ -63,10 +63,10 @@
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_EXP_F32
-;SI-CHECK: V_EXP_F32
-;SI-CHECK: V_EXP_F32
-;SI-CHECK: V_EXP_F32
+;SI-CHECK: v_exp_f32
+;SI-CHECK: v_exp_f32
+;SI-CHECK: v_exp_f32
+;SI-CHECK: v_exp_f32
 define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)

diff --git a/test/CodeGen/R600/llvm.floor.ll b/test/CodeGen/R600/llvm.floor.ll
index f7071cd..0c7a15b 100644
--- a/test/CodeGen/R600/llvm.floor.ll
+++ b/test/CodeGen/R600/llvm.floor.ll

@@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK: @f32
+; R600-CHECK: {{^}}f32:
 ; R600-CHECK: FLOOR
-; SI-CHECK: @f32
-; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: {{^}}f32:
+; SI-CHECK: v_floor_f32_e32
 define void @f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.floor.f32(float %in)
@@ -12,12 +12,12 @@
   ret void
 }
 
-; R600-CHECK: @v2f32
+; R600-CHECK: {{^}}v2f32:
 ; R600-CHECK: FLOOR
 ; R600-CHECK: FLOOR
-; SI-CHECK: @v2f32
-; SI-CHECK: V_FLOOR_F32_e32
-; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: {{^}}v2f32:
+; SI-CHECK: v_floor_f32_e32
+; SI-CHECK: v_floor_f32_e32
 define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.floor.v2f32(<2 x float> %in)
@@ -25,16 +25,16 @@
   ret void
 }
 
-; R600-CHECK: @v4f32
+; R600-CHECK: {{^}}v4f32:
 ; R600-CHECK: FLOOR
 ; R600-CHECK: FLOOR
 ; R600-CHECK: FLOOR
 ; R600-CHECK: FLOOR
-; SI-CHECK: @v4f32
-; SI-CHECK: V_FLOOR_F32_e32
-; SI-CHECK: V_FLOOR_F32_e32
-; SI-CHECK: V_FLOOR_F32_e32
-; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: {{^}}v4f32:
+; SI-CHECK: v_floor_f32_e32
+; SI-CHECK: v_floor_f32_e32
+; SI-CHECK: v_floor_f32_e32
+; SI-CHECK: v_floor_f32_e32
 define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %in)

diff --git a/test/CodeGen/R600/llvm.log2.ll b/test/CodeGen/R600/llvm.log2.ll
index 4cba2d4..0b54a46 100644
--- a/test/CodeGen/R600/llvm.log2.ll
+++ b/test/CodeGen/R600/llvm.log2.ll

@@ -2,13 +2,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK --check-prefix=FUNC
 ;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
 
-;FUNC-LABEL: @test
+;FUNC-LABEL: {{^}}test:
 ;EG-CHECK: LOG_IEEE
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_LOG_F32
+;SI-CHECK: v_log_f32
 
 define void @test(float addrspace(1)* %out, float %in) {
 entry:
@@ -17,7 +17,7 @@
    ret void
 }
 
-;FUNC-LABEL: @testv2
+;FUNC-LABEL: {{^}}testv2:
 ;EG-CHECK: LOG_IEEE
 ;EG-CHECK: LOG_IEEE
 ; FIXME: We should be able to merge these packets together on Cayman so we
@@ -30,8 +30,8 @@
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_LOG_F32
-;SI-CHECK: V_LOG_F32
+;SI-CHECK: v_log_f32
+;SI-CHECK: v_log_f32
 
 define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
@@ -40,7 +40,7 @@
   ret void
 }
 
-;FUNC-LABEL: @testv4
+;FUNC-LABEL: {{^}}testv4:
 ;EG-CHECK: LOG_IEEE
 ;EG-CHECK: LOG_IEEE
 ;EG-CHECK: LOG_IEEE
@@ -63,10 +63,10 @@
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ;CM-CHECK-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
-;SI-CHECK: V_LOG_F32
-;SI-CHECK: V_LOG_F32
-;SI-CHECK: V_LOG_F32
-;SI-CHECK: V_LOG_F32
+;SI-CHECK: v_log_f32
+;SI-CHECK: v_log_f32
+;SI-CHECK: v_log_f32
+;SI-CHECK: v_log_f32
 define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)

diff --git a/test/CodeGen/R600/llvm.memcpy.ll b/test/CodeGen/R600/llvm.memcpy.ll
new file mode 100644
index 0000000..5f2710a
--- /dev/null
+++ b/test/CodeGen/R600/llvm.memcpy.ll

@@ -0,0 +1,364 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
+
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_write_b8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+; SI: ds_read_u8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+; SI: ds_write_b8
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2:
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+; SI: ds_write_b16
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
+  ret void
+}
+
+; FIXME: Use 64-bit ops
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI-DAG: ds_read_b32
+; SI-DAG: ds_write_b32
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
+  %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
+  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align1:
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: buffer_store_byte
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align2:
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+; SI-DAG: buffer_load_ushort
+
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+; SI-DAG: buffer_store_short
+
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align4:
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align8:
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_small_memcpy_i64_global_to_global_align16:
+; SI: buffer_load_dwordx4
+; SI: buffer_load_dwordx4
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
+; SI: s_endpgm
+define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+  %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
+  %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind
+  ret void
+}

diff --git a/test/CodeGen/R600/llvm.rint.f64.ll b/test/CodeGen/R600/llvm.rint.f64.ll
index 3e2884b..72b546e 100644
--- a/test/CodeGen/R600/llvm.rint.f64.ll
+++ b/test/CodeGen/R600/llvm.rint.f64.ll

@@ -1,15 +1,15 @@
 ; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @rint_f64
-; CI: V_RNDNE_F64_e32
+; FUNC-LABEL: {{^}}rint_f64:
+; CI: v_rndne_f64_e32
 
-; SI-DAG: V_ADD_F64
-; SI-DAG: V_ADD_F64
-; SI-DAG V_CMP_GT_F64_e64
-; SI: V_CNDMASK_B32
-; SI: V_CNDMASK_B32
-; SI: S_ENDPGM
+; SI-DAG: v_add_f64
+; SI-DAG: v_add_f64
+; SI-DAG: v_cmp_gt_f64_e64
+; SI: v_cndmask_b32
+; SI: v_cndmask_b32
+; SI: s_endpgm
 define void @rint_f64(double addrspace(1)* %out, double %in) {
 entry:
   %0 = call double @llvm.rint.f64(double %in)
@@ -17,9 +17,9 @@
   ret void
 }
 
-; FUNC-LABEL: @rint_v2f64
-; CI: V_RNDNE_F64_e32
-; CI: V_RNDNE_F64_e32
+; FUNC-LABEL: {{^}}rint_v2f64:
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
 define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
 entry:
   %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in)
@@ -27,11 +27,11 @@
   ret void
 }
 
-; FUNC-LABEL: @rint_v4f64
-; CI: V_RNDNE_F64_e32
-; CI: V_RNDNE_F64_e32
-; CI: V_RNDNE_F64_e32
-; CI: V_RNDNE_F64_e32
+; FUNC-LABEL: {{^}}rint_v4f64:
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
+; CI: v_rndne_f64_e32
 define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
 entry:
   %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in)

diff --git a/test/CodeGen/R600/llvm.rint.ll b/test/CodeGen/R600/llvm.rint.ll
index 209bb43..2e05964 100644
--- a/test/CodeGen/R600/llvm.rint.ll
+++ b/test/CodeGen/R600/llvm.rint.ll

@@ -1,10 +1,10 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @rint_f32
+; FUNC-LABEL: {{^}}rint_f32:
 ; R600: RNDNE
 
-; SI: V_RNDNE_F32_e32
+; SI: v_rndne_f32_e32
 define void @rint_f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.rint.f32(float %in) #0
@@ -12,12 +12,12 @@
   ret void
 }
 
-; FUNC-LABEL: @rint_v2f32
+; FUNC-LABEL: {{^}}rint_v2f32:
 ; R600: RNDNE
 ; R600: RNDNE
 
-; SI: V_RNDNE_F32_e32
-; SI: V_RNDNE_F32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
 define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0
@@ -25,16 +25,16 @@
   ret void
 }
 
-; FUNC-LABEL: @rint_v4f32
+; FUNC-LABEL: {{^}}rint_v4f32:
 ; R600: RNDNE
 ; R600: RNDNE
 ; R600: RNDNE
 ; R600: RNDNE
 
-; SI: V_RNDNE_F32_e32
-; SI: V_RNDNE_F32_e32
-; SI: V_RNDNE_F32_e32
-; SI: V_RNDNE_F32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
+; SI: v_rndne_f32_e32
 define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0
@@ -42,10 +42,10 @@
   ret void
 }
 
-; FUNC-LABEL: @legacy_amdil_round_nearest_f32
+; FUNC-LABEL: {{^}}legacy_amdil_round_nearest_f32:
 ; R600: RNDNE
 
-; SI: V_RNDNE_F32_e32
+; SI: v_rndne_f32_e32
 define void @legacy_amdil_round_nearest_f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.AMDIL.round.nearest.f32(float %in) #0

diff --git a/test/CodeGen/R600/llvm.round.ll b/test/CodeGen/R600/llvm.round.ll
index e06d45d..bedf4ba 100644
--- a/test/CodeGen/R600/llvm.round.ll
+++ b/test/CodeGen/R600/llvm.round.ll

@@ -1,11 +1,13 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
 
-; FUNC-LABEL: @f32
-; R600: FRACT
-; R600-DAG: ADD
-; R600-DAG: CEIL
-; R600-DAG: FLOOR
-; R600: CNDGE
+; FUNC-LABEL: {{^}}f32:
+; R600: FRACT {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
+; R600-DAG: ADD  {{.*}}, -0.5
+; R600-DAG: CEIL {{.*}} [[ARG]]
+; R600-DAG: FLOOR {{.*}} [[ARG]]
+; R600-DAG: CNDGE
+; R600-DAG: CNDGT
+; R600: CNDGE {{[^,]+}}, [[ARG]]
 define void @f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.round.f32(float %in)

diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
index 41c363c..7e45710 100644
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll

@@ -1,35 +1,84 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -check-prefix=EG -check-prefix=FUNC
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=SI-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
 
-;FUNC-LABEL: test
-;EG: MULADD_IEEE *
-;EG: FRACT *
-;EG: ADD *
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG-NOT: SIN
-;SI: V_MUL_F32
-;SI: V_SIN_F32
-;SI-NOT: V_SIN_F32
+; FUNC-LABEL: {{^}}sin_f32:
+; EG: MULADD_IEEE *
+; EG: FRACT *
+; EG: ADD *
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG-NOT: SIN
+; SI: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
 
-define void @test(float addrspace(1)* %out, float %x) #1 {
+define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
    %sin = call float @llvm.sin.f32(float %x)
    store float %sin, float addrspace(1)* %out
    ret void
 }
 
-;FUNC-LABEL: testv
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-;EG-NOT: SIN
-;SI: V_SIN_F32
-;SI: V_SIN_F32
-;SI: V_SIN_F32
-;SI: V_SIN_F32
-;SI-NOT: V_SIN_F32
+; FUNC-LABEL: {{^}}sin_3x_f32:
+; SI-UNSAFE-NOT: v_add_f32
+; SI-UNSAFE: 0x3ef47644
+; SI-UNSAFE: v_mul_f32
+; SI-SAFE: v_mul_f32
+; SI-SAFE: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
+  %y = fmul float 3.0, %x
+  %sin = call float @llvm.sin.f32(float %y)
+  store float %sin, float addrspace(1)* %out
+  ret void
+}
 
-define void @testv(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
+; FUNC-LABEL: {{^}}sin_2x_f32:
+; SI-UNSAFE-NOT: v_add_f32
+; SI-UNSAFE: 0x3ea2f983
+; SI-UNSAFE: v_mul_f32
+; SI-SAFE: v_add_f32
+; SI-SAFE: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+  %y = fmul float 2.0, %x
+  %sin = call float @llvm.sin.f32(float %y)
+  store float %sin, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_2sin_f32:
+; SI-UNSAFE: 0x3ea2f983
+; SI-UNSAFE: v_mul_f32
+; SI-SAFE: v_add_f32
+; SI-SAFE: v_mul_f32
+; SI: v_fract_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+define void @test_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+   %y = fmul float 2.0, %x
+   %sin = call float @llvm.sin.f32(float %y)
+   store float %sin, float addrspace(1)* %out
+   ret void
+}
+
+; FUNC-LABEL: {{^}}sin_v4f32:
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; EG-NOT: SIN
+; SI: v_sin_f32
+; SI: v_sin_f32
+; SI: v_sin_f32
+; SI: v_sin_f32
+; SI-NOT: v_sin_f32
+
+define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
    %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
    store <4 x float> %sin, <4 x float> addrspace(1)* %out
    ret void

diff --git a/test/CodeGen/R600/llvm.sqrt.ll b/test/CodeGen/R600/llvm.sqrt.ll
index 4eee37f..c039225 100644
--- a/test/CodeGen/R600/llvm.sqrt.ll
+++ b/test/CodeGen/R600/llvm.sqrt.ll

@@ -1,11 +1,11 @@
 ; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 --mcpu=SI -verify-machineinstrs| FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK-LABEL: @sqrt_f32
+; R600-CHECK-LABEL: {{^}}sqrt_f32:
 ; R600-CHECK: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
 ; R600-CHECK: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
-; SI-CHECK-LABEL: @sqrt_f32
-; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK-LABEL: {{^}}sqrt_f32:
+; SI-CHECK: v_sqrt_f32_e32
 define void @sqrt_f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.sqrt.f32(float %in)
@@ -13,14 +13,14 @@
   ret void
 }
 
-; R600-CHECK-LABEL: @sqrt_v2f32
+; R600-CHECK-LABEL: {{^}}sqrt_v2f32:
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
-; SI-CHECK-LABEL: @sqrt_v2f32
-; SI-CHECK: V_SQRT_F32_e32
-; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK-LABEL: {{^}}sqrt_v2f32:
+; SI-CHECK: v_sqrt_f32_e32
+; SI-CHECK: v_sqrt_f32_e32
 define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
@@ -28,7 +28,7 @@
   ret void
 }
 
-; R600-CHECK-LABEL: @sqrt_v4f32
+; R600-CHECK-LABEL: {{^}}sqrt_v4f32:
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
@@ -37,11 +37,11 @@
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
 ; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
 ; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
-; SI-CHECK-LABEL: @sqrt_v4f32
-; SI-CHECK: V_SQRT_F32_e32
-; SI-CHECK: V_SQRT_F32_e32
-; SI-CHECK: V_SQRT_F32_e32
-; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK-LABEL: {{^}}sqrt_v4f32:
+; SI-CHECK: v_sqrt_f32_e32
+; SI-CHECK: v_sqrt_f32_e32
+; SI-CHECK: v_sqrt_f32_e32
+; SI-CHECK: v_sqrt_f32_e32
 define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)

diff --git a/test/CodeGen/R600/llvm.trunc.ll b/test/CodeGen/R600/llvm.trunc.ll
index fa6fb99..5585477 100644
--- a/test/CodeGen/R600/llvm.trunc.ll
+++ b/test/CodeGen/R600/llvm.trunc.ll

@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK-LABEL: @trunc_f32
+; CHECK-LABEL: {{^}}trunc_f32:
 ; CHECK: TRUNC
 
 define void @trunc_f32(float addrspace(1)* %out, float %in) {

diff --git a/test/CodeGen/R600/load-i1.ll b/test/CodeGen/R600/load-i1.ll
index 9ba81b8..d85e16f 100644
--- a/test/CodeGen/R600/load-i1.ll
+++ b/test/CodeGen/R600/load-i1.ll

@@ -1,21 +1,21 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 
-; SI-LABEL: @global_copy_i1_to_i1
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_AND_B32_e32 v{{[0-9]+}}, 1
-; SI: BUFFER_STORE_BYTE
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}global_copy_i1_to_i1:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1
+; SI: buffer_store_byte
+; SI: s_endpgm
 define void @global_copy_i1_to_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   store i1 %load, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @global_sextload_i1_to_i32
+; SI-LABEL: {{^}}global_sextload_i1_to_i32:
 ; XSI: BUFFER_LOAD_BYTE
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
@@ -23,10 +23,10 @@
   ret void
 }
 
-; SI-LABEL: @global_zextload_i1_to_i32
-; SI: BUFFER_LOAD_UBYTE
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}global_zextload_i1_to_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
@@ -34,10 +34,10 @@
   ret void
 }
 
-; SI-LABEL: @global_sextload_i1_to_i64
+; SI-LABEL: {{^}}global_sextload_i1_to_i64:
 ; XSI: BUFFER_LOAD_BYTE
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = sext i1 %load to i64
@@ -45,10 +45,10 @@
   ret void
 }
 
-; SI-LABEL: @global_zextload_i1_to_i64
-; SI: BUFFER_LOAD_UBYTE
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}global_zextload_i1_to_i64:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1 addrspace(1)* %in
   %ext = zext i1 %load to i64
@@ -56,50 +56,50 @@
   ret void
 }
 
-; SI-LABEL: @i1_arg
-; SI: BUFFER_LOAD_UBYTE
-; SI: V_AND_B32_e32
-; SI: BUFFER_STORE_BYTE
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}i1_arg:
+; SI: buffer_load_ubyte
+; SI: v_and_b32_e32
+; SI: buffer_store_byte
+; SI: s_endpgm
 define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
   store i1 %x, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @i1_arg_zext_i32
-; SI: BUFFER_LOAD_UBYTE
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}i1_arg_zext_i32:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @i1_arg_zext_i64
-; SI: BUFFER_LOAD_UBYTE
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}i1_arg_zext_i64:
+; SI: buffer_load_ubyte
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @i1_arg_sext_i32
+; SI-LABEL: {{^}}i1_arg_sext_i32:
 ; XSI: BUFFER_LOAD_BYTE
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i32
   store i32 %ext, i32addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @i1_arg_sext_i64
+; SI-LABEL: {{^}}i1_arg_sext_i64:
 ; XSI: BUFFER_LOAD_BYTE
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8

diff --git a/test/CodeGen/R600/load-input-fold.ll b/test/CodeGen/R600/load-input-fold.ll
index ca86d0e..265fa9b 100644
--- a/test/CodeGen/R600/load-input-fold.ll
+++ b/test/CodeGen/R600/load-input-fold.ll

@@ -1,5 +1,4 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman
-;REQUIRES: asserts
 
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
 main_body:

diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index a57df5c..62d3063 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll

@@ -7,10 +7,10 @@
 ;===------------------------------------------------------------------------===;
 
 ; Load an i8 value from the global address space.
-; FUNC-LABEL: @load_i8
+; FUNC-LABEL: {{^}}load_i8:
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
 
-; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
+; SI-CHECK: buffer_load_ubyte v{{[0-9]+}},
 define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %1 = load i8 addrspace(1)* %in
   %2 = zext i8 %1 to i32
@@ -18,13 +18,13 @@
   ret void
 }
 
-; FUNC-LABEL: @load_i8_sext
+; FUNC-LABEL: {{^}}load_i8_sext:
 ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 24
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 24
-; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: buffer_load_sbyte
 define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = load i8 addrspace(1)* %in
@@ -33,11 +33,11 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v2i8
+; FUNC-LABEL: {{^}}load_v2i8:
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(1)* %in
@@ -46,7 +46,7 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v2i8_sext
+; FUNC-LABEL: {{^}}load_v2i8_sext:
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
@@ -57,8 +57,8 @@
 ; R600-CHECK-DAG: 24
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
 ; R600-CHECK-DAG: 24
-; SI-CHECK: BUFFER_LOAD_SBYTE
-; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: buffer_load_sbyte
+; SI-CHECK: buffer_load_sbyte
 define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(1)* %in
@@ -67,15 +67,15 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v4i8
+; FUNC-LABEL: {{^}}load_v4i8:
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
-; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
+; SI-CHECK: buffer_load_ubyte
 define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(1)* %in
@@ -84,7 +84,7 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v4i8_sext
+; FUNC-LABEL: {{^}}load_v4i8_sext:
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
@@ -105,10 +105,10 @@
 ; R600-CHECK-DAG: 24
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
 ; R600-CHECK-DAG: 24
-; SI-CHECK: BUFFER_LOAD_SBYTE
-; SI-CHECK: BUFFER_LOAD_SBYTE
-; SI-CHECK: BUFFER_LOAD_SBYTE
-; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: buffer_load_sbyte
+; SI-CHECK: buffer_load_sbyte
+; SI-CHECK: buffer_load_sbyte
+; SI-CHECK: buffer_load_sbyte
 define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(1)* %in
@@ -118,9 +118,9 @@
 }
 
 ; Load an i16 value from the global address space.
-; FUNC-LABEL: @load_i16
+; FUNC-LABEL: {{^}}load_i16:
 ; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
 define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
   %0 = load i16	 addrspace(1)* %in
@@ -129,13 +129,13 @@
   ret void
 }
 
-; FUNC-LABEL: @load_i16_sext
+; FUNC-LABEL: {{^}}load_i16_sext:
 ; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 16
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 16
-; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: buffer_load_sshort
 define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
   %0 = load i16 addrspace(1)* %in
@@ -144,11 +144,11 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v2i16
+; FUNC-LABEL: {{^}}load_v2i16:
 ; R600-CHECK: VTX_READ_16
 ; R600-CHECK: VTX_READ_16
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(1)* %in
@@ -157,7 +157,7 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v2i16_sext
+; FUNC-LABEL: {{^}}load_v2i16_sext:
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
@@ -168,8 +168,8 @@
 ; R600-CHECK-DAG: 16
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
 ; R600-CHECK-DAG: 16
-; SI-CHECK: BUFFER_LOAD_SSHORT
-; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: buffer_load_sshort
+; SI-CHECK: buffer_load_sshort
 define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(1)* %in
@@ -178,15 +178,15 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v4i16
+; FUNC-LABEL: {{^}}load_v4i16:
 ; R600-CHECK: VTX_READ_16
 ; R600-CHECK: VTX_READ_16
 ; R600-CHECK: VTX_READ_16
 ; R600-CHECK: VTX_READ_16
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
+; SI-CHECK: buffer_load_ushort
 define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(1)* %in
@@ -195,7 +195,7 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v4i16_sext
+; FUNC-LABEL: {{^}}load_v4i16_sext:
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
@@ -216,10 +216,10 @@
 ; R600-CHECK-DAG: 16
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
 ; R600-CHECK-DAG: 16
-; SI-CHECK: BUFFER_LOAD_SSHORT
-; SI-CHECK: BUFFER_LOAD_SSHORT
-; SI-CHECK: BUFFER_LOAD_SSHORT
-; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: buffer_load_sshort
+; SI-CHECK: buffer_load_sshort
+; SI-CHECK: buffer_load_sshort
+; SI-CHECK: buffer_load_sshort
 define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(1)* %in
@@ -229,10 +229,10 @@
 }
 
 ; load an i32 value from the global address space.
-; FUNC-LABEL: @load_i32
+; FUNC-LABEL: {{^}}load_i32:
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
+; SI-CHECK: buffer_load_dword v{{[0-9]+}}
 define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32 addrspace(1)* %in
@@ -241,10 +241,10 @@
 }
 
 ; load a f32 value from the global address space.
-; FUNC-LABEL: @load_f32
+; FUNC-LABEL: {{^}}load_f32:
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
+; SI-CHECK: buffer_load_dword v{{[0-9]+}}
 define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float addrspace(1)* %in
@@ -253,10 +253,10 @@
 }
 
 ; load a v2f32 value from the global address space
-; FUNC-LABEL: @load_v2f32
+; FUNC-LABEL: {{^}}load_v2f32:
+; R600-CHECK: MEM_RAT
 ; R600-CHECK: VTX_READ_64
-
-; SI-CHECK: BUFFER_LOAD_DWORDX2
+; SI-CHECK: buffer_load_dwordx2
 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 entry:
   %0 = load <2 x float> addrspace(1)* %in
@@ -264,11 +264,9 @@
   ret void
 }
 
-; FUNC-LABEL: @load_i64
-; R600-CHECK: MEM_RAT
-; R600-CHECK: MEM_RAT
-
-; SI-CHECK: BUFFER_LOAD_DWORDX2
+; FUNC-LABEL: {{^}}load_i64:
+; R600-CHECK: VTX_READ_64
+; SI-CHECK: buffer_load_dwordx2
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64 addrspace(1)* %in
@@ -276,12 +274,12 @@
   ret void
 }
 
-; FUNC-LABEL: @load_i64_sext
+; FUNC-LABEL: {{^}}load_i64_sext:
 ; R600-CHECK: MEM_RAT
 ; R600-CHECK: MEM_RAT
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.x
 ; R600-CHECK: 31
-; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: buffer_load_dword
 
 define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
@@ -291,7 +289,7 @@
   ret void
 }
 
-; FUNC-LABEL: @load_i64_zext
+; FUNC-LABEL: {{^}}load_i64_zext:
 ; R600-CHECK: MEM_RAT
 ; R600-CHECK: MEM_RAT
 define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -302,18 +300,18 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v8i32
+; FUNC-LABEL: {{^}}load_v8i32:
 ; R600-CHECK: VTX_READ_128
 ; R600-CHECK: VTX_READ_128
 ; XXX: We should be using DWORDX4 instructions on SI.
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
 define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
 entry:
   %0 = load <8 x i32> addrspace(1)* %in
@@ -321,28 +319,28 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v16i32
+; FUNC-LABEL: {{^}}load_v16i32:
 ; R600-CHECK: VTX_READ_128
 ; R600-CHECK: VTX_READ_128
 ; R600-CHECK: VTX_READ_128
 ; R600-CHECK: VTX_READ_128
 ; XXX: We should be using DWORDX4 instructions on SI.
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
-; SI-CHECK: BUFFER_LOAD_DWORD
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
+; SI-CHECK: buffer_load_dword
 define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
 entry:
   %0 = load <16 x i32> addrspace(1)* %in
@@ -355,13 +353,13 @@
 ;===------------------------------------------------------------------------===;
 
 ; Load a sign-extended i8 value
-; FUNC-LABEL: @load_const_i8_sext
+; FUNC-LABEL: {{^}}load_const_i8_sext:
 ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 24
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 24
-; SI-CHECK: BUFFER_LOAD_SBYTE v{{[0-9]+}},
+; SI-CHECK: buffer_load_sbyte v{{[0-9]+}},
 define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = load i8 addrspace(2)* %in
@@ -371,9 +369,9 @@
 }
 
 ; Load an aligned i8 value
-; FUNC-LABEL: @load_const_i8_aligned
+; FUNC-LABEL: {{^}}load_const_i8_aligned:
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
+; SI-CHECK: buffer_load_ubyte v{{[0-9]+}},
 define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = load i8 addrspace(2)* %in
@@ -383,9 +381,9 @@
 }
 
 ; Load an un-aligned i8 value
-; FUNC-LABEL: @load_const_i8_unaligned
+; FUNC-LABEL: {{^}}load_const_i8_unaligned:
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
+; SI-CHECK: buffer_load_ubyte v{{[0-9]+}},
 define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = getelementptr i8 addrspace(2)* %in, i32 1
@@ -396,13 +394,13 @@
 }
 
 ; Load a sign-extended i16 value
-; FUNC-LABEL: @load_const_i16_sext
+; FUNC-LABEL: {{^}}load_const_i16_sext:
 ; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 16
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 16
-; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: buffer_load_sshort
 define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %0 = load i16 addrspace(2)* %in
@@ -412,9 +410,9 @@
 }
 
 ; Load an aligned i16 value
-; FUNC-LABEL: @load_const_i16_aligned
+; FUNC-LABEL: {{^}}load_const_i16_aligned:
 ; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
 define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %0 = load i16 addrspace(2)* %in
@@ -424,9 +422,9 @@
 }
 
 ; Load an un-aligned i16 value
-; FUNC-LABEL: @load_const_i16_unaligned
+; FUNC-LABEL: {{^}}load_const_i16_unaligned:
 ; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: buffer_load_ushort
 define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %0 = getelementptr i16 addrspace(2)* %in, i32 1
@@ -437,10 +435,10 @@
 }
 
 ; Load an i32 value from the constant address space.
-; FUNC-LABEL: @load_const_addrspace_i32
+; FUNC-LABEL: {{^}}load_const_addrspace_i32:
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
+; SI-CHECK: s_load_dword s{{[0-9]+}}
 define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %0 = load i32 addrspace(2)* %in
@@ -449,10 +447,10 @@
 }
 
 ; Load a f32 value from the constant address space.
-; FUNC-LABEL: @load_const_addrspace_f32
+; FUNC-LABEL: {{^}}load_const_addrspace_f32:
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
+; SI-CHECK: s_load_dword s{{[0-9]+}}
 define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
   %1 = load float addrspace(2)* %in
   store float %1, float addrspace(1)* %out
@@ -464,11 +462,11 @@
 ;===------------------------------------------------------------------------===;
 
 ; Load an i8 value from the local address space.
-; FUNC-LABEL: @load_i8_local
+; FUNC-LABEL: {{^}}load_i8_local:
 ; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u8
 define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
   %1 = load i8 addrspace(3)* %in
   %2 = zext i8 %1 to i32
@@ -476,12 +474,12 @@
   ret void
 }
 
-; FUNC-LABEL: @load_i8_sext_local
+; FUNC-LABEL: {{^}}load_i8_sext_local:
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i8
 define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
 entry:
   %0 = load i8 addrspace(3)* %in
@@ -490,13 +488,13 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v2i8_local
+; FUNC-LABEL: {{^}}load_v2i8_local:
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U8
-; SI-CHECK: DS_READ_U8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u8
+; SI-CHECK: ds_read_u8
 define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(3)* %in
@@ -505,15 +503,15 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v2i8_sext_local
+; FUNC-LABEL: {{^}}load_v2i8_sext_local:
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I8
-; SI-CHECK: DS_READ_I8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i8
+; SI-CHECK: ds_read_i8
 define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i8> addrspace(3)* %in
@@ -522,17 +520,17 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v4i8_local
+; FUNC-LABEL: {{^}}load_v4i8_local:
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: LDS_UBYTE_READ_RET
 ; R600-CHECK: LDS_UBYTE_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U8
-; SI-CHECK: DS_READ_U8
-; SI-CHECK: DS_READ_U8
-; SI-CHECK: DS_READ_U8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u8
+; SI-CHECK: ds_read_u8
+; SI-CHECK: ds_read_u8
+; SI-CHECK: ds_read_u8
 define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(3)* %in
@@ -541,7 +539,7 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v4i8_sext_local
+; FUNC-LABEL: {{^}}load_v4i8_sext_local:
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
 ; R600-CHECK-DAG: LDS_UBYTE_READ_RET
@@ -550,12 +548,12 @@
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I8
-; SI-CHECK: DS_READ_I8
-; SI-CHECK: DS_READ_I8
-; SI-CHECK: DS_READ_I8
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i8
+; SI-CHECK: ds_read_i8
+; SI-CHECK: ds_read_i8
+; SI-CHECK: ds_read_i8
 define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i8> addrspace(3)* %in
@@ -565,11 +563,11 @@
 }
 
 ; Load an i16 value from the local address space.
-; FUNC-LABEL: @load_i16_local
+; FUNC-LABEL: {{^}}load_i16_local:
 ; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u16
 define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
 entry:
   %0 = load i16	 addrspace(3)* %in
@@ -578,12 +576,12 @@
   ret void
 }
 
-; FUNC-LABEL: @load_i16_sext_local
+; FUNC-LABEL: {{^}}load_i16_sext_local:
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i16
 define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
 entry:
   %0 = load i16 addrspace(3)* %in
@@ -592,13 +590,13 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v2i16_local
+; FUNC-LABEL: {{^}}load_v2i16_local:
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U16
-; SI-CHECK: DS_READ_U16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u16
+; SI-CHECK: ds_read_u16
 define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(3)* %in
@@ -607,15 +605,15 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v2i16_sext_local
+; FUNC-LABEL: {{^}}load_v2i16_sext_local:
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I16
-; SI-CHECK: DS_READ_I16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i16
+; SI-CHECK: ds_read_i16
 define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <2 x i16> addrspace(3)* %in
@@ -624,17 +622,17 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v4i16_local
+; FUNC-LABEL: {{^}}load_v4i16_local:
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: LDS_USHORT_READ_RET
 ; R600-CHECK: LDS_USHORT_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_U16
-; SI-CHECK: DS_READ_U16
-; SI-CHECK: DS_READ_U16
-; SI-CHECK: DS_READ_U16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_u16
+; SI-CHECK: ds_read_u16
+; SI-CHECK: ds_read_u16
+; SI-CHECK: ds_read_u16
 define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(3)* %in
@@ -643,7 +641,7 @@
   ret void
 }
 
-; FUNC-LABEL: @load_v4i16_sext_local
+; FUNC-LABEL: {{^}}load_v4i16_sext_local:
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
 ; R600-CHECK-DAG: LDS_USHORT_READ_RET
@@ -652,12 +650,12 @@
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
 ; R600-CHECK-DAG: ASHR
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_I16
-; SI-CHECK: DS_READ_I16
-; SI-CHECK: DS_READ_I16
-; SI-CHECK: DS_READ_I16
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_i16
+; SI-CHECK: ds_read_i16
+; SI-CHECK: ds_read_i16
+; SI-CHECK: ds_read_i16
 define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
   %0 = load <4 x i16> addrspace(3)* %in
@@ -667,11 +665,11 @@
 }
 
 ; load an i32 value from the local address space.
-; FUNC-LABEL: @load_i32_local
+; FUNC-LABEL: {{^}}load_i32_local:
 ; R600-CHECK: LDS_READ_RET
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_B32
+; SI-CHECK-NOT: s_wqm_b64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_b32
 define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = load i32 addrspace(3)* %in
@@ -680,10 +678,10 @@
 }
 
 ; load a f32 value from the local address space.
-; FUNC-LABEL: @load_f32_local
+; FUNC-LABEL: {{^}}load_f32_local:
 ; R600-CHECK: LDS_READ_RET
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_B32
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_b32
 define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
 entry:
   %0 = load float addrspace(3)* %in
@@ -692,14 +690,50 @@
 }
 
 ; load a v2f32 value from the local address space
-; FUNC-LABEL: @load_v2f32_local
+; FUNC-LABEL: {{^}}load_v2f32_local:
 ; R600-CHECK: LDS_READ_RET
 ; R600-CHECK: LDS_READ_RET
-; SI-CHECK: S_MOV_B32 m0
-; SI-CHECK: DS_READ_B64
+; SI-CHECK: s_mov_b32 m0
+; SI-CHECK: ds_read_b64
 define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
 entry:
   %0 = load <2 x float> addrspace(3)* %in
   store <2 x float> %0, <2 x float> addrspace(1)* %out
   ret void
 }
+
+; Test loading a i32 and v2i32 value from the same base pointer.
+; FUNC-LABEL: {{^}}load_i32_v2i32_local:
+; R600-CHECK: LDS_READ_RET
+; R600-CHECK: LDS_READ_RET
+; R600-CHECK: LDS_READ_RET
+; SI-CHECK-DAG: ds_read_b32
+; SI-CHECK-DAG: ds_read2_b32
+define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
+  %scalar = load i32 addrspace(3)* %in
+  %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
+  %vec_ptr = getelementptr <2 x i32> addrspace(3)* %tmp0, i32 2
+  %vec0 = load <2 x i32> addrspace(3)* %vec_ptr, align 4
+  %vec1 = insertelement <2 x i32> <i32 0, i32 0>, i32 %scalar, i32 0
+  %vec = add <2 x i32> %vec0, %vec1
+  store <2 x i32> %vec, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+
+@lds = addrspace(3) global [512 x i32] undef, align 4
+
+; On SI we need to make sure that the base offset is a register and not
+; an immediate.
+; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
+; SI-CHECK: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
+; SI-CHECK: ds_read_b32 v0, v[[ZERO]] offset:4
+; R600-CHECK: LDS_READ_RET
+define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+  %tmp0 = getelementptr [512 x i32] addrspace(3)* @lds, i32 0, i32 1
+  %tmp1 = load i32 addrspace(3)* %tmp0
+  %tmp2 = getelementptr i32 addrspace(1)* %out, i32 1
+  store i32 %tmp1, i32 addrspace(1)* %tmp2
+  ret void
+}

diff --git a/test/CodeGen/R600/load.vec.ll b/test/CodeGen/R600/load.vec.ll
index 81a6310..0d6e213 100644
--- a/test/CodeGen/R600/load.vec.ll
+++ b/test/CodeGen/R600/load.vec.ll

@@ -2,10 +2,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK  %s
 
 ; load a v2i32 value from the global address space.
-; EG-CHECK: @load_v2i32
+; EG-CHECK: {{^}}load_v2i32:
 ; EG-CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
-; SI-CHECK: @load_v2i32
-; SI-CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; SI-CHECK: {{^}}load_v2i32:
+; SI-CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %a = load <2 x i32> addrspace(1) * %in
   store <2 x i32> %a, <2 x i32> addrspace(1)* %out
@@ -13,10 +13,10 @@
 }
 
 ; load a v4i32 value from the global address space.
-; EG-CHECK: @load_v4i32
+; EG-CHECK: {{^}}load_v4i32:
 ; EG-CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
-; SI-CHECK: @load_v4i32
-; SI-CHECK: BUFFER_LOAD_DWORDX4 v[{{[0-9]+:[0-9]+}}]
+; SI-CHECK: {{^}}load_v4i32:
+; SI-CHECK: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}]
 define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %a = load <4 x i32> addrspace(1) * %in
   store <4 x i32> %a, <4 x i32> addrspace(1)* %out

diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll
index a117557..a60c4eb 100644
--- a/test/CodeGen/R600/load64.ll
+++ b/test/CodeGen/R600/load64.ll

@@ -1,18 +1,18 @@
 ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; load a f64 value from the global address space.
-; CHECK-LABEL: @load_f64:
-; CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
-; CHECK: BUFFER_STORE_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; CHECK-LABEL: {{^}}load_f64:
+; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
   %1 = load double addrspace(1)* %in
   store double %1, double addrspace(1)* %out
   ret void
 }
 
-; CHECK-LABEL: @load_i64:
-; CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
-; CHECK: BUFFER_STORE_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; CHECK-LABEL: {{^}}load_i64:
+; CHECK: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tmp = load i64 addrspace(1)* %in
   store i64 %tmp, i64 addrspace(1)* %out, align 8
@@ -20,9 +20,9 @@
 }
 
 ; Load a f64 value from the constant address space.
-; CHECK-LABEL: @load_const_addrspace_f64:
-; CHECK: S_LOAD_DWORDX2 s[{{[0-9]+:[0-9]+}}]
-; CHECK: BUFFER_STORE_DWORDX2 v[{{[0-9]+:[0-9]+}}]
+; CHECK-LABEL: {{^}}load_const_addrspace_f64:
+; CHECK: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}]
 define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
   %1 = load double addrspace(2)* %in
   store double %1, double addrspace(1)* %out

diff --git a/test/CodeGen/R600/local-64.ll b/test/CodeGen/R600/local-64.ll
index c52b41b..eb14b5f 100644
--- a/test/CodeGen/R600/local-64.ll
+++ b/test/CodeGen/R600/local-64.ll

@@ -1,8 +1,9 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
 
-; SI-LABEL: @local_i32_load
-; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 0x1c, [M0]
-; SI: BUFFER_STORE_DWORD [[REG]],
+; BOTH-LABEL: {{^}}local_i32_load
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 [M0]
+; BOTH: buffer_store_dword [[REG]],
 define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %gep = getelementptr i32 addrspace(3)* %in, i32 7
   %val = load i32 addrspace(3)* %gep, align 4
@@ -10,19 +11,19 @@
   ret void
 }
 
-; SI-LABEL: @local_i32_load_0_offset
-; SI: DS_READ_B32 [[REG:v[0-9]+]], v{{[0-9]+}}, 0x0, [M0]
-; SI: BUFFER_STORE_DWORD [[REG]],
+; BOTH-LABEL: {{^}}local_i32_load_0_offset
+; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} [M0]
+; BOTH: buffer_store_dword [[REG]],
 define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %val = load i32 addrspace(3)* %in, align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: @local_i8_load_i16_max_offset
-; SI-NOT: ADD
-; SI: DS_READ_U8 [[REG:v[0-9]+]], {{v[0-9]+}}, 0xffff, [M0]
-; SI: BUFFER_STORE_BYTE [[REG]],
+; BOTH-LABEL: {{^}}local_i8_load_i16_max_offset:
+; BOTH-NOT: ADD
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 [M0]
+; BOTH: buffer_store_byte [[REG]],
 define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8 addrspace(3)* %in, i32 65535
   %val = load i8 addrspace(3)* %gep, align 4
@@ -30,11 +31,14 @@
   ret void
 }
 
-; SI-LABEL: @local_i8_load_over_i16_max_offset
-; SI: S_ADD_I32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
-; SI: V_MOV_B32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
-; SI: DS_READ_U8 [[REG:v[0-9]+]], [[VREGADDR]], 0x0, [M0]
-; SI: BUFFER_STORE_BYTE [[REG]],
+; BOTH-LABEL: {{^}}local_i8_load_over_i16_max_offset:
+; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
+; SI, which is why it is being OR'd with the base pointer.
+; SI: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; CI: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
+; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] [M0]
+; BOTH: buffer_store_byte [[REG]],
 define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8 addrspace(3)* %in, i32 65536
   %val = load i8 addrspace(3)* %gep, align 4
@@ -42,10 +46,10 @@
   ret void
 }
 
-; SI-LABEL: @local_i64_load
-; SI-NOT: ADD
-; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 0x38, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[REG]],
+; BOTH-LABEL: {{^}}local_i64_load:
+; BOTH-NOT: ADD
+; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 [M0]
+; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %gep = getelementptr i64 addrspace(3)* %in, i32 7
   %val = load i64 addrspace(3)* %gep, align 8
@@ -53,19 +57,19 @@
   ret void
 }
 
-; SI-LABEL: @local_i64_load_0_offset
-; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0x0, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[REG]],
+; BOTH-LABEL: {{^}}local_i64_load_0_offset
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} [M0]
+; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %val = load i64 addrspace(3)* %in, align 8
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @local_f64_load
-; SI-NOT: ADD
-; SI: DS_READ_B64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}}, 0x38, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[REG]],
+; BOTH-LABEL: {{^}}local_f64_load:
+; BOTH-NOT: ADD
+; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 [M0]
+; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %gep = getelementptr double addrspace(3)* %in, i32 7
   %val = load double addrspace(3)* %gep, align 8
@@ -73,85 +77,89 @@
   ret void
 }
 
-; SI-LABEL: @local_f64_load_0_offset
-; SI: DS_READ_B64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, 0x0, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[REG]],
+; BOTH-LABEL: {{^}}local_f64_load_0_offset
+; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} [M0]
+; BOTH: buffer_store_dwordx2 [[REG]],
 define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %val = load double addrspace(3)* %in, align 8
   store double %val, double addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @local_i64_store
-; SI-NOT: ADD
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x38 [M0]
+; BOTH-LABEL: {{^}}local_i64_store:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 [M0]
 define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
   %gep = getelementptr i64 addrspace(3)* %out, i32 7
   store i64 5678, i64 addrspace(3)* %gep, align 8
   ret void
 }
 
-; SI-LABEL: @local_i64_store_0_offset
-; SI-NOT: ADD
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
+; BOTH-LABEL: {{^}}local_i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
 define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
   store i64 1234, i64 addrspace(3)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @local_f64_store
-; SI-NOT: ADD
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x38 [M0]
+; BOTH-LABEL: {{^}}local_f64_store:
+; BOTH-NOT: ADD
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 [M0]
 define void @local_f64_store(double addrspace(3)* %out) nounwind {
   %gep = getelementptr double addrspace(3)* %out, i32 7
   store double 16.0, double addrspace(3)* %gep, align 8
   ret void
 }
 
-; SI-LABEL: @local_f64_store_0_offset
-; SI: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
+; BOTH-LABEL: {{^}}local_f64_store_0_offset
+; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
 define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
   store double 20.0, double addrspace(3)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @local_v2i64_store
-; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x78 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x70 [M0]
+; BOTH-LABEL: {{^}}local_v2i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120 [M0]
+; BOTH: s_endpgm
 define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <2 x i64> addrspace(3)* %out, i32 7
   store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
   ret void
 }
 
-; SI-LABEL: @local_v2i64_store_0_offset
-; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
+; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 [M0]
+; BOTH: s_endpgm
 define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
   store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
   ret void
 }
 
-; SI-LABEL: @local_v4i64_store
-; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xf8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xf0 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xe8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0xe0 [M0]
+; BOTH-LABEL: {{^}}local_v4i64_store:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248 [M0]
+; BOTH: s_endpgm
 define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <4 x i64> addrspace(3)* %out, i32 7
   store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
   ret void
 }
 
-; SI-LABEL: @local_v4i64_store_0_offset
-; SI-NOT: ADD
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x18 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x10 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x8 [M0]
-; SI-DAG: DS_WRITE_B64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, 0x0 [M0]
+; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
+; BOTH-NOT: ADD
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16 [M0]
+; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24 [M0]
+; BOTH: s_endpgm
 define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
   store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
   ret void

diff --git a/test/CodeGen/R600/local-atomics.ll b/test/CodeGen/R600/local-atomics.ll
index 5a44951..2ac811f 100644
--- a/test/CodeGen/R600/local-atomics.ll
+++ b/test/CodeGen/R600/local-atomics.ll

@@ -1,21 +1,25 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-; FUNC-LABEL: @lds_atomic_xchg_ret_i32:
-; SI: S_LOAD_DWORD [[SPTR:s[0-9]+]],
-; SI: V_MOV_B32_e32 [[DATA:v[0-9]+]], 4
-; SI: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: DS_WRXCHG_RTN_B32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]], 0x0, [M0]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
+; EG: LDS_WRXCHG_RET *
+; SI: s_load_dword [[SPTR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xchg_ret_i32_offset:
-; SI: DS_WRXCHG_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32_offset:
+; EG: LDS_WRXCHG_RET *
+; SI: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -24,22 +28,24 @@
 }
 
 ; XXX - Is it really necessary to load 4 into VGPR?
-; FUNC-LABEL: @lds_atomic_add_ret_i32:
-; SI: S_LOAD_DWORD [[SPTR:s[0-9]+]],
-; SI: V_MOV_B32_e32 [[DATA:v[0-9]+]], 4
-; SI: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
-; SI: DS_ADD_RTN_U32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]], 0x0, [M0]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
+; EG: LDS_ADD_RET *
+; SI: s_load_dword [[SPTR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_add_ret_i32_offset:
-; SI: DS_ADD_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_offset:
+; EG: LDS_ADD_RET *
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -47,22 +53,38 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_inc_ret_i32:
-; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
-; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: DS_INC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x0
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32_bad_si_offset:
+; EG: LDS_ADD_RET *
+; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} [M0]
+; CI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
+; EG: LDS_ADD_RET *
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
+; SI: s_endpgm
 define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_inc_ret_i32_offset:
-; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
-; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: DS_INC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
+; EG: LDS_ADD_RET *
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; SI: s_endpgm
 define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
@@ -70,18 +92,34 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_sub_ret_i32:
-; SI: DS_SUB_RTN_U32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_bad_si_offset:
+; EG: LDS_ADD_RET *
+; SI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} [M0]
+; CI: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_inc_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32:
+; EG: LDS_SUB_RET *
+; SI: ds_sub_rtn_u32
+; SI: s_endpgm
 define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_sub_ret_i32_offset:
-; SI: DS_SUB_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i32_offset:
+; EG: LDS_SUB_RET *
+; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -89,22 +127,24 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_dec_ret_i32:
-; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
-; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: DS_DEC_RTN_U32  v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x0
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
+; EG: LDS_SUB_RET *
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_dec_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] [M0]
+; SI: s_endpgm
 define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_dec_ret_i32_offset:
-; SI: S_MOV_B32 [[SNEGONE:s[0-9]+]], -1
-; SI: V_MOV_B32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
-; SI: DS_DEC_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]], 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
+; EG: LDS_SUB_RET *
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[NEGONE]] offset:16
+; SI: s_endpgm
 define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
@@ -112,18 +152,20 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_and_ret_i32:
-; SI: DS_AND_RTN_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32:
+; EG: LDS_AND_RET *
+; SI: ds_and_rtn_b32
+; SI: s_endpgm
 define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_and_ret_i32_offset:
-; SI: DS_AND_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i32_offset:
+; EG: LDS_AND_RET *
+; SI: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -131,18 +173,20 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_or_ret_i32:
-; SI: DS_OR_RTN_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32:
+; EG: LDS_OR_RET *
+; SI: ds_or_rtn_b32
+; SI: s_endpgm
 define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_or_ret_i32_offset:
-; SI: DS_OR_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i32_offset:
+; EG: LDS_OR_RET *
+; SI: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -150,18 +194,20 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xor_ret_i32:
-; SI: DS_XOR_RTN_B32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32:
+; EG: LDS_XOR_RET *
+; SI: ds_xor_rtn_b32
+; SI: s_endpgm
 define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xor_ret_i32_offset:
-; SI: DS_XOR_RTN_B32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i32_offset:
+; EG: LDS_XOR_RET *
+; SI: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -170,25 +216,27 @@
 }
 
 ; FIXME: There is no atomic nand instr
-; XFUNC-LABEL: @lds_atomic_nand_ret_i32:uction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:uction, so we somehow need to expand this.
 ; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
 ;   store i32 %result, i32 addrspace(1)* %out, align 4
 ;   ret void
 ; }
 
-; FUNC-LABEL: @lds_atomic_min_ret_i32:
-; SI: DS_MIN_RTN_I32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32:
+; EG: LDS_MIN_INT_RET *
+; SI: ds_min_rtn_i32
+; SI: s_endpgm
 define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_min_ret_i32_offset:
-; SI: DS_MIN_RTN_I32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i32_offset:
+; EG: LDS_MIN_INT_RET *
+; SI: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -196,18 +244,20 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_max_ret_i32:
-; SI: DS_MAX_RTN_I32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32:
+; EG: LDS_MAX_INT_RET *
+; SI: ds_max_rtn_i32
+; SI: s_endpgm
 define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_max_ret_i32_offset:
-; SI: DS_MAX_RTN_I32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i32_offset:
+; EG: LDS_MAX_INT_RET *
+; SI: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -215,18 +265,20 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umin_ret_i32:
-; SI: DS_MIN_RTN_U32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32:
+; EG: LDS_MIN_UINT_RET *
+; SI: ds_min_rtn_u32
+; SI: s_endpgm
 define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umin_ret_i32_offset:
-; SI: DS_MIN_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i32_offset:
+; EG: LDS_MIN_UINT_RET *
+; SI: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
@@ -234,21 +286,273 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umax_ret_i32:
-; SI: DS_MAX_RTN_U32
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32:
+; EG: LDS_MAX_UINT_RET *
+; SI: ds_max_rtn_u32
+; SI: s_endpgm
 define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umax_ret_i32_offset:
-; SI: DS_MAX_RTN_U32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 0x10
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i32_offset:
+; EG: LDS_MAX_UINT_RET *
+; SI: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
 define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32:
+; SI: s_load_dword [[SPTR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] [M0]
+; SI: s_endpgm
+define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
+; SI: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; XXX - Is it really necessary to load 4 into VGPR?
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
+; SI: s_load_dword [[SPTR:s[0-9]+]],
+; SI: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
+; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; SI: ds_add_u32 [[VPTR]], [[DATA]] [M0]
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_bad_si_offset
+; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} [M0]
+; CI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 [M0]
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] [M0]
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_inc_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_bad_si_offset:
+; SI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}}
+; CI: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+  %sub = sub i32 %a, %b
+  %add = add i32 %sub, 4
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 %add
+  %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
+; SI: ds_sub_u32
+; SI: s_endpgm
+define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
+; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_dec_u32  v{{[0-9]+}}, [[NEGONE]]
+; SI: s_endpgm
+define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; SI: s_mov_b32 [[SNEGONE:s[0-9]+]], -1
+; SI: v_mov_b32_e32 [[NEGONE:v[0-9]+]], [[SNEGONE]]
+; SI: ds_dec_u32 v{{[0-9]+}}, [[NEGONE]] offset:16
+; SI: s_endpgm
+define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
+; SI: ds_and_b32
+; SI: s_endpgm
+define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
+; SI: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
+; SI: ds_or_b32
+; SI: s_endpgm
+define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
+; SI: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
+; SI: ds_xor_b32
+; SI: s_endpgm
+define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
+; SI: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:
+; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
+; SI: ds_min_i32
+; SI: s_endpgm
+define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
+; SI: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
+; SI: ds_max_i32
+; SI: s_endpgm
+define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
+; SI: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
+; SI: ds_min_u32
+; SI: s_endpgm
+define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
+; SI: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
+; SI: ds_max_u32
+; SI: s_endpgm
+define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
+; SI: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; SI: s_endpgm
+define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}

diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll
index 849b033..ce0cf59 100644
--- a/test/CodeGen/R600/local-atomics64.ll
+++ b/test/CodeGen/R600/local-atomics64.ll

@@ -1,17 +1,17 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
 
-; FUNC-LABEL: @lds_atomic_xchg_ret_i64:
-; SI: DS_WRXCHG_RTN_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64:
+; SI: ds_wrxchg_rtn_b64
+; SI: s_endpgm
 define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xchg_ret_i64_offset:
-; SI: DS_WRXCHG_RTN_B64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
+; SI: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -19,24 +19,24 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_add_ret_i64:
-; SI: DS_ADD_RTN_U64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64:
+; SI: ds_add_rtn_u64
+; SI: s_endpgm
 define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_add_ret_i64_offset:
-; SI: S_LOAD_DWORD [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: S_MOV_B64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
-; SI-DAG: V_MOV_B32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: V_MOV_B32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI-DAG: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[PTR]]
-; SI: DS_ADD_RTN_U64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}, 0x20, [M0]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
@@ -44,22 +44,22 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_inc_ret_i64:
-; SI: S_MOV_B64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
-; SI-DAG: V_MOV_B32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: V_MOV_B32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI: DS_INC_RTN_U64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}},
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64:
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_inc_ret_i64_offset:
-; SI: DS_INC_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
+; SI: ds_inc_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
@@ -67,18 +67,18 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_sub_ret_i64:
-; SI: DS_SUB_RTN_U64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64:
+; SI: ds_sub_rtn_u64
+; SI: s_endpgm
 define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_sub_ret_i64_offset:
-; SI: DS_SUB_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
+; SI: ds_sub_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -86,22 +86,22 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_dec_ret_i64:
-; SI: S_MOV_B64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
-; SI-DAG: V_MOV_B32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
-; SI-DAG: V_MOV_B32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
-; SI: DS_DEC_RTN_U64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}},
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64:
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SI: buffer_store_dwordx2 [[RESULT]],
+; SI: s_endpgm
 define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_dec_ret_i64_offset:
-; SI: DS_DEC_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
+; SI: ds_dec_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
@@ -109,18 +109,18 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_and_ret_i64:
-; SI: DS_AND_RTN_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64:
+; SI: ds_and_rtn_b64
+; SI: s_endpgm
 define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_and_ret_i64_offset:
-; SI: DS_AND_RTN_B64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
+; SI: ds_and_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -128,18 +128,18 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_or_ret_i64:
-; SI: DS_OR_RTN_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64:
+; SI: ds_or_rtn_b64
+; SI: s_endpgm
 define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_or_ret_i64_offset:
-; SI: DS_OR_RTN_B64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
+; SI: ds_or_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -147,18 +147,18 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xor_ret_i64:
-; SI: DS_XOR_RTN_B64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64:
+; SI: ds_xor_rtn_b64
+; SI: s_endpgm
 define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_xor_ret_i64_offset:
-; SI: DS_XOR_RTN_B64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
+; SI: ds_xor_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -167,25 +167,25 @@
 }
 
 ; FIXME: There is no atomic nand instr
-; XFUNC-LABEL: @lds_atomic_nand_ret_i64:uction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this.
 ; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
 ;   store i64 %result, i64 addrspace(1)* %out, align 8
 ;   ret void
 ; }
 
-; FUNC-LABEL: @lds_atomic_min_ret_i64:
-; SI: DS_MIN_RTN_I64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64:
+; SI: ds_min_rtn_i64
+; SI: s_endpgm
 define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_min_ret_i64_offset:
-; SI: DS_MIN_RTN_I64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
+; SI: ds_min_rtn_i64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -193,18 +193,18 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_max_ret_i64:
-; SI: DS_MAX_RTN_I64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64:
+; SI: ds_max_rtn_i64
+; SI: s_endpgm
 define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_max_ret_i64_offset:
-; SI: DS_MAX_RTN_I64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
+; SI: ds_max_rtn_i64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -212,18 +212,18 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umin_ret_i64:
-; SI: DS_MIN_RTN_U64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64:
+; SI: ds_min_rtn_u64
+; SI: s_endpgm
 define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umin_ret_i64_offset:
-; SI: DS_MIN_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
+; SI: ds_min_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
@@ -231,21 +231,243 @@
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umax_ret_i64:
-; SI: DS_MAX_RTN_U64
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64:
+; SI: ds_max_rtn_u64
+; SI: s_endpgm
 define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @lds_atomic_umax_ret_i64_offset:
-; SI: DS_MAX_RTN_U64 {{.*}} 0x20
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
+; SI: ds_max_rtn_u64 {{.*}} offset:32
+; SI: s_endpgm
 define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64:
+; SI: ds_wrxchg_rtn_b64
+; SI: s_endpgm
+define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
+; SI: ds_wrxchg_rtn_b64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64:
+; SI: ds_add_u64
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
+; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; SI: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
+; SI: s_endpgm
+define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i64 4
+  %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64:
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
+; SI: ds_inc_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64:
+; SI: ds_sub_u64
+; SI: s_endpgm
+define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
+; SI: ds_sub_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64:
+; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
+; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
+; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
+; SI: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
+; SI: s_endpgm
+define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
+; SI: ds_dec_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64:
+; SI: ds_and_b64
+; SI: s_endpgm
+define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
+; SI: ds_and_b64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64:
+; SI: ds_or_b64
+; SI: s_endpgm
+define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
+; SI: ds_or_b64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64:
+; SI: ds_xor_b64
+; SI: s_endpgm
+define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
+; SI: ds_xor_b64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FIXME: There is no atomic nand instruction, so we somehow need to expand this.
+; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i64:
+; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i64 4 seq_cst
+;   ret void
+; }
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64:
+; SI: ds_min_i64
+; SI: s_endpgm
+define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
+; SI: ds_min_i64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64:
+; SI: ds_max_i64
+; SI: s_endpgm
+define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
+; SI: ds_max_i64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64:
+; SI: ds_min_u64
+; SI: s_endpgm
+define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
+; SI: ds_min_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64:
+; SI: ds_max_u64
+; SI: s_endpgm
+define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
+; SI: ds_max_u64 {{.*}} offset:32
+; SI: s_endpgm
+define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
+  ret void
+}

diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index e29e4cc..88ef05d 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll

@@ -1,10 +1,11 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=SI %s
+; RUN: llc < %s -march=r600 -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK --check-prefix=CI %s
 
-@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4
-@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4
+@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
+@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 
-; EG-CHECK: @local_memory_two_objects
+; EG-CHECK: {{^}}local_memory_two_objects:
 
 ; Check that the LDS size emitted correctly
 ; EG-CHECK: .long 166120
@@ -17,8 +18,8 @@
 ; this consistently on evergreen GPUs.
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
-; SI-CHECK: DS_WRITE_B32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
-; SI-CHECK-NOT: DS_WRITE_B32 {{v[0-9]*}}, v[[ADDRW]]
+; SI-CHECK: ds_write_b32 {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; SI-CHECK-NOT: ds_write_b32 {{v[0-9]*}}, v[[ADDRW]]
 
 ; GROUP_BARRIER must be the last instruction in a clause
 ; EG-CHECK: GROUP_BARRIER
@@ -28,8 +29,10 @@
 ; constant offsets.
 ; EG-CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
 ; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]], 0x10
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, [[ADDRR]], 0x0,
+; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], 16, v{{[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] [M0]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 [M0]
+; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] [M0]
 
 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
 entry:

diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
index 51af484..9b13cb2 100644
--- a/test/CodeGen/R600/local-memory.ll
+++ b/test/CodeGen/R600/local-memory.ll

@@ -1,32 +1,30 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=CI-CHECK %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
-@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] zeroinitializer, align 4
+@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
 
-; EG-CHECK-LABEL: @local_memory
-; SI-CHECK-LABEL: @local_memory
-; CI-CHECK-LABEL: @local_memory
+; FUNC-LABEL: {{^}}local_memory:
 
 ; Check that the LDS size emitted correctly
-; EG-CHECK: .long 166120
-; EG-CHECK-NEXT: .long 128
-; SI-CHECK: .long 47180
-; SI-CHECK-NEXT: .long 65536
-; CI-CHECK: .long 47180
-; CI-CHECK-NEXT: .long 32768
+; EG: .long 166120
+; EG-NEXT: .long 128
+; SI: .long 47180
+; SI-NEXT: .long 65536
+; CI: .long 47180
+; CI-NEXT: .long 32768
 
-; EG-CHECK: LDS_WRITE
-; SI-CHECK-NOT: S_WQM_B64
-; SI-CHECK: DS_WRITE_B32
+; EG: LDS_WRITE
+; SI-NOT: s_wqm_b64
+; SI: ds_write_b32
 
 ; GROUP_BARRIER must be the last instruction in a clause
-; EG-CHECK: GROUP_BARRIER
-; EG-CHECK-NEXT: ALU clause
-; SI-CHECK: S_BARRIER
+; EG: GROUP_BARRIER
+; EG-NEXT: ALU clause
+; SI: s_barrier
 
-; EG-CHECK: LDS_READ_RET
-; SI-CHECK: DS_READ_B32 {{v[0-9]+}},
+; EG: LDS_READ_RET
+; SI: ds_read_b32 {{v[0-9]+}},
 
 define void @local_memory(i32 addrspace(1)* %out) {
 entry:

diff --git a/test/CodeGen/R600/loop-idiom.ll b/test/CodeGen/R600/loop-idiom.ll
index 128f661..0478bdb 100644
--- a/test/CodeGen/R600/loop-idiom.ll
+++ b/test/CodeGen/R600/loop-idiom.ll

@@ -10,8 +10,8 @@
 ; implementations of these for R600.
 
 ; FUNC: @no_memcpy
-; R600-NOT: @llvm.memcpy
-; SI-NOT: @llvm.memcpy
+; R600-NOT: {{^}}llvm.memcpy
+; SI-NOT: {{^}}llvm.memcpy
 define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
 entry:
   %dest = alloca i8, i32 32
@@ -32,10 +32,10 @@
 }
 
 ; FUNC: @no_memset
-; R600-NOT: @llvm.memset
-; R600-NOT: @memset_pattern16
-; SI-NOT: @llvm.memset
-; SI-NOT: @memset_pattern16
+; R600-NOT: {{^}}llvm.memset
+; R600-NOT: {{^}}memset_pattern16:
+; SI-NOT: {{^}}llvm.memset
+; SI-NOT: {{^}}memset_pattern16:
 define void @no_memset(i32 %size) {
 entry:
   %dest = alloca i8, i32 32

diff --git a/test/CodeGen/R600/lshl.ll b/test/CodeGen/R600/lshl.ll
index 2162839..9785866 100644
--- a/test/CodeGen/R600/lshl.ll
+++ b/test/CodeGen/R600/lshl.ll

@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: S_LSHL_B32 s{{[0-9]}}, s{{[0-9]}}, 1
+;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
 
 define void @test(i32 %p) {
    %i = mul i32 %p, 2

diff --git a/test/CodeGen/R600/lshr.ll b/test/CodeGen/R600/lshr.ll
index 886d1c4..acfc1fd 100644
--- a/test/CodeGen/R600/lshr.ll
+++ b/test/CodeGen/R600/lshr.ll

@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: S_LSHR_B32 s{{[0-9]}}, s{{[0-9]}}, 1
+;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
 
 define void @test(i32 %p) {
    %i = udiv i32 %p, 2

diff --git a/test/CodeGen/R600/m0-spill.ll b/test/CodeGen/R600/m0-spill.ll
new file mode 100644
index 0000000..a8b0e0d
--- /dev/null
+++ b/test/CodeGen/R600/m0-spill.ll

@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+@lds = external addrspace(3) global [64 x float]
+
+; CHECK-LABEL: {{^}}main:
+; CHECK-NOT: v_readlane_b32 m0
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" {
+main_body:
+  %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+  %cmp = fcmp ueq float 0.0, %4
+  br i1 %cmp, label %if, label %else
+
+if:
+  %lds_ptr = getelementptr [64 x float] addrspace(3)* @lds, i32 0, i32 0
+  %lds_data = load float addrspace(3)* %lds_ptr
+  br label %endif
+
+else:
+  %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
+  br label %endif
+
+endif:
+  %export = phi float [%lds_data, %if], [%interp, %else]
+  %5 = call i32 @llvm.SI.packf16(float %export, float %export)
+  %6 = bitcast i32 %5 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %6, float %6, float %6, float %6)
+  ret void
+}
+
+declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
+
+declare i32 @llvm.SI.packf16(float, float) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

diff --git a/test/CodeGen/R600/mad-sub.ll b/test/CodeGen/R600/mad-sub.ll
new file mode 100644
index 0000000..240abd0
--- /dev/null
+++ b/test/CodeGen/R600/mad-sub.ll

@@ -0,0 +1,215 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare float @llvm.fabs.f32(float) #0
+
+; FUNC-LABEL: {{^}}mad_sub_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64 ; 64-bit index for the address computations below
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext ; &ptr[tid]
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1 ; &ptr[tid + 1]
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2 ; &ptr[tid + 2]
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext ; &out[tid]
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %mul = fmul float %a, %b
+  %sub = fsub float %mul, %c ; a * b - c; the checks above expect one v_mad_f32 with the third source negated
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_inv_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64 ; 64-bit index for the address computations below
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext ; &ptr[tid]
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1 ; &ptr[tid + 1]
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2 ; &ptr[tid + 2]
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext ; &out[tid]
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %mul = fmul float %a, %b
+  %sub = fsub float %c, %mul ; c - a * b; the checks above expect one v_mad_f32 with the first source negated
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_f64:
+; SI: v_mul_f64
+; SI: v_add_f64
+define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64 ; 64-bit index for the address computations below
+  %gep0 = getelementptr double addrspace(1)* %ptr, i64 %tid.ext ; &ptr[tid]
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr double addrspace(1)* %ptr, i64 %add1 ; &ptr[tid + 1]
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr double addrspace(1)* %ptr, i64 %add2 ; &ptr[tid + 2]
+  %outgep = getelementptr double addrspace(1)* %out, i64 %tid.ext ; &out[tid]
+  %a = load double addrspace(1)* %gep0, align 8
+  %b = load double addrspace(1)* %gep1, align 8
+  %c = load double addrspace(1)* %gep2, align 8
+  %mul = fmul double %a, %b
+  %sub = fsub double %mul, %c ; a * b - c in f64; the checks above expect a v_mul_f64 followed by a v_add_f64
+  store double %sub, double addrspace(1)* %outgep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_fabs_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64 ; 64-bit index for the address computations below
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext ; &ptr[tid]
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1 ; &ptr[tid + 1]
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2 ; &ptr[tid + 2]
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext ; &out[tid]
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %c.abs = call float @llvm.fabs.f32(float %c) #0 ; |c|
+  %mul = fmul float %a, %b
+  %sub = fsub float %mul, %c.abs ; a * b - |c|; the checks above expect the third v_mad_f32 source as -|reg|
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_sub_fabs_inv_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64 ; 64-bit index for the address computations below
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext ; &ptr[tid]
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1 ; &ptr[tid + 1]
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2 ; &ptr[tid + 2]
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext ; &out[tid]
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %c.abs = call float @llvm.fabs.f32(float %c) #0 ; |c|
+  %mul = fmul float %a, %b
+  %sub = fsub float %c.abs, %mul ; |c| - a * b; the checks above expect -reg as first source and |reg| as third
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}neg_neg_mad_f32:
+; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64 ; 64-bit index for the address computations below
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext ; &ptr[tid]
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1 ; &ptr[tid + 1]
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2 ; &ptr[tid + 2]
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext ; &out[tid]
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %nega = fsub float -0.000000e+00, %a ; -a
+  %negb = fsub float -0.000000e+00, %b ; -b
+  %mul = fmul float %nega, %negb
+  %sub = fadd float %mul, %c ; (-a) * (-b) + c; the check above matches v_mad_f32 with plain register operands
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mad_fabs_sub_f32:
+; SI: buffer_load_dword [[REGA:v[0-9]+]]
+; SI: buffer_load_dword [[REGB:v[0-9]+]]
+; SI: buffer_load_dword [[REGC:v[0-9]+]]
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
+; SI: buffer_store_dword [[RESULT]]
+define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+  %tid.ext = sext i32 %tid to i64 ; 64-bit index for the address computations below
+  %gep0 = getelementptr float addrspace(1)* %ptr, i64 %tid.ext ; &ptr[tid]
+  %add1 = add i64 %tid.ext, 1
+  %gep1 = getelementptr float addrspace(1)* %ptr, i64 %add1 ; &ptr[tid + 1]
+  %add2 = add i64 %tid.ext, 2
+  %gep2 = getelementptr float addrspace(1)* %ptr, i64 %add2 ; &ptr[tid + 2]
+  %outgep = getelementptr float addrspace(1)* %out, i64 %tid.ext ; &out[tid]
+  %a = load float addrspace(1)* %gep0, align 4
+  %b = load float addrspace(1)* %gep1, align 4
+  %c = load float addrspace(1)* %gep2, align 4
+  %b.abs = call float @llvm.fabs.f32(float %b) #0 ; |b|
+  %mul = fmul float %a, %b.abs
+  %sub = fsub float %mul, %c ; a * |b| - c; the checks above expect |reg| as second source and -reg as third
+  store float %sub, float addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fsub_c_fadd_a_a:
+; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid ; NOTE(review): both loads read through %out and %in is unused - confirm intent
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 ; one float past %gep.0
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %add = fadd float %r1, %r1 ; r1 + r1
+  %r3 = fsub float %r2, %add ; r2 - (r1 + r1); the checks above expect v_mad_f32 with the constant -2.0
+
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}fsub_fadd_a_a_c:
+; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], -[[R2]]
+; SI: buffer_store_dword [[RESULT]]
+define void @fsub_fadd_a_a_c(float addrspace(1)* %out, float addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep.0 = getelementptr float addrspace(1)* %out, i32 %tid ; NOTE(review): both loads read through %out and %in is unused - confirm intent
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1 ; one float past %gep.0
+  %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %r1 = load float addrspace(1)* %gep.0
+  %r2 = load float addrspace(1)* %gep.1
+
+  %add = fadd float %r1, %r1 ; r1 + r1
+  %r3 = fsub float %add, %r2 ; (r1 + r1) - r2; the checks above expect v_mad_f32 with 2.0 and a negated third source
+
+  store float %r3, float addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }

diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll
index abb5290..c8dd377 100644
--- a/test/CodeGen/R600/mad_int24.ll
+++ b/test/CodeGen/R600/mad_int24.ll

@@ -2,14 +2,16 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; FUNC-LABEL: @i32_mad24
+declare i32 @llvm.AMDGPU.imul24(i32, i32) nounwind readnone
+
+; FUNC-LABEL: {{^}}i32_mad24:
 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
 ; EG: MULLO_INT
 ; Make sure we aren't masking the inputs.
 ; CM-NOT: AND
 ; CM: MULADD_INT24
-; SI-NOT: AND
-; SI: V_MAD_I32_I24
+; SI-NOT: and
+; SI: v_mad_i32_i24
 define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = shl i32 %a, 8
@@ -21,3 +23,12 @@
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}test_imul24:
+; SI: v_mad_i32_i24
+define void @test_imul24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
+  %mul = call i32 @llvm.AMDGPU.imul24(i32 %src0, i32 %src1) nounwind readnone ; multiply through the imul24 intrinsic
+  %add = add i32 %mul, %src2 ; the add of %src2 is what the check above expects to be folded into v_mad_i32_i24
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
index 0f0893b..b7b32fe 100644
--- a/test/CodeGen/R600/mad_uint24.ll
+++ b/test/CodeGen/R600/mad_uint24.ll

@@ -2,9 +2,9 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; FUNC-LABEL: @u32_mad24
+; FUNC-LABEL: {{^}}u32_mad24:
 ; EG: MULADD_UINT24
-; SI: V_MAD_U32_U24
+; SI: v_mad_u32_u24
 
 define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
@@ -18,14 +18,14 @@
   ret void
 }
 
-; FUNC-LABEL: @i16_mad24
+; FUNC-LABEL: {{^}}i16_mad24:
 ; The order of A and B does not matter.
 ; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
 ; The result must be sign-extended
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
 ; EG: 16
-; SI: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
+; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
 
 define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
 entry:
@@ -36,13 +36,13 @@
   ret void
 }
 
-; FUNC-LABEL: @i8_mad24
+; FUNC-LABEL: {{^}}i8_mad24:
 ; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
 ; The result must be sign-extended
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
 ; EG: 8
-; SI: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
+; SI: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
 
 define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
 entry:
@@ -60,9 +60,9 @@
 ; 24-bit mad pattern wasn't being matched.
 
 ; Check that the select instruction is not deleted.
-; FUNC-LABEL: @i24_i32_i32_mad
+; FUNC-LABEL: {{^}}i24_i32_i32_mad:
 ; EG: CNDE_INT
-; SI: V_CNDMASK
+; SI: v_cndmask
 define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:
   %0 = ashr i32 %a, 8

diff --git a/test/CodeGen/R600/max-literals.ll b/test/CodeGen/R600/max-literals.ll
index 65a6d2b..c357524 100644
--- a/test/CodeGen/R600/max-literals.ll
+++ b/test/CodeGen/R600/max-literals.ll

@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @main
+; CHECK-LABEL: {{^}}main:
 ; CHECK: ADD *
 
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
@@ -29,7 +29,7 @@
   ret void
 }
 
-; CHECK: @main
+; CHECK-LABEL: {{^}}main2:
 ; CHECK-NOT: ADD *
 
 define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {

diff --git a/test/CodeGen/R600/max.ll b/test/CodeGen/R600/max.ll
new file mode 100644
index 0000000..d67ef47
--- /dev/null
+++ b/test/CodeGen/R600/max.ll

@@ -0,0 +1,99 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
+; SI: v_max_i32_e32
+define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp sge i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; signed max of the two loaded values, written as sge compare + select
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imax_sge_i32:
+; SI: s_max_i32
+define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp sge i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; signed max of the two i32 arguments, written as sge compare + select
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_imax_sgt_i32:
+; SI: v_max_i32_e32
+define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp sgt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; signed max of the two loaded values, written as sgt compare + select
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imax_sgt_i32:
+; SI: s_max_i32
+define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp sgt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; signed max of the two i32 arguments, written as sgt compare + select
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_umax_uge_i32:
+; SI: v_max_u32_e32
+define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp uge i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; unsigned max of the two loaded values, written as uge compare + select
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_umax_uge_i32:
+; SI: s_max_u32
+define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp uge i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; unsigned max of the two i32 arguments, written as uge compare + select
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_umax_ugt_i32:
+; SI: v_max_u32_e32
+define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp ugt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; unsigned max of the two loaded values, written as ugt compare + select
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_umax_ugt_i32:
+; SI: s_max_u32
+define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp ugt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; unsigned max of the two i32 arguments, written as ugt compare + select
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/max3.ll b/test/CodeGen/R600/max3.ll
new file mode 100644
index 0000000..74b08f6
--- /dev/null
+++ b/test/CodeGen/R600/max3.ll

@@ -0,0 +1,41 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}v_test_imax3_sgt_i32:
+; SI: v_max3_i32
+define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid ; &cptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; NOTE(review): never used, the store below targets %out - confirm intent
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp sgt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b ; signed max(a, b)
+  %icmp1 = icmp sgt i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c ; signed max(max(a, b), c); the check above expects a single v_max3_i32
+  store i32 %i1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_umax3_ugt_i32:
+; SI: v_max3_u32
+define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid ; &cptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; NOTE(review): never used, the store below targets %out - confirm intent
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp ugt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b ; unsigned max(a, b)
+  %icmp1 = icmp ugt i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c ; unsigned max(max(a, b), c); the check above expects a single v_max3_u32
+  store i32 %i1, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/min.ll b/test/CodeGen/R600/min.ll
new file mode 100644
index 0000000..88c0dff
--- /dev/null
+++ b/test/CodeGen/R600/min.ll

@@ -0,0 +1,99 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
+; SI: v_min_i32_e32
+define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp sle i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; signed min of the two loaded values, written as sle compare + select
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imin_sle_i32:
+; SI: s_min_i32
+define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp sle i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; signed min of the two i32 arguments, written as sle compare + select
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_imin_slt_i32:
+; SI: v_min_i32_e32
+define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp slt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; signed min of the two loaded values, written as slt compare + select
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_imin_slt_i32:
+; SI: s_min_i32
+define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp slt i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; signed min of the two i32 arguments, written as slt compare + select
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_umin_ule_i32:
+; SI: v_min_u32_e32
+define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp ule i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; unsigned min of the two loaded values, written as ule compare + select
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_umin_ule_i32:
+; SI: s_min_u32
+define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp ule i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; unsigned min of the two i32 arguments, written as ule compare + select
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_umin_ult_i32:
+; SI: v_min_u32_e32
+define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %cmp = icmp ult i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; unsigned min of the two loaded values, written as ult compare + select
+  store i32 %val, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_test_umin_ult_i32:
+; SI: s_min_u32
+define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %val = select i1 %cmp, i32 %a, i32 %b ; unsigned min of the two i32 arguments, written as ult compare + select
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/min3.ll b/test/CodeGen/R600/min3.ll
new file mode 100644
index 0000000..f852cff
--- /dev/null
+++ b/test/CodeGen/R600/min3.ll

@@ -0,0 +1,111 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; FUNC-LABEL: {{^}}v_test_imin3_slt_i32:
+; SI: v_min3_i32
+define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid ; &cptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp slt i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b ; signed min(a, b)
+  %icmp1 = icmp slt i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c ; signed min(min(a, b), c); the check above expects a single v_min3_i32
+  store i32 %i1, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_umin3_ult_i32:
+; SI: v_min3_u32
+define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid ; &aptr[tid]
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid ; &bptr[tid]
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid ; &cptr[tid]
+  %outgep = getelementptr i32 addrspace(1)* %out, i32 %tid ; &out[tid]
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %icmp0 = icmp ult i32 %a, %b
+  %i0 = select i1 %icmp0, i32 %a, i32 %b ; unsigned min(a, b)
+  %icmp1 = icmp ult i32 %i0, %c
+  %i1 = select i1 %icmp1, i32 %i0, i32 %c ; unsigned min(min(a, b), c); the check above expects a single v_min3_u32
+  store i32 %i1, i32 addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_umin_umin_umin:
+; SI: v_min_i32
+; SI: v_min3_i32
+define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %tid2 = mul i32 %tid, 2 ; second index, 2 * tid
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+
+  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
+  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2 ; %gep4, %gep5 and %outgep0 are never used in this function
+  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
+
+  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
+
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %d = load i32 addrspace(1)* %gep3, align 4
+
+  %icmp0 = icmp slt i32 %a, %b ; NOTE(review): compares are signed and the checks name signed mins despite the umin function name - confirm intent
+  %i0 = select i1 %icmp0, i32 %a, i32 %b ; min(a, b)
+
+  %icmp1 = icmp slt i32 %c, %d
+  %i1 = select i1 %icmp1, i32 %c, i32 %d ; min(c, d)
+
+  %icmp2 = icmp slt i32 %i0, %i1
+  %i2 = select i1 %icmp2, i32 %i0, i32 %i1 ; min(min(a, b), min(c, d)); the checks above expect one v_min_i32 plus one v_min3_i32
+
+  store i32 %i2, i32 addrspace(1)* %outgep1, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_umin3_2_uses:
+; SI-NOT: v_min3
+define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %tid2 = mul i32 %tid, 2 ; second index, 2 * tid
+  %gep0 = getelementptr i32 addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr i32 addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr i32 addrspace(1)* %cptr, i32 %tid
+
+  %gep3 = getelementptr i32 addrspace(1)* %aptr, i32 %tid2
+  %gep4 = getelementptr i32 addrspace(1)* %bptr, i32 %tid2 ; %gep4 and %gep5 are never used in this function
+  %gep5 = getelementptr i32 addrspace(1)* %cptr, i32 %tid2
+
+  %outgep0 = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %outgep1 = getelementptr i32 addrspace(1)* %out, i32 %tid2
+
+  %a = load i32 addrspace(1)* %gep0, align 4
+  %b = load i32 addrspace(1)* %gep1, align 4
+  %c = load i32 addrspace(1)* %gep2, align 4
+  %d = load i32 addrspace(1)* %gep3, align 4
+
+  %icmp0 = icmp slt i32 %a, %b ; NOTE(review): compares are signed despite the umin function name - confirm intent
+  %i0 = select i1 %icmp0, i32 %a, i32 %b ; min(a, b); used by %i2 and also stored directly below
+
+  %icmp1 = icmp slt i32 %c, %d
+  %i1 = select i1 %icmp1, i32 %c, i32 %d ; NOTE(review): %i1 is never used - confirm intent
+
+  %icmp2 = icmp slt i32 %i0, %c
+  %i2 = select i1 %icmp2, i32 %i0, i32 %c ; min(min(a, b), c)
+
+  store i32 %i2, i32 addrspace(1)* %outgep0, align 4
+  store i32 %i0, i32 addrspace(1)* %outgep1, align 4 ; second use of %i0; the negative check above requires that no v_min3 is emitted
+  ret void
+}

diff --git a/test/CodeGen/R600/missing-store.ll b/test/CodeGen/R600/missing-store.ll
new file mode 100644
index 0000000..5346046
--- /dev/null
+++ b/test/CodeGen/R600/missing-store.ll

@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+@ptr_load = addrspace(3) global i32 addrspace(2)* undef, align 8
+
+; Make sure when the load from %ptr2 is folded the chain isn't lost,
+; resulting in losing the store to gptr
+
+; FUNC-LABEL: {{^}}missing_store_reduced:
+; SI: ds_read_b64
+; SI: buffer_store_dword
+; SI: buffer_load_dword
+; SI: buffer_store_dword
+; SI: s_endpgm
+define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 ; read a pointer value out of the addrspace(3) global
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2 ; address of element 2 behind the loaded pointer
+
+  store i32 99, i32 addrspace(1)* %gptr, align 4 ; this store precedes the load below; the checks above require store, load, store in that order
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+

diff --git a/test/CodeGen/R600/mubuf.ll b/test/CodeGen/R600/mubuf.ll
index f465d3d..c2efda4 100644
--- a/test/CodeGen/R600/mubuf.ll
+++ b/test/CodeGen/R600/mubuf.ll

@@ -1,12 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.r600.read.tidig.x() readnone
 
 ;;;==========================================================================;;;
 ;;; MUBUF LOAD TESTS
 ;;;==========================================================================;;;
 
 ; MUBUF load with an immediate byte offset that fits into 12-bits
-; CHECK-LABEL: @mubuf_load0
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
+; CHECK-LABEL: {{^}}mubuf_load0:
+; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x30,0xe0
 define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 1
@@ -16,8 +18,8 @@
 }
 
 ; MUBUF load with the largest possible immediate offset
-; CHECK-LABEL: @mubuf_load1
-; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0xfff ; encoding: [0xff,0x8f
+; CHECK-LABEL: {{^}}mubuf_load1:
+; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x20,0xe0
 define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i8 addrspace(1)* %in, i64 4095
@@ -27,8 +29,8 @@
 }
 
 ; MUBUF load with an immediate byte offset that doesn't fit into 12-bits
-; CHECK-LABEL: @mubuf_load2
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x0 ; encoding: [0x00,0x80
+; CHECK-LABEL: {{^}}mubuf_load2:
+; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 ; encoding: [0x00,0x80,0x30,0xe0
 define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 1024
@@ -38,9 +40,9 @@
 }
 
 ; MUBUF load with a 12-bit immediate offset and a register offset
-; CHECK-LABEL: @mubuf_load3
+; CHECK-LABEL: {{^}}mubuf_load3:
 ; CHECK-NOT: ADD
-; CHECK: BUFFER_LOAD_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
+; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x30,0xe0
 define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %in, i64 %offset
@@ -55,8 +57,8 @@
 ;;;==========================================================================;;;
 
 ; MUBUF store with an immediate byte offset that fits into 12-bits
-; CHECK-LABEL: @mubuf_store0
-; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
+; CHECK-LABEL: {{^}}mubuf_store0:
+; CHECK: buffer_store_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0x4 ; encoding: [0x04,0x00,0x70,0xe0
 define void @mubuf_store0(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %out, i64 1
@@ -65,8 +67,8 @@
 }
 
 ; MUBUF store with the largest possible immediate offset
-; CHECK-LABEL: @mubuf_store1
-; CHECK: BUFFER_STORE_BYTE v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0xfff ; encoding: [0xff,0x8f
+; CHECK-LABEL: {{^}}mubuf_store1:
+; CHECK: buffer_store_byte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:0xfff ; encoding: [0xff,0x0f,0x60,0xe0
 
 define void @mubuf_store1(i8 addrspace(1)* %out) {
 entry:
@@ -76,8 +78,8 @@
 }
 
 ; MUBUF store with an immediate byte offset that doesn't fit into 12-bits
-; CHECK-LABEL: @mubuf_store2
-; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x0 ; encoding: [0x00,0x80
+; CHECK-LABEL: {{^}}mubuf_store2:
+; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]:[0-9]}}], 0 addr64 ; encoding: [0x00,0x80,0x70,0xe0
 define void @mubuf_store2(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %out, i64 1024
@@ -86,9 +88,9 @@
 }
 
 ; MUBUF store with a 12-bit immediate offset and a register offset
-; CHECK-LABEL: @mubuf_store3
+; CHECK-LABEL: {{^}}mubuf_store3:
 ; CHECK-NOT: ADD
-; CHECK: BUFFER_STORE_DWORD v{{[0-9]}}, s[{{[0-9]:[0-9]}}] + v[{{[0-9]:[0-9]}}] + 0x4 ; encoding: [0x04,0x80
+; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:0x4 ; encoding: [0x04,0x80,0x70,0xe0
 define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
 entry:
   %0 = getelementptr i32 addrspace(1)* %out, i64 %offset
@@ -96,3 +98,35 @@
   store i32 0, i32 addrspace(1)* %1
   ret void
 }
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr:
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0
+define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
+  store i32 99, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
+; CHECK: buffer_store_dword v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:0x28
+define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 10
+  store i32 99, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 32768
+  store i32 99, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}store_vgpr_ptr:
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  store i32 99, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index d231e92..be5d6a0 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll

@@ -3,14 +3,14 @@
 
 ; mul24 and mad24 are affected
 
-; FUNC-LABEL: @test2
+; FUNC-LABEL: {{^}}test_mul_v2i32:
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32> addrspace(1) * %in
   %b = load <2 x i32> addrspace(1) * %b_ptr
@@ -19,18 +19,18 @@
   ret void
 }
 
-; FUNC-LABEL: @test4
+; FUNC-LABEL: {{^}}v_mul_v4i32:
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32> addrspace(1) * %in
   %b = load <4 x i32> addrspace(1) * %b_ptr
@@ -39,12 +39,26 @@
   ret void
 }
 
-; FUNC-LABEL: @trunc_i64_mul_to_i32
-; SI: S_LOAD_DWORD
-; SI: S_LOAD_DWORD
-; SI: V_MUL_LO_I32
-; SI: BUFFER_STORE_DWORD
-define void @trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+; FUNC-LABEL: {{^}}s_trunc_i64_mul_to_i32:
+; SI: s_load_dword
+; SI: s_load_dword
+; SI: s_mul_i32
+; SI: buffer_store_dword
+define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+  %mul = mul i64 %b, %a
+  %trunc = trunc i64 %mul to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_trunc_i64_mul_to_i32:
+; SI: s_load_dword
+; SI: s_load_dword
+; SI: v_mul_lo_i32
+; SI: buffer_store_dword
+define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
   %mul = mul i64 %b, %a
   %trunc = trunc i64 %mul to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -53,11 +67,11 @@
 
 ; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
 ; 32-bits of both arguments are sign bits.
-; FUNC-LABEL: @mul64_sext_c
+; FUNC-LABEL: {{^}}mul64_sext_c:
 ; EG-DAG: MULLO_INT
 ; EG-DAG: MULHI_INT
-; SI-DAG: V_MUL_LO_I32
-; SI-DAG: V_MUL_HI_I32
+; SI-DAG: s_mul_i32
+; SI-DAG: v_mul_hi_i32
 define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = sext i32 %in to i64
@@ -66,16 +80,120 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_mul64_sext_c:
+; EG-DAG: MULLO_INT
+; EG-DAG: MULHI_INT
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_i32
+; SI: s_endpgm
+define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %val = load i32 addrspace(1)* %in, align 4
+  %ext = sext i32 %val to i64
+  %mul = mul i64 %ext, 80
+  store i64 %mul, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul64_sext_inline_imm:
+; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
+; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, 9, v{{[0-9]+}}
+; SI: s_endpgm
+define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %val = load i32 addrspace(1)* %in, align 4
+  %ext = sext i32 %val to i64
+  %mul = mul i64 %ext, 9
+  store i64 %mul, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_mul_i32:
+; SI: s_load_dword [[SRC0:s[0-9]+]],
+; SI: s_load_dword [[SRC1:s[0-9]+]],
+; SI: s_mul_i32 [[SRESULT:s[0-9]+]], [[SRC0]], [[SRC1]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
+; SI: s_endpgm
+define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %mul = mul i32 %a, %b
+  store i32 %mul, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul_i32:
+; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %a = load i32 addrspace(1)* %in
+  %b = load i32 addrspace(1)* %b_ptr
+  %result = mul i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
 ; A standard 64-bit multiply.  The expansion should be around 6 instructions.
 ; It would be difficult to match the expansion correctly without writing
 ; a really complicated list of FileCheck expressions.  I don't want
 ; to confuse people who may 'break' this test with a correct optimization,
 ; so this test just uses FUNC-LABEL to make sure the compiler does not
 ; crash with a 'failed to select' error.
-; FUNC-LABEL: @mul64
-define void @mul64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+
+; FUNC-LABEL: {{^}}s_mul_i64:
+define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+  %mul = mul i64 %a, %b
+  store i64 %mul, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_mul_i64:
+; SI: v_mul_lo_i32
+define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
+  %mul = mul i64 %a, %b
+  store i64 %mul, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mul32_in_branch:
+; SI: s_mul_i32
+define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
 entry:
-  %0 = mul i64 %a, %b
-  store i64 %0, i64 addrspace(1)* %out
+  %0 = icmp eq i32 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i32 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = mul i32 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i32 [%1, %if], [%2, %else]
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}mul64_in_branch:
+; SI-DAG: s_mul_i32
+; SI-DAG: v_mul_hi_u32
+; SI: s_endpgm
+define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = mul i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
   ret void
 }

diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll
index 046911b..be58f7e 100644
--- a/test/CodeGen/R600/mul_int24.ll
+++ b/test/CodeGen/R600/mul_int24.ll

@@ -2,14 +2,14 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; FUNC-LABEL: @i32_mul24
+; FUNC-LABEL: {{^}}i32_mul24:
 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
 ; EG: MULLO_INT
 ; Make sure we are not masking the inputs
 ; CM-NOT: AND
 ; CM: MUL_INT24
-; SI-NOT: AND
-; SI: V_MUL_I32_I24
+; SI-NOT: and
+; SI: v_mul_i32_i24
 define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = shl i32 %a, 8

diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
index 419f275..8d1cda8 100644
--- a/test/CodeGen/R600/mul_uint24.ll
+++ b/test/CodeGen/R600/mul_uint24.ll

@@ -2,9 +2,9 @@
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
-; FUNC-LABEL: @u32_mul24
+; FUNC-LABEL: {{^}}u32_mul24:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI: V_MUL_U32_U24
+; SI: v_mul_u32_u24
 
 define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
@@ -17,13 +17,13 @@
   ret void
 }
 
-; FUNC-LABEL: @i16_mul24
+; FUNC-LABEL: {{^}}i16_mul24:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; The result must be sign-extended
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
 ; EG: 16
-; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
+; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
 define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %0 = mul i16 %a, %b
@@ -32,12 +32,12 @@
   ret void
 }
 
-; FUNC-LABEL: @i8_mul24
+; FUNC-LABEL: {{^}}i8_mul24:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; The result must be sign-extended
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
-; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
+; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
 
 define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
 entry:
@@ -48,12 +48,12 @@
 }
 
 ; Multiply with 24-bit inputs and 64-bit output
-; FUNC_LABEL: @mul24_i64
+; FUNC-LABEL: {{^}}mul24_i64:
 ; EG; MUL_UINT24
 ; EG: MULHI
-; SI: V_MUL_U32_U24
+; SI: v_mul_u32_u24
 ; FIXME: SI support 24-bit mulhi
-; SI: V_MUL_HI_U32
+; SI: v_mul_hi_u32
 define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = shl i64 %a, 40

diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll
index 8640127..82a0783 100644
--- a/test/CodeGen/R600/mulhu.ll
+++ b/test/CodeGen/R600/mulhu.ll

@@ -1,8 +1,8 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_MOV_B32_e32 v{{[0-9]+}}, 0xaaaaaaab
-;CHECK: V_MUL_HI_U32 v0, {{[sv][0-9]+}}, {{v[0-9]+}}
-;CHECK-NEXT: V_LSHRREV_B32_e32 v0, 1, v0
+;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
+;CHECK: v_mul_hi_u32 v0, {{[sv][0-9]+}}, {{v[0-9]+}}
+;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
 
 define void @test(i32 %p) {
    %i = udiv i32 %p, 3

diff --git a/test/CodeGen/R600/no-initializer-constant-addrspace.ll b/test/CodeGen/R600/no-initializer-constant-addrspace.ll
index ab82e7e..cd2dca3 100644
--- a/test/CodeGen/R600/no-initializer-constant-addrspace.ll
+++ b/test/CodeGen/R600/no-initializer-constant-addrspace.ll

@@ -3,7 +3,7 @@
 
 @extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
 
-; FUNC-LABEL: @load_extern_const_init
+; FUNC-LABEL: {{^}}load_extern_const_init:
 define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
   %val = load i32 addrspace(2)* getelementptr ([5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -12,7 +12,7 @@
 
 @undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
 
-; FUNC-LABEL: @load_undef_const_init
+; FUNC-LABEL: {{^}}load_undef_const_init:
 define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
   %val = load i32 addrspace(2)* getelementptr ([5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4

diff --git a/test/CodeGen/R600/operand-spacing.ll b/test/CodeGen/R600/operand-spacing.ll
new file mode 100644
index 0000000..f0d228d
--- /dev/null
+++ b/test/CodeGen/R600/operand-spacing.ll

@@ -0,0 +1,15 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
+
+; Make sure there isn't an extra space between the instruction name and first operands.
+
+; SI-LABEL: {{^}}add_f32:
+; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
+; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
+; SI: buffer_store_dword [[RESULT]],
+define void @add_f32(float addrspace(1)* %out, float %a, float %b) {
+  %result = fadd float %a, %b
+  store float %result, float addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 91a70b7..b7493d3 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll

@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
 
-; EG-LABEL: @or_v2i32
+; EG-LABEL: {{^}}or_v2i32:
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI-LABEL: @or_v2i32
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI-LABEL: {{^}}or_v2i32:
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@
   ret void
 }
 
-; EG-LABEL: @or_v4i32
+; EG-LABEL: {{^}}or_v4i32:
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI-LABEL: @or_v4i32
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI-LABEL: {{^}}or_v4i32:
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,16 +39,16 @@
   ret void
 }
 
-; SI-LABEL: @scalar_or_i32
-; SI: S_OR_B32
+; SI-LABEL: {{^}}scalar_or_i32:
+; SI: s_or_b32
 define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %or = or i32 %a, %b
   store i32 %or, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: @vector_or_i32
-; SI: V_OR_B32_e32 v{{[0-9]}}
+; SI-LABEL: {{^}}vector_or_i32:
+; SI: v_or_b32_e32 v{{[0-9]}}
 define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
   %loada = load i32 addrspace(1)* %a
   %or = or i32 %loada, %b
@@ -56,20 +56,46 @@
   ret void
 }
 
-; EG-LABEL: @scalar_or_i64
+; SI-LABEL: {{^}}scalar_or_literal_i32:
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
+define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
+  %or = or i32 %a, 99999
+  store i32 %or, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}vector_or_literal_i32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+  %loada = load i32 addrspace(1)* %a, align 4
+  %or = or i32 %loada, 65535
+  store i32 %or, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}vector_or_inline_immediate_i32:
+; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
+define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+  %loada = load i32 addrspace(1)* %a, align 4
+  %or = or i32 %loada, 4
+  store i32 %or, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-LABEL: {{^}}scalar_or_i64:
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; SI-LABEL: @scalar_or_i64
-; SI: S_OR_B64
+; SI-LABEL: {{^}}scalar_or_i64:
+; SI: s_or_b64
 define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, %b
   store i64 %or, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: @vector_or_i64
-; SI: V_OR_B32_e32 v{{[0-9]}}
-; SI: V_OR_B32_e32 v{{[0-9]}}
+; SI-LABEL: {{^}}vector_or_i64:
+; SI: v_or_b32_e32 v{{[0-9]}}
+; SI: v_or_b32_e32 v{{[0-9]}}
 define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64 addrspace(1)* %a, align 8
   %loadb = load i64 addrspace(1)* %a, align 8
@@ -78,9 +104,9 @@
   ret void
 }
 
-; SI-LABEL: @scalar_vector_or_i64
-; SI: V_OR_B32_e32 v{{[0-9]}}
-; SI: V_OR_B32_e32 v{{[0-9]}}
+; SI-LABEL: {{^}}scalar_vector_or_i64:
+; SI: v_or_b32_e32 v{{[0-9]}}
+; SI: v_or_b32_e32 v{{[0-9]}}
 define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
   %loada = load i64 addrspace(1)* %a
   %or = or i64 %loada, %b
@@ -88,13 +114,13 @@
   ret void
 }
 
-; SI-LABEL: @vector_or_i64_loadimm
-; SI-DAG: S_MOV_B32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
-; SI-DAG: S_MOV_B32 [[HI_S_IMM:s[0-9]+]], 0x146f
-; SI-DAG: BUFFER_LOAD_DWORDX2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
-; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
-; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}vector_or_i64_loadimm:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xdf77987f
+; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x146f
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
 define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 22470723082367
@@ -103,11 +129,11 @@
 }
 
 ; FIXME: The or 0 should really be removed.
-; SI-LABEL: @vector_or_i64_imm
-; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
-; SI: V_OR_B32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
-; SI: V_OR_B32_e32 {{v[0-9]+}}, 0, {{.*}}
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}vector_or_i64_imm:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI: v_or_b32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
+; SI: v_or_b32_e32 {{v[0-9]+}}, 0, {{.*}}
+; SI: s_endpgm
 define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 8
@@ -115,15 +141,31 @@
   ret void
 }
 
-; SI-LABEL: @trunc_i64_or_to_i32
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
-; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
-; SI: S_OR_B32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
-; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; SI: BUFFER_STORE_DWORD [[VRESULT]],
+; SI-LABEL: {{^}}trunc_i64_or_to_i32:
+; SI: s_load_dword s[[SREG0:[0-9]+]]
+; SI: s_load_dword s[[SREG1:[0-9]+]]
+; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
+; SI: buffer_store_dword [[VRESULT]],
 define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = or i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
   ret void
 }
+
+; EG-CHECK: {{^}}or_i1:
+; EG-CHECK: OR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+
+; SI-CHECK: {{^}}or_i1:
+; SI-CHECK: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+define void @or_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+  %a = load float addrspace(1) * %in0
+  %b = load float addrspace(1) * %in1
+  %acmp = fcmp oge float %a, 0.000000e+00
+  %bcmp = fcmp oge float %b, 0.000000e+00
+  %or = or i1 %acmp, %bcmp
+  %result = select i1 %or, float %a, float %b
+  store float %result, float addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/packetizer.ll b/test/CodeGen/R600/packetizer.ll
index 0a405c5..49a7c0d 100644
--- a/test/CodeGen/R600/packetizer.ll
+++ b/test/CodeGen/R600/packetizer.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
 
-; CHECK: @test
+; CHECK: {{^}}test:
 ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.X
 ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Y
 ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z

diff --git a/test/CodeGen/R600/parallelandifcollapse.ll b/test/CodeGen/R600/parallelandifcollapse.ll
index 8a269e0..82b1150 100644
--- a/test/CodeGen/R600/parallelandifcollapse.ll
+++ b/test/CodeGen/R600/parallelandifcollapse.ll

@@ -1,5 +1,5 @@
 ; Function Attrs: nounwind
-; RUN: llc < %s -march=r600 -mcpu=redwood  | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s
 ;
 ; CFG flattening should use parallel-and mode to generate branch conditions and
 ; then merge if-regions with the same bodies.
@@ -11,7 +11,6 @@
 ; FIXME: For some reason having the allocas here allowed the flatten cfg pass
 ; to do its transfomation, however now that we are using local memory for
 ; allocas, the transformation isn't happening.
-; XFAIL: *
 
 define void @_Z9chk1D_512v() #0 {
 entry:

diff --git a/test/CodeGen/R600/predicate-dp4.ll b/test/CodeGen/R600/predicate-dp4.ll
index e48d6a7..6bc1875 100644
--- a/test/CodeGen/R600/predicate-dp4.ll
+++ b/test/CodeGen/R600/predicate-dp4.ll

@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman
 
-; CHECK-LABEL: @main
+; CHECK-LABEL: {{^}}main:
 ; CHECK: PRED_SETE_INT * Pred,
 ; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one
 define void @main(<4 x float> inreg) #0 {

diff --git a/test/CodeGen/R600/predicates.ll b/test/CodeGen/R600/predicates.ll
index 902508f..0ce74d9 100644
--- a/test/CodeGen/R600/predicates.ll
+++ b/test/CodeGen/R600/predicates.ll

@@ -3,7 +3,7 @@
 ; These tests make sure the compiler is optimizing branches using predicates
 ; when it is legal to do so.
 
-; CHECK: @simple_if
+; CHECK: {{^}}simple_if:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
@@ -21,7 +21,7 @@
   ret void
 }
 
-; CHECK: @simple_if_else
+; CHECK: {{^}}simple_if_else:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
@@ -44,7 +44,7 @@
   ret void
 }
 
-; CHECK: @nested_if
+; CHECK: {{^}}nested_if:
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK: JUMP
 ; CHECK: POP
@@ -71,7 +71,7 @@
   ret void
 }
 
-; CHECK: @nested_if_else
+; CHECK: {{^}}nested_if_else:
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK: JUMP
 ; CHECK: POP

diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
index 89122be..bfb4a6a 100644
--- a/test/CodeGen/R600/private-memory.ll
+++ b/test/CodeGen/R600/private-memory.ll

@@ -1,19 +1,23 @@
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-; FUNC-LABEL: @mova_same_clause
+; FUNC-LABEL: {{^}}mova_same_clause:
 
-; R600-CHECK: LDS_WRITE
-; R600-CHECK: LDS_WRITE
-; R600-CHECK: LDS_READ
-; R600-CHECK: LDS_READ
+; R600: LDS_WRITE
+; R600: LDS_WRITE
+; R600: LDS_READ
+; R600: LDS_READ
 
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_READ_B32
-; SI-CHECK: DS_READ_B32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_write_b32
+; SI-PROMOTE: ds_read_b32
+; SI-PROMOTE: ds_read_b32
+
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
+; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [5 x i32], align 4
@@ -41,9 +45,10 @@
 ; XXX: This generated code has unnecessary MOVs, we should be able to optimize
 ; this.
 
-; FUNC-LABEL: @multiple_structs
-; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-NOT: V_MOVREL
+; FUNC-LABEL: {{^}}multiple_structs:
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
+; SI-NOT: v_movrel
 %struct.point = type { i32, i32 }
 
 define void @multiple_structs(i32 addrspace(1)* %out) {
@@ -71,9 +76,9 @@
 ; loads and stores should be lowered to copies, so there shouldn't be any
 ; MOVA instructions.
 
-; FUNC-LABEL: @direct_loop
-; R600-CHECK-NOT: MOVA_INT
-; SI-CHECK-NOT: V_MOVREL
+; FUNC-LABEL: {{^}}direct_loop:
+; R600-NOT: MOVA_INT
+; SI-NOT: v_movrel
 
 define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
@@ -107,11 +112,13 @@
   ret void
 }
 
-; FUNC-LABEL: @short_array
+; FUNC-LABEL: {{^}}short_array:
 
-; R600-CHECK: MOVA_INT
+; R600: MOVA_INT
 
-; SI-CHECK: V_MOVRELS_B32_e32
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
+; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x2 ; encoding: [0x02,0x10,0x68,0xe0
+; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 define void @short_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [2 x i16]
@@ -126,12 +133,12 @@
   ret void
 }
 
-; FUNC-LABEL: @char_array
+; FUNC-LABEL: {{^}}char_array:
 
-; R600-CHECK: MOVA_INT
+; R600: MOVA_INT
 
-; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
-; SI-CHECK: V_MOVRELS_B32_e32
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
+; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x1 ; encoding: [0x01,0x10,0x60,0xe0
 define void @char_array(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = alloca [2 x i8]
@@ -149,12 +156,12 @@
 
 ; Make sure we don't overwrite workitem information with private memory
 
-; FUNC-LABEL: @work_item_info
-; R600-CHECK-NOT: MOV T0.X
+; FUNC-LABEL: {{^}}work_item_info:
+; R600-NOT: MOV T0.X
 ; Additional check in case the move ends up in the last slot
-; R600-CHECK-NOT: MOV * TO.X
+; R600-NOT: MOV * T0.X
 
-; SI-CHECK-NOT: V_MOV_B32_e{{(32|64)}} v0
+; SI-NOT: v_mov_b32_e{{(32|64)}} v0
 define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [2 x i32]
@@ -172,11 +179,11 @@
 
 ; Test that two stack objects are not stored in the same register
 ; The second stack object should be in T3.X
-; FUNC-LABEL: @no_overlap
+; FUNC-LABEL: {{^}}no_overlap:
 ; R600_CHECK: MOV
 ; R600_CHECK: [[CHAN:[XYZW]]]+
-; R600-CHECK-NOT: [[CHAN]]+
-; SI-CHECK: V_MOV_B32_e32 v3
+; R600-NOT: [[CHAN]]+
+; SI: v_mov_b32_e32 v3
 define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [3 x i8], align 1
@@ -283,3 +290,22 @@
   ret void
 }
 
+; AMDGPUPromoteAlloca does not know how to handle ptrtoint.  When it
+; finds one, it should stop trying to promote.
+
+; FUNC-LABEL: {{^}}ptrtoint:
+; SI-NOT: ds_write
+; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
+; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:0x5
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+  %alloca = alloca [16 x i32]
+  %tmp0 = getelementptr [16 x i32]* %alloca, i32 0, i32 %a
+  store i32 5, i32* %tmp0
+  %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+  %tmp2 = add i32 %tmp1, 5
+  %tmp3 = inttoptr i32 %tmp2 to i32*
+  %tmp4 = getelementptr i32* %tmp3, i32 %b
+  %tmp5 = load i32* %tmp4
+  store i32 %tmp5, i32 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
index 55eb56d..1908f15 100644
--- a/test/CodeGen/R600/pv.ll
+++ b/test/CodeGen/R600/pv.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 | FileCheck %s
 
-;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
-;CHECK: MAX T{{[0-9].[XYZW]}}, PV.X, 0.0
+; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
+; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
 
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
 main_body:

diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll
index b760c88..112cdac 100644
--- a/test/CodeGen/R600/r600-encoding.ll
+++ b/test/CodeGen/R600/r600-encoding.ll

@@ -4,10 +4,10 @@
 ; The earliest R600 GPUs have a slightly different encoding than the rest of
 ; the VLIW4/5 GPUs.
 
-; EG-CHECK: @test
+; EG-CHECK: {{^}}test:
 ; EG-CHECK: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}]
 
-; R600-CHECK: @test
+; R600-CHECK: {{^}}test:
 ; R600-CHECK: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
 
 define void @test(<4 x float> inreg %reg0) #0 {

diff --git a/test/CodeGen/R600/r600-export-fix.ll b/test/CodeGen/R600/r600-export-fix.ll
index 73bc063..7d72856 100644
--- a/test/CodeGen/R600/r600-export-fix.ll
+++ b/test/CodeGen/R600/r600-export-fix.ll

@@ -3,9 +3,9 @@
 ;CHECK:	EXPORT T{{[0-9]}}.XYZW
 ;CHECK:	EXPORT T{{[0-9]}}.0000
 ;CHECK: EXPORT T{{[0-9]}}.0000
-;CHECK: EXPORT T{{[0-9]}}.0XZW
+;CHECK: EXPORT T{{[0-9]}}.0XYZ
 ;CHECK: EXPORT T{{[0-9]}}.XYZW
-;CHECK: EXPORT T{{[0-9]}}.YX00
+;CHECK: EXPORT T{{[0-9]}}.YZ00
 ;CHECK: EXPORT T{{[0-9]}}.0000
 ;CHECK: EXPORT T{{[0-9]}}.0000
 

diff --git a/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll
index c89398f..f388f8f 100644
--- a/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll
+++ b/test/CodeGen/R600/r600-infinite-loop-bug-while-reorganizing-vector.ll

@@ -1,5 +1,4 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman
-;REQUIRES: asserts
 
 define void @main(<4 x float> inreg, <4 x float> inreg) #0 {
 main_body:

diff --git a/test/CodeGen/R600/r600cfg.ll b/test/CodeGen/R600/r600cfg.ll
index 6dee3ef..dddc9de 100644
--- a/test/CodeGen/R600/r600cfg.ll
+++ b/test/CodeGen/R600/r600cfg.ll

@@ -1,5 +1,4 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood
-;REQUIRES: asserts
 
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:

diff --git a/test/CodeGen/R600/register-count-comments.ll b/test/CodeGen/R600/register-count-comments.ll
index 329077c..61d1b5e 100644
--- a/test/CodeGen/R600/register-count-comments.ll
+++ b/test/CodeGen/R600/register-count-comments.ll

@@ -1,8 +1,8 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -asm-verbose < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() nounwind readnone
 
-; SI-LABEL: @foo:
+; SI-LABEL: {{^}}foo:
 ; SI: .section	.AMDGPU.csdata
 ; SI: ; Kernel info:
 ; SI: ; NumSgprs: {{[0-9]+}}
@@ -18,3 +18,10 @@
   store i32 %result, i32 addrspace(1)* %outptr, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}one_vgpr_used:
+; SI: NumVgprs: 1
+define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
+  store i32 %x, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/reorder-stores.ll b/test/CodeGen/R600/reorder-stores.ll
index be2fcc6..30c0171 100644
--- a/test/CodeGen/R600/reorder-stores.ll
+++ b/test/CodeGen/R600/reorder-stores.ll

@@ -1,15 +1,15 @@
 ; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @no_reorder_v2f64_global_load_store
-; SI: BUFFER_LOAD_DWORDX2
-; SI: BUFFER_LOAD_DWORDX2
-; SI: BUFFER_LOAD_DWORDX2
-; SI: BUFFER_LOAD_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+; SI: buffer_load_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <2 x double> addrspace(1)* %x, align 16
   %tmp4 = load <2 x double> addrspace(1)* %y, align 16
@@ -18,12 +18,12 @@
   ret void
 }
 
-; SI-LABEL: @no_reorder_scalarized_v2f64_local_load_store
-; SI: DS_READ_B64
-; SI: DS_READ_B64
-; SI: DS_WRITE_B64
-; SI: DS_WRITE_B64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store:
+; SI: ds_read_b64
+; SI: ds_read_b64
+; SI: ds_write_b64
+; SI: ds_write_b64
+; SI: s_endpgm
 define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x double> addrspace(3)* %x, align 16
   %tmp4 = load <2 x double> addrspace(3)* %y, align 16
@@ -32,48 +32,48 @@
   ret void
 }
 
-; SI-LABEL: @no_reorder_split_v8i32_global_load_store
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
+; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
-; SI: BUFFER_LOAD_DWORD
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+; SI: buffer_load_dword
 
 
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
 
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
 
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
 
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <8 x i32> addrspace(1)* %x, align 32
   %tmp4 = load <8 x i32> addrspace(1)* %y, align 32
@@ -82,13 +82,13 @@
   ret void
 }
 
-; SI-LABEL: @no_reorder_extload_64
-; SI: DS_READ_B64
-; SI: DS_READ_B64
-; SI: DS_WRITE_B64
-; SI-NOT: DS_READ
-; SI: DS_WRITE_B64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}no_reorder_extload_64:
+; SI: ds_read_b64
+; SI: ds_read_b64
+; SI: ds_write_b64
+; SI-NOT: ds_read
+; SI: ds_write_b64
+; SI: s_endpgm
 define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x i32> addrspace(3)* %x, align 8
   %tmp4 = load <2 x i32> addrspace(3)* %y, align 8

diff --git a/test/CodeGen/R600/rotl.i64.ll b/test/CodeGen/R600/rotl.i64.ll
index bda0b66..84a35b6 100644
--- a/test/CodeGen/R600/rotl.i64.ll
+++ b/test/CodeGen/R600/rotl.i64.ll

@@ -1,10 +1,11 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @s_rotl_i64:
-; SI: S_LSHL_B64
-; SI: S_SUB_I32
-; SI: S_LSHR_B64
-; SI: S_OR_B64
+; FUNC-LABEL: {{^}}s_rotl_i64:
+; SI-DAG: s_lshl_b64
+; SI-DAG: s_sub_i32
+; SI-DAG: s_lshr_b64
+; SI: s_or_b64
+; SI: s_endpgm
 define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %0 = shl i64 %x, %y
@@ -15,12 +16,13 @@
   ret void
 }
 
-; FUNC-LABEL: @v_rotl_i64:
-; SI: V_LSHL_B64
-; SI: V_SUB_I32
-; SI: V_LSHR_B64
-; SI: V_OR_B32
-; SI: V_OR_B32
+; FUNC-LABEL: {{^}}v_rotl_i64:
+; SI-DAG: v_lshl_b64
+; SI-DAG: v_sub_i32
+; SI: v_lshr_b64
+; SI: v_or_b32
+; SI: v_or_b32
+; SI: s_endpgm
 define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64 addrspace(1)* %xptr, align 8

diff --git a/test/CodeGen/R600/rotl.ll b/test/CodeGen/R600/rotl.ll
index 83f657f..6c8e503 100644
--- a/test/CodeGen/R600/rotl.ll
+++ b/test/CodeGen/R600/rotl.ll

@@ -1,14 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @rotl_i32:
+; FUNC-LABEL: {{^}}rotl_i32:
 ; R600: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
 ; R600-NEXT: 32
 ; R600: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
 
-; SI: S_SUB_I32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
-; SI: V_MOV_B32_e32 [[VDST:v[0-9]+]], [[SDST]]
-; SI: V_ALIGNBIT_B32 {{v[0-9]+, [s][0-9]+, v[0-9]+}}, [[VDST]]
+; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
+; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]]
+; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]]
 define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %0 = shl i32 %x, %y
@@ -19,11 +19,12 @@
   ret void
 }
 
-; FUNC-LABEL: @rotl_v2i32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
+; FUNC-LABEL: {{^}}rotl_v2i32:
+; SI-DAG: s_sub_i32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: v_alignbit_b32
+; SI: s_endpgm
 define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %0 = shl <2 x i32> %x, %y
@@ -34,15 +35,16 @@
   ret void
 }
 
-; FUNC-LABEL: @rotl_v4i32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
-; SI: S_SUB_I32
-; SI: V_ALIGNBIT_B32
+; FUNC-LABEL: {{^}}rotl_v4i32:
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI-DAG: s_sub_i32
+; SI-DAG: v_alignbit_b32
+; SI: s_endpgm
 define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %0 = shl <4 x i32> %x, %y

diff --git a/test/CodeGen/R600/rotr.i64.ll b/test/CodeGen/R600/rotr.i64.ll
index c264751..9e14570 100644
--- a/test/CodeGen/R600/rotr.i64.ll
+++ b/test/CodeGen/R600/rotr.i64.ll

@@ -1,10 +1,10 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @s_rotr_i64
-; SI: S_LSHR_B64
-; SI: S_SUB_I32
-; SI: S_LSHL_B64
-; SI: S_OR_B64
+; FUNC-LABEL: {{^}}s_rotr_i64:
+; SI-DAG: s_sub_i32
+; SI-DAG: s_lshr_b64
+; SI-DAG: s_lshl_b64
+; SI: s_or_b64
 define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %tmp0 = sub i64 64, %y
@@ -15,12 +15,12 @@
   ret void
 }
 
-; FUNC-LABEL: @v_rotr_i64
-; SI: V_LSHR_B64
-; SI: V_SUB_I32
-; SI: V_LSHL_B64
-; SI: V_OR_B32
-; SI: V_OR_B32
+; FUNC-LABEL: {{^}}v_rotr_i64:
+; SI-DAG: v_sub_i32
+; SI-DAG: v_lshr_b64
+; SI-DAG: v_lshl_b64
+; SI: v_or_b32
+; SI: v_or_b32
 define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64 addrspace(1)* %xptr, align 8
@@ -33,7 +33,7 @@
   ret void
 }
 
-; FUNC-LABEL: @s_rotr_v2i64
+; FUNC-LABEL: {{^}}s_rotr_v2i64:
 define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
 entry:
   %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
@@ -44,7 +44,7 @@
   ret void
 }
 
-; FUNC-LABEL: @v_rotr_v2i64
+; FUNC-LABEL: {{^}}v_rotr_v2i64:
 define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
 entry:
   %x = load <2 x i64> addrspace(1)* %xptr, align 8

diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
index a5a4da4..a1add11 100644
--- a/test/CodeGen/R600/rotr.ll
+++ b/test/CodeGen/R600/rotr.ll

@@ -1,10 +1,10 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @rotr_i32:
+; FUNC-LABEL: {{^}}rotr_i32:
 ; R600: BIT_ALIGN_INT
 
-; SI: V_ALIGNBIT_B32
+; SI: v_alignbit_b32
 define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %tmp0 = sub i32 32, %y
@@ -15,12 +15,12 @@
   ret void
 }
 
-; FUNC-LABEL: @rotr_v2i32:
+; FUNC-LABEL: {{^}}rotr_v2i32:
 ; R600: BIT_ALIGN_INT
 ; R600: BIT_ALIGN_INT
 
-; SI: V_ALIGNBIT_B32
-; SI: V_ALIGNBIT_B32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
 define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
@@ -31,16 +31,16 @@
   ret void
 }
 
-; FUNC-LABEL: @rotr_v4i32:
+; FUNC-LABEL: {{^}}rotr_v4i32:
 ; R600: BIT_ALIGN_INT
 ; R600: BIT_ALIGN_INT
 ; R600: BIT_ALIGN_INT
 ; R600: BIT_ALIGN_INT
 
-; SI: V_ALIGNBIT_B32
-; SI: V_ALIGNBIT_B32
-; SI: V_ALIGNBIT_B32
-; SI: V_ALIGNBIT_B32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
+; SI: v_alignbit_b32
 define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y

diff --git a/test/CodeGen/R600/rsq.ll b/test/CodeGen/R600/rsq.ll
index 87c0570..d792c9f 100644
--- a/test/CodeGen/R600/rsq.ll
+++ b/test/CodeGen/R600/rsq.ll

@@ -1,11 +1,12 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
 
 declare float @llvm.sqrt.f32(float) nounwind readnone
 declare double @llvm.sqrt.f64(double) nounwind readnone
 
-; SI-LABEL: @rsq_f32
-; SI: V_RSQ_F32_e32
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}rsq_f32:
+; SI: v_rsq_f32_e32
+; SI: s_endpgm
 define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float addrspace(1)* %in, align 4
   %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
@@ -14,9 +15,10 @@
   ret void
 }
 
-; SI-LABEL: @rsq_f64
-; SI: V_RSQ_F64_e32
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}rsq_f64:
+; SI-UNSAFE: v_rsq_f64_e32
+; SI-SAFE: v_sqrt_f64_e32
+; SI: s_endpgm
 define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
   %val = load double addrspace(1)* %in, align 4
   %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone
@@ -24,3 +26,13 @@
   store double %div, double addrspace(1)* %out, align 4
   ret void
 }
+
+; SI-LABEL: {{^}}rsq_f32_sgpr:
+; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
+; SI: s_endpgm
+define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
+  %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
+  %div = fdiv float 1.0, %sqrt
+  store float %div, float addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/s_movk_i32.ll b/test/CodeGen/R600/s_movk_i32.ll
new file mode 100644
index 0000000..71f9a41
--- /dev/null
+++ b/test/CodeGen/R600/s_movk_i32.ll

@@ -0,0 +1,184 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_movk_i32_k0:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k1:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k2:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x7fff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 64{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k3:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k4:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0x20000{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 1{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k5:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0xffef{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0xff00ffff{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k6:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x41{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 63{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k7:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x2000{{$}}
+; SI-DAG: s_movk_i32 [[HI_S_IMM:s[0-9]+]], 0x4000{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32)
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+
+; SI-LABEL: {{^}}s_movk_i32_k8:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8000{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k9:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8001{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k10:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8888{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k11:
+; SI-DAG: s_movk_i32 [[LO_S_IMM:s[0-9]+]], 0x8fff{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_movk_i32_k12:
+; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff7001{{$}}
+; SI-DAG: s_mov_b32 [[HI_S_IMM:s[0-9]+]], 0x11111111{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
+; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
+; SI: s_endpgm
+define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+  %loada = load i64 addrspace(1)* %a, align 4
+  %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
+  store i64 %or, i64 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/saddo.ll b/test/CodeGen/R600/saddo.ll
index c80480e..654967c 100644
--- a/test/CodeGen/R600/saddo.ll
+++ b/test/CodeGen/R600/saddo.ll

@@ -4,7 +4,7 @@
 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
 
-; FUNC-LABEL: @saddo_i64_zext
+; FUNC-LABEL: {{^}}saddo_i64_zext:
 define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
@@ -15,7 +15,7 @@
   ret void
 }
 
-; FUNC-LABEL: @s_saddo_i32
+; FUNC-LABEL: {{^}}s_saddo_i32:
 define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %sadd, 0
@@ -25,7 +25,7 @@
   ret void
 }
 
-; FUNC-LABEL: @v_saddo_i32
+; FUNC-LABEL: {{^}}v_saddo_i32:
 define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -37,7 +37,7 @@
   ret void
 }
 
-; FUNC-LABEL: @s_saddo_i64
+; FUNC-LABEL: {{^}}s_saddo_i64:
 define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
@@ -47,9 +47,9 @@
   ret void
 }
 
-; FUNC-LABEL: @v_saddo_i64
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; FUNC-LABEL: {{^}}v_saddo_i64:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64 addrspace(1)* %aptr, align 4
   %b = load i64 addrspace(1)* %bptr, align 4

diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll
index e7719b6..23af3e4 100644
--- a/test/CodeGen/R600/salu-to-valu.ll
+++ b/test/CodeGen/R600/salu-to-valu.ll

@@ -7,15 +7,15 @@
 ; sgpr register pair and use that for the pointer operand
 ; (low 64-bits of srsrc).
 
-; CHECK-LABEL: @mubuf
+; CHECK-LABEL: {{^}}mubuf:
 
-; Make sure we aren't using VGPRs for the source operand of S_MOV_B64
-; CHECK-NOT: S_MOV_B64 s[{{[0-9]+:[0-9]+}}], v
+; Make sure we aren't using VGPRs for the source operand of s_mov_b64
+; CHECK-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], v
 
 ; Make sure we aren't using VGPR's for the srsrc operand of BUFFER_LOAD_*
 ; instructions
-; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
-; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
+; CHECK: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
 define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #1
@@ -49,9 +49,9 @@
 
 ; Test moving an SMRD instruction to the VALU
 
-; CHECK-LABEL: @smrd_valu
-; CHECK: BUFFER_LOAD_DWORD [[OUT:v[0-9]+]]
-; CHECK: BUFFER_STORE_DWORD [[OUT]]
+; CHECK-LABEL: {{^}}smrd_valu:
+; CHECK: buffer_load_dword [[OUT:v[0-9]+]]
+; CHECK: buffer_store_dword [[OUT]]
 
 define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 addrspace(1)* %out) {
 entry:
@@ -77,8 +77,8 @@
 
 ; Test moving ann SMRD with an immediate offset to the VALU
 
-; CHECK-LABEL: @smrd_valu2
-; CHECK: BUFFER_LOAD_DWORD
+; CHECK-LABEL: {{^}}smrd_valu2:
+; CHECK: buffer_load_dword
 define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -88,3 +88,31 @@
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
+
+; CHECK-LABEL: {{^}}s_load_imm_v8i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+  %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+  %tmp1 = getelementptr inbounds i32 addrspace(2)* %in, i32 %tmp0
+  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <8 x i32> addrspace(2)*
+  %tmp3 = load <8 x i32> addrspace(2)* %tmp2, align 4
+  store <8 x i32> %tmp3, <8 x i32> addrspace(1)* %out, align 32
+  ret void
+}
+
+; CHECK-LABEL: {{^}}s_load_imm_v16i32:
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+; CHECK: buffer_load_dwordx4
+define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) {
+entry:
+  %tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
+  %tmp1 = getelementptr inbounds i32 addrspace(2)* %in, i32 %tmp0
+  %tmp2 = bitcast i32 addrspace(2)* %tmp1 to <16 x i32> addrspace(2)*
+  %tmp3 = load <16 x i32> addrspace(2)* %tmp2, align 4
+  store <16 x i32> %tmp3, <16 x i32> addrspace(1)* %out, align 32
+  ret void
+}

diff --git a/test/CodeGen/R600/scalar_to_vector.ll b/test/CodeGen/R600/scalar_to_vector.ll
index bcccb06..dc9ebe0 100644
--- a/test/CodeGen/R600/scalar_to_vector.ll
+++ b/test/CodeGen/R600/scalar_to_vector.ll

@@ -1,14 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 
-; FUNC-LABEL: @scalar_to_vector_v2i32
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_LSHRREV_B32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: s_endpgm
 define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tmp1 = load i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %tmp1 to <2 x i16>
@@ -17,14 +17,14 @@
   ret void
 }
 
-; FUNC-LABEL: @scalar_to_vector_v2f32
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_LSHRREV_B32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: BUFFER_STORE_SHORT [[RESULT]]
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}scalar_to_vector_v2f32:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: buffer_store_short [[RESULT]]
+; SI: s_endpgm
 define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tmp1 = load float addrspace(1)* %in, align 4
   %bc = bitcast float %tmp1 to <2 x i16>

diff --git a/test/CodeGen/R600/schedule-global-loads.ll b/test/CodeGen/R600/schedule-global-loads.ll
new file mode 100644
index 0000000..5422ca7
--- /dev/null
+++ b/test/CodeGen/R600/schedule-global-loads.ll

@@ -0,0 +1,41 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FIXME: This currently doesn't do a great job of clustering the
+; loads, which end up with extra moves between them. Right now, it
+; seems the only thing areLoadsFromSameBasePtr is accomplishing is
+; ordering the loads so that the lower address loads come first.
+
+; FUNC-LABEL: {{^}}cluster_global_arg_loads:
+; SI-DAG: buffer_load_dword [[REG0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:0x4
+; SI: buffer_store_dword [[REG0]]
+; SI: buffer_store_dword [[REG1]]
+define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+  %load0 = load i32 addrspace(1)* %ptr, align 4
+  %gep = getelementptr i32 addrspace(1)* %ptr, i32 1
+  %load1 = load i32 addrspace(1)* %gep, align 4
+  store i32 %load0, i32 addrspace(1)* %out0, align 4
+  store i32 %load1, i32 addrspace(1)* %out1, align 4
+  ret void
+}
+
+; Test for a crash in SIInstrInfo::areLoadsFromSameBasePtr() when checking
+; an MUBUF load which does not have a vaddr operand.
+; FUNC-LABEL: {{^}}same_base_ptr_crash:
+; SI: buffer_load_dword
+; SI: buffer_load_dword
+define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+entry:
+  %out1 = getelementptr i32 addrspace(1)* %out, i32 %offset
+  %tmp0 = load i32 addrspace(1)* %out
+  %tmp1 = load i32 addrspace(1)* %out1
+  %tmp2 = add i32 %tmp0, %tmp1
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/schedule-kernel-arg-loads.ll b/test/CodeGen/R600/schedule-kernel-arg-loads.ll
new file mode 100644
index 0000000..e774157
--- /dev/null
+++ b/test/CodeGen/R600/schedule-kernel-arg-loads.ll

@@ -0,0 +1,12 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
+
+; FUNC-LABEL: {{^}}cluster_arg_loads:
+; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
+; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
+define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
+  store i32 %x, i32 addrspace(1)* %out0, align 4
+  store i32 %y, i32 addrspace(1)* %out1, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
index 3d2142d..baac5b5 100644
--- a/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop-failure.ll

@@ -5,7 +5,7 @@
 declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
 
 
-; SI-LABEL: @main(
+; SI-LABEL: {{^}}main(
 define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
   %0 = extractelement <4 x float> %reg1, i32 0

diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll
index e922d5c..16853e0 100644
--- a/test/CodeGen/R600/sdiv.ll
+++ b/test/CodeGen/R600/sdiv.ll

@@ -10,7 +10,7 @@
 ; This was fixed by adding an additional pattern in R600Instructions.td to
 ; match this pattern with a CNDGE_INT.
 
-; FUNC-LABEL: @sdiv_i32
+; FUNC-LABEL: {{^}}sdiv_i32:
 ; EG: CF_END
 define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
@@ -21,7 +21,7 @@
   ret void
 }
 
-; FUNC-LABEL: @sdiv_i32_4
+; FUNC-LABEL: {{^}}sdiv_i32_4:
 define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32 addrspace(1) * %in
   %result = sdiv i32 %num, 4
@@ -32,16 +32,16 @@
 ; Multiply by a weird constant to make sure setIntDivIsCheap is
 ; working.
 
-; FUNC-LABEL: @slow_sdiv_i32_3435
-; SI: BUFFER_LOAD_DWORD [[VAL:v[0-9]+]],
-; SI: V_MOV_B32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
-; SI: V_MUL_HI_I32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
-; SI: V_ADD_I32
-; SI: V_LSHRREV_B32
-; SI: V_ASHRREV_I32
-; SI: V_ADD_I32
-; SI: BUFFER_STORE_DWORD
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
+; SI: buffer_load_dword [[VAL:v[0-9]+]],
+; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
+; SI: v_add_i32
+; SI: v_lshrrev_b32
+; SI: v_ashrrev_i32
+; SI: v_add_i32
+; SI: buffer_store_dword
+; SI: s_endpgm
 define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32 addrspace(1) * %in
   %result = sdiv i32 %num, 3435

diff --git a/test/CodeGen/R600/sdivrem24.ll b/test/CodeGen/R600/sdivrem24.ll
new file mode 100644
index 0000000..228cf76
--- /dev/null
+++ b/test/CodeGen/R600/sdivrem24.ll

@@ -0,0 +1,238 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}sdiv24_i8:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = sdiv i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv24_i16:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = sdiv i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = sdiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sdiv25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = ashr i32 %num.i24.0, 7
+  %den.i24 = ashr i32 %den.i24.0, 7
+  %result = sdiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i24 = ashr i32 %den.i24.0, 7
+  %result = sdiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = ashr i32 %num.i24.0, 7
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = sdiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem24_i8:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = srem i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem24_i16:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = srem i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem24_i32:
+; SI: v_cvt_f32_i32
+; SI: v_cvt_f32_i32
+; SI: v_rcp_f32
+; SI: v_cvt_i32_f32
+
+; EG: INT_TO_FLT
+; EG-DAG: INT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_INT
+define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = srem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}srem25_i32:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = ashr i32 %num.i24.0, 7
+  %den.i24 = ashr i32 %den.i24.0, 7
+  %result = srem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_srem24_i32_1:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_srem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = ashr i32 %num.i24.0, 8
+  %den.i24 = ashr i32 %den.i24.0, 7
+  %result = srem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_srem24_i32_2:
+; SI-NOT: v_cvt_f32_i32
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: INT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_srem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = ashr i32 %num.i24.0, 7
+  %den.i24 = ashr i32 %den.i24.0, 8
+  %result = srem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/select-i1.ll b/test/CodeGen/R600/select-i1.ll
new file mode 100644
index 0000000..2e2d0e4
--- /dev/null
+++ b/test/CodeGen/R600/select-i1.ll

@@ -0,0 +1,14 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI
+
+; FUNC-LABEL: {{^}}select_i1:
+; SI: v_cndmask_b32
+; SI-NOT: v_cndmask_b32
+define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %sel = select i1 %cmp, i1 %a, i1 %b
+  store i1 %sel, i1 addrspace(1)* %out, align 4
+  ret void
+}
+

diff --git a/test/CodeGen/R600/select-vectors.ll b/test/CodeGen/R600/select-vectors.ll
index 94605fe..7d8df2e 100644
--- a/test/CodeGen/R600/select-vectors.ll
+++ b/test/CodeGen/R600/select-vectors.ll

@@ -4,11 +4,11 @@
 ; Evergreen not enabled since it seems to be having problems with doubles.
 
 
-; FUNC-LABEL: @select_v4i8
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v4i8:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
   %cmp = icmp eq i8 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
@@ -16,11 +16,11 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v4i16
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v4i16:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
@@ -28,10 +28,10 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v2i32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}select_v2i32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: buffer_store_dwordx2
 define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
@@ -39,12 +39,12 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v4i32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: BUFFER_STORE_DWORDX4
+; FUNC-LABEL: {{^}}select_v4i32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: buffer_store_dwordx4
 define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
@@ -52,15 +52,15 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v8i32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v8i32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
@@ -68,8 +68,8 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v2f32
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}select_v2f32:
+; SI: buffer_store_dwordx2
 define void @select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
@@ -77,8 +77,8 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v4f32
-; SI: BUFFER_STORE_DWORDX4
+; FUNC-LABEL: {{^}}select_v4f32:
+; SI: buffer_store_dwordx4
 define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
@@ -86,15 +86,15 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v8f32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v8f32:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
@@ -102,11 +102,11 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v2f64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v2f64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
@@ -114,15 +114,15 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v4f64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v4f64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
@@ -130,23 +130,23 @@
   ret void
 }
 
-; FUNC-LABEL: @select_v8f64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
+; FUNC-LABEL: {{^}}select_v8f64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
 define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x double> %a, <8 x double> %b

diff --git a/test/CodeGen/R600/select.ll b/test/CodeGen/R600/select.ll
index f940142..45f3cd5 100644
--- a/test/CodeGen/R600/select.ll
+++ b/test/CodeGen/R600/select.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
 
 ; Normally icmp + select is optimized to select_cc, when this happens the
 ; DAGLegalizer never sees the select and doesn't have a chance to leaglize it.
@@ -6,13 +7,13 @@
 ; In order to avoid the select_cc optimization, this test case calculates the
 ; condition for the select in a separate basic block.
 
-; CHECK-LABEL: @select
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
-; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; FUNC-LABEL: {{^}}select:
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
 define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
                      <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
                      <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,

diff --git a/test/CodeGen/R600/select64.ll b/test/CodeGen/R600/select64.ll
index 6b87d98..8de34d5 100644
--- a/test/CodeGen/R600/select64.ll
+++ b/test/CodeGen/R600/select64.ll

@@ -1,11 +1,11 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
 
-; CHECK-LABEL: @select0
+; CHECK-LABEL: {{^}}select0:
 ; i64 select should be split into two i32 selects, and we shouldn't need
 ; to use a shfit to extract the hi dword of the input.
-; CHECK-NOT: S_LSHR_B64
-; CHECK: V_CNDMASK
-; CHECK: V_CNDMASK
+; CHECK-NOT: s_lshr_b64
+; CHECK: v_cndmask
+; CHECK: v_cndmask
 define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
 entry:
   %0 = icmp ugt i32 %cond, 5
@@ -13,3 +13,38 @@
   store i64 %1, i64 addrspace(1)* %out
   ret void
 }
+
+; CHECK-LABEL: {{^}}select_trunc_i64:
+; CHECK: v_cndmask_b32
+; CHECK-NOT: v_cndmask_b32
+define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %sel = select i1 %cmp, i64 0, i64 %in
+  %trunc = trunc i64 %sel to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}select_trunc_i64_2:
+; CHECK: v_cndmask_b32
+; CHECK-NOT: v_cndmask_b32
+define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  %trunc = trunc i64 %sel to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}v_select_trunc_i64_2:
+; CHECK: v_cndmask_b32
+; CHECK-NOT: v_cndmask_b32
+define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %cmp = icmp ugt i32 %cond, 5
+  %a = load i64 addrspace(1)* %aptr, align 8
+  %b = load i64 addrspace(1)* %bptr, align 8
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  %trunc = trunc i64 %sel to i32
+  store i32 %trunc, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
index 834c030..82577bb 100644
--- a/test/CodeGen/R600/selectcc-opt.ll
+++ b/test/CodeGen/R600/selectcc-opt.ll

@@ -1,8 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
-; CHECK: @test_a
-; CHECK-NOT: CND
-; CHECK: SET{{[NEQGTL]+}}_DX10
+
+; FUNC-LABEL: {{^}}test_a:
+; EG-NOT: CND
+; EG: SET{{[NEQGTL]+}}_DX10
 
 define void @test_a(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -28,10 +30,10 @@
 ; Same as test_a, but the branch labels are swapped to produce the inverse cc
 ; for the icmp instruction
 
-; CHECK: @test_b
-; CHECK: SET{{[GTEQN]+}}_DX10
-; CHECK-NEXT: PRED_
-; CHECK-NEXT: ALU clause starting
+; EG-LABEL: {{^}}test_b:
+; EG: SET{{[GTEQN]+}}_DX10
+; EG-NEXT: PRED_
+; EG-NEXT: ALU clause starting
 define void @test_b(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 0.0
@@ -54,8 +56,8 @@
 }
 
 ; Test a CND*_INT instruction with float true/false values
-; CHECK: @test_c
-; CHECK: CND{{[GTE]+}}_INT
+; EG-LABEL: {{^}}test_c:
+; EG: CND{{[GTE]+}}_INT
 define void @test_c(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
@@ -63,3 +65,15 @@
   store float %1, float addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}selectcc_bool:
+; SI: v_cmp_ne_i32
+; SI-NEXT: v_cndmask_b32_e64
+; SI-NOT: cmp
+; SI-NOT: cndmask
+define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = select i1 %icmp0, i32 -1, i32 0
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/selectcc.ll b/test/CodeGen/R600/selectcc.ll
index a8f57cf..5a09b5c 100644
--- a/test/CodeGen/R600/selectcc.ll
+++ b/test/CodeGen/R600/selectcc.ll

@@ -1,15 +1,15 @@
 ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @selectcc_i64
+; FUNC-LABEL: {{^}}selectcc_i64:
 ; EG: XOR_INT
 ; EG: XOR_INT
 ; EG: OR_INT
 ; EG: CNDE_INT
 ; EG: CNDE_INT
-; SI: V_CMP_EQ_I64
-; SI: V_CNDMASK
-; SI: V_CNDMASK
+; SI: v_cmp_eq_i64
+; SI: v_cndmask
+; SI: v_cndmask
 define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
 entry:
   %0 = icmp eq i64 %lhs, %rhs

diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll
index 5c7d499..53694dc 100644
--- a/test/CodeGen/R600/set-dx10.ll
+++ b/test/CodeGen/R600/set-dx10.ll

@@ -4,7 +4,7 @@
 ; to store integer true (-1) and false (0) values are lowered to one of the
 ; SET*DX10 instructions.
 
-; CHECK: @fcmp_une_select_fptosi
+; CHECK: {{^}}fcmp_une_select_fptosi:
 ; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -18,7 +18,7 @@
   ret void
 }
 
-; CHECK: @fcmp_une_select_i32
+; CHECK: {{^}}fcmp_une_select_i32:
 ; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -30,7 +30,7 @@
   ret void
 }
 
-; CHECK: @fcmp_oeq_select_fptosi
+; CHECK: {{^}}fcmp_oeq_select_fptosi:
 ; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -44,7 +44,7 @@
   ret void
 }
 
-; CHECK: @fcmp_oeq_select_i32
+; CHECK: {{^}}fcmp_oeq_select_i32:
 ; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -56,7 +56,7 @@
   ret void
 }
 
-; CHECK: @fcmp_ogt_select_fptosi
+; CHECK: {{^}}fcmp_ogt_select_fptosi:
 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -70,7 +70,7 @@
   ret void
 }
 
-; CHECK: @fcmp_ogt_select_i32
+; CHECK: {{^}}fcmp_ogt_select_i32:
 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -82,7 +82,7 @@
   ret void
 }
 
-; CHECK: @fcmp_oge_select_fptosi
+; CHECK: {{^}}fcmp_oge_select_fptosi:
 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -96,7 +96,7 @@
   ret void
 }
 
-; CHECK: @fcmp_oge_select_i32
+; CHECK: {{^}}fcmp_oge_select_i32:
 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -108,7 +108,7 @@
   ret void
 }
 
-; CHECK: @fcmp_ole_select_fptosi
+; CHECK: {{^}}fcmp_ole_select_fptosi:
 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -122,7 +122,7 @@
   ret void
 }
 
-; CHECK: @fcmp_ole_select_i32
+; CHECK: {{^}}fcmp_ole_select_i32:
 ; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -134,7 +134,7 @@
   ret void
 }
 
-; CHECK: @fcmp_olt_select_fptosi
+; CHECK: {{^}}fcmp_olt_select_fptosi:
 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -148,7 +148,7 @@
   ret void
 }
 
-; CHECK: @fcmp_olt_select_i32
+; CHECK: {{^}}fcmp_olt_select_i32:
 ; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)

diff --git a/test/CodeGen/R600/setcc-equivalent.ll b/test/CodeGen/R600/setcc-equivalent.ll
index f796748..11ea793 100644
--- a/test/CodeGen/R600/setcc-equivalent.ll
+++ b/test/CodeGen/R600/setcc-equivalent.ll

@@ -1,7 +1,6 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
-; XFAIL: *
 
-; EG-LABEL: @and_setcc_setcc_i32
+; EG-LABEL: {{^}}and_setcc_setcc_i32:
 ; EG: AND_INT
 ; EG-NEXT: SETE_INT
 define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
@@ -13,7 +12,7 @@
   ret void
 }
 
-; EG-LABEL: @and_setcc_setcc_v4i32
+; EG-LABEL: {{^}}and_setcc_setcc_v4i32:
 ; EG: AND_INT
 ; EG: AND_INT
 ; EG: SETE_INT
@@ -28,4 +27,4 @@
   %ext = sext <4 x i1> %and to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out, align 4
   ret void
-}
\ No newline at end of file
+}

diff --git a/test/CodeGen/R600/setcc-opt.ll b/test/CodeGen/R600/setcc-opt.ll
new file mode 100644
index 0000000..af48df8
--- /dev/null
+++ b/test/CodeGen/R600/setcc-opt.ll

@@ -0,0 +1,15 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; SI-LABEL: {{^}}sext_bool_icmp_ne:
+; SI: v_cmp_ne_i32
+; SI-NEXT: v_cndmask_b32
+; SI-NOT: v_cmp_ne_i32
+; SI-NOT: v_cndmask_b32
+; SI: s_endpgm
+define void @sext_bool_icmp_ne(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %icmp0 = icmp ne i32 %a, %b
+  %ext = sext i1 %icmp0 to i32
+  %icmp1 = icmp ne i32 %ext, 0
+  store i1 %icmp1, i1 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 5bd95b7..8dd2ce4 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll

@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
 ;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
-; FUNC-LABEL: @setcc_v2i32
+; FUNC-LABEL: {{^}}setcc_v2i32:
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
 
@@ -12,7 +12,7 @@
   ret void
 }
 
-; FUNC-LABEL: @setcc_v4i32
+; FUNC-LABEL: {{^}}setcc_v4i32:
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -32,9 +32,9 @@
 ;; Float comparisons
 ;;;==========================================================================;;;
 
-; FUNC-LABEL: @f32_oeq
+; FUNC-LABEL: {{^}}f32_oeq:
 ; R600: SETE_DX10
-; SI: V_CMP_EQ_F32
+; SI: v_cmp_eq_f32
 define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp oeq float %a, %b
@@ -43,9 +43,9 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_ogt
+; FUNC-LABEL: {{^}}f32_ogt:
 ; R600: SETGT_DX10
-; SI: V_CMP_GT_F32
+; SI: v_cmp_gt_f32
 define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ogt float %a, %b
@@ -54,9 +54,9 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_oge
+; FUNC-LABEL: {{^}}f32_oge:
 ; R600: SETGE_DX10
-; SI: V_CMP_GE_F32
+; SI: v_cmp_ge_f32
 define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp oge float %a, %b
@@ -65,9 +65,9 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_olt
+; FUNC-LABEL: {{^}}f32_olt:
 ; R600: SETGT_DX10
-; SI: V_CMP_LT_F32
+; SI: v_cmp_lt_f32
 define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp olt float %a, %b
@@ -76,9 +76,9 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_ole
+; FUNC-LABEL: {{^}}f32_ole:
 ; R600: SETGE_DX10
-; SI: V_CMP_LE_F32
+; SI: v_cmp_le_f32
 define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ole float %a, %b
@@ -87,18 +87,18 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_one
+; FUNC-LABEL: {{^}}f32_one:
 ; R600-DAG: SETE_DX10
 ; R600-DAG: SETE_DX10
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_DX10
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_INT
-; SI: V_CMP_O_F32
-; SI: V_CMP_NEQ_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_AND_B32_e32
+; SI: v_cmp_o_f32
+; SI: v_cmp_neq_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_and_b32_e32
 define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp one float %a, %b
@@ -107,12 +107,12 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_ord
+; FUNC-LABEL: {{^}}f32_ord:
 ; R600-DAG: SETE_DX10
 ; R600-DAG: SETE_DX10
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_INT
-; SI: V_CMP_O_F32
+; SI: v_cmp_o_f32
 define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ord float %a, %b
@@ -121,18 +121,18 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_ueq
+; FUNC-LABEL: {{^}}f32_ueq:
 ; R600-DAG: SETNE_DX10
 ; R600-DAG: SETNE_DX10
 ; R600-DAG: OR_INT
 ; R600-DAG: SETE_DX10
 ; R600-DAG: OR_INT
 ; R600-DAG: SETNE_INT
-; SI: V_CMP_U_F32
-; SI: V_CMP_EQ_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_eq_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ueq float %a, %b
@@ -141,14 +141,14 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_ugt
+; FUNC-LABEL: {{^}}f32_ugt:
 ; R600: SETGE
 ; R600: SETE_DX10
-; SI: V_CMP_U_F32
-; SI: V_CMP_GT_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_gt_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ugt float %a, %b
@@ -157,14 +157,14 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_uge
+; FUNC-LABEL: {{^}}f32_uge:
 ; R600: SETGT
 ; R600: SETE_DX10
-; SI: V_CMP_U_F32
-; SI: V_CMP_GE_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_ge_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp uge float %a, %b
@@ -173,14 +173,14 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_ult
+; FUNC-LABEL: {{^}}f32_ult:
 ; R600: SETGE
 ; R600: SETE_DX10
-; SI: V_CMP_U_F32
-; SI: V_CMP_LT_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_lt_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ult float %a, %b
@@ -189,14 +189,14 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_ule
+; FUNC-LABEL: {{^}}f32_ule:
 ; R600: SETGT
 ; R600: SETE_DX10
-; SI: V_CMP_U_F32
-; SI: V_CMP_LE_F32
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; SI: v_cmp_u_f32
+; SI: v_cmp_le_f32
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp ule float %a, %b
@@ -205,9 +205,9 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_une
+; FUNC-LABEL: {{^}}f32_une:
 ; R600: SETNE_DX10
-; SI: V_CMP_NEQ_F32
+; SI: v_cmp_neq_f32
 define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp une float %a, %b
@@ -216,12 +216,12 @@
   ret void
 }
 
-; FUNC-LABEL: @f32_uno
+; FUNC-LABEL: {{^}}f32_uno:
 ; R600: SETNE_DX10
 ; R600: SETNE_DX10
 ; R600: OR_INT
 ; R600: SETNE_INT
-; SI: V_CMP_U_F32
+; SI: v_cmp_u_f32
 define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fcmp uno float %a, %b
@@ -234,9 +234,9 @@
 ;; 32-bit integer comparisons
 ;;;==========================================================================;;;
 
-; FUNC-LABEL: @i32_eq
+; FUNC-LABEL: {{^}}i32_eq:
 ; R600: SETE_INT
-; SI: V_CMP_EQ_I32
+; SI: v_cmp_eq_i32
 define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp eq i32 %a, %b
@@ -245,9 +245,9 @@
   ret void
 }
 
-; FUNC-LABEL: @i32_ne
+; FUNC-LABEL: {{^}}i32_ne:
 ; R600: SETNE_INT
-; SI: V_CMP_NE_I32
+; SI: v_cmp_ne_i32
 define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp ne i32 %a, %b
@@ -256,9 +256,9 @@
   ret void
 }
 
-; FUNC-LABEL: @i32_ugt
+; FUNC-LABEL: {{^}}i32_ugt:
 ; R600: SETGT_UINT
-; SI: V_CMP_GT_U32
+; SI: v_cmp_gt_u32
 define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp ugt i32 %a, %b
@@ -267,9 +267,9 @@
   ret void
 }
 
-; FUNC-LABEL: @i32_uge
+; FUNC-LABEL: {{^}}i32_uge:
 ; R600: SETGE_UINT
-; SI: V_CMP_GE_U32
+; SI: v_cmp_ge_u32
 define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp uge i32 %a, %b
@@ -278,9 +278,9 @@
   ret void
 }
 
-; FUNC-LABEL: @i32_ult
+; FUNC-LABEL: {{^}}i32_ult:
 ; R600: SETGT_UINT
-; SI: V_CMP_LT_U32
+; SI: v_cmp_lt_u32
 define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp ult i32 %a, %b
@@ -289,9 +289,9 @@
   ret void
 }
 
-; FUNC-LABEL: @i32_ule
+; FUNC-LABEL: {{^}}i32_ule:
 ; R600: SETGE_UINT
-; SI: V_CMP_LE_U32
+; SI: v_cmp_le_u32
 define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp ule i32 %a, %b
@@ -300,9 +300,9 @@
   ret void
 }
 
-; FUNC-LABEL: @i32_sgt
+; FUNC-LABEL: {{^}}i32_sgt:
 ; R600: SETGT_INT
-; SI: V_CMP_GT_I32
+; SI: v_cmp_gt_i32
 define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp sgt i32 %a, %b
@@ -311,9 +311,9 @@
   ret void
 }
 
-; FUNC-LABEL: @i32_sge
+; FUNC-LABEL: {{^}}i32_sge:
 ; R600: SETGE_INT
-; SI: V_CMP_GE_I32
+; SI: v_cmp_ge_i32
 define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp sge i32 %a, %b
@@ -322,9 +322,9 @@
   ret void
 }
 
-; FUNC-LABEL: @i32_slt
+; FUNC-LABEL: {{^}}i32_slt:
 ; R600: SETGT_INT
-; SI: V_CMP_LT_I32
+; SI: v_cmp_lt_i32
 define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp slt i32 %a, %b
@@ -333,9 +333,9 @@
   ret void
 }
 
-; FUNC-LABEL: @i32_sle
+; FUNC-LABEL: {{^}}i32_sle:
 ; R600: SETGE_INT
-; SI: V_CMP_LE_I32
+; SI: v_cmp_le_i32
 define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp sle i32 %a, %b

diff --git a/test/CodeGen/R600/setcc64.ll b/test/CodeGen/R600/setcc64.ll
index 54a33b3..6e43172 100644
--- a/test/CodeGen/R600/setcc64.ll
+++ b/test/CodeGen/R600/setcc64.ll

@@ -6,8 +6,8 @@
 ;; Double comparisons
 ;;;==========================================================================;;;
 
-; FUNC-LABEL: @f64_oeq
-; SI: V_CMP_EQ_F64
+; FUNC-LABEL: {{^}}f64_oeq:
+; SI: v_cmp_eq_f64
 define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp oeq double %a, %b
@@ -16,8 +16,8 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_ogt
-; SI: V_CMP_GT_F64
+; FUNC-LABEL: {{^}}f64_ogt:
+; SI: v_cmp_gt_f64
 define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ogt double %a, %b
@@ -26,8 +26,8 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_oge
-; SI: V_CMP_GE_F64
+; FUNC-LABEL: {{^}}f64_oge:
+; SI: v_cmp_ge_f64
 define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp oge double %a, %b
@@ -36,8 +36,8 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_olt
-; SI: V_CMP_LT_F64
+; FUNC-LABEL: {{^}}f64_olt:
+; SI: v_cmp_lt_f64
 define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp olt double %a, %b
@@ -46,8 +46,8 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_ole
-; SI: V_CMP_LE_F64
+; FUNC-LABEL: {{^}}f64_ole:
+; SI: v_cmp_le_f64
 define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ole double %a, %b
@@ -56,12 +56,12 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_one
-; SI: V_CMP_O_F64
-; SI: V_CMP_NEQ_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_AND_B32_e32
+; FUNC-LABEL: {{^}}f64_one:
+; SI: v_cmp_o_f64
+; SI: v_cmp_neq_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_and_b32_e32
 define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp one double %a, %b
@@ -70,8 +70,8 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_ord
-; SI: V_CMP_O_F64
+; FUNC-LABEL: {{^}}f64_ord:
+; SI: v_cmp_o_f64
 define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ord double %a, %b
@@ -80,12 +80,12 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_ueq
-; SI: V_CMP_U_F64
-; SI: V_CMP_EQ_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_ueq:
+; SI: v_cmp_u_f64
+; SI: v_cmp_eq_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ueq double %a, %b
@@ -94,12 +94,12 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_ugt
-; SI: V_CMP_U_F64
-; SI: V_CMP_GT_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_ugt:
+; SI: v_cmp_u_f64
+; SI: v_cmp_gt_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ugt double %a, %b
@@ -108,12 +108,12 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_uge
-; SI: V_CMP_U_F64
-; SI: V_CMP_GE_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_uge:
+; SI: v_cmp_u_f64
+; SI: v_cmp_ge_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp uge double %a, %b
@@ -122,12 +122,12 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_ult
-; SI: V_CMP_U_F64
-; SI: V_CMP_LT_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_ult:
+; SI: v_cmp_u_f64
+; SI: v_cmp_lt_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ult double %a, %b
@@ -136,12 +136,12 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_ule
-; SI: V_CMP_U_F64
-; SI: V_CMP_LE_F64
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: V_OR_B32_e32
+; FUNC-LABEL: {{^}}f64_ule:
+; SI: v_cmp_u_f64
+; SI: v_cmp_le_f64
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: v_or_b32_e32
 define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp ule double %a, %b
@@ -150,8 +150,8 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_une
-; SI: V_CMP_NEQ_F64
+; FUNC-LABEL: {{^}}f64_une:
+; SI: v_cmp_neq_f64
 define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp une double %a, %b
@@ -160,8 +160,8 @@
   ret void
 }
 
-; FUNC-LABEL: @f64_uno
-; SI: V_CMP_U_F64
+; FUNC-LABEL: {{^}}f64_uno:
+; SI: v_cmp_u_f64
 define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) {
 entry:
   %0 = fcmp uno double %a, %b
@@ -174,8 +174,8 @@
 ;; 64-bit integer comparisons
 ;;;==========================================================================;;;
 
-; FUNC-LABEL: @i64_eq
-; SI: V_CMP_EQ_I64
+; FUNC-LABEL: {{^}}i64_eq:
+; SI: v_cmp_eq_i64
 define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp eq i64 %a, %b
@@ -184,8 +184,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i64_ne
-; SI: V_CMP_NE_I64
+; FUNC-LABEL: {{^}}i64_ne:
+; SI: v_cmp_ne_i64
 define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp ne i64 %a, %b
@@ -194,8 +194,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i64_ugt
-; SI: V_CMP_GT_U64
+; FUNC-LABEL: {{^}}i64_ugt:
+; SI: v_cmp_gt_u64
 define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp ugt i64 %a, %b
@@ -204,8 +204,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i64_uge
-; SI: V_CMP_GE_U64
+; FUNC-LABEL: {{^}}i64_uge:
+; SI: v_cmp_ge_u64
 define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp uge i64 %a, %b
@@ -214,8 +214,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i64_ult
-; SI: V_CMP_LT_U64
+; FUNC-LABEL: {{^}}i64_ult:
+; SI: v_cmp_lt_u64
 define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp ult i64 %a, %b
@@ -224,8 +224,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i64_ule
-; SI: V_CMP_LE_U64
+; FUNC-LABEL: {{^}}i64_ule:
+; SI: v_cmp_le_u64
 define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp ule i64 %a, %b
@@ -234,8 +234,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i64_sgt
-; SI: V_CMP_GT_I64
+; FUNC-LABEL: {{^}}i64_sgt:
+; SI: v_cmp_gt_i64
 define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp sgt i64 %a, %b
@@ -244,8 +244,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i64_sge
-; SI: V_CMP_GE_I64
+; FUNC-LABEL: {{^}}i64_sge:
+; SI: v_cmp_ge_i64
 define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp sge i64 %a, %b
@@ -254,8 +254,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i64_slt
-; SI: V_CMP_LT_I64
+; FUNC-LABEL: {{^}}i64_slt:
+; SI: v_cmp_lt_i64
 define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp slt i64 %a, %b
@@ -264,8 +264,8 @@
   ret void
 }
 
-; FUNC-LABEL: @i64_sle
-; SI: V_CMP_LE_I64
+; FUNC-LABEL: {{^}}i64_sle:
+; SI: v_cmp_le_i64
 define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = icmp sle i64 %a, %b

diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll
index e90e788..5fe6ff6 100644
--- a/test/CodeGen/R600/seto.ll
+++ b/test/CodeGen/R600/seto.ll

@@ -1,8 +1,8 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
 
-;CHECK-LABEL: @main
-;CHECK: V_CMP_O_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0
-
+; CHECK-LABEL: {{^}}main:
+; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
+; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
 define void @main(float %p) {
 main_body:
   %c = fcmp oeq float %p, %p

diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll
index 3b1db8b..a391177 100644
--- a/test/CodeGen/R600/setuo.ll
+++ b/test/CodeGen/R600/setuo.ll

@@ -1,8 +1,8 @@
-;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
 
-;CHECK-LABEL: @main
-;CHECK: V_CMP_U_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0
-
+; CHECK-LABEL: {{^}}main:
+; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
+; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
 define void @main(float %p) {
 main_body:
   %c = fcmp une float %p, %p

diff --git a/test/CodeGen/R600/sext-eliminate.ll b/test/CodeGen/R600/sext-eliminate.ll
new file mode 100644
index 0000000..7dc6eb8
--- /dev/null
+++ b/test/CodeGen/R600/sext-eliminate.ll

@@ -0,0 +1,26 @@
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_add:
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+\.[XYZW]]]
+; EG: SUB_INT {{[* ]*}}[[RES]]
+; EG-NOT: BFE
+define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+  %sext = sext i1 %a to i32 ; i1 widened to i32, so the value is 0 or -1
+  %res = add i32 %b, %sext ; the checks above expect a SUB_INT writing the stored register and no BFE
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_i32_sub:
+
+; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+\.[XYZW]]]
+; EG: ADD_INT {{[* ]*}}[[RES]]
+; EG-NOT: BFE
+define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+  %sext = sext i1 %a to i32 ; i1 widened to i32, so the value is 0 or -1
+  %res = sub i32 %b, %sext ; the checks above expect an ADD_INT writing the stored register and no BFE
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
index 1b02e4b..d364e6b 100644
--- a/test/CodeGen/R600/sext-in-reg.ll
+++ b/test/CodeGen/R600/sext-in-reg.ll

@@ -2,13 +2,14 @@
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
 
-; FUNC-LABEL: @sext_in_reg_i1_i32
-; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
-; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
-; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
+; SI: s_load_dword [[ARG:s[0-9]+]],
+; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
+; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
+; SI: buffer_store_dword [[EXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1
@@ -20,11 +21,11 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i8_to_i32
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
-; SI: BUFFER_STORE_DWORD [[VEXTRACT]],
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32:
+; SI: s_add_i32 [[VAL:s[0-9]+]],
+; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: buffer_store_dword [[VEXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: ADD_INT
@@ -38,11 +39,11 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i16_to_i32
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I16 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
-; SI: BUFFER_STORE_DWORD [[VEXTRACT]],
+; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32:
+; SI: s_add_i32 [[VAL:s[0-9]+]],
+; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: buffer_store_dword [[VEXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: ADD_INT
@@ -56,11 +57,11 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i8_to_v1i32
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: V_MOV_B32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
-; SI: BUFFER_STORE_DWORD [[VEXTRACT]],
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32:
+; SI: s_add_i32 [[VAL:s[0-9]+]],
+; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
+; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
+; SI: buffer_store_dword [[VEXTRACT]],
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: ADD_INT
@@ -74,29 +75,31 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i1_to_i64
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_BFE_I32 s{{[0-9]+}}, s{{[0-9]+}}, 0x10000
-; SI: S_MOV_B32 {{s[0-9]+}}, -1
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 63
   %ashr = ashr i64 %shl, 63
   store i64 %ashr, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i8_to_i64
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I8 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: S_MOV_B32 {{s[0-9]+}}, -1
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: ADD_INT
-; EG-NEXT: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
 ; EG: ASHR [[RES_HI]]
 ; EG-NOT: BFE_INT
 ; EG: LSHR
@@ -104,23 +107,24 @@
 ;; TODO Check address computation, using | with variables in {{}} does not work,
 ;; also the _LO/_HI order might be different
 define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 56
   %ashr = ashr i64 %shl, 56
   store i64 %ashr, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i16_to_i64
-; SI: S_ADD_I32 [[VAL:s[0-9]+]],
-; SI: S_SEXT_I32_I16 [[EXTRACT:s[0-9]+]], [[VAL]]
-; SI: S_MOV_B32 {{s[0-9]+}}, -1
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: ADD_INT
-; EG-NEXT: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
+; EG: LSHL
+; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
 ; EG: ASHR [[RES_HI]]
 ; EG-NOT: BFE_INT
 ; EG: LSHR
@@ -128,32 +132,32 @@
 ;; TODO Check address computation, using | with variables in {{}} does not work,
 ;; also the _LO/_HI order might be different
 define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 48
   %ashr = ashr i64 %shl, 48
   store i64 %ashr, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i32_to_i64
-; SI: S_LOAD_DWORD
-; SI: S_LOAD_DWORD
-; SI: S_ADD_I32 [[ADD:s[0-9]+]],
-; SI: S_ASHR_I32 s{{[0-9]+}}, [[ADD]], 31
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64:
+; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000
+; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
 ; EG-NOT: BFE_INT
-; EG: ADD_INT {{\*?}} [[RES_LO]]
+
 ; EG: ASHR [[RES_HI]]
-; EG: ADD_INT
+
 ; EG: LSHR
 ; EG: LSHR
 ;; TODO Check address computation, using | with variables in {{}} does not work,
 ;; also the _LO/_HI order might be different
 define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
-  %c = add i64 %a, %b
+  %c = shl i64 %a, %b
   %shl = shl i64 %c, 32
   %ashr = ashr i64 %shl, 32
   store i64 %ashr, i64 addrspace(1)* %out, align 8
@@ -161,10 +165,10 @@
 }
 
 ; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
-; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
-; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
-; XSI: S_ASHR_I32 {{v[0-9]+}}, [[EXTRACT]], 31
-; XSI: BUFFER_STORE_DWORD
+; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64:
+; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
+; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31
+; XSI: buffer_store_dword
 ; XEG: BFE_INT
 ; XEG: ASHR
 ; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind {
@@ -175,10 +179,93 @@
 ;   ret void
 ; }
 
-; FUNC-LABEL: @sext_in_reg_i1_in_i32_other_amount
-; SI-NOT: BFE
-; SI: S_LSHL_B32 [[REG:s[0-9]+]], {{s[0-9]+}}, 6
-; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG]], 7
+; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() ; element index used for all three buffers
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %bptr, i32 %tid ; shift amount is read from %bptr, not %aptr
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 63 ; shl then ashr by 63 sign-extends bit 0 of %c to 64 bits
+  %ashr = ashr i64 %shl, 63
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() ; element index used for all three buffers
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %bptr, i32 %tid ; shift amount is read from %bptr, not %aptr
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 56 ; shl then ashr by 56 sign-extends the low 8 bits of %c to 64 bits
+  %ashr = ashr i64 %shl, 56
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16
+; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() ; element index used for all three buffers
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %bptr, i32 %tid ; shift amount is read from %bptr, not %aptr
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 48 ; shl then ashr by 48 sign-extends the low 16 bits of %c to 64 bits
+  %ashr = ashr i64 %shl, 48
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64:
+; SI: buffer_load_dwordx2
+; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
+; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
+define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() ; element index used for all three buffers
+  %a.gep = getelementptr i64 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr i64 addrspace(1)* %bptr, i32 %tid ; shift amount is read from %bptr, not %aptr
+  %out.gep = getelementptr i64 addrspace(1)* %out, i32 %tid
+  %a = load i64 addrspace(1)* %a.gep, align 8
+  %b = load i64 addrspace(1)* %b.gep, align 8
+
+  %c = shl i64 %a, %b
+  %shl = shl i64 %c, 32 ; shl then ashr by 32 sign-extends the low 32 bits of %c to 64 bits
+  %ashr = ashr i64 %shl, 32
+  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_lshl_b32 [[REG:s[0-9]+]], {{s[0-9]+}}, 6
+; SI: s_ashr_i32 {{s[0-9]+}}, [[REG]], 7
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG-NOT: BFE
@@ -194,11 +281,12 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v2i1_in_v2i32_other_amount
-; SI: S_LSHL_B32 [[REG0:s[0-9]+]], {{s[0-9]}}, 6
-; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG0]], 7
-; SI: S_LSHL_B32 [[REG1:s[0-9]+]], {{s[0-9]}}, 6
-; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG1]], 7
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
+; SI-DAG: s_lshl_b32 [[REG0:s[0-9]+]], {{s[0-9]+}}, 6
+; SI-DAG: s_ashr_i32 {{s[0-9]+}}, [[REG0]], 7
+; SI-DAG: s_lshl_b32 [[REG1:s[0-9]+]], {{s[0-9]+}}, 6
+; SI-DAG: s_ashr_i32 {{s[0-9]+}}, [[REG1]], 7
+; SI: s_endpgm
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG-NOT: BFE
@@ -217,10 +305,10 @@
 }
 
 
-; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx2
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -234,12 +322,12 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
-; SI: BUFFER_STORE_DWORDX4
+; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32:
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: buffer_store_dwordx4
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -255,10 +343,10 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v2i8_to_v2i32
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -272,12 +360,12 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v4i8_to_v4i32
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I8 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: BUFFER_STORE_DWORDX4
+; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32:
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx4
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -293,10 +381,10 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_v2i16_to_v2i32
-; SI: S_SEXT_I32_I16 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: S_SEXT_I32_I16 {{s[0-9]+}}, {{s[0-9]+}}
-; SI: BUFFER_STORE_DWORDX2
+; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32:
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
+; SI: buffer_store_dwordx2
 
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: BFE_INT [[RES]]
@@ -310,7 +398,7 @@
   ret void
 }
 
-; FUNC-LABEL: @testcase
+; FUNC-LABEL: {{^}}testcase:
 define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
   %and_a_1 = and i8 %a, 1
   %cmp_eq = icmp eq i8 %and_a_1, 0
@@ -322,7 +410,7 @@
   ret void
 }
 
-; FUNC-LABEL: @testcase_3
+; FUNC-LABEL: {{^}}testcase_3:
 define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
   %and_a_1 = and i8 %a, 1
   %cmp_eq = icmp eq i8 %and_a_1, 0
@@ -334,11 +422,11 @@
   ret void
 }
 
-; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
 define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
   %loada = load <4 x i32> addrspace(1)* %a, align 16
   %loadb = load <4 x i32> addrspace(1)* %b, align 16
@@ -349,9 +437,9 @@
   ret void
 }
 
-; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
 define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
   %loada = load <4 x i32> addrspace(1)* %a, align 16
   %loadb = load <4 x i32> addrspace(1)* %b, align 16
@@ -365,11 +453,11 @@
 ; FIXME: The BFE should really be eliminated. I think it should happen
 ; when computeKnownBitsForTargetNode is implemented for imax.
 
-; FUNC-LABEL: @sext_in_reg_to_illegal_type
-; SI: BUFFER_LOAD_SBYTE
-; SI: V_MAX_I32
-; SI: V_BFE_I32
-; SI: BUFFER_STORE_SHORT
+; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type:
+; SI: buffer_load_sbyte
+; SI: v_max_i32
+; SI: v_bfe_i32
+; SI: buffer_store_short
 define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
   %tmp5 = load i8 addrspace(1)* %src, align 1
   %tmp2 = sext i8 %tmp5 to i32
@@ -382,9 +470,9 @@
 
 declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
 
-; FUNC-LABEL: @bfe_0_width
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_0_width:
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
   %load = load i32 addrspace(1)* %ptr, align 4
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
@@ -392,10 +480,10 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_8_bfe_8
-; SI: V_BFE_I32
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_8_bfe_8:
+; SI: v_bfe_i32
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
   %load = load i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
@@ -404,9 +492,9 @@
   ret void
 }
 
-; FUNC-LABEL: @bfe_8_bfe_16
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_8_bfe_16:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI: s_endpgm
 define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
   %load = load i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
@@ -416,10 +504,10 @@
 }
 
 ; This really should be folded into 1
-; FUNC-LABEL: @bfe_16_bfe_8
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}bfe_16_bfe_8:
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
   %load = load i32 addrspace(1)* %ptr, align 4
   %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
@@ -429,10 +517,10 @@
 }
 
 ; Make sure there isn't a redundant BFE
-; FUNC-LABEL: @sext_in_reg_i8_to_i32_bfe
-; SI: S_SEXT_I32_I8 s{{[0-9]+}}, s{{[0-9]+}}
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
+; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
@@ -442,7 +530,7 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i8_to_i32_bfe_wrong
+; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
 define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone
@@ -452,10 +540,10 @@
   ret void
 }
 
-; FUNC-LABEL: @sextload_i8_to_i32_bfe
-; SI: BUFFER_LOAD_SBYTE
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe:
+; SI: buffer_load_sbyte
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
   %load = load i8 addrspace(1)* %ptr, align 1
   %sext = sext i8 %load to i32
@@ -466,9 +554,10 @@
   ret void
 }
 
-; FUNC-LABEL: @sextload_i8_to_i32_bfe_0:
-; SI-NOT: BFE
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:
+; SI: .text
+; SI-NOT: {{[^@]}}bfe
+; SI: s_endpgm
 define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
   %load = load i8 addrspace(1)* %ptr, align 1
   %sext = sext i8 %load to i32
@@ -479,11 +568,11 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i1_bfe_offset_0:
-; SI-NOT: SHR
-; SI-NOT: SHL
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
+; SI-NOT: shr
+; SI-NOT: shl
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; SI: s_endpgm
 define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 31
@@ -493,12 +582,12 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i1_bfe_offset_1
-; SI: BUFFER_LOAD_DWORD
-; SI-NOT: SHL
-; SI-NOT: SHR
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
+; SI: buffer_load_dword
+; SI-NOT: shl
+; SI-NOT: shr
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
+; SI: s_endpgm
 define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 30
@@ -508,12 +597,12 @@
   ret void
 }
 
-; FUNC-LABEL: @sext_in_reg_i2_bfe_offset_1:
-; SI: BUFFER_LOAD_DWORD
-; SI: V_LSHLREV_B32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
-; SI: V_ASHRREV_I32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
-; SI: V_BFE_I32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
+; SI: buffer_load_dword
+; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
+; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 30, v{{[0-9]+}}
+; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
+; SI: s_endpgm
 define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %x = load i32 addrspace(1)* %in, align 4
   %shl = shl i32 %x, 30

diff --git a/test/CodeGen/R600/sgpr-control-flow.ll b/test/CodeGen/R600/sgpr-control-flow.ll
index 06ad24d..d8b8dff 100644
--- a/test/CodeGen/R600/sgpr-control-flow.ll
+++ b/test/CodeGen/R600/sgpr-control-flow.ll

@@ -4,9 +4,14 @@
 ; Most SALU instructions ignore control flow, so we need to make sure
 ; they don't overwrite values from other blocks.
 
-; SI-NOT: S_ADD
+; If the branch decision is made based on a value in an SGPR then all
+; threads will execute the same code paths, so we don't need to worry
+; about instructions in different blocks overwriting each other.
+; SI-LABEL: {{^}}sgpr_if_else_salu_br:
+; SI: s_add
+; SI: s_add
 
-define void @sgpr_if_else(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -25,3 +30,35 @@
   store i32 %4, i32 addrspace(1)* %out
   ret void
 }
+
+; The two s_add_i32s must write different SGPRs: threads may take different
+; control flow paths. The ',' after [[SGPR]] keeps s1 from matching s10..s19.
+
+; SI-LABEL: {{^}}sgpr_if_else_valu_br:
+; SI: s_add_i32 [[SGPR:s[0-9]+]]
+; SI-NOT: s_add_i32 [[SGPR]],
+
+define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %tid_f = uitofp i32 %tid to float
+  %tmp1 = fcmp ueq float %tid_f, 0.0
+  br i1 %tmp1, label %if, label %else
+
+if:
+  %tmp2 = add i32 %b, %c
+  br label %endif
+
+else:
+  %tmp3 = add i32 %d, %e
+  br label %endif
+
+endif:
+  %tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else]
+  store i32 %tmp4, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { readnone }

diff --git a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
index 9d8a623..aa97fbf 100644
--- a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll

@@ -3,8 +3,8 @@
 ; Copy VGPR -> SGPR used twice as an instruction operand, which is then
 ; used in an REG_SEQUENCE that also needs to be handled.
 
-; SI-LABEL: @test_dup_operands:
-; SI: V_ADD_I32_e32
+; SI-LABEL: {{^}}test_dup_operands:
+; SI: v_add_i32_e32
 define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
   %a = load <2 x i32> addrspace(1)* %in
   %lo = extractelement <2 x i32> %a, i32 0

diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll
index c7d5bf9..8daf753 100644
--- a/test/CodeGen/R600/sgpr-copy.ll
+++ b/test/CodeGen/R600/sgpr-copy.ll

@@ -2,9 +2,9 @@
 
 ; This test checks that no VGPR to SGPR copies are created by the register
 ; allocator.
-; CHECK-LABEL: @phi1
-; CHECK: S_BUFFER_LOAD_DWORD [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0
-; CHECK: V_MOV_B32_e32 v{{[0-9]}}, [[DST]]
+; CHECK-LABEL: {{^}}phi1:
+; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
 
 define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
@@ -29,7 +29,7 @@
 }
 
 ; Make sure this program doesn't crash
-; CHECK-LABEL: @phi2
+; CHECK-LABEL: {{^}}phi2:
 define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
@@ -149,7 +149,7 @@
 }
 
 ; We just want ot make sure the program doesn't crash
-; CHECK-LABEL: @loop
+; CHECK-LABEL: {{^}}loop:
 
 define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
@@ -227,11 +227,11 @@
 ; registers were being identified as an SGPR regclass which was causing
 ; an assertion failure.
 
-; CHECK-LABEL: @sample_v3
-; CHECK: IMAGE_SAMPLE
-; CHECK: IMAGE_SAMPLE
-; CHECK: EXP
-; CHECK: S_ENDPGM
+; CHECK-LABEL: {{^}}sample_v3:
+; CHECK: image_sample
+; CHECK: image_sample
+; CHECK: exp
+; CHECK: s_endpgm
 define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 
 entry:
@@ -269,10 +269,10 @@
 
 !2 = metadata !{metadata !"const", null, i32 1}
 
-; CHECK-LABEL: @copy1
-; CHECK: BUFFER_LOAD_DWORD
-; CHECK: V_ADD
-; CHECK: S_ENDPGM
+; CHECK-LABEL: {{^}}copy1:
+; CHECK: buffer_load_dword
+; CHECK: v_add
+; CHECK: s_endpgm
 define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
 entry:
   %0 = load float addrspace(1)* %in0
@@ -296,8 +296,8 @@
 }
 
 ; This test is just checking that we don't crash / assertion fail.
-; CHECK-LABEL: @copy2
-; CHECK: S_ENDPGM
+; CHECK-LABEL: {{^}}copy2:
+; CHECK: s_endpgm
 
 define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 entry:
@@ -325,3 +325,54 @@
 
 attributes #0 = { "ShaderType"="0" }
 
+; This test checks that image_sample resource descriptors aren't loaded into
+; vgprs.  The verifier will fail if this happens.
+; CHECK-LABEL: {{^}}sample_rsrc:
+; CHECK: image_sample
+; CHECK: image_sample
+; CHECK: s_endpgm
+define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
+bb:
+  %tmp = getelementptr [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
+  %tmp22 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16)
+  %tmp25 = getelementptr [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
+  %tmp26 = load <8 x i32> addrspace(2)* %tmp25, !tbaa !0
+  %tmp27 = getelementptr [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
+  %tmp28 = load <4 x i32> addrspace(2)* %tmp27, !tbaa !0
+  %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7)
+  %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7)
+  %tmp31 = bitcast float %tmp23 to i32
+  %tmp36 = icmp ne i32 %tmp31, 0
+  br i1 %tmp36, label %bb38, label %bb80
+
+bb38:                                             ; preds = %bb
+  %tmp52 = bitcast float %tmp29 to i32
+  %tmp53 = bitcast float %tmp30 to i32
+  %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0
+  %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1
+  %tmp56 = bitcast <8 x i32> %tmp26 to <32 x i8>
+  %tmp57 = bitcast <4 x i32> %tmp28 to <16 x i8>
+  %tmp58 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp55, <32 x i8> %tmp56, <16 x i8> %tmp57, i32 2)
+  br label %bb71
+
+bb80:                                             ; preds = %bb
+  %tmp81 = bitcast float %tmp29 to i32
+  %tmp82 = bitcast float %tmp30 to i32
+  %tmp82.2 = add i32 %tmp82, 1
+  %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0
+  %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1
+  %tmp85 = bitcast <8 x i32> %tmp26 to <32 x i8>
+  %tmp86 = bitcast <4 x i32> %tmp28 to <16 x i8>
+  %tmp87 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %tmp84, <32 x i8> %tmp85, <16 x i8> %tmp86, i32 2)
+  br label %bb71
+
+bb71:                                             ; preds = %bb80, %bb38
+  %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ]
+  %tmp88 = extractelement <4 x float> %tmp72, i32 0
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88)
+  ret void
+}
+
+attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/shared-op-cycle.ll b/test/CodeGen/R600/shared-op-cycle.ll
index 0484fc9..f52a9ba 100644
--- a/test/CodeGen/R600/shared-op-cycle.ll
+++ b/test/CodeGen/R600/shared-op-cycle.ll

@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: @main
+; CHECK: {{^}}main:
 ; CHECK: MULADD_IEEE *
 ; CHECK-NOT: MULADD_IEEE *
 

diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
index 43fab2a..71c9fc4 100644
--- a/test/CodeGen/R600/shl.ll
+++ b/test/CodeGen/R600/shl.ll

@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @shl_v2i32
+;EG-CHECK: {{^}}shl_v2i32:
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @shl_v2i32
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}shl_v2i32:
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@
   ret void
 }
 
-;EG-CHECK: @shl_v4i32
+;EG-CHECK: {{^}}shl_v4i32:
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @shl_v4i32
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}shl_v4i32:
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshl_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,7 +39,7 @@
   ret void
 }
 
-;EG-CHECK: @shl_i64
+;EG-CHECK: {{^}}shl_i64:
 ;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG-CHECK: LSHR {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPLO:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
@@ -51,8 +51,8 @@
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
 
-;SI-CHECK: @shl_i64
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}shl_i64:
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
@@ -63,7 +63,7 @@
   ret void
 }
 
-;EG-CHECK: @shl_v2i64
+;EG-CHECK: {{^}}shl_v2i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: LSHR {{\*? *}}[[COMPSHA]]
@@ -85,9 +85,9 @@
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK: @shl_v2i64
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}shl_v2i64:
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
@@ -98,7 +98,7 @@
   ret void
 }
 
-;EG-CHECK: @shl_v4i64
+;EG-CHECK: {{^}}shl_v4i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
@@ -140,11 +140,11 @@
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK: @shl_v4i64
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHL_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}shl_v4i64:
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1

diff --git a/test/CodeGen/R600/shl_add_constant.ll b/test/CodeGen/R600/shl_add_constant.ll
new file mode 100644
index 0000000..801f77d
--- /dev/null
+++ b/test/CodeGen/R600/shl_add_constant.ll

@@ -0,0 +1,90 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Test with inline immediate
+
+; SI-LABEL: {{^}}shl_2_add_9_i32:
+; SI: v_lshlrev_b32_e32  [[REG:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 36, [[REG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %ptr = getelementptr i32 addrspace(1)* %in, i32 %tid.x
+  %val = load i32 addrspace(1)* %ptr, align 4
+  %add = add i32 %val, 9
+  %result = shl i32 %add, 2
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}shl_2_add_9_i32_2_add_uses:
+; SI-DAG: v_add_i32_e32 [[ADDREG:v[0-9]+]], 9, {{v[0-9]+}}
+; SI-DAG: v_lshlrev_b32_e32 [[SHLREG:v[0-9]+]], 2, {{v[0-9]+}}
+; SI-DAG: buffer_store_dword [[ADDREG]]
+; SI-DAG: buffer_store_dword [[SHLREG]]
+; SI: s_endpgm
+define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %ptr = getelementptr i32 addrspace(1)* %in, i32 %tid.x
+  %val = load i32 addrspace(1)* %ptr, align 4
+  %add = add i32 %val, 9
+  %result = shl i32 %add, 2
+  store i32 %result, i32 addrspace(1)* %out0, align 4
+  store i32 %add, i32 addrspace(1)* %out1, align 4
+  ret void
+}
+
+; Test with add literal constant
+
+; SI-LABEL: {{^}}shl_2_add_999_i32:
+; SI: v_lshlrev_b32_e32  [[REG:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], 0xf9c, [[REG]]
+; SI: buffer_store_dword [[RESULT]]
+; SI: s_endpgm
+define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %ptr = getelementptr i32 addrspace(1)* %in, i32 %tid.x
+  %val = load i32 addrspace(1)* %ptr, align 4
+  %shl = add i32 %val, 999
+  %result = shl i32 %shl, 2
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_add_shl_add_constant:
+; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; SI: buffer_store_dword [[VRESULT]]
+define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+  %add.0 = add i32 %x, 123
+  %shl = shl i32 %add.0, 3
+  %add.1 = add i32 %shl, %y
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_add_shl_add_constant_inv:
+; SI-DAG: s_load_dword [[X:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], [[Y]]
+; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8
+; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
+; SI: buffer_store_dword [[VRESULT]]
+
+define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+  %add.0 = add i32 %x, 123
+  %shl = shl i32 %add.0, 3
+  %add.1 = add i32 %y, %shl
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/shl_add_ptr.ll b/test/CodeGen/R600/shl_add_ptr.ll
new file mode 100644
index 0000000..047cf25
--- /dev/null
+++ b/test/CodeGen/R600/shl_add_ptr.ll

@@ -0,0 +1,282 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+
+; Test that doing a shift of a pointer with a constant add will be
+; folded into the constant offset addressing mode even if the add has
+; multiple uses. This is relevant to accessing 2 separate, adjacent
+; LDS globals.
+
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+@lds0 = addrspace(3) global [512 x float] undef, align 4
+@lds1 = addrspace(3) global [512 x float] undef, align 4
+
+
+; Make sure the (add tid, 2) << 2 gets folded into the ds's offset as (tid << 2) + 8
+
+; SI-LABEL: {{^}}load_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 [M0]
+; SI: s_endpgm
+define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  store float %val0, float addrspace(1)* %out
+  ret void
+}
+
+; Make sure once the first use is folded into the addressing mode, the
+; remaining add use goes through the normal shl + add constant fold.
+
+; SI-LABEL: {{^}}load_shl_base_lds_1:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 [M0]
+; SI: v_add_i32_e32 [[ADDUSE:v[0-9]+]], 8, v{{[0-9]+}}
+; SI-DAG: buffer_store_dword [[RESULT]]
+; SI-DAG: buffer_store_dword [[ADDUSE]]
+; SI: s_endpgm
+define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %shl_add_use = shl i32 %idx.0, 2
+  store i32 %shl_add_use, i32 addrspace(1)* %add_use, align 4
+  store float %val0, float addrspace(1)* %out
+  ret void
+}
+
+@maxlds = addrspace(3) global [65536 x i8] undef, align 4
+
+; SI-LABEL: {{^}}load_shl_base_lds_max_offset:
+; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
+; SI: s_endpgm
+define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 65535
+  %arrayidx0 = getelementptr inbounds [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
+  %val0 = load i8 addrspace(3)* %arrayidx0
+  store i32 %idx.0, i32 addrspace(1)* %add_use
+  store i8 %val0, i8 addrspace(1)* %out
+  ret void
+}
+
+; The two globals are placed adjacent in memory, so the same base
+; pointer can be used with an offset into the second one.
+
+; SI-LABEL: {{^}}load_shl_base_lds_2:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 [M0]
+; SI: s_endpgm
+define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 64
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = load float addrspace(3)* %arrayidx0, align 4
+  %arrayidx1 = getelementptr inbounds [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val1 = load float addrspace(3)* %arrayidx1, align 4
+  %sum = fadd float %val0, %val1
+  store float %sum, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}store_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 [M0]
+; SI: s_endpgm
+define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  store float 1.0, float addrspace(3)* %arrayidx0, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+
+; --------------------------------------------------------------------------------
+; Atomics.
+
+@lds2 = addrspace(3) global [512 x i32] undef, align 4
+
+; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+;   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+;   %idx.0 = add nsw i32 %tid.x, 2
+;   %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+;   %val = load atomic i32 addrspace(3)* %arrayidx0 seq_cst, align 4
+;   store i32 %val, i32 addrspace(1)* %out, align 4
+;   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+;   ret void
+; }
+
+
+; SI-LABEL: {{^}}atomic_cmpxchg_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %pair = cmpxchg i32 addrspace(3)* %arrayidx0, i32 7, i32 %swap seq_cst monotonic
+  %result = extractvalue { i32, i1 } %pair, 0
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_swap_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_add_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw add i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_sub_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw sub i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_and_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw and i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_or_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw or i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_xor_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw xor i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+;   %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+;   %idx.0 = add nsw i32 %tid.x, 2
+;   %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+;   %val = atomicrmw nand i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+;   store i32 %val, i32 addrspace(1)* %out, align 4
+;   store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+;   ret void
+; }
+
+; SI-LABEL: {{^}}atomic_min_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw min i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_max_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw max i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_umin_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw umin i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}atomic_umax_shl_base_lds_0:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; SI: s_endpgm
+define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.r600.read.tidig.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw umax i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/si-annotate-cf-assertion.ll b/test/CodeGen/R600/si-annotate-cf-assertion.ll
index daa4667..6d60b0a 100644
--- a/test/CodeGen/R600/si-annotate-cf-assertion.ll
+++ b/test/CodeGen/R600/si-annotate-cf-assertion.ll

@@ -4,7 +4,7 @@
 
 
 define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
-; CHECK-LABEL: @test:
+; CHECK-LABEL: {{^}}test:
 
 entry:
   switch i32 %x, label %sw.default [

diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll
index 8d7a79c..60277d6 100644
--- a/test/CodeGen/R600/si-lod-bias.ll
+++ b/test/CodeGen/R600/si-lod-bias.ll

@@ -3,8 +3,8 @@
 ; This shader has the potential to generated illegal VGPR to SGPR copies if
 ; the wrong register class is used for the REG_SEQUENCE instructions.
 
-; CHECK: @main
-; CHECK: IMAGE_SAMPLE_B v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
+; CHECK: {{^}}main:
+; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
 
 define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:

diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll
index 53a0965..439d8e2 100644
--- a/test/CodeGen/R600/si-sgpr-spill.ll
+++ b/test/CodeGen/R600/si-sgpr-spill.ll

@@ -3,10 +3,10 @@
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
 
-; CHECK-LABEL: @main
+; CHECK-LABEL: {{^}}main:
 ; Writing to M0 from an SMRD instruction will hang the GPU.
-; CHECK-NOT: S_BUFFER_LOAD_DWORD m0
-; CHECK: S_ENDPGM
+; CHECK-NOT: s_buffer_load_dword m0
+; CHECK: s_endpgm
 @ddxy_lds = external addrspace(3) global [64 x i32]
 
 define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
@@ -688,8 +688,8 @@
 
 !0 = metadata !{metadata !"const", null, i32 1}
 
-; CHECK-LABEL: @main1
-; CHECK: S_ENDPGM
+; CHECK-LABEL: {{^}}main1:
+; CHECK: s_endpgm
 define void @main1([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0

diff --git a/test/CodeGen/R600/si-triv-disjoint-mem-access.ll b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll
new file mode 100644
index 0000000..2c146eb
--- /dev/null
+++ b/test/CodeGen/R600/si-triv-disjoint-mem-access.ll

@@ -0,0 +1,238 @@
+; RUN: llc -march=r600 -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.AMDGPU.barrier.local() #2
+
+
+@stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
+@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8
+@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
+
+; FUNC-LABEL: {{^}}reorder_local_load_global_store_local_load:
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+; CI: buffer_store_dword
+define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}no_reorder_local_load_volatile_global_store_local_load:
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI: buffer_store_dword
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  store volatile i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}no_reorder_barrier_local_load_global_store_local_load:
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI: buffer_store_dword
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
+define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+  %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  call void @llvm.AMDGPU.barrier.local() #2
+  %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Technically we could reorder these, but just comparing the
+; instruction type of the load is insufficient.
+
+; FUNC-LABEL: {{^}}no_reorder_constant_load_global_store_constant_load:
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+define void @no_reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+  %ptr0 = load i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+
+  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(1)* %gptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; XXX: Should be able to reorder this, but the loads count as ordered
+
+; FUNC-LABEL: {{^}}reorder_constant_load_local_store_constant_load:
+; CI: buffer_load_dword
+; CI: ds_write_b32
+; CI: buffer_load_dword
+; CI: buffer_store_dword
+define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
+  %ptr0 = load i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
+
+  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}reorder_smrd_load_local_store_smrd_load:
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: s_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(2)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(2)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32 addrspace(2)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}reorder_global_load_local_store_global_load:
+; CI: buffer_load_dword
+; CI: buffer_load_dword
+; CI: ds_write_b32
+; CI: buffer_store_dword
+define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(1)* %ptr0, i64 1
+  %ptr2 = getelementptr inbounds i32 addrspace(1)* %ptr0, i64 2
+
+  %tmp1 = load i32 addrspace(1)* %ptr1, align 4
+  store i32 99, i32 addrspace(3)* %lptr, align 4
+  %tmp2 = load i32 addrspace(1)* %ptr2, align 4
+
+  %add = add nsw i32 %tmp1, %tmp2
+
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}reorder_local_offsets:
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 101
+
+  store i32 123, i32 addrspace(3)* %ptr1, align 4
+  %tmp1 = load i32 addrspace(3)* %ptr2, align 4
+  %tmp2 = load i32 addrspace(3)* %ptr3, align 4
+  store i32 123, i32 addrspace(3)* %ptr2, align 4
+  %tmp3 = load i32 addrspace(3)* %ptr1, align 4
+  store i32 789, i32 addrspace(3)* %ptr3, align 4
+
+  %add.0 = add nsw i32 %tmp2, %tmp1
+  %add.1 = add nsw i32 %add.0, %tmp3
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}reorder_global_offsets:
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0xc
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x190
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x194
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x190
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:0x194
+; CI: buffer_store_dword
+; CI: s_endpgm
+define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
+  %ptr1 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 3
+  %ptr2 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 100
+  %ptr3 = getelementptr inbounds i32 addrspace(1)* %ptr0, i32 101
+
+  store i32 123, i32 addrspace(1)* %ptr1, align 4
+  %tmp1 = load i32 addrspace(1)* %ptr2, align 4
+  %tmp2 = load i32 addrspace(1)* %ptr3, align 4
+  store i32 123, i32 addrspace(1)* %ptr2, align 4
+  %tmp3 = load i32 addrspace(1)* %ptr1, align 4
+  store i32 789, i32 addrspace(1)* %ptr3, align 4
+
+  %add.0 = add nsw i32 %tmp2, %tmp1
+  %add.1 = add nsw i32 %add.0, %tmp3
+  store i32 %add.1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load
+; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
+; XCI: TBUFFER_STORE_FORMAT
+; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8
+; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 {
+;   %ptr0 = load i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
+
+;   %ptr1 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 1
+;   %ptr2 = getelementptr inbounds i32 addrspace(3)* %ptr0, i32 2
+
+;   %tmp1 = load i32 addrspace(3)* %ptr1, align 4
+
+;   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+;   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+;         i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+;         i32 1, i32 0)
+
+;   %tmp2 = load i32 addrspace(3)* %ptr2, align 4
+
+;   %add = add nsw i32 %tmp1, %tmp2
+
+;   store i32 %add, i32 addrspace(1)* %out, align 4
+;   ret void
+; }
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { nounwind noduplicate }

diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll
index 093234f..6f91c71 100644
--- a/test/CodeGen/R600/si-vector-hang.ll
+++ b/test/CodeGen/R600/si-vector-hang.ll

@@ -1,14 +1,14 @@
 ; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-; CHECK: @test_8_min_char
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
-; CHECK: BUFFER_STORE_BYTE
+; CHECK: {{^}}test_8_min_char:
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
+; CHECK: buffer_store_byte
 ; ModuleID = 'radeon'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
 target triple = "r600--"

diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll
index e3bee50..94f4c46 100644
--- a/test/CodeGen/R600/sign_extend.ll
+++ b/test/CodeGen/R600/sign_extend.ll

@@ -1,8 +1,8 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @s_sext_i1_to_i32:
-; SI: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}s_sext_i1_to_i32:
+; SI: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i32
@@ -10,10 +10,10 @@
   ret void
 }
 
-; SI-LABEL: @test:
-; SI: V_ASHR
-; SI: S_ENDPG
-define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+; SI-LABEL: {{^}}test_s_sext_i32_to_i64:
+; SI: s_ashr_i32
+; SI: s_endpgm
+define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
 entry:
   %mul = mul i32 %a, %b
   %add = add i32 %mul, %c
@@ -22,10 +22,10 @@
   ret void
 }
 
-; SI-LABEL: @s_sext_i1_to_i64:
-; SI: V_CNDMASK_B32_e64
-; SI: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}s_sext_i1_to_i64:
+; SI: v_cndmask_b32_e64
+; SI: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i64
@@ -33,18 +33,18 @@
   ret void
 }
 
-; SI-LABEL: @s_sext_i32_to_i64:
-; SI: S_ASHR_I32
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}s_sext_i32_to_i64:
+; SI: s_ashr_i32
+; SI: s_endpgm
 define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
   %sext = sext i32 %a to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @v_sext_i32_to_i64:
-; SI: V_ASHR
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}v_sext_i32_to_i64:
+; SI: v_ashr
+; SI: s_endpgm
 define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32 addrspace(1)* %in, align 4
   %sext = sext i32 %val to i64
@@ -52,8 +52,8 @@
   ret void
 }
 
-; SI-LABEL: @s_sext_i16_to_i64:
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}s_sext_i16_to_i64:
+; SI: s_endpgm
 define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
   %sext = sext i16 %a to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8

diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
index dee4326..8d9ee42 100644
--- a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll
+++ b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll

@@ -1,6 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-
 ; XFAIL: *
+; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI -mattr=-promote-alloca < %s | FileCheck -check-prefix=SI %s
 
 ; 64-bit select was originally lowered with a build_pair, and this
 ; could be simplified to 1 cndmask instead of 2, but that broken when
@@ -15,10 +14,10 @@
 }
 
 ; FIXME: Fix truncating store for local memory
-; SI-LABEL: @trunc_load_alloca_i64:
-; SI: DS_READ_B32
-; SI-NOT: DS_READ_B64
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}trunc_load_alloca_i64:
+; SI: v_movrels_b32
+; SI-NOT: v_movrels_b32
+; SI: s_endpgm
 define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) {
   %idx = add i32 %a, %b
   %alloca = alloca i64, i32 4

diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll
new file mode 100644
index 0000000..6e4f87c
--- /dev/null
+++ b/test/CodeGen/R600/sint_to_fp.f64.ll

@@ -0,0 +1,60 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: {{^}}sint_to_fp_i32_to_f64:
+; SI: v_cvt_f64_i32_e32
+define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+  %result = sitofp i32 %in to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}sint_to_fp_i1_f64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
+; FIXME: The VGPR sources for V_CNDMASK are copied from SGPRs;
+; we should be able to fold the SGPRs into the V_CNDMASK instructions.
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
+define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
+  %cmp = icmp eq i32 %in, 0
+  %fp = sitofp i1 %cmp to double
+  store double %fp, double addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}sint_to_fp_i1_f64_load:
+; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]+]], 0, -1
+; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[IRESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
+define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
+  %fp = sitofp i1 %in to double
+  store double %fp, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}s_sint_to_fp_i64_to_f64:
+define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+  %result = sitofp i64 %in to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}v_sint_to_fp_i64_to_f64:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
+; SI-DAG: v_cvt_f64_i32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
+; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %val = load i64 addrspace(1)* %gep, align 8
+  %result = sitofp i64 %val to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}

diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index b27dfda..7b6ce43 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll

@@ -1,28 +1,38 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; R600-CHECK: @sint_to_fp_v2i32
-; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
-; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-; SI-CHECK: @sint_to_fp_v2i32
-; SI-CHECK: V_CVT_F32_I32_e32
-; SI-CHECK: V_CVT_F32_I32_e32
+
+; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32:
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}}
+define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) {
+  %result = sitofp i32 %in to float
+  store float %result, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sint_to_fp_v2i32:
+; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
 define void @sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
   %result = sitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; R600-CHECK: @sint_to_fp_v4i32
-; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: @sint_to_fp_v4i32
-; SI-CHECK: V_CVT_F32_I32_e32
-; SI-CHECK: V_CVT_F32_I32_e32
-; SI-CHECK: V_CVT_F32_I32_e32
-; SI-CHECK: V_CVT_F32_I32_e32
+; FUNC-LABEL: {{^}}sint_to_fp_v4i32:
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
 define void @sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %value = load <4 x i32> addrspace(1) * %in
   %result = sitofp <4 x i32> %value to <4 x float>
@@ -30,11 +40,11 @@
   ret void
 }
 
-; FUNC-LABEL: @sint_to_fp_i1_f32:
-; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; SI-NEXT: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, -1.000000e+00, [[CMP]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sint_to_fp_i1_f32:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
@@ -42,10 +52,10 @@
   ret void
 }
 
-; FUNC-LABEL: @sint_to_fp_i1_f32_load:
-; SI: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, -1.000000e+00
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}sint_to_fp_i1_f32_load:
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) {
   %fp = sitofp i1 %in to float
   store float %fp, float addrspace(1)* %out, align 4

diff --git a/test/CodeGen/R600/sint_to_fp64.ll b/test/CodeGen/R600/sint_to_fp64.ll
deleted file mode 100644
index 12b8cf5..0000000
--- a/test/CodeGen/R600/sint_to_fp64.ll
+++ /dev/null

@@ -1,35 +0,0 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-; SI: @sint_to_fp64
-; SI: V_CVT_F64_I32_e32
-define void @sint_to_fp64(double addrspace(1)* %out, i32 %in) {
-  %result = sitofp i32 %in to double
-  store double %result, double addrspace(1)* %out
-  ret void
-}
-
-; SI-LABEL: @sint_to_fp_i1_f64:
-; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
-define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
-  %cmp = icmp eq i32 %in, 0
-  %fp = sitofp i1 %cmp to double
-  store double %fp, double addrspace(1)* %out, align 4
-  ret void
-}
-
-; SI-LABEL: @sint_to_fp_i1_f64_load:
-; SI: V_CNDMASK_B32_e64 [[IRESULT:v[0-9]]], 0, -1
-; SI-NEXT: V_CVT_F64_I32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]]
-; SI: S_ENDPGM
-define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
-  %fp = sitofp i1 %in to double
-  store double %fp, double addrspace(1)* %out, align 8
-  ret void
-}

diff --git a/test/CodeGen/R600/smrd.ll b/test/CodeGen/R600/smrd.ll
index dec6185..1c7df16 100644
--- a/test/CodeGen/R600/smrd.ll
+++ b/test/CodeGen/R600/smrd.ll

@@ -1,8 +1,8 @@
 ; RUN: llc < %s -march=r600 -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck %s
 
 ; SMRD load with an immediate offset.
-; CHECK-LABEL: @smrd0
-; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
+; CHECK-LABEL: {{^}}smrd0:
+; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 1
@@ -12,8 +12,8 @@
 }
 
 ; SMRD load with the largest possible immediate offset.
-; CHECK-LABEL: @smrd1
-; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; CHECK-LABEL: {{^}}smrd1:
+; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
 define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 255
@@ -23,9 +23,10 @@
 }
 
 ; SMRD load with an offset greater than the largest possible immediate.
-; CHECK-LABEL: @smrd2
-; CHECK: S_MOV_B32 s[[OFFSET:[0-9]]], 0x400
-; CHECK: S_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CHECK-LABEL: {{^}}smrd2:
+; CHECK: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; CHECK: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CHECK: s_endpgm
 define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 256
@@ -35,15 +36,15 @@
 }
 
 ; SMRD load with a 64-bit offset
-; CHECK-LABEL: @smrd3
-; CHECK-DAG: S_MOV_B32 s[[SHI:[0-9]+]], 4
-; CHECK-DAG: S_MOV_B32 s[[SLO:[0-9]+]], 0
+; CHECK-LABEL: {{^}}smrd3:
+; CHECK-DAG: s_mov_b32 s[[SHI:[0-9]+]], 4
+; CHECK-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0 ;
 ; FIXME: We don't need to copy these values to VGPRs
-; CHECK-DAG: V_MOV_B32_e32 v[[VHI:[0-9]+]], s[[SHI]]
-; CHECK-DAG: V_MOV_B32_e32 v[[VLO:[0-9]+]], s[[SLO]]
-; FIXME: We should be able to use S_LOAD_DWORD here
-; BUFFER_LOAD_DWORD v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] + v[[[VLO]]:[[VHI]]] + 0x0
-
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
+; FIXME: We should be able to use s_load_dword here
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[}}[[VLO]]:[[VHI]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64
+; CHECK: s_endpgm
 define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
@@ -53,8 +54,8 @@
 }
 
 ; SMRD load using the load.const intrinsic with an immediate offset
-; CHECK-LABEL: @smrd_load_const0
-; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; CHECK-LABEL: {{^}}smrd_load_const0:
+; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
 define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
@@ -64,10 +65,10 @@
   ret void
 }
 
-; SMRD load using the load.const intrinsic with an offset greater largest possible
-; immediate offset.
-; CHECK-LABEL: @smrd_load_const1
-; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; SMRD load using the load.const intrinsic with the largest possible immediate
+; offset.
+; CHECK-LABEL: {{^}}smrd_load_const1:
+; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
 define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
@@ -76,10 +77,12 @@
   call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
   ret void
 }
-; SMRD load using the load.const intrinsic with the largetst possible
+; SMRD load using the load.const intrinsic with an offset greater than the
+; largest possible
 ; immediate offset.
-; CHECK-LABEL: @smrd_load_const2
-; CHECK: S_BUFFER_LOAD_DWORD s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CHECK-LABEL: {{^}}smrd_load_const2:
+; CHECK: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; CHECK: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0

diff --git a/test/CodeGen/R600/split-scalar-i64-add.ll b/test/CodeGen/R600/split-scalar-i64-add.ll
new file mode 100644
index 0000000..e3448dc
--- /dev/null
+++ b/test/CodeGen/R600/split-scalar-i64-add.ll

@@ -0,0 +1,48 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() readnone
+
+; This is broken because the low half of the 64-bit add remains on the
+; SALU, but the upper half does not. The addc expects the carry bit
+; set in vcc, which is undefined since the low scalar half add sets
+; scc instead.
+
+; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
+  %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0
+  %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1
+  %bc = bitcast <2 x i32> %vec.1 to i64
+  %add = add i64 %bc, 399
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
+  %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+  %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1
+  %bc = bitcast <2 x i32> %vec.1 to i64
+  %add = add i64 %bc, %val1
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; Doesn't use constants
+; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2:
+; SI: v_add_i32
+; SI: v_addc_u32
+define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %gep = getelementptr i32 addrspace(1)* %in, i32 %tid
+  %load = load i32 addrspace(1)* %gep
+  %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+  %vec.1 = insertelement <2 x i32> %vec.0, i32 %load, i32 1
+  %bc = bitcast <2 x i32> %vec.1 to i64
+  %add = add i64 %bc, %val1
+  store i64 %add, i64 addrspace(1)* %out, align 8
+  ret void
+}

diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll
index 9eb3dc5..8ba9daa 100644
--- a/test/CodeGen/R600/sra.ll
+++ b/test/CodeGen/R600/sra.ll

@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK-LABEL: @ashr_v2i32
+;EG-CHECK-LABEL: {{^}}ashr_v2i32:
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK-LABEL: @ashr_v2i32
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_v2i32:
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_v4i32
+;EG-CHECK-LABEL: {{^}}ashr_v4i32:
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK-LABEL: @ashr_v4i32
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_v4i32:
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_ashr_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,11 +39,11 @@
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_i64
+;EG-CHECK-LABEL: {{^}}ashr_i64:
 ;EG-CHECK: ASHR
 
-;SI-CHECK-LABEL: @ashr_i64
-;SI-CHECK: S_ASHR_I64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
+;SI-CHECK-LABEL: {{^}}ashr_i64:
+;SI-CHECK: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
 define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = sext i32 %in to i64
@@ -52,7 +52,7 @@
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_i64_2
+;EG-CHECK-LABEL: {{^}}ashr_i64_2:
 ;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG-CHECK: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
@@ -66,8 +66,8 @@
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 
-;SI-CHECK-LABEL: @ashr_i64_2
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_i64_2:
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
   %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
@@ -78,7 +78,7 @@
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_v2i64
+;EG-CHECK-LABEL: {{^}}ashr_v2i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
@@ -104,9 +104,9 @@
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK-LABEL: @ashr_v2i64
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_v2i64:
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
@@ -117,7 +117,7 @@
   ret void
 }
 
-;EG-CHECK-LABEL: @ashr_v4i64
+;EG-CHECK-LABEL: {{^}}ashr_v4i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
@@ -167,11 +167,11 @@
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK-LABEL: @ashr_v4i64
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_ASHR_I64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK-LABEL: {{^}}ashr_v4i64:
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_ashr_i64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1

diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
index 44ad73f..8c5daf6 100644
--- a/test/CodeGen/R600/srl.ll
+++ b/test/CodeGen/R600/srl.ll

@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @lshr_v2i32
+;EG-CHECK: {{^}}lshr_v2i32:
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @lshr_v2i32
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}lshr_v2i32:
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,17 +19,17 @@
 }
 
 
-;EG-CHECK: @lshr_v4i32
+;EG-CHECK: {{^}}lshr_v4i32:
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @lshr_v4i32
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}lshr_v4i32:
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_lshr_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -40,7 +40,7 @@
   ret void
 }
 
-;EG-CHECK: @lshr_i64
+;EG-CHECK: {{^}}lshr_i64:
 ;EG-CHECK: SUB_INT {{\*? *}}[[COMPSH:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHIFT:T[0-9]+\.[XYZW]]]
 ;EG-CHECK: LSHL {{\* *}}[[TEMP:T[0-9]+\.[XYZW]]], [[OPHI:T[0-9]+\.[XYZW]]], {{[[COMPSH]]|PV.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}[[OVERF:T[0-9]+\.[XYZW]]], {{[[TEMP]]|PV.[XYZW]}}, 1
@@ -53,8 +53,8 @@
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ;EG-CHECK-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
 
-;SI-CHECK: @lshr_i64
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}lshr_i64:
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64 addrspace(1)* %in, i64 1
@@ -65,7 +65,7 @@
   ret void
 }
 
-;EG-CHECK: @lshr_v2i64
+;EG-CHECK: {{^}}lshr_v2i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: LSHL {{\*? *}}[[COMPSHA]]
@@ -89,9 +89,9 @@
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK: @lshr_v2i64
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}lshr_v2i64:
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64> addrspace(1)* %in, i64 1
@@ -103,7 +103,7 @@
 }
 
 
-;EG-CHECK: @lshr_v4i64
+;EG-CHECK: {{^}}lshr_v4i64:
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHA:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHA:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHB:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHB:T[0-9]+\.[XYZW]]]
 ;EG-CHECK-DAG: SUB_INT {{\*? *}}[[COMPSHC:T[0-9]+\.[XYZW]]], {{literal.[xy]}}, [[SHC:T[0-9]+\.[XYZW]]]
@@ -151,11 +151,11 @@
 ;EG-CHECK-DAG: CNDE_INT
 ;EG-CHECK-DAG: CNDE_INT
 
-;SI-CHECK: @lshr_v4i64
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
-;SI-CHECK: V_LSHR_B64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: {{^}}lshr_v4i64:
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
+;SI-CHECK: v_lshr_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 
 define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64> addrspace(1)* %in, i64 1

diff --git a/test/CodeGen/R600/ssubo.ll b/test/CodeGen/R600/ssubo.ll
index b330276..8031c6f 100644
--- a/test/CodeGen/R600/ssubo.ll
+++ b/test/CodeGen/R600/ssubo.ll

@@ -4,7 +4,7 @@
 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
 
-; FUNC-LABEL: @ssubo_i64_zext
+; FUNC-LABEL: {{^}}ssubo_i64_zext:
 define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
@@ -15,7 +15,7 @@
   ret void
 }
 
-; FUNC-LABEL: @s_ssubo_i32
+; FUNC-LABEL: {{^}}s_ssubo_i32:
 define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %ssub, 0
@@ -25,7 +25,7 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ssubo_i32
+; FUNC-LABEL: {{^}}v_ssubo_i32:
 define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -37,9 +37,9 @@
   ret void
 }
 
-; FUNC-LABEL: @s_ssubo_i64
-; SI: S_SUB_I32
-; SI: S_SUBB_U32
+; FUNC-LABEL: {{^}}s_ssubo_i64:
+; SI: s_sub_u32
+; SI: s_subb_u32
 define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
@@ -49,9 +49,9 @@
   ret void
 }
 
-; FUNC-LABEL: @v_ssubo_i64
-; SI: V_SUB_I32_e32
-; SI: V_SUBB_U32_e32
+; FUNC-LABEL: {{^}}v_ssubo_i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
 define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64 addrspace(1)* %aptr, align 4
   %b = load i64 addrspace(1)* %bptr, align 4

diff --git a/test/CodeGen/R600/store-v3i32.ll b/test/CodeGen/R600/store-v3i32.ll
index 3357803..0f28f33 100644
--- a/test/CodeGen/R600/store-v3i32.ll
+++ b/test/CodeGen/R600/store-v3i32.ll

@@ -4,8 +4,8 @@
 ; 3 vectors have the same size and alignment as 4 vectors, so this
 ; should be done in a single store.
 
-; SI-LABEL: @store_v3i32:
-; SI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}store_v3i32:
+; SI: buffer_store_dwordx4
 define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
   store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
   ret void

diff --git a/test/CodeGen/R600/store-v3i64.ll b/test/CodeGen/R600/store-v3i64.ll
index 58d28b5..247a561 100644
--- a/test/CodeGen/R600/store-v3i64.ll
+++ b/test/CodeGen/R600/store-v3i64.ll

@@ -1,27 +1,27 @@
 ; XFAIL: *
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI
 
-; SI-LABEL: @global_store_v3i64:
-; SI: BUFFER_STORE_DWORDX4
-; SI: BUFFER_STORE_DWORDX4
+; SI-LABEL: {{^}}global_store_v3i64:
+; SI: buffer_store_dwordx4
+; SI: buffer_store_dwordx4
 define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
   ret void
 }
 
-; SI-LABEL: @global_store_v3i64_unaligned:
+; SI-LABEL: {{^}}global_store_v3i64_unaligned:
 define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @local_store_v3i64:
+; SI-LABEL: {{^}}local_store_v3i64:
 define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
   ret void
 }
 
-; SI-LABEL: @local_store_v3i64_unaligned:
+; SI-LABEL: {{^}}local_store_v3i64_unaligned:
 define void @local_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
   ret void

diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll
index 41c5edc..aee639b 100644
--- a/test/CodeGen/R600/store-vector-ptrs.ll
+++ b/test/CodeGen/R600/store-vector-ptrs.ll

@@ -1,9 +1,11 @@
-; REQUIRES: asserts
-; XFAIL: *
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s
 
+; This tests for a bug that caused a crash in
+; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting
+; scratch loads and stores.
+; CHECK-LABEL: {{^}}store_vector_ptrs:
 define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
   %p = getelementptr <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   store <4 x i32*> %p, <4 x i32*>* %out
   ret void
-}
\ No newline at end of file
+}

diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index dd27533..713ecd6 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll

@@ -5,9 +5,9 @@
 ;===------------------------------------------------------------------------===;
 ; Global Address Space
 ;===------------------------------------------------------------------------===;
-; FUNC-LABEL: @store_i1
+; FUNC-LABEL: {{^}}store_i1:
 ; EG-CHECK: MEM_RAT MSKOR
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: buffer_store_byte
 define void @store_i1(i1 addrspace(1)* %out) {
 entry:
   store i1 true, i1 addrspace(1)* %out
@@ -15,7 +15,7 @@
 }
 
 ; i8 store
-; EG-CHECK-LABEL: @store_i8
+; EG-CHECK-LABEL: {{^}}store_i8:
 ; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
 ; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]]
 ; IG 0: Get the byte index and truncate the value
@@ -34,8 +34,8 @@
 ; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
 ; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
 
-; SI-CHECK-LABEL: @store_i8
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK-LABEL: {{^}}store_i8:
+; SI-CHECK: buffer_store_byte
 
 define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
 entry:
@@ -44,7 +44,7 @@
 }
 
 ; i16 store
-; EG-CHECK-LABEL: @store_i16
+; EG-CHECK-LABEL: {{^}}store_i16:
 ; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
 ; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]]
 ; IG 0: Get the byte index and truncate the value
@@ -63,20 +63,20 @@
 ; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
 ; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
 
-; SI-CHECK-LABEL: @store_i16
-; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK-LABEL: {{^}}store_i16:
+; SI-CHECK: buffer_store_short
 define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
 entry:
   store i16 %in, i16 addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_v2i8
+; EG-CHECK-LABEL: {{^}}store_v2i8:
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK-NOT: MEM_RAT MSKOR
-; SI-CHECK-LABEL: @store_v2i8
-; SI-CHECK: BUFFER_STORE_BYTE
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK-LABEL: {{^}}store_v2i8:
+; SI-CHECK: buffer_store_byte
+; SI-CHECK: buffer_store_byte
 define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
@@ -85,13 +85,13 @@
 }
 
 
-; EG-CHECK-LABEL: @store_v2i16
+; EG-CHECK-LABEL: {{^}}store_v2i16:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @store_v2i16
+; CM-CHECK-LABEL: {{^}}store_v2i16:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @store_v2i16
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK-LABEL: {{^}}store_v2i16:
+; SI-CHECK: buffer_store_short
+; SI-CHECK: buffer_store_short
 define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
@@ -99,15 +99,15 @@
   ret void
 }
 
-; EG-CHECK-LABEL: @store_v4i8
+; EG-CHECK-LABEL: {{^}}store_v4i8:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @store_v4i8
+; CM-CHECK-LABEL: {{^}}store_v4i8:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @store_v4i8
-; SI-CHECK: BUFFER_STORE_BYTE
-; SI-CHECK: BUFFER_STORE_BYTE
-; SI-CHECK: BUFFER_STORE_BYTE
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK-LABEL: {{^}}store_v4i8:
+; SI-CHECK: buffer_store_byte
+; SI-CHECK: buffer_store_byte
+; SI-CHECK: buffer_store_byte
+; SI-CHECK: buffer_store_byte
 define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
@@ -116,30 +116,30 @@
 }
 
 ; floating-point store
-; EG-CHECK-LABEL: @store_f32
+; EG-CHECK-LABEL: {{^}}store_f32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
-; CM-CHECK-LABEL: @store_f32
+; CM-CHECK-LABEL: {{^}}store_f32:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK-LABEL: @store_f32
-; SI-CHECK: BUFFER_STORE_DWORD
+; SI-CHECK-LABEL: {{^}}store_f32:
+; SI-CHECK: buffer_store_dword
 
 define void @store_f32(float addrspace(1)* %out, float %in) {
   store float %in, float addrspace(1)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_v4i16
+; EG-CHECK-LABEL: {{^}}store_v4i16:
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK: MEM_RAT MSKOR
 ; EG-CHECK-NOT: MEM_RAT MSKOR
-; SI-CHECK-LABEL: @store_v4i16
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK: BUFFER_STORE_SHORT
-; SI-CHECK-NOT: BUFFER_STORE_BYTE
+; SI-CHECK-LABEL: {{^}}store_v4i16:
+; SI-CHECK: buffer_store_short
+; SI-CHECK: buffer_store_short
+; SI-CHECK: buffer_store_short
+; SI-CHECK: buffer_store_short
+; SI-CHECK-NOT: buffer_store_byte
 define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
@@ -148,12 +148,12 @@
 }
 
 ; vec2 floating-point stores
-; EG-CHECK-LABEL: @store_v2f32
+; EG-CHECK-LABEL: {{^}}store_v2f32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @store_v2f32
+; CM-CHECK-LABEL: {{^}}store_v2f32:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @store_v2f32
-; SI-CHECK: BUFFER_STORE_DWORDX2
+; SI-CHECK-LABEL: {{^}}store_v2f32:
+; SI-CHECK: buffer_store_dwordx2
 
 define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
 entry:
@@ -163,23 +163,23 @@
   ret void
 }
 
-; EG-CHECK-LABEL: @store_v4i32
+; EG-CHECK-LABEL: {{^}}store_v4i32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
 ; EG-CHECK-NOT: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @store_v4i32
+; CM-CHECK-LABEL: {{^}}store_v4i32:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
 ; CM-CHECK-NOT: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @store_v4i32
-; SI-CHECK: BUFFER_STORE_DWORDX4
+; SI-CHECK-LABEL: {{^}}store_v4i32:
+; SI-CHECK: buffer_store_dwordx4
 define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @store_i64_i8
+; FUNC-LABEL: {{^}}store_i64_i8:
 ; EG-CHECK: MEM_RAT MSKOR
-; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: buffer_store_byte
 define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
@@ -187,9 +187,9 @@
   ret void
 }
 
-; FUNC-LABEL: @store_i64_i16
+; FUNC-LABEL: {{^}}store_i64_i16:
 ; EG-CHECK: MEM_RAT MSKOR
-; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: buffer_store_short
 define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
@@ -201,99 +201,99 @@
 ; Local Address Space
 ;===------------------------------------------------------------------------===;
 
-; FUNC-LABEL: @store_local_i1
+; FUNC-LABEL: {{^}}store_local_i1:
 ; EG-CHECK: LDS_BYTE_WRITE
-; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: ds_write_b8
 define void @store_local_i1(i1 addrspace(3)* %out) {
 entry:
   store i1 true, i1 addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_i8
+; EG-CHECK-LABEL: {{^}}store_local_i8:
 ; EG-CHECK: LDS_BYTE_WRITE
-; SI-CHECK-LABEL: @store_local_i8
-; SI-CHECK: DS_WRITE_B8
+; SI-CHECK-LABEL: {{^}}store_local_i8:
+; SI-CHECK: ds_write_b8
 define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
   store i8 %in, i8 addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_i16
+; EG-CHECK-LABEL: {{^}}store_local_i16:
 ; EG-CHECK: LDS_SHORT_WRITE
-; SI-CHECK-LABEL: @store_local_i16
-; SI-CHECK: DS_WRITE_B16
+; SI-CHECK-LABEL: {{^}}store_local_i16:
+; SI-CHECK: ds_write_b16
 define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
   store i16 %in, i16 addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_v2i16
+; EG-CHECK-LABEL: {{^}}store_local_v2i16:
 ; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: @store_local_v2i16
+; CM-CHECK-LABEL: {{^}}store_local_v2i16:
 ; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: @store_local_v2i16
-; SI-CHECK: DS_WRITE_B16
-; SI-CHECK: DS_WRITE_B16
+; SI-CHECK-LABEL: {{^}}store_local_v2i16:
+; SI-CHECK: ds_write_b16
+; SI-CHECK: ds_write_b16
 define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_v4i8
+; EG-CHECK-LABEL: {{^}}store_local_v4i8:
 ; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: @store_local_v4i8
+; CM-CHECK-LABEL: {{^}}store_local_v4i8:
 ; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: @store_local_v4i8
-; SI-CHECK: DS_WRITE_B8
-; SI-CHECK: DS_WRITE_B8
-; SI-CHECK: DS_WRITE_B8
-; SI-CHECK: DS_WRITE_B8
+; SI-CHECK-LABEL: {{^}}store_local_v4i8:
+; SI-CHECK: ds_write_b8
+; SI-CHECK: ds_write_b8
+; SI-CHECK: ds_write_b8
+; SI-CHECK: ds_write_b8
 define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_v2i32
+; EG-CHECK-LABEL: {{^}}store_local_v2i32:
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: @store_local_v2i32
+; CM-CHECK-LABEL: {{^}}store_local_v2i32:
 ; CM-CHECK: LDS_WRITE
 ; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: @store_local_v2i32
-; SI-CHECK: DS_WRITE_B64
+; SI-CHECK-LABEL: {{^}}store_local_v2i32:
+; SI-CHECK: ds_write_b64
 define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(3)* %out
   ret void
 }
 
-; EG-CHECK-LABEL: @store_local_v4i32
+; EG-CHECK-LABEL: {{^}}store_local_v4i32:
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
 ; EG-CHECK: LDS_WRITE
-; CM-CHECK-LABEL: @store_local_v4i32
+; CM-CHECK-LABEL: {{^}}store_local_v4i32:
 ; CM-CHECK: LDS_WRITE
 ; CM-CHECK: LDS_WRITE
 ; CM-CHECK: LDS_WRITE
 ; CM-CHECK: LDS_WRITE
-; SI-CHECK-LABEL: @store_local_v4i32
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_WRITE_B32
-; SI-CHECK: DS_WRITE_B32
+; SI-CHECK-LABEL: {{^}}store_local_v4i32:
+; SI-CHECK: ds_write_b32
+; SI-CHECK: ds_write_b32
+; SI-CHECK: ds_write_b32
+; SI-CHECK: ds_write_b32
 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out
   ret void
 }
 
-; FUNC-LABEL: @store_local_i64_i8
+; FUNC-LABEL: {{^}}store_local_i64_i8:
 ; EG-CHECK: LDS_BYTE_WRITE
-; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: ds_write_b8
 define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
@@ -301,9 +301,9 @@
   ret void
 }
 
-; FUNC-LABEL: @store_local_i64_i16
+; FUNC-LABEL: {{^}}store_local_i64_i16:
 ; EG-CHECK: LDS_SHORT_WRITE
-; SI-CHECK: DS_WRITE_B16
+; SI-CHECK: ds_write_b16
 define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
@@ -318,12 +318,12 @@
 ; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
 ; be two 32-bit stores.
 
-; EG-CHECK-LABEL: @vecload2
+; EG-CHECK-LABEL: {{^}}vecload2:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
-; CM-CHECK-LABEL: @vecload2
+; CM-CHECK-LABEL: {{^}}vecload2:
 ; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
-; SI-CHECK-LABEL: @vecload2
-; SI-CHECK: BUFFER_STORE_DWORDX2
+; SI-CHECK-LABEL: {{^}}vecload2:
+; SI-CHECK: buffer_store_dwordx2
 define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
 entry:
   %0 = load i32 addrspace(2)* %mem, align 4
@@ -339,7 +339,7 @@
 
 ; When i128 was a legal type this program generated cannot select errors:
 
-; FUNC-LABEL: @i128-const-store
+; FUNC-LABEL: {{^}}"i128-const-store":
 ; FIXME: We should be able to to this with one store instruction
 ; EG-CHECK: STORE_RAW
 ; EG-CHECK: STORE_RAW
@@ -349,8 +349,8 @@
 ; CM-CHECK: STORE_DWORD
 ; CM-CHECK: STORE_DWORD
 ; CM-CHECK: STORE_DWORD
-; SI: BUFFER_STORE_DWORDX2
-; SI: BUFFER_STORE_DWORDX2
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dwordx2
 define void @i128-const-store(i32 addrspace(1)* %out) {
 entry:
   store i32 1, i32 addrspace(1)* %out, align 4

diff --git a/test/CodeGen/R600/store.r600.ll b/test/CodeGen/R600/store.r600.ll
index 00589a0..3df30d4 100644
--- a/test/CodeGen/R600/store.r600.ll
+++ b/test/CodeGen/R600/store.r600.ll

@@ -3,7 +3,7 @@
 ; XXX: Merge this test into store.ll once it is supported on SI
 
 ; v4i32 store
-; EG-CHECK: @store_v4i32
+; EG-CHECK: {{^}}store_v4i32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 
 define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
@@ -13,7 +13,7 @@
 }
 
 ; v4f32 store
-; EG-CHECK: @store_v4f32
+; EG-CHECK: {{^}}store_v4f32:
 ; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %1 = load <4 x float> addrspace(1) * %in

diff --git a/test/CodeGen/R600/structurize.ll b/test/CodeGen/R600/structurize.ll
index c2acd93..02e592e 100644
--- a/test/CodeGen/R600/structurize.ll
+++ b/test/CodeGen/R600/structurize.ll

@@ -13,7 +13,7 @@
 ;
 ;
 
-; CHECK-LABEL: @branch_into_diamond
+; CHECK-LABEL: {{^}}branch_into_diamond:
 ; === entry block:
 ; CHECK: ALU_PUSH_BEFORE
 ; === Branch instruction (IF):

diff --git a/test/CodeGen/R600/structurize1.ll b/test/CodeGen/R600/structurize1.ll
index 8c10301..77432c1 100644
--- a/test/CodeGen/R600/structurize1.ll
+++ b/test/CodeGen/R600/structurize1.ll

@@ -16,7 +16,7 @@
 ;   }
 ; }
 
-; CHECK-LABEL: @if_inside_loop
+; CHECK-LABEL: {{^}}if_inside_loop:
 ; CHECK: LOOP_START_DX10
 ; CHECK: END_LOOP
 define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {

diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
index 8e64148..2bbc0cf 100644
--- a/test/CodeGen/R600/sub.ll
+++ b/test/CodeGen/R600/sub.ll

@@ -3,12 +3,12 @@
 
 declare i32 @llvm.r600.read.tidig.x() readnone
 
-;FUNC-LABEL: @test2
+;FUNC-LABEL: {{^}}test2:
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,16 +19,16 @@
   ret void
 }
 
-;FUNC-LABEL: @test4
+;FUNC-LABEL: {{^}}test4:
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI: v_sub_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,9 +39,9 @@
   ret void
 }
 
-; FUNC-LABEL: @s_sub_i64:
-; SI: S_SUB_I32
-; SI: S_SUBB_U32
+; FUNC-LABEL: {{^}}s_sub_i64:
+; SI: s_sub_u32
+; SI: s_subb_u32
 
 ; EG-DAG: SETGE_UINT
 ; EG-DAG: CNDE_INT
@@ -54,9 +54,9 @@
   ret void
 }
 
-; FUNC-LABEL: @v_sub_i64:
-; SI: V_SUB_I32_e32
-; SI: V_SUBB_U32_e32
+; FUNC-LABEL: {{^}}v_sub_i64:
+; SI: v_sub_i32_e32
+; SI: v_subb_u32_e32
 
 ; EG-DAG: SETGE_UINT
 ; EG-DAG: CNDE_INT

diff --git a/test/CodeGen/R600/swizzle-export.ll b/test/CodeGen/R600/swizzle-export.ll
index 16c3f19..3e6f7a7 100644
--- a/test/CodeGen/R600/swizzle-export.ll
+++ b/test/CodeGen/R600/swizzle-export.ll

@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 
-;EG-CHECK: @main
+;EG-CHECK: {{^}}main:
 ;EG-CHECK: EXPORT T{{[0-9]+}}.XYXX
 ;EG-CHECK: EXPORT T{{[0-9]+}}.ZXXX
 ;EG-CHECK: EXPORT T{{[0-9]+}}.XXWX
@@ -92,9 +92,9 @@
   ret void
 }
 
-; EG-CHECK: @main2
+; EG-CHECK: {{^}}main2:
 ; EG-CHECK: T{{[0-9]+}}.XY__
-; EG-CHECK: T{{[0-9]+}}.YXZ0
+; EG-CHECK: T{{[0-9]+}}.ZXY0
 
 define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:

diff --git a/test/CodeGen/R600/trunc-store-i1.ll b/test/CodeGen/R600/trunc-store-i1.ll
index a3975c8..3c1b19f 100644
--- a/test/CodeGen/R600/trunc-store-i1.ll
+++ b/test/CodeGen/R600/trunc-store-i1.ll

@@ -1,30 +1,30 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 
-; SI-LABEL: @global_truncstore_i32_to_i1
-; SI: S_LOAD_DWORD [[LOAD:s[0-9]+]],
-; SI: S_AND_B32 [[SREG:s[0-9]+]], [[LOAD]], 1
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], [[SREG]]
-; SI: BUFFER_STORE_BYTE [[VREG]],
+; SI-LABEL: {{^}}global_truncstore_i32_to_i1:
+; SI: s_load_dword [[LOAD:s[0-9]+]],
+; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; SI: buffer_store_byte [[VREG]],
 define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
   %trunc = trunc i32 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @global_truncstore_i64_to_i1
-; SI: BUFFER_STORE_BYTE
+; SI-LABEL: {{^}}global_truncstore_i64_to_i1:
+; SI: buffer_store_byte
 define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
   %trunc = trunc i64 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
 
-; SI-LABEL: @global_truncstore_i16_to_i1
-; SI: S_LOAD_DWORD [[LOAD:s[0-9]+]],
-; SI: S_AND_B32 [[SREG:s[0-9]+]], [[LOAD]], 1
-; SI: V_MOV_B32_e32 [[VREG:v[0-9]+]], [[SREG]]
-; SI: BUFFER_STORE_BYTE [[VREG]],
+; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
+; SI: s_load_dword [[LOAD:s[0-9]+]],
+; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
+; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
+; SI: buffer_store_byte [[VREG]],
 define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
   %trunc = trunc i16 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1

diff --git a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll
index ec959c2..878ea3f 100644
--- a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll
+++ b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll

@@ -4,7 +4,7 @@
 ; vector stores at the end of a basic block were not being added to the
 ; LegalizedNodes list, which triggered an assertion failure.
 
-; CHECK-LABEL: @test
+; CHECK-LABEL: {{^}}test:
 ; CHECK: MEM_RAT_CACHELESS STORE_RAW
 define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
 entry:

diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll
index 31cdfcd..7519d10 100644
--- a/test/CodeGen/R600/trunc.ll
+++ b/test/CodeGen/R600/trunc.ll

@@ -2,12 +2,12 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
 
 define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
-; SI-LABEL: @trunc_i64_to_i32_store
-; SI: S_LOAD_DWORD s0, s[0:1], 0xb
-; SI: V_MOV_B32_e32 v0, s0
-; SI: BUFFER_STORE_DWORD v0
+; SI-LABEL: {{^}}trunc_i64_to_i32_store:
+; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb
+; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
+; SI: buffer_store_dword [[VLOAD]]
 
-; EG-LABEL: @trunc_i64_to_i32_store
+; EG-LABEL: {{^}}trunc_i64_to_i32_store:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
 ; EG: LSHR
 ; EG-NEXT: 2(
@@ -16,12 +16,12 @@
   ret void
 }
 
-; SI-LABEL: @trunc_load_shl_i64:
-; SI-DAG: S_LOAD_DWORDX2
-; SI-DAG: S_LOAD_DWORD [[SREG:s[0-9]+]],
-; SI: S_LSHL_B32 [[SHL:s[0-9]+]], [[SREG]], 2
-; SI: V_MOV_B32_e32 [[VSHL:v[0-9]+]], [[SHL]]
-; SI: BUFFER_STORE_DWORD [[VSHL]],
+; SI-LABEL: {{^}}trunc_load_shl_i64:
+; SI-DAG: s_load_dwordx2
+; SI-DAG: s_load_dword [[SREG:s[0-9]+]],
+; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
+; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
+; SI: buffer_store_dword [[VSHL]],
 define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
   %b = shl i64 %a, 2
   %result = trunc i64 %b to i32
@@ -29,12 +29,13 @@
   ret void
 }
 
-; SI-LABEL: @trunc_shl_i64:
-; SI: S_LOAD_DWORDX2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}},
-; SI: S_ADD_I32 s[[LO_ADD:[0-9]+]], s[[LO_SREG]],
-; SI: S_LSHL_B64 s{{\[}}[[LO_SREG2:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_ADD]]:{{[0-9]+\]}}, 2
-; SI: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
-; SI: BUFFER_STORE_DWORD v[[LO_VREG]],
+; SI-LABEL: {{^}}trunc_shl_i64:
+; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
+; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
+; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
+; SI: s_addc_u32
+; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
+; SI: buffer_store_dword v[[LO_VREG]],
 define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
   %aa = add i64 %a, 234 ; Prevent shrinking store.
   %b = shl i64 %aa, 2
@@ -44,10 +45,21 @@
   ret void
 }
 
-; SI-LABEL: @trunc_i32_to_i1:
-; SI: V_AND_B32
-; SI: V_CMP_EQ_I32
-define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
+; SI-LABEL: {{^}}trunc_i32_to_i1:
+; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; SI: v_cmp_eq_i32
+define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
+  %a = load i32 addrspace(1)* %ptr, align 4
+  %trunc = trunc i32 %a to i1
+  %result = select i1 %trunc, i32 1, i32 0
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1:
+; SI: v_and_b32_e64 v{{[0-9]+}}, 1, s{{[0-9]+}}
+; SI: v_cmp_eq_i32
+define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
   store i32 %result, i32 addrspace(1)* %out, align 4

diff --git a/test/CodeGen/R600/uaddo.ll b/test/CodeGen/R600/uaddo.ll
index a80e502..eb242c1 100644
--- a/test/CodeGen/R600/uaddo.ll
+++ b/test/CodeGen/R600/uaddo.ll

@@ -4,10 +4,10 @@
 declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
 
-; FUNC-LABEL: @uaddo_i64_zext
-; SI: ADD
-; SI: ADDC
-; SI: ADDC
+; FUNC-LABEL: {{^}}uaddo_i64_zext:
+; SI: add
+; SI: addc
+; SI: addc
 define void @uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -18,8 +18,8 @@
   ret void
 }
 
-; FUNC-LABEL: @s_uaddo_i32
-; SI: S_ADD_I32
+; FUNC-LABEL: {{^}}s_uaddo_i32:
+; SI: s_add_i32
 define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %uadd, 0
@@ -29,8 +29,8 @@
   ret void
 }
 
-; FUNC-LABEL: @v_uaddo_i32
-; SI: V_ADD_I32
+; FUNC-LABEL: {{^}}v_uaddo_i32:
+; SI: v_add_i32
 define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -42,9 +42,9 @@
   ret void
 }
 
-; FUNC-LABEL: @s_uaddo_i64
-; SI: S_ADD_I32
-; SI: S_ADDC_U32
+; FUNC-LABEL: {{^}}s_uaddo_i64:
+; SI: s_add_u32
+; SI: s_addc_u32
 define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %uadd, 0
@@ -54,9 +54,9 @@
   ret void
 }
 
-; FUNC-LABEL: @v_uaddo_i64
-; SI: V_ADD_I32
-; SI: V_ADDC_U32
+; FUNC-LABEL: {{^}}v_uaddo_i64:
+; SI: v_add_i32
+; SI: v_addc_u32
 define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64 addrspace(1)* %aptr, align 4
   %b = load i64 addrspace(1)* %bptr, align 4

diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index 5371321..59e91f8 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll

@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK-LABEL: @test
+;EG-CHECK-LABEL: {{^}}test:
 ;EG-CHECK-NOT: SETGE_INT
 ;EG-CHECK: CF_END
 
@@ -18,10 +18,10 @@
 ;The goal of this test is to make sure the ISel doesn't fail when it gets
 ;a v4i32 udiv
 
-;EG-CHECK-LABEL: @test2
+;EG-CHECK-LABEL: {{^}}test2:
 ;EG-CHECK: CF_END
-;SI-CHECK-LABEL: @test2
-;SI-CHECK: S_ENDPGM
+;SI-CHECK-LABEL: {{^}}test2:
+;SI-CHECK: s_endpgm
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -32,10 +32,10 @@
   ret void
 }
 
-;EG-CHECK-LABEL: @test4
+;EG-CHECK-LABEL: {{^}}test4:
 ;EG-CHECK: CF_END
-;SI-CHECK-LABEL: @test4
-;SI-CHECK: S_ENDPGM
+;SI-CHECK-LABEL: {{^}}test4:
+;SI-CHECK: s_endpgm
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1

diff --git a/test/CodeGen/R600/udivrem.ll b/test/CodeGen/R600/udivrem.ll
index 5f5753a..f20705b 100644
--- a/test/CodeGen/R600/udivrem.ll
+++ b/test/CodeGen/R600/udivrem.ll

@@ -1,7 +1,7 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
-; FUNC-LABEL: @test_udivrem
+; FUNC-LABEL: {{^}}test_udivrem:
 ; EG: RECIP_UINT
 ; EG-DAG: MULHI
 ; EG-DAG: MULLO_INT
@@ -26,30 +26,30 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 
-; SI: V_RCP_IFLAG_F32_e32 [[RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[RCP_HI:v[0-9]+]], [[RCP]]
-; SI-DAG: V_MUL_LO_I32 [[RCP_LO:v[0-9]+]], [[RCP]]
-; SI-DAG: V_SUB_I32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]]
-; SI: V_CNDMASK_B32_e64
-; SI: V_MUL_HI_U32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]]
-; SI-DAG: V_ADD_I32_e32 [[RCP_A_E:v[0-9]+]], [[E]], [[RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[RCP_S_E:v[0-9]+]], [[E]], [[RCP]]
-; SI: V_CNDMASK_B32_e64
-; SI: V_MUL_HI_U32 [[Quotient:v[0-9]+]]
-; SI: V_MUL_LO_I32 [[Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI: V_AND_B32_e32 [[Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI: v_rcp_iflag_f32_e32 [[RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]]
+; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]]
+; SI-DAG: v_sub_i32_e32 [[NEG_RCP_LO:v[0-9]+]], 0, [[RCP_LO]]
+; SI: v_cndmask_b32_e64
+; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]]
+; SI-DAG: v_add_i32_e32 [[RCP_A_E:v[0-9]+]], [[RCP]], [[E]]
+; SI-DAG: v_sub_i32_e32 [[RCP_S_E:v[0-9]+]], [[RCP]], [[E]]
+; SI: v_cndmask_b32_e64
+; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]]
+; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], 1, [[Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @test_udivrem(i32 addrspace(1)* %out, i32 %x, i32 %y) {
   %result0 = udiv i32 %x, %y
   store i32 %result0, i32 addrspace(1)* %out
@@ -58,7 +58,7 @@
   ret void
 }
 
-; FUNC-LABEL: @test_udivrem_v2
+; FUNC-LABEL: {{^}}test_udivrem_v2:
 ; EG-DAG: RECIP_UINT
 ; EG-DAG: MULHI
 ; EG-DAG: MULLO_INT
@@ -106,53 +106,53 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[FIRST_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FIRST_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[FIRST_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[SECOND_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[SECOND_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[SECOND_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
+; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
+; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
   %result0 = udiv <2 x i32> %x, %y
   store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
@@ -162,7 +162,7 @@
 }
 
 
-; FUNC-LABEL: @test_udivrem_v4
+; FUNC-LABEL: {{^}}test_udivrem_v4:
 ; EG-DAG: RECIP_UINT
 ; EG-DAG: MULHI
 ; EG-DAG: MULLO_INT
@@ -256,99 +256,99 @@
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[FIRST_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_E]], [[FIRST_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FIRST_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[FIRST_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[FIRST_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[SECOND_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_E]], [[SECOND_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[SECOND_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[SECOND_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[SECOND_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[THIRD_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_E]], [[THIRD_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[THIRD_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[THIRD_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[THIRD_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[THIRD_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[THIRD_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[THIRD_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_RCP_IFLAG_F32_e32 [[FOURTH_RCP:v[0-9]+]]
-; SI-DAG: V_MUL_HI_U32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]]
-; SI-DAG: V_MUL_LO_I32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]]
-; SI-DAG: V_SUB_I32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]]
-; SI-DAG: V_ADD_I32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
-; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_E]], [[FOURTH_RCP]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_MUL_HI_U32 [[FOURTH_Quotient:v[0-9]+]]
-; SI-DAG: V_MUL_LO_I32 [[FOURTH_Num_S_Remainder:v[0-9]+]]
-; SI-DAG: V_SUB_I32_e32 [[FOURTH_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FOURTH_Num_S_Remainder]]
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_AND_B32_e32 [[FOURTH_Tmp1:v[0-9]+]]
-; SI-DAG: V_ADD_I32_e32 [[FOURTH_Quotient_A_One:v[0-9]+]], {{.*}}, [[FOURTH_Quotient]]
-; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_Quotient_S_One:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_ADD_I32_e32 [[FOURTH_Remainder_A_Den:v[0-9]+]],
-; SI-DAG: V_SUBREV_I32_e32 [[FOURTH_Remainder_S_Den:v[0-9]+]],
-; SI-DAG: V_CNDMASK_B32_e64
-; SI-DAG: V_CNDMASK_B32_e64
-; SI: S_ENDPGM
+; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], 0, [[FIRST_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]]
+; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], [[FIRST_RCP]], [[FIRST_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[FIRST_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FIRST_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], 0, [[SECOND_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]]
+; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_RCP_S_E:v[0-9]+]], [[SECOND_RCP]], [[SECOND_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[SECOND_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[SECOND_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[THIRD_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[THIRD_RCP_HI:v[0-9]+]], [[THIRD_RCP]]
+; SI-DAG: v_mul_lo_i32 [[THIRD_RCP_LO:v[0-9]+]], [[THIRD_RCP]]
+; SI-DAG: v_sub_i32_e32 [[THIRD_NEG_RCP_LO:v[0-9]+]], 0, [[THIRD_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[THIRD_E:v[0-9]+]], {{v[0-9]+}}, [[THIRD_RCP]]
+; SI-DAG: v_add_i32_e32 [[THIRD_RCP_A_E:v[0-9]+]], [[THIRD_RCP]], [[THIRD_E]]
+; SI-DAG: v_sub_i32_e32 [[THIRD_RCP_S_E:v[0-9]+]], [[THIRD_RCP]], [[THIRD_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[THIRD_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[THIRD_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[THIRD_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[THIRD_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[THIRD_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[THIRD_Quotient_A_One:v[0-9]+]], {{.*}}, [[THIRD_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[THIRD_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[THIRD_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[THIRD_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_rcp_iflag_f32_e32 [[FOURTH_RCP:v[0-9]+]]
+; SI-DAG: v_mul_hi_u32 [[FOURTH_RCP_HI:v[0-9]+]], [[FOURTH_RCP]]
+; SI-DAG: v_mul_lo_i32 [[FOURTH_RCP_LO:v[0-9]+]], [[FOURTH_RCP]]
+; SI-DAG: v_sub_i32_e32 [[FOURTH_NEG_RCP_LO:v[0-9]+]], 0, [[FOURTH_RCP_LO]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FOURTH_E:v[0-9]+]], {{v[0-9]+}}, [[FOURTH_RCP]]
+; SI-DAG: v_add_i32_e32 [[FOURTH_RCP_A_E:v[0-9]+]], [[FOURTH_RCP]], [[FOURTH_E]]
+; SI-DAG: v_sub_i32_e32 [[FOURTH_RCP_S_E:v[0-9]+]], [[FOURTH_RCP]], [[FOURTH_E]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_mul_hi_u32 [[FOURTH_Quotient:v[0-9]+]]
+; SI-DAG: v_mul_lo_i32 [[FOURTH_Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_sub_i32_e32 [[FOURTH_Remainder:v[0-9]+]], {{[vs][0-9]+}}, [[FOURTH_Num_S_Remainder]]
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_and_b32_e32 [[FOURTH_Tmp1:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[FOURTH_Quotient_A_One:v[0-9]+]], {{.*}}, [[FOURTH_Quotient]]
+; SI-DAG: v_subrev_i32_e32 [[FOURTH_Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_add_i32_e32 [[FOURTH_Remainder_A_Den:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[FOURTH_Remainder_S_Den:v[0-9]+]],
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI: s_endpgm
 define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
   %result0 = udiv <4 x i32> %x, %y
   store <4 x i32> %result0, <4 x i32> addrspace(1)* %out

diff --git a/test/CodeGen/R600/udivrem24.ll b/test/CodeGen/R600/udivrem24.ll
new file mode 100644
index 0000000..defb3c0
--- /dev/null
+++ b/test/CodeGen/R600/udivrem24.ll

@@ -0,0 +1,244 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}udiv24_i8:
+; SI: v_cvt_f32_ubyte
+; SI: v_cvt_f32_ubyte
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = udiv i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i16:
+; SI: v_cvt_f32_u32
+; SI: v_cvt_f32_u32
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = udiv i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}udiv24_i32:
+; SI: v_cvt_f32_u32
+; SI-DAG: v_cvt_f32_u32
+; SI-DAG: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}udiv25_i32:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_udiv24_i32_1:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_udiv24_i32_2:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}urem24_i8:
+; SI: v_cvt_f32_ubyte
+; SI: v_cvt_f32_ubyte
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = urem i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}urem24_i16:
+; SI: v_cvt_f32_u32
+; SI: v_cvt_f32_u32
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = urem i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}urem24_i32:
+; SI: v_cvt_f32_u32
+; SI: v_cvt_f32_u32
+; SI: v_rcp_f32
+; SI: v_cvt_u32_f32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}urem25_i32:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_urem24_i32_1:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_no_urem24_i32_2:
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: v_rcp_iflag
+; SI-NOT: v_rcp_f32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/udivrem64.ll b/test/CodeGen/R600/udivrem64.ll
index a71315a..8864c83 100644
--- a/test/CodeGen/R600/udivrem64.ll
+++ b/test/CodeGen/R600/udivrem64.ll

@@ -1,7 +1,7 @@
 ;XUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
-;FUNC-LABEL: @test_udiv
+;FUNC-LABEL: {{^}}test_udiv:
 ;EG: RECIP_UINT
 ;EG: LSHL {{.*}}, 1,
 ;EG: BFE_UINT
@@ -34,14 +34,14 @@
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: BFE_UINT
-;SI: S_ENDPGM
+;SI: s_endpgm
 define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = udiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-;FUNC-LABEL: @test_urem
+;FUNC-LABEL: {{^}}test_urem:
 ;EG: RECIP_UINT
 ;EG: BFE_UINT
 ;EG: BFE_UINT
@@ -74,7 +74,7 @@
 ;EG: BFE_UINT
 ;EG: BFE_UINT
 ;EG: AND_INT {{.*}}, 1,
-;SI: S_ENDPGM
+;SI: s_endpgm
 define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out

diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll
index 9a41796..bddf700 100644
--- a/test/CodeGen/R600/uint_to_fp.f64.ll
+++ b/test/CodeGen/R600/uint_to_fp.f64.ll

@@ -1,22 +1,24 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @uint_to_fp_f64_i32
-; SI: V_CVT_F64_U32_e32
-; SI: S_ENDPGM
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
+; SI-LABEL: {{^}}uint_to_fp_f64_i32:
+; SI: v_cvt_f64_u32_e32
+; SI: s_endpgm
 define void @uint_to_fp_f64_i32(double addrspace(1)* %out, i32 %in) {
   %cast = uitofp i32 %in to double
   store double %cast, double addrspace(1)* %out, align 8
   ret void
 }
 
-; SI-LABEL: @uint_to_fp_i1_f64:
-; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI-LABEL: {{^}}uint_to_fp_i1_f64:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
 ; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
 ; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: V_CNDMASK_B32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: BUFFER_STORE_DWORDX2
-; SI: S_ENDPGM
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; SI: buffer_store_dwordx2
+; SI: s_endpgm
 define void @uint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to double
@@ -24,13 +26,50 @@
   ret void
 }
 
-; SI-LABEL: @uint_to_fp_i1_f64_load:
-; SI: V_CNDMASK_B32_e64 [[IRESULT:v[0-9]]], 0, 1
-; SI-NEXT: V_CVT_F64_U32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
-; SI: BUFFER_STORE_DWORDX2 [[RESULT]]
-; SI: S_ENDPGM
+; SI-LABEL: {{^}}uint_to_fp_i1_f64_load:
+; SI: v_cndmask_b32_e64 [[IRESULT:v[0-9]+]], 0, 1
+; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[IRESULT]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+; SI: s_endpgm
 define void @uint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
   %fp = uitofp i1 %in to double
   store double %fp, double addrspace(1)* %out, align 8
   ret void
 }
+
+; SI-LABEL: {{^}}v_uint_to_fp_i64_to_f64:
+; SI: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; SI-DAG: v_cvt_f64_u32_e32 [[LO_CONV:v\[[0-9]+:[0-9]+\]]], v[[LO]]
+; SI-DAG: v_cvt_f64_u32_e32 [[HI_CONV:v\[[0-9]+:[0-9]+\]]], v[[HI]]
+; SI: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
+; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
+; SI: buffer_store_dwordx2 [[RESULT]]
+define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %gep = getelementptr i64 addrspace(1)* %in, i32 %tid
+  %val = load i64 addrspace(1)* %gep, align 8
+  %result = uitofp i64 %val to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_f64_i64:
+define void @s_uint_to_fp_f64_i64(double addrspace(1)* %out, i64 %in) {
+  %cast = uitofp i64 %in to double
+  store double %cast, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v2f64_v2i64:
+define void @s_uint_to_fp_v2f64_v2i64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
+  %cast = uitofp <2 x i64> %in to <2 x double>
+  store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
+  ret void
+}
+
+; SI-LABEL: {{^}}s_uint_to_fp_v4f64_v4i64:
+define void @s_uint_to_fp_v4f64_v4i64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
+  %cast = uitofp <4 x i64> %in to <4 x double>
+  store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
+  ret void
+}

diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index 8f5d42d..f58f10b 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll

@@ -1,30 +1,30 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
-; FUNC-LABEL: @uint_to_fp_v2i32
+; FUNC-LABEL: {{^}}uint_to_fp_v2i32:
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
 
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: S_ENDPGM
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: s_endpgm
 define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
   %result = uitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: @uint_to_fp_v4i32
+; FUNC-LABEL: {{^}}uint_to_fp_v4i32:
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: S_ENDPGM
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: s_endpgm
 define void @uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %value = load <4 x i32> addrspace(1) * %in
   %result = uitofp <4 x i32> %value to <4 x float>
@@ -32,14 +32,14 @@
   ret void
 }
 
-; FUNC-LABEL: @uint_to_fp_i64_f32
+; FUNC-LABEL: {{^}}uint_to_fp_i64_f32:
 ; R600: UINT_TO_FLT
 ; R600: UINT_TO_FLT
 ; R600: MULADD_IEEE
-; SI: V_CVT_F32_U32_e32
-; SI: V_CVT_F32_U32_e32
-; SI: V_MAD_F32
-; SI: S_ENDPGM
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_mad_f32
+; SI: s_endpgm
 define void @uint_to_fp_i64_f32(float addrspace(1)* %out, i64 %in) {
 entry:
   %0 = uitofp i64 %in to float
@@ -47,11 +47,11 @@
   ret void
 }
 
-; FUNC-LABEL: @uint_to_fp_i1_f32:
-; SI: V_CMP_EQ_I32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; SI-NEXT: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, 1.000000e+00, [[CMP]]
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}uint_to_fp_i1_f32:
+; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]],
+; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @uint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
@@ -59,10 +59,10 @@
   ret void
 }
 
-; FUNC-LABEL: @uint_to_fp_i1_f32_load:
-; SI: V_CNDMASK_B32_e64 [[RESULT:v[0-9]+]], 0, 1.000000e+00
-; SI: BUFFER_STORE_DWORD [[RESULT]],
-; SI: S_ENDPGM
+; FUNC-LABEL: {{^}}uint_to_fp_i1_f32_load:
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
+; SI: buffer_store_dword [[RESULT]],
+; SI: s_endpgm
 define void @uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) {
   %fp = uitofp i1 %in to float
   store float %fp, float addrspace(1)* %out, align 4

diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll
index 4df69d1..f8737e6 100644
--- a/test/CodeGen/R600/unaligned-load-store.ll
+++ b/test/CodeGen/R600/unaligned-load-store.ll

@@ -1,17 +1,101 @@
 ; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
-; SI-LABEL: @unaligned_load_store_i32:
-; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
+; FIXME: This is probably wrong. This probably needs to expand to 8-bit reads and writes.
+; SI-LABEL: {{^}}unaligned_load_store_i32:
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_write_b32
+; SI: s_endpgm
 define void @unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
   %v = load i32 addrspace(3)* %p, align 1
   store i32 %v, i32 addrspace(3)* %r, align 1
   ret void
 }
 
-; SI-LABEL: @unaligned_load_store_v4i32:
-; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
+; SI-LABEL: {{^}}unaligned_load_store_v4i32:
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_read_u16
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: ds_write_b32
+; SI: s_endpgm
 define void @unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
   %v = load <4 x i32> addrspace(3)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
   ret void
 }
+
+; SI-LABEL: {{^}}load_lds_i64_align_4:
+; SI: ds_read2_b32
+; SI: s_endpgm
+define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %val = load i64 addrspace(3)* %in, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4_with_offset:
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:8 offset1:9
+; SI: s_endpgm
+define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %ptr = getelementptr i64 addrspace(3)* %in, i32 4
+  %val = load i64 addrspace(3)* %ptr, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; SI-LABEL: {{^}}load_lds_i64_align_4_with_split_offset:
+; This tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
+; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:1
+; SI: s_endpgm
+define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
+  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+  %val = load i64 addrspace(3)* %ptri64, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FIXME: Need to fix this case.
+; define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+;   %val = load i64 addrspace(3)* %in, align 1
+;   store i64 %val, i64 addrspace(1)* %out, align 8
+;   ret void
+; }
+
+; SI-LABEL: {{^}}store_lds_i64_align_4:
+; SI: ds_write2_b32
+; SI: s_endpgm
+define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+  store i64 %val, i64 addrspace(3)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4_with_offset:
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
+; SI: s_endpgm
+define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+  %ptr = getelementptr i64 addrspace(3)* %out, i32 4
+  store i64 0, i64 addrspace(3)* %ptr, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}store_lds_i64_align_4_with_split_offset:
+; This tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
+; SI: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:0 offset1:1
+; SI: s_endpgm
+define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
+  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+  store i64 0, i64 addrspace(3)* %ptri64, align 4
+  ret void
+}

diff --git a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
index e4129c5..ff01a1e 100644
--- a/test/CodeGen/R600/unhandled-loop-condition-assertion.ll
+++ b/test/CodeGen/R600/unhandled-loop-condition-assertion.ll

@@ -5,7 +5,7 @@
 
 ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
 
-; COMMON-LABEL: @branch_true:
+; COMMON-LABEL: {{^}}branch_true:
 define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 true, label %for.end, label %for.body.lr.ph
@@ -39,9 +39,9 @@
   ret void
 }
 
-; COMMON-LABEL: @branch_false:
+; COMMON-LABEL: {{^}}branch_false:
 ; SI: .text
-; SI-NEXT: S_ENDPGM
+; SI-NEXT: s_endpgm
 define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 false, label %for.end, label %for.body.lr.ph
@@ -75,9 +75,9 @@
   ret void
 }
 
-; COMMON-LABEL: @branch_undef:
+; COMMON-LABEL: {{^}}branch_undef:
 ; SI: .text
-; SI-NEXT: S_ENDPGM
+; SI-NEXT: s_endpgm
 define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 undef, label %for.end, label %for.body.lr.ph

diff --git a/test/CodeGen/R600/unsupported-cc.ll b/test/CodeGen/R600/unsupported-cc.ll
index f986a02..8ab4faf 100644
--- a/test/CodeGen/R600/unsupported-cc.ll
+++ b/test/CodeGen/R600/unsupported-cc.ll

@@ -2,7 +2,7 @@
 
 ; These tests are for condition codes that are not supported by the hardware
 
-; CHECK-LABEL: @slt
+; CHECK-LABEL: {{^}}slt:
 ; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5(7.006492e-45)
@@ -14,7 +14,7 @@
   ret void
 }
 
-; CHECK-LABEL: @ult_i32
+; CHECK-LABEL: {{^}}ult_i32:
 ; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5(7.006492e-45)
@@ -26,7 +26,7 @@
   ret void
 }
 
-; CHECK-LABEL: @ult_float
+; CHECK-LABEL: {{^}}ult_float:
 ; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
@@ -39,7 +39,7 @@
   ret void
 }
 
-; CHECK-LABEL: @ult_float_native
+; CHECK-LABEL: {{^}}ult_float_native:
 ; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: LSHR *
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -51,7 +51,7 @@
   ret void
 }
 
-; CHECK-LABEL: @olt
+; CHECK-LABEL: {{^}}olt:
 ; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR *
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -63,7 +63,7 @@
   ret void
 }
 
-; CHECK-LABEL: @sle
+; CHECK-LABEL: {{^}}sle:
 ; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 6(8.407791e-45)
@@ -75,7 +75,7 @@
   ret void
 }
 
-; CHECK-LABEL: @ule_i32
+; CHECK-LABEL: {{^}}ule_i32:
 ; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 6(8.407791e-45)
@@ -87,7 +87,7 @@
   ret void
 }
 
-; CHECK-LABEL: @ule_float
+; CHECK-LABEL: {{^}}ule_float:
 ; CHECK: SETGT * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
@@ -100,7 +100,7 @@
   ret void
 }
 
-; CHECK-LABEL: @ule_float_native
+; CHECK-LABEL: {{^}}ule_float_native:
 ; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
 ; CHECK-NEXT: LSHR *
 ; CHECK-NEXT: 1084227584(5.000000e+00)
@@ -112,7 +112,7 @@
   ret void
 }
 
-; CHECK-LABEL: @ole
+; CHECK-LABEL: {{^}}ole:
 ; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z
 ; CHECK-NEXT: LSHR *
 ; CHECK-NEXT:1084227584(5.000000e+00)

diff --git a/test/CodeGen/R600/urecip.ll b/test/CodeGen/R600/urecip.ll
index e808e3d..4d953b5 100644
--- a/test/CodeGen/R600/urecip.ll
+++ b/test/CodeGen/R600/urecip.ll

@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_RCP_IFLAG_F32_e32
+;CHECK: v_rcp_iflag_f32_e32
 
 define void @test(i32 %p, i32 %q) {
    %i = udiv i32 %p, %q

diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index 8045145..914f5d0 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll

@@ -5,10 +5,10 @@
 ;The goal of this test is to make sure the ISel doesn't fail when it gets
 ;a v2i32/v4i32 urem
 
-;EG-CHECK: @test2
+;EG-CHECK: {{^}}test2:
 ;EG-CHECK: CF_END
-;SI-CHECK: @test2
-;SI-CHECK: S_ENDPGM
+;SI-CHECK: {{^}}test2:
+;SI-CHECK: s_endpgm
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,10 +19,10 @@
   ret void
 }
 
-;EG-CHECK: @test4
+;EG-CHECK: {{^}}test4:
 ;EG-CHECK: CF_END
-;SI-CHECK: @test4
-;SI-CHECK: S_ENDPGM
+;SI-CHECK: {{^}}test4:
+;SI-CHECK: s_endpgm
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1

diff --git a/test/CodeGen/R600/use-sgpr-multiple-times.ll b/test/CodeGen/R600/use-sgpr-multiple-times.ll
new file mode 100644
index 0000000..aa94a0e
--- /dev/null
+++ b/test/CodeGen/R600/use-sgpr-multiple-times.ll

@@ -0,0 +1,96 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare i32 @llvm.AMDGPU.imad24(i32, i32, i32) #1
+
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_binop:
+; SI: s_load_dword [[SGPR:s[0-9]+]],
+; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
+  %dbl = fadd float %a, %a
+  store float %dbl, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_three_ternary_op:
+; SI: s_load_dword [[SGPR:s[0-9]+]],
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
+; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
+; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
+; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
+  %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_imm:
+; SI: s_load_dword [[SGPR:s[0-9]+]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
+; SI: s_load_dword [[SGPR:s[0-9]+]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
+  %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
+  store float %fma, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; Don't use fma since fma c, x, y is canonicalized to fma x, c, y
+; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_imm_a_a:
+; SI: s_load_dword [[SGPR:s[0-9]+]]
+; SI: v_mad_i32_i24 [[RESULT:v[0-9]+]], 2, [[SGPR]], [[SGPR]]
+; SI: buffer_store_dword [[RESULT]]
+define void @test_sgpr_use_twice_ternary_op_imm_a_a(i32 addrspace(1)* %out, i32 %a) #0 {
+  %fma = call i32 @llvm.AMDGPU.imad24(i32 2, i32 %a, i32 %a) #1
+  store i32 %fma, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/usubo.ll b/test/CodeGen/R600/usubo.ll
index d57a2c7..abc5bd2 100644
--- a/test/CodeGen/R600/usubo.ll
+++ b/test/CodeGen/R600/usubo.ll

@@ -4,7 +4,7 @@
 declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
 
-; FUNC-LABEL: @usubo_i64_zext
+; FUNC-LABEL: {{^}}usubo_i64_zext:
 define void @usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %usub, 0
@@ -15,8 +15,8 @@
   ret void
 }
 
-; FUNC-LABEL: @s_usubo_i32
-; SI: S_SUB_I32
+; FUNC-LABEL: {{^}}s_usubo_i32:
+; SI: s_sub_i32
 define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %usub, 0
@@ -26,8 +26,8 @@
   ret void
 }
 
-; FUNC-LABEL: @v_usubo_i32
-; SI: V_SUBREV_I32_e32
+; FUNC-LABEL: {{^}}v_usubo_i32:
+; SI: v_sub_i32_e32
 define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32 addrspace(1)* %aptr, align 4
   %b = load i32 addrspace(1)* %bptr, align 4
@@ -39,9 +39,9 @@
   ret void
 }
 
-; FUNC-LABEL: @s_usubo_i64
-; SI: S_SUB_I32
-; SI: S_SUBB_U32
+; FUNC-LABEL: {{^}}s_usubo_i64:
+; SI: s_sub_u32
+; SI: s_subb_u32
 define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %usub, 0
@@ -51,9 +51,9 @@
   ret void
 }
 
-; FUNC-LABEL: @v_usubo_i64
-; SI: V_SUB_I32
-; SI: V_SUBB_U32
+; FUNC-LABEL: {{^}}v_usubo_i64:
+; SI: v_sub_i32
+; SI: v_subb_u32
 define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64 addrspace(1)* %aptr, align 4
   %b = load i64 addrspace(1)* %bptr, align 4

diff --git a/test/CodeGen/R600/v1i64-kernel-arg.ll b/test/CodeGen/R600/v1i64-kernel-arg.ll
index 2aa1221..3175512 100644
--- a/test/CodeGen/R600/v1i64-kernel-arg.ll
+++ b/test/CodeGen/R600/v1i64-kernel-arg.ll

@@ -2,14 +2,14 @@
 ; XFAIL: *
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
 
-; CHECK-LABEL: @kernel_arg_i64
+; CHECK-LABEL: {{^}}kernel_arg_i64:
 define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; i64 arg works, v1i64 arg does not.
-; CHECK-LABEL: @kernel_arg_v1i64
+; CHECK-LABEL: {{^}}kernel_arg_v1i64:
 define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
   ret void

diff --git a/test/CodeGen/R600/v_cndmask.ll b/test/CodeGen/R600/v_cndmask.ll
index 84087ee..a24dcc7 100644
--- a/test/CodeGen/R600/v_cndmask.ll
+++ b/test/CodeGen/R600/v_cndmask.ll

@@ -1,14 +1,38 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
-; SI: @v_cnd_nan
-; SI: V_CNDMASK_B32_e64 v{{[0-9]}},
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; SI-LABEL: {{^}}v_cnd_nan_nosgpr:
+; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}}
 ; SI-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
-; SI-DAG: -1
-define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) {
-entry:
-  %0 = icmp ne i32 %c, 0
-  %1 = select i1 %0, float 0xFFFFFFFFE0000000, float %f
-  store float %1, float addrspace(1)* %out
+; SI: s_endpgm
+define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
+  %idx = call i32 @llvm.r600.read.tidig.x() #1 ; index taken from the tidig.x intrinsic
+  %f.gep = getelementptr float addrspace(1)* %fptr, i32 %idx
+  %f = load float addrspace(1)* %f.gep ; load through %f.gep so the indexed address is actually used
+  %setcc = icmp ne i32 %c, 0
+  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f ; NaN constant when %c != 0, loaded value otherwise
+  store float %select, float addrspace(1)* %out
   ret void
 }
+
+
+; This requires slightly trickier SGPR operand legalization since the
+; single constant bus SGPR usage is the last operand, and it should
+; never be moved.
+
+; SI-LABEL: {{^}}v_cnd_nan:
+; SI: v_cndmask_b32_e64 v{{[0-9]}}, v{{[0-9]}}, -1, s{{\[[0-9]+:[0-9]+\]}}
+; SI-DAG: v{{[0-9]}}
+; All nan values are converted to 0xffffffff
+; SI: s_endpgm
+define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
+  %setcc = icmp ne i32 %c, 0
+  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
+  store float %select, float addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

diff --git a/test/CodeGen/R600/valu-i1.ll b/test/CodeGen/R600/valu-i1.ll
index 5d5e3ff..2c209fc 100644
--- a/test/CodeGen/R600/valu-i1.ll
+++ b/test/CodeGen/R600/valu-i1.ll

@@ -2,8 +2,8 @@
 
 ; Make sure the i1 values created by the cfg structurizer pass are
 ; moved using VALU instructions
-; SI-NOT: S_MOV_B64 s[{{[0-9]:[0-9]}}], -1
-; SI: V_MOV_B32_e32 v{{[0-9]}}, -1
+; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI: v_mov_b32_e32 v{{[0-9]}}, -1
 define void @test_if(i32 %a, i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) {
 entry:
   switch i32 %a, label %default [

diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll
index 6543f6d..0b457a8 100644
--- a/test/CodeGen/R600/vector-alloca.ll
+++ b/test/CodeGen/R600/vector-alloca.ll

@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
 
-; FUNC-LABEL: @vector_read
+; FUNC-LABEL: {{^}}vector_read:
 ; EG: MOV
 ; EG: MOV
 ; EG: MOV
@@ -24,7 +25,7 @@
   ret void
 }
 
-; FUNC-LABEL: @vector_write
+; FUNC-LABEL: {{^}}vector_write:
 ; EG: MOV
 ; EG: MOV
 ; EG: MOV
@@ -52,8 +53,8 @@
 
 ; This test should be optimize to:
 ; store i32 0, i32 addrspace(1)* %out
-; FUNC-LABEL: @bitcast_gep
-; CHECK: STORE_RAW
+; FUNC-LABEL: {{^}}bitcast_gep:
+; EG: STORE_RAW
 define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %0 = alloca [4 x i32]

diff --git a/test/CodeGen/R600/vertex-fetch-encoding.ll b/test/CodeGen/R600/vertex-fetch-encoding.ll
index 7ea7a5c..e24744e 100644
--- a/test/CodeGen/R600/vertex-fetch-encoding.ll
+++ b/test/CodeGen/R600/vertex-fetch-encoding.ll

@@ -1,9 +1,9 @@
 ; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI-CHECK %s
 ; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
 
-; NI-CHECK: @vtx_fetch32
+; NI-CHECK: {{^}}vtx_fetch32:
 ; NI-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
-; CM-CHECK: @vtx_fetch32
+; CM-CHECK: {{^}}vtx_fetch32:
 ; CM-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
 
 define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -13,7 +13,7 @@
   ret void
 }
 
-; NI-CHECK: @vtx_fetch128
+; NI-CHECK: {{^}}vtx_fetch128:
 ; NI-CHECK: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
 ; XXX: Add a case for Cayman when v4i32 stores are supported.
 

diff --git a/test/CodeGen/R600/vop-shrink.ll b/test/CodeGen/R600/vop-shrink.ll
new file mode 100644
index 0000000..e7f0288
--- /dev/null
+++ b/test/CodeGen/R600/vop-shrink.ll

@@ -0,0 +1,50 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; Test that we correctly commute a sub instruction
+; FUNC-LABEL: {{^}}sub_rev:
+; SI-NOT: v_sub_i32_e32 v{{[0-9]+}}, s
+; SI: v_subrev_i32_e32 v{{[0-9]+}}, s
+
+; ModuleID = 'vop-shrink.ll'
+
+define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
+entry:
+  %vgpr = call i32 @llvm.r600.read.tidig.x() #1
+  %tmp = icmp eq i32 %cond, 0
+  br i1 %tmp, label %if, label %else
+
+if:                                               ; preds = %entry
+  %tmp1 = getelementptr i32 addrspace(1)* %out, i32 1 ; NOTE(review): %tmp1 is never used; the store below writes to %out - confirm intent
+  %tmp2 = extractelement <4 x i32> %sgpr, i32 1
+  store i32 %tmp2, i32 addrspace(1)* %out
+  br label %endif
+
+else:                                             ; preds = %entry
+  %tmp3 = extractelement <4 x i32> %sgpr, i32 2
+  %tmp4 = sub i32 %vgpr, %tmp3 ; intrinsic result minus an element of the %sgpr argument: the sub the checks above look for
+  store i32 %tmp4, i32 addrspace(1)* %out
+  br label %endif
+
+endif:                                            ; preds = %else, %if
+  ret void
+}
+
+; Test that we fold an immediate that was illegal for a 64-bit op into the
+; 32-bit op when we shrink it.
+
+; FUNC-LABEL: {{^}}add_fold:
+; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
+define void @add_fold(float addrspace(1)* %out) {
+entry:
+  %tmp = call i32 @llvm.r600.read.tidig.x()
+  %tmp1 = uitofp i32 %tmp to float
+  %tmp2 = fadd float %tmp1, 1.024000e+03
+  store float %tmp2, float addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { readnone }

diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll
index dca7b06..e84b8f7 100644
--- a/test/CodeGen/R600/vselect.ll
+++ b/test/CodeGen/R600/vselect.ll

@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @test_select_v2i32
+;EG-CHECK: {{^}}test_select_v2i32:
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @test_select_v2i32
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: {{^}}test_select_v2i32:
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
 
 define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
 entry:
@@ -19,13 +19,13 @@
   ret void
 }
 
-;EG-CHECK: @test_select_v2f32
+;EG-CHECK: {{^}}test_select_v2f32:
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @test_select_v2f32
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: {{^}}test_select_v2f32:
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
 
 define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
 entry:
@@ -37,17 +37,17 @@
   ret void
 }
 
-;EG-CHECK: @test_select_v4i32
+;EG-CHECK: {{^}}test_select_v4i32:
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @test_select_v4i32
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
-;SI-CHECK: V_CNDMASK_B32_e64
+;SI-CHECK: {{^}}test_select_v4i32:
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
+;SI-CHECK: v_cndmask_b32_e64
 
 define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
 entry:
@@ -59,7 +59,7 @@
   ret void
 }
 
-;EG-CHECK: @test_select_v4f32
+;EG-CHECK: {{^}}test_select_v4f32:
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

diff --git a/test/CodeGen/R600/vselect64.ll b/test/CodeGen/R600/vselect64.ll
index 604695b..ef85ebe 100644
--- a/test/CodeGen/R600/vselect64.ll
+++ b/test/CodeGen/R600/vselect64.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck  %s
 ; XXX: Merge this test into vselect.ll once SI supports 64-bit select.
 
-; CHECK-LABEL: @test_select_v4i64
+; CHECK-LABEL: {{^}}test_select_v4i64:
 ; Make sure the vectors aren't being stored on the stack.  We know they are
 ; being stored on the stack if the shaders uses at leat 10 registers.
 ; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X

diff --git a/test/CodeGen/R600/vtx-fetch-branch.ll b/test/CodeGen/R600/vtx-fetch-branch.ll
index 0fc99de..bcbe34e 100644
--- a/test/CodeGen/R600/vtx-fetch-branch.ll
+++ b/test/CodeGen/R600/vtx-fetch-branch.ll

@@ -6,7 +6,7 @@
 ; after the fetch clause.
 
 
-; CHECK-LABEL: @test
+; CHECK-LABEL: {{^}}test:
 ; CHECK-NOT: ALU_POP_AFTER
 ; CHECK: TEX
 ; CHECK-NEXT: POP

diff --git a/test/CodeGen/R600/vtx-schedule.ll b/test/CodeGen/R600/vtx-schedule.ll
index ce852c5..8254c99 100644
--- a/test/CodeGen/R600/vtx-schedule.ll
+++ b/test/CodeGen/R600/vtx-schedule.ll

@@ -4,7 +4,7 @@
 ; the result of another VTX_READ instruction were being grouped in the
 ; same fetch clasue.
 
-; CHECK: @test
+; CHECK: {{^}}test:
 ; CHECK: Fetch clause
 ; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
 ; CHECK: Fetch clause

diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll
index 2cf88fe..735eabd 100644
--- a/test/CodeGen/R600/wait.ll
+++ b/test/CodeGen/R600/wait.ll

@@ -1,37 +1,45 @@
-; RUN: llc < %s -march=r600 -mcpu=SI --verify-machineinstrs | FileCheck %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s
 
-;CHECK-LABEL: @main
-;CHECK: S_WAITCNT lgkmcnt(0)
-;CHECK: S_WAITCNT vmcnt(0)
-;CHECK: S_WAITCNT expcnt(0) lgkmcnt(0)
-
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, i32 inreg, i32, i32, i32, i32) #0 {
+; CHECK-LABEL: {{^}}main:
+; CHECK: s_load_dwordx4
+; CHECK: s_load_dwordx4
+; CHECK: s_waitcnt lgkmcnt(0){{$}}
+; CHECK: s_waitcnt vmcnt(0){{$}}
+; CHECK: s_waitcnt expcnt(0) lgkmcnt(0){{$}}
+define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
 main_body:
-  %10 = getelementptr <16 x i8> addrspace(2)* %3, i32 0
-  %11 = load <16 x i8> addrspace(2)* %10, !tbaa !0
-  %12 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %11, i32 0, i32 %6)
-  %13 = extractelement <4 x float> %12, i32 0
-  %14 = extractelement <4 x float> %12, i32 1
-  %15 = extractelement <4 x float> %12, i32 2
-  %16 = extractelement <4 x float> %12, i32 3
-  %17 = getelementptr <16 x i8> addrspace(2)* %3, i32 1
-  %18 = load <16 x i8> addrspace(2)* %17, !tbaa !0
-  %19 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %18, i32 0, i32 %6)
-  %20 = extractelement <4 x float> %19, i32 0
-  %21 = extractelement <4 x float> %19, i32 1
-  %22 = extractelement <4 x float> %19, i32 2
-  %23 = extractelement <4 x float> %19, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %20, float %21, float %22, float %23)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %13, float %14, float %15, float %16)
+  %tmp = getelementptr <16 x i8> addrspace(2)* %arg3, i32 0
+  %tmp10 = load <16 x i8> addrspace(2)* %tmp, !tbaa !0
+  %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
+  %tmp12 = extractelement <4 x float> %tmp11, i32 0
+  %tmp13 = extractelement <4 x float> %tmp11, i32 1
+  call void @llvm.AMDGPU.barrier.global() #1
+  %tmp14 = extractelement <4 x float> %tmp11, i32 2
+;  %tmp15 = extractelement <4 x float> %tmp11, i32 3
+  %tmp15 = load float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
+  %tmp16 = getelementptr <16 x i8> addrspace(2)* %arg3, i32 1
+  %tmp17 = load <16 x i8> addrspace(2)* %tmp16, !tbaa !0
+  %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6)
+  %tmp19 = extractelement <4 x float> %tmp18, i32 0
+  %tmp20 = extractelement <4 x float> %tmp18, i32 1
+  %tmp21 = extractelement <4 x float> %tmp18, i32 2
+  %tmp22 = extractelement <4 x float> %tmp18, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15)
   ret void
 }
 
+; Function Attrs: noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.global() #1
+
 ; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
 attributes #0 = { "ShaderType"="1" }
-attributes #1 = { nounwind readnone }
+attributes #1 = { noduplicate nounwind }
+attributes #2 = { nounwind readnone }
 
-!0 = metadata !{metadata !"const", null, i32 1}
+!0 = metadata !{metadata !1, metadata !1, i64 0, i32 1}
+!1 = metadata !{metadata !"const", null}

diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index 90079b0..47f65f5 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll

@@ -1,13 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
-; R600-CHECK: @ngroups_x
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[0].X
-; SI-CHECK: @ngroups_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+
+; FUNC-LABEL: {{^}}ngroups_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].X
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
@@ -15,13 +16,13 @@
   ret void
 }
 
-; R600-CHECK: @ngroups_y
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[0].Y
-; SI-CHECK: @ngroups_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x1
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}ngroups_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].Y
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
@@ -29,13 +30,13 @@
   ret void
 }
 
-; R600-CHECK: @ngroups_z
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[0].Z
-; SI-CHECK: @ngroups_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x2
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}ngroups_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].Z
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
@@ -43,13 +44,13 @@
   ret void
 }
 
-; R600-CHECK: @global_size_x
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[0].W
-; SI-CHECK: @global_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x3
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}global_size_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[0].W
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @global_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
@@ -57,13 +58,13 @@
   ret void
 }
 
-; R600-CHECK: @global_size_y
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[1].X
-; SI-CHECK: @global_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x4
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}global_size_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].X
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @global_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
@@ -71,13 +72,13 @@
   ret void
 }
 
-; R600-CHECK: @global_size_z
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[1].Y
-; SI-CHECK: @global_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x5
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}global_size_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].Y
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @global_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
@@ -85,13 +86,13 @@
   ret void
 }
 
-; R600-CHECK: @local_size_x
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[1].Z
-; SI-CHECK: @local_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x6
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}local_size_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].Z
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @local_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
@@ -99,13 +100,13 @@
   ret void
 }
 
-; R600-CHECK: @local_size_y
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[1].W
-; SI-CHECK: @local_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x7
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}local_size_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].W
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @local_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
@@ -113,13 +114,13 @@
   ret void
 }
 
-; R600-CHECK: @local_size_z
-; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV [[VAL]], KC0[2].X
-; SI-CHECK: @local_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0x8
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}local_size_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].X
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
 define void @local_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
@@ -127,13 +128,27 @@
   ret void
 }
 
-; The tgid values are stored in ss offset by the number of user ss.
-; Currently we always use exactly 2 user ss for the pointer to the
+; FUNC-LABEL: {{^}}get_work_dim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].Z
+
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; SI: buffer_store_dword [[VVAL]]
+define void @get_work_dim (i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; The tgid values are stored in sgprs offset by the number of user sgprs.
+; Currently we always use exactly 2 user sgprs for the pointer to the
 ; kernel arguments, but this may change in the future.
 
-; SI-CHECK: @tgid_x
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}tgid_x:
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
+; SI: buffer_store_dword [[VVAL]]
 define void @tgid_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
@@ -141,9 +156,9 @@
   ret void
 }
 
-; SI-CHECK: @tgid_y
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}tgid_y:
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
+; SI: buffer_store_dword [[VVAL]]
 define void @tgid_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
@@ -151,9 +166,9 @@
   ret void
 }
 
-; SI-CHECK: @tgid_z
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
-; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
+; FUNC-LABEL: {{^}}tgid_z:
+; SI: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
+; SI: buffer_store_dword [[VVAL]]
 define void @tgid_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
@@ -161,8 +176,8 @@
   ret void
 }
 
-; SI-CHECK: @tidig_x
-; SI-CHECK: BUFFER_STORE_DWORD v0
+; FUNC-LABEL: {{^}}tidig_x:
+; SI: buffer_store_dword v0
 define void @tidig_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
@@ -170,8 +185,8 @@
   ret void
 }
 
-; SI-CHECK: @tidig_y
-; SI-CHECK: BUFFER_STORE_DWORD v1
+; FUNC-LABEL: {{^}}tidig_y:
+; SI: buffer_store_dword v1
 define void @tidig_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
@@ -179,8 +194,8 @@
   ret void
 }
 
-; SI-CHECK: @tidig_z
-; SI-CHECK: BUFFER_STORE_DWORD v2
+; FUNC-LABEL: {{^}}tidig_z:
+; SI: buffer_store_dword v2
 define void @tidig_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
@@ -208,4 +223,6 @@
 declare i32 @llvm.r600.read.tidig.y() #0
 declare i32 @llvm.r600.read.tidig.z() #0
 
+declare i32 @llvm.AMDGPU.read.workdim() #0
+
 attributes #0 = { readnone }

diff --git a/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
index b1cbe3f..d652d2d 100644
--- a/test/CodeGen/R600/wrong-transalu-pos-fix.ll
+++ b/test/CodeGen/R600/wrong-transalu-pos-fix.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ; We want all MULLO_INT inst to be last in their instruction group
-;CHECK: @fill3d
+;CHECK: {{^}}fill3d:
 ;CHECK-NOT: MULLO_INT T[0-9]+
 
 ; ModuleID = 'radeon'

diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index ab618cf..fa54e38 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll

@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @xor_v2i32
+;EG-CHECK: {{^}}xor_v2i32:
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @xor_v2i32
-;SI-CHECK: V_XOR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}xor_v2i32:
+;SI-CHECK: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 
 define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
@@ -18,17 +18,17 @@
   ret void
 }
 
-;EG-CHECK: @xor_v4i32
+;EG-CHECK: {{^}}xor_v4i32:
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @xor_v4i32
-;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: {{^}}xor_v4i32:
+;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
   %a = load <4 x i32> addrspace(1) * %in0
@@ -38,11 +38,11 @@
   ret void
 }
 
-;EG-CHECK: @xor_i1
+;EG-CHECK: {{^}}xor_i1:
 ;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
 
-;SI-CHECK: @xor_i1
-;SI-CHECK: S_XOR_B64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+;SI-CHECK: {{^}}xor_i1:
+;SI-CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
   %a = load float addrspace(1) * %in0
@@ -55,8 +55,8 @@
   ret void
 }
 
-; SI-CHECK-LABEL: @vector_xor_i32
-; SI-CHECK: V_XOR_B32_e32
+; SI-CHECK-LABEL: {{^}}vector_xor_i32:
+; SI-CHECK: v_xor_b32_e32
 define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32 addrspace(1)* %in0
   %b = load i32 addrspace(1)* %in1
@@ -65,24 +65,24 @@
   ret void
 }
 
-; SI-CHECK-LABEL: @scalar_xor_i32
-; SI-CHECK: S_XOR_B32
+; SI-CHECK-LABEL: {{^}}scalar_xor_i32:
+; SI-CHECK: s_xor_b32
 define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = xor i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @scalar_not_i32
-; SI-CHECK: S_NOT_B32
+; SI-CHECK-LABEL: {{^}}scalar_not_i32:
+; SI-CHECK: s_not_b32
 define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
   %result = xor i32 %a, -1
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @vector_not_i32
-; SI-CHECK: V_NOT_B32
+; SI-CHECK-LABEL: {{^}}vector_not_i32:
+; SI-CHECK: v_not_b32
 define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32 addrspace(1)* %in0
   %b = load i32 addrspace(1)* %in1
@@ -91,10 +91,10 @@
   ret void
 }
 
-; SI-CHECK-LABEL: @vector_xor_i64
-; SI-CHECK: V_XOR_B32_e32
-; SI-CHECK: V_XOR_B32_e32
-; SI-CHECK: S_ENDPGM
+; SI-CHECK-LABEL: {{^}}vector_xor_i64:
+; SI-CHECK: v_xor_b32_e32
+; SI-CHECK: v_xor_b32_e32
+; SI-CHECK: s_endpgm
 define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64 addrspace(1)* %in0
   %b = load i64 addrspace(1)* %in1
@@ -103,26 +103,26 @@
   ret void
 }
 
-; SI-CHECK-LABEL: @scalar_xor_i64
-; SI-CHECK: S_XOR_B64
-; SI-CHECK: S_ENDPGM
+; SI-CHECK-LABEL: {{^}}scalar_xor_i64:
+; SI-CHECK: s_xor_b64
+; SI-CHECK: s_endpgm
 define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = xor i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @scalar_not_i64
-; SI-CHECK: S_NOT_B64
+; SI-CHECK-LABEL: {{^}}scalar_not_i64:
+; SI-CHECK: s_not_b64
 define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = xor i64 %a, -1
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-; SI-CHECK-LABEL: @vector_not_i64
-; SI-CHECK: V_NOT_B32
-; SI-CHECK: V_NOT_B32
+; SI-CHECK-LABEL: {{^}}vector_not_i64:
+; SI-CHECK: v_not_b32
+; SI-CHECK: v_not_b32
 define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64 addrspace(1)* %in0
   %b = load i64 addrspace(1)* %in1
@@ -135,9 +135,8 @@
 ; Note that in the future the backend may be smart enough to
 ; use an SALU instruction for this.
 
-; SI-CHECK-LABEL: @xor_cf
-; SI-CHECK: V_XOR
-; SI-CHECK: V_XOR
+; SI-CHECK-LABEL: {{^}}xor_cf:
+; SI-CHECK: s_xor_b64
 define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
 entry:
   %0 = icmp eq i64 %a, 0

diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index 8585d4a..0fe1f15 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll

@@ -1,14 +1,14 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK: @test
+; R600-CHECK: {{^}}test:
 ; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
 ; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
 
-; SI-CHECK: @test
-; SI-CHECK: S_MOV_B32 [[ZERO:s[0-9]]], 0
-; SI-CHECK: V_MOV_B32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
-; SI-CHECK: BUFFER_STORE_DWORDX2 v[0:[[V_ZERO]]{{\]}}
+; SI-CHECK: {{^}}test:
+; SI-CHECK: s_mov_b32 [[ZERO:s[0-9]]], 0{{$}}
+; SI-CHECK: v_mov_b32_e32 v[[V_ZERO:[0-9]]], [[ZERO]]
+; SI-CHECK: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
 define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = mul i32 %a, %b
@@ -18,8 +18,8 @@
   ret void
 }
 
-; SI-CHECK-LABEL: @testi1toi32
-; SI-CHECK: V_CNDMASK_B32
+; SI-CHECK-LABEL: {{^}}testi1toi32:
+; SI-CHECK: v_cndmask_b32
 define void @testi1toi32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = icmp eq i32 %a, %b
@@ -28,10 +28,10 @@
   ret void
 }
 
-; SI-CHECK-LABEL: @zext_i1_to_i64
-; SI-CHECK: V_CMP_EQ_I32
-; SI-CHECK: V_CNDMASK_B32
-; SI-CHECK: S_MOV_B32 s{{[0-9]+}}, 0
+; SI-CHECK-LABEL: {{^}}zext_i1_to_i64:
+; SI-CHECK: v_cmp_eq_i32
+; SI-CHECK: v_cndmask_b32
+; SI-CHECK: s_mov_b32 s{{[0-9]+}}, 0
 define void @zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64

diff --git a/test/CodeGen/SPARC/empty-functions.ll b/test/CodeGen/SPARC/empty-functions.ll
new file mode 100644
index 0000000..38d2889
--- /dev/null
+++ b/test/CodeGen/SPARC/empty-functions.ll

@@ -0,0 +1,32 @@
+; RUN: llc < %s -mtriple=sparc-linux-gnu | FileCheck -check-prefix=LINUX-NO-FP %s
+; RUN: llc < %s -mtriple=sparc-linux-gnu -disable-fp-elim | FileCheck -check-prefix=LINUX-FP %s
+
+define void @func() {
+entry:
+  unreachable
+}
+
+; An empty function is perfectly fine on ELF.
+; LINUX-NO-FP: func:
+; LINUX-NO-FP-NEXT: .cfi_startproc
+; LINUX-NO-FP-NEXT: {{^}}!
+; LINUX-NO-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-NO-FP-NEXT: .size   func, .L{{.*}}-func
+; LINUX-NO-FP-NEXT: .cfi_endproc
+
+; A cfi directive can point to the end of a function. It (and in fact the
+; entire body) could be optimized out because of the unreachable, but we
+; don't do it right now.
+; LINUX-FP: func:
+; LINUX-FP-NEXT: .cfi_startproc
+; LINUX-FP-NEXT: {{^}}!
+; LINUX-FP-NEXT: save %sp, -96, %sp
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_def_cfa_register %fp
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_window_save
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_register 15, 31
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .size   func, .L{{.*}}-func
+; LINUX-FP-NEXT: .cfi_endproc

diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index ffc9584..d31a84b 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll

@@ -25,7 +25,7 @@
   %storemerge = phi double [ -1.000000e+00, %4 ], [ 1.000000e+00, %3 ], [ 1.000000e+00, %3 ] ; <double> [#uses=1]
   %v_6 = icmp slt i32 %1, 2                         ; <i1> [#uses=1]
   %storemerge1 = select i1 %v_6, double 1.000000e+00, double -1.000000e+00 ; <double> [#uses=3]
-  call void @llvm.dbg.value(metadata !{double %storemerge}, i64 0, metadata !91), !dbg !0
+  call void @llvm.dbg.value(metadata !{double %storemerge}, i64 0, metadata !91, metadata !{metadata !"0x102"}), !dbg !0
   %v_7 = icmp eq i32 %2, 1, !dbg !92                ; <i1> [#uses=1]
   %storemerge2 = select i1 %v_7, double 1.000000e+00, double -1.000000e+00 ; <double> [#uses=3]
   %v_8 = getelementptr inbounds %0* %0, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
@@ -40,116 +40,116 @@
   ret void, !dbg !98
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare double @sqrt(double) nounwind readonly
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!104}
 !0 = metadata !{i32 46, i32 0, metadata !1, null}
-!1 = metadata !{i32 524299, metadata !101, metadata !2, i32 44, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{i32 524299, metadata !101, metadata !3, i32 44, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{i32 524334, metadata !101, null, metadata !"getClosestDiagonal3", metadata !"getClosestDiagonal3", metadata !"_Z19getClosestDiagonal3ii", i32 44, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!4 = metadata !{i32 524329, metadata !101} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 524305, metadata !101, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", i1 true, metadata !"", i32 0, metadata !102, metadata !102, metadata !103, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!1 = metadata !{metadata !"0xb\0044\000\000", metadata !101, metadata !2} ; [ DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0xb\0044\000\000", metadata !101, metadata !3} ; [ DW_TAG_lexical_block ]
+!3 = metadata !{metadata !"0x2e\00getClosestDiagonal3\00getClosestDiagonal3\00_Z19getClosestDiagonal3ii\0044\000\001\000\006\000\000\000", metadata !101, null, metadata !6, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = metadata !{metadata !"0x29", metadata !101} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)\001\00\000\00\000", metadata !101, metadata !102, metadata !102, metadata !103, null, null} ; [ DW_TAG_compile_unit ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !22, metadata !22}
-!8 = metadata !{i32 524307, metadata !99, null, metadata !"ggVector3", i32 66, i64 192, i64 32, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [ggVector3] [line 66, size 192, align 32, offset 0] [def] [from ]
-!9 = metadata !{i32 524329, metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src", metadata !5} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x13\00ggVector3\0066\00192\0032\000\000\000", metadata !99, null, null, metadata !10, null, null, null} ; [ DW_TAG_structure_type ] [ggVector3] [line 66, size 192, align 32, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0x29", metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src", metadata !5} ; [ DW_TAG_file_type ]
 !99 = metadata !{metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
 !10 = metadata !{metadata !11, metadata !16, metadata !23, metadata !26, metadata !29, metadata !30, metadata !35, metadata !36, metadata !37, metadata !41, metadata !42, metadata !43, metadata !46, metadata !47, metadata !48, metadata !52, metadata !53, metadata !54, metadata !57, metadata !60, metadata !63, metadata !66, metadata !70, metadata !71, metadata !74, metadata !75, metadata !76, metadata !77, metadata !78, metadata !81, metadata !82, metadata !83, metadata !84, metadata !85, metadata !88, metadata !89, metadata !90}
-!11 = metadata !{i32 524301, metadata !99, metadata !8, metadata !"e", i32 160, i64 192, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ]
-!12 = metadata !{i32 524289, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !13, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 192, align 32, offset 0] [from double]
-!13 = metadata !{i32 524324, metadata !101, metadata !4, metadata !"double", i32 0, i64 64, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0xd\00e\00160\00192\0032\000\000", metadata !99, metadata !8, metadata !12} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0x1\00\000\00192\0032\000\000", metadata !101, metadata !4, metadata !13, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 192, align 32, offset 0] [from double]
+!13 = metadata !{metadata !"0x24\00double\000\0064\0032\000\000\004", metadata !101, metadata !4} ; [ DW_TAG_base_type ]
 !14 = metadata !{metadata !15}
-!15 = metadata !{i32 524321, i64 0, i64 3}        ; [ DW_TAG_subrange_type ]
-!16 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 72, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x21\000\003"}        ; [ DW_TAG_subrange_type ]
+!16 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0072\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !17, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !19, metadata !20}
-!19 = metadata !{i32 524303, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 64, metadata !8} ; [ DW_TAG_pointer_type ]
-!20 = metadata !{i32 524310, metadata !100, null, metadata !"ggBoolean", i32 478, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_typedef ]
-!21 = metadata !{i32 524329, metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm", metadata !5} ; [ DW_TAG_file_type ]
+!19 = metadata !{metadata !"0xf\00\000\0032\0032\000\0064", metadata !101, metadata !4, metadata !8} ; [ DW_TAG_pointer_type ]
+!20 = metadata !{metadata !"0x16\00ggBoolean\00478\000\000\000\000", metadata !100, null, metadata !22} ; [ DW_TAG_typedef ]
+!21 = metadata !{metadata !"0x29", metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm", metadata !5} ; [ DW_TAG_file_type ]
 !100 = metadata !{metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm"}
-!22 = metadata !{i32 524324, metadata !101, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!23 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 73, metadata !24, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!24 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !101, metadata !4} ; [ DW_TAG_base_type ]
+!23 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0073\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !24, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{null, metadata !19}
-!26 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 74, metadata !27, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!27 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!26 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0074\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !27, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !28 = metadata !{null, metadata !19, metadata !13, metadata !13, metadata !13}
-!29 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"Set", metadata !"Set", metadata !"_ZN9ggVector33SetEddd", i32 81, metadata !27, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!30 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"x", metadata !"x", metadata !"_ZNK9ggVector31xEv", i32 82, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!31 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !32, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!29 = metadata !{metadata !"0x2e\00Set\00Set\00_ZN9ggVector33SetEddd\0081\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !27, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!30 = metadata !{metadata !"0x2e\00x\00x\00_ZNK9ggVector31xEv\0082\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!31 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !32, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !32 = metadata !{metadata !13, metadata !33}
-!33 = metadata !{i32 524303, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 64, metadata !34} ; [ DW_TAG_pointer_type ]
-!34 = metadata !{i32 524326, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ]
-!35 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"y", metadata !"y", metadata !"_ZNK9ggVector31yEv", i32 83, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!36 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"z", metadata !"z", metadata !"_ZNK9ggVector31zEv", i32 84, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!37 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"x", metadata !"x", metadata !"_ZN9ggVector31xEv", i32 85, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!38 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !39, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = metadata !{metadata !"0xf\00\000\0032\0032\000\0064", metadata !101, metadata !4, metadata !34} ; [ DW_TAG_pointer_type ]
+!34 = metadata !{metadata !"0x26\00\000\00192\0032\000\000", metadata !101, metadata !4, metadata !8} ; [ DW_TAG_const_type ]
+!35 = metadata !{metadata !"0x2e\00y\00y\00_ZNK9ggVector31yEv\0083\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!36 = metadata !{metadata !"0x2e\00z\00z\00_ZNK9ggVector31zEv\0084\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!37 = metadata !{metadata !"0x2e\00x\00x\00_ZN9ggVector31xEv\0085\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!38 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !39, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !39 = metadata !{metadata !40, metadata !19}
-!40 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"double", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_reference_type ]
-!41 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"y", metadata !"y", metadata !"_ZN9ggVector31yEv", i32 86, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!42 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"z", metadata !"z", metadata !"_ZN9ggVector31zEv", i32 87, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!43 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetX", metadata !"SetX", metadata !"_ZN9ggVector34SetXEd", i32 88, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!44 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!40 = metadata !{metadata !"0x10\00double\000\0032\0032\000\000", metadata !101, metadata !4, metadata !13} ; [ DW_TAG_reference_type ]
+!41 = metadata !{metadata !"0x2e\00y\00y\00_ZN9ggVector31yEv\0086\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!42 = metadata !{metadata !"0x2e\00z\00z\00_ZN9ggVector31zEv\0087\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!43 = metadata !{metadata !"0x2e\00SetX\00SetX\00_ZN9ggVector34SetXEd\0088\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!44 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !45, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !45 = metadata !{null, metadata !19, metadata !13}
-!46 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetY", metadata !"SetY", metadata !"_ZN9ggVector34SetYEd", i32 89, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!47 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetZ", metadata !"SetZ", metadata !"_ZN9ggVector34SetZEd", i32 90, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!48 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 92, metadata !49, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!49 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !50, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!46 = metadata !{metadata !"0x2e\00SetY\00SetY\00_ZN9ggVector34SetYEd\0089\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!47 = metadata !{metadata !"0x2e\00SetZ\00SetZ\00_ZN9ggVector34SetZEd\0090\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !44, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!48 = metadata !{metadata !"0x2e\00ggVector3\00ggVector3\00\0092\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !49, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!49 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !50, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !50 = metadata !{null, metadata !19, metadata !51}
-!51 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !34} ; [ DW_TAG_reference_type ]
-!52 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"tolerance", metadata !"tolerance", metadata !"_ZNK9ggVector39toleranceEv", i32 100, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!53 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"tolerance", metadata !"tolerance", metadata !"_ZN9ggVector39toleranceEv", i32 101, metadata !38, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!54 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator+", metadata !"operator+", metadata !"_ZNK9ggVector3psEv", i32 107, metadata !55, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!55 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !56, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!51 = metadata !{metadata !"0x10\00\000\0032\0032\000\000", metadata !101, metadata !4, metadata !34} ; [ DW_TAG_reference_type ]
+!52 = metadata !{metadata !"0x2e\00tolerance\00tolerance\00_ZNK9ggVector39toleranceEv\00100\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!53 = metadata !{metadata !"0x2e\00tolerance\00tolerance\00_ZN9ggVector39toleranceEv\00101\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !38, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!54 = metadata !{metadata !"0x2e\00operator+\00operator+\00_ZNK9ggVector3psEv\00107\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !55, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!55 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !56, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !56 = metadata !{metadata !51, metadata !33}
-!57 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator-", metadata !"operator-", metadata !"_ZNK9ggVector3ngEv", i32 108, metadata !58, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!58 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !59, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!57 = metadata !{metadata !"0x2e\00operator-\00operator-\00_ZNK9ggVector3ngEv\00108\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !58, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!58 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !59, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !59 = metadata !{metadata !8, metadata !33}
-!60 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator[]", metadata !"operator[]", metadata !"_ZNK9ggVector3ixEi", i32 290, metadata !61, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!61 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !62, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!60 = metadata !{metadata !"0x2e\00operator[]\00operator[]\00_ZNK9ggVector3ixEi\00290\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !61, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!61 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !62, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !62 = metadata !{metadata !13, metadata !33, metadata !22}
-!63 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator[]", metadata !"operator[]", metadata !"_ZN9ggVector3ixEi", i32 278, metadata !64, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!64 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!63 = metadata !{metadata !"0x2e\00operator[]\00operator[]\00_ZN9ggVector3ixEi\00278\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !64, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!64 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !65, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !65 = metadata !{metadata !40, metadata !19, metadata !22}
-!66 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator+=", metadata !"operator+=", metadata !"_ZN9ggVector3pLERKS_", i32 303, metadata !67, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!67 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !68, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!66 = metadata !{metadata !"0x2e\00operator+=\00operator+=\00_ZN9ggVector3pLERKS_\00303\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !67, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!67 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !68, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !68 = metadata !{metadata !69, metadata !19, metadata !51}
-!69 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"ggVector3", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_reference_type ]
-!70 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator-=", metadata !"operator-=", metadata !"_ZN9ggVector3mIERKS_", i32 310, metadata !67, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!71 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator*=", metadata !"operator*=", metadata !"_ZN9ggVector3mLEd", i32 317, metadata !72, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!72 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !73, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!69 = metadata !{metadata !"0x10\00ggVector3\000\0032\0032\000\000", metadata !101, metadata !4, metadata !8} ; [ DW_TAG_reference_type ]
+!70 = metadata !{metadata !"0x2e\00operator-=\00operator-=\00_ZN9ggVector3mIERKS_\00310\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !67, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!71 = metadata !{metadata !"0x2e\00operator*=\00operator*=\00_ZN9ggVector3mLEd\00317\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !72, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!72 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !73, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !73 = metadata !{metadata !69, metadata !19, metadata !13}
-!74 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator/=", metadata !"operator/=", metadata !"_ZN9ggVector3dVEd", i32 324, metadata !72, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!75 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"length", metadata !"length", metadata !"_ZNK9ggVector36lengthEv", i32 121, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!76 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"squaredLength", metadata !"squaredLength", metadata !"_ZNK9ggVector313squaredLengthEv", i32 122, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!77 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"MakeUnitVector", metadata !"MakeUnitVector", metadata !"_ZN9ggVector314MakeUnitVectorEv", i32 217, metadata !24, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!78 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"Perturb", metadata !"Perturb", metadata !"_ZNK9ggVector37PerturbEdd", i32 126, metadata !79, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!79 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !80, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!74 = metadata !{metadata !"0x2e\00operator/=\00operator/=\00_ZN9ggVector3dVEd\00324\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !72, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!75 = metadata !{metadata !"0x2e\00length\00length\00_ZNK9ggVector36lengthEv\00121\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!76 = metadata !{metadata !"0x2e\00squaredLength\00squaredLength\00_ZNK9ggVector313squaredLengthEv\00122\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!77 = metadata !{metadata !"0x2e\00MakeUnitVector\00MakeUnitVector\00_ZN9ggVector314MakeUnitVectorEv\00217\000\001\000\006\000\000\000", metadata !9, metadata !8, metadata !24, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!78 = metadata !{metadata !"0x2e\00Perturb\00Perturb\00_ZNK9ggVector37PerturbEdd\00126\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !79, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!79 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !80, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !80 = metadata !{metadata !8, metadata !33, metadata !13, metadata !13}
-!81 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"maxComponent", metadata !"maxComponent", metadata !"_ZNK9ggVector312maxComponentEv", i32 128, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!82 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"minComponent", metadata !"minComponent", metadata !"_ZNK9ggVector312minComponentEv", i32 129, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!83 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"maxAbsComponent", metadata !"maxAbsComponent", metadata !"_ZNK9ggVector315maxAbsComponentEv", i32 131, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!84 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"minAbsComponent", metadata !"minAbsComponent", metadata !"_ZNK9ggVector315minAbsComponentEv", i32 132, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!85 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMinComponent", metadata !"indexOfMinComponent", metadata !"_ZNK9ggVector319indexOfMinComponentEv", i32 133, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!86 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !87, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!81 = metadata !{metadata !"0x2e\00maxComponent\00maxComponent\00_ZNK9ggVector312maxComponentEv\00128\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!82 = metadata !{metadata !"0x2e\00minComponent\00minComponent\00_ZNK9ggVector312minComponentEv\00129\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!83 = metadata !{metadata !"0x2e\00maxAbsComponent\00maxAbsComponent\00_ZNK9ggVector315maxAbsComponentEv\00131\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!84 = metadata !{metadata !"0x2e\00minAbsComponent\00minAbsComponent\00_ZNK9ggVector315minAbsComponentEv\00132\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !31, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!85 = metadata !{metadata !"0x2e\00indexOfMinComponent\00indexOfMinComponent\00_ZNK9ggVector319indexOfMinComponentEv\00133\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!86 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !101, metadata !4, null, metadata !87, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !87 = metadata !{metadata !22, metadata !33}
-!88 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMinAbsComponent", metadata !"indexOfMinAbsComponent", metadata !"_ZNK9ggVector322indexOfMinAbsComponentEv", i32 137, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!89 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMaxComponent", metadata !"indexOfMaxComponent", metadata !"_ZNK9ggVector319indexOfMaxComponentEv", i32 146, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!90 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMaxAbsComponent", metadata !"indexOfMaxAbsComponent", metadata !"_ZNK9ggVector322indexOfMaxAbsComponentEv", i32 150, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!91 = metadata !{i32 524544, metadata !1, metadata !"vx", metadata !4, i32 46, metadata !13} ; [ DW_TAG_auto_variable ]
+!88 = metadata !{metadata !"0x2e\00indexOfMinAbsComponent\00indexOfMinAbsComponent\00_ZNK9ggVector322indexOfMinAbsComponentEv\00137\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!89 = metadata !{metadata !"0x2e\00indexOfMaxComponent\00indexOfMaxComponent\00_ZNK9ggVector319indexOfMaxComponentEv\00146\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!90 = metadata !{metadata !"0x2e\00indexOfMaxAbsComponent\00indexOfMaxAbsComponent\00_ZNK9ggVector322indexOfMaxAbsComponentEv\00150\000\000\000\006\000\000\000", metadata !9, metadata !8, metadata !86, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!91 = metadata !{metadata !"0x100\00vx\0046\000", metadata !1, metadata !4, metadata !13} ; [ DW_TAG_auto_variable ]
 !92 = metadata !{i32 48, i32 0, metadata !1, null}
 !93 = metadata !{i32 218, i32 0, metadata !94, metadata !96}
-!94 = metadata !{i32 524299, metadata !101, metadata !95, i32 217, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!95 = metadata !{i32 524299, metadata !101, metadata !77, i32 217, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!94 = metadata !{metadata !"0xb\00217\000\000", metadata !101, metadata !95} ; [ DW_TAG_lexical_block ]
+!95 = metadata !{metadata !"0xb\00217\000\000", metadata !101, metadata !77} ; [ DW_TAG_lexical_block ]
 !96 = metadata !{i32 51, i32 0, metadata !1, null}
 !97 = metadata !{i32 227, i32 0, metadata !94, metadata !96}
 !98 = metadata !{i32 52, i32 0, metadata !1, null}
 !101 = metadata !{metadata !"ggEdgeDiscrepancy.cc", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
 !102 = metadata !{i32 0}
 !103 = metadata !{metadata !3, metadata !77}
-!104 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!104 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll b/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll
index b39978b..369ac96 100644
--- a/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll
+++ b/test/CodeGen/Thumb/2012-04-26-M0ISelBug.ll

@@ -5,7 +5,7 @@
 define i32 @t(i32 %a) nounwind {
 ; CHECK-LABEL: t:
 ; CHECK: asrs [[REG1:(r[0-9]+)]], [[REG2:(r[0-9]+)]], #31
-; CHECK: eors [[REG1]], [[REG2]]
+; CHECK: eors [[REG2]], [[REG1]]
   %tmp0 = ashr i32 %a, 31
   %tmp1 = xor i32 %tmp0, %a
   ret i32 %tmp1

diff --git a/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
index ae66369..cfa1159 100644
--- a/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll
+++ b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll

@@ -1,12 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
-; XFAIL: *
+; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - | FileCheck %s
 
 define void @foo(i32* %A) #0 {
 entry:
 ; CHECK-LABEL: foo:
 ; CHECK: push {r7, lr}
-; CHECK: ldm [[REG0:r[0-9]]]!,
-; CHECK-NEXT: subs [[REG0]]
+; CHECK: ldm
+; CHECK-NEXT: subs
 ; CHECK-NEXT: bl
   %0 = load i32* %A, align 4
   %arrayidx1 = getelementptr inbounds i32* %A, i32 1

diff --git a/test/CodeGen/Thumb/copy_thumb.ll b/test/CodeGen/Thumb/copy_thumb.ll
new file mode 100644
index 0000000..528f54b
--- /dev/null
+++ b/test/CodeGen/Thumb/copy_thumb.ll

@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=armv4-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=armv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=armv5-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=armv6-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=armv7-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=thumbv6-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; RUN: llc -mtriple=thumbv7-none--eabi < %s | FileCheck %s --check-prefix=CHECK-LOLOMOV
+; CHECK-LOLOMOV-LABEL:  foo
+; CHECK-LOLOMOV:        mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
+; CHECK-LOLOMOV-NEXT:   mov [[SRC1]], [[SRC2:r[01]]]
+; CHECK-LOLOMOV-NEXT:   mov [[SRC2]], [[TMP]]
+; CHECK-LOLOMOV-LABEL:  bar
+; CHECK-LOLOMOV-LABEL:  fnend
+; 
+; 'MOV lo, lo' in Thumb mode produces undefined results on pre-v6 hardware
+; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-NOLOLOMOV
+; RUN: llc -mtriple=thumbv5-none--eabi < %s | FileCheck %s --check-prefix=CHECK-NOLOLOMOV
+; CHECK-NOLOLOMOV-LABEL: foo
+; CHECK-NOLOLOMOV-NOT:   mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
+; CHECK-NOLOLOMOV:       push  {[[SRC1:r[01]]]}
+; CHECK-NOLOLOMOV-NEXT:  pop {[[TMP:r[0-7]]]}
+; CHECK-NOLOLOMOV-NOT:   mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
+; CHECK-NOLOLOMOV:       push  {[[SRC2:r[01]]]}
+; CHECK-NOLOLOMOV-NEXT:  pop {[[SRC1]]}
+; CHECK-NOLOLOMOV-NOT:   mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
+; CHECK-NOLOLOMOV:       push  {[[TMP]]}
+; CHECK-NOLOLOMOV-NEXT:  pop {[[SRC2]]}
+; CHECK-NOLOLOMOV-LABEL: bar
+; CHECK-NOLOLOMOV-LABEL: fnend
+
+declare void @bar(i32, i32)
+
+define void @foo(i32 %a, i32 %b) {
+entry:
+  call void @bar(i32 %b, i32 %a);
+  ret void
+}
+

diff --git a/test/CodeGen/Thumb/dyn-stackalloc.ll b/test/CodeGen/Thumb/dyn-stackalloc.ll
index 6c6de55..309d802 100644
--- a/test/CodeGen/Thumb/dyn-stackalloc.ll
+++ b/test/CodeGen/Thumb/dyn-stackalloc.ll

@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra | FileCheck %s
-; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_GREEDY
+; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_BASIC
 
 	%struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* }
 	%struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
@@ -45,7 +45,8 @@
 ; CHECK: sub sp, #
 ; CHECK: mov r[[R0:[0-9]+]], sp
 ; CHECK: str r{{[0-9+]}}, [r[[R0]]
-; CHECK: str r{{[0-9+]}}, [r[[R0]]
+; RA_GREEDY: str r{{[0-9]+}}, [r[[R0]]
+; RA_BASIC: stm r[[R0]]!
 ; CHECK-NOT: ldr r0, [sp
 ; CHECK: mov r[[R1:[0-9]+]], sp
 ; CHECK: subs r[[R2:[0-9]+]], r[[R1]], r{{[0-9]+}}

diff --git a/test/CodeGen/Thumb/inlineasm-thumb.ll b/test/CodeGen/Thumb/inlineasm-thumb.ll
index 2547ce8..cfaf2ba 100644
--- a/test/CodeGen/Thumb/inlineasm-thumb.ll
+++ b/test/CodeGen/Thumb/inlineasm-thumb.ll

@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -no-integrated-as %s -o - | FileCheck %s
 
 define i32 @t1(i32 %x, i32 %y) nounwind {
 entry:
@@ -6,3 +6,14 @@
   %0 = tail call i32 asm "mov $0, $1", "=l,h"(i32 %y) nounwind
   ret i32 %0
 }
+
+; CHECK-LABEL: constraint_r:
+; CHECK: foo2 r{{[0-7]+}}, r{{[0-7]+}}
+
+define i32 @constraint_r() {
+entry:
+  %0 = tail call i32 asm sideeffect "movs $0, #1", "=r"()
+  tail call void asm sideeffect "foo1", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7}"()
+  %1 = tail call i32 asm sideeffect "foo2 $0, $1", "=r,r"(i32 %0)
+  ret i32 %1
+}

diff --git a/test/CodeGen/Thumb/large-stack.ll b/test/CodeGen/Thumb/large-stack.ll
index fb6daa4..269bdd9 100644
--- a/test/CodeGen/Thumb/large-stack.ll
+++ b/test/CodeGen/Thumb/large-stack.ll

@@ -1,31 +1,57 @@
-; RUN: llc < %s -mtriple=thumb-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-apple-ios | FileCheck %s --check-prefix=CHECK --check-prefix=IOS
+; RUN: llc < %s -mtriple=thumb-none-eabi | FileCheck %s --check-prefix=CHECK --check-prefix=EABI
+; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-apple-ios
+; RUN: llvm-objdump -triple=thumbv6-apple-ios -d %t | FileCheck %s --check-prefix=CHECK --check-prefix=IOS
+; RUN: llc < %s -o %t -filetype=obj -mtriple=thumbv6-none-eabi
+; RUN: llvm-objdump -triple=thumbv6-none-eabi -d %t | FileCheck %s --check-prefix=CHECK --check-prefix=EABI
 
+; Largest stack for which a single tADDspi/tSUBspi is enough
 define void @test1() {
 ; CHECK-LABEL: test1:
-; CHECK: sub sp, #256
-; CHECK: add sp, #256
-    %tmp = alloca [ 64 x i32 ] , align 4
+; CHECK: sub sp, #508
+; CHECK: add sp, #508
+    %tmp = alloca [ 508 x i8 ] , align 4
     ret void
 }
 
+; Largest stack for which three tADDspi/tSUBspis are enough
+define void @test100() {
+; CHECK-LABEL: test100:
+; CHECK: sub sp, #508
+; CHECK: sub sp, #508
+; CHECK: sub sp, #508
+; EABI: add sp, #508
+; EABI: add sp, #508
+; EABI: add sp, #508
+; IOS: subs r4, r7, #4
+; IOS: mov sp, r4
+    %tmp = alloca [ 1524 x i8 ] , align 4
+    ret void
+}
+
+; Smallest stack for which we use a constant pool
 define void @test2() {
 ; CHECK-LABEL: test2:
-; CHECK: ldr r0, LCPI
+; CHECK: ldr r0,
 ; CHECK: add sp, r0
-; CHECK: subs r4, r7, #4
-; CHECK: mov sp, r4
-    %tmp = alloca [ 4168 x i8 ] , align 4
+; EABI: ldr r0,
+; EABI: add sp, r0
+; IOS: subs r4, r7, #4
+; IOS: mov sp, r4
+    %tmp = alloca [ 1528 x i8 ] , align 4
     ret void
 }
 
 define i32 @test3() {
 ; CHECK-LABEL: test3:
-; CHECK: ldr r1, LCPI
+; CHECK: ldr r1,
 ; CHECK: add sp, r1
-; CHECK: ldr r1, LCPI
+; CHECK: ldr r1,
 ; CHECK: add r1, sp
-; CHECK: subs r4, r7, #4
-; CHECK: mov sp, r4
+; EABI: ldr r1,
+; EABI: add sp, r1
+; IOS: subs r4, r7, #4
+; IOS: mov sp, r4
     %retval = alloca i32, align 4
     %tmp = alloca i32, align 4
     %a = alloca [805306369 x i8], align 16
@@ -33,3 +59,22 @@
     %tmp1 = load i32* %tmp
     ret i32 %tmp1
 }
+
+; Here, the adds get optimized out because they are dead, but the calculation
+; of the address of stack_a is dead but not optimized out. When the address
+; calculation gets expanded to two instructions, we need to avoid reading a
+; dead register.
+; No CHECK lines (just test for crashes), as we hope this will be optimised
+; better in future.
+define i32 @test4() {
+entry:
+  %stack_a = alloca i8, align 1
+  %stack_b = alloca [256 x i32*], align 4
+  %int = ptrtoint i8* %stack_a to i32
+  %add = add i32 %int, 1
+  br label %block2
+
+block2:
+  %add2 = add i32 %add, 1
+  ret i32 0
+}

diff --git a/test/CodeGen/Thumb/ldm-merge-call.ll b/test/CodeGen/Thumb/ldm-merge-call.ll
new file mode 100644
index 0000000..febc96b
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-merge-call.ll

@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m--linux-gnueabi"
+
+; Function Attrs: nounwind optsize
+define void @foo(i32* nocapture readonly %A) #0 {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: ldm r[[BASE:[0-9]]]!,
+; CHECK-NEXT: mov r[[BASE]],
+  %0 = load i32* %A, align 4
+  %arrayidx1 = getelementptr inbounds i32* %A, i32 1
+  %1 = load i32* %arrayidx1, align 4
+  %call = tail call i32 @bar(i32 %0, i32 %1, i32 %0, i32 %1) #2
+  %call2 = tail call i32 @bar(i32 %0, i32 %1, i32 %0, i32 %1) #2
+  ret void
+}
+
+; Function Attrs: optsize
+declare i32 @bar(i32, i32, i32, i32) #1
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind optsize }

diff --git a/test/CodeGen/Thumb/ldm-merge-struct.ll b/test/CodeGen/Thumb/ldm-merge-struct.ll
new file mode 100644
index 0000000..2f732e0
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-merge-struct.ll

@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+%struct.S = type { i32, i32 }
+
+@s = common global %struct.S zeroinitializer, align 4
+
+define i32 @f() {
+entry:
+; CHECK-LABEL: f:
+; CHECK: ldm r[[BASE:[0-9]]],
+; CHECK-NOT: subs r[[BASE]]
+  %0 = load i32* getelementptr inbounds (%struct.S* @s, i32 0, i32 0), align 4
+  %1 = load i32* getelementptr inbounds (%struct.S* @s, i32 0, i32 1), align 4
+  %cmp = icmp sgt i32 %0, %1
+  %2 = sub i32 0, %1
+  %cond.p = select i1 %cmp, i32 %1, i32 %2
+  %cond = add i32 %cond.p, %0
+  ret i32 %cond
+}

diff --git a/test/CodeGen/Thumb/ldm-stm-base-materialization.ll b/test/CodeGen/Thumb/ldm-stm-base-materialization.ll
new file mode 100644
index 0000000..6382c25
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-stm-base-materialization.ll

@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m-none--eabi"
+
+@a = external global i32*
+@b = external global i32*
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: ldr r[[SB:[0-9]]], .LCPI
+; CHECK: ldr r[[LB:[0-9]]], .LCPI
+; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
+; CHECK-NEXT: ldm r[[NLB]],
+; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
+; CHECK-NEXT: stm r[[NSB]]
+  %0 = load i32** @a, align 4
+  %arrayidx = getelementptr inbounds i32* %0, i32 1
+  %1 = bitcast i32* %arrayidx to i8*
+  %2 = load i32** @b, align 4
+  %arrayidx1 = getelementptr inbounds i32* %2, i32 1
+  %3 = bitcast i32* %arrayidx1 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 24, i32 4, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1

diff --git a/test/CodeGen/Thumb/pop.ll b/test/CodeGen/Thumb/pop.ll
index 1e45c7f..3c539c6 100644
--- a/test/CodeGen/Thumb/pop.ll
+++ b/test/CodeGen/Thumb/pop.ll

@@ -7,7 +7,9 @@
 ; CHECK-NEXT: add sp, #12
 ; CHECK-NEXT: bx r3
 entry:
-  %a.addr = alloca i8*
-  store i8* %a, i8** %a.addr
+  %a.addr = alloca i8, i32 4
+  call void @llvm.va_start(i8* %a.addr)
   ret void
 }
+
+declare void @llvm.va_start(i8*) nounwind

diff --git a/test/CodeGen/Thumb/stack_guard_remat.ll b/test/CodeGen/Thumb/stack_guard_remat.ll
new file mode 100644
index 0000000..e949cc1
--- /dev/null
+++ b/test/CodeGen/Thumb/stack_guard_remat.ll

@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=NO-PIC  -check-prefix=STATIC
+; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s  -check-prefix=NO-PIC -check-prefix=DYNAMIC-NO-PIC
+
+;PIC:   foo2
+;PIC:   ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]]
+;PIC: [[LABEL1:LPC[0-9_]+]]:
+;PIC:   add [[R0]], pc
+;PIC:   ldr [[R1:r[0-9]+]], {{\[}}[[R0]]{{\]}}
+;PIC:   ldr [[R1:r[0-9]+]], {{\[}}[[R1]]{{\]}}
+
+;PIC:      [[LABEL0]]:
+;PIC-NEXT:   .long L___stack_chk_guard$non_lazy_ptr-([[LABEL1]]+4)
+
+;NO-PIC:   foo2
+;NO-PIC:   ldr [[R0:r[0-9]+]], [[LABEL0:LCPI[0-9_]+]]
+;NO-PIC-NOT: LPC
+;NO-PIC:   ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+;STATIC:      [[LABEL0]]:
+;STATIC-NEXT:   .long ___stack_chk_guard
+
+;DYNAMIC-NO-PIC:      [[LABEL0]]:
+;DYNAMIC-NO-PIC-NEXT:   .long L___stack_chk_guard$non_lazy_ptr
+
+; Function Attrs: nounwind ssp
+define i32 @test_stack_guard_remat() #0 {
+  %a1 = alloca [256 x i32], align 4
+  %1 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %1)
+  %2 = getelementptr inbounds [256 x i32]* %a1, i32 0, i32 0
+  call void @foo3(i32* %2) #3
+  call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"()
+  call void @llvm.lifetime.end(i64 1024, i8* %1)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/Thumb/stm-merge.ll b/test/CodeGen/Thumb/stm-merge.ll
new file mode 100644
index 0000000..76e71f4
--- /dev/null
+++ b/test/CodeGen/Thumb/stm-merge.ll

@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv6m--linux-gnueabi"
+
+@d = internal unnamed_addr global i32 0, align 4
+@c = internal global i32* null, align 4
+@e = internal unnamed_addr global i32* null, align 4
+
+; Function Attrs: nounwind optsize
+define void @fn1() #0 {
+entry:
+; CHECK-LABEL: fn1:
+; CHECK: stm r[[BASE:[0-9]]]!, {{.*}}
+; CHECK-NOT: {{.*}} r[[BASE]]
+; CHECK: ldr r[[BASE]], {{.*}}
+  %g = alloca i32, align 4
+  %h = alloca i32, align 4
+  store i32 1, i32* %g, align 4
+  store i32 0, i32* %h, align 4
+  %.pr = load i32* @d, align 4
+  %cmp11 = icmp slt i32 %.pr, 1
+  br i1 %cmp11, label %for.inc.lr.ph, label %for.body5
+
+for.inc.lr.ph:                                    ; preds = %entry
+  store i32 1, i32* @d, align 4
+  br label %for.body5
+
+for.body5:                                        ; preds = %entry, %for.inc.lr.ph, %for.body5
+  %f.010 = phi i32 [ %inc7, %for.body5 ], [ 0, %for.inc.lr.ph ], [ 0, %entry ]
+  store volatile i32* %g, i32** @c, align 4
+  %inc7 = add nsw i32 %f.010, 1
+  %exitcond = icmp eq i32 %inc7, 2
+  br i1 %exitcond, label %for.end8, label %for.body5
+
+for.end8:                                         ; preds = %for.body5
+  store i32* %h, i32** @e, align 4
+  ret void
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/Thumb/thumb-ldm.ll b/test/CodeGen/Thumb/thumb-ldm.ll
index 95f3edc..7e9560e 100644
--- a/test/CodeGen/Thumb/thumb-ldm.ll
+++ b/test/CodeGen/Thumb/thumb-ldm.ll

@@ -1,5 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv6m-eabi -o - | FileCheck %s
-; XFAIL: *
+; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - | FileCheck %s
 
 @X = external global [0 x i32]          ; <[0 x i32]*> [#uses=5]
 

diff --git a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
index dedc82b..da2f3f0 100644
--- a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll
+++ b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll

@@ -1,35 +1,33 @@
-; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s
-; XFAIL: *
-
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
 @d = external global [64 x i32]
 @s = external global [64 x i32]
 
 ; Function Attrs: nounwind
 define void @t1() #0 {
 entry:
-; CHECK: ldr [[REG0:r[0-9]]],
-; CHECK: ldm [[REG0]]!,
-; CHECK: ldr [[REG1:r[0-9]]],
-; CHECK: stm [[REG1]]!,
-; CHECK: subs [[REG0]], #32
-; CHECK-NEXT: ldrb
-; CHECK: subs [[REG1]], #32
-; CHECK-NEXT: strb
-    tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 33, i32 4, i1 false)
+; CHECK-LABEL: t1:
+; CHECK: ldr r[[LB:[0-9]]],
+; CHECK-NEXT: ldm r[[LB]]!,
+; CHECK-NEXT: ldr r[[SB:[0-9]]],
+; CHECK-NEXT: stm r[[SB]]!,
+; CHECK-NEXT: ldrb {{.*}}, [r[[LB]]]
+; CHECK-NEXT: strb {{.*}}, [r[[SB]]]
+    tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
     ret void
 }
 
 ; Function Attrs: nounwind
 define void @t2() #0 {
 entry:
-; CHECK: ldr [[REG0:r[0-9]]],
-; CHECK: ldm [[REG0]]!,
-; CHECK: ldr [[REG1:r[0-9]]],
-; CHECK: stm [[REG1]]!,
-; CHECK: ldrh
-; CHECK: ldrb
-; CHECK: strb
-; CHECK: strh
+; CHECK-LABEL: t2:
+; CHECK: ldr r[[LB:[0-9]]],
+; CHECK-NEXT: ldm r[[LB]]!,
+; CHECK-NEXT: ldr r[[SB:[0-9]]],
+; CHECK-NEXT: stm r[[SB]]!,
+; CHECK-NEXT: ldrh {{.*}}, [r[[LB]]]
+; CHECK-NEXT: ldrb {{.*}}, [r[[LB]], #2]
+; CHECK-NEXT: strb {{.*}}, [r[[SB]], #2]
+; CHECK-NEXT: strh {{.*}}, [r[[SB]]]
     tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false)
     ret void
 }

diff --git a/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll b/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
index c8eac8d..59c2367 100644
--- a/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll
+++ b/test/CodeGen/Thumb2/2009-08-06-SpDecBug.ll

@@ -13,6 +13,7 @@
 ; CHECK-NOT: mov sp, r7
 ; CHECK: add sp, #8
 	call void @__gcov_flush() nounwind
+	call void @llvm.va_start(i8* null)
 	br i1 undef, label %bb5, label %bb
 
 bb:		; preds = %bb, %entry
@@ -27,3 +28,5 @@
 declare hidden void @__gcov_flush()
 
 declare i32 @execvp(i8*, i8**) nounwind
+
+declare void @llvm.va_start(i8*) nounwind

diff --git a/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll b/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
index 524e5a6..89b7148 100644
--- a/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll
+++ b/test/CodeGen/Thumb2/2009-12-01-LoopIVUsers.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | \
+; RUN: opt < %s -O3 | \
 ; RUN:   llc -mtriple=thumbv7-apple-darwin10 -mattr=+neon | FileCheck %s
 
 define void @fred(i32 %three_by_three, i8* %in, double %dt1, i32 %x_size, i32 %y_size, i8* %bp) nounwind {

diff --git a/test/CodeGen/Thumb2/aapcs.ll b/test/CodeGen/Thumb2/aapcs.ll
new file mode 100644
index 0000000..21af8c1
--- /dev/null
+++ b/test/CodeGen/Thumb2/aapcs.ll

@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m4 -mattr=-vfp2             | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 -mattr=+vfp4,+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a8 -mattr=+vfp3             | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP
+
+define float @float_in_reg(float %a, float %b) {
+entry:
+; CHECK-LABEL: float_in_reg:
+; SOFT: mov r0, r1
+; HARD: vmov.f32  s0, s1
+; CHECK-NEXT: bx lr
+  ret float %b
+}
+
+define double @double_in_reg(double %a, double %b) {
+entry:
+; CHECK-LABEL: double_in_reg:
+; SOFT: mov r0, r2
+; SOFT: mov r1, r3
+; SP: vmov.f32  s0, s2
+; SP: vmov.f32  s1, s3
+; DP: vmov.f64  d0, d1
+; CHECK-NEXT: bx lr
+  ret double %b
+}
+
+define float @float_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, float %i) {
+; CHECK-LABEL: float_on_stack:
+; SOFT: ldr r0, [sp, #48]
+; HARD: vldr s0, [sp]
+; CHECK-NEXT: bx lr
+  ret float %i
+}
+
+define double @double_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) {
+; CHECK-LABEL: double_on_stack:
+; SOFT: ldr r0, [sp, #48]
+; SOFT: ldr r1, [sp, #52]
+; HARD: vldr d0, [sp]
+; CHECK-NEXT: bx lr
+  ret double %i
+}
+
+define double @double_not_split(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i) {
+; CHECK-LABEL: double_not_split:
+; SOFT: ldr r0, [sp, #48]
+; SOFT: ldr r1, [sp, #52]
+; HARD: vldr d0, [sp]
+; CHECK-NEXT: bx lr
+  ret double %i
+}

diff --git a/test/CodeGen/Thumb2/constant-islands-new-island.ll b/test/CodeGen/Thumb2/constant-islands-new-island.ll
new file mode 100644
index 0000000..8ed657e
--- /dev/null
+++ b/test/CodeGen/Thumb2/constant-islands-new-island.ll

@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=thumbv7-linux-gnueabihf %s -o - | FileCheck %s
+
+; Check that new water is created by splitting the basic block right after the
+; load instruction. Previously, new water was created before the load
+; instruction, which caused the pass to fail to converge.
+
+define void @test(i1 %tst) {
+; CHECK-LABEL: test:
+; CHECK: vldr  {{s[0-9]+}}, [[CONST:\.LCPI[0-9]+_[0-9]+]]
+; CHECK-NEXT: b.w [[CONTINUE:\.LBB[0-9]+_[0-9]+]]
+
+; CHECK: [[CONST]]:
+; CHECK-NEXT: .long
+
+; CHECK: [[CONTINUE]]:
+
+entry:
+  call i32 @llvm.arm.space(i32 2000, i32 undef)
+  br i1 %tst, label %true, label %false
+
+true:
+  %val = phi float [12345.0, %entry], [undef, %false]
+  call void @bar(float %val)
+  ret void
+
+false:
+  br label %true
+}
+
+declare void @bar(float)
+declare i32 @llvm.arm.space(i32, i32)

diff --git a/test/CodeGen/Thumb2/cortex-fp.ll b/test/CodeGen/Thumb2/cortex-fp.ll
index e63970a..5548492e 100644
--- a/test/CodeGen/Thumb2/cortex-fp.ll
+++ b/test/CodeGen/Thumb2/cortex-fp.ll

@@ -1,13 +1,15 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXM3
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXM4
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXM7
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -march=thumb -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=CORTEXA8
 
 
 define float @foo(float %a, float %b) {
 entry:
 ; CHECK-LABEL: foo:
-; CORTEXM3: blx ___mulsf3
+; CORTEXM3: bl ___mulsf3
 ; CORTEXM4: vmul.f32  s
+; CORTEXM7: vmul.f32  s
 ; CORTEXA8: vmul.f32  d
   %0 = fmul float %a, %b
   ret float %0
@@ -17,8 +19,9 @@
 entry:
 ; CHECK-LABEL: bar:
   %0 = fmul double %a, %b
-; CORTEXM3: blx ___muldf3
-; CORTEXM4: blx ___muldf3
+; CORTEXM3: bl ___muldf3
+; CORTEXM4: {{bl|b.w}} ___muldf3
+; CORTEXM7: vmul.f64  d
 ; CORTEXA8: vmul.f64  d
   ret double %0
 }

diff --git a/test/CodeGen/Thumb2/float-cmp.ll b/test/CodeGen/Thumb2/float-cmp.ll
new file mode 100644
index 0000000..88d6c3b
--- /dev/null
+++ b/test/CodeGen/Thumb2/float-cmp.ll

@@ -0,0 +1,301 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=NONE
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP
+
+
+
+define i1 @cmp_f_false(float %a, float %b) {
+; CHECK-LABEL: cmp_f_false:
+; NONE: movs r0, #0
+; HARD: movs r0, #0
+  %1 = fcmp false float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_oeq(float %a, float %b) {
+; CHECK-LABEL: cmp_f_oeq:
+; NONE: bl __aeabi_fcmpeq
+; HARD: vcmpe.f32
+; HARD: moveq r0, #1
+  %1 = fcmp oeq float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ogt(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ogt:
+; NONE: bl __aeabi_fcmpgt
+; HARD: vcmpe.f32
+; HARD: movgt r0, #1
+  %1 = fcmp ogt float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_oge(float %a, float %b) {
+; CHECK-LABEL: cmp_f_oge:
+; NONE: bl __aeabi_fcmpge
+; HARD: vcmpe.f32
+; HARD: movge r0, #1
+  %1 = fcmp oge float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_olt(float %a, float %b) {
+; CHECK-LABEL: cmp_f_olt:
+; NONE: bl __aeabi_fcmplt
+; HARD: vcmpe.f32
+; HARD: movmi r0, #1
+  %1 = fcmp olt float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ole(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ole:
+; NONE: bl __aeabi_fcmple
+; HARD: vcmpe.f32
+; HARD: movls r0, #1
+  %1 = fcmp ole float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_one(float %a, float %b) {
+; CHECK-LABEL: cmp_f_one:
+; NONE: bl __aeabi_fcmpgt
+; NONE: bl __aeabi_fcmplt
+; HARD: vcmpe.f32
+; HARD: movmi r0, #1
+; HARD: movgt r0, #1
+  %1 = fcmp one float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ord(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ord:
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movvc r0, #1
+  %1 = fcmp ord float %a, %b
+  ret i1 %1
+}define i1 @cmp_f_ueq(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ueq:
+; NONE: bl __aeabi_fcmpeq
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: moveq r0, #1
+; HARD: movvs r0, #1
+  %1 = fcmp ueq float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ugt(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ugt:
+; NONE: bl __aeabi_fcmpgt
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movhi r0, #1
+  %1 = fcmp ugt float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_uge(float %a, float %b) {
+; CHECK-LABEL: cmp_f_uge:
+; NONE: bl __aeabi_fcmpge
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movpl r0, #1
+  %1 = fcmp uge float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ult(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ult:
+; NONE: bl __aeabi_fcmplt
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movlt r0, #1
+  %1 = fcmp ult float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_ule(float %a, float %b) {
+; CHECK-LABEL: cmp_f_ule:
+; NONE: bl __aeabi_fcmple
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movle r0, #1
+  %1 = fcmp ule float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_une(float %a, float %b) {
+; CHECK-LABEL: cmp_f_une:
+; NONE: bl __aeabi_fcmpeq
+; HARD: vcmpe.f32
+; HARD: movne r0, #1
+  %1 = fcmp une float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_uno(float %a, float %b) {
+; CHECK-LABEL: cmp_f_uno:
+; NONE: bl __aeabi_fcmpun
+; HARD: vcmpe.f32
+; HARD: movvs r0, #1
+  %1 = fcmp uno float %a, %b
+  ret i1 %1
+}
+define i1 @cmp_f_true(float %a, float %b) {
+; CHECK-LABEL: cmp_f_true:
+; NONE: movs r0, #1
+; HARD: movs r0, #1
+  %1 = fcmp true float %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_false(double %a, double %b) {
+; CHECK-LABEL: cmp_d_false:
+; NONE: movs r0, #0
+; HARD: movs r0, #0
+  %1 = fcmp false double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_oeq(double %a, double %b) {
+; CHECK-LABEL: cmp_d_oeq:
+; NONE: bl __aeabi_dcmpeq
+; SP: bl __aeabi_dcmpeq
+; DP: vcmpe.f64
+; DP: moveq r0, #1
+  %1 = fcmp oeq double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ogt(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ogt:
+; NONE: bl __aeabi_dcmpgt
+; SP: bl __aeabi_dcmpgt
+; DP: vcmpe.f64
+; DP: movgt r0, #1
+  %1 = fcmp ogt double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_oge(double %a, double %b) {
+; CHECK-LABEL: cmp_d_oge:
+; NONE: bl __aeabi_dcmpge
+; SP: bl __aeabi_dcmpge
+; DP: vcmpe.f64
+; DP: movge r0, #1
+  %1 = fcmp oge double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_olt(double %a, double %b) {
+; CHECK-LABEL: cmp_d_olt:
+; NONE: bl __aeabi_dcmplt
+; SP: bl __aeabi_dcmplt
+; DP: vcmpe.f64
+; DP: movmi r0, #1
+  %1 = fcmp olt double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ole(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ole:
+; NONE: bl __aeabi_dcmple
+; SP: bl __aeabi_dcmple
+; DP: vcmpe.f64
+; DP: movls r0, #1
+  %1 = fcmp ole double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_one(double %a, double %b) {
+; CHECK-LABEL: cmp_d_one:
+; NONE: bl __aeabi_dcmpgt
+; NONE: bl __aeabi_dcmplt
+; SP: bl __aeabi_dcmpgt
+; SP: bl __aeabi_dcmplt
+; DP: vcmpe.f64
+; DP: movmi r0, #1
+; DP: movgt r0, #1
+  %1 = fcmp one double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ord(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ord:
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movvc r0, #1
+  %1 = fcmp ord double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ugt(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ugt:
+; NONE: bl __aeabi_dcmpgt
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpgt
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movhi r0, #1
+  %1 = fcmp ugt double %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_ult(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ult:
+; NONE: bl __aeabi_dcmplt
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmplt
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movlt r0, #1
+  %1 = fcmp ult double %a, %b
+  ret i1 %1
+}
+
+
+define i1 @cmp_d_uno(double %a, double %b) {
+; CHECK-LABEL: cmp_d_uno:
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movvs r0, #1
+  %1 = fcmp uno double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_true(double %a, double %b) {
+; CHECK-LABEL: cmp_d_true:
+; NONE: movs r0, #1
+; HARD: movs r0, #1
+  %1 = fcmp true double %a, %b
+  ret i1 %1
+}
+define i1 @cmp_d_ueq(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ueq:
+; NONE: bl __aeabi_dcmpeq
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpeq
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: moveq r0, #1
+; DP: movvs r0, #1
+  %1 = fcmp ueq double %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_uge(double %a, double %b) {
+; CHECK-LABEL: cmp_d_uge:
+; NONE: bl __aeabi_dcmpge
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmpge
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movpl r0, #1
+  %1 = fcmp uge double %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_ule(double %a, double %b) {
+; CHECK-LABEL: cmp_d_ule:
+; NONE: bl __aeabi_dcmple
+; NONE: bl __aeabi_dcmpun
+; SP: bl __aeabi_dcmple
+; SP: bl __aeabi_dcmpun
+; DP: vcmpe.f64
+; DP: movle r0, #1
+  %1 = fcmp ule double %a, %b
+  ret i1 %1
+}
+
+define i1 @cmp_d_une(double %a, double %b) {
+; CHECK-LABEL: cmp_d_une:
+; NONE: bl __aeabi_dcmpeq
+; SP: bl __aeabi_dcmpeq
+; DP: vcmpe.f64
+; DP: movne r0, #1
+  %1 = fcmp une double %a, %b
+  ret i1 %1
+}

diff --git a/test/CodeGen/Thumb2/float-intrinsics-double.ll b/test/CodeGen/Thumb2/float-intrinsics-double.ll
new file mode 100644
index 0000000..01a23bd
--- /dev/null
+++ b/test/CodeGen/Thumb2/float-intrinsics-double.ll

@@ -0,0 +1,228 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m3                    | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4                    | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=SP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP  -check-prefix=FP-ARMv8
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=SP
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a57                   | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=FP-ARMv8
+
+declare double     @llvm.sqrt.f64(double %Val)
+define double @sqrt_d(double %a) {
+; CHECK-LABEL: sqrt_d:
+; SOFT: {{(bl|b)}} sqrt
+; HARD: vsqrt.f64 d0, d0
+  %1 = call double @llvm.sqrt.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.powi.f64(double %Val, i32 %power)
+define double @powi_d(double %a, i32 %b) {
+; CHECK-LABEL: powi_d:
+; SOFT: {{(bl|b)}} __powidf2
+; HARD: b __powidf2
+  %1 = call double @llvm.powi.f64(double %a, i32 %b)
+  ret double %1
+}
+
+declare double     @llvm.sin.f64(double %Val)
+define double @sin_d(double %a) {
+; CHECK-LABEL: sin_d:
+; SOFT: {{(bl|b)}} sin
+; HARD: b sin
+  %1 = call double @llvm.sin.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.cos.f64(double %Val)
+define double @cos_d(double %a) {
+; CHECK-LABEL: cos_d:
+; SOFT: {{(bl|b)}} cos
+; HARD: b cos
+  %1 = call double @llvm.cos.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.pow.f64(double %Val, double %power)
+define double @pow_d(double %a, double %b) {
+; CHECK-LABEL: pow_d:
+; SOFT: {{(bl|b)}} pow
+; HARD: b pow
+  %1 = call double @llvm.pow.f64(double %a, double %b)
+  ret double %1
+}
+
+declare double     @llvm.exp.f64(double %Val)
+define double @exp_d(double %a) {
+; CHECK-LABEL: exp_d:
+; SOFT: {{(bl|b)}} exp
+; HARD: b exp
+  %1 = call double @llvm.exp.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.exp2.f64(double %Val)
+define double @exp2_d(double %a) {
+; CHECK-LABEL: exp2_d:
+; SOFT: {{(bl|b)}} exp2
+; HARD: b exp2
+  %1 = call double @llvm.exp2.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.log.f64(double %Val)
+define double @log_d(double %a) {
+; CHECK-LABEL: log_d:
+; SOFT: {{(bl|b)}} log
+; HARD: b log
+  %1 = call double @llvm.log.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.log10.f64(double %Val)
+define double @log10_d(double %a) {
+; CHECK-LABEL: log10_d:
+; SOFT: {{(bl|b)}} log10
+; HARD: b log10
+  %1 = call double @llvm.log10.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.log2.f64(double %Val)
+define double @log2_d(double %a) {
+; CHECK-LABEL: log2_d:
+; SOFT: {{(bl|b)}} log2
+; HARD: b log2
+  %1 = call double @llvm.log2.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.fma.f64(double %a, double %b, double %c)
+define double @fma_d(double %a, double %b, double %c) {
+; CHECK-LABEL: fma_d:
+; SOFT: {{(bl|b)}} fma
+; HARD: vfma.f64
+  %1 = call double @llvm.fma.f64(double %a, double %b, double %c)
+  ret double %1
+}
+
+; FIXME: the FPv4-SP version is less efficient than the no-FPU version
+declare double     @llvm.fabs.f64(double %Val)
+define double @abs_d(double %a) {
+; CHECK-LABEL: abs_d:
+; NONE: bic r1, r1, #-2147483648
+; SP: bl __aeabi_dcmpgt
+; SP: bl __aeabi_dcmpun
+; SP: bl __aeabi_dsub
+; DP: vabs.f64 d0, d0
+  %1 = call double @llvm.fabs.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.copysign.f64(double  %Mag, double  %Sgn)
+define double @copysign_d(double %a, double %b) {
+; CHECK-LABEL: copysign_d:
+; SOFT: lsrs [[REG:r[0-9]+]], r3, #31
+; SOFT: bfi r1, [[REG]], #31, #1
+; VFP: lsrs [[REG:r[0-9]+]], r3, #31
+; VFP: bfi r1, [[REG]], #31, #1
+; NEON: vmov.i32 [[REG:d[0-9]+]], #0x80000000
+; NEON: vshl.i64 [[REG]], [[REG]], #32
+; NEON: vbsl [[REG]], d
+  %1 = call double @llvm.copysign.f64(double %a, double %b)
+  ret double %1
+}
+
+declare double     @llvm.floor.f64(double %Val)
+define double @floor_d(double %a) {
+; CHECK-LABEL: floor_d:
+; SOFT: {{(bl|b)}} floor
+; VFP4: b floor
+; FP-ARMv8: vrintm.f64
+  %1 = call double @llvm.floor.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.ceil.f64(double %Val)
+define double @ceil_d(double %a) {
+; CHECK-LABEL: ceil_d:
+; SOFT: {{(bl|b)}} ceil
+; VFP4: b ceil
+; FP-ARMv8: vrintp.f64
+  %1 = call double @llvm.ceil.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.trunc.f64(double %Val)
+define double @trunc_d(double %a) {
+; CHECK-LABEL: trunc_d:
+; SOFT: {{(bl|b)}} trunc
+; VFP4: b trunc
+; FP-ARMv8: vrintz.f64
+  %1 = call double @llvm.trunc.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.rint.f64(double %Val)
+define double @rint_d(double %a) {
+; CHECK-LABEL: rint_d:
+; SOFT: {{(bl|b)}} rint
+; VFP4: b rint
+; FP-ARMv8: vrintx.f64
+  %1 = call double @llvm.rint.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.nearbyint.f64(double %Val)
+define double @nearbyint_d(double %a) {
+; CHECK-LABEL: nearbyint_d:
+; SOFT: {{(bl|b)}} nearbyint
+; VFP4: b nearbyint
+; FP-ARMv8: vrintr.f64
+  %1 = call double @llvm.nearbyint.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.round.f64(double %Val)
+define double @round_d(double %a) {
+; CHECK-LABEL: round_d:
+; SOFT: {{(bl|b)}} round
+; VFP4: b round
+; FP-ARMv8: vrinta.f64
+  %1 = call double @llvm.round.f64(double %a)
+  ret double %1
+}
+
+declare double     @llvm.fmuladd.f64(double %a, double %b, double %c)
+define double @fmuladd_d(double %a, double %b, double %c) {
+; CHECK-LABEL: fmuladd_d:
+; SOFT: bl __aeabi_dmul
+; SOFT: bl __aeabi_dadd
+; VFP4: vmul.f64
+; VFP4: vadd.f64
+; FP-ARMv8: vmla.f64
+  %1 = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
+  ret double %1
+}
+
+declare i16 @llvm.convert.to.fp16.f64(double %a)
+define i16 @d_to_h(double %a) {
+; CHECK-LABEL: d_to_h:
+; SOFT: bl __aeabi_d2h
+; VFP4: bl __aeabi_d2h
+; FP-ARMv8: vcvt{{[bt]}}.f16.f64
+  %1 = call i16 @llvm.convert.to.fp16.f64(double %a)
+  ret i16 %1
+}
+
+declare double @llvm.convert.from.fp16.f64(i16 %a)
+define double @h_to_d(i16 %a) {
+; CHECK-LABEL: h_to_d:
+; NONE: bl __gnu_h2f_ieee
+; NONE: bl __aeabi_f2d
+; SP: vcvt{{[bt]}}.f32.f16
+; SP: bl __aeabi_f2d
+; VFP4: vcvt{{[bt]}}.f32.f16
+; VFP4: vcvt.f64.f32
+; FP-ARMv8: vcvt{{[bt]}}.f64.f16
+  %1 = call double @llvm.convert.from.fp16.f64(i16 %a)
+  ret double %1
+}

diff --git a/test/CodeGen/Thumb2/float-intrinsics-float.ll b/test/CodeGen/Thumb2/float-intrinsics-float.ll
new file mode 100644
index 0000000..ec1bcd3
--- /dev/null
+++ b/test/CodeGen/Thumb2/float-intrinsics-float.ll

@@ -0,0 +1,221 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m3                    | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP  -check-prefix=FP-ARMv8  -check-prefix=VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=FP-ARMv8 -check-prefix=VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7                    | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4 -check-prefix=NO-VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a57                   | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=FP-ARMv8 -check-prefix=VMLA
+
+declare float     @llvm.sqrt.f32(float %Val)
+define float @sqrt_f(float %a) {
+; CHECK-LABEL: sqrt_f:
+; SOFT: bl sqrtf
+; HARD: vsqrt.f32 s0, s0
+  %1 = call float @llvm.sqrt.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.powi.f32(float %Val, i32 %power)
+define float @powi_f(float %a, i32 %b) {
+; CHECK-LABEL: powi_f:
+; SOFT: bl __powisf2
+; HARD: b __powisf2
+  %1 = call float @llvm.powi.f32(float %a, i32 %b)
+  ret float %1
+}
+
+declare float     @llvm.sin.f32(float %Val)
+define float @sin_f(float %a) {
+; CHECK-LABEL: sin_f:
+; SOFT: bl sinf
+; HARD: b sinf
+  %1 = call float @llvm.sin.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.cos.f32(float %Val)
+define float @cos_f(float %a) {
+; CHECK-LABEL: cos_f:
+; SOFT: bl cosf
+; HARD: b cosf
+  %1 = call float @llvm.cos.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.pow.f32(float %Val, float %power)
+define float @pow_f(float %a, float %b) {
+; CHECK-LABEL: pow_f:
+; SOFT: bl powf
+; HARD: b powf
+  %1 = call float @llvm.pow.f32(float %a, float %b)
+  ret float %1
+}
+
+declare float     @llvm.exp.f32(float %Val)
+define float @exp_f(float %a) {
+; CHECK-LABEL: exp_f:
+; SOFT: bl expf
+; HARD: b expf
+  %1 = call float @llvm.exp.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.exp2.f32(float %Val)
+define float @exp2_f(float %a) {
+; CHECK-LABEL: exp2_f:
+; SOFT: bl exp2f
+; HARD: b exp2f
+  %1 = call float @llvm.exp2.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.log.f32(float %Val)
+define float @log_f(float %a) {
+; CHECK-LABEL: log_f:
+; SOFT: bl logf
+; HARD: b logf
+  %1 = call float @llvm.log.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.log10.f32(float %Val)
+define float @log10_f(float %a) {
+; CHECK-LABEL: log10_f:
+; SOFT: bl log10f
+; HARD: b log10f
+  %1 = call float @llvm.log10.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.log2.f32(float %Val)
+define float @log2_f(float %a) {
+; CHECK-LABEL: log2_f:
+; SOFT: bl log2f
+; HARD: b log2f
+  %1 = call float @llvm.log2.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.fma.f32(float %a, float %b, float %c)
+define float @fma_f(float %a, float %b, float %c) {
+; CHECK-LABEL: fma_f:
+; SOFT: bl fmaf
+; HARD: vfma.f32
+  %1 = call float @llvm.fma.f32(float %a, float %b, float %c)
+  ret float %1
+}
+
+declare float     @llvm.fabs.f32(float %Val)
+define float @abs_f(float %a) {
+; CHECK-LABEL: abs_f:
+; SOFT: bic r0, r0, #-2147483648
+; HARD: vabs.f32
+  %1 = call float @llvm.fabs.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.copysign.f32(float  %Mag, float  %Sgn)
+define float @copysign_f(float %a, float %b) {
+; CHECK-LABEL: copysign_f:
+; NONE: lsrs [[REG:r[0-9]+]], r{{[0-9]+}}, #31
+; NONE: bfi r{{[0-9]+}}, [[REG]], #31, #1
+; SP: lsrs [[REG:r[0-9]+]], r{{[0-9]+}}, #31
+; SP: bfi r{{[0-9]+}}, [[REG]], #31, #1
+; VFP: lsrs [[REG:r[0-9]+]], r{{[0-9]+}}, #31
+; VFP: bfi r{{[0-9]+}}, [[REG]], #31, #1
+; NEON: vmov.i32 [[REG:d[0-9]+]], #0x80000000
+; NEON: vbsl [[REG]], d
+  %1 = call float @llvm.copysign.f32(float %a, float %b)
+  ret float %1
+}
+
+declare float     @llvm.floor.f32(float %Val)
+define float @floor_f(float %a) {
+; CHECK-LABEL: floor_f:
+; SOFT: bl floorf
+; VFP4: b floorf
+; FP-ARMv8: vrintm.f32
+  %1 = call float @llvm.floor.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.ceil.f32(float %Val)
+define float @ceil_f(float %a) {
+; CHECK-LABEL: ceil_f:
+; SOFT: bl ceilf
+; VFP4: b ceilf
+; FP-ARMv8: vrintp.f32
+  %1 = call float @llvm.ceil.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.trunc.f32(float %Val)
+define float @trunc_f(float %a) {
+; CHECK-LABEL: trunc_f:
+; SOFT: bl truncf
+; VFP4: b truncf
+; FP-ARMv8: vrintz.f32
+  %1 = call float @llvm.trunc.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.rint.f32(float %Val)
+define float @rint_f(float %a) {
+; CHECK-LABEL: rint_f:
+; SOFT: bl rintf
+; VFP4: b rintf
+; FP-ARMv8: vrintx.f32
+  %1 = call float @llvm.rint.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.nearbyint.f32(float %Val)
+define float @nearbyint_f(float %a) {
+; CHECK-LABEL: nearbyint_f:
+; SOFT: bl nearbyintf
+; VFP4: b nearbyintf
+; FP-ARMv8: vrintr.f32
+  %1 = call float @llvm.nearbyint.f32(float %a)
+  ret float %1
+}
+
+declare float     @llvm.round.f32(float %Val)
+define float @round_f(float %a) {
+; CHECK-LABEL: round_f:
+; SOFT: bl roundf
+; VFP4: b roundf
+; FP-ARMv8: vrinta.f32
+  %1 = call float @llvm.round.f32(float %a)
+  ret float %1
+}
+
+; FIXME: why does cortex-m4 use vmla, while cortex-a7 uses vmul+vadd?
+; (these should be equivalent, even the rounding is the same)
+declare float     @llvm.fmuladd.f32(float %a, float %b, float %c)
+define float @fmuladd_f(float %a, float %b, float %c) {
+; CHECK-LABEL: fmuladd_f:
+; SOFT: bl __aeabi_fmul
+; SOFT: bl __aeabi_fadd
+; VMLA: vmla.f32
+; NO-VMLA: vmul.f32
+; NO-VMLA: vadd.f32
+  %1 = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+  ret float %1
+}
+
+declare i16 @llvm.convert.to.fp16.f32(float %a)
+define i16 @f_to_h(float %a) {
+; CHECK-LABEL: f_to_h:
+; SOFT: bl __gnu_f2h_ieee
+; HARD: vcvt{{[bt]}}.f16.f32
+  %1 = call i16 @llvm.convert.to.fp16.f32(float %a)
+  ret i16 %1
+}
+
+declare float @llvm.convert.from.fp16.f32(i16 %a)
+define float @h_to_f(i16 %a) {
+; CHECK-LABEL: h_to_f:
+; SOFT: bl __gnu_h2f_ieee
+; HARD: vcvt{{[bt]}}.f32.f16
+  %1 = call float @llvm.convert.from.fp16.f32(i16 %a)
+  ret float %1
+}

diff --git a/test/CodeGen/Thumb2/float-ops.ll b/test/CodeGen/Thumb2/float-ops.ll
new file mode 100644
index 0000000..d383065
--- /dev/null
+++ b/test/CodeGen/Thumb2/float-ops.ll

@@ -0,0 +1,293 @@
+; RUN: llc < %s -mtriple=thumbv7-none-eabi   -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=NONE
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VFP4-ALL
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=FP-ARMv8
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP4-ALL -check-prefix=VFP4-DP
+
+define float @add_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: add_f:
+; NONE: bl __aeabi_fadd
+; HARD: vadd.f32  s0, s0, s1
+  %0 = fadd float %a, %b
+  ret float %0
+}
+
+define double @add_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: add_d:
+; NONE: bl __aeabi_dadd
+; SP: bl __aeabi_dadd
+; DP: vadd.f64  d0, d0, d1
+  %0 = fadd double %a, %b
+  ret double %0
+}
+
+define float @sub_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: sub_f:
+; NONE: bl __aeabi_fsub
+; HARD: vsub.f32  s
+  %0 = fsub float %a, %b
+  ret float %0
+}
+
+define double @sub_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: sub_d:
+; NONE: bl __aeabi_dsub
+; SP: bl __aeabi_dsub
+; DP: vsub.f64  d0, d0, d1
+  %0 = fsub double %a, %b
+  ret double %0
+}
+
+define float @mul_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: mul_f:
+; NONE: bl __aeabi_fmul
+; HARD: vmul.f32  s
+  %0 = fmul float %a, %b
+  ret float %0
+}
+
+define double @mul_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: mul_d:
+; NONE: bl __aeabi_dmul
+; SP: bl __aeabi_dmul
+; DP: vmul.f64  d0, d0, d1
+  %0 = fmul double %a, %b
+  ret double %0
+}
+
+define float @div_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: div_f:
+; NONE: bl __aeabi_fdiv
+; HARD: vdiv.f32  s
+  %0 = fdiv float %a, %b
+  ret float %0
+}
+
+define double @div_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: div_d:
+; NONE: bl __aeabi_ddiv
+; SP: bl __aeabi_ddiv
+; DP: vdiv.f64  d0, d0, d1
+  %0 = fdiv double %a, %b
+  ret double %0
+}
+
+define float @rem_f(float %a, float %b) {
+entry:
+; CHECK-LABEL: rem_f:
+; NONE: bl fmodf
+; HARD: b fmodf
+  %0 = frem float %a, %b
+  ret float %0
+}
+
+define double @rem_d(double %a, double %b) {
+entry:
+; CHECK-LABEL: rem_d:
+; NONE: bl fmod
+; HARD: b fmod
+  %0 = frem double %a, %b
+  ret double %0
+}
+
+define float @load_f(float* %a) {
+entry:
+; CHECK-LABEL: load_f:
+; NONE: ldr r0, [r0]
+; HARD: vldr s0, [r0]
+  %0 = load float* %a, align 4
+  ret float %0
+}
+
+define double @load_d(double* %a) {
+entry:
+; CHECK-LABEL: load_d:
+; NONE: ldm.w r0, {r0, r1}
+; HARD: vldr d0, [r0]
+  %0 = load double* %a, align 8
+  ret double %0
+}
+
+define void @store_f(float* %a, float %b) {
+entry:
+; CHECK-LABEL: store_f:
+; NONE: str r1, [r0]
+; HARD: vstr s0, [r0]
+  store float %b, float* %a, align 4
+  ret void
+}
+
+define void @store_d(double* %a, double %b) {
+entry:
+; CHECK-LABEL: store_d:
+; NONE: mov r1, r3
+; NONE: str r2, [r0]
+; NONE: str r1, [r0, #4]
+; HARD: vstr d0, [r0]
+  store double %b, double* %a, align 8
+  ret void
+}
+
+define double @f_to_d(float %a) {
+; CHECK-LABEL: f_to_d:
+; NONE: bl __aeabi_f2d
+; SP: bl __aeabi_f2d
+; DP: vcvt.f64.f32 d0, s0
+  %1 = fpext float %a to double
+  ret double %1
+}
+
+define float @d_to_f(double %a) {
+; CHECK-LABEL: d_to_f:
+; NONE: bl __aeabi_d2f
+; SP: bl __aeabi_d2f
+; DP: vcvt.f32.f64 s0, d0
+  %1 = fptrunc double %a to float
+  ret float %1
+}
+
+define i32 @f_to_si(float %a) {
+; CHECK-LABEL: f_to_si:
+; NONE: bl __aeabi_f2iz
+; HARD: vcvt.s32.f32 s0, s0
+; HARD: vmov r0, s0
+  %1 = fptosi float %a to i32
+  ret i32 %1
+}
+
+define i32 @d_to_si(double %a) {
+; CHECK-LABEL: d_to_si:
+; NONE: bl __aeabi_d2iz
+; SP: vmov r0, r1, d0
+; SP: bl __aeabi_d2iz
+; DP: vcvt.s32.f64 s0, d0
+; DP: vmov r0, s0
+  %1 = fptosi double %a to i32
+  ret i32 %1
+}
+
+define i32 @f_to_ui(float %a) {
+; CHECK-LABEL: f_to_ui:
+; NONE: bl __aeabi_f2uiz
+; HARD: vcvt.u32.f32 s0, s0
+; HARD: vmov r0, s0
+  %1 = fptoui float %a to i32
+  ret i32 %1
+}
+
+define i32 @d_to_ui(double %a) {
+; CHECK-LABEL: d_to_ui:
+; NONE: bl __aeabi_d2uiz
+; SP: vmov r0, r1, d0
+; SP: bl __aeabi_d2uiz
+; DP: vcvt.u32.f64 s0, d0
+; DP: vmov r0, s0
+  %1 = fptoui double %a to i32
+  ret i32 %1
+}
+
+define float @si_to_f(i32 %a) {
+; CHECK-LABEL: si_to_f:
+; NONE: bl __aeabi_i2f
+; HARD: vcvt.f32.s32 s0, s0
+  %1 = sitofp i32 %a to float
+  ret float %1
+}
+
+define double @si_to_d(i32 %a) {
+; CHECK-LABEL: si_to_d:
+; NONE: bl __aeabi_i2d
+; SP: bl __aeabi_i2d
+; DP: vcvt.f64.s32 d0, s0
+  %1 = sitofp i32 %a to double
+  ret double %1
+}
+
+define float @ui_to_f(i32 %a) {
+; CHECK-LABEL: ui_to_f:
+; NONE: bl __aeabi_ui2f
+; HARD: vcvt.f32.u32 s0, s0
+  %1 = uitofp i32 %a to float
+  ret float %1
+}
+
+define double @ui_to_d(i32 %a) {
+; CHECK-LABEL: ui_to_d:
+; NONE: bl __aeabi_ui2d
+; SP: bl __aeabi_ui2d
+; DP: vcvt.f64.u32 d0, s0
+  %1 = uitofp i32 %a to double
+  ret double %1
+}
+
+define float @bitcast_i_to_f(i32 %a) {
+; CHECK-LABEL: bitcast_i_to_f:
+; NONE-NOT: mov
+; HARD: vmov s0, r0
+  %1 = bitcast i32 %a to float
+  ret float %1
+}
+
+define double @bitcast_i_to_d(i64 %a) {
+; CHECK-LABEL: bitcast_i_to_d:
+; NONE-NOT: mov
+; HARD: vmov d0, r0, r1
+  %1 = bitcast i64 %a to double
+  ret double %1
+}
+
+define i32 @bitcast_f_to_i(float %a) {
+; CHECK-LABEL: bitcast_f_to_i:
+; NONE-NOT: mov
+; HARD: vmov r0, s0
+  %1 = bitcast float %a to i32
+  ret i32 %1
+}
+
+define i64 @bitcast_d_to_i(double %a) {
+; CHECK-LABEL: bitcast_d_to_i:
+; NONE-NOT: mov
+; HARD: vmov r0, r1, d0
+  %1 = bitcast double %a to i64
+  ret i64 %1
+}
+
+define float @select_f(float %a, float %b, i1 %c) {
+; CHECK-LABEL: select_f:
+; NONE: tst.w   r2, #1
+; NONE: moveq   r0, r1
+; HARD: tst.w   r0, #1
+; VFP4-ALL: vmovne.f32      s1, s0
+; VFP4-ALL: vmov.f32        s0, s1
+; FP-ARMv8: vseleq.f32 s0, s1, s0
+  %1 = select i1 %c, float %a, float %b
+  ret float %1
+}
+
+define double @select_d(double %a, double %b, i1 %c) {
+; CHECK-LABEL: select_d:
+; NONE: ldr.w   [[REG:r[0-9]+]], [sp]
+; NONE: ands    [[REG]], [[REG]], #1
+; NONE: moveq   r0, r2
+; NONE: moveq   r1, r3
+; SP: ands r0, r0, #1
+; SP-DAG: vmov [[ALO:r[0-9]+]], [[AHI:r[0-9]+]], d0
+; SP-DAG: vmov [[BLO:r[0-9]+]], [[BHI:r[0-9]+]], d1
+; SP: itt ne
+; SP-DAG: movne [[BLO]], [[ALO]]
+; SP-DAG: movne [[BHI]], [[AHI]]
+; SP: vmov d0, [[BLO]], [[BHI]]
+; DP: tst.w   r0, #1
+; VFP4-DP: vmovne.f64      d1, d0
+; VFP4-DP: vmov.f64        d0, d1
+; FP-ARMv8: vseleq.f64      d0, d1, d0
+  %1 = select i1 %c, double %a, double %b
+  ret double %1
+}

diff --git a/test/CodeGen/Thumb2/stack_guard_remat.ll b/test/CodeGen/Thumb2/stack_guard_remat.ll
new file mode 100644
index 0000000..c8ea871
--- /dev/null
+++ b/test/CodeGen/Thumb2/stack_guard_remat.ll

@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s  -check-prefix=DYNAMIC-NO-PIC
+
+;PIC:   foo2
+;PIC:   movw  [[R0:r[0-9]+]], :lower16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0:LPC[0-9_]+]]+4))
+;PIC:   movt  [[R0]], :upper16:(L___stack_chk_guard$non_lazy_ptr-([[LABEL0]]+4))
+;PIC: [[LABEL0]]:
+;PIC:   add [[R0]], pc
+;PIC:   ldr [[R1:r[0-9]+]], {{\[}}[[R0]]{{\]}}
+;PIC:   ldr {{r[0-9]+}}, {{\[}}[[R1]]{{\]}}
+
+;STATIC:   foo2
+;STATIC:   movw  [[R0:r[0-9]+]], :lower16:___stack_chk_guard
+;STATIC:   movt  [[R0]], :upper16:___stack_chk_guard
+;STATIC:   ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+;DYNAMIC-NO-PIC:   foo2
+;DYNAMIC-NO-PIC:   movw  [[R0:r[0-9]+]], :lower16:L___stack_chk_guard$non_lazy_ptr
+;DYNAMIC-NO-PIC:   movt  [[R0]], :upper16:L___stack_chk_guard$non_lazy_ptr
+;DYNAMIC-NO-PIC:   ldr {{r[0-9]+}}, {{\[}}[[R0]]{{\]}}
+
+; Function Attrs: nounwind ssp
+define i32 @test_stack_guard_remat() #0 {
+  %a1 = alloca [256 x i32], align 4
+  %1 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %1)
+  %2 = getelementptr inbounds [256 x i32]* %a1, i32 0, i32 0
+  call void @foo3(i32* %2) #3
+  call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"()
+  call void @llvm.lifetime.end(i64 1024, i8* %1)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
index cef3490..02a8c47 100644
--- a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll

@@ -2,15 +2,15 @@
 ; RUN:  | FileCheck %s
 
 define i32 @test0(i8 %A) {
-; CHECK: test0
+; CHECK-LABEL: test0:
 ; CHECK: sxtb r0, r0
         %B = sext i8 %A to i32
 	ret i32 %B
 }
 
 define signext i8 @test1(i32 %A)  {
-; CHECK: test1
-; CHECK: sxtb.w r0, r0, ror #8
+; CHECK-LABEL: test1:
+; CHECK: sbfx r0, r0, #8, #8
 	%B = lshr i32 %A, 8
 	%C = shl i32 %A, 24
 	%D = or i32 %B, %C
@@ -19,9 +19,8 @@
 }
 
 define signext i32 @test2(i32 %A, i32 %X)  {
-; CHECK: test2
-; CHECK: lsrs r0, r0, #8
-; CHECK: sxtab  r0, r1, r0
+; CHECK-LABEL: test2:
+; CHECK: sxtab  r0, r1, r0, ror #8
 	%B = lshr i32 %A, 8
 	%C = shl i32 %A, 24
 	%D = or i32 %B, %C
@@ -30,3 +29,14 @@
         %G = add i32 %F, %X
 	ret i32 %G
 }
+
+define i32 @test3(i32 %A, i32 %X) {
+; CHECK-LABEL: test3:
+; CHECK: sxtah r0, r0, r1, ror #8
+  %X.hi = lshr i32 %X, 8
+  %X.trunc = trunc i32 %X.hi to i16
+  %addend = sext i16 %X.trunc to i32
+
+  %sum = add i32 %A, %addend
+  ret i32 %sum
+}

diff --git a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
index bcd4a0f..4afea89 100644
--- a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll

@@ -24,8 +24,8 @@
 }
 
 define zeroext i32 @test3(i32 %A.u)  {
-; A8: test3
-; A8: uxth.w r0, r0, ror #8
+; A8-LABEL: test3:
+; A8: ubfx  r0, r0, #8, #16
     %B.u = lshr i32 %A.u, 8
     %C.u = shl i32 %A.u, 24
     %D.u = or i32 %B.u, %C.u
@@ -33,3 +33,25 @@
     %F.u = zext i16 %E.u to i32
     ret i32 %F.u
 }
+
+define i32 @test4(i32 %A, i32 %X) {
+; A8-LABEL: test4:
+; A8: uxtab r0, r0, r1, ror #16
+  %X.hi = lshr i32 %X, 16
+  %X.trunc = trunc i32 %X.hi to i8
+  %addend = zext i8 %X.trunc to i32
+
+  %sum = add i32 %A, %addend
+  ret i32 %sum
+}
+
+define i32 @test5(i32 %A, i32 %X) {
+; A8-LABEL: test5:
+; A8: uxtah r0, r0, r1, ror #8
+  %X.hi = lshr i32 %X, 8
+  %X.trunc = trunc i32 %X.hi to i16
+  %addend = zext i16 %X.trunc to i32
+
+  %sum = add i32 %A, %addend
+  ret i32 %sum
+}

diff --git a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
index 638d399..62c503d 100644
--- a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
+++ b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll

@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s
 
-@__gthrw_pthread_once = alias weak i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
+@__gthrw_pthread_once = weak alias i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
 
 define weak i32 @pthread_once(i32*, void ()*) {
   ret i32 0

diff --git a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
index d2d5149..35857b7 100644
--- a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
+++ b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll

@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep xor | grep CPI
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 
+; CHECK: xorpd {{.*}}{{LCPI0_0|__xmm@}}
 define void @casin({ double, double }* sret  %agg.result, double %z.0, double %z.1) nounwind  {
 entry:
 	%memtmp = alloca { double, double }, align 8		; <{ double, double }*> [#uses=3]

diff --git a/test/CodeGen/X86/2008-06-18-BadShuffle.ll b/test/CodeGen/X86/2008-06-18-BadShuffle.ll
deleted file mode 100644
index 66f9065..0000000
--- a/test/CodeGen/X86/2008-06-18-BadShuffle.ll
+++ /dev/null

@@ -1,10 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 -mattr=+sse2 | grep pinsrw
-
-; Test to make sure we actually insert the bottom element of the vector
-define <8 x i16> @a(<8 x i16> %a) nounwind  {
-entry:
-	shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> < i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 >
-	%add = add <8 x i16> %0, %a
-	ret <8 x i16> %add
-}
-

diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 296f0ca..207d122 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll

@@ -14,9 +14,9 @@
   %2 = alloca i64                                 ; <i64*> [#uses=1]
   %3 = alloca i64                                 ; <i64*> [#uses=6]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0), !dbg !7
+  call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !7
   store i8* %s1, i8** %s1_addr
-  call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8), !dbg !7
+  call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8, metadata !{metadata !"0x102"}), !dbg !7
   %4 = call i8* @llvm.stacksave(), !dbg !7        ; <i8*> [#uses=1]
   store i8* %4, i8** %saved_stack.1, align 8, !dbg !7
   %5 = load i8** %s1_addr, align 8, !dbg !13      ; <i8*> [#uses=1]
@@ -58,7 +58,7 @@
   ret i8 %retval12, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i8* @llvm.stacksave() nounwind
 
@@ -66,21 +66,21 @@
 
 declare void @llvm.stackrestore(i8*) nounwind
 
-!0 = metadata !{i32 459009, metadata !1, metadata !"s1", metadata !2, i32 2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 458798, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 458769, metadata !17, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 458773, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00s1\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", i32 0, metadata !2, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !6}
-!5 = metadata !{i32 458788, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
+!5 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !5} ; [ DW_TAG_pointer_type ]
 !7 = metadata !{i32 2, i32 0, metadata !1, null}
-!8 = metadata !{i32 459008, metadata !1, metadata !"str.0", metadata !2, i32 3, metadata !9} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 458753, null, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!8 = metadata !{metadata !"0x100\00str.0\003\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", null, metadata !2, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x1\00\000\008\008\000\000", null, metadata !2, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 458785, i64 0, i64 1}        ; [ DW_TAG_subrange_type ]
+!12 = metadata !{metadata !"0x21\000\001"}        ; [ DW_TAG_subrange_type ]
 !13 = metadata !{i32 3, i32 0, metadata !14, null}
-!14 = metadata !{i32 458763, metadata !17, metadata !1, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\000\000\000", metadata !17, metadata !1} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 4, i32 0, metadata !14, null}
 !16 = metadata !{i32 5, i32 0, metadata !14, null}
 !17 = metadata !{metadata !"vla.c", metadata !"/tmp/"}

diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 764c2cd..e046b96 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll

@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "7 machine-licm"
 ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037

diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
deleted file mode 100644
index e1930e0..0000000
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ /dev/null

@@ -1,30 +0,0 @@
-; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN:     -mcpu=generic -disable-fp-elim -mattr=-sse4.1,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
-; RUN:   FileCheck %s
-; rdar://6808032
-
-; CHECK: pextrw $14
-; CHECK-NEXT: shrl $8
-; CHECK-NEXT: pinsrw
-
-define void @update(i8** %args_list) nounwind {
-entry:
-	%cmp.i = icmp eq i32 0, 0		; <i1> [#uses=1]
-	br i1 %cmp.i, label %if.then.i, label %test_cl.exit
-
-if.then.i:		; preds = %entry
-	%val = load <16 x i8> addrspace(1)* null		; <<16 x i8>> [#uses=8]
-	%tmp10.i = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 undef, i8 0, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef>, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef, i32 29, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp17.i = shufflevector <16 x i8> %tmp10.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 18, i32 4, i32 undef, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp24.i = shufflevector <16 x i8> %tmp17.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 24, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp31.i = shufflevector <16 x i8> %tmp24.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 21, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp38.i = shufflevector <16 x i8> %tmp31.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 27, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp45.i = shufflevector <16 x i8> %tmp38.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, i32 29, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp52.i = shufflevector <16 x i8> %tmp45.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 21, i32 10, i32 11, i32 12, i32 13, i32 14, i32 undef>		; <<16 x i8>> [#uses=1]
-	%tmp59.i = shufflevector <16 x i8> %tmp52.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 20>		; <<16 x i8>> [#uses=1]
-	store <16 x i8> %tmp59.i, <16 x i8> addrspace(1)* null
-	ret void
-
-test_cl.exit:		; preds = %entry
-	ret void
-}

diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
index 50c62df..ffbe02c 100644
--- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
+++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll

@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 > %t1
-; RUN: grep movzwl %t1 | count 2
-; RUN: grep movzbl %t1 | count 1
-; RUN: grep movd %t1 | count 4
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
 
 define <4 x i16> @a(i32* %x1) nounwind {
+; CHECK-LABEL: a:
+; CHECK:         shrl %[[R:[^,]+]]
+; CHECK-NEXT:    movd %[[R]], %xmm0
+; CHECK-NEXT:    retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i16
@@ -12,6 +14,12 @@
 }
 
 define <8 x i16> @b(i32* %x1) nounwind {
+; CHECK-LABEL: b:
+; CHECK:         shrl %e[[R:.]]x
+; CHECK-NEXT:    movzwl %[[R]]x, %e[[R]]x
+; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK-NEXT:    retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i16
@@ -20,6 +28,12 @@
 }
 
 define <8 x i8> @c(i32* %x1) nounwind {
+; CHECK-LABEL: c:
+; CHECK:         shrl %e[[R:.]]x
+; CHECK-NEXT:    movzwl %[[R]]x, %e[[R]]x
+; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK-NEXT:    retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i8
@@ -28,6 +42,12 @@
 }
 
 define <16 x i8> @d(i32* %x1) nounwind {
+; CHECK-LABEL: d:
+; CHECK:         shrl %e[[R:.]]x
+; CHECK-NEXT:    movzbl %[[R]]l, %e[[R]]x
+; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK-NEXT:    retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i8

diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index a936edc..6fe2ee4 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll

@@ -9,7 +9,7 @@
   br label %do.body, !dbg !0
 
 do.body:                                          ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4)
+  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4, metadata !{metadata !"0x102"})
   %conv = ptrtoint i32* %count_ to i32, !dbg !0   ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0   ; <i32> [#uses=0]
   br label %do.end, !dbg !0
@@ -18,17 +18,17 @@
   ret void, !dbg !7
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @foo(i32) ssp
 
 !0 = metadata !{i32 5, i32 2, metadata !1, null}
-!1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
-!4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
-!5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!6 = metadata !{i32 458788, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}; [DW_TAG_base_type ]
+!1 = metadata !{metadata !"0xb\001\001\000", null, metadata !2}; [DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, metadata !3, null, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !8, null, metadata !9, null, null, null}; [DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x100\00count_\005\000", metadata !5, metadata !3, metadata !6}; [ DW_TAG_auto_variable ]
+!5 = metadata !{metadata !"0xb\001\001\000", null, metadata !1}; [DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3}; [DW_TAG_base_type ]
 !7 = metadata !{i32 6, i32 1, metadata !2, null}
 !8 = metadata !{metadata !"genmodes.i", metadata !"/Users/yash/Downloads"}
 !9 = metadata !{i32 0}

diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index f99e682..0e2ed9d 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll

@@ -12,7 +12,7 @@
   %retval = alloca double                         ; <double*> [#uses=2]
   %0 = alloca double                              ; <double*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0), !dbg !15
+  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0, metadata !{metadata !"0x102"}), !dbg !15
   %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
   %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
   %3 = load double* %2, align 8, !dbg !16         ; <double> [#uses=1]
@@ -26,30 +26,30 @@
   ret double %retval1, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00my_r0\0011\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\0011", metadata !19, metadata !2, metadata !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !19, metadata !20, metadata !20, metadata !18, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !7}
-!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !19, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x13\00Rect\006\00256\0064\000\000\000", metadata !19, metadata !2, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0xd\00P1\007\00128\0064\000\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x13\00Pt\001\00128\0064\000\000\000", metadata !19, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
 !11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!14 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P2", i32 8, i64 128, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0xd\00x\002\0064\0064\000\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
+!13 = metadata !{metadata !"0xd\00y\003\0064\0064\0064\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
+!14 = metadata !{metadata !"0xd\00P2\008\00128\0064\00128\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
 !15 = metadata !{i32 11, i32 0, metadata !1, null}
 !16 = metadata !{i32 12, i32 0, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !1, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!17 = metadata !{metadata !"0xb\0011\000\000", metadata !19, metadata !1} ; [ DW_TAG_lexical_block ]
 !18 = metadata !{metadata !1}
 !19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
 !20 = metadata !{i32 0}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index 4d4e8c1..a35efdc 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll

@@ -8,28 +8,28 @@
 
 define i32 @"main(tart.core.String[])->int32"(i32 %args) {
 entry:
-  tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8)
+  tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8, metadata !{metadata !"0x102"})
   tail call void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType* @.type.SwitchStmtTest) ; <%tart.core.Object*> [#uses=2]
   ret i32 3
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType*) nounwind readnone
 
-!0 = metadata !{i32 458769, metadata !15, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, null, null, null, i32 0} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 458790, metadata !15, metadata !0, metadata !"", i32 0, i64 192, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_const_type ]
-!2 = metadata !{i32 458771, metadata !15, metadata !0, metadata !"C", i32 1, i64 192, i64 64, i64 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
+!0 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !15, metadata !16, metadata !16, null, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x26\00\000\00192\0064\000\000", metadata !15, metadata !0, metadata !2} ; [ DW_TAG_const_type ]
+!2 = metadata !{metadata !"0x13\00C\001\00192\0064\000\000\000", metadata !15, metadata !0, null, metadata !3, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
 !3 = metadata !{metadata !4, metadata !6, metadata !7}
-!4 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"x", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
-!5 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"y", i32 1, i64 64, i64 64, i64 64, i32 0, metadata !5} ; [ DW_TAG_member ]
-!7 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"z", i32 1, i64 64, i64 64, i64 128, i32 0, metadata !5} ; [ DW_TAG_member ]
-!8 = metadata !{i32 459008, metadata !9, metadata !"t", metadata !0, i32 5, metadata !2} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 458763, null, metadata !10, i32 0, i32 0, i32 0}        ; [ DW_TAG_lexical_block ]
-!10 = metadata !{i32 458798, i32 0, metadata !0, metadata !"foo", metadata !"foo", metadata !"foo", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 458773, metadata !15, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0xd\00x\001\0064\0064\000\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
+!5 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !15, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0xd\00y\001\0064\0064\0064\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
+!7 = metadata !{metadata !"0xd\00z\001\0064\0064\00128\000", metadata !15, metadata !2, metadata !5} ; [ DW_TAG_member ]
+!8 = metadata !{metadata !"0x100\00t\005\000", metadata !9, metadata !0, metadata !2} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0xb\000\000\000", null, metadata !10}        ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, metadata !0, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !15, metadata !0, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !15, metadata !0} ; [ DW_TAG_base_type ]
 !14 = metadata !{%tart.reflect.ComplexType* @.type.SwitchStmtTest}
 !15 = metadata !{metadata !"sm.c", metadata !""}
 !16 = metadata !{i32 0}

diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
index 5372bc5..60025bf 100644
--- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
+++ b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll

@@ -7,7 +7,7 @@
         %tmp1 = bitcast double %a to <8 x i8>
         %tmp2 = bitcast double %b to <8 x i8>
         %tmp3 = add <8 x i8> %tmp1, %tmp2
-; CHECK:  paddw
+; CHECK:  paddb
         store <8 x i8> %tmp3, <8 x i8>* null
         ret void
 }
@@ -18,7 +18,7 @@
         %tmp1 = bitcast double %a to <4 x i16>
         %tmp2 = bitcast double %b to <4 x i16>
         %tmp3 = add <4 x i16> %tmp1, %tmp2
-; CHECK:  paddd
+; CHECK:  paddw
         store <4 x i16> %tmp3, <4 x i16>* null
         ret void
 }
@@ -29,7 +29,7 @@
         %tmp1 = bitcast double %a to <2 x i32>
         %tmp2 = bitcast double %b to <2 x i32>
         %tmp3 = add <2 x i32> %tmp1, %tmp2
-; CHECK:  paddq
+; CHECK:  paddd
         store <2 x i32> %tmp3, <2 x i32>* null
         ret void
 }

diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 7faee99..1998011 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll

@@ -2,8 +2,7 @@
 ; RUN: llc -mtriple=x86_64-pc-linux -O2 -regalloc=basic < %s | FileCheck %s
 ; Test to check .debug_loc support. This test case emits many debug_loc entries.
 
-; CHECK: Loc expr size
-; CHECK-NEXT: .short
+; CHECK: .short {{.*}} # Loc expr size
 ; CHECK-NEXT: .Ltmp
 ; CHECK-NEXT: DW_OP_reg
 
@@ -11,10 +10,10 @@
 
 define hidden %0 @__divsc3(float %a, float %b, float %c, float %d) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0)
-  tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11)
-  tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12)
-  tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13)
+  tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13, metadata !{metadata !"0x102"})
   %0 = tail call float @fabsf(float %c) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %1 = tail call float @fabsf(float %d) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %2 = fcmp olt float %0, %1, !dbg !19            ; <i1> [#uses=1]
@@ -22,34 +21,34 @@
 
 bb:                                               ; preds = %entry
   %3 = fdiv float %c, %d, !dbg !20                ; <float> [#uses=3]
-  tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !20
   %4 = fmul float %3, %c, !dbg !21                ; <float> [#uses=1]
   %5 = fadd float %4, %d, !dbg !21                ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !21
   %6 = fmul float %3, %a, !dbg !22                ; <float> [#uses=1]
   %7 = fadd float %6, %b, !dbg !22                ; <float> [#uses=1]
   %8 = fdiv float %7, %5, !dbg !22                ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17), !dbg !22
+  tail call void @llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !22
   %9 = fmul float %3, %b, !dbg !23                ; <float> [#uses=1]
   %10 = fsub float %9, %a, !dbg !23               ; <float> [#uses=1]
   %11 = fdiv float %10, %5, !dbg !23              ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18), !dbg !23
+  tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !23
   br label %bb2, !dbg !23
 
 bb1:                                              ; preds = %entry
   %12 = fdiv float %d, %c, !dbg !24               ; <float> [#uses=3]
-  tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16), !dbg !24
+  tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !24
   %13 = fmul float %12, %d, !dbg !25              ; <float> [#uses=1]
   %14 = fadd float %13, %c, !dbg !25              ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !25
   %15 = fmul float %12, %b, !dbg !26              ; <float> [#uses=1]
   %16 = fadd float %15, %a, !dbg !26              ; <float> [#uses=1]
   %17 = fdiv float %16, %14, !dbg !26             ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !26
   %18 = fmul float %12, %a, !dbg !27              ; <float> [#uses=1]
   %19 = fsub float %b, %18, !dbg !27              ; <float> [#uses=1]
   %20 = fdiv float %19, %14, !dbg !27             ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18), !dbg !27
+  tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !27
   br label %bb2, !dbg !27
 
 bb2:                                              ; preds = %bb1, %bb
@@ -75,9 +74,9 @@
 bb8:                                              ; preds = %bb6
   %27 = tail call float @copysignf(float 0x7FF0000000000000, float %c) nounwind readnone, !dbg !30 ; <float> [#uses=2]
   %28 = fmul float %27, %a, !dbg !30              ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !30
   %29 = fmul float %27, %b, !dbg !31              ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18), !dbg !31
+  tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !31
   br label %bb46, !dbg !31
 
 bb9:                                              ; preds = %bb6, %bb4
@@ -107,24 +106,24 @@
 bb16:                                             ; preds = %bb15
   %iftmp.0.0 = select i1 %33, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %42 = tail call float @copysignf(float %iftmp.0.0, float %a) nounwind readnone, !dbg !33 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0), !dbg !33
+  tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0, metadata !{metadata !"0x102"}), !dbg !33
   %43 = fcmp ord float %b, 0.000000e+00           ; <i1> [#uses=1]
   %44 = fsub float %b, %b, !dbg !34               ; <float> [#uses=1]
   %45 = fcmp uno float %44, 0.000000e+00          ; <i1> [#uses=1]
   %46 = and i1 %43, %45, !dbg !34                 ; <i1> [#uses=1]
   %iftmp.1.0 = select i1 %46, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %47 = tail call float @copysignf(float %iftmp.1.0, float %b) nounwind readnone, !dbg !34 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11), !dbg !34
+  tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !34
   %48 = fmul float %42, %c, !dbg !35              ; <float> [#uses=1]
   %49 = fmul float %47, %d, !dbg !35              ; <float> [#uses=1]
   %50 = fadd float %48, %49, !dbg !35             ; <float> [#uses=1]
   %51 = fmul float %50, 0x7FF0000000000000, !dbg !35 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17), !dbg !35
+  tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !35
   %52 = fmul float %47, %c, !dbg !36              ; <float> [#uses=1]
   %53 = fmul float %42, %d, !dbg !36              ; <float> [#uses=1]
   %54 = fsub float %52, %53, !dbg !36             ; <float> [#uses=1]
   %55 = fmul float %54, 0x7FF0000000000000, !dbg !36 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !36
   br label %bb46, !dbg !36
 
 bb27:                                             ; preds = %bb15, %bb14, %bb11
@@ -155,24 +154,24 @@
 bb35:                                             ; preds = %bb34
   %iftmp.2.0 = select i1 %59, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %67 = tail call float @copysignf(float %iftmp.2.0, float %c) nounwind readnone, !dbg !38 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !38
   %68 = fcmp ord float %d, 0.000000e+00           ; <i1> [#uses=1]
   %69 = fsub float %d, %d, !dbg !39               ; <float> [#uses=1]
   %70 = fcmp uno float %69, 0.000000e+00          ; <i1> [#uses=1]
   %71 = and i1 %68, %70, !dbg !39                 ; <i1> [#uses=1]
   %iftmp.3.0 = select i1 %71, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %72 = tail call float @copysignf(float %iftmp.3.0, float %d) nounwind readnone, !dbg !39 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !39
   %73 = fmul float %67, %a, !dbg !40              ; <float> [#uses=1]
   %74 = fmul float %72, %b, !dbg !40              ; <float> [#uses=1]
   %75 = fadd float %73, %74, !dbg !40             ; <float> [#uses=1]
   %76 = fmul float %75, 0.000000e+00, !dbg !40    ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17), !dbg !40
+  tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !40
   %77 = fmul float %67, %b, !dbg !41              ; <float> [#uses=1]
   %78 = fmul float %72, %a, !dbg !41              ; <float> [#uses=1]
   %79 = fsub float %77, %78, !dbg !41             ; <float> [#uses=1]
   %80 = fmul float %79, 0.000000e+00, !dbg !41    ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18), !dbg !41
+  tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !41
   br label %bb46, !dbg !41
 
 bb46:                                             ; preds = %bb35, %bb34, %bb33, %bb30, %bb16, %bb8, %bb2
@@ -196,30 +195,30 @@
 
 declare float @copysignf(float, float) nounwind readnone
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!48}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !45, metadata !2, metadata !"__divsc3", metadata !"__divsc3", metadata !"__divsc3", i32 1922, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43, i32 1922} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !45} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !45, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !44, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !45, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00a\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00__divsc3\00__divsc3\00__divsc3\001922\000\001\000\006\000\001\001922", metadata !45, metadata !2, metadata !4, null, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !45} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !45, metadata !47, metadata !47, metadata !44, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !45, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !9, metadata !9, metadata !9, metadata !9}
-!6 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SCtype", i32 170, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
-!8 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"complex float", i32 0, i64 64, i64 32, i64 0, i32 0, i32 3} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SFtype", i32 167, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
-!10 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786689, metadata !1, metadata !"b", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 786689, metadata !1, metadata !"d", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{i32 786688, metadata !15, metadata !"denom", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !45, metadata !1, i32 1922, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786688, metadata !15, metadata !"ratio", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!17 = metadata !{i32 786688, metadata !15, metadata !"x", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786688, metadata !15, metadata !"y", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{metadata !"0x16\00SCtype\00170\000\000\000\000", metadata !46, metadata !7, metadata !8} ; [ DW_TAG_typedef ]
+!7 = metadata !{metadata !"0x29", metadata !46} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x24\00complex float\000\0064\0032\000\000\003", metadata !45, metadata !2} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x16\00SFtype\00167\000\000\000\000", metadata !46, metadata !7, metadata !10} ; [ DW_TAG_typedef ]
+!10 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", metadata !45, metadata !2} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0x101\00b\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{metadata !"0x101\00c\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!13 = metadata !{metadata !"0x101\00d\001921\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!14 = metadata !{metadata !"0x100\00denom\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0xb\001922\000\000", metadata !45, metadata !1} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x100\00ratio\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!17 = metadata !{metadata !"0x100\00x\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!18 = metadata !{metadata !"0x100\00y\001923\000", metadata !15, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
 !19 = metadata !{i32 1929, i32 0, metadata !15, null}
 !20 = metadata !{i32 1931, i32 0, metadata !15, null}
 !21 = metadata !{i32 1932, i32 0, metadata !15, null}
@@ -249,4 +248,4 @@
 !45 = metadata !{metadata !"libgcc2.c", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
 !46 = metadata !{metadata !"libgcc2.h", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
 !47 = metadata !{i32 0}
-!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index e11b538..09120a1 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll

@@ -9,7 +9,7 @@
 
 define i8* @bar(%struct.a* %myvar) nounwind optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8)
+  tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8, metadata !{metadata !"0x102"})
   %0 = getelementptr inbounds %struct.a* %myvar, i64 0, i32 0, !dbg !28 ; <i32*> [#uses=1]
   %1 = load i32* %0, align 8, !dbg !28            ; <i32> [#uses=1]
   tail call void @foo(i32 %1) nounwind optsize noinline ssp, !dbg !28
@@ -19,41 +19,41 @@
 
 declare void @foo(i32) nounwind optsize noinline ssp
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{i32 786484, i32 0, metadata !1, metadata !"ret", metadata !"ret", metadata !"", metadata !1, i32 7, metadata !3, i1 false, i1 true, null, null} ; [ DW_TAG_variable ]
-!1 = metadata !{i32 786473, metadata !36} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !36, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !37, metadata !37, metadata !32, metadata !31,  metadata !37, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!4 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !1, i32 12, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
-!5 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void (i32)* @foo, null, null, metadata !33, i32 13} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x34\00ret\00ret\00\007\000\001", metadata !1, metadata !1, metadata !3, null, null} ; [ DW_TAG_variable ]
+!1 = metadata !{metadata !"0x29", metadata !36} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !36, metadata !37, metadata !37, metadata !32, metadata !31,  metadata !37} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !36, metadata !1} ; [ DW_TAG_base_type ]
+!4 = metadata !{metadata !"0x101\00x\0012\000", metadata !5, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0013\000\001\000\006\000\001\0013", metadata !36, metadata !1, metadata !6, null, void (i32)* @foo, null, null, metadata !33} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !3}
-!8 = metadata !{i32 786689, metadata !9, metadata !"myvar", metadata !1, i32 17, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 17, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i8* (%struct.a*)* @bar, null, null, metadata !34, i32 17} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x101\00myvar\0017\000", metadata !9, metadata !1, metadata !13} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x2e\00bar\00bar\00bar\0017\000\001\000\006\000\001\0017", metadata !36, metadata !1, metadata !10, null, i8* (%struct.a*)* @bar, null, null, metadata !34} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786451, metadata !36, metadata !1, metadata !"a", i32 2, i64 128, i64 64, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, null} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !14} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0x13\00a\002\00128\0064\000\000\000", metadata !36, metadata !1, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
 !15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"c", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !3} ; [ DW_TAG_member ]
-!17 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"d", i32 4, i64 64, i64 64, i64 64, i32 0, metadata !13} ; [ DW_TAG_member ]
-!18 = metadata !{i32 786689, metadata !19, metadata !"argc", metadata !1, i32 22, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 22, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, metadata !35, i32 22} ; [ DW_TAG_subprogram ]
-!20 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0xd\00c\003\0032\0032\000\000", metadata !36, metadata !14, metadata !3} ; [ DW_TAG_member ]
+!17 = metadata !{metadata !"0xd\00d\004\0064\0064\0064\000", metadata !36, metadata !14, metadata !13} ; [ DW_TAG_member ]
+!18 = metadata !{metadata !"0x101\00argc\0022\000", metadata !19, metadata !1, metadata !3} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x2e\00main\00main\00main\0022\000\001\000\006\000\001\0022", metadata !36, metadata !1, metadata !20, null, null, null, null, metadata !35} ; [ DW_TAG_subprogram ]
+!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !36, metadata !1, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !21 = metadata !{metadata !3, metadata !3, metadata !22}
-!22 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ]
-!24 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!25 = metadata !{i32 786689, metadata !19, metadata !"argv", metadata !1, i32 22, metadata !22, i32 0, null} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{i32 786688, metadata !27, metadata !"e", metadata !1, i32 23, metadata !14, i32 0, null} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{i32 786443, metadata !36, metadata !19, i32 22, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !36, metadata !1, metadata !24} ; [ DW_TAG_pointer_type ]
+!24 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !36, metadata !1} ; [ DW_TAG_base_type ]
+!25 = metadata !{metadata !"0x101\00argv\0022\000", metadata !19, metadata !1, metadata !22} ; [ DW_TAG_arg_variable ]
+!26 = metadata !{metadata !"0x100\00e\0023\000", metadata !27, metadata !1, metadata !14} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{metadata !"0xb\0022\000\000", metadata !36, metadata !19} ; [ DW_TAG_lexical_block ]
 !28 = metadata !{i32 18, i32 0, metadata !29, null}
-!29 = metadata !{i32 786443, metadata !36, metadata !9, i32 17, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
+!29 = metadata !{metadata !"0xb\0017\000\001", metadata !36, metadata !9} ; [ DW_TAG_lexical_block ]
 !30 = metadata !{i32 19, i32 0, metadata !29, null}
 !31 = metadata !{metadata !0}
 !32 = metadata !{metadata !5, metadata !9, metadata !19}
@@ -73,18 +73,22 @@
 
 
 ; CHECK: Ldebug_loc0:
-; CHECK-NEXT: .quad   Lfunc_begin0
-; CHECK-NEXT: .quad   [[LABEL]]
+; CHECK-NEXT: [[SET1:.*]] = Lfunc_begin0-Lfunc_begin0
+; CHECK-NEXT: .quad   [[SET1]]
+; CHECK-NEXT: [[SET2:.*]] = [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: .quad   [[SET2]]
 ; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}               ## Loc expr size
 ; CHECK-NEXT: .short  Lset{{.*}}
 ; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   85
 ; CHECK-NEXT: Ltmp{{.*}}:
-; CHECK-NEXT: .quad   [[LABEL]]
-; CHECK-NEXT: .quad   [[CLOBBER]]
+; CHECK-NEXT: [[SET3:.*]] = [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: .quad   [[SET3]]
+; CHECK-NEXT: [[SET4:.*]] = [[CLOBBER]]-Lfunc_begin0
+; CHECK-NEXT: .quad   [[SET4]]
 ; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}               ## Loc expr size
 ; CHECK-NEXT: .short  Lset{{.*}}
 ; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   83
 ; CHECK-NEXT: Ltmp{{.*}}:
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index 1114c8d..b0a4e8d 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll

@@ -4,19 +4,19 @@
 
 define i32 @foo(i32 %y) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0)
+  tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0, metadata !{metadata !"0x102"})
   %0 = tail call i32 (...)* @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1]
   ret i32 %0, !dbg !9
 }
 
 declare i32 @zoo(...)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define i32 @bar(i32 %x) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7)
-  tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0) nounwind
+  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0, metadata !{metadata !"0x102"}) nounwind
   %0 = tail call i32 (...)* @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1]
   %1 = add nsw i32 %0, %x, !dbg !13               ; <i32> [#uses=1]
   ret i32 %1, !dbg !13
@@ -25,21 +25,21 @@
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 2, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, metadata !15, i32 2} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00y\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\002", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @foo, null, null, metadata !15} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !17, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !6}
-!6 = metadata !{i32 786468, metadata !18, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !2, i32 6, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"bar", metadata !"bar", metadata !"bar", i32 6, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @bar, null, null, metadata !16, i32 6} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !18, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x101\00x\006\000", metadata !8, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0x2e\00bar\00bar\00bar\006\000\001\000\006\000\001\006", metadata !18, metadata !2, metadata !4, null, i32 (i32)* @bar, null, null, metadata !16} ; [ DW_TAG_subprogram ]
 !9 = metadata !{i32 3, i32 0, metadata !10, null}
-!10 = metadata !{i32 786443, metadata !18, metadata !1, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0xb\002\000\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{i32 1}
 !12 = metadata !{i32 3, i32 0, metadata !10, metadata !13}
 !13 = metadata !{i32 7, i32 0, metadata !14, null}
-!14 = metadata !{i32 786443, metadata !18, metadata !8, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\006\000\000", metadata !18, metadata !8} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{metadata !0}
 !16 = metadata !{metadata !7}
 !17 = metadata !{metadata !1, metadata !8}
@@ -49,4 +49,4 @@
 ;CHECK: DEBUG_VALUE: bar:x <- E
 ;CHECK: Ltmp
 ;CHECK:	DEBUG_VALUE: foo:y <- 1{{$}}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index 4181c26..dea9162 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll

@@ -10,51 +10,51 @@
 define i32 @_ZN3foo3bazEi(%struct.foo* nocapture %this, i32 %x) nounwind readnone optsize noinline ssp align 2 {
 ;CHECK: DEBUG_VALUE: baz:this <- RDI{{$}}
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15)
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16)
+  tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15, metadata !{metadata !"0x102"})
+  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16, metadata !{metadata !"0x102"})
   %0 = mul nsw i32 %x, 7, !dbg !29                ; <i32> [#uses=1]
   %1 = add nsw i32 %0, 1, !dbg !29                ; <i32> [#uses=1]
   ret i32 %1, !dbg !29
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!4}
 !llvm.module.flags = !{!34}
 !llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !3, i32 11, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEi", i32 11, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
-!3 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
-!4 = metadata !{i32 786449, metadata !31, i32 4, metadata !"4.2.1 LLVM build", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !33, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x101\00this\0011\000", metadata !1, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEi\0011\000\001\000\006\000\001\0011", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x13\00foo\003\0032\0032\000\000\000", metadata !31, metadata !3, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
+!3 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !"0x11\004\004.2.1 LLVM build\001\00\000\00\000", metadata !31, metadata !32, metadata !32, metadata !33, null, null} ; [ DW_TAG_compile_unit ]
 !5 = metadata !{metadata !6, metadata !1, metadata !8}
-!6 = metadata !{i32 786445, metadata !31, metadata !2, metadata !"y", i32 8, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ]
-!7 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"baz", metadata !"baz", metadata !"_ZN3foo3bazEi", i32 15, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 15} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0xd\00y\008\0032\0032\000\000", metadata !31, metadata !2, metadata !7} ; [ DW_TAG_member ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !31, metadata !3} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x2e\00baz\00baz\00_ZN3foo3bazEi\0015\000\001\000\006\000\001\0015", metadata !31, metadata !2, metadata !9, null, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null} ; [ DW_TAG_subprogram ]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{metadata !7, metadata !11, metadata !7}
-!11 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !2} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 786470, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !13} ; [ DW_TAG_const_type ]
-!13 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !3, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 786689, metadata !8, metadata !"this", metadata !3, i32 15, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !3, i32 15, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786689, metadata !18, metadata !"argc", metadata !3, i32 19, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786478, metadata !31, metadata !3, metadata !"main", metadata !"main", metadata !"main", i32 19, metadata !19, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, null, i32 19} ; [ DW_TAG_subprogram ]
-!19 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !31, metadata !3, metadata !13} ; [ DW_TAG_const_type ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !2} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0x101\00x\0011\000", metadata !1, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x101\00this\0015\000", metadata !8, metadata !3, metadata !12} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x101\00x\0015\000", metadata !8, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x101\00argc\0019\000", metadata !18, metadata !3, metadata !7} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x2e\00main\00main\00main\0019\000\001\000\006\000\001\0019", metadata !31, metadata !3, metadata !19, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !3, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !20 = metadata !{metadata !7, metadata !7, metadata !21}
-!21 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!24 = metadata !{i32 786689, metadata !18, metadata !"argv", metadata !3, i32 19, metadata !21, i32 0, null} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{i32 786688, metadata !26, metadata !"a", metadata !3, i32 20, metadata !2, i32 0, null} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{i32 786443, metadata !31, metadata !27, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{i32 786443, metadata !31, metadata !18, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!28 = metadata !{i32 786688, metadata !26, metadata !"b", metadata !3, i32 21, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
+!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !31, metadata !3, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !31, metadata !3} ; [ DW_TAG_base_type ]
+!24 = metadata !{metadata !"0x101\00argv\0019\000", metadata !18, metadata !3, metadata !21} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{metadata !"0x100\00a\0020\000", metadata !26, metadata !3, metadata !2} ; [ DW_TAG_auto_variable ]
+!26 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !27} ; [ DW_TAG_lexical_block ]
+!27 = metadata !{metadata !"0xb\0019\000\000", metadata !31, metadata !18} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0x100\00b\0021\000", metadata !26, metadata !3, metadata !7} ; [ DW_TAG_auto_variable ]
 !29 = metadata !{i32 16, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !31, metadata !8, i32 15, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!30 = metadata !{metadata !"0xb\0015\000\000", metadata !31, metadata !8} ; [ DW_TAG_lexical_block ]
 !31 = metadata !{metadata !"foo.cp", metadata !"/tmp/"}
 !32 = metadata !{i32 0}
 !33 = metadata !{metadata !1, metadata !8, metadata !18}
-!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index b49aec3..9d65dc1 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll

@@ -3,29 +3,29 @@
 @.str = private constant [4 x i8] c"one\00", align 1 ; <[4 x i8]*> [#uses=1]
 @.str1 = private constant [4 x i8] c"two\00", align 1 ; <[5 x i8]*> [#uses=1]
 @C.9.2167 = internal constant [2 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0)]
-!38 = metadata !{i32 524329, metadata !109} ; [ DW_TAG_file_type ]
-!39 = metadata !{i32 524305, metadata !109, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !108, metadata !108, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!46 = metadata !{i32 524303, metadata !109, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !47} ; [ DW_TAG_pointer_type ]
-!47 = metadata !{i32 524324, metadata !109, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!97 = metadata !{i32 524334, i32 0, metadata !39, metadata !"main", metadata !"main", metadata !"main", i32 73, metadata !98, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!98 = metadata !{i32 524309, metadata !109, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !99, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!38 = metadata !{metadata !"0x29", metadata !109} ; [ DW_TAG_file_type ]
+!39 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", metadata !109, metadata !108, metadata !108, null, null, null} ; [ DW_TAG_compile_unit ]
+!46 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !109, null, metadata !47} ; [ DW_TAG_pointer_type ]
+!47 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !109, null} ; [ DW_TAG_base_type ]
+!97 = metadata !{metadata !"0x2e\00main\00main\00main\0073\000\001\000\006\000\000\000", i32 0, metadata !39, metadata !98, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!98 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !109, null, null, metadata !99, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !99 = metadata !{metadata !100}
-!100 = metadata !{i32 524324, metadata !109, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!100 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !109, null} ; [ DW_TAG_base_type ]
 !101 = metadata !{[2 x i8*]* @C.9.2167}
-!102 = metadata !{i32 524544, metadata !103, metadata !"find_strings", metadata !38, i32 75, metadata !104, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!103 = metadata !{i32 524299, null, metadata !97, i32 73, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!104 = metadata !{i32 524289, metadata !109, null, metadata !"", i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
+!102 = metadata !{metadata !"0x100\00find_strings\0075\000", metadata !103, metadata !38, metadata !104} ; [ DW_TAG_auto_variable ]
+!103 = metadata !{metadata !"0xb\0073\000\000", null, metadata !97} ; [ DW_TAG_lexical_block ]
+!104 = metadata !{metadata !"0x1\00\000\0085312\0064\000\000", metadata !109, null, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
 !105 = metadata !{metadata !106}
-!106 = metadata !{i32 524321, i64 0, i64 1333}    ; [ DW_TAG_subrange_type ]
+!106 = metadata !{metadata !"0x21\000\001333"}    ; [ DW_TAG_subrange_type ]
 !107 = metadata !{i32 73, i32 0, metadata !103, null}
 !108 = metadata !{i32 0}
 !109 = metadata !{metadata !"pbmsrch.c", metadata !"/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch"}
 
 define i32 @main() nounwind ssp {
 bb.nph:
-  tail call void @llvm.dbg.declare(metadata !101, metadata !102), !dbg !107
+  tail call void @llvm.dbg.declare(metadata !101, metadata !102, metadata !{metadata !"0x102"}), !dbg !107
   ret i32 0, !dbg !107
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 

diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index 09e34ef..a613939 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll

@@ -6,8 +6,8 @@
 define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23), !dbg !24
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25), !dbg !24
+  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !24
   %0 = icmp ne i32 %i, 0, !dbg !27                ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !27
 
@@ -34,7 +34,7 @@
 define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31), !dbg !34
+  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !34
   %0 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 0, !dbg !34 ; <i8**> [#uses=1]
   store i8* null, i8** %0, align 8, !dbg !34
   %1 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 1, !dbg !34 ; <i32*> [#uses=1]
@@ -45,14 +45,14 @@
   ret void, !dbg !35
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @main() nounwind ssp {
 entry:
   %0 = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=3]
   %v = alloca %struct.SVal                        ; <%struct.SVal*> [#uses=4]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38), !dbg !41
+  call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38, metadata !{metadata !"0x102"}), !dbg !41
   call void @_ZN4SValC1Ev(%struct.SVal* %v) nounwind, !dbg !41
   %1 = getelementptr inbounds %struct.SVal* %v, i32 0, i32 1, !dbg !42 ; <i32*> [#uses=1]
   store i32 1, i32* %1, align 8, !dbg !42
@@ -65,65 +65,65 @@
   %7 = load i32* %6, align 8, !dbg !43            ; <i32> [#uses=1]
   store i32 %7, i32* %5, align 8, !dbg !43
   %8 = call i32 @_Z3fooi4SVal(i32 2, %struct.SVal* noalias %0) nounwind, !dbg !43 ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44), !dbg !43
+  call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44, metadata !{metadata !"0x102"}), !dbg !43
   br label %return, !dbg !45
 
 return:                                           ; preds = %entry
   ret i32 0, !dbg !45
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!49}
 !46 = metadata !{metadata !16, metadata !17, metadata !20}
 
-!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
-!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !47, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !48, metadata !48, metadata !46, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x13\00SVal\001\00128\0064\000\000\000", metadata !47, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
+!2 = metadata !{metadata !"0x29", metadata !47} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\001", metadata !47, metadata !48, metadata !48, metadata !46, null,  null} ; [ DW_TAG_compile_unit ]
 !4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
-!5 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Data", i32 7, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ]
-!8 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 12} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0xd\00Data\007\0064\0064\000\000", metadata !47, metadata !1, metadata !6} ; [ DW_TAG_member ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, null} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0xd\00Kind\008\0032\0032\0064\000", metadata !47, metadata !1, metadata !8} ; [ DW_TAG_member ]
+!8 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !47, metadata !2} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\0012", metadata !47, metadata !1, metadata !10, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{null, metadata !12, metadata !13}
-!12 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !47, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !12}
-!16 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 16} ; [ DW_TAG_subprogram ]
-!18 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\0011", metadata !47, metadata !1, metadata !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\0016", metadata !47, metadata !2, metadata !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{metadata !13, metadata !13, metadata !1}
-!20 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 23} ; [ DW_TAG_subprogram ]
-!21 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\0023", metadata !47, metadata !2, metadata !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !47, metadata !2, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{metadata !13}
-!23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{metadata !"0x101\00i\0016\000", metadata !17, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ]
 !24 = metadata !{i32 16, i32 0, metadata !17, null}
-!25 = metadata !{i32 786689, metadata !17, metadata !"location", metadata !2, i32 16, metadata !26, i32 0, null} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{i32 786448, metadata !47, metadata !2, metadata !"SVal", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
+!25 = metadata !{metadata !"0x101\00location\0016\000", metadata !17, metadata !2, metadata !26} ; [ DW_TAG_arg_variable ]
+!26 = metadata !{metadata !"0x10\00SVal\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_reference_type ]
 !27 = metadata !{i32 17, i32 0, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !47, metadata !17, i32 16, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0xb\0016\000\002", metadata !47, metadata !17} ; [ DW_TAG_lexical_block ]
 !29 = metadata !{i32 18, i32 0, metadata !28, null}
 !30 = metadata !{i32 20, i32 0, metadata !28, null}
-!31 = metadata !{i32 786689, metadata !16, metadata !"this", metadata !2, i32 11, metadata !32, i32 0, null} ; [ DW_TAG_arg_variable ]
-!32 = metadata !{i32 786470, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !33} ; [ DW_TAG_const_type ]
-!33 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_pointer_type ]
+!31 = metadata !{metadata !"0x101\00this\0011\000", metadata !16, metadata !2, metadata !32} ; [ DW_TAG_arg_variable ]
+!32 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !47, metadata !2, metadata !33} ; [ DW_TAG_const_type ]
+!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !47, metadata !2, metadata !1} ; [ DW_TAG_pointer_type ]
 !34 = metadata !{i32 11, i32 0, metadata !16, null}
 !35 = metadata !{i32 11, i32 0, metadata !36, null}
-!36 = metadata !{i32 786443, metadata !47, metadata !37, i32 11, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
-!37 = metadata !{i32 786443, metadata !47, metadata !16, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!38 = metadata !{i32 786688, metadata !39, metadata !"v", metadata !2, i32 24, metadata !1, i32 0, null} ; [ DW_TAG_auto_variable ]
-!39 = metadata !{i32 786443, metadata !47, metadata !40, i32 23, i32 0, i32 4} ; [ DW_TAG_lexical_block ]
-!40 = metadata !{i32 786443, metadata !47, metadata !20, i32 23, i32 0, i32 3} ; [ DW_TAG_lexical_block ]
+!36 = metadata !{metadata !"0xb\0011\000\001", metadata !47, metadata !37} ; [ DW_TAG_lexical_block ]
+!37 = metadata !{metadata !"0xb\0011\000\000", metadata !47, metadata !16} ; [ DW_TAG_lexical_block ]
+!38 = metadata !{metadata !"0x100\00v\0024\000", metadata !39, metadata !2, metadata !1} ; [ DW_TAG_auto_variable ]
+!39 = metadata !{metadata !"0xb\0023\000\004", metadata !47, metadata !40} ; [ DW_TAG_lexical_block ]
+!40 = metadata !{metadata !"0xb\0023\000\003", metadata !47, metadata !20} ; [ DW_TAG_lexical_block ]
 !41 = metadata !{i32 24, i32 0, metadata !39, null}
 !42 = metadata !{i32 25, i32 0, metadata !39, null}
 !43 = metadata !{i32 26, i32 0, metadata !39, null}
-!44 = metadata !{i32 786688, metadata !39, metadata !"k", metadata !2, i32 26, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ]
+!44 = metadata !{metadata !"0x100\00k\0026\000", metadata !39, metadata !2, metadata !13} ; [ DW_TAG_auto_variable ]
 !45 = metadata !{i32 27, i32 0, metadata !39, null}
 !47 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
 !48 = metadata !{i32 0}
-!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index a65b632..f52e922 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll

@@ -15,21 +15,21 @@
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 53, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114084)", i1 false, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0053\000\001\000\006\000\000\000", metadata !14, metadata !1, metadata !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 114084)\000\00\000\00\000", metadata !15, metadata !16, metadata !16, metadata !13, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !14, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !14, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !15, metadata !7, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !14, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", metadata !15, metadata !7, metadata !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
 !8 = metadata !{i32 53, i32 13, metadata !9, null}
-!9 = metadata !{i32 786443, metadata !14, metadata !0, i32 53, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!9 = metadata !{metadata !"0xb\0053\0011\000", metadata !14, metadata !0} ; [ DW_TAG_lexical_block ]
 !10 = metadata !{i32 4, i32 13, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !15, metadata !12, i32 4, i32 13, i32 2} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 786443, metadata !15, metadata !6, i32 4, i32 11, i32 1} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{metadata !"0xb\004\0013\002", metadata !15, metadata !12} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{metadata !"0xb\004\0011\001", metadata !15, metadata !6} ; [ DW_TAG_lexical_block ]
 !13 = metadata !{metadata !0, metadata !6}
 !14 = metadata !{metadata !"", metadata !"/private/tmp"}
 !15 = metadata !{metadata !"bug.c", metadata !"/private/tmp"}
 !16 = metadata !{i32 0}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index 21ac7c9..53fb0af 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll

@@ -9,32 +9,32 @@
 define i32 @foo(%struct.bar* nocapture %i) nounwind readnone optsize noinline ssp {
 ; CHECK: TAG_formal_parameter
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6), !dbg !12
+  tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
   ret i32 1, !dbg !13
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!19}
 
-!0 = metadata !{i32 786478, metadata !17, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.bar*)* @foo, null, null, metadata !16, i32 3} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !17} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !17, i32 12, metadata !"clang version 2.9 (trunk 117922)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, metadata !15, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !17, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", metadata !17, metadata !1, metadata !3, null, i32 (%struct.bar*)* @foo, null, null, metadata !16} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 117922)\001\00\000\00\000", metadata !17, metadata !18, metadata !18, metadata !15, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !17, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !17, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !0, metadata !"i", metadata !1, i32 3, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786447, metadata !17, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !17, metadata !1, metadata !"bar", i32 2, i64 64, i64 32, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !17, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00i\003\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !17, metadata !1, metadata !8} ; [ DW_TAG_pointer_type ]
+!8 = metadata !{metadata !"0x13\00bar\002\0064\0032\000\000\000", metadata !17, metadata !1, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !11}
-!10 = metadata !{i32 786445, metadata !17,  metadata !1, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
-!11 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !5} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0xd\00x\002\0032\0032\000\000", metadata !17,  metadata !1, metadata !5} ; [ DW_TAG_member ]
+!11 = metadata !{metadata !"0xd\00y\002\0032\0032\0032\000", metadata !17, metadata !1, metadata !5} ; [ DW_TAG_member ]
 !12 = metadata !{i32 3, i32 47, metadata !0, null}
 !13 = metadata !{i32 4, i32 2, metadata !14, null}
-!14 = metadata !{i32 786443, metadata !17, metadata !0, i32 3, i32 50, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\003\0050\000", metadata !17, metadata !0} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{metadata !0}
 !16 = metadata !{metadata !6}
 !17 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
 !18 = metadata !{i32 0}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index 625a351..ac7fbf2 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll

@@ -22,8 +22,8 @@
 
 define i64 @gcd(i64 %a, i64 %b) nounwind readnone optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !19
   br label %while.body, !dbg !20
 
 while.body:                                       ; preds = %while.body, %entry
@@ -34,14 +34,14 @@
   br i1 %cmp, label %if.then, label %while.body, !dbg !23
 
 if.then:                                          ; preds = %while.body
-  tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !21
   ret i64 %b.addr.0, !dbg !23
 }
 
 define i32 @main() nounwind optsize ssp {
 entry:
   %call = tail call i32 @rand() nounwind optsize, !dbg !24
-  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14), !dbg !24
+  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !24
   %cmp = icmp ugt i32 %call, 21, !dbg !25
   br i1 %cmp, label %cond.true, label %cond.end, !dbg !25
 
@@ -51,7 +51,7 @@
 
 cond.end:                                         ; preds = %entry, %cond.true
   %cond = phi i32 [ %call1, %cond.true ], [ %call, %entry ], !dbg !25
-  tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !25
   %conv = sext i32 %cond to i64, !dbg !26
   %conv5 = zext i32 %call to i64, !dbg !26
   %call6 = tail call i64 @gcd(i64 %conv, i64 %conv5) optsize, !dbg !26
@@ -71,36 +71,36 @@
 
 declare i32 @printf(i8* nocapture, ...) nounwind optsize
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 declare i32 @puts(i8* nocapture) nounwind
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"gcd", metadata !"gcd", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i64 (i64, i64)* @gcd, null, null, metadata !29, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
-!1 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !31, i32 12, metadata !"clang version 2.9 (trunk 124117)", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !28, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00gcd\00gcd\00\005\000\001\000\006\00256\001\000", metadata !31, metadata !1, metadata !3, null, i64 (i64, i64)* @gcd, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
+!1 = metadata !{metadata !"0x29", metadata !31} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 124117)\001\00\000\00\001", metadata !31, metadata !32, metadata !32, metadata !28, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !30, i32 0} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
-!7 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00main\00main\00\0025\000\001\000\006\000\001\000", metadata !31, metadata !1, metadata !7, null, i32 ()* @main, null, null, metadata !30} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !31, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 786689, metadata !0, metadata !"b", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 786688, metadata !13, metadata !"c", metadata !1, i32 6, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!13 = metadata !{i32 786443, metadata !31, metadata !0, i32 5, i32 52, i32 0} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{i32 786688, metadata !15, metadata !"m", metadata !1, i32 26, metadata !16, i32 0, null} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !31, metadata !6, i32 25, i32 12, i32 2} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!17 = metadata !{i32 786688, metadata !15, metadata !"z_s", metadata !1, i32 27, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x101\00a\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x101\00b\005\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{metadata !"0x100\00c\006\000", metadata !13, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!13 = metadata !{metadata !"0xb\005\0052\000", metadata !31, metadata !0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0x100\00m\0026\000", metadata !15, metadata !1, metadata !16} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0xb\0025\0012\002", metadata !31, metadata !6} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0x100\00z_s\0027\000", metadata !15, metadata !1, metadata !9} ; [ DW_TAG_auto_variable ]
 !18 = metadata !{i32 5, i32 41, metadata !0, null}
 !19 = metadata !{i32 5, i32 49, metadata !0, null}
 !20 = metadata !{i32 7, i32 5, metadata !13, null}
 !21 = metadata !{i32 8, i32 9, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !31, metadata !13, i32 7, i32 14, i32 1} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\007\0014\001", metadata !31, metadata !13} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{i32 9, i32 9, metadata !22, null}
 !24 = metadata !{i32 26, i32 38, metadata !15, null}
 !25 = metadata !{i32 27, i32 38, metadata !15, null}
@@ -111,4 +111,4 @@
 !30 = metadata !{metadata !14, metadata !17}
 !31 = metadata !{metadata !"rem_small.c", metadata !"/private/tmp"}
 !32 = metadata !{i32 0}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/2011-08-29-InitOrder.ll b/test/CodeGen/X86/2011-08-29-InitOrder.ll
index a95dcb5..b278ad6 100644
--- a/test/CodeGen/X86/2011-08-29-InitOrder.ll
+++ b/test/CodeGen/X86/2011-08-29-InitOrder.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s --check-prefix=CHECK-DEFAULT
+; RUN: llc < %s -mtriple=i386-linux-gnu -use-ctors | FileCheck %s --check-prefix=CHECK-DEFAULT
 ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s --check-prefix=CHECK-DARWIN
 ; PR5329
 

diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 16706ae..6651af7 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll

@@ -8,7 +8,7 @@
 ;CHECK: vpxor
 ;CHECK: vinserti128
 ;CHECK: vpshufd
-;CHECK: vpshufd
+;CHECK: vpbroadcastd
 ;CHECK: vmulps
 ;CHECK: vmulps
 ;CHECK: ret

diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
index 1c39c74..519c7ca 100644
--- a/test/CodeGen/X86/2012-07-15-broadcastfold.ll
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll

@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s
 
 declare x86_fastcallcc i64 @barrier()
 

diff --git a/test/CodeGen/X86/2012-10-02-DAGCycle.ll b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
index 8d914db..403d21a 100644
--- a/test/CodeGen/X86/2012-10-02-DAGCycle.ll
+++ b/test/CodeGen/X86/2012-10-02-DAGCycle.ll

@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s
-; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s
+; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s > /dev/null
+; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s > /dev/null
 
 ; rdar://12393897
 

diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 62ee1e1..1a5efda 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll

@@ -12,11 +12,11 @@
 %struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] }
 %struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp {
 entry:
-  call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4)
+  call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4, metadata !{metadata !"0x102"})
   %type = getelementptr inbounds %struct.node.0.27* %p, i64 0, i32 0
   %0 = load i16* %type, align 2
   %cmp = icmp eq i16 %0, 1
@@ -33,16 +33,20 @@
   ret i16 %retval.0
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !11, metadata !2, metadata !2, metadata !13, metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
 !2 = metadata !{}
-!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725]
-!5 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
-!6 = metadata !{i32 786454, metadata !11, null, metadata !"hgstruct", i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 786451, metadata !11, null, metadata !"", i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x101\00hg\0067109589\000", null, metadata !5, metadata !6} ; [ DW_TAG_arg_variable ] [hg] [line 725]
+!5 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x16\00hgstruct\00492\000\000\000\000", metadata !11, null, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x13\00\00487\00512\0064\000\000\000", metadata !11, null, null, null, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
 !11 = metadata !{metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!13 = metadata !{metadata !14}
+!14 = metadata !{metadata !"0x2e\00subdivp\00subdivp\00\000\000\001\000\006\00256\001\001", metadata !11, metadata !5, metadata !15, null, i16 (%struct.node.0.27*, double, double, %struct.hgstruct.2.29* )* @subdivp, null, null, null} ; [ DW_TAG_subprogram ] [def] [subdivp]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{null}

diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 36667de..083aacd 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll

@@ -12,7 +12,7 @@
 
 @.str15 = external hidden unnamed_addr constant [6 x i8], align 1
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp {
 entry:
@@ -43,7 +43,7 @@
   br label %if.then4073
 
 if.then4073:                                      ; preds = %if.then3344
-  call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4)
+  call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4, metadata !{metadata !"0x102"})
   %arraydecay4078 = getelementptr inbounds [20 x i8]* %num14075, i64 0, i64 0
   %0 = load i32* undef, align 4
   %add4093 = add nsw i32 %0, 0
@@ -65,26 +65,31 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!35}
 
-!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", metadata !19, metadata !2, metadata !2, metadata !20, metadata !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
 !1 = metadata !{metadata !2}
 !2 = metadata !{}
-!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815]
-!5 = metadata !{i32 786443, metadata !14, metadata !6, i32 815, i32 0, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!6 = metadata !{i32 786443, metadata !14, metadata !7, i32 812, i32 0, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!7 = metadata !{i32 786443, metadata !14, metadata !8, i32 807, i32 0, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!8 = metadata !{i32 786443, metadata !14, metadata !9, i32 440, i32 0, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!9 = metadata !{i32 786443, metadata !14, metadata !10, i32 435, i32 0, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!10 = metadata !{i32 786443, metadata !14, metadata !11, i32 434, i32 0, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!11 = metadata !{i32 786443, metadata !14, metadata !12, i32 250, i32 0, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!12 = metadata !{i32 786443, metadata !14, metadata !13, i32 249, i32 0, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!13 = metadata !{i32 786443, metadata !14, metadata !2, i32 221, i32 0, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
-!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!4 = metadata !{metadata !"0x100\00num1\00815\000", metadata !5, metadata !14, metadata !15} ; [ DW_TAG_auto_variable ] [num1] [line 815]
+!5 = metadata !{metadata !"0xb\00815\000\00177", metadata !14, metadata !6} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!6 = metadata !{metadata !"0xb\00812\000\00176", metadata !14, metadata !7} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!7 = metadata !{metadata !"0xb\00807\000\00175", metadata !14, metadata !8} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!8 = metadata !{metadata !"0xb\00440\000\0094", metadata !14, metadata !9} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!9 = metadata !{metadata !"0xb\00435\000\0091", metadata !14, metadata !10} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!10 = metadata !{metadata !"0xb\00434\000\0090", metadata !14, metadata !11} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!11 = metadata !{metadata !"0xb\00250\000\0024", metadata !14, metadata !12} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!12 = metadata !{metadata !"0xb\00249\000\0023", metadata !14, metadata !13} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!13 = metadata !{metadata !"0xb\00221\000\0019", metadata !14, metadata !2} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!14 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!15 = metadata !{metadata !"0x1\00\000\00160\008\000\000", null, null, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786465, i64 0, i64 20}       ; [ DW_TAG_subrange_type ] [0, 19]
+!18 = metadata !{metadata !"0x21\000\0020"}       ; [ DW_TAG_subrange_type ] [0, 19]
 !19 = metadata !{metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset"}
 
+!20 = metadata !{metadata !21}
+!21 = metadata !{metadata !"0x2e\00AttachGalley\00AttachGalley\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, i32 (%union.rec**)* @AttachGalley, null, null, null} ; [ DW_TAG_subprogram ] [def] [AttachGalley]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{null}
+
 ; Test DebugValue uses visited by RegisterPressureTracker findUseBetween().
 ;
 ; CHECK: @main
@@ -103,7 +108,7 @@
   unreachable
 
 cond.end:                                         ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31)
+  call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31, metadata !{metadata !"0x102"})
   %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5
   invoke void @_Znwm()
           to label %exit.i unwind label %lpad2.i.i.i.i
@@ -129,9 +134,11 @@
 
 !llvm.dbg.cu = !{!30}
 
-!30 = metadata !{i32 786449, metadata !34, i32 4, metadata !"clang version 3.3 (trunk 169129) (llvm/trunk 169135)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
-!31 = metadata !{i32 786688, null, metadata !"X", null, i32 29, metadata !32, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [X] [line 29]
-!32 = metadata !{i32 786454, metadata !34, null, metadata !"HM", i32 28, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
-!33 = metadata !{i32 786473, metadata !34} ; [ DW_TAG_file_type ]
+!30 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169129) (llvm/trunk 169135)\001\00\000\00\000", metadata !34, metadata !2, metadata !2, metadata !36, null, null} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus]
+!31 = metadata !{metadata !"0x100\00X\0029\000", null, null, metadata !32} ; [ DW_TAG_auto_variable ] [X] [line 29]
+!32 = metadata !{metadata !"0x16\00HM\0028\000\000\000\000", metadata !34, null, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
+!33 = metadata !{metadata !"0x29", metadata !34} ; [ DW_TAG_file_type ]
 !34 = metadata !{metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++"}
-!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!36 = metadata !{metadata !37}
+!37 = metadata !{metadata !"0x2e\00main\00main\00\000\000\001\000\006\00256\001\001", metadata !19, metadata !14, metadata !22, null, void ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [def] [main]

diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index 5aec3d9..458ce4f 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll

@@ -9,7 +9,7 @@
 
 %struct.btCompoundLeafCallback = type { i32, i32 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define void @test() unnamed_addr uwtable ssp align 2 {
 entry:
@@ -20,7 +20,7 @@
   unreachable
 
 if.end:                                           ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3)
+  call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3, metadata !{metadata !"0x102"})
   %m = getelementptr inbounds %struct.btCompoundLeafCallback* %callback, i64 0, i32 1
   store i32 0, i32* undef, align 8
   %cmp12447 = icmp sgt i32 undef, 0
@@ -36,11 +36,13 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8}
 
-!0 = metadata !{i32 786449, metadata !6, i32 4, metadata !"clang version 3.3 (trunk 168984) (llvm/trunk 168983)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
-!2 = metadata !{null}
-!3 = metadata !{i32 786688, null, metadata !"callback", null, i32 214, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [callback] [line 214]
-!4 = metadata !{i32 786451, metadata !6, null, metadata !"btCompoundLeafCallback", i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
-!5 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 168984) (llvm/trunk 168983)\001\00\000\00\000", metadata !6, null, null, metadata !1, null, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{metadata !"0x2e\00test\00test\00\000\000\001\000\006\00256\001\001", metadata !6, metadata !5, metadata !7, null, void ()* @test, null, null, null} ; [ DW_TAG_subprogram ] [def] [test]
+!3 = metadata !{metadata !"0x100\00callback\00214\000", null, null, metadata !4} ; [ DW_TAG_auto_variable ] [callback] [line 214]
+!4 = metadata !{metadata !"0x13\00btCompoundLeafCallback\0090\00512\0064\000\000\000", metadata !6, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x29", metadata !6} ; [ DW_TAG_file_type ]
 !6 = metadata !{metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet"}
-!7 = metadata !{i32 0}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!9 = metadata !{null}

diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
index bbba796..10dc927 100644
--- a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
+++ b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll

@@ -6,7 +6,7 @@
 ; we may reference variables that were not live across basic blocks
 ; resulting in undefined virtual registers.
 ;
-; In this example, this is illustrated by a the spill/reload of the
+; In this example, this is illustrated by a spill/reload of the
 ; LOADED_PTR_SLOT.
 ;
 ; Before this patch, the compiler was accessing two different spill

diff --git a/test/CodeGen/X86/2014-08-29-CompactUnwind.ll b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll
new file mode 100644
index 0000000..f65d7c9
--- /dev/null
+++ b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll

@@ -0,0 +1,46 @@
+; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 -filetype=obj -o - | llvm-objdump -d -unwind-info -s - | FileCheck %s
+; Regression test for http://llvm.org/bugs/show_bug.cgi?id=20800.
+
+; ModuleID = 'asan_report.ii'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+@.str = private unnamed_addr constant [3 x i8] c"=>\00", align 1
+@.str1 = private unnamed_addr constant [3 x i8] c"  \00", align 1
+@.str2 = private unnamed_addr constant [6 x i8] c"%s%p:\00", align 1
+
+; CHECK: ___asan_report_error:
+
+; subq instruction starts at 0x0a, so the second byte of the compact encoding
+; (UNWIND_X86_64_FRAMELESS_STACK_SIZE in mach-o/compact_unwind_encoding.h)
+; must be 0x0d.
+; CHECK: {{a:.*subq.*%rsp}}
+
+; CHECK: Contents of __compact_unwind section
+; CHECK: ___asan_report_error
+
+; Because of incorrect push instruction size in X86AsmBackend.cpp the stack
+; size was also calculated incorrectly.
+; CHECK-NOT: {{compact encoding:.*0x0309f800}}
+; CHECK: {{compact encoding:.*0x030df800}}
+
+define void @__asan_report_error() #0 {
+  %str.i = alloca i64, align 8
+  %stack = alloca [256 x i64], align 8
+  br label %print_shadow_bytes.exit.i
+
+print_shadow_bytes.exit.i: ; preds = %print_shadow_bytes.exit.i, %0
+  %iv.i = phi i64 [ -5, %0 ], [ %iv.next.i, %print_shadow_bytes.exit.i ]
+  %reg15 = icmp eq i64 %iv.i, 0
+  %.str..str1.i = select i1 %reg15, [3 x i8]* @.str, [3 x i8]* @.str1
+  %reg16 = getelementptr inbounds [3 x i8]* %.str..str1.i, i64 0, i64 0
+  %reg17 = shl i64 %iv.i, 1
+  %reg19 = inttoptr i64 %reg17 to i8*
+  call void (i64*, i8*, ...)* @append(i64* %str.i, i8* getelementptr inbounds ([6 x i8]* @.str2, i64 0, i64 0), i8* %reg16, i8* %reg19)
+  %iv.next.i = add nsw i64 %iv.i, 0
+  br label %print_shadow_bytes.exit.i
+}
+
+declare void @append(i64*, i8*, ...)
+
+attributes #0 = { "no-frame-pointer-elim"="false" }

diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 4ce2fb3..54d8f65 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll

@@ -4,10 +4,10 @@
 target triple = "x86_64-apple-macosx10.7.0"
 
 define i32 @foo(i32 %i, i32* nocapture %c) nounwind uwtable readonly ssp {
-  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6), !dbg !12
+  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
   %ab = load i32* %c, align 1, !dbg !14
-  tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7), !dbg !13
-  tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10), !dbg !14
+  tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !13
+  tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !14
   %cd = icmp eq i32 %i, 42, !dbg !15
   br i1 %cd, label %bb1, label %bb2, !dbg !15
 
@@ -23,23 +23,23 @@
   ret i32 %.0, !dbg !17
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32*)* @foo, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", metadata !20, metadata !21, metadata !21, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\001\000", metadata !20, metadata !2, metadata !3, null, i32 (i32, i32*)* @foo, null, null, metadata !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554434, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{i32 786468, null, metadata !0, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786688, metadata !11, metadata !"a", metadata !2, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !20, metadata !1, i32 2, i32 25, i32 0} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{metadata !"0x101\00c\0033554434\000", metadata !1, metadata !2, metadata !8} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !0, metadata !9} ; [ DW_TAG_pointer_type ]
+!9 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !0} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x100\00a\003\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\002\0025\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{i32 2, i32 13, metadata !1, null}
 !13 = metadata !{i32 2, i32 22, metadata !1, null}
 !14 = metadata !{i32 3, i32 14, metadata !11, null}
@@ -50,4 +50,4 @@
 !19 = metadata !{metadata !6, metadata !7, metadata !10}
 !20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index 51d0d17..6865873 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll

@@ -5,7 +5,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define void @foo() nounwind uwtable ssp {
 entry:
@@ -17,7 +17,7 @@
 for.body:
   call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
   call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
-  call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22) nounwind
+  call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22, metadata !{metadata !"0x102"}) nounwind
   br label %for.body
 }
 
@@ -27,9 +27,9 @@
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23}
-!0 = metadata !{i32 524305, metadata !1, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\001\00clang\001\00\000\00\000", metadata !1, metadata !2, metadata !2, null, null, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"t.c", metadata !""}
-!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6}
+!16 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
 !2 = metadata !{i32 0}
-!22 = metadata !{i32 786688, null, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0}
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{metadata !"0x100\00x\0016\000", null, metadata !2, metadata !16} ; [ DW_TAG_auto_variable ]
+!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll
index 100817a..a435272 100644
--- a/test/CodeGen/X86/SwizzleShuff.ll
+++ b/test/CodeGen/X86/SwizzleShuff.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s
 
 ; Check that we perform a scalar XOR on i32.
 

diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll
new file mode 100644
index 0000000..8c66412
--- /dev/null
+++ b/test/CodeGen/X86/TruncAssertZext.ll

@@ -0,0 +1,16 @@
+; RUN: llc < %s -O2 -march=x86-64 | FileCheck %s
+; Checks that a zeroing mov is inserted for the trunc/zext pair even when
+; the source of the zext is an AssertSext node
+; PR20494
+
+define i64 @main(i64 %a) {
+; CHECK-LABEL: main
+; CHECK: movl %e{{..}}, %eax
+; CHECK: ret
+  %or = or i64 %a, -2
+  %trunc = trunc i64 %or to i32
+  br label %l
+l:
+  %ext = zext i32 %trunc to i64
+  ret i64 %ext
+}

diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index 1513fcb..9c24be4 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll

@@ -4,7 +4,7 @@
 define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
 entry:
 ; CHECK-LABEL: test1:
-; CHECK: cmpl %ecx, %eax
+; CHECK: cmpl %ecx, %eax 
 ; CHECK-NOT: addl
 ; CHECK: adcl $0, %eax
   %add4 = add i32 %x, %sum

diff --git a/test/CodeGen/X86/add_shl_constant.ll b/test/CodeGen/X86/add_shl_constant.ll
new file mode 100644
index 0000000..33074e4
--- /dev/null
+++ b/test/CodeGen/X86/add_shl_constant.ll

@@ -0,0 +1,49 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+; CHECK-LABEL: add_shl_add_constant_1_i32
+; CHECK: leal 984(%rsi,%rdi,8), %eax
+; CHECK-NEXT: retq
+define i32 @add_shl_add_constant_1_i32(i32 %x, i32 %y) nounwind {
+  %add.0 = add i32 %x, 123
+  %shl = shl i32 %add.0, 3
+  %add.1 = add i32 %shl, %y
+  ret i32 %add.1
+}
+
+; CHECK-LABEL: add_shl_add_constant_2_i32
+; CHECK: leal 984(%rsi,%rdi,8), %eax
+; CHECK-NEXT: retq
+define i32 @add_shl_add_constant_2_i32(i32 %x, i32 %y) nounwind {
+  %add.0 = add i32 %x, 123
+  %shl = shl i32 %add.0, 3
+  %add.1 = add i32 %y, %shl
+  ret i32 %add.1
+}
+
+; CHECK: LCPI2_0:
+; CHECK: .long 984
+; CHECK: _add_shl_add_constant_1_v4i32
+; CHECK: pslld $3, %[[REG:xmm[0-9]+]]
+; CHECK: paddd %xmm1, %[[REG]]
+; CHECK: paddd LCPI2_0(%rip), %[[REG:xmm[0-9]+]]
+; CHECK: retq
+define <4 x i32> @add_shl_add_constant_1_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+  %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123>
+  %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3>
+  %add.1 = add <4 x i32> %shl, %y
+  ret <4 x i32> %add.1
+}
+
+; CHECK: LCPI3_0:
+; CHECK: .long 984
+; CHECK: _add_shl_add_constant_2_v4i32
+; CHECK: pslld $3, %[[REG:xmm[0-9]+]]
+; CHECK: paddd %xmm1, %[[REG]]
+; CHECK: paddd LCPI3_0(%rip), %[[REG:xmm[0-9]+]]
+; CHECK: retq
+define <4 x i32> @add_shl_add_constant_2_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+  %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123>
+  %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3>
+  %add.1 = add <4 x i32> %y, %shl
+  ret <4 x i32> %add.1
+}

diff --git a/test/CodeGen/X86/addr-mode-matcher.ll b/test/CodeGen/X86/addr-mode-matcher.ll
new file mode 100644
index 0000000..d592091
--- /dev/null
+++ b/test/CodeGen/X86/addr-mode-matcher.ll

@@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+
+; This testcase used to hit an assert during ISel.  For details, see the big
+; comment inside the function.
+
+; CHECK-LABEL: foo:
+; The AND should be turned into a subreg access.
+; CHECK-NOT: and
+; The shift (leal) should be folded into the scale of the address in the load.
+; CHECK-NOT: leal
+; CHECK: movl {{.*}},4),
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.6.0"
+
+define void @foo(i32 %a) {
+bb:
+  br label %bb1692
+
+bb1692:
+  %tmp1694 = phi i32 [ 0, %bb ], [ %tmp1745, %bb1692 ]
+  %xor = xor i32 0, %tmp1694
+
+; %load1 = (load (and (shl %xor, 2), 1020))
+  %tmp1701 = shl i32 %xor, 2
+  %tmp1702 = and i32 %tmp1701, 1020
+  %tmp1703 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1702
+  %tmp1704 = bitcast i8* %tmp1703 to i32*
+  %load1 = load i32* %tmp1704, align 4
+
+; %load2 = (load (shl (and %xor, 255), 2))
+  %tmp1698 = and i32 %xor, 255
+  %tmp1706 = shl i32 %tmp1698, 2
+  %tmp1707 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1706
+  %tmp1708 = bitcast i8* %tmp1707 to i32*
+  %load2 = load i32* %tmp1708, align 4
+
+  %tmp1710 = or i32 %load2, %a
+
+; While matching xor we address-match %load1.  The and-of-shift reassociation
+; in address matching transforms this into a shift-of-and and the resulting
+; node becomes identical to %load2.  CSE replaces %load1 which leaves its
+; references in MatchScope and RecordedNodes stale.
+  %tmp1711 = xor i32 %load1, %tmp1710
+
+  %tmp1744 = getelementptr inbounds [256 x i32]* null, i32 0, i32 %tmp1711
+  store i32 0, i32* %tmp1744, align 4
+  %tmp1745 = add i32 %tmp1694, 1
+  indirectbr i8* undef, [label %bb1756, label %bb1692]
+
+bb1756:
+  br label %bb2705
+
+bb2705:
+  indirectbr i8* undef, [label %bb5721, label %bb5736]
+
+bb5721:
+  br label %bb2705
+
+bb5736:
+  ret void
+}

diff --git a/test/CodeGen/X86/address-type-promotion-constantexpr.ll b/test/CodeGen/X86/address-type-promotion-constantexpr.ll
new file mode 100644
index 0000000..32f29bd
--- /dev/null
+++ b/test/CodeGen/X86/address-type-promotion-constantexpr.ll

@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux
+
+; PR20314 is a crashing bug. This program does nothing with the load, so just check that the return is 0.
+
+@c = common global [2 x i32] zeroinitializer, align 4
+@a = common global i32 0, align 4
+@b = internal unnamed_addr constant [2 x i8] c"\01\00", align 1
+
+; CHECK-LABEL: main
+; CHECK: xor %eax, %eax
+define i32 @main() {
+entry:
+  %foo = load i8* getelementptr ([2 x i8]* @b, i64 0, i64 sext (i8 or (i8 zext (i1 icmp eq (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i8), i8 1) to i64)), align 1
+  ret i32 0
+}
+

diff --git a/test/CodeGen/X86/adx-intrinsics.ll b/test/CodeGen/X86/adx-intrinsics.ll
new file mode 100644
index 0000000..0498177
--- /dev/null
+++ b/test/CodeGen/X86/adx-intrinsics.ll

@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 --show-mc-encoding| FileCheck %s --check-prefix=NOADX --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=broadwell --show-mc-encoding| FileCheck %s --check-prefix=ADX --check-prefix=CHECK
+
+declare i8 @llvm.x86.addcarryx.u32(i8, i32, i32, i8*)
+
+define i8 @test_addcarryx_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarryx_u32
+; CHECK: addb
+; ADX: adcxl
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*)
+
+define i8 @test_addcarryx_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarryx_u64
+; CHECK: addb
+; ADX: adcxq
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.addcarryx.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarry.u32(i8, i32, i32, i8*)
+
+define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_u32
+; CHECK: addb
+; ADX: adcxl
+; NOADX: adcl
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.addcarry.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarry.u64(i8, i64, i64, i8*)
+
+define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_u64
+; CHECK: addb
+; ADX: adcxq
+; NOADX: adcq
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.addcarry.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.subborrow.u32(i8, i32, i32, i8*)
+
+define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_subborrow_u32
+; CHECK: addb
+; CHECK: sbbl
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.subborrow.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+  ret i8 %ret;
+}
+
+declare i8 @llvm.x86.subborrow.u64(i8, i64, i64, i8*)
+
+define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_subborrow_u64
+; CHECK: addb
+; CHECK: sbbq
+; CHECK: setb
+; CHECK: retq
+  %ret = tail call i8 @llvm.x86.subborrow.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+  ret i8 %ret;
+}
+

diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index bf55644..82a8e48 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll

@@ -30,12 +30,12 @@
   ret i32 0
 }
 ; CHECK-DAG: .weak	bar_f
-@bar_f = alias weak %FunTy* @foo_f
+@bar_f = weak alias %FunTy* @foo_f
 
-@bar_l = alias linkonce_odr i32* @bar
+@bar_l = linkonce_odr alias i32* @bar
 ; CHECK-DAG: .weak	bar_l
 
-@bar_i = alias internal i32* @bar
+@bar_i = internal alias i32* @bar
 
 ; CHECK-DAG: .globl	A
 @A = alias bitcast (i32* @bar to i64*)

diff --git a/test/CodeGen/X86/aligned-variadic.ll b/test/CodeGen/X86/aligned-variadic.ll
new file mode 100644
index 0000000..e2155fe
--- /dev/null
+++ b/test/CodeGen/X86/aligned-variadic.ll

@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin   | FileCheck %s -check-prefix=X32
+
+%struct.Baz = type { [17 x i8] }
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+; Function Attrs: nounwind uwtable
+define void @bar(%struct.Baz* byval nocapture readnone align 8 %x, ...) {
+entry:
+  %va = alloca [1 x %struct.__va_list_tag], align 16
+  %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0
+  %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %va to i8*
+  call void @llvm.va_start(i8* %arraydecay1)
+  %overflow_arg_area_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0, i32 2
+  %overflow_arg_area = load i8** %overflow_arg_area_p, align 8
+  %overflow_arg_area.next = getelementptr i8* %overflow_arg_area, i64 24
+  store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
+; X32: leal    68(%esp), [[REG:%.*]]
+; X32: movl    [[REG]], 16(%esp)
+; X64: leaq    232(%rsp), [[REG:%.*]]
+; X64: movq    [[REG]], 184(%rsp)
+; X64: leaq    176(%rsp), %rdi
+  call void @qux(%struct.__va_list_tag* %arraydecay)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*)
+
+declare void @qux(%struct.__va_list_tag*)

diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 74b9470..9d8b6cf 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll

@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux -enable-misched=false | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnux32 -enable-misched=false | FileCheck %s -check-prefix=X32ABI
 
 declare void @bar(<2 x i64>* %n)
 
@@ -6,15 +7,29 @@
   %p = alloca <2 x i64>, i64 %h
   call void @bar(<2 x i64>* %p)
   ret void
-; CHECK: foo
+; CHECK-LABEL: foo
 ; CHECK-NOT: andq $-32, %rax
+; X32ABI-LABEL: foo
+; X32ABI-NOT: andl $-32, %eax
 }
 
 define void @foo2(i64 %h) {
   %p = alloca <2 x i64>, i64 %h, align 32
   call void @bar(<2 x i64>* %p)
   ret void
-; CHECK: foo2
+; CHECK-LABEL: foo2
 ; CHECK: andq $-32, %rsp
 ; CHECK: andq $-32, %rax
+; X32ABI-LABEL: foo2
+; X32ABI: andl $-32, %esp
+; X32ABI: andl $-32, %eax
+}
+
+define void @foo3(i64 %h) {
+  %p = alloca <2 x i64>, i64 %h
+  ret void
+; CHECK-LABEL: foo3
+; CHECK: movq %rbp, %rsp
+; X32ABI-LABEL: foo3
+; X32ABI: movl %ebp, %esp
 }

diff --git a/test/CodeGen/X86/asm-block-labels.ll b/test/CodeGen/X86/asm-block-labels.ll
index 6dbfb16..9352438 100644
--- a/test/CodeGen/X86/asm-block-labels.ll
+++ b/test/CodeGen/X86/asm-block-labels.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc -no-integrated-as
+; RUN: opt < %s -O3 | llc -no-integrated-as
 ; ModuleID = 'block12.c'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"

diff --git a/test/CodeGen/X86/atomic-load-store-wide.ll b/test/CodeGen/X86/atomic-load-store-wide.ll
index 7352d5a..ad1a5c6 100644
--- a/test/CodeGen/X86/atomic-load-store-wide.ll
+++ b/test/CodeGen/X86/atomic-load-store-wide.ll

@@ -4,16 +4,18 @@
 ; FIXME: The generated code can be substantially improved.
 
 define void @test1(i64* %ptr, i64 %val1) {
-; CHECK: test1
-; CHECK: cmpxchg8b
+; CHECK-LABEL: test1
+; CHECK: lock
+; CHECK-NEXT: cmpxchg8b
 ; CHECK-NEXT: jne
   store atomic i64 %val1, i64* %ptr seq_cst, align 8
   ret void
 }
 
 define i64 @test2(i64* %ptr) {
-; CHECK: test2
-; CHECK: cmpxchg8b
+; CHECK-LABEL: test2
+; CHECK: lock
+; CHECK-NEXT: cmpxchg8b
   %val = load atomic i64* %ptr seq_cst, align 8
   ret i64 %val
 }

diff --git a/test/CodeGen/X86/atomic-ops-ancient-64.ll b/test/CodeGen/X86/atomic-ops-ancient-64.ll
index 18749b9..508d83b 100644
--- a/test/CodeGen/X86/atomic-ops-ancient-64.ll
+++ b/test/CodeGen/X86/atomic-ops-ancient-64.ll

@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s
+; XFAIL: *
 
 define i64 @test_add(i64* %addr, i64 %inc) {
 ; CHECK-LABEL: test_add:

diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll
index bdd25e6..f60212d 100644
--- a/test/CodeGen/X86/atomic_add.ll
+++ b/test/CodeGen/X86/atomic_add.ll

@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC
 
 ; rdar://7103704
 
@@ -14,6 +15,8 @@
 entry:
 ; CHECK-LABEL: inc4:
 ; CHECK: incq
+; SLOW_INC-LABEL: inc4:
+; SLOW_INC-NOT: incq
   %0 = atomicrmw add i64* %p, i64 1 monotonic
   ret void
 }
@@ -39,6 +42,8 @@
 entry:
 ; CHECK-LABEL: inc3:
 ; CHECK: incb
+; SLOW_INC-LABEL: inc3:
+; SLOW_INC-NOT: incb
   %0 = atomicrmw add i8* %p, i8 1 monotonic
   ret void
 }
@@ -64,6 +69,8 @@
 entry:
 ; CHECK-LABEL: inc2:
 ; CHECK: incw
+; SLOW_INC-LABEL: inc2:
+; SLOW_INC-NOT: incw
   %0 = atomicrmw add i16* %p, i16 1 monotonic
   ret void
 }
@@ -89,6 +96,8 @@
 entry:
 ; CHECK-LABEL: inc1:
 ; CHECK: incl
+; SLOW_INC-LABEL: inc1:
+; SLOW_INC-NOT: incl
   %0 = atomicrmw add i32* %p, i32 1 monotonic
   ret void
 }
@@ -113,6 +122,8 @@
 entry:
 ; CHECK-LABEL: dec4:
 ; CHECK: decq
+; SLOW_INC-LABEL: dec4:
+; SLOW_INC-NOT: decq
   %0 = atomicrmw sub i64* %p, i64 1 monotonic
   ret void
 }
@@ -138,6 +149,8 @@
 entry:
 ; CHECK-LABEL: dec3:
 ; CHECK: decb
+; SLOW_INC-LABEL: dec3:
+; SLOW_INC-NOT: decb
   %0 = atomicrmw sub i8* %p, i8 1 monotonic
   ret void
 }
@@ -163,6 +176,8 @@
 entry:
 ; CHECK-LABEL: dec2:
 ; CHECK: decw
+; SLOW_INC-LABEL: dec2:
+; SLOW_INC-NOT: decw
   %0 = atomicrmw sub i16* %p, i16 1 monotonic
   ret void
 }
@@ -189,6 +204,8 @@
 entry:
 ; CHECK-LABEL: dec1:
 ; CHECK: decl
+; SLOW_INC-LABEL: dec1:
+; SLOW_INC-NOT: decl
   %0 = atomicrmw sub i32* %p, i32 1 monotonic
   ret void
 }

diff --git a/test/CodeGen/X86/atomic_idempotent.ll b/test/CodeGen/X86/atomic_idempotent.ll
new file mode 100644
index 0000000..1afc535
--- /dev/null
+++ b/test/CodeGen/X86/atomic_idempotent.ll

@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -march=x86 -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by an mfence followed by a mov.
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+
+define i8 @add8(i8* %p) {
+; CHECK-LABEL: add8
+; CHECK: mfence
+; CHECK: movb
+  %1 = atomicrmw add i8* %p, i8 0 monotonic
+  ret i8 %1
+}
+
+define i16 @or16(i16* %p) {
+; CHECK-LABEL: or16
+; CHECK: mfence
+; CHECK: movw
+  %1 = atomicrmw or i16* %p, i16 0 acquire
+  ret i16 %1
+}
+
+define i32 @xor32(i32* %p) {
+; CHECK-LABEL: xor32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw xor i32* %p, i32 0 release
+  ret i32 %1
+}
+
+define i64 @sub64(i64* %p) {
+; CHECK-LABEL: sub64
+; X64: mfence
+; X64: movq
+; X32-NOT: mfence
+  %1 = atomicrmw sub i64* %p, i64 0 seq_cst
+  ret i64 %1
+}
+
+define i128 @or128(i128* %p) {
+; CHECK-LABEL: or128
+; CHECK-NOT: mfence
+  %1 = atomicrmw or i128* %p, i128 0 monotonic
+  ret i128 %1
+}
+
+; For 'and', the idempotent value is (-1)
+define i32 @and32 (i32* %p) {
+; CHECK-LABEL: and32
+; CHECK: mfence
+; CHECK: movl
+  %1 = atomicrmw and i32* %p, i32 -1 acq_rel
+  ret i32 %1
+}

diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll
new file mode 100644
index 0000000..19e019e
--- /dev/null
+++ b/test/CodeGen/X86/atomic_mi.ll

@@ -0,0 +1,525 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix X64
+; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix X32
+; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC
+
+; This file checks that atomic (non-seq_cst) stores of immediate values are
+; done in one mov instruction and not 2. More precisely, it makes sure that the
+; immediate is not first copied uselessly into a register.
+
+; Similarly, it checks that a binary operation of an immediate with an atomic
+; variable that is stored back in that variable is done as a single instruction.
+; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release)
+; should be just an add instruction, instead of loading x into a register, doing
+; an add and storing the result back.
+; The binary operations supported are currently add, and, or, xor.
+; sub is not supported because it is translated into an addition of the
+; negated immediate.
+; Finally, we also check the same kind of pattern for inc/dec
+
+; seq_cst stores are left as (lock) xchgl, but we try to check every other
+; attribute at least once.
+
+; Please note that these operations do not require the lock prefix: only
+; sequentially consistent stores require this kind of protection on X86.
+; And even for seq_cst operations, llvm uses the xchg instruction which has
+; an implicit lock prefix, so making it explicit is not required.
+
+define void @store_atomic_imm_8(i8* %p) {
+; X64-LABEL: store_atomic_imm_8
+; X64: movb
+; X64-NOT: movb
+; X32-LABEL: store_atomic_imm_8
+; X32: movb
+; X32-NOT: movb
+  store atomic i8 42, i8* %p release, align 1
+  ret void
+}
+
+define void @store_atomic_imm_16(i16* %p) {
+; X64-LABEL: store_atomic_imm_16
+; X64: movw
+; X64-NOT: movw
+; X32-LABEL: store_atomic_imm_16
+; X32: movw
+; X32-NOT: movw
+  store atomic i16 42, i16* %p monotonic, align 2
+  ret void
+}
+
+define void @store_atomic_imm_32(i32* %p) {
+; X64-LABEL: store_atomic_imm_32
+; X64: movl
+; X64-NOT: movl
+;   On 32 bits, there is an extra movl for each of those functions
+;   (it loads the pointer argument from the stack).
+; X32-LABEL: store_atomic_imm_32
+; X32: movl 4(%esp), %eax
+; X32: movl
+; X32-NOT: movl
+  store atomic i32 42, i32* %p release, align 4
+  ret void
+}
+
+define void @store_atomic_imm_64(i64* %p) {
+; X64-LABEL: store_atomic_imm_64
+; X64: movq
+; X64-NOT: movq
+;   These are implemented with a CAS loop on 32 bit architectures, and thus
+;   cannot be optimized in the same way as the others.
+; X32-LABEL: store_atomic_imm_64
+; X32: cmpxchg8b
+  store atomic i64 42, i64* %p release, align 8
+  ret void
+}
+
+; If an immediate is too big to fit in 32 bits, it cannot be stored in one mov,
+; even on X64: one must use movabsq, which can only target a register.
+define void @store_atomic_imm_64_big(i64* %p) {
+; X64-LABEL: store_atomic_imm_64_big
+; X64: movabsq
+; X64: movq
+  store atomic i64 100000000000, i64* %p monotonic, align 8
+  ret void
+}
+
+; It would be incorrect to replace a lock xchgl by a movl
+define void @store_atomic_imm_32_seq_cst(i32* %p) {
+; X64-LABEL: store_atomic_imm_32_seq_cst
+; X64: xchgl
+; X32-LABEL: store_atomic_imm_32_seq_cst
+; X32: xchgl
+  store atomic i32 42, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- ADD -----
+
+define void @add_8(i8* %p) {
+; X64-LABEL: add_8
+; X64-NOT: lock
+; X64: addb
+; X64-NOT: movb
+; X32-LABEL: add_8
+; X32-NOT: lock
+; X32: addb
+; X32-NOT: movb
+  %1 = load atomic i8* %p seq_cst, align 1
+  %2 = add i8 %1, 2
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @add_16(i16* %p) {
+;   Currently the transformation is not done on 16 bit accesses, as the backend
+;   treats 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: add_16
+; X64-NOT: addw
+; X32-LABEL: add_16
+; X32-NOT: addw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = add i16 %1, 2
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @add_32(i32* %p) {
+; X64-LABEL: add_32
+; X64-NOT: lock
+; X64: addl
+; X64-NOT: movl
+; X32-LABEL: add_32
+; X32-NOT: lock
+; X32: addl
+; X32-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = add i32 %1, 2
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret void
+}
+
+define void @add_64(i64* %p) {
+; X64-LABEL: add_64
+; X64-NOT: lock
+; X64: addq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'addq'.
+; X32-LABEL: add_64
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = add i64 %1, 2
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @add_32_seq_cst(i32* %p) {
+; X64-LABEL: add_32_seq_cst
+; X64: xchgl
+; X32-LABEL: add_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = add i32 %1, 2
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- AND -----
+
+define void @and_8(i8* %p) {
+; X64-LABEL: and_8
+; X64-NOT: lock
+; X64: andb
+; X64-NOT: movb
+; X32-LABEL: and_8
+; X32-NOT: lock
+; X32: andb
+; X32-NOT: movb
+  %1 = load atomic i8* %p monotonic, align 1
+  %2 = and i8 %1, 2
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @and_16(i16* %p) {
+;   Currently the transformation is not done on 16 bit accesses, as the backend
+;   treats 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: and_16
+; X64-NOT: andw
+; X32-LABEL: and_16
+; X32-NOT: andw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = and i16 %1, 2
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @and_32(i32* %p) {
+; X64-LABEL: and_32
+; X64-NOT: lock
+; X64: andl
+; X64-NOT: movl
+; X32-LABEL: and_32
+; X32-NOT: lock
+; X32: andl
+; X32-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = and i32 %1, 2
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @and_64(i64* %p) {
+; X64-LABEL: and_64
+; X64-NOT: lock
+; X64: andq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'andq'.
+; X32-LABEL: and_64
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = and i64 %1, 2
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @and_32_seq_cst(i32* %p) {
+; X64-LABEL: and_32_seq_cst
+; X64: xchgl
+; X32-LABEL: and_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = and i32 %1, 2
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- OR -----
+
+define void @or_8(i8* %p) {
+; X64-LABEL: or_8
+; X64-NOT: lock
+; X64: orb
+; X64-NOT: movb
+; X32-LABEL: or_8
+; X32-NOT: lock
+; X32: orb
+; X32-NOT: movb
+  %1 = load atomic i8* %p acquire, align 1
+  %2 = or i8 %1, 2
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @or_16(i16* %p) {
+; X64-LABEL: or_16
+; X64-NOT: orw
+; X32-LABEL: or_16
+; X32-NOT: orw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = or i16 %1, 2
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @or_32(i32* %p) {
+; X64-LABEL: or_32
+; X64-NOT: lock
+; X64: orl
+; X64-NOT: movl
+; X32-LABEL: or_32
+; X32-NOT: lock
+; X32: orl
+; X32-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = or i32 %1, 2
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @or_64(i64* %p) {
+; X64-LABEL: or_64
+; X64-NOT: lock
+; X64: orq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'orq'.
+; X32-LABEL: or_64
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = or i64 %1, 2
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @or_32_seq_cst(i32* %p) {
+; X64-LABEL: or_32_seq_cst
+; X64: xchgl
+; X32-LABEL: or_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = or i32 %1, 2
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- XOR -----
+
+define void @xor_8(i8* %p) {
+; X64-LABEL: xor_8
+; X64-NOT: lock
+; X64: xorb
+; X64-NOT: movb
+; X32-LABEL: xor_8
+; X32-NOT: lock
+; X32: xorb
+; X32-NOT: movb
+  %1 = load atomic i8* %p acquire, align 1
+  %2 = xor i8 %1, 2
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @xor_16(i16* %p) {
+; X64-LABEL: xor_16
+; X64-NOT: xorw
+; X32-LABEL: xor_16
+; X32-NOT: xorw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = xor i16 %1, 2
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @xor_32(i32* %p) {
+; X64-LABEL: xor_32
+; X64-NOT: lock
+; X64: xorl
+; X64-NOT: movl
+; X32-LABEL: xor_32
+; X32-NOT: lock
+; X32: xorl
+; X32-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = xor i32 %1, 2
+  store atomic i32 %2, i32* %p release, align 4
+  ret void
+}
+
+define void @xor_64(i64* %p) {
+; X64-LABEL: xor_64
+; X64-NOT: lock
+; X64: xorq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'xorq'.
+; X32-LABEL: xor_64
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = xor i64 %1, 2
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @xor_32_seq_cst(i32* %p) {
+; X64-LABEL: xor_32_seq_cst
+; X64: xchgl
+; X32-LABEL: xor_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = xor i32 %1, 2
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- INC -----
+
+define void @inc_8(i8* %p) {
+; X64-LABEL: inc_8
+; X64-NOT: lock
+; X64: incb
+; X64-NOT: movb
+; X32-LABEL: inc_8
+; X32-NOT: lock
+; X32: incb
+; X32-NOT: movb
+; SLOW_INC-LABEL: inc_8
+; SLOW_INC-NOT: incb
+; SLOW_INC-NOT: movb
+  %1 = load atomic i8* %p seq_cst, align 1
+  %2 = add i8 %1, 1
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @inc_16(i16* %p) {
+;   Currently the transformation is not done on 16 bit accesses, as the backend
+;   treats 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: inc_16
+; X64-NOT: incw
+; X32-LABEL: inc_16
+; X32-NOT: incw
+; SLOW_INC-LABEL: inc_16
+; SLOW_INC-NOT: incw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = add i16 %1, 1
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @inc_32(i32* %p) {
+; X64-LABEL: inc_32
+; X64-NOT: lock
+; X64: incl
+; X64-NOT: movl
+; X32-LABEL: inc_32
+; X32-NOT: lock
+; X32: incl
+; X32-NOT: movl
+; SLOW_INC-LABEL: inc_32
+; SLOW_INC-NOT: incl
+; SLOW_INC-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = add i32 %1, 1
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret void
+}
+
+define void @inc_64(i64* %p) {
+; X64-LABEL: inc_64
+; X64-NOT: lock
+; X64: incq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'incq'.
+; X32-LABEL: inc_64
+; SLOW_INC-LABEL: inc_64
+; SLOW_INC-NOT: incq
+; SLOW_INC-NOT: movq
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = add i64 %1, 1
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @inc_32_seq_cst(i32* %p) {
+; X64-LABEL: inc_32_seq_cst
+; X64: xchgl
+; X32-LABEL: inc_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = add i32 %1, 1
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}
+
+; ----- DEC -----
+
+define void @dec_8(i8* %p) {
+; X64-LABEL: dec_8
+; X64-NOT: lock
+; X64: decb
+; X64-NOT: movb
+; X32-LABEL: dec_8
+; X32-NOT: lock
+; X32: decb
+; X32-NOT: movb
+; SLOW_INC-LABEL: dec_8
+; SLOW_INC-NOT: decb
+; SLOW_INC-NOT: movb
+  %1 = load atomic i8* %p seq_cst, align 1
+  %2 = sub i8 %1, 1
+  store atomic i8 %2, i8* %p release, align 1
+  ret void
+}
+
+define void @dec_16(i16* %p) {
+;   Currently the transformation is not done on 16 bit accesses, as the backend
+;   treats 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: dec_16
+; X64-NOT: decw
+; X32-LABEL: dec_16
+; X32-NOT: decw
+; SLOW_INC-LABEL: dec_16
+; SLOW_INC-NOT: decw
+  %1 = load atomic i16* %p acquire, align 2
+  %2 = sub i16 %1, 1
+  store atomic i16 %2, i16* %p release, align 2
+  ret void
+}
+
+define void @dec_32(i32* %p) {
+; X64-LABEL: dec_32
+; X64-NOT: lock
+; X64: decl
+; X64-NOT: movl
+; X32-LABEL: dec_32
+; X32-NOT: lock
+; X32: decl
+; X32-NOT: movl
+; SLOW_INC-LABEL: dec_32
+; SLOW_INC-NOT: decl
+; SLOW_INC-NOT: movl
+  %1 = load atomic i32* %p acquire, align 4
+  %2 = sub i32 %1, 1
+  store atomic i32 %2, i32* %p monotonic, align 4
+  ret void
+}
+
+define void @dec_64(i64* %p) {
+; X64-LABEL: dec_64
+; X64-NOT: lock
+; X64: decq
+; X64-NOT: movq
+;   We do not check X86-32 as it cannot do 'decq'.
+; X32-LABEL: dec_64
+; SLOW_INC-LABEL: dec_64
+; SLOW_INC-NOT: decq
+; SLOW_INC-NOT: movq
+  %1 = load atomic i64* %p acquire, align 8
+  %2 = sub i64 %1, 1
+  store atomic i64 %2, i64* %p release, align 8
+  ret void
+}
+
+define void @dec_32_seq_cst(i32* %p) {
+; X64-LABEL: dec_32_seq_cst
+; X64: xchgl
+; X32-LABEL: dec_32_seq_cst
+; X32: xchgl
+  %1 = load atomic i32* %p monotonic, align 4
+  %2 = sub i32 %1, 1
+  store atomic i32 %2, i32* %p seq_cst, align 4
+  ret void
+}

diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 1fd9085..02ea173 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll

@@ -51,46 +51,6 @@
   ret <4 x i64> %shuffle
 }
 
-;;;
-;;; Check that some 256-bit vectors are xformed into 128 ops
-; CHECK: _A
-; CHECK: vshufpd $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vshufpd $1
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @A(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: _B
-; CHECK: vshufpd $1, %ymm
-define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 undef, i32 undef, i32 6>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: movlhps
-; CHECK-NEXT: vextractf128  $1
-; CHECK-NEXT: movlhps
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 undef, i32 0, i32 undef, i32 6>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vpshufd $-96
-; CHECK: vpshufd $-6
-; CHECK: vinsertf128 $1
-define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 10, i32 10, i32 11, i32 11>
-  ret <8 x i32> %shuffle
-}
-
 ;;; Don't crash on movd
 ; CHECK: _VMOVZQI2PQI
 ; CHECK: vmovd (%

diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
deleted file mode 100644
index d2a22d7..0000000
--- a/test/CodeGen/X86/avx-blend.ll
+++ /dev/null

@@ -1,202 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx  -mattr=+avx | FileCheck %s
-
-; AVX128 tests:
-
-;CHECK-LABEL: vsel_float:
-; select mask is <i1 true, i1 false, i1 true, i1 false>.
-; Big endian representation is 0101 = 5.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; the inverted mask: 1010 = 10.
-; According to the ABI:
-; v1 is in xmm0 => first argument is xmm0.
-; v2 is in xmm1 => second argument is xmm1.
-; result is in xmm0 => destination argument.
-;CHECK: vblendps    $10, %xmm1, %xmm0, %xmm0
-;CHECK: ret
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
-  ret <4 x float> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i32:
-;CHECK: vblendps   $10, %xmm1, %xmm0, %xmm0
-;CHECK: ret
-define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
-  ret <4 x i32> %vsel
-}
-
-
-;CHECK-LABEL: vsel_double:
-;CHECK: vmovsd
-;CHECK: ret
-define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
-  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
-  ret <2 x double> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i64:
-;CHECK: vmovsd
-;CHECK: ret
-define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
-  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
-  ret <2 x i64> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i8:
-;CHECK: vpblendvb
-;CHECK: ret
-define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
-  %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
-  ret <16 x i8> %vsel
-}
-
-
-; AVX256 tests:
-
-
-;CHECK-LABEL: vsel_float8:
-;CHECK-NOT: vinsertf128
-; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
-; which translates into the boolean mask (big endian representation):
-; 00010001 = 17.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; the inverted mask: 11101110 = 238.
-;CHECK: vblendps    $238, %ymm1, %ymm0, %ymm0
-;CHECK: ret
-define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
-  ret <8 x float> %vsel
-}
-
-;CHECK-LABEL: vsel_i328:
-;CHECK-NOT: vinsertf128
-;CHECK: vblendps    $238, %ymm1, %ymm0, %ymm0
-;CHECK-NEXT: ret
-define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
-  ret <8 x i32> %vsel
-}
-
-;CHECK-LABEL: vsel_double8:
-; select mask is 2x: 0001 => intel mask: ~0001 = 14
-; ABI:
-; v1 is in ymm0 and ymm1.
-; v2 is in ymm2 and ymm3.
-; result is in ymm0 and ymm1.
-; Compute the low part: res.low = blend v1.low, v2.low, blendmask
-;CHECK: vblendpd    $14, %ymm2, %ymm0, %ymm0
-; Compute the high part.
-;CHECK: vblendpd    $14, %ymm3, %ymm1, %ymm1
-;CHECK: ret
-define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
-  ret <8 x double> %vsel
-}
-
-;CHECK-LABEL: vsel_i648:
-;CHECK: vblendpd    $14, %ymm2, %ymm0, %ymm0
-;CHECK: vblendpd    $14, %ymm3, %ymm1, %ymm1
-;CHECK: ret
-define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
-  ret <8 x i64> %vsel
-}
-
-;CHECK-LABEL: vsel_double4:
-;CHECK-NOT: vinsertf128
-;CHECK: vblendpd $10
-;CHECK-NEXT: ret
-define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
-  ret <4 x double> %vsel
-}
-
-;; TEST blend + compares
-; CHECK: testa
-define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
-  ; CHECK: vcmplepd
-  ; CHECK: vblendvpd
-  %max_is_x = fcmp oge <2 x double> %x, %y
-  %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
-  ret <2 x double> %max
-}
-
-; CHECK: testb
-define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
-  ; CHECK: vcmpnlepd
-  ; CHECK: vblendvpd
-  %min_is_x = fcmp ult <2 x double> %x, %y
-  %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
-  ret <2 x double> %min
-}
-
-; If we can figure out a blend has a constant mask, we should emit the
-; blend instruction with an immediate mask
-define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
-; CHECK-LABEL: constant_blendvpd_avx:
-; CHECK-NOT: mov
-; CHECK: vblendpd
-; CHECK: ret
-  %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
-  ret <4 x double> %1
-}
-
-define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
-; CHECK-LABEL: constant_blendvps_avx:
-; CHECK-NOT: mov
-; CHECK: vblendps
-; CHECK: ret
-  %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
-  ret <8 x float> %1
-}
-
-declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
-declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
-
-;; 4 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_shufflevector_4xfloat
-define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
-; Equivalent select mask is <i1 true, i1 false, i1 true, i1 false>.
-; Big endian representation is 0101 = 5.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; Inverted mask: 1010 = 10.
-; According to the ABI:
-; a is in xmm0 => first argument is xmm0.
-; b is in xmm1 => second argument is xmm1.
-; Result is in xmm0 => destination argument.
-; CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
-; CHECK: ret
-  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x float> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_8xfloat
-define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
-; CHECK: vblendps $190, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-  %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15>
-  ret <8 x float> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_4xdouble
-define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
-; CHECK: vblendpd $2, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-  %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-  ret <4 x double> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_4xi64
-define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
-; CHECK: vblendpd $13, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-  %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
-  ret <4 x i64> %1
-}

diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
index 3e051bf..70ec124 100644
--- a/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/test/CodeGen/X86/avx-intel-ocl.ll

@@ -89,23 +89,23 @@
 ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
 
 ; X64-LABEL: test_prolog_epilog
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
-; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Folded Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
+; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
 ; X64: call
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
-; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
    %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
    ret <16 x float> %c

diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..d2b44cd
--- /dev/null
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll

@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+  ; CHECK: vblendpd
+  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+  ; CHECK: vblendps
+  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
+  ; CHECK: vdpps
+  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+
+

diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index ce31161..ef3e83f 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll

@@ -455,21 +455,21 @@
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-  ; CHECK: vpslldq
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
-  ; CHECK: vpslldq
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+

+

+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {

+  ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]

+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]

+  ret <2 x i64> %res

+}

+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone

+

+

+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {

+  ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]

+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]

+  ret <2 x i64> %res

+}

 declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
@@ -551,21 +551,21 @@
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-  ; CHECK: vpsrldq
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
-  ; CHECK: vpsrldq
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+

+

+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {

+  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]

+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]

+  ret <2 x i64> %res

+}

+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone

+

+

+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {

+  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero

+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]

+  ret <2 x i64> %res

+}

 declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
@@ -818,18 +818,18 @@
 
 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
   ; CHECK: vblendpd
-  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: vblendps
-  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
@@ -850,35 +850,35 @@
 
 define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
   ; CHECK: vdppd
-  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: vdpps
-  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: vinsertps
-  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 
 define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: vmpsadbw
-  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
 }
-declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 
 define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
@@ -899,10 +899,10 @@
 
 define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
   ; CHECK: vpblendw
-  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
 }
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
 
 
 define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
@@ -1770,18 +1770,18 @@
 
 define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vblendpd
-  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
+  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
   ret <4 x double> %res
 }
-declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
 
 
 define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
   ; CHECK: vblendps
-  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
   ret <8 x float> %res
 }
-declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
 
 
 define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
@@ -1950,10 +1950,10 @@
 
 define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
   ; CHECK: vdpps
-  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
+  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
   ret <8 x float> %res
 }
-declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
 
 
 define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) {
@@ -2309,7 +2309,7 @@
 
 define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
   ; CHECK: vpermilpd
-  %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
 declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone
@@ -2324,7 +2324,7 @@
 
 
 define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
-  ; CHECK: vpshufd
+  ; CHECK: vpermilps
   %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }

diff --git a/test/CodeGen/X86/avx-movdup.ll b/test/CodeGen/X86/avx-movdup.ll
deleted file mode 100644
index 42d84de..0000000
--- a/test/CodeGen/X86/avx-movdup.ll
+++ /dev/null

@@ -1,34 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vmovsldup
-define <8 x float> @movdupA(<8 x float> %src) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-  ret <8 x float> %shuffle.i
-}
-
-; CHECK: vmovshdup
-define <8 x float> @movdupB(<8 x float> %src) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
-  ret <8 x float> %shuffle.i
-}
-
-; CHECK: vmovsldup
-define <4 x i64> @movdupC(<4 x i64> %src) nounwind uwtable readnone ssp {
-entry:
-  %0 = bitcast <4 x i64> %src to <8 x float>
-  %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-  %1 = bitcast <8 x float> %shuffle.i to <4 x i64>
-  ret <4 x i64> %1
-}
-
-; CHECK: vmovshdup
-define <4 x i64> @movdupD(<4 x i64> %src) nounwind uwtable readnone ssp {
-entry:
-  %0 = bitcast <4 x i64> %src to <8 x float>
-  %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
-  %1 = bitcast <8 x float> %shuffle.i to <4 x i64>
-  ret <4 x i64> %1
-}
-

diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
deleted file mode 100644
index fb2287f..0000000
--- a/test/CodeGen/X86/avx-sext.ll
+++ /dev/null

@@ -1,199 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2
-
-define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-; AVX: sext_8i16_to_8i32
-; AVX: vpmovsxwd
-
-  %B = sext <8 x i16> %A to <8 x i32>
-  ret <8 x i32>%B
-}
-
-define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-; AVX: sext_4i32_to_4i64
-; AVX: vpmovsxdq
-
-  %B = sext <4 x i32> %A to <4 x i64>
-  ret <4 x i64>%B
-}
-
-; AVX: load_sext_test1
-; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test1
-; SSSE3: movq
-; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}}
-; SSSE3: psrad $16
-; SSSE3: ret
-
-; SSE2: load_sext_test1
-; SSE2: movq
-; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}}
-; SSE2: psrad $16
-; SSE2: ret
-define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
- %X = load <4 x i16>* %ptr
- %Y = sext <4 x i16> %X to <4 x i32>
- ret <4 x i32>%Y
-}
-
-; AVX: load_sext_test2
-; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test2
-; SSSE3: movd
-; SSSE3: pshufb
-; SSSE3: psrad $24
-; SSSE3: ret
-
-; SSE2: load_sext_test2
-; SSE2: movl
-; SSE2: psrad $24
-; SSE2: ret
-define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
- %X = load <4 x i8>* %ptr
- %Y = sext <4 x i8> %X to <4 x i32>
- ret <4 x i32>%Y
-}
-
-; AVX: load_sext_test3
-; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test3
-; SSSE3: movsbq
-; SSSE3: movsbq
-; SSSE3: punpcklqdq
-; SSSE3: ret
-
-; SSE2: load_sext_test3
-; SSE2: movsbq
-; SSE2: movsbq
-; SSE2: punpcklqdq
-; SSE2: ret
-define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
- %X = load <2 x i8>* %ptr
- %Y = sext <2 x i8> %X to <2 x i64>
- ret <2 x i64>%Y
-}
-
-; AVX: load_sext_test4
-; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test4
-; SSSE3: movswq
-; SSSE3: movswq
-; SSSE3: punpcklqdq
-; SSSE3: ret
-
-; SSE2: load_sext_test4
-; SSE2: movswq
-; SSE2: movswq
-; SSE2: punpcklqdq
-; SSE2: ret
-define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
- %X = load <2 x i16>* %ptr
- %Y = sext <2 x i16> %X to <2 x i64>
- ret <2 x i64>%Y
-}
-
-; AVX: load_sext_test5
-; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test5
-; SSSE3: movslq
-; SSSE3: movslq
-; SSSE3: punpcklqdq
-; SSSE3: ret
-
-; SSE2: load_sext_test5
-; SSE2: movslq
-; SSE2: movslq
-; SSE2: punpcklqdq
-; SSE2: ret
-define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
- %X = load <2 x i32>* %ptr
- %Y = sext <2 x i32> %X to <2 x i64>
- ret <2 x i64>%Y
-}
-
-; AVX: load_sext_test6
-; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}}
-; AVX: ret
-
-; SSSE3: load_sext_test6
-; SSSE3: movq
-; SSSE3: punpcklbw
-; SSSE3: psraw $8
-; SSSE3: ret
-
-; SSE2: load_sext_test6
-; SSE2: movq
-; SSE2: punpcklbw
-; SSE2: psraw $8
-; SSE2: ret
-define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
- %X = load <8 x i8>* %ptr
- %Y = sext <8 x i8> %X to <8 x i16>
- ret <8 x i16>%Y
-}
-
-; AVX: sext_4i1_to_4i64
-; AVX: vpslld  $31
-; AVX: vpsrad  $31
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
-  %extmask = sext <4 x i1> %mask to <4 x i64>
-  ret <4 x i64> %extmask
-}
-
-; AVX-LABEL: sext_16i8_to_16i16
-; AVX: vpmovsxbw
-; AVX: vmovhlps
-; AVX: vpmovsxbw
-; AVX: ret
-define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
- %X = load <16 x i8>* %ptr
- %Y = sext <16 x i8> %X to <16 x i16>
- ret <16 x i16> %Y
-}
-
-; AVX: sext_4i8_to_4i64
-; AVX: vpslld  $24
-; AVX: vpsrad  $24
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
-  %extmask = sext <4 x i8> %mask to <4 x i64>
-  ret <4 x i64> %extmask
-}
-
-; AVX: sext_4i8_to_4i64
-; AVX: vpmovsxbd
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
- %X = load <4 x i8>* %ptr
- %Y = sext <4 x i8> %X to <4 x i64>
- ret <4 x i64>%Y
-}
-
-; AVX: sext_4i16_to_4i64
-; AVX: vpmovsxwd
-; AVX: vpmovsxdq
-; AVX: vpmovsxdq
-; AVX: ret
-define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
- %X = load <4 x i16>* %ptr
- %Y = sext <4 x i16> %X to <4 x i64>
- ret <4 x i64>%Y
-}

diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
deleted file mode 100644
index 4a996d7..0000000
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ /dev/null

@@ -1,336 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; PR11102
-define <4 x float> @test1(<4 x float> %a) nounwind {
-  %b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef>
-  ret <4 x float> %b
-; CHECK-LABEL: test1:
-;; TODO: This test could be improved by removing the xor instruction and
-;; having vinsertps zero out the needed elements.
-; CHECK: vxorps
-; CHECK: vinsertps
-}
-
-; rdar://10538417
-define <3 x i64> @test2(<2 x i64> %v) nounwind readnone {
-; CHECK-LABEL: test2:
-; CHECK: vinsertf128
-  %1 = shufflevector <2 x i64> %v, <2 x i64> %v, <3 x i32> <i32 0, i32 1, i32 undef>
-  %2 = shufflevector <3 x i64> zeroinitializer, <3 x i64> %1, <3 x i32> <i32 3, i32 4, i32 2>
-  ret <3 x i64> %2
-; CHECK: ret
-}
-
-define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind {
-  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 undef>
-  ret <4 x i64> %c
-; CHECK-LABEL: test3:
-; CHECK: vblendpd
-; CHECK: ret
-}
-
-define <8 x float> @test4(float %a) nounwind {
-  %b = insertelement <8 x float> zeroinitializer, float %a, i32 0
-  ret <8 x float> %b
-; CHECK-LABEL: test4:
-; CHECK: vinsertf128
-}
-
-; rdar://10594409
-define <8 x float> @test5(float* nocapture %f) nounwind uwtable readonly ssp {
-entry:
-  %0 = bitcast float* %f to <4 x float>*
-  %1 = load <4 x float>* %0, align 16
-; CHECK: test5
-; CHECK: vmovaps
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
-  %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
-  ret <8 x float> %shuffle.i
-}
-
-define <4 x double> @test6(double* nocapture %d) nounwind uwtable readonly ssp {
-entry:
-  %0 = bitcast double* %d to <2 x double>*
-  %1 = load <2 x double>* %0, align 16
-; CHECK: test6
-; CHECK: vmovaps
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
-  %shuffle.i = shufflevector <2 x double> %1, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-  ret <4 x double> %shuffle.i
-}
-
-define <16 x i16> @test7(<4 x i16> %a) nounwind {
-; CHECK: test7
-  %b = shufflevector <4 x i16> %a, <4 x i16> undef, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK: ret
-  ret <16 x i16> %b
-}
-
-; CHECK: test8
-define void @test8() {
-entry:
-  %0 = load <16 x i64> addrspace(1)* null, align 128
-  %1 = shufflevector <16 x i64> <i64 undef, i64 undef, i64 0, i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> %0, <16 x i32> <i32 17, i32 18, i32 2, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 26>
-  %2 = shufflevector <16 x i64> %1, <16 x i64> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 30, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 22, i32 20, i32 15>
-  store <16 x i64> %2, <16 x i64> addrspace(1)* undef, align 128
-; CHECK: ret
-  ret void
-}
-
-; Extract a value from a shufflevector..
-define i32 @test9(<4 x i32> %a) nounwind {
-; CHECK: test9
-; CHECK: vpextrd
-  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4>
-  %r = extractelement <8 x i32> %b, i32 2
-; CHECK: ret
-  ret i32 %r
-}
-
-; Extract a value which is the result of an undef mask.
-define i32 @test10(<4 x i32> %a) nounwind {
-; CHECK: @test10
-; CHECK-NOT: {{^[^#]*[a-z]}}
-; CHECK: ret
-  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %r = extractelement <8 x i32> %b, i32 2
-  ret i32 %r
-}
-
-define <4 x float> @test11(<4 x float> %a) nounwind  {
-; CHECK: test11
-; CHECK: vpshufd $27
-  %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ret <4 x float> %tmp1
-}
-
-define <4 x float> @test12(<4 x float>* %a) nounwind  {
-; CHECK: test12
-; CHECK: vpshufd
-  %tmp0 = load <4 x float>* %a
-  %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ret <4 x float> %tmp1
-}
-
-define <4 x i32> @test13(<4 x i32> %a) nounwind  {
-; CHECK: test13
-; CHECK: vpshufd $27
-  %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test14(<4 x i32>* %a) nounwind  {
-; CHECK: test14
-; CHECK: vpshufd $27, (
-  %tmp0 = load <4 x i32>* %a
-  %tmp1 = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ret <4 x i32> %tmp1
-}
-
-; CHECK: test15
-; CHECK: vpshufd $8
-; CHECK: ret
-define <4 x i32> @test15(<2 x i32>%x) nounwind readnone {
-  %x1 = shufflevector <2 x i32> %x, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  ret <4 x i32>%x1
-}
-
-; rdar://10974078
-define <8 x float> @test16(float* nocapture %f) nounwind uwtable readonly ssp {
-entry:
-  %0 = bitcast float* %f to <4 x float>*
-  %1 = load <4 x float>* %0, align 8
-; CHECK: test16
-; CHECK: vmovups
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
-  %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
-  ret <8 x float> %shuffle.i
-}
-
-; PR12413
-; CHECK: shuf1
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpshufb
-define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) {
-entry:
- %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
- ret <32 x i8> %0
-}
-
-; handle the case where only half of the 256-bits is splittable
-; CHECK: shuf2
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpextrb
-; CHECK: vpextrb
-define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) {
-entry:
- %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
- ret <32 x i8> %0
-}
-
-; CHECK: blend1
-; CHECK: vblendps
-; CHECK: ret
-define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-  ret <4 x i32> %t
-}
-
-; CHECK: blend2
-; CHECK: vblendps
-; CHECK: ret
-define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x i32> %t
-}
-
-; CHECK: blend2a
-; CHECK: vblendps
-; CHECK: ret
-define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x float> %t
-}
-
-; CHECK: blend3
-; CHECK-NOT: vblendps
-; CHECK: ret
-define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 2, i32 7>
-  ret <4 x i32> %t
-}
-
-; CHECK: blend4
-; CHECK: vblendpd
-; CHECK: ret
-define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-  ret <4 x i64> %t
-}
-
-; CHECK: narrow
-; CHECK: vpermilps
-; CHECK: ret
-define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef>
-  ret <16 x i16> %t
-}
-
-;CHECK-LABEL: test17:
-;CHECK-NOT: vinsertf128
-;CHECK: ret
-define   <8 x float> @test17(<4 x float> %y) {
-  %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <8 x float> %x
-}
-
-; CHECK: test18
-; CHECK: vmovshdup
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
-  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  ret <8 x float>%S
-}
-
-; CHECK: test19
-; CHECK: vmovsldup
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
-  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  ret <8 x float>%S
-}
-
-; rdar://12684358
-; Make sure loads happen before stores.
-; CHECK: swap8doubles
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
-; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
-; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
-; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
-; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
-; CHECK: vextractf128
-; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
-; CHECK: vextractf128
-; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
-; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
-define void @swap8doubles(double* nocapture %A, double* nocapture %C) nounwind uwtable ssp {
-entry:
-  %add.ptr = getelementptr inbounds double* %A, i64 2
-  %v.i = bitcast double* %A to <2 x double>*
-  %0 = load <2 x double>* %v.i, align 1
-  %shuffle.i.i = shufflevector <2 x double> %0, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-  %v1.i = bitcast double* %add.ptr to <2 x double>*
-  %1 = load <2 x double>* %v1.i, align 1
-  %2 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i, <2 x double> %1, i8 1) nounwind
-  %add.ptr1 = getelementptr inbounds double* %A, i64 6
-  %add.ptr2 = getelementptr inbounds double* %A, i64 4
-  %v.i27 = bitcast double* %add.ptr2 to <2 x double>*
-  %3 = load <2 x double>* %v.i27, align 1
-  %shuffle.i.i28 = shufflevector <2 x double> %3, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-  %v1.i29 = bitcast double* %add.ptr1 to <2 x double>*
-  %4 = load <2 x double>* %v1.i29, align 1
-  %5 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i28, <2 x double> %4, i8 1) nounwind
-  %6 = bitcast double* %C to <4 x double>*
-  %7 = load <4 x double>* %6, align 32
-  %add.ptr5 = getelementptr inbounds double* %C, i64 4
-  %8 = bitcast double* %add.ptr5 to <4 x double>*
-  %9 = load <4 x double>* %8, align 32
-  %shuffle.i26 = shufflevector <4 x double> %7, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  %10 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %7, i8 1)
-  %shuffle.i = shufflevector <4 x double> %9, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  %11 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %9, i8 1)
-  store <2 x double> %shuffle.i26, <2 x double>* %v.i, align 16
-  store <2 x double> %10, <2 x double>* %v1.i, align 16
-  store <2 x double> %shuffle.i, <2 x double>* %v.i27, align 16
-  store <2 x double> %11, <2 x double>* %v1.i29, align 16
-  store <4 x double> %2, <4 x double>* %6, align 32
-  store <4 x double> %5, <4 x double>* %8, align 32
-  ret void
-}
-declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-; this test case just should not fail
-define void @test20() {
-  %a0 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double 0.000000e+00, i32 2
-  store <3 x double> %a0, <3 x double>* undef, align 1
-  %a1 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double undef, i32 2
-  store <3 x double> %a1, <3 x double>* undef, align 1
-  ret void
-}
-
-define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
-; CHECK-LABEL: test_insert_64_zext
-; CHECK-NOT: xor
-; CHECK: vmovq
-  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
-  ret <2 x i64> %1
-}
-
-;; Ensure we don't use insertps from non v4x32 vectors.
-;; On SSE4.1 it works because bigger vectors use more than 1 register.
-;; On AVX they get passed in a single register.
-;; FIXME: We could probably optimize this case, if we're only using the
-;; first 4 indices.
-define <4 x i32> @insert_from_diff_size(<8 x i32> %x) {
-; CHECK-LABEL: insert_from_diff_size:
-; CHECK-NOT: insertps
-; CHECK: ret
-  %vecext = extractelement <8 x i32> %x, i32 0
-  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
-  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
-  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
-  %a.0 = extractelement <8 x i32> %x, i32 0
-  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3
-  ret <4 x i32> %vecinit3
-}

diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index b1b2f8b..98c1645 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll

@@ -1,9 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
 
-; CHECK: vpunpcklbw %xmm
-; CHECK-NEXT: vpunpckhbw %xmm
-; CHECK-NEXT: vpshufd $85
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
 ; CHECK-NEXT: vinsertf128 $1
 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
 entry:
@@ -11,8 +9,7 @@
   ret <32 x i8> %shuffle
 }
 
-; CHECK: vpunpckhwd %xmm
-; CHECK-NEXT: vpshufd $85
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11]
 ; CHECK-NEXT: vinsertf128 $1
 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
 entry:
@@ -21,7 +18,7 @@
 }
 
 ; CHECK: vmovq
-; CHECK-NEXT: vmovlhps %xmm
+; CHECK-NEXT: vunpcklpd %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
@@ -32,7 +29,7 @@
   ret <4 x i64> %vecinit6.i
 }
 
-; CHECK: vpermilpd $0
+; CHECK: vunpcklpd %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
 entry:
@@ -72,7 +69,7 @@
   ret <8 x float> %load_broadcast12281250
 }
 
-; CHECK: vpshufd $0
+; CHECK: vpermilps $4
 ; CHECK-NEXT: vinsertf128 $1
 define <8 x float> @funcF(i32 %val) nounwind {
   %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
@@ -81,7 +78,7 @@
   ret <8 x float> %tmp
 }
 
-; CHECK: vpshufd  $0
+; CHECK: vpermilps $0
 ; CHECK-NEXT: vinsertf128  $1
 define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:
@@ -90,7 +87,7 @@
 }
 
 ; CHECK: vextractf128  $1
-; CHECK-NEXT: vpshufd
+; CHECK-NEXT: vpermilps $85
 ; CHECK-NEXT: vinsertf128  $1
 define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:

diff --git a/test/CodeGen/X86/avx-vmovddup.ll b/test/CodeGen/X86/avx-vmovddup.ll
deleted file mode 100644
index 1c56fe2..0000000
--- a/test/CodeGen/X86/avx-vmovddup.ll
+++ /dev/null

@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vmovddup %ymm
-define <4 x i64> @A(<4 x i64> %a) {
-  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-  ret <4 x i64> %c
-}
-
-; CHECK: vmovddup (%
-define <4 x i64> @B(<4 x i64>* %ptr) {
-  %a = load <4 x i64>* %ptr
-  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-  ret <4 x i64> %c
-}

diff --git a/test/CodeGen/X86/avx-vperm2f128.ll b/test/CodeGen/X86/avx-vperm2f128.ll
deleted file mode 100644
index c20775b..0000000
--- a/test/CodeGen/X86/avx-vperm2f128.ll
+++ /dev/null

@@ -1,69 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: _A
-; CHECK: vperm2f128 $1
-define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: _B
-; CHECK: vblendps $240
-define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: _C
-; CHECK: vperm2f128 $0
-define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: _D
-; CHECK: vperm2f128 $17
-define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: _E
-; CHECK: vperm2f128 $17
-define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  ret <32 x i8> %shuffle
-}
-
-; CHECK: _E2
-; CHECK: vperm2f128 $3
-define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
-  ret <4 x i64> %shuffle
-}
-
-;;;; Cases with undef indicies mixed in the mask
-
-; CHECK: _F
-; CHECK: vperm2f128 $33
-define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
-  ret <8 x float> %shuffle
-}
-
-;;;; Cases we must not select vperm2f128
-
-; CHECK: _G
-; CHECK-NOT: vperm2f128
-define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
-  ret <8 x float> %shuffle
-}

diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
new file mode 100644
index 0000000..a103405
--- /dev/null
+++ b/test/CodeGen/X86/avx-vperm2x128.ll

@@ -0,0 +1,202 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: A:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: B:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: C:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: D:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: E:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: E2:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: Ei:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: Ei:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    retq
+entry:
+  ; add forces execution domain
+  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E2i:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: E2i:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX2-NEXT:    retq
+entry:
+  ; add forces execution domain
+  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
+  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E3i:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: E3i:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT:    retq
+entry:
+  ; add forces execution domain
+  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i32> %shuffle
+}
+
+define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E4i:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: E4i:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  ; add forces execution domain
+  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: E5i:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vmovaps (%rsi), %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: E5i:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %c = load <16 x i16>* %a
+  %d = load <16 x i16>* %b
+  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <16 x i16> %shuffle
+}
+
+;;;; Cases with undef indices mixed in the mask
+
+define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; ALL-LABEL: F:
+; ALL:       ## BB#0: ## %entry
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[0,1,0,1]
+; ALL-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
+  ret <8 x float> %shuffle
+}
+
+;;;; Cases we must not select vperm2f128
+
+define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
+; AVX1-LABEL: G:
+; AVX1:       ## BB#0: ## %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: G:
+; AVX2:       ## BB#0: ## %entry
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX2-NEXT:    retq
+entry:
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
+  ret <8 x float> %shuffle
+}

diff --git a/test/CodeGen/X86/avx-vpermil.ll b/test/CodeGen/X86/avx-vpermil.ll
deleted file mode 100644
index b7f8d72..0000000
--- a/test/CodeGen/X86/avx-vpermil.ll
+++ /dev/null

@@ -1,54 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vpermilps
-define <8 x float> @funcA(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x double> @funcB(<4 x double> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vpermilps
-define <8 x i32> @funcC(<8 x i32> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5>
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x i64> @funcD(<4 x i64> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x i64> @funcQ(<4 x i64>* %a) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x i64>* %a
-  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
-  ret <4 x i64> %shuffle
-}
-
-; vpermil should match masks like this: <u,3,1,2,4,u,5,6>. Check that the
-; target specific mask was correctly generated.
-; CHECK: vpermilps $-100
-define <8 x float> @funcE(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 8, i32 3, i32 1, i32 2, i32 4, i32 8, i32 5, i32 6>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: palignr $8
-; CHECK: palignr $8
-define <8 x float> @funcF(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-  ret <8 x float> %shuffle
-}

diff --git a/test/CodeGen/X86/avx-vshufp.ll b/test/CodeGen/X86/avx-vshufp.ll
deleted file mode 100644
index ad3dbc1..0000000
--- a/test/CodeGen/X86/avx-vshufp.ll
+++ /dev/null

@@ -1,157 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vshufps  $-53, %ymm
-define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vshufps  $-53, (%{{.*}}), %ymm
-define <8 x float> @A2(<8 x float>* %a, <8 x float>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <8 x float>* %a
-  %b2 = load <8 x float>* %b
-  %shuffle = shufflevector <8 x float> %a2, <8 x float> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vshufps  $-53, %ymm
-define <8 x i32> @A3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vshufps  $-53, (%{{.*}}), %ymm
-define <8 x i32> @A4(<8 x i32>* %a, <8 x i32>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <8 x i32>* %a
-  %b2 = load <8 x i32>* %b
-  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vblendpd  $10, %ymm
-define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vblendpd  $10, (%{{.*}}), %ymm
-define <4 x double> @B2(<4 x double>* %a, <4 x double>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x double>* %a
-  %b2 = load <4 x double>* %b
-  %shuffle = shufflevector <4 x double> %a2, <4 x double> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vblendpd  $10, %ymm
-define <4 x i64> @B3(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vblendpd  $10, (%{{.*}}), %ymm
-define <4 x i64> @B4(<4 x i64>* %a, <4 x i64>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x i64>* %a
-  %b2 = load <4 x i64>* %b
-  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vshufps  $-53, %ymm
-define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 undef, i32 undef, i32 11, i32 undef, i32 6, i32 12, i32 undef>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vblendpd  $2, %ymm
-define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 undef>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vshufps $-55, %ymm
-define <8 x float> @E(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 10, i32 0, i32 3, i32 13, i32 14, i32 4, i32 7>
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vshufpd  $8, %ymm
-define <4 x double> @F(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vshufps  $-53, %xmm
-define <4 x float> @A128(<4 x float> %a, <4 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
-  ret <4 x float> %shuffle
-}
-
-; CHECK: vshufps  $-53, (%{{.*}}), %xmm
-define <4 x float> @A2128(<4 x float>* %a, <4 x float>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x float>* %a
-  %b2 = load <4 x float>* %b
-  %shuffle = shufflevector <4 x float> %a2, <4 x float> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
-  ret <4 x float> %shuffle
-}
-
-; CHECK: vshufps  $-53, %xmm
-define <4 x i32> @A3128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
-  ret <4 x i32> %shuffle
-}
-
-; CHECK: vshufps  $-53, (%{{.*}}), %xmm
-define <4 x i32> @A4128(<4 x i32>* %a, <4 x i32>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x i32>* %a
-  %b2 = load <4 x i32>* %b
-  %shuffle = shufflevector <4 x i32> %a2, <4 x i32> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7>
-  ret <4 x i32> %shuffle
-}
-
-; CHECK: vshufpd  $1, %xmm
-define <2 x double> @B128(<2 x double> %a, <2 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
-  ret <2 x double> %shuffle
-}
-
-; CHECK: vshufpd  $1, (%{{.*}}), %xmm
-define <2 x double> @B2128(<2 x double>* %a, <2 x double>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <2 x double>* %a
-  %b2 = load <2 x double>* %b
-  %shuffle = shufflevector <2 x double> %a2, <2 x double> %b2, <2 x i32> <i32 1, i32 2>
-  ret <2 x double> %shuffle
-}
-
-; CHECK: vshufpd  $1, %xmm
-define <2 x i64> @B3128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
-  ret <2 x i64> %shuffle
-}
-
-; CHECK: vshufpd  $1, (%{{.*}}), %xmm
-define <2 x i64> @B4128(<2 x i64>* %a, <2 x i64>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <2 x i64>* %a
-  %b2 = load <2 x i64>* %b
-  %shuffle = shufflevector <2 x i64> %a2, <2 x i64> %b2, <2 x i32> <i32 1, i32 2>
-  ret <2 x i64> %shuffle
-}

diff --git a/test/CodeGen/X86/avx-zext.ll b/test/CodeGen/X86/avx-zext.ll
deleted file mode 100644
index 7511746..0000000
--- a/test/CodeGen/X86/avx-zext.ll
+++ /dev/null

@@ -1,41 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-;CHECK-LABEL: zext_8i16_to_8i32:
-;CHECK: vpunpckhwd
-;CHECK: ret
-
-  %B = zext <8 x i16> %A to <8 x i32>
-  ret <8 x i32>%B
-}
-
-define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-;CHECK-LABEL: zext_4i32_to_4i64:
-;CHECK: vpunpckhdq
-;CHECK: ret
-
-  %B = zext <4 x i32> %A to <4 x i64>
-  ret <4 x i64>%B
-}
-
-define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
-;CHECK-LABEL: zext_8i8_to_8i32:
-;CHECK: vpunpckhwd
-;CHECK: vpmovzxwd
-;CHECK: vinsertf128
-;CHECK: ret
-  %t = zext <8 x i8> %z to <8 x i32>
-  ret <8 x i32> %t
-}
-
-; PR17654
-define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
-; CHECK-LABEL: zext_16i8_to_16i16:
-; CHECK: vpxor
-; CHECK: vpunpckhbw
-; CHECK: vpunpcklbw
-; CHECK: vinsertf128
-; CHECK: ret
-  %t = zext <16 x i8> %z to <16 x i16>
-  ret <16 x i16> %t
-}

diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
index 6069c14..cba6d98 100644
--- a/test/CodeGen/X86/avx.ll
+++ b/test/CodeGen/X86/avx.ll

@@ -60,7 +60,7 @@
 ; X32: movl    8(%esp), %ecx
 ; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: vinsertps    $192, 12(%{{...}},%{{...}}), %
+; CHECK: vinsertps    $-64, 12(%{{...}},%{{...}}), %
 ; CHECK-NEXT: ret
   %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
   %2 = load <4 x float>* %1, align 16

diff --git a/test/CodeGen/X86/avx1-stack-reload-folding.ll b/test/CodeGen/X86/avx1-stack-reload-folding.ll
new file mode 100644
index 0000000..2e669b0
--- /dev/null
+++ b/test/CodeGen/X86/avx1-stack-reload-folding.ll

@@ -0,0 +1,68 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s

+

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

+target triple = "x86_64-unknown-unknown"

+

+; Stack reload folding tests - we use the 'big vectors' pattern to guarantee spilling to stack.

+;

+; Many of these tests are primarily to check memory folding with specific instructions. Using a basic

+; load/cvt/store pattern to test for this would mean that it wouldn't be the memory folding code that's

+; being tested - the load-execute version of the instruction from the tables would be matched instead.

+

+define void @stack_fold_vmulpd(<64 x double>* %a, <64 x double>* %b, <64 x double>* %c) {

+  ;CHECK-LABEL: stack_fold_vmulpd

+  ;CHECK:       vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload

+

+  %1 = load <64 x double>* %a

+  %2 = load <64 x double>* %b

+  %3 = fadd <64 x double> %1, %2

+  %4 = fsub <64 x double> %1, %2

+  %5 = fmul <64 x double> %3, %4

+  store <64 x double> %5, <64 x double>* %c

+  ret void

+}

+

+define void @stack_fold_cvtdq2ps(<128 x i32>* %a, <128 x i32>* %b, <128 x float>* %c) {

+  ;CHECK-LABEL: stack_fold_cvtdq2ps

+  ;CHECK:   vcvtdq2ps {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload

+

+  %1 = load <128 x i32>* %a

+  %2 = load <128 x i32>* %b

+  %3 = and <128 x i32> %1, %2

+  %4 = xor <128 x i32> %1, %2

+  %5 = sitofp <128 x i32> %3 to <128 x float>

+  %6 = sitofp <128 x i32> %4 to <128 x float>

+  %7 = fadd <128 x float> %5, %6

+  store <128 x float> %7, <128 x float>* %c

+  ret void

+}

+

+define void @stack_fold_cvttpd2dq(<64 x double>* %a, <64 x double>* %b, <64 x i32>* %c) #0 {

+  ;CHECK-LABEL: stack_fold_cvttpd2dq

+  ;CHECK:  vcvttpd2dqy {{[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload

+

+  %1 = load <64 x double>* %a

+  %2 = load <64 x double>* %b

+  %3 = fadd <64 x double> %1, %2

+  %4 = fsub <64 x double> %1, %2

+  %5 = fptosi <64 x double> %3 to <64 x i32>

+  %6 = fptosi <64 x double> %4 to <64 x i32>

+  %7 = or <64 x i32> %5, %6

+  store <64 x i32> %7, <64 x i32>* %c

+  ret void

+}

+

+define void @stack_fold_cvttps2dq(<128 x float>* %a, <128 x float>* %b, <128 x i32>* %c) #0 {

+  ;CHECK-LABEL: stack_fold_cvttps2dq

+  ;CHECK:   vcvttps2dq {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload

+

+  %1 = load <128 x float>* %a

+  %2 = load <128 x float>* %b

+  %3 = fadd <128 x float> %1, %2

+  %4 = fsub <128 x float> %1, %2

+  %5 = fptosi <128 x float> %3 to <128 x i32>

+  %6 = fptosi <128 x float> %4 to <128 x i32>

+  %7 = or <128 x i32> %5, %6

+  store <128 x i32> %7, <128 x i32>* %c

+  ret void

+}


diff --git a/test/CodeGen/X86/avx2-blend.ll b/test/CodeGen/X86/avx2-blend.ll
deleted file mode 100644
index b02442b..0000000
--- a/test/CodeGen/X86/avx2-blend.ll
+++ /dev/null

@@ -1,11 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
-
-define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
-; CHECK-LABEL: constant_pblendvb_avx2:
-; CHECK: vmovdqa
-; CHECK: vpblendvb
-  %1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
-  ret <32 x i8> %1
-}
-
-declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)

diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..ac2c73b
--- /dev/null
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll

@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=core-avx2 -mattr=avx2 | FileCheck %s
+
+define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+  ; CHECK: vpblendw
+  %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone
+
+
+define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: vpblendd
+  %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+  ; CHECK: vpblendd
+  %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
+  ; CHECK: vmpsadbw
+  %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
+

diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index ab3d591..84b22b7 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll

@@ -158,21 +158,21 @@
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
-  ; CHECK: vpslldq
-  %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpslldq
-  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
+

+

+define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {

+  ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]

+  %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]

+  ret <4 x i64> %res

+}

+declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone

+

+

+define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {

+  ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]

+  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]

+  ret <4 x i64> %res

+}

 declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
@@ -254,21 +254,21 @@
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
-  ; CHECK: vpsrldq
-  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpsrldq
-  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
+

+

+define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {

+  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]

+  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]

+  ret <4 x i64> %res

+}

+declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone

+

+

+define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {

+  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero

+  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]

+  ret <4 x i64> %res

+}

 declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
@@ -475,10 +475,10 @@
 
 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
   ; CHECK: vmpsadbw
-  %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+  %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
-declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
@@ -499,10 +499,10 @@
 
 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
   ; CHECK: vpblendw
-  %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+  %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
-declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
 
 
 define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
@@ -706,18 +706,18 @@
 
 define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
   ; CHECK: vpblendd
-  %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
+  %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
-declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
   ; CHECK: vpblendd
-  %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
+  %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
-declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
 
 
 define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {

diff --git a/test/CodeGen/X86/avx2-palignr.ll b/test/CodeGen/X86/avx2-palignr.ll
deleted file mode 100644
index 83573dc..0000000
--- a/test/CodeGen/X86/avx2-palignr.ll
+++ /dev/null

@@ -1,57 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-define <8 x i32> @test1(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: vpalignr $4
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
-  ret <8 x i32> %C
-}
-
-define <8 x i32> @test2(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: vpalignr $4
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 undef, i32 12>
-  ret <8 x i32> %C
-}
-
-define <8 x i32> @test3(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test3:
-; CHECK: vpalignr $4
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
-  ret <8 x i32> %C
-}
-;
-define <8 x i32> @test4(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test4:
-; CHECK: vpalignr $8
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 10, i32 11, i32 undef, i32 1, i32 14, i32 15, i32 4, i32 5>
-  ret <8 x i32> %C
-}
-
-define <16 x i16> @test5(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test5:
-; CHECK: vpalignr $6
-  %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 undef, i32 6, i32 7, i32 16, i32 17, i32 18, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26>
-  ret <16 x i16> %C
-}
-
-define <16 x i16> @test6(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test6:
-; CHECK: vpalignr $6
-  %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26>
-  ret <16 x i16> %C
-}
-
-define <16 x i16> @test7(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test7:
-; CHECK: vpalignr $6
-  %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <16 x i16> %C
-}
-
-define <32 x i8> @test8(<32 x i8> %A, <32 x i8> %B) nounwind {
-; CHECK-LABEL: test8:
-; CHECK: vpalignr $5
-  %C = shufflevector <32 x i8> %A, <32 x i8> %B, <32 x i32> <i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
-  ret <32 x i8> %C
-}

diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
deleted file mode 100644
index 185b989..0000000
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ /dev/null

@@ -1,127 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; Make sure that we don't match this shuffle using the vpblendw YMM instruction.
-; The mask for the vpblendw instruction needs to be identical for both halves
-; of the YMM. Need to use two vpblendw instructions.
-
-; CHECK: vpblendw_test1
-; mask = 10010110,b = 150,d
-; CHECK: vpblendw  $150, %ymm
-; CHECK: ret
-define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3,  i32 20, i32 5,  i32 6,  i32 23, 
-                                                               i32 8, i32 25, i32 26, i32 11, i32 28, i32 13, i32 14, i32 31>
-  ret <16 x i16> %t
-}
-
-; CHECK: vpblendw_test2
-; mask1 = 00010110 = 22
-; mask2 = 10000000 = 128
-; CHECK: vpblendw  $128, %xmm
-; CHECK: vpblendw  $22, %xmm
-; CHECK: vinserti128
-; CHECK: ret
-define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, 
-                                                               i32 8, i32 9,  i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
-  ret <16 x i16> %t
-}
-
-; CHECK: blend_test1
-; CHECK: vpblendd
-; CHECK: ret
-define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
-  ret <8 x i32> %t
-}
-
-; CHECK: blend_test2
-; CHECK: vpblendd
-; CHECK: ret
-define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
-  ret <8 x i32> %t
-}
-
-
-; CHECK: blend_test3
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline {
-  %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7>
-  ret <8 x float> %t
-}
-
-; CHECK: blend_test4
-; CHECK: vblendpd
-; CHECK: ret
-define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-  ret <4 x i64> %t
-}
-
-;; 2 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_test5
-; CHECK: vpblendd $10, %xmm1, %xmm0, %xmm0
-; CHECK: ret
-define <4 x i32> @blend_test5(<4 x i32> %a, <4 x i32> %b) {
-  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x i32> %1
-}
-
-; CHECK-LABEL: @blend_test6
-; CHECK: vpblendw $134, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-define <16 x i16> @blend_test6(<16 x i16> %a, <16 x i16> %b) {
-  %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32  3, i32  4, i32  5, i32  6, i32 23,
-                                                               i32 8, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 31>
-  ret <16 x i16> %1
-}
-
-; CHECK: vpshufhw $27, %ymm
-define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpshuflw $27, %ymm
-define <16 x i16> @vpshuflw(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpshufb_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
-  %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
-                                                                i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,  
-                                                                i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
-                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
-  ret <32 x i8>%S
-}
-
-; CHECK: vpshufb1_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
-  %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
-                                                                i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,  
-                                                                i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
-                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
-  ret <32 x i8>%S
-}
-
-
-; CHECK: vpshufb2_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
-  %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
-                                                                i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,  
-                                                                i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
-                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
-  ret <32 x i8>%S
-}

diff --git a/test/CodeGen/X86/avx2-unpack.ll b/test/CodeGen/X86/avx2-unpack.ll
deleted file mode 100644
index 6d17443..0000000
--- a/test/CodeGen/X86/avx2-unpack.ll
+++ /dev/null

@@ -1,86 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; CHECK: vpunpckhdq
-define <8 x i32> @unpackhidq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpckhqdq
-define <4 x i64> @unpackhiqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckldq
-define <8 x i32> @unpacklodq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpcklqdq
-define <4 x i64> @unpacklqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckhwd
-define <16 x i16> @unpackhwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpcklwd
-define <16 x i16> @unpacklwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpckhbw
-define <32 x i8> @unpackhbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
-  ret <32 x i8> %shuffle.i
-}
-
-; CHECK: vpunpcklbw
-define <32 x i8> @unpacklbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
-  ret <32 x i8> %shuffle.i
-}
-
-; CHECK: vpunpckhdq
-define <8 x i32> @unpackhidq1_undef(<8 x i32> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpckhqdq
-define <4 x i64> @unpackhiqdq1_undef(<4 x i64> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckhwd
-define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpcklwd
-define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
-  ret <16 x i16> %shuffle.i
-}
-

diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 66f586d..924c06e 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll

@@ -317,7 +317,7 @@
 }
 
 ;CHECK-LABEL: _inreg2xdouble:
-;CHECK: vpbroadcastq
+;CHECK: vunpcklpd
 ;CHECK: ret
 define   <2 x double> @_inreg2xdouble(<2 x double> %a) {
   %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer

diff --git a/test/CodeGen/X86/avx2-vperm2i128.ll b/test/CodeGen/X86/avx2-vperm2i128.ll
deleted file mode 100644
index 1937db5..0000000
--- a/test/CodeGen/X86/avx2-vperm2i128.ll
+++ /dev/null

@@ -1,47 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; CHECK: vperm2i128 $17
-define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-entry:
-  ; add forces execution domain
-  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-  ret <32 x i8> %shuffle
-}
-
-; CHECK: vperm2i128 $3
-define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  ; add forces execution domain
-  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
-  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vperm2i128 $49
-define <8 x i32> @E3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  ; add forces execution domain
-  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vperm2i128 $2
-define <16 x i16> @E4(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
-entry:
-  ; add forces execution domain
-  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <16 x i16> %shuffle
-}
-
-; CHECK: vperm2i128 $2, (%
-define <16 x i16> @E5(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
-entry:
-  %c = load <16 x i16>* %a
-  %d = load <16 x i16>* %b
-  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <16 x i16> %shuffle
-}

diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 4d1c9f7..c43da9c 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll

@@ -1,189 +1,217 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-; CHECK-LABEL: addpd512
-; CHECK: vaddpd
-; CHECK: ret
 define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: addpd512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <8 x double> %x, %y
   ret <8 x double> %add.i
 }
 
-; CHECK-LABEL: addpd512fold
-; CHECK: vaddpd LCP{{.*}}(%rip)
-; CHECK: ret
 define <8 x double> @addpd512fold(<8 x double> %y) {
+; CHECK-LABEL: addpd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00>
   ret <8 x double> %add.i
 }
 
-; CHECK-LABEL: addps512
-; CHECK: vaddps
-; CHECK: ret
 define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: addps512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <16 x float> %x, %y
   ret <16 x float> %add.i
 }
 
-; CHECK-LABEL: addps512fold
-; CHECK: vaddps LCP{{.*}}(%rip)
-; CHECK: ret
 define <16 x float> @addps512fold(<16 x float> %y) {
+; CHECK-LABEL: addps512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000,  float 0x4002666660000000, float 0x3FF3333340000000>
   ret <16 x float> %add.i
 }
 
-; CHECK-LABEL: subpd512
-; CHECK: vsubpd
-; CHECK: ret
 define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: subpd512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <8 x double> %x, %y
   ret <8 x double> %sub.i
 }
 
-; CHECK-LABEL: @subpd512fold
-; CHECK: vsubpd (%
-; CHECK: ret
 define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
+; CHECK-LABEL: subpd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %tmp2 = load <8 x double>* %x, align 8
   %sub.i = fsub <8 x double> %y, %tmp2
   ret <8 x double> %sub.i
 }
 
-; CHECK-LABEL: @subps512
-; CHECK: vsubps
-; CHECK: ret
 define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: subps512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <16 x float> %x, %y
   ret <16 x float> %sub.i
 }
 
-; CHECK-LABEL: subps512fold
-; CHECK: vsubps (%
-; CHECK: ret
 define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
+; CHECK-LABEL: subps512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsubps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %tmp2 = load <16 x float>* %x, align 4
   %sub.i = fsub <16 x float> %y, %tmp2
   ret <16 x float> %sub.i
 }
 
-; CHECK-LABEL: imulq512
-; CHECK: vpmuludq
-; CHECK: vpmuludq
-; CHECK: ret
 define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
+; CHECK-LABEL: imulq512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2
+; CHECK-NEXT:    vpsrlq $32, %zmm0, %zmm3
+; CHECK-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
+; CHECK-NEXT:    vpsllq $32, %zmm3, %zmm3
+; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; CHECK-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpsllq $32, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %z = mul <8 x i64>%x, %y
   ret <8 x i64>%z
 }
 
-; CHECK-LABEL: mulpd512
-; CHECK: vmulpd
-; CHECK: ret
 define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: mulpd512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <8 x double> %x, %y
   ret <8 x double> %mul.i
 }
 
-; CHECK-LABEL: mulpd512fold
-; CHECK: vmulpd LCP{{.*}}(%rip)
-; CHECK: ret
 define <8 x double> @mulpd512fold(<8 x double> %y) {
+; CHECK-LABEL: mulpd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
   ret <8 x double> %mul.i
 }
 
-; CHECK-LABEL: mulps512
-; CHECK: vmulps
-; CHECK: ret
 define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: mulps512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <16 x float> %x, %y
   ret <16 x float> %mul.i
 }
 
-; CHECK-LABEL: mulps512fold
-; CHECK: vmulps LCP{{.*}}(%rip)
-; CHECK: ret
 define <16 x float> @mulps512fold(<16 x float> %y) {
+; CHECK-LABEL: mulps512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
   ret <16 x float> %mul.i
 }
 
-; CHECK-LABEL: divpd512
-; CHECK: vdivpd
-; CHECK: ret
 define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: divpd512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <8 x double> %x, %y
   ret <8 x double> %div.i
 }
 
-; CHECK-LABEL: divpd512fold
-; CHECK: vdivpd LCP{{.*}}(%rip)
-; CHECK: ret
 define <8 x double> @divpd512fold(<8 x double> %y) {
+; CHECK-LABEL: divpd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
   ret <8 x double> %div.i
 }
 
-; CHECK-LABEL: divps512
-; CHECK: vdivps
-; CHECK: ret
 define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: divps512:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <16 x float> %x, %y
   ret <16 x float> %div.i
 }
 
-; CHECK-LABEL: divps512fold
-; CHECK: vdivps LCP{{.*}}(%rip)
-; CHECK: ret
 define <16 x float> @divps512fold(<16 x float> %y) {
+; CHECK-LABEL: divps512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vdivps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000>
   ret <16 x float> %div.i
 }
 
-; CHECK-LABEL: vpaddq_test
-; CHECK: vpaddq %zmm
-; CHECK: ret
 define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpaddq_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = add <8 x i64> %i, %j
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpaddq_fold_test
-; CHECK: vpaddq (%
-; CHECK: ret
 define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
+; CHECK-LABEL: vpaddq_fold_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %tmp = load <8 x i64>* %j, align 4
   %x = add <8 x i64> %i, %tmp
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpaddq_broadcast_test
-; CHECK: vpaddq LCP{{.*}}(%rip){1to8}
-; CHECK: ret
 define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
+; CHECK-LABEL: vpaddq_broadcast_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = add <8 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpaddq_broadcast2_test
-; CHECK: vpaddq (%rdi){1to8}
-; CHECK: ret
 define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
+; CHECK-LABEL: vpaddq_broadcast2_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %tmp = load i64* %j
   %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0
   %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1
@@ -197,55 +225,67 @@
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpaddd_test
-; CHECK: vpaddd %zmm
-; CHECK: ret
 define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpaddd_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = add <16 x i32> %i, %j
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: vpaddd_fold_test
-; CHECK: vpaddd (%
-; CHECK: ret
 define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
+; CHECK-LABEL: vpaddd_fold_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %tmp = load <16 x i32>* %j, align 4
   %x = add <16 x i32> %i, %tmp
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: vpaddd_broadcast_test
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
-; CHECK: ret
 define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
+; CHECK-LABEL: vpaddd_broadcast_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: vpaddd_mask_test
-; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
-; CHECK: ret
 define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %x = add <16 x i32> %i, %j
   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_maskz_test
-; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z} }}
-; CHECK: ret
 define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %x = add <16 x i32> %i, %j
   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_mask_fold_test
-; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
-; CHECK: ret
 define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_fold_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %j = load <16 x i32>* %j.ptr
   %x = add <16 x i32> %i, %j
@@ -253,20 +293,26 @@
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_mask_broadcast_test
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
-; CHECK: ret
 define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_broadcast_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_maskz_fold_test
-; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
-; CHECK: ret
 define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_fold_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %j = load <16 x i32>* %j.ptr
   %x = add <16 x i32> %i, %j
@@ -274,125 +320,141 @@
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpaddd_maskz_broadcast_test
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
-; CHECK: ret
 define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_broadcast_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %mask = icmp ne <16 x i32> %mask1, zeroinitializer
   %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
   ret <16 x i32> %r
 }
 
-; CHECK-LABEL: vpsubq_test
-; CHECK: vpsubq %zmm
-; CHECK: ret
 define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpsubq_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = sub <8 x i64> %i, %j
   ret <8 x i64> %x
 }
 
-; CHECK-LABEL: vpsubd_test
-; CHECK: vpsubd
-; CHECK: ret
 define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpsubd_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = sub <16 x i32> %i, %j
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: vpmulld_test
-; CHECK: vpmulld %zmm
-; CHECK: ret
 define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
+; CHECK-LABEL: vpmulld_test:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %x = mul <16 x i32> %i, %j
   ret <16 x i32> %x
 }
 
-; CHECK-LABEL: sqrtA
-; CHECK: vsqrtss {{.*}} encoding: [0x62
-; CHECK: ret
 declare float @sqrtf(float) readnone
 define float @sqrtA(float %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: sqrtA:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %conv1 = tail call float @sqrtf(float %a) nounwind readnone
   ret float %conv1
 }
 
-; CHECK-LABEL: sqrtB
-; CHECK: vsqrtsd {{.*}}## encoding: [0x62
-; CHECK: ret
 declare double @sqrt(double) readnone
 define double @sqrtB(double %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: sqrtB:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %call = tail call double @sqrt(double %a) nounwind readnone
   ret double %call
 }
 
-; CHECK-LABEL: sqrtC
-; CHECK: vsqrtss {{.*}}## encoding: [0x62
-; CHECK: ret
 declare float @llvm.sqrt.f32(float)
 define float @sqrtC(float %a) nounwind {
+; CHECK-LABEL: sqrtC:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %b = call float @llvm.sqrt.f32(float %a)
   ret float %b
 }
 
-; CHECK-LABEL: sqrtD
-; CHECK: vsqrtps {{.*}}
-; CHECK: ret
 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
 define <16 x float> @sqrtD(<16 x float> %a) nounwind {
+; CHECK-LABEL: sqrtD:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
   ret <16 x float> %b
 }
 
-; CHECK-LABEL: sqrtE
-; CHECK: vsqrtpd {{.*}}
-; CHECK: ret
 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
 define <8 x double> @sqrtE(<8 x double> %a) nounwind {
+; CHECK-LABEL: sqrtE:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
   ret <8 x double> %b
 }
 
-; CHECK-LABEL: fadd_broadcast
-; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0
-; CHECK: ret
 define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
+; CHECK-LABEL: fadd_broadcast:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
   ret <16 x float> %b
 }
 
-; CHECK-LABEL: addq_broadcast
-; CHECK: vpaddq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK: ret
 define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
+; CHECK-LABEL: addq_broadcast:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
   ret <8 x i64> %b
 }
 
-; CHECK-LABEL: orq_broadcast
-; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
-; CHECK: ret
 define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
+; CHECK-LABEL: orq_broadcast:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
   ret <8 x i64> %b
 }
 
-; CHECK-LABEL: andd512fold
-; CHECK: vpandd (%
-; CHECK: ret
 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
+; CHECK-LABEL: andd512fold:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpandd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %a = load <16 x i32>* %x, align 4
   %b = and <16 x i32> %y, %a
   ret <16 x i32> %b
 }
 
-; CHECK-LABEL: andqbrst
-; CHECK: vpandq  (%rdi){1to8}, %zmm
-; CHECK: ret
 define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
+; CHECK-LABEL: andqbrst:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpandq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
 entry:
   %a = load i64* %ap, align 8
   %b = insertelement <8 x i64> undef, i64 %a, i32 0

diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index b5a2aa8..9e9ad31 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll

@@ -1,30 +1,43 @@
 ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-; CHECK-LABEL: test1
-; CHECK: vpxord
-; CHECK: ret
 define <16 x i32> @test1(i32* %x) {
+; CHECK-LABEL: test1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovd (%rdi), %xmm0
+; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7]
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
    %y = load i32* %x, align 4
    %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
    ret <16 x i32>%res
 }
 
-; CHECK-LABEL: test2
-; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
-; CHECK: ret
 define <16 x i32> @test2(<16 x i32> %x) {
+; CHECK-LABEL: test2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
    %res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
    ret <16 x i32>%res
 }
 
-; CHECK-LABEL: test3
-; CHECK: vinsertf128
-; CHECK: vinsertf64x4
-; CHECK: ret
 define <16 x float> @test3(<4 x float> %a) {
+; CHECK-LABEL: test3:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vmovss %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vmovss %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1]
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; CHECK-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = extractelement <4 x float> %a, i32 2
   %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
   %b1 = extractelement <4 x float> %a, i32 0
   %c1 = insertelement <16 x float> %c, float %b1, i32 6
   ret <16 x float>%c1
-}
\ No newline at end of file
+}

diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index 47e50a9..6e0d185 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll

@@ -28,10 +28,9 @@
   ret float %c1
 }
 
+; FIXME: Can use vcmpeqss and extract from the mask here in AVX512.
 ; CHECK-LABEL: test3
-; CHECK: vcmpeqss
-; CHECK: kmov
-; CHECK: ret
+; CHECK: vucomiss {{.*}}encoding: [0x62
 define i32 @test3(float %a, float %b) {
 
   %cmp10.i = fcmp oeq float %a, %b
@@ -86,3 +85,17 @@
   %res = select i1 %tmp5, i32 1, i32 %a3
   ret i32 %res
 }
+
+; CHECK-LABEL: test9
+; CHECK: testb
+; CHECK-NOT: kmov
+; CHECK: ret
+define i32 @test9(i64 %a) {
+ %b = and i64 %a, 1
+ %cmp10.i = icmp eq i64 %b, 0
+ br i1 %cmp10.i, label %A, label %B
+A:
+ ret i32 6
+B:
+ ret i32 7
+}

diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index f5cda96..2b672a7 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll

@@ -255,3 +255,56 @@
   %b = uitofp i32 %a to double
   ret double %b
 }
+
+; CHECK-LABEL: @sitofp_16i1_float
+; CHECK: vpbroadcastd
+; CHECK: vcvtdq2ps
+define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
+  %mask = icmp slt <16 x i32> %a, zeroinitializer
+  %1 = sitofp <16 x i1> %mask to <16 x float>
+  ret <16 x float> %1
+}
+
+; CHECK-LABEL: @sitofp_16i8_float
+; CHECK: vpmovsxbd
+; CHECK: vcvtdq2ps
+define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
+  %1 = sitofp <16 x i8> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+; CHECK-LABEL: @sitofp_16i16_float
+; CHECK: vpmovsxwd
+; CHECK: vcvtdq2ps
+define <16 x float> @sitofp_16i16_float(<16 x i16> %a) {
+  %1 = sitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+; CHECK-LABEL: @sitofp_8i16_double
+; CHECK: vpmovsxwd
+; CHECK: vcvtdq2pd
+define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
+  %1 = sitofp <8 x i16> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+; CHECK-LABEL: sitofp_8i8_double
+; CHECK: vpmovzxwd
+; CHECK: vpslld
+; CHECK: vpsrad
+; CHECK: vcvtdq2pd
+define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
+  %1 = sitofp <8 x i8> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+
+; CHECK-LABEL: @sitofp_8i1_double
+; CHECK: vpbroadcastq
+; CHECK: vcvtdq2pd
+define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
+  %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
+  %1 = sitofp <8 x i1> %cmpres to <8 x double>
+  ret <8 x double> %1
+}

diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index ce3d759..366d324 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll

@@ -1,97 +1,113 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
 
 define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmadd_ps_z
   ; CHECK: vfmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmadd_pd_z
   ; CHECK: vfmadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <8 x double> @test_mask_fmadd_pd(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmadd_pd:
+; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4)
+  ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
 
 define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubps_z
   ; CHECK: vfmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubpd_z
   ; CHECK: vfmsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfnmadd_ps_z
   ; CHECK: vfnmadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfnmadd_pd_z
   ; CHECK: vfnmadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfnmsubps_z
   ; CHECK: vfnmsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfnmsubpd_z
   ; CHECK: vfnmsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmaddsubps_z
   ; CHECK: vfmaddsub213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub_ps:
+; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2]
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmaddsubpd_z
   ; CHECK: vfmaddsub213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubaddps_z
   ; CHECK: vfmsubadd213ps %zmm
-  %res = call <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
   ; CHECK-LABEL: test_x86_vfmsubaddpd_z
   ; CHECK: vfmsubadd213pd %zmm
-  %res = call <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone

diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index b360c71..eba895e 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL --check-prefix=CHECK %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=CHECK %s
 
 ;CHECK-LABEL: test1:
 ;CHECK: vinsertps
@@ -12,9 +13,11 @@
 }
 
 ;CHECK-LABEL: test2:
-;CHECK: vinsertf32x4
-;CHECK: vextractf32x4
-;CHECK: vinsertf32x4
+;KNL: vinsertf32x4 $0
+;SKX: vinsertf64x2 $0
+;CHECK: vextractf32x4 $3
+;KNL: vinsertf32x4 $3
+;SKX: vinsertf64x2 $3
 ;CHECK: ret
 define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
   %rrr = load double* %br
@@ -24,8 +27,8 @@
 }
 
 ;CHECK-LABEL: test3:
-;CHECK: vextractf32x4
-;CHECK: vinsertf32x4
+;CHECK: vextractf32x4 $1
+;CHECK: vinsertf32x4 $0
 ;CHECK: ret
 define <16 x float> @test3(<16 x float> %x) nounwind {
   %eee = extractelement <16 x float> %x, i32 4
@@ -34,8 +37,9 @@
 }
 
 ;CHECK-LABEL: test4:
-;CHECK: vextracti32x4
-;CHECK: vinserti32x4
+;CHECK: vextracti32x4 $2
+;KNL: vinserti32x4 $0
+;SKX: vinserti64x2 $0
 ;CHECK: ret
 define <8 x i64> @test4(<8 x i64> %x) nounwind {
   %eee = extractelement <8 x i64> %x, i32 4
@@ -186,12 +190,13 @@
 ;CHECK-LABEL: test17
 ;CHECK: kshiftlw
 ;CHECK: kshiftrw
-;CHECK: korw
+;KNL: korw
+;SKX: korb
 ;CHECK: ret
 define i8 @test17(i1 *%addr, i8 %a) {
   %x = load i1 * %addr, align 128
   %a1 = bitcast i8 %a to <8 x i1>
-  %x1 = insertelement <8 x i1> %a1, i1 %x, i32 10
+  %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
   %x2 = bitcast <8 x i1>%x1 to i8
   ret i8 %x2
 }

diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 18cfcfe..691d1fb 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll

@@ -60,20 +60,6 @@
 }
 declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
 
-define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
-  ; CHECK: vrcp28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
-  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
-  ; CHECK: vrcp28pd {sae}, {{.*}}encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
-  %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ; <<8 x double>> [#uses=1]
-  ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
 
 define <8 x double> @test7(<8 x double> %a) {
@@ -97,13 +83,6 @@
 }
 declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
 
-define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) {
-  ; CHECK: vrsqrt28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
-  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
 define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
   ; CHECK: vrsqrt14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4f,0xc0]
   %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
@@ -111,13 +90,6 @@
 }
 declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
 
-define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
-  ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
-  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
 define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
   ; CHECK: vrcp14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4d,0xc0]
   %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
@@ -125,26 +97,19 @@
 }
 declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
 
-define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
-  ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
-  %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
 define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
   ; CHECK: vsqrtpd
-  %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+  %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 4) ; <<8 x double>> [#uses=1]
   ret <8 x double> %res
 }
-declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
   ; CHECK: vsqrtps
-  %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+  %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) ; <<16 x float>> [#uses=1]
   ret <16 x float> %res
 }
-declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: vsqrtss {{.*}}encoding: [0x62
@@ -611,3 +576,515 @@
 }
 
 declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
+
+define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_valign_q:
+; CHECK: valignq $2, %zmm1, %zmm0, %zmm0
+  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
+; CHECK-LABEL: test_mask_valign_q:
+; CHECK: valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> %src, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i8, <8 x i64>, i8)
+
+define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_maskz_valign_d:
+; CHECK: valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x03,0xc1,0x05]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i8 5, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i8, <16 x i32>, i16)
+
+define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
+ ; CHECK-LABEL: test_mask_store_ss
+ ; CHECK: vmovss %xmm0, (%rdi) {%k1}     ## encoding: [0x62,0xf1,0x7e,0x09,0x11,0x07]
+ call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
+
+define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpeq_d
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+  ret i16 %res
+}
+
+define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_d
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+  ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpeq_q
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_q
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
+
+define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: test_pcmpgt_d
+; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
+  ret i16 %res
+}
+
+define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_d
+; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+  ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
+
+define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_pcmpgt_q
+; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
+  ret i8 %res
+}
+
+define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_q
+; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+  ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
+
+define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: test_cmp_d_512
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltd %zmm1, %zmm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpled %zmm1, %zmm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnled %zmm1, %zmm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordd %zmm1, %zmm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_cmp_d_512
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltd %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpled %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnled %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordd %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: test_ucmp_d_512
+; CHECK: vpcmpequd %zmm1, %zmm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltud %zmm1, %zmm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleud %zmm1, %zmm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordud %zmm1, %zmm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_ucmp_d_512
+; CHECK: vpcmpequd %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordud %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
+
+define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { ; one mask.cmp.q.512 call per immediate 0-7, each with an all-ones (-1) mask
+; CHECK-LABEL: test_cmp_q_512
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %zmm1, %zmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %zmm1, %zmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %zmm1, %zmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7 ; all eight mask results are returned packed in one vector
+}
+
+define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; one mask.cmp.q.512 call per immediate 0-7, each passing the caller's %mask
+; CHECK-LABEL: test_mask_cmp_q_512
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7 ; all eight mask results are returned packed in one vector
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { ; one mask.ucmp.q.512 call per immediate 0-7, each with an all-ones (-1) mask
+; CHECK-LABEL: test_ucmp_q_512
+; CHECK: vpcmpequq %zmm1, %zmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %zmm1, %zmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7 ; all eight mask results are returned packed in one vector
+}
+
+define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; one mask.ucmp.q.512 call per immediate 0-7, each passing the caller's %mask
+; CHECK-LABEL: test_mask_ucmp_q_512
+; CHECK: vpcmpequq %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7 ; all eight mask results are returned packed in one vector
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+
+define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
+; CHECK-LABEL: test_mask_vextractf32x4:
+; CHECK: vextractf32x4 $2, %zmm1, %xmm0 {%k1}
+  %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i8 2, <4 x float> %b, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i8, <4 x float>, i8)
+
+define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: test_mask_vextracti64x4:
+; CHECK: vextracti64x4 $2, %zmm1, %ymm0 {%k1}
+  %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i8 2, <4 x i64> %b, i8 %mask)
+  ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i8, <4 x i64>, i8)
+
+define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: test_maskz_vextracti32x4:
+; CHECK: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
+  %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i8 2, <4 x i32> zeroinitializer, i8 %mask)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i8, <4 x i32>, i8)
+
+define <4 x double> @test_vextractf64x4(<8 x double> %a) {
+; CHECK-LABEL: test_vextractf64x4:
+; CHECK: vextractf64x4 $2, %zmm0, %ymm0 ##
+  %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i8 2, <4 x double> zeroinitializer, i8 -1)
+  ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i8, <4 x double>, i8)
+
+define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_pslli_d
+  ; CHECK: vpslld
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; shift %a0 by 7 with %a1 as the third operand; CHECK expects zmm1 as the masked destination
+  ; CHECK-LABEL: test_x86_avx512_mask_pslli_d
+  ; CHECK: vpslld $7, %zmm0, %zmm1 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_pslli_d
+  ; CHECK: vpslld $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_pslli_q
+  ; CHECK: vpsllq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; shift %a0 by 7 with %a1 as the third operand; CHECK expects zmm1 as the masked destination
+  ; CHECK-LABEL: test_x86_avx512_mask_pslli_q
+  ; CHECK: vpsllq $7, %zmm0, %zmm1 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) { ; zeroinitializer as the third operand; CHECK expects the {z} zero-masking form
+  ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q
+  ; CHECK: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_psrli_d
+  ; CHECK: vpsrld
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; shift %a0 by 7 with %a1 as the third operand; CHECK expects zmm1 as the masked destination
+  ; CHECK-LABEL: test_x86_avx512_mask_psrli_d
+  ; CHECK: vpsrld $7, %zmm0, %zmm1 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrli_d
+  ; CHECK: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_psrli_q
+  ; CHECK: vpsrlq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; shift %a0 by 7 with %a1 as the third operand; CHECK expects zmm1 as the masked destination
+  ; CHECK-LABEL: test_x86_avx512_mask_psrli_q
+  ; CHECK: vpsrlq $7, %zmm0, %zmm1 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrli_q
+  ; CHECK: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_psrai_d
+  ; CHECK: vpsrad
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { ; shift %a0 by 7 with %a1 as the third operand; CHECK expects zmm1 as the masked destination
+  ; CHECK-LABEL: test_x86_avx512_mask_psrai_d
+  ; CHECK: vpsrad $7, %zmm0, %zmm1 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrai_d
+  ; CHECK: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
+  ; CHECK-LABEL: test_x86_avx512_psrai_q
+  ; CHECK: vpsraq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; shift %a0 by 7 with %a1 as the third operand; CHECK expects zmm1 as the masked destination
+  ; CHECK-LABEL: test_x86_avx512_mask_psrai_q
+  ; CHECK: vpsraq $7, %zmm0, %zmm1 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrai_q
+  ; CHECK: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index dd33ffd..35d3348 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll

@@ -1,12 +1,14 @@
-; RUN: llc < %s -march=x86-64 -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
 define i16 @mask16(i16 %x) {
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <16 x i1> %m1 to i16
   ret i16 %ret
-; CHECK: mask16
-; CHECK: knotw
+; CHECK-LABEL: mask16
+; CHECK: kmovw
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw
 ; CHECK: ret
 }
 
@@ -15,8 +17,38 @@
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <8 x i1> %m1 to i8
   ret i8 %ret
-; CHECK: mask8
-; CHECK: knotw
+; CHECK-LABEL: mask8
+; CHECK: kmovw
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw
+; CHECK: ret
+}
+
+define void @mask16_mem(i16* %ptr) {
+  %x = load i16* %ptr, align 4
+  %m0 = bitcast i16 %x to <16 x i1>
+  %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <16 x i1> %m1 to i16
+  store i16 %ret, i16* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask16_mem
+; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
+; CHECK: ret
+}
+
+define void @mask8_mem(i8* %ptr) { ; load an i8 mask, invert it through an <8 x i1> xor, store it back to %ptr
+  %x = load i8* %ptr, align 4
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <8 x i1> %m1 to i8
+  store i8 %ret, i8* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask8_mem
+; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
 ; CHECK: ret
 }
 

diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 009802f..93875e8 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll

@@ -153,31 +153,295 @@
   ret void
 }
 
-; CHECK-LABEL: store_i1_1
-; CHECK: movb
-; CHECK: movb
+; CHECK-LABEL: test19
+; CHECK: vmovdqu32
 ; CHECK: ret
-define void @store_i1_1() {
-  store i1 true, i1 addrspace(3)* undef, align 128
-  store i1 false, i1 addrspace(2)* undef, align 128
+define void @test19(i8 * %addr, <16 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  store <16 x i32>%data, <16 x i32>* %vaddr, align 1
   ret void
 }
 
-; CHECK-LABEL: store_i1_2
-; CHECK: movb
+; CHECK-LABEL: test20
+; CHECK: vmovdqa32
 ; CHECK: ret
-define void @store_i1_2(i64 %a, i64 %b) {
-  %res = icmp eq i64 %a, %b
-  store i1 %res, i1 addrspace(3)* undef, align 128
+define void @test20(i8 * %addr, <16 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  store <16 x i32>%data, <16 x i32>* %vaddr, align 64
   ret void
 }
 
-; CHECK-LABEL: store_i1_3
-; CHECK: kmovw
+; CHECK-LABEL: test21
+; CHECK: vmovdqa64
 ; CHECK: ret
-define void @store_i1_3(i16 %a) {
-  %a_vec = bitcast i16 %a to <16 x i1>
-  %res = extractelement <16 x i1> %a_vec, i32 4
-  store i1 %res, i1 addrspace(3)* undef, align 128
+define  <8 x i64> @test21(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %res = load <8 x i64>* %vaddr, align 64
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test22
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test22(i8 * %addr, <8 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  store <8 x i64>%data, <8 x i64>* %vaddr, align 1
   ret void
 }
+
+; CHECK-LABEL: test23
+; CHECK: vmovdqu64
+; CHECK: ret
+define <8 x i64> @test23(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %res = load <8 x i64>* %vaddr, align 1
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test24
+; CHECK: vmovapd
+; CHECK: ret
+define void @test24(i8 * %addr, <8 x double> %data) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  store <8 x double>%data, <8 x double>* %vaddr, align 64
+  ret void
+}
+
+; CHECK-LABEL: test25
+; CHECK: vmovapd
+; CHECK: ret
+define <8 x double> @test25(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %res = load <8 x double>* %vaddr, align 64
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test26
+; CHECK: vmovaps
+; CHECK: ret
+define void @test26(i8 * %addr, <16 x float> %data) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  store <16 x float>%data, <16 x float>* %vaddr, align 64
+  ret void
+}
+
+; CHECK-LABEL: test27
+; CHECK: vmovaps
+; CHECK: ret
+define <16 x float> @test27(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %res = load <16 x float>* %vaddr, align 64
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test28
+; CHECK: vmovupd
+; CHECK: ret
+define void @test28(i8 * %addr, <8 x double> %data) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  store <8 x double>%data, <8 x double>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test29
+; CHECK: vmovupd
+; CHECK: ret
+define <8 x double> @test29(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %res = load <8 x double>* %vaddr, align 1
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test30
+; CHECK: vmovups
+; CHECK: ret
+define void @test30(i8 * %addr, <16 x float> %data) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  store <16 x float>%data, <16 x float>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test31
+; CHECK: vmovups
+; CHECK: ret
+define <16 x float> @test31(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %res = load <16 x float>* %vaddr, align 1
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test32
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test33
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test34
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test35
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test36
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test37
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test38
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test39
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test40
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test41
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test42
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test43
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test44
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test45
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test46
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test47
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+  ret <8 x double>%res
+}

diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 83f4698..0dbf286 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll

@@ -39,3 +39,56 @@
   %cond = select i1 %cmp, double %c, double %b
   ret double %cond
 }
+
+; CHECK-LABEL: @select04
+; CHECK: vmovaps %zmm3, %zmm1
+; CHECK-NEXT: ret
+; PR20677
+define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
+  %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
+  ret <16 x double> %sel
+}
+
+; CHECK-LABEL: select05
+; CHECK: kmovw   %esi, %k0
+; CHECK-NEXT: kmovw   %edi, %k1
+; CHECK-NEXT: korw    %k1, %k0, %k0
+; CHECK-NEXT: kmovw   %k0, %eax
+define i8 @select05(i8 %a.0, i8 %m) { ; select(mask, all-ones, a) on <8 x i1>; the CHECK lines expect a single korw
+  %mask = bitcast i8 %m to <8 x i1>
+  %a = bitcast i8 %a.0 to <8 x i1>
+  %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res
+}
+
+; CHECK-LABEL: select06
+; CHECK: kmovw   %esi, %k0
+; CHECK-NEXT: kmovw   %edi, %k1
+; CHECK-NEXT: kandw    %k1, %k0, %k0
+; CHECK-NEXT: kmovw   %k0, %eax
+define i8 @select06(i8 %a.0, i8 %m) { ; select(mask, a, zero) on <8 x i1>; the CHECK lines expect a single kandw
+  %mask = bitcast i8 %m to <8 x i1>
+  %a = bitcast i8 %a.0 to <8 x i1>
+  %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res
+}
+
+; CHECK-LABEL: select07
+; CHECK-DAG:  kmovw   %edx, %k0
+; CHECK-DAG:  kmovw   %edi, %k1
+; CHECK-DAG:  kmovw   %esi, %k2
+; CHECK: kandw %k0, %k1, %k1
+; CHECK-NEXT: knotw    %k0, %k0
+; CHECK-NEXT: kandw    %k0, %k2, %k0
+; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: kmovw   %k0, %eax
+define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) { ; general select(mask, a, b) on <8 x i1>; the CHECK lines expect kandw/knotw/kandw/korw
+  %mask = bitcast i8 %m to <8 x i1>
+  %a = bitcast i8 %a.0 to <8 x i1>
+  %b = bitcast i8 %b.0 to <8 x i1>
+  %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> %b
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res
+}

diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
deleted file mode 100644
index b99e89a..0000000
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ /dev/null

@@ -1,314 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
-; CHECK: LCP
-; CHECK: .long 2
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 0
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK: .long 0
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 4
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK-LABEL: test1:
-; CHECK: vpermps
-; CHECK: ret
-define <16 x float> @test1(<16 x float> %a) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
-  ret <16 x float> %c
-}
-
-; CHECK-LABEL: test2:
-; CHECK: vpermd
-; CHECK: ret
-define <16 x i32> @test2(<16 x i32> %a) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test3:
-; CHECK: vpermq
-; CHECK: ret
-define <8 x i64> @test3(<8 x i64> %a) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
-  ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test4:
-; CHECK: vpermpd
-; CHECK: ret
-define <8 x double> @test4(<8 x double> %a) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test5:
-; CHECK: vpermt2pd
-; CHECK: ret
-define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  ret <8 x double> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test5m:
-; CHECK: vpermt2pd {{.* {%k[1-7]} {z}}}
-define <8 x double> @test5m(<8 x double> %a, <8 x double> %b, i8 %mask) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x double> %c, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-; CHECK-LABEL: test6:
-; CHECK: vpermq $30
-; CHECK: ret
-define <8 x i64> @test6(<8 x i64> %a) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
-  ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test7:
-; CHECK: vpermt2q
-; CHECK: ret
-define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  ret <8 x i64> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test7m:
-; CHECK: vpermt2q {{.* {%k[1-7]} {z}}}
-define <8 x i64> @test7m(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test7mm:
-; CHECK: vpermt2q {{\(.*\).* {%k[1-7]} {z}}}
-define <8 x i64> @test7mm(<8 x i64> %a, <8 x i64> *%pb, i8 %mask) nounwind {
-  %b = load <8 x i64>* %pb
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-; CHECK-LABEL: test8:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x i32> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test8m:
-; CHECK: vpermt2d {{.* {%k[1-7]} {z}}}
-define <16 x i32> @test8m(<16 x i32> %a, <16 x i32> %b, i16 %mask) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test8mm:
-; CHECK: vpermt2d {{\(.*\).* {%k[1-7]} {z}}}
-define <16 x i32> @test8mm(<16 x i32> %a, <16 x i32> *%pb, i16 %mask) nounwind {
-  %b = load <16 x i32> * %pb
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-; CHECK-LABEL: test9:
-; CHECK: vpermt2ps
-; CHECK: ret
-define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x float> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test9m:
-; CHECK: vpermt2ps {{.*}} {%k{{.}}} {z}
-define <16 x float> @test9m(<16 x float> %a, <16 x float> %b, i16 %mask) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x float> %c, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-; CHECK-LABEL: test10:
-; CHECK: vpermt2ps (
-; CHECK: ret
-define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
-  %c = load <16 x float>* %b
-  %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x float> %d
-}
-
-; CHECK-LABEL: test11:
-; CHECK: vpermt2d 
-; CHECK: ret
-define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
-  %c = load <16 x i32>* %b
-  %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x i32> %d
-}
-
-; CHECK-LABEL: test12
-; CHECK: vmovlhps {{.*}}## encoding: [0x62
-; CHECK: ret
-define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
-  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  ret <4 x i32> %c
-}
-
-; CHECK-LABEL: test13
-; CHECK: vpermilps $-79, %zmm
-; CHECK: ret
-define <16 x float> @test13(<16 x float> %a) {
- %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test14
-; CHECK: vpermilpd $-53, %zmm
-; CHECK: ret
-define <8 x double> @test14(<8 x double> %a) {
- %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
- ret <8 x double> %b
-}
-
-; CHECK-LABEL: test15
-; CHECK: vpshufd $-79, %zmm
-; CHECK: ret
-define <16 x i32> @test15(<16 x i32> %a) {
- %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x i32> %b
-}
-; CHECK-LABEL: test16
-; CHECK: valignq $2, %zmm0, %zmm1
-; CHECK: ret
-define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test17
-; CHECK: vshufpd $19, %zmm1, %zmm0
-; CHECK: ret
-define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test18
-; CHECK: vpunpckhdq %zmm
-; CHECK: ret
-define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15, i32 18, i32 26, i32 19, i32 27, i32 22, i32 30, i32 23, i32 31>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test19
-; CHECK: vpunpckldq %zmm
-; CHECK: ret
-define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test20
-; CHECK: vpunpckhqdq  %zmm
-; CHECK: ret
-define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
- %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15>
- ret <8 x i64> %b
-}
-
-; CHECK-LABEL: test21
-; CHECK: vunpcklps %zmm
-; CHECK: ret
-define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test22
-; CHECK: vmovhlps {{.*}}## encoding: [0x62
-; CHECK: ret
-define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
-  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  ret <4 x i32> %c
-}
-
-; CHECK-LABEL: @test23
-; CHECK: vshufps $-112, %zmm
-; CHECK: ret
-define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: @test24
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test25
-; CHECK: vshufps  $52
-; CHECK: ret
-define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 undef, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test26
-; CHECK: vmovshdup
-; CHECK: ret
-define <16 x i32> @test26(<16 x i32> %a) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test27
-; CHECK: ret
-define <16 x i32> @test27(<4 x i32>%a) {
- %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %res
-}
-
-; CHECK-LABEL: @test28
-; CHECK: vinserti64x4 $1
-; CHECK: ret
-define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) {
- %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
-                                                              i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
- ret <16 x i32> %res
-}
-
-; CHECK-LABEL: @test29
-; CHECK: vinserti64x4 $0
-; CHECK: ret
-define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) {
- %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
-                                                              i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i32> %res
-}
-

diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll
index 5e097be..91ef5d5 100644
--- a/test/CodeGen/X86/avx512-trunc-ext.ll
+++ b/test/CodeGen/X86/avx512-trunc-ext.ll

@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s
 
 ; CHECK-LABEL: trunc_16x32_to_16x8
 ; CHECK: vpmovdb
@@ -118,6 +119,7 @@
 
 ; CHECK-LABEL: sext_8i1_8i32
 ; CHECK: vpbroadcastq  LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX: vpmovm2d
 ; CHECK: ret
 define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
   %x = icmp slt <8 x i32> %a1, %a2
@@ -135,9 +137,8 @@
 }
 
 ; CHECK-LABEL: trunc_i32_to_i1
-; CHECK: andl
-; CHECK: kmov
-; CHECK: kortest
+; CHECK: testb
+; CHECK: setne
 ; CKECK: orl
 ; CHECK: ret
 define i16 @trunc_i32_to_i1(i32 %a) {
@@ -146,3 +147,30 @@
   %res = bitcast <16 x i1> %maskv to i16
   ret i16 %res
 }
+
+; CHECK-LABEL: sext_8i1_8i16
+; SKX: vpmovm2w
+; CHECK: ret
+define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; sign-extend an 8-lane compare result to i16 lanes; the -mcpu=skx run expects vpmovm2w
+  %x = icmp slt <8 x i32> %a1, %a2 ; one i1 per lane: signed %a1 < %a2
+  %y = sext <8 x i1> %x to <8 x i16> ; each true lane becomes all-ones, each false lane zero
+  ret <8 x i16> %y
+}
+
+; CHECK-LABEL: sext_16i1_16i32
+; SKX: vpmovm2d
+; CHECK: ret
+define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { ; sign-extend a 16-lane compare result to i32 lanes; the -mcpu=skx run expects vpmovm2d
+  %x = icmp slt <16 x i32> %a1, %a2 ; one i1 per lane: signed %a1 < %a2
+  %y = sext <16 x i1> %x to <16 x i32> ; each true lane becomes all-ones, each false lane zero
+  ret <16 x i32> %y
+}
+
+; CHECK-LABEL: sext_8i1_8i64
+; SKX: vpmovm2q
+; CHECK: ret
+define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind { ; sign-extend an 8-lane compare result to i64 lanes; the -mcpu=skx run expects vpmovm2q
+  %x = icmp slt <8 x i32> %a1, %a2 ; one i1 per lane: signed %a1 < %a2
+  %y = sext <8 x i1> %x to <8 x i64> ; each true lane becomes all-ones, each false lane zero
+  ret <8 x i64> %y
+}

diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 9c6db11..0b0e0fc 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll

@@ -1,59 +1,72 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-;CHECK-LABEL: _inreg16xi32:
-;CHECK: vpbroadcastd {{.*}}, %zmm
-;CHECK: ret
 define   <16 x i32> @_inreg16xi32(i32 %a) {
+; CHECK-LABEL: _inreg16xi32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
+; CHECK-NEXT:    retq
   %b = insertelement <16 x i32> undef, i32 %a, i32 0
   %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
   ret <16 x i32> %c
 }
 
-;CHECK-LABEL: _inreg8xi64:
-;CHECK: vpbroadcastq {{.*}}, %zmm
-;CHECK: ret
 define   <8 x i64> @_inreg8xi64(i64 %a) {
+; CHECK-LABEL: _inreg8xi64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT:    retq
   %b = insertelement <8 x i64> undef, i64 %a, i32 0
   %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
   ret <8 x i64> %c
 }
 
-;CHECK-LABEL: _inreg16xfloat:
-;CHECK: vbroadcastss {{.*}}, %zmm
-;CHECK: ret
 define   <16 x float> @_inreg16xfloat(float %a) {
+; CHECK-LABEL: _inreg16xfloat:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = insertelement <16 x float> undef, float %a, i32 0
   %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
   ret <16 x float> %c
 }
 
-;CHECK-LABEL: _inreg8xdouble:
-;CHECK: vbroadcastsd {{.*}}, %zmm
-;CHECK: ret
 define   <8 x double> @_inreg8xdouble(double %a) {
+; CHECK-LABEL: _inreg8xdouble:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = insertelement <8 x double> undef, double %a, i32 0
   %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
   ret <8 x double> %c
 }
 
-;CHECK-LABEL: _xmm16xi32
-;CHECK: vpbroadcastd
-;CHECK: ret
 define   <16 x i32> @_xmm16xi32(<16 x i32> %a) {
+; CHECK-LABEL: _xmm16xi32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
   ret <16 x i32> %b
 }
 
-;CHECK-LABEL: _xmm16xfloat
-;CHECK: vbroadcastss {{.*}}## encoding: [0x62
-;CHECK: ret
 define   <16 x float> @_xmm16xfloat(<16 x float> %a) {
+; CHECK-LABEL: _xmm16xfloat:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq
   %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
   ret <16 x float> %b
 }
 
 define <16 x i32> @test_vbroadcast() {
-  ; CHECK: vpbroadcastd
+; CHECK-LABEL: test_vbroadcast:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vcmpunordps %zmm0, %zmm0, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT:    knotw %k1, %k1
+; CHECK-NEXT:    vmovdqu32 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
 entry:
   %0 = sext <16 x i1> zeroinitializer to <16 x i32>
   %1 = fcmp uno <16 x float> undef, zeroinitializer
@@ -62,3 +75,108 @@
   ret <16 x i32> %3
 }
 
+; We implement the set1 intrinsics with vector initializers.  Verify that the
+; IR generated will produce broadcasts at the end.
+define <8 x double> @test_set1_pd(double %d) #2 { ; writes %d into all 8 lanes, one insertelement per lane
+; CHECK-LABEL: test_set1_pd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %vecinit.i = insertelement <8 x double> undef, double %d, i32 0 ; start from undef so no lane has a prior value
+  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1
+  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %d, i32 2
+  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %d, i32 3
+  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %d, i32 4
+  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %d, i32 5
+  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %d, i32 6
+  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %d, i32 7
+  ret <8 x double> %vecinit7.i ; lanes 0-7 all hold %d; the expected code is one vbroadcastsd
+}
+
+define <8 x i64> @test_set1_epi64(i64 %d) #2 { ; writes %d into all 8 lanes, one insertelement per lane
+; CHECK-LABEL: test_set1_epi64:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0 ; start from undef so no lane has a prior value
+  %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1
+  %vecinit2.i = insertelement <8 x i64> %vecinit1.i, i64 %d, i32 2
+  %vecinit3.i = insertelement <8 x i64> %vecinit2.i, i64 %d, i32 3
+  %vecinit4.i = insertelement <8 x i64> %vecinit3.i, i64 %d, i32 4
+  %vecinit5.i = insertelement <8 x i64> %vecinit4.i, i64 %d, i32 5
+  %vecinit6.i = insertelement <8 x i64> %vecinit5.i, i64 %d, i32 6
+  %vecinit7.i = insertelement <8 x i64> %vecinit6.i, i64 %d, i32 7
+  ret <8 x i64> %vecinit7.i ; lanes 0-7 all hold %d; the expected code is one vpbroadcastq from the scalar register
+}
+
+define <16 x float> @test_set1_ps(float %f) #2 { ; writes %f into all 16 lanes, one insertelement per lane
+; CHECK-LABEL: test_set1_ps:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %vecinit.i = insertelement <16 x float> undef, float %f, i32 0 ; start from undef so no lane has a prior value
+  %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1
+  %vecinit2.i = insertelement <16 x float> %vecinit1.i, float %f, i32 2
+  %vecinit3.i = insertelement <16 x float> %vecinit2.i, float %f, i32 3
+  %vecinit4.i = insertelement <16 x float> %vecinit3.i, float %f, i32 4
+  %vecinit5.i = insertelement <16 x float> %vecinit4.i, float %f, i32 5
+  %vecinit6.i = insertelement <16 x float> %vecinit5.i, float %f, i32 6
+  %vecinit7.i = insertelement <16 x float> %vecinit6.i, float %f, i32 7
+  %vecinit8.i = insertelement <16 x float> %vecinit7.i, float %f, i32 8
+  %vecinit9.i = insertelement <16 x float> %vecinit8.i, float %f, i32 9
+  %vecinit10.i = insertelement <16 x float> %vecinit9.i, float %f, i32 10
+  %vecinit11.i = insertelement <16 x float> %vecinit10.i, float %f, i32 11
+  %vecinit12.i = insertelement <16 x float> %vecinit11.i, float %f, i32 12
+  %vecinit13.i = insertelement <16 x float> %vecinit12.i, float %f, i32 13
+  %vecinit14.i = insertelement <16 x float> %vecinit13.i, float %f, i32 14
+  %vecinit15.i = insertelement <16 x float> %vecinit14.i, float %f, i32 15
+  ret <16 x float> %vecinit15.i ; lanes 0-15 all hold %f; the expected code is one vbroadcastss
+}
+
+define <16 x i32> @test_set1_epi32(i32 %f) #2 { ; writes %f into all 16 lanes, one insertelement per lane
+; CHECK-LABEL: test_set1_epi32:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0 ; start from undef so no lane has a prior value
+  %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1
+  %vecinit2.i = insertelement <16 x i32> %vecinit1.i, i32 %f, i32 2
+  %vecinit3.i = insertelement <16 x i32> %vecinit2.i, i32 %f, i32 3
+  %vecinit4.i = insertelement <16 x i32> %vecinit3.i, i32 %f, i32 4
+  %vecinit5.i = insertelement <16 x i32> %vecinit4.i, i32 %f, i32 5
+  %vecinit6.i = insertelement <16 x i32> %vecinit5.i, i32 %f, i32 6
+  %vecinit7.i = insertelement <16 x i32> %vecinit6.i, i32 %f, i32 7
+  %vecinit8.i = insertelement <16 x i32> %vecinit7.i, i32 %f, i32 8
+  %vecinit9.i = insertelement <16 x i32> %vecinit8.i, i32 %f, i32 9
+  %vecinit10.i = insertelement <16 x i32> %vecinit9.i, i32 %f, i32 10
+  %vecinit11.i = insertelement <16 x i32> %vecinit10.i, i32 %f, i32 11
+  %vecinit12.i = insertelement <16 x i32> %vecinit11.i, i32 %f, i32 12
+  %vecinit13.i = insertelement <16 x i32> %vecinit12.i, i32 %f, i32 13
+  %vecinit14.i = insertelement <16 x i32> %vecinit13.i, i32 %f, i32 14
+  %vecinit15.i = insertelement <16 x i32> %vecinit14.i, i32 %f, i32 15
+  ret <16 x i32> %vecinit15.i ; lanes 0-15 all hold %f; the expected code is one vpbroadcastd from the scalar register
+}
+
+; We implement the scalar broadcast intrinsics with vector initializers.
+; Verify that the IR generated will produce the broadcast at the end.
+define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) { ; replicates lane 0 of %a into all 8 result lanes
+; CHECK-LABEL: test_mm512_broadcastsd_pd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = extractelement <2 x double> %a, i32 0 ; the scalar to replicate: lane 0 of the 2-lane input
+  %vecinit.i = insertelement <8 x double> undef, double %0, i32 0 ; start from undef so no lane has a prior value
+  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %0, i32 1
+  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %0, i32 2
+  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %0, i32 3
+  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %0, i32 4
+  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %0, i32 5
+  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %0, i32 6
+  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %0, i32 7
+  ret <8 x double> %vecinit7.i ; lanes 0-7 all hold that scalar; the expected code is one vbroadcastsd
+}

diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index d762f00..c71e60e 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll

@@ -1,145 +1,176 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-; CHECK-LABEL: test1
-; CHECK: vcmpleps
-; CHECK: vmovups
-; CHECK: ret
 define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcmpleps %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = fcmp ole <16 x float> %x, %y
   %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
   ret <16 x float> %max
 }
 
-; CHECK-LABEL: test2
-; CHECK: vcmplepd
-; CHECK: vmovupd
-; CHECK: ret
 define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcmplepd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = fcmp ole <8 x double> %x, %y
   %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
   ret <8 x double> %max
 }
 
-; CHECK-LABEL: test3
-; CHECK: vpcmpeqd  (%rdi)
-; CHECK: vmovdqu32
-; CHECK: ret
 define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
+; CHECK-LABEL: test3:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqd (%rdi), %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %y = load <16 x i32>* %yp, align 4
   %mask = icmp eq <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
   ret <16 x i32> %max
 }
 
-; CHECK-LABEL: @test4_unsigned
-; CHECK: vpcmpnltud
-; CHECK: vmovdqu32
-; CHECK: ret
 define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
+; CHECK-LABEL: test4_unsigned:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = icmp uge <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
   ret <16 x i32> %max
 }
 
-; CHECK-LABEL: test5
-; CHECK: vpcmpeqq {{.*}}%k1
-; CHECK: vmovdqu64 {{.*}}%k1
-; CHECK: ret
 define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
+; CHECK-LABEL: test5:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = icmp eq <8 x i64> %x, %y
   %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
   ret <8 x i64> %max
 }
 
-; CHECK-LABEL: test6_unsigned
-; CHECK: vpcmpnleuq {{.*}}%k1
-; CHECK: vmovdqu64 {{.*}}%k1
-; CHECK: ret
 define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
+; CHECK-LABEL: test6_unsigned:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %mask = icmp ugt <8 x i64> %x, %y
   %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
   ret <8 x i64> %max
 }
 
-; CHECK-LABEL: test7
-; CHECK: xor
-; CHECK: vcmpltps
-; CHECK: vblendvps
-; CHECK: ret
 define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test7:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpltps %xmm2, %xmm0, %xmm2
+; CHECK-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %mask = fcmp olt <4 x float> %a, zeroinitializer
   %c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
   ret <4 x float>%c
 }
 
-; CHECK-LABEL: test8
-; CHECK: xor
-; CHECK: vcmpltpd
-; CHECK: vblendvpd
-; CHECK: ret
 define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmpltpd %xmm2, %xmm0, %xmm2
+; CHECK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %mask = fcmp olt <2 x double> %a, zeroinitializer
   %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
   ret <2 x double>%c
 }
 
-; CHECK-LABEL: test9
-; CHECK: vpcmpeqd
-; CHECK: vpblendmd
-; CHECK: ret
 define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
+; CHECK-LABEL: test9:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:      ## kill: YMM1<def> YMM1<kill> ZMM1<def>
+; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<def>
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
+; CHECK-NEXT:    retq
   %mask = icmp eq <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
 }
 
-; CHECK-LABEL: test10
-; CHECK: vcmpeqps
-; CHECK: vblendmps
-; CHECK: ret
 define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
+; CHECK-LABEL: test10:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:      ## kill: YMM1<def> YMM1<kill> ZMM1<def>
+; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<def>
+; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:      ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
+; CHECK-NEXT:    retq
   %mask = fcmp oeq <8 x float> %x, %y
   %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
   ret <8 x float> %max
 }
 
-; CHECK-LABEL: test11_unsigned
-; CHECK: vpmaxud
-; CHECK: ret
 define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
+; CHECK-LABEL: test11_unsigned:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
   %mask = icmp ugt <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
 }
 
-; CHECK-LABEL: test12
-; CHECK: vpcmpeqq        %zmm2, %zmm0, [[LO:%k[0-7]]]
-; CHECK: vpcmpeqq        %zmm3, %zmm1, [[HI:%k[0-7]]]
-; CHECK: kunpckbw        [[LO]], [[HI]], {{%k[0-7]}}
 
 define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
+; CHECK-LABEL: test12:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
+; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT:    kunpckbw %k0, %k1, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:      ## kill: AX<def> AX<kill> EAX<kill>
+; CHECK-NEXT:    retq
   %res = icmp eq <16 x i64> %a, %b
   %res1 = bitcast <16 x i1> %res to i16
   ret i16 %res1
 }
 
-; CHECK-LABEL: test13
-; CHECK: vcmpeqps        %zmm
-; CHECK: vpbroadcastd
-; CHECK: ret
 define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
+; CHECK-LABEL: test13:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
 {
   %cmpvector_i = fcmp oeq <16 x float> %a, %b
   %conv = zext <16 x i1> %cmpvector_i to <16 x i32>
   ret <16 x i32> %conv
 }
 
-; CHECK-LABEL: test14
-; CHECK: vpcmp
-; CHECK-NOT: vpcmp
-; CHECK: vmovdqu32 {{.*}}{%k1} {z}
-; CHECK: ret
 define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
+; CHECK-LABEL: test14:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm1
+; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    knotw %k0, %k1
+; CHECK-NEXT:    vmovdqu32 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %sub_r = sub <16 x i32> %a, %b
   %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
   %sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32>
@@ -148,12 +179,15 @@
   ret <16 x i32>%res
 }
 
-; CHECK-LABEL: test15
-; CHECK: vpcmpgtq
-; CHECK-NOT: vpcmp
-; CHECK: vmovdqu64 {{.*}}{%k1} {z}
-; CHECK: ret
 define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
+; CHECK-LABEL: test15:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm1
+; CHECK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    knotw %k0, %k1
+; CHECK-NEXT:    vmovdqu64 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %sub_r = sub <8 x i64> %a, %b
   %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
   %sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64>
@@ -162,3 +196,181 @@
   ret <8 x i64>%res
 }
 
+; Signed >= compare of two <16 x i32> registers feeding a select. The expected
+; code is a single vpcmpled with swapped operands writing %k1, followed by a
+; masked vmovdqa32.
+define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y) nounwind {
+; CHECK-LABEL: test16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled %zmm0, %zmm1, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask = icmp sge <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+  ret <16 x i32> %max
+}
+
+; Signed > compare against a <16 x i32> loaded from %y.ptr. The load is
+; expected to be folded into vpcmpgtd as the (%rdi) memory operand.
+define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test17:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpgtd (%rdi), %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %y = load <16 x i32>* %y.ptr, align 4
+  %mask = icmp sgt <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+; Signed <= compare against a <16 x i32> loaded from %y.ptr. The load is
+; expected to be folded into vpcmpled as the (%rdi) memory operand.
+define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test18:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled (%rdi), %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %y = load <16 x i32>* %y.ptr, align 4
+  %mask = icmp sle <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+; Unsigned <= compare against a <16 x i32> loaded from %y.ptr. The expected
+; instruction is the unsigned form vpcmpleud with the load folded as (%rdi).
+define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test19:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %y = load <16 x i32>* %y.ptr, align 4
+  %mask = icmp ule <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+; Two equality compares combined by a select whose false arm is
+; zeroinitializer. The second vpcmpeqd is expected to be issued under the
+; {%k1} write mask produced by the first, with no separate mask AND.
+define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind {
+; CHECK-LABEL: test20:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp eq <16 x i32> %x1, %y1
+  %mask0 = icmp eq <16 x i32> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+  ret <16 x i32> %max
+}
+
+; <8 x i64> variant of the combined-mask pattern: a signed <= compare and a
+; signed >= compare joined by a select with a zeroinitializer false arm. The
+; second vpcmpleq (operands swapped for >=) is expected to run under {%k1}.
+define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind {
+; CHECK-LABEL: test21:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpcmpleq %zmm2, %zmm3, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sge <8 x i64> %x1, %y1
+  %mask0 = icmp sle <8 x i64> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+  ret <8 x i64> %max
+}
+
+; Register signed > compare combined with a signed > compare against a vector
+; loaded from %y.ptr. The second vpcmpgtq is expected to take the load as a
+; (%rdi) memory operand and to run under the {%k1} mask of the first.
+define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
+; CHECK-LABEL: test22:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpgtq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sgt <8 x i64> %x1, %y1
+  %y = load <8 x i64>* %y.ptr, align 4
+  %mask0 = icmp sgt <8 x i64> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+  ret <8 x i64> %max
+}
+
+; Register signed >= compare combined with an unsigned <= compare against a
+; vector loaded from %y.ptr. Expects vpcmpled on registers, then vpcmpleud
+; with the load folded as (%rdi) under the {%k1} mask.
+define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
+; CHECK-LABEL: test23:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sge <16 x i32> %x1, %y1
+  %y = load <16 x i32>* %y.ptr, align 4
+  %mask0 = icmp ule <16 x i32> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+; Equality compare against a scalar i64 load splatted with
+; insertelement + shufflevector. The splat is expected to become an embedded
+; broadcast operand, (%rdi){1to8}, on vpcmpeqq.
+define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
+; CHECK-LABEL: test24:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
+  %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
+  %mask = icmp eq <8 x i64> %x, %y
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+  ret <8 x i64> %max
+}
+
+; Signed <= compare against a scalar i32 load splatted to <16 x i32>. The
+; splat is expected to become the embedded broadcast operand (%rdi){1to16}
+; on vpcmpled.
+define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {
+; CHECK-LABEL: test25:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled (%rdi){1to16}, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
+  %mask = icmp sle <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+; Register signed >= compare combined with a signed > compare against a
+; splatted scalar i32 load. Expects vpcmpled on registers, then vpcmpgtd with
+; the (%rdi){1to16} broadcast operand under the {%k1} mask.
+define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
+; CHECK-LABEL: test26:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpled %zmm1, %zmm2, %k1
+; CHECK-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sge <16 x i32> %x1, %y1
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
+  %mask0 = icmp sgt <16 x i32> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+  ret <16 x i32> %max
+}
+
+; <8 x i64> variant: register signed >= compare combined with a signed <=
+; compare against a splatted scalar i64 load. Expects the second vpcmpleq to
+; use the (%rdi){1to8} broadcast operand under the {%k1} mask.
+define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
+; CHECK-LABEL: test27:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpcmpleq        %zmm1, %zmm2, %k1
+; CHECK-NEXT:    vpcmpleq        (%rdi){1to8}, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %mask1 = icmp sge <8 x i64> %x1, %y1
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
+  %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
+  %mask0 = icmp sle <8 x i64> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1
+  ret <8 x i64> %max
+}

diff --git a/test/CodeGen/X86/avx512-zext-load-crash.ll b/test/CodeGen/X86/avx512-zext-load-crash.ll
deleted file mode 100644
index 07ded13..0000000
--- a/test/CodeGen/X86/avx512-zext-load-crash.ll
+++ /dev/null

@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-
-define <8 x i16> @test_zext_load() {
-  ; CHECK: vmovq
-entry:
-  %0 = load <2 x i16> ** undef, align 8
-  %1 = getelementptr inbounds <2 x i16>* %0, i64 1
-  %2 = load <2 x i16>* %0, align 1
-  %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %4 = load <2 x i16>* %1, align 1
-  %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <8 x i16> %6
-}

diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
new file mode 100644
index 0000000..bbc418c
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll

@@ -0,0 +1,305 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw --show-mc-encoding| FileCheck %s
+
+; Calls the mask.pcmpeq.b.512 intrinsic with an all-ones mask (i64 -1) and
+; expects an unmasked vpcmpeqb writing %k0.
+define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
+  %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+  ret i64 %res
+}
+
+; Calls the mask.pcmpeq.b.512 intrinsic with a caller-supplied i64 mask and
+; expects vpcmpeqb to carry the {%k1} write mask.
+define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+  ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
+
+; Calls the mask.pcmpeq.w.512 intrinsic with an all-ones mask (i32 -1) and
+; expects an unmasked vpcmpeqw writing %k0.
+define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+  ret i32 %res
+}
+
+; Calls the mask.pcmpeq.w.512 intrinsic with a caller-supplied i32 mask and
+; expects vpcmpeqw to carry the {%k1} write mask.
+define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+  ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
+
+; Calls the mask.pcmpgt.b.512 intrinsic with an all-ones mask (i64 -1) and
+; expects an unmasked vpcmpgtb writing %k0.
+define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b
+; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 ##
+  %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
+  ret i64 %res
+}
+
+; Calls the mask.pcmpgt.b.512 intrinsic with a caller-supplied i64 mask and
+; expects vpcmpgtb to carry the {%k1} write mask.
+define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b
+; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+  ret i64 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
+
+; Calls the mask.pcmpgt.w.512 intrinsic with an all-ones mask (i32 -1) and
+; expects an unmasked vpcmpgtw writing %k0.
+define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w
+; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
+  ret i32 %res
+}
+
+; Calls the mask.pcmpgt.w.512 intrinsic with a caller-supplied i32 mask and
+; expects vpcmpgtw to carry the {%k1} write mask.
+define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w
+; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ##
+  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
+  ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
+
+; Calls llvm.x86.avx512.mask.cmp.b.512 once per predicate immediate (0-7) with
+; an all-ones mask and expects the matching vpcmp<pred>b mnemonic for each.
+; The eight i64 results are packed into the returned <8 x i64>.
+; The label directive uses a hyphen; an underscore spelling is not a FileCheck
+; directive and would be silently ignored.
+define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
+; CHECK-LABEL: test_cmp_b_512
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
+  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
+  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ##
+  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
+  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ##
+  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
+  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ##
+  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
+  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ##
+  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
+  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ##
+  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
+  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ##
+  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
+  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ##
+  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
+  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+  ret <8 x i64> %vec7
+}
+
+; Calls llvm.x86.avx512.mask.cmp.b.512 once per predicate immediate (0-7) with
+; a caller-supplied i64 mask and expects each vpcmp<pred>b to carry {%k1}.
+; The label directive uses a hyphen; an underscore spelling is not a FileCheck
+; directive and would be silently ignored.
+define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+; CHECK-LABEL: test_mask_cmp_b_512
+; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
+  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
+  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
+  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
+  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
+  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
+  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
+  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
+  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+  ret <8 x i64> %vec7
+}
+
+declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
+
+; Calls llvm.x86.avx512.mask.ucmp.b.512 once per predicate immediate (0-7)
+; with an all-ones mask and expects the matching unsigned vpcmp<pred>ub
+; mnemonic for each.
+; The label directive uses a hyphen; an underscore spelling is not a FileCheck
+; directive and would be silently ignored.
+define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
+; CHECK-LABEL: test_ucmp_b_512
+; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ##
+  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
+  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ##
+  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
+  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ##
+  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
+  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ##
+  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
+  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ##
+  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
+  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ##
+  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
+  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ##
+  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
+  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ##
+  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
+  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+  ret <8 x i64> %vec7
+}
+
+; Calls llvm.x86.avx512.mask.ucmp.b.512 once per predicate immediate (0-7)
+; with a caller-supplied i64 mask and expects each unsigned vpcmp<pred>ub to
+; carry {%k1}.
+; The label directive uses a hyphen and names this function exactly; the
+; shorter "test_mask_ucmp_b_512" spelling does not occur in the emitted symbol.
+define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+; CHECK-LABEL: test_mask_x86_avx512_ucmp_b_512
+; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
+  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
+; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
+  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
+; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
+  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
+; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
+  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
+; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
+  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
+; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
+  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
+; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
+  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
+; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
+  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
+  ret <8 x i64> %vec7
+}
+
+declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
+
+; Calls llvm.x86.avx512.mask.cmp.w.512 once per predicate immediate (0-7) with
+; an all-ones mask and expects the matching vpcmp<pred>w mnemonic for each.
+; The eight i32 results are packed into the returned <8 x i32>.
+; The label directive uses a hyphen; an underscore spelling is not a FileCheck
+; directive and would be silently ignored.
+define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: test_cmp_w_512
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ##
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmplew %zmm1, %zmm0, %k0 ##
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ##
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ##
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ##
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ##
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ##
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+; Calls llvm.x86.avx512.mask.cmp.w.512 once per predicate immediate (0-7) with
+; a caller-supplied i32 mask and expects each vpcmp<pred>w to carry {%k1}.
+; The label directive uses a hyphen; an underscore spelling is not a FileCheck
+; directive and would be silently ignored.
+define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+; CHECK-LABEL: test_mask_cmp_w_512
+; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
+
+; Calls llvm.x86.avx512.mask.ucmp.w.512 once per predicate immediate (0-7)
+; with an all-ones mask and expects the matching unsigned vpcmp<pred>uw
+; mnemonic for each.
+; The label directive uses a hyphen; an underscore spelling is not a FileCheck
+; directive and would be silently ignored.
+define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: test_ucmp_w_512
+; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ##
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ##
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ##
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ##
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ##
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ##
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ##
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ##
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+; Calls llvm.x86.avx512.mask.ucmp.w.512 once per predicate immediate (0-7)
+; with a caller-supplied i32 mask and expects each unsigned vpcmp<pred>uw to
+; carry {%k1}.
+; The label directive uses a hyphen; an underscore spelling is not a FileCheck
+; directive and would be silently ignored.
+define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+; CHECK-LABEL: test_mask_ucmp_w_512
+; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ##
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ##
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ##
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ##
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ##
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ##
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ##
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ##
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7
+}
+
+declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone

diff --git a/test/CodeGen/X86/avx512bw-mask-op.ll b/test/CodeGen/X86/avx512bw-mask-op.ll
new file mode 100644
index 0000000..9d7630c
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-mask-op.ll

@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; Inverts an i32 reinterpreted as <32 x i1> by xor-ing with all ones. Expects
+; a kmovd / knotd / kmovd sequence followed directly by the return. The final
+; directive is spelled with a hyphen so that FileCheck actually enforces it.
+define i32 @mask32(i32 %x) {
+  %m0 = bitcast i32 %x to <32 x i1>
+  %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <32 x i1> %m1 to i32
+  ret i32 %ret
+; CHECK-LABEL: mask32
+; CHECK: kmovd
+; CHECK-NEXT: knotd
+; CHECK-NEXT: kmovd
+; CHECK-NEXT: ret
+}
+
+; Inverts an i64 reinterpreted as <64 x i1> by xor-ing with all ones. Expects
+; a kmovq / knotq / kmovq sequence followed directly by the return. The final
+; directive is spelled with a hyphen so that FileCheck actually enforces it.
+define i64 @mask64(i64 %x) {
+  %m0 = bitcast i64 %x to <64 x i1>
+  %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <64 x i1> %m1 to i64
+  ret i64 %ret
+; CHECK-LABEL: mask64
+; CHECK: kmovq
+; CHECK-NEXT: knotq
+; CHECK-NEXT: kmovq
+; CHECK-NEXT: ret
+}
+
+; Loads an i32 from %ptr, inverts it as a <32 x i1> mask and stores it back.
+; Expects kmovd to load from and store to the same pointer register (captured
+; as ARG1), with knotd in between and the return directly after. The final
+; directive is spelled with a hyphen so that FileCheck actually enforces it.
+define void @mask32_mem(i32* %ptr) {
+  %x = load i32* %ptr, align 4
+  %m0 = bitcast i32 %x to <32 x i1>
+  %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <32 x i1> %m1 to i32
+  store i32 %ret, i32* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask32_mem
+; CHECK: kmovd ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotd
+; CHECK-NEXT: kmovd %k{{[0-7]}}, ([[ARG1]])
+; CHECK-NEXT: ret
+}
+
+; Loads an i64 from %ptr, inverts it as a <64 x i1> mask and stores it back.
+; Expects kmovq to load from and store to the same pointer register, with
+; knotq in between and the return directly after. ARG1 is captured here so the
+; test does not depend on a pattern variable bound in an earlier function, and
+; the final directive is spelled with a hyphen so that FileCheck enforces it.
+define void @mask64_mem(i64* %ptr) {
+  %x = load i64* %ptr, align 4
+  %m0 = bitcast i64 %x to <64 x i1>
+  %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1,
+                            i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <64 x i1> %m1 to i64
+  store i64 %ret, i64* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask64_mem
+; CHECK: kmovq ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotq
+; CHECK-NEXT: kmovq %k{{[0-7]}}, ([[ARG1]])
+; CHECK-NEXT: ret
+}
+
+; Applies and, xor and or to two i32 values reinterpreted as <32 x i1> masks.
+; Expects the doubleword mask instructions kandd, kxord and kord, in that
+; order. A label directive anchors the matches to this function's output.
+define i32 @mand32(i32 %x, i32 %y) {
+  %ma = bitcast i32 %x to <32 x i1>
+  %mb = bitcast i32 %y to <32 x i1>
+  %mc = and <32 x i1> %ma, %mb
+  %md = xor <32 x i1> %ma, %mb
+  %me = or <32 x i1> %mc, %md
+  %ret = bitcast <32 x i1> %me to i32
+; CHECK-LABEL: mand32
+; CHECK: kandd
+; CHECK: kxord
+; CHECK: kord
+  ret i32 %ret
+}
+
+; Applies and, xor and or to two i64 values reinterpreted as <64 x i1> masks.
+; Expects the quadword mask instructions kandq, kxorq and korq, in that
+; order. A label directive anchors the matches to this function's output.
+define i64 @mand64(i64 %x, i64 %y) {
+  %ma = bitcast i64 %x to <64 x i1>
+  %mb = bitcast i64 %y to <64 x i1>
+  %mc = and <64 x i1> %ma, %mb
+  %md = xor <64 x i1> %ma, %mb
+  %me = or <64 x i1> %mc, %md
+  %ret = bitcast <64 x i1> %me to i64
+; CHECK-LABEL: mand64
+; CHECK: kandq
+; CHECK: kxorq
+; CHECK: korq
+  ret i64 %ret
+}

diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll
new file mode 100644
index 0000000..2ff6d28
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-mov.ll

@@ -0,0 +1,81 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s
+
+; Load of <64 x i8> with align 1 through a bitcast i8 pointer; expects the
+; byte-granular unaligned move vmovdqu8.
+; CHECK-LABEL: test1
+; CHECK: vmovdqu8
+; CHECK: ret
+define <64 x i8> @test1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <64 x i8>*
+  %res = load <64 x i8>* %vaddr, align 1
+  ret <64 x i8>%res
+}
+
+; Store of <64 x i8> with align 1 through a bitcast i8 pointer; expects
+; vmovdqu8.
+; CHECK-LABEL: test2
+; CHECK: vmovdqu8
+; CHECK: ret
+define void @test2(i8 * %addr, <64 x i8> %data) {
+  %vaddr = bitcast i8* %addr to <64 x i8>*
+  store <64 x i8>%data, <64 x i8>* %vaddr, align 1
+  ret void
+}
+
+; Unaligned <64 x i8> load selected against %old under a mask built from
+; (%mask1 != 0); expects vmovdqu8 with a {%kN} write mask, N in 1-7.
+; CHECK-LABEL: test3
+; CHECK: vmovdqu8{{.*{%k[1-7]}}}
+; CHECK: ret
+define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
+  %mask = icmp ne <64 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <64 x i8>*
+  %r = load <64 x i8>* %vaddr, align 1
+  %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> %old
+  ret <64 x i8>%res
+}
+
+; Unaligned <64 x i8> load selected against zeroinitializer under a mask built
+; from (%mask1 != 0); expects vmovdqu8 with a {%kN} {z} zeroing mask.
+; CHECK-LABEL: test4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
+  %mask = icmp ne <64 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <64 x i8>*
+  %r = load <64 x i8>* %vaddr, align 1
+  %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> zeroinitializer
+  ret <64 x i8>%res
+}
+
+; Load of <32 x i16> with align 1 through a bitcast i8 pointer; expects the
+; word-granular unaligned move vmovdqu16.
+; CHECK-LABEL: test5
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test5(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <32 x i16>*
+  %res = load <32 x i16>* %vaddr, align 1
+  ret <32 x i16>%res
+}
+
+; Store of <32 x i16> with align 1 through a bitcast i8 pointer; expects
+; vmovdqu16.
+; CHECK-LABEL: test6
+; CHECK: vmovdqu16
+; CHECK: ret
+define void @test6(i8 * %addr, <32 x i16> %data) {
+  %vaddr = bitcast i8* %addr to <32 x i16>*
+  store <32 x i16>%data, <32 x i16>* %vaddr, align 1
+  ret void
+}
+
+; Unaligned <32 x i16> load selected against %old under a mask built from
+; (%mask1 != 0); expects vmovdqu16 with a {%kN} write mask, N in 1-7.
+; CHECK-LABEL: test7
+; CHECK: vmovdqu16{{.*{%k[1-7]}}}
+; CHECK: ret
+define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
+  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <32 x i16>*
+  %r = load <32 x i16>* %vaddr, align 1
+  %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> %old
+  ret <32 x i16>%res
+}
+
+; Unaligned <32 x i16> load selected against zeroinitializer under a mask
+; built from (%mask1 != 0); expects vmovdqu16 with a {%kN} {z} zeroing mask.
+; CHECK-LABEL: test8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z}}}
+; CHECK: ret
+define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
+  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <32 x i16>*
+  %r = load <32 x i16>* %vaddr, align 1
+  %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> zeroinitializer
+  ret <32 x i16>%res
+}

diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll
new file mode 100644
index 0000000..d2b1724
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll

@@ -0,0 +1,135 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind {
+  %is_eq = icmp eq <64 x i8> %x, %y ; lane-wise x == y
+  %blend = select <64 x i1> %is_eq, <64 x i8> %x, <64 x i8> %y ; pick x where the compare holds, else y
+  ret <64 x i8> %blend
+}
+
+; CHECK-LABEL: test2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y) nounwind {
+  %is_gt = icmp sgt <64 x i8> %x, %y ; lane-wise signed x > y
+  %blend = select <64 x i1> %is_gt, <64 x i8> %x, <64 x i8> %y ; pick x where the compare holds, else y
+  ret <64 x i8> %blend
+}
+
+; CHECK-LABEL: @test3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind {
+  %is_ge = icmp sge <32 x i16> %x, %y ; lane-wise signed x >= y
+  %blend = select <32 x i1> %is_ge, <32 x i16> %x1, <32 x i16> %y ; pick x1 where the compare holds, else y
+  ret <32 x i16> %blend
+}
+
+; CHECK-LABEL: test4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y) nounwind {
+  %is_ugt = icmp ugt <64 x i8> %x, %y ; lane-wise unsigned x > y
+  %blend = select <64 x i1> %is_ugt, <64 x i8> %x, <64 x i8> %y ; pick x where the compare holds, else y
+  ret <64 x i8> %blend
+}
+
+; CHECK-LABEL: test5
+; CHECK: vpcmpeqw  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwind {
+  %rhs = load <32 x i16>* %yp, align 4 ; right-hand operand comes from memory
+  %is_eq = icmp eq <32 x i16> %x, %rhs ; lane-wise x == rhs
+  %blend = select <32 x i1> %is_eq, <32 x i16> %x, <32 x i16> %x1 ; pick x where the compare holds, else x1
+  ret <32 x i16> %blend
+}
+
+; CHECK-LABEL: @test6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+  %rhs = load <32 x i16>* %y.ptr, align 4 ; right-hand operand comes from memory
+  %is_gt = icmp sgt <32 x i16> %x, %rhs ; lane-wise signed x > rhs
+  %blend = select <32 x i1> %is_gt, <32 x i16> %x, <32 x i16> %x1 ; pick x where the compare holds, else x1
+  ret <32 x i16> %blend
+}
+
+; CHECK-LABEL: @test7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+  %rhs = load <32 x i16>* %y.ptr, align 4 ; right-hand operand comes from memory
+  %is_le = icmp sle <32 x i16> %x, %rhs ; lane-wise signed x <= rhs
+  %blend = select <32 x i1> %is_le, <32 x i16> %x, <32 x i16> %x1 ; pick x where the compare holds, else x1
+  ret <32 x i16> %blend
+}
+
+; CHECK-LABEL: @test8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
+  %rhs = load <32 x i16>* %y.ptr, align 4 ; right-hand operand comes from memory
+  %is_ule = icmp ule <32 x i16> %x, %rhs ; lane-wise unsigned x <= rhs
+  %blend = select <32 x i1> %is_ule, <32 x i16> %x, <32 x i16> %x1 ; pick x where the compare holds, else x1
+  ret <32 x i16> %blend
+}
+
+; CHECK-LABEL: @test9
+; CHECK: vpcmpeqw %zmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+  %inner = icmp eq <32 x i16> %x1, %y1 ; lane-wise x1 == y1
+  %outer = icmp eq <32 x i16> %x, %y ; lane-wise x == y
+  %both = select <32 x i1> %outer, <32 x i1> %inner, <32 x i1> zeroinitializer ; select(a, b, 0) is a AND b
+  %blend = select <32 x i1> %both, <32 x i16> %x, <32 x i16> %y ; pick x where both compares hold, else y
+  ret <32 x i16> %blend
+}
+
+; CHECK-LABEL: @test10
+; CHECK: vpcmpleb %zmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+  %inner = icmp sge <64 x i8> %x1, %y1 ; lane-wise signed x1 >= y1
+  %outer = icmp sle <64 x i8> %x, %y ; lane-wise signed x <= y
+  %both = select <64 x i1> %outer, <64 x i1> %inner, <64 x i1> zeroinitializer ; select(a, b, 0) is a AND b
+  %blend = select <64 x i1> %both, <64 x i8> %x, <64 x i8> %x1 ; pick x where both compares hold, else x1
+  ret <64 x i8> %blend
+}
+
+; CHECK-LABEL: @test11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i8> %y1) nounwind {
+  %inner = icmp sgt <64 x i8> %x1, %y1 ; lane-wise signed x1 > y1
+  %rhs = load <64 x i8>* %y.ptr, align 4 ; right-hand operand of the second compare comes from memory
+  %outer = icmp sgt <64 x i8> %x, %rhs ; lane-wise signed x > rhs
+  %both = select <64 x i1> %outer, <64 x i1> %inner, <64 x i1> zeroinitializer ; select(a, b, 0) is a AND b
+  %blend = select <64 x i1> %both, <64 x i8> %x, <64 x i8> %x1 ; pick x where both compares hold, else x1
+  ret <64 x i8> %blend
+}
+
+; CHECK-LABEL: @test12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 x i16> %y1) nounwind {
+  %inner = icmp sge <32 x i16> %x1, %y1 ; lane-wise signed x1 >= y1
+  %rhs = load <32 x i16>* %y.ptr, align 4 ; right-hand operand of the second compare comes from memory
+  %outer = icmp ule <32 x i16> %x, %rhs ; lane-wise unsigned x <= rhs
+  %both = select <32 x i1> %outer, <32 x i1> %inner, <32 x i1> zeroinitializer ; select(a, b, 0) is a AND b
+  %blend = select <32 x i1> %both, <32 x i16> %x, <32 x i16> %x1 ; pick x where both compares hold, else x1
+  ret <32 x i16> %blend
+}

diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
new file mode 100644
index 0000000..45f8d6d
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll

@@ -0,0 +1,613 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+; 256-bit
+
+define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
+  %kbits = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) ; mask -1 (all ones): expected asm has no {%k1}
+  ret i32 %kbits
+}
+
+define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
+  %kbits = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) ; caller-supplied mask: expected asm carries {%k1}
+  ret i32 %kbits
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
+  %kbits = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) ; mask -1 (all ones): expected asm has no {%k1}
+  ret i16 %kbits
+}
+
+define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
+  %kbits = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) ; caller-supplied mask: expected asm carries {%k1}
+  ret i16 %kbits
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
+
+define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b_256
+; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
+  %kbits = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) ; mask -1 (all ones): expected asm has no {%k1}
+  ret i32 %kbits
+}
+
+define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b_256
+; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
+  %kbits = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) ; caller-supplied mask: expected asm carries {%k1}
+  ret i32 %kbits
+}
+
+declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w_256
+; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
+  %kbits = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) ; mask -1 (all ones): expected asm has no {%k1}
+  ret i16 %kbits
+}
+
+define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w_256
+; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
+  %kbits = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) ; caller-supplied mask: expected asm carries {%k1}
+  ret i16 %kbits
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
+
+define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; one call per predicate immediate 0..7, mask -1
+; CHECK-LABEL: test_cmp_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; one call per predicate immediate 0..7, caller mask
+; CHECK-LABEL: test_mask_cmp_b_256
+; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
+
+define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { ; one call per predicate immediate 0..7, mask -1
+; CHECK-LABEL: test_ucmp_b_256
+; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { ; one call per predicate immediate 0..7, caller mask
+; CHECK-LABEL: test_mask_ucmp_b_256
+; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
+  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
+; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
+  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
+; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
+  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
+; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
+  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
+; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
+  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
+; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
+  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
+; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
+  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
+; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
+  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
+  ret <8 x i32> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
+
+define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { ; one call per predicate immediate 0..7, mask -1
+; CHECK-LABEL: test_cmp_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) { ; one call per predicate immediate 0..7, caller mask
+; CHECK-LABEL: test_mask_cmp_w_256
+; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { ; one call per predicate immediate 0..7, mask -1
+; CHECK-LABEL: test_ucmp_w_256
+; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) { ; one call per predicate immediate 0..7, caller mask
+; CHECK-LABEL: test_mask_ucmp_w_256
+; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
+
+; 128-bit
+
+define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_pcmpeq_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+  %kbits = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1) ; mask -1 (all ones): expected asm has no {%k1}
+  ret i16 %kbits
+}
+
+define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+  %kbits = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask) ; caller-supplied mask: expected asm carries {%k1}
+  ret i16 %kbits
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_pcmpeq_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1) ; mask -1 (all ones): expected asm has no {%k1}
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpeq_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask) ; caller-supplied mask: expected asm carries {%k1}
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)
+
+define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_pcmpgt_b_128
+; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
+  %kbits = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1) ; mask -1 (all ones): expected asm has no {%k1}
+  ret i16 %kbits
+}
+
+define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_b_128
+; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
+  %kbits = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask) ; caller-supplied mask: expected asm carries {%k1}
+  ret i16 %kbits
+}
+
+declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_pcmpgt_w_128
+; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1) ; mask -1 (all ones): expected asm has no {%k1}
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_pcmpgt_w_128
+; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask) ; caller-supplied mask: expected asm carries {%k1}
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
+
+define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { ; one call per predicate immediate 0..7, mask -1
+; CHECK-LABEL: test_cmp_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; one call per predicate immediate 0..7, caller mask
+; CHECK-LABEL: test_mask_cmp_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { ; one call per predicate immediate 0..7, mask -1
+; CHECK-LABEL: test_ucmp_b_128
+; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) { ; one call per predicate immediate 0..7, caller mask
+; CHECK-LABEL: test_mask_ucmp_b_128
+; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+  ret <8 x i16> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+
+define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; one call per predicate immediate 0..7, mask -1
+; CHECK-LABEL: test_cmp_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7 ; all eight results are returned so none of the calls is dead
+}
+
+define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; calls mask.cmp.w.128 once per immediate 0-7 under the caller-supplied %mask; each result is inserted into the returned vector
+; CHECK-LABEL: test_mask_cmp_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; calls mask.ucmp.w.128 once per immediate 0-7 with an all-ones mask (i8 -1); each result is inserted into the returned vector
+; CHECK-LABEL: test_ucmp_w_128
+; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; calls mask.ucmp.w.128 once per immediate 0-7 under the caller-supplied %mask; each result is inserted into the returned vector
+; CHECK-LABEL: test_mask_ucmp_w_128
+; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone

diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll
new file mode 100644
index 0000000..835844f
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-mov.ll

@@ -0,0 +1,162 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+; CHECK-LABEL: test_256_1
+; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <32 x i8>*
+  %res = load <32 x i8>* %vaddr, align 1
+  ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_2
+; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_2(i8 * %addr, <32 x i8> %data) {
+  %vaddr = bitcast i8* %addr to <32 x i8>*
+  store <32 x i8>%data, <32 x i8>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_3
+; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
+  %mask = icmp ne <32 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <32 x i8>*
+  %r = load <32 x i8>* %vaddr, align 1
+  %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> %old
+  ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
+  %mask = icmp ne <32 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <32 x i8>*
+  %r = load <32 x i8>* %vaddr, align 1
+  %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> zeroinitializer
+  ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_5
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_5(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x i16>*
+  %res = load <16 x i16>* %vaddr, align 1
+  ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_256_6
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_6(i8 * %addr, <16 x i16> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i16>*
+  store <16 x i16>%data, <16 x i16>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_7
+; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
+  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i16>*
+  %r = load <16 x i16>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> %old
+  ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_256_8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
+  %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i16>*
+  %r = load <16 x i16>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> zeroinitializer
+  ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_128_1
+; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x i8>*
+  %res = load <16 x i8>* %vaddr, align 1
+  ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_2
+; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_2(i8 * %addr, <16 x i8> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i8>*
+  store <16 x i8>%data, <16 x i8>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_3
+; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
+  %mask = icmp ne <16 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i8>*
+  %r = load <16 x i8>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> %old
+  ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
+  %mask = icmp ne <16 x i8> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i8>*
+  %r = load <16 x i8>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> zeroinitializer
+  ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_5
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_5(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i16>*
+  %res = load <8 x i16>* %vaddr, align 1
+  ret <8 x i16>%res
+}
+
+; CHECK-LABEL: test_128_6
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_6(i8 * %addr, <8 x i16> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i16>*
+  store <8 x i16>%data, <8 x i16>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_7
+; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
+  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i16>*
+  %r = load <8 x i16>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> %old
+  ret <8 x i16>%res
+}
+
+; CHECK-LABEL: test_128_8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) {
+  %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i16>*
+  %r = load <8 x i16>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> zeroinitializer
+  ret <8 x i16>%res
+}
+

diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
new file mode 100644
index 0000000..2d13a16
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll

@@ -0,0 +1,269 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; CHECK-LABEL: test256_1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind {
+  %mask = icmp eq <32 x i8> %x, %y
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: test256_2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+  %mask = icmp sgt <32 x i8> %x, %y
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounwind {
+  %mask = icmp sge <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: test256_4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+  %mask = icmp ugt <32 x i8> %x, %y
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: test256_5
+; CHECK: vpcmpeqw  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nounwind {
+  %y = load <16 x i16>* %yp, align 4
+  %mask = icmp eq <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+  %y = load <16 x i16>* %y.ptr, align 4
+  %mask = icmp sgt <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+  %y = load <16 x i16>* %y.ptr, align 4
+  %mask = icmp sle <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+  %y = load <16 x i16>* %y.ptr, align 4
+  %mask = icmp ule <16 x i16> %x, %y
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_9
+; CHECK: vpcmpeqw %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+  %mask1 = icmp eq <16 x i16> %x1, %y1
+  %mask0 = icmp eq <16 x i16> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_10
+; CHECK: vpcmpleb %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+  %mask1 = icmp sge <32 x i8> %x1, %y1
+  %mask0 = icmp sle <32 x i8> %x, %y
+  %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+  %mask1 = icmp sgt <32 x i8> %x1, %y1
+  %y = load <32 x i8>* %y.ptr, align 4
+  %mask0 = icmp sgt <32 x i8> %x, %y
+  %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+  %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+  ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+  %mask1 = icmp sge <16 x i16> %x1, %y1
+  %y = load <16 x i16>* %y.ptr, align 4
+  %mask0 = icmp ule <16 x i16> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+  ret <16 x i16> %max
+}
+
+; CHECK-LABEL: test128_1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind {
+  %mask = icmp eq <16 x i8> %x, %y
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: test128_2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+  %mask = icmp sgt <16 x i8> %x, %y
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind {
+  %mask = icmp sge <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: test128_4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+  %mask = icmp ugt <16 x i8> %x, %y
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: test128_5
+; CHECK: vpcmpeqw  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwind {
+  %y = load <8 x i16>* %yp, align 4
+  %mask = icmp eq <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+  %y = load <8 x i16>* %y.ptr, align 4
+  %mask = icmp sgt <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+  %y = load <8 x i16>* %y.ptr, align 4
+  %mask = icmp sle <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+  %y = load <8 x i16>* %y.ptr, align 4
+  %mask = icmp ule <8 x i16> %x, %y
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_9
+; CHECK: vpcmpeqw %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+  %mask1 = icmp eq <8 x i16> %x1, %y1
+  %mask0 = icmp eq <8 x i16> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_10
+; CHECK: vpcmpleb %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+  %mask1 = icmp sge <16 x i8> %x1, %y1
+  %mask0 = icmp sle <16 x i8> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+  %mask1 = icmp sgt <16 x i8> %x1, %y1
+  %y = load <16 x i8>* %y.ptr, align 4
+  %mask0 = icmp sgt <16 x i8> %x, %y
+  %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+  %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+  ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+  %mask1 = icmp sge <8 x i16> %x1, %y1
+  %y = load <8 x i16>* %y.ptr, align 4
+  %mask0 = icmp ule <8 x i16> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+  ret <8 x i16> %max
+}

diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll
new file mode 100644
index 0000000..32a2633
--- /dev/null
+++ b/test/CodeGen/X86/avx512dq-mask-op.ll

@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; Inverts an i8 reinterpreted as <8 x i1> by xor with all-ones; expects knotb.
+define i8 @mask8(i8 %x) {
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <8 x i1> %m1 to i8
+  ret i8 %ret
+; CHECK-LABEL: mask8
+; CHECK: knotb
+; CHECK: ret
+}
+
+; Same inversion on a byte loaded from and stored back to %ptr; expects kmovb to and from memory around knotb.
+define void @mask8_mem(i8* %ptr) {
+  %x = load i8* %ptr, align 4
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <8 x i1> %m1 to i8
+  store i8 %ret, i8* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask8_mem
+; CHECK: kmovb ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotb
+; CHECK-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]])
+; CHECK: ret
+}
+
+; Combines two i8 values as <8 x i1> with and, xor and or; expects kandb, kxorb and korb in that order.
+define i8 @mand8(i8 %x, i8 %y) {
+  %ma = bitcast i8 %x to <8 x i1>
+  %mb = bitcast i8 %y to <8 x i1>
+  %mc = and <8 x i1> %ma, %mb
+  %md = xor <8 x i1> %ma, %mb
+  %me = or <8 x i1> %mc, %md
+  %ret = bitcast <8 x i1> %me to i8
+; CHECK-LABEL: mand8
+; CHECK: kandb
+; CHECK: kxorb
+; CHECK: korb
+  ret i8 %ret
+}

diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
new file mode 100644
index 0000000..0000ece
--- /dev/null
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll

@@ -0,0 +1,79 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding| FileCheck %s
+
+define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK: kmovw
+  ; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: kmovw
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8) 
+  ret <16 x float> %res
+}
+
+
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
+  ; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
+  ; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
+  %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) 
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
+  ; CHECK: vexp2ps %zmm0, %zmm0 {sae}      # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
+  ; CHECK: vexp2pd %zmm0, %zmm0 {sae}      # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
+  %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
+  ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
+  ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+  %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+

diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
new file mode 100644
index 0000000..fa19084
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll

@@ -0,0 +1,613 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+; 256-bit
+
+define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) { ; mask operand is all-ones (i8 -1)
+; CHECK-LABEL: test_pcmpeq_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) { ; %mask is forwarded; expected asm carries {%k1}
+; CHECK-LABEL: test_mask_pcmpeq_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) { ; mask operand is all-ones (i8 -1)
+; CHECK-LABEL: test_pcmpeq_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { ; %mask is forwarded; expected asm carries {%k1}
+; CHECK-LABEL: test_mask_pcmpeq_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) { ; mask operand is all-ones (i8 -1)
+; CHECK-LABEL: test_pcmpgt_d_256
+; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) { ; %mask is forwarded; expected asm carries {%k1}
+; CHECK-LABEL: test_mask_pcmpgt_d_256
+; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) { ; mask operand is all-ones (i8 -1)
+; CHECK-LABEL: test_pcmpgt_q_256
+; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { ; %mask is forwarded; expected asm carries {%k1}
+; CHECK-LABEL: test_mask_pcmpgt_q_256
+; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
+
+define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; calls mask.cmp.d.256 with each predicate immediate i32 0-7 and an all-ones mask
+; CHECK-LABEL: test_cmp_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; calls mask.cmp.d.256 with each predicate immediate i32 0-7 under %mask
+; CHECK-LABEL: test_mask_cmp_d_256
+; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; calls mask.ucmp.d.256 with each predicate immediate i32 0-7 and an all-ones mask
+; CHECK-LABEL: test_ucmp_d_256
+; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; calls mask.ucmp.d.256 with each predicate immediate i32 0-7 under %mask
+; CHECK-LABEL: test_mask_ucmp_d_256
+; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { ; calls mask.cmp.q.256 with each predicate immediate i32 0-7 and an all-ones mask
+; CHECK-LABEL: test_cmp_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; calls mask.cmp.q.256 with each predicate immediate i32 0-7 under %mask
+; CHECK-LABEL: test_mask_cmp_q_256
+; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { ; calls mask.ucmp.q.256 with each predicate immediate i32 0-7 and an all-ones mask
+; CHECK-LABEL: test_ucmp_q_256
+; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; calls mask.ucmp.q.256 with each predicate immediate i32 0-7 under %mask
+; CHECK-LABEL: test_mask_ucmp_q_256
+; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
+
+; 128-bit
+
+define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) { ; mask operand is all-ones (i8 -1)
+; CHECK-LABEL: test_pcmpeq_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { ; %mask is forwarded; expected asm carries {%k1}
+; CHECK-LABEL: test_mask_pcmpeq_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) { ; mask operand is all-ones (i8 -1)
+; CHECK-LABEL: test_pcmpeq_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { ; %mask is forwarded; expected asm carries {%k1}
+; CHECK-LABEL: test_mask_pcmpeq_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)
+
+define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) { ; mask operand is all-ones (i8 -1)
+; CHECK-LABEL: test_pcmpgt_d_128
+; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { ; %mask is forwarded; expected asm carries {%k1}
+; CHECK-LABEL: test_mask_pcmpgt_d_128
+; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)
+
+define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) { ; mask operand is all-ones (i8 -1)
+; CHECK-LABEL: test_pcmpgt_q_128
+; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
+  ret i8 %kbits
+}
+
+define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { ; %mask is forwarded; expected asm carries {%k1}
+; CHECK-LABEL: test_mask_pcmpgt_q_128
+; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ##
+  %kbits = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
+  ret i8 %kbits
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
+
+define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; calls mask.cmp.d.128 with each predicate immediate i32 0-7 and an all-ones mask
+; CHECK-LABEL: test_cmp_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { ; calls mask.cmp.d.128 with each predicate immediate i32 0-7 under %mask
+; CHECK-LABEL: test_mask_cmp_d_128
+; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; calls mask.ucmp.d.128 with each predicate immediate i32 0-7 and an all-ones mask
+; CHECK-LABEL: test_ucmp_d_128
+; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { ; calls mask.ucmp.d.128 with each predicate immediate i32 0-7 under %mask
+; CHECK-LABEL: test_mask_ucmp_d_128
+; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; calls mask.cmp.q.128 with each predicate immediate i32 0-7 and an all-ones mask
+; CHECK-LABEL: test_cmp_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; calls mask.cmp.q.128 with each predicate immediate i32 0-7 under %mask
+; CHECK-LABEL: test_mask_cmp_q_128
+; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; calls mask.ucmp.q.128 with each predicate immediate i32 0-7 and an all-ones mask
+; CHECK-LABEL: test_ucmp_q_128
+; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; calls mask.ucmp.q.128 with each predicate immediate i32 0-7 under %mask
+; CHECK-LABEL: test_mask_ucmp_q_128
+; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
+  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
+  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
+  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
+  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
+  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
+  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
+  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
+  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
+  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
+  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
+  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
+  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
+  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
+  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
+  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
+  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+  ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone

diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll
new file mode 100644
index 0000000..3224656
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-mov.ll

@@ -0,0 +1,642 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+
+; CHECK-LABEL: test_256_1
+; CHECK: vmovdqu32
+; CHECK: ret
+define <8 x i32> @test_256_1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %res = load <8 x i32>* %vaddr, align 1
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_2
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test_256_2(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %res = load <8 x i32>* %vaddr, align 32
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_3
+; CHECK: vmovdqa64
+; CHECK: ret
+define void @test_256_3(i8 * %addr, <4 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  store <4 x i64>%data, <4 x i64>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_4
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test_256_4(i8 * %addr, <8 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  store <8 x i32>%data, <8 x i32>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_5
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test_256_5(i8 * %addr, <8 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  store <8 x i32>%data, <8 x i32>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_6
+; CHECK: vmovdqa64
+; CHECK: ret
+define  <4 x i64> @test_256_6(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %res = load <4 x i64>* %vaddr, align 32
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_7
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test_256_7(i8 * %addr, <4 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  store <4 x i64>%data, <4 x i64>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_8
+; CHECK: vmovdqu64
+; CHECK: ret
+define <4 x i64> @test_256_8(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %res = load <4 x i64>* %vaddr, align 1
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_9
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_9(i8 * %addr, <4 x double> %data) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  store <4 x double>%data, <4 x double>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_10
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x double> @test_256_10(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %res = load <4 x double>* %vaddr, align 32
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_11
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_11(i8 * %addr, <8 x float> %data) {
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  store <8 x float>%data, <8 x float>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_12
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x float> @test_256_12(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %res = load <8 x float>* %vaddr, align 32
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_13
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_13(i8 * %addr, <4 x double> %data) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  store <4 x double>%data, <4 x double>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_14
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x double> @test_256_14(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %res = load <4 x double>* %vaddr, align 1
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_15
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_15(i8 * %addr, <8 x float> %data) {
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  store <8 x float>%data, <8 x float>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_16
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x float> @test_256_16(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %res = load <8 x float>* %vaddr, align 1
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_17
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %r = load <8 x i32>* %vaddr, align 32
+  %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_18
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %r = load <8 x i32>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_19
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %r = load <8 x i32>* %vaddr, align 32
+  %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_20
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %r = load <8 x i32>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_21
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %r = load <4 x i64>* %vaddr, align 32
+  %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_22
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %r = load <4 x i64>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_23
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %r = load <4 x i64>* %vaddr, align 32
+  %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_24
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %r = load <4 x i64>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_25
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+  %mask = fcmp one <8 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %r = load <8 x float>* %vaddr, align 32
+  %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_26
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
+  %mask = fcmp one <8 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %r = load <8 x float>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_27
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) {
+  %mask = fcmp one <8 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %r = load <8 x float>* %vaddr, align 32
+  %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_28
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
+  %mask = fcmp one <8 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  %r = load <8 x float>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer
+  ret <8 x float>%res
+}
+
+; CHECK-LABEL: test_256_29
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %r = load <4 x double>* %vaddr, align 32
+  %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_30
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %r = load <4 x double>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_31
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %r = load <4 x double>* %vaddr, align 32
+  %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_32
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
+  %mask = icmp ne <4 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %r = load <4 x double>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_128_1
+; CHECK: vmovdqu32
+; CHECK: ret
+define <4 x i32> @test_128_1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %res = load <4 x i32>* %vaddr, align 1
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_2
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test_128_2(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %res = load <4 x i32>* %vaddr, align 16
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_3
+; CHECK: vmovdqa64
+; CHECK: ret
+define void @test_128_3(i8 * %addr, <2 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  store <2 x i64>%data, <2 x i64>* %vaddr, align 16
+  ret void
+}
+
+; CHECK-LABEL: test_128_4
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test_128_4(i8 * %addr, <4 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  store <4 x i32>%data, <4 x i32>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_5
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test_128_5(i8 * %addr, <4 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  store <4 x i32>%data, <4 x i32>* %vaddr, align 16
+  ret void
+}
+
+; CHECK-LABEL: test_128_6
+; CHECK: vmovdqa64
+; CHECK: ret
+define  <2 x i64> @test_128_6(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %res = load <2 x i64>* %vaddr, align 16
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_7
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test_128_7(i8 * %addr, <2 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  store <2 x i64>%data, <2 x i64>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_8
+; CHECK: vmovdqu64
+; CHECK: ret
+define <2 x i64> @test_128_8(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %res = load <2 x i64>* %vaddr, align 1
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_9
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_9(i8 * %addr, <2 x double> %data) {
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  store <2 x double>%data, <2 x double>* %vaddr, align 16
+  ret void
+}
+
+; CHECK-LABEL: test_128_10
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <2 x double> @test_128_10(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %res = load <2 x double>* %vaddr, align 16
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_11
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_11(i8 * %addr, <4 x float> %data) {
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  store <4 x float>%data, <4 x float>* %vaddr, align 16
+  ret void
+}
+
+; CHECK-LABEL: test_128_12
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x float> @test_128_12(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %res = load <4 x float>* %vaddr, align 16
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_13
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_13(i8 * %addr, <2 x double> %data) {
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  store <2 x double>%data, <2 x double>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_14
+; CHECK: vmovupd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <2 x double> @test_128_14(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %res = load <2 x double>* %vaddr, align 1
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_15
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_15(i8 * %addr, <4 x float> %data) {
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  store <4 x float>%data, <4 x float>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_128_16
+; CHECK: vmovups {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x float> @test_128_16(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %res = load <4 x float>* %vaddr, align 1
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_17
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %r = load <4 x i32>* %vaddr, align 16
+  %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_18
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %r = load <4 x i32>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_19
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %r = load <4 x i32>* %vaddr, align 16
+  %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_20
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x i32>*
+  %r = load <4 x i32>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer
+  ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test_128_21
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %r = load <2 x i64>* %vaddr, align 16
+  %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_22
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %r = load <2 x i64>* %vaddr, align 1
+  %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_23
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %r = load <2 x i64>* %vaddr, align 16
+  %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_24
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x i64>*
+  %r = load <2 x i64>* %vaddr, align 1
+  %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer
+  ret <2 x i64>%res
+}
+
+; CHECK-LABEL: test_128_25
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %r = load <4 x float>* %vaddr, align 16
+  %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_26
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %r = load <4 x float>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_27
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %r = load <4 x float>* %vaddr, align 16
+  %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_28
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
+  %mask = icmp ne <4 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <4 x float>*
+  %r = load <4 x float>* %vaddr, align 1
+  %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer
+  ret <4 x float>%res
+}
+
+; CHECK-LABEL: test_128_29
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %r = load <2 x double>* %vaddr, align 16
+  %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_30
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %r = load <2 x double>* %vaddr, align 1
+  %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_31
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %r = load <2 x double>* %vaddr, align 16
+  %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer
+  ret <2 x double>%res
+}
+
+; CHECK-LABEL: test_128_32
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) {
+  %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <2 x double>*
+  %r = load <2 x double>* %vaddr, align 1
+  %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer
+  ret <2 x double>%res
+}
+

diff --git a/test/CodeGen/X86/avx512vl-nontemporal.ll b/test/CodeGen/X86/avx512vl-nontemporal.ll
new file mode 100644
index 0000000..2ad9768
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-nontemporal.ll

@@ -0,0 +1,34 @@
+; RUN: llc < %s  -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
+
+define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) {
+; CHECK: vmovntps %ymm{{.*}} ## encoding: [0x62
+  %cast = bitcast i8* %B to <8 x float>*
+  %A2 = fadd <8 x float> %A, %AA
+  store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0
+; CHECK: vmovntdq %ymm{{.*}} ## encoding: [0x62
+  %cast1 = bitcast i8* %B to <4 x i64>*
+  %E2 = add <4 x i64> %E, %EE
+  store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0
+; CHECK: vmovntpd %ymm{{.*}} ## encoding: [0x62
+  %cast2 = bitcast i8* %B to <4 x double>*
+  %C2 = fadd <4 x double> %C, %CC
+  store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0
+  ret void
+}
+
+define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) {
+; CHECK: vmovntps %xmm{{.*}} ## encoding: [0x62
+  %cast = bitcast i8* %B to <4 x float>*
+  %A2 = fadd <4 x float> %A, %AA
+  store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0
+; CHECK: vmovntdq %xmm{{.*}} ## encoding: [0x62
+  %cast1 = bitcast i8* %B to <2 x i64>*
+  %E2 = add <2 x i64> %E, %EE
+  store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0
+; CHECK: vmovntpd %xmm{{.*}} ## encoding: [0x62
+  %cast2 = bitcast i8* %B to <2 x double>*
+  %C2 = fadd <2 x double> %C, %CC
+  store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0
+  ret void
+}
+!0 = metadata !{i32 1}

diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
new file mode 100644
index 0000000..9c64c03
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll

@@ -0,0 +1,381 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+; CHECK-LABEL: test256_1
+; CHECK: vpcmpeqq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
+  %mask = icmp eq <4 x i64> %x, %y
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test256_2
+; CHECK: vpcmpgtq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y) nounwind {
+  %mask = icmp sgt <4 x i64> %x, %y
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: @test256_3
+; CHECK: vpcmpled {{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind {
+  %mask = icmp sge <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_4
+; CHECK: vpcmpnleuq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y) nounwind {
+  %mask = icmp ugt <4 x i64> %x, %y
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test256_5
+; CHECK: vpcmpeqd  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
+  %y = load <8 x i32>* %yp, align 4
+  %mask = icmp eq <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_6
+; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+  %y = load <8 x i32>* %y.ptr, align 4
+  %mask = icmp sgt <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_7
+; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+  %y = load <8 x i32>* %y.ptr, align 4
+  %mask = icmp sle <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_8
+; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
+  %y = load <8 x i32>* %y.ptr, align 4
+  %mask = icmp ule <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_9
+; CHECK: vpcmpeqd %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+  %mask1 = icmp eq <8 x i32> %x1, %y1
+  %mask0 = icmp eq <8 x i32> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: @test256_10
+; CHECK: vpcmpleq %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+  %mask1 = icmp sge <4 x i64> %x1, %y1
+  %mask0 = icmp sle <4 x i64> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: @test256_11
+; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+  %mask1 = icmp sgt <4 x i64> %x1, %y1
+  %y = load <4 x i64>* %y.ptr, align 4
+  %mask0 = icmp sgt <4 x i64> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: @test256_12
+; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+  %mask1 = icmp sge <8 x i32> %x1, %y1
+  %y = load <8 x i32>* %y.ptr, align 4
+  %mask0 = icmp ule <8 x i32> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_13
+; CHECK: vpcmpeqq  (%rdi){1to4}, %ymm
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind {
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
+  %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
+  %mask = icmp eq <4 x i64> %x, %y
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test256_14
+; CHECK: vpcmpled  (%rdi){1to8}, %ymm
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind {
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
+  %mask = icmp sle <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_15
+; CHECK: vpcmpgtd  (%rdi){1to8}, %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
+  %mask1 = icmp sge <8 x i32> %x1, %y1
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer
+  %mask0 = icmp sgt <8 x i32> %x, %y
+  %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test256_16
+; CHECK: vpcmpgtq  (%rdi){1to4}, %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
+  %mask1 = icmp sge <4 x i64> %x1, %y1
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
+  %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer
+  %mask0 = icmp sgt <4 x i64> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1
+  ret <4 x i64> %max
+}
+
+; CHECK-LABEL: test128_1
+; CHECK: vpcmpeqq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
+  %mask = icmp eq <2 x i64> %x, %y
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: test128_2
+; CHECK: vpcmpgtq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y) nounwind {
+  %mask = icmp sgt <2 x i64> %x, %y
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: @test128_3
+; CHECK: vpcmpled {{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind {
+  %mask = icmp sge <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_4
+; CHECK: vpcmpnleuq {{.*%k[0-7]}}
+; CHECK: vmovdqa64 {{.*}}%k1
+; CHECK: ret
+define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y) nounwind {
+  %mask = icmp ugt <2 x i64> %x, %y
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: test128_5
+; CHECK: vpcmpeqd  (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
+  %y = load <4 x i32>* %yp, align 4
+  %mask = icmp eq <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_6
+; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+  %y = load <4 x i32>* %y.ptr, align 4
+  %mask = icmp sgt <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_7
+; CHECK: vpcmpled (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+  %y = load <4 x i32>* %y.ptr, align 4
+  %mask = icmp sle <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_8
+; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
+  %y = load <4 x i32>* %y.ptr, align 4
+  %mask = icmp ule <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_9
+; CHECK: vpcmpeqd %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+  %mask1 = icmp eq <4 x i32> %x1, %y1
+  %mask0 = icmp eq <4 x i32> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: @test128_10
+; CHECK: vpcmpleq %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+  %mask1 = icmp sge <2 x i64> %x1, %y1
+  %mask0 = icmp sle <2 x i64> %x, %y
+  %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: @test128_11
+; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+  %mask1 = icmp sgt <2 x i64> %x1, %y1
+  %y = load <2 x i64>* %y.ptr, align 4
+  %mask0 = icmp sgt <2 x i64> %x, %y
+  %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: @test128_12
+; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+  %mask1 = icmp sge <4 x i32> %x1, %y1
+  %y = load <4 x i32>* %y.ptr, align 4
+  %mask0 = icmp ule <4 x i32> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_13
+; CHECK: vpcmpeqq  (%rdi){1to2}, %xmm
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind {
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
+  %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
+  %mask = icmp eq <2 x i64> %x, %y
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+  ret <2 x i64> %max
+}
+
+; CHECK-LABEL: test128_14
+; CHECK: vpcmpled  (%rdi){1to4}, %xmm
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind {
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mask = icmp sle <4 x i32> %x, %y
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_15
+; CHECK: vpcmpgtd  (%rdi){1to4}, %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa32
+; CHECK: ret
+define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
+  %mask1 = icmp sge <4 x i32> %x1, %y1
+  %yb = load i32* %yb.ptr, align 4
+  %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
+  %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mask0 = icmp sgt <4 x i32> %x, %y
+  %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer
+  %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1
+  ret <4 x i32> %max
+}
+
+; CHECK-LABEL: test128_16
+; CHECK: vpcmpgtq  (%rdi){1to2}, %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqa64
+; CHECK: ret
+define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
+  %mask1 = icmp sge <2 x i64> %x1, %y1
+  %yb = load i64* %yb.ptr, align 4
+  %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
+  %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1
+  %mask0 = icmp sgt <2 x i64> %x, %y
+  %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer
+  %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1
+  ret <2 x i64> %max
+}

diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
deleted file mode 100644
index 34aaf2c..0000000
--- a/test/CodeGen/X86/blend-msb.ll
+++ /dev/null

@@ -1,40 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
-
-
-; Verify that we produce movss instead of blendvps when possible.
-
-;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blend
-;CHECK: movss
-;CHECK: ret
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
-  ret <4 x float> %vsel
-}
-
-;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blend
-;CHECK: movss
-;CHECK: ret
-define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
-  ret <4 x i8> %vsel
-}
-
-;CHECK-LABEL: vsel_8xi16:
-; The select mask is
-; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
-; which translates into the boolean mask (big endian representation):
-; 00010001 = 17.
-; '1' means takes the first argument, '0' means takes the second argument.
-; This is the opposite of the intel syntax, thus we expect
-; the inverted mask: 11101110 = 238.
-; According to the ABI:
-; v1 is in xmm0 => first argument is xmm0.
-; v2 is in xmm1 => second argument is xmm1.
-;CHECK: pblendw $238, %xmm1, %xmm0
-;CHECK: ret
-define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
-  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
-  ret <8 x i16> %vsel
-}

diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index 2681c10..cc40bcf 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll

@@ -237,44 +237,6 @@
   ret i32 %base
 }
 
-define void @test_loop_rotate_reversed_blocks() {
-; This test case (greatly reduced from an Olden bencmark) ensures that the loop
-; rotate implementation doesn't assume that loops are laid out in a particular
-; order. The first loop will get split into two basic blocks, with the loop
-; header coming after the loop latch.
-;
-; CHECK: test_loop_rotate_reversed_blocks
-; CHECK: %entry
-; Look for a jump into the middle of the loop, and no branches mid-way.
-; CHECK: jmp
-; CHECK: %loop1
-; CHECK-NOT: j{{\w*}} .LBB{{.*}}
-; CHECK: %loop1
-; CHECK: je
-
-entry:
-  %cond1 = load volatile i1* undef
-  br i1 %cond1, label %loop2.preheader, label %loop1
-
-loop1:
-  call i32 @f()
-  %cond2 = load volatile i1* undef
-  br i1 %cond2, label %loop2.preheader, label %loop1
-
-loop2.preheader:
-  call i32 @f()
-  %cond3 = load volatile i1* undef
-  br i1 %cond3, label %exit, label %loop2
-
-loop2:
-  call i32 @f()
-  %cond4 = load volatile i1* undef
-  br i1 %cond4, label %exit, label %loop2
-
-exit:
-  ret void
-}
-
 define i32 @test_loop_align(i32 %i, i32* %a) {
 ; Check that we provide basic loop body alignment with the block placement
 ; pass.

diff --git a/test/CodeGen/X86/byval-callee-cleanup.ll b/test/CodeGen/X86/byval-callee-cleanup.ll
new file mode 100644
index 0000000..8e059d4
--- /dev/null
+++ b/test/CodeGen/X86/byval-callee-cleanup.ll

@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
+
+; Previously we would forget to align to stack slot alignment after placing a
+; byval argument.  Subsequent arguments would align themselves, but if it was
+; the last argument, the argument size would not be a multiple of stack slot
+; size. This resulted in retl $6 in callee-cleanup functions, as well as subtle
+; varargs bugs.
+
+%struct.Six = type { [6 x i8] }
+
+define x86_stdcallcc void @f(%struct.Six* byval %a) {
+  ret void
+}
+; CHECK-LABEL: _f@8:
+; CHECK: retl $8
+
+define x86_thiscallcc void @g(i8* %this, %struct.Six* byval %a) {
+  ret void
+}
+; CHECK-LABEL: _g:
+; CHECK: retl $8
+
+define x86_fastcallcc void @h(i32 inreg %x, i32 inreg %y, %struct.Six* byval %a) {
+  ret void
+}
+; CHECK-LABEL: @h@16:
+; CHECK: retl $8

diff --git a/test/CodeGen/X86/cfi_enforcing.ll b/test/CodeGen/X86/cfi_enforcing.ll
new file mode 100644
index 0000000..bcad8c1
--- /dev/null
+++ b/test/CodeGen/X86/cfi_enforcing.ll

@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=i386-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86-64 %s
+
+define void @indirect_fun() unnamed_addr jumptable {
+  ret void
+}
+
+define i32 @m(void ()* %fun) {
+  call void ()* %fun()
+; CHECK: subl
+; X86-64: andq    $8,
+; X86-64: leaq    __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]]
+; X86-64-NOT: callq __llvm_cfi_pointer_warning
+; X86-64: callq   *[[REG]]
+; X86: andl    $8,
+; X86: leal    __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]]
+; X86-NOT: calll __llvm_cfi_pointer_warning
+; X86: calll   *[[REG]]
+  ret i32 0
+}
+
+define void ()* @get_fun() {
+  ret void ()* @indirect_fun
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+  %f = call void ()* ()* @get_fun()
+  %a = call i32 @m(void ()* %f)
+  ret i32 %a
+}
+
+; CHECK: .align 8
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK: jmp indirect_fun@PLT

diff --git a/test/CodeGen/X86/cfi_invoke.ll b/test/CodeGen/X86/cfi_invoke.ll
new file mode 100644
index 0000000..dd0d42a
--- /dev/null
+++ b/test/CodeGen/X86/cfi_invoke.ll

@@ -0,0 +1,35 @@
+; RUN: llc <%s -fcfi -cfi-type=sub | FileCheck %s
+; ModuleID = 'test.cc'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @__gxx_personality_v0(...)
+
+@_ZTIPKc = external constant i8*
+@_ZTIi = external constant i8*
+
+define void @f() unnamed_addr jumptable {
+  ret void
+}
+
+@a = global void ()* @f
+
+; Make sure invoke gets targeted as well as regular calls
+define void @_Z3foov(void ()* %f) uwtable ssp {
+; CHECK-LABEL: _Z3foov:
+ entry:
+   invoke void %f()
+           to label %try.cont unwind label %lpad
+; CHECK: callq __llvm_cfi_pointer_warning
+; CHECK: callq *%rbx
+
+ lpad:
+   %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+                                  catch i8* bitcast (i8** @_ZTIi to i8*)
+                                  filter [1 x i8*] [i8* bitcast (i8** @_ZTIPKc to i8*)]
+   ret void
+
+ try.cont:
+   ret void
+}
+

diff --git a/test/CodeGen/X86/cfi_non_default_function.ll b/test/CodeGen/X86/cfi_non_default_function.ll
new file mode 100644
index 0000000..29774a1
--- /dev/null
+++ b/test/CodeGen/X86/cfi_non_default_function.ll

@@ -0,0 +1,27 @@
+; RUN: llc -fcfi -cfi-func-name=cfi_new_failure <%s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+define void @indirect_fun() unnamed_addr jumptable {
+  ret void
+}
+
+define i32 @m(void ()* %fun) {
+; CHECK-LABEL: @m
+  call void ()* %fun()
+; CHECK: callq cfi_new_failure
+  ret i32 0
+}
+
+define void ()* @get_fun() {
+  ret void ()* @indirect_fun
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+  %f = call void ()* ()* @get_fun()
+  %a = call i32 @m(void ()* %f)
+  ret i32 %a
+}
+
+; CHECK: .align 8
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK: jmp indirect_fun@PLT

diff --git a/test/CodeGen/X86/cfi_simple_indirect_call.ll b/test/CodeGen/X86/cfi_simple_indirect_call.ll
new file mode 100644
index 0000000..0ee118d
--- /dev/null
+++ b/test/CodeGen/X86/cfi_simple_indirect_call.ll

@@ -0,0 +1,43 @@
+; RUN: llc -fcfi -cfi-type=sub <%s | FileCheck --check-prefix=SUB %s
+; RUN: llc -fcfi -cfi-type=add <%s | FileCheck --check-prefix=ADD %s
+; RUN: llc -fcfi -cfi-type=ror <%s | FileCheck --check-prefix=ROR %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @indirect_fun() unnamed_addr jumptable {
+  ret void
+}
+
+define i32 @m(void ()* %fun) {
+  call void ()* %fun()
+; SUB: subl    
+; SUB: andq    $8
+; SUB-LABEL: leaq    __llvm_jump_instr_table_0_1
+; SUB-LABEL: callq   __llvm_cfi_pointer_warning
+
+; ROR: subq
+; ROR: rolq    $61
+; ROR: testq
+; ROR-LABEL: callq   __llvm_cfi_pointer_warning
+
+; ADD: andq    $8
+; ADD-LABEL: leaq    __llvm_jump_instr_table_0_1
+; ADD: cmpq
+; ADD-LABEL: callq   __llvm_cfi_pointer_warning
+ret i32 0
+}
+
+define void ()* @get_fun() {
+  ret void ()* @indirect_fun
+}
+
+define i32 @main(i32 %argc, i8** %argv) {
+  %f = call void ()* ()* @get_fun()
+  %a = call i32 @m(void ()* %f)
+  ret i32 %a
+}
+; SUB: .text
+; SUB: .align 8
+; SUB-LABEL: .type __llvm_jump_instr_table_0_1,@function
+; SUB-LABEL:__llvm_jump_instr_table_0_1:
+; SUB-LABEL: jmp indirect_fun@PLT

diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
new file mode 100644
index 0000000..3cb8b97
--- /dev/null
+++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll

@@ -0,0 +1,86 @@
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s
+
+declare i32 @bar()
+
+define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
+; CHECK-LABEL: test_intervening_call:
+; CHECK: cmpxchg
+; CHECK: pushfq
+; CHECK: popq [[FLAGS:%.*]]
+
+; CHECK: callq bar
+
+; CHECK: pushq [[FLAGS]]
+; CHECK: popfq
+; CHECK: jne
+  %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
+  %p = extractvalue { i64, i1 } %cx, 1
+  call i32 @bar()
+  br i1 %p, label %t, label %f
+
+t:
+  ret i64 42
+
+f:
+  ret i64 0
+}
+
+; Interested in producing a clobber without any function calls.
+define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) {
+; CHECK-LABEL: test_control_flow:
+
+; CHECK: cmpxchg
+; CHECK-NEXT: jne
+entry:
+  %cmp = icmp sgt i32 %i, %j
+  br i1 %cmp, label %loop_start, label %cond.end
+
+loop_start:
+  br label %while.condthread-pre-split.i
+
+while.condthread-pre-split.i:
+  %.pr.i = load i32* %p, align 4
+  br label %while.cond.i
+
+while.cond.i:
+  %0 = phi i32 [ %.pr.i, %while.condthread-pre-split.i ], [ 0, %while.cond.i ]
+  %tobool.i = icmp eq i32 %0, 0
+  br i1 %tobool.i, label %while.cond.i, label %while.body.i
+
+while.body.i:
+  %.lcssa = phi i32 [ %0, %while.cond.i ]
+  %1 = cmpxchg i32* %p, i32 %.lcssa, i32 %.lcssa seq_cst seq_cst
+  %2 = extractvalue { i32, i1 } %1, 1
+  br i1 %2, label %cond.end.loopexit, label %while.condthread-pre-split.i
+
+cond.end.loopexit:
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ %i, %entry ], [ 0, %cond.end.loopexit ]
+  ret i32 %cond
+}
+
+; This one is an interesting case because CMOV doesn't have a chain
+; operand. Naive attempts to limit cmpxchg EFLAGS use are likely to fail here.
+define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: test_feed_cmov:
+
+; CHECK: cmpxchg
+; CHECK: pushfq
+; CHECK: popq [[FLAGS:%.*]]
+
+; CHECK: callq bar
+
+; CHECK: pushq [[FLAGS]]
+; CHECK: popfq
+
+  %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+  %success = extractvalue { i32, i1 } %res, 1
+
+  %rhs = call i32 @bar()
+
+  %ret = select i1 %success, i32 %new, i32 %rhs
+  ret i32 %ret
+}

diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index 78e1dd2..85bfff2 100644
--- a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll

@@ -1,7 +1,7 @@
 ; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
 ; RUN: opt -S -codegenprepare -addr-sink-using-gep=1 %s -o - | FileCheck -check-prefix=CHECK-GEP %s
 ; This file tests the different cases what are involved when codegen prepare
-; tries to get sign extension out of the way of addressing mode.
+; tries to get sign/zero extension out of the way of addressing mode.
 ; This tests require an actual target as addressing mode decisions depends
 ; on the target.
 
@@ -67,6 +67,43 @@
   ret i8 %res
 }
 
+; Check that we are able to merge a sign extension with a zero extension.
+; CHECK-LABEL: @oneArgPromotionZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionZExt(i8 %arg1, i8* %base) {
+  %zext = zext i8 %arg1 to i32
+  %add = add nsw i32 %zext, 1 
+  %sextadd = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; When promoting a constant zext, the IR builder returns a constant,
+; not an instruction. Make sure this is properly handled. This used
+; to crash.
+; Note: The constant zext is promoted, but does not help matching
+; more things in the addressing mode. Therefore the modification is
+; rolled back.
+; Still, this test case exercises the desired code path.
+; CHECK-LABEL: @oneArgPromotionCstZExt
+; CHECK: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i16 undef to i32
+; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXT]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionCstZExt(i8* %base) {
+  %cst = zext i16 undef to i32
+  %add = add nsw i32 %cst, 1
+  %sextadd = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
 ; Check that we do not promote truncate when we cannot determine the
 ; bits that are dropped.
 ; CHECK-LABEL: @oneArgPromotionBlockTrunc1
@@ -321,3 +358,177 @@
   %final = load i32* %addr
   ret i32 %final
 }
+
+%struct.dns_packet = type { i32, i32, %union.anon }
+%union.anon = type { i32 }
+
+@a = common global i32 0, align 4
+@b = common global i16 0, align 2
+
+; We used to crash on this function because we did not return the right
+; promoted instruction for %conv.i.
+; Make sure we generate the right code now.
+; CHECK-LABEL: @fn3
+; %conv.i is used twice and only one of its uses is being promoted.
+; Use it as the starting point for the matching.
+; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32
+; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64
+; CHECK-NEXT: [[BASE:%[a-zA-Z_0-9-]+]] = ptrtoint %struct.dns_packet* %P to i64
+; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add i64 [[BASE]], [[PROMOTED_CONV]]
+; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = add i64 [[ADD]], 7
+; CHECK-NEXT: [[CAST:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[ADDR]] to i8*
+; CHECK-NEXT: load i8* [[CAST]], align 1
+define signext i16 @fn3(%struct.dns_packet* nocapture readonly %P) {
+entry:
+  %tmp = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 2
+  %data.i.i = bitcast %union.anon* %tmp to [0 x i8]*
+  br label %while.body.i.i
+
+while.body.i.i:                                   ; preds = %while.body.i.i, %entry
+  %src.addr.0.i.i = phi i16 [ 0, %entry ], [ %inc.i.i, %while.body.i.i ]
+  %inc.i.i = add i16 %src.addr.0.i.i, 1
+  %idxprom.i.i = sext i16 %src.addr.0.i.i to i64
+  %arrayidx.i.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i.i
+  %tmp1 = load i8* %arrayidx.i.i, align 1
+  %conv2.i.i = zext i8 %tmp1 to i32
+  %and.i.i = and i32 %conv2.i.i, 15
+  store i32 %and.i.i, i32* @a, align 4
+  %tobool.i.i = icmp eq i32 %and.i.i, 0
+  br i1 %tobool.i.i, label %while.body.i.i, label %fn1.exit.i
+
+fn1.exit.i:                                       ; preds = %while.body.i.i
+  %inc.i.i.lcssa = phi i16 [ %inc.i.i, %while.body.i.i ]
+  %conv.i = zext i16 %inc.i.i.lcssa to i32
+  %sub.i = add nsw i32 %conv.i, -1
+  %idxprom.i = sext i32 %sub.i to i64
+  %arrayidx.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i
+  %tmp2 = load i8* %arrayidx.i, align 1
+  %conv2.i = sext i8 %tmp2 to i16
+  store i16 %conv2.i, i16* @b, align 2
+  %sub4.i = sub nsw i32 0, %conv.i
+  %conv5.i = zext i16 %conv2.i to i32
+  %cmp.i = icmp sgt i32 %conv5.i, %sub4.i
+  br i1 %cmp.i, label %if.then.i, label %fn2.exit
+
+if.then.i:                                        ; preds = %fn1.exit.i
+  %end.i = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 1
+  %tmp3 = load i32* %end.i, align 4
+  %sub7.i = add i32 %tmp3, 65535
+  %conv8.i = trunc i32 %sub7.i to i16
+  br label %fn2.exit
+
+fn2.exit:                                         ; preds = %if.then.i, %fn1.exit.i
+  %retval.0.i = phi i16 [ %conv8.i, %if.then.i ], [ undef, %fn1.exit.i ]
+  ret i16 %retval.0.i
+}
+
+; Check that we do not promote an extension if the non-wrapping flag does not
+; match the kind of the extension.
+; CHECK-LABEL: @noPromotionFlag
+; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = zext i32 [[ADD]] to i64
+; CHECK: inttoptr i64 [[PROMOTED]] to i8*
+; CHECK: ret
+define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) {
+  %add = add nsw i32 %arg1, %arg2 
+  %zextadd = zext i32 %add to i64
+  %base = inttoptr i64 %zextadd to i8*
+  %res = load i8* %base
+  ret i8 %res
+}
+
+; Check that we correctly promote both operands of the promotable add with zext.
+; CHECK-LABEL: @twoArgsPromotionZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg1 to i64
+; CHECK: [[ARG2ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg2 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], [[ARG2ZEXT]]
+; CHECK: inttoptr i64 [[PROMOTED]] to i8*
+; CHECK: ret
+define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) {
+  %add = add nuw i32 %arg1, %arg2 
+  %zextadd = zext i32 %add to i64
+  %base = inttoptr i64 %zextadd to i8*
+  %res = load i8* %base
+  ret i8 %res
+}
+
+; Check that we correctly promote constant arguments.
+; CHECK-LABEL: @oneArgPromotionNegativeCstZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 255
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, i8* %base) {
+  %add = add nuw i8 %arg1, -1 
+  %zextadd = zext i8 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; Check that we are able to merge two zero extensions.
+; CHECK-LABEL: @oneArgPromotionZExtZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionZExtZExt(i8 %arg1, i8* %base) {
+  %zext = zext i8 %arg1 to i32
+  %add = add nuw i32 %zext, 1 
+  %zextadd = zext i32 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; Check that we do not promote truncate when the dropped bits
+; are of a different kind.
+; CHECK-LABEL: @oneArgPromotionBlockTruncZExt
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
+; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1TRUNC]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, i8* %base) {
+  %sextarg1 = sext i1 %arg1 to i32
+  %trunc = trunc i32 %sextarg1 to i8
+  %add = add nuw i8 %trunc, 1 
+  %zextadd = zext i8 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; Check that we are able to promote truncate when we know all the bits
+; that are dropped.
+; CHECK-LABEL: @oneArgPromotionPassTruncZExt
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i1 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, i8* %base) {
+  %sextarg1 = zext i1 %arg1 to i32
+  %trunc = trunc i32 %sextarg1 to i8
+  %add = add nuw i8 %trunc, 1 
+  %zextadd = zext i8 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}
+
+; Check that we do not promote sext with zext.
+; CHECK-LABEL: @oneArgPromotionBlockSExtZExt
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i8
+; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1SEXT]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, i8* %base) {
+  %sextarg1 = sext i1 %arg1 to i8
+  %add = add nuw i8 %sextarg1, 1 
+  %zextadd = zext i8 %add to i64
+  %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd
+  %res = load i8* %arrayidx
+  ret i8 %res
+}

diff --git a/test/CodeGen/X86/coff-comdat.ll b/test/CodeGen/X86/coff-comdat.ll
index bf27b2f..ac4546d 100644
--- a/test/CodeGen/X86/coff-comdat.ll
+++ b/test/CodeGen/X86/coff-comdat.ll

@@ -73,19 +73,19 @@
 ; CHECK: .globl  @v8@0
 ; CHECK: .section        .text,"xr",discard,@f8@0
 ; CHECK: .globl  @f8@0
-; CHECK: .section        .bss,"bw",associative,_f1
+; CHECK: .section        .bss,"wb",associative,_f1
 ; CHECK: .globl  _v1
-; CHECK: .section        .bss,"bw",associative,_f2
+; CHECK: .section        .bss,"wb",associative,_f2
 ; CHECK: .globl  _v2
-; CHECK: .section        .bss,"bw",associative,_f3
+; CHECK: .section        .bss,"wb",associative,_f3
 ; CHECK: .globl  _v3
-; CHECK: .section        .bss,"bw",associative,_f4
+; CHECK: .section        .bss,"wb",associative,_f4
 ; CHECK: .globl  _v4
-; CHECK: .section        .bss,"bw",associative,_f5
+; CHECK: .section        .bss,"wb",associative,_f5
 ; CHECK: .globl  _v5
-; CHECK: .section        .bss,"bw",associative,_f6
+; CHECK: .section        .bss,"wb",associative,_f6
 ; CHECK: .globl  _v6
-; CHECK: .section        .bss,"bw",same_size,_f6
+; CHECK: .section        .bss,"wb",same_size,_f6
 ; CHECK: .globl  _f6
 ; CHECK: .section        .rdata,"rd",largest,_vftable
 ; CHECK: .globl  _vftable

diff --git a/test/CodeGen/X86/coff-comdat2.ll b/test/CodeGen/X86/coff-comdat2.ll
index 6744b5b..58bc04e 100644
--- a/test/CodeGen/X86/coff-comdat2.ll
+++ b/test/CodeGen/X86/coff-comdat2.ll

@@ -6,4 +6,4 @@
 $foo = comdat largest
 @foo = global i32 0
 @bar = global i32 0, comdat $foo
-; CHECK: Associative COMDAT symbol 'foo' is not a key for it's COMDAT.
+; CHECK: Associative COMDAT symbol 'foo' is not a key for its COMDAT.

diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll
new file mode 100644
index 0000000..59a7a19
--- /dev/null
+++ b/test/CodeGen/X86/combine-and.ll

@@ -0,0 +1,164 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s
+;
+; Verify that the DAGCombiner is able to fold a vector AND into a blend
+; if one of the operands to the AND is a vector of all constants, and each
+; constant element is either zero or all-ones.
+
+
+define <4 x i32> @test1(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test1
+; CHECK: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test2(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test2
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test3(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test3
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test4(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 0, i32 0, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test4
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test5(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test5
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test6(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test6
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test7(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test7
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test8(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test8
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test9(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test9
+; CHECK: movq %xmm0, %xmm0
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test10(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test10
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test11(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test11
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test12(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 -1, i32 0>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test12
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test13(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test13
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test14(<4 x i32> %A) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
+  ret <4 x i32> %1
+}
+; CHECK-LABEL: test14
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
+  %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 0>
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test15
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) {
+  %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
+  %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 -1>
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test16
+; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT: retq
+
+
+define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) {
+  %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
+  %2 = and <4 x i32> %B, <i32 -1, i32 0, i32 -1, i32 0>
+  %3 = or <4 x i32> %1, %2
+  ret <4 x i32> %3
+}
+; CHECK-LABEL: test17
+; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; CHECK-NEXT: retq

diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index ff807b9..9539eae 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll

@@ -5,265 +5,293 @@
 ; instruction which performs a blend operation.
 
 define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test1
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test2
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 
 define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test3
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test4
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test5
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test6
-; CHECK-NOT: xorps
-; CHECK: blendps $12
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
   %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
   %or = or <4 x i32> %and1, %and2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test7
-; CHECK-NOT: xorps
-; CHECK: blendps $12
-; CHECK-NEXT: ret
 
 
 define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <2 x i64> %a, <i64 -1, i64 0>
   %and2 = and <2 x i64> %b, <i64 0, i64 -1>
   %or = or <2 x i64> %and1, %and2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test8
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
   %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
   %or = or <4 x i32> %and1, %and2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test9
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 
 define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <2 x i64> %a, <i64 0, i64 -1>
   %and2 = and <2 x i64> %b, <i64 -1, i64 0>
   %or = or <2 x i64> %and1, %and2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test10
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
   %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
   %or = or <4 x i32> %and1, %and2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test11
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; CHECK-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
   %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
   %or = or <4 x i32> %and1, %and2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test12
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NEXT: ret
 
 
 ; Verify that the following test cases are folded into single shuffles.
 
 define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test13
-; CHECK-NOT: xorps
-; CHECK: shufps
-; CHECK-NEXT: ret
 
 
 define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test14
-; CHECK-NOT: pslldq
-; CHECK-NOT: por
-; CHECK: punpcklqdq
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test15
-; CHECK-NOT: xorps
-; CHECK: shufps
-; CHECK-NOT: shufps
-; CHECK-NOT: orps
-; CHECK: ret
 
 
 define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test16
-; CHECK-NOT: pslldq
-; CHECK-NOT: por
-; CHECK: punpcklqdq
-; CHECK: ret
 
 
 ; Verify that the dag-combiner does not fold a OR of two shuffles into a single
 ; shuffle instruction when the shuffle indexes are not compatible.
 
 define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2]
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; CHECK-NEXT:    orps %xmm1, %xmm2
+; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test17
-; CHECK: por
-; CHECK-NEXT: ret
 
 
 define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; CHECK-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test18
-; CHECK: orps
-; CHECK: ret
 
 
 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    xorps %xmm3, %xmm3
+; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,3]
+; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2]
+; CHECK-NEXT:    orps %xmm3, %xmm2
+; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
   %or = or <4 x i32> %shuf1, %shuf2
   ret <4 x i32> %or
 }
-; CHECK-LABEL: test19
-; CHECK: por
-; CHECK-NEXT: ret
 
 
 define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    movq %xmm0, %xmm0
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test20
-; CHECK-NOT: xorps
-; CHECK: orps
-; CHECK-NEXT: movq
-; CHECK-NEXT: ret
 
 
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    movq %xmm0, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    retq
   %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
   %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
   %or = or <2 x i64> %shuf1, %shuf2
   ret <2 x i64> %or
 }
-; CHECK-LABEL: test21
-; CHECK: por
-; CHECK-NEXT: pslldq
-; CHECK-NEXT: ret
 
+; Verify that the DAGCombiner doesn't crash in the attempt to check if a shuffle
+; with illegal type has a legal mask. Method 'isShuffleMaskLegal' only knows how to
+; handle legal vector value types.
+define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
+; CHECK-LABEL: test_crash:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT:    retq
+  %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+  %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+  %or = or <4 x i8> %shuf1, %shuf2
+  ret <4 x i8> %or
+}
 

diff --git a/test/CodeGen/X86/combine-vec-shuffle-2.ll b/test/CodeGen/X86/combine-vec-shuffle-2.ll
deleted file mode 100644
index 7ab7f80..0000000
--- a/test/CodeGen/X86/combine-vec-shuffle-2.ll
+++ /dev/null

@@ -1,164 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
-
-; Check that DAGCombiner correctly folds the following pairs of shuffles
-; using the following rules:
-;  1. shuffle(shuffle(x, y), undef) -> x
-;  2. shuffle(shuffle(x, y), undef) -> y
-;  3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef)
-;  4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y)
-;
-; Rules 3. and 4. are used only if the resulting shuffle mask is legal.
-
-define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test1
-; Mask: [3,0,0,1]
-; CHECK: pshufd $67
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test2
-; Mask: [2,0,0,3]
-; CHECK: pshufd $-62
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test3
-; Mask: [2,0,0,3]
-; CHECK: pshufd $-62
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test4
-; Mask: [0,0,0,1]
-; CHECK: pshufd $64
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test5
-; Mask: [1,1]
-; CHECK: movhlps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test6
-; Mask: [2,0,0,0]
-; CHECK: pshufd $2
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test7
-; Mask: [0,2,0,2]
-; CHECK: pshufd $-120
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test8
-; Mask: [1,0,3,0]
-; CHECK: pshufd $49
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test9
-; Mask: [1,3,0,2]
-; CHECK: pshufd $-115
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test10
-; Mask: [1,0,1,0]
-; CHECK: pshufd $17
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test11
-; Mask: [1,0,2,1]
-; CHECK: pshufd $97
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test12
-; Mask: [0,0,0,0]
-; CHECK: pshufd $0
-; CHECK-NEXT: ret
-
-
-; The following pair of shuffles is folded into vector %A.
-define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test13
-; CHECK-NOT: pshufd
-; CHECK: ret
-
-
-; The following pair of shuffles is folded into vector %B.
-define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) {
-  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
-  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
-  ret <4 x i32> %2
-}
-; CHECK-LABEL: test14
-; CHECK-NOT: pshufd
-; CHECK: ret
-

diff --git a/test/CodeGen/X86/combine-vec-shuffle.ll b/test/CodeGen/X86/combine-vec-shuffle.ll
deleted file mode 100644
index 9e6ab89..0000000
--- a/test/CodeGen/X86/combine-vec-shuffle.ll
+++ /dev/null

@@ -1,253 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
-
-; Verify that the DAGCombiner correctly folds according to the following rules:
-
-; fold (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
-; fold (OR  (shuf (A, C), shuf (B, C)) -> shuf (OR  (A, B), C)
-; fold (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
-
-; fold (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
-; fold (OR  (shuf (C, A), shuf (C, B)) -> shuf (C, OR  (A, B))
-; fold (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
-
-
-
-define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test1
-; CHECK-NOT: pshufd
-; CHECK: pand
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test2
-; CHECK-NOT: pshufd
-; CHECK: por
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test3
-; CHECK-NOT: pshufd
-; CHECK: pxor
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test4
-; CHECK-NOT: pshufd
-; CHECK: pand
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test5
-; CHECK-NOT: pshufd
-; CHECK: por
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test6
-; CHECK-NOT: pshufd
-; CHECK: pxor
-; CHECK-NEXT: pshufd
-; CHECK-NEXT: ret
-
-
-; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
-; are not performing a swizzle operations.
-
-define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test1b
-; CHECK-NOT: blendps
-; CHECK: andps
-; CHECK-NEXT: blendps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test2b
-; CHECK-NOT: blendps
-; CHECK: orps
-; CHECK-NEXT: blendps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test3b
-; CHECK-NOT: blendps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: blendps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test4b
-; CHECK-NOT: blendps
-; CHECK: andps
-; CHECK-NEXT: blendps
-; CHECK: ret
-
-
-define <4 x i32> @test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test5b
-; CHECK-NOT: blendps
-; CHECK: orps
-; CHECK-NEXT: blendps
-; CHECK: ret
-
-
-define <4 x i32> @test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test6b
-; CHECK-NOT: blendps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: blendps
-; CHECK: ret
-
-define <4 x i32> @test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test1c
-; CHECK-NOT: shufps
-; CHECK: andps
-; CHECK-NEXT: shufps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test2c
-; CHECK-NOT: shufps
-; CHECK: orps
-; CHECK-NEXT: shufps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test3c
-; CHECK-NOT: shufps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: shufps
-; CHECK-NEXT: ret
-
-
-define <4 x i32> @test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %and = and <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %and
-}
-; CHECK-LABEL: test4c
-; CHECK-NOT: shufps
-; CHECK: andps
-; CHECK-NEXT: shufps
-; CHECK: ret
-
-
-define <4 x i32> @test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %or = or <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %or
-}
-; CHECK-LABEL: test5c
-; CHECK-NOT: shufps
-; CHECK: orps
-; CHECK-NEXT: shufps
-; CHECK: ret
-
-
-define <4 x i32> @test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
-  %xor = xor <4 x i32> %shuf1, %shuf2
-  ret <4 x i32> %xor
-}
-; CHECK-LABEL: test6c
-; CHECK-NOT: shufps
-; CHECK: xorps
-; CHECK-NEXT: xorps
-; CHECK-NEXT: shufps
-; CHECK: ret
-

diff --git a/test/CodeGen/X86/commute-blend-avx2.ll b/test/CodeGen/X86/commute-blend-avx2.ll
new file mode 100644
index 0000000..d06c6da
--- /dev/null
+++ b/test/CodeGen/X86/commute-blend-avx2.ll

@@ -0,0 +1,89 @@
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s
+
+define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 {
+  %1 = load <8 x i16>* %b
+  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
+  ret <8 x i16> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_vpblendw_128:
+  ;CHECK:       vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+  ;CHECK-NEXT:  retq
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 {
+  %1 = load <16 x i16>* %b
+  %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17)
+  ret <16 x i16> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_vpblendw_256:
+  ;CHECK:       vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
+  ;CHECK-NEXT:  retq
+}
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
+
+define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 {
+  %1 = load <4 x i32>* %b
+  %2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1)
+  ret <4 x i32> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_vpblendd_128:
+  ;CHECK:       vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+  ;CHECK-NEXT:  retq
+}
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 {
+  %1 = load <8 x i32>* %b
+  %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129)
+  ret <8 x i32> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_vpblendd_256:
+  ;CHECK:       vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
+  ;CHECK-NEXT:  retq
+}
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
+
+define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 {
+  %1 = load <4 x float>* %b
+  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
+  ret <4 x float> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_vblendps_128:
+  ;CHECK:       vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+  ;CHECK-NEXT:  retq
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 {
+  %1 = load <8 x float>* %b
+  %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7)
+  ret <8 x float> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_vblendps_256:
+  ;CHECK:       vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
+  ;CHECK-NEXT:  retq
+}
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
+  %1 = load <2 x double>* %b
+  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
+  ret <2 x double> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_vblendpd_128:
+  ;CHECK:       vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+  ;CHECK-NEXT:  retq
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
+  %1 = load <4 x double>* %b
+  %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
+  ret <4 x double> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_vblendpd_256:
+  ;CHECK:       vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
+  ;CHECK-NEXT:  retq
+}
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

diff --git a/test/CodeGen/X86/commute-blend-sse41.ll b/test/CodeGen/X86/commute-blend-sse41.ll
new file mode 100644
index 0000000..59fef8c
--- /dev/null
+++ b/test/CodeGen/X86/commute-blend-sse41.ll

@@ -0,0 +1,34 @@
+; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s
+
+define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 {
+  %1 = load <8 x i16>* %b
+  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
+  ret <8 x i16> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_pblendw:
+  ;CHECK:       pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+  ;CHECK-NEXT:  retq
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 {
+  %1 = load <4 x float>* %b
+  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
+  ret <4 x float> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_blendps:
+  ;CHECK:       blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+  ;CHECK-NEXT:  retq
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
+  %1 = load <2 x double>* %b
+  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
+  ret <2 x double> %2
+  ; The loaded vector is expected to be folded into the blend as its 'mem' operand.
+  ;CHECK-LABEL: commute_fold_blendpd:
+  ;CHECK:       blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+  ;CHECK-NEXT:  retq
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone

diff --git a/test/CodeGen/X86/commuted-blend-mask.ll b/test/CodeGen/X86/commuted-blend-mask.ll
new file mode 100644
index 0000000..e6322cb
--- /dev/null
+++ b/test/CodeGen/X86/commuted-blend-mask.ll

@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s
+
+; When commuting the operands of an SSE blend, make sure that the resulting blend
+; mask can be encoded as an imm8.
+; Before, when commuting the operands to the shuffle in function @test, the backend
+; produced the following assembly:
+;   pblendw $4294967103, %xmm1, %xmm0
+
+define <4 x i32> @test(<4 x i32> %a, <4 x i32> %b) {
+  ;CHECK: pblendw $63, %xmm1, %xmm0
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+  ret <4 x i32> %shuffle
+}

diff --git a/test/CodeGen/X86/constant-pool-remat-0.ll b/test/CodeGen/X86/constant-pool-remat-0.ll
index 4a01108..e42a87c 100644
--- a/test/CodeGen/X86/constant-pool-remat-0.ll
+++ b/test/CodeGen/X86/constant-pool-remat-0.ll

@@ -1,7 +1,7 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -mtriple=x86_64-linux   | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-linux -regalloc=greedy | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-linux -mattr=+sse2 | FileCheck %s
 ; CHECK:     LCPI
 ; CHECK:     LCPI
 ; CHECK:     LCPI

diff --git a/test/CodeGen/X86/constant-pool-sharing.ll b/test/CodeGen/X86/constant-pool-sharing.ll
index 26318dd..3682165 100644
--- a/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/test/CodeGen/X86/constant-pool-sharing.ll

@@ -1,12 +1,13 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=LINUX
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s --check-prefix=COMMON --check-prefix=MSVC
 
 ; llc should share constant pool entries between this integer vector
 ; and this floating-point vector since they have the same encoding.
 
-; CHECK:  LCPI0_0(%rip), %xmm0
-; CHECK:  movaps        %xmm0, ({{%rdi|%rcx}})
-; CHECK:  movaps        %xmm0, ({{%rsi|%rdx}})
+; LINUX:   LCPI0_0(%rip), %xmm0
+; MSVC:    __xmm@40000000400000004000000040000000(%rip), %xmm0
+; COMMON:  movaps        %xmm0, ({{%rdi|%rcx}})
+; COMMON:  movaps        %xmm0, ({{%rsi|%rdx}})
 
 define void @foo(<4 x i32>* %p, <4 x float>* %q, i1 %t) nounwind {
 entry:

diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
index b578896..7160dcc 100644
--- a/test/CodeGen/X86/constructor.ll
+++ b/test/CodeGen/X86/constructor.ll

@@ -1,6 +1,8 @@
-; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=CTOR %s
-; RUN: llc -mtriple x86_64-pc-linux -use-init-array < %s | FileCheck --check-prefix=INIT-ARRAY %s
-@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @f }, { i32, void ()* } { i32 15, void ()* @g }]
+; RUN: llc -mtriple x86_64-pc-linux -use-ctors < %s | FileCheck --check-prefix=CTOR %s
+; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s
+@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }]
+
+@v = weak_odr global i8 0
 
 define void @f() {
 entry:
@@ -12,14 +14,14 @@
   ret void
 }
 
-; CTOR:		.section	.ctors.65520,"aw",@progbits
+; CTOR:		.section	.ctors.65520,"aGw",@progbits,v,comdat
 ; CTOR-NEXT:	.align	8
 ; CTOR-NEXT:	.quad	g
 ; CTOR-NEXT:	.section	.ctors,"aw",@progbits
 ; CTOR-NEXT:	.align	8
 ; CTOR-NEXT:	.quad	f
 
-; INIT-ARRAY:		.section	.init_array.15,"aw",@init_array
+; INIT-ARRAY:		.section	.init_array.15,"aGw",@init_array,v,comdat
 ; INIT-ARRAY-NEXT:	.align	8
 ; INIT-ARRAY-NEXT:	.quad	g
 ; INIT-ARRAY-NEXT:	.section	.init_array,"aw",@init_array

diff --git a/test/CodeGen/X86/cvt16.ll b/test/CodeGen/X86/cvt16.ll
index 951b5c3..4d920e2 100644
--- a/test/CodeGen/X86/cvt16.ll
+++ b/test/CodeGen/X86/cvt16.ll

@@ -21,7 +21,7 @@
 
 
 define void @test1(float %src, i16* %dest) {
-  %1 = tail call i16 @llvm.convert.to.fp16(float %src)
+  %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src)
   store i16 %1, i16* %dest, align 2
   ret void
 }
@@ -34,7 +34,7 @@
 
 define float @test2(i16* nocapture %src) {
   %1 = load i16* %src, align 2
-  %2 = tail call float @llvm.convert.from.fp16(i16 %1)
+  %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1)
   ret float %2
 }
 ; CHECK-LABEL: test2:
@@ -45,8 +45,8 @@
 
 
 define float @test3(float %src) nounwind uwtable readnone {
-  %1 = tail call i16 @llvm.convert.to.fp16(float %src)
-  %2 = tail call float @llvm.convert.from.fp16(i16 %1)
+  %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src)
+  %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1)
   ret float %2
 }
 
@@ -59,6 +59,31 @@
 ; F16C-NEXT: vcvtph2ps
 ; F16C: ret
 
-declare float @llvm.convert.from.fp16(i16) nounwind readnone
-declare i16 @llvm.convert.to.fp16(float) nounwind readnone
+define double @test4(i16* nocapture %src) {
+  %1 = load i16* %src, align 2
+  %2 = tail call double @llvm.convert.from.fp16.f64(i16 %1)
+  ret double %2
+}
+; CHECK-LABEL: test4:
+; LIBCALL: callq  __gnu_h2f_ieee
+; LIBCALL: cvtss2sd
+; SOFTFLOAT: callq  __gnu_h2f_ieee
+; SOFTFLOAT: callq __extendsfdf2
+; F16C: vcvtph2ps
+; F16C: vcvtss2sd
+; F16C: ret
 
+
+define i16 @test5(double %src) {
+  %val = tail call i16 @llvm.convert.to.fp16.f64(double %src)
+  ret i16 %val
+}
+; CHECK-LABEL: test5:
+; LIBCALL: jmp  __truncdfhf2
+; SOFTFLOAT: callq  __truncdfhf2
+; F16C: jmp __truncdfhf2
+
+declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
+declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
+declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone

diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index 4912213..d0791dc 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll

@@ -52,48 +52,48 @@
 entry:
   %var1 = alloca %struct.AAA3, align 1
   %var2 = alloca %struct.AAA3, align 1
-  tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30), !dbg !47
-  tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31), !dbg !47
-  tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32), !dbg !49
+  tail call void @llvm.dbg.value(metadata !{i32 %param1}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !47
+  tail call void @llvm.dbg.value(metadata !{i32 %param2}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !47
+  tail call void @llvm.dbg.value(metadata !48, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49
   %tobool = icmp eq i32 %param2, 0, !dbg !50
   br i1 %tobool, label %if.end, label %if.then, !dbg !50
 
 if.then:                                          ; preds = %entry
   %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !52
-  tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32), !dbg !49
+  tail call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !49
   br label %if.end, !dbg !54
 
 if.end:                                           ; preds = %entry, %if.then
-  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55
-  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56), !dbg !57
-  tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59), !dbg !60
+  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55
+  tail call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !56, metadata !{metadata !"0x102"}), !dbg !57
+  tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59, metadata !{metadata !"0x102"}), !dbg !60
   %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !61
   call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !61
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64), !dbg !65
-  call void @llvm.dbg.value(metadata !58, i64 0, metadata !66), !dbg !67
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !65
+  call void @llvm.dbg.value(metadata !58, i64 0, metadata !66, metadata !{metadata !"0x102"}), !dbg !67
   %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !68
   call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !68
   %tobool1 = icmp eq i32 %param1, 0, !dbg !69
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34), !dbg !63
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !63
   br i1 %tobool1, label %if.else, label %if.then2, !dbg !69
 
 if.then2:                                         ; preds = %if.end
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71), !dbg !73
-  call void @llvm.dbg.value(metadata !74, i64 0, metadata !75), !dbg !76
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !71, metadata !{metadata !"0x102"}), !dbg !73
+  call void @llvm.dbg.value(metadata !74, i64 0, metadata !75, metadata !{metadata !"0x102"}), !dbg !76
   call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)), !dbg !76
   br label %if.end3, !dbg !72
 
 if.else:                                          ; preds = %if.end
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77), !dbg !79
-  call void @llvm.dbg.value(metadata !80, i64 0, metadata !81), !dbg !82
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var2}, i64 0, metadata !77, metadata !{metadata !"0x102"}), !dbg !79
+  call void @llvm.dbg.value(metadata !80, i64 0, metadata !81, metadata !{metadata !"0x102"}), !dbg !82
   call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)), !dbg !82
   br label %if.end3
 
 if.end3:                                          ; preds = %if.else, %if.then2
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33), !dbg !55
-  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83), !dbg !85
-  call void @llvm.dbg.value(metadata !58, i64 0, metadata !86), !dbg !87
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !33, metadata !{metadata !"0x102"}), !dbg !55
+  call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !83, metadata !{metadata !"0x102"}), !dbg !85
+  call void @llvm.dbg.value(metadata !58, i64 0, metadata !86, metadata !{metadata !"0x102"}), !dbg !87
   call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !87
   ret void, !dbg !88
 }
@@ -103,7 +103,7 @@
 declare void @_Z3fooPcjPKc(i8*, i32, i8*) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -113,92 +113,92 @@
 !llvm.module.flags = !{!44, !45}
 !llvm.ident = !{!46}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !23, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"dbg-changes-codegen-branch-folding.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"AAA3", i32 4, i64 32, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00AAA3\004\0032\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6, metadata !11, metadata !17, metadata !18}
-!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS4AAA3", metadata !"text", i32 8, i64 32, i64 8, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ]
-!7 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 8, i32 0, i32 0, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
-!8 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!6 = metadata !{metadata !"0xd\00text\008\0032\008\000\000", metadata !1, metadata !"_ZTS4AAA3", metadata !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ]
+!7 = metadata !{metadata !"0x1\00\000\0032\008\000\000", null, null, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
+!8 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
-!11 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"", i32 5, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 5} ; [ DW_TAG_subprogram ] [line 5] [AAA3]
-!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
+!11 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00\005\000\000\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [AAA3]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{null, metadata !14, metadata !15}
-!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3]
-!15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!16 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
-!17 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 6} ; [ DW_TAG_subprogram ] [line 6] [operator=]
-!18 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator const char *", metadata !"operator const char *", metadata !"_ZNK4AAA3cvPKcEv", i32 7, metadata !19, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 7} ; [ DW_TAG_subprogram ] [line 7] [operator const char *]
-!19 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!16 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
+!17 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\000\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [operator=]
+!18 = metadata !{metadata !"0x2e\00operator const char *\00operator const char *\00_ZNK4AAA3cvPKcEv\007\000\000\000\006\00256\001\007", metadata !1, metadata !"_ZTS4AAA3", metadata !19, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator const char *]
+!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !20 = metadata !{metadata !15, metadata !21}
-!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
-!22 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3]
+!21 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
+!22 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3]
 !23 = metadata !{metadata !24, metadata !35, metadata !40}
-!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"bar", metadata !"bar", metadata !"_Z3barii", i32 11, metadata !26, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32, i32)* @_Z3barii, null, null, metadata !29, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
-!25 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!26 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barii\0011\000\001\000\006\00256\001\0011", metadata !1, metadata !25, metadata !26, null, void (i32, i32)* @_Z3barii, null, null, metadata !29} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
+!25 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !27 = metadata !{null, metadata !28, metadata !28}
-!28 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!28 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !29 = metadata !{metadata !30, metadata !31, metadata !32, metadata !33, metadata !34}
-!30 = metadata !{i32 786689, metadata !24, metadata !"param1", metadata !25, i32 16777227, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param1] [line 11]
-!31 = metadata !{i32 786689, metadata !24, metadata !"param2", metadata !25, i32 33554443, metadata !28, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [param2] [line 11]
-!32 = metadata !{i32 786688, metadata !24, metadata !"temp", metadata !25, i32 12, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [temp] [line 12]
-!33 = metadata !{i32 786688, metadata !24, metadata !"var1", metadata !25, i32 17, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var1] [line 17]
-!34 = metadata !{i32 786688, metadata !24, metadata !"var2", metadata !25, i32 18, metadata !"_ZTS4AAA3", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var2] [line 18]
-!35 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"operator=", metadata !"operator=", metadata !"_ZN4AAA3aSEPKc", i32 6, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !17, metadata !36, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=]
+!30 = metadata !{metadata !"0x101\00param1\0016777227\000", metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param1] [line 11]
+!31 = metadata !{metadata !"0x101\00param2\0033554443\000", metadata !24, metadata !25, metadata !28} ; [ DW_TAG_arg_variable ] [param2] [line 11]
+!32 = metadata !{metadata !"0x100\00temp\0012\000", metadata !24, metadata !25, metadata !15} ; [ DW_TAG_auto_variable ] [temp] [line 12]
+!33 = metadata !{metadata !"0x100\00var1\0017\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var1] [line 17]
+!34 = metadata !{metadata !"0x100\00var2\0018\000", metadata !24, metadata !25, metadata !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var2] [line 18]
+!35 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\001\000\006\00256\001\006", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !17, metadata !36} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=]
 !36 = metadata !{metadata !37, metadata !39}
-!37 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!38 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3]
-!39 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 6]
-!40 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4AAA3", metadata !"AAA3", metadata !"AAA3", metadata !"_ZN4AAA3C2EPKc", i32 5, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !11, metadata !41, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3]
+!37 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3]
+!39 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!40 = metadata !{metadata !"0x2e\00AAA3\00AAA3\00_ZN4AAA3C2EPKc\005\000\001\000\006\00256\001\005", metadata !1, metadata !"_ZTS4AAA3", metadata !12, null, null, null, metadata !11, metadata !41} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3]
 !41 = metadata !{metadata !42, metadata !43}
-!42 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!43 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!42 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!43 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15} ; [ DW_TAG_arg_variable ] [value] [line 5]
 !44 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!45 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !46 = metadata !{metadata !"clang version 3.5.0 "}
 !47 = metadata !{i32 11, i32 0, metadata !24, null}
 !48 = metadata !{i8* null}
 !49 = metadata !{i32 12, i32 0, metadata !24, null}
 !50 = metadata !{i32 14, i32 0, metadata !51, null}
-!51 = metadata !{i32 786443, metadata !1, metadata !24, i32 14, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!51 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
 !52 = metadata !{i32 15, i32 0, metadata !53, null}
-!53 = metadata !{i32 786443, metadata !1, metadata !51, i32 14, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!53 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !51} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
 !54 = metadata !{i32 16, i32 0, metadata !53, null}
 !55 = metadata !{i32 17, i32 0, metadata !24, null}
-!56 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!56 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !55} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !57 = metadata !{i32 0, i32 0, metadata !40, metadata !55}
 !58 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)}
-!59 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!59 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !55} ; [ DW_TAG_arg_variable ] [value] [line 5]
 !60 = metadata !{i32 5, i32 0, metadata !40, metadata !55}
 !61 = metadata !{i32 5, i32 0, metadata !62, metadata !55}
-!62 = metadata !{i32 786443, metadata !1, metadata !40, i32 5, i32 0, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!62 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !40} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
 !63 = metadata !{i32 18, i32 0, metadata !24, null}
-!64 = metadata !{i32 786689, metadata !40, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!64 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !40, null, metadata !38, metadata !63} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !65 = metadata !{i32 0, i32 0, metadata !40, metadata !63}
-!66 = metadata !{i32 786689, metadata !40, metadata !"value", metadata !25, i32 33554437, metadata !15, i32 0, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5]
+!66 = metadata !{metadata !"0x101\00value\0033554437\000", metadata !40, metadata !25, metadata !15, metadata !63} ; [ DW_TAG_arg_variable ] [value] [line 5]
 !67 = metadata !{i32 5, i32 0, metadata !40, metadata !63}
 !68 = metadata !{i32 5, i32 0, metadata !62, metadata !63}
 !69 = metadata !{i32 20, i32 0, metadata !70, null}
-!70 = metadata !{i32 786443, metadata !1, metadata !24, i32 20, i32 0, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
-!71 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!70 = metadata !{metadata !"0xb\0020\000\000", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp]
+!71 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !72} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !72 = metadata !{i32 21, i32 0, metadata !70, null}
 !73 = metadata !{i32 0, i32 0, metadata !35, metadata !72}
 !74 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)}
-!75 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!75 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !72} ; [ DW_TAG_arg_variable ] [value] [line 6]
 !76 = metadata !{i32 6, i32 0, metadata !35, metadata !72}
-!77 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !78} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!77 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !78} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !78 = metadata !{i32 23, i32 0, metadata !70, null}
 !79 = metadata !{i32 0, i32 0, metadata !35, metadata !78}
 !80 = metadata !{i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)}
-!81 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!81 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !78} ; [ DW_TAG_arg_variable ] [value] [line 6]
 !82 = metadata !{i32 6, i32 0, metadata !35, metadata !78}
-!83 = metadata !{i32 786689, metadata !35, metadata !"this", null, i32 16777216, metadata !38, i32 1088, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!83 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !35, null, metadata !38, metadata !84} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !84 = metadata !{i32 24, i32 0, metadata !24, null}
 !85 = metadata !{i32 0, i32 0, metadata !35, metadata !84}
-!86 = metadata !{i32 786689, metadata !35, metadata !"value", metadata !25, i32 33554438, metadata !15, i32 0, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6]
+!86 = metadata !{metadata !"0x101\00value\0033554438\000", metadata !35, metadata !25, metadata !15, metadata !84} ; [ DW_TAG_arg_variable ] [value] [line 6]
 !87 = metadata !{i32 6, i32 0, metadata !35, metadata !84}
 !88 = metadata !{i32 25, i32 0, metadata !24, null}

diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll
index 0b17c45..aae95e8 100644
--- a/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen.ll

@@ -44,7 +44,7 @@
 define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 {
 entry:
   %0 = load %struct.Foo** @pfoo, align 8
-  tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62)
+  tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62, metadata !{metadata !"0x102"})
   %cmp.i = icmp eq %struct.Foo* %0, %this
   ret i1 %cmp.i
 }
@@ -53,7 +53,7 @@
 define void @_Z3bazv() #1 {
 entry:
   %0 = load %struct.Wibble** @wibble1, align 8
-  tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65)
+  tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65, metadata !{metadata !"0x102"})
   %1 = load %struct.Wibble** @wibble2, align 8
   %cmp.i = icmp ugt %struct.Wibble* %1, %0
   br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit
@@ -69,15 +69,15 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 
 
-!17 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
-!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
-!62 = metadata !{i32 786689, null, metadata !"arg", null, i32 33554436, metadata !17, i32 0, null} ; [ DW_TAG_arg_variable ] [arg] [line 4]
+!17 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
+!45 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
+!62 = metadata !{metadata !"0x101\00arg\0033554436\000", null, null, metadata !17} ; [ DW_TAG_arg_variable ] [arg] [line 4]
 !64 = metadata !{%struct.Flibble* undef}
-!65 = metadata !{i32 786689, null, metadata !"this", null, i32 16777229, metadata !45, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 13]
+!65 = metadata !{metadata !"0x101\00this\0016777229\001088", null, null, metadata !45} ; [ DW_TAG_arg_variable ] [this] [line 13]

diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll
index 21225e3..fd07a3f 100644
--- a/test/CodeGen/X86/divide-by-constant.ll
+++ b/test/CodeGen/X86/divide-by-constant.ll

@@ -31,6 +31,7 @@
 ; CHECK-LABEL: test3:
 ; CHECK: movzbl  8(%esp), %eax
 ; CHECK-NEXT: imull	$171, %eax
+; CHECK-NEXT: andl $65024, %eax
 ; CHECK-NEXT: shrl	$9, %eax
 ; CHECK-NEXT: ret
 }
@@ -56,9 +57,10 @@
   %div = sdiv i16 %x, 10
   ret i16 %div
 ; CHECK-LABEL: test6:
-; CHECK: imull $26215, %eax, %ecx
-; CHECK: sarl $18, %ecx
-; CHECK: shrl $15, %eax
+; CHECK: imull $26215, %eax
+; CHECK: movl %eax, %ecx
+; CHECK: shrl $31, %ecx
+; CHECK: sarl $18, %eax
 }
 
 define i32 @test7(i32 %x) nounwind {

diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll
new file mode 100644
index 0000000..ec367c8
--- /dev/null
+++ b/test/CodeGen/X86/divrem8_ext.ll

@@ -0,0 +1,100 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-64
+; RUN: llc -march=x86    < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-32
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_udivrem_zext_ah
+; CHECK:   divb
+; CHECK:   movzbl %ah, [[REG_REM:%[a-z0-9]+]]
+; CHECK:   movb   %al, ([[REG_ZPTR:%[a-z0-9]+]])
+; CHECK:   movl   [[REG_REM]], %eax
+; CHECK:   ret
+  %div = udiv i8 %x, %y
+  store i8 %div, i8* @z
+  %1 = urem i8 %x, %y
+  ret i8 %1
+}
+
+define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_zext_ah
+; CHECK:   divb
+; CHECK:   movzbl %ah, %eax
+; CHECK:   ret
+  %1 = urem i8 %x, %y
+  ret i8 %1
+}
+
+define i8 @test_urem_noext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_noext_ah
+; CHECK:   divb   [[REG_X:%[a-z0-9]+]]
+; CHECK:   movzbl %ah, %eax
+; CHECK:   addb   [[REG_X]], %al
+; CHECK:   ret
+  %1 = urem i8 %x, %y
+  %2 = add i8 %1, %y
+  ret i8 %2
+}
+
+define i64 @test_urem_zext64_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_urem_zext64_ah
+; CHECK:    divb
+; CHECK:    movzbl %ah, %eax
+; CHECK-32: xorl %edx, %edx
+; CHECK:    ret
+  %1 = urem i8 %x, %y
+  %2 = zext i8 %1 to i64
+  ret i64 %2
+}
+
+define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_sdivrem_sext_ah
+; CHECK:   cbtw
+; CHECK:   idivb
+; CHECK:   movsbl %ah, [[REG_REM:%[a-z0-9]+]]
+; CHECK:   movb   %al, ([[REG_ZPTR:%[a-z0-9]+]])
+; CHECK:   movl   [[REG_REM]], %eax
+; CHECK:   ret
+  %div = sdiv i8 %x, %y
+  store i8 %div, i8* @z  ; the quotient is stored to the external global @z
+  %1 = srem i8 %x, %y    ; the remainder of the same division is returned
+  ret i8 %1
+}
+
+define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_sext_ah
+; CHECK:   cbtw
+; CHECK:   idivb
+; CHECK:   movsbl %ah, %eax
+; CHECK:   ret
+  %1 = srem i8 %x, %y
+  ret i8 %1
+}
+
+define i8 @test_srem_noext_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_noext_ah
+; CHECK:   cbtw
+; CHECK:   idivb [[REG_X:%[a-z0-9]+]]
+; CHECK:   movsbl %ah, %eax
+; CHECK:   addb   [[REG_X]], %al
+; CHECK:   ret
+  %1 = srem i8 %x, %y
+  %2 = add i8 %1, %y
+  ret i8 %2
+}
+
+define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
+; CHECK-LABEL: test_srem_sext64_ah
+; CHECK:    cbtw
+; CHECK:    idivb
+; CHECK:    movsbl %ah, %eax
+; CHECK-32: movl %eax, %edx
+; CHECK-32: sarl $31, %edx
+; CHECK-64: movsbq %al, %rax
+; CHECK:    ret
+  %1 = srem i8 %x, %y
+  %2 = sext i8 %1 to i64
+  ret i64 %2
+}
+
+@z = external global i8

diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index 0d5afa1..c673f5d 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll

@@ -70,7 +70,7 @@
 
 ; CHECK: .weak weak_alias
 ; CHECK: weak_alias = f1
-@weak_alias = dllexport alias weak_odr void()* @f1
+@weak_alias = weak_odr dllexport alias void()* @f1
 
 @blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
 @blob_alias = dllexport alias bitcast ([6 x i8]* @blob to i32 ()*)

diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index e2c3f13..5035aa1 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll

@@ -89,7 +89,7 @@
 
 ; CHECK: .weak _weak_alias
 ; CHECK: _weak_alias = _f1
-@weak_alias = dllexport alias weak_odr void()* @f1
+@weak_alias = weak_odr dllexport alias void()* @f1
 
 
 ; CHECK: .section .drectve

diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll
index 666409f..839bca4 100644
--- a/test/CodeGen/X86/dllimport-x86_64.ll
+++ b/test/CodeGen/X86/dllimport-x86_64.ll

@@ -4,7 +4,7 @@
 ; RUN: llc -mtriple x86_64-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
 ; PR6275
 ;
-; RUN: opt -mtriple x86_64-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+; RUN: opt -mtriple x86_64-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT
 
 @Var1 = external dllimport global i32
 @Var2 = available_externally dllimport unnamed_addr constant i32 1

diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll
index 695bfce..231ad65 100644
--- a/test/CodeGen/X86/dllimport.ll
+++ b/test/CodeGen/X86/dllimport.ll

@@ -4,7 +4,7 @@
 ; RUN: llc -mtriple i386-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
 ; PR6275
 ;
-; RUN: opt -mtriple i386-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+; RUN: opt -mtriple i386-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT
 
 @Var1 = external dllimport global i32
 @Var2 = available_externally dllimport unnamed_addr constant i32 1

diff --git a/test/CodeGen/X86/dont-trunc-store-double-to-float.ll b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
new file mode 100644
index 0000000..24d9533
--- /dev/null
+++ b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll

@@ -0,0 +1,20 @@
+; RUN: llc -march=x86 < %s | FileCheck %s
+
+; CHECK-LABEL: @bar
+; CHECK: movl $1074339512,
+; CHECK: movl $1374389535,
+; CHECK: movl $1078523331,
+define void @bar() unnamed_addr {
+entry-block:
+  %a = alloca double
+  %b = alloca float
+
+  store double 3.140000e+00, double* %a
+  %0 = load double* %a
+
+  %1 = fptrunc double %0 to float
+
+  store float %1, float* %b
+
+  ret void
+}

diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index c8d7527..872f7fa 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll

@@ -7,15 +7,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!5}
 
-!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", metadata !4, metadata !2, metadata !7, metadata !2, metadata !2, null} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{}
-!3 = metadata !{i32 786473, metadata !4} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x29", metadata !4} ; [ DW_TAG_file_type ]
 !4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"}
-!6 = metadata !{i32 786451, metadata !4, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !4, null, null, metadata !2, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
 !7 = metadata !{metadata !6}
 
 ; The important part of the following check is that dir = #0.
 ;                        Dir  Mod Time   File Len   File Name
 ;                        ---- ---------- ---------- ---------------------------
 ; CHECK: file_names[  1]    0 0x00000000 0x00000000 empty.c
-!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/dynamic-alloca-lifetime.ll b/test/CodeGen/X86/dynamic-alloca-lifetime.ll
new file mode 100644
index 0000000..f019bed
--- /dev/null
+++ b/test/CodeGen/X86/dynamic-alloca-lifetime.ll

@@ -0,0 +1,44 @@
+; RUN: llc -no-stack-coloring=false < %s | FileCheck %s
+
+; This test crashed in PEI because the stack protector was dead.
+; This was due to it being colored, which was in turn due to incorrect
+; lifetimes being applied to the stack protector frame index.
+
+; CHECK: stack_chk_guard
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.10.0"
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: ssp
+define void @foo(i1 %cond1, i1 %cond2) #1 {
+entry:
+  %bitmapBuffer = alloca [8192 x i8], align 1
+  br i1 %cond1, label %end1, label %bb1
+
+bb1:
+  %bitmapBuffer229 = alloca [8192 x i8], align 1
+  br i1 %cond2, label %end1, label %if.else130
+
+end1:
+  ret void
+
+if.else130:                                       ; preds = %bb1
+  %tmp = getelementptr inbounds [8192 x i8]* %bitmapBuffer, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 8192, i8* %tmp) #0
+  call void @llvm.lifetime.end(i64 8192, i8* %tmp) #0
+  %tmp25 = getelementptr inbounds [8192 x i8]* %bitmapBuffer229, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 8192, i8* %tmp25) #0
+  call void @llvm.lifetime.end(i64 8192, i8* %tmp25) #0
+  br label %end1
+}
+
+declare void @bar()
+
+attributes #0 = { nounwind }
+attributes #1 = { ssp }
\ No newline at end of file

diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll
index ac5174d..4234968 100644
--- a/test/CodeGen/X86/empty-functions.ll
+++ b/test/CodeGen/X86/empty-functions.ll

@@ -1,10 +1,14 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=CHECK-NO-FP %s
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=LINUX-NO-FP %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -disable-fp-elim | FileCheck -check-prefix=LINUX-FP %s
 
 define void @func() {
 entry:
   unreachable
 }
+
+; MachO cannot handle an empty function.
 ; CHECK-NO-FP:     _func:
 ; CHECK-NO-FP-NEXT: .cfi_startproc
 ; CHECK-NO-FP:     nop
@@ -21,5 +25,30 @@
 ; CHECK-FP-NEXT: movq %rsp, %rbp
 ; CHECK-FP-NEXT: :
 ; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-FP-NEXT: nop
 ; CHECK-FP-NEXT: .cfi_endproc
+
+; An empty function is perfectly fine on ELF.
+; LINUX-NO-FP: func:
+; LINUX-NO-FP-NEXT: .cfi_startproc
+; LINUX-NO-FP-NEXT: {{^}}#
+; LINUX-NO-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-NO-FP-NEXT: .size   func, .L{{.*}}-func
+; LINUX-NO-FP-NEXT: .cfi_endproc
+
+; A cfi directive can point to the end of a function. It (and in fact the
+; entire body) could be optimized out because of the unreachable, but we
+; don't do it right now.
+; LINUX-FP: func:
+; LINUX-FP-NEXT: .cfi_startproc
+; LINUX-FP-NEXT: {{^}}#
+; LINUX-FP-NEXT: pushq %rbp
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT:  .cfi_def_cfa_offset 16
+; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_offset %rbp, -16
+; LINUX-FP-NEXT: movq        %rsp, %rbp
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .cfi_def_cfa_register %rbp
+; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
+; LINUX-FP-NEXT: .size   func, .L{{.*}}-func
+; LINUX-FP-NEXT: .cfi_endproc

diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
index a18f751..ab92fe0 100644
--- a/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll

@@ -93,10 +93,11 @@
 
 
 ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
-; ExeDepsFix works top down, thus it coalesces vmovlhps domain with
-; vandps and there is nothing more you can do to match vmaxpd.
-; CHECK: vmovlhps
-; CHECK: vandps
+; ExeDepsFix works top down, thus it coalesces vpbroadcastq domain with
+; vpand and there is nothing more you can do to match vmaxpd.
+; CHECK: vmovq
+; CHECK: vpbroadcastq
+; CHECK: vpand
 ; CHECK: vmaxpd
 ; CHECK: ret
 define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {

diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index cadc0fb..8647599 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll

@@ -1,6 +1,8 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s
 ; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s
 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
 define i32 @t(<2 x i64>* %val) nounwind  {
 ; CHECK-LABEL: t:
 ; CHECK-NOT: movd
@@ -23,3 +25,40 @@
   %y = extractelement <8 x i32> %Shuff68, i32 0
   ret i32 %y
 }
+
+; This case could easily end up inf-looping in the DAG combiner due to a
+; low alignment load of the vector which prevents us from reliably forming a
+; narrow load.
+; FIXME: It would be nice to detect whether the target has fast and legal
+; unaligned loads and use them here.
+define void @t3() {
+; CHECK-LABEL: t3:
+;
+; This movs the entire vector, shuffling the high double down. If we fixed the
+; FIXME above it would just move the high double directly.
+; CHECK: movupd
+; CHECK: shufpd
+; CHECK: movlpd
+
+bb:
+  %tmp13 = load <2 x double>* undef, align 1
+  %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
+  store double %.sroa.3.24.vec.extract, double* undef, align 8
+  unreachable
+}
+
+; Case where a load is unary shuffled, then bitcast (to a type with the same
+; number of elements) before extractelement.
+; This is testing for an assertion - the extraction was assuming that the undef
+; second shuffle operand was a post-bitcast type instead of a pre-bitcast type.
+define i64 @t4(<2 x double>* %a) {
+; CHECK-LABEL: t4:
+; CHECK: mov
+; CHECK: ret
+  %b = load <2 x double>* %a, align 16
+  %c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>
+  %d = bitcast <2 x double> %c to <2 x i64>
+  %e = extractelement <2 x i64> %d, i32 1
+  ret i64 %e
+}
+

diff --git a/test/CodeGen/X86/fast-isel-args-fail.ll b/test/CodeGen/X86/fast-isel-args-fail.ll
index 7467edd..7e783d2 100644
--- a/test/CodeGen/X86/fast-isel-args-fail.ll
+++ b/test/CodeGen/X86/fast-isel-args-fail.ll

@@ -1,7 +1,6 @@
 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-apple-darwin10
 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN32
 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win64 | FileCheck %s -check-prefix=WIN64
-; REQUIRES: asserts
 
 ; Previously, this would cause an assert.
 define i31 @t1(i31 %a, i31 %b, i31 %c) {

diff --git a/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/test/CodeGen/X86/fast-isel-cmp-branch3.ll
index a3f6851..0df782d 100644
--- a/test/CodeGen/X86/fast-isel-cmp-branch3.ll
+++ b/test/CodeGen/X86/fast-isel-cmp-branch3.ll

@@ -351,7 +351,7 @@
 define i32 @icmp_eq(i32 %x) {
 ; CHECK-LABEL: icmp_eq
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp eq i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:
@@ -387,7 +387,7 @@
 define i32 @icmp_uge(i32 %x) {
 ; CHECK-LABEL: icmp_uge
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp uge i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:
@@ -411,7 +411,7 @@
 define i32 @icmp_ule(i32 %x) {
 ; CHECK-LABEL: icmp_ule
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp ule i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:
@@ -435,7 +435,7 @@
 define i32 @icmp_sge(i32 %x) {
 ; CHECK-LABEL: icmp_sge
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp sge i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:
@@ -459,7 +459,7 @@
 define i32 @icmp_sle(i32 %x) {
 ; CHECK-LABEL: icmp_sle
 ; CHECK-NOT:   cmpl
-; CHECK:       movl $0, %eax
+; CHECK:       xorl %eax, %eax
   %1 = icmp sle i32 %x, %x
   br i1 %1, label %bb1, label %bb2
 bb2:

diff --git a/test/CodeGen/X86/fast-isel-constpool.ll b/test/CodeGen/X86/fast-isel-constpool.ll
index bbbaeb2..4e6f7c0 100644
--- a/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/test/CodeGen/X86/fast-isel-constpool.ll

@@ -1,19 +1,23 @@
-; RUN: llc < %s -fast-isel | FileCheck %s
-; CHECK: LCPI0_0(%rip)
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large < %s | FileCheck %s --check-prefix=LARGE
 
-; Make sure fast isel uses rip-relative addressing when required.
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-apple-darwin9.0"
+; Make sure fast isel uses rip-relative addressing for the small code model.
+define float @constpool_float(float %x) {
+; CHECK-LABEL: constpool_float
+; CHECK:       LCPI0_0(%rip)
 
-define i32 @f0(double %x) nounwind {
-entry:
-	%retval = alloca i32		; <i32*> [#uses=2]
-	%x.addr = alloca double		; <double*> [#uses=2]
-	store double %x, double* %x.addr
-	%tmp = load double* %x.addr		; <double> [#uses=1]
-	%cmp = fcmp olt double %tmp, 8.500000e-01		; <i1> [#uses=1]
-	%conv = zext i1 %cmp to i32		; <i32> [#uses=1]
-	store i32 %conv, i32* %retval
-	%0 = load i32* %retval		; <i32> [#uses=1]
-	ret i32 %0
+; LARGE-LABEL: constpool_float
+; LARGE:       movabsq  $LCPI0_0, %rax
+  %1 = fadd float %x, 16.50e+01
+  ret float %1
+}
+
+define double @constpool_double(double %x) nounwind {
+; CHECK-LABEL: constpool_double
+; CHECK:       LCPI1_0(%rip)
+
+; LARGE-LABEL: constpool_double
+; LARGE:       movabsq  $LCPI1_0, %rax
+  %1 = fadd double %x, 8.500000e-01
+  ret double %1
 }

diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll
index cd2dc1d..eca1ae9 100644
--- a/test/CodeGen/X86/fast-isel-mem.ll
+++ b/test/CodeGen/X86/fast-isel-mem.ll

@@ -36,11 +36,11 @@
 	store i32 (...)** getelementptr ([4 x i32 (...)*]* @LotsStuff, i32 0, i32 2), i32 (...)*** null, align 4
 	ret void
 ; CHECK: _t:
-; CHECK:	movl	$0, %eax
+; CHECK:	xorl    %eax, %eax
 ; CHECK:	movl	L_LotsStuff$non_lazy_ptr, %ecx
 
 ; ATOM: _t:
 ; ATOM:         movl    L_LotsStuff$non_lazy_ptr, %e{{..}}
-; ATOM:         movl    $0, %e{{..}}
+; ATOM:         xorl    %e{{..}}, %e{{..}}
 
 }

diff --git a/test/CodeGen/X86/fast-isel-tls.ll b/test/CodeGen/X86/fast-isel-tls.ll
index f71abd2..686df43 100644
--- a/test/CodeGen/X86/fast-isel-tls.ll
+++ b/test/CodeGen/X86/fast-isel-tls.ll

@@ -13,7 +13,7 @@
 ; CHECK: leal	v@TLSGD
 ; CHECK: __tls_get_addr
 
-@alias = alias internal i32* @v
+@alias = internal alias i32* @v
 define i32 @f_alias() nounwind {
 entry:
           %t = load i32* @v

diff --git a/test/CodeGen/X86/fast-isel-x32.ll b/test/CodeGen/X86/fast-isel-x32.ll
new file mode 100644
index 0000000..d49a108
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-x32.ll

@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl -fast-isel -fast-isel-abort | FileCheck %s
+
+; Test that alloca addresses are materialized with the right size instruction.
+
+declare void @bar(i32* %arg)
+
+; CHECK-LABEL: @foo
+define void @foo() {
+  %a = alloca i32
+; CHECK: leal {{.*}}, %edi
+  call void @bar(i32* %a)
+  ret void
+}

diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index f7d2750..3747d04 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll

@@ -144,7 +144,7 @@
 ; CHECK-LABEL: test12:
 ; CHECK: testb	$1,
 ; CHECK-NEXT: je L
-; CHECK-NEXT: movl $0, %edi
+; CHECK-NEXT: xorl %edi, %edi
 ; CHECK-NEXT: callq
 }
 
@@ -154,7 +154,7 @@
   call void @test13f(i1 0)
   ret void
 ; CHECK-LABEL: test13:
-; CHECK: movl $0, %edi
+; CHECK: xorl %edi, %edi
 ; CHECK-NEXT: callq
 }
 
@@ -194,12 +194,10 @@
   br label %block2
 
 block2:
-; CHECK: movabsq $1
-; CHECK: cvtsi2sdq {{.*}} %xmm0
+; CHECK: movsd LCP{{.*}}_{{.*}}(%rip), %xmm0
 ; CHECK: movb $1, %al
 ; CHECK: callq _test16callee
 
-; AVX: movabsq $1
 ; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
 ; AVX: movb $1, %al
 ; AVX: callq _test16callee
@@ -280,7 +278,7 @@
   call void @foo22(i32 3)
   ret void
 ; CHECK-LABEL: test22:
-; CHECK: movl	$0, %edi
+; CHECK: xorl	%edi, %edi
 ; CHECK: callq	_foo22
 ; CHECK: movl	$1, %edi
 ; CHECK: callq	_foo22
@@ -304,3 +302,13 @@
 }
 
 declare i8* @foo23()
+
+declare void @takesi32ptr(i32* %arg)
+
+; CHECK-LABEL: allocamaterialize
+define void @allocamaterialize() {
+  %a = alloca i32
+; CHECK: leaq {{.*}}, %rdi
+  call void @takesi32ptr(i32* %a)
+  ret void
+}

diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index a212a7c..61e9b98 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll

@@ -60,3 +60,21 @@
 ; CHECK: addl $28
 }
 declare fastcc void @test4fastccsret(%struct.a* sret)
+
+
+; Check that fast-isel cleans up when it fails to lower a call instruction.
+define void @test5() {
+entry:
+  %call = call i32 @test5dllimport(i32 42)
+  ret void
+; CHECK-LABEL: test5:
+; Local value area is still there:
+; CHECK: movl $42, {{%[a-z]+}}
+; Fast-ISel's arg push is not here:
+; CHECK-NOT: movl $42, (%esp)
+; SDag-ISel's arg push:
+; CHECK: movl %esp, [[REGISTER:%[a-z]+]]
+; CHECK: movl $42, ([[REGISTER]])
+; CHECK: movl __imp__test5dllimport
+}
+declare dllimport i32 @test5dllimport(i32)

diff --git a/test/CodeGen/X86/fastmath-optnone.ll b/test/CodeGen/X86/fastmath-optnone.ll
new file mode 100644
index 0000000..0caadff
--- /dev/null
+++ b/test/CodeGen/X86/fastmath-optnone.ll

@@ -0,0 +1,35 @@
+; RUN: llc < %s -mcpu=corei7 -march=x86-64 -mattr=+sse2 | FileCheck %s
+; Verify that floating-point operations inside 'optnone' functions
+; are not optimized even if unsafe-fp-math is set.
+
+define float @foo(float %x) #0 {
+entry:
+  %add = fadd fast float %x, %x
+  %add1 = fadd fast float %add, %x
+  ret float %add1
+}
+
+; CHECK-LABEL: @foo
+; CHECK-NOT: add
+; CHECK: mul
+; CHECK-NOT: add
+; CHECK: ret
+
+define float @fooWithOptnone(float %x) #1 {
+entry:
+  %add = fadd fast float %x, %x
+  %add1 = fadd fast float %add, %x
+  ret float %add1
+}
+
+; CHECK-LABEL: @fooWithOptnone
+; CHECK-NOT: mul
+; CHECK: add
+; CHECK-NOT: mul
+; CHECK: add
+; CHECK-NOT: mul
+; CHECK: ret
+
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { noinline optnone "unsafe-fp-math"="true" }

diff --git a/test/CodeGen/X86/fma-intrinsics-x86_64.ll b/test/CodeGen/X86/fma-intrinsics-x86_64.ll
new file mode 100644
index 0000000..aadd731
--- /dev/null
+++ b/test/CodeGen/X86/fma-intrinsics-x86_64.ll

@@ -0,0 +1,278 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK
+
+; VFMADD (fused multiply-add): runs using the FMA4 prefix expect the plain mnemonic, runs using the FMA prefix expect the 213 form.
+define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfmaddss
+  ; CHECK-FMA: vfmadd213ss
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; scalar single-precision form
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfmaddsd
+  ; CHECK-FMA: vfmadd213sd
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; scalar double-precision form
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfmaddps
+  ; CHECK-FMA: vfmadd213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; packed single-precision, 128-bit vectors
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfmaddpd
+  ; CHECK-FMA: vfmadd213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; packed double-precision, 128-bit vectors
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+  ; CHECK-FMA4: vfmaddps
+  ; CHECK-FMA: vfmadd213ps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)  ; packed single-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+  ; CHECK-FMA4: vfmaddpd
+  ; CHECK-FMA: vfmadd213pd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)  ; packed double-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFMSUB (fused multiply-subtract): runs using the FMA4 prefix expect the plain mnemonic, runs using the FMA prefix expect the 213 form.
+define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfmsubss
+  ; CHECK-FMA: vfmsub213ss
+  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; scalar single-precision form
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfmsubsd
+  ; CHECK-FMA: vfmsub213sd
+  %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; scalar double-precision form
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfmsubps
+  ; CHECK-FMA: vfmsub213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; packed single-precision, 128-bit vectors
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfmsubpd
+  ; CHECK-FMA: vfmsub213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; packed double-precision, 128-bit vectors
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+  ; CHECK-FMA4: vfmsubps
+  ; CHECK-FMA: vfmsub213ps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)  ; packed single-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+  ; CHECK-FMA4: vfmsubpd
+  ; CHECK-FMA: vfmsub213pd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)  ; packed double-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFNMADD (fused negated multiply-add): runs using the FMA4 prefix expect the plain mnemonic, runs using the FMA prefix expect the 213 form.
+define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfnmaddss
+  ; CHECK-FMA: vfnmadd213ss
+  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; scalar single-precision form
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfnmaddsd
+  ; CHECK-FMA: vfnmadd213sd
+  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; scalar double-precision form
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfnmaddps
+  ; CHECK-FMA: vfnmadd213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; packed single-precision, 128-bit vectors
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfnmaddpd
+  ; CHECK-FMA: vfnmadd213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; packed double-precision, 128-bit vectors
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+  ; CHECK-FMA4: vfnmaddps
+  ; CHECK-FMA: vfnmadd213ps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)  ; packed single-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+  ; CHECK-FMA4: vfnmaddpd
+  ; CHECK-FMA: vfnmadd213pd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)  ; packed double-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFNMSUB (fused negated multiply-subtract): runs using the FMA4 prefix expect the plain mnemonic, runs using the FMA prefix expect the 213 form.
+define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfnmsubss
+  ; CHECK-FMA: vfnmsub213ss
+  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; scalar single-precision form
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfnmsubsd
+  ; CHECK-FMA: vfnmsub213sd
+  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; scalar double-precision form
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfnmsubps
+  ; CHECK-FMA: vfnmsub213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; packed single-precision, 128-bit vectors
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfnmsubpd
+  ; CHECK-FMA: vfnmsub213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; packed double-precision, 128-bit vectors
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+  ; CHECK-FMA4: vfnmsubps
+  ; CHECK-FMA: vfnmsub213ps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)  ; packed single-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+  ; CHECK-FMA4: vfnmsubpd
+  ; CHECK-FMA: vfnmsub213pd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)  ; packed double-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFMADDSUB (packed forms only in this file): runs using the FMA4 prefix expect the plain mnemonic, runs using the FMA prefix expect the 213 form.
+define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfmaddsubps
+  ; CHECK-FMA: vfmaddsub213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; packed single-precision, 128-bit vectors
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfmaddsubpd
+  ; CHECK-FMA: vfmaddsub213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; packed double-precision, 128-bit vectors
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+  ; CHECK-FMA4: vfmaddsubps
+  ; CHECK-FMA: vfmaddsub213ps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)  ; packed single-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+  ; CHECK-FMA4: vfmaddsubpd
+  ; CHECK-FMA: vfmaddsub213pd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)  ; packed double-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFMSUBADD (packed forms only in this file): runs using the FMA4 prefix expect the plain mnemonic, runs using the FMA prefix expect the 213 form.
+define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+  ; CHECK-FMA4: vfmsubaddps
+  ; CHECK-FMA: vfmsubadd213ps
+  %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2)  ; packed single-precision, 128-bit vectors
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+  ; CHECK-FMA4: vfmsubaddpd
+  ; CHECK-FMA: vfmsubadd213pd
+  %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2)  ; packed double-precision, 128-bit vectors
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+  ; CHECK-FMA4: vfmsubaddps
+  ; CHECK-FMA: vfmsubadd213ps
+  ; CHECK: ymm
+  %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2)  ; packed single-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+  ; CHECK-FMA4: vfmsubaddpd
+  ; CHECK-FMA: vfmsubadd213pd
+  ; CHECK: ymm
+  %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2)  ; packed double-precision, 256-bit vectors; every run must also show a ymm register
+  ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone

diff --git a/test/CodeGen/X86/fma-phi-213-to-231.ll b/test/CodeGen/X86/fma-phi-213-to-231.ll
new file mode 100644
index 0000000..9715bc7
--- /dev/null
+++ b/test/CodeGen/X86/fma-phi-213-to-231.ll

@@ -0,0 +1,246 @@
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; CHECK-LABEL: fmaddsubpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmaddsub231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {  ; loop-carried vfmaddsub on <4 x double>
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]  ; accumulator: %c on entry, then the previous iteration's FMA result
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]  ; iteration counter, starting at 0
+  %cmp = icmp slt i32 %i.0, %iter  ; continue while i < iter (signed compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc  ; empty body; all the work is in for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)  ; the accumulator is the third operand, so the result is carried around the loop
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0  ; accumulator value at loop exit
+}
+
+; CHECK-LABEL: fmsubaddpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmsubadd231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {  ; loop-carried vfmsubadd on <4 x double>
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]  ; accumulator: %c on entry, then the previous iteration's FMA result
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]  ; iteration counter, starting at 0
+  %cmp = icmp slt i32 %i.0, %iter  ; continue while i < iter (signed compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc  ; empty body; all the work is in for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)  ; the accumulator is the third operand, so the result is carried around the loop
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0  ; accumulator value at loop exit
+}
+
+; CHECK-LABEL: fmaddpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmadd231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {  ; loop-carried vfmadd on <4 x double>
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]  ; accumulator: %c on entry, then the previous iteration's FMA result
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]  ; iteration counter, starting at 0
+  %cmp = icmp slt i32 %i.0, %iter  ; continue while i < iter (signed compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc  ; empty body; all the work is in for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)  ; the accumulator is the third operand, so the result is carried around the loop
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0  ; accumulator value at loop exit
+}
+
+; CHECK-LABEL: fmsubpd_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmsub231pd        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) {  ; loop-carried vfmsub on <4 x double>
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ]  ; accumulator: %c on entry, then the previous iteration's FMA result
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]  ; iteration counter, starting at 0
+  %cmp = icmp slt i32 %i.0, %iter  ; continue while i < iter (signed compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc  ; empty body; all the work is in for.inc
+
+for.inc:
+  %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0)  ; the accumulator is the third operand, so the result is carried around the loop
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <4 x double> %c.addr.0  ; accumulator value at loop exit
+}
+
+declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+
+; CHECK-LABEL: fmaddsubps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmaddsub231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {  ; loop-carried vfmaddsub on <8 x float>
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]  ; accumulator: %c on entry, then the previous iteration's FMA result
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]  ; iteration counter, starting at 0
+  %cmp = icmp slt i32 %i.0, %iter  ; continue while i < iter (signed compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc  ; empty body; all the work is in for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)  ; the accumulator is the third operand, so the result is carried around the loop
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0  ; accumulator value at loop exit
+}
+
+; CHECK-LABEL: fmsubaddps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmsubadd231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {  ; loop-carried vfmsubadd on <8 x float>
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]  ; accumulator: %c on entry, then the previous iteration's FMA result
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]  ; iteration counter, starting at 0
+  %cmp = icmp slt i32 %i.0, %iter  ; continue while i < iter (signed compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc  ; empty body; all the work is in for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)  ; the accumulator is the third operand, so the result is carried around the loop
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0  ; accumulator value at loop exit
+}
+
+; CHECK-LABEL: fmaddps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmadd231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {  ; loop-carried vfmadd on <8 x float>
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]  ; accumulator: %c on entry, then the previous iteration's FMA result
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]  ; iteration counter, starting at 0
+  %cmp = icmp slt i32 %i.0, %iter  ; continue while i < iter (signed compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc  ; empty body; all the work is in for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)  ; the accumulator is the third operand, so the result is carried around the loop
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0  ; accumulator value at loop exit
+}
+
+; CHECK-LABEL: fmsubps_loop
+; CHECK: [[BODYLBL:LBB.+]]:
+; CHECK:   vfmsub231ps        %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}}
+; CHECK: [[INCLBL:LBB.+]]:
+; CHECK:   addl  $1, [[INDREG:%[a-z0-9]+]]
+; CHECK:   cmpl  {{%.+}}, [[INDREG]]
+; CHECK:   jl    [[BODYLBL]]
+define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) {  ; loop-carried vfmsub on <8 x float>
+entry:
+  br label %for.cond
+
+for.cond:
+  %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ]  ; accumulator: %c on entry, then the previous iteration's FMA result
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]  ; iteration counter, starting at 0
+  %cmp = icmp slt i32 %i.0, %iter  ; continue while i < iter (signed compare)
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  br label %for.inc  ; empty body; all the work is in for.inc
+
+for.inc:
+  %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0)  ; the accumulator is the third operand, so the result is carried around the loop
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret <8 x float> %c.addr.0  ; accumulator value at loop exit
+}
+
+declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)

diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 47252ec..2eb152b 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll

@@ -43,8 +43,8 @@
 }
 
 ; Test FMA3 variant selection
-; CHECK: fma3_select231ssX:
-; CHECK: vfmadd231ss xmm
+; CHECK-FMA-INST: fma3_select231ssX:
+; CHECK-FMA-INST: vfmadd231ss %xmm
 define float @fma3_select231ssX(float %x, float %y) #0 {
 entry:
   br label %while.body
@@ -58,8 +58,8 @@
 }
 
 ; Test FMA3 variant selection
-; CHECK: fma3_select231pdY:
-; CHECK: vfmadd231pd ymm
+; CHECK-FMA-INST: fma3_select231pdY:
+; CHECK-FMA-INST: vfmadd231pd %ymm
 define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) #0 {
 entry:
   br label %while.body

diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll
new file mode 100644
index 0000000..64a2068
--- /dev/null
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll

@@ -0,0 +1,84 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
+
+; VFMADD
+define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
+  ; CHECK: vfmaddss (%{{.*}})
+  %x = load float *%a2  ; scalar loaded through the pointer passed as the third parameter
+  %y = insertelement <4 x float> undef, float %x, i32 0  ; placed in lane 0; the other lanes stay undef
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y)  ; loaded value is the third intrinsic operand; the check above expects a memory operand
+  ret < 4 x float > %res
+}
+define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
+  ; CHECK: vfmaddss %{{.*}}, (%{{.*}})
+  %x = load float *%a1  ; scalar loaded through the pointer passed as the second parameter
+  %y = insertelement <4 x float> undef, float %x, i32 0  ; placed in lane 0; the other lanes stay undef
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2)  ; loaded value is the second intrinsic operand; the check above expects a register followed by a memory operand
+  ret < 4 x float > %res
+}
+
+declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
+  ; CHECK: vfmaddsd (%{{.*}})
+  %x = load double *%a2  ; scalar loaded through the pointer passed as the third parameter
+  %y = insertelement <2 x double> undef, double %x, i32 0  ; placed in lane 0; the other lane stays undef
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y)  ; loaded value is the third intrinsic operand; the check above expects a memory operand
+  ret < 2 x double > %res
+}
+define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
+  ; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
+  %x = load double *%a1  ; scalar loaded through the pointer passed as the second parameter
+  %y = insertelement <2 x double> undef, double %x, i32 0  ; placed in lane 0; the other lane stays undef
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2)  ; loaded value is the second intrinsic operand; the check above expects a register followed by a memory operand
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
+  ; CHECK: vfmaddps (%{{.*}})
+  %x = load <4 x float>* %a2  ; whole vector loaded through the pointer passed as the third parameter
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x)  ; loaded vector is the third intrinsic operand; the check above expects a memory operand
+  ret < 4 x float > %res
+}
+define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+  ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
+  %x = load <4 x float>* %a1  ; whole vector loaded through the pointer passed as the second parameter
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2)  ; loaded vector is the second intrinsic operand; the check above expects a register followed by a memory operand
+  ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+; To test execution dependency: two operands are loaded, and a standalone vmovaps is expected before a vfmaddps that still takes one memory operand.
+define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+  ; CHECK: vmovaps
+  ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
+  %x = load <4 x float>* %a0  ; NOTE(review): presumably the point is that this load uses the single-precision move (vmovaps) - confirm
+  %y = load <4 x float>* %a1  ; second loaded operand
+  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2)  ; first two intrinsic operands both come from memory
+  ret < 4 x float > %res
+}
+
+define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
+  ; CHECK: vfmaddpd (%{{.*}})
+  %x = load <2 x double>* %a2  ; whole vector loaded through the pointer passed as the third parameter
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x)  ; loaded vector is the third intrinsic operand; the check above expects a memory operand
+  ret < 2 x double > %res
+}
+define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+  ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
+  %x = load <2 x double>* %a1  ; whole vector loaded through the pointer passed as the second parameter
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2)  ; loaded vector is the second intrinsic operand; the check above expects a register followed by a memory operand
+  ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+; To test execution dependency: two operands are loaded, and a standalone vmovapd is expected before a vfmaddpd that still takes one memory operand.
+define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+  ; CHECK: vmovapd
+  ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
+  %x = load <2 x double>* %a0  ; NOTE(review): presumably the point is that this load uses the double-precision move (vmovapd) - confirm
+  %y = load <2 x double>* %a1  ; second loaded operand
+  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2)  ; first two intrinsic operands both come from memory
+  ret < 2 x double > %res
+}
+

diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
deleted file mode 100644
index 494cb28..0000000
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ /dev/null

@@ -1,316 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
-
-; VFMADD
-define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddss
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
-  ; CHECK: vfmaddss (%{{.*}})
-  %x = load float *%a2
-  %y = insertelement <4 x float> undef, float %x, i32 0
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddss %{{.*}}, (%{{.*}})
-  %x = load float *%a1
-  %y = insertelement <4 x float> undef, float %x, i32 0
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddsd
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
-  ; CHECK: vfmaddsd (%{{.*}})
-  %x = load double *%a2
-  %y = insertelement <2 x double> undef, double %x, i32 0
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
-  %x = load double *%a1
-  %y = insertelement <2 x double> undef, double %x, i32 0
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddps
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
-  ; CHECK: vfmaddps (%{{.*}})
-  %x = load <4 x float>* %a2
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
-  %x = load <4 x float>* %a1
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-; To test execution dependency
-define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) {
-  ; CHECK: vmovaps
-  ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
-  %x = load <4 x float>* %a0
-  %y = load <4 x float>* %a1
-  %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-
-define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddpd
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
-  ; CHECK: vfmaddpd (%{{.*}})
-  %x = load <2 x double>* %a2
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
-  %x = load <2 x double>* %a1
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-; To test execution dependency
-define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) {
-  ; CHECK: vmovapd
-  ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
-  %x = load <2 x double>* %a0
-  %y = load <2 x double>* %a1
-  %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-
-define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfmaddps
-  ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
-  ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfmaddpd
-  ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
-  ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFMSUB
-define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmsubss
-  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmsubsd
-  %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmsubps
-  %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmsubpd
-  %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfmsubps
-  ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
-  ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfmsubpd
-  ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
-  ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFNMADD
-define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfnmaddss
-  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfnmaddsd
-  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfnmaddps
-  %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfnmaddpd
-  %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfnmaddps
-  ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
-  ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfnmaddpd
-  ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
-  ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFNMSUB
-define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfnmsubss
-  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfnmsubsd
-  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfnmsubps
-  %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfnmsubpd
-  %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfnmsubps
-  ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
-  ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfnmsubpd
-  ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
-  ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFMADDSUB
-define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmaddsubps
-  %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmaddsubpd
-  %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfmaddsubps
-  ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
-  ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfmaddsubpd
-  ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
-  ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
-
-; VFMSUBADD
-define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
-  ; CHECK: vfmsubaddps
-  %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
-  ret < 4 x float > %res
-}
-declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-
-define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
-  ; CHECK: vfmsubaddpd
-  %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
-  ret < 2 x double > %res
-}
-declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-
-define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
-  ; CHECK: vfmsubaddps
-  ; CHECK: ymm
-  %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
-  ret < 8 x float > %res
-}
-declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-
-define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
-  ; CHECK: vfmsubaddpd
-  ; CHECK: ymm
-  %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
-  ret < 4 x double > %res
-}
-declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone

diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index cfb598d..9b52db9 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll

@@ -184,7 +184,7 @@
 
 ; CHECK: test_x86_fmadd_ps_load
 ; CHECK: vmovaps         (%rdi), %xmm2
-; CHECK: vfmadd213ps     %xmm1, %xmm0, %xmm2
+; CHECK: vfmadd213ps     %xmm1, %xmm2, %xmm0
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fmadd_ps_load
 ; CHECK_FMA4: vfmaddps     %xmm1, (%rdi), %xmm0, %xmm0
@@ -198,7 +198,7 @@
 
 ; CHECK: test_x86_fmsub_ps_load
 ; CHECK: vmovaps         (%rdi), %xmm2
-; CHECK: fmsub213ps     %xmm1, %xmm0, %xmm2
+; CHECK: fmsub213ps     %xmm1, %xmm2, %xmm0
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fmsub_ps_load
 ; CHECK_FMA4: vfmsubps     %xmm1, (%rdi), %xmm0, %xmm0

diff --git a/test/CodeGen/X86/fmaxnum.ll b/test/CodeGen/X86/fmaxnum.ll
new file mode 100644
index 0000000..23678c4
--- /dev/null
+++ b/test/CodeGen/X86/fmaxnum.ll

@@ -0,0 +1,50 @@
+; RUN: llc  -march=x86 -mtriple=i386-linux-gnu  < %s | FileCheck %s
+
+declare float @fmaxf(float, float)
+declare double @fmax(double, double)
+declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)
+declare float @llvm.maxnum.f32(float, float)
+declare double @llvm.maxnum.f64(double, double)
+declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80)
+
+; CHECK-LABEL: @test_fmaxf
+; CHECK: calll fmaxf
+define float @test_fmaxf(float %x, float %y) {
+  %z = call float @fmaxf(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_fmax
+; CHECK: calll fmax
+define double @test_fmax(double %x, double %y) {
+  %z = call double @fmax(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_fmaxl
+; CHECK: calll fmaxl
+define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) {
+  %z = call x86_fp80 @fmaxl(x86_fp80 %x, x86_fp80 %y) readnone
+  ret x86_fp80 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxf
+; CHECK: calll fmaxf
+define float @test_intrinsic_fmaxf(float %x, float %y) {
+  %z = call float @llvm.maxnum.f32(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmax
+; CHECK: calll fmax
+define double @test_intrinsic_fmax(double %x, double %y) {
+  %z = call double @llvm.maxnum.f64(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmaxl
+; CHECK: calll fmaxl
+define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) {
+  %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone
+  ret x86_fp80 %z
+}

diff --git a/test/CodeGen/X86/fminnum.ll b/test/CodeGen/X86/fminnum.ll
new file mode 100644
index 0000000..1e33cf4
--- /dev/null
+++ b/test/CodeGen/X86/fminnum.ll

@@ -0,0 +1,95 @@
+; RUN: llc  -march=x86 -mtriple=i386-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s
+
+declare float @fminf(float, float)
+declare double @fmin(double, double)
+declare x86_fp80 @fminl(x86_fp80, x86_fp80)
+declare float @llvm.minnum.f32(float, float)
+declare double @llvm.minnum.f64(double, double)
+declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80)
+
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @test_fminf
+; CHECK: jmp fminf
+define float @test_fminf(float %x, float %y) {
+  %z = call float @fminf(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_fmin
+; CHECK: jmp fmin
+define double @test_fmin(double %x, double %y) {
+  %z = call double @fmin(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_fminl
+; CHECK: calll fminl
+define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) {
+  %z = call x86_fp80 @fminl(x86_fp80 %x, x86_fp80 %y) readnone
+  ret x86_fp80 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fminf
+; CHECK: jmp fminf
+define float @test_intrinsic_fminf(float %x, float %y) {
+  %z = call float @llvm.minnum.f32(float %x, float %y) readnone
+  ret float %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin
+; CHECK: jmp fmin
+define double @test_intrinsic_fmin(double %x, double %y) {
+  %z = call double @llvm.minnum.f64(double %x, double %y) readnone
+  ret double %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fminl
+; CHECK: calll fminl
+define x86_fp80 @test_intrinsic_fminl(x86_fp80 %x, x86_fp80 %y) {
+  %z = call x86_fp80 @llvm.minnum.f80(x86_fp80 %x, x86_fp80 %y) readnone
+  ret x86_fp80 %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v2f32
+; CHECK: calll fminf
+; CHECK: calll fminf
+define <2 x float> @test_intrinsic_fmin_v2f32(<2 x float> %x, <2 x float> %y) {
+  %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) readnone
+  ret <2 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v4f32
+; CHECK: calll fminf
+; CHECK: calll fminf
+; CHECK: calll fminf
+; CHECK: calll fminf
+define <4 x float> @test_intrinsic_fmin_v4f32(<4 x float> %x, <4 x float> %y) {
+  %z = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) readnone
+  ret <4 x float> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v2f64
+; CHECK: calll fmin
+; CHECK: calll fmin
+define <2 x double> @test_intrinsic_fmin_v2f64(<2 x double> %x, <2 x double> %y) {
+  %z = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) readnone
+  ret <2 x double> %z
+}
+
+; CHECK-LABEL: @test_intrinsic_fmin_v8f64
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+; CHECK: calll fmin
+define <8 x double> @test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y) {
+  %z = call <8 x double> @llvm.minnum.v8f64(<8 x double> %x, <8 x double> %y) readnone
+  ret <8 x double> %z
+}

diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll
new file mode 100644
index 0000000..7036511
--- /dev/null
+++ b/test/CodeGen/X86/fmul-combines.ll

@@ -0,0 +1,147 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -march=x86-64 < %s | FileCheck %s
+
+; CHECK-LABEL: fmul2_f32:
+; CHECK: addss %xmm0, %xmm0
+define float @fmul2_f32(float %x) {
+  %y = fmul float %x, 2.0
+  ret float %y
+}
+
+; fmul 2.0, x -> fadd x, x for vectors.
+
+; CHECK-LABEL: fmul2_v4f32:
+; CHECK: addps %xmm0, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul2_v4f32(<4 x float> %x) {
+  %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+  ret <4 x float> %y
+}
+
+; CHECK-LABEL: constant_fold_fmul_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: ret
+define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) {
+  %y = fmul <4 x float> <float 4.0, float 4.0, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float 2.0>
+  ret <4 x float> %y
+}
+
+; CHECK-LABEL: fmul0_v4f32:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul0_v4f32(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 0.0, float 0.0, float 0.0, float 0.0>
+  ret <4 x float> %y
+}
+
+; CHECK-LABEL: fmul_c2_c4_v4f32:
+; CHECK-NOT: addps
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_c2_c4_v4f32(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+  %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0>
+  ret <4 x float> %z
+}
+
+; CHECK-LABEL: fmul_c3_c4_v4f32:
+; CHECK-NOT: addps
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 3.0, float 3.0, float 3.0, float 3.0>
+  %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0>
+  ret <4 x float> %z
+}
+
+; We should be able to pre-multiply the two constant vectors.
+; CHECK: float 5.000000e+00
+; CHECK: float 1.200000e+01
+; CHECK: float 2.100000e+01
+; CHECK: float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>
+  ret <4 x float> %z
+}
+
+; Same as above, but reverse operands to make sure non-canonical form is also handled.
+; CHECK: float 5.000000e+00
+; CHECK: float 1.200000e+01
+; CHECK: float 2.100000e+01
+; CHECK: float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical:
+; CHECK: mulps
+; CHECK-NOT: mulps
+; CHECK-NEXT: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) #0 {
+  %y = fmul <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
+  %z = fmul <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>, %y
+  ret <4 x float> %z
+}
+
+; More than one use of a constant multiply should not inhibit the optimization.
+; Instead of a chain of 2 dependent multiplies, this test will have 2 independent multiplies.
+; CHECK: float 5.000000e+00
+; CHECK: float 1.200000e+01
+; CHECK: float 2.100000e+01
+; CHECK: float 3.200000e+01
+; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use:
+; CHECK: mulps
+; CHECK: mulps
+; CHECK: addps
+; CHECK: ret
+define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) #0 {
+  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0>  ; first use of %y
+  %a = fadd <4 x float> %y, %z  ; second use of %y, so %y itself must still be computed
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: fmul_c2_c4_f32:
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: mulss
+; CHECK-NEXT: ret
+define float @fmul_c2_c4_f32(float %x) #0 {
+  %y = fmul float %x, 2.0   ; must not be emitted as an addss (x + x), per the expectations above
+  %z = fmul float %y, 4.0   ; expected to merge with the multiply above into a single mulss
+  ret float %z
+}
+
+; CHECK-LABEL: fmul_c3_c4_f32:
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: mulss
+; CHECK-NEXT: ret
+define float @fmul_c3_c4_f32(float %x) #0 {
+  %y = fmul float %x, 3.0   ; first constant multiply
+  %z = fmul float %y, 4.0   ; second constant multiply; only one mulss may remain, directly before the ret
+  ret float %z
+}
+
+; CHECK-LABEL: fmul_fneg_fneg_f32:
+; CHECK: mulss %xmm1, %xmm0
+; CHECK-NEXT: retq
+define float @fmul_fneg_fneg_f32(float %x, float %y) {
+  %x.neg = fsub float -0.0, %x   ; -0.0 - x, i.e. the negation of %x
+  %y.neg = fsub float -0.0, %y   ; -0.0 - y, i.e. the negation of %y
+  %mul = fmul float %x.neg, %y.neg   ; (-x) * (-y): expected to be emitted as a plain x * y multiply
+  ret float %mul
+}
+; CHECK-LABEL: fmul_fneg_fneg_v4f32:
+; CHECK: mulps {{%xmm1|\(%rdx\)}}, %xmm0
+; CHECK-NEXT: retq
+define <4 x float> @fmul_fneg_fneg_v4f32(<4 x float> %x, <4 x float> %y) {
+  %x.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %x  ; per-lane negation of %x
+  %y.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %y  ; per-lane negation of %y
+  %mul = fmul <4 x float> %x.neg, %y.neg  ; (-x) * (-y): expected to be emitted as a plain x * y multiply
+  ret <4 x float> %mul
+}
+
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }

diff --git a/test/CodeGen/X86/fnabs.ll b/test/CodeGen/X86/fnabs.ll
new file mode 100644
index 0000000..19718d3
--- /dev/null
+++ b/test/CodeGen/X86/fnabs.ll

@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx| FileCheck %s
+
+; Verify that we generate a single OR instruction for a scalar, vec128, and vec256
+; FNABS(x) operation, where FNABS(x) is written as FNEG(FABS(x)).
+; If the FABS() result has no use other than the FNEG, the AND instruction should be eliminated.
+; PR20578: http://llvm.org/bugs/show_bug.cgi?id=20578
+
+define float @scalar_no_abs(float %a) {
+; CHECK-LABEL: scalar_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+  %fabs = tail call float @fabsf(float %a) #1   ; |a|; its only use is the negation below
+  %fsub = fsub float -0.0, %fabs   ; -0.0 - |a|, i.e. -|a|
+  ret float %fsub
+}
+
+define float @scalar_uses_abs(float %a) {
+; CHECK-LABEL: scalar_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulss
+; CHECK-NEXT: retq
+  %fabs = tail call float @fabsf(float %a) #1   ; |a|; used twice below
+  %fsub = fsub float -0.0, %fabs   ; -|a|
+  %fmul = fmul float %fsub, %fabs   ; second use of %fabs, so the vandps is still expected
+  ret float %fmul
+}
+
+define <4 x float> @vector128_no_abs(<4 x float> %a) {
+; CHECK-LABEL: vector128_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+  %fabs = tail call <4 x float> @llvm.fabs.v4f32(< 4 x float> %a) #1  ; per-lane |a|; only used by the negation below
+  %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  ret <4 x float> %fsub  ; per-lane -|a|
+}
+
+define <4 x float> @vector128_uses_abs(<4 x float> %a) {
+; CHECK-LABEL: vector128_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulps
+; CHECK-NEXT: retq
+  %fabs = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #1  ; per-lane |a|; used twice below
+  %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  %fmul = fmul <4 x float> %fsub, %fabs  ; second use of %fabs, so the vandps is still expected
+  ret <4 x float> %fmul
+}
+
+define <8 x float> @vector256_no_abs(<8 x float> %a) {
+; CHECK-LABEL: vector256_no_abs:
+; CHECK: vorps
+; CHECK-NEXT: retq
+  %fabs = tail call <8 x float> @llvm.fabs.v8f32(< 8 x float> %a) #1  ; per-lane |a|; only used by the negation below
+  %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  ret <8 x float> %fsub  ; per-lane -|a|
+}
+
+define <8 x float> @vector256_uses_abs(<8 x float> %a) {
+; CHECK-LABEL: vector256_uses_abs:
+; CHECK-DAG: vandps
+; CHECK-DAG: vorps
+; CHECK: vmulps
+; CHECK-NEXT: retq
+  %fabs = tail call <8 x float> @llvm.fabs.v8f32(<8 x float> %a) #1  ; per-lane |a|; used twice below
+  %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs
+  %fmul = fmul <8 x float> %fsub, %fabs  ; second use of %fabs, so the vandps is still expected
+  ret <8 x float> %fmul
+}
+
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+
+declare float @fabsf(float)
+
+attributes #1 = { readnone }
+

diff --git a/test/CodeGen/X86/fold-pcmpeqd-0.ll b/test/CodeGen/X86/fold-pcmpeqd-0.ll
deleted file mode 100644
index 1d315ff..0000000
--- a/test/CodeGen/X86/fold-pcmpeqd-0.ll
+++ /dev/null

@@ -1,117 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s
-; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s
-
-; i386 test has been disabled when scheduler 2-addr hack is disabled.
-
-; This testcase shouldn't need to spill the -1 value,
-; so it should just use pcmpeqd to materialize an all-ones vector.
-; For i386, cp load of -1 are folded.
-
-; With -regalloc=greedy, the live range is split before spilling, so the first
-; pcmpeq doesn't get folded as a constant pool load.
-
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-; I386-NOT: pcmpeqd
-; I386: orps LCPI0_2, %xmm
-
-; X86-64: pcmpeqd
-; X86-64-NOT: pcmpeqd
-
-	%struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }>
-	%struct._cl_image_format_t = type <{ i32, i32, i32 }>
-	%struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }>
-
-define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind {
-entry:
-	%tmp3.i = load i32* null		; <i32> [#uses=1]
-	%cmp = icmp sgt i32 %tmp3.i, 200		; <i1> [#uses=1]
-	br i1 %cmp, label %forcond, label %ifthen
-
-ifthen:		; preds = %entry
-	ret void
-
-forcond:		; preds = %entry
-	%tmp3.i536 = load i32* null		; <i32> [#uses=1]
-	%cmp12 = icmp slt i32 0, %tmp3.i536		; <i1> [#uses=1]
-	br i1 %cmp12, label %forbody, label %afterfor
-
-forbody:		; preds = %forcond
-	%bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float>		; <<4 x float>> [#uses=1]
-	%mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer		; <<4 x float>> [#uses=1]
-	%mul257 = fmul <4 x float> %mul233, zeroinitializer		; <<4 x float>> [#uses=1]
-	%mul275 = fmul <4 x float> %mul257, zeroinitializer		; <<4 x float>> [#uses=1]
-	%tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=0]
-	%bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind		; <<4 x i32>> [#uses=1]
-	%tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind		; <<4 x float>> [#uses=1]
-	%sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70		; <<4 x float>> [#uses=2]
-	%mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78		; <<4 x float>> [#uses=1]
-	%add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 >		; <<4 x float>> [#uses=1]
-	%mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78		; <<4 x float>> [#uses=1]
-	%add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 >		; <<4 x float>> [#uses=1]
-	%bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float>		; <<4 x float>> [#uses=1]
-	%mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer		; <<4 x float>> [#uses=1]
-	%bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
-	%orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102		; <<4 x i32>> [#uses=1]
-	%bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float>		; <<4 x float>> [#uses=1]
-	%cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind		; <<4 x float>> [#uses=1]
-	%tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%sub140.i = fsub <4 x float> zeroinitializer, %tmp80		; <<4 x float>> [#uses=1]
-	%bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 >		; <<4 x i32>> [#uses=0]
-	%mul171.i = fmul <4 x float> zeroinitializer, %sub140.i		; <<4 x float>> [#uses=1]
-	%add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 >		; <<4 x float>> [#uses=1]
-	%bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float>		; <<4 x float>> [#uses=1]
-	%mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer		; <<4 x float>> [#uses=1]
-	%bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=0]
-	%bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
-	%orps203.i = or <4 x i32> %andnps192.i, %xorps.i		; <<4 x i32>> [#uses=1]
-	%bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float>		; <<4 x float>> [#uses=1]
-	%mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer		; <<4 x float>> [#uses=1]
-	%mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer		; <<4 x float>> [#uses=2]
-	%mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer		; <<4 x float>> [#uses=1]
-	%tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=2]
-	%andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4		; <<4 x i32>> [#uses=1]
-	%bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
-	%andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7		; <<4 x i32>> [#uses=1]
-	%orps.i9 = or <4 x i32> %andnps.i8, %andps.i5		; <<4 x i32>> [#uses=1]
-	%bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float>		; <<4 x float>> [#uses=1]
-	%bitcast.i = bitcast <4 x float> %mul313 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andps.i = and <4 x i32> %bitcast.i, zeroinitializer		; <<4 x i32>> [#uses=1]
-	%orps.i = or <4 x i32> zeroinitializer, %andps.i		; <<4 x i32>> [#uses=1]
-	%bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float>		; <<4 x float>> [#uses=1]
-	call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
-	unreachable
-
-afterfor:		; preds = %forcond
-	ret void
-}
-
-declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
-
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll
new file mode 100644
index 0000000..a643d86
--- /dev/null
+++ b/test/CodeGen/X86/fold-tied-op.ll

@@ -0,0 +1,84 @@
+; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s

+; Regression test for http://reviews.llvm.org/D5701

+

+; ModuleID = 'xxhash.i'

+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"

+target triple = "i386--netbsd"

+

+; CHECK-LABEL: fn1

+; CHECK:       shldl {{.*#+}} 4-byte Folded Spill

+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload

+; CHECK:       shldl {{.*#+}} 4-byte Folded Spill

+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload

+; CHECK:       addl  {{.*#+}} 4-byte Folded Reload

+; CHECK:       imull {{.*#+}} 4-byte Folded Reload

+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload

+; CHECK:       retl

+

+%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }

+

+@a = common global i32 0, align 4

+@b = common global i64 0, align 8

+

+; Function Attrs: nounwind uwtable

+define i64 @fn1() #0 {

+entry:

+  %0 = load i32* @a, align 4, !tbaa !1

+  %1 = inttoptr i32 %0 to %struct.XXH_state64_t*

+  %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0

+  %2 = load i32* %total_len, align 4, !tbaa !5

+  %tobool = icmp eq i32 %2, 0

+  br i1 %tobool, label %if.else, label %if.then

+

+if.then:                                          ; preds = %entry

+  %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3

+  %3 = load i64* %v3, align 4, !tbaa !8

+  %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4

+  %4 = load i64* %v4, align 4, !tbaa !9

+  %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2

+  %5 = load i64* %v2, align 4, !tbaa !10

+  %shl = shl i64 %5, 1

+  %or = or i64 %shl, %5

+  %shl2 = shl i64 %3, 2

+  %shr = lshr i64 %3, 1

+  %or3 = or i64 %shl2, %shr

+  %add = add i64 %or, %or3

+  %mul = mul i64 %4, -4417276706812531889

+  %shl4 = mul i64 %4, -8834553413625063778

+  %shr5 = ashr i64 %mul, 3

+  %or6 = or i64 %shr5, %shl4

+  %mul7 = mul nsw i64 %or6, 1400714785074694791

+  %xor = xor i64 %add, %mul7

+  store i64 %xor, i64* @b, align 8, !tbaa !11

+  %mul8 = mul nsw i64 %xor, 1400714785074694791

+  br label %if.end

+

+if.else:                                          ; preds = %entry

+  %6 = load i64* @b, align 8, !tbaa !11

+  %xor10 = xor i64 %6, -4417276706812531889

+  %mul11 = mul nsw i64 %xor10, 400714785074694791

+  br label %if.end

+

+if.end:                                           ; preds = %if.else, %if.then

+  %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]

+  %storemerge = add i64 %storemerge.in, -8796714831421723037

+  store i64 %storemerge, i64* @b, align 8, !tbaa !11

+  ret i64 undef

+}

+

+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

+

+!llvm.ident = !{!0}

+

+!0 = metadata !{metadata !"clang version 3.6 (trunk 219587)"}

+!1 = metadata !{metadata !2, metadata !2, i64 0}

+!2 = metadata !{metadata !"int", metadata !3, i64 0}

+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}

+!4 = metadata !{metadata !"Simple C/C++ TBAA"}

+!5 = metadata !{metadata !6, metadata !2, i64 0}

+!6 = metadata !{metadata !"XXH_state64_t", metadata !2, i64 0, metadata !2, i64 4, metadata !7, i64 8, metadata !7, i64 16, metadata !7, i64 24}

+!7 = metadata !{metadata !"long long", metadata !3, i64 0}

+!8 = metadata !{metadata !6, metadata !7, i64 16}

+!9 = metadata !{metadata !6, metadata !7, i64 24}

+!10 = metadata !{metadata !6, metadata !7, i64 8}

+!11 = metadata !{metadata !7, metadata !7, i64 0}


diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll
index a973bef..e6c1e1a 100644
--- a/test/CodeGen/X86/fp-load-trunc.ll
+++ b/test/CodeGen/X86/fp-load-trunc.ll

@@ -2,57 +2,87 @@
 ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
 
 define <1 x float> @test1(<1 x double>* %p) nounwind {
-; CHECK: test1
-; CHECK: cvtsd2ss
-; CHECK: ret
-; AVX:   test1
-; AVX:   vcvtsd2ss
-; AVX:   ret
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movsd (%eax), %xmm0
+; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    pushl %eax
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT:    vmovsd (%eax), %xmm0
+; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovss %xmm0, (%esp)
+; AVX-NEXT:    flds (%esp)
+; AVX-NEXT:    popl %eax
+; AVX-NEXT:    retl
   %x = load <1 x double>* %p
   %y = fptrunc <1 x double> %x to <1 x float>
   ret <1 x float> %y
 }
 
 define <2 x float> @test2(<2 x double>* %p) nounwind {
-; CHECK: test2
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: ret
-; AVX:   test2
-; AVX:   vcvtpd2psx {{[0-9]*}}(%{{.*}})
-; AVX:   ret
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cvtpd2ps (%eax), %xmm0
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT:    vcvtpd2psx (%eax), %xmm0
+; AVX-NEXT:    retl
   %x = load <2 x double>* %p
   %y = fptrunc <2 x double> %x to <2 x float>
   ret <2 x float> %y
 }
 
 define <4 x float> @test3(<4 x double>* %p) nounwind {
-; CHECK: test3
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: movlhps
-; CHECK: ret
-; AVX:   test3
-; AVX:   vcvtpd2psy {{[0-9]*}}(%{{.*}})
-; AVX:   ret
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cvtpd2ps 16(%eax), %xmm1
+; CHECK-NEXT:    cvtpd2ps (%eax), %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT:    vcvtpd2psy (%eax), %xmm0
+; AVX-NEXT:    retl
   %x = load <4 x double>* %p
   %y = fptrunc <4 x double> %x to <4 x float>
   ret <4 x float> %y
 }
 
 define <8 x float> @test4(<8 x double>* %p) nounwind {
-; CHECK: test4
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: movlhps
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
-; CHECK: movlhps
-; CHECK: ret
-; AVX:   test4
-; AVX:   vcvtpd2psy
-; AVX:   vcvtpd2psy
-; AVX:   vinsertf128
-; AVX:   ret
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cvtpd2ps 16(%eax), %xmm1
+; CHECK-NEXT:    cvtpd2ps (%eax), %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    cvtpd2ps 48(%eax), %xmm2
+; CHECK-NEXT:    cvtpd2ps 32(%eax), %xmm1
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT:    vcvtpd2psy (%eax), %xmm0
+; AVX-NEXT:    vcvtpd2psy 32(%eax), %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retl
   %x = load <8 x double>* %p
   %y = fptrunc <8 x double> %x to <8 x float>
   ret <8 x float> %y

diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 25442fc..6424bfc 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll

@@ -2,55 +2,77 @@
 ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
 
 define <1 x float> @test1(<1 x double> %x) nounwind {
-; CHECK: test1
-; CHECK: cvtsd2ss
-; CHECK: ret
-; AVX:   test1
-; AVX:   vcvtsd2ss
-; AVX:   ret
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movsd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    pushl %eax
+; AVX-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0
+; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovss %xmm0, (%esp)
+; AVX-NEXT:    flds (%esp)
+; AVX-NEXT:    popl %eax
+; AVX-NEXT:    retl
   %y = fptrunc <1 x double> %x to <1 x float>
   ret <1 x float> %y
 }
 
 define <2 x float> @test2(<2 x double> %x) nounwind {
-; CHECK: test2
-; CHECK: cvtpd2ps
-; CHECK: ret
-; AVX:   test2
-; AVX-NOT:  vcvtpd2psy
-; AVX:   vcvtpd2ps
-; AVX:   ret
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0
+; AVX-NEXT:    retl
   %y = fptrunc <2 x double> %x to <2 x float>
   ret <2 x float> %y
 }
 
 define <4 x float> @test3(<4 x double> %x) nounwind {
-; CHECK: test3
-; CHECK: cvtpd2ps
-; CHECK: cvtpd2ps
-; CHECK: movlhps
-; CHECK: ret
-; AVX:   test3
-; AVX:   vcvtpd2psy
-; AVX:   ret
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
+; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcvtpd2psy %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retl
   %y = fptrunc <4 x double> %x to <4 x float>
   ret <4 x float> %y
 }
 
 define <8 x float> @test4(<8 x double> %x) nounwind {
-; CHECK: test4
-; CHECK: cvtpd2ps
-; CHECK: cvtpd2ps
-; CHECK: movlhps
-; CHECK: cvtpd2ps
-; CHECK: cvtpd2ps
-; CHECK: movlhps
-; CHECK: ret
-; AVX:   test4
-; AVX:   vcvtpd2psy
-; AVX:   vcvtpd2psy
-; AVX:   vinsertf128
-; AVX:   ret
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
+; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    cvtpd2ps %xmm3, %xmm3
+; CHECK-NEXT:    cvtpd2ps %xmm2, %xmm1
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT:    retl
+;
+; AVX-LABEL: test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcvtpd2psy %ymm0, %xmm0
+; AVX-NEXT:    vcvtpd2psy %ymm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retl
   %y = fptrunc <8 x double> %x to <8 x float>
   ret <8 x float> %y
 }

diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
new file mode 100644
index 0000000..dfc59a3
--- /dev/null
+++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll

@@ -0,0 +1,71 @@
+; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -no-integrated-as
+
+@g1 = global double 0.000000e+00, align 8
+@g2 = global i32 0, align 4
+
+define void @_Z16fpuop_arithmeticjj(i32, i32) { ; reduced CFG; every switch condition is undef
+entry:
+  switch i32 undef, label %sw.bb.i1921 [
+  ]
+
+sw.bb261:                                         ; No predecessors!
+  unreachable
+
+sw.bb.i1921:                                      ; preds = %entry
+  switch i32 undef, label %if.end511 [
+    i32 1, label %sw.bb27.i
+  ]
+
+sw.bb27.i:                                        ; preds = %sw.bb.i1921
+  %conv.i.i1923 = fpext float undef to x86_fp80
+  br label %if.end511
+
+if.end511:                                        ; preds = %sw.bb27.i, %sw.bb.i1921
+  %src.sroa.0.0.src.sroa.0.0.2280 = phi x86_fp80 [ %conv.i.i1923, %sw.bb27.i ], [ undef, %sw.bb.i1921 ]
+  switch i32 undef, label %sw.bb992 [
+    i32 3, label %sw.bb735
+    i32 18, label %if.end41.i2210
+  ]
+
+sw.bb735:                                         ; preds = %if.end511
+  %2 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280) ; real use: phi value tied to the {st} output
+  unreachable
+
+if.end41.i2210:                                   ; preds = %if.end511
+  call void @llvm.dbg.value(metadata !{x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280}, i64 0, metadata !20, metadata !{metadata !"0x102"}) ; debug-only use of the same phi value; NOTE(review): presumably the case this test guards, confirm
+  unreachable
+
+sw.bb992:                                         ; preds = %if.end511
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!24, !25}
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !21, metadata !2} ; [ DW_TAG_compile_unit ] [x87stackifier/fpu_ieee.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"fpu_ieee.cpp", metadata !"x87stackifier"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00fpuop_arithmetic\00fpuop_arithmetic\00_Z16fpuop_arithmeticjj\0011\000\001\000\006\00256\001\0013", metadata !5, metadata !6, metadata !7, null, void (i32, i32)* @_Z16fpuop_arithmeticjj, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 13] [fpuop_arithmetic]
+!5 = metadata !{metadata !"f1.cpp", metadata !"x87stackifier"}
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [x87stackifier/f1.cpp]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9, metadata !9}
+!9 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!10 = metadata !{metadata !11, metadata !12, metadata !13, metadata !18, metadata !20}
+!11 = metadata !{metadata !"0x101\00\0016777227\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11]
+!12 = metadata !{metadata !"0x101\00\0033554443\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 11]
+!13 = metadata !{metadata !"0x100\00x\0014\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [x] [line 14]
+!14 = metadata !{metadata !"0x16\00fpu_extended\003\000\000\000\000", metadata !5, null, metadata !15} ; [ DW_TAG_typedef ] [fpu_extended] [line 3, size 0, align 0, offset 0] [from fpu_register]
+!15 = metadata !{metadata !"0x16\00fpu_register\002\000\000\000\000", metadata !5, null, metadata !16} ; [ DW_TAG_typedef ] [fpu_register] [line 2, size 0, align 0, offset 0] [from uae_f64]
+!16 = metadata !{metadata !"0x16\00uae_f64\001\000\000\000\000", metadata !5, null, metadata !17} ; [ DW_TAG_typedef ] [uae_f64] [line 1, size 0, align 0, offset 0] [from double]
+!17 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!18 = metadata !{metadata !"0x100\00a\0015\000", metadata !4, metadata !6, metadata !19} ; [ DW_TAG_auto_variable ] [a] [line 15]
+!19 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!20 = metadata !{metadata !"0x100\00value\0016\000", metadata !4, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [value] [line 16]
+!21 = metadata !{metadata !22, metadata !23}
+!22 = metadata !{metadata !"0x34\00g1\00g1\00\005\000\001", null, metadata !6, metadata !14, double* @g1, null} ; [ DW_TAG_variable ] [g1] [line 5] [def]
+!23 = metadata !{metadata !"0x34\00g2\00g2\00\006\000\001", null, metadata !6, metadata !19, i32* @g2, null} ; [ DW_TAG_variable ] [g2] [line 6] [def]
+!24 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!25 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/frameaddr.ll b/test/CodeGen/X86/frameaddr.ll
index 6c1ca25..452c8e5 100644
--- a/test/CodeGen/X86/frameaddr.ll
+++ b/test/CodeGen/X86/frameaddr.ll

@@ -2,6 +2,8 @@
 ; RUN: llc < %s -march=x86    -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-32
 ; RUN: llc < %s -march=x86-64                             | FileCheck %s --check-prefix=CHECK-64
 ; RUN: llc < %s -march=x86-64 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=x86_64-gnux32                    | FileCheck %s --check-prefix=CHECK-X32ABI
+; RUN: llc < %s -mtriple=x86_64-gnux32 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-X32ABI
 
 define i8* @test1() nounwind {
 entry:
@@ -17,6 +19,12 @@
 ; CHECK-64-NEXT:  movq %rbp, %rax
 ; CHECK-64-NEXT:  pop
 ; CHECK-64-NEXT:  ret
+; CHECK-X32ABI-LABEL: test1
+; CHECK-X32ABI:       pushq %rbp
+; CHECK-X32ABI-NEXT:  movl %esp, %ebp
+; CHECK-X32ABI-NEXT:  movl %ebp, %eax
+; CHECK-X32ABI-NEXT:  popq %rbp
+; CHECK-X32ABI-NEXT:  ret
   %0 = tail call i8* @llvm.frameaddress(i32 0)
   ret i8* %0
 }
@@ -37,6 +45,13 @@
 ; CHECK-64-NEXT:  movq (%rax), %rax
 ; CHECK-64-NEXT:  pop
 ; CHECK-64-NEXT:  ret
+; CHECK-X32ABI-LABEL: test2
+; CHECK-X32ABI:       pushq %rbp
+; CHECK-X32ABI-NEXT:  movl %esp, %ebp
+; CHECK-X32ABI-NEXT:  movl (%ebp), %eax
+; CHECK-X32ABI-NEXT:  movl (%eax), %eax
+; CHECK-X32ABI-NEXT:  popq %rbp
+; CHECK-X32ABI-NEXT:  ret
   %0 = tail call i8* @llvm.frameaddress(i32 2)
   ret i8* %0
 }

diff --git a/test/CodeGen/X86/gcc_except_table_functions.ll b/test/CodeGen/X86/gcc_except_table_functions.ll
new file mode 100644
index 0000000..4a81680
--- /dev/null
+++ b/test/CodeGen/X86/gcc_except_table_functions.ll

@@ -0,0 +1,53 @@
+; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+; This test demonstrates that it is possible to use functions for typeinfo
+; instead of global variables. While __gxx_personality_v0 would never know what
+; to do with them, other EH schemes such as SEH might use them.
+
+declare i32 @__gxx_personality_v0(...)
+declare void @filt0()
+declare void @filt1()
+declare void @_Z1fv()
+declare i32 @llvm.eh.typeid.for(i8*)
+
+define i32 @main() uwtable {
+entry:
+  invoke void @_Z1fv()
+          to label %try.cont unwind label %lpad
+
+try.cont:
+  ret i32 0
+
+lpad:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (void ()* @filt0 to i8*)
+          catch i8* bitcast (void ()* @filt1 to i8*)
+  %sel = extractvalue { i8*, i32 } %0, 1
+  %id0 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt0 to i8*))
+  %is_f0 = icmp eq i32 %sel, %id0
+  br i1 %is_f0, label %try.cont, label %check_f1
+
+check_f1:
+  %id1 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt1 to i8*))
+  %is_f1 = icmp eq i32 %sel, %id1
+  br i1 %is_f1, label %try.cont, label %eh.resume
+
+eh.resume:
+  resume { i8*, i32 } %0
+}
+
+; CHECK-LABEL: main:
+; CHECK: .cfi_startproc
+; CHECK: .cfi_personality 3, __gxx_personality_v0
+; CHECK: .cfi_lsda 3, .Lexception0
+; CHECK: .cfi_def_cfa_offset 16
+; CHECK: callq _Z1fv
+; CHECK: retq
+; CHECK: cmpl $2, %edx
+; CHECK: je
+; CHECK: cmpl $1, %edx
+; CHECK: je
+; CHECK: callq _Unwind_Resume
+; CHECK: .cfi_endproc
+; CHECK: GCC_except_table0:
+; CHECK: Lexception0:

diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index c763f39..fa1169d 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll

@@ -53,21 +53,20 @@
 
 
 ; _Complex long long const G4 = 34;
-@G4 = unnamed_addr constant {i64,i64} { i64 34, i64 0 }
+@G4 = private unnamed_addr constant {i64,i64} { i64 34, i64 0 }
 
 ; DARWIN: .section        __TEXT,__literal16,16byte_literals
-; DARWIN: _G4:
+; DARWIN: L_G4:
 ; DARWIN:     .long 34
 
 ; DARWIN-STATIC: .section        __TEXT,__literal16,16byte_literals
-; DARWIN-STATIC: _G4:
+; DARWIN-STATIC: L_G4:
 ; DARWIN-STATIC:     .long 34
 
 ; DARWIN64: .section        __TEXT,__literal16,16byte_literals
-; DARWIN64: _G4:
+; DARWIN64: L_G4:
 ; DARWIN64:     .quad 34
 
-
 ; int G5 = 47;
 @G5 = global i32 47
 
@@ -194,3 +193,23 @@
 ; WIN32-SECTIONS: L_G14:
 ; WIN32-SECTIONS:        .asciz  "foo"
 
+; G15 cannot be merged on MachO, but can be on other formats.
+@G15 = unnamed_addr constant i64 0
+
+; LINUX: .section        .rodata.cst8,"aM",@progbits,8
+; LINUX: G15:
+
+; DARWIN: .section      __TEXT,__const
+; DARWIN: _G15:
+
+; DARWIN-STATIC: .section       __TEXT,__const
+; DARWIN-STATIC: _G15:
+
+; DARWIN64: .section       __TEXT,__const
+; DARWIN64: _G15:
+
+; LINUX-SECTIONS: .section      .rodata.G15,"aM",@progbits,8
+; LINUX-SECTIONS: G15:
+
+; WIN32-SECTIONS: .section      .rdata,"rd",one_only,_G15
+; WIN32-SECTIONS: _G15:

diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
new file mode 100644
index 0000000..1dcf939
--- /dev/null
+++ b/test/CodeGen/X86/half.ll

@@ -0,0 +1,69 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LIBCALL
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-F16C
+
+define void @test_load_store(half* %in, half* %out) {
+; CHECK-LABEL: test_load_store:
+; CHECK: movw (%rdi), [[TMP:%[a-z0-9]+]]
+; CHECK: movw [[TMP]], (%rsi)
+  %val = load half* %in
+  store half %val, half* %out
+  ret void
+}
+
+define i16 @test_bitcast_from_half(half* %addr) {
+; CHECK-LABEL: test_bitcast_from_half:
+; CHECK: movzwl (%rdi), %eax
+  %val = load half* %addr
+  %val_int = bitcast half %val to i16
+  ret i16 %val_int
+}
+
+define void @test_bitcast_to_half(half* %addr, i16 %in) {
+; CHECK-LABEL: test_bitcast_to_half:
+; CHECK: movw %si, (%rdi)
+  %val_fp = bitcast i16 %in to half
+  store half %val_fp, half* %addr
+  ret void
+}
+
+define float @test_extend32(half* %addr) {
+; CHECK-LABEL: test_extend32:
+; half->float: tail call to the runtime helper without F16C, inline vcvtph2ps with it.
+; CHECK-LIBCALL: jmp __gnu_h2f_ieee
+; CHECK-F16C: vcvtph2ps
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to float
+  ret float %val32
+}
+
+define double @test_extend64(half* %addr) {
+; CHECK-LABEL: test_extend64:
+; half->double goes through float: half->float (helper call or vcvtph2ps), then float->double.
+; CHECK-LIBCALL: callq __gnu_h2f_ieee
+; CHECK-LIBCALL: cvtss2sd
+; CHECK-F16C: vcvtph2ps
+; CHECK-F16C: vcvtss2sd
+  %val16 = load half* %addr
+  %val32 = fpext half %val16 to double
+  ret double %val32
+}
+
+define void @test_trunc32(float %in, half* %addr) {
+; CHECK-LABEL: test_trunc32:
+; float->half: call to the runtime helper without F16C, inline vcvtps2ph with it.
+; CHECK-LIBCALL: callq __gnu_f2h_ieee
+; CHECK-F16C: vcvtps2ph
+  %val16 = fptrunc float %in to half
+  store half %val16, half* %addr
+  ret void
+}
+
+define void @test_trunc64(double %in, half* %addr) {
+; CHECK-LABEL: test_trunc64:
+; double->half is expected to be a __truncdfhf2 call in both configurations, with or without F16C.
+; CHECK-LIBCALL: callq __truncdfhf2
+; CHECK-F16C: callq __truncdfhf2
+  %val16 = fptrunc double %in to half
+  store half %val16, half* %addr
+  ret void
+}

diff --git a/test/CodeGen/X86/i8-umulo.ll b/test/CodeGen/X86/i8-umulo.ll
deleted file mode 100644
index ba846f3..0000000
--- a/test/CodeGen/X86/i8-umulo.ll
+++ /dev/null

@@ -1,24 +0,0 @@
-; RUN: llc -mcpu=generic -march=x86 < %s | FileCheck %s
-; PR19858
-
-declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
-define i8 @testumulo(i32 %argc) {
-; CHECK: imulw
-; CHECK: testb %{{.+}}, %{{.+}}
-; CHECK: je [[NOOVERFLOWLABEL:.+]]
-; CHECK: {{.*}}[[NOOVERFLOWLABEL]]:
-; CHECK-NEXT: movb
-; CHECK-NEXT: retl
-top:
-  %RHS = trunc i32 %argc to i8
-  %umul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 25, i8 %RHS)
-  %ex = extractvalue { i8, i1 } %umul, 1
-  br i1 %ex, label %overflow, label %nooverlow
-
-overflow:
-  ret i8 %RHS
-
-nooverlow:
-  %umul.value = extractvalue { i8, i1 } %umul, 0
-  ret i8 %umul.value
-}

diff --git a/test/CodeGen/X86/inalloca-regparm.ll b/test/CodeGen/X86/inalloca-regparm.ll
new file mode 100644
index 0000000..9dd916b
--- /dev/null
+++ b/test/CodeGen/X86/inalloca-regparm.ll

@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=i686-windows-msvc < %s -o /dev/null
+; RUN: not llc -mtriple=x86_64-windows-msvc %s -o /dev/null 2>&1 | FileCheck %s
+
+; This will compile successfully on x86 but not x86_64, because %b will become a
+; register parameter.
+
+declare x86_thiscallcc i32 @f(i32 %a, i32* inalloca %b)
+define void @g() {
+  %b = alloca inalloca i32
+  store i32 2, i32* %b
+  call x86_thiscallcc i32 @f(i32 0, i32* inalloca %b)
+  ret void
+}
+
+; CHECK: cannot use inalloca attribute on a register parameter

diff --git a/test/CodeGen/X86/inline-asm-fpstack.ll b/test/CodeGen/X86/inline-asm-fpstack.ll
index 91c477b..bb3778a 100644
--- a/test/CodeGen/X86/inline-asm-fpstack.ll
+++ b/test/CodeGen/X86/inline-asm-fpstack.ll

@@ -340,3 +340,65 @@
   %0 = tail call i32 asm "fcomi $2, $1; pushf; pop $0", "=r,{st},{st(1)},~{dirflag},~{fpsr},~{flags}"(double 2.000000e+00, double 2.000000e+00) nounwind
   ret i32 %0
 }
+
+; <rdar://problem/16952634>
+; X87 stackifier asserted when there was an ST register defined by an
+; inline-asm instruction and the ST register was live across another
+; inline-asm instruction.
+;
+; INLINEASM <es:frndint> [sideeffect] [attdialect], $0:[regdef], %ST0<imp-def,tied5>, $1:[reguse tiedto:$0], %ST0<tied3>, $2:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; INLINEASM <es:fldcw $0> [sideeffect] [mayload] [attdialect], $0:[mem], %EAX<undef>, 1, %noreg, 0, %noreg, $1:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
+; %FP0<def> = COPY %ST0
+
+; CHECK-LABEL: _test_live_st
+; CHECK: ## InlineAsm Start
+; CHECK: frndint
+; CHECK: ## InlineAsm End
+; CHECK: ## InlineAsm Start
+; CHECK: fldcw
+; CHECK: ## InlineAsm End
+
+%struct.fpu_t = type { [8 x x86_fp80], x86_fp80, %struct.anon1, %struct.anon2, i32, i8, [15 x i8] }
+%struct.anon1 = type { i32, i32, i32 }
+%struct.anon2 = type { i32, i32, i32, i32 }
+
+@fpu = external global %struct.fpu_t, align 16
+
+; Function Attrs: ssp
+define void @test_live_st(i32 %a1) {
+entry:
+  %0 = load x86_fp80* undef, align 16
+  %cond = icmp eq i32 %a1, 1
+  br i1 %cond, label %sw.bb4.i, label %_Z5tointRKe.exit
+
+sw.bb4.i:
+  %1 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %0)
+  call void asm sideeffect "fldcw $0", "*m,~{dirflag},~{fpsr},~{flags}"(i32* undef)
+  br label %_Z5tointRKe.exit
+
+_Z5tointRKe.exit:
+  %result.0.i = phi x86_fp80 [ %1, %sw.bb4.i ], [ %0, %entry ]
+  %conv.i1814 = fptosi x86_fp80 %result.0.i to i32
+  %conv626 = sitofp i32 %conv.i1814 to x86_fp80
+  store x86_fp80 %conv626, x86_fp80* getelementptr inbounds (%struct.fpu_t* @fpu, i32 0, i32 1)
+  br label %return
+
+return:
+  ret void
+}
+
+; Check that x87 stackifier is correctly rewriting FP registers to ST registers.
+;
+; CHECK-LABEL: _test_operand_rewrite
+; CHECK: ## InlineAsm Start
+; CHECK: foo %st(0), %st(1)
+; CHECK: ## InlineAsm End
+
+define double @test_operand_rewrite() {
+entry:
+  %0 = tail call { double, double } asm sideeffect "foo $0, $1", "={st},={st(1)},~{dirflag},~{fpsr},~{flags}"()
+  %asmresult = extractvalue { double, double } %0, 0
+  %asmresult1 = extractvalue { double, double } %0, 1
+  %sub = fsub double %asmresult, %asmresult1
+  ret double %sub
+}

diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index d417453..dfa8aed 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll

@@ -284,7 +284,7 @@
 define i32 @func_test1(i32 %p1) nounwind uwtable {
 entry:
 ; CHECK-LABEL: func_test1:
-; CHECK: testb
+; CHECK: andb
 ; CHECK: j
 ; CHECK: ret
   %0 = load i32* @b, align 4

diff --git a/test/CodeGen/X86/jump_table_alias.ll b/test/CodeGen/X86/jump_table_alias.ll
index f3691fd..2062200 100644
--- a/test/CodeGen/X86/jump_table_alias.ll
+++ b/test/CodeGen/X86/jump_table_alias.ll

@@ -5,7 +5,7 @@
   ret i32 0
 }
 
-@i = alias internal i32 ()* @f
+@i = internal alias i32 ()* @f
 @j = alias i32 ()* @f
 
 define i32 @main(i32 %argc, i8** %argv) {
@@ -25,7 +25,6 @@
 ; There should only be one table, even though there are two GlobalAliases,
 ; because they both alias the same value.
 
-; CHECK:         .globl  __llvm_jump_instr_table_0_1
 ; CHECK:         .align  8, 0x90
 ; CHECK:         .type   __llvm_jump_instr_table_0_1,@function
 ; CHECK: __llvm_jump_instr_table_0_1:

diff --git a/test/CodeGen/X86/jump_table_align.ll b/test/CodeGen/X86/jump_table_align.ll
new file mode 100644
index 0000000..6ad48d1
--- /dev/null
+++ b/test/CodeGen/X86/jump_table_align.ll

@@ -0,0 +1,29 @@
+; RUN: llc -filetype=obj <%s -jump-table-type=single -o %t1
+; RUN: llvm-objdump -triple=x86_64-unknown-linux-gnu -d %t1 | FileCheck %s
+target triple = "x86_64-unknown-linux-gnu"
+define i32 @f() unnamed_addr jumptable {
+  ret i32 0
+}
+
+define i32 @g(i8* %a) unnamed_addr jumptable {
+  ret i32 0
+}
+
+define void @h(void ()* %func) unnamed_addr jumptable {
+  ret void
+}
+
+define i32 @main() {
+  %g = alloca i32 (...)*, align 8
+  store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8
+  %1 = load i32 (...)** %g, align 8
+  %call = call i32 (...)* %1()
+  call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*))
+  %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null)
+  ret i32 %a
+}
+
+; Make sure that the padding from getJumpInstrTableEntryBound is right.
+; CHECK: __llvm_jump_instr_table_0_1:
+; CHECK-NEXT: e9 00 00 00 00                                  jmp     0
+; CHECK-NEXT: 0f 1f 00                                        nopl    (%rax)

diff --git a/test/CodeGen/X86/jump_table_bitcast.ll b/test/CodeGen/X86/jump_table_bitcast.ll
index 33a798f..749b77a 100644
--- a/test/CodeGen/X86/jump_table_bitcast.ll
+++ b/test/CodeGen/X86/jump_table_bitcast.ll

@@ -15,12 +15,12 @@
 define i32 @main() {
   %g = alloca i32 (...)*, align 8
   store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8
-; CHECK: movq    $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]], (%rsp)
-; CHECK: movl    $__llvm_jump_instr_table_0_[[ENTRY]], %ecx
+; CHECK: movq    $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]],
+; CHECK: movl    $__llvm_jump_instr_table_0_[[ENTRY]],
   %1 = load i32 (...)** %g, align 8
   %call = call i32 (...)* %1()
   call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*))
-; CHECK: movl    $__llvm_jump_instr_table_0_{{1|2|3}}, %edi
+; CHECK: movl    $__llvm_jump_instr_table_0_{{1|2|3}},
 ; CHECK: callq   h
 
   %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null)
@@ -28,17 +28,14 @@
   ret i32 %a
 }
 
-; CHECK:         .globl  __llvm_jump_instr_table_0_1
 ; CHECK:         .align  8, 0x90
 ; CHECK:         .type   __llvm_jump_instr_table_0_1,@function
 ; CHECK: __llvm_jump_instr_table_0_1:
 ; CHECK:         jmp     {{f|g|h}}@PLT
-; CHECK:         .globl  __llvm_jump_instr_table_0_2
 ; CHECK:         .align  8, 0x90
 ; CHECK:         .type   __llvm_jump_instr_table_0_2,@function
 ; CHECK: __llvm_jump_instr_table_0_2:
 ; CHECK:         jmp     {{f|g|h}}@PLT
-; CHECK:         .globl  __llvm_jump_instr_table_0_3
 ; CHECK:         .align  8, 0x90
 ; CHECK:         .type   __llvm_jump_instr_table_0_3,@function
 ; CHECK: __llvm_jump_instr_table_0_3:

diff --git a/test/CodeGen/X86/jump_tables.ll b/test/CodeGen/X86/jump_tables.ll
index 5a0aed0..485154e 100644
--- a/test/CodeGen/X86/jump_tables.ll
+++ b/test/CodeGen/X86/jump_tables.ll

@@ -7,6 +7,20 @@
 
 %struct.fun_struct = type { i32 (...)* }
 
+@a = global [12 x i32 () *] [ i32 ()* bitcast (void ()* @indirect_fun to i32 ()*),
+			      i32 ()* bitcast (void ()* @indirect_fun_match to i32 ()*),
+			      i32 ()* bitcast (i32 ()* @indirect_fun_i32 to i32 ()*),
+			      i32 ()* bitcast (i32 (i32)* @indirect_fun_i32_1 to i32 ()*),
+			      i32 ()* bitcast (i32 (i32, i32)* @indirect_fun_i32_2 to i32 ()*),
+			      i32 ()* bitcast (i32* (i32*, i32)* @indirect_fun_i32S_2 to i32 ()*),
+			      i32 ()* bitcast (void (%struct.fun_struct)* @indirect_fun_struct to i32 ()*),
+			      i32 ()* bitcast (void (i32 (...)*, i32)* @indirect_fun_fun to i32 ()*),
+			      i32 ()* bitcast (i32 (i32 (...)*, i32)* @indirect_fun_fun_ret to i32 ()*),
+			      i32 ()* bitcast (void ([19 x i8])* @indirect_fun_array to i32 ()*),
+			      i32 ()* bitcast (void (<3 x i32>)* @indirect_fun_vec to i32 ()*),
+			      i32 ()* bitcast (void (<4 x float>)* @indirect_fun_vec_2 to i32 ()*)
+			    ]
+
 define void @indirect_fun() unnamed_addr jumptable {
   ret void
 }
@@ -74,62 +88,50 @@
   ret i32 %a
 }
 
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_1
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_1,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_1:
 ; SINGLE-DAG:         jmp     indirect_fun_array@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_2
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_2,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_2:
 ; SINGLE-DAG:         jmp     indirect_fun_i32_2@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_3
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_3,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_3:
 ; SINGLE-DAG:         jmp     indirect_fun_vec_2@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_4
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_4,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_4:
 ; SINGLE-DAG:         jmp     indirect_fun_i32S_2@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_5
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_5,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_5:
 ; SINGLE-DAG:         jmp     indirect_fun_struct@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_6
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_6,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_6:
 ; SINGLE-DAG:         jmp     indirect_fun_i32_1@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_7
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_7,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_7:
 ; SINGLE-DAG:         jmp     indirect_fun_i32@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_8
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_8,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_8:
 ; SINGLE-DAG:         jmp     indirect_fun_fun@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_9
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_9,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_9:
 ; SINGLE-DAG:         jmp     indirect_fun_fun_ret@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_10
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_10,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_10:
 ; SINGLE-DAG:         jmp     indirect_fun@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_11
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_11,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_11:
 ; SINGLE-DAG:         jmp     indirect_fun_match@PLT
-; SINGLE-DAG:         .globl  __llvm_jump_instr_table_0_12
 ; SINGLE-DAG:         .align  8, 0x90
 ; SINGLE-DAG:         .type   __llvm_jump_instr_table_0_12,@function
 ; SINGLE-DAG: __llvm_jump_instr_table_0_12:
@@ -144,82 +146,69 @@
 ; SINGLE-DAG:         ud2
 
 
-; ARITY-DAG:         .globl  __llvm_jump_instr_table_2_1
 ; ARITY-DAG:         .align  8, 0x90
 ; ARITY-DAG:         .type   __llvm_jump_instr_table_2_1,@function
 ; ARITY-DAG: __llvm_jump_instr_table_2_1:
 ; ARITY-DAG:         jmp     indirect_fun{{.*}}@PLT
 ; ARITY-DAG:         .align  8, 0x90
 ; ARITY-DAG:         ud2
-; ARITY-DAG:         .globl  __llvm_jump_instr_table_0_1
 ; ARITY-DAG:         .align  8, 0x90
 ; ARITY-DAG:         .type   __llvm_jump_instr_table_0_1,@function
 ; ARITY-DAG: __llvm_jump_instr_table_0_1:
 ; ARITY-DAG:         jmp     indirect_fun{{.*}}@PLT
-; ARITY-DAG:         .globl  __llvm_jump_instr_table_1_1
 ; ARITY-DAG:         .align  8, 0x90
 ; ARITY-DAG:         .type   __llvm_jump_instr_table_1_1,@function
 ; ARITY-DAG: __llvm_jump_instr_table_1_1:
 ; ARITY-DAG:         jmp     indirect_fun{{.*}}@PLT
 
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_2_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_2_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_2_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         ud2
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_0_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_0_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_0_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_1_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_1_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_1_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_3_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_3_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_3_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
-; SIMPL-DAG:         .globl  __llvm_jump_instr_table_4_1
 ; SIMPL-DAG:         .align  8, 0x90
 ; SIMPL-DAG:         .type   __llvm_jump_instr_table_4_1,@function
 ; SIMPL-DAG: __llvm_jump_instr_table_4_1:
 ; SIMPL-DAG:         jmp     indirect_fun{{.*}}@PLT
 
 
-; FULL-DAG:        .globl  __llvm_jump_instr_table_10_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_10_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_10_1:
 ; FULL-DAG:        jmp     indirect_fun_i32_1@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_9_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_9_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_9_1:
 ; FULL-DAG:        jmp     indirect_fun_i32_2@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_7_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_7_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_7_1:
 ; FULL-DAG:        jmp     indirect_fun_i32S_2@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_3_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_3_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_3_1:
 ; FULL-DAG:        jmp     indirect_fun_vec_2@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_2_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_2_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_2_1:
@@ -228,42 +217,36 @@
 ; FULL-DAG:        ud2
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_8_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_8_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_8_1:
 ; FULL-DAG:        jmp     indirect_fun_i32@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_1_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_1_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_1_1:
 ; FULL-DAG:        jmp     indirect_fun_array@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_0_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_0_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_0_1:
 ; FULL-DAG:        jmp     indirect_fun_vec@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_6_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_6_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_6_1:
 ; FULL-DAG:        jmp     indirect_fun_struct@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_5_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_5_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_5_1:
 ; FULL-DAG:        jmp     indirect_fun_fun@PLT
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        ud2
-; FULL-DAG:        .globl  __llvm_jump_instr_table_4_1
 ; FULL-DAG:        .align  8, 0x90
 ; FULL-DAG:        .type   __llvm_jump_instr_table_4_1,@function
 ; FULL-DAG:__llvm_jump_instr_table_4_1:

diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll
index 82cefb7..6fb3879 100644
--- a/test/CodeGen/X86/lea-2.ll
+++ b/test/CodeGen/X86/lea-2.ll

@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl -x86-asm-syntax=intel | FileCheck %s
 
 define i32 @test1(i32 %A, i32 %B) {
   %tmp1 = shl i32 %A, 2

diff --git a/test/CodeGen/X86/lea-3.ll b/test/CodeGen/X86/lea-3.ll
index c439ee1..a56403a 100644
--- a/test/CodeGen/X86/lea-3.ll
+++ b/test/CodeGen/X86/lea-3.ll

@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
 
 ; CHECK: leaq (,[[A0:%rdi|%rcx]],4), %rax

diff --git a/test/CodeGen/X86/lea-4.ll b/test/CodeGen/X86/lea-4.ll
index cef4726..00c2278 100644
--- a/test/CodeGen/X86/lea-4.ll
+++ b/test/CodeGen/X86/lea-4.ll

@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
+
 
 define zeroext i16 @t1(i32 %on_off) nounwind {
 entry:

diff --git a/test/CodeGen/X86/lea-5.ll b/test/CodeGen/X86/lea-5.ll
new file mode 100644
index 0000000..50d3aaf
--- /dev/null
+++ b/test/CodeGen/X86/lea-5.ll

@@ -0,0 +1,59 @@
+; test for more complicated forms of lea operands which can be generated
+; in loop optimized cases.
+; See also http://llvm.org/bugs/show_bug.cgi?id=20016
+
+; RUN: llc < %s -mtriple=x86_64-linux -O2        | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O2 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-nacl -O2 | FileCheck %s -check-prefix=X32
+
+; Function Attrs: nounwind readnone uwtable
+define void @foo(i32 %x, i32 %d) #0 {
+entry:
+  %a = alloca [8 x i32], align 16
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ]
+  %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0
+
+; CHECK: leaq	-40(%rsp,%r{{[^,]*}},4), %rax
+; X32:   leal	-40(%rsp,%r{{[^,]*}},4), %eax
+  %0 = load i32* %arrayidx, align 4
+  %cmp1 = icmp eq i32 %0, 0
+  %inc = add nsw i32 %d.addr.0, 1
+
+; CHECK: leaq	4(%r{{[^,]*}}), %r{{[^,]*}}
+; X32:   leal	4(%r{{[^,]*}}), %e{{[^,]*}}
+  br i1 %cmp1, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+; The same test as above but with enforced stack realignment (%a aligned by 64)
+; to check one more case of correct lea generation.
+
+; Function Attrs: nounwind readnone uwtable
+define void @bar(i32 %x, i32 %d) #0 {
+entry:
+  %a = alloca [8 x i32], align 64
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ]
+  %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0
+
+; CHECK: leaq	(%rsp,%r{{[^,]*}},4), %rax
+; X32:   leal	(%rsp,%r{{[^,]*}},4), %eax
+  %0 = load i32* %arrayidx, align 4
+  %cmp1 = icmp eq i32 %0, 0
+  %inc = add nsw i32 %d.addr.0, 1
+
+; CHECK: leaq	4(%r{{[^,]*}}), %r{{[^,]*}}
+; X32:   leal	4(%r{{[^,]*}}), %e{{[^,]*}}
+  br i1 %cmp1, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+

diff --git a/test/CodeGen/X86/lea.ll b/test/CodeGen/X86/lea.ll
index 93cfe46..9b6632c 100644
--- a/test/CodeGen/X86/lea.ll
+++ b/test/CodeGen/X86/lea.ll

@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
 ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
 
 define i32 @test1(i32 %x) nounwind {
         %tmp1 = shl i32 %x, 3

diff --git a/test/CodeGen/X86/long-extend.ll b/test/CodeGen/X86/long-extend.ll
deleted file mode 100644
index 5bbd41d..0000000
--- a/test/CodeGen/X86/long-extend.ll
+++ /dev/null

@@ -1,18 +0,0 @@
-; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0| FileCheck %s
-define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind {
-; CHECK-LABEL: test_long_extend
-; CHECK: vpunpcklbw	%xmm1, %xmm0, [[REG1:%xmm[0-9]+]]
-; CHECK: vpunpckhwd	%xmm1, [[REG1]], [[REG2:%xmm[0-9]+]]
-; CHECK: vpunpcklwd	%xmm1, [[REG1]], %x[[REG3:mm[0-9]+]]
-; CHECK: vinsertf128	$1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]]
-; CHECK: vpunpckhbw	%xmm1, %xmm0, [[REG4:%xmm[0-9]+]]
-; CHECK: vpunpckhwd	%xmm1, [[REG4]], [[REG5:%xmm[0-9]+]]
-; CHECK: vpunpcklwd	%xmm1, [[REG4]], %x[[REG6:mm[0-9]+]]
-; CHECK: vinsertf128	$1, [[REG5]], %y[[REG6]], [[REG_result1:%ymm[0-9]+]]
-; CHECK: vmovaps	[[REG_result1]], 32(%rdi)
-; CHECK: vmovaps	[[REG_result0]], (%rdi)
-
-  %tmp = zext <16 x i8> %a to <16 x i32>
-  store <16 x i32> %tmp, <16 x i32>*%p
-  ret void
-}

diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll
index 1d04276..c36047c 100644
--- a/test/CodeGen/X86/loop-strength-reduce8.ll
+++ b/test/CodeGen/X86/loop-strength-reduce8.ll

@@ -1,6 +1,9 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
 
-; CHECK: leal 16(%eax), %edx
+; FIXME: The first two instructions, movl and addl, should have been combined to
+; "leal 16(%eax), %edx" by the backend (PR20776).
+; CHECK: movl    %eax, %edx
+; CHECK: addl    $16, %edx
 ; CHECK: align
 ; CHECK: addl    $4, %edx
 ; CHECK: decl    %ecx

diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
index f47161e..edb8433e 100644
--- a/test/CodeGen/X86/lower-bitcast.ll
+++ b/test/CodeGen/X86/lower-bitcast.ll

@@ -68,13 +68,13 @@
   %2 = bitcast <2 x i32> %add to i64
   ret i64 %2
 }
-; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd.
+; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
 ; Ideally, we should fold that sequence into a single paddd. This is fixed with
 ; the widening legalization.
 ;
 ; CHECK-LABEL: test4
 ; CHECK: pshufd
-; CHECK-NEXT: paddq
+; CHECK-NEXT: paddd
 ; CHECK-NEXT: pshufd
 ; CHECK: ret
 ;

diff --git a/test/CodeGen/X86/mem-intrin-base-reg.ll b/test/CodeGen/X86/mem-intrin-base-reg.ll
new file mode 100644
index 0000000..dd7f396
--- /dev/null
+++ b/test/CodeGen/X86/mem-intrin-base-reg.ll

@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=i686-windows -mattr=+sse2 < %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+; There is a conflict between lowering the X86 memory intrinsics and the "base"
+; register used to address stack locals.  See X86RegisterInfo::hasBaseRegister
+; for when this is necessary. Typically, we choose ESI for the base register,
+; which all of the X86 string instructions use.
+
+; The pattern of vector icmp and extractelement is used in these tests because
+; it forces creation of an aligned stack temporary. Perhaps such temporaries
+; shouldn't be aligned.
+
+declare void @escape_vla_and_icmp(i8*, i1 zeroext)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)
+
+define i32 @memcpy_novla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
+  br i1 %cond, label %spill_vectors, label %no_vectors
+
+no_vectors:
+  ret i32 0
+
+spill_vectors:
+  %vp1 = getelementptr <4 x i32>* %vp0, i32 1
+  %v0 = load <4 x i32>* %vp0
+  %v1 = load <4 x i32>* %vp1
+  %vicmp = icmp slt <4 x i32> %v0, %v1
+  %icmp = extractelement <4 x i1> %vicmp, i32 0
+  call void @escape_vla_and_icmp(i8* null, i1 zeroext %icmp)
+  %r = extractelement <4 x i32> %v0, i32 0
+  ret i32 %r
+}
+
+; CHECK-LABEL: _memcpy_novla_vector:
+; CHECK: andl $-16, %esp
+; CHECK-DAG: movl $32, %ecx
+; CHECK-DAG: movl {{.*}}, %esi
+; CHECK-DAG: movl {{.*}}, %edi
+; CHECK: rep;movsl
+
+define i32 @memcpy_vla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
+  br i1 %cond, label %spill_vectors, label %no_vectors
+
+no_vectors:
+  ret i32 0
+
+spill_vectors:
+  %vp1 = getelementptr <4 x i32>* %vp0, i32 1
+  %v0 = load <4 x i32>* %vp0
+  %v1 = load <4 x i32>* %vp1
+  %vicmp = icmp slt <4 x i32> %v0, %v1
+  %icmp = extractelement <4 x i1> %vicmp, i32 0
+  %vla = alloca i8, i32 %n
+  call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp)
+  %r = extractelement <4 x i32> %v0, i32 0
+  ret i32 %r
+}
+
+; CHECK-LABEL: _memcpy_vla_vector:
+; CHECK: andl $-16, %esp
+; CHECK: movl %esp, %esi
+; CHECK: movl $128, {{.*}}(%esp)
+; CHECK: calll _memcpy
+; CHECK: calll __chkstk
+
+; stosd doesn't clobber esi, so we can use it.
+
+define i32 @memset_vla_vector(<4 x i32>* %vp0, i8* %a, i32 %n, i1 zeroext %cond) {
+  call void @llvm.memset.p0i8.i32(i8* %a, i8 42, i32 128, i32 4, i1 false)
+  br i1 %cond, label %spill_vectors, label %no_vectors
+
+no_vectors:
+  ret i32 0
+
+spill_vectors:
+  %vp1 = getelementptr <4 x i32>* %vp0, i32 1
+  %v0 = load <4 x i32>* %vp0
+  %v1 = load <4 x i32>* %vp1
+  %vicmp = icmp slt <4 x i32> %v0, %v1
+  %icmp = extractelement <4 x i1> %vicmp, i32 0
+  %vla = alloca i8, i32 %n
+  call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp)
+  %r = extractelement <4 x i32> %v0, i32 0
+  ret i32 %r
+}
+
+; CHECK-LABEL: _memset_vla_vector:
+; CHECK: andl $-16, %esp
+; CHECK: movl %esp, %esi
+; CHECK-DAG: movl $707406378, %eax        # imm = 0x2A2A2A2A
+; CHECK-DAG: movl $32, %ecx
+; CHECK-DAG: movl {{.*}}, %edi
+; CHECK-NOT: movl {{.*}}, %esi
+; CHECK: rep;stosl
+
+; Add a test for memcmp if we ever add a special lowering for it.

diff --git a/test/CodeGen/X86/mem-promote-integers.ll b/test/CodeGen/X86/mem-promote-integers.ll
index 0015df0..ea38b95 100644
--- a/test/CodeGen/X86/mem-promote-integers.ll
+++ b/test/CodeGen/X86/mem-promote-integers.ll

@@ -1,8 +1,8 @@
 ; Test the basic functionality of integer element promotions of different types.
 ; This tests checks passing of arguments, loading and storing to memory and
 ; basic arithmetic.
-; RUN: llc -march=x86 < %s
-; RUN: llc -march=x86-64 < %s
+; RUN: llc -march=x86 < %s > /dev/null
+; RUN: llc -march=x86-64 < %s > /dev/null
 
 define <1 x i8> @test_1xi8(<1 x i8> %x, <1 x i8>* %b) {
   %bb = load <1 x i8>* %b

diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll
index 3ea6512..5454b7c 100644
--- a/test/CodeGen/X86/misched-matmul.ll
+++ b/test/CodeGen/X86/misched-matmul.ll

@@ -10,7 +10,7 @@
 ; more complex cases.
 ;
 ; CHECK: @wrap_mul4
-; CHECK: 22 regalloc - Number of spills inserted
+; CHECK: 23 regalloc - Number of spills inserted
 
 define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 {
 entry:

diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index 71b0723..96c5dbb 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll

@@ -3,40 +3,58 @@
 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64
 
 define i32 @test1() nounwind readonly {
+; X32-LABEL: test1:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl %gs:196, %eax
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test1:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq %gs:320, %rax
+; X64-NEXT:    movl (%rax), %eax
+; X64-NEXT:    retq
 entry:
 	%tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31)		; <i32*> [#uses=1]
 	%tmp1 = load i32* %tmp		; <i32> [#uses=1]
 	ret i32 %tmp1
 }
-; X32-LABEL: test1:
-; X32: 	movl	%gs:196, %eax
-; X32: 	movl	(%eax), %eax
-; X32: 	ret
-
-; X64-LABEL: test1:
-; X64: 	movq	%gs:320, %rax
-; X64: 	movl	(%rax), %eax
-; X64: 	ret
 
 define i64 @test2(void (i8*)* addrspace(256)* %tmp8) nounwind {
+; X32-LABEL: test2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    calll *%gs:(%eax)
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    addl $12, %esp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    {{(subq.*%rsp|pushq)}}
+; X64-NEXT:    callq *%gs:(%{{(rcx|rdi)}})
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    {{(addq.*%rsp|popq)}}
+; X64-NEXT:    retq
 entry:
   %tmp9 = load void (i8*)* addrspace(256)* %tmp8, align 8
   tail call void %tmp9(i8* undef) nounwind optsize
   ret i64 0
 }
 
-; rdar://8453210
-; X32-LABEL: test2:
-; X32: movl	{{.*}}(%esp), %eax
-; X32: calll	*%gs:(%eax)
-
-; X64-LABEL: test2:
-; X64: callq	*%gs:([[A0:%rdi|%rcx]])
-
-
-
-
 define <2 x i64> @pmovsxwd_1(i64 addrspace(256)* %p) nounwind readonly {
+; X32-LABEL: pmovsxwd_1:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pmovsxwd %gs:(%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pmovsxwd_1:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    pmovsxwd %gs:(%{{(rcx|rdi)}}), %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = load i64 addrspace(256)* %p
   %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0
@@ -44,20 +62,26 @@
   %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone
   %3 = bitcast <4 x i32> %2 to <2 x i64>
   ret <2 x i64> %3
-  
-; X32-LABEL: pmovsxwd_1:
-; X32: 	movl	4(%esp), %eax
-; X32: 	pmovsxwd	%gs:(%eax), %xmm0
-; X32: 	ret
-
-; X64-LABEL: pmovsxwd_1:
-; X64:	pmovsxwd	%gs:([[A0]]), %xmm0
-; X64:	ret
 }
 
 ; The two loads here both look identical to selection DAG, except for their
 ; address spaces.  Make sure they aren't CSE'd.
 define i32 @test_no_cse() nounwind readonly {
+; X32-LABEL: test_no_cse:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl %gs:196, %eax
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    movl %fs:196, %ecx
+; X32-NEXT:    addl (%ecx), %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_no_cse:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq %gs:320, %rax
+; X64-NEXT:    movl (%rax), %eax
+; X64-NEXT:    movq %fs:320, %rcx
+; X64-NEXT:    addl (%rcx), %eax
+; X64-NEXT:    retq
 entry:
 	%tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31)		; <i32*> [#uses=1]
 	%tmp1 = load i32* %tmp		; <i32> [#uses=1]
@@ -66,9 +90,5 @@
 	%tmp4 = add i32 %tmp1, %tmp3
 	ret i32 %tmp4
 }
-; X32-LABEL: test_no_cse:
-; X32: 	movl	%gs:196
-; X32: 	movl	%fs:196
-; X32: 	ret
 
 declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone

diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
index 6910515..f0bdbba 100644
--- a/test/CodeGen/X86/ms-inline-asm.ll
+++ b/test/CodeGen/X86/ms-inline-asm.ll

@@ -110,7 +110,7 @@
 entry:
   %val = alloca i32, align 64
   store i32 -1, i32* %val, align 64
-  call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val) #1
+  call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val)
   %sp = load i32* %val, align 64
   ret i32 %sp
 ; CHECK-LABEL: t31:
@@ -125,3 +125,12 @@
 ; CHECK: movl (%esp), %eax
 ; CHECK: ret
 }
+
+declare hidden void @other_func()
+
+define void @naked() #0 {
+  call void asm sideeffect inteldialect "call dword ptr $0", "*m,~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{esp},~{ebp},~{dirflag},~{fpsr},~{flags}"(void()* @other_func)
+  unreachable
+}
+
+attributes #0 = { naked }

diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
new file mode 100644
index 0000000..1e99c14
--- /dev/null
+++ b/test/CodeGen/X86/musttail-varargs.ll

@@ -0,0 +1,119 @@
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX
+; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS
+
+; Test that we actually spill and reload all arguments in the variadic argument
+; pack. Doing a normal call will clobber all argument registers, and we will
+; spill around it. A simple adjustment should not require any XMM spills.
+
+declare void(i8*, ...)* @get_f(i8* %this)
+
+define void @f_thunk(i8* %this, ...) {
+  %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this)
+  musttail call void (i8*, ...)* %fptr(i8* %this, ...)
+  ret void
+}
+
+; Save and restore 6 GPRs, 8 XMMs, and AL around the call.
+
+; LINUX-LABEL: f_thunk:
+; LINUX-DAG: movq %rdi, {{.*}}
+; LINUX-DAG: movq %rsi, {{.*}}
+; LINUX-DAG: movq %rdx, {{.*}}
+; LINUX-DAG: movq %rcx, {{.*}}
+; LINUX-DAG: movq %r8, {{.*}}
+; LINUX-DAG: movq %r9, {{.*}}
+; LINUX-DAG: movb %al, {{.*}}
+; LINUX-DAG: movaps %xmm0, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm1, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm2, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm3, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm4, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm5, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm6, {{[0-9]*}}(%rsp)
+; LINUX-DAG: movaps %xmm7, {{[0-9]*}}(%rsp)
+; LINUX: callq get_f
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm0
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm1
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm2
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm3
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm4
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm5
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm6
+; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm7
+; LINUX-DAG: movq {{.*}}, %rdi
+; LINUX-DAG: movq {{.*}}, %rsi
+; LINUX-DAG: movq {{.*}}, %rdx
+; LINUX-DAG: movq {{.*}}, %rcx
+; LINUX-DAG: movq {{.*}}, %r8
+; LINUX-DAG: movq {{.*}}, %r9
+; LINUX-DAG: movb {{.*}}, %al
+; LINUX: jmpq *{{.*}}  # TAILCALL
+
+; WINDOWS-LABEL: f_thunk:
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq %rdx, {{.*}}
+; WINDOWS-DAG: movq %rcx, {{.*}}
+; WINDOWS-DAG: movq %r8, {{.*}}
+; WINDOWS-DAG: movq %r9, {{.*}}
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: callq get_f
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS-DAG: movq {{.*}}, %rdx
+; WINDOWS-DAG: movq {{.*}}, %rcx
+; WINDOWS-DAG: movq {{.*}}, %r8
+; WINDOWS-DAG: movq {{.*}}, %r9
+; WINDOWS-NOT: mov{{.}}ps
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+
+; This thunk shouldn't require any spills and reloads, assuming the register
+; allocator knows what it's doing.
+
+define void @g_thunk(i8* %fptr_i8, ...) {
+  %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)*
+  musttail call void (i8*, ...)* %fptr(i8* %fptr_i8, ...)
+  ret void
+}
+
+; LINUX-LABEL: g_thunk:
+; LINUX-NOT: movq
+; LINUX: jmpq *%rdi  # TAILCALL
+
+; WINDOWS-LABEL: g_thunk:
+; WINDOWS-NOT: movq
+; WINDOWS: jmpq *%rcx # TAILCALL
+
+; Do a simple multi-exit multi-bb test.
+
+%struct.Foo = type { i1, i8*, i8* }
+
+@g = external global i32
+
+define void @h_thunk(%struct.Foo* %this, ...) {
+  %cond_p = getelementptr %struct.Foo* %this, i32 0, i32 0
+  %cond = load i1* %cond_p
+  br i1 %cond, label %then, label %else
+
+then:
+  %a_p = getelementptr %struct.Foo* %this, i32 0, i32 1
+  %a_i8 = load i8** %a_p
+  %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)*
+  musttail call void (%struct.Foo*, ...)* %a(%struct.Foo* %this, ...)
+  ret void
+
+else:
+  %b_p = getelementptr %struct.Foo* %this, i32 0, i32 2
+  %b_i8 = load i8** %b_p
+  %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)*
+  store i32 42, i32* @g
+  musttail call void (%struct.Foo*, ...)* %b(%struct.Foo* %this, ...)
+  ret void
+}
+
+; LINUX-LABEL: h_thunk:
+; LINUX: jne
+; LINUX: jmpq *{{.*}} # TAILCALL
+; LINUX: jmpq *{{.*}} # TAILCALL
+; WINDOWS-LABEL: h_thunk:
+; WINDOWS: jne
+; WINDOWS: jmpq *{{.*}} # TAILCALL
+; WINDOWS: jmpq *{{.*}} # TAILCALL

diff --git a/test/CodeGen/X86/nancvt.ll b/test/CodeGen/X86/nancvt.ll
index 8036710..8a665fa 100644
--- a/test/CodeGen/X86/nancvt.ll
+++ b/test/CodeGen/X86/nancvt.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc > %t
+; RUN: opt < %s -O3 | llc > %t
 ; RUN: grep 2147027116 %t | count 3
 ; RUN: grep 2147228864 %t | count 3
 ; RUN: grep 2146502828 %t | count 3

diff --git a/test/CodeGen/X86/narrow-shl-load.ll b/test/CodeGen/X86/narrow-shl-load.ll
index 30387925..5175bfc 100644
--- a/test/CodeGen/X86/narrow-shl-load.ll
+++ b/test/CodeGen/X86/narrow-shl-load.ll

@@ -30,40 +30,6 @@
   ret void
 }
 
-
-; DAGCombiner shouldn't fold the sdiv (ashr) away.
-; rdar://8636812
-; CHECK-LABEL: test2:
-; CHECK:   sarl
-
-define i32 @test2() nounwind {
-entry:
-  %i = alloca i32, align 4
-  %j = alloca i8, align 1
-  store i32 127, i32* %i, align 4
-  store i8 0, i8* %j, align 1
-  %tmp3 = load i32* %i, align 4
-  %mul = mul nsw i32 %tmp3, 2
-  %conv4 = trunc i32 %mul to i8
-  %conv5 = sext i8 %conv4 to i32
-  %div6 = sdiv i32 %conv5, 2
-  %conv7 = trunc i32 %div6 to i8
-  %conv9 = sext i8 %conv7 to i32
-  %cmp = icmp eq i32 %conv9, -1
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  ret i32 0
-
-if.end:                                           ; preds = %entry
-  call void @abort() noreturn
-  unreachable
-}
-
-declare void @abort() noreturn
-
-declare void @exit(i32) noreturn
-
 ; DAG Combiner can't fold this into a load of the 1'th byte.
 ; PR8757
 define i32 @test3(i32 *%P) nounwind ssp {

diff --git a/test/CodeGen/X86/nonconst-static-ev.ll b/test/CodeGen/X86/nonconst-static-ev.ll
index f852cae..5449791 100644
--- a/test/CodeGen/X86/nonconst-static-ev.ll
+++ b/test/CodeGen/X86/nonconst-static-ev.ll

@@ -1,6 +1,5 @@
 ; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t
 ; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
-; REQUIRES: shell
 
 @0 = global i8 extractvalue ([1 x i8] select (i1 ptrtoint (i32* @1 to i1), [1 x i8] [ i8 1 ], [1 x i8] [ i8 2 ]), 0)
 @1 = external global i32

diff --git a/test/CodeGen/X86/nonconst-static-iv.ll b/test/CodeGen/X86/nonconst-static-iv.ll
index 8fad39b..30613ef 100644
--- a/test/CodeGen/X86/nonconst-static-iv.ll
+++ b/test/CodeGen/X86/nonconst-static-iv.ll

@@ -1,6 +1,5 @@
 ; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t
 ; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
-; REQUIRES: shell
 
 @0 = global i8 insertvalue( { i8 } select (i1 ptrtoint (i32* @1 to i1), { i8 } { i8 1 }, { i8 } { i8 2 }), i8 0, 0)
 @1 = external global i32

diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
new file mode 100644
index 0000000..9d0cb9a
--- /dev/null
+++ b/test/CodeGen/X86/nontemporal-2.ll

@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+
+
+; Make sure that we generate non-temporal stores for the test cases below.
+
+define void @test1(<4 x float>* %dst) {
+; CHECK-LABEL: test1:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test2(<4 x i32>* %dst) {
+; CHECK-LABEL: test2:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test3(<2 x double>* %dst) {
+; CHECK-LABEL: test3:
+; SSE: movntps
+; AVX: vmovntps
+  store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+!1 = metadata !{i32 1}

diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll
index fa77fcb..b559729 100644
--- a/test/CodeGen/X86/null-streamer.ll
+++ b/test/CodeGen/X86/null-streamer.ll

@@ -14,16 +14,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!11, !13}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !" ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""}
+!0 = metadata !{metadata !"0x11\004\00 \001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"", metadata !"", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* null, null, null, metadata !2, i32 2}
-!5 = metadata !{i32 786473, metadata !1}
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null}
+!4 = metadata !{metadata !"0x2e\00\00\00\002\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 ()* null, null, null, metadata !2} ; [ DW_TAG_subprogram ]
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!8 = metadata !{metadata !"0x24\00\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"_ZL1i", metadata !5, i32 1, metadata !8, i32 1, i32 1, null, null}
+!10 = metadata !{metadata !"0x34\00i\00i\00_ZL1i\001\001\001", null, metadata !5, metadata !8, null, null} ; [ DW_TAG_variable ]
 !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/object-size.ll b/test/CodeGen/X86/object-size.ll
index ec35d29..0610f0b 100644
--- a/test/CodeGen/X86/object-size.ll
+++ b/test/CodeGen/X86/object-size.ll

@@ -1,4 +1,4 @@
-; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s -check-prefix=X64
+; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s
 
 ; ModuleID = 'ts.c'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -12,8 +12,8 @@
   %tmp = load i8** @p                             ; <i8*> [#uses=1]
   %0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp, i1 0) ; <i64> [#uses=1]
   %cmp = icmp ne i64 %0, -1                       ; <i1> [#uses=1]
-; X64: movabsq $-1, [[RAX:%r..]]
-; X64: cmpq    $-1, [[RAX]]
+; CHECK: movq $-1, [[RAX:%r..]]
+; CHECK: cmpq $-1, [[RAX]]
   br i1 %cmp, label %cond.true, label %cond.false
 
 cond.true:                                        ; preds = %entry

diff --git a/test/CodeGen/X86/osx-private-labels.ll b/test/CodeGen/X86/osx-private-labels.ll
index 349ce7d..e30cb48 100644
--- a/test/CodeGen/X86/osx-private-labels.ll
+++ b/test/CodeGen/X86/osx-private-labels.ll

@@ -69,3 +69,20 @@
 ; CHECK: .section	__DATA,__foobar,interposing
 ; CHECK-NEXT: .align	3
 ; CHECK-NEXT: L_private12:
+
+@private13 = private global i32 42, section "__DATA, __objc_classlist, regular, no_dead_strip"
+; CHECK: .section	__DATA,__objc_classlist,regular,no_dead_strip
+; CHECK-NEXT: .align	2
+; CHECK-NEXT: L_private13:
+
+@private14 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_classname,cstring_literals"
+; CHECK: .section	__TEXT,__objc_classname,cstring_literals
+; CHECK-NEXT: L_private14:
+
+@private15 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methname,cstring_literals"
+; CHECK: .section	__TEXT,__objc_methname,cstring_literals
+; CHECK-NEXT: L_private15:
+
+@private16 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methtype,cstring_literals"
+; CHECK: .section	__TEXT,__objc_methtype,cstring_literals
+; CHECK-NEXT: L_private16:

diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll
index ec6564d..3efcc2e 100644
--- a/test/CodeGen/X86/palignr.ll
+++ b/test/CodeGen/X86/palignr.ll

@@ -3,58 +3,127 @@
 
 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test1:
-; CHECK: pshufd
-; CHECK-YONAH: pshufd
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test1:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 >
 	ret <4 x i32> %C
 }
 
 define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test2:
-; CHECK: palignr
-; CHECK-YONAH: shufps
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test2:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; CHECK-YONAH-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 3, i32 4 >
 	ret <4 x i32> %C
 }
 
 define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test3:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test3:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 undef, i32 4 >
 	ret <4 x i32> %C
 }
 
 define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test4:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test4:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; CHECK-YONAH-NEXT:    movapd %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
 	ret <4 x i32> %C
 }
 
 define <4 x float> @test5(<4 x float> %A, <4 x float> %B) nounwind {
 ; CHECK-LABEL: test5:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test5:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; CHECK-YONAH-NEXT:    movapd %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
 	ret <4 x float> %C
 }
 
 define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind {
 ; CHECK-LABEL: test6:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test6:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; CHECK-YONAH-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; CHECK-YONAH-NEXT:    por %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 >
 	ret <8 x i16> %C
 }
 
 define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind {
 ; CHECK-LABEL: test7:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test7:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-YONAH-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
+; CHECK-YONAH-NEXT:    por %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 >
 	ret <8 x i16> %C
 }
 
 define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind {
 ; CHECK-LABEL: test8:
-; CHECK: palignr
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test8:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
+; CHECK-YONAH-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; CHECK-YONAH-NEXT:    por %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 >
 	ret <16 x i8> %C
 }
@@ -65,8 +134,19 @@
 ; was an UNDEF.)
 define <8 x i16> @test9(<8 x i16> %A, <8 x i16> %B) nounwind {
 ; CHECK-LABEL: test9:
-; CHECK-NOT: palignr
-; CHECK: pshufb
+; CHECK:       # BB#0:
+; CHECK-NEXT:    palignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-YONAH-LABEL: test9:
+; CHECK-YONAH:       # BB#0:
+; CHECK-YONAH-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; CHECK-YONAH-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
+; CHECK-YONAH-NEXT:    por %xmm0, %xmm1
+; CHECK-YONAH-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-YONAH-NEXT:    retl
   %C = shufflevector <8 x i16> %B, <8 x i16> %A, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 >
 	ret <8 x i16> %C
 }

diff --git a/test/CodeGen/X86/patchpoint-invoke.ll b/test/CodeGen/X86/patchpoint-invoke.ll
new file mode 100644
index 0000000..192cacc
--- /dev/null
+++ b/test/CodeGen/X86/patchpoint-invoke.ll

@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=x86_64-unknown-linux -mcpu=corei7                             < %s | FileCheck %s
+
+; Test invoking of patchpoints
+;
+define i64 @patchpoint_invoke(i64 %p1, i64 %p2) {
+entry:
+; CHECK-LABEL: patchpoint_invoke:
+; CHECK-NEXT: .cfi_startproc
+; CHECK:      [[FUNC_BEGIN:.L.*]]:
+; CHECK:      .cfi_lsda 3, [[EXCEPTION_LABEL:.L[^ ]*]]
+; CHECK:      pushq %rbp
+
+; Unfortunately, hardcode the name of the label that begins the patchpoint:
+; CHECK:      .Ltmp0:
+; CHECK:      movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK-NEXT: xchgw %ax, %ax
+; CHECK-NEXT: [[PP_END:.L.*]]:
+; CHECK:      ret
+  %resolveCall = inttoptr i64 -559038736 to i8*
+  %result = invoke i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall, i32 1, i64 %p1, i64 %p2)
+            to label %success unwind label %threw
+
+success:
+  ret i64 %result
+
+threw:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* null
+  ret i64 0
+}
+
+; Verify that the exception table was emitted:
+; CHECK:      [[EXCEPTION_LABEL]]:
+; CHECK-NEXT: .byte 255
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 21
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 13
+; Verify that the unwind data covers the entire patchpoint region:
+; CHECK-NEXT: .long .Ltmp0-[[FUNC_BEGIN]]
+; CHECK-NEXT: .long [[PP_END]]-.Ltmp0
+
+
+; Verify that the stackmap section got emitted:
+; CHECK-LABEL: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 0
+; CHECK-NEXT:   .short 0
+; Num Functions
+; CHECK-NEXT:   .long 1
+; Num LargeConstants
+; CHECK-NEXT:   .long 0
+; Num Callsites
+; CHECK-NEXT:   .long 1
+; CHECK-NEXT:   .quad patchpoint_invoke
+
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+declare i32 @__gxx_personality_v0(...)

diff --git a/test/CodeGen/X86/patchpoint-webkit_jscc.ll b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
new file mode 100644
index 0000000..5e76bf8
--- /dev/null
+++ b/test/CodeGen/X86/patchpoint-webkit_jscc.ll

@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7                             < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
+
+; Test the webkit_jscc calling convention.
+; One argument will be passed in a register, the other will be pushed on the stack.
+; The return value is in %rax.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK:      Ltmp
+; CHECK:      movq %r{{.+}}, (%rsp)
+; CHECK:      movq %r{{.+}}, %rax
+; CHECK:      Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK:      movq %rax, (%rsp)
+; CHECK:      callq
+; FAST-LABEL: jscall_patchpoint_codegen:
+; FAST:       Ltmp
+; FAST:       movq %r{{.+}}, (%rsp)
+; FAST:       movq %r{{.+}}, %rax
+; FAST:       Ltmp
+; FAST-NEXT:  movabsq $-559038736, %r11
+; FAST-NEXT:  callq *%r11
+; FAST:       movq %rax, (%rsp)
+; FAST:       callq
+  %resolveCall2 = inttoptr i64 -559038736 to i8*
+  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+  %resolveCall3 = inttoptr i64 -559038737 to i8*
+  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+  ret void
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK:      Ltmp
+; CHECK:      movq $6, 24(%rsp)
+; CHECK-NEXT: movl $4, 16(%rsp)
+; CHECK-NEXT: movq $2, (%rsp)
+; CHECK:      Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; FAST-LABEL: jscall_patchpoint_codegen2:
+; FAST:       Ltmp
+; FAST:       movq $2, (%rsp)
+; FAST-NEXT:  movl $4, 16(%rsp)
+; FAST-NEXT:  movq $6, 24(%rsp)
+; FAST:       Ltmp
+; FAST-NEXT:  movabsq $-559038736, %r11
+; FAST-NEXT:  callq *%r11
+  %call = inttoptr i64 -559038736 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+  ret i64 %result
+}
+
+; Test if the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK:      Ltmp
+; CHECK:      movq $10, 48(%rsp)
+; CHECK-NEXT: movl  $8, 36(%rsp)
+; CHECK-NEXT: movq  $6, 24(%rsp)
+; CHECK-NEXT: movl  $4, 16(%rsp)
+; CHECK-NEXT: movq  $2, (%rsp)
+; CHECK:      Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; FAST-LABEL: jscall_patchpoint_codegen3:
+; FAST:       Ltmp
+; FAST:       movq  $2, (%rsp)
+; FAST-NEXT:  movl  $4, 16(%rsp)
+; FAST-NEXT:  movq  $6, 24(%rsp)
+; FAST-NEXT:  movl  $8, 36(%rsp)
+; FAST-NEXT:  movq $10, 48(%rsp)
+; FAST:       Ltmp
+; FAST-NEXT:  movabsq $-559038736, %r11
+; FAST-NEXT:  callq *%r11
+  %call = inttoptr i64 -559038736 to i8*
+  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+  ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+

diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
index 62b1273..07148f0 100644
--- a/test/CodeGen/X86/patchpoint.ll
+++ b/test/CodeGen/X86/patchpoint.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7                             < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s
 
 ; Trivial patchpoint codegen
 ;
@@ -38,61 +39,6 @@
   ret void
 }
 
-; Test the webkit_jscc calling convention.
-; One argument will be passed in register, the other will be pushed on the stack.
-; Return value in $rax.
-define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen:
-; CHECK:      Ltmp
-; CHECK:      movq %r{{.+}}, (%rsp)
-; CHECK:      movq %r{{.+}}, %rax
-; CHECK:      Ltmp
-; CHECK-NEXT: movabsq $-559038736, %r11
-; CHECK-NEXT: callq *%r11
-; CHECK:      movq %rax, (%rsp)
-; CHECK:      callq
-  %resolveCall2 = inttoptr i64 -559038736 to i8*
-  %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
-  %resolveCall3 = inttoptr i64 -559038737 to i8*
-  tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
-  ret void
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen2(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen2:
-; CHECK:      Ltmp
-; CHECK:      movq $6, 24(%rsp)
-; CHECK-NEXT: movl $4, 16(%rsp)
-; CHECK-NEXT: movq $2, (%rsp)
-; CHECK:      Ltmp
-; CHECK-NEXT: movabsq $-559038736, %r11
-; CHECK-NEXT: callq *%r11
-  %call = inttoptr i64 -559038736 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
-  ret i64 %result
-}
-
-; Test if the arguments are properly aligned and that we don't store undef arguments.
-define i64 @jscall_patchpoint_codegen3(i64 %callee) {
-entry:
-; CHECK-LABEL: jscall_patchpoint_codegen3:
-; CHECK:      Ltmp
-; CHECK:      movq $10, 48(%rsp)
-; CHECK-NEXT: movl  $8, 36(%rsp)
-; CHECK-NEXT: movq  $6, 24(%rsp)
-; CHECK-NEXT: movl  $4, 16(%rsp)
-; CHECK-NEXT: movq  $2, (%rsp)
-; CHECK:      Ltmp
-; CHECK-NEXT: movabsq $-559038736, %r11
-; CHECK-NEXT: callq *%r11
-  %call = inttoptr i64 -559038736 to i8*
-  %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
-  ret i64 %result
-}
-
 ; Test patchpoints reusing the same TargetConstant.
 ; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
 ; There is no way to verify this, since it depends on memory allocation.
@@ -125,6 +71,17 @@
   ret void
 }
 
+; Test large target address.
+define i64 @large_target_address_patchpoint_codegen() {
+entry:
+; CHECK-LABEL: large_target_address_patchpoint_codegen:
+; CHECK:      movabsq $6153737369414576827, %r11
+; CHECK-NEXT: callq *%r11
+  %resolveCall2 = inttoptr i64 6153737369414576827 to i8*
+  %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 0)
+  ret i64 %result
+}
+
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
 declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)

diff --git a/test/CodeGen/X86/peep-vector-extract-concat.ll b/test/CodeGen/X86/peep-vector-extract-concat.ll
deleted file mode 100644
index f73ebb9..0000000
--- a/test/CodeGen/X86/peep-vector-extract-concat.ll
+++ /dev/null

@@ -1,11 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2,-sse4.1 | FileCheck %s
-; CHECK: pshufd $3, %xmm0, %xmm0
-
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2,-sse4.1 | FileCheck %s -check-prefix=WIN64
-; %a is passed indirectly on Win64.
-; WIN64: movss   12(%rcx), %xmm0
-
-define float @foo(<8 x float> %a) nounwind {
-  %c = extractelement <8 x float> %a, i32 3
-  ret float %c
-}

diff --git a/test/CodeGen/X86/peep-vector-extract-insert.ll b/test/CodeGen/X86/peep-vector-extract-insert.ll
deleted file mode 100644
index f958b6b..0000000
--- a/test/CodeGen/X86/peep-vector-extract-insert.ll
+++ /dev/null

@@ -1,12 +0,0 @@
-; RUN: llc < %s -march=x86-64 | grep "xorps	%xmm0, %xmm0" | count 2
-
-define float @foo(<4 x float> %a) {
-  %b = insertelement <4 x float> %a, float 0.0, i32 3
-  %c = extractelement <4 x float> %b, i32 3
-  ret float %c
-}
-define float @bar(float %a) {
-  %b = insertelement <4 x float> <float 0x400B333340000000, float 4.5, float 0.0, float 0x4022666660000000>, float %a, i32 3
-  %c = extractelement <4 x float> %b, i32 2
-  ret float %c
-}

diff --git a/test/CodeGen/X86/peephole-fold-movsd.ll b/test/CodeGen/X86/peephole-fold-movsd.ll
new file mode 100644
index 0000000..09d9328
--- /dev/null
+++ b/test/CodeGen/X86/peephole-fold-movsd.ll

@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+;
+; Check that x86's peephole optimization doesn't fold a 64-bit load (movsd) into
+; addpd.
+; rdar://problem/18236850
+
+%struct.S1 = type { double, double }
+
+@g = common global %struct.S1 zeroinitializer, align 8
+
+declare void @foo3(%struct.S1*)
+
+; CHECK: movsd {{[0-9]*}}(%rsp), [[R0:%xmm[0-9]+]]
+; CHECK: addpd [[R0]], %xmm{{[0-9]+}}
+
+define void @foo1(double %a.coerce0, double %a.coerce1, double %b.coerce0, double %b.coerce1) {
+  %1 = alloca <2 x double>, align 16
+  %tmpcast = bitcast <2 x double>* %1 to %struct.S1*
+  call void @foo3(%struct.S1* %tmpcast) #2
+  %p2 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 0
+  %2 = load double* %p2, align 16
+  %p3 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 1
+  %3 = load double* %p3, align 8
+  %4 = insertelement <2 x double> undef, double %2, i32 0
+  %5 = insertelement <2 x double> %4, double 0.000000e+00, i32 1
+  %6 = insertelement <2 x double> undef, double %3, i32 1
+  %7 = insertelement <2 x double> %6, double 1.000000e+00, i32 0
+  %8 = fadd <2 x double> %5, %7
+  store <2 x double> %8, <2 x double>* bitcast (%struct.S1* @g to <2 x double>*), align 16
+  ret void
+}

diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 7bf8a61..8937d6a 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll

@@ -1,32 +1,96 @@
-; RUN: llc < %s -march=x86 -mattr=sse4.1 -mcpu=nehalem -stack-alignment=16 > %t
-; RUN: grep pmul %t | count 12
-; RUN: grep mov %t | count 14
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
 
 define <4 x i32> @a(<4 x i32> %i) nounwind  {
-        %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
-        ret <4 x i32> %A
+; SSE2-LABEL: a:
+; SSE2:         movdqa {{.*}}, %[[X1:xmm[0-9]+]]
+; SSE2-NEXT:    pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %[[X1]], %xmm0
+; SSE2-NEXT:    pmuludq %[[X1]], %[[X2]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: a:
+; SSE41:         pmulld
+; SSE41-NEXT:    retq
+entry:
+  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
+  ret <4 x i32> %A
 }
+
 define <2 x i64> @b(<2 x i64> %i) nounwind  {
-        %A = mul <2 x i64> %i, < i64 117, i64 117 >
-        ret <2 x i64> %A
+; ALL-LABEL: b:
+; ALL:         pmuludq
+; ALL:         pmuludq
+; ALL:         pmuludq
+entry:
+  %A = mul <2 x i64> %i, < i64 117, i64 117 >
+  ret <2 x i64> %A
 }
+
 define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind  {
-        %A = mul <4 x i32> %i, %j
-        ret <4 x i32> %A
+; SSE2-LABEL: c:
+; SSE2:         pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %[[X2]], %xmm1
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: c:
+; SSE41:         pmulld
+; SSE41-NEXT:    retq
+entry:
+  %A = mul <4 x i32> %i, %j
+  ret <4 x i32> %A
 }
+
 define <2 x i64> @d(<2 x i64> %i, <2 x i64> %j) nounwind  {
-        %A = mul <2 x i64> %i, %j
-        ret <2 x i64> %A
+; ALL-LABEL: d:
+; ALL:         pmuludq
+; ALL:         pmuludq
+; ALL:         pmuludq
+entry:
+  %A = mul <2 x i64> %i, %j
+  ret <2 x i64> %A
 }
-; Use a call to force spills.
+
 declare void @foo()
+
 define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind  {
-        call void @foo()
-        %A = mul <4 x i32> %i, %j
-        ret <4 x i32> %A
+; SSE2-LABEL: e:
+; SSE2:         movdqa {{[0-9]*}}(%rsp), %xmm0
+; SSE2-NEXT:    pshufd {{.*}} # [[X1:xmm[0-9]+]] = xmm0[1,1,3,3]
+; SSE2-NEXT:    movdqa {{[0-9]*}}(%rsp), %[[X2:xmm[0-9]+]]
+; SSE2-NEXT:    pmuludq %[[X2]], %xmm0
+; SSE2-NEXT:    pshufd {{.*}} # [[X2]] = [[X2]][1,1,3,3]
+; SSE2-NEXT:    pmuludq %[[X1]], %[[X2]]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2]
+; SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    addq ${{[0-9]+}}, %rsp
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: e:
+; SSE41:         pmulld {{[0-9]+}}(%rsp), %xmm
+; SSE41-NEXT:    addq ${{[0-9]+}}, %rsp
+; SSE41-NEXT:    retq
+entry:
+  ; Use a call to force spills.
+  call void @foo()
+  %A = mul <4 x i32> %i, %j
+  ret <4 x i32> %A
 }
+
 define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind  {
-        call void @foo()
-        %A = mul <2 x i64> %i, %j
-        ret <2 x i64> %A
+; ALL-LABEL: f:
+; ALL:         pmuludq
+; ALL:         pmuludq
+; ALL:         pmuludq
+entry:
+  ; Use a call to force spills.
+  call void @foo()
+  %A = mul <2 x i64> %i, %j
+  ret <2 x i64> %A
 }

diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll
index e7e29e0..0bdb0ec 100644
--- a/test/CodeGen/X86/pr11334.ll
+++ b/test/CodeGen/X86/pr11334.ll

@@ -15,7 +15,7 @@
 entry:
 ; CHECK: v3f2d_ext_vec
 ; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
 ; CHECK: cvtps2pd
 ; AVX:   v3f2d_ext_vec
 ; AVX:   vcvtps2pd
@@ -28,7 +28,7 @@
 entry:
 ; CHECK: v4f2d_ext_vec
 ; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
 ; CHECK: cvtps2pd
 ; AVX:   v4f2d_ext_vec
 ; AVX:   vcvtps2pd
@@ -42,9 +42,9 @@
 ; CHECK: v8f2d_ext_vec
 ; CHECK: cvtps2pd
 ; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
 ; CHECK: cvtps2pd
-; CHECK: movhlps
+; CHECK: shufpd
 ; CHECK: cvtps2pd
 ; AVX:   v8f2d_ext_vec
 ; AVX:   vcvtps2pd

diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
deleted file mode 100644
index 024b163..0000000
--- a/test/CodeGen/X86/pr12359.ll
+++ /dev/null

@@ -1,10 +0,0 @@
-; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
-define <16 x i8> @shuf(<16 x i8> %inval1) {
-entry:
-  %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4>
-  ret <16 x i8> %0
-; CHECK: shuf
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: pshufb
-; CHECK-NEXT: ret
-}

diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll
index ff4532e..c2bb8d3 100644
--- a/test/CodeGen/X86/pr14161.ll
+++ b/test/CodeGen/X86/pr14161.ll

@@ -3,6 +3,12 @@
 declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>)
 
 define <2 x i16> @good(<4 x i32>*, <4 x i8>*) {
+; CHECK-LABEL: good:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-NEXT:    pminud {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    pmovzxwq %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %2 = load <4 x i32>* %0, align 16
   %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
@@ -13,13 +19,17 @@
   %8 = bitcast i32 %4 to <2 x i16>
   %9 = bitcast i32 %5 to <2 x i16>
   ret <2 x i16> %8
-; CHECK: good
-; CHECK: pminud
-; CHECK-NEXT: pmovzxwq
-; CHECK: ret
 }
 
 define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) {
+; CHECK-LABEL: bad:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movdqa (%rdi), %xmm0
+; CHECK-NEXT:    pminud {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    pextrd $1, %xmm0, %eax
+; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    pmovzxwq %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %2 = load <4 x i32>* %0, align 16
   %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
@@ -30,9 +40,4 @@
   %8 = bitcast i32 %4 to <2 x i16>
   %9 = bitcast i32 %5 to <2 x i16>
   ret <2 x i16> %9
-; CHECK: bad
-; CHECK: pminud
-; CHECK: pextrd
-; CHECK: pmovzxwq
-; CHECK: ret
 }

diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll
index c8aaf32..b4dc5fd 100644
--- a/test/CodeGen/X86/pr15267.ll
+++ b/test/CodeGen/X86/pr15267.ll

@@ -48,19 +48,22 @@
 
 ; CHECK: test3
 ; CHECK: movzbl
-; CHECK: shrl
-; CHECK: andl $1
-; CHECK: andl $1
-; CHECK: vmovd
-; CHECK: pinsrd $1
-; CHECK: shrl $2
-; CHECK: andl $1
-; CHECK: pinsrd $2
-; CHECK: shrl $3
-; CHECK: andl $1
-; CHECK: pinsrd $3
-; CHECK: pslld
-; CHECK: psrad
-; CHECK: pmovsxdq
-; CHECK: pmovsxdq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: vpunpcklqdq
+; CHECK: movq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: shlq
+; CHECK: sarq
+; CHECK: vmovq
+; CHECK: vpunpcklqdq
+; CHECK: vinsertf128
 ; CHECK: ret

diff --git a/test/CodeGen/X86/pr18846.ll b/test/CodeGen/X86/pr18846.ll
new file mode 100644
index 0000000..27801be
--- /dev/null
+++ b/test/CodeGen/X86/pr18846.ll

@@ -0,0 +1,139 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; pr18846 - needless avx spill/reload
+; Test for unnecessary repeated spills due to eliminateRedundantSpills failing
+; to recognise unaligned ymm load/stores to the stack.
+; Bugpoint reduced testcase.
+
+;CHECK-LABEL: _Z16opt_kernel_cachePfS_S_
+;CHECK-NOT:   vmovups {{.*#+}} 32-byte Folded Spill
+;CHECK-NOT:   vmovups {{.*#+}} 32-byte Folded Reload
+
+; Function Attrs: uwtable
+define void @_Z16opt_kernel_cachePfS_S_() #0 {
+entry:
+  br label %for.body29
+
+for.body29:                                       ; preds = %for.body29, %entry
+  br i1 undef, label %for.body29, label %for.body65
+
+for.body65:                                       ; preds = %for.body29
+  %0 = load float* undef, align 4, !tbaa !1
+  %vecinit7.i4448 = insertelement <8 x float> undef, float %0, i32 7
+  %1 = load float* null, align 4, !tbaa !1
+  %vecinit7.i4304 = insertelement <8 x float> undef, float %1, i32 7
+  %2 = load float* undef, align 4, !tbaa !1
+  %vecinit7.i4196 = insertelement <8 x float> undef, float %2, i32 7
+  %3 = or i64 0, 16
+  %add.ptr111.sum4096 = add i64 %3, 0
+  %4 = load <8 x float>* null, align 16, !tbaa !5
+  %add.ptr162 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr111.sum4096
+  %__v.i4158 = bitcast float* %add.ptr162 to <8 x float>*
+  %5 = load <8 x float>* %__v.i4158, align 16, !tbaa !5
+  %add.ptr158.sum40975066 = or i64 %add.ptr111.sum4096, 8
+  %add.ptr183 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr158.sum40975066
+  %__v.i4162 = bitcast float* %add.ptr183 to <8 x float>*
+  %6 = load <8 x float>* %__v.i4162, align 16, !tbaa !5
+  %add.ptr200.sum40995067 = or i64 undef, 8
+  %add.ptr225 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr200.sum40995067
+  %__v.i4167 = bitcast float* %add.ptr225 to <8 x float>*
+  %7 = load <8 x float>* %__v.i4167, align 4, !tbaa !5
+  %8 = load <8 x float>* undef, align 16, !tbaa !5
+  %add.ptr242.sum41015068 = or i64 0, 8
+  %add.ptr267 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr242.sum41015068
+  %__v.i4171 = bitcast float* %add.ptr267 to <8 x float>*
+  %9 = load <8 x float>* %__v.i4171, align 4, !tbaa !5
+  %mul.i4690 = fmul <8 x float> %7, undef
+  %add.i4665 = fadd <8 x float> undef, undef
+  %mul.i4616 = fmul <8 x float> %8, undef
+  %mul.i4598 = fmul <8 x float> undef, undef
+  %add.i4597 = fadd <8 x float> undef, %mul.i4598
+  %mul.i4594 = fmul <8 x float> %6, undef
+  %add.i4593 = fadd <8 x float> undef, %mul.i4594
+  %mul.i4578 = fmul <8 x float> %9, undef
+  %add.i4577 = fadd <8 x float> %add.i4593, %mul.i4578
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4577) #1
+  %10 = load <8 x float>* null, align 16, !tbaa !5
+  %11 = load <8 x float>* undef, align 16, !tbaa !5
+  %mul.i4564 = fmul <8 x float> %4, undef
+  %add.i4563 = fadd <8 x float> %10, %mul.i4564
+  %mul.i4560 = fmul <8 x float> %5, undef
+  %add.i4559 = fadd <8 x float> %11, %mul.i4560
+  %add.i4547 = fadd <8 x float> %add.i4563, undef
+  %mul.i4546 = fmul <8 x float> %7, undef
+  %add.i4545 = fadd <8 x float> undef, %mul.i4546
+  %mul.i4544 = fmul <8 x float> %8, undef
+  %add.i4543 = fadd <8 x float> %add.i4559, %mul.i4544
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4547) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4545) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4543) #1
+  %add.i4455 = fadd <8 x float> undef, undef
+  %mul.i4454 = fmul <8 x float> undef, undef
+  %add.i4453 = fadd <8 x float> undef, %mul.i4454
+  %mul.i4440 = fmul <8 x float> zeroinitializer, %vecinit7.i4448
+  %add.i4439 = fadd <8 x float> %add.i4455, %mul.i4440
+  %mul.i4438 = fmul <8 x float> %7, %vecinit7.i4448
+  %add.i4437 = fadd <8 x float> %add.i4453, %mul.i4438
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4439) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4437) #1
+  %add.i4413 = fadd <8 x float> zeroinitializer, undef
+  %mul.i4400 = fmul <8 x float> %8, undef
+  %add.i4399 = fadd <8 x float> undef, %mul.i4400
+  %add.i4397 = fadd <8 x float> %add.i4413, zeroinitializer
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4399) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4397) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1
+  %mul.i4330 = fmul <8 x float> %7, undef
+  %add.i4329 = fadd <8 x float> undef, %mul.i4330
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4329) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1
+  %mul.i4312 = fmul <8 x float> %4, undef
+  %add.i4311 = fadd <8 x float> undef, %mul.i4312
+  %mul.i4306 = fmul <8 x float> %6, undef
+  %add.i4305 = fadd <8 x float> undef, %mul.i4306
+  %add.i4295 = fadd <8 x float> %add.i4311, undef
+  %mul.i4294 = fmul <8 x float> %7, %vecinit7.i4304
+  %add.i4293 = fadd <8 x float> undef, %mul.i4294
+  %mul.i4292 = fmul <8 x float> %8, %vecinit7.i4304
+  %add.i4291 = fadd <8 x float> undef, %mul.i4292
+  %mul.i4290 = fmul <8 x float> %9, %vecinit7.i4304
+  %add.i4289 = fadd <8 x float> %add.i4305, %mul.i4290
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4295) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4293) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4291) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4289) #1
+  %12 = load <8 x float>* undef, align 16, !tbaa !5
+  %mul.i4274 = fmul <8 x float> undef, undef
+  %add.i4273 = fadd <8 x float> %12, %mul.i4274
+  %mul.i4258 = fmul <8 x float> %7, undef
+  %add.i4257 = fadd <8 x float> %add.i4273, %mul.i4258
+  %mul.i4254 = fmul <8 x float> %9, undef
+  %add.i4253 = fadd <8 x float> undef, %mul.i4254
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4257) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4253) #1
+  %mul.i = fmul <8 x float> %9, %vecinit7.i4196
+  %add.i = fadd <8 x float> undef, %mul.i
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1
+  call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i) #1
+  unreachable
+}
+
+; Function Attrs: nounwind
+declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) #1
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"float", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !3, metadata !3, i64 0}

diff --git a/test/CodeGen/X86/pr21099.ll b/test/CodeGen/X86/pr21099.ll
new file mode 100644
index 0000000..07292c1
--- /dev/null
+++ b/test/CodeGen/X86/pr21099.ll

@@ -0,0 +1,10 @@
+; RUN: llc < %s -O2 -march=x86-64 -verify-machineinstrs | FileCheck %s
+
+define void @pr21099(i64* %p) {
+; CHECK-LABEL: pr21099
+; CHECK: lock
+; CHECK-NEXT: addq $-2147483648
+; This number is INT32_MIN: 0x80000000UL
+  %1 = atomicrmw add i64* %p, i64 -2147483648 seq_cst
+  ret void
+}

diff --git a/test/CodeGen/X86/pr21529.ll b/test/CodeGen/X86/pr21529.ll
new file mode 100644
index 0000000..655bc84
--- /dev/null
+++ b/test/CodeGen/X86/pr21529.ll

@@ -0,0 +1,15 @@
+; RUN: llc -show-mc-encoding < %s | FileCheck %s
+
+; Test that the direct object emission selects the 'and' variant with an 8-bit
+; immediate.
+; We used to get this wrong when using direct object emission, but not when
+; reading assembly.
+
+; CHECK: andq    $-32, %rsp              # encoding: [0x48,0x83,0xe4,0xe0]
+
+target triple = "x86_64-pc-linux"
+
+define void @f() {
+  %foo = alloca i8, align 32
+  ret void
+}

diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
new file mode 100644
index 0000000..7fc9890
--- /dev/null
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll

@@ -0,0 +1,30 @@
+; RUN: llc < %s -march=x86-64 -mattr=+ssse3 | FileCheck %s
+
+; Test that the pshufb mask comment is correct.
+
+define <16 x i8> @test1(<16 x i8> %V) {
+; CHECK-LABEL: test1:
+; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4]
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 2, i8 0, i8 0, i8 0, i8 0, i8 3, i8 0, i8 0, i8 0, i8 0, i8 4>)
+  ret <16 x i8> %1
+}
+
+; Test that indexes larger than the size of the vector are shown masked (bottom 4 bits).
+
+define <16 x i8> @test2(<16 x i8> %V) {
+; CHECK-LABEL: test2:
+; CHECK: pshufb {{.*}}# xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2]
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 15, i8 0, i8 0, i8 0, i8 0, i8 16, i8 0, i8 0, i8 0, i8 0, i8 17, i8 0, i8 0, i8 0, i8 0, i8 50>)
+  ret <16 x i8> %1
+}
+
+; Test that indexes with bit seven set are shown as zero.
+
+define <16 x i8> @test3(<16 x i8> %V) {
+; CHECK-LABEL: test3:
+; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4]
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 127, i8 0, i8 2, i8 0, i8 0, i8 128, i8 0, i8 3, i8 0, i8 0, i8 255, i8 0, i8 4>)
+  ret <16 x i8> %1
+}
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone

diff --git a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
index d8e4572..49d58f4 100644
--- a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
+++ b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll

@@ -2,10 +2,12 @@
 ; Without the last chance recoloring, this test fails with:
 ; "ran out of registers".
 
-; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0  < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH
+; NOTE: With the fix to PR18883, we don't actually run out of registers here
+; any more, and so those checks are disabled. This test remains only for general coverage.
+; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0  < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH
 ; Test whether failure due to cutoff for depth is reported
 
-; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1  < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF
+; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1  < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF
 ; Test whether failure due to cutoff for interference is reported
 
 ; RUN: llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 -lcr-max-depth=0 -exhaustive-register-search < %s > %t 2>&1

diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
new file mode 100644
index 0000000..83b86ac
--- /dev/null
+++ b/test/CodeGen/X86/recip-fastmath.ll

@@ -0,0 +1,109 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+use-recip-est,+avx -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE
+
+; If the target's divss/divps instructions are substantially
+; slower than rcpss/rcpps with a Newton-Raphson refinement,
+; we should generate the estimate sequence.
+
+; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 )
+; for details about the accuracy, speed, and implementation
+; differences of x86 reciprocal estimates.
+
+define float @reciprocal_estimate(float %x) #0 {
+  %div = fdiv fast float 1.0, %x
+  ret float %div
+
+; CHECK-LABEL: reciprocal_estimate:
+; CHECK: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate:
+; BTVER2: vrcpss
+; BTVER2: vmulss
+; BTVER2: vsubss
+; BTVER2: vmulss
+; BTVER2: vaddss
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate:
+; REFINE: vrcpss
+; REFINE: vmulss
+; REFINE: vsubss
+; REFINE: vmulss
+; REFINE: vaddss
+; REFINE: vmulss
+; REFINE: vsubss
+; REFINE: vmulss
+; REFINE: vaddss
+; REFINE-NEXT: retq
+}
+
+define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v4f32:
+; BTVER2: vrcpps
+; BTVER2: vmulps
+; BTVER2: vsubps
+; BTVER2: vmulps
+; BTVER2: vaddps
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate_v4f32:
+; REFINE: vrcpps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE-NEXT: retq
+}
+
+define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
+  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v8f32:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v8f32:
+; BTVER2: vrcpps
+; BTVER2: vmulps
+; BTVER2: vsubps
+; BTVER2: vmulps
+; BTVER2: vaddps
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate_v8f32:
+; REFINE: vrcpps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE-NEXT: retq
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }

diff --git a/test/CodeGen/X86/return_zeroext_i2.ll b/test/CodeGen/X86/return_zeroext_i2.ll
new file mode 100644
index 0000000..d535b0c
--- /dev/null
+++ b/test/CodeGen/X86/return_zeroext_i2.ll

@@ -0,0 +1,7 @@
+; RUN: llc -mtriple=i386-pc-win32 < %s | FileCheck %s 
+; Check that the testcase does not crash
+define zeroext i2 @crash () {
+  ret i2 0
+}
+; CHECK: xorl	%eax, %eax
+; CHECK-NEXT: retl

diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index b82be41..e34ba54 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll

@@ -1,7 +1,9 @@
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj
 
 ; Just to prevent the alloca from being optimized away
 declare void @dummy_use(i32*, i32)
@@ -61,6 +63,26 @@
 ; X64-NEXT: callq __morestack_allocate_stack_space
 ; X64:      movq %rax, %rdi
 
+; X32ABI-LABEL:      test_basic:
+
+; X32ABI:      cmpl %fs:64, %esp
+; X32ABI-NEXT: ja      .LBB0_2
+
+; X32ABI:      movl $24, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+
+; X32ABI:      movl %esp, %[[EDI:edi|eax]]
+; X32ABI:      subl %{{.*}}, %[[EDI]]
+; X32ABI-NEXT: cmpl %[[EDI]], %fs:64
+
+; X32ABI:      movl %[[EDI]], %esp
+
+; X32ABI:      movl %{{.*}}, %edi
+; X32ABI-NEXT: callq __morestack_allocate_stack_space
+; X32ABI:      movl %eax, %edi
+
 }
 
 attributes #0 = { "split-stack" }

diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 9dab3cd..2db7c11 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll

@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux  -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
@@ -9,6 +10,7 @@
 ; We used to crash with filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj
 ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj
@@ -51,6 +53,16 @@
 ; X64-Linux-NEXT:  callq __morestack
 ; X64-Linux-NEXT:  ret
 
+; X32ABI-LABEL:       test_basic:
+
+; X32ABI:       cmpl %fs:64, %esp
+; X32ABI-NEXT:  ja      .LBB0_2
+
+; X32ABI:       movl $40, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+
 ; X32-Darwin-LABEL:      test_basic:
 
 ; X32-Darwin:      movl $432, %ecx
@@ -129,6 +141,16 @@
 ; X64-Linux-NEXT:  ret
 ; X64-Linux-NEXT:  movq %rax, %r10
 
+; X32ABI:       cmpl %fs:64, %esp
+; X32ABI-NEXT:  ja      .LBB1_2
+
+; X32ABI:       movl %r10d, %eax
+; X32ABI-NEXT:  movl $56, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+; X32ABI-NEXT:  movq %rax, %r10
+
 ; X32-Darwin:      movl $432, %edx
 ; X32-Darwin-NEXT: cmpl %gs:(%edx), %esp
 ; X32-Darwin-NEXT: ja      LBB1_2
@@ -202,6 +224,15 @@
 ; X64-Linux-NEXT:  callq __morestack
 ; X64-Linux-NEXT:  ret
 
+; X32ABI:       leal -40008(%rsp), %r11d
+; X32ABI-NEXT:  cmpl %fs:64, %r11d
+; X32ABI-NEXT:  ja      .LBB2_2
+
+; X32ABI:       movl $40008, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+
 ; X32-Darwin:      leal -40012(%esp), %ecx
 ; X32-Darwin-NEXT: movl $432, %eax
 ; X32-Darwin-NEXT: cmpl %gs:(%eax), %ecx
@@ -276,6 +307,16 @@
 ; X64-Linux-NEXT:  callq __morestack
 ; X64-Linux-NEXT:  ret
 
+; X32ABI-LABEL:       test_fastcc:
+
+; X32ABI:       cmpl %fs:64, %esp
+; X32ABI-NEXT:  ja      .LBB3_2
+
+; X32ABI:       movl $40, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+
 ; X32-Darwin-LABEL:      test_fastcc:
 
 ; X32-Darwin:      movl $432, %eax
@@ -356,6 +397,17 @@
 ; X64-Linux-NEXT:  callq __morestack
 ; X64-Linux-NEXT:  ret
 
+; X32ABI-LABEL:       test_fastcc_large:
+
+; X32ABI:       leal -40008(%rsp), %r11d
+; X32ABI-NEXT:  cmpl %fs:64, %r11d
+; X32ABI-NEXT:  ja      .LBB4_2
+
+; X32ABI:       movl $40008, %r10d
+; X32ABI-NEXT:  movl $0, %r11d
+; X32ABI-NEXT:  callq __morestack
+; X32ABI-NEXT:  ret
+
 ; X32-Darwin-LABEL:      test_fastcc_large:
 
 ; X32-Darwin:      leal -40012(%esp), %eax
@@ -446,6 +498,9 @@
 ; X64-Linux-LABEL: test_nostack:
 ; X32-Linux-NOT:   callq __morestack
 
+; X32ABI-LABEL: test_nostack:
+; X32ABI-NOT:   callq __morestack
+
 ; X32-Darwin-LABEL: test_nostack:
 ; X32-Darwin-NOT:   calll __morestack
 

diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index cdd258d..7e6f153 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll

@@ -357,3 +357,47 @@
 ; ATOM: cmpl $15, %edi
 ; ATOM: cmovgel %edx
 }
+
+; CHECK-LABEL: @trunc_select_miscompile
+; CHECK-NOT: sarb
+define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
+  %tmp1 = select i1 %cc, i32 3, i32 2
+  %tmp2 = shl i32 %a, %tmp1
+  ret i32 %tmp2
+}
+
+define void @test19() {
+; This is a massive reduction of an llvm-stress test case that generates
+; interesting chains feeding setcc and eventually a f32 select operation. This
+; is intended to exercise the SELECT formation in the DAG combine simplifying
+; a simplified select_cc node. If it regresses and is no longer triggering
+; that code path, it can be deleted.
+;
+; CHECK-LABEL: @test19
+; CHECK: testb
+; CHECK: cmpl
+; CHECK: ucomiss
+
+BB:
+  br label %CF
+
+CF:
+  %Cmp10 = icmp ule i8 undef, undef
+  br i1 %Cmp10, label %CF, label %CF250
+
+CF250:
+  %E12 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 2
+  %Cmp32 = icmp ugt i1 %Cmp10, false
+  br i1 %Cmp32, label %CF, label %CF242
+
+CF242:
+  %Cmp38 = icmp uge i32 %E12, undef
+  %FC = uitofp i1 %Cmp38 to float
+  %Sl59 = select i1 %Cmp32, float %FC, float undef
+  %Cmp60 = fcmp ugt float undef, undef
+  br i1 %Cmp60, label %CF242, label %CF244
+
+CF244:
+  %B122 = fadd float %Sl59, undef
+  ret void
+}

diff --git a/test/CodeGen/X86/sext-i1.ll b/test/CodeGen/X86/sext-i1.ll
index 64de0ae..1a575db 100644
--- a/test/CodeGen/X86/sext-i1.ll
+++ b/test/CodeGen/X86/sext-i1.ll

@@ -61,3 +61,36 @@
   %xor27 = xor i32 undef, %cond                   ; <i32> [#uses=0]
   ret i32 0
 }
+
+define i32 @t4(i64 %x) nounwind readnone ssp {
+entry:
+; 32-LABEL: t4:
+; 32: movl
+; 32: orl
+; 32: movl
+; 32: je
+; 32: xorl
+
+; 64-LABEL: t4:
+; 64: cmpq $1
+; 64: sbbl
+  %0 = icmp eq i64 %x, 0
+  %1 = sext i1 %0 to i32
+  ret i32 %1
+}
+
+define i64 @t5(i32 %x) nounwind readnone ssp {
+entry:
+; 32-LABEL: t5:
+; 32: cmpl $1
+; 32: sbbl
+; 32: movl
+
+; 64-LABEL: t5:
+; 64: cmpl $1
+; 64: sbbq
+  %0 = icmp eq i32 %x, 0
+  %1 = sext i1 %0 to i64
+  ret i64 %1
+}
+

diff --git a/test/CodeGen/X86/shift-parts.ll b/test/CodeGen/X86/shift-parts.ll
index ddad307..763da63 100644
--- a/test/CodeGen/X86/shift-parts.ll
+++ b/test/CodeGen/X86/shift-parts.ll

@@ -7,13 +7,13 @@
 
 ; CHECK: shrdq
 
-define i32 @int87(i32 %uint64p_8) nounwind {
+define i32 @int87(i32 %uint64p_8, i1 %cond) nounwind {
 entry:
   %srcval4 = load i320* bitcast (%0* @g_144 to i320*), align 8 ; <i320> [#uses=1]
   br label %for.cond
 
 for.cond:                                         ; preds = %for.cond, %entry
-  %call3.in.in.in.v = select i1 undef, i320 192, i320 128 ; <i320> [#uses=1]
+  %call3.in.in.in.v = select i1 %cond, i320 192, i320 128 ; <i320> [#uses=1]
   %call3.in.in.in = lshr i320 %srcval4, %call3.in.in.in.v ; <i320> [#uses=1]
   %call3.in = trunc i320 %call3.in.in.in to i32   ; <i32> [#uses=1]
   %tobool = icmp eq i32 %call3.in, 0              ; <i1> [#uses=1]

diff --git a/test/CodeGen/X86/shuffle-combine-crash.ll b/test/CodeGen/X86/shuffle-combine-crash.ll
new file mode 100644
index 0000000..6ab7b97
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-combine-crash.ll

@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 
+
+; Verify that DAGCombiner does not crash when checking if it is
+; safe to fold the shuffles in function @sample_test according to rule
+;  (shuffle (shuffle A, Undef, M0), Undef, M1) -> (shuffle A, Undef, M2)
+;
+; The DAGCombiner avoids folding shuffles if
+; the resulting shuffle dag node is not legal for the target.
+; That means, the shuffle must have legal type and legal mask.
+;
+; Before, the DAGCombiner forgot to check if the resulting shuffle
+; was legal. It instead just called method
+; 'X86TargetLowering::isShuffleMaskLegal'; however, that was not enough since
+; that method always expects a valid vector type as input.
+; As a consequence, compiling the function below would have caused a crash.
+
+define void @sample_test() {
+  br i1 undef, label %5, label %1
+
+; <label>:1                                       ; preds = %0
+  %2 = load <4 x i8>* undef
+  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+  %4 = shufflevector <4 x i8> %3, <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  store <4 x i8> %4, <4 x i8>* undef
+  br label %5
+
+; <label>:5                                       ; preds = %1, %0
+  ret void
+}
+

diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index 2dc8816..1e34a2b 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll

@@ -15,7 +15,8 @@
 
 ; OSX_SINCOS-LABEL: test1:
 ; OSX_SINCOS: callq ___sincosf_stret
-; OSX_SINCOS: pshufd $1, %xmm0, %xmm1
+; OSX_SINCOS: movaps %xmm0, %xmm1
+; OSX_SINCOS: shufps {{.*}} ## xmm1 = xmm1[1,1,2,3]
 ; OSX_SINCOS: addss %xmm0, %xmm1
 
 ; OSX_NOOPT: test1

diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
new file mode 100644
index 0000000..6e3a003
--- /dev/null
+++ b/test/CodeGen/X86/sink-blockfreq.ll

@@ -0,0 +1,45 @@
+; RUN: llc -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI
+; RUN: llc -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
+
+; Test that by changing BlockFrequencyInfo we change the order in which
+; machine-sink looks for successor blocks. By not using BFI, both G and B
+; have the same loop depth and no instruction is sunk - B is selected but
+; can't be used, so as to avoid breaking a non-profitable critical edge. By using
+; BFI, "mul" is sunk into the less frequent block G.
+define i32 @sink_freqinfo(i32 %a, i32 %b) nounwind uwtable ssp {
+; MSINK_BFI-LABEL: sink_freqinfo
+; MSINK_BFI: jl
+; MSINK_BFI-NEXT: ## BB#
+; MSINK_BFI-NEXT: imull
+
+; MSINK_NOBFI-LABEL: sink_freqinfo
+; MSINK_NOBFI: imull
+; MSINK_NOBFI: jl
+entry:
+  br label %B
+
+B:
+  %ee = phi i32 [ 0, %entry ], [ %inc, %F ]
+  %xx = sub i32 %a, %ee
+  %cond0 = icmp slt i32 %xx, 0
+  br i1 %cond0, label %F, label %exit, !prof !0
+
+F:
+  %inc = add nsw i32 %xx, 2
+  %aa = mul nsw i32 %b, %inc
+  %exitcond = icmp slt i32 %inc, %a
+  br i1 %exitcond, label %B, label %G, !prof !1
+
+G:
+  %ii = add nsw i32 %aa, %a
+  %ll = add i32 %b, 45
+  %exitcond2 = icmp sge i32 %ii, %b
+  br i1 %exitcond2, label %G, label %exit, !prof !2
+
+exit:
+  ret i32 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 4, i32 1}
+!1 = metadata !{metadata !"branch_weights", i32 128, i32 1}
+!2 = metadata !{metadata !"branch_weights", i32 1, i32 1}

diff --git a/test/CodeGen/X86/sink-out-of-loop.ll b/test/CodeGen/X86/sink-out-of-loop.ll
index c600f92..6757f31 100644
--- a/test/CodeGen/X86/sink-out-of-loop.ll
+++ b/test/CodeGen/X86/sink-out-of-loop.ll

@@ -5,7 +5,7 @@
 ; MOV32ri outside the loop.
 ; rdar://11980766
 define i32 @sink_succ(i32 %argc, i8** nocapture %argv) nounwind uwtable ssp {
-; CHECK: sink_succ
+; CHECK-LABEL: sink_succ
 ; CHECK: [[OUTER_LN1:LBB0_[0-9]+]]: ## %preheader
 ; CHECK: %exit
 ; CHECK-NOT: movl
@@ -52,3 +52,24 @@
 for.end20:
   ret i32 0
 }
+
+define i32 @sink_out_of_loop(i32 %n, i32* %output) {
+; CHECK-LABEL: sink_out_of_loop:
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i2, %loop ]
+  %j = mul i32 %i, %i
+  %addr = getelementptr i32* %output, i32 %i
+  store i32 %i, i32* %addr
+  %i2 = add i32 %i, 1
+  %exit_cond = icmp sge i32 %i2, %n
+  br i1 %exit_cond, label %exit, label %loop
+
+exit:
+; CHECK: BB#2
+; CHECK: imull %eax, %eax
+; CHECK: retq
+  ret i32 %j
+}

diff --git a/test/CodeGen/X86/slow-incdec.ll b/test/CodeGen/X86/slow-incdec.ll
new file mode 100644
index 0000000..541d992
--- /dev/null
+++ b/test/CodeGen/X86/slow-incdec.ll

@@ -0,0 +1,80 @@
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-slow-incdec < %s | FileCheck -check-prefix=INCDEC %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+slow-incdec < %s | FileCheck -check-prefix=ADD %s
+
+; check -mattr=-slow-incdec
+; INCDEC-NOT: addl $-1
+; INCDEC: dec
+; INCDEC-NOT: addl $1
+; INCDEC: inc
+
+; check -mattr=+slow-incdec
+; ADD: addl $-1
+; ADD-NOT: dec
+; ADD: addl $1
+; ADD-NOT: inc
+
+; Function Attrs: nounwind readonly
+define i32 @slow_1(i32* nocapture readonly %a, i32 %s) #0 {
+entry:
+  %cmp5 = icmp eq i32 %s, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond:                                         ; preds = %for.body
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.cond
+  %i.06 = phi i32 [ %dec, %for.cond ], [ %s, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32* %a, i32 %i.06
+  %0 = load i32* %arrayidx, align 4, !tbaa !1
+  %cmp1 = icmp eq i32 %0, 0
+;
+  %dec = add nsw i32 %i.06, -1
+  br i1 %cmp1, label %for.end.loopexit, label %for.cond
+
+for.end.loopexit:                                 ; preds = %for.cond, %for.body
+  %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Function Attrs: nounwind readonly
+define i32 @slow_2(i32* nocapture readonly %a, i32 %s) #0 {
+entry:
+  %cmp5 = icmp eq i32 %s, 0
+  br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond:                                         ; preds = %for.body
+  %cmp = icmp eq i32 %inc, 0
+  br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.cond
+  %i.06 = phi i32 [ %inc, %for.cond ], [ %s, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32* %a, i32 %i.06
+  %0 = load i32* %arrayidx, align 4, !tbaa !1
+  %cmp1 = icmp eq i32 %0, 0
+  %inc = add nsw i32 %i.06, 1
+  br i1 %cmp1, label %for.end.loopexit, label %for.cond
+
+for.end.loopexit:                                 ; preds = %for.cond, %for.body
+  %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}

diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll
new file mode 100644
index 0000000..c052ad2
--- /dev/null
+++ b/test/CodeGen/X86/splat-for-size.ll

@@ -0,0 +1,141 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX2
+
+; Check constant loads of every 128-bit and 256-bit vector type 
+; for size optimization using splat ops available with AVX and AVX2.
+
+; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr).
+define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
+  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
+  ret <2 x double> %add
+; CHECK-LABEL: splat_v2f64
+; CHECK: vmovddup
+; CHECK: vaddpd 
+; CHECK-NEXT: retq
+}
+
+define <4 x double> @splat_v4f64(<4 x double> %x) #0 {
+  %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
+  ret <4 x double> %add
+; CHECK-LABEL: splat_v4f64
+; CHECK: vbroadcastsd 
+; CHECK-NEXT: vaddpd
+; CHECK-NEXT: retq
+}
+
+define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
+  %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <4 x float> %add
+; CHECK-LABEL: splat_v4f32
+; CHECK: vbroadcastss 
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+define <8 x float> @splat_v8f32(<8 x float> %x) #0 {
+  %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <8 x float> %add
+; CHECK-LABEL: splat_v8f32
+; CHECK: vbroadcastss 
+; CHECK-NEXT: vaddps
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value.
+; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq.
+define <2 x i64> @splat_v2i64(<2 x i64> %x) #0 {
+  %add = add <2 x i64> %x, <i64 1, i64 1>
+  ret <2 x i64> %add
+; CHECK-LABEL: splat_v2i64
+; CHECK: vmovddup 
+; CHECK: vpaddq
+; CHECK-NEXT: retq
+}
+
+; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors,
+; and then we fake it: use vmovddup to splat 64-bit value.
+define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
+  %add = add <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %add
+; CHECK-LABEL: splat_v4i64
+; AVX: vmovddup
+; AVX: vpaddq 
+; AVX: vpaddq 
+; AVX2: vpbroadcastq 
+; AVX2: vpaddq 
+; CHECK: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <4 x i32> @splat_v4i32(<4 x i32> %x) #0 {
+  %add = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_v4i32
+; AVX: vbroadcastss
+; AVX2: vpbroadcastd 
+; CHECK-NEXT: vpaddd 
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
+define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
+  %add = add <8 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %add
+; CHECK-LABEL: splat_v8i32
+; AVX: vbroadcastss
+; AVX: vpaddd 
+; AVX: vpaddd 
+; AVX2: vpbroadcastd 
+; AVX2: vpaddd 
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <8 x i16> @splat_v8i16(<8 x i16> %x) #0 {
+  %add = add <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_v8i16
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastw 
+; CHECK: vpaddw 
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
+define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
+  %add = add <16 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <16 x i16> %add
+; CHECK-LABEL: splat_v16i16
+; AVX-NOT: broadcast
+; AVX: vpaddw 
+; AVX: vpaddw 
+; AVX2: vpbroadcastw 
+; AVX2: vpaddw 
+; CHECK: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <16 x i8> @splat_v16i8(<16 x i8> %x) #0 {
+  %add = add <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_v16i8
+; AVX-NOT: broadcast
+; AVX2: vpbroadcastb 
+; CHECK: vpaddb 
+; CHECK-NEXT: retq
+}
+
+; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
+define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
+  %add = add <32 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %add
+; CHECK-LABEL: splat_v32i8
+; AVX-NOT: broadcast
+; AVX: vpaddb 
+; AVX: vpaddb 
+; AVX2: vpbroadcastb 
+; AVX2: vpaddb 
+; CHECK: retq
+}
+
+attributes #0 = { optsize }

diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll
deleted file mode 100644
index 4d59b9c..0000000
--- a/test/CodeGen/X86/splat-scalar-load.ll
+++ /dev/null

@@ -1,17 +0,0 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -mcpu=nehalem | FileCheck %s
-; rdar://7434544
-
-define <2 x i64> @t2() nounwind {
-entry:
-; CHECK-LABEL: t2:
-; CHECK: pshufd	$85, (%esp), %xmm0
-  %array = alloca [8 x float], align 4
-  %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 1
-  %tmp2 = load float* %arrayidx
-  %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0
-  %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1
-  %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2
-  %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3
-  %0 = bitcast <4 x float> %vecinit9 to <2 x i64>
-  ret <2 x i64> %0
-}

diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index fc79e31..24b175e 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll

@@ -1,4 +1,5 @@
-; RUN: llc < %s -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 
 ; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from
 ; #include <math.h>
@@ -52,9 +53,80 @@
   ret x86_fp80 %call
 }
 
-; Function Attrs: nounwind readnone
 declare x86_fp80 @__sqrtl_finite(x86_fp80) #1
 
+declare float @llvm.sqrt.f32(float) #1
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
+
+; If the target's sqrtss and divss instructions are substantially
+; slower than rsqrtss with a Newton-Raphson refinement, we should
+; generate the estimate sequence.
+
+define float @reciprocal_square_root(float %x) #0 {
+  %sqrt = tail call float @llvm.sqrt.f32(float %x)
+  %div = fdiv fast float 1.0, %sqrt
+  ret float %div
+
+; CHECK-LABEL: reciprocal_square_root:
+; CHECK: sqrtss
+; CHECK-NEXT: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root:
+; BTVER2: vrsqrtss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: vaddss
+; BTVER2-NEXT: vmulss
+; BTVER2-NEXT: retq
+}
+
+define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 {
+  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
+  ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_square_root_v4f32:
+; CHECK: sqrtps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root_v4f32:
+; BTVER2: vrsqrtps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: retq
+}
+
+define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 {
+  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
+  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
+  ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_square_root_v8f32:
+; CHECK: sqrtps
+; CHECK-NEXT: sqrtps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: retq
+; BTVER2-LABEL: reciprocal_square_root_v8f32:
+; BTVER2: vrsqrtps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: vaddps
+; BTVER2-NEXT: vmulps
+; BTVER2-NEXT: retq
+}
+
+
 attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }

diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll
index 2351fd6..396da0f 100644
--- a/test/CodeGen/X86/sse-align-12.ll
+++ b/test/CodeGen/X86/sse-align-12.ll

@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=nehalem | FileCheck %s
 
-; CHECK-LABEL: a:
-; CHECK: movdqu
-; CHECK: pshufd
 define <4 x float> @a(<4 x float>* %y) nounwind {
+; CHECK-LABEL: a:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %x = load <4 x float>* %y, align 4
   %a = extractelement <4 x float> %x, i32 0
   %b = extractelement <4 x float> %x, i32 1
@@ -16,10 +18,12 @@
   ret <4 x float> %s
 }
 
-; CHECK-LABEL: b:
-; CHECK: movups
-; CHECK: unpckhps
 define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind {
+; CHECK-LABEL: b:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movups (%rdi), %xmm1
+; CHECK-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    retq
   %x = load <4 x float>* %y, align 4
   %a = extractelement <4 x float> %x, i32 2
   %b = extractelement <4 x float> %x, i32 3
@@ -32,10 +36,12 @@
   ret <4 x float> %s
 }
 
-; CHECK-LABEL: c:
-; CHECK: movupd
-; CHECK: shufpd
 define <2 x double> @c(<2 x double>* %y) nounwind {
+; CHECK-LABEL: c:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movupd (%rdi), %xmm0
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    retq
   %x = load <2 x double>* %y, align 8
   %a = extractelement <2 x double> %x, i32 0
   %c = extractelement <2 x double> %x, i32 1
@@ -44,10 +50,12 @@
   ret <2 x double> %r
 }
 
-; CHECK-LABEL: d:
-; CHECK: movupd
-; CHECK: unpckhpd
 define <2 x double> @d(<2 x double>* %y, <2 x double> %z) nounwind {
+; CHECK-LABEL: d:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movupd (%rdi), %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT:    retq
   %x = load <2 x double>* %y, align 8
   %a = extractelement <2 x double> %x, i32 1
   %c = extractelement <2 x double> %z, i32 1

diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 5122c44..da36a42 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll

@@ -138,8 +138,7 @@
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      ogt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ogt_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -157,8 +156,7 @@
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      olt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      olt_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -177,8 +175,7 @@
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      ogt_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: minsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ogt_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -198,8 +195,7 @@
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      olt_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      olt_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -217,8 +213,7 @@
 ; CHECK-NEXT: andpd
 ; UNSAFE-LABEL:      oge_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      oge_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -235,8 +230,7 @@
 ; CHECK-NEXT: andpd
 ; UNSAFE-LABEL:      ole_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ole_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -253,8 +247,7 @@
 ; CHECK-NEXT: andnpd
 ; UNSAFE-LABEL:      oge_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: minsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      oge_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -271,8 +264,7 @@
 ; CHECK:      cmplesd %xmm
 ; UNSAFE-LABEL:      ole_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ole_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -412,8 +404,7 @@
 ; CHECK-NEXT: andpd
 ; UNSAFE-LABEL:      ugt_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ugt_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -430,8 +421,7 @@
 ; CHECK-NEXT: andpd
 ; UNSAFE-LABEL:      ult_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: minsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ult_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -448,8 +438,7 @@
 ; CHECK-NEXT: andnpd
 ; UNSAFE-LABEL:      ugt_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: minsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: minsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ugt_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -467,8 +456,7 @@
 ; CHECK-NEXT: andnpd
 ; UNSAFE-LABEL:      ult_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd   %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd   %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ult_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}}   %xmm1, %xmm1
@@ -488,8 +476,7 @@
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      uge_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}}  %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      uge_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -508,8 +495,7 @@
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      ule_x:
 ; UNSAFE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
-; UNSAFE-NEXT: minsd  %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd  %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ule_x:
 ; FINITE-NEXT: xorp{{[sd]}}  %xmm1, %xmm1
@@ -527,8 +513,7 @@
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      uge_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      uge_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -547,8 +532,7 @@
 ; CHECK-NEXT: ret
 ; UNSAFE-LABEL:      ule_inverse_x:
 ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
 ; UNSAFE-NEXT: ret
 ; FINITE-LABEL:      ule_inverse_x:
 ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1

diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
deleted file mode 100644
index 600ee1b..0000000
--- a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
+++ /dev/null

@@ -1,423 +0,0 @@
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
-
-; Ensure that the backend selects SSE/AVX scalar fp instructions
-; from a packed fp instrution plus a vector insert.
-
-
-define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fadd <4 x float> %a, %b
-  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_add_ss
-; SSE2: addss   %xmm1, %xmm0
-; AVX: vaddss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fsub <4 x float> %a, %b
-  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_sub_ss
-; SSE2: subss   %xmm1, %xmm0
-; AVX: vsubss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fmul <4 x float> %a, %b
-  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_mul_ss
-; SSE2: mulss   %xmm1, %xmm0
-; AVX: vmulss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fdiv <4 x float> %a, %b
-  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test_div_ss
-; SSE2: divss   %xmm1, %xmm0
-; AVX: vdivss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fadd <2 x double> %a, %b
-  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_add_sd
-; SSE2: addsd   %xmm1, %xmm0
-; AVX: vaddsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fsub <2 x double> %a, %b
-  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_sub_sd
-; SSE2: subsd   %xmm1, %xmm0
-; AVX: vsubsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fmul <2 x double> %a, %b
-  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_mul_sd
-; SSE2: mulsd   %xmm1, %xmm0
-; AVX: vmulsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fdiv <2 x double> %a, %b
-  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test_div_sd
-; SSE2: divsd   %xmm1, %xmm0
-; AVX: vdivsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fadd <4 x float> %b, %a
-  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_add_ss
-; SSE2: addss   %xmm0, %xmm1
-; AVX: vaddss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fsub <4 x float> %b, %a
-  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_sub_ss
-; SSE2: subss   %xmm0, %xmm1
-; AVX: vsubss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fmul <4 x float> %b, %a
-  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_mul_ss
-; SSE2: mulss   %xmm0, %xmm1
-; AVX: vmulss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fdiv <4 x float> %b, %a
-  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test2_div_ss
-; SSE2: divss   %xmm0, %xmm1
-; AVX: vdivss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fadd <2 x double> %b, %a
-  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_add_sd
-; SSE2: addsd   %xmm0, %xmm1
-; AVX: vaddsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fsub <2 x double> %b, %a
-  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_sub_sd
-; SSE2: subsd   %xmm0, %xmm1
-; AVX: vsubsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fmul <2 x double> %b, %a
-  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_mul_sd
-; SSE2: mulsd   %xmm0, %xmm1
-; AVX: vmulsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fdiv <2 x double> %b, %a
-  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test2_div_sd
-; SSE2: divsd   %xmm0, %xmm1
-; AVX: vdivsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <4 x float> @test3_add_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fadd <4 x float> %a, %b
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_add_ss
-; SSE2: addss   %xmm1, %xmm0
-; AVX: vaddss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test3_sub_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fsub <4 x float> %a, %b
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_sub_ss
-; SSE2: subss   %xmm1, %xmm0
-; AVX: vsubss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test3_mul_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fmul <4 x float> %a, %b
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_mul_ss
-; SSE2: mulss   %xmm1, %xmm0
-; AVX: vmulss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test3_div_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fdiv <4 x float> %a, %b
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test3_div_ss
-; SSE2: divss   %xmm1, %xmm0
-; AVX: vdivss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test3_add_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fadd <2 x double> %a, %b
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_add_sd
-; SSE2: addsd   %xmm1, %xmm0
-; AVX: vaddsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test3_sub_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fsub <2 x double> %a, %b
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_sub_sd
-; SSE2: subsd   %xmm1, %xmm0
-; AVX: vsubsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test3_mul_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fmul <2 x double> %a, %b
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_mul_sd
-; SSE2: mulsd   %xmm1, %xmm0
-; AVX: vmulsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test3_div_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fdiv <2 x double> %a, %b
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test3_div_sd
-; SSE2: divsd   %xmm1, %xmm0
-; AVX: vdivsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <4 x float> @test4_add_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fadd <4 x float> %b, %a
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_add_ss
-; SSE2: addss   %xmm0, %xmm1
-; AVX: vaddss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test4_sub_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fsub <4 x float> %b, %a
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_sub_ss
-; SSE2: subss   %xmm0, %xmm1
-; AVX: vsubss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test4_mul_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fmul <4 x float> %b, %a
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_mul_ss
-; SSE2: mulss   %xmm0, %xmm1
-; AVX: vmulss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <4 x float> @test4_div_ss(<4 x float> %a, <4 x float> %b) {
-  %1 = fdiv <4 x float> %b, %a
-  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
-  ret <4 x float> %2
-}
-
-; CHECK-LABEL: test4_div_ss
-; SSE2: divss   %xmm0, %xmm1
-; AVX: vdivss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
-define <2 x double> @test4_add_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fadd <2 x double> %b, %a
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_add_sd
-; SSE2: addsd   %xmm0, %xmm1
-; AVX: vaddsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test4_sub_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fsub <2 x double> %b, %a
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_sub_sd
-; SSE2: subsd   %xmm0, %xmm1
-; AVX: vsubsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test4_mul_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fmul <2 x double> %b, %a
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_mul_sd
-; SSE2: mulsd   %xmm0, %xmm1
-; AVX: vmulsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
-define <2 x double> @test4_div_sd(<2 x double> %a, <2 x double> %b) {
-  %1 = fdiv <2 x double> %b, %a
-  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
-  ret <2 x double> %2
-}
-
-; CHECK-LABEL: test4_div_sd
-; SSE2: divsd   %xmm0, %xmm1
-; AVX: vdivsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-

diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 3949a83..b122ef6 100644
--- a/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll

@@ -1,13 +1,23 @@
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
-; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s
+; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s
+; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s
+
+target triple = "x86_64-unknown-unknown"
 
 ; Ensure that the backend no longer emits unnecessary vector insert
 ; instructions immediately after SSE scalar fp instructions
 ; like addss or mulss.
 
-
 define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %add = fadd float %2, %1
@@ -15,14 +25,16 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_add_ss
-; SSE2: addss   %xmm1, %xmm0
-; AVX: vaddss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %sub = fsub float %2, %1
@@ -30,13 +42,16 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_sub_ss
-; SSE2: subss   %xmm1, %xmm0
-; AVX: vsubss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
 define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %mul = fmul float %2, %1
@@ -44,14 +59,16 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_mul_ss
-; SSE2: mulss   %xmm1, %xmm0
-; AVX: vmulss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %div = fdiv float %2, %1
@@ -59,14 +76,16 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_div_ss
-; SSE2: divss   %xmm1, %xmm0
-; AVX: vdivss   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %b, i32 0
   %2 = extractelement <2 x double> %a, i32 0
   %add = fadd double %2, %1
@@ -74,14 +93,16 @@
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test_add_sd
-; SSE2: addsd   %xmm1, %xmm0
-; AVX: vaddsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %b, i32 0
   %2 = extractelement <2 x double> %a, i32 0
   %sub = fsub double %2, %1
@@ -89,14 +110,16 @@
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test_sub_sd
-; SSE2: subsd   %xmm1, %xmm0
-; AVX: vsubsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %b, i32 0
   %2 = extractelement <2 x double> %a, i32 0
   %mul = fmul double %2, %1
@@ -104,14 +127,16 @@
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test_mul_sd
-; SSE2: mulsd   %xmm1, %xmm0
-; AVX: vmulsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %b, i32 0
   %2 = extractelement <2 x double> %a, i32 0
   %div = fdiv double %2, %1
@@ -119,14 +144,17 @@
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test_div_sd
-; SSE2: divsd   %xmm1, %xmm0
-; AVX: vdivsd   %xmm1, %xmm0, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %a, i32 0
   %2 = extractelement <4 x float> %b, i32 0
   %add = fadd float %1, %2
@@ -134,14 +162,17 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test2_add_ss
-; SSE2: addss   %xmm0, %xmm1
-; AVX: vaddss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %a, i32 0
   %2 = extractelement <4 x float> %b, i32 0
   %sub = fsub float %2, %1
@@ -149,14 +180,17 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test2_sub_ss
-; SSE2: subss   %xmm0, %xmm1
-; AVX: vsubss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %a, i32 0
   %2 = extractelement <4 x float> %b, i32 0
   %mul = fmul float %1, %2
@@ -164,14 +198,17 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test2_mul_ss
-; SSE2: mulss   %xmm0, %xmm1
-; AVX: vmulss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test2_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %a, i32 0
   %2 = extractelement <4 x float> %b, i32 0
   %div = fdiv float %2, %1
@@ -179,14 +216,17 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test2_div_ss
-; SSE2: divss   %xmm0, %xmm1
-; AVX: vdivss   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %a, i32 0
   %2 = extractelement <2 x double> %b, i32 0
   %add = fadd double %1, %2
@@ -194,14 +234,17 @@
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test2_add_sd
-; SSE2: addsd   %xmm0, %xmm1
-; AVX: vaddsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %a, i32 0
   %2 = extractelement <2 x double> %b, i32 0
   %sub = fsub double %2, %1
@@ -209,14 +252,17 @@
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test2_sub_sd
-; SSE2: subsd   %xmm0, %xmm1
-; AVX: vsubsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %a, i32 0
   %2 = extractelement <2 x double> %b, i32 0
   %mul = fmul double %1, %2
@@ -224,14 +270,17 @@
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test2_mul_sd
-; SSE2: mulsd   %xmm0, %xmm1
-; AVX: vmulsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: test2_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <2 x double> %a, i32 0
   %2 = extractelement <2 x double> %b, i32 0
   %div = fdiv double %2, %1
@@ -239,14 +288,18 @@
   ret <2 x double> %3
 }
 
-; CHECK-LABEL: test2_div_sd
-; SSE2: divsd   %xmm0, %xmm1
-; AVX: vdivsd   %xmm0, %xmm1, %xmm0
-; CHECK-NOT: movsd
-; CHECK: ret
-
-
 define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm0, %xmm1
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_multiple_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %add = fadd float %2, %1
@@ -255,14 +308,19 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_multiple_add_ss
-; CHECK: addss
-; CHECK: addss
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    subss %xmm1, %xmm2
+; SSE-NEXT:    subss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_multiple_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %sub = fsub float %2, %1
@@ -271,14 +329,18 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_multiple_sub_ss
-; CHECK: subss
-; CHECK: subss
-; CHECK-NOT: movss
-; CHECK: ret
-
-
 define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm0, %xmm1
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_multiple_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %mul = fmul float %2, %1
@@ -287,13 +349,19 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_multiple_mul_ss
-; CHECK: mulss
-; CHECK: mulss
-; CHECK-NOT: movss
-; CHECK: ret
-
 define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: test_multiple_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    divss %xmm1, %xmm2
+; SSE-NEXT:    divss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_multiple_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = extractelement <4 x float> %b, i32 0
   %2 = extractelement <4 x float> %a, i32 0
   %div = fdiv float %2, %1
@@ -302,9 +370,501 @@
   ret <4 x float> %3
 }
 
-; CHECK-LABEL: test_multiple_div_ss
-; CHECK: divss
-; CHECK: divss
-; CHECK-NOT: movss
-; CHECK: ret
+; Ensure that the backend selects SSE/AVX scalar fp instructions
+; from a packed fp instruction plus a vector insert.
 
+define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <4 x float> %a, %b
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <2 x double> %a, %b
+  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test2_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <4 x float> %b, %a
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test2_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test2_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <2 x double> %b, %a
+  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %2
+}
+
+define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <4 x float> %a, %b
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <4 x float> %a, %b
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <4 x float> %a, %b
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test3_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <4 x float> %a, %b
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <2 x double> %a, %b
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <2 x double> %a, %b
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <2 x double> %a, %b
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test3_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test3_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <2 x double> %a, %b
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_add_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_add_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <4 x float> %b, %a
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_sub_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_sub_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <4 x float> %b, %a
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_mul_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_mul_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <4 x float> %b, %a
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: insert_test4_div_ss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_div_ss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <4 x float> %b, %a
+  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+  ret <4 x float> %2
+}
+
+define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_add_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_add_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fadd <2 x double> %b, %a
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_sub_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_sub_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fsub <2 x double> %b, %a
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_mul_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_mul_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fmul <2 x double> %b, %a
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+  ret <2 x double> %2
+}
+
+define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: insert_test4_div_sd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_test4_div_sd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %1 = fdiv <2 x double> %b, %a
+  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+  ret <2 x double> %2
+}

diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index 183297e..fd35e75 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll

@@ -1,17 +1,6 @@
 ; Tests for SSE1 and below, without SSE2+.
-; RUN: llc < %s -march=x86 -mcpu=pentium3 -O3 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s
-
-define <8 x i16> @test1(<8 x i32> %a) nounwind {
-; CHECK: test1
-  ret <8 x i16> zeroinitializer
-}
-
-define <8 x i16> @test2(<8 x i32> %a) nounwind {
-; CHECK: test2
-  %c = trunc <8 x i32> %a to <8 x i16>            ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %c
-}
+; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mcpu=pentium3 -O3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s
 
 ; PR7993
 ;define <4 x i32> @test3(<4 x i16> %a) nounwind {
@@ -23,6 +12,15 @@
 ; vector that this ends up returning.
 ; rdar://8368414
 define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movaps %xmm0, %xmm2
+; CHECK-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; CHECK-NEXT:    addss %xmm1, %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT:    subss %xmm1, %xmm2
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    ret
 entry:
   %tmp7 = extractelement <2 x float> %A, i32 0
   %tmp5 = extractelement <2 x float> %A, i32 1
@@ -33,15 +31,6 @@
   %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
   %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
   ret <2 x float> %tmp9
-; CHECK-LABEL: test4:
-; CHECK-NOT: shufps	$16
-; CHECK: shufps	$1, 
-; CHECK-NOT: shufps	$16
-; CHECK: shufps	$1, 
-; CHECK-NOT: shufps	$16
-; CHECK: unpcklps
-; CHECK-NOT: shufps	$16
-; CHECK: ret
 }
 
 ; We used to get stuck in type legalization for this example when lowering the
@@ -50,8 +39,9 @@
 ; condition operand and widening the resulting vselect for the v4f32 result.
 ; PR18036
 
-; CHECK-LABEL: vselect
 define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
+; CHECK-LABEL: vselect:
+; CHECK:         ret
 entry:
   %a1 = icmp eq <4 x i32> %q, zeroinitializer
   %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+0> , <4 x float> zeroinitializer

diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll
deleted file mode 100644
index c63ff72..0000000
--- a/test/CodeGen/X86/sse2-blend.ll
+++ /dev/null

@@ -1,57 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
-
-; CHECK-LABEL: vsel_float
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
-define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) {
-  %A = load <4 x float>* %v1
-  %B = load <4 x float>* %v2
-  %vsel = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
-  store <4 x float > %vsel, <4 x float>* %v1
-  ret void
-}
-
-; CHECK-LABEL: vsel_i32
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK-NOT: orps
-; CHECK: ret
-define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
-  %A = load <4 x i32>* %v1
-  %B = load <4 x i32>* %v2
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
-  store <4 x i32 > %vsel, <4 x i32>* %v1
-  ret void
-}
-
-; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK-LABEL: vsel_i64
-; CHECK: andnps
-; CHECK: orps
-; CHECK: ret
-
-define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) {
-  %A = load <2 x i64>* %v1
-  %B = load <2 x i64>* %v2
-  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %A, <2 x i64> %B
-  store <2 x i64 > %vsel, <2 x i64>* %v1
-  ret void
-}
-
-; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK-LABEL: vsel_double
-; CHECK: andnps
-; CHECK: orps
-; CHECK: ret
-
-define void@vsel_double(<2 x double>* %v1, <2 x double>* %v2) {
-  %A = load <2 x double>* %v1
-  %B = load <2 x double>* %v2
-  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %A, <2 x double> %B
-  store <2 x double > %vsel, <2 x double>* %v1
-  ret void
-}
-
-

diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index c906ecd..c4d9e6d 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll

@@ -408,21 +408,21 @@
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-  ; CHECK: pslldq
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
-  ; CHECK: pslldq
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+  ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+  ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
 
 
@@ -504,21 +504,21 @@
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-  ; CHECK: psrldq
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
-  ; CHECK: psrldq
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
 
 

diff --git a/test/CodeGen/X86/sse2-mul.ll b/test/CodeGen/X86/sse2-mul.ll
deleted file mode 100644
index e066368..0000000
--- a/test/CodeGen/X86/sse2-mul.ll
+++ /dev/null

@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
-
-define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) {
-  %m = mul <4 x i32> %x, %y
-  ret <4 x i32> %m
-; CHECK-LABEL: test1:
-; CHECK: pshufd $49
-; CHECK: pmuludq
-; CHECK: pshufd $49
-; CHECK: pmuludq
-; CHECK: shufps $-120
-; CHECK: pshufd $-40
-; CHECK: ret
-}

diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index e8d3d6f..b7db6cb 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll

@@ -2,39 +2,48 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
 
 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
+; CHECK-LABEL: test1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movapd (%ecx), %xmm0
+; CHECK-NEXT:    movlpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    movapd %xmm0, (%eax)
+; CHECK-NEXT:    retl
 	%tmp3 = load <2 x double>* %A, align 16
 	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
 	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
 	store <2 x double> %tmp9, <2 x double>* %r, align 16
 	ret void
-
-; CHECK-LABEL: test1:
-; CHECK: 	movl	4(%esp), %eax
-; CHECK-NEXT: 	movl	8(%esp), %ecx
-; CHECK-NEXT: 	movapd	(%ecx), %xmm0
-; CHECK-NEXT: 	movlpd	12(%esp), %xmm0
-; CHECK-NEXT: 	movapd	%xmm0, (%eax)
-; CHECK-NEXT: 	ret
 }
 
 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
+; CHECK-LABEL: test2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movapd (%ecx), %xmm0
+; CHECK-NEXT:    movhpd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    movapd %xmm0, (%eax)
+; CHECK-NEXT:    retl
 	%tmp3 = load <2 x double>* %A, align 16
 	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
 	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
 	store <2 x double> %tmp9, <2 x double>* %r, align 16
 	ret void
-
-; CHECK-LABEL: test2:
-; CHECK: 	movl	4(%esp), %eax
-; CHECK: 	movl	8(%esp), %ecx
-; CHECK-NEXT: 	movapd	(%ecx), %xmm0
-; CHECK-NEXT: 	movhpd	12(%esp), %xmm0
-; CHECK-NEXT: 	movapd	%xmm0, (%eax)
-; CHECK-NEXT: 	ret
 }
 
 
 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
+; CHECK-LABEL: test3:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movaps (%edx), %xmm0
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
 	%tmp = load <4 x float>* %B		; <<4 x float>> [#uses=2]
 	%tmp3 = load <4 x float>* %A		; <<4 x float>> [#uses=2]
 	%tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0		; <float> [#uses=1]
@@ -47,24 +56,30 @@
 	%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3		; <<4 x float>> [#uses=1]
 	store <4 x float> %tmp13, <4 x float>* %res
 	ret void
-; CHECK: @test3
-; CHECK: 	unpcklps
 }
 
 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
+; CHECK-LABEL: test4:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
 	%tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
 	store <4 x float> %tmp5, <4 x float>* %res
 	ret void
-; CHECK: @test4
-; CHECK: 	pshufd	$50, %xmm0, %xmm0
 }
 
 define <4 x i32> @test5(i8** %ptr) nounwind {
 ; CHECK-LABEL: test5:
-; CHECK: pxor
-; CHECK: punpcklbw
-; CHECK: punpcklwd
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    movss (%eax), %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    retl
 	%tmp = load i8** %ptr		; <i8*> [#uses=1]
 	%tmp.upgrd.1 = bitcast i8* %tmp to float*		; <float*> [#uses=1]
 	%tmp.upgrd.2 = load float* %tmp.upgrd.1		; <float> [#uses=1]
@@ -81,30 +96,39 @@
 }
 
 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
-        %tmp1 = load <4 x float>* %A            ; <<4 x float>> [#uses=1]
-        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
-        store <4 x float> %tmp2, <4 x float>* %res
-        ret void
-
 ; CHECK-LABEL: test6:
-; CHECK: 	movaps	(%ecx), %xmm0
-; CHECK:	movaps	%xmm0, (%eax)
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movaps (%ecx), %xmm0
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* %A            ; <<4 x float>> [#uses=1]
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
+  store <4 x float> %tmp2, <4 x float>* %res
+  ret void
 }
 
 define void @test7() nounwind {
-        bitcast <4 x i32> zeroinitializer to <4 x float>                ; <<4 x float>>:1 [#uses=1]
-        shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
-        store <4 x float> %2, <4 x float>* null
-        ret void
-
 ; CHECK-LABEL: test7:
-; CHECK:	xorps	%xmm0, %xmm0
-; CHECK:	movaps	%xmm0, 0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movaps %xmm0, 0
+; CHECK-NEXT:    retl
+  bitcast <4 x i32> zeroinitializer to <4 x float>                ; <<4 x float>>:1 [#uses=1]
+  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
+  store <4 x float> %2, <4 x float>* null
+  ret void
 }
 
 @x = external global [4 x i32]
 
 define <2 x i64> @test8() nounwind {
+; CHECK-LABEL: test8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl L_x$non_lazy_ptr, %eax
+; CHECK-NEXT:    movups (%eax), %xmm0
+; CHECK-NEXT:    retl
 	%tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0)		; <i32> [#uses=1]
 	%tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1)		; <i32> [#uses=1]
 	%tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2)		; <i32> [#uses=1]
@@ -115,90 +139,123 @@
 	%tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3		; <<4 x i32>> [#uses=1]
 	%tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>		; <<2 x i64>> [#uses=1]
 	ret <2 x i64> %tmp16
-; CHECK-LABEL: test8:
-; CHECK: movups	(%eax), %xmm0
 }
 
 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
+; CHECK-LABEL: test9:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    retl
 	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
 	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
 	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
 	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
 	ret <4 x float> %tmp13
-; CHECK-LABEL: test9:
-; CHECK: movups	8(%esp), %xmm0
 }
 
 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
+; CHECK-LABEL: test10:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    retl
 	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
 	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
 	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
 	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
 	ret <4 x float> %tmp13
-; CHECK-LABEL: test10:
-; CHECK: movaps	4(%esp), %xmm0
 }
 
 define <2 x double> @test11(double %a, double %b) nounwind {
+; CHECK-LABEL: test11:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    retl
 	%tmp = insertelement <2 x double> undef, double %a, i32 0		; <<2 x double>> [#uses=1]
 	%tmp7 = insertelement <2 x double> %tmp, double %b, i32 1		; <<2 x double>> [#uses=1]
 	ret <2 x double> %tmp7
-; CHECK-LABEL: test11:
-; CHECK: movaps	4(%esp), %xmm0
 }
 
 define void @test12() nounwind {
-        %tmp1 = load <4 x float>* null          ; <<4 x float>> [#uses=2]
-        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
-        %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
-        %tmp4 = fadd <4 x float> %tmp2, %tmp3            ; <<4 x float>> [#uses=1]
-        store <4 x float> %tmp4, <4 x float>* null
-        ret void
 ; CHECK-LABEL: test12:
-; CHECK: movhlps
-; CHECK: shufps
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movapd 0, %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    xorpd %xmm2, %xmm2
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    movaps %xmm0, 0
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* null          ; <<4 x float>> [#uses=2]
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
+  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
+  %tmp4 = fadd <4 x float> %tmp2, %tmp3            ; <<4 x float>> [#uses=1]
+  store <4 x float> %tmp4, <4 x float>* null
+  ret void
 }
 
 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
-        %tmp3 = load <4 x float>* %B            ; <<4 x float>> [#uses=1]
-        %tmp5 = load <4 x float>* %C            ; <<4 x float>> [#uses=1]
-        %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >         ; <<4 x float>> [#uses=1]
-        store <4 x float> %tmp11, <4 x float>* %res
-        ret void
-; CHECK: test13
-; CHECK: shufps	$69, (%ecx), %xmm0
-; CHECK: pshufd	$-40, %xmm0, %xmm0
+; CHECK-LABEL: test13:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movaps (%edx), %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
+  %tmp3 = load <4 x float>* %B            ; <<4 x float>> [#uses=1]
+  %tmp5 = load <4 x float>* %C            ; <<4 x float>> [#uses=1]
+  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >         ; <<4 x float>> [#uses=1]
+  store <4 x float> %tmp11, <4 x float>* %res
+  ret void
 }
 
 define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
-        %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=2]
-        %tmp5 = load <4 x float>* %x            ; <<4 x float>> [#uses=2]
-        %tmp9 = fadd <4 x float> %tmp5, %tmp             ; <<4 x float>> [#uses=1]
-        %tmp21 = fsub <4 x float> %tmp5, %tmp            ; <<4 x float>> [#uses=1]
-        %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
-        ret <4 x float> %tmp27
 ; CHECK-LABEL: test14:
-; CHECK: 	addps	[[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]]
-; CHECK: 	subps	[[X1]], [[X2:%xmm[0-9]+]]
-; CHECK: 	movlhps	[[X2]], [[X0]]
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movaps (%ecx), %xmm1
+; CHECK-NEXT:    movaps (%eax), %xmm2
+; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    subps %xmm1, %xmm2
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    retl
+  %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=2]
+  %tmp5 = load <4 x float>* %x            ; <<4 x float>> [#uses=2]
+  %tmp9 = fadd <4 x float> %tmp5, %tmp             ; <<4 x float>> [#uses=1]
+  %tmp21 = fsub <4 x float> %tmp5, %tmp            ; <<4 x float>> [#uses=1]
+  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
+  ret <4 x float> %tmp27
 }
 
 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
-entry:
-        %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=1]
-        %tmp3 = load <4 x float>* %x            ; <<4 x float>> [#uses=1]
-        %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >           ; <<4 x float>> [#uses=1]
-        ret <4 x float> %tmp4
 ; CHECK-LABEL: test15:
-; CHECK: 	movhlps	%xmm1, %xmm0
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movapd (%ecx), %xmm0
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT:    retl
+entry:
+  %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=1]
+  %tmp3 = load <4 x float>* %x            ; <<4 x float>> [#uses=1]
+  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >           ; <<4 x float>> [#uses=1]
+  ret <4 x float> %tmp4
 }
 
 ; PR8900
-; CHECK-LABEL: test16:
-; CHECK: unpcklpd
-; CHECK: ret
 
 define  <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
+; CHECK-LABEL: test16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movapd 96(%eax), %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    retl
   %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3
   %i6 = load <4 x double>* %i5, align 32
   %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
@@ -207,6 +264,11 @@
 
 ; PR9009
 define fastcc void @test17() nounwind {
+; CHECK-LABEL: test17:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
+; CHECK-NEXT:    movaps %xmm0, (%eax)
+; CHECK-NEXT:    retl
 entry:
   %0 = insertelement <4 x i32> undef, i32 undef, i32 1
   %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -217,25 +279,48 @@
 
 ; PR9210
 define <4 x float> @f(<4 x double>) nounwind {
+; CHECK-LABEL: f:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
+; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retl
 entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
 }
 
 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
-; CHECK-LABEL: test_insert_64_zext
-; CHECK-NOT: xor
-; CHECK: movq
+; CHECK-LABEL: test_insert_64_zext:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movq %xmm0, %xmm0
+; CHECK-NEXT:    retl
   %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %1
 }
 
 define <4 x i32> @PR19721(<4 x i32> %i) {
+; CHECK-LABEL: PR19721:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    retl
   %bc = bitcast <4 x i32> %i to i128
   %insert = and i128 %bc, -4294967296
   %bc2 = bitcast i128 %insert to <4 x i32>
   ret <4 x i32> %bc2
+}
 
-; CHECK-LABEL: PR19721
-; CHECK: punpckldq
+define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: test_mul:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT:    pmuludq %xmm1, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-NEXT:    pmuludq %xmm2, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; CHECK-NEXT:    retl
+  %m = mul <4 x i32> %x, %y
+  ret <4 x i32> %m
 }

diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
index b7706cc..5b2de28 100644
--- a/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
 ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
 
 

diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll
index 8b66743..431588f 100644
--- a/test/CodeGen/X86/sse3-avx-addsub.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK
+; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK
 ; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -check-prefix=CHECK
 
 ; Test ADDSUB ISel patterns.
@@ -141,156 +141,3 @@
 ; AVX: vaddsubpd
 ; CHECK-NEXT: ret
 
-; Functions below are obtained from the following source:
-;
-; float4 test1(float4 A, float4 B) {
-;   float4 X = A + B;
-;   float4 Y = A - B;
-;   return (float4){X[0], Y[1], X[2], Y[3]};
-; }
-;
-; float8 test2(float8 A, float8 B) {
-;   float8 X = A + B;
-;   float8 Y = A - B;
-;   return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]};
-; }
-;
-; double4 test3(double4 A, double4 B) {
-;   double4 X = A + B;
-;   double4 Y = A - B;
-;   return (double4){X[0], Y[1], X[2], Y[3]};
-; }
-;
-; double2 test4(double2 A, double2 B) {
-;   double2 X = A + B;
-;   double2 Y = A - B;
-;   return (double2){X[0], Y[1]};
-; }
-
-define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
-  %sub = fsub <4 x float> %A, %B
-  %add = fadd <4 x float> %A, %B
-  %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x float> %vecinit6
-}
-; CHECK-LABEL: test5
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; CHECK: ret
-
-
-define <8 x float> @test6(<8 x float> %A, <8 x float> %B) {
-  %sub = fsub <8 x float> %A, %B
-  %add = fadd <8 x float> %A, %B
-  %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
-  ret <8 x float> %vecinit14
-}
-; CHECK-LABEL: test6
-; SSE: xorps
-; SSE-NEXT: addsubps
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; AVX-NOT: vxorps
-; AVX-NOT: vaddsubps
-; CHECK: ret
-
-
-define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
-  %sub = fsub <4 x double> %A, %B
-  %add = fadd <4 x double> %A, %B
-  %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x double> %vecinit6
-}
-; CHECK-LABEL: test7
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; AVX-NOT: vxorpd
-; AVX-NOT: vaddsubpd
-; CHECK: ret
-
-
-define <2 x double> @test8(<2 x double> %A, <2 x double> %B) #0 {
-  %add = fadd <2 x double> %A, %B
-  %sub = fsub <2 x double> %A, %B
-  %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %vecinit2
-}
-; CHECK-LABEL: test8
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; CHECK: ret
-
-
-define <4 x float> @test5b(<4 x float> %A, <4 x float> %B) {
-  %sub = fsub <4 x float> %A, %B
-  %add = fadd <4 x float> %B, %A
-  %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x float> %vecinit6
-}
-; CHECK-LABEL: test5
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; CHECK: ret
-
-
-define <8 x float> @test6b(<8 x float> %A, <8 x float> %B) {
-  %sub = fsub <8 x float> %A, %B
-  %add = fadd <8 x float> %B, %A
-  %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
-  ret <8 x float> %vecinit14
-}
-; CHECK-LABEL: test6
-; SSE: xorps
-; SSE-NEXT: addsubps
-; SSE: xorps
-; SSE-NEXT: addsubps
-; AVX: vxorps
-; AVX-NEXT: vaddsubps
-; AVX-NOT: vxorps
-; AVX-NOT: vaddsubps
-; CHECK: ret
-
-
-define <4 x double> @test7b(<4 x double> %A, <4 x double> %B) {
-  %sub = fsub <4 x double> %A, %B
-  %add = fadd <4 x double> %B, %A
-  %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-  ret <4 x double> %vecinit6
-}
-; CHECK-LABEL: test7
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; AVX-NOT: vxorpd
-; AVX-NOT: vaddsubpd
-; CHECK: ret
-
-
-define <2 x double> @test8b(<2 x double> %A, <2 x double> %B) #0 {
-  %add = fadd <2 x double> %B, %A
-  %sub = fsub <2 x double> %A, %B
-  %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3>
-  ret <2 x double> %vecinit2
-}
-; CHECK-LABEL: test8
-; SSE: xorpd
-; SSE-NEXT: addsubpd
-; AVX: vxorpd
-; AVX-NEXT: vaddsubpd
-; CHECK: ret
-

diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 18bdcb3..0a5b0ca 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll

@@ -1,99 +1,120 @@
 ; These are tests for SSE3 codegen.
 
-; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 \
-; RUN:              | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 | FileCheck %s --check-prefix=X64
 
 ; Test for v8xi16 lowering where we extract the first element of the vector and
 ; placed it in the second element of the result.
 
 define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
+; X64-LABEL: t0:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movl $1, %eax
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT:    movdqa %xmm0, (%rdi)
+; X64-NEXT:    retq
 entry:
 	%tmp3 = load <8 x i16>* %old
 	%tmp6 = shufflevector <8 x i16> %tmp3,
-                <8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
+                <8 x i16> < i16 1, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >,
                 <8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef  >
 	store <8 x i16> %tmp6, <8 x i16>* %dest
 	ret void
-
-; X64-LABEL: t0:
-; X64:	movdqa	(%rsi), %xmm0
-; X64:	pslldq	$2, %xmm0
-; X64:	movdqa	%xmm0, (%rdi)
-; X64:	ret
 }
 
 define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+; X64-LABEL: t1:
+; X64:       ## BB#0:
+; X64-NEXT:    movdqa (%rdi), %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT:    retq
 	%tmp1 = load <8 x i16>* %A
 	%tmp2 = load <8 x i16>* %B
 	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
 	ret <8 x i16> %tmp3
 
-; X64-LABEL: t1:
-; X64: 	movdqa	(%rdi), %xmm0
-; X64: 	pinsrw	$0, (%rsi), %xmm0
-; X64: 	ret
 }
 
 define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t2:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,3,4,5,6,7]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
 	ret <8 x i16> %tmp
-; X64-LABEL: t2:
-; X64:	pextrw	$1, %xmm1, %eax
-; X64:	pinsrw	$0, %eax, %xmm0
-; X64:	pinsrw	$3, %eax, %xmm0
-; X64:	ret
 }
 
 define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t3:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
 	ret <8 x i16> %tmp
-; X64-LABEL: t3:
-; X64: 	pextrw	$5, %xmm0, %eax
-; X64: 	pshuflw	$44, %xmm0, %xmm0
-; X64: 	pshufhw	$27, %xmm0, %xmm0
-; X64: 	pinsrw	$3, %eax, %xmm0
-; X64: 	ret
 }
 
 define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t4:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7]
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
 	ret <8 x i16> %tmp
-; X64-LABEL: t4:
-; X64: 	pextrw	$7, [[XMM0:%xmm[0-9]+]], %eax
-; X64: 	pshufhw	$100, [[XMM0]], [[XMM1:%xmm[0-9]+]]
-; X64: 	pinsrw	$1, %eax, [[XMM1]]
-; X64: 	pextrw	$1, [[XMM0]], %eax
-; X64: 	pinsrw	$4, %eax, %xmm{{[0-9]}}
-; X64: 	ret
 }
 
 define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t5:
+; X64:       ## BB#0:
+; X64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT:    movdqa %xmm1, %xmm0
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
 	ret <8 x i16> %tmp
-; X64: 	t5:
-; X64: 		movlhps	%xmm1, %xmm0
-; X64: 		pshufd	$114, %xmm0, %xmm0
-; X64: 		ret
 }
 
 define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t6:
+; X64:       ## BB#0:
+; X64-NEXT:    movss %xmm1, %xmm0
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
 	ret <8 x i16> %tmp
-; X64: 	t6:
-; X64: 		movss	%xmm1, %xmm0
-; X64: 		ret
 }
 
 define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
+; X64-LABEL: t7:
+; X64:       ## BB#0:
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
+; X64-NEXT:    retq
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
 	ret <8 x i16> %tmp
-; X64: 	t7:
-; X64: 		pshuflw	$-80, %xmm0, %xmm0
-; X64: 		pshufhw	$-56, %xmm0, %xmm0
-; X64: 		ret
 }
 
 define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
+; X64-LABEL: t8:
+; X64:       ## BB#0:
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; X64-NEXT:    movdqa %xmm0, (%rdi)
+; X64-NEXT:    retq
 	%tmp = load <2 x i64>* %A
 	%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>
 	%tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0
@@ -115,14 +136,15 @@
 	%tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64>
 	store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res
 	ret void
-; X64: 	t8:
-; X64: 		pshuflw	$-58, (%rsi), %xmm0
-; X64: 		pshufhw	$-58, %xmm0, %xmm0
-; X64: 		movdqa	%xmm0, (%rdi)
-; X64: 		ret
 }
 
 define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
+; X64-LABEL: t9:
+; X64:       ## BB#0:
+; X64-NEXT:    movapd (%rdi), %xmm0
+; X64-NEXT:    movhpd (%rsi), %xmm0
+; X64-NEXT:    movapd %xmm0, (%rdi)
+; X64-NEXT:    retq
 	%tmp = load <4 x float>* %r
 	%tmp.upgrd.3 = bitcast <2 x i32>* %A to double*
 	%tmp.upgrd.4 = load double* %tmp.upgrd.3
@@ -139,11 +161,6 @@
 	%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3
 	store <4 x float> %tmp13, <4 x float>* %r
 	ret void
-; X64: 	t9:
-; X64: 		movaps	(%rdi), %xmm0
-; X64:	        movhps	(%rsi), %xmm0
-; X64:	        movaps	%xmm0, (%rdi)
-; X64: 		ret
 }
 
 
@@ -154,113 +171,121 @@
 @g1 = external constant <4 x i32>
 @g2 = external constant <4 x i16>
 
-define internal void @t10() nounwind {
-        load <4 x i32>* @g1, align 16
-        bitcast <4 x i32> %1 to <8 x i16>
-        shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
-        bitcast <8 x i16> %3 to <2 x i64>
-        extractelement <2 x i64> %4, i32 0
-        bitcast i64 %5 to <4 x i16>
-        store <4 x i16> %6, <4 x i16>* @g2, align 8
-        ret void
-; X64: 	t10:
-; X64: 		pextrw	$4, [[X0:%xmm[0-9]+]], %e{{..}}
-; X64: 		pextrw	$6, [[X0]], %e{{..}}
-; X64: 		movlhps [[X0]], [[X0]]
-; X64: 		pshuflw	$8, [[X0]], [[X0]]
-; X64: 		pinsrw	$2, %e{{..}}, [[X0]]
-; X64: 		pinsrw	$3, %e{{..}}, [[X0]]
+define void @t10() nounwind {
+; X64-LABEL: t10:
+; X64:       ## BB#0:
+; X64-NEXT:    movq _g1@{{.*}}(%rip), %rax
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    movq _g2@{{.*}}(%rip), %rax
+; X64-NEXT:    movq %xmm0, (%rax)
+; X64-NEXT:    retq
+  load <4 x i32>* @g1, align 16
+  bitcast <4 x i32> %1 to <8 x i16>
+  shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef >
+  bitcast <8 x i16> %3 to <2 x i64>
+  extractelement <2 x i64> %4, i32 0
+  bitcast i64 %5 to <4 x i16>
+  store <4 x i16> %6, <4 x i16>* @g2, align 8
+  ret void
 }
 
-
 ; Pack various elements via shuffles.
 define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t11:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; X64-NEXT:    retq
 entry:
 	%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
 	ret <8 x i16> %tmp7
 
-; X64-LABEL: t11:
-; X64:	movd	%xmm1, %eax
-; X64:	movlhps	%xmm0, %xmm0
-; X64:	pshuflw	$1, %xmm0, %xmm0
-; X64:	pinsrw	$1, %eax, %xmm0
-; X64:	ret
 }
 
-
 define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t12:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT:    retq
 entry:
 	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
 	ret <8 x i16> %tmp9
 
-; X64-LABEL: t12:
-; X64: 	pextrw	$3, %xmm1, %eax
-; X64: 	movlhps	%xmm0, %xmm0
-; X64: 	pshufhw	$3, %xmm0, %xmm0
-; X64: 	pinsrw	$5, %eax, %xmm0
-; X64: 	ret
 }
 
-
 define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t13:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7]
+; X64-NEXT:    retq
 entry:
 	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef >
 	ret <8 x i16> %tmp9
-; X64-LABEL: t13:
-; X64: 	punpcklqdq	%xmm0, %xmm1
-; X64: 	pextrw	$3, %xmm1, %eax
-; X64: 	pshufhw	$12, %xmm1, %xmm0
-; X64: 	pinsrw	$4, %eax, %xmm0
-; X64: 	ret
 }
 
-
 define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t14:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT:    retq
 entry:
 	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef >
 	ret <8 x i16> %tmp9
-; X64-LABEL: t14:
-; X64: 	punpcklqdq	%xmm0, %xmm1
-; X64: 	pshufhw	$8, %xmm1, %xmm0
-; X64: 	ret
 }
 
-
 ; FIXME: t15 is worse off from disabling of scheduler 2-address hack.
 define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
+; X64-LABEL: t15:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; X64-NEXT:    retq
 entry:
-        %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
-        ret <8 x i16> %tmp8
-; X64: 	t15:
-; X64: 		pextrw	$7, %xmm0, %eax
-; X64: 		punpcklqdq	%xmm1, %xmm0
-; X64: 		pshuflw	$-128, %xmm0, %xmm0
-; X64: 		pinsrw	$2, %eax, %xmm0
-; X64: 		ret
+  %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
+  ret <8 x i16> %tmp8
 }
 
-
 ; Test yonah where we convert a shuffle to pextrw and pinrsw
 define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
+; X64-LABEL: t16:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT:    pxor %xmm2, %xmm2
+; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    packuswb %xmm0, %xmm0
+; X64-NEXT:    retq
 entry:
-        %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0,  i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
-        %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0,  <16 x i32> < i32 0, i32 1, i32 2, i32 17,  i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
-        ret <16 x i8> %tmp9
-; X64: 	t16:
-; X64: 		pextrw	$8, %xmm0, %eax
-; X64: 		pslldq	$2, %xmm0
-; X64: 		pextrw	$1, %xmm0, %ecx
-; X64: 		movzbl	%cl, %ecx
-; X64: 		orl	%eax, %ecx
-; X64: 		pinsrw	$1, %ecx, %xmm0
-; X64: 		ret
+  %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0,  i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
+  %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0,  <16 x i32> < i32 0, i32 1, i32 2, i32 17,  i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
+  ret <16 x i8> %tmp9
 }
 
 ; rdar://8520311
 define <4 x i32> @t17() nounwind {
-entry:
 ; X64-LABEL: t17:
-; X64:          movddup (%rax), %xmm0
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movddup (%rax), %xmm0
+; X64-NEXT:    andpd {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
+entry:
   %tmp1 = load <4 x float>* undef, align 16
   %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   %tmp3 = load <4 x float>* undef, align 16

diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
deleted file mode 100644
index 3a48121..0000000
--- a/test/CodeGen/X86/sse41-blend.ll
+++ /dev/null

@@ -1,140 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
-
-;CHECK-LABEL: vsel_float:
-;CHECK: blendps
-;CHECK: ret
-define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2
-  ret <4 x float> %vsel
-}
-
-
-;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendps
-;CHECK: ret
-define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
-  ret <4 x i8> %vsel
-}
-
-;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendps
-;CHECK: ret
-define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
-  ret <4 x i16> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i32:
-;CHECK: blendps
-;CHECK: ret
-define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
-  ret <4 x i32> %vsel
-}
-
-
-;CHECK-LABEL: vsel_double:
-;CHECK: movsd
-;CHECK: ret
-define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %v1, <4 x double> %v2
-  ret <4 x double> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i64:
-;CHECK: movsd
-;CHECK: ret
-define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) {
-  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %v1, <4 x i64> %v2
-  ret <4 x i64> %vsel
-}
-
-
-;CHECK-LABEL: vsel_i8:
-;CHECK: pblendvb
-;CHECK: ret
-define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
-  %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
-  ret <16 x i8> %vsel
-}
-
-;; TEST blend + compares
-; CHECK: A
-define <2 x double> @A(<2 x double> %x, <2 x double> %y) {
-  ; CHECK: cmplepd
-  ; CHECK: blendvpd
-  %max_is_x = fcmp oge <2 x double> %x, %y
-  %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
-  ret <2 x double> %max
-}
-
-; CHECK: B
-define <2 x double> @B(<2 x double> %x, <2 x double> %y) {
-  ; CHECK: cmpnlepd
-  ; CHECK: blendvpd
-  %min_is_x = fcmp ult <2 x double> %x, %y
-  %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
-  ret <2 x double> %min
-}
-
-; CHECK: float_crash
-define void @float_crash() nounwind {
-entry:
-  %merge205vector_func.i = select <4 x i1> undef, <4 x double> undef, <4 x double> undef
-  %extract214vector_func.i = extractelement <4 x double> %merge205vector_func.i, i32 0
-  store double %extract214vector_func.i, double addrspace(1)* undef, align 8
-  ret void
-}
-
-; If we can figure out a blend has a constant mask, we should emit the
-; blend instruction with an immediate mask
-define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
-; In this case, we emit a simple movss
-; CHECK-LABEL: constant_blendvpd
-; CHECK: movsd
-; CHECK: ret
-  %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab
-  ret <2 x double> %1
-}
-
-define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
-; CHECK-LABEL: constant_blendvps
-; CHECK-NOT: mov
-; CHECK: blendps $7
-; CHECK: ret
-  %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %xyzw, <4 x float> %abcd
-  ret <4 x float> %1
-}
-
-define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
-; CHECK-LABEL: constant_pblendvb:
-; CHECK: movaps
-; CHECK: pblendvb
-; CHECK: ret
-  %1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd
-  ret <16 x i8> %1
-}
-
-declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
-declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
-
-;; 2 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_shufflevector_4xfloat
-; CHECK: blendps $6, %xmm1, %xmm0
-; CHECK: ret
-define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
-  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-  ret <4 x float> %1
-}
-
-; CHECK-LABEL: @blend_shufflevector_8xi16
-; CHECK: pblendw $134, %xmm1, %xmm0
-; CHECK: ret
-define <8 x i16> @blend_shufflevector_8xi16(<8 x i16> %a, <8 x i16> %b) {
-  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 15>
-  ret <8 x i16> %1
-}

diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
new file mode 100644
index 0000000..6fab98e
--- /dev/null
+++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll

@@ -0,0 +1,61 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s
+; This test works just like the non-upgrade one except that it only checks
+; forms which require auto-upgrading.
+
+define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: blendpd
+  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: blendps
+  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: dppd
+  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: dpps
+  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: insertps
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+
+define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: mpsadbw
+  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pblendw
+  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+
+

diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
index 37eff43..5f25a16 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll

@@ -2,18 +2,18 @@
 
 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
   ; CHECK: blendpd
-  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: blendps
-  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
@@ -34,35 +34,35 @@
 
 define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
   ; CHECK: dppd
-  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: dpps
-  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
   ; CHECK: insertps
-  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
 
 
 
 define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
   ; CHECK: mpsadbw
-  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
 }
-declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 
 define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
@@ -83,10 +83,10 @@
 
 define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
   ; CHECK: pblendw
-  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
   ret <8 x i16> %res
 }
-declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
 
 
 define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {

diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 6726a3e..d5c6f74 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll

@@ -1,30 +1,47 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
 
 @g16 = external global i16
 
 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
-        %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
-        ret <4 x i32> %tmp1
 ; X32-LABEL: pinsrd_1:
-; X32:    pinsrd $1, 4(%esp), %xmm0
-
+; X32:       ## BB#0:
+; X32-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    retl
+;
 ; X64-LABEL: pinsrd_1:
-; X64:    pinsrd $1, %edi, %xmm0
+; X64:       ## BB#0:
+; X64-NEXT:    pinsrd $1, %edi, %xmm0
+; X64-NEXT:    retq
+  %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
+  ret <4 x i32> %tmp1
 }
 
 define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
-        %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
-        ret <16 x i8> %tmp1
 ; X32-LABEL: pinsrb_1:
-; X32:    pinsrb $1, 4(%esp), %xmm0
-
+; X32:       ## BB#0:
+; X32-NEXT:    pinsrb $1, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    retl
+;
 ; X64-LABEL: pinsrb_1:
-; X64:    pinsrb $1, %edi, %xmm0
+; X64:       ## BB#0:
+; X64-NEXT:    pinsrb $1, %edi, %xmm0
+; X64-NEXT:    retq
+  %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
+  ret <16 x i8> %tmp1
 }
 
-
 define <2 x i64> @pmovsxbd_1(i32* %p) nounwind {
+; X32-LABEL: pmovsxbd_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pmovsxbd (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pmovsxbd_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    pmovsxbd (%rdi), %xmm0
+; X64-NEXT:    retq
 entry:
 	%0 = load i32* %p, align 4
 	%1 = insertelement <4 x i32> undef, i32 %0, i32 0
@@ -35,16 +52,19 @@
 	%6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone
 	%7 = bitcast <4 x i32> %6 to <2 x i64>
 	ret <2 x i64> %7
-        
-; X32: _pmovsxbd_1:
-; X32:   movl      4(%esp), %eax
-; X32:   pmovsxbd   (%eax), %xmm0
-
-; X64: _pmovsxbd_1:
-; X64:   pmovsxbd   (%rdi), %xmm0
 }
 
 define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly {
+; X32-LABEL: pmovsxwd_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pmovsxwd (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pmovsxwd_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    pmovsxwd (%rdi), %xmm0
+; X64-NEXT:    retq
 entry:
 	%0 = load i64* %p		; <i64> [#uses=1]
 	%tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0		; <<2 x i64>> [#uses=1]
@@ -52,63 +72,59 @@
 	%2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone		; <<4 x i32>> [#uses=1]
 	%3 = bitcast <4 x i32> %2 to <2 x i64>		; <<2 x i64>> [#uses=1]
 	ret <2 x i64> %3
-        
-; X32: _pmovsxwd_1:
-; X32:   movl 4(%esp), %eax
-; X32:   pmovsxwd (%eax), %xmm0
-
-; X64: _pmovsxwd_1:
-; X64:   pmovsxwd (%rdi), %xmm0
 }
 
-
-
-
 define <2 x i64> @pmovzxbq_1() nounwind {
+; X32-LABEL: pmovzxbq_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl L_g16$non_lazy_ptr, %eax
+; X32-NEXT:    pmovzxbq (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pmovzxbq_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movq _g16@{{.*}}(%rip), %rax
+; X64-NEXT:    pmovzxbq (%rax), %xmm0
+; X64-NEXT:    retq
 entry:
 	%0 = load i16* @g16, align 2		; <i16> [#uses=1]
 	%1 = insertelement <8 x i16> undef, i16 %0, i32 0		; <<8 x i16>> [#uses=1]
 	%2 = bitcast <8 x i16> %1 to <16 x i8>		; <<16 x i8>> [#uses=1]
 	%3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone		; <<2 x i64>> [#uses=1]
 	ret <2 x i64> %3
-
-; X32: _pmovzxbq_1:
-; X32:   movl	L_g16$non_lazy_ptr, %eax
-; X32:   pmovzxbq	(%eax), %xmm0
-
-; X64: _pmovzxbq_1:
-; X64:   movq	_g16@GOTPCREL(%rip), %rax
-; X64:   pmovzxbq	(%rax), %xmm0
 }
 
 declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
 declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
 
-
-
-
 define i32 @extractps_1(<4 x float> %v) nounwind {
+; X32-LABEL: extractps_1:
+; X32:       ## BB#0:
+; X32-NEXT:    extractps $3, %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: extractps_1:
+; X64:       ## BB#0:
+; X64-NEXT:    extractps $3, %xmm0, %eax
+; X64-NEXT:    retq
   %s = extractelement <4 x float> %v, i32 3
   %i = bitcast float %s to i32
   ret i32 %i
-
-; X32: _extractps_1:  
-; X32:	  extractps	$3, %xmm0, %eax
-
-; X64: _extractps_1:  
-; X64:	  extractps	$3, %xmm0, %eax
 }
 define i32 @extractps_2(<4 x float> %v) nounwind {
+; X32-LABEL: extractps_2:
+; X32:       ## BB#0:
+; X32-NEXT:    extractps $3, %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: extractps_2:
+; X64:       ## BB#0:
+; X64-NEXT:    extractps $3, %xmm0, %eax
+; X64-NEXT:    retq
   %t = bitcast <4 x float> %v to <4 x i32>
   %s = extractelement <4 x i32> %t, i32 3
   ret i32 %s
-
-; X32: _extractps_2:
-; X32:	  extractps	$3, %xmm0, %eax
-
-; X64: _extractps_2:
-; X64:	  extractps	$3, %xmm0, %eax
 }
 
 
@@ -117,106 +133,152 @@
 ; is bitcasted to i32, but unsuitable for much of anything else.
 
 define float @ext_1(<4 x float> %v) nounwind {
+; X32-LABEL: ext_1:
+; X32:       ## BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-NEXT:    addss LCPI7_0, %xmm0
+; X32-NEXT:    movss %xmm0, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ext_1:
+; X64:       ## BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT:    addss {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
   %s = extractelement <4 x float> %v, i32 3
   %t = fadd float %s, 1.0
   ret float %t
-
-; X32: _ext_1:
-; X32:	  pshufd	$3, %xmm0, %xmm0
-; X32:	  addss	LCPI7_0, %xmm0
-
-; X64: _ext_1:
-; X64:	  pshufd	$3, %xmm0, %xmm0
-; X64:	  addss	LCPI7_0(%rip), %xmm0
 }
 define float @ext_2(<4 x float> %v) nounwind {
+; X32-LABEL: ext_2:
+; X32:       ## BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-NEXT:    movss %xmm0, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ext_2:
+; X64:       ## BB#0:
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-NEXT:    retq
   %s = extractelement <4 x float> %v, i32 3
   ret float %s
-
-; X32: _ext_2:
-; X32:	  pshufd	$3, %xmm0, %xmm0
-
-; X64: _ext_2:
-; X64:	  pshufd	$3, %xmm0, %xmm0
 }
 define i32 @ext_3(<4 x i32> %v) nounwind {
+; X32-LABEL: ext_3:
+; X32:       ## BB#0:
+; X32-NEXT:    pextrd $3, %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ext_3:
+; X64:       ## BB#0:
+; X64-NEXT:    pextrd $3, %xmm0, %eax
+; X64-NEXT:    retq
   %i = extractelement <4 x i32> %v, i32 3
   ret i32 %i
-
-; X32: _ext_3:
-; X32:	  pextrd	$3, %xmm0, %eax
-
-; X64: _ext_3:
-; X64:	  pextrd	$3, %xmm0, %eax
 }
 
 define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
-        %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
-        ret <4 x float> %tmp1
-; X32: _insertps_1:
-; X32:    insertps  $1, %xmm1, %xmm0
-
-; X64: _insertps_1:
-; X64:    insertps  $1, %xmm1, %xmm0
+; X32-LABEL: insertps_1:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_1:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3]
+; X64-NEXT:    retq
+  %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone
+  ret <4 x float> %tmp1
 }
 
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
 define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-        %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
-        ret <4 x float> %tmp1
-; X32: _insertps_2:
-; X32:    insertps  $0, 4(%esp), %xmm0
-
-; X64: _insertps_2:
-; X64:    insertps  $0, %xmm1, %xmm0        
+; X32-LABEL: insertps_2:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps $0, {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_2:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
+  ret <4 x float> %tmp1
 }
-
 define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-        %tmp2 = extractelement <4 x float> %t2, i32 0
-        %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
-        ret <4 x float> %tmp1
-; X32: _insertps_3:
-; X32:    insertps  $0, %xmm1, %xmm0        
-
-; X64: _insertps_3:
-; X64:    insertps  $0, %xmm1, %xmm0        
+; X32-LABEL: insertps_3:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_3:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
+  %tmp2 = extractelement <4 x float> %t2, i32 0
+  %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
+  ret <4 x float> %tmp1
 }
 
 define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
-        %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
-        ret i32 %tmp1
-; X32: _ptestz_1:
-; X32:    ptest 	%xmm1, %xmm0
-; X32:    sete	%al
-
-; X64: _ptestz_1:
-; X64:    ptest 	%xmm1, %xmm0
-; X64:    sete	%al
+; X32-LABEL: ptestz_1:
+; X32:       ## BB#0:
+; X32-NEXT:    ptest %xmm1, %xmm0
+; X32-NEXT:    sete %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ptestz_1:
+; X64:       ## BB#0:
+; X64-NEXT:    ptest %xmm1, %xmm0
+; X64-NEXT:    sete %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
+  ret i32 %tmp1
 }
 
 define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
-        %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
-        ret i32 %tmp1
-; X32: _ptestz_2:
-; X32:    ptest 	%xmm1, %xmm0
-; X32:    sbbl	%eax
-
-; X64: _ptestz_2:
-; X64:    ptest 	%xmm1, %xmm0
-; X64:    sbbl	%eax
+; X32-LABEL: ptestz_2:
+; X32:       ## BB#0:
+; X32-NEXT:    ptest %xmm1, %xmm0
+; X32-NEXT:    sbbl %eax, %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ptestz_2:
+; X64:       ## BB#0:
+; X64-NEXT:    ptest %xmm1, %xmm0
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    retq
+  %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
+  ret i32 %tmp1
 }
 
 define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
-        %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
-        ret i32 %tmp1
-; X32: _ptestz_3:
-; X32:    ptest 	%xmm1, %xmm0
-; X32:    seta	%al
-
-; X64: _ptestz_3:
-; X64:    ptest 	%xmm1, %xmm0
-; X64:    seta	%al
+; X32-LABEL: ptestz_3:
+; X32:       ## BB#0:
+; X32-NEXT:    ptest %xmm1, %xmm0
+; X32-NEXT:    seta %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: ptestz_3:
+; X64:       ## BB#0:
+; X64-NEXT:    ptest %xmm1, %xmm0
+; X64-NEXT:    seta %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+  %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
+  ret i32 %tmp1
 }
 
 
@@ -227,6 +289,25 @@
 ; This used to compile to insertps $0  + insertps $16.  insertps $0 is always
 ; pointless.
 define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind  {
+; X32-LABEL: buildvector:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movaps %xmm0, %xmm2
+; X32-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X32-NEXT:    addss %xmm1, %xmm0
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT:    addss %xmm2, %xmm1
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: buildvector:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movaps %xmm0, %xmm2
+; X64-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; X64-NEXT:    addss %xmm1, %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    addss %xmm2, %xmm1
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64-NEXT:    retq
 entry:
   %tmp7 = extractelement <2 x float> %A, i32 0
   %tmp5 = extractelement <2 x float> %A, i32 1
@@ -237,97 +318,124 @@
   %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0
   %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1
   ret <2 x float> %tmp9
-; X32-LABEL: buildvector:
-; X32-NOT: insertps $0
-; X32: insertps $16
-; X32-NOT: insertps $0
-; X32: ret
-; X64-LABEL: buildvector:
-; X64-NOT: insertps $0
-; X64: insertps $16
-; X64-NOT: insertps $0
-; X64: ret
 }
 
 define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; X32-LABEL: insertps_from_shufflevector_1:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $48, (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_shufflevector_1:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    insertps $48, (%rdi), %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = load <4 x float>* %pb, align 16
   %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x float> %vecinit6
-; CHECK-LABEL: insertps_from_shufflevector_1:
-; CHECK-NOT: movss
-; CHECK-NOT: shufps
-; CHECK: insertps    $48,
-; CHECK: ret
 }
 
 define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
+; X32-LABEL: insertps_from_shufflevector_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_shufflevector_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
+; X64-NEXT:    retq
 entry:
   %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
   ret <4 x float> %vecinit6
-; CHECK-LABEL: insertps_from_shufflevector_2:
-; CHECK-NOT: shufps
-; CHECK: insertps    $96,
-; CHECK: ret
 }
 
 ; For loading an i32 from memory into an xmm register we use pinsrd
 ; instead of insertps
 define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
+; X32-LABEL: pinsrd_from_shufflevector_i32:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $48, (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pinsrd_from_shufflevector_i32:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    insertps $48, (%rdi), %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = load <4 x i32>* %pb, align 16
   %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x i32> %vecinit6
-; CHECK-LABEL: pinsrd_from_shufflevector_i32:
-; CHECK-NOT: movss
-; CHECK-NOT: shufps
-; CHECK: pinsrd  $3,
-; CHECK: ret
 }
 
 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
+; X32-LABEL: insertps_from_shufflevector_i32_2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_shufflevector_i32_2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3]
+; X64-NEXT:    retq
 entry:
   %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
   ret <4 x i32> %vecinit6
-; CHECK-LABEL: insertps_from_shufflevector_i32_2:
-; CHECK-NOT: shufps
-; CHECK-NOT: movaps
-; CHECK: insertps    $208,
-; CHECK: ret
 }
 
 define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
-; CHECK-LABEL: insertps_from_load_ins_elt_undef:
-; CHECK-NOT: movss
-; CHECK-NOT: shufps
-; CHECK: insertps    $16,
-; CHECK: ret
+; X32-LABEL: insertps_from_load_ins_elt_undef:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $16, (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_load_ins_elt_undef:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps $16, (%rdi), %xmm0
+; X64-NEXT:    retq
   %1 = load float* %b, align 4
   %2 = insertelement <4 x float> undef, float %1, i32 0
   %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
   ret <4 x float> %result
 }
 
-define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
-; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
 ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
-;; aCHECK-NOT: movd
-; CHECK-NOT: shufps
-; CHECK: insertps    $32,
-; CHECK: ret
+define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
+; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movd (%eax), %xmm1
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
+; X64:       ## BB#0:
+; X64-NEXT:    movd (%rdi), %xmm1
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X64-NEXT:    retq
   %1 = load i32* %b, align 4
   %2 = insertelement <4 x i32> undef, i32 %1, i32 0
   %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
   ret <4 x i32> %result
 }
 
-;;;;;; Shuffles optimizable with a single insertps instruction
+;;;;;; Shuffles optimizable with a single insertps or blend instruction
 define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XYZ0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $8
-; CHECK: ret
+; X32-LABEL: shuf_XYZ0:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_XYZ0:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -339,11 +447,15 @@
 }
 
 define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XY00:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $12
-; CHECK: ret
+; X32-LABEL: shuf_XY00:
+; X32:       ## BB#0:
+; X32-NEXT:    movq %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_XY00:
+; X64:       ## BB#0:
+; X64-NEXT:    movq %xmm0, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -354,11 +466,15 @@
 }
 
 define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XYY0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $104
-; CHECK: ret
+; X32-LABEL: shuf_XYY0:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_XYY0:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -369,9 +485,15 @@
 }
 
 define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_XYW0:
-; CHECK: insertps    $232
-; CHECK: ret
+; X32-LABEL: shuf_XYW0:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_XYW0:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -383,11 +505,15 @@
 }
 
 define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_W00W:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $198
-; CHECK: ret
+; X32-LABEL: shuf_W00W:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_W00W:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 3
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -397,11 +523,19 @@
 }
 
 define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_X00A:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps    $48
-; CHECK: ret
+; X32-LABEL: shuf_X00A:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_X00A:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -411,11 +545,21 @@
 }
 
 define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_X00X:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps    $48
-; CHECK: ret
+; X32-LABEL: shuf_X00X:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0]
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_X00X:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0]
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -425,12 +569,23 @@
 }
 
 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
-; CHECK-LABEL: shuf_X0YC:
-; CHECK: shufps
-; CHECK-NOT: movhlps
-; CHECK-NOT: shufps
-; CHECK: insertps    $176
-; CHECK: ret
+; X32-LABEL: shuf_X0YC:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X32-NEXT:    movaps %xmm2, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: shuf_X0YC:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X64-NEXT:    movaps %xmm2, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
@@ -440,11 +595,17 @@
 }
 
 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XYZ0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $8
-; CHECK: ret
+; X32-LABEL: i32_shuf_XYZ0:
+; X32:       ## BB#0:
+; X32-NEXT:    pxor %xmm1, %xmm1
+; X32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_XYZ0:
+; X64:       ## BB#0:
+; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecext1 = extractelement <4 x i32> %x, i32 1
@@ -456,11 +617,15 @@
 }
 
 define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XY00:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $12
-; CHECK: ret
+; X32-LABEL: i32_shuf_XY00:
+; X32:       ## BB#0:
+; X32-NEXT:    movq %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_XY00:
+; X64:       ## BB#0:
+; X64-NEXT:    movq %xmm0, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecext1 = extractelement <4 x i32> %x, i32 1
@@ -471,11 +636,15 @@
 }
 
 define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XYY0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $104
-; CHECK: ret
+; X32-LABEL: i32_shuf_XYY0:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_XYY0:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecext1 = extractelement <4 x i32> %x, i32 1
@@ -486,11 +655,15 @@
 }
 
 define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_XYW0:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $232
-; CHECK: ret
+; X32-LABEL: i32_shuf_XYW0:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_XYW0:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecext1 = extractelement <4 x i32> %x, i32 1
@@ -502,11 +675,15 @@
 }
 
 define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_W00W:
-; CHECK-NOT: pextrd
-; CHECK-NOT: punpckldq
-; CHECK: insertps    $198
-; CHECK: ret
+; X32-LABEL: i32_shuf_W00W:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_W00W:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 3
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -516,11 +693,19 @@
 }
 
 define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_X00A:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps    $48
-; CHECK: ret
+; X32-LABEL: i32_shuf_X00A:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_X00A:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -530,11 +715,21 @@
 }
 
 define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_X00X:
-; CHECK-NOT: movaps
-; CHECK-NOT: shufps
-; CHECK: insertps    $48
-; CHECK: ret
+; X32-LABEL: i32_shuf_X00X:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_X00X:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -544,12 +739,23 @@
 }
 
 define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
-; CHECK-LABEL: i32_shuf_X0YC:
-; CHECK: shufps
-; CHECK-NOT: movhlps
-; CHECK-NOT: shufps
-; CHECK: insertps    $176
-; CHECK: ret
+; X32-LABEL: i32_shuf_X0YC:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X32-NEXT:    movaps %xmm2, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: i32_shuf_X0YC:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2]
+; X64-NEXT:    movaps %xmm2, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x i32> %x, i32 0
   %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
@@ -560,11 +766,19 @@
 
 ;; Test for a bug in the first implementation of LowerBuildVectorv4x32
 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
-; CHECK-LABEL: test_insertps_no_undef:
-; CHECK: movaps  %xmm0, %xmm1
-; CHECK-NEXT: insertps        $8, %xmm1, %xmm1
-; CHECK-NEXT: maxps   %xmm1, %xmm0
-; CHECK-NEXT: ret
+; X32-LABEL: test_insertps_no_undef:
+; X32:       ## BB#0:
+; X32-NEXT:    xorps %xmm1, %xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
+; X32-NEXT:    maxps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_insertps_no_undef:
+; X64:       ## BB#0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
+; X64-NEXT:    maxps %xmm1, %xmm0
+; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %x, i32 1
@@ -578,48 +792,75 @@
 }
 
 define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
-; CHECK-LABEL: blendvb_fallback
-; CHECK: blendvb
-; CHECK: ret
+; X32-LABEL: blendvb_fallback:
+; X32:       ## BB#0:
+; X32-NEXT:    psllw $15, %xmm0
+; X32-NEXT:    psraw $15, %xmm0
+; X32-NEXT:    pblendvb %xmm1, %xmm2
+; X32-NEXT:    movdqa %xmm2, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: blendvb_fallback:
+; X64:       ## BB#0:
+; X64-NEXT:    psllw $15, %xmm0
+; X64-NEXT:    psraw $15, %xmm0
+; X64-NEXT:    pblendvb %xmm1, %xmm2
+; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    retq
   %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
   ret <8 x i16> %ret
 }
 
-define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; CHECK-LABEL: insertps_from_vector_load:
 ; On X32, account for the argument's move to registers
-; X32: movl    4(%esp), %eax
-; CHECK-NOT: mov
-; CHECK: insertps    $48
-; CHECK-NEXT: ret
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; X32-LABEL: insertps_from_vector_load:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $48, (%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_vector_load:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps $48, (%rdi), %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
   ret <4 x float> %2
 }
 
 ;; Use a non-zero CountS for insertps
-define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; CHECK-LABEL: insertps_from_vector_load_offset:
-; On X32, account for the argument's move to registers
-; X32: movl    4(%esp), %eax
-; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps    $96, 4(%{{...}}), %
-; CHECK-NEXT: ret
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; X32-LABEL: insertps_from_vector_load_offset:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $96, 4(%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_vector_load_offset:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps $96, 4(%rdi), %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
   ret <4 x float> %2
 }
 
-define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
-; CHECK-LABEL: insertps_from_vector_load_offset_2:
-; On X32, account for the argument's move to registers
-; X32: movl    4(%esp), %eax
-; X32: movl    8(%esp), %ecx
-; CHECK-NOT: mov
 ;; Try to match a bit more of the instr, since we need the load's offset.
-; CHECK: insertps    $192, 12(%{{...}},%{{...}}), %
-; CHECK-NEXT: ret
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; X32-LABEL: insertps_from_vector_load_offset_2:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    shll $4, %ecx
+; X32-NEXT:    insertps $-64, 12(%eax,%ecx), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_vector_load_offset_2:
+; X64:       ## BB#0:
+; X64-NEXT:    shlq $4, %rsi
+; X64-NEXT:    insertps $-64, 12(%rdi,%rsi), %xmm0
+; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
   %2 = load <4 x float>* %1, align 16
   %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
@@ -627,13 +868,21 @@
 }
 
 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
-; CHECK-LABEL: insertps_from_broadcast_loadf32:
-; On X32, account for the arguments' move to registers
-; X32: movl    8(%esp), %eax
-; X32: movl    4(%esp), %ecx
-; CHECK-NOT: mov
-; CHECK: insertps    $48
-; CHECK-NEXT: ret
+; X32-LABEL: insertps_from_broadcast_loadf32:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movss (%ecx,%eax,4), %xmm1
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_broadcast_loadf32:
+; X64:       ## BB#0:
+; X64-NEXT:    movss (%rdi,%rsi,4), %xmm1
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT:    retq
   %1 = getelementptr inbounds float* %fb, i64 %index
   %2 = load float* %1, align 4
   %3 = insertelement <4 x float> undef, float %2, i32 0
@@ -645,12 +894,20 @@
 }
 
 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
-; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
-; On X32, account for the arguments' move to registers
-; X32: movl    4(%esp), %{{...}}
-; CHECK-NOT: mov
-; CHECK: insertps    $48
-; CHECK-NEXT: ret
+; X32-LABEL: insertps_from_broadcast_loadv4f32:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movups (%eax), %xmm1
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_broadcast_loadv4f32:
+; X64:       ## BB#0:
+; X64-NEXT:    movups (%rdi), %xmm1
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT:    retq
   %1 = load <4 x float>* %b, align 4
   %2 = extractelement <4 x float> %1, i32 0
   %3 = insertelement <4 x float> undef, float %2, i32 0
@@ -663,20 +920,33 @@
 
 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
-; CHECK-LABEL: insertps_from_broadcast_multiple_use:
-; On X32, account for the arguments' move to registers
-; X32: movl    8(%esp), %eax
-; X32: movl    4(%esp), %ecx
-; CHECK: movss
-; CHECK-NOT: mov
-; CHECK: insertps    $48
-; CHECK: insertps    $48
-; CHECK: insertps    $48
-; CHECK: insertps    $48
-; CHECK: addps
-; CHECK: addps
-; CHECK: addps
-; CHECK-NEXT: ret
+; X32-LABEL: insertps_from_broadcast_multiple_use:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movss (%ecx,%eax,4), %xmm4
+; X32-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
+; X32-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; X32-NEXT:    addps %xmm1, %xmm0
+; X32-NEXT:    addps %xmm2, %xmm3
+; X32-NEXT:    addps %xmm3, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_from_broadcast_multiple_use:
+; X64:       ## BB#0:
+; X64-NEXT:    movss (%rdi,%rsi,4), %xmm4
+; X64-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT:    insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
+; X64-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT:    addps %xmm1, %xmm0
+; X64-NEXT:    addps %xmm2, %xmm3
+; X64-NEXT:    addps %xmm3, %xmm0
+; X64-NEXT:    retq
   %1 = getelementptr inbounds float* %fb, i64 %index
   %2 = load float* %1, align 4
   %3 = insertelement <4 x float> undef, float %2, i32 0
@@ -694,10 +964,20 @@
 }
 
 define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
-; CHECK-LABEL: insertps_with_undefs:
-; CHECK-NOT: shufps
-; CHECK: insertps    $32, %xmm0
-; CHECK: ret
+; X32-LABEL: insertps_with_undefs:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movss (%eax), %xmm1
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3]
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_with_undefs:
+; X64:       ## BB#0:
+; X64-NEXT:    movss (%rdi), %xmm1
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3]
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
   %1 = load float* %b, align 4
   %2 = insertelement <4 x float> undef, float %1, i32 0
   %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7>
@@ -707,10 +987,162 @@
 ; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using
 ; the destination index to change the load, instead of the source index.
 define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
-; CHECK-LABEL: pr20087:
-; CHECK: insertps  $48
-; CHECK: ret
+; X32-LABEL: pr20087:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    insertps $-78, 8(%eax), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: pr20087:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps $-78, 8(%rdi), %xmm0
+; X64-NEXT:    retq
   %load = load <4 x float> *%ptr
   %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2>
   ret <4 x float> %ret
 }
+
+; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
+define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
+; X32-LABEL: insertps_pr20411:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; X32-NEXT:    insertps $-36, LCPI49_1+12, %xmm0
+; X32-NEXT:    movups %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_pr20411:
+; X64:       ## BB#0:
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; X64-NEXT:    insertps $-36, LCPI49_1+{{.*}}(%rip), %xmm0
+; X64-NEXT:    movups %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>  ; 4 5 6 7
+  %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; 3 x x x
+  %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 4, i32 3, i32 undef, i32 undef> ; 3 7 x x
+  %ptrcast = bitcast i32* %RET to <4 x i32>*
+  store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
+  ret void
+}
+
+define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_4:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_4:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+  %vecext2 = extractelement <4 x float> %B, i32 2
+  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_5:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_5:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %B, i32 1
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_6:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_6:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 1
+  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
+  %vecext1 = extractelement <4 x float> %B, i32 2
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit3
+}
+
+define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_7:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_7:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
+  %vecext2 = extractelement <4 x float> %B, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_8:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_8:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+  %vecext1 = extractelement <4 x float> %B, i32 0
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
+  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit4
+}
+
+define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
+; X32-LABEL: insertps_9:
+; X32:       ## BB#0:
+; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: insertps_9:
+; X64:       ## BB#0:
+; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    retq
+entry:
+  %vecext = extractelement <4 x float> %A, i32 0
+  %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
+  %vecext1 = extractelement <4 x float> %B, i32 2
+  %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
+  %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
+  ret <4 x float> %vecinit3
+}

diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index cf88ade..cf0f999 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll

@@ -10,88 +10,88 @@
 ; Function Attrs: nounwind sspreq
 define i32 @_Z18read_response_sizev() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23), !dbg !39
+  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !39
   %0 = load i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0), align 8, !dbg !40
-  tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64), !dbg !71
+  tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64, metadata !{metadata !"0x102"}), !dbg !71
   %1 = trunc i64 %0 to i32
   ret i32 %1
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 attributes #0 = { sspreq }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21, !72}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \001\00\000\00\001", metadata !1, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !"/Users/matt/ryan_bug"}
 !2 = metadata !{metadata !3}
-!3 = metadata !{i32 786436, metadata !1, metadata !4, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 19, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
+!3 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !1, metadata !4, null, metadata !6, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00C\0019\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{}
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
+!7 = metadata !{metadata !"0x28\00max_frame_size\000"} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
 !8 = metadata !{metadata !9, metadata !24, metadata !41, metadata !65}
-!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"read_response_size", metadata !"read_response_size", metadata !"_Z18read_response_sizev", i32 27, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z18read_response_sizev, null, null, metadata !14, i32 27} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
-!10 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x2e\00read_response_size\00read_response_size\00_Z18read_response_sizev\0027\000\001\000\006\00256\001\0027", metadata !1, metadata !10, metadata !11, null, i32 ()* @_Z18read_response_sizev, null, null, metadata !14} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
+!10 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !14 = metadata !{metadata !15, metadata !19}
-!15 = metadata !{i32 786688, metadata !9, metadata !"b", metadata !10, i32 28, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 28]
-!16 = metadata !{i32 786451, metadata !1, null, metadata !"B", i32 16, i64 32, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
+!15 = metadata !{metadata !"0x100\00b\0028\000", metadata !9, metadata !10, metadata !16} ; [ DW_TAG_auto_variable ] [b] [line 28]
+!16 = metadata !{metadata !"0x13\00B\0016\0032\0032\000\000\000", metadata !1, null, null, metadata !17, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"end_of_file", i32 17, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
-!19 = metadata !{i32 786688, metadata !9, metadata !"c", metadata !10, i32 29, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [c] [line 29]
+!18 = metadata !{metadata !"0xd\00end_of_file\0017\0032\0032\000\000", metadata !1, metadata !16, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
+!19 = metadata !{metadata !"0x100\00c\0029\000", metadata !9, metadata !10, metadata !13} ; [ DW_TAG_auto_variable ] [c] [line 29]
 !20 = metadata !{}
 !21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
 !22 = metadata !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)}
-!23 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
-!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long>", metadata !"min<unsigned long long>", metadata !"_ZN3__13minIyEERKT_S3_RS1_", i32 12, metadata !27, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !33, null, metadata !35, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
-!25 = metadata !{i32 786489, metadata !26, null, metadata !"__1", i32 1} ; [ DW_TAG_namespace ] [__1] [line 1]
+!23 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!24 = metadata !{metadata !"0x2e\00min<unsigned long long>\00min<unsigned long long>\00_ZN3__13minIyEERKT_S3_RS1_\0012\000\001\000\006\00256\001\0012", metadata !1, metadata !25, metadata !27, null, null, metadata !33, null, metadata !35} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
+!25 = metadata !{metadata !"0x39\00__1\001", metadata !26, null} ; [ DW_TAG_namespace ] [__1] [line 1]
 !26 = metadata !{metadata !"main.cpp", metadata !"/Users/matt/ryan_bug"}
-!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !28 = metadata !{metadata !29, metadata !29, metadata !32}
-!29 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!30 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
-!31 = metadata !{i32 786468, null, null, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!32 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!29 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!31 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!32 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
 !33 = metadata !{metadata !34}
-!34 = metadata !{i32 786479, null, metadata !"_Tp", metadata !31, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!34 = metadata !{metadata !"0x2f\00_Tp\000\000", null, metadata !31, null} ; [ DW_TAG_template_type_parameter ]
 !35 = metadata !{metadata !36, metadata !37}
-!36 = metadata !{i32 786689, metadata !24, metadata !"p1", metadata !10, i32 16777228, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 12]
-!37 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!36 = metadata !{metadata !"0x101\00p1\0016777228\000", metadata !24, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 12]
+!37 = metadata !{metadata !"0x101\00p2\0033554444\000", metadata !24, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 12]
 !38 = metadata !{i32 33, i32 0, metadata !9, null}
 !39 = metadata !{i32 12, i32 0, metadata !24, metadata !38}
 !40 = metadata !{i32 9, i32 0, metadata !41, metadata !59}
-!41 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long, __1::A>", metadata !"min<unsigned long long, __1::A>", metadata !"_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", i32 7, metadata !42, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !53, null, metadata !55, i32 8} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
-!42 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!41 = metadata !{metadata !"0x2e\00min<unsigned long long, __1::A>\00min<unsigned long long, __1::A>\00_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_\007\000\001\000\006\00256\001\008", metadata !1, metadata !25, metadata !42, null, null, metadata !53, null, metadata !55} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
+!42 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !43 = metadata !{metadata !29, metadata !29, metadata !32, metadata !44}
-!44 = metadata !{i32 786451, metadata !1, metadata !25, metadata !"A", i32 0, i64 8, i64 8, i32 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
+!44 = metadata !{metadata !"0x13\00A\000\008\008\000\000\000", metadata !1, metadata !25, null, metadata !45, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
 !45 = metadata !{metadata !46}
-!46 = metadata !{i32 786478, metadata !1, metadata !44, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !52, i32 1} ; [ DW_TAG_subprogram ] [line 1] [operator()]
-!47 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !48, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!46 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\000\000\006\00256\001\001", metadata !1, metadata !44, metadata !47, null, null, null, i32 0, metadata !52} ; [ DW_TAG_subprogram ] [line 1] [operator()]
+!47 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !48, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !48 = metadata !{metadata !13, metadata !49, metadata !50, metadata !50}
-!49 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!50 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!51 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!49 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!50 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!51 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
 !52 = metadata !{i32 786468}
 !53 = metadata !{metadata !34, metadata !54}
-!54 = metadata !{i32 786479, null, metadata !"_Compare", metadata !44, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!54 = metadata !{metadata !"0x2f\00_Compare\000\000", null, metadata !44, null} ; [ DW_TAG_template_type_parameter ]
 !55 = metadata !{metadata !56, metadata !57, metadata !58}
-!56 = metadata !{i32 786689, metadata !41, metadata !"p1", metadata !10, i32 16777223, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 7]
-!57 = metadata !{i32 786689, metadata !41, metadata !"p2", metadata !10, i32 33554439, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 7]
-!58 = metadata !{i32 786689, metadata !41, metadata !"p3", metadata !10, i32 50331656, metadata !44, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p3] [line 8]
+!56 = metadata !{metadata !"0x101\00p1\0016777223\000", metadata !41, metadata !10, metadata !29} ; [ DW_TAG_arg_variable ] [p1] [line 7]
+!57 = metadata !{metadata !"0x101\00p2\0033554439\000", metadata !41, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ] [p2] [line 7]
+!58 = metadata !{metadata !"0x101\00p3\0050331656\000", metadata !41, metadata !10, metadata !44} ; [ DW_TAG_arg_variable ] [p3] [line 8]
 !59 = metadata !{i32 13, i32 0, metadata !24, metadata !38}
 !63 = metadata !{i32 undef}
-!64 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
-!65 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !46, metadata !66, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
+!64 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!65 = metadata !{metadata !"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\001\000\006\00256\001\002", metadata !1, metadata !25, metadata !47, null, null, null, metadata !46, metadata !66} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
 !66 = metadata !{metadata !67, metadata !69, metadata !70}
-!67 = metadata !{i32 786689, metadata !65, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!68 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!69 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 1]
-!70 = metadata !{i32 786689, metadata !65, metadata !"", metadata !10, i32 50331650, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 2]
+!67 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !65, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!69 = metadata !{metadata !"0x101\00p1\0033554433\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!70 = metadata !{metadata !"0x101\00\0050331650\000", metadata !65, metadata !10, metadata !50} ; [ DW_TAG_arg_variable ] [line 2]
 !71 = metadata !{i32 1, i32 0, metadata !65, metadata !40}
-!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/stack_guard_remat.ll b/test/CodeGen/X86/stack_guard_remat.ll
new file mode 100644
index 0000000..dd639a7
--- /dev/null
+++ b/test/CodeGen/X86/stack_guard_remat.ll

@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s -check-prefix=CHECK
+
+;CHECK:  foo2
+;CHECK:  movq ___stack_chk_guard@GOTPCREL(%rip), [[R0:%[a-z0-9]+]]
+;CHECK:  movq ([[R0]]), {{%[a-z0-9]+}}
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test_stack_guard_remat() #0 {
+entry:
+  %a1 = alloca [256 x i32], align 16
+  %0 = bitcast [256 x i32]* %a1 to i8*
+  call void @llvm.lifetime.start(i64 1024, i8* %0)
+  %arraydecay = getelementptr inbounds [256 x i32]* %a1, i64 0, i64 0
+  call void @foo3(i32* %arraydecay)
+  call void asm sideeffect "foo2", "~{r12},~{r13},~{r14},~{r15},~{ebx},~{esi},~{edi},~{dirflag},~{fpsr},~{flags}"()
+  call void @llvm.lifetime.end(i64 1024, i8* %0)
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @foo3(i32*)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/CodeGen/X86/stackmap-fast-isel.ll b/test/CodeGen/X86/stackmap-fast-isel.ll
index 0b7e6db..dfb16ad 100644
--- a/test/CodeGen/X86/stackmap-fast-isel.ll
+++ b/test/CodeGen/X86/stackmap-fast-isel.ll

@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim                             | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7                             | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort | FileCheck %s
 
 ; CHECK-LABEL:  .section  __LLVM_STACKMAPS,__llvm_stackmaps
 ; CHECK-NEXT:  __LLVM_StackMaps:

diff --git a/test/CodeGen/X86/stackmap-large-constants.ll b/test/CodeGen/X86/stackmap-large-constants.ll
new file mode 100644
index 0000000..73ee4f3
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-large-constants.ll

@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+; CHECK-LABEL:	.section	__LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; version
+; CHECK-NEXT: 	.byte	1
+; reserved
+; CHECK-NEXT: 	.byte	0
+; reserved
+; CHECK-NEXT: 	.short	0
+; # functions
+; CHECK-NEXT: 	.long	2
+; # constants
+; CHECK-NEXT: 	.long	2
+; # records
+; CHECK-NEXT: 	.long	2
+; function address & stack size
+; CHECK-NEXT: 	.quad	_foo
+; CHECK-NEXT: 	.quad	8
+; function address & stack size
+; CHECK-NEXT: 	.quad	_bar
+; CHECK-NEXT: 	.quad	8
+
+; Constants Array:
+; CHECK-NEXT: 	.quad	9223372036854775807
+; CHECK-NEXT: 	.quad	-9223372036854775808
+
+; Patchpoint ID
+; CHECK-NEXT: 	.quad	0
+; Instruction offset
+; CHECK-NEXT: 	.long	L{{.*}}-_foo
+; reserved
+; CHECK-NEXT: 	.short	0
+; # locations
+; CHECK-NEXT: 	.short	1
+; ConstantIndex
+; CHECK-NEXT: 	.byte	5
+; reserved
+; CHECK-NEXT: 	.byte	8
+; Dwarf RegNum
+; CHECK-NEXT: 	.short	0
+; Offset
+; CHECK-NEXT: 	.long	0
+; padding
+; CHECK-NEXT: 	.short	0
+; NumLiveOuts
+; CHECK-NEXT: 	.short	0
+
+; CHECK-NEXT: 	.align	3
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+
+define void @foo() {
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 9223372036854775807)
+  ret void
+}
+
+; Patchpoint ID
+; CHECK-NEXT: 	.quad	0
+; Instruction Offset
+; CHECK-NEXT: 	.long	L{{.*}}-_bar
+; reserved
+; CHECK-NEXT: 	.short	0
+; # locations
+; CHECK-NEXT: 	.short	1
+; ConstantIndex
+; CHECK-NEXT: 	.byte	5
+; reserved
+; CHECK-NEXT: 	.byte	8
+; Dwarf RegNum
+; CHECK-NEXT: 	.short	0
+; Offset
+; CHECK-NEXT: 	.long	1
+; padding
+; CHECK-NEXT: 	.short	0
+; NumLiveOuts
+; CHECK-NEXT: 	.short	0
+
+
+define void @bar() {
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 -9223372036854775808)
+  ret void
+}

diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index 897595d..31553c0 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll

@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-patchpoint-liveness=false | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim                                   | FileCheck -check-prefix=PATCH %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -enable-patchpoint-liveness=false | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx                                   | FileCheck -check-prefix=PATCH %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
 

diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll
index 5a78f24..7932c0d 100644
--- a/test/CodeGen/X86/stackmap-nops.ll
+++ b/test/CodeGen/X86/stackmap-nops.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
 
 define void @nop_test() {
 entry:
@@ -224,6 +224,10 @@
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 28, i32 28)
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 29, i32 29)
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 30, i32 30)
+; Add an extra stackmap with a zero-length shadow to thwart the shadow
+; optimization. This will force all 15 bytes of the previous shadow to be
+; padded with nops.
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 31, i32 0)
   ret void
 }
 

diff --git a/test/CodeGen/X86/stackmap-shadow-optimization.ll b/test/CodeGen/X86/stackmap-shadow-optimization.ll
new file mode 100644
index 0000000..a3725f2
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-shadow-optimization.ll

@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+
+; Check that the X86 stackmap shadow optimization is only outputting a 3-byte
+; nop here. 8 bytes are requested, but 5 are covered by the code for the call to
+; bar.  However, the frame teardown and the return do not count towards the
+; stackmap shadow as the call return counts as a branch target so must flush
+; the shadow.
+; Note that in order for a thread to not return into the patched space
+; the call must be at the end of the shadow, so the required nop must be
+; before the call, not after.
+define void @shadow_optimization_test() {
+entry:
+; CHECK-LABEL:  shadow_optimization_test:
+; CHECK:        callq   _bar
+; CHECK:        nop
+; CHECK:        callq   _bar
+; CHECK-NOT:    nop
+; CHECK:        callq   _bar
+; CHECK-NOT:    nop
+  call void @bar()
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 8)
+  call void @bar()
+  call void @bar()
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @bar()

diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll
index 8567037..5e356f3 100644
--- a/test/CodeGen/X86/stackmap.ll
+++ b/test/CodeGen/X86/stackmap.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
 
@@ -9,11 +9,11 @@
 ; CHECK-NEXT:   .byte 0
 ; CHECK-NEXT:   .short 0
 ; Num Functions
-; CHECK-NEXT:   .long 15
+; CHECK-NEXT:   .long 16
 ; Num LargeConstants
 ; CHECK-NEXT:   .long 3
 ; Num Callsites
-; CHECK-NEXT:   .long 19
+; CHECK-NEXT:   .long 20
 
 ; Functions and stack size
 ; CHECK-NEXT:   .quad _constantargs
@@ -46,6 +46,8 @@
 ; CHECK-NEXT:   .quad 8
 ; CHECK-NEXT:   .quad _clobberScratch
 ; CHECK-NEXT:   .quad 56
+; CHECK-NEXT:   .quad _needsStackRealignment
+; CHECK-NEXT:   .quad -1
 
 ; Large Constants
 ; CHECK-NEXT:   .quad   2147483648
@@ -464,6 +466,23 @@
   ret void
 }
 
+; A stack frame which needs to be realigned at runtime (to meet alignment 
+; criteria for values on the stack) does not have a fixed frame size. 
+; CHECK-LABEL:  .long L{{.*}}-_needsStackRealignment
+; CHECK-NEXT:   .short 0
+; 0 locations
+; CHECK-NEXT:   .short 0
+define void @needsStackRealignment() {
+  %val = alloca i64, i32 3, align 128
+  tail call void (...)* @escape_values(i64* %val)
+; Note: Adding any non-constant to the stackmap would fail because we
+; expect to be able to address off the frame pointer.  In a realigned
+; frame, we must use the stack pointer instead.  This is a separate bug.
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0)
+  ret void
+}
+declare void @escape_values(...)
+
 declare void @llvm.experimental.stackmap(i64, i32, ...)
 declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
 declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)

diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index 7557f25..e3cc2fa 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll

@@ -34,8 +34,8 @@
 ; X64: movb	%sil, 1(%rdi)
 
 ; X32-LABEL: test2:
-; X32: movb	8(%esp), %[[REG:[abcd]l]]
-; X32: movb	%[[REG]], 1(%{{.*}})
+; X32: movb	8(%esp), %[[REG:[abcd]]]l
+; X32: movb	%[[REG]]l, 1(%{{.*}})
 }
 
 define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -67,8 +67,8 @@
 ; X64: movw	%si, 2(%rdi)
 
 ; X32-LABEL: test4:
-; X32: movl	8(%esp), %e[[REG:[abcd]x]]
-; X32: movw	%[[REG]], 2(%{{.*}})
+; X32: movw	8(%esp), %[[REG:[abcd]]]x
+; X32: movw	%[[REG]]x, 2(%{{.*}})
 }
 
 define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -84,8 +84,8 @@
 ; X64: movw	%si, 2(%rdi)
 
 ; X32-LABEL: test5:
-; X32: movzwl	8(%esp), %e[[REG:[abcd]x]]
-; X32: movw	%[[REG]], 2(%{{.*}})
+; X32: movw	8(%esp), %[[REG:[abcd]]]x
+; X32: movw	%[[REG]]x, 2(%{{.*}})
 }
 
 define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {

diff --git a/test/CodeGen/X86/swizzle-2.ll b/test/CodeGen/X86/swizzle-2.ll
index 4b1f903..697af84 100644
--- a/test/CodeGen/X86/swizzle-2.ll
+++ b/test/CodeGen/X86/swizzle-2.ll

@@ -8,508 +8,433 @@
 ; illegal shuffle that is expanded into a sub-optimal sequence of instructions
 ; during lowering stage.
 
-
 define <4 x i32> @swizzle_1(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_1
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_2(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,3,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_2
-; Mask: [2,1,3,0]
-; CHECK: pshufd $54
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_3(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_3
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_4(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_4
-; Mask: [3,1,0,2]
-; CHECK: pshufd $-121
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_5(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_5
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_6(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_6
-; Mask: [2,0,1,3]
-; CHECK: pshufd $-46
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_7(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_7
-; Mask: [0,2,3,1]
-; CHECK: pshufd $120
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_8(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_8
-; Mask: [1,3,2,0]
-; CHECK: pshufd $45
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_9(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_9
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_10(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_10
-; Mask: [1,2,0,3]
-; CHECK: pshufd $-55
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_11(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_11
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_12(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_12
-; Mask: [0,3,1,2]
-; CHECK: pshufd $-100
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_13(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_13
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x i32> @swizzle_14(<4 x i32> %v) {
+; CHECK-LABEL: swizzle_14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,2,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
   ret <4 x i32> %2
 }
-; CHECK-LABEL: swizzle_14
-; Mask: [3,0,2,1]
-; CHECK: pshufd $99
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_15(<4 x float> %v) {
+; CHECK-LABEL: swizzle_15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_15
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_16(<4 x float> %v) {
+; CHECK-LABEL: swizzle_16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_16
-; Mask: [2,1,3,0]
-; CHECK: pshufd $54
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_17(<4 x float> %v) {
+; CHECK-LABEL: swizzle_17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_17
-; Mask: [1,0,3,2]
-; CHECK: pshufd $-79
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_18(<4 x float> %v) {
+; CHECK-LABEL: swizzle_18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,0,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_18
-; Mask: [3,1,0,2]
-; CHECK: pshufd $-121
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_19(<4 x float> %v) {
+; CHECK-LABEL: swizzle_19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_19
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_20(<4 x float> %v) {
+; CHECK-LABEL: swizzle_20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_20
-; Mask: [2,0,1,3]
-; CHECK: pshufd $-46
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_21(<4 x float> %v) {
+; CHECK-LABEL: swizzle_21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_21
-; Mask: [0,2,3,1]
-; CHECK: pshufd $120
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_22(<4 x float> %v) {
+; CHECK-LABEL: swizzle_22:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_22
-; Mask: [1,3,2,0]
-; CHECK: pshufd $45
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_23(<4 x float> %v) {
+; CHECK-LABEL: swizzle_23:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_23
-; Mask: [2,3,0,1]
-; CHECK: pshufd $78
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_24(<4 x float> %v) {
+; CHECK-LABEL: swizzle_24:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2,0,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_24
-; Mask: [1,2,0,3]
-; CHECK: pshufd $-55
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_25(<4 x float> %v) {
+; CHECK-LABEL: swizzle_25:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_25
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_26(<4 x float> %v) {
+; CHECK-LABEL: swizzle_26:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_26
-; Mask: [0,3,1,2]
-; CHECK: pshufd $-100
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_27(<4 x float> %v) {
+; CHECK-LABEL: swizzle_27:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_27
-; Mask: [3,2,1,0]
-; CHECK: pshufd $27
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_28(<4 x float> %v) {
+; CHECK-LABEL: swizzle_28:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,2,1]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_28
-; Mask: [3,0,2,1]
-; CHECK: pshufd $99
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
-
 
 define <4 x float> @swizzle_29(<4 x float> %v) {
+; CHECK-LABEL: swizzle_29:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,0]
+; CHECK-NEXT:    retq
   %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
   %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
   ret <4 x float> %2
 }
-; CHECK-LABEL: swizzle_29
-; Mask: [1,3,2,0]
-; CHECK: pshufd $45
-; CHECK-NOT: pshufd
-; CHECK-NEXT: ret
 
 ; Make sure that we combine the shuffles from each function below into a single
 ; legal shuffle (either pshuflw or pshufb depending on the masks).
 
 define <8 x i16> @swizzle_30(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_30:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_30
-; Mask: [1,3,2,0,5,7,6,4]
-; CHECK: pshuflw $45
-; CHECK-NOT: pshufb
-; CHECK-NEXT: ret
-
 
 define <8 x i16> @swizzle_31(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_31:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_31
-; Mask: [1,3,2,0,4,5,6,7]
-; CHECK: pshuflw $45
-; CHECK-NOT: pshufb
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_32(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_32
-; Mask: [2,3,0,1,4,5,6,7] --> equivalent to pshufd mask [1,0,2,3]
-; CHECK: pshufd $-31
-; CHECK-NOT: pshufb
-; CHECK: ret
 
 define <8 x i16> @swizzle_33(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_33:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_33
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_34(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_34:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_34
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_35(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_35:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_35
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_36(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_36:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_36
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_37(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_37:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 5, i32 6, i32 4>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 4, i32 6, i32 5>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_37
-; Mask: [0,1,2,3,4,7,6,5]
-; CHECK: pshufhw $108
-; CHECK-NOT: pshufb
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_38(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_38:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_38
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_39(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_39:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,3,1,0,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_39
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_40(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_40:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_40
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_41(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_41:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_41
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK-NOT: shufpd
-; CHECK: ret
-
 
 define <8 x i16> @swizzle_42(<8 x i16> %v) {
+; CHECK-LABEL: swizzle_42:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; CHECK-NEXT:    retq
   %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5>
   ret <8 x i16> %2
 }
-; CHECK-LABEL: swizzle_42
-; Mask: [0,1,2,3,5,4,7,6]
-; CHECK: pshufhw $-79
-; CHECK-NOT: pshufb
-; CHECK: ret
-
-

diff --git a/test/CodeGen/X86/swizzle.ll b/test/CodeGen/X86/swizzle.ll
deleted file mode 100644
index 23e0c24..0000000
--- a/test/CodeGen/X86/swizzle.ll
+++ /dev/null

@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movlps
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movsd
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep movups
-; rdar://6523650
-
-	%struct.vector4_t = type { <4 x float> }
-
-define void @swizzle(i8* nocapture %a, %struct.vector4_t* nocapture %b, %struct.vector4_t* nocapture %c) nounwind {
-entry:
-	%0 = getelementptr %struct.vector4_t* %b, i32 0, i32 0		; <<4 x float>*> [#uses=2]
-	%1 = load <4 x float>* %0, align 4		; <<4 x float>> [#uses=1]
-	%tmp.i = bitcast i8* %a to double*		; <double*> [#uses=1]
-	%tmp1.i = load double* %tmp.i		; <double> [#uses=1]
-	%2 = insertelement <2 x double> undef, double %tmp1.i, i32 0		; <<2 x double>> [#uses=1]
-	%tmp2.i = bitcast <2 x double> %2 to <4 x float>		; <<4 x float>> [#uses=1]
-	%3 = shufflevector <4 x float> %1, <4 x float> %tmp2.i, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x float>> [#uses=1]
-	store <4 x float> %3, <4 x float>* %0, align 4
-	ret void
-}

diff --git a/test/CodeGen/X86/tailcall-multiret.ll b/test/CodeGen/X86/tailcall-multiret.ll
new file mode 100644
index 0000000..a77a59c
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-multiret.ll

@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core2 | FileCheck %s
+; See PR19530
+declare double    @llvm.powi.f64(double %Val, i32 %power)
+define <3 x double> @julia_foo17589(i32 %arg) {
+  %tmp1 = call double @llvm.powi.f64(double 1.000000e+00, i32 %arg)
+; CHECK: callq   __powidf2
+  %tmp2 = insertelement <3 x double> undef, double %tmp1, i32 0
+  %tmp3 = call double @llvm.powi.f64(double 2.000000e+00, i32 %arg)
+; CHECK: callq   __powidf2
+  %tmp4 = insertelement <3 x double> %tmp2, double %tmp3, i32 1
+  %tmp5 = call double @llvm.powi.f64(double 3.000000e+00, i32 %arg)
+; CHECK: callq   __powidf2
+  %tmp6 = insertelement <3 x double> %tmp4, double %tmp5, i32 2
+; CHECK-NOT: TAILCALL
+  ret <3 x double> %tmp6
+}

diff --git a/test/CodeGen/X86/tls-addr-non-leaf-function.ll b/test/CodeGen/X86/tls-addr-non-leaf-function.ll
new file mode 100644
index 0000000..ec47232
--- /dev/null
+++ b/test/CodeGen/X86/tls-addr-non-leaf-function.ll

@@ -0,0 +1,37 @@
+; RUN: llc < %s -relocation-model=pic -O2 -disable-fp-elim -o - | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -O2 -o - | FileCheck %s
+
+; This test runs twice with different options regarding the frame pointer:
+; first the elimination is disabled, then it is enabled. The disabled case is
+; the "control group".
+; The function 'foo' below is marked with the "no-frame-pointer-elim-non-leaf"
+; attribute which dictates that the frame pointer should not be eliminated
+; unless the function is a leaf (i.e. it doesn't call any other function).
+; Now, 'foo' is not a leaf function, because it performs a TLS access which on
+; X86 ELF in PIC mode is expanded as a library call.
+; This call is represented with a pseudo-instruction which doesn't appear to be
+; a call when inspected by the analysis passes (it doesn't have the "isCall"
+; flag), and the ISel lowering code creating the pseudo was not informing the 
+; MachineFrameInfo that the function contained calls. This affected the decision
+; whether to eliminate the frame pointer.
+; With the fix, the "hasCalls" flag is set in the MFI for the function whenever
+; a TLS access pseudo-instruction is created, so 'foo' appears to be a non-leaf
+; function, and the difference in the options does not affect codegen: both
+; versions will have a frame pointer.
+
+; Test that there's some frame pointer usage in 'foo'...
+; CHECK: foo:
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; ... and the TLS library call is also present.
+; CHECK: leaq x@TLSGD(%rip), %rdi
+; CHECK: callq __tls_get_addr@PLT
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = thread_local global i32 0
+define i32 @foo() "no-frame-pointer-elim-non-leaf" {
+  %a = load i32* @x, align 4
+  ret i32 %a
+}

diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index d230f1f..8de6297 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll

@@ -20,7 +20,7 @@
 ; Read 32-bits
 ;CHECK: pmovzxwq
 ;CHECK: paddq
-;CHECK: pshufb
+;CHECK: pshufd
 ;CHECK: movd
 ;CHECK: ret
 define void @load_2_i16(<2 x i16>* %A)  {
@@ -32,7 +32,7 @@
 
 ;CHECK-LABEL: load_2_i32:
 ;CHECK: pmovzxdq
-;CHECK: paddq
+;CHECK: paddd
 ;CHECK: pshufd
 ;CHECK: ret
 define void @load_2_i32(<2 x i32>* %A)  {
@@ -56,7 +56,7 @@
 
 ;CHECK-LABEL: load_4_i16:
 ;CHECK: pmovzxwd
-;CHECK: paddd
+;CHECK: paddw
 ;CHECK: pshufb
 ;CHECK: ret
 define void @load_4_i16(<4 x i16>* %A)  {
@@ -68,7 +68,7 @@
 
 ;CHECK-LABEL: load_8_i8:
 ;CHECK: pmovzxbw
-;CHECK: paddw
+;CHECK: paddb
 ;CHECK: pshufb
 ;CHECK: ret
 define void @load_8_i8(<8 x i8>* %A)  {

diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll
index c5a61c3..e47f154 100644
--- a/test/CodeGen/X86/uint_to_fp-2.ll
+++ b/test/CodeGen/X86/uint_to_fp-2.ll

@@ -1,15 +1,20 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mattr=+sse2 | FileCheck %s
 
 ; rdar://6504833
 define float @test1(i32 %x) nounwind readnone {
-; CHECK: test1
-; CHECK: movd
-; CHECK: orps
-; CHECK: subsd
-; CHECK: cvtsd2ss
-; CHECK: movss
-; CHECK: flds
-; CHECK: ret
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movsd .LCPI0_0, %xmm0
+; CHECK-NEXT:    movd {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT:    orps %xmm0, %xmm1
+; CHECK-NEXT:    subsd %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
 entry:
 	%0 = uitofp i32 %x to float
 	ret float %0
@@ -17,15 +22,20 @@
 
 ; PR10802
 define float @test2(<4 x i32> %x) nounwind readnone ssp {
-; CHECK: test2
-; CHECK: xorps [[ZERO:%xmm[0-9]+]]
-; CHECK: movss {{.*}}, [[ZERO]]
-; CHECK: orps
-; CHECK: subsd
-; CHECK: cvtsd2ss
-; CHECK: movss
-; CHECK: flds
-; CHECK: ret
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movsd .LCPI1_0, %xmm0
+; CHECK-NEXT:    orps %xmm0, %xmm1
+; CHECK-NEXT:    subsd %xmm0, %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
 entry:
   %vecext = extractelement <4 x i32> %x, i32 0
   %conv = uitofp i32 %vecext to float

diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index d7ae469..ca9ea4a 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll

@@ -21,16 +21,16 @@
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 1, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 1, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, i32, i32, i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !10, i32 12, metadata !"producer", i1 false, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00x\001\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\001", metadata !10, metadata !2, metadata !4, null, i32 (i32, i32, i32, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00producer\000\00\000\00\000", metadata !10, metadata !11, metadata !11, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786468, metadata !10, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786443, metadata !2, metadata !1, i32 1, i32 30, i32 0} ; [ DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0xb\001\0030\000", metadata !2, metadata !1} ; [ DW_TAG_lexical_block ]
 !8 = metadata !{i32 4, i32 3, metadata !7, null}
 !9 = metadata !{metadata !1}
 !10 = metadata !{metadata !"test.c", metadata !"/dir"}
 !11 = metadata !{i32 0}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/CodeGen/X86/v-binop-widen.ll b/test/CodeGen/X86/v-binop-widen.ll
deleted file mode 100644
index fca4da6..0000000
--- a/test/CodeGen/X86/v-binop-widen.ll
+++ /dev/null

@@ -1,11 +0,0 @@
-; RUN: llc -mcpu=generic -march=x86 -mattr=+sse < %s | FileCheck %s
-; CHECK: divps
-; CHECK: divps
-; CHECK: divss
-
-%vec = type <9 x float>
-define %vec @vecdiv( %vec %p1, %vec %p2)
-{
-  %result = fdiv %vec %p1, %p2
-  ret %vec %result
-}

diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll
deleted file mode 100644
index 3342111..0000000
--- a/test/CodeGen/X86/v-binop-widen2.ll
+++ /dev/null

@@ -1,47 +0,0 @@
-; RUN: llc -march=x86 -mcpu=generic -mattr=+sse < %s | FileCheck %s
-; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s
-
-%vec = type <6 x float>
-; CHECK: divps
-; CHECK: divss
-; CHECK: divss
-
-; Scheduler causes a different instruction order to be produced on Intel Atom
-; ATOM: divps
-; ATOM: divss
-; ATOM: divss
-
-define %vec @vecdiv( %vec %p1, %vec %p2)
-{
-  %result = fdiv %vec %p1, %p2
-  ret %vec %result
-}
-
-@a = constant %vec < float 2.0, float 4.0, float 8.0, float 16.0, float 32.0, float 64.0 >
-@b = constant %vec < float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0 >
-
-; Expected result: < 1.0, 2.0, 4.0, ..., 2.0^(n-1) >
-; main() returns 0 if the result is expected and 1 otherwise
-; to execute, use llvm-as < %s | lli
-define i32 @main() nounwind {
-entry:
-  %avec = load %vec* @a
-  %bvec = load %vec* @b
-
-  %res = call %vec @vecdiv(%vec %avec, %vec %bvec)
-  br label %loop
-loop:
-  %idx = phi i32 [0, %entry], [%nextInd, %looptail]
-  %expected = phi float [1.0, %entry], [%nextExpected, %looptail]
-  %elem = extractelement %vec %res, i32 %idx
-  %expcmp = fcmp oeq float %elem, %expected
-  br i1 %expcmp, label %looptail, label %return
-looptail:
-  %nextExpected = fmul float %expected, 2.0
-  %nextInd = add i32 %idx, 1
-  %cmp = icmp slt i32 %nextInd, 6
-  br i1 %cmp, label %loop, label %return
-return:
-  %retval = phi i32 [0, %looptail], [1, %loop]
-  ret i32 %retval
-}

diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll
index dab5e7b..b9bd80f9 100644
--- a/test/CodeGen/X86/v2f32.ll
+++ b/test/CodeGen/X86/v2f32.ll

@@ -1,115 +1,94 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -asm-verbose=0 -o - | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -o - | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -o - | FileCheck %s --check-prefix=X32
 
 ; PR7518
 define void @test1(<2 x float> %Q, float *%P2) nounwind {
+; X64-LABEL: test1:
+; X64:       # BB#0:
+; X64-NEXT:    movaps %xmm0, %xmm1
+; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X64-NEXT:    addss %xmm0, %xmm1
+; X64-NEXT:    movss %xmm1, (%rdi)
+; X64-NEXT:    retq
+;
+; X32-LABEL: test1:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movaps %xmm0, %xmm1
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT:    addss %xmm0, %xmm1
+; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    retl
   %a = extractelement <2 x float> %Q, i32 0
   %b = extractelement <2 x float> %Q, i32 1
   %c = fadd float %a, %b
-
   store float %c, float* %P2
   ret void
-; X64-LABEL: test1:
-; X64-NEXT: pshufd	$1, %xmm0, %xmm1
-; X64-NEXT: addss	%xmm0, %xmm1
-; X64-NEXT: movss	%xmm1, (%rdi)
-; X64-NEXT: ret
-
-; W64-LABEL: test1:
-; W64-NEXT: movdqa  (%rcx), %xmm0
-; W64-NEXT: pshufd  $1, %xmm0, %xmm1
-; W64-NEXT: addss   %xmm0, %xmm1
-; W64-NEXT: movss   %xmm1, (%rdx)
-; W64-NEXT: ret
-
-; X32-LABEL: test1:
-; X32-NEXT: movl	4(%esp), %eax
-; X32-NEXT: pshufd	$1, %xmm0, %xmm1
-; X32-NEXT: addss	%xmm0, %xmm1
-; X32-NEXT: movss	%xmm1, (%eax)
-; X32-NEXT: ret
 }
 
-
 define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, <2 x float> *%P) nounwind {
+; X64-LABEL: test2:
+; X64:       # BB#0:
+; X64-NEXT:    addps %xmm1, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test2:
+; X32:       # BB#0:
+; X32-NEXT:    addps %xmm1, %xmm0
+; X32-NEXT:    retl
   %Z = fadd <2 x float> %Q, %R
   ret <2 x float> %Z
-  
-; X64-LABEL: test2:
-; X64-NEXT: addps	%xmm1, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test2:
-; W64-NEXT: movaps  (%rcx), %xmm0
-; W64-NEXT: addps   (%rdx), %xmm0
-; W64-NEXT: ret
-
-; X32-LABEL: test2:
-; X32:      addps	%xmm1, %xmm0
 }
 
-
 define <2 x float> @test3(<4 x float> %A) nounwind {
+; X64-LABEL: test3:
+; X64:       # BB#0:
+; X64-NEXT:    addps %xmm0, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test3:
+; X32:       # BB#0:
+; X32-NEXT:    addps %xmm0, %xmm0
+; X32-NEXT:    retl
 	%B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
 	%C = fadd <2 x float> %B, %B
 	ret <2 x float> %C
-; X64-LABEL: test3:
-; X64-NEXT: addps	%xmm0, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test3:
-; W64-NEXT: movaps  (%rcx), %xmm0
-; W64-NEXT: addps   %xmm0, %xmm0
-; W64-NEXT: ret
-
-; X32-LABEL: test3:
-; X32-NEXT: addps	%xmm0, %xmm0
-; X32-NEXT: ret
 }
 
 define <2 x float> @test4(<2 x float> %A) nounwind {
+; X64-LABEL: test4:
+; X64:       # BB#0:
+; X64-NEXT:    addps %xmm0, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test4:
+; X32:       # BB#0:
+; X32-NEXT:    addps %xmm0, %xmm0
+; X32-NEXT:    retl
 	%C = fadd <2 x float> %A, %A
 	ret <2 x float> %C
-; X64-LABEL: test4:
-; X64-NEXT: addps	%xmm0, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test4:
-; W64-NEXT: movaps  (%rcx), %xmm0
-; W64-NEXT: addps   %xmm0, %xmm0
-; W64-NEXT: ret
-
-; X32-LABEL: test4:
-; X32-NEXT: addps	%xmm0, %xmm0
-; X32-NEXT: ret
 }
 
 define <4 x float> @test5(<4 x float> %A) nounwind {
+; X64-LABEL: test5:
+; X64:       # BB#0:
+; X64-NEXT:    addps %xmm0, %xmm0
+; X64-NEXT:    addps %xmm0, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test5:
+; X32:       # BB#0:
+; X32-NEXT:    addps %xmm0, %xmm0
+; X32-NEXT:    addps %xmm0, %xmm0
+; X32-NEXT:    retl
 	%B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
 	%C = fadd <2 x float> %B, %B
-        br label %BB
-        
+  br label %BB
+
 BB:
-        %D = fadd <2 x float> %C, %C
+  %D = fadd <2 x float> %C, %C
 	%E = shufflevector <2 x float> %D, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 	ret <4 x float> %E
-        
-; X64-LABEL: test5:
-; X64-NEXT: addps	%xmm0, %xmm0
-; X64-NEXT: addps	%xmm0, %xmm0
-; X64-NEXT: ret
-
-; W64-LABEL: test5:
-; W64-NEXT: movaps  (%rcx), %xmm0
-; W64-NEXT: addps   %xmm0, %xmm0
-; W64-NEXT: addps   %xmm0, %xmm0
-; W64-NEXT: ret
-
-; X32-LABEL: test5:
-; X32-NEXT: addps	%xmm0, %xmm0
-; X32-NEXT: addps	%xmm0, %xmm0
-; X32-NEXT: ret
 }
 
 

diff --git a/test/CodeGen/X86/vararg-callee-cleanup.ll b/test/CodeGen/X86/vararg-callee-cleanup.ll
new file mode 100644
index 0000000..2dcf319
--- /dev/null
+++ b/test/CodeGen/X86/vararg-callee-cleanup.ll

@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=i686-pc-windows < %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+
+declare x86_thiscallcc void @thiscall_thunk(i8* %this, ...)
+define i32 @call_varargs_thiscall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
+  call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
+  call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2)
+  %t1 = add i32 %b, %c
+  %r = add i32 %t1, %d
+  ret i32 %r
+}
+
+; CHECK: _call_varargs_thiscall_thunk:
+; CHECK: calll _thiscall_thunk
+; CHECK-NEXT: subl $8, %esp
+
+; We don't mangle the argument size into variadic callee cleanup functions.
+
+declare x86_stdcallcc void @stdcall_thunk(i8* %this, ...)
+define i32 @call_varargs_stdcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
+  call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
+  call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2)
+  %t1 = add i32 %b, %c
+  %r = add i32 %t1, %d
+  ret i32 %r
+}
+
+; CHECK: _call_varargs_stdcall_thunk:
+; CHECK: calll _stdcall_thunk{{$}}
+; CHECK-NEXT: subl $12, %esp
+
+declare x86_fastcallcc void @fastcall_thunk(i8* %this, ...)
+define i32 @call_varargs_fastcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) {
+  call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+  call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2)
+  %t1 = add i32 %b, %c
+  %r = add i32 %t1, %d
+  ret i32 %r
+}
+
+; CHECK: _call_varargs_fastcall_thunk:
+; CHECK: calll @fastcall_thunk{{$}}
+; CHECK-NEXT: subl $4, %esp
+
+; If you actually return from such a thunk, it will only pop the non-variadic
+; portion of the arguments, which is different from what the callee passes.
+
+define x86_stdcallcc void @varargs_stdcall_return(i32, i32, ...) {
+  ret void
+}
+
+; CHECK: _varargs_stdcall_return:
+; CHECK: retl $8

diff --git a/test/CodeGen/X86/vararg_no_start.ll b/test/CodeGen/X86/vararg_no_start.ll
new file mode 100644
index 0000000..ab5c6fc
--- /dev/null
+++ b/test/CodeGen/X86/vararg_no_start.ll

@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s
+
+define void @foo(i8*, ...) {
+  ret void
+}
+; CHECK-LABEL: {{^_?}}foo:
+; CHECK-NOT: movq
+; CHECK: retq

diff --git a/test/CodeGen/X86/vastart-defs-eflags.ll b/test/CodeGen/X86/vastart-defs-eflags.ll
index 6017753..d0c5150 100644
--- a/test/CodeGen/X86/vastart-defs-eflags.ll
+++ b/test/CodeGen/X86/vastart-defs-eflags.ll

@@ -14,6 +14,7 @@
   br i1 %tobool, label %if.end, label %if.then
 
 if.then:                                          ; preds = %entry
+  call void @llvm.va_start(i8* null)
   br label %if.end
 
 if.end:                                           ; preds = %entry, %if.then
@@ -21,3 +22,4 @@
   ret i32 %hasflag
 }
 
+declare void @llvm.va_start(i8*) nounwind

diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 1a6c05d..8600c48 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll

@@ -1,75 +1,177 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
 
-;CHECK-LABEL: foo1_8:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo1_8:
-;CHECK-WIDE:      vpmovzxbd %xmm0, %xmm1
-;CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1
-;CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1
-;CHECK-WIDE-NEXT: vpshufb {{.*}}, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-;CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
-;CHECK-WIDE-NEXT: ret
 define <8 x float> @foo1_8(<8 x i8> %src) {
+; CHECK-LABEL: foo1_8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm0
+; CHECK-NEXT:    vpslld $24, %xmm0, %xmm0
+; CHECK-NEXT:    vpsrad $24, %xmm0, %xmm0
+; CHECK-NEXT:    vpslld $24, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrad $24, %xmm1, %xmm1
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo1_8:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpmovzxbd %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vpslld $24, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpsrad $24, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-WIDE-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-WIDE-NEXT:    vpslld $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpsrad $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-WIDE-NEXT:    retl
   %res = sitofp <8 x i8> %src to <8 x float>
   ret <8 x float> %res
 }
 
-;CHECK-LABEL: foo1_4:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo1_4:
-;CHECK-WIDE:      vpmovzxbd %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0
-;CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-;CHECK-WIDE-NEXT: ret
 define <4 x float> @foo1_4(<4 x i8> %src) {
+; CHECK-LABEL: foo1_4:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpslld $24, %xmm0, %xmm0
+; CHECK-NEXT:    vpsrad $24, %xmm0, %xmm0
+; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo1_4:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpmovzxbd %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpslld $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpsrad $24, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    retl
   %res = sitofp <4 x i8> %src to <4 x float>
   ret <4 x float> %res
 }
 
-;CHECK-LABEL: foo2_8:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo2_8:
-;CHECK-WIDE: vcvtdq2ps %ymm{{.*}}, %ymm{{.*}}
-;CHECK-WIDE: ret
 define <8 x float> @foo2_8(<8 x i8> %src) {
+; CHECK-LABEL: foo2_8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm1
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vandps LCPI2_0, %ymm0, %ymm0
+; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo2_8:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; CHECK-WIDE-NEXT:    vpshufb %xmm3, %xmm2, %xmm4
+; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-WIDE-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
+; CHECK-WIDE-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; CHECK-WIDE-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
+; CHECK-WIDE-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-WIDE-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; CHECK-WIDE-NEXT:    retl
   %res = uitofp <8 x i8> %src to <8 x float>
   ret <8 x float> %res
 }
 
-;CHECK-LABEL: foo2_4:
-;CHECK: vcvtdq2ps
-;CHECK: ret
-;
-;CHECK-WIDE-LABEL: foo2_4:
-;CHECK-WIDE: vcvtdq2ps %xmm{{.*}}, %xmm{{.*}}
-;CHECK-WIDE: ret
 define <4 x float> @foo2_4(<4 x i8> %src) {
+; CHECK-LABEL: foo2_4:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vandps LCPI3_0, %xmm0, %xmm0
+; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo2_4:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpmovzxbd %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    retl
   %res = uitofp <4 x i8> %src to <4 x float>
   ret <4 x float> %res
 }
 
-;CHECK-LABEL: foo3_8:
-;CHECK: vcvttps2dq
-;CHECK: ret
 define <8 x i8> @foo3_8(<8 x float> %src) {
+; CHECK-LABEL: foo3_8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo3_8:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
+; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
+; CHECK-WIDE-NEXT:    orl %eax, %ecx
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
+; CHECK-WIDE-NEXT:    movzbl %dl, %edx
+; CHECK-WIDE-NEXT:    orl %eax, %edx
+; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
+; CHECK-WIDE-NEXT:    orl %eax, %ecx
+; CHECK-WIDE-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
+; CHECK-WIDE-NEXT:    orl %eax, %ecx
+; CHECK-WIDE-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm0
+; CHECK-WIDE-NEXT:    vzeroupper
+; CHECK-WIDE-NEXT:    retl
   %res = fptosi <8 x float> %src to <8 x i8>
   ret <8 x i8> %res
 }
-;CHECK-LABEL: foo3_4:
-;CHECK: vcvttps2dq
-;CHECK: ret
+
 define <4 x i8> @foo3_4(<4 x float> %src) {
+; CHECK-LABEL: foo3_4:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    retl
+;
+; CHECK-WIDE-LABEL: foo3_4:
+; CHECK-WIDE:       ## BB#0:
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
+; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
+; CHECK-WIDE-NEXT:    orl %eax, %ecx
+; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    shll $8, %eax
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
+; CHECK-WIDE-NEXT:    movzbl %dl, %edx
+; CHECK-WIDE-NEXT:    orl %eax, %edx
+; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    retl
   %res = fptosi <4 x float> %src to <4 x i8>
   ret <4 x i8> %res
 }

diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
deleted file mode 100644
index 4da7953..0000000
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ /dev/null

@@ -1,30 +0,0 @@
-; RUN: llc < %s -mtriple=i686-linux -mcpu=penryn | FileCheck %s
-
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
-
-define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
-entry:
-; CHECK: cfi_def_cfa_offset
-; CHECK-NOT: set
-; CHECK: pmovzxwq
-; CHECK: pshufb
-  %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
-  %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
-  %sub322.i = sub <4 x i32> %shr.i, zeroinitializer ; <<4 x i32>> [#uses=1]
-  %cmp323.x = icmp slt <4 x i32> zeroinitializer, %sub322.i ; <<4 x i1>> [#uses=1]
-  %cmp323.i = sext <4 x i1> %cmp323.x to <4 x i32> ; <<4 x i32>> [#uses=1]
-  %or.i = or <4 x i32> %cmp318.i, %cmp323.i       ; <<4 x i32>> [#uses=1]
-  %tmp10.i83.i = bitcast <4 x i32> %or.i to <4 x float> ; <<4 x float>> [#uses=1]
-  %0 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> undef, <4 x float> undef, <4 x float> %tmp10.i83.i) nounwind ; <<4 x float>> [#uses=1]
-  %conv.i.i15.i = bitcast <4 x float> %0 to <4 x i32> ; <<4 x i32>> [#uses=1]
-  %swz.i.i28.i = shufflevector <4 x i32> %conv.i.i15.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i32>> [#uses=1]
-  %tmp6.i29.i = bitcast <2 x i32> %swz.i.i28.i to <4 x i16> ; <<4 x i16>> [#uses=1]
-  %swz.i30.i = shufflevector <4 x i16> %tmp6.i29.i, <4 x i16> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i16>> [#uses=1]
-  store <2 x i16> %swz.i30.i, <2 x i16>* undef
-  unreachable
-  ret void
-}

diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 0aa72b1..318aca1 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll

@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
 
 declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)

diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index 3cb519a..530911a 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll

@@ -1,10 +1,14 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 -o %t
-; RUN: not grep extractps   %t
-; RUN: not grep pextrd      %t
-; RUN: not grep pshufd  %t
-; RUN: not grep movss   %t
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 | FileCheck %s
 
 define void @t1(float* %R, <4 x float>* %P1) nounwind {
+; CHECK-LABEL: t1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movss 12(%ecx), %xmm0
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+
 	%X = load <4 x float>* %P1
 	%tmp = extractelement <4 x float> %X, i32 3
 	store float %tmp, float* %R
@@ -12,12 +16,31 @@
 }
 
 define float @t2(<4 x float>* %P1) nounwind {
+; CHECK-LABEL: t2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movapd (%eax), %xmm0
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
+
 	%X = load <4 x float>* %P1
 	%tmp = extractelement <4 x float> %X, i32 2
 	ret float %tmp
 }
 
 define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
+; CHECK-LABEL: t3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl 12(%ecx), %ecx
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    retl
+
 	%X = load <4 x i32>* %P1
 	%tmp = extractelement <4 x i32> %X, i32 3
 	store i32 %tmp, i32* %R
@@ -25,6 +48,12 @@
 }
 
 define i32 @t4(<4 x i32>* %P1) nounwind {
+; CHECK-LABEL: t4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl 12(%eax), %eax
+; CHECK-NEXT:    retl
+
 	%X = load <4 x i32>* %P1
 	%tmp = extractelement <4 x i32> %X, i32 3
 	ret i32 %tmp

diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index 88f5a58..6df7be7 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll

@@ -1,10 +1,17 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 -o %t
-; RUN: grep movss    %t | count 4
-; RUN: grep movhlps  %t | count 1
-; RUN: not grep pshufd   %t 
-; RUN: grep unpckhpd %t | count 1
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
 
 define void @test1(<4 x float>* %F, float* %f) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movaps (%ecx), %xmm0
+; CHECK-NEXT:    addps %xmm0, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+entry:
 	%tmp = load <4 x float>* %F		; <<4 x float>> [#uses=2]
 	%tmp7 = fadd <4 x float> %tmp, %tmp		; <<4 x float>> [#uses=1]
 	%tmp2 = extractelement <4 x float> %tmp7, i32 0		; <float> [#uses=1]
@@ -13,6 +20,18 @@
 }
 
 define float @test2(<4 x float>* %F, float* %f) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movaps (%eax), %xmm0
+; CHECK-NEXT:    addps %xmm0, %xmm0
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    movss %xmm0, (%esp)
+; CHECK-NEXT:    flds (%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    retl
+entry:
 	%tmp = load <4 x float>* %F		; <<4 x float>> [#uses=2]
 	%tmp7 = fadd <4 x float> %tmp, %tmp		; <<4 x float>> [#uses=1]
 	%tmp2 = extractelement <4 x float> %tmp7, i32 2		; <float> [#uses=1]
@@ -20,6 +39,14 @@
 }
 
 define void @test3(float* %R, <4 x float>* %P1) nounwind {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movss 12(%ecx), %xmm0
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+entry:
 	%X = load <4 x float>* %P1		; <<4 x float>> [#uses=1]
 	%tmp = extractelement <4 x float> %X, i32 3		; <float> [#uses=1]
 	store float %tmp, float* %R
@@ -27,6 +54,17 @@
 }
 
 define double @test4(double %A) nounwind {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    calll foo
+; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    addsd {{[0-9]+}}(%esp), %xmm0
+; CHECK-NEXT:    movsd %xmm0, (%esp)
+; CHECK-NEXT:    fldl (%esp)
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
+entry:
 	%tmp1 = call <2 x double> @foo( )		; <<2 x double>> [#uses=1]
 	%tmp2 = extractelement <2 x double> %tmp1, i32 1		; <double> [#uses=1]
 	%tmp3 = fadd double %tmp2, %A		; <double> [#uses=1]

diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
index 82517cb..ac02acf 100644
--- a/test/CodeGen/X86/vec_fabs.ll
+++ b/test/CodeGen/X86/vec_fabs.ll

@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
 
 
 define <2 x double> @fabs_v2f64(<2 x double> %p)
 {
-  ; CHECK: fabs_v2f64
+  ; CHECK-LABEL: fabs_v2f64
   ; CHECK: vandps
   %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
   ret <2 x double> %t
@@ -12,7 +12,7 @@
 
 define <4 x float> @fabs_v4f32(<4 x float> %p)
 {
-  ; CHECK: fabs_v4f32
+  ; CHECK-LABEL: fabs_v4f32
   ; CHECK: vandps
   %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
   ret <4 x float> %t
@@ -21,7 +21,7 @@
 
 define <4 x double> @fabs_v4f64(<4 x double> %p)
 {
-  ; CHECK: fabs_v4f64
+  ; CHECK-LABEL: fabs_v4f64
   ; CHECK: vandps
   %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
   ret <4 x double> %t
@@ -30,9 +30,46 @@
 
 define <8 x float> @fabs_v8f32(<8 x float> %p)
 {
-  ; CHECK: fabs_v8f32
+  ; CHECK-LABEL: fabs_v8f32
   ; CHECK: vandps
   %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
   ret <8 x float> %t
 }
 declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+
+; PR20354: when generating code for a vector fabs op,
+; make sure that we're only turning off the sign bit of each float value.
+; No constant pool loads or vector ops are needed for the fabs of a
+; bitcasted integer constant; we should just return an integer constant
+; that has the sign bits turned off.
+;
+; So instead of something like this:
+;    movabsq (constant pool load of mask for sign bits) 
+;    vmovq   (move from integer register to vector/fp register)
+;    vandps  (mask off sign bits)
+;    vmovq   (move vector/fp register back to integer return register)
+;
+; We should generate:
+;    mov     (put constant value in return register)
+
+define i64 @fabs_v2f32_1() {
+; CHECK-LABEL: fabs_v2f32_1:
+; CHECK: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000
+; CHECK-NEXT: retq
+ %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}
+
+define i64 @fabs_v2f32_2() {
+; CHECK-LABEL: fabs_v2f32_2:
+; CHECK: movl $2147483647, %eax       # imm = 0x7FFFFFFF
+; CHECK-NEXT: retq
+ %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+}
+
+declare <2 x float> @llvm.fabs.v2f32(<2 x float> %p)

diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll
index d49c70e..9743f71 100644
--- a/test/CodeGen/X86/vec_fneg.ll
+++ b/test/CodeGen/X86/vec_fneg.ll

@@ -1,11 +1,45 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
 
+; FNEG is defined as subtraction from -0.0.
+
+; This test verifies that we use an xor with a constant to flip the sign bits; no subtraction needed.
 define <4 x float> @t1(<4 x float> %Q) {
-        %tmp15 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
-	ret <4 x float> %tmp15
+; CHECK-LABEL: t1:
+; CHECK: xorps	{{.*}}LCPI0_0{{.*}}, %xmm0
+; CHECK-NEXT: retq
+        %tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
+	ret <4 x float> %tmp
 }
 
+; This test verifies that we generate an FP subtraction because "0.0 - x" is not an fneg.
 define <4 x float> @t2(<4 x float> %Q) {
-        %tmp15 = fsub <4 x float> zeroinitializer, %Q
-	ret <4 x float> %tmp15
+; CHECK-LABEL: t2:
+; CHECK: xorps	%[[X:xmm[0-9]+]], %[[X]]
+; CHECK-NEXT: subps	%xmm0, %[[X]]
+; CHECK-NEXT: movaps	%[[X]], %xmm0
+; CHECK-NEXT: retq
+        %tmp = fsub <4 x float> zeroinitializer, %Q
+	ret <4 x float> %tmp
+}
+
+; If we're bitcasting an integer to an FP vector, we should avoid the FPU/vector unit entirely.
+; Make sure that we're flipping the sign bit and only the sign bit of each float.
+; So instead of something like this:
+;    movd	%rdi, %xmm0
+;    xorps	.LCPI2_0(%rip), %xmm0
+;
+; We should generate:
+;    movabsq     (put sign bit mask in integer register)
+;    xorq        (flip sign bits)
+;    movd        (move to xmm return register) 
+
+define <2 x float> @fneg_bitcast(i64 %i) {
+; CHECK-LABEL: fneg_bitcast:
+; CHECK:	movabsq	$-9223372034707292160, %rax # imm = 0x8000000080000000
+; CHECK-NEXT:	xorq	%rdi, %rax
+; CHECK-NEXT:	movd	%rax, %xmm0
+; CHECK-NEXT:	retq
+  %bitcast = bitcast i64 %i to <2 x float>
+  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast
+  ret <2 x float> %fneg
 }

diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 7ec07ae..b882a5e 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll

@@ -3,6 +3,8 @@
 
 ; PR11674
 define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
+; CHECK-LABEL: fpext_frommem:
+; AVX-LABEL: fpext_frommem:
 entry:
 ; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
 ; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
@@ -13,6 +15,8 @@
 }
 
 define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
+; CHECK-LABEL: fpext_frommem4:
+; AVX-LABEL: fpext_frommem4:
 entry:
 ; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
@@ -24,6 +28,8 @@
 }
 
 define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
+; CHECK-LABEL: fpext_frommem8:
+; AVX-LABEL: fpext_frommem8:
 entry:
 ; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}

diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 5cb9f69..b72044a 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll

@@ -2,66 +2,87 @@
 ; There are no MMX operations in @t1
 
 define void  @t1(i32 %a, x86_mmx* %P) nounwind {
-       %tmp12 = shl i32 %a, 12
-       %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
-       %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
-       %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
-       store x86_mmx %tmp23, x86_mmx* %P
-       ret void
-
 ; CHECK-LABEL: t1:
-; CHECK-NOT: %mm
-; CHECK: shll $12
-; CHECK-NOT: %mm
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    shll $12, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1]
+; CHECK-NEXT:    movlpd %xmm0, (%eax)
+; CHECK-NEXT:    retl
+ %tmp12 = shl i32 %a, 12
+ %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
+ %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
+ %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
+ store x86_mmx %tmp23, x86_mmx* %P
+ ret void
 }
 
 define <4 x float> @t2(<4 x float>* %P) nounwind {
-        %tmp1 = load <4 x float>* %P
-        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
-        ret <4 x float> %tmp2
-
 ; CHECK-LABEL: t2:
-; CHECK: pslldq $12
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movaps (%eax), %xmm1
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
+  ret <4 x float> %tmp2
 }
 
 define <4 x float> @t3(<4 x float>* %P) nounwind {
-        %tmp1 = load <4 x float>* %P
-        %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
-        ret <4 x float> %tmp2
-
 ; CHECK-LABEL: t3:
-; CHECK: psrldq $8
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movaps (%eax), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,0]
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 >
+  ret <4 x float> %tmp2
 }
 
 define <4 x float> @t4(<4 x float>* %P) nounwind {
-        %tmp1 = load <4 x float>* %P
-        %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
-        ret <4 x float> %tmp2
-
 ; CHECK-LABEL: t4:
-; CHECK: psrldq $12
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movaps (%eax), %xmm0
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; CHECK-NEXT:    retl
+  %tmp1 = load <4 x float>* %P
+  %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 >
+  ret <4 x float> %tmp2
 }
 
 define <16 x i8> @t5(<16 x i8> %x) nounwind {
-        %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
-        ret <16 x i8> %s
-
 ; CHECK-LABEL: t5:
-; CHECK: psrldq $1
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
+  ret <16 x i8> %s
 }
 
 define <16 x i8> @t6(<16 x i8> %x) nounwind {
-        %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-        ret <16 x i8> %s
-
 ; CHECK-LABEL: t6:
-; CHECK: palignr $1
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %s
 }
 
 define <16 x i8> @t7(<16 x i8> %x) nounwind {
-        %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
-        ret <16 x i8> %s
-
 ; CHECK-LABEL: t7:
-; CHECK: pslldq $13
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; CHECK-NEXT:    retl
+  %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
+  ret <16 x i8> %s
 }

diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll
deleted file mode 100644
index 4583e19..0000000
--- a/test/CodeGen/X86/vec_insert-6.ll
+++ /dev/null

@@ -1,9 +0,0 @@
-; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | grep pslldq
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6
-
-define <4 x float> @t3(<4 x float>* %P) nounwind  {
-	%tmp1 = load <4 x float>* %P
-	%tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 4, i32 4, i32 4, i32 0 >
-	ret <4 x float> %tmp2
-}

diff --git a/test/CodeGen/X86/vec_insert.ll b/test/CodeGen/X86/vec_insert.ll
deleted file mode 100644
index 0ed8f10..0000000
--- a/test/CodeGen/X86/vec_insert.ll
+++ /dev/null

@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep movss | count 1
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | not grep pinsrw
-
-define void @test(<4 x float>* %F, i32 %I) nounwind {
-	%tmp = load <4 x float>* %F		; <<4 x float>> [#uses=1]
-	%f = sitofp i32 %I to float		; <float> [#uses=1]
-	%tmp1 = insertelement <4 x float> %tmp, float %f, i32 0		; <<4 x float>> [#uses=2]
-	%tmp18 = fadd <4 x float> %tmp1, %tmp1		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp18, <4 x float>* %F
-	ret void
-}
-
-define void @test2(<4 x float>* %F, i32 %I, float %g) nounwind {
-	%tmp = load <4 x float>* %F		; <<4 x float>> [#uses=1]
-	%f = sitofp i32 %I to float		; <float> [#uses=1]
-	%tmp1 = insertelement <4 x float> %tmp, float %f, i32 2		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp1, <4 x float>* %F
-	ret void
-}

diff --git a/test/CodeGen/X86/vec_return.ll b/test/CodeGen/X86/vec_return.ll
index 2cf5dc6..f7fcd03 100644
--- a/test/CodeGen/X86/vec_return.ll
+++ b/test/CodeGen/X86/vec_return.ll

@@ -10,7 +10,7 @@
 ; Prefer a constant pool load here.
 ; CHECK: test2
 ; CHECK-NOT: shuf
-; CHECK: movaps {{.*}}CPI
+; CHECK: movaps {{.*}}{{CPI|__xmm@}}
 define <4 x i32> @test2() nounwind  {
 	ret <4 x i32> < i32 0, i32 0, i32 1, i32 0 >
 }

diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index d1d7608..a13c813 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll

@@ -1,17 +1,37 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -o %t
-; RUN: grep pshufd %t | count 2
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s
 
-define <4 x float> @test(float %a) nounwind {
-        %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1               ; <<4 x float>> [#uses=1]
-        %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2               ; <<4 x float>> [#uses=1]
-        %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3              ; <<4 x float>> [#uses=1]
-        ret <4 x float> %tmp6
+define <4 x float> @test(float %a) {
+; CHECK-LABEL: test:
+; CHECK:         insertps $29, {{.*}}, %xmm0
+; CHECK-NEXT:    retl
+
+entry:
+  %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
+  %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2
+  %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3
+  ret <4 x float> %tmp6
 }
 
-define <2 x i64> @test2(i32 %a) nounwind {
-        %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2          ; <<4 x i32>> [#uses=1]
-        %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3             ; <<4 x i32>> [#uses=1]
-        %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>           ; <<2 x i64>> [#uses=1]
-        ret <2 x i64> %tmp10
+define <2 x i64> @test2(i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK:         movd {{.*}}, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; CHECK-NEXT:    retl
+
+entry:
+  %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2
+  %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3
+  %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64>
+  ret <2 x i64> %tmp10
 }
 
+define <4 x float> @test3(<4 x float> %A) {
+; CHECK-LABEL: test3:
+; CHECK:         insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; CHECK-NEXT:    retl
+
+  %tmp0 = extractelement <4 x float> %A, i32 0
+  %tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1
+  %tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2
+  ret <4 x float> %tmp2
+}

diff --git a/test/CodeGen/X86/vec_set-5.ll b/test/CodeGen/X86/vec_set-5.ll
deleted file mode 100644
index f811a74..0000000
--- a/test/CodeGen/X86/vec_set-5.ll
+++ /dev/null

@@ -1,28 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
-; RUN: grep movlhps   %t | count 1
-; RUN: grep movq      %t | count 2
-
-define <4 x float> @test1(float %a, float %b) nounwind {
-	%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0		; <<4 x float>> [#uses=1]
-	%tmp6 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
-	%tmp8 = insertelement <4 x float> %tmp6, float %b, i32 2		; <<4 x float>> [#uses=1]
-	%tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
-	ret <4 x float> %tmp9
-}
-
-define <4 x float> @test2(float %a, float %b) nounwind {
-	%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0		; <<4 x float>> [#uses=1]
-	%tmp7 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
-	%tmp8 = insertelement <4 x float> %tmp7, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
-	%tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
-	ret <4 x float> %tmp9
-}
-
-define <2 x i64> @test3(i32 %a, i32 %b) nounwind {
-	%tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0		; <<4 x i32>> [#uses=1]
-	%tmp6 = insertelement <4 x i32> %tmp, i32 %b, i32 1		; <<4 x i32>> [#uses=1]
-	%tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2		; <<4 x i32>> [#uses=1]
-	%tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3		; <<4 x i32>> [#uses=1]
-	%tmp11 = bitcast <4 x i32> %tmp10 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	ret <2 x i64> %tmp11
-}

diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll
deleted file mode 100644
index a739090..0000000
--- a/test/CodeGen/X86/vec_set-9.ll
+++ /dev/null

@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mattr=-avx,-pad-short-functions | FileCheck %s
-
-; CHECK: test3
-; CHECK: movd
-; CHECK-NOT: movd
-; CHECK: {{movlhps.*%xmm0, %xmm0}}
-; CHECK-NEXT: ret
-
-define <2 x i64> @test3(i64 %A) nounwind {
-entry:
-	%B = insertelement <2 x i64> undef, i64 %A, i32 1
-	ret <2 x i64> %B
-}
-

diff --git a/test/CodeGen/X86/vec_set-E.ll b/test/CodeGen/X86/vec_set-E.ll
deleted file mode 100644
index d78be66..0000000
--- a/test/CodeGen/X86/vec_set-E.ll
+++ /dev/null

@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movq
-
-define <4 x float> @t(float %X) nounwind  {
-	%tmp11 = insertelement <4 x float> undef, float %X, i32 0
-	%tmp12 = insertelement <4 x float> %tmp11, float %X, i32 1
-	%tmp27 = insertelement <4 x float> %tmp12, float 0.000000e+00, i32 2
-	%tmp28 = insertelement <4 x float> %tmp27, float 0.000000e+00, i32 3
-	ret <4 x float> %tmp28
-}

diff --git a/test/CodeGen/X86/vec_set-G.ll b/test/CodeGen/X86/vec_set-G.ll
deleted file mode 100644
index 4a542fe..0000000
--- a/test/CodeGen/X86/vec_set-G.ll
+++ /dev/null

@@ -1,9 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss
-
-define fastcc void @t(<4 x float> %A) nounwind  {
-	%tmp41896 = extractelement <4 x float> %A, i32 0		; <float> [#uses=1]
-	%tmp14082 = insertelement <4 x float> < float 0.000000e+00, float undef, float undef, float undef >, float %tmp41896, i32 1		; <<4 x float>> [#uses=1]
-	%tmp14083 = insertelement <4 x float> %tmp14082, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp14083, <4 x float>* null, align 16
-        ret void
-}

diff --git a/test/CodeGen/X86/vec_set-I.ll b/test/CodeGen/X86/vec_set-I.ll
deleted file mode 100644
index c5d6ab8..0000000
--- a/test/CodeGen/X86/vec_set-I.ll
+++ /dev/null

@@ -1,13 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-
-; CHECK-NOT: xorp
-; CHECK: movd
-; CHECK-NOT: xorp
-
-define void @t1() nounwind  {
-	%tmp298.i.i = load <4 x float>* null, align 16
-	%tmp304.i.i = bitcast <4 x float> %tmp298.i.i to <4 x i32>
-	%tmp305.i.i = and <4 x i32> %tmp304.i.i, < i32 -1, i32 0, i32 0, i32 0 >
-	store <4 x i32> %tmp305.i.i, <4 x i32>* null, align 16
-	unreachable
-}

diff --git a/test/CodeGen/X86/vec_set-J.ll b/test/CodeGen/X86/vec_set-J.ll
deleted file mode 100644
index d90ab85..0000000
--- a/test/CodeGen/X86/vec_set-J.ll
+++ /dev/null

@@ -1,10 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss
-; PR2472
-
-define <4 x i32> @a(<4 x i32> %a) nounwind {
-entry:
-        %vecext = extractelement <4 x i32> %a, i32 0
-        insertelement <4 x i32> zeroinitializer, i32 %vecext, i32 0
-        %add = add <4 x i32> %a, %0
-        ret <4 x i32> %add
-}

diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index 322dbae..b69f90c 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll

@@ -62,8 +62,7 @@
 ; SSE2-LABEL: v8i16_icmp_ule:
 ; SSE2: psubusw %xmm1, %xmm0
 ; SSE2: pxor    %xmm1, %xmm1
-; SSE2: pcmpeqw %xmm0, %xmm1
-; SSE2: movdqa  %xmm1, %xmm0
+; SSE2: pcmpeqw %xmm1, %xmm0
 
 ; SSE41-LABEL: v8i16_icmp_ule:
 ; SSE41: pminuw  %xmm0, %xmm1
@@ -106,8 +105,7 @@
 ; SSE2: pxor    %xmm2, %xmm0
 ; SSE2: pcmpgtd %xmm1, %xmm0
 ; SSE2: pcmpeqd %xmm1, %xmm1
-; SSE2: pxor    %xmm0, %xmm1
-; SSE2: movdqa  %xmm1, %xmm0
+; SSE2: pxor    %xmm1, %xmm0
 
 ; SSE41-LABEL: v4i32_icmp_ule:
 ; SSE41: pminud  %xmm0, %xmm1

diff --git a/test/CodeGen/X86/vec_sext.ll b/test/CodeGen/X86/vec_sext.ll
deleted file mode 100644
index 776ddec..0000000
--- a/test/CodeGen/X86/vec_sext.ll
+++ /dev/null

@@ -1,69 +0,0 @@
-; RUN: llc < %s -march=x86-64
-; PR 9267
-
-define<4 x i32> @func_16_32() {
-  %F = load <4 x i16>* undef
-  %G = sext <4 x i16> %F to <4 x i32>
-  %H = load <4 x i16>* undef
-  %Y = sext <4 x i16> %H to <4 x i32>
-  %T = add <4 x i32> %Y, %G
-  store <4 x i32>%T , <4 x i32>* undef
-  ret <4 x i32> %T
-}
-
-define<4 x i64> @func_16_64() {
-  %F = load <4 x i16>* undef
-  %G = sext <4 x i16> %F to <4 x i64>
-  %H = load <4 x i16>* undef
-  %Y = sext <4 x i16> %H to <4 x i64>
-  %T = xor <4 x i64> %Y, %G
-  store <4 x i64>%T , <4 x i64>* undef
-  ret <4 x i64> %T
-}
-
-define<4 x i64> @func_32_64() {
-  %F = load <4 x i32>* undef
-  %G = sext <4 x i32> %F to <4 x i64>
-  %H = load <4 x i32>* undef
-  %Y = sext <4 x i32> %H to <4 x i64>
-  %T = or <4 x i64> %Y, %G
-  ret <4 x i64> %T
-}
-
-define<4 x i16> @func_8_16() {
-  %F = load <4 x i8>* undef
-  %G = sext <4 x i8> %F to <4 x i16>
-  %H = load <4 x i8>* undef
-  %Y = sext <4 x i8> %H to <4 x i16>
-  %T = add <4 x i16> %Y, %G
-  ret <4 x i16> %T
-}
-
-define<4 x i32> @func_8_32() {
-  %F = load <4 x i8>* undef
-  %G = sext <4 x i8> %F to <4 x i32>
-  %H = load <4 x i8>* undef
-  %Y = sext <4 x i8> %H to <4 x i32>
-  %T = sub <4 x i32> %Y, %G
-  ret <4 x i32> %T
-}
-
-define<4 x i64> @func_8_64() {
-  %F = load <4 x i8>* undef
-  %G = sext <4 x i8> %F to <4 x i64>
-  %H = load <4 x i8>* undef
-  %Y = sext <4 x i8> %H to <4 x i64>
-  %T = add <4 x i64> %Y, %G
-  ret <4 x i64> %T
-}
-
-define<4 x i32> @const_16_32() {
-  %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32>
-  ret <4 x i32> %G
-}
-
-define<4 x i64> @const_16_64() {
-  %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64>
-  ret <4 x i64> %G
-}
-

diff --git a/test/CodeGen/X86/vec_shuffle-11.ll b/test/CodeGen/X86/vec_shuffle-11.ll
deleted file mode 100644
index 640745a..0000000
--- a/test/CodeGen/X86/vec_shuffle-11.ll
+++ /dev/null

@@ -1,11 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | not grep mov
-
-define <4 x i32> @test() nounwind {
-        %tmp131 = call <2 x i64> @llvm.x86.sse2.psrl.dq( <2 x i64> < i64 -1, i64 -1 >, i32 96 )         ; <<2 x i64>> [#uses=1]
-        %tmp137 = bitcast <2 x i64> %tmp131 to <4 x i32>                ; <<4 x i32>> [#uses=1]
-        %tmp138 = and <4 x i32> %tmp137, bitcast (<2 x i64> < i64 -1, i64 -1 > to <4 x i32>)            ; <<4 x i32>> [#uses=1]
-        ret <4 x i32> %tmp138
-}
-
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32)

diff --git a/test/CodeGen/X86/vec_shuffle-14.ll b/test/CodeGen/X86/vec_shuffle-14.ll
deleted file mode 100644
index 8f25197..0000000
--- a/test/CodeGen/X86/vec_shuffle-14.ll
+++ /dev/null

@@ -1,70 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-32
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-64
-
-define <4 x i32> @t1(i32 %a) nounwind  {
-entry:
-        %tmp = insertelement <4 x i32> undef, i32 %a, i32 0
-	%tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32> < i32 4, i32 1, i32 2, i32 3 >		; <<4 x i32>> [#uses=1]
-	ret <4 x i32> %tmp6
-
-; X86-32-LABEL: t1:
-; X86-32: movd	4(%esp), %xmm0
-
-; X86-64-LABEL: t1:
-; X86-64: movd	%e{{..}}, %xmm0
-}
-
-define <2 x i64> @t2(i64 %a) nounwind  {
-entry:
-        %tmp = insertelement <2 x i64> undef, i64 %a, i32 0
-	%tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %tmp, <2 x i32> < i32 2, i32 1 >		; <<4 x i32>> [#uses=1]
-	ret <2 x i64> %tmp6
-
-; X86-32-LABEL: t2:
-; X86-32: movq	4(%esp), %xmm0
-
-; X86-64-LABEL: t2:
-; X86-64: movd	%r{{..}}, %xmm0
-}
-
-define <2 x i64> @t3(<2 x i64>* %a) nounwind  {
-entry:
-	%tmp4 = load <2 x i64>* %a, align 16		; <<2 x i64>> [#uses=1]
-	%tmp6 = bitcast <2 x i64> %tmp4 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%tmp7 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp6, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x i32>> [#uses=1]
-	%tmp8 = bitcast <4 x i32> %tmp7 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	ret <2 x i64> %tmp8
-
-; X86-32-LABEL: t3:
-; X86-32: movl	4(%esp)
-; X86-32: movq
-
-; X86-64-LABEL: t3:
-; X86-64: movq	({{.*}}), %xmm0
-}
-
-define <2 x i64> @t4(<2 x i64> %a) nounwind  {
-entry:
-	%tmp5 = bitcast <2 x i64> %a to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp5, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x i32>> [#uses=1]
-	%tmp7 = bitcast <4 x i32> %tmp6 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	ret <2 x i64> %tmp7
-
-; X86-32-LABEL: t4:
-; X86-32: movq %xmm0, %xmm0
-
-; X86-64-LABEL: t4:
-; X86-64: movq {{.*}}, %xmm0
-}
-
-define <2 x i64> @t5(<2 x i64> %a) nounwind  {
-entry:
-	%tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <2 x i32> < i32 2, i32 1 >		; <<4 x i32>> [#uses=1]
-	ret <2 x i64> %tmp6
-
-; X86-32-LABEL: t5:
-; X86-32: movq %xmm0, %xmm0
-
-; X86-64-LABEL: t5:
-; X86-64: movq {{.*}}, %xmm0
-}

diff --git a/test/CodeGen/X86/vec_shuffle-15.ll b/test/CodeGen/X86/vec_shuffle-15.ll
deleted file mode 100644
index 5a9b8fd..0000000
--- a/test/CodeGen/X86/vec_shuffle-15.ll
+++ /dev/null

@@ -1,81 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
-
-define <2 x i64> @t00(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 0 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t01(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 1 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t02(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 2 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t03(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 3 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t10(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 0 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t11(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 1 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t12(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 2 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 3 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t20(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 0 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t21(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 1 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t22(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 2 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t23(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 3 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t30(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 0 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t31(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 1 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t32(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 2 >
-	ret <2 x i64> %tmp
-}
-
-define <2 x i64> @t33(<2 x i64> %a, <2 x i64> %b) nounwind  {
-	%tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 3 >
-	ret <2 x i64> %tmp
-}

diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll
deleted file mode 100644
index 9aeb942..0000000
--- a/test/CodeGen/X86/vec_shuffle-16.ll
+++ /dev/null

@@ -1,43 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
-
-; sse-LABEL:  t1:
-; sse2-LABEL: t1:
-define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind  {
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
-        %tmp1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
-        ret <4 x float> %tmp1
-}
-
-; sse-LABEL:  t2:
-; sse2-LABEL: t2:
-define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind {
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
-	%tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 3, i32 3, i32 3, i32 3 >
-	ret <4 x float> %tmp
-}
-
-; sse-LABEL:  t3:
-; sse2-LABEL: t3:
-define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind {
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
-	%tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 4, i32 4, i32 4, i32 4 >
-	ret <4 x float> %tmp
-}
-
-; sse-LABEL:  t4:
-; sse2-LABEL: t4:
-define <4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind {
-
-; sse: shufps
-; sse2: pshufd
-; sse2-NEXT: ret
-	%tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 1, i32 3, i32 2, i32 0 >
-	ret <4 x float> %tmp
-}

diff --git a/test/CodeGen/X86/vec_shuffle-17.ll b/test/CodeGen/X86/vec_shuffle-17.ll
deleted file mode 100644
index f2f96ba..0000000
--- a/test/CodeGen/X86/vec_shuffle-17.ll
+++ /dev/null

@@ -1,16 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s
-; CHECK-NOT: xor
-; CHECK: movd {{%rdi|%rcx}}, %xmm0
-; CHECK-NOT: xor
-; PR2108
-
-define <2 x i64> @doload64(i64 %x) nounwind  {
-entry:
-	%tmp717 = bitcast i64 %x to double		; <double> [#uses=1]
-	%tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0		; <<2 x double>> [#uses=1]
-	%tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1		; <<2 x double>> [#uses=1]
-	%tmp11 = bitcast <2 x double> %tmp9 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	ret <2 x i64> %tmp11
-}
-

diff --git a/test/CodeGen/X86/vec_shuffle-18.ll b/test/CodeGen/X86/vec_shuffle-18.ll
deleted file mode 100644
index 1104a4a..0000000
--- a/test/CodeGen/X86/vec_shuffle-18.ll
+++ /dev/null

@@ -1,25 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7
-
-	%struct.vector4_t = type { <4 x float> }
-
-define void @swizzle(i8* %a, %struct.vector4_t* %b, %struct.vector4_t* %c) nounwind  {
-entry:
-	%tmp9 = getelementptr %struct.vector4_t* %b, i32 0, i32 0		; <<4 x float>*> [#uses=2]
-	%tmp10 = load <4 x float>* %tmp9, align 16		; <<4 x float>> [#uses=1]
-	%tmp14 = bitcast i8* %a to double*		; <double*> [#uses=1]
-	%tmp15 = load double* %tmp14		; <double> [#uses=1]
-	%tmp16 = insertelement <2 x double> undef, double %tmp15, i32 0		; <<2 x double>> [#uses=1]
-	%tmp18 = bitcast <2 x double> %tmp16 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp19 = shufflevector <4 x float> %tmp10, <4 x float> %tmp18, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp19, <4 x float>* %tmp9, align 16
-	%tmp28 = getelementptr %struct.vector4_t* %c, i32 0, i32 0		; <<4 x float>*> [#uses=2]
-	%tmp29 = load <4 x float>* %tmp28, align 16		; <<4 x float>> [#uses=1]
-	%tmp26 = getelementptr i8* %a, i32 8		; <i8*> [#uses=1]
-	%tmp33 = bitcast i8* %tmp26 to double*		; <double*> [#uses=1]
-	%tmp34 = load double* %tmp33		; <double> [#uses=1]
-	%tmp35 = insertelement <2 x double> undef, double %tmp34, i32 0		; <<2 x double>> [#uses=1]
-	%tmp37 = bitcast <2 x double> %tmp35 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp38 = shufflevector <4 x float> %tmp29, <4 x float> %tmp37, <4 x i32> < i32 4, i32 5, i32 2, i32 3 >		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp38, <4 x float>* %tmp28, align 16
-	ret void
-}

diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll
deleted file mode 100644
index 48db8de..0000000
--- a/test/CodeGen/X86/vec_shuffle-19.ll
+++ /dev/null

@@ -1,9 +0,0 @@
-; REQUIRES: asserts
-; RUN: llc < %s -o /dev/null -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4
-; PR2485
-
-define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind  {
-entry:
-	%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> < i32 4, i32 0, i32 0, i32 0 >		; <<4 x i32>> [#uses=1]
-	ret <4 x i32> %shuffle
-}

diff --git a/test/CodeGen/X86/vec_shuffle-20.ll b/test/CodeGen/X86/vec_shuffle-20.ll
deleted file mode 100644
index 5a2c444..0000000
--- a/test/CodeGen/X86/vec_shuffle-20.ll
+++ /dev/null

@@ -1,8 +0,0 @@
-; REQUIRES: asserts
-; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2
-
-define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind  {
-entry:
-	shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < i32 0, i32 1, i32 2, i32 7 >		; <<4 x float>>:0 [#uses=1]
-	ret <4 x float> %0
-}

diff --git a/test/CodeGen/X86/vec_shuffle-22.ll b/test/CodeGen/X86/vec_shuffle-22.ll
deleted file mode 100644
index 6807e4d..0000000
--- a/test/CodeGen/X86/vec_shuffle-22.ll
+++ /dev/null

@@ -1,15 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium-m  | FileCheck %s
-
-define <4 x float> @t1(<4 x float> %a) nounwind  {
-; CHECK: movlhps
-  %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 >       ; <<4 x float>> [#uses=1]
-  ret <4 x float> %tmp1
-}
-
-define <4 x i32> @t2(<4 x i32>* %a) nounwind {
-; CHECK: pshufd
-; CHECK: ret
-  %tmp1 = load <4 x i32>* %a
-	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 >		; <<4 x i32>> [#uses=1]
-	ret <4 x i32> %tmp2
-}

diff --git a/test/CodeGen/X86/vec_shuffle-23.ll b/test/CodeGen/X86/vec_shuffle-23.ll
deleted file mode 100644
index 2468735..0000000
--- a/test/CodeGen/X86/vec_shuffle-23.ll
+++ /dev/null

@@ -1,18 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2                | not grep punpck
-; RUN: llc < %s -march=x86 -mattr=+sse2                |     grep pshufd
-
-define i32 @t() nounwind {
-entry:
-	%a = alloca <4 x i32>		; <<4 x i32>*> [#uses=2]
-	%b = alloca <4 x i32>		; <<4 x i32>*> [#uses=5]
-	store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a
-	%tmp = load <4 x i32>* %a		; <<4 x i32>> [#uses=1]
-	store <4 x i32> %tmp, <4 x i32>* %b
-	%tmp1 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%tmp2 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x i32>> [#uses=1]
-	store <4 x i32> %punpckldq, <4 x i32>* %b
-	%tmp3 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%result = extractelement <4 x i32> %tmp3, i32 0		; <i32> [#uses=1]
-	ret i32 %result
-}

diff --git a/test/CodeGen/X86/vec_shuffle-24.ll b/test/CodeGen/X86/vec_shuffle-24.ll
deleted file mode 100644
index d038daf..0000000
--- a/test/CodeGen/X86/vec_shuffle-24.ll
+++ /dev/null

@@ -1,18 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
-
-define i32 @t() nounwind optsize {
-entry:
-; CHECK: punpckldq
-	%a = alloca <4 x i32>		; <<4 x i32>*> [#uses=2]
-	%b = alloca <4 x i32>		; <<4 x i32>*> [#uses=5]
-	store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a
-	%tmp = load <4 x i32>* %a		; <<4 x i32>> [#uses=1]
-	store <4 x i32> %tmp, <4 x i32>* %b
-	%tmp1 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%tmp2 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x i32>> [#uses=1]
-	store <4 x i32> %punpckldq, <4 x i32>* %b
-	%tmp3 = load <4 x i32>* %b		; <<4 x i32>> [#uses=1]
-	%result = extractelement <4 x i32> %tmp3, i32 0		; <i32> [#uses=1]
-	ret i32 %result
-}

diff --git a/test/CodeGen/X86/vec_shuffle-25.ll b/test/CodeGen/X86/vec_shuffle-25.ll
deleted file mode 100644
index 3f42a13..0000000
--- a/test/CodeGen/X86/vec_shuffle-25.ll
+++ /dev/null

@@ -1,34 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=sse4.1 -o %t
-; RUN: grep unpcklps %t | count 3
-; RUN: grep unpckhps %t | count 1
- 
-; Transpose example using the more generic vector shuffle.  We return
-; float8 instead of float16 since x86 can return that in register.
-; ModuleID = 'transpose2_opt.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i386-apple-cl.1.0"
-@r0 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r1 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r2 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r3 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-
-define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
-entry:
-	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
-	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
-	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
-	%unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
-	%unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
-	%unpcklps14a = shufflevector <4 x float> %unpcklps14,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	%unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
-	%unpckhps17a = shufflevector <4 x float> %unpckhps17,  <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	%r1 = shufflevector <16 x float> %unpcklps14a,  <16 x float> %unpckhps17a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-	%unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
-	%unpcklps20a = shufflevector <4 x float> %unpcklps20,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	%r2 = shufflevector <16 x float> %r1,  <16 x float> %unpcklps20a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
-	%unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
-	%unpckhps23a = shufflevector <4 x float> %unpckhps23,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	%r3 = shufflevector <16 x float> %r2,  <16 x float> %unpckhps23a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-	%r4 = shufflevector <16 x float> %r3,  <16 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-	ret <8 x float> %r4
-}

diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
deleted file mode 100644
index 00e8e73..0000000
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ /dev/null

@@ -1,68 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse4.1 | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
-
-; Transpose example using the more generic vector shuffle. Return float8
-; instead of float16
-; ModuleID = 'transpose2_opt.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i386-apple-cl.1.0"
-@r0 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r1 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r2 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-@r3 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
-
-define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
-entry:
-; CHECK: transpose2
-; CHECK: unpckhps
-; CHECK: unpckhps
-; CHECK: unpcklps
-; CHECK: unpckhps
-; Different instruction order for Atom.
-; ATOM: transpose2
-; ATOM: unpckhps
-; ATOM: unpckhps
-; ATOM: unpckhps
-; ATOM: unpcklps
-	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
-	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
-	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
-	%unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
-	%unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
-	%unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
-        %r1 = shufflevector <4 x float> %unpcklps14,  <4 x float> %unpckhps17,  <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
-	%unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
-	%unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
-        %r2 = shufflevector <4 x float> %unpcklps20,  <4 x float> %unpckhps23,  <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
-;       %r3 = shufflevector <8 x float> %r1,  <8 x float> %r2,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; 
-	ret <8 x float> %r2
-}
-
-define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
-entry:
-; movhps should happen before extractps to assure it gets the correct value.
-; CHECK: lo_hi_shift
-; CHECK: movhps ([[BASEREG:%[a-z]+]]),
-; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; ATOM: lo_hi_shift
-; ATOM: movhps ([[BASEREG:%[a-z]+]]),
-; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
-  %v.i = bitcast float* %y to <4 x float>*
-  %0 = load <4 x float>* %v.i, align 1
-  %1 = bitcast float* %x to <1 x i64>*
-  %.val = load <1 x i64>* %1, align 1
-  %2 = bitcast <1 x i64> %.val to <2 x float>
-  %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  %cast.i = bitcast <4 x float> %0 to <2 x i64>
-  %extract.i = extractelement <2 x i64> %cast.i, i32 1
-  %3 = bitcast float* %x to i64*
-  store i64 %extract.i, i64* %3, align 4
-  %4 = bitcast <4 x float> %0 to <16 x i8>
-  %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
-  %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-  %6 = bitcast <16 x i8> %palignr to <2 x i64>
-  ret <2 x i64> %6
-}

diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll
deleted file mode 100644
index c9b2fb5..0000000
--- a/test/CodeGen/X86/vec_shuffle-27.ll
+++ /dev/null

@@ -1,38 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-; ModuleID = 'vec_shuffle-27.bc'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-apple-cl.1.0"
-
-define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone {
-entry:
-; CHECK: subps
-; CHECK: subps
-; CHECK: mulps
-; CHECK: mulps
-; CHECK: addps
-; CHECK: addps
-	%tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 >		; <<8 x float>> [#uses=1]
-	%sub = fsub <8 x float> %T1, %T0		; <<8 x float>> [#uses=1]
-	%mul = fmul <8 x float> %sub, %tmp7		; <<8 x float>> [#uses=1]
-	%add = fadd <8 x float> %mul, %T0		; <<8 x float>> [#uses=1]
-	ret <8 x float> %add
-}
-
-; Test case for r122206
-define void @test2(<4 x i64>* %ap, <4 x i64>* %bp) nounwind {
-entry:
-; CHECK: movdqa
-  %a = load <4 x i64> * %ap
-  %b = load <4 x i64> * %bp
-  %mulaa = mul <4 x i64> %a, %a
-  %mulbb = mul <4 x i64> %b, %b
-  %mulab = mul <4 x i64> %a, %b
-  %vect1271 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
-  %vect1272 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
-  %vect1487 = shufflevector <4 x i64> %vect1271, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-  %vect1488 = shufflevector <4 x i64> %vect1272, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-  store <4 x i64> %vect1487, <4 x i64>* %ap
-  store <4 x i64> %vect1488, <4 x i64>* %bp
-  ret void;
-}

diff --git a/test/CodeGen/X86/vec_shuffle-28.ll b/test/CodeGen/X86/vec_shuffle-28.ll
deleted file mode 100644
index ebf5577..0000000
--- a/test/CodeGen/X86/vec_shuffle-28.ll
+++ /dev/null

@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
-
-; CHECK:     pshufb
-; CHECK-NOT: pshufb
-
-; FIXME: this test has a superfluous punpcklqdq pre-pshufb currently.
-;        Don't XFAIL it because it's still better than the previous code.
-
-; Pack various elements via shuffles.
-define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
-	ret <8 x i16> %tmp7
-}

diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll
deleted file mode 100644
index f5f8842..0000000
--- a/test/CodeGen/X86/vec_shuffle-30.ll
+++ /dev/null

@@ -1,26 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
-
-; CHECK: test
-; Test case when creating pshufhw, we incorrectly set the higher order bit
-; for an undef,
-define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind {
-entry:
-; CHECK-NOT: vmovaps
-; CHECK: vmovlpd
-; CHECK: vpshufhw        $-95
-  %0 = load <8 x i16>* %dest
-  %1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14>
-  store <8 x i16> %1, <8 x i16>* %dest
-  ret void
-}
-
-; CHECK: test2
-; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq
-define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind {
-entry:
-; CHECK-NOT: pslldq
-; CHECK: shufps
-  %0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2>
-  store <4 x i32> %0, <4 x i32>* %dest
-  ret void
-}

diff --git a/test/CodeGen/X86/vec_shuffle-31.ll b/test/CodeGen/X86/vec_shuffle-31.ll
deleted file mode 100644
index bb06e15..0000000
--- a/test/CodeGen/X86/vec_shuffle-31.ll
+++ /dev/null

@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 -o %t
-; RUN: grep pshufb %t | count 1
-
-define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
-	ret <8 x i16> %tmp9
-}

diff --git a/test/CodeGen/X86/vec_shuffle-34.ll b/test/CodeGen/X86/vec_shuffle-34.ll
deleted file mode 100644
index d057b3f..0000000
--- a/test/CodeGen/X86/vec_shuffle-34.ll
+++ /dev/null

@@ -1,7 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | grep pshufb | count 2
-
-define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-	%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
-	ret <8 x i16> %tmp8
-}

diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll
deleted file mode 100644
index f5083b4..0000000
--- a/test/CodeGen/X86/vec_shuffle-35.ll
+++ /dev/null

@@ -1,20 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t
-; RUN: grep pextrw %t | count 12
-; RUN: grep pinsrw %t | count 13
-; RUN: grep rolw %t | count 13
-; RUN: not grep esp %t
-; RUN: not grep ebp %t
-; RUN: llc < %s -march=x86 -mcpu=core2 -stack-alignment=16 -o %t
-; RUN: grep pshufb %t | count 3
-
-define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone {
-entry:
-	%tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
-	ret <16 x i8> %tmp8
-}
-
-define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-entry:
-	%tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> < i32 undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
-	ret <16 x i8> %tmp8
-}

diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll
deleted file mode 100644
index f1d0f93..0000000
--- a/test/CodeGen/X86/vec_shuffle-36.ll
+++ /dev/null

@@ -1,16 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK: ret
-entry:
-  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 3, i32 2, i32 0, i32 2, i32 1, i32 5, i32 6 , i32 undef >
-  ret <8 x i16> %tmp9
-}
-
-define <8 x i16> @shuf7(<8 x i16> %t0) {
-; CHECK: pshufd
-  %tmp10 = shufflevector <8 x i16> %t0, <8 x i16> undef, <8 x i32> < i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef >
-  ret <8 x i16> %tmp10
-}

diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
deleted file mode 100644
index ed285f9..0000000
--- a/test/CodeGen/X86/vec_shuffle-37.ll
+++ /dev/null

@@ -1,47 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0
-
-define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
-entry:
-; CHECK: movaps  ({{%rdi|%rcx}}), %[[XMM0:xmm[0-9]+]]
-; CHECK: movaps  %[[XMM0]], %[[XMM1:xmm[0-9]+]]
-; CHECK-NEXT: movss   %xmm{{[0-9]+}}, %[[XMM1]]
-; CHECK-NEXT: shufps  $36, %[[XMM1]], %[[XMM0]]
-  %0 = load <4 x i32>* undef, align 16
-  %1 = load <4 x i32>* %a0, align 16
-  %2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-  ret <4 x i32> %2
-}
-
-define void @t01(double* %a0) nounwind ssp {
-entry:
-; CHECK_O0: movsd (%eax), %xmm0
-; CHECK_O0: unpcklpd  %xmm0, %xmm0
-  %tmp93 = load double* %a0, align 8
-  %vecinit94 = insertelement <2 x double> undef, double %tmp93, i32 1
-  store <2 x double> %vecinit94, <2 x double>* undef
-  ret void
-}
-
-define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
-entry:
-; CHECK: t02
-; CHECK: movaps
-; CHECK: shufps
-; CHECK: pshufd
-; CHECK: movq
-; CHECK: ret
-  %0 = bitcast <8 x i32>* %source to <4 x i32>*
-  %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
-  %tmp2 = load <4 x i32>* %arrayidx, align 16
-  %tmp3 = extractelement <4 x i32> %tmp2, i32 0
-  %tmp5 = insertelement <2 x i32> <i32 undef, i32 0>, i32 %tmp3, i32 0
-  %arrayidx7 = getelementptr inbounds <8 x i32>* %source, i64 1
-  %1 = bitcast <8 x i32>* %arrayidx7 to <4 x i32>*
-  %tmp8 = load <4 x i32>* %1, align 16
-  %tmp9 = extractelement <4 x i32> %tmp8, i32 1
-  %tmp11 = insertelement <2 x i32> %tmp5, i32 %tmp9, i32 1
-  store <2 x i32> %tmp11, <2 x i32>* %dest, align 8
-  ret void
-}

diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll
deleted file mode 100644
index ec196df..0000000
--- a/test/CodeGen/X86/vec_shuffle-38.ll
+++ /dev/null

@@ -1,77 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
-
-define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp {
-; CHECK: unpcklpd
-  %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> zeroinitializer
-  ret <2 x double> %shuffle
-}
-
-define <2 x double> @hd(<2 x double> %p) nounwind optsize ssp {
-; CHECK: unpckhpd
-  %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-  ret <2 x double> %shuffle
-}
-
-define <2 x i64> @ldi(<2 x i64> %p) nounwind optsize ssp {
-; CHECK: punpcklqdq
-  %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> zeroinitializer
-  ret <2 x i64> %shuffle
-}
-
-define <2 x i64> @hdi(<2 x i64> %p) nounwind optsize ssp {
-; CHECK: punpckhqdq
-  %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-  ret <2 x i64> %shuffle
-}
-
-; rdar://10050549
-%struct.Float2 = type { float, float }
-
-define <4 x float> @loadhpi(%struct.Float2* %vPtr, <4 x float> %vecin1) nounwind readonly ssp {
-entry:
-; CHECK: loadhpi
-; CHECK-NOT: movq
-; CHECK: movhps (
-  %tmp1 = bitcast %struct.Float2* %vPtr to <1 x i64>*
-  %addptr7 = getelementptr inbounds <1 x i64>* %tmp1, i64 0
-  %tmp2 = bitcast <1 x i64>* %addptr7 to float*
-  %tmp3 = load float* %tmp2, align 4
-  %vec = insertelement <4 x float> undef, float %tmp3, i32 0
-  %addptr.i12 = getelementptr inbounds float* %tmp2, i64 1
-  %tmp4 = load float* %addptr.i12, align 4
-  %vecin2 = insertelement <4 x float> %vec, float %tmp4, i32 1
-  %shuffle = shufflevector <4 x float> %vecin1, <4 x float> %vecin2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  ret <4 x float> %shuffle
-}
-
-; rdar://10119696
-; CHECK: f
-define <4 x float> @f(<4 x float> %x, double* nocapture %y) nounwind readonly ssp {
-entry:
-  ; CHECK: movlps  (%{{rdi|rdx}}), %xmm0
-  %u110.i = load double* %y, align 1
-  %tmp8.i = insertelement <2 x double> undef, double %u110.i, i32 0
-  %tmp9.i = bitcast <2 x double> %tmp8.i to <4 x float>
-  %shuffle.i = shufflevector <4 x float> %x, <4 x float> %tmp9.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  ret <4 x float> %shuffle.i
-}
-
-define <4 x float> @loadhpi2(%struct.Float2* nocapture %vHiCoefPtr_0, %struct.Float2* nocapture %vLoCoefPtr_0, i32 %s) nounwind readonly ssp {
-entry:
-; CHECK: loadhpi2
-; CHECK: movhps (
-; CHECK-NOT: movlhps
-  %0 = bitcast %struct.Float2* %vHiCoefPtr_0 to <1 x i64>*
-  %idx.ext = sext i32 %s to i64
-  %add.ptr = getelementptr inbounds <1 x i64>* %0, i64 %idx.ext
-  %add.ptr.val = load <1 x i64>* %add.ptr, align 1
-  %1 = bitcast <1 x i64> %add.ptr.val to <2 x float>
-  %shuffle.i = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %2 = bitcast %struct.Float2* %vLoCoefPtr_0 to <1 x i64>*
-  %add.ptr2 = getelementptr inbounds <1 x i64>* %2, i64 %idx.ext
-  %add.ptr2.val = load <1 x i64>* %add.ptr2, align 1
-  %3 = bitcast <1 x i64> %add.ptr2.val to <2 x float>
-  %shuffle.i4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %shuffle1.i5 = shufflevector <4 x float> %shuffle.i, <4 x float> %shuffle.i4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  ret <4 x float> %shuffle1.i5
-}

diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
deleted file mode 100644
index 8fd9a5c..0000000
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ /dev/null

@@ -1,86 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s
-; rdar://10050222, rdar://10134392
-
-define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
-entry:
-; CHECK-LABEL: t1:
-; CHECK: movlps (%rdi), %xmm0
-; CHECK: ret
-  %p.val = load <1 x i64>* %p, align 1
-  %0 = bitcast <1 x i64> %p.val to <2 x float>
-  %shuffle.i = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %shuffle1.i = shufflevector <4 x float> %a, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  ret <4 x float> %shuffle1.i
-}
-
-define <4 x float> @t1a(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
-entry:
-; CHECK-LABEL: t1a:
-; CHECK: movlps (%rdi), %xmm0
-; CHECK: ret
-  %0 = bitcast <1 x i64>* %p to double*
-  %1 = load double* %0
-  %2 = insertelement <2 x double> undef, double %1, i32 0
-  %3 = bitcast <2 x double> %2 to <4 x float>
-  %4 = shufflevector <4 x float> %a, <4 x float> %3, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  ret <4 x float> %4
-}
-
-define void @t2(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
-entry:
-; CHECK-LABEL: t2:
-; CHECK: movlps %xmm0, (%rdi)
-; CHECK: ret
-  %cast.i = bitcast <4 x float> %a to <2 x i64>
-  %extract.i = extractelement <2 x i64> %cast.i, i32 0
-  %0 = getelementptr inbounds <1 x i64>* %p, i64 0, i64 0
-  store i64 %extract.i, i64* %0, align 8
-  ret void
-}
-
-define void @t2a(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
-entry:
-; CHECK-LABEL: t2a:
-; CHECK: movlps %xmm0, (%rdi)
-; CHECK: ret
-  %0 = bitcast <1 x i64>* %p to double*
-  %1 = bitcast <4 x float> %a to <2 x double>
-  %2 = extractelement <2 x double> %1, i32 0
-  store double %2, double* %0
-  ret void
-}
-
-; rdar://10436044
-define <2 x double> @t3() nounwind readonly {
-bb:
-; CHECK-LABEL: t3:
-; CHECK: movq (%rax), %xmm1
-; CHECK: punpcklqdq %xmm2, %xmm0
-; CHECK: movsd %xmm1, %xmm0
-  %tmp0 = load i128* null, align 1
-  %tmp1 = load <2 x i32>* undef, align 8
-  %tmp2 = bitcast i128 %tmp0 to <16 x i8>
-  %tmp3 = bitcast <2 x i32> %tmp1 to i64
-  %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
-  %tmp5 = bitcast <16 x i8> %tmp2 to <2 x double>
-  %tmp6 = bitcast <2 x i64> %tmp4 to <2 x double>
-  %tmp7 = shufflevector <2 x double> %tmp5, <2 x double> %tmp6, <2 x i32> <i32 2, i32 1>
-  ret <2 x double> %tmp7
-}
-
-; rdar://10450317
-define <2 x i64> @t4() nounwind readonly {
-bb:
-; CHECK-LABEL: t4:
-; CHECK: movq (%rax), %xmm0
-; CHECK: punpcklqdq %{{xmm.}}, %[[XMM:xmm[0-9]]]
-; CHECK: movsd %[[XMM]], %xmm0
-  %tmp0 = load i128* null, align 1
-  %tmp1 = load <2 x i32>* undef, align 8
-  %tmp2 = bitcast i128 %tmp0 to <16 x i8>
-  %tmp3 = bitcast <2 x i32> %tmp1 to i64
-  %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
-  %tmp5 = bitcast <16 x i8> %tmp2 to <2 x i64>
-  %tmp6 = shufflevector <2 x i64> %tmp4, <2 x i64> %tmp5, <2 x i32> <i32 2, i32 1>
-  ret <2 x i64> %tmp6
-}

diff --git a/test/CodeGen/X86/vec_shuffle-40.ll b/test/CodeGen/X86/vec_shuffle-40.ll
deleted file mode 100644
index 75b45e3..0000000
--- a/test/CodeGen/X86/vec_shuffle-40.ll
+++ /dev/null

@@ -1,22 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
-
-define void @shuffle_v16i16(<16 x i16>* %a) {
-; CHECK-LABEL: shuffle_v16i16:
-; CHECK: vpshufb {{.*}}%ymm
-; CHECK-NOT: vpshufb {{.*}}%xmm
-entry:
-  %0 = load <16 x i16>* %a, align 32
-  %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
-  store <16 x i16> %shuffle, <16 x i16>* %a, align 32
-  ret void
-}
-
-define void @shuffle_v16i16_lanecrossing(<16 x i16>* %a) {
-; CHECK-LABEL: shuffle_v16i16_lanecrossing:
-; CHECK-NOT: vpshufb {{.*}}%ymm
-entry:
-  %0 = load <16 x i16>* %a, align 32
-  %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 13, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
-  store <16 x i16> %shuffle, <16 x i16>* %a, align 32
-  ret void
-}

diff --git a/test/CodeGen/X86/vec_shuffle-41.ll b/test/CodeGen/X86/vec_shuffle-41.ll
deleted file mode 100644
index 28fdd2f..0000000
--- a/test/CodeGen/X86/vec_shuffle-41.ll
+++ /dev/null

@@ -1,21 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
-
-; Use buildFromShuffleMostly which allows this to be generated as two 128-bit
-; shuffles and an insert.
-
-; This is the (somewhat questionable) LLVM IR that is generated for:
-;    x8.s0123456 = x8.s1234567;  // x8 is a <8 x float> type
-;    x8.s7 = f;                  // f is float
-
-
-define <8 x float> @test1(<8 x float> %a, float %b) {
-; CHECK-LABEL: test1:
-; CHECK: vinsertps
-; CHECK-NOT: vinsertps
-entry:
-  %shift = shufflevector <8 x float> %a, <8 x float> undef, <7 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %extend = shufflevector <7 x float> %shift, <7 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
-  %insert = insertelement <8 x float> %extend, float %b, i32 7
-
-  ret <8 x float> %insert
-}

diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll
deleted file mode 100644
index 6599598..0000000
--- a/test/CodeGen/X86/vec_shuffle.ll
+++ /dev/null

@@ -1,50 +0,0 @@
-; RUN: llc < %s -mtriple=i686-linux -mcpu=core2 | FileCheck %s
-
-; CHECK: test_v4sf
-; CHECK: movq 8(%esp)
-; CHECK: pshufd $80
-define void @test_v4sf(<4 x float>* %P, float %X, float %Y) nounwind {
-	%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0		; <<4 x float>> [#uses=1]
-	%tmp2 = insertelement <4 x float> %tmp, float %X, i32 1		; <<4 x float>> [#uses=1]
-	%tmp4 = insertelement <4 x float> %tmp2, float %Y, i32 2		; <<4 x float>> [#uses=1]
-	%tmp6 = insertelement <4 x float> %tmp4, float %Y, i32 3		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp6, <4 x float>* %P
-	ret void
-}
-
-; CHECK: test_v2sd
-; CHECK: movups	8(%esp)
-; CHECK: movaps
-define void @test_v2sd(<2 x double>* %P, double %X, double %Y) nounwind {
-	%tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0		; <<2 x double>> [#uses=1]
-	%tmp2 = insertelement <2 x double> %tmp, double %Y, i32 1		; <<2 x double>> [#uses=1]
-	store <2 x double> %tmp2, <2 x double>* %P
-	ret void
-}
-
-; CHECK: test_v8i16
-; CHECK: pshufhw $-58
-; CHECK: movdqa
-define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) nounwind {
-	%tmp = load <2 x i64>* %A		; <<2 x i64>> [#uses=1]
-	%tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16>		; <<8 x i16>> [#uses=8]
-	%tmp.upgrd.2 = extractelement <8 x i16> %tmp.upgrd.1, i32 0		; <i16> [#uses=1]
-	%tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1		; <i16> [#uses=1]
-	%tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2		; <i16> [#uses=1]
-	%tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3		; <i16> [#uses=1]
-	%tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 6		; <i16> [#uses=1]
-	%tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5		; <i16> [#uses=1]
-	%tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 4		; <i16> [#uses=1]
-	%tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7		; <i16> [#uses=1]
-	%tmp8 = insertelement <8 x i16> undef, i16 %tmp.upgrd.2, i32 0		; <<8 x i16>> [#uses=1]
-	%tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1		; <<8 x i16>> [#uses=1]
-	%tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp2, i32 2		; <<8 x i16>> [#uses=1]
-	%tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3		; <<8 x i16>> [#uses=1]
-	%tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 4		; <<8 x i16>> [#uses=1]
-	%tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5		; <<8 x i16>> [#uses=1]
-	%tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 6		; <<8 x i16>> [#uses=1]
-	%tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7		; <<8 x i16>> [#uses=1]
-	%tmp15.upgrd.3 = bitcast <8 x i16> %tmp15 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	store <2 x i64> %tmp15.upgrd.3, <2 x i64>* %res
-	ret void
-}

diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll
deleted file mode 100644
index 9d82f97..0000000
--- a/test/CodeGen/X86/vec_splat-2.ll
+++ /dev/null

@@ -1,33 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s
-
-define void @test(<2 x i64>* %P, i8 %x) nounwind {
-	%tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0		; <<16 x i8>> [#uses=1]
-	%tmp36 = insertelement <16 x i8> %tmp, i8 %x, i32 1		; <<16 x i8>> [#uses=1]
-	%tmp38 = insertelement <16 x i8> %tmp36, i8 %x, i32 2		; <<16 x i8>> [#uses=1]
-	%tmp40 = insertelement <16 x i8> %tmp38, i8 %x, i32 3		; <<16 x i8>> [#uses=1]
-	%tmp42 = insertelement <16 x i8> %tmp40, i8 %x, i32 4		; <<16 x i8>> [#uses=1]
-	%tmp44 = insertelement <16 x i8> %tmp42, i8 %x, i32 5		; <<16 x i8>> [#uses=1]
-	%tmp46 = insertelement <16 x i8> %tmp44, i8 %x, i32 6		; <<16 x i8>> [#uses=1]
-	%tmp48 = insertelement <16 x i8> %tmp46, i8 %x, i32 7		; <<16 x i8>> [#uses=1]
-	%tmp50 = insertelement <16 x i8> %tmp48, i8 %x, i32 8		; <<16 x i8>> [#uses=1]
-	%tmp52 = insertelement <16 x i8> %tmp50, i8 %x, i32 9		; <<16 x i8>> [#uses=1]
-	%tmp54 = insertelement <16 x i8> %tmp52, i8 %x, i32 10		; <<16 x i8>> [#uses=1]
-	%tmp56 = insertelement <16 x i8> %tmp54, i8 %x, i32 11		; <<16 x i8>> [#uses=1]
-	%tmp58 = insertelement <16 x i8> %tmp56, i8 %x, i32 12		; <<16 x i8>> [#uses=1]
-	%tmp60 = insertelement <16 x i8> %tmp58, i8 %x, i32 13		; <<16 x i8>> [#uses=1]
-	%tmp62 = insertelement <16 x i8> %tmp60, i8 %x, i32 14		; <<16 x i8>> [#uses=1]
-	%tmp64 = insertelement <16 x i8> %tmp62, i8 %x, i32 15		; <<16 x i8>> [#uses=1]
-	%tmp68 = load <2 x i64>* %P		; <<2 x i64>> [#uses=1]
-	%tmp71 = bitcast <2 x i64> %tmp68 to <16 x i8>		; <<16 x i8>> [#uses=1]
-	%tmp73 = add <16 x i8> %tmp71, %tmp64		; <<16 x i8>> [#uses=1]
-	%tmp73.upgrd.1 = bitcast <16 x i8> %tmp73 to <2 x i64>		; <<2 x i64>> [#uses=1]
-	store <2 x i64> %tmp73.upgrd.1, <2 x i64>* %P
-	ret void
-
-; CHECK-LABEL: test:
-; CHECK-NOT: pshufd
-; CHECK: punpcklbw
-; CHECK: punpcklbw
-; CHECK: pshufd $0
-; CHECK-NOT: pshufd
-}

diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll
deleted file mode 100644
index 754cbf4..0000000
--- a/test/CodeGen/X86/vec_splat-3.ll
+++ /dev/null

@@ -1,230 +0,0 @@
-; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-; Splat test for v8i16
-define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_0:
-; CHECK: pshuflw $0
-}
-
-define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_1:
-; CHECK: pshuflw $5
-}
-
-define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_2:
-; CHECK: punpcklwd
-; CHECK-NEXT: pshufd $-86
-}
-
-define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_3:
-; CHECK: pshuflw $15
-}
-
-define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_4:
-; CHECK: movhlps
-}
-
-define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_5:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $85
-}
-
-define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 6, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_6:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $-86
-}
-
-define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-	%tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-	ret <8 x i16> %tmp6
-
-; CHECK-LABEL: shuf_8i16_7:
-; CHECK: punpckhwd
-; CHECK-NEXT: pshufd $-1
-}
-
-; Splat test for v16i8
-define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_8:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $0
-}
-
-define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_9:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_10:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 3, i32 undef, i32 undef, i32 3, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_11:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-1
-}
-
-
-define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_12:
-; CHECK: pshufd $5
-}
-
-define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_13:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 6, i32 undef, i32 undef, i32 6, i32 undef, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_14:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef >
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_15:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-1
-}
-
-define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 8, i32 undef, i32 undef, i32 8, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_16:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $0
-}
-
-define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 9, i32 undef, i32 undef, i32 9, i32 undef, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_17:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 10, i32 undef, i32 undef, i32 10, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_18:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 11, i32 undef, i32 undef, i32 11, i32 undef, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_19:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-1
-}
-
-define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 12, i32 undef, i32 undef, i32 12, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_20:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $0
-}
-
-define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 13, i32 undef, i32 undef, i32 13, i32 undef, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_21:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $85
-}
-
-define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 14, i32 undef, i32 undef, i32 14, i32 undef, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_22:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-86
-}
-
-define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-	%tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 15, i32 undef, i32 undef, i32 15, i32 undef, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-	ret <16 x i8> %tmp6
-
-; CHECK-LABEL: shuf_16i8_23:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-1
-}

diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll
deleted file mode 100644
index 28f2a90..0000000
--- a/test/CodeGen/X86/vec_splat.ll
+++ /dev/null

@@ -1,68 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX
-
-define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
-	%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0		; <<4 x float>> [#uses=1]
-	%tmp2 = insertelement <4 x float> %tmp, float %X, i32 1		; <<4 x float>> [#uses=1]
-	%tmp4 = insertelement <4 x float> %tmp2, float %X, i32 2		; <<4 x float>> [#uses=1]
-	%tmp6 = insertelement <4 x float> %tmp4, float %X, i32 3		; <<4 x float>> [#uses=1]
-	%tmp8 = load <4 x float>* %Q		; <<4 x float>> [#uses=1]
-	%tmp10 = fmul <4 x float> %tmp8, %tmp6		; <<4 x float>> [#uses=1]
-	store <4 x float> %tmp10, <4 x float>* %P
-	ret void
-
-; SSE2-LABEL: test_v4sf:
-; SSE2: pshufd $0
-
-; SSE3-LABEL: test_v4sf:
-; SSE3: pshufd $0
-}
-
-define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
-	%tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0		; <<2 x double>> [#uses=1]
-	%tmp2 = insertelement <2 x double> %tmp, double %X, i32 1		; <<2 x double>> [#uses=1]
-	%tmp4 = load <2 x double>* %Q		; <<2 x double>> [#uses=1]
-	%tmp6 = fmul <2 x double> %tmp4, %tmp2		; <<2 x double>> [#uses=1]
-	store <2 x double> %tmp6, <2 x double>* %P
-	ret void
-
-; SSE2-LABEL: test_v2sd:
-; SSE2: shufpd $0
-
-; SSE3-LABEL: test_v2sd:
-; SSE3: movddup
-}
-
-; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
-define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
-  %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
-  %2 = load <4 x float>* %1, align 16
-  %3 = trunc i64 %j to i32
-  %4 = extractelement <4 x float> %2, i32 %3
-  %5 = insertelement <4 x float> undef, float %4, i32 0
-  %6 = insertelement <4 x float> %5, float %4, i32 1
-  %7 = insertelement <4 x float> %6, float %4, i32 2
-  %8 = insertelement <4 x float> %7, float %4, i32 3
-  ret <4 x float> %8
-  
-; AVX-LABEL: load_extract_splat
-; AVX-NOT: rsp
-; AVX: vbroadcastss
-}
-
-; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
-define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
-  %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
-  %2 = load <4 x float>* %1, align 16
-  %3 = extractelement <4 x float> %2, i64 %j
-  %4 = insertelement <4 x float> undef, float %3, i32 0
-  %5 = insertelement <4 x float> %4, float %3, i32 1
-  %6 = insertelement <4 x float> %5, float %3, i32 2
-  %7 = insertelement <4 x float> %6, float %3, i32 3
-  ret <4 x float> %7
-  
-; AVX-LABEL: load_extract_splat1
-; AVX-NOT: movs
-; AVX: vbroadcastss
-}

diff --git a/test/CodeGen/X86/vec_trunc_sext.ll b/test/CodeGen/X86/vec_trunc_sext.ll
new file mode 100644
index 0000000..3c446bb
--- /dev/null
+++ b/test/CodeGen/X86/vec_trunc_sext.ll

@@ -0,0 +1,30 @@
+; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='-sse4.1' -o - | FileCheck %s -check-prefix=NO_SSE_41
+; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='+sse4.1' -o - | FileCheck %s -check-prefix=SSE_41
+
+; PR20472 ( http://llvm.org/bugs/show_bug.cgi?id=20472 )
+; When sign-extending a truncated vector value, we can't eliminate the zext.
+; If we don't have SSE4.1, use punpck.
+; If we have SSE4.1, use pmovzx because it combines the load op.
+; There may be a better way to do this using pshufb + pmovsx,
+; but that is beyond our current codegen capabilities.
+
+define <4 x i32> @trunc_sext(<4 x i16>* %in) {
+  %load = load <4 x i16>* %in
+  %trunc = trunc <4 x i16> %load to <4 x i8>
+  %sext = sext <4 x i8> %trunc to <4 x i32>
+  ret <4 x i32> %sext
+
+; NO_SSE_41-LABEL: trunc_sext:
+; NO_SSE_41: movq (%rdi), %xmm0
+; NO_SSE_41-NEXT: punpcklwd %xmm0, %xmm0
+; NO_SSE_41-NEXT: pslld $24, %xmm0
+; NO_SSE_41-NEXT: psrad $24, %xmm0
+; NO_SSE_41-NEXT: retq
+
+; SSE_41-LABEL: trunc_sext:
+; SSE_41: pmovzxwd (%rdi), %xmm0
+; SSE_41-NEXT: pslld $24, %xmm0
+; SSE_41-NEXT: psrad $24, %xmm0
+; SSE_41-NEXT: retq
+}
+

diff --git a/test/CodeGen/X86/vec_uint_to_fp.ll b/test/CodeGen/X86/vec_uint_to_fp.ll
index ee20f1f..46cfcd9 100644
--- a/test/CodeGen/X86/vec_uint_to_fp.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp.ll

@@ -1,11 +1,167 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck --check-prefix=CHECK --check-prefix=SSE41 --check-prefix=CST  %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 | FileCheck --check-prefix=CHECK --check-prefix=AVX2 %s
 
-; Test that we are not lowering uinttofp to scalars
+; Check that the constants used in the vectors are the right ones.
+; SSE: [[MASKCSTADDR:LCPI0_[0-9]+]]:
+; SSE-NEXT: .long	65535                   ## 0xffff
+; SSE-NEXT: .long	65535                   ## 0xffff
+; SSE-NEXT: .long	65535                   ## 0xffff
+; SSE-NEXT: .long	65535                   ## 0xffff
+
+; CST: [[LOWCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long	1258291200              ## 0x4b000000
+; CST-NEXT: .long	1258291200              ## 0x4b000000
+; CST-NEXT: .long	1258291200              ## 0x4b000000
+; CST-NEXT: .long	1258291200              ## 0x4b000000
+
+; CST: [[HIGHCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long	1392508928              ## 0x53000000
+; CST-NEXT: .long	1392508928              ## 0x53000000
+; CST-NEXT: .long	1392508928              ## 0x53000000
+; CST-NEXT: .long	1392508928              ## 0x53000000
+
+; CST: [[MAGICCSTADDR:LCPI0_[0-9]+]]:
+; CST-NEXT: .long	3539992704              ## float -5.497642e+11
+; CST-NEXT: .long	3539992704              ## float -5.497642e+11
+; CST-NEXT: .long	3539992704              ## float -5.497642e+11
+; CST-NEXT: .long	3539992704              ## float -5.497642e+11
+
+; AVX2: [[LOWCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long	1258291200              ## 0x4b000000
+
+; AVX2: [[HIGHCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long	1392508928              ## 0x53000000
+
+; AVX2: [[MAGICCSTADDR:LCPI0_[0-9]+]]:
+; AVX2-NEXT: .long	3539992704              ## float -5.49764202E+11
+
 define <4 x float> @test1(<4 x i32> %A) nounwind {
 ; CHECK-LABEL: test1:
-; CHECK-NOT: cvtsd2ss
-; CHECK: ret
+;
+; SSE: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]]
+; SSE-NEXT: pand %xmm0, [[MASK]]
+; After this instruction, MASK will have the value of the low parts
+; of the vector.
+; SSE-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]]
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0
+; SSE-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0
+; SSE-NEXT: addps [[MASK]], %xmm0
+; SSE-NEXT: retq
+;
+; Currently we commute the arguments of the first blend, but this could be
+; improved to match the lowering of the second blend.
+; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]]
+; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]]
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0
+; SSE41-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0
+; SSE41-NEXT: addps [[LOWVEC]], %xmm0
+; SSE41-NEXT: retq
+;
+; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]]
+; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
+; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
+; AVX-NEXT: vaddps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
+; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
+; AVX-NEXT: retq
+;
+; The lowering for AVX2 is a bit messy, because we select broadcast
+; instructions, instead of folding the constant loads.
+; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]]
+; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]]
+; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]]
+; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0
+; AVX2-NEXT: retq
   %C = uitofp <4 x i32> %A to <4 x float>
   ret <4 x float> %C
 }
 
+; Match the AVX2 constants used in the next function
+; AVX2: [[LOWCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long	1258291200              ## 0x4b000000
+
+; AVX2: [[HIGHCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long	1392508928              ## 0x53000000
+
+; AVX2: [[MAGICCSTADDR:LCPI1_[0-9]+]]:
+; AVX2-NEXT: .long	3539992704              ## float -5.49764202E+11
+
+define <8 x float> @test2(<8 x i32> %A) nounwind {
+; CHECK-LABEL: test2:
+; Legalization will break the vector into 2 x <4 x i32> on anything prior to AVX.
+; The constants used in the vector instructions are shared between the
+; two sequences of instructions.
+;
+; SSE: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
+; SSE-NEXT: pand %[[MASK]], [[VECLOW]]
+; SSE-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
+; SSE-NEXT: por %[[LOWCST]], [[VECLOW]]
+; SSE-NEXT: psrld $16, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
+; SSE-NEXT: por %[[HIGHCST]], %xmm0
+; SSE-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE-NEXT: addps %[[MAGICCST]], %xmm0
+; SSE-NEXT: addps [[VECLOW]], %xmm0
+; MASK is the low vector of the second part after this point.
+; SSE-NEXT: pand %xmm1, %[[MASK]]
+; SSE-NEXT: por %[[LOWCST]], %[[MASK]]
+; SSE-NEXT: psrld $16, %xmm1
+; SSE-NEXT: por %[[HIGHCST]], %xmm1
+; SSE-NEXT: addps %[[MAGICCST]], %xmm1
+; SSE-NEXT: addps %[[MASK]], %xmm1
+; SSE-NEXT: retq
+;
+; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200]
+; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]]
+; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]]
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928]
+; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0
+; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
+; SSE41-NEXT: addps %[[MAGICCST]], %xmm0
+; SSE41-NEXT: addps [[VECLOW]], %xmm0
+; LOWCST is the low vector of the second part after this point.
+; The operands of the blend are inverted because we reuse xmm1
+; in the next shift.
+; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]]
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1
+; SSE41-NEXT: addps %[[MAGICCST]], %xmm1
+; SSE41-NEXT: addps %[[LOWCST]], %xmm1
+; SSE41-NEXT: retq
+;
+; Test that we are not lowering uinttofp to scalars
+; AVX-NOT: cvtsd2ss
+; AVX: retq
+;
+; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]]
+; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]]
+; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]]
+; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]]
+; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0
+; AVX2-NEXT: retq
+  %C = uitofp <8 x i32> %A to <8 x float>
+  ret <8 x float> %C
+}
+
+define <4 x double> @test3(<4 x i32> %arg) {
+; CHECK-LABEL: test3:
+; This test used to crash because we were custom lowering it as if it was
+; a conversion between <4 x i32> and <4 x float>.
+; AVX: vcvtdq2pd
+; AVX2: vcvtdq2pd
+; CHECK: retq
+  %tmp = uitofp <4 x i32> %arg to <4 x double>
+  ret <4 x double> %tmp
+}

diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll
new file mode 100644
index 0000000..827d418
--- /dev/null
+++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll

@@ -0,0 +1,23 @@
+; RUN: llc < %s -enable-unsafe-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
+
+; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math.
+
+; Subtracting zero is free.
+define <4 x float> @vec_fsub_zero(<4 x float> %x) {
+; CHECK-LABEL: vec_fsub_zero:
+; CHECK-NOT: subps
+; CHECK-NOT: xorps
+; CHECK: retq
+  %sub = fsub <4 x float> %x, zeroinitializer
+  ret <4 x float> %sub
+}
+
+; Negating doesn't require subtraction.
+define <4 x float> @vec_fneg(<4 x float> %x) {
+; CHECK-LABEL: vec_fneg:
+; CHECK: xorps  {{.*}}LCP{{.*}}, %xmm0
+; CHECK-NOT: subps
+; CHECK-NEXT: retq
+  %sub = fsub <4 x float> zeroinitializer, %x
+  ret <4 x float> %sub
+}

diff --git a/test/CodeGen/X86/vec_zext.ll b/test/CodeGen/X86/vec_zext.ll
deleted file mode 100644
index 615a50b..0000000
--- a/test/CodeGen/X86/vec_zext.ll
+++ /dev/null

@@ -1,69 +0,0 @@
-; RUN: llc < %s -march=x86-64
-; PR 9267
-
-define<4 x i32> @func_16_32() {
-  %F = load <4 x i16>* undef
-  %G = zext <4 x i16> %F to <4 x i32>
-  %H = load <4 x i16>* undef
-  %Y = zext <4 x i16> %H to <4 x i32>
-  %T = add <4 x i32> %Y, %G
-  store <4 x i32>%T , <4 x i32>* undef
-  ret <4 x i32> %T
-}
-
-define<4 x i64> @func_16_64() {
-  %F = load <4 x i16>* undef
-  %G = zext <4 x i16> %F to <4 x i64>
-  %H = load <4 x i16>* undef
-  %Y = zext <4 x i16> %H to <4 x i64>
-  %T = xor <4 x i64> %Y, %G
-  store <4 x i64>%T , <4 x i64>* undef
-  ret <4 x i64> %T
-}
-
-define<4 x i64> @func_32_64() {
-  %F = load <4 x i32>* undef
-  %G = zext <4 x i32> %F to <4 x i64>
-  %H = load <4 x i32>* undef
-  %Y = zext <4 x i32> %H to <4 x i64>
-  %T = or <4 x i64> %Y, %G
-  ret <4 x i64> %T
-}
-
-define<4 x i16> @func_8_16() {
-  %F = load <4 x i8>* undef
-  %G = zext <4 x i8> %F to <4 x i16>
-  %H = load <4 x i8>* undef
-  %Y = zext <4 x i8> %H to <4 x i16>
-  %T = add <4 x i16> %Y, %G
-  ret <4 x i16> %T
-}
-
-define<4 x i32> @func_8_32() {
-  %F = load <4 x i8>* undef
-  %G = zext <4 x i8> %F to <4 x i32>
-  %H = load <4 x i8>* undef
-  %Y = zext <4 x i8> %H to <4 x i32>
-  %T = sub <4 x i32> %Y, %G
-  ret <4 x i32> %T
-}
-
-define<4 x i64> @func_8_64() {
-  %F = load <4 x i8>* undef
-  %G = zext <4 x i8> %F to <4 x i64>
-  %H = load <4 x i8>* undef
-  %Y = zext <4 x i8> %H to <4 x i64>
-  %T = add <4 x i64> %Y, %G
-  ret <4 x i64> %T
-}
-
-define<4 x i32> @const_16_32() {
-  %G = zext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32>
-  ret <4 x i32> %G
-}
-
-define<4 x i64> @const_16_64() {
-  %G = zext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64>
-  ret <4 x i64> %G
-}
-

diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
new file mode 100644
index 0000000..0a3ed7e
--- /dev/null
+++ b/test/CodeGen/X86/vector-blend.ll

@@ -0,0 +1,708 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; AVX128 tests:
+
+define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
+; SSE2-LABEL: vsel_float:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_float:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_float:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vsel_float:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
+  ret <4 x float> %vsel
+}
+
+define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
+; SSE-LABEL: vsel_float2:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movss %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_float2:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
+  ret <4 x float> %vsel
+}
+
+define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
+; SSE2-LABEL: vsel_4xi8:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_4xi8:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_4xi8:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_4xi8:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_4xi8:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
+  ret <4 x i8> %vsel
+}
+
+define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
+; SSE2-LABEL: vsel_4xi16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_4xi16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_4xi16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_4xi16:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_4xi16:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
+  ret <4 x i16> %vsel
+}
+
+define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: vsel_i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+  ret <4 x i32> %vsel
+}
+
+define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
+; SSE-LABEL: vsel_double:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_double:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
+  ret <2 x double> %vsel
+}
+
+define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
+; SSE-LABEL: vsel_i64:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_i64:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
+  ret <2 x i64> %vsel
+}
+
+define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
+; SSE2-LABEL: vsel_8xi16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_8xi16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_8xi16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vsel_8xi16:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
+  ret <8 x i16> %vsel
+}
+
+define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
+; SSE2-LABEL: vsel_i8:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_i8:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm1
+; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i8:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vsel_i8:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
+  ret <16 x i8> %vsel
+}
+
+
+; AVX256 tests:
+
+define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
+; SSE-LABEL: vsel_float8:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movss %xmm0, %xmm2
+; SSE-NEXT:    movss %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_float8:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
+  ret <8 x float> %vsel
+}
+
+define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
+; SSE-LABEL: vsel_i328:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movss %xmm0, %xmm2
+; SSE-NEXT:    movss %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: vsel_i328:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_i328:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
+  ret <8 x i32> %vsel
+}
+
+define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
+; SSE2-LABEL: vsel_double8:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm0, %xmm4
+; SSE2-NEXT:    movsd %xmm2, %xmm6
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm5, %xmm1
+; SSE2-NEXT:    movaps %xmm6, %xmm2
+; SSE2-NEXT:    movaps %xmm7, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_double8:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm0, %xmm4
+; SSSE3-NEXT:    movsd %xmm2, %xmm6
+; SSSE3-NEXT:    movaps %xmm4, %xmm0
+; SSSE3-NEXT:    movaps %xmm5, %xmm1
+; SSSE3-NEXT:    movaps %xmm6, %xmm2
+; SSSE3-NEXT:    movaps %xmm7, %xmm3
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_double8:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
+; SSE41-NEXT:    blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm7, %xmm3
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vsel_double8:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
+; AVX-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
+  ret <8 x double> %vsel
+}
+
+define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
+; SSE2-LABEL: vsel_i648:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm0, %xmm4
+; SSE2-NEXT:    movsd %xmm2, %xmm6
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm5, %xmm1
+; SSE2-NEXT:    movaps %xmm6, %xmm2
+; SSE2-NEXT:    movaps %xmm7, %xmm3
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: vsel_i648:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm0, %xmm4
+; SSSE3-NEXT:    movsd %xmm2, %xmm6
+; SSSE3-NEXT:    movaps %xmm4, %xmm0
+; SSSE3-NEXT:    movaps %xmm5, %xmm1
+; SSSE3-NEXT:    movaps %xmm6, %xmm2
+; SSSE3-NEXT:    movaps %xmm7, %xmm3
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: vsel_i648:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm7, %xmm3
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: vsel_i648:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: vsel_i648:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT:    retq
+entry:
+  %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
+  ret <8 x i64> %vsel
+}
+
+define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
+; SSE-LABEL: vsel_double4:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movsd %xmm0, %xmm2
+; SSE-NEXT:    movsd %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: vsel_double4:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; AVX-NEXT:    retq
+entry:
+  %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
+  ret <4 x double> %vsel
+}
+
+define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
+; SSE2-LABEL: testa:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    cmplepd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm2, %xmm0
+; SSE2-NEXT:    andnpd %xmm1, %xmm2
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: testa:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movapd %xmm1, %xmm2
+; SSSE3-NEXT:    cmplepd %xmm0, %xmm2
+; SSSE3-NEXT:    andpd %xmm2, %xmm0
+; SSSE3-NEXT:    andnpd %xmm1, %xmm2
+; SSSE3-NEXT:    orpd %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: testa:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    cmplepd %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: testa:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vcmplepd %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %max_is_x = fcmp oge <2 x double> %x, %y
+  %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %max
+}
+
+define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
+; SSE2-LABEL: testb:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    cmpnlepd %xmm0, %xmm2
+; SSE2-NEXT:    andpd %xmm2, %xmm0
+; SSE2-NEXT:    andnpd %xmm1, %xmm2
+; SSE2-NEXT:    orpd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: testb:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movapd %xmm1, %xmm2
+; SSSE3-NEXT:    cmpnlepd %xmm0, %xmm2
+; SSSE3-NEXT:    andpd %xmm2, %xmm0
+; SSSE3-NEXT:    andnpd %xmm1, %xmm2
+; SSSE3-NEXT:    orpd %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: testb:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movapd %xmm0, %xmm2
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    cmpnlepd %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: testb:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vcmpnlepd %xmm0, %xmm1, %xmm2
+; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %min_is_x = fcmp ult <2 x double> %x, %y
+  %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %min
+}
+
+; If we can figure out a blend has a constant mask, we should emit the
+; blend instruction with an immediate mask
+define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
+; SSE-LABEL: constant_blendvpd_avx:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movsd %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: constant_blendvpd_avx:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX-NEXT:    retq
+entry:
+  %select = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
+  ret <4 x double> %select
+}
+
+define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
+; SSE2-LABEL: constant_blendvps_avx:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0]
+; SSE2-NEXT:    andps %xmm4, %xmm2
+; SSE2-NEXT:    movaps {{.*#+}} xmm5 = [0,0,0,4294967295]
+; SSE2-NEXT:    andps %xmm5, %xmm0
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm4, %xmm3
+; SSE2-NEXT:    andps %xmm5, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: constant_blendvps_avx:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0]
+; SSSE3-NEXT:    andps %xmm4, %xmm2
+; SSSE3-NEXT:    movaps {{.*#+}} xmm5 = [0,0,0,4294967295]
+; SSSE3-NEXT:    andps %xmm5, %xmm0
+; SSSE3-NEXT:    orps %xmm2, %xmm0
+; SSSE3-NEXT:    andps %xmm4, %xmm3
+; SSSE3-NEXT:    andps %xmm5, %xmm1
+; SSSE3-NEXT:    orps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: constant_blendvps_avx:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: constant_blendvps_avx:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
+; AVX-NEXT:    retq
+entry:
+  %select = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
+  ret <8 x float> %select
+}
+
+define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
+; SSE2-LABEL: constant_pblendvb_avx2:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE2-NEXT:    andps %xmm4, %xmm2
+; SSE2-NEXT:    movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE2-NEXT:    andps %xmm5, %xmm0
+; SSE2-NEXT:    orps %xmm2, %xmm0
+; SSE2-NEXT:    andps %xmm4, %xmm3
+; SSE2-NEXT:    andps %xmm5, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: constant_pblendvb_avx2:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSSE3-NEXT:    andps %xmm4, %xmm2
+; SSSE3-NEXT:    movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSSE3-NEXT:    andps %xmm5, %xmm0
+; SSSE3-NEXT:    orps %xmm2, %xmm0
+; SSSE3-NEXT:    andps %xmm4, %xmm3
+; SSSE3-NEXT:    andps %xmm5, %xmm1
+; SSSE3-NEXT:    orps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: constant_pblendvb_avx2:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm1, %xmm3
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: constant_pblendvb_avx2:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_pblendvb_avx2:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %select = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
+  ret <32 x i8> %select
+}
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+;; 4 tests for shufflevectors that optimize to blend + immediate
+define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: blend_shufflevector_4xfloat:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xfloat:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: blend_shufflevector_4xfloat:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: blend_shufflevector_4xfloat:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX-NEXT:    retq
+entry:
+  %select = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %select
+}
+
+define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: blend_shufflevector_8xfloat:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movss %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: blend_shufflevector_8xfloat:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movss %xmm0, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: blend_shufflevector_8xfloat:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: blend_shufflevector_8xfloat:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7]
+; AVX-NEXT:    retq
+entry:
+  %select = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15>
+  ret <8 x float> %select
+}
+
+define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: blend_shufflevector_4xdouble:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xdouble:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm0, %xmm2
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: blend_shufflevector_4xdouble:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: blend_shufflevector_4xdouble:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX-NEXT:    retq
+entry:
+  %select = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x double> %select
+}
+
+define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: blend_shufflevector_4xi64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsd %xmm2, %xmm0
+; SSE2-NEXT:    movaps %xmm3, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xi64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsd %xmm2, %xmm0
+; SSSE3-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: blend_shufflevector_4xi64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    movaps %xmm3, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: blend_shufflevector_4xi64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: blend_shufflevector_4xi64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+entry:
+  %select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  ret <4 x i64> %select
+}

diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index b6d43e9..4b269dc 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll

@@ -1,221 +1,1255 @@
-; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41
-; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
-; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
+; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE41
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX
+
+target triple = "x86_64-unknown-unknown"
 
 define <4 x i32> @test1(<4 x i32> %a) {
+; SSE41-LABEL: test1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pmuludq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm1, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    psubd %xmm2, %xmm0
+; SSE41-NEXT:    psrld $1, %xmm0
+; SSE41-NEXT:    paddd %xmm2, %xmm0
+; SSE41-NEXT:    psrld $2, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test1:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm1, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    psubd %xmm2, %xmm0
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    psrld $2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
   ret <4 x i32> %div
-
-; SSE41-LABEL: test1:
-; SSE41: pmuludq
-; SSE41: pshufd	$49
-; SSE41: pmuludq
-; SSE41: shufps	$-35
-; SSE41: psubd
-; SSE41: psrld $1
-; SSE41: padd
-; SSE41: psrld $2
-
-; AVX-LABEL: test1:
-; AVX: vpmuludq
-; AVX: vpshufd	$49
-; AVX: vpmuludq
-; AVX: vshufps	$-35
-; AVX: vpsubd
-; AVX: vpsrld $1
-; AVX: vpadd
-; AVX: vpsrld $2
 }
 
 define <8 x i32> @test2(<8 x i32> %a) {
+; SSE41-LABEL: test2:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT:    psubd %xmm3, %xmm0
+; SSE41-NEXT:    psrld $1, %xmm0
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    psrld $2, %xmm0
+; SSE41-NEXT:    pmuludq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm4, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    psubd %xmm2, %xmm1
+; SSE41-NEXT:    psrld $1, %xmm1
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    psrld $2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test2:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pmuludq %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT:    psubd %xmm3, %xmm0
+; SSE-NEXT:    psrld $1, %xmm0
+; SSE-NEXT:    paddd %xmm3, %xmm0
+; SSE-NEXT:    psrld $2, %xmm0
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    psubd %xmm2, %xmm1
+; SSE-NEXT:    psrld $1, %xmm1
+; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    psrld $2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsrld $1, %ymm0, %ymm0
+; AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsrld $2, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %div
-
-; AVX-LABEL: test2:
-; AVX: vpbroadcastd
-; AVX: vpalignr $4
-; AVX: vpmuludq
-; AVX: vpmuludq
-; AVX: vpblendd $170
-; AVX: vpsubd
-; AVX: vpsrld $1
-; AVX: vpadd
-; AVX: vpsrld $2
 }
 
 define <8 x i16> @test3(<8 x i16> %a) {
+; SSE41-LABEL: test3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE41-NEXT:    psubw %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $1, %xmm0
+; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    psrlw $2, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT:    pmulhuw %xmm0, %xmm1
+; SSE-NEXT:    psubw %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    psrlw $2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   ret <8 x i16> %div
-
-; SSE41-LABEL: test3:
-; SSE41: pmulhuw
-; SSE41: psubw
-; SSE41: psrlw $1
-; SSE41: paddw
-; SSE41: psrlw $2
-
-; AVX-LABEL: test3:
-; AVX: vpmulhuw
-; AVX: vpsubw
-; AVX: vpsrlw $1
-; AVX: vpaddw
-; AVX: vpsrlw $2
 }
 
 define <16 x i16> @test4(<16 x i16> %a) {
+; SSE41-LABEL: test4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pmulhuw %xmm2, %xmm3
+; SSE41-NEXT:    psubw %xmm3, %xmm0
+; SSE41-NEXT:    psrlw $1, %xmm0
+; SSE41-NEXT:    paddw %xmm3, %xmm0
+; SSE41-NEXT:    psrlw $2, %xmm0
+; SSE41-NEXT:    pmulhuw %xmm1, %xmm2
+; SSE41-NEXT:    psubw %xmm2, %xmm1
+; SSE41-NEXT:    psrlw $1, %xmm1
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    psrlw $2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pmulhuw %xmm2, %xmm3
+; SSE-NEXT:    psubw %xmm3, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    paddw %xmm3, %xmm0
+; SSE-NEXT:    psrlw $2, %xmm0
+; SSE-NEXT:    pmulhuw %xmm1, %xmm2
+; SSE-NEXT:    psubw %xmm2, %xmm1
+; SSE-NEXT:    psrlw $1, %xmm1
+; SSE-NEXT:    paddw %xmm2, %xmm1
+; SSE-NEXT:    psrlw $2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
+; AVX-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsrlw $2, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
   ret <16 x i16> %div
-
-; AVX-LABEL: test4:
-; AVX: vpmulhuw
-; AVX: vpsubw
-; AVX: vpsrlw $1
-; AVX: vpaddw
-; AVX: vpsrlw $2
-; AVX-NOT: vpmulhuw
 }
 
 define <8 x i16> @test5(<8 x i16> %a) {
+; SSE41-LABEL: test5:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmulhw {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $15, %xmm1
+; SSE41-NEXT:    psraw $1, %xmm0
+; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test5:
+; SSE:       # BB#0:
+; SSE-NEXT:    pmulhw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $15, %xmm1
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
+; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   ret <8 x i16> %div
-
-; SSE41-LABEL: test5:
-; SSE41: pmulhw
-; SSE41: psrlw $15
-; SSE41: psraw $1
-; SSE41: paddw
-
-; AVX-LABEL: test5:
-; AVX: vpmulhw
-; AVX: vpsrlw $15
-; AVX: vpsraw $1
-; AVX: vpaddw
 }
 
 define <16 x i16> @test6(<16 x i16> %a) {
+; SSE41-LABEL: test6:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; SSE41-NEXT:    pmulhw %xmm2, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrlw $15, %xmm3
+; SSE41-NEXT:    psraw $1, %xmm0
+; SSE41-NEXT:    paddw %xmm3, %xmm0
+; SSE41-NEXT:    pmulhw %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlw $15, %xmm2
+; SSE41-NEXT:    psraw $1, %xmm1
+; SSE41-NEXT:    paddw %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test6:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
+; SSE-NEXT:    pmulhw %xmm2, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrlw $15, %xmm3
+; SSE-NEXT:    psraw $1, %xmm0
+; SSE-NEXT:    paddw %xmm3, %xmm0
+; SSE-NEXT:    pmulhw %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrlw $15, %xmm2
+; SSE-NEXT:    psraw $1, %xmm1
+; SSE-NEXT:    paddw %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmulhw {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT:    vpsrlw $15, %ymm0, %ymm1
+; AVX-NEXT:    vpsraw $1, %ymm0, %ymm0
+; AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
   ret <16 x i16> %div
-
-; AVX-LABEL: test6:
-; AVX: vpmulhw
-; AVX: vpsrlw $15
-; AVX: vpsraw $1
-; AVX: vpaddw
-; AVX-NOT: vpmulhw
 }
 
 define <16 x i8> @test7(<16 x i8> %a) {
+; SSE41-LABEL: test7:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pextrb $1, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pextrb $0, %xmm0, %ecx
+; SSE41-NEXT:    movsbl %cl, %ecx
+; SSE41-NEXT:    imull $-109, %ecx, %edx
+; SSE41-NEXT:    shrl $8, %edx
+; SSE41-NEXT:    addb %dl, %cl
+; SSE41-NEXT:    movb %cl, %dl
+; SSE41-NEXT:    shrb $7, %dl
+; SSE41-NEXT:    sarb $2, %cl
+; SSE41-NEXT:    addb %dl, %cl
+; SSE41-NEXT:    movzbl %cl, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm1
+; SSE41-NEXT:    pinsrb $1, %eax, %xmm1
+; SSE41-NEXT:    pextrb $2, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $2, %eax, %xmm1
+; SSE41-NEXT:    pextrb $3, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $3, %eax, %xmm1
+; SSE41-NEXT:    pextrb $4, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $4, %eax, %xmm1
+; SSE41-NEXT:    pextrb $5, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $5, %eax, %xmm1
+; SSE41-NEXT:    pextrb $6, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $6, %eax, %xmm1
+; SSE41-NEXT:    pextrb $7, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $7, %eax, %xmm1
+; SSE41-NEXT:    pextrb $8, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $8, %eax, %xmm1
+; SSE41-NEXT:    pextrb $9, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $9, %eax, %xmm1
+; SSE41-NEXT:    pextrb $10, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $10, %eax, %xmm1
+; SSE41-NEXT:    pextrb $11, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $11, %eax, %xmm1
+; SSE41-NEXT:    pextrb $12, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $12, %eax, %xmm1
+; SSE41-NEXT:    pextrb $13, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $13, %eax, %xmm1
+; SSE41-NEXT:    pextrb $14, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $14, %eax, %xmm1
+; SSE41-NEXT:    pextrb $15, %xmm0, %eax
+; SSE41-NEXT:    movsbl %al, %eax
+; SSE41-NEXT:    imull $-109, %eax, %ecx
+; SSE41-NEXT:    shrl $8, %ecx
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movb %al, %cl
+; SSE41-NEXT:    shrb $7, %cl
+; SSE41-NEXT:    sarb $2, %al
+; SSE41-NEXT:    addb %cl, %al
+; SSE41-NEXT:    movzbl %al, %eax
+; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test7:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm4
+; SSE-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    imull $-109, %eax, %ecx
+; SSE-NEXT:    shrl $8, %ecx
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movb %cl, %al
+; SSE-NEXT:    shrb $7, %al
+; SSE-NEXT:    sarb $2, %cl
+; SSE-NEXT:    addb %al, %cl
+; SSE-NEXT:    movzbl %cl, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpextrb $1, %xmm0, %eax
+; AVX-NEXT:    movsbl %al, %eax
+; AVX-NEXT:    imull $-109, %eax, %ecx
+; AVX-NEXT:    shrl $8, %ecx
+; AVX-NEXT:    addb %cl, %al
+; AVX-NEXT:    movb %al, %cl
+; AVX-NEXT:    shrb $7, %cl
+; AVX-NEXT:    sarb $2, %al
+; AVX-NEXT:    addb %cl, %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    vpextrb $0, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %dl
+; AVX-NEXT:    shrb $7, %dl
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movzbl %cl, %ecx
+; AVX-NEXT:    vmovd %ecx, %xmm1
+; AVX-NEXT:    vpextrb $2, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $3, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $4, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $5, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $6, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $7, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $8, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $9, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $10, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $11, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $12, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $13, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $14, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpextrb $15, %xmm0, %ecx
+; AVX-NEXT:    movsbl %cl, %ecx
+; AVX-NEXT:    imull $-109, %ecx, %edx
+; AVX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm0
+; AVX-NEXT:    shrl $8, %edx
+; AVX-NEXT:    addb %dl, %cl
+; AVX-NEXT:    movb %cl, %al
+; AVX-NEXT:    shrb $7, %al
+; AVX-NEXT:    sarb $2, %cl
+; AVX-NEXT:    addb %al, %cl
+; AVX-NEXT:    movzbl %cl, %eax
+; AVX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
   ret <16 x i8> %div
-
-; FIXME: scalarized
-; SSE41-LABEL: test7:
-; SSE41: pext
-; AVX-LABEL: test7:
-; AVX: pext
 }
 
 define <4 x i32> @test8(<4 x i32> %a) {
+; Signed division of a <4 x i32> by a splat of 7. The expectations below pin a
+; vector sequence built from a multiply by the splat constant 2454267027, a
+; shuffle that keeps the odd (high) 32-bit halves of the products, an add of
+; the original input, and finally psrad $2 plus the sign bit from psrld $31.
+; The SSE41 variant multiplies with pmuldq; the plain SSE variant multiplies
+; with pmuludq and subtracts a psrad/pand/paddd correction term via psubd.
+; SSE41-LABEL: test8:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmuldq %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm2, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrld $31, %xmm0
+; SSE41-NEXT:    psrad $2, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test8:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    pand %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrad $31, %xmm3
+; SSE-NEXT:    pand %xmm2, %xmm3
+; SSE-NEXT:    paddd %xmm1, %xmm3
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pmuludq %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm4
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT:    psubd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $31, %xmm0
+; SSE-NEXT:    psrad $2, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test8:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; AVX-NEXT:    vpmuldq %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpsrld $31, %xmm0, %xmm1
+; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
   ret <4 x i32> %div
-
-; SSE41-LABEL: test8:
-; SSE41: pmuldq
-; SSE41: pshufd	$49
-; SSE41-NOT: pshufd	$49
-; SSE41: pmuldq
-; SSE41: shufps	$-35
-; SSE41: pshufd	$-40
-; SSE41: padd
-; SSE41: psrld $31
-; SSE41: psrad $2
-; SSE41: padd
-
-; SSE-LABEL: test8:
-; SSE: psrad $31
-; SSE: pand
-; SSE: paddd
-; SSE: pmuludq
-; SSE: pshufd	$49
-; SSE-NOT: pshufd	$49
-; SSE: pmuludq
-; SSE: shufps	$-35
-; SSE: pshufd	$-40
-; SSE: psubd
-; SSE: padd
-; SSE: psrld $31
-; SSE: psrad $2
-; SSE: padd
-
-; AVX-LABEL: test8:
-; AVX: vpmuldq
-; AVX: vpshufd	$49
-; AVX-NOT: vpshufd	$49
-; AVX: vpmuldq
-; AVX: vshufps	$-35
-; AVX: vpshufd	$-40
-; AVX: vpadd
-; AVX: vpsrld $31
-; AVX: vpsrad $2
-; AVX: vpadd
 }
 
 define <8 x i32> @test9(<8 x i32> %a) {
+; Signed division of an <8 x i32> by a splat of 7. In the SSE41 and plain SSE
+; expectations the input arrives in xmm0/xmm1 and the multiply, high-half
+; shuffle, add, psrad $2 and psrld $31 steps are repeated once per 128-bit
+; half, reusing the 2454267027 splat constant. The AVX expectations do the
+; same work once on ymm registers and merge the two vpmuldq results with
+; vpshufd + vpblendd.
+; SSE41-LABEL: test9:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT:   # kill: XMM0<def> XMM3<kill>
+; SSE41-NEXT:    pmuldq %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld $31, %xmm3
+; SSE41-NEXT:    psrad $2, %xmm0
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    pmuldq %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrld $31, %xmm2
+; SSE41-NEXT:    psrad $2, %xmm1
+; SSE41-NEXT:    paddd %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test9:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    psrad $31, %xmm4
+; SSE-NEXT:    movdqa %xmm4, %xmm0
+; SSE-NEXT:    pand %xmm3, %xmm0
+; SSE-NEXT:    movdqa %xmm3, %xmm5
+; SSE-NEXT:    psrad $31, %xmm5
+; SSE-NEXT:    pand %xmm1, %xmm5
+; SSE-NEXT:    paddd %xmm0, %xmm5
+; SSE-NEXT:    movdqa %xmm3, %xmm0
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm6, %xmm7
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    psubd %xmm5, %xmm0
+; SSE-NEXT:    paddd %xmm3, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrld $31, %xmm3
+; SSE-NEXT:    psrad $2, %xmm0
+; SSE-NEXT:    paddd %xmm3, %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm4
+; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    psrad $31, %xmm3
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    paddd %xmm4, %xmm3
+; SSE-NEXT:    pmuludq %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm6, %xmm4
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT:    psubd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrld $31, %xmm2
+; SSE-NEXT:    psrad $2, %xmm1
+; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test9:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vpsrld $31, %ymm0, %ymm1
+; AVX-NEXT:    vpsrad $2, %ymm0, %ymm0
+; AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %div
-
-; AVX-LABEL: test9:
-; AVX: vpalignr $4
-; AVX: vpbroadcastd
-; AVX: vpmuldq
-; AVX: vpmuldq
-; AVX: vpblendd $170
-; AVX: vpadd
-; AVX: vpsrld $31
-; AVX: vpsrad $2
-; AVX: vpadd
 }
 
 define <8 x i32> @test10(<8 x i32> %a) {
+; Unsigned remainder of an <8 x i32> by a splat of 7. The expectations first
+; form a quotient with an unsigned multiply by the splat constant 613566757
+; (pmuludq, high halves kept), followed by psubd, psrld $1, paddd and
+; psrld $2. That quotient is then multiplied by the splat [7,7,7,7] and
+; subtracted from the input. The SSE41 and AVX variants do the final multiply
+; with pmulld/vpmulld; the plain SSE variant expands it into two pmuludq
+; plus shufps.
+; SSE41-LABEL: test10:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psubd %xmm3, %xmm5
+; SSE41-NEXT:    psrld $1, %xmm5
+; SSE41-NEXT:    paddd %xmm3, %xmm5
+; SSE41-NEXT:    psrld $2, %xmm5
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7]
+; SSE41-NEXT:    pmulld %xmm3, %xmm5
+; SSE41-NEXT:    psubd %xmm5, %xmm0
+; SSE41-NEXT:    pmuludq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
+; SSE41-NEXT:    psubd %xmm2, %xmm4
+; SSE41-NEXT:    psrld $1, %xmm4
+; SSE41-NEXT:    paddd %xmm2, %xmm4
+; SSE41-NEXT:    psrld $2, %xmm4
+; SSE41-NEXT:    pmulld %xmm3, %xmm4
+; SSE41-NEXT:    psubd %xmm4, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test10:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    pmuludq %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE-NEXT:    movdqa %xmm0, %xmm5
+; SSE-NEXT:    psubd %xmm3, %xmm5
+; SSE-NEXT:    psrld $1, %xmm5
+; SSE-NEXT:    paddd %xmm3, %xmm5
+; SSE-NEXT:    psrld $2, %xmm5
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [7,7,7,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm3, %xmm5
+; SSE-NEXT:    pmuludq %xmm3, %xmm6
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2,1,3]
+; SSE-NEXT:    psubd %xmm5, %xmm0
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm5
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    movdqa %xmm1, %xmm4
+; SSE-NEXT:    psubd %xmm2, %xmm4
+; SSE-NEXT:    psrld $1, %xmm4
+; SSE-NEXT:    paddd %xmm2, %xmm4
+; SSE-NEXT:    psrld $2, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm3, %xmm4
+; SSE-NEXT:    pmuludq %xmm3, %xmm2
+; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; SSE-NEXT:    psubd %xmm4, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm2
+; AVX-NEXT:    vpsrld $1, %ymm2, %ymm2
+; AVX-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
+; AVX-NEXT:    vpsrld $2, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %rem
-
-; AVX-LABEL: test10:
-; AVX: vpbroadcastd
-; AVX: vpalignr $4
-; AVX: vpmuludq
-; AVX: vpmuludq
-; AVX: vpblendd $170
-; AVX: vpsubd
-; AVX: vpsrld $1
-; AVX: vpadd
-; AVX: vpsrld $2
-; AVX: vpmulld
 }
 
 define <8 x i32> @test11(<8 x i32> %a) {
+; Signed remainder of an <8 x i32> by a splat of 7. The expectations compute a
+; quotient with the same steps pinned for the signed-division tests (multiply
+; by the splat constant 2454267027, keep the high halves, add the input,
+; psrad $2 plus the psrld $31 sign bit), then multiply that quotient by the
+; splat [7,7,7,7] and subtract the product from the input. The plain SSE
+; variant expands the final multiply into two pmuludq plus shufps; the SSE41
+; and AVX variants use pmulld/vpmulld.
+; SSE41-LABEL: test11:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pmuldq %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm5
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm0, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    psrld $31, %xmm5
+; SSE41-NEXT:    psrad $2, %xmm3
+; SSE41-NEXT:    paddd %xmm5, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [7,7,7,7]
+; SSE41-NEXT:    pmulld %xmm5, %xmm3
+; SSE41-NEXT:    psubd %xmm3, %xmm0
+; SSE41-NEXT:    pmuldq %xmm1, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm3
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT:    paddd %xmm1, %xmm2
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    psrld $31, %xmm3
+; SSE41-NEXT:    psrad $2, %xmm2
+; SSE41-NEXT:    paddd %xmm3, %xmm2
+; SSE41-NEXT:    pmulld %xmm5, %xmm2
+; SSE41-NEXT:    psubd %xmm2, %xmm1
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test11:
+; SSE:       # BB#0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
+; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    psrad $31, %xmm3
+; SSE-NEXT:    movdqa %xmm3, %xmm4
+; SSE-NEXT:    pand %xmm0, %xmm4
+; SSE-NEXT:    movdqa %xmm0, %xmm6
+; SSE-NEXT:    psrad $31, %xmm6
+; SSE-NEXT:    pand %xmm2, %xmm6
+; SSE-NEXT:    paddd %xmm4, %xmm6
+; SSE-NEXT:    movdqa %xmm0, %xmm7
+; SSE-NEXT:    pmuludq %xmm2, %xmm7
+; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm5, %xmm4
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT:    psubd %xmm6, %xmm7
+; SSE-NEXT:    paddd %xmm0, %xmm7
+; SSE-NEXT:    movdqa %xmm7, %xmm4
+; SSE-NEXT:    psrld $31, %xmm4
+; SSE-NEXT:    psrad $2, %xmm7
+; SSE-NEXT:    paddd %xmm4, %xmm7
+; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,7,7,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm7
+; SSE-NEXT:    pmuludq %xmm4, %xmm6
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
+; SSE-NEXT:    psubd %xmm7, %xmm0
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    movdqa %xmm1, %xmm6
+; SSE-NEXT:    psrad $31, %xmm6
+; SSE-NEXT:    pand %xmm2, %xmm6
+; SSE-NEXT:    paddd %xmm3, %xmm6
+; SSE-NEXT:    pmuludq %xmm1, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm5, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    psubd %xmm6, %xmm2
+; SSE-NEXT:    paddd %xmm1, %xmm2
+; SSE-NEXT:    movdqa %xmm2, %xmm3
+; SSE-NEXT:    psrld $31, %xmm3
+; SSE-NEXT:    psrad $2, %xmm2
+; SSE-NEXT:    paddd %xmm3, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm4, %xmm2
+; SSE-NEXT:    pmuludq %xmm4, %xmm3
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT:    psubd %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpmuldq %ymm2, %ymm3, %ymm2
+; AVX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm1
+; AVX-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX-NEXT:    vpaddd %ymm0, %ymm1, %ymm1
+; AVX-NEXT:    vpsrld $31, %ymm1, %ymm2
+; AVX-NEXT:    vpsrad $2, %ymm1, %ymm1
+; AVX-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
+; AVX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
   %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %rem
-
-; AVX-LABEL: test11:
-; AVX: vpalignr $4
-; AVX: vpbroadcastd
-; AVX: vpmuldq
-; AVX: vpmuldq
-; AVX: vpblendd $170
-; AVX: vpadd
-; AVX: vpsrld $31
-; AVX: vpsrad $2
-; AVX: vpadd
-; AVX: vpmulld
 }
 
 define <2 x i16> @test12() {
+; Builds a <2 x i16> vector with both lanes set to -1 and takes its unsigned
+; remainder with itself. Every check prefix below expects the whole body to
+; collapse to a zeroed xmm0 (xorps / vxorps) followed by retq.
+; SSE41-LABEL: test12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: test12:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0
   %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1
   %B9 = urem <2 x i16> %I9, %I9
   ret <2 x i16> %B9
+}
 
-; AVX-LABEL: test12:
-; AVX: xorps
+; Signed division of each i32 lane by the constant 3 (presumably a regression
+; test for bug PR20355, going by the name -- confirm against the tracker).
+; The checks pin a multiply-high lowering against the splat 1431655766
+; (0x55555556) instead of a real divide. The SSE4.1 and AVX expectations use
+; the signed pmuldq / vpmuldq, while the plain SSE expectation uses pmuludq
+; plus an explicit sign fix-up (psrad $31, pand, psubd). In every variant the
+; sign bit of the high product (psrld $31) is then added back to it.
+define <4 x i32> @PR20355(<4 x i32> %a) {
+; SSE41-LABEL: PR20355:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm1, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm2, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT:    movaps %xmm0, %xmm1
+; SSE41-NEXT:    psrld $31, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; SSE-LABEL: PR20355:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    psrad $31, %xmm2
+; SSE-NEXT:    pand %xmm0, %xmm2
+; SSE-NEXT:    movdqa %xmm0, %xmm3
+; SSE-NEXT:    psrad $31, %xmm3
+; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    paddd %xmm2, %xmm3
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT:    pmuludq %xmm2, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    psubd %xmm3, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $31, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR20355:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[1,3],xmm0[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    vpsrld $31, %xmm0, %xmm1
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+entry:
+  %sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %sdiv
 }

diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
new file mode 100644
index 0000000..7a329d7
--- /dev/null
+++ b/test/CodeGen/X86/vector-sext.ll

@@ -0,0 +1,943 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+;
+; Just one 32-bit run to make sure we do reasonable things there.
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
+
+; Sign-extends a <8 x i16> register argument to <8 x i32>.
+; Expected lowering per check prefix, as pinned below:
+;  - SSE2 and SSSE3 unpack each half with punpcklwd / punpckhwd and then
+;    sign-extend in place with a pslld $16 / psrad $16 pair.
+;  - SSE4.1 (64-bit and 32-bit runs) widens the low half with pmovzxwd and
+;    still uses the shift pair, with punpckhwd for the high half.
+;  - AVX1 uses two vpmovsxwd joined by vinsertf128.
+;  - AVX2 uses a single vpmovsxwd into ymm0.
+define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_8i16_to_8i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:   # kill: XMM0<def> XMM1<kill>
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pslld $16, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pslld $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_8i16_to_8i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:   # kill: XMM0<def> XMM1<kill>
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pslld $16, %xmm0
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pslld $16, %xmm1
+; SSSE3-NEXT:    psrad $16, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_8i16_to_8i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxwd %xmm1, %xmm0
+; SSE41-NEXT:    pslld $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pslld $16, %xmm1
+; SSE41-NEXT:    psrad $16, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_8i16_to_8i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_8i16_to_8i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_8i16_to_8i32:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE41-NEXT:    pmovzxwd %xmm1, %xmm0
+; X32-SSE41-NEXT:    pslld $16, %xmm0
+; X32-SSE41-NEXT:    psrad $16, %xmm0
+; X32-SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; X32-SSE41-NEXT:    pslld $16, %xmm1
+; X32-SSE41-NEXT:    psrad $16, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <8 x i16> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+
+; Sign-extends a <4 x i32> register argument to <4 x i64>.
+; Expected lowering per check prefix, as pinned below:
+;  - The 64-bit SSE2, SSSE3 and SSE4.1 runs scalarize. Each lane is moved to
+;    rax, sign-extended with cltq, moved back, and the pairs are rebuilt with
+;    punpcklqdq. The low half is assembled in xmm2 and copied to xmm0 last.
+;  - AVX1 uses two vpmovsxdq joined by vinsertf128.
+;  - AVX2 uses a single vpmovsxdq into ymm0.
+;  - The 32-bit SSE4.1 run derives each high dword with sarl $31 and inserts
+;    it with pinsrd.
+define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: sext_4i32_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i32_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i32_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_4i32_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_4i32_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_4i32_to_4i64:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    pmovzxdq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm2
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm1, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm1, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+  %B = sext <4 x i32> %A to <4 x i64>
+  ret <4 x i64>%B
+}
+
+; Loads a <4 x i16> from %ptr and sign-extends it to <4 x i32>.
+; The SSE2 and SSSE3 expectations load with movq, duplicate the words with
+; punpcklwd and finish with psrad $16. The SSE4.1 and AVX expectations fold
+; the load into a single pmovsxwd / vpmovsxwd. The 32-bit run does the same
+; after first loading the pointer argument from the stack.
+define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_test1:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq (%rdi), %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test1:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movq (%rdi), %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    psrad $16, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test1:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test1:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test1:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxwd (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+; Loads a <4 x i8> from %ptr and sign-extends it to <4 x i32>.
+; The SSE2 and SSSE3 expectations load with movd, widen through punpcklbw
+; and punpcklwd, and finish with psrad $24. The SSE4.1 and AVX expectations
+; fold the load into a single pmovsxbd / vpmovsxbd. The 32-bit run does the
+; same after first loading the pointer argument from the stack.
+define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_test2:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movd (%rdi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test2:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movd (%rdi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test2:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test2:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test2:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbd (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i32>
+ ret <4 x i32>%Y
+}
+
+; Loads a <2 x i8> from %ptr and sign-extends it to <2 x i64>.
+; The SSE2 and SSSE3 expectations use two scalar movsbq loads (offsets 1 and
+; 0) combined with punpcklqdq. The SSE4.1 and AVX expectations fold the load
+; into a single pmovsxbq / vpmovsxbq. The 32-bit run does the same after
+; first loading the pointer argument from the stack.
+define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_test3:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movsbq 1(%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    movsbq (%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test3:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movsbq 1(%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    movsbq (%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test3:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test3:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxbq (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test3:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i8>* %ptr
+ %Y = sext <2 x i8> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+; Loads a <2 x i16> from %ptr and sign-extends it to <2 x i64>.
+; The SSE2 and SSSE3 expectations use two scalar movswq loads (offsets 2 and
+; 0) combined with punpcklqdq. The SSE4.1 and AVX expectations fold the load
+; into a single pmovsxwq / vpmovsxwq. The 32-bit run does the same after
+; first loading the pointer argument from the stack.
+define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_test4:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movswq 2(%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    movswq (%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test4:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movswq 2(%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    movswq (%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test4:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test4:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxwq (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test4:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i16>* %ptr
+ %Y = sext <2 x i16> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+; Loads a <2 x i32> from %ptr and sign-extends it to <2 x i64>.
+; The SSE2 and SSSE3 expectations use two scalar movslq loads (offsets 4 and
+; 0) combined with punpcklqdq. The SSE4.1 and AVX expectations fold the load
+; into a single pmovsxdq / vpmovsxdq. The 32-bit run does the same after
+; first loading the pointer argument from the stack.
+define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) {
+; SSE2-LABEL: load_sext_test5:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movslq 4(%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    movslq (%rdi), %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test5:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movslq 4(%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    movslq (%rdi), %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test5:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxdq (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test5:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxdq (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test5:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxdq (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <2 x i32>* %ptr
+ %Y = sext <2 x i32> %X to <2 x i64>
+ ret <2 x i64>%Y
+}
+
+; Loads a <8 x i8> from %ptr and sign-extends it to <8 x i16>.
+; The SSE2 and SSSE3 expectations load with movq, duplicate the bytes with
+; punpcklbw and finish with psraw $8. The SSE4.1 and AVX expectations fold
+; the load into a single pmovsxbw / vpmovsxbw. The 32-bit run does the same
+; after first loading the pointer argument from the stack.
+define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_test6:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq (%rdi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_test6:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movq (%rdi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_test6:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: load_sext_test6:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
+; AVX-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_test6:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <8 x i8>* %ptr
+ %Y = sext <8 x i8> %X to <8 x i16>
+ ret <8 x i16>%Y
+}
+
+; Sign-extends a <4 x i1> mask argument to <4 x i64>.
+; Every expectation starts by smearing bit 0 of each 32-bit lane of xmm0
+; across the lane with a shift-left / arithmetic-shift-right by 31. The rest
+; of each sequence then widens 32-bit lanes to 64 bits:
+;  - The 64-bit SSE2, SSSE3 and SSE4.1 runs go through rax with cltq per lane
+;    and rebuild the pairs with punpcklqdq.
+;  - AVX1 uses two vpmovsxdq joined by vinsertf128.
+;  - AVX2 uses a single vpmovsxdq into ymm0.
+;  - The 32-bit SSE4.1 run uses sarl $31 and pinsrd for the high dwords.
+; The function has no named entry block, so the block comment in the checks
+; is a bare "# BB#0" marker.
+define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
+; SSE2-LABEL: sext_4i1_to_4i64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pslld $31, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i1_to_4i64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pslld $31, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i1_to_4i64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_4i1_to_4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_4i1_to_4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_4i1_to_4i64:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    pslld $31, %xmm0
+; X32-SSE41-NEXT:    psrad $31, %xmm0
+; X32-SSE41-NEXT:    pmovzxdq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm2
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm1, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm1, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+  %extmask = sext <4 x i1> %mask to <4 x i64>
+  ret <4 x i64> %extmask
+}
+
+; Loads a <16 x i8> from %ptr and sign-extends it to <16 x i16>.
+; Expected lowering per check prefix, as pinned below:
+;  - SSE2 and SSSE3 load with movdqa, split the halves with punpcklbw /
+;    punpckhbw and sign-extend each with a psllw $8 / psraw $8 pair.
+;  - SSE4.1 (64-bit and 32-bit runs) widens the low half with pmovzxbw and
+;    still uses the shift pair, with punpckhbw for the high half.
+;  - AVX1 uses two vpmovsxbw joined by vinsertf128.
+;  - AVX2 uses a single vpmovsxbw into ymm0.
+define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
+; SSE2-LABEL: sext_16i8_to_16i16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psllw $8, %xmm0
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psllw $8, %xmm1
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_16i8_to_16i16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psllw $8, %xmm0
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    psllw $8, %xmm1
+; SSSE3-NEXT:    psraw $8, %xmm1
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_16i8_to_16i16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxbw %xmm1, %xmm0
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    psraw $8, %xmm0
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    psllw $8, %xmm1
+; SSE41-NEXT:    psraw $8, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_16i8_to_16i16:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_16i8_to_16i16:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_16i8_to_16i16:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movdqa (%eax), %xmm1
+; X32-SSE41-NEXT:    pmovzxbw %xmm1, %xmm0
+; X32-SSE41-NEXT:    psllw $8, %xmm0
+; X32-SSE41-NEXT:    psraw $8, %xmm0
+; X32-SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; X32-SSE41-NEXT:    psllw $8, %xmm1
+; X32-SSE41-NEXT:    psraw $8, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <16 x i8>* %ptr
+ %Y = sext <16 x i8> %X to <16 x i16>
+ ret <16 x i16> %Y
+}
+
+; Sign-extends a <4 x i8> register argument to <4 x i64>. The parameter is
+; named %mask, but the IR treats it as an ordinary i8 vector.
+; Every expectation starts by sign-extending the low byte of each 32-bit
+; lane of xmm0 with a shift-left / arithmetic-shift-right by 24. The rest of
+; each sequence then widens 32-bit lanes to 64 bits:
+;  - The 64-bit SSE2, SSSE3 and SSE4.1 runs go through rax with cltq per lane
+;    and rebuild the pairs with punpcklqdq.
+;  - AVX1 uses two vpmovsxdq joined by vinsertf128.
+;  - AVX2 uses a single vpmovsxdq into ymm0.
+;  - The 32-bit SSE4.1 run uses sarl $31 and pinsrd for the high dwords.
+define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
+; SSE2-LABEL: sext_4i8_to_4i64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    cltq
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: sext_4i8_to_4i64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pslld $24, %xmm0
+; SSSE3-NEXT:    psrad $24, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm1, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm0, %rax
+; SSSE3-NEXT:    cltq
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: sext_4i8_to_4i64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm1
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm3
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    cltq
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: sext_4i8_to_4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: sext_4i8_to_4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: sext_4i8_to_4i64:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    pslld $24, %xmm0
+; X32-SSE41-NEXT:    psrad $24, %xmm0
+; X32-SSE41-NEXT:    pmovzxdq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm2
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm2
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm1, %eax
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pextrd $2, %xmm1, %ecx
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %ecx
+; X32-SSE41-NEXT:    pinsrd $3, %ecx, %xmm1
+; X32-SSE41-NEXT:    movdqa %xmm2, %xmm0
+; X32-SSE41-NEXT:    retl
+  %extmask = sext <4 x i8> %mask to <4 x i64>
+  ret <4 x i64> %extmask
+}
+
+; Loads a <4 x i8> from %ptr and sign-extends it to <4 x i64>.
+; Expected lowering per check prefix, as pinned below:
+;  - SSE2 and SSSE3 load with movd, spread the bytes with punpcklbw and
+;    punpcklwd, then sign-extend each lane through rax with movsbq and rebuild
+;    the pairs with punpcklqdq.
+;  - 64-bit SSE4.1 loads with pmovzxbd, widens with pmovzxdq, and still
+;    sign-extends each lane through rax with movsbq.
+;  - AVX1 loads with vpmovsxbd, then uses two vpmovsxdq joined by vinsertf128.
+;  - AVX2 folds everything into a single vpmovsxbq from memory into ymm0.
+;  - The 32-bit SSE4.1 run sign-extends each byte with movsbl, derives the
+;    high dword with sarl $31, and assembles the result with pinsrd.
+define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_4i8_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movsbq %al, %rax
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i8_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movsbq %al, %rax
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i8_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxdq %xmm1, %xmm0
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    movsbq %al, %rax
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    movsbq %al, %rax
+; SSE41-NEXT:    movd %rax, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    movsbq %al, %rax
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    movsbq %al, %rax
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i8_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i8_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movd (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovzxbd %xmm0, %xmm1
+; X32-SSE41-NEXT:    pmovzxbq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    movsbl %al, %eax
+; X32-SSE41-NEXT:    movd %eax, %xmm0
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm0
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT:    movsbl %al, %eax
+; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    movsbl %al, %eax
+; X32-SSE41-NEXT:    movd %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT:    movsbl %al, %eax
+; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i8>* %ptr
+ %Y = sext <4 x i8> %X to <4 x i64>
+ ret <4 x i64>%Y
+}
+
+define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
+; SSE2-LABEL: load_sext_4i16_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq (%rdi), %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movd %rax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movd %rax, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm2, %rax
+; SSE2-NEXT:    movswq %ax, %rax
+; SSE2-NEXT:    movd %rax, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i16_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movq (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movd %rax, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movd %rax, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSSE3-NEXT:    movd %xmm2, %rax
+; SSSE3-NEXT:    movswq %ax, %rax
+; SSSE3-NEXT:    movd %rax, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i16_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movq (%rdi), %xmm0
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm1
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    pextrq $1, %xmm0, %rax
+; SSE41-NEXT:    movswq %ax, %rax
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    movd %xmm0, %rax
+; SSE41-NEXT:    movswq %ax, %rax
+; SSE41-NEXT:    movd %rax, %xmm0
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE41-NEXT:    pextrq $1, %xmm1, %rax
+; SSE41-NEXT:    movswq %ax, %rax
+; SSE41-NEXT:    movd %rax, %xmm2
+; SSE41-NEXT:    movd %xmm1, %rax
+; SSE41-NEXT:    movswq %ax, %rax
+; SSE41-NEXT:    movd %rax, %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i16_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i16_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovsxwq (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
+; X32-SSE41:       # BB#0: # %entry
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movsd (%eax), %xmm0
+; X32-SSE41-NEXT:    pmovzxwd %xmm0, %xmm1
+; X32-SSE41-NEXT:    pmovzxwq %xmm0, %xmm2
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    cwtl
+; X32-SSE41-NEXT:    movd %eax, %xmm0
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm0
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT:    cwtl
+; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm0
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
+; X32-SSE41-NEXT:    movd %xmm2, %eax
+; X32-SSE41-NEXT:    cwtl
+; X32-SSE41-NEXT:    movd %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $1, %eax, %xmm1
+; X32-SSE41-NEXT:    pextrd $2, %xmm2, %eax
+; X32-SSE41-NEXT:    cwtl
+; X32-SSE41-NEXT:    pinsrd $2, %eax, %xmm1
+; X32-SSE41-NEXT:    sarl $31, %eax
+; X32-SSE41-NEXT:    pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT:    retl
+entry:
+ %X = load <4 x i16>* %ptr
+ %Y = sext <4 x i16> %X to <4 x i64>
+ ret <4 x i64>%Y
+}

diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 4da7e42..30ad366 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll

@@ -1,196 +1,1110 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; FIXME: SSE2 should look like the following:
+; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
+; FIXME:       # BB#0:
+; FIXME-NEXT:    punpcklbw %xmm0, %xmm0
+; FIXME-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT:    retq
+;
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,2,4,5,6,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    punpcklwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    punpckhwd %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; FIXME: SSE2 should be the following:
+; FIXME-LABEL: @shuffle_v16i8_0101010101010101
+; FIXME:       # BB#0:
+; FIXME-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; FIXME-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
+; FIXME-NEXT:    retq
+;
+; SSE2-LABEL: shuffle_v16i8_0101010101010101:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_0101010101010101:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_0101010101010101:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v16i8_0101010101010101:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_0101010101010101:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23
-; CHECK-SSE2:         punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   ret <16 x i8> %shuffle
 }
 
+define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %shuffle
+}
+
 define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12
-; CHECK-SSE2:         pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT:    punpckhbw %xmm1, %xmm2
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    packuswb %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20
-; CHECK-SSE2:         pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    packuswb %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20
-; CHECK-SSE2:         pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm3
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm3
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    movdqa %xmm0, %xmm4
-; CHECK-SSE2-NEXT:    punpckhbw %xmm2, %xmm4
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1]
-; CHECK-SSE2-NEXT:    punpckhbw %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    punpcklbw %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; CHECK-SSE2-NEXT:    packuswb %xmm4, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    movsd %xmm4, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
   ret <16 x i8> %shuffle
 }
 
-define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v8i16_shuffle
-; CHECK-SSE2:         pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
+define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) { ; Lanes 0-3 take bytes 0, 4, 8 and 12 of %a; the remaining twelve lanes are undef.
+; SSE2-LABEL: trunc_v4i32_shuffle:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: trunc_v4i32_shuffle:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: trunc_v4i32_shuffle:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: trunc_v4i32_shuffle:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <16 x i8> %shuffle
 }
 
-define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @zext_to_v4i32_shuffle
-; CHECK-SSE2:         pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    punpcklbw %xmm1, %xmm0
+define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
+; There is nothing useful to check here: this chain of ten shuffles generates
+; hundreds of instructions. Instead, just make sure that codegen survives it.
+; ALL-LABEL: stress_test0:
+; ALL:         retq
+entry:
+  %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
+  %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
+  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
+  %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
+  %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
+  %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
+  %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
+  %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
+  %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
+  ret <16 x i8> %s.16.0
+}
+
+define <16 x i8> @stress_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
+; There is nothing interesting to check about these instructions other than
+; that they survive codegen. However, we actually do better and delete all of
+; them because the result is 'undef'.
+;
+; ALL-LABEL: stress_test1:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
+  %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
+  %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
+  %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
+  %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
+  %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
+  %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
+  %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
+  %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
+  %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ; only lane 1 is defined
+  %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef> ; its only defined lane (12) reads lane 13 of %s.10.4, which is undef
+
+  ret <16 x i8> %s.12.4
+}
+
+define <16 x i8> @PR20540(<8 x i8> %a) { ; Widens <8 x i8> to <16 x i8>: lanes 0-7 copy %a, lanes 8-15 all read element 0 of the zero vector (mask index 8).
+; SSE2-LABEL: PR20540:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR20540:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR20540:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: PR20540:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { ; Lane 0 takes lane 0 of %a (mask index 16), which holds %i; lanes 1-15 read the zero vector.
+; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { ; Lane 5 takes lane 0 of %a (mask index 16), which holds %i; every other lane reads element 0 of the zero vector.
+; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,0],zero,xmm1[0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { ; Lane 15 takes lane 0 of %a (mask index 16), which holds %i; lanes 1, 2, 4 and 5 are undef; the rest read the zero vector.
+; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd %edi, %xmm0
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { ; %i is inserted into lane 3 of %a; lane 2 of the result takes that lane (mask index 19) and every other lane reads the zero vector.
+; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 3
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) { ; Lanes 12 and 14 take lanes 0 and 2 of %a (mask indices 16, 18); lanes 13 and 15 are undef; lanes 0-11 read the zero vector.
+; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef> ; NOTE(review): "i32 09" presumably lexes as decimal 9, i.e. still an element of the zero vector -- confirm; it looks like a typo for "i32 0".
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { ; Lanes 0, 2 and 3 take lanes 12, 14 and 15 of %a (mask indices 28, 30, 31); lane 1 is undef; lanes 4-15 read the zero vector.
+; SSE2-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0> ; NOTE(review): "i32 09" presumably lexes as decimal 9, i.e. still an element of the zero vector -- confirm; it looks like a typo for "i32 0".
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { ; Lane 0 takes lane 15 of %b (mask index 31); lanes 1-15 take lanes 0-14 of %a.
+; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { ; Single-input byte rotation: lane 0 takes lane 15 of %a and lanes 1-15 take lanes 0-14 of %a; the mask never selects from %b.
+; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) { ; Lanes 0-14 take lanes 1-15 of %b (mask indices 17-31); lane 15 takes lane 0 of %a.
+; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) { ; Lanes 0-14 take lanes 1-15 of %a; lane 15 takes lane 0 of %b (mask index 16).
+; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) { ; Single-input byte rotation: lanes 0-14 take lanes 1-15 of %a and lane 15 takes lane 0 of %a; the mask never selects from %b.
+; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) { ; Lane 0 takes lane 15 of %a; lanes 1-15 take lanes 0-14 of %b (mask indices 16-30).
+; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) { ; Lanes 0 and 8 take lanes 0 and 1 of %a; every other lane is undef.
+; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { ; Lanes 0 and 8 take lanes 0 and 1 of %a; every other lane reads the zero vector (mask indices 17-23 and 25-31).
+; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) { ; Lanes 0, 4, 8 and 12 take lanes 0-3 of %a; every other lane is undef.
+; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbd %xmm0, %xmm0
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
   ret <16 x i8> %shuffle
 }
 
-define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
-; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pand
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    packuswb %xmm0, %xmm0
-; CHECK-SSE2-NEXT:    retq
-  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbw %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
   ret <16 x i8> %shuffle
 }
+
+define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
+; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbw %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
+; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,1,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,1,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    packuswb %xmm0, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT:    packuswb %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT:    retq
+entry:
+  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
+
+  ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
+; Nothing interesting to test here. Just make sure we didn't crash.
+; ALL-LABEL: stress_test2:
+; ALL:         retq
+entry:
+  %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
+  %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
+  %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
+
+  ret <16 x i8> %s.2.0
+}
+
+define void @constant_gets_selected() {
+; ALL-LABEL: constant_gets_selected:
+; ALL-NOT: movd $0, {{%xmm[0-9]+}}
+  %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
+  %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+  %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
+  store <4 x i32> %weirder_zero, <4 x i32>* undef, align 16
+  store <4 x i32> zeroinitializer, <4 x i32>* undef, align 16
+  ret void
+}

diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 78b4ee7..9affee9 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll

@@ -1,219 +1,1138 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_00
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_00:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_10(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_10
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_10:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_11(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_11
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_11:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_22
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_22:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_22:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_22:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_32(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_32
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_32:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_33
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_33:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_33:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 3>
   ret <2 x i64> %shuffle
 }
 
 define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_00
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2f64_00:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2f64_00:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2f64_00:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_00:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_00:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_10
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_10:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 0>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_11
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_11:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
-; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
-;        of a mov?
+; SSE2-LABEL: shuffle_v2f64_22:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
 ;
-; CHECK-SSE2-LABEL: @shuffle_v2f64_22
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE3-LABEL: shuffle_v2f64_22:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2f64_22:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_22:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_22:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_32
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_32:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 2>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2f64_33
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_33:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_33:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm1[1,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
   ret <2 x double> %shuffle
 }
+define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: shuffle_v2f64_03:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2f64_03:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2f64_03:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_03:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_03:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: shuffle_v2f64_21:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2f64_21:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2f64_21:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_21:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_21:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
+  ret <2 x double> %shuffle
+}
 
 
 define <2 x i64> @shuffle_v2i64_02(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_02
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_02:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_02:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_02_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm2[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_02_copy:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_02_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_03
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_03:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_03:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_03:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_03:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_03:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_03:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_03_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_03_copy:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_03_copy:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm1, %xmm2
+; SSE3-NEXT:    movaps %xmm2, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_03_copy:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm2
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_03_copy:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_03_copy:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_03_copy:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_12
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_12:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_12:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_12:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_12_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[1],xmm2[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_12_copy:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_12_copy:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_12_copy:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_12_copy:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_12_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_13(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_13
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_13:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_13:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_13_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_13_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[1],xmm2[1]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_13_copy:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_13_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_20(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_20
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_20:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_20:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_20_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm2 = xmm2[0],xmm1[0]
-; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_20_copy:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_20_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_21
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_21:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_21:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_21:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_21:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_21:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_21:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_21_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_21_copy:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_21_copy:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsd %xmm2, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_21_copy:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm2, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_21_copy:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_21_copy:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_21_copy:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_30(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_30
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_30:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_30:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_30:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_30:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_30:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_30_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm2 = xmm2[1],xmm1[0]
-; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v2i64_30_copy:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
+; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_30_copy:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
+; SSE3-NEXT:    movapd %xmm2, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_30_copy:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_30_copy:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_30_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_31(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_31
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[1],xmm0[1]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_31:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_31:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_31_copy
-; CHECK-SSE2:         shufpd {{.*}} # xmm2 = xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT:    movapd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_31_copy:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_31_copy:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm1[1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
   ret <2 x i64> %shuffle
 }
+
+define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) {
+; SSE-LABEL: shuffle_v2i64_0z:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_0z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) {
+; SSE-LABEL: shuffle_v2i64_1z:
+; SSE:       # BB#0:
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_1z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
+; SSE-LABEL: shuffle_v2i64_z0:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2i64_z0:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 0>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
+; SSE2-LABEL: shuffle_v2i64_z1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2i64_z1:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2i64_z1:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2i64_z1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v2i64_z1:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v2i64_z1:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_0z:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_0z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_1z(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_1z:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_1z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_z0:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_z0:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
+; SSE2-LABEL: shuffle_v2f64_z1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v2f64_z1:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v2f64_z1:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v2f64_z1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorpd %xmm1, %xmm1
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_z1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  ret <2 x double> %shuffle
+}
+
+define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
+; SSE-LABEL: insert_reg_and_zero_v2i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %rdi, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_and_zero_v2i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %rdi, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v2i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_and_zero_v2i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x double> @insert_reg_and_zero_v2f64(double %a) {
+; SSE-LABEL: insert_reg_and_zero_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_and_zero_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_and_zero_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
+; SSE2-LABEL: insert_reg_lo_v2i64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd %rdi, %xmm1
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_reg_lo_v2i64:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movd %rdi, %xmm1
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_reg_lo_v2i64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %rdi, %xmm1
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_reg_lo_v2i64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %rdi, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: insert_reg_lo_v2i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq %rdi, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_reg_lo_v2i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq %rdi, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
+; SSE2-LABEL: insert_mem_lo_v2i64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlpd (%rdi), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_mem_lo_v2i64:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_mem_lo_v2i64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_mem_lo_v2i64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movq (%rdi), %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: insert_mem_lo_v2i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq (%rdi), %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_mem_lo_v2i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq (%rdi), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) {
+; SSE-LABEL: insert_reg_hi_v2i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %rdi, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_hi_v2i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %rdi, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
+; SSE-LABEL: insert_mem_hi_v2i64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq (%rdi), %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_hi_v2i64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq (%rdi), %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <2 x i64> undef, i64 %a, i32 0
+  %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
+  ret <2 x i64> %shuffle
+}
+
+define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
+; SSE-LABEL: insert_reg_lo_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_lo_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) {
+; SSE-LABEL: insert_mem_lo_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movlpd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_lo_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) {
+; SSE-LABEL: insert_reg_hi_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_hi_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) {
+; SSE-LABEL: insert_mem_hi_v2f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhpd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_hi_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @insert_dup_reg_v2f64(double %a) {
+; FIXME: We should match movddup for SSE3 and higher here.
+;
+; SSE2-LABEL: insert_dup_reg_v2f64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_dup_reg_v2f64:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_dup_reg_v2f64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_dup_reg_v2f64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: insert_dup_reg_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
+; SSE2-LABEL: insert_dup_mem_v2f64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd (%rdi), %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_dup_mem_v2f64:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movddup (%rdi), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_dup_mem_v2f64:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movddup (%rdi), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_dup_mem_v2f64:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movddup (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: insert_dup_mem_v2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovddup (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+
+define <2 x double> @shuffle_mem_v2f64_10(<2 x double>* %ptr) {
+; SSE-LABEL: shuffle_mem_v2f64_10:
+; SSE:       # BB#0:
+; SSE-NEXT:    movapd (%rdi), %xmm0
+; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_mem_v2f64_10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; AVX-NEXT:    retq
+  %a = load <2 x double>* %ptr
+  %shuffle = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x double> %shuffle
+}

diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 7d496fa..833b822 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll

@@ -1,170 +1,1386 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0001
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,0,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0001:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0001:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0020
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,0,2,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0020:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0020:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   ret <4 x i32> %shuffle
 }
+define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_0112:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0112:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+  ret <4 x i32> %shuffle
+}
 define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0300
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[0,3,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0300:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0300:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_1000
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[1,0,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_1000:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_1000:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_2200
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[2,2,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_2200:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_2200:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_3330
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[3,3,3,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_3330:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_3330:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_3210
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_3210:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_3210:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x i32> %shuffle
 }
 
+define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_2121:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_2121:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
+  ret <4 x i32> %shuffle
+}
+
 define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_0001
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,0,0,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_0001:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0001:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_0020
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,0,2,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_0020:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0020:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_0300
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,3,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_0300:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0300:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_1000
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[1,0,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_1000:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_1000:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_2200
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[2,2,0,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_2200:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_2200:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_3330
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[3,3,3,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_3330:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_3330:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
   ret <4 x float> %shuffle
 }
 define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4f32_3210
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4f32_3210:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_3210:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %shuffle
 }
+define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_0011:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0011:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: shuffle_v4f32_2233:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_2233:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: shuffle_v4f32_0022:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_0022:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_0022:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_0022:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_0022:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: shuffle_v4f32_1133:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_1133:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_1133:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_1133:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_1133:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x float> %shuffle
+}
 
 define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0124
-; CHECK-SSE2:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v4i32_0124:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_0124:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0124:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0124:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0124:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0142
-; CHECK-SSE2:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,2]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0142:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0142:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0412
-; CHECK-SSE2:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm1 = xmm1[2,0],xmm0[1,2]
-; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0412:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0412:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,2]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_4012
-; CHECK-SSE2:         shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm1 = xmm1[0,2],xmm0[1,2]
-; CHECK-SSE2-NEXT:    movaps %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_4012:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_4012:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,2]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0145
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0145:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0145:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_0451
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[0,2,3,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_0451:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0451:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_4501
-; CHECK-SSE2:         shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_4501:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_4501:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
   ret <4 x i32> %shuffle
 }
 define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v4i32_4015
-; CHECK-SSE2:         shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1]
-; CHECK-SSE2-NEXT:    shufps {{.*}} # xmm0 = xmm0[2,0,1,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v4i32_4015:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_4015:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
   ret <4 x i32> %shuffle
 }
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_4zzz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_4zzz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_4zzz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_4zzz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_4zzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_z4zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_z4zz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_z4zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_z4zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_z4zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_zz4z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_zz4z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_zz4z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_zz4z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_zz4z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_zuu4:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_zuu4:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_zuu4:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_zuu4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_zuu4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_zzz7:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_zzz7:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_zzz7:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_zzz7:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_zzz7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE2-LABEL: shuffle_v4f32_z6zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4f32_z6zz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4f32_z6zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4f32_z6zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4f32_z6zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_4zzz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_4zzz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_4zzz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_4zzz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_4zzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_z4zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_z4zz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_z4zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_z4zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_z4zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_zz4z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_zz4z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zz4z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_zz4z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_zz4z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_zuu4:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_zuu4:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_zuu4:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_zuu4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_zuu4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,0]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_z6zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_z6zz:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_z6zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_z6zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_z6zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_7012:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_7012:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_7012:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_7012:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_7012:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_6701:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_6701:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_6701:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_6701:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_6701:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_5670:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_5670:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_5670:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_5670:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_5670:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_1234:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_1234:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_1234:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_1234:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_1234:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_2345:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_2345:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_2345:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_2345:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_2345:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_3456:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_3456:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_3456:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_3456:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_3456:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_0u1u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_0u1u:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0u1u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0u1u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0u1u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_0z1z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: shuffle_v4i32_0z1z:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0z1z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v4i32_0z1z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_0z1z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
+; SSE-LABEL: insert_reg_and_zero_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %edi, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_and_zero_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_and_zero_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load i32* %ptr
+  %v = insertelement <4 x i32> undef, i32 %a, i32 0
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE2-LABEL: insert_reg_and_zero_v4f32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movss %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_reg_and_zero_v4f32:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_and_zero_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_and_zero_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovss (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %a = load float* %ptr
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
+; SSE2-LABEL: insert_reg_lo_v4i32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd %rdi, %xmm1
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_reg_lo_v4i32:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movd %rdi, %xmm1
+; SSE3-NEXT:    movsd %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_reg_lo_v4i32:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %rdi, %xmm1
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_reg_lo_v4i32:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %rdi, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: insert_reg_lo_v4i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq %rdi, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_reg_lo_v4i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq %rdi, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %a.cast = bitcast i64 %a to <2 x i32>
+  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE2-LABEL: insert_mem_lo_v4i32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlpd (%rdi), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE3-LABEL: insert_mem_lo_v4i32:
+; SSE3:       # BB#0:
+; SSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSE3-NEXT:    retq
+;
+; SSSE3-LABEL: insert_mem_lo_v4i32:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movlpd (%rdi), %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_mem_lo_v4i32:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movq (%rdi), %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: insert_mem_lo_v4i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq (%rdi), %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_mem_lo_v4i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq (%rdi), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+  %a = load <2 x i32>* %ptr
+  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
+; SSE-LABEL: insert_reg_hi_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %rdi, %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_hi_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq %rdi, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %a.cast = bitcast i64 %a to <2 x i32>
+  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE-LABEL: insert_mem_hi_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq (%rdi), %xmm1
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_hi_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq (%rdi), %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %a = load <2 x i32>* %ptr
+  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_lo_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_lo_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a.cast = bitcast double %a to <2 x float>
+  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_lo_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movlpd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_lo_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = load <2 x float>* %ptr
+  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_hi_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_reg_hi_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
+  %a.cast = bitcast double %a to <2 x float>
+  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_hi_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhpd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: insert_mem_hi_v4f32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = load <2 x float>* %ptr
+  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
+; SSE-LABEL: shuffle_mem_v4f32_3210:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_mem_v4f32_3210:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; AVX-NEXT:    retq
+  %a = load <4 x float>* %ptr
+  %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %shuffle
+}

diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 5d1922a..59af434 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll

@@ -1,493 +1,1941 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_01012323
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,0,1,1]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_01012323:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_01012323:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_67452301
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_67452301:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_67452301:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_456789AB
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_456789AB:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_456789AB:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_456789AB:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_456789AB:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00000000
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_00000000:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_00000000:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_00000000:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v8i16_00000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_00000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00004444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_00004444:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_00004444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
+define <8 x i16> @shuffle_v8i16_u0u1u2u3(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_u0u1u2u3:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u0u1u2u3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
+  ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_u4u5u6u7(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_u4u5u6u7:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u4u5u6u7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
+  ret <8 x i16> %shuffle
+}
 define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_31206745
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_31206745:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_31206745:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44440000
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,1,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_44440000:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44440000:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_44440000:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_44440000:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i16> %shuffle
 }
+define <8 x i16> @shuffle_v8i16_23016745(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23016745:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_23016745:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+  ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23026745:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_23026745:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5>
+  ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23016747:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_23016747:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7>
+  ret <8 x i16> %shuffle
+}
 define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_75643120
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_75643120:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_75643120:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_75643120:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_75643120:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 0>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_10545410
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_10545410:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_10545410:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_10545410:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_10545410:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_54105410
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_54105410:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_54105410:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_54105410:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_54105410:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_54101054
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_54101054:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_54101054:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_54101054:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_54101054:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04400440
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,4,4,6]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_04400440:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,4,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04400440:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_04400440:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_04400440:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_40044004
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[2,0,0,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_40044004:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_40044004:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_40044004:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_40044004:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 4>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_26405173
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_26405173:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_26405173:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_26405173:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_26405173:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 3>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_20645173
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_20645173:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_20645173:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_20645173:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_20645173:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 3>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_26401375
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_26401375:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_26401375:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_26401375:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_26401375:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
   ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_66751643:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,3,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_66751643:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_66751643:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_66751643:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 3>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_60514754:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,5,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_60514754:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_60514754:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_60514754:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 6, i32 0, i32 5, i32 1, i32 4, i32 7, i32 5, i32 4>
+  ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00444444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_00444444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_00444444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_00444444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_00444444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44004444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[2,2,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_44004444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44004444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_44004444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_44004444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04404444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_04404444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04404444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_04404444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_04404444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04400000
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,0,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_04400000:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04400000:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_04400000:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_04400000:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04404567
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_04404567:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_04404567:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0X444444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,1,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_0X444444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0X444444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0X444444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0X444444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44X04444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[2,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_44X04444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44X04444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_44X04444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_44X04444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_X4404444
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_X4404444:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_X4404444:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_X4404444:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_X4404444:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0127XXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_0127XXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0127XXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0127XXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0127XXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXX4563
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_XXXX4563:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXXX4563:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXXX4563:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_XXXX4563:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_4563XXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,0,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_4563XXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_4563XXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_4563XXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_4563XXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_01274563
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_01274563:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_01274563:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_01274563:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_01274563:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_45630127
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,0,1,3]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_45630127:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_45630127:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_45630127:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_45630127:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
   ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_37102735:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_37102735:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_37102735:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_37102735:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 5>
+  ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_08192a3b
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_08192a3b:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_08192a3b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d2e3f
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_0c1d2e3f:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_4c5d6e7f
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_4c5d6e7f:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_4c5d6e7f:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_48596a7b
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_48596a7b:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_48596a7b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_08196e7f
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[0,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_08196e7f:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_08196e7f:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d6879
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,0,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_0c1d6879:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0c1d6879:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_109832ba
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm0[2,0,3,1,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklqdq %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_109832ba:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_109832ba:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_8091a2b3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    punpcklwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_8091a2b3:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_8091a2b3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3>
   ret <8 x i16> %shuffle
 }
 define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_c4d5e6f7
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm2 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_c4d5e6f7:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_c4d5e6f7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0213cedf
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklqdq %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: shuffle_v8i16_0213cedf:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0213cedf:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
   ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_443aXXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_443aXXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_443aXXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_443aXXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_032dXXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_032dXXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_032dXXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_032dXXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_032dXXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
-define <8 x i16> @shuffle_v8i16_XXXcXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXcXXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_XXXdXXXX:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_XXXdXXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_012dXXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_012dXXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_012dXXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_012dXXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_012dXXXX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXXcde3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT:    punpckhwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,0,2]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_XXXXcde3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXXXcde3:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXXXcde3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_cde3XXXX
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT:    punpckhwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_cde3XXXX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_012dcde3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm2 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm3 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT:    punpckhwd %xmm2, %xmm1
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT:    punpcklwd %xmm3, %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; CHECK-SSE2-NEXT:    punpcklqdq %xmm1, %xmm0
-; CHECK-SSE2-NEXT:    retq
+; SSE2-LABEL: shuffle_v8i16_012dcde3:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_012dcde3:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_012dcde3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: shuffle_v8i16_012dcde3:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i16_012dcde3:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
   ret <8 x i16> %shuffle
 }
+
+define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_XXX1X579:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXX1X579:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXX1X579:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_XXX1X579:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_XX4X8acX:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_8zzzzzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_8zzzzzzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 0
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
+; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movzwl %di, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    movzwl %di, %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT:    retq
+  %a = insertelement <8 x i16> undef, i16 %i, i32 3
+  %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_def01234:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_def01234:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_def01234:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_def01234:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_ueuu123u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_ueuu123u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_ueuu123u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_ueuu123u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_ueuu123u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_56701234:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_56701234:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_56701234:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_56701234:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_u6uu123u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u6uu123u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u6uu123u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u6uu123u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_uuuu123u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_uuuu123u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_uuuu123u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_uuuu123u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_bcdef012(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_bcdef012:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_bcdef012:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_bcdef012:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_bcdef012:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_ucdeuu1u(<8 x i16> %a, <8 x i16> %b) { ; partially-undef mask (lanes 1-3 = %b[4..6], lane 6 = %a[1]); the expected code is the same rotation as for the fully defined bcdef012 mask
+; SSE2-LABEL: shuffle_v8i16_ucdeuu1u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_ucdeuu1u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_ucdeuu1u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_ucdeuu1u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 1, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) { ; single-input rotate, %a[3..7] followed by %a[0..2] (%b is not referenced by the mask); SSE2 is expected to copy xmm0 and combine two byte shifts, later levels use palignr with xmm0 as both sources
+; SSE2-LABEL: shuffle_v8i16_34567012:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_34567012:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_34567012:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_34567012:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) { ; partially-undef single-input mask (lanes 1-3 = %a[4..6], lane 6 = %a[1]); the expected code is the same rotate as for the fully defined 34567012 mask
+; SSE2-LABEL: shuffle_v8i16_u456uu1u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u456uu1u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u456uu1u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u456uu1u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) { ; only lanes 1-3 are defined (%a[4..6]); every feature level is expected to emit a single 6-byte right shift (psrldq, vpsrldq on AVX)
+; SSE2-LABEL: shuffle_v8i16_u456uuuu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u456uuuu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u456uuuu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u456uuuu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_3456789a(<8 x i16> %a, <8 x i16> %b) { ; mask = %a[3..7] followed by %b[0..2]; the SSSE3/SSE4.1 palignr writes xmm1 and needs a movdqa back to xmm0, the three-operand AVX form does not
+; SSE2-LABEL: shuffle_v8i16_3456789a:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_3456789a:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_3456789a:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_3456789a:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u456uu9u(<8 x i16> %a, <8 x i16> %b) { ; partially-undef mask (lanes 1-3 = %a[4..6], lane 6 = %b[1]); the expected code is the same as for the fully defined 3456789a mask
+; SSE2-LABEL: shuffle_v8i16_u456uu9u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u456uu9u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u456uu9u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u456uu9u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 9, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_56789abc(<8 x i16> %a, <8 x i16> %b) { ; mask = %a[5..7] followed by %b[0..4], i.e. a 10-byte rotation across the two inputs; SSE2 is expected to use two byte shifts plus por, later levels palignr
+; SSE2-LABEL: shuffle_v8i16_56789abc:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_56789abc:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_56789abc:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_56789abc:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) { ; partially-undef mask (lane 1 = %a[6], lanes 4-6 = %b[1..3]); the expected code is the same as for the fully defined 56789abc mask
+; SSE2-LABEL: shuffle_v8i16_u6uu9abu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_u6uu9abu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_u6uu9abu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_u6uu9abu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) { ; lane 0 = %a[0], lane 4 = %a[1], all other lanes undef; SSE2/SSSE3 are expected to use pshufd + pshufhw, SSE4.1 and AVX a single pmovzxwq
+; SSE2-LABEL: shuffle_v8i16_0uuu1uuu:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0uuu1uuu:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0uuu1uuu:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0uuu1uuu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) { ; lane 0 = %a[0], lane 4 = %a[1], remaining lanes taken from the zeroinitializer operand; SSE2/SSSE3 are expected to unpack against a pxor-zeroed register, SSE4.1 and AVX to use pmovzxwq
+; SSE2-LABEL: shuffle_v8i16_0zzz1zzz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0zzz1zzz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0zzz1zzz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwq %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0zzz1zzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxwq %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) { ; even lanes = %a[0..3], odd lanes undef; SSE2/SSSE3 are expected to unpack xmm0 with itself (punpcklwd), SSE4.1 and AVX to use pmovzxwd
+; SSE2-LABEL: shuffle_v8i16_0u1u2u3u:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0u1u2u3u:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0u1u2u3u:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0u1u2u3u:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
+  ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) { ; even lanes = %a[0..3], odd lanes taken from the zeroinitializer operand; SSE2/SSSE3 are expected to unpack against a pxor-zeroed register, SSE4.1 and AVX to use pmovzxwd
+; SSE2-LABEL: shuffle_v8i16_0z1z2z3z:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0z1z2z3z:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v8i16_0z1z2z3z:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v8i16_0z1z2z3z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxwd %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x i16> %shuffle
+}

diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
new file mode 100644
index 0000000..4db0280
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll

@@ -0,0 +1,1267 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown" ; the two llc invocations above differ only in -mattr (+avx vs +avx2) and in their second check prefix (AVX1 vs AVX2)
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] to all 16 lanes (%b is not referenced by the mask); with +avx the expectation is a 128-bit vpshufb plus vinsertf128, with +avx2 a single vpbroadcastw
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 14, which takes %a[1]; both halves are expected to be built from xmm0 and joined with a 128-bit insert
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 13, which takes %a[2]; low half is a plain splat, high half a vpshufb with bytes 4,5 in the odd slot
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 12, which takes %a[3]; low half is a plain splat, high half a vpshufb with bytes 6,7 in the odd slot
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 11, which takes %a[4]; low half is a plain splat, high half a vpshufb with bytes 8,9 in the odd slot
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 10, which takes %a[5]; low half is a plain splat, high half a vpshufb with bytes 10,11 in the odd slot
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 9, which takes %a[6]; low half is a plain splat, high half a vpshufb with bytes 12,13 in the odd slot
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 8, which takes %a[7]; low half is a plain splat, high half a vpshufb with bytes 14,15 in the odd slot
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 7, which takes %a[8] from the upper 128 bits; +avx is expected to extract the upper half first, +avx2 to swap halves (vperm2i128) and blend with vpblendvb
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 6, which takes %a[9] from the upper 128 bits; +avx is expected to extract the upper half first, +avx2 to swap halves (vperm2i128) and blend with vpblendvb
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 5, which takes %a[10] from the upper 128 bits; +avx is expected to extract the upper half first, +avx2 to swap halves (vperm2i128) and blend with vpblendvb
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 4, which takes %a[11] from the upper 128 bits; +avx is expected to extract the upper half first, +avx2 to swap halves (vperm2i128) and blend with vpblendvb
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 3, which takes %a[12] from the upper 128 bits; +avx is expected to extract the upper half and swap its 64-bit halves (vpshufd [2,3,0,1]), +avx2 to use vperm2i128 + vpblendvb
+; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 2, which takes %a[13] from the upper 128 bits; +avx is expected to extract the upper half and swap its 64-bit halves (vpshufd [2,3,0,1]), +avx2 to use vperm2i128 + vpblendvb
+; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 1, which takes %a[14] from the upper 128 bits; +avx is expected to extract the upper half and swap its 64-bit halves (vpshufd [2,3,0,1]), +avx2 to use vperm2i128 + vpblendvb
+; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; splat of %a[0] except lane 0, which takes %a[15] from the upper 128 bits; +avx is expected to extract the upper half and swap its 64-bit halves (vpshufd [2,3,0,1]), +avx2 to use vperm2i128 + vpblendvb
+; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,0,1,u,u,0,1,u,u,0,1,u,u,16,17,u,u,16,17,u,u,16,17,u,u,16,17]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 14, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
+  ret <16 x i16> %shuffle
+}

diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
new file mode 100644
index 0000000..79c906b
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll

@@ -0,0 +1,1562 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movl $15, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    movl $15, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[6],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[7],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    movl $128, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    movl $15, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm2
+; AVX2-NEXT:    vpxor %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vinserti128 $0, %xmm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15,23,23,23,23,23,23,23,23,31,31,31,31,31,31,31,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15,19,19,19,19,23,23,23,23,27,27,27,27,31,31,31,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15, i32 19, i32 19, i32 19, i32 19, i32 23, i32 23, i32 23, i32 23, i32 27, i32 27, i32 27, i32 27, i32 31, i32 31, i32 31, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15,17,17,19,19,21,21,23,23,25,25,27,27,29,29,31,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15, i32 17, i32 17, i32 19, i32 19, i32 21, i32 21, i32 23, i32 23, i32 25, i32 25, i32 27, i32 27, i32 29, i32 29, i32 31, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movl $15, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    movl $15, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,8,9,10,11,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,15,14,13,12,11,10,9,8]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u,31,30,29,28,27,26,25,24]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,7,6,5,4,3,2,1,0]
+; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 18, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,30,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 30, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movl $15, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 31, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 17, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 18, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,30,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movl $15, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,31]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 31>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,28,28,28,28,24,24,24,24,20,20,20,20,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 28, i32 28, i32 28, i32 28, i32 24, i32 24, i32 24, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,u,u,u,u,u,0,0,0,0,0,14,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,u,u,u,u,u,16,16,16,16,16,30,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[14,14,1,1,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,u,0,u,u,u,u,0,0,0,0,0,0,14,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,0,0,0,0,0,0,0,0,0,0,0,0,16,16,u,16,u,u,u,u,16,16,16,16,16,16,30,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 undef, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,u,u,u,4,u,8,8,8,8,u,u,12,u,28,28,28,28,u,u,u,24,20,20,20,20,16,16,16,16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 undef, i32 28, i32 28, i32 28, i32 28, i32 undef, i32 undef, i32 undef, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,9,9,8,8,8,8,8,8,8,8]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,u,u,u,u,u,u,u,u,16,16,16,u,u,u,u,u,u,u,24,24,24,24,24,24]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u]
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
+; AVX1-NEXT:    vpor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15]
+; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15]
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0>
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>
+  ret <32 x i8> %shuffle
+}

diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
new file mode 100644
index 0000000..0bd1bd9
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll

@@ -0,0 +1,748 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0001:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0001:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0020:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0020:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0300:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0300:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_1000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_1000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_2200:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_2200:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_3330:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_3330:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_3210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_3210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0023:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0022:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1032:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1133:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1023:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1022:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0423:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_0423:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0462:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0426:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1537:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4062:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_5173:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_5163:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0527:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4163:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0145:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4501:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0167:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0001:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0001:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0020:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0020:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0112:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0112:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0300:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0300:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_2200:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_2200:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3330:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_3330:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_3210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0124:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0124:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0142:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0142:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0412:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0412:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_4012:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_4012:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_0145:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0451:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_0451:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_4501:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_4015:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_4015:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_2u35:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_2u35:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1251:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1251:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: stress_test1:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm0[1,0,3,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: stress_test1:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm1[3,1,1,0]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3]
+; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-NEXT:    retq
+  %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+  %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> <i32 3, i32 undef, i32 2, i32 undef>
+  %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 undef>
+  %f = shufflevector <4 x i64> %d, <4 x i64> %e, <4 x i32> <i32 5, i32 1, i32 1, i32 0>
+
+  ret <4 x i64> %f
+}
+
+define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
+; AVX1-LABEL: insert_reg_and_zero_v4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq %rdi, %xmm0
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_reg_and_zero_v4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq %rdi, %xmm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    retq
+  %v = insertelement <4 x i64> undef, i64 %a, i64 0
+  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
+; AVX1-LABEL: insert_mem_and_zero_v4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovq (%rdi), %xmm0
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_mem_and_zero_v4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovq (%rdi), %xmm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <4 x i64> undef, i64 %a, i64 0
+  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
+; ALL-LABEL: insert_reg_and_zero_v4f64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; ALL-NEXT:    retq
+  %v = insertelement <4 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
+; ALL-LABEL: insert_mem_and_zero_v4f64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmovsd (%rdi), %xmm0
+; ALL-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <4 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @splat_mem_v4f64(double* %ptr) {
+; ALL-LABEL: splat_mem_v4f64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <4 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
+; AVX1-LABEL: splat_mem_v4i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovddup (%rdi), %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat_mem_v4i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+  %a = load i64* %ptr
+  %v = insertelement <4 x i64> undef, i64 %a, i64 0
+  %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x double> @splat_mem_v4f64_2(double* %p) {
+; ALL-LABEL: splat_mem_v4f64_2:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT:    retq
+  %1 = load double* %p
+  %2 = insertelement <2 x double> undef, double %1, i32 0
+  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %3
+}
+
+define <4 x double> @splat_v4f64(<2 x double> %r) {
+; AVX1-LABEL: splat_v4f64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat_v4f64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %1
+}

diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
new file mode 100644
index 0000000..ded8232
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll

@@ -0,0 +1,1931 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000010:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000010:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000200:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000200:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00003000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00003000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00040000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00040000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00500000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00500000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_06000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_06000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_70000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_70000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    movl $7, %eax
+; AVX2-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $0, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01014545:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00112233:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00112233:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00001111:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_00001111:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_81a3c5e7:
+; ALL:       # BB#0:
+; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08080808:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_08080808:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_08084c4c:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_8823cc67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_9832dc76:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_9810dc54:
+; ALL:       # BB#0:
+; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08194c5d(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_08194c5d:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_2a3b6e7f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08192a3b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_08192a3b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08991abb:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_08991abb:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_091b2d3f:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_091b2d3f:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_09ab1def:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_09ab1def:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00014445:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00204464:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_03004744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10005444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_22006644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_33307774:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_32107654:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00234467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00224466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10325476:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_11335577:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10225466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00015444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00204644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_03004474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+  ret <8 x float> %shuffle
+}
+
+; Single-source, in-lane permute: low lane <1,0,0,0>, high lane is a splat of
+; its own first element (index 4). The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10004444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+; Single-source, in-lane permute whose pattern differs between the lanes
+; (low lane <2,2,0,0>, high lane <6,4,4,6>). The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_22006446:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+  ret <8 x float> %shuffle
+}
+
+; Single-source, in-lane permute whose pattern differs between the lanes
+; (low lane <3,3,3,0>, high lane <7,4,7,4>). The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_33307474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+  ret <8 x float> %shuffle
+}
+
+; Reverses the low 128-bit lane of %a and leaves the high lane in place; %b is
+; never selected. The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_32104567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+; Single-source, in-lane permute whose pattern differs between the lanes
+; (low lane <0,0,2,3>, high lane <6,7,4,4>). The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00236744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+; Single-source, in-lane permute whose pattern differs between the lanes
+; (low lane <0,0,2,2>, high lane <6,6,4,4>). The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00226644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+; Swaps adjacent pairs in the low 128-bit lane of %a (<1,0,3,2>) and leaves the
+; high lane in place. The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10324567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+; Duplicates the odd elements of the low 128-bit lane of %a (<1,1,3,3>) and
+; leaves the high lane in place. The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_11334567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+; Leaves the low 128-bit lane of %a in place and swaps the first two elements
+; of the high lane (<5,4,6,7>). The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+; Leaves the low 128-bit lane of %a in place and permutes only the high lane
+; (<5,4,6,6>). The checks expect one vpermilps.
+define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01235466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+; Single-source, in-lane permute with undef mask elements at positions 3 and 5.
+; The checks expect one vpermilps whose decoded mask prints those slots as 'u'.
+define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_002u6u44:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+; Single-source, in-lane permute where the upper two elements of each lane are
+; undef. The checks expect one vpermilps with the undef slots printed as 'u'.
+define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00uu66uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+; Pair-swap of the low lane of %a (<1,0,3,2>), high lane <4,5,undef,undef>.
+; The checks expect one vpermilps with the undef slots printed as 'u'.
+define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_103245uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+; Low lane of %a becomes <1,1,3,3>; high lane is <undef,undef,6,7>.
+; The checks expect one vpermilps with the undef slots printed as 'u'.
+define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1133uu67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+; Single-source, in-lane permute with four undef mask elements
+; (<0,u,u,3 | 5,4,u,u>). The checks expect one vpermilps that keeps the 'u's.
+define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_0uu354uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+; Mostly-undef single-source mask: only positions 3, 6 and 7 are defined
+; (3, 6, 6). The checks expect one vpermilps that keeps the 'u' slots.
+define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuu3uu66:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+; Two-source mask (indices 8-15 select from %b) that crosses 128-bit lanes for
+; both inputs. On AVX1 the checks expect lane swaps (vperm2f128), in-lane
+; shuffles and blends; on AVX2 one vpermps per source followed by a vblendps.
+define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_c348cda0:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_c348cda0:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
+  ret <8 x float> %shuffle
+}
+
+; Two-source, lane-crossing mask where %b supplies only the first and last
+; result elements (indices 15 and 10). On AVX1 the checks expect vperm2f128
+; lane swaps plus in-lane permutes and blends; on AVX2 one vpermps per source
+; followed by a vblendps.
+define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_f511235a:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,1,4,5,5,5]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_f511235a:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
+  ret <8 x float> %shuffle
+}
+
+; Reverses the low 128-bit lane of %a and places that result in both halves.
+; On AVX1 the checks expect an xmm vpermilps plus vinsertf128; on AVX2 a single
+; vpermps driven by a constant index vector.
+define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_32103210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_32103210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+; Reverses the high 128-bit lane of %a and places that result in both halves.
+; On AVX1 the checks expect vextractf128, an xmm vpermilps and vinsertf128; on
+; AVX2 a single vpermps driven by a constant index vector.
+define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_76547654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_76547654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+; Full reversal of the eight elements of %a. On AVX1 the checks expect a
+; vperm2f128 lane swap followed by an in-lane vpermilps reversal; on AVX2 a
+; single vpermps driven by a constant index vector.
+define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_76543210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_76543210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+; Low half: reversed low lane of %a; high half: reversed low lane of %b.
+; On AVX1 the checks expect two xmm vpermilps joined by vinsertf128; on AVX2 a
+; vpermps on %b, an in-lane vpermilps on %a and a vblendpd.
+define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_3210ba98:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_3210ba98:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
+  ret <8 x float> %shuffle
+}
+
+; Low half: reversed low lane of %a; high half: reversed high lane of %b, so
+; neither source crosses a lane. On AVX1 the checks expect per-xmm reversals
+; joined by vinsertf128; on AVX2 two in-lane vpermilps and a vblendpd.
+define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_3210fedc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_3210fedc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x float> %shuffle
+}
+
+; Low half: reversed high lane of %a; high half: reversed high lane of %b.
+; On AVX1 the checks expect vextractf128 + xmm vpermilps for each source joined
+; by vinsertf128; on AVX2 a vpermps on %a, a vpermilps on %b and a vblendpd.
+define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_7654fedc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_7654fedc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x float> %shuffle
+}
+
+; Low half: reversed high lane of %b; high half: reversed high lane of %a.
+; On AVX1 the checks expect vextractf128 + xmm vpermilps for each source joined
+; by vinsertf128; on AVX2 a vpermps on %b, a vpermilps on %a and a vblendpd.
+define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_fedc7654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_fedc7654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+; Low half: reversed low lane of %b; high half: reversed high lane of %a, so
+; neither source crosses a lane. On AVX1 the checks expect per-xmm reversals
+; joined by vinsertf128; on AVX2 two in-lane vpermilps and a vblendpd.
+define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_ba987654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_ba987654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+; Low half: reversed low lane of %b; high half: reversed low lane of %a, i.e.
+; mask <11,10,9,8,3,2,1,0> as encoded in the function name. The mask previously
+; duplicated shuffle_v8f32_ba987654, leaving this case untested.
+; NOTE(review): the expectations below mirror shuffle_v8f32_3210ba98 with the
+; two sources swapped; regenerate them from llc output to confirm.
+define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_ba983210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8f32_ba983210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
+; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+; Splat of element 0 of %a across all eight i32 elements. On AVX1 the checks
+; expect an xmm vpermilps splat widened with vinsertf128; on AVX2 a single
+; vbroadcastss.
+define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+; Splat of element 0 of %a except that result position 6 takes element 1; all
+; sources come from the low 128-bit lane. On AVX1 the checks expect two xmm
+; vpermilps joined by vinsertf128; on AVX2 a vpermd with a constant index
+; vector.
+define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000010:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000010:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+; Splat of element 0 of %a except that result position 5 takes element 2; all
+; sources come from the low 128-bit lane. On AVX1 the checks expect two xmm
+; vpermilps joined by vinsertf128; on AVX2 a vpermd with a constant index
+; vector.
+define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000200:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000200:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+; Splat of element 0 of %a except that result position 4 takes element 3; all
+; sources come from the low 128-bit lane. On AVX1 the checks expect two xmm
+; vpermilps joined by vinsertf128; on AVX2 a vpermd with a constant index
+; vector.
+define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00003000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00003000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+; Splat of element 0 of %a except that result position 3 takes element 4 from
+; the high lane, so the mask crosses 128-bit lanes. On AVX1 the checks expect a
+; vperm2f128 lane swap, two vpermilps and a vblendps; on AVX2 a vpermd with a
+; constant index vector.
+define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00040000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00040000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+; Splat of element 0 of %a except that result position 2 takes element 5 from
+; the high lane, so the mask crosses 128-bit lanes. On AVX1 the checks expect a
+; vperm2f128 lane swap, two vpermilps and a vblendps; on AVX2 a vpermd with a
+; constant index vector.
+define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00500000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00500000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+; Splat of element 0 of %a except that result position 1 takes element 6 from
+; the high lane, so the mask crosses 128-bit lanes. On AVX1 the checks expect a
+; vperm2f128 lane swap, two vpermilps and a vblendps; on AVX2 a vpermd with a
+; constant index vector.
+define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_06000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_06000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+; Splat of element 0 of %a except that result position 0 takes element 7 from
+; the high lane. On AVX1 the checks expect a vperm2f128 lane swap, two
+; vpermilps and a vblendps. On AVX2 the checks expect the vpermd index vector
+; to be built in registers (vpxor, movl $7, vpinsrd, vinserti128) instead of
+; being loaded with vmovdqa as in the neighbouring tests.
+define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_70000000:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_70000000:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    movl $7, %eax
+; AVX2-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vinserti128 $0, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+; Repeats the first two i32 elements of each 128-bit lane of %a
+; (<0,1,0,1 | 4,5,4,5>). On AVX1 the checks expect a 64-bit-granular vpermilpd;
+; on AVX2 a vpshufd.
+define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01014545:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_01014545:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x i32> %shuffle
+}
+
+; Duplicates each of the low four elements of %a (<0,0,1,1,2,2,3,3>). On AVX1
+; the checks expect xmm vunpcklps/vunpckhps joined by vinsertf128; on AVX2 a
+; vpermd with a constant index vector.
+define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00112233:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00112233:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x i32> %shuffle
+}
+
+; Low half is a splat of element 0 of %a, high half a splat of element 1. On
+; AVX1 the checks expect two xmm vpermilps splats joined by vinsertf128; on
+; AVX2 a vpermd with a constant index vector.
+define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00001111:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00001111:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %shuffle
+}
+
+; Pure blend: even result positions take the same-position element of %b, odd
+; positions that of %a. The checks expect a single vblendps on AVX1 and a
+; single vpblendd on AVX2.
+define <8 x i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_81a3c5e7:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_81a3c5e7:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+; Alternates element 0 of %a with element 0 of %b across the whole vector. On
+; AVX1 the checks expect xmm shuffles widened with vinsertf128 and a vblendps;
+; on AVX2 a vpbroadcastd of %b, a vpbroadcastq of %a and a vpblendd.
+define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08080808:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08080808:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+  ret <8 x i32> %shuffle
+}
+
+; Within each 128-bit lane, alternates the lane's first element of %a with the
+; lane's first element of %b. On AVX1 the checks expect two vshufps; on AVX2
+; two vpshufd and a vpblendd.
+define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08084c4c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08084c4c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+; Per 128-bit lane: the lane's first element of %b twice, then elements 2 and 3
+; of %a's lane. On AVX1 the checks expect a single two-source vshufps; on AVX2
+; a vpshufd of %b followed by a vpblendd.
+define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_8823cc67:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_8823cc67:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+; Per 128-bit lane: the first two elements of %b swapped, then the last two
+; elements of %a swapped. On AVX1 the checks expect a single two-source
+; vshufps; on AVX2 one vpshufd per source and a vpblendd.
+define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_9832dc76:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_9832dc76:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+; Per 128-bit lane: the first two elements of %b swapped, then the first two
+; elements of %a swapped. On AVX1 the checks expect a single two-source
+; vshufps; on AVX2 one vpshufd per source and a vpblendd.
+define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_9810dc54:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_9810dc54:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,0,4,5,5,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+; Interleaves the low two elements of each 128-bit lane of %a and %b. The
+; checks expect a single vunpcklps on AVX1 and a single vpunpckldq on AVX2.
+define <8 x i32> @shuffle_v8i32_08194c5d(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08194c5d:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08194c5d:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x i32> %shuffle
+}
+
+; Interleaves the high two elements of each 128-bit lane of %a and %b. The
+; checks expect a single vunpckhps on AVX1 and a single vpunpckhdq on AVX2.
+define <8 x i32> @shuffle_v8i32_2a3b6e7f(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_2a3b6e7f:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_2a3b6e7f:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i32> %shuffle
+}
+
+; Interleaves the low four elements of %a with the low four elements of %b
+; across the whole 256-bit result (only the low lanes are read). On AVX1 the
+; checks expect xmm vunpckhps/vunpcklps joined by vinsertf128; on AVX2 one
+; vpermd per source followed by a vpblendd.
+define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08192a3b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08192a3b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i32> %shuffle
+}
+
+; Result positions 0 and 4 take elements 0 and 1 of %a; the rest come from the
+; low lane of %b (<8,9,9> and <10,11,11>). On AVX1 the checks expect four xmm
+; vshufps joined by vinsertf128; on AVX2 one vpermd per source followed by a
+; vpblendd.
+define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08991abb:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_08991abb:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+  ret <8 x i32> %shuffle
+}
+
+; Even result positions take elements 0..3 of %a; odd positions keep %b's own
+; element at that position, so only %a needs permuting. On AVX1 the checks
+; expect two xmm vpermilps joined by vinsertf128 and a vblendps; on AVX2 a
+; vpermd of %a followed by a vpblendd.
+define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_091b2d3f:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_091b2d3f:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x i32> %shuffle
+}
+
+; Result positions 0 and 4 take elements 0 and 1 of %a; every other position
+; keeps %b's own element. On AVX1 the checks expect an xmm vpermilps,
+; vinsertf128 and a vblendps; on AVX2 a vpermd of %a followed by a vpblendd.
+define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_09ab1def:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_09ab1def:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00014445:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00014445:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00204464:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00204464:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_03004744:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_03004744:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10005444:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10005444:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_22006644:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_22006644:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_33307774:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_33307774:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32107654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_32107654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00234467:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00234467:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00224466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00224466:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00224466:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10325476:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10325476:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_11335577(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_11335577:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_11335577:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10235467:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10235467:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10225466:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10225466:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00015444:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00015444:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00204644:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00204644:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_03004474:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_03004474:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10004444:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10004444:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_22006446:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_22006446:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_33307474:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_33307474:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32104567:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_32104567:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00236744:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00236744:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00226644:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00226644:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10324567:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_10324567:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_11334567:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_11334567:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01235467:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_01235467:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01235466:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_01235466:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_002u6u44:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_002u6u44:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00uu66uu:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_00uu66uu:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_103245uu:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_103245uu:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_1133uu67:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_1133uu67:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_0uu354uu:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_0uu354uu:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_uuu3uu66:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_uuu3uu66:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_6caa87e5:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_6caa87e5:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,4,2,2,0,u,6,u>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32103210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_32103210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_76547654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_76547654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_76543210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_76543210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_3210ba98:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_3210ba98:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,u,u,u,3,2,1,0>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_3210fedc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_3210fedc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_7654fedc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_7654fedc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_fedc7654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_fedc7654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_ba987654:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_ba987654:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_ba983210:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_ba983210:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x float> @splat_mem_v8f32_2(float* %p) {
+; ALL-LABEL: splat_mem_v8f32_2:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastss (%rdi), %ymm0
+; ALL-NEXT:    retq
+  %1 = load float* %p
+  %2 = insertelement <4 x float> undef, float %1, i32 0
+  %3 = shufflevector <4 x float> %2, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %3
+}
+
+define <8 x float> @splat_v8f32(<4 x float> %r) {
+; AVX1-LABEL: splat_v8f32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: splat_v8f32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
+  ret <8 x float> %1
+}

diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
new file mode 100644
index 0000000..8f87c7c
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll

@@ -0,0 +1,1429 @@
+; Shuffle tests on 512-bit vectors of eight 64-bit elements. Each function is
+; compiled twice with the experimental vector shuffle lowering enabled, once
+; with -mattr=+avx512f and once with -mattr=+avx512bw, and the output is
+; matched against the per-function expectations below.
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
+target triple = "x86_64-unknown-unknown"
+
+; Splat of %a[0] into all eight result elements; the mask never selects %b.
+; Expected as a single vbroadcastsd into zmm0.
+define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: every result element is %a[0] except element 6,
+; which is %a[1]. The two 256-bit halves are built separately and joined with
+; vinsertf64x4.
+define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000010:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: every result element is %a[0] except element 5,
+; which is %a[2].
+define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000200:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: every result element is %a[0] except element 4,
+; which is %a[3].
+define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00003000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: every result element is %a[0] except element 3,
+; which is %a[4]. The source element lives in the upper 256 bits of %a, so the
+; expected code starts with vextractf64x4.
+define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00040000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: every result element is %a[0] except element 2,
+; which is %a[5].
+define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00500000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: every result element is %a[0] except element 1,
+; which is %a[6].
+define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_06000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: result element 0 is %a[7]; the remaining seven
+; elements are %a[0].
+define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_70000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: the low four result elements are %a[0],%a[1]
+; repeated twice, and the high four are %a[4],%a[5] repeated twice.
+define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01014545:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: each of %a[0], %a[1], %a[2], %a[3] is duplicated
+; into two adjacent result elements. Only the low 256 bits of %a are read.
+define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00112233:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: the low four result elements are %a[0] and the
+; high four are %a[1].
+define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00001111:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x double> %shuffle
+}
+
+; Two-input blend with no element movement: even result elements come from %b
+; and odd result elements from %a, each at its own position. Expected as one
+; vblendpd per 256-bit half.
+define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_81a3c5e7:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Two-input shuffle alternating %a[0] (even result elements) with %b[0] (odd
+; result elements). Both 256-bit halves of the result are identical, so the
+; same ymm value is inserted into both halves.
+define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08080808:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+  ret <8 x double> %shuffle
+}
+
+; Two-input shuffle: the low four result elements alternate %a[0] with %b[0],
+; and the high four alternate %a[4] with %b[4].
+define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08084c4c:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+  ret <8 x double> %shuffle
+}
+
+; Two-input shuffle: result elements 0-1 are %b[0], elements 2-3 are
+; %a[2],%a[3], elements 4-5 are %b[4], and elements 6-7 are %a[6],%a[7].
+define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_8823cc67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Two-input shuffle producing %b[1],%b[0],%a[3],%a[2] in the low half and
+; %b[5],%b[4],%a[7],%a[6] in the high half (adjacent pairs swapped, first pair
+; of each half from %b, second pair from %a).
+define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_9832dc76:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+  ret <8 x double> %shuffle
+}
+
+; Two-input shuffle producing %b[1],%b[0],%a[1],%a[0] in the low half and
+; %b[5],%b[4],%a[5],%a[4] in the high half.
+define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_9810dc54:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,1,1,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Two-input interleave: the low half is %a[0],%b[0],%a[1],%b[1] and the high
+; half is %a[4],%b[4],%a[5],%b[5].
+define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08194c5d:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x double> %shuffle
+}
+
+; Two-input interleave: the low half is %a[2],%b[2],%a[3],%b[3] and the high
+; half is %a[6],%b[6],%a[7],%b[7].
+define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_2a3b6e7f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x double> %shuffle
+}
+
+; Two-input interleave of the low four elements of each operand:
+; %a[0],%b[0],%a[1],%b[1],%a[2],%b[2],%a[3],%b[3]. Only the low 256 bits of
+; %a and %b are read, so no vextractf64x4 appears in the expected code.
+define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08192a3b:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x double> %shuffle
+}
+
+; Two-input shuffle: the low half is %a[0],%b[0],%b[1],%b[1] and the high half
+; is %a[1],%b[2],%b[3],%b[3]. Only the low 256 bits of each operand are read.
+define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08991abb:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm0[1,0,2,2]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[0,2,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+  ret <8 x double> %shuffle
+}
+
+; Two-input shuffle: even result elements are %a[0],%a[1],%a[2],%a[3] in
+; order; odd result elements are %b[1],%b[3],%b[5],%b[7], each already at its
+; own position in %b.
+define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_091b2d3f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x double> %shuffle
+}
+
+; Two-input shuffle: result elements 0 and 4 are %a[0] and %a[1]; every other
+; result element i is %b[i] (taken from the same position in %b).
+define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_09ab1def:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that applies the same within-half pattern
+; [0,0,0,1] to each 256-bit half (indices 0,0,0,1 and 4,4,4,5).
+define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00014445:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that applies the same within-half pattern
+; [0,0,2,0] to each 256-bit half (indices 0,0,2,0 and 4,4,6,4).
+define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00204464:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that applies the same within-half pattern
+; [0,3,0,0] to each 256-bit half (indices 0,3,0,0 and 4,7,4,4).
+define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_03004744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that applies the same within-half pattern
+; [1,0,0,0] to each 256-bit half (indices 1,0,0,0 and 5,4,4,4).
+define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10005444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that applies the same within-half pattern
+; [2,2,0,0] to each 256-bit half (indices 2,2,0,0 and 6,6,4,4).
+define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_22006644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that applies the same within-half pattern
+; [3,3,3,0] to each 256-bit half (indices 3,3,3,0 and 7,7,7,4).
+define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_33307774:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that reverses the four elements within each
+; 256-bit half (indices 3,2,1,0 and 7,6,5,4).
+define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_32107654:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that applies the same within-half pattern
+; [0,0,2,3] to each 256-bit half (indices 0,0,2,3 and 4,4,6,7). Expected via
+; vpermilpd rather than vpermpd.
+define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00234467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that duplicates every even-indexed element into
+; the following odd position (indices 0,0,2,2,4,4,6,6).
+define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00224466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that swaps the two elements of every adjacent
+; pair (indices 1,0,3,2,5,4,7,6).
+define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10325476:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that duplicates every odd-indexed element into
+; the preceding even position (indices 1,1,3,3,5,5,7,7).
+define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_11335577:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that swaps only the first pair of each 256-bit
+; half and leaves the second pair in place (indices 1,0,2,3 and 5,4,6,7).
+define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a that applies the same within-half pattern
+; [1,0,2,2] to each 256-bit half (indices 1,0,2,2 and 5,4,6,6).
+define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10225466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with a different within-half pattern per 256-bit
+; half: [0,0,0,1] for the low half and [1,0,0,0] for the high half (indices
+; 5,4,4,4).
+define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00015444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with a different within-half pattern per 256-bit
+; half: [0,0,2,0] for the low half and [0,2,0,0] for the high half (indices
+; 4,6,4,4).
+define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00204644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with a different within-half pattern per 256-bit
+; half: [0,3,0,0] for the low half and [0,0,3,0] for the high half (indices
+; 4,4,7,4).
+define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_03004474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: the low half is %a[1],%a[0],%a[0],%a[0] and the
+; high half is a splat of %a[4], which the expected code produces with
+; vbroadcastsd on the extracted upper half.
+define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10004444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with a different within-half pattern per 256-bit
+; half: [2,2,0,0] for the low half and [2,0,0,2] for the high half (indices
+; 6,4,4,6).
+define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_22006446:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with a different within-half pattern per 256-bit
+; half: [3,3,3,0] for the low half and [3,0,3,0] for the high half (indices
+; 7,4,7,4).
+define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_33307474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: the low half is reversed (indices 3,2,1,0) and
+; the high half is passed through unchanged (indices 4,5,6,7), so the expected
+; code permutes only the low half.
+define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_32104567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with a different within-half pattern per 256-bit
+; half: [0,0,2,3] for the low half and [2,3,0,0] for the high half (indices
+; 6,7,4,4).
+define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00236744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with a different within-half pattern per 256-bit
+; half: [0,0,2,2] for the low half and [2,2,0,0] for the high half (indices
+; 6,6,4,4).
+define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00226644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: adjacent pairs are swapped in the low half
+; (indices 1,0,3,2) and the high half is passed through unchanged.
+define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10324567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: the low half duplicates its odd elements
+; (indices 1,1,3,3) and the high half is passed through unchanged.
+define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_11334567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: the low half is passed through unchanged and
+; the high half swaps its first pair (indices 5,4,6,7). The expected code
+; permutes only the extracted upper half and re-inserts it into zmm0.
+define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a: the low half is passed through unchanged and
+; the high half uses the within-half pattern [1,0,2,2] (indices 5,4,6,6).
+define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01235466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with undef mask entries ('u' in the name):
+; elements 0-1 are %a[0], element 2 is %a[2], element 4 is %a[6], elements 6-7
+; are %a[4]; result elements 3 and 5 are undef.
+define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_002u6u44:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with undef mask entries: elements 0-1 are %a[0]
+; and elements 4-5 are %a[6]; result elements 2, 3, 6 and 7 are undef.
+define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00uu66uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with undef mask entries: the low half swaps
+; adjacent pairs (indices 1,0,3,2), elements 4-5 are %a[4],%a[5], and result
+; elements 6 and 7 are undef. The expected code passes the upper half through.
+define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_103245uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with undef mask entries: the low half duplicates
+; its odd elements (indices 1,1,3,3), result elements 4 and 5 are undef, and
+; elements 6-7 are %a[6],%a[7]. The expected code passes the upper half through.
+define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_1133uu67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with undef mask entries: element 0 is %a[0],
+; element 3 is %a[3], elements 4-5 are %a[5],%a[4]; result elements 1, 2, 6
+; and 7 are undef. The expected code leaves the low half of zmm0 untouched.
+define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_0uu354uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+  ret <8 x double> %shuffle
+}
+
+; Single-input shuffle of %a with mostly undef mask entries: element 3 is
+; %a[3] and elements 6-7 are %a[6]; result elements 0, 1, 2, 4 and 5 are undef.
+define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_uuu3uu66:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+  ret <8 x double> %shuffle
+}
+
+; Irregular two-input shuffle mixing both halves of both operands. Result, in
+; order: %b[4],%a[3],%a[4],%b[0],%b[4],%b[5],%b[2],%a[0].
+define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_c348cda0:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[0,1],ymm2[0,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vbroadcastsd %xmm1, %ymm4
+; ALL-NEXT:    vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3]
+; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
+  ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_f511235a:
+; ALL:       # BB#0:
+; ALL-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermpd {{.*#+}} ymm4 = ymm3[0,1,1,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm1[0,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3]
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
+  ret <8 x double> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00000010:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00000200:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00003000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00040000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00500000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_06000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_70000000:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_01014545:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00112233:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00001111:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_81a3c5e7:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08080808:
+; ALL:       # BB#0:
+; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08084c4c:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vinserti128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
+; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_8823cc67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_9832dc76:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_9810dc54:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,0]
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08194c5d:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_2a3b6e7f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08192a3b:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_08991abb:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm1[0,2,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_091b2d3f:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_09ab1def:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00014445:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00204464:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_03004744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10005444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_22006644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_33307774:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_32107654:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00234467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00224466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10325476:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_11335577:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10225466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00015444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00204644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_03004474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10004444:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_22006446:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_33307474:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_32104567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00236744:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00226644:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_10324567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_11334567:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_01235467:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_01235466:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_002u6u44:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_00uu66uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_103245uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_1133uu67:
+; ALL:       # BB#0:
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_0uu354uu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_uuu3uu66:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
+; ALL-LABEL: shuffle_v8i64_6caa87e5:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[0,1,0,1]
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; ALL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT:    vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
+  ret <8 x i64> %shuffle
+}

diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index e60ecb7..22a6749 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll

@@ -1,6 +1,14 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+;
+; Verify that the DAG combiner correctly folds bitwise operations across
+; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
+; basic and always-safe patterns. Also test that the DAG combiner will combine
+; target-specific shuffle instructions where reasonable.
 
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
@@ -8,57 +16,72 @@
 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
 
 define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd1
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
-  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) 
+; ALL-LABEL: combine_pshufd1:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
   ret <4 x i32> %c
 }
 
 define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd2
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+; ALL-LABEL: combine_pshufd2:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
   %c.cast = bitcast <8 x i16> %c to <4 x i32>
-  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
   ret <4 x i32> %d
 }
 
 define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 
+; ALL-LABEL: combine_pshufd3:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
   %c.cast = bitcast <8 x i16> %c to <4 x i32>
-  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
   ret <4 x i32> %d
 }
 
 define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd4
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) 
+; SSE-LABEL: combine_pshufd4:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufd4:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; AVX-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
   %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
   %c.cast = bitcast <8 x i16> %c to <4 x i32>
-  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) 
+  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
   ret <4 x i32> %d
 }
 
 define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd5
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
-  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) 
+; SSE-LABEL: combine_pshufd5:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufd5:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX-NEXT:    retq
+entry:
+  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
   %b.cast = bitcast <4 x i32> %b to <8 x i16>
   %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
   %c.cast = bitcast <8 x i16> %c to <4 x i32>
@@ -67,53 +90,2458 @@
 }
 
 define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufd6
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufd $0
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: combine_pshufd6:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufd6:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT:    retq
+entry:
   %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
   %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
   ret <4 x i32> %c
 }
 
 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshuflw1
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
-  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 
-  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 
+; ALL-LABEL: combine_pshuflw1:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
+  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
   ret <8 x i16> %c
 }
 
 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshuflw2
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    retq
+; ALL-LABEL: combine_pshuflw2:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    retq
+entry:
   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
-  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) 
-  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
+  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
   ret <8 x i16> %d
 }
 
 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshuflw3
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: combine_pshuflw3:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshuflw3:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; AVX-NEXT:    retq
+entry:
   %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
-  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) 
-  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
+  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
   ret <8 x i16> %d
 }
 
 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
-; CHECK-SSE2-LABEL: @combine_pshufhw1
-; CHECK-SSE2:       # BB#0:
-; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT:    retq
+; SSE-LABEL: combine_pshufhw1:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufhw1:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX-NEXT:    retq
+entry:
   %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
-  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 
-  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) 
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
   ret <8 x i16> %d
 }
 
+define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test1:
+; SSE:       # BB#0:
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test2:
+; SSE:       # BB#0:
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test5:
+; SSE:       # BB#0:
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test6:
+; SSE:       # BB#0:
+; SSE-NEXT:    pxor %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+
+; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
+; are not performing a swizzle operation.
+
+define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test1b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test1b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test1b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test1b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test1b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test2b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test2b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test2b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test2b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test2b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test3b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test3b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm0
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test3b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test3b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test3b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test4b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test4b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    andps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test4b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test4b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test4b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test5b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test5b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    orps %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test5b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test5b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test5b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE2-LABEL: combine_bitwise_ops_test6b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    xorps %xmm1, %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test6b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    xorps %xmm1, %xmm0
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test6b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm1, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_bitwise_ops_test6b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_bitwise_ops_test6b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test1c:
+; SSE:       # BB#0:
+; SSE-NEXT:    andps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test1c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test2c:
+; SSE:       # BB#0:
+; SSE-NEXT:    orps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test2c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test3c:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorps %xmm1, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test3c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test4c:
+; SSE:       # BB#0:
+; SSE-NEXT:    andps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test4c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %and = and <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %and
+}
+
+define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test5c:
+; SSE:       # BB#0:
+; SSE-NEXT:    orps %xmm1, %xmm0
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test5c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %or = or <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %or
+}
+
+define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; SSE-LABEL: combine_bitwise_ops_test6c:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorps %xmm1, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_bitwise_ops_test6c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
+; AVX-NEXT:    retq
+  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+  %xor = xor <4 x i32> %shuf1, %shuf2
+  ret <4 x i32> %xor
+}
+
+define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test1:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test2:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test4:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test4:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test5:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test6:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test7:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test8:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test8:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test9:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test9:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test10:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test10:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; AVX-NEXT:    retq
+; %2 reads lanes 0 and 1 of %1, which both hold A1; the remaining lanes come
+; from the undef operand, so the composed result is <A1, undef, A1, undef>.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test11:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
+; AVX-NEXT:    retq
+; %2 reads only lanes 0 and 1 of %1 (A1 and A2); the %B elements in lanes 2
+; and 3 are dead, so the composed result is <A1, undef, A2, A1>.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test12:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test12:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test12:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+; %2 reads lanes 1 and 0 of %1, which both hold A0; the remaining lanes come
+; from the undef operand, so the composed result is <A0, undef, A0, undef>.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
+  ret <4 x i32> %2
+}
+
+; The following pair of shuffles is folded into vector %A.
+define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
+; ALL-LABEL: combine_nested_undef_test13:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+; The composed result is <undef, A1, A2, undef>. Every defined lane already
+; sits in the same position as in %A, so no shuffle instruction is expected.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
+  ret <4 x i32> %2
+}
+
+; The following pair of shuffles is folded into vector %B.
+define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test14:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+; The composed result is <B0, undef, B2, undef>. Every defined lane already
+; sits in the same position as in %B, so only a register copy is expected.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
+  ret <4 x i32> %2
+}
+
+
+; Verify that we don't optimize the following cases. We expect more than one shuffle.
+;
+; FIXME: Many of these already don't make sense, and the rest should stop
+; making sense with the new vector shuffle lowering. Revisit at least testing for
+; it.
+
+define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test15:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test15:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[3,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+; The composed result is <A3, B0, A0, A1>, which still needs elements from
+; both %A and %B, so it cannot become a shuffle of a single input.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: combine_nested_undef_test16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_nested_undef_test16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_nested_undef_test16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-NEXT:    retq
+; The composed result is <A2, B1, A0, B3>, which still needs elements from
+; both %A and %B, so it cannot become a shuffle of a single input.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test17:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test17:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+; The composed result is <A3, A1, B0, A1>, which still needs elements from
+; both %A and %B, so it cannot become a shuffle of a single input.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test18:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test18:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
+; AVX-NEXT:    retq
+; %2 never reads lane 2 of %1, the only lane taken from %A, so the composed
+; result is <B1, B1, B0, B3> and a single shuffle of %B is expected here.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test19:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test19:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,0]
+; AVX-NEXT:    retq
+; The composed result is <B1, A0, A0, A0>, which still needs elements from
+; both %A and %B, so it cannot become a shuffle of a single input.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test20:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test20:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT:    retq
+; The composed result is <B0, A2, A3, B0>, which still needs elements from
+; both %A and %B, so it cannot become a shuffle of a single input.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test21:
+; SSE:       # BB#0:
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test21:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX-NEXT:    retq
+; The composed result is <B0, A1, B0, A1>, which still needs elements from
+; both %A and %B, so it cannot become a shuffle of a single input.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+
+; Test that we correctly combine shuffles according to rule
+;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
+
+define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test22:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test22:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
+; AVX-NEXT:    retq
+; %2 reads only lanes 1 and 3 of %1, both taken from %B, so the composed
+; result is <B1, B1, B1, B3> and %A is never needed.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test23:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test23:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; AVX-NEXT:    retq
+; %2 reads lanes 0, 1 and 3 of %1, all taken from %B, so the composed result
+; is <B0, B1, B0, B3> and %A is never needed.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test24:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test24:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; AVX-NEXT:    retq
+; %2 never reads lane 1 of %1, the only lane taken from %A, so the composed
+; result is <B0, B3, B2, undef>.
+  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test25:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test25:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test25:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+; The inner shuffle takes %B as its first operand and %A as its second.
+; %2 reads only lanes 3 and 1 of %1 (A0 and A1), so the composed result is
+; <A0, A1, A0, A1>.
+  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test26:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test26:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    retq
+; The inner shuffle takes %B as its first operand and %A as its second.
+; %2 reads only lanes 2 and 3 of %1 (A2 and A3), so the composed result is
+; <A2, A3, A2, A3>.
+  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test27:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_nested_undef_test27:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_nested_undef_test27:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT:    retq
+; The inner shuffle takes %B as its first operand and %A as its second.
+; %2 reads only lanes 3 and 2 of %1 (A0 and A1), so the composed result is
+; <A0, A1, A0, A1>.
+  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test28:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_nested_undef_test28:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX-NEXT:    retq
+; The inner shuffle takes %B as its first operand and %A as its second.
+; %2 reads only lanes 2 and 3 of %1 (A0 and A1), so the composed result is
+; <A0, A1, A1, A0>.
+  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test1:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+; %1 is <b0, a1, b2, a3>; %2 keeps its lanes 0 and 2 and replaces lanes 1 and
+; 3 with b1 and b3, so the composed result is exactly %b.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test2:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test2:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test2:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+; %1 is <a0, b1, a2, b3>; %2 overwrites lane 2 with b2, so the composed
+; result is <a0, b1, b2, b3> (lane 0 from %a, the rest from %b).
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+; %1 is <a0, b1, a1, b3>; %2 selects %1[0], %1[2], b0, %1[1], so the composed
+; result is <a0, a1, b0, b1>, i.e. the low halves of %a and %b.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+; %1 is <a2, a3, b1, b1>; %2 selects b2, b3, %1[0], %1[1], so the composed
+; result is <b2, b3, a2, a3>, i.e. the high half of %b then that of %a.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test5:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test5:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm1, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test5:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT:    retq
+; %1 is <b0, a1, b2, a3>; %2 overwrites lane 3 with b3, so the composed
+; result is <b0, a1, b2, b3> (lane 1 from %a, the rest from %b).
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test6:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test6:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test6:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test6:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
+; AVX-NEXT:    retq
+; Integer variant of the masks used in combine_test1. %1 is <b0, a1, b2, a3>
+; and %2 replaces lanes 1 and 3 with b1 and b3, so the result is exactly %b.
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test7:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test7:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test7:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test7:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test7:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+; Integer variant of the masks used in combine_test2. %1 is <a0, b1, a2, b3>
+; and %2 overwrites lane 2 with b2, so the result is <a0, b1, b2, b3>.
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test8:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test8:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+; Integer variant of the masks used in combine_test3. The composed result is
+; <a0, a1, b0, b1>, i.e. the low halves of %a and %b.
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test9:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test9:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+; Integer variant of the masks used in combine_test4. The composed result is
+; <b2, b3, a2, a3>, i.e. the high half of %b then that of %a.
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test10:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test10:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm1, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test10:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test10:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test10:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT:    retq
+; Integer variant of the masks used in combine_test5. %1 is <b0, a1, b2, a3>
+; and %2 overwrites lane 3 with b3, so the result is <b0, a1, b2, b3>.
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %2
+}
+
+define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
+; ALL-LABEL: combine_test11:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+; %1 is <b0, a1, b2, a3>; %2 replaces lanes 0 and 2 with a0 and a2 (its second
+; operand is %a), so the composed result is exactly %a and no shuffle remains.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test12:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test12:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test12:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+; %1 is <a0, b1, b2, b3>; %2 rewrites lane 0 with a0 again (its second operand
+; is %a) and keeps the rest, so the composed result is still <a0, b1, b2, b3>.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test13:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test13:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+; %1 is <a0, a1, b0, b1>; %2 rewrites lanes 0 and 1 with a0 and a1 again (its
+; second operand is %a), so the composed result is still <a0, a1, b0, b1>.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test14:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+; %1 is <b2, b3, b1, b1>; %2 selects a2, a3 (its second operand is %a) and
+; then %1[0], %1[1], so the composed result is <a2, a3, b2, b3>.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test15:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test15:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm0, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test15:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test15:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT:    retq
+; %1 is <b0, a1, b2, b3>; %2 rewrites lane 1 with a1 again (its second operand
+; is %a) and keeps the rest, so the composed result is still <b0, a1, b2, b3>.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
+; ALL-LABEL: combine_test16:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+; Integer variant of the masks used in combine_test11. %2 replaces the two
+; %b lanes of %1 with a0 and a2, so the result is exactly %a.
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test17:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test17:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test17:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test17:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test17:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+; Integer variant of the masks used in combine_test12. The composed result is
+; <a0, b1, b2, b3> (lane 0 from %a, the rest from %b).
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test18:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test18:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+; Integer variant of the masks used in combine_test13. The composed result is
+; <a0, a1, b0, b1>, i.e. the low halves of %a and %b.
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test19:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test19:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+; Integer variant of the masks used in combine_test14. The composed result is
+; <a2, a3, b2, b3>, i.e. the high halves of %a and %b.
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: combine_test20:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test20:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm0, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test20:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test20:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test20:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT:    retq
+; Integer variant of the masks used in combine_test15. The composed result is
+; <b0, a1, b2, b3> (lane 1 from %a, the rest from %b).
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x i32> %2
+}
+
+
+; Check some negative cases.
+; FIXME: Do any of these really make sense? Are they redundant with the above tests?
+
+define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test1b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test1b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test1b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test1b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
+; AVX-NEXT:    retq
+; %1 is <b0, a1, b2, a3>; %2 selects %1[0], b1, %1[2], %1[0], none of which is
+; an %a lane, so the composed result is <b0, b1, b2, b0>.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test2b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test2b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test2b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; SSE41-NEXT:    movapd %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test2b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0,0]
+; AVX-NEXT:    retq
+; %1 is <b0, a1, b2, a3>; %2 selects %1[0], b1, %1[0], b1, so the composed
+; result is <b0, b1, b0, b1>, i.e. the low half of %b repeated.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_test3b:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm1, %xmm2
+; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test3b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
+; %1 is <a0, a0, b2, a3>; %2 selects %1[0], b3, %1[2], b3, so the composed
+; result is <a0, b3, b2, b3>, which needs elements from both %a and %b.
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_test4b:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test4b:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test4b:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test4b:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
+  ret <4 x float> %2
+}
+
+
+; Verify that we correctly fold shuffles even when we use illegal vector types.
+
+define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test1c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd (%rsi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    movss %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test1c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movd (%rsi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    movss %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test1c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test1c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX1-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test1c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX2-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+  %A = load <4 x i8>* %a
+  %B = load <4 x i8>* %b
+  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x i8> %2
+}
+
+define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test2c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd (%rdi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    movd (%rsi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test2c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd (%rdi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    movd (%rsi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test2c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm0
+; SSE41-NEXT:    pmovzxbd (%rsi), %xmm1
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test2c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %A = load <4 x i8>* %a
+  %B = load <4 x i8>* %b
+  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
+  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x i8> %2
+}
+
+define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test3c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd (%rsi), %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test3c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movd (%rsi), %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test3c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_test3c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %A = load <4 x i8>* %a
+  %B = load <4 x i8>* %b
+  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i8> %2
+}
+
+define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
+; SSE2-LABEL: combine_test4c:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd (%rdi), %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd (%rsi), %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_test4c:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd (%rdi), %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movd (%rsi), %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_test4c:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd (%rdi), %xmm1
+; SSE41-NEXT:    pmovzxbd (%rsi), %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: combine_test4c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX1-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test4c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovzxbd (%rdi), %xmm0
+; AVX2-NEXT:    vpmovzxbd (%rsi), %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT:    retq
+  %A = load <4 x i8>* %a
+  %B = load <4 x i8>* %b
+  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i8> %2
+}
+
+
+; The following test cases are generated from this C++ code
+;
+;__m128 blend_01(__m128 a, __m128 b)
+;{
+;  __m128 s = a;
+;  s = _mm_blend_ps( s, b, 1<<0 );
+;  s = _mm_blend_ps( s, b, 1<<1 );
+;  return s;
+;}
+;
+;__m128 blend_02(__m128 a, __m128 b)
+;{
+;  __m128 s = a;
+;  s = _mm_blend_ps( s, b, 1<<0 );
+;  s = _mm_blend_ps( s, b, 1<<2 );
+;  return s;
+;}
+;
+;__m128 blend_123(__m128 a, __m128 b)
+;{
+;  __m128 s = a;
+;  s = _mm_blend_ps( s, b, 1<<1 );
+;  s = _mm_blend_ps( s, b, 1<<2 );
+;  s = _mm_blend_ps( s, b, 1<<3 );
+;  return s;
+;}
+
+; Ideally, we should collapse the following shuffles into a single one.
+
+define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_blend_01:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_blend_01:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_blend_01:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_blend_01:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
+  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  ret <4 x float> %shuffle6
+}
+
+define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_blend_02:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm1, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_blend_02:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm1, %xmm0
+; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_blend_02:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_blend_02:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
+  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+  ret <4 x float> %shuffle6
+}
+
+define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_blend_123:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_blend_123:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movss %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_blend_123:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_blend_123:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
+  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
+  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle12
+}
+
+define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test_movhl_1:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test_movhl_1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test_movhl_2:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test_movhl_2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: combine_test_movhl_3:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_test_movhl_3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
+  ret <4 x i32> %2
+}
+
+
+; Verify that we fold shuffles according to rule:
+;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
+
+define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test1:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test1:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test2:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test3:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test4:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test5:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test5:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test5:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test5:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+
+; Verify that we fold shuffles according to rule:
+;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
+
+define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test6:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test7:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test7:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test7:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test7:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test8:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test8:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test8:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test8:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
+; SSE-LABEL: combine_undef_input_test9:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test9:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test10:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test11:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test11:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test11:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test11:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test12:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test12:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test13:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test13:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
+; SSE-LABEL: combine_undef_input_test14:
+; SSE:       # BB#0:
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test14:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: combine_undef_input_test15:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test15:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test15:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test15:
+; AVX:       # BB#0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+
+; Verify that shuffles are canonicalized according to rules:
+;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
+;
+; This allows to trigger the following combine rule:
+;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
+;
+; As a result, all the shuffle pairs in each function below should be
+; combined into a single legal shuffle operation.
+
+define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test16:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test17:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test17:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test17:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test17:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
+; SSE2-LABEL: combine_undef_input_test18:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: combine_undef_input_test18:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: combine_undef_input_test18:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test18:
+; AVX:       # BB#0:
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
+; SSE-LABEL: combine_undef_input_test19:
+; SSE:       # BB#0:
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_undef_input_test19:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
+; AVX-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x float> %2
+}
+
+define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
+; ALL-LABEL: combine_undef_input_test20:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
+  ret <4 x float> %2
+}
+
+; These tests are designed to test the ability to combine away unnecessary
+; operations feeding into a shuffle. The AVX cases are the important ones as
+; they leverage operations which cannot be done naturally on the entire vector
+; and thus are decomposed into multiple smaller operations.
+
+define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
+; SSE-LABEL: combine_unneeded_subvector1:
+; SSE:       # BB#0:
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_unneeded_subvector1:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_unneeded_subvector1:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i32> %c
+}
+
+define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: combine_unneeded_subvector2:
+; SSE:       # BB#0:
+; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: combine_unneeded_subvector2:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_unneeded_subvector2:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <7,6,5,4,u,u,u,u>
+; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    retq
+  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x i32> %d
+}
+
+define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps1:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_insertps1:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
+; AVX-NEXT:    retq
+
+  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
+  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
+  ret <4 x float> %d
+}
+
+define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps2:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_insertps2:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
+; AVX-NEXT:    retq
+
+  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
+  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
+  ret <4 x float> %d
+}
+
+define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps3:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_insertps3:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT:    retq
+
+  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
+  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
+  ret <4 x float> %d
+}
+
+define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
+; SSE41-LABEL: combine_insertps4:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: combine_insertps4:
+; AVX:       # BB#0:
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT:    retq
+
+  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
+  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
+  ret <4 x float> %d
+}

diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll
new file mode 100644
index 0000000..226deb0
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-sse1.ll

@@ -0,0 +1,235 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=-sse2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=SSE1
+
+target triple = "x86_64-unknown-unknown"
+
+define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0001:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0020:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0300:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_1000:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_2200:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_3330:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_3210:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0011:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_2233:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0022:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_1133:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_4zzz:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    movss %xmm0, %xmm1
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_z4zz:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_zz4z:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2]
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_zuu4:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_zzz7:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE1-LABEL: shuffle_v4f32_z6zz:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE1-LABEL: insert_reg_and_zero_v4f32:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    xorps %xmm1, %xmm1
+; SSE1-NEXT:    movss %xmm0, %xmm1
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; SSE1-LABEL: insert_mem_and_zero_v4f32:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movss (%rdi), %xmm0
+; SSE1-NEXT:    retq
+  %a = load float* %ptr
+  %v = insertelement <4 x float> undef, float %a, i32 0
+  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE1-LABEL: insert_mem_lo_v4f32:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movq (%rdi), %rax
+; SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT:    shrq $32, %rax
+; SSE1-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm1
+; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm2
+; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE1-NEXT:    xorps %xmm2, %xmm2
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1]
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; SSE1-NEXT:    movaps %xmm1, %xmm0
+; SSE1-NEXT:    retq
+  %a = load <2 x float>* %ptr
+  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE1-LABEL: insert_mem_hi_v4f32:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movq (%rdi), %rax
+; SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT:    shrq $32, %rax
+; SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%rsp)
+; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm1
+; SSE1-NEXT:    movss {{[-0-9]+}}(%rsp), %xmm2
+; SSE1-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE1-NEXT:    xorps %xmm2, %xmm2
+; SSE1-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1]
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1]
+; SSE1-NEXT:    retq
+  %a = load <2 x float>* %ptr
+  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
+; SSE1-LABEL: shuffle_mem_v4f32_3210:
+; SSE1:       # BB#0:
+; SSE1-NEXT:    movaps (%rdi), %xmm0
+; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE1-NEXT:    retq
+  %a = load <4 x float>* %ptr
+  %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x float> %shuffle
+}

diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
new file mode 100644
index 0000000..afd7a24
--- /dev/null
+++ b/test/CodeGen/X86/vector-zext.ll

@@ -0,0 +1,206 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_8i16_to_8i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i16_to_8i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i16_to_8i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_8i16_to_8i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_8i16_to_8i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxwd %xmm0, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %B = zext <8 x i16> %A to <8 x i32>
+  ret <8 x i32>%B
+}
+
+define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
+; SSE2-LABEL: zext_4i32_to_4i64:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_4i32_to_4i64:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_4i32_to_4i64:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxdq %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; SSE41-NEXT:    pand %xmm3, %xmm2
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT:    pand %xmm3, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_4i32_to_4i64:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_4i32_to_4i64:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxdq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %B = zext <4 x i32> %A to <4 x i64>
+  ret <4 x i64>%B
+}
+
+define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
+; SSE2-LABEL: zext_8i8_to_8i32:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_8i8_to_8i32:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_8i8_to_8i32:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxwd %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_8i8_to_8i32:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpmovzxwd %xmm0, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_8i8_to_8i32:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxwd %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %t = zext <8 x i8> %z to <8 x i32>
+  ret <8 x i32> %t
+}
+
+; PR17654
+define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
+; SSE2-LABEL: zext_16i8_to_16i16:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_16i8_to_16i16:
+; SSSE3:       # BB#0: # %entry
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    pand %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_16i8_to_16i16:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    pmovzxbw %xmm0, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_16i8_to_16i16:
+; AVX1:       # BB#0: # %entry
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_16i8_to_16i16:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vpmovzxbw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %t = zext <16 x i8> %z to <16 x i16>
+  ret <16 x i16> %t
+}

diff --git a/test/CodeGen/X86/vectorcall.ll b/test/CodeGen/X86/vectorcall.ll
new file mode 100644
index 0000000..1e52654
--- /dev/null
+++ b/test/CodeGen/X86/vectorcall.ll

@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=i686-pc-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; RUN: llc -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+
+; Test integer arguments.
+
+define x86_vectorcallcc i32 @test_int_1() {
+  ret i32 0
+}
+
+; CHECK-LABEL: {{^}}test_int_1@@0:
+; CHECK: xorl %eax, %eax
+
+define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
+  ret i32 %a
+}
+
+; X86-LABEL: {{^}}test_int_2@@4:
+; X64-LABEL: {{^}}test_int_2@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) {
+  %at = trunc i64 %a to i32
+  ret i32 %at
+}
+
+; X86-LABEL: {{^}}test_int_3@@8:
+; X64-LABEL: {{^}}test_int_3@@8:
+; CHECK: movl %ecx, %eax
+
+define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) {
+  %s = add i32 %a, %b
+  ret i32 %s
+}
+
+; X86-LABEL: {{^}}test_int_4@@8:
+; X86: leal (%ecx,%edx), %eax
+
+; X64-LABEL: {{^}}test_int_4@@16:
+; X64: leal (%rcx,%rdx), %eax
+
+define x86_vectorcallcc i32 @"\01test_int_5"(i32, i32) {
+  ret i32 0
+}
+; CHECK-LABEL: {{^}}test_int_5:
+
+define x86_vectorcallcc double @test_fp_1(double %a, double %b) {
+  ret double %b
+}
+; CHECK-LABEL: {{^}}test_fp_1@@16:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc double @test_fp_2(
+    double, double, double, double, double, double, double %r) {
+  ret double %r
+}
+; CHECK-LABEL: {{^}}test_fp_2@@56:
+; CHECK: movsd {{[0-9]+\(%[re]sp\)}}, %xmm0
+
+define x86_vectorcallcc {double, double, double, double} @test_fp_3() {
+  ret {double, double, double, double}
+        { double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_3@@0:
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+; FIXME: Returning via x87 isn't compatible, but it's hard to structure the
+; tablegen any other way.
+define x86_vectorcallcc {double, double, double, double, double} @test_fp_4() {
+  ret {double, double, double, double, double}
+        { double 0.0, double 0.0, double 0.0, double 0.0, double 0.0 }
+}
+; CHECK-LABEL: {{^}}test_fp_4@@0:
+; CHECK: fldz
+; CHECK: xorps %xmm0
+; CHECK: xorps %xmm1
+; CHECK: xorps %xmm2
+; CHECK: xorps %xmm3
+
+define x86_vectorcallcc <16 x i8> @test_vec_1(<16 x i8> %a, <16 x i8> %b) {
+  ret <16 x i8> %b
+}
+; CHECK-LABEL: {{^}}test_vec_1@@32:
+; CHECK: movaps %xmm1, %xmm0
+
+define x86_vectorcallcc <16 x i8> @test_vec_2(
+    double, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> %r) {
+  ret <16 x i8> %r
+}
+; CHECK-LABEL: {{^}}test_vec_2@@104:
+; CHECK: movaps (%{{[re]}}cx), %xmm0

diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll
new file mode 100644
index 0000000..0c0f4bb
--- /dev/null
+++ b/test/CodeGen/X86/vselect-avx.ll

@@ -0,0 +1,85 @@
+; RUN: llc %s -o - -mattr=+avx | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; For this test we used to optimize the <i1 true, i1 false, i1 false, i1 true>
+; mask into <i32 2147483648, i32 0, i32 0, i32 2147483648> because we thought
+; we would lower that into a blend where only the high bit is relevant.
+; However, since the whole mask is constant, this is simplified incorrectly
+; by the generic code, because it was expecting -1 in place of 2147483648.
+; 
+; The problem does not occur without AVX, because vselect of v4i32 is not legal
+; nor custom.
+;
+; <rdar://problem/18675020>
+
+; CHECK-LABEL: test:
+; CHECK: vmovdqa {{.*#+}} xmm0 = [65535,0,0,65535]
+; CHECK: vmovdqa {{.*#+}} xmm2 = [65533,124,125,14807]
+; CHECK: ret
+define void @test(<4 x i16>* %a, <4 x i16>* %b) {
+body:
+  %predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>
+  %predphi42 = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
+  store <4 x i16> %predphi, <4 x i16>* %a, align 8
+  store <4 x i16> %predphi42, <4 x i16>* %b, align 8
+  ret void
+}
+
+; Improve code coverage.
+;
+; When shrinking the condition used by the select to match a blend, this
+; test case exercises the path where the modified node is not the root
+; of the condition.
+;
+; CHECK-LABEL: test2:
+; CHECK:	vpslld	$31, %xmm0, %xmm0
+; CHECK-NEXT:	vpmovsxdq	%xmm0, %xmm1
+; CHECK-NEXT:	vpshufd	$78, %xmm0, %xmm0       ## xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:	vpmovsxdq	%xmm0, %xmm0
+; CHECK-NEXT:	vinsertf128	$1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]]
+; CHECK: vblendvpd	[[MASK]]
+; CHECK: retq
+define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
+bb:
+  %arrayidx1928 = getelementptr inbounds double** %call1559, i64 %indvars.iv4198
+  %tmp1888 = load double** %arrayidx1928, align 8
+  %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+  %tmp1900 = bitcast double* %tmp1888 to <4 x double>*
+  store <4 x double> %predphi.v.v, <4 x double>* %tmp1900, align 8
+  ret void
+}
+
+; For this test, we used to optimize the conditional mask for the blend, i.e.,
+; we shrunk some of its bits.
+; However, this same mask was used in another select (%predphi31) that turned out
+; to be optimized into an and. In that case, the conditional mask was wrong.
+;
+; Make sure that the and is fed by the original mask.
+; 
+; <rdar://problem/18819506>
+
+; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we
+; cannot express that SHRUNK_MASK must not be equal to ORIG_MASK. Otherwise,
+; even a faulty pattern would pass!
+;  
+; CHECK-LABEL: test3:
+; Compute the original mask.
+;	CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]]
+; Shrink the bit of the mask.
+; CHECK-NEXT: vpslld	$31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]]
+; Use the shrunk mask in the blend.
+; CHECK-NEXT:	vblendvps	[[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; Use the original mask in the and.
+; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}} 
+; CHECK: retq
+define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,  <4 x i16> %tmp3, <4 x i16> %tmp12) {
+  %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
+  %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
+  %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
+  %predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
+
+  store <4 x i16> %predphi31, <4 x i16>* %tmp16, align 8
+  store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8
+ ret void
+}

diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 42cf06a..3bd1dc4 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll

@@ -3,270 +3,253 @@
 ; Verify that we don't emit packed vector shifts instructions if the
 ; condition used by the vector select is a vector of constants.
 
-
 define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test1
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test2
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test3
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test4
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: movaps  %xmm1, %xmm0
-; CHECK: ret
-
 
 define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test5
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [0,65535,0,65535,0,65535,0,65535]
+; CHECK-NEXT:    andps %xmm0, %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test6
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test7
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 
 define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test8
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test9
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: movaps  %xmm1, %xmm0
-; CHECK-NEXT: ret
 
 define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test10
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm2 = <0,65535,65535,0,u,65535,65535,u>
+; CHECK-NEXT:    andps %xmm2, %xmm0
+; CHECK-NEXT:    andnps %xmm1, %xmm2
+; CHECK-NEXT:    orps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test11
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 false, i1 undef, i1 false, i1 false, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test12
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test13
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 ; Fold (vselect (build_vector AllOnes), N1, N2) -> N1
-
 define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test14
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: pcmpeq
-; CHECK: ret
 
 define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test15
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: pcmpeq
-; CHECK: ret
 
 ; Fold (vselect (build_vector AllZeros), N1, N2) -> N2
-
 define <4 x float> @test16(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 undef, i1 false, i1 undef>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
-} 
-; CHECK-LABEL: test16
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: ret 
+}
 
 define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
   ret <8 x i16> %1
 }
-; CHECK-LABEL: test17
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: ret
 
 define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test18
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test19
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1
 }
-; CHECK-LABEL: test20
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b
   ret <2 x i64> %1
 }
-; CHECK-LABEL: test21
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test22:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
-; CHECK-LABEL: test22
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test23:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %1
 }
-; CHECK-LABEL: test23
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test24:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1
 }
-; CHECK-LABEL: test24
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test25:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
   ret <2 x i64> %1
 }
-; CHECK-LABEL: test25
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) {
-; CHECK-LABEL: select_of_shuffles_0
-; CHECK-DAG: movlhps %xmm2, [[REGA:%xmm[0-9]+]]
-; CHECK-DAG: movlhps %xmm3, [[REGB:%xmm[0-9]+]]
-; CHECK: subps [[REGB]], [[REGA]]
+; CHECK-LABEL: select_of_shuffles_0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT:    subps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %2 = shufflevector <2 x float> %a1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
   %3 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %2, <4 x float> %1
@@ -276,3 +259,24 @@
   %7 = fsub <4 x float> %3, %6
   ret <4 x float> %7
 }
+
+; PR20677
+define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) {
+; CHECK-LABEL: select_illegal:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm7
+; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
+; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
+; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
+; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
+; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
+; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
+; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+  %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
+  ret <16 x double> %sel
+}

diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index d115929..e0b861f 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll

@@ -2,12 +2,12 @@
 ; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK: movl
-; CHECK: paddd
+; CHECK: paddw
 ; CHECK: movlpd
 
 ; Scheduler causes produce a different instruction order
 ; ATOM: movl
-; ATOM: paddd
+; ATOM: paddw
 ; ATOM: movlpd
 
 ; bitcast a v4i16 to v2i32

diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index 9f6778c..3f54ab6 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll

@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: paddq
+; CHECK: paddd
 
 ; truncate v2i64 to v2i32
 

diff --git a/test/CodeGen/X86/widen_conversions.ll b/test/CodeGen/X86/widen_conversions.ll
index 522ab47..8e5174f 100644
--- a/test/CodeGen/X86/widen_conversions.ll
+++ b/test/CodeGen/X86/widen_conversions.ll

@@ -9,7 +9,7 @@
 ; CHECK:      movd (%{{.*}}), %[[X:xmm[0-9]+]]
 ; CHECK-NEXT: pxor %[[Z:xmm[0-9]+]], %[[Z]]
 ; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
-; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT: punpcklwd %[[Z]], %[[X]]
 ; CHECK-NEXT: ret
 
   %val = load <4 x i8>* %ptr

diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 41bea85..0ec3574 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll

@@ -4,12 +4,12 @@
 ;
 
 %i32vec3 = type <3 x i32>
-; CHECK: add3i32
 define void @add3i32(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
+; CHECK-LABEL: add3i32:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    paddd   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    pextrd  $2, %[[R0]], 8(%{{.*}})
+; CHECK-NEXT:    movq    %[[R0]], (%{{.*}})
 	%a = load %i32vec3* %ap, align 16
 	%b = load %i32vec3* %bp, align 16
 	%x = add %i32vec3 %a, %b
@@ -17,15 +17,15 @@
 	ret void
 }
 
-; CHECK: add3i32_2
 define void @add3i32_2(%i32vec3*  sret %ret, %i32vec3* %ap, %i32vec3* %bp)  {
-; CHECK: movq
-; CHECK: pinsrd
-; CHECK: movq
-; CHECK: pinsrd
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
+; CHECK-LABEL: add3i32_2:
+; CHECK:         movq    (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    pinsrd  $2, 8(%{{.*}}), %[[R0]]
+; CHECK-NEXT:    movq    (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    pinsrd  $2, 8(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    paddd   %[[R0]], %[[R1]]
+; CHECK-NEXT:    pextrd  $2, %[[R1]], 8(%{{.*}})
+; CHECK-NEXT:    movq    %[[R1]], (%{{.*}})
 	%a = load %i32vec3* %ap, align 8
 	%b = load %i32vec3* %bp, align 8
 	%x = add %i32vec3 %a, %b
@@ -34,15 +34,15 @@
 }
 
 %i32vec7 = type <7 x i32>
-; CHECK: add7i32
 define void @add7i32(%i32vec7*  sret %ret, %i32vec7* %ap, %i32vec7* %bp)  {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
-; CHECK: movdqa
+; CHECK-LABEL: add7i32:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddd   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddd   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    pextrd  $2, %[[R1]], 24(%{{.*}})
+; CHECK-NEXT:    movq    %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i32vec7* %ap, align 16
 	%b = load %i32vec7* %bp, align 16
 	%x = add %i32vec7 %a, %b
@@ -50,18 +50,18 @@
 	ret void
 }
 
-; CHECK: add12i32
 %i32vec12 = type <12 x i32>
 define void @add12i32(%i32vec12*  sret %ret, %i32vec12* %ap, %i32vec12* %bp)  {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
+; CHECK-LABEL: add12i32:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  32(%{{.*}}), %[[R2:xmm[0-9]+]]
+; CHECK-NEXT:    paddd   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddd   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    paddd   32(%{{.*}}), %[[R2]]
+; CHECK-NEXT:    movdqa  %[[R2]], 32(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i32vec12* %ap, align 16
 	%b = load %i32vec12* %bp, align 16
 	%x = add %i32vec12 %a, %b
@@ -70,11 +70,17 @@
 }
 
 
-; CHECK: add3i16
 %i16vec3 = type <3 x i16>
 define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
-; CHECK: paddd
-; CHECK: ret
+; CHECK-LABEL: add3i16:
+; CHECK:         pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddd    %[[R0]], %[[R1]]
+; CHECK-NEXT:    movdqa   %[[R1]], %[[R0]]
+; CHECK-NEXT:    pshufb   {{.*}}, %[[R0]]
+; CHECK-NEXT:    pmovzxdq %[[R0]], %[[R0]]
+; CHECK-NEXT:    pextrw   $4, %[[R1]], 4(%{{.*}})
+; CHECK-NEXT:    movd     %[[R0]], (%{{.*}})
 	%a = load %i16vec3* %ap, align 16
 	%b = load %i16vec3* %bp, align 16
 	%x = add %i16vec3 %a, %b
@@ -82,11 +88,13 @@
 	ret void
 }
 
-; CHECK: add4i16
 %i16vec4 = type <4 x i16>
 define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
-; CHECK: paddd
-; CHECK: movq
+; CHECK-LABEL: add4i16:
+; CHECK:         movq    (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movq    (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddw   %[[R0]], %[[R1]]
+; CHECK-NEXT:    movq    %[[R1]], (%{{.*}})
 	%a = load %i16vec4* %ap, align 16
 	%b = load %i16vec4* %bp, align 16
 	%x = add %i16vec4 %a, %b
@@ -94,15 +102,15 @@
 	ret void
 }
 
-; CHECK: add12i16
 %i16vec12 = type <12 x i16>
 define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: movq
-; CHECK: movdqa
+; CHECK-LABEL: add12i16:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddw   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddw   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    movq    %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i16vec12* %ap, align 16
 	%b = load %i16vec12* %bp, align 16
 	%x = add %i16vec12 %a, %b
@@ -110,18 +118,18 @@
 	ret void
 }
 
-; CHECK: add18i16
 %i16vec18 = type <18 x i16>
 define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: movd
-; CHECK: movdqa
-; CHECK: movdqa
+; CHECK-LABEL: add18i16:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  32(%{{.*}}), %[[R2:xmm[0-9]+]]
+; CHECK-NEXT:    paddw   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddw   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    paddw   32(%{{.*}}), %[[R2]]
+; CHECK-NEXT:    movd    %[[R2]], 32(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i16vec18* %ap, align 16
 	%b = load %i16vec18* %bp, align 16
 	%x = add %i16vec18 %a, %b
@@ -130,11 +138,18 @@
 }
 
 
-; CHECK: add3i8
 %i8vec3 = type <3 x i8>
 define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
-; CHECK: paddd
-; CHECK: ret
+; CHECK-LABEL: add3i8:
+; CHECK:         pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddd    %[[R0]], %[[R1]]
+; CHECK-NEXT:    movdqa   %[[R1]], %[[R0]]
+; CHECK-NEXT:    pshufb   {{.*}}, %[[R0]]
+; CHECK-NEXT:    pmovzxwq %[[R0]], %[[R0]]
+; CHECK-NEXT:    pextrb   $8, %[[R1]], 2(%{{.*}})
+; CHECK-NEXT:    movd     %[[R0]], %e[[R2:[abcd]]]x
+; CHECK-NEXT:    movw     %[[R2]]x, (%{{.*}})
 	%a = load %i8vec3* %ap, align 16
 	%b = load %i8vec3* %bp, align 16
 	%x = add %i8vec3 %a, %b
@@ -142,17 +157,18 @@
 	ret void
 }
 
-; CHECK-LABEL: add31i8:
 %i8vec31 = type <31 x i8>
 define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddb
-; CHECK: paddb
-; CHECK: pextrb
-; CHECK: pextrw
-; CHECK: movq
-; CHECK: ret
+; CHECK-LABEL: add31i8:
+; CHECK:         movdqa  (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT:    paddb   (%{{.*}}), %[[R0]]
+; CHECK-NEXT:    paddb   16(%{{.*}}), %[[R1]]
+; CHECK-NEXT:    pextrb  $14, %[[R1]], 30(%{{.*}})
+; CHECK-NEXT:    pextrw  $6, %[[R1]], 28(%{{.*}})
+; CHECK-NEXT:    pextrd  $2, %[[R1]], 24(%{{.*}})
+; CHECK-NEXT:    movq    %[[R1]], 16(%{{.*}})
+; CHECK-NEXT:    movdqa  %[[R0]], (%{{.*}})
 	%a = load %i8vec31* %ap, align 16
 	%b = load %i8vec31* %bp, align 16
 	%x = add %i8vec31 %a, %b
@@ -161,14 +177,43 @@
 }
 
 
-; CHECK: rot
 %i8vec3pack = type { <3 x i8>, i8 }
-define %i8vec3pack  @rot() nounwind {
-; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
+define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
+; CHECK-LABEL: rot:
+; CHECK:         movdqa  {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa  {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
+; CHECK-NEXT:    pshufb  %[[SHUFFLE_MASK]], %[[CONSTANT0]]
+; CHECK-NEXT:    pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]]
+; CHECK-NEXT:    movd    %[[CONSTANT0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    movw    %[[R0]]x, (%[[PTR0:.*]])
+; CHECK-NEXT:    movb    $-98, 2(%[[PTR0]])
+; CHECK-NEXT:    movdqa  {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
+; CHECK-NEXT:    pshufb  %[[SHUFFLE_MASK]], %[[CONSTANT1]]
+; CHECK-NEXT:    pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]]
+; CHECK-NEXT:    movd    %[[CONSTANT1]], %e[[R1:[abcd]]]x
+; CHECK-NEXT:    movw    %[[R1]]x, (%[[PTR1:.*]])
+; CHECK-NEXT:    movb    $1, 2(%[[PTR1]])
+; CHECK-NEXT:    pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
+; CHECK-NEXT:    pand    {{.*}}, %[[X0]]
+; CHECK-NEXT:    pextrd  $1, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    shrl    %e[[R0]]x
+; CHECK-NEXT:    movd    %[[X0]], %e[[R1:[abcd]]]x
+; CHECK-NEXT:    shrl    %e[[R1]]x
+; CHECK-NEXT:    movd    %e[[R1]]x, %[[X1:xmm[0-9]+]]
+; CHECK-NEXT:    pinsrd  $1, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT:    pextrd  $2, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    shrl    %e[[R0]]x
+; CHECK-NEXT:    pinsrd  $2, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT:    pextrd  $3, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    pinsrd  $3, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT:    movdqa  %[[X1]], %[[X2:xmm[0-9]+]]
+; CHECK-NEXT:    pshufb  %[[SHUFFLE_MASK]], %[[X2]]
+; CHECK-NEXT:    pmovzxwq %[[X2]], %[[X3:xmm[0-9]+]]
+; CHECK-NEXT:    pextrb  $8, %[[X1]], 2(%{{.*}})
+; CHECK-NEXT:    movd    %[[X3]], %e[[R0:[abcd]]]x
+; CHECK-NEXT:    movw    %[[R0]]x, (%{{.*}})
+
 entry:
-  %X = alloca %i8vec3pack, align 4
-  %rot = alloca %i8vec3pack, align 4
-  %result = alloca %i8vec3pack, align 4
   %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
   store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
   %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
@@ -180,7 +225,6 @@
   %shr = lshr <3 x i8> %extractVec, %extractVec3
   %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
   store <3 x i8> %shr, <3 x i8>* %storetmp4
-  %tmp5 = load %i8vec3pack* %result
-  ret %i8vec3pack %tmp5
+  ret void
 }
 

diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index a355b75..70fdbb7 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll

@@ -1,43 +1,56 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 
+target triple = "x86_64-unknown-unknown"
+
 ; widening shuffle v3float and then a add
 define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
-entry:
 ; CHECK-LABEL: shuf:
-; CHECK: extractps
-; CHECK: extractps
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    extractps $2, %xmm0, 8(%eax)
+; CHECK-NEXT:    extractps $1, %xmm0, 4(%eax)
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+entry:
 	%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 1, i32 2>
 	%val = fadd <3 x float> %x, %src2
 	store <3 x float> %val, <3 x float>* %dst.addr
 	ret void
-; CHECK: ret
 }
 
 
 ; widening shuffle v3float with a different mask and then a add
 define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
-entry:
 ; CHECK-LABEL: shuf2:
-; CHECK: extractps
-; CHECK: extractps
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    extractps $2, %xmm0, 8(%eax)
+; CHECK-NEXT:    extractps $1, %xmm0, 4(%eax)
+; CHECK-NEXT:    movss %xmm0, (%eax)
+; CHECK-NEXT:    retl
+entry:
 	%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 4, i32 2>
 	%val = fadd <3 x float> %x, %src2
 	store <3 x float> %val, <3 x float>* %dst.addr
 	ret void
-; CHECK: ret
 }
 
 ; Example of when widening a v3float operation causes the DAG to replace a node
 ; with the operation that we are currently widening, i.e. when replacing
 ; opA with opB, the DAG will produce new operations with opA.
 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
-entry:
 ; CHECK-LABEL: shuf3:
-; CHECK-NOT: movlhps
-; CHECK-NOT: shufps
-; CHECK: pshufd
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; CHECK-NEXT:    movaps %xmm1, (%eax)
+; CHECK-NEXT:    retl
+entry:
   %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 
+  %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
   %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %tmp3.i13 = shufflevector <4 x float> %tmp1.i.i, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; <<3 x float>>
   %tmp6.i14 = shufflevector <3 x float> %tmp3.i13, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -45,27 +58,35 @@
   %tmp2.i18 = shufflevector <3 x float> %tmp97.i, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
   %t5 = bitcast <4 x float> %tmp2.i18 to <4 x i32>
   %shr.i.i19 = lshr <4 x i32> %t5, <i32 19, i32 19, i32 19, i32 19>
-  %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080> 
+  %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080>
   %shuffle.i.i.i21 = shufflevector <4 x float> %tmp2.i18, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
   store <4 x float> %shuffle.i.i.i21, <4 x float>* %dst
   ret void
-; CHECK: ret
 }
 
 ; PR10421: make sure we correctly handle extreme widening with CONCAT_VECTORS
 define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
 ; CHECK-LABEL: shuf4:
-; CHECK-NOT: punpckldq
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    pshufb %xmm2, %xmm1
+; CHECK-NEXT:    pshufb %xmm2, %xmm0
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retl
   %vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i8> %vshuf
-; CHECK: ret
 }
 
 ; PR11389: another CONCAT_VECTORS case
 define void @shuf5(<8 x i8>* %p) nounwind {
 ; CHECK-LABEL: shuf5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = <4,33,u,u,u,u,u,u>
+; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    movlpd %xmm0, (%eax)
+; CHECK-NEXT:    retl
   %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   store <8 x i8> %v, <8 x i8>* %p, align 8
   ret void
-; CHECK: ret
 }

diff --git a/test/CodeGen/X86/win32-pic-jumptable.ll b/test/CodeGen/X86/win32-pic-jumptable.ll
new file mode 100644
index 0000000..cabd36a
--- /dev/null
+++ b/test/CodeGen/X86/win32-pic-jumptable.ll

@@ -0,0 +1,36 @@
+; RUN: llc < %s -relocation-model=pic | FileCheck %s
+
+; CHECK:        calll L0$pb
+; CHECK-NEXT: L0$pb:
+; CHECK-NEXT:   popl %eax
+; CHECK-NEXT:   addl LJTI0_0(,%ecx,4), %eax
+; CHECK-NEXT:   jmpl *%eax
+
+; CHECK:      LJTI0_0:
+; CHECK-NEXT:   .long LBB0_4-L0$pb
+; CHECK-NEXT:   .long LBB0_5-L0$pb
+; CHECK-NEXT:   .long LBB0_6-L0$pb
+; CHECK-NEXT:   .long LBB0_7-L0$pb
+
+
+target triple = "i686--windows-itanium"
+define i32 @f(i64 %x) {
+bb0:
+  switch i64 %x, label %bb5 [
+    i64 1, label %bb1
+    i64 2, label %bb2
+    i64 3, label %bb3
+    i64 4, label %bb4
+  ]
+bb1:
+  br label %bb5
+bb2:
+  br label %bb5
+bb3:
+  br label %bb5
+bb4:
+  br label %bb5
+bb5:
+  %y = phi i32 [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ]
+  ret i32 %y
+}

diff --git a/test/CodeGen/X86/win64_call_epi.ll b/test/CodeGen/X86/win64_call_epi.ll
new file mode 100644
index 0000000..bc73ad4
--- /dev/null
+++ b/test/CodeGen/X86/win64_call_epi.ll

@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN64
+
+declare void @bar()
+declare void @baz()
+declare i32 @personality(...)
+
+; Check for 'nop' between the last call and the epilogue.
+define void @foo1() {
+
+    invoke void @bar()
+        to label %normal
+        unwind label %catch
+
+normal:
+    ret void
+
+catch:
+    %1 = landingpad { i8*, i32 } personality i32 (...)* @personality cleanup
+    resume { i8*, i32 } %1
+}
+; WIN64-LABEL: foo1:
+; WIN64: .seh_proc foo1
+; WIN64: callq bar
+; WIN64: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
+; Check for 'ud2' after noreturn call
+; WIN64: callq _Unwind_Resume
+; WIN64-NEXT: ud2
+; WIN64: .seh_endproc
+
+
+; Check it still works when blocks are reordered.
+@something = global i32 0
+define void @foo2(i1 zeroext %cond ) {
+    br i1 %cond, label %a, label %b, !prof !0
+a:
+    call void @bar()
+    br label %done
+b:
+    call void @baz()
+    store i32 0, i32* @something
+    br label %done
+done:
+    ret void
+}
+!0 = metadata !{metadata !"branch_weights", i32 100, i32 0}
+; WIN64-LABEL: foo2:
+; WIN64: callq bar
+; WIN64: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
+
+
+; Check nop is not emitted when call is not adjacent to epilogue.
+define i32 @foo3() {
+    call void @bar()
+    ret i32 0
+}
+; WIN64-LABEL: foo3:
+; WIN64: callq bar
+; WIN64: xorl
+; WIN64-NOT: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq

diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll
index 1a51b2a..8d7f201 100644
--- a/test/CodeGen/X86/win64_vararg.ll
+++ b/test/CodeGen/X86/win64_vararg.ll

@@ -111,3 +111,22 @@
   %tmp = va_arg i8** %ap, i32
   ret i32 %tmp
 }
+
+define void @sret_arg(i32* sret %agg.result, i8* nocapture readnone %format, ...) {
+entry:
+  %ap = alloca i8*
+  %ap_i8 = bitcast i8** %ap to i8*
+  call void @llvm.va_start(i8* %ap_i8)
+  %tmp = va_arg i8** %ap, i32
+  store i32 %tmp, i32* %agg.result
+  ret void
+}
+; CHECK-LABEL: sret_arg:
+; CHECK: pushq
+; CHECK-DAG: movq %r9, 40(%rsp)
+; CHECK-DAG: movq %r8, 32(%rsp)
+; CHECK: movl 32(%rsp), %[[tmp:[^ ]*]]
+; CHECK: movl %[[tmp]], (%[[sret:[^ ]*]])
+; CHECK: movq %[[sret]], %rax
+; CHECK: popq
+; CHECK: retq

diff --git a/test/CodeGen/X86/win_cst_pool.ll b/test/CodeGen/X86/win_cst_pool.ll
new file mode 100644
index 0000000..e8b853a
--- /dev/null
+++ b/test/CodeGen/X86/win_cst_pool.ll

@@ -0,0 +1,66 @@
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define double @double() {
+  ret double 0x0000000000800000
+}
+; CHECK:              .globl  __real@0000000000800000
+; CHECK-NEXT:         .section        .rdata,"rd",discard,__real@0000000000800000
+; CHECK-NEXT:         .align  8
+; CHECK-NEXT: __real@0000000000800000:
+; CHECK-NEXT:         .quad   8388608
+; CHECK:      double:
+; CHECK:               movsd   __real@0000000000800000(%rip), %xmm0
+; CHECK-NEXT:          ret
+
+define <4 x i32> @vec1() {
+  ret <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+}
+; CHECK:              .globl  __xmm@00000000000000010000000200000003
+; CHECK-NEXT:         .section        .rdata,"rd",discard,__xmm@00000000000000010000000200000003
+; CHECK-NEXT:         .align  16
+; CHECK-NEXT: __xmm@00000000000000010000000200000003:
+; CHECK-NEXT:         .long   3
+; CHECK-NEXT:         .long   2
+; CHECK-NEXT:         .long   1
+; CHECK-NEXT:         .long   0
+; CHECK:      vec1:
+; CHECK:               movaps  __xmm@00000000000000010000000200000003(%rip), %xmm0
+; CHECK-NEXT:          ret
+
+define <8 x i16> @vec2() {
+  ret <8 x i16> <i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>
+}
+; CHECK:             .globl  __xmm@00000001000200030004000500060007
+; CHECK-NEXT:        .section        .rdata,"rd",discard,__xmm@00000001000200030004000500060007
+; CHECK-NEXT:        .align  16
+; CHECK-NEXT: __xmm@00000001000200030004000500060007:
+; CHECK-NEXT:        .short  7
+; CHECK-NEXT:        .short  6
+; CHECK-NEXT:        .short  5
+; CHECK-NEXT:        .short  4
+; CHECK-NEXT:        .short  3
+; CHECK-NEXT:        .short  2
+; CHECK-NEXT:        .short  1
+; CHECK-NEXT:        .short  0
+; CHECK:      vec2:
+; CHECK:               movaps  __xmm@00000001000200030004000500060007(%rip), %xmm0
+; CHECK-NEXT:          ret
+
+
+define <4 x float> @undef1() {
+  ret <4 x float> <float 1.0, float 1.0, float undef, float undef>
+
+; CHECK:             .globl  __xmm@00000000000000003f8000003f800000
+; CHECK-NEXT:        .section        .rdata,"rd",discard,__xmm@00000000000000003f8000003f800000
+; CHECK-NEXT:        .align  16
+; CHECK-NEXT: __xmm@00000000000000003f8000003f800000:
+; CHECK-NEXT:        .long   1065353216              # float 1
+; CHECK-NEXT:        .long   1065353216              # float 1
+; CHECK-NEXT:        .zero   4
+; CHECK-NEXT:        .zero   4
+; CHECK:      undef1:
+; CHECK:               movaps  __xmm@00000000000000003f8000003f800000(%rip), %xmm0
+; CHECK-NEXT:          ret
+}

diff --git a/test/CodeGen/X86/windows-itanium-alloca.ll b/test/CodeGen/X86/windows-itanium-alloca.ll
new file mode 100644
index 0000000..0a06cde
--- /dev/null
+++ b/test/CodeGen/X86/windows-itanium-alloca.ll

@@ -0,0 +1,16 @@
+; RUN: llc -mtriple i686-windows-itanium -filetype asm -o - %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686--windows-itanium"
+
+declare void @external(i8*)
+
+define dllexport void @alloca(i32 %sz) {
+entry:
+  %vla = alloca i8, i32 %sz, align 1
+  call void @external(i8* %vla)
+  ret void
+}
+
+; CHECK: __chkstk
+

diff --git a/test/CodeGen/X86/x32-function_pointer-1.ll b/test/CodeGen/X86/x32-function_pointer-1.ll
new file mode 100644
index 0000000..2baf92a
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-1.ll

@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32  | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test for x32 function pointer tail call
+
+@foo1 = external global void (i8*)*
+@foo2 = external global void (i8*)*
+
+define void @bar(i8* %h) nounwind uwtable {
+entry:
+  %0 = load void (i8*)** @foo1, align 4
+; CHECK: movl	foo1(%rip), %e{{[^,]*}}
+  tail call void %0(i8* %h) nounwind
+; CHECK: callq	*%r{{[^,]*}}
+  %1 = load void (i8*)** @foo2, align 4
+; CHECK: movl	foo2(%rip), %e{{[^,]*}}
+  tail call void %1(i8* %h) nounwind
+; CHECK: jmpq	*%r{{[^,]*}}
+  ret void
+}

diff --git a/test/CodeGen/X86/x32-function_pointer-2.ll b/test/CodeGen/X86/x32-function_pointer-2.ll
new file mode 100644
index 0000000..f727d41
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-2.ll

@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test calling a function pointer passed as a function argument
+;
+; void bar (void * h, void (*foo) (void *))
+;    {
+;      foo (h);
+;      foo (h);
+;    }
+
+
+define void @bar(i8* %h, void (i8*)* nocapture %foo) nounwind {
+entry:
+  tail call void %foo(i8* %h) nounwind
+; CHECK: mov{{l|q}}	%{{e|r}}si, %{{e|r}}[[REG:.*]]{{d?}}
+; CHECK: callq	*%r[[REG]]
+  tail call void %foo(i8* %h) nounwind
+; CHECK: jmpq	*%r{{[^,]*}}
+  ret void
+}

diff --git a/test/CodeGen/X86/x32-function_pointer-3.ll b/test/CodeGen/X86/x32-function_pointer-3.ll
new file mode 100644
index 0000000..5eaf85d
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-3.ll

@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test calling function pointer passed in struct
+
+;    The function argument `h' in
+
+;    struct foo {
+;      void (*f) (void);
+;      int i;
+;    };
+;    void
+;    bar (struct foo h)
+;    {
+;      h.f ();
+;    }
+
+;    is passed in the 64-bit %rdi register.  The `f' field is in the lower 32
+;    bits of %rdi register and the `i' field is in the upper 32 bits of %rdi
+;    register.  We need to zero-extend %edi to %rdi before branching via %rdi.
+
+define void @bar(i64 %h.coerce) nounwind {
+entry:
+  %h.sroa.0.0.extract.trunc = trunc i64 %h.coerce to i32
+  %0 = inttoptr i32 %h.sroa.0.0.extract.trunc to void ()*
+; CHECK: movl	%edi, %e[[REG:.*]]
+  tail call void %0() nounwind
+; CHECK: jmpq	*%r[[REG]]
+  ret void
+}

diff --git a/test/CodeGen/X86/x86-64-call.ll b/test/CodeGen/X86/x86-64-call.ll
new file mode 100644
index 0000000..300f8d1
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-call.ll

@@ -0,0 +1,15 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux-gnux32 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-pc-linux -verify-machineinstrs | FileCheck %s -check-prefix=IA32
+
+; trivial test for correct call suffix
+
+define i32 @far() nounwind uwtable {
+entry:
+; CHECK: callq foo
+; IA32: calll foo
+  tail call void @foo() nounwind
+  ret i32 0
+}
+
+declare void @foo()

diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll
index da8082b..8790fa6 100644
--- a/test/CodeGen/X86/x86-64-pic-10.ll
+++ b/test/CodeGen/X86/x86-64-pic-10.ll

@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
 ; RUN: grep "callq	g@PLT" %t1
 
-@g = alias weak i32 ()* @f
+@g = weak alias i32 ()* @f
 
 define void @h() {
 entry:

diff --git a/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
new file mode 100644
index 0000000..c476ffd
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll

@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
+; RUN: llc -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s
+
+; x32 uses %esp, %ebp as stack and frame pointers
+
+; CHECK-LABEL: foo
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: movq %rdi, -8(%rbp)
+; CHECK: popq %rbp
+; X32ABI-LABEL: foo
+; X32ABI: pushq %rbp
+; X32ABI: movl %esp, %ebp
+; X32ABI: movl %edi, -4(%ebp)
+; X32ABI: popq %rbp
+; NACL-LABEL: foo
+; NACL: pushq %rbp
+; NACL: movq %rsp, %rbp
+; NACL: movl %edi, -4(%rbp)
+; NACL: popq %rbp
+
+
+define void @foo(i32* %a) #0 {
+entry:
+  %a.addr = alloca i32*, align 4
+  %b = alloca i32*, align 4
+  store i32* %a, i32** %a.addr, align 4
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"}
+
+

diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll
index 641786f..2879fb4 100644
--- a/test/CodeGen/X86/x86-64-tls-1.ll
+++ b/test/CodeGen/X86/x86-64-tls-1.ll

@@ -1,10 +1,9 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 @tm_nest_level = internal thread_local global i32 0
 define i64 @z() nounwind {
-; FIXME: The codegen here is primitive at best and could be much better.
-; The add and the moves can be folded together.
-; CHECK-DAG: movq    $tm_nest_level@TPOFF, %rcx
-; CHECK-DAG: movq    %fs:0, %rax
-; CHECK: addl    %ecx, %eax
+; CHECK:      movq    $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x
+; CHECK-NEXT: addl    %fs:0, %e[[R0]]x
+; CHECK-NEXT: andq    $100, %r[[R0]]x
+
   ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
 }

diff --git a/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll
new file mode 100644
index 0000000..fcf7eae
--- /dev/null
+++ b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll

@@ -0,0 +1,35 @@
+; RUN: llc  -mtriple=x86_64-apple-macosx10.9.0  -mcpu=core2 -mattr=+64bit,+sse2 < %s | FileCheck %s
+
+; DAGCombine may choose to rewrite 2 loads feeding a select as a select of
+; addresses feeding a load. This test ensures that when it does that it creates
+; a load with alignment equivalent to the most restrictive source load.
+
+declare void @sink(<2 x double>)
+
+define void @test1(i1 %cmp) align 2 {
+  %1 = alloca  <2 x double>, align 16
+  %2 = alloca  <2 x double>, align 8
+
+  %val = load <2 x double>* %1, align 16
+  %val2 = load <2 x double>* %2, align 8
+  %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2
+  call void @sink(<2 x double> %val3)
+  ret void
+  ; CHECK: test1
+  ; CHECK: movups
+  ; CHECK: ret
+}
+
+define void @test2(i1 %cmp) align 2 {
+  %1 = alloca  <2 x double>, align 16
+  %2 = alloca  <2 x double>, align 8
+
+  %val = load <2 x double>* %1, align 16
+  %val2 = load <2 x double>* %2, align 16
+  %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2
+  call void @sink(<2 x double> %val3)
+  ret void
+  ; CHECK: test2
+  ; CHECK: movaps
+  ; CHECK: ret
+}

diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
new file mode 100644
index 0000000..4317d8a
--- /dev/null
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll

@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: LCPI0_0:
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-LABEL: foo:
+; CHECK: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0
+; CHECK-NEXT: retq
+
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %result = sitofp <4 x i32> %ext to <4 x float>
+  ret <4 x float> %result
+}
+
+; Make sure the operation doesn't try to get folded when the sizes don't match,
+; as that ends up crashing later when trying to form a bitcast operation for
+; the folded nodes.
+define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind {
+; CHECK-LABEL: LCPI1_0:
+; CHECK-NEXT: .long 1                       ## 0x1
+; CHECK-NEXT: .long 1                       ## 0x1
+; CHECK-NEXT: .long 1                       ## 0x1
+; CHECK-NEXT: .long 1                       ## 0x1
+; CHECK-LABEL: foo1:
+;   FIXME: The operation gets scalarized. If/when the compiler learns to better
+;          use [V]CVTDQ2PD, this will need to be updated.
+; CHECK: cvtsi2sdq
+; CHECK: cvtsi2sdq
+; CHECK: cvtsi2sdq
+; CHECK: cvtsi2sdq
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %result = sitofp <4 x i32> %ext to <4 x double>
+  store <4 x double> %result, <4 x double>* %p
+  ret void
+}
+
+; Also test the general purpose constant folding of int->fp.
+define void @foo2(<4 x float>* noalias %result) nounwind {
+; CHECK-LABEL: LCPI2_0:
+; CHECK-NEXT: .long 1082130432              ## float 4.000000e+00
+; CHECK-NEXT: .long 1084227584              ## float 5.000000e+00
+; CHECK-NEXT: .long 1086324736              ## float 6.000000e+00
+; CHECK-NEXT: .long 1088421888              ## float 7.000000e+00
+; CHECK-LABEL: foo2:
+; CHECK:  movaps LCPI2_0(%rip), %xmm0
+
+  %val = uitofp <4 x i32> <i32 4, i32 5, i32 6, i32 7> to <4 x float>
+  store <4 x float> %val, <4 x float>* %result
+  ret void
+}
+
+; Fold explicit AND operations when the constant isn't a splat of a single
+; scalar value like what the zext creates.
+define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: LCPI3_0:
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 0                       ## 0x0
+; CHECK-NEXT: .long 1065353216              ## 0x3f800000
+; CHECK-NEXT: .long 0                       ## 0x0
+; CHECK-LABEL: foo3:
+; CHECK: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: andps LCPI3_0(%rip), %xmm0
+  %cmp = fcmp oeq <4 x float> %val, %test
+  %ext = zext <4 x i1> %cmp to <4 x i32>
+  %and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258>
+  %result = sitofp <4 x i32> %and to <4 x float>
+  ret <4 x float> %result
+}

diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
index f078631..54a4d6aa 100644
--- a/test/CodeGen/X86/xaluo.ll
+++ b/test/CodeGen/X86/xaluo.ll

@@ -1,7 +1,5 @@
-; RUN: llc -mtriple=x86_64-darwin-unknown < %s                             | FileCheck %s --check-prefix=DAG
-; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
-; RUN: llc -mtriple=x86_64-darwin-unknown < %s                             | FileCheck %s
-; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin-unknown                             < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
+; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
 
 ;
 ; Get the actual value of the overflow bit.
@@ -9,12 +7,9 @@
 ; SADDO reg, reg
 define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) {
 entry:
-; DAG-LABEL:    saddo.i8
-; DAG:          addb %sil, %dil
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i8
-; FAST:         addb %sil, %dil
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.i8
+; CHECK:       addb %sil, %dil
+; CHECK-NEXT:  seto %al
   %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -24,12 +19,9 @@
 
 define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) {
 entry:
-; DAG-LABEL:    saddo.i16
-; DAG:          addw %si, %di
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i16
-; FAST:         addw %si, %di
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.i16
+; CHECK:       addw %si, %di
+; CHECK-NEXT:  seto %al
   %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -39,12 +31,9 @@
 
 define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    saddo.i32
-; DAG:          addl %esi, %edi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i32
-; FAST:         addl %esi, %edi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.i32
+; CHECK:       addl %esi, %edi
+; CHECK-NEXT:  seto %al
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -54,12 +43,9 @@
 
 define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64
-; DAG:          addq %rsi, %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i64
-; FAST:         addq %rsi, %rdi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.i64
+; CHECK:       addq %rsi, %rdi
+; CHECK-NEXT:  seto %al
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -67,16 +53,48 @@
   ret i1 %obit
 }
 
-; SADDO reg, imm | imm, reg
-; FIXME: INC isn't supported in FastISel yet
-define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
+; SADDO reg, 1 | INC
+define zeroext i1 @saddo.inc.i8(i8 %v1, i8* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm1
-; DAG:          incq %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i64imm1
-; FAST:         addq $1, %rdi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: saddo.inc.i8
+; CHECK:       incb %dil
+; CHECK-NEXT:  seto %al
+  %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 1)
+  %val = extractvalue {i8, i1} %t, 0
+  %obit = extractvalue {i8, i1} %t, 1
+  store i8 %val, i8* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.inc.i16(i16 %v1, i16* %res) {
+entry:
+; CHECK-LABEL: saddo.inc.i16
+; CHECK:       incw %di
+; CHECK-NEXT:  seto %al
+  %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 1)
+  %val = extractvalue {i16, i1} %t, 0
+  %obit = extractvalue {i16, i1} %t, 1
+  store i16 %val, i16* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.inc.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL: saddo.inc.i32
+; CHECK:       incl %edi
+; CHECK-NEXT:  seto %al
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 1)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @saddo.inc.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: saddo.inc.i64
+; CHECK:       incq %rdi
+; CHECK-NEXT:  seto %al
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -84,17 +102,18 @@
   ret i1 %obit
 }
 
+; SADDO reg, imm | imm, reg
 ; FIXME: DAG doesn't optimize immediates on the LHS.
-define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm2
-; DAG:          mov
-; DAG-NEXT:     addq
-; DAG-NEXT:     seto
-; FAST-LABEL:   saddo.i64imm2
-; FAST:         addq $1, %rdi
-; FAST-NEXT:    seto %al
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 1, i64 %v1)
+; SDAG-LABEL: saddo.i64imm1
+; SDAG:       mov
+; SDAG-NEXT:  addq
+; SDAG-NEXT:  seto
+; FAST-LABEL: saddo.i64imm1
+; FAST:       addq $2, %rdi
+; FAST-NEXT:  seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 2, i64 %v1)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
   store i64 %val, i64* %res
@@ -102,15 +121,25 @@
 }
 
 ; Check boundary conditions for large immediates.
+define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: saddo.i64imm2
+; CHECK:       addq $-2147483648, %rdi
+; CHECK-NEXT:  seto %al
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
 define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm3
-; DAG:          addq $-2147483648, %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   saddo.i64imm3
-; FAST:         addq $-2147483648, %rdi
-; FAST-NEXT:    seto %al
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
+; CHECK-LABEL: saddo.i64imm3
+; CHECK:       movabsq $-21474836489, %[[REG:[a-z]+]]
+; CHECK-NEXT:  addq %rdi, %[[REG]]
+; CHECK-NEXT:  seto
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
   store i64 %val, i64* %res
@@ -119,15 +148,10 @@
 
 define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm4
-; DAG:          movabsq $-21474836489, %[[REG:[a-z]+]]
-; DAG-NEXT:     addq %rdi, %[[REG]]
-; DAG-NEXT:     seto
-; FAST-LABEL:   saddo.i64imm4
-; FAST:         movabsq $-21474836489, %[[REG:[a-z]+]]
-; FAST-NEXT:    addq %rdi, %[[REG]]
-; FAST-NEXT:    seto
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
+; CHECK-LABEL: saddo.i64imm4
+; CHECK:       addq $2147483647, %rdi
+; CHECK-NEXT:  seto
+  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
   store i64 %val, i64* %res
@@ -136,30 +160,10 @@
 
 define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
 entry:
-; DAG-LABEL:    saddo.i64imm5
-; DAG:          addq $2147483647, %rdi
-; DAG-NEXT:     seto
-; FAST-LABEL:   saddo.i64imm5
-; FAST:         addq $2147483647, %rdi
-; FAST-NEXT:    seto
-  %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, i64* %res
-  ret i1 %obit
-}
-
-; TODO: FastISel shouldn't use movabsq.
-define zeroext i1 @saddo.i64imm6(i64 %v1, i64* %res) {
-entry:
-; DAG-LABEL:    saddo.i64imm6
-; DAG:          movl $2147483648, %ecx
-; DAG:          addq %rdi, %rcx
-; DAG-NEXT:     seto
-; FAST-LABEL:   saddo.i64imm6
-; FAST:         movabsq $2147483648, %[[REG:[a-z]+]]
-; FAST:         addq %rdi, %[[REG]]
-; FAST-NEXT:     seto
+; CHECK-LABEL: saddo.i64imm5
+; CHECK:       movl $2147483648
+; CHECK:       addq %rdi
+; CHECK-NEXT:  seto
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -170,12 +174,9 @@
 ; UADDO
 define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    uaddo.i32
-; DAG:          addl %esi, %edi
-; DAG-NEXT:     setb %al
-; FAST-LABEL:   uaddo.i32
-; FAST:         addl %esi, %edi
-; FAST-NEXT:    setb %al
+; CHECK-LABEL: uaddo.i32
+; CHECK:       addl %esi, %edi
+; CHECK-NEXT:  setb %al
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -185,12 +186,9 @@
 
 define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    uaddo.i64
-; DAG:          addq %rsi, %rdi
-; DAG-NEXT:     setb %al
-; FAST-LABEL:   uaddo.i64
-; FAST:         addq %rsi, %rdi
-; FAST-NEXT:    setb %al
+; CHECK-LABEL: uaddo.i64
+; CHECK:       addq %rsi, %rdi
+; CHECK-NEXT:  setb %al
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -198,15 +196,57 @@
   ret i1 %obit
 }
 
+; UADDO reg, 1 | NOT INC
+define zeroext i1 @uaddo.inc.i8(i8 %v1, i8* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i8
+; CHECK-NOT:   incb %dil
+  %t = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %v1, i8 1)
+  %val = extractvalue {i8, i1} %t, 0
+  %obit = extractvalue {i8, i1} %t, 1
+  store i8 %val, i8* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i16(i16 %v1, i16* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i16
+; CHECK-NOT:   incw %di
+  %t = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %v1, i16 1)
+  %val = extractvalue {i16, i1} %t, 0
+  %obit = extractvalue {i16, i1} %t, 1
+  store i16 %val, i16* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i32
+; CHECK-NOT:   incl %edi
+  %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i64
+; CHECK-NOT:   incq %rdi
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
 ; SSUBO
 define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    ssubo.i32
-; DAG:          subl %esi, %edi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   ssubo.i32
-; FAST:         subl %esi, %edi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: ssubo.i32
+; CHECK:       subl %esi, %edi
+; CHECK-NEXT:  seto %al
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -216,12 +256,9 @@
 
 define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    ssubo.i64
-; DAG:          subq %rsi, %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   ssubo.i64
-; FAST:         subq %rsi, %rdi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: ssubo.i64
+; CHECK:       subq %rsi, %rdi
+; CHECK-NEXT:  seto %al
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -232,12 +269,9 @@
 ; USUBO
 define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    usubo.i32
-; DAG:          subl %esi, %edi
-; DAG-NEXT:     setb %al
-; FAST-LABEL:   usubo.i32
-; FAST:         subl %esi, %edi
-; FAST-NEXT:    setb %al
+; CHECK-LABEL: usubo.i32
+; CHECK:       subl %esi, %edi
+; CHECK-NEXT:  setb %al
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -247,12 +281,9 @@
 
 define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    usubo.i64
-; DAG:          subq %rsi, %rdi
-; DAG-NEXT:     setb %al
-; FAST-LABEL:   usubo.i64
-; FAST:         subq %rsi, %rdi
-; FAST-NEXT:    setb %al
+; CHECK-LABEL: usubo.i64
+; CHECK:       subq %rsi, %rdi
+; CHECK-NEXT:  setb %al
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -263,10 +294,10 @@
 ; SMULO
 define zeroext i1 @smulo.i8(i8 %v1, i8 %v2, i8* %res) {
 entry:
-; FAST-LABEL:   smulo.i8
-; FAST:         movb %dil, %al
-; FAST-NEXT:    imulb %sil
-; FAST-NEXT:    seto %cl
+; CHECK-LABEL:   smulo.i8
+; CHECK:         movb %dil, %al
+; CHECK-NEXT:    imulb %sil
+; CHECK-NEXT:    seto %cl
   %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -276,12 +307,9 @@
 
 define zeroext i1 @smulo.i16(i16 %v1, i16 %v2, i16* %res) {
 entry:
-; DAG-LABEL:    smulo.i16
-; DAG:          imulw %si, %di
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   smulo.i16
-; FAST:         imulw %si, %di
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: smulo.i16
+; CHECK:       imulw %si, %di
+; CHECK-NEXT:  seto %al
   %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -291,12 +319,9 @@
 
 define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    smulo.i32
-; DAG:          imull %esi, %edi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   smulo.i32
-; FAST:         imull %esi, %edi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: smulo.i32
+; CHECK:       imull %esi, %edi
+; CHECK-NEXT:  seto %al
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -306,12 +331,9 @@
 
 define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    smulo.i64
-; DAG:          imulq %rsi, %rdi
-; DAG-NEXT:     seto %al
-; FAST-LABEL:   smulo.i64
-; FAST:         imulq %rsi, %rdi
-; FAST-NEXT:    seto %al
+; CHECK-LABEL: smulo.i64
+; CHECK:       imulq %rsi, %rdi
+; CHECK-NEXT:  seto %al
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -322,10 +344,10 @@
 ; UMULO
 define zeroext i1 @umulo.i8(i8 %v1, i8 %v2, i8* %res) {
 entry:
-; FAST-LABEL:   umulo.i8
-; FAST:         movb %dil, %al
-; FAST-NEXT:    mulb %sil
-; FAST-NEXT:    seto %cl
+; CHECK-LABEL:   umulo.i8
+; CHECK:         movb %dil, %al
+; CHECK-NEXT:    mulb %sil
+; CHECK-NEXT:    seto %cl
   %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -335,12 +357,9 @@
 
 define zeroext i1 @umulo.i16(i16 %v1, i16 %v2, i16* %res) {
 entry:
-; DAG-LABEL:    umulo.i16
-; DAG:          mulw %si
-; DAG-NEXT:     seto
-; FAST-LABEL:   umulo.i16
-; FAST:         mulw %si
-; FAST-NEXT:    seto
+; CHECK-LABEL: umulo.i16
+; CHECK:       mulw %si
+; CHECK-NEXT:  seto
   %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -350,12 +369,9 @@
 
 define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
 entry:
-; DAG-LABEL:    umulo.i32
-; DAG:          mull %esi
-; DAG-NEXT:     seto
-; FAST-LABEL:   umulo.i32
-; FAST:         mull %esi
-; FAST-NEXT:    seto
+; CHECK-LABEL: umulo.i32
+; CHECK:       mull %esi
+; CHECK-NEXT:  seto
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -365,12 +381,9 @@
 
 define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
 entry:
-; DAG-LABEL:    umulo.i64
-; DAG:          mulq %rsi
-; DAG-NEXT:     seto
-; FAST-LABEL:   umulo.i64
-; FAST:         mulq %rsi
-; FAST-NEXT:    seto
+; CHECK-LABEL: umulo.i64
+; CHECK:       mulq %rsi
+; CHECK-NEXT:  seto
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -383,9 +396,9 @@
 ;
 define i32 @saddo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    saddo.select.i32
-; CHECK:          addl   %esi, %eax
-; CHECK-NEXT:     cmovol %edi, %esi
+; CHECK-LABEL: saddo.select.i32
+; CHECK:       addl   %esi, %eax
+; CHECK-NEXT:  cmovol %edi, %esi
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -394,9 +407,9 @@
 
 define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    saddo.select.i64
-; CHECK:          addq   %rsi, %rax
-; CHECK-NEXT:     cmovoq %rdi, %rsi
+; CHECK-LABEL: saddo.select.i64
+; CHECK:       addq   %rsi, %rax
+; CHECK-NEXT:  cmovoq %rdi, %rsi
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -405,9 +418,9 @@
 
 define i32 @uaddo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    uaddo.select.i32
-; CHECK:          addl   %esi, %eax
-; CHECK-NEXT:     cmovbl %edi, %esi
+; CHECK-LABEL: uaddo.select.i32
+; CHECK:       addl   %esi, %eax
+; CHECK-NEXT:  cmovbl %edi, %esi
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -416,9 +429,9 @@
 
 define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    uaddo.select.i64
-; CHECK:          addq   %rsi, %rax
-; CHECK-NEXT:     cmovbq %rdi, %rsi
+; CHECK-LABEL: uaddo.select.i64
+; CHECK:       addq   %rsi, %rax
+; CHECK-NEXT:  cmovbq %rdi, %rsi
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -427,9 +440,9 @@
 
 define i32 @ssubo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    ssubo.select.i32
-; CHECK:          cmpl   %esi, %edi
-; CHECK-NEXT:     cmovol %edi, %esi
+; CHECK-LABEL: ssubo.select.i32
+; CHECK:       cmpl   %esi, %edi
+; CHECK-NEXT:  cmovol %edi, %esi
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -438,9 +451,9 @@
 
 define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    ssubo.select.i64
-; CHECK:          cmpq   %rsi, %rdi
-; CHECK-NEXT:     cmovoq %rdi, %rsi
+; CHECK-LABEL: ssubo.select.i64
+; CHECK:       cmpq   %rsi, %rdi
+; CHECK-NEXT:  cmovoq %rdi, %rsi
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -449,9 +462,9 @@
 
 define i32 @usubo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    usubo.select.i32
-; CHECK:          cmpl   %esi, %edi
-; CHECK-NEXT:     cmovbl %edi, %esi
+; CHECK-LABEL: usubo.select.i32
+; CHECK:       cmpl   %esi, %edi
+; CHECK-NEXT:  cmovbl %edi, %esi
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -460,9 +473,9 @@
 
 define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    usubo.select.i64
-; CHECK:          cmpq   %rsi, %rdi
-; CHECK-NEXT:     cmovbq %rdi, %rsi
+; CHECK-LABEL: usubo.select.i64
+; CHECK:       cmpq   %rsi, %rdi
+; CHECK-NEXT:  cmovbq %rdi, %rsi
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -471,9 +484,9 @@
 
 define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    smulo.select.i32
-; CHECK:          imull  %esi, %eax
-; CHECK-NEXT:     cmovol %edi, %esi
+; CHECK-LABEL: smulo.select.i32
+; CHECK:       imull  %esi, %eax
+; CHECK-NEXT:  cmovol %edi, %esi
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -482,9 +495,9 @@
 
 define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    smulo.select.i64
-; CHECK:          imulq  %rsi, %rax
-; CHECK-NEXT:     cmovoq %rdi, %rsi
+; CHECK-LABEL: smulo.select.i64
+; CHECK:       imulq  %rsi, %rax
+; CHECK-NEXT:  cmovoq %rdi, %rsi
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -493,9 +506,9 @@
 
 define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    umulo.select.i32
-; CHECK:          mull   %esi
-; CHECK-NEXT:     cmovol %edi, %esi
+; CHECK-LABEL: umulo.select.i32
+; CHECK:       mull   %esi
+; CHECK-NEXT:  cmovol %edi, %esi
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
@@ -504,9 +517,9 @@
 
 define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    umulo.select.i64
-; CHECK:          mulq   %rsi
-; CHECK-NEXT:     cmovoq %rdi, %rsi
+; CHECK-LABEL: umulo.select.i64
+; CHECK:       mulq   %rsi
+; CHECK-NEXT:  cmovoq %rdi, %rsi
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
@@ -519,9 +532,9 @@
 ;
 define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    saddo.br.i32
-; CHECK:          addl   %esi, %edi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: saddo.br.i32
+; CHECK:       addl   %esi, %edi
+; CHECK-NEXT:  jo
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -536,9 +549,9 @@
 
 define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    saddo.br.i64
-; CHECK:          addq   %rsi, %rdi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: saddo.br.i64
+; CHECK:       addq   %rsi, %rdi
+; CHECK-NEXT:  jo
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -553,9 +566,9 @@
 
 define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    uaddo.br.i32
-; CHECK:          addl   %esi, %edi
-; CHECK-NEXT:     jb
+; CHECK-LABEL: uaddo.br.i32
+; CHECK:       addl   %esi, %edi
+; CHECK-NEXT:  jb
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -570,9 +583,9 @@
 
 define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    uaddo.br.i64
-; CHECK:          addq   %rsi, %rdi
-; CHECK-NEXT:     jb
+; CHECK-LABEL: uaddo.br.i64
+; CHECK:       addq   %rsi, %rdi
+; CHECK-NEXT:  jb
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -587,9 +600,9 @@
 
 define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    ssubo.br.i32
-; CHECK:          cmpl   %esi, %edi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: ssubo.br.i32
+; CHECK:       cmpl   %esi, %edi
+; CHECK-NEXT:  jo
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -604,9 +617,9 @@
 
 define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    ssubo.br.i64
-; CHECK:          cmpq   %rsi, %rdi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: ssubo.br.i64
+; CHECK:       cmpq   %rsi, %rdi
+; CHECK-NEXT:  jo
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -621,9 +634,9 @@
 
 define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    usubo.br.i32
-; CHECK:          cmpl   %esi, %edi
-; CHECK-NEXT:     jb
+; CHECK-LABEL: usubo.br.i32
+; CHECK:       cmpl   %esi, %edi
+; CHECK-NEXT:  jb
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -638,9 +651,9 @@
 
 define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    usubo.br.i64
-; CHECK:          cmpq   %rsi, %rdi
-; CHECK-NEXT:     jb
+; CHECK-LABEL: usubo.br.i64
+; CHECK:       cmpq   %rsi, %rdi
+; CHECK-NEXT:  jb
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -655,9 +668,9 @@
 
 define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    smulo.br.i32
-; CHECK:          imull  %esi, %edi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: smulo.br.i32
+; CHECK:       imull  %esi, %edi
+; CHECK-NEXT:  jo
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -672,9 +685,9 @@
 
 define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    smulo.br.i64
-; CHECK:          imulq  %rsi, %rdi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: smulo.br.i64
+; CHECK:       imulq  %rsi, %rdi
+; CHECK-NEXT:  jo
   %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -689,9 +702,9 @@
 
 define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
 entry:
-; CHECK-LABEL:    umulo.br.i32
-; CHECK:          mull  %esi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: umulo.br.i32
+; CHECK:       mull  %esi
+; CHECK-NEXT:  jo
   %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -706,9 +719,9 @@
 
 define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 entry:
-; CHECK-LABEL:    umulo.br.i64
-; CHECK:          mulq  %rsi
-; CHECK-NEXT:     jo
+; CHECK-LABEL: umulo.br.i64
+; CHECK:       mulq  %rsi
+; CHECK-NEXT:  jo
   %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -725,6 +738,8 @@
 declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
 declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i8,  i1} @llvm.uadd.with.overflow.i8 (i8,  i8 ) nounwind readnone
+declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) nounwind readnone
 declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
 declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
 declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone

diff --git a/test/CodeGen/XCore/atomic.ll b/test/CodeGen/XCore/atomic.ll
index 58ef38b..6ca80cf 100644
--- a/test/CodeGen/XCore/atomic.ll
+++ b/test/CodeGen/XCore/atomic.ll

@@ -22,11 +22,10 @@
 ; CHECK-LABEL: atomicloadstore
 
 ; CHECK: ldw r[[R0:[0-9]+]], dp[pool]
-; CHECK-NEXT: #MEMBARRIER
-  %0 = load atomic i32* bitcast (i64* @pool to i32*) acquire, align 4
-
 ; CHECK-NEXT: ldaw r[[R1:[0-9]+]], dp[pool]
+; CHECK-NEXT: #MEMBARRIER
 ; CHECK-NEXT: ldc r[[R2:[0-9]+]], 0
+  %0 = load atomic i32* bitcast (i64* @pool to i32*) acquire, align 4
 
 ; CHECK-NEXT: ld16s r3, r[[R1]][r[[R2]]]
 ; CHECK-NEXT: #MEMBARRIER

diff --git a/test/CodeGen/XCore/dwarf_debug.ll b/test/CodeGen/XCore/dwarf_debug.ll
index 2f4b231..47db82d 100644
--- a/test/CodeGen/XCore/dwarf_debug.ll
+++ b/test/CodeGen/XCore/dwarf_debug.ll

@@ -13,27 +13,27 @@
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !11), !dbg !12
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !11, metadata !{metadata !"0x102"}), !dbg !12
   %0 = load i32* %a.addr, align 4, !dbg !12
   %add = add nsw i32 %0, 1, !dbg !12
   ret i32 %add, !dbg !12
 }
 
-declare void @llvm.dbg.declare(metadata, metadata)
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !10}
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1}
+!0 = metadata !{metadata !"0x11\0012\00\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @f, null, null, metadata !2, i32 2}
-!5 = metadata !{i32 786473, metadata !1}
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null}
+!4 = metadata !{metadata !"0x2e\00f\00f\00\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ]
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!11 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777218, metadata !8, i32 0, i32 0}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!11 = metadata !{metadata !"0x101\00a\0016777218\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ]
 !12 = metadata !{i32 2, i32 0, metadata !4, null}
 

diff --git a/test/CodeGen/XCore/exception.ll b/test/CodeGen/XCore/exception.ll
index 3179fcd..fec83eb 100644
--- a/test/CodeGen/XCore/exception.ll
+++ b/test/CodeGen/XCore/exception.ll

@@ -107,17 +107,12 @@
 ; CHECK: .asciiz
 ; CHECK: .byte  3
 ; CHECK: .byte  26
-; CHECK: [[SET0:.L[a-zA-Z0-9_]+]] = [[PRE_G]]-[[START]]
-; CHECK: .long [[SET0]]
-; CHECK: [[SET1:.L[a-zA-Z0-9_]+]] = [[POST_G]]-[[PRE_G]]
-; CHECK: .long [[SET1]]
-; CHECK: [[SET2:.L[a-zA-Z0-9_]+]] = [[LANDING]]-[[START]]
-; CHECK: .long [[SET2]]
+; CHECK: .long [[PRE_G]]-[[START]]
+; CHECK: .long [[POST_G]]-[[PRE_G]]
+; CHECK: .long [[LANDING]]-[[START]]
 ; CHECK: .byte 3
-; CHECK: [[SET3:.L[a-zA-Z0-9_]+]] = [[POST_G]]-[[START]]
-; CHECK: .long [[SET3]]
-; CHECK: [[SET4:.L[a-zA-Z0-9_]+]] = [[END]]-[[POST_G]]
-; CHECK: .long [[SET4]]
+; CHECK: .long [[POST_G]]-[[START]]
+; CHECK: .long [[END]]-[[POST_G]]
 ; CHECK: .long 0
 ; CHECK: .byte 0
 ; CHECK: .byte 1

diff --git a/test/DebugInfo/2009-11-03-InsertExtractValue.ll b/test/DebugInfo/2009-11-03-InsertExtractValue.ll
index 21a60b8..838ba05 100644
--- a/test/DebugInfo/2009-11-03-InsertExtractValue.ll
+++ b/test/DebugInfo/2009-11-03-InsertExtractValue.ll

@@ -4,12 +4,12 @@
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!6}
 
-!0 = metadata !{i32 786478, metadata !4, metadata !1, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 3, metadata !2, i1 false, i1 false, i32 0, i32 0, null, i32 258, i1 false, null, null, i32 0, metadata !1, i32 3} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 41, metadata !4} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 21, metadata !4, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !3, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!0 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\003\000\000\000\006\00258\000\003", metadata !4, metadata !1, metadata !2, null, null, null, i32 0, metadata !1} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !4} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !4, metadata !1, null, metadata !3, null} ; [ DW_TAG_subroutine_type ]
 !3 = metadata !{null}
 !4 = metadata !{metadata !"/foo", metadata !"bar.cpp"}
-!5 = metadata !{i32 458769, metadata !4, i32 12, metadata !"", i1 true, metadata !"", i32 0, metadata !3, metadata !3, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
+!5 = metadata !{metadata !"0x11\0012\00\001\00\000\00\000", metadata !4, metadata !3, metadata !3, null, null, null}; [DW_TAG_compile_unit ]
 
 define <{i32, i32}> @f1() {
 ; CHECK: !dbgx ![[NUMBER:[0-9]+]]
@@ -20,4 +20,4 @@
 }
 
 ; CHECK: [protected]
-!6 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!6 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
index 65907d6..9c714d7 100644
--- a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
+++ b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll

@@ -10,17 +10,17 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{i32 720913, metadata !17, i32 12, metadata !"clang version 3.0 (trunk 139632)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !12, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 139632)\001\00\000\00\000", metadata !17, metadata !1, metadata !1, metadata !3, metadata !12, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !17, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
-!6 = metadata !{i32 720937, metadata !17} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\000\001\000", metadata !17, metadata !6, metadata !7, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
+!6 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !12 = metadata !{metadata !14}
-!14 = metadata !{i32 720948, i32 0, metadata !5, metadata !"bar", metadata !"bar", metadata !"", metadata !6, i32 2, metadata !9, i32 1, i32 1, null, null} ; [ DW_TAG_variable ]
+!14 = metadata !{metadata !"0x34\00bar\00bar\00\002\001\001", metadata !5, metadata !6, metadata !9, null, null} ; [ DW_TAG_variable ]
 !15 = metadata !{i32 3, i32 3, metadata !16, null}
-!16 = metadata !{i32 720907, metadata !17, metadata !5, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0xb\001\0011\000", metadata !17, metadata !5} ; [ DW_TAG_lexical_block ]
 !17 = metadata !{metadata !"fb.c", metadata !"/private/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll b/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll
index 9beab20..4524b27 100644
--- a/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll
+++ b/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll

@@ -4,11 +4,11 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9}
 
-!0 = metadata !{i32 720913, metadata !8, i32 12, metadata !"clang version 3.0 (trunk 139632)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 139632)\001\00\000\00\000", metadata !8, metadata !2, metadata !2, metadata !2, metadata !3, null} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720948, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 2, metadata !7, i32 0, i32 1, i32* @0, null} ; [ DW_TAG_variable ]
-!6 = metadata !{i32 720937, metadata !8} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x34\00a\00a\00\002\000\001", null, metadata !6, metadata !7, i32* @0, null} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !"g.c", metadata !"/private/tmp"}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2009-11-10-CurrentFn.ll b/test/DebugInfo/2009-11-10-CurrentFn.ll
index 151d631..76b1eda 100644
--- a/test/DebugInfo/2009-11-10-CurrentFn.ll
+++ b/test/DebugInfo/2009-11-10-CurrentFn.ll

@@ -8,24 +8,24 @@
 
 declare void @foo(...)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{i32 720913, metadata !17, i32 12, metadata !"clang version 3.0 (trunk 139632)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 139632)\001\00\000\00\000", metadata !17, metadata !1, metadata !1, metadata !3, metadata !1, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !17, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32)* @bar, null, null, metadata !9, metadata !""} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [bar]
-!6 = metadata !{i32 720937, metadata !17} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00bar\00bar\00\003\000\001\000\006\00256\001\000", metadata !17, metadata !6, metadata !7, null, void (i32)* @bar, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [bar]
+!6 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{metadata !11}
-!11 = metadata !{i32 721153, metadata !17, metadata !5, metadata !"i", i32 16777219, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0x101\00i\0016777219\000", metadata !17, metadata !5, metadata !12} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !13 = metadata !{i32 3, i32 14, metadata !5, null}
 !14 = metadata !{i32 4, i32 3, metadata !15, null}
-!15 = metadata !{i32 720907, metadata !17, metadata !5, i32 3, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0xb\003\0017\000", metadata !17, metadata !5} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{i32 5, i32 1, metadata !15, null}
 !17 = metadata !{metadata !"cf.c", metadata !"/private/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-01-05-DbgScope.ll b/test/DebugInfo/2010-01-05-DbgScope.ll
index 809cebf..e85a9ec 100644
--- a/test/DebugInfo/2010-01-05-DbgScope.ll
+++ b/test/DebugInfo/2010-01-05-DbgScope.ll

@@ -12,14 +12,14 @@
 !llvm.module.flags = !{!14}
 
 !0 = metadata !{i32 571, i32 3, metadata !1, null}
-!1 = metadata !{i32 458763, metadata !11, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"foo", metadata !"foo", metadata !"foo", i32 561, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0}; [DW_TAG_subprogram ]
-!3 = metadata !{i32 458769, metadata !11, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, metadata !12, metadata !12, metadata !13, null, null, metadata !""}; [DW_TAG_compile_unit ]
-!4 = metadata !{i32 458773, null, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!1 = metadata !{metadata !"0xb\001\001\000", metadata !11, metadata !2}; [DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00foo\00foo\00foo\00561\000\001\000\006\000\000\000", i32 0, metadata !3, metadata !4, null, null, null, null, null}; [DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x11\0012\00clang 1.1\001\00\000\00\000", metadata !11, metadata !12, metadata !12, metadata !13, null, null}; [DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, metadata !3, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 458788, null, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !3} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 588, i32 1, metadata !2, null}
 !11 = metadata !{metadata !"hashtab.c", metadata !"/usr/src/gnu/usr.bin/cc/cc_tools/../../../../contrib/gcclibs/libiberty"}
 !12 = metadata !{i32 0}
 !13 = metadata !{metadata !2}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-03-12-llc-crash.ll b/test/DebugInfo/2010-03-12-llc-crash.ll
index 241bb37..0075f4e 100644
--- a/test/DebugInfo/2010-03-12-llc-crash.ll
+++ b/test/DebugInfo/2010-03-12-llc-crash.ll

@@ -1,22 +1,22 @@
 ; RUN: llc -O0 < %s -o /dev/null
 ; llc should not crash on this invalid input.
 ; PR6588
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define void @foo() {
 entry:
-  call void @llvm.dbg.declare(metadata !{i32* undef}, metadata !0)
+  call void @llvm.dbg.declare(metadata !{i32* undef}, metadata !0, metadata !{metadata !"0x102"})
   ret void
 }
 
-!0 = metadata !{i32 524545, metadata !1, metadata !"sy", metadata !2, i32 890, metadata !7} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 524334, metadata !8, metadata !3, metadata !"foo", metadata !"foo", metadata !"foo", i32 892, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 524329, metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 524305, metadata !9, i32 4, metadata !"clang 1.1", i1 true, metadata !"", i32 0, metadata !10, metadata !10, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !9, metadata !5, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{i32 524329, metadata !9} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x101\00sy\00890\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\00892\000\001\000\006\000\000\000", metadata !8, metadata !3, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\004\00clang 1.1\001\00\000\00\000", metadata !9, metadata !10, metadata !10, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !9, metadata !5, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x29", metadata !9} ; [ DW_TAG_file_type ]
 !6 = metadata !{null}
-!7 = metadata !{i32 524324, metadata !9, metadata !5, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !9, metadata !5} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !"qpainter.h", metadata !"QtGui"}
 !9 = metadata !{metadata !"splineeditor.cpp", metadata !"src"}
 !10 = metadata !{i32 0}

diff --git a/test/DebugInfo/2010-03-19-DbgDeclare.ll b/test/DebugInfo/2010-03-19-DbgDeclare.ll
index 94aa259..32021c5 100644
--- a/test/DebugInfo/2010-03-19-DbgDeclare.ll
+++ b/test/DebugInfo/2010-03-19-DbgDeclare.ll

@@ -1,19 +1,19 @@
 ; RUN: opt < %s -verify -S | FileCheck %s
 
-; CHECK: lang 0x8001
+; CHECK: [DW_LANG_Mips_Assembler]
 
 define void @Foo(i32 %a, i32 %b) {
 entry:
-  call void @llvm.dbg.declare(metadata !{i32* null}, metadata !1)
+  call void @llvm.dbg.declare(metadata !{i32* null}, metadata !1, metadata !{metadata !"0x102"})
   ret void
 }
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!5}
-!2 = metadata !{i32 786449, metadata !4, i32 32769, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !3, metadata !3, metadata !3, metadata !3,  metadata !3, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [lang 0x8001]
+!2 = metadata !{metadata !"0x11\0032769\00clang version 3.3 \000\00\000\00\001", metadata !4, metadata !3, metadata !3, metadata !3, metadata !3,  metadata !3} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [lang 0x8001]
 !3 = metadata !{}
 !0 = metadata !{i32 662302, i32 26, metadata !1, null}
 !1 = metadata !{i32 4, metadata !"foo"}
 !4 = metadata !{metadata !"scratch.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
+!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-03-24-MemberFn.ll b/test/DebugInfo/2010-03-24-MemberFn.ll
index 4ea9d2c..71f4acb 100644
--- a/test/DebugInfo/2010-03-24-MemberFn.ll
+++ b/test/DebugInfo/2010-03-24-MemberFn.ll

@@ -8,7 +8,7 @@
   %0 = alloca i32                                 ; <i32*> [#uses=2]
   %s1 = alloca %struct.S                          ; <%struct.S*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.S* %s1}, metadata !0), !dbg !16
+  call void @llvm.dbg.declare(metadata !{%struct.S* %s1}, metadata !0, metadata !{metadata !"0x102"}), !dbg !16
   %1 = call i32 @_ZN1S3fooEv(%struct.S* %s1) nounwind, !dbg !17 ; <i32> [#uses=1]
   store i32 %1, i32* %0, align 4, !dbg !17
   %2 = load i32* %0, align 4, !dbg !17            ; <i32> [#uses=1]
@@ -25,7 +25,7 @@
   %this_addr = alloca %struct.S*                  ; <%struct.S**> [#uses=1]
   %retval = alloca i32                            ; <i32*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.S** %this_addr}, metadata !18), !dbg !21
+  call void @llvm.dbg.declare(metadata !{%struct.S** %this_addr}, metadata !18, metadata !{metadata !"0x102"}), !dbg !21
   store %struct.S* %this, %struct.S** %this_addr
   br label %return, !dbg !21
 
@@ -34,37 +34,37 @@
   ret i32 %retval1, !dbg !22
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!28}
 
-!0 = metadata !{i32 786688, metadata !1, metadata !"s1", metadata !4, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{i32 786443, metadata !25, metadata !2, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{i32 786443, metadata !25, metadata !3, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{i32 786478, metadata !25, metadata !4, metadata !"bar", metadata !"bar", metadata !"_Z3barv", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @_Z3barv, null, null, null, i32 3} ; [ DW_TAG_subprogram ]
-!4 = metadata !{i32 786473, metadata !25} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786449, metadata !25, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !27, metadata !27, metadata !24, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 786453, metadata !25, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x100\00s1\003\000", metadata !1, metadata !4, metadata !9} ; [ DW_TAG_auto_variable ]
+!1 = metadata !{metadata !"0xb\003\000\000", metadata !25, metadata !2} ; [ DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0xb\003\000\000", metadata !25, metadata !3} ; [ DW_TAG_lexical_block ]
+!3 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barv\003\000\001\000\006\000\000\003", metadata !25, metadata !4, metadata !6, null, i32 ()* @_Z3barv, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = metadata !{metadata !"0x29", metadata !25} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !25, metadata !27, metadata !27, metadata !24, null,  null} ; [ DW_TAG_compile_unit ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !25, metadata !4, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, metadata !25, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786451, metadata !26, metadata !4, metadata !"S", i32 2, i64 8, i64 8, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 2, size 8, align 8, offset 0] [def] [from ]
-!10 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !25, metadata !4} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x13\00S\002\008\008\000\000\000", metadata !26, metadata !4, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 2, size 8, align 8, offset 0] [def] [from ]
+!10 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 786478, metadata !26, metadata !9, metadata !"foo", metadata !"foo", metadata !"_ZN1S3fooEv", i32 3, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 (%struct.S*)* @_ZN1S3fooEv, null, null, null, i32 3} ; [ DW_TAG_subprogram ]
-!13 = metadata !{i32 786453, metadata !25, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1S3fooEv\003\000\001\000\006\000\000\003", metadata !26, metadata !9, metadata !13, null, i32 (%struct.S*)* @_ZN1S3fooEv, null, null, null} ; [ DW_TAG_subprogram ]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !25, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{metadata !8, metadata !15}
-!15 = metadata !{i32 786447, metadata !25, metadata !4, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !9} ; [ DW_TAG_pointer_type ]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !25, metadata !4, metadata !9} ; [ DW_TAG_pointer_type ]
 !16 = metadata !{i32 3, i32 0, metadata !1, null}
 !17 = metadata !{i32 3, i32 0, metadata !3, null}
-!18 = metadata !{i32 786689, metadata !12, metadata !"this", metadata !10, i32 3, metadata !19, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786470, metadata !25, metadata !4, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !20} ; [ DW_TAG_const_type ]
-!20 = metadata !{i32 786447, metadata !25, metadata !4, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ]
+!18 = metadata !{metadata !"0x101\00this\003\000", metadata !12, metadata !10, metadata !19} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !25, metadata !4, metadata !20} ; [ DW_TAG_const_type ]
+!20 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !25, metadata !4, metadata !9} ; [ DW_TAG_pointer_type ]
 !21 = metadata !{i32 3, i32 0, metadata !12, null}
 !22 = metadata !{i32 3, i32 0, metadata !23, null}
-!23 = metadata !{i32 786443, metadata !26, metadata !12, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!23 = metadata !{metadata !"0xb\003\000\000", metadata !26, metadata !12} ; [ DW_TAG_lexical_block ]
 !24 = metadata !{metadata !3, metadata !12}
 !25 = metadata !{metadata !"one.cc", metadata !"/tmp/"}
 !26 = metadata !{metadata !"one.h", metadata !"/tmp/"}
 !27 = metadata !{i32 0}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll b/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll
index 81285a9..1f90a34 100644
--- a/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll
+++ b/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll

@@ -2,35 +2,35 @@
 
 define void @baz(i32 %i) nounwind ssp {
 entry:
-  call void @llvm.dbg.declare(metadata !0, metadata !1), !dbg !0
+  call void @llvm.dbg.declare(metadata !0, metadata !1, metadata !{metadata !"0x102"}), !dbg !0
   ret void, !dbg !0
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!22}
 
 !0 = metadata !{{ [0 x i8] }** undef}
-!1 = metadata !{i32 524544, metadata !2, metadata !"x", metadata !4, i32 11, metadata !9} ; [ DW_TAG_auto_variable ]
-!2 = metadata !{i32 524299, metadata !20, metadata !3, i32 8, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{i32 524334, metadata !20, null, metadata !"baz", metadata !"baz", metadata !"baz", i32 8, metadata !6, i1 true, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!4 = metadata !{i32 524329, metadata !20} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 524305, metadata !20, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 524309, metadata !20, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!1 = metadata !{metadata !"0x100\00x\0011\000", metadata !2, metadata !4, metadata !9} ; [ DW_TAG_auto_variable ]
+!2 = metadata !{metadata !"0xb\008\000\000", metadata !20, metadata !3} ; [ DW_TAG_lexical_block ]
+!3 = metadata !{metadata !"0x2e\00baz\00baz\00baz\008\001\001\000\006\000\000\000", metadata !20, null, metadata !6, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !4, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
-!8 = metadata !{i32 524324, metadata !20, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 524303, metadata !20, metadata !4, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 524307, metadata !20, metadata !3, metadata !"", i32 11, i64 8, i64 8, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 11, size 8, align 8, offset 0] [def] [from ]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !20, metadata !4} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !20, metadata !4, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x13\00\0011\008\008\000\000\000", metadata !20, metadata !3, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [line 11, size 8, align 8, offset 0] [def] [from ]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 524301, metadata !20, metadata !10, metadata !"b", i32 11, i64 8, i64 8, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ]
-!13 = metadata !{i32 524310, metadata !20, metadata !3, metadata !"A", i32 11, i64 0, i64 0, i64 0, i32 0, metadata !14} ; [ DW_TAG_typedef ]
-!14 = metadata !{i32 524289, metadata !20, metadata !4, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !15, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
-!15 = metadata !{i32 524324, metadata !20, metadata !4, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!12 = metadata !{metadata !"0xd\00b\0011\008\008\000\000", metadata !20, metadata !10, metadata !13} ; [ DW_TAG_member ]
+!13 = metadata !{metadata !"0x16\00A\0011\000\000\000\000", metadata !20, metadata !3, metadata !14} ; [ DW_TAG_typedef ]
+!14 = metadata !{metadata !"0x1\00\000\008\008\000\000", metadata !20, metadata !4, metadata !15, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!15 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !20, metadata !4} ; [ DW_TAG_base_type ]
 !16 = metadata !{metadata !17}
-!17 = metadata !{i32 524321, i64 0, i64 1}        ; [ DW_TAG_subrange_type ]
+!17 = metadata !{metadata !"0x21\000\001"}        ; [ DW_TAG_subrange_type ]
 !18 = metadata !{metadata !"llvm.mdnode.fwdref.19"}
 !19 = metadata !{metadata !"llvm.mdnode.fwdref.23"}
 !20 = metadata !{metadata !"2007-12-VarArrayDebug.c", metadata !"/Users/sabre/llvm/test/FrontendC/"}
 !21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll b/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll
index 5f7cb69..b60e5c4 100644
--- a/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll
+++ b/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll

@@ -26,14 +26,14 @@
   %retval = alloca i32, align 4                   ; <i32*> [#uses=3]
   %b = alloca %class.A, align 1                   ; <%class.A*> [#uses=1]
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{%class.A* %b}, metadata !0), !dbg !14
+  call void @llvm.dbg.declare(metadata !{%class.A* %b}, metadata !0, metadata !{metadata !"0x102"}), !dbg !14
   %call = call i32 @_ZN1B2fnEv(%class.A* %b), !dbg !15 ; <i32> [#uses=1]
   store i32 %call, i32* %retval, !dbg !15
   %0 = load i32* %retval, !dbg !16                ; <i32> [#uses=1]
   ret i32 %0, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define linkonce_odr i32 @_ZN1B2fnEv(%class.A* %this) ssp align 2 {
 entry:
@@ -42,10 +42,10 @@
   %a = alloca %class.A, align 1                   ; <%class.A*> [#uses=1]
   %i = alloca i32, align 4                        ; <i32*> [#uses=2]
   store %class.A* %this, %class.A** %this.addr
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !17), !dbg !18
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
   %this1 = load %class.A** %this.addr             ; <%class.A*> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !19), !dbg !27
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !28), !dbg !29
+  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !19, metadata !{metadata !"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !28, metadata !{metadata !"0x102"}), !dbg !29
   %call = call i32 @_ZZN1B2fnEvEN1A3fooEv(%class.A* %a), !dbg !30 ; <i32> [#uses=1]
   store i32 %call, i32* %i, !dbg !30
   %tmp = load i32* %i, !dbg !31                   ; <i32> [#uses=1]
@@ -59,7 +59,7 @@
   %retval = alloca i32, align 4                   ; <i32*> [#uses=2]
   %this.addr = alloca %class.A*, align 8          ; <%class.A**> [#uses=2]
   store %class.A* %this, %class.A** %this.addr
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !33), !dbg !34
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !33, metadata !{metadata !"0x102"}), !dbg !34
   %this1 = load %class.A** %this.addr             ; <%class.A*> [#uses=0]
   store i32 42, i32* %retval, !dbg !35
   %0 = load i32* %retval, !dbg !35                ; <i32> [#uses=1]
@@ -70,43 +70,43 @@
 !llvm.module.flags = !{!40}
 !37 = metadata !{metadata !2, metadata !10, metadata !23}
 
-!0 = metadata !{i32 786688, metadata !1, metadata !"b", metadata !3, i32 16, metadata !8, i32 0, null} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{i32 786443, metadata !38, metadata !2, i32 15, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{i32 786478, metadata !38, metadata !3, metadata !"main", metadata !"main", metadata !"main", i32 15, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i32 ()* @main, null, null, null, i32 15} ; [ DW_TAG_subprogram ]
-!3 = metadata !{i32 786473, metadata !38} ; [ DW_TAG_file_type ]
-!4 = metadata !{i32 786449, metadata !38, i32 4, metadata !"clang 1.5", i1 false, metadata !"", i32 0, metadata !39, metadata !39, metadata !37, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 786453, metadata !38, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x100\00b\0016\000", metadata !1, metadata !3, metadata !8} ; [ DW_TAG_auto_variable ]
+!1 = metadata !{metadata !"0xb\0015\0012\000", metadata !38, metadata !2} ; [ DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00main\00main\00main\0015\000\001\000\006\000\000\0015", metadata !38, metadata !3, metadata !5, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x29", metadata !38} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !"0x11\004\00clang 1.5\000\00\000\00\000", metadata !38, metadata !39, metadata !39, metadata !37, null,  null} ; [ DW_TAG_compile_unit ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !38, metadata !3, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786468, metadata !38, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786434, metadata !38, metadata !3, metadata !"B", i32 2, i64 8, i64 8, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_class_type ] [B] [line 2, size 8, align 8, offset 0] [def] [from ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !38, metadata !3} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x2\00B\002\008\008\000\000\000", metadata !38, metadata !3, null, metadata !9, null, null, null} ; [ DW_TAG_class_type ] [B] [line 2, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786478, metadata !38, metadata !8, metadata !"fn", metadata !"fn", metadata !"_ZN1B2fnEv", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i32 (%class.A*)* @_ZN1B2fnEv, null, null, null, i32 4} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !38, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x2e\00fn\00fn\00_ZN1B2fnEv\004\000\001\000\006\000\000\004", metadata !38, metadata !8, metadata !11, null, i32 (%class.A*)* @_ZN1B2fnEv, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !38, metadata !3, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !7, metadata !13}
-!13 = metadata !{i32 786447, metadata !38, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !8} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !38, metadata !3, metadata !8} ; [ DW_TAG_pointer_type ]
 !14 = metadata !{i32 16, i32 5, metadata !1, null}
 !15 = metadata !{i32 17, i32 3, metadata !1, null}
 !16 = metadata !{i32 18, i32 1, metadata !2, null}
-!17 = metadata !{i32 786689, metadata !10, metadata !"this", metadata !3, i32 4, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x101\00this\004\000", metadata !10, metadata !3, metadata !13} ; [ DW_TAG_arg_variable ]
 !18 = metadata !{i32 4, i32 7, metadata !10, null}
-!19 = metadata !{i32 786688, metadata !20, metadata !"a", metadata !3, i32 9, metadata !21, i32 0, null} ; [ DW_TAG_auto_variable ]
-!20 = metadata !{i32 786443, metadata !38, metadata !10, i32 4, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
-!21 = metadata !{i32 786434, metadata !38, metadata !10, metadata !"A", i32 5, i64 8, i64 8, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 5, size 8, align 8, offset 0] [def] [from ]
+!19 = metadata !{metadata !"0x100\00a\009\000", metadata !20, metadata !3, metadata !21} ; [ DW_TAG_auto_variable ]
+!20 = metadata !{metadata !"0xb\004\0012\000", metadata !38, metadata !10} ; [ DW_TAG_lexical_block ]
+!21 = metadata !{metadata !"0x2\00A\005\008\008\000\000\000", metadata !38, metadata !10, null, metadata !22, null, null, null} ; [ DW_TAG_class_type ] [A] [line 5, size 8, align 8, offset 0] [def] [from ]
 !22 = metadata !{metadata !23}
-!23 = metadata !{i32 786478, metadata !38, metadata !21, metadata !"foo", metadata !"foo", metadata !"_ZZN1B2fnEvEN1A3fooEv", i32 7, metadata !24, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i32 (%class.A*)* @_ZZN1B2fnEvEN1A3fooEv, null, null, null, i32 7} ; [ DW_TAG_subprogram ]
-!24 = metadata !{i32 786453, metadata !38, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{metadata !"0x2e\00foo\00foo\00_ZZN1B2fnEvEN1A3fooEv\007\000\001\000\006\000\000\007", metadata !38, metadata !21, metadata !24, null, i32 (%class.A*)* @_ZZN1B2fnEvEN1A3fooEv, null, null, null} ; [ DW_TAG_subprogram ]
+!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !38, metadata !3, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{metadata !7, metadata !26}
-!26 = metadata !{i32 786447, metadata !38, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !21} ; [ DW_TAG_pointer_type ]
+!26 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !38, metadata !3, metadata !21} ; [ DW_TAG_pointer_type ]
 !27 = metadata !{i32 9, i32 7, metadata !20, null}
-!28 = metadata !{i32 786688, metadata !20, metadata !"i", metadata !3, i32 10, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
+!28 = metadata !{metadata !"0x100\00i\0010\000", metadata !20, metadata !3, metadata !7} ; [ DW_TAG_auto_variable ]
 !29 = metadata !{i32 10, i32 9, metadata !20, null}
 !30 = metadata !{i32 10, i32 5, metadata !20, null}
 !31 = metadata !{i32 11, i32 5, metadata !20, null}
 !32 = metadata !{i32 12, i32 3, metadata !10, null}
-!33 = metadata !{i32 786689, metadata !23, metadata !"this", metadata !3, i32 7, metadata !26, i32 0, null} ; [ DW_TAG_arg_variable ]
+!33 = metadata !{metadata !"0x101\00this\007\000", metadata !23, metadata !3, metadata !26} ; [ DW_TAG_arg_variable ]
 !34 = metadata !{i32 7, i32 11, metadata !23, null}
 !35 = metadata !{i32 7, i32 19, metadata !36, null}
-!36 = metadata !{i32 786443, metadata !38, metadata !23, i32 7, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
+!36 = metadata !{metadata !"0xb\007\0017\000", metadata !38, metadata !23} ; [ DW_TAG_lexical_block ]
 !38 = metadata !{metadata !"one.cc", metadata !"/tmp" }
 !39 = metadata !{i32 0}
-!40 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!40 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-04-19-FramePtr.ll b/test/DebugInfo/2010-04-19-FramePtr.ll
index 6c77223..e0a9219 100644
--- a/test/DebugInfo/2010-04-19-FramePtr.ll
+++ b/test/DebugInfo/2010-04-19-FramePtr.ll

@@ -24,14 +24,14 @@
 !9 = metadata !{metadata !1}
 
 !0 = metadata !{i32 2, i32 0, metadata !1, null}
-!1 = metadata !{i32 786478, metadata !10, null, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @foo, null, null, null, i32 2} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !10, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\002", metadata !10, null, metadata !4, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !10, metadata !11, metadata !11, metadata !9, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786468, metadata !10, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !2} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 2, i32 0, metadata !8, null}
-!8 = metadata !{i32 786443, metadata !10, metadata !1, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!8 = metadata !{metadata !"0xb\002\000\000", metadata !10, metadata !1} ; [ DW_TAG_lexical_block ]
 !10 = metadata !{metadata !"a.c", metadata !"/tmp"}
 !11 = metadata !{i32 0}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-05-03-DisableFramePtr.ll b/test/DebugInfo/2010-05-03-DisableFramePtr.ll
index ba8d0e5..87e2498 100644
--- a/test/DebugInfo/2010-05-03-DisableFramePtr.ll
+++ b/test/DebugInfo/2010-05-03-DisableFramePtr.ll

@@ -6,7 +6,7 @@
 entry:
   %userUPP_addr = alloca void (%struct.AppleEvent*)* ; <void (%struct.AppleEvent*)**> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{void (%struct.AppleEvent*)** %userUPP_addr}, metadata !0), !dbg !13
+  call void @llvm.dbg.declare(metadata !{void (%struct.AppleEvent*)** %userUPP_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !13
   store void (%struct.AppleEvent*)* %userUPP, void (%struct.AppleEvent*)** %userUPP_addr
   br label %return, !dbg !14
 
@@ -14,27 +14,27 @@
   ret void, !dbg !14
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!19}
-!0 = metadata !{i32 524545, metadata !1, metadata !"userUPP", metadata !2, i32 7, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 524334, metadata !16, null, metadata !"DisposeDMNotificationUPP", metadata !"DisposeDMNotificationUPP", metadata !"DisposeDMNotificationUPP", i32 7, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 524329, metadata !16} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 524305, metadata !16, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !16, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00userUPP\007\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00DisposeDMNotificationUPP\00DisposeDMNotificationUPP\00DisposeDMNotificationUPP\007\000\001\000\006\000\000\000", metadata !16, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", metadata !16, metadata !17, metadata !17, metadata !18, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !16, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{null, metadata !6}
-!6 = metadata !{i32 524310, metadata !16, metadata !2, metadata !"DMNotificationUPP", i32 6, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 524303, metadata !16, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 524309, metadata !16, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x16\00DMNotificationUPP\006\000\000\000\000", metadata !16, metadata !2, metadata !7} ; [ DW_TAG_typedef ]
+!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !16, metadata !2, metadata !8} ; [ DW_TAG_pointer_type ]
+!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !16, metadata !2, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !9 = metadata !{null, metadata !10}
-!10 = metadata !{i32 524303, metadata !16, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 524310, metadata !16, metadata !2, metadata !"AppleEvent", i32 4, i64 0, i64 0, i64 0, i32 0, metadata !12} ; [ DW_TAG_typedef ]
-!12 = metadata !{i32 524307, metadata !16, metadata !2, metadata !"AEDesc", i32 1, i64 0, i64 0, i64 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [AEDesc] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !16, metadata !2, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0x16\00AppleEvent\004\000\000\000\000", metadata !16, metadata !2, metadata !12} ; [ DW_TAG_typedef ]
+!12 = metadata !{metadata !"0x13\00AEDesc\001\000\000\000\004\000", metadata !16, metadata !2, null, null, null, null, null} ; [ DW_TAG_structure_type ] [AEDesc] [line 1, size 0, align 0, offset 0] [decl] [from ]
 !13 = metadata !{i32 7, i32 0, metadata !1, null}
 !14 = metadata !{i32 8, i32 0, metadata !15, null}
-!15 = metadata !{i32 524299, metadata !16, metadata !1, i32 7, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0xb\007\000\000", metadata !16, metadata !1} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{metadata !"t.c", metadata !"/Users/echeng/LLVM/radars/r7937664/"}
 !17 = metadata !{i32 0}
 !18 = metadata !{metadata !1}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-05-03-OriginDIE.ll b/test/DebugInfo/2010-05-03-OriginDIE.ll
index 0c5d876..fd36d47 100644
--- a/test/DebugInfo/2010-05-03-OriginDIE.ll
+++ b/test/DebugInfo/2010-05-03-OriginDIE.ll

@@ -23,12 +23,12 @@
   %a10 = call i64 @llvm.bswap.i64(i64 %a9) nounwind ; <i64> [#uses=1]
   %a11 = getelementptr inbounds %struct.gpt_t* %gpt, i32 0, i32 8, !dbg !7 ; <i64*> [#uses=1]
   %a12 = load i64* %a11, align 4, !dbg !7         ; <i64> [#uses=1]
-  call void @llvm.dbg.declare(metadata !{i64* %data_addr.i17}, metadata !8) nounwind, !dbg !14
+  call void @llvm.dbg.declare(metadata !{i64* %data_addr.i17}, metadata !8, metadata !{metadata !"0x102"}) nounwind, !dbg !14
   store i64 %a12, i64* %data_addr.i17, align 8
-  call void @llvm.dbg.value(metadata !6, i64 0, metadata !15) nounwind
-  call void @llvm.dbg.value(metadata !18, i64 0, metadata !19) nounwind
-  call void @llvm.dbg.declare(metadata !6, metadata !23) nounwind
-  call void @llvm.dbg.value(metadata !{i64* %data_addr.i17}, i64 0, metadata !34) nounwind
+  call void @llvm.dbg.value(metadata !6, i64 0, metadata !15, metadata !{metadata !"0x102"}) nounwind
+  call void @llvm.dbg.value(metadata !18, i64 0, metadata !19, metadata !{metadata !"0x102"}) nounwind
+  call void @llvm.dbg.declare(metadata !6, metadata !23, metadata !{metadata !"0x102"}) nounwind
+  call void @llvm.dbg.value(metadata !{i64* %data_addr.i17}, i64 0, metadata !34, metadata !{metadata !"0x102"}) nounwind
   %a13 = load volatile i64* %data_addr.i17, align 8 ; <i64> [#uses=1]
   %a14 = call i64 @llvm.bswap.i64(i64 %a13) nounwind ; <i64> [#uses=2]
   %a15 = add i64 %a10, %a14, !dbg !7              ; <i64> [#uses=1]
@@ -38,9 +38,9 @@
   ret void, !dbg !7
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 declare i32 @llvm.bswap.i32(i32) nounwind readnone
 
@@ -51,44 +51,44 @@
 !llvm.dbg.cu = !{!4}
 !llvm.module.flags = !{!41}
 !0 = metadata !{i32 808, i32 0, metadata !1, null}
-!1 = metadata !{i32 524299, metadata !39, metadata !2, i32 807, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{i32 524334, metadata !39, null, metadata !"gpt2gpm", metadata !"gpt2gpm", metadata !"gpt2gpm", i32 807, metadata !5, i1 true, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!3 = metadata !{i32 524329, metadata !39} ; [ DW_TAG_file_type ]
-!4 = metadata !{i32 524305, metadata !39, i32 1, metadata !"llvm-gcc", i1 true, metadata !"", i32 0, metadata !18, metadata !18, metadata !40, null, null, i32 0} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 524309, metadata !39, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!1 = metadata !{metadata !"0xb\00807\000\000", metadata !39, metadata !2} ; [ DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00gpt2gpm\00gpt2gpm\00gpt2gpm\00807\001\001\000\006\000\000\000", metadata !39, null, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x29", metadata !39} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !"0x11\001\00llvm-gcc\001\00\000\00\000", metadata !39, metadata !18, metadata !18, metadata !40, null, null} ; [ DW_TAG_compile_unit ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !39, metadata !3, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{null}
 !7 = metadata !{i32 810, i32 0, metadata !1, null}
-!8 = metadata !{i32 524545, metadata !9, metadata !"data", metadata !10, i32 201, metadata !11} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 524334, metadata !10, null, metadata !"_OSSwapInt64", metadata !"_OSSwapInt64", metadata !"_OSSwapInt64", i32 202, metadata !5, i1 true, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 524329, metadata !"OSByteOrder.h", metadata !"/usr/include/libkern/ppc", metadata !4} ; [ DW_TAG_file_type ]
-!11 = metadata !{i32 524310, metadata !36, metadata !3, metadata !"uint64_t", i32 59, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_typedef ]
-!12 = metadata !{i32 524329, metadata !"stdint.h", metadata !"/usr/4.2.1/include", metadata !4} ; [ DW_TAG_file_type ]
-!13 = metadata !{i32 524324, metadata !39, metadata !3, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x101\00data\00201\000", metadata !9, metadata !10, metadata !11} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x2e\00_OSSwapInt64\00_OSSwapInt64\00_OSSwapInt64\00202\001\001\000\006\000\000\000", metadata !10, null, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x29", metadata !"OSByteOrder.h", metadata !"/usr/include/libkern/ppc", metadata !4} ; [ DW_TAG_file_type ]
+!11 = metadata !{metadata !"0x16\00uint64_t\0059\000\000\000\000", metadata !36, metadata !3, metadata !13} ; [ DW_TAG_typedef ]
+!12 = metadata !{metadata !"0x29", metadata !"stdint.h", metadata !"/usr/4.2.1/include", metadata !4} ; [ DW_TAG_file_type ]
+!13 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", metadata !39, metadata !3} ; [ DW_TAG_base_type ]
 !14 = metadata !{i32 202, i32 0, metadata !9, metadata !7}
-!15 = metadata !{i32 524545, metadata !16, metadata !"base", metadata !10, i32 92, metadata !17} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 524334, metadata !38, null, metadata !"OSReadSwapInt64", metadata !"OSReadSwapInt64", metadata !"OSReadSwapInt64", i32 95, metadata !5, i1 true, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 524303, metadata !39, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
+!15 = metadata !{metadata !"0x101\00base\0092\000", metadata !16, metadata !10, metadata !17} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x2e\00OSReadSwapInt64\00OSReadSwapInt64\00OSReadSwapInt64\0095\001\001\000\006\000\000\000", metadata !38, null, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !39, metadata !3, null} ; [ DW_TAG_pointer_type ]
 !18 = metadata !{i32 0}
-!19 = metadata !{i32 524545, metadata !16, metadata !"byteOffset", metadata !10, i32 94, metadata !20} ; [ DW_TAG_arg_variable ]
-!20 = metadata !{i32 524310, metadata !37, metadata !3, metadata !"uintptr_t", i32 114, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_typedef ]
-!21 = metadata !{i32 524329, metadata !"types.h", metadata !"/usr/include/ppc", metadata !4} ; [ DW_TAG_file_type ]
-!22 = metadata !{i32 524324, metadata !39, metadata !3, metadata !"long unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!23 = metadata !{i32 524544, metadata !24, metadata !"u", metadata !10, i32 100, metadata !25} ; [ DW_TAG_auto_variable ]
-!24 = metadata !{i32 524299, metadata !38, metadata !16, i32 95, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!25 = metadata !{i32 524311, metadata !38, metadata !16, metadata !"", i32 97, i64 64, i64 64, i64 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_union_type ] [line 97, size 64, align 64, offset 0] [def] [from ]
+!19 = metadata !{metadata !"0x101\00byteOffset\0094\000", metadata !16, metadata !10, metadata !20} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{metadata !"0x16\00uintptr_t\00114\000\000\000\000", metadata !37, metadata !3, metadata !22} ; [ DW_TAG_typedef ]
+!21 = metadata !{metadata !"0x29", metadata !"types.h", metadata !"/usr/include/ppc", metadata !4} ; [ DW_TAG_file_type ]
+!22 = metadata !{metadata !"0x24\00long unsigned int\000\0032\0032\000\000\007", metadata !39, metadata !3} ; [ DW_TAG_base_type ]
+!23 = metadata !{metadata !"0x100\00u\00100\000", metadata !24, metadata !10, metadata !25} ; [ DW_TAG_auto_variable ]
+!24 = metadata !{metadata !"0xb\0095\000\000", metadata !38, metadata !16} ; [ DW_TAG_lexical_block ]
+!25 = metadata !{metadata !"0x17\00\0097\0064\0064\000\000\000", metadata !38, metadata !16, null, metadata !26, null, null, null} ; [ DW_TAG_union_type ] [line 97, size 64, align 64, offset 0] [def] [from ]
 !26 = metadata !{metadata !27, metadata !28}
-!27 = metadata !{i32 524301, metadata !38, metadata !25, metadata !"u64", i32 98, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_member ]
-!28 = metadata !{i32 524301, metadata !38, metadata !25, metadata !"u32", i32 99, i64 64, i64 32, i64 0, i32 0, metadata !29} ; [ DW_TAG_member ]
-!29 = metadata !{i32 524289, metadata !39, metadata !3, metadata !"", i32 0, i64 64, i64 32, i64 0, i32 0, metadata !30, metadata !32, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from uint32_t]
-!30 = metadata !{i32 524310, metadata !36, metadata !3, metadata !"uint32_t", i32 55, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_typedef ]
-!31 = metadata !{i32 524324, metadata !39, metadata !3, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!27 = metadata !{metadata !"0xd\00u64\0098\0064\0064\000\000", metadata !38, metadata !25, metadata !11} ; [ DW_TAG_member ]
+!28 = metadata !{metadata !"0xd\00u32\0099\0064\0032\000\000", metadata !38, metadata !25, metadata !29} ; [ DW_TAG_member ]
+!29 = metadata !{metadata !"0x1\00\000\0064\0032\000\000", metadata !39, metadata !3, metadata !30, metadata !32, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from uint32_t]
+!30 = metadata !{metadata !"0x16\00uint32_t\0055\000\000\000\000", metadata !36, metadata !3, metadata !31} ; [ DW_TAG_typedef ]
+!31 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !39, metadata !3} ; [ DW_TAG_base_type ]
 !32 = metadata !{metadata !33}
-!33 = metadata !{i32 524321, i64 0, i64 2}        ; [ DW_TAG_subrange_type ]
-!34 = metadata !{i32 524544, metadata !24, metadata !"addr", metadata !10, i32 96, metadata !35} ; [ DW_TAG_auto_variable ]
-!35 = metadata !{i32 524303, metadata !39, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
+!33 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ]
+!34 = metadata !{metadata !"0x100\00addr\0096\000", metadata !24, metadata !10, metadata !35} ; [ DW_TAG_auto_variable ]
+!35 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !39, metadata !3, metadata !11} ; [ DW_TAG_pointer_type ]
 !36 = metadata !{metadata !"stdint.h", metadata !"/usr/4.2.1/include"}
 !37 = metadata !{metadata !"types.h", metadata !"/usr/include/ppc"}
 !38 = metadata !{metadata !"OSByteOrder.h", metadata !"/usr/include/libkern/ppc"}
 !39 = metadata !{metadata !"G.c", metadata !"/tmp"}
 !40 = metadata !{metadata !2, metadata !9, metadata !16}
-!41 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!41 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-05-10-MultipleCU.ll b/test/DebugInfo/2010-05-10-MultipleCU.ll
index 75d2e70..2e18dbf 100644
--- a/test/DebugInfo/2010-05-10-MultipleCU.ll
+++ b/test/DebugInfo/2010-05-10-MultipleCU.ll

@@ -23,22 +23,22 @@
 !17 = metadata !{metadata !10}
 
 !0 = metadata !{i32 3, i32 0, metadata !1, null}
-!1 = metadata !{i32 786443, metadata !18, metadata !2, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{i32 786478, metadata !18, metadata !3, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!3 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!4 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !19, metadata !19, metadata !16, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 786453, metadata !18, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!1 = metadata !{metadata !"0xb\002\000\000", metadata !18, metadata !2} ; [ DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", metadata !18, metadata !3, metadata !5, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !18, metadata !19, metadata !19, metadata !16, null, null} ; [ DW_TAG_compile_unit ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !3, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786468, metadata !18, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !18, metadata !3} ; [ DW_TAG_base_type ]
 !8 = metadata !{i32 3, i32 0, metadata !9, null}
-!9 = metadata !{i32 786443, metadata !20, metadata !10, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{i32 786478, metadata !20, metadata !11, metadata !"bar", metadata !"bar", metadata !"bar", i32 2, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!12 = metadata !{i32 786449, metadata !20, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!13 = metadata !{i32 786453, metadata !20, metadata !11, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0xb\002\000\000", metadata !20, metadata !10} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0x2e\00bar\00bar\00bar\002\000\001\000\006\000\000\000", metadata !20, metadata !11, metadata !13, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!12 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !20, metadata !19, metadata !19, metadata !17, null, null} ; [ DW_TAG_compile_unit ]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !11, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{metadata !15}
-!15 = metadata !{i32 786468, metadata !20, metadata !11, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!15 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !20, metadata !11} ; [ DW_TAG_base_type ]
 !18 = metadata !{metadata !"a.c", metadata !"/tmp/"}
 !19 = metadata !{i32 0}
 !20 = metadata !{metadata !"b.c", metadata !"/tmp/"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll b/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll
index a461abd..e1e42cd 100644
--- a/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll
+++ b/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll

@@ -7,15 +7,15 @@
 
 @i = common global i32 0                          ; <i32*> [#uses=2]
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define i32 @bar() nounwind ssp {
 entry:
   %0 = load i32* @i, align 4, !dbg !17            ; <i32> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !9), !dbg !19
-  tail call void @llvm.dbg.declare(metadata !29, metadata !10), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !19
+  tail call void @llvm.dbg.declare(metadata !29, metadata !10, metadata !{metadata !"0x102"}), !dbg !21
   %1 = mul nsw i32 %0, %0, !dbg !22               ; <i32> [#uses=2]
   store i32 %1, i32* @i, align 4, !dbg !17
   ret i32 %1, !dbg !23
@@ -24,25 +24,25 @@
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!28}
 
-!0 = metadata !{i32 786478, metadata !27, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 9, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, metadata !24, i32 9} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !27} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !27, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !20, metadata !20, metadata !25, metadata !26,  metadata !20, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !27, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\009\001\001\000\006\000\001\009", metadata !27, metadata !1, metadata !3, null, null, null, null, metadata !24} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !27} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !27, metadata !20, metadata !20, metadata !25, metadata !26,  metadata !20} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !27, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !5}
-!5 = metadata !{i32 786468, metadata !27, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !27, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 14, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786453, metadata !27, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !27, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00bar\00bar\00bar\0014\000\001\000\006\000\001\000", metadata !27, metadata !1, metadata !7, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !27, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !5}
-!9 = metadata !{i32 786689, metadata !0, metadata !"j", metadata !1, i32 9, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{i32 786688, metadata !11, metadata !"xyz", metadata !1, i32 10, metadata !12, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !1, metadata !0, i32 9, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 786451, metadata !27, metadata !0, metadata !"X", i32 10, i64 64, i64 32, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 10, size 64, align 32, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0x101\00j\009\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
+!10 = metadata !{metadata !"0x100\00xyz\0010\000", metadata !11, metadata !1, metadata !12} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !0} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{metadata !"0x13\00X\0010\0064\0032\000\000\000", metadata !27, metadata !0, null, metadata !13, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 10, size 64, align 32, offset 0] [def] [from ]
 !13 = metadata !{metadata !14, metadata !15}
-!14 = metadata !{i32 786445, metadata !27, metadata !12, metadata !"a", i32 10, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
-!15 = metadata !{i32 786445, metadata !27, metadata !12, metadata !"b", i32 10, i64 32, i64 32, i64 32, i32 0, metadata !5} ; [ DW_TAG_member ]
-!16 = metadata !{i32 786484, i32 0, metadata !1, metadata !"i", metadata !"i", metadata !"", metadata !1, i32 5, metadata !5, i1 false, i1 true, i32* @i, null} ; [ DW_TAG_variable ]
+!14 = metadata !{metadata !"0xd\00a\0010\0032\0032\000\000", metadata !27, metadata !12, metadata !5} ; [ DW_TAG_member ]
+!15 = metadata !{metadata !"0xd\00b\0010\0032\0032\0032\000", metadata !27, metadata !12, metadata !5} ; [ DW_TAG_member ]
+!16 = metadata !{metadata !"0x34\00i\00i\00\005\000\001", metadata !1, metadata !1, metadata !5, i32* @i, null} ; [ DW_TAG_variable ]
 !17 = metadata !{i32 15, i32 0, metadata !18, null}
-!18 = metadata !{i32 786443, metadata !1, metadata !6, i32 14, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{metadata !"0xb\0014\000\001", metadata !1, metadata !6} ; [ DW_TAG_lexical_block ]
 !19 = metadata !{i32 9, i32 0, metadata !0, metadata !17}
 !20 = metadata !{}
 !21 = metadata !{i32 9, i32 0, metadata !11, metadata !17}
@@ -52,5 +52,5 @@
 !25 = metadata !{metadata !0, metadata !6}
 !26 = metadata !{metadata !16}
 !27 = metadata !{metadata !"bar.c", metadata !"/tmp/"}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !29 = metadata !{null}

diff --git a/test/DebugInfo/2010-07-19-Crash.ll b/test/DebugInfo/2010-07-19-Crash.ll
index a10b10a..7330843 100644
--- a/test/DebugInfo/2010-07-19-Crash.ll
+++ b/test/DebugInfo/2010-07-19-Crash.ll

@@ -12,19 +12,19 @@
 !llvm.dbg.sp = !{!0, !6, !11}
 !llvm.dbg.lv.foo = !{!7}
 
-!0 = metadata !{i32 524334, metadata !12, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 524329, metadata !12} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 524305, metadata !12, i32 12, metadata !"clang 2.8", i1 true, metadata !"", i32 0, metadata !14, metadata !14, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 524309, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00bar\00bar\00bar\003\000\001\000\006\000\001\000", metadata !12, metadata !1, metadata !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang 2.8\001\00\000\00\000", metadata !12, metadata !14, metadata !14, metadata !13, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 524324, metadata !12, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 524334, metadata !12, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 7, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 524544, metadata !8, metadata !"one", metadata !1, i32 8, metadata !5} ; [ DW_TAG_auto_variable ]
-!8 = metadata !{i32 524299, metadata !12, metadata !6, i32 7, i32 18, i32 0} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00foo\00foo\00foo\007\001\001\000\006\000\001\000", metadata !12, metadata !1, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{metadata !"0x100\00one\008\000", metadata !8, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!8 = metadata !{metadata !"0xb\007\0018\000", metadata !12, metadata !6} ; [ DW_TAG_lexical_block ]
 !9 = metadata !{i32 4, i32 3, metadata !10, null}
-!10 = metadata !{i32 524299, metadata !12, metadata !0, i32 3, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 524334, metadata !12, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 7, metadata !3, i1 true, i1 false, i32 0, i32 0, null, i1 false, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0xb\003\0011\000", metadata !12, metadata !0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{metadata !"0x2e\00foo\00foo\00foo\007\001\000\000\006\000\001\000", metadata !12, metadata !1, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
 !12 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
 !13 = metadata !{metadata !0}
 !14 = metadata !{i32 0}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/2010-10-01-crash.ll b/test/DebugInfo/2010-10-01-crash.ll
index f8dbb6e..6c6c7f5 100644
--- a/test/DebugInfo/2010-10-01-crash.ll
+++ b/test/DebugInfo/2010-10-01-crash.ll

@@ -4,23 +4,23 @@
 
 define void @CGRectStandardize(i32* sret %agg.result, i32* byval %rect) nounwind ssp {
 entry:
-  call void @llvm.dbg.declare(metadata !{i32* %rect}, metadata !23), !dbg !24
+  call void @llvm.dbg.declare(metadata !{i32* %rect}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
   ret void
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
 
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!27}
-!0 = metadata !{i32 589870, metadata !1, null, metadata !"CGRectStandardize", metadata !"CGRectStandardize", metadata !"CGRectStandardize", i32 54, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void (i32*, i32*)* @CGRectStandardize, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 54] [def] [scope 0] [CGRectStandardize]
-!1 = metadata !{i32 589865, metadata !25}
-!2 = metadata !{i32 589841, metadata !25, i32 16, metadata !"clang version 2.9 (trunk 115292)", i1 true, metadata !"", i32 1, metadata !26, metadata !26, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 589846, metadata !25, null, metadata !"CGRect", i32 49, i64 0, i64 0, i64 0, i32 0, null}
-!23 = metadata !{i32 590081, metadata !0, metadata !"rect", metadata !1, i32 53, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!0 = metadata !{metadata !"0x2e\00CGRectStandardize\00CGRectStandardize\00CGRectStandardize\0054\000\001\000\006\000\000\000", metadata !1, null, null, null, void (i32*, i32*)* @CGRectStandardize, null, null, null} ; [ DW_TAG_subprogram ] [line 54] [def] [scope 0] [CGRectStandardize]
+!1 = metadata !{metadata !"0x29", metadata !25} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0016\00clang version 2.9 (trunk 115292)\001\00\001\00\000", metadata !25, metadata !26, metadata !26, null, null, null} ; [ DW_TAG_compile_unit ]
+!5 = metadata !{metadata !"0x16\00CGRect\0049\000\000\000\000", metadata !25, null, null} ; [ DW_TAG_typedef ]
+!23 = metadata !{metadata !"0x101\00rect\0053\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
 !24 = metadata !{i32 53, i32 33, metadata !0, null}
 !25 = metadata !{metadata !"GSFusedSilica.m", metadata !"/Volumes/Data/Users/sabre/Desktop"}
 !26 = metadata !{i32 0}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/AArch64/big-endian-dump.ll b/test/DebugInfo/AArch64/big-endian-dump.ll
new file mode 100644
index 0000000..3af3001
--- /dev/null
+++ b/test/DebugInfo/AArch64/big-endian-dump.ll

@@ -0,0 +1,19 @@
+; RUN: llc -O0 -filetype=obj -mtriple=aarch64_be-none-linux < %s | llvm-dwarfdump - | FileCheck %s
+
+; Checks that llvm-dwarfdump reports the big-endian AArch64 ELF format for an
+; object that llc emits from a module containing only an empty compile unit.
+
+; CHECK: file format ELF64-aarch64-big
+
+target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 2} ; same version as the string-header encoding used by !0
+!5 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/AArch64/big-endian.ll b/test/DebugInfo/AArch64/big-endian.ll
new file mode 100644
index 0000000..8391d44
--- /dev/null
+++ b/test/DebugInfo/AArch64/big-endian.ll

@@ -0,0 +1,22 @@
+; RUN: llc %s -filetype=asm -o -
+
+target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128" ; leading "E" selects a big-endian data layout
+target triple = "aarch64_be--none-eabi" ; big-endian AArch64 target
+
+@a = common global i32 0, align 4 ; the global that debug variable !4 refers to
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/work/validation/-] [DW_LANG_C99]
+!1 = metadata !{metadata !"-", metadata !"/work/validation"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !5, metadata !7, i32* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!5 = metadata !{metadata !"0x29", metadata !6}          ; [ DW_TAG_file_type ] [/work/validation/<stdin>]
+!6 = metadata !{metadata !"<stdin>", metadata !"/work/validation"}
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!10 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/AArch64/dwarfdump.ll b/test/DebugInfo/AArch64/dwarfdump.ll
index 98e863d..e9dd428 100644
--- a/test/DebugInfo/AArch64/dwarfdump.ll
+++ b/test/DebugInfo/AArch64/dwarfdump.ll

@@ -27,14 +27,14 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!10}
 
-!0 = metadata !{i32 786449, metadata !9, i32 12, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/timnor01/llvm/build/tmp.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 \000\00\000\00\000", metadata !9, metadata !1, metadata !1, metadata !2, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/timnor01/llvm/build/tmp.c] [DW_LANG_C99]
 !1 = metadata !{}
 !2 = metadata !{metadata !3}
-!3 = metadata !{i32 786478, metadata !9, metadata !4, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
-!4 = metadata !{i32 786473, metadata !9} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!3 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\001", metadata !9, metadata !4, metadata !5, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!4 = metadata !{metadata !"0x29", metadata !9} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !8 = metadata !{i32 2, i32 0, metadata !3, null}
 !9 = metadata !{metadata !"tmp.c", metadata !"/home/tim/llvm/build"}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/AArch64/little-endian-dump.ll b/test/DebugInfo/AArch64/little-endian-dump.ll
new file mode 100644
index 0000000..5c7f336
--- /dev/null
+++ b/test/DebugInfo/AArch64/little-endian-dump.ll

@@ -0,0 +1,19 @@
+; RUN: llc -O0 -filetype=obj -mtriple=aarch64-none-linux < %s | llvm-dwarfdump - | FileCheck %s
+
+; Checks that llvm-dwarfdump reports the little-endian AArch64 ELF format for an
+; object that llc emits from a module containing only an empty compile unit.
+
+; CHECK: file format ELF64-aarch64-little
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 2} ; same version as the string-header encoding used by !0
+!5 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/AArch64/processes-relocations.ll b/test/DebugInfo/AArch64/processes-relocations.ll
new file mode 100644
index 0000000..5ce9262
--- /dev/null
+++ b/test/DebugInfo/AArch64/processes-relocations.ll

@@ -0,0 +1,18 @@
+; RUN: llc -filetype=obj -O0 < %s -mtriple aarch64-unknown-linux | \
+; RUN:      llvm-dwarfdump - 2>&1 | FileCheck %s
+
+; Checks that llvm-dwarfdump processes the relocations of an AArch64 object's
+; debug sections without an error; the compile unit below provides those sections.
+
+; CHECK-NOT: failed to compute relocation
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 2} ; same version as the string-header encoding used by !0
+!5 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/AArch64/struct_by_value.ll b/test/DebugInfo/AArch64/struct_by_value.ll
index 0e336f7..b9adb45 100644
--- a/test/DebugInfo/AArch64/struct_by_value.ll
+++ b/test/DebugInfo/AArch64/struct_by_value.ll

@@ -32,14 +32,14 @@
 ; Function Attrs: nounwind ssp
 define i32 @return_five_int(%struct.five* %f) #0 {
 entry:
-  call void @llvm.dbg.declare(metadata !{%struct.five* %f}, metadata !17), !dbg !18
+  call void @llvm.dbg.declare(metadata !{%struct.five* %f}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
   %a = getelementptr inbounds %struct.five* %f, i32 0, i32 0, !dbg !19
   %0 = load i32* %a, align 4, !dbg !19
   ret i32 %0, !dbg !19
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind ssp }
 attributes #1 = { nounwind readnone }
@@ -47,24 +47,24 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!16, !20}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"LLVM version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [struct_by_value.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00LLVM version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [struct_by_value.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"struct_by_value.c", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"return_five_int", metadata !"return_five_int", metadata !"", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%struct.five*)* @return_five_int, null, null, metadata !2, i32 14} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 14] [return_five_int]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [struct_by_value.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00return_five_int\00return_five_int\00\0013\000\001\000\006\00256\000\0014", metadata !1, metadata !5, metadata !6, null, i32 (%struct.five*)* @return_five_int, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 14] [return_five_int]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [struct_by_value.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !9}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786451, metadata !1, null, metadata !"five", i32 1, i64 160, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [five] [line 1, size 160, align 32, offset 0] [def] [from ]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x13\00five\001\00160\0032\000\000\000", metadata !1, null, null, metadata !10, null, null, null} ; [ DW_TAG_structure_type ] [five] [line 1, size 160, align 32, offset 0] [def] [from ]
 !10 = metadata !{metadata !11, metadata !12, metadata !13, metadata !14, metadata !15}
-!11 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"a", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 3, size 32, align 32, offset 0] [from int]
-!12 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"b", i32 4, i64 32, i64 32, i64 32, i32 0, metadata !8} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 32] [from int]
-!13 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"c", i32 5, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ] [c] [line 5, size 32, align 32, offset 64] [from int]
-!14 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"d", i32 6, i64 32, i64 32, i64 96, i32 0, metadata !8} ; [ DW_TAG_member ] [d] [line 6, size 32, align 32, offset 96] [from int]
-!15 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"e", i32 7, i64 32, i64 32, i64 128, i32 0, metadata !8} ; [ DW_TAG_member ] [e] [line 7, size 32, align 32, offset 128] [from int]
+!11 = metadata !{metadata !"0xd\00a\003\0032\0032\000\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [a] [line 3, size 32, align 32, offset 0] [from int]
+!12 = metadata !{metadata !"0xd\00b\004\0032\0032\0032\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 32] [from int]
+!13 = metadata !{metadata !"0xd\00c\005\0032\0032\0064\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [c] [line 5, size 32, align 32, offset 64] [from int]
+!14 = metadata !{metadata !"0xd\00d\006\0032\0032\0096\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [d] [line 6, size 32, align 32, offset 96] [from int]
+!15 = metadata !{metadata !"0xd\00e\007\0032\0032\00128\000", metadata !1, metadata !9, metadata !8} ; [ DW_TAG_member ] [e] [line 7, size 32, align 32, offset 128] [from int]
 !16 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!17 = metadata !{i32 786689, metadata !4, metadata !"f", metadata !5, i32 16777229, metadata !9, i32 8192, i32 0} ; [ DW_TAG_arg_variable ] [f] [line 13]
+!17 = metadata !{metadata !"0x101\00f\0016777229\008192", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [f] [line 13]
 !18 = metadata !{i32 13, i32 0, metadata !4, null}
 !19 = metadata !{i32 16, i32 0, metadata !4, null}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/ARM/PR16736.ll b/test/DebugInfo/ARM/PR16736.ll
index 8c025ad..afa0ece 100644
--- a/test/DebugInfo/ARM/PR16736.ll
+++ b/test/DebugInfo/ARM/PR16736.ll

@@ -15,14 +15,14 @@
 ; Function Attrs: nounwind
 define arm_aapcscc void @_Z1hiiiif(i32, i32, i32, i32, float %x) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !12), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !13), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{i32 %2}, i64 0, metadata !14), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{i32 %3}, i64 0, metadata !15), !dbg !18
-  tail call void @llvm.dbg.value(metadata !{float %x}, i64 0, metadata !16), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %2}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %3}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{float %x}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
   %call = tail call arm_aapcscc i32 @_Z1fv() #3, !dbg !19
   %conv = sitofp i32 %call to float, !dbg !19
-  tail call void @llvm.dbg.value(metadata !{float %conv}, i64 0, metadata !16), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{float %conv}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !19
   tail call arm_aapcscc void @_Z1gf(float %conv) #3, !dbg !19
   ret void, !dbg !20
 }
@@ -32,7 +32,7 @@
 declare arm_aapcscc i32 @_Z1fv()
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { nounwind  }
 attributes #2 = { nounwind readnone }
@@ -41,25 +41,25 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!17, !21}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 190804) (llvm/trunk 190797)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [//<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 190804) (llvm/trunk 190797)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [//<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"/<unknown>", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"h", metadata !"h", metadata !"_Z1hiiiif", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32, i32, i32, i32, float)* @_Z1hiiiif, null, null, metadata !11, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [h]
+!4 = metadata !{metadata !"0x2e\00h\00h\00_Z1hiiiif\003\000\001\000\006\00256\001\003", metadata !5, metadata !6, metadata !7, null, void (i32, i32, i32, i32, float)* @_Z1hiiiif, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 3] [def] [h]
 !5 = metadata !{metadata !"/arm.cpp", metadata !""}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [//arm.cpp]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [//arm.cpp]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !9, metadata !9, metadata !9, metadata !10}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
 !11 = metadata !{metadata !12, metadata !13, metadata !14, metadata !15, metadata !16}
-!12 = metadata !{i32 786689, metadata !4, metadata !"", metadata !6, i32 16777219, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 3]
-!13 = metadata !{i32 786689, metadata !4, metadata !"", metadata !6, i32 33554435, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 3]
-!14 = metadata !{i32 786689, metadata !4, metadata !"", metadata !6, i32 50331651, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 3]
-!15 = metadata !{i32 786689, metadata !4, metadata !"", metadata !6, i32 67108867, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 3]
-!16 = metadata !{i32 786689, metadata !4, metadata !"x", metadata !6, i32 83886083, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 3]
+!12 = metadata !{metadata !"0x101\00\0016777219\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 3]
+!13 = metadata !{metadata !"0x101\00\0033554435\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 3]
+!14 = metadata !{metadata !"0x101\00\0050331651\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 3]
+!15 = metadata !{metadata !"0x101\00\0067108867\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 3]
+!16 = metadata !{metadata !"0x101\00x\0083886083\000", metadata !4, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ] [x] [line 3]
 !17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !18 = metadata !{i32 3, i32 0, metadata !4, null}
 !19 = metadata !{i32 4, i32 0, metadata !4, null}
 !20 = metadata !{i32 5, i32 0, metadata !4, null}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/ARM/big-endian-dump.ll b/test/DebugInfo/ARM/big-endian-dump.ll
new file mode 100644
index 0000000..e35f097
--- /dev/null
+++ b/test/DebugInfo/ARM/big-endian-dump.ll

@@ -0,0 +1,18 @@
+; RUN: llc -O0 -filetype=obj -mtriple=armeb-none-linux < %s | llvm-dwarfdump - | FileCheck %s
+
+; CHECK: file format ELF32-arm-big
+
+target datalayout = "E-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{i32 1, metadata !"wchar_size", i32 4}
+!6 = metadata !{i32 1, metadata !"min_enum_size", i32 4}
+!7 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/ARM/little-endian-dump.ll b/test/DebugInfo/ARM/little-endian-dump.ll
new file mode 100644
index 0000000..da60657
--- /dev/null
+++ b/test/DebugInfo/ARM/little-endian-dump.ll

@@ -0,0 +1,18 @@
+; RUN: llc -O0 -filetype=obj -mtriple=arm-none-linux < %s | llvm-dwarfdump - | FileCheck %s
+
+; CHECK: file format ELF32-arm-little
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{i32 1, metadata !"wchar_size", i32 4}
+!6 = metadata !{i32 1, metadata !"min_enum_size", i32 4}
+!7 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll b/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll
index 0378c75..764c57d 100644
--- a/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll
+++ b/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll

@@ -19,18 +19,18 @@
 ; Function Attrs: nounwind optsize readnone
 define void @run(float %r) #0 {
 entry:
-  tail call void @llvm.dbg.declare(metadata !{float %r}, metadata !11), !dbg !22
+  tail call void @llvm.dbg.declare(metadata !{float %r}, metadata !11, metadata !{metadata !"0x102"}), !dbg !22
   %conv = fptosi float %r to i32, !dbg !23
-  tail call void @llvm.dbg.declare(metadata !{i32 %conv}, metadata !12), !dbg !23
+  tail call void @llvm.dbg.declare(metadata !{i32 %conv}, metadata !12, metadata !{metadata !"0x102"}), !dbg !23
   %vla = alloca float, i32 %conv, align 4, !dbg !24
-  tail call void @llvm.dbg.declare(metadata !{float* %vla}, metadata !14), !dbg !24
+  tail call void @llvm.dbg.declare(metadata !{float* %vla}, metadata !14, metadata !{metadata !"0x102"}), !dbg !24
 ; The VLA alloca should be described by a dbg.declare:
-; CHECK: call void @llvm.dbg.declare(metadata !{float* %vla}, metadata ![[VLA:.*]])
+; CHECK: call void @llvm.dbg.declare(metadata !{float* %vla}, metadata ![[VLA:.*]], metadata {{.*}})
 ; The VLA alloca and following store into the array should not be lowered to like this:
 ; CHECK-NOT:  call void @llvm.dbg.value(metadata !{float %r}, i64 0, metadata ![[VLA]])
 ; the backend interprets this as "vla has the location of %r".
   store float %r, float* %vla, align 4, !dbg !25, !tbaa !26
-  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !18), !dbg !30
+  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !30
   %cmp8 = icmp sgt i32 %conv, 0, !dbg !30
   br i1 %cmp8, label %for.body, label %for.end, !dbg !30
 
@@ -41,7 +41,7 @@
   %div = fdiv float %0, %r, !dbg !31
   store float %div, float* %arrayidx2, align 4, !dbg !31, !tbaa !26
   %inc = add nsw i32 %i.09, 1, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !18), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !30
   %exitcond = icmp eq i32 %inc, %conv, !dbg !30
   br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !dbg !30
 
@@ -55,10 +55,10 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind optsize readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -67,26 +67,26 @@
 !llvm.module.flags = !{!20, !33}
 !llvm.ident = !{!21}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Data/radar/15464571/<unknown>] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Volumes/Data/radar/15464571/<unknown>] [DW_LANG_C99]
 !1 = metadata !{metadata !"<unknown>", metadata !"/Volumes/Data/radar/15464571"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"run", metadata !"run", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (float)* @run, null, null, metadata !10, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [run]
+!4 = metadata !{metadata !"0x2e\00run\00run\00\001\000\001\000\006\00256\001\002", metadata !5, metadata !6, metadata !7, null, void (float)* @run, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [run]
 !5 = metadata !{metadata !"test.c", metadata !"/Volumes/Data/radar/15464571"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15464571/test.c]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15464571/test.c]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!9 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
 !10 = metadata !{metadata !11, metadata !12, metadata !14, metadata !18}
-!11 = metadata !{i32 786689, metadata !4, metadata !"r", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 1]
-!12 = metadata !{i32 786688, metadata !4, metadata !"count", metadata !6, i32 3, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [count] [line 3]
-!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{i32 786688, metadata !4, metadata !"vla", metadata !6, i32 4, metadata !15, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [vla] [line 4]
-!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from float]
+!11 = metadata !{metadata !"0x101\00r\0016777217\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [r] [line 1]
+!12 = metadata !{metadata !"0x100\00count\003\000", metadata !4, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [count] [line 3]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !"0x100\00vla\004\008192", metadata !4, metadata !6, metadata !15} ; [ DW_TAG_auto_variable ] [vla] [line 4]
+!15 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from float]
 !16 = metadata !{metadata !17}
-!17 = metadata !{i32 786465, i64 0, i64 -1}       ; [ DW_TAG_subrange_type ] [unbounded]
-!18 = metadata !{i32 786688, metadata !19, metadata !"i", metadata !6, i32 6, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 6]
-!19 = metadata !{i32 786443, metadata !5, metadata !4, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Volumes/Data/radar/15464571/test.c]
+!17 = metadata !{metadata !"0x21\000\00-1"}       ; [ DW_TAG_subrange_type ] [unbounded]
+!18 = metadata !{metadata !"0x100\00i\006\000", metadata !19, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [i] [line 6]
+!19 = metadata !{metadata !"0xb\006\000\000", metadata !5, metadata !4} ; [ DW_TAG_lexical_block ] [/Volumes/Data/radar/15464571/test.c]
 !20 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
 !21 = metadata !{metadata !"clang version 3.4 "}
 !22 = metadata !{i32 1, i32 0, metadata !4, null}
@@ -99,5 +99,5 @@
 !29 = metadata !{metadata !"Simple C/C++ TBAA"}
 !30 = metadata !{i32 6, i32 0, metadata !19, null}
 !31 = metadata !{i32 7, i32 0, metadata !19, null}
-!32 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!32 = metadata !{i32 8, i32 0, metadata !4, null}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/ARM/processes-relocations.ll b/test/DebugInfo/ARM/processes-relocations.ll
new file mode 100644
index 0000000..8edd954
--- /dev/null
+++ b/test/DebugInfo/ARM/processes-relocations.ll

@@ -0,0 +1,15 @@
+; RUN: llc -filetype=obj -O0 < %s -mtriple arm-unknown-linux | \
+; RUN:      llvm-dwarfdump - 2>&1 | FileCheck %s
+
+; CHECK-NOT: failed to compute relocation
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/ARM/s-super-register.ll b/test/DebugInfo/ARM/s-super-register.ll
new file mode 100644
index 0000000..0120045
--- /dev/null
+++ b/test/DebugInfo/ARM/s-super-register.ll

@@ -0,0 +1,63 @@
+; RUN: llc < %s - -filetype=obj | llvm-dwarfdump -debug-dump=loc - | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.6.7"
+
+; The S registers on ARM are expressed as pieces of their super-registers in DWARF.
+;
+; 0x90   DW_OP_regx of super-register
+; 0x93   DW_OP_piece
+; 0x9d   DW_OP_bit_piece
+; CHECK:            Location description: 90 {{.. .. ((93 ..)|(9d .. ..)) $}}
+
+define void @_Z3foov() optsize ssp {
+entry:
+  %call = tail call float @_Z3barv() optsize, !dbg !11
+  tail call void @llvm.dbg.value(metadata !{float %call}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !11
+  %call16 = tail call float @_Z2f2v() optsize, !dbg !12
+  %cmp7 = fcmp olt float %call, %call16, !dbg !12
+  br i1 %cmp7, label %for.body, label %for.end, !dbg !12
+
+for.body:                                         ; preds = %entry, %for.body
+  %k.08 = phi float [ %inc, %for.body ], [ %call, %entry ]
+  %call4 = tail call float @_Z2f3f(float %k.08) optsize, !dbg !13
+  %inc = fadd float %k.08, 1.000000e+00, !dbg !14
+  %call1 = tail call float @_Z2f2v() optsize, !dbg !12
+  %cmp = fcmp olt float %inc, %call1, !dbg !12
+  br i1 %cmp, label %for.body, label %for.end, !dbg !12
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void, !dbg !15
+}
+
+declare float @_Z3barv() optsize
+
+declare float @_Z2f2v() optsize
+
+declare float @_Z2f3f(float) optsize
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!20}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 130845)\001\00\000\00\001", metadata !18, metadata !19, metadata !19, metadata !16, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\001\005", metadata !18, metadata !2, metadata !3, null, void ()* @_Z3foov, null, null, metadata !17} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{null}
+!5 = metadata !{metadata !"0x100\00k\006\000", metadata !6, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{metadata !"0xb\005\0012\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !0} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x100\00y\008\000", metadata !9, metadata !2, metadata !7} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{metadata !"0xb\007\0025\002", metadata !18, metadata !10} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0xb\007\003\001", metadata !18, metadata !6} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 6, i32 18, metadata !6, null}
+!12 = metadata !{i32 7, i32 3, metadata !6, null}
+!13 = metadata !{i32 8, i32 20, metadata !9, null}
+!14 = metadata !{i32 7, i32 20, metadata !10, null}
+!15 = metadata !{i32 10, i32 1, metadata !6, null}
+!16 = metadata !{metadata !1}
+!17 = metadata !{metadata !5, metadata !8}
+!18 = metadata !{metadata !"k.cc", metadata !"/private/tmp"}
+!19 = metadata !{i32 0}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/ARM/sectionorder.ll b/test/DebugInfo/ARM/sectionorder.ll
index a7030cd..24733d9 100644
--- a/test/DebugInfo/ARM/sectionorder.ll
+++ b/test/DebugInfo/ARM/sectionorder.ll

@@ -11,7 +11,8 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.dbg.cu = !{!0}
 
-!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"test.c", metadata !"/Volumes/Data/radar/15623193", metadata !"LLVM", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !1} ; [ DW_TAG_compile_unit ] [/Volumes/Data/radar/15623193/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00LLVM\001\00\00\00\00", metadata !5, metadata !1, metadata !1, metadata !1, metadata !1, null} ; [ DW_TAG_compile_unit ] [/Volumes/Data/radar/15623193/test.c] [DW_LANG_C99]
 !1 = metadata !{}
 !3 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!5 = metadata !{metadata !"test.c", metadata !"/Volumes/Data/radar/15623193"}

diff --git a/test/DebugInfo/ARM/selectiondag-deadcode.ll b/test/DebugInfo/ARM/selectiondag-deadcode.ll
index cc151e0..76e19ef 100644
--- a/test/DebugInfo/ARM/selectiondag-deadcode.ll
+++ b/test/DebugInfo/ARM/selectiondag-deadcode.ll

@@ -13,15 +13,15 @@
   ; and SelectionDAGISel crashes.  It should definitely not
   ; crash. Drop the dbg_value instead.
   ; CHECK-NOT: "matrix"
-  tail call void @llvm.dbg.declare(metadata !{%class.Matrix3.0.6.10* %agg.result}, metadata !45)
+  tail call void @llvm.dbg.declare(metadata !{%class.Matrix3.0.6.10* %agg.result}, metadata !45, metadata !{metadata !"0x102"})
   %2 = getelementptr inbounds %class.Matrix3.0.6.10* %agg.result, i32 0, i32 0, i32 8
   ret void
 }
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 declare arm_aapcscc void @_ZL4Sqrtd() #2
-!4 = metadata !{i32 786434, metadata !5, null, metadata !"Matrix3", i32 20, i64 288, i64 32, i32 0, i32 0, null, null, i32 0, null, null, metadata !"_ZTS7Matrix3"} ; [ DW_TAG_class_type ] [Matrix3] [line 20, size 288, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00Matrix3\0020\00288\0032\000\000\000", metadata !5, null, null, null, null, null, metadata !"_ZTS7Matrix3"} ; [ DW_TAG_class_type ] [Matrix3] [line 20, size 288, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !"test.ii", metadata !"/Volumes/Data/radar/15094721"}
-!39 = metadata !{i32 786478, metadata !5, metadata !40, metadata !"GetMatrix", metadata !"GetMatrix", metadata !"_Z9GetMatrixv", i32 32, metadata !41, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.Matrix3.0.6.10*)* @_Z9GetMatrixv, null, null, null, i32 32} ; [ DW_TAG_subprogram ] [line 32] [def] [GetMatrix]
-!40 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15094721/test.ii]
-!41 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, null, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!45 = metadata !{i32 786688, metadata !39, metadata !"matrix", metadata !40, i32 35, metadata !4, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [matrix] [line 35]
+!39 = metadata !{metadata !"0x2e\00GetMatrix\00GetMatrix\00_Z9GetMatrixv\0032\000\001\000\006\00256\001\0032", metadata !5, metadata !40, metadata !41, null, void (%class.Matrix3.0.6.10*)* @_Z9GetMatrixv, null, null, null} ; [ DW_TAG_subprogram ] [line 32] [def] [GetMatrix]
+!40 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15094721/test.ii]
+!41 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, null, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!45 = metadata !{metadata !"0x100\00matrix\0035\008192", metadata !39, metadata !40, metadata !4} ; [ DW_TAG_auto_variable ] [matrix] [line 35]

diff --git a/test/DebugInfo/ARM/tls.ll b/test/DebugInfo/ARM/tls.ll
index e54d160..c4be030 100644
--- a/test/DebugInfo/ARM/tls.ll
+++ b/test/DebugInfo/ARM/tls.ll

@@ -16,13 +16,13 @@
 ; The debug relocation of the address of the tls variable
 ; CHECK: .long x(tlsldo)
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/tls.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/tls.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"tls.c", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !5, i32 1, metadata !6, i32 0, i32 1, i32* @x, null} ; [ DW_TAG_variable ] [x] [line 1] [def]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.c]
-!6 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!4 = metadata !{metadata !"0x34\00x\00x\00\001\000\001", null, metadata !5, metadata !6, i32* @x, null} ; [ DW_TAG_variable ] [x] [line 1] [def]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.c]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5 "}

diff --git a/test/DebugInfo/COFF/asan-module-ctor.ll b/test/DebugInfo/COFF/asan-module-ctor.ll
index c1d8e75..a62604c 100644
--- a/test/DebugInfo/COFF/asan-module-ctor.ll
+++ b/test/DebugInfo/COFF/asan-module-ctor.ll

@@ -78,14 +78,14 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [D:\/asan.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/asan.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"asan.c", metadata !"D:\5C"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [D:\/asan.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 ()* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [D:\/asan.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5.0 "}
 !10 = metadata !{i32 2, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/COFF/asan-module-without-functions.ll b/test/DebugInfo/COFF/asan-module-without-functions.ll
index 419faa0..d5af109 100644
--- a/test/DebugInfo/COFF/asan-module-without-functions.ll
+++ b/test/DebugInfo/COFF/asan-module-without-functions.ll

@@ -45,9 +45,9 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [D:\/asan.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/asan.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"asan.c", metadata !"D:\5C"}
 !2 = metadata !{}
 !3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !5 = metadata !{metadata !"clang version 3.5.0 "}

diff --git a/test/DebugInfo/COFF/asm.ll b/test/DebugInfo/COFF/asm.ll
index 8c9dff0..9c9dad8 100644
--- a/test/DebugInfo/COFF/asm.ll
+++ b/test/DebugInfo/COFF/asm.ll

@@ -13,21 +13,45 @@
 ;  6 }
 
 ; X86-LABEL: _f:
-; X86-NEXT: # BB
+; X86:      # BB
 ; X86-NEXT: [[ASM_LINE:^L.*]]:{{$}}
 ; X86:      [[CALL_LINE:^L.*]]:{{$}}
-; X86-NEXT: calll   _g
+; X86:      calll   _g
 ; X86-NEXT: [[RETURN_STMT:.*]]:
-; X86-NEXT: ret
-; X86-NEXT: [[END_OF_F:.*]]:
+; X86:      ret
+; X86-NEXT: L{{.*}}:
+; X86-NEXT: [[END_OF_F:^L.*]]:
 ;
-; X86-LABEL: .section        .debug$S,"rnd"
+; X86-LABEL: .section        .debug$S,"rd"
 ; X86-NEXT: .long   4
+; Symbol subsection
+; X86-NEXT: .long   241
+; X86-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X86-NEXT: [[F1_START]]:
+; X86-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X86-NEXT: [[PROC_SEGMENT_START]]:
+; X86-NEXT: .short  4423
+; X86-NEXT: .zero   12
+; X86-NEXT: .long [[END_OF_F]]-_f
+; X86-NEXT: .zero   12
+; X86-NEXT: .secrel32 _f
+; X86-NEXT: .secidx _f
+; X86-NEXT: .byte   0
+; X86-NEXT: .byte   102
+; X86-NEXT: .byte   0
+; X86-NEXT: [[PROC_SEGMENT_END]]:
+; X86-NEXT: .short  2
+; X86-NEXT: .short  4431
+; X86-NEXT: [[F1_END]]:
+; Padding
+; X86-NEXT: .zero   3
+; Line table
 ; X86-NEXT: .long   242
 ; X86-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X86-NEXT: [[F2_START]]:
 ; X86-NEXT: .secrel32 _f
 ; X86-NEXT: .secidx _f
+; X86-NEXT: .short 0
 ; X86-NEXT: .long [[END_OF_F]]-_f
 ; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X86-NEXT: .long   0
@@ -60,8 +84,20 @@
 ; OBJ32:      Characteristics [ (0x42100040)
 ; OBJ32:      ]
 ; OBJ32:      Relocations [
-; OBJ32-NEXT:   0xC IMAGE_REL_I386_SECREL _f
-; OBJ32-NEXT:   0x10 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT:   0x2C IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT:   0x30 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT:   0x44 IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT:   0x48 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT: ]
+; OBJ32:      Subsection [
+; OBJ32-NEXT:   Type: 0xF1
+; OBJ32-NOT:    ]
+; OBJ32:        ProcStart {
+; OBJ32-NEXT:     DisplayName: f
+; OBJ32-NEXT:     Section: _f
+; OBJ32-NEXT:     CodeSize: 0x6
+; OBJ32-NEXT:   }
+; OBJ32-NEXT:   ProcEnd
 ; OBJ32-NEXT: ]
 ; OBJ32:      FunctionLineTable [
 ; OBJ32-NEXT:   Name: _f
@@ -80,23 +116,47 @@
 
 ; X64-LABEL: f:
 ; X64-NEXT: [[START:.*]]:{{$}}
-; X64-NEXT: # BB
-; X64-NEXT: subq    $40, %rsp
+; X64:      # BB
+; X64:      subq    $40, %rsp
 ; X64-NEXT: [[ASM_LINE:.*]]:{{$}}
 ; X64:      [[CALL_LINE:.*]]:{{$}}
-; X64-NEXT: callq   g
+; X64:      callq   g
 ; X64-NEXT: [[EPILOG_AND_RET:.*]]:
-; X64-NEXT: addq    $40, %rsp
+; X64:      addq    $40, %rsp
 ; X64-NEXT: ret
+; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_F:.*]]:
 ;
-; X64-LABEL: .section        .debug$S,"rnd"
+; X64-LABEL: .section        .debug$S,"rd"
 ; X64-NEXT: .long   4
+; Symbol subsection
+; X64-NEXT: .long   241
+; X64-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X64-NEXT: [[F1_START]]:
+; X64-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X64-NEXT: [[PROC_SEGMENT_START]]:
+; X64-NEXT: .short  4423
+; X64-NEXT: .zero   12
+; X64-NEXT: .long [[END_OF_F]]-f
+; X64-NEXT: .zero   12
+; X64-NEXT: .secrel32 f
+; X64-NEXT: .secidx f
+; X64-NEXT: .byte   0
+; X64-NEXT: .byte   102
+; X64-NEXT: .byte   0
+; X64-NEXT: [[PROC_SEGMENT_END]]:
+; X64-NEXT: .short  2
+; X64-NEXT: .short  4431
+; X64-NEXT: [[F1_END]]:
+; Padding
+; X64-NEXT: .zero   3
+; Line table
 ; X64-NEXT: .long   242
 ; X64-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X64-NEXT: [[F2_START]]:
 ; X64-NEXT: .secrel32 f
 ; X64-NEXT: .secidx f
+; X64-NEXT: .short 0
 ; X64-NEXT: .long [[END_OF_F]]-f
 ; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X64-NEXT: .long   0
@@ -131,8 +191,20 @@
 ; OBJ64:      Characteristics [ (0x42100040)
 ; OBJ64:      ]
 ; OBJ64:      Relocations [
-; OBJ64-NEXT:   0xC IMAGE_REL_AMD64_SECREL f
-; OBJ64-NEXT:   0x10 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x2C IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x30 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x44 IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x48 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT: ]
+; OBJ64:      Subsection [
+; OBJ64-NEXT:   Type: 0xF1
+; OBJ64-NOT:    ]
+; OBJ64:        ProcStart {
+; OBJ64-NEXT:     DisplayName: f
+; OBJ64-NEXT:     Section: f
+; OBJ64-NEXT:     CodeSize: 0xE
+; OBJ64-NEXT:   }
+; OBJ64-NEXT:   ProcEnd
 ; OBJ64-NEXT: ]
 ; OBJ64:      FunctionLineTable [
 ; OBJ64-NEXT:   Name: f
@@ -167,17 +239,17 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
 !1 = metadata !{metadata !"<unknown>", metadata !"D:\5C"}
-!2 = metadata !{i32 0}
+!2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @f, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!4 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", metadata !5, metadata !6, metadata !7, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
 !5 = metadata !{metadata !"asm.c", metadata !"D:\5C"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [D:\/asm.c]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [D:\/asm.c]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5 "}
 !12 = metadata !{i32 4, i32 0, metadata !4, null}
 !13 = metadata !{i32 5, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/COFF/cpp-mangling.ll b/test/DebugInfo/COFF/cpp-mangling.ll
new file mode 100644
index 0000000..1ccf2f9
--- /dev/null
+++ b/test/DebugInfo/COFF/cpp-mangling.ll

@@ -0,0 +1,43 @@
+; RUN: llc -mcpu=core2 -mtriple=i686-pc-win32 -o - -O0 < %s | llvm-mc -triple=i686-pc-win32 -filetype=obj | llvm-readobj -s -sr -codeview-linetables | FileCheck %s
+
+; This LL file was generated by running clang on the following code:
+; D:\src.cpp:
+;  1 namespace foo {
+;  2 int bar(int x) {
+;  3   return x * 2;
+;  4 }
+;  5 }
+
+; CHECK:        ProcStart {
+; FIXME: The display name should in fact be "foo::bar", see PR21528
+; CHECK-NEXT:     DisplayName: ?bar@foo@@YAHH@Z
+; CHECK-NEXT:     Section: ?bar@foo@@YAHH@Z
+
+; Function Attrs: nounwind
+define i32 @"\01?bar@foo@@YAHH@Z"(i32 %x) #0 {
+entry:
+  %x.addr = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  %0 = load i32* %x.addr, align 4, !dbg !11
+  %mul = mul nsw i32 %0, 2, !dbg !11
+  ret i32 %mul, !dbg !11
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<stdin>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<stdin>", metadata !"D:\5C"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00bar\00bar\00\002\000\001\000\000\00256\000\002", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @"\01?bar@foo@@YAHH@Z", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!5 = metadata !{metadata !"src.cpp", metadata !"D:\5C"}
+!6 = metadata !{metadata !"0x29", metadata !5}    ; [ DW_TAG_file_type ] [D:\/src.cpp]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!10 = metadata !{metadata !"clang version 3.6.0 "}
+!11 = metadata !{i32 3, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/COFF/multifile.ll b/test/DebugInfo/COFF/multifile.ll
index c04bdb3..3bc1286 100644
--- a/test/DebugInfo/COFF/multifile.ll
+++ b/test/DebugInfo/COFF/multifile.ll

@@ -7,34 +7,58 @@
 ; D:\input.c:
 ;  1 void g(void);
 ;  2
-;  3 void f() {
+;  3 void f(void) {
 ;  4 #line 1 "one.c"
-;  5   g(void);
+;  5   g();
 ;  6 #line 2 "two.c"
-;  7   g(void);
+;  7   g();
 ;  8 #line 7 "one.c"
-;  9   g(void);
+;  9   g();
 ; 10 }
 
 ; X86-LABEL: _f:
-; X86-NEXT: # BB
+; X86:      # BB
 ; X86-NEXT: [[CALL_LINE_1:.*]]:{{$}}
-; X86-NEXT: calll   _g
+; X86:      calll   _g
 ; X86-NEXT: [[CALL_LINE_2:.*]]:{{$}}
-; X86-NEXT: calll   _g
+; X86:      calll   _g
 ; X86-NEXT: [[CALL_LINE_3:.*]]:{{$}}
-; X86-NEXT: calll   _g
+; X86:      calll   _g
 ; X86-NEXT: [[RETURN_STMT:.*]]:
-; X86-NEXT: ret
+; X86:      ret
+; X86-NEXT: L{{.*}}:
 ; X86-NEXT: [[END_OF_F:.*]]:
 ;
-; X86-LABEL: .section        .debug$S,"rnd"
+; X86-LABEL: .section        .debug$S,"rd"
 ; X86-NEXT: .long   4
+; Symbol subsection
+; X86-NEXT: .long   241
+; X86-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X86-NEXT: [[F1_START]]:
+; X86-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X86-NEXT: [[PROC_SEGMENT_START]]:
+; X86-NEXT: .short  4423
+; X86-NEXT: .zero   12
+; X86-NEXT: .long [[END_OF_F]]-_f
+; X86-NEXT: .zero   12
+; X86-NEXT: .secrel32 _f
+; X86-NEXT: .secidx _f
+; X86-NEXT: .byte   0
+; X86-NEXT: .byte   102
+; X86-NEXT: .byte   0
+; X86-NEXT: [[PROC_SEGMENT_END]]:
+; X86-NEXT: .short  2
+; X86-NEXT: .short  4431
+; X86-NEXT: [[F1_END]]:
+; Padding
+; X86-NEXT: .zero   3
+; Line table
 ; X86-NEXT: .long   242
 ; X86-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X86-NEXT: [[F2_START]]:
 ; X86-NEXT: .secrel32 _f
 ; X86-NEXT: .secidx _f
+; X86-NEXT: .short 0
 ; X86-NEXT: .long [[END_OF_F]]-_f
 ; Segment for file 'D:\\one.c' begins
 ; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
@@ -85,8 +109,20 @@
 ; OBJ32:      Characteristics [ (0x42100040)
 ; OBJ32:      ]
 ; OBJ32:      Relocations [
-; OBJ32-NEXT:   0xC IMAGE_REL_I386_SECREL _f
-; OBJ32-NEXT:   0x10 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT:   0x2C IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT:   0x30 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT:   0x44 IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT:   0x48 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT: ]
+; OBJ32:      Subsection [
+; OBJ32-NEXT:   Type: 0xF1
+; OBJ32-NOT:    ]
+; OBJ32:        ProcStart {
+; OBJ32-NEXT:     DisplayName: f
+; OBJ32-NEXT:     Section: _f
+; OBJ32-NEXT:     CodeSize: 0x10
+; OBJ32-NEXT:   }
+; OBJ32-NEXT:   ProcEnd
 ; OBJ32-NEXT: ]
 ; OBJ32:      FunctionLineTable [
 ; OBJ32-NEXT:   Name: _f
@@ -109,26 +145,50 @@
 
 ; X64-LABEL: f:
 ; X64-NEXT: [[START:.*]]:{{$}}
-; X64-NEXT: # BB
-; X64-NEXT: subq    $40, %rsp
+; X64:      # BB
+; X64:      subq    $40, %rsp
 ; X64-NEXT: [[CALL_LINE_1:.*]]:{{$}}
-; X64-NEXT: callq   g
+; X64:      callq   g
 ; X64-NEXT: [[CALL_LINE_2:.*]]:{{$}}
-; X64-NEXT: callq   g
+; X64:      callq   g
 ; X64-NEXT: [[CALL_LINE_3:.*]]:{{$}}
-; X64-NEXT: callq   g
+; X64:      callq   g
 ; X64-NEXT: [[EPILOG_AND_RET:.*]]:
-; X64-NEXT: addq    $40, %rsp
+; X64:      addq    $40, %rsp
 ; X64-NEXT: ret
+; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_F:.*]]:
 ;
-; X64-LABEL: .section        .debug$S,"rnd"
+; X64-LABEL: .section        .debug$S,"rd"
 ; X64-NEXT: .long   4
+; Symbol subsection
+; X64-NEXT: .long   241
+; X64-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X64-NEXT: [[F1_START]]:
+; X64-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X64-NEXT: [[PROC_SEGMENT_START]]:
+; X64-NEXT: .short  4423
+; X64-NEXT: .zero   12
+; X64-NEXT: .long [[END_OF_F]]-f
+; X64-NEXT: .zero   12
+; X64-NEXT: .secrel32 f
+; X64-NEXT: .secidx f
+; X64-NEXT: .byte   0
+; X64-NEXT: .byte   102
+; X64-NEXT: .byte   0
+; X64-NEXT: [[PROC_SEGMENT_END]]:
+; X64-NEXT: .short  2
+; X64-NEXT: .short  4431
+; X64-NEXT: [[F1_END]]:
+; Padding
+; X64-NEXT: .zero   3
+; Line table
 ; X64-NEXT: .long   242
 ; X64-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X64-NEXT: [[F2_START]]:
 ; X64-NEXT: .secrel32 f
 ; X64-NEXT: .secidx f
+; X64-NEXT: .short 0
 ; X64-NEXT: .long [[END_OF_F]]-f
 ; Segment for file 'D:\\input.c' begins
 ; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
@@ -191,8 +251,20 @@
 ; OBJ64:      Characteristics [ (0x42100040)
 ; OBJ64:      ]
 ; OBJ64:      Relocations [
-; OBJ64-NEXT:   0xC IMAGE_REL_AMD64_SECREL f
-; OBJ64-NEXT:   0x10 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x2C IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x30 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x44 IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x48 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT: ]
+; OBJ64:      Subsection [
+; OBJ64-NEXT:   Type: 0xF1
+; OBJ64-NOT:    ]
+; OBJ64:        ProcStart {
+; OBJ64-NEXT:     DisplayName: f
+; OBJ64-NEXT:     Section: f
+; OBJ64-NEXT:     CodeSize: 0x18
+; OBJ64-NEXT:   }
+; OBJ64-NEXT:   ProcEnd
 ; OBJ64-NEXT: ]
 ; OBJ64:      FunctionLineTable [
 ; OBJ64-NEXT:   Name: f
@@ -235,23 +307,23 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
 !1 = metadata !{metadata !"<unknown>", metadata !"D:\5C"}
-!2 = metadata !{i32 0}
+!2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @f, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!4 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", metadata !5, metadata !6, metadata !7, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
 !5 = metadata !{metadata !"input.c", metadata !"D:\5C"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [D:\/input.c]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [D:\/input.c]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5 "}
 !12 = metadata !{i32 1, i32 0, metadata !13, null}
-!13 = metadata !{i32 786443, metadata !14, metadata !4} ; [ DW_TAG_lexical_block ] [D:\/one.c]
+!13 = metadata !{metadata !"0xb\000", metadata !14, metadata !4} ; [ DW_TAG_lexical_block ] [D:\/one.c]
 !14 = metadata !{metadata !"one.c", metadata !"D:\5C"}
 !15 = metadata !{i32 2, i32 0, metadata !16, null}
-!16 = metadata !{i32 786443, metadata !17, metadata !4} ; [ DW_TAG_lexical_block ] [D:\/two.c]
+!16 = metadata !{metadata !"0xb\000", metadata !17, metadata !4} ; [ DW_TAG_lexical_block ] [D:\/two.c]
 !17 = metadata !{metadata !"two.c", metadata !"D:\5C"}
 !18 = metadata !{i32 7, i32 0, metadata !13, null}
-!19 = metadata !{i32 8, i32 0, metadata !13, null} ; [ DW_TAG_imported_declaration ]
+!19 = metadata !{i32 8, i32 0, metadata !13, null}

diff --git a/test/DebugInfo/COFF/multifunction.ll b/test/DebugInfo/COFF/multifunction.ll
index 5a65558..4d4f506 100644
--- a/test/DebugInfo/COFF/multifunction.ll
+++ b/test/DebugInfo/COFF/multifunction.ll

@@ -23,41 +23,66 @@
 
 
 ; X86-LABEL: _x:
-; X86-NEXT: # BB
+; X86:      # BB
 ; X86-NEXT: [[X_CALL:.*]]:{{$}}
-; X86-NEXT: calll   _z
+; X86:      calll   _z
 ; X86-NEXT: [[X_RETURN:.*]]:
-; X86-NEXT: ret
+; X86:      ret
+; X86-NEXT: L{{.*}}:
 ; X86-NEXT: [[END_OF_X:.*]]:
 ;
 ; X86-LABEL: _y:
-; X86-NEXT: # BB
+; X86:      # BB
 ; X86-NEXT: [[Y_CALL:.*]]:{{$}}
-; X86-NEXT: calll   _z
+; X86:      calll   _z
 ; X86-NEXT: [[Y_RETURN:.*]]:
-; X86-NEXT: ret
+; X86:      ret
+; X86-NEXT: L{{.*}}:
 ; X86-NEXT: [[END_OF_Y:.*]]:
 ;
 ; X86-LABEL: _f:
-; X86-NEXT: # BB
+; X86:      # BB
 ; X86-NEXT: [[F_CALLS_X:.*]]:{{$}}
-; X86-NEXT: calll   _x
+; X86:      calll   _x
 ; X86-NEXT: [[F_CALLS_Y:.*]]:
-; X86-NEXT: calll   _y
+; X86:      calll   _y
 ; X86-NEXT: [[F_CALLS_Z:.*]]:
-; X86-NEXT: calll   _z
+; X86:      calll   _z
 ; X86-NEXT: [[F_RETURN:.*]]:
-; X86-NEXT: ret
+; X86:      ret
+; X86-NEXT: L{{.*}}:
 ; X86-NEXT: [[END_OF_F:.*]]:
 ;
-; X86-LABEL: .section        .debug$S,"rnd"
+; X86-LABEL: .section        .debug$S,"rd"
 ; X86-NEXT: .long   4
+; Symbol subsection for x
+; X86-NEXT: .long   241
+; X86-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X86-NEXT: [[F1_START]]:
+; X86-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X86-NEXT: [[PROC_SEGMENT_START]]:
+; X86-NEXT: .short  4423
+; X86-NEXT: .zero   12
+; X86-NEXT: .long [[END_OF_X]]-_x
+; X86-NEXT: .zero   12
+; X86-NEXT: .secrel32 _x
+; X86-NEXT: .secidx _x
+; X86-NEXT: .byte   0
+; X86-NEXT: .byte   120
+; X86-NEXT: .byte   0
+; X86-NEXT: [[PROC_SEGMENT_END]]:
+; X86-NEXT: .short  2
+; X86-NEXT: .short  4431
+; X86-NEXT: [[F1_END]]:
+; Padding
+; X86-NEXT: .zero   3
 ; Line table subsection for x
 ; X86-NEXT: .long   242
 ; X86-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X86-NEXT: [[F2_START]]:
 ; X86-NEXT: .secrel32       _x
 ; X86-NEXT: .secidx _x
+; X86-NEXT: .short 0
 ; X86-NEXT: .long [[END_OF_X]]-_x
 ; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X86-NEXT: .long   0
@@ -69,12 +94,34 @@
 ; X86-NEXT: .long   5
 ; X86-NEXT: [[FILE_SEGMENT_END]]:
 ; X86-NEXT: [[F2_END]]:
+; Symbol subsection for y
+; X86-NEXT: .long   241
+; X86-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X86-NEXT: [[F1_START]]:
+; X86-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X86-NEXT: [[PROC_SEGMENT_START]]:
+; X86-NEXT: .short  4423
+; X86-NEXT: .zero   12
+; X86-NEXT: .long [[END_OF_Y]]-_y
+; X86-NEXT: .zero   12
+; X86-NEXT: .secrel32 _y
+; X86-NEXT: .secidx _y
+; X86-NEXT: .byte   0
+; X86-NEXT: .byte   121
+; X86-NEXT: .byte   0
+; X86-NEXT: [[PROC_SEGMENT_END]]:
+; X86-NEXT: .short  2
+; X86-NEXT: .short  4431
+; X86-NEXT: [[F1_END]]:
+; Padding
+; X86-NEXT: .zero   3
 ; Line table subsection for y
 ; X86-NEXT: .long   242
 ; X86-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X86-NEXT: [[F2_START]]:
 ; X86-NEXT: .secrel32       _y
 ; X86-NEXT: .secidx _y
+; X86-NEXT: .short 0
 ; X86-NEXT: .long [[END_OF_Y]]-_y
 ; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X86-NEXT: .long   0
@@ -86,12 +133,34 @@
 ; X86-NEXT: .long   9
 ; X86-NEXT: [[FILE_SEGMENT_END]]:
 ; X86-NEXT: [[F2_END]]:
+; Symbol subsection for f
+; X86-NEXT: .long   241
+; X86-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X86-NEXT: [[F1_START]]:
+; X86-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X86-NEXT: [[PROC_SEGMENT_START]]:
+; X86-NEXT: .short  4423
+; X86-NEXT: .zero   12
+; X86-NEXT: .long [[END_OF_F]]-_f
+; X86-NEXT: .zero   12
+; X86-NEXT: .secrel32 _f
+; X86-NEXT: .secidx _f
+; X86-NEXT: .byte   0
+; X86-NEXT: .byte   102
+; X86-NEXT: .byte   0
+; X86-NEXT: [[PROC_SEGMENT_END]]:
+; X86-NEXT: .short  2
+; X86-NEXT: .short  4431
+; X86-NEXT: [[F1_END]]:
+; Padding
+; X86-NEXT: .zero   3
 ; Line table subsection for f
 ; X86-NEXT: .long   242
 ; X86-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X86-NEXT: [[F2_START]]:
 ; X86-NEXT: .secrel32 _f
 ; X86-NEXT: .secidx _f
+; X86-NEXT: .short 0
 ; X86-NEXT: .long [[END_OF_F]]-_f
 ; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X86-NEXT: .long   0
@@ -125,13 +194,58 @@
 ; OBJ32:      Characteristics [ (0x42100040)
 ; OBJ32:      ]
 ; OBJ32:      Relocations [
-; OBJ32-NEXT:   0xC IMAGE_REL_I386_SECREL _x
-; OBJ32-NEXT:   0x10 IMAGE_REL_I386_SECTION _x
-; OBJ32-NEXT:   0x3C IMAGE_REL_I386_SECREL _y
-; OBJ32-NEXT:   0x40 IMAGE_REL_I386_SECTION _y
-; OBJ32-NEXT:   0x6C IMAGE_REL_I386_SECREL _f
-; OBJ32-NEXT:   0x70 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT:   0x2C IMAGE_REL_I386_SECREL _x
+; OBJ32-NEXT:   0x30 IMAGE_REL_I386_SECTION _x
+; OBJ32-NEXT:   0x44 IMAGE_REL_I386_SECREL _x
+; OBJ32-NEXT:   0x48 IMAGE_REL_I386_SECTION _x
+; OBJ32-NEXT:   0x94 IMAGE_REL_I386_SECREL _y
+; OBJ32-NEXT:   0x98 IMAGE_REL_I386_SECTION _y
+; OBJ32-NEXT:   0xAC IMAGE_REL_I386_SECREL _y
+; OBJ32-NEXT:   0xB0 IMAGE_REL_I386_SECTION _y
+; OBJ32-NEXT:   0xFC IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT:   0x100 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT:   0x114 IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT:   0x118 IMAGE_REL_I386_SECTION _f
 ; OBJ32-NEXT: ]
+; OBJ32:      Subsection [
+; OBJ32-NEXT:   Type: 0xF1
+; OBJ32-NOT:    ]
+; OBJ32:        ProcStart {
+; OBJ32-NEXT:     DisplayName: x
+; OBJ32-NEXT:     Section: _x
+; OBJ32-NEXT:     CodeSize: 0x6
+; OBJ32-NEXT:   }
+; OBJ32-NEXT:   ProcEnd
+; OBJ32-NEXT: ]
+; OBJ32:      Subsection [
+; OBJ32-NEXT:   Type: 0xF2
+; OBJ32:      ]
+; OBJ32:      Subsection [
+; OBJ32-NEXT:   Type: 0xF1
+; OBJ32-NOT:    ]
+; OBJ32:        ProcStart {
+; OBJ32-NEXT:     DisplayName: y
+; OBJ32-NEXT:     Section: _y
+; OBJ32-NEXT:     CodeSize: 0x6
+; OBJ32-NEXT:   }
+; OBJ32-NEXT:   ProcEnd
+; OBJ32-NEXT: ]
+; OBJ32:      Subsection [
+; OBJ32-NEXT:   Type: 0xF2
+; OBJ32:      ]
+; OBJ32:      Subsection [
+; OBJ32-NEXT:   Type: 0xF1
+; OBJ32-NOT:    ]
+; OBJ32:        ProcStart {
+; OBJ32-NEXT:     DisplayName: f
+; OBJ32-NEXT:     Section: _f
+; OBJ32-NEXT:     CodeSize: 0x10
+; OBJ32-NEXT:   }
+; OBJ32-NEXT:   ProcEnd
+; OBJ32-NEXT: ]
+; OBJ32:      Subsection [
+; OBJ32-NEXT:   Type: 0xF2
+; OBJ32:      ]
 ; OBJ32:      FunctionLineTable [
 ; OBJ32-NEXT:   Name: _x
 ; OBJ32-NEXT:   CodeSize: 0x6
@@ -165,49 +279,74 @@
 
 ; X64-LABEL: x:
 ; X64-NEXT: [[X_START:.*]]:{{$}}
-; X64-NEXT: # BB
-; X64-NEXT: subq    $40, %rsp
+; X64:      # BB
+; X64:      subq    $40, %rsp
 ; X64-NEXT: [[X_CALL_LINE:.*]]:{{$}}
 ; X64-NEXT: callq   z
 ; X64-NEXT: [[X_EPILOG_AND_RET:.*]]:
-; X64-NEXT: addq    $40, %rsp
+; X64:      addq    $40, %rsp
 ; X64-NEXT: ret
+; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_X:.*]]:
 ;
 ; X64-LABEL: y:
 ; X64-NEXT: [[Y_START:.*]]:{{$}}
-; X64-NEXT: # BB
-; X64-NEXT: subq    $40, %rsp
+; X64:      # BB
+; X64:      subq    $40, %rsp
 ; X64-NEXT: [[Y_CALL_LINE:.*]]:{{$}}
 ; X64-NEXT: callq   z
 ; X64-NEXT: [[Y_EPILOG_AND_RET:.*]]:
-; X64-NEXT: addq    $40, %rsp
+; X64:      addq    $40, %rsp
 ; X64-NEXT: ret
+; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_Y:.*]]:
 ;
 ; X64-LABEL: f:
 ; X64-NEXT: [[F_START:.*]]:{{$}}
-; X64-NEXT: # BB
-; X64-NEXT: subq    $40, %rsp
+; X64:      # BB
+; X64:      subq    $40, %rsp
 ; X64-NEXT: [[F_CALLS_X:.*]]:{{$}}
 ; X64-NEXT: callq   x
 ; X64-NEXT: [[F_CALLS_Y:.*]]:
-; X64-NEXT: callq   y
+; X64:      callq   y
 ; X64-NEXT: [[F_CALLS_Z:.*]]:
-; X64-NEXT: callq   z
+; X64:      callq   z
 ; X64-NEXT: [[F_EPILOG_AND_RET:.*]]:
-; X64-NEXT: addq    $40, %rsp
+; X64:      addq    $40, %rsp
 ; X64-NEXT: ret
+; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_F:.*]]:
 ;
-; X64-LABEL: .section        .debug$S,"rnd"
+; X64-LABEL: .section        .debug$S,"rd"
 ; X64-NEXT: .long   4
+; Symbol subsection for x
+; X64-NEXT: .long   241
+; X64-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X64-NEXT: [[F1_START]]:
+; X64-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X64-NEXT: [[PROC_SEGMENT_START]]:
+; X64-NEXT: .short  4423
+; X64-NEXT: .zero   12
+; X64-NEXT: .long [[END_OF_X]]-x
+; X64-NEXT: .zero   12
+; X64-NEXT: .secrel32 x
+; X64-NEXT: .secidx x
+; X64-NEXT: .byte   0
+; X64-NEXT: .byte   120
+; X64-NEXT: .byte   0
+; X64-NEXT: [[PROC_SEGMENT_END]]:
+; X64-NEXT: .short  2
+; X64-NEXT: .short  4431
+; X64-NEXT: [[F1_END]]:
+; Padding
+; X64-NEXT: .zero   3
 ; Line table subsection for x
 ; X64-NEXT: .long   242
 ; X64-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X64-NEXT: [[F2_START]]:
 ; X64-NEXT: .secrel32 x
 ; X64-NEXT: .secidx x
+; X64-NEXT: .short 0
 ; X64-NEXT: .long [[END_OF_X]]-x
 ; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X64-NEXT: .long   0
@@ -221,12 +360,34 @@
 ; X64-NEXT: .long   5
 ; X64-NEXT: [[FILE_SEGMENT_END]]:
 ; X64-NEXT: [[F2_END]]:
+; Symbol subsection for y
+; X64-NEXT: .long   241
+; X64-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X64-NEXT: [[F1_START]]:
+; X64-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X64-NEXT: [[PROC_SEGMENT_START]]:
+; X64-NEXT: .short  4423
+; X64-NEXT: .zero   12
+; X64-NEXT: .long [[END_OF_Y]]-y
+; X64-NEXT: .zero   12
+; X64-NEXT: .secrel32 y
+; X64-NEXT: .secidx y
+; X64-NEXT: .byte   0
+; X64-NEXT: .byte   121
+; X64-NEXT: .byte   0
+; X64-NEXT: [[PROC_SEGMENT_END]]:
+; X64-NEXT: .short  2
+; X64-NEXT: .short  4431
+; X64-NEXT: [[F1_END]]:
+; Padding
+; X64-NEXT: .zero   3
 ; Line table subsection for y
 ; X64-NEXT: .long   242
 ; X64-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X64-NEXT: [[F2_START]]:
 ; X64-NEXT: .secrel32 y
 ; X64-NEXT: .secidx y
+; X64-NEXT: .short 0
 ; X64-NEXT: .long [[END_OF_Y]]-y
 ; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X64-NEXT: .long   0
@@ -240,12 +401,34 @@
 ; X64-NEXT: .long   9
 ; X64-NEXT: [[FILE_SEGMENT_END]]:
 ; X64-NEXT: [[F2_END]]:
+; Symbol subsection for f
+; X64-NEXT: .long   241
+; X64-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X64-NEXT: [[F1_START]]:
+; X64-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X64-NEXT: [[PROC_SEGMENT_START]]:
+; X64-NEXT: .short  4423
+; X64-NEXT: .zero   12
+; X64-NEXT: .long [[END_OF_F]]-f
+; X64-NEXT: .zero   12
+; X64-NEXT: .secrel32 f
+; X64-NEXT: .secidx f
+; X64-NEXT: .byte   0
+; X64-NEXT: .byte   102
+; X64-NEXT: .byte   0
+; X64-NEXT: [[PROC_SEGMENT_END]]:
+; X64-NEXT: .short  2
+; X64-NEXT: .short  4431
+; X64-NEXT: [[F1_END]]:
+; Padding
+; X64-NEXT: .zero   3
 ; Line table subsection for f
 ; X64-NEXT: .long   242
 ; X64-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X64-NEXT: [[F2_START]]:
 ; X64-NEXT: .secrel32 f
 ; X64-NEXT: .secidx f
+; X64-NEXT: .short 0
 ; X64-NEXT: .long [[END_OF_F]]-f
 ; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X64-NEXT: .long   0
@@ -281,13 +464,58 @@
 ; OBJ64:      Characteristics [ (0x42100040)
 ; OBJ64:      ]
 ; OBJ64:      Relocations [
-; OBJ64-NEXT:   0xC IMAGE_REL_AMD64_SECREL x
-; OBJ64-NEXT:   0x10 IMAGE_REL_AMD64_SECTION x
-; OBJ64-NEXT:   0x44 IMAGE_REL_AMD64_SECREL y
-; OBJ64-NEXT:   0x48 IMAGE_REL_AMD64_SECTION y
-; OBJ64-NEXT:   0x7C IMAGE_REL_AMD64_SECREL f
-; OBJ64-NEXT:   0x80 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x2C IMAGE_REL_AMD64_SECREL x
+; OBJ64-NEXT:   0x30 IMAGE_REL_AMD64_SECTION x
+; OBJ64-NEXT:   0x44 IMAGE_REL_AMD64_SECREL x
+; OBJ64-NEXT:   0x48 IMAGE_REL_AMD64_SECTION x
+; OBJ64-NEXT:   0x9C IMAGE_REL_AMD64_SECREL y
+; OBJ64-NEXT:   0xA0 IMAGE_REL_AMD64_SECTION y
+; OBJ64-NEXT:   0xB4 IMAGE_REL_AMD64_SECREL y
+; OBJ64-NEXT:   0xB8 IMAGE_REL_AMD64_SECTION y
+; OBJ64-NEXT:   0x10C IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x110 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x124 IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x128 IMAGE_REL_AMD64_SECTION f
 ; OBJ64-NEXT: ]
+; OBJ64:      Subsection [
+; OBJ64-NEXT:   Type: 0xF1
+; OBJ64-NOT:    ]
+; OBJ64:        ProcStart {
+; OBJ64-NEXT:     DisplayName: x
+; OBJ64-NEXT:     Section: x
+; OBJ64-NEXT:     CodeSize: 0xE
+; OBJ64-NEXT:   }
+; OBJ64-NEXT:   ProcEnd
+; OBJ64-NEXT: ]
+; OBJ64:      Subsection [
+; OBJ64-NEXT:   Type: 0xF2
+; OBJ64:      ]
+; OBJ64:      Subsection [
+; OBJ64-NEXT:   Type: 0xF1
+; OBJ64-NOT:    ]
+; OBJ64:        ProcStart {
+; OBJ64-NEXT:     DisplayName: y
+; OBJ64-NEXT:     Section: y
+; OBJ64-NEXT:     CodeSize: 0xE
+; OBJ64-NEXT:   }
+; OBJ64-NEXT:   ProcEnd
+; OBJ64-NEXT: ]
+; OBJ64:      Subsection [
+; OBJ64-NEXT:   Type: 0xF2
+; OBJ64:      ]
+; OBJ64:      Subsection [
+; OBJ64-NEXT:   Type: 0xF1
+; OBJ64-NOT:    ]
+; OBJ64:        ProcStart {
+; OBJ64-NEXT:     DisplayName: f
+; OBJ64-NEXT:     Section: f
+; OBJ64-NEXT:     CodeSize: 0x18
+; OBJ64-NEXT:   }
+; OBJ64-NEXT:   ProcEnd
+; OBJ64-NEXT: ]
+; OBJ64:      Subsection [
+; OBJ64-NEXT:   Type: 0xF2
+; OBJ64:      ]
 ; OBJ64:      FunctionLineTable [
 ; OBJ64-NEXT:   Name: x
 ; OBJ64-NEXT:   CodeSize: 0xE
@@ -354,23 +582,23 @@
 !llvm.module.flags = !{!11, !12}
 !llvm.ident = !{!13}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
 !1 = metadata !{metadata !"<unknown>", metadata !"D:\5C"}
-!2 = metadata !{i32 0}
+!2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !9, metadata !10}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"x", metadata !"x", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @x, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [x]
+!4 = metadata !{metadata !"0x2e\00x\00x\00\003\000\001\000\006\00256\000\003", metadata !5, metadata !6, metadata !7, null, void ()* @x, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [x]
 !5 = metadata !{metadata !"source.c", metadata !"D:\5C"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [D:\/source.c]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [D:\/source.c]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
-!9 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"y", metadata !"y", metadata !"", i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @y, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [y]
-!10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"f", metadata !"f", metadata !"", i32 11, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @f, null, null, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [f]
+!9 = metadata !{metadata !"0x2e\00y\00y\00\007\000\001\000\006\00256\000\007", metadata !5, metadata !6, metadata !7, null, void ()* @y, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [y]
+!10 = metadata !{metadata !"0x2e\00f\00f\00\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !6, metadata !7, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [f]
 !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !13 = metadata !{metadata !"clang version 3.5 "}
 !14 = metadata !{i32 4, i32 0, metadata !4, null}
 !15 = metadata !{i32 5, i32 0, metadata !4, null}
-!16 = metadata !{i32 8, i32 0, metadata !9, null} ; [ DW_TAG_imported_declaration ]
+!16 = metadata !{i32 8, i32 0, metadata !9, null}
 !17 = metadata !{i32 9, i32 0, metadata !9, null}
 !18 = metadata !{i32 12, i32 0, metadata !10, null}
 !19 = metadata !{i32 13, i32 0, metadata !10, null}

diff --git a/test/DebugInfo/COFF/simple.ll b/test/DebugInfo/COFF/simple.ll
index 2613a18..00f1829 100644
--- a/test/DebugInfo/COFF/simple.ll
+++ b/test/DebugInfo/COFF/simple.ll

@@ -12,20 +12,44 @@
 ; 5 }
 
 ; X86-LABEL: _f:
-; X86-NEXT: # BB
+; X86:      # BB
 ; X86-NEXT: [[CALL_LINE:^L.*]]:{{$}}
-; X86-NEXT: calll   _g
+; X86:      calll   _g
 ; X86-NEXT: [[RETURN_STMT:.*]]:
-; X86-NEXT: ret
+; X86:      ret
+; X86-NEXT: L{{.*}}:
 ; X86-NEXT: [[END_OF_F:.*]]:
 ;
-; X86-LABEL: .section        .debug$S,"rnd"
+; X86-LABEL: .section        .debug$S,"rd"
 ; X86-NEXT: .long   4
+; Symbol subsection
+; X86-NEXT: .long   241
+; X86-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X86-NEXT: [[F1_START]]:
+; X86-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X86-NEXT: [[PROC_SEGMENT_START]]:
+; X86-NEXT: .short  4423
+; X86-NEXT: .zero   12
+; X86-NEXT: .long [[END_OF_F]]-_f
+; X86-NEXT: .zero   12
+; X86-NEXT: .secrel32 _f
+; X86-NEXT: .secidx _f
+; X86-NEXT: .byte   0
+; X86-NEXT: .byte   102
+; X86-NEXT: .byte   0
+; X86-NEXT: [[PROC_SEGMENT_END]]:
+; X86-NEXT: .short  2
+; X86-NEXT: .short  4431
+; X86-NEXT: [[F1_END]]:
+; Padding
+; X86-NEXT: .zero   3
+; Line table
 ; X86-NEXT: .long   242
 ; X86-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X86-NEXT: [[F2_START]]:
 ; X86-NEXT: .secrel32 _f
 ; X86-NEXT: .secidx _f
+; X86-NEXT: .short  0
 ; X86-NEXT: .long [[END_OF_F]]-_f
 ; X86-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X86-NEXT: .long   0
@@ -56,8 +80,20 @@
 ; OBJ32:      Characteristics [ (0x42100040)
 ; OBJ32:      ]
 ; OBJ32:      Relocations [
-; OBJ32-NEXT:   0xC IMAGE_REL_I386_SECREL _f
-; OBJ32-NEXT:   0x10 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT:   0x2C IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT:   0x30 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT:   0x44 IMAGE_REL_I386_SECREL _f
+; OBJ32-NEXT:   0x48 IMAGE_REL_I386_SECTION _f
+; OBJ32-NEXT: ]
+; OBJ32:      Subsection [
+; OBJ32-NEXT:   Type: 0xF1
+; OBJ32-NOT:    ]
+; OBJ32:        ProcStart {
+; OBJ32-NEXT:     DisplayName: f
+; OBJ32-NEXT:     Section: _f
+; OBJ32-NEXT:     CodeSize: 0x6
+; OBJ32-NEXT:   }
+; OBJ32-NEXT:   ProcEnd
 ; OBJ32-NEXT: ]
 ; OBJ32:      FunctionLineTable [
 ; OBJ32-NEXT:   Name: _f
@@ -72,22 +108,46 @@
 
 ; X64-LABEL: f:
 ; X64-NEXT: [[START:.*]]:{{$}}
-; X64-NEXT: # BB
-; X64-NEXT: subq    $40, %rsp
+; X64:      # BB
+; X64:      subq    $40, %rsp
 ; X64-NEXT: [[CALL_LINE:.*]]:{{$}}
 ; X64-NEXT: callq   g
 ; X64-NEXT: [[EPILOG_AND_RET:.*]]:
-; X64-NEXT: addq    $40, %rsp
+; X64:      addq    $40, %rsp
 ; X64-NEXT: ret
+; X64-NEXT: .L{{.*}}:
 ; X64-NEXT: [[END_OF_F:.*]]:
 ;
-; X64-LABEL: .section        .debug$S,"rnd"
+; X64-LABEL: .section        .debug$S,"rd"
 ; X64-NEXT: .long   4
+; Symbol subsection
+; X64-NEXT: .long   241
+; X64-NEXT: .long [[F1_END:.*]]-[[F1_START:.*]]
+; X64-NEXT: [[F1_START]]:
+; X64-NEXT: .short [[PROC_SEGMENT_END:.*]]-[[PROC_SEGMENT_START:.*]]
+; X64-NEXT: [[PROC_SEGMENT_START]]:
+; X64-NEXT: .short  4423
+; X64-NEXT: .zero   12
+; X64-NEXT: .long [[END_OF_F]]-f
+; X64-NEXT: .zero   12
+; X64-NEXT: .secrel32 f
+; X64-NEXT: .secidx f
+; X64-NEXT: .byte   0
+; X64-NEXT: .byte   102
+; X64-NEXT: .byte   0
+; X64-NEXT: [[PROC_SEGMENT_END]]:
+; X64-NEXT: .short  2
+; X64-NEXT: .short  4431
+; X64-NEXT: [[F1_END]]:
+; Padding
+; X64-NEXT: .zero   3
+; Line table
 ; X64-NEXT: .long   242
 ; X64-NEXT: .long [[F2_END:.*]]-[[F2_START:.*]]
 ; X64-NEXT: [[F2_START]]:
 ; X64-NEXT: .secrel32 f
 ; X64-NEXT: .secidx f
+; X64-NEXT: .short  0
 ; X64-NEXT: .long [[END_OF_F]]-f
 ; X64-NEXT: [[FILE_SEGMENT_START:[^:]*]]:
 ; X64-NEXT: .long   0
@@ -120,8 +180,20 @@
 ; OBJ64:      Characteristics [ (0x42100040)
 ; OBJ64:      ]
 ; OBJ64:      Relocations [
-; OBJ64-NEXT:   0xC IMAGE_REL_AMD64_SECREL f
-; OBJ64-NEXT:   0x10 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x2C IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x30 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT:   0x44 IMAGE_REL_AMD64_SECREL f
+; OBJ64-NEXT:   0x48 IMAGE_REL_AMD64_SECTION f
+; OBJ64-NEXT: ]
+; OBJ64:      Subsection [
+; OBJ64-NEXT:   Type: 0xF1
+; OBJ64-NOT:    ]
+; OBJ64:        ProcStart {
+; OBJ64-NEXT:     DisplayName: f
+; OBJ64-NEXT:     Section: f
+; OBJ64-NEXT:     CodeSize: 0xE
+; OBJ64-NEXT:   }
+; OBJ64-NEXT:   ProcEnd
 ; OBJ64-NEXT: ]
 ; OBJ64:      FunctionLineTable [
 ; OBJ64-NEXT:   Name: f
@@ -151,17 +223,17 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/<unknown>] [DW_LANG_C99]
 !1 = metadata !{metadata !"<unknown>", metadata !"D:\5C"}
-!2 = metadata !{i32 0}
+!2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @f, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!4 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", metadata !5, metadata !6, metadata !7, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
 !5 = metadata !{metadata !"test.c", metadata !"D:\5C"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [D:\/test.c]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [D:\/test.c]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5 "}
 !12 = metadata !{i32 4, i32 0, metadata !4, null}
 !13 = metadata !{i32 5, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/COFF/tail-call-without-lexical-scopes.ll b/test/DebugInfo/COFF/tail-call-without-lexical-scopes.ll
index 4d2e427..8db2fd0 100644
--- a/test/DebugInfo/COFF/tail-call-without-lexical-scopes.ll
+++ b/test/DebugInfo/COFF/tail-call-without-lexical-scopes.ll

@@ -18,11 +18,11 @@
 ; X86-LABEL: {{^}}"?bar@@YAXHZZ":
 ; X86-NEXT: # BB
 ; X86-NEXT: [[JMP_LINE:^L.*]]:{{$}}
-; X86-NEXT: jmp "?foo@@YAXXZ"
+; X86:      jmp "?foo@@YAXXZ"
 ; X86-NEXT: [[END_OF_BAR:^L.*]]:{{$}}
 ; X86-NOT:  ret
 
-; X86-LABEL: .section        .debug$S,"rnd"
+; X86-LABEL: .section        .debug$S,"rd"
 ; X86:       .secrel32 "?bar@@YAXHZZ"
 ; X86-NEXT:  .secidx   "?bar@@YAXHZZ"
 ; X86:       .long   0
@@ -61,18 +61,18 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [D:\/test.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\/test.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"test.cpp", metadata !"D:\5C"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"spam", metadata !"spam", metadata !"", i32 7, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @"\01?spam@@YAXXZ", null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [spam]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [D:\/test.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 3, metadata !6, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [bar]
+!4 = metadata !{metadata !"0x2e\00spam\00spam\00\007\000\001\000\006\00256\001\007", metadata !1, metadata !5, metadata !6, null, void ()* @"\01?spam@@YAXXZ", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [spam]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [D:\/test.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00bar\00bar\00\003\001\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [bar]
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5.0 "}
-!11 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!11 = metadata !{i32 8, i32 0, metadata !4, null}
 !12 = metadata !{i32 9, i32 0, metadata !4, null}
 !13 = metadata !{i32 4, i32 0, metadata !7, null}
 !14 = metadata !{i32 5, i32 0, metadata !7, null}

diff --git a/test/DebugInfo/Inputs/cross-cu-inlining.c b/test/DebugInfo/Inputs/cross-cu-inlining.c
new file mode 100644
index 0000000..0553581
--- /dev/null
+++ b/test/DebugInfo/Inputs/cross-cu-inlining.c

@@ -0,0 +1,18 @@
+// To generate the test file:
+// clang cross-cu-inlining.c -DA_C -g -emit-llvm -S -o a.ll
+// clang cross-cu-inlining.c -DB_C -g -emit-llvm -S -o b.ll
+// llvm-link a.ll b.ll -o ab.bc
+// opt -inline ab.bc -o cross-cu-inlining.bc
+// clang -c cross-cu-inlining.bc -o cross-cu-inlining.o
+#ifdef A_C
+int i;
+int func(int);
+int main() {
+  return func(i);
+}
+#endif
+#ifdef B_C
+int __attribute__((always_inline)) func(int x) {
+  return x * 2;
+}
+#endif

diff --git a/test/DebugInfo/Inputs/cross-cu-inlining.x86_64-macho.o b/test/DebugInfo/Inputs/cross-cu-inlining.x86_64-macho.o
new file mode 100644
index 0000000..052d4c9
--- /dev/null
+++ b/test/DebugInfo/Inputs/cross-cu-inlining.x86_64-macho.o
Binary files differ

diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64
index 6df03da..decc72b 100755
--- a/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64
+++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64
Binary files differ

diff --git a/test/DebugInfo/Inputs/dwarfdump-objc.m b/test/DebugInfo/Inputs/dwarfdump-objc.m
new file mode 100644
index 0000000..54fbee2
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-objc.m

@@ -0,0 +1,16 @@
+// Compile with clang -g dwarfdump-objc.m -c -Wno-objc-root-class
+
+@interface NSObject {} @end
+
+
+@interface TestInterface
+@property (readonly) int ReadOnly;
+@property (assign) int Assign;
+@property (readwrite) int ReadWrite;
+@property (retain) NSObject *Retain;
+@property (copy) NSObject *Copy;
+@property (nonatomic) int NonAtomic;
+@end
+
+@implementation TestInterface
+@end

diff --git a/test/DebugInfo/Inputs/dwarfdump-objc.x86_64.o b/test/DebugInfo/Inputs/dwarfdump-objc.x86_64.o
new file mode 100644
index 0000000..6b55d38
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-objc.x86_64.o
Binary files differ

diff --git a/test/DebugInfo/Inputs/gmlt.ll b/test/DebugInfo/Inputs/gmlt.ll
new file mode 100644
index 0000000..ba8d113
--- /dev/null
+++ b/test/DebugInfo/Inputs/gmlt.ll

@@ -0,0 +1,153 @@
+; REQUIRES: object-emission
+; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s
+
+; Generated from the following source compiled with clang++ -gmlt:
+; void f1() {}
+; void __attribute__((section("__TEXT,__bar"))) f2() {}
+; void __attribute__((always_inline)) f3() { f1(); }
+; void f4() { f3(); }
+
+; Check that
+;  * -gmlt includes no DW_TAG_subprograms for subprograms without inlined
+;    subroutines.
+;  * yet still produces DW_AT_ranges and a range list in debug_ranges that
+;    describes those subprograms
+
+; CHECK: DW_TAG_compile_unit
+; CHECK:   DW_AT_ranges [DW_FORM_sec_offset] (0x00000000
+; CHECK-NOT: {{DW_TAG|NULL}}
+
+; Omitting the subprograms without inlined subroutines is not possible
+; currently on Darwin as dsymutil will drop the whole CU if it has no subprograms
+; (which happens with this optimization if there are no inlined subroutines).
+
+; DARWIN:  DW_TAG_subprogram
+; DARWIN-NOT: DW_TAG
+; DARWIN:    DW_AT_name {{.*}} "f1"
+; DARWIN-NOT: {{DW_TAG|NULL}}
+; DARWIN:  DW_TAG_subprogram
+; DARWIN-NOT: DW_TAG
+; DARWIN:    DW_AT_name {{.*}} "f2"
+; DARWIN-NOT: {{DW_TAG|NULL}}
+; DARWIN:  DW_TAG_subprogram
+; DARWIN-NOT: DW_TAG
+; Can't check the abstract_origin value across the DARWIN/CHECK checking and
+; ordering, so don't bother - just trust me, it refers to f3 down there.
+; DARWIN:    DW_AT_abstract_origin
+; DARWIN-NOT: {{DW_TAG|NULL}}
+
+
+; FIXME: Emitting separate abstract definitions is inefficient when we could
+; just attach the DW_AT_name to the inlined_subroutine directly. Except that
+; would produce many string relocations. Implement string indexing in the
+; skeleton CU to address the relocation problem, then remove abstract
+; definitions from -gmlt here.
+
+; CHECK: DW_TAG_subprogram
+; CHECK-NEXT:     DW_AT_name {{.*}} "f3"
+
+; FIXME: We don't really need DW_AT_inline; consumers can ignore this due to
+; the absence of high_pc/low_pc/ranges and know that they just need it for
+; retrieving the name of a concrete inlined instance.
+
+; CHECK-NOT: {{DW_TAG|DW_AT|NULL}}
+
+; Check that we only provide the minimal attributes on a subprogram to save space.
+; CHECK:   DW_TAG_subprogram
+; CHECK-NEXT:     DW_AT_low_pc
+; CHECK-NEXT:     DW_AT_high_pc
+; CHECK-NEXT:     DW_AT_name
+; CHECK-NOT: {{DW_TAG|DW_AT}}
+; CHECK:     DW_TAG_inlined_subroutine
+
+; As mentioned above - replace DW_AT_abstract_origin with DW_AT_name to save
+; space once we have support for string indexing in non-dwo sections
+
+; CHECK-NEXT:       DW_AT_abstract_origin {{.*}} "f3"
+; CHECK-NEXT:       DW_AT_low_pc
+; CHECK-NEXT:       DW_AT_high_pc
+; CHECK-NEXT:       DW_AT_call_file
+; CHECK-NEXT:       DW_AT_call_line
+
+; Make sure we don't have any other subprograms here (subprograms with no
+; inlined subroutines are omitted by design to save space)
+
+; CHECK-NOT: {{DW_TAG|DW_AT}}
+; CHECK: NULL
+; CHECK-NOT: {{DW_TAG|DW_AT}}
+; CHECK: NULL
+
+
+; CHECK: .debug_ranges contents:
+
+; ... some addresses (depends on platform (such as platforms with function
+; reordering in the linker), and looks wonky on platforms with zero values
+; written in relocation places (dumper needs to be fixed to read the
+; relocations rather than interpret that as the end of a range list))
+
+; CHECK: 00000000 <End of list>
+
+
+; Check that we don't emit any pubnames or pubtypes under -gmlt
+; CHECK: .debug_pubnames contents:
+; CHECK-NOT: Offset
+
+; CHECK: .debug_pubtypes contents:
+; CHECK-NOT: Offset
+
+; CHECK: .apple{{.*}} contents:
+
+; Function Attrs: nounwind uwtable
+define void @_Z2f1v() #0 {
+entry:
+  ret void, !dbg !13
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z2f2v() #0 section "__TEXT,__bar" {
+entry:
+  ret void, !dbg !14
+}
+
+; Function Attrs: alwaysinline nounwind uwtable
+define void @_Z2f3v() #1 {
+entry:
+  call void @_Z2f1v(), !dbg !15
+  ret void, !dbg !16
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z2f4v() #0 {
+entry:
+  call void @_Z2f1v() #2, !dbg !17
+  ret void, !dbg !19
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { alwaysinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11}
+!llvm.ident = !{!12}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/gmlt.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"gmlt.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !7, metadata !8, metadata !9}
+!4 = metadata !{metadata !"0x2e\00f1\00f1\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f1v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f1]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/gmlt.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00f2\00f2\00\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f2]
+!8 = metadata !{metadata !"0x2e\00f3\00f3\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f3v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f3]
+!9 = metadata !{metadata !"0x2e\00f4\00f4\00\004\000\001\000\006\00256\000\004", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f4v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f4]
+!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!11 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!12 = metadata !{metadata !"clang version 3.6.0 "}
+!13 = metadata !{i32 1, i32 12, metadata !4, null}
+!14 = metadata !{i32 2, i32 53, metadata !7, null}
+!15 = metadata !{i32 3, i32 44, metadata !8, null}
+!16 = metadata !{i32 3, i32 50, metadata !8, null}
+!17 = metadata !{i32 3, i32 44, metadata !8, metadata !18}
+!18 = metadata !{i32 4, i32 13, metadata !9, null}
+!19 = metadata !{i32 4, i32 19, metadata !9, null}

diff --git a/test/DebugInfo/Inputs/split-dwarf-test b/test/DebugInfo/Inputs/split-dwarf-test
new file mode 100755
index 0000000..a441112
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-test
Binary files differ

diff --git a/test/DebugInfo/Inputs/split-dwarf-test.cc b/test/DebugInfo/Inputs/split-dwarf-test.cc
new file mode 100644
index 0000000..5ed56f9
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-test.cc

@@ -0,0 +1,17 @@
+int foo(int a) {
+  return a + 1;
+}
+
+int main(int argc, char *argv[]) {
+  return foo(argc);
+}
+
+// Build instructions:
+// 1) clang++ -### -O2 -gsplit-dwarf split-dwarf-test.cc -o split-dwarf-test
+// 2) Replace the value of the "-fdebug-compilation-dir" flag with "Output"
+//      (this is the temp directory used by lit).
+// 3) Manually run clang-cc1, objcopy and ld invocations.
+// 4) Copy the binary and .dwo file to the Inputs directory. Make sure the
+//    .dwo file will be available for symbolizer (use test RUN-lines to copy
+//    the .dwo file to a directory
+//    <execution_directory>/<directory_provided_in_fdebug_compilation_dir>).

diff --git a/test/DebugInfo/Inputs/split-dwarf-test.dwo b/test/DebugInfo/Inputs/split-dwarf-test.dwo
new file mode 100644
index 0000000..74183a4
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-test.dwo
Binary files differ

diff --git a/test/DebugInfo/Mips/delay-slot.ll b/test/DebugInfo/Mips/delay-slot.ll
index 9bce4ba..5587bcb 100644
--- a/test/DebugInfo/Mips/delay-slot.ll
+++ b/test/DebugInfo/Mips/delay-slot.ll

@@ -26,7 +26,7 @@
 ; Function Attrs: nounwind
 define i32 @foo(i32 %x) #0 {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !12), !dbg !13
+  call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !13
   %tobool = icmp ne i32 %x, 0, !dbg !14
   br i1 %tobool, label %if.then, label %if.end, !dbg !14
 
@@ -42,10 +42,10 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
@@ -54,22 +54,22 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test.c", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5.0"}
-!12 = metadata !{i32 786689, metadata !4, metadata !"x", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 1]
+!12 = metadata !{metadata !"0x101\00x\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [x] [line 1]
 !13 = metadata !{i32 1, i32 0, metadata !4, null}
 !14 = metadata !{i32 2, i32 0, metadata !15, null}
-!15 = metadata !{i32 786443, metadata !1, metadata !4, i32 2, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/test.c]
+!15 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/test.c]
 !16 = metadata !{i32 3, i32 0, metadata !15, null}
 !17 = metadata !{i32 4, i32 0, metadata !4, null}
 !18 = metadata !{i32 5, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/Mips/processes-relocations.ll b/test/DebugInfo/Mips/processes-relocations.ll
new file mode 100644
index 0000000..98eba68
--- /dev/null
+++ b/test/DebugInfo/Mips/processes-relocations.ll

@@ -0,0 +1,17 @@
+; RUN: llc -filetype=obj -O0 < %s -mtriple mips-unknown-linux | \
+; RUN:      llvm-dwarfdump - 2>&1 | FileCheck %s
+; RUN: llc -filetype=obj -O0 < %s -mtriple mips64-unknown-linux | \
+; RUN:      llvm-dwarfdump - 2>&1 | FileCheck %s
+
+; CHECK-NOT: failed to compute relocation
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/PR20038.ll b/test/DebugInfo/PR20038.ll
index 61145e5..2cd40fb 100644
--- a/test/DebugInfo/PR20038.ll
+++ b/test/DebugInfo/PR20038.ll

@@ -14,15 +14,15 @@
 ; CHECK-NOT: DW_TAG
 ; CHECK:   DW_AT_name {{.*}} "C"
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: [[C_DTOR_DECL:.*]]:  DW_TAG_subprogram
+; CHECK:   DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK:     DW_AT_name {{.*}} "~C"
 
-; CHECK: [[D1_ABS:.*]]: DW_TAG_subprogram
+; CHECK:  DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK:   DW_AT_MIPS_linkage_name {{.*}} "_ZN1CD1Ev"
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: [[D1_THIS_ABS:.*]]:   DW_TAG_formal_parameter
+; CHECK:  DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
 ; CHECK:     DW_AT_name {{.*}} "this"
 
@@ -30,23 +30,19 @@
 ; CHECK-NOT: DW_TAG
 ; CHECK:   DW_AT_name {{.*}} "fun4"
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK:   DW_TAG_lexical_block
-; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK:     DW_TAG_inlined_subroutine
+; CHECK:   DW_TAG_inlined_subroutine
 ; CHECK-NOT: DW_TAG
-; CHECK:       DW_AT_abstract_origin {{.*}} {[[D1_ABS]]}
+; CHECK:     DW_AT_abstract_origin {{.*}} "_ZN1CD1Ev"
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK:       DW_TAG_formal_parameter
+; CHECK:     DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
-; CHECK:         DW_AT_abstract_origin {{.*}} {[[D1_THIS_ABS]]}
+; CHECK:       DW_AT_abstract_origin {{.*}} "this"
 
 ; FIXME: D2 is actually inlined into D1 but doesn't show up here, possibly due
 ; to there being no work in D2 (calling another member function from the dtor
 ; causes D2 to show up, calling a free function doesn't).
 
 ; CHECK-NOT: DW_TAG
-; CHECK:       NULL
-; CHECK-NOT: DW_TAG
 ; CHECK:     NULL
 ; CHECK-NOT: DW_TAG
 ; CHECK:   NULL
@@ -78,10 +74,10 @@
 
 cleanup.action:                                   ; preds = %land.end
   store %struct.C* %agg.tmp.ensured, %struct.C** %this.addr.i, align 8, !dbg !22
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i}, metadata !29), !dbg !31
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i}, metadata !29, metadata !{metadata !"0x102"}), !dbg !31
   %this1.i = load %struct.C** %this.addr.i, !dbg !22
   store %struct.C* %this1.i, %struct.C** %this.addr.i.i, align 8, !dbg !21
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i.i}, metadata !32), !dbg !33
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i.i}, metadata !32, metadata !{metadata !"0x102"}), !dbg !33
   %this1.i.i = load %struct.C** %this.addr.i.i, !dbg !21
   br label %cleanup.done, !dbg !22
 
@@ -95,10 +91,10 @@
   %this.addr.i = alloca %struct.C*, align 8, !dbg !37
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !29), !dbg !38
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !29, metadata !{metadata !"0x102"}), !dbg !38
   %this1 = load %struct.C** %this.addr
   store %struct.C* %this1, %struct.C** %this.addr.i, align 8, !dbg !37
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i}, metadata !32), !dbg !39
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr.i}, metadata !32, metadata !{metadata !"0x102"}), !dbg !39
   %this1.i = load %struct.C** %this.addr.i, !dbg !37
   ret void, !dbg !37
 }
@@ -108,13 +104,13 @@
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !32), !dbg !40
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !40
   %this1 = load %struct.C** %this.addr
   ret void, !dbg !41
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -124,43 +120,43 @@
 !llvm.module.flags = !{!18, !19}
 !llvm.ident = !{!20}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<stdin>", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00C\001\008\008\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !"PR20038.cpp", metadata !"/tmp/dbginfo"}
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"~C", metadata !"~C", metadata !"", i32 2, metadata !8, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 2} ; [ DW_TAG_subprogram ] [line 2] [~C]
-!8 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00~C\00~C\00\002\000\000\000\006\00256\000\002", metadata !5, metadata !"_ZTS1C", metadata !8, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 2] [~C]
+!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !9 = metadata !{null, metadata !10}
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
 !11 = metadata !{metadata !12, metadata !16, metadata !17}
-!12 = metadata !{i32 786478, metadata !5, metadata !13, metadata !"fun4", metadata !"fun4", metadata !"_Z4fun4v", i32 5, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z4fun4v, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [fun4]
-!13 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/PR20038.cpp]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x2e\00fun4\00fun4\00_Z4fun4v\005\000\001\000\006\00256\000\005", metadata !5, metadata !13, metadata !14, null, void ()* @_Z4fun4v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [fun4]
+!13 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/PR20038.cpp]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null}
-!16 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"~C", metadata !"~C", metadata !"_ZN1CD2Ev", i32 6, metadata !8, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1CD2Ev, null, metadata !7, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
-!17 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"~C", metadata !"~C", metadata !"_ZN1CD1Ev", i32 6, metadata !8, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1CD1Ev, null, metadata !7, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
+!16 = metadata !{metadata !"0x2e\00~C\00~C\00_ZN1CD2Ev\006\000\001\000\006\00256\000\006", metadata !5, metadata !"_ZTS1C", metadata !8, null, void (%struct.C*)* @_ZN1CD2Ev, null, metadata !7, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
+!17 = metadata !{metadata !"0x2e\00~C\00~C\00_ZN1CD1Ev\006\000\001\000\006\00256\000\006", metadata !5, metadata !"_ZTS1C", metadata !8, null, void (%struct.C*)* @_ZN1CD1Ev, null, metadata !7, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~C]
 !18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !20 = metadata !{metadata !"clang version 3.5.0 "}
 !21 = metadata !{i32 6, i32 0, metadata !17, metadata !22}
 !22 = metadata !{i32 5, i32 0, metadata !23, null}
-!23 = metadata !{i32 786443, metadata !5, metadata !12, i32 5, i32 0, i32 3, i32 3} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!23 = metadata !{metadata !"0xb\005\000\003", metadata !5, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
 !24 = metadata !{i32 5, i32 0, metadata !12, null}
 !25 = metadata !{i32 5, i32 0, metadata !26, null}
-!26 = metadata !{i32 786443, metadata !5, metadata !12, i32 5, i32 0, i32 1, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!26 = metadata !{metadata !"0xb\005\000\001", metadata !5, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
 !27 = metadata !{i32 5, i32 0, metadata !28, null}
-!28 = metadata !{i32 786443, metadata !5, metadata !12, i32 5, i32 0, i32 2, i32 2} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
-!29 = metadata !{i32 786689, metadata !17, metadata !"this", null, i32 16777216, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!30 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!28 = metadata !{metadata !"0xb\005\000\002", metadata !5, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!29 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !17, null, metadata !30} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
 !31 = metadata !{i32 0, i32 0, metadata !17, metadata !22}
-!32 = metadata !{i32 786689, metadata !16, metadata !"this", null, i32 16777216, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!32 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !16, null, metadata !30} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !33 = metadata !{i32 0, i32 0, metadata !16, metadata !21}
 !34 = metadata !{i32 5, i32 0, metadata !35, null}
-!35 = metadata !{i32 786443, metadata !5, metadata !36, i32 5, i32 0, i32 5, i32 5} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
-!36 = metadata !{i32 786443, metadata !5, metadata !12, i32 5, i32 0, i32 4, i32 4} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!35 = metadata !{metadata !"0xb\005\000\005", metadata !5, metadata !36} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
+!36 = metadata !{metadata !"0xb\005\000\004", metadata !5, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/PR20038.cpp]
 !37 = metadata !{i32 6, i32 0, metadata !17, null}
 !38 = metadata !{i32 0, i32 0, metadata !17, null}
 !39 = metadata !{i32 0, i32 0, metadata !16, metadata !37}

diff --git a/test/DebugInfo/PowerPC/processes-relocations.ll b/test/DebugInfo/PowerPC/processes-relocations.ll
new file mode 100644
index 0000000..5e661f7
--- /dev/null
+++ b/test/DebugInfo/PowerPC/processes-relocations.ll

@@ -0,0 +1,17 @@
+; RUN: llc -filetype=obj -O0 < %s -mtriple powerpc64-unknown-linux | \
+; RUN:    llvm-dwarfdump - 2>&1 | FileCheck %s
+; RUN: llc -filetype=obj -O0 < %s -mtriple powerpc-unknown-linux | \
+; RUN:    llvm-dwarfdump - 2>&1 | FileCheck %s
+
+; CHECK-NOT: failed to compute relocation
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/PowerPC/tls-fission.ll b/test/DebugInfo/PowerPC/tls-fission.ll
index 9cde2c7..fa198e1 100644
--- a/test/DebugInfo/PowerPC/tls-fission.ll
+++ b/test/DebugInfo/PowerPC/tls-fission.ll

@@ -21,12 +21,12 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !"tls.dwo"} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00tls.dwo\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tls.cpp", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786484, i32 0, null, metadata !"tls", metadata !"tls", metadata !"", metadata !5, i32 1, metadata !6, i32 0, i32 1, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
-!6 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!4 = metadata !{metadata !"0x34\00tls\00tls\00\001\000\001", null, metadata !5, metadata !6, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/PowerPC/tls.ll b/test/DebugInfo/PowerPC/tls.ll
index f2586ed..22da193 100644
--- a/test/DebugInfo/PowerPC/tls.ll
+++ b/test/DebugInfo/PowerPC/tls.ll

@@ -17,13 +17,13 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!7, !8}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tls.cpp", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786484, i32 0, null, metadata !"tls", metadata !"tls", metadata !"", metadata !5, i32 1, metadata !6, i32 0, i32 1, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
-!6 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!4 = metadata !{metadata !"0x34\00tls\00tls\00\001\000\001", null, metadata !5, metadata !6, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/Sparc/gnu-window-save.ll b/test/DebugInfo/Sparc/gnu-window-save.ll
index 303a287..66066dd 100644
--- a/test/DebugInfo/Sparc/gnu-window-save.ll
+++ b/test/DebugInfo/Sparc/gnu-window-save.ll

@@ -55,17 +55,17 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 (http://llvm.org/git/clang.git 6a0714fee07fb7c4e32d3972b4fe2ce2f5678cf4) (llvm/ 672e88e934757f76d5c5e5258be41e7615094844)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/venkatra/work/benchmarks/test/hello/hello.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 (http://llvm.org/git/clang.git 6a0714fee07fb7c4e32d3972b4fe2ce2f5678cf4) (llvm/ 672e88e934757f76d5c5e5258be41e7615094844)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/home/venkatra/work/benchmarks/test/hello/hello.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"hello.c", metadata !"/home/venkatra/work/benchmarks/test/hello"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/venkatra/work/benchmarks/test/hello/hello.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\00256\000\004", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/venkatra/work/benchmarks/test/hello/hello.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5 (http://llvm.org/git/clang.git 6a0714fee07fb7c4e32d3972b4fe2ce2f5678cf4) (llvm/ 672e88e934757f76d5c5e5258be41e7615094844)"}
 !12 = metadata !{i32 5, i32 0, metadata !4, null}
 !13 = metadata !{i32 6, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/Sparc/processes-relocations.ll b/test/DebugInfo/Sparc/processes-relocations.ll
new file mode 100644
index 0000000..89cab9e
--- /dev/null
+++ b/test/DebugInfo/Sparc/processes-relocations.ll

@@ -0,0 +1,17 @@
+; RUN: llc -filetype=obj -O0 < %s -mtriple sparc-unknown-linux | \
+; RUN:    llvm-dwarfdump - 2>&1 | FileCheck %s
+; RUN: llc -filetype=obj -O0 < %s -mtriple sparcv9-unknown-linux | \
+; RUN:    llvm-dwarfdump - 2>&1 | FileCheck %s
+
+; CHECK-NOT: failed to compute relocation
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/SystemZ/processes-relocations.ll b/test/DebugInfo/SystemZ/processes-relocations.ll
new file mode 100644
index 0000000..6f276f9
--- /dev/null
+++ b/test/DebugInfo/SystemZ/processes-relocations.ll

@@ -0,0 +1,15 @@
+; RUN: llc -filetype=obj -O0 < %s -mtriple s390x-unknown-linux | \
+; RUN:     llvm-dwarfdump - 2>&1 | FileCheck %s
+
+; CHECK-NOT: failed to compute relocation
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.6.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/SystemZ/variable-loc.ll b/test/DebugInfo/SystemZ/variable-loc.ll
index 23df1cb..13e2e60 100644
--- a/test/DebugInfo/SystemZ/variable-loc.ll
+++ b/test/DebugInfo/SystemZ/variable-loc.ll

@@ -25,7 +25,7 @@
 
 declare void @populate_array(i32*, i32) nounwind
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @sum_array(i32*, i32) nounwind
 
@@ -35,8 +35,8 @@
   %main_arr = alloca [100 x i32], align 4
   %val = alloca i32, align 4
   store volatile i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{[100 x i32]* %main_arr}, metadata !17), !dbg !22
-  call void @llvm.dbg.declare(metadata !{i32* %val}, metadata !23), !dbg !24
+  call void @llvm.dbg.declare(metadata !{[100 x i32]* %main_arr}, metadata !17, metadata !{metadata !"0x102"}), !dbg !22
+  call void @llvm.dbg.declare(metadata !{i32* %val}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
   %arraydecay = getelementptr inbounds [100 x i32]* %main_arr, i32 0, i32 0, !dbg !25
   call void @populate_array(i32* %arraydecay, i32 100), !dbg !25
   %arraydecay1 = getelementptr inbounds [100 x i32]* %main_arr, i32 0, i32 0, !dbg !26
@@ -52,31 +52,31 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!30}
 
-!0 = metadata !{i32 786449, metadata !29, i32 12, metadata !"clang version 3.2 ", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/timnor01/a64-trunk/build/simple.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 \000\00\000\00\000", metadata !29, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/timnor01/a64-trunk/build/simple.c] [DW_LANG_C99]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !11, metadata !14}
-!5 = metadata !{i32 786478, metadata !29, metadata !6, metadata !"populate_array", metadata !"populate_array", metadata !"", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*, i32)* @populate_array, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [populate_array]
-!6 = metadata !{i32 786473, metadata !29} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00populate_array\00populate_array\00\004\000\001\000\006\00256\000\004", metadata !29, metadata !6, metadata !7, null, void (i32*, i32)* @populate_array, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 4] [def] [populate_array]
+!6 = metadata !{metadata !"0x29", metadata !29} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !10}
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!11 = metadata !{i32 786478, metadata !29, metadata !6, metadata !"sum_array", metadata !"sum_array", metadata !"", i32 9, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32*, i32)* @sum_array, null, null, metadata !1, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [sum_array]
-!12 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = metadata !{metadata !"0x2e\00sum_array\00sum_array\00\009\000\001\000\006\00256\000\009", metadata !29, metadata !6, metadata !12, null, i32 (i32*, i32)* @sum_array, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 9] [def] [sum_array]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{metadata !10, metadata !9, metadata !10}
-!14 = metadata !{i32 786478, metadata !29, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
-!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0x2e\00main\00main\00\0018\000\001\000\006\00256\000\0018", metadata !29, metadata !6, metadata !15, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !10}
-!17 = metadata !{i32 786688, metadata !18, metadata !"main_arr", metadata !6, i32 19, metadata !19, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [main_arr] [line 19]
-!18 = metadata !{i32 786443, metadata !29, metadata !14, i32 18, i32 16, i32 4} ; [ DW_TAG_lexical_block ] [/home/timnor01/a64-trunk/build/simple.c]
-!19 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 3200, i64 32, i32 0, i32 0, metadata !10, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 3200, align 32, offset 0] [from int]
-!20 = metadata !{i32 786465, i64 0, i64 99}       ; [ DW_TAG_subrange_type ] [0, 99]
+!17 = metadata !{metadata !"0x100\00main_arr\0019\000", metadata !18, metadata !6, metadata !19} ; [ DW_TAG_auto_variable ] [main_arr] [line 19]
+!18 = metadata !{metadata !"0xb\0018\0016\004", metadata !29, metadata !14} ; [ DW_TAG_lexical_block ] [/home/timnor01/a64-trunk/build/simple.c]
+!19 = metadata !{metadata !"0x1\00\000\003200\0032\000\000", null, null, metadata !10, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 3200, align 32, offset 0] [from int]
+!20 = metadata !{metadata !"0x21\000\0099"}       ; [ DW_TAG_subrange_type ] [0, 99]
 !22 = metadata !{i32 19, i32 7, metadata !18, null}
-!23 = metadata !{i32 786688, metadata !18, metadata !"val", metadata !6, i32 20, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [val] [line 20]
+!23 = metadata !{metadata !"0x100\00val\0020\000", metadata !18, metadata !6, metadata !10} ; [ DW_TAG_auto_variable ] [val] [line 20]
 !24 = metadata !{i32 20, i32 7, metadata !18, null}
 !25 = metadata !{i32 22, i32 3, metadata !18, null}
 !26 = metadata !{i32 23, i32 9, metadata !18, null}
 !27 = metadata !{i32 24, i32 3, metadata !18, null}
 !28 = metadata !{i32 26, i32 3, metadata !18, null}
 !29 = metadata !{metadata !"simple.c", metadata !"/home/timnor01/a64-trunk/build"}
-!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/2010-04-13-PubType.ll b/test/DebugInfo/X86/2010-04-13-PubType.ll
index 0440afc..0996725 100644
--- a/test/DebugInfo/X86/2010-04-13-PubType.ll
+++ b/test/DebugInfo/X86/2010-04-13-PubType.ll

@@ -12,9 +12,9 @@
   %retval = alloca i32                            ; <i32*> [#uses=2]
   %0 = alloca i32                                 ; <i32*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.X** %x_addr}, metadata !0), !dbg !13
+  call void @llvm.dbg.declare(metadata !{%struct.X** %x_addr}, metadata !0, metadata !{metadata !"0x102"}), !dbg !13
   store %struct.X* %x, %struct.X** %x_addr
-  call void @llvm.dbg.declare(metadata !{%struct.Y** %y_addr}, metadata !14), !dbg !13
+  call void @llvm.dbg.declare(metadata !{%struct.Y** %y_addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !13
   store %struct.Y* %y, %struct.Y** %y_addr
   store i32 0, i32* %0, align 4, !dbg !13
   %1 = load i32* %0, align 4, !dbg !13            ; <i32> [#uses=1]
@@ -26,29 +26,29 @@
   ret i32 %retval1, !dbg !15
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 7, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 7, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 (%struct.X*, %struct.Y*)* @foo, null, null, null, i32 7} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00x\007\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\007\000\001\000\006\000\000\007", metadata !18, metadata !2, metadata !4, null, i32 (%struct.X*, %struct.Y*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !18, metadata !19, metadata !19, metadata !17, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !7, metadata !9}
-!6 = metadata !{i32 786468, metadata !18, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786447, metadata !18, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !18, metadata !2, metadata !"X", i32 3, i64 0, i64 0, i64 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 3, size 0, align 0, offset 0] [decl] [from ]
-!9 = metadata !{i32 786447, metadata !18, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 786451, metadata !18, metadata !2, metadata !"Y", i32 4, i64 32, i64 32, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Y] [line 4, size 32, align 32, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !18, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !18, metadata !2, metadata !8} ; [ DW_TAG_pointer_type ]
+!8 = metadata !{metadata !"0x13\00X\003\000\000\000\004\000", metadata !18, metadata !2, null, null, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 3, size 0, align 0, offset 0] [decl] [from ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !18, metadata !2, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x13\00Y\004\0032\0032\000\000\000", metadata !18, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [Y] [line 4, size 32, align 32, offset 0] [def] [from ]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 786445, metadata !18, metadata !10, metadata !"x", i32 5, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0xd\00x\005\0032\0032\000\000", metadata !18, metadata !10, metadata !6} ; [ DW_TAG_member ]
 !13 = metadata !{i32 7, i32 0, metadata !1, null}
-!14 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 7, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
+!14 = metadata !{metadata !"0x101\00y\007\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
 !15 = metadata !{i32 7, i32 0, metadata !16, null}
-!16 = metadata !{i32 786443, metadata !18, metadata !1, i32 7, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0xb\007\000\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
 !17 = metadata !{metadata !1}
 !18 = metadata !{metadata !"a.c", metadata !"/tmp/"}
 !19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/2010-08-10-DbgConstant.ll b/test/DebugInfo/X86/2010-08-10-DbgConstant.ll
index 7f42e7b..7a1b4fe 100644
--- a/test/DebugInfo/X86/2010-08-10-DbgConstant.ll
+++ b/test/DebugInfo/X86/2010-08-10-DbgConstant.ll

@@ -14,18 +14,18 @@
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{i32 786478, metadata !12, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void ()* @foo, null, null, null, i32 3} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang 2.8", i1 false, metadata !"", i32 0, metadata !4, metadata !4, metadata !10, metadata !11,  metadata !14, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\003\000\001\000\006\000\000\003", metadata !12, metadata !1, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang 2.8\000\00\000\00\000", metadata !12, metadata !4, metadata !4, metadata !10, metadata !11,  metadata !14} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 786471, i32 0, metadata !1, metadata !"ro", metadata !"ro", metadata !"ro", metadata !1, i32 1, metadata !6, i1 true, i1 true, i32 201, null} ; [ DW_TAG_constant ]
-!6 = metadata !{i32 786470, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_const_type ]
-!7 = metadata !{i32 786468, metadata !12, metadata !1, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x27\00ro\00ro\00ro\001\001\001", metadata !1, metadata !1, metadata !6, i32 201, null} ; [ DW_TAG_constant ]
+!6 = metadata !{metadata !"0x26\00\000\000\000\000\000", metadata !12, metadata !1, metadata !7} ; [ DW_TAG_const_type ]
+!7 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !12, metadata !1} ; [ DW_TAG_base_type ]
 !8 = metadata !{i32 3, i32 14, metadata !9, null}
-!9 = metadata !{i32 786443, metadata !12, metadata !0, i32 3, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
+!9 = metadata !{metadata !"0xb\003\0012\000", metadata !12, metadata !0} ; [ DW_TAG_lexical_block ]
 !10 = metadata !{metadata !0}
 !11 = metadata !{metadata !5}
 !12 = metadata !{metadata !"/tmp/l.c", metadata !"/Volumes/Lalgate/clean/D"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !14 = metadata !{}

diff --git a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
index 4dc747f..56a1a2b 100644
--- a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
+++ b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll

@@ -7,30 +7,30 @@
 
 define i32 @f() nounwind {
   %LOC = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %LOC}, metadata !15), !dbg !17
+  call void @llvm.dbg.declare(metadata !{i32* %LOC}, metadata !15, metadata !{metadata !"0x102"}), !dbg !17
   %1 = load i32* @GLB, align 4, !dbg !18
   store i32 %1, i32* %LOC, align 4, !dbg !18
   %2 = load i32* @GLB, align 4, !dbg !19
   ret i32 %2, !dbg !19
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"clang version 3.0 (trunk)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !12,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk)\000\00\000\00\000", metadata !20, metadata !1, metadata !1, metadata !3, metadata !12,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !6, metadata !6, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @f, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [f]
-!6 = metadata !{i32 720937, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\000\000\000", metadata !6, metadata !6, metadata !7, null, i32 ()* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [f]
+!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !12 = metadata !{metadata !14}
-!14 = metadata !{i32 720948, i32 0, null, metadata !"GLB", metadata !"GLB", metadata !"", metadata !6, i32 1, metadata !9, i32 0, i32 1, i32* @GLB, null} ; [ DW_TAG_variable ]
-!15 = metadata !{i32 786688, metadata !16, metadata !"LOC", metadata !6, i32 4, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!16 = metadata !{i32 786443, metadata !20, metadata !5, i32 3, i32 9, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0x34\00GLB\00GLB\00\001\000\001", null, metadata !6, metadata !9, i32* @GLB, null} ; [ DW_TAG_variable ]
+!15 = metadata !{metadata !"0x100\00LOC\004\000", metadata !16, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
+!16 = metadata !{metadata !"0xb\003\009\000", metadata !20, metadata !5} ; [ DW_TAG_lexical_block ]
 !17 = metadata !{i32 4, i32 9, metadata !16, null}
 !18 = metadata !{i32 4, i32 23, metadata !16, null}
 !19 = metadata !{i32 5, i32 5, metadata !16, null}
@@ -40,16 +40,16 @@
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_name [DW_FORM_strp]       ( .debug_str[0x{{[0-9a-f]*}}] = "GLB")
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01)
+; CHECK: DW_AT_decl_file [DW_FORM_data1] ("/work/llvm/vanilla/test/DebugInfo{{[/\\]}}test.c")
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_decl_line [DW_FORM_data1] (0x01)
+; CHECK: DW_AT_decl_line [DW_FORM_data1] (1)
 
 ; CHECK: DW_TAG_variable
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_name [DW_FORM_strp]   ( .debug_str[0x{{[0-9a-f]*}}] = "LOC")
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_decl_file [DW_FORM_data1]     (0x01)
+; CHECK: DW_AT_decl_file [DW_FORM_data1]     ("/work/llvm/vanilla/test/DebugInfo{{[/\\]}}test.c")
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_decl_line [DW_FORM_data1]     (0x04)
+; CHECK: DW_AT_decl_line [DW_FORM_data1]     (4)
 
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/2011-12-16-BadStructRef.ll b/test/DebugInfo/X86/2011-12-16-BadStructRef.ll
index 21dccd7..5b30480 100644
--- a/test/DebugInfo/X86/2011-12-16-BadStructRef.ll
+++ b/test/DebugInfo/X86/2011-12-16-BadStructRef.ll

@@ -15,24 +15,24 @@
   %myBar = alloca %struct.bar, align 8
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !49), !dbg !50
+  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !49, metadata !{metadata !"0x102"}), !dbg !50
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !51), !dbg !52
-  call void @llvm.dbg.declare(metadata !{%struct.bar* %myBar}, metadata !53), !dbg !55
+  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !51, metadata !{metadata !"0x102"}), !dbg !52
+  call void @llvm.dbg.declare(metadata !{%struct.bar* %myBar}, metadata !53, metadata !{metadata !"0x102"}), !dbg !55
   call void @_ZN3barC1Ei(%struct.bar* %myBar, i32 1), !dbg !56
   ret i32 0, !dbg !57
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define linkonce_odr void @_ZN3barC1Ei(%struct.bar* %this, i32 %x) unnamed_addr uwtable ssp align 2 {
 entry:
   %this.addr = alloca %struct.bar*, align 8
   %x.addr = alloca i32, align 4
   store %struct.bar* %this, %struct.bar** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.bar** %this.addr}, metadata !58), !dbg !59
+  call void @llvm.dbg.declare(metadata !{%struct.bar** %this.addr}, metadata !58, metadata !{metadata !"0x102"}), !dbg !59
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !60), !dbg !61
+  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !60, metadata !{metadata !"0x102"}), !dbg !61
   %this1 = load %struct.bar** %this.addr
   %0 = load i32* %x.addr, align 4, !dbg !62
   call void @_ZN3barC2Ei(%struct.bar* %this1, i32 %0), !dbg !62
@@ -44,9 +44,9 @@
   %this.addr = alloca %struct.bar*, align 8
   %x.addr = alloca i32, align 4
   store %struct.bar* %this, %struct.bar** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.bar** %this.addr}, metadata !63), !dbg !64
+  call void @llvm.dbg.declare(metadata !{%struct.bar** %this.addr}, metadata !63, metadata !{metadata !"0x102"}), !dbg !64
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !65), !dbg !66
+  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !65, metadata !{metadata !"0x102"}), !dbg !66
   %this1 = load %struct.bar** %this.addr
   %b = getelementptr inbounds %struct.bar* %this1, i32 0, i32 0, !dbg !67
   %0 = load i32* %x.addr, align 4, !dbg !67
@@ -62,9 +62,9 @@
   %this.addr = alloca %struct.baz*, align 8
   %a.addr = alloca i32, align 4
   store %struct.baz* %this, %struct.baz** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.baz** %this.addr}, metadata !70), !dbg !71
+  call void @llvm.dbg.declare(metadata !{%struct.baz** %this.addr}, metadata !70, metadata !{metadata !"0x102"}), !dbg !71
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !72), !dbg !73
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !72, metadata !{metadata !"0x102"}), !dbg !73
   %this1 = load %struct.baz** %this.addr
   %0 = load i32* %a.addr, align 4, !dbg !74
   call void @_ZN3bazC2Ei(%struct.baz* %this1, i32 %0), !dbg !74
@@ -76,9 +76,9 @@
   %this.addr = alloca %struct.baz*, align 8
   %a.addr = alloca i32, align 4
   store %struct.baz* %this, %struct.baz** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.baz** %this.addr}, metadata !75), !dbg !76
+  call void @llvm.dbg.declare(metadata !{%struct.baz** %this.addr}, metadata !75, metadata !{metadata !"0x102"}), !dbg !76
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !77), !dbg !78
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !77, metadata !{metadata !"0x102"}), !dbg !78
   %this1 = load %struct.baz** %this.addr
   %h = getelementptr inbounds %struct.baz* %this1, i32 0, i32 0, !dbg !79
   %0 = load i32* %a.addr, align 4, !dbg !79
@@ -89,78 +89,78 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!83}
 
-!0 = metadata !{i32 720913, metadata !82, i32 4, metadata !"clang version 3.1 (trunk 146596)", i1 false, metadata !"", i32 0, metadata !1, metadata !3, metadata !27, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.1 (trunk 146596)\000\00\000\00\000", metadata !82, metadata !1, metadata !3, metadata !27, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !9}
-!5 = metadata !{i32 720898, metadata !82, null, metadata !"bar", i32 9, i64 128, i64 64, i32 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 128, align 64, offset 0] [def] [from ]
-!6 = metadata !{i32 720937, metadata !82} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x2\00bar\009\00128\0064\000\000\000", metadata !82, null, null, metadata !7, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 128, align 64, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x29", metadata !82} ; [ DW_TAG_file_type ]
 !7 = metadata !{metadata !8, metadata !19, metadata !21}
-!8 = metadata !{i32 720909, metadata !82, metadata !5, metadata !"b", i32 11, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ]
-!9 = metadata !{i32 720898, metadata !82, null, metadata !"baz", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_class_type ] [baz] [line 3, size 32, align 32, offset 0] [def] [from ]
+!8 = metadata !{metadata !"0xd\00b\0011\0032\0032\000\000", metadata !82, metadata !5, metadata !9} ; [ DW_TAG_member ]
+!9 = metadata !{metadata !"0x2\00baz\003\0032\0032\000\000\000", metadata !82, null, null, metadata !10, null, null, null} ; [ DW_TAG_class_type ] [baz] [line 3, size 32, align 32, offset 0] [def] [from ]
 !10 = metadata !{metadata !11, metadata !13}
-!11 = metadata !{i32 720909, metadata !82, metadata !9, metadata !"h", i32 5, i64 32, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ]
-!12 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!13 = metadata !{i32 720942, metadata !82, metadata !9, metadata !"baz", metadata !"baz", metadata !"", i32 6, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 0} ; [ DW_TAG_subprogram ]
-!14 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0xd\00h\005\0032\0032\000\000", metadata !82, metadata !9, metadata !12} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!13 = metadata !{metadata !"0x2e\00baz\00baz\00\006\000\000\000\006\00256\000\000", metadata !82, metadata !9, metadata !14, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !16, metadata !12}
-!16 = metadata !{i32 720911, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !9} ; [ DW_TAG_pointer_type ]
-!19 = metadata !{i32 720909, metadata !82, metadata !5, metadata !"b_ref", i32 12, i64 64, i64 64, i64 64, i32 0, metadata !20} ; [ DW_TAG_member ]
-!20 = metadata !{i32 720912, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_reference_type ]
-!21 = metadata !{i32 720942, metadata !82, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 13, metadata !22, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 0} ; [ DW_TAG_subprogram ]
-!22 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !9} ; [ DW_TAG_pointer_type ]
+!19 = metadata !{metadata !"0xd\00b_ref\0012\0064\0064\0064\000", metadata !82, metadata !5, metadata !20} ; [ DW_TAG_member ]
+!20 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_reference_type ]
+!21 = metadata !{metadata !"0x2e\00bar\00bar\00\0013\000\000\000\006\00256\000\000", metadata !82, metadata !5, metadata !22, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !23 = metadata !{null, metadata !24, metadata !12}
-!24 = metadata !{i32 720911, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !5} ; [ DW_TAG_pointer_type ]
+!24 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !5} ; [ DW_TAG_pointer_type ]
 !27 = metadata !{metadata !29, metadata !37, metadata !40, metadata !43, metadata !46}
-!29 = metadata !{i32 720942, metadata !82, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 17, metadata !30, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 17] [def] [scope 0] [main]
-!30 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!29 = metadata !{metadata !"0x2e\00main\00main\00\0017\000\001\000\006\00256\000\000", metadata !82, metadata !6, metadata !30, null, i32 (i32, i8**)* @main, null, null, null} ; [ DW_TAG_subprogram ] [line 17] [def] [scope 0] [main]
+!30 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !31, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !31 = metadata !{metadata !12, metadata !12, metadata !32}
-!32 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !33} ; [ DW_TAG_pointer_type ]
-!33 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !34} ; [ DW_TAG_pointer_type ]
-!34 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!32 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !33} ; [ DW_TAG_pointer_type ]
+!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !34} ; [ DW_TAG_pointer_type ]
+!34 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
 !35 = metadata !{metadata !36}
-!36 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!37 = metadata !{i32 720942, metadata !82, null, metadata !"bar", metadata !"bar", metadata !"_ZN3barC1Ei", i32 13, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.bar*, i32)* @_ZN3barC1Ei, null, metadata !21, null, i32 0} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
+!36 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!37 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3barC1Ei\0013\000\001\000\006\00256\000\000", metadata !82, null, metadata !22, null, void (%struct.bar*, i32)* @_ZN3barC1Ei, null, metadata !21, null} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
 !38 = metadata !{metadata !39}
-!39 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!40 = metadata !{i32 720942, metadata !82, null, metadata !"bar", metadata !"bar", metadata !"_ZN3barC2Ei", i32 13, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.bar*, i32)* @_ZN3barC2Ei, null, metadata !21, null, i32 0} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
+!39 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!40 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3barC2Ei\0013\000\001\000\006\00256\000\000", metadata !82, null, metadata !22, null, void (%struct.bar*, i32)* @_ZN3barC2Ei, null, metadata !21, null} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
 !41 = metadata !{metadata !42}
-!42 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!43 = metadata !{i32 720942, metadata !82, null, metadata !"baz", metadata !"baz", metadata !"_ZN3bazC1Ei", i32 6, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.baz*, i32)* @_ZN3bazC1Ei, null, metadata !13, null, i32 0} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
+!42 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!43 = metadata !{metadata !"0x2e\00baz\00baz\00_ZN3bazC1Ei\006\000\001\000\006\00256\000\000", metadata !82, null, metadata !14, null, void (%struct.baz*, i32)* @_ZN3bazC1Ei, null, metadata !13, null} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
 !44 = metadata !{metadata !45}
-!45 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!46 = metadata !{i32 720942, metadata !82, null, metadata !"baz", metadata !"baz", metadata !"_ZN3bazC2Ei", i32 6, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.baz*, i32)* @_ZN3bazC2Ei, null, metadata !13, null, i32 0} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
-!49 = metadata !{i32 721153, metadata !29, metadata !"argc", metadata !6, i32 16777232, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!45 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!46 = metadata !{metadata !"0x2e\00baz\00baz\00_ZN3bazC2Ei\006\000\001\000\006\00256\000\000", metadata !82, null, metadata !14, null, void (%struct.baz*, i32)* @_ZN3bazC2Ei, null, metadata !13, null} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
+!49 = metadata !{metadata !"0x101\00argc\0016777232\000", metadata !29, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
 !50 = metadata !{i32 16, i32 14, metadata !29, null}
-!51 = metadata !{i32 721153, metadata !29, metadata !"argv", metadata !6, i32 33554448, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!51 = metadata !{metadata !"0x101\00argv\0033554448\000", metadata !29, metadata !6, metadata !32} ; [ DW_TAG_arg_variable ]
 !52 = metadata !{i32 16, i32 27, metadata !29, null}
-!53 = metadata !{i32 721152, metadata !54, metadata !"myBar", metadata !6, i32 18, metadata !5, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!54 = metadata !{i32 720907, metadata !82, metadata !29, i32 17, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!53 = metadata !{metadata !"0x100\00myBar\0018\000", metadata !54, metadata !6, metadata !5} ; [ DW_TAG_auto_variable ]
+!54 = metadata !{metadata !"0xb\0017\001\000", metadata !82, metadata !29} ; [ DW_TAG_lexical_block ]
 !55 = metadata !{i32 18, i32 9, metadata !54, null}
 !56 = metadata !{i32 18, i32 17, metadata !54, null}
 !57 = metadata !{i32 19, i32 5, metadata !54, null}
-!58 = metadata !{i32 721153, metadata !37, metadata !"this", metadata !6, i32 16777229, metadata !24, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!58 = metadata !{metadata !"0x101\00this\0016777229\0064", metadata !37, metadata !6, metadata !24} ; [ DW_TAG_arg_variable ]
 !59 = metadata !{i32 13, i32 5, metadata !37, null}
-!60 = metadata !{i32 721153, metadata !37, metadata !"x", metadata !6, i32 33554445, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!60 = metadata !{metadata !"0x101\00x\0033554445\000", metadata !37, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
 !61 = metadata !{i32 13, i32 13, metadata !37, null}
 !62 = metadata !{i32 13, i32 34, metadata !37, null}
-!63 = metadata !{i32 721153, metadata !40, metadata !"this", metadata !6, i32 16777229, metadata !24, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!63 = metadata !{metadata !"0x101\00this\0016777229\0064", metadata !40, metadata !6, metadata !24} ; [ DW_TAG_arg_variable ]
 !64 = metadata !{i32 13, i32 5, metadata !40, null}
-!65 = metadata !{i32 721153, metadata !40, metadata !"x", metadata !6, i32 33554445, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!65 = metadata !{metadata !"0x101\00x\0033554445\000", metadata !40, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
 !66 = metadata !{i32 13, i32 13, metadata !40, null}
 !67 = metadata !{i32 13, i32 33, metadata !40, null}
 !68 = metadata !{i32 13, i32 34, metadata !69, null}
-!69 = metadata !{i32 720907, metadata !82, metadata !40, i32 13, i32 33, i32 1} ; [ DW_TAG_lexical_block ]
-!70 = metadata !{i32 721153, metadata !43, metadata !"this", metadata !6, i32 16777222, metadata !16, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!69 = metadata !{metadata !"0xb\0013\0033\001", metadata !82, metadata !40} ; [ DW_TAG_lexical_block ]
+!70 = metadata !{metadata !"0x101\00this\0016777222\0064", metadata !43, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ]
 !71 = metadata !{i32 6, i32 5, metadata !43, null}
-!72 = metadata !{i32 721153, metadata !43, metadata !"a", metadata !6, i32 33554438, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!72 = metadata !{metadata !"0x101\00a\0033554438\000", metadata !43, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
 !73 = metadata !{i32 6, i32 13, metadata !43, null}
 !74 = metadata !{i32 6, i32 24, metadata !43, null}
-!75 = metadata !{i32 721153, metadata !46, metadata !"this", metadata !6, i32 16777222, metadata !16, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!75 = metadata !{metadata !"0x101\00this\0016777222\0064", metadata !46, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ]
 !76 = metadata !{i32 6, i32 5, metadata !46, null}
-!77 = metadata !{i32 721153, metadata !46, metadata !"a", metadata !6, i32 33554438, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!77 = metadata !{metadata !"0x101\00a\0033554438\000", metadata !46, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ]
 !78 = metadata !{i32 6, i32 13, metadata !46, null}
 !79 = metadata !{i32 6, i32 23, metadata !46, null}
 !80 = metadata !{i32 6, i32 24, metadata !81, null}
-!81 = metadata !{i32 720907, metadata !82, metadata !46, i32 6, i32 23, i32 2} ; [ DW_TAG_lexical_block ]
+!81 = metadata !{metadata !"0xb\006\0023\002", metadata !82, metadata !46} ; [ DW_TAG_lexical_block ]
 !82 = metadata !{metadata !"main.cpp", metadata !"/Users/echristo/tmp/bad-struct-ref"}
-!83 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!83 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/DW_AT_byte_size.ll b/test/DebugInfo/X86/DW_AT_byte_size.ll
index 59921bd..2ce5ed5 100644
--- a/test/DebugInfo/X86/DW_AT_byte_size.ll
+++ b/test/DebugInfo/X86/DW_AT_byte_size.ll

@@ -14,33 +14,33 @@
 entry:
   %a.addr = alloca %struct.A*, align 8
   store %struct.A* %a, %struct.A** %a.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.A** %a.addr}, metadata !16), !dbg !17
+  call void @llvm.dbg.declare(metadata !{%struct.A** %a.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !17
   %0 = load %struct.A** %a.addr, align 8, !dbg !18
   %b = getelementptr inbounds %struct.A* %0, i32 0, i32 0, !dbg !18
   %1 = load i32* %b, align 4, !dbg !18
   ret i32 %1, !dbg !18
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.1 (trunk 150996)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.1 (trunk 150996)\000\00\000\00\000", metadata !20, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !20, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooP1A", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%struct.A*)* @_Z3fooP1A, null, null, null, i32 3} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooP1A\003\000\001\000\006\00256\000\003", metadata !20, metadata !6, metadata !7, null, i32 (%struct.A*)* @_Z3fooP1A, null, null, null} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786434, metadata !20, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !20, null, null, metadata !12, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786445, metadata !20, metadata !11, metadata !"b", i32 1, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ]
-!16 = metadata !{i32 786689, metadata !5, metadata !"a", metadata !6, i32 16777219, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!13 = metadata !{metadata !"0xd\00b\001\0032\0032\000\000", metadata !20, metadata !11, metadata !9} ; [ DW_TAG_member ]
+!16 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
 !17 = metadata !{i32 3, i32 13, metadata !5, null}
 !18 = metadata !{i32 4, i32 3, metadata !19, null}
-!19 = metadata !{i32 786443, metadata !20, metadata !5, i32 3, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
+!19 = metadata !{metadata !"0xb\003\0016\000", metadata !20, metadata !5} ; [ DW_TAG_lexical_block ]
 !20 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/DW_AT_linkage_name.ll b/test/DebugInfo/X86/DW_AT_linkage_name.ll
index dce234a..ca3b85f 100644
--- a/test/DebugInfo/X86/DW_AT_linkage_name.ll
+++ b/test/DebugInfo/X86/DW_AT_linkage_name.ll

@@ -18,14 +18,14 @@
 ; Test that we do emit a linkage name for a specific instance of it.
 
 ; CHECK: DW_TAG_subprogram
-; CHECK: [[A_DTOR:.*]]:     DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
 ; CHECK: DW_AT_name {{.*}} "~A"
 ; CHECK-NOT: DW_AT_MIPS_linkage_name
 ; CHECK: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_MIPS_linkage_name {{.*}} "_ZN1AD2Ev"
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_specification {{.*}}[[A_DTOR]]
+; CHECK: DW_AT_specification {{.*}} "~A"
 
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
@@ -38,20 +38,20 @@
 entry:
   %this.addr = alloca %struct.A*, align 8
   store %struct.A* %this, %struct.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.A** %this.addr}, metadata !26), !dbg !28
+  call void @llvm.dbg.declare(metadata !{%struct.A** %this.addr}, metadata !26, metadata !{metadata !"0x102"}), !dbg !28
   %this1 = load %struct.A** %this.addr
   ret void, !dbg !29
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind ssp uwtable
 define void @_ZN1AD1Ev(%struct.A* %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca %struct.A*, align 8
   store %struct.A* %this, %struct.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.A** %this.addr}, metadata !30), !dbg !31
+  call void @llvm.dbg.declare(metadata !{%struct.A** %this.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
   %this1 = load %struct.A** %this.addr
   call void @_ZN1AD2Ev(%struct.A* %this1), !dbg !32
   ret void, !dbg !33
@@ -61,7 +61,7 @@
 define void @_Z3foov() #2 {
 entry:
   %a = alloca %struct.A, align 1
-  call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !34), !dbg !35
+  call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !34, metadata !{metadata !"0x102"}), !dbg !35
   call void @_ZN1AC1Ei(%struct.A* %a, i32 1), !dbg !35
   call void @_ZN1AD1Ev(%struct.A* %a), !dbg !36
   ret void, !dbg !36
@@ -77,40 +77,40 @@
 !llvm.module.flags = !{!23, !24}
 !llvm.ident = !{!25}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !16, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [linkage-name.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !16, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [linkage-name.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"linkage-name.cpp", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"A", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00A\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6, metadata !12}
-!6 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"", i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !11, i32 2} ; [ DW_TAG_subprogram ] [line 2] [A]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x2e\00A\00A\00\002\000\000\000\006\00256\000\002", metadata !1, metadata !"_ZTS1A", metadata !7, null, null, null, i32 0, metadata !11} ; [ DW_TAG_subprogram ] [line 2] [A]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !10}
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !11 = metadata !{i32 786468}
-!12 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"~A", metadata !"~A", metadata !"", i32 3, metadata !13, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !15, i32 3} ; [ DW_TAG_subprogram ] [line 3] [~A]
-!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x2e\00~A\00~A\00\003\000\000\000\006\00256\000\003", metadata !1, metadata !"_ZTS1A", metadata !13, null, null, null, i32 0, metadata !15} ; [ DW_TAG_subprogram ] [line 3] [~A]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{null, metadata !9}
 !15 = metadata !{i32 786468}
 !16 = metadata !{metadata !17, metadata !18, metadata !19}
-!17 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"~A", metadata !"~A", metadata !"_ZN1AD2Ev", i32 6, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.A*)* @_ZN1AD2Ev, null, metadata !12, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [~A]
-!18 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"~A", metadata !"~A", metadata !"_ZN1AD1Ev", i32 6, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.A*)* @_ZN1AD1Ev, null, metadata !12, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [~A]
-!19 = metadata !{i32 786478, metadata !1, metadata !20, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 10, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3foov, null, null, metadata !2, i32 10} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
-!20 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [linkage-name.cpp]
-!21 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD2Ev\006\000\001\000\006\00256\000\006", metadata !1, metadata !"_ZTS1A", metadata !13, null, void (%struct.A*)* @_ZN1AD2Ev, null, metadata !12, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~A]
+!18 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD1Ev\006\000\001\000\006\00256\000\006", metadata !1, metadata !"_ZTS1A", metadata !13, null, void (%struct.A*)* @_ZN1AD1Ev, null, metadata !12, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [~A]
+!19 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\0010\000\001\000\006\00256\000\0010", metadata !1, metadata !20, metadata !21, null, void ()* @_Z3foov, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
+!20 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [linkage-name.cpp]
+!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{null}
 !23 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !25 = metadata !{metadata !"clang version 3.5.0 "}
-!26 = metadata !{i32 786689, metadata !17, metadata !"this", null, i32 16777216, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!27 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!26 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !17, null, metadata !27} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
 !28 = metadata !{i32 0, i32 0, metadata !17, null}
-!29 = metadata !{i32 8, i32 0, metadata !17, null} ; [ DW_TAG_imported_declaration ]
-!30 = metadata !{i32 786689, metadata !18, metadata !"this", null, i32 16777216, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!29 = metadata !{i32 8, i32 0, metadata !17, null}
+!30 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !18, null, metadata !27} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !31 = metadata !{i32 0, i32 0, metadata !18, null}
 !32 = metadata !{i32 6, i32 0, metadata !18, null}
-!33 = metadata !{i32 8, i32 0, metadata !18, null} ; [ DW_TAG_imported_declaration ]
-!34 = metadata !{i32 786688, metadata !19, metadata !"a", metadata !20, i32 11, metadata !"_ZTS1A", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 11]
+!33 = metadata !{i32 8, i32 0, metadata !18, null}
+!34 = metadata !{metadata !"0x100\00a\0011\000", metadata !19, metadata !20, metadata !"_ZTS1A"} ; [ DW_TAG_auto_variable ] [a] [line 11]
 !35 = metadata !{i32 11, i32 0, metadata !19, null}
 !36 = metadata !{i32 12, i32 0, metadata !19, null}

diff --git a/test/DebugInfo/X86/DW_AT_location-reference.ll b/test/DebugInfo/X86/DW_AT_location-reference.ll
index f31b0ad..874ecd6 100644
--- a/test/DebugInfo/X86/DW_AT_location-reference.ll
+++ b/test/DebugInfo/X86/DW_AT_location-reference.ll

@@ -64,7 +64,7 @@
 entry:
   %call = tail call i32 @g(i32 0, i32 0) nounwind, !dbg !8
   store i32 %call, i32* @a, align 4, !dbg !8
-  tail call void @llvm.dbg.value(metadata !12, i64 0, metadata !5), !dbg !13
+  tail call void @llvm.dbg.value(metadata !12, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !13
   br label %while.body
 
 while.body:                                       ; preds = %entry, %while.body
@@ -75,10 +75,10 @@
   br i1 %tobool, label %while.end, label %while.body, !dbg !14
 
 while.end:                                        ; preds = %while.body
-  tail call void @llvm.dbg.value(metadata !{i32 %mul}, i64 0, metadata !5), !dbg !14
+  tail call void @llvm.dbg.value(metadata !{i32 %mul}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !14
   %call4 = tail call i32 @g(i32 %mul, i32 0) nounwind, !dbg !15
   store i32 %call4, i32* @a, align 4, !dbg !15
-  tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !5), !dbg !17
+  tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !17
   br label %while.body9
 
 while.body9:                                      ; preds = %while.end, %while.body9
@@ -89,7 +89,7 @@
   br i1 %tobool8, label %while.end13, label %while.body9, !dbg !18
 
 while.end13:                                      ; preds = %while.body9
-  tail call void @llvm.dbg.value(metadata !{i32 %mul12}, i64 0, metadata !5), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %mul12}, i64 0, metadata !5, metadata !{metadata !"0x102"}), !dbg !18
   %call15 = tail call i32 @g(i32 0, i32 %mul12) nounwind, !dbg !19
   store i32 %call15, i32* @a, align 4, !dbg !19
   ret void, !dbg !20
@@ -97,19 +97,19 @@
 
 declare i32 @g(i32, i32)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!24}
 
-!0 = metadata !{i32 786478, metadata !23, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @f, null, null, metadata !22, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [f]
-!1 = metadata !{i32 786473, metadata !23} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !23, i32 12, metadata !"clang version 3.0 (trunk)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !21, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !23, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00f\00f\00\004\000\001\000\006\00256\001\004", metadata !23, metadata !1, metadata !3, null, void ()* @f, null, null, metadata !22} ; [ DW_TAG_subprogram ] [line 4] [def] [f]
+!1 = metadata !{metadata !"0x29", metadata !23} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk)\001\00\000\00\001", metadata !23, metadata !4, metadata !4, metadata !21, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !23, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 786688, metadata !6, metadata !"x", metadata !1, i32 5, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
-!6 = metadata !{i32 786443, metadata !23, metadata !0, i32 4, i32 14, i32 0} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x100\00x\005\000", metadata !6, metadata !1, metadata !7} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{metadata !"0xb\004\0014\000", metadata !23, metadata !0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
 !8 = metadata !{i32 6, i32 3, metadata !6, null}
 !12 = metadata !{i32 1}
 !13 = metadata !{i32 7, i32 3, metadata !6, null}
@@ -123,4 +123,4 @@
 !21 = metadata !{metadata !0}
 !22 = metadata !{metadata !5}
 !23 = metadata !{metadata !"simple.c", metadata !"/home/rengol01/temp/tests/dwarf/relocation"}
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll
index 4b9fae8..ca4beb2 100644
--- a/test/DebugInfo/X86/DW_AT_object_pointer.ll
+++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll

@@ -17,21 +17,21 @@
   %.addr = alloca i32, align 4
   %a = alloca %class.A, align 4
   store i32 %0, i32* %.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %.addr}, metadata !36), !dbg !35
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !21), !dbg !23
+  call void @llvm.dbg.declare(metadata !{i32* %.addr}, metadata !36, metadata !{metadata !"0x102"}), !dbg !35
+  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !21, metadata !{metadata !"0x102"}), !dbg !23
   call void @_ZN1AC1Ev(%class.A* %a), !dbg !24
   %m_a = getelementptr inbounds %class.A* %a, i32 0, i32 0, !dbg !25
   %1 = load i32* %m_a, align 4, !dbg !25
   ret i32 %1, !dbg !25
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define linkonce_odr void @_ZN1AC1Ev(%class.A* %this) unnamed_addr nounwind uwtable ssp align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !26), !dbg !28
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !26, metadata !{metadata !"0x102"}), !dbg !28
   %this1 = load %class.A** %this.addr
   call void @_ZN1AC2Ev(%class.A* %this1), !dbg !29
   ret void, !dbg !29
@@ -41,7 +41,7 @@
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !30), !dbg !31
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
   %this1 = load %class.A** %this.addr
   %m_a = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !32
   store i32 0, i32* %m_a, align 4, !dbg !32
@@ -51,40 +51,40 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{i32 786449, metadata !37, i32 4, metadata !"clang version 3.2 (trunk 163586) (llvm/trunk 163570)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 163586) (llvm/trunk 163570)\000\00\000\00\000", metadata !37, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !10, metadata !20}
-!5 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3fooi, null, null, metadata !1, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
-!6 = metadata !{i32 786473, metadata !37} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi\007\000\001\000\006\00256\000\007", metadata !6, metadata !6, metadata !7, null, i32 (i32)* @_Z3fooi, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
+!6 = metadata !{metadata !"0x29", metadata !37} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786478, metadata !6, null, metadata !"A", metadata !"A", metadata !"_ZN1AC1Ev", i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC1Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
-!11 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC1Ev\003\000\001\000\006\00256\000\003", metadata !6, null, metadata !11, null, void (%class.A*)* @_ZN1AC1Ev, null, metadata !17, metadata !1} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!14 = metadata !{i32 786434, metadata !37, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!14 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !37, null, null, metadata !15, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
 !15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{i32 786445, metadata !37, metadata !14, metadata !"m_a", i32 4, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [m_a] [line 4, size 32, align 32, offset 0] [from int]
-!17 = metadata !{i32 786478, metadata !6, metadata !14, metadata !"A", metadata !"A", metadata !"", i32 3, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [A]
+!16 = metadata !{metadata !"0xd\00m_a\004\0032\0032\000\000", metadata !37, metadata !14, metadata !9} ; [ DW_TAG_member ] [m_a] [line 4, size 32, align 32, offset 0] [from int]
+!17 = metadata !{metadata !"0x2e\00A\00A\00\003\000\000\000\006\00256\000\003", metadata !6, metadata !14, metadata !11, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 3] [A]
 !18 = metadata !{metadata !19}
-!19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!20 = metadata !{i32 786478, metadata !6, null, metadata !"A", metadata !"A", metadata !"_ZN1AC2Ev", i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC2Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
-!21 = metadata !{i32 786688, metadata !22, metadata !"a", metadata !6, i32 8, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 8]
-!22 = metadata !{i32 786443, metadata !6, metadata !5, i32 7, i32 11, i32 0} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2Ev\003\000\001\000\006\00256\000\003", metadata !6, null, metadata !11, null, void (%class.A*)* @_ZN1AC2Ev, null, metadata !17, metadata !1} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!21 = metadata !{metadata !"0x100\00a\008\000", metadata !22, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [a] [line 8]
+!22 = metadata !{metadata !"0xb\007\0011\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
 !23 = metadata !{i32 8, i32 5, metadata !22, null}
 !24 = metadata !{i32 8, i32 6, metadata !22, null}
 !25 = metadata !{i32 9, i32 3, metadata !22, null}
-!26 = metadata !{i32 786689, metadata !10, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3]
-!27 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!26 = metadata !{metadata !"0x101\00this\0016777219\001088", metadata !10, metadata !6, metadata !27} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
 !28 = metadata !{i32 3, i32 3, metadata !10, null}
 !29 = metadata !{i32 3, i32 18, metadata !10, null}
-!30 = metadata !{i32 786689, metadata !20, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!30 = metadata !{metadata !"0x101\00this\0016777219\001088", metadata !20, metadata !6, metadata !27} ; [ DW_TAG_arg_variable ] [this] [line 3]
 !31 = metadata !{i32 3, i32 3, metadata !20, null}
 !32 = metadata !{i32 3, i32 9, metadata !33, null}
-!33 = metadata !{i32 786443, metadata !6, metadata !20, i32 3, i32 7, i32 1} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!33 = metadata !{metadata !"0xb\003\007\001", metadata !6, metadata !20} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
 !34 = metadata !{i32 3, i32 18, metadata !33, null}
 !35 = metadata !{i32 7, i32 0, metadata !5, null}
-!36 = metadata !{i32 786689, metadata !5, metadata !"", metadata !6, i32 16777223, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 7]
+!36 = metadata !{metadata !"0x101\00\0016777223\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [line 7]
 !37 = metadata !{metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests"}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/DW_AT_specification.ll b/test/DebugInfo/X86/DW_AT_specification.ll
index 4f45f36..93aa47e 100644
--- a/test/DebugInfo/X86/DW_AT_specification.ll
+++ b/test/DebugInfo/X86/DW_AT_specification.ll

@@ -3,11 +3,11 @@
 
 ; test that the DW_AT_specification is a back edge in the file.
 
-; CHECK: [[BAR_DECL:0x[0-9a-f]*]]: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
 ; CHECK-NEXT: DW_AT_MIPS_linkage_name {{.*}} "_ZN3foo3barEv"
 ; CHECK: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_specification {{.*}} {[[BAR_DECL]]}
+; CHECK: DW_AT_specification {{.*}} "_ZN3foo3barEv"
 
 
 @_ZZN3foo3barEvE1x = constant i32 0, align 4
@@ -20,23 +20,23 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!28}
 
-!0 = metadata !{i32 786449, metadata !27, i32 4, metadata !"clang version 3.0 ()", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.0 ()\000\00\000\00\000", metadata !27, metadata !1, metadata !1, metadata !3, metadata !18,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !6, null, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN3foo3barEv, null, metadata !11, null, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [bar]
-!6 = metadata !{i32 720937, metadata !27} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\004\000\001\000\006\00256\000\004", metadata !6, null, metadata !7, null, void ()* @_ZN3foo3barEv, null, metadata !11, null} ; [ DW_TAG_subprogram ] [line 4] [def] [bar]
+!6 = metadata !{metadata !"0x29", metadata !27} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 786451, metadata !27, null, metadata !"foo", i32 1, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
-!11 = metadata !{i32 720942, metadata !6, metadata !12, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 2} ; [ DW_TAG_subprogram ]
-!12 = metadata !{i32 720898, metadata !27, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !13, i32 0, null, null} ; [ DW_TAG_class_type ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x13\00foo\001\000\000\000\004\000", metadata !27, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!11 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\002\000\000\000\006\00256\000\002", metadata !6, metadata !12, metadata !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
+!12 = metadata !{metadata !"0x2\00foo\001\008\008\000\000\000", metadata !27, null, null, metadata !13, null, null} ; [ DW_TAG_class_type ]
 !13 = metadata !{metadata !11}
 !18 = metadata !{metadata !20}
-!20 = metadata !{i32 720948, i32 0, metadata !5, metadata !"x", metadata !"x", metadata !"", metadata !6, i32 5, metadata !21, i32 1, i32 1, i32* @_ZZN3foo3barEvE1x, null} ; [ DW_TAG_variable ]
-!21 = metadata !{i32 720934, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_const_type ]
-!22 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!20 = metadata !{metadata !"0x34\00x\00x\00\005\001\001", metadata !5, metadata !6, metadata !21, i32* @_ZZN3foo3barEvE1x, null} ; [ DW_TAG_variable ]
+!21 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !22} ; [ DW_TAG_const_type ]
+!22 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !25 = metadata !{i32 6, i32 1, metadata !26, null}
-!26 = metadata !{i32 786443, metadata !6, metadata !5, i32 4, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{metadata !"0xb\004\0017\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
 !27 = metadata !{metadata !"nsNativeAppSupportBase.ii", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/toolkit/library"}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll b/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll
index f16cbb0..d54774d 100644
--- a/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll
+++ b/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll

@@ -30,15 +30,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test.c", metadata !"C:\5CProjects"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !10 = metadata !{i32 3, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/DW_TAG_friend.ll b/test/DebugInfo/X86/DW_TAG_friend.ll
index 2facc40..23d5c81 100644
--- a/test/DebugInfo/X86/DW_TAG_friend.ll
+++ b/test/DebugInfo/X86/DW_TAG_friend.ll

@@ -18,31 +18,31 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{i32 786449, metadata !28, i32 4, metadata !"clang version 3.1 (trunk 153413) (llvm/trunk 153428)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.1 (trunk 153413) (llvm/trunk 153428)\000\00\000\00\000", metadata !28, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !17}
-!5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 10, metadata !7, i32 0, i32 1, %class.A* @a, null} ; [ DW_TAG_variable ]
-!6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786434, metadata !28, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x34\00a\00a\00\0010\000\001", null, metadata !6, metadata !7, %class.A* @a, null} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !28, null, null, metadata !8, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !11}
-!9 = metadata !{i32 786445, metadata !28, metadata !7, metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 1, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786478, metadata !6, metadata !7, metadata !"A", metadata !"A", metadata !"", i32 1, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !15, i32 1} ; [ DW_TAG_subprogram ]
-!12 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0xd\00a\002\0032\0032\000\001", metadata !28, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0x2e\00A\00A\00\001\000\000\000\006\00320\000\001", metadata !6, metadata !7, metadata !12, null, null, null, i32 0, metadata !15} ; [ DW_TAG_subprogram ]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{null, metadata !14}
-!14 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !7} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !7} ; [ DW_TAG_pointer_type ]
 !15 = metadata !{metadata !16}
-!16 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!17 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !6, i32 11, metadata !18, i32 0, i32 1, %class.B* @b, null} ; [ DW_TAG_variable ]
-!18 = metadata !{i32 786434, metadata !28, null, metadata !"B", i32 5, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_class_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
+!16 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0x34\00b\00b\00\0011\000\001", null, metadata !6, metadata !18, %class.B* @b, null} ; [ DW_TAG_variable ]
+!18 = metadata !{metadata !"0x2\00B\005\0032\0032\000\000\000", metadata !28, null, null, metadata !19, null, null, null} ; [ DW_TAG_class_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
 !19 = metadata !{metadata !20, metadata !21, metadata !27}
-!20 = metadata !{i32 786445, metadata !28, metadata !18, metadata !"b", i32 7, i64 32, i64 32, i64 0, i32 1, metadata !10} ; [ DW_TAG_member ]
-!21 = metadata !{i32 786478, metadata !6, metadata !18, metadata !"B", metadata !"B", metadata !"", i32 5, metadata !22, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !25, i32 5} ; [ DW_TAG_subprogram ]
-!22 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !"0xd\00b\007\0032\0032\000\001", metadata !28, metadata !18, metadata !10} ; [ DW_TAG_member ]
+!21 = metadata !{metadata !"0x2e\00B\00B\00\005\000\000\000\006\00320\000\005", metadata !6, metadata !18, metadata !22, null, null, null, i32 0, metadata !25} ; [ DW_TAG_subprogram ]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !23 = metadata !{null, metadata !24}
-!24 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !18} ; [ DW_TAG_pointer_type ]
+!24 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !18} ; [ DW_TAG_pointer_type ]
 !25 = metadata !{metadata !26}
-!26 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!27 = metadata !{i32 786474, metadata !18, null, metadata !6, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_friend ]
+!26 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!27 = metadata !{metadata !"0x2a\00\000\000\000\000\000", metadata !18, null, metadata !7} ; [ DW_TAG_friend ]
 !28 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/aligned_stack_var.ll b/test/DebugInfo/X86/aligned_stack_var.ll
index 54484ac..9dea6b7 100644
--- a/test/DebugInfo/X86/aligned_stack_var.ll
+++ b/test/DebugInfo/X86/aligned_stack_var.ll

@@ -18,26 +18,26 @@
 define void @_Z3runv() nounwind uwtable {
 entry:
   %x = alloca i32, align 32
-  call void @llvm.dbg.declare(metadata !{i32* %x}, metadata !9), !dbg !12
+  call void @llvm.dbg.declare(metadata !{i32* %x}, metadata !9, metadata !{metadata !"0x102"}), !dbg !12
   ret void, !dbg !13
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!15}
 
-!0 = metadata !{i32 786449, metadata !14, i32 4, metadata !"clang version 3.2 (trunk 155696:155697) (llvm/trunk 155696)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 155696:155697) (llvm/trunk 155696)\000\00\000\00\000", metadata !14, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !14, metadata !6, metadata !"run", metadata !"run", metadata !"_Z3runv", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00run\00run\00_Z3runv\001\000\001\000\006\00256\000\001", metadata !14, metadata !6, metadata !7, null, void ()* @_Z3runv, null, null, metadata !1} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
-!9 = metadata !{i32 786688, metadata !10, metadata !"x", metadata !6, i32 2, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{i32 786443, metadata !14, metadata !5, i32 1, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x100\00x\002\000", metadata !10, metadata !6, metadata !11} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{metadata !"0xb\001\0012\000", metadata !14, metadata !5} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !12 = metadata !{i32 2, i32 7, metadata !10, null}
 !13 = metadata !{i32 3, i32 1, metadata !10, null}
 !14 = metadata !{metadata !"test.cc", metadata !"/home/samsonov/debuginfo"}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/arange.ll b/test/DebugInfo/X86/arange.ll
index 4eea646..d773e87 100644
--- a/test/DebugInfo/X86/arange.ll
+++ b/test/DebugInfo/X86/arange.ll

@@ -29,18 +29,18 @@
 !llvm.module.flags = !{!12, !13}
 !llvm.ident = !{!14}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !9, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/simple.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !2, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/simple.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"simple.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"foo<&i>", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, metadata !5, metadata !"_ZTS3fooIXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [foo<&i>] [line 3, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00foo<&i>\003\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS3fooIXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [foo<&i>] [line 3, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786480, null, metadata !"x", metadata !7, i32* @i, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!7 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!6 = metadata !{metadata !"0x30\00x\000\000", null, metadata !7, i32* @i, null} ; [ DW_TAG_template_value_parameter ]
+!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !11, i32 6, metadata !4, i32 0, i32 1, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
-!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/simple.cpp]
+!10 = metadata !{metadata !"0x34\00f\00f\00\006\000\001", null, metadata !11, metadata !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
+!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/simple.cpp]
 !12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !14 = metadata !{metadata !"clang version 3.5 "}

diff --git a/test/DebugInfo/X86/arguments.ll b/test/DebugInfo/X86/arguments.ll
index 989e4ff..779db48 100644
--- a/test/DebugInfo/X86/arguments.ll
+++ b/test/DebugInfo/X86/arguments.ll

@@ -31,8 +31,8 @@
 ; Function Attrs: nounwind uwtable
 define void @_Z4func3fooS_(%struct.foo* %f, %struct.foo* %g) #0 {
 entry:
-  call void @llvm.dbg.declare(metadata !{%struct.foo* %f}, metadata !19), !dbg !20
-  call void @llvm.dbg.declare(metadata !{%struct.foo* %g}, metadata !21), !dbg !20
+  call void @llvm.dbg.declare(metadata !{%struct.foo* %f}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
+  call void @llvm.dbg.declare(metadata !{%struct.foo* %g}, metadata !21, metadata !{metadata !"0x102"}), !dbg !20
   %i = getelementptr inbounds %struct.foo* %f, i32 0, i32 0, !dbg !22
   %0 = load i32* %i, align 4, !dbg !22
   %inc = add nsw i32 %0, 1, !dbg !22
@@ -41,7 +41,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -49,28 +49,28 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!24}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"scratch.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_Z4func3fooS_", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.foo*, %struct.foo*)* @_Z4func3fooS_, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4func3fooS_\006\000\001\000\006\00256\000\006", metadata !1, metadata !5, metadata !6, null, void (%struct.foo*, %struct.foo*)* @_Z4func3fooS_, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8, metadata !8}
-!8 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
+!8 = metadata !{metadata !"0x13\00foo\001\0032\0032\000\000\000", metadata !1, null, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !12}
-!10 = metadata !{i32 786445, metadata !1, metadata !8, metadata !"i", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !11} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 0] [from int]
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{i32 786478, metadata !1, metadata !8, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !13, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !18, i32 2} ; [ DW_TAG_subprogram ] [line 2] [foo]
-!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0xd\00i\003\0032\0032\000\000", metadata !1, metadata !8, metadata !11} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 0] [from int]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\000\000\006\00256\000\002", metadata !1, metadata !8, metadata !13, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 2] [foo]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{null, metadata !15, metadata !16}
-!15 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
-!16 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
+!16 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
 !18 = metadata !{i32 786468}
-!19 = metadata !{i32 786689, metadata !4, metadata !"f", metadata !5, i32 16777222, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [f] [line 6]
+!19 = metadata !{metadata !"0x101\00f\0016777222\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [f] [line 6]
 !20 = metadata !{i32 6, i32 0, metadata !4, null}
-!21 = metadata !{i32 786689, metadata !4, metadata !"g", metadata !5, i32 33554438, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [g] [line 6]
+!21 = metadata !{metadata !"0x101\00g\0033554438\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [g] [line 6]
 !22 = metadata !{i32 7, i32 0, metadata !4, null}
-!23 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!23 = metadata !{i32 8, i32 0, metadata !4, null}
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/array.ll b/test/DebugInfo/X86/array.ll
index dc6c7a4..3fbfb1d 100644
--- a/test/DebugInfo/X86/array.ll
+++ b/test/DebugInfo/X86/array.ll

@@ -25,7 +25,7 @@
 
 ; Function Attrs: nounwind ssp uwtable
 define void @f(i32* nocapture %p) #0 {
-  tail call void @llvm.dbg.value(metadata !{i32* %p}, i64 0, metadata !11), !dbg !28
+  tail call void @llvm.dbg.value(metadata !{i32* %p}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !28
   store i32 42, i32* %p, align 4, !dbg !29, !tbaa !30
   ret void, !dbg !34
 }
@@ -33,15 +33,15 @@
 ; Function Attrs: nounwind ssp uwtable
 define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
   %array = alloca [4 x i32], align 16
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !19), !dbg !35
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !20), !dbg !35
-  tail call void @llvm.dbg.value(metadata !{[4 x i32]* %array}, i64 0, metadata !21), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !35
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !35
+  tail call void @llvm.dbg.value(metadata !{[4 x i32]* %array}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !36
   %1 = bitcast [4 x i32]* %array to i8*, !dbg !36
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([4 x i32]* @main.array to i8*), i64 16, i32 16, i1 false), !dbg !36
-  tail call void @llvm.dbg.value(metadata !{[4 x i32]* %array}, i64 0, metadata !21), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{[4 x i32]* %array}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !36
   %2 = getelementptr inbounds [4 x i32]* %array, i64 0, i64 0, !dbg !37
   call void @f(i32* %2), !dbg !37
-  tail call void @llvm.dbg.value(metadata !{[4 x i32]* %array}, i64 0, metadata !21), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{[4 x i32]* %array}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !36
   %3 = load i32* %2, align 16, !dbg !38, !tbaa !30
   ret i32 %3, !dbg !38
 }
@@ -50,7 +50,7 @@
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { nounwind ssp uwtable }
 attributes #1 = { nounwind }
@@ -60,33 +60,33 @@
 !llvm.module.flags = !{!25, !26}
 !llvm.ident = !{!27}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/array.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/array.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"array.c", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !12}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*)* @f, null, null, metadata !10, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/array.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*)* @f, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/array.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
-!8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786689, metadata !4, metadata !"p", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 1]
-!12 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 5, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !18, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
-!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0x101\00p\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [p] [line 1]
+!12 = metadata !{metadata !"0x2e\00main\00main\00\005\000\001\000\006\00256\001\005", metadata !1, metadata !5, metadata !13, null, i32 (i32, i8**)* @main, null, null, metadata !18} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{metadata !9, metadata !9, metadata !15}
-!15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!16 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!17 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!17 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !18 = metadata !{metadata !19, metadata !20, metadata !21}
-!19 = metadata !{i32 786689, metadata !12, metadata !"argc", metadata !5, i32 16777221, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 5]
-!20 = metadata !{i32 786689, metadata !12, metadata !"argv", metadata !5, i32 33554437, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 5]
-!21 = metadata !{i32 786688, metadata !12, metadata !"array", metadata !5, i32 6, metadata !22, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [array] [line 6]
-!22 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 128, i64 32, i32 0, i32 0, metadata !9, metadata !23, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from int]
+!19 = metadata !{metadata !"0x101\00argc\0016777221\000", metadata !12, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [argc] [line 5]
+!20 = metadata !{metadata !"0x101\00argv\0033554437\000", metadata !12, metadata !5, metadata !15} ; [ DW_TAG_arg_variable ] [argv] [line 5]
+!21 = metadata !{metadata !"0x100\00array\006\000", metadata !12, metadata !5, metadata !22} ; [ DW_TAG_auto_variable ] [array] [line 6]
+!22 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, null, metadata !9, metadata !23, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from int]
 !23 = metadata !{metadata !24}
-!24 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
+!24 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
 !25 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !27 = metadata !{metadata !"clang version 3.5.0 "}
 !28 = metadata !{i32 1, i32 0, metadata !4, null}
 !29 = metadata !{i32 2, i32 0, metadata !4, null}
@@ -98,4 +98,4 @@
 !35 = metadata !{i32 5, i32 0, metadata !12, null}
 !36 = metadata !{i32 6, i32 0, metadata !12, null}
 !37 = metadata !{i32 7, i32 0, metadata !12, null}
-!38 = metadata !{i32 8, i32 0, metadata !12, null} ; [ DW_TAG_imported_declaration ]
+!38 = metadata !{i32 8, i32 0, metadata !12, null}

diff --git a/test/DebugInfo/X86/array2.ll b/test/DebugInfo/X86/array2.ll
index 2dc2af3..e2d42e8 100644
--- a/test/DebugInfo/X86/array2.ll
+++ b/test/DebugInfo/X86/array2.ll

@@ -29,7 +29,7 @@
 entry:
   %p.addr = alloca i32*, align 8
   store i32* %p, i32** %p.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32** %p.addr}, metadata !19), !dbg !20
+  call void @llvm.dbg.declare(metadata !{i32** %p.addr}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
   %0 = load i32** %p.addr, align 8, !dbg !21
   %arrayidx = getelementptr inbounds i32* %0, i64 0, !dbg !21
   store i32 42, i32* %arrayidx, align 4, !dbg !21
@@ -37,7 +37,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind ssp uwtable
 define i32 @main(i32 %argc, i8** %argv) #0 {
@@ -48,10 +48,10 @@
   %array = alloca [4 x i32], align 16
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !23), !dbg !24
+  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !25), !dbg !24
-  call void @llvm.dbg.declare(metadata !{[4 x i32]* %array}, metadata !26), !dbg !30
+  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !25, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata !{[4 x i32]* %array}, metadata !26, metadata !{metadata !"0x102"}), !dbg !30
   %0 = bitcast [4 x i32]* %array to i8*, !dbg !30
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([4 x i32]* @main.array to i8*), i64 16, i32 16, i1 false), !dbg !30
   %arraydecay = getelementptr inbounds [4 x i32]* %array, i32 0, i32 0, !dbg !31
@@ -72,36 +72,36 @@
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [array.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [array.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"array.c", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*)* @f, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [array.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32*)* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [array.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
-!8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 5, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x2e\00main\00main\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !11, null, i32 (i32, i8**)* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !9, metadata !9, metadata !13}
-!13 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!15 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !15} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!15 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !16 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !18 = metadata !{metadata !"clang version 3.5.0 "}
-!19 = metadata !{i32 786689, metadata !4, metadata !"p", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 1]
+!19 = metadata !{metadata !"0x101\00p\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [p] [line 1]
 !20 = metadata !{i32 1, i32 0, metadata !4, null}
 !21 = metadata !{i32 2, i32 0, metadata !4, null}
 !22 = metadata !{i32 3, i32 0, metadata !4, null}
-!23 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !5, i32 16777221, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 5]
+!23 = metadata !{metadata !"0x101\00argc\0016777221\000", metadata !10, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [argc] [line 5]
 !24 = metadata !{i32 5, i32 0, metadata !10, null}
-!25 = metadata !{i32 786689, metadata !10, metadata !"argv", metadata !5, i32 33554437, metadata !13, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 5]
-!26 = metadata !{i32 786688, metadata !10, metadata !"array", metadata !5, i32 6, metadata !27, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [array] [line 6]
-!27 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 128, i64 32, i32 0, i32 0, metadata !9, metadata !28, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from int]
+!25 = metadata !{metadata !"0x101\00argv\0033554437\000", metadata !10, metadata !5, metadata !13} ; [ DW_TAG_arg_variable ] [argv] [line 5]
+!26 = metadata !{metadata !"0x100\00array\006\000", metadata !10, metadata !5, metadata !27} ; [ DW_TAG_auto_variable ] [array] [line 6]
+!27 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, null, metadata !9, metadata !28, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from int]
 !28 = metadata !{metadata !29}
-!29 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
+!29 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
 !30 = metadata !{i32 6, i32 0, metadata !10, null}
 !31 = metadata !{i32 7, i32 0, metadata !10, null}
-!32 = metadata !{i32 8, i32 0, metadata !10, null} ; [ DW_TAG_imported_declaration ]
+!32 = metadata !{i32 8, i32 0, metadata !10, null}

diff --git a/test/DebugInfo/X86/block-capture.ll b/test/DebugInfo/X86/block-capture.ll
index e842afe..e59aa05 100644
--- a/test/DebugInfo/X86/block-capture.ll
+++ b/test/DebugInfo/X86/block-capture.ll

@@ -17,15 +17,15 @@
 %struct.__block_descriptor = type { i64, i64 }
 %struct.__block_literal_generic = type { i8*, i32, i32, i8*, %struct.__block_descriptor* }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define hidden void @__foo_block_invoke_0(i8* %.block_descriptor) uwtable ssp {
 entry:
   %exn.slot = alloca i8*
   %ehselector.slot = alloca i32
-  call void @llvm.dbg.value(metadata !{i8* %.block_descriptor}, i64 0, metadata !39), !dbg !51
+  call void @llvm.dbg.value(metadata !{i8* %.block_descriptor}, i64 0, metadata !39, metadata !{metadata !"0x102"}), !dbg !51
   %block = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void ()* }>*, !dbg !52
-  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void ()* }>* %block}, metadata !53), !dbg !54
+  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void ()* }>* %block}, metadata !53, metadata !65), !dbg !54
   %block.capture.addr = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, void ()* }>* %block, i32 0, i32 5, !dbg !55
   %0 = load void ()** %block.capture.addr, align 8, !dbg !55
   %block.literal = bitcast void ()* %0 to %struct.__block_literal_generic*, !dbg !55
@@ -58,7 +58,7 @@
   br label %eh.cont, !dbg !58
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 declare i8* @objc_begin_catch(i8*)
 
@@ -69,65 +69,65 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!35, !36, !37, !38, !64}
 
-!0 = metadata !{i32 786449, metadata !63, i32 16, metadata !"clang version 3.1 (trunk 151227)", i1 false, metadata !"", i32 2, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0016\00clang version 3.1 (trunk 151227)\000\00\002\00\001", metadata !63, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !28, metadata !31, metadata !34}
-!5 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 5} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !63} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\001\000\006\00256\000\005", metadata !6, metadata !6, metadata !7, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !63} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786454, metadata !63, null, metadata !"dispatch_block_t", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786451, metadata !63, metadata !6, metadata !"__block_literal_generic", i32 5, i64 256, i64 0, i32 0, i32 8, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 5, size 256, align 0, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0x16\00dispatch_block_t\001\000\000\000\000", metadata !63, null, metadata !10} ; [ DW_TAG_typedef ]
+!10 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0x13\00__block_literal_generic\005\00256\000\000\008\000", metadata !63, metadata !6, null, metadata !12, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 5, size 256, align 0, offset 0] [def] [from ]
 !12 = metadata !{metadata !13, metadata !15, metadata !17, metadata !18, metadata !19}
-!13 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__isa", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_member ]
-!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!15 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__flags", i32 0, i64 32, i64 32, i64 64, i32 0, metadata !16} ; [ DW_TAG_member ]
-!16 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!17 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__reserved", i32 0, i64 32, i64 32, i64 96, i32 0, metadata !16} ; [ DW_TAG_member ]
-!18 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__FuncPtr", i32 0, i64 64, i64 64, i64 128, i32 0, metadata !14} ; [ DW_TAG_member ]
-!19 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__descriptor", i32 5, i64 64, i64 64, i64 192, i32 0, metadata !20} ; [ DW_TAG_member ]
-!20 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !21} ; [ DW_TAG_pointer_type ]
-!21 = metadata !{i32 786451, metadata !63, metadata !6, metadata !"__block_descriptor", i32 5, i64 128, i64 0, i32 0, i32 8, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 5, size 128, align 0, offset 0] [def] [from ]
+!13 = metadata !{metadata !"0xd\00__isa\000\0064\0064\000\000", metadata !63, metadata !6, metadata !14} ; [ DW_TAG_member ]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ]
+!15 = metadata !{metadata !"0xd\00__flags\000\0032\0032\0064\000", metadata !63, metadata !6, metadata !16} ; [ DW_TAG_member ]
+!16 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0xd\00__reserved\000\0032\0032\0096\000", metadata !63, metadata !6, metadata !16} ; [ DW_TAG_member ]
+!18 = metadata !{metadata !"0xd\00__FuncPtr\000\0064\0064\00128\000", metadata !63, metadata !6, metadata !14} ; [ DW_TAG_member ]
+!19 = metadata !{metadata !"0xd\00__descriptor\005\0064\0064\00192\000", metadata !63, metadata !6, metadata !20} ; [ DW_TAG_member ]
+!20 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !21} ; [ DW_TAG_pointer_type ]
+!21 = metadata !{metadata !"0x13\00__block_descriptor\005\00128\000\000\008\000", metadata !63, metadata !6, null, metadata !22, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 5, size 128, align 0, offset 0] [def] [from ]
 !22 = metadata !{metadata !23, metadata !25}
-!23 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"reserved", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_member ]
-!24 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!25 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"Size", i32 0, i64 64, i64 64, i64 64, i32 0, metadata !24} ; [ DW_TAG_member ]
-!28 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"__foo_block_invoke_0", metadata !"__foo_block_invoke_0", metadata !"", i32 7, metadata !29, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*)* @__foo_block_invoke_0, null, null, null, i32 7} ; [ DW_TAG_subprogram ]
-!29 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{metadata !"0xd\00reserved\000\0064\0064\000\000", metadata !63, metadata !6, metadata !24} ; [ DW_TAG_member ]
+!24 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ]
+!25 = metadata !{metadata !"0xd\00Size\000\0064\0064\0064\000", metadata !63, metadata !6, metadata !24} ; [ DW_TAG_member ]
+!28 = metadata !{metadata !"0x2e\00__foo_block_invoke_0\00__foo_block_invoke_0\00\007\001\001\000\006\00256\000\007", metadata !6, metadata !6, metadata !29, null, void (i8*)* @__foo_block_invoke_0, null, null, null} ; [ DW_TAG_subprogram ]
+!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !30, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !30 = metadata !{null, metadata !14}
-!31 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"__copy_helper_block_", metadata !"__copy_helper_block_", metadata !"", i32 10, metadata !32, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 10} ; [ DW_TAG_subprogram ]
-!32 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !33, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!31 = metadata !{metadata !"0x2e\00__copy_helper_block_\00__copy_helper_block_\00\0010\001\001\000\006\00256\000\0010", metadata !6, metadata !6, metadata !32, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !33 = metadata !{null, metadata !14, metadata !14}
-!34 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"__destroy_helper_block_", metadata !"__destroy_helper_block_", metadata !"", i32 10, metadata !29, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 10} ; [ DW_TAG_subprogram ]
+!34 = metadata !{metadata !"0x2e\00__destroy_helper_block_\00__destroy_helper_block_\00\0010\001\001\000\006\00256\000\0010", metadata !6, metadata !6, metadata !29, null, null, null, null, null} ; [ DW_TAG_subprogram ]
 !35 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
 !36 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
 !37 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
 !38 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!39 = metadata !{i32 786689, metadata !28, metadata !".block_descriptor", metadata !6, i32 16777223, metadata !40, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!40 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !41} ; [ DW_TAG_pointer_type ]
-!41 = metadata !{i32 786451, metadata !63, metadata !6, metadata !"__block_literal_1", i32 7, i64 320, i64 64, i32 0, i32 0, null, metadata !42, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 7, size 320, align 64, offset 0] [def] [from ]
+!39 = metadata !{metadata !"0x101\00.block_descriptor\0016777223\0064", metadata !28, metadata !6, metadata !40} ; [ DW_TAG_arg_variable ]
+!40 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !41} ; [ DW_TAG_pointer_type ]
+!41 = metadata !{metadata !"0x13\00__block_literal_1\007\00320\0064\000\000\000", metadata !63, metadata !6, null, metadata !42, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 7, size 320, align 64, offset 0] [def] [from ]
 !42 = metadata !{metadata !43, metadata !44, metadata !45, metadata !46, metadata !47, metadata !50}
-!43 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__isa", i32 7, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_member ]
-!44 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__flags", i32 7, i64 32, i64 32, i64 64, i32 0, metadata !16} ; [ DW_TAG_member ]
-!45 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__reserved", i32 7, i64 32, i64 32, i64 96, i32 0, metadata !16} ; [ DW_TAG_member ]
-!46 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__FuncPtr", i32 7, i64 64, i64 64, i64 128, i32 0, metadata !14} ; [ DW_TAG_member ]
-!47 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__descriptor", i32 7, i64 64, i64 64, i64 192, i32 0, metadata !48} ; [ DW_TAG_member ]
-!48 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !49} ; [ DW_TAG_pointer_type ]
-!49 = metadata !{i32 786451, metadata !63, null, metadata !"__block_descriptor_withcopydispose", i32 7, i32 0, i32 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 7, size 0, align 0, offset 0] [decl] [from ]
-!50 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"block", i32 7, i64 64, i64 64, i64 256, i32 0, metadata !9} ; [ DW_TAG_member ]
+!43 = metadata !{metadata !"0xd\00__isa\007\0064\0064\000\000", metadata !63, metadata !6, metadata !14} ; [ DW_TAG_member ]
+!44 = metadata !{metadata !"0xd\00__flags\007\0032\0032\0064\000", metadata !63, metadata !6, metadata !16} ; [ DW_TAG_member ]
+!45 = metadata !{metadata !"0xd\00__reserved\007\0032\0032\0096\000", metadata !63, metadata !6, metadata !16} ; [ DW_TAG_member ]
+!46 = metadata !{metadata !"0xd\00__FuncPtr\007\0064\0064\00128\000", metadata !63, metadata !6, metadata !14} ; [ DW_TAG_member ]
+!47 = metadata !{metadata !"0xd\00__descriptor\007\0064\0064\00192\000", metadata !63, metadata !6, metadata !48} ; [ DW_TAG_member ]
+!48 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !49} ; [ DW_TAG_pointer_type ]
+!49 = metadata !{metadata !"0x13\00__block_descriptor_withcopydispose\007\000\000\000\004\000", metadata !63, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 7, size 0, align 0, offset 0] [decl] [from ]
+!50 = metadata !{metadata !"0xd\00block\007\0064\0064\00256\000", metadata !63, metadata !6, metadata !9} ; [ DW_TAG_member ]
 !51 = metadata !{i32 7, i32 18, metadata !28, null}
 !52 = metadata !{i32 7, i32 19, metadata !28, null}
-!53 = metadata !{i32 786688, metadata !28, metadata !"block", metadata !6, i32 5, metadata !9, i32 0, i32 0, metadata !65} ; [ DW_TAG_auto_variable ]
+!53 = metadata !{metadata !"0x100\00block\005\000", metadata !28, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
 !54 = metadata !{i32 5, i32 27, metadata !28, null}
 !55 = metadata !{i32 8, i32 22, metadata !56, null}
-!56 = metadata !{i32 786443, metadata !6, metadata !57, i32 7, i32 26, i32 2} ; [ DW_TAG_lexical_block ]
-!57 = metadata !{i32 786443, metadata !6, metadata !28, i32 7, i32 19, i32 1} ; [ DW_TAG_lexical_block ]
+!56 = metadata !{metadata !"0xb\007\0026\002", metadata !6, metadata !57} ; [ DW_TAG_lexical_block ]
+!57 = metadata !{metadata !"0xb\007\0019\001", metadata !6, metadata !28} ; [ DW_TAG_lexical_block ]
 !58 = metadata !{i32 10, i32 20, metadata !59, null}
-!59 = metadata !{i32 786443, metadata !6, metadata !60, i32 9, i32 35, i32 4} ; [ DW_TAG_lexical_block ]
-!60 = metadata !{i32 786443, metadata !6, metadata !57, i32 9, i32 35, i32 3} ; [ DW_TAG_lexical_block ]
+!59 = metadata !{metadata !"0xb\009\0035\004", metadata !6, metadata !60} ; [ DW_TAG_lexical_block ]
+!60 = metadata !{metadata !"0xb\009\0035\003", metadata !6, metadata !57} ; [ DW_TAG_lexical_block ]
 !61 = metadata !{i32 10, i32 21, metadata !28, null}
 !62 = metadata !{i32 9, i32 20, metadata !56, null}
 !63 = metadata !{metadata !"foo.m", metadata !"/Users/echristo"}
-!64 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!65 = metadata !{i64 1, i64 32}
+!64 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!65 = metadata !{metadata !"0x102\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_plus 32]

diff --git a/test/DebugInfo/X86/byvalstruct.ll b/test/DebugInfo/X86/byvalstruct.ll
index d787ef3..0570950 100644
--- a/test/DebugInfo/X86/byvalstruct.ll
+++ b/test/DebugInfo/X86/byvalstruct.ll

@@ -66,20 +66,20 @@
   %otherBitmap.addr = alloca %0*, align 8
   %length.addr = alloca i64, align 8
   store %0* %self, %0** %self.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%0** %self.addr}, metadata !28), !dbg !29
+  call void @llvm.dbg.declare(metadata !{%0** %self.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !29
   store i8* %_cmd, i8** %_cmd.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %_cmd.addr}, metadata !30), !dbg !29
+  call void @llvm.dbg.declare(metadata !{i8** %_cmd.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !29
   store %0* %otherBitmap, %0** %otherBitmap.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%0** %otherBitmap.addr}, metadata !32), !dbg !29
-  call void @llvm.dbg.declare(metadata !{%struct.ImageInfo* %info}, metadata !33), !dbg !34
+  call void @llvm.dbg.declare(metadata !{%0** %otherBitmap.addr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !29
+  call void @llvm.dbg.declare(metadata !{%struct.ImageInfo* %info}, metadata !33, metadata !{metadata !"0x102"}), !dbg !34
   store i64 %length, i64* %length.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %length.addr}, metadata !35), !dbg !36
+  call void @llvm.dbg.declare(metadata !{i64* %length.addr}, metadata !35, metadata !{metadata !"0x102"}), !dbg !36
   %0 = load i8** %retval, !dbg !37
   ret i8* %0, !dbg !37
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { ssp uwtable }
 attributes #1 = { nounwind readnone }
@@ -87,42 +87,42 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!24, !25, !26, !27, !38}
 
-!0 = metadata !{i32 786449, metadata !1, i32 17, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 2, metadata !2, metadata !3, metadata !6, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/t.mm] [DW_LANG_ObjC_plus_plus]
+!0 = metadata !{metadata !"0x11\0017\00clang version 3.4 \000\00\002\00\000", metadata !1, metadata !2, metadata !3, metadata !6, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/t.mm] [DW_LANG_ObjC_plus_plus]
 !1 = metadata !{metadata !"t.mm", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, metadata !5, metadata !"Bitmap", i32 8, i64 8, i64 8, i32 0, i32 512, null, metadata !2, i32 17, null, null, null} ; [ DW_TAG_structure_type ] [Bitmap] [line 8, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/t.mm]
+!4 = metadata !{metadata !"0x13\00Bitmap\008\008\008\000\00512\0017", metadata !1, metadata !5, null, metadata !2, null, null, null} ; [ DW_TAG_structure_type ] [Bitmap] [line 8, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/t.mm]
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"-[Bitmap initWithCopy:andInfo:andLength:]", metadata !"-[Bitmap initWithCopy:andInfo:andLength:]", metadata !"", i32 9, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, i8* (%0*, i8*, %0*, %struct.ImageInfo*, i64)* @"\01-[Bitmap initWithCopy:andInfo:andLength:]", null, null, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [local] [def] [-[Bitmap initWithCopy:andInfo:andLength:]]
-!8 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00-[Bitmap initWithCopy:andInfo:andLength:]\00-[Bitmap initWithCopy:andInfo:andLength:]\00\009\001\001\000\006\00256\000\009", metadata !1, metadata !5, metadata !8, null, i8* (%0*, i8*, %0*, %struct.ImageInfo*, i64)* @"\01-[Bitmap initWithCopy:andInfo:andLength:]", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [local] [def] [-[Bitmap initWithCopy:andInfo:andLength:]]
+!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !9 = metadata !{metadata !4, metadata !10, metadata !11, metadata !14, metadata !15, metadata !19}
-!10 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Bitmap]
-!11 = metadata !{i32 786454, metadata !1, null, metadata !"SEL", i32 9, i64 0, i64 0, i64 0, i32 64, metadata !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [artificial] [from ]
-!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
-!13 = metadata !{i32 786451, metadata !1, null, metadata !"objc_selector", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Bitmap]
-!15 = metadata !{i32 786454, metadata !1, null, metadata !"ImageInfo", i32 7, i64 0, i64 0, i64 0, i32 0, metadata !16} ; [ DW_TAG_typedef ] [ImageInfo] [line 7, size 0, align 0, offset 0] [from ]
-!16 = metadata !{i32 786451, metadata !1, null, metadata !"", i32 2, i64 192, i64 64, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 2, size 192, align 64, offset 0] [def] [from ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Bitmap]
+!11 = metadata !{metadata !"0x16\00SEL\009\000\000\000\0064", metadata !1, null, metadata !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [artificial] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
+!13 = metadata !{metadata !"0x13\00objc_selector\000\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Bitmap]
+!15 = metadata !{metadata !"0x16\00ImageInfo\007\000\000\000\000", metadata !1, null, metadata !16} ; [ DW_TAG_typedef ] [ImageInfo] [line 7, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0x13\00\002\00192\0064\000\000\000", metadata !1, null, null, metadata !17, null, null, null} ; [ DW_TAG_structure_type ] [line 2, size 192, align 64, offset 0] [def] [from ]
 !17 = metadata !{metadata !18, metadata !21, metadata !22}
-!18 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"width", i32 4, i64 64, i64 64, i64 0, i32 0, metadata !19} ; [ DW_TAG_member ] [width] [line 4, size 64, align 64, offset 0] [from NSUInteger]
-!19 = metadata !{i32 786454, metadata !1, null, metadata !"NSUInteger", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_typedef ] [NSUInteger] [line 1, size 0, align 0, offset 0] [from long unsigned int]
-!20 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!21 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"height", i32 5, i64 64, i64 64, i64 64, i32 0, metadata !19} ; [ DW_TAG_member ] [height] [line 5, size 64, align 64, offset 64] [from NSUInteger]
-!22 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"pixelAspect", i32 6, i64 64, i64 64, i64 128, i32 0, metadata !23} ; [ DW_TAG_member ] [pixelAspect] [line 6, size 64, align 64, offset 128] [from double]
-!23 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!18 = metadata !{metadata !"0xd\00width\004\0064\0064\000\000", metadata !1, metadata !16, metadata !19} ; [ DW_TAG_member ] [width] [line 4, size 64, align 64, offset 0] [from NSUInteger]
+!19 = metadata !{metadata !"0x16\00NSUInteger\001\000\000\000\000", metadata !1, null, metadata !20} ; [ DW_TAG_typedef ] [NSUInteger] [line 1, size 0, align 0, offset 0] [from long unsigned int]
+!20 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!21 = metadata !{metadata !"0xd\00height\005\0064\0064\0064\000", metadata !1, metadata !16, metadata !19} ; [ DW_TAG_member ] [height] [line 5, size 64, align 64, offset 64] [from NSUInteger]
+!22 = metadata !{metadata !"0xd\00pixelAspect\006\0064\0064\00128\000", metadata !1, metadata !16, metadata !23} ; [ DW_TAG_member ] [pixelAspect] [line 6, size 64, align 64, offset 128] [from double]
+!23 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
 !24 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
 !25 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
 !26 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
 !27 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!28 = metadata !{i32 786689, metadata !7, metadata !"self", metadata !5, i32 16777225, metadata !14, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [self] [line 9]
+!28 = metadata !{metadata !"0x101\00self\0016777225\001088", metadata !7, metadata !5, metadata !14} ; [ DW_TAG_arg_variable ] [self] [line 9]
 !29 = metadata !{i32 9, i32 0, metadata !7, null}
-!30 = metadata !{i32 786689, metadata !7, metadata !"_cmd", metadata !5, i32 33554441, metadata !31, i32 64, i32 0} ; [ DW_TAG_arg_variable ] [_cmd] [line 9]
-!31 = metadata !{i32 786454, metadata !1, null, metadata !"SEL", i32 9, i64 0, i64 0, i64 0, i32 0, metadata !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [from ]
-!32 = metadata !{i32 786689, metadata !7, metadata !"otherBitmap", metadata !5, i32 50331657, metadata !14, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [otherBitmap] [line 9]
-!33 = metadata !{i32 786689, metadata !7, metadata !"info", metadata !5, i32 67108874, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [info] [line 10]
+!30 = metadata !{metadata !"0x101\00_cmd\0033554441\0064", metadata !7, metadata !5, metadata !31} ; [ DW_TAG_arg_variable ] [_cmd] [line 9]
+!31 = metadata !{metadata !"0x16\00SEL\009\000\000\000\000", metadata !1, null, metadata !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [from ]
+!32 = metadata !{metadata !"0x101\00otherBitmap\0050331657\000", metadata !7, metadata !5, metadata !14} ; [ DW_TAG_arg_variable ] [otherBitmap] [line 9]
+!33 = metadata !{metadata !"0x101\00info\0067108874\000", metadata !7, metadata !5, metadata !15} ; [ DW_TAG_arg_variable ] [info] [line 10]
 !34 = metadata !{i32 10, i32 0, metadata !7, null}
-!35 = metadata !{i32 786689, metadata !7, metadata !"length", metadata !5, i32 83886091, metadata !19, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [length] [line 11]
+!35 = metadata !{metadata !"0x101\00length\0083886091\000", metadata !7, metadata !5, metadata !19} ; [ DW_TAG_arg_variable ] [length] [line 11]
 !36 = metadata !{i32 11, i32 0, metadata !7, null}
 !37 = metadata !{i32 13, i32 0, metadata !7, null}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/c-type-units.ll b/test/DebugInfo/X86/c-type-units.ll
index 431b029..9326e31 100644
--- a/test/DebugInfo/X86/c-type-units.ll
+++ b/test/DebugInfo/X86/c-type-units.ll

@@ -17,13 +17,13 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/simple.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/simple.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"simple.c", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !5, i32 2, metadata !6, i32 0, i32 1, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 2] [def]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/simple.c]
-!6 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 0, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 0, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x34\00f\00f\00\002\000\001", null, metadata !5, metadata !6, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 2] [def]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/simple.c]
+!6 = metadata !{metadata !"0x13\00foo\001\000\008\000\000\000", metadata !1, null, null, metadata !2, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 0, align 8, offset 0] [def] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5 "}

diff --git a/test/DebugInfo/X86/coff_debug_info_type.ll b/test/DebugInfo/X86/coff_debug_info_type.ll
index a1051c3..ec85944 100644
--- a/test/DebugInfo/X86/coff_debug_info_type.ll
+++ b/test/DebugInfo/X86/coff_debug_info_type.ll

@@ -1,10 +1,12 @@
-; RUN: llc -mtriple=i686-pc-mingw32 -filetype=asm -O0 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-pc-cygwin -filetype=asm -O0 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-w64-mingw32 -filetype=asm -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-pc-mingw32 -dwarf-accel-tables=Enable -filetype=asm -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-pc-cygwin -dwarf-accel-tables=Enable -filetype=asm -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-w64-mingw32 -dwarf-accel-tables=Enable -filetype=asm -O0 < %s | FileCheck %s
 ; CHECK:    .section  .debug_info
+; CHECK:    .section  .apple_names
+; CHECK:    .section  .apple_types
 
 ; RUN: llc -mtriple=i686-pc-win32 -filetype=asm -O0 < %s | FileCheck -check-prefix=WIN32 %s
-; WIN32:    .section .debug$S,"rnd"
+; WIN32:    .section .debug$S,"rd"
 
 ; RUN: llc -mtriple=i686-pc-win32 -filetype=null -O0 < %s
 
@@ -27,15 +29,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test.c", metadata !"C:\5CProjects"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !10 = metadata !{i32 3, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/coff_relative_names.ll b/test/DebugInfo/X86/coff_relative_names.ll
index 3b4854e..067992d 100644
--- a/test/DebugInfo/X86/coff_relative_names.ll
+++ b/test/DebugInfo/X86/coff_relative_names.ll

@@ -23,15 +23,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test.c", metadata !"C:\5CProjects"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !10 = metadata !{i32 3, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/concrete_out_of_line.ll b/test/DebugInfo/X86/concrete_out_of_line.ll
index ac038f3..43f881e 100644
--- a/test/DebugInfo/X86/concrete_out_of_line.ll
+++ b/test/DebugInfo/X86/concrete_out_of_line.ll

@@ -8,54 +8,54 @@
 
 ; CHECK: DW_TAG_class_type
 ; CHECK:   DW_TAG_subprogram
-; CHECK: [[ASSIGN_DECL:0x........]]:  DW_TAG_subprogram
+; CHECK:   DW_TAG_subprogram
+; CHECK:     DW_AT_MIPS_linkage_name {{.*}} "_ZN12nsAutoRefCntaSEi"
 
 ; CHECK: DW_TAG_class_type
-; CHECK: [[RELEASE_DECL:0x........]]:  DW_TAG_subprogram
-; CHECK: [[DTOR_DECL:0x........]]:  DW_TAG_subprogram
+; CHECK:   DW_TAG_subprogram
+; CHECK:     DW_AT_MIPS_linkage_name {{.*}} "_ZN17nsAutoRefCnt7ReleaseEv"
+; CHECK:   DW_TAG_subprogram
+; CHECK:     DW_AT_name {{.*}} "~nsAutoRefCnt"
 
-; CHECK: [[D2_ABS:.*]]: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
 ; CHECK-NEXT:     DW_AT_{{.*}}linkage_name {{.*}}D2
-; CHECK-NEXT:     DW_AT_specification {{.*}} {[[DTOR_DECL]]}
+; CHECK-NEXT:     DW_AT_specification {{.*}} "~nsAutoRefCnt"
 ; CHECK-NEXT:     DW_AT_inline
 ; CHECK-NOT:      DW_AT
 ; CHECK: DW_TAG
-; CHECK: [[D1_ABS:.*]]: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
 ; CHECK-NEXT:     DW_AT_{{.*}}linkage_name {{.*}}D1
-; CHECK-NEXT:     DW_AT_specification {{.*}} {[[DTOR_DECL]]}
+; CHECK-NEXT:     DW_AT_specification {{.*}} "~nsAutoRefCnt"
 ; CHECK-NEXT:     DW_AT_inline
 ; CHECK-NOT:     DW_AT
 ; CHECK: [[D1_THIS_ABS:.*]]: DW_TAG_formal_parameter
 
-; CHECK: [[RELEASE:0x........]]: DW_TAG_subprogram
-; CHECK:     DW_AT_specification {{.*}} {[[RELEASE_DECL]]}
+; CHECK: DW_TAG_subprogram
+; CHECK:     DW_AT_specification {{.*}} "_ZN17nsAutoRefCnt7ReleaseEv"
 ; CHECK: DW_TAG_formal_parameter
 ; CHECK-NOT: NULL
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_TAG_lexical_block
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}} "_ZN12nsAutoRefCntaSEi"
 ; CHECK-NOT: NULL
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_TAG_inlined_subroutine
-; CHECK-NEXT: DW_AT_abstract_origin {{.*}} {[[ASSIGN:0x........]]}
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}} "_ZN17nsAutoRefCntD1Ev"
 ; CHECK-NOT: NULL
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_TAG_inlined_subroutine
-; CHECK-NEXT: DW_AT_abstract_origin {{.*}} {[[D1_ABS]]}
-; CHECK-NOT: NULL
-; CHECK-NOT: DW_TAG
-; CHECK: DW_TAG_inlined_subroutine
-; CHECK-NEXT: DW_AT_abstract_origin {{.*}} {[[D2_ABS]]}
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}} "_ZN17nsAutoRefCntD2Ev"
 
 ; and then that a TAG_subprogram refers to it with AT_abstract_origin.
 
 ; CHECK: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_abstract_origin {{.*}} {[[D1_ABS]]}
+; CHECK: DW_AT_abstract_origin {{.*}} "_ZN17nsAutoRefCntD1Ev"
 ; CHECK: DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_abstract_origin {{.*}} {[[D1_THIS_ABS]]}
+; CHECK: DW_AT_abstract_origin {{.*}} {[[D1_THIS_ABS]]} "this"
 ; CHECK: DW_TAG_inlined_subroutine
-; CHECK-NEXT: DW_AT_abstract_origin {{.*}} {[[D2_ABS]]}
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}} "_ZN17nsAutoRefCntD2Ev"
 
 
 define i32 @_ZN17nsAutoRefCnt7ReleaseEv() {
@@ -76,56 +76,56 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!60}
 
-!0 = metadata !{i32 786449, metadata !59, i32 4, metadata !"clang version 3.1 ()", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !47,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.1 ()\001\00\000\00\000", metadata !59, metadata !1, metadata !1, metadata !3, metadata !47,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !23, metadata !27, metadata !31}
-!5 = metadata !{i32 720942, metadata !6, null, metadata !"Release", metadata !"Release", metadata !"_ZN17nsAutoRefCnt7ReleaseEv", i32 14, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_ZN17nsAutoRefCnt7ReleaseEv , null, metadata !12, metadata !20, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [Release]
-!6 = metadata !{i32 720937, metadata !59} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00Release\00Release\00_ZN17nsAutoRefCnt7ReleaseEv\0014\000\001\000\006\00256\001\0014", metadata !6, null, metadata !7, null, i32 ()* @_ZN17nsAutoRefCnt7ReleaseEv , null, metadata !12, metadata !20} ; [ DW_TAG_subprogram ] [line 14] [def] [Release]
+!6 = metadata !{metadata !"0x29", metadata !59} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10}
-!9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786451, metadata !59, null, metadata !"nsAutoRefCnt", i32 10, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 10, size 0, align 0, offset 0] [decl] [from ]
-!12 = metadata !{i32 720942, metadata !6, metadata !13, metadata !"Release", metadata !"Release", metadata !"_ZN17nsAutoRefCnt7ReleaseEv", i32 11, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 11} ; [ DW_TAG_subprogram ]
-!13 = metadata !{i32 720898, metadata !59, null, metadata !"nsAutoRefCnt", i32 10, i64 8, i64 8, i32 0, i32 0, null, metadata !14, null, null, null} ; [ DW_TAG_class_type ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0x13\00nsAutoRefCnt\0010\000\000\000\004\000", metadata !59, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 10, size 0, align 0, offset 0] [decl] [from ]
+!12 = metadata !{metadata !"0x2e\00Release\00Release\00_ZN17nsAutoRefCnt7ReleaseEv\0011\000\000\000\006\00256\001\0011", metadata !6, metadata !13, metadata !7, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ]
+!13 = metadata !{metadata !"0x2\00nsAutoRefCnt\0010\008\008\000\000\000", metadata !59, null, null, metadata !14, null, null} ; [ DW_TAG_class_type ]
 !14 = metadata !{metadata !12, metadata !15}
-!15 = metadata !{i32 720942, metadata !6, metadata !13, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"", i32 12, metadata !16, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 12} ; [ DW_TAG_subprogram ]
-!16 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x2e\00~nsAutoRefCnt\00~nsAutoRefCnt\00\0012\000\000\000\006\00256\001\0012", metadata !6, metadata !13, metadata !16, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null, metadata !10}
 !18 = metadata !{}
 !20 = metadata !{metadata !22}
-!22 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777230, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{i32 720942, metadata !6, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD1Ev", i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_ZN17nsAutoRefCntD1Ev, null, metadata !15, metadata !24, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
+!22 = metadata !{metadata !"0x101\00this\0016777230\0064", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{metadata !"0x2e\00~nsAutoRefCnt\00~nsAutoRefCnt\00_ZN17nsAutoRefCntD1Ev\0018\000\001\000\006\00256\001\0018", metadata !6, null, metadata !16, null, void ()* @_ZN17nsAutoRefCntD1Ev, null, metadata !15, metadata !24} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
 !24 = metadata !{metadata !26}
-!26 = metadata !{i32 786689, metadata !23, metadata !"this", metadata !6, i32 16777234, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!27 = metadata !{i32 720942, metadata !6, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD2Ev", i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32* null, null, metadata !15, metadata !28, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
+!26 = metadata !{metadata !"0x101\00this\0016777234\0064", metadata !23, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
+!27 = metadata !{metadata !"0x2e\00~nsAutoRefCnt\00~nsAutoRefCnt\00_ZN17nsAutoRefCntD2Ev\0018\000\001\000\006\00256\001\0018", metadata !6, null, metadata !16, null, i32* null, null, metadata !15, metadata !28} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
 !28 = metadata !{metadata !30}
-!30 = metadata !{i32 786689, metadata !27, metadata !"this", metadata !6, i32 16777234, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!31 = metadata !{i32 720942, metadata !6, null, metadata !"operator=", metadata !"operator=", metadata !"_ZN12nsAutoRefCntaSEi", i32 4, metadata !32, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !36, metadata !43, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [operator=]
-!32 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !33, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{metadata !"0x101\00this\0016777234\0064", metadata !27, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
+!31 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN12nsAutoRefCntaSEi\004\000\001\000\006\00256\001\004", metadata !6, null, metadata !32, null, null, null, metadata !36, metadata !43} ; [ DW_TAG_subprogram ] [line 4] [def] [operator=]
+!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !33 = metadata !{metadata !9, metadata !34, metadata !9}
-!34 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !35} ; [ DW_TAG_pointer_type ]
-!35 = metadata !{i32 786451, metadata !59, null, metadata !"nsAutoRefCnt", i32 2, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 2, size 0, align 0, offset 0] [decl] [from ]
-!36 = metadata !{i32 720942, metadata !6, metadata !37, metadata !"operator=", metadata !"operator=", metadata !"_ZN12nsAutoRefCntaSEi", i32 4, metadata !32, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 4} ; [ DW_TAG_subprogram ]
-!37 = metadata !{i32 720898, metadata !59, null, metadata !"nsAutoRefCnt", i32 2, i64 32, i64 32, i32 0, i32 0, null, metadata !38, i32 0, null, null, null} ; [ DW_TAG_class_type ] [nsAutoRefCnt] [line 2, size 32, align 32, offset 0] [def] [from ]
+!34 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !35} ; [ DW_TAG_pointer_type ]
+!35 = metadata !{metadata !"0x13\00nsAutoRefCnt\002\000\000\000\004\000", metadata !59, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 2, size 0, align 0, offset 0] [decl] [from ]
+!36 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN12nsAutoRefCntaSEi\004\000\000\000\006\00256\001\004", metadata !6, metadata !37, metadata !32, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ]
+!37 = metadata !{metadata !"0x2\00nsAutoRefCnt\002\0032\0032\000\000\000", metadata !59, null, null, metadata !38, null, null, null} ; [ DW_TAG_class_type ] [nsAutoRefCnt] [line 2, size 32, align 32, offset 0] [def] [from ]
 !38 = metadata !{metadata !39, metadata !40, metadata !36}
-!39 = metadata !{i32 786445, metadata !59, metadata !37, metadata !"mValue", i32 7, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ]
-!40 = metadata !{i32 720942, metadata !6, metadata !37, metadata !"nsAutoRefCnt", metadata !"nsAutoRefCnt", metadata !"", i32 3, metadata !41, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ]
-!41 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !42, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!39 = metadata !{metadata !"0xd\00mValue\007\0032\0032\000\000", metadata !59, metadata !37, metadata !9} ; [ DW_TAG_member ]
+!40 = metadata !{metadata !"0x2e\00nsAutoRefCnt\00nsAutoRefCnt\00\003\000\000\000\006\00256\001\003", metadata !6, metadata !37, metadata !41, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ]
+!41 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !42, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !42 = metadata !{null, metadata !34}
 !43 = metadata !{metadata !45, metadata !46}
-!45 = metadata !{i32 786689, metadata !31, metadata !"this", metadata !6, i32 16777220, metadata !34, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!46 = metadata !{i32 786689, metadata !31, metadata !"aValue", metadata !6, i32 33554436, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!45 = metadata !{metadata !"0x101\00this\0016777220\0064", metadata !31, metadata !6, metadata !34} ; [ DW_TAG_arg_variable ]
+!46 = metadata !{metadata !"0x101\00aValue\0033554436\000", metadata !31, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
 !47 = metadata !{metadata !49}
-!49 = metadata !{i32 720948, i32 0, null, metadata !"mRefCnt", metadata !"mRefCnt", metadata !"", metadata !6, i32 9, metadata !37, i32 0, i32 1, i32* null, null} ; [ DW_TAG_variable ]
+!49 = metadata !{metadata !"0x34\00mRefCnt\00mRefCnt\00\009\000\001", null, metadata !6, metadata !37, i32* null, null} ; [ DW_TAG_variable ]
 !50 = metadata !{i32 5, i32 5, metadata !51, metadata !52}
-!51 = metadata !{i32 786443, metadata !6, metadata !31, i32 4, i32 29, i32 2} ; [ DW_TAG_lexical_block ]
+!51 = metadata !{metadata !"0xb\004\0029\002", metadata !6, metadata !31} ; [ DW_TAG_lexical_block ]
 !52 = metadata !{i32 15, i32 0, metadata !53, null}
-!53 = metadata !{i32 786443, metadata !6, metadata !5, i32 14, i32 34, i32 0} ; [ DW_TAG_lexical_block ]
+!53 = metadata !{metadata !"0xb\0014\0034\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
 !54 = metadata !{i32 19, i32 3, metadata !55, metadata !56}
-!55 = metadata !{i32 786443, metadata !6, metadata !27, i32 18, i32 41, i32 1} ; [ DW_TAG_lexical_block ]
+!55 = metadata !{metadata !"0xb\0018\0041\001", metadata !6, metadata !27} ; [ DW_TAG_lexical_block ]
 !56 = metadata !{i32 18, i32 41, metadata !23, metadata !52}
 !57 = metadata !{i32 19, i32 3, metadata !55, metadata !58}
 !58 = metadata !{i32 18, i32 41, metadata !23, null}
 !59 = metadata !{metadata !"nsAutoRefCnt.ii", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/netwerk/base/src"}
-!60 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!60 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/cu-ranges-odr.ll b/test/DebugInfo/X86/cu-ranges-odr.ll
index c42a908..b73d33d 100644
--- a/test/DebugInfo/X86/cu-ranges-odr.ll
+++ b/test/DebugInfo/X86/cu-ranges-odr.ll

@@ -35,9 +35,9 @@
   %this.addr = alloca %class.A*, align 8
   %i.addr = alloca i32, align 4
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !27), !dbg !29
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !27, metadata !{metadata !"0x102"}), !dbg !29
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !30), !dbg !31
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
   %this1 = load %class.A** %this.addr
   %a = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !31
   %0 = load i32* %i.addr, align 4, !dbg !31
@@ -46,7 +46,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 define internal void @_GLOBAL__I_a() section ".text.startup" {
 entry:
@@ -61,36 +61,36 @@
 !llvm.module.flags = !{!23, !24}
 !llvm.ident = !{!25}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 (trunk 199923) (llvm/trunk 199940)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !13, metadata !21, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 (trunk 199923) (llvm/trunk 199940)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !13, metadata !21, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"baz.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786434, metadata !1, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !6, metadata !8}
-!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1A", metadata !"a", i32 5, i64 32, i64 32, i64 0, i32 1, metadata !7} ; [ DW_TAG_member ] [a] [line 5, size 32, align 32, offset 0] [private] [from int]
-!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"", i32 3, metadata !9, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !12, i32 3} ; [ DW_TAG_subprogram ] [line 3] [A]
-!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0xd\00a\005\0032\0032\000\001", metadata !1, metadata !"_ZTS1A", metadata !7} ; [ DW_TAG_member ] [a] [line 5, size 32, align 32, offset 0] [private] [from int]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x2e\00A\00A\00\003\000\000\000\006\00256\000\003", metadata !1, metadata !"_ZTS1A", metadata !9, null, null, null, i32 0, metadata !12} ; [ DW_TAG_subprogram ] [line 3] [A]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{null, metadata !11, metadata !7}
-!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
 !12 = metadata !{i32 786468}
 !13 = metadata !{metadata !14, metadata !18, metadata !19}
-!14 = metadata !{i32 786478, metadata !1, metadata !15, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 8, metadata !16, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 8} ; [ DW_TAG_subprogram ] [line 8] [local] [def] [__cxx_global_var_init]
-!15 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/baz.cpp]
-!16 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\008\001\001\000\006\00256\000\008", metadata !1, metadata !15, metadata !16, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [local] [def] [__cxx_global_var_init]
+!15 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/baz.cpp]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null}
-!18 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"_ZN1AC2Ei", i32 3, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*, i32)* @_ZN1AC2Ei, null, metadata !8, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
-!19 = metadata !{i32 786478, metadata !1, metadata !15, metadata !"", metadata !"", metadata !"_GLOBAL__I_a", i32 3, metadata !20, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__I_a, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [local] [def]
-!20 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2Ei\003\000\001\000\006\00256\000\003", metadata !1, metadata !"_ZTS1A", metadata !9, null, void (%class.A*, i32)* @_ZN1AC2Ei, null, metadata !8, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!19 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__I_a\003\001\001\000\006\0064\000\003", metadata !1, metadata !15, metadata !20, null, void ()* @_GLOBAL__I_a, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [local] [def]
+!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !21 = metadata !{metadata !22}
-!22 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !15, i32 8, metadata !4, i32 0, i32 1, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 8] [def]
+!22 = metadata !{metadata !"0x34\00a\00a\00\008\000\001", null, metadata !15, metadata !4, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 8] [def]
 !23 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !25 = metadata !{metadata !"clang version 3.5 (trunk 199923) (llvm/trunk 199940)"}
-!26 = metadata !{i32 8, i32 0, metadata !14, null} ; [ DW_TAG_imported_declaration ]
-!27 = metadata !{i32 786689, metadata !18, metadata !"this", null, i32 16777216, metadata !28, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!28 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!26 = metadata !{i32 8, i32 0, metadata !14, null}
+!27 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !18, null, metadata !28} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!28 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
 !29 = metadata !{i32 0, i32 0, metadata !18, null}
-!30 = metadata !{i32 786689, metadata !18, metadata !"i", metadata !15, i32 33554435, metadata !7, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 3]
+!30 = metadata !{metadata !"0x101\00i\0033554435\000", metadata !18, metadata !15, metadata !7} ; [ DW_TAG_arg_variable ] [i] [line 3]
 !31 = metadata !{i32 3, i32 0, metadata !18, null}
 !32 = metadata !{i32 3, i32 0, metadata !19, null}

diff --git a/test/DebugInfo/X86/cu-ranges.ll b/test/DebugInfo/X86/cu-ranges.ll
index 405a498..a9821b0 100644
--- a/test/DebugInfo/X86/cu-ranges.ll
+++ b/test/DebugInfo/X86/cu-ranges.ll

@@ -1,9 +1,9 @@
 ; RUN: llc -split-dwarf=Enable -O0 %s -function-sections -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o %t
-; RUN: llvm-dwarfdump -debug-dump=all %t | FileCheck --check-prefix=FUNCTION-SECTIONS %s
+; RUN: llvm-dwarfdump -debug-dump=abbrev %t | FileCheck --check-prefix=FUNCTION-SECTIONS %s
 ; RUN: llvm-readobj --relocations %t | FileCheck --check-prefix=FUNCTION-SECTIONS-RELOCS %s
 
 ; RUN: llc -split-dwarf=Enable -O0 %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o %t
-; RUN: llvm-dwarfdump -debug-dump=all %t | FileCheck --check-prefix=NO-FUNCTION-SECTIONS %s
+; RUN: llvm-dwarfdump -debug-dump=abbrev %t | FileCheck --check-prefix=NO-FUNCTION-SECTIONS %s
 
 ; From:
 ; int foo (int a) {
@@ -21,27 +21,29 @@
 
 ; Without function sections enabled make sure that we have no DW_AT_ranges attribute.
 ; NO-FUNCTION-SECTIONS-NOT: DW_AT_ranges
+; NO-FUNCTION-SECTIONS: DW_AT_low_pc DW_FORM_addr
+; NO-FUNCTION-SECTIONS-NOT: DW_AT_ranges
 
 ; Function Attrs: nounwind uwtable
 define i32 @foo(i32 %a) #0 {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !13), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
   %0 = load i32* %a.addr, align 4, !dbg !14
   %add = add nsw i32 %0, 1, !dbg !14
   ret i32 %add, !dbg !14
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind uwtable
 define i32 @bar(i32 %b) #0 {
 entry:
   %b.addr = alloca i32, align 4
   store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !15), !dbg !16
+  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
   %0 = load i32* %b.addr, align 4, !dbg !16
   %add = add nsw i32 %0, 2, !dbg !16
   ret i32 %add, !dbg !16
@@ -54,20 +56,20 @@
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/z.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/z.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"z.c", metadata !"/usr/local/google/home/echristo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/z.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/z.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @bar, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x2e\00bar\00bar\00\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
 !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !12 = metadata !{metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
-!13 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!13 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 1]
 !14 = metadata !{i32 1, i32 0, metadata !4, null}
-!15 = metadata !{i32 786689, metadata !9, metadata !"b", metadata !5, i32 16777218, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 2]
+!15 = metadata !{metadata !"0x101\00b\0016777218\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 2]
 !16 = metadata !{i32 2, i32 0, metadata !9, null}

diff --git a/test/DebugInfo/X86/data_member_location.ll b/test/DebugInfo/X86/data_member_location.ll
index 1c76258..db88bb1 100644
--- a/test/DebugInfo/X86/data_member_location.ll
+++ b/test/DebugInfo/X86/data_member_location.ll

@@ -34,20 +34,20 @@
 !llvm.module.flags = !{!13, !15}
 !llvm.ident = !{!14}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !10, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/data_member_location.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !2, metadata !10, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/data_member_location.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"data_member_location.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 64, i64 32, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00foo\001\0064\0032\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !6, metadata !8}
-!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS3foo", metadata !"c", i32 2, i64 8, i64 8, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [c] [line 2, size 8, align 8, offset 0] [from char]
-!7 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!8 = metadata !{i32 786445, metadata !1, metadata !"_ZTS3foo", metadata !"i", i32 3, i64 32, i64 32, i64 32, i32 0, metadata !9} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 32] [from int]
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!6 = metadata !{metadata !"0xd\00c\002\008\008\000\000", metadata !1, metadata !"_ZTS3foo", metadata !7} ; [ DW_TAG_member ] [c] [line 2, size 8, align 8, offset 0] [from char]
+!7 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!8 = metadata !{metadata !"0xd\00i\003\0032\0032\0032\000", metadata !1, metadata !"_ZTS3foo", metadata !9} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 32] [from int]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !12, i32 6, metadata !4, i32 0, i32 1, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
-!12 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/data_member_location.cpp]
+!11 = metadata !{metadata !"0x34\00f\00f\00\006\000\001", null, metadata !12, metadata !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
+!12 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/data_member_location.cpp]
 !13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !14 = metadata !{metadata !"clang version 3.4 "}
 
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-asm.s b/test/DebugInfo/X86/dbg-asm.s
new file mode 100644
index 0000000..f6e5233b
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-asm.s

@@ -0,0 +1,30 @@
+# RUN: llvm-mc -triple i686-windows-gnu -g %s -filetype obj -o - \
+# RUN:   | llvm-readobj -r - | FileCheck -check-prefix CHECK-COFF %s
+# RUN: llvm-mc -triple i686-windows-itanium -g %s -filetype obj -o - \
+# RUN:   | llvm-readobj -r - | FileCheck -check-prefix CHECK-COFF %s
+# RUN: llvm-mc -triple i686-linux-gnu -g %s -filetype obj -o - \
+# RUN:   | llvm-readobj -r - | FileCheck -check-prefix CHECK-ELF %s
+
+_a:
+	movl $65, %eax
+	ret
+
+# CHECK-COFF: Relocations [
+# CHECK-COFF:   Section {{.*}} .debug_info {
+# CHECK-COFF:     0x6 IMAGE_REL_I386_SECREL .debug_abbrev
+# CHECK-COFF:     0xC IMAGE_REL_I386_SECREL .debug_line
+# CHECK-COFF:   }
+# CHECK-COFF:   Section {{.*}} .debug_aranges {
+# CHECK-COFF:     0x6 IMAGE_REL_I386_SECREL .debug_info
+# CHECK-COFF:   }
+# CHECK-COFF: ]
+
+# CHECK-ELF: Relocations [
+# CHECK-ELF:   Section {{.*}} .rel.debug_info {
+# CHECK-ELF:     0x6 R_386_32 .debug_abbrev
+# CHECK-ELF:     0xC R_386_32 .debug_line
+# CHECK-ELF:   }
+# CHECK-ELF:   Section {{.*}} .rel.debug_aranges {
+# CHECK-ELF:     0x6 R_386_32 .debug_info
+# CHECK-ELF:   }
+# CHECK-ELF: ]

diff --git a/test/DebugInfo/X86/dbg-at-specficiation.ll b/test/DebugInfo/X86/dbg-at-specficiation.ll
index c765367..034574b 100644
--- a/test/DebugInfo/X86/dbg-at-specficiation.ll
+++ b/test/DebugInfo/X86/dbg-at-specficiation.ll

@@ -8,14 +8,14 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{i32 720913, metadata !11, i32 12, metadata !"clang version 3.0 (trunk 140253)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, i32 0} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 140253)\001\00\000\00\000", metadata !11, metadata !2, metadata !2, metadata !2, metadata !3, null} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720948, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, [10 x i32]* @a, null} ; [ DW_TAG_variable ]
-!6 = metadata !{i32 720937, metadata !11} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720897, null, null, null, i32 0, i64 320, i64 32, i32 0, i32 0, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
-!8 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !6, metadata !7, [10 x i32]* @a, null} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x1\00\000\00320\0032\000\000", null, null, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 720929, i64 0, i64 10}        ; [ DW_TAG_subrange_type ]
+!10 = metadata !{metadata !"0x21\000\0010"}        ; [ DW_TAG_subrange_type ]
 !11 = metadata !{metadata !"x.c", metadata !"/private/tmp"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-byval-parameter.ll b/test/DebugInfo/X86/dbg-byval-parameter.ll
index c658b50..49cd6ba 100644
--- a/test/DebugInfo/X86/dbg-byval-parameter.ll
+++ b/test/DebugInfo/X86/dbg-byval-parameter.ll

@@ -9,7 +9,7 @@
   %retval = alloca double                         ; <double*> [#uses=2]
   %0 = alloca double                              ; <double*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0), !dbg !15
+  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0, metadata !{metadata !"0x102"}), !dbg !15
   %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
   %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
   %3 = load double* %2, align 8, !dbg !16         ; <double> [#uses=1]
@@ -23,30 +23,30 @@
   ret double %retval1, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00my_r0\0011\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\000", metadata !19, metadata !2, metadata !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !19, metadata !20, metadata !20, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !7}
-!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !19, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x13\00Rect\006\00256\0064\000\000\000", metadata !19, metadata !2, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0xd\00P1\007\00128\0064\000\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x13\00Pt\001\00128\0064\000\000\000", metadata !19, metadata !2, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
 !11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!14 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P2", i32 8, i64 128, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ]
+!12 = metadata !{metadata !"0xd\00x\002\0064\0064\000\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
+!13 = metadata !{metadata !"0xd\00y\003\0064\0064\0064\000", metadata !19, metadata !10, metadata !6} ; [ DW_TAG_member ]
+!14 = metadata !{metadata !"0xd\00P2\008\00128\0064\00128\000", metadata !19, metadata !7, metadata !10} ; [ DW_TAG_member ]
 !15 = metadata !{i32 11, i32 0, metadata !1, null}
 !16 = metadata !{i32 12, i32 0, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !1, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!17 = metadata !{metadata !"0xb\0011\000\000", metadata !19, metadata !1} ; [ DW_TAG_lexical_block ]
 !18 = metadata !{metadata !1}
 !19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
 !20 = metadata !{i32 0}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-const-int.ll b/test/DebugInfo/X86/dbg-const-int.ll
index bf7ee08..c7e5e92 100644
--- a/test/DebugInfo/X86/dbg-const-int.ll
+++ b/test/DebugInfo/X86/dbg-const-int.ll

@@ -12,23 +12,23 @@
 
 define i32 @foo() nounwind uwtable readnone optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
+  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !9
   ret i32 42, !dbg !10
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!15}
 
-!0 = metadata !{i32 786449, metadata !13, i32 12, metadata !"clang version 3.0 (trunk 132191)", i1 true, metadata !"", i32 0, metadata !14, metadata !14, metadata !11, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !13, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @foo, null, null, metadata !12, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
-!2 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !13, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 132191)\001\00\000\00\000", metadata !13, metadata !14, metadata !14, metadata !11, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\000\001\000", metadata !13, metadata !2, metadata !3, null, i32 ()* @foo, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
+!2 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !13, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786688, metadata !7, metadata !"i", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!7 = metadata !{i32 786443, metadata !13, metadata !1, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x100\00i\002\000", metadata !7, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!7 = metadata !{metadata !"0xb\001\0011\000", metadata !13, metadata !1} ; [ DW_TAG_lexical_block ]
 !8 = metadata !{i32 42}
 !9 = metadata !{i32 2, i32 12, metadata !7, null}
 !10 = metadata !{i32 3, i32 2, metadata !7, null}
@@ -36,4 +36,4 @@
 !12 = metadata !{metadata !6}
 !13 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !14 = metadata !{i32 0}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-const.ll b/test/DebugInfo/X86/dbg-const.ll
index 300c1ee..20e8652 100644
--- a/test/DebugInfo/X86/dbg-const.ll
+++ b/test/DebugInfo/X86/dbg-const.ll

@@ -17,28 +17,28 @@
 ;CHECK-NEXT:  .byte	42
 define i32 @foobar() nounwind readonly noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
+  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !9
   %call = tail call i32 @bar(), !dbg !11
-  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !6), !dbg !11
+  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !11
   %call2 = tail call i32 @bar(), !dbg !11
   %add = add nsw i32 %call2, %call, !dbg !12
   ret i32 %add, !dbg !10
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 declare i32 @bar() nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{i32 786478, metadata !15, metadata !1, metadata !"foobar", metadata !"foobar", metadata !"foobar", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @foobar, null, null, metadata !14, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114183)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !15, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foobar\00foobar\00foobar\0012\000\001\000\006\000\001\000", metadata !15, metadata !1, metadata !3, null, i32 ()* @foobar, null, null, metadata !14} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 114183)\001\00\000\00\001", metadata !15, metadata !16, metadata !16, metadata !13, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !15, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !15, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
-!6 = metadata !{i32 786688, metadata !7, metadata !"j", metadata !1, i32 15, metadata !5, i32 0, null}
-!7 = metadata !{i32 786443, metadata !15, metadata !0, i32 12, i32 52, i32 0} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !15, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x100\00j\0015\000", metadata !7, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!7 = metadata !{metadata !"0xb\0012\0052\000", metadata !15, metadata !0} ; [ DW_TAG_lexical_block ]
 !8 = metadata !{i32 42}
 !9 = metadata !{i32 15, i32 12, metadata !7, null}
 !10 = metadata !{i32 23, i32 3, metadata !7, null}
@@ -48,4 +48,4 @@
 !14 = metadata !{metadata !6}
 !15 = metadata !{metadata !"mu.c", metadata !"/private/tmp"}
 !16 = metadata !{i32 0}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-declare-arg.ll b/test/DebugInfo/X86/dbg-declare-arg.ll
index b537265..b589ed97 100644
--- a/test/DebugInfo/X86/dbg-declare-arg.ll
+++ b/test/DebugInfo/X86/dbg-declare-arg.ll

@@ -14,8 +14,8 @@
   %nrvo = alloca i1
   %cleanup.dest.slot = alloca i32
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !26), !dbg !27
-  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !28), !dbg !30
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !28, metadata !{metadata !"0x102"}), !dbg !30
   store i32 0, i32* %j, align 4, !dbg !31
   %tmp = load i32* %i.addr, align 4, !dbg !32
   %cmp = icmp eq i32 %tmp, 42, !dbg !32
@@ -29,7 +29,7 @@
 
 if.end:                                           ; preds = %if.then, %entry
   store i1 false, i1* %nrvo, !dbg !36
-  call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !37), !dbg !39
+  call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !37, metadata !{metadata !"0x102"}), !dbg !39
   %tmp2 = load i32* %j, align 4, !dbg !40
   %x = getelementptr inbounds %class.A* %agg.result, i32 0, i32 0, !dbg !40
   store i32 %tmp2, i32* %x, align 4, !dbg !40
@@ -46,13 +46,13 @@
   ret void, !dbg !42
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define linkonce_odr void @_ZN1AD1Ev(%class.A* %this) unnamed_addr ssp align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !43), !dbg !44
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !43, metadata !{metadata !"0x102"}), !dbg !44
   %this1 = load %class.A** %this.addr
   call void @_ZN1AD2Ev(%class.A* %this1)
   ret void, !dbg !45
@@ -62,7 +62,7 @@
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !46), !dbg !47
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !46, metadata !{metadata !"0x102"}), !dbg !47
   %this1 = load %class.A** %this.addr
   %x = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !48
   store i32 1, i32* %x, align 4, !dbg !48
@@ -72,56 +72,56 @@
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!52}
 
-!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"~A", metadata !"~A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 589826, metadata !51, metadata !2, metadata !"A", i32 2, i64 128, i64 32, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 2, size 128, align 32, offset 0] [def] [from ]
-!2 = metadata !{i32 786449, metadata !51, i32 4, metadata !"clang version 3.0 (trunk 130127)", i1 false, metadata !"", i32 0, metadata !24, metadata !24, metadata !50, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786473, metadata !51} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x2e\00~A\00~A\00\002\000\000\000\006\00256\000\000", metadata !51, metadata !1, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x2\00A\002\00128\0032\000\000\000", metadata !51, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_class_type ] [A] [line 2, size 128, align 32, offset 0] [def] [from ]
+!2 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 130127)\000\00\000\00\001", metadata !51, metadata !24, metadata !24, metadata !50, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x29", metadata !51} ; [ DW_TAG_file_type ]
 !4 = metadata !{metadata !5, metadata !7, metadata !8, metadata !9, metadata !0, metadata !10, metadata !14}
-!5 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !6} ; [ DW_TAG_member ]
-!8 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"z", i32 2, i64 32, i64 32, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!9 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"o", i32 2, i64 32, i64 32, i64 96, i32 0, metadata !6} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0xd\00x\002\0032\0032\000\000", metadata !51, metadata !3, metadata !6} ; [ DW_TAG_member ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0xd\00y\002\0032\0032\0032\000", metadata !51, metadata !3, metadata !6} ; [ DW_TAG_member ]
+!8 = metadata !{metadata !"0xd\00z\002\0032\0032\0064\000", metadata !51, metadata !3, metadata !6} ; [ DW_TAG_member ]
+!9 = metadata !{metadata !"0xd\00o\002\0032\0032\0096\000", metadata !51, metadata !3, metadata !6} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x2e\00A\00A\00\002\000\000\000\006\00320\000\000", metadata !51, metadata !1, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !3, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786447, metadata !2, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!15 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !2, null, metadata !1} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0x2e\00A\00A\00\002\000\000\000\006\00320\000\000", metadata !51, metadata !1, metadata !15, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !3, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !13, metadata !17}
-!17 = metadata !{i32 589840, null, metadata !2, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_reference_type ]
-!18 = metadata !{i32 786470, metadata !2, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_const_type ]
-!19 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", i32 4, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*, i32)* @_Z3fooi, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
-!20 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, metadata !2, metadata !18} ; [ DW_TAG_reference_type ]
+!18 = metadata !{metadata !"0x26\00\000\000\000\000\000", metadata !2, null, metadata !1} ; [ DW_TAG_const_type ]
+!19 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi\004\000\001\000\006\00256\000\000", metadata !51, metadata !3, metadata !20, null, void (%class.A*, i32)* @_Z3fooi, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
+!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !3, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !21 = metadata !{metadata !1}
-!22 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD1Ev", i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AD1Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
-!23 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !24, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD1Ev\002\000\001\000\006\00256\000\000", metadata !51, metadata !3, metadata !23, null, void (%class.A*)* @_ZN1AD1Ev, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
+!23 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !51, metadata !3, null, metadata !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !24 = metadata !{null}
-!25 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD2Ev", i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AD2Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
-!26 = metadata !{i32 786689, metadata !19, metadata !"i", metadata !3, i32 16777220, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD2Ev\002\000\001\000\006\00256\000\000", metadata !51, metadata !3, metadata !23, null, void (%class.A*)* @_ZN1AD2Ev, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
+!26 = metadata !{metadata !"0x101\00i\0016777220\000", metadata !19, metadata !3, metadata !6} ; [ DW_TAG_arg_variable ]
 !27 = metadata !{i32 4, i32 11, metadata !19, null}
-!28 = metadata !{i32 786688, metadata !29, metadata !"j", metadata !3, i32 5, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786443, metadata !51, metadata !19, i32 4, i32 14, i32 0} ; [ DW_TAG_lexical_block ]
+!28 = metadata !{metadata !"0x100\00j\005\000", metadata !29, metadata !3, metadata !6} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{metadata !"0xb\004\0014\000", metadata !51, metadata !19} ; [ DW_TAG_lexical_block ]
 !30 = metadata !{i32 5, i32 7, metadata !29, null}
 !31 = metadata !{i32 5, i32 12, metadata !29, null}
 !32 = metadata !{i32 6, i32 3, metadata !29, null}
 !33 = metadata !{i32 7, i32 5, metadata !34, null}
-!34 = metadata !{i32 786443, metadata !51, metadata !29, i32 6, i32 16, i32 1} ; [ DW_TAG_lexical_block ]
+!34 = metadata !{metadata !"0xb\006\0016\001", metadata !51, metadata !29} ; [ DW_TAG_lexical_block ]
 !35 = metadata !{i32 8, i32 3, metadata !34, null}
 !36 = metadata !{i32 9, i32 9, metadata !29, null}
-!37 = metadata !{i32 786688, metadata !29, metadata !"my_a", metadata !3, i32 9, metadata !38, i32 0, null} ; [ DW_TAG_auto_variable ]
-!38 = metadata !{i32 589840, metadata !2, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
+!37 = metadata !{metadata !"0x100\00my_a\009\000", metadata !29, metadata !3, metadata !38} ; [ DW_TAG_auto_variable ]
+!38 = metadata !{metadata !"0x10\00\000\000\000\000\000", metadata !2, null, metadata !1} ; [ DW_TAG_reference_type ]
 !39 = metadata !{i32 9, i32 5, metadata !29, null}
 !40 = metadata !{i32 10, i32 3, metadata !29, null}
 !41 = metadata !{i32 11, i32 3, metadata !29, null}
 !42 = metadata !{i32 12, i32 1, metadata !29, null}
-!43 = metadata !{i32 786689, metadata !22, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64, null} ; [ DW_TAG_arg_variable ]
+!43 = metadata !{metadata !"0x101\00this\0016777218\0064", metadata !22, metadata !3, metadata !13} ; [ DW_TAG_arg_variable ]
 !44 = metadata !{i32 2, i32 47, metadata !22, null}
 !45 = metadata !{i32 2, i32 61, metadata !22, null}
-!46 = metadata !{i32 786689, metadata !25, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64, null} ; [ DW_TAG_arg_variable ]
+!46 = metadata !{metadata !"0x101\00this\0016777218\0064", metadata !25, metadata !3, metadata !13} ; [ DW_TAG_arg_variable ]
 !47 = metadata !{i32 2, i32 47, metadata !25, null}
 !48 = metadata !{i32 2, i32 54, metadata !49, null}
-!49 = metadata !{i32 786443, metadata !51, metadata !25, i32 2, i32 52, i32 2} ; [ DW_TAG_lexical_block ]
+!49 = metadata !{metadata !"0xb\002\0052\002", metadata !51, metadata !25} ; [ DW_TAG_lexical_block ]
 !50 = metadata !{metadata !19, metadata !22, metadata !25}
 !51 = metadata !{metadata !"a.cc", metadata !"/private/tmp"}
-!52 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!52 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-declare.ll b/test/DebugInfo/X86/dbg-declare.ll
index 241a5a1..fd30115 100644
--- a/test/DebugInfo/X86/dbg-declare.ll
+++ b/test/DebugInfo/X86/dbg-declare.ll

@@ -7,21 +7,21 @@
   %saved_stack = alloca i8*
   %cleanup.dest.slot = alloca i32
   store i32* %x, i32** %x.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32** %x.addr}, metadata !14), !dbg !15
+  call void @llvm.dbg.declare(metadata !{i32** %x.addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
   %0 = load i32** %x.addr, align 8, !dbg !16
   %1 = load i32* %0, align 4, !dbg !16
   %2 = zext i32 %1 to i64, !dbg !16
   %3 = call i8* @llvm.stacksave(), !dbg !16
   store i8* %3, i8** %saved_stack, !dbg !16
   %vla = alloca i8, i64 %2, align 16, !dbg !16
-  call void @llvm.dbg.declare(metadata !{i8* %vla}, metadata !18), !dbg !23
+  call void @llvm.dbg.declare(metadata !{i8* %vla}, metadata !18, metadata !{metadata !"0x102"}), !dbg !23
   store i32 1, i32* %cleanup.dest.slot
   %4 = load i8** %saved_stack, !dbg !24
   call void @llvm.stackrestore(i8* %4), !dbg !24
   ret i32 0, !dbg !25
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i8* @llvm.stacksave() nounwind
 
@@ -30,27 +30,27 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!27}
 
-!0 = metadata !{i32 786449, metadata !26, i32 12, metadata !"clang version 3.1 (trunk 153698)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 153698)\000\00\000\00\000", metadata !26, metadata !1, metadata !1, metadata !3, metadata !1, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !26, metadata !0, metadata !"foo", metadata !"foo", metadata !"", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32*)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00\006\000\001\000\006\00256\000\000", metadata !26, metadata !0, metadata !7, null, i32 (i32*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ]
-!14 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !6, i32 16777221, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_const_type ]
+!14 = metadata !{metadata !"0x101\00x\0016777221\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
 !15 = metadata !{i32 5, i32 21, metadata !5, null}
 !16 = metadata !{i32 7, i32 13, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !26, metadata !5, i32 6, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{i32 786688, metadata !17, metadata !"a", metadata !6, i32 7, metadata !19, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!19 = metadata !{i32 786433, null, null, null, i32 0, i64 0, i64 8, i32 0, i32 0, metadata !20, metadata !21, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 8, offset 0] [from char]
-!20 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0xb\006\001\000", metadata !26, metadata !5} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{metadata !"0x100\00a\007\000", metadata !17, metadata !6, metadata !19} ; [ DW_TAG_auto_variable ]
+!19 = metadata !{metadata !"0x1\00\000\000\008\000\000", null, null, metadata !20, metadata !21, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 8, offset 0] [from char]
+!20 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
 !21 = metadata !{metadata !22}
-!22 = metadata !{i32 786465, i64 0, i64 -1}        ; [ DW_TAG_subrange_type ]
+!22 = metadata !{metadata !"0x21\000\00-1"}        ; [ DW_TAG_subrange_type ]
 !23 = metadata !{i32 7, i32 8, metadata !17, null}
 !24 = metadata !{i32 9, i32 1, metadata !17, null}
 !25 = metadata !{i32 8, i32 3, metadata !17, null}
 !26 = metadata !{metadata !"20020104-2.c", metadata !"/Volumes/Sandbox/llvm"}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-file-name.ll b/test/DebugInfo/X86/dbg-file-name.ll
index e9c61c1..f1a9e78 100644
--- a/test/DebugInfo/X86/dbg-file-name.ll
+++ b/test/DebugInfo/X86/dbg-file-name.ll

@@ -12,13 +12,13 @@
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!12}
 
-!1 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !10, i32 1, metadata !"LLVM build 00", i1 true, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 786468, metadata !10, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !10, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 9, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786453, metadata !10, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!1 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\00LLVM build 00\001\00\000\00\000", metadata !10, metadata !11, metadata !11, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00main\00main\00main\009\000\001\000\006\00256\000\000", metadata !10, metadata !1, metadata !7, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !5}
 !9 = metadata !{metadata !6}
 !10 = metadata !{metadata !"simple.c", metadata !"/Users/manav/one/two"}
 !11 = metadata !{i32 0}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-i128-const.ll b/test/DebugInfo/X86/dbg-i128-const.ll
index 01b105f..0f5a03e 100644
--- a/test/DebugInfo/X86/dbg-i128-const.ll
+++ b/test/DebugInfo/X86/dbg-i128-const.ll

@@ -5,30 +5,30 @@
 
 define i128 @__foo(i128 %a, i128 %b) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !0, i64 0, metadata !1), !dbg !11
+  tail call void @llvm.dbg.value(metadata !0, i64 0, metadata !1, metadata !{metadata !"0x102"}), !dbg !11
   %add = add i128 %a, %b, !dbg !11
   ret i128 %add, !dbg !11
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!5}
 !llvm.module.flags = !{!16}
 
 !0 = metadata !{i128 42 }
-!1 = metadata !{i32 786688, metadata !2, metadata !"MAX", metadata !4, i32 29, metadata !8, i32 0, null} ; [ DW_TAG_auto_variable ]
-!2 = metadata !{i32 786443, metadata !13, metadata !3, i32 26, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{i32 786478, metadata !13, metadata !4, metadata !"__foo", metadata !"__foo", metadata !"__foo", i32 26, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i128 (i128, i128)* @__foo, null, null, null, i32 26} ; [ DW_TAG_subprogram ]
-!4 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786449, metadata !13, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !15, metadata !15, metadata !12, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 786453, metadata !13, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!1 = metadata !{metadata !"0x100\00MAX\0029\000", metadata !2, metadata !4, metadata !8} ; [ DW_TAG_auto_variable ]
+!2 = metadata !{metadata !"0xb\0026\000\000", metadata !13, metadata !3} ; [ DW_TAG_lexical_block ]
+!3 = metadata !{metadata !"0x2e\00__foo\00__foo\00__foo\0026\000\001\000\006\000\000\0026", metadata !13, metadata !4, metadata !6, null, i128 (i128, i128)* @__foo, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x11\001\00clang\001\00\000\00\000", metadata !13, metadata !15, metadata !15, metadata !12, null,  null} ; [ DW_TAG_compile_unit ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !13, metadata !4, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8, metadata !8}
-!8 = metadata !{i32 786454, metadata !14, metadata !4, metadata !"ti_int", i32 78, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
-!9 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
-!10 = metadata !{i32 786468, metadata !13, metadata !4, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x16\00ti_int\0078\000\000\000\000", metadata !14, metadata !4, metadata !10} ; [ DW_TAG_typedef ]
+!9 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
+!10 = metadata !{metadata !"0x24\00\000\00128\00128\000\000\005", metadata !13, metadata !4} ; [ DW_TAG_base_type ]
 !11 = metadata !{i32 29, i32 0, metadata !2, null}
 !12 = metadata !{metadata !3}
 !13 = metadata !{metadata !"foo.c", metadata !"/tmp"}
 !14 = metadata !{metadata !"myint.h", metadata !"/tmp"}
 !15 = metadata !{i32 0}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-merge-loc-entry.ll b/test/DebugInfo/X86/dbg-merge-loc-entry.ll
index 016d0a1..f4f1788 100644
--- a/test/DebugInfo/X86/dbg-merge-loc-entry.ll
+++ b/test/DebugInfo/X86/dbg-merge-loc-entry.ll

@@ -14,8 +14,8 @@
 
 define hidden i128 @__divti3(i128 %u, i128 %v) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i128 %u}, i64 0, metadata !14), !dbg !15
-  tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !17), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i128 %u}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
+  tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !17, metadata !{metadata !"0x102"}), !dbg !21
   br i1 undef, label %bb2, label %bb4, !dbg !22
 
 bb2:                                              ; preds = %entry
@@ -31,45 +31,45 @@
   ret i128 undef, !dbg !27
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 declare %0 @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!32}
 
-!0 = metadata !{i32 786478, metadata !29, metadata !1, metadata !"__udivmodti4", metadata !"__udivmodti4", metadata !"", i32 879, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, null, i32 879} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !29} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !29, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !31, metadata !31, metadata !28, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !29, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00__udivmodti4\00__udivmodti4\00\00879\001\001\000\006\00256\001\00879", metadata !29, metadata !1, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !29} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !29, metadata !31, metadata !31, metadata !28, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !29, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !5, metadata !5, metadata !8}
-!5 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"UTItype", i32 166, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786473, metadata !30} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786447, metadata !29, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{i32 786478, metadata !29, metadata !1, metadata !"__divti3", metadata !"__divti3", metadata !"__divti3", i32 1094, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i128 (i128, i128)* @__divti3, null, null, null, i32 1094} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !29, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x16\00UTItype\00166\000\000\000\000", metadata !30, metadata !6, metadata !7} ; [ DW_TAG_typedef ]
+!6 = metadata !{metadata !"0x29", metadata !30} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x24\00\000\00128\00128\000\000\007", metadata !29, metadata !1} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !29, metadata !1, metadata !5} ; [ DW_TAG_pointer_type ]
+!9 = metadata !{metadata !"0x2e\00__divti3\00__divti3\00__divti3\001094\000\001\000\006\00256\001\001094", metadata !29, metadata !1, metadata !10, null, i128 (i128, i128)* @__divti3, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !29, metadata !1, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{metadata !12, metadata !12, metadata !12}
-!12 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"TItype", i32 160, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_typedef ]
-!13 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786689, metadata !9, metadata !"u", metadata !1, i32 1093, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{metadata !"0x16\00TItype\00160\000\000\000\000", metadata !30, metadata !6, metadata !13} ; [ DW_TAG_typedef ]
+!13 = metadata !{metadata !"0x24\00\000\00128\00128\000\000\005", metadata !29, metadata !1} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x101\00u\001093\000", metadata !9, metadata !1, metadata !12} ; [ DW_TAG_arg_variable ]
 !15 = metadata !{i32 1093, i32 0, metadata !9, null}
 !16 = metadata !{i64 0}
-!17 = metadata !{i32 786688, metadata !18, metadata !"c", metadata !1, i32 1095, metadata !19, i32 0, null} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786443, metadata !29, metadata !9, i32 1094, i32 0, i32 13} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"word_type", i32 424, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_typedef ]
-!20 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0x100\00c\001095\000", metadata !18, metadata !1, metadata !19} ; [ DW_TAG_auto_variable ]
+!18 = metadata !{metadata !"0xb\001094\000\0013", metadata !29, metadata !9} ; [ DW_TAG_lexical_block ]
+!19 = metadata !{metadata !"0x16\00word_type\00424\000\000\000\000", metadata !30, metadata !6, metadata !20} ; [ DW_TAG_typedef ]
+!20 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", metadata !29, metadata !1} ; [ DW_TAG_base_type ]
 !21 = metadata !{i32 1095, i32 0, metadata !18, null}
 !22 = metadata !{i32 1103, i32 0, metadata !18, null}
 !23 = metadata !{i32 1104, i32 0, metadata !18, null}
 !24 = metadata !{i32 1003, i32 0, metadata !25, metadata !26}
-!25 = metadata !{i32 786443, metadata !29, metadata !0, i32 879, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!25 = metadata !{metadata !"0xb\00879\000\000", metadata !29, metadata !0} ; [ DW_TAG_lexical_block ]
 !26 = metadata !{i32 1107, i32 0, metadata !18, null}
 !27 = metadata !{i32 1111, i32 0, metadata !18, null}
 !28 = metadata !{metadata !0, metadata !9}
 !29 = metadata !{metadata !"foobar.c", metadata !"/tmp"}
 !30 = metadata !{metadata !"foobar.h", metadata !"/tmp"}
 !31 = metadata !{i32 0}
-!32 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!32 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-prolog-end.ll b/test/DebugInfo/X86/dbg-prolog-end.ll
index a7c6cb5..f51dd70 100644
--- a/test/DebugInfo/X86/dbg-prolog-end.ll
+++ b/test/DebugInfo/X86/dbg-prolog-end.ll

@@ -8,8 +8,8 @@
   %i.addr = alloca i32, align 4
   %j = alloca i32, align 4
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !7), !dbg !8
-  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !9), !dbg !11
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !7, metadata !{metadata !"0x102"}), !dbg !8
+  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !9, metadata !{metadata !"0x102"}), !dbg !11
   store i32 2, i32* %j, align 4, !dbg !12
   %tmp = load i32* %j, align 4, !dbg !13
   %inc = add nsw i32 %tmp, 1, !dbg !13
@@ -22,7 +22,7 @@
   ret i32 %tmp3, !dbg !15
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @main() nounwind ssp {
 entry:
@@ -36,24 +36,24 @@
 !llvm.module.flags = !{!21}
 !18 = metadata !{metadata !1, metadata !6}
 
-!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.0 (trunk 131100)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 131100)\000\00\000\00\000", metadata !19, metadata !20, metadata !20, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !19, metadata !2, metadata !3, null, i32 (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!2 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, null, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!7 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777217, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\000\000\007", metadata !19, metadata !2, metadata !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!7 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
 !8 = metadata !{i32 1, i32 13, metadata !1, null}
-!9 = metadata !{i32 786688, metadata !10, metadata !"j", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{i32 786443, metadata !19, metadata !1, i32 1, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
+!9 = metadata !{metadata !"0x100\00j\002\000", metadata !10, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{metadata !"0xb\001\0016\000", metadata !19, metadata !1} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{i32 2, i32 6, metadata !10, null}
 !12 = metadata !{i32 2, i32 11, metadata !10, null}
 !13 = metadata !{i32 3, i32 2, metadata !10, null}
 !14 = metadata !{i32 4, i32 2, metadata !10, null}
 !15 = metadata !{i32 5, i32 2, metadata !10, null}
 !16 = metadata !{i32 8, i32 2, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !6, i32 7, i32 12, i32 1} ; [ DW_TAG_lexical_block ]
+!17 = metadata !{metadata !"0xb\007\0012\001", metadata !19, metadata !6} ; [ DW_TAG_lexical_block ]
 !19 = metadata !{metadata !"/tmp/a.c", metadata !"/private/tmp"}
 !20 = metadata !{i32 0}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-subrange.ll b/test/DebugInfo/X86/dbg-subrange.ll
index f8761d0..8102779 100644
--- a/test/DebugInfo/X86/dbg-subrange.ll
+++ b/test/DebugInfo/X86/dbg-subrange.ll

@@ -4,7 +4,7 @@
 target triple = "x86_64-apple-macosx10.7.2"
 
 @s = common global [4294967296 x i8] zeroinitializer, align 16
-;CHECK: .long	4294967295
+; CHECK: .quad 4294967296 ## DW_AT_count
 
 define void @bar() nounwind uwtable ssp {
 entry:
@@ -15,21 +15,21 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{i32 786449, metadata !21, i32 12, metadata !"clang version 3.1 (trunk 144833)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !11,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 144833)\000\00\000\00\000", metadata !21, metadata !1, metadata !1, metadata !3, metadata !11,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !21, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
-!6 = metadata !{i32 720937, metadata !21} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00bar\00bar\00\004\000\001\000\006\00256\000\000", metadata !21, metadata !6, metadata !7, null, void ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
+!6 = metadata !{metadata !"0x29", metadata !21} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !11 = metadata !{metadata !13}
-!13 = metadata !{i32 720948, i32 0, null, metadata !"s", metadata !"s", metadata !"", metadata !6, i32 2, metadata !14, i32 0, i32 1, [4294967296 x i8]* @s, null} ; [ DW_TAG_variable ]
-!14 = metadata !{i32 720897, null, null, null, i32 0, i64 34359738368, i64 8, i32 0, i32 0, metadata !15, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 34359738368, align 8, offset 0] [from char]
-!15 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!13 = metadata !{metadata !"0x34\00s\00s\00\002\000\001", null, metadata !6, metadata !14, [4294967296 x i8]* @s, null} ; [ DW_TAG_variable ]
+!14 = metadata !{metadata !"0x1\00\000\0034359738368\008\000\000", null, null, metadata !15, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 34359738368, align 8, offset 0] [from char]
+!15 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
 !16 = metadata !{metadata !17}
-!17 = metadata !{i32 720929, i64 0, i64 4294967296} ; [ DW_TAG_subrange_type ]
+!17 = metadata !{metadata !"0x21\000\004294967296"} ; [ DW_TAG_subrange_type ]
 !18 = metadata !{i32 5, i32 3, metadata !19, null}
-!19 = metadata !{i32 786443, metadata !21, metadata !5, i32 4, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!19 = metadata !{metadata !"0xb\004\001\000", metadata !21, metadata !5} ; [ DW_TAG_lexical_block ]
 !20 = metadata !{i32 6, i32 1, metadata !19, null}
 !21 = metadata !{metadata !"small.c", metadata !"/private/tmp"}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-value-const-byref.ll b/test/DebugInfo/X86/dbg-value-const-byref.ll
index 23fa352..0182d65 100644
--- a/test/DebugInfo/X86/dbg-value-const-byref.ll
+++ b/test/DebugInfo/X86/dbg-value-const-byref.ll

@@ -50,13 +50,13 @@
 define i32 @foo() #0 {
 entry:
   %i = alloca i32, align 4
-  call void @llvm.dbg.value(metadata !14, i64 0, metadata !10), !dbg !15
+  call void @llvm.dbg.value(metadata !14, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !15
   %call = call i32 @f3(i32 3) #3, !dbg !16
-  call void @llvm.dbg.value(metadata !17, i64 0, metadata !10), !dbg !18
+  call void @llvm.dbg.value(metadata !17, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !18
   %call1 = call i32 (...)* @f1() #3, !dbg !19
-  call void @llvm.dbg.value(metadata !{i32 %call1}, i64 0, metadata !10), !dbg !19
+  call void @llvm.dbg.value(metadata !{i32 %call1}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !19
   store i32 %call1, i32* %i, align 4, !dbg !19, !tbaa !20
-  call void @llvm.dbg.value(metadata !{i32* %i}, i64 0, metadata !10), !dbg !24
+  call void @llvm.dbg.value(metadata !{i32* %i}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !24
   call void @f2(i32* %i) #3, !dbg !24
   ret i32 0, !dbg !25
 }
@@ -68,7 +68,7 @@
 declare void @f2(i32*)
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { nounwind ssp uwtable }
 attributes #2 = { nounwind readnone }
@@ -78,25 +78,25 @@
 !llvm.module.flags = !{!11, !12}
 !llvm.ident = !{!13}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [dbg-value-const-byref.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [dbg-value-const-byref.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"dbg-value-const-byref.c", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @foo, null, null, metadata !9, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [dbg-value-const-byref.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\001\000\006\000\001\005", metadata !1, metadata !5, metadata !6, null, i32 ()* @foo, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [dbg-value-const-byref.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786688, metadata !4, metadata !"i", metadata !5, i32 6, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 6]
+!10 = metadata !{metadata !"0x100\00i\006\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 6]
 !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !13 = metadata !{metadata !"clang version 3.5.0 "}
 !14 = metadata !{i32 3}
 !15 = metadata !{i32 6, i32 0, metadata !4, null}
 !16 = metadata !{i32 7, i32 0, metadata !4, null}
 !17 = metadata !{i32 7}
-!18 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!18 = metadata !{i32 8, i32 0, metadata !4, null}
 !19 = metadata !{i32 9, i32 0, metadata !4, null}
 !20 = metadata !{metadata !21, metadata !21, i64 0}
 !21 = metadata !{metadata !"int", metadata !22, i64 0}

diff --git a/test/DebugInfo/X86/dbg-value-dag-combine.ll b/test/DebugInfo/X86/dbg-value-dag-combine.ll
index 12aa61b..cf839b2 100644
--- a/test/DebugInfo/X86/dbg-value-dag-combine.ll
+++ b/test/DebugInfo/X86/dbg-value-dag-combine.ll

@@ -4,21 +4,19 @@
 ; PR 9817
 
 
-declare  <4 x i32> @__amdil_get_global_id_int()
-declare  void @llvm.dbg.value(metadata , i64 , metadata )
+declare <4 x i32> @__amdil_get_global_id_int()
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 define void @__OpenCL_test_kernel(i32 addrspace(1)* %ip) nounwind {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata
-!7), !dbg !8
+  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !7, metadata !{metadata !"0x102"}), !dbg !8
   %0 = call <4 x i32> @__amdil_get_global_id_int() nounwind
   %1 = extractelement <4 x i32> %0, i32 0
-  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !9), !dbg !11
-  call void @llvm.dbg.value(metadata !12, i64 0, metadata !13), !dbg !14
+  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !11
+  call void @llvm.dbg.value(metadata !12, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
   %tmp2 = load i32 addrspace(1)* %ip, align 4, !dbg !15
   %tmp3 = add i32 0, %tmp2, !dbg !15
 ; CHECK:  ##DEBUG_VALUE: idx <- E{{..$}}
-  call void @llvm.dbg.value(metadata !{i32 %tmp3}, i64 0, metadata !13), !dbg
-!15
+  call void @llvm.dbg.value(metadata !{i32 %tmp3}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !15
   %arrayidx = getelementptr i32 addrspace(1)* %ip, i32 %1, !dbg !16
   store i32 %tmp3, i32 addrspace(1)* %arrayidx, align 4, !dbg !16
   ret void, !dbg !17
@@ -26,24 +24,24 @@
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{i32 786478, metadata !19, metadata !1, metadata !"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_test_kernel]
-!1 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !19, i32 1, metadata !"clc", i1 false, metadata !"", i32 0, metadata !12, metadata !12, metadata !18, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !19, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00__OpenCL_test_kernel\00__OpenCL_test_kernel\00__OpenCL_test_kernel\002\000\001\000\006\000\000\000", metadata !19, metadata !1, metadata !3, null, void (i32 addrspace(1)*)* @__OpenCL_test_kernel, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_test_kernel]
+!1 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\00clc\000\00\000\00\001", metadata !19, metadata !12, metadata !12, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !19, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null, metadata !5}
-!5 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
-!6 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786689, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!5 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !6} ; [ DW_TAG_pointer_type ]
+!6 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x101\00ip\001\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
 !8 = metadata !{i32 1, i32 42, metadata !0, null}
-!9 = metadata !{i32 786688, metadata !10, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{i32 786443, metadata !19, metadata !0, i32 2, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!9 = metadata !{metadata !"0x100\00gid\003\000", metadata !10, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{metadata !"0xb\002\001\000", metadata !19, metadata !0} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{i32 3, i32 41, metadata !10, null}
 !12 = metadata !{i32 0}
-!13 = metadata !{i32 786688, metadata !10, metadata !"idx", metadata !1, i32 4, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!13 = metadata !{metadata !"0x100\00idx\004\000", metadata !10, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
 !14 = metadata !{i32 4, i32 20, metadata !10, null}
 !15 = metadata !{i32 5, i32 15, metadata !10, null}
 !16 = metadata !{i32 6, i32 18, metadata !10, null}
 !17 = metadata !{i32 7, i32 1, metadata !0, null}
 !18 = metadata !{metadata !0}
 !19 = metadata !{metadata !"OCL6368.tmp.cl", metadata !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp"}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
index 4d18f7d..2f0454e 100644
--- a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
+++ b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll

@@ -6,26 +6,26 @@
 ; RUN:     | llvm-dwarfdump -debug-dump=info - | FileCheck --check-prefix=CHECK --check-prefix=DARWIN %s
 
 ; CHECK: DW_TAG_subprogram
-; CHECK:   DW_AT_abstract_origin {{.*}}{[[ABS:.*]]}
+; CHECK:   DW_AT_abstract_origin {{.*}} "foo"
 ; CHECK:   DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
-; CHECK:     DW_AT_abstract_origin {{.*}}{[[ABS_SP:.*]]}
+; CHECK:     DW_AT_abstract_origin {{.*}} "sp"
 ; CHECK:   DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
-; CHECK:     DW_AT_abstract_origin {{.*}}{[[ABS_NUMS:.*]]}
+; CHECK:     DW_AT_abstract_origin {{.*}} "nums"
 
-; CHECK: [[ABS]]: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK:   DW_AT_name {{.*}} "foo"
-; CHECK: [[ABS_SP]]:   DW_TAG_formal_parameter
+; CHECK:   DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
 ; CHECK:     DW_AT_name {{.*}} "sp"
-; CHECK: [[ABS_NUMS]]:  DW_TAG_formal_parameter
+; CHECK:   DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
 ; CHECK:     DW_AT_name {{.*}} "nums"
 
 ;CHECK: DW_TAG_inlined_subroutine
-;CHECK-NEXT: DW_AT_abstract_origin {{.*}}{[[ABS]]}
+;CHECK-NEXT: DW_AT_abstract_origin {{.*}} "foo"
 ;CHECK-NEXT: DW_AT_low_pc [DW_FORM_addr]
 ;CHECK-NEXT: DW_AT_high_pc [DW_FORM_data4]
 ;CHECK-NEXT: DW_AT_call_file
@@ -34,9 +34,9 @@
 ;CHECK: DW_TAG_formal_parameter
 ;FIXME: Linux shouldn't drop this parameter either...
 ;CHECK-NOT: DW_TAG
-;DARWIN:   DW_AT_abstract_origin {{.*}}{[[ABS_SP]]}
+;DARWIN:   DW_AT_abstract_origin {{.*}} "sp"
 ;DARWIN: DW_TAG_formal_parameter
-;CHECK: DW_AT_abstract_origin {{.*}}{[[ABS_NUMS]]}
+;CHECK: DW_AT_abstract_origin {{.*}} "nums"
 ;CHECK-NOT: DW_TAG_formal_parameter
 
 %struct.S1 = type { float*, i32 }
@@ -45,8 +45,8 @@
 
 define i32 @foo(%struct.S1* nocapture %sp, i32 %nums) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.S1* %sp}, i64 0, metadata !9), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{i32 %nums}, i64 0, metadata !18), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{%struct.S1* %sp}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{i32 %nums}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !21
   %tmp2 = getelementptr inbounds %struct.S1* %sp, i64 0, i32 1, !dbg !22
   store i32 %nums, i32* %tmp2, align 4, !dbg !22
   %call = tail call float* @bar(i32 %nums) nounwind optsize, !dbg !27
@@ -61,49 +61,49 @@
 
 define void @foobar() nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !30, i64 0, metadata !9) nounwind, !dbg !31
-  tail call void @llvm.dbg.value(metadata !34, i64 0, metadata !18) nounwind, !dbg !35
+  tail call void @llvm.dbg.value(metadata !30, i64 0, metadata !9, metadata !{metadata !"0x102"}) nounwind, !dbg !31
+  tail call void @llvm.dbg.value(metadata !34, i64 0, metadata !18, metadata !{metadata !"0x102"}) nounwind, !dbg !35
   store i32 1, i32* getelementptr inbounds (%struct.S1* @p, i64 0, i32 1), align 8, !dbg !36
   %call.i = tail call float* @bar(i32 1) nounwind optsize, !dbg !37
   store float* %call.i, float** getelementptr inbounds (%struct.S1* @p, i64 0, i32 0), align 8, !dbg !37
   ret void, !dbg !38
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!43}
 
-!0 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 8, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.S1*, i32)* @foo, null, null, metadata !41, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [foo]
-!1 = metadata !{i32 786473, metadata !42} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !42, i32 12, metadata !"clang version 2.9 (trunk 125693)", i1 true, metadata !"", i32 0, metadata !8, metadata !8, metadata !39, metadata !40,  metadata !44, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !42, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\008\000\001\000\006\00256\001\008", metadata !1, metadata !1, metadata !3, null, i32 (%struct.S1*, i32)* @foo, null, null, metadata !41} ; [ DW_TAG_subprogram ] [line 8] [def] [foo]
+!1 = metadata !{metadata !"0x29", metadata !42} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 125693)\001\00\000\00\001", metadata !42, metadata !8, metadata !8, metadata !39, metadata !40,  metadata !44} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !42, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"foobar", metadata !"foobar", metadata !"", i32 15, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, void ()* @foobar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 15] [def] [scope 0] [foobar]
-!7 = metadata !{i32 786453, metadata !42, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00foobar\00foobar\00\0015\000\001\000\006\000\001\000", metadata !1, metadata !1, metadata !7, null, void ()* @foobar, null, null, null} ; [ DW_TAG_subprogram ] [line 15] [def] [scope 0] [foobar]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !42, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
-!9 = metadata !{i32 786689, metadata !0, metadata !"sp", metadata !1, i32 16777223, metadata !10, i32 0, metadata !32} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786454, metadata !42, metadata !2, metadata !"S1", i32 4, i64 0, i64 0, i64 0, i32 0, metadata !12} ; [ DW_TAG_typedef ]
-!12 = metadata !{i32 786451, metadata !42, metadata !2, metadata !"S1", i32 1, i64 128, i64 64, i32 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [S1] [line 1, size 128, align 64, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0x101\00sp\0016777223\000", metadata !0, metadata !1, metadata !10, metadata !32} ; [ DW_TAG_arg_variable ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0x16\00S1\004\000\000\000\000", metadata !42, metadata !2, metadata !12} ; [ DW_TAG_typedef ]
+!12 = metadata !{metadata !"0x13\00S1\001\00128\0064\000\000\000", metadata !42, metadata !2, null, metadata !13, null, null, null} ; [ DW_TAG_structure_type ] [S1] [line 1, size 128, align 64, offset 0] [def] [from ]
 !13 = metadata !{metadata !14, metadata !17}
-!14 = metadata !{i32 786445, metadata !42, metadata !1, metadata !"m", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !15} ; [ DW_TAG_member ]
-!15 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ]
-!16 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!17 = metadata !{i32 786445, metadata !42, metadata !1, metadata !"nums", i32 3, i64 32, i64 32, i64 64, i32 0, metadata !5} ; [ DW_TAG_member ]
-!18 = metadata !{i32 786689, metadata !0, metadata !"nums", metadata !1, i32 33554439, metadata !5, i32 0, metadata !32} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786484, i32 0, metadata !2, metadata !"p", metadata !"p", metadata !"", metadata !1, i32 14, metadata !11, i32 0, i32 1, %struct.S1* @p, null} ; [ DW_TAG_variable ]
+!14 = metadata !{metadata !"0xd\00m\002\0064\0064\000\000", metadata !42, metadata !1, metadata !15} ; [ DW_TAG_member ]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !16} ; [ DW_TAG_pointer_type ]
+!16 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0xd\00nums\003\0032\0032\0064\000", metadata !42, metadata !1, metadata !5} ; [ DW_TAG_member ]
+!18 = metadata !{metadata !"0x101\00nums\0033554439\000", metadata !0, metadata !1, metadata !5, metadata !32} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x34\00p\00p\00\0014\000\001", metadata !2, metadata !1, metadata !11, %struct.S1* @p, null} ; [ DW_TAG_variable ]
 !20 = metadata !{i32 7, i32 13, metadata !0, null}
 !21 = metadata !{i32 7, i32 21, metadata !0, null}
 !22 = metadata !{i32 9, i32 3, metadata !23, null}
-!23 = metadata !{i32 786443, metadata !1, metadata !0, i32 8, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!23 = metadata !{metadata !"0xb\008\001\000", metadata !1, metadata !0} ; [ DW_TAG_lexical_block ]
 !27 = metadata !{i32 10, i32 3, metadata !23, null}
 !29 = metadata !{i32 11, i32 3, metadata !23, null}
 !30 = metadata !{%struct.S1* @p}
 !31 = metadata !{i32 7, i32 13, metadata !0, metadata !32}
 !32 = metadata !{i32 16, i32 3, metadata !33, null}
-!33 = metadata !{i32 786443, metadata !1, metadata !6, i32 15, i32 15, i32 1} ; [ DW_TAG_lexical_block ]
+!33 = metadata !{metadata !"0xb\0015\0015\001", metadata !1, metadata !6} ; [ DW_TAG_lexical_block ]
 !34 = metadata !{i32 1}
 !35 = metadata !{i32 7, i32 21, metadata !0, metadata !32}
 !36 = metadata !{i32 9, i32 3, metadata !23, metadata !32}
@@ -113,5 +113,5 @@
 !40 = metadata !{metadata !19}
 !41 = metadata !{metadata !9, metadata !18}
 !42 = metadata !{metadata !"nm2.c", metadata !"/private/tmp"}
-!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !44 = metadata !{}

diff --git a/test/DebugInfo/X86/dbg-value-isel.ll b/test/DebugInfo/X86/dbg-value-isel.ll
index 155f76f..6e5d81a 100644
--- a/test/DebugInfo/X86/dbg-value-isel.ll
+++ b/test/DebugInfo/X86/dbg-value-isel.ll

@@ -13,7 +13,7 @@
 
 define void @__OpenCL_nbt02_kernel(i32 addrspace(1)* %ip) nounwind {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !8), !dbg !9
+  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !8, metadata !{metadata !"0x102"}), !dbg !9
   %0 = call <4 x i32> @__amdil_get_local_id_int() nounwind
   %1 = extractelement <4 x i32> %0, i32 0
   br label %2
@@ -28,7 +28,7 @@
 
 get_local_id.exit:                                ; preds = %4
   %6 = phi i32 [ %5, %4 ]
-  call void @llvm.dbg.value(metadata !{i32 %6}, i64 0, metadata !10), !dbg !12
+  call void @llvm.dbg.value(metadata !{i32 %6}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !12
   %7 = call <4 x i32> @__amdil_get_global_id_int() nounwind, !dbg !12
   %8 = extractelement <4 x i32> %7, i32 0, !dbg !12
   br label %9
@@ -43,7 +43,7 @@
 
 get_global_id.exit:                               ; preds = %11
   %13 = phi i32 [ %12, %11 ]
-  call void @llvm.dbg.value(metadata !{i32 %13}, i64 0, metadata !13), !dbg !14
+  call void @llvm.dbg.value(metadata !{i32 %13}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
   %14 = call <4 x i32> @__amdil_get_local_size_int() nounwind
   %15 = extractelement <4 x i32> %14, i32 0
   br label %16
@@ -58,7 +58,7 @@
 
 get_local_size.exit:                              ; preds = %18
   %20 = phi i32 [ %19, %18 ]
-  call void @llvm.dbg.value(metadata !{i32 %20}, i64 0, metadata !15), !dbg !16
+  call void @llvm.dbg.value(metadata !{i32 %20}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
   %tmp5 = add i32 %6, %13, !dbg !17
   %tmp7 = add i32 %tmp5, %20, !dbg !17
   store i32 %tmp7, i32 addrspace(1)* %ip, align 4, !dbg !17
@@ -68,7 +68,7 @@
   ret void, !dbg !18
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare <4 x i32> @__amdil_get_local_size_int() nounwind
 
@@ -76,31 +76,31 @@
 
 declare <4 x i32> @__amdil_get_global_id_int() nounwind
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{i32 786478, metadata !20, metadata !1, metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_nbt02_kernel]
-!1 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !20, i32 1, metadata !"clc", i1 false, metadata !"", i32 0, metadata !21, metadata !21, metadata !19, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !20, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00__OpenCL_nbt02_kernel\00__OpenCL_nbt02_kernel\00__OpenCL_nbt02_kernel\002\000\001\000\006\000\000\000", metadata !20, metadata !1, metadata !3, null, void (i32 addrspace(1)*)* @__OpenCL_nbt02_kernel, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_nbt02_kernel]
+!1 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\00clc\000\00\000\00\001", metadata !20, metadata !21, metadata !21, metadata !19, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null, metadata !5}
-!5 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
-!6 = metadata !{i32 589846, metadata !20, metadata !2, metadata !"uint", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786689, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!5 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !6} ; [ DW_TAG_pointer_type ]
+!6 = metadata !{metadata !"0x16\00uint\000\000\000\000\000", metadata !20, metadata !2, metadata !7} ; [ DW_TAG_typedef ]
+!7 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x101\00ip\001\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
 !9 = metadata !{i32 1, i32 32, metadata !0, null}
-!10 = metadata !{i32 786688, metadata !11, metadata !"tid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !1, metadata !0, i32 2, i32 1, i32 1} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0x100\00tid\003\000", metadata !11, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\002\001\001", metadata !1, metadata !0} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{i32 5, i32 24, metadata !11, null}
-!13 = metadata !{i32 786688, metadata !11, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!13 = metadata !{metadata !"0x100\00gid\003\000", metadata !11, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
 !14 = metadata !{i32 6, i32 25, metadata !11, null}
-!15 = metadata !{i32 786688, metadata !11, metadata !"lsz", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0x100\00lsz\003\000", metadata !11, metadata !1, metadata !6} ; [ DW_TAG_auto_variable ]
 !16 = metadata !{i32 7, i32 26, metadata !11, null}
 !17 = metadata !{i32 9, i32 24, metadata !11, null}
 !18 = metadata !{i32 10, i32 1, metadata !0, null}
 !19 = metadata !{metadata !0}
 !20 = metadata !{metadata !"OCLlLwTXZ.cl", metadata !"/tmp"}
 !21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-value-location.ll b/test/DebugInfo/X86/dbg-value-location.ll
index 55d1ae6..1bfb28f 100644
--- a/test/DebugInfo/X86/dbg-value-location.ll
+++ b/test/DebugInfo/X86/dbg-value-location.ll

@@ -14,11 +14,11 @@
 
 @dfm = external global i32, align 4
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @foo(i32 %dev, i64 %cmd, i8* %data, i32 %data2) nounwind optsize ssp {
 entry:
-  call void @llvm.dbg.value(metadata !{i32 %dev}, i64 0, metadata !12), !dbg !13
+  call void @llvm.dbg.value(metadata !{i32 %dev}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !13
   %tmp.i = load i32* @dfm, align 4, !dbg !14
   %cmp.i = icmp eq i32 %tmp.i, 0, !dbg !14
   br i1 %cmp.i, label %if.else, label %if.end.i, !dbg !14
@@ -45,35 +45,35 @@
 declare hidden fastcc i32 @bar(i32, i32* nocapture) nounwind optsize ssp
 declare hidden fastcc i32 @bar2(i32) nounwind optsize ssp
 declare hidden fastcc i32 @bar3(i32) nounwind optsize ssp
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 19510, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i64, i8*, i32)* @foo, null, null, null, i32 19510} ; [ DW_TAG_subprogram ] [line 19510] [def] [foo]
-!1 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !27, i32 12, metadata !"clang version 2.9 (trunk 124753)", i1 true, metadata !"", i32 0, metadata !28, metadata !28, metadata !24, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !26, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\0019510\000\001\000\006\00256\001\0019510", metadata !26, metadata !1, metadata !3, null, i32 (i32, i64, i8*, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 19510] [def] [foo]
+!1 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 124753)\001\00\000\00\000", metadata !27, metadata !28, metadata !28, metadata !24, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !26, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar3", metadata !"bar3", metadata !"", i32 14827, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @bar3, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 14827] [local] [def] [scope 0] [bar3]
-!7 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar2", metadata !"bar2", metadata !"", i32 15397, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @bar2, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 15397] [local] [def] [scope 0] [bar2]
-!8 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar", metadata !"bar", metadata !"", i32 12382, metadata !9, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32*)* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 12382] [local] [def] [scope 0] [bar]
-!9 = metadata !{i32 786453, metadata !26, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00bar3\00bar3\00\0014827\001\001\000\006\00256\001\000", metadata !26, metadata !1, metadata !3, null, i32 (i32)* @bar3, null, null, null} ; [ DW_TAG_subprogram ] [line 14827] [local] [def] [scope 0] [bar3]
+!7 = metadata !{metadata !"0x2e\00bar2\00bar2\00\0015397\001\001\000\006\00256\001\000", metadata !26, metadata !1, metadata !3, null, i32 (i32)* @bar2, null, null, null} ; [ DW_TAG_subprogram ] [line 15397] [local] [def] [scope 0] [bar2]
+!8 = metadata !{metadata !"0x2e\00bar\00bar\00\0012382\001\001\000\006\00256\001\000", metadata !26, metadata !1, metadata !9, null, i32 (i32, i32*)* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 12382] [local] [def] [scope 0] [bar]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !26, metadata !1, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!12 = metadata !{i32 786689, metadata !0, metadata !"var", metadata !1, i32 19509, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", null, metadata !2} ; [ DW_TAG_base_type ]
+!12 = metadata !{metadata !"0x101\00var\0019509\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
 !13 = metadata !{i32 19509, i32 20, metadata !0, null}
 !14 = metadata !{i32 18091, i32 2, metadata !15, metadata !17}
-!15 = metadata !{i32 786443, metadata !26, metadata !16, i32 18086, i32 1, i32 748} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"foo_bar", metadata !"foo_bar", metadata !"", i32 18086, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 18086] [local] [def] [scope 0] [foo_bar]
+!15 = metadata !{metadata !"0xb\0018086\001\00748", metadata !26, metadata !16} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x2e\00foo_bar\00foo_bar\00\0018086\001\001\000\006\00256\001\000", metadata !26, metadata !1, metadata !3, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 18086] [local] [def] [scope 0] [foo_bar]
 !17 = metadata !{i32 19514, i32 2, metadata !18, null}
-!18 = metadata !{i32 786443, metadata !26, metadata !0, i32 19510, i32 1, i32 99} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{metadata !"0xb\0019510\001\0099", metadata !26, metadata !0} ; [ DW_TAG_lexical_block ]
 !22 = metadata !{i32 18094, i32 2, metadata !15, metadata !17}
 !23 = metadata !{i32 19524, i32 1, metadata !18, null}
 !24 = metadata !{metadata !0, metadata !6, metadata !7, metadata !8, metadata !16}
-!25 = metadata !{i32 786473, metadata !27} ; [ DW_TAG_file_type ]
+!25 = metadata !{metadata !"0x29", metadata !27} ; [ DW_TAG_file_type ]
 !26 = metadata !{metadata !"/tmp/f.c", metadata !"/tmp"}
 !27 = metadata !{metadata !"f.i", metadata !"/tmp"}
 !28 = metadata !{i32 0}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-value-range.ll b/test/DebugInfo/X86/dbg-value-range.ll
index d9e7a63..aa75369 100644
--- a/test/DebugInfo/X86/dbg-value-range.ll
+++ b/test/DebugInfo/X86/dbg-value-range.ll

@@ -4,10 +4,10 @@
 
 define i32 @bar(%struct.a* nocapture %b) nounwind ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.a* %b}, i64 0, metadata !6), !dbg !13
+  tail call void @llvm.dbg.value(metadata !{%struct.a* %b}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !13
   %tmp1 = getelementptr inbounds %struct.a* %b, i64 0, i32 0, !dbg !14
   %tmp2 = load i32* %tmp1, align 4, !dbg !14
-  tail call void @llvm.dbg.value(metadata !{i32 %tmp2}, i64 0, metadata !11), !dbg !14
+  tail call void @llvm.dbg.value(metadata !{i32 %tmp2}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !14
   %call = tail call i32 (...)* @foo(i32 %tmp2) nounwind , !dbg !18
   %add = add nsw i32 %tmp2, 1, !dbg !19
   ret i32 %add, !dbg !19
@@ -15,24 +15,24 @@
 
 declare i32 @foo(...) 
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!24}
 
-!0 = metadata !{i32 786478, metadata !22, metadata !1, metadata !"bar", metadata !"bar", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.a*)* @bar, null, null, metadata !21, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [bar]
-!1 = metadata !{i32 786473, metadata !22} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !22, i32 12, metadata !"clang version 2.9 (trunk 122997)", i1 true, metadata !"", i32 0, metadata !23, metadata !23, metadata !20, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !22, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00bar\00bar\00\005\000\001\000\006\00256\001\000", metadata !22, metadata !1, metadata !3, null, i32 (%struct.a*)* @bar, null, null, metadata !21} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [bar]
+!1 = metadata !{metadata !"0x29", metadata !22} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 122997)\001\00\000\00\001", metadata !22, metadata !23, metadata !23, metadata !20, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !22, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !0, metadata !"b", metadata !1, i32 5, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !22, metadata !2, metadata !"a", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00b\005\000", metadata !0, metadata !1, metadata !7} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !8} ; [ DW_TAG_pointer_type ]
+!8 = metadata !{metadata !"0x13\00a\001\0032\0032\000\000\000", metadata !22, metadata !2, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 1, size 32, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786445, metadata !22, metadata !1, metadata !"c", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
-!11 = metadata !{i32 786688, metadata !12, metadata !"x", metadata !1, i32 6, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!12 = metadata !{i32 786443, metadata !22, metadata !0, i32 5, i32 22, i32 0} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0xd\00c\002\0032\0032\000\000", metadata !22, metadata !1, metadata !5} ; [ DW_TAG_member ]
+!11 = metadata !{metadata !"0x100\00x\006\000", metadata !12, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!12 = metadata !{metadata !"0xb\005\0022\000", metadata !22, metadata !0} ; [ DW_TAG_lexical_block ]
 !13 = metadata !{i32 5, i32 19, metadata !0, null}
 !14 = metadata !{i32 6, i32 14, metadata !12, null}
 !18 = metadata !{i32 7, i32 2, metadata !12, null}
@@ -51,8 +51,10 @@
 ;CHECK-NEXT: [[CLOBBER:Ltmp[0-9]*]]
 
 ;CHECK:Ldebug_loc0:
+;CHECK-NEXT: Lset{{.*}} =
 ;CHECK-NEXT:	.quad
-;CHECK-NEXT:	.quad	[[CLOBBER]]
+;CHECK-NEXT: [[CLOBBER_OFF:Lset.*]] = [[CLOBBER]]-{{.*}}
+;CHECK-NEXT:	.quad	[[CLOBBER_OFF]]
 ;CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}
 ;CHECK-NEXT:    .short  Lset
 ;CHECK-NEXT: Ltmp
@@ -60,4 +62,4 @@
 ;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.quad	0
 ;CHECK-NEXT:	.quad	0
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg-value-terminator.ll b/test/DebugInfo/X86/dbg-value-terminator.ll
index 974e0ad..763034d 100644
--- a/test/DebugInfo/X86/dbg-value-terminator.ll
+++ b/test/DebugInfo/X86/dbg-value-terminator.ll

@@ -87,7 +87,7 @@
 "44.i":                                           ; preds = %"42.i"
   %2 = load %a** undef, align 8, !dbg !12
   %3 = bitcast %a* %2 to %a*, !dbg !12
-  call void @llvm.dbg.value(metadata !{%a* %3}, i64 0, metadata !6), !dbg !12
+  call void @llvm.dbg.value(metadata !{%a* %3}, i64 0, metadata !6, metadata !{metadata !"0x102"}), !dbg !12
   br label %may_unswitch_on.exit, !dbg !12
 
 "45.i":                                           ; preds = %"38.i"
@@ -108,26 +108,26 @@
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind uwtable }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22}
 
-!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, %a* ()* @test, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00Apple clang version\001\00\000\00\001", metadata !20, metadata !21, metadata !21, metadata !18, null,  null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00\00\002\000\001\000\006\00256\001\000", metadata !20, metadata !2, metadata !3, null, %a* ()* @test, null, null, metadata !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554434, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{i32 786468, null, metadata !0, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786688, metadata !11, metadata !"a", metadata !2, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !20, metadata !1, i32 2, i32 25, i32 0} ; [ DW_TAG_lexical_block ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{metadata !"0x101\00c\0033554434\000", metadata !1, metadata !2, metadata !8} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !0, metadata !9} ; [ DW_TAG_pointer_type ]
+!9 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, metadata !0} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x100\00a\003\000", metadata !11, metadata !2, metadata !9} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\002\0025\000", metadata !20, metadata !1} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{i32 2, i32 13, metadata !1, null}
 !18 = metadata !{metadata !1}
 !19 = metadata !{metadata !6, metadata !7, metadata !10}
 !20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dbg_value_direct.ll b/test/DebugInfo/X86/dbg_value_direct.ll
index db947ac..edc42c0 100644
--- a/test/DebugInfo/X86/dbg_value_direct.ll
+++ b/test/DebugInfo/X86/dbg_value_direct.ll

@@ -53,7 +53,7 @@
   %19 = inttoptr i64 %18 to i8*
   %20 = load i8* %19
   %21 = icmp ne i8 %20, 0
-  call void @llvm.dbg.declare(metadata !{i32* %3}, metadata !23)
+  call void @llvm.dbg.declare(metadata !{i32* %3}, metadata !23, metadata !28)
   br i1 %21, label %22, label %28
 
 ; <label>:22                                      ; preds = %entry
@@ -70,7 +70,7 @@
 
 ; <label>:28                                      ; preds = %22, %entry
   store i32 %0, i32* %3, align 4
-  call void @llvm.dbg.declare(metadata !{%struct.A* %agg.result}, metadata !24), !dbg !25
+  call void @llvm.dbg.declare(metadata !{%struct.A* %agg.result}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
   call void @_ZN1AC1Ev(%struct.A* %agg.result), !dbg !25
   store i64 1172321806, i64* %4, !dbg !26
   %29 = inttoptr i64 %10 to i32*, !dbg !26
@@ -85,7 +85,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare void @_ZN1AC1Ev(%struct.A*) #2
 
@@ -147,32 +147,32 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!22, !27}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/crash.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/crash.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"crash.cpp", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_Z4funci", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.A*, i32)* @_Z4funci, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/crash.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\006\000\001\000\006\00256\000\006", metadata !1, metadata !5, metadata !6, null, void (%struct.A*, i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/crash.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !21}
-!8 = metadata !{i32 786451, metadata !1, null, metadata !"A", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
+!8 = metadata !{metadata !"0x13\00A\001\008\008\000\000\000", metadata !1, null, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !15}
-!10 = metadata !{i32 786478, metadata !1, metadata !8, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !14, i32 2} ; [ DW_TAG_subprogram ] [line 2] [A]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x2e\00A\00A\00\002\000\000\000\006\00256\000\002", metadata !1, metadata !8, metadata !11, null, null, null, i32 0, metadata !14} ; [ DW_TAG_subprogram ] [line 2] [A]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
 !14 = metadata !{i32 786468}
-!15 = metadata !{i32 786478, metadata !1, metadata !8, metadata !"A", metadata !"A", metadata !"", i32 3, metadata !16, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !20, i32 3} ; [ DW_TAG_subprogram ] [line 3] [A]
-!16 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x2e\00A\00A\00\003\000\000\000\006\00256\000\003", metadata !1, metadata !8, metadata !16, null, null, null, i32 0, metadata !20} ; [ DW_TAG_subprogram ] [line 3] [A]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null, metadata !13, metadata !18}
-!18 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !19} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!19 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from A]
+!18 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !19} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from A]
 !20 = metadata !{i32 786468}
-!21 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!21 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !22 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!23 = metadata !{i32 786689, metadata !4, metadata !"", metadata !5, i32 16777222, metadata !21, i32 0, i32 0, metadata !28} ; [ DW_TAG_arg_variable ] [line 6]
-!24 = metadata !{i32 786688, metadata !4, metadata !"a", metadata !5, i32 7, metadata !8, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 7]
+!23 = metadata !{metadata !"0x101\00\0016777222\000", metadata !4, metadata !5, metadata !21} ; [ DW_TAG_arg_variable ] [line 6]
+!24 = metadata !{metadata !"0x100\00a\007\008192", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [a] [line 7]
 !25 = metadata !{i32 7, i32 0, metadata !4, null}
-!26 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!28 = metadata !{i64 2}
+!26 = metadata !{i32 8, i32 0, metadata !4, null}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!28 = metadata !{metadata !"0x102\006"} ; [ DW_TAG_expression ] [DW_OP_deref]

diff --git a/test/DebugInfo/X86/debug-dead-local-var.ll b/test/DebugInfo/X86/debug-dead-local-var.ll
index 64f0b2a..08a22a6 100644
--- a/test/DebugInfo/X86/debug-dead-local-var.ll
+++ b/test/DebugInfo/X86/debug-dead-local-var.ll

@@ -27,25 +27,25 @@
 !llvm.module.flags = !{!18, !19}
 !llvm.ident = !{!20}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 (trunk 209255) (llvm/trunk 209253)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/debug-dead-local-var.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 209255) (llvm/trunk 209253)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/debug-dead-local-var.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"debug-dead-local-var.c", metadata !"/usr/local/google/home/echristo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 11, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @bar, null, null, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/debug-dead-local-var.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00bar\00bar\00\0011\000\001\000\006\000\001\0011", metadata !1, metadata !5, metadata !6, null, i32 ()* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [bar]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/debug-dead-local-var.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 6, metadata !10, i1 true, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, metadata !12, i32 6} ; [ DW_TAG_subprogram ] [line 6] [local] [def] [foo]
-!10 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x2e\00foo\00foo\00\006\001\001\000\006\000\001\006", metadata !1, metadata !5, metadata !10, null, null, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 6] [local] [def] [foo]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{null}
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786688, metadata !9, metadata !"xyz", metadata !5, i32 8, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xyz] [line 8]
-!14 = metadata !{i32 786451, metadata !1, metadata !9, metadata !"X", i32 8, i64 64, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 8, size 64, align 32, offset 0] [def] [from ]
+!13 = metadata !{metadata !"0x100\00xyz\008\000", metadata !9, metadata !5, metadata !14} ; [ DW_TAG_auto_variable ] [xyz] [line 8]
+!14 = metadata !{metadata !"0x13\00X\008\0064\0032\000\000\000", metadata !1, metadata !9, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 8, size 64, align 32, offset 0] [def] [from ]
 !15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{i32 786445, metadata !1, metadata !14, metadata !"a", i32 8, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 8, size 32, align 32, offset 0] [from int]
-!17 = metadata !{i32 786445, metadata !1, metadata !14, metadata !"b", i32 8, i64 32, i64 32, i64 32, i32 0, metadata !8} ; [ DW_TAG_member ] [b] [line 8, size 32, align 32, offset 32] [from int]
+!16 = metadata !{metadata !"0xd\00a\008\0032\0032\000\000", metadata !1, metadata !14, metadata !8} ; [ DW_TAG_member ] [a] [line 8, size 32, align 32, offset 0] [from int]
+!17 = metadata !{metadata !"0xd\00b\008\0032\0032\0032\000", metadata !1, metadata !14, metadata !8} ; [ DW_TAG_member ] [b] [line 8, size 32, align 32, offset 32] [from int]
 !18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !20 = metadata !{metadata !"clang version 3.5.0 (trunk 209255) (llvm/trunk 209253)"}
 !21 = metadata !{i32 13, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/debug-info-access.ll b/test/DebugInfo/X86/debug-info-access.ll
new file mode 100644
index 0000000..952330c
--- /dev/null
+++ b/test/DebugInfo/X86/debug-info-access.ll

@@ -0,0 +1,150 @@
+; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
+;
+; Test the DW_AT_accessibility DWARF attribute.
+;
+;
+; Regenerate me:
+; clang++ -g tools/clang/test/CodeGenCXX/debug-info-access.cpp -S -emit-llvm -o -
+;
+;   struct A {
+;     void pub_default();
+;     static int pub_default_static;
+;   };
+;
+;   class B : public A {
+;   public:
+;     void pub();
+;     static int public_static;
+;   protected:
+;     void prot();
+;   private:
+;     void priv_default();
+;   };
+;
+;   union U {
+;     void union_pub_default();
+;   private:
+;     int union_priv;
+;   };
+;
+;   void free() {}
+;
+;   A a;
+;   B b;
+;   U u;
+
+; CHECK: DW_TAG_member
+; CHECK:     DW_AT_name {{.*}}"pub_default_static")
+; CHECK-NOT: DW_AT_accessibility
+; CHECK-NOT: DW_TAG
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:     DW_AT_name {{.*}}"pub_default")
+; CHECK-NOT: DW_AT_accessibility
+; CHECK: DW_TAG
+;
+; CHECK: DW_TAG_inheritance
+; CHECK-NOT: DW_TAG
+; CHECK:     DW_AT_accessibility {{.*}}(DW_ACCESS_public)
+;
+; CHECK: DW_TAG_member
+; CHECK:     DW_AT_name {{.*}}"public_static")
+; CHECK-NOT: DW_TAG
+; CHECK:     DW_AT_accessibility {{.*}}(DW_ACCESS_public)
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:     DW_AT_name {{.*}}"pub")
+; CHECK-NOT: DW_TAG
+; CHECK:     DW_AT_accessibility {{.*}}(DW_ACCESS_public)
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:     DW_AT_name {{.*}}"prot")
+; CHECK-NOT: DW_TAG
+; CHECK:     DW_AT_accessibility {{.*}}(DW_ACCESS_protected)
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:     DW_AT_name {{.*}}"priv_default")
+; CHECK-NOT: DW_AT_accessibility
+; CHECK: DW_TAG
+;
+; CHECK: DW_TAG_member
+; CHECK:     DW_AT_name {{.*}}"union_priv")
+; CHECK-NOT: DW_TAG
+; CHECK:     DW_AT_accessibility {{.*}}(DW_ACCESS_private)
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:     DW_AT_name {{.*}}"union_pub_default")
+; CHECK-NOT: DW_AT_accessibility
+; CHECK: DW_TAG
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:     DW_AT_name {{.*}}"free")
+; CHECK-NOT: DW_AT_accessibility
+; CHECK-NOT: DW_TAG
+;
+; ModuleID = '/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+%struct.A = type { i8 }
+%class.B = type { i8 }
+%union.U = type { i32 }
+
+@a = global %struct.A zeroinitializer, align 1
+@b = global %class.B zeroinitializer, align 1
+@u = global %union.U zeroinitializer, align 4
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z4freev() #0 {
+  ret void, !dbg !41
+}
+
+attributes #0 = { nounwind ssp uwtable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!38, !39}
+!llvm.ident = !{!40}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !29, metadata !34, metadata !2} ; [ DW_TAG_compile_unit ] [/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !12, metadata !22}
+!4 = metadata !{metadata !"0x13\00A\003\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !6, metadata !8}
+!6 = metadata !{metadata !"0xd\00pub_default_static\007\000\000\000\004096", metadata !1, metadata !"_ZTS1A", metadata !7, null} ; [ DW_TAG_member ] [pub_default_static] [line 7, size 0, align 0, offset 0] [static] [from int]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x2e\00pub_default\00pub_default\00_ZN1A11pub_defaultEv\005\000\000\000\006\00256\000\005", metadata !1, metadata !"_ZTS1A", metadata !9, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [pub_default]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{null, metadata !11}
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!12 = metadata !{metadata !"0x2\00B\0011\008\008\000\000\000", metadata !1, null, null, metadata !13, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 11, size 8, align 8, offset 0] [def] [from ]
+!13 = metadata !{metadata !14, metadata !15, metadata !16, metadata !20, metadata !21}
+!14 = metadata !{metadata !"0x1c\00\000\000\000\000\003", null, metadata !"_ZTS1B", metadata !"_ZTS1A"} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [public] [from _ZTS1A]
+!15 = metadata !{metadata !"0xd\00public_static\0016\000\000\000\004099", metadata !1, metadata !"_ZTS1B", metadata !7, null} ; [ DW_TAG_member ] [public_static] [line 16, size 0, align 0, offset 0] [public] [static] [from int]
+!16 = metadata !{metadata !"0x2e\00pub\00pub\00_ZN1B3pubEv\0014\000\000\000\006\00259\000\0014", metadata !1, metadata !"_ZTS1B", metadata !17, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 14] [public] [pub]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{null, metadata !19}
+!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!20 = metadata !{metadata !"0x2e\00prot\00prot\00_ZN1B4protEv\0019\000\000\000\006\00258\000\0019", metadata !1, metadata !"_ZTS1B", metadata !17, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 19] [protected] [prot]
+!21 = metadata !{metadata !"0x2e\00priv_default\00priv_default\00_ZN1B12priv_defaultEv\0022\000\000\000\006\00256\000\0022", metadata !1, metadata !"_ZTS1B", metadata !17, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 22] [priv_default]
+!22 = metadata !{metadata !"0x17\00U\0025\0032\0032\000\000\000", metadata !1, null, null, metadata !23, null, null, metadata !"_ZTS1U"} ; [ DW_TAG_union_type ] [U] [line 25, size 32, align 32, offset 0] [def] [from ]
+!23 = metadata !{metadata !24, metadata !25}
+!24 = metadata !{metadata !"0xd\00union_priv\0030\0032\0032\000\001", metadata !1, metadata !"_ZTS1U", metadata !7} ; [ DW_TAG_member ] [union_priv] [line 30, size 32, align 32, offset 0] [private] [from int]
+!25 = metadata !{metadata !"0x2e\00union_pub_default\00union_pub_default\00_ZN1U17union_pub_defaultEv\0027\000\000\000\006\00256\000\0027", metadata !1, metadata !"_ZTS1U", metadata !26, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 27] [union_pub_default]
+!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = metadata !{null, metadata !28}
+!28 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1U"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1U]
+!29 = metadata !{metadata !30}
+!30 = metadata !{metadata !"0x2e\00free\00free\00_Z4freev\0035\000\001\000\006\00256\000\0035", metadata !1, metadata !31, metadata !32, null, void ()* @_Z4freev, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 35] [def] [free]
+!31 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/llvm/tools/clang/test/CodeGenCXX/debug-info-access.cpp]
+!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = metadata !{null}
+!34 = metadata !{metadata !35, metadata !36, metadata !37}
+!35 = metadata !{metadata !"0x34\00a\00a\00\0037\000\001", null, metadata !31, metadata !"_ZTS1A", %struct.A* @a, null} ; [ DW_TAG_variable ] [a] [line 37] [def]
+!36 = metadata !{metadata !"0x34\00b\00b\00\0038\000\001", null, metadata !31, metadata !"_ZTS1B", %class.B* @b, null} ; [ DW_TAG_variable ] [b] [line 38] [def]
+!37 = metadata !{metadata !"0x34\00u\00u\00\0039\000\001", null, metadata !31, metadata !"_ZTS1U", %union.U* @u, null} ; [ DW_TAG_variable ] [u] [line 39] [def]
+!38 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!39 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!40 = metadata !{metadata !"clang version 3.6.0 "}
+!41 = metadata !{i32 35, i32 14, metadata !30, null}

diff --git a/test/DebugInfo/X86/debug-info-block-captured-self.ll b/test/DebugInfo/X86/debug-info-block-captured-self.ll
index 95eda60..d610aa6 100644
--- a/test/DebugInfo/X86/debug-info-block-captured-self.ll
+++ b/test/DebugInfo/X86/debug-info-block-captured-self.ll

@@ -63,50 +63,50 @@
 ; ModuleID = 'llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m'
 %0 = type opaque
 %struct.__block_descriptor = type { i64, i64 }
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 define internal void @"__24-[Main initWithContext:]_block_invoke"(i8* %.block_descriptor, i8* %obj) #0 {
   %block = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !84
   %block.captured-self = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i32 0, i32 5, !dbg !84
-  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block}, metadata !86), !dbg !87
+  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block}, metadata !86, metadata !110), !dbg !87
   ret void, !dbg !87
 }
 
 define internal void @"__24-[Main initWithContext:]_block_invoke_2"(i8* %.block_descriptor, i8* %object) #0 {
   %block = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !103
   %block.captured-self = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i32 0, i32 5, !dbg !103
-  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block}, metadata !105), !dbg !106
+  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block}, metadata !105, metadata !109), !dbg !106
   ret void, !dbg !106
 }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!108}
-!0 = metadata !{i32 786449, metadata !107, i32 16, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 2, metadata !2, metadata !4, metadata !23, metadata !15,  metadata !15, metadata !""} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m] [DW_LANG_ObjC]
-!1 = metadata !{i32 786473, metadata !107} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x11\0016\00clang version 3.3 \000\00\002\00\000", metadata !107, metadata !2, metadata !4, metadata !23, metadata !15,  metadata !15} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m] [DW_LANG_ObjC]
+!1 = metadata !{metadata !"0x29", metadata !107} ; [ DW_TAG_file_type ]
 !2 = metadata !{metadata !3}
-!3 = metadata !{i32 786436, metadata !107, null, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!3 = metadata !{metadata !"0x4\00\0020\0032\0032\000\000\000", metadata !107, null, null, metadata !4, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
 !4 = metadata !{}
 !15 = metadata !{}
 !23 = metadata !{metadata !38, metadata !42}
-!27 = metadata !{i32 786454, metadata !107, null, metadata !"id", i32 31, i64 0, i64 0, i64 0, i32 0, metadata !28} ; [ DW_TAG_typedef ] [id] [line 31, size 0, align 0, offset 0] [from ]
-!28 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
-!29 = metadata !{i32 786451, metadata !107, null, metadata !"objc_object", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
+!27 = metadata !{metadata !"0x16\00id\0031\000\000\000\000", metadata !107, null, metadata !28} ; [ DW_TAG_typedef ] [id] [line 31, size 0, align 0, offset 0] [from ]
+!28 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
+!29 = metadata !{metadata !"0x13\00objc_object\000\000\000\000\000\000", metadata !107, null, null, metadata !30, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
 !30 = metadata !{metadata !31}
-!31 = metadata !{i32 786445, metadata !107, metadata !29, metadata !"isa", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
-!32 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
-!33 = metadata !{i32 786451, metadata !107, null, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!34 = metadata !{i32 786451, metadata !107, null, metadata !"Main", i32 23, i64 0, i64 0, i32 0, i32 1092, null, i32 0, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [Main] [line 23, size 0, align 0, offset 0] [artificial] [decl] [from ]
-!38 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"__24-[Main initWithContext:]_block_invoke", metadata !"__24-[Main initWithContext:]_block_invoke", metadata !"", i32 33, metadata !39, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke", null, null, metadata !15, i32 33} ; [ DW_TAG_subprogram ] [line 33] [local] [def] [__24-[Main initWithContext:]_block_invoke]
-!39 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !40, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!31 = metadata !{metadata !"0xd\00isa\000\0064\000\000\000", metadata !107, metadata !29, metadata !32} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
+!32 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !33} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
+!33 = metadata !{metadata !"0x13\00objc_class\000\000\000\000\004\000", metadata !107, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!34 = metadata !{metadata !"0x13\00Main\0023\000\000\000\001092\0016", metadata !107, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Main] [line 23, size 0, align 0, offset 0] [artificial] [decl] [from ]
+!38 = metadata !{metadata !"0x2e\00__24-[Main initWithContext:]_block_invoke\00__24-[Main initWithContext:]_block_invoke\00\0033\001\001\000\006\00256\000\0033", metadata !1, metadata !1, metadata !39, null, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke", null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 33] [local] [def] [__24-[Main initWithContext:]_block_invoke]
+!39 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !40, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !40 = metadata !{null, metadata !41, metadata !27}
-!41 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!42 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"__24-[Main initWithContext:]_block_invoke_2", metadata !"__24-[Main initWithContext:]_block_invoke_2", metadata !"", i32 35, metadata !39, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke_2", null, null, metadata !15, i32 35} ; [ DW_TAG_subprogram ] [line 35] [local] [def] [__24-[Main initWithContext:]_block_invoke_2]
+!41 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!42 = metadata !{metadata !"0x2e\00__24-[Main initWithContext:]_block_invoke_2\00__24-[Main initWithContext:]_block_invoke_2\00\0035\001\001\000\006\00256\000\0035", metadata !1, metadata !1, metadata !39, null, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke_2", null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 35] [local] [def] [__24-[Main initWithContext:]_block_invoke_2]
 !84 = metadata !{i32 33, i32 0, metadata !38, null}
-!86 = metadata !{i32 786688, metadata !38, metadata !"self", metadata !1, i32 41, metadata !34, i32 0, i32 0, metadata !110} ; [ DW_TAG_auto_variable ] [self] [line 41]
+!86 = metadata !{metadata !"0x100\00self\0041\000", metadata !38, metadata !1, metadata !34} ; [ DW_TAG_auto_variable ] [self] [line 41]
 !87 = metadata !{i32 41, i32 0, metadata !38, null}
 !103 = metadata !{i32 35, i32 0, metadata !42, null}
-!105 = metadata !{i32 786688, metadata !42, metadata !"self", metadata !1, i32 40, metadata !34, i32 0, i32 0, metadata !109} ; [ DW_TAG_auto_variable ] [self] [line 40]
+!105 = metadata !{metadata !"0x100\00self\0040\000", metadata !42, metadata !1, metadata !34} ; [ DW_TAG_auto_variable ] [self] [line 40]
 !106 = metadata !{i32 40, i32 0, metadata !42, null}
 !107 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m", metadata !""}
-!108 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!109 = metadata !{i64 1, i64 32}
-!110 = metadata !{i64 1, i64 32}
+!108 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!109 = metadata !{metadata !"0x102\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_plus 32]
+!110 = metadata !{metadata !"0x102\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_plus 32]

diff --git a/test/DebugInfo/X86/debug-info-blocks.ll b/test/DebugInfo/X86/debug-info-blocks.ll
index 8a1a125..9f6ed5c 100644
--- a/test/DebugInfo/X86/debug-info-blocks.ll
+++ b/test/DebugInfo/X86/debug-info-blocks.ll

@@ -101,9 +101,9 @@
   %3 = alloca %struct._objc_super
   %4 = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>, align 8
   store %0* %self, %0** %1, align 8
-  call void @llvm.dbg.declare(metadata !{%0** %1}, metadata !60), !dbg !62
+  call void @llvm.dbg.declare(metadata !{%0** %1}, metadata !60, metadata !{metadata !"0x102"}), !dbg !62
   store i8* %_cmd, i8** %2, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %2}, metadata !63), !dbg !62
+  call void @llvm.dbg.declare(metadata !{i8** %2}, metadata !63, metadata !{metadata !"0x102"}), !dbg !62
   %5 = load %0** %1, !dbg !65
   %6 = bitcast %0* %5 to i8*, !dbg !65
   %7 = getelementptr inbounds %struct._objc_super* %3, i32 0, i32 0, !dbg !65
@@ -143,14 +143,14 @@
   ret i8* %26, !dbg !71
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare i8* @objc_msgSendSuper2(%struct._objc_super*, i8*, ...)
 
 define internal void @run(void ()* %block) #0 {
   %1 = alloca void ()*, align 8
   store void ()* %block, void ()** %1, align 8
-  call void @llvm.dbg.declare(metadata !{void ()** %1}, metadata !72), !dbg !73
+  call void @llvm.dbg.declare(metadata !{void ()** %1}, metadata !72, metadata !{metadata !"0x102"}), !dbg !73
   %2 = load void ()** %1, align 8, !dbg !74
   %3 = bitcast void ()* %2 to %struct.__block_literal_generic*, !dbg !74
   %4 = getelementptr inbounds %struct.__block_literal_generic* %3, i32 0, i32 3, !dbg !74
@@ -167,13 +167,13 @@
   %d = alloca %1*, align 8
   store i8* %.block_descriptor, i8** %1, align 8
   %3 = load i8** %1
-  call void @llvm.dbg.value(metadata !{i8* %3}, i64 0, metadata !76), !dbg !88
-  call void @llvm.dbg.declare(metadata !{i8* %.block_descriptor}, metadata !76), !dbg !88
+  call void @llvm.dbg.value(metadata !{i8* %3}, i64 0, metadata !76, metadata !{metadata !"0x102"}), !dbg !88
+  call void @llvm.dbg.declare(metadata !{i8* %.block_descriptor}, metadata !76, metadata !{metadata !"0x102"}), !dbg !88
   %4 = bitcast i8* %.block_descriptor to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !88
   store <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %4, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>** %2, align 8, !dbg !88
   %5 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %4, i32 0, i32 5, !dbg !88
-  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>** %2}, metadata !89), !dbg !90
-  call void @llvm.dbg.declare(metadata !{%1** %d}, metadata !91), !dbg !100
+  call void @llvm.dbg.declare(metadata !{<{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>** %2}, metadata !89, metadata !111), !dbg !90
+  call void @llvm.dbg.declare(metadata !{%1** %d}, metadata !91, metadata !{metadata !"0x102"}), !dbg !100
   %6 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", !dbg !100
   %7 = bitcast %struct._class_t* %6 to i8*, !dbg !100
   %8 = load i8** getelementptr inbounds (%struct._message_ref_t* bitcast ({ i8* (i8*, %struct._message_ref_t*, ...)*, i8* }* @"\01l_objc_msgSend_fixup_alloc" to %struct._message_ref_t*), i32 0, i32 0), !dbg !100
@@ -200,7 +200,7 @@
   ret void, !dbg !90
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 declare i8* @objc_msgSend_fixup(i8*, %struct._message_ref_t*, ...)
 
@@ -210,9 +210,9 @@
   %3 = alloca i8*, align 8
   %4 = alloca i8*, align 8
   store i8* %0, i8** %3, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %3}, metadata !102), !dbg !103
+  call void @llvm.dbg.declare(metadata !{i8** %3}, metadata !102, metadata !{metadata !"0x102"}), !dbg !103
   store i8* %1, i8** %4, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %4}, metadata !104), !dbg !103
+  call void @llvm.dbg.declare(metadata !{i8** %4}, metadata !104, metadata !{metadata !"0x102"}), !dbg !103
   %5 = load i8** %4, !dbg !103
   %6 = bitcast i8* %5 to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !103
   %7 = load i8** %3, !dbg !103
@@ -231,7 +231,7 @@
 define internal void @__destroy_helper_block_(i8*) {
   %2 = alloca i8*, align 8
   store i8* %0, i8** %2, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %2}, metadata !105), !dbg !106
+  call void @llvm.dbg.declare(metadata !{i8** %2}, metadata !105, metadata !{metadata !"0x102"}), !dbg !106
   %3 = load i8** %2, !dbg !106
   %4 = bitcast i8* %3 to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>*, !dbg !106
   %5 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %4, i32 0, i32 5, !dbg !106
@@ -247,7 +247,7 @@
   %1 = alloca i32, align 4
   %a = alloca %0*, align 8
   store i32 0, i32* %1
-  call void @llvm.dbg.declare(metadata !{%0** %a}, metadata !107), !dbg !108
+  call void @llvm.dbg.declare(metadata !{%0** %a}, metadata !107, metadata !{metadata !"0x102"}), !dbg !108
   %2 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_5", !dbg !108
   %3 = bitcast %struct._class_t* %2 to i8*, !dbg !108
   %4 = load i8** getelementptr inbounds (%struct._message_ref_t* bitcast ({ i8* (i8*, %struct._message_ref_t*, ...)*, i8* }* @"\01l_objc_msgSend_fixup_alloc" to %struct._message_ref_t*), i32 0, i32 0), !dbg !108
@@ -270,115 +270,115 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!56, !57, !58, !59, !110}
 
-!0 = metadata !{i32 786449, metadata !1, i32 16, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 2, metadata !2, metadata !3, metadata !12, metadata !2,  metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/<unknown>] [DW_LANG_ObjC]
+!0 = metadata !{metadata !"0x11\0016\00clang version 3.3 \000\00\002\00\001", metadata !1, metadata !2, metadata !3, metadata !12, metadata !2,  metadata !2} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/<unknown>] [DW_LANG_ObjC]
 !1 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/<unknown>", metadata !"llvm/_build.ninja.Debug"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"A", i32 33, i64 32, i64 32, i32 0, i32 512, null, metadata !7, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 33, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00A\0033\0032\0032\000\00512\0016", metadata !5, metadata !6, null, metadata !7, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 33, size 32, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m", metadata !"llvm/_build.ninja.Debug"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
 !7 = metadata !{metadata !8, metadata !10}
-!8 = metadata !{i32 786460, null, metadata !4, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
-!9 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"NSObject", i32 21, i64 0, i64 8, i32 0, i32 0, null, metadata !2, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSObject] [line 21, size 0, align 8, offset 0] [def] [from ]
-!10 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"ivar", i32 35, i64 32, i64 32, i64 0, i32 0, metadata !11, null} ; [ DW_TAG_member ] [ivar] [line 35, size 32, align 32, offset 0] [from int]
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !4, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
+!9 = metadata !{metadata !"0x13\00NSObject\0021\000\008\000\000\0016", metadata !5, metadata !6, null, metadata !2, null, null, null} ; [ DW_TAG_structure_type ] [NSObject] [line 21, size 0, align 8, offset 0] [def] [from ]
+!10 = metadata !{metadata !"0xd\00ivar\0035\0032\0032\000\000", metadata !5, metadata !6, metadata !11, null} ; [ DW_TAG_member ] [ivar] [line 35, size 32, align 32, offset 0] [from int]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13, metadata !27, metadata !31, metadata !35, metadata !36, metadata !39}
-!13 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"-[A init]", metadata !"-[A init]", metadata !"", i32 46, metadata !14, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, i8* (%0*, i8*)* @"\01-[A init]", null, null, metadata !2, i32 46} ; [ DW_TAG_subprogram ] [line 46] [local] [def] [-[A init]]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x2e\00-[A init]\00-[A init]\00\0046\001\001\000\006\00256\000\0046", metadata !5, metadata !6, metadata !14, null, i8* (%0*, i8*)* @"\01-[A init]", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 46] [local] [def] [-[A init]]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{metadata !16, metadata !23, metadata !24}
-!16 = metadata !{i32 786454, metadata !5, null, metadata !"id", i32 46, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_typedef ] [id] [line 46, size 0, align 0, offset 0] [from ]
-!17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
-!18 = metadata !{i32 786451, metadata !1, null, metadata !"objc_object", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
+!16 = metadata !{metadata !"0x16\00id\0046\000\000\000\000", metadata !5, null, metadata !17} ; [ DW_TAG_typedef ] [id] [line 46, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
+!18 = metadata !{metadata !"0x13\00objc_object\000\000\000\000\000\000", metadata !1, null, null, metadata !19, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
 !19 = metadata !{metadata !20}
-!20 = metadata !{i32 786445, metadata !1, metadata !18, metadata !"isa", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !21} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
-!21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
-!22 = metadata !{i32 786451, metadata !1, null, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!23 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!24 = metadata !{i32 786454, metadata !5, null, metadata !"SEL", i32 46, i64 0, i64 0, i64 0, i32 64, metadata !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [artificial] [from ]
-!25 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !26} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
-!26 = metadata !{i32 786451, metadata !1, null, metadata !"objc_selector", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!27 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"__9-[A init]_block_invoke", metadata !"__9-[A init]_block_invoke", metadata !"", i32 49, metadata !28, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*)* @"__9-[A init]_block_invoke", null, null, metadata !2, i32 49} ; [ DW_TAG_subprogram ] [line 49] [local] [def] [__9-[A init]_block_invoke]
-!28 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !"0xd\00isa\000\0064\000\000\000", metadata !1, metadata !18, metadata !21} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
+!21 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
+!22 = metadata !{metadata !"0x13\00objc_class\000\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!23 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!24 = metadata !{metadata !"0x16\00SEL\0046\000\000\000\0064", metadata !5, null, metadata !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [artificial] [from ]
+!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !26} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
+!26 = metadata !{metadata !"0x13\00objc_selector\000\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!27 = metadata !{metadata !"0x2e\00__9-[A init]_block_invoke\00__9-[A init]_block_invoke\00\0049\001\001\000\006\00256\000\0049", metadata !5, metadata !6, metadata !28, null, void (i8*)* @"__9-[A init]_block_invoke", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 49] [local] [def] [__9-[A init]_block_invoke]
+!28 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !29 = metadata !{null, metadata !30}
-!30 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!31 = metadata !{i32 786478, metadata !1, metadata !32, metadata !"__copy_helper_block_", metadata !"__copy_helper_block_", metadata !"", i32 52, metadata !33, i1 true, i1 true, i32 0, i32 0, null, i32 0, i1 false, void (i8*, i8*)* @__copy_helper_block_, null, null, metadata !2, i32 52} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__copy_helper_block_]
-!32 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/<unknown>]
-!33 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !34, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!31 = metadata !{metadata !"0x2e\00__copy_helper_block_\00__copy_helper_block_\00\0052\001\001\000\006\000\000\0052", metadata !1, metadata !32, metadata !33, null, void (i8*, i8*)* @__copy_helper_block_, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__copy_helper_block_]
+!32 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/<unknown>]
+!33 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !34, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !34 = metadata !{null, metadata !30, metadata !30}
-!35 = metadata !{i32 786478, metadata !1, metadata !32, metadata !"__destroy_helper_block_", metadata !"__destroy_helper_block_", metadata !"", i32 52, metadata !28, i1 true, i1 true, i32 0, i32 0, null, i32 0, i1 false, void (i8*)* @__destroy_helper_block_, null, null, metadata !2, i32 52} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__destroy_helper_block_]
-!36 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !37, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 60} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 60] [main]
-!37 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !38, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!35 = metadata !{metadata !"0x2e\00__destroy_helper_block_\00__destroy_helper_block_\00\0052\001\001\000\006\000\000\0052", metadata !1, metadata !32, metadata !28, null, void (i8*)* @__destroy_helper_block_, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__destroy_helper_block_]
+!36 = metadata !{metadata !"0x2e\00main\00main\00\0059\000\001\000\006\000\000\0060", metadata !5, metadata !6, metadata !37, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 60] [main]
+!37 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !38, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !38 = metadata !{metadata !11}
-!39 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"run", metadata !"run", metadata !"", i32 39, metadata !40, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (void ()*)* @run, null, null, metadata !2, i32 40} ; [ DW_TAG_subprogram ] [line 39] [local] [def] [scope 40] [run]
-!40 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !41, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!39 = metadata !{metadata !"0x2e\00run\00run\00\0039\001\001\000\006\00256\000\0040", metadata !5, metadata !6, metadata !40, null, void (void ()*)* @run, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 39] [local] [def] [scope 40] [run]
+!40 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !41, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !41 = metadata !{null, metadata !42}
-!42 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !43} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_generic]
-!43 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"__block_literal_generic", i32 40, i64 256, i64 0, i32 0, i32 8, null, metadata !44, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 40, size 256, align 0, offset 0] [def] [from ]
+!42 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !43} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_generic]
+!43 = metadata !{metadata !"0x13\00__block_literal_generic\0040\00256\000\000\008\000", metadata !5, metadata !6, null, metadata !44, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 40, size 256, align 0, offset 0] [def] [from ]
 !44 = metadata !{metadata !45, metadata !46, metadata !47, metadata !48, metadata !49}
-!45 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__isa", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !30} ; [ DW_TAG_member ] [__isa] [line 0, size 64, align 64, offset 0] [from ]
-!46 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__flags", i32 0, i64 32, i64 32, i64 64, i32 0, metadata !11} ; [ DW_TAG_member ] [__flags] [line 0, size 32, align 32, offset 64] [from int]
-!47 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__reserved", i32 0, i64 32, i64 32, i64 96, i32 0, metadata !11} ; [ DW_TAG_member ] [__reserved] [line 0, size 32, align 32, offset 96] [from int]
-!48 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__FuncPtr", i32 0, i64 64, i64 64, i64 128, i32 0, metadata !30} ; [ DW_TAG_member ] [__FuncPtr] [line 0, size 64, align 64, offset 128] [from ]
-!49 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__descriptor", i32 40, i64 64, i64 64, i64 192, i32 0, metadata !50} ; [ DW_TAG_member ] [__descriptor] [line 40, size 64, align 64, offset 192] [from ]
-!50 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !51} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_descriptor]
-!51 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"__block_descriptor", i32 40, i64 128, i64 0, i32 0, i32 8, null, metadata !52, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 40, size 128, align 0, offset 0] [def] [from ]
+!45 = metadata !{metadata !"0xd\00__isa\000\0064\0064\000\000", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_member ] [__isa] [line 0, size 64, align 64, offset 0] [from ]
+!46 = metadata !{metadata !"0xd\00__flags\000\0032\0032\0064\000", metadata !5, metadata !6, metadata !11} ; [ DW_TAG_member ] [__flags] [line 0, size 32, align 32, offset 64] [from int]
+!47 = metadata !{metadata !"0xd\00__reserved\000\0032\0032\0096\000", metadata !5, metadata !6, metadata !11} ; [ DW_TAG_member ] [__reserved] [line 0, size 32, align 32, offset 96] [from int]
+!48 = metadata !{metadata !"0xd\00__FuncPtr\000\0064\0064\00128\000", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_member ] [__FuncPtr] [line 0, size 64, align 64, offset 128] [from ]
+!49 = metadata !{metadata !"0xd\00__descriptor\0040\0064\0064\00192\000", metadata !5, metadata !6, metadata !50} ; [ DW_TAG_member ] [__descriptor] [line 40, size 64, align 64, offset 192] [from ]
+!50 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !51} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_descriptor]
+!51 = metadata !{metadata !"0x13\00__block_descriptor\0040\00128\000\000\008\000", metadata !5, metadata !6, null, metadata !52, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 40, size 128, align 0, offset 0] [def] [from ]
 !52 = metadata !{metadata !53, metadata !55}
-!53 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"reserved", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !54} ; [ DW_TAG_member ] [reserved] [line 0, size 64, align 64, offset 0] [from long unsigned int]
-!54 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!55 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"Size", i32 0, i64 64, i64 64, i64 64, i32 0, metadata !54} ; [ DW_TAG_member ] [Size] [line 0, size 64, align 64, offset 64] [from long unsigned int]
+!53 = metadata !{metadata !"0xd\00reserved\000\0064\0064\000\000", metadata !5, metadata !6, metadata !54} ; [ DW_TAG_member ] [reserved] [line 0, size 64, align 64, offset 0] [from long unsigned int]
+!54 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!55 = metadata !{metadata !"0xd\00Size\000\0064\0064\0064\000", metadata !5, metadata !6, metadata !54} ; [ DW_TAG_member ] [Size] [line 0, size 64, align 64, offset 64] [from long unsigned int]
 !56 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
 !57 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
 !58 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
 !59 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
-!60 = metadata !{i32 786689, metadata !13, metadata !"self", metadata !32, i32 16777262, metadata !61, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [self] [line 46]
-!61 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!60 = metadata !{metadata !"0x101\00self\0016777262\001088", metadata !13, metadata !32, metadata !61} ; [ DW_TAG_arg_variable ] [self] [line 46]
+!61 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
 !62 = metadata !{i32 46, i32 0, metadata !13, null}
-!63 = metadata !{i32 786689, metadata !13, metadata !"_cmd", metadata !32, i32 33554478, metadata !64, i32 64, i32 0} ; [ DW_TAG_arg_variable ] [_cmd] [line 46]
-!64 = metadata !{i32 786454, metadata !5, null, metadata !"SEL", i32 46, i64 0, i64 0, i64 0, i32 0, metadata !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [from ]
+!63 = metadata !{metadata !"0x101\00_cmd\0033554478\0064", metadata !13, metadata !32, metadata !64} ; [ DW_TAG_arg_variable ] [_cmd] [line 46]
+!64 = metadata !{metadata !"0x16\00SEL\0046\000\000\000\000", metadata !5, null, metadata !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [from ]
 !65 = metadata !{i32 48, i32 0, metadata !66, null}
-!66 = metadata !{i32 786443, metadata !5, metadata !13, i32 47, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
+!66 = metadata !{metadata !"0xb\0047\000\000", metadata !5, metadata !13} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
 !67 = metadata !{}
 !68 = metadata !{i32 49, i32 0, metadata !69, null}
-!69 = metadata !{i32 786443, metadata !5, metadata !66, i32 48, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
+!69 = metadata !{metadata !"0xb\0048\000\001", metadata !5, metadata !66} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
 !70 = metadata !{i32 53, i32 0, metadata !69, null}
 !71 = metadata !{i32 54, i32 0, metadata !66, null}
-!72 = metadata !{i32 786689, metadata !39, metadata !"block", metadata !6, i32 16777255, metadata !42, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [block] [line 39]
+!72 = metadata !{metadata !"0x101\00block\0016777255\000", metadata !39, metadata !6, metadata !42} ; [ DW_TAG_arg_variable ] [block] [line 39]
 !73 = metadata !{i32 39, i32 0, metadata !39, null}
 !74 = metadata !{i32 41, i32 0, metadata !39, null}
 !75 = metadata !{i32 42, i32 0, metadata !39, null}
-!76 = metadata !{i32 786689, metadata !27, metadata !".block_descriptor", metadata !6, i32 16777265, metadata !77, i32 64, i32 0} ; [ DW_TAG_arg_variable ] [.block_descriptor] [line 49]
-!77 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !78} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_1]
-!78 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"__block_literal_1", i32 49, i64 320, i64 64, i32 0, i32 0, null, metadata !79, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 49, size 320, align 64, offset 0] [def] [from ]
+!76 = metadata !{metadata !"0x101\00.block_descriptor\0016777265\0064", metadata !27, metadata !6, metadata !77} ; [ DW_TAG_arg_variable ] [.block_descriptor] [line 49]
+!77 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !78} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_1]
+!78 = metadata !{metadata !"0x13\00__block_literal_1\0049\00320\0064\000\000\000", metadata !5, metadata !6, null, metadata !79, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 49, size 320, align 64, offset 0] [def] [from ]
 !79 = metadata !{metadata !80, metadata !81, metadata !82, metadata !83, metadata !84, metadata !87}
-!80 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__isa", i32 49, i64 64, i64 64, i64 0, i32 0, metadata !30} ; [ DW_TAG_member ] [__isa] [line 49, size 64, align 64, offset 0] [from ]
-!81 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__flags", i32 49, i64 32, i64 32, i64 64, i32 0, metadata !11} ; [ DW_TAG_member ] [__flags] [line 49, size 32, align 32, offset 64] [from int]
-!82 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__reserved", i32 49, i64 32, i64 32, i64 96, i32 0, metadata !11} ; [ DW_TAG_member ] [__reserved] [line 49, size 32, align 32, offset 96] [from int]
-!83 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__FuncPtr", i32 49, i64 64, i64 64, i64 128, i32 0, metadata !30} ; [ DW_TAG_member ] [__FuncPtr] [line 49, size 64, align 64, offset 128] [from ]
-!84 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__descriptor", i32 49, i64 64, i64 64, i64 192, i32 0, metadata !85} ; [ DW_TAG_member ] [__descriptor] [line 49, size 64, align 64, offset 192] [from ]
-!85 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !86} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from __block_descriptor_withcopydispose]
-!86 = metadata !{i32 786451, metadata !1, null, metadata !"__block_descriptor_withcopydispose", i32 49, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 49, size 0, align 0, offset 0] [decl] [from ]
-!87 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"self", i32 49, i64 64, i64 64, i64 256, i32 0, metadata !61} ; [ DW_TAG_member ] [self] [line 49, size 64, align 64, offset 256] [from ]
+!80 = metadata !{metadata !"0xd\00__isa\0049\0064\0064\000\000", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_member ] [__isa] [line 49, size 64, align 64, offset 0] [from ]
+!81 = metadata !{metadata !"0xd\00__flags\0049\0032\0032\0064\000", metadata !5, metadata !6, metadata !11} ; [ DW_TAG_member ] [__flags] [line 49, size 32, align 32, offset 64] [from int]
+!82 = metadata !{metadata !"0xd\00__reserved\0049\0032\0032\0096\000", metadata !5, metadata !6, metadata !11} ; [ DW_TAG_member ] [__reserved] [line 49, size 32, align 32, offset 96] [from int]
+!83 = metadata !{metadata !"0xd\00__FuncPtr\0049\0064\0064\00128\000", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_member ] [__FuncPtr] [line 49, size 64, align 64, offset 128] [from ]
+!84 = metadata !{metadata !"0xd\00__descriptor\0049\0064\0064\00192\000", metadata !5, metadata !6, metadata !85} ; [ DW_TAG_member ] [__descriptor] [line 49, size 64, align 64, offset 192] [from ]
+!85 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !86} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from __block_descriptor_withcopydispose]
+!86 = metadata !{metadata !"0x13\00__block_descriptor_withcopydispose\0049\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 49, size 0, align 0, offset 0] [decl] [from ]
+!87 = metadata !{metadata !"0xd\00self\0049\0064\0064\00256\000", metadata !5, metadata !6, metadata !61} ; [ DW_TAG_member ] [self] [line 49, size 64, align 64, offset 256] [from ]
 !88 = metadata !{i32 49, i32 0, metadata !27, null}
-!89 = metadata !{i32 786688, metadata !27, metadata !"self", metadata !32, i32 52, metadata !23, i32 0, i32 0, metadata !111} ; [ DW_TAG_auto_variable ] [self] [line 52]
+!89 = metadata !{metadata !"0x100\00self\0052\000", metadata !27, metadata !32, metadata !23} ; [ DW_TAG_auto_variable ] [self] [line 52]
 !90 = metadata !{i32 52, i32 0, metadata !27, null}
-!91 = metadata !{i32 786688, metadata !92, metadata !"d", metadata !6, i32 50, metadata !93, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 50]
-!92 = metadata !{i32 786443, metadata !5, metadata !27, i32 49, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
-!93 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !94} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from NSMutableDictionary]
-!94 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"NSMutableDictionary", i32 30, i64 0, i64 8, i32 0, i32 0, null, metadata !95, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSMutableDictionary] [line 30, size 0, align 8, offset 0] [def] [from ]
+!91 = metadata !{metadata !"0x100\00d\0050\000", metadata !92, metadata !6, metadata !93} ; [ DW_TAG_auto_variable ] [d] [line 50]
+!92 = metadata !{metadata !"0xb\0049\000\002", metadata !5, metadata !27} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
+!93 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !94} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from NSMutableDictionary]
+!94 = metadata !{metadata !"0x13\00NSMutableDictionary\0030\000\008\000\000\0016", metadata !5, metadata !6, null, metadata !95, null, null, null} ; [ DW_TAG_structure_type ] [NSMutableDictionary] [line 30, size 0, align 8, offset 0] [def] [from ]
 !95 = metadata !{metadata !96}
-!96 = metadata !{i32 786460, null, metadata !94, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !97} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSDictionary]
-!97 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"NSDictionary", i32 26, i64 0, i64 8, i32 0, i32 0, null, metadata !98, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSDictionary] [line 26, size 0, align 8, offset 0] [def] [from ]
+!96 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !94, metadata !97} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSDictionary]
+!97 = metadata !{metadata !"0x13\00NSDictionary\0026\000\008\000\000\0016", metadata !5, metadata !6, null, metadata !98, null, null, null} ; [ DW_TAG_structure_type ] [NSDictionary] [line 26, size 0, align 8, offset 0] [def] [from ]
 !98 = metadata !{metadata !99}
-!99 = metadata !{i32 786460, null, metadata !97, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
+!99 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !97, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
 !100 = metadata !{i32 50, i32 0, metadata !92, null}
 !101 = metadata !{i32 51, i32 0, metadata !92, null}
-!102 = metadata !{i32 786689, metadata !31, metadata !"", metadata !32, i32 16777268, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [line 52]
+!102 = metadata !{metadata !"0x101\00\0016777268\001088", metadata !31, metadata !32, metadata !30} ; [ DW_TAG_arg_variable ] [line 52]
 !103 = metadata !{i32 52, i32 0, metadata !31, null}
-!104 = metadata !{i32 786689, metadata !31, metadata !"", metadata !32, i32 33554484, metadata !30, i32 64, i32 0} ; [ DW_TAG_arg_variable ] [line 52]
-!105 = metadata !{i32 786689, metadata !35, metadata !"", metadata !32, i32 16777268, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [line 52]
+!104 = metadata !{metadata !"0x101\00\0033554484\0064", metadata !31, metadata !32, metadata !30} ; [ DW_TAG_arg_variable ] [line 52]
+!105 = metadata !{metadata !"0x101\00\0016777268\001088", metadata !35, metadata !32, metadata !30} ; [ DW_TAG_arg_variable ] [line 52]
 !106 = metadata !{i32 52, i32 0, metadata !35, null}
-!107 = metadata !{i32 786688, metadata !36, metadata !"a", metadata !6, i32 61, metadata !61, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 61]
+!107 = metadata !{metadata !"0x100\00a\0061\000", metadata !36, metadata !6, metadata !61} ; [ DW_TAG_auto_variable ] [a] [line 61]
 !108 = metadata !{i32 61, i32 0, metadata !36, null}
 !109 = metadata !{i32 62, i32 0, metadata !36, null}
-!110 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!111 = metadata !{i64 2, i64 1, i64 32}
+!110 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!111 = metadata !{metadata !"0x102\006\0034\0032"} ; [ DW_TAG_expression ] [DW_OP_deref DW_OP_plus 32]

diff --git a/test/DebugInfo/X86/debug-info-static-member.ll b/test/DebugInfo/X86/debug-info-static-member.ll
index 7d258f9..37fe997 100644
--- a/test/DebugInfo/X86/debug-info-static-member.ll
+++ b/test/DebugInfo/X86/debug-info-static-member.ll

@@ -1,5 +1,5 @@
 ; RUN: llc %s -o %t -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu -dwarf-version=4
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=PRESENT
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=PRESENT 
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=ABSENT
 ; RUN: llc %s -o %t -filetype=obj -O0 -mtriple=x86_64-apple-darwin -dwarf-version=4
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=DARWINP
@@ -47,45 +47,45 @@
   %retval = alloca i32, align 4
   %instance_C = alloca %class.C, align 4
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{%class.C* %instance_C}, metadata !29), !dbg !30
+  call void @llvm.dbg.declare(metadata !{%class.C* %instance_C}, metadata !29, metadata !{metadata !"0x102"}), !dbg !30
   %d = getelementptr inbounds %class.C* %instance_C, i32 0, i32 0, !dbg !31
   store i32 8, i32* %d, align 4, !dbg !31
   %0 = load i32* @_ZN1C1cE, align 4, !dbg !32
   ret i32 %0, !dbg !32
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!34}
 
-!0 = metadata !{i32 786449, metadata !33, i32 4, metadata !"clang version 3.3 (trunk 171914)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !10,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/probinson/projects/upstream/static-member/test/debug-info-static-member.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 171914)\000\00\000\00\000", metadata !33, metadata !1, metadata !1, metadata !3, metadata !10,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/probinson/projects/upstream/static-member/test/debug-info-static-member.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !33, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 23} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 23] [main]
-!6 = metadata !{i32 786473, metadata !33} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00main\00main\00\0018\000\001\000\006\00256\000\0023", metadata !33, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 23] [main]
+!6 = metadata !{metadata !"0x29", metadata !33} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !12, metadata !27, metadata !28}
-!12 = metadata !{i32 786484, i32 0, metadata !13, metadata !"a", metadata !"a", metadata !"_ZN1C1aE", metadata !6, i32 14, metadata !9, i32 0, i32 1, i32* @_ZN1C1aE, metadata !15} ; [ DW_TAG_variable ] [a] [line 14] [def]
-!13 = metadata !{i32 786434, metadata !33, null, metadata !"C", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_class_type ] [C] [line 1, size 32, align 32, offset 0] [def] [from ]
+!12 = metadata !{metadata !"0x34\00a\00a\00_ZN1C1aE\0014\000\001", null, metadata !6, metadata !9, i32* @_ZN1C1aE, metadata !15} ; [ DW_TAG_variable ] [a] [line 14] [def]
+!13 = metadata !{metadata !"0x2\00C\001\0032\0032\000\000\000", metadata !33, null, null, metadata !14, null, null, null} ; [ DW_TAG_class_type ] [C] [line 1, size 32, align 32, offset 0] [def] [from ]
 !14 = metadata !{metadata !15, metadata !16, metadata !19, metadata !20, metadata !23, metadata !24, metadata !26}
-!15 = metadata !{i32 786445, metadata !33, metadata !13, metadata !"a", i32 3, i64 0, i64 0, i64 0, i32 4097, metadata !9, null} ; [ DW_TAG_member ] [a] [line 3, size 0, align 0, offset 0] [private] [static] [from int]
-!16 = metadata !{i32 786445, metadata !33, metadata !13, metadata !"const_a", i32 4, i64 0, i64 0, i64 0, i32 4097, metadata !17, i1 true} ; [ DW_TAG_member ] [const_a] [line 4, size 0, align 0, offset 0] [private] [static] [from ]
-!17 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from bool]
-!18 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
-!19 = metadata !{i32 786445, metadata !33, metadata !13, metadata !"b", i32 6, i64 0, i64 0, i64 0, i32 4098, metadata !9, null} ; [ DW_TAG_member ] [b] [line 6, size 0, align 0, offset 0] [protected] [static] [from int]
-!20 = metadata !{i32 786445, metadata !33, metadata !13, metadata !"const_b", i32 7, i64 0, i64 0, i64 0, i32 4098, metadata !21, float 0x40091EB860000000} ; [ DW_TAG_member ] [const_b] [line 7, size 0, align 0, offset 0] [protected] [static] [from ]
-!21 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from float]
-!22 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
-!23 = metadata !{i32 786445, metadata !33, metadata !13, metadata !"c", i32 9, i64 0, i64 0, i64 0, i32 4096, metadata !9, null} ; [ DW_TAG_member ] [c] [line 9, size 0, align 0, offset 0] [static] [from int]
-!24 = metadata !{i32 786445, metadata !33, metadata !13, metadata !"const_c", i32 10, i64 0, i64 0, i64 0, i32 4096, metadata !25, i32 18} ; [ DW_TAG_member ] [const_c] [line 10, size 0, align 0, offset 0] [static] [from ]
-!25 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
-!26 = metadata !{i32 786445, metadata !33, metadata !13, metadata !"d", i32 11, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [d] [line 11, size 32, align 32, offset 0] [from int]
-!27 = metadata !{i32 786484, i32 0, metadata !13, metadata !"b", metadata !"b", metadata !"_ZN1C1bE", metadata !6, i32 15, metadata !9, i32 0, i32 1, i32* @_ZN1C1bE, metadata !19} ; [ DW_TAG_variable ] [b] [line 15] [def]
-!28 = metadata !{i32 786484, i32 0, metadata !13, metadata !"c", metadata !"c", metadata !"_ZN1C1cE", metadata !6, i32 16, metadata !9, i32 0, i32 1, i32* @_ZN1C1cE, metadata !23} ; [ DW_TAG_variable ] [c] [line 16] [def]
-!29 = metadata !{i32 786688, metadata !5, metadata !"instance_C", metadata !6, i32 20, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [instance_C] [line 20]
+!15 = metadata !{metadata !"0xd\00a\003\000\000\000\004097", metadata !33, metadata !13, metadata !9, null} ; [ DW_TAG_member ] [a] [line 3, size 0, align 0, offset 0] [private] [static] [from int]
+!16 = metadata !{metadata !"0xd\00const_a\004\000\000\000\004097", metadata !33, metadata !13, metadata !17, i1 true} ; [ DW_TAG_member ] [const_a] [line 4, size 0, align 0, offset 0] [private] [static] [from ]
+!17 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !18} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from bool]
+!18 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!19 = metadata !{metadata !"0xd\00b\006\000\000\000\004098", metadata !33, metadata !13, metadata !9, null} ; [ DW_TAG_member ] [b] [line 6, size 0, align 0, offset 0] [protected] [static] [from int]
+!20 = metadata !{metadata !"0xd\00const_b\007\000\000\000\004098", metadata !33, metadata !13, metadata !21, float 0x40091EB860000000} ; [ DW_TAG_member ] [const_b] [line 7, size 0, align 0, offset 0] [protected] [static] [from ]
+!21 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !22} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from float]
+!22 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!23 = metadata !{metadata !"0xd\00c\009\000\000\000\004099", metadata !33, metadata !13, metadata !9, null} ; [ DW_TAG_member ] [c] [line 9, size 0, align 0, offset 0] [static] [from int]
+!24 = metadata !{metadata !"0xd\00const_c\0010\000\000\000\004099", metadata !33, metadata !13, metadata !25, i32 18} ; [ DW_TAG_member ] [const_c] [line 10, size 0, align 0, offset 0] [static] [from ]
+!25 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!26 = metadata !{metadata !"0xd\00d\0011\0032\0032\000\003", metadata !33, metadata !13, metadata !9} ; [ DW_TAG_member ] [d] [line 11, size 32, align 32, offset 0] [from int]
+!27 = metadata !{metadata !"0x34\00b\00b\00_ZN1C1bE\0015\000\001", null, metadata !6, metadata !9, i32* @_ZN1C1bE, metadata !19} ; [ DW_TAG_variable ] [b] [line 15] [def]
+!28 = metadata !{metadata !"0x34\00c\00c\00_ZN1C1cE\0016\000\001", null, metadata !6, metadata !9, i32* @_ZN1C1cE, metadata !23} ; [ DW_TAG_variable ] [c] [line 16] [def]
+!29 = metadata !{metadata !"0x100\00instance_C\0020\000", metadata !5, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [instance_C] [line 20]
 !30 = metadata !{i32 20, i32 0, metadata !5, null}
 !31 = metadata !{i32 21, i32 0, metadata !5, null}
 !32 = metadata !{i32 22, i32 0, metadata !5, null}
@@ -95,101 +95,101 @@
 ; (for variables) or DW_AT_const_value (for constants).
 ;
 ; PRESENT:      .debug_info contents:
+; PRESENT:      DW_TAG_variable
+; PRESENT-NEXT: DW_AT_specification {{.*}} "a"
+; PRESENT-NEXT: DW_AT_location
+; PRESENT-NEXT: DW_AT_linkage_name {{.*}} "_ZN1C1aE"
 ; PRESENT:      DW_TAG_class_type
 ; PRESENT-NEXT: DW_AT_name {{.*}} "C"
-; PRESENT:      0x[[DECL_A:[0-9a-f]+]]: DW_TAG_member
+; PRESENT:      DW_TAG_member
 ; PRESENT-NEXT: DW_AT_name {{.*}} "a"
 ; PRESENT:      DW_AT_external
 ; PRESENT:      DW_AT_declaration
-; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (0x03)
+; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_private)
 ; PRESENT:      DW_TAG_member
 ; PRESENT-NEXT: DW_AT_name {{.*}} "const_a"
 ; PRESENT:      DW_AT_external
 ; PRESENT:      DW_AT_declaration
-; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (0x03)
+; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_private)
 ; PRESENT:      DW_AT_const_value {{.*}} (1)
-; PRESENT:      0x[[DECL_B:[0-9a-f]+]]: DW_TAG_member
+; PRESENT:      DW_TAG_member
 ; PRESENT-NEXT: DW_AT_name {{.*}} "b"
-; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (0x02)
+; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_protected)
 ; PRESENT:      DW_TAG_member
 ; PRESENT-NEXT: DW_AT_name {{.*}} "const_b"
-; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (0x02)
+; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_protected)
 ; PRESENT:      DW_AT_const_value [DW_FORM_udata] (1078523331)
-; PRESENT:      0x[[DECL_C:[0-9a-f]+]]: DW_TAG_member
+; PRESENT:      DW_TAG_member
 ; PRESENT-NEXT: DW_AT_name {{.*}} "c"
-; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (0x01)
+; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_public)
 ; PRESENT:      DW_TAG_member
 ; PRESENT-NEXT: DW_AT_name {{.*}} "const_c"
-; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (0x01)
+; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_public)
 ; PRESENT:      DW_AT_const_value {{.*}} (18)
 ; While we're here, a normal member has data_member_location and
 ; accessibility attributes.
 ; PRESENT:      DW_TAG_member
 ; PRESENT-NEXT: DW_AT_name {{.*}} "d"
 ; PRESENT:      DW_AT_data_member_location
-; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (0x01)
+; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_public)
 ; PRESENT:      NULL
 ; Definitions point back to their declarations, and have a location.
 ; PRESENT:      DW_TAG_variable
-; PRESENT-NEXT: DW_AT_specification {{.*}} {0x[[DECL_A]]}
-; PRESENT-NEXT: DW_AT_location
-; PRESENT-NEXT: DW_AT_linkage_name {{.*}} "_ZN1C1aE"
-; PRESENT:      DW_TAG_variable
-; PRESENT-NEXT: DW_AT_specification {{.*}} {0x[[DECL_B]]}
+; PRESENT-NEXT: DW_AT_specification {{.*}} "b"
 ; PRESENT-NEXT: DW_AT_location
 ; PRESENT-NEXT: DW_AT_linkage_name {{.*}} "_ZN1C1bE"
 ; PRESENT:      DW_TAG_variable
-; PRESENT-NEXT: DW_AT_specification {{.*}} {0x[[DECL_C]]}
+; PRESENT-NEXT: DW_AT_specification {{.*}} "c"
 ; PRESENT-NEXT: DW_AT_location
 ; PRESENT-NEXT: DW_AT_linkage_name {{.*}} "_ZN1C1cE"
 
 ; For Darwin gdb:
 ; DARWINP:      .debug_info contents:
+; DARWINP:      DW_TAG_variable
+; DARWINP-NEXT: DW_AT_specification {{.*}} "a"
+; DARWINP-NEXT: DW_AT_location
+; DARWINP-NEXT: DW_AT_linkage_name {{.*}} "_ZN1C1aE"
 ; DARWINP:      DW_TAG_class_type
 ; DARWINP-NEXT: DW_AT_name {{.*}} "C"
-; DARWINP:      0x[[DECL_A:[0-9a-f]+]]: DW_TAG_member
+; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "a"
 ; DARWINP:      DW_AT_external
 ; DARWINP:      DW_AT_declaration
-; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x03)
+; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_private)
 ; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "const_a"
 ; DARWINP:      DW_AT_external
 ; DARWINP:      DW_AT_declaration
-; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x03)
+; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_private)
 ; DARWINP:      DW_AT_const_value {{.*}} (1)
-; DARWINP:      0x[[DECL_B:[0-9a-f]+]]: DW_TAG_member
+; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "b"
-; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x02)
+; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_protected)
 ; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "const_b"
-; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x02)
+; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_protected)
 ; DARWINP:      DW_AT_const_value [DW_FORM_udata] (1078523331)
-; DARWINP:      0x[[DECL_C:[0-9a-f]+]]: DW_TAG_member
+; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "c"
-; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x01)
+; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_public)
 ; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "const_c"
-; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x01)
+; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_public)
 ; DARWINP:      DW_AT_const_value {{.*}} (18)
 ; While we're here, a normal member has data_member_location and
 ; accessibility attributes.
 ; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "d"
 ; DARWINP:      DW_AT_data_member_location
-; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x01)
+; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_public)
 ; DARWINP:      NULL
 ; Definitions point back to their declarations, and have a location.
 ; DARWINP:      DW_TAG_variable
-; DARWINP-NEXT: DW_AT_specification {{.*}} {0x[[DECL_A]]}
-; DARWINP-NEXT: DW_AT_location
-; DARWINP-NEXT: DW_AT_linkage_name {{.*}} "_ZN1C1aE"
-; DARWINP:      DW_TAG_variable
-; DARWINP-NEXT: DW_AT_specification {{.*}} {0x[[DECL_B]]}
+; DARWINP-NEXT: DW_AT_specification {{.*}} "b"
 ; DARWINP-NEXT: DW_AT_location
 ; DARWINP-NEXT: DW_AT_linkage_name {{.*}} "_ZN1C1bE"
 ; DARWINP:      DW_TAG_variable
-; DARWINP-NEXT: DW_AT_specification {{.*}} {0x[[DECL_C]]}
+; DARWINP-NEXT: DW_AT_specification {{.*}} "c"
 ; DARWINP-NEXT: DW_AT_location
 ; DARWINP-NEXT: DW_AT_linkage_name {{.*}} "_ZN1C1cE"
 
@@ -253,4 +253,4 @@
 ; DARWINA-NOT:  DW_AT_const_value
 ; DARWINA-NOT:  DW_AT_location
 ; DARWINA:      NULL
-!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/debug-loc-asan.ll b/test/DebugInfo/X86/debug-loc-asan.ll
index b1980ec..869db75 100644
--- a/test/DebugInfo/X86/debug-loc-asan.ll
+++ b/test/DebugInfo/X86/debug-loc-asan.ll

@@ -22,17 +22,13 @@
 ; We expect two location ranges for the variable.
 
 ; First, it is stored in %rdx:
-; CHECK:      .Lset{{[0-9]+}} = .Lfunc_begin0-.Lfunc_begin0
-; CHECK-NEXT: .quad .Lset{{[0-9]+}}
-; CHECK-NEXT: .Lset{{[0-9]+}} = [[START_LABEL]]-.Lfunc_begin0
-; CHECK-NEXT: .quad .Lset{{[0-9]+}}
+; CHECK:      .quad .Lfunc_begin0-.Lfunc_begin0
+; CHECK-NEXT: .quad [[START_LABEL]]-.Lfunc_begin0
 ; CHECK: DW_OP_reg5
 
 ; Then it's addressed via %rsp:
-; CHECK:      .Lset{{[0-9]+}} = [[START_LABEL]]-.Lfunc_begin0
-; CHECK-NEXT: .quad .Lset{{[0-9]+}}
-; CHECK-NEXT: .Lset{{[0-9]+}} = .Lfunc_end0-.Lfunc_begin0
-; CHECK-NEXT: .quad .Lset{{[0-9]+}}
+; CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
+; CHECK-NEXT: .Lfunc_end0-.Lfunc_begin0
 ; CHECK: DW_OP_breg7
 ; CHECK-NEXT: [[OFFSET]]
 ; CHECK: DW_OP_deref
@@ -81,7 +77,7 @@
   %21 = inttoptr i64 %20 to i8*
   %22 = load i8* %21
   %23 = icmp ne i8 %22, 0
-  call void @llvm.dbg.declare(metadata !{i32* %8}, metadata !12)
+  call void @llvm.dbg.declare(metadata !{i32* %8}, metadata !12, metadata !14)
   br i1 %23, label %24, label %30
 
 ; <label>:24                                      ; preds = %5
@@ -147,7 +143,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 define internal void @asan.module_ctor() {
   call void @__asan_init_v3()
@@ -169,18 +165,18 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (209308)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/test.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (209308)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/test.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"test.cc", metadata !"/llvm_cmake_gcc"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"_Z3bari", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3bari, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/test.cc]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3bari\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z3bari, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/test.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5.0 (209308)"}
-!12 = metadata !{i32 786689, metadata !4, metadata !"y", metadata !5, i32 16777217, metadata !8, i32 0, i32 0, metadata !14} ; [ DW_TAG_arg_variable ] [y] [line 1]
+!12 = metadata !{metadata !"0x101\00y\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [y] [line 1]
 !13 = metadata !{i32 2, i32 0, metadata !4, null}
-!14 = metadata !{i64 2}
+!14 = metadata !{metadata !"0x102\006"} ; [ DW_TAG_expression ] [DW_OP_deref]

diff --git a/test/DebugInfo/X86/debug-loc-offset.ll b/test/DebugInfo/X86/debug-loc-offset.ll
index 7866d0e..bdd3f20 100644
--- a/test/DebugInfo/X86/debug-loc-offset.ll
+++ b/test/DebugInfo/X86/debug-loc-offset.ll

@@ -64,20 +64,20 @@
 entry:
   %b.addr = alloca i32, align 4
   store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !21), !dbg !22
+  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !21, metadata !{metadata !"0x102"}), !dbg !22
   %0 = load i32* %b.addr, align 4, !dbg !23
   %add = add nsw i32 %0, 4, !dbg !23
   ret i32 %add, !dbg !23
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 define void @_Z3baz1A(%struct.A* %a) #2 {
 entry:
   %z = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !24), !dbg !25
-  call void @llvm.dbg.declare(metadata !{i32* %z}, metadata !26), !dbg !27
+  call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata !{i32* %z}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
   store i32 2, i32* %z, align 4, !dbg !27
   %var = getelementptr inbounds %struct.A* %a, i32 0, i32 1, !dbg !28
   %0 = load i32* %var, align 4, !dbg !28
@@ -116,38 +116,38 @@
 !llvm.module.flags = !{!18, !19}
 !llvm.ident = !{!20, !20}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (210479)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset1.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (210479)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset1.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"debug-loc-offset1.cc", metadata !"/llvm_cmake_gcc"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"_Z3bari", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3bari, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset1.cc]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3bari\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z3bari, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [bar]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset1.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786449, metadata !10, i32 4, metadata !"clang version 3.5.0 (210479)", i1 false, metadata !"", i32 0, metadata !2, metadata !11, metadata !13, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset2.cc] [DW_LANG_C_plus_plus]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (210479)\000\00\000\00\001", metadata !10, metadata !2, metadata !11, metadata !13, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/debug-loc-offset2.cc] [DW_LANG_C_plus_plus]
 !10 = metadata !{metadata !"debug-loc-offset2.cc", metadata !"/llvm_cmake_gcc"}
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 786451, metadata !10, null, metadata !"A", i32 1, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!12 = metadata !{metadata !"0x13\00A\001\000\000\000\004\000", metadata !10, null, null, null, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 0, align 0, offset 0] [decl] [from ]
 !13 = metadata !{metadata !14}
-!14 = metadata !{i32 786478, metadata !10, metadata !15, metadata !"baz", metadata !"baz", metadata !"_Z3baz1A", i32 6, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.A*)* @_Z3baz1A, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [baz]
-!15 = metadata !{i32 786473, metadata !10}        ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
-!16 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0x2e\00baz\00baz\00_Z3baz1A\006\000\001\000\006\00256\000\006", metadata !10, metadata !15, metadata !16, null, void (%struct.A*)* @_Z3baz1A, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [baz]
+!15 = metadata !{metadata !"0x29", metadata !10}        ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null, metadata !12}
 !18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !20 = metadata !{metadata !"clang version 3.5.0 (210479)"}
-!21 = metadata !{i32 786689, metadata !4, metadata !"b", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 1]
+!21 = metadata !{metadata !"0x101\00b\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 1]
 !22 = metadata !{i32 1, i32 0, metadata !4, null}
 !23 = metadata !{i32 2, i32 0, metadata !4, null}
-!24 = metadata !{i32 786689, metadata !14, metadata !"a", metadata !15, i32 16777222, metadata !"_ZTS1A", i32 8192, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 6]
+!24 = metadata !{metadata !"0x101\00a\0016777222\008192", metadata !14, metadata !15, metadata !"_ZTS1A"} ; [ DW_TAG_arg_variable ] [a] [line 6]
 !25 = metadata !{i32 6, i32 0, metadata !14, null}
-!26 = metadata !{i32 786688, metadata !14, metadata !"z", metadata !15, i32 7, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [z] [line 7]
+!26 = metadata !{metadata !"0x100\00z\007\000", metadata !14, metadata !15, metadata !8} ; [ DW_TAG_auto_variable ] [z] [line 7]
 !27 = metadata !{i32 7, i32 0, metadata !14, null}
-!28 = metadata !{i32 8, i32 0, metadata !29, null} ; [ DW_TAG_imported_declaration ]
-!29 = metadata !{i32 786443, metadata !10, metadata !14, i32 8, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
+!28 = metadata !{i32 8, i32 0, metadata !29, null}
+!29 = metadata !{metadata !"0xb\008\000\000", metadata !10, metadata !14} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
 !30 = metadata !{i32 9, i32 0, metadata !29, null}
 !31 = metadata !{i32 10, i32 0, metadata !32, null}
-!32 = metadata !{i32 786443, metadata !10, metadata !14, i32 10, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
+!32 = metadata !{metadata !"0xb\0010\000\000", metadata !10, metadata !14} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/debug-loc-offset2.cc]
 !33 = metadata !{i32 11, i32 0, metadata !32, null}
 !34 = metadata !{i32 12, i32 0, metadata !14, null}

diff --git a/test/DebugInfo/X86/debug-ranges-offset.ll b/test/DebugInfo/X86/debug-ranges-offset.ll
index 365ba17..48d1db6 100644
--- a/test/DebugInfo/X86/debug-ranges-offset.ll
+++ b/test/DebugInfo/X86/debug-ranges-offset.ll

@@ -31,11 +31,11 @@
   %call = call i8* @_Znwm(i64 4) #4, !dbg !19
   %_msret = load i64* getelementptr inbounds ([8 x i64]* @__msan_retval_tls, i64 0, i64 0), align 8, !dbg !19
   %3 = bitcast i8* %call to i32*, !dbg !19
-  tail call void @llvm.dbg.value(metadata !{i32* %3}, i64 0, metadata !9), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{i32* %3}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !19
   %4 = inttoptr i64 %1 to i64*, !dbg !19
   store i64 %_msret, i64* %4, align 8, !dbg !19
   store volatile i32* %3, i32** %p, align 8, !dbg !19
-  tail call void @llvm.dbg.value(metadata !{i32** %p}, i64 0, metadata !9), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{i32** %p}, i64 0, metadata !9, metadata !{metadata !"0x102"}), !dbg !19
   %p.0.p.0. = load volatile i32** %p, align 8, !dbg !20
   %_msld = load i64* %4, align 8, !dbg !20
   %_mscmp = icmp eq i64 %_msld, 0, !dbg !20
@@ -96,11 +96,11 @@
   %call.i = call i8* @_Znwm(i64 4) #4, !dbg !30
   %_msret = load i64* getelementptr inbounds ([8 x i64]* @__msan_retval_tls, i64 0, i64 0), align 8, !dbg !30
   %3 = bitcast i8* %call.i to i32*, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i32* %3}, i64 0, metadata !32), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i32* %3}, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !30
   %4 = inttoptr i64 %1 to i64*, !dbg !30
   store i64 %_msret, i64* %4, align 8, !dbg !30
   store volatile i32* %3, i32** %p.i, align 8, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i32** %p.i}, i64 0, metadata !32), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i32** %p.i}, i64 0, metadata !32, metadata !{metadata !"0x102"}), !dbg !30
   %p.i.0.p.0.p.0..i = load volatile i32** %p.i, align 8, !dbg !33
   %_msld = load i64* %4, align 8, !dbg !33
   %_mscmp = icmp eq i64 %_msld, 0, !dbg !33
@@ -148,7 +148,7 @@
 declare void @__msan_init()
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 ; Function Attrs: nounwind
 declare i32 @puts(i8* nocapture readonly) #3
@@ -202,28 +202,28 @@
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (trunk 207243) (llvm/trunk 207259)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 207243) (llvm/trunk 207259)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !13}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"_Z1fv", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z1fv, null, null, metadata !8, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00f\00f\00_Z1fv\003\000\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, void ()* @_Z1fv, null, null, metadata !8} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786688, metadata !4, metadata !"p", metadata !5, i32 4, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [p] [line 4]
-!10 = metadata !{i32 786485, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_volatile_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!12 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 9, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [main]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x100\00p\004\000", metadata !4, metadata !5, metadata !10} ; [ DW_TAG_auto_variable ] [p] [line 4]
+!10 = metadata !{metadata !"0x35\00\000\000\000\000\000", null, null, metadata !11} ; [ DW_TAG_volatile_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{metadata !"0x2e\00main\00main\00\009\000\001\000\006\00256\001\009", metadata !1, metadata !5, metadata !14, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [def] [main]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{metadata !12}
 !16 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !18 = metadata !{metadata !"clang version 3.5.0 (trunk 207243) (llvm/trunk 207259)"}
 !19 = metadata !{i32 4, i32 0, metadata !4, null}
 !20 = metadata !{i32 5, i32 0, metadata !21, null}
-!21 = metadata !{i32 786443, metadata !1, metadata !4, i32 5, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.cpp]
+!21 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.cpp]
 !22 = metadata !{metadata !"branch_weights", i32 1000, i32 1}
 !23 = metadata !{metadata !24, metadata !24, i64 0}
 !24 = metadata !{metadata !"int", metadata !25, i64 0}
@@ -234,7 +234,7 @@
 !29 = metadata !{i32 7, i32 0, metadata !4, null}
 !30 = metadata !{i32 4, i32 0, metadata !4, metadata !31}
 !31 = metadata !{i32 10, i32 0, metadata !13, null}
-!32 = metadata !{i32 786688, metadata !4, metadata !"p", metadata !5, i32 4, metadata !10, i32 0, metadata !31} ; [ DW_TAG_auto_variable ] [p] [line 4]
+!32 = metadata !{metadata !"0x100\00p\004\000", metadata !4, metadata !5, metadata !10, metadata !31} ; [ DW_TAG_auto_variable ] [p] [line 4]
 !33 = metadata !{i32 5, i32 0, metadata !21, metadata !31}
 !34 = metadata !{i32 6, i32 0, metadata !21, metadata !31}
 !35 = metadata !{i32 7, i32 0, metadata !4, metadata !31}

diff --git a/test/DebugInfo/X86/debug_frame.ll b/test/DebugInfo/X86/debug_frame.ll
index 67f2e5d..3b3071f 100644
--- a/test/DebugInfo/X86/debug_frame.ll
+++ b/test/DebugInfo/X86/debug_frame.ll

@@ -13,10 +13,10 @@
 !llvm.module.flags = !{!7}
 !5 = metadata !{metadata !0}
 
-!0 = metadata !{i32 786478, metadata !6, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!1 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !6, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !5, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !6, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\001\001", metadata !6, metadata !1, metadata !3, null, void ()* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!1 = metadata !{metadata !"0x29", metadata !6} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", metadata !6, metadata !4, metadata !4, metadata !5, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !6, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !6 = metadata !{metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build"}
-!7 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!7 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/decl-derived-member.ll b/test/DebugInfo/X86/decl-derived-member.ll
index 4035602..43985b2 100644
--- a/test/DebugInfo/X86/decl-derived-member.ll
+++ b/test/DebugInfo/X86/decl-derived-member.ll

@@ -37,7 +37,7 @@
 entry:
   %this.addr = alloca %struct.foo*, align 8
   store %struct.foo* %this, %struct.foo** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !36), !dbg !38
+  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !36, metadata !{metadata !"0x102"}), !dbg !38
   %this1 = load %struct.foo** %this.addr
   %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !39
   call void @_ZN4baseC2Ev(%struct.base* %b) #2, !dbg !39
@@ -49,7 +49,7 @@
 entry:
   %this.addr = alloca %struct.foo*, align 8
   store %struct.foo* %this, %struct.foo** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !40), !dbg !41
+  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !40, metadata !{metadata !"0x102"}), !dbg !41
   %this1 = load %struct.foo** %this.addr
   %b = getelementptr inbounds %struct.foo* %this1, i32 0, i32 0, !dbg !42
   call void @_ZN4baseD1Ev(%struct.base* %b), !dbg !42
@@ -60,7 +60,7 @@
 declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #2
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #3
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #3
 
 declare void @_ZN4baseD1Ev(%struct.base*) #4
 
@@ -69,7 +69,7 @@
 entry:
   %this.addr = alloca %struct.base*, align 8
   store %struct.base* %this, %struct.base** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.base** %this.addr}, metadata !45), !dbg !47
+  call void @llvm.dbg.declare(metadata !{%struct.base** %this.addr}, metadata !45, metadata !{metadata !"0x102"}), !dbg !47
   %this1 = load %struct.base** %this.addr
   %0 = bitcast %struct.base* %this1 to i8***, !dbg !48
   store i8** getelementptr inbounds ([4 x i8*]* @_ZTV4base, i64 0, i64 2), i8*** %0, !dbg !48
@@ -92,53 +92,53 @@
 !llvm.module.flags = !{!32, !33}
 !llvm.ident = !{!34}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (trunk 203673) (llvm/trunk 203681)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !8, metadata !30, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 203673) (llvm/trunk 203681)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !8, metadata !30, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"foo.cc", metadata !"/usr/local/google/home/echristo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 5, i64 64, i64 64, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 5, size 64, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00foo\005\0064\0064\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 5, size 64, align 64, offset 0] [def] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS3foo", metadata !"b", i32 6, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4base"} ; [ DW_TAG_member ] [b] [line 6, size 64, align 64, offset 0] [from _ZTS4base]
-!7 = metadata !{i32 786451, metadata !1, null, metadata !"base", i32 1, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTS4base"} ; [ DW_TAG_structure_type ] [base] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!6 = metadata !{metadata !"0xd\00b\006\0064\0064\000\000", metadata !1, metadata !"_ZTS3foo", metadata !"_ZTS4base"} ; [ DW_TAG_member ] [b] [line 6, size 64, align 64, offset 0] [from _ZTS4base]
+!7 = metadata !{metadata !"0x13\00base\001\000\000\000\004\000", metadata !1, null, null, null, null, null, metadata !"_ZTS4base"} ; [ DW_TAG_structure_type ] [base] [line 1, size 0, align 0, offset 0] [decl] [from ]
 !8 = metadata !{metadata !9, metadata !13, metadata !19, metadata !22, metadata !28}
-!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 9, metadata !11, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [local] [def] [__cxx_global_var_init]
-!10 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/foo.cc]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\009\001\001\000\006\00256\000\009", metadata !1, metadata !10, metadata !11, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [local] [def] [__cxx_global_var_init]
+!10 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/foo.cc]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null}
-!13 = metadata !{i32 786478, metadata !1, metadata !"_ZTS3foo", metadata !"~foo", metadata !"~foo", metadata !"_ZN3fooD2Ev", i32 5, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 320, i1 false, void (%struct.foo*)* @_ZN3fooD2Ev, null, metadata !17, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [~foo]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x2e\00~foo\00~foo\00_ZN3fooD2Ev\005\000\001\000\006\00320\000\005", metadata !1, metadata !"_ZTS3foo", metadata !14, null, void (%struct.foo*)* @_ZN3fooD2Ev, null, metadata !17, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [~foo]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !16}
-!16 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
-!17 = metadata !{i32 786478, null, metadata !"_ZTS3foo", metadata !"~foo", metadata !"~foo", metadata !"", i32 0, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !18, i32 0} ; [ DW_TAG_subprogram ] [line 0] [~foo]
+!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
+!17 = metadata !{metadata !"0x2e\00~foo\00~foo\00\000\000\000\000\006\00320\000\000", null, metadata !"_ZTS3foo", metadata !14, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 0] [~foo]
 !18 = metadata !{i32 786468}
-!19 = metadata !{i32 786478, metadata !1, metadata !"_ZTS3foo", metadata !"foo", metadata !"foo", metadata !"_ZN3fooC2Ev", i32 5, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 320, i1 false, void (%struct.foo*)* @_ZN3fooC2Ev, null, metadata !20, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!20 = metadata !{i32 786478, null, metadata !"_ZTS3foo", metadata !"foo", metadata !"foo", metadata !"", i32 0, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !21, i32 0} ; [ DW_TAG_subprogram ] [line 0] [foo]
+!19 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN3fooC2Ev\005\000\001\000\006\00320\000\005", metadata !1, metadata !"_ZTS3foo", metadata !14, null, void (%struct.foo*)* @_ZN3fooC2Ev, null, metadata !20, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!20 = metadata !{metadata !"0x2e\00foo\00foo\00\000\000\000\000\006\00320\000\000", null, metadata !"_ZTS3foo", metadata !14, null, null, null, i32 0, metadata !21} ; [ DW_TAG_subprogram ] [line 0] [foo]
 !21 = metadata !{i32 786468}
-!22 = metadata !{i32 786478, metadata !1, metadata !"_ZTS4base", metadata !"base", metadata !"base", metadata !"_ZN4baseC2Ev", i32 1, metadata !23, i1 false, i1 true, i32 0, i32 0, null, i32 320, i1 false, void (%struct.base*)* @_ZN4baseC2Ev, null, metadata !26, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [base]
-!23 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !24, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x2e\00base\00base\00_ZN4baseC2Ev\001\000\001\000\006\00320\000\001", metadata !1, metadata !"_ZTS4base", metadata !23, null, void (%struct.base*)* @_ZN4baseC2Ev, null, metadata !26, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [base]
+!23 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !24 = metadata !{null, metadata !25}
-!25 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4base]
-!26 = metadata !{i32 786478, null, metadata !"_ZTS4base", metadata !"base", metadata !"base", metadata !"", i32 0, metadata !23, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !27, i32 0} ; [ DW_TAG_subprogram ] [line 0] [base]
+!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4base]
+!26 = metadata !{metadata !"0x2e\00base\00base\00\000\000\000\000\006\00320\000\000", null, metadata !"_ZTS4base", metadata !23, null, null, null, i32 0, metadata !27} ; [ DW_TAG_subprogram ] [line 0] [base]
 !27 = metadata !{i32 786468}
-!28 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"", metadata !"", metadata !"_GLOBAL__I_a", i32 1, metadata !29, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__I_a, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [local] [def]
-!29 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__I_a\001\001\001\000\006\0064\000\001", metadata !1, metadata !10, metadata !29, null, void ()* @_GLOBAL__I_a, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [local] [def]
+!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !30 = metadata !{metadata !31}
-!31 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !10, i32 9, metadata !4, i32 0, i32 1, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 9] [def]
+!31 = metadata !{metadata !"0x34\00f\00f\00\009\000\001", null, metadata !10, metadata !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 9] [def]
 !32 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !34 = metadata !{metadata !"clang version 3.5.0 (trunk 203673) (llvm/trunk 203681)"}
 !35 = metadata !{i32 9, i32 0, metadata !9, null}
-!36 = metadata !{i32 786689, metadata !19, metadata !"this", null, i32 16777216, metadata !37, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!37 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
+!36 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !19, null, metadata !37} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!37 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
 !38 = metadata !{i32 0, i32 0, metadata !19, null}
 !39 = metadata !{i32 5, i32 0, metadata !19, null}
-!40 = metadata !{i32 786689, metadata !13, metadata !"this", null, i32 16777216, metadata !37, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!40 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !13, null, metadata !37} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !41 = metadata !{i32 0, i32 0, metadata !13, null}
 !42 = metadata !{i32 5, i32 0, metadata !43, null}
-!43 = metadata !{i32 786443, metadata !1, metadata !13, i32 5, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cc]
+!43 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !13} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cc]
 !44 = metadata !{i32 5, i32 0, metadata !13, null}
-!45 = metadata !{i32 786689, metadata !22, metadata !"this", null, i32 16777216, metadata !46, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!46 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4base]
+!45 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !46} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!46 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4base"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4base]
 !47 = metadata !{i32 0, i32 0, metadata !22, null}
 !48 = metadata !{i32 1, i32 0, metadata !22, null}
 !49 = metadata !{i32 1, i32 0, metadata !28, null}

diff --git a/test/DebugInfo/X86/discriminator.ll b/test/DebugInfo/X86/discriminator.ll
index aafdae1..b906e18 100644
--- a/test/DebugInfo/X86/discriminator.ll
+++ b/test/DebugInfo/X86/discriminator.ll

@@ -41,22 +41,22 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [./discriminator.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./discriminator.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"discriminator.c", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [./discriminator.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./discriminator.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5 "}
 !10 = metadata !{i32 2, i32 0, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 2, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./discriminator.c]
+!11 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./discriminator.c]
 !12 = metadata !{i32 3, i32 0, metadata !4, null}
 !13 = metadata !{i32 4, i32 0, metadata !4, null}
 !14 = metadata !{i32 2, i32 0, metadata !15, null}
-!15 = metadata !{i32 786443, metadata !1, metadata !4, i32 2, i32 0, i32 42, i32 1} ; [ DW_TAG_lexical_block ] [./discriminator.c]
+!15 = metadata !{metadata !"0xb\0042", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./discriminator.c]
 
 ; CHECK: Address            Line   Column File   ISA Discriminator Flags
 ; CHECK: ------------------ ------ ------ ------ --- ------------- -------------

diff --git a/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll b/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll
index 021b89e..d5d1f72 100644
--- a/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll
+++ b/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll

@@ -28,14 +28,14 @@
 ; Function Attrs: nounwind readnone uwtable
 define i32 @_Z3fooi(i32 %bar) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %bar}, i64 0, metadata !10), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{i32 %bar}, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !20
   ret i32 %bar, !dbg !20
 }
 
 ; Function Attrs: nounwind readnone uwtable
 define i32 @_Z4foo2i(i32 %bar2) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %bar2}, i64 0, metadata !13), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i32 %bar2}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !21
   ret i32 %bar2, !dbg !21
 }
 
@@ -51,7 +51,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -60,25 +60,25 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19, !26}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (191881)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !17, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/debug_ranges/a.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (191881)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !17, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/debug_ranges/a.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tmp/debug_ranges/a.cc", metadata !"/"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !11, metadata !14}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @_Z3fooi, null, null, metadata !9, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/debug_ranges/a.cc]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooi\002\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z3fooi, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/debug_ranges/a.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786689, metadata !4, metadata !"bar", metadata !5, i32 16777218, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bar] [line 2]
-!11 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo2", metadata !"foo2", metadata !"_Z4foo2i", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @_Z4foo2i, null, null, metadata !12, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [foo2]
+!10 = metadata !{metadata !"0x101\00bar\0016777218\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [bar] [line 2]
+!11 = metadata !{metadata !"0x2e\00foo2\00foo2\00_Z4foo2i\003\000\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z4foo2i, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 3] [def] [foo2]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786689, metadata !11, metadata !"bar2", metadata !5, i32 16777219, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bar2] [line 3]
-!14 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 5, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x101\00bar2\0016777219\000", metadata !11, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [bar2] [line 3]
+!14 = metadata !{metadata !"0x2e\00main\00main\00\005\000\001\000\006\00256\001\005", metadata !1, metadata !5, metadata !15, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !8}
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786484, i32 0, null, metadata !"global", metadata !"global", metadata !"", metadata !5, i32 1, metadata !8, i32 0, i32 1, i32* @global, null} ; [ DW_TAG_variable ] [global] [line 1] [def]
+!18 = metadata !{metadata !"0x34\00global\00global\00\001\000\001", null, metadata !5, metadata !8, i32* @global, null} ; [ DW_TAG_variable ] [global] [line 1] [def]
 !19 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !20 = metadata !{i32 2, i32 0, metadata !4, null}
 !21 = metadata !{i32 3, i32 0, metadata !11, null}
@@ -86,4 +86,4 @@
 !23 = metadata !{metadata !"int", metadata !24}
 !24 = metadata !{metadata !"omnipotent char", metadata !25}
 !25 = metadata !{metadata !"Simple C/C++ TBAA"}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dwarf-aranges.ll b/test/DebugInfo/X86/dwarf-aranges.ll
index 9ad6185..237e418 100644
--- a/test/DebugInfo/X86/dwarf-aranges.ll
+++ b/test/DebugInfo/X86/dwarf-aranges.ll

@@ -15,18 +15,15 @@
 
 ; <data section> - it should have made one span covering all vars in this CU.
 ; CHECK-NEXT: .quad some_data
-; CHECK-NEXT: [[R1:\.[A-Za-z0-9]*]] = .Ldebug_end1-some_data
-; CHECK-NEXT: .quad [[R1]]
+; CHECK-NEXT: .quad .Ldebug_end1-some_data
 
 ; <text section> - it should have made one span covering all functions in this CU.
 ; CHECK-NEXT: .quad .Lfunc_begin0
-; CHECK-NEXT: [[R2:\.[A-Za-z0-9]*]] = .Ldebug_end2-.Lfunc_begin0
-; CHECK-NEXT: .quad [[R2]]
+; CHECK-NEXT: .quad .Ldebug_end2-.Lfunc_begin0
 
 ; <other sections> - it should have made one span covering all vars in this CU.
 ; CHECK-NEXT: .quad some_other
-; CHECK-NEXT: [[R3:\.[A-Za-z0-9]*]] = .Ldebug_end3-some_other
-; CHECK-NEXT: .quad [[R3]]
+; CHECK-NEXT: .quad .Ldebug_end3-some_other
 
 ; -- finish --
 ; CHECK-NEXT: # ARange terminator
@@ -65,20 +62,20 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13, !16}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !8, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/kayamon/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !8, metadata !2} ; [ DW_TAG_compile_unit ] [/home/kayamon/test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test.c", metadata !"/home/kayamon"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"some_code", metadata !"some_code", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @some_code, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [some_code]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00some_code\00some_code\00\005\000\001\000\006\000\000\006", metadata !1, metadata !5, metadata !6, null, void ()* @some_code, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [some_code]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{metadata !9, metadata !11, metadata !12}
-!9 = metadata !{i32 786484, i32 0, null, metadata !"some_data", metadata !"some_data", metadata !"", metadata !5, i32 1, metadata !10, i32 0, i32 1, i32* @some_data, null} ; [ DW_TAG_variable ] [some_data] [line 1] [def]
-!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!11 = metadata !{i32 786484, i32 0, null, metadata !"some_other", metadata !"some_other", metadata !"", metadata !5, i32 3, metadata !10, i32 0, i32 1, i32* @some_other, null} ; [ DW_TAG_variable ] [some_other] [line 3] [def]
-!12 = metadata !{i32 786484, i32 0, null, metadata !"some_bss", metadata !"some_bss", metadata !"", metadata !5, i32 2, metadata !10, i32 0, i32 1, i32* @some_bss, null} ; [ DW_TAG_variable ] [some_bss] [line 2] [def]
+!9 = metadata !{metadata !"0x34\00some_data\00some_data\00\001\000\001", null, metadata !5, metadata !10, i32* @some_data, null} ; [ DW_TAG_variable ] [some_data] [line 1] [def]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = metadata !{metadata !"0x34\00some_other\00some_other\00\003\000\001", null, metadata !5, metadata !10, i32* @some_other, null} ; [ DW_TAG_variable ] [some_other] [line 3] [def]
+!12 = metadata !{metadata !"0x34\00some_bss\00some_bss\00\002\000\001", null, metadata !5, metadata !10, i32* @some_bss, null} ; [ DW_TAG_variable ] [some_bss] [line 2] [def]
 !13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !14 = metadata !{i32 7, i32 0, metadata !4, null}
-!15 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!15 = metadata !{i32 8, i32 0, metadata !4, null}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dwarf-public-names.ll b/test/DebugInfo/X86/dwarf-public-names.ll
index 793971a..aebc7ef 100644
--- a/test/DebugInfo/X86/dwarf-public-names.ll
+++ b/test/DebugInfo/X86/dwarf-public-names.ll

@@ -41,6 +41,7 @@
 
 ; Skip the output to the header of the pubnames section.
 ; LINUX: debug_pubnames
+; LINUX: unit_size = 0x00000128
 
 ; Check for each name in the output.
 ; LINUX-DAG: "ns"
@@ -62,13 +63,13 @@
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !28), !dbg !30
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !30
   %this1 = load %struct.C** %this.addr
   store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !31
   ret void, !dbg !32
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @_ZN1C22static_member_functionEv() nounwind uwtable align 2 {
 entry:
@@ -93,36 +94,36 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{i32 786449, metadata !37, i32 4, metadata !"clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !24,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)\000\00\000\00\000", metadata !37, metadata !1, metadata !1, metadata !2, metadata !24,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{}
 !2 = metadata !{metadata !3, metadata !18, metadata !19, metadata !20}
-!3 = metadata !{i32 786478, metadata !4, null, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 9, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !12, metadata !1, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
-!4 = metadata !{i32 786473, metadata !37} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!3 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\009\000\001\000\006\00256\000\009", metadata !4, null, metadata !5, null, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !12, metadata !1} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
+!4 = metadata !{metadata !"0x29", metadata !37} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{null, metadata !7}
-!7 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
-!8 = metadata !{i32 786451, metadata !37, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
+!8 = metadata !{metadata !"0x13\00C\001\008\008\000\000\000", metadata !37, null, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !12, metadata !14}
-!10 = metadata !{i32 786445, metadata !37, metadata !8, metadata !"static_member_variable", i32 4, i64 0, i64 0, i64 0, i32 4096, metadata !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{i32 786478, metadata !4, metadata !8, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 2, metadata !5, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !13, i32 2} ; [ DW_TAG_subprogram ] [line 2] [member_function]
-!13 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!14 = metadata !{i32 786478, metadata !4, metadata !8, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 3, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !17, i32 3} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
-!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0xd\00static_member_variable\004\000\000\000\004096", metadata !37, metadata !8, metadata !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\002\000\000\000\006\00256\000\002", metadata !4, metadata !8, metadata !5, null, null, null, i32 0, metadata !13} ; [ DW_TAG_subprogram ] [line 2] [member_function]
+!13 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!14 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\003\000\000\000\006\00256\000\003", metadata !4, metadata !8, metadata !15, null, null, null, i32 0, metadata !17} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !11}
-!17 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!18 = metadata !{i32 786478, metadata !4, null, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 13, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
-!19 = metadata !{i32 786478, metadata !4, metadata !4, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 19, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !1, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
-!20 = metadata !{i32 786478, metadata !4, metadata !21, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 24, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
-!21 = metadata !{i32 786489, metadata !4, null, metadata !"ns", i32 23} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
-!22 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!18 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\0013\000\001\000\006\00256\000\0013", metadata !4, null, metadata !15, null, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
+!19 = metadata !{metadata !"0x2e\00global_function\00global_function\00_Z15global_functionv\0019\000\001\000\006\00256\000\0019", metadata !4, metadata !4, metadata !15, null, i32 ()* @_Z15global_functionv, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
+!20 = metadata !{metadata !"0x2e\00global_namespace_function\00global_namespace_function\00_ZN2ns25global_namespace_functionEv\0024\000\001\000\006\00256\000\0024", metadata !4, metadata !21, metadata !22, null, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
+!21 = metadata !{metadata !"0x39\00ns\0023", metadata !4, null} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !23 = metadata !{null}
 !24 = metadata !{metadata !25, metadata !26, metadata !27}
-!25 = metadata !{i32 786484, i32 0, metadata !8, metadata !"static_member_variable", metadata !"static_member_variable", metadata !"_ZN1C22static_member_variableE", metadata !4, i32 7, metadata !11, i32 0, i32 1, i32* @_ZN1C22static_member_variableE, metadata !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
-!26 = metadata !{i32 786484, i32 0, null, metadata !"global_variable", metadata !"global_variable", metadata !"", metadata !4, i32 17, metadata !8, i32 0, i32 1, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
-!27 = metadata !{i32 786484, i32 0, metadata !21, metadata !"global_namespace_variable", metadata !"global_namespace_variable", metadata !"_ZN2ns25global_namespace_variableE", metadata !4, i32 27, metadata !11, i32 0, i32 1, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
-!28 = metadata !{i32 786689, metadata !3, metadata !"this", metadata !4, i32 16777225, metadata !29, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 9]
-!29 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C]
+!25 = metadata !{metadata !"0x34\00static_member_variable\00static_member_variable\00_ZN1C22static_member_variableE\007\000\001", metadata !8, metadata !4, metadata !11, i32* @_ZN1C22static_member_variableE, metadata !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
+!26 = metadata !{metadata !"0x34\00global_variable\00global_variable\00\0017\000\001", null, metadata !4, metadata !8, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
+!27 = metadata !{metadata !"0x34\00global_namespace_variable\00global_namespace_variable\00_ZN2ns25global_namespace_variableE\0027\000\001", metadata !21, metadata !4, metadata !11, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
+!28 = metadata !{metadata !"0x101\00this\0016777225\001088", metadata !3, metadata !4, metadata !29} ; [ DW_TAG_arg_variable ] [this] [line 9]
+!29 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C]
 !30 = metadata !{i32 9, i32 0, metadata !3, null}
 !31 = metadata !{i32 10, i32 0, metadata !3, null}
 !32 = metadata !{i32 11, i32 0, metadata !3, null}
@@ -131,4 +132,4 @@
 !35 = metadata !{i32 25, i32 0, metadata !20, null}
 !36 = metadata !{i32 26, i32 0, metadata !20, null}
 !37 = metadata !{metadata !"dwarf-public-names.cpp", metadata !"/usr2/kparzysz/s.hex/t"}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/dwarf-pubnames-split.ll b/test/DebugInfo/X86/dwarf-pubnames-split.ll
index 65c46d3..87dd0ff 100644
--- a/test/DebugInfo/X86/dwarf-pubnames-split.ll
+++ b/test/DebugInfo/X86/dwarf-pubnames-split.ll

@@ -24,15 +24,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 189287) (llvm/trunk 189296)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 189287) (llvm/trunk 189296)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !10 = metadata !{i32 2, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/earlydup-crash.ll b/test/DebugInfo/X86/earlydup-crash.ll
index b5dc01e..6bbd620 100644
--- a/test/DebugInfo/X86/earlydup-crash.ll
+++ b/test/DebugInfo/X86/earlydup-crash.ll

@@ -4,7 +4,7 @@
 
 %struct.cpp_dir = type { %struct.cpp_dir*, i8*, i32, i8, i8**, i8*, i8* (i8*, %struct.cpp_dir*)*, i64, i32, i8 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define internal i8* @framework_construct_pathname(i8* %fname, %struct.cpp_dir* %dir) nounwind ssp {
 entry:
@@ -13,7 +13,7 @@
 bb:                                               ; preds = %entry
   %tmp = icmp eq i32 undef, 0
   %tmp1 = add i32 0, 11
-  call void @llvm.dbg.value(metadata !{i32 %tmp1}, i64 0, metadata !0)
+  call void @llvm.dbg.value(metadata !{i32 %tmp1}, i64 0, metadata !0, metadata !{metadata !"0x102"})
   br i1 undef, label %bb18, label %bb31.preheader
 
 bb31.preheader:                                   ; preds = %bb19, %bb
@@ -44,51 +44,51 @@
 
 !llvm.dbg.cu = !{!4}
 !llvm.module.flags = !{!47}
-!0 = metadata !{i32 590080, metadata !1, metadata !"frname_len", metadata !3, i32 517, metadata !38, i32 0} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{i32 589835, metadata !44, metadata !2, i32 515, i32 0, i32 19} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{i32 589870, metadata !44, null, metadata !"framework_construct_pathname", metadata !"framework_construct_pathname", metadata !"", i32 515, metadata !5, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8* (i8*, %struct.cpp_dir*)* @framework_construct_pathname, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!3 = metadata !{i32 589865, metadata !44}  ; [ DW_TAG_file_type ]
-!4 = metadata !{i32 589841, metadata !44, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !46, metadata !46, metadata !45, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 589845, metadata !44, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x100\00frname_len\00517\000", metadata !1, metadata !3, metadata !38} ; [ DW_TAG_auto_variable ]
+!1 = metadata !{metadata !"0xb\00515\000\0019", metadata !44, metadata !2} ; [ DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0x2e\00framework_construct_pathname\00framework_construct_pathname\00\00515\001\001\000\006\00256\001\000", metadata !44, null, metadata !5, null, i8* (i8*, %struct.cpp_dir*)* @framework_construct_pathname, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x29", metadata !44}  ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !44, metadata !46, metadata !46, metadata !45, null, null} ; [ DW_TAG_compile_unit ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !44, metadata !3, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7, metadata !9, metadata !11}
-!7 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 589860, metadata !44, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 589862, metadata !44, metadata !3, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ]
-!11 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 589846, metadata !41, metadata !13, metadata !"cpp_dir", i32 45, i64 0, i64 0, i64 0, i32 0, metadata !14} ; [ DW_TAG_typedef ]
-!13 = metadata !{i32 589865, metadata !41} ; [ DW_TAG_file_type ]
-!14 = metadata !{i32 589843, metadata !41, metadata !3, metadata !"cpp_dir", i32 43, i64 352, i64 32, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [cpp_dir] [line 43, size 352, align 32, offset 0] [def] [from ]
+!7 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !8} ; [ DW_TAG_pointer_type ]
+!8 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x26\00\000\008\008\000\000", metadata !44, metadata !3, metadata !8} ; [ DW_TAG_const_type ]
+!11 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !12} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0x16\00cpp_dir\0045\000\000\000\000", metadata !41, metadata !13, metadata !14} ; [ DW_TAG_typedef ]
+!13 = metadata !{metadata !"0x29", metadata !41} ; [ DW_TAG_file_type ]
+!14 = metadata !{metadata !"0x13\00cpp_dir\0043\00352\0032\000\000\000", metadata !41, metadata !3, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [cpp_dir] [line 43, size 352, align 32, offset 0] [def] [from ]
 !15 = metadata !{metadata !16, metadata !18, metadata !19, metadata !21, metadata !23, metadata !25, metadata !27, metadata !29, metadata !33, metadata !36}
-!16 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"next", i32 572, i64 32, i64 32, i64 0, i32 0, metadata !17} ; [ DW_TAG_member ]
-!17 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!18 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"name", i32 575, i64 32, i64 32, i64 32, i32 0, metadata !7} ; [ DW_TAG_member ]
-!19 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"len", i32 576, i64 32, i64 32, i64 64, i32 0, metadata !20} ; [ DW_TAG_member ]
-!20 = metadata !{i32 589860, metadata !44, metadata !3, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!21 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"sysp", i32 580, i64 8, i64 8, i64 96, i32 0, metadata !22} ; [ DW_TAG_member ]
-!22 = metadata !{i32 589860, metadata !44, metadata !3, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!23 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"name_map", i32 584, i64 32, i64 32, i64 128, i32 0, metadata !24} ; [ DW_TAG_member ]
-!24 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ]
-!25 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"header_map", i32 590, i64 32, i64 32, i64 160, i32 0, metadata !26} ; [ DW_TAG_member ]
-!26 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!27 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"construct", i32 597, i64 32, i64 32, i64 192, i32 0, metadata !28} ; [ DW_TAG_member ]
-!28 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
-!29 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"ino", i32 601, i64 64, i64 64, i64 224, i32 0, metadata !30} ; [ DW_TAG_member ]
-!30 = metadata !{i32 589846, metadata !42, metadata !31, metadata !"ino_t", i32 141, i64 0, i64 0, i64 0, i32 0, metadata !32} ; [ DW_TAG_typedef ]
-!31 = metadata !{i32 589865, metadata !42} ; [ DW_TAG_file_type ]
-!32 = metadata !{i32 589860, metadata !44, metadata !3, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!33 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"dev", i32 602, i64 32, i64 32, i64 288, i32 0, metadata !34} ; [ DW_TAG_member ]
-!34 = metadata !{i32 589846, metadata !42, metadata !31, metadata !"dev_t", i32 107, i64 0, i64 0, i64 0, i32 0, metadata !35} ; [ DW_TAG_typedef ]
-!35 = metadata !{i32 589860, metadata !44, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!36 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"user_supplied_p", i32 605, i64 8, i64 8, i64 320, i32 0, metadata !37} ; [ DW_TAG_member ]
-!37 = metadata !{i32 589860, metadata !44, metadata !3, metadata !"_Bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
-!38 = metadata !{i32 589846, metadata !43, metadata !39, metadata !"size_t", i32 326, i64 0, i64 0, i64 0, i32 0, metadata !40} ; [ DW_TAG_typedef ]
-!39 = metadata !{i32 589865, metadata !43} ; [ DW_TAG_file_type ]
-!40 = metadata !{i32 589860, metadata !44, metadata !3, metadata !"long unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!16 = metadata !{metadata !"0xd\00next\00572\0032\0032\000\000", metadata !41, metadata !14, metadata !17} ; [ DW_TAG_member ]
+!17 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !14} ; [ DW_TAG_pointer_type ]
+!18 = metadata !{metadata !"0xd\00name\00575\0032\0032\0032\000", metadata !41, metadata !14, metadata !7} ; [ DW_TAG_member ]
+!19 = metadata !{metadata !"0xd\00len\00576\0032\0032\0064\000", metadata !41, metadata !14, metadata !20} ; [ DW_TAG_member ]
+!20 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
+!21 = metadata !{metadata !"0xd\00sysp\00580\008\008\0096\000", metadata !41, metadata !14, metadata !22} ; [ DW_TAG_member ]
+!22 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
+!23 = metadata !{metadata !"0xd\00name_map\00584\0032\0032\00128\000", metadata !41, metadata !14, metadata !24} ; [ DW_TAG_member ]
+!24 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !9} ; [ DW_TAG_pointer_type ]
+!25 = metadata !{metadata !"0xd\00header_map\00590\0032\0032\00160\000", metadata !41, metadata !14, metadata !26} ; [ DW_TAG_member ]
+!26 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, null} ; [ DW_TAG_pointer_type ]
+!27 = metadata !{metadata !"0xd\00construct\00597\0032\0032\00192\000", metadata !41, metadata !14, metadata !28} ; [ DW_TAG_member ]
+!28 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", metadata !44, metadata !3, metadata !5} ; [ DW_TAG_pointer_type ]
+!29 = metadata !{metadata !"0xd\00ino\00601\0064\0064\00224\000", metadata !41, metadata !14, metadata !30} ; [ DW_TAG_member ]
+!30 = metadata !{metadata !"0x16\00ino_t\00141\000\000\000\000", metadata !42, metadata !31, metadata !32} ; [ DW_TAG_typedef ]
+!31 = metadata !{metadata !"0x29", metadata !42} ; [ DW_TAG_file_type ]
+!32 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
+!33 = metadata !{metadata !"0xd\00dev\00602\0032\0032\00288\000", metadata !41, metadata !14, metadata !34} ; [ DW_TAG_member ]
+!34 = metadata !{metadata !"0x16\00dev_t\00107\000\000\000\000", metadata !42, metadata !31, metadata !35} ; [ DW_TAG_typedef ]
+!35 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
+!36 = metadata !{metadata !"0xd\00user_supplied_p\00605\008\008\00320\000", metadata !41, metadata !14, metadata !37} ; [ DW_TAG_member ]
+!37 = metadata !{metadata !"0x24\00_Bool\000\008\008\000\000\002", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
+!38 = metadata !{metadata !"0x16\00size_t\00326\000\000\000\000", metadata !43, metadata !39, metadata !40} ; [ DW_TAG_typedef ]
+!39 = metadata !{metadata !"0x29", metadata !43} ; [ DW_TAG_file_type ]
+!40 = metadata !{metadata !"0x24\00long unsigned int\000\0032\0032\000\000\007", metadata !44, metadata !3} ; [ DW_TAG_base_type ]
 !41 = metadata !{metadata !"cpplib.h", metadata !"/Users/espindola/llvm/build-llvm-gcc/gcc/../../llvm-gcc-4.2/gcc/../libcpp/include"}
 !42 = metadata !{metadata !"types.h", metadata !"/usr/include/sys"}
 !43 = metadata !{metadata !"stddef.h", metadata !"/Users/espindola/llvm/build-llvm-gcc/./prev-gcc/include"}
 !44 = metadata !{metadata !"darwin-c.c", metadata !"/Users/espindola/llvm/build-llvm-gcc/gcc/../../llvm-gcc-4.2/gcc/config"}
 !45 = metadata !{metadata !2}
 !46 = metadata !{i32 0}
-!47 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!47 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll
index 36fd232..71be903 100644
--- a/test/DebugInfo/X86/elf-names.ll
+++ b/test/DebugInfo/X86/elf-names.ll

@@ -22,7 +22,7 @@
 
 define void @_ZN1DC2Ev(%class.D* nocapture %this) unnamed_addr nounwind uwtable align 2 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !29), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !29, metadata !{metadata !"0x102"}), !dbg !36
   %c1 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !37
   store i32 1, i32* %c1, align 4, !dbg !37
   %c2 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !42
@@ -36,8 +36,8 @@
 
 define void @_ZN1DC2ERKS_(%class.D* nocapture %this, %class.D* nocapture %d) unnamed_addr nounwind uwtable align 2 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !34), !dbg !46
-  tail call void @llvm.dbg.value(metadata !{%class.D* %d}, i64 0, metadata !35), !dbg !46
+  tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !34, metadata !{metadata !"0x102"}), !dbg !46
+  tail call void @llvm.dbg.value(metadata !{%class.D* %d}, i64 0, metadata !35, metadata !{metadata !"0x102"}), !dbg !46
   %c1 = getelementptr inbounds %class.D* %d, i64 0, i32 0, !dbg !47
   %0 = load i32* %c1, align 4, !dbg !47
   %c12 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !47
@@ -57,56 +57,56 @@
   ret void, !dbg !52
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!54}
 
-!0 = metadata !{i32 786449, metadata !53, i32 4, metadata !"clang version 3.2 (trunk 167506) (llvm/trunk 167505)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 167506) (llvm/trunk 167505)\001\00\000\00\000", metadata !53, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !31}
-!5 = metadata !{i32 786478, metadata !6, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2Ev", i32 12, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*)* @_ZN1DC2Ev, null, metadata !17, metadata !27, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [D]
-!6 = metadata !{i32 786473, metadata !53} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00D\00D\00_ZN1DC2Ev\0012\000\001\000\006\00256\001\0012", metadata !6, null, metadata !7, null, void (%class.D*)* @_ZN1DC2Ev, null, metadata !17, metadata !27} ; [ DW_TAG_subprogram ] [line 12] [def] [D]
+!6 = metadata !{metadata !"0x29", metadata !53} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
-!10 = metadata !{i32 786434, metadata !53, null, metadata !"D", i32 1, i64 128, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!10 = metadata !{metadata !"0x2\00D\001\00128\0032\000\000\000", metadata !53, null, null, metadata !11, null, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [def] [from ]
 !11 = metadata !{metadata !12, metadata !14, metadata !15, metadata !16, metadata !17, metadata !20}
-!12 = metadata !{i32 786445, metadata !53, metadata !10, metadata !"c1", i32 6, i64 32, i64 32, i64 0, i32 1, metadata !13} ; [ DW_TAG_member ] [c1] [line 6, size 32, align 32, offset 0] [private] [from int]
-!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{i32 786445, metadata !53, metadata !10, metadata !"c2", i32 7, i64 32, i64 32, i64 32, i32 1, metadata !13} ; [ DW_TAG_member ] [c2] [line 7, size 32, align 32, offset 32] [private] [from int]
-!15 = metadata !{i32 786445, metadata !53, metadata !10, metadata !"c3", i32 8, i64 32, i64 32, i64 64, i32 1, metadata !13} ; [ DW_TAG_member ] [c3] [line 8, size 32, align 32, offset 64] [private] [from int]
-!16 = metadata !{i32 786445, metadata !53, metadata !10, metadata !"c4", i32 9, i64 32, i64 32, i64 96, i32 1, metadata !13} ; [ DW_TAG_member ] [c4] [line 9, size 32, align 32, offset 96] [private] [from int]
-!17 = metadata !{i32 786478, metadata !6, metadata !10, metadata !"D", metadata !"D", metadata !"", i32 3, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [D]
+!12 = metadata !{metadata !"0xd\00c1\006\0032\0032\000\001", metadata !53, metadata !10, metadata !13} ; [ DW_TAG_member ] [c1] [line 6, size 32, align 32, offset 0] [private] [from int]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !"0xd\00c2\007\0032\0032\0032\001", metadata !53, metadata !10, metadata !13} ; [ DW_TAG_member ] [c2] [line 7, size 32, align 32, offset 32] [private] [from int]
+!15 = metadata !{metadata !"0xd\00c3\008\0032\0032\0064\001", metadata !53, metadata !10, metadata !13} ; [ DW_TAG_member ] [c3] [line 8, size 32, align 32, offset 64] [private] [from int]
+!16 = metadata !{metadata !"0xd\00c4\009\0032\0032\0096\001", metadata !53, metadata !10, metadata !13} ; [ DW_TAG_member ] [c4] [line 9, size 32, align 32, offset 96] [private] [from int]
+!17 = metadata !{metadata !"0x2e\00D\00D\00\003\000\000\000\006\00256\001\003", metadata !6, metadata !10, metadata !7, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 3] [D]
 !18 = metadata !{metadata !19}
-!19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!20 = metadata !{i32 786478, metadata !6, metadata !10, metadata !"D", metadata !"D", metadata !"", i32 4, metadata !21, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !25, i32 4} ; [ DW_TAG_subprogram ] [line 4] [D]
-!21 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = metadata !{metadata !"0x2e\00D\00D\00\004\000\000\000\006\00256\001\004", metadata !6, metadata !10, metadata !21, null, null, null, i32 0, metadata !25} ; [ DW_TAG_subprogram ] [line 4] [D]
+!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{null, metadata !9, metadata !23}
-!23 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !24} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!24 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from D]
+!23 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !24} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !10} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from D]
 !25 = metadata !{metadata !26}
-!26 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!26 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
 !27 = metadata !{metadata !29}
-!29 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777228, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 12]
-!30 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
-!31 = metadata !{i32 786478, metadata !6, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2ERKS_", i32 19, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*, %class.D*)* @_ZN1DC2ERKS_, null, metadata !20, metadata !32, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [D]
+!29 = metadata !{metadata !"0x101\00this\0016777228\001088", metadata !5, metadata !6, metadata !30} ; [ DW_TAG_arg_variable ] [this] [line 12]
+!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!31 = metadata !{metadata !"0x2e\00D\00D\00_ZN1DC2ERKS_\0019\000\001\000\006\00256\001\0019", metadata !6, null, metadata !21, null, void (%class.D*, %class.D*)* @_ZN1DC2ERKS_, null, metadata !20, metadata !32} ; [ DW_TAG_subprogram ] [line 19] [def] [D]
 !32 = metadata !{metadata !34, metadata !35}
-!34 = metadata !{i32 786689, metadata !31, metadata !"this", metadata !6, i32 16777235, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 19]
-!35 = metadata !{i32 786689, metadata !31, metadata !"d", metadata !6, i32 33554451, metadata !23, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [d] [line 19]
+!34 = metadata !{metadata !"0x101\00this\0016777235\001088", metadata !31, metadata !6, metadata !30} ; [ DW_TAG_arg_variable ] [this] [line 19]
+!35 = metadata !{metadata !"0x101\00d\0033554451\000", metadata !31, metadata !6, metadata !23} ; [ DW_TAG_arg_variable ] [d] [line 19]
 !36 = metadata !{i32 12, i32 0, metadata !5, null}
 !37 = metadata !{i32 13, i32 0, metadata !38, null}
-!38 = metadata !{i32 786443, metadata !6, metadata !5, i32 12, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!38 = metadata !{metadata !"0xb\0012\000\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
 !42 = metadata !{i32 14, i32 0, metadata !38, null}
 !43 = metadata !{i32 15, i32 0, metadata !38, null}
 !44 = metadata !{i32 16, i32 0, metadata !38, null}
 !45 = metadata !{i32 17, i32 0, metadata !38, null}
 !46 = metadata !{i32 19, i32 0, metadata !31, null}
 !47 = metadata !{i32 20, i32 0, metadata !48, null}
-!48 = metadata !{i32 786443, metadata !6, metadata !31, i32 19, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!48 = metadata !{metadata !"0xb\0019\000\001", metadata !6, metadata !31} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
 !49 = metadata !{i32 21, i32 0, metadata !48, null}
 !50 = metadata !{i32 22, i32 0, metadata !48, null}
 !51 = metadata !{i32 23, i32 0, metadata !48, null}
 !52 = metadata !{i32 24, i32 0, metadata !48, null}
 !53 = metadata !{metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo"}
-!54 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!54 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/empty-and-one-elem-array.ll b/test/DebugInfo/X86/empty-and-one-elem-array.ll
index f5c37df..bbf527d 100644
--- a/test/DebugInfo/X86/empty-and-one-elem-array.ll
+++ b/test/DebugInfo/X86/empty-and-one-elem-array.ll

@@ -9,8 +9,8 @@
 entry:
   %my_foo = alloca %struct.foo, align 4
   %my_bar = alloca %struct.bar, align 4
-  call void @llvm.dbg.declare(metadata !{%struct.foo* %my_foo}, metadata !10), !dbg !19
-  call void @llvm.dbg.declare(metadata !{%struct.bar* %my_bar}, metadata !20), !dbg !28
+  call void @llvm.dbg.declare(metadata !{%struct.foo* %my_foo}, metadata !10, metadata !{metadata !"0x102"}), !dbg !19
+  call void @llvm.dbg.declare(metadata !{%struct.bar* %my_bar}, metadata !20, metadata !{metadata !"0x102"}), !dbg !28
   %a = getelementptr inbounds %struct.foo* %my_foo, i32 0, i32 0, !dbg !29
   store i32 3, i32* %a, align 4, !dbg !29
   %a1 = getelementptr inbounds %struct.bar* %my_bar, i32 0, i32 0, !dbg !30
@@ -23,14 +23,11 @@
   ret i32 %add, !dbg !31
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-; An empty array should not have an AT_upper_bound attribute. But an array of 1
-; should.
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 ; CHECK:      DW_TAG_base_type
 ; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[{{.*}}] = "int")
-; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1]   (0x05)
+; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1]   (DW_ATE_signed)
 ; CHECK-NEXT: DW_AT_byte_size [DW_FORM_data1]  (0x04)
 
 ; int foo::b[1]:
@@ -46,7 +43,7 @@
 ; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
 ; CHECK:      DW_TAG_subrange_type [{{.*}}]
 ; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
-; CHECK-NEXT: DW_AT_upper_bound [DW_FORM_data1]  (0x00)
+; CHECK-NEXT: DW_AT_count [DW_FORM_data1]  (0x01)
 
 ; int bar::b[0]:
 ; CHECK: DW_TAG_structure_type
@@ -59,42 +56,42 @@
 ; int[0]:
 ; CHECK:      DW_TAG_array_type [{{.*}}] *
 ; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
-; CHECK:      DW_TAG_subrange_type [11]
+; CHECK:      DW_TAG_subrange_type
 ; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
-; CHECK-NOT:  DW_AT_upper_bound
+; CHECK:      DW_AT_count [DW_FORM_data1]  (0x00)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786449, metadata !32, i32 12, metadata !"clang version 3.3 (trunk 169136)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 169136)\000\00\000\00\000", metadata !32, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/test.c] [DW_LANG_C99]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"func", metadata !"func", metadata !"", i32 11, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @func, null, null, metadata !1, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [func]
-!6 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00func\00func\00\0011\000\001\000\006\000\000\0011", metadata !6, metadata !6, metadata !7, null, i32 ()* @func, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 11] [def] [func]
+!6 = metadata !{metadata !"0x29", metadata !32} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786688, metadata !11, metadata !"my_foo", metadata !6, i32 12, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [my_foo] [line 12]
-!11 = metadata !{i32 786443, metadata !6, metadata !5, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Volumes/Sandbox/llvm/test.c]
-!12 = metadata !{i32 786451, metadata !32, null, metadata !"foo", i32 1, i64 64, i64 32, i32 0, i32 0, null, metadata !13, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x100\00my_foo\0012\000", metadata !11, metadata !6, metadata !12} ; [ DW_TAG_auto_variable ] [my_foo] [line 12]
+!11 = metadata !{metadata !"0xb\0011\000\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ] [/Volumes/Sandbox/llvm/test.c]
+!12 = metadata !{metadata !"0x13\00foo\001\0064\0032\000\000\000", metadata !32, null, null, metadata !13, null, i32 0, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
 !13 = metadata !{metadata !14, metadata !15}
-!14 = metadata !{i32 786445, metadata !32, metadata !12, metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!15 = metadata !{i32 786445, metadata !32, metadata !12, metadata !"b", i32 3, i64 32, i64 32, i64 32, i32 0, metadata !16} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from ]
-!16 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 32, i32 0, i32 0, metadata !9, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 32, offset 0] [from int]
+!14 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !32, metadata !12, metadata !9} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!15 = metadata !{metadata !"0xd\00b\003\0032\0032\0032\000", metadata !32, metadata !12, metadata !16} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from ]
+!16 = metadata !{metadata !"0x1\00\000\0032\0032\000\000", null, null, metadata !9, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 32, offset 0] [from int]
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786465, i64 0, i64 1} ; [ DW_TAG_subrange_type ] [0, 1]
+!18 = metadata !{metadata !"0x21\000\001"} ; [ DW_TAG_subrange_type ] [0, 1]
 !19 = metadata !{i32 12, i32 0, metadata !11, null}
-!20 = metadata !{i32 786688, metadata !11, metadata !"my_bar", metadata !6, i32 13, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [my_bar] [line 13]
-!21 = metadata !{i32 786451, metadata !32, null, metadata !"bar", i32 6, i64 32, i64 32, i32 0, i32 0, null, metadata !22, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [bar] [line 6, size 32, align 32, offset 0] [def] [from ]
+!20 = metadata !{metadata !"0x100\00my_bar\0013\000", metadata !11, metadata !6, metadata !21} ; [ DW_TAG_auto_variable ] [my_bar] [line 13]
+!21 = metadata !{metadata !"0x13\00bar\006\0032\0032\000\000\000", metadata !32, null, null, metadata !22, null, i32 0, null} ; [ DW_TAG_structure_type ] [bar] [line 6, size 32, align 32, offset 0] [def] [from ]
 !22 = metadata !{metadata !23, metadata !24}
-!23 = metadata !{i32 786445, metadata !32, metadata !21, metadata !"a", i32 7, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [a] [line 7, size 32, align 32, offset 0] [from int]
-!24 = metadata !{i32 786445, metadata !32, metadata !21, metadata !"b", i32 8, i64 0, i64 32, i64 32, i32 0, metadata !25} ; [ DW_TAG_member ] [b] [line 8, size 0, align 32, offset 32] [from ]
-!25 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !26, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!23 = metadata !{metadata !"0xd\00a\007\0032\0032\000\000", metadata !32, metadata !21, metadata !9} ; [ DW_TAG_member ] [a] [line 7, size 32, align 32, offset 0] [from int]
+!24 = metadata !{metadata !"0xd\00b\008\000\0032\0032\000", metadata !32, metadata !21, metadata !25} ; [ DW_TAG_member ] [b] [line 8, size 0, align 32, offset 32] [from ]
+!25 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !9, metadata !26, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !26 = metadata !{metadata !27}
-!27 = metadata !{i32 786465, i64 0, i64 0} ; [ DW_TAG_subrange_type ] [0, 0]
+!27 = metadata !{metadata !"0x21\000\000"} ; [ DW_TAG_subrange_type ] [0, 0]
 !28 = metadata !{i32 13, i32 0, metadata !11, null}
 !29 = metadata !{i32 15, i32 0, metadata !11, null}
 !30 = metadata !{i32 16, i32 0, metadata !11, null}
 !31 = metadata !{i32 17, i32 0, metadata !11, null}
 !32 = metadata !{metadata !"test.c", metadata !"/Volumes/Sandbox/llvm"}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/empty-array.ll b/test/DebugInfo/X86/empty-array.ll
index 3fab313..f334ed3 100644
--- a/test/DebugInfo/X86/empty-array.ll
+++ b/test/DebugInfo/X86/empty-array.ll

@@ -22,28 +22,28 @@
 ; CHECK: [[BASE2]]: DW_TAG_base_type
 ; CHECK-NEXT: DW_AT_name
 ; CHECK-NEXT: DW_AT_byte_size [DW_FORM_data1]  (0x08)
-; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1]   (0x07)
+; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1]   (DW_ATE_unsigned)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.3 (trunk 169136)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169136)\000\00\000\00\000", metadata !20, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786434, metadata !20, null, metadata !"A", i32 1, i64 0, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !6, metadata !7, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x2\00A\001\000\0032\000\000\000", metadata !20, null, null, metadata !8, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{i32 786445, metadata !20, metadata !7, metadata !"x", i32 1, i64 0, i64 0, i64 0, i32 1, metadata !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
-!10 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xd\00x\001\000\000\000\001", metadata !20, metadata !7, metadata !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
+!10 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786465, i64 0, i64 -1} ; [ DW_TAG_subrange_type ] [unbound]
-!14 = metadata !{i32 786478, metadata !6, metadata !7, metadata !"A", metadata !"A", metadata !"", i32 1, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !18, i32 1} ; [ DW_TAG_subprogram ] [line 1] [A]
-!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x21\000\00-1"} ; [ DW_TAG_subrange_type ] [unbound]
+!14 = metadata !{metadata !"0x2e\00A\00A\00\001\000\000\000\006\00320\000\001", metadata !6, metadata !7, metadata !15, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 1] [A]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17}
-!17 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
 !18 = metadata !{metadata !19}
-!19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
 !20 = metadata !{metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/ending-run.ll b/test/DebugInfo/X86/ending-run.ll
index 165074e..0fcfdf1 100644
--- a/test/DebugInfo/X86/ending-run.ll
+++ b/test/DebugInfo/X86/ending-run.ll

@@ -13,8 +13,8 @@
   %x.addr = alloca i32, align 4
   %y = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !12), !dbg !13
-  call void @llvm.dbg.declare(metadata !{i32* %y}, metadata !14), !dbg !16
+  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !12, metadata !{metadata !"0x102"}), !dbg !13
+  call void @llvm.dbg.declare(metadata !{i32* %y}, metadata !14, metadata !{metadata !"0x102"}), !dbg !16
   %0 = load i32* %x.addr, align 4, !dbg !17
   %1 = load i32* %x.addr, align 4, !dbg !17
   %mul = mul nsw i32 %0, %1, !dbg !17
@@ -24,25 +24,25 @@
   ret i32 %sub, !dbg !18
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.1 (trunk 153921) (llvm/trunk 153916)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 153921) (llvm/trunk 153916)\000\00\000\00\000", metadata !19, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !19, metadata !6, metadata !"callee", metadata !"callee", metadata !"", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 (i32)* @callee, null, null, null, i32 7} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00callee\00callee\00\004\000\001\000\006\000\000\007", metadata !19, metadata !6, metadata !7, null, i32 (i32)* @callee, null, null, null} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !19} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!12 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !6, i32 16777221, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!12 = metadata !{metadata !"0x101\00x\0016777221\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
 !13 = metadata !{i32 5, i32 5, metadata !5, null}
-!14 = metadata !{i32 786688, metadata !15, metadata !"y", metadata !6, i32 8, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !19, metadata !5, i32 7, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0x100\00y\008\000", metadata !15, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0xb\007\001\000", metadata !19, metadata !5} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{i32 8, i32 9, metadata !15, null}
 !17 = metadata !{i32 8, i32 18, metadata !15, null}
 !18 = metadata !{i32 9, i32 5, metadata !15, null}
 !19 = metadata !{metadata !"ending-run.c", metadata !"/Users/echristo/tmp"}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/enum-class.ll b/test/DebugInfo/X86/enum-class.ll
index 23ffbcc..7520d08 100644
--- a/test/DebugInfo/X86/enum-class.ll
+++ b/test/DebugInfo/X86/enum-class.ll

@@ -8,25 +8,25 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23}
 
-!0 = metadata !{i32 786449, metadata !22, i32 4, metadata !"clang version 3.2 (trunk 157269) (llvm/trunk 157264)", i1 false, metadata !"", i32 0, metadata !1, metadata !15, metadata !15, metadata !17,  metadata !15, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 157269) (llvm/trunk 157264)\000\00\000\00\000", metadata !22, metadata !1, metadata !15, metadata !15, metadata !17,  metadata !15} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !3, metadata !8, metadata !12}
-!3 = metadata !{i32 786436, metadata !4, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, metadata !5, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from int]
-!4 = metadata !{i32 786473, metadata !22} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!3 = metadata !{metadata !"0x4\00A\001\0032\0032\000\000\000", metadata !4, null, metadata !5, metadata !6, null, null, null} ; [ DW_TAG_enumeration_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from int]
+!4 = metadata !{metadata !"0x29", metadata !22} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786472, metadata !"A1", i64 1} ; [ DW_TAG_enumerator ]
-!8 = metadata !{i32 786436, metadata !4, null, metadata !"B", i32 2, i64 64, i64 64, i32 0, i32 0, metadata !9, metadata !10, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [B] [line 2, size 64, align 64, offset 0] [def] [from long unsigned int]
-!9 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x28\00A1\001"} ; [ DW_TAG_enumerator ]
+!8 = metadata !{metadata !"0x4\00B\002\0064\0064\000\000\000", metadata !4, null, metadata !9, metadata !10, null, null, null} ; [ DW_TAG_enumeration_type ] [B] [line 2, size 64, align 64, offset 0] [def] [from long unsigned int]
+!9 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ]
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786472, metadata !"B1", i64 1} ; [ DW_TAG_enumerator ]
-!12 = metadata !{i32 786436, metadata !4, null, metadata !"C", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [C] [line 3, size 32, align 32, offset 0] [def] [from ]
+!11 = metadata !{metadata !"0x28\00B1\001"} ; [ DW_TAG_enumerator ]
+!12 = metadata !{metadata !"0x4\00C\003\0032\0032\000\000\000", metadata !4, null, null, metadata !13, null, null, null} ; [ DW_TAG_enumeration_type ] [C] [line 3, size 32, align 32, offset 0] [def] [from ]
 !13 = metadata !{metadata !14}
-!14 = metadata !{i32 786472, metadata !"C1", i64 1} ; [ DW_TAG_enumerator ]
+!14 = metadata !{metadata !"0x28\00C1\001"} ; [ DW_TAG_enumerator ]
 !15 = metadata !{}
 !17 = metadata !{metadata !19, metadata !20, metadata !21}
-!19 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !4, i32 4, metadata !3, i32 0, i32 1, i32* @a, null} ; [ DW_TAG_variable ]
-!20 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !4, i32 5, metadata !8, i32 0, i32 1, i64* @b, null} ; [ DW_TAG_variable ]
-!21 = metadata !{i32 786484, i32 0, null, metadata !"c", metadata !"c", metadata !"", metadata !4, i32 6, metadata !12, i32 0, i32 1, i32* @c, null} ; [ DW_TAG_variable ]
+!19 = metadata !{metadata !"0x34\00a\00a\00\004\000\001", null, metadata !4, metadata !3, i32* @a, null} ; [ DW_TAG_variable ]
+!20 = metadata !{metadata !"0x34\00b\00b\00\005\000\001", null, metadata !4, metadata !8, i64* @b, null} ; [ DW_TAG_variable ]
+!21 = metadata !{metadata !"0x34\00c\00c\00\006\000\001", null, metadata !4, metadata !12, i32* @c, null} ; [ DW_TAG_variable ]
 !22 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
 
 ; CHECK: DW_TAG_enumeration_type [{{.*}}]
@@ -42,4 +42,4 @@
 ; CHECK: DW_TAG_enumeration_type [6]
 ; CHECK-NOT: DW_AT_enum_class
 ; CHECK: DW_AT_name [DW_FORM_strp]      ( .debug_str[{{.*}}] = "C")
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/enum-fwd-decl.ll b/test/DebugInfo/X86/enum-fwd-decl.ll
index adb962e..91472f2 100644
--- a/test/DebugInfo/X86/enum-fwd-decl.ll
+++ b/test/DebugInfo/X86/enum-fwd-decl.ll

@@ -6,16 +6,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9}
 
-!0 = metadata !{i32 786449, metadata !8, i32 4, metadata !"clang version 3.2 (trunk 165274) (llvm/trunk 165272)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/foo.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 165274) (llvm/trunk 165272)\000\00\000\00\000", metadata !8, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/tmp/foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !6, i32 2, metadata !7, i32 0, i32 1, i16* @e, null} ; [ DW_TAG_variable ] [e] [line 2] [def]
-!6 = metadata !{i32 786473, metadata !8} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786436, metadata !8, null, metadata !"E", i32 1, i64 16, i64 16, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [decl] [from ]
+!5 = metadata !{metadata !"0x34\00e\00e\00\002\000\001", null, metadata !6, metadata !7, i16* @e, null} ; [ DW_TAG_variable ] [e] [line 2] [def]
+!6 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x4\00E\001\0016\0016\000\004\000", metadata !8, null, null, null, null, null, null} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [decl] [from ]
 !8 = metadata !{metadata !"foo.cpp", metadata !"/tmp"}
 
 ; CHECK: DW_TAG_enumeration_type
 ; CHECK-NEXT: DW_AT_name
 ; CHECK-NEXT: DW_AT_byte_size
 ; CHECK-NEXT: DW_AT_declaration
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/fission-cu.ll b/test/DebugInfo/X86/fission-cu.ll
index 7f17698..58692b9 100644
--- a/test/DebugInfo/X86/fission-cu.ll
+++ b/test/DebugInfo/X86/fission-cu.ll

@@ -8,12 +8,12 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9}
 
-!0 = metadata !{i32 786449, metadata !8, i32 12, metadata !"clang version 3.3 (trunk 169021) (llvm/trunk 169020)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !"baz.dwo"} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 169021) (llvm/trunk 169020)\000\00\000\00baz.dwo\000", metadata !8, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.c] [DW_LANG_C99]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, i32* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!6 = metadata !{i32 786473, metadata !8} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!5 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !6, metadata !7, i32* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!6 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !8 = metadata !{metadata !"baz.c", metadata !"/usr/local/google/home/echristo/tmp"}
 
 ; Check that the skeleton compile unit contains the proper attributes:
@@ -65,7 +65,7 @@
 ; CHECK: .debug_info.dwo contents:
 ; CHECK: DW_TAG_compile_unit
 ; CHECK: DW_AT_producer [DW_FORM_GNU_str_index] ( indexed (00000000) string = "clang version 3.3 (trunk 169021) (llvm/trunk 169020)")
-; CHECK: DW_AT_language [DW_FORM_data2]        (0x000c)
+; CHECK: DW_AT_language [DW_FORM_data2]        (DW_LANG_C99)
 ; CHECK: DW_AT_name [DW_FORM_GNU_str_index]    ( indexed (00000001) string = "baz.c")
 ; CHECK-NOT: DW_AT_low_pc
 ; CHECK-NOT: DW_AT_stmt_list
@@ -76,7 +76,7 @@
 ; CHECK: DW_AT_type [DW_FORM_ref4]       (cu + 0x{{[0-9a-f]*}} => {[[TYPE:0x[0-9a-f]*]]})
 ; CHECK: DW_AT_external [DW_FORM_flag_present]   (true)
 ; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01)
-; CHECK: DW_AT_decl_line [DW_FORM_data1] (0x01)
+; CHECK: DW_AT_decl_line [DW_FORM_data1] (1)
 ; CHECK: DW_AT_location [DW_FORM_exprloc] (<0x2> fb 00 )
 ; CHECK: [[TYPE]]: DW_TAG_base_type
 ; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000003) string = "int")
@@ -111,4 +111,4 @@
 ; HDR-NOT: .debug_aranges
 ; HDR-NOT: .rela.{{.*}}.dwo
 
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/fission-hash.ll b/test/DebugInfo/X86/fission-hash.ll
index 3987faa..9831063 100644
--- a/test/DebugInfo/X86/fission-hash.ll
+++ b/test/DebugInfo/X86/fission-hash.ll

@@ -9,8 +9,8 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 188230) (llvm/trunk 188234)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"foo.dwo"} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 188230) (llvm/trunk 188234)\000\00\000\00foo.dwo\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/fission-inline.ll b/test/DebugInfo/X86/fission-inline.ll
new file mode 100644
index 0000000..29c770d
--- /dev/null
+++ b/test/DebugInfo/X86/fission-inline.ll

@@ -0,0 +1,119 @@
+; RUN: llc -split-dwarf=Enable -O0 < %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; Test the emission of gmlt-like inlining information into the skeleton unit.
+; This allows inline-aware symbolication/backtracing given only the linked
+; executable, without needing access to the .dwos.
+
+; A simple example of inlining generated with clang -gsplit-dwarf
+
+; A member function is used to force emission of the declaration of the
+; function into the .dwo file, which may be shared with other CUs in the dwo
+; under fission, but should not be shared with the skeleton's CU. This also
+; tests the general case of context emission, which is suppressed in gmlt-like
+; data.
+
+; Include a template just to test template parameters are not emitted in
+; gmlt-like data.
+
+; And some varargs to make sure DW_TAG_unspecified_parameters is not emitted.
+
+; And a using declaration in a nested lexical_block... because that shouldn't
+; be emitted either.
+
+; Minor complication: after generating the LLVM IR, it was manually edited so
+; that the 'f1()' call from f3 was reordered to appear between the two inlined
+; f1 calls from f2. This causes f2's inlined_subroutine to use DW_AT_ranges,
+; thus exercising range list generation/referencing which was buggy.
+
+; struct foo {
+;   template<typename T>
+;   static void f2();
+;   static void f3(...);
+; };
+;
+; void f1();
+;
+; template<typename T>
+; inline __attribute__((always_inline)) void foo::f2() {
+;   f1();
+;   f1();
+; }
+;
+; void foo::f3(...) {
+;   if (true) {
+;     f1();
+;     f2<int>();
+;     using ::foo;
+;   }
+; }
+
+; Check that we emit the usual gmlt-like data for this file, including brief
+; descriptions of subprograms with inlined scopes.
+
+; FIXME: Once tools support indexed addresses in the skeleton CU, we should use
+; those (DW_FORM_addr would become DW_FORM_GNU_addr_index below) since those
+; addresses will already be in the address pool anyway.
+
+; CHECK:      DW_TAG_subprogram
+; CHECK-NEXT:   DW_AT_name {{.*}} "f2<int>"
+; CHECK-NOT: DW_
+; CHECK:      DW_TAG_subprogram
+; CHECK-NEXT:   DW_AT_low_pc [DW_FORM_addr]
+; CHECK-NEXT:   DW_AT_high_pc
+; CHECK-NEXT:   DW_AT_name {{.*}} "f3"
+; CHECK-NOT: {{DW_|NULL}}
+; CHECK:        DW_TAG_inlined_subroutine
+; CHECK-NEXT:     DW_AT_abstract_origin {{.*}} "f2<int>"
+; CHECK-NEXT:     DW_AT_ranges
+; CHECK-NEXT:     DW_AT_call_file
+; CHECK-NEXT:     DW_AT_call_line {{.*}} (18)
+; CHECK-NOT: DW_
+
+; Function Attrs: uwtable
+define void @_ZN3foo2f3Ez(...) #0 align 2 {
+entry:
+  call void @_Z2f1v(), !dbg !26
+  call void @_Z2f1v(), !dbg !25
+  call void @_Z2f1v(), !dbg !28
+  ret void, !dbg !29
+}
+
+declare void @_Z2f1v() #1
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22, !23}
+!llvm.ident = !{!24}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00fission-inline.dwo\001", metadata !1, metadata !2, metadata !3, metadata !9, metadata !2, metadata !18} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/fission-inline.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"fission-inline.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !6}
+!6 = metadata !{metadata !"0x2e\00f3\00f3\00_ZN3foo2f3Ez\004\000\000\000\000\00256\000\004", metadata !1, metadata !"_ZTS3foo", metadata !7, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [f3]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, null}
+!9 = metadata !{metadata !10, metadata !11}
+!10 = metadata !{metadata !"0x2e\00f3\00f3\00_ZN3foo2f3Ez\0015\000\001\000\000\00256\000\0015", metadata !1, metadata !"_ZTS3foo", metadata !7, null, void (...)* @_ZN3foo2f3Ez, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 15] [def] [f3]
+!11 = metadata !{metadata !"0x2e\00f2<int>\00f2<int>\00_ZN3foo2f2IiEEvv\0010\000\001\000\000\00256\000\0010", metadata !1, metadata !"_ZTS3foo", metadata !12, null, null, metadata !14, metadata !17, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [f2<int>]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{null}
+!14 = metadata !{metadata !15}
+!15 = metadata !{metadata !"0x2f\00T\000\000", null, metadata !16, null} ; [ DW_TAG_template_type_parameter ]
+!16 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!17 = metadata !{metadata !"0x2e\00f2<int>\00f2<int>\00_ZN3foo2f2IiEEvv\0010\000\000\000\000\00256\000\0010", metadata !1, metadata !"_ZTS3foo", metadata !12, null, null, metadata !14, null, null} ; [ DW_TAG_subprogram ] [line 10] [f2<int>]
+!18 = metadata !{metadata !19}
+!19 = metadata !{metadata !"0x8\0019\00", metadata !20, metadata !"_ZTS3foo"} ; [ DW_TAG_imported_declaration ]
+!20 = metadata !{metadata !"0xb\0016\0013\001", metadata !1, metadata !21} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/fission-inline.cpp]
+!21 = metadata !{metadata !"0xb\0016\007\000", metadata !1, metadata !10} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/fission-inline.cpp]
+!22 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!23 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!24 = metadata !{metadata !"clang version 3.6.0 "}
+!25 = metadata !{i32 17, i32 5, metadata !20, null}
+!26 = metadata !{i32 11, i32 3, metadata !11, metadata !27}
+!27 = metadata !{i32 18, i32 5, metadata !20, null}
+!28 = metadata !{i32 12, i32 3, metadata !11, metadata !27}
+!29 = metadata !{i32 21, i32 1, metadata !10, null}

diff --git a/test/DebugInfo/X86/fission-ranges.ll b/test/DebugInfo/X86/fission-ranges.ll
index 1358375..f66382a 100644
--- a/test/DebugInfo/X86/fission-ranges.ll
+++ b/test/DebugInfo/X86/fission-ranges.ll

@@ -16,7 +16,7 @@
 ; CHECK: DW_AT_location [DW_FORM_sec_offset]   ([[E:0x[0-9a-z]*]])
 ; CHECK: DW_AT_location [DW_FORM_sec_offset]   ([[B:0x[0-9a-z]*]])
 ; CHECK: DW_AT_location [DW_FORM_sec_offset]   ([[D:0x[0-9a-z]*]])
-; CHECK: DW_AT_ranges [DW_FORM_sec_offset]   (0x000000a0)
+; CHECK: DW_AT_ranges [DW_FORM_sec_offset]   (0x00000000
 ; CHECK: .debug_loc contents:
 ; CHECK-NOT: Beginning address offset
 ; CHECK: .debug_loc.dwo contents:
@@ -25,7 +25,7 @@
 ; if they've changed due to a bugfix, change in register allocation, etc.
 
 ; CHECK: [[A]]: Beginning address index: 2
-; CHECK-NEXT:                    Length: 199
+; CHECK-NEXT:                    Length: 190
 ; CHECK-NEXT:      Location description: 11 00
 ; CHECK-NEXT: {{^$}}
 ; CHECK-NEXT:   Beginning address index: 3
@@ -91,8 +91,8 @@
 ; Function Attrs: nounwind uwtable
 define internal fastcc void @foo() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !29, i64 0, metadata !13), !dbg !30
-  tail call void @llvm.dbg.value(metadata !44, i64 0, metadata !14), !dbg !31
+  tail call void @llvm.dbg.value(metadata !29, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata !44, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !31
   %c.promoted9 = load i32* @c, align 4, !dbg !32, !tbaa !33
   br label %for.cond1.preheader, !dbg !31
 
@@ -114,28 +114,28 @@
 for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
   %and2 = phi i32 [ %and.lcssa5, %for.cond7.preheader ], [ %and, %for.body9 ], !dbg !40
   %e.01 = phi i32 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]
-  tail call void @llvm.dbg.value(metadata !41, i64 0, metadata !19), !dbg !40
+  tail call void @llvm.dbg.value(metadata !41, i64 0, metadata !19, metadata !{metadata !"0x102"}), !dbg !40
   %and = and i32 %and2, 1, !dbg !32
   %inc = add i32 %e.01, 1, !dbg !39
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !18), !dbg !39
+  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !39
   %exitcond = icmp eq i32 %inc, 30, !dbg !39
   br i1 %exitcond, label %for.inc10, label %for.body9, !dbg !39
 
 for.inc10:                                        ; preds = %for.body9
   %inc11 = add nsw i32 %b.03, 1, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i32 %inc11}, i64 0, metadata !15), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i32 %inc11}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !38
   %exitcond11 = icmp eq i32 %inc11, 30, !dbg !38
   br i1 %exitcond11, label %for.inc13, label %for.cond7.preheader, !dbg !38
 
 for.inc13:                                        ; preds = %for.inc10
   %inc14 = add i32 %d.06, 1, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{i32 %inc14}, i64 0, metadata !16), !dbg !37
+  tail call void @llvm.dbg.value(metadata !{i32 %inc14}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !37
   %exitcond12 = icmp eq i32 %inc14, 30, !dbg !37
   br i1 %exitcond12, label %for.inc16, label %for.cond4.preheader, !dbg !37
 
 for.inc16:                                        ; preds = %for.inc13
   %inc17 = add nsw i32 %a.08, 1, !dbg !31
-  tail call void @llvm.dbg.value(metadata !{i32 %inc17}, i64 0, metadata !14), !dbg !31
+  tail call void @llvm.dbg.value(metadata !{i32 %inc17}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !31
   %exitcond13 = icmp eq i32 %inc17, 30, !dbg !31
   br i1 %exitcond13, label %for.end18, label %for.cond1.preheader, !dbg !31
 
@@ -145,7 +145,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -153,32 +153,32 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!26, !43}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 191700) (llvm/trunk 191710)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"small.dwo"} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/small.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 191700) (llvm/trunk 191710)\001\00\000\00small.dwo\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/small.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"small.c", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 18, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, void ()* @bar, null, null, metadata !2, i32 19} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 19] [bar]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/small.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00bar\00bar\00\0018\000\001\000\006\000\001\0019", metadata !1, metadata !5, metadata !6, null, void ()* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 19] [bar]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/small.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
-!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !9, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @foo, null, null, metadata !12, i32 3} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [scope 3] [foo]
-!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x2e\00foo\00foo\00\002\001\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !9, null, void ()* @foo, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [scope 3] [foo]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{null, metadata !11}
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16, metadata !18, metadata !19}
-!13 = metadata !{i32 786689, metadata !8, metadata !"p", metadata !5, i32 16777218, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 2]
-!14 = metadata !{i32 786688, metadata !8, metadata !"a", metadata !5, i32 4, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 4]
-!15 = metadata !{i32 786688, metadata !8, metadata !"b", metadata !5, i32 4, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 4]
-!16 = metadata !{i32 786688, metadata !8, metadata !"d", metadata !5, i32 5, metadata !17, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 5]
-!17 = metadata !{i32 786468, null, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!18 = metadata !{i32 786688, metadata !8, metadata !"e", metadata !5, i32 5, metadata !17, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [e] [line 5]
-!19 = metadata !{i32 786688, metadata !20, metadata !"w", metadata !5, i32 12, metadata !25, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [w] [line 12]
-!20 = metadata !{i32 786443, metadata !1, metadata !21, i32 11, i32 0, i32 4} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!21 = metadata !{i32 786443, metadata !1, metadata !22, i32 10, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!22 = metadata !{i32 786443, metadata !1, metadata !23, i32 9, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!23 = metadata !{i32 786443, metadata !1, metadata !24, i32 8, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!24 = metadata !{i32 786443, metadata !1, metadata !8, i32 7, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
-!25 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!13 = metadata !{metadata !"0x101\00p\0016777218\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [p] [line 2]
+!14 = metadata !{metadata !"0x100\00a\004\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [a] [line 4]
+!15 = metadata !{metadata !"0x100\00b\004\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [b] [line 4]
+!16 = metadata !{metadata !"0x100\00d\005\000", metadata !8, metadata !5, metadata !17} ; [ DW_TAG_auto_variable ] [d] [line 5]
+!17 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!18 = metadata !{metadata !"0x100\00e\005\000", metadata !8, metadata !5, metadata !17} ; [ DW_TAG_auto_variable ] [e] [line 5]
+!19 = metadata !{metadata !"0x100\00w\0012\000", metadata !20, metadata !5, metadata !25} ; [ DW_TAG_auto_variable ] [w] [line 12]
+!20 = metadata !{metadata !"0xb\0011\000\004", metadata !1, metadata !21} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!21 = metadata !{metadata !"0xb\0010\000\003", metadata !1, metadata !22} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!22 = metadata !{metadata !"0xb\009\000\002", metadata !1, metadata !23} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!23 = metadata !{metadata !"0xb\008\000\001", metadata !1, metadata !24} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!24 = metadata !{metadata !"0xb\007\000\000", metadata !1, metadata !8} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
 !26 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !27 = metadata !{i32 20, i32 0, metadata !4, null}
 !28 = metadata !{i32 21, i32 0, metadata !4, null}
@@ -190,11 +190,11 @@
 !34 = metadata !{metadata !"int", metadata !35, i64 0}
 !35 = metadata !{metadata !"omnipotent char", metadata !36, i64 0}
 !36 = metadata !{metadata !"Simple C/C++ TBAA"}
-!37 = metadata !{i32 8, i32 0, metadata !23, null} ; [ DW_TAG_imported_declaration ]
+!37 = metadata !{i32 8, i32 0, metadata !23, null}
 !38 = metadata !{i32 9, i32 0, metadata !22, null}
 !39 = metadata !{i32 10, i32 0, metadata !21, null}
 !40 = metadata !{i32 12, i32 0, metadata !20, null}
 !41 = metadata !{i32* @c}
 !42 = metadata !{i32 15, i32 0, metadata !8, null}
-!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !44 = metadata !{i32 0}

diff --git a/test/DebugInfo/X86/formal_parameter.ll b/test/DebugInfo/X86/formal_parameter.ll
index 2fdab7a..56891ec 100644
--- a/test/DebugInfo/X86/formal_parameter.ll
+++ b/test/DebugInfo/X86/formal_parameter.ll

@@ -28,7 +28,7 @@
 entry:
   %map.addr = alloca i32, align 4
   store i32 %map, i32* %map.addr, align 4, !tbaa !15
-  call void @llvm.dbg.declare(metadata !{i32* %map.addr}, metadata !10), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i32* %map.addr}, metadata !10, metadata !{metadata !"0x102"}), !dbg !14
   %call = call i32 (i32*, ...)* bitcast (i32 (...)* @lookup to i32 (i32*, ...)*)(i32* %map.addr) #3, !dbg !19
   ; Ensure that all dbg intrinsics have the same scope after
   ; LowerDbgDeclare is finished with them.
@@ -42,14 +42,14 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare i32 @lookup(...)
 
 declare i32 @verify(...)
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind ssp uwtable }
 attributes #1 = { nounwind readnone }
@@ -59,19 +59,19 @@
 !llvm.module.flags = !{!11, !12}
 !llvm.ident = !{!13}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [formal_parameter.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [formal_parameter.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"formal_parameter.c", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32)* @foo, null, null, metadata !9, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [formal_parameter.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [formal_parameter.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786689, metadata !4, metadata !"map", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [map] [line 1]
+!10 = metadata !{metadata !"0x101\00map\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [map] [line 1]
 !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !13 = metadata !{metadata !"clang version 3.5.0 "}
 !14 = metadata !{i32 1, i32 0, metadata !4, null}
 !15 = metadata !{metadata !16, metadata !16, i64 0}
@@ -80,5 +80,5 @@
 !18 = metadata !{metadata !"Simple C/C++ TBAA"}
 !19 = metadata !{i32 3, i32 0, metadata !4, null}
 !20 = metadata !{i32 4, i32 0, metadata !21, null}
-!21 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [formal_parameter.c]
+!21 = metadata !{metadata !"0xb\004\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [formal_parameter.c]
 !22 = metadata !{i32 5, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/generate-odr-hash.ll b/test/DebugInfo/X86/generate-odr-hash.ll
index 2256b3e..e7a37ea 100644
--- a/test/DebugInfo/X86/generate-odr-hash.ll
+++ b/test/DebugInfo/X86/generate-odr-hash.ll

@@ -1,10 +1,12 @@
 ; REQUIRES: object-emission
 
-; RUN: llc %s -o %t -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu
+; RUN: llc < %s -o %t -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu
 ; RUN: llvm-dwarfdump %t | FileCheck --check-prefix=CHECK --check-prefix=SINGLE %s
+; RUN: llvm-readobj -s -t %t | FileCheck --check-prefix=OBJ_SINGLE %s
 
-; RUN: llc %s -split-dwarf=Enable -o %t -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu
+; RUN: llc < %s -split-dwarf=Enable -o %t -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu
 ; RUN: llvm-dwarfdump %t | FileCheck --check-prefix=CHECK --check-prefix=FISSION %s
+; RUN: llvm-readobj -s -t %t | FileCheck --check-prefix=OBJ_FISSION %s
 
 ; Generated from bar.cpp:
 
@@ -72,27 +74,16 @@
 ; CHECK-NEXT: DW_AT_declaration
 ; CHECK-NEXT: DW_AT_signature {{.*}} (0xfd756cee88f8a118)
 
-; FISSION-LABEL: .debug_types contents:
-; FISSION-NOT: type_signature
-; FISSION-LABEL: type_signature = 0x1d02f3be30cc5688
-; FISSION: DW_TAG_type_unit
-; FISSION-NEXT: DW_AT_GNU_dwo_name{{.*}}"bar.dwo"
-; FISSION-NEXT: DW_AT_comp_dir{{.*}}"/tmp/dbginfo"
-; FISSION-NOT: type_signature
-; FISSION-LABEL: type_signature = 0xb04af47397402e77
-; FISSION-NOT: type_signature
-; FISSION-LABEL: type_signature = 0xfd756cee88f8a118
-; FISSION-NOT: type_signature
-; FISSION-LABEL: type_signature = 0xe94f6d3843e62d6b
-
 ; SINGLE-LABEL: .debug_types contents:
+; FISSION-NOT: .debug_types contents:
 ; FISSION-LABEL: .debug_types.dwo contents:
 
 ; Check that we generate a hash for bar and the value.
 ; CHECK-NOT: type_signature
 ; CHECK-LABEL: type_signature = 0x1d02f3be30cc5688
 ; CHECK: DW_TAG_structure_type
-; CHECK-NEXT: DW_AT_name{{.*}}"bar"
+; FISSION-NEXT: DW_AT_name {{.*}} ( indexed {{.*}} "bar"
+; SINGLE-NEXT: DW_AT_name {{.*}} "bar"
 
 
 ; Check that we generate a hash for fluffy and the value.
@@ -161,6 +152,20 @@
 ; CHECK-DAG: [[WOMBAT]] "wombat"
 ; CHECK-DAG: [[FLUFFY]] "echidna::capybara::mongoose::fluffy"
 
+; Make sure debug_types are in comdat groups. This could be more rigid to check
+; that they're the right comdat groups (each type in a separate comdat group,
+; etc)
+; OBJ_SINGLE: Name: .debug_types (
+; OBJ_SINGLE-NOT: }
+; OBJ_SINGLE: SHF_GROUP
+
+; Fission type units don't go in comdat groups: since their linker is debug
+; aware, they're handled using the debug info semantics rather than raw ELF
+; object semantics.
+; OBJ_FISSION: Name: .debug_types.dwo (
+; OBJ_FISSION-NOT: SHF_GROUP
+; OBJ_FISSION: }
+
 %struct.bar = type { i8 }
 %"class.echidna::capybara::mongoose::fluffy" = type { i32, i32 }
 %"struct.<anonymous namespace>::walrus" = type { i8 }
@@ -178,12 +183,12 @@
 define void @_Z3foov() #0 {
 entry:
   %b = alloca %struct.baz, align 1
-  call void @llvm.dbg.declare(metadata !{%struct.baz* %b}, metadata !46), !dbg !48
+  call void @llvm.dbg.declare(metadata !{%struct.baz* %b}, metadata !46, metadata !{metadata !"0x102"}), !dbg !48
   ret void, !dbg !49
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 define internal void @__cxx_global_var_init() section ".text.startup" {
 entry:
@@ -196,7 +201,7 @@
 entry:
   %this.addr = alloca %"struct.<anonymous namespace>::walrus"*, align 8
   store %"struct.<anonymous namespace>::walrus"* %this, %"struct.<anonymous namespace>::walrus"** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%"struct.<anonymous namespace>::walrus"** %this.addr}, metadata !51), !dbg !53
+  call void @llvm.dbg.declare(metadata !{%"struct.<anonymous namespace>::walrus"** %this.addr}, metadata !51, metadata !{metadata !"0x102"}), !dbg !53
   %this1 = load %"struct.<anonymous namespace>::walrus"** %this.addr
   ret void, !dbg !54
 }
@@ -214,59 +219,59 @@
 !llvm.module.flags = !{!43, !44}
 !llvm.ident = !{!45}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !21, metadata !38, metadata !2, metadata !"bar.dwo"} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/bar.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00bar.dwo\000", metadata !1, metadata !2, metadata !3, metadata !21, metadata !38, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/bar.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"bar.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !6, metadata !14, metadata !17}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"bar", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 1, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00bar\001\008\008\000\000\000", metadata !5, null, null, metadata !2, null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 1, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !"bar.h", metadata !"/tmp/dbginfo"}
-!6 = metadata !{i32 786434, metadata !1, metadata !7, metadata !"fluffy", i32 13, i64 64, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE"} ; [ DW_TAG_class_type ] [fluffy] [line 13, size 64, align 32, offset 0] [def] [from ]
-!7 = metadata !{i32 786489, metadata !1, metadata !8, metadata !"mongoose", i32 12} ; [ DW_TAG_namespace ] [mongoose] [line 12]
-!8 = metadata !{i32 786489, metadata !1, metadata !9, metadata !"capybara", i32 11} ; [ DW_TAG_namespace ] [capybara] [line 11]
-!9 = metadata !{i32 786489, metadata !1, null, metadata !"echidna", i32 10} ; [ DW_TAG_namespace ] [echidna] [line 10]
+!6 = metadata !{metadata !"0x2\00fluffy\0013\0064\0032\000\000\000", metadata !1, metadata !7, null, metadata !10, null, null, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE"} ; [ DW_TAG_class_type ] [fluffy] [line 13, size 64, align 32, offset 0] [def] [from ]
+!7 = metadata !{metadata !"0x39\00mongoose\0012", metadata !1, metadata !8} ; [ DW_TAG_namespace ] [mongoose] [line 12]
+!8 = metadata !{metadata !"0x39\00capybara\0011", metadata !1, metadata !9} ; [ DW_TAG_namespace ] [capybara] [line 11]
+!9 = metadata !{metadata !"0x39\00echidna\0010", metadata !1, null} ; [ DW_TAG_namespace ] [echidna] [line 10]
 !10 = metadata !{metadata !11, metadata !13}
-!11 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE", metadata !"a", i32 14, i64 32, i64 32, i64 0, i32 1, metadata !12} ; [ DW_TAG_member ] [a] [line 14, size 32, align 32, offset 0] [private] [from int]
-!12 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE", metadata !"b", i32 15, i64 32, i64 32, i64 32, i32 1, metadata !12} ; [ DW_TAG_member ] [b] [line 15, size 32, align 32, offset 32] [private] [from int]
-!14 = metadata !{i32 786451, metadata !1, null, metadata !"wombat", i32 31, i64 64, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null, metadata !"_ZTS6wombat"} ; [ DW_TAG_structure_type ] [wombat] [line 31, size 64, align 32, offset 0] [def] [from ]
+!11 = metadata !{metadata !"0xd\00a\0014\0032\0032\000\001", metadata !1, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE", metadata !12} ; [ DW_TAG_member ] [a] [line 14, size 32, align 32, offset 0] [private] [from int]
+!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{metadata !"0xd\00b\0015\0032\0032\0032\001", metadata !1, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE", metadata !12} ; [ DW_TAG_member ] [b] [line 15, size 32, align 32, offset 32] [private] [from int]
+!14 = metadata !{metadata !"0x13\00wombat\0031\0064\0032\000\000\000", metadata !1, null, null, metadata !15, null, null, metadata !"_ZTS6wombat"} ; [ DW_TAG_structure_type ] [wombat] [line 31, size 64, align 32, offset 0] [def] [from ]
 !15 = metadata !{metadata !16}
-!16 = metadata !{i32 786445, metadata !1, metadata !"_ZTS6wombat", metadata !"a_b", i32 35, i64 64, i64 32, i64 0, i32 0, metadata !"_ZTSN6wombatUt_E"} ; [ DW_TAG_member ] [a_b] [line 35, size 64, align 32, offset 0] [from _ZTSN6wombatUt_E]
-!17 = metadata !{i32 786451, metadata !1, metadata !"_ZTS6wombat", metadata !"", i32 32, i64 64, i64 32, i32 0, i32 0, null, metadata !18, i32 0, null, null, metadata !"_ZTSN6wombatUt_E"} ; [ DW_TAG_structure_type ] [line 32, size 64, align 32, offset 0] [def] [from ]
+!16 = metadata !{metadata !"0xd\00a_b\0035\0064\0032\000\000", metadata !1, metadata !"_ZTS6wombat", metadata !"_ZTSN6wombatUt_E"} ; [ DW_TAG_member ] [a_b] [line 35, size 64, align 32, offset 0] [from _ZTSN6wombatUt_E]
+!17 = metadata !{metadata !"0x13\00\0032\0064\0032\000\000\000", metadata !1, metadata !"_ZTS6wombat", null, metadata !18, null, null, metadata !"_ZTSN6wombatUt_E"} ; [ DW_TAG_structure_type ] [line 32, size 64, align 32, offset 0] [def] [from ]
 !18 = metadata !{metadata !19, metadata !20}
-!19 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN6wombatUt_E", metadata !"a", i32 33, i64 32, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ] [a] [line 33, size 32, align 32, offset 0] [from int]
-!20 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN6wombatUt_E", metadata !"b", i32 34, i64 32, i64 32, i64 32, i32 0, metadata !12} ; [ DW_TAG_member ] [b] [line 34, size 32, align 32, offset 32] [from int]
+!19 = metadata !{metadata !"0xd\00a\0033\0032\0032\000\000", metadata !1, metadata !"_ZTSN6wombatUt_E", metadata !12} ; [ DW_TAG_member ] [a] [line 33, size 32, align 32, offset 0] [from int]
+!20 = metadata !{metadata !"0xd\00b\0034\0032\0032\0032\000", metadata !1, metadata !"_ZTSN6wombatUt_E", metadata !12} ; [ DW_TAG_member ] [b] [line 34, size 32, align 32, offset 32] [from int]
 !21 = metadata !{metadata !22, metadata !26, metadata !27, metadata !36}
-!22 = metadata !{i32 786478, metadata !1, metadata !23, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 5, metadata !24, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3foov, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
-!23 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/bar.cpp]
-!24 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\005\000\001\000\006\00256\000\005", metadata !1, metadata !23, metadata !24, null, void ()* @_Z3foov, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!23 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/bar.cpp]
+!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{null}
-!26 = metadata !{i32 786478, metadata !1, metadata !23, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 29, metadata !24, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 29} ; [ DW_TAG_subprogram ] [line 29] [local] [def] [__cxx_global_var_init]
-!27 = metadata !{i32 786478, metadata !1, metadata !28, metadata !"walrus", metadata !"walrus", metadata !"_ZN12_GLOBAL__N_16walrusC2Ev", i32 25, metadata !32, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%"struct.<anonymous namespace>::walrus"*)* @_ZN12_GLOBAL__N_16walrusC2Ev, null, metadata !31, metadata !2, i32 25} ; [ DW_TAG_subprogram ] [line 25] [local] [def] [walrus]
-!28 = metadata !{i32 786451, metadata !1, metadata !29, metadata !"walrus", i32 24, i64 8, i64 8, i32 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [walrus] [line 24, size 8, align 8, offset 0] [def] [from ]
-!29 = metadata !{i32 786489, metadata !1, null, metadata !"", i32 23} ; [ DW_TAG_namespace ] [line 23]
+!26 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\0029\001\001\000\006\00256\000\0029", metadata !1, metadata !23, metadata !24, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 29] [local] [def] [__cxx_global_var_init]
+!27 = metadata !{metadata !"0x2e\00walrus\00walrus\00_ZN12_GLOBAL__N_16walrusC2Ev\0025\001\001\000\006\00256\000\0025", metadata !1, metadata !28, metadata !32, null, void (%"struct.<anonymous namespace>::walrus"*)* @_ZN12_GLOBAL__N_16walrusC2Ev, null, metadata !31, metadata !2} ; [ DW_TAG_subprogram ] [line 25] [local] [def] [walrus]
+!28 = metadata !{metadata !"0x13\00walrus\0024\008\008\000\000\000", metadata !1, metadata !29, null, metadata !30, null, null, null} ; [ DW_TAG_structure_type ] [walrus] [line 24, size 8, align 8, offset 0] [def] [from ]
+!29 = metadata !{metadata !"0x39\00\0023", metadata !1, null} ; [ DW_TAG_namespace ] [line 23]
 !30 = metadata !{metadata !31}
-!31 = metadata !{i32 786478, metadata !1, metadata !28, metadata !"walrus", metadata !"walrus", metadata !"", i32 25, metadata !32, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !35, i32 25} ; [ DW_TAG_subprogram ] [line 25] [walrus]
-!32 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !33, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!31 = metadata !{metadata !"0x2e\00walrus\00walrus\00\0025\000\000\000\006\00256\000\0025", metadata !1, metadata !28, metadata !32, null, null, null, i32 0, metadata !35} ; [ DW_TAG_subprogram ] [line 25] [walrus]
+!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !33 = metadata !{null, metadata !34}
-!34 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from walrus]
+!34 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from walrus]
 !35 = metadata !{i32 786468}
-!36 = metadata !{i32 786478, metadata !1, metadata !23, metadata !"", metadata !"", metadata !"_GLOBAL__I_a", i32 25, metadata !37, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__I_a, null, null, metadata !2, i32 25} ; [ DW_TAG_subprogram ] [line 25] [local] [def]
-!37 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!36 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__I_a\0025\001\001\000\006\0064\000\0025", metadata !1, metadata !23, metadata !37, null, void ()* @_GLOBAL__I_a, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 25] [local] [def]
+!37 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !38 = metadata !{metadata !39, metadata !40, metadata !41, metadata !42}
-!39 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !23, i32 3, metadata !4, i32 0, i32 1, %struct.bar* @b, null} ; [ DW_TAG_variable ] [b] [line 3] [def]
-!40 = metadata !{i32 786484, i32 0, metadata !7, metadata !"animal", metadata !"animal", metadata !"_ZN7echidna8capybara8mongoose6animalE", metadata !23, i32 18, metadata !6, i32 0, i32 1, %"class.echidna::capybara::mongoose::fluffy"* @_ZN7echidna8capybara8mongoose6animalE, null} ; [ DW_TAG_variable ] [animal] [line 18] [def]
-!41 = metadata !{i32 786484, i32 0, null, metadata !"w", metadata !"w", metadata !"", metadata !23, i32 29, metadata !28, i32 1, i32 1, %"struct.<anonymous namespace>::walrus"* @w, null} ; [ DW_TAG_variable ] [w] [line 29] [local] [def]
-!42 = metadata !{i32 786484, i32 0, null, metadata !"wom", metadata !"wom", metadata !"", metadata !23, i32 38, metadata !14, i32 0, i32 1, %struct.wombat* @wom, null} ; [ DW_TAG_variable ] [wom] [line 38] [def]
+!39 = metadata !{metadata !"0x34\00b\00b\00\003\000\001", null, metadata !23, metadata !4, %struct.bar* @b, null} ; [ DW_TAG_variable ] [b] [line 3] [def]
+!40 = metadata !{metadata !"0x34\00animal\00animal\00_ZN7echidna8capybara8mongoose6animalE\0018\000\001", metadata !7, metadata !23, metadata !6, %"class.echidna::capybara::mongoose::fluffy"* @_ZN7echidna8capybara8mongoose6animalE, null} ; [ DW_TAG_variable ] [animal] [line 18] [def]
+!41 = metadata !{metadata !"0x34\00w\00w\00\0029\001\001", null, metadata !23, metadata !28, %"struct.<anonymous namespace>::walrus"* @w, null} ; [ DW_TAG_variable ] [w] [line 29] [local] [def]
+!42 = metadata !{metadata !"0x34\00wom\00wom\00\0038\000\001", null, metadata !23, metadata !14, %struct.wombat* @wom, null} ; [ DW_TAG_variable ] [wom] [line 38] [def]
 !43 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!44 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!44 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !45 = metadata !{metadata !"clang version 3.5 "}
-!46 = metadata !{i32 786688, metadata !22, metadata !"b", metadata !23, i32 7, metadata !47, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 7]
-!47 = metadata !{i32 786451, metadata !1, metadata !22, metadata !"baz", i32 6, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [baz] [line 6, size 8, align 8, offset 0] [def] [from ]
+!46 = metadata !{metadata !"0x100\00b\007\000", metadata !22, metadata !23, metadata !47} ; [ DW_TAG_auto_variable ] [b] [line 7]
+!47 = metadata !{metadata !"0x13\00baz\006\008\008\000\000\000", metadata !1, metadata !22, null, metadata !2, null, null, null} ; [ DW_TAG_structure_type ] [baz] [line 6, size 8, align 8, offset 0] [def] [from ]
 !48 = metadata !{i32 7, i32 0, metadata !22, null}
-!49 = metadata !{i32 8, i32 0, metadata !22, null} ; [ DW_TAG_imported_declaration ]
+!49 = metadata !{i32 8, i32 0, metadata !22, null}
 !50 = metadata !{i32 29, i32 0, metadata !26, null}
-!51 = metadata !{i32 786689, metadata !27, metadata !"this", null, i32 16777216, metadata !52, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!52 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from walrus]
+!51 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !27, null, metadata !52} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!52 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from walrus]
 !53 = metadata !{i32 0, i32 0, metadata !27, null}
 !54 = metadata !{i32 25, i32 0, metadata !27, null}
 !55 = metadata !{i32 25, i32 0, metadata !36, null}

diff --git a/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll b/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll
new file mode 100644
index 0000000..c430b3e
--- /dev/null
+++ b/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll

@@ -0,0 +1,105 @@
+; REQUIRES: object-emission
+
+; RUN: llc -mtriple=x86_64-apple-macosx10.10.0 -o %t %s
+
+; Testcase generated from:
+; #include <stdint.h>
+; int foo(int a) {
+;     int b = (int16_t)a + 8;
+;     int c = (int16_t)b + 8;
+;     int d = (int16_t)c + 8;
+;     int e = (int16_t)d + 8;
+;     int f = (int16_t)e + 8;
+;     return f;
+; }
+; by emitting the IR and then manually applying mem2reg to it.
+
+; This testcase would trigger the assert committed along with it if the
+; fix of r221709 isn't applied. There is no other check except the successful
+; run of llc.
+; What happened before r221709, is that SDDbgInfo (the data structure helping
+; SelectionDAG to keep track of dbg.values) kept a map keyed by SDNode pointers.
+; This map was never purged when the SDNodes were deallocated and thus if a new
+; SDNode was allocated in the same memory, it would have an entry in the SDDbgInfo
+; map upon creation (Reallocation in the same memory can happen easily as
+; SelectionDAG uses a Recycling allocator). This behavior could turn into a
+; pathological memory consumption explosion if the DAG combiner hit the 'right'
+; allocation patterns as could be seen in PR20893.
+; By nature, this test could bitrot quite easily. If it doesn't trigger an assert
+; when run with r221709 reverted, then it really doesn't test anything anymore.
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo(i32 %a) #0 {
+entry:
+  call void @llvm.dbg.value(metadata !{i32 %a}, i64 0, metadata !16, metadata !17), !dbg !18
+  %conv = trunc i32 %a to i16, !dbg !19
+  %conv1 = sext i16 %conv to i32, !dbg !19
+  %add = add nsw i32 %conv1, 8, !dbg !19
+  call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !20, metadata !17), !dbg !21
+  %conv2 = trunc i32 %add to i16, !dbg !22
+  %conv3 = sext i16 %conv2 to i32, !dbg !22
+  %add4 = add nsw i32 %conv3, 8, !dbg !22
+  call void @llvm.dbg.value(metadata !{i32 %add4}, i64 0, metadata !23, metadata !17), !dbg !24
+  %conv5 = trunc i32 %add4 to i16, !dbg !25
+  %conv6 = sext i16 %conv5 to i32, !dbg !25
+  %add7 = add nsw i32 %conv6, 8, !dbg !25
+  call void @llvm.dbg.value(metadata !{i32 %add7}, i64 0, metadata !26, metadata !17), !dbg !27
+  %conv8 = trunc i32 %add7 to i16, !dbg !28
+  %conv9 = sext i16 %conv8 to i32, !dbg !28
+  %add10 = add nsw i32 %conv9, 8, !dbg !28
+  call void @llvm.dbg.value(metadata !{i32 %add10}, i64 0, metadata !29, metadata !17), !dbg !30
+  %conv11 = trunc i32 %add10 to i16, !dbg !31
+  %conv12 = sext i16 %conv11 to i32, !dbg !31
+  %add13 = add nsw i32 %conv12, 8, !dbg !31
+  call void @llvm.dbg.value(metadata !{i32 %add13}, i64 0, metadata !32, metadata !17), !dbg !33
+  ret i32 %add13, !dbg !34
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!13, !14}
+!llvm.ident = !{!15}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !7, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/ghost-sdnode-dbgvalues.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"ghost-sdnode-dbgvalues.c", metadata !"/tmp"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x16\00int16_t\0030\000\000\000\000", metadata !5, null, metadata !6} ; [ DW_TAG_typedef ] [int16_t] [line 30, size 0, align 0, offset 0] [from short]
+!5 = metadata !{metadata !"/usr/include/sys/_types/_int16_t.h", metadata !"/tmp"}
+!6 = metadata !{metadata !"0x24\00short\000\0016\0016\000\000\005", null, null} ; [ DW_TAG_base_type ] [short] [line 0, size 16, align 16, offset 0, enc DW_ATE_signed]
+!7 = metadata !{metadata !8}
+!8 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\000\00256\000\003", metadata !1, metadata !9, metadata !10, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!9 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/tmp/ghost-sdnode-dbgvalues.c]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !12, metadata !12}
+!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!14 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!15 = metadata !{metadata !"clang version 3.6.0 "}
+!16 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!17 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
+!18 = metadata !{i32 3, i32 13, metadata !8, null}
+!19 = metadata !{i32 4, i32 5, metadata !8, null}
+!20 = metadata !{metadata !"0x100\00b\004\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [b] [line 4]
+!21 = metadata !{i32 4, i32 9, metadata !8, null}
+!22 = metadata !{i32 5, i32 5, metadata !8, null}
+!23 = metadata !{metadata !"0x100\00c\005\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [c] [line 5]
+!24 = metadata !{i32 5, i32 9, metadata !8, null}
+!25 = metadata !{i32 6, i32 5, metadata !8, null}
+!26 = metadata !{metadata !"0x100\00d\006\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [d] [line 6]
+!27 = metadata !{i32 6, i32 9, metadata !8, null}
+!28 = metadata !{i32 7, i32 5, metadata !8, null}
+!29 = metadata !{metadata !"0x100\00e\007\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [e] [line 7]
+!30 = metadata !{i32 7, i32 9, metadata !8, null}
+!31 = metadata !{i32 8, i32 5, metadata !8, null}
+!32 = metadata !{metadata !"0x100\00f\008\000", metadata !8, metadata !9, metadata !12} ; [ DW_TAG_auto_variable ] [f] [line 8]
+!33 = metadata !{i32 8, i32 9, metadata !8, null}
+!34 = metadata !{i32 9, i32 5, metadata !8, null}

diff --git a/test/DebugInfo/X86/gmlt.test b/test/DebugInfo/X86/gmlt.test
new file mode 100644
index 0000000..6cdd71d
--- /dev/null
+++ b/test/DebugInfo/X86/gmlt.test

@@ -0,0 +1,2 @@
+; RUN: llc -O0 -filetype=obj < %S/../Inputs/gmlt.ll -mtriple x86_64-apple-darwin | llvm-dwarfdump - \
+; RUN:     | FileCheck --check-prefix=CHECK --check-prefix=DARWIN %S/../Inputs/gmlt.ll

diff --git a/test/DebugInfo/X86/gnu-public-names-empty.ll b/test/DebugInfo/X86/gnu-public-names-empty.ll
index 46ae65d..4c97b3f 100644
--- a/test/DebugInfo/X86/gnu-public-names-empty.ll
+++ b/test/DebugInfo/X86/gnu-public-names-empty.ll

@@ -12,8 +12,8 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 191846) (llvm/trunk 191866)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 191846) (llvm/trunk 191866)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/gnu-public-names.ll b/test/DebugInfo/X86/gnu-public-names.ll
index 96fa52b..1696288 100644
--- a/test/DebugInfo/X86/gnu-public-names.ll
+++ b/test/DebugInfo/X86/gnu-public-names.ll

@@ -49,37 +49,43 @@
 ; CHECK: DW_AT_GNU_pubnames [DW_FORM_flag_present]   (true)
 ; CHECK-NOT: DW_AT_GNU_pubtypes [
 
+; CHECK: [[STATIC_MEM_VAR:0x[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_specification {{.*}} "static_member_variable"
+
 ; CHECK: [[C:0x[0-9a-f]+]]: DW_TAG_structure_type
 ; CHECK-NEXT: DW_AT_name {{.*}} "C"
 
-; CHECK: [[STATIC_MEM_DECL:0x[0-9a-f]+]]: DW_TAG_member
+; CHECK: DW_TAG_member
 ; CHECK-NEXT: DW_AT_name {{.*}} "static_member_variable"
 
-; CHECK: [[MEM_FUNC_DECL:0x[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
 ; CHECK-NEXT: DW_AT_MIPS_linkage_name
 ; CHECK-NEXT: DW_AT_name {{.*}} "member_function"
 
-; CHECK: [[STATIC_MEM_FUNC_DECL:0x[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
 ; CHECK-NEXT: DW_AT_MIPS_linkage_name
 ; CHECK-NEXT: DW_AT_name {{.*}} "static_member_function"
 
 ; CHECK: [[INT:0x[0-9a-f]+]]: DW_TAG_base_type
 ; CHECK-NEXT: DW_AT_name {{.*}} "int"
 
-; CHECK: [[STATIC_MEM_VAR:0x[0-9a-f]+]]: DW_TAG_variable
-; CHECK-NEXT: DW_AT_specification {{.*}} {[[STATIC_MEM_DECL]]}
-
 ; CHECK: [[GLOB_VAR:0x[0-9a-f]+]]: DW_TAG_variable
 ; CHECK-NEXT: DW_AT_name {{.*}} "global_variable"
 
 ; CHECK: [[NS:0x[0-9a-f]+]]: DW_TAG_namespace
 ; CHECK-NEXT: DW_AT_name {{.*}} "ns"
 
-; CHECK: [[GLOB_NS_VAR_DECL:0x[0-9a-f]+]]: DW_TAG_variable
+; CHECK: [[GLOB_NS_VAR:0x[0-9a-f]+]]: DW_TAG_variable
 ; CHECK-NEXT: DW_AT_name {{.*}} "global_namespace_variable"
+; CHECK-NOT: DW_AT_specification
+; CHECK: DW_AT_location
+; CHECK-NOT: DW_AT_specification
 
-; CHECK: [[D_VAR_DECL:0x[0-9a-f]+]]: DW_TAG_variable
+; CHECK: [[D_VAR:0x[0-9a-f]+]]: DW_TAG_variable
 ; CHECK-NEXT: DW_AT_name {{.*}} "d"
+; CHECK-NOT: DW_AT_specification
+; CHECK: DW_AT_location
+; CHECK-NOT: DW_AT_specification
 
 ; CHECK: [[D:0x[0-9a-f]+]]: DW_TAG_structure_type
 ; CHECK-NEXT: DW_AT_name {{.*}} "D"
@@ -90,12 +96,6 @@
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_name {{.*}} "global_namespace_function"
 
-; CHECK: [[GLOB_NS_VAR:0x[0-9a-f]+]]: DW_TAG_variable
-; CHECK-NEXT: DW_AT_specification {{.*}} {[[GLOB_NS_VAR_DECL]]}
-
-; CHECK: [[D_VAR:0x[0-9a-f]+]]: DW_TAG_variable
-; CHECK-NEXT: DW_AT_specification {{.*}} {[[D_VAR_DECL]]}
-
 ; CHECK: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK:   DW_AT_name {{.*}} "f3"
@@ -116,7 +116,7 @@
 ; CHECK: [[OUTER_ANON:.*]]:  DW_TAG_namespace
 ; CHECK-NOT: {{DW_TAG|NULL}}
 ; CHECK-NOT:     DW_AT_name
-; CHECK: [[OUTER_ANON_C_DECL:.*]]:     DW_TAG_variable
+; CHECK: [[OUTER_ANON_C:.*]]: DW_TAG_variable
 ; CHECK-NOT: DW_TAG
 ; CHECK:       DW_AT_name {{.*}} "c"
 ; CHECK-NOT: {{DW_TAG|NULL}}
@@ -129,9 +129,6 @@
 ; CHECK-NOT: {{DW_TAG|NULL}}
 ; CHECK:   NULL
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: [[OUTER_ANON_C:.*]]: DW_TAG_variable
-; CHECK-NOT: DW_TAG
-; CHECK-NEXT:   DW_AT_specification {{.*}} {[[OUTER_ANON_C_DECL]]}
 
 ; CHECK: [[ANON:.*]]: DW_TAG_namespace
 ; CHECK-NOT:   DW_AT_name
@@ -139,32 +136,26 @@
 ; CHECK-NOT: DW_TAG
 ; CHECK:     DW_AT_name {{.*}} "inner"
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: [[ANON_INNER_B_DECL:.*]]:     DW_TAG_variable
+; CHECK: [[ANON_INNER_B:.*]]: DW_TAG_variable
 ; CHECK-NOT: DW_TAG
 ; CHECK:       DW_AT_name {{.*}} "b"
 ; CHECK-NOT: {{DW_TAG|NULL}}
 ; CHECK:     NULL
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: [[ANON_I_DECL:.*]]:   DW_TAG_variable
+; CHECK: [[ANON_I:.*]]: DW_TAG_variable
 ; CHECK-NOT: DW_TAG
 ; CHECK:     DW_AT_name {{.*}} "i"
 ; CHECK-NOT: {{DW_TAG|NULL}}
 ; CHECK:   NULL
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: [[ANON_INNER_B:.*]]: DW_TAG_variable
-; CHECK-NOT: DW_TAG
-; CHECK-NEXT:   DW_AT_specification {{.*}} {[[ANON_INNER_B_DECL]]}
-; CHECK: [[ANON_I:.*]]: DW_TAG_variable
-; CHECK-NOT: DW_TAG
-; CHECK-NEXT:   DW_AT_specification {{.*}} {[[ANON_I_DECL]]}
 
 ; CHECK: [[MEM_FUNC:0x[0-9a-f]+]]: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_specification {{.*}} {[[MEM_FUNC_DECL]]}
+; CHECK: DW_AT_specification {{.*}} "_ZN1C15member_functionEv"
 
 ; CHECK: [[STATIC_MEM_FUNC:0x[0-9a-f]+]]: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_specification {{.*}} {[[STATIC_MEM_FUNC_DECL]]}
+; CHECK: DW_AT_specification {{.*}} "_ZN1C22static_member_functionEv"
 
 ; CHECK: [[GLOBAL_FUNC:0x[0-9a-f]+]]: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
@@ -223,14 +214,14 @@
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !50), !dbg !52
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !50, metadata !{metadata !"0x102"}), !dbg !52
   %this1 = load %struct.C** %this.addr
   store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !53
   ret void, !dbg !54
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind uwtable
 define i32 @_ZN1C22static_member_functionEv() #0 align 2 {
@@ -279,58 +270,58 @@
 !llvm.module.flags = !{!47, !48}
 !llvm.ident = !{!49}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !19, metadata !32, metadata !45, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/pubnames.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !19, metadata !32, metadata !45} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/pubnames.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"pubnames.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !15}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00C\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6, metadata !8, metadata !12}
-!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1C", metadata !"static_member_variable", i32 4, i64 0, i64 0, i64 0, i32 4096, metadata !7, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
-!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 2, metadata !9, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 2} ; [ DW_TAG_subprogram ] [line 2] [member_function]
-!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0xd\00static_member_variable\004\000\000\000\004096", metadata !1, metadata !"_ZTS1C", metadata !7, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\002\000\000\000\006\00256\000\002", metadata !1, metadata !"_ZTS1C", metadata !9, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 2] [member_function]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{null, metadata !11}
-!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
-!12 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 3, metadata !13, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 3} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
-!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!12 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\003\000\000\000\006\00256\000\003", metadata !1, metadata !"_ZTS1C", metadata !13, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{metadata !7}
-!15 = metadata !{i32 786451, metadata !1, metadata !16, metadata !"D", i32 28, i64 32, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null, metadata !"_ZTSN2ns1DE"} ; [ DW_TAG_structure_type ] [D] [line 28, size 32, align 32, offset 0] [def] [from ]
-!16 = metadata !{i32 786489, metadata !1, null, metadata !"ns", i32 23} ; [ DW_TAG_namespace ] [ns] [line 23]
+!15 = metadata !{metadata !"0x13\00D\0028\0032\0032\000\000\000", metadata !1, metadata !16, null, metadata !17, null, null, metadata !"_ZTSN2ns1DE"} ; [ DW_TAG_structure_type ] [D] [line 28, size 32, align 32, offset 0] [def] [from ]
+!16 = metadata !{metadata !"0x39\00ns\0023", metadata !1, null} ; [ DW_TAG_namespace ] [ns] [line 23]
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN2ns1DE", metadata !"A", i32 29, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [A] [line 29, size 32, align 32, offset 0] [from int]
+!18 = metadata !{metadata !"0xd\00A\0029\0032\0032\000\000", metadata !1, metadata !"_ZTSN2ns1DE", metadata !7} ; [ DW_TAG_member ] [A] [line 29, size 32, align 32, offset 0] [from int]
 !19 = metadata !{metadata !20, metadata !21, metadata !22, metadata !24, metadata !27, metadata !31}
-!20 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 9, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !8, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
-!21 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 13, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !12, metadata !2, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
-!22 = metadata !{i32 786478, metadata !1, metadata !23, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 19, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !2, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
-!23 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/pubnames.cpp]
-!24 = metadata !{i32 786478, metadata !1, metadata !16, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 24, metadata !25, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !2, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
-!25 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\009\000\001\000\006\00256\000\009", metadata !1, metadata !"_ZTS1C", metadata !9, null, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !8, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
+!21 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\0013\000\001\000\006\00256\000\0013", metadata !1, metadata !"_ZTS1C", metadata !13, null, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !12, metadata !2} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
+!22 = metadata !{metadata !"0x2e\00global_function\00global_function\00_Z15global_functionv\0019\000\001\000\006\00256\000\0019", metadata !1, metadata !23, metadata !13, null, i32 ()* @_Z15global_functionv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
+!23 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/pubnames.cpp]
+!24 = metadata !{metadata !"0x2e\00global_namespace_function\00global_namespace_function\00_ZN2ns25global_namespace_functionEv\0024\000\001\000\006\00256\000\0024", metadata !1, metadata !16, metadata !25, null, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
+!25 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !26 = metadata !{null}
-!27 = metadata !{i32 786478, metadata !1, metadata !23, metadata !"f3", metadata !"f3", metadata !"_Z2f3v", i32 37, metadata !28, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32* ()* @_Z2f3v, null, null, metadata !2, i32 37} ; [ DW_TAG_subprogram ] [line 37] [def] [f3]
-!28 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = metadata !{metadata !"0x2e\00f3\00f3\00_Z2f3v\0037\000\001\000\006\00256\000\0037", metadata !1, metadata !23, metadata !28, null, i32* ()* @_Z2f3v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 37] [def] [f3]
+!28 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !29 = metadata !{metadata !30}
-!30 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!31 = metadata !{i32 786478, metadata !1, metadata !23, metadata !"f7", metadata !"f7", metadata !"_Z2f7v", i32 54, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z2f7v, null, null, metadata !2, i32 54} ; [ DW_TAG_subprogram ] [line 54] [def] [f7]
+!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!31 = metadata !{metadata !"0x2e\00f7\00f7\00_Z2f7v\0054\000\001\000\006\00256\000\0054", metadata !1, metadata !23, metadata !13, null, i32 ()* @_Z2f7v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 54] [def] [f7]
 !32 = metadata !{metadata !33, metadata !34, metadata !35, metadata !36, metadata !37, metadata !38, metadata !41, metadata !44}
-!33 = metadata !{i32 786484, i32 0, metadata !4, metadata !"static_member_variable", metadata !"static_member_variable", metadata !"_ZN1C22static_member_variableE", metadata !23, i32 7, metadata !7, i32 0, i32 1, i32* @_ZN1C22static_member_variableE, metadata !6} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
-!34 = metadata !{i32 786484, i32 0, null, metadata !"global_variable", metadata !"global_variable", metadata !"", metadata !23, i32 17, metadata !"_ZTS1C", i32 0, i32 1, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
-!35 = metadata !{i32 786484, i32 0, metadata !16, metadata !"global_namespace_variable", metadata !"global_namespace_variable", metadata !"_ZN2ns25global_namespace_variableE", metadata !23, i32 27, metadata !7, i32 0, i32 1, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
-!36 = metadata !{i32 786484, i32 0, metadata !16, metadata !"d", metadata !"d", metadata !"_ZN2ns1dE", metadata !23, i32 30, metadata !"_ZTSN2ns1DE", i32 0, i32 1, %"struct.ns::D"* @_ZN2ns1dE, null} ; [ DW_TAG_variable ] [d] [line 30] [def]
-!37 = metadata !{i32 786484, i32 0, metadata !27, metadata !"z", metadata !"z", metadata !"", metadata !23, i32 38, metadata !7, i32 1, i32 1, i32* @_ZZ2f3vE1z, null} ; [ DW_TAG_variable ] [z] [line 38] [local] [def]
-!38 = metadata !{i32 786484, i32 0, metadata !39, metadata !"c", metadata !"c", metadata !"_ZN5outer12_GLOBAL__N_11cE", metadata !23, i32 50, metadata !7, i32 1, i32 1, i32* @_ZN5outer12_GLOBAL__N_11cE, null} ; [ DW_TAG_variable ] [c] [line 50] [local] [def]
-!39 = metadata !{i32 786489, metadata !1, metadata !40, metadata !"", i32 49} ; [ DW_TAG_namespace ] [line 49]
-!40 = metadata !{i32 786489, metadata !1, null, metadata !"outer", i32 48} ; [ DW_TAG_namespace ] [outer] [line 48]
-!41 = metadata !{i32 786484, i32 0, metadata !42, metadata !"b", metadata !"b", metadata !"_ZN12_GLOBAL__N_15inner1bE", metadata !23, i32 44, metadata !7, i32 1, i32 1, i32* @_ZN12_GLOBAL__N_15inner1bE, null} ; [ DW_TAG_variable ] [b] [line 44] [local] [def]
-!42 = metadata !{i32 786489, metadata !1, metadata !43, metadata !"inner", i32 43} ; [ DW_TAG_namespace ] [inner] [line 43]
-!43 = metadata !{i32 786489, metadata !1, null, metadata !"", i32 33} ; [ DW_TAG_namespace ] [line 33]
-!44 = metadata !{i32 786484, i32 0, metadata !43, metadata !"i", metadata !"i", metadata !"_ZN12_GLOBAL__N_11iE", metadata !23, i32 34, metadata !7, i32 1, i32 1, i32* @_ZN12_GLOBAL__N_11iE, null} ; [ DW_TAG_variable ] [i] [line 34] [local] [def]
+!33 = metadata !{metadata !"0x34\00static_member_variable\00static_member_variable\00_ZN1C22static_member_variableE\007\000\001", null, metadata !23, metadata !7, i32* @_ZN1C22static_member_variableE, metadata !6} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
+!34 = metadata !{metadata !"0x34\00global_variable\00global_variable\00\0017\000\001", null, metadata !23, metadata !"_ZTS1C", %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
+!35 = metadata !{metadata !"0x34\00global_namespace_variable\00global_namespace_variable\00_ZN2ns25global_namespace_variableE\0027\000\001", metadata !16, metadata !23, metadata !7, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
+!36 = metadata !{metadata !"0x34\00d\00d\00_ZN2ns1dE\0030\000\001", metadata !16, metadata !23, metadata !"_ZTSN2ns1DE", %"struct.ns::D"* @_ZN2ns1dE, null} ; [ DW_TAG_variable ] [d] [line 30] [def]
+!37 = metadata !{metadata !"0x34\00z\00z\00\0038\001\001", metadata !27, metadata !23, metadata !7, i32* @_ZZ2f3vE1z, null} ; [ DW_TAG_variable ] [z] [line 38] [local] [def]
+!38 = metadata !{metadata !"0x34\00c\00c\00_ZN5outer12_GLOBAL__N_11cE\0050\001\001", metadata !39, metadata !23, metadata !7, i32* @_ZN5outer12_GLOBAL__N_11cE, null} ; [ DW_TAG_variable ] [c] [line 50] [local] [def]
+!39 = metadata !{metadata !"0x39\00\0049", metadata !1, metadata !40} ; [ DW_TAG_namespace ] [line 49]
+!40 = metadata !{metadata !"0x39\00outer\0048", metadata !1, null} ; [ DW_TAG_namespace ] [outer] [line 48]
+!41 = metadata !{metadata !"0x34\00b\00b\00_ZN12_GLOBAL__N_15inner1bE\0044\001\001", metadata !42, metadata !23, metadata !7, i32* @_ZN12_GLOBAL__N_15inner1bE, null} ; [ DW_TAG_variable ] [b] [line 44] [local] [def]
+!42 = metadata !{metadata !"0x39\00inner\0043", metadata !1, metadata !43} ; [ DW_TAG_namespace ] [inner] [line 43]
+!43 = metadata !{metadata !"0x39\00\0033", metadata !1, null} ; [ DW_TAG_namespace ] [line 33]
+!44 = metadata !{metadata !"0x34\00i\00i\00_ZN12_GLOBAL__N_11iE\0034\001\001", metadata !43, metadata !23, metadata !7, i32* @_ZN12_GLOBAL__N_11iE, null} ; [ DW_TAG_variable ] [i] [line 34] [local] [def]
 !45 = metadata !{metadata !46}
-!46 = metadata !{i32 786490, metadata !40, metadata !39, i32 40} ; [ DW_TAG_imported_module ]
+!46 = metadata !{metadata !"0x3a\0040\00", metadata !40, metadata !39} ; [ DW_TAG_imported_module ]
 !47 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!48 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!48 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !49 = metadata !{metadata !"clang version 3.5.0 "}
-!50 = metadata !{i32 786689, metadata !20, metadata !"this", null, i32 16777216, metadata !51, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!51 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!50 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !20, null, metadata !51} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!51 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
 !52 = metadata !{i32 0, i32 0, metadata !20, null}
 !53 = metadata !{i32 10, i32 0, metadata !20, null}
 !54 = metadata !{i32 11, i32 0, metadata !20, null}

diff --git a/test/DebugInfo/X86/inline-member-function.ll b/test/DebugInfo/X86/inline-member-function.ll
index 3dc6043..214fdba 100644
--- a/test/DebugInfo/X86/inline-member-function.ll
+++ b/test/DebugInfo/X86/inline-member-function.ll

@@ -18,13 +18,14 @@
 
 ; But make sure we emit DW_AT_object_pointer on the abstract definition.
 ; CHECK: [[ABSTRACT_ORIGIN:.*]]: DW_TAG_subprogram
-; CHECK-NOT: NULL
-; CHECK-NOT: TAG
+; CHECK-NOT: {{NULL|TAG}}
+; CHECK: DW_AT_specification {{.*}} "_ZN3foo4funcEi"
+; CHECK-NOT: {{NULL|TAG}}
 ; CHECK: DW_AT_object_pointer
 
 ; Ensure we omit DW_AT_object_pointer on inlined subroutines.
 ; CHECK: DW_TAG_inlined_subroutine
-; CHECK-NEXT: DW_AT_abstract_origin {{.*}}{[[ABSTRACT_ORIGIN]]}
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}} {[[ABSTRACT_ORIGIN]]} "_ZN3foo4funcEi"
 ; CHECK-NOT: NULL
 ; CHECK-NOT: DW_AT_object_pointer
 ; CHECK: DW_TAG_formal_parameter
@@ -45,9 +46,9 @@
   store i32 0, i32* %retval
   %0 = load i32* @i, align 4, !dbg !23
   store %struct.foo* %tmp, %struct.foo** %this.addr.i, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr.i}, metadata !24), !dbg !26
+  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr.i}, metadata !24, metadata !{metadata !"0x102"}), !dbg !26
   store i32 %0, i32* %x.addr.i, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr.i}, metadata !27), !dbg !28
+  call void @llvm.dbg.declare(metadata !{i32* %x.addr.i}, metadata !27, metadata !{metadata !"0x102"}), !dbg !28
   %this1.i = load %struct.foo** %this.addr.i
   %1 = load i32* %x.addr.i, align 4, !dbg !28
   %add.i = add nsw i32 %1, 2, !dbg !28
@@ -55,7 +56,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -64,32 +65,32 @@
 !llvm.module.flags = !{!20, !21}
 !llvm.ident = !{!22}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !12, metadata !18, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !12, metadata !18, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"inline.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786478, metadata !1, metadata !"_ZTS3foo", metadata !"func", metadata !"func", metadata !"_ZN3foo4funcEi", i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !11, i32 2} ; [ DW_TAG_subprogram ] [line 2] [func]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x2e\00func\00func\00_ZN3foo4funcEi\002\000\000\000\006\00256\000\002", metadata !1, metadata !"_ZTS3foo", metadata !7, null, null, null, i32 0, metadata !11} ; [ DW_TAG_subprogram ] [line 2] [func]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
 !11 = metadata !{i32 786468}
 !12 = metadata !{metadata !13, metadata !17}
-!13 = metadata !{i32 786478, metadata !1, metadata !14, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!14 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline.cpp]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !14, metadata !15, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!14 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline.cpp]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !9}
-!17 = metadata !{i32 786478, metadata !1, metadata !"_ZTS3foo", metadata !"func", metadata !"func", metadata !"_ZN3foo4funcEi", i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, metadata !6, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [func]
+!17 = metadata !{metadata !"0x2e\00func\00func\00_ZN3foo4funcEi\002\000\001\000\006\00256\000\002", metadata !1, metadata !"_ZTS3foo", metadata !7, null, null, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [func]
 !18 = metadata !{metadata !19}
-!19 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"", metadata !14, i32 5, metadata !9, i32 0, i32 1, i32* @i, null} ; [ DW_TAG_variable ] [i] [line 5] [def]
+!19 = metadata !{metadata !"0x34\00i\00i\00\005\000\001", null, metadata !14, metadata !9, i32* @i, null} ; [ DW_TAG_variable ] [i] [line 5] [def]
 !20 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !22 = metadata !{metadata !"clang version 3.5.0 "}
-!23 = metadata !{i32 8, i32 0, metadata !13, null} ; [ DW_TAG_imported_declaration ]
-!24 = metadata !{i32 786689, metadata !17, metadata !"this", null, i32 16777216, metadata !25, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!25 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
+!23 = metadata !{i32 8, i32 0, metadata !13, null}
+!24 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !17, null, metadata !25} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
 !26 = metadata !{i32 0, i32 0, metadata !17, metadata !23}
-!27 = metadata !{i32 786689, metadata !17, metadata !"x", metadata !14, i32 33554434, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 2]
+!27 = metadata !{metadata !"0x101\00x\0033554434\000", metadata !17, metadata !14, metadata !9} ; [ DW_TAG_arg_variable ] [x] [line 2]
 !28 = metadata !{i32 2, i32 0, metadata !17, metadata !23}

diff --git a/test/DebugInfo/X86/inline-seldag-test.ll b/test/DebugInfo/X86/inline-seldag-test.ll
index 615f03a..278604d 100644
--- a/test/DebugInfo/X86/inline-seldag-test.ll
+++ b/test/DebugInfo/X86/inline-seldag-test.ll

@@ -11,12 +11,8 @@
 ;   x = f(x);
 ; }
 
-; CHECK: [[F:.*]]: DW_TAG_subprogram
-; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_name {{.*}} "f"
-
 ; CHECK: DW_TAG_inlined_subroutine
-; CHECK-NEXT: DW_AT_abstract_origin {{.*}} {[[F]]}
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}} "f"
 
 
 ; Make sure the condition test is attributed to the inline function, not the
@@ -31,10 +27,10 @@
 entry:
   %y.addr.i = alloca i32, align 4
   %x = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x}, metadata !15), !dbg !17
+  call void @llvm.dbg.declare(metadata !{i32* %x}, metadata !15, metadata !{metadata !"0x102"}), !dbg !17
   %0 = load volatile i32* %x, align 4, !dbg !18
   store i32 %0, i32* %y.addr.i, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %y.addr.i}, metadata !19), !dbg !20
+  call void @llvm.dbg.declare(metadata !{i32* %y.addr.i}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
   %1 = load i32* %y.addr.i, align 4, !dbg !21
   %tobool.i = icmp ne i32 %1, 0, !dbg !21
   %cond.i = select i1 %tobool.i, i32 4, i32 7, !dbg !21
@@ -43,7 +39,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -52,26 +48,26 @@
 !llvm.module.flags = !{!12, !13}
 !llvm.ident = !{!14}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline-seldag-test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline-seldag-test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"inline-seldag-test.c", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"", i32 4, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @func, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [func]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline-seldag-test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00func\00func\00\004\000\001\000\006\000\000\004", metadata !1, metadata !5, metadata !6, null, void ()* @func, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [func]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline-seldag-test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
-!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !9, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{metadata !11, metadata !11}
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !14 = metadata !{metadata !"clang version 3.5.0 "}
-!15 = metadata !{i32 786688, metadata !4, metadata !"x", metadata !5, i32 5, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [x] [line 5]
-!16 = metadata !{i32 786485, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_volatile_type ] [line 0, size 0, align 0, offset 0] [from int]
+!15 = metadata !{metadata !"0x100\00x\005\000", metadata !4, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [x] [line 5]
+!16 = metadata !{metadata !"0x35\00\000\000\000\000\000", null, null, metadata !11} ; [ DW_TAG_volatile_type ] [line 0, size 0, align 0, offset 0] [from int]
 !17 = metadata !{i32 5, i32 0, metadata !4, null}
 !18 = metadata !{i32 6, i32 7, metadata !4, null}
-!19 = metadata !{i32 786689, metadata !8, metadata !"y", metadata !5, i32 16777217, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [y] [line 1]
+!19 = metadata !{metadata !"0x101\00y\0016777217\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [y] [line 1]
 !20 = metadata !{i32 1, i32 0, metadata !8, metadata !18}
 !21 = metadata !{i32 2, i32 0, metadata !8, metadata !18}
 !22 = metadata !{i32 7, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/instcombine-instrinsics.ll b/test/DebugInfo/X86/instcombine-instrinsics.ll
index 2fd7ee3..a2cc35e 100644
--- a/test/DebugInfo/X86/instcombine-instrinsics.ll
+++ b/test/DebugInfo/X86/instcombine-instrinsics.ll

@@ -30,7 +30,7 @@
 ; Function Attrs: nounwind ssp uwtable
 define void @init() #0 {
   %p = alloca %struct.i14*, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.i14** %p}, metadata !11), !dbg !18
+  call void @llvm.dbg.declare(metadata !{%struct.i14** %p}, metadata !11, metadata !{metadata !"0x102"}), !dbg !18
   store %struct.i14* null, %struct.i14** %p, align 8, !dbg !18
   %1 = call i32 @foo(%struct.i14** %p), !dbg !19
   %2 = load %struct.i14** %p, align 8, !dbg !20
@@ -43,7 +43,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare i32 @foo(%struct.i14**)
 
@@ -54,25 +54,25 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [instcombine_intrinsics.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [instcombine_intrinsics.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"instcombine_intrinsics.c", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"init", metadata !"init", metadata !"", i32 7, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @init, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [init]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [instcombine_intrinsics.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00init\00init\00\007\000\001\000\006\000\000\007", metadata !1, metadata !5, metadata !6, null, void ()* @init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [init]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [instcombine_intrinsics.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5.0 "}
-!11 = metadata !{i32 786688, metadata !4, metadata !"p", metadata !5, i32 8, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [p] [line 8]
-!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from i14]
-!13 = metadata !{i32 786454, metadata !1, null, metadata !"i14", i32 3, i64 0, i64 0, i64 0, i32 0, metadata !14} ; [ DW_TAG_typedef ] [i14] [line 3, size 0, align 0, offset 0] [from ]
-!14 = metadata !{i32 786451, metadata !1, null, metadata !"", i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 64, align 64, offset 0] [def] [from ]
+!11 = metadata !{metadata !"0x100\00p\008\000", metadata !4, metadata !5, metadata !12} ; [ DW_TAG_auto_variable ] [p] [line 8]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from i14]
+!13 = metadata !{metadata !"0x16\00i14\003\000\000\000\000", metadata !1, null, metadata !14} ; [ DW_TAG_typedef ] [i14] [line 3, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0x13\00\001\0064\0064\000\000\000", metadata !1, null, null, metadata !15, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 64, align 64, offset 0] [def] [from ]
 !15 = metadata !{metadata !16}
-!16 = metadata !{i32 786445, metadata !1, metadata !14, metadata !"i", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !17} ; [ DW_TAG_member ] [i] [line 2, size 64, align 64, offset 0] [from long int]
-!17 = metadata !{i32 786468, null, null, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!18 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!16 = metadata !{metadata !"0xd\00i\002\0064\0064\000\000", metadata !1, metadata !14, metadata !17} ; [ DW_TAG_member ] [i] [line 2, size 64, align 64, offset 0] [from long int]
+!17 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!18 = metadata !{i32 8, i32 0, metadata !4, null}
 !19 = metadata !{i32 9, i32 0, metadata !4, null}
 !20 = metadata !{i32 10, i32 0, metadata !4, null}
 !21 = metadata !{i32 11, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/lexical_block.ll b/test/DebugInfo/X86/lexical_block.ll
index 95b3921..e2832a0 100644
--- a/test/DebugInfo/X86/lexical_block.ll
+++ b/test/DebugInfo/X86/lexical_block.ll

@@ -1,11 +1,19 @@
 ; REQUIRES: object-emission
 
 ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s \
-; RUN:     | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; RUN:     | llvm-dwarfdump -debug-dump=info - | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V4 %s
+; RUN: llc -mtriple=x86_64-linux -dwarf-version=3 -O0 -filetype=obj < %s \
+; RUN:     | llvm-dwarfdump -debug-dump=info - | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V3 %s
+
+; Check that we emit DW_TAG_lexical_block and that it has the right encoding
+; depending on the dwarf version.
 
 ; CHECK: DW_TAG_lexical_block
-; CHECK-NEXT: DW_AT_low_pc [DW_FORM_addr]
-; CHECK-NEXT: DW_AT_high_pc [DW_FORM_data4]
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_low_pc [DW_FORM_addr]
+; CHECK-NOT: DW_TAG
+; CHECK-V4: DW_AT_high_pc [DW_FORM_data4]
+; CHECK-V3: DW_AT_high_pc [DW_FORM_addr]
 
 ; Test case produced from:
 ; void b() {
@@ -17,7 +25,7 @@
 define void @_Z1bv() #0 {
 entry:
   %i = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !11), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !11, metadata !{metadata !"0x102"}), !dbg !14
   store i32 3, i32* %i, align 4, !dbg !14
   %0 = load i32* %i, align 4, !dbg !14
   %tobool = icmp ne i32 %0, 0, !dbg !14
@@ -31,7 +39,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -40,20 +48,20 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/lexical_block.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/lexical_block.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"lexical_block.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"b", metadata !"b", metadata !"_Z1bv", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z1bv, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [b]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/lexical_block.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00b\00b\00_Z1bv\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @_Z1bv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [b]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/lexical_block.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5.0 "}
-!11 = metadata !{i32 786688, metadata !12, metadata !"i", metadata !5, i32 2, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2]
-!12 = metadata !{i32 786443, metadata !1, metadata !4, i32 2, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/lexical_block.cpp]
-!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = metadata !{metadata !"0x100\00i\002\000", metadata !12, metadata !5, metadata !13} ; [ DW_TAG_auto_variable ] [i] [line 2]
+!12 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/lexical_block.cpp]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !14 = metadata !{i32 2, i32 0, metadata !12, null}
 !15 = metadata !{i32 3, i32 0, metadata !12, null}
 !16 = metadata !{i32 4, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/line-info.ll b/test/DebugInfo/X86/line-info.ll
index f6deee9..8e0afee 100644
--- a/test/DebugInfo/X86/line-info.ll
+++ b/test/DebugInfo/X86/line-info.ll

@@ -18,14 +18,14 @@
 entry:
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !14), !dbg !15
+  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
   %0 = load i32* %x.addr, align 4, !dbg !16
   %inc = add nsw i32 %0, 1, !dbg !16
   store i32 %inc, i32* %x.addr, align 4, !dbg !16
   ret i32 %inc, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 define i32 @main() #0 {
 entry:
@@ -38,23 +38,23 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2,  metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/list0.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2,  metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/list0.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"list0.c", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
 !5 = metadata !{metadata !"./list0.h", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/./list0.h]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/./list0.h]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
-!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
-!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\000\000\002", metadata !1, metadata !11, metadata !12, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
+!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{metadata !9}
-!14 = metadata !{i32 786689, metadata !4, metadata !"x", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 1]
+!14 = metadata !{metadata !"0x101\00x\0016777217\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [x] [line 1]
 !15 = metadata !{i32 1, i32 0, metadata !4, null}
 !16 = metadata !{i32 2, i32 0, metadata !4, null}
 !17 = metadata !{i32 3, i32 0, metadata !18, null}
-!18 = metadata !{i32 786443, metadata !11, metadata !10} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!18 = metadata !{metadata !"0xb\000", metadata !11, metadata !10} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/linkage-name.ll b/test/DebugInfo/X86/linkage-name.ll
index 2b1647b..f687078 100644
--- a/test/DebugInfo/X86/linkage-name.ll
+++ b/test/DebugInfo/X86/linkage-name.ll

@@ -14,39 +14,39 @@
   %this.addr = alloca %class.A*, align 8
   %b.addr = alloca i32, align 4
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !21), !dbg !23
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !21, metadata !{metadata !"0x102"}), !dbg !23
   store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !24), !dbg !25
+  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
   %this1 = load %class.A** %this.addr
   %0 = load i32* %b.addr, align 4, !dbg !26
   ret i32 %0, !dbg !26
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{i32 786449, metadata !28, i32 4, metadata !"clang version 3.1 (trunk 152691) (llvm/trunk 152692)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.1 (trunk 152691) (llvm/trunk 152692)\000\00\000\00\000", metadata !28, metadata !1, metadata !1, metadata !3, metadata !18,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !6, null, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%class.A*, i32)* @_ZN1A1aEi, null, metadata !13, null, i32 5} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00a\00a\00_ZN1A1aEi\005\000\001\000\006\00256\000\005", metadata !6, null, metadata !7, null, i32 (%class.A*, i32)* @_ZN1A1aEi, null, metadata !13, null} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786434, metadata !28, null, metadata !"A", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0x2\00A\001\008\008\000\000\000", metadata !28, null, null, metadata !12, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786478, metadata !6, metadata !11, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, null, i32 0} ; [ DW_TAG_subprogram ]
+!13 = metadata !{metadata !"0x2e\00a\00a\00_ZN1A1aEi\002\000\000\000\006\00257\000\000", metadata !6, metadata !11, metadata !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ]
 !18 = metadata !{metadata !20}
-!20 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 9, metadata !11, i32 0, i32 1, %class.A* @a, null} ; [ DW_TAG_variable ]
-!21 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777221, metadata !22, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
+!20 = metadata !{metadata !"0x34\00a\00a\00\009\000\001", null, metadata !6, metadata !11, %class.A* @a, null} ; [ DW_TAG_variable ]
+!21 = metadata !{metadata !"0x101\00this\0016777221\0064", metadata !5, metadata !6, metadata !22} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
 !23 = metadata !{i32 5, i32 8, metadata !5, null}
-!24 = metadata !{i32 786689, metadata !5, metadata !"b", metadata !6, i32 33554437, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!24 = metadata !{metadata !"0x101\00b\0033554437\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
 !25 = metadata !{i32 5, i32 14, metadata !5, null}
 !26 = metadata !{i32 6, i32 4, metadata !27, null}
-!27 = metadata !{i32 786443, metadata !6, metadata !5, i32 5, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
+!27 = metadata !{metadata !"0xb\005\0017\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
 !28 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo"}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/low-pc-cu.ll b/test/DebugInfo/X86/low-pc-cu.ll
index 979d400..7fd8f19 100644
--- a/test/DebugInfo/X86/low-pc-cu.ll
+++ b/test/DebugInfo/X86/low-pc-cu.ll

@@ -1,14 +1,24 @@
-; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -filetype=obj < %s \
+; RUN:     | llvm-dwarfdump -debug-dump=info - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V4
+; RUN: llc -mtriple=x86_64-apple-darwin -filetype=obj -dwarf-version=3 < %s \
+; RUN:     | llvm-dwarfdump -debug-dump=info - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-V3
 
-; Check that we use DW_AT_low_pc
+
+; Check that we use DW_AT_low_pc and that it has the right encoding depending
+; on dwarf version.
 
 ; CHECK: DW_TAG_compile_unit [1]
+; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_low_pc [DW_FORM_addr]       (0x0000000000000000)
-; CHECK: DW_AT_high_pc [DW_FORM_data4]
+; CHECK-NOT: DW_TAG
+; CHECK-V3: DW_AT_high_pc [DW_FORM_addr]
+; CHECK-V4: DW_AT_high_pc [DW_FORM_data4]
 ; CHECK: DW_TAG_subprogram [2]
+; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_low_pc [DW_FORM_addr]
-; CHECK: DW_AT_high_pc [DW_FORM_data4]
+; CHECK-NOT: DW_TAG
+; CHECK-V3: DW_AT_high_pc [DW_FORM_addr]
+; CHECK-V4: DW_AT_high_pc [DW_FORM_data4]
 
 ; Function Attrs: nounwind uwtable
 define void @z() #0 {
@@ -22,15 +32,15 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/z.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/z.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"z.c", metadata !"/usr/local/google/home/echristo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z", metadata !"z", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @z, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [z]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/z.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00z\00z\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @z, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [z]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/z.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
 !11 = metadata !{i32 1, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/misched-dbg-value.ll b/test/DebugInfo/X86/misched-dbg-value.ll
index c713e65..b2033a5 100644
--- a/test/DebugInfo/X86/misched-dbg-value.ll
+++ b/test/DebugInfo/X86/misched-dbg-value.ll

@@ -48,12 +48,12 @@
 
 define void @Proc8(i32* nocapture %Array1Par, [51 x i32]* nocapture %Array2Par, i32 %IntParI1, i32 %IntParI2) nounwind optsize {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %Array1Par}, i64 0, metadata !23), !dbg !64
-  tail call void @llvm.dbg.value(metadata !{[51 x i32]* %Array2Par}, i64 0, metadata !24), !dbg !65
-  tail call void @llvm.dbg.value(metadata !{i32 %IntParI1}, i64 0, metadata !25), !dbg !66
-  tail call void @llvm.dbg.value(metadata !{i32 %IntParI2}, i64 0, metadata !26), !dbg !67
+  tail call void @llvm.dbg.value(metadata !{i32* %Array1Par}, i64 0, metadata !23, metadata !{metadata !"0x102"}), !dbg !64
+  tail call void @llvm.dbg.value(metadata !{[51 x i32]* %Array2Par}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !65
+  tail call void @llvm.dbg.value(metadata !{i32 %IntParI1}, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !66
+  tail call void @llvm.dbg.value(metadata !{i32 %IntParI2}, i64 0, metadata !26, metadata !{metadata !"0x102"}), !dbg !67
   %add = add i32 %IntParI1, 5, !dbg !68
-  tail call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !27), !dbg !68
+  tail call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !68
   %idxprom = sext i32 %add to i64, !dbg !69
   %arrayidx = getelementptr inbounds i32* %Array1Par, i64 %idxprom, !dbg !69
   store i32 %IntParI2, i32* %arrayidx, align 4, !dbg !69
@@ -65,7 +65,7 @@
   %idxprom7 = sext i32 %add6 to i64, !dbg !74
   %arrayidx8 = getelementptr inbounds i32* %Array1Par, i64 %idxprom7, !dbg !74
   store i32 %add, i32* %arrayidx8, align 4, !dbg !74
-  tail call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !28), !dbg !75
+  tail call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !75
   br label %for.body, !dbg !75
 
 for.body:                                         ; preds = %entry, %for.body
@@ -74,7 +74,7 @@
   %arrayidx13 = getelementptr inbounds [51 x i32]* %Array2Par, i64 %idxprom, i64 %indvars.iv, !dbg !77
   store i32 %add, i32* %arrayidx13, align 4, !dbg !77
   %inc = add nsw i32 %IntIndex.046, 1, !dbg !75
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !28), !dbg !75
+  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !28, metadata !{metadata !"0x102"}), !dbg !75
   %cmp = icmp sgt i32 %inc, %add3, !dbg !75
   %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !75
   br i1 %cmp, label %for.end, label %for.body, !dbg !75
@@ -95,7 +95,7 @@
   ret void, !dbg !81
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 attributes #0 = { nounwind optsize ssp uwtable }
 attributes #1 = { nounwind readnone }
@@ -103,70 +103,70 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!83}
 
-!0 = metadata !{i32 786449, metadata !82, i32 12, metadata !"clang version 3.3 (trunk 175015)", i1 true, metadata !"", i32 0, metadata !1, metadata !10, metadata !11, metadata !29,  metadata !10, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 175015)\001\00\000\00\001", metadata !82, metadata !1, metadata !10, metadata !11, metadata !29,  metadata !10} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c] [DW_LANG_C99]
 !1 = metadata !{metadata !2}
-!2 = metadata !{i32 786436, metadata !82, null, metadata !"", i32 128, i64 32, i64 32, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 128, size 32, align 32, offset 0] [def] [from ]
-!3 = metadata !{i32 786473, metadata !82} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x4\00\00128\0032\0032\000\000\000", metadata !82, null, null, metadata !4, null, null, null} ; [ DW_TAG_enumeration_type ] [line 128, size 32, align 32, offset 0] [def] [from ]
+!3 = metadata !{metadata !"0x29", metadata !82} ; [ DW_TAG_file_type ]
 !4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8, metadata !9}
-!5 = metadata !{i32 786472, metadata !"Ident1", i64 0} ; [ DW_TAG_enumerator ] [Ident1 :: 0]
-!6 = metadata !{i32 786472, metadata !"Ident2", i64 10000} ; [ DW_TAG_enumerator ] [Ident2 :: 10000]
-!7 = metadata !{i32 786472, metadata !"Ident3", i64 10001} ; [ DW_TAG_enumerator ] [Ident3 :: 10001]
-!8 = metadata !{i32 786472, metadata !"Ident4", i64 10002} ; [ DW_TAG_enumerator ] [Ident4 :: 10002]
-!9 = metadata !{i32 786472, metadata !"Ident5", i64 10003} ; [ DW_TAG_enumerator ] [Ident5 :: 10003]
+!5 = metadata !{metadata !"0x28\00Ident1\000"} ; [ DW_TAG_enumerator ] [Ident1 :: 0]
+!6 = metadata !{metadata !"0x28\00Ident2\0010000"} ; [ DW_TAG_enumerator ] [Ident2 :: 10000]
+!7 = metadata !{metadata !"0x28\00Ident3\0010001"} ; [ DW_TAG_enumerator ] [Ident3 :: 10001]
+!8 = metadata !{metadata !"0x28\00Ident4\0010002"} ; [ DW_TAG_enumerator ] [Ident4 :: 10002]
+!9 = metadata !{metadata !"0x28\00Ident5\0010003"} ; [ DW_TAG_enumerator ] [Ident5 :: 10003]
 !10 = metadata !{}
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 786478, metadata !82, metadata !3, metadata !"Proc8", metadata !"Proc8", metadata !"", i32 180, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, void (i32*, [51 x i32]*, i32, i32)* @Proc8, null, null, metadata !22, i32 185} ; [ DW_TAG_subprogram ] [line 180] [def] [scope 185] [Proc8]
-!13 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x2e\00Proc8\00Proc8\00\00180\000\001\000\006\000\001\00185", metadata !82, metadata !3, metadata !13, null, void (i32*, [51 x i32]*, i32, i32)* @Proc8, null, null, metadata !22} ; [ DW_TAG_subprogram ] [line 180] [def] [scope 185] [Proc8]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{null, metadata !15, metadata !17, metadata !21, metadata !21}
-!15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!16 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!18 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1632, i64 32, i32 0, i32 0, metadata !16, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1632, align 32, offset 0] [from int]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!16 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!18 = metadata !{metadata !"0x1\00\000\001632\0032\000\000", null, null, metadata !16, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1632, align 32, offset 0] [from int]
 !19 = metadata !{metadata !20}
-!20 = metadata !{i32 786465, i64 0, i64 51}       ; [ DW_TAG_subrange_type ] [0, 50]
-!21 = metadata !{i32 786454, metadata !82, null, metadata !"OneToFifty", i32 132, i64 0, i64 0, i64 0, i32 0, metadata !16} ; [ DW_TAG_typedef ] [OneToFifty] [line 132, size 0, align 0, offset 0] [from int]
+!20 = metadata !{metadata !"0x21\000\0051"}       ; [ DW_TAG_subrange_type ] [0, 50]
+!21 = metadata !{metadata !"0x16\00OneToFifty\00132\000\000\000\000", metadata !82, null, metadata !16} ; [ DW_TAG_typedef ] [OneToFifty] [line 132, size 0, align 0, offset 0] [from int]
 !22 = metadata !{metadata !23, metadata !24, metadata !25, metadata !26, metadata !27, metadata !28}
-!23 = metadata !{i32 786689, metadata !12, metadata !"Array1Par", metadata !3, i32 16777397, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [Array1Par] [line 181]
-!24 = metadata !{i32 786689, metadata !12, metadata !"Array2Par", metadata !3, i32 33554614, metadata !17, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [Array2Par] [line 182]
-!25 = metadata !{i32 786689, metadata !12, metadata !"IntParI1", metadata !3, i32 50331831, metadata !21, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [IntParI1] [line 183]
-!26 = metadata !{i32 786689, metadata !12, metadata !"IntParI2", metadata !3, i32 67109048, metadata !21, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [IntParI2] [line 184]
-!27 = metadata !{i32 786688, metadata !12, metadata !"IntLoc", metadata !3, i32 186, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [IntLoc] [line 186]
-!28 = metadata !{i32 786688, metadata !12, metadata !"IntIndex", metadata !3, i32 187, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [IntIndex] [line 187]
+!23 = metadata !{metadata !"0x101\00Array1Par\0016777397\000", metadata !12, metadata !3, metadata !15} ; [ DW_TAG_arg_variable ] [Array1Par] [line 181]
+!24 = metadata !{metadata !"0x101\00Array2Par\0033554614\000", metadata !12, metadata !3, metadata !17} ; [ DW_TAG_arg_variable ] [Array2Par] [line 182]
+!25 = metadata !{metadata !"0x101\00IntParI1\0050331831\000", metadata !12, metadata !3, metadata !21} ; [ DW_TAG_arg_variable ] [IntParI1] [line 183]
+!26 = metadata !{metadata !"0x101\00IntParI2\0067109048\000", metadata !12, metadata !3, metadata !21} ; [ DW_TAG_arg_variable ] [IntParI2] [line 184]
+!27 = metadata !{metadata !"0x100\00IntLoc\00186\000", metadata !12, metadata !3, metadata !21} ; [ DW_TAG_auto_variable ] [IntLoc] [line 186]
+!28 = metadata !{metadata !"0x100\00IntIndex\00187\000", metadata !12, metadata !3, metadata !21} ; [ DW_TAG_auto_variable ] [IntIndex] [line 187]
 !29 = metadata !{metadata !30, metadata !35, metadata !36, metadata !38, metadata !39, metadata !40, metadata !42, metadata !46, metadata !63}
-!30 = metadata !{i32 786484, i32 0, null, metadata !"Version", metadata !"Version", metadata !"", metadata !3, i32 111, metadata !31, i32 0, i32 1, [4 x i8]* @Version, null} ; [ DW_TAG_variable ] [Version] [line 111] [def]
-!31 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 8, i32 0, i32 0, metadata !32, metadata !33, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
-!32 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!30 = metadata !{metadata !"0x34\00Version\00Version\00\00111\000\001", null, metadata !3, metadata !31, [4 x i8]* @Version, null} ; [ DW_TAG_variable ] [Version] [line 111] [def]
+!31 = metadata !{metadata !"0x1\00\000\0032\008\000\000", null, null, metadata !32, metadata !33, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
+!32 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !33 = metadata !{metadata !34}
-!34 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
-!35 = metadata !{i32 786484, i32 0, null, metadata !"IntGlob", metadata !"IntGlob", metadata !"", metadata !3, i32 171, metadata !16, i32 0, i32 1, i32* @IntGlob, null} ; [ DW_TAG_variable ] [IntGlob] [line 171] [def]
-!36 = metadata !{i32 786484, i32 0, null, metadata !"BoolGlob", metadata !"BoolGlob", metadata !"", metadata !3, i32 172, metadata !37, i32 0, i32 1, i32* @BoolGlob, null} ; [ DW_TAG_variable ] [BoolGlob] [line 172] [def]
-!37 = metadata !{i32 786454, metadata !82, null, metadata !"boolean", i32 149, i64 0, i64 0, i64 0, i32 0, metadata !16} ; [ DW_TAG_typedef ] [boolean] [line 149, size 0, align 0, offset 0] [from int]
-!38 = metadata !{i32 786484, i32 0, null, metadata !"Char1Glob", metadata !"Char1Glob", metadata !"", metadata !3, i32 173, metadata !32, i32 0, i32 1, i8* @Char1Glob, null} ; [ DW_TAG_variable ] [Char1Glob] [line 173] [def]
-!39 = metadata !{i32 786484, i32 0, null, metadata !"Char2Glob", metadata !"Char2Glob", metadata !"", metadata !3, i32 174, metadata !32, i32 0, i32 1, i8* @Char2Glob, null} ; [ DW_TAG_variable ] [Char2Glob] [line 174] [def]
-!40 = metadata !{i32 786484, i32 0, null, metadata !"Array1Glob", metadata !"Array1Glob", metadata !"", metadata !3, i32 175, metadata !41, i32 0, i32 1, [51 x i32]* @Array1Glob, null} ; [ DW_TAG_variable ] [Array1Glob] [line 175] [def]
-!41 = metadata !{i32 786454, metadata !82, null, metadata !"Array1Dim", i32 135, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_typedef ] [Array1Dim] [line 135, size 0, align 0, offset 0] [from ]
-!42 = metadata !{i32 786484, i32 0, null, metadata !"Array2Glob", metadata !"Array2Glob", metadata !"", metadata !3, i32 176, metadata !43, i32 0, i32 1, [51 x [51 x i32]]* @Array2Glob, null} ; [ DW_TAG_variable ] [Array2Glob] [line 176] [def]
-!43 = metadata !{i32 786454, metadata !82, null, metadata !"Array2Dim", i32 136, i64 0, i64 0, i64 0, i32 0, metadata !44} ; [ DW_TAG_typedef ] [Array2Dim] [line 136, size 0, align 0, offset 0] [from ]
-!44 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 83232, i64 32, i32 0, i32 0, metadata !16, metadata !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 83232, align 32, offset 0] [from int]
+!34 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
+!35 = metadata !{metadata !"0x34\00IntGlob\00IntGlob\00\00171\000\001", null, metadata !3, metadata !16, i32* @IntGlob, null} ; [ DW_TAG_variable ] [IntGlob] [line 171] [def]
+!36 = metadata !{metadata !"0x34\00BoolGlob\00BoolGlob\00\00172\000\001", null, metadata !3, metadata !37, i32* @BoolGlob, null} ; [ DW_TAG_variable ] [BoolGlob] [line 172] [def]
+!37 = metadata !{metadata !"0x16\00boolean\00149\000\000\000\000", metadata !82, null, metadata !16} ; [ DW_TAG_typedef ] [boolean] [line 149, size 0, align 0, offset 0] [from int]
+!38 = metadata !{metadata !"0x34\00Char1Glob\00Char1Glob\00\00173\000\001", null, metadata !3, metadata !32, i8* @Char1Glob, null} ; [ DW_TAG_variable ] [Char1Glob] [line 173] [def]
+!39 = metadata !{metadata !"0x34\00Char2Glob\00Char2Glob\00\00174\000\001", null, metadata !3, metadata !32, i8* @Char2Glob, null} ; [ DW_TAG_variable ] [Char2Glob] [line 174] [def]
+!40 = metadata !{metadata !"0x34\00Array1Glob\00Array1Glob\00\00175\000\001", null, metadata !3, metadata !41, [51 x i32]* @Array1Glob, null} ; [ DW_TAG_variable ] [Array1Glob] [line 175] [def]
+!41 = metadata !{metadata !"0x16\00Array1Dim\00135\000\000\000\000", metadata !82, null, metadata !18} ; [ DW_TAG_typedef ] [Array1Dim] [line 135, size 0, align 0, offset 0] [from ]
+!42 = metadata !{metadata !"0x34\00Array2Glob\00Array2Glob\00\00176\000\001", null, metadata !3, metadata !43, [51 x [51 x i32]]* @Array2Glob, null} ; [ DW_TAG_variable ] [Array2Glob] [line 176] [def]
+!43 = metadata !{metadata !"0x16\00Array2Dim\00136\000\000\000\000", metadata !82, null, metadata !44} ; [ DW_TAG_typedef ] [Array2Dim] [line 136, size 0, align 0, offset 0] [from ]
+!44 = metadata !{metadata !"0x1\00\000\0083232\0032\000\000", null, null, metadata !16, metadata !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 83232, align 32, offset 0] [from int]
 !45 = metadata !{metadata !20, metadata !20}
-!46 = metadata !{i32 786484, i32 0, null, metadata !"PtrGlb", metadata !"PtrGlb", metadata !"", metadata !3, i32 177, metadata !47, i32 0, i32 1, %struct.Record** @PtrGlb, null} ; [ DW_TAG_variable ] [PtrGlb] [line 177] [def]
-!47 = metadata !{i32 786454, metadata !82, null, metadata !"RecordPtr", i32 148, i64 0, i64 0, i64 0, i32 0, metadata !48} ; [ DW_TAG_typedef ] [RecordPtr] [line 148, size 0, align 0, offset 0] [from ]
-!48 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !49} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from RecordType]
-!49 = metadata !{i32 786454, metadata !82, null, metadata !"RecordType", i32 147, i64 0, i64 0, i64 0, i32 0, metadata !50} ; [ DW_TAG_typedef ] [RecordType] [line 147, size 0, align 0, offset 0] [from Record]
-!50 = metadata !{i32 786451, metadata !82, null, metadata !"Record", i32 138, i64 448, i64 64, i32 0, i32 0, null, metadata !51, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [Record] [line 138, size 448, align 64, offset 0] [def] [from ]
+!46 = metadata !{metadata !"0x34\00PtrGlb\00PtrGlb\00\00177\000\001", null, metadata !3, metadata !47, %struct.Record** @PtrGlb, null} ; [ DW_TAG_variable ] [PtrGlb] [line 177] [def]
+!47 = metadata !{metadata !"0x16\00RecordPtr\00148\000\000\000\000", metadata !82, null, metadata !48} ; [ DW_TAG_typedef ] [RecordPtr] [line 148, size 0, align 0, offset 0] [from ]
+!48 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !49} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from RecordType]
+!49 = metadata !{metadata !"0x16\00RecordType\00147\000\000\000\000", metadata !82, null, metadata !50} ; [ DW_TAG_typedef ] [RecordType] [line 147, size 0, align 0, offset 0] [from Record]
+!50 = metadata !{metadata !"0x13\00Record\00138\00448\0064\000\000\000", metadata !82, null, null, metadata !51, null, i32 0, null} ; [ DW_TAG_structure_type ] [Record] [line 138, size 448, align 64, offset 0] [def] [from ]
 !51 = metadata !{metadata !52, metadata !54, metadata !56, metadata !57, metadata !58}
-!52 = metadata !{i32 786445, metadata !82, metadata !50, metadata !"PtrComp", i32 140, i64 64, i64 64, i64 0, i32 0, metadata !53} ; [ DW_TAG_member ] [PtrComp] [line 140, size 64, align 64, offset 0] [from ]
-!53 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !50} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Record]
-!54 = metadata !{i32 786445, metadata !82, metadata !50, metadata !"Discr", i32 141, i64 32, i64 32, i64 64, i32 0, metadata !55} ; [ DW_TAG_member ] [Discr] [line 141, size 32, align 32, offset 64] [from Enumeration]
-!55 = metadata !{i32 786454, metadata !82, null, metadata !"Enumeration", i32 128, i64 0, i64 0, i64 0, i32 0, metadata !2} ; [ DW_TAG_typedef ] [Enumeration] [line 128, size 0, align 0, offset 0] [from ]
-!56 = metadata !{i32 786445, metadata !82, metadata !50, metadata !"EnumComp", i32 142, i64 32, i64 32, i64 96, i32 0, metadata !55} ; [ DW_TAG_member ] [EnumComp] [line 142, size 32, align 32, offset 96] [from Enumeration]
-!57 = metadata !{i32 786445, metadata !82, metadata !50, metadata !"IntComp", i32 143, i64 32, i64 32, i64 128, i32 0, metadata !21} ; [ DW_TAG_member ] [IntComp] [line 143, size 32, align 32, offset 128] [from OneToFifty]
-!58 = metadata !{i32 786445, metadata !82, metadata !50, metadata !"StringComp", i32 144, i64 248, i64 8, i64 160, i32 0, metadata !59} ; [ DW_TAG_member ] [StringComp] [line 144, size 248, align 8, offset 160] [from String30]
-!59 = metadata !{i32 786454, metadata !82, null, metadata !"String30", i32 134, i64 0, i64 0, i64 0, i32 0, metadata !60} ; [ DW_TAG_typedef ] [String30] [line 134, size 0, align 0, offset 0] [from ]
-!60 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 248, i64 8, i32 0, i32 0, metadata !32, metadata !61, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 248, align 8, offset 0] [from char]
+!52 = metadata !{metadata !"0xd\00PtrComp\00140\0064\0064\000\000", metadata !82, metadata !50, metadata !53} ; [ DW_TAG_member ] [PtrComp] [line 140, size 64, align 64, offset 0] [from ]
+!53 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !50} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Record]
+!54 = metadata !{metadata !"0xd\00Discr\00141\0032\0032\0064\000", metadata !82, metadata !50, metadata !55} ; [ DW_TAG_member ] [Discr] [line 141, size 32, align 32, offset 64] [from Enumeration]
+!55 = metadata !{metadata !"0x16\00Enumeration\00128\000\000\000\000", metadata !82, null, metadata !2} ; [ DW_TAG_typedef ] [Enumeration] [line 128, size 0, align 0, offset 0] [from ]
+!56 = metadata !{metadata !"0xd\00EnumComp\00142\0032\0032\0096\000", metadata !82, metadata !50, metadata !55} ; [ DW_TAG_member ] [EnumComp] [line 142, size 32, align 32, offset 96] [from Enumeration]
+!57 = metadata !{metadata !"0xd\00IntComp\00143\0032\0032\00128\000", metadata !82, metadata !50, metadata !21} ; [ DW_TAG_member ] [IntComp] [line 143, size 32, align 32, offset 128] [from OneToFifty]
+!58 = metadata !{metadata !"0xd\00StringComp\00144\00248\008\00160\000", metadata !82, metadata !50, metadata !59} ; [ DW_TAG_member ] [StringComp] [line 144, size 248, align 8, offset 160] [from String30]
+!59 = metadata !{metadata !"0x16\00String30\00134\000\000\000\000", metadata !82, null, metadata !60} ; [ DW_TAG_typedef ] [String30] [line 134, size 0, align 0, offset 0] [from ]
+!60 = metadata !{metadata !"0x1\00\000\00248\008\000\000", null, null, metadata !32, metadata !61, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 248, align 8, offset 0] [from char]
 !61 = metadata !{metadata !62}
-!62 = metadata !{i32 786465, i64 0, i64 31}       ; [ DW_TAG_subrange_type ] [0, 30]
-!63 = metadata !{i32 786484, i32 0, null, metadata !"PtrGlbNext", metadata !"PtrGlbNext", metadata !"", metadata !3, i32 178, metadata !47, i32 0, i32 1, %struct.Record** @PtrGlbNext, null} ; [ DW_TAG_variable ] [PtrGlbNext] [line 178] [def]
+!62 = metadata !{metadata !"0x21\000\0031"}       ; [ DW_TAG_subrange_type ] [0, 30]
+!63 = metadata !{metadata !"0x34\00PtrGlbNext\00PtrGlbNext\00\00178\000\001", null, metadata !3, metadata !47, %struct.Record** @PtrGlbNext, null} ; [ DW_TAG_variable ] [PtrGlbNext] [line 178] [def]
 !64 = metadata !{i32 181, i32 0, metadata !12, null}
 !65 = metadata !{i32 182, i32 0, metadata !12, null}
 !66 = metadata !{i32 183, i32 0, metadata !12, null}
@@ -176,11 +176,11 @@
 !73 = metadata !{i32 191, i32 0, metadata !12, null}
 !74 = metadata !{i32 192, i32 0, metadata !12, null}
 !75 = metadata !{i32 193, i32 0, metadata !76, null}
-!76 = metadata !{i32 786443, metadata !82, metadata !12, i32 193, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c]
+!76 = metadata !{metadata !"0xb\00193\000\000", metadata !82, metadata !12} ; [ DW_TAG_lexical_block ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c]
 !77 = metadata !{i32 194, i32 0, metadata !76, null}
 !78 = metadata !{i32 195, i32 0, metadata !12, null}
 !79 = metadata !{i32 196, i32 0, metadata !12, null}
 !80 = metadata !{i32 197, i32 0, metadata !12, null}
 !81 = metadata !{i32 198, i32 0, metadata !12, null}
 !82 = metadata !{metadata !"dry.c", metadata !"/Users/manmanren/test-Nov/rdar_13183203/test2"}
-!83 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!83 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/multiple-aranges.ll b/test/DebugInfo/X86/multiple-aranges.ll
index 2da2938..47eef2d 100644
--- a/test/DebugInfo/X86/multiple-aranges.ll
+++ b/test/DebugInfo/X86/multiple-aranges.ll

@@ -8,8 +8,7 @@
 ; CHECK-NEXT: .byte   0                       # Segment Size (in bytes)
 ; CHECK-NEXT: .zero   4,255
 ; CHECK-NEXT: .quad   kittens
-; CHECK-NEXT: .Lset0 = rainbows-kittens
-; CHECK-NEXT: .quad   .Lset0
+; CHECK-NEXT: .quad   rainbows-kittens
 ; CHECK-NEXT: .quad   0                       # ARange terminator
 ; CHECK-NEXT: .quad   0
 
@@ -21,8 +20,7 @@
 ; CHECK-NEXT: .byte   0                       # Segment Size (in bytes)
 ; CHECK-NEXT: .zero   4,255
 ; CHECK-NEXT: .quad   rainbows
-; CHECK-NEXT: .Lset1 = .Ldebug_end0-rainbows
-; CHECK-NEXT: .quad   .Lset1
+; CHECK-NEXT: .quad   .Ldebug_end0-rainbows
 ; CHECK-NEXT: .quad   0                       # ARange terminator
 ; CHECK-NEXT: .quad   0
 
@@ -44,17 +42,17 @@
 !llvm.dbg.cu = !{!0, !7}
 !llvm.module.flags = !{!12, !13}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/kayamon/test1.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/home/kayamon/test1.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test1.c", metadata !"/home/kayamon"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786484, i32 0, null, metadata !"kittens", metadata !"kittens", metadata !"", metadata !5, i32 1, metadata !6, i32 0, i32 1, i32* @kittens, null} ; [ DW_TAG_variable ] [kittens] [line 1] [def]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test1.c]
-!6 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!7 = metadata !{i32 786449, metadata !8, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !9, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/kayamon/test2.c] [DW_LANG_C99]
+!4 = metadata !{metadata !"0x34\00kittens\00kittens\00\001\000\001", null, metadata !5, metadata !6, i32* @kittens, null} ; [ DW_TAG_variable ] [kittens] [line 1] [def]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test1.c]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!7 = metadata !{metadata !"0x11\0012\00clang version 3.4 \000\00\000\00\000", metadata !8, metadata !2, metadata !2, metadata !2, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/home/kayamon/test2.c] [DW_LANG_C99]
 !8 = metadata !{metadata !"test2.c", metadata !"/home/kayamon"}
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786484, i32 0, null, metadata !"rainbows", metadata !"rainbows", metadata !"", metadata !11, i32 1, metadata !6, i32 0, i32 1, i32* @rainbows, null} ; [ DW_TAG_variable ] [rainbows] [line 1] [def]
-!11 = metadata !{i32 786473, metadata !8}         ; [ DW_TAG_file_type ] [/home/kayamon/test2.c]
+!10 = metadata !{metadata !"0x34\00rainbows\00rainbows\00\001\000\001", null, metadata !11, metadata !6, i32* @rainbows, null} ; [ DW_TAG_variable ] [rainbows] [line 1] [def]
+!11 = metadata !{metadata !"0x29", metadata !8}         ; [ DW_TAG_file_type ] [/home/kayamon/test2.c]
 !12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/multiple-at-const-val.ll b/test/DebugInfo/X86/multiple-at-const-val.ll
index 27a5510..55991c1 100644
--- a/test/DebugInfo/X86/multiple-at-const-val.ll
+++ b/test/DebugInfo/X86/multiple-at-const-val.ll

@@ -27,37 +27,37 @@
 
 declare %"class.std::basic_ostream"* @test(%"class.std::basic_ostream"*, i8*, i64)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!1803}
 
-!0 = metadata !{i32 786449, metadata !1802, i32 4, metadata !"clang version 3.3 (trunk 174207)", i1 true, metadata !"", i32 0, metadata !1, metadata !955, metadata !956, metadata !1786,  metadata !955, metadata !""} ; [ DW_TAG_compile_unit ] [/privite/tmp/student2.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 174207)\001\00\000\00\000", metadata !1802, metadata !1, metadata !955, metadata !956, metadata !1786,  metadata !955} ; [ DW_TAG_compile_unit ] [/privite/tmp/student2.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !26}
-!4 = metadata !{i32 786489, null, metadata !"std", metadata !5, i32 48} ; [ DW_TAG_namespace ]
-!5 = metadata !{i32 786473, metadata !1801} ; [ DW_TAG_file_type ]
-!25 = metadata !{i32 786472, metadata !"_S_os_fmtflags_end", i64 65536} ; [ DW_TAG_enumerator ]
-!26 = metadata !{i32 786436, metadata !1801, metadata !4, metadata !"_Ios_Iostate", i32 146, i64 32, i64 32, i32 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [_Ios_Iostate] [line 146, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x39\00std\0048", null, metadata !5} ; [ DW_TAG_namespace ]
+!5 = metadata !{metadata !"0x29", metadata !1801} ; [ DW_TAG_file_type ]
+!25 = metadata !{metadata !"0x28\00_S_os_fmtflags_end\0065536"} ; [ DW_TAG_enumerator ]
+!26 = metadata !{metadata !"0x4\00_Ios_Iostate\00146\0032\0032\000\000\000", metadata !1801, metadata !4, null, metadata !27, null, null, null} ; [ DW_TAG_enumeration_type ] [_Ios_Iostate] [line 146, size 32, align 32, offset 0] [def] [from ]
 !27 = metadata !{metadata !28, metadata !29, metadata !30, metadata !31, metadata !32}
-!28 = metadata !{i32 786472, metadata !"_S_goodbit", i64 0} ; [ DW_TAG_enumerator ] [_S_goodbit :: 0]
-!29 = metadata !{i32 786472, metadata !"_S_badbit", i64 1} ; [ DW_TAG_enumerator ] [_S_badbit :: 1]
-!30 = metadata !{i32 786472, metadata !"_S_eofbit", i64 2} ; [ DW_TAG_enumerator ] [_S_eofbit :: 2]
-!31 = metadata !{i32 786472, metadata !"_S_failbit", i64 4} ; [ DW_TAG_enumerator ] [_S_failbit :: 4]
-!32 = metadata !{i32 786472, metadata !"_S_os_ostate_end", i64 65536} ; [ DW_TAG_enumerator ] [_S_os_ostate_end :: 65536]
-!49 = metadata !{i32 786434, metadata !1801, metadata !4, metadata !"os_base", i32 200, i64 1728, i64 64, i32 0, i32 0, null, metadata !50, i32 0, metadata !49, null, null} ; [ DW_TAG_class_type ] [os_base] [line 200, size 1728, align 64, offset 0] [def] [from ]
+!28 = metadata !{metadata !"0x28\00_S_goodbit\000"} ; [ DW_TAG_enumerator ] [_S_goodbit :: 0]
+!29 = metadata !{metadata !"0x28\00_S_badbit\001"} ; [ DW_TAG_enumerator ] [_S_badbit :: 1]
+!30 = metadata !{metadata !"0x28\00_S_eofbit\002"} ; [ DW_TAG_enumerator ] [_S_eofbit :: 2]
+!31 = metadata !{metadata !"0x28\00_S_failbit\004"} ; [ DW_TAG_enumerator ] [_S_failbit :: 4]
+!32 = metadata !{metadata !"0x28\00_S_os_ostate_end\0065536"} ; [ DW_TAG_enumerator ] [_S_os_ostate_end :: 65536]
+!49 = metadata !{metadata !"0x2\00os_base\00200\001728\0064\000\000\000", metadata !1801, metadata !4, null, metadata !50, metadata !49, null, null} ; [ DW_TAG_class_type ] [os_base] [line 200, size 1728, align 64, offset 0] [def] [from ]
 !50 = metadata !{metadata !77}
-!54 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !55, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!54 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !55, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !55 = metadata !{metadata !56}
-!56 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!77 = metadata !{i32 786445, metadata !1801, metadata !49, metadata !"badbit", i32 331, i64 0, i64 0, i64 0, i32 4096, metadata !78, i32 1} ; [ DW_TAG_member ]
-!78 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !79} ; [ DW_TAG_const_type ]
-!79 = metadata !{i32 786454, metadata !1801, metadata !49, metadata !"ostate", i32 327, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_typedef ]
+!56 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!77 = metadata !{metadata !"0xd\00badbit\00331\000\000\000\004096", metadata !1801, metadata !49, metadata !78, i32 1} ; [ DW_TAG_member ]
+!78 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !79} ; [ DW_TAG_const_type ]
+!79 = metadata !{metadata !"0x16\00ostate\00327\000\000\000\000", metadata !1801, metadata !49, metadata !26} ; [ DW_TAG_typedef ]
 !955 = metadata !{}
 !956 = metadata !{metadata !960}
-!960 = metadata !{i32 786478, metadata !1802, null, metadata !"main", metadata !"main", metadata !"", i32 73, metadata !54, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !955, i32 73} ; [ DW_TAG_subprogram ]
-!961 = metadata !{i32 786473, metadata !1802} ; [ DW_TAG_file_type ]
+!960 = metadata !{metadata !"0x2e\00main\00main\00\0073\000\001\000\006\00256\001\0073", metadata !1802, null, metadata !54, null, i32 ()* @main, null, null, metadata !955} ; [ DW_TAG_subprogram ]
+!961 = metadata !{metadata !"0x29", metadata !1802} ; [ DW_TAG_file_type ]
 !1786 = metadata !{metadata !1800}
-!1800 = metadata !{i32 786484, i32 0, metadata !5, metadata !"badbit", metadata !"badbit", metadata !"badbit", metadata !5, i32 331, metadata !78, i32 1, i32 1, i32 1, metadata !77} ; [ DW_TAG_variable ]
+!1800 = metadata !{metadata !"0x34\00badbit\00badbit\00badbit\00331\001\001", metadata !5, metadata !5, metadata !78, i32 1, metadata !77} ; [ DW_TAG_variable ]
 !1801 = metadata !{metadata !"os_base.h", metadata !"/privite/tmp"}
 !1802 = metadata !{metadata !"student2.cpp", metadata !"/privite/tmp"}
-!1803 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!1803 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/nodebug_with_debug_loc.ll b/test/DebugInfo/X86/nodebug_with_debug_loc.ll
new file mode 100644
index 0000000..555abe6
--- /dev/null
+++ b/test/DebugInfo/X86/nodebug_with_debug_loc.ll

@@ -0,0 +1,139 @@
+; REQUIRES: object-emission
+
+; RUN: llc -mtriple=i386-linux-gnu -filetype=obj -relocation-model=pic %s -o /dev/null
+
+; Derived from the test case in PR20367, there's nothing really positive to
+; test here (hence no FileCheck, etc). All that was wrong is that the debug info
+; intrinsics (introduced by inlining) in 'f1' were causing codegen to crash, but
+; since 'f1' is a nodebug function, there's no positive outcome to confirm, just
+; that debug info doesn't get in the way/cause a crash.
+
+; The test case isn't particularly well reduced/tidy, but as simple as I could
+; get the C++ source. I assume the complexity is mostly just about producing a
+; certain amount of register pressure, so it might be able to be simplified/made
+; more uniform.
+
+; Generated from:
+; $ clang-tot -cc1 -triple i386 -emit-obj -g -O3 repro.cpp
+; void sink(const void *);
+; int source();
+; void f3(int);
+; 
+; extern bool b;
+; 
+; struct string {
+;   unsigned *mem;
+; };
+; 
+; extern string &str;
+; 
+; inline __attribute__((always_inline)) void s2(string *lhs) { sink(lhs->mem); }
+; inline __attribute__((always_inline)) void f() {
+;   string str2;
+;   s2(&str2);
+;   sink(&str2);
+; }
+; void __attribute__((nodebug)) f1() {
+;   for (int iter = 0; iter != 2; ++iter) {
+;     f();
+;     sink(str.mem);
+;     if (b) return;
+;   }
+; }
+
+%struct.string = type { i32* }
+
+@str = external constant %struct.string*
+@b = external global i8
+
+; Function Attrs: nounwind
+define void @_Z2f1v() #0 {
+entry:
+  %str2.i = alloca %struct.string, align 4
+  %0 = bitcast %struct.string* %str2.i to i8*, !dbg !26
+  %1 = load %struct.string** @str, align 4
+  %mem = getelementptr inbounds %struct.string* %1, i32 0, i32 0
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iter.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  call void @llvm.lifetime.start(i64 4, i8* %0), !dbg !26
+  call void @llvm.dbg.value(metadata !{%struct.string* %str2.i}, i64 0, metadata !16, metadata !{metadata !"0x102"}) #3, !dbg !26
+  call void @llvm.dbg.value(metadata !{%struct.string* %str2.i}, i64 0, metadata !27, metadata !{metadata !"0x102"}) #3, !dbg !29
+  call void @_Z4sinkPKv(i8* undef) #3, !dbg !29
+  call void @_Z4sinkPKv(i8* %0) #3, !dbg !30
+  call void @llvm.lifetime.end(i64 4, i8* %0), !dbg !31
+  %2 = load i32** %mem, align 4, !tbaa !32
+  %3 = bitcast i32* %2 to i8*
+  call void @_Z4sinkPKv(i8* %3) #3
+  %4 = load i8* @b, align 1, !tbaa !37, !range !39
+  %tobool = icmp ne i8 %4, 0
+  %inc = add nsw i32 %iter.02, 1
+  %cmp = icmp eq i32 %inc, 2
+  %or.cond = or i1 %tobool, %cmp
+  br i1 %or.cond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @_Z4sinkPKv(i8*) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #3
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #3
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!23, !24}
+!llvm.ident = !{!25}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !10, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<stdin>", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x13\00string\007\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS6string"} ; [ DW_TAG_structure_type ] [string] [line 7, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"repro.cpp", metadata !"/tmp/dbginfo"}
+!6 = metadata !{metadata !7}
+!7 = metadata !{metadata !"0xd\00mem\008\0032\0032\000\000", metadata !5, metadata !"_ZTS6string", metadata !8} ; [ DW_TAG_member ] [mem] [line 8, size 32, align 32, offset 0] [from ]
+!8 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from unsigned int]
+!9 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!10 = metadata !{metadata !11, metadata !17}
+!11 = metadata !{metadata !"0x2e\00f\00f\00_Z1fv\0014\000\001\000\006\00256\001\0014", metadata !5, metadata !12, metadata !13, null, null, null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 14] [def] [f]
+!12 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/repro.cpp]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{null}
+!15 = metadata !{metadata !16}
+!16 = metadata !{metadata !"0x100\00str2\0015\000", metadata !11, metadata !12, metadata !"_ZTS6string"} ; [ DW_TAG_auto_variable ] [str2] [line 15]
+!17 = metadata !{metadata !"0x2e\00s2\00s2\00_Z2s2P6string\0013\000\001\000\006\00256\001\0013", metadata !5, metadata !12, metadata !18, null, null, null, null, metadata !21} ; [ DW_TAG_subprogram ] [line 13] [def] [s2]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = metadata !{null, metadata !20}
+!20 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !"_ZTS6string"} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from _ZTS6string]
+!21 = metadata !{metadata !22}
+!22 = metadata !{metadata !"0x101\00lhs\0016777229\000", metadata !17, metadata !12, metadata !20} ; [ DW_TAG_arg_variable ] [lhs] [line 13]
+!23 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!24 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!25 = metadata !{metadata !"clang version 3.5.0 "}
+!26 = metadata !{i32 15, i32 0, metadata !11, null}
+!27 = metadata !{metadata !"0x101\00lhs\0016777229\000", metadata !17, metadata !12, metadata !20, metadata !28} ; [ DW_TAG_arg_variable ] [lhs] [line 13]
+!28 = metadata !{i32 16, i32 0, metadata !11, null}
+!29 = metadata !{i32 13, i32 0, metadata !17, metadata !28}
+!30 = metadata !{i32 17, i32 0, metadata !11, null}
+!31 = metadata !{i32 18, i32 0, metadata !11, null}
+!32 = metadata !{metadata !33, metadata !34, i64 0}
+!33 = metadata !{metadata !"_ZTS6string", metadata !34, i64 0}
+!34 = metadata !{metadata !"any pointer", metadata !35, i64 0}
+!35 = metadata !{metadata !"omnipotent char", metadata !36, i64 0}
+!36 = metadata !{metadata !"Simple C/C++ TBAA"}
+!37 = metadata !{metadata !38, metadata !38, i64 0}
+!38 = metadata !{metadata !"bool", metadata !35, i64 0}
+!39 = metadata !{i8 0, i8 2}

diff --git a/test/DebugInfo/X86/nondefault-subrange-array.ll b/test/DebugInfo/X86/nondefault-subrange-array.ll
index 4df1bd4..212114f 100644
--- a/test/DebugInfo/X86/nondefault-subrange-array.ll
+++ b/test/DebugInfo/X86/nondefault-subrange-array.ll

@@ -19,34 +19,34 @@
 ; CHECK: DW_TAG_subrange_type
 ; CHECK-NEXT:                   DW_AT_type [DW_FORM_ref4]  (cu + 0x{{[0-9a-f]*}} => {[[BASE2:0x[0-9a-f]*]]})
 ; CHECK-NEXT:                   DW_AT_lower_bound [DW_FORM_data8]       (0xfffffffffffffffd)
-; CHECK-NEXT:                   DW_AT_upper_bound [DW_FORM_data1]       (0x26)
+; CHECK-NEXT:                   DW_AT_count [DW_FORM_data1]       (0x2a)
 
 ; CHECK: [[BASE]]: DW_TAG_base_type
 ; CHECK: [[BASE2]]: DW_TAG_base_type
 ; CHECK-NEXT:                 DW_AT_name [DW_FORM_strp]       ( .debug_str[0x{{[0-9a-f]*}}] = "sizetype")
 ; CHECK-NEXT:                 DW_AT_byte_size [DW_FORM_data1] (0x08)
-; CHECK-NEXT:                 DW_AT_encoding [DW_FORM_data1]  (0x07)
+; CHECK-NEXT:                 DW_AT_encoding [DW_FORM_data1]  (DW_ATE_unsigned)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.3 (trunk 169136)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169136)\000\00\000\00\000", metadata !20, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
-!6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786434, metadata !20, null, metadata !"A", i32 1, i64 0, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !6, metadata !7, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x2\00A\001\000\0032\000\000\000", metadata !20, null, null, metadata !8, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{i32 786445, metadata !20, metadata !7, metadata !"x", i32 1, i64 0, i64 0, i64 0, i32 1, metadata !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
-!10 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xd\00x\001\000\000\000\001", metadata !20, metadata !7, metadata !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
+!10 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786465, i64 -3, i64 42} ; [ DW_TAG_subrange_type ] [-3, 39]
-!14 = metadata !{i32 786478, metadata !6, metadata !7, metadata !"A", metadata !"A", metadata !"", i32 1, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !18, i32 1} ; [ DW_TAG_subprogram ] [line 1] [A]
-!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x21\00-3\0042"} ; [ DW_TAG_subrange_type ] [-3, 38]
+!14 = metadata !{metadata !"0x2e\00A\00A\00\001\000\000\000\006\00320\000\001", metadata !6, metadata !7, metadata !15, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 1] [A]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17}
-!17 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
 !18 = metadata !{metadata !19}
-!19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
 !20 = metadata !{metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/objc-fwd-decl.ll b/test/DebugInfo/X86/objc-fwd-decl.ll
index 1ec56be..e6144d0 100644
--- a/test/DebugInfo/X86/objc-fwd-decl.ll
+++ b/test/DebugInfo/X86/objc-fwd-decl.ll

@@ -12,16 +12,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !10, !11, !12, !14}
 
-!0 = metadata !{i32 786449, metadata !13, i32 16, metadata !"clang version 3.1 (trunk 152054 trunk 152094)", i1 false, metadata !"", i32 2, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0016\00clang version 3.1 (trunk 152054 trunk 152094)\000\00\002\00\000", metadata !13, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 3, metadata !7, i32 0, i32 1, %0** @a, null} ; [ DW_TAG_variable ]
-!6 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !13, null, metadata !"FooBarBaz", i32 1, i32 0, i32 0, i32 0, i32 4, null, null, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [FooBarBaz] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!5 = metadata !{metadata !"0x34\00a\00a\00\003\000\001", null, metadata !6, metadata !7, %0** @a, null} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ]
+!8 = metadata !{metadata !"0x13\00FooBarBaz\001\000\000\000\004\0016", metadata !13, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [FooBarBaz] [line 1, size 0, align 0, offset 0] [decl] [from ]
 !9 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
 !10 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
 !11 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
 !12 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
 !13 = metadata !{metadata !"foo.m", metadata !"/Users/echristo"}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/objc-property-void.ll b/test/DebugInfo/X86/objc-property-void.ll
index d366a7a..0f50869 100644
--- a/test/DebugInfo/X86/objc-property-void.ll
+++ b/test/DebugInfo/X86/objc-property-void.ll

@@ -56,14 +56,14 @@
   %self.addr = alloca %0*, align 8
   %_cmd.addr = alloca i8*, align 8
   store %0* %self, %0** %self.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%0** %self.addr}, metadata !24), !dbg !26
+  call void @llvm.dbg.declare(metadata !{%0** %self.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !26
   store i8* %_cmd, i8** %_cmd.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %_cmd.addr}, metadata !27), !dbg !26
+  call void @llvm.dbg.declare(metadata !{i8** %_cmd.addr}, metadata !27, metadata !{metadata !"0x102"}), !dbg !26
   ret void, !dbg !29
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -72,33 +72,33 @@
 !llvm.module.flags = !{!17, !18, !19, !20, !21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{i32 786449, metadata !1, i32 16, metadata !"", i1 false, metadata !"", i32 2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [] [DW_LANG_ObjC]
+!0 = metadata !{metadata !"0x11\0016\00\000\00\002\00\000", metadata !1, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [] [DW_LANG_ObjC]
 !1 = metadata !{metadata !"-", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"Foo", i32 1, i64 0, i64 8, i32 0, i32 512, null, metadata !7, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [Foo] [line 1, size 0, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00Foo\001\000\008\000\00512\0016", metadata !5, metadata !6, null, metadata !7, null, null, null} ; [ DW_TAG_structure_type ] [Foo] [line 1, size 0, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !"<stdin>", metadata !""}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] []
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] []
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 803328, metadata !"foo", metadata !6, i32 2, metadata !"", metadata !"", i32 2117, null} ; [ DW_TAG_APPLE_property ] [foo] [line 2, properties 2117]
+!8 = metadata !{metadata !"0x4200\00foo\002\00\00\002117", metadata !6, null} ; [ DW_TAG_APPLE_property ] [foo] [line 2, properties 2117]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"-[Foo foo]", metadata !"-[Foo foo]", metadata !"", i32 5, metadata !11, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%0*, i8*)* @"\01-[Foo foo]", null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [-[Foo foo]]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x2e\00-[Foo foo]\00-[Foo foo]\00\005\001\001\000\006\00256\000\005", metadata !5, metadata !6, metadata !11, null, void (%0*, i8*)* @"\01-[Foo foo]", null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [-[Foo foo]]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13, metadata !14}
-!13 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Foo]
-!14 = metadata !{i32 786454, metadata !5, null, metadata !"SEL", i32 5, i64 0, i64 0, i64 0, i32 64, metadata !15} ; [ DW_TAG_typedef ] [SEL] [line 5, size 0, align 0, offset 0] [artificial] [from ]
-!15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
-!16 = metadata !{i32 786451, metadata !1, null, metadata !"objc_selector", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Foo]
+!14 = metadata !{metadata !"0x16\00SEL\005\000\000\000\0064", metadata !5, null, metadata !15} ; [ DW_TAG_typedef ] [SEL] [line 5, size 0, align 0, offset 0] [artificial] [from ]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
+!16 = metadata !{metadata !"0x13\00objc_selector\000\000\000\000\004\000", metadata !1, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
 !17 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
 !18 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
 !19 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
 !20 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
 !21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !23 = metadata !{metadata !""}
-!24 = metadata !{i32 786689, metadata !10, metadata !"self", null, i32 16777216, metadata !25, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [self] [line 0]
-!25 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Foo]
+!24 = metadata !{metadata !"0x101\00self\0016777216\001088", metadata !10, null, metadata !25} ; [ DW_TAG_arg_variable ] [self] [line 0]
+!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Foo]
 !26 = metadata !{i32 0, i32 0, metadata !10, null}
-!27 = metadata !{i32 786689, metadata !10, metadata !"_cmd", null, i32 33554432, metadata !28, i32 64, i32 0} ; [ DW_TAG_arg_variable ] [_cmd] [line 0]
-!28 = metadata !{i32 786454, metadata !5, null, metadata !"SEL", i32 5, i64 0, i64 0, i64 0, i32 0, metadata !15} ; [ DW_TAG_typedef ] [SEL] [line 5, size 0, align 0, offset 0] [from ]
+!27 = metadata !{metadata !"0x101\00_cmd\0033554432\0064", metadata !10, null, metadata !28} ; [ DW_TAG_arg_variable ] [_cmd] [line 0]
+!28 = metadata !{metadata !"0x16\00SEL\005\000\000\000\000", metadata !5, null, metadata !15} ; [ DW_TAG_typedef ] [SEL] [line 5, size 0, align 0, offset 0] [from ]
 !29 = metadata !{i32 5, i32 0, metadata !10, null}

diff --git a/test/DebugInfo/X86/op_deref.ll b/test/DebugInfo/X86/op_deref.ll
index 31003ee..18c4fc1 100644
--- a/test/DebugInfo/X86/op_deref.ll
+++ b/test/DebugInfo/X86/op_deref.ll

@@ -29,14 +29,14 @@
   %saved_stack = alloca i8*
   %i = alloca i32, align 4
   store i32 %s, i32* %s.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %s.addr}, metadata !10), !dbg !11
+  call void @llvm.dbg.declare(metadata !{i32* %s.addr}, metadata !10, metadata !{metadata !"0x102"}), !dbg !11
   %0 = load i32* %s.addr, align 4, !dbg !12
   %1 = zext i32 %0 to i64, !dbg !12
   %2 = call i8* @llvm.stacksave(), !dbg !12
   store i8* %2, i8** %saved_stack, !dbg !12
   %vla = alloca i32, i64 %1, align 16, !dbg !12
-  call void @llvm.dbg.declare(metadata !{i32* %vla}, metadata !14), !dbg !18
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !19), !dbg !20
+  call void @llvm.dbg.declare(metadata !{i32* %vla}, metadata !14, metadata !30), !dbg !18
+  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
   store i32 0, i32* %i, align 4, !dbg !21
   br label %for.cond, !dbg !21
 
@@ -68,7 +68,7 @@
   ret void, !dbg !27
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i8* @llvm.stacksave() nounwind
 
@@ -77,32 +77,32 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{i32 786449, metadata !28, i32 12, metadata !"clang version 3.2 (trunk 156005) (llvm/trunk 156000)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 156005) (llvm/trunk 156000)\000\00\000\00\001", metadata !28, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !28, metadata !6, metadata !"testVLAwithSize", metadata !"testVLAwithSize", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @testVLAwithSize, null, null, metadata !1, i32 2} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00testVLAwithSize\00testVLAwithSize\00\001\000\001\000\006\00256\000\002", metadata !28, metadata !6, metadata !7, null, void (i32)* @testVLAwithSize, null, null, metadata !1} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786689, metadata !5, metadata !"s", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x101\00s\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
 !11 = metadata !{i32 1, i32 26, metadata !5, null}
 !12 = metadata !{i32 3, i32 13, metadata !13, null}
-!13 = metadata !{i32 786443, metadata !28, metadata !5, i32 2, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{i32 786688, metadata !13, metadata !"vla", metadata !6, i32 3, metadata !15, i32 8192, i32 0, metadata !30} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!13 = metadata !{metadata !"0xb\002\001\000", metadata !28, metadata !5} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0x100\00vla\003\008192", metadata !13, metadata !6, metadata !15} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !16 = metadata !{metadata !17}
-!17 = metadata !{i32 786465, i64 0, i64 -1}        ; [ DW_TAG_subrange_type ]
+!17 = metadata !{metadata !"0x21\000\00-1"}        ; [ DW_TAG_subrange_type ]
 !18 = metadata !{i32 3, i32 7, metadata !13, null}
-!19 = metadata !{i32 786688, metadata !13, metadata !"i", metadata !6, i32 4, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
+!19 = metadata !{metadata !"0x100\00i\004\000", metadata !13, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
 !20 = metadata !{i32 4, i32 7, metadata !13, null}
 !21 = metadata !{i32 5, i32 8, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !28, metadata !13, i32 5, i32 3, i32 1} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\005\003\001", metadata !28, metadata !13} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{i32 6, i32 5, metadata !24, null}
-!24 = metadata !{i32 786443, metadata !28, metadata !22, i32 5, i32 27, i32 2} ; [ DW_TAG_lexical_block ]
+!24 = metadata !{metadata !"0xb\005\0027\002", metadata !28, metadata !22} ; [ DW_TAG_lexical_block ]
 !25 = metadata !{i32 7, i32 3, metadata !24, null}
 !26 = metadata !{i32 5, i32 22, metadata !22, null}
 !27 = metadata !{i32 8, i32 1, metadata !13, null}
 !28 = metadata !{metadata !"bar.c", metadata !"/Users/echristo/tmp"}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!30 = metadata !{i64 2}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!30 = metadata !{metadata !"0x102\006"} ; [ DW_TAG_expression ] [DW_OP_deref]

diff --git a/test/DebugInfo/X86/parameters.ll b/test/DebugInfo/X86/parameters.ll
index 4215c21..fde63e7 100644
--- a/test/DebugInfo/X86/parameters.ll
+++ b/test/DebugInfo/X86/parameters.ll

@@ -42,13 +42,13 @@
 ; Function Attrs: uwtable
 define void @_ZN7pr147634funcENS_3fooE(%"struct.pr14763::foo"* noalias sret %agg.result, %"struct.pr14763::foo"* %f) #0 {
 entry:
-  call void @llvm.dbg.declare(metadata !{%"struct.pr14763::foo"* %f}, metadata !22), !dbg !24
+  call void @llvm.dbg.declare(metadata !{%"struct.pr14763::foo"* %f}, metadata !22, metadata !{metadata !"0x102"}), !dbg !24
   call void @_ZN7pr147633fooC1ERKS0_(%"struct.pr14763::foo"* %agg.result, %"struct.pr14763::foo"* %f), !dbg !25
   ret void, !dbg !25
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare void @_ZN7pr147633fooC1ERKS0_(%"struct.pr14763::foo"*, %"struct.pr14763::foo"*) #2
 
@@ -58,8 +58,8 @@
   %b.addr = alloca i8, align 1
   %frombool = zext i1 %b to i8
   store i8 %frombool, i8* %b.addr, align 1
-  call void @llvm.dbg.declare(metadata !{i8* %b.addr}, metadata !26), !dbg !27
-  call void @llvm.dbg.declare(metadata !{%"struct.pr14763::foo"* %g}, metadata !28), !dbg !27
+  call void @llvm.dbg.declare(metadata !{i8* %b.addr}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
+  call void @llvm.dbg.declare(metadata !{%"struct.pr14763::foo"* %g}, metadata !28, metadata !{metadata !"0x102"}), !dbg !27
   %0 = load i8* %b.addr, align 1, !dbg !29
   %tobool = trunc i8 %0 to i1, !dbg !29
   br i1 %tobool, label %if.then, label %if.end, !dbg !29
@@ -82,37 +82,37 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21, !33}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/pass.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/pass.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"pass.cpp", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !17}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_ZN7pr147634funcENS_3fooE", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%"struct.pr14763::foo"*, %"struct.pr14763::foo"*)* @_ZN7pr147634funcENS_3fooE, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
-!5 = metadata !{i32 786489, metadata !1, null, metadata !"pr14763", i32 1} ; [ DW_TAG_namespace ] [pr14763] [line 1]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00func\00func\00_ZN7pr147634funcENS_3fooE\006\000\001\000\006\00256\000\006", metadata !1, metadata !5, metadata !6, null, void (%"struct.pr14763::foo"*, %"struct.pr14763::foo"*)* @_ZN7pr147634funcENS_3fooE, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
+!5 = metadata !{metadata !"0x39\00pr14763\001", metadata !1, null} ; [ DW_TAG_namespace ] [pr14763] [line 1]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786451, metadata !1, metadata !5, metadata !"foo", i32 2, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 2, size 8, align 8, offset 0] [def] [from ]
+!8 = metadata !{metadata !"0x13\00foo\002\008\008\000\000\000", metadata !1, metadata !5, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 2, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786478, metadata !1, metadata !8, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !16, i32 3} ; [ DW_TAG_subprogram ] [line 3] [foo]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\000\000\006\00256\000\003", metadata !1, metadata !8, metadata !11, null, null, null, i32 0, metadata !16} ; [ DW_TAG_subprogram ] [line 3] [foo]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13, metadata !14}
-!13 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
-!14 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !15} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!15 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
+!14 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !15} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
 !16 = metadata !{i32 786468}
-!17 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func2", metadata !"func2", metadata !"_ZN7pr147635func2EbNS_3fooE", i32 12, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i1, %"struct.pr14763::foo"*)* @_ZN7pr147635func2EbNS_3fooE, null, null, metadata !2, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [func2]
-!18 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0x2e\00func2\00func2\00_ZN7pr147635func2EbNS_3fooE\0012\000\001\000\006\00256\000\0012", metadata !1, metadata !5, metadata !18, null, void (i1, %"struct.pr14763::foo"*)* @_ZN7pr147635func2EbNS_3fooE, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 12] [def] [func2]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{null, metadata !20, metadata !8}
-!20 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!20 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
 !21 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!22 = metadata !{i32 786689, metadata !4, metadata !"f", metadata !23, i32 16777222, metadata !8, i32 8192, i32 0} ; [ DW_TAG_arg_variable ] [f] [line 6]
-!23 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/pass.cpp]
+!22 = metadata !{metadata !"0x101\00f\0016777222\008192", metadata !4, metadata !23, metadata !8} ; [ DW_TAG_arg_variable ] [f] [line 6]
+!23 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/pass.cpp]
 !24 = metadata !{i32 6, i32 0, metadata !4, null}
 !25 = metadata !{i32 7, i32 0, metadata !4, null}
-!26 = metadata !{i32 786689, metadata !17, metadata !"b", metadata !23, i32 16777228, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 12]
+!26 = metadata !{metadata !"0x101\00b\0016777228\000", metadata !17, metadata !23, metadata !20} ; [ DW_TAG_arg_variable ] [b] [line 12]
 !27 = metadata !{i32 12, i32 0, metadata !17, null}
-!28 = metadata !{i32 786689, metadata !17, metadata !"g", metadata !23, i32 33554444, metadata !8, i32 8192, i32 0} ; [ DW_TAG_arg_variable ] [g] [line 12]
+!28 = metadata !{metadata !"0x101\00g\0033554444\008192", metadata !17, metadata !23, metadata !8} ; [ DW_TAG_arg_variable ] [g] [line 12]
 !29 = metadata !{i32 13, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !1, metadata !17, i32 13, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/pass.cpp]
+!30 = metadata !{metadata !"0xb\0013\000\000", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [/tmp/pass.cpp]
 !31 = metadata !{i32 14, i32 0, metadata !30, null}
 !32 = metadata !{i32 15, i32 0, metadata !17, null}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/pieces-1.ll b/test/DebugInfo/X86/pieces-1.ll
new file mode 100644
index 0000000..db36b03
--- /dev/null
+++ b/test/DebugInfo/X86/pieces-1.ll

@@ -0,0 +1,79 @@
+; RUN: llc -O0 %s -filetype=obj -o %t.o
+; RUN: llvm-dwarfdump -debug-dump=loc %t.o | FileCheck %s
+;
+; rdar://problem/15928306
+;
+; Test that we can emit debug info for aggregate values that are split
+; up across multiple registers by SROA.
+;
+;    // Compile with -O1.
+;    typedef struct { long int a; int b;} S;
+;
+;    int foo(S s) {
+;            return s.b;
+;    }
+;
+;
+; CHECK: .debug_loc contents:
+;
+
+; 0x0000000000000000 - 0x0000000000000006: rdi, piece 0x00000008, rsi, piece 0x00000004
+; CHECK:            Beginning address offset: 0x0000000000000000
+; CHECK:               Ending address offset: [[LTMP3:.*]]
+; CHECK:                Location description: 55 93 08 54 93 04
+; 0x0000000000000006 - 0x0000000000000008: rbp-8, piece 0x00000008, rax, piece 0x00000004 )
+; CHECK:            Beginning address offset: [[LTMP3]]
+; CHECK:               Ending address offset: [[END:.*]]
+; CHECK:                Location description: 76 78 93 08 54 93 04
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo(i64 %s.coerce0, i32 %s.coerce1) #0 {
+entry:
+  call void @llvm.dbg.value(metadata !{i64 %s.coerce0}, i64 0, metadata !20, metadata !24), !dbg !21
+  call void @llvm.dbg.value(metadata !{i32 %s.coerce1}, i64 0, metadata !22, metadata !27), !dbg !21
+  ret i32 %s.coerce1, !dbg !23
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!17, !18}
+!llvm.ident = !{!19}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"pieces.c", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, i32 (i64, i32)* @foo, null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/pieces.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !9}
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x16\00S\001\000\000\000\000", metadata !1, null, metadata !10} ; [ DW_TAG_typedef ] [S] [line 1, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x13\00\001\00128\0064\000\000\000", metadata !1, null, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 128, align 64, offset 0] [def] [from ]
+!11 = metadata !{metadata !12, metadata !14}
+!12 = metadata !{metadata !"0xd\00a\001\0064\0064\000\000", metadata !1, metadata !10, metadata !13} ; [ DW_TAG_member ] [a] [line 1, size 64, align 64, offset 0] [from long int]
+!13 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !"0xd\00b\001\0032\0032\0064\000", metadata !1, metadata !10, metadata !8} ; [ DW_TAG_member ] [b] [line 1, size 32, align 32, offset 64] [from int]
+!15 = metadata !{metadata !16}
+!16 = metadata !{metadata !"0x101\00s\0016777219\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!19 = metadata !{metadata !"clang version 3.5 "}
+!20 = metadata !{metadata !"0x101\00s\0016777219\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!21 = metadata !{i32 3, i32 0, metadata !4, null}
+!22 = metadata !{metadata !"0x101\00s\0016777219\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [s] [line 3]
+!23 = metadata !{i32 4, i32 0, metadata !4, null}
+!24 = metadata !{metadata !"0x102\00147\000\008"} ; [ DW_TAG_expression ] [DW_OP_piece 0 8] [piece, size 8, offset 0]
+!25 = metadata !{}
+!27 = metadata !{metadata !"0x102\00147\008\004"} ; [ DW_TAG_expression ] [DW_OP_piece 8 4] [piece, size 4, offset 8]

diff --git a/test/DebugInfo/X86/pieces-2.ll b/test/DebugInfo/X86/pieces-2.ll
new file mode 100644
index 0000000..760c9f6
--- /dev/null
+++ b/test/DebugInfo/X86/pieces-2.ll

@@ -0,0 +1,91 @@
+; RUN: llc %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+;
+;    // Compile with -O1
+;    typedef struct {
+;      int a;
+;      long int b;
+;    } Inner;
+;
+;    typedef struct {
+;      Inner inner[2];
+;    } Outer;
+;
+;    int foo(Outer outer) {
+;      Inner i1 = outer.inner[1];
+;      return i1.a;
+;    }
+;
+;
+; CHECK: DW_TAG_variable [4]
+;                                                  rax, piece 0x00000004
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1]{{.*}}50 93 04
+; CHECK-NEXT:  DW_AT_name {{.*}}"i1"
+;
+; ModuleID = '/Volumes/Data/llvm/test/DebugInfo/X86/sroasplit-1.ll'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%struct.Outer = type { [2 x %struct.Inner] }
+%struct.Inner = type { i32, i64 }
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo(%struct.Outer* byval align 8 %outer) #0 {
+entry:
+  call void @llvm.dbg.declare(metadata !{%struct.Outer* %outer}, metadata !25, metadata !{metadata !"0x102"}), !dbg !26
+  %i1.sroa.0.0..sroa_idx = getelementptr inbounds %struct.Outer* %outer, i64 0, i32 0, i64 1, i32 0, !dbg !27
+  %i1.sroa.0.0.copyload = load i32* %i1.sroa.0.0..sroa_idx, align 8, !dbg !27
+  call void @llvm.dbg.value(metadata !{i32 %i1.sroa.0.0.copyload}, i64 0, metadata !28, metadata !29), !dbg !27
+  %i1.sroa.2.0..sroa_raw_cast = bitcast %struct.Outer* %outer to i8*, !dbg !27
+  %i1.sroa.2.0..sroa_raw_idx = getelementptr inbounds i8* %i1.sroa.2.0..sroa_raw_cast, i64 20, !dbg !27
+  ret i32 %i1.sroa.0.0.copyload, !dbg !32
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #2
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22, !23}
+!llvm.ident = !{!24}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/sroasplit-1.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"sroasplit-1.c", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\0010\000\001\000\006\00256\000\0010", metadata !1, metadata !5, metadata !6, null, i32 (%struct.Outer*)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/sroasplit-1.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !9}
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x16\00Outer\008\000\000\000\000", metadata !1, null, metadata !10} ; [ DW_TAG_typedef ] [Outer] [line 8, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x13\00\006\00256\0064\000\000\000", metadata !1, null, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [line 6, size 256, align 64, offset 0] [def] [from ]
+!11 = metadata !{metadata !12}
+!12 = metadata !{metadata !"0xd\00inner\007\00256\0064\000\000", metadata !1, metadata !10, metadata !13} ; [ DW_TAG_member ] [inner] [line 7, size 256, align 64, offset 0] [from ]
+!13 = metadata !{metadata !"0x1\00\000\00256\0064\000\000", null, null, metadata !14, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from Inner]
+!14 = metadata !{metadata !"0x16\00Inner\004\000\000\000\000", metadata !1, null, metadata !15} ; [ DW_TAG_typedef ] [Inner] [line 4, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x13\00\001\00128\0064\000\000\000", metadata !1, null, null, metadata !16, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 128, align 64, offset 0] [def] [from ]
+!16 = metadata !{metadata !17, metadata !18}
+!17 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !15, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!18 = metadata !{metadata !"0xd\00b\003\0064\0064\0064\000", metadata !1, metadata !15, metadata !19} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from long int]
+!19 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!20 = metadata !{metadata !21}
+!21 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
+!22 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!24 = metadata !{metadata !"clang version 3.5.0 "}
+!25 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!26 = metadata !{i32 10, i32 0, metadata !4, null}
+!27 = metadata !{i32 11, i32 0, metadata !4, null}
+!28 = metadata !{metadata !"0x100\00i1\0011\000", metadata !4, metadata !5, metadata !14} ; [ DW_TAG_auto_variable ] [i1] [line 11]
+!29 = metadata !{metadata !"0x102\00147\000\004"} ; [ DW_TAG_expression ] [DW_OP_piece 0 4] [piece, size 4, offset 0]
+!31 = metadata !{i32 3, i32 0, i32 12}
+!32 = metadata !{i32 12, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/pieces-3.ll b/test/DebugInfo/X86/pieces-3.ll
new file mode 100644
index 0000000..5dd480d
--- /dev/null
+++ b/test/DebugInfo/X86/pieces-3.ll

@@ -0,0 +1,106 @@
+; RUN: llc %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+;
+;    // Compile with -O1
+;    typedef struct {
+;      int a;
+;      int b;
+;    } Inner;
+;
+;    typedef struct {
+;      Inner inner[2];
+;    } Outer;
+;
+;    int foo(Outer outer) {
+;      Inner i1 = outer.inner[1];
+;      return i1.a;
+;    }
+;
+; CHECK: DW_TAG_formal_parameter [3]
+; CHECK-NEXT:   DW_AT_location [DW_FORM_data4]        ([[LOC:.*]])
+; CHECK-NEXT:   DW_AT_name {{.*}}"outer"
+; CHECK: DW_TAG_variable
+;                                                 rsi, piece 0x00000004
+; CHECK-NEXT:   DW_AT_location [DW_FORM_block1]       {{.*}} 54 93 04
+; CHECK-NEXT:   "i1"
+;
+; CHECK: .debug_loc
+; CHECK: [[LOC]]:
+; CHECK: Beginning address offset: 0x0000000000000000
+; CHECK:    Ending address offset: 0x0000000000000004
+; rdi, piece 0x00000008, piece 0x00000004, rsi, piece 0x00000004
+; CHECK: Location description: 55 93 08 93 04 54 93 04 
+;
+; ModuleID = '/Volumes/Data/llvm/test/DebugInfo/X86/sroasplit-2.ll'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo(i64 %outer.coerce0, i64 %outer.coerce1) #0 {
+  call void @llvm.dbg.value(metadata !{i64 %outer.coerce0}, i64 0, metadata !24, metadata !25), !dbg !26
+  call void @llvm.dbg.declare(metadata !{null}, metadata !27, metadata !28), !dbg !26
+  call void @llvm.dbg.value(metadata !{i64 %outer.coerce1}, i64 0, metadata !29, metadata !30), !dbg !26
+  call void @llvm.dbg.declare(metadata !{null}, metadata !31, metadata !32), !dbg !26
+  %outer.sroa.1.8.extract.trunc = trunc i64 %outer.coerce1 to i32, !dbg !33
+  call void @llvm.dbg.value(metadata !{i32 %outer.sroa.1.8.extract.trunc}, i64 0, metadata !34, metadata !35), !dbg !33
+  %outer.sroa.1.12.extract.shift = lshr i64 %outer.coerce1, 32, !dbg !33
+  %outer.sroa.1.12.extract.trunc = trunc i64 %outer.sroa.1.12.extract.shift to i32, !dbg !33
+  call void @llvm.dbg.value(metadata !{i64 %outer.sroa.1.12.extract.shift}, i64 0, metadata !34, metadata !35), !dbg !33
+  call void @llvm.dbg.value(metadata !{i32 %outer.sroa.1.12.extract.trunc}, i64 0, metadata !34, metadata !35), !dbg !33
+  call void @llvm.dbg.declare(metadata !{null}, metadata !34, metadata !35), !dbg !33
+  ret i32 %outer.sroa.1.8.extract.trunc, !dbg !36
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #2
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "no-frame-pointer-elim"="true" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21, !22}
+!llvm.ident = !{!23}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/sroasplit-2.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"sroasplit-2.c", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\0010\000\001\000\006\00256\000\0010", metadata !1, metadata !5, metadata !6, null, i32 (i64, i64)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/sroasplit-2.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !9}
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x16\00Outer\008\000\000\000\000", metadata !1, null, metadata !10} ; [ DW_TAG_typedef ] [Outer] [line 8, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x13\00\006\00128\0032\000\000\000", metadata !1, null, null, metadata !11, null, null, null} ; [ DW_TAG_structure_type ] [line 6, size 128, align 32, offset 0] [def] [from ]
+!11 = metadata !{metadata !12}
+!12 = metadata !{metadata !"0xd\00inner\007\00128\0032\000\000", metadata !1, metadata !10, metadata !13} ; [ DW_TAG_member ] [inner] [line 7, size 128, align 32, offset 0] [from ]
+!13 = metadata !{metadata !"0x1\00\000\00128\0032\000\000", null, null, metadata !14, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 32, offset 0] [from Inner]
+!14 = metadata !{metadata !"0x16\00Inner\004\000\000\000\000", metadata !1, null, metadata !15} ; [ DW_TAG_typedef ] [Inner] [line 4, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x13\00\001\0064\0032\000\000\000", metadata !1, null, null, metadata !16, null, null, null} ; [ DW_TAG_structure_type ] [line 1, size 64, align 32, offset 0] [def] [from ]
+!16 = metadata !{metadata !17, metadata !18}
+!17 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !15, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!18 = metadata !{metadata !"0xd\00b\003\0032\0032\0032\000", metadata !1, metadata !15, metadata !8} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from int]
+!19 = metadata !{metadata !20}
+!20 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
+!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!23 = metadata !{metadata !"clang version 3.5.0 "}
+!24 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!25 = metadata !{metadata !"0x102\00147\000\008"} ; [ DW_TAG_expression ] [DW_OP_piece 0 8] [piece, size 8, offset 0]
+!26 = metadata !{i32 10, i32 0, metadata !4, null}
+!27 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!28 = metadata !{metadata !"0x102\00147\008\008"} ; [ DW_TAG_expression ] [DW_OP_piece 8 8] [piece, size 8, offset 8]
+!29 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!30 = metadata !{metadata !"0x102\00147\0012\004"} ; [ DW_TAG_expression ] [DW_OP_piece 12 4] [piece, size 4, offset 12]
+!31 = metadata !{metadata !"0x101\00outer\0016777226\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [outer] [line 10]
+!32 = metadata !{metadata !"0x102\00147\008\004"} ; [ DW_TAG_expression ] [DW_OP_piece 8 4] [piece, size 4, offset 8]
+!33 = metadata !{i32 11, i32 0, metadata !4, null}
+!34 = metadata !{metadata !"0x100\00i1\0011\000", metadata !4, metadata !5, metadata !14} ; [ DW_TAG_auto_variable ] [i1] [line 11]
+!35 = metadata !{metadata !"0x102\00147\000\004"} ; [ DW_TAG_expression ] [DW_OP_piece 0 4] [piece, size 4, offset 0]
+!36 = metadata !{i32 12, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/pointer-type-size.ll b/test/DebugInfo/X86/pointer-type-size.ll
index 40dc955..1280181 100644
--- a/test/DebugInfo/X86/pointer-type-size.ll
+++ b/test/DebugInfo/X86/pointer-type-size.ll

@@ -11,16 +11,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!14}
 
-!0 = metadata !{i32 786449, metadata !13, i32 12, metadata !"clang version 3.1 (trunk 147882)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 147882)\000\00\000\00\000", metadata !13, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720948, i32 0, null, metadata !"crass", metadata !"crass", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, %struct.crass* @crass, null} ; [ DW_TAG_variable ]
-!6 = metadata !{i32 720937, metadata !13} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786451, metadata !13, null, metadata !"crass", i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [crass] [line 1, size 64, align 64, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x34\00crass\00crass\00\001\000\001", null, metadata !6, metadata !7, %struct.crass* @crass, null} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x13\00crass\001\0064\0064\000\000\000", metadata !13, null, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [crass] [line 1, size 64, align 64, offset 0] [def] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786445, metadata !13, metadata !7, metadata !"ptr", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 720934, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_const_type ]
-!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0xd\00ptr\001\0064\0064\000\000", metadata !13, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !11} ; [ DW_TAG_const_type ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
 !13 = metadata !{metadata !"foo.c", metadata !"/Users/echristo/tmp"}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/pr11300.ll b/test/DebugInfo/X86/pr11300.ll
index 11c409c..4fdbbed 100644
--- a/test/DebugInfo/X86/pr11300.ll
+++ b/test/DebugInfo/X86/pr11300.ll

@@ -6,11 +6,11 @@
 ; Skip the definition of zed(foo*)
 ; CHECK: DW_TAG_subprogram
 ; CHECK: DW_TAG_class_type
-; CHECK: [[BAR_DECL:0x[0-9a-f]*]]:     DW_TAG_subprogram
+; CHECK:   DW_TAG_subprogram
 ; CHECK:     DW_AT_MIPS_linkage_name {{.*}} "_ZN3foo3barEv"
 ; CHECK: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_specification {{.*}} {[[BAR_DECL]]}
+; CHECK:   DW_AT_specification {{.*}} "_ZN3foo3barEv"
 
 %struct.foo = type { i8 }
 
@@ -18,19 +18,19 @@
 entry:
   %x.addr = alloca %struct.foo*, align 8
   store %struct.foo* %x, %struct.foo** %x.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %x.addr}, metadata !23), !dbg !24
+  call void @llvm.dbg.declare(metadata !{%struct.foo** %x.addr}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
   %0 = load %struct.foo** %x.addr, align 8, !dbg !25
   call void @_ZN3foo3barEv(%struct.foo* %0), !dbg !25
   ret void, !dbg !27
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define linkonce_odr void @_ZN3foo3barEv(%struct.foo* %this) nounwind uwtable align 2 {
 entry:
   %this.addr = alloca %struct.foo*, align 8
   store %struct.foo* %this, %struct.foo** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !28), !dbg !29
+  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !29
   %this1 = load %struct.foo** %this.addr
   ret void, !dbg !30
 }
@@ -38,33 +38,33 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786449, metadata !32, i32 4, metadata !"clang version 3.0 ()", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.0 ()\000\00\000\00\000", metadata !32, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !20}
-!5 = metadata !{i32 720942, metadata !6, metadata !6, metadata !"zed", metadata !"zed", metadata !"_Z3zedP3foo", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.foo*)* @_Z3zedP3foo, null, null, null, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [zed]
-!6 = metadata !{i32 720937, metadata !32} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00zed\00zed\00_Z3zedP3foo\004\000\001\000\006\00256\000\004", metadata !6, metadata !6, metadata !7, null, void (%struct.foo*)* @_Z3zedP3foo, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [zed]
+!6 = metadata !{metadata !"0x29", metadata !32} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 720898, metadata !32, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_class_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x2\00foo\001\008\008\000\000\000", metadata !32, null, null, metadata !11, null, null, null} ; [ DW_TAG_class_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 720942, metadata !6, metadata !10, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 2, metadata !13, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !16, i32 2} ; [ DW_TAG_subprogram ]
-!13 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\002\000\000\000\006\00256\000\002", metadata !6, metadata !10, metadata !13, null, null, null, i32 0, metadata !16} ; [ DW_TAG_subprogram ]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{null, metadata !15}
-!15 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !10} ; [ DW_TAG_pointer_type ]
 !16 = metadata !{metadata !17}
-!17 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
 !18 = metadata !{metadata !19}
-!19 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!20 = metadata !{i32 720942, metadata !6, null, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 2, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.foo*)* @_ZN3foo3barEv, null, metadata !12, null, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
-!23 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!20 = metadata !{metadata !"0x2e\00bar\00bar\00_ZN3foo3barEv\002\000\001\000\006\00256\000\002", metadata !6, null, metadata !13, null, void (%struct.foo*)* @_ZN3foo3barEv, null, metadata !12, null} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!23 = metadata !{metadata !"0x101\00x\0016777220\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
 !24 = metadata !{i32 4, i32 15, metadata !5, null}
 !25 = metadata !{i32 4, i32 20, metadata !26, null}
-!26 = metadata !{i32 786443, metadata !6, metadata !5, i32 4, i32 18, i32 0} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{metadata !"0xb\004\0018\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
 !27 = metadata !{i32 4, i32 30, metadata !26, null}
-!28 = metadata !{i32 786689, metadata !20, metadata !"this", metadata !6, i32 16777218, metadata !15, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!28 = metadata !{metadata !"0x101\00this\0016777218\0064", metadata !20, metadata !6, metadata !15} ; [ DW_TAG_arg_variable ]
 !29 = metadata !{i32 2, i32 8, metadata !20, null}
 !30 = metadata !{i32 2, i32 15, metadata !31, null}
-!31 = metadata !{i32 786443, metadata !6, metadata !20, i32 2, i32 14, i32 1} ; [ DW_TAG_lexical_block ]
+!31 = metadata !{metadata !"0xb\002\0014\001", metadata !6, metadata !20} ; [ DW_TAG_lexical_block ]
 !32 = metadata !{metadata !"/home/espindola/llvm/test.cc", metadata !"/home/espindola/tmpfs/build"}
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/pr12831.ll b/test/DebugInfo/X86/pr12831.ll
index 79d00ed..3951bbd 100644
--- a/test/DebugInfo/X86/pr12831.ll
+++ b/test/DebugInfo/X86/pr12831.ll

@@ -9,8 +9,8 @@
 %class.anon = type { i8 }
 %class.anon.0 = type { i8 }
 
-@"_ZN8functionIFvvEEC1IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_" = alias internal void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_"
-@"_ZN8functionIFvvEEC1IZN17BPLFunctionWriter9writeExprEvE3$_0EET_" = alias internal void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_"
+@"_ZN8functionIFvvEEC1IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_" = internal alias void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_"
+@"_ZN8functionIFvvEEC1IZN17BPLFunctionWriter9writeExprEvE3$_0EET_" = internal alias void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_"
 
 define void @_ZN17BPLFunctionWriter9writeExprEv(%class.BPLFunctionWriter* %this) nounwind uwtable align 2 {
 entry:
@@ -20,7 +20,7 @@
   %agg.tmp4 = alloca %class.function, align 1
   %agg.tmp5 = alloca %class.anon.0, align 1
   store %class.BPLFunctionWriter* %this, %class.BPLFunctionWriter** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.BPLFunctionWriter** %this.addr}, metadata !133), !dbg !135
+  call void @llvm.dbg.declare(metadata !{%class.BPLFunctionWriter** %this.addr}, metadata !133, metadata !{metadata !"0x102"}), !dbg !135
   %this1 = load %class.BPLFunctionWriter** %this.addr
   %MW = getelementptr inbounds %class.BPLFunctionWriter* %this1, i32 0, i32 0, !dbg !136
   %0 = load %struct.BPLModuleWriter** %MW, align 8, !dbg !136
@@ -33,7 +33,7 @@
   ret void, !dbg !139
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare void @_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE(%struct.BPLModuleWriter*)
 
@@ -42,8 +42,8 @@
   %this.addr = alloca %class.function*, align 8
   %__f = alloca %class.anon.0, align 1
   store %class.function* %this, %class.function** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.function** %this.addr}, metadata !140), !dbg !142
-  call void @llvm.dbg.declare(metadata !{%class.anon.0* %__f}, metadata !143), !dbg !144
+  call void @llvm.dbg.declare(metadata !{%class.function** %this.addr}, metadata !140, metadata !{metadata !"0x102"}), !dbg !142
+  call void @llvm.dbg.declare(metadata !{%class.anon.0* %__f}, metadata !143, metadata !{metadata !"0x102"}), !dbg !144
   %this1 = load %class.function** %this.addr
   call void @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_"(%class.anon.0* %__f), !dbg !145
   ret void, !dbg !147
@@ -61,8 +61,8 @@
   %this.addr = alloca %class.function*, align 8
   %__f = alloca %class.anon, align 1
   store %class.function* %this, %class.function** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.function** %this.addr}, metadata !150), !dbg !151
-  call void @llvm.dbg.declare(metadata !{%class.anon* %__f}, metadata !152), !dbg !153
+  call void @llvm.dbg.declare(metadata !{%class.function** %this.addr}, metadata !150, metadata !{metadata !"0x102"}), !dbg !151
+  call void @llvm.dbg.declare(metadata !{%class.anon* %__f}, metadata !152, metadata !{metadata !"0x102"}), !dbg !153
   %this1 = load %class.function** %this.addr
   call void @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_"(%class.anon* %__f), !dbg !154
   ret void, !dbg !156
@@ -78,163 +78,163 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!162}
 
-!0 = metadata !{i32 786449, metadata !161, i32 4, metadata !"clang version 3.2 ", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !128, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.2 \000\00\000\00\000", metadata !161, metadata !1, metadata !1, metadata !3, metadata !128, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !106, metadata !107, metadata !126, metadata !127}
-!5 = metadata !{i32 786478, metadata !6, null, metadata !"writeExpr", metadata !"writeExpr", metadata !"_ZN17BPLFunctionWriter9writeExprEv", i32 19, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.BPLFunctionWriter*)* @_ZN17BPLFunctionWriter9writeExprEv, null, metadata !103, metadata !1, i32 19} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !160} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00writeExpr\00writeExpr\00_ZN17BPLFunctionWriter9writeExprEv\0019\000\001\000\006\00256\000\0019", metadata !6, null, metadata !7, null, void (%class.BPLFunctionWriter*)* @_ZN17BPLFunctionWriter9writeExprEv, null, metadata !103, metadata !1} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !160} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 786434, metadata !160, null, metadata !"BPLFunctionWriter", i32 15, i64 64, i64 64, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_class_type ] [BPLFunctionWriter] [line 15, size 64, align 64, offset 0] [def] [from ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x2\00BPLFunctionWriter\0015\0064\0064\000\000\000", metadata !160, null, null, metadata !11, null, null, null} ; [ DW_TAG_class_type ] [BPLFunctionWriter] [line 15, size 64, align 64, offset 0] [def] [from ]
 !11 = metadata !{metadata !12, metadata !103}
-!12 = metadata !{i32 786445, metadata !160, metadata !10, metadata !"MW", i32 16, i64 64, i64 64, i64 0, i32 1, metadata !13} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786434, metadata !160, null, metadata !"BPLModuleWriter", i32 12, i64 8, i64 8, i32 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_class_type ] [BPLModuleWriter] [line 12, size 8, align 8, offset 0] [def] [from ]
+!12 = metadata !{metadata !"0xd\00MW\0016\0064\0064\000\001", metadata !160, metadata !10, metadata !13} ; [ DW_TAG_member ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !14} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{metadata !"0x2\00BPLModuleWriter\0012\008\008\000\000\000", metadata !160, null, null, metadata !15, null, null, null} ; [ DW_TAG_class_type ] [BPLModuleWriter] [line 12, size 8, align 8, offset 0] [def] [from ]
 !15 = metadata !{metadata !16}
-!16 = metadata !{i32 786478, metadata !6, metadata !14, metadata !"writeIntrinsic", metadata !"writeIntrinsic", metadata !"_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE", i32 13, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !101, i32 13} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0x2e\00writeIntrinsic\00writeIntrinsic\00_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE\0013\000\000\000\006\00256\000\0013", metadata !6, metadata !14, metadata !17, null, null, null, i32 0, metadata !101} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !19, metadata !20}
-!19 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !14} ; [ DW_TAG_pointer_type ]
-!20 = metadata !{i32 786434, metadata !160, null, metadata !"function<void ()>", i32 6, i64 8, i64 8, i32 0, i32 0, null, metadata !21, i32 0, null, metadata !97, null} ; [ DW_TAG_class_type ] [function<void ()>] [line 6, size 8, align 8, offset 0] [def] [from ]
+!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !14} ; [ DW_TAG_pointer_type ]
+!20 = metadata !{metadata !"0x2\00function<void ()>\006\008\008\000\000\000", metadata !160, null, null, metadata !21, null, metadata !97, null} ; [ DW_TAG_class_type ] [function<void ()>] [line 6, size 8, align 8, offset 0] [def] [from ]
 !21 = metadata !{metadata !22, metadata !51, metadata !58, metadata !86, metadata !92}
-!22 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"", i32 8, metadata !23, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !47, i32 0, metadata !49, i32 8} ; [ DW_TAG_subprogram ]
-!23 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !24, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00\008\000\000\000\006\00256\000\008", metadata !6, metadata !20, metadata !23, null, null, metadata !47, i32 0, metadata !49} ; [ DW_TAG_subprogram ]
+!23 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !24, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !24 = metadata !{null, metadata !25, metadata !26}
-!25 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !20} ; [ DW_TAG_pointer_type ]
-!26 = metadata !{i32 786434, metadata !160, metadata !5, metadata !"", i32 20, i64 8, i64 8, i32 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_class_type ] [line 20, size 8, align 8, offset 0] [def] [from ]
+!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !20} ; [ DW_TAG_pointer_type ]
+!26 = metadata !{metadata !"0x2\00\0020\008\008\000\000\000", metadata !160, metadata !5, null, metadata !27, null, null, null} ; [ DW_TAG_class_type ] [line 20, size 8, align 8, offset 0] [def] [from ]
 !27 = metadata !{metadata !28, metadata !35, metadata !41}
-!28 = metadata !{i32 786478, metadata !6, metadata !26, metadata !"operator()", metadata !"operator()", metadata !"", i32 20, metadata !29, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !33, i32 20} ; [ DW_TAG_subprogram ]
-!29 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{metadata !"0x2e\00operator()\00operator()\00\0020\000\000\000\006\00256\000\0020", metadata !6, metadata !26, metadata !29, null, null, null, i32 0, metadata !33} ; [ DW_TAG_subprogram ]
+!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !30, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !30 = metadata !{null, metadata !31}
-!31 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !32} ; [ DW_TAG_pointer_type ]
-!32 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_const_type ]
+!31 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !26} ; [ DW_TAG_const_type ]
 !33 = metadata !{metadata !34}
-!34 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!35 = metadata !{i32 786478, metadata !6, metadata !26, metadata !"~", metadata !"~", metadata !"", i32 20, metadata !36, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !39, i32 20} ; [ DW_TAG_subprogram ]
-!36 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!34 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!35 = metadata !{metadata !"0x2e\00~\00~\00\0020\000\000\000\006\00320\000\0020", metadata !6, metadata !26, metadata !36, null, null, null, i32 0, metadata !39} ; [ DW_TAG_subprogram ]
+!36 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !37, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !37 = metadata !{null, metadata !38}
-!38 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !26} ; [ DW_TAG_pointer_type ]
+!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !26} ; [ DW_TAG_pointer_type ]
 !39 = metadata !{metadata !40}
-!40 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!41 = metadata !{i32 786478, metadata !6, metadata !26, metadata !"", metadata !"", metadata !"", i32 20, metadata !42, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !45, i32 20} ; [ DW_TAG_subprogram ]
-!42 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!40 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!41 = metadata !{metadata !"0x2e\00\00\00\0020\000\000\000\006\00320\000\0020", metadata !6, metadata !26, metadata !42, null, null, null, i32 0, metadata !45} ; [ DW_TAG_subprogram ]
+!42 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !43 = metadata !{null, metadata !38, metadata !44}
-!44 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_rvalue_reference_type ]
+!44 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !26} ; [ DW_TAG_rvalue_reference_type ]
 !45 = metadata !{metadata !46}
-!46 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
+!46 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
 !47 = metadata !{metadata !48}
-!48 = metadata !{i32 786479, null, metadata !"_Functor", metadata !26, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!48 = metadata !{metadata !"0x2f\00_Functor\000\000", null, metadata !26, null} ; [ DW_TAG_template_type_parameter ]
 !49 = metadata !{metadata !50}
-!50 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!51 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"function<function<void ()> >", metadata !"function<function<void ()> >", metadata !"", i32 8, metadata !52, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !54, i32 0, metadata !56, i32 8} ; [ DW_TAG_subprogram ]
-!52 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !53, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!50 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!51 = metadata !{metadata !"0x2e\00function<function<void ()> >\00function<function<void ()> >\00\008\000\000\000\006\00256\000\008", metadata !6, metadata !20, metadata !52, null, null, metadata !54, i32 0, metadata !56} ; [ DW_TAG_subprogram ]
+!52 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !53, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !53 = metadata !{null, metadata !25, metadata !20}
 !54 = metadata !{metadata !55}
-!55 = metadata !{i32 786479, null, metadata !"_Functor", metadata !20, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!55 = metadata !{metadata !"0x2f\00_Functor\000\000", null, metadata !20, null} ; [ DW_TAG_template_type_parameter ]
 !56 = metadata !{metadata !57}
-!57 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!58 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"", i32 8, metadata !59, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !82, i32 0, metadata !84, i32 8} ; [ DW_TAG_subprogram ]
-!59 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !60, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!57 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!58 = metadata !{metadata !"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00\008\000\000\000\006\00256\000\008", metadata !6, metadata !20, metadata !59, null, null, metadata !82, i32 0, metadata !84} ; [ DW_TAG_subprogram ]
+!59 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !60, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !60 = metadata !{null, metadata !25, metadata !61}
-!61 = metadata !{i32 786434, metadata !160, metadata !5, metadata !"", i32 23, i64 8, i64 8, i32 0, i32 0, null, metadata !62, i32 0, null, null, null} ; [ DW_TAG_class_type ] [line 23, size 8, align 8, offset 0] [def] [from ]
+!61 = metadata !{metadata !"0x2\00\0023\008\008\000\000\000", metadata !160, metadata !5, null, metadata !62, null, null, null} ; [ DW_TAG_class_type ] [line 23, size 8, align 8, offset 0] [def] [from ]
 !62 = metadata !{metadata !63, metadata !70, metadata !76}
-!63 = metadata !{i32 786478, metadata !6, metadata !61, metadata !"operator()", metadata !"operator()", metadata !"", i32 23, metadata !64, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !68, i32 23} ; [ DW_TAG_subprogram ]
-!64 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!63 = metadata !{metadata !"0x2e\00operator()\00operator()\00\0023\000\000\000\006\00256\000\0023", metadata !6, metadata !61, metadata !64, null, null, null, i32 0, metadata !68} ; [ DW_TAG_subprogram ]
+!64 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !65, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !65 = metadata !{null, metadata !66}
-!66 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !67} ; [ DW_TAG_pointer_type ]
-!67 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_const_type ]
+!66 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !67} ; [ DW_TAG_pointer_type ]
+!67 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !61} ; [ DW_TAG_const_type ]
 !68 = metadata !{metadata !69}
-!69 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!70 = metadata !{i32 786478, metadata !6, metadata !61, metadata !"~", metadata !"~", metadata !"", i32 23, metadata !71, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !74, i32 23} ; [ DW_TAG_subprogram ]
-!71 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !72, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!69 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!70 = metadata !{metadata !"0x2e\00~\00~\00\0023\000\000\000\006\00320\000\0023", metadata !6, metadata !61, metadata !71, null, null, null, i32 0, metadata !74} ; [ DW_TAG_subprogram ]
+!71 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !72, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !72 = metadata !{null, metadata !73}
-!73 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !61} ; [ DW_TAG_pointer_type ]
+!73 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", i32 0, null, metadata !61} ; [ DW_TAG_pointer_type ]
 !74 = metadata !{metadata !75}
-!75 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!76 = metadata !{i32 786478, metadata !6, metadata !61, metadata !"", metadata !"", metadata !"", i32 23, metadata !77, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !80, i32 23} ; [ DW_TAG_subprogram ]
-!77 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !78, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!75 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!76 = metadata !{metadata !"0x2e\00\00\00\0023\000\000\000\006\00320\000\0023", metadata !6, metadata !61, metadata !77, null, null, null, i32 0, metadata !80} ; [ DW_TAG_subprogram ]
+!77 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !78, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !78 = metadata !{null, metadata !73, metadata !79}
-!79 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_rvalue_reference_type ]
+!79 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !61} ; [ DW_TAG_rvalue_reference_type ]
 !80 = metadata !{metadata !81}
-!81 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
+!81 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
 !82 = metadata !{metadata !83}
-!83 = metadata !{i32 786479, null, metadata !"_Functor", metadata !61, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!83 = metadata !{metadata !"0x2f\00_Functor\000\000", null, metadata !61, null} ; [ DW_TAG_template_type_parameter ]
 !84 = metadata !{metadata !85}
-!85 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!86 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"function", metadata !"function", metadata !"", i32 6, metadata !87, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !90, i32 6} ; [ DW_TAG_subprogram ]
-!87 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !88, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!85 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!86 = metadata !{metadata !"0x2e\00function\00function\00\006\000\000\000\006\00320\000\006", metadata !6, metadata !20, metadata !87, null, null, null, i32 0, metadata !90} ; [ DW_TAG_subprogram ]
+!87 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !88, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !88 = metadata !{null, metadata !25, metadata !89}
-!89 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_rvalue_reference_type ]
+!89 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !20} ; [ DW_TAG_rvalue_reference_type ]
 !90 = metadata !{metadata !91}
-!91 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!92 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"~function", metadata !"~function", metadata !"", i32 6, metadata !93, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !95, i32 6} ; [ DW_TAG_subprogram ]
-!93 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !94, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!91 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
+!92 = metadata !{metadata !"0x2e\00~function\00~function\00\006\000\000\000\006\00320\000\006", metadata !6, metadata !20, metadata !93, null, null, null, i32 0, metadata !95} ; [ DW_TAG_subprogram ]
+!93 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !94, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !94 = metadata !{null, metadata !25}
 !95 = metadata !{metadata !96}
-!96 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
+!96 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
 !97 = metadata !{metadata !98}
-!98 = metadata !{i32 786479, null, metadata !"T", metadata !99, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
-!99 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !100, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!98 = metadata !{metadata !"0x2f\00T\000\000", null, metadata !99, null} ; [ DW_TAG_template_type_parameter ]
+!99 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !100, i32 0} ; [ DW_TAG_subroutine_type ]
 !100 = metadata !{null}
 !101 = metadata !{metadata !102}
-!102 = metadata !{i32 786468}                     ; [ DW_TAG_base_type ]
-!103 = metadata !{i32 786478, metadata !6, metadata !10, metadata !"writeExpr", metadata !"writeExpr", metadata !"_ZN17BPLFunctionWriter9writeExprEv", i32 17, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, metadata !104, i32 17} ; [ DW_TAG_subprogram ]
+!102 = metadata !{metadata !"0x24"}                     ; [ DW_TAG_base_type ]
+!103 = metadata !{metadata !"0x2e\00writeExpr\00writeExpr\00_ZN17BPLFunctionWriter9writeExprEv\0017\000\000\000\006\00257\000\0017", metadata !6, metadata !10, metadata !7, null, null, null, i32 0, metadata !104} ; [ DW_TAG_subprogram ]
 !104 = metadata !{metadata !105}
-!105 = metadata !{i32 786468}                     ; [ DW_TAG_base_type ]
-!106 = metadata !{i32 786478, metadata !6, null, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_", i32 8, metadata !59, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_", metadata !82, metadata !58, metadata !1, i32 8} ; [ DW_TAG_subprogram ]
-!107 = metadata !{i32 786478, metadata !6, null, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", i32 3, metadata !108, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.anon.0*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", metadata !111, metadata !113, metadata !1, i32 3} ; [ DW_TAG_subprogram ]
-!108 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !109, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!105 = metadata !{metadata !"0x24"}                     ; [ DW_TAG_base_type ]
+!106 = metadata !{metadata !"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_\008\001\001\000\006\00256\000\008", metadata !6, null, metadata !59, null, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_", metadata !82, metadata !58, metadata !1} ; [ DW_TAG_subprogram ]
+!107 = metadata !{metadata !"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_\003\001\001\000\006\00256\000\003", metadata !6, null, metadata !108, null, void (%class.anon.0*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", metadata !111, metadata !113, metadata !1} ; [ DW_TAG_subprogram ]
+!108 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !109, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !109 = metadata !{null, metadata !110}
-!110 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_reference_type ]
+!110 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !61} ; [ DW_TAG_reference_type ]
 !111 = metadata !{metadata !112}
-!112 = metadata !{i32 786479, null, metadata !"_Tp", metadata !61, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
-!113 = metadata !{i32 786478, metadata !6, metadata !114, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", i32 3, metadata !108, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !111, i32 0, metadata !124, i32 3} ; [ DW_TAG_subprogram ]
-!114 = metadata !{i32 786434, metadata !160, null, metadata !"_Base_manager", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !115, i32 0, null, null, null} ; [ DW_TAG_class_type ] [_Base_manager] [line 1, size 8, align 8, offset 0] [def] [from ]
+!112 = metadata !{metadata !"0x2f\00_Tp\000\000", null, metadata !61, null} ; [ DW_TAG_template_type_parameter ]
+!113 = metadata !{metadata !"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_\003\000\000\000\006\00256\000\003", metadata !6, metadata !114, metadata !108, null, null, metadata !111, i32 0, metadata !124} ; [ DW_TAG_subprogram ]
+!114 = metadata !{metadata !"0x2\00_Base_manager\001\008\008\000\000\000", metadata !160, null, null, metadata !115, null, null, null} ; [ DW_TAG_class_type ] [_Base_manager] [line 1, size 8, align 8, offset 0] [def] [from ]
 !115 = metadata !{metadata !116, metadata !113}
-!116 = metadata !{i32 786478, metadata !6, metadata !114, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", i32 3, metadata !117, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !120, i32 0, metadata !122, i32 3} ; [ DW_TAG_subprogram ]
-!117 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !118, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!116 = metadata !{metadata !"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_\003\000\000\000\006\00256\000\003", metadata !6, metadata !114, metadata !117, null, null, metadata !120, i32 0, metadata !122} ; [ DW_TAG_subprogram ]
+!117 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !118, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !118 = metadata !{null, metadata !119}
-!119 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_reference_type ]
+!119 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !26} ; [ DW_TAG_reference_type ]
 !120 = metadata !{metadata !121}
-!121 = metadata !{i32 786479, null, metadata !"_Tp", metadata !26, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!121 = metadata !{metadata !"0x2f\00_Tp\000\000", null, metadata !26, null} ; [ DW_TAG_template_type_parameter ]
 !122 = metadata !{metadata !123}
-!123 = metadata !{i32 786468}                     ; [ DW_TAG_base_type ]
+!123 = metadata !{metadata !"0x24"}                     ; [ DW_TAG_base_type ]
 !124 = metadata !{metadata !125}
-!125 = metadata !{i32 786468}                     ; [ DW_TAG_base_type ]
-!126 = metadata !{i32 786478, metadata !6, null, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_", i32 8, metadata !23, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_", metadata !47, metadata !22, metadata !1, i32 8} ; [ DW_TAG_subprogram ]
-!127 = metadata !{i32 786478, metadata !6, null, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", i32 3, metadata !117, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.anon*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", metadata !120, metadata !116, metadata !1, i32 3} ; [ DW_TAG_subprogram ]
+!125 = metadata !{metadata !"0x24"}                     ; [ DW_TAG_base_type ]
+!126 = metadata !{metadata !"0x2e\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_\008\001\001\000\006\00256\000\008", metadata !6, null, metadata !23, null, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_", metadata !47, metadata !22, metadata !1} ; [ DW_TAG_subprogram ]
+!127 = metadata !{metadata !"0x2e\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >\00_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_\003\001\001\000\006\00256\000\003", metadata !6, null, metadata !117, null, void (%class.anon*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", metadata !120, metadata !116, metadata !1} ; [ DW_TAG_subprogram ]
 !128 = metadata !{metadata !130}
-!130 = metadata !{i32 786484, i32 0, metadata !114, metadata !"__stored_locally", metadata !"__stored_locally", metadata !"__stored_locally", metadata !6, i32 2, metadata !131, i32 1, i32 1, i1 true, null} ; [ DW_TAG_variable ]
-!131 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !132} ; [ DW_TAG_const_type ]
-!132 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
-!133 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777235, metadata !134, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!134 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
+!130 = metadata !{metadata !"0x34\00__stored_locally\00__stored_locally\00__stored_locally\002\001\001", metadata !114, metadata !6, metadata !131, i1 1, null} ; [ DW_TAG_variable ]
+!131 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !132} ; [ DW_TAG_const_type ]
+!132 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ]
+!133 = metadata !{metadata !"0x101\00this\0016777235\0064", metadata !5, metadata !6, metadata !134} ; [ DW_TAG_arg_variable ]
+!134 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ]
 !135 = metadata !{i32 19, i32 39, metadata !5, null}
 !136 = metadata !{i32 20, i32 17, metadata !137, null}
-!137 = metadata !{i32 786443, metadata !6, metadata !5, i32 19, i32 51, i32 0} ; [ DW_TAG_lexical_block ]
+!137 = metadata !{metadata !"0xb\0019\0051\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ]
 !138 = metadata !{i32 23, i32 17, metadata !137, null}
 !139 = metadata !{i32 26, i32 15, metadata !137, null}
-!140 = metadata !{i32 786689, metadata !106, metadata !"this", metadata !6, i32 16777224, metadata !141, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!141 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ]
+!140 = metadata !{metadata !"0x101\00this\0016777224\0064", metadata !106, metadata !6, metadata !141} ; [ DW_TAG_arg_variable ]
+!141 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !20} ; [ DW_TAG_pointer_type ]
 !142 = metadata !{i32 8, i32 45, metadata !106, null}
-!143 = metadata !{i32 786689, metadata !106, metadata !"__f", metadata !6, i32 33554440, metadata !61, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!143 = metadata !{metadata !"0x101\00__f\0033554440\000", metadata !106, metadata !6, metadata !61} ; [ DW_TAG_arg_variable ]
 !144 = metadata !{i32 8, i32 63, metadata !106, null}
 !145 = metadata !{i32 9, i32 9, metadata !146, null}
-!146 = metadata !{i32 786443, metadata !6, metadata !106, i32 8, i32 81, i32 1} ; [ DW_TAG_lexical_block ]
+!146 = metadata !{metadata !"0xb\008\0081\001", metadata !6, metadata !106} ; [ DW_TAG_lexical_block ]
 !147 = metadata !{i32 10, i32 13, metadata !146, null}
 !148 = metadata !{i32 4, i32 5, metadata !149, null}
-!149 = metadata !{i32 786443, metadata !6, metadata !107, i32 3, i32 105, i32 2} ; [ DW_TAG_lexical_block ]
-!150 = metadata !{i32 786689, metadata !126, metadata !"this", metadata !6, i32 16777224, metadata !141, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!149 = metadata !{metadata !"0xb\003\00105\002", metadata !6, metadata !107} ; [ DW_TAG_lexical_block ]
+!150 = metadata !{metadata !"0x101\00this\0016777224\0064", metadata !126, metadata !6, metadata !141} ; [ DW_TAG_arg_variable ]
 !151 = metadata !{i32 8, i32 45, metadata !126, null}
-!152 = metadata !{i32 786689, metadata !126, metadata !"__f", metadata !6, i32 33554440, metadata !26, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!152 = metadata !{metadata !"0x101\00__f\0033554440\000", metadata !126, metadata !6, metadata !26} ; [ DW_TAG_arg_variable ]
 !153 = metadata !{i32 8, i32 63, metadata !126, null}
 !154 = metadata !{i32 9, i32 9, metadata !155, null}
-!155 = metadata !{i32 786443, metadata !6, metadata !126, i32 8, i32 81, i32 3} ; [ DW_TAG_lexical_block ]
+!155 = metadata !{metadata !"0xb\008\0081\003", metadata !6, metadata !126} ; [ DW_TAG_lexical_block ]
 !156 = metadata !{i32 10, i32 13, metadata !155, null}
 !157 = metadata !{i32 4, i32 5, metadata !158, null}
-!158 = metadata !{i32 786443, metadata !6, metadata !127, i32 3, i32 105, i32 4} ; [ DW_TAG_lexical_block ]
-!159 = metadata !{i32 786473, metadata !161} ; [ DW_TAG_file_type ]
+!158 = metadata !{metadata !"0xb\003\00105\004", metadata !6, metadata !127} ; [ DW_TAG_lexical_block ]
+!159 = metadata !{metadata !"0x29", metadata !161} ; [ DW_TAG_file_type ]
 !160 = metadata !{metadata !"BPLFunctionWriter2.ii", metadata !"/home/peter/crashdelta"}
 !161 = metadata !{metadata !"BPLFunctionWriter.cpp", metadata !"/home/peter/crashdelta"}
-!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/pr13303.ll b/test/DebugInfo/X86/pr13303.ll
index 16e5966..0b417bf 100644
--- a/test/DebugInfo/X86/pr13303.ll
+++ b/test/DebugInfo/X86/pr13303.ll

@@ -15,15 +15,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang version 3.2 (trunk 160143)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 160143)\000\00\000\00\000", metadata !12, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !12, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
-!6 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\006\000\000\001", metadata !12, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!6 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 1, i32 14, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !12, metadata !5, i32 1, i32 12, i32 0} ; [ DW_TAG_lexical_block ] [/home/probinson/PR13303.c]
+!11 = metadata !{metadata !"0xb\001\0012\000", metadata !12, metadata !5} ; [ DW_TAG_lexical_block ] [/home/probinson/PR13303.c]
 !12 = metadata !{metadata !"PR13303.c", metadata !"/home/probinson"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/pr19307.ll b/test/DebugInfo/X86/pr19307.ll
index 07e3a42..4223cb7 100644
--- a/test/DebugInfo/X86/pr19307.ll
+++ b/test/DebugInfo/X86/pr19307.ll

@@ -20,10 +20,8 @@
 ; Verify that we have proper range in debug_loc section:
 ; CHECK: .Ldebug_loc{{[0-9]+}}:
 ; CHECK: DW_OP_breg1
-; CHECK:      .Lset{{[0-9]+}} = [[START_LABEL]]-.Lfunc_begin0
-; CHECK-NEXT: .quad .Lset{{[0-9]+}}
-; CHECK-NEXT: .Lset{{[0-9]+}} = .Lfunc_end0-.Lfunc_begin0
-; CHECK-NEXT: .quad .Lset{{[0-9]+}}
+; CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
+; CHECK-NEXT: .quad .Lfunc_end0-.Lfunc_begin0
 ; CHECK: DW_OP_breg6
 ; CHECK: DW_OP_deref
 
@@ -42,10 +40,10 @@
   %offset.addr = alloca i64*, align 8
   %limit.addr = alloca i64*, align 8
   store i64* %offset, i64** %offset.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64** %offset.addr}, metadata !45), !dbg !46
+  call void @llvm.dbg.declare(metadata !{i64** %offset.addr}, metadata !45, metadata !{metadata !"0x102"}), !dbg !46
   store i64* %limit, i64** %limit.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64** %limit.addr}, metadata !47), !dbg !46
-  call void @llvm.dbg.declare(metadata !{%"class.std::basic_string"* %range}, metadata !48), !dbg !49
+  call void @llvm.dbg.declare(metadata !{i64** %limit.addr}, metadata !47, metadata !{metadata !"0x102"}), !dbg !46
+  call void @llvm.dbg.declare(metadata !{%"class.std::basic_string"* %range}, metadata !48, metadata !{metadata !"0x102"}), !dbg !49
   %call = call i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"* %range, i64 0, i64 6, i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0)), !dbg !50
   %cmp = icmp ne i32 %call, 0, !dbg !50
   br i1 %cmp, label %if.then, label %lor.lhs.false, !dbg !50
@@ -70,7 +68,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"*, i64, i64, i8*) #2
 
@@ -86,62 +84,62 @@
 !llvm.module.flags = !{!42, !43}
 !llvm.ident = !{!44}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (209308)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !12, metadata !2, metadata !21, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/pr19307.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (209308)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !12, metadata !2, metadata !21} ; [ DW_TAG_compile_unit ] [/llvm_cmake_gcc/pr19307.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"pr19307.cc", metadata !"/llvm_cmake_gcc"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !6, metadata !8}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"", i32 83, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTS11__mbstate_t"} ; [ DW_TAG_structure_type ] [line 83, size 0, align 0, offset 0] [decl] [from ]
+!4 = metadata !{metadata !"0x13\00\0083\000\000\000\004\000", metadata !5, null, null, null, null, null, metadata !"_ZTS11__mbstate_t"} ; [ DW_TAG_structure_type ] [line 83, size 0, align 0, offset 0] [decl] [from ]
 !5 = metadata !{metadata !"/usr/include/wchar.h", metadata !"/llvm_cmake_gcc"}
-!6 = metadata !{i32 786451, metadata !7, null, metadata !"lconv", i32 54, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTS5lconv"} ; [ DW_TAG_structure_type ] [lconv] [line 54, size 0, align 0, offset 0] [decl] [from ]
+!6 = metadata !{metadata !"0x13\00lconv\0054\000\000\000\004\000", metadata !7, null, null, null, null, null, metadata !"_ZTS5lconv"} ; [ DW_TAG_structure_type ] [lconv] [line 54, size 0, align 0, offset 0] [decl] [from ]
 !7 = metadata !{metadata !"/usr/include/locale.h", metadata !"/llvm_cmake_gcc"}
-!8 = metadata !{i32 786434, metadata !9, metadata !10, metadata !"basic_string<char, std::char_traits<char>, std::allocator<char> >", i32 1134, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTSSs"} ; [ DW_TAG_class_type ] [basic_string<char, std::char_traits<char>, std::allocator<char> >] [line 1134, size 0, align 0, offset 0] [decl] [from ]
+!8 = metadata !{metadata !"0x2\00basic_string<char, std::char_traits<char>, std::allocator<char> >\001134\000\000\000\004\000", metadata !9, metadata !10, null, null, null, null, metadata !"_ZTSSs"} ; [ DW_TAG_class_type ] [basic_string<char, std::char_traits<char>, std::allocator<char> >] [line 1134, size 0, align 0, offset 0] [decl] [from ]
 !9 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/basic_string.tcc", metadata !"/llvm_cmake_gcc"}
-!10 = metadata !{i32 786489, metadata !11, null, metadata !"std", i32 153} ; [ DW_TAG_namespace ] [std] [line 153]
+!10 = metadata !{metadata !"0x39\00std\00153", metadata !11, null} ; [ DW_TAG_namespace ] [std] [line 153]
 !11 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/x86_64-linux-gnu/bits/c++config.h", metadata !"/llvm_cmake_gcc"}
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786478, metadata !1, metadata !14, metadata !"parse_range", metadata !"parse_range", metadata !"_Z11parse_rangeRyS_Ss", i32 3, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i64*, i64*, %"class.std::basic_string"*)* @_Z11parse_rangeRyS_Ss, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [parse_range]
-!14 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/pr19307.cc]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x2e\00parse_range\00parse_range\00_Z11parse_rangeRyS_Ss\003\000\001\000\006\00256\000\004", metadata !1, metadata !14, metadata !15, null, void (i64*, i64*, %"class.std::basic_string"*)* @_Z11parse_rangeRyS_Ss, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [parse_range]
+!14 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/llvm_cmake_gcc/pr19307.cc]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17, metadata !17, metadata !19}
-!17 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
-!18 = metadata !{i32 786468, null, null, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!19 = metadata !{i32 786454, metadata !20, metadata !10, metadata !"string", i32 65, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTSSs"} ; [ DW_TAG_typedef ] [string] [line 65, size 0, align 0, offset 0] [from _ZTSSs]
+!17 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !18} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!18 = metadata !{metadata !"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!19 = metadata !{metadata !"0x16\00string\0065\000\000\000\000", metadata !20, metadata !10, metadata !"_ZTSSs"} ; [ DW_TAG_typedef ] [string] [line 65, size 0, align 0, offset 0] [from _ZTSSs]
 !20 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", metadata !"/llvm_cmake_gcc"}
 !21 = metadata !{metadata !22, metadata !26, metadata !29, metadata !33, metadata !38, metadata !41}
-!22 = metadata !{i32 786490, metadata !23, metadata !25, i32 57} ; [ DW_TAG_imported_module ]
-!23 = metadata !{i32 786489, metadata !24, null, metadata !"__gnu_debug", i32 55} ; [ DW_TAG_namespace ] [__gnu_debug] [line 55]
+!22 = metadata !{metadata !"0x3a\0057\00", metadata !23, metadata !25} ; [ DW_TAG_imported_module ]
+!23 = metadata !{metadata !"0x39\00__gnu_debug\0055", metadata !24, null} ; [ DW_TAG_namespace ] [__gnu_debug] [line 55]
 !24 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/debug/debug.h", metadata !"/llvm_cmake_gcc"}
-!25 = metadata !{i32 786489, metadata !24, metadata !10, metadata !"__debug", i32 49} ; [ DW_TAG_namespace ] [__debug] [line 49]
-!26 = metadata !{i32 786440, metadata !10, metadata !27, i32 66} ; [ DW_TAG_imported_declaration ]
-!27 = metadata !{i32 786454, metadata !5, null, metadata !"mbstate_t", i32 106, i64 0, i64 0, i64 0, i32 0, metadata !28} ; [ DW_TAG_typedef ] [mbstate_t] [line 106, size 0, align 0, offset 0] [from __mbstate_t]
-!28 = metadata !{i32 786454, metadata !5, null, metadata !"__mbstate_t", i32 95, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS11__mbstate_t"} ; [ DW_TAG_typedef ] [__mbstate_t] [line 95, size 0, align 0, offset 0] [from _ZTS11__mbstate_t]
-!29 = metadata !{i32 786440, metadata !10, metadata !30, i32 141} ; [ DW_TAG_imported_declaration ]
-!30 = metadata !{i32 786454, metadata !31, null, metadata !"wint_t", i32 141, i64 0, i64 0, i64 0, i32 0, metadata !32} ; [ DW_TAG_typedef ] [wint_t] [line 141, size 0, align 0, offset 0] [from unsigned int]
+!25 = metadata !{metadata !"0x39\00__debug\0049", metadata !24, metadata !10} ; [ DW_TAG_namespace ] [__debug] [line 49]
+!26 = metadata !{metadata !"0x8\0066\00", metadata !10, metadata !27} ; [ DW_TAG_imported_declaration ]
+!27 = metadata !{metadata !"0x16\00mbstate_t\00106\000\000\000\000", metadata !5, null, metadata !28} ; [ DW_TAG_typedef ] [mbstate_t] [line 106, size 0, align 0, offset 0] [from __mbstate_t]
+!28 = metadata !{metadata !"0x16\00__mbstate_t\0095\000\000\000\000", metadata !5, null, metadata !"_ZTS11__mbstate_t"} ; [ DW_TAG_typedef ] [__mbstate_t] [line 95, size 0, align 0, offset 0] [from _ZTS11__mbstate_t]
+!29 = metadata !{metadata !"0x8\00141\00", metadata !10, metadata !30} ; [ DW_TAG_imported_declaration ]
+!30 = metadata !{metadata !"0x16\00wint_t\00141\000\000\000\000", metadata !31, null, metadata !32} ; [ DW_TAG_typedef ] [wint_t] [line 141, size 0, align 0, offset 0] [from unsigned int]
 !31 = metadata !{metadata !"/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", metadata !"/llvm_cmake_gcc"}
-!32 = metadata !{i32 786468, null, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!33 = metadata !{i32 786440, metadata !34, metadata !36, i32 42} ; [ DW_TAG_imported_declaration ]
-!34 = metadata !{i32 786489, metadata !35, null, metadata !"__gnu_cxx", i32 69} ; [ DW_TAG_namespace ] [__gnu_cxx] [line 69]
+!32 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!33 = metadata !{metadata !"0x8\0042\00", metadata !34, metadata !36} ; [ DW_TAG_imported_declaration ]
+!34 = metadata !{metadata !"0x39\00__gnu_cxx\0069", metadata !35, null} ; [ DW_TAG_namespace ] [__gnu_cxx] [line 69]
 !35 = metadata !{metadata !"/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/cpp_type_traits.h", metadata !"/llvm_cmake_gcc"}
-!36 = metadata !{i32 786454, metadata !11, metadata !10, metadata !"size_t", i32 155, i64 0, i64 0, i64 0, i32 0, metadata !37} ; [ DW_TAG_typedef ] [size_t] [line 155, size 0, align 0, offset 0] [from long unsigned int]
-!37 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!38 = metadata !{i32 786440, metadata !34, metadata !39, i32 43} ; [ DW_TAG_imported_declaration ]
-!39 = metadata !{i32 786454, metadata !11, metadata !10, metadata !"ptrdiff_t", i32 156, i64 0, i64 0, i64 0, i32 0, metadata !40} ; [ DW_TAG_typedef ] [ptrdiff_t] [line 156, size 0, align 0, offset 0] [from long int]
-!40 = metadata !{i32 786468, null, null, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
-!41 = metadata !{i32 786440, metadata !10, metadata !"_ZTS5lconv", i32 55} ; [ DW_TAG_imported_declaration ]
+!36 = metadata !{metadata !"0x16\00size_t\00155\000\000\000\000", metadata !11, metadata !10, metadata !37} ; [ DW_TAG_typedef ] [size_t] [line 155, size 0, align 0, offset 0] [from long unsigned int]
+!37 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!38 = metadata !{metadata !"0x8\0043\00", metadata !34, metadata !39} ; [ DW_TAG_imported_declaration ]
+!39 = metadata !{metadata !"0x16\00ptrdiff_t\00156\000\000\000\000", metadata !11, metadata !10, metadata !40} ; [ DW_TAG_typedef ] [ptrdiff_t] [line 156, size 0, align 0, offset 0] [from long int]
+!40 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!41 = metadata !{metadata !"0x8\0055\00", metadata !10, metadata !"_ZTS5lconv"} ; [ DW_TAG_imported_declaration ]
 !42 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!43 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!43 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !44 = metadata !{metadata !"clang version 3.5.0 (209308)"}
-!45 = metadata !{i32 786689, metadata !13, metadata !"offset", metadata !14, i32 16777219, metadata !17, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [offset] [line 3]
+!45 = metadata !{metadata !"0x101\00offset\0016777219\000", metadata !13, metadata !14, metadata !17} ; [ DW_TAG_arg_variable ] [offset] [line 3]
 !46 = metadata !{i32 3, i32 0, metadata !13, null}
-!47 = metadata !{i32 786689, metadata !13, metadata !"limit", metadata !14, i32 33554435, metadata !17, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [limit] [line 3]
-!48 = metadata !{i32 786689, metadata !13, metadata !"range", metadata !14, i32 50331652, metadata !19, i32 8192, i32 0} ; [ DW_TAG_arg_variable ] [range] [line 4]
+!47 = metadata !{metadata !"0x101\00limit\0033554435\000", metadata !13, metadata !14, metadata !17} ; [ DW_TAG_arg_variable ] [limit] [line 3]
+!48 = metadata !{metadata !"0x101\00range\0050331652\008192", metadata !13, metadata !14, metadata !19} ; [ DW_TAG_arg_variable ] [range] [line 4]
 !49 = metadata !{i32 4, i32 0, metadata !13, null}
 !50 = metadata !{i32 5, i32 0, metadata !51, null}
-!51 = metadata !{i32 786443, metadata !1, metadata !13, i32 5, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
+!51 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !13} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
 !52 = metadata !{i32 5, i32 0, metadata !53, null}
-!53 = metadata !{i32 786443, metadata !1, metadata !51, i32 5, i32 0, i32 1, i32 1} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
+!53 = metadata !{metadata !"0xb\005\000\001", metadata !1, metadata !51} ; [ DW_TAG_lexical_block ] [/llvm_cmake_gcc/pr19307.cc]
 !54 = metadata !{i32 6, i32 0, metadata !51, null}
 !55 = metadata !{i32 7, i32 0, metadata !13, null}
-!56 = metadata !{i32 8, i32 0, metadata !13, null} ; [ DW_TAG_imported_declaration ]
+!56 = metadata !{i32 8, i32 0, metadata !13, null}
 !57 = metadata !{i32 9, i32 0, metadata !13, null}
 

diff --git a/test/DebugInfo/X86/processes-relocations.ll b/test/DebugInfo/X86/processes-relocations.ll
new file mode 100644
index 0000000..2a29be4
--- /dev/null
+++ b/test/DebugInfo/X86/processes-relocations.ll

@@ -0,0 +1,26 @@
+; RUN: llc -filetype=obj -O0 < %s -mtriple x86_64-none-linux | \
+; RUN:     llvm-dwarfdump - 2>&1 | FileCheck %s
+; RUN: llc -filetype=obj -O0 < %s -mtriple i386-none-linux | \
+; RUN:     llvm-dwarfdump - 2>&1 | FileCheck %s
+; RUN: llc -filetype=obj -O0 < %s -mtriple x86_64-none-mingw32 | \
+; RUN:     llvm-dwarfdump - 2>&1 | FileCheck %s
+; RUN: llc -filetype=obj -O0 < %s -mtriple i386-none-mingw32 | \
+; RUN:     llvm-dwarfdump - 2>&1 | FileCheck %s
+
+; An object file is produced for each of the four triples above and dumped
+; with llvm-dwarfdump (stdout and stderr merged). The dump must never contain
+; the diagnostic below.
+; CHECK-NOT: failed to compute relocation
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+; The compile unit is written in the string-header layout, matching the
+; "Debug Info Version" 2 module flag declared in !4.
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/a/empty.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"empty.c", metadata !"/a"}
+!2 = metadata !{}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!5 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/DebugInfo/X86/prologue-stack.ll b/test/DebugInfo/X86/prologue-stack.ll
index a5bae84..b6dbd41 100644
--- a/test/DebugInfo/X86/prologue-stack.ll
+++ b/test/DebugInfo/X86/prologue-stack.ll

@@ -21,16 +21,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!14}
 
-!0 = metadata !{i32 786449, metadata !13, i32 12, metadata !"clang version 3.2 (trunk 164980) (llvm/trunk 164979)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 164980) (llvm/trunk 164979)\000\00\000\00\000", metadata !13, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.c] [DW_LANG_C99]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !13, metadata !6, metadata !"isel_line_test2", metadata !"isel_line_test2", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @isel_line_test2, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [isel_line_test2]
-!6 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00isel_line_test2\00isel_line_test2\00\003\000\001\000\006\000\000\004", metadata !13, metadata !6, metadata !7, null, i32 ()* @isel_line_test2, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [isel_line_test2]
+!6 = metadata !{metadata !"0x29", metadata !13} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 5, i32 3, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !13, metadata !5, i32 4, i32 1, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/bar.c]
+!11 = metadata !{metadata !"0xb\004\001\000", metadata !13, metadata !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/bar.c]
 !12 = metadata !{i32 6, i32 3, metadata !11, null}
 !13 = metadata !{metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp"}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/recursive_inlining.ll b/test/DebugInfo/X86/recursive_inlining.ll
new file mode 100644
index 0000000..251f04e
--- /dev/null
+++ b/test/DebugInfo/X86/recursive_inlining.ll

@@ -0,0 +1,275 @@
+; REQUIRES: object-emission
+
+; RUN: %llc_dwarf -filetype=obj -O0 < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; This isn't a very pretty test case - I imagine there might be other ways to
+; tickle the optimizers into producing the desired code, but I haven't found
+; them.
+
+; The issue is when a function is inlined into itself, the inlined argument
+; accidentally overwrote the concrete argument and was lost.
+
+; IR generated from the following source compiled with clang -g:
+; void fn1(void *);
+; void fn2(int, int, int, int);
+; void fn3();
+; void fn8();
+; struct C {
+;   int b;
+;   void m_fn2() {
+;     fn8();
+;     if (b) fn2(0, 0, 0, 0);
+;     fn3();
+;   }
+; };
+; C *x;
+; inline void fn7() {}
+; void fn6() {
+;   fn8();
+;   x->m_fn2();
+;   fn7();
+; }
+; void fn3() { fn6(); }
+; void fn4() { x->m_fn2(); }
+; void fn5() { x->m_fn2(); }
+
+; The definition of C and declaration of C::m_fn2
+; CHECK: DW_TAG_structure_type
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_member
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK:     DW_AT_name {{.*}} "m_fn2"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[M_FN2_THIS_DECL:.*]]:     DW_TAG_formal_parameter
+
+; The abstract definition of C::m_fn2
+; CHECK: [[M_FN2_ABS_DEF:.*]]: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK:   DW_AT_specification {{.*}} "_ZN1C5m_fn2Ev"
+; CHECK-NOT: DW_TAG
+; CHECK:   DW_AT_inline
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK: [[M_FN2_THIS_ABS_DEF:.*]]:   DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK:     DW_AT_name {{.*}} "this"
+
+; Skip some other functions
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
+
+; The concrete definition of C::m_fn2
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK:   DW_AT_abstract_origin {{.*}} {[[M_FN2_ABS_DEF]]} "_ZN1C5m_fn2Ev"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:   DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK:     DW_AT_abstract_origin {{.*}} {[[M_FN2_THIS_ABS_DEF]]}
+; CHECK-NOT: {{DW_TAG|NULL}}
+; Inlined fn3:
+; CHECK:     DW_TAG_inlined_subroutine
+; CHECK-NOT: {{DW_TAG|NULL}}
+; Inlined fn6:
+; CHECK:       DW_TAG_inlined_subroutine
+; CHECK-NOT: {{DW_TAG|NULL}}
+; Inlined C::m_fn2:
+; CHECK:         DW_TAG_inlined_subroutine
+; CHECK-NOT: DW_TAG
+; CHECK:           DW_AT_abstract_origin {{.*}} {[[M_FN2_ABS_DEF]]} "_ZN1C5m_fn2Ev"
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:           DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK:              DW_AT_abstract_origin {{.*}} {[[M_FN2_THIS_ABS_DEF]]}
+
+
+
+%struct.C = type { i32 }
+
+@x = global %struct.C* null, align 8
+
+; Function Attrs: nounwind
+define void @_Z3fn6v() #0 { ; fn6(): the body of C::m_fn2 appears inline (locations carrying inlined-at !32)
+entry:
+  tail call void @_Z3fn8v() #3, !dbg !31 ; fn8() called directly by fn6 (source line 16)
+  %0 = load %struct.C** @x, align 8, !dbg !32, !tbaa !33 ; load global x for x->m_fn2() (source line 17)
+  tail call void @llvm.dbg.value(metadata !{%struct.C* %0}, i64 0, metadata !37, metadata !{metadata !"0x102"}) #3, !dbg !38 ; "this" of the inlined m_fn2 (!37, inlined at !32)
+  tail call void @_Z3fn8v() #3, !dbg !39 ; fn8() from the inlined m_fn2 (source line 8)
+  %b.i = getelementptr inbounds %struct.C* %0, i64 0, i32 0, !dbg !40
+  %1 = load i32* %b.i, align 4, !dbg !40, !tbaa !42
+  %tobool.i = icmp eq i32 %1, 0, !dbg !40
+  br i1 %tobool.i, label %_ZN1C5m_fn2Ev.exit, label %if.then.i, !dbg !40 ; inlined "if (b)" (source line 9): skip fn2 when b == 0
+
+if.then.i:                                        ; preds = %entry
+  tail call void @_Z3fn2iiii(i32 0, i32 0, i32 0, i32 0) #3, !dbg !45 ; inlined fn2(0, 0, 0, 0) (source line 9)
+  br label %_ZN1C5m_fn2Ev.exit, !dbg !45
+
+_ZN1C5m_fn2Ev.exit:                               ; preds = %entry, %if.then.i
+  tail call void @_Z3fn3v() #3, !dbg !47 ; fn3() call from the inlined m_fn2 (source line 10)
+  ret void, !dbg !48 ; end of fn6 (source line 19); no call is emitted for the empty fn7()
+}
+
+declare void @_Z3fn8v() #1
+
+; Function Attrs: nounwind
+define linkonce_odr void @_ZN1C5m_fn2Ev(%struct.C* nocapture readonly %this) #0 align 2 { ; concrete C::m_fn2; after if.end it contains fn3 -> fn6 -> m_fn2 inlined into itself
+entry:
+  tail call void @llvm.dbg.value(metadata !{%struct.C* %this}, i64 0, metadata !24, metadata !{metadata !"0x102"}), !dbg !49 ; concrete "this" argument (!24, no inlined-at)
+  tail call void @_Z3fn8v() #3, !dbg !50 ; fn8() (source line 8)
+  %b = getelementptr inbounds %struct.C* %this, i64 0, i32 0, !dbg !51
+  %0 = load i32* %b, align 4, !dbg !51, !tbaa !42
+  %tobool = icmp eq i32 %0, 0, !dbg !51
+  br i1 %tobool, label %if.end, label %if.then, !dbg !51 ; "if (b)" (source line 9)
+
+if.then:                                          ; preds = %entry
+  tail call void @_Z3fn2iiii(i32 0, i32 0, i32 0, i32 0) #3, !dbg !52 ; fn2(0, 0, 0, 0) (source line 9)
+  br label %if.end, !dbg !52
+
+if.end:                                           ; preds = %entry, %if.then
+  tail call void @_Z3fn8v() #3, !dbg !53 ; fn8() of fn6, inlined through fn3 at m_fn2 line 10 (!53 -> !54 -> !55)
+  %1 = load %struct.C** @x, align 8, !dbg !56, !tbaa !33 ; x->m_fn2() of the inlined fn6 (source line 17)
+  tail call void @llvm.dbg.value(metadata !{%struct.C* %1}, i64 0, metadata !57, metadata !{metadata !"0x102"}) #3, !dbg !58 ; "this" of the nested inlined m_fn2 (!57, inlined at !56)
+  tail call void @_Z3fn8v() #3, !dbg !59 ; fn8() of the nested m_fn2 (source line 8)
+  %b.i.i = getelementptr inbounds %struct.C* %1, i64 0, i32 0, !dbg !60
+  %2 = load i32* %b.i.i, align 4, !dbg !60, !tbaa !42
+  %tobool.i.i = icmp eq i32 %2, 0, !dbg !60
+  br i1 %tobool.i.i, label %_Z3fn6v.exit, label %if.then.i.i, !dbg !60 ; nested "if (b)" (source line 9)
+
+if.then.i.i:                                      ; preds = %if.end
+  tail call void @_Z3fn2iiii(i32 0, i32 0, i32 0, i32 0) #3, !dbg !61 ; nested fn2(0, 0, 0, 0) (source line 9)
+  br label %_Z3fn6v.exit, !dbg !61
+
+_Z3fn6v.exit:                                     ; preds = %if.end, %if.then.i.i
+  tail call void @_Z3fn3v() #3, !dbg !62 ; fn3() of the nested m_fn2 (source line 10), kept as a real call
+  ret void, !dbg !63 ; end of the outer m_fn2 (source line 11)
+}
+
+; Function Attrs: nounwind
+define void @_Z3fn3v() #0 { ; fn3(): fn6 and C::m_fn2 appear inline; the body loops and contains no ret
+entry:
+  br label %tailrecurse
+
+tailrecurse:                                      ; preds = %tailrecurse.backedge, %entry
+  tail call void @_Z3fn8v() #3, !dbg !64 ; fn8() of fn6, inlined at fn3 line 20 (!64 -> !65)
+  %0 = load %struct.C** @x, align 8, !dbg !66, !tbaa !33 ; x->m_fn2() of the inlined fn6 (source line 17)
+  tail call void @llvm.dbg.value(metadata !{%struct.C* %0}, i64 0, metadata !67, metadata !{metadata !"0x102"}) #3, !dbg !68 ; "this" of the inlined m_fn2 (!67, inlined at !66)
+  tail call void @_Z3fn8v() #3, !dbg !69 ; fn8() of the inlined m_fn2 (source line 8)
+  %b.i.i = getelementptr inbounds %struct.C* %0, i64 0, i32 0, !dbg !70
+  %1 = load i32* %b.i.i, align 4, !dbg !70, !tbaa !42
+  %tobool.i.i = icmp eq i32 %1, 0, !dbg !70
+  br i1 %tobool.i.i, label %tailrecurse.backedge, label %if.then.i.i, !dbg !70 ; inlined "if (b)" (source line 9)
+
+tailrecurse.backedge:                             ; preds = %tailrecurse, %if.then.i.i
+  br label %tailrecurse
+
+if.then.i.i:                                      ; preds = %tailrecurse
+  tail call void @_Z3fn2iiii(i32 0, i32 0, i32 0, i32 0) #3, !dbg !71 ; inlined fn2(0, 0, 0, 0) (source line 9)
+  br label %tailrecurse.backedge, !dbg !71
+}
+
+; Function Attrs: nounwind
+define void @_Z3fn4v() #0 { ; fn4(): x->m_fn2() as a call to the out-of-line C::m_fn2 (source line 21)
+entry:
+  %0 = load %struct.C** @x, align 8, !dbg !72, !tbaa !33 ; load global x
+  tail call void @_ZN1C5m_fn2Ev(%struct.C* %0), !dbg !72 ; not inlined here
+  ret void, !dbg !72
+}
+
+; Function Attrs: nounwind
+define void @_Z3fn5v() #0 { ; fn5(): x->m_fn2() as a call to the out-of-line C::m_fn2 (source line 22)
+entry:
+  %0 = load %struct.C** @x, align 8, !dbg !73, !tbaa !33 ; load global x
+  tail call void @_ZN1C5m_fn2Ev(%struct.C* %0), !dbg !73 ; not inlined here
+  ret void, !dbg !73
+}
+
+declare void @_Z3fn2iiii(i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!28, !29}
+!llvm.ident = !{!30}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !13, metadata !26, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/<stdin>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<stdin>", metadata !"/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x13\00C\005\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 5, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"recursive_inlining.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce"}
+!6 = metadata !{metadata !7, metadata !9}
+!7 = metadata !{metadata !"0xd\00b\006\0032\0032\000\000", metadata !5, metadata !"_ZTS1C", metadata !8} ; [ DW_TAG_member ] [b] [line 6, size 32, align 32, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x2e\00m_fn2\00m_fn2\00_ZN1C5m_fn2Ev\007\000\000\000\006\00256\001\007", metadata !5, metadata !"_ZTS1C", metadata !10, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [m_fn2]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{null, metadata !12}
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!13 = metadata !{metadata !14, metadata !18, metadata !19, metadata !20, metadata !21, metadata !22}
+!14 = metadata !{metadata !"0x2e\00fn6\00fn6\00_Z3fn6v\0015\000\001\000\006\00256\001\0015", metadata !5, metadata !15, metadata !16, null, void ()* @_Z3fn6v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 15] [def] [fn6]
+!15 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/recursive_inlining.cpp]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{null}
+!18 = metadata !{metadata !"0x2e\00fn3\00fn3\00_Z3fn3v\0020\000\001\000\006\00256\001\0020", metadata !5, metadata !15, metadata !16, null, void ()* @_Z3fn3v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 20] [def] [fn3]
+!19 = metadata !{metadata !"0x2e\00fn4\00fn4\00_Z3fn4v\0021\000\001\000\006\00256\001\0021", metadata !5, metadata !15, metadata !16, null, void ()* @_Z3fn4v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 21] [def] [fn4]
+!20 = metadata !{metadata !"0x2e\00fn5\00fn5\00_Z3fn5v\0022\000\001\000\006\00256\001\0022", metadata !5, metadata !15, metadata !16, null, void ()* @_Z3fn5v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 22] [def] [fn5]
+!21 = metadata !{metadata !"0x2e\00fn7\00fn7\00_Z3fn7v\0014\000\001\000\006\00256\001\0014", metadata !5, metadata !15, metadata !16, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 14] [def] [fn7]
+!22 = metadata !{metadata !"0x2e\00m_fn2\00m_fn2\00_ZN1C5m_fn2Ev\007\000\001\000\006\00256\001\007", metadata !5, metadata !"_ZTS1C", metadata !10, null, void (%struct.C*)* @_ZN1C5m_fn2Ev, null, metadata !9, metadata !23} ; [ DW_TAG_subprogram ] [line 7] [def] [m_fn2]
+!23 = metadata !{metadata !24}
+!24 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !25} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!26 = metadata !{metadata !27}
+!27 = metadata !{metadata !"0x34\00x\00x\00\0013\000\001", null, metadata !15, metadata !25, %struct.C** @x, null} ; [ DW_TAG_variable ] [x] [line 13] [def]
+!28 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!29 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!30 = metadata !{metadata !"clang version 3.6.0 "}
+!31 = metadata !{i32 16, i32 0, metadata !14, null}
+!32 = metadata !{i32 17, i32 0, metadata !14, null}
+!33 = metadata !{metadata !34, metadata !34, i64 0}
+!34 = metadata !{metadata !"any pointer", metadata !35, i64 0}
+!35 = metadata !{metadata !"omnipotent char", metadata !36, i64 0}
+!36 = metadata !{metadata !"Simple C/C++ TBAA"}
+!37 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !25, metadata !32} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!38 = metadata !{i32 0, i32 0, metadata !22, metadata !32}
+!39 = metadata !{i32 8, i32 0, metadata !22, metadata !32}
+!40 = metadata !{i32 9, i32 0, metadata !41, metadata !32}
+!41 = metadata !{metadata !"0xb\009\000\000", metadata !5, metadata !22} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/recursive_inlining.cpp]
+!42 = metadata !{metadata !43, metadata !44, i64 0}
+!43 = metadata !{metadata !"_ZTS1C", metadata !44, i64 0}
+!44 = metadata !{metadata !"int", metadata !35, i64 0}
+!45 = metadata !{i32 9, i32 0, metadata !46, metadata !32}
+!46 = metadata !{metadata !"0xb\009\000\001", metadata !5, metadata !41} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/missing_concrete_variable_on_darwin/reduce/recursive_inlining.cpp]
+!47 = metadata !{i32 10, i32 0, metadata !22, metadata !32}
+!48 = metadata !{i32 19, i32 0, metadata !14, null}
+!49 = metadata !{i32 0, i32 0, metadata !22, null}
+!50 = metadata !{i32 8, i32 0, metadata !22, null}
+!51 = metadata !{i32 9, i32 0, metadata !41, null}
+!52 = metadata !{i32 9, i32 0, metadata !46, null}
+!53 = metadata !{i32 16, i32 0, metadata !14, metadata !54}
+!54 = metadata !{i32 20, i32 0, metadata !18, metadata !55}
+!55 = metadata !{i32 10, i32 0, metadata !22, null}
+!56 = metadata !{i32 17, i32 0, metadata !14, metadata !54}
+!57 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !25, metadata !56} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!58 = metadata !{i32 0, i32 0, metadata !22, metadata !56}
+!59 = metadata !{i32 8, i32 0, metadata !22, metadata !56}
+!60 = metadata !{i32 9, i32 0, metadata !41, metadata !56}
+!61 = metadata !{i32 9, i32 0, metadata !46, metadata !56}
+!62 = metadata !{i32 10, i32 0, metadata !22, metadata !56}
+!63 = metadata !{i32 11, i32 0, metadata !22, null}
+!64 = metadata !{i32 16, i32 0, metadata !14, metadata !65}
+!65 = metadata !{i32 20, i32 0, metadata !18, null}
+!66 = metadata !{i32 17, i32 0, metadata !14, metadata !65}
+!67 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !22, null, metadata !25, metadata !66} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = metadata !{i32 0, i32 0, metadata !22, metadata !66}
+!69 = metadata !{i32 8, i32 0, metadata !22, metadata !66}
+!70 = metadata !{i32 9, i32 0, metadata !41, metadata !66}
+!71 = metadata !{i32 9, i32 0, metadata !46, metadata !66}
+!72 = metadata !{i32 21, i32 0, metadata !19, null}
+!73 = metadata !{i32 22, i32 0, metadata !20, null}

diff --git a/test/DebugInfo/X86/ref_addr_relocation.ll b/test/DebugInfo/X86/ref_addr_relocation.ll
index 76e6aa6..4d77322 100644
--- a/test/DebugInfo/X86/ref_addr_relocation.ll
+++ b/test/DebugInfo/X86/ref_addr_relocation.ll

@@ -23,7 +23,7 @@
 ; CHECK: DW_TAG_variable
 ; CHECK: .long [[TYPE:.*]] # DW_AT_type
 ; CHECK: DW_TAG_structure_type
-; CHECK: debug_info_end0
+; CHECK: debug_info_begin1
 ; CHECK: DW_TAG_compile_unit
 ; CHECK-NOT: DW_TAG_structure_type
 ; This variable's type is in the 1st CU.
@@ -31,7 +31,7 @@
 ; Make sure this is relocatable.
 ; CHECK: .quad .Lsection_info+[[TYPE]] # DW_AT_type
 ; CHECK-NOT: DW_TAG_structure_type
-; CHECK: debug_info_end1
+; CHECK: .section
 
 ; CHECK-DWARF: DW_TAG_compile_unit
 ; CHECK-DWARF: 0x[[ADDR:.*]]: DW_TAG_structure_type
@@ -53,19 +53,19 @@
 !llvm.dbg.cu = !{!0, !9}
 !llvm.module.flags = !{!14, !15}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 191799)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !6, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 191799)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !2, metadata !6, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tu1.cpp", metadata !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !5, null, null, metadata !2, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !"./hdr.h", metadata !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !8, i32 2, metadata !4, i32 0, i32 1, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 2] [def]
-!8 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp]
-!9 = metadata !{i32 786449, metadata !10, i32 4, metadata !"clang version 3.4 (trunk 191799)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !11, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp] [DW_LANG_C_plus_plus]
+!7 = metadata !{metadata !"0x34\00f\00f\00\002\000\001", null, metadata !8, metadata !4, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 2] [def]
+!8 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp]
+!9 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 191799)\000\00\000\00\000", metadata !10, metadata !2, metadata !3, metadata !2, metadata !11, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp] [DW_LANG_C_plus_plus]
 !10 = metadata !{metadata !"tu2.cpp", metadata !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 786484, i32 0, null, metadata !"g", metadata !"g", metadata !"", metadata !13, i32 2, metadata !4, i32 0, i32 1, %struct.foo* @g, null} ; [ DW_TAG_variable ] [g] [line 2] [def]
-!13 = metadata !{i32 786473, metadata !10}        ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp]
+!12 = metadata !{metadata !"0x34\00g\00g\00\002\000\001", null, metadata !13, metadata !4, %struct.foo* @g, null} ; [ DW_TAG_variable ] [g] [line 2] [def]
+!13 = metadata !{metadata !"0x29", metadata !10}        ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp]
 !14 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/reference-argument.ll b/test/DebugInfo/X86/reference-argument.ll
index 4a6bdca..fe268e2 100644
--- a/test/DebugInfo/X86/reference-argument.ll
+++ b/test/DebugInfo/X86/reference-argument.ll

@@ -13,15 +13,15 @@
 %class.A = type { i8 }
 
 declare void @_Z3barR4SVal(%class.SVal* %v)
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 declare i32 @main()
 ; Function Attrs: nounwind ssp uwtable
 define linkonce_odr void @_ZN1A3fooE4SVal(%class.A* %this, %class.SVal* %v) nounwind ssp uwtable align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !59), !dbg !61
-  call void @llvm.dbg.declare(metadata !{%class.SVal* %v}, metadata !62), !dbg !61
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !59, metadata !{metadata !"0x102"}), !dbg !61
+  call void @llvm.dbg.declare(metadata !{%class.SVal* %v}, metadata !62, metadata !{metadata !"0x102"}), !dbg !61
   %this1 = load %class.A** %this.addr
   call void @_Z3barR4SVal(%class.SVal* %v), !dbg !61
   ret void, !dbg !61
@@ -32,72 +32,72 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!47, !68}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [aggregate-indirect-arg.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [aggregate-indirect-arg.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"aggregate-indirect-arg.cpp", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !29, metadata !33, metadata !34, metadata !35}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"_Z3barR4SVal", i32 19, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.SVal*)* @_Z3barR4SVal, null, null, metadata !2, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [bar]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [aggregate-indirect-arg.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barR4SVal\0019\000\001\000\006\00256\000\0019", metadata !1, metadata !5, metadata !6, null, void (%class.SVal*)* @_Z3barR4SVal, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 19] [def] [bar]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [aggregate-indirect-arg.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
-!8 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from SVal]
-!9 = metadata !{i32 786434, metadata !1, null, metadata !"SVal", i32 12, i64 128, i64 64, i32 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_class_type ] [SVal] [line 12, size 128, align 64, offset 0] [def] [from ]
+!8 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from SVal]
+!9 = metadata !{metadata !"0x2\00SVal\0012\00128\0064\000\000\000", metadata !1, null, null, metadata !10, null, null, null} ; [ DW_TAG_class_type ] [SVal] [line 12, size 128, align 64, offset 0] [def] [from ]
 !10 = metadata !{metadata !11, metadata !14, metadata !16, metadata !21, metadata !23}
-!11 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"Data", i32 15, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ] [Data] [line 15, size 64, align 64, offset 0] [from ]
-!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from ]
-!14 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"Kind", i32 16, i64 32, i64 32, i64 64, i32 0, metadata !15} ; [ DW_TAG_member ] [Kind] [line 16, size 32, align 32, offset 64] [from unsigned int]
-!15 = metadata !{i32 786468, null, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!16 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 14, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !20, i32 14} ; [ DW_TAG_subprogram ] [line 14] [~SVal]
-!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0xd\00Data\0015\0064\0064\000\000", metadata !1, metadata !9, metadata !12} ; [ DW_TAG_member ] [Data] [line 15, size 64, align 64, offset 0] [from ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0xd\00Kind\0016\0032\0032\0064\000", metadata !1, metadata !9, metadata !15} ; [ DW_TAG_member ] [Kind] [line 16, size 32, align 32, offset 64] [from unsigned int]
+!15 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!16 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00\0014\000\000\000\006\00256\000\0014", metadata !1, metadata !9, metadata !17, null, null, null, i32 0, metadata !20} ; [ DW_TAG_subprogram ] [line 14] [~SVal]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !19}
-!19 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from SVal]
+!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from SVal]
 !20 = metadata !{i32 786468}
-!21 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"SVal", metadata !"SVal", metadata !"", i32 12, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !22, i32 12} ; [ DW_TAG_subprogram ] [line 12] [SVal]
+!21 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0012\000\000\000\006\00320\000\0012", metadata !1, metadata !9, metadata !17, null, null, null, i32 0, metadata !22} ; [ DW_TAG_subprogram ] [line 12] [SVal]
 !22 = metadata !{i32 786468}
-!23 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"SVal", metadata !"SVal", metadata !"", i32 12, metadata !24, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !28, i32 12} ; [ DW_TAG_subprogram ] [line 12] [SVal]
-!24 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{metadata !"0x2e\00SVal\00SVal\00\0012\000\000\000\006\00320\000\0012", metadata !1, metadata !9, metadata !24, null, null, null, i32 0, metadata !28} ; [ DW_TAG_subprogram ] [line 12] [SVal]
+!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{null, metadata !19, metadata !26}
-!26 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !27} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!27 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from SVal]
+!26 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !27} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from SVal]
 !28 = metadata !{i32 786468}
-!29 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !30, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 25} ; [ DW_TAG_subprogram ] [line 25] [def] [main]
-!30 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!29 = metadata !{metadata !"0x2e\00main\00main\00\0025\000\001\000\006\00256\000\0025", metadata !1, metadata !5, metadata !30, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 25] [def] [main]
+!30 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !31, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !31 = metadata !{metadata !32}
-!32 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!33 = metadata !{i32 786478, metadata !1, null, metadata !"~SVal", metadata !"~SVal", metadata !"_ZN4SValD1Ev", i32 14, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.SVal*)* @_ZN4SValD1Ev, null, metadata !16, metadata !2, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
-!34 = metadata !{i32 786478, metadata !1, null, metadata !"~SVal", metadata !"~SVal", metadata !"_ZN4SValD2Ev", i32 14, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.SVal*)* @_ZN4SValD2Ev, null, metadata !16, metadata !2, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
-!35 = metadata !{i32 786478, metadata !1, null, metadata !"foo", metadata !"foo", metadata !"_ZN1A3fooE4SVal", i32 22, metadata !36, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*, %class.SVal*)* @_ZN1A3fooE4SVal, null, metadata !41, metadata !2, i32 22} ; [ DW_TAG_subprogram ] [line 22] [def] [foo]
-!36 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!32 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!33 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00_ZN4SValD1Ev\0014\000\001\000\006\00256\000\0014", metadata !1, null, metadata !17, null, void (%class.SVal*)* @_ZN4SValD1Ev, null, metadata !16, metadata !2} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
+!34 = metadata !{metadata !"0x2e\00~SVal\00~SVal\00_ZN4SValD2Ev\0014\000\001\000\006\00256\000\0014", metadata !1, null, metadata !17, null, void (%class.SVal*)* @_ZN4SValD2Ev, null, metadata !16, metadata !2} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
+!35 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1A3fooE4SVal\0022\000\001\000\006\00256\000\0022", metadata !1, null, metadata !36, null, void (%class.A*, %class.SVal*)* @_ZN1A3fooE4SVal, null, metadata !41, metadata !2} ; [ DW_TAG_subprogram ] [line 22] [def] [foo]
+!36 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !37, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !37 = metadata !{null, metadata !38, metadata !9}
-!38 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!39 = metadata !{i32 786434, metadata !1, null, metadata !"A", i32 20, i64 8, i64 8, i32 0, i32 0, null, metadata !40, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 20, size 8, align 8, offset 0] [def] [from ]
+!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!39 = metadata !{metadata !"0x2\00A\0020\008\008\000\000\000", metadata !1, null, null, metadata !40, null, null, null} ; [ DW_TAG_class_type ] [A] [line 20, size 8, align 8, offset 0] [def] [from ]
 !40 = metadata !{metadata !41, metadata !43}
-!41 = metadata !{i32 786478, metadata !1, metadata !39, metadata !"foo", metadata !"foo", metadata !"_ZN1A3fooE4SVal", i32 22, metadata !36, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !42, i32 22} ; [ DW_TAG_subprogram ] [line 22] [foo]
+!41 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1A3fooE4SVal\0022\000\000\000\006\00256\000\0022", metadata !1, metadata !39, metadata !36, null, null, null, i32 0, metadata !42} ; [ DW_TAG_subprogram ] [line 22] [foo]
 !42 = metadata !{i32 786468}
-!43 = metadata !{i32 786478, metadata !1, metadata !39, metadata !"A", metadata !"A", metadata !"", i32 20, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !46, i32 20} ; [ DW_TAG_subprogram ] [line 20] [A]
-!44 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!43 = metadata !{metadata !"0x2e\00A\00A\00\0020\000\000\000\006\00320\000\0020", metadata !1, metadata !39, metadata !44, null, null, null, i32 0, metadata !46} ; [ DW_TAG_subprogram ] [line 20] [A]
+!44 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !45, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !45 = metadata !{null, metadata !38}
 !46 = metadata !{i32 786468}
 !47 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!48 = metadata !{i32 786689, metadata !4, metadata !"v", metadata !5, i32 16777235, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [v] [line 19]
+!48 = metadata !{metadata !"0x101\00v\0016777235\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [v] [line 19]
 !49 = metadata !{i32 19, i32 0, metadata !4, null}
-!50 = metadata !{i32 786688, metadata !29, metadata !"v", metadata !5, i32 26, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [v] [line 26]
+!50 = metadata !{metadata !"0x100\00v\0026\000", metadata !29, metadata !5, metadata !9} ; [ DW_TAG_auto_variable ] [v] [line 26]
 !51 = metadata !{i32 26, i32 0, metadata !29, null}
 !52 = metadata !{i32 27, i32 0, metadata !29, null}
 !53 = metadata !{i32 28, i32 0, metadata !29, null}
-!54 = metadata !{i32 786688, metadata !29, metadata !"a", metadata !5, i32 29, metadata !39, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 29]
+!54 = metadata !{metadata !"0x100\00a\0029\000", metadata !29, metadata !5, metadata !39} ; [ DW_TAG_auto_variable ] [a] [line 29]
 !55 = metadata !{i32 29, i32 0, metadata !29, null}
 !56 = metadata !{i32 30, i32 0, metadata !29, null}
 !57 = metadata !{i32 31, i32 0, metadata !29, null}
 !58 = metadata !{i32 32, i32 0, metadata !29, null}
-!59 = metadata !{i32 786689, metadata !35, metadata !"this", metadata !5, i32 16777238, metadata !60, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 22]
-!60 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!59 = metadata !{metadata !"0x101\00this\0016777238\001088", metadata !35, metadata !5, metadata !60} ; [ DW_TAG_arg_variable ] [this] [line 22]
+!60 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
 !61 = metadata !{i32 22, i32 0, metadata !35, null}
-!62 = metadata !{i32 786689, metadata !35, metadata !"v", metadata !5, i32 33554454, metadata !9, i32 8192, i32 0} ; [ DW_TAG_arg_variable ] [v] [line 22]
-!63 = metadata !{i32 786689, metadata !33, metadata !"this", metadata !5, i32 16777230, metadata !64, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 14]
-!64 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from SVal]
+!62 = metadata !{metadata !"0x101\00v\0033554454\008192", metadata !35, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [v] [line 22]
+!63 = metadata !{metadata !"0x101\00this\0016777230\001088", metadata !33, metadata !5, metadata !64} ; [ DW_TAG_arg_variable ] [this] [line 14]
+!64 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from SVal]
 !65 = metadata !{i32 14, i32 0, metadata !33, null}
-!66 = metadata !{i32 786689, metadata !34, metadata !"this", metadata !5, i32 16777230, metadata !64, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 14]
+!66 = metadata !{metadata !"0x101\00this\0016777230\001088", metadata !34, metadata !5, metadata !64} ; [ DW_TAG_arg_variable ] [this] [line 14]
 !67 = metadata !{i32 14, i32 0, metadata !34, null}
-!68 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!68 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/rvalue-ref.ll b/test/DebugInfo/X86/rvalue-ref.ll
index b8ed021..bbee6a2 100644
--- a/test/DebugInfo/X86/rvalue-ref.ll
+++ b/test/DebugInfo/X86/rvalue-ref.ll

@@ -9,33 +9,33 @@
 entry:
   %i.addr = alloca i32*, align 8
   store i32* %i, i32** %i.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32** %i.addr}, metadata !11), !dbg !12
+  call void @llvm.dbg.declare(metadata !{i32** %i.addr}, metadata !11, metadata !{metadata !"0x102"}), !dbg !12
   %0 = load i32** %i.addr, align 8, !dbg !13
   %1 = load i32* %0, align 4, !dbg !13
   %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1), !dbg !13
   ret void, !dbg !15
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @printf(i8*, ...)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{i32 786449, metadata !16, i32 4, metadata !"clang version 3.2 (trunk 157054) (llvm/trunk 157060)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 157054) (llvm/trunk 157060)\000\00\000\00\000", metadata !16, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !16, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooOi", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*)* @_Z3fooOi, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooOi\004\000\001\000\006\00256\000\005", metadata !16, metadata !6, metadata !7, null, void (i32*)* @_Z3fooOi, null, null, metadata !1} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_rvalue_reference_type ]
-!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786689, metadata !5, metadata !"i", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !10} ; [ DW_TAG_rvalue_reference_type ]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0x101\00i\0016777220\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
 !12 = metadata !{i32 4, i32 17, metadata !5, null}
 !13 = metadata !{i32 6, i32 3, metadata !14, null}
-!14 = metadata !{i32 786443, metadata !16, metadata !5, i32 5, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\005\001\000", metadata !16, metadata !5} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 7, i32 1, metadata !14, null}
 !16 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/sret.ll b/test/DebugInfo/X86/sret.ll
index faf5158..7e51183 100644
--- a/test/DebugInfo/X86/sret.ll
+++ b/test/DebugInfo/X86/sret.ll

@@ -3,8 +3,8 @@
 
 ; Based on the debuginfo-tests/sret.cpp code.
 
-; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x5b59949640ec1580)
-; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x5b59949640ec1580)
+; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x51ac5644b1937aa1)
+; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x51ac5644b1937aa1)
 
 %class.A = type { i32 (...)**, i32 }
 %class.B = type { i8 }
@@ -23,9 +23,9 @@
   %this.addr = alloca %class.A*, align 8
   %i.addr = alloca i32, align 4
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !67), !dbg !69
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !67, metadata !{metadata !"0x102"}), !dbg !69
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !70), !dbg !71
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !70, metadata !{metadata !"0x102"}), !dbg !71
   %this1 = load %class.A** %this.addr
   %0 = bitcast %class.A* %this1 to i8***, !dbg !72
   store i8** getelementptr inbounds ([4 x i8*]* @_ZTV1A, i64 0, i64 2), i8*** %0, !dbg !72
@@ -36,7 +36,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind uwtable
 define void @_ZN1AC2ERKS_(%class.A* %this, %class.A* %rhs) unnamed_addr #0 align 2 {
@@ -44,9 +44,9 @@
   %this.addr = alloca %class.A*, align 8
   %rhs.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !74), !dbg !75
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !74, metadata !{metadata !"0x102"}), !dbg !75
   store %class.A* %rhs, %class.A** %rhs.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %rhs.addr}, metadata !76), !dbg !77
+  call void @llvm.dbg.declare(metadata !{%class.A** %rhs.addr}, metadata !76, metadata !{metadata !"0x102"}), !dbg !77
   %this1 = load %class.A** %this.addr
   %0 = bitcast %class.A* %this1 to i8***, !dbg !78
   store i8** getelementptr inbounds ([4 x i8*]* @_ZTV1A, i64 0, i64 2), i8*** %0, !dbg !78
@@ -64,9 +64,9 @@
   %this.addr = alloca %class.A*, align 8
   %rhs.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !80), !dbg !81
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !80, metadata !{metadata !"0x102"}), !dbg !81
   store %class.A* %rhs, %class.A** %rhs.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %rhs.addr}, metadata !82), !dbg !83
+  call void @llvm.dbg.declare(metadata !{%class.A** %rhs.addr}, metadata !82, metadata !{metadata !"0x102"}), !dbg !83
   %this1 = load %class.A** %this.addr
   %0 = load %class.A** %rhs.addr, align 8, !dbg !84
   %m_int = getelementptr inbounds %class.A* %0, i32 0, i32 1, !dbg !84
@@ -81,7 +81,7 @@
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !86), !dbg !87
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !86, metadata !{metadata !"0x102"}), !dbg !87
   %this1 = load %class.A** %this.addr
   %m_int = getelementptr inbounds %class.A* %this1, i32 0, i32 1, !dbg !88
   %0 = load i32* %m_int, align 4, !dbg !88
@@ -95,10 +95,10 @@
   %nrvo = alloca i1
   %cleanup.dest.slot = alloca i32
   store %class.B* %this, %class.B** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.B** %this.addr}, metadata !89), !dbg !91
+  call void @llvm.dbg.declare(metadata !{%class.B** %this.addr}, metadata !89, metadata !{metadata !"0x102"}), !dbg !91
   %this1 = load %class.B** %this.addr
   store i1 false, i1* %nrvo, !dbg !92
-  call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !93), !dbg !92
+  call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !93, metadata !{metadata !"0x102"}), !dbg !92
   call void @_ZN1AC1Ei(%class.A* %agg.result, i32 12), !dbg !92
   store i1 true, i1* %nrvo, !dbg !94
   store i32 1, i32* %cleanup.dest.slot
@@ -118,7 +118,7 @@
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !101), !dbg !102
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !101, metadata !{metadata !"0x102"}), !dbg !102
   %this1 = load %class.A** %this.addr
   ret void, !dbg !103
 }
@@ -138,12 +138,12 @@
   %cleanup.dest.slot = alloca i32
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !104), !dbg !105
+  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !104, metadata !{metadata !"0x102"}), !dbg !105
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !106), !dbg !105
-  call void @llvm.dbg.declare(metadata !{%class.B* %b}, metadata !107), !dbg !108
+  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !106, metadata !{metadata !"0x102"}), !dbg !105
+  call void @llvm.dbg.declare(metadata !{%class.B* %b}, metadata !107, metadata !{metadata !"0x102"}), !dbg !108
   call void @_ZN1BC2Ev(%class.B* %b), !dbg !108
-  call void @llvm.dbg.declare(metadata !{i32* %return_val}, metadata !109), !dbg !110
+  call void @llvm.dbg.declare(metadata !{i32* %return_val}, metadata !109, metadata !{metadata !"0x102"}), !dbg !110
   call void @_ZN1B9AInstanceEv(%class.A* sret %temp.lvalue, %class.B* %b), !dbg !110
   %call = invoke i32 @_ZN1A7get_intEv(%class.A* %temp.lvalue)
           to label %invoke.cont unwind label %lpad, !dbg !110
@@ -151,7 +151,7 @@
 invoke.cont:                                      ; preds = %entry
   call void @_ZN1AD2Ev(%class.A* %temp.lvalue), !dbg !111
   store i32 %call, i32* %return_val, align 4, !dbg !111
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !113), !dbg !114
+  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !113, metadata !{metadata !"0x102"}), !dbg !114
   call void @_ZN1B9AInstanceEv(%class.A* sret %a, %class.B* %b), !dbg !114
   %0 = load i32* %return_val, align 4, !dbg !115
   store i32 %0, i32* %retval, !dbg !115
@@ -193,7 +193,7 @@
 entry:
   %this.addr = alloca %class.B*, align 8
   store %class.B* %this, %class.B** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.B** %this.addr}, metadata !123), !dbg !124
+  call void @llvm.dbg.declare(metadata !{%class.B** %this.addr}, metadata !123, metadata !{metadata !"0x102"}), !dbg !124
   %this1 = load %class.B** %this.addr
   ret void, !dbg !125
 }
@@ -218,7 +218,7 @@
   %exn.slot = alloca i8*
   %ehselector.slot = alloca i32
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !126), !dbg !127
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !126, metadata !{metadata !"0x102"}), !dbg !127
   %this1 = load %class.A** %this.addr
   invoke void @_ZN1AD2Ev(%class.A* %this1)
           to label %invoke.cont unwind label %lpad, !dbg !128
@@ -263,131 +263,131 @@
 !llvm.module.flags = !{!64, !65}
 !llvm.ident = !{!66}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (trunk 203283) (llvm/trunk 203307)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !48, metadata !2, metadata !2, metadata !"sret.dwo", i32 1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/sret.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 203283) (llvm/trunk 203307)\000\00\000\00sret.dwo\001", metadata !1, metadata !2, metadata !3, metadata !48, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/sret.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"sret.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !37}
-!4 = metadata !{i32 786434, metadata !1, null, metadata !"A", i32 1, i64 128, i64 64, i32 0, i32 0, null, metadata !5, i32 0, metadata !"_ZTS1A", null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 128, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00A\001\00128\0064\000\000\000", metadata !1, null, null, metadata !5, metadata !"_ZTS1A", null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 128, align 64, offset 0] [def] [from ]
 !5 = metadata !{metadata !6, metadata !13, metadata !14, metadata !19, metadata !25, metadata !29, metadata !33}
-!6 = metadata !{i32 786445, metadata !1, metadata !7, metadata !"_vptr$A", i32 0, i64 64, i64 0, i64 0, i32 64, metadata !8} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
-!7 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
-!9 = metadata !{i32 786447, null, null, metadata !"__vtbl_ptr_type", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
-!10 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0xd\00_vptr$A\000\0064\000\000\0064", metadata !1, metadata !7, metadata !8} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!7 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!8 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!9 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1A", metadata !"m_int", i32 13, i64 32, i64 32, i64 64, i32 2, metadata !12} ; [ DW_TAG_member ] [m_int] [line 13, size 32, align 32, offset 64] [protected] [from int]
-!14 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"", i32 4, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 4} ; [ DW_TAG_subprogram ] [line 4] [A]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{metadata !"0xd\00m_int\0013\0032\0032\0064\002", metadata !1, metadata !"_ZTS1A", metadata !12} ; [ DW_TAG_member ] [m_int] [line 13, size 32, align 32, offset 64] [protected] [from int]
+!14 = metadata !{metadata !"0x2e\00A\00A\00\004\000\000\000\006\00256\000\004", metadata !1, metadata !"_ZTS1A", metadata !15, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 4] [A]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17, metadata !12}
-!17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!19 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"", i32 5, metadata !20, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 5} ; [ DW_TAG_subprogram ] [line 5] [A]
-!20 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!19 = metadata !{metadata !"0x2e\00A\00A\00\005\000\000\000\006\00256\000\005", metadata !1, metadata !"_ZTS1A", metadata !20, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [A]
+!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !21 = metadata !{null, metadata !17, metadata !22}
-!22 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !23} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
-!23 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
-!25 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"operator=", metadata !"operator=", metadata !"_ZN1AaSERKS_", i32 7, metadata !26, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 7} ; [ DW_TAG_subprogram ] [line 7] [operator=]
-!26 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !23} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
+!25 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN1AaSERKS_\007\000\000\000\006\00256\000\007", metadata !1, metadata !"_ZTS1A", metadata !26, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator=]
+!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !27 = metadata !{metadata !22, metadata !17, metadata !22}
-!29 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"~A", metadata !"~A", metadata !"", i32 8, metadata !30, i1 false, i1 false, i32 1, i32 0, metadata !"_ZTS1A", i32 256, i1 false, null, null, i32 0, null, i32 8} ; [ DW_TAG_subprogram ] [line 8] [~A]
-!30 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!29 = metadata !{metadata !"0x2e\00~A\00~A\00\008\000\000\001\006\00256\000\008", metadata !1, metadata !"_ZTS1A", metadata !30, metadata !"_ZTS1A", null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 8] [~A]
+!30 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !31, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !31 = metadata !{null, metadata !17}
-!33 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"get_int", metadata !"get_int", metadata !"_ZN1A7get_intEv", i32 10, metadata !34, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 10} ; [ DW_TAG_subprogram ] [line 10] [get_int]
-!34 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !35, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = metadata !{metadata !"0x2e\00get_int\00get_int\00_ZN1A7get_intEv\0010\000\000\000\006\00256\000\0010", metadata !1, metadata !"_ZTS1A", metadata !34, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 10] [get_int]
+!34 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !35, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !35 = metadata !{metadata !12, metadata !17}
-!37 = metadata !{i32 786434, metadata !1, null, metadata !"B", i32 38, i64 8, i64 8, i32 0, i32 0, null, metadata !38, i32 0, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 38, size 8, align 8, offset 0] [def] [from ]
+!37 = metadata !{metadata !"0x2\00B\0038\008\008\000\000\000", metadata !1, null, null, metadata !38, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 38, size 8, align 8, offset 0] [def] [from ]
 !38 = metadata !{metadata !39, metadata !44}
-!39 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1B", metadata !"B", metadata !"B", metadata !"", i32 41, metadata !40, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 41} ; [ DW_TAG_subprogram ] [line 41] [B]
-!40 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !41, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!39 = metadata !{metadata !"0x2e\00B\00B\00\0041\000\000\000\006\00256\000\0041", metadata !1, metadata !"_ZTS1B", metadata !40, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 41] [B]
+!40 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !41, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !41 = metadata !{null, metadata !42}
-!42 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
-!44 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1B", metadata !"AInstance", metadata !"AInstance", metadata !"_ZN1B9AInstanceEv", i32 43, metadata !45, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 43} ; [ DW_TAG_subprogram ] [line 43] [AInstance]
-!45 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !46, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!42 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!44 = metadata !{metadata !"0x2e\00AInstance\00AInstance\00_ZN1B9AInstanceEv\0043\000\000\000\006\00256\000\0043", metadata !1, metadata !"_ZTS1B", metadata !45, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 43] [AInstance]
+!45 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !46, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !46 = metadata !{metadata !4, metadata !42}
 !48 = metadata !{metadata !49, metadata !50, metadata !51, metadata !52, metadata !53, metadata !54, metadata !61, metadata !62, metadata !63}
-!49 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"_ZN1AC2Ei", i32 16, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*, i32)* @_ZN1AC2Ei, null, metadata !14, metadata !2, i32 18} ; [ DW_TAG_subprogram ] [line 16] [def] [scope 18] [A]
-!50 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"_ZN1AC2ERKS_", i32 21, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*, %class.A*)* @_ZN1AC2ERKS_, null, metadata !19, metadata !2, i32 23} ; [ DW_TAG_subprogram ] [line 21] [def] [scope 23] [A]
-!51 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"operator=", metadata !"operator=", metadata !"_ZN1AaSERKS_", i32 27, metadata !26, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, %class.A* (%class.A*, %class.A*)* @_ZN1AaSERKS_, null, metadata !25, metadata !2, i32 28} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [operator=]
-!52 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"get_int", metadata !"get_int", metadata !"_ZN1A7get_intEv", i32 33, metadata !34, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%class.A*)* @_ZN1A7get_intEv, null, metadata !33, metadata !2, i32 34} ; [ DW_TAG_subprogram ] [line 33] [def] [scope 34] [get_int]
-!53 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1B", metadata !"AInstance", metadata !"AInstance", metadata !"_ZN1B9AInstanceEv", i32 47, metadata !45, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*, %class.B*)* @_ZN1B9AInstanceEv, null, metadata !44, metadata !2, i32 48} ; [ DW_TAG_subprogram ] [line 47] [def] [scope 48] [AInstance]
-!54 = metadata !{i32 786478, metadata !1, metadata !7, metadata !"main", metadata !"main", metadata !"", i32 53, metadata !55, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !2, i32 54} ; [ DW_TAG_subprogram ] [line 53] [def] [scope 54] [main]
-!55 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !56, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!49 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2Ei\0016\000\001\000\006\00256\000\0018", metadata !1, metadata !"_ZTS1A", metadata !15, null, void (%class.A*, i32)* @_ZN1AC2Ei, null, metadata !14, metadata !2} ; [ DW_TAG_subprogram ] [line 16] [def] [scope 18] [A]
+!50 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2ERKS_\0021\000\001\000\006\00256\000\0023", metadata !1, metadata !"_ZTS1A", metadata !20, null, void (%class.A*, %class.A*)* @_ZN1AC2ERKS_, null, metadata !19, metadata !2} ; [ DW_TAG_subprogram ] [line 21] [def] [scope 23] [A]
+!51 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN1AaSERKS_\0027\000\001\000\006\00256\000\0028", metadata !1, metadata !"_ZTS1A", metadata !26, null, %class.A* (%class.A*, %class.A*)* @_ZN1AaSERKS_, null, metadata !25, metadata !2} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [operator=]
+!52 = metadata !{metadata !"0x2e\00get_int\00get_int\00_ZN1A7get_intEv\0033\000\001\000\006\00256\000\0034", metadata !1, metadata !"_ZTS1A", metadata !34, null, i32 (%class.A*)* @_ZN1A7get_intEv, null, metadata !33, metadata !2} ; [ DW_TAG_subprogram ] [line 33] [def] [scope 34] [get_int]
+!53 = metadata !{metadata !"0x2e\00AInstance\00AInstance\00_ZN1B9AInstanceEv\0047\000\001\000\006\00256\000\0048", metadata !1, metadata !"_ZTS1B", metadata !45, null, void (%class.A*, %class.B*)* @_ZN1B9AInstanceEv, null, metadata !44, metadata !2} ; [ DW_TAG_subprogram ] [line 47] [def] [scope 48] [AInstance]
+!54 = metadata !{metadata !"0x2e\00main\00main\00\0053\000\001\000\006\00256\000\0054", metadata !1, metadata !7, metadata !55, null, i32 (i32, i8**)* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 53] [def] [scope 54] [main]
+!55 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !56, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !56 = metadata !{metadata !12, metadata !12, metadata !57}
-!57 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !58} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!58 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !59} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!59 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !60} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
-!60 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!61 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"~A", metadata !"~A", metadata !"_ZN1AD0Ev", i32 8, metadata !30, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AD0Ev, null, metadata !29, metadata !2, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [~A]
-!62 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1B", metadata !"B", metadata !"B", metadata !"_ZN1BC2Ev", i32 41, metadata !40, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.B*)* @_ZN1BC2Ev, null, metadata !39, metadata !2, i32 41} ; [ DW_TAG_subprogram ] [line 41] [def] [B]
-!63 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"~A", metadata !"~A", metadata !"_ZN1AD2Ev", i32 8, metadata !30, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AD2Ev, null, metadata !29, metadata !2, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [~A]
+!57 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !58} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!58 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !59} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!59 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !60} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char]
+!60 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!61 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD0Ev\008\000\001\000\006\00256\000\008", metadata !1, metadata !"_ZTS1A", metadata !30, null, void (%class.A*)* @_ZN1AD0Ev, null, metadata !29, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [~A]
+!62 = metadata !{metadata !"0x2e\00B\00B\00_ZN1BC2Ev\0041\000\001\000\006\00256\000\0041", metadata !1, metadata !"_ZTS1B", metadata !40, null, void (%class.B*)* @_ZN1BC2Ev, null, metadata !39, metadata !2} ; [ DW_TAG_subprogram ] [line 41] [def] [B]
+!63 = metadata !{metadata !"0x2e\00~A\00~A\00_ZN1AD2Ev\008\000\001\000\006\00256\000\008", metadata !1, metadata !"_ZTS1A", metadata !30, null, void (%class.A*)* @_ZN1AD2Ev, null, metadata !29, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [~A]
 !64 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!65 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!65 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !66 = metadata !{metadata !"clang version 3.5.0 (trunk 203283) (llvm/trunk 203307)"}
-!67 = metadata !{i32 786689, metadata !49, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!68 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!67 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !49, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
 !69 = metadata !{i32 0, i32 0, metadata !49, null}
-!70 = metadata !{i32 786689, metadata !49, metadata !"i", metadata !7, i32 33554448, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 16]
+!70 = metadata !{metadata !"0x101\00i\0033554448\000", metadata !49, metadata !7, metadata !12} ; [ DW_TAG_arg_variable ] [i] [line 16]
 !71 = metadata !{i32 16, i32 0, metadata !49, null}
 !72 = metadata !{i32 18, i32 0, metadata !49, null}
 !73 = metadata !{i32 19, i32 0, metadata !49, null}
-!74 = metadata !{i32 786689, metadata !50, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!74 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !50, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !75 = metadata !{i32 0, i32 0, metadata !50, null}
-!76 = metadata !{i32 786689, metadata !50, metadata !"rhs", metadata !7, i32 33554453, metadata !22, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rhs] [line 21]
+!76 = metadata !{metadata !"0x101\00rhs\0033554453\000", metadata !50, metadata !7, metadata !22} ; [ DW_TAG_arg_variable ] [rhs] [line 21]
 !77 = metadata !{i32 21, i32 0, metadata !50, null}
 !78 = metadata !{i32 23, i32 0, metadata !50, null}
 !79 = metadata !{i32 24, i32 0, metadata !50, null}
-!80 = metadata !{i32 786689, metadata !51, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!80 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !51, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !81 = metadata !{i32 0, i32 0, metadata !51, null}
-!82 = metadata !{i32 786689, metadata !51, metadata !"rhs", metadata !7, i32 33554459, metadata !22, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rhs] [line 27]
+!82 = metadata !{metadata !"0x101\00rhs\0033554459\000", metadata !51, metadata !7, metadata !22} ; [ DW_TAG_arg_variable ] [rhs] [line 27]
 !83 = metadata !{i32 27, i32 0, metadata !51, null}
 !84 = metadata !{i32 29, i32 0, metadata !51, null}
 !85 = metadata !{i32 30, i32 0, metadata !51, null}
-!86 = metadata !{i32 786689, metadata !52, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!86 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !52, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !87 = metadata !{i32 0, i32 0, metadata !52, null}
 !88 = metadata !{i32 35, i32 0, metadata !52, null}
-!89 = metadata !{i32 786689, metadata !53, metadata !"this", null, i32 16777216, metadata !90, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!90 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
+!89 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !53, null, metadata !90} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!90 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
 !91 = metadata !{i32 0, i32 0, metadata !53, null}
 !92 = metadata !{i32 49, i32 0, metadata !53, null}
-!93 = metadata !{i32 786688, metadata !53, metadata !"a", metadata !7, i32 49, metadata !4, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 49]
+!93 = metadata !{metadata !"0x100\00a\0049\008192", metadata !53, metadata !7, metadata !4} ; [ DW_TAG_auto_variable ] [a] [line 49]
 !94 = metadata !{i32 50, i32 0, metadata !53, null}
 !95 = metadata !{i32 51, i32 0, metadata !53, null}
 !96 = metadata !{i32 51, i32 0, metadata !97, null}
-!97 = metadata !{i32 786443, metadata !1, metadata !53, i32 51, i32 0, i32 2, i32 5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!97 = metadata !{metadata !"0xb\0051\000\002", metadata !1, metadata !53} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
 !98 = metadata !{i32 51, i32 0, metadata !99, null}
-!99 = metadata !{i32 786443, metadata !1, metadata !100, i32 51, i32 0, i32 3, i32 6} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!100 = metadata !{i32 786443, metadata !1, metadata !53, i32 51, i32 0, i32 1, i32 4} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!101 = metadata !{i32 786689, metadata !63, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!99 = metadata !{metadata !"0xb\0051\000\003", metadata !1, metadata !100} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!100 = metadata !{metadata !"0xb\0051\000\001", metadata !1, metadata !53} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!101 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !63, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !102 = metadata !{i32 0, i32 0, metadata !63, null}
-!103 = metadata !{i32 8, i32 0, metadata !63, null} ; [ DW_TAG_imported_declaration ]
-!104 = metadata !{i32 786689, metadata !54, metadata !"argc", metadata !7, i32 16777269, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 53]
+!103 = metadata !{i32 8, i32 0, metadata !63, null}
+!104 = metadata !{metadata !"0x101\00argc\0016777269\000", metadata !54, metadata !7, metadata !12} ; [ DW_TAG_arg_variable ] [argc] [line 53]
 !105 = metadata !{i32 53, i32 0, metadata !54, null}
-!106 = metadata !{i32 786689, metadata !54, metadata !"argv", metadata !7, i32 33554485, metadata !57, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 53]
-!107 = metadata !{i32 786688, metadata !54, metadata !"b", metadata !7, i32 55, metadata !37, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 55]
+!106 = metadata !{metadata !"0x101\00argv\0033554485\000", metadata !54, metadata !7, metadata !57} ; [ DW_TAG_arg_variable ] [argv] [line 53]
+!107 = metadata !{metadata !"0x100\00b\0055\000", metadata !54, metadata !7, metadata !37} ; [ DW_TAG_auto_variable ] [b] [line 55]
 !108 = metadata !{i32 55, i32 0, metadata !54, null}
-!109 = metadata !{i32 786688, metadata !54, metadata !"return_val", metadata !7, i32 56, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [return_val] [line 56]
+!109 = metadata !{metadata !"0x100\00return_val\0056\000", metadata !54, metadata !7, metadata !12} ; [ DW_TAG_auto_variable ] [return_val] [line 56]
 !110 = metadata !{i32 56, i32 0, metadata !54, null}
 !111 = metadata !{i32 56, i32 0, metadata !112, null}
-!112 = metadata !{i32 786443, metadata !1, metadata !54, i32 56, i32 0, i32 1, i32 7} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!113 = metadata !{i32 786688, metadata !54, metadata !"a", metadata !7, i32 58, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 58]
-!114 = metadata !{i32 58, i32 0, metadata !54, null} ; [ DW_TAG_imported_module ]
+!112 = metadata !{metadata !"0xb\0056\000\001", metadata !1, metadata !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!113 = metadata !{metadata !"0x100\00a\0058\000", metadata !54, metadata !7, metadata !4} ; [ DW_TAG_auto_variable ] [a] [line 58]
+!114 = metadata !{i32 58, i32 0, metadata !54, null}
 !115 = metadata !{i32 59, i32 0, metadata !54, null}
 !116 = metadata !{i32 60, i32 0, metadata !54, null}
 !117 = metadata !{i32 60, i32 0, metadata !118, null}
-!118 = metadata !{i32 786443, metadata !1, metadata !54, i32 60, i32 0, i32 1, i32 8} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!118 = metadata !{metadata !"0xb\0060\000\001", metadata !1, metadata !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
 !119 = metadata !{i32 60, i32 0, metadata !120, null}
-!120 = metadata !{i32 786443, metadata !1, metadata !54, i32 60, i32 0, i32 3, i32 10} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!120 = metadata !{metadata !"0xb\0060\000\003", metadata !1, metadata !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
 !121 = metadata !{i32 60, i32 0, metadata !122, null}
-!122 = metadata !{i32 786443, metadata !1, metadata !54, i32 60, i32 0, i32 2, i32 9} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!123 = metadata !{i32 786689, metadata !62, metadata !"this", null, i32 16777216, metadata !90, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!122 = metadata !{metadata !"0xb\0060\000\002", metadata !1, metadata !54} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!123 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !62, null, metadata !90} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !124 = metadata !{i32 0, i32 0, metadata !62, null}
 !125 = metadata !{i32 41, i32 0, metadata !62, null}
-!126 = metadata !{i32 786689, metadata !61, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!126 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !61, null, metadata !68} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !127 = metadata !{i32 0, i32 0, metadata !61, null}
-!128 = metadata !{i32 8, i32 0, metadata !61, null} ; [ DW_TAG_imported_declaration ]
-!129 = metadata !{i32 8, i32 0, metadata !130, null} ; [ DW_TAG_imported_declaration ]
-!130 = metadata !{i32 786443, metadata !1, metadata !61, i32 8, i32 0, i32 1, i32 11} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!131 = metadata !{i32 8, i32 0, metadata !132, null} ; [ DW_TAG_imported_declaration ]
-!132 = metadata !{i32 786443, metadata !1, metadata !61, i32 8, i32 0, i32 2, i32 12} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
-!133 = metadata !{i32 8, i32 0, metadata !134, null} ; [ DW_TAG_imported_declaration ]
-!134 = metadata !{i32 786443, metadata !1, metadata !61, i32 8, i32 0, i32 3, i32 13} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!128 = metadata !{i32 8, i32 0, metadata !61, null}
+!129 = metadata !{i32 8, i32 0, metadata !130, null}
+!130 = metadata !{metadata !"0xb\008\000\001", metadata !1, metadata !61} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!131 = metadata !{i32 8, i32 0, metadata !132, null}
+!132 = metadata !{metadata !"0xb\008\000\002", metadata !1, metadata !61} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]
+!133 = metadata !{i32 8, i32 0, metadata !134, null}
+!134 = metadata !{metadata !"0xb\008\000\003", metadata !1, metadata !61} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/sret.cpp]

diff --git a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
index 8816fe7..c98ef28 100644
--- a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
+++ b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll

@@ -60,45 +60,45 @@
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !15), !dbg !16
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
   %0 = load i32* %a.addr, align 4, !dbg !17
   %call = call i32 @fn(i32 %0), !dbg !17
   ret i32 %call, !dbg !17
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @fn(i32 %a) nounwind uwtable ssp {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !19), !dbg !20
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
   %0 = load i32* %a.addr, align 4, !dbg !21
   ret i32 %0, !dbg !21
 }
 
 !llvm.dbg.cu = !{!0, !10}
 !llvm.module.flags = !{!25}
-!0 = metadata !{i32 786449, metadata !23, i32 12, metadata !"clang version 3.3", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !"", i32 1} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3\000\00\000\00\001", metadata !23, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !23, metadata !6, metadata !"test", metadata !"test", metadata !"", i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @test, null, null, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [test]
-!6 = metadata !{i32 786473, metadata !23} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00test\00test\00\002\000\001\000\006\00256\000\003", metadata !23, metadata !6, metadata !7, null, i32 (i32)* @test, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [test]
+!6 = metadata !{metadata !"0x29", metadata !23} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786449, metadata !24, i32 12, metadata !"clang version 3.3 (trunk 172862)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !11, metadata !1,  metadata !1, metadata !"", i32 1} ; [ DW_TAG_compile_unit ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 172862)\000\00\000\00\001", metadata !24, metadata !1, metadata !1, metadata !11, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ]
 !11 = metadata !{metadata !13}
-!13 = metadata !{i32 786478, metadata !24, metadata !14, metadata !"fn", metadata !"fn", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @fn, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [fn]
-!14 = metadata !{i32 786473, metadata !24} ; [ DW_TAG_file_type ]
-!15 = metadata !{i32 786689, metadata !5, metadata !"a", metadata !6, i32 16777218, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 2]
+!13 = metadata !{metadata !"0x2e\00fn\00fn\00\001\000\001\000\006\00256\000\001", metadata !24, metadata !14, metadata !7, null, i32 (i32)* @fn, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 1] [def] [fn]
+!14 = metadata !{metadata !"0x29", metadata !24} ; [ DW_TAG_file_type ]
+!15 = metadata !{metadata !"0x101\00a\0016777218\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [a] [line 2]
 !16 = metadata !{i32 2, i32 0, metadata !5, null}
 !17 = metadata !{i32 4, i32 0, metadata !18, null}
-!18 = metadata !{i32 786443, metadata !23, metadata !5, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 786689, metadata !13, metadata !"a", metadata !14, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!18 = metadata !{metadata !"0xb\003\000\000", metadata !23, metadata !5} ; [ DW_TAG_lexical_block ]
+!19 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !13, metadata !14, metadata !9} ; [ DW_TAG_arg_variable ] [a] [line 1]
 !20 = metadata !{i32 1, i32 0, metadata !13, null}
 !21 = metadata !{i32 2, i32 0, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !24, metadata !13, i32 1, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\001\000\000", metadata !24, metadata !13} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{metadata !"simple.c", metadata !"/private/tmp"}
 !24 = metadata !{metadata !"simple2.c", metadata !"/private/tmp"}
-!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/stmt-list.ll b/test/DebugInfo/X86/stmt-list.ll
index 99bd0fc..2bf4339 100644
--- a/test/DebugInfo/X86/stmt-list.ll
+++ b/test/DebugInfo/X86/stmt-list.ll

@@ -14,10 +14,10 @@
 !llvm.module.flags = !{!7}
 !5 = metadata !{metadata !0}
 
-!0 = metadata !{i32 786478, metadata !6, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!1 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !6, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !5, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !6, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\001\001", metadata !6, metadata !1, metadata !3, null, void ()* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!1 = metadata !{metadata !"0x29", metadata !6} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", metadata !6, metadata !4, metadata !4, metadata !5, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !6, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !6 = metadata !{metadata !"test2.c", metadata !"/home/espindola/llvm"}
-!7 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!7 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/stringpool.ll b/test/DebugInfo/X86/stringpool.ll
index 846d210..9ff4a2a 100644
--- a/test/DebugInfo/X86/stringpool.ll
+++ b/test/DebugInfo/X86/stringpool.ll

@@ -6,12 +6,12 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9}
 
-!0 = metadata !{i32 786449, metadata !8, i32 12, metadata !"clang version 3.1 (trunk 143009)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 143009)\001\00\000\00\000", metadata !8, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720948, i32 0, null, metadata !"yyyy", metadata !"yyyy", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, i32* @yyyy, null} ; [ DW_TAG_variable ]
-!6 = metadata !{i32 720937, metadata !8} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x34\00yyyy\00yyyy\00\001\000\001", null, metadata !6, metadata !7, i32* @yyyy, null} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !"z.c", metadata !"/home/nicholas"}
 
 ; Verify that "yyyy" ended up in the stringpool.
@@ -40,4 +40,4 @@
 ; DARWIN-NEXT:        .byte   9                       ## DW_AT_location
 ; DARWIN-NEXT:        .byte   3
 ; DARWIN-NEXT:        .quad   _yyyy
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/struct-loc.ll b/test/DebugInfo/X86/struct-loc.ll
index 390d8da..4ce04a7 100644
--- a/test/DebugInfo/X86/struct-loc.ll
+++ b/test/DebugInfo/X86/struct-loc.ll

@@ -14,14 +14,14 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.1 (trunk 152837) (llvm/trunk 152845)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 152837) (llvm/trunk 152845)\000\00\000\00\000", metadata !11, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !6, i32 5, metadata !7, i32 0, i32 1, %struct.foo* @f, null} ; [ DW_TAG_variable ]
-!6 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786451, metadata !11, null, metadata !"foo", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x34\00f\00f\00\005\000\001", null, metadata !6, metadata !7, %struct.foo* @f, null} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x13\00foo\001\0032\0032\000\000\000", metadata !11, null, null, metadata !8, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786445, metadata !11, metadata !7, metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !11, metadata !7, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !11 = metadata !{metadata !"struct_bug.c", metadata !"/Users/echristo/tmp"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/subrange-type.ll b/test/DebugInfo/X86/subrange-type.ll
index 14dca46..035e50b 100644
--- a/test/DebugInfo/X86/subrange-type.ll
+++ b/test/DebugInfo/X86/subrange-type.ll

@@ -12,29 +12,29 @@
   %retval = alloca i32, align 4
   %i = alloca [2 x i32], align 4
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{[2 x i32]* %i}, metadata !10), !dbg !15
+  call void @llvm.dbg.declare(metadata !{[2 x i32]* %i}, metadata !10, metadata !{metadata !"0x102"}), !dbg !15
   ret i32 0, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{i32 786449, metadata !17, i32 12, metadata !"clang version 3.3 (trunk 171472) (llvm/trunk 171487)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 171472) (llvm/trunk 171487)\000\00\000\00\000", metadata !17, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [main]
-!6 = metadata !{i32 786473, metadata !17} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\00256\000\003", metadata !6, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [main]
+!6 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786688, metadata !11, metadata !"i", metadata !6, i32 4, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 4]
-!11 = metadata !{i32 786443, metadata !6, metadata !5, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.c]
-!12 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 64, i64 32, i32 0, i32 0, metadata !9, metadata !13, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from int]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x100\00i\004\000", metadata !11, metadata !6, metadata !12} ; [ DW_TAG_auto_variable ] [i] [line 4]
+!11 = metadata !{metadata !"0xb\003\000\000", metadata !6, metadata !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.c]
+!12 = metadata !{metadata !"0x1\00\000\0064\0032\000\000", null, null, metadata !9, metadata !13, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from int]
 !13 = metadata !{metadata !14}
-!14 = metadata !{i32 786465, i64 0, i64 2}        ; [ DW_TAG_subrange_type ] [0, 1]
+!14 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
 !15 = metadata !{i32 4, i32 0, metadata !11, null}
 !16 = metadata !{i32 6, i32 0, metadata !11, null}
 !17 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/subreg.ll b/test/DebugInfo/X86/subreg.ll
index 22fd1a8..a9665cb 100644
--- a/test/DebugInfo/X86/subreg.ll
+++ b/test/DebugInfo/X86/subreg.ll

@@ -10,26 +10,26 @@
 
 define i16 @f(i16 signext %zzz) nounwind {
 entry:
-  call void @llvm.dbg.value(metadata !{i16 %zzz}, i64 0, metadata !0)
+  call void @llvm.dbg.value(metadata !{i16 %zzz}, i64 0, metadata !0, metadata !{metadata !"0x102"})
   %conv = sext i16 %zzz to i32, !dbg !7
   %conv1 = trunc i32 %conv to i16
   ret i16 %conv1
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!11}
 !9 = metadata !{metadata !1}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"zzz", metadata !2, i32 16777219, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i16 (i16)* @f, null, null, null, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !10, i32 12, metadata !"clang version 3.0 ()", i1 false, metadata !"", i32 0, metadata !5, metadata !5, metadata !9, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00zzz\0016777219\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\000\003", metadata !10, metadata !2, metadata !4, null, i16 (i16)* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!2 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\000\00\000\00\001", metadata !10, metadata !5, metadata !5, metadata !9, null,  null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{null}
-!6 = metadata !{i32 786468, null, metadata !3, metadata !"short", i32 0, i64 16, i64 16, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x24\00short\000\0016\0016\000\000\005", null, metadata !3} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 4, i32 22, metadata !8, null}
-!8 = metadata !{i32 786443, metadata !10, metadata !1, i32 3, i32 19, i32 0} ; [ DW_TAG_lexical_block ]
+!8 = metadata !{metadata !"0xb\003\0019\000", metadata !10, metadata !1} ; [ DW_TAG_lexical_block ]
 !10 = metadata !{metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build"}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/subregisters.ll b/test/DebugInfo/X86/subregisters.ll
index d46a95f..dfad9c8 100644
--- a/test/DebugInfo/X86/subregisters.ll
+++ b/test/DebugInfo/X86/subregisters.ll

@@ -40,16 +40,16 @@
 ; Function Attrs: noinline nounwind ssp uwtable
 define void @doSomething(%struct.bar* nocapture readonly %b) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.bar* %b}, i64 0, metadata !15), !dbg !25
+  tail call void @llvm.dbg.value(metadata !{%struct.bar* %b}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !25
   %a1 = getelementptr inbounds %struct.bar* %b, i64 0, i32 0, !dbg !26
   %0 = load i32* %a1, align 4, !dbg !26, !tbaa !27
-  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !16), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !26
   %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %0) #4, !dbg !32
   ret void, !dbg !33
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind
 declare i32 @printf(i8* nocapture readonly, ...) #2
@@ -59,14 +59,14 @@
 entry:
   %myBar = alloca i64, align 8, !dbg !34
   %tmpcast = bitcast i64* %myBar to %struct.bar*, !dbg !34
-  tail call void @llvm.dbg.declare(metadata !{%struct.bar* %tmpcast}, metadata !21), !dbg !34
+  tail call void @llvm.dbg.declare(metadata !{%struct.bar* %tmpcast}, metadata !21, metadata !{metadata !"0x102"}), !dbg !34
   store i64 17179869187, i64* %myBar, align 8, !dbg !34
   call void @doSomething(%struct.bar* %tmpcast), !dbg !35
   ret i32 0, !dbg !36
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { noinline nounwind ssp uwtable }
 attributes #1 = { nounwind readnone }
@@ -78,30 +78,30 @@
 !llvm.module.flags = !{!22, !23}
 !llvm.ident = !{!24}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [subregisters.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [subregisters.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"subregisters.c", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !17}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"doSomething", metadata !"doSomething", metadata !"", i32 10, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%struct.bar*)* @doSomething, null, null, metadata !14, i32 11} ; [ DW_TAG_subprogram ] [line 10] [def] [scope 11] [doSomething]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [subregisters.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00doSomething\00doSomething\00\0010\000\001\000\006\00256\001\0011", metadata !1, metadata !5, metadata !6, null, void (%struct.bar*)* @doSomething, null, null, metadata !14} ; [ DW_TAG_subprogram ] [line 10] [def] [scope 11] [doSomething]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [subregisters.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
-!8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from bar]
-!9 = metadata !{i32 786451, metadata !1, null, metadata !"bar", i32 3, i64 64, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 3, size 64, align 32, offset 0] [def] [from ]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from bar]
+!9 = metadata !{metadata !"0x13\00bar\003\0064\0032\000\000\000", metadata !1, null, null, metadata !10, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 3, size 64, align 32, offset 0] [def] [from ]
 !10 = metadata !{metadata !11, metadata !13}
-!11 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"a", i32 4, i64 32, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ] [a] [line 4, size 32, align 32, offset 0] [from int]
-!12 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"b", i32 5, i64 32, i64 32, i64 32, i32 0, metadata !12} ; [ DW_TAG_member ] [b] [line 5, size 32, align 32, offset 32] [from int]
+!11 = metadata !{metadata !"0xd\00a\004\0032\0032\000\000", metadata !1, metadata !9, metadata !12} ; [ DW_TAG_member ] [a] [line 4, size 32, align 32, offset 0] [from int]
+!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{metadata !"0xd\00b\005\0032\0032\0032\000", metadata !1, metadata !9, metadata !12} ; [ DW_TAG_member ] [b] [line 5, size 32, align 32, offset 32] [from int]
 !14 = metadata !{metadata !15, metadata !16}
-!15 = metadata !{i32 786689, metadata !4, metadata !"b", metadata !5, i32 16777226, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 10]
-!16 = metadata !{i32 786688, metadata !4, metadata !"a", metadata !5, i32 12, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 12]
-!17 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !20, i32 17} ; [ DW_TAG_subprogram ] [line 16] [def] [scope 17] [main]
-!18 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x101\00b\0016777226\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 10]
+!16 = metadata !{metadata !"0x100\00a\0012\000", metadata !4, metadata !5, metadata !12} ; [ DW_TAG_auto_variable ] [a] [line 12]
+!17 = metadata !{metadata !"0x2e\00main\00main\00\0016\000\001\000\006\000\001\0017", metadata !1, metadata !5, metadata !18, null, i32 ()* @main, null, null, metadata !20} ; [ DW_TAG_subprogram ] [line 16] [def] [scope 17] [main]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{metadata !12}
 !20 = metadata !{metadata !21}
-!21 = metadata !{i32 786688, metadata !17, metadata !"myBar", metadata !5, i32 18, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [myBar] [line 18]
+!21 = metadata !{metadata !"0x100\00myBar\0018\000", metadata !17, metadata !5, metadata !9} ; [ DW_TAG_auto_variable ] [myBar] [line 18]
 !22 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !24 = metadata !{metadata !"clang version 3.5 "}
 !25 = metadata !{i32 10, i32 0, metadata !4, null}
 !26 = metadata !{i32 12, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/template.ll b/test/DebugInfo/X86/template.ll
index 54c351c..9652973 100644
--- a/test/DebugInfo/X86/template.ll
+++ b/test/DebugInfo/X86/template.ll

@@ -87,40 +87,40 @@
 !llvm.module.flags = !{!31, !36}
 !llvm.ident = !{!32}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 192849) (llvm/trunk 192850)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !9, metadata !28, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 192849) (llvm/trunk 192850)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !9, metadata !28, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"bar.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"y_impl<int>", i32 2, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, metadata !5, metadata !"_ZTS6y_implIiE"} ; [ DW_TAG_structure_type ] [y_impl<int>] [line 2, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00y_impl<int>\002\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS6y_implIiE"} ; [ DW_TAG_structure_type ] [y_impl<int>] [line 2, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786479, null, metadata !"", metadata !7, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
-!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{i32 786451, metadata !1, metadata !"_ZTS6y_implIiE", metadata !"nested", i32 2, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTSN6y_implIiE6nestedE"} ; [ DW_TAG_structure_type ] [nested] [line 2, size 8, align 8, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x2f\00\000\000", null, metadata !7, null} ; [ DW_TAG_template_type_parameter ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x13\00nested\002\008\008\000\000\000", metadata !1, metadata !"_ZTS6y_implIiE", null, metadata !2, null, null, metadata !"_ZTSN6y_implIiE6nestedE"} ; [ DW_TAG_structure_type ] [nested] [line 2, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !14, metadata !26}
-!10 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 3, metadata !12, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [__cxx_global_var_init]
-!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/bar.cpp]
-!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\003\001\001\000\006\00256\000\003", metadata !1, metadata !11, metadata !12, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [__cxx_global_var_init]
+!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/bar.cpp]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{null}
-!14 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"func<3, &glbl, y_impl, 1, 2>", metadata !"func<3, &glbl, y_impl, 1, 2>", metadata !"_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv", i32 1, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv, metadata !17, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func<3, &glbl, y_impl, 1, 2>]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0x2e\00func<3, &glbl, y_impl, 1, 2>\00func<3, &glbl, y_impl, 1, 2>\00_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv\001\000\001\000\006\00256\000\001", metadata !1, metadata !11, metadata !15, null, i32 ()* @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv, metadata !17, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func<3, &glbl, y_impl, 1, 2>]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !7}
 !17 = metadata !{metadata !18, metadata !19, metadata !21, metadata !22}
-!18 = metadata !{i32 786480, null, metadata !"x", metadata !7, i32 3, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!19 = metadata !{i32 786480, null, metadata !"", metadata !20, i32* @glbl, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!20 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!21 = metadata !{i32 803078, null, metadata !"y", null, metadata !"y_impl", null, i32 0, i32 0} ; [ DW_TAG_GNU_template_template_param ]
-!22 = metadata !{i32 803079, null, metadata !"z", null, metadata !23, null, i32 0, i32 0} ; [ DW_TAG_GNU_template_parameter_pack ]
+!18 = metadata !{metadata !"0x30\00x\000\000", null, metadata !7, i32 3, null} ; [ DW_TAG_template_value_parameter ]
+!19 = metadata !{metadata !"0x30\00\000\000", null, metadata !20, i32* @glbl, null} ; [ DW_TAG_template_value_parameter ]
+!20 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!21 = metadata !{metadata !"0x4106\00y\000\000", null, null, metadata !"y_impl", null} ; [ DW_TAG_GNU_template_template_param ]
+!22 = metadata !{metadata !"0x4107\00z\000\000", null, null, metadata !23, null} ; [ DW_TAG_GNU_template_parameter_pack ]
 !23 = metadata !{metadata !24, metadata !25}
-!24 = metadata !{i32 786480, null, metadata !"", metadata !7, i32 1, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!25 = metadata !{i32 786480, null, metadata !"", metadata !7, i32 2, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!26 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"", metadata !"", metadata !"_GLOBAL__I_a", i32 1, metadata !27, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__I_a, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [local] [def]
-!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{metadata !"0x30\00\000\000", null, metadata !7, i32 1, null} ; [ DW_TAG_template_value_parameter ]
+!25 = metadata !{metadata !"0x30\00\000\000", null, metadata !7, i32 2, null} ; [ DW_TAG_template_value_parameter ]
+!26 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__I_a\001\001\001\000\006\0064\000\001", metadata !1, metadata !11, metadata !27, null, void ()* @_GLOBAL__I_a, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [local] [def]
+!27 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !28 = metadata !{metadata !29, metadata !30}
-!29 = metadata !{i32 786484, i32 0, null, metadata !"glbl", metadata !"glbl", metadata !"", metadata !11, i32 3, metadata !7, i32 0, i32 1, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 3] [def]
-!30 = metadata !{i32 786484, i32 0, null, metadata !"n", metadata !"n", metadata !"", metadata !11, i32 4, metadata !8, i32 0, i32 1, %"struct.y_impl<int>::nested"* @n, null} ; [ DW_TAG_variable ] [n] [line 4] [def]
+!29 = metadata !{metadata !"0x34\00glbl\00glbl\00\003\000\001", null, metadata !11, metadata !7, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 3] [def]
+!30 = metadata !{metadata !"0x34\00n\00n\00\004\000\001", null, metadata !11, metadata !8, %"struct.y_impl<int>::nested"* @n, null} ; [ DW_TAG_variable ] [n] [line 4] [def]
 !31 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !32 = metadata !{metadata !"clang version 3.4 (trunk 192849) (llvm/trunk 192850)"}
 !33 = metadata !{i32 3, i32 0, metadata !10, null}
 !34 = metadata !{i32 1, i32 0, metadata !14, null}
 !35 = metadata !{i32 1, i32 0, metadata !26, null}
-!36 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!36 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/tls.ll b/test/DebugInfo/X86/tls.ll
index e49b12f..cb71797 100644
--- a/test/DebugInfo/X86/tls.ll
+++ b/test/DebugInfo/X86/tls.ll

@@ -81,22 +81,22 @@
 !llvm.module.flags = !{!15, !16}
 !llvm.ident = !{!17}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !12, metadata !2, metadata !"-.dwo"} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/tls.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00-.dwo\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !12, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/tls.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tls.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func<&glbl>", metadata !"func<&glbl>", metadata !"_Z4funcIXadL_Z4glblEEEiv", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z4funcIXadL_Z4glblEEEiv, metadata !9, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [func<&glbl>]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/tls.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00func<&glbl>\00func<&glbl>\00_Z4funcIXadL_Z4glblEEEiv\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, i32 ()* @_Z4funcIXadL_Z4glblEEEiv, metadata !9, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [func<&glbl>]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/tls.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786480, null, metadata !"I", metadata !11, i32* @glbl, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!10 = metadata !{metadata !"0x30\00I\000\000", null, metadata !11, i32* @glbl, null} ; [ DW_TAG_template_value_parameter ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
 !12 = metadata !{metadata !13, metadata !14}
-!13 = metadata !{i32 786484, i32 0, null, metadata !"tls", metadata !"tls", metadata !"", metadata !5, i32 1, metadata !8, i32 0, i32 1, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
-!14 = metadata !{i32 786484, i32 0, null, metadata !"glbl", metadata !"glbl", metadata !"", metadata !5, i32 2, metadata !8, i32 0, i32 1, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 2] [def]
+!13 = metadata !{metadata !"0x34\00tls\00tls\00\001\000\001", null, metadata !5, metadata !8, i32* @tls, null} ; [ DW_TAG_variable ] [tls] [line 1] [def]
+!14 = metadata !{metadata !"0x34\00glbl\00glbl\00\002\000\001", null, metadata !5, metadata !8, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 2] [def]
 !15 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !17 = metadata !{metadata !"clang version 3.5 "}
 !18 = metadata !{i32 6, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/X86/type_units_with_addresses.ll b/test/DebugInfo/X86/type_units_with_addresses.ll
index ff278f6..de7e717 100644
--- a/test/DebugInfo/X86/type_units_with_addresses.ll
+++ b/test/DebugInfo/X86/type_units_with_addresses.ll

@@ -112,40 +112,40 @@
 !llvm.module.flags = !{!34, !35}
 !llvm.ident = !{!36}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !27, metadata !2, metadata !"tu.dwo", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/tu.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00tu.dwo\001", metadata !1, metadata !2, metadata !3, metadata !2, metadata !27, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/tu.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tu.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !9, metadata !12, metadata !13, metadata !17, metadata !18, metadata !19, metadata !23, metadata !24}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"S1<&i>", i32 4, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, metadata !5, metadata !"_ZTS2S1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S1<&i>] [line 4, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00S1<&i>\004\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS2S1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S1<&i>] [line 4, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786480, null, metadata !"I", metadata !7, i32* @i, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!7 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786451, metadata !1, null, metadata !"S2", i32 11, i64 8, i64 8, i32 0, i32 0, null, metadata !10, i32 0, null, null, metadata !"_ZTS2S2"} ; [ DW_TAG_structure_type ] [S2] [line 11, size 8, align 8, offset 0] [def] [from ]
+!6 = metadata !{metadata !"0x30\00I\000\000", null, metadata !7, i32* @i, null} ; [ DW_TAG_template_value_parameter ]
+!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x13\00S2\0011\008\008\000\000\000", metadata !1, null, null, metadata !10, null, null, metadata !"_ZTS2S2"} ; [ DW_TAG_structure_type ] [S2] [line 11, size 8, align 8, offset 0] [def] [from ]
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786445, metadata !1, metadata !"_ZTS2S2", metadata !"s2_1", i32 12, i64 8, i64 8, i64 0, i32 0, metadata !"_ZTS4S2_1IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s2_1] [line 12, size 8, align 8, offset 0] [from _ZTS4S2_1IXadL_Z1iEEE]
-!12 = metadata !{i32 786451, metadata !1, null, metadata !"S2_1<&i>", i32 9, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, metadata !5, metadata !"_ZTS4S2_1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S2_1<&i>] [line 9, size 8, align 8, offset 0] [def] [from ]
-!13 = metadata !{i32 786451, metadata !1, null, metadata !"S3", i32 22, i64 16, i64 8, i32 0, i32 0, null, metadata !14, i32 0, null, null, metadata !"_ZTS2S3"} ; [ DW_TAG_structure_type ] [S3] [line 22, size 16, align 8, offset 0] [def] [from ]
+!11 = metadata !{metadata !"0xd\00s2_1\0012\008\008\000\000", metadata !1, metadata !"_ZTS2S2", metadata !"_ZTS4S2_1IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s2_1] [line 12, size 8, align 8, offset 0] [from _ZTS4S2_1IXadL_Z1iEEE]
+!12 = metadata !{metadata !"0x13\00S2_1<&i>\009\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS4S2_1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S2_1<&i>] [line 9, size 8, align 8, offset 0] [def] [from ]
+!13 = metadata !{metadata !"0x13\00S3\0022\0016\008\000\000\000", metadata !1, null, null, metadata !14, null, null, metadata !"_ZTS2S3"} ; [ DW_TAG_structure_type ] [S3] [line 22, size 16, align 8, offset 0] [def] [from ]
 !14 = metadata !{metadata !15, metadata !16}
-!15 = metadata !{i32 786445, metadata !1, metadata !"_ZTS2S3", metadata !"s3_1", i32 23, i64 8, i64 8, i64 0, i32 0, metadata !"_ZTS4S3_1IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s3_1] [line 23, size 8, align 8, offset 0] [from _ZTS4S3_1IXadL_Z1iEEE]
-!16 = metadata !{i32 786445, metadata !1, metadata !"_ZTS2S3", metadata !"s3_2", i32 24, i64 8, i64 8, i64 8, i32 0, metadata !"_ZTS4S3_2"} ; [ DW_TAG_member ] [s3_2] [line 24, size 8, align 8, offset 8] [from _ZTS4S3_2]
-!17 = metadata !{i32 786451, metadata !1, null, metadata !"S3_1<&i>", i32 18, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, metadata !5, metadata !"_ZTS4S3_1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S3_1<&i>] [line 18, size 8, align 8, offset 0] [def] [from ]
-!18 = metadata !{i32 786451, metadata !1, null, metadata !"S3_2", i32 20, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS4S3_2"} ; [ DW_TAG_structure_type ] [S3_2] [line 20, size 8, align 8, offset 0] [def] [from ]
-!19 = metadata !{i32 786451, metadata !1, null, metadata !"S4", i32 34, i64 16, i64 8, i32 0, i32 0, null, metadata !20, i32 0, null, null, metadata !"_ZTS2S4"} ; [ DW_TAG_structure_type ] [S4] [line 34, size 16, align 8, offset 0] [def] [from ]
+!15 = metadata !{metadata !"0xd\00s3_1\0023\008\008\000\000", metadata !1, metadata !"_ZTS2S3", metadata !"_ZTS4S3_1IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s3_1] [line 23, size 8, align 8, offset 0] [from _ZTS4S3_1IXadL_Z1iEEE]
+!16 = metadata !{metadata !"0xd\00s3_2\0024\008\008\008\000", metadata !1, metadata !"_ZTS2S3", metadata !"_ZTS4S3_2"} ; [ DW_TAG_member ] [s3_2] [line 24, size 8, align 8, offset 8] [from _ZTS4S3_2]
+!17 = metadata !{metadata !"0x13\00S3_1<&i>\0018\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !5, metadata !"_ZTS4S3_1IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S3_1<&i>] [line 18, size 8, align 8, offset 0] [def] [from ]
+!18 = metadata !{metadata !"0x13\00S3_2\0020\008\008\000\000\000", metadata !1, null, null, metadata !2, null, null, metadata !"_ZTS4S3_2"} ; [ DW_TAG_structure_type ] [S3_2] [line 20, size 8, align 8, offset 0] [def] [from ]
+!19 = metadata !{metadata !"0x13\00S4\0034\0016\008\000\000\000", metadata !1, null, null, metadata !20, null, null, metadata !"_ZTS2S4"} ; [ DW_TAG_structure_type ] [S4] [line 34, size 16, align 8, offset 0] [def] [from ]
 !20 = metadata !{metadata !21, metadata !22}
-!21 = metadata !{i32 786445, metadata !1, metadata !"_ZTS2S4", metadata !"s4_1", i32 35, i64 8, i64 8, i64 0, i32 0, metadata !"_ZTS4S4_1"} ; [ DW_TAG_member ] [s4_1] [line 35, size 8, align 8, offset 0] [from _ZTS4S4_1]
-!22 = metadata !{i32 786445, metadata !1, metadata !"_ZTS2S4", metadata !"s4_2", i32 36, i64 8, i64 8, i64 8, i32 0, metadata !"_ZTS4S4_2IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s4_2] [line 36, size 8, align 8, offset 8] [from _ZTS4S4_2IXadL_Z1iEEE]
-!23 = metadata !{i32 786451, metadata !1, null, metadata !"S4_1", i32 29, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS4S4_1"} ; [ DW_TAG_structure_type ] [S4_1] [line 29, size 8, align 8, offset 0] [def] [from ]
-!24 = metadata !{i32 786451, metadata !1, null, metadata !"S4_2<&i>", i32 32, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, metadata !25, metadata !"_ZTS4S4_2IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S4_2<&i>] [line 32, size 8, align 8, offset 0] [def] [from ]
+!21 = metadata !{metadata !"0xd\00s4_1\0035\008\008\000\000", metadata !1, metadata !"_ZTS2S4", metadata !"_ZTS4S4_1"} ; [ DW_TAG_member ] [s4_1] [line 35, size 8, align 8, offset 0] [from _ZTS4S4_1]
+!22 = metadata !{metadata !"0xd\00s4_2\0036\008\008\008\000", metadata !1, metadata !"_ZTS2S4", metadata !"_ZTS4S4_2IXadL_Z1iEEE"} ; [ DW_TAG_member ] [s4_2] [line 36, size 8, align 8, offset 8] [from _ZTS4S4_2IXadL_Z1iEEE]
+!23 = metadata !{metadata !"0x13\00S4_1\0029\008\008\000\000\000", metadata !1, null, null, metadata !2, null, null, metadata !"_ZTS4S4_1"} ; [ DW_TAG_structure_type ] [S4_1] [line 29, size 8, align 8, offset 0] [def] [from ]
+!24 = metadata !{metadata !"0x13\00S4_2<&i>\0032\008\008\000\000\000", metadata !1, null, null, metadata !2, null, metadata !25, metadata !"_ZTS4S4_2IXadL_Z1iEEE"} ; [ DW_TAG_structure_type ] [S4_2<&i>] [line 32, size 8, align 8, offset 0] [def] [from ]
 !25 = metadata !{metadata !26}
-!26 = metadata !{i32 786480, null, metadata !"T", metadata !7, i32* @i, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
+!26 = metadata !{metadata !"0x30\00T\000\000", null, metadata !7, i32* @i, null} ; [ DW_TAG_template_value_parameter ]
 !27 = metadata !{metadata !28, metadata !30, metadata !31, metadata !32, metadata !33}
-!28 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"", metadata !29, i32 1, metadata !8, i32 0, i32 1, i32* @i, null} ; [ DW_TAG_variable ] [i] [line 1] [def]
-!29 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/tu.cpp]
-!30 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !29, i32 6, metadata !"_ZTS2S1IXadL_Z1iEEE", i32 0, i32 1, %struct.S1* @a, null} ; [ DW_TAG_variable ] [a] [line 6] [def]
-!31 = metadata !{i32 786484, i32 0, null, metadata !"s2", metadata !"s2", metadata !"", metadata !29, i32 15, metadata !"_ZTS2S2", i32 0, i32 1, %struct.S2* @s2, null} ; [ DW_TAG_variable ] [s2] [line 15] [def]
-!32 = metadata !{i32 786484, i32 0, null, metadata !"s3", metadata !"s3", metadata !"", metadata !29, i32 27, metadata !"_ZTS2S3", i32 0, i32 1, %struct.S3* @s3, null} ; [ DW_TAG_variable ] [s3] [line 27] [def]
-!33 = metadata !{i32 786484, i32 0, null, metadata !"s4", metadata !"s4", metadata !"", metadata !29, i32 39, metadata !"_ZTS2S4", i32 0, i32 1, %struct.S4* @s4, null} ; [ DW_TAG_variable ] [s4] [line 39] [def]
+!28 = metadata !{metadata !"0x34\00i\00i\00\001\000\001", null, metadata !29, metadata !8, i32* @i, null} ; [ DW_TAG_variable ] [i] [line 1] [def]
+!29 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/tu.cpp]
+!30 = metadata !{metadata !"0x34\00a\00a\00\006\000\001", null, metadata !29, metadata !"_ZTS2S1IXadL_Z1iEEE", %struct.S1* @a, null} ; [ DW_TAG_variable ] [a] [line 6] [def]
+!31 = metadata !{metadata !"0x34\00s2\00s2\00\0015\000\001", null, metadata !29, metadata !"_ZTS2S2", %struct.S2* @s2, null} ; [ DW_TAG_variable ] [s2] [line 15] [def]
+!32 = metadata !{metadata !"0x34\00s3\00s3\00\0027\000\001", null, metadata !29, metadata !"_ZTS2S3", %struct.S3* @s3, null} ; [ DW_TAG_variable ] [s3] [line 27] [def]
+!33 = metadata !{metadata !"0x34\00s4\00s4\00\0039\000\001", null, metadata !29, metadata !"_ZTS2S4", %struct.S4* @s4, null} ; [ DW_TAG_variable ] [s4] [line 39] [def]
 !34 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !36 = metadata !{metadata !"clang version 3.5.0 "}

diff --git a/test/DebugInfo/X86/union-template.ll b/test/DebugInfo/X86/union-template.ll
index 5fdb349..6580a39 100644
--- a/test/DebugInfo/X86/union-template.ll
+++ b/test/DebugInfo/X86/union-template.ll

@@ -16,12 +16,12 @@
   %value.addr = alloca float, align 4
   %tempValue = alloca %"union.PR15637::Value", align 4
   store float %value, float* %value.addr, align 4
-  call void @llvm.dbg.declare(metadata !{float* %value.addr}, metadata !23), !dbg !24
-  call void @llvm.dbg.declare(metadata !{%"union.PR15637::Value"* %tempValue}, metadata !25), !dbg !26
+  call void @llvm.dbg.declare(metadata !{float* %value.addr}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata !{%"union.PR15637::Value"* %tempValue}, metadata !25, metadata !{metadata !"0x102"}), !dbg !26
   ret void, !dbg !27
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
@@ -29,32 +29,32 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!28}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.3 (trunk 178499) (llvm/trunk 178472)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9,  metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 178499) (llvm/trunk 178472)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9,  metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"foo.cc", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"g", metadata !"g", metadata !"_ZN7PR156371gEf", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (float)* @_ZN7PR156371gEf, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [g]
-!5 = metadata !{i32 786489, metadata !1, null, metadata !"PR15637", i32 1} ; [ DW_TAG_namespace ] [PR15637] [line 1]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00g\00g\00_ZN7PR156371gEf\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, void (float)* @_ZN7PR156371gEf, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [g]
+!5 = metadata !{metadata !"0x39\00PR15637\001", metadata !1, null} ; [ DW_TAG_namespace ] [PR15637] [line 1]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!8 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786484, i32 0, metadata !5, metadata !"f", metadata !"f", metadata !"_ZN7PR156371fE", metadata !11, i32 6, metadata !12, i32 0, i32 1, %"union.PR15637::Value"* @_ZN7PR156371fE, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
-!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cc]
-!12 = metadata !{i32 786455, metadata !1, metadata !5, metadata !"Value<float>", i32 2, i64 32, i64 32, i64 0, i32 0, null, metadata !13, i32 0, null, metadata !21, null} ; [ DW_TAG_union_type ] [Value<float>] [line 2, size 32, align 32, offset 0] [def] [from ]
+!10 = metadata !{metadata !"0x34\00f\00f\00_ZN7PR156371fE\006\000\001", metadata !5, metadata !11, metadata !12, %"union.PR15637::Value"* @_ZN7PR156371fE, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
+!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cc]
+!12 = metadata !{metadata !"0x17\00Value<float>\002\0032\0032\000\000\000", metadata !1, metadata !5, null, metadata !13, null, metadata !21, null} ; [ DW_TAG_union_type ] [Value<float>] [line 2, size 32, align 32, offset 0] [def] [from ]
 !13 = metadata !{metadata !14, metadata !16}
-!14 = metadata !{i32 786445, metadata !1, metadata !12, metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !15} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!15 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!16 = metadata !{i32 786478, metadata !1, metadata !12, metadata !"Value", metadata !"Value", metadata !"", i32 2, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !20, i32 2} ; [ DW_TAG_subprogram ] [line 2] [Value]
-!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !12, metadata !15} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!15 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!16 = metadata !{metadata !"0x2e\00Value\00Value\00\002\000\000\000\006\00320\000\002", metadata !1, metadata !12, metadata !17, null, null, null, i32 0, metadata !20} ; [ DW_TAG_subprogram ] [line 2] [Value]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !19}
-!19 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Value<float>]
+!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Value<float>]
 !20 = metadata !{i32 786468}
 !21 = metadata !{metadata !22}
-!22 = metadata !{i32 786479, null, metadata !"T", metadata !8, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
-!23 = metadata !{i32 786689, metadata !4, metadata !"value", metadata !11, i32 16777219, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [value] [line 3]
+!22 = metadata !{metadata !"0x2f\00T\000\000", null, metadata !8, null} ; [ DW_TAG_template_type_parameter ]
+!23 = metadata !{metadata !"0x101\00value\0016777219\000", metadata !4, metadata !11, metadata !8} ; [ DW_TAG_arg_variable ] [value] [line 3]
 !24 = metadata !{i32 3, i32 0, metadata !4, null}
-!25 = metadata !{i32 786688, metadata !4, metadata !"tempValue", metadata !11, i32 4, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [tempValue] [line 4]
+!25 = metadata !{metadata !"0x100\00tempValue\004\000", metadata !4, metadata !11, metadata !12} ; [ DW_TAG_auto_variable ] [tempValue] [line 4]
 !26 = metadata !{i32 4, i32 0, metadata !4, null}
 !27 = metadata !{i32 5, i32 0, metadata !4, null}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/vector.ll b/test/DebugInfo/X86/vector.ll
index a7a1585..cd9fcd0 100644
--- a/test/DebugInfo/X86/vector.ll
+++ b/test/DebugInfo/X86/vector.ll

@@ -12,19 +12,19 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang version 3.3 (trunk 171825) (llvm/trunk 171822)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/echristo/foo.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 (trunk 171825) (llvm/trunk 171822)\000\00\000\00\000", metadata !12, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/foo.c] [DW_LANG_C99]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 3, metadata !7, i32 0, i32 1, <4 x i32>* @a, null} ; [ DW_TAG_variable ] [a] [line 3] [def]
-!6 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786454, metadata !12, null, metadata !"v4si", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ] [v4si] [line 1, size 0, align 0, offset 0] [from ]
-!8 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 2048, metadata !9, metadata !10, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!5 = metadata !{metadata !"0x34\00a\00a\00\003\000\001", null, metadata !6, metadata !7, <4 x i32>* @a, null} ; [ DW_TAG_variable ] [a] [line 3] [def]
+!6 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x16\00v4si\001\000\000\000\000", metadata !12, null, metadata !8} ; [ DW_TAG_typedef ] [v4si] [line 1, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x1\00\000\00128\00128\000\002048", null, null, metadata !9, metadata !10, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
+!11 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
 !12 = metadata !{metadata !"foo.c", metadata !"/Users/echristo"}
 
 ; Check that we get an array type with a vector attribute.
 ; CHECK: DW_TAG_array_type
 ; CHECK-NEXT: DW_AT_GNU_vector
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/X86/vla.ll b/test/DebugInfo/X86/vla.ll
index a1a2e66..be05c3b 100644
--- a/test/DebugInfo/X86/vla.ll
+++ b/test/DebugInfo/X86/vla.ll

@@ -27,13 +27,13 @@
   %saved_stack = alloca i8*
   %cleanup.dest.slot = alloca i32
   store i32 %n, i32* %n.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %n.addr}, metadata !15), !dbg !16
+  call void @llvm.dbg.declare(metadata !{i32* %n.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
   %0 = load i32* %n.addr, align 4, !dbg !17
   %1 = zext i32 %0 to i64, !dbg !17
   %2 = call i8* @llvm.stacksave(), !dbg !17
   store i8* %2, i8** %saved_stack, !dbg !17
   %vla = alloca i32, i64 %1, align 16, !dbg !17
-  call void @llvm.dbg.declare(metadata !{i32* %vla}, metadata !18), !dbg !17
+  call void @llvm.dbg.declare(metadata !{i32* %vla}, metadata !18, metadata !{metadata !"0x102"}), !dbg !17
   %arrayidx = getelementptr inbounds i32* %vla, i64 0, !dbg !22
   store i32 42, i32* %arrayidx, align 4, !dbg !22
   %3 = load i32* %n.addr, align 4, !dbg !23
@@ -48,7 +48,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 ; Function Attrs: nounwind
 declare i8* @llvm.stacksave() nounwind
@@ -64,9 +64,9 @@
   %argv.addr = alloca i8**, align 8
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !25), !dbg !26
+  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !25, metadata !{metadata !"0x102"}), !dbg !26
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !27), !dbg !26
+  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !27, metadata !{metadata !"0x102"}), !dbg !26
   %0 = load i32* %argc.addr, align 4, !dbg !28
   %call = call i32 @vla(i32 %0), !dbg !28
   ret i32 %call, !dbg !28
@@ -75,33 +75,33 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!29}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/vla.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.3 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/vla.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"vla.c", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"vla", metadata !"vla", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @vla, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [vla]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/vla.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00vla\00vla\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @vla, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [vla]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/vla.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!10 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !5, metadata !10, null, i32 (i32, i8**)* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{metadata !8, metadata !8, metadata !12}
-!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!14 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!15 = metadata !{i32 786689, metadata !4, metadata !"n", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [n] [line 1]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!14 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!15 = metadata !{metadata !"0x101\00n\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [n] [line 1]
 !16 = metadata !{i32 1, i32 0, metadata !4, null}
 !17 = metadata !{i32 2, i32 0, metadata !4, null}
-!18 = metadata !{i32 786688, metadata !4, metadata !"a", metadata !5, i32 2, metadata !19, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 2]
-!19 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !8, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!18 = metadata !{metadata !"0x100\00a\002\008192", metadata !4, metadata !5, metadata !19} ; [ DW_TAG_auto_variable ] [a] [line 2]
+!19 = metadata !{metadata !"0x1\00\000\000\0032\000\000", null, null, metadata !8, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !20 = metadata !{metadata !21}
-!21 = metadata !{i32 786465, i64 0, i64 -1}       ; [ DW_TAG_subrange_type ] [unbounded]
+!21 = metadata !{metadata !"0x21\000\00-1"}       ; [ DW_TAG_subrange_type ] [unbounded]
 !22 = metadata !{i32 3, i32 0, metadata !4, null}
 !23 = metadata !{i32 4, i32 0, metadata !4, null}
 !24 = metadata !{i32 5, i32 0, metadata !4, null}
-!25 = metadata !{i32 786689, metadata !9, metadata !"argc", metadata !5, i32 16777223, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 7]
+!25 = metadata !{metadata !"0x101\00argc\0016777223\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [argc] [line 7]
 !26 = metadata !{i32 7, i32 0, metadata !9, null}
-!27 = metadata !{i32 786689, metadata !9, metadata !"argv", metadata !5, i32 33554439, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 7]
+!27 = metadata !{metadata !"0x101\00argv\0033554439\000", metadata !9, metadata !5, metadata !12} ; [ DW_TAG_arg_variable ] [argv] [line 7]
 !28 = metadata !{i32 8, i32 0, metadata !9, null}
-!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/array.ll b/test/DebugInfo/array.ll
index 72b0b99..2c7195b 100644
--- a/test/DebugInfo/array.ll
+++ b/test/DebugInfo/array.ll

@@ -6,34 +6,34 @@
   %retval = alloca i32, align 4
   %a = alloca [0 x i32], align 4
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{[0 x i32]* %a}, metadata !6), !dbg !11
+  call void @llvm.dbg.declare(metadata !{[0 x i32]* %a}, metadata !6, metadata !{metadata !"0x102"}), !dbg !11
   ret i32 0, !dbg !12
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!16}
 
-!0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, null, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
-!1 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !14, i32 12, metadata !"clang version 3.0 (trunk 129138)", i1 false, metadata !"", i32 0, metadata !15, metadata !15, metadata !13, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\000\000\003", metadata !14, metadata !1, metadata !3, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
+!1 = metadata !{metadata !"0x29", metadata !14} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 129138)\000\00\000\00\000", metadata !14, metadata !15, metadata !15, metadata !13, null,  null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !14, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786688, metadata !7, metadata !"a", metadata !1, i32 4, metadata !8, i32 0, null} ; [ DW_TAG_auto_variable ]
-!7 = metadata !{i32 786443, metadata !14, metadata !0, i32 3, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 786433, metadata !14, metadata !2, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !5, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x100\00a\004\000", metadata !7, metadata !1, metadata !8} ; [ DW_TAG_auto_variable ]
+!7 = metadata !{metadata !"0xb\003\0012\000", metadata !14, metadata !0} ; [ DW_TAG_lexical_block ]
+!8 = metadata !{metadata !"0x1\00\000\000\0032\000\000", metadata !14, metadata !2, metadata !5, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !9 = metadata !{metadata !10}
 ;CHECK: DW_TAG_subrange_type
 ;CHECK-NEXT: DW_AT_type
 ;CHECK-NOT: DW_AT_lower_bound
 ;CHECK-NOT: DW_AT_upper_bound
 ;CHECK-NEXT: End Of Children Mark
-!10 = metadata !{i32 786465, i64 0, i64 -1}        ; [ DW_TAG_subrange_type ]
+!10 = metadata !{metadata !"0x21\000\00-1"}        ; [ DW_TAG_subrange_type ]
 !11 = metadata !{i32 4, i32 7, metadata !7, null}
 !12 = metadata !{i32 5, i32 3, metadata !7, null}
 !13 = metadata !{metadata !0}
 !14 = metadata !{metadata !"array.c", metadata !"/private/tmp"}
 !15 = metadata !{i32 0}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/bug_null_debuginfo.ll b/test/DebugInfo/bug_null_debuginfo.ll
index 458fb58..fd22fb3 100644
--- a/test/DebugInfo/bug_null_debuginfo.ll
+++ b/test/DebugInfo/bug_null_debuginfo.ll

@@ -3,6 +3,6 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!2}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"", i1 false, metadata !"", i32 0, null, null, null,  null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00\000\00\000\00\000", metadata !1, null, null, null,  null, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"t", metadata !""}
-!2 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!2 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/constant-pointers.ll b/test/DebugInfo/constant-pointers.ll
index fdde06d..e344fb8 100644
--- a/test/DebugInfo/constant-pointers.ll
+++ b/test/DebugInfo/constant-pointers.ll

@@ -30,22 +30,22 @@
 !llvm.module.flags = !{!15, !16}
 !llvm.ident = !{!17}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/constant-pointers.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/constant-pointers.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"constant-pointers.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func<nullptr, nullptr, 42>", metadata !"func<nullptr, nullptr, 42>", metadata !"_Z4funcILPv0ELPFvvE0ELi42EEvv", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z4funcILPv0ELPFvvE0ELi42EEvv, metadata !8, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [func<nullptr, nullptr, 42>]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/constant-pointers.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00func<nullptr, nullptr, 42>\00func<nullptr, nullptr, 42>\00_Z4funcILPv0ELPFvvE0ELi42EEvv\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, void ()* @_Z4funcILPv0ELPFvvE0ELi42EEvv, metadata !8, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [func<nullptr, nullptr, 42>]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/constant-pointers.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{metadata !9, metadata !11, metadata !13}
-!9 = metadata !{i32 786480, null, metadata !"V", metadata !10, i8 0, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!11 = metadata !{i32 786480, null, metadata !"F", metadata !12, i8 0, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{i32 786480, null, metadata !"i", metadata !14, i32 42, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!14 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x30\00V\000\000", null, metadata !10, i8 0, null} ; [ DW_TAG_template_value_parameter ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!11 = metadata !{metadata !"0x30\00F\000\000", null, metadata !12, i8 0, null} ; [ DW_TAG_template_value_parameter ]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = metadata !{metadata !"0x30\00i\000\000", null, metadata !14, i32 42, null} ; [ DW_TAG_template_value_parameter ]
+!14 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !15 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!16 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!16 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !17 = metadata !{metadata !"clang version 3.5.0 "}
 !18 = metadata !{i32 3, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/cross-cu-inlining.ll b/test/DebugInfo/cross-cu-inlining.ll
index 899558a..f262022 100644
--- a/test/DebugInfo/cross-cu-inlining.ll
+++ b/test/DebugInfo/cross-cu-inlining.ll

@@ -1,6 +1,7 @@
 ; REQUIRES: object-emission
 
-; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck -implicit-check-not=DW_TAG %s
+; RUN: %llc_dwarf -dwarf-accel-tables=Enable -O0 -filetype=obj < %s | llvm-dwarfdump - | FileCheck --check-prefix=CHECK-ACCEL --check-prefix=CHECK %s
 
 ; Build from source:
 ; $ clang++ a.cpp b.cpp -g -c -emit-llvm
@@ -24,12 +25,10 @@
 ; CHECK:   DW_AT_name {{.*}} "a.cpp"
 ; CHECK:   DW_TAG_subprogram
 ; CHECK:     DW_AT_type [DW_FORM_ref_addr] (0x00000000[[INT:.*]])
-; CHECK:     DW_TAG_inlined_subroutine
-; CHECK-NOT: DW_TAG
-; CHECK:       DW_AT_abstract_origin {{.*}}[[ABS_FUNC:........]])
+; CHECK:     0x[[INLINED:[0-9a-f]*]]:{{.*}}DW_TAG_inlined_subroutine
+; CHECK:       DW_AT_abstract_origin {{.*}}[[ABS_FUNC:........]] "_Z4funci"
 ; CHECK:       DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
-; CHECK:         DW_AT_abstract_origin {{.*}}[[ABS_VAR:........]])
+; CHECK:         DW_AT_abstract_origin {{.*}}[[ABS_VAR:........]] "x"
 
 ; Check the abstract definition is in the 'b.cpp' CU and doesn't contain any
 ; concrete information (address range or variable location)
@@ -38,28 +37,31 @@
 ; CHECK: 0x[[ABS_FUNC]]: DW_TAG_subprogram
 ; CHECK-NOT: DW_AT_low_pc
 ; CHECK: 0x[[ABS_VAR]]: DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
 ; CHECK-NOT: DW_AT_location
 ; CHECK: DW_AT_type [DW_FORM_ref4] {{.*}} {0x[[INT]]}
 ; CHECK-NOT: DW_AT_location
 
 ; CHECK: 0x[[INT]]: DW_TAG_base_type
-; CHECK-NOT: DW_TAG
 ; CHECK:   DW_AT_name {{.*}} "int"
 
 ; Check the concrete out of line definition references the abstract and
 ; provides the address range and variable location
-; CHECK: DW_TAG_subprogram
-; CHECK-NOT: DW_TAG
+; CHECK: 0x[[FUNC:[0-9a-f]*]]{{.*}}DW_TAG_subprogram
 ; CHECK:   DW_AT_low_pc
-; CHECK-NOT: DW_TAG
-; CHECK:   DW_AT_abstract_origin {{.*}} {0x[[ABS_FUNC]]}
+; CHECK:   DW_AT_abstract_origin {{.*}} {0x[[ABS_FUNC]]} "_Z4funci"
 ; CHECK:   DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
 ; CHECK:     DW_AT_location
-; CHECK-NOT: DW_TAG
-; CHECK:     DW_AT_abstract_origin {{.*}} {0x[[ABS_VAR]]}
+; CHECK:     DW_AT_abstract_origin {{.*}} {0x[[ABS_VAR]]} "x"
 
+; Check that both the inlined and the out-of-line versions of func are
+; correctly referenced in the accelerator table. Before r221837, the one
+; in the second compilation unit had a wrong offset.
+; CHECK-ACCEL: .apple_names contents:
+; CHECK-ACCEL: Name{{.*}}"func"
+; CHECK-ACCEL-NOT: Name
+; CHECK-ACCEL: Atom[0]{{.*}}[[INLINED]]
+; CHECK-ACCEL-NOT: Name
+; CHECK-ACCEL: Atom[0]{{.*}}[[FUNC]]
 
 @i = external global i32
 
@@ -73,7 +75,7 @@
   %1 = bitcast i32* %x.addr.i to i8*
   call void @llvm.lifetime.start(i64 4, i8* %1)
   store i32 %0, i32* %x.addr.i, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr.i}, metadata !20), !dbg !21
+  call void @llvm.dbg.declare(metadata !{i32* %x.addr.i}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
   %2 = load i32* %x.addr.i, align 4, !dbg !22
   %mul.i = mul nsw i32 %2, 2, !dbg !22
   %3 = bitcast i32* %x.addr.i to i8*, !dbg !22
@@ -86,14 +88,14 @@
 entry:
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !20), !dbg !23
+  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !20, metadata !{metadata !"0x102"}), !dbg !23
   %0 = load i32* %x.addr, align 4, !dbg !24
   %mul = mul nsw i32 %0, 2, !dbg !24
   ret i32 %mul, !dbg !24
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
 
 ; Function Attrs: nounwind
 declare void @llvm.lifetime.start(i64, i8* nocapture) #3
@@ -110,27 +112,27 @@
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18, !18}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"a.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786449, metadata !10, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !11, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !10, metadata !2, metadata !2, metadata !11, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
 !10 = metadata !{metadata !"b.cpp", metadata !"/tmp/dbginfo"}
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 786478, metadata !10, metadata !13, metadata !"func", metadata !"func", metadata !"_Z4funci", i32 1, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4funci, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
-!13 = metadata !{i32 786473, metadata !10}        ; [ DW_TAG_file_type ] [/tmp/dbginfo/b.cpp]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", metadata !10, metadata !13, metadata !14, null, i32 (i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!13 = metadata !{metadata !"0x29", metadata !10}        ; [ DW_TAG_file_type ] [/tmp/dbginfo/b.cpp]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{metadata !8, metadata !8}
 !16 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!17 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !18 = metadata !{metadata !"clang version 3.5.0 "}
 !19 = metadata !{i32 4, i32 0, metadata !4, null}
-!20 = metadata !{i32 786689, metadata !12, metadata !"x", metadata !13, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 1]
+!20 = metadata !{metadata !"0x101\00x\0016777217\000", metadata !12, metadata !13, metadata !8} ; [ DW_TAG_arg_variable ] [x] [line 1]
 !21 = metadata !{i32 1, i32 0, metadata !12, metadata !19}
 !22 = metadata !{i32 2, i32 0, metadata !12, metadata !19}
 !23 = metadata !{i32 1, i32 0, metadata !12, null}

diff --git a/test/DebugInfo/cross-cu-linkonce-distinct.ll b/test/DebugInfo/cross-cu-linkonce-distinct.ll
index 67eb6c0..e19f89c 100644
--- a/test/DebugInfo/cross-cu-linkonce-distinct.ll
+++ b/test/DebugInfo/cross-cu-linkonce-distinct.ll

@@ -52,14 +52,14 @@
 define linkonce_odr i32 @_Z4funci(i32 %i) #0 {
   %1 = alloca i32, align 4
   store i32 %i, i32* %1, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !22), !dbg !23
+  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
   %2 = load i32* %1, align 4, !dbg !24
   %3 = mul nsw i32 %2, 2, !dbg !24
   ret i32 %3, !dbg !24
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { inlinehint nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -68,28 +68,28 @@
 !llvm.module.flags = !{!19, !20}
 !llvm.ident = !{!21, !21}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"a.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_Z4funci", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4funci, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !5, i32 4, metadata !11, i32 0, i32 1, i32 (i32)** @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
-!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!12 = metadata !{i32 786449, metadata !13, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !14, metadata !17, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
+!10 = metadata !{metadata !"0x34\00x\00x\00\004\000\001", null, metadata !5, metadata !11, i32 (i32)** @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!12 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !13, metadata !2, metadata !2, metadata !14, metadata !17, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
 !13 = metadata !{metadata !"b.cpp", metadata !"/tmp/dbginfo"}
 !14 = metadata !{metadata !15}
-!15 = metadata !{i32 786478, metadata !13, metadata !16, metadata !"func", metadata !"func", metadata !"_Z4funci", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4funci, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
-!16 = metadata !{i32 786473, metadata !13}        ; [ DW_TAG_file_type ] [/tmp/dbginfo/b.cpp]
+!15 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", metadata !13, metadata !16, metadata !6, null, i32 (i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!16 = metadata !{metadata !"0x29", metadata !13}        ; [ DW_TAG_file_type ] [/tmp/dbginfo/b.cpp]
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786484, i32 0, null, metadata !"y", metadata !"y", metadata !"", metadata !16, i32 4, metadata !11, i32 0, i32 1, i32 (i32)** @y, null} ; [ DW_TAG_variable ] [y] [line 4] [def]
+!18 = metadata !{metadata !"0x34\00y\00y\00\004\000\001", null, metadata !16, metadata !11, i32 (i32)** @y, null} ; [ DW_TAG_variable ] [y] [line 4] [def]
 !19 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !21 = metadata !{metadata !"clang version 3.5.0 "}
-!22 = metadata !{i32 786689, metadata !4, metadata !"i", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 1]
+!22 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [i] [line 1]
 !23 = metadata !{i32 1, i32 0, metadata !4, null}
 !24 = metadata !{i32 2, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/cross-cu-linkonce.ll b/test/DebugInfo/cross-cu-linkonce.ll
index 16a5012..8beb6fd 100644
--- a/test/DebugInfo/cross-cu-linkonce.ll
+++ b/test/DebugInfo/cross-cu-linkonce.ll

@@ -5,7 +5,6 @@
 ; Built from source:
 ; $ clang++ a.cpp b.cpp -g -c -emit-llvm
 ; $ llvm-link a.bc b.bc -o ab.bc
-; $ opt -inline ab.bc -o ab-opt.bc
 ; $ cat a.cpp
 ; # 1 "func.h"
 ; inline int func(int i) {
@@ -33,14 +32,14 @@
 define linkonce_odr i32 @_Z4funci(i32 %i) #0 {
   %1 = alloca i32, align 4
   store i32 %i, i32* %1, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !20), !dbg !21
+  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
   %2 = load i32* %1, align 4, !dbg !22
   %3 = mul nsw i32 %2, 2, !dbg !22
   ret i32 %3, !dbg !22
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { inlinehint nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -49,26 +48,26 @@
 !llvm.module.flags = !{!17, !18}
 !llvm.ident = !{!19, !19}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !10, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !10, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/a.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"a.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"func", metadata !"func", metadata !"_Z4funci", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4funci, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4funci\001\000\001\000\006\00256\000\001", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @_Z4funci, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
 !5 = metadata !{metadata !"func.h", metadata !"/tmp/dbginfo"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/func.h]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/func.h]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !6, i32 4, metadata !12, i32 0, i32 1, i32 (i32)** @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
-!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{i32 786449, metadata !14, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
+!11 = metadata !{metadata !"0x34\00x\00x\00\004\000\001", null, metadata !6, metadata !12, i32 (i32)** @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !14, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/b.cpp] [DW_LANG_C_plus_plus]
 !14 = metadata !{metadata !"b.cpp", metadata !"/tmp/dbginfo"}
 !15 = metadata !{metadata !16}
-!16 = metadata !{i32 786484, i32 0, null, metadata !"y", metadata !"y", metadata !"", metadata !6, i32 4, metadata !12, i32 0, i32 1, i32 (i32)** @y, null} ; [ DW_TAG_variable ] [y] [line 4] [def]
+!16 = metadata !{metadata !"0x34\00y\00y\00\004\000\001", null, metadata !6, metadata !12, i32 (i32)** @y, null} ; [ DW_TAG_variable ] [y] [line 4] [def]
 !17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !19 = metadata !{metadata !"clang version 3.5.0 "}
-!20 = metadata !{i32 786689, metadata !4, metadata !"i", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 1]
+!20 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [i] [line 1]
 !21 = metadata !{i32 1, i32 0, metadata !4, null}
 !22 = metadata !{i32 2, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/cu-line-tables.ll b/test/DebugInfo/cu-line-tables.ll
deleted file mode 100644
index 2496f3f..0000000
--- a/test/DebugInfo/cu-line-tables.ll
+++ /dev/null

@@ -1,51 +0,0 @@
-; REQUIRES: object-emission
-; RUN: %llc_dwarf -O0 -filetype=obj %s -o %t
-; RUN: llvm-dwarfdump %t | FileCheck %s
-
-; Check that we don't emit ranges if we're emitting line tables only.
-
-; CHECK: DW_TAG_compile_unit
-; CHECK-NOT: DW_AT_ranges
-; CHECK: DW_TAG_subprogram
-
-; FIXME: We probably want to avoid printing out anything if the section isn't there.
-; CHECK: .debug_ranges contents:
-; CHECK-NOT: 00000000 <End of list>
-
-; CHECK: .debug_pubnames contents:
-; CHECK-NOT: Offset
-
-; CHECK: .debug_pubtypes contents:
-; CHECK-NOT: Offset
-
-; Function Attrs: nounwind uwtable
-define i32 @f(i32 %a) #0 {
-entry:
-  %a.addr = alloca i32, align 4
-  store i32 %a, i32* %a.addr, align 4
-  %0 = load i32* %a.addr, align 4, !dbg !14
-  %add = add nsw i32 %0, 4, !dbg !14
-  ret i32 %add, !dbg !14
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!9, !10}
-!llvm.ident = !{!11}
-
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 (trunk 197756) (llvm/trunk 197768)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @f, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!11 = metadata !{metadata !"clang version 3.5 (trunk 197756) (llvm/trunk 197768)"}
-!14 = metadata !{i32 2, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/cu-range-hole.ll b/test/DebugInfo/cu-range-hole.ll
index 65a4956..0bdabba 100644
--- a/test/DebugInfo/cu-range-hole.ll
+++ b/test/DebugInfo/cu-range-hole.ll

@@ -18,7 +18,7 @@
 entry:
   %c.addr = alloca i32, align 4
   store i32 %c, i32* %c.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %c.addr}, metadata !13), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i32* %c.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
   %0 = load i32* %c.addr, align 4, !dbg !14
   %add = add nsw i32 %0, 1, !dbg !14
   ret i32 %add, !dbg !14
@@ -35,14 +35,14 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind uwtable
 define i32 @d(i32 %e) #0 {
 entry:
   %e.addr = alloca i32, align 4
   store i32 %e, i32* %e.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %e.addr}, metadata !15), !dbg !16
+  call void @llvm.dbg.declare(metadata !{i32* %e.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
   %0 = load i32* %e.addr, align 4, !dbg !16
   %add = add nsw i32 %0, 1, !dbg !16
   ret i32 %add, !dbg !16
@@ -56,19 +56,19 @@
 !llvm.module.flags = !{!11, !12}
 
 !0 = metadata !{metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
-!1 = metadata !{i32 786449, metadata !2, i32 12, metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)", i1 false, metadata !"", i32 0, metadata !3, metadata !3, metadata !4, metadata !3, metadata !3, metadata !"", i32 1}
+!1 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", metadata !2, metadata !3, metadata !3, metadata !4, metadata !3, metadata !3} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{metadata !"b.c", metadata !"/usr/local/google/home/echristo"}
 !3 = metadata !{}
 !4 = metadata !{metadata !5, metadata !10}
-!5 = metadata !{i32 786478, metadata !2, metadata !6, metadata !"b", metadata !"b", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @b, null, null, metadata !3, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [b]
-!6 = metadata !{i32 786473, metadata !2}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/b.c]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00b\00b\00\001\000\001\000\006\00256\000\001", metadata !2, metadata !6, metadata !7, null, i32 (i32)* @b, null, null, metadata !3} ; [ DW_TAG_subprogram ] [line 1] [def] [b]
+!6 = metadata !{metadata !"0x29", metadata !2}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/b.c]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786478, metadata !2, metadata !6, metadata !"d", metadata !"d", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @d, null, null, metadata !3, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [d]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x2e\00d\00d\00\003\000\001\000\006\00256\000\003", metadata !2, metadata !6, metadata !7, null, i32 (i32)* @d, null, null, metadata !3} ; [ DW_TAG_subprogram ] [line 3] [def] [d]
 !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
-!13 = metadata !{i32 786689, metadata !5, metadata !"c", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 1]
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
+!13 = metadata !{metadata !"0x101\00c\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [c] [line 1]
 !14 = metadata !{i32 1, i32 0, metadata !5, null}
-!15 = metadata !{i32 786689, metadata !10, metadata !"e", metadata !6, i32 16777219, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [e] [line 3]
+!15 = metadata !{metadata !"0x101\00e\0016777219\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [e] [line 3]
 !16 = metadata !{i32 3, i32 0, metadata !10, null}

diff --git a/test/DebugInfo/cu-ranges.ll b/test/DebugInfo/cu-ranges.ll
index 9262a22..83d176a 100644
--- a/test/DebugInfo/cu-ranges.ll
+++ b/test/DebugInfo/cu-ranges.ll

@@ -22,21 +22,21 @@
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !13), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
   %0 = load i32* %a.addr, align 4, !dbg !15
   %add = add nsw i32 %0, 5, !dbg !15
   ret i32 %add, !dbg !15
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind uwtable
 define i32 @bar(i32 %a) #0 {
 entry:
   %a.addr = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !16), !dbg !17
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !17
   %0 = load i32* %a.addr, align 4, !dbg !18
   %add = add nsw i32 %0, 5, !dbg !18
   ret i32 %add, !dbg !18
@@ -49,23 +49,23 @@
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/foo.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/foo.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @bar, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [bar]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x2e\00bar\00bar\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [bar]
 !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !12 = metadata !{metadata !"clang version 3.5.0 (trunk 204164) (llvm/trunk 204183)"}
-!13 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!13 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 1]
 !14 = metadata !{i32 1, i32 0, metadata !4, null}
 !15 = metadata !{i32 2, i32 0, metadata !4, null}
-!16 = metadata !{i32 786689, metadata !9, metadata !"a", metadata !5, i32 16777221, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 5]
+!16 = metadata !{metadata !"0x101\00a\0016777221\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 5]
 !17 = metadata !{i32 5, i32 0, metadata !9, null}
 !18 = metadata !{i32 6, i32 0, metadata !9, null}
 

diff --git a/test/DebugInfo/dead-argument-order.ll b/test/DebugInfo/dead-argument-order.ll
index ea805a4..2809ccc 100644
--- a/test/DebugInfo/dead-argument-order.ll
+++ b/test/DebugInfo/dead-argument-order.ll

@@ -38,17 +38,17 @@
 ; Function Attrs: nounwind readnone uwtable
 define i32 @_Z8function1Si(i32 %s.coerce, i32 %i) #0 {
 entry:
-  tail call void @llvm.dbg.declare(metadata !19, metadata !14), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !15), !dbg !20
+  tail call void @llvm.dbg.declare(metadata !19, metadata !14, metadata !{metadata !"0x102"}), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !20
   %add = add nsw i32 %i, %s.coerce, !dbg !20
   ret i32 %add, !dbg !20
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -57,24 +57,24 @@
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !8, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dead-argument-order.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !8, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dead-argument-order.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"dead-argument-order.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"S", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1S"} ; [ DW_TAG_structure_type ] [S] [line 1, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00S\001\0032\0032\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1S"} ; [ DW_TAG_structure_type ] [S] [line 1, size 32, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1S", metadata !"i", i32 1, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [i] [line 1, size 32, align 32, offset 0] [from int]
-!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!6 = metadata !{metadata !"0xd\00i\001\0032\0032\000\000", metadata !1, metadata !"_ZTS1S", metadata !7} ; [ DW_TAG_member ] [i] [line 1, size 32, align 32, offset 0] [from int]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"function", metadata !"function", metadata !"_Z8function1Si", i32 2, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32)* @_Z8function1Si, null, null, metadata !13, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [function]
-!10 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dead-argument-order.cpp]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x2e\00function\00function\00_Z8function1Si\002\000\001\000\006\00256\001\002", metadata !1, metadata !10, metadata !11, null, i32 (i32, i32)* @_Z8function1Si, null, null, metadata !13} ; [ DW_TAG_subprogram ] [line 2] [def] [function]
+!10 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/dead-argument-order.cpp]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !7, metadata !4, metadata !7}
 !13 = metadata !{metadata !14, metadata !15}
-!14 = metadata !{i32 786689, metadata !9, metadata !"s", metadata !10, i32 16777218, metadata !"_ZTS1S", i32 0, i32 0} ; [ DW_TAG_arg_variable ] [s] [line 2]
-!15 = metadata !{i32 786689, metadata !9, metadata !"i", metadata !10, i32 33554434, metadata !7, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 2]
+!14 = metadata !{metadata !"0x101\00s\0016777218\000", metadata !9, metadata !10, metadata !"_ZTS1S"} ; [ DW_TAG_arg_variable ] [s] [line 2]
+!15 = metadata !{metadata !"0x101\00i\0033554434\000", metadata !9, metadata !10, metadata !7} ; [ DW_TAG_arg_variable ] [i] [line 2]
 !16 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!17 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !18 = metadata !{metadata !"clang version 3.5.0 "}
 !19 = metadata !{%struct.S* undef}
 !20 = metadata !{i32 2, i32 0, metadata !9, null}

diff --git a/test/DebugInfo/debug-info-always-inline.ll b/test/DebugInfo/debug-info-always-inline.ll
new file mode 100644
index 0000000..57ae079
--- /dev/null
+++ b/test/DebugInfo/debug-info-always-inline.ll

@@ -0,0 +1,143 @@
+; RUN: opt < %s -always-inline -S | FileCheck %s
+;
+; Generated from the following C++ source with:
+; clang -cc1 -disable-llvm-optzns -emit-llvm -g -stack-protector 2 test.cpp
+;
+; /* BEGIN SOURCE */
+; int __attribute__((always_inline)) foo()
+; {
+;    int arr[10];
+;    arr[0] = 5;
+;    int sum = 4;
+;    return sum;
+; }
+; 
+; extern void bar();
+; 
+; int main()
+; {
+;   bar();
+;   int i = foo();
+;   return i;
+; }
+; /* END SOURCE */
+
+; The patch that includes this test case addresses the following issue:
+; 
+; When functions are inlined, instructions without debug information 
+; are attributed with the call site's DebugLoc. After inlining, inlined static
+; allocas are moved to the caller's entry block, adjacent to the caller's original
+; static alloca instructions. By retaining the call site's DebugLoc, these instructions
+; may cause instructions that are subsequently inserted at the entry block to pick
+; up the same DebugLoc.
+;
+; In the offending case stack protection inserts an instruction at the caller's
+; entry block, which inadvertently picks up the inlined call's DebugLoc, because
+; the entry block's first instruction is the recently moved inlined alloca instruction. 
+;
+; The stack protection instruction then becomes part of the function prologue, with the
+; result that the line number that is associated with the stack protection instruction
+; is deemed to be the end of the function prologue. Since this line number is the
+; call site's line number, setting a breakpoint at the function in the debugger
+; will make the user stop at the line of the inlined call.
+
+; Note that without the stack protection instruction this effect would not occur
+; because the allocas all get collapsed into a single instruction that reserves
+; stack space and have no further influence on the prologue's line number information.
+
+
+; The selected solution is to not attribute static allocas with the call site's
+; DebugLoc. 
+
+; At some point in the future, it may be desirable to describe the inlining 
+; in the alloca instructions, but then the code that handles prologues must
+; be able to handle this correctly, including the late insertion of instructions
+; into it.
+
+; In this context it is also important to distinguish between functions
+; with the "nodebug" attribute and those without it. Alloca instructions from
+; nodebug functions should continue to have no DebugLoc, whereas those from
+; non-nodebug functions (i.e. functions with debug information) may want to
+; have their DebugLocs augmented with inlining information.
+
+
+; Make sure that after inlining the call to foo() the alloca instructions for
+; arr.i and sum.i do not retain debug information.
+
+; CHECK: %arr.i = alloca [10 x i32], align {{[0-9]*$}}
+; CHECK: %sum.i = alloca i32, align {{[0-9]*$}}
+
+
+; ModuleID = 'test.cpp'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: alwaysinline nounwind sspstrong
+define i32 @_Z3foov() #0 {
+entry:
+  %arr = alloca [10 x i32], align 16
+  %sum = alloca i32, align 4
+  call void @llvm.dbg.declare(metadata !{[10 x i32]* %arr}, metadata !14), !dbg !18
+  %arrayidx = getelementptr inbounds [10 x i32]* %arr, i32 0, i64 0, !dbg !19
+  store i32 5, i32* %arrayidx, align 4, !dbg !19
+  call void @llvm.dbg.declare(metadata !{i32* %sum}, metadata !20), !dbg !21
+  store i32 4, i32* %sum, align 4, !dbg !21
+  %0 = load i32* %sum, align 4, !dbg !22
+  ret i32 %0, !dbg !22
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+; Function Attrs: nounwind sspstrong
+define i32 @main() #2 {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  call void @_Z3barv(), !dbg !23
+  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !24), !dbg !25
+  %call = call i32 @_Z3foov(), !dbg !25
+  store i32 %call, i32* %i, align 4, !dbg !25
+  %0 = load i32* %i, align 4, !dbg !26
+  ret i32 %0, !dbg !26
+}
+
+declare void @_Z3barv() #3
+
+attributes #0 = { alwaysinline nounwind sspstrong "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind sspstrong "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11, !12}
+!llvm.ident = !{!13}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.6.0 (217844)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/home/user/test/<stdin>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<stdin>", metadata !"/home/user/test"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !10}
+!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [foo]
+!5 = metadata !{metadata !"test.cpp", metadata !"/home/user/test"}
+!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/home/user/test/test.cpp]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 11, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 12} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 12] [main]
+!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!12 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{metadata !"clang version 3.6.0 (217844)"}
+!14 = metadata !{i32 786688, metadata !4, metadata !"arr", metadata !6, i32 3, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [arr] [line 3]
+!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 320, i64 32, i32 0, i32 0, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
+!16 = metadata !{metadata !17}
+!17 = metadata !{i32 786465, i64 0, i64 10}       ; [ DW_TAG_subrange_type ] [0, 9]
+!18 = metadata !{i32 3, i32 0, metadata !4, null}
+!19 = metadata !{i32 4, i32 0, metadata !4, null}
+!20 = metadata !{i32 786688, metadata !4, metadata !"sum", metadata !6, i32 5, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 5]
+!21 = metadata !{i32 5, i32 0, metadata !4, null}
+!22 = metadata !{i32 6, i32 0, metadata !4, null}
+!23 = metadata !{i32 13, i32 0, metadata !10, null}
+!24 = metadata !{i32 786688, metadata !10, metadata !"i", metadata !6, i32 14, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 14]
+!25 = metadata !{i32 14, i32 0, metadata !10, null}
+!26 = metadata !{i32 15, i32 0, metadata !10, null}

diff --git a/test/DebugInfo/debug-info-qualifiers.ll b/test/DebugInfo/debug-info-qualifiers.ll
index b624d38..5b21225 100644
--- a/test/DebugInfo/debug-info-qualifiers.ll
+++ b/test/DebugInfo/debug-info-qualifiers.ll

@@ -39,16 +39,16 @@
   %a = alloca %class.A, align 1
   %pl = alloca { i64, i64 }, align 8
   %pr = alloca { i64, i64 }, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !24), !dbg !25
-  call void @llvm.dbg.declare(metadata !{{ i64, i64 }* %pl}, metadata !26), !dbg !31
+  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata !{{ i64, i64 }* %pl}, metadata !26, metadata !{metadata !"0x102"}), !dbg !31
   store { i64, i64 } { i64 ptrtoint (void (%class.A*)* @_ZNKR1A1lEv to i64), i64 0 }, { i64, i64 }* %pl, align 8, !dbg !31
-  call void @llvm.dbg.declare(metadata !{{ i64, i64 }* %pr}, metadata !32), !dbg !35
+  call void @llvm.dbg.declare(metadata !{{ i64, i64 }* %pr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !35
   store { i64, i64 } { i64 ptrtoint (void (%class.A*)* @_ZNKO1A1rEv to i64), i64 0 }, { i64, i64 }* %pr, align 8, !dbg !35
   ret void, !dbg !36
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare void @_ZNKR1A1lEv(%class.A*)
 
@@ -61,40 +61,40 @@
 !llvm.module.flags = !{!21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !16, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !16, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786434, metadata !5, null, metadata !"A", i32 2, i64 8, i64 8, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00A\002\008\008\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !"debug-info-qualifiers.cpp", metadata !""}
 !6 = metadata !{metadata !7, metadata !13}
-!7 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"l", metadata !"l", metadata !"_ZNKR1A1lEv", i32 5, metadata !8, i1 false, i1 false, i32 0, i32 0, null, i32 16640, i1 false, null, null, i32 0, metadata !12, i32 5} ; [ DW_TAG_subprogram ] [line 5] [reference] [l]
-!8 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 16384, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [reference] [from ]
+!7 = metadata !{metadata !"0x2e\00l\00l\00_ZNKR1A1lEv\005\000\000\000\006\0016640\000\005", metadata !5, metadata !"_ZTS1A", metadata !8, null, null, null, i32 0, metadata !12} ; [ DW_TAG_subprogram ] [line 5] [reference] [l]
+!8 = metadata !{metadata !"0x15\00\000\000\000\000\0016384\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [reference] [from ]
 !9 = metadata !{null, metadata !10}
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
-!11 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
+!11 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS1A]
 !12 = metadata !{i32 786468}
-!13 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"r", metadata !"r", metadata !"_ZNKO1A1rEv", i32 7, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i32 33024, i1 false, null, null, i32 0, metadata !15, i32 7} ; [ DW_TAG_subprogram ] [line 7] [rvalue reference] [r]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 32768, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [rvalue reference] [from ]
+!13 = metadata !{metadata !"0x2e\00r\00r\00_ZNKO1A1rEv\007\000\000\000\006\0033024\000\007", metadata !5, metadata !"_ZTS1A", metadata !14, null, null, null, i32 0, metadata !15} ; [ DW_TAG_subprogram ] [line 7] [rvalue reference] [r]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\0032768\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [rvalue reference] [from ]
 !15 = metadata !{i32 786468}
 !16 = metadata !{metadata !17}
-!17 = metadata !{i32 786478, metadata !5, metadata !18, metadata !"g", metadata !"g", metadata !"_Z1gv", i32 10, metadata !19, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z1gv, null, null, metadata !2, i32 10} ; [ DW_TAG_subprogram ] [line 10] [def] [g]
-!18 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ]
-!19 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0x2e\00g\00g\00_Z1gv\0010\000\001\000\006\00256\000\0010", metadata !5, metadata !18, metadata !19, null, void ()* @_Z1gv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [g]
+!18 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ]
+!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !20 = metadata !{null}
 !21 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !23 = metadata !{metadata !"clang version 3.5 "}
-!24 = metadata !{i32 786688, metadata !17, metadata !"a", metadata !18, i32 11, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 11]
+!24 = metadata !{metadata !"0x100\00a\0011\000", metadata !17, metadata !18, metadata !4} ; [ DW_TAG_auto_variable ] [a] [line 11]
 !25 = metadata !{i32 11, i32 0, metadata !17, null}
-!26 = metadata !{i32 786688, metadata !17, metadata !"pl", metadata !18, i32 16, metadata !27, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pl] [line 16]
-!27 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !28, metadata !"_ZTS1A"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
-!28 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 16384, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [reference] [from ]
+!26 = metadata !{metadata !"0x100\00pl\0016\000", metadata !17, metadata !18, metadata !27} ; [ DW_TAG_auto_variable ] [pl] [line 16]
+!27 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !28, metadata !"_ZTS1A"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{metadata !"0x15\00\000\000\000\000\0016384\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [reference] [from ]
 !29 = metadata !{null, metadata !30}
-!30 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
 !31 = metadata !{i32 16, i32 0, metadata !17, null}
-!32 = metadata !{i32 786688, metadata !17, metadata !"pr", metadata !18, i32 21, metadata !33, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pr] [line 21]
-!33 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !34, metadata !"_ZTS1A"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
-!34 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 32768, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [rvalue reference] [from ]
+!32 = metadata !{metadata !"0x100\00pr\0021\000", metadata !17, metadata !18, metadata !33} ; [ DW_TAG_auto_variable ] [pr] [line 21]
+!33 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !34, metadata !"_ZTS1A"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
+!34 = metadata !{metadata !"0x15\00\000\000\000\000\0032768\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [rvalue reference] [from ]
 !35 = metadata !{i32 21, i32 0, metadata !17, null}
 !36 = metadata !{i32 22, i32 0, metadata !17, null}

diff --git a/test/DebugInfo/debuginfofinder-multiple-cu.ll b/test/DebugInfo/debuginfofinder-multiple-cu.ll
index 74965df..7892306 100644
--- a/test/DebugInfo/debuginfofinder-multiple-cu.ll
+++ b/test/DebugInfo/debuginfofinder-multiple-cu.ll

@@ -22,20 +22,20 @@
 !llvm.dbg.cu = !{!0, !8}
 !llvm.module.flags = !{!13, !16}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (192092)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/test1.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (192092)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/test1.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test1.c", metadata !"/tmp"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @f, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test1.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @f, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test1.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
-!8 = metadata !{i32 786449, metadata !9, i32 12, metadata !"clang version 3.4 (192092)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !10, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/test2.c] [DW_LANG_C99]
+!8 = metadata !{metadata !"0x11\0012\00clang version 3.4 (192092)\000\00\000\00\000", metadata !9, metadata !2, metadata !2, metadata !10, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/test2.c] [DW_LANG_C99]
 !9 = metadata !{metadata !"test2.c", metadata !"/tmp"}
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786478, metadata !9, metadata !12, metadata !"g", metadata !"g", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @g, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [g]
-!12 = metadata !{i32 786473, metadata !9}         ; [ DW_TAG_file_type ] [/tmp/test2.c]
+!11 = metadata !{metadata !"0x2e\00g\00g\00\001\000\001\000\006\000\000\001", metadata !9, metadata !12, metadata !6, null, void ()* @g, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [g]
+!12 = metadata !{metadata !"0x29", metadata !9}         ; [ DW_TAG_file_type ] [/tmp/test2.c]
 !13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !14 = metadata !{i32 1, i32 0, metadata !4, null}
 !15 = metadata !{i32 1, i32 0, metadata !11, null}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/duplicate_inline.ll b/test/DebugInfo/duplicate_inline.ll
new file mode 100644
index 0000000..008b52f
--- /dev/null
+++ b/test/DebugInfo/duplicate_inline.ll

@@ -0,0 +1,117 @@
+; REQUIRES: object-emission
+
+; RUN: %llc_dwarf < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; Built with clang from the following source:
+; void f1(int);
+; __attribute__((always_inline)) inline void f2(int i) { f1(i); }
+;
+; #define MULTICALL \
+;   f2(x);          \
+;   f2(y);
+;
+; void f3(int x, int y) { MULTICALL; }
+
+; FIXME: This produces only one inlined_subroutine, with two formal_parameters
+; (both named "i"), one for each of the actual inlined subroutines.
+; Inlined scopes are differentiated by the combination of 'inlined at' (call)
+; location and the location within the function. If two calls to the same
+; function occur at the same location the scopes end up conflated and there
+; appears to be only one inlined function.
+; To fix this, we'd need to add some kind of unique metadata per call site, possibly something like:
+;
+; !42 = metadata !{i32 1, i32 0, metadata !43, metadata !44}
+; !44 = metadata !{i32 2, i32 0, metadata !45, null}
+;
+; ->
+;
+; !42 = metadata !{i32 1, i32 0, metadata !43, metadata !44}
+; !44 = metadata !{metadata !45, metadata !44}
+; !45 = metadata !{i32 2, i32 0, metadata !45, null}
+;
+; since cycles in metadata are not uniqued, the !44 node would not be shared
+; between calls to the same function from the same location, ensuring separate
+; inlined subroutines would be generated.
+;
+; Once this is done, the (insufficient) hack in clang that adds column
+; information to call sites to differentiate inlined callers can be removed as it
+; will no longer be necessary.
+;
+; While it might be nice to omit the duplicate parameter in this case (while
+; we wait/work on the real fix), it's actually better to leave it in because it
+; allows us to hold the invariant that every DbgVariable has a DIE, every time.
+; This has proved valuable in finding other bugs, so I want to avoid removing the
+; invariant/assertion. Besides, we don't know which one's the right one anyway...
+
+; CHECK: DW_TAG_subprogram
+; CHECK:   DW_TAG_inlined_subroutine
+; CHECK:     DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK:     DW_TAG_formal_parameter
+; CHECK-NOT: DW_TAG
+; CHECK:     NULL
+; CHECK-NOT: DW_TAG
+; CHECK:   NULL
+
+; Function Attrs: uwtable
+define void @_Z2f3ii(i32 %x, i32 %y) #0 {
+entry:
+  %i.addr.i1 = alloca i32, align 4
+  %i.addr.i = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !15, metadata !16), !dbg !17
+  store i32 %y, i32* %y.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %y.addr}, metadata !18, metadata !16), !dbg !19
+  %0 = load i32* %x.addr, align 4, !dbg !20
+  store i32 %0, i32* %i.addr.i, align 4, !dbg !20
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr.i}, metadata !21, metadata !16), !dbg !22
+  %1 = load i32* %i.addr.i, align 4, !dbg !23
+  call void @_Z2f1i(i32 %1), !dbg !23
+  %2 = load i32* %y.addr, align 4, !dbg !20
+  store i32 %2, i32* %i.addr.i1, align 4, !dbg !20
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr.i1}, metadata !21, metadata !16), !dbg !22
+  %3 = load i32* %i.addr.i1, align 4, !dbg !23
+  call void @_Z2f1i(i32 %3), !dbg !23
+  ret void, !dbg !24
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare void @_Z2f1i(i32) #2
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12, !13}
+!llvm.ident = !{!14}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/duplicate_inline.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"duplicate_inline.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !9}
+!4 = metadata !{metadata !"0x2e\00f3\00f3\00_Z2f3ii\008\000\001\000\000\00256\000\008", metadata !1, metadata !5, metadata !6, null, void (i32, i32)* @_Z2f3ii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [f3]
+!5 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/tmp/dbginfo/duplicate_inline.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null, metadata !8, metadata !8}
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x2e\00f2\00f2\00_Z2f2i\002\000\001\000\000\00256\000\002", metadata !1, metadata !5, metadata !10, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f2]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{null, metadata !8}
+!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!13 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!14 = metadata !{metadata !"clang version 3.6.0 "}
+!15 = metadata !{metadata !"0x101\00x\0016777224\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [x] [line 8]
+!16 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
+!17 = metadata !{i32 8, i32 13, metadata !4, null}
+!18 = metadata !{metadata !"0x101\00y\0033554440\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [y] [line 8]
+!19 = metadata !{i32 8, i32 20, metadata !4, null}
+!20 = metadata !{i32 8, i32 25, metadata !4, null}
+!21 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [i] [line 2]
+!22 = metadata !{i32 2, i32 51, metadata !9, metadata !20}
+!23 = metadata !{i32 2, i32 56, metadata !9, metadata !20}
+!24 = metadata !{i32 8, i32 36, metadata !4, null}

diff --git a/test/DebugInfo/dwarf-public-names.ll b/test/DebugInfo/dwarf-public-names.ll
index 7218964..f6d8cd3 100644
--- a/test/DebugInfo/dwarf-public-names.ll
+++ b/test/DebugInfo/dwarf-public-names.ll

@@ -59,13 +59,13 @@
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !28), !dbg !30
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !30
   %this1 = load %struct.C** %this.addr
   store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !31
   ret void, !dbg !32
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @_ZN1C22static_member_functionEv() nounwind uwtable align 2 {
 entry:
@@ -90,36 +90,36 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{i32 786449, metadata !37, i32 4, metadata !"clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !24,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)\000\00\000\00\000", metadata !37, metadata !1, metadata !1, metadata !2, metadata !24,  metadata !1} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{}
 !2 = metadata !{metadata !3, metadata !18, metadata !19, metadata !20}
-!3 = metadata !{i32 786478, metadata !4, null, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 9, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !12, metadata !1, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
-!4 = metadata !{i32 786473, metadata !37} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!3 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\009\000\001\000\006\00256\000\009", metadata !4, null, metadata !5, null, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !12, metadata !1} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
+!4 = metadata !{metadata !"0x29", metadata !37} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{null, metadata !7}
-!7 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
-!8 = metadata !{i32 786451, metadata !37, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!7 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
+!8 = metadata !{metadata !"0x13\00C\001\008\008\000\000\000", metadata !37, null, null, metadata !9, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !12, metadata !14}
-!10 = metadata !{i32 786445, metadata !37, metadata !8, metadata !"static_member_variable", i32 4, i64 0, i64 0, i64 0, i32 4096, metadata !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{i32 786478, metadata !4, metadata !8, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 2, metadata !5, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !13, i32 2} ; [ DW_TAG_subprogram ] [line 2] [member_function]
-!13 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!14 = metadata !{i32 786478, metadata !4, metadata !8, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 3, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !17, i32 3} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
-!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0xd\00static_member_variable\004\000\000\000\004096", metadata !37, metadata !8, metadata !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = metadata !{metadata !"0x2e\00member_function\00member_function\00_ZN1C15member_functionEv\002\000\000\000\006\00256\000\002", metadata !4, metadata !8, metadata !5, null, null, null, i32 0, metadata !13} ; [ DW_TAG_subprogram ] [line 2] [member_function]
+!13 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!14 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\003\000\000\000\006\00256\000\003", metadata !4, metadata !8, metadata !15, null, null, null, i32 0, metadata !17} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !11}
-!17 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!18 = metadata !{i32 786478, metadata !4, null, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 13, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
-!19 = metadata !{i32 786478, metadata !4, metadata !4, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 19, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !1, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
-!20 = metadata !{i32 786478, metadata !4, metadata !21, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 24, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
-!21 = metadata !{i32 786489, metadata !4, null, metadata !"ns", i32 23} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
-!22 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!18 = metadata !{metadata !"0x2e\00static_member_function\00static_member_function\00_ZN1C22static_member_functionEv\0013\000\001\000\006\00256\000\0013", metadata !4, null, metadata !15, null, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
+!19 = metadata !{metadata !"0x2e\00global_function\00global_function\00_Z15global_functionv\0019\000\001\000\006\00256\000\0019", metadata !4, metadata !4, metadata !15, null, i32 ()* @_Z15global_functionv, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
+!20 = metadata !{metadata !"0x2e\00global_namespace_function\00global_namespace_function\00_ZN2ns25global_namespace_functionEv\0024\000\001\000\006\00256\000\0024", metadata !4, metadata !21, metadata !22, null, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
+!21 = metadata !{metadata !"0x39\00ns\0023", metadata !4, null} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !23 = metadata !{null}
 !24 = metadata !{metadata !25, metadata !26, metadata !27}
-!25 = metadata !{i32 786484, i32 0, metadata !8, metadata !"static_member_variable", metadata !"static_member_variable", metadata !"_ZN1C22static_member_variableE", metadata !4, i32 7, metadata !11, i32 0, i32 1, i32* @_ZN1C22static_member_variableE, metadata !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
-!26 = metadata !{i32 786484, i32 0, null, metadata !"global_variable", metadata !"global_variable", metadata !"", metadata !4, i32 17, metadata !8, i32 0, i32 1, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
-!27 = metadata !{i32 786484, i32 0, metadata !21, metadata !"global_namespace_variable", metadata !"global_namespace_variable", metadata !"_ZN2ns25global_namespace_variableE", metadata !4, i32 27, metadata !11, i32 0, i32 1, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
-!28 = metadata !{i32 786689, metadata !3, metadata !"this", metadata !4, i32 16777225, metadata !29, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 9]
-!29 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C]
+!25 = metadata !{metadata !"0x34\00static_member_variable\00static_member_variable\00_ZN1C22static_member_variableE\007\000\001", metadata !8, metadata !4, metadata !11, i32* @_ZN1C22static_member_variableE, metadata !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
+!26 = metadata !{metadata !"0x34\00global_variable\00global_variable\00\0017\000\001", null, metadata !4, metadata !8, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
+!27 = metadata !{metadata !"0x34\00global_namespace_variable\00global_namespace_variable\00_ZN2ns25global_namespace_variableE\0027\000\001", metadata !21, metadata !4, metadata !11, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
+!28 = metadata !{metadata !"0x101\00this\0016777225\001088", metadata !3, metadata !4, metadata !29} ; [ DW_TAG_arg_variable ] [this] [line 9]
+!29 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C]
 !30 = metadata !{i32 9, i32 0, metadata !3, null}
 !31 = metadata !{i32 10, i32 0, metadata !3, null}
 !32 = metadata !{i32 11, i32 0, metadata !3, null}
@@ -128,4 +128,4 @@
 !35 = metadata !{i32 25, i32 0, metadata !20, null}
 !36 = metadata !{i32 26, i32 0, metadata !20, null}
 !37 = metadata !{metadata !"dwarf-public-names.cpp", metadata !"/usr2/kparzysz/s.hex/t"}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/dwarfdump-accel.test b/test/DebugInfo/dwarfdump-accel.test
new file mode 100644
index 0000000..c5c3b01
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-accel.test

@@ -0,0 +1,63 @@
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-objc.x86_64.o | FileCheck %s
+
+Gather some DIE indexes to verify the accelerator table contents.
+CHECK: .debug_info contents
+CHECK: [[TESTINTERFACE:0x[0-9a-f]*]]:{{.*}}DW_TAG_structure_type
+CHECK-NOT: DW_TAG
+CHECK:     DW_AT_name{{.*}}"TestInterface"
+CHECK: [[READONLY:0x[0-9a-f]*]]:{{.*}}DW_TAG_subprogram
+CHECK-NOT: DW_TAG
+CHECK:     DW_AT_name{{.*}}"-[TestInterface ReadOnly]"
+CHECK: [[ASSIGN:0x[0-9a-f]*]]:{{.*}}DW_TAG_subprogram
+CHECK-NOT: DW_TAG
+CHECK:     DW_AT_name{{.*}}"-[TestInterface Assign]"
+CHECK: [[SETASSIGN:0x[0-9a-f]*]]:{{.*}}DW_TAG_subprogram
+CHECK-NOT: DW_TAG
+CHECK:     DW_AT_name{{.*}}"-[TestInterface setAssign:]"
+
+
+Check that the section header is printed correctly.
+CHECK: .apple_names contents:
+CHECK: Magic = 0x48415348
+CHECK: Version = 0x0001
+CHECK: Hash function = 0x00000000
+CHECK: Bucket count = 11
+CHECK: Hashes count = 22
+CHECK: HeaderData length = 12
+CHECK: DIE offset base = 0
+CHECK: Number of atoms = 1
+CHECK: Atom[0]  Type: DW_ATOM_die_offset Form: DW_FORM_data4
+
+Check that empty buckets are handled correctly.
+CHECK: Bucket[2]
+CHECK:   EMPTY
+CHECK: Bucket[3]
+
+Check that the accelerators point to the right DIEs.
+CHECK:     Name:{{.*}}"-[TestInterface ReadOnly]"
+CHECK-NOT: Name
+CHECK:     {Atom[0]: [[READONLY]]}
+CHECK:     Name:{{.*}}"-[TestInterface setAssign:]"
+CHECK-NOT: Name
+CHECK:     {Atom[0]: [[SETASSIGN]]}
+CHECK:     Name:{{.*}}"-[TestInterface Assign]"
+CHECK-NOT: Name
+CHECK:     {Atom[0]: [[ASSIGN]]}
+
+Check that types are referenced correctly.
+CHECK: .apple_types contents:
+CHECK:     Name{{.*}}"TestInterface"
+CHECK-NOT: Name
+CHECK:     {Atom[0]: [[TESTINTERFACE]]}
+
+Check that an empty accelerator section is handled correctly.
+CHECK: .apple_namespaces contents:
+CHECK-NOT: Magic
+
+Check ObjC specific accelerators.
+CHECK: .apple_objc contents:
+CHECK:     Name{{.*}}"TestInterface"
+CHECK-NOT: Name
+CHECK:     {Atom[0]: [[READONLY]]}
+CHECK:     {Atom[0]: [[ASSIGN]]}
+CHECK:     {Atom[0]: [[SETASSIGN]]}

diff --git a/test/DebugInfo/dwarfdump-objc.test b/test/DebugInfo/dwarfdump-objc.test
new file mode 100644
index 0000000..6890c3a
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-objc.test

@@ -0,0 +1,40 @@
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-objc.x86_64.o | FileCheck %s
+
+CHECK:      .debug_info contents:
+
+CHECK: DW_TAG_APPLE_property
+CHECK-NOT: TAG
+CHECK:    DW_AT_APPLE_property_name {{.*}} "ReadOnly"
+CHECK-NOT: TAG
+CHECK:    DW_AT_APPLE_property_attribute {{.*}} (0x01 (DW_APPLE_PROPERTY_readonly))
+
+CHECK: DW_TAG_APPLE_property
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_name {{.*}} "Assign"
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_attribute {{.*}} (0x0c (DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite))
+
+CHECK: DW_TAG_APPLE_property
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_name {{.*}} "ReadWrite"
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_attribute {{.*}} (0x0c (DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite))
+
+CHECK: DW_TAG_APPLE_property
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_name {{.*}} "Retain"
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_attribute {{.*}} (0x18 (DW_APPLE_PROPERTY_readwrite, DW_APPLE_PROPERTY_retain))
+
+CHECK: DW_TAG_APPLE_property
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_name {{.*}} "Copy"
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_attribute {{.*}} (0x28 (DW_APPLE_PROPERTY_readwrite, DW_APPLE_PROPERTY_copy))
+
+CHECK: DW_TAG_APPLE_property
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_name {{.*}} "NonAtomic"
+CHECK-NOT: TAG
+CHECK:   DW_AT_APPLE_property_attribute {{.*}} (0x4c (DW_APPLE_PROPERTY_assign, DW_APPLE_PROPERTY_readwrite, DW_APPLE_PROPERTY_nonatomic))
+

diff --git a/test/DebugInfo/dwarfdump-ranges.test b/test/DebugInfo/dwarfdump-ranges.test
index c9e33dc..710aec6 100644
--- a/test/DebugInfo/dwarfdump-ranges.test
+++ b/test/DebugInfo/dwarfdump-ranges.test

@@ -1,5 +1,19 @@
 RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 | FileCheck %s
 
+CHECK: .debug_info contents:
+CHECK: DW_TAG_compile_unit
+CHECK-NOT: TAG
+CHECK:  DW_AT_ranges [DW_FORM_data4]      (0x00000000
+CHECK-NEXT:          [0x000000000000062c - 0x0000000000000637)
+CHECK-NEXT:          [0x0000000000000637 - 0x000000000000063d))
+
+CHECK: DW_TAG_compile_unit
+CHECK-NOT: TAG
+CHECK:  DW_AT_ranges [DW_FORM_data4]      (0x00000030
+CHECK-NEXT:          [0x0000000000000640 - 0x000000000000064b)
+CHECK-NEXT:          [0x0000000000000637 - 0x000000000000063d))
+
+
 CHECK:      .debug_ranges contents:
 CHECK-NEXT: 00000000 000000000000062c 0000000000000637
 CHECK-NEXT: 00000000 0000000000000637 000000000000063d

diff --git a/test/DebugInfo/empty.ll b/test/DebugInfo/empty.ll
index cf40523..52211af 100644
--- a/test/DebugInfo/empty.ll
+++ b/test/DebugInfo/empty.ll

@@ -24,8 +24,8 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!5}
 
-!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", metadata !4, metadata !2, metadata !2, metadata !2, metadata !2, null} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{}
-!3 = metadata !{i32 786473, metadata !4} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x29", metadata !4} ; [ DW_TAG_file_type ]
 !4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"}
-!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/enum-types.ll b/test/DebugInfo/enum-types.ll
new file mode 100644
index 0000000..787e5f5
--- /dev/null
+++ b/test/DebugInfo/enum-types.ll

@@ -0,0 +1,78 @@
+; REQUIRES: object-emission
+;
+; RUN: %llc_dwarf -filetype=obj -O0 < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; Make sure we can handle enums with the same identifier but in enum types of
+; different compile units.
+; rdar://17628609
+
+; CHECK: DW_TAG_compile_unit
+; CHECK: 0x[[ENUM:.*]]: DW_TAG_enumeration_type
+; CHECK-NEXT:   DW_AT_name {{.*}} "EA"
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_MIPS_linkage_name {{.*}} "_Z4topA2EA"
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x{{.*}} => {0x[[ENUM]]})
+
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_subprogram
+; CHECK:   DW_AT_MIPS_linkage_name {{.*}} "_Z4topB2EA"
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_AT_type [DW_FORM_ref_addr] {{.*}}[[ENUM]]
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z4topA2EA(i32 %sa) #0 {
+entry:
+  %sa.addr = alloca i32, align 4
+  store i32 %sa, i32* %sa.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %sa.addr}, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
+  ret void, !dbg !24
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z4topB2EA(i32 %sa) #0 {
+entry:
+  %sa.addr = alloca i32, align 4
+  store i32 %sa, i32* %sa.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %sa.addr}, metadata !25, metadata !{metadata !"0x102"}), !dbg !26
+  ret void, !dbg !27
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0, !12}
+!llvm.module.flags = !{!19, !20}
+!llvm.ident = !{!21, !21}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 214102:214133) (llvm/trunk 214102:214132)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !6, metadata !11, metadata !11} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"a.cpp", metadata !""}
+!2 = metadata !{metadata !3}
+!3 = metadata !{metadata !"0x4\00EA\001\0032\0032\000\000\000", metadata !1, null, null, metadata !4, null, null, metadata !"_ZTS2EA"} ; [ DW_TAG_enumeration_type ] [EA] [line 1, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{metadata !"0x28\00EA_0\000"} ; [ DW_TAG_enumerator ] [EA_0 :: 0]
+!6 = metadata !{metadata !7}
+!7 = metadata !{metadata !"0x2e\00topA\00topA\00_Z4topA2EA\005\000\001\000\006\00256\000\005", metadata !1, metadata !8, metadata !9, null, void (i32)* @_Z4topA2EA, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 5] [def] [topA]
+!8 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [a.cpp]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{null, metadata !"_ZTS2EA"}
+!11 = metadata !{}
+!12 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 214102:214133) (llvm/trunk 214102:214132)\000\00\000\00\001", metadata !13, metadata !14, metadata !14, metadata !16, metadata !11, metadata !11} ; [ DW_TAG_compile_unit ] [b.cpp] [DW_LANG_C_plus_plus]
+!13 = metadata !{metadata !"b.cpp", metadata !""}
+!14 = metadata !{metadata !15}
+!15 = metadata !{metadata !"0x4\00EA\001\0032\0032\000\000\000", metadata !13, null, null, metadata !4, null, null, metadata !"_ZTS2EA"} ; [ DW_TAG_enumeration_type ] [EA] [line 1, size 32, align 32, offset 0] [def] [from ]
+!16 = metadata !{metadata !17}
+!17 = metadata !{metadata !"0x2e\00topB\00topB\00_Z4topB2EA\005\000\001\000\006\00256\000\005", metadata !13, metadata !18, metadata !9, null, void (i32)* @_Z4topB2EA, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 5] [def] [topB]
+!18 = metadata !{metadata !"0x29", metadata !13}        ; [ DW_TAG_file_type ] [b.cpp]
+!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!20 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!21 = metadata !{metadata !"clang version 3.5.0 (trunk 214102:214133) (llvm/trunk 214102:214132)"}
+!22 = metadata !{metadata !"0x101\00sa\0016777221\000", metadata !7, metadata !8, metadata !"_ZTS2EA"} ; [ DW_TAG_arg_variable ] [sa] [line 5]
+!23 = metadata !{i32 5, i32 14, metadata !7, null}
+!24 = metadata !{i32 6, i32 1, metadata !7, null}
+!25 = metadata !{metadata !"0x101\00sa\0016777221\000", metadata !17, metadata !18, metadata !"_ZTS2EA"} ; [ DW_TAG_arg_variable ] [sa] [line 5]
+!26 = metadata !{i32 5, i32 14, metadata !17, null}
+!27 = metadata !{i32 6, i32 1, metadata !17, null}

diff --git a/test/DebugInfo/enum.ll b/test/DebugInfo/enum.ll
index df097a6..a64795c 100644
--- a/test/DebugInfo/enum.ll
+++ b/test/DebugInfo/enum.ll

@@ -39,13 +39,13 @@
 define void @_Z4funcv() #0 {
 entry:
   %b = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b}, metadata !20), !dbg !22
+  call void @llvm.dbg.declare(metadata !{i32* %b}, metadata !20, metadata !{metadata !"0x102"}), !dbg !22
   store i32 0, i32* %b, align 4, !dbg !22
   ret void, !dbg !23
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -53,28 +53,28 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19, !24}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !11, metadata !12, metadata !17, metadata !11, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/enum.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !11, metadata !12, metadata !17, metadata !11} ; [ DW_TAG_compile_unit ] [/tmp/enum.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"enum.cpp", metadata !"/tmp"}
 !2 = metadata !{metadata !3, metadata !8}
-!3 = metadata !{i32 786436, metadata !1, null, metadata !"e1", i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [e1] [line 1, size 64, align 64, offset 0] [def] [from ]
+!3 = metadata !{metadata !"0x4\00e1\001\0064\0064\000\000\000", metadata !1, null, null, metadata !4, null, null, null} ; [ DW_TAG_enumeration_type ] [e1] [line 1, size 64, align 64, offset 0] [def] [from ]
 !4 = metadata !{metadata !5, metadata !6, metadata !7}
-!5 = metadata !{i32 786472, metadata !"I", i64 0} ; [ DW_TAG_enumerator ] [I :: 0]
-!6 = metadata !{i32 786472, metadata !"J", i64 4294967295} ; [ DW_TAG_enumerator ] [J :: 4294967295]
-!7 = metadata !{i32 786472, metadata !"K", i64 -1152921504606846976} ; [ DW_TAG_enumerator ] [K :: 17293822569102704640]
-!8 = metadata !{i32 786436, metadata !1, null, metadata !"e2", i32 2, i64 32, i64 32, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [e2] [line 2, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"0x28\00I\000"} ; [ DW_TAG_enumerator ] [I :: 0]
+!6 = metadata !{metadata !"0x28\00J\004294967295"} ; [ DW_TAG_enumerator ] [J :: 4294967295]
+!7 = metadata !{metadata !"0x28\00K\00-1152921504606846976"} ; [ DW_TAG_enumerator ] [K :: 17293822569102704640]
+!8 = metadata !{metadata !"0x4\00e2\002\0032\0032\000\000\000", metadata !1, null, null, metadata !9, null, null, null} ; [ DW_TAG_enumeration_type ] [e2] [line 2, size 32, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786472, metadata !"X", i64 0} ; [ DW_TAG_enumerator ] [X :: 0]
+!10 = metadata !{metadata !"0x28\00X\000"} ; [ DW_TAG_enumerator ] [X :: 0]
 !11 = metadata !{}
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786478, metadata !1, metadata !14, metadata !"func", metadata !"func", metadata !"_Z4funcv", i32 3, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z4funcv, null, null, metadata !11, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [func]
-!14 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/enum.cpp]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x2e\00func\00func\00_Z4funcv\003\000\001\000\006\00256\000\003", metadata !1, metadata !14, metadata !15, null, void ()* @_Z4funcv, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 3] [def] [func]
+!14 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/enum.cpp]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null}
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !14, i32 1, metadata !3, i32 0, i32 1, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
+!18 = metadata !{metadata !"0x34\00a\00a\00\001\000\001", null, metadata !14, metadata !3, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
 !19 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!20 = metadata !{i32 786688, metadata !13, metadata !"b", metadata !14, i32 4, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 4]
-!21 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!20 = metadata !{metadata !"0x100\00b\004\000", metadata !13, metadata !14, metadata !21} ; [ DW_TAG_auto_variable ] [b] [line 4]
+!21 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !22 = metadata !{i32 4, i32 0, metadata !13, null}
 !23 = metadata !{i32 5, i32 0, metadata !13, null}
-!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/global-with-type-context.ll b/test/DebugInfo/global-with-type-context.ll
new file mode 100644
index 0000000..10b98a7
--- /dev/null
+++ b/test/DebugInfo/global-with-type-context.ll

@@ -0,0 +1,74 @@
+; REQUIRES: object-emission
+
+; RUN: %llc_dwarf -filetype=obj -O0 < %s > %t
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
+
+; IR generated from clang -g with the following source:
+; struct F {
+;   static const int i = 2;
+;   virtual ~F();
+; };
+;
+; void f1() {
+;   int i = F::i;
+; }
+
+; Make sure we correctly handle context of a global variable being a type identifier.
+; CHECK:  [[STRUCT:.*]]: DW_TAG_structure_type
+; CHECK: DW_AT_name [DW_FORM_strp] {{.*}}= "F")
+; CHECK: DW_TAG_variable
+; CHECK-NEXT: DW_AT_specification {{.*}} "i"
+; CHECK-NEXT: DW_AT_const_value [DW_FORM_sdata] (2)
+
+; Function Attrs: nounwind
+define void @_Z2f1v() #0 {
+entry:
+  %i = alloca i32, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !29, metadata !30), !dbg !31
+  store i32 2, i32* %i, align 4, !dbg !31
+  ret void, !dbg !32
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!26, !27}
+!llvm.ident = !{!28}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 (trunk 222175)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !20, metadata !24, metadata !2} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<stdin>", metadata !"."}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x13\00F\001\0064\0064\000\000\000", metadata !5, null, null, metadata !6, metadata !"_ZTS1F", null, metadata !"_ZTS1F"} ; [ DW_TAG_structure_type ] [F] [line 1, size 64, align 64, offset 0] [def] [from ]
+!5 = metadata !{metadata !"test.cpp", metadata !"."}
+!6 = metadata !{metadata !7, metadata !14, metadata !16}
+!7 = metadata !{metadata !"0xd\00_vptr$F\000\0064\000\000\0064", metadata !5, metadata !8, metadata !9} ; [ DW_TAG_member ] [_vptr$F] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!8 = metadata !{metadata !"0x29", metadata !5}    ; [ DW_TAG_file_type ]
+!9 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!10 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !"0xd\00i\002\000\000\000\004096", metadata !5, metadata !"_ZTS1F", metadata !15, i32 2} ; [ DW_TAG_member ] [i] [line 2, size 0, align 0, offset 0] [static] [from ]
+!15 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!16 = metadata !{metadata !"0x2e\00~F\00~F\00\003\000\000\001\000\00256\000\003", metadata !5, metadata !"_ZTS1F", metadata !17, metadata !"_ZTS1F", null, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [~F]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{null, metadata !19}
+!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088\00", null, null, metadata !"_ZTS1F"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1F]
+!20 = metadata !{metadata !21}
+!21 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1v\006\000\001\000\000\00256\000\006", metadata !5, metadata !8, metadata !22, null, void ()* @_Z2f1v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{null}
+!24 = metadata !{metadata !25}
+!25 = metadata !{metadata !"0x34\00i\00i\00\002\001\001", metadata !"_ZTS1F", metadata !8, metadata !15, i32 2, metadata !14} ; [ DW_TAG_variable ] [i] [line 2] [local] [def]
+!26 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!27 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!28 = metadata !{metadata !"clang version 3.6.0 (trunk 222175)"}
+!29 = metadata !{metadata !"0x100\00i\007\000", metadata !21, metadata !8, metadata !13} ; [ DW_TAG_auto_variable ] [i] [line 7]
+!30 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
+!31 = metadata !{i32 7, i32 0, metadata !21, null}
+!32 = metadata !{i32 8, i32 0, metadata !21, null}

diff --git a/test/DebugInfo/global.ll b/test/DebugInfo/global.ll
index 3c97f0c..80f30c2 100644
--- a/test/DebugInfo/global.ll
+++ b/test/DebugInfo/global.ll

@@ -26,17 +26,17 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!11, !13}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/global.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/global.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"global.cpp", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/global.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/global.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"_ZL1i", metadata !5, i32 1, metadata !8, i32 1, i32 1, null, null}
+!10 = metadata !{metadata !"0x34\00i\00i\00_ZL1i\001\001\001", null, metadata !5, metadata !8, null, null} ; [ DW_TAG_variable ]
 !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !12 = metadata !{i32 4, i32 0, metadata !4, null}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/gmlt.test b/test/DebugInfo/gmlt.test
new file mode 100644
index 0000000..0514dbf
--- /dev/null
+++ b/test/DebugInfo/gmlt.test

@@ -0,0 +1,5 @@
+; REQUIRES: object-emission
+; RUN: %llc_dwarf -O0 -filetype=obj < %S/Inputs/gmlt.ll | llvm-dwarfdump - | FileCheck %S/Inputs/gmlt.ll
+
+; There's a darwin specific test in X86/gmlt, so it's okay to XFAIL this here.
+; XFAIL: darwin

diff --git a/test/DebugInfo/incorrect-variable-debugloc.ll b/test/DebugInfo/incorrect-variable-debugloc.ll
index 284704c..987521c 100644
--- a/test/DebugInfo/incorrect-variable-debugloc.ll
+++ b/test/DebugInfo/incorrect-variable-debugloc.ll

@@ -38,11 +38,11 @@
 
 ; CHECK: DW_TAG_structure_type
 ; CHECK-NEXT: DW_AT_name {{.*}} "C"
-; CHECK: [[FN3_DECL:.*]]: DW_TAG_subprogram
+; CHECK:   DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_name {{.*}} "m_fn3"
 
-; CHECK: DW_AT_specification {{.*}} {[[FN3_DECL]]}
+; CHECK: DW_AT_specification {{.*}} "_ZN1C5m_fn3Ev"
 ; CHECK-NOT: DW_TAG
 ; CHECK:   DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
@@ -110,7 +110,7 @@
 
 ; <label>:30                                      ; preds = %24, %5
   store i32 0, i32* %i.i, align 4, !dbg !39, !tbaa !41
-  tail call void @llvm.dbg.value(metadata !{%struct.C* %8}, i64 0, metadata !27), !dbg !46
+  tail call void @llvm.dbg.value(metadata !{%struct.C* %8}, i64 0, metadata !27, metadata !{metadata !"0x102"}), !dbg !46
   call void @_ZN1C5m_fn3Ev(%struct.C* %8), !dbg !47
   unreachable, !dbg !47
 }
@@ -145,7 +145,7 @@
   %16 = add i64 %15, 0, !dbg !48
   %17 = inttoptr i64 %16 to i64*, !dbg !48
   store i64 -868083113472691727, i64* %17, !dbg !48
-  tail call void @llvm.dbg.value(metadata !{%struct.C* %this}, i64 0, metadata !30), !dbg !48
+  tail call void @llvm.dbg.value(metadata !{%struct.C* %this}, i64 0, metadata !30, metadata !{metadata !"0x102"}), !dbg !48
   %call = call i32 @_ZN1A5m_fn1Ev(%struct.A* %8), !dbg !49
   %i.i = getelementptr inbounds %struct.C* %this, i64 0, i32 1, i32 0, !dbg !50
   %18 = ptrtoint i32* %i.i to i64, !dbg !50
@@ -198,7 +198,7 @@
 declare i32 @_ZN1A5m_fn1Ev(%struct.A*) #2
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #3
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
 
 define internal void @asan.module_ctor() {
   tail call void @__asan_init_v3()
@@ -336,44 +336,44 @@
 !llvm.module.flags = !{!36, !37}
 !llvm.ident = !{!38}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !21, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !21, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/<stdin>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<stdin>", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !14}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"C", i32 10, i64 64, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 10, size 64, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00C\0010\0064\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 10, size 64, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !"incorrect-variable-debug-loc.cpp", metadata !"/tmp/dbginfo"}
 !6 = metadata !{metadata !7, metadata !9, metadata !10}
-!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1C", metadata !"j", i32 12, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [j] [line 12, size 32, align 32, offset 0] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1C", metadata !"b", i32 13, i64 32, i64 32, i64 32, i32 0, metadata !"_ZTS1B"} ; [ DW_TAG_member ] [b] [line 13, size 32, align 32, offset 32] [from _ZTS1B]
-!10 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"m_fn3", metadata !"m_fn3", metadata !"_ZN1C5m_fn3Ev", i32 11, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 11} ; [ DW_TAG_subprogram ] [line 11] [m_fn3]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0xd\00j\0012\0032\0032\000\000", metadata !5, metadata !"_ZTS1C", metadata !8} ; [ DW_TAG_member ] [j] [line 12, size 32, align 32, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xd\00b\0013\0032\0032\0032\000", metadata !5, metadata !"_ZTS1C", metadata !"_ZTS1B"} ; [ DW_TAG_member ] [b] [line 13, size 32, align 32, offset 32] [from _ZTS1B]
+!10 = metadata !{metadata !"0x2e\00m_fn3\00m_fn3\00_ZN1C5m_fn3Ev\0011\000\000\000\006\00256\001\0011", metadata !5, metadata !"_ZTS1C", metadata !11, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 11] [m_fn3]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
-!14 = metadata !{i32 786451, metadata !5, null, metadata !"B", i32 5, i64 32, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_structure_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
+!13 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!14 = metadata !{metadata !"0x13\00B\005\0032\0032\000\000\000", metadata !5, null, null, metadata !15, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_structure_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
 !15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1B", metadata !"i", i32 7, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [i] [line 7, size 32, align 32, offset 0] [from int]
-!17 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1B", metadata !"m_fn2", metadata !"m_fn2", metadata !"_ZN1B5m_fn2Ev", i32 6, metadata !18, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 6} ; [ DW_TAG_subprogram ] [line 6] [m_fn2]
-!18 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0xd\00i\007\0032\0032\000\000", metadata !5, metadata !"_ZTS1B", metadata !8} ; [ DW_TAG_member ] [i] [line 7, size 32, align 32, offset 0] [from int]
+!17 = metadata !{metadata !"0x2e\00m_fn2\00m_fn2\00_ZN1B5m_fn2Ev\006\000\000\000\006\00256\001\006", metadata !5, metadata !"_ZTS1B", metadata !18, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [m_fn2]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{null, metadata !20}
-!20 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!20 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
 !21 = metadata !{metadata !22, metadata !28, metadata !32}
-!22 = metadata !{i32 786478, metadata !5, metadata !23, metadata !"fn1", metadata !"fn1", metadata !"_Z3fn1v", i32 16, metadata !24, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z3fn1v, null, null, metadata !26, i32 16} ; [ DW_TAG_subprogram ] [line 16] [def] [fn1]
-!23 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/incorrect-variable-debug-loc.cpp]
-!24 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{metadata !"0x2e\00fn1\00fn1\00_Z3fn1v\0016\000\001\000\006\00256\001\0016", metadata !5, metadata !23, metadata !24, null, i32 ()* @_Z3fn1v, null, null, metadata !26} ; [ DW_TAG_subprogram ] [line 16] [def] [fn1]
+!23 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/incorrect-variable-debug-loc.cpp]
+!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{metadata !8}
 !26 = metadata !{metadata !27}
-!27 = metadata !{i32 786688, metadata !22, metadata !"A", metadata !23, i32 17, metadata !"_ZTS1C", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [A] [line 17]
-!28 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1C", metadata !"m_fn3", metadata !"m_fn3", metadata !"_ZN1C5m_fn3Ev", i32 21, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%struct.C*)* @_ZN1C5m_fn3Ev, null, metadata !10, metadata !29, i32 21} ; [ DW_TAG_subprogram ] [line 21] [def] [m_fn3]
+!27 = metadata !{metadata !"0x100\00A\0017\000", metadata !22, metadata !23, metadata !"_ZTS1C"} ; [ DW_TAG_auto_variable ] [A] [line 17]
+!28 = metadata !{metadata !"0x2e\00m_fn3\00m_fn3\00_ZN1C5m_fn3Ev\0021\000\001\000\006\00256\001\0021", metadata !5, metadata !"_ZTS1C", metadata !11, null, void (%struct.C*)* @_ZN1C5m_fn3Ev, null, metadata !10, metadata !29} ; [ DW_TAG_subprogram ] [line 21] [def] [m_fn3]
 !29 = metadata !{metadata !30}
-!30 = metadata !{i32 786689, metadata !28, metadata !"this", null, i32 16777216, metadata !31, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!31 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
-!32 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1B", metadata !"m_fn2", metadata !"m_fn2", metadata !"_ZN1B5m_fn2Ev", i32 6, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !17, metadata !33, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [m_fn2]
+!30 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !28, null, metadata !31} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!32 = metadata !{metadata !"0x2e\00m_fn2\00m_fn2\00_ZN1B5m_fn2Ev\006\000\001\000\006\00256\001\006", metadata !5, metadata !"_ZTS1B", metadata !18, null, null, null, metadata !17, metadata !33} ; [ DW_TAG_subprogram ] [line 6] [def] [m_fn2]
 !33 = metadata !{metadata !34}
-!34 = metadata !{i32 786689, metadata !32, metadata !"this", null, i32 16777216, metadata !35, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!35 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
+!34 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !32, null, metadata !35} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!35 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
 !36 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!37 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!37 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !38 = metadata !{metadata !"clang version 3.5.0 "}
 !39 = metadata !{i32 6, i32 0, metadata !32, metadata !40}
 !40 = metadata !{i32 18, i32 0, metadata !22, null}

diff --git a/test/DebugInfo/inheritance.ll b/test/DebugInfo/inheritance.ll
index 6b3ae09..514f828 100644
--- a/test/DebugInfo/inheritance.ll
+++ b/test/DebugInfo/inheritance.ll

@@ -16,7 +16,7 @@
   %0 = alloca i32                                 ; <i32*> [#uses=2]
   %tst = alloca %struct.test1                     ; <%struct.test1*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.test1* %tst}, metadata !0), !dbg !21
+  call void @llvm.dbg.declare(metadata !{%struct.test1* %tst}, metadata !0, metadata !{metadata !"0x102"}), !dbg !21
   call void @_ZN5test1C1Ev(%struct.test1* %tst) nounwind, !dbg !22
   store i32 0, i32* %0, align 4, !dbg !23
   %1 = load i32* %0, align 4, !dbg !23            ; <i32> [#uses=1]
@@ -32,7 +32,7 @@
 entry:
   %this_addr = alloca %struct.test1*              ; <%struct.test1**> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.test1** %this_addr}, metadata !24), !dbg !28
+  call void @llvm.dbg.declare(metadata !{%struct.test1** %this_addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !28
   store %struct.test1* %this, %struct.test1** %this_addr
   %0 = load %struct.test1** %this_addr, align 8, !dbg !28 ; <%struct.test1*> [#uses=1]
   %1 = getelementptr inbounds %struct.test1* %0, i32 0, i32 0, !dbg !28 ; <i32 (...)***> [#uses=1]
@@ -43,13 +43,13 @@
   ret void, !dbg !29
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define linkonce_odr void @_ZN5test1D1Ev(%struct.test1* %this) nounwind ssp align 2 {
 entry:
   %this_addr = alloca %struct.test1*              ; <%struct.test1**> [#uses=3]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.test1** %this_addr}, metadata !32), !dbg !34
+  call void @llvm.dbg.declare(metadata !{%struct.test1** %this_addr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !34
   store %struct.test1* %this, %struct.test1** %this_addr
   %0 = load %struct.test1** %this_addr, align 8, !dbg !35 ; <%struct.test1*> [#uses=1]
   %1 = getelementptr inbounds %struct.test1* %0, i32 0, i32 0, !dbg !35 ; <i32 (...)***> [#uses=1]
@@ -78,7 +78,7 @@
 entry:
   %this_addr = alloca %struct.test1*              ; <%struct.test1**> [#uses=3]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.test1** %this_addr}, metadata !38), !dbg !40
+  call void @llvm.dbg.declare(metadata !{%struct.test1** %this_addr}, metadata !38, metadata !{metadata !"0x102"}), !dbg !40
   store %struct.test1* %this, %struct.test1** %this_addr
   %0 = load %struct.test1** %this_addr, align 8, !dbg !41 ; <%struct.test1*> [#uses=1]
   %1 = getelementptr inbounds %struct.test1* %0, i32 0, i32 0, !dbg !41 ; <i32 (...)***> [#uses=1]
@@ -105,49 +105,49 @@
 
 declare void @_ZdlPv(i8*) nounwind
 
-!0 = metadata !{i32 459008, metadata !1, metadata !"tst", metadata !4, i32 13, metadata !8} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{i32 458763, metadata !44, metadata !2, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!2 = metadata !{i32 458763, metadata !44, metadata !3, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{i32 458798, i32 0, metadata !4, metadata !"main", metadata !"main", metadata !"main", i32 11, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!4 = metadata !{i32 458769, metadata !44, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !45, metadata !45, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 458773, metadata !4, null, metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x100\00tst\0013\000", metadata !1, metadata !4, metadata !8} ; [ DW_TAG_auto_variable ]
+!1 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !2} ; [ DW_TAG_lexical_block ]
+!2 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !3} ; [ DW_TAG_lexical_block ]
+!3 = metadata !{metadata !"0x2e\00main\00main\00main\0011\000\001\000\006\000\000\000", i32 0, metadata !4, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!4 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !44, metadata !45, metadata !45, null, null, null} ; [ DW_TAG_compile_unit ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !4, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 458788, null, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 458771, metadata !44, metadata !4, metadata !"test1", i32 1, i64 64, i64 64, i64 0, i32 0, null, metadata !9, i32 0, metadata !8, null, null} ; [ DW_TAG_structure_type ] [test1] [line 1, size 64, align 64, offset 0] [def] [from ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !4} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x13\00test1\001\0064\0064\000\000\000", metadata !44, metadata !4, null, metadata !9, metadata !8, null, null} ; [ DW_TAG_structure_type ] [test1] [line 1, size 64, align 64, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !14, metadata !18}
-!10 = metadata !{i32 458765, metadata !44, metadata !8, metadata !"_vptr$test1", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_member ]
-!11 = metadata !{i32 458767, metadata !4, null, metadata !4, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 458767, null, metadata !4, metadata !"__vtbl_ptr_type", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 458769, metadata !46, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !45, metadata !45, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!14 = metadata !{i32 458798, i32 0, metadata !8, metadata !"test1", metadata !"test1", metadata !"", i32 1, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i1 true, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!15 = metadata !{i32 458773, metadata !4, null, metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0xd\00_vptr$test1\001\0064\0064\000\000", metadata !44, metadata !8, metadata !11} ; [ DW_TAG_member ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !4, null, metadata !12} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\000\000\000\000", null, metadata !4, metadata !5} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0x11\004\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", metadata !46, metadata !45, metadata !45, null, null, null} ; [ DW_TAG_compile_unit ]
+!14 = metadata !{metadata !"0x2e\00test1\00test1\00\001\000\000\000\006\001\000\000", i32 0, metadata !8, metadata !15, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !4, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17}
-!17 = metadata !{i32 458767, metadata !4, null, metadata !4, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !8} ; [ DW_TAG_pointer_type ]
-!18 = metadata !{i32 458798, i32 0, metadata !8, metadata !"~test1", metadata !"~test1", metadata !"", i32 4, metadata !19, i1 false, i1 false, i32 1, i32 0, metadata !8, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!19 = metadata !{i32 458773, metadata !4, null, metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\0064", metadata !4, null, metadata !8} ; [ DW_TAG_pointer_type ]
+!18 = metadata !{metadata !"0x2e\00~test1\00~test1\00\004\000\000\001\006\000\000\000", i32 0, metadata !8, metadata !19, metadata !8, null, null, null, null} ; [ DW_TAG_subprogram ]
+!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !4, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !20 = metadata !{null, metadata !17, metadata !7}
 !21 = metadata !{i32 11, i32 0, metadata !1, null}
 !22 = metadata !{i32 13, i32 0, metadata !1, null}
 !23 = metadata !{i32 14, i32 0, metadata !1, null}
-!24 = metadata !{i32 459009, metadata !25, metadata !"this", metadata !4, i32 13, metadata !26} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{i32 458798, i32 0, metadata !4, metadata !"test1", metadata !"test1", metadata !"_ZN5test1C1Ev", i32 1, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!26 = metadata !{i32 458790, metadata !4, null, metadata !4, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !27} ; [ DW_TAG_const_type ]
-!27 = metadata !{i32 458767, metadata !4, null, metadata !4, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
+!24 = metadata !{metadata !"0x101\00this\0013\000", metadata !25, metadata !4, metadata !26} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{metadata !"0x2e\00test1\00test1\00_ZN5test1C1Ev\001\000\001\000\006\000\000\000", i32 0, metadata !4, metadata !15, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!26 = metadata !{metadata !"0x26\00\000\0064\0064\000\0064", metadata !4, null, metadata !27} ; [ DW_TAG_const_type ]
+!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !4, null, metadata !8} ; [ DW_TAG_pointer_type ]
 !28 = metadata !{i32 1, i32 0, metadata !25, null}
 !29 = metadata !{i32 1, i32 0, metadata !30, null}
-!30 = metadata !{i32 458763, metadata !44, metadata !31, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!31 = metadata !{i32 458763, metadata !44, metadata !25, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!32 = metadata !{i32 459009, metadata !33, metadata !"this", metadata !4, i32 4, metadata !26} ; [ DW_TAG_arg_variable ]
-!33 = metadata !{i32 458798, i32 0, metadata !8, metadata !"~test1", metadata !"~test1", metadata !"_ZN5test1D1Ev", i32 4, metadata !15, i1 false, i1 true, i32 1, i32 0, metadata !8, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!30 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !31} ; [ DW_TAG_lexical_block ]
+!31 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !25} ; [ DW_TAG_lexical_block ]
+!32 = metadata !{metadata !"0x101\00this\004\000", metadata !33, metadata !4, metadata !26} ; [ DW_TAG_arg_variable ]
+!33 = metadata !{metadata !"0x2e\00~test1\00~test1\00_ZN5test1D1Ev\004\000\001\001\006\000\000\000", i32 0, metadata !8, metadata !15, metadata !8, null, null, null, null} ; [ DW_TAG_subprogram ]
 !34 = metadata !{i32 4, i32 0, metadata !33, null}
 !35 = metadata !{i32 5, i32 0, metadata !36, null}
-!36 = metadata !{i32 458763, metadata !44, metadata !33, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!36 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !33} ; [ DW_TAG_lexical_block ]
 !37 = metadata !{i32 6, i32 0, metadata !36, null}
-!38 = metadata !{i32 459009, metadata !39, metadata !"this", metadata !4, i32 4, metadata !26} ; [ DW_TAG_arg_variable ]
-!39 = metadata !{i32 458798, i32 0, metadata !8, metadata !"~test1", metadata !"~test1", metadata !"_ZN5test1D0Ev", i32 4, metadata !15, i1 false, i1 true, i32 1, i32 1, metadata !8, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!38 = metadata !{metadata !"0x101\00this\004\000", metadata !39, metadata !4, metadata !26} ; [ DW_TAG_arg_variable ]
+!39 = metadata !{metadata !"0x2e\00~test1\00~test1\00_ZN5test1D0Ev\004\000\001\001\006\000\000\000", i32 0, metadata !8, metadata !15, metadata !8, null, null, null, null} ; [ DW_TAG_subprogram ]
 !40 = metadata !{i32 4, i32 0, metadata !39, null}
 !41 = metadata !{i32 5, i32 0, metadata !42, null}
-!42 = metadata !{i32 458763, metadata !44, metadata !39, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!42 = metadata !{metadata !"0xb\000\000\000", metadata !44, metadata !39} ; [ DW_TAG_lexical_block ]
 !43 = metadata !{i32 6, i32 0, metadata !42, null}
 !44 = metadata !{metadata !"inheritance.cpp", metadata !"/tmp/"}
 !45 = metadata !{i32 0}

diff --git a/test/DebugInfo/inline-debug-info-multiret.ll b/test/DebugInfo/inline-debug-info-multiret.ll
index 594512f..05b429a 100644
--- a/test/DebugInfo/inline-debug-info-multiret.ll
+++ b/test/DebugInfo/inline-debug-info-multiret.ll

@@ -27,8 +27,8 @@
   %k.addr = alloca i32, align 4
   %k2 = alloca i32, align 4
   store i32 %k, i32* %k.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %k.addr}, metadata !13), !dbg !14
-  call void @llvm.dbg.declare(metadata !{i32* %k2}, metadata !15), !dbg !16
+  call void @llvm.dbg.declare(metadata !{i32* %k.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i32* %k2}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
   %0 = load i32* %k.addr, align 4, !dbg !16
   %call = call i32 @_Z8test_exti(i32 %0), !dbg !16
   store i32 %call, i32* %k2, align 4, !dbg !16
@@ -53,7 +53,7 @@
 
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare i32 @_Z8test_exti(i32)
 
@@ -85,7 +85,7 @@
   br i1 %matches, label %catch, label %eh.resume, !dbg !23
 
 catch:                                            ; preds = %catch.dispatch
-  call void @llvm.dbg.declare(metadata !{i32* %e}, metadata !24), !dbg !25
+  call void @llvm.dbg.declare(metadata !{i32* %e}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
   %exn = load i8** %exn.slot, !dbg !23
   %5 = call i8* @__cxa_begin_catch(i8* %exn) #2, !dbg !23
   %6 = bitcast i8* %5 to i32*, !dbg !23
@@ -122,35 +122,35 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!31}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !""}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"test", metadata !"test", metadata !"_Z4testi", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4testi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
+!4 = metadata !{metadata !"0x2e\00test\00test\00_Z4testi\004\000\001\000\006\00256\000\004", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @_Z4testi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
 !5 = metadata !{metadata !"test.cpp", metadata !""}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [test.cpp]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [test.cpp]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"test2", metadata !"test2", metadata !"_Z5test2v", i32 11, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z5test2v, null, null, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x2e\00test2\00test2\00_Z5test2v\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !6, metadata !11, null, i32 ()* @_Z5test2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !9}
-!13 = metadata !{i32 786689, metadata !4, metadata !"k", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [k] [line 4]
+!13 = metadata !{metadata !"0x101\00k\0016777220\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [k] [line 4]
 !14 = metadata !{i32 4, i32 0, metadata !4, null}
-!15 = metadata !{i32 786688, metadata !4, metadata !"k2", metadata !6, i32 5, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k2] [line 5]
+!15 = metadata !{metadata !"0x100\00k2\005\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [k2] [line 5]
 !16 = metadata !{i32 5, i32 0, metadata !4, null}
 !17 = metadata !{i32 6, i32 0, metadata !4, null}
 !18 = metadata !{i32 7, i32 0, metadata !4, null}
 !19 = metadata !{i32 8, i32 0, metadata !4, null}
 !20 = metadata !{i32 9, i32 0, metadata !4, null}
 !21 = metadata !{i32 14, i32 0, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !5, metadata !10, i32 13, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [test.cpp]
+!22 = metadata !{metadata !"0xb\0013\000\000", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [test.cpp]
 !23 = metadata !{i32 15, i32 0, metadata !22, null}
-!24 = metadata !{i32 786688, metadata !10, metadata !"e", metadata !6, i32 16, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [e] [line 16]
+!24 = metadata !{metadata !"0x100\00e\0016\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [e] [line 16]
 !25 = metadata !{i32 16, i32 0, metadata !10, null}
 !26 = metadata !{i32 17, i32 0, metadata !27, null}
-!27 = metadata !{i32 786443, metadata !5, metadata !10, i32 16, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [test.cpp]
+!27 = metadata !{metadata !"0xb\0016\000\001", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [test.cpp]
 !28 = metadata !{i32 18, i32 0, metadata !27, null}
 !29 = metadata !{i32 19, i32 0, metadata !10, null}
 !30 = metadata !{i32 20, i32 0, metadata !10, null}
-!31 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!31 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/inline-debug-info.ll b/test/DebugInfo/inline-debug-info.ll
index b56ca95..3f971e4 100644
--- a/test/DebugInfo/inline-debug-info.ll
+++ b/test/DebugInfo/inline-debug-info.ll

@@ -47,8 +47,8 @@
   %k.addr = alloca i32, align 4
   %k2 = alloca i32, align 4
   store i32 %k, i32* %k.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %k.addr}, metadata !13), !dbg !14
-  call void @llvm.dbg.declare(metadata !{i32* %k2}, metadata !15), !dbg !16
+  call void @llvm.dbg.declare(metadata !{i32* %k.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i32* %k2}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
   %0 = load i32* %k.addr, align 4, !dbg !16
   %call = call i32 @_Z8test_exti(i32 %0), !dbg !16
   store i32 %call, i32* %k2, align 4, !dbg !16
@@ -71,7 +71,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare i32 @_Z8test_exti(i32)
 
@@ -103,7 +103,7 @@
   br i1 %matches, label %catch, label %eh.resume, !dbg !23
 
 catch:                                            ; preds = %catch.dispatch
-  call void @llvm.dbg.declare(metadata !{i32* %e}, metadata !24), !dbg !25
+  call void @llvm.dbg.declare(metadata !{i32* %e}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
   %exn = load i8** %exn.slot, !dbg !23
   %5 = call i8* @__cxa_begin_catch(i8* %exn) #2, !dbg !23
   %6 = bitcast i8* %5 to i32*, !dbg !23
@@ -140,35 +140,35 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!31}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !""}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4, metadata !10}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"test", metadata !"test", metadata !"_Z4testi", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4testi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
+!4 = metadata !{metadata !"0x2e\00test\00test\00_Z4testi\004\000\001\000\006\00256\000\004", metadata !5, metadata !6, metadata !7, null, i32 (i32)* @_Z4testi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
 !5 = metadata !{metadata !"test.cpp", metadata !""}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [test.cpp]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [test.cpp]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"test2", metadata !"test2", metadata !"_Z5test2v", i32 11, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z5test2v, null, null, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x2e\00test2\00test2\00_Z5test2v\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !6, metadata !11, null, i32 ()* @_Z5test2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !9}
-!13 = metadata !{i32 786689, metadata !4, metadata !"k", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [k] [line 4]
+!13 = metadata !{metadata !"0x101\00k\0016777220\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [k] [line 4]
 !14 = metadata !{i32 4, i32 0, metadata !4, null}
-!15 = metadata !{i32 786688, metadata !4, metadata !"k2", metadata !6, i32 5, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k2] [line 5]
+!15 = metadata !{metadata !"0x100\00k2\005\000", metadata !4, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [k2] [line 5]
 !16 = metadata !{i32 5, i32 0, metadata !4, null}
 !17 = metadata !{i32 6, i32 0, metadata !4, null}
 !18 = metadata !{i32 7, i32 0, metadata !4, null}
 !19 = metadata !{i32 8, i32 0, metadata !4, null}
 !20 = metadata !{i32 9, i32 0, metadata !4, null}
 !21 = metadata !{i32 14, i32 0, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !5, metadata !10, i32 13, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [test.cpp]
+!22 = metadata !{metadata !"0xb\0013\000\000", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [test.cpp]
 !23 = metadata !{i32 15, i32 0, metadata !22, null}
-!24 = metadata !{i32 786688, metadata !10, metadata !"e", metadata !6, i32 16, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [e] [line 16]
+!24 = metadata !{metadata !"0x100\00e\0016\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [e] [line 16]
 !25 = metadata !{i32 16, i32 0, metadata !10, null}
 !26 = metadata !{i32 17, i32 0, metadata !27, null}
-!27 = metadata !{i32 786443, metadata !5, metadata !10, i32 16, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [test.cpp]
+!27 = metadata !{metadata !"0xb\0016\000\001", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [test.cpp]
 !28 = metadata !{i32 18, i32 0, metadata !27, null}
 !29 = metadata !{i32 19, i32 0, metadata !10, null}
 !30 = metadata !{i32 20, i32 0, metadata !10, null}
-!31 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!31 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/inline-no-debug-info.ll b/test/DebugInfo/inline-no-debug-info.ll
index 2257b89..2de6a49 100644
--- a/test/DebugInfo/inline-no-debug-info.ll
+++ b/test/DebugInfo/inline-no-debug-info.ll

@@ -54,16 +54,16 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 (210174)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (210174)\001\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test.c", metadata !"/code/llvm/build0"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"caller", metadata !"caller", metadata !"", i32 4, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, void ()* @caller, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [caller]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/code/llvm/build0/test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"callee2", metadata !"callee2", metadata !"", i32 2, metadata !6, i1 true, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [callee2]
+!4 = metadata !{metadata !"0x2e\00caller\00caller\00\004\000\001\000\006\000\001\004", metadata !1, metadata !5, metadata !6, null, void ()* @caller, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [caller]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/code/llvm/build0/test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00callee2\00callee2\00\002\001\001\000\006\000\001\002", metadata !1, metadata !5, metadata !6, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [callee2]
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5.0 (210174)"}
 !11 = metadata !{i32 2, i32 0, metadata !7, null}
 !12 = metadata !{i32 4, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/inline-scopes.ll b/test/DebugInfo/inline-scopes.ll
index 36c0735..cdcfaf5 100644
--- a/test/DebugInfo/inline-scopes.ll
+++ b/test/DebugInfo/inline-scopes.ll

@@ -43,7 +43,7 @@
   %b.i3 = alloca i8, align 1
   %retval.i = alloca i32, align 4
   %b.i = alloca i8, align 1
-  call void @llvm.dbg.declare(metadata !{i8* %b.i}, metadata !16), !dbg !19
+  call void @llvm.dbg.declare(metadata !{i8* %b.i}, metadata !16, metadata !{metadata !"0x102"}), !dbg !19
   %call.i = call zeroext i1 @_Z1fv(), !dbg !19
   %frombool.i = zext i1 %call.i to i8, !dbg !19
   store i8 %frombool.i, i8* %b.i, align 1, !dbg !19
@@ -61,7 +61,7 @@
 
 _Z2f1v.exit:                                      ; preds = %if.then.i, %if.end.i
   %1 = load i32* %retval.i, !dbg !23
-  call void @llvm.dbg.declare(metadata !{i8* %b.i3}, metadata !24), !dbg !27
+  call void @llvm.dbg.declare(metadata !{i8* %b.i3}, metadata !24, metadata !{metadata !"0x102"}), !dbg !27
   %call.i4 = call zeroext i1 @_Z1fv(), !dbg !27
   %frombool.i5 = zext i1 %call.i4 to i8, !dbg !27
   store i8 %frombool.i5, i8* %b.i3, align 1, !dbg !27
@@ -83,7 +83,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare zeroext i1 @_Z1fv() #2
 
@@ -95,33 +95,33 @@
 !llvm.module.flags = !{!13, !14}
 !llvm.ident = !{!15}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline-scopes.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/inline-scopes.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"inline-scopes.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !10, metadata !12}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !5, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
 !5 = metadata !{metadata !"y.cc", metadata !"/tmp/dbginfo"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/y.cc]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/y.cc]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"f2", metadata !"f2", metadata !"_Z2f2v", i32 8, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, metadata !2, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [f2]
-!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline-scopes.cpp]
-!12 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"f1", metadata !"f1", metadata !"_Z2f1v", i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [f1]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x2e\00f2\00f2\00_Z2f2v\008\000\001\000\006\00256\000\008", metadata !1, metadata !11, metadata !7, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [f2]
+!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/inline-scopes.cpp]
+!12 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1v\002\000\001\000\006\00256\000\002", metadata !1, metadata !11, metadata !7, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f1]
 !13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !15 = metadata !{metadata !"clang version 3.5.0 "}
-!16 = metadata !{i32 786688, metadata !17, metadata !"b", metadata !11, i32 3, metadata !18, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 3]
-!17 = metadata !{i32 786443, metadata !1, metadata !12, i32 3, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/inline-scopes.cpp]
-!18 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!16 = metadata !{metadata !"0x100\00b\003\000", metadata !17, metadata !11, metadata !18} ; [ DW_TAG_auto_variable ] [b] [line 3]
+!17 = metadata !{metadata !"0xb\003\000\001", metadata !1, metadata !12} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/inline-scopes.cpp]
+!18 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
 !19 = metadata !{i32 3, i32 0, metadata !17, metadata !20}
-!20 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!20 = metadata !{i32 8, i32 0, metadata !4, null}
 !21 = metadata !{i32 4, i32 0, metadata !17, metadata !20}
 !22 = metadata !{i32 5, i32 0, metadata !12, metadata !20}
 !23 = metadata !{i32 6, i32 0, metadata !12, metadata !20}
-!24 = metadata !{i32 786688, metadata !25, metadata !"b", metadata !6, i32 2, metadata !18, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 2]
-!25 = metadata !{i32 786443, metadata !5, metadata !26, i32 2, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/y.cc]
-!26 = metadata !{i32 786443, metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/y.cc]
+!24 = metadata !{metadata !"0x100\00b\002\000", metadata !25, metadata !6, metadata !18} ; [ DW_TAG_auto_variable ] [b] [line 2]
+!25 = metadata !{metadata !"0xb\002\000\000", metadata !5, metadata !26} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/y.cc]
+!26 = metadata !{metadata !"0xb\000", metadata !5, metadata !10} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/y.cc]
 !27 = metadata !{i32 2, i32 0, metadata !25, metadata !28}
 !28 = metadata !{i32 9, i32 0, metadata !4, null}
 !29 = metadata !{i32 3, i32 0, metadata !25, metadata !28}

diff --git a/test/DebugInfo/inlined-arguments.ll b/test/DebugInfo/inlined-arguments.ll
index 6979862..71d4414 100644
--- a/test/DebugInfo/inlined-arguments.ll
+++ b/test/DebugInfo/inlined-arguments.ll

@@ -24,16 +24,16 @@
 
 ; Function Attrs: uwtable
 define void @_Z2f2v() #0 {
-  tail call void @llvm.dbg.value(metadata !15, i64 0, metadata !16), !dbg !18
-  tail call void @llvm.dbg.value(metadata !19, i64 0, metadata !20), !dbg !18
+  tail call void @llvm.dbg.value(metadata !15, i64 0, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
+  tail call void @llvm.dbg.value(metadata !19, i64 0, metadata !20, metadata !{metadata !"0x102"}), !dbg !18
   tail call void @_Z2f3i(i32 2), !dbg !21
   ret void, !dbg !22
 }
 
 ; Function Attrs: uwtable
 define void @_Z2f1ii(i32 %x, i32 %y) #0 {
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !13), !dbg !23
-  tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !14), !dbg !23
+  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !23
+  tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !14, metadata !{metadata !"0x102"}), !dbg !23
   tail call void @_Z2f3i(i32 %y), !dbg !24
   ret void, !dbg !25
 }
@@ -41,7 +41,7 @@
 declare void @_Z2f3i(i32) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -50,30 +50,30 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!26}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"exp.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f2", metadata !"f2", metadata !"_Z2f2v", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z2f2v, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f2]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00f2\00f2\00_Z2f2v\003\000\001\000\006\00256\001\003", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f2]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
-!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f1", metadata !"f1", metadata !"_Z2f1ii", i32 6, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32, i32)* @_Z2f1ii, null, null, metadata !12, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
-!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1ii\006\000\001\000\006\00256\001\006", metadata !1, metadata !5, metadata !9, null, void (i32, i32)* @_Z2f1ii, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{null, metadata !11, metadata !11}
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13, metadata !14}
-!13 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !5, i32 16777222, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 6]
-!14 = metadata !{i32 786689, metadata !8, metadata !"y", metadata !5, i32 33554438, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [y] [line 6]
+!13 = metadata !{metadata !"0x101\00x\0016777222\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [x] [line 6]
+!14 = metadata !{metadata !"0x101\00y\0033554438\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [y] [line 6]
 !15 = metadata !{i32 undef}
-!16 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !5, i32 16777222, metadata !11, i32 0, metadata !17} ; [ DW_TAG_arg_variable ] [x] [line 6]
+!16 = metadata !{metadata !"0x101\00x\0016777222\000", metadata !8, metadata !5, metadata !11, metadata !17} ; [ DW_TAG_arg_variable ] [x] [line 6]
 !17 = metadata !{i32 4, i32 0, metadata !4, null}
 !18 = metadata !{i32 6, i32 0, metadata !8, metadata !17}
 !19 = metadata !{i32 2}
-!20 = metadata !{i32 786689, metadata !8, metadata !"y", metadata !5, i32 33554438, metadata !11, i32 0, metadata !17} ; [ DW_TAG_arg_variable ] [y] [line 6]
+!20 = metadata !{metadata !"0x101\00y\0033554438\000", metadata !8, metadata !5, metadata !11, metadata !17} ; [ DW_TAG_arg_variable ] [y] [line 6]
 !21 = metadata !{i32 7, i32 0, metadata !8, metadata !17}
 !22 = metadata !{i32 5, i32 0, metadata !4, null}
 !23 = metadata !{i32 6, i32 0, metadata !8, null}
 !24 = metadata !{i32 7, i32 0, metadata !8, null}
-!25 = metadata !{i32 8, i32 0, metadata !8, null} ; [ DW_TAG_imported_declaration ]
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!25 = metadata !{i32 8, i32 0, metadata !8, null}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/inlined-vars.ll b/test/DebugInfo/inlined-vars.ll
index 9cfde1f..1c540ec 100644
--- a/test/DebugInfo/inlined-vars.ll
+++ b/test/DebugInfo/inlined-vars.ll

@@ -4,8 +4,8 @@
 
 define i32 @main() uwtable {
 entry:
-  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !18), !dbg !21
-  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !22), !dbg !23
+  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !18, metadata !{metadata !"0x102"}), !dbg !21
+  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
   tail call void @smth(i32 0), !dbg !24
   tail call void @smth(i32 0), !dbg !25
   ret i32 0, !dbg !19
@@ -13,44 +13,44 @@
 
 declare void @smth(i32)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!27}
 
-!0 = metadata !{i32 786449, metadata !26, i32 4, metadata !"clang version 3.2 (trunk 159419)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2,  metadata !2, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.2 (trunk 159419)\001\00\000\00\000", metadata !26, metadata !2, metadata !2, metadata !3, metadata !2,  metadata !2} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !2 = metadata !{}
 !3 = metadata !{metadata !5, metadata !10}
-!5 = metadata !{i32 786478, metadata !26, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 10, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !2, i32 10} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00main\00main\00\0010\000\001\000\006\00256\001\0010", metadata !26, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786478, metadata !26, metadata !6, metadata !"f", metadata !"f", metadata !"_ZL1fi", i32 3, metadata !11, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !13, i32 3} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x2e\00f\00f\00_ZL1fi\003\001\001\000\006\00256\001\003", metadata !26, metadata !6, metadata !11, null, null, null, null, metadata !13} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !9, metadata !9}
 !13 = metadata !{metadata !15, metadata !16}
-!15 = metadata !{i32 786689, metadata !10, metadata !"argument", metadata !6, i32 16777219, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{metadata !"0x101\00argument\0016777219\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
 
 ; Two DW_TAG_formal_parameter: one abstract and one inlined.
 ; ARGUMENT: {{.*Abbrev.*DW_TAG_formal_parameter}}
 ; ARGUMENT: {{.*Abbrev.*DW_TAG_formal_parameter}}
 ; ARGUMENT-NOT: {{.*Abbrev.*DW_TAG_formal_parameter}}
 
-!16 = metadata !{i32 786688, metadata !10, metadata !"local", metadata !6, i32 4, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
+!16 = metadata !{metadata !"0x100\00local\004\000", metadata !10, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
 
 ; Two DW_TAG_variable: one abstract and one inlined.
 ; VARIABLE: {{.*Abbrev.*DW_TAG_variable}}
 ; VARIABLE: {{.*Abbrev.*DW_TAG_variable}}
 ; VARIABLE-NOT: {{.*Abbrev.*DW_TAG_variable}}
 
-!18 = metadata !{i32 786689, metadata !10, metadata !"argument", metadata !6, i32 16777219, metadata !9, i32 0, metadata !19} ; [ DW_TAG_arg_variable ]
+!18 = metadata !{metadata !"0x101\00argument\0016777219\000", metadata !10, metadata !6, metadata !9, metadata !19} ; [ DW_TAG_arg_variable ]
 !19 = metadata !{i32 11, i32 10, metadata !5, null}
 !21 = metadata !{i32 3, i32 25, metadata !10, metadata !19}
-!22 = metadata !{i32 786688, metadata !10, metadata !"local", metadata !6, i32 4, metadata !9, i32 0, metadata !19} ; [ DW_TAG_auto_variable ]
+!22 = metadata !{metadata !"0x100\00local\004\000", metadata !10, metadata !6, metadata !9, metadata !19} ; [ DW_TAG_auto_variable ]
 !23 = metadata !{i32 4, i32 16, metadata !10, metadata !19}
 !24 = metadata !{i32 5, i32 3, metadata !10, metadata !19}
 !25 = metadata !{i32 6, i32 3, metadata !10, metadata !19}
 !26 = metadata !{metadata !"inline-bug.cc", metadata !"/tmp/dbginfo/pr13202"}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/llvm-symbolizer.test b/test/DebugInfo/llvm-symbolizer.test
index 20d3dda..8a2aaaa 100644
--- a/test/DebugInfo/llvm-symbolizer.test
+++ b/test/DebugInfo/llvm-symbolizer.test

@@ -6,9 +6,9 @@
 RUN: echo "%p/Inputs/dwarfdump-test2.elf-x86-64 0x4004e8" >> %t.input
 RUN: echo "%p/Inputs/dwarfdump-test2.elf-x86-64 0x4004f4" >> %t.input
 RUN: echo "%p/Inputs/dwarfdump-test4.elf-x86-64 0x62c" >> %t.input
-RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x710" >> %t.input
-RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x7d1" >> %t.input
-RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x785" >> %t.input
+RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x8dc" >> %t.input
+RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0xa05" >> %t.input
+RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x987" >> %t.input
 RUN: echo "%p/Inputs/dwarfdump-inl-test.high_pc.elf-x86-64 0x568" >> %t.input
 RUN: echo "\"%p/Inputs/dwarfdump-test3.elf-x86-64 space\" 0x640" >> %t.input
 RUN: echo "\"%p/Inputs/dwarfdump-test3.elf-x86-64 space\" 0x633" >> %t.input
@@ -19,6 +19,10 @@
 RUN: echo "%p/Inputs/llvm-symbolizer-dwo-test 0x400514" >> %t.input
 RUN: echo "%p/Inputs/fission-ranges.elf-x86_64 0x720" >> %t.input
 RUN: echo "%p/Inputs/arange-overlap.elf-x86_64 0x714" >> %t.input
+RUN: cp %p/Inputs/split-dwarf-test.dwo %T
+RUN: echo "%p/Inputs/split-dwarf-test 0x4004d0" >> %t.input
+RUN: echo "%p/Inputs/split-dwarf-test 0x4004c0" >> %t.input
+RUN: echo "%p/Inputs/cross-cu-inlining.x86_64-macho.o 0x17" >> %t.input
 
 RUN: llvm-symbolizer --functions=linkage --inlining --demangle=false \
 RUN:    --default-arch=i386 < %t.input | FileCheck %s
@@ -98,6 +102,21 @@
 CHECK: _ZN1S3bazEv
 CHECK-NEXT: {{.*}}arange-overlap.cc:6
 
+CHECK: _Z3fooi
+CHECK-NEXT: {{.*}}split-dwarf-test.cc
+CHECK-NEXT: main
+CHECK-NEXT: {{.*}}split-dwarf-test.cc
+
+CHECK: _Z3fooi
+CHECK-NEXT: {{.*}}split-dwarf-test.cc
+
+; func has been inlined into main by LTO. Check that the symbolizer is able
+; to resolve the cross-cu reference and retrieve func's name
+CHECK: func
+CHECK-NEXT: /tmp{{[/\\]}}cross-cu-inlining.c:16:3
+CHECK-NEXT: main
+CHECK-NEXT: /tmp{{[/\\]}}cross-cu-inlining.c:11:0
+
 RUN: echo "unexisting-file 0x1234" > %t.input2
 RUN: llvm-symbolizer < %t.input2
 

diff --git a/test/DebugInfo/lto-comp-dir.ll b/test/DebugInfo/lto-comp-dir.ll
index d272dff..f07b751 100644
--- a/test/DebugInfo/lto-comp-dir.ll
+++ b/test/DebugInfo/lto-comp-dir.ll

@@ -59,24 +59,24 @@
 !llvm.module.flags = !{!16, !17}
 !llvm.ident = !{!18, !18}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1}
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"a.cpp", metadata !"/tmp/dbginfo/a"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_Z4funcv", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z4funcv, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a/a.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00func\00func\00_Z4funcv\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @_Z4funcv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [func]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/a/a.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
-!8 = metadata !{i32 786449, metadata !9, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !10, metadata !2, metadata !2, metadata !"", i32 1}
+!8 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !9, metadata !2, metadata !2, metadata !10, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
 !9 = metadata !{metadata !"b.cpp", metadata !"/tmp/dbginfo/b"}
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786478, metadata !9, metadata !12, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
-!12 = metadata !{i32 786473, metadata !9}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/b/b.cpp]
-!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0x2e\00main\00main\00\002\000\001\000\006\00256\000\002", metadata !9, metadata !12, metadata !13, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
+!12 = metadata !{metadata !"0x29", metadata !9}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/b/b.cpp]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{metadata !15}
-!15 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!15 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !16 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !18 = metadata !{metadata !"clang version 3.5.0 "}
 !19 = metadata !{i32 2, i32 0, metadata !4, null}
 !20 = metadata !{i32 3, i32 0, metadata !11, null}

diff --git a/test/DebugInfo/member-order.ll b/test/DebugInfo/member-order.ll
index 652a6cd..de485a6 100644
--- a/test/DebugInfo/member-order.ll
+++ b/test/DebugInfo/member-order.ll

@@ -29,13 +29,13 @@
 entry:
   %this.addr = alloca %struct.foo*, align 8
   store %struct.foo* %this, %struct.foo** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !16), !dbg !18
+  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !18
   %this1 = load %struct.foo** %this.addr
   ret void, !dbg !19
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -43,24 +43,24 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!15, !20}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !13, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/member-order.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !13, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/member-order.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"member-order.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00foo\001\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6, metadata !11}
-!6 = metadata !{i32 786478, metadata !1, metadata !4, metadata !"f1", metadata !"f1", metadata !"_ZN3foo2f1Ev", i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !10, i32 2} ; [ DW_TAG_subprogram ] [line 2] [f1]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x2e\00f1\00f1\00_ZN3foo2f1Ev\002\000\000\000\006\00256\000\002", metadata !1, metadata !4, metadata !7, null, null, null, i32 0, metadata !10} ; [ DW_TAG_subprogram ] [line 2] [f1]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
 !10 = metadata !{i32 786468}
-!11 = metadata !{i32 786478, metadata !1, metadata !4, metadata !"f2", metadata !"f2", metadata !"_ZN3foo2f2Ev", i32 3, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !12, i32 3} ; [ DW_TAG_subprogram ] [line 3] [f2]
+!11 = metadata !{metadata !"0x2e\00f2\00f2\00_ZN3foo2f2Ev\003\000\000\000\006\00256\000\003", metadata !1, metadata !4, metadata !7, null, null, null, i32 0, metadata !12} ; [ DW_TAG_subprogram ] [line 3] [f2]
 !12 = metadata !{i32 786468}
 !13 = metadata !{metadata !14}
-!14 = metadata !{i32 786478, metadata !1, null, metadata !"f1", metadata !"f1", metadata !"_ZN3foo2f1Ev", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.foo*)* @_ZN3foo2f1Ev, null, metadata !6, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
+!14 = metadata !{metadata !"0x2e\00f1\00f1\00_ZN3foo2f1Ev\006\000\001\000\006\00256\000\006", metadata !1, null, metadata !7, null, void (%struct.foo*)* @_ZN3foo2f1Ev, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
 !15 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!16 = metadata !{i32 786689, metadata !14, metadata !"this", null, i32 16777216, metadata !17, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
+!16 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !14, null, metadata !17} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
 !18 = metadata !{i32 0, i32 0, metadata !14, null}
 !19 = metadata !{i32 7, i32 0, metadata !14, null}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/member-pointers.ll b/test/DebugInfo/member-pointers.ll
index 4ca6942..4d45ba6 100644
--- a/test/DebugInfo/member-pointers.ll
+++ b/test/DebugInfo/member-pointers.ll

@@ -23,18 +23,18 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!16}
 
-!0 = metadata !{i32 786449, metadata !15, i32 4, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/blaikie/Development/scratch/simple.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 \000\00\000\00\000", metadata !15, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !1} ; [ DW_TAG_compile_unit ] [/home/blaikie/Development/scratch/simple.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{}
 !3 = metadata !{metadata !5, metadata !10}
-!5 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !6, i32 4, metadata !7, i32 0, i32 1, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
-!6 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786451, metadata !15, null, metadata !"S", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !1, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 8, align 8, offset 0] [def] [from ]
-!10 = metadata !{i32 786484, i32 0, null, metadata !"y", metadata !"y", metadata !"", metadata !6, i32 5, metadata !11, i32 0, i32 1, { i64, i64 }* @y, null} ; [ DW_TAG_variable ] [y] [line 5] [def]
-!11 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !12, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x34\00x\00x\00\004\000\001", null, metadata !6, metadata !7, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!6 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !8, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x13\00S\001\008\008\000\000\000", metadata !15, null, null, metadata !1, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 8, align 8, offset 0] [def] [from ]
+!10 = metadata !{metadata !"0x34\00y\00y\00\005\000\001", null, metadata !6, metadata !11, { i64, i64 }* @y, null} ; [ DW_TAG_variable ] [y] [line 5] [def]
+!11 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !12, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{null, metadata !14, metadata !8}
-!14 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from S]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from S]
 !15 = metadata !{metadata !"simple.cpp", metadata !"/home/blaikie/Development/scratch"}
-!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/tools/llvm-profdata/Inputs/empty.profdata b/test/DebugInfo/member-pointers.o
similarity index 100%
copy from test/tools/llvm-profdata/Inputs/empty.profdata
copy to test/DebugInfo/member-pointers.o


diff --git a/test/DebugInfo/missing-abstract-variable.ll b/test/DebugInfo/missing-abstract-variable.ll
index 59a38cf..104080a 100644
--- a/test/DebugInfo/missing-abstract-variable.ll
+++ b/test/DebugInfo/missing-abstract-variable.ll

@@ -37,19 +37,17 @@
 ;   x(u);
 ; }
 
-; CHECK: [[ABS_X:.*]]: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK:   DW_AT_name {{.*}} "x"
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: [[ABS_B:.*]]:   DW_TAG_formal_parameter
+; CHECK:   DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
 ; CHECK:     DW_AT_name {{.*}} "b"
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK:     DW_TAG_lexical_block
-; CHECK-NOT: {{DW_TAG|NULL}}
 ; CHECK:       DW_TAG_lexical_block
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: [[ABS_S:.*]]:       DW_TAG_variable
+; CHECK:   DW_TAG_variable
 ; CHECK-NOT: DW_TAG
 ; CHECK:         DW_AT_name {{.*}} "s"
 
@@ -59,11 +57,11 @@
 ; CHECK-NOT: {{DW_TAG|NULL}}
 ; CHECK:   DW_TAG_inlined_subroutine
 ; CHECK-NOT: DW_TAG
-; CHECK:     DW_AT_abstract_origin {{.*}} {[[ABS_X]]}
+; CHECK:     DW_AT_abstract_origin {{.*}} "_Z1xb"
 ; CHECK-NOT: {{DW_TAG|NULL}}
 ; CHECK:     DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
-; CHECK:       DW_AT_abstract_origin {{.*}} {[[ABS_B]]}
+; CHECK:       DW_AT_abstract_origin {{.*}} "b"
 ; Notice 'x's local variable 's' is missing. Not necessarily a bug here,
 ; since it's been optimized entirely away and it should be described in
 ; abstract subprogram.
@@ -80,35 +78,28 @@
 ; CHECK-NOT: {{DW_TAG|NULL}}
 ; CHECK:   DW_TAG_inlined_subroutine
 ; CHECK-NOT: DW_TAG
-; CHECK:     DW_AT_abstract_origin {{.*}} {[[ABS_X]]}
+; CHECK:     DW_AT_abstract_origin {{.*}} "_Z1xb"
 ; CHECK-NOT: {{DW_TAG|NULL}}
 ; FIXME: This formal parameter goes missing at least at -O2 (& on
 ; mips/powerpc), maybe before that. Perhaps SelectionDAG is to blame (and
 ; fastisel succeeds).
 ; CHECK:     DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
-; CHECK:       DW_AT_abstract_origin {{.*}} {[[ABS_B]]}
-
-; The two lexical blocks here are caused by the scope of the if that includes
-; the condition variable, and the scope within the if's composite statement. I'm
-; not sure we really need both of them since there's no variable declared in the
-; outer of the two
+; CHECK:       DW_AT_abstract_origin {{.*}} "b"
 
 ; CHECK-NOT: {{DW_TAG|NULL}}
 ; CHECK:     DW_TAG_lexical_block
 ; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK:       DW_TAG_lexical_block
-; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK:         DW_TAG_variable
+; CHECK:       DW_TAG_variable
 ; CHECK-NOT: DW_TAG
-; CHECK:           DW_AT_abstract_origin {{.*}} {[[ABS_S]]}
+; CHECK:         DW_AT_abstract_origin {{.*}} "s"
 
 @t = external global i32
 
 ; Function Attrs: uwtable
 define void @_Z1bv() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !24, i64 0, metadata !25), !dbg !27
+  tail call void @llvm.dbg.value(metadata !24, i64 0, metadata !25, metadata !{metadata !"0x102"}), !dbg !27
   tail call void @_Z1fi(i32 0), !dbg !28
   ret void, !dbg !29
 }
@@ -116,13 +107,13 @@
 ; Function Attrs: uwtable
 define void @_Z1ab(i1 zeroext %u) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i1 %u}, i64 0, metadata !13), !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i1 %u}, i64 0, metadata !31), !dbg !33
+  tail call void @llvm.dbg.value(metadata !{i1 %u}, i64 0, metadata !13, metadata !{metadata !"0x102"}), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i1 %u}, i64 0, metadata !31, metadata !{metadata !"0x102"}), !dbg !33
   br i1 %u, label %if.then.i, label %_Z1xb.exit, !dbg !34
 
 if.then.i:                                        ; preds = %entry
   %0 = load i32* @t, align 4, !dbg !35, !tbaa !36
-  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !40), !dbg !35
+  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !40, metadata !{metadata !"0x102"}), !dbg !35
   tail call void @_Z1fi(i32 %0), !dbg !41
   br label %_Z1xb.exit, !dbg !42
 
@@ -134,7 +125,7 @@
 declare void @_Z1fi(i32) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -144,38 +135,38 @@
 !llvm.module.flags = !{!21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/missing-abstract-variables.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/missing-abstract-variables.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"missing-abstract-variables.cc", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !8, metadata !14}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"b", metadata !"b", metadata !"_Z1bv", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z1bv, null, null, metadata !2, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [b]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/missing-abstract-variables.cc]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00b\00b\00_Z1bv\0013\000\001\000\006\00256\001\0013", metadata !1, metadata !5, metadata !6, null, void ()* @_Z1bv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 13] [def] [b]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/missing-abstract-variables.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
-!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"a", metadata !"a", metadata !"_Z1ab", i32 17, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i1)* @_Z1ab, null, null, metadata !12, i32 17} ; [ DW_TAG_subprogram ] [line 17] [def] [a]
-!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x2e\00a\00a\00_Z1ab\0017\000\001\000\006\00256\001\0017", metadata !1, metadata !5, metadata !9, null, void (i1)* @_Z1ab, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 17] [def] [a]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{null, metadata !11}
-!11 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!11 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786689, metadata !8, metadata !"u", metadata !5, i32 16777233, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [u] [line 17]
-!14 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x", metadata !"x", metadata !"_Z1xb", i32 5, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !15, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [x]
+!13 = metadata !{metadata !"0x101\00u\0016777233\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [u] [line 17]
+!14 = metadata !{metadata !"0x2e\00x\00x\00_Z1xb\005\000\001\000\006\00256\001\005", metadata !1, metadata !5, metadata !9, null, null, null, null, metadata !15} ; [ DW_TAG_subprogram ] [line 5] [def] [x]
 !15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{i32 786689, metadata !14, metadata !"b", metadata !5, i32 16777221, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 5]
-!17 = metadata !{i32 786688, metadata !18, metadata !"s", metadata !5, i32 7, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [s] [line 7]
-!18 = metadata !{i32 786443, metadata !1, metadata !19, i32 6, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
-!19 = metadata !{i32 786443, metadata !1, metadata !14, i32 6, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
-!20 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!16 = metadata !{metadata !"0x101\00b\0016777221\000", metadata !14, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [b] [line 5]
+!17 = metadata !{metadata !"0x100\00s\007\000", metadata !18, metadata !5, metadata !20} ; [ DW_TAG_auto_variable ] [s] [line 7]
+!18 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !19} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
+!19 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !14} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/missing-abstract-variables.cc]
+!20 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !21 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!22 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !23 = metadata !{metadata !"clang version 3.5.0 "}
 !24 = metadata !{i1 false}
-!25 = metadata !{i32 786689, metadata !14, metadata !"b", metadata !5, i32 16777221, metadata !11, i32 0, metadata !26} ; [ DW_TAG_arg_variable ] [b] [line 5]
+!25 = metadata !{metadata !"0x101\00b\0016777221\000", metadata !14, metadata !5, metadata !11, metadata !26} ; [ DW_TAG_arg_variable ] [b] [line 5]
 !26 = metadata !{i32 14, i32 0, metadata !4, null}
 !27 = metadata !{i32 5, i32 0, metadata !14, metadata !26}
 !28 = metadata !{i32 10, i32 0, metadata !14, metadata !26}
 !29 = metadata !{i32 15, i32 0, metadata !4, null}
 !30 = metadata !{i32 17, i32 0, metadata !8, null}
-!31 = metadata !{i32 786689, metadata !14, metadata !"b", metadata !5, i32 16777221, metadata !11, i32 0, metadata !32} ; [ DW_TAG_arg_variable ] [b] [line 5]
+!31 = metadata !{metadata !"0x101\00b\0016777221\000", metadata !14, metadata !5, metadata !11, metadata !32} ; [ DW_TAG_arg_variable ] [b] [line 5]
 !32 = metadata !{i32 18, i32 0, metadata !8, null}
 !33 = metadata !{i32 5, i32 0, metadata !14, metadata !32}
 !34 = metadata !{i32 6, i32 0, metadata !19, metadata !32}
@@ -184,8 +175,8 @@
 !37 = metadata !{metadata !"int", metadata !38, i64 0}
 !38 = metadata !{metadata !"omnipotent char", metadata !39, i64 0}
 !39 = metadata !{metadata !"Simple C/C++ TBAA"}
-!40 = metadata !{i32 786688, metadata !18, metadata !"s", metadata !5, i32 7, metadata !20, i32 0, metadata !32} ; [ DW_TAG_auto_variable ] [s] [line 7]
-!41 = metadata !{i32 8, i32 0, metadata !18, metadata !32} ; [ DW_TAG_imported_declaration ]
+!40 = metadata !{metadata !"0x100\00s\007\000", metadata !18, metadata !5, metadata !20, metadata !32} ; [ DW_TAG_auto_variable ] [s] [line 7]
+!41 = metadata !{i32 8, i32 0, metadata !18, metadata !32}
 !42 = metadata !{i32 9, i32 0, metadata !18, metadata !32}
 !43 = metadata !{i32 10, i32 0, metadata !14, metadata !32}
 !44 = metadata !{i32 19, i32 0, metadata !8, null}

diff --git a/test/tools/llvm-profdata/Inputs/empty.profdata b/test/DebugInfo/missing-abstract-variable.o
similarity index 100%
copy from test/tools/llvm-profdata/Inputs/empty.profdata
copy to test/DebugInfo/missing-abstract-variable.o


diff --git a/test/DebugInfo/namespace.ll b/test/DebugInfo/namespace.ll
index a9de62c..edbeed5 100644
--- a/test/DebugInfo/namespace.ll
+++ b/test/DebugInfo/namespace.ll

@@ -5,16 +5,18 @@
 ; CHECK: debug_info contents
 ; CHECK: [[NS1:0x[0-9a-f]*]]:{{ *}}DW_TAG_namespace
 ; CHECK-NEXT: DW_AT_name{{.*}} = "A"
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F1:[0-9]]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x03)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F1:".*debug-info-namespace.cpp"]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(5)
 ; CHECK-NOT: NULL
 ; CHECK: [[NS2:0x[0-9a-f]*]]:{{ *}}DW_TAG_namespace
 ; CHECK-NEXT: DW_AT_name{{.*}} = "B"
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F2:[0-9]]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x01)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2:".*foo.cpp"]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(1)
 ; CHECK-NOT: NULL
 ; CHECK: [[I:0x[0-9a-f]*]]:{{ *}}DW_TAG_variable
 ; CHECK-NEXT: DW_AT_name{{.*}}= "i"
+; CHECK: [[VAR_FWD:0x[0-9a-f]*]]:{{ *}}DW_TAG_variable
+; CHECK-NEXT: DW_AT_name{{.*}}= "var_fwd"
 ; CHECK-NOT: NULL
 ; CHECK: [[FOO:0x[0-9a-f]*]]:{{ *}}DW_TAG_structure_type
 ; CHECK-NEXT: DW_AT_name{{.*}}= "foo"
@@ -22,12 +24,28 @@
 ; CHECK-NOT: NULL
 ; CHECK: [[BAR:0x[0-9a-f]*]]:{{ *}}DW_TAG_structure_type
 ; CHECK-NEXT: DW_AT_name{{.*}}= "bar"
-; CHECK: NULL
 ; CHECK: [[FUNC1:.*]]: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_MIPS_linkage_name
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_name{{.*}}= "f1"
+; CHECK: [[BAZ:0x[0-9a-f]*]]:{{.*}}DW_TAG_typedef
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}= "baz"
+; CHECK: [[VAR_DECL:0x[0-9a-f]*]]:{{.*}}DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}= "var_decl"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_declaration
+; CHECK: [[FUNC_DECL:0x[0-9a-f]*]]:{{.*}}DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}= "func_decl"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_declaration
+; CHECK: [[FUNC_FWD:0x[0-9a-f]*]]:{{.*}}DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name{{.*}}= "func_fwd"
+; CHECK-NOT: DW_AT_declaration
 ; CHECK: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_MIPS_linkage_name
@@ -39,16 +57,16 @@
 ; CHECK: DW_TAG_imported_module
 ; This is a bug, it should be in F2 but it inherits the file from its
 ; enclosing scope
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F1]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x08)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F1]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(15)
 ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS2]]})
 ; CHECK: NULL
 ; CHECK-NOT: NULL
 
 ; CHECK: DW_TAG_imported_module
 ; Same bug as above, this should be F2, not F1
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F1]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x0b)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F1]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(18)
 ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS1]]})
 ; CHECK-NOT: NULL
 
@@ -59,71 +77,102 @@
 ; CHECK: DW_AT_name{{.*}}= "func"
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_imported_module
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F2]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x12)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(26)
 ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS1]]})
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_imported_declaration
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F2]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x13)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(27)
 ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[FOO]]})
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_imported_declaration
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F2]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x14)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(28)
 ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[BAR]]})
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_imported_declaration
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F2]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x15)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(29)
 ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[FUNC1]]})
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_imported_declaration
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F2]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x16)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(30)
 ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[I]]})
 ; CHECK-NOT: NULL
+; CHECK: DW_TAG_imported_declaration
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(31)
+; CHECK-NEXT: DW_AT_import{{.*}}=> {[[BAZ]]})
+; CHECK-NOT: NULL
 ; CHECK: [[X:0x[0-9a-f]*]]:{{ *}}DW_TAG_imported_declaration
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F2]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x18)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(32)
 ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS1]]})
 ; CHECK-NEXT: DW_AT_name{{.*}}"X"
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_imported_declaration
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F2]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x19)
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(33)
 ; CHECK-NEXT: DW_AT_import{{.*}}=> {[[X]]})
 ; CHECK-NEXT: DW_AT_name{{.*}}"Y"
 ; CHECK-NOT: NULL
+; CHECK: DW_TAG_imported_declaration
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(34)
+; CHECK-NEXT: DW_AT_import{{.*}}=> {[[VAR_DECL]]})
+; CHECK: DW_TAG_imported_declaration
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(35)
+; CHECK-NEXT: DW_AT_import{{.*}}=> {[[FUNC_DECL]]})
+; CHECK: DW_TAG_imported_declaration
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(36)
+; CHECK-NEXT: DW_AT_import{{.*}}=> {[[VAR_FWD]]})
+; CHECK: DW_TAG_imported_declaration
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(37)
+; CHECK-NEXT: DW_AT_import{{.*}}=> {[[FUNC_FWD]]})
+
 ; CHECK: DW_TAG_lexical_block
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_imported_module
-; CHECK-NEXT: DW_AT_decl_file{{.*}}(0x0[[F2]])
-; CHECK-NEXT: DW_AT_decl_line{{.*}}(0x0f)
-; CHECK-NEXT: DW_AT_import{{.*}}=> {[[NS2]]})
+; CHECK-NEXT: DW_AT_decl_file{{.*}}([[F2]])
+; CHECK-NEXT: DW_AT_decl_line{{.*}}(23)
+; CHECK-NEXT: DW_AT_import{{.*}}=>
 ; CHECK: NULL
 ; CHECK: NULL
 ; CHECK: NULL
 
-; CHECK: file_names[  [[F1]]]{{.*}}debug-info-namespace.cpp
-; CHECK: file_names[  [[F2]]]{{.*}}foo.cpp
-
 ; IR generated from clang/test/CodeGenCXX/debug-info-namespace.cpp, file paths
 ; changed to protect the guilty. The C++ source code is:
+; // RUN...
+; // RUN...
+; // RUN...
+;
 ; namespace A {
 ; #line 1 "foo.cpp"
 ; namespace B {
-; int i;
-; void f1() { }
+; extern int i;
+; int f1() { return 0; }
 ; void f1(int) { }
 ; struct foo;
 ; struct bar { };
+; typedef bar baz;
+; extern int var_decl;
+; void func_decl(void);
+; extern int var_fwd;
+; void func_fwd(void);
 ; }
+; }
+; namespace A {
 ; using namespace B;
 ; }
 ;
 ; using namespace A;
-;
+; namespace E = A;
+; int B::i = f1();
 ; int func(bool b) {
 ;   if (b) {
 ;     using namespace A::B;
@@ -134,123 +183,184 @@
 ;   using B::bar;
 ;   using B::f1;
 ;   using B::i;
-;   bar x;
+;   using B::baz;
 ;   namespace X = A;
 ;   namespace Y = X;
+;   using B::var_decl;
+;   using B::func_decl;
+;   using B::var_fwd;
+;   using B::func_fwd;
 ;   return i + X::B::i + Y::B::i;
 ; }
-
-%"struct.A::B::bar" = type { i8 }
+;
+; namespace A {
+; using B::i;
+; namespace B {
+; int var_fwd = i;
+; }
+; }
+; void B::func_fwd() {}
 
 @_ZN1A1B1iE = global i32 0, align 4
+@_ZN1A1B7var_fwdE = global i32 0, align 4
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_debug_info_namespace.cpp, i8* null }]
 
-; Function Attrs: nounwind uwtable
-define void @_ZN1A1B2f1Ev() #0 {
+; Function Attrs: nounwind ssp uwtable
+define i32 @_ZN1A1B2f1Ev() #0 {
 entry:
-  ret void, !dbg !41
+  ret i32 0, !dbg !60
 }
 
-; Function Attrs: nounwind uwtable
+; Function Attrs: nounwind ssp uwtable
 define void @_ZN1A1B2f1Ei(i32) #0 {
 entry:
   %.addr = alloca i32, align 4
   store i32 %0, i32* %.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %.addr}, metadata !42), !dbg !43
-  ret void, !dbg !43
+  call void @llvm.dbg.declare(metadata !{i32* %.addr}, metadata !61, metadata !62), !dbg !63
+  ret void, !dbg !64
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-; Function Attrs: nounwind uwtable
+define internal void @__cxx_global_var_init() section "__TEXT,__StaticInit,regular,pure_instructions" {
+entry:
+  %call = call i32 @_ZN1A1B2f1Ev(), !dbg !65
+  store i32 %call, i32* @_ZN1A1B1iE, align 4, !dbg !65
+  ret void, !dbg !65
+}
+
+; Function Attrs: nounwind ssp uwtable
 define i32 @_Z4funcb(i1 zeroext %b) #0 {
 entry:
   %retval = alloca i32, align 4
   %b.addr = alloca i8, align 1
-  %x = alloca %"struct.A::B::bar", align 1
   %frombool = zext i1 %b to i8
   store i8 %frombool, i8* %b.addr, align 1
-  call void @llvm.dbg.declare(metadata !{i8* %b.addr}, metadata !44), !dbg !45
-  %0 = load i8* %b.addr, align 1, !dbg !46
-  %tobool = trunc i8 %0 to i1, !dbg !46
-  br i1 %tobool, label %if.then, label %if.end, !dbg !46
+  call void @llvm.dbg.declare(metadata !{i8* %b.addr}, metadata !66, metadata !62), !dbg !67
+  %0 = load i8* %b.addr, align 1, !dbg !68
+  %tobool = trunc i8 %0 to i1, !dbg !68
+  br i1 %tobool, label %if.then, label %if.end, !dbg !68
 
 if.then:                                          ; preds = %entry
-  %1 = load i32* @_ZN1A1B1iE, align 4, !dbg !47
-  store i32 %1, i32* %retval, !dbg !47
-  br label %return, !dbg !47
+  %1 = load i32* @_ZN1A1B1iE, align 4, !dbg !69
+  store i32 %1, i32* %retval, !dbg !69
+  br label %return, !dbg !69
 
 if.end:                                           ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{%"struct.A::B::bar"* %x}, metadata !48), !dbg !49
-  %2 = load i32* @_ZN1A1B1iE, align 4, !dbg !50
-  %3 = load i32* @_ZN1A1B1iE, align 4, !dbg !50
-  %add = add nsw i32 %2, %3, !dbg !50
-  %4 = load i32* @_ZN1A1B1iE, align 4, !dbg !50
-  %add1 = add nsw i32 %add, %4, !dbg !50
-  store i32 %add1, i32* %retval, !dbg !50
-  br label %return, !dbg !50
+  %2 = load i32* @_ZN1A1B1iE, align 4, !dbg !70
+  %3 = load i32* @_ZN1A1B1iE, align 4, !dbg !70
+  %add = add nsw i32 %2, %3, !dbg !70
+  %4 = load i32* @_ZN1A1B1iE, align 4, !dbg !70
+  %add1 = add nsw i32 %add, %4, !dbg !70
+  store i32 %add1, i32* %retval, !dbg !70
+  br label %return, !dbg !70
 
 return:                                           ; preds = %if.end, %if.then
-  %5 = load i32* %retval, !dbg !51
-  ret i32 %5, !dbg !51
+  %5 = load i32* %retval, !dbg !71
+  ret i32 %5, !dbg !71
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+define internal void @__cxx_global_var_init1() section "__TEXT,__StaticInit,regular,pure_instructions" {
+entry:
+  %0 = load i32* @_ZN1A1B1iE, align 4, !dbg !72
+  store i32 %0, i32* @_ZN1A1B7var_fwdE, align 4, !dbg !72
+  ret void, !dbg !72
+}
+
+; Function Attrs: nounwind ssp uwtable
+define void @_ZN1A1B8func_fwdEv() #0 {
+entry:
+  ret void, !dbg !73
+}
+
+define internal void @_GLOBAL__sub_I_debug_info_namespace.cpp() section "__TEXT,__StaticInit,regular,pure_instructions" {
+entry:
+  call void @__cxx_global_var_init(), !dbg !74
+  call void @__cxx_global_var_init1(), !dbg !74
+  ret void, !dbg !74
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!52}
+!llvm.module.flags = !{!57, !58}
+!llvm.ident = !{!59}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !19, metadata !21, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/llvm/build/clang/debug//usr/local/google/home/blaikie/dev/llvm/src/tools/clang/test/CodeGenCXX/debug-info-namespace.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"/usr/local/google/home/blaikie/dev/llvm/src/tools/clang/test/CodeGenCXX/debug-info-namespace.cpp", metadata !"/usr/local/google/home/blaikie/dev/llvm/build/clang/debug"}
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !9, metadata !30, metadata !33} ; [ DW_TAG_compile_unit ] [/tmp/debug-info-namespace.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"debug-info-namespace.cpp", metadata !"/tmp"}
 !2 = metadata !{}
-!3 = metadata !{metadata !4, metadata !10, metadata !14}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"f1", metadata !"f1", metadata !"_ZN1A1B2f1Ev", i32 3, metadata !8, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN1A1B2f1Ev, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f1]
-!5 = metadata !{metadata !"foo.cpp", metadata !"/usr/local/google/home/blaikie/dev/llvm/build/clang/debug"}
-!6 = metadata !{i32 786489, metadata !5, metadata !7, metadata !"B", i32 1} ; [ DW_TAG_namespace ] [B] [line 1]
-!7 = metadata !{i32 786489, metadata !1, null, metadata !"A", i32 3} ; [ DW_TAG_namespace ] [A] [line 3]
-!8 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!9 = metadata !{null}
-!10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"f1", metadata !"f1", metadata !"_ZN1A1B2f1Ei", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_ZN1A1B2f1Ei, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [f1]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{i32 786478, metadata !5, metadata !15, metadata !"func", metadata !"func", metadata !"_Z4funcb", i32 13, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i1)* @_Z4funcb, null, null, metadata !2, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [func]
-!15 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/llvm/build/clang/debug/foo.cpp]
-!16 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{metadata !13, metadata !18}
-!18 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
-!19 = metadata !{metadata !20}
-!20 = metadata !{i32 786484, i32 0, metadata !6, metadata !"i", metadata !"i", metadata !"_ZN1A1B1iE", metadata !15, i32 2, metadata !13, i32 0, i32 1, i32* @_ZN1A1B1iE, null} ; [ DW_TAG_variable ] [i] [line 2] [def]
-!21 = metadata !{metadata !22, metadata !23, metadata !24, metadata !26, metadata !27, metadata !29, metadata !37, metadata !38, metadata !39, metadata !40}
-!22 = metadata !{i32 786490, metadata !7, metadata !6, i32 8} ; [ DW_TAG_imported_module ]
-!23 = metadata !{i32 786490, metadata !0, metadata !7, i32 11} ; [ DW_TAG_imported_module ]
-!24 = metadata !{i32 786490, metadata !25, metadata !6, i32 15} ; [ DW_TAG_imported_module ]
-!25 = metadata !{i32 786443, metadata !5, metadata !14, i32 14, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/llvm/build/clang/debug/foo.cpp]
-!26 = metadata !{i32 786490, metadata !14, metadata !7, i32 18} ; [ DW_TAG_imported_module ]
-!27 = metadata !{i32 786440, metadata !14, metadata !28, i32 19} ; [ DW_TAG_imported_declaration ]
-!28 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"foo", i32 5, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 5, size 0, align 0, offset 0] [decl] [from ]
-!29 = metadata !{i32 786440, metadata !14, metadata !30, i32 20} ; [ DW_TAG_imported_declaration ]
-!30 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"bar", i32 6, i64 8, i64 8, i32 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 6, size 8, align 8, offset 0] [def] [from ]
-!31 = metadata !{metadata !32}
-!32 = metadata !{i32 786478, metadata !5, metadata !30, metadata !"bar", metadata !"bar", metadata !"", i32 6, metadata !33, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !36, i32 6} ; [ DW_TAG_subprogram ] [line 6] [bar]
-!33 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !34, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!34 = metadata !{null, metadata !35}
-!35 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !30} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from bar]
-!36 = metadata !{i32 786468}
-!37 = metadata !{i32 786440, metadata !14, metadata !10, i32 21} ; [ DW_TAG_imported_declaration ]
-!38 = metadata !{i32 786440, metadata !14, metadata !20, i32 22} ; [ DW_TAG_imported_declaration ]
-!39 = metadata !{i32 786440, metadata !14, metadata !7, i32 24, metadata !"X"} ; [ DW_TAG_imported_declaration ]
-!40 = metadata !{i32 786440, metadata !14, metadata !39, i32 25, metadata !"Y"} ; [ DW_TAG_imported_declaration ]
-!41 = metadata !{i32 3, i32 0, metadata !4, null}
-!42 = metadata !{i32 786689, metadata !10, metadata !"", metadata !15, i32 16777220, metadata !13, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 4]
-!43 = metadata !{i32 4, i32 0, metadata !10, null}
-!44 = metadata !{i32 786689, metadata !14, metadata !"b", metadata !15, i32 16777229, metadata !18, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 13]
-!45 = metadata !{i32 13, i32 0, metadata !14, null}
-!46 = metadata !{i32 14, i32 0, metadata !14, null}
-!47 = metadata !{i32 16, i32 0, metadata !25, null}
-!48 = metadata !{i32 786688, metadata !14, metadata !"x", metadata !15, i32 23, metadata !30, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [x] [line 23]
-!49 = metadata !{i32 23, i32 0, metadata !14, null}
-!50 = metadata !{i32 26, i32 0, metadata !14, null}
-!51 = metadata !{i32 27, i32 0, metadata !14, null}
-!52 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!3 = metadata !{metadata !4, metadata !8}
+!4 = metadata !{metadata !"0x13\00foo\005\000\000\000\004\000", metadata !5, metadata !6, null, null, null, null, metadata !"_ZTSN1A1B3fooE"} ; [ DW_TAG_structure_type ] [foo] [line 5, size 0, align 0, offset 0] [decl] [from ]
+!5 = metadata !{metadata !"foo.cpp", metadata !"/tmp"}
+!6 = metadata !{metadata !"0x39\00B\001", metadata !5, metadata !7} ; [ DW_TAG_namespace ] [B] [line 1]
+!7 = metadata !{metadata !"0x39\00A\005", metadata !1, null} ; [ DW_TAG_namespace ] [A] [line 5]
+!8 = metadata !{metadata !"0x13\00bar\006\008\008\000\000\000", metadata !5, metadata !6, null, metadata !2, null, null, metadata !"_ZTSN1A1B3barE"} ; [ DW_TAG_structure_type ] [bar] [line 6, size 8, align 8, offset 0] [def] [from ]
+!9 = metadata !{metadata !10, metadata !14, metadata !17, metadata !21, metadata !25, metadata !26, metadata !27}
+!10 = metadata !{metadata !"0x2e\00f1\00f1\00_ZN1A1B2f1Ev\003\000\001\000\000\00256\000\003", metadata !5, metadata !6, metadata !11, null, i32 ()* @_ZN1A1B2f1Ev, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f1]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !"0x2e\00f1\00f1\00_ZN1A1B2f1Ei\004\000\001\000\000\00256\000\004", metadata !5, metadata !6, metadata !15, null, void (i32)* @_ZN1A1B2f1Ei, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f1]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{null, metadata !13}
+!17 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\0020\001\001\000\000\00256\000\0020", metadata !5, metadata !18, metadata !19, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 20] [local] [def] [__cxx_global_var_init]
+!18 = metadata !{metadata !"0x29", metadata !5}   ; [ DW_TAG_file_type ] [/tmp/foo.cpp]
+!19 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{null}
+!21 = metadata !{metadata !"0x2e\00func\00func\00_Z4funcb\0021\000\001\000\000\00256\000\0021", metadata !5, metadata !18, metadata !22, null, i32 (i1)* @_Z4funcb, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 21] [def] [func]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{metadata !13, metadata !24}
+!24 = metadata !{metadata !"0x24\00bool\000\008\008\000\000\002", null, null} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
+!25 = metadata !{metadata !"0x2e\00__cxx_global_var_init1\00__cxx_global_var_init1\00\0044\001\001\000\000\00256\000\0044", metadata !5, metadata !18, metadata !19, null, void ()* @__cxx_global_var_init1, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 44] [local] [def] [__cxx_global_var_init1]
+!26 = metadata !{metadata !"0x2e\00func_fwd\00func_fwd\00_ZN1A1B8func_fwdEv\0047\000\001\000\000\00256\000\0047", metadata !5, metadata !6, metadata !19, null, void ()* @_ZN1A1B8func_fwdEv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 47] [def] [func_fwd]
+!27 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__sub_I_debug_info_namespace.cpp\000\001\001\000\000\0064\000\000", metadata !1, metadata !28, metadata !29, null, void ()* @_GLOBAL__sub_I_debug_info_namespace.cpp, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
+!28 = metadata !{metadata !"0x29", metadata !1}   ; [ DW_TAG_file_type ] [/tmp/debug-info-namespace.cpp]
+!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{metadata !31, metadata !32}
+!31 = metadata !{metadata !"0x34\00i\00i\00_ZN1A1B1iE\0020\000\001", metadata !6, metadata !18, metadata !13, i32* @_ZN1A1B1iE, null} ; [ DW_TAG_variable ] [i] [line 20] [def]
+!32 = metadata !{metadata !"0x34\00var_fwd\00var_fwd\00_ZN1A1B7var_fwdE\0044\000\001", metadata !6, metadata !18, metadata !13, i32* @_ZN1A1B7var_fwdE, null} ; [ DW_TAG_variable ] [var_fwd] [line 44] [def]
+!33 = metadata !{metadata !34, metadata !35, metadata !36, metadata !37, metadata !40, metadata !41, metadata !42, metadata !43, metadata !44, metadata !45, metadata !47, metadata !48, metadata !49, metadata !51, metadata !54, metadata !55, metadata !56}
+!34 = metadata !{metadata !"0x3a\0015\00", metadata !7, metadata !6} ; [ DW_TAG_imported_module ]
+!35 = metadata !{metadata !"0x3a\0018\00", metadata !0, metadata !7} ; [ DW_TAG_imported_module ]
+!36 = metadata !{metadata !"0x8\0019\00E", metadata !0, metadata !7} ; [ DW_TAG_imported_declaration ]
+!37 = metadata !{metadata !"0x3a\0023\00", metadata !38, metadata !6} ; [ DW_TAG_imported_module ]
+!38 = metadata !{metadata !"0xb\0022\0010\001", metadata !5, metadata !39} ; [ DW_TAG_lexical_block ] [/tmp/foo.cpp]
+!39 = metadata !{metadata !"0xb\0022\007\000", metadata !5, metadata !21} ; [ DW_TAG_lexical_block ] [/tmp/foo.cpp]
+!40 = metadata !{metadata !"0x3a\0026\00", metadata !21, metadata !7} ; [ DW_TAG_imported_module ]
+!41 = metadata !{metadata !"0x8\0027\00", metadata !21, metadata !"_ZTSN1A1B3fooE"} ; [ DW_TAG_imported_declaration ]
+!42 = metadata !{metadata !"0x8\0028\00", metadata !21, metadata !"_ZTSN1A1B3barE"} ; [ DW_TAG_imported_declaration ]
+!43 = metadata !{metadata !"0x8\0029\00", metadata !21, metadata !14} ; [ DW_TAG_imported_declaration ]
+!44 = metadata !{metadata !"0x8\0030\00", metadata !21, metadata !31} ; [ DW_TAG_imported_declaration ]
+!45 = metadata !{metadata !"0x8\0031\00", metadata !21, metadata !46} ; [ DW_TAG_imported_declaration ]
+!46 = metadata !{metadata !"0x16\00baz\007\000\000\000\000", metadata !5, metadata !6, metadata !"_ZTSN1A1B3barE"} ; [ DW_TAG_typedef ] [baz] [line 7, size 0, align 0, offset 0] [from _ZTSN1A1B3barE]
+!47 = metadata !{metadata !"0x8\0032\00X", metadata !21, metadata !7} ; [ DW_TAG_imported_declaration ]
+!48 = metadata !{metadata !"0x8\0033\00Y", metadata !21, metadata !47} ; [ DW_TAG_imported_declaration ]
+!49 = metadata !{metadata !"0x8\0034\00", metadata !21, metadata !50} ; [ DW_TAG_imported_declaration ]
+!50 = metadata !{metadata !"0x34\00var_decl\00var_decl\00_ZN1A1B8var_declE\008\000\000", metadata !6, metadata !18, metadata !13, null, null} ; [ DW_TAG_variable ] [var_decl] [line 8]
+!51 = metadata !{metadata !"0x8\0035\00", metadata !21, metadata !52} ; [ DW_TAG_imported_declaration ]
+!52 = metadata !{metadata !"0x2e\00func_decl\00func_decl\00_ZN1A1B9func_declEv\009\000\000\000\000\00256\000\000", metadata !5, metadata !6, metadata !19, null, null, null, null, metadata !53} ; [ DW_TAG_subprogram ] [line 9] [scope 0] [func_decl]
+!53 = metadata !{metadata !"0x24"}
+!54 = metadata !{metadata !"0x8\0036\00", metadata !21, metadata !32} ; [ DW_TAG_imported_declaration ]
+!55 = metadata !{metadata !"0x8\0037\00", metadata !21, metadata !26} ; [ DW_TAG_imported_declaration ]
+!56 = metadata !{metadata !"0x8\0042\00", metadata !7, metadata !31} ; [ DW_TAG_imported_declaration ]
+!57 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!58 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!59 = metadata !{metadata !"clang version 3.6.0 "}
+!60 = metadata !{i32 3, i32 12, metadata !10, null}
+!61 = metadata !{metadata !"0x101\00\0016777220\000", metadata !14, metadata !18, metadata !13} ; [ DW_TAG_arg_variable ] [line 4]
+!62 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
+!63 = metadata !{i32 4, i32 12, metadata !14, null}
+!64 = metadata !{i32 4, i32 16, metadata !14, null}
+!65 = metadata !{i32 20, i32 12, metadata !17, null}
+!66 = metadata !{metadata !"0x101\00b\0016777237\000", metadata !21, metadata !18, metadata !24} ; [ DW_TAG_arg_variable ] [b] [line 21]
+!67 = metadata !{i32 21, i32 15, metadata !21, null}
+!68 = metadata !{i32 22, i32 7, metadata !21, null}
+!69 = metadata !{i32 24, i32 5, metadata !38, null}
+!70 = metadata !{i32 38, i32 3, metadata !21, null}
+!71 = metadata !{i32 39, i32 1, metadata !21, null}
+!72 = metadata !{i32 44, i32 15, metadata !25, null}
+!73 = metadata !{i32 47, i32 21, metadata !26, null}
+!74 = metadata !{i32 0, i32 0, metadata !75, null}
+!75 = metadata !{metadata !"0xb\000", metadata !5, metadata !27} ; [ DW_TAG_lexical_block ] [/tmp/foo.cpp]

diff --git a/test/DebugInfo/namespace_function_definition.ll b/test/DebugInfo/namespace_function_definition.ll
index 590f2b3..7a7e8b8 100644
--- a/test/DebugInfo/namespace_function_definition.ll
+++ b/test/DebugInfo/namespace_function_definition.ll

@@ -30,15 +30,15 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/namespace_function_definition.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/namespace_function_definition.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"namespace_function_definition.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_ZN2ns4funcEv", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns4funcEv, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [func]
-!5 = metadata !{i32 786489, metadata !1, null, metadata !"ns", i32 1} ; [ DW_TAG_namespace ] [ns] [line 1]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00func\00func\00_ZN2ns4funcEv\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, void ()* @_ZN2ns4funcEv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [func]
+!5 = metadata !{metadata !"0x39\00ns\001", metadata !1, null} ; [ DW_TAG_namespace ] [ns] [line 1]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5.0 "}
 !11 = metadata !{i32 3, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/namespace_inline_function_definition.ll b/test/DebugInfo/namespace_inline_function_definition.ll
index 65fa4a4..943a836 100644
--- a/test/DebugInfo/namespace_inline_function_definition.ll
+++ b/test/DebugInfo/namespace_inline_function_definition.ll

@@ -15,19 +15,19 @@
 ; CHECK: DW_TAG_namespace
 ; CHECK-NEXT: DW_AT_name {{.*}} "ns"
 ; CHECK-NOT: DW_TAG
-; CHECK: [[ABS_DEF:0x.*]]: DW_TAG_subprogram
+; CHECK:   DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK:   DW_AT_MIPS_linkage_name {{.*}} "_ZN2ns4funcEi"
 ; CHECK-NOT: DW_TAG
-; CHECK: [[ABS_PRM:0x.*]]:   DW_TAG_formal_parameter
+; CHECK:   DW_TAG_formal_parameter
 ; CHECK:   NULL
 ; CHECK-NOT: NULL
 ; CHECK:   DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
-; CHECK:     DW_AT_abstract_origin {{.*}} {[[ABS_DEF]]}
+; CHECK:     DW_AT_abstract_origin {{.*}} "_ZN2ns4funcEi"
 ; CHECK-NOT: DW_TAG
 ; CHECK:     DW_TAG_formal_parameter
-; CHECK:       DW_AT_abstract_origin {{.*}} {[[ABS_PRM]]}
+; CHECK:       DW_AT_abstract_origin {{.*}} "i"
 ; CHECK:     NULL
 ; CHECK:   NULL
 ; CHECK: NULL
@@ -42,7 +42,7 @@
   store i32 0, i32* %retval
   %0 = load i32* @x, align 4, !dbg !16
   store i32 %0, i32* %i.addr.i, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr.i}, metadata !17), !dbg !18
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr.i}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
   %1 = load i32* %i.addr.i, align 4, !dbg !18
   %mul.i = mul nsw i32 %1, 2, !dbg !18
   ret i32 %mul.i, !dbg !16
@@ -53,14 +53,14 @@
 entry:
   %i.addr = alloca i32, align 4
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !17), !dbg !19
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !17, metadata !{metadata !"0x102"}), !dbg !19
   %0 = load i32* %i.addr, align 4, !dbg !19
   %mul = mul nsw i32 %0, 2, !dbg !19
   ret i32 %mul, !dbg !19
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #2
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
 
 attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { alwaysinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -70,23 +70,23 @@
 !llvm.module.flags = !{!13, !14}
 !llvm.ident = !{!15}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/namespace_inline_function_definition.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/namespace_inline_function_definition.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"namespace_inline_function_definition.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !9}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/namespace_inline_function_definition.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\005\000\001\000\006\00256\000\005", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/namespace_inline_function_definition.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"func", metadata !"func", metadata !"_ZN2ns4funcEi", i32 6, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_ZN2ns4funcEi, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
-!10 = metadata !{i32 786489, metadata !1, null, metadata !"ns", i32 1} ; [ DW_TAG_namespace ] [ns] [line 1]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x2e\00func\00func\00_ZN2ns4funcEi\006\000\001\000\006\00256\000\006", metadata !1, metadata !10, metadata !11, null, i32 (i32)* @_ZN2ns4funcEi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
+!10 = metadata !{metadata !"0x39\00ns\001", metadata !1, null} ; [ DW_TAG_namespace ] [ns] [line 1]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !8, metadata !8}
 !13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!14 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!14 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !15 = metadata !{metadata !"clang version 3.5.0 "}
 !16 = metadata !{i32 5, i32 0, metadata !4, null}
-!17 = metadata !{i32 786689, metadata !9, metadata !"i", metadata !5, i32 16777222, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 6]
+!17 = metadata !{metadata !"0x101\00i\0016777222\000", metadata !9, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [i] [line 6]
 !18 = metadata !{i32 6, i32 0, metadata !9, metadata !16}
 !19 = metadata !{i32 6, i32 0, metadata !9, null}

diff --git a/test/DebugInfo/nodebug.ll b/test/DebugInfo/nodebug.ll
index 4d86b24..acd3e82 100644
--- a/test/DebugInfo/nodebug.ll
+++ b/test/DebugInfo/nodebug.ll

@@ -37,15 +37,15 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/nodebug.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/nodebug.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"nodebug.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f1", metadata !"f1", metadata !"_Z2f1v", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [f1]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/nodebug.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1v\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, null, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [f1]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/nodebug.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5.0 "}
 !11 = metadata !{i32 3, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/restrict.ll b/test/DebugInfo/restrict.ll
index ceb844f..82d91a7 100644
--- a/test/DebugInfo/restrict.ll
+++ b/test/DebugInfo/restrict.ll

@@ -21,12 +21,12 @@
 entry:
   %dst.addr = alloca i8*, align 8
   store i8* %dst, i8** %dst.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8** %dst.addr}, metadata !13), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i8** %dst.addr}, metadata !13, metadata !{metadata !"0x102"}), !dbg !14
   ret void, !dbg !15
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -35,19 +35,19 @@
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/restrict.c] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/restrict.c] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"restrict.c", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"_Z3fooPv", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*)* @_Z3fooPv, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/restrict.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooPv\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i8*)* @_Z3fooPv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/restrict.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
-!8 = metadata !{i32 786487, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_restrict_type ] [line 0, size 0, align 0, offset 0] [from ]
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!8 = metadata !{metadata !"0x37\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_restrict_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
 !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !12 = metadata !{metadata !"clang version 3.5.0 "}
-!13 = metadata !{i32 786689, metadata !4, metadata !"dst", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [dst] [line 1]
+!13 = metadata !{metadata !"0x101\00dst\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [dst] [line 1]
 !14 = metadata !{i32 1, i32 0, metadata !4, null}
 !15 = metadata !{i32 2, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/sugared-constants.ll b/test/DebugInfo/sugared-constants.ll
index 0d2ebe6..8f2a776 100644
--- a/test/DebugInfo/sugared-constants.ll
+++ b/test/DebugInfo/sugared-constants.ll

@@ -24,11 +24,11 @@
 ; Function Attrs: uwtable
 define i32 @main() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !20, i64 0, metadata !10), !dbg !21
+  tail call void @llvm.dbg.value(metadata !20, i64 0, metadata !10, metadata !{metadata !"0x102"}), !dbg !21
   tail call void @_Z4funci(i32 42), !dbg !22
-  tail call void @llvm.dbg.value(metadata !23, i64 0, metadata !12), !dbg !24
+  tail call void @llvm.dbg.value(metadata !23, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !24
   tail call void @_Z4funcj(i32 117), !dbg !25
-  tail call void @llvm.dbg.value(metadata !26, i64 0, metadata !15), !dbg !27
+  tail call void @llvm.dbg.value(metadata !26, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !27
   tail call void @_Z4funcDs(i16 zeroext 7), !dbg !28
   ret i32 0, !dbg !29
 }
@@ -40,7 +40,7 @@
 declare void @_Z4funcDs(i16 zeroext) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #2
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
 attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -50,32 +50,32 @@
 !llvm.module.flags = !{!17, !18}
 !llvm.ident = !{!19}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/const.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/const.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"const.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 4, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !9, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/const.cpp]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\004\000\001\000\006\00256\001\004", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !9} ; [ DW_TAG_subprogram ] [line 4] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/const.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10, metadata !12, metadata !15}
-!10 = metadata !{i32 786688, metadata !4, metadata !"i", metadata !5, i32 5, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 5]
-!11 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
-!12 = metadata !{i32 786688, metadata !4, metadata !"j", metadata !5, i32 7, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 7]
-!13 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !14} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from unsigned int]
-!14 = metadata !{i32 786468, null, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
-!15 = metadata !{i32 786688, metadata !4, metadata !"c", metadata !5, i32 9, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [c] [line 9]
-!16 = metadata !{i32 786468, null, null, metadata !"char16_t", i32 0, i64 16, i64 16, i64 0, i32 0, i32 16} ; [ DW_TAG_base_type ] [char16_t] [line 0, size 16, align 16, offset 0, enc DW_ATE_UTF]
+!10 = metadata !{metadata !"0x100\00i\005\000", metadata !4, metadata !5, metadata !11} ; [ DW_TAG_auto_variable ] [i] [line 5]
+!11 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!12 = metadata !{metadata !"0x100\00j\007\000", metadata !4, metadata !5, metadata !13} ; [ DW_TAG_auto_variable ] [j] [line 7]
+!13 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !14} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from unsigned int]
+!14 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!15 = metadata !{metadata !"0x100\00c\009\000", metadata !4, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [c] [line 9]
+!16 = metadata !{metadata !"0x24\00char16_t\000\0016\0016\000\000\0016", null, null} ; [ DW_TAG_base_type ] [char16_t] [line 0, size 16, align 16, offset 0, enc DW_ATE_UTF]
 !17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !19 = metadata !{metadata !"clang version 3.5.0 "}
 !20 = metadata !{i32 42}
 !21 = metadata !{i32 5, i32 0, metadata !4, null}
 !22 = metadata !{i32 6, i32 0, metadata !4, null}
 !23 = metadata !{i32 117}
 !24 = metadata !{i32 7, i32 0, metadata !4, null}
-!25 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!25 = metadata !{i32 8, i32 0, metadata !4, null}
 !26 = metadata !{i16 7}
 !27 = metadata !{i32 9, i32 0, metadata !4, null}
 !28 = metadata !{i32 10, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/template-recursive-void.ll b/test/DebugInfo/template-recursive-void.ll
index ffbc30e..155b3e8 100644
--- a/test/DebugInfo/template-recursive-void.ll
+++ b/test/DebugInfo/template-recursive-void.ll

@@ -25,41 +25,41 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!36, !37}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 187958) (llvm/trunk 187964)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 187958) (llvm/trunk 187964)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"debug-info-template-recursive.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786484, i32 0, null, metadata !"filters", metadata !"filters", metadata !"", metadata !5, i32 10, metadata !6, i32 0, i32 1, %class.bar* @filters, null} ; [ DW_TAG_variable ] [filters] [line 10] [def]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp]
-!6 = metadata !{i32 786434, metadata !1, null, metadata !"bar", i32 9, i64 8, i64 8, i32 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x34\00filters\00filters\00\0010\000\001", null, metadata !5, metadata !6, %class.bar* @filters, null} ; [ DW_TAG_variable ] [filters] [line 10] [def]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp]
+!6 = metadata !{metadata !"0x2\00bar\009\008\008\000\000\000", metadata !1, null, null, metadata !7, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 8, align 8, offset 0] [def] [from ]
 !7 = metadata !{metadata !8, metadata !31}
-!8 = metadata !{i32 786460, null, metadata !6, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from foo<void>]
-!9 = metadata !{i32 786434, metadata !1, null, metadata !"foo<void>", i32 5, i64 8, i64 8, i32 0, i32 0, null, metadata !10, i32 0, null, metadata !29, null} ; [ DW_TAG_class_type ] [foo<void>] [line 5, size 8, align 8, offset 0] [def] [from ]
+!8 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !6, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from foo<void>]
+!9 = metadata !{metadata !"0x2\00foo<void>\005\008\008\000\000\000", metadata !1, null, null, metadata !10, null, metadata !29, null} ; [ DW_TAG_class_type ] [foo<void>] [line 5, size 8, align 8, offset 0] [def] [from ]
 !10 = metadata !{metadata !11, metadata !19, metadata !25}
-!11 = metadata !{i32 786460, null, metadata !9, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !12} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from base]
-!12 = metadata !{i32 786434, metadata !1, null, metadata !"base", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_class_type ] [base] [line 3, size 8, align 8, offset 0] [def] [from ]
+!11 = metadata !{metadata !"0x1c\00\000\000\000\000\000", null, metadata !9, metadata !12} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from base]
+!12 = metadata !{metadata !"0x2\00base\003\008\008\000\000\000", metadata !1, null, null, metadata !13, null, null, null} ; [ DW_TAG_class_type ] [base] [line 3, size 8, align 8, offset 0] [def] [from ]
 !13 = metadata !{metadata !14}
-!14 = metadata !{i32 786478, metadata !1, metadata !12, metadata !"base", metadata !"base", metadata !"", i32 3, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [base]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0x2e\00base\00base\00\003\000\000\000\006\00320\000\003", metadata !1, metadata !12, metadata !15, null, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 3] [base]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17}
-!17 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from base]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from base]
 !18 = metadata !{i32 786468}
-!19 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"operator=", metadata !"operator=", metadata !"_ZN3fooIvEaSES0_", i32 6, metadata !20, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, metadata !24, i32 6} ; [ DW_TAG_subprogram ] [line 6] [private] [operator=]
-!20 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = metadata !{metadata !"0x2e\00operator=\00operator=\00_ZN3fooIvEaSES0_\006\000\000\000\006\00257\000\006", metadata !1, metadata !9, metadata !20, null, null, null, i32 0, metadata !24} ; [ DW_TAG_subprogram ] [line 6] [private] [operator=]
+!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !21 = metadata !{null, metadata !22, metadata !23}
-!22 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo<void>]
-!23 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo<void>]
+!22 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo<void>]
+!23 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo<void>]
 !24 = metadata !{i32 786468}
-!25 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"foo", metadata !"foo", metadata !"", i32 5, metadata !26, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !28, i32 5} ; [ DW_TAG_subprogram ] [line 5] [foo]
-!26 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!25 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\000\000\006\00320\000\005", metadata !1, metadata !9, metadata !26, null, null, null, i32 0, metadata !28} ; [ DW_TAG_subprogram ] [line 5] [foo]
+!26 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !27 = metadata !{null, metadata !22}
 !28 = metadata !{i32 786468}
 !29 = metadata !{metadata !30}
-!30 = metadata !{i32 786479, null, metadata !"T", null, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
-!31 = metadata !{i32 786478, metadata !1, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 9, metadata !32, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !35, i32 9} ; [ DW_TAG_subprogram ] [line 9] [bar]
-!32 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !33, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{metadata !"0x2f\00T\000\000", null, null, null} ; [ DW_TAG_template_type_parameter ]
+!31 = metadata !{metadata !"0x2e\00bar\00bar\00\009\000\000\000\006\00320\000\009", metadata !1, metadata !6, metadata !32, null, null, null, i32 0, metadata !35} ; [ DW_TAG_subprogram ] [line 9] [bar]
+!32 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !33, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !33 = metadata !{null, metadata !34}
-!34 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from bar]
+!34 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, null, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from bar]
 !35 = metadata !{i32 786468}
 !36 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!37 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!37 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/tu-composite.ll b/test/DebugInfo/tu-composite.ll
index 7a8ff57..036c683 100644
--- a/test/DebugInfo/tu-composite.ll
+++ b/test/DebugInfo/tu-composite.ll

@@ -91,13 +91,13 @@
 entry:
   %this.addr = alloca %struct.C*, align 8
   store %struct.C* %this, %struct.C** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !36), !dbg !38
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !36, metadata !{metadata !"0x102"}), !dbg !38
   %this1 = load %struct.C** %this.addr
   ret void, !dbg !39
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind ssp uwtable
 define void @_Z4testv() #0 {
@@ -108,12 +108,12 @@
   %e = alloca %"struct.D::Nested", align 1
   %p = alloca %"struct.D::Nested2"*, align 8
   %t = alloca %"struct.D::virt", align 8
-  call void @llvm.dbg.declare(metadata !{%struct.bar* %B}, metadata !40), !dbg !42
-  call void @llvm.dbg.declare(metadata !{[3 x %struct.bar]* %A}, metadata !43), !dbg !47
-  call void @llvm.dbg.declare(metadata !{%struct.bar* %B2}, metadata !48), !dbg !50
-  call void @llvm.dbg.declare(metadata !{%"struct.D::Nested"* %e}, metadata !51), !dbg !52
-  call void @llvm.dbg.declare(metadata !{%"struct.D::Nested2"** %p}, metadata !53), !dbg !55
-  call void @llvm.dbg.declare(metadata !{%"struct.D::virt"* %t}, metadata !56), !dbg !57
+  call void @llvm.dbg.declare(metadata !{%struct.bar* %B}, metadata !40, metadata !{metadata !"0x102"}), !dbg !42
+  call void @llvm.dbg.declare(metadata !{[3 x %struct.bar]* %A}, metadata !43, metadata !{metadata !"0x102"}), !dbg !47
+  call void @llvm.dbg.declare(metadata !{%struct.bar* %B2}, metadata !48, metadata !{metadata !"0x102"}), !dbg !50
+  call void @llvm.dbg.declare(metadata !{%"struct.D::Nested"* %e}, metadata !51, metadata !{metadata !"0x102"}), !dbg !52
+  call void @llvm.dbg.declare(metadata !{%"struct.D::Nested2"** %p}, metadata !53, metadata !{metadata !"0x102"}), !dbg !55
+  call void @llvm.dbg.declare(metadata !{%"struct.D::virt"* %t}, metadata !56, metadata !{metadata !"0x102"}), !dbg !57
   ret void, !dbg !58
 }
 
@@ -123,63 +123,63 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!35, !59}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !30, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [tmp.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !30, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [tmp.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tmp.cpp", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !18, metadata !19, metadata !22, metadata !23, metadata !24}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !5, i32 0, metadata !"_ZTS1C", null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 64, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00C\001\0064\0064\000\000\000", metadata !1, null, null, metadata !5, metadata !"_ZTS1C", null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 64, align 64, offset 0] [def] [from ]
 !5 = metadata !{metadata !6, metadata !13}
-!6 = metadata !{i32 786445, metadata !1, metadata !7, metadata !"_vptr$C", i32 0, i64 64, i64 0, i64 0, i32 64, metadata !8} ; [ DW_TAG_member ] [_vptr$C] [line 0, size 64, align 0, offset 0] [artificial] [from ]
-!7 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [tmp.cpp]
-!8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
-!9 = metadata !{i32 786447, null, null, metadata !"__vtbl_ptr_type", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
-!10 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0xd\00_vptr$C\000\0064\000\000\0064", metadata !1, metadata !7, metadata !8} ; [ DW_TAG_member ] [_vptr$C] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!7 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [tmp.cpp]
+!8 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!9 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"foo", metadata !"foo", metadata !"_ZN1C3fooEv", i32 2, metadata !14, i1 false, i1 false, i32 1, i32 0, metadata !"_ZTS1C", i32 256, i1 false, null, null, i32 0, metadata !17, i32 2} ; [ DW_TAG_subprogram ] [line 2] [foo]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1C3fooEv\002\000\000\001\006\00256\000\002", metadata !1, metadata !"_ZTS1C", metadata !14, metadata !"_ZTS1C", null, null, i32 0, metadata !17} ; [ DW_TAG_subprogram ] [line 2] [foo]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !16}
-!16 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
 !17 = metadata !{i32 786468}
-!18 = metadata !{i32 786451, metadata !1, null, metadata !"bar", i32 7, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 7, size 8, align 8, offset 0] [def] [from ]
-!19 = metadata !{i32 786451, metadata !1, null, metadata !"D", i32 9, i64 8, i64 8, i32 0, i32 0, null, metadata !20, i32 0, null, null, metadata !"_ZTS1D"} ; [ DW_TAG_structure_type ] [D] [line 9, size 8, align 8, offset 0] [def] [from ]
+!18 = metadata !{metadata !"0x13\00bar\007\008\008\000\000\000", metadata !1, null, null, metadata !2, null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 7, size 8, align 8, offset 0] [def] [from ]
+!19 = metadata !{metadata !"0x13\00D\009\008\008\000\000\000", metadata !1, null, null, metadata !20, null, null, metadata !"_ZTS1D"} ; [ DW_TAG_structure_type ] [D] [line 9, size 8, align 8, offset 0] [def] [from ]
 !20 = metadata !{metadata !21}
-!21 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1D", metadata !"a", i32 11, i64 0, i64 0, i64 0, i32 4096, metadata !12, null} ; [ DW_TAG_member ] [a] [line 11, size 0, align 0, offset 0] [static] [from int]
-!22 = metadata !{i32 786451, metadata !1, metadata !"_ZTS1D", metadata !"Nested", i32 12, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTSN1D6NestedE"} ; [ DW_TAG_structure_type ] [Nested] [line 12, size 8, align 8, offset 0] [def] [from ]
-!23 = metadata !{i32 786451, metadata !1, metadata !"_ZTS1D", metadata !"Nested2", i32 13, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTSN1D7Nested2E"} ; [ DW_TAG_structure_type ] [Nested2] [line 13, size 0, align 0, offset 0] [decl] [from ]
-!24 = metadata !{i32 786451, metadata !1, metadata !"_ZTS1D", metadata !"virt<bar>", i32 15, i64 64, i64 64, i32 0, i32 0, null, metadata !25, i32 0, null, metadata !28, metadata !"_ZTSN1D4virtI3barEE"} ; [ DW_TAG_structure_type ] [virt<bar>] [line 15, size 64, align 64, offset 0] [def] [from ]
+!21 = metadata !{metadata !"0xd\00a\0011\000\000\000\004096", metadata !1, metadata !"_ZTS1D", metadata !12, null} ; [ DW_TAG_member ] [a] [line 11, size 0, align 0, offset 0] [static] [from int]
+!22 = metadata !{metadata !"0x13\00Nested\0012\008\008\000\000\000", metadata !1, metadata !"_ZTS1D", null, metadata !2, null, null, metadata !"_ZTSN1D6NestedE"} ; [ DW_TAG_structure_type ] [Nested] [line 12, size 8, align 8, offset 0] [def] [from ]
+!23 = metadata !{metadata !"0x13\00Nested2\0013\000\000\000\004\000", metadata !1, metadata !"_ZTS1D", null, null, null, null, metadata !"_ZTSN1D7Nested2E"} ; [ DW_TAG_structure_type ] [Nested2] [line 13, size 0, align 0, offset 0] [decl] [from ]
+!24 = metadata !{metadata !"0x13\00virt<bar>\0015\0064\0064\000\000\000", metadata !1, metadata !"_ZTS1D", null, metadata !25, null, metadata !28, metadata !"_ZTSN1D4virtI3barEE"} ; [ DW_TAG_structure_type ] [virt<bar>] [line 15, size 64, align 64, offset 0] [def] [from ]
 !25 = metadata !{metadata !26}
-!26 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN1D4virtI3barEE", metadata !"values", i32 16, i64 64, i64 64, i64 0, i32 0, metadata !27} ; [ DW_TAG_member ] [values] [line 16, size 64, align 64, offset 0] [from ]
-!27 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS3bar"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3bar]
+!26 = metadata !{metadata !"0xd\00values\0016\0064\0064\000\000", metadata !1, metadata !"_ZTSN1D4virtI3barEE", metadata !27} ; [ DW_TAG_member ] [values] [line 16, size 64, align 64, offset 0] [from ]
+!27 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3bar]
 !28 = metadata !{metadata !29}
-!29 = metadata !{i32 786479, null, metadata !"T", metadata !"_ZTS3bar", null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!29 = metadata !{metadata !"0x2f\00T\000\000", null, metadata !"_ZTS3bar", null} ; [ DW_TAG_template_type_parameter ]
 !30 = metadata !{metadata !31, metadata !32}
-!31 = metadata !{i32 786478, metadata !1, null, metadata !"foo", metadata !"foo", metadata !"_ZN1C3fooEv", i32 4, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C3fooEv, null, metadata !13, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [foo]
-!32 = metadata !{i32 786478, metadata !1, metadata !7, metadata !"test", metadata !"test", metadata !"_Z4testv", i32 20, metadata !33, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z4testv, null, null, metadata !2, i32 20} ; [ DW_TAG_subprogram ] [line 20] [def] [test]
-!33 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !34, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!31 = metadata !{metadata !"0x2e\00foo\00foo\00_ZN1C3fooEv\004\000\001\000\006\00256\000\004", metadata !1, null, metadata !14, null, void (%struct.C*)* @_ZN1C3fooEv, null, metadata !13, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [foo]
+!32 = metadata !{metadata !"0x2e\00test\00test\00_Z4testv\0020\000\001\000\006\00256\000\0020", metadata !1, metadata !7, metadata !33, null, void ()* @_Z4testv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 20] [def] [test]
+!33 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !34, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !34 = metadata !{null}
 !35 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!36 = metadata !{i32 786689, metadata !31, metadata !"this", null, i32 16777216, metadata !37, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!37 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!36 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !31, null, metadata !37} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!37 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
 !38 = metadata !{i32 0, i32 0, metadata !31, null}
 !39 = metadata !{i32 5, i32 0, metadata !31, null}
-!40 = metadata !{i32 786688, metadata !32, metadata !"B", metadata !7, i32 21, metadata !41, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [B] [line 21]
-!41 = metadata !{i32 786454, metadata !1, null, metadata !"baz", i32 8, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz] [line 8, size 0, align 0, offset 0] [from _ZTS3bar]
+!40 = metadata !{metadata !"0x100\00B\0021\000", metadata !32, metadata !7, metadata !41} ; [ DW_TAG_auto_variable ] [B] [line 21]
+!41 = metadata !{metadata !"0x16\00baz\008\000\000\000\000", metadata !1, null, metadata !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz] [line 8, size 0, align 0, offset 0] [from _ZTS3bar]
 !42 = metadata !{i32 21, i32 0, metadata !32, null}
-!43 = metadata !{i32 786688, metadata !32, metadata !"A", metadata !7, i32 22, metadata !44, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [A] [line 22]
-!44 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 24, i64 8, i32 0, i32 0, metadata !"_ZTS3bar", metadata !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 24, align 8, offset 0] [from _ZTS3bar]
+!43 = metadata !{metadata !"0x100\00A\0022\000", metadata !32, metadata !7, metadata !44} ; [ DW_TAG_auto_variable ] [A] [line 22]
+!44 = metadata !{metadata !"0x1\00\000\0024\008\000\000", null, null, metadata !"_ZTS3bar", metadata !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 24, align 8, offset 0] [from _ZTS3bar]
 !45 = metadata !{metadata !46}
-!46 = metadata !{i32 786465, i64 0, i64 3}        ; [ DW_TAG_subrange_type ] [0, 2]
+!46 = metadata !{metadata !"0x21\000\003"}        ; [ DW_TAG_subrange_type ] [0, 2]
 !47 = metadata !{i32 22, i32 0, metadata !32, null}
-!48 = metadata !{i32 786688, metadata !32, metadata !"B2", metadata !7, i32 23, metadata !49, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [B2] [line 23]
-!49 = metadata !{i32 786454, metadata !1, metadata !"_ZTS1D", metadata !"baz2", i32 10, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz2] [line 10, size 0, align 0, offset 0] [from _ZTS3bar]
+!48 = metadata !{metadata !"0x100\00B2\0023\000", metadata !32, metadata !7, metadata !49} ; [ DW_TAG_auto_variable ] [B2] [line 23]
+!49 = metadata !{metadata !"0x16\00baz2\0010\000\000\000\000", metadata !1, metadata !"_ZTS1D", metadata !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz2] [line 10, size 0, align 0, offset 0] [from _ZTS3bar]
 !50 = metadata !{i32 23, i32 0, metadata !32, null}
-!51 = metadata !{i32 786688, metadata !32, metadata !"e", metadata !7, i32 24, metadata !22, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [e] [line 24]
+!51 = metadata !{metadata !"0x100\00e\0024\000", metadata !32, metadata !7, metadata !22} ; [ DW_TAG_auto_variable ] [e] [line 24]
 !52 = metadata !{i32 24, i32 0, metadata !32, null}
-!53 = metadata !{i32 786688, metadata !32, metadata !"p", metadata !7, i32 25, metadata !54, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [p] [line 25]
-!54 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTSN1D7Nested2E"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTSN1D7Nested2E]
+!53 = metadata !{metadata !"0x100\00p\0025\000", metadata !32, metadata !7, metadata !54} ; [ DW_TAG_auto_variable ] [p] [line 25]
+!54 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTSN1D7Nested2E"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTSN1D7Nested2E]
 !55 = metadata !{i32 25, i32 0, metadata !32, null}
-!56 = metadata !{i32 786688, metadata !32, metadata !"t", metadata !7, i32 26, metadata !24, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 26]
+!56 = metadata !{metadata !"0x100\00t\0026\000", metadata !32, metadata !7, metadata !24} ; [ DW_TAG_auto_variable ] [t] [line 26]
 !57 = metadata !{i32 26, i32 0, metadata !32, null}
 !58 = metadata !{i32 27, i32 0, metadata !32, null}
-!59 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!59 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/tu-member-pointer.ll b/test/DebugInfo/tu-member-pointer.ll
index cd37a98..7f25f5a 100644
--- a/test/DebugInfo/tu-member-pointer.ll
+++ b/test/DebugInfo/tu-member-pointer.ll

@@ -16,15 +16,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!10, !11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !5, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !2, metadata !5, metadata !2} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"foo.cpp", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"Foo", i32 1, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTS3Foo"} ; [ DW_TAG_structure_type ] [Foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!4 = metadata !{metadata !"0x13\00Foo\001\000\000\000\004\000", metadata !1, null, null, null, null, null, metadata !"_ZTS3Foo"} ; [ DW_TAG_structure_type ] [Foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !7, i32 4, metadata !8, i32 0, i32 1, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
-!7 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [foo.cpp]
-!8 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9, metadata !"_ZTS3Foo"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!6 = metadata !{metadata !"0x34\00x\00x\00\004\000\001", null, metadata !7, metadata !8, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!7 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [foo.cpp]
+!8 = metadata !{metadata !"0x1f\00\000\000\000\000\000", null, null, metadata !9, metadata !"_ZTS3Foo"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/two-cus-from-same-file.ll b/test/DebugInfo/two-cus-from-same-file.ll
index 2ab82a9..d893319 100644
--- a/test/DebugInfo/two-cus-from-same-file.ll
+++ b/test/DebugInfo/two-cus-from-same-file.ll

@@ -23,44 +23,44 @@
 
 define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !21), !dbg !26
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !22), !dbg !27
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !21, metadata !{metadata !"0x102"}), !dbg !26
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !22, metadata !{metadata !"0x102"}), !dbg !27
   %puts = tail call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @str1, i32 0, i32 0)), !dbg !28
   tail call void @foo() nounwind, !dbg !30
   ret i32 0, !dbg !31
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0, !9}
 !llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786449, metadata !32, i32 12, metadata !"clang version 3.2 (trunk 156513)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 156513)\001\00\000\00\001", metadata !32, metadata !1, metadata !1, metadata !3, metadata !1, metadata !1} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !32, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @foo, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\001\000\006\00256\001\005", metadata !32, metadata !6, metadata !7, null, void ()* @foo, null, null, metadata !1} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !32} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
-!9 = metadata !{i32 786449, metadata !32, i32 12, metadata !"clang version 3.2 (trunk 156513)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !10, metadata !1, metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
+!9 = metadata !{metadata !"0x11\0012\00clang version 3.2 (trunk 156513)\001\00\000\00\001", metadata !32, metadata !1, metadata !1, metadata !10, metadata !1, metadata !1} ; [ DW_TAG_compile_unit ]
 !10 = metadata !{metadata !12}
-!12 = metadata !{i32 786478, metadata !32, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 11, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !19, i32 11} ; [ DW_TAG_subprogram ]
-!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x2e\00main\00main\00\0011\000\001\000\006\00256\001\0011", metadata !32, metadata !6, metadata !13, null, i32 (i32, i8**)* @main, null, null, metadata !19} ; [ DW_TAG_subprogram ]
+!13 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !14, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{metadata !15, metadata !15, metadata !16}
-!15 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!16 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !17} ; [ DW_TAG_pointer_type ]
-!17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !18} ; [ DW_TAG_pointer_type ]
-!18 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!15 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!16 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !17} ; [ DW_TAG_pointer_type ]
+!17 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, null, metadata !18} ; [ DW_TAG_pointer_type ]
+!18 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
 !19 = metadata !{metadata !21, metadata !22}
-!21 = metadata !{i32 786689, metadata !12, metadata !"argc", metadata !6, i32 16777227, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{i32 786689, metadata !12, metadata !"argv", metadata !6, i32 33554443, metadata !16, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{metadata !"0x101\00argc\0016777227\000", metadata !12, metadata !6, metadata !15} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{metadata !"0x101\00argv\0033554443\000", metadata !12, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ]
 !23 = metadata !{i32 6, i32 3, metadata !24, null}
-!24 = metadata !{i32 786443, metadata !32, metadata !5, i32 5, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
+!24 = metadata !{metadata !"0xb\005\0016\000", metadata !32, metadata !5} ; [ DW_TAG_lexical_block ]
 !25 = metadata !{i32 7, i32 1, metadata !24, null}
 !26 = metadata !{i32 11, i32 14, metadata !12, null}
 !27 = metadata !{i32 11, i32 26, metadata !12, null}
 !28 = metadata !{i32 12, i32 3, metadata !29, null}
-!29 = metadata !{i32 786443, metadata !32, metadata !12, i32 11, i32 34, i32 0} ; [ DW_TAG_lexical_block ]
+!29 = metadata !{metadata !"0xb\0011\0034\000", metadata !32, metadata !12} ; [ DW_TAG_lexical_block ]
 !30 = metadata !{i32 13, i32 3, metadata !29, null}
 !31 = metadata !{i32 14, i32 3, metadata !29, null}
 !32 = metadata !{metadata !"foo.c", metadata !"/tmp"}
@@ -70,4 +70,4 @@
 ; CHECK: {{DW_TAG_compile_unit}}
 ; CHECK: {{foo\.c}}
 
-!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/DebugInfo/typedef.ll b/test/DebugInfo/typedef.ll
index 40cecdf..941f5da 100644
--- a/test/DebugInfo/typedef.ll
+++ b/test/DebugInfo/typedef.ll

@@ -18,15 +18,15 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/typedef.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/typedef.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"typedef.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786484, i32 0, null, metadata !"y", metadata !"y", metadata !"", metadata !5, i32 2, metadata !6, i32 0, i32 1, i8** @y, null} ; [ DW_TAG_variable ] [y] [line 2] [def]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/typedef.cpp]
-!6 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from x]
-!7 = metadata !{i32 786454, metadata !1, null, metadata !"x", i32 1, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [x] [line 1, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x34\00y\00y\00\002\000\001", null, metadata !5, metadata !6, i8** @y, null} ; [ DW_TAG_variable ] [y] [line 2] [def]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/dbginfo/typedef.cpp]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from x]
+!7 = metadata !{metadata !"0x16\00x\001\000\000\000\000", metadata !1, null, null} ; [ DW_TAG_typedef ] [x] [line 1, size 0, align 0, offset 0] [from ]
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5.0 "}
 

diff --git a/test/DebugInfo/unconditional-branch.ll b/test/DebugInfo/unconditional-branch.ll
index 6c31375..95f5f9e 100644
--- a/test/DebugInfo/unconditional-branch.ll
+++ b/test/DebugInfo/unconditional-branch.ll

@@ -22,7 +22,7 @@
 entry:
   %i.addr = alloca i32, align 4
   store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !12), !dbg !13
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !12, metadata !{metadata !"0x102"}), !dbg !13
   %0 = load i32* %i.addr, align 4, !dbg !14
   switch i32 %0, label %sw.default [
   ], !dbg !14
@@ -35,7 +35,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -44,21 +44,21 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 (204712)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [D:\work\EPRs\396363/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (204712)\000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [D:\work\EPRs\396363/test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test.c", metadata !"D:\5Cwork\5CEPRs\5C396363"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [D:\work\EPRs\396363/test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [D:\work\EPRs\396363/test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5.0 (204712)"}
-!12 = metadata !{i32 786689, metadata !4, metadata !"i", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 1]
+!12 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [i] [line 1]
 !13 = metadata !{i32 1, i32 0, metadata !4, null}
 !14 = metadata !{i32 2, i32 0, metadata !4, null}
 !15 = metadata !{i32 4, i32 0, metadata !16, null}
-!16 = metadata !{i32 786443, metadata !1, metadata !4, i32 2, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [D:\work\EPRs\396363/test.c]
+!16 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [D:\work\EPRs\396363/test.c]
 !17 = metadata !{i32 6, i32 0, metadata !4, null}

diff --git a/test/DebugInfo/varargs.ll b/test/DebugInfo/varargs.ll
index ddfcd85..1fe598a 100644
--- a/test/DebugInfo/varargs.ll
+++ b/test/DebugInfo/varargs.ll

@@ -27,6 +27,10 @@
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
+; CHECK: DW_TAG_variable
+; CHECK-NOT: DW_TAG
 ; CHECK: DW_TAG_unspecified_parameters
 ;
 ; Variadic C++ member function.
@@ -51,15 +55,15 @@
   %a = alloca %struct.A, align 1
   %fptr = alloca void (i32, ...)*, align 8
   store i32 %c, i32* %1, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !21), !dbg !22
-  call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !23), !dbg !24
-  call void @llvm.dbg.declare(metadata !{void (i32, ...)** %fptr}, metadata !25), !dbg !27
+  call void @llvm.dbg.declare(metadata !{i32* %1}, metadata !21, metadata !{metadata !"0x102"}), !dbg !22
+  call void @llvm.dbg.declare(metadata !{%struct.A* %a}, metadata !23, metadata !{metadata !"0x102"}), !dbg !24
+  call void @llvm.dbg.declare(metadata !{void (i32, ...)** %fptr}, metadata !25, metadata !{metadata !"0x102"}), !dbg !27
   store void (i32, ...)* @_Z1biz, void (i32, ...)** %fptr, align 8, !dbg !27
   ret void, !dbg !28
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind ssp uwtable }
 attributes #1 = { nounwind readnone }
@@ -68,32 +72,31 @@
 !llvm.module.flags = !{!18, !19}
 !llvm.ident = !{!20}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !13, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !13, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp", metadata !"radar/13690847"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"A", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00A\003\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"a", metadata !"a", metadata !"_ZN1A1aEiz", i32 6, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !12, i32 6} ; [ DW_TAG_subprogram ] [line 6] [a]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{null, metadata !9, metadata !10, metadata !11}
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!11 = metadata !{i32 786456}
+!6 = metadata !{metadata !"0x2e\00a\00a\00_ZN1A1aEiz\006\000\000\000\006\00256\000\006", metadata !1, metadata !"_ZTS1A", metadata !7, null, null, null, i32 0, metadata !12} ; [ DW_TAG_subprogram ] [line 6] [a]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9, metadata !10, null}
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!10 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{i32 786468}
 !13 = metadata !{metadata !14}
-!14 = metadata !{i32 786478, metadata !1, metadata !15, metadata !"b", metadata !"b", metadata !"_Z1biz", i32 13, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32, ...)* @_Z1biz, null, null, metadata !2, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [b]
-!15 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp]
-!16 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!17 = metadata !{null, metadata !10, metadata !11}
+!14 = metadata !{metadata !"0x2e\00b\00b\00_Z1biz\0013\000\001\000\006\00256\000\0013", metadata !1, metadata !15, metadata !16, null, void (i32, ...)* @_Z1biz, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 13] [def] [b]
+!15 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenCXX/debug-info-varargs.cpp]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{null, metadata !10, null}
 !18 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !20 = metadata !{metadata !"clang version 3.5 "}
-!21 = metadata !{i32 786689, metadata !14, metadata !"c", metadata !15, i32 16777229, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 13]
+!21 = metadata !{metadata !"0x101\00c\0016777229\000", metadata !14, metadata !15, metadata !10} ; [ DW_TAG_arg_variable ] [c] [line 13]
 !22 = metadata !{i32 13, i32 0, metadata !14, null}
-!23 = metadata !{i32 786688, metadata !14, metadata !"a", metadata !15, i32 16, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 16]
+!23 = metadata !{metadata !"0x100\00a\0016\000", metadata !14, metadata !15, metadata !4} ; [ DW_TAG_auto_variable ] [a] [line 16]
 !24 = metadata !{i32 16, i32 0, metadata !14, null}
-!25 = metadata !{i32 786688, metadata !14, metadata !"fptr", metadata !15, i32 18, metadata !26, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [fptr] [line 18]
-!26 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!25 = metadata !{metadata !"0x100\00fptr\0018\000", metadata !14, metadata !15, metadata !26} ; [ DW_TAG_auto_variable ] [fptr] [line 18]
+!26 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
 !27 = metadata !{i32 18, i32 0, metadata !14, null}
 !28 = metadata !{i32 22, i32 0, metadata !14, null}

diff --git a/test/DebugInfo/version.ll b/test/DebugInfo/version.ll
index 9a201eb..73d62fa 100644
--- a/test/DebugInfo/version.ll
+++ b/test/DebugInfo/version.ll

@@ -18,15 +18,15 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!9, !11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 185475)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 185475)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"CodeGen/dwarf-version.c", metadata !"test"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\006\000\001\000\006\00256\000\006", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 6] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !10 = metadata !{i32 7, i32 0, metadata !4, null}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/ExecutionEngine/2002-12-16-ArgTest.ll b/test/ExecutionEngine/2002-12-16-ArgTest.ll
index 4c03519..eb2fe8c 100644
--- a/test/ExecutionEngine/2002-12-16-ArgTest.ll
+++ b/test/ExecutionEngine/2002-12-16-ArgTest.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 @.LC0 = internal global [10 x i8] c"argc: %d\0A\00"		; <[10 x i8]*> [#uses=1]
 

diff --git a/test/ExecutionEngine/2003-01-04-ArgumentBug.ll b/test/ExecutionEngine/2003-01-04-ArgumentBug.ll
index 3182193..68fdefe 100644
--- a/test/ExecutionEngine/2003-01-04-ArgumentBug.ll
+++ b/test/ExecutionEngine/2003-01-04-ArgumentBug.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 define i32 @foo(i32 %X, i32 %Y, double %A) {
 	%cond212 = fcmp une double %A, 1.000000e+00		; <i1> [#uses=1]

diff --git a/test/ExecutionEngine/2003-01-04-LoopTest.ll b/test/ExecutionEngine/2003-01-04-LoopTest.ll
index 3e27e06..5a0311d 100644
--- a/test/ExecutionEngine/2003-01-04-LoopTest.ll
+++ b/test/ExecutionEngine/2003-01-04-LoopTest.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 define i32 @main() {
 	call i32 @mylog( i32 4 )		; <i32>:1 [#uses=0]

diff --git a/test/ExecutionEngine/2003-01-15-AlignmentTest.ll b/test/ExecutionEngine/2003-01-15-AlignmentTest.ll
index 80e19ba..038d750 100644
--- a/test/ExecutionEngine/2003-01-15-AlignmentTest.ll
+++ b/test/ExecutionEngine/2003-01-15-AlignmentTest.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 define i32 @bar(i8* %X) {
         ; pointer should be 4 byte aligned!

diff --git a/test/ExecutionEngine/2003-05-06-LivenessClobber.ll b/test/ExecutionEngine/2003-05-06-LivenessClobber.ll
index 6f61aa6..576ef7c 100644
--- a/test/ExecutionEngine/2003-05-06-LivenessClobber.ll
+++ b/test/ExecutionEngine/2003-05-06-LivenessClobber.ll

@@ -1,7 +1,6 @@
 ; This testcase should return with an exit code of 1.
 ;
 ; RUN: not %lli %s
-; XFAIL: arm
 
 @test = global i64 0		; <i64*> [#uses=1]
 

diff --git a/test/ExecutionEngine/2003-05-07-ArgumentTest.ll b/test/ExecutionEngine/2003-05-07-ArgumentTest.ll
index 236be18..42db5fe 100644
--- a/test/ExecutionEngine/2003-05-07-ArgumentTest.ll
+++ b/test/ExecutionEngine/2003-05-07-ArgumentTest.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s test
-; XFAIL: arm
 
 declare i32 @puts(i8*)
 

diff --git a/test/ExecutionEngine/2003-08-15-AllocaAssertion.ll b/test/ExecutionEngine/2003-08-15-AllocaAssertion.ll
index 22dd4cc..bee409c 100644
--- a/test/ExecutionEngine/2003-08-15-AllocaAssertion.ll
+++ b/test/ExecutionEngine/2003-08-15-AllocaAssertion.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 ; This testcase failed to work because two variable sized allocas confused the
 ; local register allocator.

diff --git a/test/ExecutionEngine/2003-08-21-EnvironmentTest.ll b/test/ExecutionEngine/2003-08-21-EnvironmentTest.ll
index 60dc3d6..63303fc 100644
--- a/test/ExecutionEngine/2003-08-21-EnvironmentTest.ll
+++ b/test/ExecutionEngine/2003-08-21-EnvironmentTest.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 ;
 ; Regression Test: EnvironmentTest.ll

diff --git a/test/ExecutionEngine/2003-08-23-RegisterAllocatePhysReg.ll b/test/ExecutionEngine/2003-08-23-RegisterAllocatePhysReg.ll
index 04a5e17..8fb1bbb 100644
--- a/test/ExecutionEngine/2003-08-23-RegisterAllocatePhysReg.ll
+++ b/test/ExecutionEngine/2003-08-23-RegisterAllocatePhysReg.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 ; This testcase exposes a bug in the local register allocator where it runs out
 ; of registers (due to too many overlapping live ranges), but then attempts to

diff --git a/test/ExecutionEngine/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll b/test/ExecutionEngine/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
index 6e48c60..6513540 100644
--- a/test/ExecutionEngine/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
+++ b/test/ExecutionEngine/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 @A = global i32 0		; <i32*> [#uses=1]
 

diff --git a/test/ExecutionEngine/2005-12-02-TailCallBug.ll b/test/ExecutionEngine/2005-12-02-TailCallBug.ll
index 8523b5e..2ac8ad1 100644
--- a/test/ExecutionEngine/2005-12-02-TailCallBug.ll
+++ b/test/ExecutionEngine/2005-12-02-TailCallBug.ll

@@ -1,6 +1,5 @@
 ; PR672
 ; RUN: %lli %s
-; XFAIL: arm
 
 define i32 @main() {
 	%f = bitcast i32 (i32, i32*, i32)* @check_tail to i32*		; <i32*> [#uses=1]

diff --git a/test/ExecutionEngine/Interpreter/intrinsics.ll b/test/ExecutionEngine/Interpreter/intrinsics.ll
new file mode 100644
index 0000000..49d0bbe
--- /dev/null
+++ b/test/ExecutionEngine/Interpreter/intrinsics.ll

@@ -0,0 +1,35 @@
+; RUN: lli -O0 -force-interpreter < %s
+
+; libffi does not support fp128 so we don’t test it
+declare float  @llvm.sin.f32(float)
+declare double @llvm.sin.f64(double)
+declare float  @llvm.cos.f32(float)
+declare double @llvm.cos.f64(double)
+declare float  @llvm.floor.f32(float)
+declare double @llvm.floor.f64(double)
+declare float  @llvm.ceil.f32(float)
+declare double @llvm.ceil.f64(double)
+declare float  @llvm.trunc.f32(float)
+declare double @llvm.trunc.f64(double)
+declare float  @llvm.round.f32(float)
+declare double @llvm.round.f64(double)
+declare float  @llvm.copysign.f32(float, float)
+declare double @llvm.copysign.f64(double, double)
+
+define i32 @main() {
+  %sin32 = call float @llvm.sin.f32(float 0.000000e+00)
+  %sin64 = call double @llvm.sin.f64(double 0.000000e+00)
+  %cos32 = call float @llvm.cos.f32(float 0.000000e+00)
+  %cos64 = call double @llvm.cos.f64(double 0.000000e+00)
+  %floor32 = call float @llvm.floor.f32(float 0.000000e+00)
+  %floor64 = call double @llvm.floor.f64(double 0.000000e+00)
+  %ceil32 = call float @llvm.ceil.f32(float 0.000000e+00)
+  %ceil64 = call double @llvm.ceil.f64(double 0.000000e+00)
+  %trunc32 = call float @llvm.trunc.f32(float 0.000000e+00)
+  %trunc64 = call double @llvm.trunc.f64(double 0.000000e+00)
+  %round32 = call float @llvm.round.f32(float 0.000000e+00)
+  %round64 = call double @llvm.round.f64(double 0.000000e+00)
+  %copysign32 = call float @llvm.copysign.f32(float 0.000000e+00, float 0.000000e+00)
+  %copysign64 = call double @llvm.copysign.f64(double 0.000000e+00, double 0.000000e+00)
+  ret i32 0
+}

diff --git a/test/ExecutionEngine/Interpreter/lit.local.cfg b/test/ExecutionEngine/Interpreter/lit.local.cfg
new file mode 100644
index 0000000..8cbaf03
--- /dev/null
+++ b/test/ExecutionEngine/Interpreter/lit.local.cfg

@@ -0,0 +1,3 @@
+# These tests require foreign function calls
+if config.enable_ffi != "ON":
+    config.unsupported = True

diff --git a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
index babd8f6..eb2fe8c 100644
--- a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
+++ b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 @.LC0 = internal global [10 x i8] c"argc: %d\0A\00"		; <[10 x i8]*> [#uses=1]
 

diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
index bbb81b8..68fdefe 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @foo(i32 %X, i32 %Y, double %A) {
 	%cond212 = fcmp une double %A, 1.000000e+00		; <i1> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
index 7574267..5a0311d 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() {
 	call i32 @mylog( i32 4 )		; <i32>:1 [#uses=0]

diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
index 261939a..48576e7 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() {
 ; <label>:0

diff --git a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
index f76f998..ed58e11 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ; We were accidentally inverting the signedness of right shifts.  Whoops.
 

diff --git a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
index 2b83bb9..4960e59 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() {
 	%X = fadd double 0.000000e+00, 1.000000e+00		; <double> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
index d1ca2be..038d750 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @bar(i8* %X) {
         ; pointer should be 4 byte aligned!

diff --git a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
index 20ef0ff..576ef7c 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll

@@ -1,6 +1,6 @@
 ; This testcase should return with an exit code of 1.
 ;
-; RUN: not %lli_mcjit %s
+; RUN: not %lli %s
 
 @test = global i64 0		; <i64*> [#uses=1]
 

diff --git a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
index c7bcc54..42db5fe 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s test
+; RUN: %lli %s test
 
 declare i32 @puts(i8*)
 

diff --git a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
index 0512575..45279ad 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 target datalayout = "e-p:32:32"
 

diff --git a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
index c292a81..4342aa4 100644
--- a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ; Testcase distilled from 256.bzip2.
 

diff --git a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
index c0a83f5..03b66c4 100644
--- a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ; Testcase distilled from 256.bzip2.
 

diff --git a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
index 55ce689..bee409c 100644
--- a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
+++ b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ; This testcase failed to work because two variable sized allocas confused the
 ; local register allocator.

diff --git a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
index 2e99996..63303fc 100644
--- a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ;
 ; Regression Test: EnvironmentTest.ll

diff --git a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
index 659901b..8fb1bbb 100644
--- a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
+++ b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ; This testcase exposes a bug in the local register allocator where it runs out
 ; of registers (due to too many overlapping live ranges), but then attempts to

diff --git a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
index 68e31a7..6513540 100644
--- a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
+++ b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 @A = global i32 0		; <i32*> [#uses=1]
 

diff --git a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
index 0bc0105..6a3c0f2 100644
--- a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
+++ b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll

@@ -1,5 +1,5 @@
 ; PR672
-; RUN: %lli_mcjit %s
+; RUN: %lli %s
 ; XFAIL: mcjit-ia32
 
 define i32 @main() {

diff --git a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
index 43188f2..4183611 100644
--- a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
+++ b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -force-interpreter %s
+; RUN: %lli -force-interpreter %s
 ; PR1836
 
 define i32 @main() {

diff --git a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
index 9897602..349db69 100644
--- a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
+++ b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -force-interpreter=true %s | FileCheck %s
+; RUN: %lli -force-interpreter=true %s | FileCheck %s
 ; CHECK: 1
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"

diff --git a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
index 7ed0e38..8bf03de 100644
--- a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
+++ b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -force-interpreter=true %s > /dev/null
+; RUN: %lli -force-interpreter=true %s > /dev/null
 
 define i32 @main() {
        %a = add i32 0, undef

diff --git a/test/ExecutionEngine/MCJIT/2013-04-04-RelocAddend.ll b/test/ExecutionEngine/MCJIT/2013-04-04-RelocAddend.ll
index 3f402c5..d9ff347 100644
--- a/test/ExecutionEngine/MCJIT/2013-04-04-RelocAddend.ll
+++ b/test/ExecutionEngine/MCJIT/2013-04-04-RelocAddend.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s
+; RUN: %lli %s
 ;
 ; Verify relocations to global symbols with addend work correctly.
 ;

diff --git a/test/ExecutionEngine/MCJIT/cross-module-a.ll b/test/ExecutionEngine/MCJIT/cross-module-a.ll
index fe8d386..5d4e16f 100644
--- a/test/ExecutionEngine/MCJIT/cross-module-a.ll
+++ b/test/ExecutionEngine/MCJIT/cross-module-a.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -extra-module=%p/Inputs/cross-module-b.ll %s > /dev/null
+; RUN: %lli -extra-module=%p/Inputs/cross-module-b.ll %s > /dev/null
 
 declare i32 @FB()
 

diff --git a/test/ExecutionEngine/MCJIT/cross-module-sm-pic-a.ll b/test/ExecutionEngine/MCJIT/cross-module-sm-pic-a.ll
index ee26702..eb41424 100644
--- a/test/ExecutionEngine/MCJIT/cross-module-sm-pic-a.ll
+++ b/test/ExecutionEngine/MCJIT/cross-module-sm-pic-a.ll

@@ -1,5 +1,5 @@
-; RUN: %lli_mcjit -extra-module=%p/Inputs/cross-module-b.ll -relocation-model=pic -code-model=small %s > /dev/null
-; XFAIL: mips, i686, i386, arm
+; RUN: %lli -extra-module=%p/Inputs/cross-module-b.ll -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386
 
 declare i32 @FB()
 

diff --git a/test/ExecutionEngine/MCJIT/eh-lg-pic.ll b/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
index 539c890..bd097f2 100644
--- a/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
+++ b/test/ExecutionEngine/MCJIT/eh-lg-pic.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -relocation-model=pic -code-model=large %s
+; RUN: %lli -relocation-model=pic -code-model=large %s
 ; XFAIL: cygwin, win32, mingw, mips, i686, i386, aarch64, arm
 declare i8* @__cxa_allocate_exception(i64)
 declare void @__cxa_throw(i8*, i8*, i8*)

diff --git a/test/ExecutionEngine/MCJIT/eh-sm-pic.ll b/test/ExecutionEngine/MCJIT/eh-sm-pic.ll
index 00c2bb0..f3e61dc 100644
--- a/test/ExecutionEngine/MCJIT/eh-sm-pic.ll
+++ b/test/ExecutionEngine/MCJIT/eh-sm-pic.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -relocation-model=pic -code-model=small %s
+; RUN: %lli -relocation-model=pic -code-model=small %s
 ; XFAIL: cygwin, win32, mingw, mips, i686, i386, darwin, aarch64, arm
 declare i8* @__cxa_allocate_exception(i64)
 declare void @__cxa_throw(i8*, i8*, i8*)

diff --git a/test/ExecutionEngine/MCJIT/eh.ll b/test/ExecutionEngine/MCJIT/eh.ll
index c213573..aa81bb5 100644
--- a/test/ExecutionEngine/MCJIT/eh.ll
+++ b/test/ExecutionEngine/MCJIT/eh.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s
+; RUN: %lli %s
 ; XFAIL: arm, cygwin, win32, mingw
 declare i8* @__cxa_allocate_exception(i64)
 declare void @__cxa_throw(i8*, i8*, i8*)

diff --git a/test/ExecutionEngine/MCJIT/fpbitcast.ll b/test/ExecutionEngine/MCJIT/fpbitcast.ll
index ea39617..e6d06f8 100644
--- a/test/ExecutionEngine/MCJIT/fpbitcast.ll
+++ b/test/ExecutionEngine/MCJIT/fpbitcast.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -force-interpreter=true %s | FileCheck %s
+; RUN: %lli -force-interpreter=true %s | FileCheck %s
 ; CHECK: 40091eb8
 
 define i32 @test(double %x) {

diff --git a/test/ExecutionEngine/MCJIT/hello-sm-pic.ll b/test/ExecutionEngine/MCJIT/hello-sm-pic.ll
index 115846c..7db60f7 100644
--- a/test/ExecutionEngine/MCJIT/hello-sm-pic.ll
+++ b/test/ExecutionEngine/MCJIT/hello-sm-pic.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -relocation-model=pic -code-model=small %s > /dev/null
+; RUN: %lli -relocation-model=pic -code-model=small %s > /dev/null
 ; XFAIL: mips, i686, i386, darwin, aarch64, arm
 
 @.LC0 = internal global [12 x i8] c"Hello World\00"		; <[12 x i8]*> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/hello.ll b/test/ExecutionEngine/MCJIT/hello.ll
index b744707..47e36a5 100644
--- a/test/ExecutionEngine/MCJIT/hello.ll
+++ b/test/ExecutionEngine/MCJIT/hello.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 @.LC0 = internal global [12 x i8] c"Hello World\00"		; <[12 x i8]*> [#uses=1]
 

diff --git a/test/ExecutionEngine/MCJIT/hello2.ll b/test/ExecutionEngine/MCJIT/hello2.ll
index cd033d5..13b2588 100644
--- a/test/ExecutionEngine/MCJIT/hello2.ll
+++ b/test/ExecutionEngine/MCJIT/hello2.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 @X = global i32 7		; <i32*> [#uses=0]
 @msg = internal global [13 x i8] c"Hello World\0A\00"		; <[13 x i8]*> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/load-object-a.ll b/test/ExecutionEngine/MCJIT/load-object-a.ll
index 9d27e41..080bf6c 100644
--- a/test/ExecutionEngine/MCJIT/load-object-a.ll
+++ b/test/ExecutionEngine/MCJIT/load-object-a.ll

@@ -1,20 +1,20 @@
 ; This first line will generate the .o files for the next run line
 ; RUN: rm -rf %t.cachedir %t.cachedir2 %t.cachedir3
 ; RUN: mkdir -p %t.cachedir %t.cachedir2 %t.cachedir3
-; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -enable-cache-manager -object-cache-dir=%t.cachedir %s
+; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -enable-cache-manager -object-cache-dir=%t.cachedir %s
 
 ; Collect generated objects.
 ; RUN: find %t.cachedir -type f -name 'multi-module-?.o' -exec mv -v '{}' %t.cachedir2 ';'
 
 ; This line tests MCJIT object loading
-; RUN: %lli_mcjit -extra-object=%t.cachedir2/multi-module-b.o -extra-object=%t.cachedir2/multi-module-c.o %s
+; RUN: %lli -extra-object=%t.cachedir2/multi-module-b.o -extra-object=%t.cachedir2/multi-module-c.o %s
 
 ; These lines put the object files into an archive
 ; RUN: llvm-ar r %t.cachedir3/load-object.a %t.cachedir2/multi-module-b.o
 ; RUN: llvm-ar r %t.cachedir3/load-object.a %t.cachedir2/multi-module-c.o
 
 ; This line test MCJIT archive loading
-; RUN: %lli_mcjit -extra-archive=%t.cachedir3/load-object.a %s
+; RUN: %lli -extra-archive=%t.cachedir3/load-object.a %s
 
 declare i32 @FB()
 

diff --git a/test/ExecutionEngine/MCJIT/multi-module-a.ll b/test/ExecutionEngine/MCJIT/multi-module-a.ll
index 8848ca6..dc3154c 100644
--- a/test/ExecutionEngine/MCJIT/multi-module-a.ll
+++ b/test/ExecutionEngine/MCJIT/multi-module-a.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll %s > /dev/null
+; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll %s > /dev/null
 
 declare i32 @FB()
 

diff --git a/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll b/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll
index 66fafc9..10cfdcd 100644
--- a/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll
+++ b/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-eh-b.ll %s
+; RUN: %lli -extra-module=%p/Inputs/multi-module-eh-b.ll %s
 ; XFAIL: arm, cygwin, win32, mingw
 declare i8* @__cxa_allocate_exception(i64)
 declare void @__cxa_throw(i8*, i8*, i8*)

diff --git a/test/ExecutionEngine/MCJIT/multi-module-sm-pic-a.ll b/test/ExecutionEngine/MCJIT/multi-module-sm-pic-a.ll
index f2fa59f..01faecc 100644
--- a/test/ExecutionEngine/MCJIT/multi-module-sm-pic-a.ll
+++ b/test/ExecutionEngine/MCJIT/multi-module-sm-pic-a.ll

@@ -1,5 +1,5 @@
-; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -relocation-model=pic -code-model=small %s > /dev/null
-; XFAIL: mips, i686, i386, arm
+; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386
 
 declare i32 @FB()
 

diff --git a/test/ExecutionEngine/MCJIT/non-extern-addend-smallcodemodel.ll b/test/ExecutionEngine/MCJIT/non-extern-addend-smallcodemodel.ll
index 21db67d..03de30a 100644
--- a/test/ExecutionEngine/MCJIT/non-extern-addend-smallcodemodel.ll
+++ b/test/ExecutionEngine/MCJIT/non-extern-addend-smallcodemodel.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -code-model=small %s > /dev/null
+; RUN: %lli -code-model=small %s > /dev/null
 ; XFAIL: mips
 ;
 ; FIXME: Merge this file with non-extern-addend.ll once AArch64 supports PC-rel

diff --git a/test/ExecutionEngine/MCJIT/non-extern-addend.ll b/test/ExecutionEngine/MCJIT/non-extern-addend.ll
index e0827f6..72e67ae 100644
--- a/test/ExecutionEngine/MCJIT/non-extern-addend.ll
+++ b/test/ExecutionEngine/MCJIT/non-extern-addend.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @foo(i32 %x, i32 %y, double %d) {
 entry:

diff --git a/test/ExecutionEngine/MCJIT/pr13727.ll b/test/ExecutionEngine/MCJIT/pr13727.ll
index 1c719c5..6f5ae39 100644
--- a/test/ExecutionEngine/MCJIT/pr13727.ll
+++ b/test/ExecutionEngine/MCJIT/pr13727.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -O0 -disable-lazy-compilation=false %s
+; RUN: %lli -O0 -disable-lazy-compilation=false %s
 
 ; The intention of this test is to verify that symbols mapped to COMMON in ELF
 ; work as expected.

diff --git a/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
index b540bfa..c315723 100644
--- a/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
+++ b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+; RUN: %lli -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
 
 declare i32 @FB()
 

diff --git a/test/ExecutionEngine/MCJIT/remote/cross-module-sm-pic-a.ll b/test/ExecutionEngine/MCJIT/remote/cross-module-sm-pic-a.ll
index 589ba2f..d47fc6c 100644
--- a/test/ExecutionEngine/MCJIT/remote/cross-module-sm-pic-a.ll
+++ b/test/ExecutionEngine/MCJIT/remote/cross-module-sm-pic-a.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext -relocation-model=pic -code-model=small %s > /dev/null
+; RUN: %lli -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext -relocation-model=pic -code-model=small %s > /dev/null
 ; XFAIL: mips, i686, i386, arm
 
 declare i32 @FB()

diff --git a/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
index fbbb8bd..0fd363b 100644
--- a/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
+++ b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
 
 declare i32 @FB()
 

diff --git a/test/ExecutionEngine/MCJIT/remote/multi-module-sm-pic-a.ll b/test/ExecutionEngine/MCJIT/remote/multi-module-sm-pic-a.ll
index 9c23169..d248c4b 100644
--- a/test/ExecutionEngine/MCJIT/remote/multi-module-sm-pic-a.ll
+++ b/test/ExecutionEngine/MCJIT/remote/multi-module-sm-pic-a.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext -relocation-model=pic -code-model=small %s > /dev/null
+; RUN: %lli -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target%exeext -relocation-model=pic -code-model=small %s > /dev/null
 ; XFAIL: mips, i686, i386, arm
 
 declare i32 @FB()

diff --git a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
index 6c8ab3d..30b4dd8 100644
--- a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
 
 define i32 @bar() nounwind {
 	ret i32 0

diff --git a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
index 48b939b..da4ddc6 100644
--- a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
+; RUN: %lli -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
 ; XFAIL: *
 ; This test should fail until remote symbol resolution is supported.
 

diff --git a/test/ExecutionEngine/MCJIT/remote/stubs-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/stubs-sm-pic.ll
index 88faf21..f6a1607 100644
--- a/test/ExecutionEngine/MCJIT/remote/stubs-sm-pic.ll
+++ b/test/ExecutionEngine/MCJIT/remote/stubs-sm-pic.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -disable-lazy-compilation=false -relocation-model=pic -code-model=small %s
+; RUN: %lli -remote-mcjit -disable-lazy-compilation=false -relocation-model=pic -code-model=small %s
 ; XFAIL: *
 ; This function should fail until remote symbol resolution is supported.
 

diff --git a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
index e07178e..0f58710 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
+; RUN: %lli -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target%exeext %s
 
 ; The intention of this test is to verify that symbols mapped to COMMON in ELF
 ; work as expected.

diff --git a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll
index 129350b..435c21a 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll

@@ -1,4 +1,4 @@
-; RUN:  %lli_mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s
+; RUN:  %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s
 
 ; Check that a variable is always aligned as specified.
 

diff --git a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll
index 8eec0f2..9d11415 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
 
 define double @test(double* %DP, double %Arg) nounwind {
 	%D = load double* %DP		; <double> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll
index 9fbaeb7..40b514f 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
+; RUN: %lli -remote-mcjit -mcjit-remote-process=lli-child-target%exeext %s > /dev/null
 
 @count = global i32 1, align 4
 

diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll
index 29ab24b..5119b72 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -relocation-model=pic -code-model=small %s > /dev/null
+; RUN: %lli -remote-mcjit -relocation-model=pic -code-model=small %s > /dev/null
 ; XFAIL: mips, aarch64, arm, i686, i386
 
 @count = global i32 1, align 4

diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll
index d62631f..ba3ffff 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s
+; RUN: %lli -remote-mcjit -O0 -mcjit-remote-process=lli-child-target%exeext %s
 
 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
 @ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4

diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll
index bad026f..bbc71af 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -O0 -relocation-model=pic -code-model=small %s
+; RUN: %lli -remote-mcjit -O0 -relocation-model=pic -code-model=small %s
 ; XFAIL: mips, aarch64, arm, i686, i386
 
 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1

diff --git a/test/ExecutionEngine/MCJIT/simplesttest.ll b/test/ExecutionEngine/MCJIT/simplesttest.ll
index 318baf4..85c1715 100644
--- a/test/ExecutionEngine/MCJIT/simplesttest.ll
+++ b/test/ExecutionEngine/MCJIT/simplesttest.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() {
 	ret i32 0

diff --git a/test/ExecutionEngine/MCJIT/simpletest.ll b/test/ExecutionEngine/MCJIT/simpletest.ll
index 5b0f2dd..167a0fd 100644
--- a/test/ExecutionEngine/MCJIT/simpletest.ll
+++ b/test/ExecutionEngine/MCJIT/simpletest.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @bar() {
 	ret i32 0

diff --git a/test/ExecutionEngine/MCJIT/stubs-sm-pic.ll b/test/ExecutionEngine/MCJIT/stubs-sm-pic.ll
index 9e214f5..9b83ed2 100644
--- a/test/ExecutionEngine/MCJIT/stubs-sm-pic.ll
+++ b/test/ExecutionEngine/MCJIT/stubs-sm-pic.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -disable-lazy-compilation=false -relocation-model=pic -code-model=small %s
+; RUN: %lli -disable-lazy-compilation=false -relocation-model=pic -code-model=small %s
 ; XFAIL: mips, i686, i386, aarch64, arm
 
 define i32 @main() nounwind {

diff --git a/test/ExecutionEngine/MCJIT/stubs.ll b/test/ExecutionEngine/MCJIT/stubs.ll
index f4aac33..b7d922f 100644
--- a/test/ExecutionEngine/MCJIT/stubs.ll
+++ b/test/ExecutionEngine/MCJIT/stubs.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -disable-lazy-compilation=false %s
+; RUN: %lli -disable-lazy-compilation=false %s
 
 define i32 @main() nounwind {
 entry:

diff --git a/test/ExecutionEngine/MCJIT/test-arith.ll b/test/ExecutionEngine/MCJIT/test-arith.ll
index e1cc23b..79f989f 100644
--- a/test/ExecutionEngine/MCJIT/test-arith.ll
+++ b/test/ExecutionEngine/MCJIT/test-arith.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() {
 	%A = add i8 0, 12		; <i8> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/test-branch.ll b/test/ExecutionEngine/MCJIT/test-branch.ll
index cdf1035..3ae55d0 100644
--- a/test/ExecutionEngine/MCJIT/test-branch.ll
+++ b/test/ExecutionEngine/MCJIT/test-branch.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ; test unconditional branch
 define i32 @main() {

diff --git a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
index 8a36cf2..c3cb931 100644
--- a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
+++ b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @_Z14func_exit_codev() nounwind uwtable {
 entry:

diff --git a/test/ExecutionEngine/MCJIT/test-call.ll b/test/ExecutionEngine/MCJIT/test-call.ll
index 1a0f008..313a6c5 100644
--- a/test/ExecutionEngine/MCJIT/test-call.ll
+++ b/test/ExecutionEngine/MCJIT/test-call.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 declare void @exit(i32)
 

diff --git a/test/ExecutionEngine/MCJIT/test-cast.ll b/test/ExecutionEngine/MCJIT/test-cast.ll
index 335ec50..667fa80 100644
--- a/test/ExecutionEngine/MCJIT/test-cast.ll
+++ b/test/ExecutionEngine/MCJIT/test-cast.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @foo() {
 	ret i32 0

diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll
index 989a473..a425b5c 100644
--- a/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll
+++ b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -O0 %s
+; RUN: %lli -O0 %s
 
 ; This test checks that common symbols have been allocated addresses honouring
 ; the alignment requirement.

diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols.ll b/test/ExecutionEngine/MCJIT/test-common-symbols.ll
index 13ee06a..19e2ce5 100644
--- a/test/ExecutionEngine/MCJIT/test-common-symbols.ll
+++ b/test/ExecutionEngine/MCJIT/test-common-symbols.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -O0 -disable-lazy-compilation=false %s
+; RUN: %lli -O0 -disable-lazy-compilation=false %s
 
 ; The intention of this test is to verify that symbols mapped to COMMON in ELF
 ; work as expected.

diff --git a/test/ExecutionEngine/MCJIT/test-constantexpr.ll b/test/ExecutionEngine/MCJIT/test-constantexpr.ll
index 8f15cbd..d01479a 100644
--- a/test/ExecutionEngine/MCJIT/test-constantexpr.ll
+++ b/test/ExecutionEngine/MCJIT/test-constantexpr.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ; This tests to make sure that we can evaluate weird constant expressions
 

diff --git a/test/ExecutionEngine/MCJIT/test-data-align.ll b/test/ExecutionEngine/MCJIT/test-data-align.ll
index 2472d95..f21ea2e 100644
--- a/test/ExecutionEngine/MCJIT/test-data-align.ll
+++ b/test/ExecutionEngine/MCJIT/test-data-align.ll

@@ -1,4 +1,4 @@
-; RUN:  %lli_mcjit -O0 %s
+; RUN:  %lli -O0 %s
 
 ; Check that a variable is always aligned as specified.
 

diff --git a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
index f094f3d..adb0550 100644
--- a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
+++ b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define double @test(double* %DP, double %Arg) {
 	%D = load double* %DP		; <double> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/test-fp.ll b/test/ExecutionEngine/MCJIT/test-fp.ll
index b10e9d6..2bf0210 100644
--- a/test/ExecutionEngine/MCJIT/test-fp.ll
+++ b/test/ExecutionEngine/MCJIT/test-fp.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define double @test(double* %DP, double %Arg) {
 	%D = load double* %DP		; <double> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/test-global-ctors.ll b/test/ExecutionEngine/MCJIT/test-global-ctors.ll
index 947d8f5..ec87d15 100644
--- a/test/ExecutionEngine/MCJIT/test-global-ctors.ll
+++ b/test/ExecutionEngine/MCJIT/test-global-ctors.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 ; XFAIL: darwin
 @var = global i32 1, align 4
 @llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @ctor_func }]

diff --git a/test/ExecutionEngine/MCJIT/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/MCJIT/test-global-init-nonzero-sm-pic.ll
index eb031f2..26bd838 100644
--- a/test/ExecutionEngine/MCJIT/test-global-init-nonzero-sm-pic.ll
+++ b/test/ExecutionEngine/MCJIT/test-global-init-nonzero-sm-pic.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -relocation-model=pic -code-model=small %s > /dev/null
+; RUN: %lli -relocation-model=pic -code-model=small %s > /dev/null
 ; XFAIL: mips, aarch64, arm, i686, i386
 
 @count = global i32 1, align 4

diff --git a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
index b9f74b8..3877e9a 100644
--- a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
+++ b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 @count = global i32 1, align 4
 

diff --git a/test/ExecutionEngine/MCJIT/test-global.ll b/test/ExecutionEngine/MCJIT/test-global.ll
index 6a8c042..69e5455 100644
--- a/test/ExecutionEngine/MCJIT/test-global.ll
+++ b/test/ExecutionEngine/MCJIT/test-global.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 @count = global i32 0, align 4
 

diff --git a/test/ExecutionEngine/MCJIT/test-loadstore.ll b/test/ExecutionEngine/MCJIT/test-loadstore.ll
index 9038194..1797599 100644
--- a/test/ExecutionEngine/MCJIT/test-loadstore.ll
+++ b/test/ExecutionEngine/MCJIT/test-loadstore.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define void @test(i8* %P, i16* %P.upgrd.1, i32* %P.upgrd.2, i64* %P.upgrd.3) {
 	%V = load i8* %P		; <i8> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/test-local.ll b/test/ExecutionEngine/MCJIT/test-local.ll
index d7c1734..ec5ba16 100644
--- a/test/ExecutionEngine/MCJIT/test-local.ll
+++ b/test/ExecutionEngine/MCJIT/test-local.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() nounwind uwtable {
 entry:

diff --git a/test/ExecutionEngine/MCJIT/test-logical.ll b/test/ExecutionEngine/MCJIT/test-logical.ll
index a03833e..05b381b 100644
--- a/test/ExecutionEngine/MCJIT/test-logical.ll
+++ b/test/ExecutionEngine/MCJIT/test-logical.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() {
 	%A = and i8 4, 8		; <i8> [#uses=2]

diff --git a/test/ExecutionEngine/MCJIT/test-loop.ll b/test/ExecutionEngine/MCJIT/test-loop.ll
index 5ed8c40..e951a14 100644
--- a/test/ExecutionEngine/MCJIT/test-loop.ll
+++ b/test/ExecutionEngine/MCJIT/test-loop.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() {
 ; <label>:0

diff --git a/test/ExecutionEngine/MCJIT/test-phi.ll b/test/ExecutionEngine/MCJIT/test-phi.ll
index 4245cca..c5bdfd5 100644
--- a/test/ExecutionEngine/MCJIT/test-phi.ll
+++ b/test/ExecutionEngine/MCJIT/test-phi.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ; test phi node
 @Y = global i32 6		; <i32*> [#uses=1]

diff --git a/test/ExecutionEngine/MCJIT/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/MCJIT/test-ptr-reloc-sm-pic.ll
index 9e06742..21bcaef 100644
--- a/test/ExecutionEngine/MCJIT/test-ptr-reloc-sm-pic.ll
+++ b/test/ExecutionEngine/MCJIT/test-ptr-reloc-sm-pic.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -O0 -relocation-model=pic -code-model=small %s
+; RUN: %lli -O0 -relocation-model=pic -code-model=small %s
 ; XFAIL: mips, aarch64, arm, i686, i386
 
 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1

diff --git a/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll
index 871d8bf..f139ddf 100644
--- a/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll
+++ b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -O0 %s
+; RUN: %lli -O0 %s
 
 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
 @ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4

diff --git a/test/ExecutionEngine/MCJIT/test-ret.ll b/test/ExecutionEngine/MCJIT/test-ret.ll
index 6bfc480..025f53e 100644
--- a/test/ExecutionEngine/MCJIT/test-ret.ll
+++ b/test/ExecutionEngine/MCJIT/test-ret.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 ; test return instructions
 define void @test1() {

diff --git a/test/ExecutionEngine/MCJIT/test-return.ll b/test/ExecutionEngine/MCJIT/test-return.ll
index 4db1c3f..d464a4b 100644
--- a/test/ExecutionEngine/MCJIT/test-return.ll
+++ b/test/ExecutionEngine/MCJIT/test-return.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() nounwind uwtable {
 entry:

diff --git a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
index b4367d0..68276e6 100644
--- a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
+++ b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 
 define i32 @main() {

diff --git a/test/ExecutionEngine/MCJIT/test-setcond-int.ll b/test/ExecutionEngine/MCJIT/test-setcond-int.ll
index 8c7d815..48dc021 100644
--- a/test/ExecutionEngine/MCJIT/test-setcond-int.ll
+++ b/test/ExecutionEngine/MCJIT/test-setcond-int.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() {
 	%int1 = add i32 0, 0		; <i32> [#uses=6]

diff --git a/test/ExecutionEngine/MCJIT/test-shift.ll b/test/ExecutionEngine/MCJIT/test-shift.ll
index 8d9a94e..590e262 100644
--- a/test/ExecutionEngine/MCJIT/test-shift.ll
+++ b/test/ExecutionEngine/MCJIT/test-shift.ll

@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit %s > /dev/null
+; RUN: %lli %s > /dev/null
 
 define i32 @main() {
 	%shamt = add i8 0, 1		; <i8> [#uses=8]

diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/MachO_ARM64_relocations.s b/test/ExecutionEngine/RuntimeDyld/AArch64/MachO_ARM64_relocations.s
new file mode 100644
index 0000000..04d269e
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/AArch64/MachO_ARM64_relocations.s

@@ -0,0 +1,67 @@
+# RUN: llvm-mc -triple=arm64-apple-ios7.0.0 -code-model=small -relocation-model=pic -filetype=obj -o %T/foo.o %s
+# RUN: llvm-rtdyld -triple=arm64-apple-ios7.0.0 -map-section foo.o,__text=0x10bc0 -verify -check=%s %/T/foo.o
+
+    .section  __TEXT,__text,regular,pure_instructions
+    .ios_version_min 7, 0
+    .globl  _foo
+    .align  2
+_foo:
+    movz  w0, #0
+    ret
+
+    .globl  _test_branch_reloc
+    .align  2
+
+
+# Test ARM64_RELOC_BRANCH26 relocation. The branch instruction only encodes 26
+# bits of the 28-bit possible branch range. The lower two bits are always zero
+# and therefore ignored.
+# rtdyld-check:  decode_operand(br1, 0)[25:0] = (_foo - br1)[27:2]
+_test_branch_reloc:
+br1:
+    b _foo
+    ret
+
+
+# Test ARM64_RELOC_PAGE21 and ARM64_RELOC_PAGEOFF12 relocation. adrp encodes
+# the PC-relative page (4 KiB) difference between the adrp instruction and the
+# variable ptr. ldr encodes the offset of the variable within the page. The ldr
+# instruction performs an implicit shift on the encoded immediate (imm<<3).
+# rtdyld-check:  decode_operand(adrp1, 1) = (_ptr[32:12] - adrp1[32:12])
+# rtdyld-check:  decode_operand(ldr1, 2) = _ptr[11:3]
+    .globl  _test_adrp_ldr
+    .align  2
+_test_adrp_ldr:
+adrp1:
+    adrp x0, _ptr@PAGE
+ldr1:
+    ldr  x0, [x0, _ptr@PAGEOFF]
+    ret
+
+# Test ARM64_RELOC_GOT_LOAD_PAGE21 and ARM64_RELOC_GOT_LOAD_PAGEOFF12
+# relocation. adrp encodes the PC-relative page (4 KiB) difference between the
+# adrp instruction and the GOT entry for ptr. ldr encodes the offset of the GOT
+# entry within the page. The ldr instruction performs an implicit shift on the
+# encoded immediate (imm<<3).
+# rtdyld-check:  *{8}(stub_addr(foo.o, __text, _ptr)) = _ptr
+# rtdyld-check:  decode_operand(adrp2, 1) = (stub_addr(foo.o, __text, _ptr)[32:12] - adrp2[32:12])
+# rtdyld-check:  decode_operand(ldr2, 2) = stub_addr(foo.o, __text, _ptr)[11:3]
+    .globl  _test_got_adrp_ldr
+    .align  2
+_test_got_adrp_ldr:
+adrp2:
+    adrp x0, _ptr@GOTPAGE
+ldr2:
+    ldr  x0, [x0, _ptr@GOTPAGEOFF]
+    ret
+
+
+# Test ARM64_RELOC_UNSIGNED relocation. The absolute 64-bit address of the
+# function should be stored at the 8-byte memory location.
+# rtdyld-check: *{8}_ptr = _foo
+    .section  __DATA,__data
+    .globl  _ptr
+    .align  3
+    .fill 4096, 1, 0
+_ptr:
+    .quad _foo

diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/lit.local.cfg b/test/ExecutionEngine/RuntimeDyld/AArch64/lit.local.cfg
new file mode 100644
index 0000000..cec29af
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/AArch64/lit.local.cfg

@@ -0,0 +1,3 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
+

diff --git a/test/ExecutionEngine/RuntimeDyld/ARM/MachO_ARM_PIC_relocations.s b/test/ExecutionEngine/RuntimeDyld/ARM/MachO_ARM_PIC_relocations.s
new file mode 100644
index 0000000..7ff3a89
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/ARM/MachO_ARM_PIC_relocations.s

@@ -0,0 +1,51 @@
+# RUN: llvm-mc -triple=armv7s-apple-ios7.0.0 -relocation-model=pic -filetype=obj -o %T/foo.o %s
+# RUN: llvm-rtdyld -triple=armv7s-apple-ios7.0.0 -verify -check=%s %/T/foo.o
+
+        .syntax unified
+        .section        __TEXT,__text,regular,pure_instructions
+        .globl  bar
+        .align  2
+bar:
+# Check lower 16-bits of section difference relocation
+# rtdyld-check: decode_operand(insn1, 1) = (foo$non_lazy_ptr-(nextPC+8))[15:0]
+insn1:
+        movw    r0, :lower16:(foo$non_lazy_ptr-(nextPC+8))
+# Check upper 16-bits of section difference relocation
+# rtdyld-check: decode_operand(insn2, 2) = (foo$non_lazy_ptr-(nextPC+8))[31:16]
+insn2:
+        movt    r0, :upper16:(foo$non_lazy_ptr-(nextPC+8))
+nextPC:
+        add     r1, r0, r0
+
+# Check stub generation for external symbols by referencing a common symbol, 'baz'.
+# Check both the content of the stub, and the reference to the stub.
+# Stub should contain '0xe51ff004' (ldr pc, [pc, #-4]), followed by the target.
+#
+# rtdyld-check: *{4}(stub_addr(foo.o, __text, baz)) = 0xe51ff004
+# rtdyld-check: *{4}(stub_addr(foo.o, __text, baz) + 4) = baz
+#
+# rtdyld-check: decode_operand(insn3, 0) = stub_addr(foo.o, __text, baz) - (insn3 + 8)
+insn3:
+        bl      baz
+
+# Check stub generation for internal symbols by referencing 'bar'.
+# rtdyld-check: *{4}(stub_addr(foo.o, __text, bar) + 4) = bar
+insn4:
+        bl      bar
+        bx	lr
+
+# Add 'aaa' to the common symbols to make sure 'baz' isn't at the start of the
+# section. This ensures that we test VANILLA relocation addends correctly.
+        .comm   aaa, 4, 2
+        .comm   baz, 4, 2
+        .comm   foo, 4, 2
+
+# Check that the symbol pointer section entries are fixed up properly:
+# rtdyld-check: *{4}foo$non_lazy_ptr = foo
+        .section	__DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
+  	.align	2
+foo$non_lazy_ptr:
+	.indirect_symbol	foo
+	.long	0
+
+.subsections_via_symbols

diff --git a/test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg b/test/ExecutionEngine/RuntimeDyld/ARM/lit.local.cfg
similarity index 100%
copy from test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg
copy to test/ExecutionEngine/RuntimeDyld/ARM/lit.local.cfg


diff --git a/test/ExecutionEngine/RuntimeDyld/Inputs/arm_secdiff_reloc.o b/test/ExecutionEngine/RuntimeDyld/Inputs/arm_secdiff_reloc.o
deleted file mode 100644
index 5392266..0000000
--- a/test/ExecutionEngine/RuntimeDyld/Inputs/arm_secdiff_reloc.o
+++ /dev/null
Binary files differ

diff --git a/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_DynNoPIC_relocations.s b/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_DynNoPIC_relocations.s
new file mode 100644
index 0000000..f427b98
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_DynNoPIC_relocations.s

@@ -0,0 +1,45 @@
+# RUN: llvm-mc -triple=i386-apple-macosx10.4 -relocation-model=dynamic-no-pic -filetype=obj -o %T/test_i386.o %s
+# RUN: llvm-rtdyld -triple=i386-apple-macosx10.4 -verify -check=%s %/T/test_i386.o
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.globl	bar
+	.align	4, 0x90
+bar:
+	calll	tmp0$pb
+tmp0$pb:
+	popl	%eax
+# Test section difference relocation to non-lazy ptr section.
+# rtdyld-check: decode_operand(inst1, 4) = x$non_lazy_ptr - tmp0$pb
+inst1:
+	movl	x$non_lazy_ptr-tmp0$pb(%eax), %eax
+        movl    (%eax), %ebx
+
+# Test VANILLA relocation to jump table.
+# rtdyld-check: decode_operand(inst2, 0) = bling$stub - next_pc(inst2)
+inst2:
+        calll	bling$stub
+        addl    %ebx, %eax
+
+# Test scattered VANILLA relocations.
+inst3:
+        movl    y+4, %ecx
+        addl    %ecx, %eax
+	retl
+
+	.section	__IMPORT,__jump_table,symbol_stubs,pure_instructions+self_modifying_code,5
+bling$stub:
+	.indirect_symbol	bling
+	.ascii	"\364\364\364\364\364"
+
+	.section	__IMPORT,__pointers,non_lazy_symbol_pointers
+x$non_lazy_ptr:
+	.indirect_symbol	x
+	.long	0
+
+        .comm   x,4,2
+        .comm   bling,4,2
+
+        .globl	y
+.zerofill __DATA,__common,y,8,3
+
+.subsections_via_symbols

diff --git a/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_eh_frame.s b/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_eh_frame.s
new file mode 100644
index 0000000..8814ec3
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/X86/MachO_i386_eh_frame.s

@@ -0,0 +1,30 @@
+# RUN: llvm-mc -triple=i386-apple-macosx10.4 -relocation-model=dynamic-no-pic -filetype=obj -o %T/MachO_i386_eh_frame.o %s
+# RUN: llvm-rtdyld -triple=i386-apple-macosx10.4 -verify -map-section MachO_i386_eh_frame.o,__text=0x2000 -check=%s %/T/MachO_i386_eh_frame.o
+
+# rtdyld-check: *{4}(section_addr(MachO_i386_eh_frame.o, __eh_frame) + 0x20) = (main - (section_addr(MachO_i386_eh_frame.o, __eh_frame) + 0x20))[31:0]
+# rtdyld-check: *{4}(section_addr(MachO_i386_eh_frame.o, __eh_frame) + 0x24) = 0x9
+
+	.section	__TEXT,__text,regular,pure_instructions
+
+	.globl	bar
+	.align	4, 0x90
+bar:
+        retl
+
+        .globl	main
+	.align	4, 0x90
+main:
+	.cfi_startproc
+	pushl	%ebp
+Ltmp0:
+	.cfi_def_cfa_offset 8
+Ltmp1:
+	.cfi_offset %ebp, -8
+	movl	%esp, %ebp
+Ltmp2:
+	.cfi_def_cfa_register %ebp
+	popl	%ebp
+	jmp	bar
+	.cfi_endproc
+
+.subsections_via_symbols

diff --git a/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s b/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s
index e87b449..502f276 100644
--- a/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s
+++ b/test/ExecutionEngine/RuntimeDyld/X86/MachO_x86-64_PIC_relocations.s

@@ -1,6 +1,5 @@
-# RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -relocation-model=pic -filetype=obj -o %t.o %s
-# RUN: llvm-rtdyld -triple=x86_64-apple-macosx10.9 -verify -check=%s %t.o
-# RUN: rm %t.o
+# RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -relocation-model=pic -filetype=obj -o %T/test_x86-64.o %s
+# RUN: llvm-rtdyld -triple=x86_64-apple-macosx10.9 -verify -check=%s %/T/test_x86-64.o
 
         .section	__TEXT,__text,regular,pure_instructions
 	.globl	foo
@@ -20,9 +19,20 @@
 # rtdyld-check: decode_operand(insn2, 4) = x - next_pc(insn2)
 insn2:
 	movl	x(%rip), %eax
-	movl	$0, %eax
+
+# Test PC-rel GOT relocation.
+# Verify both the contents of the GOT entry for y, and that the movq instruction
+# references the correct GOT entry address:
+# rtdyld-check: *{8}(stub_addr(test_x86-64.o, __text, y)) = y
+# rtdyld-check: decode_operand(insn3, 4) = stub_addr(test_x86-64.o, __text, y) - next_pc(insn3)
+insn3:
+        movq	y@GOTPCREL(%rip), %rax
+
+        movl	$0, %eax
 	retq
 
+        .comm   y,4,2
+
         .section	__DATA,__data
 	.globl	x
 	.align	2

diff --git a/test/ExecutionEngine/RuntimeDyld/macho_relocations.test b/test/ExecutionEngine/RuntimeDyld/macho_relocations.test
deleted file mode 100644
index 92e4dd7..0000000
--- a/test/ExecutionEngine/RuntimeDyld/macho_relocations.test
+++ /dev/null

@@ -1 +0,0 @@
-RUN: llvm-rtdyld -printline %p/Inputs/arm_secdiff_reloc.o

diff --git a/test/ExecutionEngine/frem.ll b/test/ExecutionEngine/frem.ll
new file mode 100644
index 0000000..7e0b606
--- /dev/null
+++ b/test/ExecutionEngine/frem.ll

@@ -0,0 +1,20 @@
+; LLI.exe used to crash on Windows\X86 when certain single precision
+; floating point intrinsics (defined as macros) are used.
+; This unit test guards against the failure.
+;
+; RUN: %lli %s | FileCheck %s
+
+@flt = internal global float 12.0e+0
+@str = internal constant [18 x i8] c"Double value: %f\0A\00"
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+define i32 @main() {
+  %flt = load float* @flt
+  %float2 = frem float %flt, 5.0
+  %double1 = fpext float %float2 to double
+  call i32 (i8*, ...)* @printf(i8* getelementptr ([18 x i8]* @str, i32 0, i64 0), double %double1)
+  ret i32 0
+}
+
+; CHECK: Double value: 2.0

diff --git a/test/ExecutionEngine/hello.ll b/test/ExecutionEngine/hello.ll
index f2c4a7f..47e36a5 100644
--- a/test/ExecutionEngine/hello.ll
+++ b/test/ExecutionEngine/hello.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 @.LC0 = internal global [12 x i8] c"Hello World\00"		; <[12 x i8]*> [#uses=1]
 

diff --git a/test/ExecutionEngine/hello2.ll b/test/ExecutionEngine/hello2.ll
index 155ed41..13b2588 100644
--- a/test/ExecutionEngine/hello2.ll
+++ b/test/ExecutionEngine/hello2.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 @X = global i32 7		; <i32*> [#uses=0]
 @msg = internal global [13 x i8] c"Hello World\0A\00"		; <[13 x i8]*> [#uses=1]

diff --git a/test/ExecutionEngine/mov64zext32.ll b/test/ExecutionEngine/mov64zext32.ll
index f38c21a..a5b2461 100644
--- a/test/ExecutionEngine/mov64zext32.ll
+++ b/test/ExecutionEngine/mov64zext32.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 define i64 @foo() {
   ret i64 42

diff --git a/test/ExecutionEngine/simpletest.ll b/test/ExecutionEngine/simpletest.ll
index 83f9b84..167a0fd 100644
--- a/test/ExecutionEngine/simpletest.ll
+++ b/test/ExecutionEngine/simpletest.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 define i32 @bar() {
 	ret i32 0

diff --git a/test/ExecutionEngine/stubs.ll b/test/ExecutionEngine/stubs.ll
index b40e4be..b7d922f 100644
--- a/test/ExecutionEngine/stubs.ll
+++ b/test/ExecutionEngine/stubs.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli -disable-lazy-compilation=false %s
-; XFAIL: arm
 
 define i32 @main() nounwind {
 entry:

diff --git a/test/ExecutionEngine/test-call-no-external-funcs.ll b/test/ExecutionEngine/test-call-no-external-funcs.ll
index b2dd532..c3cb931 100644
--- a/test/ExecutionEngine/test-call-no-external-funcs.ll
+++ b/test/ExecutionEngine/test-call-no-external-funcs.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 define i32 @_Z14func_exit_codev() nounwind uwtable {
 entry:

diff --git a/test/ExecutionEngine/test-call.ll b/test/ExecutionEngine/test-call.ll
index 3fd39fe..313a6c5 100644
--- a/test/ExecutionEngine/test-call.ll
+++ b/test/ExecutionEngine/test-call.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 declare void @exit(i32)
 

diff --git a/test/ExecutionEngine/test-common-symbols.ll b/test/ExecutionEngine/test-common-symbols.ll
index 4dd9265..19e2ce5 100644
--- a/test/ExecutionEngine/test-common-symbols.ll
+++ b/test/ExecutionEngine/test-common-symbols.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli -O0 -disable-lazy-compilation=false %s
-; XFAIL: arm
 
 ; The intention of this test is to verify that symbols mapped to COMMON in ELF
 ; work as expected.

diff --git a/test/ExecutionEngine/test-fp-no-external-funcs.ll b/test/ExecutionEngine/test-fp-no-external-funcs.ll
index 139b2ef..61b12c2 100644
--- a/test/ExecutionEngine/test-fp-no-external-funcs.ll
+++ b/test/ExecutionEngine/test-fp-no-external-funcs.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli  %s > /dev/null
-; XFAIL: arm
 
 define double @test(double* %DP, double %Arg) {
 	%D = load double* %DP		; <double> [#uses=1]

diff --git a/test/ExecutionEngine/test-fp.ll b/test/ExecutionEngine/test-fp.ll
index c906450..2bf0210 100644
--- a/test/ExecutionEngine/test-fp.ll
+++ b/test/ExecutionEngine/test-fp.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 define double @test(double* %DP, double %Arg) {
 	%D = load double* %DP		; <double> [#uses=1]

diff --git a/test/ExecutionEngine/test-global-init-nonzero.ll b/test/ExecutionEngine/test-global-init-nonzero.ll
index ef2d37b..749a485 100644
--- a/test/ExecutionEngine/test-global-init-nonzero.ll
+++ b/test/ExecutionEngine/test-global-init-nonzero.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli  %s > /dev/null
-; XFAIL: arm
 
 @count = global i32 1, align 4
 

diff --git a/test/ExecutionEngine/test-global.ll b/test/ExecutionEngine/test-global.ll
index 2ea50de..69e5455 100644
--- a/test/ExecutionEngine/test-global.ll
+++ b/test/ExecutionEngine/test-global.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 @count = global i32 0, align 4
 

diff --git a/test/ExecutionEngine/test-loadstore.ll b/test/ExecutionEngine/test-loadstore.ll
index 7574314..1797599 100644
--- a/test/ExecutionEngine/test-loadstore.ll
+++ b/test/ExecutionEngine/test-loadstore.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 define void @test(i8* %P, i16* %P.upgrd.1, i32* %P.upgrd.2, i64* %P.upgrd.3) {
 	%V = load i8* %P		; <i8> [#uses=1]

diff --git a/test/ExecutionEngine/test-local.ll b/test/ExecutionEngine/test-local.ll
index 240b174..ec5ba16 100644
--- a/test/ExecutionEngine/test-local.ll
+++ b/test/ExecutionEngine/test-local.ll

@@ -1,5 +1,4 @@
 ; RUN: %lli %s > /dev/null
-; XFAIL: arm
 
 define i32 @main() nounwind uwtable {
 entry:

diff --git a/test/Feature/aliases.ll b/test/Feature/aliases.ll
index ad1d1b0..c11fc47 100644
--- a/test/Feature/aliases.ll
+++ b/test/Feature/aliases.ll

@@ -21,10 +21,10 @@
 define i32 @foo_f() {
   ret i32 0
 }
-@bar_f = alias weak_odr %FunTy* @foo_f
+@bar_f = weak_odr alias %FunTy* @foo_f
 @bar_ff = alias i32()* @bar_f
 
-@bar_i = alias internal i32* @bar
+@bar_i = internal alias i32* @bar
 
 @A = alias bitcast (i32* @bar to i64*)
 

diff --git a/test/Feature/comdat.ll b/test/Feature/comdat.ll
index 05fb87c..1e878bb 100644
--- a/test/Feature/comdat.ll
+++ b/test/Feature/comdat.ll

@@ -16,3 +16,6 @@
   ret void
 }
 ; CHECK: define void @f() comdat $f
+
+$i = comdat largest
+@i = internal global i32 0, comdat $i

diff --git a/test/Feature/md_on_instruction.ll b/test/Feature/md_on_instruction.ll
index 955ace3..fe01162 100644
--- a/test/Feature/md_on_instruction.ll
+++ b/test/Feature/md_on_instruction.ll

@@ -18,10 +18,10 @@
 
 !llvm.module.flags = !{!6}
 
-!0 = metadata !{i32 458798, i32 0, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", metadata !1, i32 1, metadata !2, i1 false, i1 true}
-!1 = metadata !{i32 458769, metadata !4, i32 12, metadata !"clang 1.0", i1 true, metadata !"", i32 0, metadata !5, metadata !5, metadata !4, null, null, metadata !""}
-!2 = metadata !{i32 458788, null, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\000", i32 0, metadata !1, metadata !2, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x11\0012\00clang 1.0\001\00\000\00\000", metadata !4, metadata !5, metadata !5, metadata !4, null, null} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !1} ; [ DW_TAG_base_type ]
 !3 = metadata !{i32 1, i32 13, metadata !1, metadata !1}
 !4 = metadata !{metadata !"foo.c", metadata !"/tmp"}
 !5 = metadata !{i32 0}
-!6 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!6 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Feature/optnone-llc.ll b/test/Feature/optnone-llc.ll
index 6cb27d0..f19fa88 100644
--- a/test/Feature/optnone-llc.ll
+++ b/test/Feature/optnone-llc.ll

@@ -46,7 +46,7 @@
 ; LLC-Ox-DAG: Skipping pass 'Merge disjoint stack slots'
 ; LLC-Ox-DAG: Skipping pass 'Optimize machine instruction PHIs'
 ; LLC-Ox-DAG: Skipping pass 'Peephole Optimizations'
-; LLC-Ox-DAG: Skipping pass 'Post RA top-down list latency scheduler'
+; LLC-Ox-DAG: Skipping pass 'Post{{.*}}RA{{.*}}{{[Ss]}}cheduler'
 ; LLC-Ox-DAG: Skipping pass 'Remove dead machine instructions'
 ; LLC-Ox-DAG: Skipping pass 'Tail Duplication'
 

diff --git a/test/Feature/weak_constant.ll b/test/Feature/weak_constant.ll
index fba7f12..d331bf5 100644
--- a/test/Feature/weak_constant.ll
+++ b/test/Feature/weak_constant.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts -S > %t
+; RUN: opt < %s -O3 -S > %t
 ; RUN:   grep undef %t | count 1
 ; RUN:   grep 5 %t | count 1
 ; RUN:   grep 7 %t | count 1

diff --git a/test/FileCheck/check-empty.txt b/test/FileCheck/check-empty.txt
new file mode 100644
index 0000000..9caea65
--- /dev/null
+++ b/test/FileCheck/check-empty.txt

@@ -0,0 +1,11 @@
+// RUN: not FileCheck -check-prefix=FOO %s </dev/null 2>&1 | FileCheck -check-prefix=EMPTY-ERR %s
+// RUN: not FileCheck -check-prefix=NOFOO %s </dev/null 2>&1 | FileCheck -check-prefix=EMPTY-ERR %s
+// RUN: not FileCheck -check-prefix=FOO -allow-empty %s </dev/null 2>&1 | FileCheck -check-prefix=NO-EMPTY-ERR -check-prefix=NOT-FOUND %s
+// RUN: FileCheck -check-prefix=NOFOO -allow-empty %s </dev/null 2>&1 | FileCheck -allow-empty -check-prefix=NO-EMPTY-ERR %s
+
+; FOO: foo
+; NOFOO-NOT: foo
+
+; EMPTY-ERR: FileCheck error: '-' is empty.
+; NO-EMPTY-ERR-NOT: FileCheck error: '-' is empty.
+; NOT-FOUND: error: expected string not found in input

diff --git a/test/FileCheck/implicit-check-not.txt b/test/FileCheck/implicit-check-not.txt
new file mode 100644
index 0000000..4267736
--- /dev/null
+++ b/test/FileCheck/implicit-check-not.txt

@@ -0,0 +1,44 @@
+; RUN: sed 's#^;.*##' %s | FileCheck -check-prefix=CHECK-PASS -implicit-check-not=warning: %s
+; RUN: sed 's#^;.*##' %s | not FileCheck -check-prefix=CHECK-FAIL1 -implicit-check-not=warning: %s 2>&1 | FileCheck %s -check-prefix CHECK-ERROR1
+; RUN: sed 's#^;.*##' %s | not FileCheck -check-prefix=CHECK-FAIL2 -implicit-check-not=warning: %s 2>&1 | FileCheck %s -check-prefix CHECK-ERROR2
+; RUN: sed 's#^;.*##' %s | not FileCheck -check-prefix=CHECK-FAIL3 -implicit-check-not=warning: %s 2>&1 | FileCheck %s -check-prefix CHECK-ERROR3
+; RUN: sed 's#^;.*##' %s | not FileCheck -check-prefix=CHECK-FAIL1 -implicit-check-not='{{aaa|bbb|ccc}}' %s 2>&1 | FileCheck %s -check-prefix CHECK-ERROR4
+; RUN: sed 's#^;.*##' %s | not FileCheck -check-prefix=CHECK-FAIL1 -implicit-check-not=aaa -implicit-check-not=bbb -implicit-check-not=ccc %s 2>&1 | FileCheck %s -check-prefix CHECK-ERROR5
+; RUN: sed 's#^;.*##' %s | not FileCheck -check-prefix=CHECK-FAIL2 -implicit-check-not=aaa -implicit-check-not=bbb -implicit-check-not=ccc %s 2>&1 | FileCheck %s -check-prefix CHECK-ERROR6
+; RUN: sed 's#^;.*##' %s | not FileCheck -check-prefix=CHECK-FAIL3 -implicit-check-not=aaa -implicit-check-not=bbb -implicit-check-not=ccc %s 2>&1 | FileCheck %s -check-prefix CHECK-ERROR7
+
+warning: aaa
+; CHECK-PASS: warning: aaa
+; CHECK-ERROR1: error: CHECK-FAIL1-NOT: string occurred!
+; CHECK-ERROR1: command line:1:22: note: CHECK-FAIL1-NOT: pattern specified here
+; CHECK-ERROR1-NEXT: -implicit-check-not='warning:'
+; CHECK-FAIL2: warning: aaa
+; CHECK-FAIL3: warning: aaa
+; CHECK-ERROR4: error: CHECK-FAIL1-NOT: string occurred!
+; CHECK-ERROR4: command line:1:22: note: CHECK-FAIL1-NOT: pattern specified here
+; CHECK-ERROR4-NEXT: {{-implicit-check-not='\{\{aaa\|bbb\|ccc\}\}'}}
+; CHECK-ERROR5: error: CHECK-FAIL1-NOT: string occurred!
+; CHECK-ERROR5: command line:1:22: note: CHECK-FAIL1-NOT: pattern specified here
+; CHECK-ERROR5-NEXT: -implicit-check-not='aaa'
+
+warning: bbb
+; CHECK-PASS: warning: bbb
+; CHECK-FAIL1: warning: bbb
+; CHECK-ERROR2: error: CHECK-FAIL2-NOT: string occurred!
+; CHECK-ERROR2: command line:1:22: note: CHECK-FAIL2-NOT: pattern specified here
+; CHECK-ERROR2-NEXT: -implicit-check-not='warning:'
+; CHECK-FAIL3: warning: bbb
+; CHECK-ERROR6: error: CHECK-FAIL2-NOT: string occurred!
+; CHECK-ERROR6: command line:1:22: note: CHECK-FAIL2-NOT: pattern specified here
+; CHECK-ERROR6-NEXT: -implicit-check-not='bbb'
+
+warning: ccc
+; CHECK-PASS: warning: ccc
+; CHECK-FAIL1: warning: ccc
+; CHECK-FAIL2: warning: ccc
+; CHECK-ERROR3: error: CHECK-FAIL3-NOT: string occurred!
+; CHECK-ERROR3: command line:1:22: note: CHECK-FAIL3-NOT: pattern specified here
+; CHECK-ERROR3-NEXT: -implicit-check-not='warning:'
+; CHECK-ERROR7: error: CHECK-FAIL3-NOT: string occurred!
+; CHECK-ERROR7: command line:1:22: note: CHECK-FAIL3-NOT: pattern specified here
+; CHECK-ERROR7-NEXT: -implicit-check-not='ccc'

diff --git a/test/FileCheck/validate-check-prefix.txt b/test/FileCheck/validate-check-prefix.txt
index db3392d..6efec44 100644
--- a/test/FileCheck/validate-check-prefix.txt
+++ b/test/FileCheck/validate-check-prefix.txt

@@ -2,6 +2,7 @@
 // RUN: FileCheck -check-prefix=A1a-B_c -input-file %s %s
 // RUN: not FileCheck -check-prefix=REPEAT -check-prefix=REPEAT -input-file %s %s 2>&1 | FileCheck -check-prefix=BAD_PREFIX %s
 // RUN: not FileCheck -check-prefix=VALID -check-prefix=A! -input-file %s %s 2>&1 | FileCheck -check-prefix=BAD_PREFIX %s
+// RUN: not FileCheck -check-prefix= -input-file %s %s 2>&1 | FileCheck -check-prefix=BAD_PREFIX %s
 foobar
 ; A1a-B_c: foobar
 

diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_cfi.ll b/test/Instrumentation/AddressSanitizer/X86/asm_cfi.ll
new file mode 100644
index 0000000..6bfb153
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_cfi.ll

@@ -0,0 +1,54 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2 -asm-instrumentation=address -asan-instrument-assembly | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: mov8b_rbp
+; CHECK: pushq %rbp
+; CHECK-NOT: .cfi_adjust_cfa_offset 8
+; CHECK: movq %rbp, %rbp
+; CHECK: .cfi_remember_state
+; CHECK: .cfi_def_cfa_register %rbp
+; CHECK: leaq -128(%rsp)
+; CHECK: callq __asan_report_load8@PLT
+; CHECK: leaq 128(%rsp)
+; CHECK: popq %rbp
+; CHECK: .cfi_restore_state
+; CHECK-NOT: .cfi_adjust_cfa_offset -8
+; CHECK: retq
+define void @mov8b_rbp(i64* %dst, i64* %src) #0 {
+entry:
+  tail call void asm sideeffect "movq ($0), %rax \0A\09movq %rax, ($1) \0A\09", "r,r,~{rax},~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %src, i64* %dst)
+  ret void
+}
+
+; CHECK-LABEL: mov8b_rsp
+; CHECK: pushq %rbp
+; CHECK: .cfi_adjust_cfa_offset 8
+; CHECK: movq %rsp, %rbp
+; CHECK: .cfi_remember_state
+; CHECK: .cfi_def_cfa_register %rbp
+; CHECK: leaq -128(%rsp)
+; CHECK: callq __asan_report_load8@PLT
+; CHECK: leaq 128(%rsp)
+; CHECK: popq %rbp
+; CHECK: .cfi_restore_state
+; CHECK: .cfi_adjust_cfa_offset -8
+; CHECK: retq
+define void @mov8b_rsp(i64* %dst, i64* %src) #1 {
+entry:
+  tail call void asm sideeffect "movq ($0), %rax \0A\09movq %rax, ($1) \0A\09", "r,r,~{rax},~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %src, i64* %dst)
+  ret void
+}
+
+; CHECK-LABEL: mov8b_rsp_no_cfi
+; CHECK-NOT: .cfi{{[a-z_]+}}
+define void @mov8b_rsp_no_cfi(i64* %dst, i64* %src) #2 {
+entry:
+  tail call void asm sideeffect "movq ($0), %rax \0A\09movq %rax, ($1) \0A\09", "r,r,~{rax},~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %src, i64* %dst)
+  ret void
+}
+
+attributes #0 = { nounwind sanitize_address uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
+attributes #1 = { nounwind sanitize_address uwtable "no-frame-pointer-elim"="false" }
+attributes #2 = { nounwind sanitize_address "no-frame-pointer-elim"="false" }

diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_cfi.s b/test/Instrumentation/AddressSanitizer/X86/asm_cfi.s
new file mode 100644
index 0000000..417d7f3
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_cfi.s

@@ -0,0 +1,52 @@
+# The test verifies that correct DWARF directives are emitted when
+# assembly files are instrumented.
+
+# RUN: llvm-mc %s -triple=i386-unknown-linux-gnu -asm-instrumentation=address -asan-instrument-assembly | FileCheck %s
+
+# CHECK-LABEL: load4b_cfa_rbp
+# CHECK: pushl %ebx
+# CHECK-NOT: .cfi_adjust_cfa_offset 8
+# CHECK: movl %ebp, %ebx
+# CHECK: .cfi_remember_state
+# CHECK: .cfi_def_cfa_register %ebx
+# CHECK: popl %ebx
+# CHECK: .cfi_restore_state
+# CHECK-NOT: .cfi_adjust_cfa_offset -8
+# CHECK: retl
+
+	.text
+	.globl	load4b_cfa_rbp
+	.type	load4b_cfa_rbp,@function
+swap_cfa_rbp:                                   # @swap_cfa_rbp
+	.cfi_startproc
+	pushl	%ebp
+	.cfi_def_cfa_offset 8
+	.cfi_offset %ebp, -8
+	movl	%esp, %ebp
+	.cfi_def_cfa_register %ebp
+	movl	8(%ebp), %eax
+	popl	%ebp
+	retl
+	.cfi_endproc
+
+# CHECK-LABEL: load4b_cfa_rsp
+# CHECK: pushl %ebx
+# CHECK: .cfi_adjust_cfa_offset 4
+# CHECK: movl %esp, %ebx
+# CHECK: .cfi_remember_state
+# CHECK: .cfi_def_cfa_register %ebx
+# CHECK: popl %ebx
+# CHECK: .cfi_restore_state
+# CHECK: retl
+
+	.globl	load4b_cfa_rsp
+	.type	load4b_cfa_rsp,@function
+swap_cfa_rsp:                                   # @swap_cfa_rsp
+	.cfi_startproc
+	pushl	%ebp
+	.cfi_offset %ebp, 0
+	movl	%esp, %ebp
+	movl	8(%ebp), %eax
+	popl	%ebp
+	retl
+	.cfi_endproc

diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll b/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll
index ad5e02e..7f5d3b0 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_mov.ll

@@ -6,8 +6,8 @@
 ; CHECK-LABEL: mov1b
 ; CHECK: leaq -128(%rsp), %rsp
 ; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: pushq %rcx
 ; CHECK-NEXT: pushq %rdi
+; CHECK-NEXT: pushq %rcx
 ; CHECK-NEXT: pushfq
 ; CHECK-NEXT: leaq {{.*}}, %rdi
 ; CHECK-NEXT: movq %rdi, %rax
@@ -26,8 +26,8 @@
 ; CHECK-NEXT: callq __asan_report_load1@PLT
 ; CHECK-NEXT: [[A]]:
 ; CHECK-NEXT: popfq
-; CHECK-NEXT: popq %rdi
 ; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: popq %rdi
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: leaq 128(%rsp), %rsp
 
@@ -81,8 +81,10 @@
 ; CHECK-LABEL: mov8b
 ; CHECK: leaq -128(%rsp), %rsp
 ; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pushq %rdi
 ; CHECK-NEXT: pushfq
-; CHECK-NEXT: leaq {{.*}}, %rax
+; CHECK-NEXT: leaq {{.*}}, %rdi
+; CHECK-NEXT: movq %rdi, %rax
 ; CHECK-NEXT: shrq $3, %rax
 ; CHECK-NEXT: cmpb $0, 2147450880(%rax)
 ; CHECK-NEXT: je [[A:.*]]
@@ -92,13 +94,16 @@
 ; CHECK-NEXT: callq __asan_report_load8@PLT
 ; CHECK-NEXT: [[A]]:
 ; CHECK-NEXT: popfq
+; CHECK-NEXT: popq %rdi
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: leaq 128(%rsp), %rsp
 
 ; CHECK: leaq -128(%rsp), %rsp
 ; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pushq %rdi
 ; CHECK-NEXT: pushfq
-; CHECK-NEXT: leaq {{.*}}, %rax
+; CHECK-NEXT: leaq {{.*}}, %rdi
+; CHECK-NEXT: movq %rdi, %rax
 ; CHECK-NEXT: shrq $3, %rax
 ; CHECK-NEXT: cmpb $0, 2147450880(%rax)
 ; CHECK-NEXT: je [[A:.*]]
@@ -108,6 +113,7 @@
 ; CHECK-NEXT: callq __asan_report_store8@PLT
 ; CHECK-NEXT: [[A]]:
 ; CHECK-NEXT: popfq
+; CHECK-NEXT: popq %rdi
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: leaq 128(%rsp), %rsp
 

diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_mov.s b/test/Instrumentation/AddressSanitizer/X86/asm_mov.s
index 74a788c..14fc056 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_mov.s
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_mov.s

@@ -10,13 +10,13 @@
 # CHECK: callq __asan_report_load1@PLT
 # CHECK: leaq 128(%rsp), %rsp
 #
-# CHECK-NEXT: movb (%rsi), %al
+# CHECK: movb (%rsi), %al
 #
-# CHECK-NEXT: leaq -128(%rsp), %rsp
+# CHECK: leaq -128(%rsp), %rsp
 # CHECK: callq __asan_report_store1@PLT
 # CHECK: leaq 128(%rsp), %rsp
 #
-# CHECK-NEXT: movb %al, (%rdi)
+# CHECK: movb %al, (%rdi)
 mov1b:                                  # @mov1b
 	.cfi_startproc
 # BB#0:
@@ -39,13 +39,13 @@
 # CHECK: callq __asan_report_load16@PLT
 # CHECK: leaq 128(%rsp), %rsp
 #
-# CHECK-NEXT: movaps (%rsi), %xmm0
+# CHECK: movaps (%rsi), %xmm0
 #
-# CHECK-NEXT: leaq -128(%rsp), %rsp
+# CHECK: leaq -128(%rsp), %rsp
 # CHECK: callq __asan_report_store16@PLT
 # CHECK: leaq 128(%rsp), %rsp
 #
-# CHECK-NEXT: movaps %xmm0, (%rdi)
+# CHECK: movaps %xmm0, (%rdi)
 mov16b:                                 # @mov16b
 	.cfi_startproc
 # BB#0:

diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_mov_no_instrumentation.s b/test/Instrumentation/AddressSanitizer/X86/asm_mov_no_instrumentation.s
index e3a1541..5d5de5d 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_mov_no_instrumentation.s
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_mov_no_instrumentation.s

@@ -5,6 +5,8 @@
 	.align	16, 0x90
 	.type	mov1b,@function
 # CHECK-LABEL: mov1b
+# CHECK: movb (%rsi), %al
+# CHECK: movb %al, (%rdi)
 # CHECK-NOT: callq __asan_report_load1@PLT
 # CHECK-NOT: callq __asan_report_store1@PLT
 mov1b:                                  # @mov1b

diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll
new file mode 100644
index 0000000..c3c2435
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_rep_movs.ll

@@ -0,0 +1,87 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2 -asm-instrumentation=address -asan-instrument-assembly | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: rep_movs_1b
+; CHECK: pushfq
+; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: je [[B:.*]]
+
+; CHECK: leaq -128(%rsp), %rsp
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pushq %rdx
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushfq
+
+; CHECK: leaq (%rsi), %rdx
+; CHECK: movq %rdx, %rdi
+; CHECK-NEXT: callq __asan_report_load1@PLT
+
+; CHECK: leaq -1(%rsi,%rcx), %rdx
+; CHECK: movq %rdx, %rdi
+; CHECK-NEXT: callq __asan_report_load1@PLT
+
+; CHECK: leaq (%rdi), %rdx
+; CHECK: movq %rdx, %rdi
+; CHECK-NEXT: callq __asan_report_store1@PLT
+
+; CHECK: leaq -1(%rdi,%rcx), %rdx
+; CHECK: movq %rdx, %rdi
+; CHECK-NEXT: callq __asan_report_store1@PLT
+
+; CHECK: popfq
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %rdx
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: leaq 128(%rsp), %rsp
+
+; CHECK: [[B]]:
+; CHECK-NEXT: popfq
+
+; CHECK: rep
+; CHECK-NEXT: movsb (%rsi), %es:(%rdi)
+
+; Function Attrs: nounwind sanitize_address uwtable
+define void @rep_movs_1b(i8* %dst, i8* %src, i64 %n) #0 {
+entry:
+  tail call void asm sideeffect "rep movsb \0A\09", "{si},{di},{cx},~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %src, i8* %dst, i64 %n) #1
+  ret void
+}
+
+; CHECK-LABEL: rep_movs_8b
+; CHECK: pushfq
+; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: je [[Q:.*]]
+
+; CHECK: leaq (%rsi), %rdx
+; CHECK: movq %rdx, %rdi
+; CHECK-NEXT: callq __asan_report_load8@PLT
+
+; CHECK: leaq -1(%rsi,%rcx,8), %rdx
+; CHECK: movq %rdx, %rdi
+; CHECK-NEXT: callq __asan_report_load8@PLT
+
+; CHECK: leaq (%rdi), %rdx
+; CHECK: movq %rdx, %rdi
+; CHECK-NEXT: callq __asan_report_store8@PLT
+
+; CHECK: leaq -1(%rdi,%rcx,8), %rdx
+; CHECK: movq %rdx, %rdi
+; CHECK-NEXT: callq __asan_report_store8@PLT
+
+; CHECK: [[Q]]:
+; CHECK-NEXT: popfq
+
+; CHECK: rep
+; CHECK-NEXT: movsq (%rsi), %es:(%rdi)
+
+; Function Attrs: nounwind sanitize_address uwtable
+define void @rep_movs_8b(i64* %dst, i64* %src, i64 %n) #0 {
+entry:
+  tail call void asm sideeffect "rep movsq \0A\09", "{si},{di},{cx},~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %src, i64* %dst, i64 %n) #1
+  ret void
+}
+
+attributes #0 = { nounwind sanitize_address uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }

diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_rsp_mem_op.s b/test/Instrumentation/AddressSanitizer/X86/asm_rsp_mem_op.s
new file mode 100644
index 0000000..e40ecde
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_rsp_mem_op.s

@@ -0,0 +1,45 @@
+# The test verifies that memory references through %rsp are correctly
+# adjusted after instrumentation.
+
+# RUN: llvm-mc %s -triple=x86_64-unknown-linux-gnu -asm-instrumentation=address -asan-instrument-assembly | FileCheck %s
+
+# CHECK-LABEL: rsp_access
+# CHECK: leaq -128(%rsp), %rsp
+# CHECK: pushq %rax
+# CHECK: pushq %rdi
+# CHECK: pushfq
+# CHECK: leaq 160(%rsp), %rdi
+# CHECK: callq __asan_report_load8@PLT
+# CHECK: popfq
+# CHECK: popq %rdi
+# CHECK: popq %rax
+# CHECK: leaq 128(%rsp), %rsp
+# CHECK: movq 8(%rsp), %rax
+# CHECK: retq
+
+	.text
+	.globl rsp_access
+	.type rsp_access,@function
+rsp_access:
+	movq 8(%rsp), %rax
+	retq
+
+# CHECK-LABEL: rsp_32bit_access
+# CHECK: leaq -128(%rsp), %rsp
+# CHECK: pushq %rax
+# CHECK: pushq %rdi
+# CHECK: pushfq
+# CHECK: leaq 2147483647(%rsp), %rdi
+# CHECK: leaq 145(%rdi), %rdi
+# CHECK: callq __asan_report_load8@PLT
+# CHECK: popfq
+# CHECK: popq %rdi
+# CHECK: popq %rax
+# CHECK: leaq 128(%rsp), %rsp
+# CHECK: movq 2147483640(%rsp), %rax
+# CHECK: retq
+	.globl rsp_32bit_access
+	.type rsp_32bit_access,@function
+rsp_32bit_access:
+	movq 2147483640(%rsp), %rax
+	retq

diff --git a/test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s b/test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s
index ca3c54c..093c96b 100644
--- a/test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s
+++ b/test/Instrumentation/AddressSanitizer/X86/asm_swap_intel.s

@@ -10,25 +10,25 @@
 # CHECK: callq __asan_report_load8@PLT
 # CHECK: leaq 128(%rsp), %rsp
 #
-# CHECK-NEXT: movq (%rcx), %rax
+# CHECK: movq (%rcx), %rax
 #
-# CHECK-NEXT: leaq -128(%rsp), %rsp
+# CHECK: leaq -128(%rsp), %rsp
 # CHECK: callq __asan_report_load8@PLT
 # CHECK: leaq 128(%rsp), %rsp
 #
-# CHECK-NEXT: movq (%rdx), %rbx
+# CHECK: movq (%rdx), %rbx
 #
-# CHECK-NEXT: leaq -128(%rsp), %rsp
+# CHECK: leaq -128(%rsp), %rsp
 # CHECK: callq __asan_report_store8@PLT
 # CHECK: leaq 128(%rsp), %rsp
 #
-# CHECK-NEXT: movq %rbx, (%rcx)
+# CHECK: movq %rbx, (%rcx)
 #
-# CHECK-NEXT: leaq -128(%rsp), %rsp
+# CHECK: leaq -128(%rsp), %rsp
 # CHECK: callq __asan_report_store8@PLT
 # CHECK: leaq 128(%rsp), %rsp
 #
-# CHECK-NEXT: movq %rax, (%rdx)
+# CHECK: movq %rax, (%rdx)
 swap:                                   # @swap
 	.cfi_startproc
 # BB#0:

diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll
index 7d1aa0b..d9997e2 100644
--- a/test/Instrumentation/AddressSanitizer/basic.ll
+++ b/test/Instrumentation/AddressSanitizer/basic.ll

@@ -6,7 +6,7 @@
 target triple = "x86_64-unknown-linux-gnu"
 
 define i32 @test_load(i32* %a) sanitize_address {
-; CHECK: @test_load
+; CHECK-LABEL: @test_load
 ; CHECK-NOT: load
 ; CHECK:   %[[LOAD_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
 ; CHECK:   lshr i64 %[[LOAD_ADDR]], 3
@@ -14,7 +14,7 @@
 ; CHECK:   %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr
 ; CHECK:   %[[LOAD_SHADOW:[^ ]*]] = load i8* %[[LOAD_SHADOW_PTR]]
 ; CHECK:   icmp ne i8
-; CHECK:   br i1 %{{.*}}, label %{{.*}}, label %{{.*}}
+; CHECK:   br i1 %{{.*}}, label %{{.*}}, label %{{.*}}!prof ![[PROF:[0-9]+]]
 ;
 ; First instrumentation block refines the shadow test.
 ; CHECK:   and i64 %[[LOAD_ADDR]], 7
@@ -39,7 +39,7 @@
 }
 
 define void @test_store(i32* %a) sanitize_address {
-; CHECK: @test_store
+; CHECK-LABEL: @test_store
 ; CHECK-NOT: store
 ; CHECK:   %[[STORE_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
 ; CHECK:   lshr i64 %[[STORE_ADDR]], 3
@@ -84,7 +84,7 @@
   ret void
 }
 
-; CHECK: define void @alloca_test()
+; CHECK-LABEL: define void @alloca_test()
 ; CHECK: = alloca
 ; CHECK-NOT: = alloca
 ; CHECK: ret void
@@ -95,7 +95,7 @@
     ret void
 }
 
-; CHECK: LongDoubleTest
+; CHECK-LABEL: LongDoubleTest
 ; CHECK: __asan_report_store_n
 ; CHECK: __asan_report_store_n
 ; CHECK: ret void
@@ -108,7 +108,7 @@
   ret void
 }
 
-; CHECK: i40test
+; CHECK-LABEL: i40test
 ; CHECK: __asan_report_load_n{{.*}}, i64 5)
 ; CHECK: __asan_report_load_n{{.*}}, i64 5)
 ; CHECK: __asan_report_store_n{{.*}}, i64 5)
@@ -134,7 +134,7 @@
   ret void
 }
 
-; CHECK: i80test
+; CHECK-LABEL: i80test
 ; CHECK: __asan_report_load_n{{.*}}, i64 10)
 ; CHECK: __asan_report_load_n{{.*}}, i64 10)
 ; CHECK: __asan_report_store_n{{.*}}, i64 10)
@@ -147,7 +147,7 @@
   %tmp1 = load i32* %a
   ret i32 %tmp1
 }
-; CHECK: @f_available_externally
+; CHECK-LABEL: @f_available_externally
 ; CHECK-NOT: __asan_report
 ; CHECK: ret i32
 
@@ -169,3 +169,5 @@
 ; CHECK: __asan_memcpy
 ; CHECK: ret void
 
+; PROF
+; CHECK: ![[PROF]] = metadata !{metadata !"branch_weights", i32 1, i32 100000}

diff --git a/test/Instrumentation/AddressSanitizer/coverage-dbg.ll b/test/Instrumentation/AddressSanitizer/coverage-dbg.ll
deleted file mode 100644
index 3f7998d..0000000
--- a/test/Instrumentation/AddressSanitizer/coverage-dbg.ll
+++ /dev/null

@@ -1,67 +0,0 @@
-; Test that coverage instrumentation does not lose debug location.
-
-; RUN: opt < %s -asan -asan-module -asan-coverage=1 -S | FileCheck %s
-
-; C++ source:
-; 1: struct A {
-; 2:  int f();
-; 3:  int x;
-; 4: };
-; 5:
-; 6: int A::f() {
-; 7:    return x;
-; 8: }
-; clang++ ../1.cc -O3 -g -S -emit-llvm  -fno-strict-aliasing
-; and add sanitize_address to @_ZN1A1fEv
-
-; Test that __sanitizer_cov call has !dbg pointing to the opening { of A::f().
-; CHECK: call void @__sanitizer_cov(), !dbg [[A:!.*]]
-; CHECK: [[A]] = metadata !{i32 6, i32 0, metadata !{{.*}}, null}
-
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.A = type { i32 }
-
-; Function Attrs: nounwind readonly uwtable
-define i32 @_ZN1A1fEv(%struct.A* nocapture readonly %this) #0 align 2 {
-entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.A* %this}, i64 0, metadata !15), !dbg !20
-  %x = getelementptr inbounds %struct.A* %this, i64 0, i32 0, !dbg !21
-  %0 = load i32* %x, align 4, !dbg !21
-  ret i32 %0, !dbg !21
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
-
-attributes #0 = { sanitize_address nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!17, !18}
-!llvm.ident = !{!19}
-
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (210251)", i1 true, metadata !"", i32 0, metadata !2, metadata !3, metadata !12, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/../1.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"../1.cc", metadata !"/code/llvm/build0"}
-!2 = metadata !{}
-!3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
-!5 = metadata !{metadata !6, metadata !8}
-!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1A", metadata !"x", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [x] [line 3, size 32, align 32, offset 0] [from int]
-!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!8 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"f", metadata !"f", metadata !"_ZN1A1fEv", i32 2, metadata !9, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, null, i32 2} ; [ DW_TAG_subprogram ] [line 2] [f]
-!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !7, metadata !11}
-!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
-!12 = metadata !{metadata !13}
-!13 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1A", metadata !"f", metadata !"f", metadata !"_ZN1A1fEv", i32 6, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.A*)* @_ZN1A1fEv, null, metadata !8, metadata !14, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [f]
-!14 = metadata !{metadata !15}
-!15 = metadata !{i32 786689, metadata !13, metadata !"this", null, i32 16777216, metadata !16, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!16 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
-!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!18 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!19 = metadata !{metadata !"clang version 3.5.0 (210251)"}
-!20 = metadata !{i32 0, i32 0, metadata !13, null}
-!21 = metadata !{i32 7, i32 0, metadata !13, null}

diff --git a/test/Instrumentation/AddressSanitizer/coverage.ll b/test/Instrumentation/AddressSanitizer/coverage.ll
deleted file mode 100644
index 79bb5c1..0000000
--- a/test/Instrumentation/AddressSanitizer/coverage.ll
+++ /dev/null

@@ -1,60 +0,0 @@
-; RUN: opt < %s -asan -asan-module -asan-coverage=0 -S | FileCheck %s --check-prefix=CHECK0
-; RUN: opt < %s -asan -asan-module -asan-coverage=1 -S | FileCheck %s --check-prefix=CHECK1
-; RUN: opt < %s -asan -asan-module -asan-coverage=2 -S | FileCheck %s --check-prefix=CHECK2
-; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK2
-; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-coverage-block-threshold=1  -S | FileCheck %s --check-prefix=CHECK1
-
-; RUN: opt < %s -asan -asan-module -asan-coverage=0 -asan-globals=0 -S | \
-; RUN:     FileCheck %s --check-prefix=CHECK0
-; RUN: opt < %s -asan -asan-module -asan-coverage=1 -asan-globals=0 -S | \
-; RUN:     FileCheck %s --check-prefix=CHECK1
-; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-globals=0 -S | \
-; RUN:     FileCheck %s --check-prefix=CHECK2
-; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-coverage-block-threshold=10 \
-; RUN:     -asan-globals=0 -S | FileCheck %s --check-prefix=CHECK2
-; RUN: opt < %s -asan -asan-module -asan-coverage=2 -asan-coverage-block-threshold=1 \
-; RUN:     -asan-globals=0 -S | FileCheck %s --check-prefix=CHECK1
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-define void @foo(i32* %a) sanitize_address {
-entry:
-  %tobool = icmp eq i32* %a, null
-  br i1 %tobool, label %if.end, label %if.then
-
-  if.then:                                          ; preds = %entry
-  store i32 0, i32* %a, align 4
-  br label %if.end
-
-  if.end:                                           ; preds = %entry, %if.then
-  ret void
-}
-
-; CHECK0-NOT: call void @__sanitizer_cov(
-; CHECK0-NOT: call void @__sanitizer_cov_module_init(
-
-; CHECK1-LABEL: define void @foo
-; CHECK1: %0 = load atomic i8* @__asan_gen_cov_foo monotonic, align 1
-; CHECK1: %1 = icmp eq i8 0, %0
-; CHECK1: br i1 %1, label %2, label %3
-; CHECK1: call void @__sanitizer_cov
-; CHECK1-NOT: call void @__sanitizer_cov
-; CHECK1: store atomic i8 1, i8* @__asan_gen_cov_foo monotonic, align 1
-
-; CHECK1-LABEL: define internal void @asan.module_ctor
-; CHECK1-NOT: ret
-; CHECK1: call void @__sanitizer_cov_module_init(i64 1)
-; CHECK1: ret
-
-
-; CHECK2-LABEL: define void @foo
-; CHECK2: call void @__sanitizer_cov
-; CHECK2: call void @__sanitizer_cov
-; CHECK2: call void @__sanitizer_cov
-; CHECK2-NOT: call void @__sanitizer_cov
-; CHECK2: ret void
-
-; CHECK2-LABEL: define internal void @asan.module_ctor
-; CHECK2-NOT: ret
-; CHECK2: call void @__sanitizer_cov_module_init(i64 3)
-; CHECK2: ret

diff --git a/test/Instrumentation/AddressSanitizer/debug_info.ll b/test/Instrumentation/AddressSanitizer/debug_info.ll
index 336b98b..ea51551 100644
--- a/test/Instrumentation/AddressSanitizer/debug_info.ll
+++ b/test/Instrumentation/AddressSanitizer/debug_info.ll

@@ -11,8 +11,8 @@
   %p.addr = alloca i32, align 4
   %r = alloca i32, align 4
   store i32 %p, i32* %p.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %p.addr}, metadata !10), !dbg !11
-  call void @llvm.dbg.declare(metadata !{i32* %r}, metadata !12), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i32* %p.addr}, metadata !10, metadata !{metadata !"0x102"}), !dbg !11
+  call void @llvm.dbg.declare(metadata !{i32* %r}, metadata !12, metadata !{metadata !"0x102"}), !dbg !14
   %0 = load i32* %p.addr, align 4, !dbg !14
   %add = add nsw i32 %0, 1, !dbg !14
   store i32 %add, i32* %r, align 4, !dbg !14
@@ -24,39 +24,39 @@
 ;   CHECK: entry:
 ; Verify that llvm.dbg.declare calls are in the entry basic block.
 ;   CHECK-NOT: %entry
-;   CHECK: call void @llvm.dbg.declare(metadata {{.*}}, metadata ![[ARG_ID:[0-9]+]])
+;   CHECK: call void @llvm.dbg.declare(metadata {{.*}}, metadata ![[ARG_ID:[0-9]+]], metadata ![[OPDEREF:[0-9]+]])
 ;   CHECK-NOT: %entry
-;   CHECK: call void @llvm.dbg.declare(metadata {{.*}}, metadata ![[VAR_ID:[0-9]+]])
+;   CHECK: call void @llvm.dbg.declare(metadata {{.*}}, metadata ![[VAR_ID:[0-9]+]], metadata ![[OPDEREF:[0-9]+]])
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!17}
 
-!0 = metadata !{i32 786449, metadata !16, i32 4, metadata !"clang version 3.3 (trunk 169314)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 169314)\001\00\000\00\000", metadata !16, metadata !1, metadata !1, metadata !3, metadata !1, null} ; [ DW_TAG_compile_unit ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !16, metadata !6, metadata !"zzz", metadata !"zzz", metadata !"_Z3zzzi", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3zzzi, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [zzz]
-!6 = metadata !{i32 786473, metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00zzz\00zzz\00_Z3zzzi\001\000\001\000\006\00256\000\001", metadata !16, metadata !6, metadata !7, null, i32 (i32)* @_Z3zzzi, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 1] [def] [zzz]
+!6 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786689, metadata !5, metadata !"p", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 1]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x101\00p\0016777217\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [p] [line 1]
 !11 = metadata !{i32 1, i32 0, metadata !5, null}
-!12 = metadata !{i32 786688, metadata !13, metadata !"r", metadata !6, i32 2, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [r] [line 2]
+!12 = metadata !{metadata !"0x100\00r\002\000", metadata !13, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [r] [line 2]
 
 ; Verify that debug descriptors for argument and local variable will be replaced
 ; with descriptors that end with OpDeref (encoded as 2).
-;   CHECK: ![[ARG_ID]] = {{.*}}metadata ![[OPDEREF:[0-9]+]]} ; [ DW_TAG_arg_variable ] [p] [line 1]
-;   CHECK: ![[OPDEREF]] = metadata !{i64 2}
-;   CHECK: ![[VAR_ID]] = {{.*}}metadata ![[OPDEREF]]} ; [ DW_TAG_auto_variable ] [r] [line 2]
+;   CHECK: ![[ARG_ID]] = {{.*}} ; [ DW_TAG_arg_variable ] [p] [line 1]
+;   CHECK: ![[OPDEREF]] = metadata !{metadata !"0x102\006"}
+;   CHECK: ![[VAR_ID]] = {{.*}} ; [ DW_TAG_auto_variable ] [r] [line 2]
 ; Verify that there are no more variable descriptors.
 ;   CHECK-NOT: DW_TAG_arg_variable
 ;   CHECK-NOT: DW_TAG_auto_variable
 
 
-!13 = metadata !{i32 786443, metadata !16, metadata !5, i32 1, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc]
+!13 = metadata !{metadata !"0xb\001\000\000", metadata !16, metadata !5} ; [ DW_TAG_lexical_block ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc]
 !14 = metadata !{i32 2, i32 0, metadata !13, null}
 !15 = metadata !{i32 3, i32 0, metadata !13, null}
 !16 = metadata !{metadata !"a.cc", metadata !"/usr/local/google/llvm_cmake_clang/tmp/debuginfo"}
-!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Instrumentation/AddressSanitizer/do-not-instrument-cstring.ll b/test/Instrumentation/AddressSanitizer/do-not-instrument-cstring.ll
new file mode 100644
index 0000000..de6a4de
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/do-not-instrument-cstring.ll

@@ -0,0 +1,7 @@
+; RUN: opt < %s -asan -asan-module -S | FileCheck %s
+
+target datalayout = "e"
+
+@foo = private global [19 x i8] c"scannerWithString:\00", section "__TEXT,__objc_methname,cstring_literals"
+
+; CHECK: @foo = private global [19 x i8] c"scannerWithString:\00", section "__TEXT,__objc_methname,cstring_literals"
\ No newline at end of file

diff --git a/test/Instrumentation/AddressSanitizer/do-not-touch-comdat-global.ll b/test/Instrumentation/AddressSanitizer/do-not-touch-comdat-global.ll
new file mode 100644
index 0000000..8d14e83
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/do-not-touch-comdat-global.ll

@@ -0,0 +1,14 @@
+; This test checks that we instrument regular globals, but do not touch
+; the COMDAT ones.
+; RUN: opt < %s -asan -asan-module -S | FileCheck %s
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+; no action should be taken for these globals
+$global_noinst = comdat largest
+@aliasee = private unnamed_addr constant [2 x i8] [i8 1, i8 2], comdat $global_noinst
+@global_noinst = unnamed_addr alias [2 x i8]* @aliasee
+; CHECK-NOT: {{asan_gen.*global_noinst}}
+; CHECK-DAG: @global_noinst = unnamed_addr alias [2 x i8]* @aliasee
+@global_inst = private constant [2 x i8] [i8 1, i8 2]
+; CHECK-DAG: {{asan_gen.*global_inst}}
+; CHECK: @asan.module_ctor

diff --git a/test/Instrumentation/AddressSanitizer/global_metadata.ll b/test/Instrumentation/AddressSanitizer/global_metadata.ll
index 9641c3e..fd5a8c6 100644
--- a/test/Instrumentation/AddressSanitizer/global_metadata.ll
+++ b/test/Instrumentation/AddressSanitizer/global_metadata.ll

@@ -11,20 +11,18 @@
 @.str = private unnamed_addr constant [14 x i8] c"Hello, world!\00", align 1
 @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_asan_globals.cpp, i8* null }]
 
-; Sanitizer location descriptors:
-@.str1 = private unnamed_addr constant [22 x i8] c"/tmp/asan-globals.cpp\00", align 1
-@.asan_loc_descr = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 5, i32 5 }
-@.asan_loc_descr1 = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 7, i32 5 }
-@.asan_loc_descr2 = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 12, i32 14 }
-@.asan_loc_descr4 = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 14, i32 25 }
-
-; Check that globals were instrumented, but sanitizer location descriptors weren't:
+; Check that globals were instrumented:
 ; CHECK: @global = global { i32, [60 x i8] } zeroinitializer, align 32
 ; CHECK: @.str = internal unnamed_addr constant { [14 x i8], [50 x i8] } { [14 x i8] c"Hello, world!\00", [50 x i8] zeroinitializer }, align 32
-; CHECK: @.asan_loc_descr = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* @.str1, i32 5, i32 5 }
 
-; Check that location decriptors were passed into __asan_register_globals:
-; CHECK: i64 ptrtoint ({ [22 x i8]*, i32, i32 }* @.asan_loc_descr to i64)
+; Check emitted location descriptions:
+; CHECK: [[VARNAME:@__asan_gen_[0-9]+]] = private unnamed_addr constant [7 x i8] c"global\00", align 1
+; CHECK: [[FILENAME:@__asan_gen_[0-9]+]] = private unnamed_addr constant [22 x i8] c"/tmp/asan-globals.cpp\00", align 1
+; CHECK: [[LOCDESCR:@__asan_gen_[0-9]+]] = private unnamed_addr constant { [22 x i8]*, i32, i32 } { [22 x i8]* [[FILENAME]], i32 5, i32 5 }
+
+; Check that location decriptors and global names were passed into __asan_register_globals:
+; CHECK: i64 ptrtoint ([7 x i8]* [[VARNAME]] to i64)
+; CHECK: i64 ptrtoint ({ [22 x i8]*, i32, i32 }* [[LOCDESCR]] to i64)
 
 ; Function Attrs: nounwind sanitize_address
 define internal void @__cxx_global_var_init() #0 section ".text.startup" {
@@ -55,9 +53,15 @@
 !llvm.asan.globals = !{!0, !1, !2, !3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32* @global, { [22 x i8]*, i32, i32 }* @.asan_loc_descr, i1 false, i1 false}
-!1 = metadata !{i32* @dyn_init_global, { [22 x i8]*, i32, i32 }* @.asan_loc_descr1, i1 true, i1 false}
-!2 = metadata !{i32* @blacklisted_global, null, i1 false, i1 true}
-!3 = metadata !{i32* @_ZZ4funcvE10static_var, { [22 x i8]*, i32, i32 }* @.asan_loc_descr2, i1 false, i1 false}
-!4 = metadata !{[14 x i8]* @.str, { [22 x i8]*, i32, i32 }* @.asan_loc_descr4, i1 false, i1 false}
+!0 = metadata !{i32* @global, metadata !6, metadata !"global", i1 false, i1 false}
+!1 = metadata !{i32* @dyn_init_global, metadata !7, metadata !"dyn_init_global", i1 true, i1 false}
+!2 = metadata !{i32* @blacklisted_global, null, null, i1 false, i1 true}
+!3 = metadata !{i32* @_ZZ4funcvE10static_var, metadata !8, metadata !"static_var", i1 false, i1 false}
+!4 = metadata !{[14 x i8]* @.str, metadata !9, metadata !"<string literal>", i1 false, i1 false}
+
 !5 = metadata !{metadata !"clang version 3.5.0 (211282)"}
+
+!6 = metadata !{metadata !"/tmp/asan-globals.cpp", i32 5, i32 5}
+!7 = metadata !{metadata !"/tmp/asan-globals.cpp", i32 7, i32 5}
+!8 = metadata !{metadata !"/tmp/asan-globals.cpp", i32 12, i32 14}
+!9 = metadata !{metadata !"/tmp/asan-globals.cpp", i32 14, i32 25}

diff --git a/test/Instrumentation/AddressSanitizer/instrument_global.ll b/test/Instrumentation/AddressSanitizer/instrument_global.ll
index 816ab29..80791d9 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_global.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_global.ll

@@ -69,7 +69,7 @@
 
 
 !llvm.asan.globals = !{!0}
-!0 = metadata !{[10 x i32]* @GlobDy, null, i1 true, i1 false}
+!0 = metadata !{[10 x i32]* @GlobDy, null, null, i1 true, i1 false}
 
 ; CHECK-LABEL: define internal void @asan.module_ctor
 ; CHECK-NOT: ret

diff --git a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
index 83ff53f..c2bb0aa 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll

@@ -7,10 +7,10 @@
 @YYY = global i32 0, align 4           ; W/o dynamic initializer.
 ; Clang will emit the following metadata identifying @xxx as dynamically
 ; initialized.
-!0 = metadata !{i32* @xxx, null, i1 true, i1 false}
-!1 = metadata !{i32* @XXX, null, i1 true, i1 false}
-!2 = metadata !{i32* @yyy, null, i1 false, i1 false}
-!3 = metadata !{i32* @YYY, null, i1 false, i1 false}
+!0 = metadata !{i32* @xxx, null, null, i1 true, i1 false}
+!1 = metadata !{i32* @XXX, null, null, i1 true, i1 false}
+!2 = metadata !{i32* @yyy, null, null, i1 false, i1 false}
+!3 = metadata !{i32* @YYY, null, null, i1 false, i1 false}
 !llvm.asan.globals = !{!0, !1, !2, !3}
 
 define i32 @initializer() uwtable {
@@ -25,29 +25,39 @@
   ret void
 }
 
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @__late_ctor }, { i32, void ()* } { i32 0, void ()* @__early_ctor }]
 
-define internal void @_GLOBAL__I_a() sanitize_address section ".text.startup" {
+define internal void @__late_ctor() sanitize_address section ".text.startup" {
 entry:
   call void @__cxx_global_var_init()
   ret void
 }
 
 ; Clang indicated that @xxx was dynamically initailized.
-; __asan_{before,after}_dynamic_init should be called from _GLOBAL__I_a
+; __asan_{before,after}_dynamic_init should be called from __late_ctor
 
-; CHECK: define internal void @_GLOBAL__I_a
+; CHECK-LABEL: define internal void @__late_ctor
 ; CHECK-NOT: ret
 ; CHECK: call void @__asan_before_dynamic_init
 ; CHECK: call void @__cxx_global_var_init
 ; CHECK: call void @__asan_after_dynamic_init
 ; CHECK: ret
 
+; CTOR with priority 0 should not be instrumented.
+define internal void @__early_ctor() sanitize_address section ".text.startup" {
+entry:
+  call void @__cxx_global_var_init()
+  ret void
+}
+; CHECK-LABEL: define internal void @__early_ctor
+; CHECK-NOT: __asan
+; CHECK: ret
+
 ; Check that xxx is instrumented.
 define void @touch_xxx() sanitize_address {
   store i32 0, i32 *@xxx, align 4
   ret void
-; CHECK: define void @touch_xxx
+; CHECK-LABEL: touch_xxx
 ; CHECK: call void @__asan_report_store4
 ; CHECK: ret void
 }

diff --git a/test/Instrumentation/AddressSanitizer/ubsan.ll b/test/Instrumentation/AddressSanitizer/ubsan.ll
new file mode 100644
index 0000000..22e4172
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/ubsan.ll

@@ -0,0 +1,52 @@
+; ASan shouldn't instrument code added by UBSan.
+
+; RUN: opt < %s -asan -asan-module -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { i32 (...)** }
+declare void @__ubsan_handle_dynamic_type_cache_miss(i8*, i64, i64) uwtable
+@__ubsan_vptr_type_cache = external global [128 x i64]
+@.src = private unnamed_addr constant [19 x i8] c"tmp/ubsan/vptr.cpp\00", align 1
+@0 = private unnamed_addr constant { i16, i16, [4 x i8] } { i16 -1, i16 0, [4 x i8] c"'A'\00" }
+@_ZTI1A = external constant i8*
+@1 = private unnamed_addr global { { [19 x i8]*, i32, i32 }, { i16, i16, [4 x i8] }*, i8*, i8 } { { [19 x i8]*, i32, i32 } { [19 x i8]* @.src, i32 2, i32 18 }, { i16, i16, [4 x i8] }* @0, i8* bitcast (i8** @_ZTI1A to i8*), i8 4 }
+
+define void @_Z3BarP1A(%struct.A* %a) uwtable sanitize_address {
+; CHECK-LABEL: define void @_Z3BarP1A
+entry:
+  %0 = bitcast %struct.A* %a to void (%struct.A*)***
+  %vtable = load void (%struct.A*)*** %0, align 8
+; CHECK: __asan_report_load8
+  %1 = load void (%struct.A*)** %vtable, align 8
+; CHECK: __asan_report_load8
+  %2 = ptrtoint void (%struct.A*)** %vtable to i64
+  %3 = xor i64 %2, -303164226014115343, !nosanitize !0
+  %4 = mul i64 %3, -7070675565921424023, !nosanitize !0
+  %5 = lshr i64 %4, 47, !nosanitize !0
+  %6 = xor i64 %4, %2, !nosanitize !0
+  %7 = xor i64 %6, %5, !nosanitize !0
+  %8 = mul i64 %7, -7070675565921424023, !nosanitize !0
+  %9 = lshr i64 %8, 47, !nosanitize !0
+  %10 = xor i64 %9, %8, !nosanitize !0
+  %11 = mul i64 %10, -7070675565921424023, !nosanitize !0
+  %12 = and i64 %11, 127, !nosanitize !0
+  %13 = getelementptr inbounds [128 x i64]* @__ubsan_vptr_type_cache, i64 0, i64 %12, !nosanitize !0
+; CHECK-NOT: __asan_report_load8
+  %14 = load i64* %13, align 8, !nosanitize !0
+  %15 = icmp eq i64 %14, %11, !nosanitize !0
+  br i1 %15, label %cont, label %handler.dynamic_type_cache_miss, !nosanitize !0
+
+handler.dynamic_type_cache_miss:                  ; preds = %entry
+  %16 = ptrtoint %struct.A* %a to i64, !nosanitize !0
+  tail call void @__ubsan_handle_dynamic_type_cache_miss(i8* bitcast ({ { [19 x i8]*, i32, i32 }, { i16, i16, [4 x i8] }*, i8*, i8 }* @1 to i8*), i64 %16, i64 %11) #2, !nosanitize !0
+  br label %cont, !nosanitize !0
+
+cont:                                             ; preds = %handler.dynamic_type_cache_miss, %entry
+  tail call void %1(%struct.A* %a)
+; CHECK: ret void
+  ret void
+}
+
+!0 = metadata !{}

diff --git a/test/Instrumentation/DataFlowSanitizer/Inputs/debuglist.txt b/test/Instrumentation/DataFlowSanitizer/Inputs/debuglist.txt
new file mode 100644
index 0000000..daf7b5f
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/Inputs/debuglist.txt

@@ -0,0 +1,2 @@
+fun:main=uninstrumented
+fun:main=discard

diff --git a/test/Instrumentation/DataFlowSanitizer/abilist.ll b/test/Instrumentation/DataFlowSanitizer/abilist.ll
index 66ddc14..ebf55d9 100644
--- a/test/Instrumentation/DataFlowSanitizer/abilist.ll
+++ b/test/Instrumentation/DataFlowSanitizer/abilist.ll

@@ -12,16 +12,38 @@
   ret i32 %c
 }
 
+; CHECK: define i32 (i32, i32)* @discardg(i32)
+; CHECK: %[[CALL:.*]] = call { i32 (i32, i32)*, i16 } @"dfs$g"(i32 %0, i16 0)
+; CHECK: %[[XVAL:.*]] = extractvalue { i32 (i32, i32)*, i16 } %[[CALL]], 0
+; CHECK: ret {{.*}} %[[XVAL]]
+@discardg = alias i32 (i32, i32)* (i32)* @g
+
 declare void @custom1(i32 %a, i32 %b)
 
+; CHECK: define linkonce_odr { i32, i16 } @"dfsw$custom2"(i32, i32, i16, i16)
+; CHECK: %[[LABELRETURN2:.*]] = alloca i16
+; CHECK: %[[RV:.*]] = call i32 @__dfsw_custom2
+; CHECK: %[[RVSHADOW:.*]] = load i16* %[[LABELRETURN2]]
+; CHECK: insertvalue {{.*}}[[RV]], 0
+; CHECK: insertvalue {{.*}}[[RVSHADOW]], 1
+; CHECK: ret { i32, i16 }
 declare i32 @custom2(i32 %a, i32 %b)
 
+; CHECK: define linkonce_odr void @"dfsw$custom3"(i32, i16, i16*, ...)
+; CHECK: call void @__dfsan_vararg_wrapper(i8*
+; CHECK: unreachable
+declare void @custom3(i32 %a, ...)
+
+declare i32 @custom4(i32 %a, ...)
+
 declare void @customcb(i32 (i32)* %cb)
 
 declare i32 @cb(i32)
 
 ; CHECK: @"dfs$f"
-define void @f() {
+define void @f(i32 %x) {
+  ; CHECK: %[[LABELVA2:.*]] = alloca [2 x i16]
+  ; CHECK: %[[LABELVA1:.*]] = alloca [2 x i16]
   ; CHECK: %[[LABELRETURN:.*]] = alloca i16
 
   ; CHECK: call void @__dfsw_custom1(i32 1, i32 2, i16 0, i16 0)
@@ -33,23 +55,22 @@
   ; CHECK: call void @__dfsw_customcb({{.*}} @"dfst0$customcb", i8* bitcast ({{.*}} @"dfs$cb" to i8*), i16 0)
   call void @customcb(i32 (i32)* @cb)
 
+  ; CHECK: %[[LABELVA1_0:.*]] = getelementptr inbounds [2 x i16]* %[[LABELVA1]], i32 0, i32 0
+  ; CHECK: store i16 0, i16* %[[LABELVA1_0]]
+  ; CHECK: %[[LABELVA1_1:.*]] = getelementptr inbounds [2 x i16]* %[[LABELVA1]], i32 0, i32 1
+  ; CHECK: store i16 %{{.*}}, i16* %[[LABELVA1_1]]
+  ; CHECK: %[[LABELVA1_0A:.*]] = getelementptr inbounds [2 x i16]* %[[LABELVA1]], i32 0, i32 0
+  ; CHECK: call void (i32, i16, i16*, ...)* @__dfsw_custom3(i32 1, i16 0, i16* %[[LABELVA1_0A]], i32 2, i32 %{{.*}})
+  call void (i32, ...)* @custom3(i32 1, i32 2, i32 %x)
+
+  ; CHECK: %[[LABELVA2_0:.*]] = getelementptr inbounds [2 x i16]* %[[LABELVA2]], i32 0, i32 0
+  ; CHECK: %[[LABELVA2_0A:.*]] = getelementptr inbounds [2 x i16]* %[[LABELVA2]], i32 0, i32 0
+  ; CHECK: call i32 (i32, i16, i16*, i16*, ...)* @__dfsw_custom4(i32 1, i16 0, i16* %[[LABELVA2_0A]], i16* %[[LABELRETURN]], i32 2, i32 3)
+  call i32 (i32, ...)* @custom4(i32 1, i32 2, i32 3)
+
   ret void
 }
 
-; CHECK: define i32 (i32, i32)* @discardg(i32)
-; CHECK: %[[CALL:.*]] = call { i32 (i32, i32)*, i16 } @"dfs$g"(i32 %0, i16 0)
-; CHECK: %[[XVAL:.*]] = extractvalue { i32 (i32, i32)*, i16 } %[[CALL]], 0
-; CHECK: ret {{.*}} %[[XVAL]]
-@discardg = alias i32 (i32, i32)* (i32)* @g
-
-; CHECK: define linkonce_odr { i32, i16 } @"dfsw$custom2"(i32, i32, i16, i16)
-; CHECK: %[[LABELRETURN2:.*]] = alloca i16
-; CHECK: %[[RV:.*]] = call i32 @__dfsw_custom2
-; CHECK: %[[RVSHADOW:.*]] = load i16* %[[LABELRETURN2]]
-; CHECK: insertvalue {{.*}}[[RV]], 0
-; CHECK: insertvalue {{.*}}[[RVSHADOW]], 1
-; CHECK: ret { i32, i16 }
-
 ; CHECK: @"dfs$g"
 define i32 (i32, i32)* @g(i32) {
   ; CHECK: ret {{.*}} @"dfsw$custom2"
@@ -73,3 +94,6 @@
 ; CHECK: %[[XVAL1:.*]] = extractvalue { i32, i16 } %[[CALL]], 1
 ; CHECK: store i16 %[[XVAL1]], i16* %3
 ; CHECK: ret i32 %[[XVAL0]]
+
+; CHECK: declare void @__dfsw_custom3(i32, i16, i16*, ...)
+; CHECK: declare i32 @__dfsw_custom4(i32, i16, i16*, i16*, ...)

diff --git a/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll b/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll
index 6bcd5c5..eb28c2c 100644
--- a/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll
+++ b/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll

@@ -3,13 +3,16 @@
 
 declare i32 @g()
 
-; CHECK: define { i32, i16 } @"dfs$f"(i32, i16)
-define i32 @f(i32) {
+; CHECK: define { i32, i16 } @"dfs$f"(i32, i32, i16, i16)
+define i32 @f(i32, i32) {
   ; CHECK: [[LOCALLABELALLOCA:%.*]] = alloca i16
-  ; CHECK: [[ARGCMP:%.*]] = icmp ne i16 %1, 0
-  ; CHECK: br i1 [[ARGCMP]]
   %i = alloca i32
-  store i32 %0, i32* %i
+  ; CHECK: [[ARGCMP1:%.*]] = icmp ne i16 %3, 0
+  ; CHECK: br i1 [[ARGCMP1]]
+  ; CHECK: [[ARGCMP2:%.*]] = icmp ne i16 %2, 0
+  ; CHECK: br i1 [[ARGCMP2]]
+  %x = add i32 %0, %1
+  store i32 %x, i32* %i
   ; CHECK: [[CALL:%.*]] = call { i32, i16 } @"dfs$g"()
   ; CHECK: [[CALLLABEL:%.*]] = extractvalue { i32, i16 } [[CALL]], 1
   ; CHECK: [[CALLCMP:%.*]] = icmp ne i16 [[CALLLABEL]], 0

diff --git a/test/Instrumentation/DataFlowSanitizer/debug.ll b/test/Instrumentation/DataFlowSanitizer/debug.ll
new file mode 100644
index 0000000..cfc9dd9
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/debug.ll

@@ -0,0 +1,36 @@
+; RUN: opt < %s -dfsan -dfsan-abilist=%S/Inputs/debuglist.txt -S | FileCheck %s
+
+; CHECK: i32 ()* @main, {{.*}} ; [ DW_TAG_subprogram ] {{.*}} [main]
+
+; Generated from a simple source file compiled with clang -g:
+; int main() {
+; }
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+entry:
+  ret i32 0, !dbg !12
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/debug.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"debug.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00main\00main\00\001\000\001\000\000\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/tmp/dbginfo/debug.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8}
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!11 = metadata !{metadata !"clang version 3.6.0 "}
+!12 = metadata !{i32 2, i32 1, metadata !4, null}

diff --git a/test/Instrumentation/DataFlowSanitizer/load.ll b/test/Instrumentation/DataFlowSanitizer/load.ll
index 6cd5151..8324224 100644
--- a/test/Instrumentation/DataFlowSanitizer/load.ll
+++ b/test/Instrumentation/DataFlowSanitizer/load.ll

@@ -2,6 +2,18 @@
 ; RUN: opt < %s -dfsan -dfsan-combine-pointer-labels-on-load=0 -S | FileCheck %s --check-prefix=NO_COMBINE_PTR_LABEL
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
+define {} @load0({}* %p) {
+  ; COMBINE_PTR_LABEL: @"dfs$load0"
+  ; COMBINE_PTR_LABEL: load
+  ; COMBINE_PTR_LABEL-NOT: load
+
+  ; NO_COMBINE_PTR_LABEL: @"dfs$load0"
+  ; NO_COMBINE_PTR_LABEL: load
+  ; NO_COMBINE_PTR_LABEL-NOT: load
+  %a = load {}* %p
+  ret {} %a
+}
+
 define i8 @load8(i8* %p) {
   ; COMBINE_PTR_LABEL: @"dfs$load8"
   ; COMBINE_PTR_LABEL: load i16*
@@ -152,4 +164,4 @@
 
   %a = load i64* %p
   ret i64 %a
-}
\ No newline at end of file
+}

diff --git a/test/Instrumentation/DataFlowSanitizer/store.ll b/test/Instrumentation/DataFlowSanitizer/store.ll
index 8060537..d14bdb6 100644
--- a/test/Instrumentation/DataFlowSanitizer/store.ll
+++ b/test/Instrumentation/DataFlowSanitizer/store.ll

@@ -2,6 +2,19 @@
 ; RUN: opt < %s -dfsan -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefix=NO_COMBINE_PTR_LABEL
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
+define void @store0({} %v, {}* %p) {
+  ; COMBINE_PTR_LABEL: @"dfs$store0"
+  ; COMBINE_PTR_LABEL: store
+  ; COMBINE_PTR_LABEL-NOT: store
+
+  ; NO_COMBINE_PTR_LABEL: @"dfs$store0"
+  ; NO_COMBINE_PTR_LABEL: store
+  ; NO_COMBINE_PTR_LABEL-NOT: store
+
+  store {} %v, {}* %p
+  ret void
+}
+
 define void @store8(i8 %v, i8* %p) {
   ; NO_COMBINE_PTR_LABEL: @"dfs$store8"
   ; NO_COMBINE_PTR_LABEL: load i16* {{.*}} @__dfsan_arg_tls

diff --git a/test/Instrumentation/DataFlowSanitizer/union-large.ll b/test/Instrumentation/DataFlowSanitizer/union-large.ll
new file mode 100644
index 0000000..a388f73
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/union-large.ll

@@ -0,0 +1,3013 @@
+; RUN: opt < %s -dfsan -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Check that we use dfsan_union in large functions instead of __dfsan_union.
+
+; CHECK-LABEL: @"dfs$foo"
+define i32 @foo(i32 %a, i32 %b) {
+bb0:
+  br label %bb1
+
+bb1:
+  br label %bb2
+
+bb2:
+  br label %bb3
+
+bb3:
+  br label %bb4
+
+bb4:
+  br label %bb5
+
+bb5:
+  br label %bb6
+
+bb6:
+  br label %bb7
+
+bb7:
+  br label %bb8
+
+bb8:
+  br label %bb9
+
+bb9:
+  br label %bb10
+
+bb10:
+  br label %bb11
+
+bb11:
+  br label %bb12
+
+bb12:
+  br label %bb13
+
+bb13:
+  br label %bb14
+
+bb14:
+  br label %bb15
+
+bb15:
+  br label %bb16
+
+bb16:
+  br label %bb17
+
+bb17:
+  br label %bb18
+
+bb18:
+  br label %bb19
+
+bb19:
+  br label %bb20
+
+bb20:
+  br label %bb21
+
+bb21:
+  br label %bb22
+
+bb22:
+  br label %bb23
+
+bb23:
+  br label %bb24
+
+bb24:
+  br label %bb25
+
+bb25:
+  br label %bb26
+
+bb26:
+  br label %bb27
+
+bb27:
+  br label %bb28
+
+bb28:
+  br label %bb29
+
+bb29:
+  br label %bb30
+
+bb30:
+  br label %bb31
+
+bb31:
+  br label %bb32
+
+bb32:
+  br label %bb33
+
+bb33:
+  br label %bb34
+
+bb34:
+  br label %bb35
+
+bb35:
+  br label %bb36
+
+bb36:
+  br label %bb37
+
+bb37:
+  br label %bb38
+
+bb38:
+  br label %bb39
+
+bb39:
+  br label %bb40
+
+bb40:
+  br label %bb41
+
+bb41:
+  br label %bb42
+
+bb42:
+  br label %bb43
+
+bb43:
+  br label %bb44
+
+bb44:
+  br label %bb45
+
+bb45:
+  br label %bb46
+
+bb46:
+  br label %bb47
+
+bb47:
+  br label %bb48
+
+bb48:
+  br label %bb49
+
+bb49:
+  br label %bb50
+
+bb50:
+  br label %bb51
+
+bb51:
+  br label %bb52
+
+bb52:
+  br label %bb53
+
+bb53:
+  br label %bb54
+
+bb54:
+  br label %bb55
+
+bb55:
+  br label %bb56
+
+bb56:
+  br label %bb57
+
+bb57:
+  br label %bb58
+
+bb58:
+  br label %bb59
+
+bb59:
+  br label %bb60
+
+bb60:
+  br label %bb61
+
+bb61:
+  br label %bb62
+
+bb62:
+  br label %bb63
+
+bb63:
+  br label %bb64
+
+bb64:
+  br label %bb65
+
+bb65:
+  br label %bb66
+
+bb66:
+  br label %bb67
+
+bb67:
+  br label %bb68
+
+bb68:
+  br label %bb69
+
+bb69:
+  br label %bb70
+
+bb70:
+  br label %bb71
+
+bb71:
+  br label %bb72
+
+bb72:
+  br label %bb73
+
+bb73:
+  br label %bb74
+
+bb74:
+  br label %bb75
+
+bb75:
+  br label %bb76
+
+bb76:
+  br label %bb77
+
+bb77:
+  br label %bb78
+
+bb78:
+  br label %bb79
+
+bb79:
+  br label %bb80
+
+bb80:
+  br label %bb81
+
+bb81:
+  br label %bb82
+
+bb82:
+  br label %bb83
+
+bb83:
+  br label %bb84
+
+bb84:
+  br label %bb85
+
+bb85:
+  br label %bb86
+
+bb86:
+  br label %bb87
+
+bb87:
+  br label %bb88
+
+bb88:
+  br label %bb89
+
+bb89:
+  br label %bb90
+
+bb90:
+  br label %bb91
+
+bb91:
+  br label %bb92
+
+bb92:
+  br label %bb93
+
+bb93:
+  br label %bb94
+
+bb94:
+  br label %bb95
+
+bb95:
+  br label %bb96
+
+bb96:
+  br label %bb97
+
+bb97:
+  br label %bb98
+
+bb98:
+  br label %bb99
+
+bb99:
+  br label %bb100
+
+bb100:
+  br label %bb101
+
+bb101:
+  br label %bb102
+
+bb102:
+  br label %bb103
+
+bb103:
+  br label %bb104
+
+bb104:
+  br label %bb105
+
+bb105:
+  br label %bb106
+
+bb106:
+  br label %bb107
+
+bb107:
+  br label %bb108
+
+bb108:
+  br label %bb109
+
+bb109:
+  br label %bb110
+
+bb110:
+  br label %bb111
+
+bb111:
+  br label %bb112
+
+bb112:
+  br label %bb113
+
+bb113:
+  br label %bb114
+
+bb114:
+  br label %bb115
+
+bb115:
+  br label %bb116
+
+bb116:
+  br label %bb117
+
+bb117:
+  br label %bb118
+
+bb118:
+  br label %bb119
+
+bb119:
+  br label %bb120
+
+bb120:
+  br label %bb121
+
+bb121:
+  br label %bb122
+
+bb122:
+  br label %bb123
+
+bb123:
+  br label %bb124
+
+bb124:
+  br label %bb125
+
+bb125:
+  br label %bb126
+
+bb126:
+  br label %bb127
+
+bb127:
+  br label %bb128
+
+bb128:
+  br label %bb129
+
+bb129:
+  br label %bb130
+
+bb130:
+  br label %bb131
+
+bb131:
+  br label %bb132
+
+bb132:
+  br label %bb133
+
+bb133:
+  br label %bb134
+
+bb134:
+  br label %bb135
+
+bb135:
+  br label %bb136
+
+bb136:
+  br label %bb137
+
+bb137:
+  br label %bb138
+
+bb138:
+  br label %bb139
+
+bb139:
+  br label %bb140
+
+bb140:
+  br label %bb141
+
+bb141:
+  br label %bb142
+
+bb142:
+  br label %bb143
+
+bb143:
+  br label %bb144
+
+bb144:
+  br label %bb145
+
+bb145:
+  br label %bb146
+
+bb146:
+  br label %bb147
+
+bb147:
+  br label %bb148
+
+bb148:
+  br label %bb149
+
+bb149:
+  br label %bb150
+
+bb150:
+  br label %bb151
+
+bb151:
+  br label %bb152
+
+bb152:
+  br label %bb153
+
+bb153:
+  br label %bb154
+
+bb154:
+  br label %bb155
+
+bb155:
+  br label %bb156
+
+bb156:
+  br label %bb157
+
+bb157:
+  br label %bb158
+
+bb158:
+  br label %bb159
+
+bb159:
+  br label %bb160
+
+bb160:
+  br label %bb161
+
+bb161:
+  br label %bb162
+
+bb162:
+  br label %bb163
+
+bb163:
+  br label %bb164
+
+bb164:
+  br label %bb165
+
+bb165:
+  br label %bb166
+
+bb166:
+  br label %bb167
+
+bb167:
+  br label %bb168
+
+bb168:
+  br label %bb169
+
+bb169:
+  br label %bb170
+
+bb170:
+  br label %bb171
+
+bb171:
+  br label %bb172
+
+bb172:
+  br label %bb173
+
+bb173:
+  br label %bb174
+
+bb174:
+  br label %bb175
+
+bb175:
+  br label %bb176
+
+bb176:
+  br label %bb177
+
+bb177:
+  br label %bb178
+
+bb178:
+  br label %bb179
+
+bb179:
+  br label %bb180
+
+bb180:
+  br label %bb181
+
+bb181:
+  br label %bb182
+
+bb182:
+  br label %bb183
+
+bb183:
+  br label %bb184
+
+bb184:
+  br label %bb185
+
+bb185:
+  br label %bb186
+
+bb186:
+  br label %bb187
+
+bb187:
+  br label %bb188
+
+bb188:
+  br label %bb189
+
+bb189:
+  br label %bb190
+
+bb190:
+  br label %bb191
+
+bb191:
+  br label %bb192
+
+bb192:
+  br label %bb193
+
+bb193:
+  br label %bb194
+
+bb194:
+  br label %bb195
+
+bb195:
+  br label %bb196
+
+bb196:
+  br label %bb197
+
+bb197:
+  br label %bb198
+
+bb198:
+  br label %bb199
+
+bb199:
+  br label %bb200
+
+bb200:
+  br label %bb201
+
+bb201:
+  br label %bb202
+
+bb202:
+  br label %bb203
+
+bb203:
+  br label %bb204
+
+bb204:
+  br label %bb205
+
+bb205:
+  br label %bb206
+
+bb206:
+  br label %bb207
+
+bb207:
+  br label %bb208
+
+bb208:
+  br label %bb209
+
+bb209:
+  br label %bb210
+
+bb210:
+  br label %bb211
+
+bb211:
+  br label %bb212
+
+bb212:
+  br label %bb213
+
+bb213:
+  br label %bb214
+
+bb214:
+  br label %bb215
+
+bb215:
+  br label %bb216
+
+bb216:
+  br label %bb217
+
+bb217:
+  br label %bb218
+
+bb218:
+  br label %bb219
+
+bb219:
+  br label %bb220
+
+bb220:
+  br label %bb221
+
+bb221:
+  br label %bb222
+
+bb222:
+  br label %bb223
+
+bb223:
+  br label %bb224
+
+bb224:
+  br label %bb225
+
+bb225:
+  br label %bb226
+
+bb226:
+  br label %bb227
+
+bb227:
+  br label %bb228
+
+bb228:
+  br label %bb229
+
+bb229:
+  br label %bb230
+
+bb230:
+  br label %bb231
+
+bb231:
+  br label %bb232
+
+bb232:
+  br label %bb233
+
+bb233:
+  br label %bb234
+
+bb234:
+  br label %bb235
+
+bb235:
+  br label %bb236
+
+bb236:
+  br label %bb237
+
+bb237:
+  br label %bb238
+
+bb238:
+  br label %bb239
+
+bb239:
+  br label %bb240
+
+bb240:
+  br label %bb241
+
+bb241:
+  br label %bb242
+
+bb242:
+  br label %bb243
+
+bb243:
+  br label %bb244
+
+bb244:
+  br label %bb245
+
+bb245:
+  br label %bb246
+
+bb246:
+  br label %bb247
+
+bb247:
+  br label %bb248
+
+bb248:
+  br label %bb249
+
+bb249:
+  br label %bb250
+
+bb250:
+  br label %bb251
+
+bb251:
+  br label %bb252
+
+bb252:
+  br label %bb253
+
+bb253:
+  br label %bb254
+
+bb254:
+  br label %bb255
+
+bb255:
+  br label %bb256
+
+bb256:
+  br label %bb257
+
+bb257:
+  br label %bb258
+
+bb258:
+  br label %bb259
+
+bb259:
+  br label %bb260
+
+bb260:
+  br label %bb261
+
+bb261:
+  br label %bb262
+
+bb262:
+  br label %bb263
+
+bb263:
+  br label %bb264
+
+bb264:
+  br label %bb265
+
+bb265:
+  br label %bb266
+
+bb266:
+  br label %bb267
+
+bb267:
+  br label %bb268
+
+bb268:
+  br label %bb269
+
+bb269:
+  br label %bb270
+
+bb270:
+  br label %bb271
+
+bb271:
+  br label %bb272
+
+bb272:
+  br label %bb273
+
+bb273:
+  br label %bb274
+
+bb274:
+  br label %bb275
+
+bb275:
+  br label %bb276
+
+bb276:
+  br label %bb277
+
+bb277:
+  br label %bb278
+
+bb278:
+  br label %bb279
+
+bb279:
+  br label %bb280
+
+bb280:
+  br label %bb281
+
+bb281:
+  br label %bb282
+
+bb282:
+  br label %bb283
+
+bb283:
+  br label %bb284
+
+bb284:
+  br label %bb285
+
+bb285:
+  br label %bb286
+
+bb286:
+  br label %bb287
+
+bb287:
+  br label %bb288
+
+bb288:
+  br label %bb289
+
+bb289:
+  br label %bb290
+
+bb290:
+  br label %bb291
+
+bb291:
+  br label %bb292
+
+bb292:
+  br label %bb293
+
+bb293:
+  br label %bb294
+
+bb294:
+  br label %bb295
+
+bb295:
+  br label %bb296
+
+bb296:
+  br label %bb297
+
+bb297:
+  br label %bb298
+
+bb298:
+  br label %bb299
+
+bb299:
+  br label %bb300
+
+bb300:
+  br label %bb301
+
+bb301:
+  br label %bb302
+
+bb302:
+  br label %bb303
+
+bb303:
+  br label %bb304
+
+bb304:
+  br label %bb305
+
+bb305:
+  br label %bb306
+
+bb306:
+  br label %bb307
+
+bb307:
+  br label %bb308
+
+bb308:
+  br label %bb309
+
+bb309:
+  br label %bb310
+
+bb310:
+  br label %bb311
+
+bb311:
+  br label %bb312
+
+bb312:
+  br label %bb313
+
+bb313:
+  br label %bb314
+
+bb314:
+  br label %bb315
+
+bb315:
+  br label %bb316
+
+bb316:
+  br label %bb317
+
+bb317:
+  br label %bb318
+
+bb318:
+  br label %bb319
+
+bb319:
+  br label %bb320
+
+bb320:
+  br label %bb321
+
+bb321:
+  br label %bb322
+
+bb322:
+  br label %bb323
+
+bb323:
+  br label %bb324
+
+bb324:
+  br label %bb325
+
+bb325:
+  br label %bb326
+
+bb326:
+  br label %bb327
+
+bb327:
+  br label %bb328
+
+bb328:
+  br label %bb329
+
+bb329:
+  br label %bb330
+
+bb330:
+  br label %bb331
+
+bb331:
+  br label %bb332
+
+bb332:
+  br label %bb333
+
+bb333:
+  br label %bb334
+
+bb334:
+  br label %bb335
+
+bb335:
+  br label %bb336
+
+bb336:
+  br label %bb337
+
+bb337:
+  br label %bb338
+
+bb338:
+  br label %bb339
+
+bb339:
+  br label %bb340
+
+bb340:
+  br label %bb341
+
+bb341:
+  br label %bb342
+
+bb342:
+  br label %bb343
+
+bb343:
+  br label %bb344
+
+bb344:
+  br label %bb345
+
+bb345:
+  br label %bb346
+
+bb346:
+  br label %bb347
+
+bb347:
+  br label %bb348
+
+bb348:
+  br label %bb349
+
+bb349:
+  br label %bb350
+
+bb350:
+  br label %bb351
+
+bb351:
+  br label %bb352
+
+bb352:
+  br label %bb353
+
+bb353:
+  br label %bb354
+
+bb354:
+  br label %bb355
+
+bb355:
+  br label %bb356
+
+bb356:
+  br label %bb357
+
+bb357:
+  br label %bb358
+
+bb358:
+  br label %bb359
+
+bb359:
+  br label %bb360
+
+bb360:
+  br label %bb361
+
+bb361:
+  br label %bb362
+
+bb362:
+  br label %bb363
+
+bb363:
+  br label %bb364
+
+bb364:
+  br label %bb365
+
+bb365:
+  br label %bb366
+
+bb366:
+  br label %bb367
+
+bb367:
+  br label %bb368
+
+bb368:
+  br label %bb369
+
+bb369:
+  br label %bb370
+
+bb370:
+  br label %bb371
+
+bb371:
+  br label %bb372
+
+bb372:
+  br label %bb373
+
+bb373:
+  br label %bb374
+
+bb374:
+  br label %bb375
+
+bb375:
+  br label %bb376
+
+bb376:
+  br label %bb377
+
+bb377:
+  br label %bb378
+
+bb378:
+  br label %bb379
+
+bb379:
+  br label %bb380
+
+bb380:
+  br label %bb381
+
+bb381:
+  br label %bb382
+
+bb382:
+  br label %bb383
+
+bb383:
+  br label %bb384
+
+bb384:
+  br label %bb385
+
+bb385:
+  br label %bb386
+
+bb386:
+  br label %bb387
+
+bb387:
+  br label %bb388
+
+bb388:
+  br label %bb389
+
+bb389:
+  br label %bb390
+
+bb390:
+  br label %bb391
+
+bb391:
+  br label %bb392
+
+bb392:
+  br label %bb393
+
+bb393:
+  br label %bb394
+
+bb394:
+  br label %bb395
+
+bb395:
+  br label %bb396
+
+bb396:
+  br label %bb397
+
+bb397:
+  br label %bb398
+
+bb398:
+  br label %bb399
+
+bb399:
+  br label %bb400
+
+bb400:
+  br label %bb401
+
+bb401:
+  br label %bb402
+
+bb402:
+  br label %bb403
+
+bb403:
+  br label %bb404
+
+bb404:
+  br label %bb405
+
+bb405:
+  br label %bb406
+
+bb406:
+  br label %bb407
+
+bb407:
+  br label %bb408
+
+bb408:
+  br label %bb409
+
+bb409:
+  br label %bb410
+
+bb410:
+  br label %bb411
+
+bb411:
+  br label %bb412
+
+bb412:
+  br label %bb413
+
+bb413:
+  br label %bb414
+
+bb414:
+  br label %bb415
+
+bb415:
+  br label %bb416
+
+bb416:
+  br label %bb417
+
+bb417:
+  br label %bb418
+
+bb418:
+  br label %bb419
+
+bb419:
+  br label %bb420
+
+bb420:
+  br label %bb421
+
+bb421:
+  br label %bb422
+
+bb422:
+  br label %bb423
+
+bb423:
+  br label %bb424
+
+bb424:
+  br label %bb425
+
+bb425:
+  br label %bb426
+
+bb426:
+  br label %bb427
+
+bb427:
+  br label %bb428
+
+bb428:
+  br label %bb429
+
+bb429:
+  br label %bb430
+
+bb430:
+  br label %bb431
+
+bb431:
+  br label %bb432
+
+bb432:
+  br label %bb433
+
+bb433:
+  br label %bb434
+
+bb434:
+  br label %bb435
+
+bb435:
+  br label %bb436
+
+bb436:
+  br label %bb437
+
+bb437:
+  br label %bb438
+
+bb438:
+  br label %bb439
+
+bb439:
+  br label %bb440
+
+bb440:
+  br label %bb441
+
+bb441:
+  br label %bb442
+
+bb442:
+  br label %bb443
+
+bb443:
+  br label %bb444
+
+bb444:
+  br label %bb445
+
+bb445:
+  br label %bb446
+
+bb446:
+  br label %bb447
+
+bb447:
+  br label %bb448
+
+bb448:
+  br label %bb449
+
+bb449:
+  br label %bb450
+
+bb450:
+  br label %bb451
+
+bb451:
+  br label %bb452
+
+bb452:
+  br label %bb453
+
+bb453:
+  br label %bb454
+
+bb454:
+  br label %bb455
+
+bb455:
+  br label %bb456
+
+bb456:
+  br label %bb457
+
+bb457:
+  br label %bb458
+
+bb458:
+  br label %bb459
+
+bb459:
+  br label %bb460
+
+bb460:
+  br label %bb461
+
+bb461:
+  br label %bb462
+
+bb462:
+  br label %bb463
+
+bb463:
+  br label %bb464
+
+bb464:
+  br label %bb465
+
+bb465:
+  br label %bb466
+
+bb466:
+  br label %bb467
+
+bb467:
+  br label %bb468
+
+bb468:
+  br label %bb469
+
+bb469:
+  br label %bb470
+
+bb470:
+  br label %bb471
+
+bb471:
+  br label %bb472
+
+bb472:
+  br label %bb473
+
+bb473:
+  br label %bb474
+
+bb474:
+  br label %bb475
+
+bb475:
+  br label %bb476
+
+bb476:
+  br label %bb477
+
+bb477:
+  br label %bb478
+
+bb478:
+  br label %bb479
+
+bb479:
+  br label %bb480
+
+bb480:
+  br label %bb481
+
+bb481:
+  br label %bb482
+
+bb482:
+  br label %bb483
+
+bb483:
+  br label %bb484
+
+bb484:
+  br label %bb485
+
+bb485:
+  br label %bb486
+
+bb486:
+  br label %bb487
+
+bb487:
+  br label %bb488
+
+bb488:
+  br label %bb489
+
+bb489:
+  br label %bb490
+
+bb490:
+  br label %bb491
+
+bb491:
+  br label %bb492
+
+bb492:
+  br label %bb493
+
+bb493:
+  br label %bb494
+
+bb494:
+  br label %bb495
+
+bb495:
+  br label %bb496
+
+bb496:
+  br label %bb497
+
+bb497:
+  br label %bb498
+
+bb498:
+  br label %bb499
+
+bb499:
+  br label %bb500
+
+bb500:
+  br label %bb501
+
+bb501:
+  br label %bb502
+
+bb502:
+  br label %bb503
+
+bb503:
+  br label %bb504
+
+bb504:
+  br label %bb505
+
+bb505:
+  br label %bb506
+
+bb506:
+  br label %bb507
+
+bb507:
+  br label %bb508
+
+bb508:
+  br label %bb509
+
+bb509:
+  br label %bb510
+
+bb510:
+  br label %bb511
+
+bb511:
+  br label %bb512
+
+bb512:
+  br label %bb513
+
+bb513:
+  br label %bb514
+
+bb514:
+  br label %bb515
+
+bb515:
+  br label %bb516
+
+bb516:
+  br label %bb517
+
+bb517:
+  br label %bb518
+
+bb518:
+  br label %bb519
+
+bb519:
+  br label %bb520
+
+bb520:
+  br label %bb521
+
+bb521:
+  br label %bb522
+
+bb522:
+  br label %bb523
+
+bb523:
+  br label %bb524
+
+bb524:
+  br label %bb525
+
+bb525:
+  br label %bb526
+
+bb526:
+  br label %bb527
+
+bb527:
+  br label %bb528
+
+bb528:
+  br label %bb529
+
+bb529:
+  br label %bb530
+
+bb530:
+  br label %bb531
+
+bb531:
+  br label %bb532
+
+bb532:
+  br label %bb533
+
+bb533:
+  br label %bb534
+
+bb534:
+  br label %bb535
+
+bb535:
+  br label %bb536
+
+bb536:
+  br label %bb537
+
+bb537:
+  br label %bb538
+
+bb538:
+  br label %bb539
+
+bb539:
+  br label %bb540
+
+bb540:
+  br label %bb541
+
+bb541:
+  br label %bb542
+
+bb542:
+  br label %bb543
+
+bb543:
+  br label %bb544
+
+bb544:
+  br label %bb545
+
+bb545:
+  br label %bb546
+
+bb546:
+  br label %bb547
+
+bb547:
+  br label %bb548
+
+bb548:
+  br label %bb549
+
+bb549:
+  br label %bb550
+
+bb550:
+  br label %bb551
+
+bb551:
+  br label %bb552
+
+bb552:
+  br label %bb553
+
+bb553:
+  br label %bb554
+
+bb554:
+  br label %bb555
+
+bb555:
+  br label %bb556
+
+bb556:
+  br label %bb557
+
+bb557:
+  br label %bb558
+
+bb558:
+  br label %bb559
+
+bb559:
+  br label %bb560
+
+bb560:
+  br label %bb561
+
+bb561:
+  br label %bb562
+
+bb562:
+  br label %bb563
+
+bb563:
+  br label %bb564
+
+bb564:
+  br label %bb565
+
+bb565:
+  br label %bb566
+
+bb566:
+  br label %bb567
+
+bb567:
+  br label %bb568
+
+bb568:
+  br label %bb569
+
+bb569:
+  br label %bb570
+
+bb570:
+  br label %bb571
+
+bb571:
+  br label %bb572
+
+bb572:
+  br label %bb573
+
+bb573:
+  br label %bb574
+
+bb574:
+  br label %bb575
+
+bb575:
+  br label %bb576
+
+bb576:
+  br label %bb577
+
+bb577:
+  br label %bb578
+
+bb578:
+  br label %bb579
+
+bb579:
+  br label %bb580
+
+bb580:
+  br label %bb581
+
+bb581:
+  br label %bb582
+
+bb582:
+  br label %bb583
+
+bb583:
+  br label %bb584
+
+bb584:
+  br label %bb585
+
+bb585:
+  br label %bb586
+
+bb586:
+  br label %bb587
+
+bb587:
+  br label %bb588
+
+bb588:
+  br label %bb589
+
+bb589:
+  br label %bb590
+
+bb590:
+  br label %bb591
+
+bb591:
+  br label %bb592
+
+bb592:
+  br label %bb593
+
+bb593:
+  br label %bb594
+
+bb594:
+  br label %bb595
+
+bb595:
+  br label %bb596
+
+bb596:
+  br label %bb597
+
+bb597:
+  br label %bb598
+
+bb598:
+  br label %bb599
+
+bb599:
+  br label %bb600
+
+bb600:
+  br label %bb601
+
+bb601:
+  br label %bb602
+
+bb602:
+  br label %bb603
+
+bb603:
+  br label %bb604
+
+bb604:
+  br label %bb605
+
+bb605:
+  br label %bb606
+
+bb606:
+  br label %bb607
+
+bb607:
+  br label %bb608
+
+bb608:
+  br label %bb609
+
+bb609:
+  br label %bb610
+
+bb610:
+  br label %bb611
+
+bb611:
+  br label %bb612
+
+bb612:
+  br label %bb613
+
+bb613:
+  br label %bb614
+
+bb614:
+  br label %bb615
+
+bb615:
+  br label %bb616
+
+bb616:
+  br label %bb617
+
+bb617:
+  br label %bb618
+
+bb618:
+  br label %bb619
+
+bb619:
+  br label %bb620
+
+bb620:
+  br label %bb621
+
+bb621:
+  br label %bb622
+
+bb622:
+  br label %bb623
+
+bb623:
+  br label %bb624
+
+bb624:
+  br label %bb625
+
+bb625:
+  br label %bb626
+
+bb626:
+  br label %bb627
+
+bb627:
+  br label %bb628
+
+bb628:
+  br label %bb629
+
+bb629:
+  br label %bb630
+
+bb630:
+  br label %bb631
+
+bb631:
+  br label %bb632
+
+bb632:
+  br label %bb633
+
+bb633:
+  br label %bb634
+
+bb634:
+  br label %bb635
+
+bb635:
+  br label %bb636
+
+bb636:
+  br label %bb637
+
+bb637:
+  br label %bb638
+
+bb638:
+  br label %bb639
+
+bb639:
+  br label %bb640
+
+bb640:
+  br label %bb641
+
+bb641:
+  br label %bb642
+
+bb642:
+  br label %bb643
+
+bb643:
+  br label %bb644
+
+bb644:
+  br label %bb645
+
+bb645:
+  br label %bb646
+
+bb646:
+  br label %bb647
+
+bb647:
+  br label %bb648
+
+bb648:
+  br label %bb649
+
+bb649:
+  br label %bb650
+
+bb650:
+  br label %bb651
+
+bb651:
+  br label %bb652
+
+bb652:
+  br label %bb653
+
+bb653:
+  br label %bb654
+
+bb654:
+  br label %bb655
+
+bb655:
+  br label %bb656
+
+bb656:
+  br label %bb657
+
+bb657:
+  br label %bb658
+
+bb658:
+  br label %bb659
+
+bb659:
+  br label %bb660
+
+bb660:
+  br label %bb661
+
+bb661:
+  br label %bb662
+
+bb662:
+  br label %bb663
+
+bb663:
+  br label %bb664
+
+bb664:
+  br label %bb665
+
+bb665:
+  br label %bb666
+
+bb666:
+  br label %bb667
+
+bb667:
+  br label %bb668
+
+bb668:
+  br label %bb669
+
+bb669:
+  br label %bb670
+
+bb670:
+  br label %bb671
+
+bb671:
+  br label %bb672
+
+bb672:
+  br label %bb673
+
+bb673:
+  br label %bb674
+
+bb674:
+  br label %bb675
+
+bb675:
+  br label %bb676
+
+bb676:
+  br label %bb677
+
+bb677:
+  br label %bb678
+
+bb678:
+  br label %bb679
+
+bb679:
+  br label %bb680
+
+bb680:
+  br label %bb681
+
+bb681:
+  br label %bb682
+
+bb682:
+  br label %bb683
+
+bb683:
+  br label %bb684
+
+bb684:
+  br label %bb685
+
+bb685:
+  br label %bb686
+
+bb686:
+  br label %bb687
+
+bb687:
+  br label %bb688
+
+bb688:
+  br label %bb689
+
+bb689:
+  br label %bb690
+
+bb690:
+  br label %bb691
+
+bb691:
+  br label %bb692
+
+bb692:
+  br label %bb693
+
+bb693:
+  br label %bb694
+
+bb694:
+  br label %bb695
+
+bb695:
+  br label %bb696
+
+bb696:
+  br label %bb697
+
+bb697:
+  br label %bb698
+
+bb698:
+  br label %bb699
+
+bb699:
+  br label %bb700
+
+bb700:
+  br label %bb701
+
+bb701:
+  br label %bb702
+
+bb702:
+  br label %bb703
+
+bb703:
+  br label %bb704
+
+bb704:
+  br label %bb705
+
+bb705:
+  br label %bb706
+
+bb706:
+  br label %bb707
+
+bb707:
+  br label %bb708
+
+bb708:
+  br label %bb709
+
+bb709:
+  br label %bb710
+
+bb710:
+  br label %bb711
+
+bb711:
+  br label %bb712
+
+bb712:
+  br label %bb713
+
+bb713:
+  br label %bb714
+
+bb714:
+  br label %bb715
+
+bb715:
+  br label %bb716
+
+bb716:
+  br label %bb717
+
+bb717:
+  br label %bb718
+
+bb718:
+  br label %bb719
+
+bb719:
+  br label %bb720
+
+bb720:
+  br label %bb721
+
+bb721:
+  br label %bb722
+
+bb722:
+  br label %bb723
+
+bb723:
+  br label %bb724
+
+bb724:
+  br label %bb725
+
+bb725:
+  br label %bb726
+
+bb726:
+  br label %bb727
+
+bb727:
+  br label %bb728
+
+bb728:
+  br label %bb729
+
+bb729:
+  br label %bb730
+
+bb730:
+  br label %bb731
+
+bb731:
+  br label %bb732
+
+bb732:
+  br label %bb733
+
+bb733:
+  br label %bb734
+
+bb734:
+  br label %bb735
+
+bb735:
+  br label %bb736
+
+bb736:
+  br label %bb737
+
+bb737:
+  br label %bb738
+
+bb738:
+  br label %bb739
+
+bb739:
+  br label %bb740
+
+bb740:
+  br label %bb741
+
+bb741:
+  br label %bb742
+
+bb742:
+  br label %bb743
+
+bb743:
+  br label %bb744
+
+bb744:
+  br label %bb745
+
+bb745:
+  br label %bb746
+
+bb746:
+  br label %bb747
+
+bb747:
+  br label %bb748
+
+bb748:
+  br label %bb749
+
+bb749:
+  br label %bb750
+
+bb750:
+  br label %bb751
+
+bb751:
+  br label %bb752
+
+bb752:
+  br label %bb753
+
+bb753:
+  br label %bb754
+
+bb754:
+  br label %bb755
+
+bb755:
+  br label %bb756
+
+bb756:
+  br label %bb757
+
+bb757:
+  br label %bb758
+
+bb758:
+  br label %bb759
+
+bb759:
+  br label %bb760
+
+bb760:
+  br label %bb761
+
+bb761:
+  br label %bb762
+
+bb762:
+  br label %bb763
+
+bb763:
+  br label %bb764
+
+bb764:
+  br label %bb765
+
+bb765:
+  br label %bb766
+
+bb766:
+  br label %bb767
+
+bb767:
+  br label %bb768
+
+bb768:
+  br label %bb769
+
+bb769:
+  br label %bb770
+
+bb770:
+  br label %bb771
+
+bb771:
+  br label %bb772
+
+bb772:
+  br label %bb773
+
+bb773:
+  br label %bb774
+
+bb774:
+  br label %bb775
+
+bb775:
+  br label %bb776
+
+bb776:
+  br label %bb777
+
+bb777:
+  br label %bb778
+
+bb778:
+  br label %bb779
+
+bb779:
+  br label %bb780
+
+bb780:
+  br label %bb781
+
+bb781:
+  br label %bb782
+
+bb782:
+  br label %bb783
+
+bb783:
+  br label %bb784
+
+bb784:
+  br label %bb785
+
+bb785:
+  br label %bb786
+
+bb786:
+  br label %bb787
+
+bb787:
+  br label %bb788
+
+bb788:
+  br label %bb789
+
+bb789:
+  br label %bb790
+
+bb790:
+  br label %bb791
+
+bb791:
+  br label %bb792
+
+bb792:
+  br label %bb793
+
+bb793:
+  br label %bb794
+
+bb794:
+  br label %bb795
+
+bb795:
+  br label %bb796
+
+bb796:
+  br label %bb797
+
+bb797:
+  br label %bb798
+
+bb798:
+  br label %bb799
+
+bb799:
+  br label %bb800
+
+bb800:
+  br label %bb801
+
+bb801:
+  br label %bb802
+
+bb802:
+  br label %bb803
+
+bb803:
+  br label %bb804
+
+bb804:
+  br label %bb805
+
+bb805:
+  br label %bb806
+
+bb806:
+  br label %bb807
+
+bb807:
+  br label %bb808
+
+bb808:
+  br label %bb809
+
+bb809:
+  br label %bb810
+
+bb810:
+  br label %bb811
+
+bb811:
+  br label %bb812
+
+bb812:
+  br label %bb813
+
+bb813:
+  br label %bb814
+
+bb814:
+  br label %bb815
+
+bb815:
+  br label %bb816
+
+bb816:
+  br label %bb817
+
+bb817:
+  br label %bb818
+
+bb818:
+  br label %bb819
+
+bb819:
+  br label %bb820
+
+bb820:
+  br label %bb821
+
+bb821:
+  br label %bb822
+
+bb822:
+  br label %bb823
+
+bb823:
+  br label %bb824
+
+bb824:
+  br label %bb825
+
+bb825:
+  br label %bb826
+
+bb826:
+  br label %bb827
+
+bb827:
+  br label %bb828
+
+bb828:
+  br label %bb829
+
+bb829:
+  br label %bb830
+
+bb830:
+  br label %bb831
+
+bb831:
+  br label %bb832
+
+bb832:
+  br label %bb833
+
+bb833:
+  br label %bb834
+
+bb834:
+  br label %bb835
+
+bb835:
+  br label %bb836
+
+bb836:
+  br label %bb837
+
+bb837:
+  br label %bb838
+
+bb838:
+  br label %bb839
+
+bb839:
+  br label %bb840
+
+bb840:
+  br label %bb841
+
+bb841:
+  br label %bb842
+
+bb842:
+  br label %bb843
+
+bb843:
+  br label %bb844
+
+bb844:
+  br label %bb845
+
+bb845:
+  br label %bb846
+
+bb846:
+  br label %bb847
+
+bb847:
+  br label %bb848
+
+bb848:
+  br label %bb849
+
+bb849:
+  br label %bb850
+
+bb850:
+  br label %bb851
+
+bb851:
+  br label %bb852
+
+bb852:
+  br label %bb853
+
+bb853:
+  br label %bb854
+
+bb854:
+  br label %bb855
+
+bb855:
+  br label %bb856
+
+bb856:
+  br label %bb857
+
+bb857:
+  br label %bb858
+
+bb858:
+  br label %bb859
+
+bb859:
+  br label %bb860
+
+bb860:
+  br label %bb861
+
+bb861:
+  br label %bb862
+
+bb862:
+  br label %bb863
+
+bb863:
+  br label %bb864
+
+bb864:
+  br label %bb865
+
+bb865:
+  br label %bb866
+
+bb866:
+  br label %bb867
+
+bb867:
+  br label %bb868
+
+bb868:
+  br label %bb869
+
+bb869:
+  br label %bb870
+
+bb870:
+  br label %bb871
+
+bb871:
+  br label %bb872
+
+bb872:
+  br label %bb873
+
+bb873:
+  br label %bb874
+
+bb874:
+  br label %bb875
+
+bb875:
+  br label %bb876
+
+bb876:
+  br label %bb877
+
+bb877:
+  br label %bb878
+
+bb878:
+  br label %bb879
+
+bb879:
+  br label %bb880
+
+bb880:
+  br label %bb881
+
+bb881:
+  br label %bb882
+
+bb882:
+  br label %bb883
+
+bb883:
+  br label %bb884
+
+bb884:
+  br label %bb885
+
+bb885:
+  br label %bb886
+
+bb886:
+  br label %bb887
+
+bb887:
+  br label %bb888
+
+bb888:
+  br label %bb889
+
+bb889:
+  br label %bb890
+
+bb890:
+  br label %bb891
+
+bb891:
+  br label %bb892
+
+bb892:
+  br label %bb893
+
+bb893:
+  br label %bb894
+
+bb894:
+  br label %bb895
+
+bb895:
+  br label %bb896
+
+bb896:
+  br label %bb897
+
+bb897:
+  br label %bb898
+
+bb898:
+  br label %bb899
+
+bb899:
+  br label %bb900
+
+bb900:
+  br label %bb901
+
+bb901:
+  br label %bb902
+
+bb902:
+  br label %bb903
+
+bb903:
+  br label %bb904
+
+bb904:
+  br label %bb905
+
+bb905:
+  br label %bb906
+
+bb906:
+  br label %bb907
+
+bb907:
+  br label %bb908
+
+bb908:
+  br label %bb909
+
+bb909:
+  br label %bb910
+
+bb910:
+  br label %bb911
+
+bb911:
+  br label %bb912
+
+bb912:
+  br label %bb913
+
+bb913:
+  br label %bb914
+
+bb914:
+  br label %bb915
+
+bb915:
+  br label %bb916
+
+bb916:
+  br label %bb917
+
+bb917:
+  br label %bb918
+
+bb918:
+  br label %bb919
+
+bb919:
+  br label %bb920
+
+bb920:
+  br label %bb921
+
+bb921:
+  br label %bb922
+
+bb922:
+  br label %bb923
+
+bb923:
+  br label %bb924
+
+bb924:
+  br label %bb925
+
+bb925:
+  br label %bb926
+
+bb926:
+  br label %bb927
+
+bb927:
+  br label %bb928
+
+bb928:
+  br label %bb929
+
+bb929:
+  br label %bb930
+
+bb930:
+  br label %bb931
+
+bb931:
+  br label %bb932
+
+bb932:
+  br label %bb933
+
+bb933:
+  br label %bb934
+
+bb934:
+  br label %bb935
+
+bb935:
+  br label %bb936
+
+bb936:
+  br label %bb937
+
+bb937:
+  br label %bb938
+
+bb938:
+  br label %bb939
+
+bb939:
+  br label %bb940
+
+bb940:
+  br label %bb941
+
+bb941:
+  br label %bb942
+
+bb942:
+  br label %bb943
+
+bb943:
+  br label %bb944
+
+bb944:
+  br label %bb945
+
+bb945:
+  br label %bb946
+
+bb946:
+  br label %bb947
+
+bb947:
+  br label %bb948
+
+bb948:
+  br label %bb949
+
+bb949:
+  br label %bb950
+
+bb950:
+  br label %bb951
+
+bb951:
+  br label %bb952
+
+bb952:
+  br label %bb953
+
+bb953:
+  br label %bb954
+
+bb954:
+  br label %bb955
+
+bb955:
+  br label %bb956
+
+bb956:
+  br label %bb957
+
+bb957:
+  br label %bb958
+
+bb958:
+  br label %bb959
+
+bb959:
+  br label %bb960
+
+bb960:
+  br label %bb961
+
+bb961:
+  br label %bb962
+
+bb962:
+  br label %bb963
+
+bb963:
+  br label %bb964
+
+bb964:
+  br label %bb965
+
+bb965:
+  br label %bb966
+
+bb966:
+  br label %bb967
+
+bb967:
+  br label %bb968
+
+bb968:
+  br label %bb969
+
+bb969:
+  br label %bb970
+
+bb970:
+  br label %bb971
+
+bb971:
+  br label %bb972
+
+bb972:
+  br label %bb973
+
+bb973:
+  br label %bb974
+
+bb974:
+  br label %bb975
+
+bb975:
+  br label %bb976
+
+bb976:
+  br label %bb977
+
+bb977:
+  br label %bb978
+
+bb978:
+  br label %bb979
+
+bb979:
+  br label %bb980
+
+bb980:
+  br label %bb981
+
+bb981:
+  br label %bb982
+
+bb982:
+  br label %bb983
+
+bb983:
+  br label %bb984
+
+bb984:
+  br label %bb985
+
+bb985:
+  br label %bb986
+
+bb986:
+  br label %bb987
+
+bb987:
+  br label %bb988
+
+bb988:
+  br label %bb989
+
+bb989:
+  br label %bb990
+
+bb990:
+  br label %bb991
+
+bb991:
+  br label %bb992
+
+bb992:
+  br label %bb993
+
+bb993:
+  br label %bb994
+
+bb994:
+  br label %bb995
+
+bb995:
+  br label %bb996
+
+bb996:
+  br label %bb997
+
+bb997:
+  br label %bb998
+
+bb998:
+  br label %bb999
+
+bb999:
+  br label %bb1000
+
+bb1000:
+  ; CHECK: call{{.*}}@dfsan_union
+  ; CHECK-NOT: phi
+  %ab = mul i32 %a, %b
+  ret i32 %ab
+}

diff --git a/test/Instrumentation/DataFlowSanitizer/union.ll b/test/Instrumentation/DataFlowSanitizer/union.ll
new file mode 100644
index 0000000..2b31081
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/union.ll

@@ -0,0 +1,52 @@
+; RUN: opt < %s -dfsan -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global i32 0
+@b = common global i32 0
+
+; Check that we reuse unions where possible: both operations below use %x and
+; %y, so only the first one is expected to produce a __dfsan_union call.
+; CHECK-LABEL: @"dfs$f"
+define void @f(i32 %x, i32 %y) {
+  ; CHECK: call{{.*}}__dfsan_union
+  %xay = add i32 %x, %y
+  store i32 %xay, i32* @a
+  ; CHECK-NOT: call{{.*}}__dfsan_union
+  %xmy = mul i32 %x, %y
+  store i32 %xmy, i32* @b
+  ret void
+}
+
+; In this case, we compute the unions on both sides because neither block
+; dominates the other: %l1 and %l2 are the two targets of the branch on %p,
+; and a __dfsan_union call is expected in each of them.
+; CHECK-LABEL: @"dfs$g"
+define void @g(i1 %p, i32 %x, i32 %y) {
+  br i1 %p, label %l1, label %l2
+
+l1:
+  ; CHECK: call{{.*}}__dfsan_union
+  %xay = add i32 %x, %y
+  store i32 %xay, i32* @a
+  br label %l3
+
+l2:
+  ; CHECK: call{{.*}}__dfsan_union
+  %xmy = mul i32 %x, %y
+  store i32 %xmy, i32* @b
+  br label %l3
+
+l3:
+  ret void
+}
+
+; In this case, we know that the label for %xayax subsumes the label for %xay.
+; %xayax adds %x to %xay (itself computed from %x and %y), so no second union call is expected.
+; CHECK-LABEL: @"dfs$h"
+define i32 @h(i32 %x, i32 %y) {
+  ; CHECK: call{{.*}}__dfsan_union
+  %xay = add i32 %x, %y
+  ; CHECK-NOT: call{{.*}}__dfsan_union
+  %xayax = add i32 %xay, %x
+  ret i32 %xayax
+}

diff --git a/test/Instrumentation/MemorySanitizer/array_types.ll b/test/Instrumentation/MemorySanitizer/array_types.ll
new file mode 100644
index 0000000..fa3835f
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/array_types.ll

@@ -0,0 +1,89 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=1 -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; insertvalue on an integer array: the expected shadow is assembled by matching insertvalue instructions on the argument shadows.
+define [2 x i32] @InsertValue(i32 %x, i32 %y) sanitize_memory {
+entry:
+  %a = insertvalue [2 x i32] undef, i32 %x, 0
+  %b = insertvalue [2 x i32] %a, i32 %y, 1
+  ret [2 x i32] %b
+}
+
+; CHECK-LABEL: @InsertValue(
+; CHECK-DAG: [[Sy:%.*]] = load i32* {{.*}}@__msan_param_tls to i64), i64 8) to i32*)
+; CHECK-DAG: [[Sx:%.*]] = load i32* {{.*}}@__msan_param_tls to i32*)
+; CHECK: [[A:%.*]] = insertvalue [2 x i32] [i32 -1, i32 -1], i32 [[Sx]], 0
+; CHECK: [[B:%.*]] = insertvalue [2 x i32] [[A]], i32 [[Sy]], 1
+; CHECK: store [2 x i32] [[B]], [2 x i32]* {{.*}}@__msan_retval_tls
+; CHECK: ret [2 x i32]
+
+; Same pattern with double elements: the expected shadow array uses i64 elements ([2 x i64]).
+define [2 x double] @InsertValueDouble(double %x, double %y) sanitize_memory {
+entry:
+  %a = insertvalue [2 x double] undef, double %x, 0
+  %b = insertvalue [2 x double] %a, double %y, 1
+  ret [2 x double] %b
+}
+
+; CHECK-LABEL: @InsertValueDouble(
+; CHECK-DAG: [[Sy:%.*]] = load i64* {{.*}}@__msan_param_tls to i64), i64 8) to i64*)
+; CHECK-DAG: [[Sx:%.*]] = load i64* getelementptr {{.*}}@__msan_param_tls, i32 0, i32 0
+; CHECK: [[A:%.*]] = insertvalue [2 x i64] [i64 -1, i64 -1], i64 [[Sx]], 0
+; CHECK: [[B:%.*]] = insertvalue [2 x i64] [[A]], i64 [[Sy]], 1
+; CHECK: store [2 x i64] [[B]], [2 x i64]* {{.*}}@__msan_retval_tls
+; CHECK: ret [2 x double]
+
+; extractvalue on an array argument: the expected shadow is the extractvalue at the same index of the argument's shadow.
+define i32 @ExtractValue([2 x i32] %a) sanitize_memory {
+entry:
+  %x = extractvalue [2 x i32] %a, 1
+  ret i32 %x
+}
+
+; CHECK-LABEL: @ExtractValue(
+; CHECK: [[Sa:%.*]] = load [2 x i32]* {{.*}}@__msan_param_tls to [2 x i32]*)
+; CHECK: [[Sx:%.*]] = extractvalue [2 x i32] [[Sa]], 1
+; CHECK: store i32 [[Sx]], i32* {{.*}}@__msan_retval_tls
+; CHECK: ret i32
+
+
+; Regression test for PR20493.
+
+%MyStruct = type { i32, i32, [3 x i32] }
+; extractvalue with indices 2, 1: element 1 of the [3 x i32] array nested in %MyStruct.
+define i32 @ArrayInStruct(%MyStruct %s) sanitize_memory {
+  %x = extractvalue %MyStruct %s, 2, 1
+  ret i32 %x
+}
+
+; CHECK-LABEL: @ArrayInStruct(
+; CHECK: [[Ss:%.*]] = load { i32, i32, [3 x i32] }* {{.*}}@__msan_param_tls to { i32, i32, [3 x i32] }*)
+; CHECK: [[Sx:%.*]] = extractvalue { i32, i32, [3 x i32] } [[Ss]], 2, 1
+; CHECK: store i32 [[Sx]], i32* {{.*}}@__msan_retval_tls
+; CHECK: ret i32
+
+; extractvalue with indices 2, 1: field 1 of element 2 in an array of { i32, i32 } structs.
+define i32 @ArrayOfStructs([3 x { i32, i32 }] %a) sanitize_memory {
+  %x = extractvalue [3 x { i32, i32 }] %a, 2, 1
+  ret i32 %x
+}
+
+; CHECK-LABEL: @ArrayOfStructs(
+; CHECK: [[Ss:%.*]] = load [3 x { i32, i32 }]* {{.*}}@__msan_param_tls to [3 x { i32, i32 }]*)
+; CHECK: [[Sx:%.*]] = extractvalue [3 x { i32, i32 }] [[Ss]], 2, 1
+; CHECK: store i32 [[Sx]], i32* {{.*}}@__msan_retval_tls
+; CHECK: ret i32
+
+; extractvalue of a whole <8 x i16> vector (index 1) from an array of vectors.
+define <8 x i16> @ArrayOfVectors([3 x <8 x i16>] %a) sanitize_memory {
+  %x = extractvalue [3 x <8 x i16>] %a, 1
+  ret <8 x i16> %x
+}
+
+; CHECK-LABEL: @ArrayOfVectors(
+; CHECK: [[Ss:%.*]] = load [3 x <8 x i16>]* {{.*}}@__msan_param_tls to [3 x <8 x i16>]*)
+; CHECK: [[Sx:%.*]] = extractvalue [3 x <8 x i16>] [[Ss]], 1
+; CHECK: store <8 x i16> [[Sx]], <8 x i16>* {{.*}}@__msan_retval_tls
+; CHECK: ret <8 x i16>

diff --git a/test/Instrumentation/MemorySanitizer/byval-alignment.ll b/test/Instrumentation/MemorySanitizer/byval-alignment.ll
new file mode 100644
index 0000000..43e204a
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/byval-alignment.ll

@@ -0,0 +1,20 @@
+; Test that copy alignment for byval arguments is limited by param-tls slot alignment.
+
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.S = type { i64, i64, i64, [8 x i8] }
+
+; CHECK: [[A:%.*]] = bitcast i64* {{.*}} add {{.*}} ptrtoint {{.*}} @__msan_param_tls {{.*}} i64 8)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[A]], i8* {{.*}}, i64 32, i32 8, i1 false)
+
+define void @Caller() sanitize_memory {
+entry:
+  %agg.tmp = alloca %struct.S, align 16
+  call void @Callee(i32 1, %struct.S* byval align 16 %agg.tmp)
+  ret void
+}
+
+declare void @Callee(i32, %struct.S* byval align 16)

diff --git a/test/Instrumentation/MemorySanitizer/check-constant-shadow.ll b/test/Instrumentation/MemorySanitizer/check-constant-shadow.ll
new file mode 100644
index 0000000..11e4410
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/check-constant-shadow.ll

@@ -0,0 +1,15 @@
+; RUN: opt < %s -msan -msan-check-constant-shadow=1 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test that returning a literal undef from main() triggers an MSan warning.
+
+define i32 @main() nounwind uwtable sanitize_memory {
+entry:
+  ret i32 undef
+}
+
+; CHECK-LABEL: @main
+; CHECK: call void @__msan_warning_noreturn
+; CHECK: ret i32 undef

diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll
index 51693cd..0faf45d 100644
--- a/test/Instrumentation/MemorySanitizer/msan_basic.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll

@@ -766,6 +766,24 @@
 ; CHECK: ret i32 [[A]]
 
 
+; Test that there are no __msan_param_origin_tls stores when the
+; argument shadow is a compile-time zero constant (which is always the case
+; in functions missing the sanitize_memory attribute).
+
+define i32 @NoSanitizeMemoryParamTLS(i32* nocapture readonly %x) {
+entry:
+  %0 = load i32* %x, align 4
+  %call = tail call i32 @NoSanitizeMemoryParamTLSHelper(i32 %0)
+  ret i32 %call
+}
+
+declare i32 @NoSanitizeMemoryParamTLSHelper(i32 %x)
+
+; CHECK-LABEL: define i32 @NoSanitizeMemoryParamTLS(
+; CHECK-NOT: __msan_param_origin_tls
+; CHECK: ret i32
+
+
 ; Test argument shadow alignment
 
 define <2 x i64> @ArgumentShadowAlignment(i64 %a, <2 x i64> %b) sanitize_memory {

diff --git a/test/Instrumentation/MemorySanitizer/store-origin.ll b/test/Instrumentation/MemorySanitizer/store-origin.ll
index 0bd9777..bde4e90 100644
--- a/test/Instrumentation/MemorySanitizer/store-origin.ll
+++ b/test/Instrumentation/MemorySanitizer/store-origin.ll

@@ -11,14 +11,14 @@
 ; Function Attrs: nounwind
 define void @Store(i32* nocapture %p, i32 %x) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %p}, i64 0, metadata !11), !dbg !16
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !12), !dbg !16
+  tail call void @llvm.dbg.value(metadata !{i32* %p}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !16
+  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !12, metadata !{metadata !"0x102"}), !dbg !16
   store i32 %x, i32* %p, align 4, !dbg !17, !tbaa !18
   ret void, !dbg !22
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind sanitize_memory "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -27,21 +27,21 @@
 !llvm.module.flags = !{!13, !14}
 !llvm.ident = !{!15}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 (204220)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/build0/../2.cc] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 (204220)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/build0/../2.cc] [DW_LANG_C99]
 !1 = metadata !{metadata !"../2.cc", metadata !"/tmp/build0"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"Store", metadata !"Store", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32)* @Store, null, null, metadata !10, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [Store]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/build0/../2.cc]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00Store\00Store\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*, i32)* @Store, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 1] [def] [Store]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/build0/../2.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8, metadata !9}
-!8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !11, metadata !12}
-!11 = metadata !{i32 786689, metadata !4, metadata !"p", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 1]
-!12 = metadata !{i32 786689, metadata !4, metadata !"x", metadata !5, i32 33554433, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 1]
+!11 = metadata !{metadata !"0x101\00p\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [p] [line 1]
+!12 = metadata !{metadata !"0x101\00x\0033554433\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [x] [line 1]
 !13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !15 = metadata !{metadata !"clang version 3.5.0 (204220)"}
 !16 = metadata !{i32 1, i32 0, metadata !4, null}
 !17 = metadata !{i32 2, i32 0, metadata !4, null}

diff --git a/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll
new file mode 100644
index 0000000..eea93b8
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/coverage-dbg.ll

@@ -0,0 +1,67 @@
+; Test that coverage instrumentation does not lose debug location.
+
+; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s
+
+; C++ source:
+; 1: struct A {
+; 2:  int f();
+; 3:  int x;
+; 4: };
+; 5:
+; 6: int A::f() {
+; 7:    return x;
+; 8: }
+; clang++ ../1.cc -O3 -g -S -emit-llvm  -fno-strict-aliasing
+; and add sanitize_address to @_ZN1A1fEv
+
+; Test that the __sanitizer_cov call has !dbg pointing to the opening { of A::f().
+; CHECK: call void @__sanitizer_cov(), !dbg [[A:!.*]]
+; CHECK: [[A]] = metadata !{i32 6, i32 0, metadata !{{.*}}, null}
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { i32 }
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @_ZN1A1fEv(%struct.A* nocapture readonly %this) #0 align 2 {
+entry:
+  tail call void @llvm.dbg.value(metadata !{%struct.A* %this}, i64 0, metadata !15, metadata !{metadata !"0x102"}), !dbg !20
+  %x = getelementptr inbounds %struct.A* %this, i64 0, i32 0, !dbg !21
+  %0 = load i32* %x, align 4, !dbg !21
+  ret i32 %0, !dbg !21
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { sanitize_address nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!17, !18}
+!llvm.ident = !{!19}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (210251)\001\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !12, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/code/llvm/build0/../1.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"../1.cc", metadata !"/code/llvm/build0"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x13\00A\001\0032\0032\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_structure_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !6, metadata !8}
+!6 = metadata !{metadata !"0xd\00x\003\0032\0032\000\000", metadata !1, metadata !"_ZTS1A", metadata !7} ; [ DW_TAG_member ] [x] [line 3, size 32, align 32, offset 0] [from int]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x2e\00f\00f\00_ZN1A1fEv\002\000\000\000\006\00256\001\002", metadata !1, metadata !"_ZTS1A", metadata !9, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 2] [f]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !7, metadata !11}
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!12 = metadata !{metadata !13}
+!13 = metadata !{metadata !"0x2e\00f\00f\00_ZN1A1fEv\006\000\001\000\006\00256\001\006", metadata !1, metadata !"_ZTS1A", metadata !9, null, i32 (%struct.A*)* @_ZN1A1fEv, null, metadata !8, metadata !14} ; [ DW_TAG_subprogram ] [line 6] [def] [f]
+!14 = metadata !{metadata !15}
+!15 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !13, null, metadata !16} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!18 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!19 = metadata !{metadata !"clang version 3.5.0 (210251)"}
+!20 = metadata !{i32 0, i32 0, metadata !13, null}
+!21 = metadata !{i32 7, i32 0, metadata !13, null}

diff --git a/test/Instrumentation/SanitizerCoverage/coverage.ll b/test/Instrumentation/SanitizerCoverage/coverage.ll
new file mode 100644
index 0000000..da0498d
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/coverage.ll

@@ -0,0 +1,89 @@
+; RUN: opt < %s -sancov -sanitizer-coverage-level=0 -S | FileCheck %s --check-prefix=CHECK0
+; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -S | FileCheck %s --check-prefix=CHECK1
+; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -S | FileCheck %s --check-prefix=CHECK2
+; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK2
+; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1  -S | FileCheck %s --check-prefix=CHECK1
+; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK3
+; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -S | FileCheck %s --check-prefix=CHECK4
+
+; RUN: opt < %s -sancov -sanitizer-coverage-level=0  -S | FileCheck %s --check-prefix=CHECK0
+; RUN: opt < %s -sancov -sanitizer-coverage-level=1  -S | FileCheck %s --check-prefix=CHECK1
+; RUN: opt < %s -sancov -sanitizer-coverage-level=2  -S | FileCheck %s --check-prefix=CHECK2
+; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 \
+; RUN:      -S | FileCheck %s --check-prefix=CHECK2
+; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=1 \
+; RUN:      -S | FileCheck %s --check-prefix=CHECK1
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+define void @foo(i32* %a) sanitize_address {
+entry:
+  %tobool = icmp eq i32* %a, null
+  br i1 %tobool, label %if.end, label %if.then
+
+  if.then:                                          ; preds = %entry
+  store i32 0, i32* %a, align 4
+  br label %if.end
+
+  if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+; CHECK0-NOT: call void @__sanitizer_cov(
+; CHECK0-NOT: call void @__sanitizer_cov_module_init(
+
+; CHECK1-LABEL: define void @foo
+; CHECK1: %0 = load atomic i8* @__sancov_gen_cov_foo monotonic, align 1
+; CHECK1: %1 = icmp eq i8 0, %0
+; CHECK1: br i1 %1, label %2, label %3
+; CHECK1: call void @__sanitizer_cov
+; CHECK1-NOT: call void @__sanitizer_cov
+; CHECK1: store atomic i8 1, i8* @__sancov_gen_cov_foo monotonic, align 1
+
+; CHECK1-LABEL: define internal void @sancov.module_ctor
+; CHECK1-NOT: ret
+; CHECK1: call void @__sanitizer_cov_module_init(i64 2)
+; CHECK1: ret
+
+
+; CHECK2-LABEL: define void @foo
+; CHECK2: call void @__sanitizer_cov
+; CHECK2: call void @__sanitizer_cov
+; CHECK2: call void @__sanitizer_cov
+; CHECK2-NOT: call void @__sanitizer_cov
+; CHECK2: ret void
+
+; CHECK2-LABEL: define internal void @sancov.module_ctor
+; CHECK2-NOT: ret
+; CHECK2: call void @__sanitizer_cov_module_init(i64 4)
+; CHECK2: ret
+
+; CHECK3-LABEL: define void @foo
+; CHECK3: call void @__sanitizer_cov
+; CHECK3: call void @__sanitizer_cov
+; CHECK3: call void @__sanitizer_cov
+; CHECK3-NOT: ret void
+; CHECK3: call void @__sanitizer_cov
+; CHECK3-NOT: call void @__sanitizer_cov
+; CHECK3: ret void
+
+
+%struct.StructWithVptr = type { i32 (...)** }
+
+define void @CallViaVptr(%struct.StructWithVptr* %foo) uwtable sanitize_address {
+entry:
+  %0 = bitcast %struct.StructWithVptr* %foo to void (%struct.StructWithVptr*)***
+  %vtable = load void (%struct.StructWithVptr*)*** %0, align 8
+  %1 = load void (%struct.StructWithVptr*)** %vtable, align 8
+  tail call void %1(%struct.StructWithVptr* %foo)
+  tail call void %1(%struct.StructWithVptr* %foo)
+  tail call void asm sideeffect "", ""()
+  ret void
+}
+
+; We expect to see two calls to __sanitizer_cov_indir_call16
+; with different values of the second argument.
+; CHECK4-LABEL: define void @CallViaVptr
+; CHECK4: call void @__sanitizer_cov_indir_call16({{.*}},[[CACHE:.*]])
+; CHECK4-NOT: call void @__sanitizer_cov_indir_call16({{.*}},[[CACHE]])
+; CHECK4: ret void

diff --git a/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll b/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll
new file mode 100644
index 0000000..9b26329
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/coverage2-dbg.ll

@@ -0,0 +1,75 @@
+; Test that coverage instrumentation does not lose debug location.
+
+; RUN: opt < %s -sancov  -sanitizer-coverage-level=2 -S | FileCheck %s
+
+; C++ source:
+; 1: void foo(int *a) {
+; 2:     if (a)
+; 3:         *a = 0;
+; 4: }
+; clang++ if.cc -O3 -g -S -emit-llvm
+; and add sanitize_address to @_Z3fooPi
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Check that each __sanitizer_cov call has !dbg pointing to the beginning
+; of the appropriate basic block.
+; CHECK-LABEL:_Z3fooPi
+; CHECK: call void @__sanitizer_cov(), !dbg [[A:!.*]]
+; CHECK: call void @__sanitizer_cov(), !dbg [[B:!.*]]
+; CHECK: call void @__sanitizer_cov(), !dbg [[C:!.*]]
+; CHECK: ret void
+; CHECK: [[A]] = metadata !{i32 1, i32 0, metadata !{{.*}}, null}
+; CHECK: [[B]] = metadata !{i32 3, i32 5, metadata !{{.*}}, null}
+; CHECK: [[C]] = metadata !{i32 4, i32 1, metadata !{{.*}}, null}
+
+define void @_Z3fooPi(i32* %a) #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !11, metadata !{metadata !"0x102"}), !dbg !15
+  %tobool = icmp eq i32* %a, null, !dbg !16
+  br i1 %tobool, label %if.end, label %if.then, !dbg !16
+
+if.then:                                          ; preds = %entry
+  store i32 0, i32* %a, align 4, !dbg !18, !tbaa !19
+  br label %if.end, !dbg !18
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void, !dbg !23
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" sanitize_address}
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12, !13}
+!llvm.ident = !{!14}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 (217079)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [FOO/if.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"if.cc", metadata !"FOO"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooPi\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*)* @_Z3fooPi, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [FOO/if.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null, metadata !8}
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !11}
+!11 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 1]
+!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!13 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!14 = metadata !{metadata !"clang version 3.6.0 (217079)"}
+!15 = metadata !{i32 1, i32 15, metadata !4, null}
+!16 = metadata !{i32 2, i32 7, metadata !17, null}
+!17 = metadata !{metadata !"0xb\002\007\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [FOO/if.cc]
+!18 = metadata !{i32 3, i32 5, metadata !17, null}
+!19 = metadata !{metadata !20, metadata !20, i64 0}
+!20 = metadata !{metadata !"int", metadata !21, i64 0}
+!21 = metadata !{metadata !"omnipotent char", metadata !22, i64 0}
+!22 = metadata !{metadata !"Simple C/C++ TBAA"}
+!23 = metadata !{i32 4, i32 1, metadata !4, null}

diff --git a/test/Instrumentation/SanitizerCoverage/tracing.ll b/test/Instrumentation/SanitizerCoverage/tracing.ll
new file mode 100644
index 0000000..c39cb1c
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/tracing.ll

@@ -0,0 +1,33 @@
+; Test -sanitizer-coverage-experimental-tracing
+; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-experimental-tracing  -S | FileCheck %s --check-prefix=CHECK1
+; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-experimental-tracing  -S | FileCheck %s --check-prefix=CHECK3
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+define void @foo(i32* %a) sanitize_address {
+entry:
+  %tobool = icmp eq i32* %a, null
+  br i1 %tobool, label %if.end, label %if.then
+
+  if.then:                                          ; preds = %entry
+  store i32 0, i32* %a, align 4
+  br label %if.end
+
+  if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+; CHECK1-LABEL: define void @foo
+; CHECK1: call void @__sanitizer_cov_trace_func_enter
+; CHECK1: call void @__sanitizer_cov_trace_basic_block
+; CHECK1: call void @__sanitizer_cov_trace_basic_block
+; CHECK1-NOT: call void @__sanitizer_cov_trace_basic_block
+; CHECK1: ret void
+
+; CHECK3-LABEL: define void @foo
+; CHECK3: call void @__sanitizer_cov_trace_func_enter
+; CHECK3: call void @__sanitizer_cov_trace_basic_block
+; CHECK3: call void @__sanitizer_cov_trace_basic_block
+; CHECK3: call void @__sanitizer_cov_trace_basic_block
+; CHECK3-NOT: call void @__sanitizer_cov_trace_basic_block
+; CHECK3: ret void

diff --git a/test/JitListener/test-common-symbols.ll b/test/JitListener/test-common-symbols.ll
index a389bf7..3c8b9e3 100644
--- a/test/JitListener/test-common-symbols.ll
+++ b/test/JitListener/test-common-symbols.ll

@@ -34,7 +34,7 @@
   br label %if.end, !dbg !24
 
 if.end:                                           ; preds = %if.then, %entry
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !25), !dbg !27
+  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !25, metadata !{metadata !"0x102"}), !dbg !27
   store i32 1, i32* %i, align 4, !dbg !28
   br label %for.cond, !dbg !28
 
@@ -73,41 +73,41 @@
   ret i32 %cond, !dbg !33
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!35}
 
-!0 = metadata !{i32 720913, metadata !34, i32 12, metadata !"clang version 3.1 ()", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !12, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.1 ()\001\00\000\00\000", metadata !34, metadata !1, metadata !1, metadata !3, metadata !12, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !34, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !10, i32 0} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 720937, metadata !34} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00main\00main\00\006\000\001\000\006\000\000\000", metadata !34, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !10} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !34} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ]
 !12 = metadata !{metadata !14, metadata !15, metadata !17}
-!14 = metadata !{i32 720948, i32 0, null, metadata !"zero_int", metadata !"zero_int", metadata !"", metadata !6, i32 1, metadata !9, i32 0, i32 1, i32* @zero_int, null} ; [ DW_TAG_variable ]
-!15 = metadata !{i32 720948, i32 0, null, metadata !"zero_double", metadata !"zero_double", metadata !"", metadata !6, i32 2, metadata !16, i32 0, i32 1, double* @zero_double, null} ; [ DW_TAG_variable ]
-!16 = metadata !{i32 720932, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!17 = metadata !{i32 720948, i32 0, null, metadata !"zero_arr", metadata !"zero_arr", metadata !"", metadata !6, i32 3, metadata !18, i32 0, i32 1, [10 x i32]* @zero_arr, null} ; [ DW_TAG_variable ]
-!18 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 320, i64 32, i32 0, i32 0, metadata !9, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
+!14 = metadata !{metadata !"0x34\00zero_int\00zero_int\00\001\000\001", null, metadata !6, metadata !9, i32* @zero_int, null} ; [ DW_TAG_variable ]
+!15 = metadata !{metadata !"0x34\00zero_double\00zero_double\00\002\000\001", null, metadata !6, metadata !16, double* @zero_double, null} ; [ DW_TAG_variable ]
+!16 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ]
+!17 = metadata !{metadata !"0x34\00zero_arr\00zero_arr\00\003\000\001", null, metadata !6, metadata !18, [10 x i32]* @zero_arr, null} ; [ DW_TAG_variable ]
+!18 = metadata !{metadata !"0x1\00\000\00320\0032\000\000", null, metadata !"", metadata !9, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
 !19 = metadata !{metadata !20}
-!20 = metadata !{i32 720929, i64 0, i64 10}        ; [ DW_TAG_subrange_type ]
+!20 = metadata !{metadata !"0x21\000\0010"}        ; [ DW_TAG_subrange_type ]
 !21 = metadata !{i32 7, i32 5, metadata !22, null}
-!22 = metadata !{i32 720907, metadata !34, metadata !5, i32 6, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\006\001\000", metadata !34, metadata !5} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{i32 9, i32 5, metadata !22, null}
 !24 = metadata !{i32 10, i32 9, metadata !22, null}
-!25 = metadata !{i32 721152, metadata !26, metadata !"i", metadata !6, i32 12, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{i32 720907, metadata !34, metadata !22, i32 12, i32 5, i32 1} ; [ DW_TAG_lexical_block ]
+!25 = metadata !{metadata !"0x100\00i\0012\000", metadata !26, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
+!26 = metadata !{metadata !"0xb\0012\005\001", metadata !34, metadata !22} ; [ DW_TAG_lexical_block ]
 !27 = metadata !{i32 12, i32 14, metadata !26, null}
 !28 = metadata !{i32 12, i32 19, metadata !26, null}
 !29 = metadata !{i32 13, i32 9, metadata !30, null}
-!30 = metadata !{i32 720907, metadata !34, metadata !26, i32 12, i32 34, i32 2} ; [ DW_TAG_lexical_block ]
+!30 = metadata !{metadata !"0xb\0012\0034\002", metadata !34, metadata !26} ; [ DW_TAG_lexical_block ]
 !31 = metadata !{i32 14, i32 5, metadata !30, null}
 !32 = metadata !{i32 12, i32 29, metadata !26, null}
 !33 = metadata !{i32 15, i32 5, metadata !22, null}
 !34 = metadata !{metadata !"test-common-symbols.c", metadata !"/store/store/llvm/build"}
-!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/JitListener/test-inline.ll b/test/JitListener/test-inline.ll
index 0d365b1..a600734 100644
--- a/test/JitListener/test-inline.ll
+++ b/test/JitListener/test-inline.ll

@@ -42,18 +42,18 @@
   %l.addr = alloca i64, align 8
   %result = alloca double, align 8
   store float* %pf, float** %pf.addr, align 8
-  call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !46), !dbg !47
+  call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !46, metadata !{metadata !"0x102"}), !dbg !47
   store [2 x double]* %ppd, [2 x double]** %ppd.addr, align 8
-  call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !48), !dbg !47
+  call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !48, metadata !{metadata !"0x102"}), !dbg !47
   store %struct.char_struct* %s, %struct.char_struct** %s.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !49), !dbg !47
+  call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !49, metadata !{metadata !"0x102"}), !dbg !47
   store i32** %ppn, i32*** %ppn.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !50), !dbg !47
+  call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !50, metadata !{metadata !"0x102"}), !dbg !47
   store i16 %us, i16* %us.addr, align 2
-  call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !51), !dbg !47
+  call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !51, metadata !{metadata !"0x102"}), !dbg !47
   store i64 %l, i64* %l.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !52), !dbg !47
-  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !53), !dbg !55
+  call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !52, metadata !{metadata !"0x102"}), !dbg !47
+  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !53, metadata !{metadata !"0x102"}), !dbg !55
   %0 = load float** %pf.addr, align 8, !dbg !55
   %arrayidx = getelementptr inbounds float* %0, i64 0, !dbg !55
   %1 = load float* %arrayidx, align 4, !dbg !55
@@ -84,7 +84,7 @@
   ret double %8, !dbg !56
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define linkonce_odr i32 @_Z3foov() nounwind uwtable inlinehint {
 entry:
@@ -102,13 +102,13 @@
   %result = alloca double, align 8
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !59), !dbg !60
+  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !59, metadata !{metadata !"0x102"}), !dbg !60
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !61), !dbg !60
-  call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !62), !dbg !64
-  call void @llvm.dbg.declare(metadata !{float* %f}, metadata !65), !dbg !66
+  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !61, metadata !{metadata !"0x102"}), !dbg !60
+  call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !62, metadata !{metadata !"0x102"}), !dbg !64
+  call void @llvm.dbg.declare(metadata !{float* %f}, metadata !65, metadata !{metadata !"0x102"}), !dbg !66
   store float 0.000000e+00, float* %f, align 4, !dbg !66
-  call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !67), !dbg !70
+  call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !67, metadata !{metadata !"0x102"}), !dbg !70
   %0 = bitcast [2 x [2 x double]]* %d to i8*, !dbg !70
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([2 x [2 x double]]* @_ZZ4mainE1d to i8*), i64 32, i32 16, i1 false), !dbg !70
   %c = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 0, !dbg !71
@@ -119,7 +119,7 @@
   %c21 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !73
   %arrayidx2 = getelementptr inbounds [2 x i8]* %c21, i32 0, i64 1, !dbg !73
   store i8 49, i8* %arrayidx2, align 1, !dbg !73
-  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !74), !dbg !75
+  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !74, metadata !{metadata !"0x102"}), !dbg !75
   %arraydecay = getelementptr inbounds [2 x [2 x double]]* %d, i32 0, i32 0, !dbg !75
   %call = call double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %f, [2 x double]* %arraydecay, %struct.char_struct* %s, i32** null, i16 zeroext 10, i64 42), !dbg !75
   store double %call, double* %result, align 8, !dbg !75
@@ -134,79 +134,79 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!78}
 
-!0 = metadata !{i32 786449, metadata !77, i32 4, metadata !"clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !43, null, metadata !""} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-inline.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)\001\00\000\00\000", metadata !77, metadata !1, metadata !1, metadata !3, metadata !43, null} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-inline.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !35, metadata !40}
-!5 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"test_parameters", metadata !"test_parameters", metadata !"_Z15test_parametersPfPA2_dR11char_structPPitm", i32 32, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1, i32 33} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters]
-!6 = metadata !{i32 786473, metadata !77} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00test_parameters\00test_parameters\00_Z15test_parametersPfPA2_dR11char_structPPitm\0032\000\001\000\006\00256\000\0033", metadata !77, metadata !6, metadata !7, null, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters]
+!6 = metadata !{metadata !"0x29", metadata !77} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10, metadata !12, metadata !16, metadata !29, metadata !32, metadata !33}
-!9 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!10 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float]
-!11 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
-!12 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !9, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
+!9 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float]
+!11 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!13 = metadata !{metadata !"0x1\00\000\00128\0064\000\000", null, metadata !"", metadata !9, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
 !14 = metadata !{metadata !15}
-!15 = metadata !{i32 786465, i64 0, i64 2}        ; [ DW_TAG_subrange_type ] [0, 1]
-!16 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct]
-!17 = metadata !{i32 786451, metadata !77, null, metadata !"char_struct", i32 22, i64 24, i64 8, i32 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [def] [from ]
+!15 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
+!16 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct]
+!17 = metadata !{metadata !"0x13\00char_struct\0022\0024\008\000\000\000", metadata !77, null, null, metadata !18, null, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [def] [from ]
 !18 = metadata !{metadata !19, metadata !21, metadata !23}
-!19 = metadata !{i32 786445, metadata !77, metadata !17, metadata !"c", i32 23, i64 8, i64 8, i64 0, i32 0, metadata !20} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char]
-!20 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!21 = metadata !{i32 786445, metadata !77, metadata !17, metadata !"c2", i32 24, i64 16, i64 8, i64 8, i32 0, metadata !22} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ]
-!22 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !20, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
-!23 = metadata !{i32 786478, metadata !77, metadata !17, metadata !"char_struct", metadata !"char_struct", metadata !"", i32 22, metadata !24, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !27, i32 22} ; [ DW_TAG_subprogram ] [line 22] [char_struct]
-!24 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = metadata !{metadata !"0xd\00c\0023\008\008\000\000", metadata !77, metadata !17, metadata !20} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char]
+!20 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!21 = metadata !{metadata !"0xd\00c2\0024\0016\008\008\000", metadata !77, metadata !17, metadata !22} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ]
+!22 = metadata !{metadata !"0x1\00\000\0016\008\000\000", null, metadata !"", metadata !20, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
+!23 = metadata !{metadata !"0x2e\00char_struct\00char_struct\00\0022\000\000\000\006\00320\000\0022", metadata !77, metadata !17, metadata !24, null, null, null, i32 0, metadata !27} ; [ DW_TAG_subprogram ] [line 22] [char_struct]
+!24 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !25, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{null, metadata !26}
-!26 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct]
+!26 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, metadata !"", metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct]
 !27 = metadata !{metadata !28}
-!28 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!29 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !30} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!30 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !31} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!31 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!32 = metadata !{i32 786468, null, null, metadata !"unsigned short", i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
-!33 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !34} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int]
-!34 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!35 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 38, metadata !36, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !1, i32 39} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main]
-!36 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!29 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !30} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !31} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!31 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!32 = metadata !{metadata !"0x24\00unsigned short\000\0016\0016\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
+!33 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, metadata !"", metadata !34} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int]
+!34 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!35 = metadata !{metadata !"0x2e\00main\00main\00\0038\000\001\000\006\00256\000\0039", metadata !77, metadata !6, metadata !36, null, i32 (i32, i8**)* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main]
+!36 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !37, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !37 = metadata !{metadata !31, metadata !31, metadata !38}
-!38 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!39 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!40 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 27, metadata !41, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 28} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo]
-!41 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !42, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!38 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!39 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!40 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\0027\000\001\000\006\00256\000\0028", metadata !77, metadata !6, metadata !41, null, i32 ()* @_Z3foov, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo]
+!41 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !42, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !42 = metadata !{metadata !31}
 !43 = metadata !{metadata !45}
-!45 = metadata !{i32 786484, i32 0, null, metadata !"compound_char", metadata !"compound_char", metadata !"", metadata !6, i32 25, metadata !17, i32 0, i32 1, %struct.char_struct* @compound_char, null} ; [ DW_TAG_variable ] [compound_char] [line 25] [def]
-!46 = metadata !{i32 786689, metadata !5, metadata !"pf", metadata !6, i32 16777248, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [pf] [line 32]
+!45 = metadata !{metadata !"0x34\00compound_char\00compound_char\00\0025\000\001", null, metadata !6, metadata !17, %struct.char_struct* @compound_char, null} ; [ DW_TAG_variable ] [compound_char] [line 25] [def]
+!46 = metadata !{metadata !"0x101\00pf\0016777248\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ] [pf] [line 32]
 !47 = metadata !{i32 32, i32 0, metadata !5, null}
-!48 = metadata !{i32 786689, metadata !5, metadata !"ppd", metadata !6, i32 33554464, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ppd] [line 32]
-!49 = metadata !{i32 786689, metadata !5, metadata !"s", metadata !6, i32 50331680, metadata !16, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [s] [line 32]
-!50 = metadata !{i32 786689, metadata !5, metadata !"ppn", metadata !6, i32 67108896, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ppn] [line 32]
-!51 = metadata !{i32 786689, metadata !5, metadata !"us", metadata !6, i32 83886112, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [us] [line 32]
-!52 = metadata !{i32 786689, metadata !5, metadata !"l", metadata !6, i32 100663328, metadata !33, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [l] [line 32]
-!53 = metadata !{i32 786688, metadata !54, metadata !"result", metadata !6, i32 34, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 34]
-!54 = metadata !{i32 786443, metadata !77, metadata !5, i32 33, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp]
+!48 = metadata !{metadata !"0x101\00ppd\0033554464\000", metadata !5, metadata !6, metadata !12} ; [ DW_TAG_arg_variable ] [ppd] [line 32]
+!49 = metadata !{metadata !"0x101\00s\0050331680\000", metadata !5, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ] [s] [line 32]
+!50 = metadata !{metadata !"0x101\00ppn\0067108896\000", metadata !5, metadata !6, metadata !29} ; [ DW_TAG_arg_variable ] [ppn] [line 32]
+!51 = metadata !{metadata !"0x101\00us\0083886112\000", metadata !5, metadata !6, metadata !32} ; [ DW_TAG_arg_variable ] [us] [line 32]
+!52 = metadata !{metadata !"0x101\00l\00100663328\000", metadata !5, metadata !6, metadata !33} ; [ DW_TAG_arg_variable ] [l] [line 32]
+!53 = metadata !{metadata !"0x100\00result\0034\000", metadata !54, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [result] [line 34]
+!54 = metadata !{metadata !"0xb\0033\000\000", metadata !77, metadata !5} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp]
 !55 = metadata !{i32 34, i32 0, metadata !54, null}
 !56 = metadata !{i32 35, i32 0, metadata !54, null}
 !57 = metadata !{i32 29, i32 0, metadata !58, null}
-!58 = metadata !{i32 786443, metadata !77, metadata !40, i32 28, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp]
-!59 = metadata !{i32 786689, metadata !35, metadata !"argc", metadata !6, i32 16777254, metadata !31, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 38]
+!58 = metadata !{metadata !"0xb\0028\000\002", metadata !77, metadata !40} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp]
+!59 = metadata !{metadata !"0x101\00argc\0016777254\000", metadata !35, metadata !6, metadata !31} ; [ DW_TAG_arg_variable ] [argc] [line 38]
 !60 = metadata !{i32 38, i32 0, metadata !35, null}
-!61 = metadata !{i32 786689, metadata !35, metadata !"argv", metadata !6, i32 33554470, metadata !38, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 38]
-!62 = metadata !{i32 786688, metadata !63, metadata !"s", metadata !6, i32 40, metadata !17, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [s] [line 40]
-!63 = metadata !{i32 786443, metadata !77, metadata !35, i32 39, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp]
+!61 = metadata !{metadata !"0x101\00argv\0033554470\000", metadata !35, metadata !6, metadata !38} ; [ DW_TAG_arg_variable ] [argv] [line 38]
+!62 = metadata !{metadata !"0x100\00s\0040\000", metadata !63, metadata !6, metadata !17} ; [ DW_TAG_auto_variable ] [s] [line 40]
+!63 = metadata !{metadata !"0xb\0039\000\001", metadata !77, metadata !35} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp]
 !64 = metadata !{i32 40, i32 0, metadata !63, null}
-!65 = metadata !{i32 786688, metadata !63, metadata !"f", metadata !6, i32 41, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [f] [line 41]
+!65 = metadata !{metadata !"0x100\00f\0041\000", metadata !63, metadata !6, metadata !11} ; [ DW_TAG_auto_variable ] [f] [line 41]
 !66 = metadata !{i32 41, i32 0, metadata !63, null}
-!67 = metadata !{i32 786688, metadata !63, metadata !"d", metadata !6, i32 42, metadata !68, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 42]
-!68 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !9, metadata !69, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
+!67 = metadata !{metadata !"0x100\00d\0042\000", metadata !63, metadata !6, metadata !68} ; [ DW_TAG_auto_variable ] [d] [line 42]
+!68 = metadata !{metadata !"0x1\00\000\00256\0064\000\000", null, metadata !"", metadata !9, metadata !69, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
 !69 = metadata !{metadata !15, metadata !15}
 !70 = metadata !{i32 42, i32 0, metadata !63, null}
 !71 = metadata !{i32 44, i32 0, metadata !63, null}
 !72 = metadata !{i32 45, i32 0, metadata !63, null}
 !73 = metadata !{i32 46, i32 0, metadata !63, null}
-!74 = metadata !{i32 786688, metadata !63, metadata !"result", metadata !6, i32 48, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 48]
+!74 = metadata !{metadata !"0x100\00result\0048\000", metadata !63, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ] [result] [line 48]
 !75 = metadata !{i32 48, i32 0, metadata !63, null}
 !76 = metadata !{i32 49, i32 0, metadata !63, null}
 !77 = metadata !{metadata !"test-inline.cpp", metadata !"/home/akaylor/dev"}
-!78 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!78 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/JitListener/test-parameters.ll b/test/JitListener/test-parameters.ll
index 7feb6bb..d1f3b76 100644
--- a/test/JitListener/test-parameters.ll
+++ b/test/JitListener/test-parameters.ll

@@ -46,18 +46,18 @@
   %l.addr = alloca i64, align 8
   %result = alloca double, align 8
   store float* %pf, float** %pf.addr, align 8
-  call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !48), !dbg !49
+  call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !48, metadata !{metadata !"0x102"}), !dbg !49
   store [2 x double]* %ppd, [2 x double]** %ppd.addr, align 8
-  call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !50), !dbg !49
+  call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !50, metadata !{metadata !"0x102"}), !dbg !49
   store %struct.char_struct* %s, %struct.char_struct** %s.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !51), !dbg !49
+  call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !51, metadata !{metadata !"0x102"}), !dbg !49
   store i32** %ppn, i32*** %ppn.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !52), !dbg !49
+  call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !52, metadata !{metadata !"0x102"}), !dbg !49
   store i16 %us, i16* %us.addr, align 2
-  call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !53), !dbg !49
+  call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !53, metadata !{metadata !"0x102"}), !dbg !49
   store i64 %l, i64* %l.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !54), !dbg !49
-  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !55), !dbg !57
+  call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !54, metadata !{metadata !"0x102"}), !dbg !49
+  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !55, metadata !{metadata !"0x102"}), !dbg !57
   %0 = load float** %pf.addr, align 8, !dbg !57
   %arrayidx = getelementptr inbounds float* %0, i64 0, !dbg !57
   %1 = load float* %arrayidx, align 4, !dbg !57
@@ -88,7 +88,7 @@
   ret double %8, !dbg !58
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
 entry:
@@ -101,13 +101,13 @@
   %result = alloca double, align 8
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !59), !dbg !60
+  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !59, metadata !{metadata !"0x102"}), !dbg !60
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !61), !dbg !60
-  call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !62), !dbg !64
-  call void @llvm.dbg.declare(metadata !{float* %f}, metadata !65), !dbg !66
+  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !61, metadata !{metadata !"0x102"}), !dbg !60
+  call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !62, metadata !{metadata !"0x102"}), !dbg !64
+  call void @llvm.dbg.declare(metadata !{float* %f}, metadata !65, metadata !{metadata !"0x102"}), !dbg !66
   store float 0.000000e+00, float* %f, align 4, !dbg !66
-  call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !67), !dbg !70
+  call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !67, metadata !{metadata !"0x102"}), !dbg !70
   %0 = bitcast [2 x [2 x double]]* %d to i8*, !dbg !70
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([2 x [2 x double]]* @_ZZ4mainE1d to i8*), i64 32, i32 16, i1 false), !dbg !70
   %c = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 0, !dbg !71
@@ -118,7 +118,7 @@
   %c21 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !73
   %arrayidx2 = getelementptr inbounds [2 x i8]* %c21, i32 0, i64 1, !dbg !73
   store i8 49, i8* %arrayidx2, align 1, !dbg !73
-  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !74), !dbg !75
+  call void @llvm.dbg.declare(metadata !{double* %result}, metadata !74, metadata !{metadata !"0x102"}), !dbg !75
   %arraydecay = getelementptr inbounds [2 x [2 x double]]* %d, i32 0, i32 0, !dbg !75
   %call = call double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %f, [2 x double]* %arraydecay, %struct.char_struct* %s, i32** null, i16 zeroext 10, i64 42), !dbg !75
   store double %call, double* %result, align 8, !dbg !75
@@ -133,79 +133,79 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!78}
 
-!0 = metadata !{i32 786449, metadata !77, i32 4, metadata !"clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !43, null, metadata !""} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-parameters.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)\001\00\000\00\000", metadata !77, metadata !1, metadata !1, metadata !3, metadata !43, null} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-parameters.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !10, metadata !38}
-!5 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 27, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 28} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo]
-!6 = metadata !{i32 786473, metadata !77} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\0027\000\001\000\006\00256\000\0028", metadata !77, metadata !6, metadata !7, null, i32 ()* @_Z3foov, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo]
+!6 = metadata !{metadata !"0x29", metadata !77} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"test_parameters", metadata !"test_parameters", metadata !"_Z15test_parametersPfPA2_dR11char_structPPitm", i32 32, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1, i32 33} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters]
-!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x2e\00test_parameters\00test_parameters\00_Z15test_parametersPfPA2_dR11char_structPPitm\0032\000\001\000\006\00256\000\0033", metadata !77, metadata !6, metadata !11, null, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13, metadata !14, metadata !16, metadata !20, metadata !33, metadata !35, metadata !36}
-!13 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!14 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float]
-!15 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
-!16 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!17 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !13, metadata !18, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
+!13 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !15} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float]
+!15 = metadata !{metadata !"0x24\00float\000\0032\0032\000\000\004", null, null} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!16 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!17 = metadata !{metadata !"0x1\00\000\00128\0064\000\000", null, metadata !"", metadata !13, metadata !18, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
 !18 = metadata !{metadata !19}
-!19 = metadata !{i32 786465, i64 0, i64 2}        ; [ DW_TAG_subrange_type ] [0, 1]
-!20 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !21} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct]
-!21 = metadata !{i32 786451, metadata !77, null, metadata !"char_struct", i32 22, i64 24, i64 8, i32 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [def] [from ]
+!19 = metadata !{metadata !"0x21\000\002"}        ; [ DW_TAG_subrange_type ] [0, 1]
+!20 = metadata !{metadata !"0x10\00\000\000\000\000\000", null, null, metadata !21} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct]
+!21 = metadata !{metadata !"0x13\00char_struct\0022\0024\008\000\000\000", metadata !77, null, null, metadata !22, null, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [def] [from ]
 !22 = metadata !{metadata !23, metadata !25, metadata !27}
-!23 = metadata !{i32 786445, metadata !77, metadata !21, metadata !"c", i32 23, i64 8, i64 8, i64 0, i32 0, metadata !24} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char]
-!24 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!25 = metadata !{i32 786445, metadata !77, metadata !21, metadata !"c2", i32 24, i64 16, i64 8, i64 8, i32 0, metadata !26} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ]
-!26 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !24, metadata !18, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
-!27 = metadata !{i32 786478, metadata !77, metadata !21, metadata !"char_struct", metadata !"char_struct", metadata !"", i32 22, metadata !28, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !31, i32 22} ; [ DW_TAG_subprogram ] [line 22] [char_struct]
-!28 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{metadata !"0xd\00c\0023\008\008\000\000", metadata !77, metadata !21, metadata !24} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char]
+!24 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!25 = metadata !{metadata !"0xd\00c2\0024\0016\008\008\000", metadata !77, metadata !21, metadata !26} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ]
+!26 = metadata !{metadata !"0x1\00\000\0016\008\000\000", null, metadata !"", metadata !24, metadata !18, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
+!27 = metadata !{metadata !"0x2e\00char_struct\00char_struct\00\0022\000\000\000\006\00320\000\0022", metadata !77, metadata !21, metadata !28, null, null, null, i32 0, metadata !31} ; [ DW_TAG_subprogram ] [line 22] [char_struct]
+!28 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !29 = metadata !{null, metadata !30}
-!30 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !21} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct]
+!30 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", i32 0, metadata !"", metadata !21} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct]
 !31 = metadata !{metadata !32}
-!32 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
-!33 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !34} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!34 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!35 = metadata !{i32 786468, null, null, metadata !"unsigned short", i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
-!36 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !37} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int]
-!37 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
-!38 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 38, metadata !39, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !1, i32 39} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main]
-!39 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !40, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!32 = metadata !{metadata !"0x24"}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !34} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!34 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!35 = metadata !{metadata !"0x24\00unsigned short\000\0016\0016\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
+!36 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, metadata !"", metadata !37} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int]
+!37 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!38 = metadata !{metadata !"0x2e\00main\00main\00\0038\000\001\000\006\00256\000\0039", metadata !77, metadata !6, metadata !39, null, i32 (i32, i8**)* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main]
+!39 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, metadata !"", null, metadata !40, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !40 = metadata !{metadata !9, metadata !9, metadata !41}
-!41 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !42} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!42 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!41 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !42} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!42 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !"", metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
 !43 = metadata !{metadata !45}
-!45 = metadata !{i32 786484, i32 0, null, metadata !"compound_char", metadata !"compound_char", metadata !"", metadata !6, i32 25, metadata !21, i32 0, i32 1, %struct.char_struct* @compound_char, null} ; [ DW_TAG_variable ] [compound_char] [line 25] [def]
+!45 = metadata !{metadata !"0x34\00compound_char\00compound_char\00\0025\000\001", null, metadata !6, metadata !21, %struct.char_struct* @compound_char, null} ; [ DW_TAG_variable ] [compound_char] [line 25] [def]
 !46 = metadata !{i32 29, i32 0, metadata !47, null}
-!47 = metadata !{i32 786443, metadata !77, metadata !5, i32 28, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp]
-!48 = metadata !{i32 786689, metadata !10, metadata !"pf", metadata !6, i32 16777248, metadata !14, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [pf] [line 32]
+!47 = metadata !{metadata !"0xb\0028\000\000", metadata !77, metadata !5} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp]
+!48 = metadata !{metadata !"0x101\00pf\0016777248\000", metadata !10, metadata !6, metadata !14} ; [ DW_TAG_arg_variable ] [pf] [line 32]
 !49 = metadata !{i32 32, i32 0, metadata !10, null}
-!50 = metadata !{i32 786689, metadata !10, metadata !"ppd", metadata !6, i32 33554464, metadata !16, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ppd] [line 32]
-!51 = metadata !{i32 786689, metadata !10, metadata !"s", metadata !6, i32 50331680, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [s] [line 32]
-!52 = metadata !{i32 786689, metadata !10, metadata !"ppn", metadata !6, i32 67108896, metadata !33, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ppn] [line 32]
-!53 = metadata !{i32 786689, metadata !10, metadata !"us", metadata !6, i32 83886112, metadata !35, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [us] [line 32]
-!54 = metadata !{i32 786689, metadata !10, metadata !"l", metadata !6, i32 100663328, metadata !36, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [l] [line 32]
-!55 = metadata !{i32 786688, metadata !56, metadata !"result", metadata !6, i32 34, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 34]
-!56 = metadata !{i32 786443, metadata !77, metadata !10, i32 33, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp]
+!50 = metadata !{metadata !"0x101\00ppd\0033554464\000", metadata !10, metadata !6, metadata !16} ; [ DW_TAG_arg_variable ] [ppd] [line 32]
+!51 = metadata !{metadata !"0x101\00s\0050331680\000", metadata !10, metadata !6, metadata !20} ; [ DW_TAG_arg_variable ] [s] [line 32]
+!52 = metadata !{metadata !"0x101\00ppn\0067108896\000", metadata !10, metadata !6, metadata !33} ; [ DW_TAG_arg_variable ] [ppn] [line 32]
+!53 = metadata !{metadata !"0x101\00us\0083886112\000", metadata !10, metadata !6, metadata !35} ; [ DW_TAG_arg_variable ] [us] [line 32]
+!54 = metadata !{metadata !"0x101\00l\00100663328\000", metadata !10, metadata !6, metadata !36} ; [ DW_TAG_arg_variable ] [l] [line 32]
+!55 = metadata !{metadata !"0x100\00result\0034\000", metadata !56, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [result] [line 34]
+!56 = metadata !{metadata !"0xb\0033\000\001", metadata !77, metadata !10} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp]
 !57 = metadata !{i32 34, i32 0, metadata !56, null}
 !58 = metadata !{i32 35, i32 0, metadata !56, null}
-!59 = metadata !{i32 786689, metadata !38, metadata !"argc", metadata !6, i32 16777254, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 38]
+!59 = metadata !{metadata !"0x101\00argc\0016777254\000", metadata !38, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ] [argc] [line 38]
 !60 = metadata !{i32 38, i32 0, metadata !38, null}
-!61 = metadata !{i32 786689, metadata !38, metadata !"argv", metadata !6, i32 33554470, metadata !41, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 38]
-!62 = metadata !{i32 786688, metadata !63, metadata !"s", metadata !6, i32 40, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [s] [line 40]
-!63 = metadata !{i32 786443, metadata !77, metadata !38, i32 39, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp]
+!61 = metadata !{metadata !"0x101\00argv\0033554470\000", metadata !38, metadata !6, metadata !41} ; [ DW_TAG_arg_variable ] [argv] [line 38]
+!62 = metadata !{metadata !"0x100\00s\0040\000", metadata !63, metadata !6, metadata !21} ; [ DW_TAG_auto_variable ] [s] [line 40]
+!63 = metadata !{metadata !"0xb\0039\000\002", metadata !77, metadata !38} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp]
 !64 = metadata !{i32 40, i32 0, metadata !63, null}
-!65 = metadata !{i32 786688, metadata !63, metadata !"f", metadata !6, i32 41, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [f] [line 41]
+!65 = metadata !{metadata !"0x100\00f\0041\000", metadata !63, metadata !6, metadata !15} ; [ DW_TAG_auto_variable ] [f] [line 41]
 !66 = metadata !{i32 41, i32 0, metadata !63, null}
-!67 = metadata !{i32 786688, metadata !63, metadata !"d", metadata !6, i32 42, metadata !68, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 42]
-!68 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !13, metadata !69, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
+!67 = metadata !{metadata !"0x100\00d\0042\000", metadata !63, metadata !6, metadata !68} ; [ DW_TAG_auto_variable ] [d] [line 42]
+!68 = metadata !{metadata !"0x1\00\000\00256\0064\000\000", null, metadata !"", metadata !13, metadata !69, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
 !69 = metadata !{metadata !19, metadata !19}
 !70 = metadata !{i32 42, i32 0, metadata !63, null}
 !71 = metadata !{i32 44, i32 0, metadata !63, null}
 !72 = metadata !{i32 45, i32 0, metadata !63, null}
 !73 = metadata !{i32 46, i32 0, metadata !63, null}
-!74 = metadata !{i32 786688, metadata !63, metadata !"result", metadata !6, i32 48, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 48]
+!74 = metadata !{metadata !"0x100\00result\0048\000", metadata !63, metadata !6, metadata !13} ; [ DW_TAG_auto_variable ] [result] [line 48]
 !75 = metadata !{i32 48, i32 0, metadata !63, null}
 !76 = metadata !{i32 49, i32 0, metadata !63, null}
 !77 = metadata !{metadata !"test-parameters.cpp", metadata !"/home/akaylor/dev"}
-!78 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!78 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/LTO/Inputs/bcsection.macho.s b/test/LTO/Inputs/bcsection.macho.s
new file mode 100644
index 0000000..cb7fe03
--- /dev/null
+++ b/test/LTO/Inputs/bcsection.macho.s

@@ -0,0 +1,2 @@
+.section .llvmbc,.llvmbc
+.incbin "bcsection.bc"

diff --git a/test/LTO/Inputs/bcsection.s b/test/LTO/Inputs/bcsection.s
new file mode 100644
index 0000000..ede1e5c
--- /dev/null
+++ b/test/LTO/Inputs/bcsection.s

@@ -0,0 +1,2 @@
+.section .llvmbc
+.incbin "bcsection.bc"

diff --git a/test/LTO/bcsection.ll b/test/LTO/bcsection.ll
new file mode 100644
index 0000000..e65ade6
--- /dev/null
+++ b/test/LTO/bcsection.ll

@@ -0,0 +1,21 @@
+; RUN: llvm-as -o %T/bcsection.bc %s
+
+; RUN: llvm-mc -I=%T -filetype=obj -triple=x86_64-pc-win32 -o %T/bcsection.coff.bco %p/Inputs/bcsection.s
+; RUN: llvm-nm %T/bcsection.coff.bco | FileCheck %s
+; RUN: llvm-lto -exported-symbol=main -exported-symbol=_main -o %T/bcsection.coff.o %T/bcsection.coff.bco
+; RUN: llvm-nm %T/bcsection.coff.o | FileCheck %s
+
+; RUN: llvm-mc -I=%T -filetype=obj -triple=x86_64-unknown-linux-gnu -o %T/bcsection.elf.bco %p/Inputs/bcsection.s
+; RUN: llvm-nm %T/bcsection.elf.bco | FileCheck %s
+; RUN: llvm-lto -exported-symbol=main -exported-symbol=_main -o %T/bcsection.elf.o %T/bcsection.elf.bco
+; RUN: llvm-nm %T/bcsection.elf.o | FileCheck %s
+
+; RUN: llvm-mc -I=%T -filetype=obj -triple=x86_64-apple-darwin11 -o %T/bcsection.macho.bco %p/Inputs/bcsection.macho.s
+; RUN: llvm-nm %T/bcsection.macho.bco | FileCheck %s
+; RUN: llvm-lto -exported-symbol=main -exported-symbol=_main -o %T/bcsection.macho.o %T/bcsection.macho.bco
+; RUN: llvm-nm %T/bcsection.macho.o | FileCheck %s
+
+; CHECK: main
+define i32 @main() {
+  ret i32 0
+}

diff --git a/test/LTO/diagnostic-handler-remarks.ll b/test/LTO/diagnostic-handler-remarks.ll
new file mode 100644
index 0000000..4da9101
--- /dev/null
+++ b/test/LTO/diagnostic-handler-remarks.ll

@@ -0,0 +1,40 @@
+; RUN: llvm-as < %s >%t.bc
+; PR21108: Diagnostic handlers get pass remarks, even if they're not enabled.
+
+; Confirm that there are -pass-remarks.
+; RUN: llvm-lto -pass-remarks=inline \
+; RUN:          -exported-symbol _main -o %t.o %t.bc 2>&1 | \
+; RUN:     FileCheck %s -allow-empty -check-prefix=REMARKS
+; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM
+
+; RUN: llvm-lto -pass-remarks=inline -use-diagnostic-handler \
+; RUN:         -exported-symbol _main -o %t.o %t.bc 2>&1 | \
+; RUN:     FileCheck %s -allow-empty -check-prefix=REMARKS
+; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM
+
+; Confirm that -pass-remarks are not printed by default.
+; RUN: llvm-lto \
+; RUN:         -exported-symbol _main -o %t.o %t.bc 2>&1 | \
+; RUN:     FileCheck %s -allow-empty
+; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM
+
+; RUN: llvm-lto -use-diagnostic-handler \
+; RUN:         -exported-symbol _main -o %t.o %t.bc 2>&1 | \
+; RUN:     FileCheck %s -allow-empty
+; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM
+
+; REMARKS: remark:
+; CHECK-NOT: remark:
+; NM-NOT: foo
+; NM: main
+
+target triple = "x86_64-apple-darwin"
+
+define i32 @foo() {
+  ret i32 7
+}
+
+define i32 @main() {
+  %i = call i32 @foo()
+  ret i32 %i
+}

diff --git a/test/LTO/jump-table-type.ll b/test/LTO/jump-table-type.ll
index a39d3e9..a806c30 100644
--- a/test/LTO/jump-table-type.ll
+++ b/test/LTO/jump-table-type.ll

@@ -2,8 +2,8 @@
 ; RUN: llvm-lto -o %t2 %t1 -jump-table-type=arity
 ; RUN: llvm-nm %t2 | FileCheck %s
 
-; CHECK: T __llvm_jump_instr_table_0_1
-; CHECK: T __llvm_jump_instr_table_1_1
+; CHECK: t __llvm_jump_instr_table_0_1
+; CHECK: t __llvm_jump_instr_table_1_1
 
 target triple = "x86_64-unknown-linux-gnu"
 

diff --git a/test/Linker/2003-01-30-LinkerRename.ll b/test/Linker/2003-01-30-LinkerRename.ll
index cbf7541..1e25d3e 100644
--- a/test/Linker/2003-01-30-LinkerRename.ll
+++ b/test/Linker/2003-01-30-LinkerRename.ll

@@ -1,10 +1,17 @@
-; This fails because the linker renames the external symbol not the internal 
-; one...
-
-; RUN: echo "define internal i32 @foo() { ret i32 7 } " | llvm-as > %t.1.bc
+; RUN: llvm-as %S/Inputs/2003-01-30-LinkerRename.ll -o %t.1.bc
 ; RUN: llvm-as %s -o %t.2.bc
 ; RUN: llvm-link %t.1.bc %t.2.bc -S | FileCheck %s
-; CHECK: internal{{.*}}@foo{{[0-9]}}()
 
-define i32 @foo() { ret i32 0 }
+; CHECK: @bar = global i32 ()* @foo2
 
+; CHECK:      define internal i32 @foo2() {
+; CHECK-NEXT:   ret i32 7
+; CHECK-NEXT: }
+
+; CHECK:      define i32 @foo() {
+; CHECK-NEXT:   ret i32 0
+; CHECK-NEXT: }
+
+define i32 @foo() {
+  ret i32 0
+}

diff --git a/test/Linker/2003-05-31-LinkerRename.ll b/test/Linker/2003-05-31-LinkerRename.ll
index 2e734be..0261fe3 100644
--- a/test/Linker/2003-05-31-LinkerRename.ll
+++ b/test/Linker/2003-05-31-LinkerRename.ll

@@ -1,18 +1,23 @@
-; The funcresolve pass will (intentionally) llvm-link an _internal_ function 
-; body with an external declaration.  Because of this, if we LINK an internal 
-; function body into a program that already has an external declaration for 
-; the function name, we must rename the internal function to something that 
-; does not conflict.
-
-; RUN: echo " define internal i32 @foo() { ret i32 7 } " | llvm-as > %t.1.bc
-; RUN: llvm-as < %s > %t.2.bc
+; RUN: llvm-as %S/Inputs/2003-05-31-LinkerRename.ll -o %t.1.bc
+; RUN: llvm-as  %s -o %t.2.bc
 ; RUN: llvm-link %t.1.bc %t.2.bc -S | FileCheck %s
-; CHECK: internal {{.*}} @foo{{[0-9]}}(
 
-declare i32 @foo() 
+; CHECK: @bar = global i32 ()* @foo2
 
-define i32 @test() { 
+; CHECK:      define internal i32 @foo2() {
+; CHECK-NEXT:   ret i32 7
+; CHECK-NEXT: }
+
+; CHECK: declare i32 @foo()
+
+; CHECK:      define i32 @test() {
+; CHECK-NEXT:   %X = call i32 @foo()
+; CHECK-NEXT:   ret i32 %X
+; CHECK-NEXT: }
+
+declare i32 @foo()
+
+define i32 @test() {
   %X = call i32 @foo()
   ret i32 %X
 }
-

diff --git a/test/Linker/2008-03-05-AliasReference.ll b/test/Linker/2008-03-05-AliasReference.ll
index 7c19dfa..8ce1ccb 100644
--- a/test/Linker/2008-03-05-AliasReference.ll
+++ b/test/Linker/2008-03-05-AliasReference.ll

@@ -8,7 +8,7 @@
 target triple = "x86_64-unknown-linux-gnu"
 @foo = weak global i32 0		; <i32*> [#uses=1]
 
-@bar = alias weak i32* @foo		; <i32*> [#uses=1]
+@bar = weak alias i32* @foo		; <i32*> [#uses=1]
 
 define i32 @baz() nounwind  {
 entry:

diff --git a/test/Linker/2009-09-03-mdnode.ll b/test/Linker/2009-09-03-mdnode.ll
index 11862f7..d9871b2 100644
--- a/test/Linker/2009-09-03-mdnode.ll
+++ b/test/Linker/2009-09-03-mdnode.ll

@@ -26,5 +26,6 @@
 
 declare void @llvm.dbg.region.end(metadata) nounwind readnone
 
-!0 = metadata !{i32 458798, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"main", metadata !1, i32 2, null, i1 false, i1 true}
-!1 = metadata !{i32 458769, i32 0, i32 12, metadata !"a.c", metadata !"/home/rich/ellcc/test/source", metadata !"ellcc 0.1.0", i1 true, i1 true, metadata !"", i32 0}
+!0 = metadata !{metadata !"0x2e\00main\00main\00main\002\000\001\000\006\000\000\000", i32 0, metadata !1, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x11\0012\00ellcc 0.1.0\001\00\000\00\000", metadata !2, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{metadata !"a.c", metadata !"/home/rich/ellcc/test/source"}

diff --git a/test/Linker/2009-09-03-mdnode2.ll b/test/Linker/2009-09-03-mdnode2.ll
index 21589a4..b01f947 100644
--- a/test/Linker/2009-09-03-mdnode2.ll
+++ b/test/Linker/2009-09-03-mdnode2.ll

@@ -21,5 +21,6 @@
 
 declare void @llvm.dbg.region.end(metadata) nounwind readnone
 
-!0 = metadata !{i32 458798, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"f", metadata !1, i32 1, null, i1 false, i1 true}
-!1 = metadata !{i32 458769, i32 0, i32 12, metadata !"b.c", metadata !"/home/rich/ellcc/test/source", metadata !"ellcc 0.1.0", i1 true, i1 true, metadata !"", i32 0}
+!0 = metadata !{metadata !"0x2e\00f\00f\00f\001\000\001\000\006\000\000\000", i32 0, metadata !1, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x11\0012\00ellcc 0.1.0\001\00\000\00\000", metadata !2, null, null, null, null, null} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{metadata !"b.c", metadata !"/home/rich/ellcc/test/source"}

diff --git a/test/Linker/2011-08-04-DebugLoc.ll b/test/Linker/2011-08-04-DebugLoc.ll
index d26e8cd..a9307af 100644
--- a/test/Linker/2011-08-04-DebugLoc.ll
+++ b/test/Linker/2011-08-04-DebugLoc.ll

@@ -17,15 +17,15 @@
 !llvm.module.flags = !{!11}
 !llvm.dbg.sp = !{!1}
 
-!0 = metadata !{i32 589841, metadata !8, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !9, metadata !9, metadata !10, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 589870, metadata !8, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!2 = metadata !{i32 589865, metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 589845, metadata !8, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)\001\00\000\00\000", metadata !8, metadata !9, metadata !9, metadata !10, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\000\000\000", metadata !8, metadata !2, metadata !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!2 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 589860, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 2, i32 13, metadata !7, null}
-!7 = metadata !{i32 589835, metadata !8, metadata !1, i32 2, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{metadata !"0xb\002\0011\000", metadata !8, metadata !1} ; [ DW_TAG_lexical_block ]
 !8 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !9 = metadata !{i32 0}
 !10 = metadata !{metadata !1}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/2011-08-04-DebugLoc2.ll b/test/Linker/2011-08-04-DebugLoc2.ll
index c20941d..948dd18 100644
--- a/test/Linker/2011-08-04-DebugLoc2.ll
+++ b/test/Linker/2011-08-04-DebugLoc2.ll

@@ -14,15 +14,15 @@
 !llvm.module.flags = !{!11}
 !llvm.dbg.sp = !{!1}
 
-!0 = metadata !{i32 589841, metadata !8, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !9, metadata !9, metadata !10, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 589870, metadata !8, metadata !2, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
-!2 = metadata !{i32 589865, metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 589845, metadata !8, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)\001\00\000\00\000", metadata !8, metadata !9, metadata !9, metadata !10, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\000", metadata !8, metadata !2, metadata !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
+!2 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 589860, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 1, i32 13, metadata !7, null}
-!7 = metadata !{i32 589835, metadata !8, metadata !1, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{metadata !"0xb\001\0011\000", metadata !8, metadata !1} ; [ DW_TAG_lexical_block ]
 !8 = metadata !{metadata !"b.c", metadata !"/private/tmp"}
 !9 = metadata !{i32 0}
 !10 = metadata !{metadata !1}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/2011-08-04-Metadata.ll b/test/Linker/2011-08-04-Metadata.ll
index cdf4f6f..7bdbb33 100644
--- a/test/Linker/2011-08-04-Metadata.ll
+++ b/test/Linker/2011-08-04-Metadata.ll

@@ -2,7 +2,8 @@
 ; RUN: llvm-dis < %t.bc | FileCheck %s
 ; Test if internal global variable's debug info is merged appropriately or not.
 
-;CHECK:  metadata !{i32 589876, i32 0, metadata !{{[0-9]+}}, metadata !"x", metadata !"x", metadata !"", metadata !{{[0-9]+}}, i32 1, metadata !{{[0-9]+}}, i32 1, i32 1, i32* @x1}
+;CHECK:  metadata !{metadata !"0x34\00x\00x\00\002\001\001", metadata !{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}, i32* @x}
+;CHECK:  metadata !{metadata !"0x34\00x\00x\00\001\001\001", metadata !{{[0-9]+}}, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}, i32* @x1}
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-macosx10.7.0"
 
@@ -19,15 +20,15 @@
 !llvm.dbg.sp = !{!1}
 !llvm.dbg.gv = !{!5}
 
-!0 = metadata !{i32 589841, metadata !9, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !10, null, null, metadata !""}
-!1 = metadata !{i32 589870, metadata !9, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
-!2 = metadata !{i32 589865, metadata !9}
-!3 = metadata !{i32 589845, metadata !9, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", metadata !9, metadata !4, metadata !4, metadata !10, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\000\000\000", metadata !9, metadata !2, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
+!2 = metadata !{metadata !"0x29", metadata !9} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !9, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 589876, i32 0, metadata !0, metadata !"x", metadata !"x", metadata !"", metadata !2, i32 2, metadata !6, i32 1, i32 1, i32* @x}
-!6 = metadata !{i32 589860, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!5 = metadata !{metadata !"0x34\00x\00x\00\002\001\001", metadata !0, metadata !2, metadata !6, i32* @x} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 3, i32 14, metadata !8, null}
-!8 = metadata !{i32 589835, metadata !9, metadata !1, i32 3, i32 12, i32 0}
+!8 = metadata !{metadata !"0xb\003\0012\000", metadata !9, metadata !1} ; [ DW_TAG_lexical_block ]
 !9 = metadata !{metadata !"/tmp/one.c", metadata !"/Volumes/Lalgate/Slate/D"}
 !10 = metadata !{metadata !1}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/2011-08-04-Metadata2.ll b/test/Linker/2011-08-04-Metadata2.ll
index 80884cc..fcf72aa 100644
--- a/test/Linker/2011-08-04-Metadata2.ll
+++ b/test/Linker/2011-08-04-Metadata2.ll

@@ -19,15 +19,15 @@
 !llvm.dbg.sp = !{!1}
 !llvm.dbg.gv = !{!5}
 
-!0 = metadata !{i32 589841, metadata !9, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !10, null, null, metadata !""}
-!1 = metadata !{i32 589870, metadata !9, metadata !2, metadata !"bar", metadata !"bar", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [bar]
-!2 = metadata !{i32 589865, metadata !9}
-!3 = metadata !{i32 589845, metadata !9, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 ()\001\00\000\00\000", metadata !9, metadata !4, metadata !4, metadata !10, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00bar\00bar\00\002\000\001\000\006\000\000\000", metadata !9, metadata !2, metadata !3, null, void ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [bar]
+!2 = metadata !{metadata !"0x29", metadata !9} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !9, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 589876, i32 0, metadata !0, metadata !"x", metadata !"x", metadata !"", metadata !2, i32 1, metadata !6, i32 1, i32 1, i32* @x}
-!6 = metadata !{i32 589860, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!5 = metadata !{metadata !"0x34\00x\00x\00\001\001\001", metadata !0, metadata !2, metadata !6, i32* @x} ; [ DW_TAG_variable ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 2, i32 14, metadata !8, null}
-!8 = metadata !{i32 589835, metadata !9, metadata !1, i32 2, i32 12, i32 0}
+!8 = metadata !{metadata !"0xb\002\0012\000", metadata !9, metadata !1} ; [ DW_TAG_lexical_block ]
 !9 = metadata !{metadata !"/tmp/two.c", metadata !"/Volumes/Lalgate/Slate/D"}
 !10 = metadata !{metadata !1}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/2011-08-18-unique-class-type.ll b/test/Linker/2011-08-18-unique-class-type.ll
index b077f23..6fa2126 100644
--- a/test/Linker/2011-08-18-unique-class-type.ll
+++ b/test/Linker/2011-08-18-unique-class-type.ll

@@ -11,30 +11,30 @@
 define void @_Z3fooN2N11AE() nounwind uwtable ssp {
 entry:
   %mya = alloca %"class.N1::A", align 1
-  call void @llvm.dbg.declare(metadata !{%"class.N1::A"* %mya}, metadata !9), !dbg !13
+  call void @llvm.dbg.declare(metadata !{%"class.N1::A"* %mya}, metadata !9, metadata !{metadata !"0x102"}), !dbg !13
   ret void, !dbg !14
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{i32 720913, metadata !16, i32 4, metadata !"clang version 3.0 (trunk 137954)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 137954)\001\00\000\00\000", metadata !16, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !16, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooN2N11AE", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3fooN2N11AE, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
-!6 = metadata !{i32 720937, metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, metadata !16, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3fooN2N11AE\004\000\001\000\006\00256\000\000", metadata !16, metadata !6, metadata !7, null, void ()* @_Z3fooN2N11AE, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
+!6 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !16, metadata !6, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
-!9 = metadata !{i32 721153, metadata !5, metadata !"mya", metadata !6, i32 16777220, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{i32 720898, metadata !17, metadata !11, metadata !"A", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
-!11 = metadata !{i32 720953, metadata !17, null, metadata !"N1", i32 2} ; [ DW_TAG_namespace ]
-!12 = metadata !{i32 720937, metadata !17} ; [ DW_TAG_file_type ]
+!9 = metadata !{metadata !"0x101\00mya\0016777220\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
+!10 = metadata !{metadata !"0x2\00A\003\008\008\000\000\000", metadata !17, metadata !11, null, metadata !2, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
+!11 = metadata !{metadata !"0x39\00N1\002", metadata !17, null} ; [ DW_TAG_namespace ]
+!12 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
 !13 = metadata !{i32 4, i32 12, metadata !5, null}
 !14 = metadata !{i32 4, i32 18, metadata !15, null}
-!15 = metadata !{i32 720907, metadata !16, metadata !5, i32 4, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0xb\004\0017\000", metadata !16, metadata !5} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{metadata !"n1.c", metadata !"/private/tmp"}
 !17 = metadata !{metadata !"./n.h", metadata !"/private/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/2011-08-18-unique-class-type2.ll b/test/Linker/2011-08-18-unique-class-type2.ll
index 7bfcd91..97fdcd0 100644
--- a/test/Linker/2011-08-18-unique-class-type2.ll
+++ b/test/Linker/2011-08-18-unique-class-type2.ll

@@ -9,30 +9,30 @@
 define void @_Z3barN2N11AE() nounwind uwtable ssp {
 entry:
   %youra = alloca %"class.N1::A", align 1
-  call void @llvm.dbg.declare(metadata !{%"class.N1::A"* %youra}, metadata !9), !dbg !13
+  call void @llvm.dbg.declare(metadata !{%"class.N1::A"* %youra}, metadata !9, metadata !{metadata !"0x102"}), !dbg !13
   ret void, !dbg !14
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18}
 
-!0 = metadata !{i32 720913, metadata !16, i32 4, metadata !"clang version 3.0 (trunk 137954)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.0 (trunk 137954)\001\00\000\00\000", metadata !16, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"bar", metadata !"bar", metadata !"_Z3barN2N11AE", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3barN2N11AE, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
-!6 = metadata !{i32 720937, metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, metadata !16, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barN2N11AE\004\000\001\000\006\00256\000\000", i32 0, metadata !6, metadata !7, null, void ()* @_Z3barN2N11AE, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
+!6 = metadata !{metadata !"0x29", metadata !16} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !16, metadata !6, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
-!9 = metadata !{i32 721153, metadata !5, metadata !"youra", metadata !6, i32 16777220, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{i32 720898, metadata !17, metadata !11, metadata !"A", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
-!11 = metadata !{i32 720953, metadata !17, null, metadata !"N1", i32 2} ; [ DW_TAG_namespace ]
-!12 = metadata !{i32 720937, metadata !17} ; [ DW_TAG_file_type ]
+!9 = metadata !{metadata !"0x101\00youra\0016777220\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
+!10 = metadata !{metadata !"0x2\00A\003\008\008\000\000\000", metadata !17, metadata !11, null, metadata !2, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
+!11 = metadata !{metadata !"0x39\00N1\002", metadata !17, null} ; [ DW_TAG_namespace ]
+!12 = metadata !{metadata !"0x29", metadata !17} ; [ DW_TAG_file_type ]
 !13 = metadata !{i32 4, i32 12, metadata !5, null}
 !14 = metadata !{i32 4, i32 20, metadata !15, null}
-!15 = metadata !{i32 720907, metadata !16, metadata !5, i32 4, i32 19, i32 0} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0xb\004\0019\000", metadata !16, metadata !5} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{metadata !"n2.c", metadata !"/private/tmp"}
 !17 = metadata !{metadata !"./n.h", metadata !"/private/tmp"}
-!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/2011-08-18-unique-debug-type.ll b/test/Linker/2011-08-18-unique-debug-type.ll
index 0e14f46..e9dcf87 100644
--- a/test/Linker/2011-08-18-unique-debug-type.ll
+++ b/test/Linker/2011-08-18-unique-debug-type.ll

@@ -1,6 +1,6 @@
 ; RUN: llvm-link %s %p/2011-08-18-unique-debug-type2.ll -S -o - | FileCheck %s
 ; Test to check only one MDNode for "int" after linking.
-; CHECK: !"int"
+; CHECK: !"0x24\00int\00{{.*}}"
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-macosx10.7.0"
 
@@ -12,16 +12,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{i32 720913, metadata !12, i32 12, metadata !"clang version 3.0 (trunk 137954)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 137954)\001\00\000\00\000", metadata !12, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !12, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
-!6 = metadata !{i32 720937, metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, metadata !12, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\000\000\000", metadata !12, metadata !6, metadata !7, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
+!6 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !6, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 1, i32 13, metadata !11, null}
-!11 = metadata !{i32 720907, metadata !12, metadata !5, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{metadata !"0xb\001\0011\000", metadata !12, metadata !5} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/2011-08-18-unique-debug-type2.ll b/test/Linker/2011-08-18-unique-debug-type2.ll
index 1185100..7bbed9f 100644
--- a/test/Linker/2011-08-18-unique-debug-type2.ll
+++ b/test/Linker/2011-08-18-unique-debug-type2.ll

@@ -12,16 +12,16 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!13}
 
-!0 = metadata !{i32 720913, metadata !12, i32 12, metadata !"clang version 3.0 (trunk 137954)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 137954)\001\00\000\00\000", metadata !12, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !12, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
-!6 = metadata !{i32 720937, metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, metadata !12, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00bar\00bar\00\001\000\001\000\006\000\000\000", metadata !12, metadata !6, metadata !7, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
+!6 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !6, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 1, i32 13, metadata !11, null}
-!11 = metadata !{i32 720907, metadata !12, metadata !5, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{metadata !"0xb\001\0011\000", metadata !12, metadata !5} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{metadata !"two.c", metadata !"/private/tmp"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/2011-08-22-ResolveAlias.ll b/test/Linker/2011-08-22-ResolveAlias.ll
index 6b99233..d6df82d 100644
--- a/test/Linker/2011-08-22-ResolveAlias.ll
+++ b/test/Linker/2011-08-22-ResolveAlias.ll

@@ -7,32 +7,32 @@
 %union.pthread_mutexattr_t = type { [4 x i8] }
 %union.pthread_cond_t = type { [48 x i8] }
 
-@_ZL20__gthrw_pthread_oncePiPFvvE = alias weak i32 (i32*, void ()*)* @pthread_once
-@_ZL27__gthrw_pthread_getspecificj = alias weak i8* (i32)* @pthread_getspecific
-@_ZL27__gthrw_pthread_setspecificjPKv = alias weak i32 (i32, i8*)* @pthread_setspecific
-@_ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_ = alias weak i32 (i64*, %union.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create
-@_ZL20__gthrw_pthread_joinmPPv = alias weak i32 (i64, i8**)* @pthread_join
-@_ZL21__gthrw_pthread_equalmm = alias weak i32 (i64, i64)* @pthread_equal
-@_ZL20__gthrw_pthread_selfv = alias weak i64 ()* @pthread_self
-@_ZL22__gthrw_pthread_detachm = alias weak i32 (i64)* @pthread_detach
-@_ZL22__gthrw_pthread_cancelm = alias weak i32 (i64)* @pthread_cancel
-@_ZL19__gthrw_sched_yieldv = alias weak i32 ()* @sched_yield
-@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = alias weak i32 (%union.pthread_mutex_t*)* @pthread_mutex_lock
-@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = alias weak i32 (%union.pthread_mutex_t*)* @pthread_mutex_trylock
-@_ZL31__gthrw_pthread_mutex_timedlockP15pthread_mutex_tPK8timespec = alias weak i32 (%union.pthread_mutex_t*, %struct.timespec*)* @pthread_mutex_timedlock
-@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = alias weak i32 (%union.pthread_mutex_t*)* @pthread_mutex_unlock
-@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = alias weak i32 (%union.pthread_mutex_t*, %union.pthread_mutexattr_t*)* @pthread_mutex_init
-@_ZL29__gthrw_pthread_mutex_destroyP15pthread_mutex_t = alias weak i32 (%union.pthread_mutex_t*)* @pthread_mutex_destroy
-@_ZL30__gthrw_pthread_cond_broadcastP14pthread_cond_t = alias weak i32 (%union.pthread_cond_t*)* @pthread_cond_broadcast
-@_ZL27__gthrw_pthread_cond_signalP14pthread_cond_t = alias weak i32 (%union.pthread_cond_t*)* @pthread_cond_signal
-@_ZL25__gthrw_pthread_cond_waitP14pthread_cond_tP15pthread_mutex_t = alias weak i32 (%union.pthread_cond_t*, %union.pthread_mutex_t*)* @pthread_cond_wait
-@_ZL30__gthrw_pthread_cond_timedwaitP14pthread_cond_tP15pthread_mutex_tPK8timespec = alias weak i32 (%union.pthread_cond_t*, %union.pthread_mutex_t*, %struct.timespec*)* @pthread_cond_timedwait
-@_ZL28__gthrw_pthread_cond_destroyP14pthread_cond_t = alias weak i32 (%union.pthread_cond_t*)* @pthread_cond_destroy
-@_ZL26__gthrw_pthread_key_createPjPFvPvE = alias weak i32 (i32*, void (i8*)*)* @pthread_key_create
-@_ZL26__gthrw_pthread_key_deletej = alias weak i32 (i32)* @pthread_key_delete
-@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = alias weak i32 (%union.pthread_mutexattr_t*)* @pthread_mutexattr_init
-@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = alias weak i32 (%union.pthread_mutexattr_t*, i32)* @pthread_mutexattr_settype
-@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = alias weak i32 (%union.pthread_mutexattr_t*)* @pthread_mutexattr_destroy
+@_ZL20__gthrw_pthread_oncePiPFvvE = weak alias i32 (i32*, void ()*)* @pthread_once
+@_ZL27__gthrw_pthread_getspecificj = weak alias i8* (i32)* @pthread_getspecific
+@_ZL27__gthrw_pthread_setspecificjPKv = weak alias i32 (i32, i8*)* @pthread_setspecific
+@_ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_ = weak alias i32 (i64*, %union.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create
+@_ZL20__gthrw_pthread_joinmPPv = weak alias i32 (i64, i8**)* @pthread_join
+@_ZL21__gthrw_pthread_equalmm = weak alias i32 (i64, i64)* @pthread_equal
+@_ZL20__gthrw_pthread_selfv = weak alias i64 ()* @pthread_self
+@_ZL22__gthrw_pthread_detachm = weak alias i32 (i64)* @pthread_detach
+@_ZL22__gthrw_pthread_cancelm = weak alias i32 (i64)* @pthread_cancel
+@_ZL19__gthrw_sched_yieldv = weak alias i32 ()* @sched_yield
+@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = weak alias i32 (%union.pthread_mutex_t*)* @pthread_mutex_lock
+@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = weak alias i32 (%union.pthread_mutex_t*)* @pthread_mutex_trylock
+@_ZL31__gthrw_pthread_mutex_timedlockP15pthread_mutex_tPK8timespec = weak alias i32 (%union.pthread_mutex_t*, %struct.timespec*)* @pthread_mutex_timedlock
+@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = weak alias i32 (%union.pthread_mutex_t*)* @pthread_mutex_unlock
+@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = weak alias i32 (%union.pthread_mutex_t*, %union.pthread_mutexattr_t*)* @pthread_mutex_init
+@_ZL29__gthrw_pthread_mutex_destroyP15pthread_mutex_t = weak alias i32 (%union.pthread_mutex_t*)* @pthread_mutex_destroy
+@_ZL30__gthrw_pthread_cond_broadcastP14pthread_cond_t = weak alias i32 (%union.pthread_cond_t*)* @pthread_cond_broadcast
+@_ZL27__gthrw_pthread_cond_signalP14pthread_cond_t = weak alias i32 (%union.pthread_cond_t*)* @pthread_cond_signal
+@_ZL25__gthrw_pthread_cond_waitP14pthread_cond_tP15pthread_mutex_t = weak alias i32 (%union.pthread_cond_t*, %union.pthread_mutex_t*)* @pthread_cond_wait
+@_ZL30__gthrw_pthread_cond_timedwaitP14pthread_cond_tP15pthread_mutex_tPK8timespec = weak alias i32 (%union.pthread_cond_t*, %union.pthread_mutex_t*, %struct.timespec*)* @pthread_cond_timedwait
+@_ZL28__gthrw_pthread_cond_destroyP14pthread_cond_t = weak alias i32 (%union.pthread_cond_t*)* @pthread_cond_destroy
+@_ZL26__gthrw_pthread_key_createPjPFvPvE = weak alias i32 (i32*, void (i8*)*)* @pthread_key_create
+@_ZL26__gthrw_pthread_key_deletej = weak alias i32 (i32)* @pthread_key_delete
+@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = weak alias i32 (%union.pthread_mutexattr_t*)* @pthread_mutexattr_init
+@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = weak alias i32 (%union.pthread_mutexattr_t*, i32)* @pthread_mutexattr_settype
+@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = weak alias i32 (%union.pthread_mutexattr_t*)* @pthread_mutexattr_destroy
 
 declare extern_weak i32 @pthread_once(i32*, void ()*)
 

diff --git a/test/Linker/2011-08-22-ResolveAlias2.ll b/test/Linker/2011-08-22-ResolveAlias2.ll
index eee60d4..c380c23 100644
--- a/test/Linker/2011-08-22-ResolveAlias2.ll
+++ b/test/Linker/2011-08-22-ResolveAlias2.ll

@@ -10,32 +10,32 @@
 %union.pthread_cond_t = type { [48 x i8] }
 
 @_ZN13HexxagonBoardC1ERKS_ = alias void (%struct.HexxagonBoard*, %struct.HexxagonBoard*)* @_ZN13HexxagonBoardC2ERKS_
-@_ZL20__gthrw_pthread_oncePiPFvvE = alias weak i32 (i32*, void ()*)* @pthread_once
-@_ZL27__gthrw_pthread_getspecificj = alias weak i8* (i32)* @pthread_getspecific
-@_ZL27__gthrw_pthread_setspecificjPKv = alias weak i32 (i32, i8*)* @pthread_setspecific
-@_ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_ = alias weak i32 (i64*, %union.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create
-@_ZL20__gthrw_pthread_joinmPPv = alias weak i32 (i64, i8**)* @pthread_join
-@_ZL21__gthrw_pthread_equalmm = alias weak i32 (i64, i64)* @pthread_equal
-@_ZL20__gthrw_pthread_selfv = alias weak i64 ()* @pthread_self
-@_ZL22__gthrw_pthread_detachm = alias weak i32 (i64)* @pthread_detach
-@_ZL22__gthrw_pthread_cancelm = alias weak i32 (i64)* @pthread_cancel
-@_ZL19__gthrw_sched_yieldv = alias weak i32 ()* @sched_yield
-@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = alias weak i32 (%union.pthread_mutex_t*)* @pthread_mutex_lock
-@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = alias weak i32 (%union.pthread_mutex_t*)* @pthread_mutex_trylock
-@_ZL31__gthrw_pthread_mutex_timedlockP15pthread_mutex_tPK8timespec = alias weak i32 (%union.pthread_mutex_t*, %struct.timespec*)* @pthread_mutex_timedlock
-@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = alias weak i32 (%union.pthread_mutex_t*)* @pthread_mutex_unlock
-@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = alias weak i32 (%union.pthread_mutex_t*, %union.pthread_mutexattr_t*)* @pthread_mutex_init
-@_ZL29__gthrw_pthread_mutex_destroyP15pthread_mutex_t = alias weak i32 (%union.pthread_mutex_t*)* @pthread_mutex_destroy
-@_ZL30__gthrw_pthread_cond_broadcastP14pthread_cond_t = alias weak i32 (%union.pthread_cond_t*)* @pthread_cond_broadcast
-@_ZL27__gthrw_pthread_cond_signalP14pthread_cond_t = alias weak i32 (%union.pthread_cond_t*)* @pthread_cond_signal
-@_ZL25__gthrw_pthread_cond_waitP14pthread_cond_tP15pthread_mutex_t = alias weak i32 (%union.pthread_cond_t*, %union.pthread_mutex_t*)* @pthread_cond_wait
-@_ZL30__gthrw_pthread_cond_timedwaitP14pthread_cond_tP15pthread_mutex_tPK8timespec = alias weak i32 (%union.pthread_cond_t*, %union.pthread_mutex_t*, %struct.timespec*)* @pthread_cond_timedwait
-@_ZL28__gthrw_pthread_cond_destroyP14pthread_cond_t = alias weak i32 (%union.pthread_cond_t*)* @pthread_cond_destroy
-@_ZL26__gthrw_pthread_key_createPjPFvPvE = alias weak i32 (i32*, void (i8*)*)* @pthread_key_create
-@_ZL26__gthrw_pthread_key_deletej = alias weak i32 (i32)* @pthread_key_delete
-@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = alias weak i32 (%union.pthread_mutexattr_t*)* @pthread_mutexattr_init
-@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = alias weak i32 (%union.pthread_mutexattr_t*, i32)* @pthread_mutexattr_settype
-@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = alias weak i32 (%union.pthread_mutexattr_t*)* @pthread_mutexattr_destroy
+@_ZL20__gthrw_pthread_oncePiPFvvE = weak alias i32 (i32*, void ()*)* @pthread_once
+@_ZL27__gthrw_pthread_getspecificj = weak alias i8* (i32)* @pthread_getspecific
+@_ZL27__gthrw_pthread_setspecificjPKv = weak alias i32 (i32, i8*)* @pthread_setspecific
+@_ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_ = weak alias i32 (i64*, %union.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create
+@_ZL20__gthrw_pthread_joinmPPv = weak alias i32 (i64, i8**)* @pthread_join
+@_ZL21__gthrw_pthread_equalmm = weak alias i32 (i64, i64)* @pthread_equal
+@_ZL20__gthrw_pthread_selfv = weak alias i64 ()* @pthread_self
+@_ZL22__gthrw_pthread_detachm = weak alias i32 (i64)* @pthread_detach
+@_ZL22__gthrw_pthread_cancelm = weak alias i32 (i64)* @pthread_cancel
+@_ZL19__gthrw_sched_yieldv = weak alias i32 ()* @sched_yield
+@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = weak alias i32 (%union.pthread_mutex_t*)* @pthread_mutex_lock
+@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = weak alias i32 (%union.pthread_mutex_t*)* @pthread_mutex_trylock
+@_ZL31__gthrw_pthread_mutex_timedlockP15pthread_mutex_tPK8timespec = weak alias i32 (%union.pthread_mutex_t*, %struct.timespec*)* @pthread_mutex_timedlock
+@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = weak alias i32 (%union.pthread_mutex_t*)* @pthread_mutex_unlock
+@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = weak alias i32 (%union.pthread_mutex_t*, %union.pthread_mutexattr_t*)* @pthread_mutex_init
+@_ZL29__gthrw_pthread_mutex_destroyP15pthread_mutex_t = weak alias i32 (%union.pthread_mutex_t*)* @pthread_mutex_destroy
+@_ZL30__gthrw_pthread_cond_broadcastP14pthread_cond_t = weak alias i32 (%union.pthread_cond_t*)* @pthread_cond_broadcast
+@_ZL27__gthrw_pthread_cond_signalP14pthread_cond_t = weak alias i32 (%union.pthread_cond_t*)* @pthread_cond_signal
+@_ZL25__gthrw_pthread_cond_waitP14pthread_cond_tP15pthread_mutex_t = weak alias i32 (%union.pthread_cond_t*, %union.pthread_mutex_t*)* @pthread_cond_wait
+@_ZL30__gthrw_pthread_cond_timedwaitP14pthread_cond_tP15pthread_mutex_tPK8timespec = weak alias i32 (%union.pthread_cond_t*, %union.pthread_mutex_t*, %struct.timespec*)* @pthread_cond_timedwait
+@_ZL28__gthrw_pthread_cond_destroyP14pthread_cond_t = weak alias i32 (%union.pthread_cond_t*)* @pthread_cond_destroy
+@_ZL26__gthrw_pthread_key_createPjPFvPvE = weak alias i32 (i32*, void (i8*)*)* @pthread_key_create
+@_ZL26__gthrw_pthread_key_deletej = weak alias i32 (i32)* @pthread_key_delete
+@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = weak alias i32 (%union.pthread_mutexattr_t*)* @pthread_mutexattr_init
+@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = weak alias i32 (%union.pthread_mutexattr_t*, i32)* @pthread_mutexattr_settype
+@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = weak alias i32 (%union.pthread_mutexattr_t*)* @pthread_mutexattr_destroy
 
 define void @_ZN13HexxagonBoardC2ERKS_(%struct.HexxagonBoard*, %struct.HexxagonBoard*) uwtable align 2 {
   ret void

diff --git a/test/Linker/ConstantGlobals.ll b/test/Linker/ConstantGlobals.ll
new file mode 100644
index 0000000..49f86a5
--- /dev/null
+++ b/test/Linker/ConstantGlobals.ll

@@ -0,0 +1,8 @@
+; RUN: llvm-link %s %S/Inputs/ConstantGlobals.ll -S | FileCheck %s
+; RUN: llvm-link %S/Inputs/ConstantGlobals.ll %s -S | FileCheck %s
+
+; CHECK-DAG: @X = constant [1 x i32] [i32 8]
+@X = external global [1 x i32]
+
+; CHECK-DAG: @Y = external global [1 x i32]
+@Y = external global [1 x i32]

diff --git a/test/Linker/ConstantGlobals1.ll b/test/Linker/ConstantGlobals1.ll
deleted file mode 100644
index a2bb6fb..0000000
--- a/test/Linker/ConstantGlobals1.ll
+++ /dev/null

@@ -1,10 +0,0 @@
-; Test that appending linkage works correctly when arrays are the same size.
-
-; RUN: echo "@X = constant [1 x i32] [i32 8] " | \
-; RUN:   llvm-as > %t.2.bc
-; RUN: llvm-as < %s > %t.1.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | FileCheck %s
-; CHECK: constant
-
-@X = external global [1 x i32]		; <[1 x i32]*> [#uses=0]
-

diff --git a/test/Linker/ConstantGlobals2.ll b/test/Linker/ConstantGlobals2.ll
deleted file mode 100644
index 4713779..0000000
--- a/test/Linker/ConstantGlobals2.ll
+++ /dev/null

@@ -1,10 +0,0 @@
-; Test that appending linkage works correctly when arrays are the same size.
-
-; RUN: echo "@X = external global [1 x i32] " | \
-; RUN:   llvm-as > %t.2.bc
-; RUN: llvm-as < %s > %t.1.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | FileCheck %s
-; CHECK: constant
-
-@X = constant [1 x i32] [ i32 12 ]		; <[1 x i32]*> [#uses=0]
-

diff --git a/test/Linker/ConstantGlobals3.ll b/test/Linker/ConstantGlobals3.ll
deleted file mode 100644
index 6b4ed24..0000000
--- a/test/Linker/ConstantGlobals3.ll
+++ /dev/null

@@ -1,9 +0,0 @@
-; Test that appending linkage works correctly when arrays are the same size.
-
-; RUN: echo "@X = external constant [1 x i32] " | \
-; RUN:   llvm-as > %t.2.bc
-; RUN: llvm-as < %s > %t.1.bc
-; RUN: llvm-link %t.1.bc %t.2.bc -S | FileCheck %s
-; CHECK: constant
-
-@X = external global [1 x i32]		; <[1 x i32]*> [#uses=0]

diff --git a/test/Linker/DbgDeclare.ll b/test/Linker/DbgDeclare.ll
index 4cca9d5..3d39b30 100644
--- a/test/Linker/DbgDeclare.ll
+++ b/test/Linker/DbgDeclare.ll

@@ -4,12 +4,12 @@
 
 ; rdar://13089880
 ; CHECK: define i32 @main(i32 %argc, i8** %argv)
-; CHECK: call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !{{[0-9]+}})
-; CHECK: call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !{{[0-9]+}})
+; CHECK: call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !{{[0-9]+}}, metadata {{.*}})
+; CHECK: call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !{{[0-9]+}}, metadata {{.*}})
 ; CHECK: define void @test(i32 %argc, i8** %argv)
-; CHECK: call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !{{[0-9]+}})
-; CHECK: call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !{{[0-9]+}})
-; CHECK: call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !{{[0-9]+}})
+; CHECK: call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !{{[0-9]+}}, metadata {{.*}})
+; CHECK: call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !{{[0-9]+}}, metadata {{.*}})
+; CHECK: call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !{{[0-9]+}}, metadata {{.*}})
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
@@ -21,40 +21,40 @@
   %argv.addr = alloca i8**, align 8
   store i32 0, i32* %retval
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !14), !dbg !15
+  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !16), !dbg !15
+  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !15
   %0 = load i32* %argc.addr, align 4, !dbg !17
   %1 = load i8*** %argv.addr, align 8, !dbg !17
   call void @test(i32 %0, i8** %1), !dbg !17
   ret i32 0, !dbg !19
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare void @test(i32, i8**)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.3 (trunk 173515)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 173515)\001\00\000\00\000", metadata !20, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !20, null, metadata !"main", metadata !"main", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00main\00main\00\003\000\001\000\006\00256\000\004", metadata !20, null, metadata !7, null, i32 (i32, i8**)* @main, null, null, metadata !1} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9, metadata !10}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ]
-!13 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786689, metadata !5, metadata !"argc", metadata !6, i32 16777219, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ]
+!13 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x101\00argc\0016777219\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
 !15 = metadata !{i32 3, i32 0, metadata !5, null}
-!16 = metadata !{i32 786689, metadata !5, metadata !"argv", metadata !6, i32 33554435, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x101\00argv\0033554435\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
 !17 = metadata !{i32 5, i32 0, metadata !18, null}
-!18 = metadata !{i32 786443, metadata !20, metadata !5, i32 4, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{metadata !"0xb\004\000\000", metadata !20, metadata !5} ; [ DW_TAG_lexical_block ]
 !19 = metadata !{i32 6, i32 0, metadata !18, null}
 !20 = metadata !{metadata !"main.cpp", metadata !"/private/tmp"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/DbgDeclare2.ll b/test/Linker/DbgDeclare2.ll
index 2649fcc..d27ce53 100644
--- a/test/Linker/DbgDeclare2.ll
+++ b/test/Linker/DbgDeclare2.ll

@@ -11,10 +11,10 @@
   %argv.addr = alloca i8**, align 8
   %i = alloca i32, align 4
   store i32 %argc, i32* %argc.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !14), !dbg !15
+  call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !14, metadata !{metadata !"0x102"}), !dbg !15
   store i8** %argv, i8*** %argv.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !16), !dbg !15
-  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !17), !dbg !20
+  call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !16, metadata !{metadata !"0x102"}), !dbg !15
+  call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !17, metadata !{metadata !"0x102"}), !dbg !20
   store i32 0, i32* %i, align 4, !dbg !20
   br label %for.cond, !dbg !20
 
@@ -43,37 +43,37 @@
   ret void, !dbg !24
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @puts(i8*)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!27}
 
-!0 = metadata !{i32 786449, metadata !25, i32 4, metadata !"clang version 3.3 (trunk 173515)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 173515)\001\00\000\00\000", metadata !25, metadata !2, metadata !2, metadata !3, metadata !2, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !26, null, metadata !"print_args", metadata !"print_args", metadata !"test", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32, i8**)* @test, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00print_args\00print_args\00test\004\000\001\000\006\00256\000\005", metadata !26, null, metadata !7, null, void (i32, i8**)* @test, null, null, metadata !1} ; [ DW_TAG_subprogram ]
+!6 = metadata !{metadata !"0x29", metadata !26} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !10}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ]
-!13 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786689, metadata !5, metadata !"argc", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !12} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !13} ; [ DW_TAG_const_type ]
+!13 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0x101\00argc\0016777220\000", metadata !5, metadata !6, metadata !9} ; [ DW_TAG_arg_variable ]
 !15 = metadata !{i32 4, i32 0, metadata !5, null}
-!16 = metadata !{i32 786689, metadata !5, metadata !"argv", metadata !6, i32 33554436, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786688, metadata !18, metadata !"i", metadata !6, i32 6, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786443, metadata !26, metadata !19, i32 6, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 786443, metadata !26, metadata !5, i32 5, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{metadata !"0x101\00argv\0033554436\000", metadata !5, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x100\00i\006\000", metadata !18, metadata !6, metadata !9} ; [ DW_TAG_auto_variable ]
+!18 = metadata !{metadata !"0xb\006\000\001", metadata !26, metadata !19} ; [ DW_TAG_lexical_block ]
+!19 = metadata !{metadata !"0xb\005\000\000", metadata !26, metadata !5} ; [ DW_TAG_lexical_block ]
 !20 = metadata !{i32 6, i32 0, metadata !18, null}
 !21 = metadata !{i32 8, i32 0, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !26, metadata !18, i32 7, i32 0, i32 2} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\007\000\002", metadata !26, metadata !18} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{i32 9, i32 0, metadata !22, null}
 !24 = metadata !{i32 10, i32 0, metadata !19, null}
 !25 = metadata !{metadata !"main.cpp", metadata !"/private/tmp"}
 !26 = metadata !{metadata !"test.cpp", metadata !"/private/tmp"}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/Inputs/2003-01-30-LinkerRename.ll b/test/Linker/Inputs/2003-01-30-LinkerRename.ll
new file mode 100644
index 0000000..5c6b5f5
--- /dev/null
+++ b/test/Linker/Inputs/2003-01-30-LinkerRename.ll

@@ -0,0 +1,4 @@
+@bar = global i32()* @foo
+define internal i32 @foo() {
+  ret i32 7
+}

diff --git a/test/Linker/Inputs/2003-05-31-LinkerRename.ll b/test/Linker/Inputs/2003-05-31-LinkerRename.ll
new file mode 100644
index 0000000..dcd0bf5
--- /dev/null
+++ b/test/Linker/Inputs/2003-05-31-LinkerRename.ll

@@ -0,0 +1,5 @@
+@bar = global i32()* @foo
+
+define internal i32 @foo() {
+  ret i32 7
+}

diff --git a/test/Linker/Inputs/ConstantGlobals.ll b/test/Linker/Inputs/ConstantGlobals.ll
new file mode 100644
index 0000000..56c2ba5
--- /dev/null
+++ b/test/Linker/Inputs/ConstantGlobals.ll

@@ -0,0 +1,2 @@
+@X = constant [1 x i32] [i32 8]
+@Y = external constant [1 x i32]

diff --git a/test/Linker/Inputs/comdat8.ll b/test/Linker/Inputs/comdat8.ll
new file mode 100644
index 0000000..eaa9625
--- /dev/null
+++ b/test/Linker/Inputs/comdat8.ll

@@ -0,0 +1,4 @@
+$c1 = comdat largest
+
+@some_name = private unnamed_addr constant i32 42, comdat $c1
+@c1 = alias i32* @some_name

diff --git a/test/Linker/Inputs/comdat9.ll b/test/Linker/Inputs/comdat9.ll
new file mode 100644
index 0000000..679dbde
--- /dev/null
+++ b/test/Linker/Inputs/comdat9.ll

@@ -0,0 +1,5 @@
+$c = comdat any
+@a = alias void ()* @f
+define internal void @f() comdat $c {
+  ret void
+}

diff --git a/test/Linker/Inputs/constructor-comdat.ll b/test/Linker/Inputs/constructor-comdat.ll
new file mode 100644
index 0000000..b5f23da
--- /dev/null
+++ b/test/Linker/Inputs/constructor-comdat.ll

@@ -0,0 +1,7 @@
+define weak_odr void @_ZN3fooIiEC2Ev() {
+  ret void
+}
+
+define weak_odr void @_ZN3fooIiEC1Ev() {
+  ret void
+}

diff --git a/test/Linker/Inputs/ctors.ll b/test/Linker/Inputs/ctors.ll
new file mode 100644
index 0000000..f3307bc
--- /dev/null
+++ b/test/Linker/Inputs/ctors.ll

@@ -0,0 +1,6 @@
+@v = weak global i8 1
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* @v}]
+
+define weak void @f() {
+  ret void
+}

diff --git a/test/Linker/Inputs/ident.a.ll b/test/Linker/Inputs/ident.a.ll
new file mode 100644
index 0000000..ebda940
--- /dev/null
+++ b/test/Linker/Inputs/ident.a.ll

@@ -0,0 +1,3 @@
+!llvm.ident = !{!0, !1}
+!0 = metadata !{metadata !"Compiler V1"}
+!1 = metadata !{metadata !"Compiler V2"}

diff --git a/test/Linker/Inputs/ident.b.ll b/test/Linker/Inputs/ident.b.ll
new file mode 100644
index 0000000..21ee1d8
--- /dev/null
+++ b/test/Linker/Inputs/ident.b.ll

@@ -0,0 +1,2 @@
+!llvm.ident = !{!0}
+!0 = metadata !{metadata !"Compiler V3"}

diff --git a/test/Linker/Inputs/linkage2.ll b/test/Linker/Inputs/linkage2.ll
new file mode 100644
index 0000000..ce01c9d
--- /dev/null
+++ b/test/Linker/Inputs/linkage2.ll

@@ -0,0 +1,7 @@
+@test1_a = weak global i8 1
+
+@test2_a = external dllimport global i8
+
+@test3_a = common global i16 0
+
+@test4_a = common global i16 0, align 4

diff --git a/test/Linker/Inputs/module-flags-pic-1-b.ll b/test/Linker/Inputs/module-flags-pic-1-b.ll
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/test/Linker/Inputs/module-flags-pic-1-b.ll

@@ -0,0 +1 @@
+

diff --git a/test/Linker/Inputs/module-flags-pic-2-b.ll b/test/Linker/Inputs/module-flags-pic-2-b.ll
new file mode 100644
index 0000000..228e04a
--- /dev/null
+++ b/test/Linker/Inputs/module-flags-pic-2-b.ll

@@ -0,0 +1,3 @@
+!0 = metadata !{ i32 1, metadata !"PIC Level", i32 2 }
+
+!llvm.module.flags = !{!0}

diff --git a/test/Linker/Inputs/redefinition.ll b/test/Linker/Inputs/redefinition.ll
new file mode 100644
index 0000000..0f580e6
--- /dev/null
+++ b/test/Linker/Inputs/redefinition.ll

@@ -0,0 +1 @@
+define void @foo(i32 %x) { ret void }

diff --git a/test/Linker/Inputs/type-unique-inheritance-a.ll b/test/Linker/Inputs/type-unique-inheritance-a.ll
index 381210c..31df5b2 100644
--- a/test/Linker/Inputs/type-unique-inheritance-a.ll
+++ b/test/Linker/Inputs/type-unique-inheritance-a.ll

@@ -52,13 +52,13 @@
   %a.addr = alloca i32, align 4
   %t = alloca %class.A, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !20), !dbg !21
-  call void @llvm.dbg.declare(metadata !{%class.A* %t}, metadata !22), !dbg !23
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
+  call void @llvm.dbg.declare(metadata !{%class.A* %t}, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
   ret void, !dbg !24
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -66,29 +66,29 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19, !25}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"foo.cpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{i32 786434, metadata !5, null, metadata !"A", i32 3, i64 64, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00A\003\0064\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
 !6 = metadata !{metadata !7, metadata !13}
-!7 = metadata !{i32 786460, null, metadata !"_ZTS1A", null, i32 0, i64 0, i64 0, i64 0, i32 1, metadata !8} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
-!8 = metadata !{i32 786434, metadata !9, null, metadata !"Base", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
+!7 = metadata !{metadata !"0x1c\00\000\000\000\000\001", null, metadata !"_ZTS1A", metadata !8} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
+!8 = metadata !{metadata !"0x2\00Base\003\0032\0032\000\000\000", metadata !9, null, null, metadata !10, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !"./b.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
 !10 = metadata !{metadata !11}
-!11 = metadata !{i32 786445, metadata !9, metadata !"_ZTS4Base", metadata !"b", i32 4, i64 32, i64 32, i64 0, i32 1, metadata !12} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
-!12 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!13 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1A", metadata !"x", i32 4, i64 32, i64 32, i64 32, i32 1, metadata !12} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
+!11 = metadata !{metadata !"0xd\00b\004\0032\0032\000\001", metadata !9, metadata !"_ZTS4Base", metadata !12} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
+!12 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{metadata !"0xd\00x\004\0032\0032\0032\001", metadata !5, metadata !"_ZTS1A", metadata !12} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
 !14 = metadata !{metadata !15}
-!15 = metadata !{i32 786478, metadata !1, metadata !16, metadata !"f", metadata !"f", metadata !"_Z1fi", i32 5, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1fi, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [f]
-!16 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp]
-!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x2e\00f\00f\00_Z1fi\005\000\001\000\006\00256\000\005", metadata !1, metadata !16, metadata !17, null, void (i32)* @_Z1fi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [f]
+!16 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !12}
 !19 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!20 = metadata !{i32 786689, metadata !15, metadata !"a", metadata !16, i32 16777221, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 5]
+!20 = metadata !{metadata !"0x101\00a\0016777221\000", metadata !15, metadata !16, metadata !12} ; [ DW_TAG_arg_variable ] [a] [line 5]
 !21 = metadata !{i32 5, i32 0, metadata !15, null}
-!22 = metadata !{i32 786688, metadata !15, metadata !"t", metadata !16, i32 6, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 6]
+!22 = metadata !{metadata !"0x100\00t\006\000", metadata !15, metadata !16, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 6]
 !23 = metadata !{i32 6, i32 0, metadata !15, null}
 !24 = metadata !{i32 7, i32 0, metadata !15, null}
-!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/Inputs/type-unique-inheritance-b.ll b/test/Linker/Inputs/type-unique-inheritance-b.ll
index 0cd43f6..d915e45 100644
--- a/test/Linker/Inputs/type-unique-inheritance-b.ll
+++ b/test/Linker/Inputs/type-unique-inheritance-b.ll

@@ -10,13 +10,13 @@
   %a.addr = alloca i32, align 4
   %t = alloca %class.B, align 8
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !28), !dbg !29
-  call void @llvm.dbg.declare(metadata !{%class.B* %t}, metadata !30), !dbg !31
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !28, metadata !{metadata !"0x102"}), !dbg !29
+  call void @llvm.dbg.declare(metadata !{%class.B* %t}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
   ret void, !dbg !32
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: ssp uwtable
 define i32 @main() #2 {
@@ -24,7 +24,7 @@
   %retval = alloca i32, align 4
   %a = alloca %class.A, align 4
   store i32 0, i32* %retval
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !33), !dbg !34
+  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !33, metadata !{metadata !"0x102"}), !dbg !34
   call void @_Z1fi(i32 0), !dbg !35
   call void @_Z1gi(i32 1), !dbg !36
   ret i32 0, !dbg !37
@@ -40,42 +40,42 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!27, !38}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !19, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !19, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"bar.cpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4, metadata !11, metadata !15}
-!4 = metadata !{i32 786434, metadata !5, null, metadata !"B", i32 7, i64 128, i64 64, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 7, size 128, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00B\007\00128\0064\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 7, size 128, align 64, offset 0] [def] [from ]
 !5 = metadata !{metadata !"./b.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
 !6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1B", metadata !"bb", i32 8, i64 32, i64 32, i64 0, i32 1, metadata !8} ; [ DW_TAG_member ] [bb] [line 8, size 32, align 32, offset 0] [private] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1B", metadata !"a", i32 9, i64 64, i64 64, i64 64, i32 1, metadata !10} ; [ DW_TAG_member ] [a] [line 9, size 64, align 64, offset 64] [private] [from ]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!11 = metadata !{i32 786434, metadata !12, null, metadata !"A", i32 3, i64 64, i64 32, i32 0, i32 0, null, metadata !13, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
+!7 = metadata !{metadata !"0xd\00bb\008\0032\0032\000\001", metadata !5, metadata !"_ZTS1B", metadata !8} ; [ DW_TAG_member ] [bb] [line 8, size 32, align 32, offset 0] [private] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xd\00a\009\0064\0064\0064\001", metadata !5, metadata !"_ZTS1B", metadata !10} ; [ DW_TAG_member ] [a] [line 9, size 64, align 64, offset 64] [private] [from ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!11 = metadata !{metadata !"0x2\00A\003\0064\0032\000\000\000", metadata !12, null, null, metadata !13, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
 !12 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
 !13 = metadata !{metadata !14, metadata !18}
-!14 = metadata !{i32 786460, null, metadata !"_ZTS1A", null, i32 0, i64 0, i64 0, i64 0, i32 1, metadata !15} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
-!15 = metadata !{i32 786434, metadata !5, null, metadata !"Base", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !16, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
+!14 = metadata !{metadata !"0x1c\00\000\000\000\000\001", null, metadata !"_ZTS1A", metadata !15} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
+!15 = metadata !{metadata !"0x2\00Base\003\0032\0032\000\000\000", metadata !5, null, null, metadata !16, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
 !16 = metadata !{metadata !17}
-!17 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"b", i32 4, i64 32, i64 32, i64 0, i32 1, metadata !8} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
-!18 = metadata !{i32 786445, metadata !12, metadata !"_ZTS1A", metadata !"x", i32 4, i64 32, i64 32, i64 32, i32 1, metadata !8} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
+!17 = metadata !{metadata !"0xd\00b\004\0032\0032\000\001", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
+!18 = metadata !{metadata !"0xd\00x\004\0032\0032\0032\001", metadata !12, metadata !"_ZTS1A", metadata !8} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
 !19 = metadata !{metadata !20, metadata !24}
-!20 = metadata !{i32 786478, metadata !1, metadata !21, metadata !"g", metadata !"g", metadata !"_Z1gi", i32 4, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1gi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
-!21 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp]
-!22 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = metadata !{metadata !"0x2e\00g\00g\00_Z1gi\004\000\001\000\006\00256\000\004", metadata !1, metadata !21, metadata !22, null, void (i32)* @_Z1gi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
+!21 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp]
+!22 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !23 = metadata !{null, metadata !8}
-!24 = metadata !{i32 786478, metadata !1, metadata !21, metadata !"main", metadata !"main", metadata !"", i32 9, metadata !25, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [main]
-!25 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{metadata !"0x2e\00main\00main\00\009\000\001\000\006\00256\000\009", metadata !1, metadata !21, metadata !25, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 9] [def] [main]
+!25 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !26, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !26 = metadata !{metadata !8}
 !27 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!28 = metadata !{i32 786689, metadata !20, metadata !"a", metadata !21, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!28 = metadata !{metadata !"0x101\00a\0016777220\000", metadata !20, metadata !21, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
 !29 = metadata !{i32 4, i32 0, metadata !20, null}
-!30 = metadata !{i32 786688, metadata !20, metadata !"t", metadata !21, i32 5, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 5]
+!30 = metadata !{metadata !"0x100\00t\005\000", metadata !20, metadata !21, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 5]
 !31 = metadata !{i32 5, i32 0, metadata !20, null}
 !32 = metadata !{i32 6, i32 0, metadata !20, null}
-!33 = metadata !{i32 786688, metadata !24, metadata !"a", metadata !21, i32 10, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 10]
+!33 = metadata !{metadata !"0x100\00a\0010\000", metadata !24, metadata !21, metadata !11} ; [ DW_TAG_auto_variable ] [a] [line 10]
 !34 = metadata !{i32 10, i32 0, metadata !24, null}
 !35 = metadata !{i32 11, i32 0, metadata !24, null}
 !36 = metadata !{i32 12, i32 0, metadata !24, null}
 !37 = metadata !{i32 13, i32 0, metadata !24, null}
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/Inputs/type-unique-simple2-a.ll b/test/Linker/Inputs/type-unique-simple2-a.ll
index 676b410..5ed5c2a 100644
--- a/test/Linker/Inputs/type-unique-simple2-a.ll
+++ b/test/Linker/Inputs/type-unique-simple2-a.ll

@@ -49,13 +49,13 @@
   %a.addr = alloca i32, align 4
   %t = alloca %struct.Base, align 8
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !17), !dbg !18
-  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !19), !dbg !20
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
+  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !19, metadata !{metadata !"0x102"}), !dbg !20
   ret void, !dbg !21
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -63,26 +63,26 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!16, !22}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"foo.cpp", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"Base", i32 1, i64 128, i64 64, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00Base\001\00128\0064\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
 !5 = metadata !{metadata !"./a.hpp", metadata !"."}
 !6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"b", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4Base"}
+!7 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xd\00b\003\0064\0064\0064\000", metadata !5, metadata !"_ZTS4Base", metadata !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_pointer_type ]
 !11 = metadata !{metadata !12}
-!12 = metadata !{i32 786478, metadata !1, metadata !13, metadata !"f", metadata !"f", metadata !"_Z1fi", i32 3, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1fi, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!13 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [foo.cpp]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x2e\00f\00f\00_Z1fi\003\000\001\000\006\00256\000\003", metadata !1, metadata !13, metadata !14, null, void (i32)* @_Z1fi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!13 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [foo.cpp]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !8}
 !16 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!17 = metadata !{i32 786689, metadata !12, metadata !"a", metadata !13, i32 16777219, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!17 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !12, metadata !13, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 3]
 !18 = metadata !{i32 3, i32 0, metadata !12, null}
-!19 = metadata !{i32 786688, metadata !12, metadata !"t", metadata !13, i32 4, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 4]
+!19 = metadata !{metadata !"0x100\00t\004\000", metadata !12, metadata !13, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 4]
 !20 = metadata !{i32 4, i32 0, metadata !12, null}
 !21 = metadata !{i32 5, i32 0, metadata !12, null}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/Inputs/type-unique-simple2-b.ll b/test/Linker/Inputs/type-unique-simple2-b.ll
index 3ec79e5..241218d 100644
--- a/test/Linker/Inputs/type-unique-simple2-b.ll
+++ b/test/Linker/Inputs/type-unique-simple2-b.ll

@@ -8,13 +8,13 @@
   %a.addr = alloca i32, align 4
   %t = alloca %struct.Base, align 8
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !20), !dbg !21
-  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !22), !dbg !23
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
+  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !22, metadata !{metadata !"0x102"}), !dbg !23
   ret void, !dbg !24
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: ssp uwtable
 define i32 @main() #2 {
@@ -36,32 +36,32 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!19, !28}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [bar.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [bar.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"bar.cpp", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"Base", i32 1, i64 128, i64 64, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00Base\001\00128\0064\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
 !5 = metadata !{metadata !"./a.hpp", metadata !"."}
 !6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"b", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4Base"}
+!7 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xd\00b\003\0064\0064\0064\000", metadata !5, metadata !"_ZTS4Base", metadata !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_pointer_type ]
 !11 = metadata !{metadata !12, metadata !16}
-!12 = metadata !{i32 786478, metadata !1, metadata !13, metadata !"g", metadata !"g", metadata !"_Z1gi", i32 4, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1gi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
-!13 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [bar.cpp]
-!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !"0x2e\00g\00g\00_Z1gi\004\000\001\000\006\00256\000\004", metadata !1, metadata !13, metadata !14, null, void (i32)* @_Z1gi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
+!13 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [bar.cpp]
+!14 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !8}
-!16 = metadata !{i32 786478, metadata !1, metadata !13, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !13, metadata !17, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{metadata !8}
 !19 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!20 = metadata !{i32 786689, metadata !12, metadata !"a", metadata !13, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!20 = metadata !{metadata !"0x101\00a\0016777220\000", metadata !12, metadata !13, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
 !21 = metadata !{i32 4, i32 0, metadata !12, null}
-!22 = metadata !{i32 786688, metadata !12, metadata !"t", metadata !13, i32 5, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 5]
+!22 = metadata !{metadata !"0x100\00t\005\000", metadata !12, metadata !13, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 5]
 !23 = metadata !{i32 5, i32 0, metadata !12, null}
 !24 = metadata !{i32 6, i32 0, metadata !12, null}
-!25 = metadata !{i32 8, i32 0, metadata !16, null} ; [ DW_TAG_imported_declaration ]
+!25 = metadata !{i32 8, i32 0, metadata !16, null}
 !26 = metadata !{i32 9, i32 0, metadata !16, null}
 !27 = metadata !{i32 10, i32 0, metadata !16, null}
-!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/Inputs/unique-fwd-decl-b.ll b/test/Linker/Inputs/unique-fwd-decl-b.ll
new file mode 100644
index 0000000..240fbee
--- /dev/null
+++ b/test/Linker/Inputs/unique-fwd-decl-b.ll

@@ -0,0 +1,3 @@
+!b = !{!0}
+!0 = metadata !{metadata !1}
+!1 = metadata !{}

diff --git a/test/Linker/Inputs/visibility.ll b/test/Linker/Inputs/visibility.ll
new file mode 100644
index 0000000..2ab58fd
--- /dev/null
+++ b/test/Linker/Inputs/visibility.ll

@@ -0,0 +1,26 @@
+$c1 = comdat any
+
+; Variables
+@v1 = weak hidden global i32 0
+@v2 = weak protected global i32 0
+@v3 = weak hidden global i32 0
+@v4 = hidden global i32 1, comdat $c1
+
+; Aliases
+@a1 = weak hidden alias i32* @v1
+@a2 = weak protected alias i32* @v2
+@a3 = weak hidden alias i32* @v3
+
+; Functions
+define weak hidden void @f1() {
+entry:
+  ret void
+}
+define weak protected void @f2() {
+entry:
+  ret void
+}
+define weak hidden void @f3() {
+entry:
+  ret void
+}

diff --git a/test/Linker/comdat7.ll b/test/Linker/comdat7.ll
index c3ff3f6..d7e5e2d 100644
--- a/test/Linker/comdat7.ll
+++ b/test/Linker/comdat7.ll

@@ -1,9 +1,8 @@
-; RUN: not llvm-link %s %p/Inputs/comdat5.ll -S -o - 2>&1 | FileCheck %s
-target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
-target triple = "i686-pc-windows-msvc"
+; RUN: not llvm-link %s %s -S -o - 2>&1 | FileCheck %s
 
-$"\01??_7S@@6B@" = comdat largest
-define void @"\01??_7S@@6B@"() {
+$c1 = comdat largest
+
+define void @c1() comdat $c1 {
   ret void
 }
 ; CHECK: GlobalVariable required for data dependent selection!

diff --git a/test/Linker/comdat8.ll b/test/Linker/comdat8.ll
index 21669f6..e6da583 100644
--- a/test/Linker/comdat8.ll
+++ b/test/Linker/comdat8.ll

@@ -1,10 +1,8 @@
-; RUN: not llvm-link %s %p/Inputs/comdat5.ll -S -o - 2>&1 | FileCheck %s
-target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
-target triple = "i686-pc-windows-msvc"
+; RUN: not llvm-link %s %p/Inputs/comdat8.ll -S -o - 2>&1 | FileCheck %s
 
-$"\01??_7S@@6B@" = comdat largest
-define void @some_name() {
-  ret void
-}
-@"\01??_7S@@6B@" = alias i8* inttoptr (i32 ptrtoint (void ()* @some_name to i32) to i8*)
+$c1 = comdat largest
+
+@some_name = private unnamed_addr constant i32 42, comdat $c1
+@c1 = alias i8* inttoptr (i32 ptrtoint (i32* @some_name to i32) to i8*)
+
 ; CHECK: COMDAT key involves incomputable alias size.

diff --git a/test/Linker/comdat9.ll b/test/Linker/comdat9.ll
new file mode 100644
index 0000000..eada8c6
--- /dev/null
+++ b/test/Linker/comdat9.ll

@@ -0,0 +1,7 @@
+; RUN: llvm-link %s %p/Inputs/comdat9.ll -S -o - | FileCheck %s
+
+; CHECK: $c = comdat any
+; CHECK: @a = alias void ()* @f
+; CHECK: define internal void @f() comdat $c {
+; CHECK:   ret void
+; CHECK: }

diff --git a/test/Linker/constructor-comdat.ll b/test/Linker/constructor-comdat.ll
new file mode 100644
index 0000000..42e2d83
--- /dev/null
+++ b/test/Linker/constructor-comdat.ll

@@ -0,0 +1,13 @@
+; RUN: llvm-link %s %p/Inputs/constructor-comdat.ll -S -o - 2>&1 | FileCheck %s
+; RUN: llvm-link %p/Inputs/constructor-comdat.ll %s -S -o - 2>&1 | FileCheck %s
+
+$_ZN3fooIiEC5Ev = comdat any
+; CHECK: $_ZN3fooIiEC5Ev = comdat any
+
+@_ZN3fooIiEC1Ev = weak_odr alias void ()* @_ZN3fooIiEC2Ev
+; CHECK: @_ZN3fooIiEC1Ev = weak_odr alias void ()* @_ZN3fooIiEC2Ev
+
+; CHECK: define weak_odr void @_ZN3fooIiEC2Ev() comdat $_ZN3fooIiEC5Ev {
+define weak_odr void @_ZN3fooIiEC2Ev() comdat $_ZN3fooIiEC5Ev {
+  ret void
+}

diff --git a/test/Linker/ctors.ll b/test/Linker/ctors.ll
new file mode 100644
index 0000000..67bf456
--- /dev/null
+++ b/test/Linker/ctors.ll

@@ -0,0 +1,15 @@
+; RUN: llvm-link %s %p/Inputs/ctors.ll -S -o - | \
+; RUN:   FileCheck --check-prefix=ALL --check-prefix=CHECK1 %s
+; RUN: llvm-link %p/Inputs/ctors.ll %s -S -o - | \
+; RUN:   FileCheck --check-prefix=ALL --check-prefix=CHECK2 %s
+
+@v = weak global i8 0
+; CHECK1: @v = weak global i8 0
+; CHECK2: @v = weak global i8 1
+
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* @v }]
+; ALL: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* @v }]
+
+define weak void @f() {
+  ret void
+}

diff --git a/test/Linker/debug-info-version-a.ll b/test/Linker/debug-info-version-a.ll
index c3d9c87..64a0583 100644
--- a/test/Linker/debug-info-version-a.ll
+++ b/test/Linker/debug-info-version-a.ll

@@ -10,7 +10,7 @@
 !llvm.module.flags = !{ !0 }
 !llvm.dbg.cu = !{!1}
 
-!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
-!1 = metadata !{i32 589841, metadata !2, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !3, metadata !3, metadata !3, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!1 = metadata !{metadata !"0x11\0012\00clang\001\00\000\00\000", metadata !2, metadata !3, metadata !3, metadata !3, null, null} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{metadata !"a.c", metadata !""}
 !3 = metadata !{}

diff --git a/test/Linker/debug-info-version-b.ll b/test/Linker/debug-info-version-b.ll
index 2b4f184..515291f 100644
--- a/test/Linker/debug-info-version-b.ll
+++ b/test/Linker/debug-info-version-b.ll

@@ -5,6 +5,6 @@
 !llvm.dbg.cu = !{!1}
 
 !0 = metadata !{i32 2, metadata !"Debug Info Version", i32 42}
-!1 = metadata !{i32 589841, metadata !2, i32 12, metadata !"clang", metadata !"I AM UNEXPECTED!"} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x11\0012\00clang\000\00", metadata !"I AM UNEXPECTED!"} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{metadata !"b.c", metadata !""}
 !3 = metadata !{}

diff --git a/test/Linker/global_ctors.ll b/test/Linker/global_ctors.ll
index 541f0d4..49df81a 100644
--- a/test/Linker/global_ctors.ll
+++ b/test/Linker/global_ctors.ll

@@ -1,5 +1,6 @@
 ; RUN: llvm-as %s -o %t.new.bc
 ; RUN: llvm-link %t.new.bc %S/Inputs/old_global_ctors.3.4.bc | llvm-dis | FileCheck %s
+; RUN: llvm-link %S/Inputs/old_global_ctors.3.4.bc %t.new.bc | llvm-dis | FileCheck %s
 
 ; old_global_ctors.3.4.bc contains the following LLVM IL, assembled into
 ; bitcode by llvm-as from 3.4.  It uses a two element @llvm.global_ctors array.

diff --git a/test/Linker/ident.ll b/test/Linker/ident.ll
new file mode 100644
index 0000000..93bf8c7
--- /dev/null
+++ b/test/Linker/ident.ll

@@ -0,0 +1,9 @@
+; RUN: llvm-link %S/Inputs/ident.a.ll %S/Inputs/ident.b.ll -S | FileCheck %s
+
+; Verify that multiple input llvm.ident metadata are linked together.
+
+; CHECK-DAG: !llvm.ident = !{!0, !1, !2}
+; CHECK-DAG: "Compiler V1"
+; CHECK-DAG: "Compiler V2"
+; CHECK-DAG: "Compiler V3"
+

diff --git a/test/Linker/link-messages.ll b/test/Linker/link-messages.ll
deleted file mode 100644
index 4e7ffbc..0000000
--- a/test/Linker/link-messages.ll
+++ /dev/null

@@ -1,10 +0,0 @@
-; Test that linking two files with the same definition causes an error and
-; that error is printed out.
-; RUN: llvm-as %s -o %t.one.bc
-; RUN: llvm-as %s -o %t.two.bc
-; RUN: not llvm-link %t.one.bc %t.two.bc -o %t.bc 2>&1 | FileCheck %s
-
-; CHECK: symbol multiply defined
-define i32 @bar() {
-  ret i32 0
-}

diff --git a/test/Linker/linkage2.ll b/test/Linker/linkage2.ll
new file mode 100644
index 0000000..dbae7ca
--- /dev/null
+++ b/test/Linker/linkage2.ll

@@ -0,0 +1,14 @@
+; RUN: llvm-link %s %p/Inputs/linkage2.ll -S | FileCheck %s
+; RUN: llvm-link %p/Inputs/linkage2.ll %s -S | FileCheck %s
+
+@test1_a = common global i8 0
+; CHECK-DAG: @test1_a = common global i8 0
+
+@test2_a = global i8 0
+; CHECK-DAG: @test2_a = global i8 0
+
+@test3_a = common global i8 0
+; CHECK-DAG: @test3_a = common global i16 0
+
+@test4_a = common global i8 0, align 8
+; CHECK-DAG: @test4_a = common global i16 0, align 8

diff --git a/test/Linker/lto-attributes.ll b/test/Linker/lto-attributes.ll
new file mode 100644
index 0000000..0dc78ad
--- /dev/null
+++ b/test/Linker/lto-attributes.ll

@@ -0,0 +1,7 @@
+; RUN: llvm-link -S %s -o - | FileCheck %s
+
+; CHECK: @foo = private externally_initialized global i8* null
+@foo = private externally_initialized global i8* null
+; CHECK: @array = appending global [7 x i8] c"abcdefg", align 1
+@array = appending global [7 x i8] c"abcdefg", align 1
+

diff --git a/test/Linker/module-flags-pic-1-a.ll b/test/Linker/module-flags-pic-1-a.ll
new file mode 100644
index 0000000..bc4da95
--- /dev/null
+++ b/test/Linker/module-flags-pic-1-a.ll

@@ -0,0 +1,9 @@
+; RUN: llvm-link %s %p/Inputs/module-flags-pic-1-b.ll -S -o - | FileCheck %s
+
+; test linking modules with specified and default PIC levels
+
+!0 = metadata !{ i32 1, metadata !"PIC Level", i32 1 }
+
+!llvm.module.flags = !{!0}
+; CHECK: !llvm.module.flags = !{!0}
+; CHECK: !0 = metadata !{i32 1, metadata !"PIC Level", i32 1}

diff --git a/test/Linker/module-flags-pic-2-a.ll b/test/Linker/module-flags-pic-2-a.ll
new file mode 100644
index 0000000..3ff9c8f
--- /dev/null
+++ b/test/Linker/module-flags-pic-2-a.ll

@@ -0,0 +1,10 @@
+; RUN: not llvm-link %s %p/Inputs/module-flags-pic-2-b.ll -S -o - 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; test linking modules with two different PIC levels
+
+!0 = metadata !{ i32 1, metadata !"PIC Level", i32 1 }
+
+!llvm.module.flags = !{!0}
+
+; CHECK-ERRORS: ERROR: linking module flags 'PIC Level': IDs have conflicting values

diff --git a/test/Linker/redefinition.ll b/test/Linker/redefinition.ll
index 64a8c34..1177a70 100644
--- a/test/Linker/redefinition.ll
+++ b/test/Linker/redefinition.ll

@@ -1,9 +1,6 @@
-; Test linking two functions with different prototypes and two globals 
+; Test linking two functions with different prototypes and two globals
 ; in different modules.
-; RUN: llvm-as %s -o %t.foo1.bc
-; RUN: llvm-as %s -o %t.foo2.bc
-; RUN: echo "define void @foo(i32 %x) { ret void }" | llvm-as -o %t.foo3.bc
-; RUN: not llvm-link %t.foo1.bc %t.foo2.bc -o %t.bc 2>&1 | FileCheck %s
-; RUN: not llvm-link %t.foo1.bc %t.foo3.bc -o %t.bc 2>&1 | FileCheck %s
-; CHECK: symbol multiply defined
+; RUN: not llvm-link %s %s -o %t.bc 2>&1 | FileCheck %s
+; RUN: not llvm-link %s %S/Inputs/redefinition.ll -o %t.bc 2>&1 | FileCheck %s
+; CHECK: ERROR: Linking globals named 'foo': symbol multiply defined!
 define void @foo() { ret void }

diff --git a/test/Linker/targettriple.ll b/test/Linker/targettriple.ll
index 7183047..c544a14 100644
--- a/test/Linker/targettriple.ll
+++ b/test/Linker/targettriple.ll

@@ -5,6 +5,9 @@
 ; RUN: llvm-link %s %S/Inputs/targettriple-b.ll -S -o - 2>%t.b.err | FileCheck %s
 ; RUN: cat %t.b.err | FileCheck --check-prefix=WARN-B %s
 
+; RUN: llvm-link -suppress-warnings %s %S/Inputs/targettriple-b.ll -S -o - 2>%t.no-warn.err | FileCheck %s
+; RUN: (echo foo ;cat %t.no-warn.err) | FileCheck --check-prefix=WARN-A %s
+
 target triple = "e"
 
 ; CHECK: target triple = "e"

diff --git a/test/Linker/type-unique-odr-a.ll b/test/Linker/type-unique-odr-a.ll
index 91c8033..e17cd2b 100644
--- a/test/Linker/type-unique-odr-a.ll
+++ b/test/Linker/type-unique-odr-a.ll

@@ -41,7 +41,7 @@
 ; CHECK:   DW_AT_MIPS_linkage_name {{.*}} "_ZL3barv"
 
 ; getFoo and A may only appear once.
-; CHECK-NOT:  {{(getFoo)|("A")}}
+; CHECK-NOT:  AT_name{{.*(getFoo)|("A")}}
 
 
 ; ModuleID = 'type-unique-odr-a.cpp'
@@ -59,12 +59,12 @@
 define internal void @_ZL3barv() #0 {
 entry:
   %a = alloca %class.A, align 4
-  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !24), !dbg !25
+  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
   ret void, !dbg !26
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
@@ -73,30 +73,30 @@
 !llvm.module.flags = !{!20, !21}
 !llvm.ident = !{!22}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786434, metadata !5, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00A\001\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !"type-unique-odr-a.cpp", metadata !""}
 !6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1A", metadata !"data", i32 2, i64 32, i64 32, i64 0, i32 1, metadata !8} ; [ DW_TAG_member ] [data] [line 2, size 32, align 32, offset 0] [private] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"getFoo", metadata !"getFoo", metadata !"_ZN1A6getFooEv", i32 4, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i32 258, i1 false, null, null, i32 0, metadata !13, i32 4} ; [ DW_TAG_subprogram ] [line 4] [protected] [getFoo]
-!10 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0xd\00data\002\0032\0032\000\001", metadata !5, metadata !"_ZTS1A", metadata !8} ; [ DW_TAG_member ] [data] [line 2, size 32, align 32, offset 0] [private] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\004\000\000\000\006\00258\000\004", metadata !5, metadata !"_ZTS1A", metadata !10, null, null, null, i32 0, metadata !13} ; [ DW_TAG_subprogram ] [line 4] [protected] [getFoo]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{null, metadata !12}
-!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
 !13 = metadata !{i32 786468}
 !14 = metadata !{metadata !15, metadata !19}
-!15 = metadata !{i32 786478, metadata !5, metadata !16, metadata !"baz", metadata !"baz", metadata !"_Z3bazv", i32 11, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3bazv, null, null, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [baz]
-!16 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ] [type-unique-odr-a.cpp]
-!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x2e\00baz\00baz\00_Z3bazv\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !16, metadata !17, null, void ()* @_Z3bazv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [baz]
+!16 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [type-unique-odr-a.cpp]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null}
-!19 = metadata !{i32 786478, metadata !5, metadata !16, metadata !"bar", metadata !"bar", metadata !"_ZL3barv", i32 7, metadata !17, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZL3barv, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [local] [def] [bar]
+!19 = metadata !{metadata !"0x2e\00bar\00bar\00_ZL3barv\007\001\001\000\006\00256\000\007", metadata !5, metadata !16, metadata !17, null, void ()* @_ZL3barv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [local] [def] [bar]
 !20 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !22 = metadata !{metadata !"clang version 3.5.0 "}
 !23 = metadata !{i32 11, i32 0, metadata !15, null}
-!24 = metadata !{i32 786688, metadata !19, metadata !"a", metadata !16, i32 8, metadata !"_ZTS1A", i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 8]
-!25 = metadata !{i32 8, i32 0, metadata !19, null} ; [ DW_TAG_imported_declaration ]
+!24 = metadata !{metadata !"0x100\00a\008\000", metadata !19, metadata !16, metadata !"_ZTS1A"} ; [ DW_TAG_auto_variable ] [a] [line 8]
+!25 = metadata !{i32 8, i32 0, metadata !19, null}
 !26 = metadata !{i32 9, i32 0, metadata !19, null}

diff --git a/test/Linker/type-unique-odr-b.ll b/test/Linker/type-unique-odr-b.ll
index 3c8b7a1..e5f094e 100644
--- a/test/Linker/type-unique-odr-b.ll
+++ b/test/Linker/type-unique-odr-b.ll

@@ -26,13 +26,13 @@
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !24), !dbg !26
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !26
   %this1 = load %class.A** %this.addr
   ret void, !dbg !27
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind
 define void @_Z1fv() #0 {
@@ -54,33 +54,33 @@
 !llvm.module.flags = !{!21, !22}
 !llvm.ident = !{!23}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786434, metadata !5, null, metadata !"A", i32 2, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00A\002\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 32, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !"type-unique-odr-b.cpp", metadata !""}
 !6 = metadata !{metadata !7, metadata !9}
-!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1A", metadata !"data", i32 3, i64 32, i64 32, i64 0, i32 1, metadata !8} ; [ DW_TAG_member ] [data] [line 3, size 32, align 32, offset 0] [private] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"getFoo", metadata !"getFoo", metadata !"_ZN1A6getFooEv", i32 5, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i32 258, i1 false, null, null, i32 0, metadata !13, i32 5} ; [ DW_TAG_subprogram ] [line 5] [protected] [getFoo]
-!10 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0xd\00data\003\0032\0032\000\001", metadata !5, metadata !"_ZTS1A", metadata !8} ; [ DW_TAG_member ] [data] [line 3, size 32, align 32, offset 0] [private] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\005\000\000\000\006\00258\000\005", metadata !5, metadata !"_ZTS1A", metadata !10, null, null, null, i32 0, metadata !13} ; [ DW_TAG_subprogram ] [line 5] [protected] [getFoo]
+!10 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{null, metadata !12}
-!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
 !13 = metadata !{i32 786468}
 !14 = metadata !{metadata !15, metadata !16, metadata !20}
-!15 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"getFoo", metadata !"getFoo", metadata !"_ZN1A6getFooEv", i32 8, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1A6getFooEv, null, metadata !9, metadata !2, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [getFoo]
-!16 = metadata !{i32 786478, metadata !5, metadata !17, metadata !"f", metadata !"f", metadata !"_Z1fv", i32 11, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z1fv, null, null, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [f]
-!17 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ] [type-unique-odr-b.cpp]
-!18 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\008\000\001\000\006\00256\000\008", metadata !5, metadata !"_ZTS1A", metadata !10, null, void (%class.A*)* @_ZN1A6getFooEv, null, metadata !9, metadata !2} ; [ DW_TAG_subprogram ] [line 8] [def] [getFoo]
+!16 = metadata !{metadata !"0x2e\00f\00f\00_Z1fv\0011\000\001\000\006\00256\000\0011", metadata !5, metadata !17, metadata !18, null, void ()* @_Z1fv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [f]
+!17 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [type-unique-odr-b.cpp]
+!18 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{null}
-!20 = metadata !{i32 786478, metadata !5, metadata !17, metadata !"bar", metadata !"bar", metadata !"_ZL3barv", i32 10, metadata !18, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZL3barv, null, null, metadata !2, i32 10} ; [ DW_TAG_subprogram ] [line 10] [local] [def] [bar]
+!20 = metadata !{metadata !"0x2e\00bar\00bar\00_ZL3barv\0010\001\001\000\006\00256\000\0010", metadata !5, metadata !17, metadata !18, null, void ()* @_ZL3barv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [local] [def] [bar]
 !21 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !23 = metadata !{metadata !"clang version 3.5.0 "}
-!24 = metadata !{i32 786689, metadata !15, metadata !"this", null, i32 16777216, metadata !25, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!25 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!24 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !15, null, metadata !25} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!25 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
 !26 = metadata !{i32 0, i32 0, metadata !15, null}
-!27 = metadata !{i32 8, i32 0, metadata !15, null} ; [ DW_TAG_imported_declaration ]
+!27 = metadata !{i32 8, i32 0, metadata !15, null}
 !28 = metadata !{i32 11, i32 0, metadata !16, null}
 !29 = metadata !{i32 10, i32 0, metadata !20, null}

diff --git a/test/Linker/type-unique-simple-a.ll b/test/Linker/type-unique-simple-a.ll
index 350cd1f..c01cd5c 100644
--- a/test/Linker/type-unique-simple-a.ll
+++ b/test/Linker/type-unique-simple-a.ll

@@ -54,13 +54,13 @@
   %a.addr = alloca i32, align 4
   %t = alloca %struct.Base, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !15), !dbg !16
-  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !17), !dbg !18
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !15, metadata !{metadata !"0x102"}), !dbg !16
+  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !17, metadata !{metadata !"0x102"}), !dbg !18
   ret void, !dbg !19
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -68,24 +68,24 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!14, !20}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"foo.cpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"Base", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00Base\001\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!7 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"f", metadata !"f", metadata !"_Z1fi", i32 3, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1fi, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
-!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp]
-!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x2e\00f\00f\00_Z1fi\003\000\001\000\006\00256\000\003", metadata !1, metadata !11, metadata !12, null, void (i32)* @_Z1fi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{null, metadata !8}
 !14 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!15 = metadata !{i32 786689, metadata !10, metadata !"a", metadata !11, i32 16777219, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!15 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !10, metadata !11, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 3]
 !16 = metadata !{i32 3, i32 0, metadata !10, null}
-!17 = metadata !{i32 786688, metadata !10, metadata !"t", metadata !11, i32 4, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 4]
+!17 = metadata !{metadata !"0x100\00t\004\000", metadata !10, metadata !11, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 4]
 !18 = metadata !{i32 4, i32 0, metadata !10, null}
 !19 = metadata !{i32 5, i32 0, metadata !10, null}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/type-unique-simple-b.ll b/test/Linker/type-unique-simple-b.ll
index 854ec15..fabdb03 100644
--- a/test/Linker/type-unique-simple-b.ll
+++ b/test/Linker/type-unique-simple-b.ll

@@ -10,13 +10,13 @@
   %a.addr = alloca i32, align 4
   %t = alloca %struct.Base, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !18), !dbg !19
-  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !20), !dbg !21
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !18, metadata !{metadata !"0x102"}), !dbg !19
+  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !20, metadata !{metadata !"0x102"}), !dbg !21
   ret void, !dbg !22
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: ssp uwtable
 define i32 @main() #2 {
@@ -38,30 +38,30 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!17, !26}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)\000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"bar.cpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"Base", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00Base\001\0032\0032\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!7 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !5, metadata !"_ZTS4Base", metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10, metadata !14}
-!10 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"g", metadata !"g", metadata !"_Z1gi", i32 4, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1gi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
-!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp]
-!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x2e\00g\00g\00_Z1gi\004\000\001\000\006\00256\000\004", metadata !1, metadata !11, metadata !12, null, void (i32)* @_Z1gi, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
+!11 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp]
+!12 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{null, metadata !8}
-!14 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !11, metadata !15, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !8}
 !17 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!18 = metadata !{i32 786689, metadata !10, metadata !"a", metadata !11, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!18 = metadata !{metadata !"0x101\00a\0016777220\000", metadata !10, metadata !11, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 4]
 !19 = metadata !{i32 4, i32 0, metadata !10, null}
-!20 = metadata !{i32 786688, metadata !10, metadata !"t", metadata !11, i32 5, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 5]
+!20 = metadata !{metadata !"0x100\00t\005\000", metadata !10, metadata !11, metadata !4} ; [ DW_TAG_auto_variable ] [t] [line 5]
 !21 = metadata !{i32 5, i32 0, metadata !10, null}
 !22 = metadata !{i32 6, i32 0, metadata !10, null}
-!23 = metadata !{i32 8, i32 0, metadata !14, null} ; [ DW_TAG_imported_declaration ]
+!23 = metadata !{i32 8, i32 0, metadata !14, null}
 !24 = metadata !{i32 9, i32 0, metadata !14, null}
 !25 = metadata !{i32 10, i32 0, metadata !14, null}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Linker/type-unique-simple2-a.ll b/test/Linker/type-unique-simple2-a.ll
index d0f1155..691c5c5 100644
--- a/test/Linker/type-unique-simple2-a.ll
+++ b/test/Linker/type-unique-simple2-a.ll

@@ -19,7 +19,7 @@
 ; }
 ;
 ; CHECK: _ZN1A6setFooEv
-; CHECK: DW_AT_accessibility [DW_FORM_data1]   (0x01)
+; CHECK: DW_AT_accessibility [DW_FORM_data1]   (DW_ACCESS_public)
 ; CHECK-NOT: DW_AT_accessibility
 ; CHECK: DW_TAG
 
@@ -48,7 +48,7 @@
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !39), !dbg !41
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !39, metadata !{metadata !"0x102"}), !dbg !41
   %this1 = load %class.A** %this.addr
   call void @_ZN1AC2Ev(%class.A* %this1) #1, !dbg !42
   ret void, !dbg !42
@@ -57,14 +57,14 @@
 declare i32 @_ZN1A6getFooEv(%class.A*)
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #4
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #4
 
 ; Function Attrs: inlinehint nounwind
 define linkonce_odr void @_ZN1AC2Ev(%class.A* %this) unnamed_addr #2 align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !44), !dbg !45
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !44, metadata !{metadata !"0x102"}), !dbg !45
   %this1 = load %class.A** %this.addr
   %0 = bitcast %class.A* %this1 to i8***, !dbg !46
   store i8** getelementptr inbounds ([4 x i8*]* @_ZTV1A, i64 0, i64 2), i8*** %0, !dbg !46
@@ -80,50 +80,50 @@
 !llvm.module.flags = !{!35, !36}
 !llvm.ident = !{!37}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !26, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !26, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786434, metadata !5, null, metadata !"A", i32 2, i64 64, i64 64, i32 0, i32 0, null, metadata !6, i32 0, metadata !"_ZTS1A", null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 64, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00A\002\0064\0064\000\000\000", metadata !5, null, null, metadata !6, metadata !"_ZTS1A", null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 64, align 64, offset 0] [def] [from ]
 !5 = metadata !{metadata !"./ab.h", metadata !""}
 !6 = metadata !{metadata !7, metadata !14, metadata !19}
-!7 = metadata !{i32 786445, metadata !5, metadata !8, metadata !"_vptr$A", i32 0, i64 64, i64 0, i64 0, i32 64, metadata !9} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
-!8 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/./ab.h]
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
-!10 = metadata !{i32 786447, null, null, metadata !"__vtbl_ptr_type", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0xd\00_vptr$A\000\0064\000\000\0064", metadata !5, metadata !8, metadata !9} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!8 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/./ab.h]
+!9 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!10 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"setFoo", metadata !"setFoo", metadata !"_ZN1A6setFooEv", i32 4, metadata !15, i1 false, i1 false, i32 1, i32 0, metadata !"_ZTS1A", i32 256, i1 false, null, null, i32 0, metadata !18, i32 4} ; [ DW_TAG_subprogram ] [line 4] [setFoo]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !"0x2e\00setFoo\00setFoo\00_ZN1A6setFooEv\004\000\000\001\006\00259\000\004", metadata !5, metadata !"_ZTS1A", metadata !15, metadata !"_ZTS1A", null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 4] [setFoo]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17}
-!17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
 !18 = metadata !{i32 786468}
-!19 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"getFoo", metadata !"getFoo", metadata !"_ZN1A6getFooEv", i32 5, metadata !20, i1 false, i1 false, i32 1, i32 1, metadata !"_ZTS1A", i32 256, i1 false, null, null, i32 0, metadata !25, i32 5} ; [ DW_TAG_subprogram ] [line 5] [getFoo]
-!20 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\005\000\000\001\006\00259\000\005", metadata !5, metadata !"_ZTS1A", metadata !20, metadata !"_ZTS1A", null, null, i32 0, metadata !25} ; [ DW_TAG_subprogram ] [line 5] [getFoo]
+!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !21 = metadata !{metadata !22, metadata !17}
-!22 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !23} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo_t]
-!23 = metadata !{i32 786454, metadata !24, null, metadata !"foo_t", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_typedef ] [foo_t] [line 1, size 0, align 0, offset 0] [from int]
+!22 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !23} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo_t]
+!23 = metadata !{metadata !"0x16\00foo_t\001\000\000\000\000", metadata !24, null, metadata !13} ; [ DW_TAG_typedef ] [foo_t] [line 1, size 0, align 0, offset 0] [from int]
 !24 = metadata !{metadata !"a.cpp", metadata !""}
 !25 = metadata !{i32 786468}
 !26 = metadata !{metadata !27, metadata !31, metadata !34}
-!27 = metadata !{i32 786478, metadata !24, metadata !28, metadata !"bar", metadata !"bar", metadata !"_Z3barv", i32 2, metadata !29, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3barv, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
-!28 = metadata !{i32 786473, metadata !24}        ; [ DW_TAG_file_type ] [/a.cpp]
-!29 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = metadata !{metadata !"0x2e\00bar\00bar\00_Z3barv\002\000\001\000\006\00256\000\002", metadata !24, metadata !28, metadata !29, null, i32 ()* @_Z3barv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!28 = metadata !{metadata !"0x29", metadata !24}        ; [ DW_TAG_file_type ] [/a.cpp]
+!29 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !30, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !30 = metadata !{metadata !23}
-!31 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"_ZN1AC1Ev", i32 2, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 320, i1 false, void (%class.A*)* @_ZN1AC1Ev, null, metadata !32, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [A]
-!32 = metadata !{i32 786478, null, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"", i32 0, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !33, i32 0} ; [ DW_TAG_subprogram ] [line 0] [A]
+!31 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC1Ev\002\000\001\000\006\00320\000\002", metadata !5, metadata !"_ZTS1A", metadata !15, null, void (%class.A*)* @_ZN1AC1Ev, null, metadata !32, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [A]
+!32 = metadata !{metadata !"0x2e\00A\00A\00\000\000\000\000\006\00320\000\000", null, metadata !"_ZTS1A", metadata !15, null, null, null, i32 0, metadata !33} ; [ DW_TAG_subprogram ] [line 0] [A]
 !33 = metadata !{i32 786468}
-!34 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"A", metadata !"A", metadata !"_ZN1AC2Ev", i32 2, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 320, i1 false, void (%class.A*)* @_ZN1AC2Ev, null, metadata !32, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [A]
+!34 = metadata !{metadata !"0x2e\00A\00A\00_ZN1AC2Ev\002\000\001\000\006\00320\000\002", metadata !5, metadata !"_ZTS1A", metadata !15, null, void (%class.A*)* @_ZN1AC2Ev, null, metadata !32, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [A]
 !35 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!36 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!36 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !37 = metadata !{metadata !"clang version 3.5 "}
 !38 = metadata !{i32 3, i32 0, metadata !27, null}
-!39 = metadata !{i32 786689, metadata !31, metadata !"this", null, i32 16777216, metadata !40, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!40 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!39 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !31, null, metadata !40} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!40 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
 !41 = metadata !{i32 0, i32 0, metadata !31, null}
 !42 = metadata !{i32 2, i32 0, metadata !43, null}
-!43 = metadata !{i32 786443, metadata !5, metadata !31} ; [ DW_TAG_lexical_block ] [/./ab.h]
-!44 = metadata !{i32 786689, metadata !34, metadata !"this", null, i32 16777216, metadata !40, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!43 = metadata !{metadata !"0xb\000", metadata !5, metadata !31} ; [ DW_TAG_lexical_block ] [/./ab.h]
+!44 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !34, null, metadata !40} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !45 = metadata !{i32 0, i32 0, metadata !34, null}
 !46 = metadata !{i32 2, i32 0, metadata !34, null}

diff --git a/test/Linker/type-unique-simple2-b.ll b/test/Linker/type-unique-simple2-b.ll
index 9155f69..f851316 100644
--- a/test/Linker/type-unique-simple2-b.ll
+++ b/test/Linker/type-unique-simple2-b.ll

@@ -22,20 +22,20 @@
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !32), !dbg !34
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !32, metadata !{metadata !"0x102"}), !dbg !34
   %this1 = load %class.A** %this.addr
   ret void, !dbg !35
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind
 define i32 @_ZN1A6getFooEv(%class.A* %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca %class.A*, align 8
   store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !36), !dbg !37
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !36, metadata !{metadata !"0x102"}), !dbg !37
   %this1 = load %class.A** %this.addr
   ret i32 1, !dbg !38
 }
@@ -47,42 +47,42 @@
 !llvm.module.flags = !{!29, !30}
 !llvm.ident = !{!31}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !25, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/<unknown>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !3, metadata !25, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !""}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786434, metadata !5, null, metadata !"A", i32 2, i64 64, i64 64, i32 0, i32 0, null, metadata !6, i32 0, metadata !"_ZTS1A", null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 64, align 64, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x2\00A\002\0064\0064\000\000\000", metadata !5, null, null, metadata !6, metadata !"_ZTS1A", null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 2, size 64, align 64, offset 0] [def] [from ]
 !5 = metadata !{metadata !"./ab.h", metadata !""}
 !6 = metadata !{metadata !7, metadata !14, metadata !19}
-!7 = metadata !{i32 786445, metadata !5, metadata !8, metadata !"_vptr$A", i32 0, i64 64, i64 0, i64 0, i32 64, metadata !9} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
-!8 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/./ab.h]
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
-!10 = metadata !{i32 786447, null, null, metadata !"__vtbl_ptr_type", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
-!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0xd\00_vptr$A\000\0064\000\000\0064", metadata !5, metadata !8, metadata !9} ; [ DW_TAG_member ] [_vptr$A] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!8 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/./ab.h]
+!9 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!10 = metadata !{metadata !"0xf\00__vtbl_ptr_type\000\0064\000\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!14 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"setFoo", metadata !"setFoo", metadata !"_ZN1A6setFooEv", i32 4, metadata !15, i1 false, i1 false, i32 1, i32 0, metadata !"_ZTS1A", i32 256, i1 false, null, null, i32 0, metadata !18, i32 4} ; [ DW_TAG_subprogram ] [line 4] [setFoo]
-!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !"0x2e\00setFoo\00setFoo\00_ZN1A6setFooEv\004\000\000\001\006\00259\000\004", metadata !5, metadata !"_ZTS1A", metadata !15, metadata !"_ZTS1A", null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] [line 4] [setFoo]
+!15 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17}
-!17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!17 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
 !18 = metadata !{i32 786468}
-!19 = metadata !{i32 786478, metadata !5, metadata !"_ZTS1A", metadata !"getFoo", metadata !"getFoo", metadata !"_ZN1A6getFooEv", i32 5, metadata !20, i1 false, i1 false, i32 1, i32 1, metadata !"_ZTS1A", i32 256, i1 false, null, null, i32 0, metadata !24, i32 5} ; [ DW_TAG_subprogram ] [line 5] [getFoo]
-!20 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\005\000\000\001\006\00259\000\005", metadata !5, metadata !"_ZTS1A", metadata !20, metadata !"_ZTS1A", null, null, i32 0, metadata !24} ; [ DW_TAG_subprogram ] [line 5] [getFoo]
+!20 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !21 = metadata !{metadata !22, metadata !17}
-!22 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !23} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo_t]
-!23 = metadata !{i32 786454, metadata !5, null, metadata !"foo_t", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_typedef ] [foo_t] [line 1, size 0, align 0, offset 0] [from int]
+!22 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, null, metadata !23} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo_t]
+!23 = metadata !{metadata !"0x16\00foo_t\001\000\000\000\000", metadata !5, null, metadata !13} ; [ DW_TAG_typedef ] [foo_t] [line 1, size 0, align 0, offset 0] [from int]
 !24 = metadata !{i32 786468}
 !25 = metadata !{metadata !26, metadata !28}
-!26 = metadata !{i32 786478, metadata !27, metadata !"_ZTS1A", metadata !"setFoo", metadata !"setFoo", metadata !"_ZN1A6setFooEv", i32 2, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1A6setFooEv, null, metadata !14, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [setFoo]
+!26 = metadata !{metadata !"0x2e\00setFoo\00setFoo\00_ZN1A6setFooEv\002\000\001\000\006\00259\000\002", metadata !27, metadata !"_ZTS1A", metadata !15, null, void (%class.A*)* @_ZN1A6setFooEv, null, metadata !14, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [setFoo]
 !27 = metadata !{metadata !"b.cpp", metadata !""}
-!28 = metadata !{i32 786478, metadata !27, metadata !"_ZTS1A", metadata !"getFoo", metadata !"getFoo", metadata !"_ZN1A6getFooEv", i32 4, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%class.A*)* @_ZN1A6getFooEv, null, metadata !19, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [getFoo]
+!28 = metadata !{metadata !"0x2e\00getFoo\00getFoo\00_ZN1A6getFooEv\004\000\001\000\006\00259\000\004", metadata !27, metadata !"_ZTS1A", metadata !20, null, i32 (%class.A*)* @_ZN1A6getFooEv, null, metadata !19, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [getFoo]
 !29 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !31 = metadata !{metadata !"clang version 3.5 "}
-!32 = metadata !{i32 786689, metadata !26, metadata !"this", null, i32 16777216, metadata !33, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
-!33 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!32 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !26, null, metadata !33} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!33 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
 !34 = metadata !{i32 0, i32 0, metadata !26, null}
 !35 = metadata !{i32 2, i32 0, metadata !26, null}
-!36 = metadata !{i32 786689, metadata !28, metadata !"this", null, i32 16777216, metadata !33, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!36 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !28, null, metadata !33} ; [ DW_TAG_arg_variable ] [this] [line 0]
 !37 = metadata !{i32 0, i32 0, metadata !28, null}
 !38 = metadata !{i32 4, i32 0, metadata !28, null}

diff --git a/test/Linker/type-unique-type-array-a.ll b/test/Linker/type-unique-type-array-a.ll
new file mode 100644
index 0000000..1b908c6
--- /dev/null
+++ b/test/Linker/type-unique-type-array-a.ll

@@ -0,0 +1,129 @@
+; REQUIRES: object-emission
+;
+; RUN: llvm-link %s %p/type-unique-type-array-b.ll -S -o - | %llc_dwarf -filetype=obj -O0 | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+;
+; rdar://problem/17628609
+;
+; cat -n a.cpp
+;     1	struct SA {
+;     2	  int a;
+;     3	};
+;     4	
+;     5	class A {
+;     6	public:
+;     7	  void testA(SA a) {
+;     8	  }
+;     9	};
+;    10	
+;    11	void topA(A *a, SA sa) {
+;    12	  a->testA(sa);
+;    13	}
+;
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_class_type
+; CHECK-NEXT:   DW_AT_name {{.*}} "A"
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_MIPS_linkage_name {{.*}} "_ZN1A5testAE2SA"
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + 0x{{.*}} => {0x[[STRUCT:.*]]})
+; CHECK: 0x[[STRUCT]]: DW_TAG_structure_type
+; CHECK-NEXT:   DW_AT_name {{.*}} "SA"
+
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_class_type
+; CHECK-NEXT:   DW_AT_name {{.*}} "B"
+; CHECK: DW_TAG_subprogram
+; CHECK:   DW_AT_MIPS_linkage_name {{.*}} "_ZN1B5testBE2SA"
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref_addr] {{.*}}[[STRUCT]]
+
+%class.A = type { i8 }
+%struct.SA = type { i32 }
+
+; Function Attrs: ssp uwtable
+define void @_Z4topAP1A2SA(%class.A* %a, i32 %sa.coerce) #0 {
+entry:
+  %sa = alloca %struct.SA, align 4
+  %a.addr = alloca %class.A*, align 8
+  %agg.tmp = alloca %struct.SA, align 4
+  %coerce.dive = getelementptr %struct.SA* %sa, i32 0, i32 0
+  store i32 %sa.coerce, i32* %coerce.dive
+  store %class.A* %a, %class.A** %a.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.A** %a.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata !{%struct.SA* %sa}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
+  %0 = load %class.A** %a.addr, align 8, !dbg !28
+  %1 = bitcast %struct.SA* %agg.tmp to i8*, !dbg !28
+  %2 = bitcast %struct.SA* %sa to i8*, !dbg !28
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 4, i32 4, i1 false), !dbg !28
+  %coerce.dive1 = getelementptr %struct.SA* %agg.tmp, i32 0, i32 0, !dbg !28
+  %3 = load i32* %coerce.dive1, !dbg !28
+  call void @_ZN1A5testAE2SA(%class.A* %0, i32 %3), !dbg !28
+  ret void, !dbg !29
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind ssp uwtable
+define linkonce_odr void @_ZN1A5testAE2SA(%class.A* %this, i32 %a.coerce) #2 align 2 {
+entry:
+  %a = alloca %struct.SA, align 4
+  %this.addr = alloca %class.A*, align 8
+  %coerce.dive = getelementptr %struct.SA* %a, i32 0, i32 0
+  store i32 %a.coerce, i32* %coerce.dive
+  store %class.A* %this, %class.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata !{%struct.SA* %a}, metadata !32, metadata !{metadata !"0x102"}), !dbg !33
+  %this1 = load %class.A** %this.addr
+  ret void, !dbg !34
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #3
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21, !22}
+!llvm.ident = !{!23}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [a.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"a.cpp", metadata !"/Users/manmanren/test-Nov/type_unique/rdar_di_array"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !10}
+!4 = metadata !{metadata !"0x2\00A\005\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 5, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !6}
+!6 = metadata !{metadata !"0x2e\00testA\00testA\00_ZN1A5testAE2SA\007\000\000\000\006\00256\000\007", metadata !1, metadata !"_ZTS1A", metadata !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [testA]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9, metadata !"_ZTS2SA"}
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1A]
+!10 = metadata !{metadata !"0x13\00SA\001\0032\0032\000\000\000", metadata !1, null, null, metadata !11, null, null, metadata !"_ZTS2SA"} ; [ DW_TAG_structure_type ] [SA] [line 1, size 32, align 32, offset 0] [def] [from ]
+!11 = metadata !{metadata !12}
+!12 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !"_ZTS2SA", metadata !13} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !15, metadata !20}
+!15 = metadata !{metadata !"0x2e\00topA\00topA\00_Z4topAP1A2SA\0011\000\001\000\006\00256\000\0011", metadata !1, metadata !16, metadata !17, null, void (%class.A*, i32)* @_Z4topAP1A2SA, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [topA]
+!16 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [a.cpp]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{null, metadata !19, metadata !"_ZTS2SA"}
+!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1A"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1A]
+!20 = metadata !{metadata !"0x2e\00testA\00testA\00_ZN1A5testAE2SA\007\000\001\000\006\00256\000\007", metadata !1, metadata !"_ZTS1A", metadata !7, null, void (%class.A*, i32)* @_ZN1A5testAE2SA, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [testA]
+!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!22 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!23 = metadata !{metadata !"clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)"}
+!24 = metadata !{metadata !"0x101\00a\0016777227\000", metadata !15, metadata !16, metadata !19} ; [ DW_TAG_arg_variable ] [a] [line 11]
+!25 = metadata !{i32 11, i32 14, metadata !15, null}
+!26 = metadata !{metadata !"0x101\00sa\0033554443\000", metadata !15, metadata !16, metadata !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [sa] [line 11]
+!27 = metadata !{i32 11, i32 20, metadata !15, null}
+!28 = metadata !{i32 12, i32 3, metadata !15, null}
+!29 = metadata !{i32 13, i32 1, metadata !15, null}
+!30 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !20, null, metadata !19} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = metadata !{i32 0, i32 0, metadata !20, null}
+!32 = metadata !{metadata !"0x101\00a\0033554439\000", metadata !20, metadata !16, metadata !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [a] [line 7]
+!33 = metadata !{i32 7, i32 17, metadata !20, null}
+!34 = metadata !{i32 8, i32 3, metadata !20, null}

diff --git a/test/Linker/type-unique-type-array-b.ll b/test/Linker/type-unique-type-array-b.ll
new file mode 100644
index 0000000..85ee5a5
--- /dev/null
+++ b/test/Linker/type-unique-type-array-b.ll

@@ -0,0 +1,108 @@
+; RUN: true
+; This file belongs to type-unique-type-array-a.ll.
+;
+; rdar://problem/17628609
+;
+; cat -n b.cpp
+;     1	struct SA {
+;     2	  int a;
+;     3	};
+;     4	
+;     5	class B {
+;     6	public:
+;     7	  void testB(SA sa) {
+;     8	  }
+;     9	};
+;    10	
+;    11	void topB(B* b, SA sa) {
+;    12	  b->testB(sa);
+;    13	}
+
+%class.B = type { i8 }
+%struct.SA = type { i32 }
+
+; Function Attrs: ssp uwtable
+define void @_Z4topBP1B2SA(%class.B* %b, i32 %sa.coerce) #0 {
+entry:
+  %sa = alloca %struct.SA, align 4
+  %b.addr = alloca %class.B*, align 8
+  %agg.tmp = alloca %struct.SA, align 4
+  %coerce.dive = getelementptr %struct.SA* %sa, i32 0, i32 0
+  store i32 %sa.coerce, i32* %coerce.dive
+  store %class.B* %b, %class.B** %b.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.B** %b.addr}, metadata !24, metadata !{metadata !"0x102"}), !dbg !25
+  call void @llvm.dbg.declare(metadata !{%struct.SA* %sa}, metadata !26, metadata !{metadata !"0x102"}), !dbg !27
+  %0 = load %class.B** %b.addr, align 8, !dbg !28
+  %1 = bitcast %struct.SA* %agg.tmp to i8*, !dbg !28
+  %2 = bitcast %struct.SA* %sa to i8*, !dbg !28
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 4, i32 4, i1 false), !dbg !28
+  %coerce.dive1 = getelementptr %struct.SA* %agg.tmp, i32 0, i32 0, !dbg !28
+  %3 = load i32* %coerce.dive1, !dbg !28
+  call void @_ZN1B5testBE2SA(%class.B* %0, i32 %3), !dbg !28
+  ret void, !dbg !29
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind ssp uwtable
+define linkonce_odr void @_ZN1B5testBE2SA(%class.B* %this, i32 %sa.coerce) #2 align 2 {
+entry:
+  %sa = alloca %struct.SA, align 4
+  %this.addr = alloca %class.B*, align 8
+  %coerce.dive = getelementptr %struct.SA* %sa, i32 0, i32 0
+  store i32 %sa.coerce, i32* %coerce.dive
+  store %class.B* %this, %class.B** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.B** %this.addr}, metadata !30, metadata !{metadata !"0x102"}), !dbg !31
+  call void @llvm.dbg.declare(metadata !{%struct.SA* %sa}, metadata !32, metadata !{metadata !"0x102"}), !dbg !33
+  %this1 = load %class.B** %this.addr
+  ret void, !dbg !34
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #3
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21, !22}
+!llvm.ident = !{!23}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [b.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"b.cpp", metadata !"/Users/manmanren/test-Nov/type_unique/rdar_di_array"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !10}
+!4 = metadata !{metadata !"0x2\00B\005\008\008\000\000\000", metadata !1, null, null, metadata !5, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 5, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !6}
+!6 = metadata !{metadata !"0x2e\00testB\00testB\00_ZN1B5testBE2SA\007\000\000\000\006\00256\000\007", metadata !1, metadata !"_ZTS1B", metadata !7, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [testB]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9, metadata !"_ZTS2SA"}
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1B]
+!10 = metadata !{metadata !"0x13\00SA\001\0032\0032\000\000\000", metadata !1, null, null, metadata !11, null, null, metadata !"_ZTS2SA"} ; [ DW_TAG_structure_type ] [SA] [line 1, size 32, align 32, offset 0] [def] [from ]
+!11 = metadata !{metadata !12}
+!12 = metadata !{metadata !"0xd\00a\002\0032\0032\000\000", metadata !1, metadata !"_ZTS2SA", metadata !13} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !15, metadata !20}
+!15 = metadata !{metadata !"0x2e\00topB\00topB\00_Z4topBP1B2SA\0011\000\001\000\006\00256\000\0011", metadata !1, metadata !16, metadata !17, null, void (%class.B*, i32)* @_Z4topBP1B2SA, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 11] [def] [topB]
+!16 = metadata !{metadata !"0x29", metadata !1}         ; [ DW_TAG_file_type ] [b.cpp]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{null, metadata !19, metadata !"_ZTS2SA"}
+!19 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !"_ZTS1B"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1B]
+!20 = metadata !{metadata !"0x2e\00testB\00testB\00_ZN1B5testBE2SA\007\000\001\000\006\00256\000\007", metadata !1, metadata !"_ZTS1B", metadata !7, null, void (%class.B*, i32)* @_ZN1B5testBE2SA, null, metadata !6, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [testB]
+!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!22 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!23 = metadata !{metadata !"clang version 3.5.0 (trunk 214102:214113M) (llvm/trunk 214102:214115M)"}
+!24 = metadata !{metadata !"0x101\00b\0016777227\000", metadata !15, metadata !16, metadata !19} ; [ DW_TAG_arg_variable ] [b] [line 11]
+!25 = metadata !{i32 11, i32 14, metadata !15, null}
+!26 = metadata !{metadata !"0x101\00sa\0033554443\000", metadata !15, metadata !16, metadata !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [sa] [line 11]
+!27 = metadata !{i32 11, i32 20, metadata !15, null}
+!28 = metadata !{i32 12, i32 3, metadata !15, null}
+!29 = metadata !{i32 13, i32 1, metadata !15, null}
+!30 = metadata !{metadata !"0x101\00this\0016777216\001088", metadata !20, null, metadata !19} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!31 = metadata !{i32 0, i32 0, metadata !20, null}
+!32 = metadata !{metadata !"0x101\00sa\0033554439\000", metadata !20, metadata !16, metadata !"_ZTS2SA"} ; [ DW_TAG_arg_variable ] [sa] [line 7]
+!33 = metadata !{i32 7, i32 17, metadata !20, null}
+!34 = metadata !{i32 8, i32 3, metadata !20, null}

diff --git a/test/Linker/unique-fwd-decl-a.ll b/test/Linker/unique-fwd-decl-a.ll
new file mode 100644
index 0000000..b9c7b2f
--- /dev/null
+++ b/test/Linker/unique-fwd-decl-a.ll

@@ -0,0 +1,9 @@
+; RUN: llvm-link %s %S/Inputs/unique-fwd-decl-b.ll -S -o - | FileCheck %s
+
+; Test that the arguments of !a and !b get uniqued.
+; CHECK: !a = !{!0}
+; CHECK: !b = !{!0}
+
+!a = !{!0}
+!0 = metadata !{metadata !1}
+!1 = metadata !{}

diff --git a/test/Linker/unnamed-addr1-b.ll b/test/Linker/unnamed-addr1-b.ll
index 39a0c8b..d0f54f2 100644
--- a/test/Linker/unnamed-addr1-b.ll
+++ b/test/Linker/unnamed-addr1-b.ll

@@ -1,7 +1,7 @@
 ; This file is for use with unnamed-addr1-a.ll
 ; RUN: true
 
-@global-c = common unnamed_addr global i32 42
+@global-c = common unnamed_addr global i32 0
 @global-d = unnamed_addr global i32 42
 @global-e = unnamed_addr global i32 42
 @global-f = unnamed_addr global i32 42
@@ -13,7 +13,7 @@
 define weak void @func-d() unnamed_addr { ret void }
 define weak void @func-e() unnamed_addr { ret void }
 
-@global-g = common global i32 42
+@global-g = common global i32 0
 @global-h = global i32 42
 @global-i = global i32 42
 @global-j = global i32 42

diff --git a/test/Linker/visibility.ll b/test/Linker/visibility.ll
new file mode 100644
index 0000000..6436197
--- /dev/null
+++ b/test/Linker/visibility.ll

@@ -0,0 +1,51 @@
+; RUN: llvm-link %s %p/Inputs/visibility.ll -S | FileCheck %s
+; RUN: llvm-link %p/Inputs/visibility.ll %s -S | FileCheck %s
+
+; The values in this file are strong, the ones in Inputs/visibility.ll are weak,
+; but we should still get the visibility from them.
+
+
+$c1 = comdat any
+
+; Variables
+; CHECK-DAG: @v1 = hidden global i32 0
+@v1 = global i32 0
+
+; CHECK-DAG: @v2 = protected  global i32 0
+@v2 = global i32 0
+
+; CHECK-DAG: @v3 = hidden global i32 0
+@v3 = protected global i32 0
+
+; CHECK-DAG: @v4 = hidden global i32 1, comdat $c1
+@v4 = global i32 1, comdat $c1
+
+; Aliases
+; CHECK: @a1 = hidden alias i32* @v1
+@a1 = alias i32* @v1
+
+; CHECK: @a2 = protected alias i32* @v2
+@a2 = alias i32* @v2
+
+; CHECK: @a3 = hidden alias i32* @v3
+@a3 = protected alias i32* @v3
+
+
+; Functions
+; CHECK: define hidden void @f1()
+define void @f1()  {
+entry:
+  ret void
+}
+
+; CHECK: define protected void @f2()
+define void @f2()  {
+entry:
+  ret void
+}
+
+; CHECK: define hidden void @f3()
+define protected void @f3()  {
+entry:
+  ret void
+}

diff --git a/test/Linker/visibility1.ll b/test/Linker/visibility1.ll
deleted file mode 100644
index 131f6d5..0000000
--- a/test/Linker/visibility1.ll
+++ /dev/null

@@ -1,46 +0,0 @@
-; RUN: llvm-link %s %p/visibility2.ll -S | FileCheck %s
-; RUN: llvm-link %p/visibility2.ll %s -S | FileCheck %s
-
-; The values in this file are strong, the ones in visibility2.ll are weak,
-; but we should still get the visibility from them.
-
-; Variables
-; CHECK: @v1 = hidden global i32 0
-@v1 = global i32 0
-
-; CHECK: @v2 = protected  global i32 0
-@v2 = global i32 0
-
-; CHECK: @v3 = hidden global i32 0
-@v3 = protected global i32 0
-
-
-; Aliases
-; CHECK: @a1 = hidden alias i32* @v1
-@a1 = alias i32* @v1
-
-; CHECK: @a2 = protected alias i32* @v2
-@a2 = alias i32* @v2
-
-; CHECK: @a3 = hidden alias i32* @v3
-@a3 = protected alias i32* @v3
-
-
-; Functions
-; CHECK: define hidden void @f1()
-define void @f1()  {
-entry:
-  ret void
-}
-
-; CHECK: define protected void @f2()
-define void @f2()  {
-entry:
-  ret void
-}
-
-; CHECK: define hidden void @f3()
-define protected void @f3()  {
-entry:
-  ret void
-}

diff --git a/test/Linker/visibility2.ll b/test/Linker/visibility2.ll
deleted file mode 100644
index e6363ca..0000000
--- a/test/Linker/visibility2.ll
+++ /dev/null

@@ -1,27 +0,0 @@
-; This file is used by visibility1.ll, so it doesn't actually do anything itself
-;
-; RUN: true
-
-; Variables
-@v1 = weak hidden global i32 0
-@v2 = weak protected global i32 0
-@v3 = weak hidden global i32 0
-
-; Aliases
-@a1 = hidden alias weak i32* @v1
-@a2 = protected alias weak i32* @v2
-@a3 = hidden alias weak i32* @v3
-
-; Functions
-define weak hidden void @f1() {
-entry:
-  ret void
-}
-define weak protected void @f2() {
-entry:
-  ret void
-}
-define weak hidden void @f3() {
-entry:
-  ret void
-}

diff --git a/test/MC/AArch64/arm64-be-datalayout.s b/test/MC/AArch64/arm64-be-datalayout.s
index f448a4b..a5b48f1 100644
--- a/test/MC/AArch64/arm64-be-datalayout.s
+++ b/test/MC/AArch64/arm64-be-datalayout.s

@@ -1,4 +1,4 @@
-// RUN: llvm-mc -filetype=obj -triple arm64_be %s | llvm-readobj -section-data -sections | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple aarch64_be %s | llvm-readobj -section-data -sections | FileCheck %s
 
 // CHECK: 0000: 00123456 789ABCDE
 foo:    .xword 0x123456789abcde

diff --git a/test/MC/AArch64/arm64-diags.s b/test/MC/AArch64/arm64-diags.s
index cf00e98..f8138bd 100644
--- a/test/MC/AArch64/arm64-diags.s
+++ b/test/MC/AArch64/arm64-diags.s

@@ -159,6 +159,15 @@
   ldp w1, w2, [x2], #16
   ldp w2, w2, [x2], #16
   ldp x1, x1, [x2]
+  ldp s1, s1, [x1], #8
+  ldp s1, s1, [x1, #8]!
+  ldp s1, s1, [x1, #8]
+  ldp d1, d1, [x1], #16
+  ldp d1, d1, [x1, #16]!
+  ldp d1, d1, [x1, #16]
+  ldp q1, q1, [x1], #32
+  ldp q1, q1, [x1, #32]!
+  ldp q1, q1, [x1, #32]
 
   ldr x2, [x2], #8
   ldr x2, [x2, #8]!
@@ -185,6 +194,33 @@
 ; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
 ; CHECK-ERRORS:   ldp x1, x1, [x2]
 ; CHECK-ERRORS:           ^
+; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
+; CHECK-ERRORS:   ldp s1, s1, [x1], #8
+; CHECK-ERRORS:           ^
+; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
+; CHECK-ERRORS:   ldp s1, s1, [x1, #8]!
+; CHECK-ERRORS:           ^
+; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
+; CHECK-ERRORS:   ldp s1, s1, [x1, #8]
+; CHECK-ERRORS:           ^
+; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
+; CHECK-ERRORS:   ldp d1, d1, [x1], #16
+; CHECK-ERRORS:           ^
+; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
+; CHECK-ERRORS:   ldp d1, d1, [x1, #16]!
+; CHECK-ERRORS:           ^
+; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
+; CHECK-ERRORS:   ldp d1, d1, [x1, #16]
+; CHECK-ERRORS:           ^
+; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
+; CHECK-ERRORS:   ldp q1, q1, [x1], #32
+; CHECK-ERRORS:           ^
+; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
+; CHECK-ERRORS:   ldp q1, q1, [x1, #32]!
+; CHECK-ERRORS:           ^
+; CHECK-ERRORS: error: unpredictable LDP instruction, Rt2==Rt
+; CHECK-ERRORS:   ldp q1, q1, [x1, #32]
+; CHECK-ERRORS:           ^
 ; CHECK-ERRORS: error: unpredictable LDR instruction, writeback base is also a source
 ; CHECK-ERRORS:   ldr x2, [x2], #8
 ; CHECK-ERRORS:       ^

diff --git a/test/MC/AArch64/arm64-system-encoding.s b/test/MC/AArch64/arm64-system-encoding.s
index 87f8f8a..eb29117 100644
--- a/test/MC/AArch64/arm64-system-encoding.s
+++ b/test/MC/AArch64/arm64-system-encoding.s

@@ -135,6 +135,8 @@
   msr VTTBR_EL2, x3
   msr SPSel, x3
   msr S3_2_C11_C6_4, x1
+  msr  S0_0_C0_C0_0, x0
+  msr  S1_2_C3_C4_5, x2
 ; CHECK: msr ACTLR_EL1, x3              ; encoding: [0x23,0x10,0x18,0xd5]
 ; CHECK: msr ACTLR_EL2, x3              ; encoding: [0x23,0x10,0x1c,0xd5]
 ; CHECK: msr ACTLR_EL3, x3              ; encoding: [0x23,0x10,0x1e,0xd5]
@@ -213,6 +215,8 @@
 ; CHECK: msr VTTBR_EL2, x3              ; encoding: [0x03,0x21,0x1c,0xd5]
 ; CHECK: msr  SPSEL, x3                 ; encoding: [0x03,0x42,0x18,0xd5]
 ; CHECK: msr  S3_2_C11_C6_4, x1         ; encoding: [0x81,0xb6,0x1a,0xd5]
+; CHECK: msr  S0_0_C0_C0_0, x0          ; encoding: [0x00,0x00,0x00,0xd5]
+; CHECK: msr  S1_2_C3_C4_5, x2          ; encoding: [0xa2,0x34,0x0a,0xd5]
 
   mrs x3, ACTLR_EL1
   mrs x3, ACTLR_EL2

diff --git a/test/MC/AArch64/basic-a64-diagnostics.s b/test/MC/AArch64/basic-a64-diagnostics.s
index 5293131..07e6d01 100644
--- a/test/MC/AArch64/basic-a64-diagnostics.s
+++ b/test/MC/AArch64/basic-a64-diagnostics.s

@@ -3679,26 +3679,26 @@
 // CHECK-ERROR-NEXT:                 ^
 
 // Now check some invalid generic names
-        mrs xzr, s2_5_c11_c13_2
         mrs x12, s3_8_c11_c13_2
-        mrs x13, s3_3_c12_c13_2
         mrs x19, s3_2_c15_c16_2
         mrs x30, s3_2_c15_c1_8
-// CHECK-ERROR-NEXT: error: expected readable system register
-// CHECK-ERROR-NEXT:         mrs xzr, s2_5_c11_c13_2
-// CHECK-ERROR-NEXT:                  ^
+        mrs x4, s4_7_c15_c15_7
+        mrs x14, s3_7_c16_c15_7
 // CHECK-ERROR-NEXT: error: expected readable system register
 // CHECK-ERROR-NEXT:         mrs x12, s3_8_c11_c13_2
 // CHECK-ERROR-NEXT:                  ^
 // CHECK-ERROR-NEXT: error: expected readable system register
-// CHECK-ERROR-NEXT:         mrs x13, s3_3_c12_c13_2
-// CHECK-ERROR-NEXT:                  ^
-// CHECK-ERROR-NEXT: error: expected readable system register
 // CHECK-ERROR-NEXT:         mrs x19, s3_2_c15_c16_2
 // CHECK-ERROR-NEXT:                  ^
 // CHECK-ERROR-NEXT: error: expected readable system register
 // CHECK-ERROR-NEXT:         mrs x30, s3_2_c15_c1_8
 // CHECK-ERROR-NEXT:                  ^
+// CHECK-ERROR-NEXT: error: expected readable system register
+// CHECK-ERROR-NEXT:         mrs x4, s4_7_c15_c15_7
+// CHECK-ERROR-NEXT:                 ^
+// CHECK-ERROR-NEXT: error: expected readable system register
+// CHECK-ERROR-NEXT:         mrs x14, s3_7_c16_c15_7
+// CHECK-ERROR-NEXT:                  ^
 
 //------------------------------------------------------------------------------
 // Test and branch (immediate)

diff --git a/test/MC/AArch64/basic-a64-instructions.s b/test/MC/AArch64/basic-a64-instructions.s
index 140ea33..dd8dfd4 100644
--- a/test/MC/AArch64/basic-a64-instructions.s
+++ b/test/MC/AArch64/basic-a64-instructions.s

@@ -370,23 +370,29 @@
         add w11, w13, w15, lsl #0
         add w9, w3, wzr, lsl #10
         add w17, w29, w20, lsl #31
+        add w17, w29, w20, lsl #(31-2)
 // CHECK: add      w11, w13, w15              // encoding: [0xab,0x01,0x0f,0x0b]
 // CHECK: add      w9, w3, wzr, lsl #10       // encoding: [0x69,0x28,0x1f,0x0b]
 // CHECK: add      w17, w29, w20, lsl #31     // encoding: [0xb1,0x7f,0x14,0x0b]
+// CHECK: add      w17, w29, w20, lsl #29     // encoding: [0xb1,0x77,0x14,0x0b]
 
         add w21, w22, w23, lsr #0
         add w24, w25, w26, lsr #18
         add w27, w28, w29, lsr #31
+        add w27, w28, w29, lsr #(31-2)
 // CHECK: add      w21, w22, w23, lsr #0      // encoding: [0xd5,0x02,0x57,0x0b]
 // CHECK: add      w24, w25, w26, lsr #18     // encoding: [0x38,0x4b,0x5a,0x0b]
 // CHECK: add      w27, w28, w29, lsr #31     // encoding: [0x9b,0x7f,0x5d,0x0b]
+// CHECK: add      w27, w28, w29, lsr #29     // encoding: [0x9b,0x77,0x5d,0x0b]
 
         add w2, w3, w4, asr #0
         add w5, w6, w7, asr #21
         add w8, w9, w10, asr #31
+        add w8, w9, w10, asr #(31-2)
 // CHECK: add      w2, w3, w4, asr #0         // encoding: [0x62,0x00,0x84,0x0b]
 // CHECK: add      w5, w6, w7, asr #21        // encoding: [0xc5,0x54,0x87,0x0b]
 // CHECK: add      w8, w9, w10, asr #31       // encoding: [0x28,0x7d,0x8a,0x0b]
+// CHECK: add      w8, w9, w10, asr #29       // encoding: [0x28,0x75,0x8a,0x0b]
 
         add x3, x5, x7
         add xzr, x3, x5
@@ -400,23 +406,29 @@
         add x11, x13, x15, lsl #0
         add x9, x3, xzr, lsl #10
         add x17, x29, x20, lsl #63
+        add x17, x29, x20, lsl #(63-5)
 // CHECK: add      x11, x13, x15              // encoding: [0xab,0x01,0x0f,0x8b]
 // CHECK: add      x9, x3, xzr, lsl #10       // encoding: [0x69,0x28,0x1f,0x8b]
 // CHECK: add      x17, x29, x20, lsl #63     // encoding: [0xb1,0xff,0x14,0x8b]
+// CHECK: add	   x17, x29, x20, lsl #58     // encoding: [0xb1,0xeb,0x14,0x8b]
 
         add x21, x22, x23, lsr #0
         add x24, x25, x26, lsr #18
         add x27, x28, x29, lsr #63
+        add x17, x29, x20, lsr #(63-5)
 // CHECK: add      x21, x22, x23, lsr #0      // encoding: [0xd5,0x02,0x57,0x8b]
 // CHECK: add      x24, x25, x26, lsr #18     // encoding: [0x38,0x4b,0x5a,0x8b]
 // CHECK: add      x27, x28, x29, lsr #63     // encoding: [0x9b,0xff,0x5d,0x8b]
+// CHECK: add	   x17, x29, x20, lsr #58     // encoding: [0xb1,0xeb,0x54,0x8b]
 
         add x2, x3, x4, asr #0
         add x5, x6, x7, asr #21
         add x8, x9, x10, asr #63
+        add x17, x29, x20, asr #(63-5)
 // CHECK: add      x2, x3, x4, asr #0         // encoding: [0x62,0x00,0x84,0x8b]
 // CHECK: add      x5, x6, x7, asr #21        // encoding: [0xc5,0x54,0x87,0x8b]
 // CHECK: add      x8, x9, x10, asr #63       // encoding: [0x28,0xfd,0x8a,0x8b]
+// CHECK: add	   x17, x29, x20, asr #58     // encoding: [0xb1,0xeb,0x94,0x8b]
 
         adds w3, w5, w7
         adds wzr, w3, w5
@@ -4786,12 +4798,16 @@
 
         mrs x12, s3_7_c15_c1_5
         mrs x13, s3_2_c11_c15_7
+        mrs x14, s1_3_c9_c2_1
         msr s3_0_c15_c0_0, x12
         msr s3_7_c11_c13_7, x5
+        msr s1_3_c9_c2_1, x4
 // CHECK: mrs     x12, {{s3_7_c15_c1_5|S3_7_C15_C1_5}}      // encoding: [0xac,0xf1,0x3f,0xd5]
-// CHECK: mrs     x13, {{s3_2_c11_c15_7|S3_2_C11_C15_7}}     // encoding: [0xed,0xbf,0x3a,0xd5]
+// CHECK: mrs     x13, {{s3_2_c11_c15_7|S3_2_C11_C15_7}}    // encoding: [0xed,0xbf,0x3a,0xd5]
+// CHECK: mrs     x14, {{s1_3_c9_c2_1|S1_3_C9_C2_1}}        // encoding: [0x2e,0x92,0x2b,0xd5]
 // CHECK: msr     {{s3_0_c15_c0_0|S3_0_C15_C0_0}}, x12      // encoding: [0x0c,0xf0,0x18,0xd5]
-// CHECK: msr     {{s3_7_c11_c13_7|S3_7_C11_C13_7}}, x5      // encoding: [0xe5,0xbd,0x1f,0xd5]
+// CHECK: msr     {{s3_7_c11_c13_7|S3_7_C11_C13_7}}, x5     // encoding: [0xe5,0xbd,0x1f,0xd5]
+// CHECK: msr     {{s1_3_c9_c2_1|S1_3_C9_C2_1}}, x4         // encoding: [0x24,0x92,0x0b,0xd5]
 
 //------------------------------------------------------------------------------
 // Unconditional branch (immediate)

diff --git a/test/MC/AArch64/elf_osabi_flags.s b/test/MC/AArch64/elf_osabi_flags.s
new file mode 100644
index 0000000..68cb385
--- /dev/null
+++ b/test/MC/AArch64/elf_osabi_flags.s

@@ -0,0 +1,5 @@
+# RUN: llvm-mc -filetype=obj -triple aarch64 %s -o -| llvm-readobj -h | FileCheck --check-prefix=AARCH64-OSABI %s
+# AARCH64-OSABI: OS/ABI: SystemV (0x0)
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-linux-gnu %s -o -| llvm-readobj -h | FileCheck --check-prefix=AARCH64-LINUX-OSABI %s
+# AARCH64-LINUX-OSABI: OS/ABI: GNU/Linux (0x3)

diff --git a/test/MC/AArch64/inst-directive-diagnostic.s b/test/MC/AArch64/inst-directive-diagnostic.s
new file mode 100644
index 0000000..8abad5e
--- /dev/null
+++ b/test/MC/AArch64/inst-directive-diagnostic.s

@@ -0,0 +1,19 @@
+// RUN: not llvm-mc %s -triple=aarch64-none-linux-gnu -filetype asm -o - 2>&1 \
+// RUN:   | FileCheck -check-prefix CHECK-ERROR %s
+
+	.align 2
+	.global diagnostics
+	.type diagnostics,%function
+diagnostics:
+.Label:
+    .inst
+// CHECK-ERROR: expected expression following directive
+
+    .inst 0x5e104020,
+// CHECK-ERROR: expected expression
+
+    .inst .Label
+// CHECK-ERROR: expected constant expression
+
+    .inst 0x5e104020 0x5e104020
+// CHECK-ERROR: unexpected token in directive

diff --git a/test/MC/AArch64/inst-directive.s b/test/MC/AArch64/inst-directive.s
new file mode 100644
index 0000000..6a4b64e
--- /dev/null
+++ b/test/MC/AArch64/inst-directive.s

@@ -0,0 +1,24 @@
+// RUN: llvm-mc %s -triple=aarch64-none-linux-gnu -filetype=asm -o - \
+// RUN:   | FileCheck %s --check-prefix=CHECK-ASM
+// RUN: llvm-mc %s -triple=aarch64-none-linux-gnu -filetype=obj -o - \
+// RUN:   | llvm-readobj -s -sd | FileCheck %s  --check-prefix=CHECK-OBJ
+
+    .section    .inst.aarch64_inst
+
+    .align  2
+    .global aarch64_inst
+    .type   aarch64_inst,%function
+aarch64_inst:
+    .inst 0x5e104020
+
+// CHECK-ASM:        .align  2
+// CHECK-ASM:        .globl  aarch64_inst
+// CHECK-ASM:        .type   aarch64_inst,@function
+// CHECK-ASM: aarch64_inst:
+// CHECK-ASM:        .inst   0x5E104020
+
+// CHECK-OBJ: Section {
+// CHECK-OBJ:   Name: .inst.aarch64_inst
+// CHECK-OBJ:   SectionData (
+// CHECK-OBJ-NEXT: 0000: 2040105E
+// CHECK-OBJ-NEXT: )

diff --git a/test/MC/AArch64/ldr-pseudo-diagnostics.s b/test/MC/AArch64/ldr-pseudo-diagnostics.s
new file mode 100644
index 0000000..e32c516
--- /dev/null
+++ b/test/MC/AArch64/ldr-pseudo-diagnostics.s

@@ -0,0 +1,14 @@
+//RUN: not llvm-mc -triple=aarch64-linux-gnu - < %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+
+// simple test
+.section a, "ax", @progbits
+f1:
+  ldr w0, =0x100000001
+// CHECK-ERROR: error: Immediate too large for register
+// CHECK-ERROR:   ldr w0, =0x100000001
+// CHECK-ERROR:           ^
+f2:
+  ldr w0, =-0x80000001
+// CHECK-ERROR: error: Immediate too large for register
+// CHECK-ERROR:  ldr w0, =-0x80000001
+// CHECK-ERROR:          ^

diff --git a/test/MC/AArch64/ldr-pseudo.s b/test/MC/AArch64/ldr-pseudo.s
index 6c82fb9..1bdb5d6 100644
--- a/test/MC/AArch64/ldr-pseudo.s
+++ b/test/MC/AArch64/ldr-pseudo.s

@@ -23,21 +23,21 @@
 .section b,"ax",@progbits
 // CHECK-LABEL: f3:
 f3:
-  ldr x0, =0x10001
-// CHECK: ldr x0, .Ltmp[[TMP0:[0-9]+]]
+  ldr w0, =0x10001
+// CHECK: ldr w0, .Ltmp[[TMP0:[0-9]+]]
 
 // loading multiple constants
 .section c,"ax",@progbits
 // CHECK-LABEL: f4:
 f4:
-  ldr x0, =0x10002
-// CHECK: ldr x0, .Ltmp[[TMP1:[0-9]+]]
+  ldr w0, =0x10002
+// CHECK: ldr w0, .Ltmp[[TMP1:[0-9]+]]
   adds x0, x0, #1
   adds x0, x0, #1
   adds x0, x0, #1
   adds x0, x0, #1
-  ldr x0, =0x10003
-// CHECK: ldr x0, .Ltmp[[TMP2:[0-9]+]]
+  ldr w0, =0x10003
+// CHECK: ldr w0, .Ltmp[[TMP2:[0-9]+]]
   adds x0, x0, #1
   adds x0, x0, #1
 
@@ -45,8 +45,8 @@
 .section d,"ax",@progbits
 // CHECK-LABEL: f5:
 f5:
-  ldr x0, =0x10004
-// CHECK: ldr x0, .Ltmp[[TMP3:[0-9]+]]
+  ldr w0, =0x10004
+// CHECK: ldr w0, .Ltmp[[TMP3:[0-9]+]]
   adds x0, x0, #1
   adds x0, x0, #1
   adds x0, x0, #1
@@ -54,8 +54,8 @@
   adds x0, x0, #1
   adds x0, x0, #1
   adds x0, x0, #1
-  ldr x0, =0x10004
-// CHECK: ldr x0, .Ltmp[[TMP4:[0-9]+]]
+  ldr w0, =0x10004
+// CHECK: ldr w0, .Ltmp[[TMP4:[0-9]+]]
   adds x0, x0, #1
   adds x0, x0, #1
   adds x0, x0, #1
@@ -67,8 +67,8 @@
 .section e,"ax",@progbits
 // CHECK-LABEL: f6:
 f6:
-  ldr x0, =0x10006
-// CHECK: ldr x0, .Ltmp[[TMP5:[0-9]+]]
+  ldr w0, =0x10006
+// CHECK: ldr w0, .Ltmp[[TMP5:[0-9]+]]
   adds x0, x0, #1
   adds x0, x0, #1
   adds x0, x0, #1
@@ -84,8 +84,8 @@
 // CHECK-LABEL: f8:
 f8:
   adds x0, x0, #1
-  ldr x0, =0x10007
-// CHECK: ldr x0, .Ltmp[[TMP6:[0-9]+]]
+  ldr w0, =0x10007
+// CHECK: ldr w0, .Ltmp[[TMP6:[0-9]+]]
   adds x0, x0, #1
   adds x0, x0, #1
 
@@ -97,22 +97,22 @@
 .section g,"ax",@progbits
 // CHECK-LABEL: f9:
 f9:
-  ldr x0, =foo
-// CHECK: ldr x0, .Ltmp[[TMP7:[0-9]+]]
+  ldr w0, =foo
+// CHECK: ldr w0, .Ltmp[[TMP7:[0-9]+]]
 
 // load a symbol from another section
 .section h,"ax",@progbits
 // CHECK-LABEL: f10:
 f10:
-  ldr x0, =f5
-// CHECK: ldr x0, .Ltmp[[TMP8:[0-9]+]]
+  ldr w0, =f5
+// CHECK: ldr w0, .Ltmp[[TMP8:[0-9]+]]
 
 // load a symbol from the same section
 .section i,"ax",@progbits
 // CHECK-LABEL: f11:
 f11:
-  ldr x0, =f12
-// CHECK: ldr x0, .Ltmp[[TMP9:[0-9]+]]
+  ldr w0, =f12
+// CHECK: ldr w0, .Ltmp[[TMP9:[0-9]+]]
   ldr w0,=0x3C000
 // CHECK: ldr     w0, .Ltmp[[TMP10:[0-9]+]]
 
@@ -127,12 +127,12 @@
 f13:
   adds x0, x0, #1
   adds x0, x0, #1
-  ldr x0, =0x101
-// CHECK: movz x0, #0x101
+  ldr w0, =0x101
+// CHECK: movz w0, #0x101
   adds x0, x0, #1
   adds x0, x0, #1
-  ldr x0, =bar
-// CHECK: ldr x0, .Ltmp[[TMP11:[0-9]+]]
+  ldr w0, =bar
+// CHECK: ldr w0, .Ltmp[[TMP11:[0-9]+]]
   adds x0, x0, #1
   adds x0, x0, #1
 //
@@ -141,27 +141,70 @@
 
 // usage in macro
 .macro useit_in_a_macro
-  ldr x0, =0x10008
-  ldr x0, =baz
+  ldr w0, =0x10008
+  ldr w0, =baz
 .endm
 .section k,"ax",@progbits
 // CHECK-LABEL: f14:
 f14:
   useit_in_a_macro
-// CHECK: ldr x0, .Ltmp[[TMP12:[0-9]+]]
-// CHECK: ldr x0, .Ltmp[[TMP13:[0-9]+]]
+// CHECK: ldr w0, .Ltmp[[TMP12:[0-9]+]]
+// CHECK: ldr w0, .Ltmp[[TMP13:[0-9]+]]
 
 // usage with expressions
 .section l, "ax", @progbits
 // CHECK-LABEL: f15:
 f15:
-  ldr x0, =0x10001+8
-// CHECK: ldr x0, .Ltmp[[TMP14:[0-9]+]]
+  ldr w0, =0x10001+8
+// CHECK: ldr w0, .Ltmp[[TMP14:[0-9]+]]
   adds x0, x0, #1
-  ldr x0, =bar+4
-// CHECK: ldr x0, .Ltmp[[TMP15:[0-9]+]]
+  ldr w0, =bar+4
+// CHECK: ldr w0, .Ltmp[[TMP15:[0-9]+]]
   adds x0, x0, #1
 
+// usage with 64-bit regs
+.section m, "ax", @progbits
+// CHECK-LABEL: f16:
+f16:
+  ldr x0, =0x0102030405060708
+// CHECK: ldr x0, .Ltmp[[TMP16:[0-9]+]]
+  add x0, x0, #1
+  ldr w0, =bar
+// CHECK: ldr w0, .Ltmp[[TMP17:[0-9]+]]
+  ldr x0, =bar+16
+// CHECK: ldr x0, .Ltmp[[TMP18:[0-9]+]]
+  add x0, x0, #1
+  ldr x0, =0x100000001
+// CHECK: ldr x0, .Ltmp[[TMP19:[0-9]+]]
+  ldr x1, =-0x80000001
+// CHECK: ldr x1, .Ltmp[[TMP20:[0-9]+]]
+  ldr x2, =0x10001
+// CHECK: ldr x2, .Ltmp[[TMP21:[0-9]+]]
+
+// check range for 32-bit regs
+.section n, "ax", @progbits
+// CHECK-LABEL: f17:
+f17:
+  ldr w0, =0xFFFFFFFF
+// CHECK: ldr w0, .Ltmp[[TMP22:[0-9]+]]
+  add w0, w0, #1
+  ldr w1, =-0x7FFFFFFF
+// CHECK: ldr w1, .Ltmp[[TMP23:[0-9]+]]
+  add w0, w0, #1
+  ldr w0, =-1
+// CHECK: ldr w0, .Ltmp[[TMP24:[0-9]+]]
+  add w0, w0, #1
+
+// make sure the same constant uses different pools for 32- and 64-bit registers
+.section o, "ax", @progbits
+// CHECK-LABEL: f18:
+f18:
+  ldr w0, =0x320064
+// CHECK: ldr w0, .Ltmp[[TMP25:[0-9]+]]
+  add w0, w0, #1
+  ldr x1, =0x320064
+// CHECK: ldr x1, .Ltmp[[TMP26:[0-9]+]]
+
 //
 // Constant Pools
 //
@@ -174,6 +217,7 @@
 // CHECK: .align 2
 // CHECK: .Ltmp[[TMP1]]
 // CHECK: .word 65538
+// CHECK: .align 2
 // CHECK: .Ltmp[[TMP2]]
 // CHECK: .word 65539
 
@@ -181,6 +225,7 @@
 // CHECK: .align 2
 // CHECK: .Ltmp[[TMP3]]
 // CHECK: .word 65540
+// CHECK: .align 2
 // CHECK: .Ltmp[[TMP4]]
 // CHECK: .word 65540
 
@@ -188,6 +233,7 @@
 // CHECK: .align 2
 // CHECK: .Ltmp[[TMP5]]
 // CHECK: .word 65542
+// CHECK: .align 2
 // CHECK: .Ltmp[[TMP6]]
 // CHECK: .word 65543
 
@@ -208,6 +254,7 @@
 // CHECK: .align 2
 // CHECK: .Ltmp[[TMP9]]
 // CHECK: .word f12
+// CHECK: .align 2
 // CHECK: .Ltmp[[TMP10]]
 // CHECK: .word 245760
 
@@ -220,6 +267,7 @@
 // CHECK: .align 2
 // CHECK: .Ltmp[[TMP12]]
 // CHECK: .word 65544
+// CHECK: .align 2
 // CHECK: .Ltmp[[TMP13]]
 // CHECK: .word baz
 
@@ -227,5 +275,45 @@
 // CHECK: .align 2
 // CHECK: .Ltmp[[TMP14]]
 // CHECK: .word 65545
+// CHECK: .align 2
 // CHECK: .Ltmp[[TMP15]]
 // CHECK: .word bar+4
+
+// CHECK: .section m,"ax",@progbits
+// CHECK: .align 3
+// CHECK: .Ltmp[[TMP16]]
+// CHECK: .xword 72623859790382856
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP17]]
+// CHECK: .word bar
+// CHECK: .align 3
+// CHECK: .Ltmp[[TMP18]]
+// CHECK: .xword bar+16
+// CHECK: .align 3
+// CHECK: .Ltmp[[TMP19]]
+// CHECK: .xword 4294967297
+// CHECK: .align 3
+// CHECK: .Ltmp[[TMP20]]
+// CHECK: .xword -2147483649
+// CHECK: .align 3
+// CHECK: .Ltmp[[TMP21]]
+// CHECK: .xword 65537
+
+// CHECK: .section n,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP22]]
+// CHECK: .word 4294967295
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP23]]
+// CHECK: .word -2147483647
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP24]]
+// CHECK: .word -1
+
+// CHECK: .section o,"ax",@progbits
+// CHECK: .align 2
+// CHECK: .Ltmp[[TMP25]]
+// CHECK: .word 3276900
+// CHECK: .align 3
+// CHECK: .Ltmp[[TMP26]]
+// CHECK: .xword 3276900

diff --git a/test/MC/AArch64/single-slash.s b/test/MC/AArch64/single-slash.s
new file mode 100644
index 0000000..c4c266c
--- /dev/null
+++ b/test/MC/AArch64/single-slash.s

@@ -0,0 +1,6 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu < %s | FileCheck %s
+
+// Test that a single slash is not mistaken for the start of a comment.
+
+//CHECK: movz    x0, #0x10
+    movz x0, #(32 / 2)

diff --git a/test/MC/ARM/arm_instructions.s b/test/MC/ARM/arm_instructions.s
index a4b6bda..a4c100e 100644
--- a/test/MC/ARM/arm_instructions.s
+++ b/test/MC/ARM/arm_instructions.s

@@ -1,6 +1,6 @@
 @ RUN: llvm-mc -mcpu=cortex-a8 -triple arm-unknown-unknown -show-encoding %s \
 @ RUN:  | FileCheck %s -check-prefix=ALL
-@ RUN: llvm-mc -mcpu=cortex-a9-mp -triple armv7-unknown-nacl -show-encoding %s \
+@ RUN: llvm-mc -mcpu=cortex-a9 -triple armv7-unknown-nacl -show-encoding %s \
 @ RUN:  | FileCheck %s -check-prefix=NACL
 @ RUN: llvm-mc -mcpu=cortex-a8 -mattr=+nacl-trap -triple armv7 -show-encoding %s \
 @ RUN:  | FileCheck %s -check-prefix=NACL

diff --git a/test/MC/ARM/coff-debugging-secrel.ll b/test/MC/ARM/coff-debugging-secrel.ll
index f37b19e..0e5c8e6 100644
--- a/test/MC/ARM/coff-debugging-secrel.ll
+++ b/test/MC/ARM/coff-debugging-secrel.ll

@@ -17,16 +17,16 @@
 !llvm.module.flags = !{!9, !10}
 
 !0 = metadata !{i32 1, i32 0, metadata !1, null}
-!1 = metadata !{i32 786478, metadata !2, metadata !3, metadata !"function", metadata !"function", metadata !"", i32 1, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @function, null, null, metadata !6, i32 1} ; [ DW_TAG_subprogram ], [line 1], [def], [function]
+!1 = metadata !{metadata !"0x2e\00function\00function\00\001\000\001\000\006\000\000\001", metadata !2, metadata !3, metadata !4, null, void ()* @function, null, null, metadata !6} ; [ DW_TAG_subprogram ], [line 1], [def], [function]
 !2 = metadata !{metadata !"/Users/compnerd/work/llvm/test/MC/ARM/reduced.c", metadata !"/Users/compnerd/work/llvm"}
-!3 = metadata !{i32 786473, metadata !2} ; [ DW_TAG_file_type] [/Users/compnerd/work/llvm/test/MC/ARM/reduced.c]
-!4 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ], [line 0, size 0, align 0, offset 0] [from ]
+!3 = metadata !{metadata !"0x29", metadata !2} ; [ DW_TAG_file_type] [/Users/compnerd/work/llvm/test/MC/ARM/reduced.c]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ], [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{null}
 !6 = metadata !{}
-!7 = metadata !{i32 786449, metadata !2, i32 12, metadata !"clang version 3.5.0", i1 false, metadata !"", i32 0, metadata !6, metadata !6, metadata !8, metadata !6, metadata !6, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/Users/compnerd/work/llvm/test/MC/ARM/reduced.c] [DW_LANG_C99]
+!7 = metadata !{metadata !"0x11\0012\00clang version 3.5.0\000\00\000\00\001", metadata !2, metadata !6, metadata !6, metadata !8, metadata !6, metadata !6} ; [ DW_TAG_compile_unit ] [/Users/compnerd/work/llvm/test/MC/ARM/reduced.c] [DW_LANG_C99]
 !8 = metadata !{metadata !1}
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 
 ; CHECK-ITANIUM: Relocations [
 ; CHECK-ITANIUM:   Section {{.*}} .debug_info {
@@ -42,8 +42,10 @@
 
 ; CHECK-MSVC: Relocations [
 ; CHECK-MSVC:   Section {{.*}} .debug$S {
-; CHECK-MSVC:     0xC IMAGE_REL_ARM_SECREL function
-; CHECK-MSVC:     0x10 IMAGE_REL_ARM_SECTION function
+; CHECK-MSVC:     0x2C IMAGE_REL_ARM_SECREL function
+; CHECK-MSVC:     0x30 IMAGE_REL_ARM_SECTION function
+; CHECK-MSVC:     0x48 IMAGE_REL_ARM_SECREL function
+; CHECK-MSVC:     0x4C IMAGE_REL_ARM_SECTION function
 ; CHECK-MSVC:   }
 ; CHECK-MSVC: ]
 

diff --git a/test/MC/ARM/coff-file.s b/test/MC/ARM/coff-file.s
index f0dd29a..d3f26f4 100644
--- a/test/MC/ARM/coff-file.s
+++ b/test/MC/ARM/coff-file.s

@@ -21,7 +21,7 @@
 // CHECK-SCN: Symbols [
 // CHECK-SCN:   Symbol {
 // CHECK-SCN:     Name: .file
-// CHECK-SCN:     Section: (65534)
+// CHECK-SCN:     Section: IMAGE_SYM_DEBUG (-2)
 // CHECK-SCN:     StorageClass: File
 // CHECK-SCN:     AuxFileRecord {
 // CHECK-SCN:       FileName: null-padded.asm
@@ -29,7 +29,7 @@
 // CHECK-SCN:   }
 // CHECK-SCN:   Symbol {
 // CHECK-SCN:     Name: .file
-// CHECK-SCN:     Section: (65534)
+// CHECK-SCN:     Section: IMAGE_SYM_DEBUG (-2)
 // CHECK-SCN:     StorageClass: File
 // CHECK-SCN:     AuxFileRecord {
 // CHECK-SCN:       FileName: eighteen-chars.asm
@@ -37,7 +37,7 @@
 // CHECK-SCN:   }
 // CHECK-SCN:   Symbol {
 // CHECK-SCN:     Name: .file
-// CHECK-SCN:     Section: (65534)
+// CHECK-SCN:     Section: IMAGE_SYM_DEBUG (-2)
 // CHECK-SCN:     StorageClass: File
 // CHECK-SCN:     AuxFileRecord {
 // CHECK-SCN:       FileName: multiple-auxiliary-entries.asm

diff --git a/test/MC/ARM/coproc-diag.s b/test/MC/ARM/coproc-diag.s
new file mode 100644
index 0000000..c96f2db
--- /dev/null
+++ b/test/MC/ARM/coproc-diag.s

@@ -0,0 +1,10 @@
+# Special test to make sure we don't error on VFP co-proc access
+@ RUN: llvm-mc -triple=armv5 < %s | FileCheck %s
+@ RUN: llvm-mc -triple=armv6 < %s | FileCheck %s
+
+        @ p10 and p11 are reserved for NEON, but accessible on v5/v6
+        ldc  p10, cr0, [r0], {0x20}
+        ldc2 p11, cr0, [r0], {0x21}
+        ldcl p11, cr0, [r0], {0x20}
+
+@ CHECK-NOT: error: invalid operand for instruction

diff --git a/test/MC/ARM/cps.s b/test/MC/ARM/cps.s
new file mode 100644
index 0000000..a848b22
--- /dev/null
+++ b/test/MC/ARM/cps.s

@@ -0,0 +1,17 @@
+@ RUN: llvm-mc -triple=thumbv6t2--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv7a--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv7r--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv8a--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: not llvm-mc -triple=thumbv7m--none-eabi -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=UNDEF
+
+  cpsie f
+  cpsie i, #3
+  cps #0
+
+@ CHECK: cpsie f                         @ encoding: [0x61,0xb6]
+@ CHECK: cpsie   i, #3                   @ encoding: [0xaf,0xf3,0x43,0x85]
+@ CHECK: cps     #0                      @ encoding: [0xaf,0xf3,0x00,0x81]
+
+@ UNDEF-DAG: cpsie f                         @ encoding: [0x61,0xb6]
+@ UNDEF-DAG: error: instruction requires:
+@ UNDEF-DAG: error: instruction 'cps' requires effect for M-class

diff --git a/test/MC/ARM/d16.s b/test/MC/ARM/d16.s
new file mode 100644
index 0000000..aa549a3
--- /dev/null
+++ b/test/MC/ARM/d16.s

@@ -0,0 +1,24 @@
+@ RUN:     llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mattr=+vfp4,-d16 2>&1 | FileCheck %s --check-prefix=D32
+@ RUN: not llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mattr=+vfp4,+d16 2>&1 | FileCheck %s --check-prefix=D16
+
+@ D32-NOT: error:
+
+@ D16: invalid operand for instruction
+@ D16-NEXT: vadd.f64 d1, d2, d16
+vadd.f64 d1, d2, d16
+
+@ D16: invalid operand for instruction
+@ D16-NEXT: vadd.f64 d1, d17, d6
+vadd.f64 d1, d17, d6
+
+@ D16: invalid operand for instruction
+@ D16-NEXT: vadd.f64 d19, d7, d6
+vadd.f64 d19, d7, d6
+
+@ D16: invalid operand for instruction
+@ D16-NEXT: vcvt.f64.f32 d22, s4
+vcvt.f64.f32 d22, s4
+
+@ D16: invalid operand for instruction
+@ D16-NEXT: vcvt.f32.f64 s26, d30
+vcvt.f32.f64 s26, d30

diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s
index 88c5fb5..6b9574b 100644
--- a/test/MC/ARM/diagnostics.s
+++ b/test/MC/ARM/diagnostics.s

@@ -491,3 +491,133 @@
 @ CHECK-ERRORS:                 ^
 @ CHECK-ERRORS: error: immediate expression for mov requires :lower16: or :upper16
 @ CHECK-ERRORS:                  ^
+
+        str r0, [r0, #4]!
+        str r0, [r0, r1]!
+        str r0, [r0], #4
+        str r0, [r0], r1
+        strh r0, [r0, #2]!
+        strh r0, [r0, r1]!
+        strh r0, [r0], #2
+        strh r0, [r0], r1
+        strb r0, [r0, #1]!
+        strb r0, [r0, r1]!
+        strb r0, [r0], #1
+        strb r0, [r0], r1
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: str r0, [r0, #4]!
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: str r0, [r0, r1]!
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: str r0, [r0], #4
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: str r0, [r0], r1
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: strh r0, [r0, #2]!
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: strh r0, [r0, r1]!
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: strh r0, [r0], #2
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: strh r0, [r0], r1
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: strb r0, [r0, #1]!
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: strb r0, [r0, r1]!
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: strb r0, [r0], #1
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: source register and base register can't be identical
+@ CHECK-ERRORS: strb r0, [r0], r1
+@ CHECK-ERRORS:          ^
+
+        ldr r0, [r0, #4]!
+        ldr r0, [r0, r1]!
+        ldr r0, [r0], #4
+        ldr r0, [r0], r1
+        ldrh r0, [r0, #2]!
+        ldrh r0, [r0, r1]!
+        ldrh r0, [r0], #2
+        ldrh r0, [r0], r1
+        ldrsh r0, [r0, #2]!
+        ldrsh r0, [r0, r1]!
+        ldrsh r0, [r0], #2
+        ldrsh r0, [r0], r1
+        ldrb r0, [r0, #1]!
+        ldrb r0, [r0, r1]!
+        ldrb r0, [r0], #1
+        ldrb r0, [r0], r1
+        ldrsb r0, [r0, #1]!
+        ldrsb r0, [r0, r1]!
+        ldrsb r0, [r0], #1
+        ldrsb r0, [r0], r1
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldr r0, [r0, #4]!
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldr r0, [r0, r1]!
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldr r0, [r0], #4
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldr r0, [r0], r1
+@ CHECK-ERRORS:         ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrh r0, [r0, #2]!
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrh r0, [r0, r1]!
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrh r0, [r0], #2
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrh r0, [r0], r1
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrsh r0, [r0, #2]!
+@ CHECK-ERRORS:           ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrsh r0, [r0, r1]!
+@ CHECK-ERRORS:           ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrsh r0, [r0], #2
+@ CHECK-ERRORS:           ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrsh r0, [r0], r1
+@ CHECK-ERRORS:           ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrb r0, [r0, #1]!
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrb r0, [r0, r1]!
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrb r0, [r0], #1
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrb r0, [r0], r1
+@ CHECK-ERRORS:          ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrsb r0, [r0, #1]!
+@ CHECK-ERRORS:           ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrsb r0, [r0, r1]!
+@ CHECK-ERRORS:           ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrsb r0, [r0], #1
+@ CHECK-ERRORS:           ^
+@ CHECK-ERRORS: error: destination register and base register can't be identical
+@ CHECK-ERRORS: ldrsb r0, [r0], r1
+@ CHECK-ERRORS:           ^

diff --git a/test/MC/ARM/directive-arch_extension-crc.s b/test/MC/ARM/directive-arch_extension-crc.s
index 9e4deda..1359b1f 100644
--- a/test/MC/ARM/directive-arch_extension-crc.s
+++ b/test/MC/ARM/directive-arch_extension-crc.s

@@ -37,21 +37,21 @@
 nocrc:
 	crc32b r0, r1, r2
 @ CHECK-V7: error: instruction requires: crc armv8
-@ CHECK-V8: error: instruction requires: crc arm-mode
+@ CHECK-V8: error: instruction requires: crc
 	crc32h r0, r1, r2
 @ CHECK-V7: error: instruction requires: crc armv8
-@ CHECK-V8: error: instruction requires: crc arm-mode
+@ CHECK-V8: error: instruction requires: crc
 	crc32w r0, r1, r2
 @ CHECK-V7: error: instruction requires: crc armv8
-@ CHECK-V8: error: instruction requires: crc arm-mode
+@ CHECK-V8: error: instruction requires: crc
 
 	crc32cb r0, r1, r2
 @ CHECK-V7: error: instruction requires: crc armv8
-@ CHECK-V8: error: instruction requires: crc arm-mode
+@ CHECK-V8: error: instruction requires: crc
 	crc32ch r0, r1, r2
 @ CHECK-V7: error: instruction requires: crc armv8
-@ CHECK-V8: error: instruction requires: crc arm-mode
+@ CHECK-V8: error: instruction requires: crc
 	crc32cw r0, r1, r2
 @ CHECK-V7: error: instruction requires: crc armv8
-@ CHECK-V8: error: instruction requires: crc arm-mode
+@ CHECK-V8: error: instruction requires: crc
 

diff --git a/test/MC/ARM/directive-arch_extension-fp.s b/test/MC/ARM/directive-arch_extension-fp.s
index 0327dd7..f2b4dc2 100644
--- a/test/MC/ARM/directive-arch_extension-fp.s
+++ b/test/MC/ARM/directive-arch_extension-fp.s

@@ -1,11 +1,11 @@
 @ RUN: not llvm-mc -triple armv7-eabi -filetype asm -o /dev/null 2>&1 %s \
-@ RUN:   | FileCheck %s -check-prefix CHECK-V7
+@ RUN:   | FileCheck %s -check-prefix CHECK-V7 -check-prefix CHECK
 @ RUN: not llvm-mc -triple armv8-eabi -filetype asm -o /dev/null 2>&1 %s \
-@ RUN:   | FileCheck %s -check-prefix CHECK-V8
+@ RUN:   | FileCheck %s -check-prefix CHECK-V8 -check-prefix CHECK
 @ RUN: not llvm-mc -triple thumbv7-eabi -filetype asm -o /dev/null 2>&1 %s \
-@ RUN:   | FileCheck %s -check-prefix CHECK-V7
+@ RUN:   | FileCheck %s -check-prefix CHECK-V7 -check-prefix CHECK
 @ RUN: not llvm-mc -triple thumbv8-eabi -filetype asm -o /dev/null 2>&1 %s \
-@ RUN:   | FileCheck %s -check-prefix CHECK-V8
+@ RUN:   | FileCheck %s -check-prefix CHECK-V8 -check-prefix CHECK
 
 	.syntax unified
 
@@ -153,192 +153,131 @@
 	.type nofp,%function
 nofp:
 	vmrs r0, mvfr2
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vselgt.f32 s0, s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vselge.f32 s0, s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vseleq.f32 s0, s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vselvs.f32 s0, s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vmaxnm.f32 s0, s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vminnm.f32 s0, s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vselgt.f64 d0, d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vselge.f64 d0, d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vseleq.f64 d0, d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vselvs.f64 d0, d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vmaxnm.f64 d0, d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vminnm.f64 d0, d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vcvtb.f64.f16 d0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtb.f16.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtt.f64.f16 d0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtt.f16.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vcvta.s32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvta.u32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvta.s32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvta.u32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtn.s32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtn.u32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtn.s32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtn.u32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtp.s32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtp.u32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtp.s32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtp.u32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtm.s32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtm.u32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtm.s32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtm.u32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vrintz.f32 s0, s1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintz.f64 d0, d1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintz.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintz.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintr.f32 s0, s1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintr.f64 d0, d1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintr.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintr.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintx.f32 s0, s1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintx.f64 d0, d1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintx.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintx.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vrinta.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrinta.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrinta.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrinta.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintn.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintn.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintn.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintn.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintp.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintp.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintp.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintp.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintm.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintm.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintm.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintm.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 

diff --git a/test/MC/ARM/directive-arch_extension-idiv.s b/test/MC/ARM/directive-arch_extension-idiv.s
index c63bbfb..88614ea 100644
--- a/test/MC/ARM/directive-arch_extension-idiv.s
+++ b/test/MC/ARM/directive-arch_extension-idiv.s

@@ -43,11 +43,11 @@
 	udiv r0, r1, r2
 @ CHECK-ARMv6: error: instruction requires: divide in ARM
 @ CHECK-THUMBv6: error: instruction requires: divide in ARM arm-mode
-@ CHECK-ARMv7: error: instruction requires: divide in ARM arm-mode
+@ CHECK-ARMv7: error: instruction requires: divide in ARM
 @ CHECK-THUMBv7: error: instruction requires: divide in THUMB
 	sdiv r0, r1, r2
 @ CHECK-ARMv6: error: instruction requires: divide in ARM
 @ CHECK-THUMBv6: error: instruction requires: divide in ARM arm-mode
-@ CHECK-ARMv7: error: instruction requires: divide in ARM arm-mode
+@ CHECK-ARMv7: error: instruction requires: divide in ARM
 @ CHECK-THUMBv7: error: instruction requires: divide in THUMB
 

diff --git a/test/MC/ARM/directive-arch_extension-mode-switch.s b/test/MC/ARM/directive-arch_extension-mode-switch.s
new file mode 100644
index 0000000..7e4159f
--- /dev/null
+++ b/test/MC/ARM/directive-arch_extension-mode-switch.s

@@ -0,0 +1,17 @@
+@ RUN: not llvm-mc -triple armv8-eabi -filetype asm -o /dev/null %s 2>&1 | FileCheck %s
+
+@ Ensure that a mode switch does not revert the architectural features that were
+@ altered explicitly.
+
+	.syntax unified
+
+	.arch_extension noidiv
+
+	.arm
+	udiv r0, r0, r1
+@ CHECK: instruction requires: divide in ARM
+
+	.thumb
+	udiv r0, r0, r1
+@ CHECK: instruction requires: divide in THUMB
+

diff --git a/test/MC/ARM/directive-arch_extension-simd.s b/test/MC/ARM/directive-arch_extension-simd.s
index c9dbf21..14359c6 100644
--- a/test/MC/ARM/directive-arch_extension-simd.s
+++ b/test/MC/ARM/directive-arch_extension-simd.s

@@ -1,11 +1,11 @@
 @ RUN: not llvm-mc -triple armv7-eabi -filetype asm -o /dev/null 2>&1 %s \
-@ RUN:   | FileCheck %s -check-prefix CHECK-V7
+@ RUN:   | FileCheck %s -check-prefix CHECK-V7 -check-prefix CHECK
 @ RUN: not llvm-mc -triple armv8-eabi -filetype asm -o /dev/null 2>&1 %s \
-@ RUN:   | FileCheck %s -check-prefix CHECK-V8
+@ RUN:   | FileCheck %s -check-prefix CHECK-V8 -check-prefix CHECK
 @ RUN: not llvm-mc -triple thumbv7-eabi -filetype asm -o /dev/null 2>&1 %s \
-@ RUN:   | FileCheck %s -check-prefix CHECK-V7
+@ RUN:   | FileCheck %s -check-prefix CHECK-V7 -check-prefix CHECK
 @ RUN: not llvm-mc -triple thumbv8-eabi -filetype asm -o /dev/null 2>&1 %s \
-@ RUN:   | FileCheck %s -check-prefix CHECK-V8
+@ RUN:   | FileCheck %s -check-prefix CHECK-V8 -check-prefix CHECK
 
 	.syntax unified
 
@@ -125,151 +125,103 @@
 	.type nosimd,%function
 nosimd:
 	vmaxnm.f32 s0, s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vminnm.f32 s0, s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vmaxnm.f64 d0, d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vminnm.f64 d0, d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vcvta.s32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvta.u32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvta.s32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvta.u32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtn.s32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtn.u32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtn.s32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtn.u32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtp.s32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtp.u32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtp.s32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtp.u32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtm.s32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtm.u32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtm.s32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vcvtm.u32.f64 s0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vrintz.f32 s0, s1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintz.f64 d0, d1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintz.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintz.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintr.f32 s0, s1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintr.f64 d0, d1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintr.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintr.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintx.f32 s0, s1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintx.f64 d0, d1
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintx.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintx.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 
 	vrinta.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrinta.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrinta.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrinta.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintn.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintn.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintn.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintn.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintp.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintp.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintp.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintp.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintm.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintm.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintm.f32.f32 s0, s0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 	vrintm.f64.f64 d0, d0
-@ CHECK-V7: error: instruction requires: FPARMv8
-@ CHECK-V8: error: instruction requires: double precision VFP FPARMv8
+@ CHECK: error: instruction requires: FPARMv8
 

diff --git a/test/MC/ARM/directive-arch_extension-toggle.s b/test/MC/ARM/directive-arch_extension-toggle.s
new file mode 100644
index 0000000..c3fb901
--- /dev/null
+++ b/test/MC/ARM/directive-arch_extension-toggle.s

@@ -0,0 +1,8 @@
+@ RUN: llvm-mc -triple armv7-eabi -mattr hwdiv -filetype asm -o /dev/null %s
+
+	.syntax unified
+	.thumb
+
+	udiv r0, r1, r2
+	.arch_extension idiv
+	udiv r0, r1, r2

diff --git a/test/MC/ARM/directive-eabi_attribute-2.s b/test/MC/ARM/directive-eabi_attribute-2.s
deleted file mode 100644
index 8f00ac8..0000000
--- a/test/MC/ARM/directive-eabi_attribute-2.s
+++ /dev/null

@@ -1,98 +0,0 @@
-@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
-
-	.syntax unified
-	.thumb
-
-	.eabi_attribute Tag_CPU_raw_name, "Cortex-A9"
-@ CHECK: .eabi_attribute 4, "Cortex-A9"
-	.eabi_attribute Tag_CPU_name, "cortex-a9"
-@ CHECK: .cpu cortex-a9
-	.eabi_attribute Tag_CPU_arch, 10
-@ CHECK: .eabi_attribute 6, 10
-	.eabi_attribute Tag_CPU_arch_profile, 'A'
-@ CHECK: .eabi_attribute 7, 65
-	.eabi_attribute Tag_ARM_ISA_use, 0
-@ CHECK: .eabi_attribute 8, 0
-	.eabi_attribute Tag_THUMB_ISA_use, 2
-@ CHECK: .eabi_attribute 9, 2
-	.eabi_attribute Tag_FP_arch, 3
-@ CHECK: .eabi_attribute 10, 3
-	.eabi_attribute Tag_WMMX_arch, 0
-@ CHECK: .eabi_attribute 11, 0
-	.eabi_attribute Tag_Advanced_SIMD_arch, 1
-@ CHECK: .eabi_attribute 12, 1
-	.eabi_attribute Tag_PCS_config, 2
-@ CHECK: .eabi_attribute 13, 2
-	.eabi_attribute Tag_ABI_PCS_R9_use, 0
-@ CHECK: .eabi_attribute 14, 0
-	.eabi_attribute Tag_ABI_PCS_RW_data, 0
-@ CHECK: .eabi_attribute 15, 0
-	.eabi_attribute Tag_ABI_PCS_RO_data, 0
-@ CHECK: .eabi_attribute 16, 0
-	.eabi_attribute Tag_ABI_PCS_GOT_use, 0
-@ CHECK: .eabi_attribute 17, 0
-	.eabi_attribute Tag_ABI_PCS_wchar_t, 4
-@ CHECK: .eabi_attribute 18, 4
-	.eabi_attribute Tag_ABI_FP_rounding, 1
-@ CHECK: .eabi_attribute 19, 1
-	.eabi_attribute Tag_ABI_FP_denormal, 2
-@ CHECK: .eabi_attribute 20, 2
-	.eabi_attribute Tag_ABI_FP_exceptions, 1
-@ CHECK: .eabi_attribute 21, 1
-	.eabi_attribute Tag_ABI_FP_user_exceptions, 1
-@ CHECK: .eabi_attribute 22, 1
-	.eabi_attribute Tag_ABI_FP_number_model, 3
-@ CHECK: .eabi_attribute 23, 3
-	.eabi_attribute Tag_ABI_align_needed, 1
-@ CHECK: .eabi_attribute 24, 1
-	.eabi_attribute Tag_ABI_align_preserved, 2
-@ CHECK: .eabi_attribute 25, 2
-	.eabi_attribute Tag_ABI_enum_size, 3
-@ CHECK: .eabi_attribute 26, 3
-	.eabi_attribute Tag_ABI_HardFP_use, 0
-@ CHECK: .eabi_attribute 27, 0
-	.eabi_attribute Tag_ABI_VFP_args, 1
-@ CHECK: .eabi_attribute 28, 1
-	.eabi_attribute Tag_ABI_WMMX_args, 0
-@ CHECK: .eabi_attribute 29, 0
-	.eabi_attribute Tag_ABI_FP_optimization_goals, 1
-@ CHECK: .eabi_attribute 31, 1
-	.eabi_attribute Tag_compatibility, 1
-@ CHECK: .eabi_attribute 32, 1
-	.eabi_attribute Tag_compatibility, 1, "aeabi"
-@ CHECK: .eabi_attribute 32, 1, "aeabi"
-	.eabi_attribute Tag_CPU_unaligned_access, 0
-@ CHECK: .eabi_attribute 34, 0
-	.eabi_attribute Tag_FP_HP_extension, 0
-@ CHECK: .eabi_attribute 36, 0
-	.eabi_attribute Tag_ABI_FP_16bit_format, 0
-@ CHECK: .eabi_attribute 38, 0
-	.eabi_attribute Tag_MPextension_use, 0
-@ CHECK: .eabi_attribute 42, 0
-	.eabi_attribute Tag_DIV_use, 0
-@ CHECK: .eabi_attribute 44, 0
-	.eabi_attribute Tag_nodefaults, 0
-@ CHECK: .eabi_attribute 64, 0
-	.eabi_attribute Tag_also_compatible_with, "gnu"
-@ CHECK: .eabi_attribute 65, "gnu"
-	.eabi_attribute Tag_T2EE_use, 0
-@ CHECK: .eabi_attribute 66, 0
-	.eabi_attribute Tag_conformance, "2.09"
-@ CHECK: .eabi_attribute 67, "2.09"
-	.eabi_attribute Tag_Virtualization_use, 0
-@ CHECK: .eabi_attribute 68, 0
-
-@ ===--- Compatibility Checks ---===
-
-	.eabi_attribute Tag_ABI_align8_needed, 1
-@ CHECK: .eabi_attribute 24, 1
-	.eabi_attribute Tag_ABI_align8_preserved, 2
-@ CHECK: .eabi_attribute 25, 2
-
-@ ===--- GNU AS Compatibility Checks ---===
-
-	.eabi_attribute 2 * 2 + 1, "cortex-a9"
-@ CHECK: .cpu cortex-a9
-	.eabi_attribute 2 * 2 + 2, 5 * 2
-@ CHECK: .eabi_attribute 6, 10
-

diff --git a/test/MC/ARM/directive-eabi_attribute.s b/test/MC/ARM/directive-eabi_attribute.s
index c060b80..e2f1f9b 100644
--- a/test/MC/ARM/directive-eabi_attribute.s
+++ b/test/MC/ARM/directive-eabi_attribute.s

@@ -1,56 +1,247 @@
+@ RUN: llvm-mc -triple armv7-elf -filetype asm -o - %s | FileCheck %s
 @ RUN: llvm-mc < %s -triple armv7-unknown-linux-gnueabi -filetype=obj -o - \
-@ RUN:   | llvm-readobj -s -sd | FileCheck %s
+@ RUN:   | llvm-readobj -arm-attributes | FileCheck %s --check-prefix=CHECK-OBJ
 
-@ CHECK: Name: .ARM.attribute
-@ CHECK: SectionData (
+        .syntax unified
+        .thumb
 
-@ <format-version>
-@ CHECK: 41
+	.eabi_attribute Tag_CPU_raw_name, "Cortex-A9"
+@ CHECK: .eabi_attribute 4, "Cortex-A9"
+@ CHECK-OBJ:        Tag: 4
+@ CHECK-OBJ-NEXT:   TagName: CPU_raw_name
+@ CHECK-OBJ-NEXT:   Value: CORTEX-A9
+	.eabi_attribute Tag_CPU_name, "cortex-a9"
+@ CHECK: .cpu cortex-a9
+@ CHECK-OBJ:        Tag: 5
+@ CHECK-OBJ-NEXT:   TagName: CPU_name
+@ CHECK-OBJ-NEXT:   Value: CORTEX-A9
+	.eabi_attribute Tag_CPU_arch, 10
+@ CHECK: .eabi_attribute 6, 10
+@ CHECK-OBJ:        Tag: 6
+@ CHECK-OBJ-NEXT:   Value: 10
+@ CHECK-OBJ-NEXT:   TagName: CPU_arch
+@ CHECK-OBJ-NEXT:   Description: ARM v7
+	.eabi_attribute Tag_CPU_arch_profile, 'A'
+@ CHECK: .eabi_attribute 7, 65
+@ CHECK-OBJ:        Tag: 7
+@ CHECK-OBJ-NEXT:   Value: 65
+@ CHECK-OBJ-NEXT:   TagName: CPU_arch_profile
+@ CHECK-OBJ-NEXT:   Description: Application
+	.eabi_attribute Tag_ARM_ISA_use, 0
+@ CHECK: .eabi_attribute 8, 0
+@ CHECK-OBJ:        Tag: 8
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: ARM_ISA_use
+@ CHECK-OBJ-NEXT:   Description: Not Permitted
+	.eabi_attribute Tag_THUMB_ISA_use, 2
+@ CHECK: .eabi_attribute 9, 2
+@ CHECK-OBJ:        Tag: 9
+@ CHECK-OBJ-NEXT:   Value: 2
+@ CHECK-OBJ-NEXT:   TagName: THUMB_ISA_use
+@ CHECK-OBJ-NEXT:   Description: Thumb-2
+	.eabi_attribute Tag_FP_arch, 3
+@ CHECK: .eabi_attribute 10, 3
+@ CHECK-OBJ:        Tag: 10
+@ CHECK-OBJ-NEXT:   Value: 3
+@ CHECK-OBJ-NEXT:   TagName: FP_arch
+@ CHECK-OBJ-NEXT:   Description: VFPv3
+	.eabi_attribute Tag_WMMX_arch, 0
+@ CHECK: .eabi_attribute 11, 0
+@ CHECK-OBJ:        Tag: 11
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: WMMX_arch
+@ CHECK-OBJ-NEXT:   Description: Not Permitted
+	.eabi_attribute Tag_Advanced_SIMD_arch, 1
+@ CHECK: .eabi_attribute 12, 1
+@ CHECK-OBJ:        Tag: 12
+@ CHECK-OBJ-NEXT:   Value: 1
+@ CHECK-OBJ-NEXT:   TagName: Advanced_SIMD_arch
+@ CHECK-OBJ-NEXT:   Description: NEONv1
+	.eabi_attribute Tag_PCS_config, 2
+@ CHECK: .eabi_attribute 13, 2
+@ CHECK-OBJ:        Tag: 13
+@ CHECK-OBJ-NEXT:   Value: 2
+@ CHECK-OBJ-NEXT:   TagName: PCS_config
+@ CHECK-OBJ-NEXT:   Description: Linux Application
+	.eabi_attribute Tag_ABI_PCS_R9_use, 0
+@ CHECK: .eabi_attribute 14, 0
+@ CHECK-OBJ:        Tag: 14
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: ABI_PCS_R9_use
+@ CHECK-OBJ-NEXT:   Description: v6
+	.eabi_attribute Tag_ABI_PCS_RW_data, 0
+@ CHECK: .eabi_attribute 15, 0
+@ CHECK-OBJ:        Tag: 15
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: ABI_PCS_RW_data
+@ CHECK-OBJ-NEXT:   Description: Absolute
+	.eabi_attribute Tag_ABI_PCS_RO_data, 0
+@ CHECK: .eabi_attribute 16, 0
+@ CHECK-OBJ:        Tag: 16
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: ABI_PCS_RO_data
+@ CHECK-OBJ-NEXT:   Description: Absolute
+	.eabi_attribute Tag_ABI_PCS_GOT_use, 0
+@ CHECK: .eabi_attribute 17, 0
+@ CHECK-OBJ:        Tag: 17
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: ABI_PCS_GOT_use
+@ CHECK-OBJ-NEXT:   Description: Not Permitted
+	.eabi_attribute Tag_ABI_PCS_wchar_t, 4
+@ CHECK: .eabi_attribute 18, 4
+@ CHECK-OBJ:        Tag: 18
+@ CHECK-OBJ-NEXT:   Value: 4
+@ CHECK-OBJ-NEXT:   TagName: ABI_PCS_wchar_t
+@ CHECK-OBJ-NEXT:   Description: 4-byte
+	.eabi_attribute Tag_ABI_FP_rounding, 1
+@ CHECK: .eabi_attribute 19, 1
+@ CHECK-OBJ:        Tag: 19
+@ CHECK-OBJ-NEXT:   Value: 1
+@ CHECK-OBJ-NEXT:   TagName: ABI_FP_rounding
+@ CHECK-OBJ-NEXT:   Description: Runtime
+	.eabi_attribute Tag_ABI_FP_denormal, 2
+@ CHECK: .eabi_attribute 20, 2
+@ CHECK-OBJ:        Tag: 20
+@ CHECK-OBJ-NEXT:   Value: 2
+@ CHECK-OBJ-NEXT:   TagName: ABI_FP_denormal
+@ CHECK-OBJ-NEXT:   Description: Sign Only
+	.eabi_attribute Tag_ABI_FP_exceptions, 1
+@ CHECK: .eabi_attribute 21, 1
+@ CHECK-OBJ:        Tag: 21
+@ CHECK-OBJ-NEXT:   Value: 1
+@ CHECK-OBJ-NEXT:   TagName: ABI_FP_exceptions
+@ CHECK-OBJ-NEXT:   Description: IEEE-754
+	.eabi_attribute Tag_ABI_FP_user_exceptions, 1
+@ CHECK: .eabi_attribute 22, 1
+@ CHECK-OBJ:        Tag: 22
+@ CHECK-OBJ-NEXT:   Value: 1
+@ CHECK-OBJ-NEXT:   TagName: ABI_FP_user_exceptions
+@ CHECK-OBJ-NEXT:   Description: IEEE-754
+	.eabi_attribute Tag_ABI_FP_number_model, 3
+@ CHECK: .eabi_attribute 23, 3
+@ CHECK-OBJ:        Tag: 23
+@ CHECK-OBJ-NEXT:   Value: 3
+@ CHECK-OBJ-NEXT:   TagName: ABI_FP_number_model
+@ CHECK-OBJ-NEXT:   Description: IEEE-754
+	.eabi_attribute Tag_ABI_align_needed, 1
+@ CHECK: .eabi_attribute 24, 1
+@ CHECK-OBJ:        Tag: 24
+@ CHECK-OBJ-NEXT:   Value: 1
+@ CHECK-OBJ-NEXT:   TagName: ABI_align_needed
+@ CHECK-OBJ-NEXT:   Description: 8-byte alignment
+	.eabi_attribute Tag_ABI_align_preserved, 2
+@ CHECK: .eabi_attribute 25, 2
+@ CHECK-OBJ:        Tag: 25
+@ CHECK-OBJ-NEXT:   Value: 2
+@ CHECK-OBJ-NEXT:   TagName: ABI_align_preserved
+@ CHECK-OBJ-NEXT:   Description: 8-byte data and code alignment
+	.eabi_attribute Tag_ABI_enum_size, 3
+@ CHECK: .eabi_attribute 26, 3
+@ CHECK-OBJ:        Tag: 26
+@ CHECK-OBJ-NEXT:   Value: 3
+@ CHECK-OBJ-NEXT:   TagName: ABI_enum_size
+@ CHECK-OBJ-NEXT:   Description: External Int32
+	.eabi_attribute Tag_ABI_HardFP_use, 0
+@ CHECK: .eabi_attribute 27, 0
+@ CHECK-OBJ:        Tag: 27
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: ABI_HardFP_use
+@ CHECK-OBJ-NEXT:   Description: Tag_FP_arch
+	.eabi_attribute Tag_ABI_VFP_args, 1
+@ CHECK: .eabi_attribute 28, 1
+@ CHECK-OBJ:        Tag: 28
+@ CHECK-OBJ-NEXT:   Value: 1
+@ CHECK-OBJ-NEXT:   TagName: ABI_VFP_args
+@ CHECK-OBJ-NEXT:   Description: AAPCS VFP
+	.eabi_attribute Tag_ABI_WMMX_args, 0
+@ CHECK: .eabi_attribute 29, 0
+@ CHECK-OBJ:        Tag: 29
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: ABI_WMMX_args
+@ CHECK-OBJ-NEXT:   Description: AAPCS
+	.eabi_attribute Tag_ABI_FP_optimization_goals, 1
+@ CHECK: .eabi_attribute 31, 1
+@ CHECK-OBJ:        Tag: 31
+@ CHECK-OBJ-NEXT:   Value: 1
+@ CHECK-OBJ-NEXT:   TagName: ABI_FP_optimization_goals
+@ CHECK-OBJ-NEXT:   Description: Speed
+	.eabi_attribute Tag_compatibility, 1
+@ CHECK: .eabi_attribute 32, 1
+	.eabi_attribute Tag_compatibility, 1, "aeabi"
+@ CHECK: .eabi_attribute 32, 1, "aeabi"
+@ CHECK-OBJ:        Tag: 32
+@ CHECK-OBJ-NEXT:   Value: 1, AEABI
+@ CHECK-OBJ-NEXT:   TagName: compatibility
+@ CHECK-OBJ-NEXT:   Description: AEABI Conformant
+	.eabi_attribute Tag_CPU_unaligned_access, 0
+@ CHECK: .eabi_attribute 34, 0
+@ CHECK-OBJ:        Tag: 34
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: CPU_unaligned_access
+@ CHECK-OBJ-NEXT:   Description: Not Permitted
+	.eabi_attribute Tag_FP_HP_extension, 0
+@ CHECK: .eabi_attribute 36, 0
+@ CHECK-OBJ:        Tag: 36
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: FP_HP_extension
+@ CHECK-OBJ-NEXT:   Description: If Available
+	.eabi_attribute Tag_ABI_FP_16bit_format, 0
+@ CHECK: .eabi_attribute 38, 0
+@ CHECK-OBJ:        Tag: 38
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: ABI_FP_16bit_format
+@ CHECK-OBJ-NEXT:   Description: Not Permitted
+	.eabi_attribute Tag_MPextension_use, 0
+@ CHECK: .eabi_attribute 42, 0
+@ CHECK-OBJ:        Tag: 42
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: MPextension_use
+@ CHECK-OBJ-NEXT:   Description: Not Permitted
+	.eabi_attribute Tag_DIV_use, 0
+@ CHECK: .eabi_attribute 44, 0
+@ CHECK-OBJ:        Tag: 44
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: DIV_use
+@ CHECK-OBJ-NEXT:   Description: If Available
+	.eabi_attribute Tag_nodefaults, 0
+@ CHECK: .eabi_attribute 64, 0
+@ CHECK-OBJ:        Tag: 64
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: nodefaults
+@ CHECK-OBJ-NEXT:   Description: Unspecified Tags UNDEFINED
+	.eabi_attribute Tag_also_compatible_with, "gnu"
+@ CHECK: .eabi_attribute 65, "gnu"
+@ CHECK-OBJ:        Tag: 65
+@ CHECK-OBJ-NEXT:   TagName: also_compatible_with
+@ CHECK-OBJ-NEXT:   Value: GNU
+	.eabi_attribute Tag_T2EE_use, 0
+@ CHECK: .eabi_attribute 66, 0
+@ CHECK-OBJ:        Tag: 66
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: T2EE_use
+@ CHECK-OBJ-NEXT:   Description: Not Permitted
+	.eabi_attribute Tag_conformance, "2.09"
+@ CHECK: .eabi_attribute 67, "2.09"
+@ CHECK-OBJ:        Tag: 67
+@ CHECK-OBJ-NEXT:   TagName: conformance
+@ CHECK-OBJ-NEXT:   Value: 2.09
+	.eabi_attribute Tag_Virtualization_use, 0
+@ CHECK: .eabi_attribute 68, 0
+@ CHECK-OBJ:        Tag: 68
+@ CHECK-OBJ-NEXT:   Value: 0
+@ CHECK-OBJ-NEXT:   TagName: Virtualization_use
+@ CHECK-OBJ-NEXT:   Description: Not Permitted
 
-@ <section-length>
-@ CHECK: 250000 00
+@ ===--- Compatibility Checks ---===
 
-@ <vendor-name> "aeabi\0"
-@ CHECK: 616561 626900
+	.eabi_attribute Tag_ABI_align8_needed, 1
+@ CHECK: .eabi_attribute 24, 1
+	.eabi_attribute Tag_ABI_align8_preserved, 2
+@ CHECK: .eabi_attribute 25, 2
 
-@ <file-tag>
-@ CHECK: 01
+@ ===--- GNU AS Compatibility Checks ---===
 
-@ <size>
-@ CHECK: 1B000000
-
-@ <attribute>*
-
-	.eabi_attribute 6, 10
-@ CHECK: 060A
-
-	.eabi_attribute 7, 65
-@ CHECK: 0741
-
-	.eabi_attribute 8, 1
-@ CHECK: 0801
-
-	.eabi_attribute 9, 2
-@ CHECK: 0902
-
-	.eabi_attribute 10, 3
-@ CHECK: 0A03
-
-	.eabi_attribute 12, 1
-@ CHECK: 0C01
-
-	.eabi_attribute 20, 1
-@ CHECK: 1401
-
-	.eabi_attribute 21, 1
-@ CHECK: 1501
-
-	.eabi_attribute 23, 3
-@ CHECK: 1703
-
-	.eabi_attribute 24, 1
-@ CHECK: 1801
-
-	.eabi_attribute 25, 1
-@ CHECK: 1901
-@ CHECK: )
+	.eabi_attribute 2 * 2 + 1, "cortex-a9"
+@ CHECK: .cpu cortex-a9
+	.eabi_attribute 2 * 2 + 2, 5 * 2
+@ CHECK: .eabi_attribute 6, 10

diff --git a/test/MC/ARM/directive-fpu-instrs.s b/test/MC/ARM/directive-fpu-instrs.s
new file mode 100644
index 0000000..ec97a77
--- /dev/null
+++ b/test/MC/ARM/directive-fpu-instrs.s

@@ -0,0 +1,16 @@
+// RUN: llvm-mc -triple armv7-unknown-linux-gnueabi -mattr=+vfp3,-neon %s
+
+.fpu neon
+VAND d3, d5, d5
+vldr d21, [r7, #296]
+
+@ .thumb should not disable the prior .fpu neon
+.thumb
+
+vmov q4, q11 @ v4si
+str r6, [r7, #264]
+mov r6, r5
+vldr d21, [r7, #296]
+add r9, r7, #216
+
+fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15}

diff --git a/test/MC/ARM/directive-thumb_func.s b/test/MC/ARM/directive-thumb_func.s
new file mode 100644
index 0000000..f82e0d1
--- /dev/null
+++ b/test/MC/ARM/directive-thumb_func.s

@@ -0,0 +1,22 @@
+@ RUN: not llvm-mc -triple armv7-eabi -filetype asm -o /dev/null %s 2>&1 \
+@ RUN:    | FileCheck %s -check-prefix CHECK-EABI
+
+@ NOTE: this test ensures that both forms are accepted for MachO
+@ RUN: llvm-mc -triple armv7-darwin -filetype asm -o /dev/null %s
+
+	.syntax unified
+
+	.thumb_func
+no_suffix:
+	bx lr
+
+	.thumb_func suffix
+suffix:
+	bx lr
+
+// CHECK-EABI: error: unexpected token in directive
+// CHECK-EABI: 	.thumb_func suffix
+// CHECK-EABI:              ^
+
+// CHECK-EABI-NOT: error: invalid instruction
+

diff --git a/test/MC/ARM/directive-unsupported.s b/test/MC/ARM/directive-unsupported.s
new file mode 100644
index 0000000..0b1f9ba
--- /dev/null
+++ b/test/MC/ARM/directive-unsupported.s

@@ -0,0 +1,68 @@
+@ RUN: not llvm-mc -triple thumbv7-windows -filetype asm -o /dev/null %s 2>&1 \
+@ RUN:     | FileCheck %s
+
+@ RUN: not llvm-mc -triple armv7-darwin -filetype asm -o /dev/null %s 2>&1 \
+@ RUN:    | FileCheck %s
+
+	.syntax unified
+
+	.arch armv7
+
+// CHECK: error: unknown directive
+// CHECK: .arch armv7
+// CHECK: ^
+
+	.cpu cortex-a7
+
+// CHECK: error: unknown directive
+// CHECK: .cpu cortex-a7
+// CHECK: ^
+
+	.fpu neon
+
+// CHECK: error: unknown directive
+// CHECK: .fpu neon
+// CHECK: ^
+
+	.eabi_attribute 0, 0
+
+// CHECK: error: unknown directive
+// CHECK: .eabi_attribute 0, 0
+// CHECK: ^
+
+	.inst 0xdefe
+
+// CHECK: error: unknown directive
+// CHECK: .inst 0xdefe
+// CHECK: ^
+
+	.inst.n 0xdefe
+
+// CHECK: error: unknown directive
+// CHECK: .inst.n 0xdefe
+// CHECK: ^
+
+	.inst.w 0xdefe
+
+// CHECK: error: unknown directive
+// CHECK: .inst.w 0xdefe
+// CHECK: ^
+
+	.object_arch armv7
+
+// CHECK: error: unknown directive
+// CHECK: .object_arch armv7
+// CHECK: ^
+
+	.tlsdescseq undefined
+
+// CHECK: error: unknown directive
+// CHECK: .tlsdescseq undefined
+// CHECK: ^
+
+	.fnstart
+
+// CHECK: error: unknown directive
+// CHECK: .fnstart
+// CHECK: ^
+

diff --git a/test/MC/ARM/dwarf-asm-multiple-sections-dwarf-2.s b/test/MC/ARM/dwarf-asm-multiple-sections-dwarf-2.s
new file mode 100644
index 0000000..5bf8fbd
--- /dev/null
+++ b/test/MC/ARM/dwarf-asm-multiple-sections-dwarf-2.s

@@ -0,0 +1,66 @@
+// RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -fdebug-compilation-dir=/tmp -dwarf-version 2 2>&1 | FileCheck -check-prefix MESSAGES %s
+// RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF %s
+// RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC %s
+
+  .section .text, "ax"
+a:
+  mov r0, r0
+
+  .section foo, "ax"
+b:
+  mov r1, r1
+
+// MESSAGES: warning: DWARF2 only supports one section per compilation unit
+
+// DWARF: .debug_abbrev contents:
+// DWARF: Abbrev table for offset: 0x00000000
+// DWARF: [1] DW_TAG_compile_unit DW_CHILDREN_yes
+// DWARF:         DW_AT_stmt_list DW_FORM_data4
+// DWARF:         DW_AT_low_pc    DW_FORM_addr
+// DWARF:         DW_AT_high_pc   DW_FORM_addr
+// DWARF:         DW_AT_name      DW_FORM_string
+// DWARF:         DW_AT_comp_dir  DW_FORM_string
+// DWARF:         DW_AT_producer  DW_FORM_string
+// DWARF:         DW_AT_language  DW_FORM_data2
+
+// DWARF: .debug_info contents:
+// DWARF: 0x{{[0-9a-f]+}}: DW_TAG_compile_unit [1]
+// DWARF-NOT: DW_TAG_
+// DWARF:               DW_AT_low_pc [DW_FORM_addr]       (0x0000000000000000)
+// DWARF:               DW_AT_high_pc [DW_FORM_addr]      (0x0000000000000004)
+
+// DWARF: 0x{{[0-9a-f]+}}:   DW_TAG_label [2] *
+// DWARF-NEXT: DW_AT_name [DW_FORM_string]     ("a")
+
+
+// DWARF: .debug_aranges contents:
+// DWARF-NEXT: Address Range Header: length = 0x00000024, version = 0x0002, cu_offset = 0x00000000, addr_size = 0x04, seg_size = 0x00
+// DWARF-NEXT: [0x00000000 - 0x00000004)
+// DWARF-NEXT: [0x00000000 - 0x00000004)
+
+// DWARF: .debug_line contents:
+// DWARF:      0x0000000000000000      7      0      1   0   0 is_stmt
+// DWARF-NEXT: 0x0000000000000004      7      0      1   0   0 is_stmt end_sequence
+// DWARF:      0x0000000000000000     11      0      1   0   0 is_stmt
+// DWARF-NEXT: 0x0000000000000004     11      0      1   0   0 is_stmt end_sequence
+
+
+// DWARF: .debug_ranges contents:
+// DWARF-NOT: {{0-9a-f}}
+// DWARF: .debug_pubnames contents:
+
+
+// RELOC: RELOCATION RECORDS FOR [.rel.debug_info]:
+// RELOC-NEXT: 00000006 R_ARM_ABS32 .debug_abbrev
+// RELOC-NEXT: 0000000c R_ARM_ABS32 .debug_line
+// RELOC-NEXT: R_ARM_ABS32 .text
+// RELOC-NEXT: R_ARM_ABS32 .text
+// RELOC-NEXT: R_ARM_ABS32 .text
+// RELOC-NEXT: R_ARM_ABS32 foo
+
+// RELOC-NOT: RELOCATION RECORDS FOR [.rel.debug_ranges]:
+
+// RELOC: RELOCATION RECORDS FOR [.rel.debug_aranges]:
+// RELOC-NEXT: 00000006 R_ARM_ABS32 .debug_info
+// RELOC-NEXT: 00000010 R_ARM_ABS32 .text
+// RELOC-NEXT: 00000018 R_ARM_ABS32 foo

diff --git a/test/MC/ARM/dwarf-asm-multiple-sections.s b/test/MC/ARM/dwarf-asm-multiple-sections.s
index ed1b89e..0eb8bab 100644
--- a/test/MC/ARM/dwarf-asm-multiple-sections.s
+++ b/test/MC/ARM/dwarf-asm-multiple-sections.s

@@ -1,7 +1,7 @@
 // RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -fdebug-compilation-dir=/tmp
 // RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF %s
 // RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC %s
-// RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 2 2>&1 | FileCheck -check-prefix VERSION %s
+// RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 2 2>&1 | FileCheck -check-prefix VERSION %s
 // RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 1 2>&1 | FileCheck -check-prefix DWARF1 %s
 // RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 5 2>&1 | FileCheck -check-prefix DWARF5 %s
   .section .text, "ax"
@@ -25,7 +25,7 @@
 // DWARF: .debug_info contents:
 // DWARF: 0x{{[0-9a-f]+}}: DW_TAG_compile_unit [1]
 // CHECK-NOT-DWARF: DW_TAG_
-// DWARF: DW_AT_ranges [DW_FORM_data4]      (0x00000000)
+// DWARF: DW_AT_ranges [DW_FORM_data4]      (0x00000000
 
 // DWARF: 0x{{[0-9a-f]+}}:   DW_TAG_label [2] *
 // DWARF-NEXT: DW_AT_name [DW_FORM_string]     ("a")
@@ -73,7 +73,7 @@
 // RELOC-NEXT: 00000018 R_ARM_ABS32 foo
 
 
-// VERSION: {{.*}} error: DWARF2 only supports one section per compilation unit
+// VERSION: {{.*}} warning: DWARF2 only supports one section per compilation unit
 
 // DWARF1: Dwarf version 1 is not supported.
 // DWARF5: Dwarf version 5 is not supported.

diff --git a/test/MC/ARM/ldr-pseudo-darwin.s b/test/MC/ARM/ldr-pseudo-darwin.s
index a77f6d5..f04f533 100644
--- a/test/MC/ARM/ldr-pseudo-darwin.s
+++ b/test/MC/ARM/ldr-pseudo-darwin.s

@@ -156,35 +156,38 @@
 @ Constant Pools
 @
 @ CHECK: .section __TEXT,b,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp0:
 @ CHECK: .long 65537
 @ CHECK: .end_data_region
 
 @ CHECK: .section __TEXT,c,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp1:
 @ CHECK: .long 65538
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp2:
 @ CHECK: .long 65539
 @ CHECK: .end_data_region
 
 @ CHECK: .section __TEXT,d,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp3:
 @ CHECK: .long 65540
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp4:
 @ CHECK: .long 65540
 @ CHECK: .end_data_region
 
 @ CHECK: .section __TEXT,e,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp5:
 @ CHECK: .long 65542
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp6:
 @ CHECK: .long 65543
 @ CHECK: .end_data_region
@@ -193,49 +196,52 @@
 @ CHECK-NOT: .section __TEXT,f,regular,pure_instructions
 
 @ CHECK: .section __TEXT,g,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp7:
 @ CHECK: .long foo
 @ CHECK: .end_data_region
 
 @ CHECK: .section __TEXT,h,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp8:
 @ CHECK: .long f5
 @ CHECK: .end_data_region
 
 @ CHECK: .section __TEXT,i,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp9:
 @ CHECK: .long f12
 @ CHECK: .end_data_region
 
 @ CHECK: .section __TEXT,j,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp10:
 @ CHECK: .long 257
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp11:
 @ CHECK: .long bar
 @ CHECK: .end_data_region
 
 @ CHECK: .section __TEXT,k,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp12:
 @ CHECK: .long 65544
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp13:
 @ CHECK: .long baz
 @ CHECK: .end_data_region
 
 @ CHECK: .section __TEXT,l,regular,pure_instructions
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp14:
 @ CHECK: .long 65545
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp15:
 @ CHECK: .long bar+4
 @ CHECK: .end_data_region

diff --git a/test/MC/ARM/ltorg-darwin.s b/test/MC/ARM/ltorg-darwin.s
index de6b7e9..3402f40 100644
--- a/test/MC/ARM/ltorg-darwin.s
+++ b/test/MC/ARM/ltorg-darwin.s

@@ -19,8 +19,8 @@
   b f3
 .ltorg
 @ constant pool
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp0:
 @ CHECK: .long 65537
 @ CHECK: .end_data_region
@@ -41,8 +41,8 @@
   b f5
 .ltorg
 @ constant pool
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp1:
 @ CHECK: .long 65538
 @ CHECK: .end_data_region
@@ -57,8 +57,8 @@
   b f6
 .ltorg
 @ constant pool
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp2:
 @ CHECK: .long 65539
 @ CHECK: .end_data_region
@@ -92,8 +92,8 @@
   b f10
 .ltorg
 @ constant pool
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp3:
 @ CHECK: .long bar
 @ CHECK: .end_data_region
@@ -114,8 +114,8 @@
   b f12
   .ltorg
 @ constant pool
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp4:
 @ CHECK: .long 65540
 @ CHECK: .end_data_region
@@ -141,8 +141,8 @@
 @ should have a non-empty constant pool at end of this section
 @ CHECK: .section __TEXT,e,regular,pure_instructions
 @ constant pool
-@ CHECK: .align 2
 @ CHECK: .data_region
+@ CHECK: .align 2
 @ CHECK-LABEL: Ltmp5:
 @ CHECK: .long 65541
 @ CHECK: .end_data_region

diff --git a/test/MC/ARM/move-banked-regs.s b/test/MC/ARM/move-banked-regs.s
new file mode 100644
index 0000000..3fac846
--- /dev/null
+++ b/test/MC/ARM/move-banked-regs.s

@@ -0,0 +1,220 @@
+@ RUN: llvm-mc -triple armv7 -mattr=virtualization -show-encoding %s | FileCheck %s --check-prefix=CHECK-ARM
+@ RUN: llvm-mc -triple thumbv7 -mattr=virtualization -show-encoding %s | FileCheck %s --check-prefix=CHECK-THUMB
+
+        mrs r2, r8_usr
+        mrs r3, r9_usr
+        mrs r5, r10_usr
+        mrs r7, r11_usr
+        mrs r11, r12_usr
+        mrs r1, sp_usr
+        mrs r2, lr_usr
+@ CHECK-ARM:         mrs     r2, r8_usr              @ encoding: [0x00,0x22,0x20,0xe1]
+@ CHECK-ARM:         mrs     r3, r9_usr              @ encoding: [0x00,0x32,0x21,0xe1]
+@ CHECK-ARM:         mrs     r5, r10_usr             @ encoding: [0x00,0x52,0x22,0xe1]
+@ CHECK-ARM:         mrs     r7, r11_usr             @ encoding: [0x00,0x72,0x23,0xe1]
+@ CHECK-ARM:         mrs     r11, r12_usr            @ encoding: [0x00,0xb2,0x24,0xe1]
+@ CHECK-ARM:         mrs     r1, sp_usr              @ encoding: [0x00,0x12,0x25,0xe1]
+@ CHECK-ARM:         mrs     r2, lr_usr              @ encoding: [0x00,0x22,0x26,0xe1]
+@ CHECK-THUMB:         mrs     r2, r8_usr              @ encoding: [0xe0,0xf3,0x20,0x82]
+@ CHECK-THUMB:         mrs     r3, r9_usr              @ encoding: [0xe1,0xf3,0x20,0x83]
+@ CHECK-THUMB:         mrs     r5, r10_usr             @ encoding: [0xe2,0xf3,0x20,0x85]
+@ CHECK-THUMB:         mrs     r7, r11_usr             @ encoding: [0xe3,0xf3,0x20,0x87]
+@ CHECK-THUMB:         mrs     r11, r12_usr            @ encoding: [0xe4,0xf3,0x20,0x8b]
+@ CHECK-THUMB:         mrs     r1, sp_usr              @ encoding: [0xe5,0xf3,0x20,0x81]
+@ CHECK-THUMB:         mrs     r2, lr_usr              @ encoding: [0xe6,0xf3,0x20,0x82]
+
+        mrs r2, r8_fiq
+        mrs r3, r9_fiq
+        mrs r5, r10_fiq
+        mrs r7, r11_fiq
+        mrs r11, r12_fiq
+        mrs r1, sp_fiq
+        mrs r2, lr_fiq
+        mrs r3, spsr_fiq
+@ CHECK-ARM:         mrs     r2, r8_fiq              @ encoding: [0x00,0x22,0x28,0xe1]
+@ CHECK-ARM:         mrs     r3, r9_fiq              @ encoding: [0x00,0x32,0x29,0xe1]
+@ CHECK-ARM:         mrs     r5, r10_fiq             @ encoding: [0x00,0x52,0x2a,0xe1]
+@ CHECK-ARM:         mrs     r7, r11_fiq             @ encoding: [0x00,0x72,0x2b,0xe1]
+@ CHECK-ARM:         mrs     r11, r12_fiq            @ encoding: [0x00,0xb2,0x2c,0xe1]
+@ CHECK-ARM:         mrs     r1, sp_fiq              @ encoding: [0x00,0x12,0x2d,0xe1]
+@ CHECK-ARM:         mrs     r2, lr_fiq              @ encoding: [0x00,0x22,0x2e,0xe1]
+@ CHECK-ARM:         mrs     r3, SPSR_fiq            @ encoding: [0x00,0x32,0x6e,0xe1]
+@ CHECK-THUMB:         mrs     r2, r8_fiq              @ encoding: [0xe8,0xf3,0x20,0x82]
+@ CHECK-THUMB:         mrs     r3, r9_fiq              @ encoding: [0xe9,0xf3,0x20,0x83]
+@ CHECK-THUMB:         mrs     r5, r10_fiq             @ encoding: [0xea,0xf3,0x20,0x85]
+@ CHECK-THUMB:         mrs     r7, r11_fiq             @ encoding: [0xeb,0xf3,0x20,0x87]
+@ CHECK-THUMB:         mrs     r11, r12_fiq            @ encoding: [0xec,0xf3,0x20,0x8b]
+@ CHECK-THUMB:         mrs     r1, sp_fiq              @ encoding: [0xed,0xf3,0x20,0x81]
+@ CHECK-THUMB:         mrs     r2, lr_fiq              @ encoding: [0xee,0xf3,0x20,0x82]
+@ CHECK-THUMB:         mrs     r3, SPSR_fiq            @ encoding: [0xfe,0xf3,0x20,0x83]
+
+        mrs r4, lr_irq
+        mrs r9, sp_irq
+        mrs r1, spsr_irq
+@ CHECK-ARM:         mrs     r4, lr_irq              @ encoding: [0x00,0x43,0x20,0xe1]
+@ CHECK-ARM:         mrs     r9, sp_irq              @ encoding: [0x00,0x93,0x21,0xe1]
+@ CHECK-ARM:         mrs     r1, SPSR_irq            @ encoding: [0x00,0x13,0x60,0xe1]
+@ CHECK-THUMB:         mrs     r4, lr_irq              @ encoding: [0xe0,0xf3,0x30,0x84]
+@ CHECK-THUMB:         mrs     r9, sp_irq              @ encoding: [0xe1,0xf3,0x30,0x89]
+@ CHECK-THUMB:         mrs     r1, SPSR_irq            @ encoding: [0xf0,0xf3,0x30,0x81]
+
+        mrs r1, lr_svc
+        mrs r3, sp_svc
+        mrs r5, spsr_svc
+@ CHECK-ARM:         mrs     r1, lr_svc              @ encoding: [0x00,0x13,0x22,0xe1]
+@ CHECK-ARM:         mrs     r3, sp_svc              @ encoding: [0x00,0x33,0x23,0xe1]
+@ CHECK-ARM:         mrs     r5, SPSR_svc            @ encoding: [0x00,0x53,0x62,0xe1]
+@ CHECK-THUMB:         mrs     r1, lr_svc              @ encoding: [0xe2,0xf3,0x30,0x81]
+@ CHECK-THUMB:         mrs     r3, sp_svc              @ encoding: [0xe3,0xf3,0x30,0x83]
+@ CHECK-THUMB:         mrs     r5, SPSR_svc            @ encoding: [0xf2,0xf3,0x30,0x85]
+
+        mrs r5, lr_abt
+        mrs r7, sp_abt
+        mrs r9, spsr_abt
+@ CHECK-ARM:         mrs     r5, lr_abt              @ encoding: [0x00,0x53,0x24,0xe1]
+@ CHECK-ARM:         mrs     r7, sp_abt              @ encoding: [0x00,0x73,0x25,0xe1]
+@ CHECK-ARM:         mrs     r9, SPSR_abt            @ encoding: [0x00,0x93,0x64,0xe1]
+@ CHECK-THUMB:         mrs     r5, lr_abt              @ encoding: [0xe4,0xf3,0x30,0x85]
+@ CHECK-THUMB:         mrs     r7, sp_abt              @ encoding: [0xe5,0xf3,0x30,0x87]
+@ CHECK-THUMB:         mrs     r9, SPSR_abt            @ encoding: [0xf4,0xf3,0x30,0x89]
+
+        mrs r9, lr_und
+        mrs r11, sp_und
+        mrs r12, spsr_und
+@ CHECK-ARM:         mrs     r9, lr_und              @ encoding: [0x00,0x93,0x26,0xe1]
+@ CHECK-ARM:         mrs     r11, sp_und             @ encoding: [0x00,0xb3,0x27,0xe1]
+@ CHECK-ARM:         mrs     r12, SPSR_und           @ encoding: [0x00,0xc3,0x66,0xe1]
+@ CHECK-THUMB:         mrs     r9, lr_und              @ encoding: [0xe6,0xf3,0x30,0x89]
+@ CHECK-THUMB:         mrs     r11, sp_und             @ encoding: [0xe7,0xf3,0x30,0x8b]
+@ CHECK-THUMB:         mrs     r12, SPSR_und           @ encoding: [0xf6,0xf3,0x30,0x8c]
+
+
+        mrs r2, lr_mon
+        mrs r4, sp_mon
+        mrs r6, spsr_mon
+@ CHECK-ARM:         mrs     r2, lr_mon              @ encoding: [0x00,0x23,0x2c,0xe1]
+@ CHECK-ARM:         mrs     r4, sp_mon              @ encoding: [0x00,0x43,0x2d,0xe1]
+@ CHECK-ARM:         mrs     r6, SPSR_mon            @ encoding: [0x00,0x63,0x6c,0xe1]
+@ CHECK-THUMB:         mrs     r2, lr_mon              @ encoding: [0xec,0xf3,0x30,0x82]
+@ CHECK-THUMB:         mrs     r4, sp_mon              @ encoding: [0xed,0xf3,0x30,0x84]
+@ CHECK-THUMB:         mrs     r6, SPSR_mon            @ encoding: [0xfc,0xf3,0x30,0x86]
+
+
+        mrs r6, elr_hyp
+        mrs r8, sp_hyp
+        mrs r10, spsr_hyp
+@ CHECK-ARM:         mrs     r6, elr_hyp             @ encoding: [0x00,0x63,0x2e,0xe1]
+@ CHECK-ARM:         mrs     r8, sp_hyp              @ encoding: [0x00,0x83,0x2f,0xe1]
+@ CHECK-ARM:         mrs     r10, SPSR_hyp            @ encoding: [0x00,0xa3,0x6e,0xe1]
+@ CHECK-THUMB:         mrs     r6, elr_hyp             @ encoding: [0xee,0xf3,0x30,0x86]
+@ CHECK-THUMB:         mrs     r8, sp_hyp              @ encoding: [0xef,0xf3,0x30,0x88]
+@ CHECK-THUMB:         mrs     r10, SPSR_hyp            @ encoding: [0xfe,0xf3,0x30,0x8a]
+
+
+        msr r8_usr, r2
+        msr r9_usr, r3
+        msr r10_usr, r5
+        msr r11_usr, r7
+        msr r12_usr, r11
+        msr sp_usr, r1
+        msr lr_usr, r2
+@ CHECK-ARM:         msr     r8_usr, r2              @ encoding: [0x02,0xf2,0x20,0xe1]
+@ CHECK-ARM:         msr     r9_usr, r3              @ encoding: [0x03,0xf2,0x21,0xe1]
+@ CHECK-ARM:         msr     r10_usr, r5             @ encoding: [0x05,0xf2,0x22,0xe1]
+@ CHECK-ARM:         msr     r11_usr, r7             @ encoding: [0x07,0xf2,0x23,0xe1]
+@ CHECK-ARM:         msr     r12_usr, r11            @ encoding: [0x0b,0xf2,0x24,0xe1]
+@ CHECK-ARM:         msr     sp_usr, r1              @ encoding: [0x01,0xf2,0x25,0xe1]
+@ CHECK-ARM:         msr     lr_usr, r2              @ encoding: [0x02,0xf2,0x26,0xe1]
+@ CHECK-THUMB:         msr     r8_usr, r2              @ encoding: [0x82,0xf3,0x20,0x80]
+@ CHECK-THUMB:         msr     r9_usr, r3              @ encoding: [0x83,0xf3,0x20,0x81]
+@ CHECK-THUMB:         msr     r10_usr, r5             @ encoding: [0x85,0xf3,0x20,0x82]
+@ CHECK-THUMB:         msr     r11_usr, r7             @ encoding: [0x87,0xf3,0x20,0x83]
+@ CHECK-THUMB:         msr     r12_usr, r11            @ encoding: [0x8b,0xf3,0x20,0x84]
+@ CHECK-THUMB:         msr     sp_usr, r1              @ encoding: [0x81,0xf3,0x20,0x85]
+@ CHECK-THUMB:         msr     lr_usr, r2              @ encoding: [0x82,0xf3,0x20,0x86]
+
+        msr r8_fiq, r2
+        msr r9_fiq, r3
+        msr r10_fiq, r5
+        msr r11_fiq, r7
+        msr r12_fiq, r11
+        msr sp_fiq, r1
+        msr lr_fiq, r2
+        msr spsr_fiq, r3
+@ CHECK-ARM:         msr     r8_fiq, r2              @ encoding: [0x02,0xf2,0x28,0xe1]
+@ CHECK-ARM:         msr     r9_fiq, r3              @ encoding: [0x03,0xf2,0x29,0xe1]
+@ CHECK-ARM:         msr     r10_fiq, r5             @ encoding: [0x05,0xf2,0x2a,0xe1]
+@ CHECK-ARM:         msr     r11_fiq, r7             @ encoding: [0x07,0xf2,0x2b,0xe1]
+@ CHECK-ARM:         msr     r12_fiq, r11            @ encoding: [0x0b,0xf2,0x2c,0xe1]
+@ CHECK-ARM:         msr     sp_fiq, r1              @ encoding: [0x01,0xf2,0x2d,0xe1]
+@ CHECK-ARM:         msr     lr_fiq, r2              @ encoding: [0x02,0xf2,0x2e,0xe1]
+@ CHECK-ARM:         msr     SPSR_fiq, r3            @ encoding: [0x03,0xf2,0x6e,0xe1]
+@ CHECK-THUMB:         msr     r8_fiq, r2              @ encoding: [0x82,0xf3,0x20,0x88]
+@ CHECK-THUMB:         msr     r9_fiq, r3              @ encoding: [0x83,0xf3,0x20,0x89]
+@ CHECK-THUMB:         msr     r10_fiq, r5             @ encoding: [0x85,0xf3,0x20,0x8a]
+@ CHECK-THUMB:         msr     r11_fiq, r7             @ encoding: [0x87,0xf3,0x20,0x8b]
+@ CHECK-THUMB:         msr     r12_fiq, r11            @ encoding: [0x8b,0xf3,0x20,0x8c]
+@ CHECK-THUMB:         msr     sp_fiq, r1              @ encoding: [0x81,0xf3,0x20,0x8d]
+@ CHECK-THUMB:         msr     lr_fiq, r2              @ encoding: [0x82,0xf3,0x20,0x8e]
+@ CHECK-THUMB:        msr     SPSR_fiq, r3            @ encoding: [0x93,0xf3,0x20,0x8e]
+
+        msr lr_irq, r4
+        msr sp_irq, r9
+        msr spsr_irq, r11
+@ CHECK-ARM:         msr     lr_irq, r4              @ encoding: [0x04,0xf3,0x20,0xe1]
+@ CHECK-ARM:         msr     sp_irq, r9              @ encoding: [0x09,0xf3,0x21,0xe1]
+@ CHECK-ARM:         msr     SPSR_irq, r11           @ encoding: [0x0b,0xf3,0x60,0xe1]
+@ CHECK-THUMB:         msr     lr_irq, r4              @ encoding: [0x84,0xf3,0x30,0x80]
+@ CHECK-THUMB:         msr     sp_irq, r9              @ encoding: [0x89,0xf3,0x30,0x81]
+@ CHECK-THUMB:         msr     SPSR_irq, r11           @ encoding: [0x9b,0xf3,0x30,0x80]
+
+        msr lr_svc, r1
+        msr sp_svc, r3
+        msr spsr_svc, r5
+@ CHECK-ARM:         msr     lr_svc, r1              @ encoding: [0x01,0xf3,0x22,0xe1]
+@ CHECK-ARM:         msr     sp_svc, r3              @ encoding: [0x03,0xf3,0x23,0xe1]
+@ CHECK-ARM:         msr     SPSR_svc, r5            @ encoding: [0x05,0xf3,0x62,0xe1]
+@ CHECK-THUMB:         msr     lr_svc, r1              @ encoding: [0x81,0xf3,0x30,0x82]
+@ CHECK-THUMB:         msr     sp_svc, r3              @ encoding: [0x83,0xf3,0x30,0x83]
+@ CHECK-THUMB:         msr     SPSR_svc, r5            @ encoding: [0x95,0xf3,0x30,0x82]
+
+        msr lr_abt, r5
+        msr sp_abt, r7
+        msr spsr_abt, r9
+@ CHECK-ARM:         msr     lr_abt, r5              @ encoding: [0x05,0xf3,0x24,0xe1]
+@ CHECK-ARM:         msr     sp_abt, r7              @ encoding: [0x07,0xf3,0x25,0xe1]
+@ CHECK-ARM:         msr     SPSR_abt, r9            @ encoding: [0x09,0xf3,0x64,0xe1]
+@ CHECK-THUMB:         msr     lr_abt, r5              @ encoding: [0x85,0xf3,0x30,0x84]
+@ CHECK-THUMB:         msr     sp_abt, r7              @ encoding: [0x87,0xf3,0x30,0x85]
+@ CHECK-THUMB:         msr     SPSR_abt, r9            @ encoding: [0x99,0xf3,0x30,0x84]
+
+        msr lr_und, r9
+        msr sp_und, r11
+        msr spsr_und, r12
+@ CHECK-ARM:         msr     lr_und, r9              @ encoding: [0x09,0xf3,0x26,0xe1]
+@ CHECK-ARM:         msr     sp_und, r11             @ encoding: [0x0b,0xf3,0x27,0xe1]
+@ CHECK-ARM:         msr     SPSR_und, r12           @ encoding: [0x0c,0xf3,0x66,0xe1]
+@ CHECK-THUMB:         msr     lr_und, r9              @ encoding: [0x89,0xf3,0x30,0x86]
+@ CHECK-THUMB:         msr     sp_und, r11             @ encoding: [0x8b,0xf3,0x30,0x87]
+@ CHECK-THUMB:         msr     SPSR_und, r12           @ encoding: [0x9c,0xf3,0x30,0x86]
+
+
+        msr lr_mon, r2
+        msr sp_mon, r4
+        msr spsr_mon, r6
+@ CHECK-ARM:         msr     lr_mon, r2              @ encoding: [0x02,0xf3,0x2c,0xe1]
+@ CHECK-ARM:         msr     sp_mon, r4              @ encoding: [0x04,0xf3,0x2d,0xe1]
+@ CHECK-ARM:         msr     SPSR_mon, r6            @ encoding: [0x06,0xf3,0x6c,0xe1]
+@ CHECK-THUMB:         msr     lr_mon, r2              @ encoding: [0x82,0xf3,0x30,0x8c]
+@ CHECK-THUMB:         msr     sp_mon, r4              @ encoding: [0x84,0xf3,0x30,0x8d]
+@ CHECK-THUMB:         msr     SPSR_mon, r6            @ encoding: [0x96,0xf3,0x30,0x8c]
+
+        msr elr_hyp, r6
+        msr sp_hyp, r8
+        msr spsr_hyp, r10
+@ CHECK-ARM:         msr     elr_hyp, r6             @ encoding: [0x06,0xf3,0x2e,0xe1]
+@ CHECK-ARM:         msr     sp_hyp, r8              @ encoding: [0x08,0xf3,0x2f,0xe1]
+@ CHECK-ARM:         msr     SPSR_hyp, r10           @ encoding: [0x0a,0xf3,0x6e,0xe1]
+@ CHECK-THUMB:         msr     elr_hyp, r6             @ encoding: [0x86,0xf3,0x30,0x8e]
+@ CHECK-THUMB:         msr     sp_hyp, r8              @ encoding: [0x88,0xf3,0x30,0x8f]
+@ CHECK-THUMB:         msr     SPSR_hyp, r10           @ encoding: [0x9a,0xf3,0x30,0x8e]

diff --git a/test/MC/ARM/neon-bitwise-encoding.s b/test/MC/ARM/neon-bitwise-encoding.s
index 8c72288..d142dba 100644
--- a/test/MC/ARM/neon-bitwise-encoding.s
+++ b/test/MC/ARM/neon-bitwise-encoding.s

@@ -29,18 +29,63 @@
 
 	vbic	d16, d17, d16
 	vbic	q8, q8, q9
+	vbic q10, q11
+	vbic d9, d1
+	vbic.i16	d16, #0xFF00
+	vbic.i16	q8,  #0xFF00
+	vbic.i16	d16, #0x00FF
+	vbic.i16	q8,  #0x00FF
 	vbic.i32	d16, #0xFF000000
-	vbic.i32	q8, #0xFF000000
-        vbic q10, q11
-        vbic d9, d1
+	vbic.i32	q8,  #0xFF000000
+	vbic.i32	d16, #0x00FF0000
+	vbic.i32	q8,  #0x00FF0000
+	vbic.i32	d16, #0x0000FF00
+	vbic.i32	q8,  #0x0000FF00
+	vbic.i32	d16, #0x000000FF
+	vbic.i32	q8,  #0x000000FF
 
 @ CHECK: vbic	d16, d17, d16           @ encoding: [0xb0,0x01,0x51,0xf2]
 @ CHECK: vbic	q8, q8, q9              @ encoding: [0xf2,0x01,0x50,0xf2]
-@ CHECK: vbic.i32	d16, #0xff000000 @ encoding: [0x3f,0x07,0xc7,0xf3]
-@ CHECK: vbic.i32	q8, #0xff000000 @ encoding: [0x7f,0x07,0xc7,0xf3]
 @ CHECK: vbic	q10, q10, q11           @ encoding: [0xf6,0x41,0x54,0xf2]
 @ CHECK: vbic	d9, d9, d1              @ encoding: [0x11,0x91,0x19,0xf2]
+@ CHECK: vbic.i16	d16, #0xff00    @ encoding: [0x3f,0x0b,0xc7,0xf3]
+@ CHECK: vbic.i16	q8, #0xff00     @ encoding: [0x7f,0x0b,0xc7,0xf3]
+@ CHECK: vbic.i16	d16, #0xff      @ encoding: [0x3f,0x09,0xc7,0xf3]
+@ CHECK: vbic.i16	q8, #0xff       @ encoding: [0x7f,0x09,0xc7,0xf3]
+@ CHECK: vbic.i32	d16, #0xff000000 @ encoding: [0x3f,0x07,0xc7,0xf3]
+@ CHECK: vbic.i32	q8, #0xff000000 @ encoding: [0x7f,0x07,0xc7,0xf3]
+@ CHECK: vbic.i32	d16, #0xff0000  @ encoding: [0x3f,0x05,0xc7,0xf3]
+@ CHECK: vbic.i32	q8, #0xff0000   @ encoding: [0x7f,0x05,0xc7,0xf3]
+@ CHECK: vbic.i32	d16, #0xff00    @ encoding: [0x3f,0x03,0xc7,0xf3]
+@ CHECK: vbic.i32	q8, #0xff00     @ encoding: [0x7f,0x03,0xc7,0xf3]
+@ CHECK: vbic.i32	d16, #0xff      @ encoding: [0x3f,0x01,0xc7,0xf3]
+@ CHECK: vbic.i32	q8, #0xff       @ encoding: [0x7f,0x01,0xc7,0xf3]
 
+	vand.i16 d10, #0xff03
+	vand.i16 q10, #0xff03
+	vand.i16 d10, #0x03ff
+	vand.i16 q10, #0x03ff
+	vand.i32 d10, #0x03ffffff
+	vand.i32 q10, #0x03ffffff
+	vand.i32 d10, #0xff03ffff
+	vand.i32 q10, #0xff03ffff
+	vand.i32 d10, #0xffff03ff
+	vand.i32 q10, #0xffff03ff
+	vand.i32 d10, #0xffffff03
+	vand.i32 q10, #0xffffff03
+
+@ CHECK: vbic.i16	d10, #0xfc      @ encoding: [0x3c,0xa9,0x87,0xf3]
+@ CHECK: vbic.i16	q10, #0xfc      @ encoding: [0x7c,0x49,0xc7,0xf3]
+@ CHECK: vbic.i16	d10, #0xfc00    @ encoding: [0x3c,0xab,0x87,0xf3]
+@ CHECK: vbic.i16	q10, #0xfc00    @ encoding: [0x7c,0x4b,0xc7,0xf3]
+@ CHECK: vbic.i32	d10, #0xfc000000 @ encoding: [0x3c,0xa7,0x87,0xf3]
+@ CHECK: vbic.i32	q10, #0xfc000000 @ encoding: [0x7c,0x47,0xc7,0xf3]
+@ CHECK: vbic.i32	d10, #0xfc0000  @ encoding: [0x3c,0xa5,0x87,0xf3]
+@ CHECK: vbic.i32	q10, #0xfc0000  @ encoding: [0x7c,0x45,0xc7,0xf3]
+@ CHECK: vbic.i32	d10, #0xfc00    @ encoding: [0x3c,0xa3,0x87,0xf3]
+@ CHECK: vbic.i32	q10, #0xfc00    @ encoding: [0x7c,0x43,0xc7,0xf3]
+@ CHECK: vbic.i32	d10, #0xfc      @ encoding: [0x3c,0xa1,0x87,0xf3]
+@ CHECK: vbic.i32	q10, #0xfc      @ encoding: [0x7c,0x41,0xc7,0xf3]
 
 	vorn	d16, d17, d16
 	vorn	q8, q8, q9

diff --git a/test/MC/ARM/neon-mov-vfp.s b/test/MC/ARM/neon-mov-vfp.s
new file mode 100644
index 0000000..6ee6bfd
--- /dev/null
+++ b/test/MC/ARM/neon-mov-vfp.s

@@ -0,0 +1,32 @@
+@ RUN: not llvm-mc -mcpu=cortex-a8 -triple armv7-unknown-unknown -show-encoding -mattr=-neon < %s 2>&1 | FileCheck %s --check-prefix=VFP --check-prefix=CHECK
+@ RUN: not llvm-mc -mcpu=cortex-a8 -triple thumbv7-unknown-unknown -show-encoding -mattr=-neon < %s 2>&1 | FileCheck %s --check-prefix=VFP --check-prefix=CHECK
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple armv7-unknown-unknown -show-encoding -mattr=+neon < %s 2>&1 | FileCheck %s --check-prefix=NEON --check-prefix=CHECK
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple thumbv7-unknown-unknown -show-encoding -mattr=+neon < %s 2>&1 | FileCheck %s --check-prefix=NEON --check-prefix=CHECK
+
+@ The 32-bit variants of the NEON scalar move instructions are also available
+@ to any core with VFPv2
+
+@ CHECK-DAG: vmov.32 d13[0], r6 @ encoding:
+@ CHECK-DAG: vmov.32 d17[1], r9 @ encoding:
+vmov.32 d13[0], r6
+vmov.32 d17[1], r9
+
+@ VFP-DAG: error: instruction requires: NEON
+@ VFP-DAG: error: instruction requires: NEON
+@ NEON-DAG: vmov.8  d22[5], r2 @ encoding:
+@ NEON-DAG: vmov.16 d3[2], r4 @ encoding:
+vmov.8 d22[5], r2
+vmov.16 d3[2], r4
+
+@ CHECK-DAG: vmov.32 r6, d13[0] @ encoding:
+@ CHECK-DAG: vmov.32 r9, d17[1] @ encoding:
+vmov.32 r6, d13[0]
+vmov.32 r9, d17[1]
+
+@ VFP-DAG: error: instruction requires: NEON
+@ VFP-DAG: error: instruction requires: NEON
+@ NEON-DAG: vmov.s8 r2, d22[5] @ encoding:
+@ NEON-DAG: vmov.u16        r4, d3[2] @ encoding:
+vmov.s8 r2, d22[5]
+vmov.u16 r4, d3[2]
+

diff --git a/test/MC/ARM/symbol-variants.s b/test/MC/ARM/symbol-variants.s
index a10fe50..af1bc07 100644
--- a/test/MC/ARM/symbol-variants.s
+++ b/test/MC/ARM/symbol-variants.s

@@ -19,8 +19,8 @@
 @ plt
 bl f04(PLT)
 bl f05(plt)
-@ARM: 10 R_ARM_PLT32 f04
-@ARM: 14 R_ARM_PLT32 f05
+@ARM: 10 R_ARM_CALL f04
+@ARM: 14 R_ARM_CALL f05
 @THUMB: 10 R_ARM_THM_CALL f04
 @THUMB: 14 R_ARM_THM_CALL f05
 

diff --git a/test/MC/ARM/thumb-diagnostics.s b/test/MC/ARM/thumb-diagnostics.s
index 19d17c2..2a79132 100644
--- a/test/MC/ARM/thumb-diagnostics.s
+++ b/test/MC/ARM/thumb-diagnostics.s

@@ -2,6 +2,8 @@
 @ RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
 @ RUN: not llvm-mc -triple=thumbv5-apple-darwin < %s 2> %t
 @ RUN: FileCheck --check-prefix=CHECK-ERRORS-V5 < %t %s
+@ RUN: not llvm-mc -triple=thumbv7m < %s 2> %t
+@ RUN: FileCheck --check-prefix=CHECK-ERRORS-V7M < %t %s
 @ RUN: not llvm-mc -triple=thumbv8 < %s 2> %t
 @ RUN: FileCheck --check-prefix=CHECK-ERRORS-V8 < %t %s
 
@@ -59,6 +61,13 @@
         ldm r2!, {r2, r3, r4}
         ldm r2!, {r2, r3, r4, r10}
         ldmdb r2!, {r2, r3, r4}
+        ldm r0, {r2, sp}
+        ldmia r0, {r2-r3, sp}
+        ldmia r0!, {r2-r3, sp}
+        ldmfd r2, {r1, r3-r6, sp}
+        ldmfd r2!, {r1, r3-r6, sp}
+        ldmdb r1, {r2, r3, sp}
+        ldmdb r1!, {r2, r3, sp} 
 @ CHECK-ERRORS: error: registers must be in range r0-r7
 @ CHECK-ERRORS:         ldm r2!, {r5, r8}
 @ CHECK-ERRORS:                  ^
@@ -74,6 +83,27 @@
 @ CHECK-ERRORS-V8: error: writeback register not allowed in register list
 @ CHECK-ERRORS-V8:         ldmdb r2!, {r2, r3, r4}
 @ CHECK-ERRORS-V8:                 ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         ldm r0, {r2, sp}
+@ CHECK-ERRORS-V7M:                 ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         ldmia r0, {r2-r3, sp}
+@ CHECK-ERRORS-V7M:                   ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         ldmia r0!, {r2-r3, sp}
+@ CHECK-ERRORS-V7M:                    ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         ldmfd r2, {r1, r3-r6, sp}
+@ CHECK-ERRORS-V7M:                   ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         ldmfd r2!, {r1, r3-r6, sp}
+@ CHECK-ERRORS-V7M:                    ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         ldmdb r1, {r2, r3, sp}
+@ CHECK-ERRORS-V7M:                   ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         ldmdb r1!, {r2, r3, sp}
+@ CHECK-ERRORS-V7M:                    ^
 
 @ Invalid writeback and register lists for PUSH/POP
         pop {r1, r2, r10}
@@ -91,6 +121,10 @@
         stm r1!, {r2, r9}
         stm r2!, {r2, r9}
         stmdb r2!, {r0, r2}
+        stm r1!, {r2, sp}
+        stmia r4!, {r0-r3, sp}
+        stmdb r1, {r2, r3, sp}
+        stmdb r1!, {r2, r3, sp}
 @ CHECK-ERRORS: error: instruction requires: thumb2
 @ CHECK-ERRORS:         stm r1, {r2, r6}
 @ CHECK-ERRORS:         ^
@@ -103,6 +137,18 @@
 @ CHECK-ERRORS-V8: error: writeback register not allowed in register list
 @ CHECK-ERRORS-V8:         stmdb r2!, {r0, r2}
 @ CHECK-ERRORS-V8:                  ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         stm r1!, {r2, sp}
+@ CHECK-ERRORS-V7M:                  ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         stmia r4!, {r0-r3, sp}
+@ CHECK-ERRORS-V7M:                    ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         stmdb r1, {r2, r3, sp}
+@ CHECK-ERRORS-V7M:                   ^
+@ CHECK-ERRORS-V7M: error: SP not allowed in register list
+@ CHECK-ERRORS-V7M:         stmdb r1!, {r2, r3, sp}
+@ CHECK-ERRORS-V7M:                    ^
 
 @ Out of range immediates for LSL instruction.
         lsls r4, r5, #-1
@@ -218,3 +264,14 @@
         ldr r4, [pc, #-12]
 @ CHECK-ERRORS: error: instruction requires: thumb2
 
+@------------------------------------------------------------------------------
+@ STC2{L}/LDC2{L} - requires thumb2
+@------------------------------------------------------------------------------
+        stc2 p0, c8, [r1, #4]
+        stc2l p6, c2, [r7, #4]
+        ldc2 p0, c8, [r1, #4]
+        ldc2l p6, c2, [r7, #4]
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: invalid operand for instruction

diff --git a/test/MC/ARM/thumb-not-mclass.s b/test/MC/ARM/thumb-not-mclass.s
new file mode 100644
index 0000000..fec545e
--- /dev/null
+++ b/test/MC/ARM/thumb-not-mclass.s

@@ -0,0 +1,26 @@
+@ RUN: not llvm-mc -triple=thumbv7m-apple-darwin -show-encoding < %s 2> %t
+@ RUN: FileCheck < %t %s
+@ RUN: not llvm-mc -triple=thumbv6m -show-encoding < %s 2> %t
+@ RUN: FileCheck < %t %s
+  .syntax unified
+  .globl _func
+
+@ Check that the assembler rejects thumb instructions that are not valid
+@ on mclass.
+
+@------------------------------------------------------------------------------
+@ BLX (immediate)
+@------------------------------------------------------------------------------
+        blx _baz
+
+@ CHECK: error: instruction requires: !armv*m
+
+@------------------------------------------------------------------------------
+@ SETEND
+@------------------------------------------------------------------------------
+
+        setend be
+        setend le
+
+@ CHECK: error: invalid operand for instruction
+@ CHECK: error: invalid operand for instruction

diff --git a/test/MC/ARM/thumb2-bxj.s b/test/MC/ARM/thumb2-bxj.s
new file mode 100644
index 0000000..e60d1a4
--- /dev/null
+++ b/test/MC/ARM/thumb2-bxj.s

@@ -0,0 +1,10 @@
+@ RUN: llvm-mc -triple=thumbv6t2--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv7a--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv7r--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: not llvm-mc -triple=thumbv7m--none-eabi -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=UNDEF
+@ RUN: not llvm-mc -triple=thumbv8a--none-eabi -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=UNDEF
+
+        bxj r2
+
+@ CHECK: bxj r2                      @ encoding: [0xc2,0xf3,0x00,0x8f]
+@ UNDEF: error: instruction requires: arm-mode

diff --git a/test/MC/ARM/thumb2-exception-return-mclass.s b/test/MC/ARM/thumb2-exception-return-mclass.s
new file mode 100644
index 0000000..21669b0
--- /dev/null
+++ b/test/MC/ARM/thumb2-exception-return-mclass.s

@@ -0,0 +1,15 @@
+# RUN: not llvm-mc -triple thumbv7m -assemble < %s 2>&1 | FileCheck %s
+
+  .text
+
+# CHECK: instruction requires: !armv*m
+# CHECK-NEXT: srsdb sp, #7
+  srsdb sp, #7
+
+# CHECK: instruction requires: !armv*m
+# CHECK-NEXT: rfeia r6
+  rfeia r6
+
+# CHECK: instruction requires: !armv*m
+# CHECK-NEXT: subs pc, lr, #42
+  subs pc, lr, #42

diff --git a/test/MC/ARM/thumb2-ldrb-ldrh.s b/test/MC/ARM/thumb2-ldrb-ldrh.s
new file mode 100644
index 0000000..8c97987
--- /dev/null
+++ b/test/MC/ARM/thumb2-ldrb-ldrh.s

@@ -0,0 +1,51 @@
+@ RUN: not llvm-mc -triple thumbv7a-none-eabi -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK
+@ RUN: not llvm-mc -triple thumbv7m-none-eabi -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK
+
+@ Thumb2 LDRS?[BH] are not valid when Rt == PC (these encodings are used for
+@ preload hints).
+@ We don't check the actual error messages here as they are currently not very
+@ helpful, see http://llvm.org/bugs/show_bug.cgi?id=21066.
+
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+  ldrb    pc, [r0, #10]
+  ldrb.w  pc, [r1, #10]
+  ldrb    pc, [r2, #-5]
+  ldrb    pc, [pc, #7]
+  ldrb.w  pc, [pc, #7]
+
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+  ldrsb   pc, [r3, #10]
+  ldrsb.w pc, [r4, #10]
+  ldrsb   pc, [r5, #-5]
+  ldrsb   pc, [pc, #7]
+  ldrsb.w pc, [pc, #7]
+
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+  ldrh    pc, [r6, #10]
+  ldrh.w  pc, [r7, #10]
+  ldrh    pc, [r8, #-5]
+  ldrh    pc, [pc, #7]
+  ldrh.w  pc, [pc, #7]
+
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+@ CHECK: error:
+  ldrsh   pc, [r9, #10]
+  ldrsh.w pc, [r10, #10]
+  ldrsh   pc, [r11, #-5]
+  ldrsh   pc, [pc, #7]
+  ldrsh.w pc, [pc, #7]

diff --git a/test/MC/ARM/thumb2-ldrexd-strexd.s b/test/MC/ARM/thumb2-ldrexd-strexd.s
new file mode 100644
index 0000000..3ffb0cb
--- /dev/null
+++ b/test/MC/ARM/thumb2-ldrexd-strexd.s

@@ -0,0 +1,14 @@
+@ RUN: llvm-mc -triple=thumbv6t2--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv7a--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv7r--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv8a--none-eabi -show-encoding < %s | FileCheck %s
+@ RUN: not llvm-mc -triple=thumbv7m--none-eabi -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=UNDEF
+
+  ldrexd r0, r1, [r2]
+  strexd r3, r4, r5, [r6]
+
+@ CHECK: ldrexd r0, r1, [r2]            @ encoding: [0xd2,0xe8,0x7f,0x01]
+@ CHECK: strexd r3, r4, r5, [r6]        @ encoding: [0xc6,0xe8,0x73,0x45]
+
+@ UNDEF: error: instruction requires: !armv*m
+@ UNDEF: error: instruction requires: !armv*m

diff --git a/test/MC/ARM/thumb2-mclass.s b/test/MC/ARM/thumb2-mclass.s
index d9c96df..331ecc1 100644
--- a/test/MC/ARM/thumb2-mclass.s
+++ b/test/MC/ARM/thumb2-mclass.s

@@ -1,7 +1,7 @@
-@ RUN: llvm-mc -triple=thumbv7m-apple-darwin -show-encoding < %s | FileCheck %s
-@ RUN: llvm-mc -triple=thumbv6m -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv6m -show-encoding < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V6M %s
+@ RUN: llvm-mc -triple=thumbv7m -show-encoding < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V7M %s
+
   .syntax unified
-  .globl _func
 
 @ Check that the assembler can handle the documented syntax from the ARM ARM.
 @ These tests test instruction encodings specific to v6m & v7m (FeatureMClass).
@@ -40,20 +40,12 @@
 
         msr  apsr, r0
         msr  apsr_nzcvq, r0
-        msr  apsr_g, r0
-        msr  apsr_nzcvqg, r0
         msr  iapsr, r0
         msr  iapsr_nzcvq, r0
-        msr  iapsr_g, r0
-        msr  iapsr_nzcvqg, r0
         msr  eapsr, r0
         msr  eapsr_nzcvq, r0
-        msr  eapsr_g, r0
-        msr  eapsr_nzcvqg, r0
         msr  xpsr, r0
         msr  xpsr_nzcvq, r0
-        msr  xpsr_g, r0
-        msr  xpsr_nzcvqg, r0
         msr  ipsr, r0
         msr  epsr, r0
         msr  iepsr, r0
@@ -62,22 +54,22 @@
         msr  primask, r0
         msr  control, r0
 
-@ CHECK: msr	apsr, r0                @ encoding: [0x80,0xf3,0x00,0x88]
-@ CHECK: msr	apsr, r0                @ encoding: [0x80,0xf3,0x00,0x88]
-@ CHECK: msr	apsr_g, r0              @ encoding: [0x80,0xf3,0x00,0x84]
-@ CHECK: msr	apsr_nzcvqg, r0         @ encoding: [0x80,0xf3,0x00,0x8c]
-@ CHECK: msr	iapsr, r0               @ encoding: [0x80,0xf3,0x01,0x88]
-@ CHECK: msr	iapsr, r0               @ encoding: [0x80,0xf3,0x01,0x88]
-@ CHECK: msr	iapsr_g, r0             @ encoding: [0x80,0xf3,0x01,0x84]
-@ CHECK: msr	iapsr_nzcvqg, r0        @ encoding: [0x80,0xf3,0x01,0x8c]
-@ CHECK: msr	eapsr, r0               @ encoding: [0x80,0xf3,0x02,0x88]
-@ CHECK: msr	eapsr, r0               @ encoding: [0x80,0xf3,0x02,0x88]
-@ CHECK: msr	eapsr_g, r0             @ encoding: [0x80,0xf3,0x02,0x84]
-@ CHECK: msr	eapsr_nzcvqg, r0        @ encoding: [0x80,0xf3,0x02,0x8c]
-@ CHECK: msr	xpsr, r0                @ encoding: [0x80,0xf3,0x03,0x88]
-@ CHECK: msr	xpsr, r0                @ encoding: [0x80,0xf3,0x03,0x88]
-@ CHECK: msr	xpsr_g, r0              @ encoding: [0x80,0xf3,0x03,0x84]
-@ CHECK: msr	xpsr_nzcvqg, r0         @ encoding: [0x80,0xf3,0x03,0x8c]
+@ CHECK-V6M: msr	apsr, r0                @ encoding: [0x80,0xf3,0x00,0x88]
+@ CHECK-V6M: msr	apsr, r0                @ encoding: [0x80,0xf3,0x00,0x88]
+@ CHECK-V6M: msr	iapsr, r0               @ encoding: [0x80,0xf3,0x01,0x88]
+@ CHECK-V6M: msr	iapsr, r0               @ encoding: [0x80,0xf3,0x01,0x88]
+@ CHECK-V6M: msr	eapsr, r0               @ encoding: [0x80,0xf3,0x02,0x88]
+@ CHECK-V6M: msr	eapsr, r0               @ encoding: [0x80,0xf3,0x02,0x88]
+@ CHECK-V6M: msr	xpsr, r0                @ encoding: [0x80,0xf3,0x03,0x88]
+@ CHECK-V6M: msr	xpsr, r0                @ encoding: [0x80,0xf3,0x03,0x88]
+@ CHECK-V7M: msr	apsr_nzcvq, r0          @ encoding: [0x80,0xf3,0x00,0x88]
+@ CHECK-V7M: msr	apsr_nzcvq, r0          @ encoding: [0x80,0xf3,0x00,0x88]
+@ CHECK-V7M: msr	iapsr_nzcvq, r0         @ encoding: [0x80,0xf3,0x01,0x88]
+@ CHECK-V7M: msr	iapsr_nzcvq, r0         @ encoding: [0x80,0xf3,0x01,0x88]
+@ CHECK-V7M: msr	eapsr_nzcvq, r0         @ encoding: [0x80,0xf3,0x02,0x88]
+@ CHECK-V7M: msr	eapsr_nzcvq, r0         @ encoding: [0x80,0xf3,0x02,0x88]
+@ CHECK-V7M: msr	xpsr_nzcvq, r0          @ encoding: [0x80,0xf3,0x03,0x88]
+@ CHECK-V7M: msr	xpsr_nzcvq, r0          @ encoding: [0x80,0xf3,0x03,0x88]
 @ CHECK: msr	ipsr, r0                @ encoding: [0x80,0xf3,0x05,0x88]
 @ CHECK: msr	epsr, r0                @ encoding: [0x80,0xf3,0x06,0x88]
 @ CHECK: msr	iepsr, r0               @ encoding: [0x80,0xf3,0x07,0x88]

diff --git a/test/MC/ARM/thumb_rewrites.s b/test/MC/ARM/thumb_rewrites.s
new file mode 100644
index 0000000..c9d625e
--- /dev/null
+++ b/test/MC/ARM/thumb_rewrites.s

@@ -0,0 +1,52 @@
+@ RUN: llvm-mc -triple thumbv6m -show-encoding < %s | FileCheck %s
+
+    adds    r0, r0, #8
+@ CHECK: adds   r0, #8              @ encoding: [0x08,0x30]
+
+    adds    r0, r0, r0
+@ CHECK: adds   r0, r0, r0          @ encoding: [0x00,0x18]
+
+    add     r0, r0, r8
+@ CHECK: add    r0, r8              @ encoding: [0x40,0x44]
+
+    add     sp, sp, r0
+@ CHECK: add    sp, r0              @ encoding: [0x85,0x44]
+
+    add     r0, r0, r1
+@ CHECK: add    r0, r1              @ encoding: [0x08,0x44]
+
+    add     r2, r2, r3
+@ CHECK: add    r2, r3              @ encoding: [0x1a,0x44]
+
+    subs    r0, r0, r0
+@ CHECK: subs   r0, r0, r0          @ encoding: [0x00,0x1a]
+
+    ands    r0, r0, r1
+@ CHECK: ands   r0, r1              @ encoding: [0x08,0x40]
+
+    eors    r0, r0, r1
+@ CHECK: eors   r0, r1              @ encoding: [0x48,0x40]
+
+    lsls    r0, r0, r1
+@ CHECK: lsls   r0, r1              @ encoding: [0x88,0x40]
+
+    lsrs    r0, r0, r1
+@ CHECK: lsrs   r0, r1              @ encoding: [0xc8,0x40]
+
+    asrs    r0, r0, r1
+@ CHECK: asrs   r0, r1              @ encoding: [0x08,0x41]
+
+    adcs    r0, r0, r1
+@ CHECK: adcs   r0, r1              @ encoding: [0x48,0x41]
+
+    sbcs    r0, r0, r1
+@ CHECK: sbcs   r0, r1              @ encoding: [0x88,0x41]
+
+    rors    r0, r0, r1
+@ CHECK: rors   r0, r1              @ encoding: [0xc8,0x41]
+
+    orrs    r0, r0, r1
+@ CHECK: orrs   r0, r1              @ encoding: [0x08,0x43]
+
+    bics    r0, r0, r1
+@ CHECK: bics   r0, r1              @ encoding: [0x88,0x43]

diff --git a/test/MC/ARM/thumbv7em.s b/test/MC/ARM/thumbv7em.s
new file mode 100644
index 0000000..53ebff2
--- /dev/null
+++ b/test/MC/ARM/thumbv7em.s

@@ -0,0 +1,53 @@
+@ RUN: llvm-mc -triple=thumbv7em -show-encoding < %s | FileCheck %s
+@ RUN: not llvm-mc -triple=thumbv7m -show-encoding 2>&1 < %s | FileCheck --check-prefix=CHECK-V7M %s
+
+  .syntax unified
+
+@ Check that the assembler can handle the documented syntax from the ARM ARM.
+@ These tests test instruction encodings specific to ARMv7E-M.
+
+@------------------------------------------------------------------------------
+@ MSR
+@------------------------------------------------------------------------------
+
+        msr  apsr_g, r0
+        msr  apsr_nzcvqg, r0
+        msr  iapsr_g, r0
+        msr  iapsr_nzcvqg, r0
+        msr  eapsr_g, r0
+        msr  eapsr_nzcvqg, r0
+        msr  xpsr_g, r0
+        msr  xpsr_nzcvqg, r0
+
+@ CHECK: msr	apsr_g, r0              @ encoding: [0x80,0xf3,0x00,0x84]
+@ CHECK: msr	apsr_nzcvqg, r0         @ encoding: [0x80,0xf3,0x00,0x8c]
+@ CHECK: msr	iapsr_g, r0             @ encoding: [0x80,0xf3,0x01,0x84]
+@ CHECK: msr	iapsr_nzcvqg, r0        @ encoding: [0x80,0xf3,0x01,0x8c]
+@ CHECK: msr	eapsr_g, r0             @ encoding: [0x80,0xf3,0x02,0x84]
+@ CHECK: msr	eapsr_nzcvqg, r0        @ encoding: [0x80,0xf3,0x02,0x8c]
+@ CHECK: msr	xpsr_g, r0              @ encoding: [0x80,0xf3,0x03,0x84]
+@ CHECK: msr	xpsr_nzcvqg, r0         @ encoding: [0x80,0xf3,0x03,0x8c]
+@ CHECK-V7M: error: invalid operand for instruction
+@ CHECK-V7M-NEXT:         msr  apsr_g, r0
+@ CHECK-V7M-NEXT:              ^
+@ CHECK-V7M: error: invalid operand for instruction
+@ CHECK-V7M-NEXT:         msr  apsr_nzcvqg, r0
+@ CHECK-V7M-NEXT:              ^
+@ CHECK-V7M: error: invalid operand for instruction
+@ CHECK-V7M-NEXT:         msr  iapsr_g, r0
+@ CHECK-V7M-NEXT:              ^
+@ CHECK-V7M: error: invalid operand for instruction
+@ CHECK-V7M-NEXT:         msr  iapsr_nzcvqg, r0
+@ CHECK-V7M-NEXT:              ^
+@ CHECK-V7M: error: invalid operand for instruction
+@ CHECK-V7M-NEXT:         msr  eapsr_g, r0
+@ CHECK-V7M-NEXT:              ^
+@ CHECK-V7M: error: invalid operand for instruction
+@ CHECK-V7M-NEXT:         msr  eapsr_nzcvqg, r0
+@ CHECK-V7M-NEXT:              ^
+@ CHECK-V7M: error: invalid operand for instruction
+@ CHECK-V7M-NEXT:         msr  xpsr_g, r0
+@ CHECK-V7M-NEXT:              ^
+@ CHECK-V7M: error: invalid operand for instruction
+@ CHECK-V7M-NEXT:         msr  xpsr_nzcvqg, r0
+@ CHECK-V7M-NEXT:              ^

diff --git a/test/MC/ARM/vfp4.s b/test/MC/ARM/vfp4.s
index 8b1b0e0..1563b5a 100644
--- a/test/MC/ARM/vfp4.s
+++ b/test/MC/ARM/vfp4.s

@@ -6,7 +6,7 @@
 
 @ ARM: vfma.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xe2,0xee]
 @ THUMB: vfma.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xa1,0x0b]
-@ THUMB_V7EM-ERRORS: error: instruction requires: double precision VFP
+@ THUMB_V7EM-ERRORS: error: invalid operand for instruction
 @ THUMB_V7EM-ERRORS-NEXT: vfma.f64 d16, d18, d17
 vfma.f64 d16, d18, d17
 
@@ -17,7 +17,7 @@
 
 @ ARM: vfma.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x42,0xf2]
 @ THUMB: vfma.f32 d16, d18, d17 @ encoding: [0x42,0xef,0xb1,0x0c]
-@ THUMB_V7EM-ERRORS: error: instruction requires: NEON
+@ THUMB_V7EM-ERRORS: error: invalid operand for instruction
 @ THUMB_V7EM-ERRORS-NEXT: vfma.f32 d16, d18, d17
 vfma.f32 d16, d18, d17
 
@@ -29,7 +29,7 @@
 
 @ ARM: vfnma.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xd2,0xee]
 @ THUMB: vfnma.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xe1,0x0b]
-@ THUMB_V7EM-ERRORS: error: instruction requires: double precision VFP
+@ THUMB_V7EM-ERRORS: error: invalid operand for instruction
 @ THUMB_V7EM-ERRORS-NEXT: vfnma.f64 d16, d18, d17
 vfnma.f64 d16, d18, d17
 
@@ -40,7 +40,7 @@
 
 @ ARM: vfms.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xe2,0xee]
 @ THUMB: vfms.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xe1,0x0b]
-@ THUMB_V7EM-ERRORS: error: instruction requires: double precision VFP
+@ THUMB_V7EM-ERRORS: error: invalid operand for instruction
 @ THUMB_V7EM-ERRORS-NEXT: vfms.f64 d16, d18, d17
 vfms.f64 d16, d18, d17
 
@@ -51,7 +51,7 @@
 
 @ ARM: vfms.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x62,0xf2]
 @ THUMB: vfms.f32 d16, d18, d17 @ encoding: [0x62,0xef,0xb1,0x0c]
-@ THUMB_V7EM-ERRORS: error: instruction requires: NEON
+@ THUMB_V7EM-ERRORS: error: invalid operand for instruction
 @ THUMB_V7EM-ERRORS-NEXT: vfms.f32 d16, d18, d17
 vfms.f32 d16, d18, d17
 
@@ -63,7 +63,7 @@
 
 @ ARM: vfnms.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xd2,0xee]
 @ THUMB: vfnms.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xa1,0x0b]
-@ THUMB_V7EM-ERRORS: error: instruction requires: double precision VFP
+@ THUMB_V7EM-ERRORS: error: invalid operand for instruction
 @ THUMB_V7EM-ERRORS-NEXT: vfnms.f64 d16, d18, d17
 vfnms.f64 d16, d18, d17
 

diff --git a/test/MC/ARM/vorr-vbic-illegal-cases.s b/test/MC/ARM/vorr-vbic-illegal-cases.s
index 16ab6b5..673098a 100644
--- a/test/MC/ARM/vorr-vbic-illegal-cases.s
+++ b/test/MC/ARM/vorr-vbic-illegal-cases.s

@@ -1,6 +1,13 @@
 @ RUN: not llvm-mc -triple=armv7-linux-gnueabi %s 2>&1 | FileCheck %s
 .text
 
+        vorr.i32        d2, #0xffffffff
+        vorr.i32        q2, #0xffffffff
+        vorr.i32        d2, #0xabababab
+        vorr.i32        q2, #0xabababab
+        vorr.i16        q2, #0xabab
+        vorr.i16        q2, #0xabab
+
 @ CHECK: error: invalid operand for instruction
 @ CHECK: vorr.i32        d2, #0xffffffff
 @ CHECK: error: invalid operand for instruction
@@ -14,6 +21,13 @@
 @ CHECK: error: invalid operand for instruction
 @ CHECK: vorr.i16        q2, #0xabab
 
+        vbic.i32        d2, #0xffffffff
+        vbic.i32        q2, #0xffffffff
+        vbic.i32        d2, #0xabababab
+        vbic.i32        q2, #0xabababab
+        vbic.i16        d2, #0xabab
+        vbic.i16        q2, #0xabab
+
 @ CHECK: error: invalid operand for instruction
 @ CHECK: vbic.i32        d2, #0xffffffff
 @ CHECK: error: invalid operand for instruction
@@ -27,16 +41,25 @@
 @ CHECK: error: invalid operand for instruction
 @ CHECK: vbic.i16        q2, #0xabab
 
-        vorr.i32        d2, #0xffffffff
-        vorr.i32        q2, #0xffffffff
-        vorr.i32        d2, #0xabababab
-        vorr.i32        q2, #0xabababab
-        vorr.i16        q2, #0xabab
-        vorr.i16        q2, #0xabab
+        vbic.i32        d2, #0x03ffffff
+        vbic.i32        q2, #0x03ffff
+        vbic.i32        d2, #0x03ff
+        vbic.i32        d2, #0xff00ff
+        vbic.i16        d2, #0x03ff
+        vbic.i16        q2, #0xf0f0
+        vbic.i16        q2, #0xf0f0f0
 
-        vbic.i32        d2, #0xffffffff
-        vbic.i32        q2, #0xffffffff
-        vbic.i32        d2, #0xabababab
-        vbic.i32        q2, #0xabababab
-        vbic.i16        d2, #0xabab
-        vbic.i16        q2, #0xabab
+@ CHECK: error: invalid operand for instruction
+@ CHECK: vbic.i32        d2, #0x03ffffff
+@ CHECK: error: invalid operand for instruction
+@ CHECK: vbic.i32        q2, #0x03ffff
+@ CHECK: error: invalid operand for instruction
+@ CHECK: vbic.i32        d2, #0x03ff
+@ CHECK: error: invalid operand for instruction
+@ CHECK: vbic.i32        d2, #0xff00ff
+@ CHECK: error: invalid operand for instruction
+@ CHECK: vbic.i16        d2, #0x03ff
+@ CHECK: error: invalid operand for instruction
+@ CHECK: vbic.i16        q2, #0xf0f0
+@ CHECK: error: invalid operand for instruction
+@ CHECK: vbic.i16        q2, #0xf0f0f0

diff --git a/test/MC/AsmParser/comments-x86-darwin.s b/test/MC/AsmParser/comments-x86-darwin.s
new file mode 100644
index 0000000..e201f48
--- /dev/null
+++ b/test/MC/AsmParser/comments-x86-darwin.s

@@ -0,0 +1,14 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin %s 2>&1 | FileCheck %s
+# ensure that single '#' comments are working as expected on x86 darwin
+.align 3            # test single hash after align
+// CHECK: .align 3
+foo:                # single hash should be ignored as comment
+// CHECK-LABEL: foo:
+    movl %esp, %ebp # same after an instruction
+// CHECK: movl %esp, %ebp
+#   movl %esp, %ebp ## start of the line
+// CHECK-NOT: movl %esp, %ebp
+    # movl %esp, %ebp ## not quite start of the line
+// CHECK-NOT: movl %esp, %ebp
+bar:
+// CHECK-LABEL: bar:

diff --git a/test/MC/AsmParser/directive-warning.s b/test/MC/AsmParser/directive-warning.s
new file mode 100644
index 0000000..311989e
--- /dev/null
+++ b/test/MC/AsmParser/directive-warning.s

@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple i386 %s 2>&1 | FileCheck %s
+
+	.warning
+// CHECK: warning: .warning directive invoked in source file
+// CHECK-NEXT: 	.warning
+// CHECK-NEXT:  ^
+
+	.ifc a,a
+		.warning
+	.endif
+// CHECK: warning: .warning directive invoked in source file
+// CHECK-NEXT:		.warning
+// CHECK-NEXT:          ^
+
+	.ifnc a,a
+		.warning
+	.endif
+// CHECK-NOT: warning: .warning directive invoked in source file
+
+	.warning "here be dragons"
+// CHECK: warning: here be dragons
+
+	.ifc one, two
+		.warning "dragons, i say"
+	.endif
+// CHECK-NOT: warning: dragons, i say

diff --git a/test/MC/AsmParser/macro-exitm.s b/test/MC/AsmParser/macro-exitm.s
new file mode 100644
index 0000000..66a0597
--- /dev/null
+++ b/test/MC/AsmParser/macro-exitm.s

@@ -0,0 +1,64 @@
+// RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+
+// .exitm is encountered in a normal macro expansion
+.macro REP
+.rept 3
+.long 0
+.exitm
+.endr
+.endm
+REP
+// Only the output from the first rept expansion should make it through:
+// CHECK: .long 0
+// CHECK-NOT: .long 0
+
+// .exitm is in a true branch
+.macro A
+.if 1
+.long 1
+.exitm
+.endif
+.long 1
+.endm
+A
+// CHECK: .long 1
+// CHECK-NOT: .long 1
+
+// .exitm is in a false branch
+.macro B
+.if 1
+.long 2
+.else
+.exitm
+.endif
+.long 2
+.endm
+B
+// CHECK: .long 2
+// CHECK: .long 2
+
+
+// .exitm is in a false branch that is encountered prior to the true branch
+.macro C
+.if 0
+.exitm
+.else
+.long 3
+.endif
+.long 3
+.endm
+C
+// CHECK: .long 3
+// CHECK: .long 3
+
+// .exitm is in a macro that's expanded in a conditional block.
+.macro D
+.long 4
+.exitm
+.long 4
+.endm
+.if 1
+D
+.endif
+// CHECK: .long 4
+// CHECK-NOT: .long 4

diff --git a/test/MC/AsmParser/macros-darwin-vararg.s b/test/MC/AsmParser/macros-darwin-vararg.s
index a650c08..4aa2f4c 100644
--- a/test/MC/AsmParser/macros-darwin-vararg.s
+++ b/test/MC/AsmParser/macros-darwin-vararg.s

@@ -1,8 +1,90 @@
-// RUN: not llvm-mc -triple i386-apple-darwin10 %s 2>&1 | FileCheck %s
+// RUN: llvm-mc -triple i386-apple-darwin10 %s 2>&1 | FileCheck %s
 
-// CHECK: error: vararg is not a valid parameter qualifier for 'arg' in macro 'abc'
-// CHECK: .macro abc arg:vararg
-
-.macro abc arg:vararg
-    \arg
+.macro abc a b:vararg
+.globl "\a, \b"
 .endm
+
+// CHECK: .globl "zed0, zed1, zed2"
+abc zed0, zed1, zed2
+
+.purgem abc
+
+.macro ifcc arg:vararg
+.if cc
+            \arg
+.endif
+.endm
+
+.macro ifcc2 arg0 arg1:vararg
+.if cc
+            movl \arg0, \arg1
+.endif
+.endm
+
+.macro ifcc3 arg0, arg1:vararg
+.if cc
+            movl \arg0, \arg1
+.endif
+.endm
+
+.macro ifcc4 arg0, arg1:vararg
+.if cc
+            movl \arg1, \arg0
+.endif
+.endm
+
+.text
+
+// CHECK: movl %esp, %ebp
+// CHECK: subl $0, %esp
+// CHECK: movl %eax, %ebx
+// CHECK: movl %ecx, %ebx
+// CHECK: movl %ecx, %eax
+// CHECK: movl %eax, %ecx
+// CHECK: movl %ecx, %eax
+// CHECK: movl %eax, %ecx
+.set cc,1
+  ifcc  movl    %esp, %ebp
+        subl $0, %esp
+
+  ifcc2 %eax, %ebx
+  ifcc2 %ecx, %ebx
+  ifcc3 %ecx, %eax
+  ifcc3 %eax, %ecx
+  ifcc4 %eax, %ecx  ## test
+  ifcc4 %ecx, %eax ## test
+
+// CHECK-NOT: movl
+// CHECK: subl $1, %esp
+.set cc,0
+  ifcc  movl,    %esp, %ebp
+        subl $1, %esp
+
+.macro abc arg:vararg=nop
+  \arg
+.endm
+
+.macro abcd arg0=%eax, arg1:vararg=%ebx
+  movl \arg0, \arg1
+.endm
+
+.text
+
+// CHECK: nop
+  abc
+// CHECK: movl %eax, %ebx
+  abcd ,
+
+.macro .make_macro start, end, name, body:vararg
+\start \name
+\body
+\end
+.endmacro
+
+.make_macro .macro,.endmacro,.mybyte,.byte $0, $2, $1
+
+.data
+// CHECK: .byte 10
+// CHECK: .byte 12
+// CHECK: .byte 11
+.mybyte 10,11,12

diff --git a/test/MC/COFF/alias.s b/test/MC/COFF/alias.s
index eb5398a..2293d43 100644
--- a/test/MC/COFF/alias.s
+++ b/test/MC/COFF/alias.s

@@ -46,9 +46,9 @@
 // CHECK-NEXT:     AuxSymbolCount: 0
 // CHECK-NEXT:   }
 // CHECK-NEXT:   Symbol {
-// CHECK-NEXT:     Name: global_aliased_to_external
+// CHECK:          Name: global_aliased_to_external
 // CHECK-NEXT:     Value: 0
-// CHECK-NEXT:     Section:  (0)
+// CHECK-NEXT:     Section: IMAGE_SYM_UNDEFINED (0)
 // CHECK-NEXT:     BaseType: Null (0x0)
 // CHECK-NEXT:     ComplexType: Null (0x0)
 // CHECK-NEXT:     StorageClass: External (0x2)
@@ -57,7 +57,7 @@
 // CHECK-NEXT:   Symbol {
 // CHECK-NEXT:     Name: external1
 // CHECK-NEXT:     Value: 0
-// CHECK-NEXT:     Section:  (0)
+// CHECK-NEXT:     Section: IMAGE_SYM_UNDEFINED (0)
 // CHECK-NEXT:     BaseType: Null (0x0)
 // CHECK-NEXT:     ComplexType: Null (0x0)
 // CHECK-NEXT:     StorageClass: External (0x2)
@@ -84,21 +84,20 @@
 // CHECK-NEXT:   Symbol {
 // CHECK-NEXT:     Name: weak_aliased_to_external
 // CHECK-NEXT:     Value: 0
-// CHECK-NEXT:     Section:  (0)
+// CHECK-NEXT:     Section: IMAGE_SYM_UNDEFINED (0)
 // CHECK-NEXT:     BaseType: Null (0x0)
 // CHECK-NEXT:     ComplexType: Null (0x0)
 // CHECK-NEXT:     StorageClass: WeakExternal (0x69)
 // CHECK-NEXT:     AuxSymbolCount: 1
 // CHECK-NEXT:     AuxWeakExternal {
-// CHECK-NEXT:       Linked: external2 (13)
+// CHECK-NEXT:       Linked: external2
 // CHECK-NEXT:       Search: Library (0x2)
-// CHECK-NEXT:       Unused: (00 00 00 00 00 00 00 00 00 00)
 // CHECK-NEXT:     }
 // CHECK-NEXT:   }
 // CHECK-NEXT:   Symbol {
 // CHECK-NEXT:     Name: external2
 // CHECK-NEXT:     Value: 0
-// CHECK-NEXT:     Section:  (0)
+// CHECK-NEXT:     Section: IMAGE_SYM_UNDEFINED (0)
 // CHECK-NEXT:     BaseType: Null (0x0)
 // CHECK-NEXT:     ComplexType: Null (0x0)
 // CHECK-NEXT:     StorageClass: External (0x2)

diff --git a/test/MC/COFF/basic-coff-64.s b/test/MC/COFF/basic-coff-64.s
index 38a9e57..62e4eb9 100644
--- a/test/MC/COFF/basic-coff-64.s
+++ b/test/MC/COFF/basic-coff-64.s

@@ -113,7 +113,6 @@
 // CHECK:       Checksum: 0x0
 // CHECK:       Number: [[DataNum]]
 // CHECK:       Selection: 0x0
-// CHECK:       Unused: (00 00 00)
 // CHECK:     }
 // CHECK:   }
 // CHECK:   Symbol {
@@ -128,7 +127,7 @@
 // CHECK:   Symbol {
 // CHECK:     Name:           _printf
 // CHECK:     Value:          0
-// CHECK:     Section:        (0)
+// CHECK:     Section:        IMAGE_SYM_UNDEFINED (0)
 // CHECK:     BaseType:       Null
 // CHECK:     ComplexType:    Null
 // CHECK:     StorageClass:   External

diff --git a/test/MC/COFF/basic-coff.s b/test/MC/COFF/basic-coff.s
index 38bfa6d..549825a 100644
--- a/test/MC/COFF/basic-coff.s
+++ b/test/MC/COFF/basic-coff.s

@@ -113,7 +113,6 @@
 // CHECK:       Checksum: 0x0
 // CHECK:       Number: 2
 // CHECK:       Selection: 0x0
-// CHECK:       Unused: (00 00 00)
 // CHECK:     }
 // CHECK:   }
 // CHECK:   Symbol {
@@ -128,7 +127,7 @@
 // CHECK:   Symbol {
 // CHECK:     Name:           _printf
 // CHECK:     Value:          0
-// CHECK:     Section:        (0)
+// CHECK:     Section:        IMAGE_SYM_UNDEFINED (0)
 // CHECK:     BaseType:       Null
 // CHECK:     ComplexType:    Null
 // CHECK:     StorageClass:   External

diff --git a/test/MC/COFF/bigobj.py b/test/MC/COFF/bigobj.py
new file mode 100644
index 0000000..2d61073
--- /dev/null
+++ b/test/MC/COFF/bigobj.py

@@ -0,0 +1,26 @@
+# RUN: python %s | llvm-mc -filetype=obj -triple i686-pc-win32 - | llvm-readobj -h | FileCheck %s
+
+# This test checks that the COFF object emitter can produce objects with
+# more than 65279 sections.
+
+# While we only generate 65277 sections, an implicit .text, .data and .bss will
+# also be emitted.  This brings the total to 65280.
+num_sections = 65277
+
+# CHECK:      ImageFileHeader {
+# CHECK-NEXT:   Machine: IMAGE_FILE_MACHINE_I386
+# CHECK-NEXT:   SectionCount: 65280
+# CHECK-NEXT:   TimeDateStamp: {{[0-9]+}}
+# CHECK-NEXT:   PointerToSymbolTable: 0x{{[0-9A-F]+}}
+# CHECK-NEXT:   SymbolCount: 195837
+# CHECK-NEXT:   OptionalHeaderSize: 0
+# CHECK-NEXT:   Characteristics [ (0x0)
+# CHECK-NEXT:   ]
+# CHECK-NEXT: }
+
+for i in range(0, num_sections):
+	print("""	.section	.bss,"bw",discard,_b%d
+	.globl	_b%d                     # @b%d
+_b%d:
+	.byte	0                       # 0x0
+""" % (i, i, i, i))

diff --git a/test/MC/COFF/bss_section.ll b/test/MC/COFF/bss_section.ll
index 477b3df..1921eeb 100644
--- a/test/MC/COFF/bss_section.ll
+++ b/test/MC/COFF/bss_section.ll

@@ -6,4 +6,4 @@
 ; CHECK: .bss
 
 @thingy_linkonce = linkonce_odr global %struct.foo zeroinitializer, align 4
-; CHECK: .section .bss,"bw",discard,_thingy_linkonce
+; CHECK: .section .bss,"wb",discard,_thingy_linkonce

diff --git a/test/MC/COFF/comm-align.s b/test/MC/COFF/comm-align.s
new file mode 100644
index 0000000..ca6bfbe
--- /dev/null
+++ b/test/MC/COFF/comm-align.s

@@ -0,0 +1,57 @@
+# RUN: llvm-mc -triple i686-windows-gnu -filetype obj -o - %s \
+# RUN:    | llvm-readobj -coff-directives -symbols | FileCheck %s
+
+# NOTE: this test checks multiple things:
+# - that -aligncomm is not emitted for 1-byte alignment
+# - that -aligncomm is emitted for the various alignments (greater than 1)
+# - that the alignment is represented as a log_2 of the alignment
+# - that the section switching occurs correctly
+# - that functions after the switch also are emitted into the correct section
+
+	.text
+
+	.def _a
+		.scl 3
+		.type 32
+	.endef
+_a:
+	ret
+
+	.data
+
+	.comm _s_1,4,0                  # @s_1
+	.comm _s_2,4,1                  # @s_2
+	.comm _s_4,4,2                  # @s_4
+	.comm _s_8,4,3                  # @s_8
+
+	.comm _small_but_overaligned,1,3                  # @small_but_overaligned
+
+	.text
+
+	.def _b
+		.scl 3
+		.type 32
+	.endef
+_b:
+	ret
+
+# CHECK-NOT: -aligncomm:"_s_1",0
+
+# CHECK: Symbols [
+# CHECK:   Symbol {
+# CHECK:     Name: _a
+# CHECK:     Section: .text (1)
+# CHECK:   }
+# CHECK:   Symbol {
+# CHECK:     Name: _small_but_overaligned
+# CHECK-NEXT:Value: 1
+# CHECK-NEXT:Section: IMAGE_SYM_UNDEFINED (0)
+# CHECK:   }
+# CHECK:   Symbol {
+# CHECK:     Name: _b
+# CHECK:     Section: .text (1)
+# CHECK:   }
+# CHECK: ]
+
+# CHECK: Directive(s): -aligncomm:"_s_2",1 -aligncomm:"_s_4",2 -aligncomm:"_s_8",3 -aligncomm:"_small_but_overaligned",3
+

diff --git a/test/MC/COFF/comm.ll b/test/MC/COFF/comm.ll
index 6fe122e..74da557 100644
--- a/test/MC/COFF/comm.ll
+++ b/test/MC/COFF/comm.ll

@@ -9,5 +9,5 @@
 ; CHECK: .lcomm	_a,1
 ; CHECK: .lcomm	_b,8,8
 ; .comm uses log2 alignment
-; CHECK: .comm	_c,1
-; CHECK: .comm	_d,8
+; CHECK: .comm	_c,1,0
+; CHECK: .comm	_d,8,3

diff --git a/test/MC/COFF/comm.s b/test/MC/COFF/comm.s
index 37db75f..773ebde 100644
--- a/test/MC/COFF/comm.s
+++ b/test/MC/COFF/comm.s

@@ -1,7 +1,9 @@
 // RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | llvm-readobj -t | FileCheck %s
 
 .lcomm _a,4,4
-.comm	_b, 4
+.comm	_b, 4, 2
+// _c has size 1 but align 32, the value field is the max of size and align.
+.comm	_c, 1, 5
 
 
 // CHECK:       Symbol {
@@ -17,7 +19,17 @@
 // CHECK:       Symbol {
 // CHECK:         Name: _b
 // CHECK-NEXT:    Value: 4
-// CHECK-NEXT:    Section:  (0)
+// CHECK-NEXT:    Section:  IMAGE_SYM_UNDEFINED (0)
+// CHECK-NEXT:    BaseType: Null
+// CHECK-NEXT:    ComplexType: Null
+// CHECK-NEXT:    StorageClass: External
+// CHECK-NEXT:    AuxSymbolCount: 0
+// CHECK-NEXT:  }
+
+// CHECK:       Symbol {
+// CHECK:         Name: _c
+// CHECK-NEXT:    Value: 32
+// CHECK-NEXT:    Section:  IMAGE_SYM_UNDEFINED (0)
 // CHECK-NEXT:    BaseType: Null
 // CHECK-NEXT:    ComplexType: Null
 // CHECK-NEXT:    StorageClass: External

diff --git a/test/MC/COFF/const-gv-with-rel-init.ll b/test/MC/COFF/const-gv-with-rel-init.ll
new file mode 100644
index 0000000..7d3c5f6
--- /dev/null
+++ b/test/MC/COFF/const-gv-with-rel-init.ll

@@ -0,0 +1,11 @@
+; RUN: llc -mtriple x86_64-pc-windows-msvc < %s | FileCheck %s
+
+define void @f() {
+  ret void
+}
+
+@ptr = constant void ()* @f, section ".CRT$XLB", align 8
+; CHECK:  .section  .CRT$XLB,"rd"
+
+@weak_array = weak_odr unnamed_addr constant [1 x i8*] [i8* bitcast (void ()* @f to i8*)]
+; CHECK:  .section  .rdata,"rd",discard,weak_array

diff --git a/test/MC/COFF/feat00.s b/test/MC/COFF/feat00.s
index bfd47ad..f671ebe 100644
--- a/test/MC/COFF/feat00.s
+++ b/test/MC/COFF/feat00.s

@@ -6,7 +6,7 @@
 // CHECK: Symbol {
 // CHECK:   Name: @feat.00
 // CHECK:   Value: 123
-// CHECK:   Section: (65535)
+// CHECK:   Section: IMAGE_SYM_ABSOLUTE (-1)
 // CHECK:   BaseType: Null (0x0)
 // CHECK:   ComplexType: Null (0x0)
 // CHECK:   StorageClass: External (0x2)

diff --git a/test/MC/COFF/file.s b/test/MC/COFF/file.s
index 132e82b..a18a1f4 100644
--- a/test/MC/COFF/file.s
+++ b/test/MC/COFF/file.s

@@ -21,7 +21,7 @@
 // CHECK-SCN: Symbols [
 // CHECK-SCN:   Symbol {
 // CHECK-SCN:     Name: .file
-// CHECK-SCN:     Section: (65534)
+// CHECK-SCN:     Section: IMAGE_SYM_DEBUG (-2)
 // CHECK-SCN:     StorageClass: File
 // CHECK-SCN:     AuxFileRecord {
 // CHECK-SCN:       FileName: null-padded.asm
@@ -29,7 +29,7 @@
 // CHECK-SCN:   }
 // CHECK-SCN:   Symbol {
 // CHECK-SCN:     Name: .file
-// CHECK-SCN:     Section: (65534)
+// CHECK-SCN:     Section: IMAGE_SYM_DEBUG (-2)
 // CHECK-SCN:     StorageClass: File
 // CHECK-SCN:     AuxFileRecord {
 // CHECK-SCN:       FileName: eighteen-chars.asm
@@ -37,7 +37,7 @@
 // CHECK-SCN:   }
 // CHECK-SCN:   Symbol {
 // CHECK-SCN:     Name: .file
-// CHECK-SCN:     Section: (65534)
+// CHECK-SCN:     Section: IMAGE_SYM_DEBUG (-2)
 // CHECK-SCN:     StorageClass: File
 // CHECK-SCN:     AuxFileRecord {
 // CHECK-SCN:       FileName: multiple-auxiliary-entries.asm

diff --git a/test/MC/COFF/ir-to-imgrel.ll b/test/MC/COFF/ir-to-imgrel.ll
index 39884d2..dfc88b2 100644
--- a/test/MC/COFF/ir-to-imgrel.ll
+++ b/test/MC/COFF/ir-to-imgrel.ll

@@ -2,5 +2,5 @@
 
 @__ImageBase = external global i8
 
-; X64: .quad   "?x@@3HA"@IMGREL32
+; X64: .quad   "?x@@3HA"@IMGREL
 @"\01?x@@3HA" = global i64 sub nsw (i64 ptrtoint (i64* @"\01?x@@3HA" to i64), i64 ptrtoint (i8* @__ImageBase to i64)), align 8

diff --git a/test/MC/COFF/linker-options.ll b/test/MC/COFF/linker-options.ll
index 0be74e5..60baccf 100755
--- a/test/MC/COFF/linker-options.ll
+++ b/test/MC/COFF/linker-options.ll

@@ -14,7 +14,7 @@
   ret void
 }
 
-; CHECK: .section        .drectve,"r"
+; CHECK: .section        .drectve,"yn"
 ; CHECK: .ascii   " /DEFAULTLIB:msvcrt.lib"
 ; CHECK: .ascii   " /DEFAULTLIB:msvcrt.lib"
 ; CHECK: .ascii   " /DEFAULTLIB:secur32.lib"

diff --git a/test/MC/COFF/lset0.s b/test/MC/COFF/lset0.s
index 7321b01..f4a13bf 100755
--- a/test/MC/COFF/lset0.s
+++ b/test/MC/COFF/lset0.s

@@ -1,12 +1,11 @@
-// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - | llvm-nm - | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - | llvm-nm - | FileCheck %s --check-prefix=GLOBAL
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - | llvm-nm - | FileCheck %s --check-prefix=LOCAL
 
 not_global = 123
 global = 456
 .globl global
-.Llocal = 789
+Llocal = 789
 
-// CHECK-NOT: not_global
-// CHECK-NOT: Llocal
-// CHECK: global
-// CHECK-NOT: not_global
-// CHECK-NOT: Llocal
+// LOCAL-NOT: local
+// GLOBAL: A global
+// GLOBAL: a not_global

diff --git a/test/MC/COFF/secidx.s b/test/MC/COFF/secidx.s
index 619d777..022804d 100644
--- a/test/MC/COFF/secidx.s
+++ b/test/MC/COFF/secidx.s

@@ -4,7 +4,9 @@
 
 Lfoo:
 	.secidx	Lfoo
+	.short  0
 	.secidx	Lbar
+	.short  0
 
 .section spam
 Lbar:

diff --git a/test/MC/COFF/section-invalid-flags.s b/test/MC/COFF/section-invalid-flags.s
index 17b1550..9cdceaf 100644
--- a/test/MC/COFF/section-invalid-flags.s
+++ b/test/MC/COFF/section-invalid-flags.s

@@ -6,3 +6,6 @@
 
 // CHECK: error: conflicting section flags 'b' and 'd'
 .section s_bd,"bd"; .long 1
+
+// CHECK: error: expected comdat type such as 'discard' or 'largest' after protection bits
+.section .stack, "w", @nobits

diff --git a/test/MC/COFF/section-name-encoding.s b/test/MC/COFF/section-name-encoding.s
index 7edd6d7..73ab4bd 100644
--- a/test/MC/COFF/section-name-encoding.s
+++ b/test/MC/COFF/section-name-encoding.s

@@ -21,14 +21,19 @@
 .section s1234567; .long 1
 
 
+// Note: the names in the string table will be sorted in reverse
+// lexicographical order. Use a suffix letter (z, y, x, ...) to
+// get the preferred ordering of names in the test.
+
 // Base 10 encoding
+// Ending in z should put the name first in the string table.
 
 // /4
 // CHECK:   Section {
 // CHECK:     Number: 6
-// CHECK:     Name: s12345678 (2F 34 00 00 00 00 00 00)
+// CHECK:     Name: s1234567z (2F 34 00 00 00 00 00 00)
 // CHECK:   }
-.section s12345678; .long 1
+.section s1234567z; .long 1
 
 
 // Generate padding sections to increase the string table size to at least
@@ -47,20 +52,20 @@
   pad_sections2 \pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad\pad
 .endm
 
-// 1000x 'a'
-pad_sections aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+// 1000x 'y'
+pad_sections yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy
 
 
 // /1000029 == 4 + 10 + (5 * (2 + (20 * 10 * 1000) + 1))
 //             v   |     |    v    ~~~~~~~~~~~~~~    v
 //    table size   v     v   "p0"        pad         NUL separator
-//     "s12345678\0"     # of pad sections
+//     "s1234567z\0"     # of pad sections
 //
 // CHECK:   Section {
 // CHECK:     Number: 12
-// CHECK:     Name: seven_digit (2F 31 30 30 30 30 32 39)
+// CHECK:     Name: sevendigitx (2F 31 30 30 30 30 32 39)
 // CHECK:   }
-.section seven_digit; .long 1
+.section sevendigitx; .long 1
 
 
 // Generate padding sections to increase the string table size to at least
@@ -71,18 +76,18 @@
 .endm
 
 // 1000x 'a'
-pad_sections_ex aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+pad_sections_ex wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww
 
 
 // //AAmJa4 == 1000029 + 12 + (5 * (2 + (9 * 20 * 10 * 1000) + 1)) == 38*64^3 + 9*64^2 + 26*64 + 56
 //             v         |     |    v    ~~~~~~~~~~~~~~~~~~    v
 // seven_digit offset    v     v   "p0"         pad            NUL separator
-//         "seven_digit\0"     # of pad sections
+//         "sevendigitx\0"     # of pad sections
 //
 // "2F 2F 41 41 6D 4A 61 34" is "//AAmJa4", which decodes to "0 0 38 9 26 56".
 //
 // CHECK:   Section {
 // CHECK:     Number: 18
-// CHECK:     Name: double_slash (2F 2F 41 41 6D 4A 61 34)
+// CHECK:     Name: doubleslashv (2F 2F 41 41 6D 4A 61 34)
 // CHECK:   }
-.section double_slash; .long 1
+.section doubleslashv; .long 1

diff --git a/test/MC/COFF/section-passthru-flags.s b/test/MC/COFF/section-passthru-flags.s
new file mode 100644
index 0000000..3bd061b
--- /dev/null
+++ b/test/MC/COFF/section-passthru-flags.s

@@ -0,0 +1,7 @@
+// RUN: llvm-mc -triple i386-pc-win32 < %s | FileCheck %s
+.section .klaatu,"wn"
+// CHECK: .section .klaatu,"wn"
+.section .barada,"y"
+// CHECK: .section .barada,"y"
+.section .nikto,"wds"
+// CHECK: .section .nikto,"wds"

diff --git a/test/MC/COFF/seh-linkonce.s b/test/MC/COFF/seh-linkonce.s
new file mode 100644
index 0000000..5631b74
--- /dev/null
+++ b/test/MC/COFF/seh-linkonce.s

@@ -0,0 +1,85 @@
+// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | llvm-readobj -sections -section-symbols | FileCheck %s
+
+        .text
+        .def     weak_func;
+        .scl    2;
+        .type   32;
+        .endef
+        .section        .text,"xr",discard,weak_func
+        .globl  weak_func
+        .align  16, 0x90
+weak_func:                              # @weak_func
+.Ltmp0:
+.seh_proc weak_func
+# BB#0:                                 # %entry
+        pushq   %rbp
+.Ltmp1:
+        .seh_pushreg 5
+        movq    %rsp, %rbp
+.Ltmp2:
+        .seh_setframe 5, 0
+.Ltmp3:
+        .seh_endprologue
+        xorl    %eax, %eax
+        popq    %rbp
+        retq
+.Leh_func_end0:
+.Ltmp4:
+        .seh_endproc
+
+// CHECK: Sections [
+// CHECK:   Section {
+// CHECK:     Name: .text
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Name: .data
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Name: .bss
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: [[TEXT_SECNUM:[0-9]+]]
+// CHECK:     Name: .text
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: [[XDATA_SECNUM:[0-9]+]]
+// CHECK:     Name: .xdata
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:     Symbols [
+// CHECK:       Symbol {
+// CHECK:         Name: .xdata
+// CHECK:         Section: .xdata ([[XDATA_SECNUM]])
+// CHECK:         StorageClass: Static (0x3)
+// CHECK:         AuxSymbolCount: 1
+// CHECK:         AuxSectionDef {
+// CHECK:           Selection: Associative (0x5)
+// CHECK:           AssocSection: .text ([[TEXT_SECNUM]])
+// CHECK:         }
+// CHECK:       }
+// CHECK:     ]
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: [[PDATA_SECNUM:[0-9]+]]
+// CHECK:     Name: .pdata
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:     Symbols [
+// CHECK:       Symbol {
+// CHECK:         Name: .pdata
+// CHECK:         Section: .pdata ([[PDATA_SECNUM]])
+// CHECK:         StorageClass: Static (0x3)
+// CHECK:         AuxSymbolCount: 1
+// CHECK:         AuxSectionDef {
+// CHECK:           Selection: Associative (0x5)
+// CHECK:           AssocSection: .text ([[TEXT_SECNUM]])
+// CHECK:         }
+// CHECK:       }
+// CHECK:     ]
+// CHECK:   }
+// CHECK: ]

diff --git a/test/MC/COFF/simple-fixups.s b/test/MC/COFF/simple-fixups.s
index 2a74f21..cb5d764 100644
--- a/test/MC/COFF/simple-fixups.s
+++ b/test/MC/COFF/simple-fixups.s

@@ -1,5 +1,6 @@
-// The purpose of this test is to verify that we do not produce unneeded
-// relocations when symbols are in the same section and we know their offset.
+// The purpose of this test is to verify that we produce relocations for
+// references to functions.  Failing to do so might cause pointer-to-function
+// equality to fail if /INCREMENTAL links are used.
 
 // RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | llvm-readobj -s | FileCheck %s
 // RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | llvm-readobj -s | FileCheck %s
@@ -46,4 +47,4 @@
 	ret
 
 // CHECK:     Sections [
-// CHECK-NOT: RelocationCount: {{[^0]}}
+// CHECK: RelocationCount: 1

diff --git a/test/MC/COFF/symbol-fragment-offset-64.s b/test/MC/COFF/symbol-fragment-offset-64.s
index deac888..05b46bb 100644
--- a/test/MC/COFF/symbol-fragment-offset-64.s
+++ b/test/MC/COFF/symbol-fragment-offset-64.s

@@ -117,7 +117,6 @@
 // CHECK:       Checksum: 0x0
 // CHECK:       Number: 1
 // CHECK:       Selection: 0x0
-// CHECK:       Unused: (00 00 00)
 // CHECK:     }
 // CHECK:   }
 // CHECK:   Symbol {
@@ -135,7 +134,6 @@
 // CHECK:       Checksum: 0x0
 // CHECK:       Number: 2
 // CHECK:       Selection: 0x0
-// CHECK:       Unused: (00 00 00)
 // CHECK:     }
 // CHECK:   }
 // CHECK:   Symbol {
@@ -150,7 +148,7 @@
 // CHECK:   Symbol {
 // CHECK:     Name:                      _printf
 // CHECK:     Value:                     0
-// CHECK:     Section:                   (0)
+// CHECK:     Section:                   IMAGE_SYM_UNDEFINED (0)
 // CHECK:     BaseType:                  Null
 // CHECK:     ComplexType:               Null
 // CHECK:     StorageClass:              External
@@ -159,7 +157,7 @@
 // CHECK:   Symbol {
 // CHECK:     Name:                      _puts
 // CHECK:     Value:                     0
-// CHECK:     Section:                   (0)
+// CHECK:     Section:                   IMAGE_SYM_UNDEFINED (0)
 // CHECK:     BaseType:                  Null
 // CHECK:     ComplexType:               Null
 // CHECK:     StorageClass:              External

diff --git a/test/MC/COFF/symbol-fragment-offset.s b/test/MC/COFF/symbol-fragment-offset.s
index b09c5af..cc5040a 100644
--- a/test/MC/COFF/symbol-fragment-offset.s
+++ b/test/MC/COFF/symbol-fragment-offset.s

@@ -117,7 +117,6 @@
 // CHECK:       Checksum: 0x0
 // CHECK:       Number: 1
 // CHECK:       Selection: 0x0
-// CHECK:       Unused: (00 00 00)
 // CHECK:     }
 // CHECK:   }
 // CHECK:   Symbol {
@@ -135,7 +134,6 @@
 // CHECK:       Checksum: 0x0
 // CHECK:       Number: 2
 // CHECK:       Selection: 0x0
-// CHECK:       Unused: (00 00 00)
 // CHECK:     }
 // CHECK:   }
 // CHECK:   Symbol {
@@ -150,7 +148,7 @@
 // CHECK:   Symbol {
 // CHECK:     Name:                      _printf
 // CHECK:     Value:                     0
-// CHECK:     Section:                   (0)
+// CHECK:     Section:                   IMAGE_SYM_UNDEFINED (0)
 // CHECK:     BaseType:                  Null
 // CHECK:     ComplexType:               Null
 // CHECK:     StorageClass:              External
@@ -159,7 +157,7 @@
 // CHECK:   Symbol {
 // CHECK:     Name:                      _puts
 // CHECK:     Value:                     0
-// CHECK:     Section:                   (0)
+// CHECK:     Section:                   IMAGE_SYM_UNDEFINED (0)
 // CHECK:     BaseType:                  Null
 // CHECK:     ComplexType:               Null
 // CHECK:     StorageClass:              External

diff --git a/test/MC/COFF/weak.s b/test/MC/COFF/weak.s
index accd3f4..6086749 100644
--- a/test/MC/COFF/weak.s
+++ b/test/MC/COFF/weak.s

@@ -37,7 +37,7 @@
 // CHECK:      Symbol {
 // CHECK:        Name:           _test_weak
 // CHECK-NEXT:   Value:          0
-// CHECK-NEXT:   Section:        (0)
+// CHECK-NEXT:   Section:        IMAGE_SYM_UNDEFINED (0)
 // CHECK-NEXT:   BaseType:       Null
 // CHECK-NEXT:   ComplexType:    Null
 // CHECK-NEXT:   StorageClass:   WeakExternal
@@ -45,14 +45,13 @@
 // CHECK-NEXT:   AuxWeakExternal {
 // CHECK-NEXT:     Linked: .weak._test_weak.default
 // CHECK-NEXT:      Search: Library
-// CHECK-NEXT:      Unused: (00 00 00 00 00 00 00 00 00 00)
 // CHECK-NEXT:   }
 // CHECK-NEXT: }
 
 // CHECK:      Symbol {
 // CHECK:        Name:                .weak._test_weak.default
 // CHECK-NEXT:   Value:               0
-// CHECK-NEXT:   Section:             (65535)
+// CHECK-NEXT:   Section:             IMAGE_SYM_ABSOLUTE (-1)
 // CHECK-NEXT:   BaseType:            Null
 // CHECK-NEXT:   ComplexType:         Null
 // CHECK-NEXT:   StorageClass:        External
@@ -62,7 +61,7 @@
 // CHECK:      Symbol {
 // CHECK:        Name:           _test_weak_alias
 // CHECK-NEXT:   Value:          0
-// CHECK-NEXT:   Section:        (0)
+// CHECK-NEXT:   Section:        IMAGE_SYM_UNDEFINED (0)
 // CHECK-NEXT:   BaseType:       Null
 // CHECK-NEXT:   ComplexType:    Null
 // CHECK-NEXT:   StorageClass:   WeakExternal
@@ -70,6 +69,5 @@
 // CHECK-NEXT:   AuxWeakExternal {
 // CHECK-NEXT:     Linked: _main
 // CHECK-NEXT:      Search: Library
-// CHECK-NEXT:      Unused: (00 00 00 00 00 00 00 00 00 00)
 // CHECK-NEXT:   }
 // CHECK-NEXT: }

diff --git a/test/MC/Disassembler/ARM/arm-tests.txt b/test/MC/Disassembler/ARM/arm-tests.txt
index acc2d9f..e82f75a 100644
--- a/test/MC/Disassembler/ARM/arm-tests.txt
+++ b/test/MC/Disassembler/ARM/arm-tests.txt

@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 -mcpu=cortex-a9-mp | FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=armv7-apple-darwin9 -mcpu=cortex-a9 | FileCheck %s
 
 # CHECK:	addpl	r4, pc, #318767104
 0x4c 0x45 0x8f 0x52

diff --git a/test/MC/Disassembler/ARM/d16.txt b/test/MC/Disassembler/ARM/d16.txt
new file mode 100644
index 0000000..735af81
--- /dev/null
+++ b/test/MC/Disassembler/ARM/d16.txt

@@ -0,0 +1,23 @@
+# RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -disassemble -mattr=+vfp4,-d16 2>&1 | FileCheck %s --check-prefix=D32
+# RUN: not llvm-mc < %s -triple thumbv7-unknown-unknown -disassemble -mattr=+vfp4,+d16 2>&1 | FileCheck %s --check-prefix=D16
+
+
+# D32: vadd.f64 d1, d2, d16
+# D16: warning: invalid instruction encoding
+[0x32,0xee,0x20,0x1b]
+
+# D32: vadd.f64 d1, d17, d6
+# D16: warning: invalid instruction encoding
+[0x31,0xee,0x86,0x1b]
+
+# D32: vadd.f64 d19, d7, d6
+# D16: warning: invalid instruction encoding
+[0x77,0xee,0x06,0x3b]
+
+# D32: vcvt.f64.f32 d22, s4
+# D16: warning: invalid instruction encoding
+[0xf7,0xee,0xc2,0x6a]
+
+# D32: vcvt.f32.f64 s26, d30
+# D16: warning: invalid instruction encoding
+[0xb7,0xee,0xee,0xdb]

diff --git a/test/MC/Disassembler/ARM/invalid-thumb-MSR-MClass.txt b/test/MC/Disassembler/ARM/invalid-thumb-MSR-MClass.txt
new file mode 100644
index 0000000..26fa907
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-thumb-MSR-MClass.txt

@@ -0,0 +1,35 @@
+# RUN: not llvm-mc -disassemble %s -triple=thumbv7em 2>&1 | FileCheck --check-prefix=CHECK %s
+# RUN: not llvm-mc -disassemble %s -triple=thumbv7m 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V7M %s
+
+#------------------------------------------------------------------------------
+# Undefined encodings for mrs
+#------------------------------------------------------------------------------
+
+# invalid SYSm
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0xef 0xf3 0x80 0x80]
+[0xef 0xf3 0x80 0x80]
+
+#------------------------------------------------------------------------------
+# Undefined encodings for msr
+#------------------------------------------------------------------------------
+
+# invalid mask = '00'
+# CHECK: warning: potentially undefined instruction encoding
+# CHECK-NEXT: [0x80 0xf3 0x00 0x80]
+[0x80 0xf3 0x00 0x80]
+
+# invalid mask = '11' with SYSm not in {0..3}
+# CHECK: warning: potentially undefined instruction encoding
+# CHECK-NEXT: [0x80 0xf3 0x05 0x8c]
+[0x80 0xf3 0x05 0x8c]
+
+# invalid mask = '01' (ThumbV7M does not have the DSP extension)
+# CHECK-V7M: warning: potentially undefined instruction encoding
+# CHECK-V7M-NEXT: [0x80 0xf3 0x00 0x84]
+[0x80 0xf3 0x00 0x84]
+
+# invalid SYSm
+# CHECK: warning: invalid instruction encoding
+# CHECK-NEXT: [0x80 0xf3 0x80 0x88]
+[0x80 0xf3 0x80 0x88]

diff --git a/test/MC/Disassembler/ARM/move-banked-regs-arm.txt b/test/MC/Disassembler/ARM/move-banked-regs-arm.txt
new file mode 100644
index 0000000..dd1d463
--- /dev/null
+++ b/test/MC/Disassembler/ARM/move-banked-regs-arm.txt

@@ -0,0 +1,150 @@
+@ RUN: llvm-mc -disassemble -triple armv7 -mcpu=cyclone %s | FileCheck %s
+
+
+[0x00,0x22,0x20,0xe1]
+[0x00,0x32,0x21,0xe1]
+[0x00,0x52,0x22,0xe1]
+[0x00,0x72,0x23,0xe1]
+[0x00,0xb2,0x24,0xe1]
+[0x00,0x12,0x25,0xe1]
+[0x00,0x22,0x26,0xe1]
+@ CHECK:         mrs     r2, r8_usr
+@ CHECK:         mrs     r3, r9_usr
+@ CHECK:         mrs     r5, r10_usr
+@ CHECK:         mrs     r7, r11_usr
+@ CHECK:         mrs     r11, r12_usr
+@ CHECK:         mrs     r1, sp_usr
+@ CHECK:         mrs     r2, lr_usr
+
+[0x00,0x22,0x28,0xe1]
+[0x00,0x32,0x29,0xe1]
+[0x00,0x52,0x2a,0xe1]
+[0x00,0x72,0x2b,0xe1]
+[0x00,0xb2,0x2c,0xe1]
+[0x00,0x12,0x2d,0xe1]
+[0x00,0x22,0x2e,0xe1]
+[0x00,0x32,0x6e,0xe1]
+@ CHECK:         mrs     r2, r8_fiq
+@ CHECK:         mrs     r3, r9_fiq
+@ CHECK:         mrs     r5, r10_fiq
+@ CHECK:         mrs     r7, r11_fiq
+@ CHECK:         mrs     r11, r12_fiq
+@ CHECK:         mrs     r1, sp_fiq
+@ CHECK:         mrs     r2, lr_fiq
+@ CHECK:         mrs     r3, SPSR_fiq
+
+[0x00,0x43,0x20,0xe1]
+[0x00,0x93,0x21,0xe1]
+[0x00,0x13,0x60,0xe1]
+@ CHECK:         mrs     r4, lr_irq
+@ CHECK:         mrs     r9, sp_irq
+@ CHECK:         mrs     r1, SPSR_irq
+
+[0x00,0x13,0x22,0xe1]
+[0x00,0x33,0x23,0xe1]
+[0x00,0x53,0x62,0xe1]
+@ CHECK:         mrs     r1, lr_svc
+@ CHECK:         mrs     r3, sp_svc
+@ CHECK:         mrs     r5, SPSR_svc
+
+[0x00,0x53,0x24,0xe1]
+[0x00,0x73,0x25,0xe1]
+[0x00,0x93,0x64,0xe1]
+@ CHECK:         mrs     r5, lr_abt
+@ CHECK:         mrs     r7, sp_abt
+@ CHECK:         mrs     r9, SPSR_abt
+
+[0x00,0x93,0x26,0xe1]
+[0x00,0xb3,0x27,0xe1]
+[0x00,0xc3,0x66,0xe1]
+@ CHECK:         mrs     r9, lr_und
+@ CHECK:         mrs     r11, sp_und
+@ CHECK:         mrs     r12, SPSR_und
+
+[0x00,0x23,0x2c,0xe1]
+[0x00,0x43,0x2d,0xe1]
+[0x00,0x63,0x6c,0xe1]
+@ CHECK:         mrs     r2, lr_mon
+@ CHECK:         mrs     r4, sp_mon
+@ CHECK:         mrs     r6, SPSR_mon
+
+[0x00,0x63,0x2e,0xe1]
+[0x00,0x83,0x2f,0xe1]
+[0x00,0xa3,0x6e,0xe1]
+@ CHECK:         mrs     r6, elr_hyp
+@ CHECK:         mrs     r8, sp_hyp
+@ CHECK:         mrs     r10, SPSR_hyp
+
+[0x02,0xf2,0x20,0xe1]
+[0x03,0xf2,0x21,0xe1]
+[0x05,0xf2,0x22,0xe1]
+[0x07,0xf2,0x23,0xe1]
+[0x0b,0xf2,0x24,0xe1]
+[0x01,0xf2,0x25,0xe1]
+[0x02,0xf2,0x26,0xe1]
+@ CHECK:         msr     r8_usr, r2
+@ CHECK:         msr     r9_usr, r3
+@ CHECK:         msr     r10_usr, r5
+@ CHECK:         msr     r11_usr, r7
+@ CHECK:         msr     r12_usr, r11
+@ CHECK:         msr     sp_usr, r1
+@ CHECK:         msr     lr_usr, r2
+
+[0x02,0xf2,0x28,0xe1]
+[0x03,0xf2,0x29,0xe1]
+[0x05,0xf2,0x2a,0xe1]
+[0x07,0xf2,0x2b,0xe1]
+[0x0b,0xf2,0x2c,0xe1]
+[0x01,0xf2,0x2d,0xe1]
+[0x02,0xf2,0x2e,0xe1]
+[0x03,0xf2,0x6e,0xe1]
+@ CHECK:         msr     r8_fiq, r2
+@ CHECK:         msr     r9_fiq, r3
+@ CHECK:         msr     r10_fiq, r5
+@ CHECK:         msr     r11_fiq, r7
+@ CHECK:         msr     r12_fiq, r11
+@ CHECK:         msr     sp_fiq, r1
+@ CHECK:         msr     lr_fiq, r2
+@ CHECK:         msr     SPSR_fiq, r3
+
+[0x04,0xf3,0x20,0xe1]
+[0x09,0xf3,0x21,0xe1]
+[0x0b,0xf3,0x60,0xe1]
+@ CHECK:         msr     lr_irq, r4
+@ CHECK:         msr     sp_irq, r9
+@ CHECK:         msr     SPSR_irq, r11
+
+[0x01,0xf3,0x22,0xe1]
+[0x03,0xf3,0x23,0xe1]
+[0x05,0xf3,0x62,0xe1]
+@ CHECK:         msr     lr_svc, r1
+@ CHECK:         msr     sp_svc, r3
+@ CHECK:         msr     SPSR_svc, r5
+
+[0x05,0xf3,0x24,0xe1]
+[0x07,0xf3,0x25,0xe1]
+[0x09,0xf3,0x64,0xe1]
+@ CHECK:         msr     lr_abt, r5
+@ CHECK:         msr     sp_abt, r7
+@ CHECK:         msr     SPSR_abt, r9
+
+[0x09,0xf3,0x26,0xe1]
+[0x0b,0xf3,0x27,0xe1]
+[0x0c,0xf3,0x66,0xe1]
+@ CHECK:         msr     lr_und, r9
+@ CHECK:         msr     sp_und, r11
+@ CHECK:         msr     SPSR_und, r12
+
+[0x02,0xf3,0x2c,0xe1]
+[0x04,0xf3,0x2d,0xe1]
+[0x06,0xf3,0x6c,0xe1]
+@ CHECK:         msr     lr_mon, r2
+@ CHECK:         msr     sp_mon, r4
+@ CHECK:         msr     SPSR_mon, r6
+
+[0x06,0xf3,0x2e,0xe1]
+[0x08,0xf3,0x2f,0xe1]
+[0x0a,0xf3,0x6e,0xe1]
+@ CHECK:         msr     elr_hyp, r6
+@ CHECK:         msr     sp_hyp, r8
+@ CHECK:         msr     SPSR_hyp, r10

diff --git a/test/MC/Disassembler/ARM/move-banked-regs-thumb.txt b/test/MC/Disassembler/ARM/move-banked-regs-thumb.txt
new file mode 100644
index 0000000..29e91ab
--- /dev/null
+++ b/test/MC/Disassembler/ARM/move-banked-regs-thumb.txt

@@ -0,0 +1,153 @@
+@ RUN: llvm-mc -disassemble -triple thumb -mcpu=cyclone %s | FileCheck %s
+
+[0xe0,0xf3,0x20,0x82]
+[0xe1,0xf3,0x20,0x83]
+[0xe2,0xf3,0x20,0x85]
+[0xe3,0xf3,0x20,0x87]
+[0xe4,0xf3,0x20,0x8b]
+[0xe5,0xf3,0x20,0x81]
+[0xe6,0xf3,0x20,0x82]
+@ CHECK:         mrs     r2, r8_usr
+@ CHECK:         mrs     r3, r9_usr
+@ CHECK:         mrs     r5, r10_usr
+@ CHECK:         mrs     r7, r11_usr
+@ CHECK:         mrs     r11, r12_usr
+@ CHECK:         mrs     r1, sp_usr
+@ CHECK:         mrs     r2, lr_usr
+
+[0xe8,0xf3,0x20,0x82]
+[0xe9,0xf3,0x20,0x83]
+[0xea,0xf3,0x20,0x85]
+[0xeb,0xf3,0x20,0x87]
+[0xec,0xf3,0x20,0x8b]
+[0xed,0xf3,0x20,0x81]
+[0xee,0xf3,0x20,0x82]
+[0xfe,0xf3,0x20,0x83]
+@ CHECK:         mrs     r2, r8_fiq
+@ CHECK:         mrs     r3, r9_fiq
+@ CHECK:         mrs     r5, r10_fiq
+@ CHECK:         mrs     r7, r11_fiq
+@ CHECK:         mrs     r11, r12_fiq
+@ CHECK:         mrs     r1, sp_fiq
+@ CHECK:         mrs     r2, lr_fiq
+@ CHECK:         mrs     r3, SPSR_fiq
+
+[0xe0,0xf3,0x30,0x84]
+[0xe1,0xf3,0x30,0x89]
+[0xf0,0xf3,0x30,0x81]
+@ CHECK:         mrs     r4, lr_irq
+@ CHECK:         mrs     r9, sp_irq
+@ CHECK:         mrs     r1, SPSR_irq
+
+[0xe2,0xf3,0x30,0x81]
+[0xe3,0xf3,0x30,0x83]
+[0xf2,0xf3,0x30,0x85]
+@ CHECK:         mrs     r1, lr_svc
+@ CHECK:         mrs     r3, sp_svc
+@ CHECK:         mrs     r5, SPSR_svc
+
+[0xe4,0xf3,0x30,0x85]
+[0xe5,0xf3,0x30,0x87]
+[0xf4,0xf3,0x30,0x89]
+@ CHECK:         mrs     r5, lr_abt
+@ CHECK:         mrs     r7, sp_abt
+@ CHECK:         mrs     r9, SPSR_abt
+
+[0xe6,0xf3,0x30,0x89]
+[0xe7,0xf3,0x30,0x8b]
+[0xf6,0xf3,0x30,0x8c]
+@ CHECK:         mrs     r9, lr_und
+@ CHECK:         mrs     r11, sp_und
+@ CHECK:         mrs     r12, SPSR_und
+
+
+[0xec,0xf3,0x30,0x82]
+[0xed,0xf3,0x30,0x84]
+[0xfc,0xf3,0x30,0x86]
+@ CHECK:         mrs     r2, lr_mon
+@ CHECK:         mrs     r4, sp_mon
+@ CHECK:         mrs     r6, SPSR_mon
+
+
+[0xee,0xf3,0x30,0x86]
+[0xef,0xf3,0x30,0x88]
+[0xfe,0xf3,0x30,0x8a]
+@ CHECK:         mrs     r6, elr_hyp
+@ CHECK:         mrs     r8, sp_hyp
+@ CHECK:         mrs     r10, SPSR_hyp
+
+
+[0x82,0xf3,0x20,0x80]
+[0x83,0xf3,0x20,0x81]
+[0x85,0xf3,0x20,0x82]
+[0x87,0xf3,0x20,0x83]
+[0x8b,0xf3,0x20,0x84]
+[0x81,0xf3,0x20,0x85]
+[0x82,0xf3,0x20,0x86]
+@ CHECK:         msr     r8_usr, r2
+@ CHECK:         msr     r9_usr, r3
+@ CHECK:         msr     r10_usr, r5
+@ CHECK:         msr     r11_usr, r7
+@ CHECK:         msr     r12_usr, r11
+@ CHECK:         msr     sp_usr, r1
+@ CHECK:         msr     lr_usr, r2
+
+[0x82,0xf3,0x20,0x88]
+[0x83,0xf3,0x20,0x89]
+[0x85,0xf3,0x20,0x8a]
+[0x87,0xf3,0x20,0x8b]
+[0x8b,0xf3,0x20,0x8c]
+[0x81,0xf3,0x20,0x8d]
+[0x82,0xf3,0x20,0x8e]
+[0x93,0xf3,0x20,0x8e]
+@ CHECK:         msr     r8_fiq, r2
+@ CHECK:         msr     r9_fiq, r3
+@ CHECK:         msr     r10_fiq, r5
+@ CHECK:         msr     r11_fiq, r7
+@ CHECK:         msr     r12_fiq, r11
+@ CHECK:         msr     sp_fiq, r1
+@ CHECK:         msr     lr_fiq, r2
+@ CHECK:        msr     SPSR_fiq, r3
+
+[0x84,0xf3,0x30,0x80]
+[0x89,0xf3,0x30,0x81]
+[0x9b,0xf3,0x30,0x80]
+@ CHECK:         msr     lr_irq, r4
+@ CHECK:         msr     sp_irq, r9
+@ CHECK:         msr     SPSR_irq, r11
+
+[0x81,0xf3,0x30,0x82]
+[0x83,0xf3,0x30,0x83]
+[0x95,0xf3,0x30,0x82]
+@ CHECK:         msr     lr_svc, r1
+@ CHECK:         msr     sp_svc, r3
+@ CHECK:         msr     SPSR_svc, r5
+
+[0x85,0xf3,0x30,0x84]
+[0x87,0xf3,0x30,0x85]
+[0x99,0xf3,0x30,0x84]
+@ CHECK:         msr     lr_abt, r5
+@ CHECK:         msr     sp_abt, r7
+@ CHECK:         msr     SPSR_abt, r9
+
+[0x89,0xf3,0x30,0x86]
+[0x8b,0xf3,0x30,0x87]
+[0x9c,0xf3,0x30,0x86]
+@ CHECK:         msr     lr_und, r9
+@ CHECK:         msr     sp_und, r11
+@ CHECK:         msr     SPSR_und, r12
+
+
+[0x82,0xf3,0x30,0x8c]
+[0x84,0xf3,0x30,0x8d]
+[0x96,0xf3,0x30,0x8c]
+@ CHECK:         msr     lr_mon, r2
+@ CHECK:         msr     sp_mon, r4
+@ CHECK:         msr     SPSR_mon, r6
+
+[0x86,0xf3,0x30,0x8e]
+[0x88,0xf3,0x30,0x8f]
+[0x9a,0xf3,0x30,0x8e]
+@ CHECK:         msr     elr_hyp, r6
+@ CHECK:         msr     sp_hyp, r8
+@ CHECK:         msr     SPSR_hyp, r10

diff --git a/test/MC/Disassembler/ARM/thumb-MSR-MClass.txt b/test/MC/Disassembler/ARM/thumb-MSR-MClass.txt
index 497cb9a..c1a2790 100644
--- a/test/MC/Disassembler/ARM/thumb-MSR-MClass.txt
+++ b/test/MC/Disassembler/ARM/thumb-MSR-MClass.txt

@@ -1,7 +1,94 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 -mcpu cortex-m3 | FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=thumbv7em | FileCheck %s
 
-# CHECK: msr    primask, r0
-0x80 0xf3 0x10 0x80
+#------------------------------------------------------------------------------
+# MRS
+#------------------------------------------------------------------------------
 
-# CHECK: mrs    r0, primask
+# CHECK: mrs r0, apsr
+# CHECK: mrs r0, iapsr
+# CHECK: mrs r0, eapsr
+# CHECK: mrs r0, xpsr
+# CHECK: mrs r0, ipsr
+# CHECK: mrs r0, epsr
+# CHECK: mrs r0, iepsr
+# CHECK: mrs r0, msp
+# CHECK: mrs r0, psp
+# CHECK: mrs r0, primask
+# CHECK: mrs r0, basepri
+# CHECK: mrs r0, basepri_max
+# CHECK: mrs r0, faultmask
+# CHECK: mrs r0, control
+
+0xef 0xf3 0x00 0x80
+0xef 0xf3 0x01 0x80
+0xef 0xf3 0x02 0x80
+0xef 0xf3 0x03 0x80
+0xef 0xf3 0x05 0x80
+0xef 0xf3 0x06 0x80
+0xef 0xf3 0x07 0x80
+0xef 0xf3 0x08 0x80
+0xef 0xf3 0x09 0x80
 0xef 0xf3 0x10 0x80
+0xef 0xf3 0x11 0x80
+0xef 0xf3 0x12 0x80
+0xef 0xf3 0x13 0x80
+0xef 0xf3 0x14 0x80
+
+
+#------------------------------------------------------------------------------
+# MSR
+#------------------------------------------------------------------------------
+
+# CHECK: msr apsr_nzcvq, r0
+# CHECK: msr apsr_g, r0
+# CHECK: msr apsr_nzcvqg, r0
+
+0x80 0xf3 0x00 0x88
+0x80 0xf3 0x00 0x84
+0x80 0xf3 0x00 0x8c
+
+# CHECK: msr iapsr_nzcvq, r0
+# CHECK: msr iapsr_g, r0
+# CHECK: msr iapsr_nzcvqg, r0
+
+0x80 0xf3 0x01 0x88
+0x80 0xf3 0x01 0x84
+0x80 0xf3 0x01 0x8c
+
+# CHECK: msr eapsr_nzcvq, r0
+# CHECK: msr eapsr_g, r0
+# CHECK: msr eapsr_nzcvqg, r0
+
+0x80 0xf3 0x02 0x88
+0x80 0xf3 0x02 0x84
+0x80 0xf3 0x02 0x8c
+
+# CHECK: msr xpsr_nzcvq, r0
+# CHECK: msr xpsr_g, r0
+# CHECK: msr xpsr_nzcvqg, r0
+
+0x80 0xf3 0x03 0x88
+0x80 0xf3 0x03 0x84
+0x80 0xf3 0x03 0x8c
+
+# CHECK: msr ipsr, r0
+# CHECK: msr epsr, r0
+# CHECK: msr iepsr, r0
+# CHECK: msr msp, r0
+# CHECK: msr psp, r0
+# CHECK: msr primask, r0
+# CHECK: msr basepri, r0
+# CHECK: msr basepri_max, r0
+# CHECK: msr faultmask, r0
+# CHECK: msr control, r0
+
+0x80 0xf3 0x05 0x88
+0x80 0xf3 0x06 0x88
+0x80 0xf3 0x07 0x88
+0x80 0xf3 0x08 0x88
+0x80 0xf3 0x09 0x88
+0x80 0xf3 0x10 0x88
+0x80 0xf3 0x11 0x88
+0x80 0xf3 0x12 0x88
+0x80 0xf3 0x13 0x88
+0x80 0xf3 0x14 0x88

diff --git a/test/MC/Disassembler/ARM/thumb-tests.txt b/test/MC/Disassembler/ARM/thumb-tests.txt
index df2bac1..dcb6e3f 100644
--- a/test/MC/Disassembler/ARM/thumb-tests.txt
+++ b/test/MC/Disassembler/ARM/thumb-tests.txt

@@ -1,4 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 -mcpu=cortex-a9-mp | FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=thumbv7-apple-darwin9 -mcpu=cortex-a9 | FileCheck %s
 
 # CHECK:	add	r5, sp, #68
 0x11 0xad

diff --git a/test/MC/Disassembler/ARM/thumb2-preloads.txt b/test/MC/Disassembler/ARM/thumb2-preloads.txt
new file mode 100644
index 0000000..dec4d64
--- /dev/null
+++ b/test/MC/Disassembler/ARM/thumb2-preloads.txt

@@ -0,0 +1,69 @@
+# RUN: not llvm-mc -triple=thumbv6t2-none-eabi -disassemble            < %s 2>/dev/null | FileCheck %s --check-prefix=V6T2
+# RUN: not llvm-mc -triple=thumbv7a-none-eabi  -disassemble -mattr=-mp < %s 2>/dev/null | FileCheck %s --check-prefix=V6T2 --check-prefix=V7
+# RUN:     llvm-mc -triple=thumbv7a-none-eabi  -disassemble -mattr=+mp < %s 2>/dev/null | FileCheck %s --check-prefix=V6T2 --check-prefix=V7 --check-prefix=MP
+# RUN: not llvm-mc -triple=thumbv7m-none-eabi  -disassemble            < %s 2>/dev/null | FileCheck %s --check-prefix=V6T2 --check-prefix=V7
+
+# RUN: not llvm-mc -triple=thumbv6t2-none-eabi -disassemble            < %s 2>&1 >/dev/null | FileCheck %s --check-prefix=MP-ERR --check-prefix=V7-ERR
+# RUN: not llvm-mc -triple=thumbv7a-none-eabi  -disassemble -mattr=-mp < %s 2>&1 >/dev/null | FileCheck %s --check-prefix=MP-ERR
+# RUN:     llvm-mc -triple=thumbv7a-none-eabi  -disassemble -mattr=+mp < %s 2>&1 >/dev/null
+# RUN: not llvm-mc -triple=thumbv7m-none-eabi  -disassemble            < %s 2>&1 >/dev/null | FileCheck %s --check-prefix=MP-ERR
+
+# V6T2: pld     [r1, #3]
+[0x91,0xf8,0x03,0xf0]
+
+# V6T2: pld     [r2, #-5]
+[0x12,0xf8,0x05,0xfc]
+
+# MP: pldw    [r3, #4]
+# MP-ERR: invalid instruction encoding
+# MP-ERR-NEXT: [0xb3,0xf8,0x04,0xf0]
+[0xb3,0xf8,0x04,0xf0]
+
+# MP: pldw    [r4, #-6]
+# MP-ERR: invalid instruction encoding
+# MP-ERR-NEXT: [0x34,0xf8,0x06,0xfc]
+[0x34,0xf8,0x06,0xfc]
+
+# V6T2: pld     [pc, #8]
+[0x9f,0xf8,0x08,0xf0]
+
+# V6T2: pld     [pc, #-5]
+[0x1f,0xf8,0x05,0xf0]
+
+# V6T2: pld     [r5, r6]
+[0x15,0xf8,0x06,0xf0]
+
+# V6T2: pld     [r7, r8, lsl #1]
+[0x17,0xf8,0x18,0xf0]
+
+# MP: pldw    [r9, r10]
+# MP-ERR: invalid instruction encoding
+# MP-ERR-NEXT: [0x39,0xf8,0x0a,0xf0]
+[0x39,0xf8,0x0a,0xf0]
+
+# MP: pldw    [r11, r12, lsl #2]
+# MP-ERR: invalid instruction encoding
+# MP-ERR-NEXT: [0x3b,0xf8,0x2c,0xf0]
+[0x3b,0xf8,0x2c,0xf0]
+
+# V7: pli     [r1, #10]
+# V7-ERR: invalid instruction encoding
+# V7-ERR-NEXT: [0x91,0xf9,0x0a,0xf0]
+[0x91,0xf9,0x0a,0xf0]
+
+# V7: pli     [r2, #-3]
+# V7-ERR: invalid instruction encoding
+# V7-ERR-NEXT: [0x12,0xf9,0x03,0xfc]
+[0x12,0xf9,0x03,0xfc]
+
+# V7: pli     [pc, #6]
+# V7-ERR: invalid instruction encoding
+# V7-ERR-NEXT: [0x9f,0xf9,0x06,0xf0]
+[0x9f,0xf9,0x06,0xf0]
+
+# V7: pli     [pc, #-8]
+# V7-ERR: invalid instruction encoding
+# V7-ERR-NEXT: [0x1f,0xf9,0x08,0xf0]
+[0x1f,0xf9,0x08,0xf0]
+
+# NO-ERR-NOT: invalid instruction encoding

diff --git a/test/MC/Disassembler/Mips/micromips.txt b/test/MC/Disassembler/Mips/micromips.txt
index 1458ce2..6464824 100644
--- a/test/MC/Disassembler/Mips/micromips.txt
+++ b/test/MC/Disassembler/Mips/micromips.txt

@@ -294,3 +294,30 @@
 
 # CHECK: sc $2, 8($4)
 0x60 0x44 0xb0 0x08
+
+# CHECK: lwxs $2, $3($4)
+0x00 0x64 0x11 0x18
+
+# CHECK: bgezals $6, 1332
+0x42 0x66 0x02 0x9a
+
+# CHECK: bltzals $6, 1332
+0x42 0x26 0x02 0x9a
+
+# CHECK: beqzc $9, 1332
+0x40 0xe9 0x02 0x9a
+
+# CHECK: bnezc $9, 1332
+0x40 0xa9 0x02 0x9a
+
+# CHECK: jals 1328
+0x74 0x00 0x02 0x98
+
+# CHECK: jalrs $ra, $6
+0x03 0xe6 0x4f 0x3c
+
+# CHECK: lwm32 $16, $17, 8($4)
+0x20 0x44 0x50 0x08
+
+# CHECK: swm32 $16, $17, 8($4)
+0x20 0x44 0xd0 0x08

diff --git a/test/MC/Disassembler/Mips/micromips_le.txt b/test/MC/Disassembler/Mips/micromips_le.txt
index bdfe88e..d4dbc46 100644
--- a/test/MC/Disassembler/Mips/micromips_le.txt
+++ b/test/MC/Disassembler/Mips/micromips_le.txt

@@ -294,3 +294,30 @@
 
 # CHECK: sc $2, 8($4)
 0x44 0x60 0x08 0xb0
+
+# CHECK: lwxs $2, $3($4)
+0x64 0x00 0x18 0x11
+
+# CHECK: bgezals $6, 1332
+0x66 0x42 0x9a 0x02
+
+# CHECK: bltzals $6, 1332
+0x26 0x42 0x9a 0x02
+
+# CHECK: beqzc $9, 1332
+0xe9 0x40 0x9a 0x02
+
+# CHECK: bnezc $9, 1332
+0xa9 0x40 0x9a 0x02
+
+# CHECK: jals 1328
+0x00 0x74 0x98 0x02
+
+# CHECK: jalrs $ra, $6
+0xe6 0x03 0x3c 0x4f
+
+# CHECK: lwm32 $16, $17, 8($4)
+0x44 0x20 0x08 0x50
+
+# CHECK: swm32 $16, $17, 8($4)
+0x44 0x20 0x08 0xd0

diff --git a/test/MC/Disassembler/Mips/mips2.txt b/test/MC/Disassembler/Mips/mips2.txt
new file mode 100644
index 0000000..a604055
--- /dev/null
+++ b/test/MC/Disassembler/Mips/mips2.txt

@@ -0,0 +1,13 @@
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mcpu=mips2 | FileCheck %s
+
+# CHECK: sdc3  $5, 9154($6)
+0xfc 0xc5 0x23 0xc2
+
+# CHECK: swc3  $6, 9158($7)
+0xec 0xe6 0x23 0xc6
+
+# CHECK: ldc3  $7, 9162($8)
+0xdd 0x07 0x23 0xca
+
+# CHECK: lwc3  $8, 9166($9)
+0xcd 0x28 0x23 0xce

diff --git a/test/MC/Disassembler/Mips/mips32.txt b/test/MC/Disassembler/Mips/mips32.txt
index bfb145e..bd4ae4d 100644
--- a/test/MC/Disassembler/Mips/mips32.txt
+++ b/test/MC/Disassembler/Mips/mips32.txt

@@ -1,4 +1,5 @@
 # RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux | FileCheck %s
+
 # CHECK: abs.d $f12, $f14
 0x46 0x20 0x73 0x05
 
@@ -436,3 +437,15 @@
 # CHECK: rdhwr   $5, $29
 # CHECK: .set    pop
 0x7c 0x05 0xe8 0x3b
+
+# CHECK: cache 1, 2($3)
+0xbc 0x61 0x00 0x02
+
+# CHECK: pref 3, 4($2)
+0xcc 0x43 0x00 0x04
+
+# CHECK: swc2  $9, 9158($7)
+0xe8 0xe9 0x23 0xc6
+
+# CHECK: lwc2  $8, 9162($6)
+0xc8 0xc8 0x23 0xca

diff --git a/test/MC/Disassembler/Mips/mips64.txt b/test/MC/Disassembler/Mips/mips64.txt
index f3d2d10..d494df6 100644
--- a/test/MC/Disassembler/Mips/mips64.txt
+++ b/test/MC/Disassembler/Mips/mips64.txt

@@ -85,3 +85,9 @@
 
 # CHECK: sdxc1 $f8, $4($25)
 0x4f 0x24 0x40 0x09
+
+# CHECK: sdc2  $9, 9158($7)
+0xf8 0xe9 0x23 0xc6
+
+# CHECK: ldc2  $3, 9162($8)
+0xd9 0x03 0x23 0xca

diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-4xx.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-4xx.txt
new file mode 100644
index 0000000..92e88f8
--- /dev/null
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-4xx.txt

@@ -0,0 +1,26 @@
+# RUN: llvm-mc --disassemble %s -triple powerpc64-unknown-unknown -mcpu=pwr7 | FileCheck %s
+
+# CHECK: mfdcr 3, 178
+0x7c 0x72 0x2a 0x86
+# CHECK: mtdcr 178, 3
+0x7c 0x72 0x2b 0x86
+
+# CHECK: tlbre 2, 3, 0
+0x7c 0x43 0x07 0x64
+# CHECK: tlbre 2, 3, 1
+0x7c 0x43 0x0f 0x64
+
+# CHECK: tlbwe 2, 3, 0
+0x7c 0x43 0x07 0xa4
+# CHECK: tlbwe 2, 3, 1
+0x7c 0x43 0x0f 0xa4
+
+# CHECK: tlbsx 2, 3, 1
+0x7c 0x43 0x0f 0x24
+# CHECK: tlbsx. 2, 3, 1
+0x7c 0x43 0x0f 0x25
+
+# CHECK: dccci 5, 6
+0x7c 0x05 0x33 0x8c
+# CHECK: iccci 5, 6
+0x7c 0x05 0x37 0x8c

diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-6xx.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-6xx.txt
new file mode 100644
index 0000000..7276847
--- /dev/null
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-6xx.txt

@@ -0,0 +1,6 @@
+# RUN: llvm-mc --disassemble %s -triple powerpc64-unknown-unknown -mcpu=pwr7 | FileCheck %s
+
+# CHECK: tlbld 4
+0x7c 0x00 0x27 0xa4
+# CHECK: tlbli 4
+0x7c 0x00 0x27 0xe4

diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-bookII.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-bookII.txt
index 5e6033d..7a30b5c 100644
--- a/test/MC/Disassembler/PowerPC/ppc64-encoding-bookII.txt
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-bookII.txt

@@ -3,6 +3,9 @@
 # CHECK: icbi 2, 3                       
 0x7c 0x02 0x1f 0xac
 
+# CHECK: icbt 0, 5, 31
+0x7c 0x05 0xf8 0x2c
+
 # CHECK: dcbt 2, 3                       
 0x7c 0x02 0x1a 0x2c
 
@@ -33,6 +36,9 @@
 # CHECK: wait 2                          
 0x7c 0x40 0x00 0x7c
 
+# CHECK: mbar 1
+0x7c 0x20 0x06 0xac
+
 # CHECK: dcbf 2, 3                       
 0x7c 0x02 0x18 0xac
 

diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-bookIII.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-bookIII.txt
index c5d6155..7996ed1 100644
--- a/test/MC/Disassembler/PowerPC/ppc64-encoding-bookIII.txt
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-bookIII.txt

@@ -105,3 +105,23 @@
 # CHECK: tlbie 4,0                       
 0x7c 0x00 0x22 0x64
 
+# CHECK: rfi
+0x4c 0x00 0x00 0x64
+# CHECK: rfci
+0x4c 0x00 0x00 0x66
+
+# CHECK: wrtee 12
+0x7d 0x80 0x01 0x06
+# CHECK: wrteei 0
+0x7c 0x00 0x01 0x46
+# CHECK: wrteei 1
+0x7c 0x00 0x81 0x46
+
+# CHECK: tlbre
+0x7c 0x00 0x07 0x64
+# CHECK: tlbwe
+0x7c 0x00 0x07 0xa4
+# CHECK: tlbivax 11, 12
+0x7c 0x0b 0x66 0x24
+# CHECK: tlbsx 11, 12
+0x7c 0x0b 0x67 0x24

diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-e500.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-e500.txt
new file mode 100644
index 0000000..ef013d7
--- /dev/null
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-e500.txt

@@ -0,0 +1,7 @@
+# RUN: llvm-mc --disassemble %s -triple powerpc64-unknown-unknown -mcpu=pwr7 | FileCheck %s
+
+# CHECK: rfdi
+0x4c 0x00 0x00 0x4e
+# CHECK: rfmci
+0x4c 0x00 0x00 0x4c
+

diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding-ext.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding-ext.txt
index 108df30..3c2f935 100644
--- a/test/MC/Disassembler/PowerPC/ppc64-encoding-ext.txt
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding-ext.txt

@@ -2251,3 +2251,26 @@
 # CHECK: mtcrf 255, 2
 0x7c 0x4f 0xf1 0x20
 
+# CHECK: dss 3
+0x7c 0x60 0x06 0x6c
+# CHECK: dssall
+0x7e 0x00 0x06 0x6c
+# CHECK: dst 12, 11, 3
+0x7c 0x6c 0x5a 0xac
+# CHECK: dstt 12, 11, 3
+0x7e 0x6c 0x5a 0xac
+# CHECK: dstst 12, 11, 3
+0x7c 0x6c 0x5a 0xec
+# CHECK: dststt 12, 11, 3
+0x7e 0x6c 0x5a 0xec
+
+# CHECK: tlbia
+0x7c 0x00 0x02 0xe4
+
+# CHECK: lswi 8, 6, 7
+0x7d 0x06 0x3c 0xaa
+# CHECK: stswi 8, 6, 7
+0x7d 0x06 0x3d 0xaa
+
+# CHECK: rfid
+0x4c 0x00 0x00 0x24

diff --git a/test/MC/Disassembler/PowerPC/ppc64-encoding.txt b/test/MC/Disassembler/PowerPC/ppc64-encoding.txt
index 33a8c0e..2e2e7c1 100644
--- a/test/MC/Disassembler/PowerPC/ppc64-encoding.txt
+++ b/test/MC/Disassembler/PowerPC/ppc64-encoding.txt

@@ -619,3 +619,7 @@
 # CHECK: mfocrf 16, 8                    
 0x7e 0x10 0x80 0x26
 
+# CHECK: mtsrin 10, 12
+0x7d 0x40 0x61 0xe4
+# CHECK: mfsrin 10, 12
+0x7d 0x40 0x65 0x26

diff --git a/test/MC/Disassembler/X86/avx-512.txt b/test/MC/Disassembler/X86/avx-512.txt
index b1a8aaf..62fc35b 100644
--- a/test/MC/Disassembler/X86/avx-512.txt
+++ b/test/MC/Disassembler/X86/avx-512.txt

@@ -1,4 +1,5 @@
 # RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s
+# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mcpu=skx | FileCheck --check-prefix=CHECK-SKX %s
 
 # CHECK: vpbroadcastd    %xmm18, %zmm28 {%k7} {z}
 0x62 0x22 0x7d 0xcf 0x58 0xe2
@@ -13,7 +14,13 @@
 0x62 0x32 0xed 0x48 0x16 0x04 0x96
 
 # CHECK: vpbroadcastmw2d %k2, %zmm8
-0x62 0xd2 0x7e 0x48 0x3a 0xd0
+0x62 0x72 0x7e 0x48 0x3a 0xc2
+
+# CHECK-SKX: vpbroadcastmw2d %k2, %xmm8
+0x62 0x72 0x7e 0x08 0x3a 0xc2
+
+# CHECK-SKX: vpbroadcastmw2d %k2, %ymm8
+0x62 0x72 0x7e 0x28 0x3a 0xc2
 
 # CHECK: vpbroadcastq    (%r9,%rax), %zmm28
 0x62 0x42 0xfd 0x48 0x59 0x24 0x01
@@ -63,3 +70,42 @@
 
 # CHECK: kmovw   %k5, %k1
 0xc5 0xf8 0x90 0xcd
+
+#####################################################
+#             COMPRESSED DISPLACEMENT               #
+#####################################################
+
+# TupleType = FVM
+# CHECK: vmovdqu32 %zmm0, -448(%rcx)
+0x62 0xf1 0x7e 0x48 0x7f 0x41 0xf9
+
+# TupleType = T1S, 64-bit eltsize
+# CHECK: vaddsd 256(%rdx), %xmm0, %xmm16
+0x62 0xe1 0xff 0x08 0x58 0x42 0x20
+
+# TupleType = T1S, 32-bit eltsize
+# CHECK: vaddss 256(%rdx), %xmm0, %xmm16
+0x62 0xe1 0x7e 0x08 0x58 0x42 0x40
+
+# TupleType = FV
+# CHECK: vaddpd 256(%rdx), %zmm0, %zmm16
+0x62 0xe1 0xfd 0x48 0x58 0x42 0x04
+
+# TupleType = FV, broadcast, 64-bit eltsize
+# CHECK: vaddpd 256(%rdx){1to8}, %zmm0, %zmm16
+0x62 0xe1 0xfd 0x58 0x58 0x42 0x20
+
+# TupleType = FV, broadcast, 32-bit eltsize
+# CHECK: vaddps 256(%rdx){1to16}, %zmm0, %zmm16
+0x62 0xe1 0x7c 0x58 0x58 0x42 0x40
+
+# TupleType = T4
+# CHECK: vbroadcasti32x4 256(%rdx), %zmm16
+0x62 0xe2 0x7d 0x48 0x5a 0x42 0x10
+
+# Cases where we can't use cdisp8
+# CHECK: vaddss 255(%rdx), %xmm0, %xmm16
+0x62 0xe1 0x7e 0x08 0x58 0x82 0xff 0x00 0x00 0x00
+
+# CHECK: vaddss 1024(%rdx), %xmm0, %xmm16
+0x62 0xe1 0x7e 0x08 0x58 0x82 0x00 0x04 0x00 0x00

diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt
index c9c5086..79577c6 100644
--- a/test/MC/Disassembler/X86/x86-32.txt
+++ b/test/MC/Disassembler/X86/x86-32.txt

@@ -711,3 +711,6 @@
 
 # CHECK: movq %mm0, %mm1
 0x0f 0x7f 0xc1
+
+# CHECK: vpermq $-18, %ymm2, %ymm2
+0xc4 0xe3 0xfd 0x00 0xd2 0xee

diff --git a/test/MC/ELF/cfi-version.ll b/test/MC/ELF/cfi-version.ll
index 10daa1d..2938dc7 100644
--- a/test/MC/ELF/cfi-version.ll
+++ b/test/MC/ELF/cfi-version.ll

@@ -22,17 +22,17 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test.c", metadata !"/tmp"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @foo, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\000\002", metadata !1, metadata !5, metadata !6, null, i32 ()* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5.0 "}
 !12 = metadata !{i32 2, i32 0, metadata !4, null}
 

diff --git a/test/MC/ELF/comdat.s b/test/MC/ELF/comdat.s
index 68b0f32..4796675 100644
--- a/test/MC/ELF/comdat.s
+++ b/test/MC/ELF/comdat.s

@@ -1,6 +1,6 @@
 // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s -t | FileCheck %s
 
-// Test that we produce the group sections and that they are a the beginning
+// Test that we produce the group sections and that they are at the beginning
 // of the file.
 
 // CHECK:        Section {
@@ -41,7 +41,7 @@
 // CHECK-NEXT:     Offset: 0x54
 // CHECK-NEXT:     Size: 12
 // CHECK-NEXT:     Link: 13
-// CHECK-NEXT:     Info: 13
+// CHECK-NEXT:     Info: 10
 // CHECK-NEXT:     AddressAlignment: 4
 // CHECK-NEXT:     EntrySize: 4
 // CHECK-NEXT:   }

diff --git a/test/MC/ELF/reloc-same-name-section.s b/test/MC/ELF/reloc-same-name-section.s
new file mode 100644
index 0000000..e63ea54
--- /dev/null
+++ b/test/MC/ELF/reloc-same-name-section.s

@@ -0,0 +1,31 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux %s -o - | llvm-readobj -r --expand-relocs | FileCheck %s
+
+// Test that we produce one relocation against each section.
+
+// CHECK:      Relocations [
+// CHECK-NEXT:   Section {{.*}} {
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset:
+// CHECK-NEXT:       Type:
+// CHECK-NEXT:       Symbol:  .foo (7)
+// CHECK-NEXT:       Addend:
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset:
+// CHECK-NEXT:       Type:
+// CHECK-NEXT:       Symbol:  .foo (8)
+// CHECK-NEXT:       Addend:
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+	.section	.foo,"aG",@progbits,v,comdat
+f:
+
+	.section	.foo,"a",@progbits
+g:
+
+
+	.section	.bar
+	.quad	f
+	.quad	g

diff --git a/test/MC/ELF/section-sym-err.s b/test/MC/ELF/section-sym-err.s
new file mode 100644
index 0000000..789fee7
--- /dev/null
+++ b/test/MC/ELF/section-sym-err.s

@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.o 2>&1 | FileCheck %s
+
+.section foo
+foo:
+
+// CHECK: error: invalid symbol redefinition

diff --git a/test/MC/ELF/section-sym.s b/test/MC/ELF/section-sym.s
new file mode 100644
index 0000000..3b76d81
--- /dev/null
+++ b/test/MC/ELF/section-sym.s

@@ -0,0 +1,91 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s -t -r --expand-relocs | FileCheck %s
+
+.section foo, "aG", @progbits, f1, comdat
+.section foo, "G", @progbits, f2, comdat
+.section bar
+.long foo
+
+// Test that the relocation points to the first section foo.
+
+// The first section foo has index 6
+// CHECK:      Section {
+// CHECK:        Index:   6
+// CHECK-NEXT:   Name:    foo (28)
+// CHECK-NEXT:   Type:    SHT_PROGBITS (0x1)
+// CHECK-NEXT:   Flags [ (0x202)
+// CHECK-NEXT:     SHF_ALLOC (0x2)
+// CHECK-NEXT:     SHF_GROUP (0x200)
+// CHECK-NEXT:   ]
+// CHECK-NEXT:   Address:         0x0
+// CHECK-NEXT:   Offset:  0x50
+// CHECK-NEXT:   Size:    0
+// CHECK-NEXT:   Link:    0
+// CHECK-NEXT:   Info:    0
+// CHECK-NEXT:   AddressAlignment:        1
+// CHECK-NEXT:   EntrySize:       0
+// CHECK-NEXT: }
+// CHECK-NEXT: Section {
+// CHECK-NEXT:   Index:   7
+// CHECK-NEXT:   Name:    foo (28)
+// CHECK-NEXT:   Type:    SHT_PROGBITS (0x1)
+// CHECK-NEXT:   Flags [ (0x200)
+// CHECK-NEXT:     SHF_GROUP (0x200)
+// CHECK-NEXT:   ]
+// CHECK-NEXT:   Address:         0x0
+// CHECK-NEXT:   Offset:  0x50
+// CHECK-NEXT:   Size:    0
+// CHECK-NEXT:   Link:    0
+// CHECK-NEXT:   Info:    0
+// CHECK-NEXT:   AddressAlignment:        1
+// CHECK-NEXT:   EntrySize:       0
+// CHECK-NEXT: }
+
+// The relocation points to symbol 6
+// CHECK:      Relocations [
+// CHECK-NEXT:   Section (9) .relabar {
+// CHECK-NEXT:     Relocation {
+// CHECK-NEXT:       Offset:  0x0
+// CHECK-NEXT:       Type:    R_X86_64_32 (10)
+// CHECK-NEXT:       Symbol:  foo (6)
+// CHECK-NEXT:       Addend:  0x0
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+
+// The symbol 6 corresponds to section 6
+// CHECK: Symbols [
+
+// symbol 0
+// CHECK-NOT: Name
+// CHECK: Name:
+
+// symbol 1
+// CHECK-NOT: Name
+// CHECK: Name:    f1
+
+// symbol 2
+// CHECK-NOT: Name
+// CHECK: Name:    f2
+
+// symbol 3
+// CHECK-NOT: Name
+// CHECK: Name:    .text
+
+// symbol 4
+// CHECK-NOT: Name
+// CHECK: Name:    .data
+
+// symbol 5
+// CHECK-NOT: Name
+// CHECK: Name:    .bss
+
+// symbol 6
+// CHECK-NOT: Name
+// CHECK: Name:    foo
+// CHECK: Section: foo (0x6)
+
+// symbol 7
+// CHECK-NOT: Name
+// CHECK: Name:    foo
+// CHECK: Section: foo (0x7)

diff --git a/test/MC/ELF/section-sym2.s b/test/MC/ELF/section-sym2.s
new file mode 100644
index 0000000..acdb7d9
--- /dev/null
+++ b/test/MC/ELF/section-sym2.s

@@ -0,0 +1,28 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj  -t -r --expand-relocs | FileCheck %s
+
+// Test that we can forward reference a section.
+
+mov .rodata, %rsi
+.section .rodata
+
+// CHECK:Relocations [
+// CHECK:  Section (2) .rela.text {
+// CHECK:    Relocation {
+// CHECK:      Offset: 0x4
+// CHECK:      Type: R_X86_64_32S (11)
+// CHECK:      Symbol: .rodata
+// CHECK:      Addend: 0x0
+// CHECK:    }
+// CHECK:  }
+// CHECK:]
+
+// There is only one .rodata symbol
+
+// CHECK:Symbols [
+// CHECK-NOT:    Name: .rodata
+// CHECK:        Name: .rodata
+// CHECK-NEXT:   Value: 0x0
+// CHECK-NEXT:   Size: 0
+// CHECK-NEXT:   Binding: Local (0x0)
+// CHECK-NEXT:   Type: Section (0x3)
+// CHECK-NOT:    Name: .rodata

diff --git a/test/MC/Hexagon/basic.ll b/test/MC/Hexagon/basic.ll
new file mode 100644
index 0000000..8a5d2e6
--- /dev/null
+++ b/test/MC/Hexagon/basic.ll

@@ -0,0 +1,7 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-readobj -h -r | FileCheck -check-prefix=OBJ %s
+
+; OBJ: Format: ELF32-hexagon
+; OBJ: Arch: hexagon
+; OBJ: AddressSize: 32bit
+; OBJ: Machine: EM_HEXAGON

diff --git a/test/MC/Hexagon/inst_add.ll b/test/MC/Hexagon/inst_add.ll
new file mode 100644
index 0000000..5377d94
--- /dev/null
+++ b/test/MC/Hexagon/inst_add.ll

@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i32 %a, i32 %b)
+{
+  %1 = add i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK:  0000 004100f3 00c09f52
\ No newline at end of file

diff --git a/test/MC/Hexagon/inst_and.ll b/test/MC/Hexagon/inst_and.ll
new file mode 100644
index 0000000..16bf304
--- /dev/null
+++ b/test/MC/Hexagon/inst_and.ll

@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i32 %a, i32 %b)
+{
+  %1 = and i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK:  0000 004100f1 00c09f52
\ No newline at end of file

diff --git a/test/MC/Hexagon/inst_or.ll b/test/MC/Hexagon/inst_or.ll
new file mode 100644
index 0000000..fe8152b
--- /dev/null
+++ b/test/MC/Hexagon/inst_or.ll

@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i32 %a, i32 %b)
+{
+  %1 = or i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK:   0000 004120f1 00c09f52
\ No newline at end of file

diff --git a/test/MC/Hexagon/inst_sub.ll b/test/MC/Hexagon/inst_sub.ll
new file mode 100644
index 0000000..7523aa6
--- /dev/null
+++ b/test/MC/Hexagon/inst_sub.ll

@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i32 %a, i32 %b)
+{
+  %1 = sub i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK:  0000 004021f3 00c09f52
\ No newline at end of file

diff --git a/test/MC/Hexagon/inst_xor.ll b/test/MC/Hexagon/inst_xor.ll
new file mode 100644
index 0000000..fe989e5
--- /dev/null
+++ b/test/MC/Hexagon/inst_xor.ll

@@ -0,0 +1,10 @@
+;; RUN: llc -mtriple=hexagon-unknown-elf -filetype=obj %s -o - \
+;; RUN: | llvm-objdump -s - | FileCheck %s
+
+define i32 @foo (i32 %a, i32 %b)
+{
+  %1 = xor i32 %a, %b
+  ret i32 %1
+}
+
+; CHECK:   0000 004160f1 00c09f52
\ No newline at end of file

diff --git a/test/MC/Hexagon/lit.local.cfg b/test/MC/Hexagon/lit.local.cfg
new file mode 100644
index 0000000..ba72ff6
--- /dev/null
+++ b/test/MC/Hexagon/lit.local.cfg

@@ -0,0 +1,3 @@
+if not 'Hexagon' in config.root.targets:
+    config.unsupported = True
+

diff --git a/test/MC/MachO/ARM/aliased-symbols.s b/test/MC/MachO/ARM/aliased-symbols.s
index 0b4463d..e87b81c 100644
--- a/test/MC/MachO/ARM/aliased-symbols.s
+++ b/test/MC/MachO/ARM/aliased-symbols.s

@@ -70,7 +70,7 @@
         // alias_to_local is an alias, but what it points to has no
         // MachO representation. We must resolve it.
 // CHECK: Symbol {
-// CHECK-NEXT:   Name: alias_to_local (37)
+// CHECK-NEXT:   Name: alias_to_local (42)
 // CHECK-NEXT:   Type: Section (0xE)
 // CHECK-NEXT:   Section:  (0x0)
 // CHECK-NEXT:   RefType: UndefinedNonLazy (0x0)
@@ -93,7 +93,7 @@
 
         // var1 was another alias to an unknown variable. Not extern this time.
 // CHECK: Symbol {
-// CHECK-NEXT:   Name: var1 (1)
+// CHECK-NEXT:   Name: var1 (89)
 // CHECK-NEXT:   Type: Indirect (0xA)
 // CHECK-NEXT:   Section:  (0x0)
 // CHECK-NEXT:   RefType: UndefinedNonLazy (0x0)

diff --git a/test/MC/MachO/ARM/darwin-ARM-reloc.s b/test/MC/MachO/ARM/darwin-ARM-reloc.s
index b98c80c..374f880 100644
--- a/test/MC/MachO/ARM/darwin-ARM-reloc.s
+++ b/test/MC/MachO/ARM/darwin-ARM-reloc.s

@@ -110,10 +110,10 @@
 @ CHECK:   ('nsyms', 4)
 @ CHECK:   ('stroff', 488)
 @ CHECK:   ('strsize', 24)
-@ CHECK:   ('_string_data', '\x00_printf\x00_f0\x00_f1\x00_d0\x00\x00\x00\x00')
+@ CHECK:   ('_string_data', '\x00_printf\x00_f1\x00_f0\x00_d0\x00\x00\x00\x00')
 @ CHECK:   ('_symbols', [
 @ CHECK:     # Symbol 0
-@ CHECK:    (('n_strx', 9)
+@ CHECK:    (('n_strx', 13)
 @ CHECK:     ('n_type', 0xe)
 @ CHECK:     ('n_sect', 1)
 @ CHECK:     ('n_desc', 0)
@@ -121,7 +121,7 @@
 @ CHECK:     ('_string', '_f0')
 @ CHECK:    ),
 @ CHECK:     # Symbol 1
-@ CHECK:    (('n_strx', 13)
+@ CHECK:    (('n_strx', 9)
 @ CHECK:     ('n_type', 0xe)
 @ CHECK:     ('n_sect', 1)
 @ CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/ARM/ios-version-min-load-command.s b/test/MC/MachO/ARM/ios-version-min-load-command.s
index e065d14..9f63c9b 100644
--- a/test/MC/MachO/ARM/ios-version-min-load-command.s
+++ b/test/MC/MachO/ARM/ios-version-min-load-command.s

@@ -6,5 +6,5 @@
 // CHECK:  (('command', 37)
 // CHECK:   ('size', 16)
 // CHECK:   ('version, 6490119)
-// CHECK:   ('reserved, 0)
+// CHECK:   ('sdk, 0)
 // CHECK:  ),

diff --git a/test/MC/MachO/absolute.s b/test/MC/MachO/absolute.s
index 784e32a..0b22afb 100644
--- a/test/MC/MachO/absolute.s
+++ b/test/MC/MachO/absolute.s

@@ -63,10 +63,10 @@
 // CHECK:   ('nsyms', 8)
 // CHECK:   ('stroff', 420)
 // CHECK:   ('strsize', 84)
-// CHECK:   ('_string_data', '\x00foo_set1_global\x00foo_set2_global\x00_bar\x00_foo\x00foo_set1\x00foo_set2\x00foo_equals\x00foo_equals2\x00')
+// CHECK:   ('_string_data', '\x00foo_equals\x00_bar\x00_foo\x00foo_set2_global\x00foo_set1_global\x00foo_set2\x00foo_equals2\x00foo_set1\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 33)
+// CHECK:    (('n_strx', 12)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -74,7 +74,7 @@
 // CHECK:     ('_string', '_bar')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 38)
+// CHECK:    (('n_strx', 17)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -82,7 +82,7 @@
 // CHECK:     ('_string', '_foo')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 43)
+// CHECK:    (('n_strx', 75)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 32)
@@ -90,7 +90,7 @@
 // CHECK:     ('_string', 'foo_set1')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 52)
+// CHECK:    (('n_strx', 54)
 // CHECK:     ('n_type', 0x2)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 32)
@@ -98,7 +98,7 @@
 // CHECK:     ('_string', 'foo_set2')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 61)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -106,7 +106,7 @@
 // CHECK:     ('_string', 'foo_equals')
 // CHECK:    ),
 // CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 72)
+// CHECK:    (('n_strx', 63)
 // CHECK:     ('n_type', 0x2)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -114,7 +114,7 @@
 // CHECK:     ('_string', 'foo_equals2')
 // CHECK:    ),
 // CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 38)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 32)
@@ -122,7 +122,7 @@
 // CHECK:     ('_string', 'foo_set1_global')
 // CHECK:    ),
 // CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 17)
+// CHECK:    (('n_strx', 22)
 // CHECK:     ('n_type', 0x3)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 32)

diff --git a/test/MC/MachO/absolutize.s b/test/MC/MachO/absolutize.s
index 39571dd..19917e3 100644
--- a/test/MC/MachO/absolutize.s
+++ b/test/MC/MachO/absolutize.s

@@ -150,10 +150,10 @@
 // CHECK:   ('nsyms', 4)
 // CHECK:   ('stroff', 572)
 // CHECK:   ('strsize', 36)
-// CHECK:   ('_string_data', '\x00_text_a\x00_text_b\x00_data_a\x00_data_b\x00\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00_text_b\x00_data_b\x00_text_a\x00_data_a\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 17)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -161,7 +161,7 @@
 // CHECK:     ('_string', '_text_a')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 9)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -169,7 +169,7 @@
 // CHECK:     ('_string', '_text_b')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 17)
+// CHECK:    (('n_strx', 25)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -177,7 +177,7 @@
 // CHECK:     ('_string', '_data_a')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 25)
+// CHECK:    (('n_strx', 9)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/bad-darwin-x86_64-reloc-expr1.s b/test/MC/MachO/bad-darwin-x86_64-reloc-expr1.s
new file mode 100644
index 0000000..518ae64
--- /dev/null
+++ b/test/MC/MachO/bad-darwin-x86_64-reloc-expr1.s

@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - 2> %t.err > %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t.err %s
+
+_Z:
+.long (_Z+4)-_b
+// CHECK-ERROR: error: unsupported relocation with subtraction expression, symbol '_b' can not be undefined in a subtraction expression

diff --git a/test/MC/MachO/bad-darwin-x86_64-reloc-expr2.s b/test/MC/MachO/bad-darwin-x86_64-reloc-expr2.s
new file mode 100644
index 0000000..3aefd87
--- /dev/null
+++ b/test/MC/MachO/bad-darwin-x86_64-reloc-expr2.s

@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - 2> %t.err > %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t.err %s
+
+_Z:
+.long (_a+4)-_Z
+// CHECK-ERROR: error: unsupported relocation with subtraction expression, symbol '_a' can not be undefined in a subtraction expression

diff --git a/test/MC/MachO/comm-1.s b/test/MC/MachO/comm-1.s
index 5ffa979..cb240f9 100644
--- a/test/MC/MachO/comm-1.s
+++ b/test/MC/MachO/comm-1.s

@@ -51,10 +51,10 @@
 // CHECK:   ('nsyms', 4)
 // CHECK:   ('stroff', 304)
 // CHECK:   ('strsize', 48)
-// CHECK:   ('_string_data', '\x00sym_comm_B\x00sym_comm_A\x00sym_comm_C\x00sym_comm_D\x00\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00sym_comm_D\x00sym_comm_C\x00sym_comm_B\x00sym_comm_A\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 12)
+// CHECK:    (('n_strx', 34)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -62,7 +62,7 @@
 // CHECK:     ('_string', 'sym_comm_A')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 23)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -70,7 +70,7 @@
 // CHECK:     ('_string', 'sym_comm_B')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 23)
+// CHECK:    (('n_strx', 12)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 544)
@@ -78,7 +78,7 @@
 // CHECK:     ('_string', 'sym_comm_C')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 34)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 768)

diff --git a/test/MC/MachO/darwin-complex-difference.s b/test/MC/MachO/darwin-complex-difference.s
index e66bd09..f31d3ad 100644
--- a/test/MC/MachO/darwin-complex-difference.s
+++ b/test/MC/MachO/darwin-complex-difference.s

@@ -74,10 +74,10 @@
 // CHECK:   ('nsyms', 3)
 // CHECK:   ('stroff', 392)
 // CHECK:   ('strsize', 12)
-// CHECK:   ('_string_data', '\x00_a\x00_c\x00_d\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00_d\x00_c\x00_a\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 7)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -93,7 +93,7 @@
 // CHECK:     ('_string', '_c')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 7)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/darwin-x86_64-diff-reloc-assign-2.s b/test/MC/MachO/darwin-x86_64-diff-reloc-assign-2.s
index 5d54879..b69cd1b 100644
--- a/test/MC/MachO/darwin-x86_64-diff-reloc-assign-2.s
+++ b/test/MC/MachO/darwin-x86_64-diff-reloc-assign-2.s

@@ -1,38 +1,16 @@
-// RUN: llvm-mc -triple x86_64-apple-darwin9 %s -filetype=obj -o - | macho-dump --dump-section-data | FileCheck %s
-
-// Test case for rdar://9356266
-
-// This tests that this expression does not cause a crash and produces these
-// four relocation entries:
-// Relocation information (__DATA,__data) 4 entries
-// address  pcrel length extern type    scattered symbolnum/value
-// 00000004 False long   False  SUB     False     2 (__DATA,__data)
-// 00000004 False long   False  UNSIGND False     2 (__DATA,__data)
-// 00000000 False long   False  SUB     False     2 (__DATA,__data)
-// 00000000 False long   False  UNSIGND False     2 (__DATA,__data)
+// RUN: llvm-mc -triple x86_64-apple-darwin9 %s -filetype=obj -o - | llvm-readobj -r | FileCheck %s
 
 	.data
 L_var1:
 L_var2:
-// This was working fine
 	.long L_var2 - L_var1
-	
 	.set L_var3, .
 	.set L_var4, .
-// But this was causing a crash
 	.long L_var4 - L_var3
 
-// CHECK:  ('_relocations', [
-// CHECK:    # Relocation 0
-// CHECK:    (('word-0', 0x4),
-// CHECK:     ('word-1', 0x54000002)),
-// CHECK:    # Relocation 1
-// CHECK:    (('word-0', 0x4),
-// CHECK:     ('word-1', 0x4000002)),
-// CHECK:    # Relocation 2
-// CHECK:    (('word-0', 0x0),
-// CHECK:     ('word-1', 0x54000002)),
-// CHECK:    # Relocation 3
-// CHECK:    (('word-0', 0x0),
-// CHECK:     ('word-1', 0x4000002)),
-// CHECK:  ])
+// CHECK:      Relocations [
+// CHECK-NEXT:   Section __data {
+// CHECK-NEXT:     0x4 0 2 0 X86_64_RELOC_SUBTRACTOR 0 0x2
+// CHECK-NEXT:     0x4 0 2 0 X86_64_RELOC_UNSIGNED 0 0x2
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]

diff --git a/test/MC/MachO/darwin-x86_64-diff-relocs.s b/test/MC/MachO/darwin-x86_64-diff-relocs.s
index f5d93ae..eb28cf1 100644
--- a/test/MC/MachO/darwin-x86_64-diff-relocs.s
+++ b/test/MC/MachO/darwin-x86_64-diff-relocs.s

@@ -258,7 +258,7 @@
 // CHECK:   ('nsyms', 5)
 // CHECK:   ('stroff', 908)
 // CHECK:   ('strsize', 24)
-// CHECK:   ('_string_data', '\x00_foo\x00_g0\x00_g1\x00_g2\x00_g3\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00_foo\x00_g3\x00_g2\x00_g1\x00_g0\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
 // CHECK:    (('n_strx', 1)
@@ -269,7 +269,7 @@
 // CHECK:     ('_string', '_foo')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 6)
+// CHECK:    (('n_strx', 18)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -277,7 +277,7 @@
 // CHECK:     ('_string', '_g0')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 10)
+// CHECK:    (('n_strx', 14)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -285,7 +285,7 @@
 // CHECK:     ('_string', '_g1')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 14)
+// CHECK:    (('n_strx', 10)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -293,7 +293,7 @@
 // CHECK:     ('_string', '_g2')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 18)
+// CHECK:    (('n_strx', 6)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/darwin-x86_64-reloc.s b/test/MC/MachO/darwin-x86_64-reloc.s
index 83c0de7..1dfb982 100644
--- a/test/MC/MachO/darwin-x86_64-reloc.s
+++ b/test/MC/MachO/darwin-x86_64-reloc.s

@@ -301,11 +301,11 @@
 // CHECK:   ('symoff', 1152)
 // CHECK:   ('nsyms', 9)
 // CHECK:   ('stroff', 1296)
-// CHECK:   ('strsize', 52)
-// CHECK:   ('_string_data', '\x00_foobar\x00_ext_foo\x00_foo\x00_baz\x00_bar\x00_prev\x00_f2\x00_f3\x00f6\x00\x00\x00')
+// CHECK:   ('strsize', 48)
+// CHECK:   ('_string_data', '\x00_baz\x00_prev\x00_foobar\x00_bar\x00_ext_foo\x00f6\x00_f3\x00_f2\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 18)
+// CHECK:    (('n_strx', 29)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -313,7 +313,7 @@
 // CHECK:     ('_string', '_foo')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 23)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -321,7 +321,7 @@
 // CHECK:     ('_string', '_baz')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 28)
+// CHECK:    (('n_strx', 20)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -329,7 +329,7 @@
 // CHECK:     ('_string', '_bar')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 33)
+// CHECK:    (('n_strx', 6)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -337,7 +337,7 @@
 // CHECK:     ('_string', '_prev')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 39)
+// CHECK:    (('n_strx', 41)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -345,7 +345,7 @@
 // CHECK:     ('_string', '_f2')
 // CHECK:    ),
 // CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 43)
+// CHECK:    (('n_strx', 37)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -353,7 +353,7 @@
 // CHECK:     ('_string', '_f3')
 // CHECK:    ),
 // CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 47)
+// CHECK:    (('n_strx', 34)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -361,7 +361,7 @@
 // CHECK:     ('_string', 'f6')
 // CHECK:    ),
 // CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 9)
+// CHECK:    (('n_strx', 25)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -369,7 +369,7 @@
 // CHECK:     ('_string', '_ext_foo')
 // CHECK:    ),
 // CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 12)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/eh-frame-reloc.s b/test/MC/MachO/eh-frame-reloc.s
index e14825b..c39ce84 100644
--- a/test/MC/MachO/eh-frame-reloc.s
+++ b/test/MC/MachO/eh-frame-reloc.s

@@ -1,10 +1,10 @@
 // RUN: llvm-mc < %s -triple=x86_64-apple-macosx10.7 -filetype=obj | llvm-readobj -r | FileCheck %s
 // RUN: llvm-mc < %s -triple=x86_64-apple-macosx10.6 -filetype=obj | llvm-readobj -r | FileCheck %s
 // RUN: llvm-mc < %s -triple=x86_64-apple-ios7.0.0 -filetype=obj | llvm-readobj -r | FileCheck %s
-// RUN: llvm-mc < %s -triple=x86_64-apple-macosx10.5 -filetype=obj | llvm-readobj -r | FileCheck --check-prefix=OLD64 %s
+// RUN: llvm-mc < %s -triple=x86_64-apple-macosx10.5 -filetype=obj | llvm-readobj -r | FileCheck %s
 // RUN: llvm-mc < %s -triple=i686-apple-macosx10.6 -filetype=obj | llvm-readobj -r | FileCheck %s
-// RUN: llvm-mc < %s -triple=i686-apple-macosx10.5 -filetype=obj | llvm-readobj -r | FileCheck --check-prefix=OLD32 %s
-// RUN: llvm-mc < %s -triple=i686-apple-macosx10.4 -filetype=obj | llvm-readobj -r | FileCheck --check-prefix=OLD32 %s
+// RUN: llvm-mc < %s -triple=i686-apple-macosx10.5 -filetype=obj | llvm-readobj -r | FileCheck %s
+// RUN: llvm-mc < %s -triple=i686-apple-macosx10.4 -filetype=obj | llvm-readobj -r | FileCheck %s
 
 	.globl	_bar
 	.align	4, 0x90
@@ -14,17 +14,3 @@
 
 // CHECK:      Relocations [
 // CHECK-NEXT: ]
-
-// OLD32:      Relocations [
-// OLD32-NEXT:   Section __eh_frame {
-// OLD32-NEXT:     0x20 0 2 n/a GENERIC_RELOC_LOCAL_SECTDIFF 1 0x0
-// OLD32-NEXT:     0x0 0 2 n/a GENERIC_RELOC_PAIR 1 0x20
-// OLD32-NEXT:   }
-// OLD32-NEXT: ]
-
-// OLD64:      Relocations [
-// OLD64-NEXT:   Section __eh_frame {
-// OLD64-NEXT:     0x20 0 3 0 X86_64_RELOC_SUBTRACTOR 0
-// OLD64-NEXT:     0x20 0 3 1 X86_64_RELOC_UNSIGNED 0 _bar
-// OLD64-NEXT:   }
-// OLD64-NEXT: ]

diff --git a/test/MC/MachO/empty-dwarf-lines.s b/test/MC/MachO/empty-dwarf-lines.s
deleted file mode 100644
index 4bdc16b..0000000
--- a/test/MC/MachO/empty-dwarf-lines.s
+++ /dev/null

@@ -1,25 +0,0 @@
-// RUN: llvm-mc -triple x86_64-apple-darwin9 %s -filetype=obj -o - | macho-dump | FileCheck %s
-
-// This tests that when producing files for darwin9 or older we make sure
-// that debug_line sections are of a minimum size to avoid the linker bug
-// described in PR8715.
-
-        .section        __DATA,__data
-        .file   1 "test.c"
-        .globl  _c                      ## @c
-_c:
-        .asciz   "hi\n"
-
-// CHECK:      (('section_name', '__debug_line\x00\x00\x00\x00')
-// CHECK-NEXT:  ('segment_name', '__DWARF\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK-NEXT:  ('address', 4)
-// CHECK-NEXT:  ('size', 44)
-// CHECK-NEXT:  ('offset', 452)
-// CHECK-NEXT:  ('alignment', 0)
-// CHECK-NEXT:  ('reloc_offset', 0)
-// CHECK-NEXT:  ('num_reloc', 0)
-// CHECK-NEXT:  ('flags', 0x2000000)
-// CHECK-NEXT:  ('reserved1', 0)
-// CHECK-NEXT:  ('reserved2', 0)
-// CHECK-NEXT:  ('reserved3', 0)
-// CHECK-NEXT: ),

diff --git a/test/MC/MachO/file.s b/test/MC/MachO/file.s
index 0168747..a7d6c20 100644
--- a/test/MC/MachO/file.s
+++ b/test/MC/MachO/file.s

@@ -1,22 +1,27 @@
-// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | macho-dump --dump-section-data | FileCheck %s
+// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | llvm-readobj -s -section-data | FileCheck %s
 
         .file	1 "dir/foo"
         nop
 
-// CHECK:         ('_section_data', '90')
-// CHECK-NEXT:      # Section 1
-// CHECK-NEXT:     (('section_name', '__debug_line\x00\x00\x00\x00')
-// CHECK-NEXT:      ('segment_name', '__DWARF\x00\x00\x00\x00\x00\x00\x00\x00\x00')
-// CHECK-NEXT:      ('address', 1)
-// CHECK-NEXT:      ('size', 45)
-// CHECK-NEXT:      ('offset', 221)
-// CHECK-NEXT:      ('alignment', 0)
-// CHECK-NEXT:      ('reloc_offset', 0)
-// CHECK-NEXT:      ('num_reloc', 0)
-// CHECK-NEXT:      ('flags', 0x2000000)
-// CHECK-NEXT:      ('reserved1', 0)
-// CHECK-NEXT:      ('reserved2', 0)
-// CHECK-NEXT:     ),
-// CHECK-NEXT:    ('_relocations', [
-// CHECK-NEXT:    ])
-// CHECK-NEXT:    ('_section_data', '29000000 02001e00 00000101 fb0e0d00 01010101 00000001 00000164 69720000 666f6f00 01000000 02000001 01')
+// CHECK:       Section {
+// CHECK:         Index: 1
+// CHECK-NEXT:    Name: __debug_line
+// CHECK-NEXT:    Segment: __DWARF
+// CHECK-NEXT:    Address: 0x1
+// CHECK-NEXT:    Size: 0x28
+// CHECK-NEXT:    Offset: 221
+// CHECK-NEXT:    Alignment: 0
+// CHECK-NEXT:    RelocationOffset: 0x0
+// CHECK-NEXT:    RelocationCount: 0
+// CHECK-NEXT:    Type: 0x0
+// CHECK-NEXT:    Attributes [ (0x20000)
+// CHECK-NEXT:      Debug (0x20000)
+// CHECK-NEXT:    ]
+// CHECK-NEXT:    Reserved1: 0x0
+// CHECK-NEXT:    Reserved2: 0x0
+// CHECK-NEXT:    SectionData (
+// CHECK-NEXT:      0000: 24000000 02001E00 00000101 FB0E0D00
+// CHECK-NEXT:      0010: 01010101 00000001 00000164 69720000
+// CHECK-NEXT:      0020: 666F6F00 01000000
+// CHECK-NEXT:    )
+// CHECK-NEXT:  }

diff --git a/test/MC/MachO/gen-dwarf.s b/test/MC/MachO/gen-dwarf.s
index 997c834..ad0a562 100644
--- a/test/MC/MachO/gen-dwarf.s
+++ b/test/MC/MachO/gen-dwarf.s

@@ -46,12 +46,12 @@
 // CHECK:    DW_AT_name [DW_FORM_string]
 // We don't check the DW_AT_comp_dir which is the current working directory
 // CHECK:    DW_AT_producer [DW_FORM_string]	("llvm-mc (based on {{.*}})")
-// CHECK:    DW_AT_language [DW_FORM_data2]	(0x8001)
+// CHECK:    DW_AT_language [DW_FORM_data2]	(DW_LANG_Mips_Assembler)
 
 // CHECK:    DW_TAG_label [2] *
 // CHECK:      DW_AT_name [DW_FORM_string]	("bar")
-// CHECK:      DW_AT_decl_file [DW_FORM_data4]	(0x00000001)
-// CHECK:      DW_AT_decl_line [DW_FORM_data4]	(0x00000005)
+// CHECK:      DW_AT_decl_file [DW_FORM_data4]	([[FILE:".*gen-dwarf.s"]])
+// CHECK:      DW_AT_decl_line [DW_FORM_data4]	(5)
 // CHECK:      DW_AT_low_pc [DW_FORM_addr]	(0x0000000000000000)
 // CHECK:      DW_AT_prototyped [DW_FORM_flag]	(0x00)
 
@@ -61,8 +61,8 @@
 
 // CHECK:    DW_TAG_label [2] *
 // CHECK:      DW_AT_name [DW_FORM_string]	("foo")
-// CHECK:      DW_AT_decl_file [DW_FORM_data4]	(0x00000001)
-// CHECK:      DW_AT_decl_line [DW_FORM_data4]	(0x00000009)
+// CHECK:      DW_AT_decl_file [DW_FORM_data4]	([[FILE]])
+// CHECK:      DW_AT_decl_line [DW_FORM_data4]	(9)
 // CHECK:      DW_AT_low_pc [DW_FORM_addr]	(0x0000000000000007)
 // CHECK:      DW_AT_prototyped [DW_FORM_flag]	(0x00)
 
@@ -72,8 +72,8 @@
 
 // CHECK:    DW_TAG_label [2] *
 // CHECK:      DW_AT_name [DW_FORM_string]	("baz")
-// CHECK:      DW_AT_decl_file [DW_FORM_data4]	(0x00000001)
-// CHECK:      DW_AT_decl_line [DW_FORM_data4]	(0x0000000a)
+// CHECK:      DW_AT_decl_file [DW_FORM_data4]	([[FILE]])
+// CHECK:      DW_AT_decl_line [DW_FORM_data4]	(10)
 // CHECK:      DW_AT_low_pc [DW_FORM_addr]	(0x0000000000000007)
 // CHECK:      DW_AT_prototyped [DW_FORM_flag]	(0x00)
 

diff --git a/test/MC/MachO/indirect-symbols.s b/test/MC/MachO/indirect-symbols.s
index 90fd231..0795768 100644
--- a/test/MC/MachO/indirect-symbols.s
+++ b/test/MC/MachO/indirect-symbols.s

@@ -97,10 +97,10 @@
 // CHECK:   ('nsyms', 6)
 // CHECK:   ('stroff', 516)
 // CHECK:   ('strsize', 20)
-// CHECK:   ('_string_data', '\x00_d\x00_a\x00_b\x00_c\x00_e\x00_f\x00\x00')
+// CHECK:   ('_string_data', '\x00_f\x00_e\x00_d\x00_c\x00_b\x00_a\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 7)
+// CHECK:    (('n_strx', 13)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -116,7 +116,7 @@
 // CHECK:     ('_string', '_c')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 13)
+// CHECK:    (('n_strx', 4)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -124,7 +124,7 @@
 // CHECK:     ('_string', '_e')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 16)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0x2)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -132,7 +132,7 @@
 // CHECK:     ('_string', '_f')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 4)
+// CHECK:    (('n_strx', 16)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 1)
@@ -140,7 +140,7 @@
 // CHECK:     ('_string', '_a')
 // CHECK:    ),
 // CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 7)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/lcomm-attributes.s b/test/MC/MachO/lcomm-attributes.s
index 1e95924..6e49e80 100644
--- a/test/MC/MachO/lcomm-attributes.s
+++ b/test/MC/MachO/lcomm-attributes.s

@@ -73,10 +73,10 @@
 // CHECK:   ('nsyms', 4)
 // CHECK:   ('stroff', 372)
 // CHECK:   ('strsize', 68)
-// CHECK:   ('_string_data', '\x00sym_lcomm_ext_A\x00sym_lcomm_ext_B\x00sym_zfill_ext_A\x00sym_zfill_ext_B\x00\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00sym_lcomm_ext_B\x00sym_zfill_ext_B\x00sym_lcomm_ext_A\x00sym_zfill_ext_A\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 33)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -84,7 +84,7 @@
 // CHECK:     ('_string', 'sym_lcomm_ext_A')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 17)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -92,7 +92,7 @@
 // CHECK:     ('_string', 'sym_lcomm_ext_B')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 33)
+// CHECK:    (('n_strx', 49)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -100,7 +100,7 @@
 // CHECK:     ('_string', 'sym_zfill_ext_A')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 49)
+// CHECK:    (('n_strx', 17)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/osx-version-min-load-command.s b/test/MC/MachO/osx-version-min-load-command.s
index 2a73609..cb62565 100644
--- a/test/MC/MachO/osx-version-min-load-command.s
+++ b/test/MC/MachO/osx-version-min-load-command.s

@@ -6,5 +6,5 @@
 // CHECK:  (('command', 36)
 // CHECK:   ('size', 16)
 // CHECK:   ('version, 1639169)
-// CHECK:   ('reserved, 0)
+// CHECK:   ('sdk, 0)
 // CHECK:  ),

diff --git a/test/MC/MachO/reloc.s b/test/MC/MachO/reloc.s
index f6a3446..2a6d5db 100644
--- a/test/MC/MachO/reloc.s
+++ b/test/MC/MachO/reloc.s

@@ -181,10 +181,10 @@
 // CHECK:   ('nsyms', 10)
 // CHECK:   ('stroff', 724)
 // CHECK:   ('strsize', 88)
-// CHECK:   ('_string_data', '\x00undef\x00local_a_ext\x00.objc_class_name_A\x00_f1\x00local_a\x00local_a_elt\x00local_b\x00local_c\x00bar\x00_f0\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00local_a_ext\x00local_a_elt\x00bar\x00undef\x00local_c\x00local_b\x00local_a\x00.objc_class_name_A\x00_f1\x00_f0\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 42)
+// CHECK:    (('n_strx', 51)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -192,7 +192,7 @@
 // CHECK:     ('_string', 'local_a')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 50)
+// CHECK:    (('n_strx', 13)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -200,7 +200,7 @@
 // CHECK:     ('_string', 'local_a_elt')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 62)
+// CHECK:    (('n_strx', 43)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -208,7 +208,7 @@
 // CHECK:     ('_string', 'local_b')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 70)
+// CHECK:    (('n_strx', 35)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -216,7 +216,7 @@
 // CHECK:     ('_string', 'local_c')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 78)
+// CHECK:    (('n_strx', 25)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -232,7 +232,7 @@
 // CHECK:     ('_string', '_f0')
 // CHECK:    ),
 // CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 19)
+// CHECK:    (('n_strx', 59)
 // CHECK:     ('n_type', 0x3)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -240,7 +240,7 @@
 // CHECK:     ('_string', '.objc_class_name_A')
 // CHECK:    ),
 // CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 38)
+// CHECK:    (('n_strx', 78)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 128)
@@ -248,7 +248,7 @@
 // CHECK:     ('_string', '_f1')
 // CHECK:    ),
 // CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 7)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -256,7 +256,7 @@
 // CHECK:     ('_string', 'local_a_ext')
 // CHECK:    ),
 // CHECK:     # Symbol 9
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 29)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/section-align-2.s b/test/MC/MachO/section-align-2.s
index e0d7b8d..086fc4a 100644
--- a/test/MC/MachO/section-align-2.s
+++ b/test/MC/MachO/section-align-2.s

@@ -82,10 +82,10 @@
 // CHECK:   ('nsyms', 3)
 // CHECK:   ('stroff', 444)
 // CHECK:   ('strsize', 16)
-// CHECK:   ('_string_data', '\x00foo\x00bar\x00baz\x00\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00baz\x00bar\x00foo\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 9)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -101,7 +101,7 @@
 // CHECK:     ('_string', 'bar')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 9)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/string-table.s b/test/MC/MachO/string-table.s
index 179528e..3a935ee 100644
--- a/test/MC/MachO/string-table.s
+++ b/test/MC/MachO/string-table.s

@@ -53,10 +53,10 @@
 // CHECK:   ('nsyms', 2)
 // CHECK:   ('stroff', 308)
 // CHECK:   ('strsize', 8)
-// CHECK:   ('_string_data', '\x00a\x00b\x00\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00b\x00a\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 3)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -64,7 +64,7 @@
 // CHECK:     ('_string', 'a')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 3)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/symbol-diff.s b/test/MC/MachO/symbol-diff.s
index 1483df1..dae3287 100644
--- a/test/MC/MachO/symbol-diff.s
+++ b/test/MC/MachO/symbol-diff.s

@@ -75,10 +75,10 @@
 // CHECK-NEXT:  ('nsyms', 2)
 // CHECK-NEXT:  ('stroff', 424)
 // CHECK-NEXT:  ('strsize', 12)
-// CHECK-NEXT:  ('_string_data', '\x00_g\x00_g.eh\x00\x00\x00')
+// CHECK-NEXT:  ('_string_data', '\x00_g.eh\x00_g\x00\x00\x00')
 // CHECK-NEXT:  ('_symbols', [
 // CHECK-NEXT:    # Symbol 0
-// CHECK-NEXT:   (('n_strx', 1)
+// CHECK-NEXT:   (('n_strx', 7)
 // CHECK-NEXT:    ('n_type', 0xe)
 // CHECK-NEXT:    ('n_sect', 1)
 // CHECK-NEXT:    ('n_desc', 0)
@@ -86,7 +86,7 @@
 // CHECK-NEXT:    ('_string', '_g')
 // CHECK-NEXT:   ),
 // CHECK-NEXT:    # Symbol 1
-// CHECK-NEXT:   (('n_strx', 4)
+// CHECK-NEXT:   (('n_strx', 1)
 // CHECK-NEXT:    ('n_type', 0xe)
 // CHECK-NEXT:    ('n_sect', 2)
 // CHECK-NEXT:    ('n_desc', 0)

diff --git a/test/MC/MachO/symbol-flags.s b/test/MC/MachO/symbol-flags.s
index 7a4f8e4..561d88a 100644
--- a/test/MC/MachO/symbol-flags.s
+++ b/test/MC/MachO/symbol-flags.s

@@ -118,10 +118,10 @@
 // CHECK:   ('nsyms', 24)
 // CHECK:   ('stroff', 612)
 // CHECK:   ('strsize', 388)
-// CHECK:   ('_string_data', '\x00sym_ref_A\x00sym_ref_def_D\x00sym_ref_def_E\x00sym_weak_ref_A\x00sym_weak_def_A\x00sym_weak_def_B\x00sym_weak_def_C\x00sym_lazy_ref_A\x00sym_lazy_ref_D\x00sym_lazy_ref_E\x00sym_private_ext_A\x00sym_private_ext_B\x00sym_private_ext_C\x00sym_private_ext_D\x00sym_private_ext_E\x00sym_no_dead_strip_A\x00sym_ref_def_A\x00sym_ref_def_C\x00sym_weak_ref_def_A\x00sym_weak_ref_def_B\x00sym_lazy_ref_B\x00sym_lazy_ref_C\x00sym_symbol_resolver_A\x00sym_desc_flags\x00\x00')
+// CHECK:   ('_string_data', '\x00sym_desc_flags\x00sym_private_ext_E\x00sym_lazy_ref_E\x00sym_ref_def_E\x00sym_private_ext_D\x00sym_lazy_ref_D\x00sym_ref_def_D\x00sym_private_ext_C\x00sym_lazy_ref_C\x00sym_weak_def_C\x00sym_ref_def_C\x00sym_private_ext_B\x00sym_lazy_ref_B\x00sym_weak_def_B\x00sym_weak_ref_def_B\x00sym_private_ext_A\x00sym_symbol_resolver_A\x00sym_no_dead_strip_A\x00sym_lazy_ref_A\x00sym_ref_A\x00sym_weak_ref_A\x00sym_weak_def_A\x00sym_ref_def_A\x00sym_weak_ref_def_A\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 254)
+// CHECK:    (('n_strx', 354)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 32)
@@ -129,7 +129,7 @@
 // CHECK:     ('_string', 'sym_ref_def_A')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 268)
+// CHECK:    (('n_strx', 158)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 32)
@@ -137,7 +137,7 @@
 // CHECK:     ('_string', 'sym_ref_def_C')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 282)
+// CHECK:    (('n_strx', 368)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 64)
@@ -145,7 +145,7 @@
 // CHECK:     ('_string', 'sym_weak_ref_def_A')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 301)
+// CHECK:    (('n_strx', 220)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -153,7 +153,7 @@
 // CHECK:     ('_string', 'sym_weak_ref_def_B')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 320)
+// CHECK:    (('n_strx', 190)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 32)
@@ -161,7 +161,7 @@
 // CHECK:     ('_string', 'sym_lazy_ref_B')
 // CHECK:    ),
 // CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 335)
+// CHECK:    (('n_strx', 128)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 32)
@@ -169,7 +169,7 @@
 // CHECK:     ('_string', 'sym_lazy_ref_C')
 // CHECK:    ),
 // CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 350)
+// CHECK:    (('n_strx', 257)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 256)
@@ -177,7 +177,7 @@
 // CHECK:     ('_string', 'sym_symbol_resolver_A')
 // CHECK:    ),
 // CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 372)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 64)
@@ -185,7 +185,7 @@
 // CHECK:     ('_string', 'sym_desc_flags')
 // CHECK:    ),
 // CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 162)
+// CHECK:    (('n_strx', 172)
 // CHECK:     ('n_type', 0x1f)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -193,7 +193,7 @@
 // CHECK:     ('_string', 'sym_private_ext_B')
 // CHECK:    ),
 // CHECK:     # Symbol 9
-// CHECK:    (('n_strx', 180)
+// CHECK:    (('n_strx', 110)
 // CHECK:     ('n_type', 0x1f)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -201,7 +201,7 @@
 // CHECK:     ('_string', 'sym_private_ext_C')
 // CHECK:    ),
 // CHECK:     # Symbol 10
-// CHECK:    (('n_strx', 54)
+// CHECK:    (('n_strx', 339)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 128)
@@ -209,7 +209,7 @@
 // CHECK:     ('_string', 'sym_weak_def_A')
 // CHECK:    ),
 // CHECK:     # Symbol 11
-// CHECK:    (('n_strx', 69)
+// CHECK:    (('n_strx', 205)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 128)
@@ -217,7 +217,7 @@
 // CHECK:     ('_string', 'sym_weak_def_B')
 // CHECK:    ),
 // CHECK:     # Symbol 12
-// CHECK:    (('n_strx', 84)
+// CHECK:    (('n_strx', 143)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 128)
@@ -225,7 +225,7 @@
 // CHECK:     ('_string', 'sym_weak_def_C')
 // CHECK:    ),
 // CHECK:     # Symbol 13
-// CHECK:    (('n_strx', 99)
+// CHECK:    (('n_strx', 299)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 33)
@@ -233,7 +233,7 @@
 // CHECK:     ('_string', 'sym_lazy_ref_A')
 // CHECK:    ),
 // CHECK:     # Symbol 14
-// CHECK:    (('n_strx', 114)
+// CHECK:    (('n_strx', 81)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 32)
@@ -241,7 +241,7 @@
 // CHECK:     ('_string', 'sym_lazy_ref_D')
 // CHECK:    ),
 // CHECK:     # Symbol 15
-// CHECK:    (('n_strx', 129)
+// CHECK:    (('n_strx', 34)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 33)
@@ -249,7 +249,7 @@
 // CHECK:     ('_string', 'sym_lazy_ref_E')
 // CHECK:    ),
 // CHECK:     # Symbol 16
-// CHECK:    (('n_strx', 234)
+// CHECK:    (('n_strx', 279)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 32)
@@ -257,7 +257,7 @@
 // CHECK:     ('_string', 'sym_no_dead_strip_A')
 // CHECK:    ),
 // CHECK:     # Symbol 17
-// CHECK:    (('n_strx', 144)
+// CHECK:    (('n_strx', 239)
 // CHECK:     ('n_type', 0x11)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -265,7 +265,7 @@
 // CHECK:     ('_string', 'sym_private_ext_A')
 // CHECK:    ),
 // CHECK:     # Symbol 18
-// CHECK:    (('n_strx', 198)
+// CHECK:    (('n_strx', 63)
 // CHECK:     ('n_type', 0x11)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -273,7 +273,7 @@
 // CHECK:     ('_string', 'sym_private_ext_D')
 // CHECK:    ),
 // CHECK:     # Symbol 19
-// CHECK:    (('n_strx', 216)
+// CHECK:    (('n_strx', 16)
 // CHECK:     ('n_type', 0x11)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -281,7 +281,7 @@
 // CHECK:     ('_string', 'sym_private_ext_E')
 // CHECK:    ),
 // CHECK:     # Symbol 20
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 314)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 4660)
@@ -289,7 +289,7 @@
 // CHECK:     ('_string', 'sym_ref_A')
 // CHECK:    ),
 // CHECK:     # Symbol 21
-// CHECK:    (('n_strx', 11)
+// CHECK:    (('n_strx', 96)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 32)
@@ -297,7 +297,7 @@
 // CHECK:     ('_string', 'sym_ref_def_D')
 // CHECK:    ),
 // CHECK:     # Symbol 22
-// CHECK:    (('n_strx', 25)
+// CHECK:    (('n_strx', 49)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 32)
@@ -305,7 +305,7 @@
 // CHECK:     ('_string', 'sym_ref_def_E')
 // CHECK:    ),
 // CHECK:     # Symbol 23
-// CHECK:    (('n_strx', 39)
+// CHECK:    (('n_strx', 324)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 64)

diff --git a/test/MC/MachO/symbol-indirect.s b/test/MC/MachO/symbol-indirect.s
index 2412970..1cdeed1 100644
--- a/test/MC/MachO/symbol-indirect.s
+++ b/test/MC/MachO/symbol-indirect.s

@@ -137,10 +137,10 @@
 // CHECK:   ('nsyms', 10)
 // CHECK:   ('stroff', 592)
 // CHECK:   ('strsize', 104)
-// CHECK:   ('_string_data', '\x00sym_lsp_A\x00sym_lsp_G\x00sym_nlp_A\x00sym_nlp_G\x00sym_nlp_B\x00sym_nlp_E\x00sym_lsp_B\x00sym_lsp_E\x00sym_lsp_C\x00sym_nlp_C\x00\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00sym_lsp_G\x00sym_nlp_G\x00sym_lsp_E\x00sym_nlp_E\x00sym_lsp_C\x00sym_nlp_C\x00sym_lsp_B\x00sym_nlp_B\x00sym_lsp_A\x00sym_nlp_A\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 81)
+// CHECK:    (('n_strx', 41)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -148,7 +148,7 @@
 // CHECK:     ('_string', 'sym_lsp_C')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 91)
+// CHECK:    (('n_strx', 51)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -156,7 +156,7 @@
 // CHECK:     ('_string', 'sym_nlp_C')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 11)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -164,7 +164,7 @@
 // CHECK:     ('_string', 'sym_lsp_G')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 31)
+// CHECK:    (('n_strx', 11)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -172,7 +172,7 @@
 // CHECK:     ('_string', 'sym_nlp_G')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 81)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -188,7 +188,7 @@
 // CHECK:     ('_string', 'sym_lsp_B')
 // CHECK:    ),
 // CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 71)
+// CHECK:    (('n_strx', 21)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 1)
@@ -196,7 +196,7 @@
 // CHECK:     ('_string', 'sym_lsp_E')
 // CHECK:    ),
 // CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 21)
+// CHECK:    (('n_strx', 91)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -204,7 +204,7 @@
 // CHECK:     ('_string', 'sym_nlp_A')
 // CHECK:    ),
 // CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 41)
+// CHECK:    (('n_strx', 71)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)
@@ -212,7 +212,7 @@
 // CHECK:     ('_string', 'sym_nlp_B')
 // CHECK:    ),
 // CHECK:     # Symbol 9
-// CHECK:    (('n_strx', 51)
+// CHECK:    (('n_strx', 31)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/symbols-1.s b/test/MC/MachO/symbols-1.s
index cf05afa..8b663dc 100644
--- a/test/MC/MachO/symbols-1.s
+++ b/test/MC/MachO/symbols-1.s

@@ -59,10 +59,10 @@
 // CHECK-X86_32:   ('nsyms', 9)
 // CHECK-X86_32:   ('stroff', 368)
 // CHECK-X86_32:   ('strsize', 140)
-// CHECK-X86_32:   ('_string_data', '\x00sym_globl_def_B\x00sym_globl_undef_B\x00sym_globl_def_A\x00sym_globl_undef_A\x00sym_globl_def_C\x00sym_globl_undef_C\x00sym_local_B\x00sym_local_A\x00sym_local_C\x00\x00')
+// CHECK-X86_32:   ('_string_data', '\x00sym_local_C\x00sym_globl_undef_C\x00sym_globl_def_C\x00sym_local_B\x00sym_globl_undef_B\x00sym_globl_def_B\x00sym_local_A\x00sym_globl_undef_A\x00sym_globl_def_A\x00\x00')
 // CHECK-X86_32:   ('_symbols', [
 // CHECK-X86_32:     # Symbol 0
-// CHECK-X86_32:    (('n_strx', 103)
+// CHECK-X86_32:    (('n_strx', 47)
 // CHECK-X86_32:     ('n_type', 0xe)
 // CHECK-X86_32:     ('n_sect', 1)
 // CHECK-X86_32:     ('n_desc', 0)
@@ -70,7 +70,7 @@
 // CHECK-X86_32:     ('_string', 'sym_local_B')
 // CHECK-X86_32:    ),
 // CHECK-X86_32:     # Symbol 1
-// CHECK-X86_32:    (('n_strx', 115)
+// CHECK-X86_32:    (('n_strx', 93)
 // CHECK-X86_32:     ('n_type', 0xe)
 // CHECK-X86_32:     ('n_sect', 1)
 // CHECK-X86_32:     ('n_desc', 0)
@@ -78,7 +78,7 @@
 // CHECK-X86_32:     ('_string', 'sym_local_A')
 // CHECK-X86_32:    ),
 // CHECK-X86_32:     # Symbol 2
-// CHECK-X86_32:    (('n_strx', 127)
+// CHECK-X86_32:    (('n_strx', 1)
 // CHECK-X86_32:     ('n_type', 0xe)
 // CHECK-X86_32:     ('n_sect', 1)
 // CHECK-X86_32:     ('n_desc', 0)
@@ -86,7 +86,7 @@
 // CHECK-X86_32:     ('_string', 'sym_local_C')
 // CHECK-X86_32:    ),
 // CHECK-X86_32:     # Symbol 3
-// CHECK-X86_32:    (('n_strx', 35)
+// CHECK-X86_32:    (('n_strx', 123)
 // CHECK-X86_32:     ('n_type', 0xf)
 // CHECK-X86_32:     ('n_sect', 1)
 // CHECK-X86_32:     ('n_desc', 0)
@@ -94,7 +94,7 @@
 // CHECK-X86_32:     ('_string', 'sym_globl_def_A')
 // CHECK-X86_32:    ),
 // CHECK-X86_32:     # Symbol 4
-// CHECK-X86_32:    (('n_strx', 1)
+// CHECK-X86_32:    (('n_strx', 77)
 // CHECK-X86_32:     ('n_type', 0xf)
 // CHECK-X86_32:     ('n_sect', 1)
 // CHECK-X86_32:     ('n_desc', 0)
@@ -102,7 +102,7 @@
 // CHECK-X86_32:     ('_string', 'sym_globl_def_B')
 // CHECK-X86_32:    ),
 // CHECK-X86_32:     # Symbol 5
-// CHECK-X86_32:    (('n_strx', 69)
+// CHECK-X86_32:    (('n_strx', 31)
 // CHECK-X86_32:     ('n_type', 0xf)
 // CHECK-X86_32:     ('n_sect', 1)
 // CHECK-X86_32:     ('n_desc', 0)
@@ -110,7 +110,7 @@
 // CHECK-X86_32:     ('_string', 'sym_globl_def_C')
 // CHECK-X86_32:    ),
 // CHECK-X86_32:     # Symbol 6
-// CHECK-X86_32:    (('n_strx', 51)
+// CHECK-X86_32:    (('n_strx', 105)
 // CHECK-X86_32:     ('n_type', 0x1)
 // CHECK-X86_32:     ('n_sect', 0)
 // CHECK-X86_32:     ('n_desc', 0)
@@ -118,7 +118,7 @@
 // CHECK-X86_32:     ('_string', 'sym_globl_undef_A')
 // CHECK-X86_32:    ),
 // CHECK-X86_32:     # Symbol 7
-// CHECK-X86_32:    (('n_strx', 17)
+// CHECK-X86_32:    (('n_strx', 59)
 // CHECK-X86_32:     ('n_type', 0x1)
 // CHECK-X86_32:     ('n_sect', 0)
 // CHECK-X86_32:     ('n_desc', 0)
@@ -126,7 +126,7 @@
 // CHECK-X86_32:     ('_string', 'sym_globl_undef_B')
 // CHECK-X86_32:    ),
 // CHECK-X86_32:     # Symbol 8
-// CHECK-X86_32:    (('n_strx', 85)
+// CHECK-X86_32:    (('n_strx', 13)
 // CHECK-X86_32:     ('n_type', 0x1)
 // CHECK-X86_32:     ('n_sect', 0)
 // CHECK-X86_32:     ('n_desc', 0)
@@ -207,10 +207,10 @@
 // CHECK-X86_64:   ('nsyms', 9)
 // CHECK-X86_64:   ('stroff', 436)
 // CHECK-X86_64:   ('strsize', 140)
-// CHECK-X86_64:   ('_string_data', '\x00sym_globl_def_B\x00sym_globl_undef_B\x00sym_globl_def_A\x00sym_globl_undef_A\x00sym_globl_def_C\x00sym_globl_undef_C\x00sym_local_B\x00sym_local_A\x00sym_local_C\x00\x00')
+// CHECK-X86_64:   ('_string_data', '\x00sym_local_C\x00sym_globl_undef_C\x00sym_globl_def_C\x00sym_local_B\x00sym_globl_undef_B\x00sym_globl_def_B\x00sym_local_A\x00sym_globl_undef_A\x00sym_globl_def_A\x00\x00')
 // CHECK-X86_64:   ('_symbols', [
 // CHECK-X86_64:     # Symbol 0
-// CHECK-X86_64:    (('n_strx', 103)
+// CHECK-X86_64:    (('n_strx', 47)
 // CHECK-X86_64:     ('n_type', 0xe)
 // CHECK-X86_64:     ('n_sect', 1)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -218,7 +218,7 @@
 // CHECK-X86_64:     ('_string', 'sym_local_B')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 1
-// CHECK-X86_64:    (('n_strx', 115)
+// CHECK-X86_64:    (('n_strx', 93)
 // CHECK-X86_64:     ('n_type', 0xe)
 // CHECK-X86_64:     ('n_sect', 1)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -226,7 +226,7 @@
 // CHECK-X86_64:     ('_string', 'sym_local_A')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 2
-// CHECK-X86_64:    (('n_strx', 127)
+// CHECK-X86_64:    (('n_strx', 1)
 // CHECK-X86_64:     ('n_type', 0xe)
 // CHECK-X86_64:     ('n_sect', 1)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -234,7 +234,7 @@
 // CHECK-X86_64:     ('_string', 'sym_local_C')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 3
-// CHECK-X86_64:    (('n_strx', 35)
+// CHECK-X86_64:    (('n_strx', 123)
 // CHECK-X86_64:     ('n_type', 0xf)
 // CHECK-X86_64:     ('n_sect', 1)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -242,7 +242,7 @@
 // CHECK-X86_64:     ('_string', 'sym_globl_def_A')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 4
-// CHECK-X86_64:    (('n_strx', 1)
+// CHECK-X86_64:    (('n_strx', 77)
 // CHECK-X86_64:     ('n_type', 0xf)
 // CHECK-X86_64:     ('n_sect', 1)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -250,7 +250,7 @@
 // CHECK-X86_64:     ('_string', 'sym_globl_def_B')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 5
-// CHECK-X86_64:    (('n_strx', 69)
+// CHECK-X86_64:    (('n_strx', 31)
 // CHECK-X86_64:     ('n_type', 0xf)
 // CHECK-X86_64:     ('n_sect', 1)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -258,7 +258,7 @@
 // CHECK-X86_64:     ('_string', 'sym_globl_def_C')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 6
-// CHECK-X86_64:    (('n_strx', 51)
+// CHECK-X86_64:    (('n_strx', 105)
 // CHECK-X86_64:     ('n_type', 0x1)
 // CHECK-X86_64:     ('n_sect', 0)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -266,7 +266,7 @@
 // CHECK-X86_64:     ('_string', 'sym_globl_undef_A')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 7
-// CHECK-X86_64:    (('n_strx', 17)
+// CHECK-X86_64:    (('n_strx', 59)
 // CHECK-X86_64:     ('n_type', 0x1)
 // CHECK-X86_64:     ('n_sect', 0)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -274,7 +274,7 @@
 // CHECK-X86_64:     ('_string', 'sym_globl_undef_B')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 8
-// CHECK-X86_64:    (('n_strx', 85)
+// CHECK-X86_64:    (('n_strx', 13)
 // CHECK-X86_64:     ('n_type', 0x1)
 // CHECK-X86_64:     ('n_sect', 0)
 // CHECK-X86_64:     ('n_desc', 0)

diff --git a/test/MC/MachO/tbss.s b/test/MC/MachO/tbss.s
index 8eae142..1c23aa5 100644
--- a/test/MC/MachO/tbss.s
+++ b/test/MC/MachO/tbss.s

@@ -67,10 +67,10 @@
 // CHECK:   ('nsyms', 2)
 // CHECK:   ('stroff', 400)
 // CHECK:   ('strsize', 28)
-// CHECK:   ('_string_data', '\x00_a$tlv$init\x00_b$tlv$init\x00\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00_b$tlv$init\x00_a$tlv$init\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 13)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -78,7 +78,7 @@
 // CHECK:     ('_string', '_a$tlv$init')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 13)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/tls.s b/test/MC/MachO/tls.s
index 44b61be..33e23a9 100644
--- a/test/MC/MachO/tls.s
+++ b/test/MC/MachO/tls.s

@@ -167,10 +167,10 @@
 // CHECK:   ('nsyms', 9)
 // CHECK:   ('stroff', 840)
 // CHECK:   ('strsize', 80)
-// CHECK:   ('_string_data', '\x00_c$tlv$init\x00_c\x00___tlv_bootstrap\x00_d$tlv$init\x00_d\x00_a\x00_b\x00_a$tlv$init\x00_b$tlv$init\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00_d$tlv$init\x00_c$tlv$init\x00_b$tlv$init\x00_a$tlv$init\x00___tlv_bootstrap\x00_d\x00_c\x00_b\x00_a\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 54)
+// CHECK:    (('n_strx', 37)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -178,7 +178,7 @@
 // CHECK:     ('_string', '_a$tlv$init')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 66)
+// CHECK:    (('n_strx', 25)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -186,7 +186,7 @@
 // CHECK:     ('_string', '_b$tlv$init')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 48)
+// CHECK:    (('n_strx', 75)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -194,7 +194,7 @@
 // CHECK:     ('_string', '_a')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 51)
+// CHECK:    (('n_strx', 72)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -202,7 +202,7 @@
 // CHECK:     ('_string', '_b')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 13)
+// CHECK:    (('n_strx', 69)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -210,7 +210,7 @@
 // CHECK:     ('_string', '_c')
 // CHECK:    ),
 // CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 13)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -218,7 +218,7 @@
 // CHECK:     ('_string', '_c$tlv$init')
 // CHECK:    ),
 // CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 45)
+// CHECK:    (('n_strx', 66)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -226,7 +226,7 @@
 // CHECK:     ('_string', '_d')
 // CHECK:    ),
 // CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 33)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -234,7 +234,7 @@
 // CHECK:     ('_string', '_d$tlv$init')
 // CHECK:    ),
 // CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 16)
+// CHECK:    (('n_strx', 49)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/tlv-reloc.s b/test/MC/MachO/tlv-reloc.s
index d111241..80e0565 100644
--- a/test/MC/MachO/tlv-reloc.s
+++ b/test/MC/MachO/tlv-reloc.s

@@ -111,10 +111,10 @@
 // CHECK:   ('nsyms', 4)
 // CHECK:   ('stroff', 576)
 // CHECK:   ('strsize', 40)
-// CHECK:   ('_string_data', '\x00_a\x00__tlv_bootstrap\x00_foo\x00_a$tlv$init\x00\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00_a$tlv$init\x00__tlv_bootstrap\x00_foo\x00_a\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 25)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -122,7 +122,7 @@
 // CHECK:     ('_string', '_a$tlv$init')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 34)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -130,7 +130,7 @@
 // CHECK:     ('_string', '_a')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 20)
+// CHECK:    (('n_strx', 29)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -138,7 +138,7 @@
 // CHECK:     ('_string', '_foo')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 4)
+// CHECK:    (('n_strx', 13)
 // CHECK:     ('n_type', 0x1)
 // CHECK:     ('n_sect', 0)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/variable-exprs.s b/test/MC/MachO/variable-exprs.s
index a7fa45d..ac781ef 100644
--- a/test/MC/MachO/variable-exprs.s
+++ b/test/MC/MachO/variable-exprs.s

@@ -134,10 +134,10 @@
 // CHECK-I386:   ('nsyms', 10)
 // CHECK-I386:   ('stroff', 576)
 // CHECK-I386:   ('strsize', 24)
-// CHECK-I386:   ('_string_data', '\x00d2\x00d\x00d3\x00a\x00b\x00c\x00e\x00g\x00f\x00t0\x00')
+// CHECK-I386:   ('_string_data', '\x00g\x00f\x00e\x00d\x00c\x00b\x00a\x00d3\x00d2\x00t0\x00')
 // CHECK-I386:   ('_symbols', [
 // CHECK-I386:     # Symbol 0
-// CHECK-I386:    (('n_strx', 9)
+// CHECK-I386:    (('n_strx', 13)
 // CHECK-I386:     ('n_type', 0xe)
 // CHECK-I386:     ('n_sect', 2)
 // CHECK-I386:     ('n_desc', 0)
@@ -153,7 +153,7 @@
 // CHECK-I386:     ('_string', 'b')
 // CHECK-I386:    ),
 // CHECK-I386:     # Symbol 2
-// CHECK-I386:    (('n_strx', 13)
+// CHECK-I386:    (('n_strx', 9)
 // CHECK-I386:     ('n_type', 0xe)
 // CHECK-I386:     ('n_sect', 2)
 // CHECK-I386:     ('n_desc', 0)
@@ -161,7 +161,7 @@
 // CHECK-I386:     ('_string', 'c')
 // CHECK-I386:    ),
 // CHECK-I386:     # Symbol 3
-// CHECK-I386:    (('n_strx', 15)
+// CHECK-I386:    (('n_strx', 5)
 // CHECK-I386:     ('n_type', 0xe)
 // CHECK-I386:     ('n_sect', 2)
 // CHECK-I386:     ('n_desc', 0)
@@ -169,7 +169,7 @@
 // CHECK-I386:     ('_string', 'e')
 // CHECK-I386:    ),
 // CHECK-I386:     # Symbol 4
-// CHECK-I386:    (('n_strx', 17)
+// CHECK-I386:    (('n_strx', 1)
 // CHECK-I386:     ('n_type', 0xe)
 // CHECK-I386:     ('n_sect', 2)
 // CHECK-I386:     ('n_desc', 0)
@@ -177,7 +177,7 @@
 // CHECK-I386:     ('_string', 'g')
 // CHECK-I386:    ),
 // CHECK-I386:     # Symbol 5
-// CHECK-I386:    (('n_strx', 19)
+// CHECK-I386:    (('n_strx', 3)
 // CHECK-I386:     ('n_type', 0xe)
 // CHECK-I386:     ('n_sect', 2)
 // CHECK-I386:     ('n_desc', 0)
@@ -193,7 +193,7 @@
 // CHECK-I386:     ('_string', 't0')
 // CHECK-I386:    ),
 // CHECK-I386:     # Symbol 7
-// CHECK-I386:    (('n_strx', 4)
+// CHECK-I386:    (('n_strx', 7)
 // CHECK-I386:     ('n_type', 0x1)
 // CHECK-I386:     ('n_sect', 0)
 // CHECK-I386:     ('n_desc', 0)
@@ -201,15 +201,15 @@
 // CHECK-I386:     ('_string', 'd')
 // CHECK-I386:    ),
 // CHECK-I386:     # Symbol 8
-// CHECK-I386:    (('n_strx', 1)
+// CHECK-I386:    (('n_strx', 18)
 // CHECK-I386:     ('n_type', 0xb)
 // CHECK-I386:     ('n_sect', 0)
 // CHECK-I386:     ('n_desc', 0)
-// CHECK-I386:     ('n_value', 4)
+// CHECK-I386:     ('n_value', 7)
 // CHECK-I386:     ('_string', 'd2')
 // CHECK-I386:    ),
 // CHECK-I386:     # Symbol 9
-// CHECK-I386:    (('n_strx', 6)
+// CHECK-I386:    (('n_strx', 15)
 // CHECK-I386:     ('n_type', 0x1)
 // CHECK-I386:     ('n_sect', 0)
 // CHECK-I386:     ('n_desc', 0)
@@ -335,10 +335,10 @@
 // CHECK-X86_64:   ('nsyms', 10)
 // CHECK-X86_64:   ('stroff', 660)
 // CHECK-X86_64:   ('strsize', 24)
-// CHECK-X86_64:   ('_string_data', '\x00d2\x00d\x00d3\x00a\x00b\x00c\x00e\x00g\x00f\x00t0\x00')
+// CHECK-X86_64:   ('_string_data', '\x00g\x00f\x00e\x00d\x00c\x00b\x00a\x00d3\x00d2\x00t0\x00')
 // CHECK-X86_64:   ('_symbols', [
 // CHECK-X86_64:     # Symbol 0
-// CHECK-X86_64:    (('n_strx', 9)
+// CHECK-X86_64:    (('n_strx', 13)
 // CHECK-X86_64:     ('n_type', 0xe)
 // CHECK-X86_64:     ('n_sect', 2)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -354,7 +354,7 @@
 // CHECK-X86_64:     ('_string', 'b')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 2
-// CHECK-X86_64:    (('n_strx', 13)
+// CHECK-X86_64:    (('n_strx', 9)
 // CHECK-X86_64:     ('n_type', 0xe)
 // CHECK-X86_64:     ('n_sect', 2)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -362,7 +362,7 @@
 // CHECK-X86_64:     ('_string', 'c')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 3
-// CHECK-X86_64:    (('n_strx', 15)
+// CHECK-X86_64:    (('n_strx', 5)
 // CHECK-X86_64:     ('n_type', 0xe)
 // CHECK-X86_64:     ('n_sect', 2)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -370,7 +370,7 @@
 // CHECK-X86_64:     ('_string', 'e')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 4
-// CHECK-X86_64:    (('n_strx', 17)
+// CHECK-X86_64:    (('n_strx', 1)
 // CHECK-X86_64:     ('n_type', 0xe)
 // CHECK-X86_64:     ('n_sect', 2)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -378,7 +378,7 @@
 // CHECK-X86_64:     ('_string', 'g')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 5
-// CHECK-X86_64:    (('n_strx', 19)
+// CHECK-X86_64:    (('n_strx', 3)
 // CHECK-X86_64:     ('n_type', 0xe)
 // CHECK-X86_64:     ('n_sect', 2)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -394,7 +394,7 @@
 // CHECK-X86_64:     ('_string', 't0')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 7
-// CHECK-X86_64:    (('n_strx', 4)
+// CHECK-X86_64:    (('n_strx', 7)
 // CHECK-X86_64:     ('n_type', 0x1)
 // CHECK-X86_64:     ('n_sect', 0)
 // CHECK-X86_64:     ('n_desc', 0)
@@ -402,15 +402,15 @@
 // CHECK-X86_64:     ('_string', 'd')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 8
-// CHECK-X86_64:    (('n_strx', 1)
+// CHECK-X86_64:    (('n_strx', 18)
 // CHECK-X86_64:     ('n_type', 0xb)
 // CHECK-X86_64:     ('n_sect', 0)
 // CHECK-X86_64:     ('n_desc', 0)
-// CHECK-X86_64:     ('n_value', 4)
+// CHECK-X86_64:     ('n_value', 7)
 // CHECK-X86_64:     ('_string', 'd2')
 // CHECK-X86_64:    ),
 // CHECK-X86_64:     # Symbol 9
-// CHECK-X86_64:    (('n_strx', 6)
+// CHECK-X86_64:    (('n_strx', 15)
 // CHECK-X86_64:     ('n_type', 0x1)
 // CHECK-X86_64:     ('n_sect', 0)
 // CHECK-X86_64:     ('n_desc', 0)

diff --git a/test/MC/MachO/x86_32-symbols.s b/test/MC/MachO/x86_32-symbols.s
index 35ada35..95aa507 100644
--- a/test/MC/MachO/x86_32-symbols.s
+++ b/test/MC/MachO/x86_32-symbols.s

@@ -690,10 +690,10 @@
 // CHECK:   ('nsyms', 40)
 // CHECK:   ('stroff', 3116)
 // CHECK:   ('strsize', 152)
-// CHECK:   ('_string_data', '\x00D0\x00D1\x00D2\x00D3\x00D4\x00D5\x00D6\x00D7\x00D8\x00D9\x00D10\x00D11\x00D12\x00D13\x00D14\x00D15\x00D16\x00D17\x00D18\x00D19\x00D20\x00D21\x00D22\x00D23\x00D24\x00D25\x00D26\x00D27\x00D28\x00D29\x00D30\x00D31\x00D32\x00D33\x00D34\x00D35\x00D36\x00D37\x00D38\x00D39\x00\x00')
+// CHECK:   ('_string_data', '\x00D9\x00D39\x00D29\x00D19\x00D8\x00D38\x00D28\x00D18\x00D7\x00D37\x00D27\x00D17\x00D6\x00D36\x00D26\x00D16\x00D5\x00D35\x00D25\x00D15\x00D4\x00D34\x00D24\x00D14\x00D3\x00D33\x00D23\x00D13\x00D2\x00D32\x00D22\x00D12\x00D1\x00D31\x00D21\x00D11\x00D0\x00D30\x00D20\x00D10\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 136)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -701,7 +701,7 @@
 // CHECK:     ('_string', 'D0')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 4)
+// CHECK:    (('n_strx', 121)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -709,7 +709,7 @@
 // CHECK:     ('_string', 'D1')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 7)
+// CHECK:    (('n_strx', 106)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -717,7 +717,7 @@
 // CHECK:     ('_string', 'D2')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 10)
+// CHECK:    (('n_strx', 91)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -725,7 +725,7 @@
 // CHECK:     ('_string', 'D3')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 13)
+// CHECK:    (('n_strx', 76)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -733,7 +733,7 @@
 // CHECK:     ('_string', 'D4')
 // CHECK:    ),
 // CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 16)
+// CHECK:    (('n_strx', 61)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 5)
 // CHECK:     ('n_desc', 0)
@@ -741,7 +741,7 @@
 // CHECK:     ('_string', 'D5')
 // CHECK:    ),
 // CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 19)
+// CHECK:    (('n_strx', 46)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 6)
 // CHECK:     ('n_desc', 0)
@@ -749,7 +749,7 @@
 // CHECK:     ('_string', 'D6')
 // CHECK:    ),
 // CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 22)
+// CHECK:    (('n_strx', 31)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 7)
 // CHECK:     ('n_desc', 0)
@@ -757,7 +757,7 @@
 // CHECK:     ('_string', 'D7')
 // CHECK:    ),
 // CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 25)
+// CHECK:    (('n_strx', 16)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 8)
 // CHECK:     ('n_desc', 0)
@@ -765,7 +765,7 @@
 // CHECK:     ('_string', 'D8')
 // CHECK:    ),
 // CHECK:     # Symbol 9
-// CHECK:    (('n_strx', 28)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 9)
 // CHECK:     ('n_desc', 0)
@@ -773,7 +773,7 @@
 // CHECK:     ('_string', 'D9')
 // CHECK:    ),
 // CHECK:     # Symbol 10
-// CHECK:    (('n_strx', 31)
+// CHECK:    (('n_strx', 147)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 10)
 // CHECK:     ('n_desc', 0)
@@ -781,7 +781,7 @@
 // CHECK:     ('_string', 'D10')
 // CHECK:    ),
 // CHECK:     # Symbol 11
-// CHECK:    (('n_strx', 35)
+// CHECK:    (('n_strx', 132)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 11)
 // CHECK:     ('n_desc', 0)
@@ -789,7 +789,7 @@
 // CHECK:     ('_string', 'D11')
 // CHECK:    ),
 // CHECK:     # Symbol 12
-// CHECK:    (('n_strx', 39)
+// CHECK:    (('n_strx', 117)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 12)
 // CHECK:     ('n_desc', 0)
@@ -797,7 +797,7 @@
 // CHECK:     ('_string', 'D12')
 // CHECK:    ),
 // CHECK:     # Symbol 13
-// CHECK:    (('n_strx', 43)
+// CHECK:    (('n_strx', 102)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 13)
 // CHECK:     ('n_desc', 0)
@@ -805,7 +805,7 @@
 // CHECK:     ('_string', 'D13')
 // CHECK:    ),
 // CHECK:     # Symbol 14
-// CHECK:    (('n_strx', 47)
+// CHECK:    (('n_strx', 87)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 14)
 // CHECK:     ('n_desc', 0)
@@ -813,7 +813,7 @@
 // CHECK:     ('_string', 'D14')
 // CHECK:    ),
 // CHECK:     # Symbol 15
-// CHECK:    (('n_strx', 51)
+// CHECK:    (('n_strx', 72)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 15)
 // CHECK:     ('n_desc', 0)
@@ -821,7 +821,7 @@
 // CHECK:     ('_string', 'D15')
 // CHECK:    ),
 // CHECK:     # Symbol 16
-// CHECK:    (('n_strx', 55)
+// CHECK:    (('n_strx', 57)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 16)
 // CHECK:     ('n_desc', 0)
@@ -829,7 +829,7 @@
 // CHECK:     ('_string', 'D16')
 // CHECK:    ),
 // CHECK:     # Symbol 17
-// CHECK:    (('n_strx', 59)
+// CHECK:    (('n_strx', 42)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 17)
 // CHECK:     ('n_desc', 0)
@@ -837,7 +837,7 @@
 // CHECK:     ('_string', 'D17')
 // CHECK:    ),
 // CHECK:     # Symbol 18
-// CHECK:    (('n_strx', 63)
+// CHECK:    (('n_strx', 27)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 18)
 // CHECK:     ('n_desc', 0)
@@ -845,7 +845,7 @@
 // CHECK:     ('_string', 'D18')
 // CHECK:    ),
 // CHECK:     # Symbol 19
-// CHECK:    (('n_strx', 67)
+// CHECK:    (('n_strx', 12)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 19)
 // CHECK:     ('n_desc', 0)
@@ -853,7 +853,7 @@
 // CHECK:     ('_string', 'D19')
 // CHECK:    ),
 // CHECK:     # Symbol 20
-// CHECK:    (('n_strx', 71)
+// CHECK:    (('n_strx', 143)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 20)
 // CHECK:     ('n_desc', 0)
@@ -861,7 +861,7 @@
 // CHECK:     ('_string', 'D20')
 // CHECK:    ),
 // CHECK:     # Symbol 21
-// CHECK:    (('n_strx', 75)
+// CHECK:    (('n_strx', 128)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 21)
 // CHECK:     ('n_desc', 0)
@@ -869,7 +869,7 @@
 // CHECK:     ('_string', 'D21')
 // CHECK:    ),
 // CHECK:     # Symbol 22
-// CHECK:    (('n_strx', 79)
+// CHECK:    (('n_strx', 113)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 22)
 // CHECK:     ('n_desc', 0)
@@ -877,7 +877,7 @@
 // CHECK:     ('_string', 'D22')
 // CHECK:    ),
 // CHECK:     # Symbol 23
-// CHECK:    (('n_strx', 83)
+// CHECK:    (('n_strx', 98)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 23)
 // CHECK:     ('n_desc', 0)
@@ -885,7 +885,7 @@
 // CHECK:     ('_string', 'D23')
 // CHECK:    ),
 // CHECK:     # Symbol 24
-// CHECK:    (('n_strx', 87)
+// CHECK:    (('n_strx', 83)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 24)
 // CHECK:     ('n_desc', 0)
@@ -893,7 +893,7 @@
 // CHECK:     ('_string', 'D24')
 // CHECK:    ),
 // CHECK:     # Symbol 25
-// CHECK:    (('n_strx', 91)
+// CHECK:    (('n_strx', 68)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 25)
 // CHECK:     ('n_desc', 0)
@@ -901,7 +901,7 @@
 // CHECK:     ('_string', 'D25')
 // CHECK:    ),
 // CHECK:     # Symbol 26
-// CHECK:    (('n_strx', 95)
+// CHECK:    (('n_strx', 53)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 26)
 // CHECK:     ('n_desc', 0)
@@ -909,7 +909,7 @@
 // CHECK:     ('_string', 'D26')
 // CHECK:    ),
 // CHECK:     # Symbol 27
-// CHECK:    (('n_strx', 99)
+// CHECK:    (('n_strx', 38)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 27)
 // CHECK:     ('n_desc', 0)
@@ -917,7 +917,7 @@
 // CHECK:     ('_string', 'D27')
 // CHECK:    ),
 // CHECK:     # Symbol 28
-// CHECK:    (('n_strx', 103)
+// CHECK:    (('n_strx', 23)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 28)
 // CHECK:     ('n_desc', 0)
@@ -925,7 +925,7 @@
 // CHECK:     ('_string', 'D28')
 // CHECK:    ),
 // CHECK:     # Symbol 29
-// CHECK:    (('n_strx', 107)
+// CHECK:    (('n_strx', 8)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 29)
 // CHECK:     ('n_desc', 0)
@@ -933,7 +933,7 @@
 // CHECK:     ('_string', 'D29')
 // CHECK:    ),
 // CHECK:     # Symbol 30
-// CHECK:    (('n_strx', 111)
+// CHECK:    (('n_strx', 139)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 30)
 // CHECK:     ('n_desc', 0)
@@ -941,7 +941,7 @@
 // CHECK:     ('_string', 'D30')
 // CHECK:    ),
 // CHECK:     # Symbol 31
-// CHECK:    (('n_strx', 115)
+// CHECK:    (('n_strx', 124)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 31)
 // CHECK:     ('n_desc', 0)
@@ -949,7 +949,7 @@
 // CHECK:     ('_string', 'D31')
 // CHECK:    ),
 // CHECK:     # Symbol 32
-// CHECK:    (('n_strx', 119)
+// CHECK:    (('n_strx', 109)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 32)
 // CHECK:     ('n_desc', 0)
@@ -957,7 +957,7 @@
 // CHECK:     ('_string', 'D32')
 // CHECK:    ),
 // CHECK:     # Symbol 33
-// CHECK:    (('n_strx', 123)
+// CHECK:    (('n_strx', 94)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 33)
 // CHECK:     ('n_desc', 0)
@@ -965,7 +965,7 @@
 // CHECK:     ('_string', 'D33')
 // CHECK:    ),
 // CHECK:     # Symbol 34
-// CHECK:    (('n_strx', 127)
+// CHECK:    (('n_strx', 79)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 34)
 // CHECK:     ('n_desc', 0)
@@ -973,7 +973,7 @@
 // CHECK:     ('_string', 'D34')
 // CHECK:    ),
 // CHECK:     # Symbol 35
-// CHECK:    (('n_strx', 131)
+// CHECK:    (('n_strx', 64)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -981,7 +981,7 @@
 // CHECK:     ('_string', 'D35')
 // CHECK:    ),
 // CHECK:     # Symbol 36
-// CHECK:    (('n_strx', 135)
+// CHECK:    (('n_strx', 49)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -989,7 +989,7 @@
 // CHECK:     ('_string', 'D36')
 // CHECK:    ),
 // CHECK:     # Symbol 37
-// CHECK:    (('n_strx', 139)
+// CHECK:    (('n_strx', 34)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -997,7 +997,7 @@
 // CHECK:     ('_string', 'D37')
 // CHECK:    ),
 // CHECK:     # Symbol 38
-// CHECK:    (('n_strx', 143)
+// CHECK:    (('n_strx', 19)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 35)
 // CHECK:     ('n_desc', 0)
@@ -1005,7 +1005,7 @@
 // CHECK:     ('_string', 'D38')
 // CHECK:    ),
 // CHECK:     # Symbol 39
-// CHECK:    (('n_strx', 147)
+// CHECK:    (('n_strx', 4)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 36)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/x86_64-symbols.s b/test/MC/MachO/x86_64-symbols.s
index 804cee8..9788feb 100644
--- a/test/MC/MachO/x86_64-symbols.s
+++ b/test/MC/MachO/x86_64-symbols.s

@@ -647,10 +647,10 @@
 // CHECK:   ('nsyms', 40)
 // CHECK:   ('stroff', 3328)
 // CHECK:   ('strsize', 152)
-// CHECK:   ('_string_data', '\x00D0\x00D1\x00D2\x00D3\x00L4\x00D4\x00D5\x00D6\x00D7\x00D8\x00D9\x00D12\x00D13\x00D16\x00D17\x00D18\x00D19\x00D20\x00D21\x00D22\x00D23\x00D24\x00D25\x00D26\x00D27\x00D28\x00D29\x00D30\x00D31\x00D32\x00D33\x00D34\x00L35\x00D35\x00L36\x00D36\x00L37\x00D37\x00L38\x00D38\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00D9\x00D29\x00D19\x00D8\x00L38\x00D38\x00D28\x00D18\x00D7\x00L37\x00D37\x00D27\x00D17\x00D6\x00L36\x00D36\x00D26\x00D16\x00D5\x00L35\x00D35\x00D25\x00L4\x00D4\x00D34\x00D24\x00D3\x00D33\x00D23\x00D13\x00D2\x00D32\x00D22\x00D12\x00D1\x00D31\x00D21\x00D0\x00D30\x00D20\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 139)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -658,7 +658,7 @@
 // CHECK:     ('_string', 'D0')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 4)
+// CHECK:    (('n_strx', 128)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 1)
 // CHECK:     ('n_desc', 0)
@@ -666,7 +666,7 @@
 // CHECK:     ('_string', 'D1')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 7)
+// CHECK:    (('n_strx', 113)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -674,7 +674,7 @@
 // CHECK:     ('_string', 'D2')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 10)
+// CHECK:    (('n_strx', 98)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 3)
 // CHECK:     ('n_desc', 0)
@@ -682,7 +682,7 @@
 // CHECK:     ('_string', 'D3')
 // CHECK:    ),
 // CHECK:     # Symbol 4
-// CHECK:    (('n_strx', 13)
+// CHECK:    (('n_strx', 84)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -690,7 +690,7 @@
 // CHECK:     ('_string', 'L4')
 // CHECK:    ),
 // CHECK:     # Symbol 5
-// CHECK:    (('n_strx', 16)
+// CHECK:    (('n_strx', 87)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -698,7 +698,7 @@
 // CHECK:     ('_string', 'D4')
 // CHECK:    ),
 // CHECK:     # Symbol 6
-// CHECK:    (('n_strx', 19)
+// CHECK:    (('n_strx', 69)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 5)
 // CHECK:     ('n_desc', 0)
@@ -706,7 +706,7 @@
 // CHECK:     ('_string', 'D5')
 // CHECK:    ),
 // CHECK:     # Symbol 7
-// CHECK:    (('n_strx', 22)
+// CHECK:    (('n_strx', 50)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 6)
 // CHECK:     ('n_desc', 0)
@@ -714,7 +714,7 @@
 // CHECK:     ('_string', 'D6')
 // CHECK:    ),
 // CHECK:     # Symbol 8
-// CHECK:    (('n_strx', 25)
+// CHECK:    (('n_strx', 31)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 7)
 // CHECK:     ('n_desc', 0)
@@ -722,7 +722,7 @@
 // CHECK:     ('_string', 'D7')
 // CHECK:    ),
 // CHECK:     # Symbol 9
-// CHECK:    (('n_strx', 28)
+// CHECK:    (('n_strx', 12)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 8)
 // CHECK:     ('n_desc', 0)
@@ -730,7 +730,7 @@
 // CHECK:     ('_string', 'D8')
 // CHECK:    ),
 // CHECK:     # Symbol 10
-// CHECK:    (('n_strx', 31)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 9)
 // CHECK:     ('n_desc', 0)
@@ -738,7 +738,7 @@
 // CHECK:     ('_string', 'D9')
 // CHECK:    ),
 // CHECK:     # Symbol 11
-// CHECK:    (('n_strx', 34)
+// CHECK:    (('n_strx', 124)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 10)
 // CHECK:     ('n_desc', 0)
@@ -746,7 +746,7 @@
 // CHECK:     ('_string', 'D12')
 // CHECK:    ),
 // CHECK:     # Symbol 12
-// CHECK:    (('n_strx', 38)
+// CHECK:    (('n_strx', 109)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 11)
 // CHECK:     ('n_desc', 0)
@@ -754,7 +754,7 @@
 // CHECK:     ('_string', 'D13')
 // CHECK:    ),
 // CHECK:     # Symbol 13
-// CHECK:    (('n_strx', 42)
+// CHECK:    (('n_strx', 65)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 12)
 // CHECK:     ('n_desc', 0)
@@ -770,7 +770,7 @@
 // CHECK:     ('_string', 'D17')
 // CHECK:    ),
 // CHECK:     # Symbol 15
-// CHECK:    (('n_strx', 50)
+// CHECK:    (('n_strx', 27)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 14)
 // CHECK:     ('n_desc', 0)
@@ -778,7 +778,7 @@
 // CHECK:     ('_string', 'D18')
 // CHECK:    ),
 // CHECK:     # Symbol 16
-// CHECK:    (('n_strx', 54)
+// CHECK:    (('n_strx', 8)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 15)
 // CHECK:     ('n_desc', 0)
@@ -786,7 +786,7 @@
 // CHECK:     ('_string', 'D19')
 // CHECK:    ),
 // CHECK:     # Symbol 17
-// CHECK:    (('n_strx', 58)
+// CHECK:    (('n_strx', 146)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 16)
 // CHECK:     ('n_desc', 0)
@@ -794,7 +794,7 @@
 // CHECK:     ('_string', 'D20')
 // CHECK:    ),
 // CHECK:     # Symbol 18
-// CHECK:    (('n_strx', 62)
+// CHECK:    (('n_strx', 135)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 17)
 // CHECK:     ('n_desc', 0)
@@ -802,7 +802,7 @@
 // CHECK:     ('_string', 'D21')
 // CHECK:    ),
 // CHECK:     # Symbol 19
-// CHECK:    (('n_strx', 66)
+// CHECK:    (('n_strx', 120)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 18)
 // CHECK:     ('n_desc', 0)
@@ -810,7 +810,7 @@
 // CHECK:     ('_string', 'D22')
 // CHECK:    ),
 // CHECK:     # Symbol 20
-// CHECK:    (('n_strx', 70)
+// CHECK:    (('n_strx', 105)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 19)
 // CHECK:     ('n_desc', 0)
@@ -818,7 +818,7 @@
 // CHECK:     ('_string', 'D23')
 // CHECK:    ),
 // CHECK:     # Symbol 21
-// CHECK:    (('n_strx', 74)
+// CHECK:    (('n_strx', 94)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 20)
 // CHECK:     ('n_desc', 0)
@@ -826,7 +826,7 @@
 // CHECK:     ('_string', 'D24')
 // CHECK:    ),
 // CHECK:     # Symbol 22
-// CHECK:    (('n_strx', 78)
+// CHECK:    (('n_strx', 80)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 21)
 // CHECK:     ('n_desc', 0)
@@ -834,7 +834,7 @@
 // CHECK:     ('_string', 'D25')
 // CHECK:    ),
 // CHECK:     # Symbol 23
-// CHECK:    (('n_strx', 82)
+// CHECK:    (('n_strx', 61)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 22)
 // CHECK:     ('n_desc', 0)
@@ -842,7 +842,7 @@
 // CHECK:     ('_string', 'D26')
 // CHECK:    ),
 // CHECK:     # Symbol 24
-// CHECK:    (('n_strx', 86)
+// CHECK:    (('n_strx', 42)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 23)
 // CHECK:     ('n_desc', 0)
@@ -850,7 +850,7 @@
 // CHECK:     ('_string', 'D27')
 // CHECK:    ),
 // CHECK:     # Symbol 25
-// CHECK:    (('n_strx', 90)
+// CHECK:    (('n_strx', 23)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 24)
 // CHECK:     ('n_desc', 0)
@@ -858,7 +858,7 @@
 // CHECK:     ('_string', 'D28')
 // CHECK:    ),
 // CHECK:     # Symbol 26
-// CHECK:    (('n_strx', 94)
+// CHECK:    (('n_strx', 4)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 25)
 // CHECK:     ('n_desc', 0)
@@ -866,7 +866,7 @@
 // CHECK:     ('_string', 'D29')
 // CHECK:    ),
 // CHECK:     # Symbol 27
-// CHECK:    (('n_strx', 98)
+// CHECK:    (('n_strx', 142)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 26)
 // CHECK:     ('n_desc', 0)
@@ -874,7 +874,7 @@
 // CHECK:     ('_string', 'D30')
 // CHECK:    ),
 // CHECK:     # Symbol 28
-// CHECK:    (('n_strx', 102)
+// CHECK:    (('n_strx', 131)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 27)
 // CHECK:     ('n_desc', 0)
@@ -882,7 +882,7 @@
 // CHECK:     ('_string', 'D31')
 // CHECK:    ),
 // CHECK:     # Symbol 29
-// CHECK:    (('n_strx', 106)
+// CHECK:    (('n_strx', 116)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 28)
 // CHECK:     ('n_desc', 0)
@@ -890,7 +890,7 @@
 // CHECK:     ('_string', 'D32')
 // CHECK:    ),
 // CHECK:     # Symbol 30
-// CHECK:    (('n_strx', 110)
+// CHECK:    (('n_strx', 101)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 29)
 // CHECK:     ('n_desc', 0)
@@ -898,7 +898,7 @@
 // CHECK:     ('_string', 'D33')
 // CHECK:    ),
 // CHECK:     # Symbol 31
-// CHECK:    (('n_strx', 114)
+// CHECK:    (('n_strx', 90)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 30)
 // CHECK:     ('n_desc', 0)
@@ -906,7 +906,7 @@
 // CHECK:     ('_string', 'D34')
 // CHECK:    ),
 // CHECK:     # Symbol 32
-// CHECK:    (('n_strx', 118)
+// CHECK:    (('n_strx', 72)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -914,7 +914,7 @@
 // CHECK:     ('_string', 'L35')
 // CHECK:    ),
 // CHECK:     # Symbol 33
-// CHECK:    (('n_strx', 122)
+// CHECK:    (('n_strx', 76)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -922,7 +922,7 @@
 // CHECK:     ('_string', 'D35')
 // CHECK:    ),
 // CHECK:     # Symbol 34
-// CHECK:    (('n_strx', 126)
+// CHECK:    (('n_strx', 53)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -930,7 +930,7 @@
 // CHECK:     ('_string', 'L36')
 // CHECK:    ),
 // CHECK:     # Symbol 35
-// CHECK:    (('n_strx', 130)
+// CHECK:    (('n_strx', 57)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -938,7 +938,7 @@
 // CHECK:     ('_string', 'D36')
 // CHECK:    ),
 // CHECK:     # Symbol 36
-// CHECK:    (('n_strx', 134)
+// CHECK:    (('n_strx', 34)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -946,7 +946,7 @@
 // CHECK:     ('_string', 'L37')
 // CHECK:    ),
 // CHECK:     # Symbol 37
-// CHECK:    (('n_strx', 138)
+// CHECK:    (('n_strx', 38)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 4)
 // CHECK:     ('n_desc', 0)
@@ -954,7 +954,7 @@
 // CHECK:     ('_string', 'D37')
 // CHECK:    ),
 // CHECK:     # Symbol 38
-// CHECK:    (('n_strx', 142)
+// CHECK:    (('n_strx', 15)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 31)
 // CHECK:     ('n_desc', 0)
@@ -962,7 +962,7 @@
 // CHECK:     ('_string', 'L38')
 // CHECK:    ),
 // CHECK:     # Symbol 39
-// CHECK:    (('n_strx', 146)
+// CHECK:    (('n_strx', 19)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 31)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/MachO/zerofill-3.s b/test/MC/MachO/zerofill-3.s
index cc81fa8..a4cd31e 100644
--- a/test/MC/MachO/zerofill-3.s
+++ b/test/MC/MachO/zerofill-3.s

@@ -78,10 +78,10 @@
 // CHECK:   ('nsyms', 4)
 // CHECK:   ('stroff', 372)
 // CHECK:   ('strsize', 52)
-// CHECK:   ('_string_data', '\x00sym_lcomm_C\x00sym_lcomm_D\x00sym_lcomm_A\x00sym_lcomm_B\x00\x00\x00\x00')
+// CHECK:   ('_string_data', '\x00sym_lcomm_D\x00sym_lcomm_C\x00sym_lcomm_B\x00sym_lcomm_A\x00\x00\x00\x00')
 // CHECK:   ('_symbols', [
 // CHECK:     # Symbol 0
-// CHECK:    (('n_strx', 25)
+// CHECK:    (('n_strx', 37)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -89,7 +89,7 @@
 // CHECK:     ('_string', 'sym_lcomm_A')
 // CHECK:    ),
 // CHECK:     # Symbol 1
-// CHECK:    (('n_strx', 37)
+// CHECK:    (('n_strx', 25)
 // CHECK:     ('n_type', 0xe)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -97,7 +97,7 @@
 // CHECK:     ('_string', 'sym_lcomm_B')
 // CHECK:    ),
 // CHECK:     # Symbol 2
-// CHECK:    (('n_strx', 1)
+// CHECK:    (('n_strx', 13)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)
@@ -105,7 +105,7 @@
 // CHECK:     ('_string', 'sym_lcomm_C')
 // CHECK:    ),
 // CHECK:     # Symbol 3
-// CHECK:    (('n_strx', 13)
+// CHECK:    (('n_strx', 1)
 // CHECK:     ('n_type', 0xf)
 // CHECK:     ('n_sect', 2)
 // CHECK:     ('n_desc', 0)

diff --git a/test/MC/Mips/cpload-bad.s b/test/MC/Mips/cpload-bad.s
index 7d186f6..803610a 100644
--- a/test/MC/Mips/cpload-bad.s
+++ b/test/MC/Mips/cpload-bad.s

@@ -3,13 +3,25 @@
 
         .text
         .option pic2
+        .set noreorder
+        .set mips16
+        .cpload $25
+# ASM: :[[@LINE-1]]:17: error: .cpload is not supported in Mips16 mode
+
+        .set nomips16
         .set reorder
         .cpload $25
-# ASM: :[[@LINE-1]]:9: warning: .cpload in reorder section
+# ASM: :[[@LINE-1]]:9: warning: .cpload should be inside a noreorder section
+
         .set noreorder
         .cpload $32
 # ASM: :[[@LINE-1]]:17: error: invalid register
+
         .cpload $foo
 # ASM: :[[@LINE-1]]:17: error: expected register containing function address
+
         .cpload bar
 # ASM: :[[@LINE-1]]:17: error: expected register containing function address
+
+        .cpload $25 foobar
+# ASM: :[[@LINE-1]]:21: error: unexpected token, expected end of statement

diff --git a/test/MC/Mips/cpload.s b/test/MC/Mips/cpload.s
index bc5e797..46b3ee4 100644
--- a/test/MC/Mips/cpload.s
+++ b/test/MC/Mips/cpload.s

@@ -1,12 +1,16 @@
 # RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=ASM
 #
-# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -filetype=obj -o -| \
-# RUN: llvm-objdump -d -r -arch=mips - | \
-# RUN: FileCheck %s -check-prefix=OBJ
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -mattr=+o32 -filetype=obj -o -| \
+# RUN:  llvm-objdump -d -r -arch=mips - | \
+# RUN:    FileCheck %s -check-prefix=OBJ-O32
 
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -filetype=obj -o -| \
-# RUN: llvm-objdump -d -r -arch=mips - | \
-# RUN: FileCheck %s -check-prefix=OBJ64
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -mattr=-n64,+n32 -filetype=obj -o -| \
+# RUN:  llvm-objdump -d -r -arch=mips - | \
+# RUN:    FileCheck %s -check-prefix=OBJ-N32
+
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -mattr=+n64 -filetype=obj -o -| \
+# RUN:  llvm-objdump -d -r -arch=mips - | \
+# RUN:    FileCheck %s -check-prefix=OBJ-N64
 
 # ASM:    .text
 # ASM:    .option pic2
@@ -14,17 +18,22 @@
 # ASM:    .cpload $25
 # ASM:    .set reorder
 
-# OBJ:    .text
-# OBJ:    lui $gp, 0
-# OBJ: R_MIPS_HI16 _gp_disp
-# OBJ:    addiu $gp, $gp, 0
-# OBJ: R_MIPS_LO16 _gp_disp
-# OBJ:    addu $gp, $gp, $25
+# OBJ-O32:    .text
+# OBJ-O32:    lui $gp, 0
+# OBJ-O32: R_MIPS_HI16 _gp_disp
+# OBJ-O32:    addiu $gp, $gp, 0
+# OBJ-O32: R_MIPS_LO16 _gp_disp
+# OBJ-O32:    addu $gp, $gp, $25
 
-# OBJ64: .text
-# OBJ64-NOT: lui $gp, 0
-# OBJ64-NOT: addiu $gp, $gp, 0
-# OBJ64-NOT: addu $gp, $gp, $25
+# OBJ-N32-NOT: .text
+# OBJ-N32-NOT: lui   $gp, 0
+# OBJ-N32-NOT: addiu $gp, $gp, 0
+# OBJ-N32-NOT: addu  $gp, $gp, $25
+
+# OBJ-N64-NOT: .text
+# OBJ-N64-NOT: lui   $gp, 0
+# OBJ-N64-NOT: addiu $gp, $gp, 0
+# OBJ-N64-NOT: addu  $gp, $gp, $25
 
         .text
         .option pic2

diff --git a/test/MC/Mips/elf-objdump.s b/test/MC/Mips/elf-objdump.s
deleted file mode 100644
index 6a5c2a5..0000000
--- a/test/MC/Mips/elf-objdump.s
+++ /dev/null

@@ -1,11 +0,0 @@
-// 32 bit big endian
-// RUN: llvm-mc -filetype=obj -triple mips-unknown-linux %s -o - | llvm-objdump -d -triple mips-unknown-linux  - | FileCheck %s
-// 32 bit little endian
-// RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux %s -o - | llvm-objdump -d -triple mips-unknown-linux  - | FileCheck %s
-// 64 bit big endian
-// RUN: llvm-mc -filetype=obj -arch=mips64 -triple mips64-unknown-linux %s -o - | llvm-objdump -d -triple mips-unknown-linux - | FileCheck %s
-// 64 bit little endian
-// RUN: llvm-mc -filetype=obj -arch=mips64el -triple mips64el-unknown-linux %s -o - | llvm-objdump -d -triple mips-unknown-linux - | FileCheck %s
-
-// We just want to see if llvm-objdump works at all.
-// CHECK: .text

diff --git a/test/MC/Mips/elf_eflags.s b/test/MC/Mips/elf_eflags.s
index 36f4f9e..1f28ee0 100644
--- a/test/MC/Mips/elf_eflags.s
+++ b/test/MC/Mips/elf_eflags.s

@@ -1,118 +1,119 @@
-# These *MUST* match the output of gas compiled with the same triple and
+# These *MUST* match the output of 'gcc -c' compiled with the same triple and
 # corresponding options (-mcpu=mips32 -> -mips32 for example).
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r6 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R6 %s
-# MIPSEL-MIPS64R6: Flags [ (0xA0001500)
+# MIPSEL-MIPS64R6: Flags [ (0xA0000406)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r6 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R6-NAN2008 %s
-# MIPSEL-MIPS64R6-NAN2008: Flags [ (0xA0001500)
+# MIPSEL-MIPS64R6-NAN2008: Flags [ (0xA0000406)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r2 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R2 %s
-# MIPSEL-MIPS64R2: Flags [ (0x80001100)
+# MIPSEL-MIPS64R2: Flags [ (0x80000006)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r2 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64R2-NAN2008 %s
-# MIPSEL-MIPS64R2-NAN2008: Flags [ (0x80001500)
+# MIPSEL-MIPS64R2-NAN2008: Flags [ (0x80000406)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64 %s
-# MIPSEL-MIPS64: Flags [ (0x60001100)
+# MIPSEL-MIPS64: Flags [ (0x60000006)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS64-NAN2008 %s
-# MIPSEL-MIPS64-NAN2008: Flags [ (0x60001500)
+# MIPSEL-MIPS64-NAN2008: Flags [ (0x60000406)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r6 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R6 %s
-# MIPSEL-MIPS32R6: Flags [ (0x90001400)
+# MIPSEL-MIPS32R6: Flags [ (0x90001404)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r6 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R6-NAN2008 %s
-# MIPSEL-MIPS32R6-NAN2008: Flags [ (0x90001400)
+# MIPSEL-MIPS32R6-NAN2008: Flags [ (0x90001404)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r2 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R2 %s
-# MIPSEL-MIPS32R2: Flags [ (0x70001000)
+# MIPSEL-MIPS32R2: Flags [ (0x70001004)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32R2-NAN2008 %s
-# MIPSEL-MIPS32R2-NAN2008: Flags [ (0x70001400)
+# MIPSEL-MIPS32R2-NAN2008: Flags [ (0x70001404)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32 %s
-# MIPSEL-MIPS32: Flags [ (0x50001000)
+# MIPSEL-MIPS32: Flags [ (0x50001004)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS32-NAN2008 %s
-# MIPSEL-MIPS32-NAN2008: Flags [ (0x50001400)
+# MIPSEL-MIPS32-NAN2008: Flags [ (0x50001404)
 
+# RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips64r2 -mattr=-n64,n32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N32 %s
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=-n64,n32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N32 %s
-# MIPS64EL-MIPS64R2-N32: Flags [ (0x80000020)
+# MIPS64EL-MIPS64R2-N32: Flags [ (0x80000024)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=-n64,n32,+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N32-NAN2008 %s
-# MIPS64EL-MIPS64R2-N32-NAN2008: Flags [ (0x80000420)
+# MIPS64EL-MIPS64R2-N32-NAN2008: Flags [ (0x80000424)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 -mattr=-n64,n32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N32 %s
-# MIPS64EL-MIPS64-N32: Flags [ (0x60000020)
+# MIPS64EL-MIPS64-N32: Flags [ (0x60000024)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 -mattr=-n64,n32,+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N32-NAN2008 %s
-# MIPS64EL-MIPS64-N32-NAN2008: Flags [ (0x60000420)
+# MIPS64EL-MIPS64-N32-NAN2008: Flags [ (0x60000424)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=n64 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N64 %s
-# MIPS64EL-MIPS64R2-N64: Flags [ (0x80000000)
+# MIPS64EL-MIPS64R2-N64: Flags [ (0x80000006)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=n64,+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-N64-NAN2008 %s
-# MIPS64EL-MIPS64R2-N64-NAN2008: Flags [ (0x80000400)
+# MIPS64EL-MIPS64R2-N64-NAN2008: Flags [ (0x80000406)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -mattr=n64 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N64 %s
-# MIPS64EL-MIPS64-N64: Flags [ (0x60000000)
+# MIPS64EL-MIPS64-N64: Flags [ (0x60000006)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -mattr=n64,+nan2008 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-N64-NAN2008 %s
-# MIPS64EL-MIPS64-N64-NAN2008: Flags [ (0x60000400)
+# MIPS64EL-MIPS64-N64-NAN2008: Flags [ (0x60000406)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=-n64,o32 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-O32 %s
-# MIPS64EL-MIPS64R2-O32: Flags [ (0x80001100)
+# MIPS64EL-MIPS64R2-O32: Flags [ (0x80001104)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=-n64,o32,+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-O32-NAN2008 %s
-# MIPS64EL-MIPS64R2-O32-NAN2008: Flags [ (0x80001500)
+# MIPS64EL-MIPS64R2-O32-NAN2008: Flags [ (0x80001504)
 
 # RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips5 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS5 %s
-# MIPS5: Flags [ (0x40000000)
+# MIPS5: Flags [ (0x40000006)
 
  # RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips5 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS5-NAN2008 %s
-# MIPS5-NAN2008: Flags [ (0x40000400)
+# MIPS5-NAN2008: Flags [ (0x40000406)
 
 # RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips4 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS4 %s
-# MIPS4: Flags [ (0x30000000)
+# MIPS4: Flags [ (0x30000006)
 
  # RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips4 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS4-NAN2008 %s
-# MIPS4-NAN2008: Flags [ (0x30000400)
+# MIPS4-NAN2008: Flags [ (0x30000406)
 
 # RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips3 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS3 %s
-# MIPS3: Flags [ (0x20000000)
+# MIPS3: Flags [ (0x20000006)
 
  # RUN: llvm-mc -filetype=obj -triple mips64-unknown-linux -mcpu=mips3 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS3-NAN2008 %s
-# MIPS3-NAN2008: Flags [ (0x20000400)
+# MIPS3-NAN2008: Flags [ (0x20000406)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips2 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS2 %s
-# MIPSEL-MIPS2: Flags [ (0x10001000)
+# MIPSEL-MIPS2: Flags [ (0x10001004)
 
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips2 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPSEL-MIPS2-NAN2008 %s
-# MIPSEL-MIPS2-NAN2008: Flags [ (0x10001400)
+# MIPSEL-MIPS2-NAN2008: Flags [ (0x10001404)
 
 # RUN: llvm-mc -filetype=obj -triple mips-unknown-linux -mcpu=mips1 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS1 %s
-# MIPS1: Flags [ (0x1000)
+# MIPS1: Flags [ (0x1004)
 
  # RUN: llvm-mc -filetype=obj -triple mips-unknown-linux -mcpu=mips1 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS1-NAN2008 %s
-# MIPS1-NAN2008: Flags [ (0x1400)
+# MIPS1-NAN2008: Flags [ (0x1404)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -mattr=-n64,o32 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-O32 %s
-# MIPS64EL-MIPS64-O32: Flags [ (0x60001100)
+# MIPS64EL-MIPS64-O32: Flags [ (0x60001104)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -mattr=-n64,o32,+nan2008 -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-O32-NAN2008 %s
-# MIPS64EL-MIPS64-O32-NAN2008: Flags [ (0x60001500)
+# MIPS64EL-MIPS64-O32-NAN2008: Flags [ (0x60001504)
 
 # Default ABI for MIPS64 is N64 as opposed to GCC/GAS (N32)
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2 %s
-# MIPS64EL-MIPS64R2: Flags [ (0x80000000)
+# MIPS64EL-MIPS64R2: Flags [ (0x80000006)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64r2 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64R2-NAN2008 %s
-# MIPS64EL-MIPS64R2-NAN2008: Flags [ (0x80000400)
+# MIPS64EL-MIPS64R2-NAN2008: Flags [ (0x80000406)
 
 # Default ABI for MIPS64 is N64 as opposed to GCC/GAS (N32)
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64 %s
-# MIPS64EL-MIPS64: Flags [ (0x60000000)
+# MIPS64EL-MIPS64: Flags [ (0x60000006)
 
 # RUN: llvm-mc -filetype=obj -triple mips64el-unknown-linux -mcpu=mips64 -mattr=+nan2008 %s -o -| llvm-readobj -h | FileCheck --check-prefix=MIPS64EL-MIPS64-NAN2008 %s
-# MIPS64EL-MIPS64-NAN2008: Flags [ (0x60000400)
+# MIPS64EL-MIPS64-NAN2008: Flags [ (0x60000406)

diff --git a/test/MC/Mips/elf_eflags_abicalls.s b/test/MC/Mips/elf_eflags_abicalls.s
index 5f39630..9e9c013 100644
--- a/test/MC/Mips/elf_eflags_abicalls.s
+++ b/test/MC/Mips/elf_eflags_abicalls.s

@@ -1,6 +1,6 @@
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| llvm-readobj -h | FileCheck %s
 
-# This *MUST* match the output of gas compiled with the same triple.
+# This *MUST* match the output of 'gcc -c' compiled with the same triple.
 # CHECK: Flags [ (0x50001006)
 
 .abicalls

diff --git a/test/MC/Mips/elf_eflags_micromips.s b/test/MC/Mips/elf_eflags_micromips.s
index 14bbcad..9b7de12 100644
--- a/test/MC/Mips/elf_eflags_micromips.s
+++ b/test/MC/Mips/elf_eflags_micromips.s

@@ -1,7 +1,7 @@
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| llvm-readobj -h | FileCheck %s
 
-# This *MUST* match the output of gas compiled with the same triple.
-# CHECK: Flags [ (0x52001000)
+# This *MUST* match the output of 'gcc -c' compiled with the same triple.
+# CHECK: Flags [ (0x52001004)
 
         .set micromips
 f:

diff --git a/test/MC/Mips/elf_eflags_mips16.s b/test/MC/Mips/elf_eflags_mips16.s
index deac3d4..5143d36 100644
--- a/test/MC/Mips/elf_eflags_mips16.s
+++ b/test/MC/Mips/elf_eflags_mips16.s

@@ -1,7 +1,7 @@
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| llvm-readobj -h | FileCheck %s
 
-# This *MUST* match the output of gas compiled with the same triple.
-# CHECK: Flags [ (0x54001000)
+# This *MUST* match the output of 'gcc -c' compiled with the same triple.
+# CHECK: Flags [ (0x54001004)
 
         .set mips16
 f:

diff --git a/test/MC/Mips/elf_eflags_nan2008.s b/test/MC/Mips/elf_eflags_nan2008.s
index 71a22be..f690342 100644
--- a/test/MC/Mips/elf_eflags_nan2008.s
+++ b/test/MC/Mips/elf_eflags_nan2008.s

@@ -4,8 +4,8 @@
 # RUN: llvm-mc -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 
-# This *MUST* match the output of gas compiled with the same triple.
-# CHECK-OBJ: Flags [ (0x50001400)
+# This *MUST* match the output of 'gcc -c' compiled with the same triple.
+# CHECK-OBJ: Flags [ (0x50001404)
 
 # CHECK-ASM: .nan 2008
 

diff --git a/test/MC/Mips/elf_eflags_nanlegacy.s b/test/MC/Mips/elf_eflags_nanlegacy.s
index 6897ad2..0fa0787 100644
--- a/test/MC/Mips/elf_eflags_nanlegacy.s
+++ b/test/MC/Mips/elf_eflags_nanlegacy.s

@@ -4,12 +4,12 @@
 # RUN: llvm-mc -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 
-# This *MUST* match the output of gas compiled with the same triple.
-# CHECK-OBJ: Flags [ (0x50001000)
+# This *MUST* match the output of 'gcc -c' compiled with the same triple.
+# CHECK-OBJ: Flags [ (0x50001004)
 
 # CHECK-ASM: .nan 2008
 # CHECK-ASM: .nan legacy
 
 .nan 2008
-// Let's override the previous directive!
+# Let's override the previous directive!
 .nan legacy

diff --git a/test/MC/Mips/elf_eflags_noreorder.s b/test/MC/Mips/elf_eflags_noreorder.s
index 3fea18b..fe46b41 100644
--- a/test/MC/Mips/elf_eflags_noreorder.s
+++ b/test/MC/Mips/elf_eflags_noreorder.s

@@ -1,6 +1,6 @@
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| llvm-readobj -h | FileCheck %s
 
-# This *MUST* match the output of gas compiled with the same triple.
-# CHECK: Flags [ (0x50001001)
+# This *MUST* match the output of 'gcc -c' compiled with the same triple.
+# CHECK: Flags [ (0x50001005)
 
 .set noreorder

diff --git a/test/MC/Mips/elf_eflags_pic0.s b/test/MC/Mips/elf_eflags_pic0.s
index a78ca03..04115fa 100644
--- a/test/MC/Mips/elf_eflags_pic0.s
+++ b/test/MC/Mips/elf_eflags_pic0.s

@@ -1,6 +1,6 @@
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| llvm-readobj -h | FileCheck %s
 
-# This *MUST* match the output of gas compiled with the same triple.
+# This *MUST* match the output of 'gcc -c' compiled with the same triple.
 # CHECK: Flags [ (0x50001004)
 
 .abicalls

diff --git a/test/MC/Mips/elf_eflags_pic2.s b/test/MC/Mips/elf_eflags_pic2.s
index a15208a..692c478 100644
--- a/test/MC/Mips/elf_eflags_pic2.s
+++ b/test/MC/Mips/elf_eflags_pic2.s

@@ -1,6 +1,6 @@
 # RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux -mcpu=mips32 %s -o -| llvm-readobj -h | FileCheck %s
 
-# This *MUST* match the output of gas compiled with the same triple.
+# This *MUST* match the output of 'gcc -c' compiled with the same triple.
 # CHECK: Flags [ (0x50001006)
 
 .option pic2

diff --git a/test/MC/Mips/elf_reginfo.s b/test/MC/Mips/elf_reginfo.s
index 5e9ddf8..ba4788a 100644
--- a/test/MC/Mips/elf_reginfo.s
+++ b/test/MC/Mips/elf_reginfo.s

@@ -14,13 +14,19 @@
 # check for .MIPS.options
 # CHECK_64:      Sections [
 # CHECK_64:        Section {
-# CHECK_64:          Name: .MIPS.options
+# CHECK_64-LABEL:    Name: .MIPS.options
 # CHECK_64-NEXT:     Type: SHT_MIPS_OPTIONS
 # CHECK_64-NEXT:     Flags [ (0x8000002)
+# CHECK_64:          AddressAlignment: 8
+# CHECK_64:          EntrySize: 1
+# CHECK_64-LABEL:  }
 
 # check for .reginfo
 # CHECK_32:      Sections [
 # CHECK_32:        Section {
-# CHECK_32:          Name: .reginfo
+# CHECK_32-LABEL:    Name: .reginfo
 # CHECK_32-NEXT:     Type:  SHT_MIPS_REGINFO
 # CHECK_32-NEXT:     Flags [ (0x2)
+# CHECK_32:          AddressAlignment: 8
+# CHECK_32:          EntrySize: 24
+# CHECK_32-LABEL:  }

diff --git a/test/MC/Mips/micromips-16-bit-instructions.s b/test/MC/Mips/micromips-16-bit-instructions.s
index 31bddcc..35855e1 100644
--- a/test/MC/Mips/micromips-16-bit-instructions.s
+++ b/test/MC/Mips/micromips-16-bit-instructions.s

@@ -9,19 +9,85 @@
 #------------------------------------------------------------------------------
 # Little endian
 #------------------------------------------------------------------------------
+# CHECK-EL: addu16  $6, $17, $4     # encoding: [0x42,0x07]
+# CHECK-EL: subu16  $5, $16, $3     # encoding: [0xb1,0x06]
+# CHECK-EL: andi16  $16, $2, 31     # encoding: [0x29,0x2c]
+# CHECK-EL: and16   $16, $2         # encoding: [0x82,0x44]
+# CHECK-EL: not16   $17, $3         # encoding: [0x0b,0x44]
+# CHECK-EL: or16    $16, $4         # encoding: [0xc4,0x44]
+# CHECK-EL: xor16   $17, $5         # encoding: [0x4d,0x44]
+# CHECK-EL: sll16   $3, $16, 5      # encoding: [0x8a,0x25]
+# CHECK-EL: srl16   $4, $17, 6      # encoding: [0x1d,0x26]
+# CHECK-EL: li16    $3, -1          # encoding: [0xff,0xed]
+# CHECK-EL: li16    $3, 126         # encoding: [0xfe,0xed]
+# CHECK-EL: addiur1sp $7, 4         # encoding: [0x83,0x6f]
+# CHECK-EL: addiur2 $6, $7, -1      # encoding: [0x7e,0x6f]
+# CHECK-EL: addiur2 $6, $7, 12      # encoding: [0x76,0x6f]
+# CHECK-EL: addius5 $7, -2          # encoding: [0xfc,0x4c]
+# CHECK-EL: addiusp -16             # encoding: [0xf9,0x4f]
 # CHECK-EL: mfhi    $9              # encoding: [0x09,0x46]
 # CHECK-EL: mflo    $9              # encoding: [0x49,0x46]
 # CHECK-EL: move    $25, $1         # encoding: [0x21,0x0f]
-# CHECK-EL: jalr    $9              # encoding: [0xc9,0x45]
+# CHECK-EL: jrc     $9              # encoding: [0xa9,0x45]
+# CHECK-NEXT: jalr    $9            # encoding: [0xc9,0x45]
+# CHECK-EL: jraddiusp 20            # encoding: [0x05,0x47]
+# CHECK-EL: nop                     # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: jalrs16 $9              # encoding: [0xe9,0x45]
+# CHECK-EL: move    $zero, $zero    # encoding: [0x00,0x0c]
+# CHECK-EL: jr16    $9              # encoding: [0x89,0x45]
+# CHECK-EL: nop                     # encoding: [0x00,0x00,0x00,0x00]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
+# CHECK-EB: addu16  $6, $17, $4     # encoding: [0x07,0x42]
+# CHECK-EB: subu16  $5, $16, $3     # encoding: [0x06,0xb1]
+# CHECK-EB: andi16  $16, $2, 31     # encoding: [0x2c,0x29]
+# CHECK-EB: and16   $16, $2         # encoding: [0x44,0x82]
+# CHECK-EB: not16   $17, $3         # encoding: [0x44,0x0b]
+# CHECK-EB: or16    $16, $4         # encoding: [0x44,0xc4]
+# CHECK-EB: xor16   $17, $5         # encoding: [0x44,0x4d]
+# CHECK-EB: sll16   $3, $16, 5      # encoding: [0x25,0x8a]
+# CHECK-EB: srl16   $4, $17, 6      # encoding: [0x26,0x1d]
+# CHECK-EB: li16    $3, -1          # encoding: [0xed,0xff]
+# CHECK-EB: li16    $3, 126         # encoding: [0xed,0xfe]
+# CHECK-EB: addiur1sp $7, 4         # encoding: [0x6f,0x83]
+# CHECK-EB: addiur2 $6, $7, -1      # encoding: [0x6f,0x7e]
+# CHECK-EB: addiur2 $6, $7, 12      # encoding: [0x6f,0x76]
+# CHECK-EB: addius5 $7, -2          # encoding: [0x4c,0xfc]
+# CHECK-EB: addiusp -16             # encoding: [0x4f,0xf9]
 # CHECK-EB: mfhi    $9              # encoding: [0x46,0x09]
 # CHECK-EB: mflo    $9              # encoding: [0x46,0x49]
 # CHECK-EB: move    $25, $1         # encoding: [0x0f,0x21]
-# CHECK-EB: jalr    $9              # encoding: [0x45,0xc9]
+# CHECK-EB: jrc     $9              # encoding: [0x45,0xa9]
+# CHECK-NEXT: jalr    $9            # encoding: [0x45,0xc9]
+# CHECK-EB: jraddiusp 20            # encoding: [0x47,0x05]
+# CHECK-EB: nop                     # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: jalrs16 $9              # encoding: [0x45,0xe9]
+# CHECK-EB: move    $zero, $zero    # encoding: [0x0c,0x00]
+# CHECK-EB: jr16    $9              # encoding: [0x45,0x89]
+# CHECK-EB: nop                     # encoding: [0x00,0x00,0x00,0x00]
 
+    addu16  $6, $17, $4
+    subu16  $5, $16, $3
+    andi16  $16, $2, 31
+    and16   $16, $2
+    not16   $17, $3
+    or16    $16, $4
+    xor16   $17, $5
+    sll16   $3, $16, 5
+    srl16   $4, $17, 6
+    li16    $3, -1
+    li16    $3, 126
+    addiur1sp $7, 4
+    addiur2 $6, $7, -1
+    addiur2 $6, $7, 12
+    addius5 $7, -2
+    addiusp -16
     mfhi    $9
     mflo    $9
     move    $25, $1
+    jrc     $9
     jalr    $9
+    jraddiusp 20
+    jalrs16 $9
+    jr16    $9

diff --git a/test/MC/Mips/micromips-branch-instructions.s b/test/MC/Mips/micromips-branch-instructions.s
index 84df2a1..cf0aab7 100644
--- a/test/MC/Mips/micromips-branch-instructions.s
+++ b/test/MC/Mips/micromips-branch-instructions.s

@@ -29,6 +29,10 @@
 # CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EL: bltz $6, 1332        # encoding: [0x06,0x40,0x9a,0x02]
 # CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: bgezals $6, 1332     # encoding: [0x66,0x42,0x9a,0x02]
+# CHECK-EL: move $zero, $zero    # encoding: [0x00,0x0c]
+# CHECK-EL: bltzals $6, 1332     # encoding: [0x26,0x42,0x9a,0x02]
+# CHECK-EL: move $zero, $zero    # encoding: [0x00,0x0c]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
@@ -52,6 +56,10 @@
 # CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EB: bltz $6, 1332        # encoding: [0x40,0x06,0x02,0x9a]
 # CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: bgezals $6, 1332     # encoding: [0x42,0x66,0x02,0x9a]
+# CHECK-EB: move $zero, $zero    # encoding: [0x0c,0x00]
+# CHECK-EB: bltzals $6, 1332     # encoding: [0x42,0x26,0x02,0x9a]
+# CHECK-EB: move $zero, $zero    # encoding: [0x0c,0x00]
 
      b      1332
      beq    $9,$6,1332
@@ -63,3 +71,5 @@
      bne    $9,$6,1332
      bal    1332
      bltz   $6,1332
+     bgezals $6,1332
+     bltzals $6,1332

diff --git a/test/MC/Mips/micromips-control-instructions.s b/test/MC/Mips/micromips-control-instructions.s
index aff84c2..e79896d 100644
--- a/test/MC/Mips/micromips-control-instructions.s
+++ b/test/MC/Mips/micromips-control-instructions.s

@@ -9,6 +9,12 @@
 #------------------------------------------------------------------------------
 # Little endian
 #------------------------------------------------------------------------------
+# CHECK-EL:    sdbbp                      # encoding: [0x00,0x00,0x7c,0xdb]
+# CHECK-EL:    sdbbp 34                   # encoding: [0x22,0x00,0x7c,0xdb]
+# CHECK-EL:    .set push
+# CHECK-EL:    .set mips32r2
+# CHECK-EL:    rdhwr $5, $29
+# CHECK-EL:    .set pop                   # encoding: [0xbd,0x00,0x3c,0x6b]
 # CHECK-EL:    break                      # encoding: [0x00,0x00,0x07,0x00]
 # CHECK-EL:    break 7                    # encoding: [0x07,0x00,0x07,0x00]
 # CHECK-EL:    break 7, 5                 # encoding: [0x07,0x00,0x47,0x01]
@@ -24,9 +30,19 @@
 # CHECK-EL:    ei  $10                    # encoding: [0x0a,0x00,0x7c,0x57]
 # CHECK-EL:    wait                       # encoding: [0x00,0x00,0x7c,0x93]
 # CHECK-EL:    wait 17                    # encoding: [0x11,0x00,0x7c,0x93]
+# CHECK-EL:    tlbp                       # encoding: [0x00,0x00,0x7c,0x03]
+# CHECK-EL:    tlbr                       # encoding: [0x00,0x00,0x7c,0x13]
+# CHECK-EL:    tlbwi                      # encoding: [0x00,0x00,0x7c,0x23]
+# CHECK-EL:    tlbwr                      # encoding: [0x00,0x00,0x7c,0x33]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
+# CHECK-EB:   sdbbp                       # encoding: [0x00,0x00,0xdb,0x7c]
+# CHECK-EB:   sdbbp 34                    # encoding: [0x00,0x22,0xdb,0x7c]
+# CHECK-EB:   .set push
+# CHECK-EB:   .set mips32r2
+# CHECK-EB:   rdhwr $5, $29
+# CHECK-EB:   .set pop                    # encoding: [0x00,0xbd,0x6b,0x3c]
 # CHECK-EB:   break                       # encoding: [0x00,0x00,0x00,0x07]
 # CHECK-EB:   break 7                     # encoding: [0x00,0x07,0x00,0x07]
 # CHECK-EB:   break 7, 5                  # encoding: [0x00,0x07,0x01,0x47]
@@ -42,7 +58,14 @@
 # CHECK-EB:   ei  $10                     # encoding: [0x00,0x0a,0x57,0x7c]
 # CHECK-EB:   wait                        # encoding: [0x00,0x00,0x93,0x7c]
 # CHECK-EB:   wait 17                     # encoding: [0x00,0x11,0x93,0x7c]
+# CHECK-EB:   tlbp                        # encoding: [0x00,0x00,0x03,0x7c]
+# CHECK-EB:   tlbr                        # encoding: [0x00,0x00,0x13,0x7c]
+# CHECK-EB:   tlbwi                       # encoding: [0x00,0x00,0x23,0x7c]
+# CHECK-EB:   tlbwr                       # encoding: [0x00,0x00,0x33,0x7c]
 
+    sdbbp
+    sdbbp 34
+    rdhwr $5, $29
     break
     break 7
     break 7,5
@@ -58,3 +81,7 @@
     ei $10
     wait
     wait 17
+    tlbp
+    tlbr
+    tlbwi
+    tlbwr

diff --git a/test/MC/Mips/micromips-fpu-instructions.s b/test/MC/Mips/micromips-fpu-instructions.s
index 5af4f98..0aeb326 100644
--- a/test/MC/Mips/micromips-fpu-instructions.s
+++ b/test/MC/Mips/micromips-fpu-instructions.s

@@ -53,6 +53,8 @@
 # CHECK-EL: ctc1    $6, $0              # encoding: [0xc0,0x54,0x3b,0x18]
 # CHECK-EL: mfc1    $6, $f8             # encoding: [0xc8,0x54,0x3b,0x20]
 # CHECK-EL: mtc1    $6, $f8             # encoding: [0xc8,0x54,0x3b,0x28]
+# CHECK-EL: mfhc1   $6, $f8             # encoding: [0xc8,0x54,0x3b,0x30]
+# CHECK-EL: mthc1   $6, $f8             # encoding: [0xc8,0x54,0x3b,0x38]
 # CHECK-EL: movz.s  $f4, $f6, $7        # encoding: [0xe6,0x54,0x78,0x20]
 # CHECK-EL: movz.d  $f4, $f6, $7        # encoding: [0xe6,0x54,0x78,0x21]
 # CHECK-EL: movn.s  $f4, $f6, $7        # encoding: [0xe6,0x54,0x38,0x20]
@@ -116,6 +118,8 @@
 # CHECK-EB: ctc1    $6, $0              # encoding: [0x54,0xc0,0x18,0x3b]
 # CHECK-EB: mfc1    $6, $f8             # encoding: [0x54,0xc8,0x20,0x3b]
 # CHECK-EB: mtc1    $6, $f8             # encoding: [0x54,0xc8,0x28,0x3b]
+# CHECK-EB: mfhc1   $6, $f8             # encoding: [0x54,0xc8,0x30,0x3b]
+# CHECK-EB: mthc1   $6, $f8             # encoding: [0x54,0xc8,0x38,0x3b]
 # CHECK-EB: movz.s  $f4, $f6, $7        # encoding: [0x54,0xe6,0x20,0x78]
 # CHECK-EB: movz.d  $f4, $f6, $7        # encoding: [0x54,0xe6,0x21,0x78]
 # CHECK-EB: movn.s  $f4, $f6, $7        # encoding: [0x54,0xe6,0x20,0x38]
@@ -175,6 +179,8 @@
     ctc1       $6, $0
     mfc1       $6, $f8
     mtc1       $6, $f8
+    mfhc1      $6, $f8
+    mthc1      $6, $f8
     movz.s     $f4, $f6, $7
     movz.d     $f4, $f6, $7
     movn.s     $f4, $f6, $7

diff --git a/test/MC/Mips/micromips-invalid.s b/test/MC/Mips/micromips-invalid.s
new file mode 100644
index 0000000..779e66e
--- /dev/null
+++ b/test/MC/Mips/micromips-invalid.s

@@ -0,0 +1,31 @@
+# RUN: not llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips 2>%t1
+# RUN: FileCheck %s < %t1
+
+  addiur1sp $7, 260 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  addiur1sp $7, 241 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: misaligned immediate operand value
+  addiur1sp $8, 240 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  addius5 $7, 9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  addiusp 1032   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  addu16  $6, $14, $4 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  subu16  $5, $16, $9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  andi16  $16, $10, 0x1f # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  andi16  $16, $2, 17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  and16   $16, $8   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  not16   $18, $9   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  or16    $16, $10  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  xor16   $15, $5   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sll16   $1, $16, 5 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  srl16   $4, $9, 6  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  sll16   $3, $16, 9 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  srl16   $4, $5, 15 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  li16  $8, -1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  li16  $4, -2 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  addiur2 $9, $7, -1 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+  addiur2 $6, $7, 10 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: immediate operand value out of range
+  lwm32   $5, $6, 8($4)    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: $16 or $31 expected
+  lwm32   $16, $19, 8($4)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: consecutive register numbers expected
+  lwm32   $16-$25, 8($4)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid register operand
+  swm32   $5, $6, 8($4)    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: $16 or $31 expected
+  swm32   $16, $19, 8($4)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: consecutive register numbers expected
+  swm32   $16-$25, 8($4)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid register operand
+  lwm32 $16, $17, $18, $19, $20, $21, $22, $23, $24, 8($4) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid register operand

diff --git a/test/MC/Mips/micromips-jump-instructions.s b/test/MC/Mips/micromips-jump-instructions.s
index a6c7676..aed18dc 100644
--- a/test/MC/Mips/micromips-jump-instructions.s
+++ b/test/MC/Mips/micromips-jump-instructions.s

@@ -19,6 +19,10 @@
 # CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EL: jr $7       # encoding: [0x07,0x00,0x3c,0x0f]
 # CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: jals 1328         # encoding: [0x00,0x74,0x98,0x02]
+# CHECK-EL: move $zero, $zero # encoding: [0x00,0x0c]
+# CHECK-EL: jalrs $ra, $6     # encoding: [0xe6,0x03,0x3c,0x4f]
+# CHECK-EL: move $zero, $zero # encoding: [0x00,0x0c]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
@@ -32,9 +36,15 @@
 # CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-EB: jr $7       # encoding: [0x00,0x07,0x0f,0x3c]
 # CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: jals 1328         # encoding: [0x74,0x00,0x02,0x98]
+# CHECK-EB: move $zero, $zero # encoding: [0x0c,0x00]
+# CHECK-EB: jalrs $ra, $6     # encoding: [0x03,0xe6,0x4f,0x3c]
+# CHECK-EB: move $zero, $zero # encoding: [0x0c,0x00]
 
      j 1328
      jal 1328
      jalr $ra, $6
      jr $7
      j $7
+     jals 1328
+     jalrs $ra, $6

diff --git a/test/MC/Mips/micromips-label-test-sections.s b/test/MC/Mips/micromips-label-test-sections.s
new file mode 100644
index 0000000..569b64c
--- /dev/null
+++ b/test/MC/Mips/micromips-label-test-sections.s

@@ -0,0 +1,35 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 \
+# RUN:   -mattr=+micromips -filetype=obj -o - | llvm-readobj -t | FileCheck %s
+  .text
+  .set micromips
+f:
+  nop
+g:
+  .section .text
+h:
+  nop
+
+# CHECK: Symbols [
+# CHECK:   Symbol {
+# CHECK:     Name: f
+# CHECK:     Binding: Local
+# CHECK:     Type: None
+# CHECK:     Other: 128
+# CHECK:     Section: .text
+# CHECK:   }
+# CHECK:   Symbol {
+# CHECK:     Name: g
+# CHECK:     Binding: Local
+# CHECK:     Type: None
+# CHECK:     Other: 0
+# CHECK:     Section: .text
+# CHECK:   }
+# CHECK:   Symbol {
+# CHECK:     Name: h
+# CHECK:     Binding: Local
+# CHECK:     Type: None
+# CHECK:     Other: 128
+# CHECK:     Section: .text
+# CHECK:   }
+# CHECK: ]
+

diff --git a/test/MC/Mips/micromips-label-test.s b/test/MC/Mips/micromips-label-test.s
new file mode 100644
index 0000000..cc1566b
--- /dev/null
+++ b/test/MC/Mips/micromips-label-test.s

@@ -0,0 +1,54 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 \
+# RUN:   -mattr=+micromips -filetype=obj -o - | llvm-readobj -t | FileCheck %s
+  .text
+  .set nomicromips
+f:
+  nop
+g:
+  .set micromips
+  nop
+h:
+  .word 0
+i:
+  nop
+j:
+  .set nomicromips
+  nop
+# CHECK: Symbols [
+# CHECK:   Symbol {
+# CHECK:     Name: f
+# CHECK:     Binding: Local
+# CHECK:     Type: None
+# CHECK:     Other: 0
+# CHECK:     Section: .text
+# CHECK:   }
+# CHECK:   Symbol {
+# CHECK:     Name: g
+# CHECK:     Binding: Local
+# CHECK:     Type: None
+# CHECK:     Other: 128
+# CHECK:     Section: .text
+# CHECK:   }
+# CHECK:   Symbol {
+# CHECK:     Name: h
+# CHECK:     Binding: Local
+# CHECK:     Type: None
+# CHECK:     Other: 0
+# CHECK:     Section: .text
+# CHECK:   }
+# CHECK:   Symbol {
+# CHECK:     Name: i
+# CHECK:     Binding: Local
+# CHECK:     Type: None
+# CHECK:     Other: 128
+# CHECK:     Section: .text
+# CHECK:   }
+# CHECK:   Symbol {
+# CHECK:     Name: j
+# CHECK:     Binding: Local
+# CHECK:     Type: None
+# CHECK:     Other: 0
+# CHECK:     Section: .text
+# CHECK:   }
+# CHECK: ]
+

diff --git a/test/MC/Mips/micromips-loadstore-instructions.s b/test/MC/Mips/micromips-loadstore-instructions.s
index 8a1b93b..62fa101 100644
--- a/test/MC/Mips/micromips-loadstore-instructions.s
+++ b/test/MC/Mips/micromips-loadstore-instructions.s

@@ -9,31 +9,49 @@
 #------------------------------------------------------------------------------
 # Little endian
 #------------------------------------------------------------------------------
-# CHECK-EL: lb     $5, 8($4)      # encoding: [0xa4,0x1c,0x08,0x00]
-# CHECK-EL: lbu    $6, 8($4)      # encoding: [0xc4,0x14,0x08,0x00]
-# CHECK-EL: lh     $2, 8($4)      # encoding: [0x44,0x3c,0x08,0x00]
-# CHECK-EL: lhu    $4, 8($2)      # encoding: [0x82,0x34,0x08,0x00]
-# CHECK-EL: lw     $6, 4($5)      # encoding: [0xc5,0xfc,0x04,0x00]
-# CHECK-EL: sb     $5, 8($4)      # encoding: [0xa4,0x18,0x08,0x00]
-# CHECK-EL: sh     $2, 8($4)      # encoding: [0x44,0x38,0x08,0x00]
-# CHECK-EL: sw     $5, 4($6)      # encoding: [0xa6,0xf8,0x04,0x00]
-# CHECK-EL: ll     $2, 8($4)      # encoding: [0x44,0x60,0x08,0x30]
-# CHECK-EL: sc     $2, 8($4)      # encoding: [0x44,0x60,0x08,0xb0]
-# CHECK-EL: lwu    $2, 8($4)      # encoding: [0x44,0x60,0x08,0xe0]
+# CHECK-EL: lb     $5, 8($4)                  # encoding: [0xa4,0x1c,0x08,0x00]
+# CHECK-EL: lbu    $6, 8($4)                  # encoding: [0xc4,0x14,0x08,0x00]
+# CHECK-EL: lh     $2, 8($4)                  # encoding: [0x44,0x3c,0x08,0x00]
+# CHECK-EL: lhu    $4, 8($2)                  # encoding: [0x82,0x34,0x08,0x00]
+# CHECK-EL: lw     $6, 4($5)                  # encoding: [0xc5,0xfc,0x04,0x00]
+# CHECK-EL: sb     $5, 8($4)                  # encoding: [0xa4,0x18,0x08,0x00]
+# CHECK-EL: sh     $2, 8($4)                  # encoding: [0x44,0x38,0x08,0x00]
+# CHECK-EL: sw     $5, 4($6)                  # encoding: [0xa6,0xf8,0x04,0x00]
+# CHECK-EL: ll     $2, 8($4)                  # encoding: [0x44,0x60,0x08,0x30]
+# CHECK-EL: sc     $2, 8($4)                  # encoding: [0x44,0x60,0x08,0xb0]
+# CHECK-EL: lwu    $2, 8($4)                  # encoding: [0x44,0x60,0x08,0xe0]
+# CHECK-EL: lwxs   $2, $3($4)                 # encoding: [0x64,0x00,0x18,0x11]
+# CHECK-EL: lwm32  $16, $17, 8($4)            # encoding: [0x44,0x20,0x08,0x50]
+# CHECK-EL: lwm32  $16, $17, $18, $19, 8($4)  # encoding: [0x84,0x20,0x08,0x50]
+# CHECK-EL: lwm32  $16, $17, $18, $19, $20, $21, $22, $23, $fp, 8($4)      # encoding: [0x24,0x21,0x08,0x50]
+# CHECK-EL: lwm32  $16, $17, $18, $19, $ra, 8($4)                          # encoding: [0x84,0x22,0x08,0x50]
+# CHECK-EL: lwm32  $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x24,0x23,0x08,0x50]
+# CHECK-EL: lwm32  $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x24,0x23,0x08,0x50]
+# CHECK-EL: swm32  $16, $17, 8($4)            # encoding: [0x44,0x20,0x08,0xd0]
+# CHECK-EL: swm32  $16, $17, $18, $19, 8($4)  # encoding: [0x84,0x20,0x08,0xd0]
 #------------------------------------------------------------------------------
 # Big endian
 #------------------------------------------------------------------------------
-# CHECK-EB: lb     $5, 8($4)      # encoding: [0x1c,0xa4,0x00,0x08]
-# CHECK-EB: lbu    $6, 8($4)      # encoding: [0x14,0xc4,0x00,0x08]
-# CHECK-EB: lh     $2, 8($4)      # encoding: [0x3c,0x44,0x00,0x08]
-# CHECK-EB: lhu    $4, 8($2)      # encoding: [0x34,0x82,0x00,0x08]
-# CHECK-EB: lw     $6, 4($5)      # encoding: [0xfc,0xc5,0x00,0x04]
-# CHECK-EB: sb     $5, 8($4)      # encoding: [0x18,0xa4,0x00,0x08]
-# CHECK-EB: sh     $2, 8($4)      # encoding: [0x38,0x44,0x00,0x08]
-# CHECK-EB: sw     $5, 4($6)      # encoding: [0xf8,0xa6,0x00,0x04]
-# CHECK-EB: ll     $2, 8($4)      # encoding: [0x60,0x44,0x30,0x08]
-# CHECK-EB: sc     $2, 8($4)      # encoding: [0x60,0x44,0xb0,0x08]
-# CHECK-EB: lwu    $2, 8($4)      # encoding: [0x60,0x44,0xe0,0x08]
+# CHECK-EB: lb     $5, 8($4)                 # encoding: [0x1c,0xa4,0x00,0x08]
+# CHECK-EB: lbu    $6, 8($4)                 # encoding: [0x14,0xc4,0x00,0x08]
+# CHECK-EB: lh     $2, 8($4)                 # encoding: [0x3c,0x44,0x00,0x08]
+# CHECK-EB: lhu    $4, 8($2)                 # encoding: [0x34,0x82,0x00,0x08]
+# CHECK-EB: lw     $6, 4($5)                 # encoding: [0xfc,0xc5,0x00,0x04]
+# CHECK-EB: sb     $5, 8($4)                 # encoding: [0x18,0xa4,0x00,0x08]
+# CHECK-EB: sh     $2, 8($4)                 # encoding: [0x38,0x44,0x00,0x08]
+# CHECK-EB: sw     $5, 4($6)                 # encoding: [0xf8,0xa6,0x00,0x04]
+# CHECK-EB: ll     $2, 8($4)                 # encoding: [0x60,0x44,0x30,0x08]
+# CHECK-EB: sc     $2, 8($4)                 # encoding: [0x60,0x44,0xb0,0x08]
+# CHECK-EB: lwu    $2, 8($4)                 # encoding: [0x60,0x44,0xe0,0x08]
+# CHECK-EB: lwxs   $2, $3($4)                # encoding: [0x00,0x64,0x11,0x18]
+# CHECK-EB: lwm32  $16, $17, 8($4)           # encoding: [0x20,0x44,0x50,0x08]
+# CHECK-EB: lwm32  $16, $17, $18, $19, 8($4) # encoding: [0x20,0x84,0x50,0x08]
+# CHECK-EB: lwm32  $16, $17, $18, $19, $20, $21, $22, $23, $fp, 8($4)      # encoding: [0x21,0x24,0x50,0x08]
+# CHECK-EB: lwm32  $16, $17, $18, $19, $ra, 8($4)                          # encoding: [0x22,0x84,0x50,0x08]
+# CHECK-EB: lwm32  $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x23,0x24,0x50,0x08]
+# CHECK-EB: lwm32  $16, $17, $18, $19, $20, $21, $22, $23, $fp, $ra, 8($4) # encoding: [0x23,0x24,0x50,0x08]
+# CHECK-EB: swm32  $16, $17, 8($4)           # encoding: [0x20,0x44,0xd0,0x08]
+# CHECK-EB: swm32  $16, $17, $18, $19, 8($4) # encoding: [0x20,0x84,0xd0,0x08]
      lb     $5, 8($4)
      lbu    $6, 8($4)
      lh     $2, 8($4)
@@ -45,3 +63,12 @@
      ll     $2, 8($4)
      sc     $2, 8($4)
      lwu    $2, 8($4)
+     lwxs   $2, $3($4)
+     lwm32  $16, $17, 8($4)
+     lwm32  $16 - $19, 8($4)
+     lwm32  $16-$23, $30, 8($4)
+     lwm32  $16-$19, $31, 8($4)
+     lwm32  $16-$23, $30, $31, 8($4)
+     lwm32  $16-$23, $30 - $31, 8($4)
+     swm32  $16, $17, 8($4)
+     swm32  $16 - $19, 8($4)

diff --git a/test/MC/Mips/mips-expansions-bad.s b/test/MC/Mips/mips-expansions-bad.s
index a137deb..8d85169 100644
--- a/test/MC/Mips/mips-expansions-bad.s
+++ b/test/MC/Mips/mips-expansions-bad.s

@@ -2,5 +2,5 @@
 # RUN: FileCheck %s < %t1
 
         .text
-        li $5, 0x100000000 # CHECK: :[[@LINE]]:9: error: instruction requires a CPU feature not currently enabled
-        dli $5, 1 # CHECK: :[[@LINE]]:9: error: instruction requires a CPU feature not currently enabled
+        li $5, 0x100000000 # CHECK: :[[@LINE]]:9: error: instruction requires a 64-bit architecture
+        dli $5, 1 # CHECK: :[[@LINE]]:9: error: instruction requires a 64-bit architecture

diff --git a/test/MC/Mips/mips-expansions.s b/test/MC/Mips/mips-expansions.s
index f0a04a5..bdc76fb 100644
--- a/test/MC/Mips/mips-expansions.s
+++ b/test/MC/Mips/mips-expansions.s

@@ -17,6 +17,22 @@
 # CHECK: lui     $7, 1               # encoding: [0x01,0x00,0x07,0x3c]
 # CHECK: ori     $7, $7, 2           # encoding: [0x02,0x00,0xe7,0x34]
 # CHECK: addu    $7, $7, $8          # encoding: [0x21,0x38,0xe8,0x00]
+# CHECK: lui     $8, %hi(symbol)     # encoding: [A,A,0x08,0x3c]
+                                     #   fixup A - offset: 0, value: symbol@ABS_HI, kind: fixup_Mips_HI16
+# CHECK: ori     $8, $8, %lo(symbol) # encoding: [A,A,0x08,0x35]
+                                     #   fixup A - offset: 0, value: symbol@ABS_LO, kind: fixup_Mips_LO16
+# CHECK: .set    mips64
+# CHECK: lui     $8, %highest(symbol)    # encoding: [A,A,0x08,0x3c]
+                                         #   fixup A - offset: 0, value: symbol@HIGHEST, kind: fixup_Mips_HIGHEST
+# CHECK: ori     $8, $8, %higher(symbol) # encoding: [A,A,0x08,0x35]
+                                         #   fixup A - offset: 0, value: symbol@HIGHER, kind: fixup_Mips_HIGHER
+# CHECK: dsll    $8, $8, 16              # encoding: [0x38,0x44,0x08,0x00]
+# CHECK: ori     $8, $8, %hi(symbol)     # encoding: [A,A,0x08,0x35]
+                                         #   fixup A - offset: 0, value: symbol@ABS_HI, kind: fixup_Mips_HI16
+# CHECK: dsll    $8, $8, 16              # encoding: [0x38,0x44,0x08,0x00]
+# CHECK: ori     $8, $8, %lo(symbol)     # encoding: [A,A,0x08,0x35]
+                                         #   fixup A - offset: 0, value: symbol@ABS_LO, kind: fixup_Mips_LO16
+# CHECK: .set    mips32r2
 # CHECK: lui     $10, %hi(symbol)        # encoding: [A,A,0x0a,0x3c]
 # CHECK:                                 #   fixup A - offset: 0, value: symbol@ABS_HI, kind: fixup_Mips_HI16
 # CHECK: addu    $10, $10, $4            # encoding: [0x21,0x50,0x44,0x01]
@@ -48,6 +64,10 @@
     la $7,65538
     la $a0, 20($a1)
     la $7,65538($8)
+    la $t0, symbol
+    .set mips64
+    la $t0, symbol
+    .set mips32r2
 
     .set noat
     lw  $t2, symbol($a0)

diff --git a/test/MC/Mips/mips-hwr-register-names.s b/test/MC/Mips/mips-hwr-register-names.s
new file mode 100644
index 0000000..3849675
--- /dev/null
+++ b/test/MC/Mips/mips-hwr-register-names.s

@@ -0,0 +1,199 @@
+# Check the hardware registers
+#
+# FIXME: Use the code generator in order to print the .set directives
+#        instead of the instruction printer.
+#
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips32r2 | \
+# RUN:      FileCheck %s
+        .set noat
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $hwr_cpunum
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x00,0x3b]
+        rdhwr     $a0,$hwr_cpunum
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $hwr_cpunum
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x00,0x3b]
+        rdhwr     $a0,$0
+
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $5, $hwr_synci_step
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x05,0x08,0x3b]
+        rdhwr     $a1,$hwr_synci_step
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $5, $hwr_synci_step
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x05,0x08,0x3b]
+        rdhwr     $a1,$1
+
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $6, $hwr_cc
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x06,0x10,0x3b]
+        rdhwr     $a2,$hwr_cc
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $6, $hwr_cc
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x06,0x10,0x3b]
+        rdhwr     $a2,$2
+
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $7, $hwr_ccres
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x07,0x18,0x3b]
+        rdhwr     $a3,$hwr_ccres
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $7, $hwr_ccres
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x07,0x18,0x3b]
+        rdhwr     $a3,$3
+
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $4
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x20,0x3b]
+        rdhwr     $a0,$4
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $5
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x28,0x3b]
+        rdhwr     $a0,$5
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $6
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x30,0x3b]
+        rdhwr     $a0,$6
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $7
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x38,0x3b]
+        rdhwr     $a0,$7
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $8
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x40,0x3b]
+        rdhwr     $a0,$8
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $9
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x48,0x3b]
+        rdhwr     $a0,$9
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $10
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x50,0x3b]
+        rdhwr     $a0,$10
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $11
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x58,0x3b]
+        rdhwr     $a0,$11
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $12
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x60,0x3b]
+        rdhwr     $a0,$12
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $13
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x68,0x3b]
+        rdhwr     $a0,$13
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $14
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x70,0x3b]
+        rdhwr     $a0,$14
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $15
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x78,0x3b]
+        rdhwr     $a0,$15
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $16
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x80,0x3b]
+        rdhwr     $a0,$16
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $17
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x88,0x3b]
+        rdhwr     $a0,$17
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $18
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x90,0x3b]
+        rdhwr     $a0,$18
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $19
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0x98,0x3b]
+        rdhwr     $a0,$19
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $20
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xa0,0x3b]
+        rdhwr     $a0,$20
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $21
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xa8,0x3b]
+        rdhwr     $a0,$21
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $22
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xb0,0x3b]
+        rdhwr     $a0,$22
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $23
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xb8,0x3b]
+        rdhwr     $a0,$23
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $24
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xc0,0x3b]
+        rdhwr     $a0,$24
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $25
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xc8,0x3b]
+        rdhwr     $a0,$25
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $26
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xd0,0x3b]
+        rdhwr     $a0,$26
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $27
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xd8,0x3b]
+        rdhwr     $a0,$27
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $28
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xe0,0x3b]
+        rdhwr     $a0,$28
+
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $29
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xe8,0x3b]
+        rdhwr     $a0,$29
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $29
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xe8,0x3b]
+        rdhwr     $a0,$hwr_ulr
+
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $30
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xf0,0x3b]
+        rdhwr     $a0,$30
+        # CHECK:      .set    push
+        # CHECK-NEXT: .set    mips32r2
+        # CHECK-NEXT: rdhwr   $4, $31
+        # CHECK-NEXT: .set    pop             # encoding: [0x7c,0x04,0xf8,0x3b]
+        rdhwr     $a0,$31

diff --git a/test/MC/Mips/mips-jump-delay-slots.s b/test/MC/Mips/mips-jump-delay-slots.s
new file mode 100644
index 0000000..49f6c15
--- /dev/null
+++ b/test/MC/Mips/mips-jump-delay-slots.s

@@ -0,0 +1,122 @@
+# Verify that the assembler emits a nop into the delay slot of every branch
+# and jump instruction, except for the branch likely instructions.
+#
+# RUN: llvm-mc %s -triple=mips-unknown-linux -mcpu=mips32r2 | FileCheck %s
+
+        .set noat
+        # CHECK: b 1332
+        # CHECK: nop
+        b 1332
+        # CHECK: bc1f 1332
+        # CHECK: nop
+        bc1f 1332
+        # CHECK: bc1t 1332
+        # CHECK: nop
+        bc1t 1332
+        # CHECK: beq $9, $6, 1332
+        # CHECK: nop
+        beq $9,$6,1332
+        # CHECK: bgez $6, 1332
+        # CHECK: nop
+        bgez $6,1332
+        # CHECK: bgezal $6, 1332
+        # CHECK: nop
+        bgezal $6,1332
+        # CHECK: bgtz $6, 1332
+        # CHECK: nop
+        bgtz $6,1332
+        # CHECK: blez $6, 1332
+        # CHECK: nop
+        blez $6,1332
+        # CHECK: bltz $6, 1332
+        # CHECK: nop
+        bltz $6,1332
+        # CHECK: bne $9, $6, 1332
+        # CHECK: nop
+        bne $9,$6,1332
+        # CHECK: bltzal $6, 1332
+        # CHECK: nop
+        bltzal $6,1332
+        # CHECK: bal 1332
+        # CHECK: nop
+        bal 1332
+        # CHECK: bnez $11, 1332
+        # CHECK: nop
+        bnez $11,1332
+        # CHECK: beqz $11, 1332
+        # CHECK: nop
+        beqz $11,1332
+
+        # CHECK: bc1fl 1332
+        # CHECK-NOT: nop
+        bc1fl 1332
+        # CHECK: bc1fl 1332
+        # CHECK-NOT: nop
+        bc1fl $fcc0, 1332
+        # CHECK: bc1fl $fcc3, 1332
+        # CHECK-NOT: nop
+        bc1fl $fcc3, 1332
+        # CHECK: bc1tl 1332
+        # CHECK-NOT: nop
+        bc1tl 1332
+        # CHECK: bc1tl 1332
+        # CHECK-NOT: nop
+        bc1tl $fcc0, 1332
+        # CHECK: bc1tl $fcc3, 1332
+        # CHECK-NOT: nop
+        bc1tl $fcc3, 1332
+        # CHECK: beql $9, $6, 1332
+        # CHECK-NOT: nop
+        beql $9,$6,1332
+        # CHECK: bnel $9, $6, 1332
+        # CHECK-NOT: nop
+        bnel $9,$6,1332
+        # CHECK: bgezl $6, 1332
+        # CHECK-NOT: nop
+        bgezl $6,1332
+        # CHECK: bgtzl $6, 1332
+        # CHECK-NOT: nop
+        bgtzl $6,1332
+        # CHECK: blezl $6, 1332
+        # CHECK-NOT: nop
+        blezl $6,1332
+        # CHECK: bltzl $6, 1332
+        # CHECK-NOT: nop
+        bltzl $6,1332
+        # CHECK: bgezall $6, 1332
+        # CHECK-NOT: nop
+        bgezall $6,1332
+        # CHECK: bltzall $6, 1332
+        # CHECK-NOT: nop
+        bltzall $6,1332
+
+        # CHECK: j 1328
+        # CHECK: nop
+        j 1328
+        # CHECK: jal 1328
+        # CHECK: nop
+        jal 1328
+        # CHECK: jalr $6
+        # CHECK: nop
+        jalr $6
+        # CHECK: jalr $25
+        # CHECK: nop
+        jalr $31,$25
+        # CHECK: jalr $10, $11
+        # CHECK: nop
+        jalr $10,$11
+        # CHECK: jr $7
+        # CHECK: nop
+        jr $7
+        # CHECK: jr $7
+        # CHECK: nop
+        j $7
+        # CHECK: jalr $25
+        # CHECK: nop
+        jal $25
+        # CHECK: jalr $4, $25
+        # CHECK: nop
+        jal $4,$25
+        # CHECK: jalx lab
+        # CHECK: nop
+        jalx lab

diff --git a/test/MC/Mips/mips-noat.s b/test/MC/Mips/mips-noat.s
index 07db251..f9d4efd 100644
--- a/test/MC/Mips/mips-noat.s
+++ b/test/MC/Mips/mips-noat.s

@@ -12,7 +12,7 @@
 
 test2:
         .set noat
-        lw      $2, 65536($2) # ERROR: mips-noat.s:[[@LINE]]:9: error: Pseudo instruction requires $at, which is not available
+        lw      $2, 65536($2) # ERROR: mips-noat.s:[[@LINE]]:9: error: pseudo-instruction requires $at, which is not available
 
 
 # Can we switch it back on successfully?
@@ -26,4 +26,4 @@
 
 test4:
         .set at=$0
-        lw      $2, 65536($2) # ERROR: mips-noat.s:[[@LINE]]:9: error: Pseudo instruction requires $at, which is not available
+        lw      $2, 65536($2) # ERROR: mips-noat.s:[[@LINE]]:9: error: pseudo-instruction requires $at, which is not available

diff --git a/test/MC/Mips/mips-pdr-bad.s b/test/MC/Mips/mips-pdr-bad.s
new file mode 100644
index 0000000..40c6ba2
--- /dev/null
+++ b/test/MC/Mips/mips-pdr-bad.s

@@ -0,0 +1,42 @@
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips32r2 2>%t1
+# RUN: FileCheck %s < %t1 -check-prefix=ASM
+
+        .text
+        
+        .ent # ASM: :[[@LINE]]:14: error: expected identifier after .ent
+        .ent bar, # ASM: :[[@LINE]]:19: error: expected number after comma
+        .ent foo, bar # ASM: :[[@LINE]]:23: error: expected an absolute expression after comma
+        .ent foo, 5, bar # ASM: :[[@LINE]]:20: error: unexpected token, expected end of statement
+ 
+        .frame # ASM: :[[@LINE]]:16: error: expected stack register
+        .frame bar # ASM: :[[@LINE]]:16: error: expected stack register
+        .frame $f1, 8, # ASM: :[[@LINE]]:16: error: expected general purpose register
+        .frame $sp # ASM: :[[@LINE]]:20: error: unexpected token, expected comma
+        .frame $sp, # ASM: :[[@LINE]]:21: error: expected frame size value
+        .frame $sp, bar # ASM: :[[@LINE]]:25: error: frame size not an absolute expression
+        .frame $sp, 8 # ASM: :[[@LINE]]:23: error: unexpected token, expected comma
+        .frame $sp, 8, # ASM: :[[@LINE]]:24: error: expected return register
+        .frame $sp, 8, $f1 # ASM: :[[@LINE]]:24: error: expected general purpose register
+        .frame $sp, 8, $ra, foo # ASM: :[[@LINE]]:27: error: unexpected token, expected end of statement
+
+        .mask  # ASM: :[[@LINE]]:16: error: expected bitmask value
+        .mask foo # ASM: :[[@LINE]]:19: error: bitmask not an absolute expression
+        .mask 0x80000000 # ASM: :[[@LINE]]:26: error: unexpected token, expected comma
+        .mask 0x80000000, # ASM: :[[@LINE]]:27: error: expected frame offset value
+        .mask 0x80000000, foo # ASM: :[[@LINE]]:31: error: frame offset not an absolute expression
+        .mask 0x80000000, -4, bar # ASM: :[[@LINE]]:29: error: unexpected token, expected end of statement
+        
+        .fmask  # ASM: :[[@LINE]]:17: error: expected bitmask value
+        .fmask foo # ASM: :[[@LINE]]:20: error: bitmask not an absolute expression
+        .fmask 0x80000000 # ASM: :[[@LINE]]:27: error: unexpected token, expected comma
+        .fmask 0x80000000, # ASM: :[[@LINE]]:28: error: expected frame offset value
+        .fmask 0x80000000, foo # ASM: :[[@LINE]]:32: error: frame offset not an absolute expression
+        .fmask 0x80000000, -4, bar # ASM: :[[@LINE]]:30: error: unexpected token, expected end of statement
+
+        .end # ASM: :[[@LINE]]:14: error: expected identifier after .end
+        .ent _local_foo_bar
+        .end _local_foo_bar, foo # ASM: :[[@LINE]]:28: error: unexpected token, expected end of statement
+        .end _local_foo_bar
+        .end _local_foo # ASM: :[[@LINE]]:25: error: .end used without .ent
+        .ent _local_foo, 2
+        .end _local_foo_bar # ASM: :[[@LINE]]:29: error: .end symbol does not match .ent symbol

diff --git a/test/MC/Mips/mips-pdr.s b/test/MC/Mips/mips-pdr.s
new file mode 100644
index 0000000..372c259
--- /dev/null
+++ b/test/MC/Mips/mips-pdr.s

@@ -0,0 +1,68 @@
+# Check that the .ent/.frame/.mask/.fmask/.end directives are echoed in the
+# assembly output and that each .ent/.end pair produces a 32-byte entry in the
+# .pdr section, with one .rel.pdr relocation per entry.
+
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -filetype=asm | \
+# RUN:   FileCheck %s -check-prefix=ASMOUT
+
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \
+# RUN:   llvm-readobj -s -section-data | \
+# RUN:     FileCheck %s -check-prefix=OBJOUT
+
+# ASMOUT: .text
+# ASMOUT:        .type _local_foo,@function
+# ASMOUT:        .ent _local_foo
+# ASMOUT:_local_foo:
+# ASMOUT:        .frame $fp,16,$ra
+# ASMOUT:        .mask 0x10101010,-4
+# ASMOUT:        .fmask 0x01010101,-8
+# ASMOUT:        .end _local_foo
+# ASMOUT:        .size _local_foo,
+
+# OBJOUT: Section {
+# OBJOUT:     Name: .pdr
+# OBJOUT:     Type: SHT_PROGBITS (0x1)
+# OBJOUT:     Flags [ (0xB)
+# OBJOUT:       SHF_ALLOC (0x2)
+# OBJOUT:       SHF_WRITE (0x1)
+# OBJOUT:     ]
+# OBJOUT:     Size: 64
+# OBJOUT:     SectionData (
+# OBJOUT:       0000: 00000000 10101010 FFFFFFFC 01010101
+# OBJOUT:       0010: FFFFFFF8 00000010 0000001E 0000001F
+# OBJOUT:       0020: 00000000 10101010 FFFFFFFC 01010101
+# OBJOUT:       0030: FFFFFFF8 00000010 0000001E 0000001F
+# OBJOUT:     )
+# OBJOUT:   }
+
+# We should also check if relocation information was correctly generated.
+# OBJOUT:   Section {
+# OBJOUT:     Name: .rel.pdr
+# OBJOUT:     Type: SHT_REL (0x9)
+# OBJOUT:     Flags [ (0x0)
+# OBJOUT:     ]
+# OBJOUT:     Size: 16
+# OBJOUT:     SectionData (
+# OBJOUT:       0000: 00000000 00000202 00000020 00000802
+# OBJOUT:     )
+# OBJOUT:   }
+
+.text
+        .type _local_foo,@function
+        .ent _local_foo
+_local_foo:
+        .frame $fp,16,$ra
+        .mask 0x10101010,-4
+        .fmask 0x01010101,-8
+        .end _local_foo
+        .size _local_foo,.-_local_foo
+
+        .globl _global_foo
+        .type _global_foo,@function
+        .ent _global_foo
+_global_foo:
+        .frame $fp,16,$ra
+        .mask 0x10101010,-4
+        .fmask 0x01010101,-8
+        .end _global_foo
+        .size _global_foo,.-_global_foo

diff --git a/test/MC/Mips/mips-reginfo-fp32.s b/test/MC/Mips/mips-reginfo-fp32.s
new file mode 100644
index 0000000..5b31884
--- /dev/null
+++ b/test/MC/Mips/mips-reginfo-fp32.s

@@ -0,0 +1,34 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \
+# RUN:   llvm-readobj -s -section-data | \
+# RUN:     FileCheck %s
+
+# CHECK:   Section {
+# CHECK:    Index:
+# CHECK:    Name: .reginfo
+# CHECK:    Type: SHT_MIPS_REGINFO (0x70000006)
+# CHECK:    Flags [ (0x2)
+# CHECK:      SHF_ALLOC (0x2)
+# CHECK:    ]
+# CHECK:    Size: 24
+# CHECK:    SectionData (
+# CHECK:      0000: 01010101 00000000 C0007535 00000000
+# CHECK:      0010: 00000000 00000000
+# CHECK:    )
+# CHECK:  }
+
+.text
+        add $0,$0,$0        # Uses $0:  bit 0 of the expected GPR mask (0x01010101).
+        add $8,$0,$0        # Uses $8:  bit 8 of the expected GPR mask.
+        add $16,$0,$0       # Uses $16: bit 16 of the expected GPR mask.
+        add $24,$0,$0       # Uses $24: bit 24 of the expected GPR mask.
+
+# abs.s - Reads and writes from/to $f0.
+        abs.s $f0,$f0
+# round.w.d - Reads $f4 and $f5 and writes to $f2.
+        round.w.d $f2,$f4
+# ceil.w.s - Reads $f8 and writes to $f10.
+        ceil.w.s $f10, $f8
+# cvt.s.d - Reads from $f12 and $f13 and writes to $f14.
+        cvt.s.d $f14, $f12
+# abs.d - Reads from $f30 and $f31 and writes to $f30 and $f31.
+        abs.d $f30,$f30

diff --git a/test/MC/Mips/mips-reginfo-fp64.s b/test/MC/Mips/mips-reginfo-fp64.s
new file mode 100644
index 0000000..b60e54e
--- /dev/null
+++ b/test/MC/Mips/mips-reginfo-fp64.s

@@ -0,0 +1,60 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 -mattr=+msa,+fp64 -filetype=obj -o - | \
+# RUN:   llvm-readobj -s -section-data | \
+# RUN:     FileCheck %s -check-prefix=ELF32
+
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64,-n64,+n32 -filetype=obj -o - | \
+# RUN:   llvm-readobj -s -section-data | \
+# RUN:     FileCheck %s -check-prefix=ELF32
+
+# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64r2 -mattr=+msa,+fp64,+n64 -filetype=obj -o - | \
+# RUN:   llvm-readobj -s -section-data | \
+# RUN:     FileCheck %s -check-prefix=ELF64
+
+# ELF32:   Section {
+# ELF32:    Name: .reginfo
+# ELF32:    Type: SHT_MIPS_REGINFO (0x70000006)
+# ELF32:    Flags [ (0x2)
+# ELF32:      SHF_ALLOC (0x2)
+# ELF32:    ]
+# ELF32:    Size: 24
+# ELF32:    SectionData (
+# ELF32:      0000: 01010101 00000000 4C005515 00000000
+# ELF32:      0010: 00000000 00000000
+# ELF32:    )
+# ELF32:  }
+
+# ELF64:   Section {
+# ELF64:    Name: .MIPS.options
+# ELF64:    Type: SHT_MIPS_OPTIONS (0x7000000D)
+# ELF64:    Flags [ (0x8000002)
+# ELF64:      SHF_ALLOC (0x2)
+# ELF64:      SHF_MIPS_NOSTRIP (0x8000000)
+# ELF64:    ]
+# ELF64:    Size: 40
+# ELF64:    SectionData (
+# ELF64:      0000: 01280000 00000000 01010101 00000000
+# ELF64:      0010: 00000000 4C005515 00000000 00000000
+# ELF64:      0020: 00000000 00000000
+# ELF64:    )
+# ELF64:  }
+
+.text
+        add $0,$0,$0        # Uses $0:  bit 0 of the expected GPR mask (0x01010101).
+        add $8,$0,$0        # Uses $8:  bit 8 of the expected GPR mask.
+        add $16,$0,$0       # Uses $16: bit 16 of the expected GPR mask.
+        add $24,$0,$0       # Uses $24: bit 24 of the expected GPR mask.
+
+# abs.s - Reads and writes from/to $f0.
+        abs.s $f0,$f0
+# round.w.d - Reads $f4 and writes to $f2.
+        round.w.d $f2,$f4
+# ceil.w.s - Reads $f8 and writes to $f10.
+        ceil.w.s $f10, $f8
+# cvt.s.d - Reads from $f12 and writes to $f14.
+        cvt.s.d $f14, $f12
+# abs.d - Reads from $f30 and writes to $f30.
+        abs.d $f30,$f30
+
+# add_a.b - Reads and writes from/to $f26 and $f27 (via $w26 and $w27).
+        add_a.b $w26,$w26,$w26
+        add_a.b $w27,$w27,$w27

diff --git a/test/MC/Mips/mips1/invalid-mips2.s b/test/MC/Mips/mips1/invalid-mips2.s
index 7db261d..29bd223 100644
--- a/test/MC/Mips/mips1/invalid-mips2.s
+++ b/test/MC/Mips/mips1/invalid-mips2.s

@@ -5,6 +5,18 @@
 # RUN: FileCheck %s < %t1
 
         .set noat
+        bc1fl     $fcc0,-8239     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1fl     -8239           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1tl     $fcc0,-8239     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1tl     -8239           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        beql      $14,$s3,12544   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bgezall   $12,7293        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bgezl     $4,-6858        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bgtzl     $10,-3738       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        blezl     $6,2974         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bltzall   $6,488          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bltzl     $s1,-9964       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bnel      $gp,$s4,5107    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ceil.w.d  $f11,$f25       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ceil.w.s  $f6,$f20        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         floor.w.d $f14,$f11       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -13,11 +25,23 @@
         round.w.s $f27,$f28       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         sqrt.d    $f17,$f22       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         sqrt.s    $f0,$f1         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        teq       $0,$3           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        teq       $5,$7,620       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         teqi      $s5,-17504      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tge       $7,$10          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tge       $5,$19,340      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         tgei      $s1,5025        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         tgeiu     $sp,-28621      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tgeu      $22,$28         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tgeu      $20,$14,379     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tlt       $15,$13         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tlt       $2,$19,133      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         tlti      $t6,-21059      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         tltiu     $ra,-5076       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tltu      $11,$16         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tltu      $16,$29,1016    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tne       $6,$17          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        tne       $7,$8,885       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         tnei      $t4,-29647      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         trunc.w.d $f22,$f15       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         trunc.w.s $f28,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips1/invalid-mips3.s b/test/MC/Mips/mips1/invalid-mips3.s
index d1b0eec..d4be08e 100644
--- a/test/MC/Mips/mips1/invalid-mips3.s
+++ b/test/MC/Mips/mips1/invalid-mips3.s

@@ -19,6 +19,8 @@
         daddi     $sp,$s4,-27705    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         daddiu    $k0,$s6,-4586     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         daddu     $s3,$at,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        daddu     $24,$2,18079      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        daddu     $19,26943         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ddiv      $zero,$k0,$s3     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ddivu     $zero,$s0,$s1     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         dmfc1     $12,$f13          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -43,6 +45,8 @@
         dsrl32    $s3,$6,23         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         dsrlv     $s3,$14,$s4       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         dsubu     $a1,$a1,$k0       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        dsubu     $15,$11,5025      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        dsubu     $14,-4586         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         floor.l.d $f26,$f7          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         floor.l.s $f12,$f5          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         floor.w.d $f14,$f11         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips1/invalid-mips32r2.s b/test/MC/Mips/mips1/invalid-mips32r2.s
new file mode 100644
index 0000000..679f21f
--- /dev/null
+++ b/test/MC/Mips/mips1/invalid-mips32r2.s

@@ -0,0 +1,11 @@
+# MIPS32r2 instructions (di/ei, with and without an operand) that must be
+# rejected when assembling for MIPS I (-mcpu=mips1).
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips1 \
+# RUN:     2>%t1
+# RUN: FileCheck %s < %t1
+
+        .set noat
+        di      $s8                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        di                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei      $t6                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips1/invalid-mips4-wrong-error.s b/test/MC/Mips/mips1/invalid-mips4-wrong-error.s
index 2016e70..cec30c8 100644
--- a/test/MC/Mips/mips1/invalid-mips4-wrong-error.s
+++ b/test/MC/Mips/mips1/invalid-mips4-wrong-error.s

@@ -6,6 +6,8 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
+        bc1fl     $fcc7,27          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+        bc1tl     $fcc7,27          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         ld        $sp,-28645($s1)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         ldc1      $f11,16391($s0)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         ldc2      $8,-21181($at)    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction

diff --git a/test/MC/Mips/mips1/invalid-mips5-wrong-error.s b/test/MC/Mips/mips1/invalid-mips5-wrong-error.s
index 74473a3..18c0b61 100644
--- a/test/MC/Mips/mips1/invalid-mips5-wrong-error.s
+++ b/test/MC/Mips/mips1/invalid-mips5-wrong-error.s

@@ -6,41 +6,41 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
-        abs.ps    $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        add.ps    $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        alnv.ps   $f12,$f18,$f30,$t0  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.eq.ps   $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.f.ps    $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.le.ps   $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.lt.ps   $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.nge.ps  $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngl.ps  $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngle.ps $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngt.ps  $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ole.ps  $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.olt.ps  $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.seq.ps  $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.sf.ps   $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ueq.ps  $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ule.ps  $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ult.ps  $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.un.ps   $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.ps.s  $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.s.pl  $f30,$f1            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.s.pu  $f14,$f25           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        madd.ps   $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mov.ps    $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movf.ps   $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movn.ps   $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movt.ps   $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movz.ps   $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        msub.ps   $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mul.ps    $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        neg.ps    $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmadd.ps  $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmsub.ps  $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pll.ps    $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        plu.ps    $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pul.ps    $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        puu.ps    $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        sub.ps    $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        abs.ps    $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        add.ps    $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        alnv.ps   $f12,$f18,$f30,$t0  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.eq.ps   $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.f.ps    $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.le.ps   $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.lt.ps   $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.nge.ps  $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngl.ps  $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngle.ps $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngt.ps  $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ole.ps  $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.olt.ps  $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.seq.ps  $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.sf.ps   $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ueq.ps  $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ule.ps  $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ult.ps  $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.un.ps   $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.ps.s  $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.s.pl  $f30,$f1            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.s.pu  $f14,$f25           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        madd.ps   $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mov.ps    $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movf.ps   $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movn.ps   $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movt.ps   $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movz.ps   $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        msub.ps   $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mul.ps    $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        neg.ps    $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmadd.ps  $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmsub.ps  $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pll.ps    $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        plu.ps    $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pul.ps    $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        puu.ps    $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        sub.ps    $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips1/valid.s b/test/MC/Mips/mips1/valid.s
index 66e11ba..53ff6a0 100644
--- a/test/MC/Mips/mips1/valid.s
+++ b/test/MC/Mips/mips1/valid.s

@@ -10,7 +10,10 @@
         add.s     $f8,$f21,$f24
         addi      $13,$9,26322
         addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
         addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         and       $s7,$v0,$12
         and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
         bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
@@ -97,6 +100,8 @@
         srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
         ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
         sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
         sub.d     $f18,$f3,$f17
         sub.s     $f23,$f22,$f22
         subu      $sp,$s6,$s6

diff --git a/test/MC/Mips/mips2/invalid-mips3.s b/test/MC/Mips/mips2/invalid-mips3.s
index 458c416..e72b228 100644
--- a/test/MC/Mips/mips2/invalid-mips3.s
+++ b/test/MC/Mips/mips2/invalid-mips3.s

@@ -15,6 +15,8 @@
         daddi      $sp,$s4,-27705    # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         daddiu     $k0,$s6,-4586     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         daddu      $s3,$at,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        daddu      $24,$2,18079      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        daddu      $19,26943         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ddiv       $zero,$k0,$s3     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ddivu      $zero,$s0,$s1     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         dmfc1      $t0,$f13          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -40,6 +42,8 @@
         dsrlv      $s3,$t2,$s4       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         dsub       $a3,$s6,$a4       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         dsubu      $a1,$a1,$k0       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        dsubu      $15,$11,5025      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        dsubu      $14,-4586         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         eret                         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         floor.l.d  $f26,$f7          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         floor.l.s  $f12,$f5          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips2/invalid-mips32r2.s b/test/MC/Mips/mips2/invalid-mips32r2.s
index 72a570a..6dc8159 100644
--- a/test/MC/Mips/mips2/invalid-mips32r2.s
+++ b/test/MC/Mips/mips2/invalid-mips32r2.s

@@ -13,7 +13,9 @@
         cvt.l.s $f11,$f29           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         deret                       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         di      $s8                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        di                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ei      $t6                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         eret                        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ldxc1   $f8,$s7($t7)        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         luxc1   $f19,$s6($s5)       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips2/invalid-mips4-wrong-error.s b/test/MC/Mips/mips2/invalid-mips4-wrong-error.s
index 193f6d7..28a98ba 100644
--- a/test/MC/Mips/mips2/invalid-mips4-wrong-error.s
+++ b/test/MC/Mips/mips2/invalid-mips4-wrong-error.s

@@ -6,6 +6,8 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
+        bc1fl     $fcc7,27        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+        bc1tl     $fcc7,27        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         ld        $sp,-28645($s1) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         lwu       $s3,-24086($v1) # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         scd       $15,-8243($sp)  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction

diff --git a/test/MC/Mips/mips2/invalid-mips5-wrong-error.s b/test/MC/Mips/mips2/invalid-mips5-wrong-error.s
index 0c58c6c..5eaeaa2 100644
--- a/test/MC/Mips/mips2/invalid-mips5-wrong-error.s
+++ b/test/MC/Mips/mips2/invalid-mips5-wrong-error.s

@@ -6,41 +6,41 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
-        abs.ps    $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        add.ps    $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        alnv.ps   $f12,$f18,$f30,$t0  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.eq.ps   $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.f.ps    $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.le.ps   $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.lt.ps   $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.nge.ps  $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngl.ps  $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngle.ps $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngt.ps  $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ole.ps  $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.olt.ps  $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.seq.ps  $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.sf.ps   $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ueq.ps  $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ule.ps  $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ult.ps  $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.un.ps   $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.ps.s  $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.s.pl  $f30,$f1            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.s.pu  $f14,$f25           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        madd.ps   $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mov.ps    $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movf.ps   $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movn.ps   $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movt.ps   $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movz.ps   $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        msub.ps   $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mul.ps    $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        neg.ps    $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmadd.ps  $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmsub.ps  $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pll.ps    $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        plu.ps    $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pul.ps    $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        puu.ps    $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        sub.ps    $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        abs.ps    $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        add.ps    $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        alnv.ps   $f12,$f18,$f30,$t0  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.eq.ps   $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.f.ps    $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.le.ps   $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.lt.ps   $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.nge.ps  $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngl.ps  $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngle.ps $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngt.ps  $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ole.ps  $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.olt.ps  $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.seq.ps  $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.sf.ps   $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ueq.ps  $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ule.ps  $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ult.ps  $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.un.ps   $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.ps.s  $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.s.pl  $f30,$f1            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.s.pu  $f14,$f25           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        madd.ps   $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mov.ps    $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movf.ps   $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movn.ps   $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movt.ps   $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movz.ps   $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        msub.ps   $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mul.ps    $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        neg.ps    $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmadd.ps  $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmsub.ps  $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pll.ps    $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        plu.ps    $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pul.ps    $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        puu.ps    $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        sub.ps    $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips2/valid.s b/test/MC/Mips/mips2/valid.s
index 9c3706e..34843bc 100644
--- a/test/MC/Mips/mips2/valid.s
+++ b/test/MC/Mips/mips2/valid.s

@@ -6,21 +6,36 @@
         abs.d     $f7,$f25             # CHECK: encoding:
         abs.s     $f9,$f16
         add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
         add.d     $f1,$f7,$f29
         add.s     $f8,$f21,$f24
         addi      $13,$9,26322
         addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
         addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         and       $s7,$v0,$12
         and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
         bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
         bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,50             # CHECK: bc1fl 50      # encoding: [0x45,0x02,0x00,0x0c]
+        bc1fl     50                   # CHECK: bc1fl 50      # encoding: [0x45,0x02,0x00,0x0c]
         bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
         bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,-8239          # CHECK: bc1tl -8239   # encoding: [0x45,0x03,0xf7,0xf4]
+        bc1tl     -8239                # CHECK: bc1tl -8239   # encoding: [0x45,0x03,0xf7,0xf4]
         bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
         bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
         c.ngl.d   $f29,$f29
         c.ngle.d  $f0,$f16
         c.sf.d    $f30,$f0
@@ -113,6 +128,8 @@
         srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
         ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
         sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
         sub.d     $f18,$f3,$f17
         sub.s     $f23,$f22,$f22
         subu      $sp,$s6,$s6
@@ -123,15 +140,27 @@
         swl       $15,13694($s3)
         swr       $s1,-26590($14)
         sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
         teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
         tgei      $s1,5025
         tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
         tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
         tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
         tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
         tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
         tlti      $14,-21059
         tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
         tnei      $12,-29647
         trunc.w.d $f22,$f15
         trunc.w.s $f28,$f30

diff --git a/test/MC/Mips/mips3/invalid-mips32r2.s b/test/MC/Mips/mips3/invalid-mips32r2.s
new file mode 100644
index 0000000..178e0f0
--- /dev/null
+++ b/test/MC/Mips/mips3/invalid-mips32r2.s

@@ -0,0 +1,11 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips3 \
+# RUN:     2>%t1
+# RUN: FileCheck %s < %t1
+
+        .set noat
+        di      $s8                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        di                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei      $t6                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips3/invalid-mips4-wrong-error.s b/test/MC/Mips/mips3/invalid-mips4-wrong-error.s
new file mode 100644
index 0000000..c9af39a
--- /dev/null
+++ b/test/MC/Mips/mips3/invalid-mips4-wrong-error.s

@@ -0,0 +1,10 @@
+# Instructions that are invalid and are correctly rejected but use the wrong
+# error message at the moment.
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips3 \
+# RUN:     2>%t1
+# RUN: FileCheck %s < %t1
+
+	.set noat
+        bc1fl     $fcc7,27          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
+        bc1tl     $fcc7,27          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction

diff --git a/test/MC/Mips/mips3/invalid-mips5-wrong-error.s b/test/MC/Mips/mips3/invalid-mips5-wrong-error.s
index 2c0246a..cf809d3 100644
--- a/test/MC/Mips/mips3/invalid-mips5-wrong-error.s
+++ b/test/MC/Mips/mips3/invalid-mips5-wrong-error.s

@@ -6,41 +6,41 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
-        abs.ps    $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        add.ps    $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        alnv.ps   $f12,$f18,$f30,$t0  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.eq.ps   $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.f.ps    $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.le.ps   $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.lt.ps   $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.nge.ps  $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngl.ps  $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngle.ps $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngt.ps  $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ole.ps  $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.olt.ps  $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.seq.ps  $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.sf.ps   $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ueq.ps  $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ule.ps  $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ult.ps  $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.un.ps   $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.ps.s  $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.s.pl  $f30,$f1            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.s.pu  $f14,$f25           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        madd.ps   $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mov.ps    $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movf.ps   $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movn.ps   $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movt.ps   $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movz.ps   $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        msub.ps   $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mul.ps    $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        neg.ps    $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmadd.ps  $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmsub.ps  $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pll.ps    $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        plu.ps    $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pul.ps    $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        puu.ps    $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        sub.ps    $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        abs.ps    $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        add.ps    $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        alnv.ps   $f12,$f18,$f30,$t0  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.eq.ps   $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.f.ps    $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.le.ps   $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.lt.ps   $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.nge.ps  $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngl.ps  $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngle.ps $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngt.ps  $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ole.ps  $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.olt.ps  $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.seq.ps  $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.sf.ps   $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ueq.ps  $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ule.ps  $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ult.ps  $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.un.ps   $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.ps.s  $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.s.pl  $f30,$f1            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.s.pu  $f14,$f25           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        madd.ps   $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mov.ps    $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movf.ps   $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movn.ps   $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movt.ps   $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movz.ps   $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        msub.ps   $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mul.ps    $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        neg.ps    $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmadd.ps  $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmsub.ps  $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pll.ps    $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        plu.ps    $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pul.ps    $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        puu.ps    $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        sub.ps    $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips3/valid.s b/test/MC/Mips/mips3/valid.s
index cb209fd..a55576d 100644
--- a/test/MC/Mips/mips3/valid.s
+++ b/test/MC/Mips/mips3/valid.s

@@ -6,21 +6,36 @@
         abs.d     $f7,$f25             # CHECK: encoding:
         abs.s     $f9,$f16
         add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
         add.d     $f1,$f7,$f29
         add.s     $f8,$f21,$f24
         addi      $13,$9,26322
         addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
         addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         and       $s7,$v0,$12
         and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
         bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
         bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,50             # CHECK: bc1fl 50      # encoding: [0x45,0x02,0x00,0x0c]
+        bc1fl     50                   # CHECK: bc1fl 50      # encoding: [0x45,0x02,0x00,0x0c]
         bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
         bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,-8239          # CHECK: bc1tl -8239   # encoding: [0x45,0x03,0xf7,0xf4]
+        bc1tl     -8239                # CHECK: bc1tl -8239   # encoding: [0x45,0x03,0xf7,0xf4]
         bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
         bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
         cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
         c.ngl.d   $f29,$f29
         c.ngle.d  $f0,$f16
@@ -50,6 +65,8 @@
         daddi     $sp,-27705           # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
         daddiu    $k0,$s6,-4586
         daddu     $s3,$at,$ra
+        daddu     $24,$2,18079         # CHECK: daddiu $24, $2, 18079   # encoding: [0x64,0x58,0x46,0x9f]
+        daddu     $19,26943            # CHECK: daddiu $19, $19, 26943  # encoding: [0x66,0x73,0x69,0x3f]
         ddiv      $zero,$k0,$s3
         ddivu     $zero,$s0,$s1
         div       $zero,$25,$11
@@ -84,6 +101,8 @@
         dsubi     $sp,$s4,-27705       # CHECK: daddi $sp, $20, 27705  # encoding: [0x62,0x9d,0x6c,0x39]
         dsubi     $sp,-27705           # CHECK: daddi $sp, $sp, 27705  # encoding: [0x63,0xbd,0x6c,0x39]
         dsubu     $a1,$a1,$k0
+        dsubu     $15,$11,5025         # CHECK: daddiu $15, $11, -5025  # encoding: [0x65,0x6f,0xec,0x5f]
+        dsubu     $14,-4586            # CHECK: daddiu $14, $14, 4586   # encoding: [0x65,0xce,0x11,0xea]
         ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
         eret
         floor.l.d $f26,$f7
@@ -171,6 +190,8 @@
         srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
         ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
         sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
         sub.d     $f18,$f3,$f17
         sub.s     $f23,$f22,$f22
         subu      $sp,$s6,$s6
@@ -180,15 +201,27 @@
         swl       $15,13694($s3)
         swr       $s1,-26590($14)
         sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
         teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
         tgei      $s1,5025
         tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
         tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
         tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
         tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
         tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
         tlti      $14,-21059
         tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
         tnei      $12,-29647
         trunc.l.d $f23,$f23
         trunc.l.s $f28,$f31

diff --git a/test/MC/Mips/mips32/abiflags.s b/test/MC/Mips/mips32/abiflags.s
index 896dd84..dd772c0 100644
--- a/test/MC/Mips/mips32/abiflags.s
+++ b/test/MC/Mips/mips32/abiflags.s

@@ -8,27 +8,26 @@
 # CHECK-ASM: .module fp=32
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ:  Section {
-# CHECK-OBJ:    Index: 5
-# CHECK-OBJ:    Name: .MIPS.abiflags (12)
-# CHECK-OBJ:    Type:  (0x7000002A)
-# CHECK-OBJ:     Flags [ (0x2)
-# CHECK-OBJ:      SHF_ALLOC (0x2)
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    Address: 0x0
-# CHECK-OBJ:    Offset: 0x50
-# CHECK-OBJ:    Size: 24
-# CHECK-OBJ:    Link: 0
-# CHECK-OBJ:    Info: 0
-# CHECK-OBJ:    AddressAlignment: 8
-# CHECK-OBJ:    EntrySize: 0
-# CHECK-OBJ:    Relocations [
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    SectionData (
-# CHECK-OBJ:      0000: 00002001 01010001 00000000 00000000  |.. .............|
-# CHECK-OBJ:      0010: 00000000 00000000                    |........|
-# CHECK-OBJ:    )
-# CHECK-OBJ:  }
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags (12)
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00002001 01010001 00000000 00000000  |.. .............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
 
         .module fp=32
 

diff --git a/test/MC/Mips/mips32/invalid-mips32r2.s b/test/MC/Mips/mips32/invalid-mips32r2.s
index fa6fe32..07a1e8f 100644
--- a/test/MC/Mips/mips32/invalid-mips32r2.s
+++ b/test/MC/Mips/mips32/invalid-mips32r2.s

@@ -8,7 +8,9 @@
         cvt.l.d $f24,$f15           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         cvt.l.s $f11,$f29           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         di      $s8                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        di                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ei      $t6                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ldxc1   $f8,$s7($t7)        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         luxc1   $f19,$s6($s5)       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         lwxc1   $f12,$s1($s8)       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips32/valid.s b/test/MC/Mips/mips32/valid.s
index d330905..d79c390 100644
--- a/test/MC/Mips/mips32/valid.s
+++ b/test/MC/Mips/mips32/valid.s

@@ -6,23 +6,40 @@
         abs.d     $f7,$f25             # CHECK: encoding:
         abs.s     $f9,$f16
         add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
         add.d     $f1,$f7,$f29
         add.s     $f8,$f21,$f24
         addi      $13,$9,26322
         addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
         addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         and       $s7,$v0,$12
         and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
         bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
         bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
         bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
         bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
         bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
         bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
         bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
         bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
         cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
         c.ngl.d   $f29,$f29
         c.ngle.d  $f0,$f16
@@ -141,6 +158,8 @@
         srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
         ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
         sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
         sub.d     $f18,$f3,$f17
         sub.s     $f23,$f22,$f22
         subu      $sp,$s6,$s6
@@ -151,15 +170,27 @@
         swr       $s1,-26590($14)
         sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
         sync      1                    # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
         teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
         tgei      $s1,5025
         tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
         tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
         tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
         tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
         tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
         tlti      $14,-21059
         tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
         tnei      $12,-29647
         trunc.w.d $f22,$f15
         trunc.w.s $f28,$f30

diff --git a/test/MC/Mips/mips32r2/abiflags.s b/test/MC/Mips/mips32r2/abiflags.s
index 41a809a..e3bb15b 100644
--- a/test/MC/Mips/mips32r2/abiflags.s
+++ b/test/MC/Mips/mips32r2/abiflags.s

@@ -9,27 +9,26 @@
 # CHECK-ASM: .set fp=64
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ:  Section {
-# CHECK-OBJ:    Index: 5
-# CHECK-OBJ:    Name: .MIPS.abiflags (12)
-# CHECK-OBJ:    Type:  (0x7000002A)
-# CHECK-OBJ:     Flags [ (0x2)
-# CHECK-OBJ:      SHF_ALLOC (0x2)
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    Address: 0x0
-# CHECK-OBJ:    Offset: 0x50
-# CHECK-OBJ:    Size: 24
-# CHECK-OBJ:    Link: 0
-# CHECK-OBJ:    Info: 0
-# CHECK-OBJ:    AddressAlignment: 8
-# CHECK-OBJ:    EntrySize: 0
-# CHECK-OBJ:    Relocations [
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    SectionData (
-# CHECK-OBJ:      0000: 00002002 01010001 00000000 00000000  |.. .............|
-# CHECK-OBJ:      0010: 00000000 00000000                    |........|
-# CHECK-OBJ:    )
-# CHECK-OBJ:  }
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags (12)
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00002002 01010001 00000000 00000000  |.. .............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
 
         .module fp=32
         .set fp=64

diff --git a/test/MC/Mips/mips32r2/valid.s b/test/MC/Mips/mips32r2/valid.s
index 631c691..4ef5aab 100644
--- a/test/MC/Mips/mips32r2/valid.s
+++ b/test/MC/Mips/mips32r2/valid.s

@@ -6,23 +6,40 @@
         abs.d     $f7,$f25             # CHECK: encoding:
         abs.s     $f9,$f16
         add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
         add.d     $f1,$f7,$f29
         add.s     $f8,$f21,$f24
         addi      $13,$9,26322
         addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
         addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         and       $s7,$v0,$12
         and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
         bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
         bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
         bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
         bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
         bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
         bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
         bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
         bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
         cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
         c.ngl.d   $f29,$f29
         c.ngle.d  $f0,$f16
@@ -43,13 +60,15 @@
         cvt.w.d   $f20,$f14
         cvt.w.s   $f20,$f24
         deret
-        di        $s8
+        di        $s8                  # CHECK: di  $fp       # encoding: [0x41,0x7e,0x60,0x00]
+        di                             # CHECK: di            # encoding: [0x41,0x60,0x60,0x00]
         div       $zero,$25,$11
         div.d     $f29,$f20,$f27
         div.s     $f4,$f5,$f15
         divu      $zero,$25,$15
         ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
-        ei        $14
+        ei        $14                  # CHECK: ei  $14       # encoding: [0x41,0x6e,0x60,0x20]
+        ei                             # CHECK: ei            # encoding: [0x41,0x60,0x60,0x20]
         eret
         floor.w.d $f14,$f11
         floor.w.s $f8,$f9
@@ -132,7 +151,12 @@
         or        $2, 4                # CHECK: ori $2, $2, 4           # encoding: [0x34,0x42,0x00,0x04]
         pause                          # CHECK: pause # encoding:  [0x00,0x00,0x01,0x40]
         pref      1, 8($5)             # CHECK: pref 1, 8($5)           # encoding: [0xcc,0xa1,0x00,0x08]
-        rdhwr     $sp,$11              
+        # FIXME: Use the code generator in order to print the .set directives
+        #        instead of the instruction printer.
+        rdhwr     $sp,$11              # CHECK:      .set  push
+                                       # CHECK-NEXT: .set  mips32r2
+                                       # CHECK-NEXT: rdhwr $sp, $11
+                                       # CHECK-NEXT: .set  pop          # encoding: [0x7c,0x1d,0x58,0x3b]
         rotr      $1,15                # CHECK: rotr $1, $1, 15         # encoding: [0x00,0x21,0x0b,0xc2]
         rotr      $1,$14,15            # CHECK: rotr $1, $14, 15        # encoding: [0x00,0x2e,0x0b,0xc2]
         rotrv     $1,$14,$15           # CHECK: rotrv $1, $14, $15      # encoding: [0x01,0xee,0x08,0x46]
@@ -169,6 +193,8 @@
         srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
         ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
         sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
         sub.d     $f18,$f3,$f17
         sub.s     $f23,$f22,$f22
         subu      $sp,$s6,$s6
@@ -181,15 +207,27 @@
         swxc1     $f19,$12($k0)
         sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
         sync      1                    # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
         teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
         tgei      $s1,5025
         tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
         tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
         tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
         tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
         tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
         tlti      $14,-21059
         tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
         tnei      $12,-29647
         trunc.w.d $f22,$f15
         trunc.w.s $f28,$f30

diff --git a/test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s b/test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s
index 52fa5f5..cc7d403 100644
--- a/test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s
+++ b/test/MC/Mips/mips32r6/invalid-mips1-wrong-error.s

@@ -5,13 +5,13 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
-        bc2f      4                   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2t      4                   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        bc2f      4                   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2t      4                   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
         lwl       $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         lwr       $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         swl       $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         swr       $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
-        lwle      $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        lwre      $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        swle      $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        swre      $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        lwle      $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        lwre      $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        swle      $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        swre      $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips32r6/invalid-mips1.s b/test/MC/Mips/mips32r6/invalid-mips1.s
index 44d4fbb..94810f4 100644
--- a/test/MC/Mips/mips32r6/invalid-mips1.s
+++ b/test/MC/Mips/mips32r6/invalid-mips1.s

@@ -5,6 +5,8 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
+        add       $9,$14,15176        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        add       $24,-7193           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         addi      $13,$9,26322        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         c.ngl.d   $f29,$f29           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         c.ngle.d  $f0,$f16            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -22,3 +24,5 @@
         multu     $gp,$k0             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
 #       div has been re-encoded. See valid.s
 #       divu has been re-encoded. See valid.s
+        sub       $22,$17,-3126       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        sub       $13,6512            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips32r6/invalid-mips2-wrong-error.s b/test/MC/Mips/mips32r6/invalid-mips2-wrong-error.s
index b799c8e..1cec777 100644
--- a/test/MC/Mips/mips32r6/invalid-mips2-wrong-error.s
+++ b/test/MC/Mips/mips32r6/invalid-mips2-wrong-error.s

@@ -6,15 +6,5 @@
 # RUN: FileCheck %s < %t1
 
         .set noat
-        beql $1,$2,4            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bgezall $3,8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bgezl $3,8              # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bgtzl $4,16             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        blezl $3,8              # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bltzall $3,8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bltzl $4,16             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bnel $1,$2,4            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips32r6/invalid-mips2.s b/test/MC/Mips/mips32r6/invalid-mips2.s
index bfa2c4c..642d6bd 100644
--- a/test/MC/Mips/mips32r6/invalid-mips2.s
+++ b/test/MC/Mips/mips32r6/invalid-mips2.s

@@ -6,6 +6,18 @@
 
 	.set noat
         addi      $13,$9,26322        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1fl     $fcc0,-8239         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1fl     -8239               # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1tl     $fcc0,-8239         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1tl     -8239               # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        beql      $14,$s3,12544       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bnel      $gp,$s4,5107        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bgezl     $4,-6858            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bgtzl     $10,-3738           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        blezl     $6,2974             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bltzl     $s1,-9964           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bgezall   $12,7293            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bltzall   $6,488              # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mfhi      $s3                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mfhi      $sp                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mflo      $s1                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s b/test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s
index e63bdd4..3131c5a 100644
--- a/test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s
+++ b/test/MC/Mips/mips32r6/invalid-mips32-wrong-error.s

@@ -6,15 +6,11 @@
 # RUN: FileCheck %s < %t1
 
         .set noat
-        bc1tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1tl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1fl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2f  4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2f  $fcc0,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2t  4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2t  $fcc0,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2tl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2fl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        bc2f  4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2f  $fcc0,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2t  4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2t  $fcc0,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2tl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2fl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips32r6/invalid-mips32.s b/test/MC/Mips/mips32r6/invalid-mips32.s
index e0889ea..b2330c2 100644
--- a/test/MC/Mips/mips32r6/invalid-mips32.s
+++ b/test/MC/Mips/mips32r6/invalid-mips32.s

@@ -5,6 +5,8 @@
 # RUN: FileCheck %s < %t1
 
         .set noat
+        bc1fl     $fcc7,27            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1tl     $fcc7,27            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         madd      $s6,$13       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         madd      $zero,$9      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         maddu     $s3,$gp       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips32r6/invalid-mips4-wrong-error.s b/test/MC/Mips/mips32r6/invalid-mips4-wrong-error.s
index f3131a9..06bf58c 100644
--- a/test/MC/Mips/mips32r6/invalid-mips4-wrong-error.s
+++ b/test/MC/Mips/mips32r6/invalid-mips4-wrong-error.s

@@ -6,16 +6,6 @@
 # RUN: FileCheck %s < %t1
 
         .set noat
-        beql $1,$2,4            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bgezall $3,8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bgezl $3,8              # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bgtzl $4,16             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        blezl $3,8              # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bltzall $3,8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bltzl $4,16             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bnel $1,$2,4            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        prefx 0,$2($31)         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        prefx 0,$2($31)         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips32r6/invalid-mips4.s b/test/MC/Mips/mips32r6/invalid-mips4.s
index 8ba2ed8..9d8f02f 100644
--- a/test/MC/Mips/mips32r6/invalid-mips4.s
+++ b/test/MC/Mips/mips32r6/invalid-mips4.s

@@ -5,6 +5,8 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
+        bc1fl     $fcc7,27            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1tl     $fcc7,27            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         ldxc1     $f8,$s7($15)        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         lwxc1     $f12,$s1($s8)       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         sdxc1     $f11,$10($14)       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips32r6/invalid-mips5-wrong-error.s b/test/MC/Mips/mips32r6/invalid-mips5-wrong-error.s
index 99d10c3..b5d7380 100644
--- a/test/MC/Mips/mips32r6/invalid-mips5-wrong-error.s
+++ b/test/MC/Mips/mips32r6/invalid-mips5-wrong-error.s

@@ -5,7 +5,7 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
-        bc1any2f  $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1any2t  $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1any4f  $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1any4t  $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        bc1any2f  $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc1any2t  $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc1any4f  $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc1any4t  $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips32r6/valid.s b/test/MC/Mips/mips32r6/valid.s
index f23dbd7..362785b 100644
--- a/test/MC/Mips/mips32r6/valid.s
+++ b/test/MC/Mips/mips32r6/valid.s

@@ -17,6 +17,7 @@
         # FIXME: Add the instructions carried forward from older ISA's
         and     $2,4             # CHECK: andi $2, $2, 4      # encoding: [0x30,0x42,0x00,0x04]
         addiupc $4, 100          # CHECK: addiupc $4, 100     # encoding: [0xec,0x80,0x00,0x19]
+        addu    $9,10            # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         align   $4, $2, $3, 2    # CHECK: align $4, $2, $3, 2 # encoding: [0x7c,0x43,0x22,0xa0]
         aluipc  $3, 56           # CHECK: aluipc $3, 56       # encoding: [0xec,0x7f,0x00,0x38]
         aui     $3,$2,-23        # CHECK: aui $3, $2, -23     # encoding: [0x3c,0x62,0xff,0xe9]
@@ -96,8 +97,12 @@
         cmp.sle.d  $f2,$f3,$f4      # CHECK: cmp.sle.d $f2, $f3, $f4  # encoding: [0x46,0xa4,0x18,0x8e]
         cmp.sule.s $f2,$f3,$f4      # CHECK: cmp.sule.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8f]
         cmp.sule.d $f2,$f3,$f4      # CHECK: cmp.sule.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8f]
+        di      $s8              # CHECK: di  $fp          # encoding: [0x41,0x7e,0x60,0x00]
+        di                       # CHECK: di               # encoding: [0x41,0x60,0x60,0x00]
         div     $2,$3,$4         # CHECK: div $2, $3, $4   # encoding: [0x00,0x64,0x10,0x9a]
         divu    $2,$3,$4         # CHECK: divu $2, $3, $4  # encoding: [0x00,0x64,0x10,0x9b]
+        ei      $14              # CHECK: ei  $14          # encoding: [0x41,0x6e,0x60,0x20]
+        ei                       # CHECK: ei               # encoding: [0x41,0x60,0x60,0x20]
         jialc   $5, 256          # CHECK: jialc $5, 256    # encoding: [0xf8,0x05,0x01,0x00]
         jic     $5, 256          # CHECK: jic $5, 256      # encoding: [0xd8,0x05,0x01,0x00]
         lsa     $2, $3, $4, 3    # CHECK: lsa  $2, $3, $4, 3 # encoding: [0x00,0x64,0x10,0xc5]
@@ -114,6 +119,12 @@
         msubf.s $f2,$f3,$f4      # CHECK: msubf.s $f2, $f3, $f4  # encoding: [0x46,0x04,0x18,0x99]
         msubf.d $f2,$f3,$f4      # CHECK: msubf.d $f2, $f3, $f4  # encoding: [0x46,0x24,0x18,0x99]
         pref    1, 8($5)         # CHECK: pref 1, 8($5)          # encoding: [0x7c,0xa1,0x04,0x35]
+        # FIXME: Use the code generator in order to print the .set directives
+        #        instead of the instruction printer.
+        rdhwr   $sp,$11          # CHECK:      .set  push
+                                 # CHECK-NEXT: .set  mips32r2
+                                 # CHECK-NEXT: rdhwr $sp, $11
+                                 # CHECK-NEXT: .set  pop      # encoding: [0x7c,0x1d,0x58,0x3b]
         sel.d   $f0,$f1,$f2      # CHECK: sel.d $f0, $f1, $f2 # encoding: [0x46,0x22,0x08,0x10]
         sel.s   $f0,$f1,$f2      # CHECK: sel.s $f0, $f1, $f2 # encoding: [0x46,0x02,0x08,0x10]
         seleqz  $2,$3,$4         # CHECK: seleqz $2, $3, $4 # encoding: [0x00,0x64,0x10,0x35]
@@ -152,3 +163,15 @@
         sdbbp     34             # CHECK: sdbbp 34               # encoding: [0x00,0x00,0x08,0x8e]
         sync                     # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
         sync    1                # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq     $0,$3            # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq     $5,$7,620        # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
+        tge     $7,$10           # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge     $5,$19,340       # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
+        tgeu    $22,$28          # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu    $20,$14,379      # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
+        tlt     $15,$13          # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt     $2,$19,133       # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
+        tltu    $11,$16          # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu    $16,$29,1016     # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne     $6,$17           # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne     $7,$8,885        # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]

diff --git a/test/MC/Mips/mips4/invalid-mips32r2.s b/test/MC/Mips/mips4/invalid-mips32r2.s
new file mode 100644
index 0000000..3e78758
--- /dev/null
+++ b/test/MC/Mips/mips4/invalid-mips32r2.s

@@ -0,0 +1,11 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips4 \
+# RUN:     2>%t1
+# RUN: FileCheck %s < %t1
+
+        .set noat
+        di      $s8                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        di                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei      $t6                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips4/invalid-mips5-wrong-error.s b/test/MC/Mips/mips4/invalid-mips5-wrong-error.s
index c6c8968..5c8ab23 100644
--- a/test/MC/Mips/mips4/invalid-mips5-wrong-error.s
+++ b/test/MC/Mips/mips4/invalid-mips5-wrong-error.s

@@ -6,41 +6,41 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
-        abs.ps    $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        add.ps    $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        alnv.ps   $f12,$f18,$f30,$t0  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.eq.ps   $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.f.ps    $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.le.ps   $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.lt.ps   $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.nge.ps  $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngl.ps  $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngle.ps $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngt.ps  $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ole.ps  $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.olt.ps  $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.seq.ps  $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.sf.ps   $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ueq.ps  $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ule.ps  $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ult.ps  $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.un.ps   $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.ps.s  $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.s.pl  $f30,$f1            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.s.pu  $f14,$f25           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        madd.ps   $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mov.ps    $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movf.ps   $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movn.ps   $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movt.ps   $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movz.ps   $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        msub.ps   $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mul.ps    $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        neg.ps    $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmadd.ps  $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmsub.ps  $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pll.ps    $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        plu.ps    $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pul.ps    $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        puu.ps    $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        sub.ps    $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        abs.ps    $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        add.ps    $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        alnv.ps   $f12,$f18,$f30,$t0  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.eq.ps   $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.f.ps    $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.le.ps   $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.lt.ps   $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.nge.ps  $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngl.ps  $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngle.ps $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngt.ps  $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ole.ps  $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.olt.ps  $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.seq.ps  $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.sf.ps   $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ueq.ps  $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ule.ps  $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ult.ps  $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.un.ps   $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.ps.s  $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.s.pl  $f30,$f1            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.s.pu  $f14,$f25           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        madd.ps   $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mov.ps    $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movf.ps   $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movn.ps   $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movt.ps   $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movz.ps   $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        msub.ps   $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mul.ps    $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        neg.ps    $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmadd.ps  $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmsub.ps  $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pll.ps    $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        plu.ps    $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pul.ps    $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        puu.ps    $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        sub.ps    $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips4/valid.s b/test/MC/Mips/mips4/valid.s
index 949b91d..c221b76 100644
--- a/test/MC/Mips/mips4/valid.s
+++ b/test/MC/Mips/mips4/valid.s

@@ -6,23 +6,40 @@
         abs.d     $f7,$f25             # CHECK: encoding:
         abs.s     $f9,$f16
         add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
         add.d     $f1,$f7,$f29
         add.s     $f8,$f21,$f24
         addi      $13,$9,26322
         addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
         addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         and       $s7,$v0,$12
         and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
         bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
         bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
         bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
         bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
         bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
         bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
         bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
         bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
         cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
         c.ngl.d   $f29,$f29
         c.ngle.d  $f0,$f16
@@ -52,6 +69,8 @@
         daddi     $sp,-27705           # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
         daddiu    $k0,$s6,-4586
         daddu     $s3,$at,$ra
+        daddu     $24,$2,18079         # CHECK: daddiu $24, $2, 18079  # encoding: [0x64,0x58,0x46,0x9f]
+        daddu     $19,26943            # CHECK: daddiu $19, $19, 26943 # encoding: [0x66,0x73,0x69,0x3f]
         ddiv      $zero,$k0,$s3
         ddivu     $zero,$s0,$s1
         div       $zero,$25,$11
@@ -86,6 +105,8 @@
         dsubi     $sp,$s4,-27705       # CHECK: daddi $sp, $20, 27705  # encoding: [0x62,0x9d,0x6c,0x39]
         dsubi     $sp,-27705           # CHECK: daddi $sp, $sp, 27705  # encoding: [0x63,0xbd,0x6c,0x39]
         dsubu     $a1,$a1,$k0
+        dsubu     $15,$11,5025         # CHECK: daddiu $15, $11, -5025 # encoding: [0x65,0x6f,0xec,0x5f]
+        dsubu     $14,-4586            # CHECK: daddiu $14, $14, 4586  # encoding: [0x65,0xce,0x11,0xea]
         ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
         eret
         floor.l.d $f26,$f7
@@ -189,6 +210,8 @@
         srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
         ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
         sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
         sub.d     $f18,$f3,$f17
         sub.s     $f23,$f22,$f22
         subu      $sp,$s6,$s6
@@ -199,15 +222,27 @@
         swr       $s1,-26590($14)
         swxc1     $f19,$12($k0)
         sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
         teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
         tgei      $s1,5025
         tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
         tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
         tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
         tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
         tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
         tlti      $14,-21059
         tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
         tnei      $12,-29647
         trunc.l.d $f23,$f23
         trunc.l.s $f28,$f31

diff --git a/test/MC/Mips/mips5/invalid-mips32r2.s b/test/MC/Mips/mips5/invalid-mips32r2.s
new file mode 100644
index 0000000..a369efa
--- /dev/null
+++ b/test/MC/Mips/mips5/invalid-mips32r2.s

@@ -0,0 +1,11 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips5 \
+# RUN:     2>%t1
+# RUN: FileCheck %s < %t1
+
+        .set noat
+        di      $s8                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        di                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei      $t6                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips5/valid.s b/test/MC/Mips/mips5/valid.s
index 3afdee1..b93b22f 100644
--- a/test/MC/Mips/mips5/valid.s
+++ b/test/MC/Mips/mips5/valid.s

@@ -6,23 +6,40 @@
         abs.d     $f7,$f25             # CHECK: encoding:
         abs.s     $f9,$f16
         add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
         add.d     $f1,$f7,$f29
         add.s     $f8,$f21,$f24
         addi      $13,$9,26322
         addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
         addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         and       $s7,$v0,$12
         and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
         bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
         bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
         bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
         bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
         bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
         bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
         bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
         bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
         cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
         c.ngl.d   $f29,$f29
         c.ngle.d  $f0,$f16
@@ -52,6 +69,8 @@
         daddi     $sp,-27705           # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
         daddiu    $k0,$s6,-4586
         daddu     $s3,$at,$ra
+        daddu     $24,$2,18079         # CHECK: daddiu $24, $2, 18079  # encoding: [0x64,0x58,0x46,0x9f]
+        daddu     $19,26943            # CHECK: daddiu $19, $19, 26943 # encoding: [0x66,0x73,0x69,0x3f]
         ddiv      $zero,$k0,$s3
         ddivu     $zero,$s0,$s1
         div       $zero,$25,$11
@@ -86,6 +105,8 @@
         dsubi     $sp,$s4,-27705       # CHECK: daddi $sp, $20, 27705  # encoding: [0x62,0x9d,0x6c,0x39]
         dsubi     $sp,-27705           # CHECK: daddi $sp, $sp, 27705  # encoding: [0x63,0xbd,0x6c,0x39]
         dsubu     $a1,$a1,$k0
+        dsubu     $15,$11,5025         # CHECK: daddiu $15, $11, -5025 # encoding: [0x65,0x6f,0xec,0x5f]
+        dsubu     $14,-4586            # CHECK: daddiu $14, $14, 4586  # encoding: [0x65,0xce,0x11,0xea]
         ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
         eret
         floor.l.d $f26,$f7
@@ -190,6 +211,8 @@
         srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
         ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
         sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
         sub.d     $f18,$f3,$f17
         sub.s     $f23,$f22,$f22
         subu      $sp,$s6,$s6
@@ -201,15 +224,27 @@
         swr       $s1,-26590($14)
         swxc1     $f19,$12($k0)
         sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
         teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
         tgei      $s1,5025
         tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
         tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
         tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
         tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
         tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
         tlti      $14,-21059
         tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
         tnei      $12,-29647
         trunc.l.d $f23,$f23
         trunc.l.s $f28,$f31

diff --git a/test/MC/Mips/mips64-register-names-n32-n64.s b/test/MC/Mips/mips64-register-names-n32-n64.s
index ee6f88f..efe1cdb 100644
--- a/test/MC/Mips/mips64-register-names-n32-n64.s
+++ b/test/MC/Mips/mips64-register-names-n32-n64.s

@@ -1,7 +1,11 @@
-# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding | FileCheck %s
+# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding 2>%t0 \
+# RUN:     | FileCheck %s
+# RUN: FileCheck -check-prefix=WARNING %s < %t0
+#
 # RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding \
-# RUN:     -mattr=-n64,+n32 | FileCheck %s
-
+# RUN:     -mattr=-n64,+n32 2>%t1 | FileCheck %s
+# RUN: FileCheck -check-prefix=WARNING %s < %t1
+#
 # Check that the register names are mapped to their correct numbers for n32/n64
 # Second byte of addiu with $zero at rt contains the number of the source
 # register.
@@ -23,9 +27,25 @@
 daddiu	$t1, $zero, 0 # [*] # CHECK: encoding: [0x64,0x0d,0x00,0x00]
 daddiu	$t2, $zero, 0 # [*] # CHECK: encoding: [0x64,0x0e,0x00,0x00]
 daddiu	$t3, $zero, 0 # [*] # CHECK: encoding: [0x64,0x0f,0x00,0x00]
+# WARNING: mips64-register-names-n32-n64.s:[[@LINE+4]]:9: warning: register names $t4-$t7 are only available in O32.
+# WARNING-NEXT: daddiu  $t4, $zero, 0       # {{CHECK}}: encoding: [0x64,0x0c,0x00,0x00]
+# WARNING-NEXT:          ^~
+# WARNING-NEXT:          Did you mean $t0?
 daddiu	$t4, $zero, 0       # CHECK: encoding: [0x64,0x0c,0x00,0x00]
+# WARNING: mips64-register-names-n32-n64.s:[[@LINE+4]]:9: warning: register names $t4-$t7 are only available in O32.
+# WARNING-NEXT: daddiu  $t5, $zero, 0       # {{CHECK}}: encoding: [0x64,0x0d,0x00,0x00]
+# WARNING-NEXT:          ^~
+# WARNING-NEXT:          Did you mean $t1?
 daddiu	$t5, $zero, 0       # CHECK: encoding: [0x64,0x0d,0x00,0x00]
+# WARNING: mips64-register-names-n32-n64.s:[[@LINE+4]]:9: warning: register names $t4-$t7 are only available in O32.
+# WARNING-NEXT: daddiu  $t6, $zero, 0       # {{CHECK}}: encoding: [0x64,0x0e,0x00,0x00]
+# WARNING-NEXT:          ^~
+# WARNING-NEXT:          Did you mean $t2?
 daddiu	$t6, $zero, 0       # CHECK: encoding: [0x64,0x0e,0x00,0x00]
+# WARNING: mips64-register-names-n32-n64.s:[[@LINE+4]]:9: warning: register names $t4-$t7 are only available in O32.
+# WARNING-NEXT: daddiu  $t7, $zero, 0       # {{CHECK}}: encoding: [0x64,0x0f,0x00,0x00]
+# WARNING-NEXT:          ^~
+# WARNING-NEXT:          Did you mean $t3?
 daddiu	$t7, $zero, 0       # CHECK: encoding: [0x64,0x0f,0x00,0x00]
 daddiu	$s0, $zero, 0       # CHECK: encoding: [0x64,0x10,0x00,0x00]
 daddiu	$s1, $zero, 0       # CHECK: encoding: [0x64,0x11,0x00,0x00]

diff --git a/test/MC/Mips/mips64/abiflags.s b/test/MC/Mips/mips64/abiflags.s
index 557e32a..ecaffcc 100644
--- a/test/MC/Mips/mips64/abiflags.s
+++ b/test/MC/Mips/mips64/abiflags.s

@@ -8,27 +8,26 @@
 # CHECK-ASM: .module fp=64
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ:  Section {
-# CHECK-OBJ:    Index: 5
-# CHECK-OBJ:    Name: .MIPS.abiflags (12)
-# CHECK-OBJ:    Type:  (0x7000002A)
-# CHECK-OBJ:     Flags [ (0x2)
-# CHECK-OBJ:      SHF_ALLOC (0x2)
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    Address: 0x0
-# CHECK-OBJ:    Offset: 0x50
-# CHECK-OBJ:    Size: 24
-# CHECK-OBJ:    Link: 0
-# CHECK-OBJ:    Info: 0
-# CHECK-OBJ:    AddressAlignment: 8
-# CHECK-OBJ:    EntrySize: 0
-# CHECK-OBJ:    Relocations [
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    SectionData (
-# CHECK-OBJ:      0000: 00004001 02020001 00000000 00000000  |..@.............|
-# CHECK-OBJ:      0010: 00000000 00000000                    |........|
-# CHECK-OBJ:    )
-# CHECK-OBJ:  }
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00004001 02020001 00000000 00000000  |..@.............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
 
         .module fp=64
 

diff --git a/test/MC/Mips/mips64/invalid-mips32r2.s b/test/MC/Mips/mips64/invalid-mips32r2.s
new file mode 100644
index 0000000..bc5d1f0
--- /dev/null
+++ b/test/MC/Mips/mips64/invalid-mips32r2.s

@@ -0,0 +1,11 @@
+# Instructions that are invalid
+#
+# RUN: not llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips64 \
+# RUN:     2>%t1
+# RUN: FileCheck %s < %t1
+
+	.set noat
+        di      $s8                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        di                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei      $t6                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        ei                          # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips64/valid.s b/test/MC/Mips/mips64/valid.s
index 1bd057d..032777e 100644
--- a/test/MC/Mips/mips64/valid.s
+++ b/test/MC/Mips/mips64/valid.s

@@ -6,23 +6,40 @@
         abs.d     $f7,$f25             # CHECK: encoding:
         abs.s     $f9,$f16
         add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
         add.d     $f1,$f7,$f29
         add.s     $f8,$f21,$f24
         addi      $13,$9,26322
         addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
         addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         and       $s7,$v0,$12
         and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
         bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
         bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
         bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
         bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
         bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
         bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
         bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
         bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
         cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
         c.ngl.d   $f29,$f29
         c.ngle.d  $f0,$f16
@@ -54,6 +71,8 @@
         daddi     $sp,-27705           # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
         daddiu    $k0,$s6,-4586
         daddu     $s3,$at,$ra
+        daddu     $24,$2,18079         # CHECK: daddiu $24, $2, 18079  # encoding: [0x64,0x58,0x46,0x9f]
+        daddu     $19,26943            # CHECK: daddiu $19, $19, 26943 # encoding: [0x66,0x73,0x69,0x3f]
         dclo      $s2,$a2              # CHECK: dclo $18, $6   # encoding: [0x70,0xd2,0x90,0x25]
         dclz      $s0,$25              # CHECK: dclz $16, $25  # encoding: [0x73,0x30,0x80,0x24]
         deret
@@ -91,6 +110,8 @@
         dsubi     $sp,$s4,-27705       # CHECK: daddi $sp, $20, 27705  # encoding: [0x62,0x9d,0x6c,0x39]
         dsubi     $sp,-27705           # CHECK: daddi $sp, $sp, 27705  # encoding: [0x63,0xbd,0x6c,0x39]
         dsubu     $a1,$a1,$k0
+        dsubu     $15,$11,5025         # CHECK: daddiu $15, $11, -5025 # encoding: [0x65,0x6f,0xec,0x5f]
+        dsubu     $14,-4586            # CHECK: daddiu $14, $14, 4586  # encoding: [0x65,0xce,0x11,0xea]
         ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
         eret
         floor.l.d $f26,$f7
@@ -206,6 +227,8 @@
         srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
         ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
         sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
         sub.d     $f18,$f3,$f17
         sub.s     $f23,$f22,$f22
         subu      $sp,$s6,$s6
@@ -218,15 +241,27 @@
         swxc1     $f19,$12($k0)
         sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
         sync      1                    # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
         teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
         tgei      $s1,5025
         tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
         tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
         tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
         tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
         tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
         tlti      $14,-21059
         tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
         tnei      $12,-29647
         trunc.l.d $f23,$f23
         trunc.l.s $f28,$f31

diff --git a/test/MC/Mips/mips64r2/abiflags.s b/test/MC/Mips/mips64r2/abiflags.s
index aa76dee..dc4a1e9 100644
--- a/test/MC/Mips/mips64r2/abiflags.s
+++ b/test/MC/Mips/mips64r2/abiflags.s

@@ -8,27 +8,26 @@
 # CHECK-ASM: .module fp=64
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ:  Section {
-# CHECK-OBJ:    Index: 5
-# CHECK-OBJ:    Name: .MIPS.abiflags (12)
-# CHECK-OBJ:    Type:  (0x7000002A)
-# CHECK-OBJ:     Flags [ (0x2)
-# CHECK-OBJ:      SHF_ALLOC (0x2)
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    Address: 0x0
-# CHECK-OBJ:    Offset: 0x50
-# CHECK-OBJ:    Size: 24
-# CHECK-OBJ:    Link: 0
-# CHECK-OBJ:    Info: 0
-# CHECK-OBJ:    AddressAlignment: 8
-# CHECK-OBJ:    EntrySize: 0
-# CHECK-OBJ:    Relocations [
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    SectionData (
-# CHECK-OBJ:      0000: 00004002 02020001 00000000 00000000  |..@.............|
-# CHECK-OBJ:      0010: 00000000 00000000                    |........|
-# CHECK-OBJ:    )
-# CHECK-OBJ:  }
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00004002 02020001 00000000 00000000  |..@.............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
 
         .module fp=64
 

diff --git a/test/MC/Mips/mips64r2/valid.s b/test/MC/Mips/mips64r2/valid.s
index 7a2244a..7717238 100644
--- a/test/MC/Mips/mips64r2/valid.s
+++ b/test/MC/Mips/mips64r2/valid.s

@@ -6,23 +6,40 @@
         abs.d     $f7,$f25             # CHECK: encoding:
         abs.s     $f9,$f16
         add       $s7,$s2,$a1
+        add       $9,$14,15176         # CHECK: addi $9, $14, 15176   # encoding: [0x21,0xc9,0x3b,0x48]
+        add       $24,-7193            # CHECK: addi $24, $24, -7193  # encoding: [0x23,0x18,0xe3,0xe7]
         add.d     $f1,$f7,$f29
         add.s     $f8,$f21,$f24
         addi      $13,$9,26322
         addi      $8,$8,~1             # CHECK: addi $8, $8, -2 # encoding: [0x21,0x08,0xff,0xfe]
         addu      $9,$a0,$a2
+        addu      $9,10                # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         and       $s7,$v0,$12
         and       $2,4                 # CHECK: andi $2, $2, 4 # encoding: [0x30,0x42,0x00,0x04]
         bc1f      $fcc0, 4             # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
         bc1f      $fcc1, 4             # CHECK: bc1f $fcc1, 4 # encoding: [0x45,0x04,0x00,0x01]
         bc1f      4                    # CHECK: bc1f 4        # encoding: [0x45,0x00,0x00,0x01]
+        bc1fl     $fcc0,4688           # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     4688                 # CHECK: bc1fl 4688      # encoding: [0x45,0x02,0x04,0x94]
+        bc1fl     $fcc7,27             # CHECK: bc1fl $fcc7, 27 # encoding: [0x45,0x1e,0x00,0x06]
         bc1t      $fcc0, 4             # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
         bc1t      $fcc1, 4             # CHECK: bc1t $fcc1, 4 # encoding: [0x45,0x05,0x00,0x01]
         bc1t      4                    # CHECK: bc1t 4        # encoding: [0x45,0x01,0x00,0x01]
+        bc1tl     $fcc0,4688           # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     4688                 # CHECK: bc1tl 4688      # encoding: [0x45,0x03,0x04,0x94]
+        bc1tl     $fcc7,27             # CHECK: bc1tl $fcc7, 27 # encoding: [0x45,0x1f,0x00,0x06]
         bal       21100                # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $0, 21100            # CHECK: bal 21100     # encoding: [0x04,0x11,0x14,0x9b]
         bgezal    $6, 21100            # CHECK: bgezal $6, 21100 # encoding: [0x04,0xd1,0x14,0x9b]
         bltzal    $6, 21100            # CHECK: bltzal $6, 21100 # encoding: [0x04,0xd0,0x14,0x9b]
+        beql      $14,$s3,12544        # CHECK: beql $14, $19, 12544 # encoding: [0x51,0xd3,0x0c,0x40]
+        bgezall   $12,7293             # CHECK: bgezall $12, 7293    # encoding: [0x05,0x93,0x07,0x1f]
+        bgezl     $4,-6858             # CHECK: bgezl $4, -6858      # encoding: [0x04,0x83,0xf9,0x4d]
+        bgtzl     $10,-3738            # CHECK: bgtzl $10, -3738     # encoding: [0x5d,0x40,0xfc,0x59]
+        blezl     $6,2974              # CHECK: blezl $6, 2974       # encoding: [0x58,0xc0,0x02,0xe7]
+        bltzall   $6,488               # CHECK: bltzall $6, 488      # encoding: [0x04,0xd2,0x00,0x7a]
+        bltzl     $s1,-9964            # CHECK: bltzl $17, -9964     # encoding: [0x06,0x22,0xf6,0x45]
+        bnel      $gp,$s4,5107         # CHECK: bnel $gp, $20, 5107  # encoding: [0x57,0x94,0x04,0xfc]
         cache     1, 8($5)             # CHECK: cache 1, 8($5)   # encoding: [0xbc,0xa1,0x00,0x08]
         c.ngl.d   $f29,$f29
         c.ngle.d  $f0,$f16
@@ -54,10 +71,13 @@
         daddi     $sp,-27705           # CHECK: daddi $sp, $sp, -27705 # encoding: [0x63,0xbd,0x93,0xc7]
         daddiu    $k0,$s6,-4586
         daddu     $s3,$at,$ra
+        daddu     $24,$2,18079         # CHECK: daddiu $24, $2, 18079  # encoding: [0x64,0x58,0x46,0x9f]
+        daddu     $19,26943            # CHECK: daddiu $19, $19, 26943 # encoding: [0x66,0x73,0x69,0x3f]
         dclo      $s2,$a2              # CHECK: dclo $18, $6   # encoding: [0x70,0xd2,0x90,0x25]
         dclz      $s0,$25              # CHECK: dclz $16, $25  # encoding: [0x73,0x30,0x80,0x24]
         deret
-        di        $s8
+        di        $s8                  # CHECK: di  $fp        # encoding: [0x41,0x7e,0x60,0x00]
+        di                             # CHECK: di             # encoding: [0x41,0x60,0x60,0x00]
         ddiv      $zero,$k0,$s3
         ddivu     $zero,$s0,$s1
         div       $zero,$25,$11
@@ -101,8 +121,11 @@
         dsubi     $sp,-27705           # CHECK: daddi $sp, $sp, 27705  # encoding: [0x63,0xbd,0x6c,0x39]
         dsubu     $a1,$a1,$k0
         dsubu     $a1,$a1,$k0
+        dsubu     $15,$11,5025         # CHECK: daddiu $15, $11, -5025 # encoding: [0x65,0x6f,0xec,0x5f]
+        dsubu     $14,-4586            # CHECK: daddiu $14, $14, 4586  # encoding: [0x65,0xce,0x11,0xea]
         ehb                            # CHECK: ehb # encoding:  [0x00,0x00,0x00,0xc0]
-        ei        $14
+        ei        $14                  # CHECK: ei  $14       # encoding: [0x41,0x6e,0x60,0x20]
+        ei                             # CHECK: ei            # encoding: [0x41,0x60,0x60,0x20]
         eret
         floor.l.d $f26,$f7
         floor.l.s $f12,$f5
@@ -190,7 +213,12 @@
         or        $2, 4                # CHECK: ori $2, $2, 4           # encoding: [0x34,0x42,0x00,0x04]
         pause                          # CHECK: pause # encoding:  [0x00,0x00,0x01,0x40]
         pref      1, 8($5)             # CHECK: pref 1, 8($5)           # encoding: [0xcc,0xa1,0x00,0x08]
-        rdhwr     $sp,$11
+        # FIXME: Use the code generator in order to print the .set directives
+        #        instead of the instruction printer.
+        rdhwr     $sp,$11              # CHECK:      .set  push
+                                       # CHECK-NEXT: .set  mips32r2
+                                       # CHECK-NEXT: rdhwr $sp, $11
+                                       # CHECK-NEXT: .set  pop          # encoding: [0x7c,0x1d,0x58,0x3b]
         rotr      $1,15                # CHECK: rotr $1, $1, 15         # encoding: [0x00,0x21,0x0b,0xc2]
         rotr      $1,$14,15            # CHECK: rotr $1, $14, 15        # encoding: [0x00,0x2e,0x0b,0xc2]
         rotrv     $1,$14,$15           # CHECK: rotrv $1, $14, $15      # encoding: [0x01,0xee,0x08,0x46]
@@ -233,6 +261,8 @@
         srlv      $25,$s4,$a0          # CHECK: srlv $25, $20, $4      # encoding: [0x00,0x94,0xc8,0x06]
         ssnop                          # CHECK: ssnop                  # encoding: [0x00,0x00,0x00,0x40]
         sub       $s6,$s3,$12
+        sub       $22,$17,-3126        # CHECK: addi $22, $17, 3126    # encoding: [0x22,0x36,0x0c,0x36]
+        sub       $13,6512             # CHECK: addi $13, $13, -6512   # encoding: [0x21,0xad,0xe6,0x90]
         sub.d     $f18,$f3,$f17
         sub.s     $f23,$f22,$f22
         subu      $sp,$s6,$s6
@@ -245,15 +275,27 @@
         swxc1     $f19,$12($k0)
         sync                           # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
         sync      1                    # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq       $0,$3                # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq       $5,$7,620            # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
         teqi      $s5,-17504
+        tge       $7,$10               # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge       $5,$19,340           # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
         tgei      $s1,5025
         tgeiu     $sp,-28621
+        tgeu      $22,$28              # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu      $20,$14,379          # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
         tlbp                           # CHECK: tlbp                   # encoding: [0x42,0x00,0x00,0x08]
         tlbr                           # CHECK: tlbr                   # encoding: [0x42,0x00,0x00,0x01]
         tlbwi                          # CHECK: tlbwi                  # encoding: [0x42,0x00,0x00,0x02]
         tlbwr                          # CHECK: tlbwr                  # encoding: [0x42,0x00,0x00,0x06]
+        tlt       $15,$13              # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt       $2,$19,133           # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
         tlti      $14,-21059
         tltiu     $ra,-5076
+        tltu      $11,$16              # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu      $16,$29,1016         # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne       $6,$17               # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne       $7,$8,885            # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]
         tnei      $12,-29647
         trunc.l.d $f23,$f23
         trunc.l.s $f28,$f31

diff --git a/test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s b/test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s
index e914c89..5156429 100644
--- a/test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s
+++ b/test/MC/Mips/mips64r6/invalid-mips1-wrong-error.s

@@ -5,13 +5,13 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
-        bc2f      4                   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2t      4                   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        bc2f      4                   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2t      4                   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
         lwl       $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         lwr       $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         swl       $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         swr       $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
-        lwle      $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        lwre      $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        swle      $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        swre      $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        lwle      $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        lwre      $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        swle      $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        swre      $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips64r6/invalid-mips1.s b/test/MC/Mips/mips64r6/invalid-mips1.s
index 6efd8f4..ce0ab97 100644
--- a/test/MC/Mips/mips64r6/invalid-mips1.s
+++ b/test/MC/Mips/mips64r6/invalid-mips1.s

@@ -5,6 +5,8 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
+        add       $9,$14,15176        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        add       $24,-7193           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         addi      $13,$9,26322        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         bgezal    $0, 21100           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         bgezal    $6, 21100           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
@@ -25,3 +27,5 @@
         multu     $gp,$k0             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
 #       div has been re-encoded. See valid.s
 #       divu has been re-encoded. See valid.s
+        sub       $22,$17,-3126       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        sub       $13,6512            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips64r6/invalid-mips2.s b/test/MC/Mips/mips64r6/invalid-mips2.s
index 8a5c50c..a09a051 100644
--- a/test/MC/Mips/mips64r6/invalid-mips2.s
+++ b/test/MC/Mips/mips64r6/invalid-mips2.s

@@ -6,9 +6,21 @@
 
 	.set noat
         addi      $13,$9,26322        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1fl     $fcc0,-8239         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1fl     -8239               # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1tl     $fcc0,-8239         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1tl     -8239               # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        beql      $14,$s3,12544       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         bgezal    $0, 21100           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         bgezal    $6, 21100           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bgezall   $12,7293            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bgezl     $4,-6858            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bgtzl     $10,-3738           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        blezl     $6,2974             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         bltzal    $6, 21100           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bltzall   $6,488              # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bltzl     $s1,-9964           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bnel      $gp,$s4,5107        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mfhi      $s3                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mfhi      $sp                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         mflo      $s1                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips64r6/invalid-mips3-wrong-error.s b/test/MC/Mips/mips64r6/invalid-mips3-wrong-error.s
index 7424f49..eda18ac 100644
--- a/test/MC/Mips/mips64r6/invalid-mips3-wrong-error.s
+++ b/test/MC/Mips/mips64r6/invalid-mips3-wrong-error.s

@@ -9,15 +9,15 @@
         ldr       $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         sdl       $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         sdr       $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
-        ldle      $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        ldre      $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        sdle      $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        sdre      $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        ldle      $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        ldre      $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        sdle      $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        sdre      $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
         lwl       $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         lwr       $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         swl       $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
         swr       $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: invalid operand for instruction
-        lwle      $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        lwre      $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        swle      $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        swre      $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        lwle      $s4,-4231($15)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        lwre      $zero,-19147($gp)   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        swle      $15,13694($s3)      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        swre      $s1,-26590($14)     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips64r6/invalid-mips32-wrong-error.s b/test/MC/Mips/mips64r6/invalid-mips32-wrong-error.s
index cc85f18..8702318 100644
--- a/test/MC/Mips/mips64r6/invalid-mips32-wrong-error.s
+++ b/test/MC/Mips/mips64r6/invalid-mips32-wrong-error.s

@@ -6,15 +6,11 @@
 # RUN: FileCheck %s < %t1
 
         .set noat
-        bc1fl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1tl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2f  $fcc0,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2f  4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2fl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2t  $fcc0,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2t  4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2tl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        bc2f  $fcc0,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2f  4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2fl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2t  $fcc0,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2t  4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2tl $fcc1,4           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips64r6/invalid-mips4-wrong-error.s b/test/MC/Mips/mips64r6/invalid-mips4-wrong-error.s
index f3131a9..06bf58c 100644
--- a/test/MC/Mips/mips64r6/invalid-mips4-wrong-error.s
+++ b/test/MC/Mips/mips64r6/invalid-mips4-wrong-error.s

@@ -6,16 +6,6 @@
 # RUN: FileCheck %s < %t1
 
         .set noat
-        beql $1,$2,4            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bgezall $3,8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bgezl $3,8              # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bgtzl $4,16             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        blezl $3,8              # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bltzall $3,8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bltzl $4,16             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bnel $1,$2,4            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        prefx 0,$2($31)         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        bc2tl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc2fl 4                 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        prefx 0,$2($31)         # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips64r6/invalid-mips4.s b/test/MC/Mips/mips64r6/invalid-mips4.s
index 706db27..82a1196 100644
--- a/test/MC/Mips/mips64r6/invalid-mips4.s
+++ b/test/MC/Mips/mips64r6/invalid-mips4.s

@@ -5,6 +5,8 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
+        bc1fl     $fcc7,27            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
+        bc1tl     $fcc7,27            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         bgezal    $0, 21100           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         bgezal    $6, 21100           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled
         bltzal    $6, 21100           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s b/test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s
index 4fc94e2..ceeb577 100644
--- a/test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s
+++ b/test/MC/Mips/mips64r6/invalid-mips5-wrong-error.s

@@ -5,44 +5,44 @@
 # RUN: FileCheck %s < %t1
 
 	.set noat
-        abs.ps          $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        add.ps          $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        alnv.ps         $f12,$f18,$f30,$12  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1any2f        $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1any2t        $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1any4f        $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        bc1any4t        $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.eq.ps         $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.f.ps          $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.le.ps         $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.lt.ps         $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.nge.ps        $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngl.ps        $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngle.ps       $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ngt.ps        $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ole.ps        $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.olt.ps        $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.seq.ps        $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.sf.ps         $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ueq.ps        $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ule.ps        $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.ult.ps        $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        c.un.ps         $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.ps.s        $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        cvt.ps.pw       $f3,$f18            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        madd.ps         $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mov.ps          $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movf.ps         $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movn.ps         $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movt.ps         $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        movz.ps         $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        msub.ps         $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        mul.ps          $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        neg.ps          $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmadd.ps        $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        nmsub.ps        $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pll.ps          $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        plu.ps          $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        pul.ps          $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        puu.ps          $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
-        sub.ps          $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: Unknown instruction
+        abs.ps          $f22,$f8            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        add.ps          $f25,$f27,$f13      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        alnv.ps         $f12,$f18,$f30,$12  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc1any2f        $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc1any2t        $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc1any4f        $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        bc1any4t        $fcc2,4             # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.eq.ps         $fcc5,$f0,$f9       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.f.ps          $fcc6,$f11,$f11     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.le.ps         $fcc1,$f7,$f20      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.lt.ps         $f19,$f5            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.nge.ps        $f1,$f26            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngl.ps        $f21,$f30           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngle.ps       $fcc7,$f12,$f20     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ngt.ps        $fcc5,$f30,$f6      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ole.ps        $fcc7,$f21,$f8      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.olt.ps        $fcc3,$f7,$f16      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.seq.ps        $fcc6,$f31,$f14     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.sf.ps         $fcc6,$f4,$f6       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ueq.ps        $fcc1,$f5,$f29      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ule.ps        $fcc6,$f17,$f3      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.ult.ps        $fcc7,$f14,$f0      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        c.un.ps         $fcc4,$f2,$f26      # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.ps.s        $f3,$f18,$f19       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        cvt.ps.pw       $f3,$f18            # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        madd.ps         $f22,$f3,$f14,$f3   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mov.ps          $f22,$f17           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movf.ps         $f10,$f28,$fcc6     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movn.ps         $f31,$f31,$s3       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movt.ps         $f20,$f25,$fcc2     # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        movz.ps         $f18,$f17,$ra       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        msub.ps         $f12,$f14,$f29,$f17 # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        mul.ps          $f14,$f0,$f16       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        neg.ps          $f19,$f13           # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmadd.ps        $f27,$f4,$f9,$f25   # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        nmsub.ps        $f6,$f12,$f14,$f17  # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pll.ps          $f25,$f9,$f30       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        plu.ps          $f1,$f26,$f29       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        pul.ps          $f9,$f30,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        puu.ps          $f24,$f9,$f2        # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction
+        sub.ps          $f5,$f14,$f26       # CHECK: :[[@LINE]]:{{[0-9]+}}: error: unknown instruction

diff --git a/test/MC/Mips/mips64r6/valid.s b/test/MC/Mips/mips64r6/valid.s
index 34c1dac..3e8fc41 100644
--- a/test/MC/Mips/mips64r6/valid.s
+++ b/test/MC/Mips/mips64r6/valid.s

@@ -17,6 +17,7 @@
         # FIXME: Add the instructions carried forward from older ISA's
         and     $2,4           # CHECK: andi $2, $2, 4        # encoding: [0x30,0x42,0x00,0x04]
         addiupc $4, 100          # CHECK: addiupc $4, 100     # encoding: [0xec,0x80,0x00,0x19]
+        addu    $9,10            # CHECK: addiu $9, $9, 10    # encoding: [0x25,0x29,0x00,0x0a]
         align   $4, $2, $3, 2    # CHECK: align $4, $2, $3, 2 # encoding: [0x7c,0x43,0x22,0xa0]
         aluipc  $3, 56           # CHECK: aluipc $3, 56       # encoding: [0xec,0x7f,0x00,0x38]
         aui     $3,$2,-23        # CHECK: aui $3, $2, -23     # encoding: [0x3c,0x62,0xff,0xe9]
@@ -96,13 +97,21 @@
         cmp.sle.d  $f2,$f3,$f4      # CHECK: cmp.sle.d $f2, $f3, $f4  # encoding: [0x46,0xa4,0x18,0x8e]
         cmp.sule.s $f2,$f3,$f4      # CHECK: cmp.sule.s $f2, $f3, $f4 # encoding: [0x46,0x84,0x18,0x8f]
         cmp.sule.d $f2,$f3,$f4      # CHECK: cmp.sule.d $f2, $f3, $f4 # encoding: [0x46,0xa4,0x18,0x8f]
+        daddu   $24,$2,18079     # CHECK: daddiu $24, $2, 18079  # encoding: [0x64,0x58,0x46,0x9f]
+        daddu   $19,26943        # CHECK: daddiu $19, $19, 26943 # encoding: [0x66,0x73,0x69,0x3f]
         dalign  $4,$2,$3,5       # CHECK: dalign $4, $2, $3, 5 # encoding: [0x7c,0x43,0x23,0x64]
         daui    $3,$2,0x1234     # CHECK: daui $3, $2, 4660  # encoding: [0x74,0x62,0x12,0x34]
         dahi     $3,0x5678       # CHECK: dahi $3, 22136     # encoding: [0x04,0x66,0x56,0x78]
         dati     $3,0xabcd       # CHECK: dati $3, 43981     # encoding: [0x04,0x7e,0xab,0xcd]
         dbitswap $4, $2          # CHECK: dbitswap $4, $2    # encoding: [0x7c,0x02,0x20,0x24]
+        di      $s8              # CHECK: di  $fp          # encoding: [0x41,0x7e,0x60,0x00]
+        di                       # CHECK: di               # encoding: [0x41,0x60,0x60,0x00]
         div     $2,$3,$4         # CHECK: div $2, $3, $4   # encoding: [0x00,0x64,0x10,0x9a]
         divu    $2,$3,$4         # CHECK: divu $2, $3, $4  # encoding: [0x00,0x64,0x10,0x9b]
+        dsubu   $15,$11,5025     # CHECK: daddiu $15, $11, -5025 # encoding: [0x65,0x6f,0xec,0x5f]
+        dsubu   $14,-4586        # CHECK: daddiu $14, $14, 4586  # encoding: [0x65,0xce,0x11,0xea]
+        ei      $14              # CHECK: ei  $14          # encoding: [0x41,0x6e,0x60,0x20]
+        ei                       # CHECK: ei               # encoding: [0x41,0x60,0x60,0x20]
         jialc   $5, 256          # CHECK: jialc $5, 256    # encoding: [0xf8,0x05,0x01,0x00]
         jic     $5, 256          # CHECK: jic $5, 256      # encoding: [0xd8,0x05,0x01,0x00]
         mod     $2,$3,$4         # CHECK: mod $2, $3, $4   # encoding: [0x00,0x64,0x10,0xda]
@@ -146,6 +155,12 @@
         seleqz.d $f0, $f2, $f4   # CHECK: seleqz.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x14]
         selnez.s $f0, $f2, $f4   # CHECK: selnez.s $f0, $f2, $f4 # encoding: [0x46,0x04,0x10,0x17]
         selnez.d $f0, $f2, $f4   # CHECK: selnez.d $f0, $f2, $f4 # encoding: [0x46,0x24,0x10,0x17]
+        # FIXME: Use the code generator in order to print the .set directives
+        #        instead of the instruction printer.
+        rdhwr   $sp,$11          # CHECK:      .set  push
+                                 # CHECK-NEXT: .set  mips32r2
+                                 # CHECK-NEXT: rdhwr $sp, $11
+                                 # CHECK-NEXT: .set  pop         # encoding: [0x7c,0x1d,0x58,0x3b]
         rint.s $f2, $f4          # CHECK: rint.s $f2, $f4        # encoding: [0x46,0x00,0x20,0x9a]
         rint.d $f2, $f4          # CHECK: rint.d $f2, $f4        # encoding: [0x46,0x20,0x20,0x9a]
         class.s $f2, $f4         # CHECK: class.s $f2, $f4       # encoding: [0x46,0x00,0x20,0x9b]
@@ -171,3 +186,15 @@
         sdbbp     34             # CHECK: sdbbp 34               # encoding: [0x00,0x00,0x08,0x8e]
         sync                     # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
         sync    1                # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
+        teq     $0,$3            # CHECK: teq $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+        teq     $5,$7,620        # CHECK: teq $5, $7, 620        # encoding: [0x00,0xa7,0x9b,0x34]
+        tge     $7,$10           # CHECK: tge $7, $10            # encoding: [0x00,0xea,0x00,0x30]
+        tge     $5,$19,340       # CHECK: tge $5, $19, 340       # encoding: [0x00,0xb3,0x55,0x30]
+        tgeu    $22,$28          # CHECK: tgeu $22, $gp          # encoding: [0x02,0xdc,0x00,0x31]
+        tgeu    $20,$14,379      # CHECK: tgeu $20, $14, 379     # encoding: [0x02,0x8e,0x5e,0xf1]
+        tlt     $15,$13          # CHECK: tlt $15, $13           # encoding: [0x01,0xed,0x00,0x32]
+        tlt     $2,$19,133       # CHECK: tlt $2, $19, 133       # encoding: [0x00,0x53,0x21,0x72]
+        tltu    $11,$16          # CHECK: tltu $11, $16          # encoding: [0x01,0x70,0x00,0x33]
+        tltu    $16,$29,1016     # CHECK: tltu $16, $sp, 1016    # encoding: [0x02,0x1d,0xfe,0x33]
+        tne     $6,$17           # CHECK: tne $6, $17            # encoding: [0x00,0xd1,0x00,0x36]
+        tne     $7,$8,885        # CHECK: tne $7, $8, 885        # encoding: [0x00,0xe8,0xdd,0x76]

diff --git a/test/MC/Mips/mips_abi_flags_xx.s b/test/MC/Mips/mips_abi_flags_xx.s
index 1d65e99..cd6c9de 100644
--- a/test/MC/Mips/mips_abi_flags_xx.s
+++ b/test/MC/Mips/mips_abi_flags_xx.s

@@ -3,32 +3,40 @@
 #
 # RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
-# RUN:     FileCheck %s -check-prefix=CHECK-OBJ
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ -check-prefix=CHECK-OBJ-R1
+
+# RUN: llvm-mc /dev/null -arch=mips -mcpu=mips32 -mattr=fpxx -filetype=obj -o - | \
+# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ -check-prefix=CHECK-OBJ-R1
+
+# RUN: llvm-mc /dev/null -arch=mips -mcpu=mips32r6 -mattr=fpxx -filetype=obj -o - | \
+# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ -check-prefix=CHECK-OBJ-R6
 
 # CHECK-ASM: .module fp=xx
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ:  Section {
-# CHECK-OBJ:    Index: 5
-# CHECK-OBJ:    Name: .MIPS.abiflags (12)
-# CHECK-OBJ:    Type:  (0x7000002A)
-# CHECK-OBJ:     Flags [ (0x2)
-# CHECK-OBJ:      SHF_ALLOC (0x2)
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    Address: 0x0
-# CHECK-OBJ:    Offset: 0x50
-# CHECK-OBJ:    Size: 24
-# CHECK-OBJ:    Link: 0
-# CHECK-OBJ:    Info: 0
-# CHECK-OBJ:    AddressAlignment: 8
-# CHECK-OBJ:    EntrySize: 0
-# CHECK-OBJ:    Relocations [
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    SectionData (
-# CHECK-OBJ:      0000: 00002001 01010005 00000000 00000000  |.. .............|
-# CHECK-OBJ:      0010: 00000000 00000000                    |........|
-# CHECK-OBJ:    )
-# CHECK-OBJ:  }
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags (12)
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ-R1:        0000: 00002001 01010005 00000000 00000000  |.. .............|
+# CHECK-OBJ-R6:        0000: 00002006 01010005 00000000 00000000  |.. .............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
 
         .module fp=xx
 

diff --git a/test/MC/Mips/mips_abi_flags_xx_set.s b/test/MC/Mips/mips_abi_flags_xx_set.s
index 56f19d3..a548972 100644
--- a/test/MC/Mips/mips_abi_flags_xx_set.s
+++ b/test/MC/Mips/mips_abi_flags_xx_set.s

@@ -9,27 +9,26 @@
 # CHECK-ASM: .set    fp=64
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ:  Section {
-# CHECK-OBJ:    Index: 5
-# CHECK-OBJ:    Name: .MIPS.abiflags (12)
-# CHECK-OBJ:    Type:  (0x7000002A)
-# CHECK-OBJ:     Flags [ (0x2)
-# CHECK-OBJ:      SHF_ALLOC (0x2)
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    Address: 0x0
-# CHECK-OBJ:    Offset: 0x50
-# CHECK-OBJ:    Size: 24
-# CHECK-OBJ:    Link: 0
-# CHECK-OBJ:    Info: 0
-# CHECK-OBJ:    AddressAlignment: 8
-# CHECK-OBJ:    EntrySize: 0
-# CHECK-OBJ:    Relocations [
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    SectionData (
-# CHECK-OBJ:      0000: 00002001 01010005 00000000 00000000  |.. .............|
-# CHECK-OBJ:      0010: 00000000 00000000                    |........|
-# CHECK-OBJ:    )
-# CHECK-OBJ:  }
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags (12)
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00002001 01010005 00000000 00000000  |.. .............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
 
         .module fp=xx
         .set    fp=64

diff --git a/test/MC/Mips/mips_directives_bad.s b/test/MC/Mips/mips_directives_bad.s
index c823cac..a4512b5 100644
--- a/test/MC/Mips/mips_directives_bad.s
+++ b/test/MC/Mips/mips_directives_bad.s

@@ -2,7 +2,7 @@
 # RUN: not llvm-mc -triple mips-unknown-unknown %s 2>&1 | FileCheck %s
 
     .abicalls should have no operands
-# CHECK:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token in directive
+# CHECK:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token, expected end of statement
 # CHECK-NEXT:    .abicalls should have no operands
 # CHECK-NEXT:              ^
 
@@ -12,48 +12,48 @@
 
 # Blank option operand
     .option 
-# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token in .option directive
+# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token, expected identifier
 # CHECK-NEXT:    .option 
 # CHECK-NEXT:            ^
 
 # Numeric option operand
     .option 2
-# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token in .option directive
+# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token, expected identifier
 # CHECK-NEXT:    .option 2
 # CHECK-NEXT:            ^
 
 # Register option operand
     .option $2
-# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token in .option directive
+# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token, expected identifier
 # CHECK-NEXT:    .option $2
 # CHECK-NEXT:            ^
 
     .option WithBadOption
-# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: warning: unknown option in .option directive
+# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: warning: unknown option, expected 'pic0' or 'pic2'
 # CHECK-NEXT:    .option WithBadOption
 # CHECK-NEXT:            ^
 
     .option pic0,
-# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token in .option pic0 directive
+# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token, expected end of statement
 # CHECK-NEXT:    .option pic0,
 # CHECK-NEXT:                ^
 
     .option pic0,pic2
-# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token in .option pic0 directive
+# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token, expected end of statement
 # CHECK-NEXT:    .option pic0,pic2
 # CHECK-NEXT:                ^
 
     .option pic0 pic2
-# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token in .option pic0 directive
+# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token, expected end of statement
 # CHECK-NEXT:    .option pic0 pic2
 # CHECK-NEXT:                 ^
 
     .option pic2,
-# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token in .option pic2 directive
+# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token, expected end of statement
 # CHECK-NEXT:    .option pic2,
 # CHECK-NEXT:                ^
 
     .option pic2 pic3
-# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token in .option pic2 directive
+# CHECK-NEXT:    :{{[0-9]+}}:{{[0-9]+}}: error: unexpected token, expected end of statement
 # CHECK-NEXT:    .option pic2 pic3
 # CHECK-NEXT:                 ^

diff --git a/test/MC/Mips/msa/abiflags.s b/test/MC/Mips/msa/abiflags.s
index 83b83cc..136c035 100644
--- a/test/MC/Mips/msa/abiflags.s
+++ b/test/MC/Mips/msa/abiflags.s

@@ -9,27 +9,26 @@
 # CHECK-ASM: .set fp=64
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ:  Section {
-# CHECK-OBJ:    Index: 5
-# CHECK-OBJ:    Name: .MIPS.abiflags (12)
-# CHECK-OBJ:    Type:  (0x7000002A)
-# CHECK-OBJ:     Flags [ (0x2)
-# CHECK-OBJ:      SHF_ALLOC (0x2)
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    Address: 0x0
-# CHECK-OBJ:    Offset: 0x50
-# CHECK-OBJ:    Size: 24
-# CHECK-OBJ:    Link: 0
-# CHECK-OBJ:    Info: 0
-# CHECK-OBJ:    AddressAlignment: 8
-# CHECK-OBJ:    EntrySize: 0
-# CHECK-OBJ:    Relocations [
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    SectionData (
-# CHECK-OBJ:      0000: 00002002 01030001 00000000 00000200  |.. .............|
-# CHECK-OBJ:      0010: 00000000 00000000                    |........|
-# CHECK-OBJ:    )
-# CHECK-OBJ:  }
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags (12)
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00002002 01030001 00000000 00000200  |.. .............|
+# CHECK-OBJ:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
 
         .module fp=32
         .set fp=64

diff --git a/test/MC/Mips/msa/set-msa-directive-bad.s b/test/MC/Mips/msa/set-msa-directive-bad.s
new file mode 100644
index 0000000..02cb9a6
--- /dev/null
+++ b/test/MC/Mips/msa/set-msa-directive-bad.s

@@ -0,0 +1,11 @@
+# RUN: not llvm-mc %s -arch=mips -mcpu=mips32r2 2>%t1
+# RUN: FileCheck %s < %t1
+
+    .set nomsa
+    addvi.b     $w14, $w12, 14 # CHECK: error: instruction requires a CPU feature not currently enabled
+
+    .set msa
+    addvi.h     $w26, $w17, 4 
+    
+    .set nomsa
+    addvi.w     $w19, $w13, 11 # CHECK: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/msa/set-msa-directive.s b/test/MC/Mips/msa/set-msa-directive.s
new file mode 100644
index 0000000..461ddba
--- /dev/null
+++ b/test/MC/Mips/msa/set-msa-directive.s

@@ -0,0 +1,22 @@
+# RUN: llvm-mc %s -arch=mips -mcpu=mips32r2 | FileCheck %s
+
+# CHECK:    .set msa
+# CHECK:    addvi.b     $w14, $w12, 14
+# CHECK:    addvi.h     $w26, $w17, 4
+# CHECK:    addvi.w     $w19, $w13, 11
+# CHECK:    addvi.d     $w16, $w19, 7    
+# CHECK:    subvi.b     $w14, $w12, 14
+# CHECK:    subvi.h     $w26, $w17, 4
+# CHECK:    subvi.w     $w19, $w13, 11
+# CHECK:    subvi.d     $w16, $w19, 7
+
+    .set msa
+    addvi.b     $w14, $w12, 14
+    addvi.h     $w26, $w17, 4
+    addvi.w     $w19, $w13, 11
+    addvi.d     $w16, $w19, 7
+    
+    subvi.b     $w14, $w12, 14
+    subvi.h     $w26, $w17, 4
+    subvi.w     $w19, $w13, 11
+    subvi.d     $w16, $w19, 7

diff --git a/test/MC/Mips/nacl-mask.s b/test/MC/Mips/nacl-mask.s
index 22286ac..c776460 100644
--- a/test/MC/Mips/nacl-mask.s
+++ b/test/MC/Mips/nacl-mask.s

@@ -252,10 +252,10 @@
         jalr $t9
         addiu $4, $zero, 5
 
-# CHECK-LABEL:   test5:
 
+# CHECK:             nop
 # CHECK-NEXT:        nop
-# CHECK-NEXT:        nop
+# CHECK-LABEL:       test5:
 # CHECK-NEXT:        jal
 # CHECK-NEXT:        addiu   $4, $zero, 1
 
@@ -301,10 +301,11 @@
         jalr $t9
         sw      $sp, 0($sp)
 
-# CHECK-LABEL:   test6:
 
+
+# CHECK:             nop
 # CHECK-NEXT:        nop
-# CHECK-NEXT:        nop
+# CHECK-LABEL:       test6:
 # CHECK-NEXT:        jal
 # CHECK-NEXT:        sw      $4, 0($sp)
 

diff --git a/test/MC/Mips/nooddspreg-cmdarg.s b/test/MC/Mips/nooddspreg-cmdarg.s
index 826db12..52b040e 100644
--- a/test/MC/Mips/nooddspreg-cmdarg.s
+++ b/test/MC/Mips/nooddspreg-cmdarg.s

@@ -14,27 +14,26 @@
 # CHECK-ASM-NOT: .module nooddspreg
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ:  Section {
-# CHECK-OBJ:    Index: 5
-# CHECK-OBJ:    Name: .MIPS.abiflags (12)
-# CHECK-OBJ:    Type:  (0x7000002A)
-# CHECK-OBJ:     Flags [ (0x2)
-# CHECK-OBJ:      SHF_ALLOC (0x2)
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    Address: 0x0
-# CHECK-OBJ:    Offset: 0x50
-# CHECK-OBJ:    Size: 24
-# CHECK-OBJ:    Link: 0
-# CHECK-OBJ:    Info: 0
-# CHECK-OBJ:    AddressAlignment: 8
-# CHECK-OBJ:    EntrySize: 0
-# CHECK-OBJ:    Relocations [
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    SectionData (
-# CHECK-OBJ:      0000: 00002001 01020007 00000000 00000000  |.. .............|
-# CHECK-OBJ:      0010: 00000000 00000000                    |........|
-# CHECK-OBJ:    )
-# CHECK-OBJ:  }
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags (12)
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00002001 01020007 00000000 00000000  |.. .............|
+# CHECK-OBJ:           0010: 00000000 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
 
 # INVALID: ERROR: -mno-odd-spreg requires the O32 ABI
 

diff --git a/test/MC/Mips/nooddspreg.s b/test/MC/Mips/nooddspreg.s
index 5a283f5..f268ef4 100644
--- a/test/MC/Mips/nooddspreg.s
+++ b/test/MC/Mips/nooddspreg.s

@@ -14,27 +14,26 @@
 # CHECK-ASM: .module nooddspreg
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ:  Section {
-# CHECK-OBJ:    Index: 5
-# CHECK-OBJ:    Name: .MIPS.abiflags (12)
-# CHECK-OBJ:    Type:  (0x7000002A)
-# CHECK-OBJ:     Flags [ (0x2)
-# CHECK-OBJ:      SHF_ALLOC (0x2)
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    Address: 0x0
-# CHECK-OBJ:    Offset: 0x50
-# CHECK-OBJ:    Size: 24
-# CHECK-OBJ:    Link: 0
-# CHECK-OBJ:    Info: 0
-# CHECK-OBJ:    AddressAlignment: 8
-# CHECK-OBJ:    EntrySize: 0
-# CHECK-OBJ:    Relocations [
-# CHECK-OBJ:    ]
-# CHECK-OBJ:    SectionData (
-# CHECK-OBJ:      0000: 00002001 01020007 00000000 00000000  |.. .............|
-# CHECK-OBJ:      0010: 00000000 00000000                    |........|
-# CHECK-OBJ:    )
-# CHECK-OBJ:  }
+# CHECK-OBJ:       Section {
+# CHECK-OBJ:         Index: 5
+# CHECK-OBJ-LABEL:   Name: .MIPS.abiflags (12)
+# CHECK-OBJ:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ:          Flags [ (0x2)
+# CHECK-OBJ:           SHF_ALLOC (0x2)
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         Address: 0x0
+# CHECK-OBJ:         Size: 24
+# CHECK-OBJ:         Link: 0
+# CHECK-OBJ:         Info: 0
+# CHECK-OBJ:         AddressAlignment: 8
+# CHECK-OBJ:         EntrySize: 24
+# CHECK-OBJ:         Relocations [
+# CHECK-OBJ:         ]
+# CHECK-OBJ:         SectionData (
+# CHECK-OBJ:           0000: 00002001 01020007 00000000 00000000  |.. .............|
+# CHECK-OBJ:           0010: 00000000 00000000                    |........|
+# CHECK-OBJ:         )
+# CHECK-OBJ-LABEL: }
 
 # INVALID: '.module nooddspreg' requires the O32 ABI
 

diff --git a/test/MC/Mips/octeon-instructions.s b/test/MC/Mips/octeon-instructions.s
index b7c89b4..2922744 100644
--- a/test/MC/Mips/octeon-instructions.s
+++ b/test/MC/Mips/octeon-instructions.s

@@ -35,6 +35,10 @@
 # CHECK: sne   $23, $23, $20          # encoding: [0x72,0xf4,0xb8,0x2b]
 # CHECK: snei  $4, $16, -313          # encoding: [0x72,0x04,0xb1,0xef]
 # CHECK: snei  $26, $26, 511          # encoding: [0x73,0x5a,0x7f,0xef]
+# CHECK: sync  2                      # encoding: [0x00,0x00,0x00,0x8f]
+# CHECK: sync  6                      # encoding: [0x00,0x00,0x01,0x8f]
+# CHECK: sync  4                      # encoding: [0x00,0x00,0x01,0x0f]
+# CHECK: sync  5                      # encoding: [0x00,0x00,0x01,0x4f]
 # CHECK: v3mulu $21, $10, $21         # encoding: [0x71,0x55,0xa8,0x11]
 # CHECK: v3mulu $20, $20, $10         # encoding: [0x72,0x8a,0xa0,0x11]
 # CHECK: vmm0  $3, $19, $16           # encoding: [0x72,0x70,0x18,0x10]
@@ -77,6 +81,10 @@
   sne   $23, $20
   snei  $4, $16, -313
   snei  $26, 511
+  synciobdma
+  syncs
+  syncw
+  syncws
   v3mulu $21, $10, $21
   v3mulu $20, $10
   vmm0  $3, $19, $16

diff --git a/test/MC/Mips/oddspreg.s b/test/MC/Mips/oddspreg.s
index f5aa9c0..32ba9e0 100644
--- a/test/MC/Mips/oddspreg.s
+++ b/test/MC/Mips/oddspreg.s

@@ -15,38 +15,51 @@
 # RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 | \
 # RUN:   FileCheck %s -check-prefix=CHECK-ASM
 #
-# RUN: llvm-mc %s -arch=mips64 -mcpu=mips64 -filetype=obj -o - | \
+# Repeat the -filetype=obj tests but this time use an empty assembly file. The
+# output should be unchanged.
+# RUN: llvm-mc /dev/null -arch=mips64 -mcpu=mips64 -filetype=obj -o - | \
+# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-N64
+
+# RUN: llvm-mc /dev/null -arch=mips -mcpu=mips32 -mattr=+fp64 -filetype=obj -o - | \
+# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-O32
+#
+# RUN: llvm-mc /dev/null -arch=mips64 -mcpu=mips64 -mattr=-n64,+n32 -filetype=obj -o - | \
+# RUN:   llvm-readobj -sections -section-data -section-relocations - | \
+# RUN:     FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-N32
+
+# RUN: llvm-mc /dev/null -arch=mips64 -mcpu=mips64 -filetype=obj -o - | \
 # RUN:   llvm-readobj -sections -section-data -section-relocations - | \
 # RUN:     FileCheck %s -check-prefix=CHECK-OBJ-ALL -check-prefix=CHECK-OBJ-N64
 
 # CHECK-ASM: .module oddspreg
 
 # Checking if the Mips.abiflags were correctly emitted.
-# CHECK-OBJ-ALL:  Section {
-# CHECK-OBJ-ALL:    Index: 5
-# CHECK-OBJ-ALL:    Name: .MIPS.abiflags ({{[0-9]+}})
-# CHECK-OBJ-ALL:    Type:  (0x7000002A)
-# CHECK-OBJ-ALL:     Flags [ (0x2)
-# CHECK-OBJ-ALL:      SHF_ALLOC (0x2)
-# CHECK-OBJ-ALL:    ]
-# CHECK-OBJ-ALL:    Address: 0x0
-# CHECK-OBJ-ALL:    Offset: 0x{{[0-9A-F]+}}
-# CHECK-OBJ-ALL:    Size: 24
-# CHECK-OBJ-ALL:    Link: 0
-# CHECK-OBJ-ALL:    Info: 0
-# CHECK-OBJ-ALL:    AddressAlignment: 8
-# CHECK-OBJ-ALL:    EntrySize: 0
-# CHECK-OBJ-ALL:    Relocations [
-# CHECK-OBJ-ALL:    ]
-# CHECK-OBJ-ALL:    SectionData (
-# CHECK-OBJ-O32:      0000: 00002001 01020006 00000000 00000000  |.. .............|
-# CHECK-OBJ-O32:      0010: 00000001 00000000                    |........|
-# CHECK-OBJ-N32:      0000: 00004001 02020001 00000000 00000000  |..@.............|
-# CHECK-OBJ-N32:      0010: 00000001 00000000                    |........|
-# CHECK-OBJ-N64:      0000: 00004001 02020001 00000000 00000000  |..@.............|
-# CHECK-OBJ-N64:      0010: 00000001 00000000                    |........|
-# CHECK-OBJ-ALL:    )
-# CHECK-OBJ-ALL:  }
+# CHECK-OBJ-ALL:       Section {
+# CHECK-OBJ-ALL:         Index: 5
+# CHECK-OBJ-ALL-LABEL:   Name: .MIPS.abiflags ({{[0-9]+}})
+# CHECK-OBJ-ALL:         Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+# CHECK-OBJ-ALL:          Flags [ (0x2)
+# CHECK-OBJ-ALL:           SHF_ALLOC (0x2)
+# CHECK-OBJ-ALL:         ]
+# CHECK-OBJ-ALL:         Address: 0x0
+# CHECK-OBJ-ALL:         Size: 24
+# CHECK-OBJ-ALL:         Link: 0
+# CHECK-OBJ-ALL:         Info: 0
+# CHECK-OBJ-ALL:         AddressAlignment: 8
+# CHECK-OBJ-ALL:         EntrySize: 24
+# CHECK-OBJ-ALL:         Relocations [
+# CHECK-OBJ-ALL:         ]
+# CHECK-OBJ-ALL:         SectionData (
+# CHECK-OBJ-O32:           0000: 00002001 01020006 00000000 00000000  |.. .............|
+# CHECK-OBJ-O32:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ-N32:           0000: 00004001 02020001 00000000 00000000  |..@.............|
+# CHECK-OBJ-N32:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ-N64:           0000: 00004001 02020001 00000000 00000000  |..@.............|
+# CHECK-OBJ-N64:           0010: 00000001 00000000                    |........|
+# CHECK-OBJ-ALL:         )
+# CHECK-OBJ-ALL-LABEL: }
 
         .module oddspreg
         add.s $f3, $f1, $f5

diff --git a/test/MC/Mips/set-arch.s b/test/MC/Mips/set-arch.s
new file mode 100644
index 0000000..6267468
--- /dev/null
+++ b/test/MC/Mips/set-arch.s

@@ -0,0 +1,55 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32 | \
+# RUN:   FileCheck %s
+
+    .text
+    .set arch=mips1
+    add         $2, $2, $2
+    .set arch=mips2
+    ll          $2, -2($2)
+    .set arch=mips3
+    dadd        $2, $2, $2
+    .set arch=mips4
+    ldxc1       $f8, $2($4)
+    .set arch=mips5
+    luxc1       $f19, $2($4)
+    .set arch=mips32
+    clo         $2, $2
+    .set arch=mips32r2
+    rotr        $2, $2, 15
+    .set arch=mips32r6
+    mod         $2, $4, $6
+    .set arch=mips64
+    daddi       $2, $2, 10
+    .set arch=mips64r2
+    drotr32     $1, $14, 15
+    .set arch=mips64r6
+    mod         $2, $4, $6
+    .set arch=cnmips
+    .set arch=r4000
+    dadd        $2, $2, $2
+
+# CHECK: .set arch=mips1
+# CHECK: add         $2, $2, $2
+# CHECK: .set arch=mips2
+# CHECK: ll          $2, -2($2)
+# CHECK: .set arch=mips3
+# CHECK: dadd        $2, $2, $2
+# CHECK: .set arch=mips4
+# CHECK: ldxc1       $f8, $2($4)
+# CHECK: .set arch=mips5
+# CHECK: luxc1       $f19, $2($4)
+# CHECK: .set arch=mips32
+# CHECK: clo         $2, $2
+# CHECK: .set arch=mips32r2
+# CHECK: rotr        $2, $2, 15
+# CHECK: .set arch=mips32r6
+# CHECK: mod         $2, $4, $6
+# CHECK: .set arch=mips64
+# CHECK: daddi       $2, $2, 10
+# CHECK: .set arch=mips64r2
+# CHECK: drotr32     $1, $14, 15
+# CHECK: .set arch=mips64r6
+# CHECK: mod         $2, $4, $6
+# CHECK: .set arch=cnmips
+# CHECK: .set arch=r4000
+# CHECK: dadd        $2, $2, $2

diff --git a/test/MC/Mips/set-at-directive-explicit-at.s b/test/MC/Mips/set-at-directive-explicit-at.s
index 1bd26ff..797a2b7 100644
--- a/test/MC/Mips/set-at-directive-explicit-at.s
+++ b/test/MC/Mips/set-at-directive-explicit-at.s

@@ -7,15 +7,15 @@
     .text
 foo:
 # CHECK:   jr    $1                      # encoding: [0x08,0x00,0x20,0x00]
-# WARNINGS: :[[@LINE+2]]:11: warning: Used $at without ".set noat"
+# WARNINGS: :[[@LINE+2]]:11: warning: used $at without ".set noat"
     .set    at=$1
     jr    $at
 
 # CHECK:   jr    $1                      # encoding: [0x08,0x00,0x20,0x00]
-# WARNINGS: :[[@LINE+2]]:11: warning: Used $at without ".set noat"
+# WARNINGS: :[[@LINE+2]]:11: warning: used $at without ".set noat"
     .set    at=$1
     jr    $1
-# WARNINGS-NOT: warning: Used $at without ".set noat"
+# WARNINGS-NOT: warning: used $at without ".set noat"
 
 # CHECK:   jr    $1                      # encoding: [0x08,0x00,0x20,0x00]
     .set    at=$2
@@ -31,12 +31,12 @@
     jr    $at
 
 # CHECK:   jr    $16                     # encoding: [0x08,0x00,0x00,0x02]
-# WARNINGS: :[[@LINE+2]]:11: warning: Used $16 with ".set at=$16"
+# WARNINGS: :[[@LINE+2]]:11: warning: used $16 with ".set at=$16"
     .set    at=$16
     jr    $s0
 
 # CHECK:   jr    $16                     # encoding: [0x08,0x00,0x00,0x02]
-# WARNINGS: :[[@LINE+2]]:11: warning: Used $16 with ".set at=$16"
+# WARNINGS: :[[@LINE+2]]:11: warning: used $16 with ".set at=$16"
     .set    at=$16
     jr    $16
 # WARNINGS-NOT: warning

diff --git a/test/MC/Mips/set-mips-directives-bad.s b/test/MC/Mips/set-mips-directives-bad.s
new file mode 100644
index 0000000..6726987
--- /dev/null
+++ b/test/MC/Mips/set-mips-directives-bad.s

@@ -0,0 +1,30 @@
+# RUN: not llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips1 2>%t1
+# RUN: FileCheck %s < %t1
+
+# FIXME: At the moment we emit the wrong error message if we try to assemble the
+# ll instruction using an unsupported architecture, so we just check for "error"
+# and ignore the rest of the message.
+
+        .text
+        .set noreorder
+        .set mips1
+        ll  $2,-2($2) # CHECK: error:
+        .set mips2
+        dadd $2,$2,$2 # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips3
+        ldxc1 $f8,$2($4) # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips4
+        luxc1 $f19,$2($4) # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips5
+        clo  $2,$2 # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips32
+        rotr    $2,15 # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips32r2
+        mod $2, $4, $6 # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips32r6
+        daddi $2, $2, 10 # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips64
+        drotr32 $1,$14,15 # CHECK: error: instruction requires a CPU feature not currently enabled
+        .set mips64r2
+        mod $2, $4, $6 # CHECK: error: instruction requires a CPU feature not currently enabled
+

diff --git a/test/MC/Mips/set-mips-directives.s b/test/MC/Mips/set-mips-directives.s
new file mode 100644
index 0000000..96c2308
--- /dev/null
+++ b/test/MC/Mips/set-mips-directives.s

@@ -0,0 +1,51 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips1 | \
+# RUN:   FileCheck %s
+
+        .text
+        .set noreorder
+        .set mips1
+        add $2, $2, $2
+        .set mips2
+        ll  $2,-2($2)
+        .set mips3
+        dadd $2,$2,$2
+        .set mips4
+        ldxc1 $f8,$2($4)
+        .set mips5
+        luxc1 $f19,$2($4)
+        .set mips32
+        clo  $2,$2
+        .set mips32r2
+        rotr    $2,15
+        .set mips32r6
+        mod $2, $4, $6
+        .set mips64
+        daddi $2, $2, 10
+        .set mips64r2
+        drotr32 $1,$14,15
+        .set mips64r6
+        mod $2, $4, $6
+
+# CHECK: .set noreorder
+# CHECK: .set mips1
+# CHECK: add $2, $2, $2
+# CHECK: .set mips2
+# CHECK: ll  $2, -2($2)
+# CHECK: .set mips3
+# CHECK: dadd $2, $2, $2
+# CHECK: .set mips4
+# CHECK: ldxc1 $f8, $2($4)
+# CHECK: .set mips5
+# CHECK: luxc1 $f19, $2($4)
+# CHECK: .set mips32
+# CHECK: clo $2, $2
+# CHECK: .set mips32r2
+# CHECK: rotr $2, $2, 15
+# CHECK: .set mips32r6
+# CHECK: mod $2, $4, $6
+# CHECK: .set mips64
+# CHECK: daddi $2, $2, 10
+# CHECK: .set mips64r2
+# CHECK:  drotr32 $1, $14, 15
+# CHECK: .set mips64r6
+# CHECK: mod $2, $4, $6

diff --git a/test/MC/Mips/set-mips0-directive.s b/test/MC/Mips/set-mips0-directive.s
new file mode 100644
index 0000000..5cb75bb
--- /dev/null
+++ b/test/MC/Mips/set-mips0-directive.s

@@ -0,0 +1,27 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 | \
+# RUN:   FileCheck %s
+
+    .text
+    rotr  $7, $7, 22
+
+    .set mips32r6
+    mod   $2, $4, $6
+    .set mips0
+    rotr  $2, $2, 15
+
+    .set mips3
+    dadd  $4, $4, $4
+    .set mips0
+    rotr  $3, $3, 19
+
+# CHECK: rotr  $7, $7, 22
+
+# CHECK: .set mips32r6
+# CHECK: mod   $2, $4, $6
+# CHECK: .set mips0
+# CHECK: rotr  $2, $2, 15
+
+# CHECK: .set mips3
+# CHECK: dadd  $4, $4, $4
+# CHECK: .set mips0
+# CHECK: rotr  $3, $3, 19

diff --git a/test/MC/Mips/set-mips16-directive.s b/test/MC/Mips/set-mips16-directive.s
new file mode 100644
index 0000000..cf8090e
--- /dev/null
+++ b/test/MC/Mips/set-mips16-directive.s

@@ -0,0 +1,10 @@
+# RUN: llvm-mc %s -arch=mips | FileCheck %s
+# FIXME: Update this test when we have a more mature implementation of Mips16 in the IAS.
+
+.text
+.set mips16
+.set nomips16
+
+# CHECK: .text
+# CHECK: .set mips16
+# CHECK: .set nomips16

diff --git a/test/MC/Mips/set-nodsp.s b/test/MC/Mips/set-nodsp.s
new file mode 100644
index 0000000..f98cefb
--- /dev/null
+++ b/test/MC/Mips/set-nodsp.s

@@ -0,0 +1,12 @@
+# RUN: not llvm-mc %s -mcpu=mips32 -mattr=+dsp -triple mips-unknown-linux 2>%t1
+# RUN: FileCheck %s < %t1
+
+  lbux    $7, $10($11)
+
+  .set nodsp
+  lbux    $6, $10($11)
+  # CHECK: error: instruction requires a CPU feature not currently enabled
+
+  .set dsp
+  lbux    $5, $10($11)
+  # CHECK-NOT: error: instruction requires a CPU feature not currently enabled

diff --git a/test/MC/Mips/set-push-pop-directives-bad.s b/test/MC/Mips/set-push-pop-directives-bad.s
new file mode 100644
index 0000000..53d8b23
--- /dev/null
+++ b/test/MC/Mips/set-push-pop-directives-bad.s

@@ -0,0 +1,14 @@
+# RUN: not llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 2>%t1
+# RUN:   FileCheck %s < %t1
+
+        .text
+        .set pop
+# CHECK: :[[@LINE-1]]:14: error: .set pop with no .set push
+        .set push
+        .set pop
+        .set pop
+# CHECK: :[[@LINE-1]]:14: error: .set pop with no .set push
+        .set push foo
+# CHECK: :[[@LINE-1]]:19: error: unexpected token, expected end of statement
+        .set pop bar
+# CHECK: :[[@LINE-1]]:18: error: unexpected token, expected end of statement

diff --git a/test/MC/Mips/set-push-pop-directives.s b/test/MC/Mips/set-push-pop-directives.s
new file mode 100644
index 0000000..5f55b7c
--- /dev/null
+++ b/test/MC/Mips/set-push-pop-directives.s

@@ -0,0 +1,53 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa | \
+# RUN:   FileCheck %s
+# .set push creates a copy of the current environment.
+# .set pop restores the previous environment.
+# FIXME: Also test resetting of .set macro/nomacro option.
+
+    .text
+    # The first environment on the stack (with initial values).
+    lw       $1, 65536($1)
+    b        1336
+    addvi.b  $w15, $w13, 18
+    
+    # Create a new environment.
+    .set push
+    .set at=$ra           # Test the ATReg option.
+    lw       $1, 65536($1)
+    .set noreorder        # Test the Reorder option.
+    b        1336
+    .set nomsa            # Test the Features option (ASE).
+    .set mips32r6         # Test the Features option (ISA).
+    mod      $2, $4, $6
+
+    # Switch back to the first environment.
+    .set pop
+    lw       $1, 65536($1)
+    b        1336
+    addvi.b  $w15, $w13, 18
+
+# CHECK:  lui      $1, 1
+# CHECK:  addu     $1, $1, $1
+# CHECK:  lw       $1, 0($1)
+# CHECK:  b        1336
+# CHECK:  nop
+# CHECK:  addvi.b  $w15, $w13, 18
+
+# CHECK:  .set push
+# CHECK:  lui      $ra, 1
+# CHECK:  addu     $ra, $ra, $1
+# CHECK:  lw       $1, 0($ra)
+# CHECK:  .set noreorder   
+# CHECK:  b        1336
+# CHECK-NOT:  nop
+# CHECK:  .set nomsa       
+# CHECK:  .set mips32r6    
+# CHECK:  mod      $2, $4, $6
+
+# CHECK:  .set pop
+# CHECK:  lui      $1, 1
+# CHECK:  addu     $1, $1, $1
+# CHECK:  lw       $1, 0($1)
+# CHECK:  b        1336
+# CHECK:  nop
+# CHECK:  addvi.b  $w15, $w13, 18

diff --git a/test/MC/Mips/unaligned-nops.s b/test/MC/Mips/unaligned-nops.s
new file mode 100644
index 0000000..ebbbb40
--- /dev/null
+++ b/test/MC/Mips/unaligned-nops.s

@@ -0,0 +1,4 @@
+# RUN: llvm-mc -filetype=obj  -triple=mipsel %s -o %t
+.byte 1
+.p2align 2
+foo:

diff --git a/test/MC/PowerPC/lcomm.s b/test/MC/PowerPC/lcomm.s
new file mode 100644
index 0000000..a84f138
--- /dev/null
+++ b/test/MC/PowerPC/lcomm.s

@@ -0,0 +1,21 @@
+# RUN: llvm-mc -triple powerpc-unknown-unknown -filetype=obj %s | \
+# RUN: llvm-readobj -s -sd | FileCheck %s
+# RUN: llvm-mc -triple powerpc64-unknown-unknown -filetype=obj %s | \
+# RUN: llvm-readobj -s -sd | FileCheck %s
+
+.lcomm foo, 16, 16
+
+// CHECK:        Section {
+// CHECK:          Name: .bss
+// CHECK-NEXT:     Type: SHT_NOBITS
+// CHECK-NEXT:     Flags [
+// CHECK-NEXT:       SHF_ALLOC
+// CHECK-NEXT:       SHF_WRITE
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Address: 0x0
+// CHECK-NEXT:     Offset: 0x40
+// CHECK-NEXT:     Size: 16
+// CHECK-NEXT:     Link: 0
+// CHECK-NEXT:     Info: 0
+// CHECK-NEXT:     AddressAlignment: 16
+// CHECK-NEXT:     EntrySize: 0

diff --git a/test/MC/PowerPC/ppc-reloc.s b/test/MC/PowerPC/ppc-reloc.s
new file mode 100644
index 0000000..19dd2a3
--- /dev/null
+++ b/test/MC/PowerPC/ppc-reloc.s

@@ -0,0 +1,17 @@
+# RUN: llvm-mc -triple=powerpc-unknown-linux-gnu -filetype=obj %s | \
+# RUN: llvm-readobj -r | FileCheck %s
+	.section .text
+
+	.globl foo
+	.type foo,@function
+	.align 2
+foo:
+	bl printf@plt
+.LC1:
+	.size foo, . - foo
+
+# CHECK:      Relocations [
+# CHECK-NEXT:   Section (2) .rela.text {
+# CHECK-NEXT:     0x0 R_PPC_PLTREL24 printf 0x0
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]

diff --git a/test/MC/PowerPC/ppc32-ba.s b/test/MC/PowerPC/ppc32-ba.s
new file mode 100644
index 0000000..133423b
--- /dev/null
+++ b/test/MC/PowerPC/ppc32-ba.s

@@ -0,0 +1,6 @@
+# RUN: llvm-mc -triple powerpc-unknown-unknown --show-encoding %s | FileCheck %s
+
+# Check that large immediates in 32-bit mode are accepted.
+
+# CHECK: ba -33554432 # encoding: [0x4a,0x00,0x00,0x02]
+         ba 0xfe000000

diff --git a/test/MC/PowerPC/ppc64-abiversion.s b/test/MC/PowerPC/ppc64-abiversion.s
new file mode 100644
index 0000000..d2970f8
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-abiversion.s

@@ -0,0 +1,9 @@
+
+# RUN: llvm-mc -triple powerpc64-unknown-unknown -filetype=obj %s | \
+# RUN: llvm-readobj -h | FileCheck %s
+# RUN: llvm-mc -triple powerpc64le-unknown-unknown -filetype=obj %s | \
+# RUN: llvm-readobj -h | FileCheck %s
+
+	.abiversion 2
+# CHECK: Flags [ (0x2)
+

diff --git a/test/MC/PowerPC/ppc64-encoding-4xx.s b/test/MC/PowerPC/ppc64-encoding-4xx.s
new file mode 100644
index 0000000..5414e1a
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-encoding-4xx.s

@@ -0,0 +1,167 @@
+# RUN: llvm-mc -triple powerpc64-unknown-unknown --show-encoding %s | FileCheck -check-prefix=CHECK-BE %s
+# RUN: llvm-mc -triple powerpc64le-unknown-unknown --show-encoding %s | FileCheck -check-prefix=CHECK-LE %s
+
+# Instructions specific to the PowerPC 4xx embedded controllers:
+
+# CHECK-BE: mfdcr 3, 178                     # encoding: [0x7c,0x72,0x2a,0x86]
+# CHECK-LE: mfdcr 3, 178                     # encoding: [0x86,0x2a,0x72,0x7c]
+            mfdcr 3,178
+# CHECK-BE: mtdcr 178, 3                     # encoding: [0x7c,0x72,0x2b,0x86]
+# CHECK-LE: mtdcr 178, 3                     # encoding: [0x86,0x2b,0x72,0x7c]
+            mtdcr 178,3
+
+# CHECK-BE: tlbre 2, 3, 0                    # encoding: [0x7c,0x43,0x07,0x64]
+# CHECK-LE: tlbre 2, 3, 0                    # encoding: [0x64,0x07,0x43,0x7c]
+            tlbre %r2, %r3, 0
+# CHECK-BE: tlbre 2, 3, 1                    # encoding: [0x7c,0x43,0x0f,0x64]
+# CHECK-LE: tlbre 2, 3, 1                    # encoding: [0x64,0x0f,0x43,0x7c]
+            tlbre %r2, %r3, 1
+# CHECK-BE: tlbre 2, 3, 0                    # encoding: [0x7c,0x43,0x07,0x64]
+# CHECK-LE: tlbre 2, 3, 0                    # encoding: [0x64,0x07,0x43,0x7c]
+            tlbrehi %r2, %r3
+# CHECK-BE: tlbre 2, 3, 1                    # encoding: [0x7c,0x43,0x0f,0x64]
+# CHECK-LE: tlbre 2, 3, 1                    # encoding: [0x64,0x0f,0x43,0x7c]
+            tlbrelo %r2, %r3
+
+# CHECK-BE: tlbwe 2, 3, 0                    # encoding: [0x7c,0x43,0x07,0xa4]
+# CHECK-LE: tlbwe 2, 3, 0                    # encoding: [0xa4,0x07,0x43,0x7c]
+            tlbwe %r2, %r3, 0
+# CHECK-BE: tlbwe 2, 3, 1                    # encoding: [0x7c,0x43,0x0f,0xa4]
+# CHECK-LE: tlbwe 2, 3, 1                    # encoding: [0xa4,0x0f,0x43,0x7c]
+            tlbwe %r2, %r3, 1
+# CHECK-BE: tlbwe 2, 3, 0                    # encoding: [0x7c,0x43,0x07,0xa4]
+# CHECK-LE: tlbwe 2, 3, 0                    # encoding: [0xa4,0x07,0x43,0x7c]
+            tlbwehi %r2, %r3
+# CHECK-BE: tlbwe 2, 3, 1                    # encoding: [0x7c,0x43,0x0f,0xa4]
+# CHECK-LE: tlbwe 2, 3, 1                    # encoding: [0xa4,0x0f,0x43,0x7c]
+            tlbwelo %r2, %r3
+
+# CHECK-BE: tlbsx 2, 3, 1                    # encoding: [0x7c,0x43,0x0f,0x24]
+# CHECK-LE: tlbsx 2, 3, 1                    # encoding: [0x24,0x0f,0x43,0x7c]
+            tlbsx %r2, %r3, %r1
+# CHECK-BE: tlbsx. 2, 3, 1                   # encoding: [0x7c,0x43,0x0f,0x25]
+# CHECK-LE: tlbsx. 2, 3, 1                   # encoding: [0x25,0x0f,0x43,0x7c]
+            tlbsx. %r2, %r3, %r1
+
+# CHECK-BE: mfspr 2, 1018                    # encoding: [0x7c,0x5a,0xfa,0xa6]
+# CHECK-LE: mfspr 2, 1018                    # encoding: [0xa6,0xfa,0x5a,0x7c]
+            mfdccr %r2
+# CHECK-BE: mtspr 1018, 2                    # encoding: [0x7c,0x5a,0xfb,0xa6]
+# CHECK-LE: mtspr 1018, 2                    # encoding: [0xa6,0xfb,0x5a,0x7c]
+            mtdccr %r2
+
+# CHECK-BE: mfspr 2, 1019                    # encoding: [0x7c,0x5b,0xfa,0xa6]
+# CHECK-LE: mfspr 2, 1019                    # encoding: [0xa6,0xfa,0x5b,0x7c]
+            mficcr %r2
+# CHECK-BE: mtspr 1019, 2                    # encoding: [0x7c,0x5b,0xfb,0xa6]
+# CHECK-LE: mtspr 1019, 2                    # encoding: [0xa6,0xfb,0x5b,0x7c]
+            mticcr %r2
+
+# CHECK-BE: mfspr 2, 981                    # encoding: [0x7c,0x55,0xf2,0xa6]
+# CHECK-LE: mfspr 2, 981                    # encoding: [0xa6,0xf2,0x55,0x7c]
+            mfdear %r2
+# CHECK-BE: mtspr 981, 2                    # encoding: [0x7c,0x55,0xf3,0xa6]
+# CHECK-LE: mtspr 981, 2                    # encoding: [0xa6,0xf3,0x55,0x7c]
+            mtdear %r2
+
+# CHECK-BE: mfspr 2, 980                    # encoding: [0x7c,0x54,0xf2,0xa6]
+# CHECK-LE: mfspr 2, 980                    # encoding: [0xa6,0xf2,0x54,0x7c]
+            mfesr %r2
+# CHECK-BE: mtspr 980, 2                    # encoding: [0x7c,0x54,0xf3,0xa6]
+# CHECK-LE: mtspr 980, 2                    # encoding: [0xa6,0xf3,0x54,0x7c]
+            mtesr %r2
+
+# CHECK-BE: mfspr 2, 986                    # encoding: [0x7c,0x5a,0xf2,0xa6]
+# CHECK-LE: mfspr 2, 986                    # encoding: [0xa6,0xf2,0x5a,0x7c]
+            mftcr %r2
+# CHECK-BE: mtspr 986, 2                    # encoding: [0x7c,0x5a,0xf3,0xa6]
+# CHECK-LE: mtspr 986, 2                    # encoding: [0xa6,0xf3,0x5a,0x7c]
+            mttcr %r2
+
+# CHECK-BE: mfspr 2, 989                    # encoding: [0x7c,0x5d,0xf2,0xa6]
+# CHECK-LE: mfspr 2, 989                    # encoding: [0xa6,0xf2,0x5d,0x7c]
+            mftblo %r2
+# CHECK-BE: mtspr 989, 2                    # encoding: [0x7c,0x5d,0xf3,0xa6]
+# CHECK-LE: mtspr 989, 2                    # encoding: [0xa6,0xf3,0x5d,0x7c]
+            mttblo %r2
+# CHECK-BE: mfspr 2, 988                    # encoding: [0x7c,0x5c,0xf2,0xa6]
+# CHECK-LE: mfspr 2, 988                    # encoding: [0xa6,0xf2,0x5c,0x7c]
+            mftbhi %r2
+# CHECK-BE: mtspr 988, 2                    # encoding: [0x7c,0x5c,0xf3,0xa6]
+# CHECK-LE: mtspr 988, 2                    # encoding: [0xa6,0xf3,0x5c,0x7c]
+            mttbhi %r2
+
+# CHECK-BE: dccci 5, 6                      # encoding: [0x7c,0x05,0x33,0x8c]
+# CHECK-LE: dccci 5, 6                      # encoding: [0x8c,0x33,0x05,0x7c]
+            dccci %r5,%r6
+# CHECK-BE: iccci 5, 6                      # encoding: [0x7c,0x05,0x37,0x8c]
+# CHECK-LE: iccci 5, 6                      # encoding: [0x8c,0x37,0x05,0x7c]
+            iccci %r5,%r6
+# CHECK-BE: dccci 0, 0                      # encoding: [0x7c,0x00,0x03,0x8c]
+# CHECK-LE: dccci 0, 0                      # encoding: [0x8c,0x03,0x00,0x7c]
+            dci %r0
+# CHECK-BE: iccci 0, 0                      # encoding: [0x7c,0x00,0x07,0x8c]
+# CHECK-LE: iccci 0, 0                      # encoding: [0x8c,0x07,0x00,0x7c]
+            ici 0
+
+# CHECK-BE: mfspr 2, 990                    # encoding: [0x7c,0x5e,0xf2,0xa6]
+# CHECK-LE: mfspr 2, 990                    # encoding: [0xa6,0xf2,0x5e,0x7c]
+            mfsrr2 2
+# CHECK-BE: mtspr 990, 2                    # encoding: [0x7c,0x5e,0xf3,0xa6]
+# CHECK-LE: mtspr 990, 2                    # encoding: [0xa6,0xf3,0x5e,0x7c]
+            mtsrr2 2
+# CHECK-BE: mfspr 2, 991                    # encoding: [0x7c,0x5f,0xf2,0xa6]
+# CHECK-LE: mfspr 2, 991                    # encoding: [0xa6,0xf2,0x5f,0x7c]
+            mfsrr3 2
+# CHECK-BE: mtspr 991, 2                    # encoding: [0x7c,0x5f,0xf3,0xa6]
+# CHECK-LE: mtspr 991, 2                    # encoding: [0xa6,0xf3,0x5f,0x7c]
+            mtsrr3 2
+
+# CHECK-BE: mfdcr 5, 128                    # encoding: [0x7c,0xa0,0x22,0x86]
+# CHECK-LE: mfdcr 5, 128                    # encoding: [0x86,0x22,0xa0,0x7c]
+            mfbr0 %r5
+# CHECK-BE: mtdcr 128, 5                    # encoding: [0x7c,0xa0,0x23,0x86]
+# CHECK-LE: mtdcr 128, 5                    # encoding: [0x86,0x23,0xa0,0x7c]
+            mtbr0 %r5
+# CHECK-BE: mfdcr 5, 129                    # encoding: [0x7c,0xa1,0x22,0x86]
+# CHECK-LE: mfdcr 5, 129                    # encoding: [0x86,0x22,0xa1,0x7c]
+            mfbr1 %r5
+# CHECK-BE: mtdcr 129, 5                    # encoding: [0x7c,0xa1,0x23,0x86]
+# CHECK-LE: mtdcr 129, 5                    # encoding: [0x86,0x23,0xa1,0x7c]
+            mtbr1 %r5
+# CHECK-BE: mfdcr 5, 130                    # encoding: [0x7c,0xa2,0x22,0x86]
+# CHECK-LE: mfdcr 5, 130                    # encoding: [0x86,0x22,0xa2,0x7c]
+            mfbr2 %r5
+# CHECK-BE: mtdcr 130, 5                    # encoding: [0x7c,0xa2,0x23,0x86]
+# CHECK-LE: mtdcr 130, 5                    # encoding: [0x86,0x23,0xa2,0x7c]
+            mtbr2 %r5
+# CHECK-BE: mfdcr 5, 131                    # encoding: [0x7c,0xa3,0x22,0x86]
+# CHECK-LE: mfdcr 5, 131                    # encoding: [0x86,0x22,0xa3,0x7c]
+            mfbr3 %r5
+# CHECK-BE: mtdcr 131, 5                    # encoding: [0x7c,0xa3,0x23,0x86]
+# CHECK-LE: mtdcr 131, 5                    # encoding: [0x86,0x23,0xa3,0x7c]
+            mtbr3 %r5
+# CHECK-BE: mfdcr 5, 132                    # encoding: [0x7c,0xa4,0x22,0x86]
+# CHECK-LE: mfdcr 5, 132                    # encoding: [0x86,0x22,0xa4,0x7c]
+            mfbr4 %r5
+# CHECK-BE: mtdcr 132, 5                    # encoding: [0x7c,0xa4,0x23,0x86]
+# CHECK-LE: mtdcr 132, 5                    # encoding: [0x86,0x23,0xa4,0x7c]
+            mtbr4 %r5
+# CHECK-BE: mfdcr 5, 133                    # encoding: [0x7c,0xa5,0x22,0x86]
+# CHECK-LE: mfdcr 5, 133                    # encoding: [0x86,0x22,0xa5,0x7c]
+            mfbr5 %r5
+# CHECK-BE: mtdcr 133, 5                    # encoding: [0x7c,0xa5,0x23,0x86]
+# CHECK-LE: mtdcr 133, 5                    # encoding: [0x86,0x23,0xa5,0x7c]
+            mtbr5 %r5
+# CHECK-BE: mfdcr 5, 134                    # encoding: [0x7c,0xa6,0x22,0x86]
+# CHECK-LE: mfdcr 5, 134                    # encoding: [0x86,0x22,0xa6,0x7c]
+            mfbr6 %r5
+# CHECK-BE: mtdcr 134, 5                    # encoding: [0x7c,0xa6,0x23,0x86]
+# CHECK-LE: mtdcr 134, 5                    # encoding: [0x86,0x23,0xa6,0x7c]
+            mtbr6 %r5
+# CHECK-BE: mfdcr 5, 135                    # encoding: [0x7c,0xa7,0x22,0x86]
+# CHECK-LE: mfdcr 5, 135                    # encoding: [0x86,0x22,0xa7,0x7c]
+            mfbr7 %r5
+# CHECK-BE: mtdcr 135, 5                    # encoding: [0x7c,0xa7,0x23,0x86]
+# CHECK-LE: mtdcr 135, 5                    # encoding: [0x86,0x23,0xa7,0x7c]
+            mtbr7 %r5

diff --git a/test/MC/PowerPC/ppc64-encoding-6xx.s b/test/MC/PowerPC/ppc64-encoding-6xx.s
new file mode 100644
index 0000000..3a5e7a1
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-encoding-6xx.s

@@ -0,0 +1,109 @@
+# RUN: llvm-mc -triple powerpc64-unknown-unknown --show-encoding %s | FileCheck -check-prefix=CHECK-BE %s
+# RUN: llvm-mc -triple powerpc64le-unknown-unknown --show-encoding %s | FileCheck -check-prefix=CHECK-LE %s
+
+# Instructions specific to the PowerPC 6xx family:
+
+# CHECK-BE: mfspr 12, 528                    # encoding: [0x7d,0x90,0x82,0xa6]
+# CHECK-LE: mfspr 12, 528                    # encoding: [0xa6,0x82,0x90,0x7d]
+mfibatu %r12, 0
+# CHECK-BE: mfspr 12, 529                    # encoding: [0x7d,0x91,0x82,0xa6]
+# CHECK-LE: mfspr 12, 529                    # encoding: [0xa6,0x82,0x91,0x7d]
+mfibatl %r12, 0
+# CHECK-BE: mfspr 12, 530                    # encoding: [0x7d,0x92,0x82,0xa6]
+# CHECK-LE: mfspr 12, 530                    # encoding: [0xa6,0x82,0x92,0x7d]
+mfibatu %r12, 1
+# CHECK-BE: mfspr 12, 531                    # encoding: [0x7d,0x93,0x82,0xa6]
+# CHECK-LE: mfspr 12, 531                    # encoding: [0xa6,0x82,0x93,0x7d]
+mfibatl %r12, 1
+# CHECK-BE: mfspr 12, 532                    # encoding: [0x7d,0x94,0x82,0xa6]
+# CHECK-LE: mfspr 12, 532                    # encoding: [0xa6,0x82,0x94,0x7d]
+mfibatu %r12, 2
+# CHECK-BE: mfspr 12, 533                    # encoding: [0x7d,0x95,0x82,0xa6]
+# CHECK-LE: mfspr 12, 533                    # encoding: [0xa6,0x82,0x95,0x7d]
+mfibatl %r12, 2
+# CHECK-BE: mfspr 12, 534                    # encoding: [0x7d,0x96,0x82,0xa6]
+# CHECK-LE: mfspr 12, 534                    # encoding: [0xa6,0x82,0x96,0x7d]
+mfibatu %r12, 3
+# CHECK-BE: mfspr 12, 535                    # encoding: [0x7d,0x97,0x82,0xa6]
+# CHECK-LE: mfspr 12, 535                    # encoding: [0xa6,0x82,0x97,0x7d]
+mfibatl %r12, 3
+# CHECK-BE: mtspr 528, 12                    # encoding: [0x7d,0x90,0x83,0xa6]
+# CHECK-LE: mtspr 528, 12                    # encoding: [0xa6,0x83,0x90,0x7d]
+mtibatu 0, %r12
+# CHECK-BE: mtspr 529, 12                    # encoding: [0x7d,0x91,0x83,0xa6]
+# CHECK-LE: mtspr 529, 12                    # encoding: [0xa6,0x83,0x91,0x7d]
+mtibatl 0, %r12
+# CHECK-BE: mtspr 530, 12                    # encoding: [0x7d,0x92,0x83,0xa6]
+# CHECK-LE: mtspr 530, 12                    # encoding: [0xa6,0x83,0x92,0x7d]
+mtibatu 1, %r12
+# CHECK-BE: mtspr 531, 12                    # encoding: [0x7d,0x93,0x83,0xa6]
+# CHECK-LE: mtspr 531, 12                    # encoding: [0xa6,0x83,0x93,0x7d]
+mtibatl 1, %r12
+# CHECK-BE: mtspr 532, 12                    # encoding: [0x7d,0x94,0x83,0xa6]
+# CHECK-LE: mtspr 532, 12                    # encoding: [0xa6,0x83,0x94,0x7d]
+mtibatu 2, %r12
+# CHECK-BE: mtspr 533, 12                    # encoding: [0x7d,0x95,0x83,0xa6]
+# CHECK-LE: mtspr 533, 12                    # encoding: [0xa6,0x83,0x95,0x7d]
+mtibatl 2, %r12
+# CHECK-BE: mtspr 534, 12                    # encoding: [0x7d,0x96,0x83,0xa6]
+# CHECK-LE: mtspr 534, 12                    # encoding: [0xa6,0x83,0x96,0x7d]
+mtibatu 3, %r12
+# CHECK-BE: mtspr 535, 12                    # encoding: [0x7d,0x97,0x83,0xa6]
+# CHECK-LE: mtspr 535, 12                    # encoding: [0xa6,0x83,0x97,0x7d]
+mtibatl 3, %r12
+
+# CHECK-BE: mfspr 12, 536                    # encoding: [0x7d,0x98,0x82,0xa6]
+# CHECK-LE: mfspr 12, 536                    # encoding: [0xa6,0x82,0x98,0x7d]
+mfdbatu %r12, 0
+# CHECK-BE: mfspr 12, 537                    # encoding: [0x7d,0x99,0x82,0xa6]
+# CHECK-LE: mfspr 12, 537                    # encoding: [0xa6,0x82,0x99,0x7d]
+mfdbatl %r12, 0
+# CHECK-BE: mfspr 12, 538                    # encoding: [0x7d,0x9a,0x82,0xa6]
+# CHECK-LE: mfspr 12, 538                    # encoding: [0xa6,0x82,0x9a,0x7d]
+mfdbatu %r12, 1
+# CHECK-BE: mfspr 12, 539                    # encoding: [0x7d,0x9b,0x82,0xa6]
+# CHECK-LE: mfspr 12, 539                    # encoding: [0xa6,0x82,0x9b,0x7d]
+mfdbatl %r12, 1
+# CHECK-BE: mfspr 12, 540                    # encoding: [0x7d,0x9c,0x82,0xa6]
+# CHECK-LE: mfspr 12, 540                    # encoding: [0xa6,0x82,0x9c,0x7d]
+mfdbatu %r12, 2
+# CHECK-BE: mfspr 12, 541                    # encoding: [0x7d,0x9d,0x82,0xa6]
+# CHECK-LE: mfspr 12, 541                    # encoding: [0xa6,0x82,0x9d,0x7d]
+mfdbatl %r12, 2
+# CHECK-BE: mfspr 12, 542                    # encoding: [0x7d,0x9e,0x82,0xa6]
+# CHECK-LE: mfspr 12, 542                    # encoding: [0xa6,0x82,0x9e,0x7d]
+mfdbatu %r12, 3
+# CHECK-BE: mfspr 12, 543                    # encoding: [0x7d,0x9f,0x82,0xa6]
+# CHECK-LE: mfspr 12, 543                    # encoding: [0xa6,0x82,0x9f,0x7d]
+mfdbatl %r12, 3
+# CHECK-BE: mtspr 536, 12                    # encoding: [0x7d,0x98,0x83,0xa6]
+# CHECK-LE: mtspr 536, 12                    # encoding: [0xa6,0x83,0x98,0x7d]
+mtdbatu 0, %r12
+# CHECK-BE: mtspr 537, 12                    # encoding: [0x7d,0x99,0x83,0xa6]
+# CHECK-LE: mtspr 537, 12                    # encoding: [0xa6,0x83,0x99,0x7d]
+mtdbatl 0, %r12
+# CHECK-BE: mtspr 538, 12                    # encoding: [0x7d,0x9a,0x83,0xa6]
+# CHECK-LE: mtspr 538, 12                    # encoding: [0xa6,0x83,0x9a,0x7d]
+mtdbatu 1, %r12
+# CHECK-BE: mtspr 539, 12                    # encoding: [0x7d,0x9b,0x83,0xa6]
+# CHECK-LE: mtspr 539, 12                    # encoding: [0xa6,0x83,0x9b,0x7d]
+mtdbatl 1, %r12
+# CHECK-BE: mtspr 540, 12                    # encoding: [0x7d,0x9c,0x83,0xa6]
+# CHECK-LE: mtspr 540, 12                    # encoding: [0xa6,0x83,0x9c,0x7d]
+mtdbatu 2, %r12
+# CHECK-BE: mtspr 541, 12                    # encoding: [0x7d,0x9d,0x83,0xa6]
+# CHECK-LE: mtspr 541, 12                    # encoding: [0xa6,0x83,0x9d,0x7d]
+mtdbatl 2, %r12
+# CHECK-BE: mtspr 542, 12                    # encoding: [0x7d,0x9e,0x83,0xa6]
+# CHECK-LE: mtspr 542, 12                    # encoding: [0xa6,0x83,0x9e,0x7d]
+mtdbatu 3, %r12
+# CHECK-BE: mtspr 543, 12                    # encoding: [0x7d,0x9f,0x83,0xa6]
+# CHECK-LE: mtspr 543, 12                    # encoding: [0xa6,0x83,0x9f,0x7d]
+mtdbatl 3, %r12
+
+# CHECK-BE: tlbld 4                        # encoding: [0x7c,0x00,0x27,0xa4]
+# CHECK-LE: tlbld 4                        # encoding: [0xa4,0x27,0x00,0x7c]
+tlbld %r4
+# CHECK-BE: tlbli 4                        # encoding: [0x7c,0x00,0x27,0xe4]
+# CHECK-LE: tlbli 4                        # encoding: [0xe4,0x27,0x00,0x7c]
+tlbli %r4

diff --git a/test/MC/PowerPC/ppc64-encoding-bookII.s b/test/MC/PowerPC/ppc64-encoding-bookII.s
index 99796ca..20eba70 100644
--- a/test/MC/PowerPC/ppc64-encoding-bookII.s
+++ b/test/MC/PowerPC/ppc64-encoding-bookII.s

@@ -8,6 +8,10 @@
 # CHECK-LE: icbi 2, 3                       # encoding: [0xac,0x1f,0x02,0x7c]
             icbi 2, 3
 
+# CHECK-BE: icbt 0, 5, 31                   # encoding: [0x7c,0x05,0xf8,0x2c]
+# CHECK-LE: icbt 0, 5, 31                   # encoding: [0x2c,0xf8,0x05,0x7c]
+            icbt 0, 5, 31
+
 # FIXME:    dcbt 2, 3, 10
 # CHECK-BE: dcbt 2, 3                       # encoding: [0x7c,0x02,0x1a,0x2c]
 # CHECK-LE: dcbt 2, 3                       # encoding: [0x2c,0x1a,0x02,0x7c]
@@ -53,6 +57,13 @@
 # CHECK-BE: wait 2                          # encoding: [0x7c,0x40,0x00,0x7c]
 # CHECK-LE: wait 2                          # encoding: [0x7c,0x00,0x40,0x7c]
             wait 2
+# CHECK-BE: mbar 1                          # encoding: [0x7c,0x20,0x06,0xac]
+# CHECK-LE: mbar 1                          # encoding: [0xac,0x06,0x20,0x7c]
+            mbar 1
+# With no operand, mbar is expected to assemble and print as "mbar 0".
+# CHECK-BE: mbar 0                          # encoding: [0x7c,0x00,0x06,0xac]
+# CHECK-LE: mbar 0                          # encoding: [0xac,0x06,0x00,0x7c]
+            mbar
 
 # Extended mnemonics
 
@@ -101,7 +112,16 @@
 # CHECK-BE: mftb 2, 268                     # encoding: [0x7c,0x4c,0x42,0xe6]
 # CHECK-LE: mftb 2, 268                     # encoding: [0xe6,0x42,0x4c,0x7c]
             mftb 2
+# CHECK-BE: mftb 2, 268                     # encoding: [0x7c,0x4c,0x42,0xe6]
+# CHECK-LE: mftb 2, 268                     # encoding: [0xe6,0x42,0x4c,0x7c]
+            mftbl 2
 # CHECK-BE: mftb 2, 269                     # encoding: [0x7c,0x4d,0x42,0xe6]
 # CHECK-LE: mftb 2, 269                     # encoding: [0xe6,0x42,0x4d,0x7c]
             mftbu 2
 
+# CHECK-BE: mtspr 284, 3                    # encoding: [0x7c,0x7c,0x43,0xa6]
+# CHECK-LE: mtspr 284, 3                    # encoding: [0xa6,0x43,0x7c,0x7c]
+            mttbl 3
+# CHECK-BE: mtspr 285, 3                    # encoding: [0x7c,0x7d,0x43,0xa6]
+# CHECK-LE: mtspr 285, 3                    # encoding: [0xa6,0x43,0x7d,0x7c]
+            mttbu 3

diff --git a/test/MC/PowerPC/ppc64-encoding-bookIII.s b/test/MC/PowerPC/ppc64-encoding-bookIII.s
index dfce395..9e784db 100644
--- a/test/MC/PowerPC/ppc64-encoding-bookIII.s
+++ b/test/MC/PowerPC/ppc64-encoding-bookIII.s

@@ -21,53 +21,66 @@
 # CHECK-LE: mtmsrd 4, 1                     # encoding: [0x64,0x01,0x81,0x7c]
             mtmsrd %r4, 1
 
-# CHECK-BE: mfspr 4, 272                    # encoding: [0x7c,0x90,0x42,0xa6]
-# CHECK-LE: mfspr 4, 272                    # encoding: [0xa6,0x42,0x90,0x7c]
-            mfsprg %r4, 0
+# CHECK-BE: mfspr 4, 260                    # encoding: [0x7c,0x84,0x42,0xa6]
+# CHECK-LE: mfspr 4, 260                    # encoding: [0xa6,0x42,0x84,0x7c]
+            mfsprg %r4, 4
 
-# CHECK-BE: mfspr 4, 273                    # encoding: [0x7c,0x91,0x42,0xa6]
-# CHECK-LE: mfspr 4, 273                    # encoding: [0xa6,0x42,0x91,0x7c]
-            mfsprg %r4, 1
+# CHECK-BE: mfspr 4, 261                    # encoding: [0x7c,0x85,0x42,0xa6]
+# CHECK-LE: mfspr 4, 261                    # encoding: [0xa6,0x42,0x85,0x7c]
+            mfsprg %r4, 5
 
-# CHECK-BE: mfspr 4, 274                    # encoding: [0x7c,0x92,0x42,0xa6]
-# CHECK-LE: mfspr 4, 274                    # encoding: [0xa6,0x42,0x92,0x7c]
-            mfsprg %r4, 2
+# CHECK-BE: mfspr 4, 262                    # encoding: [0x7c,0x86,0x42,0xa6]
+# CHECK-LE: mfspr 4, 262                    # encoding: [0xa6,0x42,0x86,0x7c]
+            mfsprg %r4, 6
 
-# CHECK-BE: mfspr 4, 275                    # encoding: [0x7c,0x93,0x42,0xa6]
-# CHECK-LE: mfspr 4, 275                    # encoding: [0xa6,0x42,0x93,0x7c]
-            mfsprg %r4, 3
+# CHECK-BE: mfspr 4, 263                    # encoding: [0x7c,0x87,0x42,0xa6]
+# CHECK-LE: mfspr 4, 263                    # encoding: [0xa6,0x42,0x87,0x7c]
+            mfsprg %r4, 7
 
-# CHECK-BE: mtspr 272, 4                    # encoding: [0x7c,0x90,0x43,0xa6]
-# CHECK-LE: mtspr 272, 4                    # encoding: [0xa6,0x43,0x90,0x7c]
-            mtsprg 0, %r4
+# CHECK-BE: mfspr 2, 260                    # encoding: [0x7c,0x44,0x42,0xa6]
+# CHECK-LE: mfspr 2, 260                    # encoding: [0xa6,0x42,0x44,0x7c]
+            mfsprg4 %r2
+# CHECK-BE: mfspr 2, 261                    # encoding: [0x7c,0x45,0x42,0xa6]
+# CHECK-LE: mfspr 2, 261                    # encoding: [0xa6,0x42,0x45,0x7c]
+            mfsprg5 %r2
+# CHECK-BE: mfspr 2, 262                    # encoding: [0x7c,0x46,0x42,0xa6]
+# CHECK-LE: mfspr 2, 262                    # encoding: [0xa6,0x42,0x46,0x7c]
+            mfsprg6 %r2
+# CHECK-BE: mfspr 2, 263                    # encoding: [0x7c,0x47,0x42,0xa6]
+# CHECK-LE: mfspr 2, 263                    # encoding: [0xa6,0x42,0x47,0x7c]
+            mfsprg7 %r2
 
-# CHECK-BE: mtspr 273, 4                    # encoding: [0x7c,0x91,0x43,0xa6]
-# CHECK-LE: mtspr 273, 4                    # encoding: [0xa6,0x43,0x91,0x7c]
-            mtsprg 1, %r4
+# NOT-CHECK-BE: mtspr 260, 4                    # encoding: [0x7c,0x84,0x43,0xa6]
+# NOT-CHECK-LE: mtspr 260, 4                    # encoding: [0xa6,0x43,0x84,0x7c]
+            mtsprg 4, %r4
 
-# CHECK-BE: mtspr 274, 4                    # encoding: [0x7c,0x92,0x43,0xa6]
-# CHECK-LE: mtspr 274, 4                    # encoding: [0xa6,0x43,0x92,0x7c]
-            mtsprg 2, %r4
+# NOT-CHECK-BE: mtspr 261, 4                    # encoding: [0x7c,0x85,0x43,0xa6]
+# NOT-CHECK-LE: mtspr 261, 4                    # encoding: [0xa6,0x43,0x85,0x7c]
+            mtsprg 5, %r4
 
-# CHECK-BE: mtspr 275, 4                    # encoding: [0x7c,0x93,0x43,0xa6]
-# CHECK-LE: mtspr 275, 4                    # encoding: [0xa6,0x43,0x93,0x7c]
-            mtsprg 3, %r4
+# NOT-CHECK-BE: mtspr 262, 4                    # encoding: [0x7c,0x86,0x43,0xa6]
+# NOT-CHECK-LE: mtspr 262, 4                    # encoding: [0xa6,0x43,0x86,0x7c]
+            mtsprg 6, %r4
 
-# CHECK-BE: mtspr 272, 4                    # encoding: [0x7c,0x90,0x43,0xa6]
-# CHECK-LE: mtspr 272, 4                    # encoding: [0xa6,0x43,0x90,0x7c]
-            mtsprg0 %r4
+# NOT-CHECK-BE: mtspr 263, 4                    # encoding: [0x7c,0x87,0x43,0xa6]
+# NOT-CHECK-LE: mtspr 263, 4                    # encoding: [0xa6,0x43,0x87,0x7c]
+            mtsprg 7, %r4
 
-# CHECK-BE: mtspr 273, 4                    # encoding: [0x7c,0x91,0x43,0xa6]
-# CHECK-LE: mtspr 273, 4                    # encoding: [0xa6,0x43,0x91,0x7c]
-            mtsprg1 %r4
+# CHECK-BE: mtspr 260, 4                    # encoding: [0x7c,0x84,0x43,0xa6]
+# CHECK-LE: mtspr 260, 4                    # encoding: [0xa6,0x43,0x84,0x7c]
+            mtsprg4 %r4
 
-# CHECK-BE: mtspr 274, 4                    # encoding: [0x7c,0x92,0x43,0xa6]
-# CHECK-LE: mtspr 274, 4                    # encoding: [0xa6,0x43,0x92,0x7c]
-            mtsprg2 %r4
+# CHECK-BE: mtspr 261, 4                    # encoding: [0x7c,0x85,0x43,0xa6]
+# CHECK-LE: mtspr 261, 4                    # encoding: [0xa6,0x43,0x85,0x7c]
+            mtsprg5 %r4
 
-# CHECK-BE: mtspr 275, 4                    # encoding: [0x7c,0x93,0x43,0xa6]
-# CHECK-LE: mtspr 275, 4                    # encoding: [0xa6,0x43,0x93,0x7c]
-            mtsprg3 %r4
+# CHECK-BE: mtspr 262, 4                    # encoding: [0x7c,0x86,0x43,0xa6]
+# CHECK-LE: mtspr 262, 4                    # encoding: [0xa6,0x43,0x86,0x7c]
+            mtsprg6 %r4
+
+# CHECK-BE: mtspr 263, 4                    # encoding: [0x7c,0x87,0x43,0xa6]
+# CHECK-LE: mtspr 263, 4                    # encoding: [0xa6,0x43,0x87,0x7c]
+            mtsprg7 %r4
 
 # CHECK-BE: mtspr 280, 4                    # encoding: [0x7c,0x98,0x43,0xa6]
 # CHECK-LE: mtspr 280, 4                    # encoding: [0xa6,0x43,0x98,0x7c]
@@ -141,3 +154,34 @@
 # CHECK-LE: tlbie 4,0                       # encoding: [0x64,0x22,0x00,0x7c]
             tlbie %r4
 
+# CHECK-BE: rfi                             # encoding: [0x4c,0x00,0x00,0x64]
+# CHECK-LE: rfi                             # encoding: [0x64,0x00,0x00,0x4c]
+            rfi
+# CHECK-BE: rfci                            # encoding: [0x4c,0x00,0x00,0x66]
+# CHECK-LE: rfci                            # encoding: [0x66,0x00,0x00,0x4c]
+            rfci
+
+# CHECK-BE: wrtee 12                        # encoding: [0x7d,0x80,0x01,0x06]
+# CHECK-LE: wrtee 12                        # encoding: [0x06,0x01,0x80,0x7d]
+            wrtee %r12
+
+# CHECK-BE: wrteei 0                        # encoding: [0x7c,0x00,0x01,0x46]
+# CHECK-LE: wrteei 0                        # encoding: [0x46,0x01,0x00,0x7c]
+            wrteei 0
+
+# CHECK-BE: wrteei 1                        # encoding: [0x7c,0x00,0x81,0x46]
+# CHECK-LE: wrteei 1                        # encoding: [0x46,0x81,0x00,0x7c]
+            wrteei 1
+
+# CHECK-BE: tlbre                           # encoding: [0x7c,0x00,0x07,0x64]
+# CHECK-LE: tlbre                           # encoding: [0x64,0x07,0x00,0x7c]
+            tlbre
+# CHECK-BE: tlbwe                           # encoding: [0x7c,0x00,0x07,0xa4]
+# CHECK-LE: tlbwe                           # encoding: [0xa4,0x07,0x00,0x7c]
+            tlbwe
+# CHECK-BE: tlbivax 11, 12                  # encoding: [0x7c,0x0b,0x66,0x24]
+# CHECK-LE: tlbivax 11, 12                  # encoding: [0x24,0x66,0x0b,0x7c]
+            tlbivax %r11, %r12
+# CHECK-BE: tlbsx 11, 12                    # encoding: [0x7c,0x0b,0x67,0x24]
+# CHECK-LE: tlbsx 11, 12                    # encoding: [0x24,0x67,0x0b,0x7c]
+            tlbsx %r11, %r12

diff --git a/test/MC/PowerPC/ppc64-encoding-e500.s b/test/MC/PowerPC/ppc64-encoding-e500.s
new file mode 100644
index 0000000..fee91ee
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-encoding-e500.s

@@ -0,0 +1,11 @@
+# RUN: llvm-mc -triple powerpc64-unknown-unknown --show-encoding %s | FileCheck -check-prefix=CHECK-BE %s
+# RUN: llvm-mc -triple powerpc64le-unknown-unknown --show-encoding %s | FileCheck -check-prefix=CHECK-LE %s
+
+# Instructions specific to the e500 / e500mc cores:
+
+# CHECK-BE: rfdi                            # encoding: [0x4c,0x00,0x00,0x4e]
+# CHECK-LE: rfdi                            # encoding: [0x4e,0x00,0x00,0x4c]
+            rfdi
+# CHECK-BE: rfmci                            # encoding: [0x4c,0x00,0x00,0x4c]
+# CHECK-LE: rfmci                            # encoding: [0x4c,0x00,0x00,0x4c]
+            rfmci

diff --git a/test/MC/PowerPC/ppc64-encoding-ext.s b/test/MC/PowerPC/ppc64-encoding-ext.s
index 2374675..0ffe0bf 100644
--- a/test/MC/PowerPC/ppc64-encoding-ext.s
+++ b/test/MC/PowerPC/ppc64-encoding-ext.s

@@ -3419,6 +3419,72 @@
 # CHECK-BE: mfspr 2, 1                      # encoding: [0x7c,0x41,0x02,0xa6]
 # CHECK-LE: mfspr 2, 1                      # encoding: [0xa6,0x02,0x41,0x7c]
             mfxer 2
+# CHECK-BE: mfspr 2, 4                      # encoding: [0x7c,0x44,0x02,0xa6]
+# CHECK-LE: mfspr 2, 4                      # encoding: [0xa6,0x02,0x44,0x7c]
+            mfrtcu 2
+# CHECK-BE: mfspr 2, 5                      # encoding: [0x7c,0x45,0x02,0xa6]
+# CHECK-LE: mfspr 2, 5                      # encoding: [0xa6,0x02,0x45,0x7c]
+            mfrtcl 2
+# CHECK-BE: mtspr 17, 2                     # encoding: [0x7c,0x51,0x03,0xa6]
+# CHECK-LE: mtspr 17, 2                     # encoding: [0xa6,0x03,0x51,0x7c]
+            mtdscr 2
+# CHECK-BE: mfspr 2, 17                     # encoding: [0x7c,0x51,0x02,0xa6]
+# CHECK-LE: mfspr 2, 17                     # encoding: [0xa6,0x02,0x51,0x7c]
+            mfdscr 2
+# CHECK-BE: mtspr 18, 2                     # encoding: [0x7c,0x52,0x03,0xa6]
+# CHECK-LE: mtspr 18, 2                     # encoding: [0xa6,0x03,0x52,0x7c]
+            mtdsisr 2
+# CHECK-BE: mfspr 2, 18                     # encoding: [0x7c,0x52,0x02,0xa6]
+# CHECK-LE: mfspr 2, 18                     # encoding: [0xa6,0x02,0x52,0x7c]
+            mfdsisr 2
+# CHECK-BE: mtspr 19, 2                     # encoding: [0x7c,0x53,0x03,0xa6]
+# CHECK-LE: mtspr 19, 2                     # encoding: [0xa6,0x03,0x53,0x7c]
+            mtdar 2
+# CHECK-BE: mfspr 2, 19                     # encoding: [0x7c,0x53,0x02,0xa6]
+# CHECK-LE: mfspr 2, 19                     # encoding: [0xa6,0x02,0x53,0x7c]
+            mfdar 2
+# CHECK-BE: mtspr 22, 2                     # encoding: [0x7c,0x56,0x03,0xa6]
+# CHECK-LE: mtspr 22, 2                     # encoding: [0xa6,0x03,0x56,0x7c]
+            mtdec 2
+# CHECK-BE: mfspr 2, 22                     # encoding: [0x7c,0x56,0x02,0xa6]
+# CHECK-LE: mfspr 2, 22                     # encoding: [0xa6,0x02,0x56,0x7c]
+            mfdec 2
+# CHECK-BE: mtspr 25, 2                     # encoding: [0x7c,0x59,0x03,0xa6]
+# CHECK-LE: mtspr 25, 2                     # encoding: [0xa6,0x03,0x59,0x7c]
+            mtsdr1 2
+# CHECK-BE: mfspr 2, 25                     # encoding: [0x7c,0x59,0x02,0xa6]
+# CHECK-LE: mfspr 2, 25                     # encoding: [0xa6,0x02,0x59,0x7c]
+            mfsdr1 2
+# CHECK-BE: mtspr 26, 2                     # encoding: [0x7c,0x5a,0x03,0xa6]
+# CHECK-LE: mtspr 26, 2                     # encoding: [0xa6,0x03,0x5a,0x7c]
+            mtsrr0 2
+# CHECK-BE: mfspr 2, 26                     # encoding: [0x7c,0x5a,0x02,0xa6]
+# CHECK-LE: mfspr 2, 26                     # encoding: [0xa6,0x02,0x5a,0x7c]
+            mfsrr0 2
+# CHECK-BE: mtspr 27, 2                     # encoding: [0x7c,0x5b,0x03,0xa6]
+# CHECK-LE: mtspr 27, 2                     # encoding: [0xa6,0x03,0x5b,0x7c]
+            mtsrr1 2
+# CHECK-BE: mfspr 2, 27                     # encoding: [0x7c,0x5b,0x02,0xa6]
+# CHECK-LE: mfspr 2, 27                     # encoding: [0xa6,0x02,0x5b,0x7c]
+            mfsrr1 2
+# CHECK-BE: mtspr 28, 2                     # encoding: [0x7c,0x5c,0x03,0xa6]
+# CHECK-LE: mtspr 28, 2                     # encoding: [0xa6,0x03,0x5c,0x7c]
+            mtcfar 2
+# CHECK-BE: mfspr 2, 28                     # encoding: [0x7c,0x5c,0x02,0xa6]
+# CHECK-LE: mfspr 2, 28                     # encoding: [0xa6,0x02,0x5c,0x7c]
+            mfcfar 2
+# CHECK-BE: mtspr 29, 2                     # encoding: [0x7c,0x5d,0x03,0xa6]
+# CHECK-LE: mtspr 29, 2                     # encoding: [0xa6,0x03,0x5d,0x7c]
+            mtamr 2
+# CHECK-BE: mfspr 2, 29                     # encoding: [0x7c,0x5d,0x02,0xa6]
+# CHECK-LE: mfspr 2, 29                     # encoding: [0xa6,0x02,0x5d,0x7c]
+            mfamr 2
+# CHECK-BE: mtspr 48, 2                     # encoding: [0x7c,0x50,0x0b,0xa6]
+# CHECK-LE: mtspr 48, 2                     # encoding: [0xa6,0x0b,0x50,0x7c]
+            mtpid 2
+# CHECK-BE: mfspr 2, 48                     # encoding: [0x7c,0x50,0x0a,0xa6]
+# CHECK-LE: mfspr 2, 48                     # encoding: [0xa6,0x0a,0x50,0x7c]
+            mfpid 2
 # CHECK-BE: mtlr 2                          # encoding: [0x7c,0x48,0x03,0xa6]
 # CHECK-LE: mtlr 2                          # encoding: [0xa6,0x03,0x48,0x7c]
             mtlr 2
@@ -3465,3 +3531,105 @@
 # CHECK-LE: mtcrf 255, 2                    # encoding: [0x20,0xf1,0x4f,0x7c]
             mtcr 2
 
+# CHECK-BE: mfspr 4, 272                    # encoding: [0x7c,0x90,0x42,0xa6]
+# CHECK-LE: mfspr 4, 272                    # encoding: [0xa6,0x42,0x90,0x7c]
+            mfsprg %r4, 0
+
+# CHECK-BE: mfspr 4, 273                    # encoding: [0x7c,0x91,0x42,0xa6]
+# CHECK-LE: mfspr 4, 273                    # encoding: [0xa6,0x42,0x91,0x7c]
+            mfsprg %r4, 1
+
+# CHECK-BE: mfspr 4, 274                    # encoding: [0x7c,0x92,0x42,0xa6]
+# CHECK-LE: mfspr 4, 274                    # encoding: [0xa6,0x42,0x92,0x7c]
+            mfsprg %r4, 2
+
+# CHECK-BE: mfspr 4, 275                    # encoding: [0x7c,0x93,0x42,0xa6]
+# CHECK-LE: mfspr 4, 275                    # encoding: [0xa6,0x42,0x93,0x7c]
+            mfsprg %r4, 3
+
+# CHECK-BE: mfspr 2, 272                    # encoding: [0x7c,0x50,0x42,0xa6]
+# CHECK-LE: mfspr 2, 272                    # encoding: [0xa6,0x42,0x50,0x7c]
+            mfsprg0 %r2
+# CHECK-BE: mfspr 2, 273                    # encoding: [0x7c,0x51,0x42,0xa6]
+# CHECK-LE: mfspr 2, 273                    # encoding: [0xa6,0x42,0x51,0x7c]
+            mfsprg1 %r2
+# CHECK-BE: mfspr 2, 274                    # encoding: [0x7c,0x52,0x42,0xa6]
+# CHECK-LE: mfspr 2, 274                    # encoding: [0xa6,0x42,0x52,0x7c]
+            mfsprg2 %r2
+# CHECK-BE: mfspr 2, 275                    # encoding: [0x7c,0x53,0x42,0xa6]
+# CHECK-LE: mfspr 2, 275                    # encoding: [0xa6,0x42,0x53,0x7c]
+            mfsprg3 %r2
+
+# CHECK-BE: mtspr 272, 4                    # encoding: [0x7c,0x90,0x43,0xa6]
+# CHECK-LE: mtspr 272, 4                    # encoding: [0xa6,0x43,0x90,0x7c]
+            mtsprg 0, %r4
+
+# CHECK-BE: mtspr 273, 4                    # encoding: [0x7c,0x91,0x43,0xa6]
+# CHECK-LE: mtspr 273, 4                    # encoding: [0xa6,0x43,0x91,0x7c]
+            mtsprg 1, %r4
+
+# CHECK-BE: mtspr 274, 4                    # encoding: [0x7c,0x92,0x43,0xa6]
+# CHECK-LE: mtspr 274, 4                    # encoding: [0xa6,0x43,0x92,0x7c]
+            mtsprg 2, %r4
+
+# CHECK-BE: mtspr 275, 4                    # encoding: [0x7c,0x93,0x43,0xa6]
+# CHECK-LE: mtspr 275, 4                    # encoding: [0xa6,0x43,0x93,0x7c]
+            mtsprg 3, %r4
+
+# CHECK-BE: mtspr 272, 4                    # encoding: [0x7c,0x90,0x43,0xa6]
+# CHECK-LE: mtspr 272, 4                    # encoding: [0xa6,0x43,0x90,0x7c]
+            mtsprg0 %r4
+
+# CHECK-BE: mtspr 273, 4                    # encoding: [0x7c,0x91,0x43,0xa6]
+# CHECK-LE: mtspr 273, 4                    # encoding: [0xa6,0x43,0x91,0x7c]
+            mtsprg1 %r4
+
+# CHECK-BE: mtspr 274, 4                    # encoding: [0x7c,0x92,0x43,0xa6]
+# CHECK-LE: mtspr 274, 4                    # encoding: [0xa6,0x43,0x92,0x7c]
+            mtsprg2 %r4
+
+# CHECK-BE: mtspr 275, 4                    # encoding: [0x7c,0x93,0x43,0xa6]
+# CHECK-LE: mtspr 275, 4                    # encoding: [0xa6,0x43,0x93,0x7c]
+            mtsprg3 %r4
+
+# Altivec Data Stream instruction:
+# CHECK-BE: dss 3                            # encoding: [0x7c,0x60,0x06,0x6c]
+# CHECK-LE: dss 3                            # encoding: [0x6c,0x06,0x60,0x7c]
+            dss 3
+# CHECK-BE: dssall                           # encoding: [0x7e,0x00,0x06,0x6c]
+# CHECK-LE: dssall                           # encoding: [0x6c,0x06,0x00,0x7e]
+            dssall
+# CHECK-BE: dst 12, 11, 3                    # encoding: [0x7c,0x6c,0x5a,0xac]
+# CHECK-LE: dst 12, 11, 3                    # encoding: [0xac,0x5a,0x6c,0x7c]
+            dst %r12, %r11, 3
+# CHECK-BE: dstt 12, 11, 3                   # encoding: [0x7e,0x6c,0x5a,0xac]
+# CHECK-LE: dstt 12, 11, 3                   # encoding: [0xac,0x5a,0x6c,0x7e]
+            dstt %r12, %r11, 3
+# CHECK-BE: dstst 12, 11, 3                  # encoding: [0x7c,0x6c,0x5a,0xec]
+# CHECK-LE: dstst 12, 11, 3                  # encoding: [0xec,0x5a,0x6c,0x7c]
+            dstst %r12, %r11, 3
+# CHECK-BE: dststt 12, 11, 3                 # encoding: [0x7e,0x6c,0x5a,0xec]
+# CHECK-LE: dststt 12, 11, 3                 # encoding: [0xec,0x5a,0x6c,0x7e]
+            dststt %r12, %r11, 3
+
+# CHECK-BE: tlbia                            # encoding: [0x7c,0x00,0x02,0xe4]
+# CHECK-LE: tlbia                            # encoding: [0xe4,0x02,0x00,0x7c]
+            tlbia
+
+# CHECK-BE: lswi 8, 6, 7                     # encoding: [0x7d,0x06,0x3c,0xaa]
+# CHECK-LE: lswi 8, 6, 7                     # encoding: [0xaa,0x3c,0x06,0x7d]
+            lswi %r8, %r6, 7
+# CHECK-BE: stswi 8, 6, 7                    # encoding: [0x7d,0x06,0x3d,0xaa]
+# CHECK-LE: stswi 8, 6, 7                    # encoding: [0xaa,0x3d,0x06,0x7d]
+            stswi %r8, %r6, 7
+
+# CHECK-BE: rfid                            # encoding: [0x4c,0x00,0x00,0x24]
+# CHECK-LE: rfid                            # encoding: [0x24,0x00,0x00,0x4c]
+            rfid
+
+# CHECK-BE: mfspr 2, 280                     # encoding: [0x7c,0x58,0x42,0xa6]
+# CHECK-LE: mfspr 2, 280                     # encoding: [0xa6,0x42,0x58,0x7c]
+            mfasr 2
+# CHECK-BE: mtspr 280, 2                     # encoding: [0x7c,0x58,0x43,0xa6]
+# CHECK-LE: mtspr 280, 2                     # encoding: [0xa6,0x43,0x58,0x7c]
+            mtasr 2

diff --git a/test/MC/PowerPC/ppc64-encoding-spe.s b/test/MC/PowerPC/ppc64-encoding-spe.s
new file mode 100644
index 0000000..d90eb30
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-encoding-spe.s

@@ -0,0 +1,622 @@
+# RUN: llvm-mc -triple powerpc64-unknown-unknown --show-encoding %s | FileCheck -check-prefix=CHECK-BE %s
+# RUN: llvm-mc -triple powerpc64le-unknown-unknown --show-encoding %s | FileCheck -check-prefix=CHECK-LE %s
+
+# Instructions from the Signal Processing Engine extension:
+
+# CHECK-BE: evlddx 14, 21, 28               # encoding: [0x11,0xd5,0xe3,0x00]
+# CHECK-LE: evlddx 14, 21, 28               # encoding: [0x00,0xe3,0xd5,0x11]
+            evlddx %r14, %r21, %r28
+# CHECK-BE: evldwx 14, 21, 28               # encoding: [0x11,0xd5,0xe3,0x02]
+# CHECK-LE: evldwx 14, 21, 28               # encoding: [0x02,0xe3,0xd5,0x11]
+            evldwx %r14, %r21, %r28
+# CHECK-BE: evldhx 14, 21, 28               # encoding: [0x11,0xd5,0xe3,0x04]
+# CHECK-LE: evldhx 14, 21, 28               # encoding: [0x04,0xe3,0xd5,0x11]
+            evldhx %r14, %r21, %r28
+# CHECK-BE: evlhhesplatx 14, 21, 28         # encoding: [0x11,0xd5,0xe3,0x08]
+# CHECK-LE: evlhhesplatx 14, 21, 28         # encoding: [0x08,0xe3,0xd5,0x11]
+            evlhhesplatx %r14, %r21, %r28
+# CHECK-BE: evlhhousplatx 14, 21, 28        # encoding: [0x11,0xd5,0xe3,0x0c]
+# CHECK-LE: evlhhousplatx 14, 21, 28        # encoding: [0x0c,0xe3,0xd5,0x11]
+            evlhhousplatx %r14, %r21, %r28
+# CHECK-BE: evlhhossplatx 14, 21, 28        # encoding: [0x11,0xd5,0xe3,0x0e]
+# CHECK-LE: evlhhossplatx 14, 21, 28        # encoding: [0x0e,0xe3,0xd5,0x11]
+            evlhhossplatx %r14, %r21, %r28
+# CHECK-BE: evlwhex 14, 21, 28              # encoding: [0x11,0xd5,0xe3,0x10]
+# CHECK-LE: evlwhex 14, 21, 28              # encoding: [0x10,0xe3,0xd5,0x11]
+            evlwhex %r14, %r21, %r28
+# CHECK-BE: evlwhoux 14, 21, 28             # encoding: [0x11,0xd5,0xe3,0x14]
+# CHECK-LE: evlwhoux 14, 21, 28             # encoding: [0x14,0xe3,0xd5,0x11]
+            evlwhoux %r14, %r21, %r28
+# CHECK-BE: evlwhosx 14, 21, 28             # encoding: [0x11,0xd5,0xe3,0x16]
+# CHECK-LE: evlwhosx 14, 21, 28             # encoding: [0x16,0xe3,0xd5,0x11]
+            evlwhosx %r14, %r21, %r28
+# CHECK-BE: evlwwsplatx 14, 21, 28          # encoding: [0x11,0xd5,0xe3,0x18]
+# CHECK-LE: evlwwsplatx 14, 21, 28          # encoding: [0x18,0xe3,0xd5,0x11]
+            evlwwsplatx %r14, %r21, %r28
+# CHECK-BE: evlwhsplatx 14, 21, 28          # encoding: [0x11,0xd5,0xe3,0x1c]
+# CHECK-LE: evlwhsplatx 14, 21, 28          # encoding: [0x1c,0xe3,0xd5,0x11]
+            evlwhsplatx %r14, %r21, %r28
+# CHECK-BE: evmergehi 14, 21, 28            # encoding: [0x11,0xd5,0xe2,0x2c]
+# CHECK-LE: evmergehi 14, 21, 28            # encoding: [0x2c,0xe2,0xd5,0x11]
+            evmergehi %r14, %r21, %r28
+# CHECK-BE: evmergelo 14, 21, 28            # encoding: [0x11,0xd5,0xe2,0x2d]
+# CHECK-LE: evmergelo 14, 21, 28            # encoding: [0x2d,0xe2,0xd5,0x11]
+            evmergelo %r14, %r21, %r28
+# CHECK-BE: evmergehilo 14, 21, 28          # encoding: [0x11,0xd5,0xe2,0x2e]
+# CHECK-LE: evmergehilo 14, 21, 28          # encoding: [0x2e,0xe2,0xd5,0x11]
+            evmergehilo %r14, %r21, %r28
+# CHECK-BE: evmergelohi 14, 21, 28          # encoding: [0x11,0xd5,0xe2,0x2f]
+# CHECK-LE: evmergelohi 14, 21, 28          # encoding: [0x2f,0xe2,0xd5,0x11]
+            evmergelohi %r14, %r21, %r28
+
+# CHECK-BE: brinc 14, 22, 19                # encoding: [0x11,0xd6,0x9a,0x0f]
+# CHECK-LE: brinc 14, 22, 19                # encoding: [0x0f,0x9a,0xd6,0x11]
+            brinc %r14, %r22, %r19
+# CHECK-BE: evabs 14, 22                    # encoding: [0x11,0xd6,0x02,0x08]
+# CHECK-LE: evabs 14, 22                    # encoding: [0x08,0x02,0xd6,0x11]
+            evabs %r14, %r22
+# CHECK-BE: evaddsmiaaw 14, 22              # encoding: [0x11,0xd6,0x04,0xc9]
+# CHECK-LE: evaddsmiaaw 14, 22              # encoding: [0xc9,0x04,0xd6,0x11]
+            evaddsmiaaw %r14, %r22
+# CHECK-BE: evaddssiaaw 14, 22              # encoding: [0x11,0xd6,0x04,0xc1]
+# CHECK-LE: evaddssiaaw 14, 22              # encoding: [0xc1,0x04,0xd6,0x11]
+            evaddssiaaw %r14, %r22
+# CHECK-BE: evaddusiaaw 14, 22              # encoding: [0x11,0xd6,0x04,0xc0]
+# CHECK-LE: evaddusiaaw 14, 22              # encoding: [0xc0,0x04,0xd6,0x11]
+            evaddusiaaw %r14, %r22
+# CHECK-BE: evaddumiaaw 14, 22              # encoding: [0x11,0xd6,0x04,0xc8]
+# CHECK-LE: evaddumiaaw 14, 22              # encoding: [0xc8,0x04,0xd6,0x11]
+            evaddumiaaw %r14, %r22
+# CHECK-BE: evaddw 14, 22, 19               # encoding: [0x11,0xd6,0x9a,0x00]
+# CHECK-LE: evaddw 14, 22, 19               # encoding: [0x00,0x9a,0xd6,0x11]
+            evaddw %r14, %r22, %r19
+# CHECK-BE: evaddiw 14, 29, 19              # encoding: [0x11,0xd3,0xea,0x02]
+# CHECK-LE: evaddiw 14, 29, 19              # encoding: [0x02,0xea,0xd3,0x11]
+            evaddiw %r14, 29, %r19
+# CHECK-BE: evand 14, 22, 19                # encoding: [0x11,0xd6,0x9a,0x11]
+# CHECK-LE: evand 14, 22, 19                # encoding: [0x11,0x9a,0xd6,0x11]
+            evand %r14, %r22, %r19
+# CHECK-BE: evandc 14, 22, 19               # encoding: [0x11,0xd6,0x9a,0x12]
+# CHECK-LE: evandc 14, 22, 19               # encoding: [0x12,0x9a,0xd6,0x11]
+            evandc %r14, %r22, %r19
+# CHECK-BE: evcmpeq 3, 22, 19            # encoding: [0x11,0x96,0x9a,0x34]
+# CHECK-LE: evcmpeq 3, 22, 19            # encoding: [0x34,0x9a,0x96,0x11]
+            evcmpeq %cr3, %r22, %r19
+# CHECK-BE: evcmpgts 3, 22, 19           # encoding: [0x11,0x96,0x9a,0x31]
+# CHECK-LE: evcmpgts 3, 22, 19           # encoding: [0x31,0x9a,0x96,0x11]
+            evcmpgts %cr3, %r22, %r19
+# CHECK-BE: evcmpgtu 3, 22, 19           # encoding: [0x11,0x96,0x9a,0x30]
+# CHECK-LE: evcmpgtu 3, 22, 19           # encoding: [0x30,0x9a,0x96,0x11]
+            evcmpgtu %cr3, %r22, %r19
+# CHECK-BE: evcmplts 3, 22, 19           # encoding: [0x11,0x96,0x9a,0x33]
+# CHECK-LE: evcmplts 3, 22, 19           # encoding: [0x33,0x9a,0x96,0x11]
+            evcmplts %cr3, %r22, %r19
+# CHECK-BE: evcmpltu 3, 22, 19           # encoding: [0x11,0x96,0x9a,0x32]
+# CHECK-LE: evcmpltu 3, 22, 19           # encoding: [0x32,0x9a,0x96,0x11]
+            evcmpltu %cr3, %r22, %r19
+# CHECK-BE: evcntlsw 14, 22                 # encoding: [0x11,0xd6,0x02,0x0e]
+# CHECK-LE: evcntlsw 14, 22                 # encoding: [0x0e,0x02,0xd6,0x11]
+            evcntlsw %r14, %r22
+# CHECK-BE: evcntlzw 14, 22                 # encoding: [0x11,0xd6,0x02,0x0d]
+# CHECK-LE: evcntlzw 14, 22                 # encoding: [0x0d,0x02,0xd6,0x11]
+            evcntlzw %r14, %r22
+# CHECK-BE: evdivws 14, 22, 19              # encoding: [0x11,0xd6,0x9c,0xc6]
+# CHECK-LE: evdivws 14, 22, 19              # encoding: [0xc6,0x9c,0xd6,0x11]
+            evdivws %r14, %r22, %r19
+# CHECK-BE: evdivwu 14, 22, 19              # encoding: [0x11,0xd6,0x9c,0xc7]
+# CHECK-LE: evdivwu 14, 22, 19              # encoding: [0xc7,0x9c,0xd6,0x11]
+            evdivwu %r14, %r22, %r19
+# CHECK-BE: eveqv 14, 22, 19                # encoding: [0x11,0xd6,0x9a,0x19]
+# CHECK-LE: eveqv 14, 22, 19                # encoding: [0x19,0x9a,0xd6,0x11]
+            eveqv %r14, %r22, %r19
+# CHECK-BE: evextsb 14, 22                  # encoding: [0x11,0xd6,0x02,0x0a]
+# CHECK-LE: evextsb 14, 22                  # encoding: [0x0a,0x02,0xd6,0x11]
+            evextsb %r14, %r22
+# CHECK-BE: evextsh 14, 22                  # encoding: [0x11,0xd6,0x02,0x0b]
+# CHECK-LE: evextsh 14, 22                  # encoding: [0x0b,0x02,0xd6,0x11]
+            evextsh %r14, %r22
+# CHECK-BE: evmhegsmfaa 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x2b]
+# CHECK-LE: evmhegsmfaa 14, 22, 19          # encoding: [0x2b,0x9d,0xd6,0x11]
+            evmhegsmfaa %r14, %r22, %r19
+# CHECK-BE: evmhegsmfan 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xab]
+# CHECK-LE: evmhegsmfan 14, 22, 19          # encoding: [0xab,0x9d,0xd6,0x11]
+            evmhegsmfan %r14, %r22, %r19
+# CHECK-BE: evmhegsmiaa 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x29]
+# CHECK-LE: evmhegsmiaa 14, 22, 19          # encoding: [0x29,0x9d,0xd6,0x11]
+            evmhegsmiaa %r14, %r22, %r19
+# CHECK-BE: evmhegsmian 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xa9]
+# CHECK-LE: evmhegsmian 14, 22, 19          # encoding: [0xa9,0x9d,0xd6,0x11]
+            evmhegsmian %r14, %r22, %r19
+# CHECK-BE: evmhegumiaa 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x28]
+# CHECK-LE: evmhegumiaa 14, 22, 19          # encoding: [0x28,0x9d,0xd6,0x11]
+            evmhegumiaa %r14, %r22, %r19
+# CHECK-BE: evmhegumian 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xa8]
+# CHECK-LE: evmhegumian 14, 22, 19          # encoding: [0xa8,0x9d,0xd6,0x11]
+            evmhegumian %r14, %r22, %r19
+# CHECK-BE: evmhesmf 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x0b]
+# CHECK-LE: evmhesmf 14, 22, 19             # encoding: [0x0b,0x9c,0xd6,0x11]
+            evmhesmf %r14, %r22, %r19
+# CHECK-BE: evmhesmfa 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x2b]
+# CHECK-LE: evmhesmfa 14, 22, 19            # encoding: [0x2b,0x9c,0xd6,0x11]
+            evmhesmfa %r14, %r22, %r19
+# CHECK-BE: evmhesmfaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x0b]
+# CHECK-LE: evmhesmfaaw 14, 22, 19          # encoding: [0x0b,0x9d,0xd6,0x11]
+            evmhesmfaaw %r14, %r22, %r19
+# CHECK-BE: evmhesmfanw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x8b]
+# CHECK-LE: evmhesmfanw 14, 22, 19          # encoding: [0x8b,0x9d,0xd6,0x11]
+            evmhesmfanw %r14, %r22, %r19
+# CHECK-BE: evmhesmi 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x09]
+# CHECK-LE: evmhesmi 14, 22, 19             # encoding: [0x09,0x9c,0xd6,0x11]
+            evmhesmi %r14, %r22, %r19
+# CHECK-BE: evmhesmia 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x29]
+# CHECK-LE: evmhesmia 14, 22, 19            # encoding: [0x29,0x9c,0xd6,0x11]
+            evmhesmia %r14, %r22, %r19
+# CHECK-BE: evmhesmiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x09]
+# CHECK-LE: evmhesmiaaw 14, 22, 19          # encoding: [0x09,0x9d,0xd6,0x11]
+            evmhesmiaaw %r14, %r22, %r19
+# CHECK-BE: evmhesmianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x89]
+# CHECK-LE: evmhesmianw 14, 22, 19          # encoding: [0x89,0x9d,0xd6,0x11]
+            evmhesmianw %r14, %r22, %r19
+# CHECK-BE: evmhessf 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x03]
+# CHECK-LE: evmhessf 14, 22, 19             # encoding: [0x03,0x9c,0xd6,0x11]
+            evmhessf %r14, %r22, %r19
+# CHECK-BE: evmhessfa 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x23]
+# CHECK-LE: evmhessfa 14, 22, 19            # encoding: [0x23,0x9c,0xd6,0x11]
+            evmhessfa %r14, %r22, %r19
+# CHECK-BE: evmhessfaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x03]
+# CHECK-LE: evmhessfaaw 14, 22, 19          # encoding: [0x03,0x9d,0xd6,0x11]
+            evmhessfaaw %r14, %r22, %r19
+# CHECK-BE: evmhessfanw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x83]
+# CHECK-LE: evmhessfanw 14, 22, 19          # encoding: [0x83,0x9d,0xd6,0x11]
+            evmhessfanw %r14, %r22, %r19
+# CHECK-BE: evmhessiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x01]
+# CHECK-LE: evmhessiaaw 14, 22, 19          # encoding: [0x01,0x9d,0xd6,0x11]
+            evmhessiaaw %r14, %r22, %r19
+# CHECK-BE: evmhessianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x81]
+# CHECK-LE: evmhessianw 14, 22, 19          # encoding: [0x81,0x9d,0xd6,0x11]
+            evmhessianw %r14, %r22, %r19
+# CHECK-BE: evmheumi 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x08]
+# CHECK-LE: evmheumi 14, 22, 19             # encoding: [0x08,0x9c,0xd6,0x11]
+            evmheumi %r14, %r22, %r19
+# CHECK-BE: evmheumia 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x28]
+# CHECK-LE: evmheumia 14, 22, 19            # encoding: [0x28,0x9c,0xd6,0x11]
+            evmheumia %r14, %r22, %r19
+# CHECK-BE: evmheumiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x08]
+# CHECK-LE: evmheumiaaw 14, 22, 19          # encoding: [0x08,0x9d,0xd6,0x11]
+            evmheumiaaw %r14, %r22, %r19
+# CHECK-BE: evmheumianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x88]
+# CHECK-LE: evmheumianw 14, 22, 19          # encoding: [0x88,0x9d,0xd6,0x11]
+            evmheumianw %r14, %r22, %r19
+# CHECK-BE: evmheusiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x00]
+# CHECK-LE: evmheusiaaw 14, 22, 19          # encoding: [0x00,0x9d,0xd6,0x11]
+            evmheusiaaw %r14, %r22, %r19
+# CHECK-BE: evmheusianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x80]
+# CHECK-LE: evmheusianw 14, 22, 19          # encoding: [0x80,0x9d,0xd6,0x11]
+            evmheusianw %r14, %r22, %r19
+# CHECK-BE: evmhogsmfaa 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x2f]
+# CHECK-LE: evmhogsmfaa 14, 22, 19          # encoding: [0x2f,0x9d,0xd6,0x11]
+            evmhogsmfaa %r14, %r22, %r19
+# CHECK-BE: evmhogsmfan 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xaf]
+# CHECK-LE: evmhogsmfan 14, 22, 19          # encoding: [0xaf,0x9d,0xd6,0x11]
+            evmhogsmfan %r14, %r22, %r19
+# CHECK-BE: evmhogsmiaa 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x2d]
+# CHECK-LE: evmhogsmiaa 14, 22, 19          # encoding: [0x2d,0x9d,0xd6,0x11]
+            evmhogsmiaa %r14, %r22, %r19
+# CHECK-BE: evmhogsmian 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xad]
+# CHECK-LE: evmhogsmian 14, 22, 19          # encoding: [0xad,0x9d,0xd6,0x11]
+            evmhogsmian %r14, %r22, %r19
+# CHECK-BE: evmhogumiaa 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x2c]
+# CHECK-LE: evmhogumiaa 14, 22, 19          # encoding: [0x2c,0x9d,0xd6,0x11]
+            evmhogumiaa %r14, %r22, %r19
+# CHECK-BE: evmhogumian 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xac]
+# CHECK-LE: evmhogumian 14, 22, 19          # encoding: [0xac,0x9d,0xd6,0x11]
+            evmhogumian %r14, %r22, %r19
+# CHECK-BE: evmhosmf 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x0f]
+# CHECK-LE: evmhosmf 14, 22, 19             # encoding: [0x0f,0x9c,0xd6,0x11]
+            evmhosmf %r14, %r22, %r19
+# CHECK-BE: evmhosmfa 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x2f]
+# CHECK-LE: evmhosmfa 14, 22, 19            # encoding: [0x2f,0x9c,0xd6,0x11]
+            evmhosmfa %r14, %r22, %r19
+# CHECK-BE: evmhosmfaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x0f]
+# CHECK-LE: evmhosmfaaw 14, 22, 19          # encoding: [0x0f,0x9d,0xd6,0x11]
+            evmhosmfaaw %r14, %r22, %r19
+# CHECK-BE: evmhosmfanw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x8f]
+# CHECK-LE: evmhosmfanw 14, 22, 19          # encoding: [0x8f,0x9d,0xd6,0x11]
+            evmhosmfanw %r14, %r22, %r19
+# CHECK-BE: evmhosmi 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x0d]
+# CHECK-LE: evmhosmi 14, 22, 19             # encoding: [0x0d,0x9c,0xd6,0x11]
+            evmhosmi %r14, %r22, %r19
+# CHECK-BE: evmhosmia 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x2d]
+# CHECK-LE: evmhosmia 14, 22, 19            # encoding: [0x2d,0x9c,0xd6,0x11]
+            evmhosmia %r14, %r22, %r19
+# CHECK-BE: evmhosmiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x0d]
+# CHECK-LE: evmhosmiaaw 14, 22, 19          # encoding: [0x0d,0x9d,0xd6,0x11]
+            evmhosmiaaw %r14, %r22, %r19
+# CHECK-BE: evmhosmianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x8d]
+# CHECK-LE: evmhosmianw 14, 22, 19          # encoding: [0x8d,0x9d,0xd6,0x11]
+            evmhosmianw %r14, %r22, %r19
+# CHECK-BE: evmhossf 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x07]
+# CHECK-LE: evmhossf 14, 22, 19             # encoding: [0x07,0x9c,0xd6,0x11]
+            evmhossf %r14, %r22, %r19
+# CHECK-BE: evmhossfa 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x27]
+# CHECK-LE: evmhossfa 14, 22, 19            # encoding: [0x27,0x9c,0xd6,0x11]
+            evmhossfa %r14, %r22, %r19
+# CHECK-BE: evmhossfaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x07]
+# CHECK-LE: evmhossfaaw 14, 22, 19          # encoding: [0x07,0x9d,0xd6,0x11]
+            evmhossfaaw %r14, %r22, %r19
+# CHECK-BE: evmhossfanw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x87]
+# CHECK-LE: evmhossfanw 14, 22, 19          # encoding: [0x87,0x9d,0xd6,0x11]
+            evmhossfanw %r14, %r22, %r19
+# CHECK-BE: evmhossiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x05]
+# CHECK-LE: evmhossiaaw 14, 22, 19          # encoding: [0x05,0x9d,0xd6,0x11]
+            evmhossiaaw %r14, %r22, %r19
+# CHECK-BE: evmhossianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x85]
+# CHECK-LE: evmhossianw 14, 22, 19          # encoding: [0x85,0x9d,0xd6,0x11]
+            evmhossianw %r14, %r22, %r19
+# CHECK-BE: evmhoumi 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x0c]
+# CHECK-LE: evmhoumi 14, 22, 19             # encoding: [0x0c,0x9c,0xd6,0x11]
+            evmhoumi %r14, %r22, %r19
+# CHECK-BE: evmhoumia 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x2c]
+# CHECK-LE: evmhoumia 14, 22, 19            # encoding: [0x2c,0x9c,0xd6,0x11]
+            evmhoumia %r14, %r22, %r19
+# CHECK-BE: evmhoumiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x0c]
+# CHECK-LE: evmhoumiaaw 14, 22, 19          # encoding: [0x0c,0x9d,0xd6,0x11]
+            evmhoumiaaw %r14, %r22, %r19
+# CHECK-BE: evmhoumianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x8c]
+# CHECK-LE: evmhoumianw 14, 22, 19          # encoding: [0x8c,0x9d,0xd6,0x11]
+            evmhoumianw %r14, %r22, %r19
+# CHECK-BE: evmhousiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x04]
+# CHECK-LE: evmhousiaaw 14, 22, 19          # encoding: [0x04,0x9d,0xd6,0x11]
+            evmhousiaaw %r14, %r22, %r19
+# CHECK-BE: evmhousianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x84]
+# CHECK-LE: evmhousianw 14, 22, 19          # encoding: [0x84,0x9d,0xd6,0x11]
+            evmhousianw %r14, %r22, %r19
+# CHECK-BE: evmwhsmf 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x4f]
+# CHECK-LE: evmwhsmf 14, 22, 19             # encoding: [0x4f,0x9c,0xd6,0x11]
+            evmwhsmf %r14, %r22, %r19
+# CHECK-BE: evmwhsmfa 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x6f]
+# CHECK-LE: evmwhsmfa 14, 22, 19            # encoding: [0x6f,0x9c,0xd6,0x11]
+            evmwhsmfa %r14, %r22, %r19
+# CHECK-BE: evmwhsmi 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x4d]
+# CHECK-LE: evmwhsmi 14, 22, 19             # encoding: [0x4d,0x9c,0xd6,0x11]
+            evmwhsmi %r14, %r22, %r19
+# CHECK-BE: evmwhsmia 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x6d]
+# CHECK-LE: evmwhsmia 14, 22, 19            # encoding: [0x6d,0x9c,0xd6,0x11]
+            evmwhsmia %r14, %r22, %r19
+# CHECK-BE: evmwhssf 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x47]
+# CHECK-LE: evmwhssf 14, 22, 19             # encoding: [0x47,0x9c,0xd6,0x11]
+            evmwhssf %r14, %r22, %r19
+# CHECK-BE: evmwhssfa 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x67]
+# CHECK-LE: evmwhssfa 14, 22, 19            # encoding: [0x67,0x9c,0xd6,0x11]
+            evmwhssfa %r14, %r22, %r19
+# CHECK-BE: evmwhumi 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x4c]
+# CHECK-LE: evmwhumi 14, 22, 19             # encoding: [0x4c,0x9c,0xd6,0x11]
+            evmwhumi %r14, %r22, %r19
+# CHECK-BE: evmwhumia 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x6c]
+# CHECK-LE: evmwhumia 14, 22, 19            # encoding: [0x6c,0x9c,0xd6,0x11]
+            evmwhumia %r14, %r22, %r19
+# CHECK-BE: evmwlsmiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x49]
+# CHECK-LE: evmwlsmiaaw 14, 22, 19          # encoding: [0x49,0x9d,0xd6,0x11]
+            evmwlsmiaaw %r14, %r22, %r19
+# CHECK-BE: evmwlsmianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xc9]
+# CHECK-LE: evmwlsmianw 14, 22, 19          # encoding: [0xc9,0x9d,0xd6,0x11]
+            evmwlsmianw %r14, %r22, %r19
+# CHECK-BE: evmwlssiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x41]
+# CHECK-LE: evmwlssiaaw 14, 22, 19          # encoding: [0x41,0x9d,0xd6,0x11]
+            evmwlssiaaw %r14, %r22, %r19
+# CHECK-BE: evmwlssianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xc1]
+# CHECK-LE: evmwlssianw 14, 22, 19          # encoding: [0xc1,0x9d,0xd6,0x11]
+            evmwlssianw %r14, %r22, %r19
+# CHECK-BE: evmwlumi 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x48]
+# CHECK-LE: evmwlumi 14, 22, 19             # encoding: [0x48,0x9c,0xd6,0x11]
+            evmwlumi %r14, %r22, %r19
+# CHECK-BE: evmwlumia 14, 22, 19            # encoding: [0x11,0xd6,0x9c,0x68]
+# CHECK-LE: evmwlumia 14, 22, 19            # encoding: [0x68,0x9c,0xd6,0x11]
+            evmwlumia %r14, %r22, %r19
+# CHECK-BE: evmwlumiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x48]
+# CHECK-LE: evmwlumiaaw 14, 22, 19          # encoding: [0x48,0x9d,0xd6,0x11]
+            evmwlumiaaw %r14, %r22, %r19
+# CHECK-BE: evmwlumianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xc8]
+# CHECK-LE: evmwlumianw 14, 22, 19          # encoding: [0xc8,0x9d,0xd6,0x11]
+            evmwlumianw %r14, %r22, %r19
+# CHECK-BE: evmwlusiaaw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0x40]
+# CHECK-LE: evmwlusiaaw 14, 22, 19          # encoding: [0x40,0x9d,0xd6,0x11]
+            evmwlusiaaw %r14, %r22, %r19
+# CHECK-BE: evmwlusianw 14, 22, 19          # encoding: [0x11,0xd6,0x9d,0xc0]
+# CHECK-LE: evmwlusianw 14, 22, 19          # encoding: [0xc0,0x9d,0xd6,0x11]
+            evmwlusianw %r14, %r22, %r19
+# CHECK-BE: evmwsmf 14, 22, 19              # encoding: [0x11,0xd6,0x9c,0x5b]
+# CHECK-LE: evmwsmf 14, 22, 19              # encoding: [0x5b,0x9c,0xd6,0x11]
+            evmwsmf %r14, %r22, %r19
+# CHECK-BE: evmwsmfa 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x7b]
+# CHECK-LE: evmwsmfa 14, 22, 19             # encoding: [0x7b,0x9c,0xd6,0x11]
+            evmwsmfa %r14, %r22, %r19
+# CHECK-BE: evmwsmfaa 14, 22, 19            # encoding: [0x11,0xd6,0x9d,0x5b]
+# CHECK-LE: evmwsmfaa 14, 22, 19            # encoding: [0x5b,0x9d,0xd6,0x11]
+            evmwsmfaa %r14, %r22, %r19
+# CHECK-BE: evmwsmfan 14, 22, 19            # encoding: [0x11,0xd6,0x9d,0xdb]
+# CHECK-LE: evmwsmfan 14, 22, 19            # encoding: [0xdb,0x9d,0xd6,0x11]
+            evmwsmfan %r14, %r22, %r19
+# CHECK-BE: evmwsmi 14, 22, 19              # encoding: [0x11,0xd6,0x9c,0x59]
+# CHECK-LE: evmwsmi 14, 22, 19              # encoding: [0x59,0x9c,0xd6,0x11]
+            evmwsmi %r14, %r22, %r19
+# CHECK-BE: evmwsmia 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x79]
+# CHECK-LE: evmwsmia 14, 22, 19             # encoding: [0x79,0x9c,0xd6,0x11]
+            evmwsmia %r14, %r22, %r19
+# CHECK-BE: evmwsmiaa 14, 22, 19            # encoding: [0x11,0xd6,0x9d,0x59]
+# CHECK-LE: evmwsmiaa 14, 22, 19            # encoding: [0x59,0x9d,0xd6,0x11]
+            evmwsmiaa %r14, %r22, %r19
+# CHECK-BE: evmwsmian 14, 22, 19            # encoding: [0x11,0xd6,0x9d,0xd9]
+# CHECK-LE: evmwsmian 14, 22, 19            # encoding: [0xd9,0x9d,0xd6,0x11]
+            evmwsmian %r14, %r22, %r19
+# CHECK-BE: evmwssf 14, 22, 19              # encoding: [0x11,0xd6,0x9c,0x53]
+# CHECK-LE: evmwssf 14, 22, 19              # encoding: [0x53,0x9c,0xd6,0x11]
+            evmwssf %r14, %r22, %r19
+# CHECK-BE: evmwssfa 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x73]
+# CHECK-LE: evmwssfa 14, 22, 19             # encoding: [0x73,0x9c,0xd6,0x11]
+            evmwssfa %r14, %r22, %r19
+# CHECK-BE: evmwssfaa 14, 22, 19            # encoding: [0x11,0xd6,0x9d,0x53]
+# CHECK-LE: evmwssfaa 14, 22, 19            # encoding: [0x53,0x9d,0xd6,0x11]
+            evmwssfaa %r14, %r22, %r19
+# CHECK-BE: evmwssfan 14, 22, 19            # encoding: [0x11,0xd6,0x9d,0xd3]
+# CHECK-LE: evmwssfan 14, 22, 19            # encoding: [0xd3,0x9d,0xd6,0x11]
+            evmwssfan %r14, %r22, %r19
+# CHECK-BE: evmwumi 14, 22, 19              # encoding: [0x11,0xd6,0x9c,0x58]
+# CHECK-LE: evmwumi 14, 22, 19              # encoding: [0x58,0x9c,0xd6,0x11]
+            evmwumi %r14, %r22, %r19
+# CHECK-BE: evmwumia 14, 22, 19             # encoding: [0x11,0xd6,0x9c,0x78]
+# CHECK-LE: evmwumia 14, 22, 19             # encoding: [0x78,0x9c,0xd6,0x11]
+            evmwumia %r14, %r22, %r19
+# CHECK-BE: evmwumiaa 14, 22, 19            # encoding: [0x11,0xd6,0x9d,0x58]
+# CHECK-LE: evmwumiaa 14, 22, 19            # encoding: [0x58,0x9d,0xd6,0x11]
+            evmwumiaa %r14, %r22, %r19
+# CHECK-BE: evmwumian 14, 22, 19            # encoding: [0x11,0xd6,0x9d,0xd8]
+# CHECK-LE: evmwumian 14, 22, 19            # encoding: [0xd8,0x9d,0xd6,0x11]
+            evmwumian %r14, %r22, %r19
+# CHECK-BE: evnand 14, 22, 19               # encoding: [0x11,0xd6,0x9a,0x1e]
+# CHECK-LE: evnand 14, 22, 19               # encoding: [0x1e,0x9a,0xd6,0x11]
+            evnand %r14, %r22, %r19
+# CHECK-BE: evneg 14, 22                    # encoding: [0x11,0xd6,0x02,0x09]
+# CHECK-LE: evneg 14, 22                    # encoding: [0x09,0x02,0xd6,0x11]
+            evneg %r14, %r22
+# CHECK-BE: evnor 14, 22, 19                # encoding: [0x11,0xd6,0x9a,0x18]
+# CHECK-LE: evnor 14, 22, 19                # encoding: [0x18,0x9a,0xd6,0x11]
+            evnor %r14, %r22, %r19
+# CHECK-BE: evor 14, 22, 19                 # encoding: [0x11,0xd6,0x9a,0x17]
+# CHECK-LE: evor 14, 22, 19                 # encoding: [0x17,0x9a,0xd6,0x11]
+            evor %r14, %r22, %r19
+# CHECK-BE: evorc 14, 22, 19                # encoding: [0x11,0xd6,0x9a,0x1b]
+# CHECK-LE: evorc 14, 22, 19                # encoding: [0x1b,0x9a,0xd6,0x11]
+            evorc %r14, %r22, %r19
+# CHECK-BE: evrlwi 14, 29, 19               # encoding: [0x11,0xdd,0x9a,0x2a]
+# CHECK-LE: evrlwi 14, 29, 19               # encoding: [0x2a,0x9a,0xdd,0x11]
+            evrlwi %r14, 29, %r19
+# CHECK-BE: evrlw 14, 22, 19                # encoding: [0x11,0xd6,0x9a,0x28]
+# CHECK-LE: evrlw 14, 22, 19                # encoding: [0x28,0x9a,0xd6,0x11]
+            evrlw %r14, %r22, %r19
+# CHECK-BE: evrndw 14, 22                   # encoding: [0x11,0xd6,0x02,0x0c]
+# CHECK-LE: evrndw 14, 22                   # encoding: [0x0c,0x02,0xd6,0x11]
+            evrndw %r14, %r22
+# CHECK-BE: evslwi 14, 29, 19               # encoding: [0x11,0xdd,0x9a,0x26]
+# CHECK-LE: evslwi 14, 29, 19               # encoding: [0x26,0x9a,0xdd,0x11]
+            evslwi %r14, 29, %r19
+# CHECK-BE: evslw 14, 22, 19                # encoding: [0x11,0xd6,0x9a,0x24]
+# CHECK-LE: evslw 14, 22, 19                # encoding: [0x24,0x9a,0xd6,0x11]
+            evslw %r14, %r22, %r19
+# CHECK-BE: evsplatfi 14, -13               # encoding: [0x11,0xd3,0x02,0x2b]
+# CHECK-LE: evsplatfi 14, -13               # encoding: [0x2b,0x02,0xd3,0x11]
+            evsplatfi %r14, -13
+# CHECK-BE: evsplati 14, -13                # encoding: [0x11,0xd3,0x02,0x29]
+# CHECK-LE: evsplati 14, -13                # encoding: [0x29,0x02,0xd3,0x11]
+            evsplati %r14, -13
+# CHECK-BE: evsrwis 14, 29, 19              # encoding: [0x11,0xdd,0x9a,0x23]
+# CHECK-LE: evsrwis 14, 29, 19              # encoding: [0x23,0x9a,0xdd,0x11]
+            evsrwis %r14, 29, %r19
+# CHECK-BE: evsrwiu 14, 29, 19              # encoding: [0x11,0xdd,0x9a,0x22]
+# CHECK-LE: evsrwiu 14, 29, 19              # encoding: [0x22,0x9a,0xdd,0x11]
+            evsrwiu %r14, 29, %r19
+# CHECK-BE: evsrws 14, 22, 19               # encoding: [0x11,0xd6,0x9a,0x21]
+# CHECK-LE: evsrws 14, 22, 19               # encoding: [0x21,0x9a,0xd6,0x11]
+            evsrws %r14, %r22, %r19
+# CHECK-BE: evsrwu 14, 22, 19               # encoding: [0x11,0xd6,0x9a,0x20]
+# CHECK-LE: evsrwu 14, 22, 19               # encoding: [0x20,0x9a,0xd6,0x11]
+            evsrwu %r14, %r22, %r19
+# CHECK-BE: evstddx 14, 22, 19              # encoding: [0x11,0xd6,0x9b,0x20]
+# CHECK-LE: evstddx 14, 22, 19              # encoding: [0x20,0x9b,0xd6,0x11]
+            evstddx %r14, %r22, %r19
+# CHECK-BE: evstdhx 14, 22, 19              # encoding: [0x11,0xd6,0x9b,0x24]
+# CHECK-LE: evstdhx 14, 22, 19              # encoding: [0x24,0x9b,0xd6,0x11]
+            evstdhx %r14, %r22, %r19
+# CHECK-BE: evstdwx 14, 22, 19              # encoding: [0x11,0xd6,0x9b,0x22]
+# CHECK-LE: evstdwx 14, 22, 19              # encoding: [0x22,0x9b,0xd6,0x11]
+            evstdwx %r14, %r22, %r19
+# CHECK-BE: evstwhex 14, 22, 19             # encoding: [0x11,0xd6,0x9b,0x30]
+# CHECK-LE: evstwhex 14, 22, 19             # encoding: [0x30,0x9b,0xd6,0x11]
+            evstwhex %r14, %r22, %r19
+# CHECK-BE: evstwhox 14, 22, 19             # encoding: [0x11,0xd6,0x9b,0x34]
+# CHECK-LE: evstwhox 14, 22, 19             # encoding: [0x34,0x9b,0xd6,0x11]
+            evstwhox %r14, %r22, %r19
+# CHECK-BE: evstwwex 14, 22, 19             # encoding: [0x11,0xd6,0x9b,0x38]
+# CHECK-LE: evstwwex 14, 22, 19             # encoding: [0x38,0x9b,0xd6,0x11]
+            evstwwex %r14, %r22, %r19
+# CHECK-BE: evstwwox 14, 22, 19             # encoding: [0x11,0xd6,0x9b,0x3c]
+# CHECK-LE: evstwwox 14, 22, 19             # encoding: [0x3c,0x9b,0xd6,0x11]
+            evstwwox %r14, %r22, %r19
+# CHECK-BE: evsubfssiaaw 14, 22             # encoding: [0x11,0xd6,0x04,0xc3]
+# CHECK-LE: evsubfssiaaw 14, 22             # encoding: [0xc3,0x04,0xd6,0x11]
+            evsubfssiaaw %r14, %r22
+# CHECK-BE: evsubfsmiaaw 14, 22             # encoding: [0x11,0xd6,0x04,0xcb]
+# CHECK-LE: evsubfsmiaaw 14, 22             # encoding: [0xcb,0x04,0xd6,0x11]
+            evsubfsmiaaw %r14, %r22
+# CHECK-BE: evsubfumiaaw 14, 22             # encoding: [0x11,0xd6,0x04,0xca]
+# CHECK-LE: evsubfumiaaw 14, 22             # encoding: [0xca,0x04,0xd6,0x11]
+            evsubfumiaaw %r14, %r22
+# CHECK-BE: evsubfusiaaw 14, 22             # encoding: [0x11,0xd6,0x04,0xc2]
+# CHECK-LE: evsubfusiaaw 14, 22             # encoding: [0xc2,0x04,0xd6,0x11]
+            evsubfusiaaw %r14, %r22
+# CHECK-BE: evsubfw 14, 22, 19              # encoding: [0x11,0xd6,0x9a,0x04]
+# CHECK-LE: evsubfw 14, 22, 19              # encoding: [0x04,0x9a,0xd6,0x11]
+            evsubfw %r14, %r22, %r19
+# CHECK-BE: evsubifw 14, 29, 19             # encoding: [0x11,0xdd,0x9a,0x06]
+# CHECK-LE: evsubifw 14, 29, 19             # encoding: [0x06,0x9a,0xdd,0x11]
+            evsubifw %r14, 29, %r19
+# CHECK-BE: evxor 14, 22, 19                # encoding: [0x11,0xd6,0x9a,0x16]
+# CHECK-LE: evxor 14, 22, 19                # encoding: [0x16,0x9a,0xd6,0x11]
+            evxor %r14, %r22, %r19
+
+# CHECK-BE: evldd 14, 0(27)                 # encoding: [0x11,0xdb,0x03,0x01]
+# CHECK-LE: evldd 14, 0(27)                 # encoding: [0x01,0x03,0xdb,0x11]
+            evldd %r14, 0(%r27)
+# CHECK-BE: evldd 14, 248(27)               # encoding: [0x11,0xdb,0xfb,0x01]
+# CHECK-LE: evldd 14, 248(27)               # encoding: [0x01,0xfb,0xdb,0x11]
+            evldd %r14, 248(%r27)
+# CHECK-BE: evldd 14, 248(9)                # encoding: [0x11,0xc9,0xfb,0x01]
+# CHECK-LE: evldd 14, 248(9)                # encoding: [0x01,0xfb,0xc9,0x11]
+            evldd %r14, 248(%r9)
+# CHECK-BE: evldw 14, 0(27)                 # encoding: [0x11,0xdb,0x03,0x03]
+# CHECK-LE: evldw 14, 0(27)                 # encoding: [0x03,0x03,0xdb,0x11]
+            evldw %r14, 0(%r27)
+# CHECK-BE: evldw 14, 248(27)               # encoding: [0x11,0xdb,0xfb,0x03]
+# CHECK-LE: evldw 14, 248(27)               # encoding: [0x03,0xfb,0xdb,0x11]
+            evldw %r14, 248(%r27)
+# CHECK-BE: evldw 14, 248(9)                # encoding: [0x11,0xc9,0xfb,0x03]
+# CHECK-LE: evldw 14, 248(9)                # encoding: [0x03,0xfb,0xc9,0x11]
+            evldw %r14, 248(%r9)
+# CHECK-BE: evldh 14, 0(27)                 # encoding: [0x11,0xdb,0x03,0x05]
+# CHECK-LE: evldh 14, 0(27)                 # encoding: [0x05,0x03,0xdb,0x11]
+            evldh %r14, 0(%r27)
+# CHECK-BE: evldh 14, 248(27)               # encoding: [0x11,0xdb,0xfb,0x05]
+# CHECK-LE: evldh 14, 248(27)               # encoding: [0x05,0xfb,0xdb,0x11]
+            evldh %r14, 248(%r27)
+# CHECK-BE: evldh 14, 248(9)                # encoding: [0x11,0xc9,0xfb,0x05]
+# CHECK-LE: evldh 14, 248(9)                # encoding: [0x05,0xfb,0xc9,0x11]
+            evldh %r14, 248(%r9)
+# CHECK-BE: evlhhesplat 14, 0(27)           # encoding: [0x11,0xdb,0x03,0x09]
+# CHECK-LE: evlhhesplat 14, 0(27)           # encoding: [0x09,0x03,0xdb,0x11]
+            evlhhesplat %r14, 0(%r27)
+# CHECK-BE: evlhhousplat 14, 0(27)          # encoding: [0x11,0xdb,0x03,0x0d]
+# CHECK-LE: evlhhousplat 14, 0(27)          # encoding: [0x0d,0x03,0xdb,0x11]
+            evlhhousplat %r14, 0(%r27)
+# CHECK-BE: evlhhousplat 14, 62(27)         # encoding: [0x11,0xdb,0xfb,0x0d]
+# CHECK-LE: evlhhousplat 14, 62(27)         # encoding: [0x0d,0xfb,0xdb,0x11]
+            evlhhousplat %r14, 62(%r27)
+# CHECK-BE: evlhhousplat 14, 62(9)          # encoding: [0x11,0xc9,0xfb,0x0d]
+# CHECK-LE: evlhhousplat 14, 62(9)          # encoding: [0x0d,0xfb,0xc9,0x11]
+            evlhhousplat %r14, 62(%r9)
+# CHECK-BE: evlhhossplat 14, 0(27)          # encoding: [0x11,0xdb,0x03,0x0f]
+# CHECK-LE: evlhhossplat 14, 0(27)          # encoding: [0x0f,0x03,0xdb,0x11]
+            evlhhossplat %r14, 0(%r27)
+# CHECK-BE: evlhhossplat 14, 62(27)         # encoding: [0x11,0xdb,0xfb,0x0f]
+# CHECK-LE: evlhhossplat 14, 62(27)         # encoding: [0x0f,0xfb,0xdb,0x11]
+            evlhhossplat %r14, 62(%r27)
+# CHECK-BE: evlhhossplat 14, 62(9)          # encoding: [0x11,0xc9,0xfb,0x0f]
+# CHECK-LE: evlhhossplat 14, 62(9)          # encoding: [0x0f,0xfb,0xc9,0x11]
+            evlhhossplat %r14, 62(%r9)
+# CHECK-BE: evlwhe 14, 0(27)                # encoding: [0x11,0xdb,0x03,0x11]
+# CHECK-LE: evlwhe 14, 0(27)                # encoding: [0x11,0x03,0xdb,0x11]
+            evlwhe %r14, 0(%r27)
+# CHECK-BE: evlwhe 14, 124(27)              # encoding: [0x11,0xdb,0xfb,0x11]
+# CHECK-LE: evlwhe 14, 124(27)              # encoding: [0x11,0xfb,0xdb,0x11]
+            evlwhe %r14, 124(%r27)
+# CHECK-BE: evlwhe 14, 124(9)               # encoding: [0x11,0xc9,0xfb,0x11]
+# CHECK-LE: evlwhe 14, 124(9)               # encoding: [0x11,0xfb,0xc9,0x11]
+            evlwhe %r14, 124(%r9)
+# CHECK-BE: evlwhou 14, 0(27)               # encoding: [0x11,0xdb,0x03,0x15]
+# CHECK-LE: evlwhou 14, 0(27)               # encoding: [0x15,0x03,0xdb,0x11]
+            evlwhou %r14, 0(%r27)
+# CHECK-BE: evlwhou 14, 124(27)             # encoding: [0x11,0xdb,0xfb,0x15]
+# CHECK-LE: evlwhou 14, 124(27)             # encoding: [0x15,0xfb,0xdb,0x11]
+            evlwhou %r14, 124(%r27)
+# CHECK-BE: evlwhou 14, 124(9)              # encoding: [0x11,0xc9,0xfb,0x15]
+# CHECK-LE: evlwhou 14, 124(9)              # encoding: [0x15,0xfb,0xc9,0x11]
+            evlwhou %r14, 124(%r9)
+# CHECK-BE: evlwhos 14, 0(27)               # encoding: [0x11,0xdb,0x03,0x17]
+# CHECK-LE: evlwhos 14, 0(27)               # encoding: [0x17,0x03,0xdb,0x11]
+            evlwhos %r14, 0(%r27)
+# CHECK-BE: evlwhos 14, 124(27)             # encoding: [0x11,0xdb,0xfb,0x17]
+# CHECK-LE: evlwhos 14, 124(27)             # encoding: [0x17,0xfb,0xdb,0x11]
+            evlwhos %r14, 124(%r27)
+# CHECK-BE: evlwhos 14, 124(9)              # encoding: [0x11,0xc9,0xfb,0x17]
+# CHECK-LE: evlwhos 14, 124(9)              # encoding: [0x17,0xfb,0xc9,0x11]
+            evlwhos %r14, 124(%r9)
+# CHECK-BE: evlwwsplat 14, 0(27)            # encoding: [0x11,0xdb,0x03,0x19]
+# CHECK-LE: evlwwsplat 14, 0(27)            # encoding: [0x19,0x03,0xdb,0x11]
+            evlwwsplat %r14, 0(%r27)
+# CHECK-BE: evlwwsplat 14, 124(27)          # encoding: [0x11,0xdb,0xfb,0x19]
+# CHECK-LE: evlwwsplat 14, 124(27)          # encoding: [0x19,0xfb,0xdb,0x11]
+            evlwwsplat %r14, 124(%r27)
+# CHECK-BE: evlwwsplat 14, 124(9)           # encoding: [0x11,0xc9,0xfb,0x19]
+# CHECK-LE: evlwwsplat 14, 124(9)           # encoding: [0x19,0xfb,0xc9,0x11]
+            evlwwsplat %r14, 124(%r9)
+# CHECK-BE: evlwhsplat 14, 0(27)            # encoding: [0x11,0xdb,0x03,0x1d]
+# CHECK-LE: evlwhsplat 14, 0(27)            # encoding: [0x1d,0x03,0xdb,0x11]
+            evlwhsplat %r14, 0(%r27)
+# CHECK-BE: evlwhsplat 14, 124(27)          # encoding: [0x11,0xdb,0xfb,0x1d]
+# CHECK-LE: evlwhsplat 14, 124(27)          # encoding: [0x1d,0xfb,0xdb,0x11]
+            evlwhsplat %r14, 124(%r27)
+# CHECK-BE: evlwhsplat 14, 124(9)           # encoding: [0x11,0xc9,0xfb,0x1d]
+# CHECK-LE: evlwhsplat 14, 124(9)           # encoding: [0x1d,0xfb,0xc9,0x11]
+            evlwhsplat %r14, 124(%r9)
+# CHECK-BE: evstdd 14, 0(27)                # encoding: [0x11,0xdb,0x03,0x21]
+# CHECK-LE: evstdd 14, 0(27)                # encoding: [0x21,0x03,0xdb,0x11]
+            evstdd %r14, 0(%r27)
+# CHECK-BE: evstdd 14, 248(27)              # encoding: [0x11,0xdb,0xfb,0x21]
+# CHECK-LE: evstdd 14, 248(27)              # encoding: [0x21,0xfb,0xdb,0x11]
+            evstdd %r14, 248(%r27)
+# CHECK-BE: evstdd 14, 248(9)               # encoding: [0x11,0xc9,0xfb,0x21]
+# CHECK-LE: evstdd 14, 248(9)               # encoding: [0x21,0xfb,0xc9,0x11]
+            evstdd %r14, 248(%r9)
+# CHECK-BE: evstdh 14, 0(27)                # encoding: [0x11,0xdb,0x03,0x25]
+# CHECK-LE: evstdh 14, 0(27)                # encoding: [0x25,0x03,0xdb,0x11]
+            evstdh %r14, 0(%r27)
+# CHECK-BE: evstdh 14, 248(27)              # encoding: [0x11,0xdb,0xfb,0x25]
+# CHECK-LE: evstdh 14, 248(27)              # encoding: [0x25,0xfb,0xdb,0x11]
+            evstdh %r14, 248(%r27)
+# CHECK-BE: evstdh 14, 248(9)               # encoding: [0x11,0xc9,0xfb,0x25]
+# CHECK-LE: evstdh 14, 248(9)               # encoding: [0x25,0xfb,0xc9,0x11]
+            evstdh %r14, 248(%r9)
+# CHECK-BE: evstdw 14, 0(27)                # encoding: [0x11,0xdb,0x03,0x23]
+# CHECK-LE: evstdw 14, 0(27)                # encoding: [0x23,0x03,0xdb,0x11]
+            evstdw %r14, 0(%r27)
+# CHECK-BE: evstdw 14, 248(27)              # encoding: [0x11,0xdb,0xfb,0x23]
+# CHECK-LE: evstdw 14, 248(27)              # encoding: [0x23,0xfb,0xdb,0x11]
+            evstdw %r14, 248(%r27)
+# CHECK-BE: evstdw 14, 248(9)               # encoding: [0x11,0xc9,0xfb,0x23]
+# CHECK-LE: evstdw 14, 248(9)               # encoding: [0x23,0xfb,0xc9,0x11]
+            evstdw %r14, 248(%r9)
+# CHECK-BE: evstwhe 14, 0(27)               # encoding: [0x11,0xdb,0x03,0x31]
+# CHECK-LE: evstwhe 14, 0(27)               # encoding: [0x31,0x03,0xdb,0x11]
+            evstwhe %r14, 0(%r27)
+# CHECK-BE: evstwhe 14, 124(27)             # encoding: [0x11,0xdb,0xfb,0x31]
+# CHECK-LE: evstwhe 14, 124(27)             # encoding: [0x31,0xfb,0xdb,0x11]
+            evstwhe %r14, 124(%r27)
+# CHECK-BE: evstwhe 14, 124(9)              # encoding: [0x11,0xc9,0xfb,0x31]
+# CHECK-LE: evstwhe 14, 124(9)              # encoding: [0x31,0xfb,0xc9,0x11]
+            evstwhe %r14, 124(%r9)
+# CHECK-BE: evstwho 14, 0(27)               # encoding: [0x11,0xdb,0x03,0x35]
+# CHECK-LE: evstwho 14, 0(27)               # encoding: [0x35,0x03,0xdb,0x11]
+            evstwho %r14, 0(%r27)
+# CHECK-BE: evstwho 14, 124(27)             # encoding: [0x11,0xdb,0xfb,0x35]
+# CHECK-LE: evstwho 14, 124(27)             # encoding: [0x35,0xfb,0xdb,0x11]
+            evstwho %r14, 124(%r27)
+# CHECK-BE: evstwho 14, 124(9)              # encoding: [0x11,0xc9,0xfb,0x35]
+# CHECK-LE: evstwho 14, 124(9)              # encoding: [0x35,0xfb,0xc9,0x11]
+            evstwho %r14, 124(%r9)
+# CHECK-BE: evstwwe 14, 0(27)               # encoding: [0x11,0xdb,0x03,0x39]
+# CHECK-LE: evstwwe 14, 0(27)               # encoding: [0x39,0x03,0xdb,0x11]
+            evstwwe %r14, 0(%r27)
+# CHECK-BE: evstwwe 14, 124(27)             # encoding: [0x11,0xdb,0xfb,0x39]
+# CHECK-LE: evstwwe 14, 124(27)             # encoding: [0x39,0xfb,0xdb,0x11]
+            evstwwe %r14, 124(%r27)
+# CHECK-BE: evstwwe 14, 124(9)              # encoding: [0x11,0xc9,0xfb,0x39]
+# CHECK-LE: evstwwe 14, 124(9)              # encoding: [0x39,0xfb,0xc9,0x11]
+            evstwwe %r14, 124(%r9)
+# CHECK-BE: evstwwo 14, 0(27)               # encoding: [0x11,0xdb,0x03,0x3d]
+# CHECK-LE: evstwwo 14, 0(27)               # encoding: [0x3d,0x03,0xdb,0x11]
+            evstwwo %r14, 0(%r27)
+# CHECK-BE: evstwwo 14, 124(27)             # encoding: [0x11,0xdb,0xfb,0x3d]
+# CHECK-LE: evstwwo 14, 124(27)             # encoding: [0x3d,0xfb,0xdb,0x11]
+            evstwwo %r14, 124(%r27)
+# CHECK-BE: evstwwo 14, 124(9)              # encoding: [0x11,0xc9,0xfb,0x3d]
+# CHECK-LE: evstwwo 14, 124(9)              # encoding: [0x3d,0xfb,0xc9,0x11]
+            evstwwo %r14, 124(%r9)

diff --git a/test/MC/PowerPC/ppc64-encoding.s b/test/MC/PowerPC/ppc64-encoding.s
index 4c3530d..d483f9d 100644
--- a/test/MC/PowerPC/ppc64-encoding.s
+++ b/test/MC/PowerPC/ppc64-encoding.s

@@ -767,3 +767,17 @@
 # CHECK-LE: mfocrf 16, 8                    # encoding: [0x26,0x80,0x10,0x7e]
             mfocrf 16, 8
 
+# Move to/from segment register
+# CHECK-BE: mtsr    12, 10                    # encoding: [0x7d,0x4c,0x01,0xa4]
+# CHECK-LE: mtsr    12, 10                    # encoding: [0xa4,0x01,0x4c,0x7d]
+            mtsr    12,%r10
+# CHECK-BE: mfsr    10, 12                    # encoding: [0x7d,0x4c,0x04,0xa6]
+# CHECK-LE: mfsr    10, 12                    # encoding: [0xa6,0x04,0x4c,0x7d]
+            mfsr    %r10,12
+
+# CHECK-BE: mtsrin  10, 12                    # encoding: [0x7d,0x40,0x61,0xe4]
+# CHECK-LE: mtsrin  10, 12                    # encoding: [0xe4,0x61,0x40,0x7d]
+            mtsrin  %r10,%r12
+# CHECK-BE: mfsrin  10, 12                    # encoding: [0x7d,0x40,0x65,0x26]
+# CHECK-LE: mfsrin  10, 12                    # encoding: [0x26,0x65,0x40,0x7d]
+            mfsrin  %r10,%r12

diff --git a/test/MC/PowerPC/ppc64-fixup-apply.s b/test/MC/PowerPC/ppc64-fixup-apply.s
index 1693405..f98b46d 100644
--- a/test/MC/PowerPC/ppc64-fixup-apply.s
+++ b/test/MC/PowerPC/ppc64-fixup-apply.s

@@ -14,19 +14,21 @@
 
 .set target, 0x1234
 
-addi 1, 1, target2@l
-addis 1, 1, target2@ha
+subi 1, 1, -target2@l
+subis 1, 1, -target2@ha
 
 .set target2, 0x12345678
 
 addi 1, 1, target3-target4@l
-addis 1, 1, target3-target4@ha
+subis 1, 1, target4-target3@ha
 
 .set target3, 0x23455678
 .set target4, 0x12341234
 
 addi 1, 1, target5+0x8000@l
 addis 1, 1, target5+0x8000@ha
+ori 1, 1, target5+0x8000@l
+oris 1, 1, target5+0x8000@ha
 
 .set target5, 0x10000001
 
@@ -68,7 +70,7 @@
 # CHECK-NEXT:    ]
 # CHECK-NEXT:    Address: 0x0
 # CHECK-NEXT:    Offset:
-# CHECK-NEXT:    Size: 64
+# CHECK-NEXT:    Size: 72
 # CHECK-NEXT:    Link: 0
 # CHECK-NEXT:    Info: 0
 # CHECK-NEXT:    AddressAlignment: 4
@@ -78,10 +80,12 @@
 # CHECK-LE-NEXT:   0000: 34122138 3412213C 78562138 3412213C
 # CHECK-BE-NEXT:   0010: 38214444 3C211111 38218001 3C211001
 # CHECK-LE-NEXT:   0010: 44442138 1111213C 01802138 0110213C
-# CHECK-BE-NEXT:   0020: 38210008 3C210000 38214321 3C214321
-# CHECK-LE-NEXT:   0020: 08002138 0000213C 21432138 2143213C
-# CHECK-BE-NEXT:   0030: 3821FFFF 3C211234 38210000 3C211235
-# CHECK-LE-NEXT:   0030: FFFF2138 3412213C 00002138 3512213C
+# CHECK-BE-NEXT:   0020: 60218001 64211001 38210008 3C210000
+# CHECK-LE-NEXT:   0020: 01802160 01102164 08002138 0000213C
+# CHECK-BE-NEXT:   0030: 38214321 3C214321 3821FFFF 3C211234
+# CHECK-LE-NEXT:   0030: 21432138 2143213C FFFF2138 3412213C
+# CHECK-BE-NEXT:   0040: 38210000 3C211235
+# CHECK-LE-NEXT:   0040: 00002138 3512213C
 # CHECK-NEXT:    )
 # CHECK-NEXT:  }
 

diff --git a/test/MC/PowerPC/ppc64-fixups.s b/test/MC/PowerPC/ppc64-fixups.s
index d3769f5..20a70c2 100644
--- a/test/MC/PowerPC/ppc64-fixups.s
+++ b/test/MC/PowerPC/ppc64-fixups.s

@@ -687,6 +687,18 @@
 # CHECK-BE: ori 1, 2, 2                  # encoding: [0x60,0x41,0x00,0x02]
 # CHECK-LE: ori 1, 2, 2                  # encoding: [0x02,0x00,0x41,0x60]
             ori 1, 2, 131071@ha
+# CHECK-BE: addi 1, 2, -1                # encoding: [0x38,0x22,0xff,0xff]
+# CHECK-LE: addi 1, 2, -1                # encoding: [0xff,0xff,0x22,0x38]
+            addi 1, 2, 131071@l
+# CHECK-BE: addi 1, 2, 1                 # encoding: [0x38,0x22,0x00,0x01]
+# CHECK-LE: addi 1, 2, 1                 # encoding: [0x01,0x00,0x22,0x38]
+            addi 1, 2, 131071@h
+# CHECK-BE: addi 1, 2, 2                 # encoding: [0x38,0x22,0x00,0x02]
+# CHECK-LE: addi 1, 2, 2                 # encoding: [0x02,0x00,0x22,0x38]
+            addi 1, 2, 131071@ha
+# CHECK-BE: addis 1, 2, -4096            # encoding: [0x3c,0x22,0xf0,0x00]
+# CHECK-LE: addis 1, 2, -4096            # encoding: [0x00,0xf0,0x22,0x3c]
+            addis 1, 2, 0xf0000000@h
 
 # Data relocs
 # llvm-mc does not show any "encoding" string for data, so we just check the relocs

diff --git a/test/MC/PowerPC/ppc64-localentry-error1.s b/test/MC/PowerPC/ppc64-localentry-error1.s
new file mode 100644
index 0000000..e47640f
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-localentry-error1.s

@@ -0,0 +1,11 @@
+
+# RUN: not llvm-mc -triple powerpc64-unknown-unknown -filetype=obj < %s 2> %t
+# RUN: FileCheck < %t %s
+# RUN: not llvm-mc -triple powerpc64le-unknown-unknown -filetype=obj < %s 2> %t
+# RUN: FileCheck < %t %s
+
+sym:
+	.localentry sym, 123
+
+# CHECK: LLVM ERROR: .localentry expression cannot be encoded.
+

diff --git a/test/MC/PowerPC/ppc64-localentry-error2.s b/test/MC/PowerPC/ppc64-localentry-error2.s
new file mode 100644
index 0000000..b05687f
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-localentry-error2.s

@@ -0,0 +1,12 @@
+
+# RUN: not llvm-mc -triple powerpc64-unknown-unknown -filetype=obj < %s 2> %t
+# RUN: FileCheck < %t %s
+# RUN: not llvm-mc -triple powerpc64le-unknown-unknown -filetype=obj < %s 2> %t
+# RUN: FileCheck < %t %s
+
+	.globl remote_sym
+sym:
+	.localentry sym, remote_sym
+
+# CHECK: LLVM ERROR: .localentry expression must be absolute.
+

diff --git a/test/MC/PowerPC/ppc64-localentry.s b/test/MC/PowerPC/ppc64-localentry.s
new file mode 100644
index 0000000..6d2c120
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-localentry.s

@@ -0,0 +1,70 @@
+
+# RUN: llvm-mc -triple powerpc64-unknown-unknown -filetype=obj %s | \
+# RUN: llvm-readobj -h -r -symbols | FileCheck %s
+# RUN: llvm-mc -triple powerpc64le-unknown-unknown -filetype=obj %s | \
+# RUN: llvm-readobj -h -r -symbols | FileCheck %s
+
+	.type callee1, @function
+callee1:
+	nop
+	nop
+	.localentry callee1, .-callee1
+	nop
+	nop
+	.size callee1, .-callee1
+
+	.type callee2, @function
+callee2:
+	nop
+	nop
+	.size callee2, .-callee2
+
+	.type caller, @function
+caller:
+	bl callee1
+	nop
+	bl callee2
+	nop
+	.size caller, .-caller
+
+	.section .text.other
+caller_other:
+	bl callee1
+	nop
+	bl callee2
+	nop
+	.size caller_other, .-caller_other
+
+# Verify that use of .localentry implies ABI version 2
+# CHECK: ElfHeader {
+# CHECK: Flags [ (0x2)
+
+# Verify that fixups to local function symbols are performed only
+# if the target symbol does not use .localentry
+# CHECK: Relocations [
+# CHECK: Section ({{[0-9]*}}) .rela.text {
+# CHECK-NEXT: R_PPC64_REL24 callee1
+# CHECK-NEXT: }
+# CHECK-NOT: R_PPC64_REL24 callee2
+# CHECK: Section ({{[0-9]*}}) .rela.text.other {
+# CHECK-NEXT: R_PPC64_REL24 callee1
+# CHECK-NEXT: R_PPC64_REL24 .text
+# CHECK-NEXT: }
+
+# Verify that .localentry is encoded in the Other field.
+# CHECK: Symbols [
+# CHECK:       Name: callee1
+# CHECK-NEXT:  Value:
+# CHECK-NEXT:  Size: 16
+# CHECK-NEXT:  Binding: Local
+# CHECK-NEXT:  Type: Function
+# CHECK-NEXT:  Other: 96
+# CHECK-NEXT:  Section: .text
+# CHECK:       Name: callee2
+# CHECK-NEXT:  Value:
+# CHECK-NEXT:  Size: 8
+# CHECK-NEXT:  Binding: Local
+# CHECK-NEXT:  Type: Function
+# CHECK-NEXT:  Other: 0
+# CHECK-NEXT:  Section: .text
+

diff --git a/test/MC/PowerPC/vsx.s b/test/MC/PowerPC/vsx.s
index d292dda..b355ba3 100644
--- a/test/MC/PowerPC/vsx.s
+++ b/test/MC/PowerPC/vsx.s

@@ -1,27 +1,27 @@
 # RUN: llvm-mc -triple powerpc64-unknown-linux-gnu --show-encoding %s | FileCheck -check-prefix=CHECK-BE %s
 # RUN: llvm-mc -triple powerpc64le-unknown-linux-gnu --show-encoding %s | FileCheck -check-prefix=CHECK-LE %s
 
-# CHECK-BE: lxsdx 7, 5, 31                     # encoding: [0x7c,0xe5,0xfc,0x98]
-# CHECK-LE: lxsdx 7, 5, 31                     # encoding: [0x98,0xfc,0xe5,0x7c]
-            lxsdx 7, 5, 31
-# CHECK-BE: lxvd2x 7, 5, 31                    # encoding: [0x7c,0xe5,0xfe,0x98]
-# CHECK-LE: lxvd2x 7, 5, 31                    # encoding: [0x98,0xfe,0xe5,0x7c]
-            lxvd2x 7, 5, 31
-# CHECK-BE: lxvdsx 7, 5, 31                    # encoding: [0x7c,0xe5,0xfa,0x98]
-# CHECK-LE: lxvdsx 7, 5, 31                    # encoding: [0x98,0xfa,0xe5,0x7c]
-            lxvdsx 7, 5, 31
-# CHECK-BE: lxvw4x 7, 5, 31                    # encoding: [0x7c,0xe5,0xfe,0x18]
-# CHECK-LE: lxvw4x 7, 5, 31                    # encoding: [0x18,0xfe,0xe5,0x7c]
-            lxvw4x 7, 5, 31
-# CHECK-BE: stxsdx 8, 5, 31                    # encoding: [0x7d,0x05,0xfd,0x98]
-# CHECK-LE: stxsdx 8, 5, 31                    # encoding: [0x98,0xfd,0x05,0x7d]
-            stxsdx 8, 5, 31
-# CHECK-BE: stxvd2x 8, 5, 31                   # encoding: [0x7d,0x05,0xff,0x98]
-# CHECK-LE: stxvd2x 8, 5, 31                   # encoding: [0x98,0xff,0x05,0x7d]
-            stxvd2x 8, 5, 31
-# CHECK-BE: stxvw4x 8, 5, 31                   # encoding: [0x7d,0x05,0xff,0x18]
-# CHECK-LE: stxvw4x 8, 5, 31                   # encoding: [0x18,0xff,0x05,0x7d]
-            stxvw4x 8, 5, 31
+# CHECK-BE: lxsdx 39, 5, 31                     # encoding: [0x7c,0xe5,0xfc,0x99]
+# CHECK-LE: lxsdx 39, 5, 31                     # encoding: [0x99,0xfc,0xe5,0x7c]
+            lxsdx 39, 5, 31
+# CHECK-BE: lxvd2x 39, 5, 31                    # encoding: [0x7c,0xe5,0xfe,0x99]
+# CHECK-LE: lxvd2x 39, 5, 31                    # encoding: [0x99,0xfe,0xe5,0x7c]
+            lxvd2x 39, 5, 31
+# CHECK-BE: lxvdsx 39, 5, 31                    # encoding: [0x7c,0xe5,0xfa,0x99]
+# CHECK-LE: lxvdsx 39, 5, 31                    # encoding: [0x99,0xfa,0xe5,0x7c]
+            lxvdsx 39, 5, 31
+# CHECK-BE: lxvw4x 39, 5, 31                    # encoding: [0x7c,0xe5,0xfe,0x19]
+# CHECK-LE: lxvw4x 39, 5, 31                    # encoding: [0x19,0xfe,0xe5,0x7c]
+            lxvw4x 39, 5, 31
+# CHECK-BE: stxsdx 40, 5, 31                    # encoding: [0x7d,0x05,0xfd,0x99]
+# CHECK-LE: stxsdx 40, 5, 31                    # encoding: [0x99,0xfd,0x05,0x7d]
+            stxsdx 40, 5, 31
+# CHECK-BE: stxvd2x 40, 5, 31                   # encoding: [0x7d,0x05,0xff,0x99]
+# CHECK-LE: stxvd2x 40, 5, 31                   # encoding: [0x99,0xff,0x05,0x7d]
+            stxvd2x 40, 5, 31
+# CHECK-BE: stxvw4x 40, 5, 31                   # encoding: [0x7d,0x05,0xff,0x19]
+# CHECK-LE: stxvw4x 40, 5, 31                   # encoding: [0x19,0xff,0x05,0x7d]
+            stxvw4x 40, 5, 31
 # CHECK-BE: xsabsdp 7, 27                      # encoding: [0xf0,0xe0,0xdd,0x64]
 # CHECK-LE: xsabsdp 7, 27                      # encoding: [0x64,0xdd,0xe0,0xf0]
             xsabsdp 7, 27

diff --git a/test/MC/R600/lit.local.cfg b/test/MC/R600/lit.local.cfg
new file mode 100644
index 0000000..ad9ce25
--- /dev/null
+++ b/test/MC/R600/lit.local.cfg

@@ -0,0 +1,2 @@
+if not 'R600' in config.root.targets:
+    config.unsupported = True

diff --git a/test/MC/R600/sopp.s b/test/MC/R600/sopp.s
new file mode 100644
index 0000000..65fc97b
--- /dev/null
+++ b/test/MC/R600/sopp.s

@@ -0,0 +1,52 @@
+// RUN: llvm-mc -arch=r600 -mcpu=SI  -show-encoding %s | FileCheck %s
+
+  s_nop 1            // CHECK: s_nop 1 ; encoding: [0x01,0x00,0x80,0xbf]
+  s_endpgm           // CHECK: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+  s_branch 2         // CHECK: s_branch 2 ; encoding: [0x02,0x00,0x82,0xbf]
+  s_cbranch_scc0 3   // CHECK: s_cbranch_scc0 3 ; encoding: [0x03,0x00,0x84,0xbf]
+  s_cbranch_scc1 4   // CHECK: s_cbranch_scc1 4 ; encoding: [0x04,0x00,0x85,0xbf]
+  s_cbranch_vccz 5   // CHECK: s_cbranch_vccz 5 ; encoding: [0x05,0x00,0x86,0xbf]
+  s_cbranch_vccnz 6  // CHECK: s_cbranch_vccnz 6 ; encoding: [0x06,0x00,0x87,0xbf]
+  s_cbranch_execz 7  // CHECK: s_cbranch_execz 7 ; encoding: [0x07,0x00,0x88,0xbf]
+  s_cbranch_execnz 8 // CHECK: s_cbranch_execnz 8 ; encoding: [0x08,0x00,0x89,0xbf]
+  s_barrier          // CHECK: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf]
+
+//===----------------------------------------------------------------------===//
+// s_waitcnt
+//===----------------------------------------------------------------------===//
+
+  s_waitcnt 0
+  // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+
+  s_waitcnt vmcnt(0) & expcnt(0) & lgkmcnt(0)
+  // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+
+  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+
+  s_waitcnt vmcnt(0), expcnt(0), lgkmcnt(0)
+  // CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+
+  s_waitcnt vmcnt(1)
+  // CHECK: s_waitcnt vmcnt(1) ; encoding: [0x71,0x07,0x8c,0xbf]
+
+  s_waitcnt expcnt(2)
+  // CHECK: s_waitcnt expcnt(2) ; encoding: [0x2f,0x07,0x8c,0xbf]
+
+  s_waitcnt lgkmcnt(3)
+  // CHECK: s_waitcnt lgkmcnt(3) ; encoding: [0x7f,0x03,0x8c,0xbf]
+
+  s_waitcnt vmcnt(0), expcnt(0)
+  // CHECK: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0x00,0x07,0x8c,0xbf]
+
+
+  s_sethalt 9        // CHECK: s_sethalt 9 ; encoding: [0x09,0x00,0x8d,0xbf]
+  s_sleep 10         // CHECK: s_sleep 10 ; encoding: [0x0a,0x00,0x8e,0xbf]
+  s_setprio 1        // CHECK: s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
+  s_sendmsg 2        // CHECK: s_sendmsg Gs(nop), [m0] ; encoding: [0x02,0x00,0x90,0xbf]
+  s_sendmsghalt 3    // CHECK: s_sendmsghalt 3 ; encoding: [0x03,0x00,0x91,0xbf]
+  s_trap 4           // CHECK: s_trap 4 ; encoding: [0x04,0x00,0x92,0xbf]
+  s_icache_inv       // CHECK: s_icache_inv ; encoding: [0x00,0x00,0x93,0xbf]
+  s_incperflevel 5   // CHECK: s_incperflevel 5 ; encoding: [0x05,0x00,0x94,0xbf]
+  s_decperflevel 6   // CHECK: s_decperflevel 6 ; encoding: [0x06,0x00,0x95,0xbf]
+  s_ttracedata       // CHECK: s_ttracedata ; encoding: [0x00,0x00,0x96,0xbf]

diff --git a/test/MC/SystemZ/lit.local.cfg b/test/MC/SystemZ/lit.local.cfg
index 5c02dd3..78c5738 100644
--- a/test/MC/SystemZ/lit.local.cfg
+++ b/test/MC/SystemZ/lit.local.cfg

@@ -1,3 +1,6 @@
 if not 'SystemZ' in config.root.targets:
     config.unsupported = True
 
+# http://llvm.org/bugs/show_bug.cgi?id=20980
+if 'ubsan' in config.available_features:
+  config.unsupported = True

diff --git a/test/MC/X86/AlignedBundling/labeloffset.s b/test/MC/X86/AlignedBundling/labeloffset.s
new file mode 100644
index 0000000..65a0086
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/labeloffset.s

@@ -0,0 +1,83 @@
+# RUN: llvm-mc -triple=i686-linux -filetype=obj %s -o - | \
+# RUN: llvm-objdump -disassemble -no-show-raw-insn -r - | FileCheck %s
+# RUN: llvm-mc -triple=i686-nacl -filetype=obj %s -o - | \
+# RUN: llvm-objdump -disassemble -no-show-raw-insn -r - | FileCheck %s
+
+        .bundle_align_mode 5
+        .text
+        .globl  main
+        .align  32, 0x90
+        .type   main,@function
+main:                                   # @main
+# CHECK-LABEL: main:
+# Call + pop sequence for determining the PIC base.
+        .bundle_lock align_to_end
+        calll   .L0$pb
+        .bundle_unlock
+.L0$pb:
+        popl    %eax
+# CHECK: 20: popl
+# 26 bytes of instructions between the pop and the use of the pic base symbol.
+        movl    $3, 2(%ebx, %ebx)
+        movl    $3, 2(%ebx, %ebx)
+        movl    $3, 2(%ebx, %ebx)
+        hlt
+        hlt
+# CHECK: nop
+.Ltmp0:
+        addl    (.Ltmp0-.L0$pb), %eax
+# The addl has bundle padding to push it from 0x3b to 0x40.
+# The difference between the labels should be 0x20 (0x40-0x20) not 0x1b
+# (0x3b-0x20)
+# CHECK: 40: addl 32, %eax
+        popl    %ecx
+        jmp     *%ecx
+
+
+# Also make sure it works with a non-relaxable instruction (cmp vs add)
+# and for 2 adjacent labels that both point to the correct instruction
+        .section .text.bar, "ax"
+        .globl  bar
+        .align  32, 0x90
+        .type   bar,@function
+bar:
+# CHECK-LABEL: bar:
+        .bundle_lock align_to_end
+        calll   .L1$pb
+        .bundle_unlock
+.L1$pb:
+        popl %eax
+# CHECK: 20: popl
+# 26 bytes of instructions between the pop and the use of the pic base symbol.
+        movl    $3, 2(%ebx, %ebx)
+        movl    $3, 2(%ebx, %ebx)
+        movl    $3, 2(%ebx, %ebx)
+        hlt
+        hlt
+# CHECK: nop
+.Ltmp1:
+.Ltmp2:
+        cmpl    %eax, .Ltmp1
+# CHECK: 40: cmpl %eax, 64
+        cmpl     %eax, (.Ltmp2-.L1$pb)
+# CHECK: 46: cmpl %eax, 32
+        popl    %ecx
+        jmp *%ecx
+
+
+# Switch sections in the middle of a function
+        .section .text.foo, "ax"
+        .globl  foo
+        .align  32, 0x90
+        .type   foo,@function
+# CHECK-LABEL: foo:
+foo:
+        inc %eax
+tmp3:
+        .rodata
+        .type   obj,@object
+        .comm   obj,4,4
+        .section .text.foo
+        inc %eax
+# CHECK: tmp3:
+# CHECK-NEXT: 1: incl

diff --git a/test/MC/X86/AlignedBundling/long-nop-pad.s b/test/MC/X86/AlignedBundling/long-nop-pad.s
index ea33e28..9b1ec11 100644
--- a/test/MC/X86/AlignedBundling/long-nop-pad.s
+++ b/test/MC/X86/AlignedBundling/long-nop-pad.s

@@ -14,7 +14,7 @@
 # To align this group to a bundle end, we need a 15-byte NOP and a 12-byte NOP.
 # CHECK:        0:  nop
 # CHECK-NEXT:   f:  nop
-# CHECK-NEXT:   1b: callq
+# CHECK:   1b: callq
 
 # This push instruction is 1 byte long
   .bundle_lock align_to_end

diff --git a/test/MC/X86/AlignedBundling/nesting.s b/test/MC/X86/AlignedBundling/nesting.s
new file mode 100644
index 0000000..8996170
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/nesting.s

@@ -0,0 +1,67 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
+
+# Will be bundle-aligning to 16 byte boundaries
+  .bundle_align_mode 4
+  .text
+# CHECK-LABEL: foo
+foo:
+# Test that bundle alignment mode can be set more than once.
+  .bundle_align_mode 4
+# Each of these callq instructions is 5 bytes long
+  callq bar
+  callq bar
+  .bundle_lock
+  .bundle_lock
+  callq bar
+  callq bar     
+  .bundle_unlock
+  .bundle_unlock
+# CHECK:      10: callq
+# CHECK-NEXT: 15: callq
+
+  .p2align 4
+# CHECK-LABEL: bar
+bar:
+  callq foo
+  callq foo
+# Check that the callqs get bundled together, and that the whole group is
+# align_to_end
+  .bundle_lock 
+  callq bar
+  .bundle_lock align_to_end
+  callq bar
+  .bundle_unlock
+  .bundle_unlock
+# CHECK:      36: callq
+# CHECK-NEXT: 3b: callq
+
+# CHECK-LABEL: baz
+baz:
+  callq foo
+  callq foo
+# Check that the callqs get bundled together, and that the whole group is
+# align_to_end (with the outer directive marked align_to_end)
+  .bundle_lock align_to_end
+  callq bar
+  .bundle_lock
+  callq bar
+  .bundle_unlock
+  .bundle_unlock
+# CHECK:      56: callq
+# CHECK-NEXT: 5b: callq
+
+# CHECK-LABEL: quux
+quux:
+  callq bar
+  callq bar
+  .bundle_lock
+  .bundle_lock
+  callq bar
+  .bundle_unlock
+  callq bar     
+  .bundle_unlock
+# Check that the calls are bundled together when the second one is after the
+# inner nest is closed.
+# CHECK:      70: callq
+# CHECK-NEXT: 75: callq

diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index 187b512..c734da8 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s

@@ -113,6 +113,94 @@
 // CHECK:  encoding: [0x62,0xe1,0x14,0x58,0x58,0x92,0xfc,0xfd,0xff,0xff]
           vaddps -516(%rdx){1to16}, %zmm13, %zmm18
 
+// CHECK: vbroadcastsd (%rcx), %zmm30
+// CHECK:  encoding: [0x62,0x62,0xfd,0x48,0x19,0x31]
+          vbroadcastsd (%rcx), %zmm30
+
+// CHECK: vbroadcastsd (%rcx), %zmm30 {%k4}
+// CHECK:  encoding: [0x62,0x62,0xfd,0x4c,0x19,0x31]
+          vbroadcastsd (%rcx), %zmm30 {%k4}
+
+// CHECK: vbroadcastsd (%rcx), %zmm30 {%k4} {z}
+// CHECK:  encoding: [0x62,0x62,0xfd,0xcc,0x19,0x31]
+          vbroadcastsd (%rcx), %zmm30 {%k4} {z}
+
+// CHECK: vbroadcastsd 291(%rax,%r14,8), %zmm30
+// CHECK:  encoding: [0x62,0x22,0xfd,0x48,0x19,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vbroadcastsd 291(%rax,%r14,8), %zmm30
+
+// CHECK: vbroadcastsd 1016(%rdx), %zmm30
+// CHECK:  encoding: [0x62,0x62,0xfd,0x48,0x19,0x72,0x7f]
+          vbroadcastsd 1016(%rdx), %zmm30
+
+// CHECK: vbroadcastsd 1024(%rdx), %zmm30
+// CHECK:  encoding: [0x62,0x62,0xfd,0x48,0x19,0xb2,0x00,0x04,0x00,0x00]
+          vbroadcastsd 1024(%rdx), %zmm30
+
+// CHECK: vbroadcastsd -1024(%rdx), %zmm30
+// CHECK:  encoding: [0x62,0x62,0xfd,0x48,0x19,0x72,0x80]
+          vbroadcastsd -1024(%rdx), %zmm30
+
+// CHECK: vbroadcastsd -1032(%rdx), %zmm30
+// CHECK:  encoding: [0x62,0x62,0xfd,0x48,0x19,0xb2,0xf8,0xfb,0xff,0xff]
+          vbroadcastsd -1032(%rdx), %zmm30
+
+// CHECK: vbroadcastsd %xmm22, %zmm21
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x48,0x19,0xee]
+          vbroadcastsd %xmm22, %zmm21
+
+// CHECK: vbroadcastsd %xmm22, %zmm21 {%k7}
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x4f,0x19,0xee]
+          vbroadcastsd %xmm22, %zmm21 {%k7}
+
+// CHECK: vbroadcastsd %xmm22, %zmm21 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa2,0xfd,0xcf,0x19,0xee]
+          vbroadcastsd %xmm22, %zmm21 {%k7} {z}
+
+// CHECK: vbroadcastss (%rcx), %zmm3
+// CHECK:  encoding: [0x62,0xf2,0x7d,0x48,0x18,0x19]
+          vbroadcastss (%rcx), %zmm3
+
+// CHECK: vbroadcastss (%rcx), %zmm3 {%k4}
+// CHECK:  encoding: [0x62,0xf2,0x7d,0x4c,0x18,0x19]
+          vbroadcastss (%rcx), %zmm3 {%k4}
+
+// CHECK: vbroadcastss (%rcx), %zmm3 {%k4} {z}
+// CHECK:  encoding: [0x62,0xf2,0x7d,0xcc,0x18,0x19]
+          vbroadcastss (%rcx), %zmm3 {%k4} {z}
+
+// CHECK: vbroadcastss 291(%rax,%r14,8), %zmm3
+// CHECK:  encoding: [0x62,0xb2,0x7d,0x48,0x18,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vbroadcastss 291(%rax,%r14,8), %zmm3
+
+// CHECK: vbroadcastss 508(%rdx), %zmm3
+// CHECK:  encoding: [0x62,0xf2,0x7d,0x48,0x18,0x5a,0x7f]
+          vbroadcastss 508(%rdx), %zmm3
+
+// CHECK: vbroadcastss 512(%rdx), %zmm3
+// CHECK:  encoding: [0x62,0xf2,0x7d,0x48,0x18,0x9a,0x00,0x02,0x00,0x00]
+          vbroadcastss 512(%rdx), %zmm3
+
+// CHECK: vbroadcastss -512(%rdx), %zmm3
+// CHECK:  encoding: [0x62,0xf2,0x7d,0x48,0x18,0x5a,0x80]
+          vbroadcastss -512(%rdx), %zmm3
+
+// CHECK: vbroadcastss -516(%rdx), %zmm3
+// CHECK:  encoding: [0x62,0xf2,0x7d,0x48,0x18,0x9a,0xfc,0xfd,0xff,0xff]
+          vbroadcastss -516(%rdx), %zmm3
+
+// CHECK: vbroadcastss %xmm18, %zmm18
+// CHECK:  encoding: [0x62,0xa2,0x7d,0x48,0x18,0xd2]
+          vbroadcastss %xmm18, %zmm18
+
+// CHECK: vbroadcastss %xmm18, %zmm18 {%k2}
+// CHECK:  encoding: [0x62,0xa2,0x7d,0x4a,0x18,0xd2]
+          vbroadcastss %xmm18, %zmm18 {%k2}
+
+// CHECK: vbroadcastss %xmm18, %zmm18 {%k2} {z}
+// CHECK:  encoding: [0x62,0xa2,0x7d,0xca,0x18,0xd2]
+          vbroadcastss %xmm18, %zmm18 {%k2} {z}
+
 // CHECK: vdivpd %zmm11, %zmm6, %zmm18
 // CHECK:  encoding: [0x62,0xc1,0xcd,0x48,0x5e,0xd3]
           vdivpd %zmm11, %zmm6, %zmm18
@@ -449,6 +537,390 @@
 // CHECK:  encoding: [0x62,0xf1,0x64,0x58,0x5d,0x9a,0xfc,0xfd,0xff,0xff]
           vminps -516(%rdx){1to16}, %zmm3, %zmm3
 
+// CHECK: vmovapd %zmm14, %zmm7
+// CHECK:  encoding: [0x62,0xd1,0xfd,0x48,0x28,0xfe]
+          vmovapd %zmm14, %zmm7
+
+// CHECK: vmovapd %zmm14, %zmm7 {%k5}
+// CHECK:  encoding: [0x62,0xd1,0xfd,0x4d,0x28,0xfe]
+          vmovapd %zmm14, %zmm7 {%k5}
+
+// CHECK: vmovapd %zmm14, %zmm7 {%k5} {z}
+// CHECK:  encoding: [0x62,0xd1,0xfd,0xcd,0x28,0xfe]
+          vmovapd %zmm14, %zmm7 {%k5} {z}
+
+// CHECK: vmovapd (%rcx), %zmm7
+// CHECK:  encoding: [0x62,0xf1,0xfd,0x48,0x28,0x39]
+          vmovapd (%rcx), %zmm7
+
+// CHECK: vmovapd 291(%rax,%r14,8), %zmm7
+// CHECK:  encoding: [0x62,0xb1,0xfd,0x48,0x28,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vmovapd 291(%rax,%r14,8), %zmm7
+
+// CHECK: vmovapd 8128(%rdx), %zmm7
+// CHECK:  encoding: [0x62,0xf1,0xfd,0x48,0x28,0x7a,0x7f]
+          vmovapd 8128(%rdx), %zmm7
+
+// CHECK: vmovapd 8192(%rdx), %zmm7
+// CHECK:  encoding: [0x62,0xf1,0xfd,0x48,0x28,0xba,0x00,0x20,0x00,0x00]
+          vmovapd 8192(%rdx), %zmm7
+
+// CHECK: vmovapd -8192(%rdx), %zmm7
+// CHECK:  encoding: [0x62,0xf1,0xfd,0x48,0x28,0x7a,0x80]
+          vmovapd -8192(%rdx), %zmm7
+
+// CHECK: vmovapd -8256(%rdx), %zmm7
+// CHECK:  encoding: [0x62,0xf1,0xfd,0x48,0x28,0xba,0xc0,0xdf,0xff,0xff]
+          vmovapd -8256(%rdx), %zmm7
+
+// CHECK: vmovaps %zmm9, %zmm5
+// CHECK:  encoding: [0x62,0xd1,0x7c,0x48,0x28,0xe9]
+          vmovaps %zmm9, %zmm5
+
+// CHECK: vmovaps %zmm9, %zmm5 {%k1}
+// CHECK:  encoding: [0x62,0xd1,0x7c,0x49,0x28,0xe9]
+          vmovaps %zmm9, %zmm5 {%k1}
+
+// CHECK: vmovaps %zmm9, %zmm5 {%k1} {z}
+// CHECK:  encoding: [0x62,0xd1,0x7c,0xc9,0x28,0xe9]
+          vmovaps %zmm9, %zmm5 {%k1} {z}
+
+// CHECK: vmovaps (%rcx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x28,0x29]
+          vmovaps (%rcx), %zmm5
+
+// CHECK: vmovaps 291(%rax,%r14,8), %zmm5
+// CHECK:  encoding: [0x62,0xb1,0x7c,0x48,0x28,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovaps 291(%rax,%r14,8), %zmm5
+
+// CHECK: vmovaps 8128(%rdx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x28,0x6a,0x7f]
+          vmovaps 8128(%rdx), %zmm5
+
+// CHECK: vmovaps 8192(%rdx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x28,0xaa,0x00,0x20,0x00,0x00]
+          vmovaps 8192(%rdx), %zmm5
+
+// CHECK: vmovaps -8192(%rdx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x28,0x6a,0x80]
+          vmovaps -8192(%rdx), %zmm5
+
+// CHECK: vmovaps -8256(%rdx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x28,0xaa,0xc0,0xdf,0xff,0xff]
+          vmovaps -8256(%rdx), %zmm5
+
+// CHECK: vmovdqa32 %zmm18, %zmm22
+// CHECK:  encoding: [0x62,0xa1,0x7d,0x48,0x6f,0xf2]
+          vmovdqa32 %zmm18, %zmm22
+
+// CHECK: vmovdqa32 %zmm18, %zmm22 {%k6}
+// CHECK:  encoding: [0x62,0xa1,0x7d,0x4e,0x6f,0xf2]
+          vmovdqa32 %zmm18, %zmm22 {%k6}
+
+// CHECK: vmovdqa32 %zmm18, %zmm22 {%k6} {z}
+// CHECK:  encoding: [0x62,0xa1,0x7d,0xce,0x6f,0xf2]
+          vmovdqa32 %zmm18, %zmm22 {%k6} {z}
+
+// CHECK: vmovdqa32 (%rcx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x6f,0x31]
+          vmovdqa32 (%rcx), %zmm22
+
+// CHECK: vmovdqa32 291(%rax,%r14,8), %zmm22
+// CHECK:  encoding: [0x62,0xa1,0x7d,0x48,0x6f,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa32 291(%rax,%r14,8), %zmm22
+
+// CHECK: vmovdqa32 8128(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x6f,0x72,0x7f]
+          vmovdqa32 8128(%rdx), %zmm22
+
+// CHECK: vmovdqa32 8192(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x6f,0xb2,0x00,0x20,0x00,0x00]
+          vmovdqa32 8192(%rdx), %zmm22
+
+// CHECK: vmovdqa32 -8192(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x6f,0x72,0x80]
+          vmovdqa32 -8192(%rdx), %zmm22
+
+// CHECK: vmovdqa32 -8256(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x6f,0xb2,0xc0,0xdf,0xff,0xff]
+          vmovdqa32 -8256(%rdx), %zmm22
+
+// CHECK: vmovdqa64 %zmm12, %zmm22
+// CHECK:  encoding: [0x62,0xc1,0xfd,0x48,0x6f,0xf4]
+          vmovdqa64 %zmm12, %zmm22
+
+// CHECK: vmovdqa64 %zmm12, %zmm22 {%k5}
+// CHECK:  encoding: [0x62,0xc1,0xfd,0x4d,0x6f,0xf4]
+          vmovdqa64 %zmm12, %zmm22 {%k5}
+
+// CHECK: vmovdqa64 %zmm12, %zmm22 {%k5} {z}
+// CHECK:  encoding: [0x62,0xc1,0xfd,0xcd,0x6f,0xf4]
+          vmovdqa64 %zmm12, %zmm22 {%k5} {z}
+
+// CHECK: vmovdqa64 (%rcx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x6f,0x31]
+          vmovdqa64 (%rcx), %zmm22
+
+// CHECK: vmovdqa64 291(%rax,%r14,8), %zmm22
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x48,0x6f,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa64 291(%rax,%r14,8), %zmm22
+
+// CHECK: vmovdqa64 8128(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x6f,0x72,0x7f]
+          vmovdqa64 8128(%rdx), %zmm22
+
+// CHECK: vmovdqa64 8192(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x6f,0xb2,0x00,0x20,0x00,0x00]
+          vmovdqa64 8192(%rdx), %zmm22
+
+// CHECK: vmovdqa64 -8192(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x6f,0x72,0x80]
+          vmovdqa64 -8192(%rdx), %zmm22
+
+// CHECK: vmovdqa64 -8256(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x6f,0xb2,0xc0,0xdf,0xff,0xff]
+          vmovdqa64 -8256(%rdx), %zmm22
+
+// CHECK: vmovdqu32 %zmm24, %zmm5
+// CHECK:  encoding: [0x62,0x91,0x7e,0x48,0x6f,0xe8]
+          vmovdqu32 %zmm24, %zmm5
+
+// CHECK: vmovdqu32 %zmm24, %zmm5 {%k5}
+// CHECK:  encoding: [0x62,0x91,0x7e,0x4d,0x6f,0xe8]
+          vmovdqu32 %zmm24, %zmm5 {%k5}
+
+// CHECK: vmovdqu32 %zmm24, %zmm5 {%k5} {z}
+// CHECK:  encoding: [0x62,0x91,0x7e,0xcd,0x6f,0xe8]
+          vmovdqu32 %zmm24, %zmm5 {%k5} {z}
+
+// CHECK: vmovdqu32 (%rcx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7e,0x48,0x6f,0x29]
+          vmovdqu32 (%rcx), %zmm5
+
+// CHECK: vmovdqu32 291(%rax,%r14,8), %zmm5
+// CHECK:  encoding: [0x62,0xb1,0x7e,0x48,0x6f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu32 291(%rax,%r14,8), %zmm5
+
+// CHECK: vmovdqu32 8128(%rdx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7e,0x48,0x6f,0x6a,0x7f]
+          vmovdqu32 8128(%rdx), %zmm5
+
+// CHECK: vmovdqu32 8192(%rdx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7e,0x48,0x6f,0xaa,0x00,0x20,0x00,0x00]
+          vmovdqu32 8192(%rdx), %zmm5
+
+// CHECK: vmovdqu32 -8192(%rdx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7e,0x48,0x6f,0x6a,0x80]
+          vmovdqu32 -8192(%rdx), %zmm5
+
+// CHECK: vmovdqu32 -8256(%rdx), %zmm5
+// CHECK:  encoding: [0x62,0xf1,0x7e,0x48,0x6f,0xaa,0xc0,0xdf,0xff,0xff]
+          vmovdqu32 -8256(%rdx), %zmm5
+
+// CHECK: vmovdqu64 %zmm15, %zmm6
+// CHECK:  encoding: [0x62,0xd1,0xfe,0x48,0x6f,0xf7]
+          vmovdqu64 %zmm15, %zmm6
+
+// CHECK: vmovdqu64 %zmm15, %zmm6 {%k3}
+// CHECK:  encoding: [0x62,0xd1,0xfe,0x4b,0x6f,0xf7]
+          vmovdqu64 %zmm15, %zmm6 {%k3}
+
+// CHECK: vmovdqu64 %zmm15, %zmm6 {%k3} {z}
+// CHECK:  encoding: [0x62,0xd1,0xfe,0xcb,0x6f,0xf7]
+          vmovdqu64 %zmm15, %zmm6 {%k3} {z}
+
+// CHECK: vmovdqu64 (%rcx), %zmm6
+// CHECK:  encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x31]
+          vmovdqu64 (%rcx), %zmm6
+
+// CHECK: vmovdqu64 291(%rax,%r14,8), %zmm6
+// CHECK:  encoding: [0x62,0xb1,0xfe,0x48,0x6f,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu64 291(%rax,%r14,8), %zmm6
+
+// CHECK: vmovdqu64 8128(%rdx), %zmm6
+// CHECK:  encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x72,0x7f]
+          vmovdqu64 8128(%rdx), %zmm6
+
+// CHECK: vmovdqu64 8192(%rdx), %zmm6
+// CHECK:  encoding: [0x62,0xf1,0xfe,0x48,0x6f,0xb2,0x00,0x20,0x00,0x00]
+          vmovdqu64 8192(%rdx), %zmm6
+
+// CHECK: vmovdqu64 -8192(%rdx), %zmm6
+// CHECK:  encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x72,0x80]
+          vmovdqu64 -8192(%rdx), %zmm6
+
+// CHECK: vmovdqu64 -8256(%rdx), %zmm6
+// CHECK:  encoding: [0x62,0xf1,0xfe,0x48,0x6f,0xb2,0xc0,0xdf,0xff,0xff]
+          vmovdqu64 -8256(%rdx), %zmm6
+
+// CHECK: vmovntdq %zmm24, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x48,0xe7,0x01]
+          vmovntdq %zmm24, (%rcx)
+
+// CHECK: vmovntdq %zmm24, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0x7d,0x48,0xe7,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmovntdq %zmm24, 291(%rax,%r14,8)
+
+// CHECK: vmovntdq %zmm24, 8128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x48,0xe7,0x42,0x7f]
+          vmovntdq %zmm24, 8128(%rdx)
+
+// CHECK: vmovntdq %zmm24, 8192(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x48,0xe7,0x82,0x00,0x20,0x00,0x00]
+          vmovntdq %zmm24, 8192(%rdx)
+
+// CHECK: vmovntdq %zmm24, -8192(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x48,0xe7,0x42,0x80]
+          vmovntdq %zmm24, -8192(%rdx)
+
+// CHECK: vmovntdq %zmm24, -8256(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x48,0xe7,0x82,0xc0,0xdf,0xff,0xff]
+          vmovntdq %zmm24, -8256(%rdx)
+
+// CHECK: vmovntdqa (%rcx), %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x09]
+          vmovntdqa (%rcx), %zmm17
+
+// CHECK: vmovntdqa 291(%rax,%r14,8), %zmm17
+// CHECK:  encoding: [0x62,0xa2,0x7d,0x48,0x2a,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovntdqa 291(%rax,%r14,8), %zmm17
+
+// CHECK: vmovntdqa 8128(%rdx), %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x4a,0x7f]
+          vmovntdqa 8128(%rdx), %zmm17
+
+// CHECK: vmovntdqa 8192(%rdx), %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x8a,0x00,0x20,0x00,0x00]
+          vmovntdqa 8192(%rdx), %zmm17
+
+// CHECK: vmovntdqa -8192(%rdx), %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x4a,0x80]
+          vmovntdqa -8192(%rdx), %zmm17
+
+// CHECK: vmovntdqa -8256(%rdx), %zmm17
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x8a,0xc0,0xdf,0xff,0xff]
+          vmovntdqa -8256(%rdx), %zmm17
+
+// CHECK: vmovntpd %zmm17, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x09]
+          vmovntpd %zmm17, (%rcx)
+
+// CHECK: vmovntpd %zmm17, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x48,0x2b,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovntpd %zmm17, 291(%rax,%r14,8)
+
+// CHECK: vmovntpd %zmm17, 8128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x4a,0x7f]
+          vmovntpd %zmm17, 8128(%rdx)
+
+// CHECK: vmovntpd %zmm17, 8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x8a,0x00,0x20,0x00,0x00]
+          vmovntpd %zmm17, 8192(%rdx)
+
+// CHECK: vmovntpd %zmm17, -8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x4a,0x80]
+          vmovntpd %zmm17, -8192(%rdx)
+
+// CHECK: vmovntpd %zmm17, -8256(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x8a,0xc0,0xdf,0xff,0xff]
+          vmovntpd %zmm17, -8256(%rdx)
+
+// CHECK: vmovntps %zmm5, (%rcx)
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x29]
+          vmovntps %zmm5, (%rcx)
+
+// CHECK: vmovntps %zmm5, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xb1,0x7c,0x48,0x2b,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovntps %zmm5, 291(%rax,%r14,8)
+
+// CHECK: vmovntps %zmm5, 8128(%rdx)
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x6a,0x7f]
+          vmovntps %zmm5, 8128(%rdx)
+
+// CHECK: vmovntps %zmm5, 8192(%rdx)
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x2b,0xaa,0x00,0x20,0x00,0x00]
+          vmovntps %zmm5, 8192(%rdx)
+
+// CHECK: vmovntps %zmm5, -8192(%rdx)
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x6a,0x80]
+          vmovntps %zmm5, -8192(%rdx)
+
+// CHECK: vmovntps %zmm5, -8256(%rdx)
+// CHECK:  encoding: [0x62,0xf1,0x7c,0x48,0x2b,0xaa,0xc0,0xdf,0xff,0xff]
+          vmovntps %zmm5, -8256(%rdx)
+
+// CHECK: vmovupd %zmm9, %zmm27
+// CHECK:  encoding: [0x62,0x41,0xfd,0x48,0x10,0xd9]
+          vmovupd %zmm9, %zmm27
+
+// CHECK: vmovupd %zmm9, %zmm27 {%k2}
+// CHECK:  encoding: [0x62,0x41,0xfd,0x4a,0x10,0xd9]
+          vmovupd %zmm9, %zmm27 {%k2}
+
+// CHECK: vmovupd %zmm9, %zmm27 {%k2} {z}
+// CHECK:  encoding: [0x62,0x41,0xfd,0xca,0x10,0xd9]
+          vmovupd %zmm9, %zmm27 {%k2} {z}
+
+// CHECK: vmovupd (%rcx), %zmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x48,0x10,0x19]
+          vmovupd (%rcx), %zmm27
+
+// CHECK: vmovupd 291(%rax,%r14,8), %zmm27
+// CHECK:  encoding: [0x62,0x21,0xfd,0x48,0x10,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vmovupd 291(%rax,%r14,8), %zmm27
+
+// CHECK: vmovupd 8128(%rdx), %zmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x48,0x10,0x5a,0x7f]
+          vmovupd 8128(%rdx), %zmm27
+
+// CHECK: vmovupd 8192(%rdx), %zmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x48,0x10,0x9a,0x00,0x20,0x00,0x00]
+          vmovupd 8192(%rdx), %zmm27
+
+// CHECK: vmovupd -8192(%rdx), %zmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x48,0x10,0x5a,0x80]
+          vmovupd -8192(%rdx), %zmm27
+
+// CHECK: vmovupd -8256(%rdx), %zmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x48,0x10,0x9a,0xc0,0xdf,0xff,0xff]
+          vmovupd -8256(%rdx), %zmm27
+
+// CHECK: vmovups %zmm22, %zmm22
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x48,0x10,0xf6]
+          vmovups %zmm22, %zmm22
+
+// CHECK: vmovups %zmm22, %zmm22 {%k3}
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x4b,0x10,0xf6]
+          vmovups %zmm22, %zmm22 {%k3}
+
+// CHECK: vmovups %zmm22, %zmm22 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa1,0x7c,0xcb,0x10,0xf6]
+          vmovups %zmm22, %zmm22 {%k3} {z}
+
+// CHECK: vmovups (%rcx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x48,0x10,0x31]
+          vmovups (%rcx), %zmm22
+
+// CHECK: vmovups 291(%rax,%r14,8), %zmm22
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x48,0x10,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovups 291(%rax,%r14,8), %zmm22
+
+// CHECK: vmovups 8128(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x48,0x10,0x72,0x7f]
+          vmovups 8128(%rdx), %zmm22
+
+// CHECK: vmovups 8192(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x48,0x10,0xb2,0x00,0x20,0x00,0x00]
+          vmovups 8192(%rdx), %zmm22
+
+// CHECK: vmovups -8192(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x48,0x10,0x72,0x80]
+          vmovups -8192(%rdx), %zmm22
+
+// CHECK: vmovups -8256(%rdx), %zmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x48,0x10,0xb2,0xc0,0xdf,0xff,0xff]
+          vmovups -8256(%rdx), %zmm22
+
 // CHECK: vmulpd %zmm23, %zmm4, %zmm24
 // CHECK:  encoding: [0x62,0x21,0xdd,0x48,0x59,0xc7]
           vmulpd %zmm23, %zmm4, %zmm24
@@ -1041,6 +1513,438 @@
 // CHECK:  encoding: [0x62,0xe1,0xdd,0x58,0xdb,0x8a,0xf8,0xfb,0xff,0xff]
           vpandq -1032(%rdx){1to8}, %zmm4, %zmm17
 
+// CHECK: vpcmpd $171, %zmm10, %zmm25, %k5
+// CHECK:  encoding: [0x62,0xd3,0x35,0x40,0x1f,0xea,0xab]
+          vpcmpd $171, %zmm10, %zmm25, %k5
+
+// CHECK: vpcmpd $171, %zmm10, %zmm25, %k5 {%k3}
+// CHECK:  encoding: [0x62,0xd3,0x35,0x43,0x1f,0xea,0xab]
+          vpcmpd $171, %zmm10, %zmm25, %k5 {%k3}
+
+// CHECK: vpcmpd $123, %zmm10, %zmm25, %k5
+// CHECK:  encoding: [0x62,0xd3,0x35,0x40,0x1f,0xea,0x7b]
+          vpcmpd $123, %zmm10, %zmm25, %k5
+
+// CHECK: vpcmpd $123, (%rcx), %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x29,0x7b]
+          vpcmpd $123, (%rcx), %zmm25, %k5
+
+// CHECK: vpcmpd $123, 291(%rax,%r14,8), %zmm25, %k5
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1f,0xac,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpd $123, 291(%rax,%r14,8), %zmm25, %k5
+
+// CHECK: vpcmpd $123, (%rcx){1to16}, %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x29,0x7b]
+          vpcmpd $123, (%rcx){1to16}, %zmm25, %k5
+
+// CHECK: vpcmpd $123, 8128(%rdx), %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x6a,0x7f,0x7b]
+          vpcmpd $123, 8128(%rdx), %zmm25, %k5
+
+// CHECK: vpcmpd $123, 8192(%rdx), %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0xaa,0x00,0x20,0x00,0x00,0x7b]
+          vpcmpd $123, 8192(%rdx), %zmm25, %k5
+
+// CHECK: vpcmpd $123, -8192(%rdx), %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0x6a,0x80,0x7b]
+          vpcmpd $123, -8192(%rdx), %zmm25, %k5
+
+// CHECK: vpcmpd $123, -8256(%rdx), %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1f,0xaa,0xc0,0xdf,0xff,0xff,0x7b]
+          vpcmpd $123, -8256(%rdx), %zmm25, %k5
+
+// CHECK: vpcmpd $123, 508(%rdx){1to16}, %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x6a,0x7f,0x7b]
+          vpcmpd $123, 508(%rdx){1to16}, %zmm25, %k5
+
+// CHECK: vpcmpd $123, 512(%rdx){1to16}, %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0xaa,0x00,0x02,0x00,0x00,0x7b]
+          vpcmpd $123, 512(%rdx){1to16}, %zmm25, %k5
+
+// CHECK: vpcmpd $123, -512(%rdx){1to16}, %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0x6a,0x80,0x7b]
+          vpcmpd $123, -512(%rdx){1to16}, %zmm25, %k5
+
+// CHECK: vpcmpd $123, -516(%rdx){1to16}, %zmm25, %k5
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1f,0xaa,0xfc,0xfd,0xff,0xff,0x7b]
+          vpcmpd $123, -516(%rdx){1to16}, %zmm25, %k5
+
+// CHECK: vpcmpeqd %zmm10, %zmm2, %k5
+// CHECK:  encoding: [0x62,0xd1,0x6d,0x48,0x76,0xea]
+          vpcmpeqd %zmm10, %zmm2, %k5
+
+// CHECK: vpcmpeqd %zmm10, %zmm2, %k5 {%k7}
+// CHECK:  encoding: [0x62,0xd1,0x6d,0x4f,0x76,0xea]
+          vpcmpeqd %zmm10, %zmm2, %k5 {%k7}
+
+// CHECK: vpcmpeqd (%rcx), %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x48,0x76,0x29]
+          vpcmpeqd (%rcx), %zmm2, %k5
+
+// CHECK: vpcmpeqd 291(%rax,%r14,8), %zmm2, %k5
+// CHECK:  encoding: [0x62,0xb1,0x6d,0x48,0x76,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqd 291(%rax,%r14,8), %zmm2, %k5
+
+// CHECK: vpcmpeqd (%rcx){1to16}, %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x58,0x76,0x29]
+          vpcmpeqd (%rcx){1to16}, %zmm2, %k5
+
+// CHECK: vpcmpeqd 8128(%rdx), %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x48,0x76,0x6a,0x7f]
+          vpcmpeqd 8128(%rdx), %zmm2, %k5
+
+// CHECK: vpcmpeqd 8192(%rdx), %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x48,0x76,0xaa,0x00,0x20,0x00,0x00]
+          vpcmpeqd 8192(%rdx), %zmm2, %k5
+
+// CHECK: vpcmpeqd -8192(%rdx), %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x48,0x76,0x6a,0x80]
+          vpcmpeqd -8192(%rdx), %zmm2, %k5
+
+// CHECK: vpcmpeqd -8256(%rdx), %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x48,0x76,0xaa,0xc0,0xdf,0xff,0xff]
+          vpcmpeqd -8256(%rdx), %zmm2, %k5
+
+// CHECK: vpcmpeqd 508(%rdx){1to16}, %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x58,0x76,0x6a,0x7f]
+          vpcmpeqd 508(%rdx){1to16}, %zmm2, %k5
+
+// CHECK: vpcmpeqd 512(%rdx){1to16}, %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x58,0x76,0xaa,0x00,0x02,0x00,0x00]
+          vpcmpeqd 512(%rdx){1to16}, %zmm2, %k5
+
+// CHECK: vpcmpeqd -512(%rdx){1to16}, %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x58,0x76,0x6a,0x80]
+          vpcmpeqd -512(%rdx){1to16}, %zmm2, %k5
+
+// CHECK: vpcmpeqd -516(%rdx){1to16}, %zmm2, %k5
+// CHECK:  encoding: [0x62,0xf1,0x6d,0x58,0x76,0xaa,0xfc,0xfd,0xff,0xff]
+          vpcmpeqd -516(%rdx){1to16}, %zmm2, %k5
+
+// CHECK: vpcmpeqq %zmm2, %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x40,0x29,0xda]
+          vpcmpeqq %zmm2, %zmm22, %k3
+
+// CHECK: vpcmpeqq %zmm2, %zmm22, %k3 {%k6}
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x46,0x29,0xda]
+          vpcmpeqq %zmm2, %zmm22, %k3 {%k6}
+
+// CHECK: vpcmpeqq (%rcx), %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x40,0x29,0x19]
+          vpcmpeqq (%rcx), %zmm22, %k3
+
+// CHECK: vpcmpeqq 291(%rax,%r14,8), %zmm22, %k3
+// CHECK:  encoding: [0x62,0xb2,0xcd,0x40,0x29,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqq 291(%rax,%r14,8), %zmm22, %k3
+
+// CHECK: vpcmpeqq (%rcx){1to8}, %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x50,0x29,0x19]
+          vpcmpeqq (%rcx){1to8}, %zmm22, %k3
+
+// CHECK: vpcmpeqq 8128(%rdx), %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x40,0x29,0x5a,0x7f]
+          vpcmpeqq 8128(%rdx), %zmm22, %k3
+
+// CHECK: vpcmpeqq 8192(%rdx), %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x40,0x29,0x9a,0x00,0x20,0x00,0x00]
+          vpcmpeqq 8192(%rdx), %zmm22, %k3
+
+// CHECK: vpcmpeqq -8192(%rdx), %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x40,0x29,0x5a,0x80]
+          vpcmpeqq -8192(%rdx), %zmm22, %k3
+
+// CHECK: vpcmpeqq -8256(%rdx), %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x40,0x29,0x9a,0xc0,0xdf,0xff,0xff]
+          vpcmpeqq -8256(%rdx), %zmm22, %k3
+
+// CHECK: vpcmpeqq 1016(%rdx){1to8}, %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x50,0x29,0x5a,0x7f]
+          vpcmpeqq 1016(%rdx){1to8}, %zmm22, %k3
+
+// CHECK: vpcmpeqq 1024(%rdx){1to8}, %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x50,0x29,0x9a,0x00,0x04,0x00,0x00]
+          vpcmpeqq 1024(%rdx){1to8}, %zmm22, %k3
+
+// CHECK: vpcmpeqq -1024(%rdx){1to8}, %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x50,0x29,0x5a,0x80]
+          vpcmpeqq -1024(%rdx){1to8}, %zmm22, %k3
+
+// CHECK: vpcmpeqq -1032(%rdx){1to8}, %zmm22, %k3
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x50,0x29,0x9a,0xf8,0xfb,0xff,0xff]
+          vpcmpeqq -1032(%rdx){1to8}, %zmm22, %k3
+
+// CHECK: vpcmpgtd %zmm8, %zmm21, %k5
+// CHECK:  encoding: [0x62,0xd1,0x55,0x40,0x66,0xe8]
+          vpcmpgtd %zmm8, %zmm21, %k5
+
+// CHECK: vpcmpgtd %zmm8, %zmm21, %k5 {%k5}
+// CHECK:  encoding: [0x62,0xd1,0x55,0x45,0x66,0xe8]
+          vpcmpgtd %zmm8, %zmm21, %k5 {%k5}
+
+// CHECK: vpcmpgtd (%rcx), %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x40,0x66,0x29]
+          vpcmpgtd (%rcx), %zmm21, %k5
+
+// CHECK: vpcmpgtd 291(%rax,%r14,8), %zmm21, %k5
+// CHECK:  encoding: [0x62,0xb1,0x55,0x40,0x66,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtd 291(%rax,%r14,8), %zmm21, %k5
+
+// CHECK: vpcmpgtd (%rcx){1to16}, %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x50,0x66,0x29]
+          vpcmpgtd (%rcx){1to16}, %zmm21, %k5
+
+// CHECK: vpcmpgtd 8128(%rdx), %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x40,0x66,0x6a,0x7f]
+          vpcmpgtd 8128(%rdx), %zmm21, %k5
+
+// CHECK: vpcmpgtd 8192(%rdx), %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x40,0x66,0xaa,0x00,0x20,0x00,0x00]
+          vpcmpgtd 8192(%rdx), %zmm21, %k5
+
+// CHECK: vpcmpgtd -8192(%rdx), %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x40,0x66,0x6a,0x80]
+          vpcmpgtd -8192(%rdx), %zmm21, %k5
+
+// CHECK: vpcmpgtd -8256(%rdx), %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x40,0x66,0xaa,0xc0,0xdf,0xff,0xff]
+          vpcmpgtd -8256(%rdx), %zmm21, %k5
+
+// CHECK: vpcmpgtd 508(%rdx){1to16}, %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x50,0x66,0x6a,0x7f]
+          vpcmpgtd 508(%rdx){1to16}, %zmm21, %k5
+
+// CHECK: vpcmpgtd 512(%rdx){1to16}, %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x50,0x66,0xaa,0x00,0x02,0x00,0x00]
+          vpcmpgtd 512(%rdx){1to16}, %zmm21, %k5
+
+// CHECK: vpcmpgtd -512(%rdx){1to16}, %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x50,0x66,0x6a,0x80]
+          vpcmpgtd -512(%rdx){1to16}, %zmm21, %k5
+
+// CHECK: vpcmpgtd -516(%rdx){1to16}, %zmm21, %k5
+// CHECK:  encoding: [0x62,0xf1,0x55,0x50,0x66,0xaa,0xfc,0xfd,0xff,0xff]
+          vpcmpgtd -516(%rdx){1to16}, %zmm21, %k5
+
+// CHECK: vpcmpgtq %zmm17, %zmm20, %k2
+// CHECK:  encoding: [0x62,0xb2,0xdd,0x40,0x37,0xd1]
+          vpcmpgtq %zmm17, %zmm20, %k2
+
+// CHECK: vpcmpgtq %zmm17, %zmm20, %k2 {%k3}
+// CHECK:  encoding: [0x62,0xb2,0xdd,0x43,0x37,0xd1]
+          vpcmpgtq %zmm17, %zmm20, %k2 {%k3}
+
+// CHECK: vpcmpgtq (%rcx), %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x40,0x37,0x11]
+          vpcmpgtq (%rcx), %zmm20, %k2
+
+// CHECK: vpcmpgtq 291(%rax,%r14,8), %zmm20, %k2
+// CHECK:  encoding: [0x62,0xb2,0xdd,0x40,0x37,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtq 291(%rax,%r14,8), %zmm20, %k2
+
+// CHECK: vpcmpgtq (%rcx){1to8}, %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x50,0x37,0x11]
+          vpcmpgtq (%rcx){1to8}, %zmm20, %k2
+
+// CHECK: vpcmpgtq 8128(%rdx), %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x40,0x37,0x52,0x7f]
+          vpcmpgtq 8128(%rdx), %zmm20, %k2
+
+// CHECK: vpcmpgtq 8192(%rdx), %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x40,0x37,0x92,0x00,0x20,0x00,0x00]
+          vpcmpgtq 8192(%rdx), %zmm20, %k2
+
+// CHECK: vpcmpgtq -8192(%rdx), %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x40,0x37,0x52,0x80]
+          vpcmpgtq -8192(%rdx), %zmm20, %k2
+
+// CHECK: vpcmpgtq -8256(%rdx), %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x40,0x37,0x92,0xc0,0xdf,0xff,0xff]
+          vpcmpgtq -8256(%rdx), %zmm20, %k2
+
+// CHECK: vpcmpgtq 1016(%rdx){1to8}, %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x50,0x37,0x52,0x7f]
+          vpcmpgtq 1016(%rdx){1to8}, %zmm20, %k2
+
+// CHECK: vpcmpgtq 1024(%rdx){1to8}, %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x50,0x37,0x92,0x00,0x04,0x00,0x00]
+          vpcmpgtq 1024(%rdx){1to8}, %zmm20, %k2
+
+// CHECK: vpcmpgtq -1024(%rdx){1to8}, %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x50,0x37,0x52,0x80]
+          vpcmpgtq -1024(%rdx){1to8}, %zmm20, %k2
+
+// CHECK: vpcmpgtq -1032(%rdx){1to8}, %zmm20, %k2
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x50,0x37,0x92,0xf8,0xfb,0xff,0xff]
+          vpcmpgtq -1032(%rdx){1to8}, %zmm20, %k2
+
+// CHECK: vpcmpq $171, %zmm28, %zmm28, %k5
+// CHECK:  encoding: [0x62,0x93,0x9d,0x40,0x1f,0xec,0xab]
+          vpcmpq $171, %zmm28, %zmm28, %k5
+
+// CHECK: vpcmpq $171, %zmm28, %zmm28, %k5 {%k3}
+// CHECK:  encoding: [0x62,0x93,0x9d,0x43,0x1f,0xec,0xab]
+          vpcmpq $171, %zmm28, %zmm28, %k5 {%k3}
+
+// CHECK: vpcmpq $123, %zmm28, %zmm28, %k5
+// CHECK:  encoding: [0x62,0x93,0x9d,0x40,0x1f,0xec,0x7b]
+          vpcmpq $123, %zmm28, %zmm28, %k5
+
+// CHECK: vpcmpq $123, (%rcx), %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x40,0x1f,0x29,0x7b]
+          vpcmpq $123, (%rcx), %zmm28, %k5
+
+// CHECK: vpcmpq $123, 291(%rax,%r14,8), %zmm28, %k5
+// CHECK:  encoding: [0x62,0xb3,0x9d,0x40,0x1f,0xac,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpq $123, 291(%rax,%r14,8), %zmm28, %k5
+
+// CHECK: vpcmpq $123, (%rcx){1to8}, %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x50,0x1f,0x29,0x7b]
+          vpcmpq $123, (%rcx){1to8}, %zmm28, %k5
+
+// CHECK: vpcmpq $123, 8128(%rdx), %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x40,0x1f,0x6a,0x7f,0x7b]
+          vpcmpq $123, 8128(%rdx), %zmm28, %k5
+
+// CHECK: vpcmpq $123, 8192(%rdx), %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x40,0x1f,0xaa,0x00,0x20,0x00,0x00,0x7b]
+          vpcmpq $123, 8192(%rdx), %zmm28, %k5
+
+// CHECK: vpcmpq $123, -8192(%rdx), %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x40,0x1f,0x6a,0x80,0x7b]
+          vpcmpq $123, -8192(%rdx), %zmm28, %k5
+
+// CHECK: vpcmpq $123, -8256(%rdx), %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x40,0x1f,0xaa,0xc0,0xdf,0xff,0xff,0x7b]
+          vpcmpq $123, -8256(%rdx), %zmm28, %k5
+
+// CHECK: vpcmpq $123, 1016(%rdx){1to8}, %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x50,0x1f,0x6a,0x7f,0x7b]
+          vpcmpq $123, 1016(%rdx){1to8}, %zmm28, %k5
+
+// CHECK: vpcmpq $123, 1024(%rdx){1to8}, %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x50,0x1f,0xaa,0x00,0x04,0x00,0x00,0x7b]
+          vpcmpq $123, 1024(%rdx){1to8}, %zmm28, %k5
+
+// CHECK: vpcmpq $123, -1024(%rdx){1to8}, %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x50,0x1f,0x6a,0x80,0x7b]
+          vpcmpq $123, -1024(%rdx){1to8}, %zmm28, %k5
+
+// CHECK: vpcmpq $123, -1032(%rdx){1to8}, %zmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x50,0x1f,0xaa,0xf8,0xfb,0xff,0xff,0x7b]
+          vpcmpq $123, -1032(%rdx){1to8}, %zmm28, %k5
+
+// CHECK: vpcmpud $171, %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0xd7,0xab]
+          vpcmpud $171, %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpud $171, %zmm7, %zmm25, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xf3,0x35,0x41,0x1e,0xd7,0xab]
+          vpcmpud $171, %zmm7, %zmm25, %k2 {%k1}
+
+// CHECK: vpcmpud $123, %zmm7, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0xd7,0x7b]
+          vpcmpud $123, %zmm7, %zmm25, %k2
+
+// CHECK: vpcmpud $123, (%rcx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x11,0x7b]
+          vpcmpud $123, (%rcx), %zmm25, %k2
+
+// CHECK: vpcmpud $123, 291(%rax,%r14,8), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xb3,0x35,0x40,0x1e,0x94,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpud $123, 291(%rax,%r14,8), %zmm25, %k2
+
+// CHECK: vpcmpud $123, (%rcx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x11,0x7b]
+          vpcmpud $123, (%rcx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpud $123, 8128(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x7f,0x7b]
+          vpcmpud $123, 8128(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpud $123, 8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0x00,0x20,0x00,0x00,0x7b]
+          vpcmpud $123, 8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpud $123, -8192(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x52,0x80,0x7b]
+          vpcmpud $123, -8192(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpud $123, -8256(%rdx), %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x40,0x1e,0x92,0xc0,0xdf,0xff,0xff,0x7b]
+          vpcmpud $123, -8256(%rdx), %zmm25, %k2
+
+// CHECK: vpcmpud $123, 508(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x7f,0x7b]
+          vpcmpud $123, 508(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpud $123, 512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0x00,0x02,0x00,0x00,0x7b]
+          vpcmpud $123, 512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpud $123, -512(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x52,0x80,0x7b]
+          vpcmpud $123, -512(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpud $123, -516(%rdx){1to16}, %zmm25, %k2
+// CHECK:  encoding: [0x62,0xf3,0x35,0x50,0x1e,0x92,0xfc,0xfd,0xff,0xff,0x7b]
+          vpcmpud $123, -516(%rdx){1to16}, %zmm25, %k2
+
+// CHECK: vpcmpuq $171, %zmm8, %zmm14, %k3
+// CHECK:  encoding: [0x62,0xd3,0x8d,0x48,0x1e,0xd8,0xab]
+          vpcmpuq $171, %zmm8, %zmm14, %k3
+
+// CHECK: vpcmpuq $171, %zmm8, %zmm14, %k3 {%k2}
+// CHECK:  encoding: [0x62,0xd3,0x8d,0x4a,0x1e,0xd8,0xab]
+          vpcmpuq $171, %zmm8, %zmm14, %k3 {%k2}
+
+// CHECK: vpcmpuq $123, %zmm8, %zmm14, %k3
+// CHECK:  encoding: [0x62,0xd3,0x8d,0x48,0x1e,0xd8,0x7b]
+          vpcmpuq $123, %zmm8, %zmm14, %k3
+
+// CHECK: vpcmpuq $123, (%rcx), %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x19,0x7b]
+          vpcmpuq $123, (%rcx), %zmm14, %k3
+
+// CHECK: vpcmpuq $123, 291(%rax,%r14,8), %zmm14, %k3
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x48,0x1e,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpuq $123, 291(%rax,%r14,8), %zmm14, %k3
+
+// CHECK: vpcmpuq $123, (%rcx){1to8}, %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x19,0x7b]
+          vpcmpuq $123, (%rcx){1to8}, %zmm14, %k3
+
+// CHECK: vpcmpuq $123, 8128(%rdx), %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x5a,0x7f,0x7b]
+          vpcmpuq $123, 8128(%rdx), %zmm14, %k3
+
+// CHECK: vpcmpuq $123, 8192(%rdx), %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x9a,0x00,0x20,0x00,0x00,0x7b]
+          vpcmpuq $123, 8192(%rdx), %zmm14, %k3
+
+// CHECK: vpcmpuq $123, -8192(%rdx), %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x5a,0x80,0x7b]
+          vpcmpuq $123, -8192(%rdx), %zmm14, %k3
+
+// CHECK: vpcmpuq $123, -8256(%rdx), %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x48,0x1e,0x9a,0xc0,0xdf,0xff,0xff,0x7b]
+          vpcmpuq $123, -8256(%rdx), %zmm14, %k3
+
+// CHECK: vpcmpuq $123, 1016(%rdx){1to8}, %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x5a,0x7f,0x7b]
+          vpcmpuq $123, 1016(%rdx){1to8}, %zmm14, %k3
+
+// CHECK: vpcmpuq $123, 1024(%rdx){1to8}, %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x9a,0x00,0x04,0x00,0x00,0x7b]
+          vpcmpuq $123, 1024(%rdx){1to8}, %zmm14, %k3
+
+// CHECK: vpcmpuq $123, -1024(%rdx){1to8}, %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x5a,0x80,0x7b]
+          vpcmpuq $123, -1024(%rdx){1to8}, %zmm14, %k3
+
+// CHECK: vpcmpuq $123, -1032(%rdx){1to8}, %zmm14, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x58,0x1e,0x9a,0xf8,0xfb,0xff,0xff,0x7b]
+          vpcmpuq $123, -1032(%rdx){1to8}, %zmm14, %k3
+
 // CHECK: vpmaxsd %zmm16, %zmm8, %zmm6
 // CHECK:  encoding: [0x62,0xb2,0x3d,0x48,0x3d,0xf0]
           vpmaxsd %zmm16, %zmm8, %zmm6
@@ -2377,6 +3281,342 @@
 // CHECK:  encoding: [0x62,0xf1,0xcd,0x50,0xef,0xba,0xf8,0xfb,0xff,0xff]
           vpxorq -1032(%rdx){1to8}, %zmm22, %zmm7
 
+// CHECK: vrcp14pd %zmm4, %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x48,0x4c,0xec]
+          vrcp14pd %zmm4, %zmm13
+
+// CHECK: vrcp14pd %zmm4, %zmm13 {%k5}
+// CHECK:  encoding: [0x62,0x72,0xfd,0x4d,0x4c,0xec]
+          vrcp14pd %zmm4, %zmm13 {%k5}
+
+// CHECK: vrcp14pd %zmm4, %zmm13 {%k5} {z}
+// CHECK:  encoding: [0x62,0x72,0xfd,0xcd,0x4c,0xec]
+          vrcp14pd %zmm4, %zmm13 {%k5} {z}
+
+// CHECK: vrcp14pd (%rcx), %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x48,0x4c,0x29]
+          vrcp14pd (%rcx), %zmm13
+
+// CHECK: vrcp14pd 291(%rax,%r14,8), %zmm13
+// CHECK:  encoding: [0x62,0x32,0xfd,0x48,0x4c,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vrcp14pd 291(%rax,%r14,8), %zmm13
+
+// CHECK: vrcp14pd (%rcx){1to8}, %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x58,0x4c,0x29]
+          vrcp14pd (%rcx){1to8}, %zmm13
+
+// CHECK: vrcp14pd 8128(%rdx), %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x48,0x4c,0x6a,0x7f]
+          vrcp14pd 8128(%rdx), %zmm13
+
+// CHECK: vrcp14pd 8192(%rdx), %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x48,0x4c,0xaa,0x00,0x20,0x00,0x00]
+          vrcp14pd 8192(%rdx), %zmm13
+
+// CHECK: vrcp14pd -8192(%rdx), %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x48,0x4c,0x6a,0x80]
+          vrcp14pd -8192(%rdx), %zmm13
+
+// CHECK: vrcp14pd -8256(%rdx), %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x48,0x4c,0xaa,0xc0,0xdf,0xff,0xff]
+          vrcp14pd -8256(%rdx), %zmm13
+
+// CHECK: vrcp14pd 1016(%rdx){1to8}, %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x58,0x4c,0x6a,0x7f]
+          vrcp14pd 1016(%rdx){1to8}, %zmm13
+
+// CHECK: vrcp14pd 1024(%rdx){1to8}, %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x58,0x4c,0xaa,0x00,0x04,0x00,0x00]
+          vrcp14pd 1024(%rdx){1to8}, %zmm13
+
+// CHECK: vrcp14pd -1024(%rdx){1to8}, %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x58,0x4c,0x6a,0x80]
+          vrcp14pd -1024(%rdx){1to8}, %zmm13
+
+// CHECK: vrcp14pd -1032(%rdx){1to8}, %zmm13
+// CHECK:  encoding: [0x62,0x72,0xfd,0x58,0x4c,0xaa,0xf8,0xfb,0xff,0xff]
+          vrcp14pd -1032(%rdx){1to8}, %zmm13
+
+// CHECK: vrcp14ps %zmm25, %zmm10
+// CHECK:  encoding: [0x62,0x12,0x7d,0x48,0x4c,0xd1]
+          vrcp14ps %zmm25, %zmm10
+
+// CHECK: vrcp14ps %zmm25, %zmm10 {%k1}
+// CHECK:  encoding: [0x62,0x12,0x7d,0x49,0x4c,0xd1]
+          vrcp14ps %zmm25, %zmm10 {%k1}
+
+// CHECK: vrcp14ps %zmm25, %zmm10 {%k1} {z}
+// CHECK:  encoding: [0x62,0x12,0x7d,0xc9,0x4c,0xd1]
+          vrcp14ps %zmm25, %zmm10 {%k1} {z}
+
+// CHECK: vrcp14ps (%rcx), %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x48,0x4c,0x11]
+          vrcp14ps (%rcx), %zmm10
+
+// CHECK: vrcp14ps 291(%rax,%r14,8), %zmm10
+// CHECK:  encoding: [0x62,0x32,0x7d,0x48,0x4c,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vrcp14ps 291(%rax,%r14,8), %zmm10
+
+// CHECK: vrcp14ps (%rcx){1to16}, %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x58,0x4c,0x11]
+          vrcp14ps (%rcx){1to16}, %zmm10
+
+// CHECK: vrcp14ps 8128(%rdx), %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x48,0x4c,0x52,0x7f]
+          vrcp14ps 8128(%rdx), %zmm10
+
+// CHECK: vrcp14ps 8192(%rdx), %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x48,0x4c,0x92,0x00,0x20,0x00,0x00]
+          vrcp14ps 8192(%rdx), %zmm10
+
+// CHECK: vrcp14ps -8192(%rdx), %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x48,0x4c,0x52,0x80]
+          vrcp14ps -8192(%rdx), %zmm10
+
+// CHECK: vrcp14ps -8256(%rdx), %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x48,0x4c,0x92,0xc0,0xdf,0xff,0xff]
+          vrcp14ps -8256(%rdx), %zmm10
+
+// CHECK: vrcp14ps 508(%rdx){1to16}, %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x58,0x4c,0x52,0x7f]
+          vrcp14ps 508(%rdx){1to16}, %zmm10
+
+// CHECK: vrcp14ps 512(%rdx){1to16}, %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x58,0x4c,0x92,0x00,0x02,0x00,0x00]
+          vrcp14ps 512(%rdx){1to16}, %zmm10
+
+// CHECK: vrcp14ps -512(%rdx){1to16}, %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x58,0x4c,0x52,0x80]
+          vrcp14ps -512(%rdx){1to16}, %zmm10
+
+// CHECK: vrcp14ps -516(%rdx){1to16}, %zmm10
+// CHECK:  encoding: [0x62,0x72,0x7d,0x58,0x4c,0x92,0xfc,0xfd,0xff,0xff]
+          vrcp14ps -516(%rdx){1to16}, %zmm10
+
+// CHECK: vrsqrt14pd %zmm14, %zmm19
+// CHECK:  encoding: [0x62,0xc2,0xfd,0x48,0x4e,0xde]
+          vrsqrt14pd %zmm14, %zmm19
+
+// CHECK: vrsqrt14pd %zmm14, %zmm19 {%k1}
+// CHECK:  encoding: [0x62,0xc2,0xfd,0x49,0x4e,0xde]
+          vrsqrt14pd %zmm14, %zmm19 {%k1}
+
+// CHECK: vrsqrt14pd %zmm14, %zmm19 {%k1} {z}
+// CHECK:  encoding: [0x62,0xc2,0xfd,0xc9,0x4e,0xde]
+          vrsqrt14pd %zmm14, %zmm19 {%k1} {z}
+
+// CHECK: vrsqrt14pd (%rcx), %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x48,0x4e,0x19]
+          vrsqrt14pd (%rcx), %zmm19
+
+// CHECK: vrsqrt14pd 291(%rax,%r14,8), %zmm19
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x48,0x4e,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vrsqrt14pd 291(%rax,%r14,8), %zmm19
+
+// CHECK: vrsqrt14pd (%rcx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x58,0x4e,0x19]
+          vrsqrt14pd (%rcx){1to8}, %zmm19
+
+// CHECK: vrsqrt14pd 8128(%rdx), %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x48,0x4e,0x5a,0x7f]
+          vrsqrt14pd 8128(%rdx), %zmm19
+
+// CHECK: vrsqrt14pd 8192(%rdx), %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x48,0x4e,0x9a,0x00,0x20,0x00,0x00]
+          vrsqrt14pd 8192(%rdx), %zmm19
+
+// CHECK: vrsqrt14pd -8192(%rdx), %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x48,0x4e,0x5a,0x80]
+          vrsqrt14pd -8192(%rdx), %zmm19
+
+// CHECK: vrsqrt14pd -8256(%rdx), %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x48,0x4e,0x9a,0xc0,0xdf,0xff,0xff]
+          vrsqrt14pd -8256(%rdx), %zmm19
+
+// CHECK: vrsqrt14pd 1016(%rdx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x58,0x4e,0x5a,0x7f]
+          vrsqrt14pd 1016(%rdx){1to8}, %zmm19
+
+// CHECK: vrsqrt14pd 1024(%rdx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x58,0x4e,0x9a,0x00,0x04,0x00,0x00]
+          vrsqrt14pd 1024(%rdx){1to8}, %zmm19
+
+// CHECK: vrsqrt14pd -1024(%rdx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x58,0x4e,0x5a,0x80]
+          vrsqrt14pd -1024(%rdx){1to8}, %zmm19
+
+// CHECK: vrsqrt14pd -1032(%rdx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x58,0x4e,0x9a,0xf8,0xfb,0xff,0xff]
+          vrsqrt14pd -1032(%rdx){1to8}, %zmm19
+
+// CHECK: vrsqrt14ps %zmm9, %zmm16
+// CHECK:  encoding: [0x62,0xc2,0x7d,0x48,0x4e,0xc1]
+          vrsqrt14ps %zmm9, %zmm16
+
+// CHECK: vrsqrt14ps %zmm9, %zmm16 {%k5}
+// CHECK:  encoding: [0x62,0xc2,0x7d,0x4d,0x4e,0xc1]
+          vrsqrt14ps %zmm9, %zmm16 {%k5}
+
+// CHECK: vrsqrt14ps %zmm9, %zmm16 {%k5} {z}
+// CHECK:  encoding: [0x62,0xc2,0x7d,0xcd,0x4e,0xc1]
+          vrsqrt14ps %zmm9, %zmm16 {%k5} {z}
+
+// CHECK: vrsqrt14ps (%rcx), %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x4e,0x01]
+          vrsqrt14ps (%rcx), %zmm16
+
+// CHECK: vrsqrt14ps 291(%rax,%r14,8), %zmm16
+// CHECK:  encoding: [0x62,0xa2,0x7d,0x48,0x4e,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vrsqrt14ps 291(%rax,%r14,8), %zmm16
+
+// CHECK: vrsqrt14ps (%rcx){1to16}, %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x58,0x4e,0x01]
+          vrsqrt14ps (%rcx){1to16}, %zmm16
+
+// CHECK: vrsqrt14ps 8128(%rdx), %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x4e,0x42,0x7f]
+          vrsqrt14ps 8128(%rdx), %zmm16
+
+// CHECK: vrsqrt14ps 8192(%rdx), %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x4e,0x82,0x00,0x20,0x00,0x00]
+          vrsqrt14ps 8192(%rdx), %zmm16
+
+// CHECK: vrsqrt14ps -8192(%rdx), %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x4e,0x42,0x80]
+          vrsqrt14ps -8192(%rdx), %zmm16
+
+// CHECK: vrsqrt14ps -8256(%rdx), %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x48,0x4e,0x82,0xc0,0xdf,0xff,0xff]
+          vrsqrt14ps -8256(%rdx), %zmm16
+
+// CHECK: vrsqrt14ps 508(%rdx){1to16}, %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x58,0x4e,0x42,0x7f]
+          vrsqrt14ps 508(%rdx){1to16}, %zmm16
+
+// CHECK: vrsqrt14ps 512(%rdx){1to16}, %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x58,0x4e,0x82,0x00,0x02,0x00,0x00]
+          vrsqrt14ps 512(%rdx){1to16}, %zmm16
+
+// CHECK: vrsqrt14ps -512(%rdx){1to16}, %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x58,0x4e,0x42,0x80]
+          vrsqrt14ps -512(%rdx){1to16}, %zmm16
+
+// CHECK: vrsqrt14ps -516(%rdx){1to16}, %zmm16
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x58,0x4e,0x82,0xfc,0xfd,0xff,0xff]
+          vrsqrt14ps -516(%rdx){1to16}, %zmm16
+
+// CHECK: vsqrtpd %zmm19, %zmm19
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x48,0x51,0xdb]
+          vsqrtpd %zmm19, %zmm19
+
+// CHECK: vsqrtpd %zmm19, %zmm19 {%k5}
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x4d,0x51,0xdb]
+          vsqrtpd %zmm19, %zmm19 {%k5}
+
+// CHECK: vsqrtpd %zmm19, %zmm19 {%k5} {z}
+// CHECK:  encoding: [0x62,0xa1,0xfd,0xcd,0x51,0xdb]
+          vsqrtpd %zmm19, %zmm19 {%k5} {z}
+
+// CHECK: vsqrtpd (%rcx), %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x51,0x19]
+          vsqrtpd (%rcx), %zmm19
+
+// CHECK: vsqrtpd 291(%rax,%r14,8), %zmm19
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x48,0x51,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vsqrtpd 291(%rax,%r14,8), %zmm19
+
+// CHECK: vsqrtpd (%rcx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x58,0x51,0x19]
+          vsqrtpd (%rcx){1to8}, %zmm19
+
+// CHECK: vsqrtpd 8128(%rdx), %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x51,0x5a,0x7f]
+          vsqrtpd 8128(%rdx), %zmm19
+
+// CHECK: vsqrtpd 8192(%rdx), %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x51,0x9a,0x00,0x20,0x00,0x00]
+          vsqrtpd 8192(%rdx), %zmm19
+
+// CHECK: vsqrtpd -8192(%rdx), %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x51,0x5a,0x80]
+          vsqrtpd -8192(%rdx), %zmm19
+
+// CHECK: vsqrtpd -8256(%rdx), %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x51,0x9a,0xc0,0xdf,0xff,0xff]
+          vsqrtpd -8256(%rdx), %zmm19
+
+// CHECK: vsqrtpd 1016(%rdx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x58,0x51,0x5a,0x7f]
+          vsqrtpd 1016(%rdx){1to8}, %zmm19
+
+// CHECK: vsqrtpd 1024(%rdx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x58,0x51,0x9a,0x00,0x04,0x00,0x00]
+          vsqrtpd 1024(%rdx){1to8}, %zmm19
+
+// CHECK: vsqrtpd -1024(%rdx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x58,0x51,0x5a,0x80]
+          vsqrtpd -1024(%rdx){1to8}, %zmm19
+
+// CHECK: vsqrtpd -1032(%rdx){1to8}, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x58,0x51,0x9a,0xf8,0xfb,0xff,0xff]
+          vsqrtpd -1032(%rdx){1to8}, %zmm19
+
+// CHECK: vsqrtps %zmm29, %zmm28
+// CHECK:  encoding: [0x62,0x01,0x7c,0x48,0x51,0xe5]
+          vsqrtps %zmm29, %zmm28
+
+// CHECK: vsqrtps %zmm29, %zmm28 {%k3}
+// CHECK:  encoding: [0x62,0x01,0x7c,0x4b,0x51,0xe5]
+          vsqrtps %zmm29, %zmm28 {%k3}
+
+// CHECK: vsqrtps %zmm29, %zmm28 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0x7c,0xcb,0x51,0xe5]
+          vsqrtps %zmm29, %zmm28 {%k3} {z}
+
+// CHECK: vsqrtps (%rcx), %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x51,0x21]
+          vsqrtps (%rcx), %zmm28
+
+// CHECK: vsqrtps 291(%rax,%r14,8), %zmm28
+// CHECK:  encoding: [0x62,0x21,0x7c,0x48,0x51,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vsqrtps 291(%rax,%r14,8), %zmm28
+
+// CHECK: vsqrtps (%rcx){1to16}, %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x58,0x51,0x21]
+          vsqrtps (%rcx){1to16}, %zmm28
+
+// CHECK: vsqrtps 8128(%rdx), %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x51,0x62,0x7f]
+          vsqrtps 8128(%rdx), %zmm28
+
+// CHECK: vsqrtps 8192(%rdx), %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x51,0xa2,0x00,0x20,0x00,0x00]
+          vsqrtps 8192(%rdx), %zmm28
+
+// CHECK: vsqrtps -8192(%rdx), %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x51,0x62,0x80]
+          vsqrtps -8192(%rdx), %zmm28
+
+// CHECK: vsqrtps -8256(%rdx), %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x51,0xa2,0xc0,0xdf,0xff,0xff]
+          vsqrtps -8256(%rdx), %zmm28
+
+// CHECK: vsqrtps 508(%rdx){1to16}, %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x58,0x51,0x62,0x7f]
+          vsqrtps 508(%rdx){1to16}, %zmm28
+
+// CHECK: vsqrtps 512(%rdx){1to16}, %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x58,0x51,0xa2,0x00,0x02,0x00,0x00]
+          vsqrtps 512(%rdx){1to16}, %zmm28
+
+// CHECK: vsqrtps -512(%rdx){1to16}, %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x58,0x51,0x62,0x80]
+          vsqrtps -512(%rdx){1to16}, %zmm28
+
+// CHECK: vsqrtps -516(%rdx){1to16}, %zmm28
+// CHECK:  encoding: [0x62,0x61,0x7c,0x58,0x51,0xa2,0xfc,0xfd,0xff,0xff]
+          vsqrtps -516(%rdx){1to16}, %zmm28
+
 // CHECK: vsubpd %zmm9, %zmm12, %zmm9
 // CHECK:  encoding: [0x62,0x51,0x9d,0x48,0x5c,0xc9]
           vsubpd %zmm9, %zmm12, %zmm9
@@ -2489,6 +3729,298 @@
 // CHECK:  encoding: [0x62,0x71,0x24,0x50,0x5c,0xb2,0xfc,0xfd,0xff,0xff]
           vsubps -516(%rdx){1to16}, %zmm27, %zmm14
 
+// CHECK: kandw  %k6, %k5, %k2
+// CHECK:  encoding: [0xc5,0xd4,0x41,0xd6]
+          kandw  %k6, %k5, %k2
+
+// CHECK: kandnw %k7, %k6, %k4
+// CHECK:  encoding: [0xc5,0xcc,0x42,0xe7]
+          kandnw %k7, %k6, %k4
+
+// CHECK: korw   %k7, %k6, %k4
+// CHECK:  encoding: [0xc5,0xcc,0x45,0xe7]
+          korw   %k7, %k6, %k4
+
+// CHECK: kxnorw %k5, %k5, %k3
+// CHECK:  encoding: [0xc5,0xd4,0x46,0xdd]
+          kxnorw %k5, %k5, %k3
+
+// CHECK: kxorw  %k7, %k6, %k2
+// CHECK:  encoding: [0xc5,0xcc,0x47,0xd7]
+          kxorw  %k7, %k6, %k2
+
+// CHECK: knotw  %k6, %k3
+// CHECK:  encoding: [0xc5,0xf8,0x44,0xde]
+          knotw  %k6, %k3
+
+// CHECK: kmovw  %k5, %k4
+// CHECK:  encoding: [0xc5,0xf8,0x90,0xe5]
+          kmovw  %k5, %k4
+
+// CHECK: kmovw  (%rcx), %k4
+// CHECK:  encoding: [0xc5,0xf8,0x90,0x21]
+          kmovw  (%rcx), %k4
+
+// CHECK: kmovw  291(%rax,%r14,8), %k4
+// CHECK:  encoding: [0xc4,0xa1,0x78,0x90,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          kmovw  291(%rax,%r14,8), %k4
+
+// CHECK: kmovw  %k4, (%rcx)
+// CHECK:  encoding: [0xc5,0xf8,0x91,0x21]
+          kmovw  %k4, (%rcx)
+
+// CHECK: kmovw  %k4, 291(%rax,%r14,8)
+// CHECK:  encoding: [0xc4,0xa1,0x78,0x91,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          kmovw  %k4, 291(%rax,%r14,8)
+
+// CHECK: kmovw  %eax, %k3
+// CHECK:  encoding: [0xc5,0xf8,0x92,0xd8]
+          kmovw  %eax, %k3
+
+// CHECK: kmovw  %ebp, %k3
+// CHECK:  encoding: [0xc5,0xf8,0x92,0xdd]
+          kmovw  %ebp, %k3
+
+// CHECK: kmovw  %r13d, %k3
+// CHECK:  encoding: [0xc4,0xc1,0x78,0x92,0xdd]
+          kmovw  %r13d, %k3
+
+// CHECK: kmovw  %k2, %eax
+// CHECK:  encoding: [0xc5,0xf8,0x93,0xc2]
+          kmovw  %k2, %eax
+
+// CHECK: kmovw  %k2, %ebp
+// CHECK:  encoding: [0xc5,0xf8,0x93,0xea]
+          kmovw  %k2, %ebp
+
+// CHECK: kmovw  %k2, %r13d
+// CHECK:  encoding: [0xc5,0x78,0x93,0xea]
+          kmovw  %k2, %r13d
+
+// CHECK: vmovapd %zmm18, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x29,0x11]
+          vmovapd %zmm18, (%rcx)
+
+// CHECK: vmovapd %zmm18, (%rcx) {%k6}
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x4e,0x29,0x11]
+          vmovapd %zmm18, (%rcx) {%k6}
+
+// CHECK: vmovapd %zmm18, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x48,0x29,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovapd %zmm18, 291(%rax,%r14,8)
+
+// CHECK: vmovapd %zmm18, 8128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x29,0x52,0x7f]
+          vmovapd %zmm18, 8128(%rdx)
+
+// CHECK: vmovapd %zmm18, 8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x29,0x92,0x00,0x20,0x00,0x00]
+          vmovapd %zmm18, 8192(%rdx)
+
+// CHECK: vmovapd %zmm18, -8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x29,0x52,0x80]
+          vmovapd %zmm18, -8192(%rdx)
+
+// CHECK: vmovapd %zmm18, -8256(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x29,0x92,0xc0,0xdf,0xff,0xff]
+          vmovapd %zmm18, -8256(%rdx)
+
+// CHECK: vmovaps %zmm9, (%rcx)
+// CHECK:  encoding: [0x62,0x71,0x7c,0x48,0x29,0x09]
+          vmovaps %zmm9, (%rcx)
+
+// CHECK: vmovaps %zmm9, (%rcx) {%k3}
+// CHECK:  encoding: [0x62,0x71,0x7c,0x4b,0x29,0x09]
+          vmovaps %zmm9, (%rcx) {%k3}
+
+// CHECK: vmovaps %zmm9, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x31,0x7c,0x48,0x29,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovaps %zmm9, 291(%rax,%r14,8)
+
+// CHECK: vmovaps %zmm9, 8128(%rdx)
+// CHECK:  encoding: [0x62,0x71,0x7c,0x48,0x29,0x4a,0x7f]
+          vmovaps %zmm9, 8128(%rdx)
+
+// CHECK: vmovaps %zmm9, 8192(%rdx)
+// CHECK:  encoding: [0x62,0x71,0x7c,0x48,0x29,0x8a,0x00,0x20,0x00,0x00]
+          vmovaps %zmm9, 8192(%rdx)
+
+// CHECK: vmovaps %zmm9, -8192(%rdx)
+// CHECK:  encoding: [0x62,0x71,0x7c,0x48,0x29,0x4a,0x80]
+          vmovaps %zmm9, -8192(%rdx)
+
+// CHECK: vmovaps %zmm9, -8256(%rdx)
+// CHECK:  encoding: [0x62,0x71,0x7c,0x48,0x29,0x8a,0xc0,0xdf,0xff,0xff]
+          vmovaps %zmm9, -8256(%rdx)
+
+// CHECK: vmovdqa32 %zmm18, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x7f,0x11]
+          vmovdqa32 %zmm18, (%rcx)
+
+// CHECK: vmovdqa32 %zmm18, (%rcx) {%k4}
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x4c,0x7f,0x11]
+          vmovdqa32 %zmm18, (%rcx) {%k4}
+
+// CHECK: vmovdqa32 %zmm18, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7d,0x48,0x7f,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa32 %zmm18, 291(%rax,%r14,8)
+
+// CHECK: vmovdqa32 %zmm18, 8128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x7f,0x52,0x7f]
+          vmovdqa32 %zmm18, 8128(%rdx)
+
+// CHECK: vmovdqa32 %zmm18, 8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x7f,0x92,0x00,0x20,0x00,0x00]
+          vmovdqa32 %zmm18, 8192(%rdx)
+
+// CHECK: vmovdqa32 %zmm18, -8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x7f,0x52,0x80]
+          vmovdqa32 %zmm18, -8192(%rdx)
+
+// CHECK: vmovdqa32 %zmm18, -8256(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x48,0x7f,0x92,0xc0,0xdf,0xff,0xff]
+          vmovdqa32 %zmm18, -8256(%rdx)
+
+// CHECK: vmovdqa64 %zmm19, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x7f,0x19]
+          vmovdqa64 %zmm19, (%rcx)
+
+// CHECK: vmovdqa64 %zmm19, (%rcx) {%k5}
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x4d,0x7f,0x19]
+          vmovdqa64 %zmm19, (%rcx) {%k5}
+
+// CHECK: vmovdqa64 %zmm19, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x48,0x7f,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa64 %zmm19, 291(%rax,%r14,8)
+
+// CHECK: vmovdqa64 %zmm19, 8128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x7f,0x5a,0x7f]
+          vmovdqa64 %zmm19, 8128(%rdx)
+
+// CHECK: vmovdqa64 %zmm19, 8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x7f,0x9a,0x00,0x20,0x00,0x00]
+          vmovdqa64 %zmm19, 8192(%rdx)
+
+// CHECK: vmovdqa64 %zmm19, -8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x7f,0x5a,0x80]
+          vmovdqa64 %zmm19, -8192(%rdx)
+
+// CHECK: vmovdqa64 %zmm19, -8256(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x48,0x7f,0x9a,0xc0,0xdf,0xff,0xff]
+          vmovdqa64 %zmm19, -8256(%rdx)
+
+// CHECK: vmovdqu32 %zmm22, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x48,0x7f,0x31]
+          vmovdqu32 %zmm22, (%rcx)
+
+// CHECK: vmovdqu32 %zmm22, (%rcx) {%k1}
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x49,0x7f,0x31]
+          vmovdqu32 %zmm22, (%rcx) {%k1}
+
+// CHECK: vmovdqu32 %zmm22, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7e,0x48,0x7f,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu32 %zmm22, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu32 %zmm22, 8128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x48,0x7f,0x72,0x7f]
+          vmovdqu32 %zmm22, 8128(%rdx)
+
+// CHECK: vmovdqu32 %zmm22, 8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x48,0x7f,0xb2,0x00,0x20,0x00,0x00]
+          vmovdqu32 %zmm22, 8192(%rdx)
+
+// CHECK: vmovdqu32 %zmm22, -8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x48,0x7f,0x72,0x80]
+          vmovdqu32 %zmm22, -8192(%rdx)
+
+// CHECK: vmovdqu32 %zmm22, -8256(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x48,0x7f,0xb2,0xc0,0xdf,0xff,0xff]
+          vmovdqu32 %zmm22, -8256(%rdx)
+
+// CHECK: vmovdqu64 %zmm24, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0xfe,0x48,0x7f,0x01]
+          vmovdqu64 %zmm24, (%rcx)
+
+// CHECK: vmovdqu64 %zmm24, (%rcx) {%k5}
+// CHECK:  encoding: [0x62,0x61,0xfe,0x4d,0x7f,0x01]
+          vmovdqu64 %zmm24, (%rcx) {%k5}
+
+// CHECK: vmovdqu64 %zmm24, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0xfe,0x48,0x7f,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu64 %zmm24, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu64 %zmm24, 8128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfe,0x48,0x7f,0x42,0x7f]
+          vmovdqu64 %zmm24, 8128(%rdx)
+
+// CHECK: vmovdqu64 %zmm24, 8192(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfe,0x48,0x7f,0x82,0x00,0x20,0x00,0x00]
+          vmovdqu64 %zmm24, 8192(%rdx)
+
+// CHECK: vmovdqu64 %zmm24, -8192(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfe,0x48,0x7f,0x42,0x80]
+          vmovdqu64 %zmm24, -8192(%rdx)
+
+// CHECK: vmovdqu64 %zmm24, -8256(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfe,0x48,0x7f,0x82,0xc0,0xdf,0xff,0xff]
+          vmovdqu64 %zmm24, -8256(%rdx)
+
+// CHECK: vmovupd %zmm10, (%rcx)
+// CHECK:  encoding: [0x62,0x71,0xfd,0x48,0x11,0x11]
+          vmovupd %zmm10, (%rcx)
+
+// CHECK: vmovupd %zmm10, (%rcx) {%k7}
+// CHECK:  encoding: [0x62,0x71,0xfd,0x4f,0x11,0x11]
+          vmovupd %zmm10, (%rcx) {%k7}
+
+// CHECK: vmovupd %zmm10, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x31,0xfd,0x48,0x11,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovupd %zmm10, 291(%rax,%r14,8)
+
+// CHECK: vmovupd %zmm10, 8128(%rdx)
+// CHECK:  encoding: [0x62,0x71,0xfd,0x48,0x11,0x52,0x7f]
+          vmovupd %zmm10, 8128(%rdx)
+
+// CHECK: vmovupd %zmm10, 8192(%rdx)
+// CHECK:  encoding: [0x62,0x71,0xfd,0x48,0x11,0x92,0x00,0x20,0x00,0x00]
+          vmovupd %zmm10, 8192(%rdx)
+
+// CHECK: vmovupd %zmm10, -8192(%rdx)
+// CHECK:  encoding: [0x62,0x71,0xfd,0x48,0x11,0x52,0x80]
+          vmovupd %zmm10, -8192(%rdx)
+
+// CHECK: vmovupd %zmm10, -8256(%rdx)
+// CHECK:  encoding: [0x62,0x71,0xfd,0x48,0x11,0x92,0xc0,0xdf,0xff,0xff]
+          vmovupd %zmm10, -8256(%rdx)
+
+// CHECK: vmovups %zmm24, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x11,0x01]
+          vmovups %zmm24, (%rcx)
+
+// CHECK: vmovups %zmm24, (%rcx) {%k7}
+// CHECK:  encoding: [0x62,0x61,0x7c,0x4f,0x11,0x01]
+          vmovups %zmm24, (%rcx) {%k7}
+
+// CHECK: vmovups %zmm24, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0x7c,0x48,0x11,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmovups %zmm24, 291(%rax,%r14,8)
+
+// CHECK: vmovups %zmm24, 8128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x11,0x42,0x7f]
+          vmovups %zmm24, 8128(%rdx)
+
+// CHECK: vmovups %zmm24, 8192(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x11,0x82,0x00,0x20,0x00,0x00]
+          vmovups %zmm24, 8192(%rdx)
+
+// CHECK: vmovups %zmm24, -8192(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x11,0x42,0x80]
+          vmovups %zmm24, -8192(%rdx)
+
+// CHECK: vmovups %zmm24, -8256(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x48,0x11,0x82,0xc0,0xdf,0xff,0xff]
+          vmovups %zmm24, -8256(%rdx)
+
 // CHECK: vpmovqb %zmm2, %xmm3
 // CHECK:  encoding: [0x62,0xf2,0x7e,0x48,0x32,0xd3]
           vpmovqb %zmm2, %xmm3
@@ -3097,6 +4629,14 @@
 // CHECK: encoding: [0x62,0xe3,0x1d,0x40,0x38,0x4f,0x10,0x01]
 vinserti32x4  $1, 256(%rdi), %zmm28, %zmm17
 
+// CHECK: vinserti32x8
+// CHECK: encoding: [0x62,0xd3,0x4d,0x40,0x3a,0xdb,0x01]
+vinserti32x8  $1, %ymm11, %zmm22, %zmm3
+
+// CHECK: vinsertf64x2
+// CHECK: encoding: [0x62,0xf3,0xed,0x48,0x18,0x4f,0x10,0x01]
+vinsertf64x2  $1, 256(%rdi), %zmm2, %zmm1
+
 // CHECK: vextracti32x4
 // CHECK: encoding: [0x62,0x33,0x7d,0x48,0x39,0xc9,0x01]
 vextracti32x4  $1, %zmm9, %xmm17
@@ -3219,3 +4759,59 @@
 // CHECK: vpermt2d
 // CHECK: encoding: [0x62,0x32,0x4d,0xc2,0x7e,0x24,0xad,0x05,0x00,0x00,0x00]	
 vpermt2d 5(,%r13,4), %zmm22, %zmm12 {%k2} {z}
+
+// CHECK: valignq $2
+// CHECK: encoding: [0x62,0xf3,0xfd,0x48,0x03,0x4c,0x24,0x04,0x02]
+valignq  $2, 0x100(%rsp), %zmm0, %zmm1
+
+// CHECK: valignq $3
+// CHECK: encoding: [0x62,0xf3,0xfd,0x49,0x03,0xcb,0x03]
+valignq  $3, %zmm3, %zmm0, %zmm1 {%k1}
+
+// CHECK: vextractf32x4 $3
+// CHECK: encoding: [0x62,0xf3,0x7d,0x49,0x19,0xd9,0x03]
+vextractf32x4  $3, %zmm3, %xmm1 {%k1}
+
+// CHECK: vextracti64x4 $1
+// CHECK: encoding: [0x62,0x53,0xfd,0xcb,0x3b,0xf4,0x01]
+vextracti64x4  $1, %zmm14, %ymm12 {%k3} {z}
+
+// CHECK: vfmadd231ps
+// CHECK: encoding: [0x62,0xb2,0x1d,0x48,0xb8,0xe7]
+vfmadd231ps %zmm23, %zmm12, %zmm4
+
+// CHECK: vfmsub231pd
+// CHECK: encoding: [0x62,0xe2,0xed,0x48,0xba,0x73,0x08]
+vfmsub231pd 0x200(%rbx), %zmm2, %zmm22
+
+// CHECK: vfmaddsub231ps
+// CHECK: encoding: [0x62,0xd2,0x65,0x4b,0xb6,0xec]
+vfmaddsub231ps %zmm12, %zmm3, %zmm5 {%k3}
+
+// CHECK: vfmsubadd231pd
+// CHECK: encoding: [0x62,0x72,0x85,0xc5,0xb7,0xdd]
+vfmsubadd231pd %zmm5, %zmm31, %zmm11 {%k5}{z}
+
+// CHECK: vfnmadd231ps
+// CHECK: encoding: [0x62,0xf2,0x4d,0x48,0xbc,0xfd]
+vfnmadd231ps %zmm5, %zmm6, %zmm7
+
+// CHECK: vfnmsub231pd
+// CHECK: encoding: [0x62,0xf2,0xcd,0x48,0xbe,0xfd]
+vfnmsub231pd %zmm5, %zmm6, %zmm7
+
+// CHECK: vpermilps
+// CHECK: encoding: [0x62,0xf2,0x6d,0x48,0x0c,0xd9]
+vpermilps %zmm1, %zmm2, %zmm3
+
+// CHECK: vpermilpd
+// CHECK: encoding: [0x62,0xf2,0xed,0x48,0x0d,0x5b,0x10]
+vpermilpd 0x400(%rbx), %zmm2, %zmm3
+
+// CHECK: vpermilps
+// CHECK: encoding: [0x62,0xf2,0x6d,0x48,0x0c,0x5b,0x10]
+vpermilps 0x400(%rbx), %zmm2, %zmm3
+
+// CHECK: vpermilpd
+// CHECK: encoding: [0x62,0xf3,0xfd,0x48,0x05,0x53,0x10,0x23]
+vpermilpd $0x23, 0x400(%rbx), %zmm2

diff --git a/test/MC/X86/intel-syntax-2.s b/test/MC/X86/intel-syntax-2.s
index d6dbe15..f7bdaf9 100644
--- a/test/MC/X86/intel-syntax-2.s
+++ b/test/MC/X86/intel-syntax-2.s

@@ -7,3 +7,11 @@
     .att_syntax
 // CHECK:	movl	$257, -4(%rsp)
     movl $257, -4(%rsp)
+
+_test2:
+.intel_syntax noprefix
+	mov	DWORD PTR [RSP - 4], 255
+// CHECK:	movl	$255, -4(%rsp)
+.att_syntax prefix
+	movl $255, -4(%rsp)
+// CHECK:	movl	$255, -4(%rsp)

diff --git a/test/MC/X86/intel-syntax-ambiguous.s b/test/MC/X86/intel-syntax-ambiguous.s
new file mode 100644
index 0000000..fe1fe50
--- /dev/null
+++ b/test/MC/X86/intel-syntax-ambiguous.s

@@ -0,0 +1,47 @@
+// RUN: not llvm-mc -triple i686-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+
+.intel_syntax
+
+// Basic case of ambiguity for inc.
+
+inc [eax]
+// CHECK: error: ambiguous operand size for instruction 'inc'
+inc dword ptr [eax]
+inc word ptr [eax]
+inc byte ptr [eax]
+// CHECK-NOT: error:
+
+// Other ambiguous instructions.  Anything that doesn't take a register,
+// basically.
+
+dec [eax]
+// CHECK: error: ambiguous operand size for instruction 'dec'
+mov [eax], 1
+// CHECK: error: ambiguous operand size for instruction 'mov'
+and [eax], 0
+// CHECK: error: ambiguous operand size for instruction 'and'
+or [eax], 1
+// CHECK: error: ambiguous operand size for instruction 'or'
+add [eax], 1
+// CHECK: error: ambiguous operand size for instruction 'add'
+sub [eax], 1
+// CHECK: error: ambiguous operand size for instruction 'sub'
+
+// gas assumes these instructions are pointer-sized by default, and we follow
+// suit.
+push [eax]
+call [eax]
+jmp [eax]
+// CHECK-NOT: error:
+
+add byte ptr [eax], eax
+// CHECK: error: invalid operand for instruction
+
+add byte ptr [eax], eax
+// CHECK: error: invalid operand for instruction
+
+add rax, 3
+// CHECK: error: register %rax is only available in 64-bit mode
+
+fadd   "?half@?0??bar@@YAXXZ@4NA"
+// CHECK: error: ambiguous operand size for instruction 'fadd'

diff --git a/test/MC/X86/intel-syntax-error.s b/test/MC/X86/intel-syntax-error.s
new file mode 100644
index 0000000..7207c95
--- /dev/null
+++ b/test/MC/X86/intel-syntax-error.s

@@ -0,0 +1,13 @@
+// RUN: not llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=att %s -o /dev/null 2>&1 | FileCheck %s
+
+// This tests weird forms of Intel and AT&T syntax that gas accepts that we
+// don't.  The [no]prefix operand of the syntax directive indicates whether
+// registers need a '%' prefix.
+
+.intel_syntax prefix
+// CHECK: error: '.intel_syntax prefix' is not supported: registers must not have a '%' prefix in .intel_syntax
+_test2:
+	mov	DWORD PTR [%esp - 4], 257
+.att_syntax noprefix
+// CHECK: error: '.att_syntax noprefix' is not supported: registers must have a '%' prefix in .att_syntax
+	movl	$257, -4(esp)

diff --git a/test/MC/X86/intel-syntax-ptr-sized.s b/test/MC/X86/intel-syntax-ptr-sized.s
new file mode 100644
index 0000000..c052c32
--- /dev/null
+++ b/test/MC/X86/intel-syntax-ptr-sized.s

@@ -0,0 +1,20 @@
+// RUN: llvm-mc %s -triple=i686-pc-windows | FileCheck %s
+
+.intel_syntax
+
+push [eax]
+// CHECK: pushl (%eax)
+call [eax]
+// CHECK: calll *(%eax)
+jmp [eax]
+// CHECK: jmpl *(%eax)
+
+// mode switch
+.code16
+
+push [eax]
+// CHECK: pushw (%eax)
+call [eax]
+// CHECK: callw *(%eax)
+jmp [eax]
+// CHECK: jmpw *(%eax)

diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s
index 7968918..c027aa4 100644
--- a/test/MC/X86/intel-syntax.s
+++ b/test/MC/X86/intel-syntax.s

@@ -603,7 +603,62 @@
 "?half@?0??bar@@YAXXZ@4NA":
 	.quad   4602678819172646912
 
-fadd   "?half@?0??bar@@YAXXZ@4NA"
-fadd   "?half@?0??bar@@YAXXZ@4NA"@IMGREL
+fadd   dword ptr "?half@?0??bar@@YAXXZ@4NA"
+fadd   dword ptr "?half@?0??bar@@YAXXZ@4NA"@IMGREL
 // CHECK: fadds   "?half@?0??bar@@YAXXZ@4NA"
-// CHECK: fadds   "?half@?0??bar@@YAXXZ@4NA"@IMGREL32
+// CHECK: fadds   "?half@?0??bar@@YAXXZ@4NA"@IMGREL
+
+inc qword ptr [rax]
+inc dword ptr [rax]
+inc word ptr [rax]
+inc byte ptr [rax]
+// CHECK: incq (%rax)
+// CHECK: incl (%rax)
+// CHECK: incw (%rax)
+// CHECK: incb (%rax)
+
+dec qword ptr [rax]
+dec dword ptr [rax]
+dec word ptr [rax]
+dec byte ptr [rax]
+// CHECK: decq (%rax)
+// CHECK: decl (%rax)
+// CHECK: decw (%rax)
+// CHECK: decb (%rax)
+
+add qword ptr [rax], 1
+add dword ptr [rax], 1
+add word ptr [rax], 1
+add byte ptr [rax], 1
+// CHECK: addq $1, (%rax)
+// CHECK: addl $1, (%rax)
+// CHECK: addw $1, (%rax)
+// CHECK: addb $1, (%rax)
+
+fstp xword ptr [rax]
+fstp qword ptr [rax]
+fstp dword ptr [rax]
+// CHECK: fstpt (%rax)
+// CHECK: fstpl (%rax)
+// CHECK: fstps (%rax)
+
+fxsave [eax]
+fsave [eax]
+fxrstor [eax]
+frstor [eax]
+// CHECK: fxsave (%eax)
+// CHECK: wait
+// CHECK: fnsave (%eax)
+// CHECK: fxrstor (%eax)
+// CHECK: frstor (%eax)
+
+// FIXME: Should we accept this?  Masm accepts it, but gas does not.
+fxsave dword ptr [eax]
+fsave dword ptr [eax]
+fxrstor dword ptr [eax]
+frstor dword ptr [eax]
+// CHECK: fxsave (%eax)
+// CHECK: wait
+// CHECK: fnsave (%eax)
+// CHECK: fxrstor (%eax)
+// CHECK: frstor (%eax)

diff --git a/test/MC/X86/macho-uleb.s b/test/MC/X86/macho-uleb.s
new file mode 100644
index 0000000..46d858b
--- /dev/null
+++ b/test/MC/X86/macho-uleb.s

@@ -0,0 +1,7 @@
+// RUN: llvm-mc -triple=x86_64-apple-darwin %s | FileCheck %s
+
+a:
+b:
+        .uleb128 a-b
+
+// CHECK:        .uleb128 a-b

diff --git a/test/MC/X86/reloc-macho.s b/test/MC/X86/reloc-macho.s
new file mode 100644
index 0000000..9297b1b
--- /dev/null
+++ b/test/MC/X86/reloc-macho.s

@@ -0,0 +1,9 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-apple-darwin %s -o - | llvm-readobj -r | FileCheck %s
+
+// CHECK:      Relocations [
+// CHECK-NEXT: ]
+
+  .section foo,bar
+La:
+Lb:
+ .long   La-Lb

diff --git a/test/MC/X86/sgx-encoding.s b/test/MC/X86/sgx-encoding.s
new file mode 100644
index 0000000..e6ae8c9
--- /dev/null
+++ b/test/MC/X86/sgx-encoding.s

@@ -0,0 +1,9 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: encls
+// CHECK: encoding: [0x0f,0x01,0xcf]
+	encls
+
+// CHECK: enclu
+// CHECK: encoding: [0x0f,0x01,0xd7]
+	enclu

diff --git a/test/MC/X86/stackmap-nops.ll b/test/MC/X86/stackmap-nops.ll
index 98d17ea..a0d4418 100644
--- a/test/MC/X86/stackmap-nops.ll
+++ b/test/MC/X86/stackmap-nops.ll

@@ -41,6 +41,10 @@
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 13, i32 13)
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 14)
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 15)
+; Add an extra stackmap with a zero-length shadow to thwart the shadow
+; optimization. This will force all 15 bytes of the previous shadow to be
+; padded with nops.
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 0)
   ret void
 }
 

diff --git a/test/MC/X86/x86-32-coverage.s b/test/MC/X86/x86-32-coverage.s
index 732874b..80c34ec 100644
--- a/test/MC/X86/x86-32-coverage.s
+++ b/test/MC/X86/x86-32-coverage.s

@@ -19618,22 +19618,36 @@
 // CHECK:   blendvps	%xmm2, %xmm1    # encoding: [0x66,0x0f,0x38,0x14,0xca]
             blendvps %xmm2, %xmm1
 
-// rdar://9795008
-// These instructions take a mask not an 8-bit sign extended value.
+// These instructions can take an unsigned 8-bit mask as well as a signed 8-bit
+// immediate. Check both forms here.
 // CHECK: blendps $129, %xmm2, %xmm1
           blendps $0x81, %xmm2, %xmm1
+// CHECK: blendps $-64, %xmm2, %xmm1
+          blendps $-64, %xmm2, %xmm1
 // CHECK: blendpd $129, %xmm2, %xmm1
           blendpd $0x81, %xmm2, %xmm1
+// CHECK: blendpd $-64, %xmm2, %xmm1
+          blendpd $-64, %xmm2, %xmm1
 // CHECK: pblendw $129, %xmm2, %xmm1
           pblendw $0x81, %xmm2, %xmm1
+// CHECK: pblendw $-64, %xmm2, %xmm1
+          pblendw $-64, %xmm2, %xmm1
 // CHECK: mpsadbw $129, %xmm2, %xmm1
           mpsadbw $0x81, %xmm2, %xmm1
+// CHECK: mpsadbw $-64, %xmm2, %xmm1
+          mpsadbw $-64, %xmm2, %xmm1
 // CHECK: dpps $129, %xmm2, %xmm1
           dpps $0x81, %xmm2, %xmm1
+// CHECK: dpps $-64, %xmm2, %xmm1
+          dpps $-64, %xmm2, %xmm1
 // CHECK: dppd $129, %xmm2, %xmm1
           dppd $0x81, %xmm2, %xmm1
+// CHECK: dppd $-64, %xmm2, %xmm1
+          dppd $-64, %xmm2, %xmm1
 // CHECK: insertps $129, %xmm2, %xmm1
           insertps $0x81, %xmm2, %xmm1
+// CHECK: insertps $-64, %xmm2, %xmm1
+          insertps $-64, %xmm2, %xmm1
 
 // PR13253 handle implicit optional third argument that must always be xmm0
 // CHECK: pblendvb %xmm2, %xmm1

diff --git a/test/MC/X86/x86-32-ms-inline-asm.s b/test/MC/X86/x86-32-ms-inline-asm.s
index d912915..3169033 100644
--- a/test/MC/X86/x86-32-ms-inline-asm.s
+++ b/test/MC/X86/x86-32-ms-inline-asm.s

@@ -90,4 +90,8 @@
 // CHECK: popal
 // CHECK: # encoding: [0x61]
 
+    fwait
+// CHECK: wait
+// CHECK: # encoding: [0x9b]
+
 	ret

diff --git a/test/MC/X86/x86-64-avx512bw.s b/test/MC/X86/x86-64-avx512bw.s
new file mode 100644
index 0000000..5155504
--- /dev/null
+++ b/test/MC/X86/x86-64-avx512bw.s

@@ -0,0 +1,997 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw  --show-encoding %s | FileCheck %s
+
+// CHECK: vpaddb %zmm23, %zmm24, %zmm19
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x40,0xfc,0xdf]
+          vpaddb %zmm23, %zmm24, %zmm19
+
+// CHECK: vpaddb %zmm23, %zmm24, %zmm19 {%k3}
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x43,0xfc,0xdf]
+          vpaddb %zmm23, %zmm24, %zmm19 {%k3}
+
+// CHECK: vpaddb %zmm23, %zmm24, %zmm19 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa1,0x3d,0xc3,0xfc,0xdf]
+          vpaddb %zmm23, %zmm24, %zmm19 {%k3} {z}
+
+// CHECK: vpaddb (%rcx), %zmm24, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfc,0x19]
+          vpaddb (%rcx), %zmm24, %zmm19
+
+// CHECK: vpaddb 291(%rax,%r14,8), %zmm24, %zmm19
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x40,0xfc,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpaddb 291(%rax,%r14,8), %zmm24, %zmm19
+
+// CHECK: vpaddb 8128(%rdx), %zmm24, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfc,0x5a,0x7f]
+          vpaddb 8128(%rdx), %zmm24, %zmm19
+
+// CHECK: vpaddb 8192(%rdx), %zmm24, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfc,0x9a,0x00,0x20,0x00,0x00]
+          vpaddb 8192(%rdx), %zmm24, %zmm19
+
+// CHECK: vpaddb -8192(%rdx), %zmm24, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfc,0x5a,0x80]
+          vpaddb -8192(%rdx), %zmm24, %zmm19
+
+// CHECK: vpaddb -8256(%rdx), %zmm24, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfc,0x9a,0xc0,0xdf,0xff,0xff]
+          vpaddb -8256(%rdx), %zmm24, %zmm19
+
+// CHECK: vpaddw %zmm19, %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x40,0xfd,0xcb]
+          vpaddw %zmm19, %zmm24, %zmm17
+
+// CHECK: vpaddw %zmm19, %zmm24, %zmm17 {%k3}
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x43,0xfd,0xcb]
+          vpaddw %zmm19, %zmm24, %zmm17 {%k3}
+
+// CHECK: vpaddw %zmm19, %zmm24, %zmm17 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa1,0x3d,0xc3,0xfd,0xcb]
+          vpaddw %zmm19, %zmm24, %zmm17 {%k3} {z}
+
+// CHECK: vpaddw (%rcx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfd,0x09]
+          vpaddw (%rcx), %zmm24, %zmm17
+
+// CHECK: vpaddw 291(%rax,%r14,8), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x40,0xfd,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpaddw 291(%rax,%r14,8), %zmm24, %zmm17
+
+// CHECK: vpaddw 8128(%rdx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfd,0x4a,0x7f]
+          vpaddw 8128(%rdx), %zmm24, %zmm17
+
+// CHECK: vpaddw 8192(%rdx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfd,0x8a,0x00,0x20,0x00,0x00]
+          vpaddw 8192(%rdx), %zmm24, %zmm17
+
+// CHECK: vpaddw -8192(%rdx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfd,0x4a,0x80]
+          vpaddw -8192(%rdx), %zmm24, %zmm17
+
+// CHECK: vpaddw -8256(%rdx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xfd,0x8a,0xc0,0xdf,0xff,0xff]
+          vpaddw -8256(%rdx), %zmm24, %zmm17
+
+// CHECK: vpcmpeqb %zmm26, %zmm26, %k4
+// CHECK:  encoding: [0x62,0x91,0x2d,0x40,0x74,0xe2]
+          vpcmpeqb %zmm26, %zmm26, %k4
+
+// CHECK: vpcmpeqb %zmm26, %zmm26, %k4 {%k6}
+// CHECK:  encoding: [0x62,0x91,0x2d,0x46,0x74,0xe2]
+          vpcmpeqb %zmm26, %zmm26, %k4 {%k6}
+
+// CHECK: vpcmpeqb (%rcx), %zmm26, %k4
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x40,0x74,0x21]
+          vpcmpeqb (%rcx), %zmm26, %k4
+
+// CHECK: vpcmpeqb 291(%rax,%r14,8), %zmm26, %k4
+// CHECK:  encoding: [0x62,0xb1,0x2d,0x40,0x74,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqb 291(%rax,%r14,8), %zmm26, %k4
+
+// CHECK: vpcmpeqb 8128(%rdx), %zmm26, %k4
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x40,0x74,0x62,0x7f]
+          vpcmpeqb 8128(%rdx), %zmm26, %k4
+
+// CHECK: vpcmpeqb 8192(%rdx), %zmm26, %k4
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x40,0x74,0xa2,0x00,0x20,0x00,0x00]
+          vpcmpeqb 8192(%rdx), %zmm26, %k4
+
+// CHECK: vpcmpeqb -8192(%rdx), %zmm26, %k4
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x40,0x74,0x62,0x80]
+          vpcmpeqb -8192(%rdx), %zmm26, %k4
+
+// CHECK: vpcmpeqb -8256(%rdx), %zmm26, %k4
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x40,0x74,0xa2,0xc0,0xdf,0xff,0xff]
+          vpcmpeqb -8256(%rdx), %zmm26, %k4
+
+// CHECK: vpcmpeqw %zmm19, %zmm23, %k5
+// CHECK:  encoding: [0x62,0xb1,0x45,0x40,0x75,0xeb]
+          vpcmpeqw %zmm19, %zmm23, %k5
+
+// CHECK: vpcmpeqw %zmm19, %zmm23, %k5 {%k7}
+// CHECK:  encoding: [0x62,0xb1,0x45,0x47,0x75,0xeb]
+          vpcmpeqw %zmm19, %zmm23, %k5 {%k7}
+
+// CHECK: vpcmpeqw (%rcx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x75,0x29]
+          vpcmpeqw (%rcx), %zmm23, %k5
+
+// CHECK: vpcmpeqw 291(%rax,%r14,8), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xb1,0x45,0x40,0x75,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqw 291(%rax,%r14,8), %zmm23, %k5
+
+// CHECK: vpcmpeqw 8128(%rdx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x75,0x6a,0x7f]
+          vpcmpeqw 8128(%rdx), %zmm23, %k5
+
+// CHECK: vpcmpeqw 8192(%rdx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x75,0xaa,0x00,0x20,0x00,0x00]
+          vpcmpeqw 8192(%rdx), %zmm23, %k5
+
+// CHECK: vpcmpeqw -8192(%rdx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x75,0x6a,0x80]
+          vpcmpeqw -8192(%rdx), %zmm23, %k5
+
+// CHECK: vpcmpeqw -8256(%rdx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x75,0xaa,0xc0,0xdf,0xff,0xff]
+          vpcmpeqw -8256(%rdx), %zmm23, %k5
+
+// CHECK: vpcmpgtb %zmm20, %zmm30, %k4
+// CHECK:  encoding: [0x62,0xb1,0x0d,0x40,0x64,0xe4]
+          vpcmpgtb %zmm20, %zmm30, %k4
+
+// CHECK: vpcmpgtb %zmm20, %zmm30, %k4 {%k1}
+// CHECK:  encoding: [0x62,0xb1,0x0d,0x41,0x64,0xe4]
+          vpcmpgtb %zmm20, %zmm30, %k4 {%k1}
+
+// CHECK: vpcmpgtb (%rcx), %zmm30, %k4
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x40,0x64,0x21]
+          vpcmpgtb (%rcx), %zmm30, %k4
+
+// CHECK: vpcmpgtb 291(%rax,%r14,8), %zmm30, %k4
+// CHECK:  encoding: [0x62,0xb1,0x0d,0x40,0x64,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtb 291(%rax,%r14,8), %zmm30, %k4
+
+// CHECK: vpcmpgtb 8128(%rdx), %zmm30, %k4
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x40,0x64,0x62,0x7f]
+          vpcmpgtb 8128(%rdx), %zmm30, %k4
+
+// CHECK: vpcmpgtb 8192(%rdx), %zmm30, %k4
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x40,0x64,0xa2,0x00,0x20,0x00,0x00]
+          vpcmpgtb 8192(%rdx), %zmm30, %k4
+
+// CHECK: vpcmpgtb -8192(%rdx), %zmm30, %k4
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x40,0x64,0x62,0x80]
+          vpcmpgtb -8192(%rdx), %zmm30, %k4
+
+// CHECK: vpcmpgtb -8256(%rdx), %zmm30, %k4
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x40,0x64,0xa2,0xc0,0xdf,0xff,0xff]
+          vpcmpgtb -8256(%rdx), %zmm30, %k4
+
+// CHECK: vpcmpgtw %zmm21, %zmm23, %k5
+// CHECK:  encoding: [0x62,0xb1,0x45,0x40,0x65,0xed]
+          vpcmpgtw %zmm21, %zmm23, %k5
+
+// CHECK: vpcmpgtw %zmm21, %zmm23, %k5 {%k7}
+// CHECK:  encoding: [0x62,0xb1,0x45,0x47,0x65,0xed]
+          vpcmpgtw %zmm21, %zmm23, %k5 {%k7}
+
+// CHECK: vpcmpgtw (%rcx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x65,0x29]
+          vpcmpgtw (%rcx), %zmm23, %k5
+
+// CHECK: vpcmpgtw 291(%rax,%r14,8), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xb1,0x45,0x40,0x65,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtw 291(%rax,%r14,8), %zmm23, %k5
+
+// CHECK: vpcmpgtw 8128(%rdx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x65,0x6a,0x7f]
+          vpcmpgtw 8128(%rdx), %zmm23, %k5
+
+// CHECK: vpcmpgtw 8192(%rdx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x65,0xaa,0x00,0x20,0x00,0x00]
+          vpcmpgtw 8192(%rdx), %zmm23, %k5
+
+// CHECK: vpcmpgtw -8192(%rdx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x65,0x6a,0x80]
+          vpcmpgtw -8192(%rdx), %zmm23, %k5
+
+// CHECK: vpcmpgtw -8256(%rdx), %zmm23, %k5
+// CHECK:  encoding: [0x62,0xf1,0x45,0x40,0x65,0xaa,0xc0,0xdf,0xff,0xff]
+          vpcmpgtw -8256(%rdx), %zmm23, %k5
+
+// CHECK: vpmaxsb %zmm26, %zmm29, %zmm23
+// CHECK:  encoding: [0x62,0x82,0x15,0x40,0x3c,0xfa]
+          vpmaxsb %zmm26, %zmm29, %zmm23
+
+// CHECK: vpmaxsb %zmm26, %zmm29, %zmm23 {%k4}
+// CHECK:  encoding: [0x62,0x82,0x15,0x44,0x3c,0xfa]
+          vpmaxsb %zmm26, %zmm29, %zmm23 {%k4}
+
+// CHECK: vpmaxsb %zmm26, %zmm29, %zmm23 {%k4} {z}
+// CHECK:  encoding: [0x62,0x82,0x15,0xc4,0x3c,0xfa]
+          vpmaxsb %zmm26, %zmm29, %zmm23 {%k4} {z}
+
+// CHECK: vpmaxsb (%rcx), %zmm29, %zmm23
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3c,0x39]
+          vpmaxsb (%rcx), %zmm29, %zmm23
+
+// CHECK: vpmaxsb 291(%rax,%r14,8), %zmm29, %zmm23
+// CHECK:  encoding: [0x62,0xa2,0x15,0x40,0x3c,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsb 291(%rax,%r14,8), %zmm29, %zmm23
+
+// CHECK: vpmaxsb 8128(%rdx), %zmm29, %zmm23
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3c,0x7a,0x7f]
+          vpmaxsb 8128(%rdx), %zmm29, %zmm23
+
+// CHECK: vpmaxsb 8192(%rdx), %zmm29, %zmm23
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3c,0xba,0x00,0x20,0x00,0x00]
+          vpmaxsb 8192(%rdx), %zmm29, %zmm23
+
+// CHECK: vpmaxsb -8192(%rdx), %zmm29, %zmm23
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3c,0x7a,0x80]
+          vpmaxsb -8192(%rdx), %zmm29, %zmm23
+
+// CHECK: vpmaxsb -8256(%rdx), %zmm29, %zmm23
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3c,0xba,0xc0,0xdf,0xff,0xff]
+          vpmaxsb -8256(%rdx), %zmm29, %zmm23
+
+// CHECK: vpmaxsw %zmm25, %zmm28, %zmm25
+// CHECK:  encoding: [0x62,0x01,0x1d,0x40,0xee,0xc9]
+          vpmaxsw %zmm25, %zmm28, %zmm25
+
+// CHECK: vpmaxsw %zmm25, %zmm28, %zmm25 {%k3}
+// CHECK:  encoding: [0x62,0x01,0x1d,0x43,0xee,0xc9]
+          vpmaxsw %zmm25, %zmm28, %zmm25 {%k3}
+
+// CHECK: vpmaxsw %zmm25, %zmm28, %zmm25 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0x1d,0xc3,0xee,0xc9]
+          vpmaxsw %zmm25, %zmm28, %zmm25 {%k3} {z}
+
+// CHECK: vpmaxsw (%rcx), %zmm28, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x1d,0x40,0xee,0x09]
+          vpmaxsw (%rcx), %zmm28, %zmm25
+
+// CHECK: vpmaxsw 291(%rax,%r14,8), %zmm28, %zmm25
+// CHECK:  encoding: [0x62,0x21,0x1d,0x40,0xee,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsw 291(%rax,%r14,8), %zmm28, %zmm25
+
+// CHECK: vpmaxsw 8128(%rdx), %zmm28, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x1d,0x40,0xee,0x4a,0x7f]
+          vpmaxsw 8128(%rdx), %zmm28, %zmm25
+
+// CHECK: vpmaxsw 8192(%rdx), %zmm28, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x1d,0x40,0xee,0x8a,0x00,0x20,0x00,0x00]
+          vpmaxsw 8192(%rdx), %zmm28, %zmm25
+
+// CHECK: vpmaxsw -8192(%rdx), %zmm28, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x1d,0x40,0xee,0x4a,0x80]
+          vpmaxsw -8192(%rdx), %zmm28, %zmm25
+
+// CHECK: vpmaxsw -8256(%rdx), %zmm28, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x1d,0x40,0xee,0x8a,0xc0,0xdf,0xff,0xff]
+          vpmaxsw -8256(%rdx), %zmm28, %zmm25
+
+// CHECK: vpmaxub %zmm25, %zmm18, %zmm29
+// CHECK:  encoding: [0x62,0x01,0x6d,0x40,0xde,0xe9]
+          vpmaxub %zmm25, %zmm18, %zmm29
+
+// CHECK: vpmaxub %zmm25, %zmm18, %zmm29 {%k7}
+// CHECK:  encoding: [0x62,0x01,0x6d,0x47,0xde,0xe9]
+          vpmaxub %zmm25, %zmm18, %zmm29 {%k7}
+
+// CHECK: vpmaxub %zmm25, %zmm18, %zmm29 {%k7} {z}
+// CHECK:  encoding: [0x62,0x01,0x6d,0xc7,0xde,0xe9]
+          vpmaxub %zmm25, %zmm18, %zmm29 {%k7} {z}
+
+// CHECK: vpmaxub (%rcx), %zmm18, %zmm29
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xde,0x29]
+          vpmaxub (%rcx), %zmm18, %zmm29
+
+// CHECK: vpmaxub 291(%rax,%r14,8), %zmm18, %zmm29
+// CHECK:  encoding: [0x62,0x21,0x6d,0x40,0xde,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxub 291(%rax,%r14,8), %zmm18, %zmm29
+
+// CHECK: vpmaxub 8128(%rdx), %zmm18, %zmm29
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xde,0x6a,0x7f]
+          vpmaxub 8128(%rdx), %zmm18, %zmm29
+
+// CHECK: vpmaxub 8192(%rdx), %zmm18, %zmm29
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xde,0xaa,0x00,0x20,0x00,0x00]
+          vpmaxub 8192(%rdx), %zmm18, %zmm29
+
+// CHECK: vpmaxub -8192(%rdx), %zmm18, %zmm29
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xde,0x6a,0x80]
+          vpmaxub -8192(%rdx), %zmm18, %zmm29
+
+// CHECK: vpmaxub -8256(%rdx), %zmm18, %zmm29
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xde,0xaa,0xc0,0xdf,0xff,0xff]
+          vpmaxub -8256(%rdx), %zmm18, %zmm29
+
+// CHECK: vpmaxuw %zmm23, %zmm27, %zmm21
+// CHECK:  encoding: [0x62,0xa2,0x25,0x40,0x3e,0xef]
+          vpmaxuw %zmm23, %zmm27, %zmm21
+
+// CHECK: vpmaxuw %zmm23, %zmm27, %zmm21 {%k3}
+// CHECK:  encoding: [0x62,0xa2,0x25,0x43,0x3e,0xef]
+          vpmaxuw %zmm23, %zmm27, %zmm21 {%k3}
+
+// CHECK: vpmaxuw %zmm23, %zmm27, %zmm21 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa2,0x25,0xc3,0x3e,0xef]
+          vpmaxuw %zmm23, %zmm27, %zmm21 {%k3} {z}
+
+// CHECK: vpmaxuw (%rcx), %zmm27, %zmm21
+// CHECK:  encoding: [0x62,0xe2,0x25,0x40,0x3e,0x29]
+          vpmaxuw (%rcx), %zmm27, %zmm21
+
+// CHECK: vpmaxuw 291(%rax,%r14,8), %zmm27, %zmm21
+// CHECK:  encoding: [0x62,0xa2,0x25,0x40,0x3e,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxuw 291(%rax,%r14,8), %zmm27, %zmm21
+
+// CHECK: vpmaxuw 8128(%rdx), %zmm27, %zmm21
+// CHECK:  encoding: [0x62,0xe2,0x25,0x40,0x3e,0x6a,0x7f]
+          vpmaxuw 8128(%rdx), %zmm27, %zmm21
+
+// CHECK: vpmaxuw 8192(%rdx), %zmm27, %zmm21
+// CHECK:  encoding: [0x62,0xe2,0x25,0x40,0x3e,0xaa,0x00,0x20,0x00,0x00]
+          vpmaxuw 8192(%rdx), %zmm27, %zmm21
+
+// CHECK: vpmaxuw -8192(%rdx), %zmm27, %zmm21
+// CHECK:  encoding: [0x62,0xe2,0x25,0x40,0x3e,0x6a,0x80]
+          vpmaxuw -8192(%rdx), %zmm27, %zmm21
+
+// CHECK: vpmaxuw -8256(%rdx), %zmm27, %zmm21
+// CHECK:  encoding: [0x62,0xe2,0x25,0x40,0x3e,0xaa,0xc0,0xdf,0xff,0xff]
+          vpmaxuw -8256(%rdx), %zmm27, %zmm21
+
+// CHECK: vpminsb %zmm25, %zmm22, %zmm28
+// CHECK:  encoding: [0x62,0x02,0x4d,0x40,0x38,0xe1]
+          vpminsb %zmm25, %zmm22, %zmm28
+
+// CHECK: vpminsb %zmm25, %zmm22, %zmm28 {%k6}
+// CHECK:  encoding: [0x62,0x02,0x4d,0x46,0x38,0xe1]
+          vpminsb %zmm25, %zmm22, %zmm28 {%k6}
+
+// CHECK: vpminsb %zmm25, %zmm22, %zmm28 {%k6} {z}
+// CHECK:  encoding: [0x62,0x02,0x4d,0xc6,0x38,0xe1]
+          vpminsb %zmm25, %zmm22, %zmm28 {%k6} {z}
+
+// CHECK: vpminsb (%rcx), %zmm22, %zmm28
+// CHECK:  encoding: [0x62,0x62,0x4d,0x40,0x38,0x21]
+          vpminsb (%rcx), %zmm22, %zmm28
+
+// CHECK: vpminsb 291(%rax,%r14,8), %zmm22, %zmm28
+// CHECK:  encoding: [0x62,0x22,0x4d,0x40,0x38,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpminsb 291(%rax,%r14,8), %zmm22, %zmm28
+
+// CHECK: vpminsb 8128(%rdx), %zmm22, %zmm28
+// CHECK:  encoding: [0x62,0x62,0x4d,0x40,0x38,0x62,0x7f]
+          vpminsb 8128(%rdx), %zmm22, %zmm28
+
+// CHECK: vpminsb 8192(%rdx), %zmm22, %zmm28
+// CHECK:  encoding: [0x62,0x62,0x4d,0x40,0x38,0xa2,0x00,0x20,0x00,0x00]
+          vpminsb 8192(%rdx), %zmm22, %zmm28
+
+// CHECK: vpminsb -8192(%rdx), %zmm22, %zmm28
+// CHECK:  encoding: [0x62,0x62,0x4d,0x40,0x38,0x62,0x80]
+          vpminsb -8192(%rdx), %zmm22, %zmm28
+
+// CHECK: vpminsb -8256(%rdx), %zmm22, %zmm28
+// CHECK:  encoding: [0x62,0x62,0x4d,0x40,0x38,0xa2,0xc0,0xdf,0xff,0xff]
+          vpminsb -8256(%rdx), %zmm22, %zmm28
+
+// CHECK: vpminsw %zmm25, %zmm22, %zmm27
+// CHECK:  encoding: [0x62,0x01,0x4d,0x40,0xea,0xd9]
+          vpminsw %zmm25, %zmm22, %zmm27
+
+// CHECK: vpminsw %zmm25, %zmm22, %zmm27 {%k6}
+// CHECK:  encoding: [0x62,0x01,0x4d,0x46,0xea,0xd9]
+          vpminsw %zmm25, %zmm22, %zmm27 {%k6}
+
+// CHECK: vpminsw %zmm25, %zmm22, %zmm27 {%k6} {z}
+// CHECK:  encoding: [0x62,0x01,0x4d,0xc6,0xea,0xd9]
+          vpminsw %zmm25, %zmm22, %zmm27 {%k6} {z}
+
+// CHECK: vpminsw (%rcx), %zmm22, %zmm27
+// CHECK:  encoding: [0x62,0x61,0x4d,0x40,0xea,0x19]
+          vpminsw (%rcx), %zmm22, %zmm27
+
+// CHECK: vpminsw 291(%rax,%r14,8), %zmm22, %zmm27
+// CHECK:  encoding: [0x62,0x21,0x4d,0x40,0xea,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpminsw 291(%rax,%r14,8), %zmm22, %zmm27
+
+// CHECK: vpminsw 8128(%rdx), %zmm22, %zmm27
+// CHECK:  encoding: [0x62,0x61,0x4d,0x40,0xea,0x5a,0x7f]
+          vpminsw 8128(%rdx), %zmm22, %zmm27
+
+// CHECK: vpminsw 8192(%rdx), %zmm22, %zmm27
+// CHECK:  encoding: [0x62,0x61,0x4d,0x40,0xea,0x9a,0x00,0x20,0x00,0x00]
+          vpminsw 8192(%rdx), %zmm22, %zmm27
+
+// CHECK: vpminsw -8192(%rdx), %zmm22, %zmm27
+// CHECK:  encoding: [0x62,0x61,0x4d,0x40,0xea,0x5a,0x80]
+          vpminsw -8192(%rdx), %zmm22, %zmm27
+
+// CHECK: vpminsw -8256(%rdx), %zmm22, %zmm27
+// CHECK:  encoding: [0x62,0x61,0x4d,0x40,0xea,0x9a,0xc0,0xdf,0xff,0xff]
+          vpminsw -8256(%rdx), %zmm22, %zmm27
+
+// CHECK: vpminub %zmm26, %zmm25, %zmm25
+// CHECK:  encoding: [0x62,0x01,0x35,0x40,0xda,0xca]
+          vpminub %zmm26, %zmm25, %zmm25
+
+// CHECK: vpminub %zmm26, %zmm25, %zmm25 {%k6}
+// CHECK:  encoding: [0x62,0x01,0x35,0x46,0xda,0xca]
+          vpminub %zmm26, %zmm25, %zmm25 {%k6}
+
+// CHECK: vpminub %zmm26, %zmm25, %zmm25 {%k6} {z}
+// CHECK:  encoding: [0x62,0x01,0x35,0xc6,0xda,0xca]
+          vpminub %zmm26, %zmm25, %zmm25 {%k6} {z}
+
+// CHECK: vpminub (%rcx), %zmm25, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x35,0x40,0xda,0x09]
+          vpminub (%rcx), %zmm25, %zmm25
+
+// CHECK: vpminub 291(%rax,%r14,8), %zmm25, %zmm25
+// CHECK:  encoding: [0x62,0x21,0x35,0x40,0xda,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpminub 291(%rax,%r14,8), %zmm25, %zmm25
+
+// CHECK: vpminub 8128(%rdx), %zmm25, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x35,0x40,0xda,0x4a,0x7f]
+          vpminub 8128(%rdx), %zmm25, %zmm25
+
+// CHECK: vpminub 8192(%rdx), %zmm25, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x35,0x40,0xda,0x8a,0x00,0x20,0x00,0x00]
+          vpminub 8192(%rdx), %zmm25, %zmm25
+
+// CHECK: vpminub -8192(%rdx), %zmm25, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x35,0x40,0xda,0x4a,0x80]
+          vpminub -8192(%rdx), %zmm25, %zmm25
+
+// CHECK: vpminub -8256(%rdx), %zmm25, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x35,0x40,0xda,0x8a,0xc0,0xdf,0xff,0xff]
+          vpminub -8256(%rdx), %zmm25, %zmm25
+
+// CHECK: vpminuw %zmm20, %zmm29, %zmm19
+// CHECK:  encoding: [0x62,0xa2,0x15,0x40,0x3a,0xdc]
+          vpminuw %zmm20, %zmm29, %zmm19
+
+// CHECK: vpminuw %zmm20, %zmm29, %zmm19 {%k7}
+// CHECK:  encoding: [0x62,0xa2,0x15,0x47,0x3a,0xdc]
+          vpminuw %zmm20, %zmm29, %zmm19 {%k7}
+
+// CHECK: vpminuw %zmm20, %zmm29, %zmm19 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa2,0x15,0xc7,0x3a,0xdc]
+          vpminuw %zmm20, %zmm29, %zmm19 {%k7} {z}
+
+// CHECK: vpminuw (%rcx), %zmm29, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3a,0x19]
+          vpminuw (%rcx), %zmm29, %zmm19
+
+// CHECK: vpminuw 291(%rax,%r14,8), %zmm29, %zmm19
+// CHECK:  encoding: [0x62,0xa2,0x15,0x40,0x3a,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpminuw 291(%rax,%r14,8), %zmm29, %zmm19
+
+// CHECK: vpminuw 8128(%rdx), %zmm29, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3a,0x5a,0x7f]
+          vpminuw 8128(%rdx), %zmm29, %zmm19
+
+// CHECK: vpminuw 8192(%rdx), %zmm29, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3a,0x9a,0x00,0x20,0x00,0x00]
+          vpminuw 8192(%rdx), %zmm29, %zmm19
+
+// CHECK: vpminuw -8192(%rdx), %zmm29, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3a,0x5a,0x80]
+          vpminuw -8192(%rdx), %zmm29, %zmm19
+
+// CHECK: vpminuw -8256(%rdx), %zmm29, %zmm19
+// CHECK:  encoding: [0x62,0xe2,0x15,0x40,0x3a,0x9a,0xc0,0xdf,0xff,0xff]
+          vpminuw -8256(%rdx), %zmm29, %zmm19
+
+// CHECK: vpmullw %zmm19, %zmm28, %zmm19
+// CHECK:  encoding: [0x62,0xa1,0x1d,0x40,0xd5,0xdb]
+          vpmullw %zmm19, %zmm28, %zmm19
+
+// CHECK: vpmullw %zmm19, %zmm28, %zmm19 {%k5}
+// CHECK:  encoding: [0x62,0xa1,0x1d,0x45,0xd5,0xdb]
+          vpmullw %zmm19, %zmm28, %zmm19 {%k5}
+
+// CHECK: vpmullw %zmm19, %zmm28, %zmm19 {%k5} {z}
+// CHECK:  encoding: [0x62,0xa1,0x1d,0xc5,0xd5,0xdb]
+          vpmullw %zmm19, %zmm28, %zmm19 {%k5} {z}
+
+// CHECK: vpmullw (%rcx), %zmm28, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x1d,0x40,0xd5,0x19]
+          vpmullw (%rcx), %zmm28, %zmm19
+
+// CHECK: vpmullw 291(%rax,%r14,8), %zmm28, %zmm19
+// CHECK:  encoding: [0x62,0xa1,0x1d,0x40,0xd5,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpmullw 291(%rax,%r14,8), %zmm28, %zmm19
+
+// CHECK: vpmullw 8128(%rdx), %zmm28, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x1d,0x40,0xd5,0x5a,0x7f]
+          vpmullw 8128(%rdx), %zmm28, %zmm19
+
+// CHECK: vpmullw 8192(%rdx), %zmm28, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x1d,0x40,0xd5,0x9a,0x00,0x20,0x00,0x00]
+          vpmullw 8192(%rdx), %zmm28, %zmm19
+
+// CHECK: vpmullw -8192(%rdx), %zmm28, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x1d,0x40,0xd5,0x5a,0x80]
+          vpmullw -8192(%rdx), %zmm28, %zmm19
+
+// CHECK: vpmullw -8256(%rdx), %zmm28, %zmm19
+// CHECK:  encoding: [0x62,0xe1,0x1d,0x40,0xd5,0x9a,0xc0,0xdf,0xff,0xff]
+          vpmullw -8256(%rdx), %zmm28, %zmm19
+
+// CHECK: vpsubb %zmm26, %zmm18, %zmm25
+// CHECK:  encoding: [0x62,0x01,0x6d,0x40,0xf8,0xca]
+          vpsubb %zmm26, %zmm18, %zmm25
+
+// CHECK: vpsubb %zmm26, %zmm18, %zmm25 {%k4}
+// CHECK:  encoding: [0x62,0x01,0x6d,0x44,0xf8,0xca]
+          vpsubb %zmm26, %zmm18, %zmm25 {%k4}
+
+// CHECK: vpsubb %zmm26, %zmm18, %zmm25 {%k4} {z}
+// CHECK:  encoding: [0x62,0x01,0x6d,0xc4,0xf8,0xca]
+          vpsubb %zmm26, %zmm18, %zmm25 {%k4} {z}
+
+// CHECK: vpsubb (%rcx), %zmm18, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xf8,0x09]
+          vpsubb (%rcx), %zmm18, %zmm25
+
+// CHECK: vpsubb 291(%rax,%r14,8), %zmm18, %zmm25
+// CHECK:  encoding: [0x62,0x21,0x6d,0x40,0xf8,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpsubb 291(%rax,%r14,8), %zmm18, %zmm25
+
+// CHECK: vpsubb 8128(%rdx), %zmm18, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xf8,0x4a,0x7f]
+          vpsubb 8128(%rdx), %zmm18, %zmm25
+
+// CHECK: vpsubb 8192(%rdx), %zmm18, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xf8,0x8a,0x00,0x20,0x00,0x00]
+          vpsubb 8192(%rdx), %zmm18, %zmm25
+
+// CHECK: vpsubb -8192(%rdx), %zmm18, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xf8,0x4a,0x80]
+          vpsubb -8192(%rdx), %zmm18, %zmm25
+
+// CHECK: vpsubb -8256(%rdx), %zmm18, %zmm25
+// CHECK:  encoding: [0x62,0x61,0x6d,0x40,0xf8,0x8a,0xc0,0xdf,0xff,0xff]
+          vpsubb -8256(%rdx), %zmm18, %zmm25
+
+// CHECK: vpsubw %zmm24, %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0x81,0x3d,0x40,0xf9,0xc8]
+          vpsubw %zmm24, %zmm24, %zmm17
+
+// CHECK: vpsubw %zmm24, %zmm24, %zmm17 {%k4}
+// CHECK:  encoding: [0x62,0x81,0x3d,0x44,0xf9,0xc8]
+          vpsubw %zmm24, %zmm24, %zmm17 {%k4}
+
+// CHECK: vpsubw %zmm24, %zmm24, %zmm17 {%k4} {z}
+// CHECK:  encoding: [0x62,0x81,0x3d,0xc4,0xf9,0xc8]
+          vpsubw %zmm24, %zmm24, %zmm17 {%k4} {z}
+
+// CHECK: vpsubw (%rcx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xf9,0x09]
+          vpsubw (%rcx), %zmm24, %zmm17
+
+// CHECK: vpsubw 291(%rax,%r14,8), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x40,0xf9,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpsubw 291(%rax,%r14,8), %zmm24, %zmm17
+
+// CHECK: vpsubw 8128(%rdx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xf9,0x4a,0x7f]
+          vpsubw 8128(%rdx), %zmm24, %zmm17
+
+// CHECK: vpsubw 8192(%rdx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xf9,0x8a,0x00,0x20,0x00,0x00]
+          vpsubw 8192(%rdx), %zmm24, %zmm17
+
+// CHECK: vpsubw -8192(%rdx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xf9,0x4a,0x80]
+          vpsubw -8192(%rdx), %zmm24, %zmm17
+
+// CHECK: vpsubw -8256(%rdx), %zmm24, %zmm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x40,0xf9,0x8a,0xc0,0xdf,0xff,0xff]
+          vpsubw -8256(%rdx), %zmm24, %zmm17
+
+// CHECK: vmovdqu8 %zmm19, %zmm29
+// CHECK:  encoding: [0x62,0x21,0x7f,0x48,0x6f,0xeb]
+          vmovdqu8 %zmm19, %zmm29
+
+// CHECK: vmovdqu8 %zmm19, %zmm29 {%k7}
+// CHECK:  encoding: [0x62,0x21,0x7f,0x4f,0x6f,0xeb]
+          vmovdqu8 %zmm19, %zmm29 {%k7}
+
+// CHECK: vmovdqu8 %zmm19, %zmm29 {%k7} {z}
+// CHECK:  encoding: [0x62,0x21,0x7f,0xcf,0x6f,0xeb]
+          vmovdqu8 %zmm19, %zmm29 {%k7} {z}
+
+// CHECK: vmovdqu8 (%rcx), %zmm29
+// CHECK:  encoding: [0x62,0x61,0x7f,0x48,0x6f,0x29]
+          vmovdqu8 (%rcx), %zmm29
+
+// CHECK: vmovdqu8 291(%rax,%r14,8), %zmm29
+// CHECK:  encoding: [0x62,0x21,0x7f,0x48,0x6f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu8 291(%rax,%r14,8), %zmm29
+
+// CHECK: vmovdqu8 8128(%rdx), %zmm29
+// CHECK:  encoding: [0x62,0x61,0x7f,0x48,0x6f,0x6a,0x7f]
+          vmovdqu8 8128(%rdx), %zmm29
+
+// CHECK: vmovdqu8 8192(%rdx), %zmm29
+// CHECK:  encoding: [0x62,0x61,0x7f,0x48,0x6f,0xaa,0x00,0x20,0x00,0x00]
+          vmovdqu8 8192(%rdx), %zmm29
+
+// CHECK: vmovdqu8 -8192(%rdx), %zmm29
+// CHECK:  encoding: [0x62,0x61,0x7f,0x48,0x6f,0x6a,0x80]
+          vmovdqu8 -8192(%rdx), %zmm29
+
+// CHECK: vmovdqu8 -8256(%rdx), %zmm29
+// CHECK:  encoding: [0x62,0x61,0x7f,0x48,0x6f,0xaa,0xc0,0xdf,0xff,0xff]
+          vmovdqu8 -8256(%rdx), %zmm29
+
+// CHECK: vmovdqu16 %zmm18, %zmm17
+// CHECK:  encoding: [0x62,0xa1,0xff,0x48,0x6f,0xca]
+          vmovdqu16 %zmm18, %zmm17
+
+// CHECK: vmovdqu16 %zmm18, %zmm17 {%k3}
+// CHECK:  encoding: [0x62,0xa1,0xff,0x4b,0x6f,0xca]
+          vmovdqu16 %zmm18, %zmm17 {%k3}
+
+// CHECK: vmovdqu16 %zmm18, %zmm17 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa1,0xff,0xcb,0x6f,0xca]
+          vmovdqu16 %zmm18, %zmm17 {%k3} {z}
+
+// CHECK: vmovdqu16 (%rcx), %zmm17
+// CHECK:  encoding: [0x62,0xe1,0xff,0x48,0x6f,0x09]
+          vmovdqu16 (%rcx), %zmm17
+
+// CHECK: vmovdqu16 291(%rax,%r14,8), %zmm17
+// CHECK:  encoding: [0x62,0xa1,0xff,0x48,0x6f,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu16 291(%rax,%r14,8), %zmm17
+
+// CHECK: vmovdqu16 8128(%rdx), %zmm17
+// CHECK:  encoding: [0x62,0xe1,0xff,0x48,0x6f,0x4a,0x7f]
+          vmovdqu16 8128(%rdx), %zmm17
+
+// CHECK: vmovdqu16 8192(%rdx), %zmm17
+// CHECK:  encoding: [0x62,0xe1,0xff,0x48,0x6f,0x8a,0x00,0x20,0x00,0x00]
+          vmovdqu16 8192(%rdx), %zmm17
+
+// CHECK: vmovdqu16 -8192(%rdx), %zmm17
+// CHECK:  encoding: [0x62,0xe1,0xff,0x48,0x6f,0x4a,0x80]
+          vmovdqu16 -8192(%rdx), %zmm17
+
+// CHECK: vmovdqu16 -8256(%rdx), %zmm17
+// CHECK:  encoding: [0x62,0xe1,0xff,0x48,0x6f,0x8a,0xc0,0xdf,0xff,0xff]
+          vmovdqu16 -8256(%rdx), %zmm17
+
+// CHECK: kandq  %k7, %k5, %k5
+// CHECK:  encoding: [0xc4,0xe1,0xd4,0x41,0xef]
+          kandq  %k7, %k5, %k5
+
+// CHECK: kandd  %k4, %k5, %k5
+// CHECK:  encoding: [0xc4,0xe1,0xd5,0x41,0xec]
+          kandd  %k4, %k5, %k5
+
+// CHECK: kandnq %k4, %k5, %k2
+// CHECK:  encoding: [0xc4,0xe1,0xd4,0x42,0xd4]
+          kandnq %k4, %k5, %k2
+
+// CHECK: kandnd %k6, %k6, %k3
+// CHECK:  encoding: [0xc4,0xe1,0xcd,0x42,0xde]
+          kandnd %k6, %k6, %k3
+
+// CHECK: korq   %k4, %k5, %k4
+// CHECK:  encoding: [0xc4,0xe1,0xd4,0x45,0xe4]
+          korq   %k4, %k5, %k4
+
+// CHECK: kord   %k6, %k6, %k5
+// CHECK:  encoding: [0xc4,0xe1,0xcd,0x45,0xee]
+          kord   %k6, %k6, %k5
+
+// CHECK: kxnorq %k6, %k5, %k2
+// CHECK:  encoding: [0xc4,0xe1,0xd4,0x46,0xd6]
+          kxnorq %k6, %k5, %k2
+
+// CHECK: kxnord %k5, %k3, %k5
+// CHECK:  encoding: [0xc4,0xe1,0xe5,0x46,0xed]
+          kxnord %k5, %k3, %k5
+
+// CHECK: kxorq  %k4, %k3, %k2
+// CHECK:  encoding: [0xc4,0xe1,0xe4,0x47,0xd4]
+          kxorq  %k4, %k3, %k2
+
+// CHECK: kxord  %k6, %k5, %k2
+// CHECK:  encoding: [0xc4,0xe1,0xd5,0x47,0xd6]
+          kxord  %k6, %k5, %k2
+
+// CHECK: knotq  %k6, %k3
+// CHECK:  encoding: [0xc4,0xe1,0xf8,0x44,0xde]
+          knotq  %k6, %k3
+
+// CHECK: knotd  %k4, %k3
+// CHECK:  encoding: [0xc4,0xe1,0xf9,0x44,0xdc]
+          knotd  %k4, %k3
+
+// CHECK: kmovq  %k5, %k2
+// CHECK:  encoding: [0xc4,0xe1,0xf8,0x90,0xd5]
+          kmovq  %k5, %k2
+
+// CHECK: kmovq  (%rcx), %k2
+// CHECK:  encoding: [0xc4,0xe1,0xf8,0x90,0x11]
+          kmovq  (%rcx), %k2
+
+// CHECK: kmovq  291(%rax,%r14,8), %k2
+// CHECK:  encoding: [0xc4,0xa1,0xf8,0x90,0x94,0xf0,0x23,0x01,0x00,0x00]
+          kmovq  291(%rax,%r14,8), %k2
+
+// CHECK: kmovd  %k4, %k5
+// CHECK:  encoding: [0xc4,0xe1,0xf9,0x90,0xec]
+          kmovd  %k4, %k5
+
+// CHECK: kmovd  (%rcx), %k5
+// CHECK:  encoding: [0xc4,0xe1,0xf9,0x90,0x29]
+          kmovd  (%rcx), %k5
+
+// CHECK: kmovd  291(%rax,%r14,8), %k5
+// CHECK:  encoding: [0xc4,0xa1,0xf9,0x90,0xac,0xf0,0x23,0x01,0x00,0x00]
+          kmovd  291(%rax,%r14,8), %k5
+
+// CHECK: kmovq  %k3, (%rcx)
+// CHECK:  encoding: [0xc4,0xe1,0xf8,0x91,0x19]
+          kmovq  %k3, (%rcx)
+
+// CHECK: kmovq  %k3, 291(%rax,%r14,8)
+// CHECK:  encoding: [0xc4,0xa1,0xf8,0x91,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          kmovq  %k3, 291(%rax,%r14,8)
+
+// CHECK: kmovd  %k3, (%rcx)
+// CHECK:  encoding: [0xc4,0xe1,0xf9,0x91,0x19]
+          kmovd  %k3, (%rcx)
+
+// CHECK: kmovd  %k3, 291(%rax,%r14,8)
+// CHECK:  encoding: [0xc4,0xa1,0xf9,0x91,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          kmovd  %k3, 291(%rax,%r14,8)
+
+// CHECK: kmovq  %rax, %k2
+// CHECK:  encoding: [0xc4,0xe1,0xfb,0x92,0xd0]
+          kmovq  %rax, %k2
+
+// CHECK: kmovq  %r8, %k2
+// CHECK:  encoding: [0xc4,0xc1,0xfb,0x92,0xd0]
+          kmovq  %r8, %k2
+
+// CHECK: kmovd  %eax, %k4
+// CHECK:  encoding: [0xc5,0xfb,0x92,0xe0]
+          kmovd  %eax, %k4
+
+// CHECK: kmovd  %ebp, %k4
+// CHECK:  encoding: [0xc5,0xfb,0x92,0xe5]
+          kmovd  %ebp, %k4
+
+// CHECK: kmovd  %r13d, %k4
+// CHECK:  encoding: [0xc4,0xc1,0x7b,0x92,0xe5]
+          kmovd  %r13d, %k4
+
+// CHECK: kmovq  %k3, %rax
+// CHECK:  encoding: [0xc4,0xe1,0xfb,0x93,0xc3]
+          kmovq  %k3, %rax
+
+// CHECK: kmovq  %k3, %r8
+// CHECK:  encoding: [0xc4,0x61,0xfb,0x93,0xc3]
+          kmovq  %k3, %r8
+
+// CHECK: kmovd  %k5, %eax
+// CHECK:  encoding: [0xc5,0xfb,0x93,0xc5]
+          kmovd  %k5, %eax
+
+// CHECK: kmovd  %k5, %ebp
+// CHECK:  encoding: [0xc5,0xfb,0x93,0xed]
+          kmovd  %k5, %ebp
+
+// CHECK: kmovd  %k5, %r13d
+// CHECK:  encoding: [0xc5,0x7b,0x93,0xed]
+          kmovd  %k5, %r13d
+
+// CHECK: vmovdqu8 %zmm18, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x48,0x7f,0x11]
+          vmovdqu8 %zmm18, (%rcx)
+
+// CHECK: vmovdqu8 %zmm18, (%rcx) {%k3}
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x4b,0x7f,0x11]
+          vmovdqu8 %zmm18, (%rcx) {%k3}
+
+// CHECK: vmovdqu8 %zmm18, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7f,0x48,0x7f,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu8 %zmm18, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu8 %zmm18, 8128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x48,0x7f,0x52,0x7f]
+          vmovdqu8 %zmm18, 8128(%rdx)
+
+// CHECK: vmovdqu8 %zmm18, 8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x48,0x7f,0x92,0x00,0x20,0x00,0x00]
+          vmovdqu8 %zmm18, 8192(%rdx)
+
+// CHECK: vmovdqu8 %zmm18, -8192(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x48,0x7f,0x52,0x80]
+          vmovdqu8 %zmm18, -8192(%rdx)
+
+// CHECK: vmovdqu8 %zmm18, -8256(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x48,0x7f,0x92,0xc0,0xdf,0xff,0xff]
+          vmovdqu8 %zmm18, -8256(%rdx)
+
+// CHECK: vmovdqu16 %zmm28, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x48,0x7f,0x21]
+          vmovdqu16 %zmm28, (%rcx)
+
+// CHECK: vmovdqu16 %zmm28, (%rcx) {%k6}
+// CHECK:  encoding: [0x62,0x61,0xff,0x4e,0x7f,0x21]
+          vmovdqu16 %zmm28, (%rcx) {%k6}
+
+// CHECK: vmovdqu16 %zmm28, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0xff,0x48,0x7f,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu16 %zmm28, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu16 %zmm28, 8128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x48,0x7f,0x62,0x7f]
+          vmovdqu16 %zmm28, 8128(%rdx)
+
+// CHECK: vmovdqu16 %zmm28, 8192(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x48,0x7f,0xa2,0x00,0x20,0x00,0x00]
+          vmovdqu16 %zmm28, 8192(%rdx)
+
+// CHECK: vmovdqu16 %zmm28, -8192(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x48,0x7f,0x62,0x80]
+          vmovdqu16 %zmm28, -8192(%rdx)
+
+// CHECK: vmovdqu16 %zmm28, -8256(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x48,0x7f,0xa2,0xc0,0xdf,0xff,0xff]
+          vmovdqu16 %zmm28, -8256(%rdx)
+
+// CHECK: vpcmpb $171, %zmm25, %zmm26, %k3
+// CHECK:  encoding: [0x62,0x93,0x2d,0x40,0x3f,0xd9,0xab]
+          vpcmpb $171, %zmm25, %zmm26, %k3
+
+// CHECK: vpcmpb $171, %zmm25, %zmm26, %k3 {%k7}
+// CHECK:  encoding: [0x62,0x93,0x2d,0x47,0x3f,0xd9,0xab]
+          vpcmpb $171, %zmm25, %zmm26, %k3 {%k7}
+
+// CHECK: vpcmpb $123, %zmm25, %zmm26, %k3
+// CHECK:  encoding: [0x62,0x93,0x2d,0x40,0x3f,0xd9,0x7b]
+          vpcmpb $123, %zmm25, %zmm26, %k3
+
+// CHECK: vpcmpb $123, (%rcx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x19,0x7b]
+          vpcmpb $123, (%rcx), %zmm26, %k3
+
+// CHECK: vpcmpb $123, 291(%rax,%r14,8), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xb3,0x2d,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpb $123, 291(%rax,%r14,8), %zmm26, %k3
+
+// CHECK: vpcmpb $123, 8128(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x7f,0x7b]
+          vpcmpb $123, 8128(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpb $123, 8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x7b]
+          vpcmpb $123, 8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpb $123, -8192(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x5a,0x80,0x7b]
+          vpcmpb $123, -8192(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpb $123, -8256(%rdx), %zmm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0x2d,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x7b]
+          vpcmpb $123, -8256(%rdx), %zmm26, %k3
+
+// CHECK: vpcmpw $171, %zmm25, %zmm29, %k3
+// CHECK:  encoding: [0x62,0x93,0x95,0x40,0x3f,0xd9,0xab]
+          vpcmpw $171, %zmm25, %zmm29, %k3
+
+// CHECK: vpcmpw $171, %zmm25, %zmm29, %k3 {%k6}
+// CHECK:  encoding: [0x62,0x93,0x95,0x46,0x3f,0xd9,0xab]
+          vpcmpw $171, %zmm25, %zmm29, %k3 {%k6}
+
+// CHECK: vpcmpw $123, %zmm25, %zmm29, %k3
+// CHECK:  encoding: [0x62,0x93,0x95,0x40,0x3f,0xd9,0x7b]
+          vpcmpw $123, %zmm25, %zmm29, %k3
+
+// CHECK: vpcmpw $123, (%rcx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x19,0x7b]
+          vpcmpw $123, (%rcx), %zmm29, %k3
+
+// CHECK: vpcmpw $123, 291(%rax,%r14,8), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xb3,0x95,0x40,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpw $123, 291(%rax,%r14,8), %zmm29, %k3
+
+// CHECK: vpcmpw $123, 8128(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x7f,0x7b]
+          vpcmpw $123, 8128(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpw $123, 8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0x00,0x20,0x00,0x00,0x7b]
+          vpcmpw $123, 8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpw $123, -8192(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x5a,0x80,0x7b]
+          vpcmpw $123, -8192(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpw $123, -8256(%rdx), %zmm29, %k3
+// CHECK:  encoding: [0x62,0xf3,0x95,0x40,0x3f,0x9a,0xc0,0xdf,0xff,0xff,0x7b]
+          vpcmpw $123, -8256(%rdx), %zmm29, %k3
+
+// CHECK: vpcmpub $171, %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0xab]
+          vpcmpub $171, %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpub $171, %zmm22, %zmm29, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0x15,0x47,0x3e,0xe6,0xab]
+          vpcmpub $171, %zmm22, %zmm29, %k4 {%k7}
+
+// CHECK: vpcmpub $123, %zmm22, %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xe6,0x7b]
+          vpcmpub $123, %zmm22, %zmm29, %k4
+
+// CHECK: vpcmpub $123, (%rcx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x21,0x7b]
+          vpcmpub $123, (%rcx), %zmm29, %k4
+
+// CHECK: vpcmpub $123, 291(%rax,%r14,8), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xb3,0x15,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpub $123, 291(%rax,%r14,8), %zmm29, %k4
+
+// CHECK: vpcmpub $123, 8128(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x7f,0x7b]
+          vpcmpub $123, 8128(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpub $123, 8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x7b]
+          vpcmpub $123, 8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpub $123, -8192(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0x62,0x80,0x7b]
+          vpcmpub $123, -8192(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpub $123, -8256(%rdx), %zmm29, %k4
+// CHECK:  encoding: [0x62,0xf3,0x15,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x7b]
+          vpcmpub $123, -8256(%rdx), %zmm29, %k4
+
+// CHECK: vpcmpuw $171, %zmm22, %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xe6,0xab]
+          vpcmpuw $171, %zmm22, %zmm22, %k4
+
+// CHECK: vpcmpuw $171, %zmm22, %zmm22, %k4 {%k7}
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x47,0x3e,0xe6,0xab]
+          vpcmpuw $171, %zmm22, %zmm22, %k4 {%k7}
+
+// CHECK: vpcmpuw $123, %zmm22, %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xe6,0x7b]
+          vpcmpuw $123, %zmm22, %zmm22, %k4
+
+// CHECK: vpcmpuw $123, (%rcx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x21,0x7b]
+          vpcmpuw $123, (%rcx), %zmm22, %k4
+
+// CHECK: vpcmpuw $123, 291(%rax,%r14,8), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x40,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpuw $123, 291(%rax,%r14,8), %zmm22, %k4
+
+// CHECK: vpcmpuw $123, 8128(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x7f,0x7b]
+          vpcmpuw $123, 8128(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpuw $123, 8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0x00,0x20,0x00,0x00,0x7b]
+          vpcmpuw $123, 8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpuw $123, -8192(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0x62,0x80,0x7b]
+          vpcmpuw $123, -8192(%rdx), %zmm22, %k4
+
+// CHECK: vpcmpuw $123, -8256(%rdx), %zmm22, %k4
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x40,0x3e,0xa2,0xc0,0xdf,0xff,0xff,0x7b]
+          vpcmpuw $123, -8256(%rdx), %zmm22, %k4

diff --git a/test/MC/X86/x86-64-avx512bw_vl.s b/test/MC/X86/x86-64-avx512bw_vl.s
new file mode 100644
index 0000000..c3761de
--- /dev/null
+++ b/test/MC/X86/x86-64-avx512bw_vl.s

@@ -0,0 +1,1737 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl  --show-encoding %s | FileCheck %s
+
+// CHECK: vpaddb %xmm22, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x21,0x75,0x00,0xfc,0xd6]
+          vpaddb %xmm22, %xmm17, %xmm26
+
+// CHECK: vpaddb %xmm22, %xmm17, %xmm26 {%k5}
+// CHECK:  encoding: [0x62,0x21,0x75,0x05,0xfc,0xd6]
+          vpaddb %xmm22, %xmm17, %xmm26 {%k5}
+
+// CHECK: vpaddb %xmm22, %xmm17, %xmm26 {%k5} {z}
+// CHECK:  encoding: [0x62,0x21,0x75,0x85,0xfc,0xd6]
+          vpaddb %xmm22, %xmm17, %xmm26 {%k5} {z}
+
+// CHECK: vpaddb (%rcx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xfc,0x11]
+          vpaddb (%rcx), %xmm17, %xmm26
+
+// CHECK: vpaddb 291(%rax,%r14,8), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x21,0x75,0x00,0xfc,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpaddb 291(%rax,%r14,8), %xmm17, %xmm26
+
+// CHECK: vpaddb 2032(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xfc,0x52,0x7f]
+          vpaddb 2032(%rdx), %xmm17, %xmm26
+
+// CHECK: vpaddb 2048(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xfc,0x92,0x00,0x08,0x00,0x00]
+          vpaddb 2048(%rdx), %xmm17, %xmm26
+
+// CHECK: vpaddb -2048(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xfc,0x52,0x80]
+          vpaddb -2048(%rdx), %xmm17, %xmm26
+
+// CHECK: vpaddb -2064(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xfc,0x92,0xf0,0xf7,0xff,0xff]
+          vpaddb -2064(%rdx), %xmm17, %xmm26
+
+// CHECK: vpaddb %ymm28, %ymm27, %ymm26
+// CHECK:  encoding: [0x62,0x01,0x25,0x20,0xfc,0xd4]
+          vpaddb %ymm28, %ymm27, %ymm26
+
+// CHECK: vpaddb %ymm28, %ymm27, %ymm26 {%k3}
+// CHECK:  encoding: [0x62,0x01,0x25,0x23,0xfc,0xd4]
+          vpaddb %ymm28, %ymm27, %ymm26 {%k3}
+
+// CHECK: vpaddb %ymm28, %ymm27, %ymm26 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0x25,0xa3,0xfc,0xd4]
+          vpaddb %ymm28, %ymm27, %ymm26 {%k3} {z}
+
+// CHECK: vpaddb (%rcx), %ymm27, %ymm26
+// CHECK:  encoding: [0x62,0x61,0x25,0x20,0xfc,0x11]
+          vpaddb (%rcx), %ymm27, %ymm26
+
+// CHECK: vpaddb 291(%rax,%r14,8), %ymm27, %ymm26
+// CHECK:  encoding: [0x62,0x21,0x25,0x20,0xfc,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpaddb 291(%rax,%r14,8), %ymm27, %ymm26
+
+// CHECK: vpaddb 4064(%rdx), %ymm27, %ymm26
+// CHECK:  encoding: [0x62,0x61,0x25,0x20,0xfc,0x52,0x7f]
+          vpaddb 4064(%rdx), %ymm27, %ymm26
+
+// CHECK: vpaddb 4096(%rdx), %ymm27, %ymm26
+// CHECK:  encoding: [0x62,0x61,0x25,0x20,0xfc,0x92,0x00,0x10,0x00,0x00]
+          vpaddb 4096(%rdx), %ymm27, %ymm26
+
+// CHECK: vpaddb -4096(%rdx), %ymm27, %ymm26
+// CHECK:  encoding: [0x62,0x61,0x25,0x20,0xfc,0x52,0x80]
+          vpaddb -4096(%rdx), %ymm27, %ymm26
+
+// CHECK: vpaddb -4128(%rdx), %ymm27, %ymm26
+// CHECK:  encoding: [0x62,0x61,0x25,0x20,0xfc,0x92,0xe0,0xef,0xff,0xff]
+          vpaddb -4128(%rdx), %ymm27, %ymm26
+
+// CHECK: vpaddw %xmm18, %xmm17, %xmm18
+// CHECK:  encoding: [0x62,0xa1,0x75,0x00,0xfd,0xd2]
+          vpaddw %xmm18, %xmm17, %xmm18
+
+// CHECK: vpaddw %xmm18, %xmm17, %xmm18 {%k1}
+// CHECK:  encoding: [0x62,0xa1,0x75,0x01,0xfd,0xd2]
+          vpaddw %xmm18, %xmm17, %xmm18 {%k1}
+
+// CHECK: vpaddw %xmm18, %xmm17, %xmm18 {%k1} {z}
+// CHECK:  encoding: [0x62,0xa1,0x75,0x81,0xfd,0xd2]
+          vpaddw %xmm18, %xmm17, %xmm18 {%k1} {z}
+
+// CHECK: vpaddw (%rcx), %xmm17, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xfd,0x11]
+          vpaddw (%rcx), %xmm17, %xmm18
+
+// CHECK: vpaddw 291(%rax,%r14,8), %xmm17, %xmm18
+// CHECK:  encoding: [0x62,0xa1,0x75,0x00,0xfd,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpaddw 291(%rax,%r14,8), %xmm17, %xmm18
+
+// CHECK: vpaddw 2032(%rdx), %xmm17, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xfd,0x52,0x7f]
+          vpaddw 2032(%rdx), %xmm17, %xmm18
+
+// CHECK: vpaddw 2048(%rdx), %xmm17, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xfd,0x92,0x00,0x08,0x00,0x00]
+          vpaddw 2048(%rdx), %xmm17, %xmm18
+
+// CHECK: vpaddw -2048(%rdx), %xmm17, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xfd,0x52,0x80]
+          vpaddw -2048(%rdx), %xmm17, %xmm18
+
+// CHECK: vpaddw -2064(%rdx), %xmm17, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xfd,0x92,0xf0,0xf7,0xff,0xff]
+          vpaddw -2064(%rdx), %xmm17, %xmm18
+
+// CHECK: vpaddw %ymm26, %ymm21, %ymm23
+// CHECK:  encoding: [0x62,0x81,0x55,0x20,0xfd,0xfa]
+          vpaddw %ymm26, %ymm21, %ymm23
+
+// CHECK: vpaddw %ymm26, %ymm21, %ymm23 {%k7}
+// CHECK:  encoding: [0x62,0x81,0x55,0x27,0xfd,0xfa]
+          vpaddw %ymm26, %ymm21, %ymm23 {%k7}
+
+// CHECK: vpaddw %ymm26, %ymm21, %ymm23 {%k7} {z}
+// CHECK:  encoding: [0x62,0x81,0x55,0xa7,0xfd,0xfa]
+          vpaddw %ymm26, %ymm21, %ymm23 {%k7} {z}
+
+// CHECK: vpaddw (%rcx), %ymm21, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xfd,0x39]
+          vpaddw (%rcx), %ymm21, %ymm23
+
+// CHECK: vpaddw 291(%rax,%r14,8), %ymm21, %ymm23
+// CHECK:  encoding: [0x62,0xa1,0x55,0x20,0xfd,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vpaddw 291(%rax,%r14,8), %ymm21, %ymm23
+
+// CHECK: vpaddw 4064(%rdx), %ymm21, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xfd,0x7a,0x7f]
+          vpaddw 4064(%rdx), %ymm21, %ymm23
+
+// CHECK: vpaddw 4096(%rdx), %ymm21, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xfd,0xba,0x00,0x10,0x00,0x00]
+          vpaddw 4096(%rdx), %ymm21, %ymm23
+
+// CHECK: vpaddw -4096(%rdx), %ymm21, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xfd,0x7a,0x80]
+          vpaddw -4096(%rdx), %ymm21, %ymm23
+
+// CHECK: vpaddw -4128(%rdx), %ymm21, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xfd,0xba,0xe0,0xef,0xff,0xff]
+          vpaddw -4128(%rdx), %ymm21, %ymm23
+
+// CHECK: vpcmpeqb %xmm21, %xmm21, %k4
+// CHECK:  encoding: [0x62,0xb1,0x55,0x00,0x74,0xe5]
+          vpcmpeqb %xmm21, %xmm21, %k4
+
+// CHECK: vpcmpeqb %xmm21, %xmm21, %k4 {%k3}
+// CHECK:  encoding: [0x62,0xb1,0x55,0x03,0x74,0xe5]
+          vpcmpeqb %xmm21, %xmm21, %k4 {%k3}
+
+// CHECK: vpcmpeqb (%rcx), %xmm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x00,0x74,0x21]
+          vpcmpeqb (%rcx), %xmm21, %k4
+
+// CHECK: vpcmpeqb 291(%rax,%r14,8), %xmm21, %k4
+// CHECK:  encoding: [0x62,0xb1,0x55,0x00,0x74,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqb 291(%rax,%r14,8), %xmm21, %k4
+
+// CHECK: vpcmpeqb 2032(%rdx), %xmm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x00,0x74,0x62,0x7f]
+          vpcmpeqb 2032(%rdx), %xmm21, %k4
+
+// CHECK: vpcmpeqb 2048(%rdx), %xmm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x00,0x74,0xa2,0x00,0x08,0x00,0x00]
+          vpcmpeqb 2048(%rdx), %xmm21, %k4
+
+// CHECK: vpcmpeqb -2048(%rdx), %xmm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x00,0x74,0x62,0x80]
+          vpcmpeqb -2048(%rdx), %xmm21, %k4
+
+// CHECK: vpcmpeqb -2064(%rdx), %xmm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x00,0x74,0xa2,0xf0,0xf7,0xff,0xff]
+          vpcmpeqb -2064(%rdx), %xmm21, %k4
+
+// CHECK: vpcmpeqb %ymm18, %ymm21, %k4
+// CHECK:  encoding: [0x62,0xb1,0x55,0x20,0x74,0xe2]
+          vpcmpeqb %ymm18, %ymm21, %k4
+
+// CHECK: vpcmpeqb %ymm18, %ymm21, %k4 {%k1}
+// CHECK:  encoding: [0x62,0xb1,0x55,0x21,0x74,0xe2]
+          vpcmpeqb %ymm18, %ymm21, %k4 {%k1}
+
+// CHECK: vpcmpeqb (%rcx), %ymm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x20,0x74,0x21]
+          vpcmpeqb (%rcx), %ymm21, %k4
+
+// CHECK: vpcmpeqb 291(%rax,%r14,8), %ymm21, %k4
+// CHECK:  encoding: [0x62,0xb1,0x55,0x20,0x74,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqb 291(%rax,%r14,8), %ymm21, %k4
+
+// CHECK: vpcmpeqb 4064(%rdx), %ymm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x20,0x74,0x62,0x7f]
+          vpcmpeqb 4064(%rdx), %ymm21, %k4
+
+// CHECK: vpcmpeqb 4096(%rdx), %ymm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x20,0x74,0xa2,0x00,0x10,0x00,0x00]
+          vpcmpeqb 4096(%rdx), %ymm21, %k4
+
+// CHECK: vpcmpeqb -4096(%rdx), %ymm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x20,0x74,0x62,0x80]
+          vpcmpeqb -4096(%rdx), %ymm21, %k4
+
+// CHECK: vpcmpeqb -4128(%rdx), %ymm21, %k4
+// CHECK:  encoding: [0x62,0xf1,0x55,0x20,0x74,0xa2,0xe0,0xef,0xff,0xff]
+          vpcmpeqb -4128(%rdx), %ymm21, %k4
+
+// CHECK: vpcmpeqw %xmm27, %xmm30, %k3
+// CHECK:  encoding: [0x62,0x91,0x0d,0x00,0x75,0xdb]
+          vpcmpeqw %xmm27, %xmm30, %k3
+
+// CHECK: vpcmpeqw %xmm27, %xmm30, %k3 {%k1}
+// CHECK:  encoding: [0x62,0x91,0x0d,0x01,0x75,0xdb]
+          vpcmpeqw %xmm27, %xmm30, %k3 {%k1}
+
+// CHECK: vpcmpeqw (%rcx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x75,0x19]
+          vpcmpeqw (%rcx), %xmm30, %k3
+
+// CHECK: vpcmpeqw 291(%rax,%r14,8), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb1,0x0d,0x00,0x75,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqw 291(%rax,%r14,8), %xmm30, %k3
+
+// CHECK: vpcmpeqw 2032(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x75,0x5a,0x7f]
+          vpcmpeqw 2032(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpeqw 2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x75,0x9a,0x00,0x08,0x00,0x00]
+          vpcmpeqw 2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpeqw -2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x75,0x5a,0x80]
+          vpcmpeqw -2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpeqw -2064(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x75,0x9a,0xf0,0xf7,0xff,0xff]
+          vpcmpeqw -2064(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpeqw %ymm29, %ymm20, %k2
+// CHECK:  encoding: [0x62,0x91,0x5d,0x20,0x75,0xd5]
+          vpcmpeqw %ymm29, %ymm20, %k2
+
+// CHECK: vpcmpeqw %ymm29, %ymm20, %k2 {%k5}
+// CHECK:  encoding: [0x62,0x91,0x5d,0x25,0x75,0xd5]
+          vpcmpeqw %ymm29, %ymm20, %k2 {%k5}
+
+// CHECK: vpcmpeqw (%rcx), %ymm20, %k2
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x75,0x11]
+          vpcmpeqw (%rcx), %ymm20, %k2
+
+// CHECK: vpcmpeqw 291(%rax,%r14,8), %ymm20, %k2
+// CHECK:  encoding: [0x62,0xb1,0x5d,0x20,0x75,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqw 291(%rax,%r14,8), %ymm20, %k2
+
+// CHECK: vpcmpeqw 4064(%rdx), %ymm20, %k2
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x75,0x52,0x7f]
+          vpcmpeqw 4064(%rdx), %ymm20, %k2
+
+// CHECK: vpcmpeqw 4096(%rdx), %ymm20, %k2
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x75,0x92,0x00,0x10,0x00,0x00]
+          vpcmpeqw 4096(%rdx), %ymm20, %k2
+
+// CHECK: vpcmpeqw -4096(%rdx), %ymm20, %k2
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x75,0x52,0x80]
+          vpcmpeqw -4096(%rdx), %ymm20, %k2
+
+// CHECK: vpcmpeqw -4128(%rdx), %ymm20, %k2
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x75,0x92,0xe0,0xef,0xff,0xff]
+          vpcmpeqw -4128(%rdx), %ymm20, %k2
+
+// CHECK: vpcmpgtb %xmm17, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb1,0x0d,0x00,0x64,0xd9]
+          vpcmpgtb %xmm17, %xmm30, %k3
+
+// CHECK: vpcmpgtb %xmm17, %xmm30, %k3 {%k7}
+// CHECK:  encoding: [0x62,0xb1,0x0d,0x07,0x64,0xd9]
+          vpcmpgtb %xmm17, %xmm30, %k3 {%k7}
+
+// CHECK: vpcmpgtb (%rcx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x64,0x19]
+          vpcmpgtb (%rcx), %xmm30, %k3
+
+// CHECK: vpcmpgtb 291(%rax,%r14,8), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb1,0x0d,0x00,0x64,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtb 291(%rax,%r14,8), %xmm30, %k3
+
+// CHECK: vpcmpgtb 2032(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x64,0x5a,0x7f]
+          vpcmpgtb 2032(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpgtb 2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x64,0x9a,0x00,0x08,0x00,0x00]
+          vpcmpgtb 2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpgtb -2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x64,0x5a,0x80]
+          vpcmpgtb -2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpgtb -2064(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf1,0x0d,0x00,0x64,0x9a,0xf0,0xf7,0xff,0xff]
+          vpcmpgtb -2064(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpgtb %ymm17, %ymm17, %k2
+// CHECK:  encoding: [0x62,0xb1,0x75,0x20,0x64,0xd1]
+          vpcmpgtb %ymm17, %ymm17, %k2
+
+// CHECK: vpcmpgtb %ymm17, %ymm17, %k2 {%k4}
+// CHECK:  encoding: [0x62,0xb1,0x75,0x24,0x64,0xd1]
+          vpcmpgtb %ymm17, %ymm17, %k2 {%k4}
+
+// CHECK: vpcmpgtb (%rcx), %ymm17, %k2
+// CHECK:  encoding: [0x62,0xf1,0x75,0x20,0x64,0x11]
+          vpcmpgtb (%rcx), %ymm17, %k2
+
+// CHECK: vpcmpgtb 291(%rax,%r14,8), %ymm17, %k2
+// CHECK:  encoding: [0x62,0xb1,0x75,0x20,0x64,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtb 291(%rax,%r14,8), %ymm17, %k2
+
+// CHECK: vpcmpgtb 4064(%rdx), %ymm17, %k2
+// CHECK:  encoding: [0x62,0xf1,0x75,0x20,0x64,0x52,0x7f]
+          vpcmpgtb 4064(%rdx), %ymm17, %k2
+
+// CHECK: vpcmpgtb 4096(%rdx), %ymm17, %k2
+// CHECK:  encoding: [0x62,0xf1,0x75,0x20,0x64,0x92,0x00,0x10,0x00,0x00]
+          vpcmpgtb 4096(%rdx), %ymm17, %k2
+
+// CHECK: vpcmpgtb -4096(%rdx), %ymm17, %k2
+// CHECK:  encoding: [0x62,0xf1,0x75,0x20,0x64,0x52,0x80]
+          vpcmpgtb -4096(%rdx), %ymm17, %k2
+
+// CHECK: vpcmpgtb -4128(%rdx), %ymm17, %k2
+// CHECK:  encoding: [0x62,0xf1,0x75,0x20,0x64,0x92,0xe0,0xef,0xff,0xff]
+          vpcmpgtb -4128(%rdx), %ymm17, %k2
+
+// CHECK: vpcmpgtw %xmm22, %xmm28, %k2
+// CHECK:  encoding: [0x62,0xb1,0x1d,0x00,0x65,0xd6]
+          vpcmpgtw %xmm22, %xmm28, %k2
+
+// CHECK: vpcmpgtw %xmm22, %xmm28, %k2 {%k7}
+// CHECK:  encoding: [0x62,0xb1,0x1d,0x07,0x65,0xd6]
+          vpcmpgtw %xmm22, %xmm28, %k2 {%k7}
+
+// CHECK: vpcmpgtw (%rcx), %xmm28, %k2
+// CHECK:  encoding: [0x62,0xf1,0x1d,0x00,0x65,0x11]
+          vpcmpgtw (%rcx), %xmm28, %k2
+
+// CHECK: vpcmpgtw 291(%rax,%r14,8), %xmm28, %k2
+// CHECK:  encoding: [0x62,0xb1,0x1d,0x00,0x65,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtw 291(%rax,%r14,8), %xmm28, %k2
+
+// CHECK: vpcmpgtw 2032(%rdx), %xmm28, %k2
+// CHECK:  encoding: [0x62,0xf1,0x1d,0x00,0x65,0x52,0x7f]
+          vpcmpgtw 2032(%rdx), %xmm28, %k2
+
+// CHECK: vpcmpgtw 2048(%rdx), %xmm28, %k2
+// CHECK:  encoding: [0x62,0xf1,0x1d,0x00,0x65,0x92,0x00,0x08,0x00,0x00]
+          vpcmpgtw 2048(%rdx), %xmm28, %k2
+
+// CHECK: vpcmpgtw -2048(%rdx), %xmm28, %k2
+// CHECK:  encoding: [0x62,0xf1,0x1d,0x00,0x65,0x52,0x80]
+          vpcmpgtw -2048(%rdx), %xmm28, %k2
+
+// CHECK: vpcmpgtw -2064(%rdx), %xmm28, %k2
+// CHECK:  encoding: [0x62,0xf1,0x1d,0x00,0x65,0x92,0xf0,0xf7,0xff,0xff]
+          vpcmpgtw -2064(%rdx), %xmm28, %k2
+
+// CHECK: vpcmpgtw %ymm26, %ymm20, %k5
+// CHECK:  encoding: [0x62,0x91,0x5d,0x20,0x65,0xea]
+          vpcmpgtw %ymm26, %ymm20, %k5
+
+// CHECK: vpcmpgtw %ymm26, %ymm20, %k5 {%k2}
+// CHECK:  encoding: [0x62,0x91,0x5d,0x22,0x65,0xea]
+          vpcmpgtw %ymm26, %ymm20, %k5 {%k2}
+
+// CHECK: vpcmpgtw (%rcx), %ymm20, %k5
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x65,0x29]
+          vpcmpgtw (%rcx), %ymm20, %k5
+
+// CHECK: vpcmpgtw 291(%rax,%r14,8), %ymm20, %k5
+// CHECK:  encoding: [0x62,0xb1,0x5d,0x20,0x65,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtw 291(%rax,%r14,8), %ymm20, %k5
+
+// CHECK: vpcmpgtw 4064(%rdx), %ymm20, %k5
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x65,0x6a,0x7f]
+          vpcmpgtw 4064(%rdx), %ymm20, %k5
+
+// CHECK: vpcmpgtw 4096(%rdx), %ymm20, %k5
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x65,0xaa,0x00,0x10,0x00,0x00]
+          vpcmpgtw 4096(%rdx), %ymm20, %k5
+
+// CHECK: vpcmpgtw -4096(%rdx), %ymm20, %k5
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x65,0x6a,0x80]
+          vpcmpgtw -4096(%rdx), %ymm20, %k5
+
+// CHECK: vpcmpgtw -4128(%rdx), %ymm20, %k5
+// CHECK:  encoding: [0x62,0xf1,0x5d,0x20,0x65,0xaa,0xe0,0xef,0xff,0xff]
+          vpcmpgtw -4128(%rdx), %ymm20, %k5
+
+// CHECK: vpcmpb $171, %xmm17, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb3,0x0d,0x00,0x3f,0xd9,0xab]
+          vpcmpb $171, %xmm17, %xmm30, %k3
+
+// CHECK: vpcmpb $171, %xmm17, %xmm30, %k3 {%k1}
+// CHECK:  encoding: [0x62,0xb3,0x0d,0x01,0x3f,0xd9,0xab]
+          vpcmpb $171, %xmm17, %xmm30, %k3 {%k1}
+
+// CHECK: vpcmpb $123, %xmm17, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb3,0x0d,0x00,0x3f,0xd9,0x7b]
+          vpcmpb $123, %xmm17, %xmm30, %k3
+
+// CHECK: vpcmpb $123, (%rcx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x00,0x3f,0x19,0x7b]
+          vpcmpb $123, (%rcx), %xmm30, %k3
+
+// CHECK: vpcmpb $123, 291(%rax,%r14,8), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb3,0x0d,0x00,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpb $123, 291(%rax,%r14,8), %xmm30, %k3
+
+// CHECK: vpcmpb $123, 2032(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x00,0x3f,0x5a,0x7f,0x7b]
+          vpcmpb $123, 2032(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpb $123, 2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x00,0x3f,0x9a,0x00,0x08,0x00,0x00,0x7b]
+          vpcmpb $123, 2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpb $123, -2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x00,0x3f,0x5a,0x80,0x7b]
+          vpcmpb $123, -2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpb $123, -2064(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x00,0x3f,0x9a,0xf0,0xf7,0xff,0xff,0x7b]
+          vpcmpb $123, -2064(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpb $171, %ymm19, %ymm19, %k5
+// CHECK:  encoding: [0x62,0xb3,0x65,0x20,0x3f,0xeb,0xab]
+          vpcmpb $171, %ymm19, %ymm19, %k5
+
+// CHECK: vpcmpb $171, %ymm19, %ymm19, %k5 {%k4}
+// CHECK:  encoding: [0x62,0xb3,0x65,0x24,0x3f,0xeb,0xab]
+          vpcmpb $171, %ymm19, %ymm19, %k5 {%k4}
+
+// CHECK: vpcmpb $123, %ymm19, %ymm19, %k5
+// CHECK:  encoding: [0x62,0xb3,0x65,0x20,0x3f,0xeb,0x7b]
+          vpcmpb $123, %ymm19, %ymm19, %k5
+
+// CHECK: vpcmpb $123, (%rcx), %ymm19, %k5
+// CHECK:  encoding: [0x62,0xf3,0x65,0x20,0x3f,0x29,0x7b]
+          vpcmpb $123, (%rcx), %ymm19, %k5
+
+// CHECK: vpcmpb $123, 291(%rax,%r14,8), %ymm19, %k5
+// CHECK:  encoding: [0x62,0xb3,0x65,0x20,0x3f,0xac,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpb $123, 291(%rax,%r14,8), %ymm19, %k5
+
+// CHECK: vpcmpb $123, 4064(%rdx), %ymm19, %k5
+// CHECK:  encoding: [0x62,0xf3,0x65,0x20,0x3f,0x6a,0x7f,0x7b]
+          vpcmpb $123, 4064(%rdx), %ymm19, %k5
+
+// CHECK: vpcmpb $123, 4096(%rdx), %ymm19, %k5
+// CHECK:  encoding: [0x62,0xf3,0x65,0x20,0x3f,0xaa,0x00,0x10,0x00,0x00,0x7b]
+          vpcmpb $123, 4096(%rdx), %ymm19, %k5
+
+// CHECK: vpcmpb $123, -4096(%rdx), %ymm19, %k5
+// CHECK:  encoding: [0x62,0xf3,0x65,0x20,0x3f,0x6a,0x80,0x7b]
+          vpcmpb $123, -4096(%rdx), %ymm19, %k5
+
+// CHECK: vpcmpb $123, -4128(%rdx), %ymm19, %k5
+// CHECK:  encoding: [0x62,0xf3,0x65,0x20,0x3f,0xaa,0xe0,0xef,0xff,0xff,0x7b]
+          vpcmpb $123, -4128(%rdx), %ymm19, %k5
+
+// CHECK: vpcmpw $171, %xmm22, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x00,0x3f,0xde,0xab]
+          vpcmpw $171, %xmm22, %xmm30, %k3
+
+// CHECK: vpcmpw $171, %xmm22, %xmm30, %k3 {%k6}
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x06,0x3f,0xde,0xab]
+          vpcmpw $171, %xmm22, %xmm30, %k3 {%k6}
+
+// CHECK: vpcmpw $123, %xmm22, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x00,0x3f,0xde,0x7b]
+          vpcmpw $123, %xmm22, %xmm30, %k3
+
+// CHECK: vpcmpw $123, (%rcx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x00,0x3f,0x19,0x7b]
+          vpcmpw $123, (%rcx), %xmm30, %k3
+
+// CHECK: vpcmpw $123, 291(%rax,%r14,8), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb3,0x8d,0x00,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpw $123, 291(%rax,%r14,8), %xmm30, %k3
+
+// CHECK: vpcmpw $123, 2032(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x00,0x3f,0x5a,0x7f,0x7b]
+          vpcmpw $123, 2032(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpw $123, 2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x00,0x3f,0x9a,0x00,0x08,0x00,0x00,0x7b]
+          vpcmpw $123, 2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpw $123, -2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x00,0x3f,0x5a,0x80,0x7b]
+          vpcmpw $123, -2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpw $123, -2064(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x8d,0x00,0x3f,0x9a,0xf0,0xf7,0xff,0xff,0x7b]
+          vpcmpw $123, -2064(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpw $171, %ymm18, %ymm26, %k3
+// CHECK:  encoding: [0x62,0xb3,0xad,0x20,0x3f,0xda,0xab]
+          vpcmpw $171, %ymm18, %ymm26, %k3
+
+// CHECK: vpcmpw $171, %ymm18, %ymm26, %k3 {%k3}
+// CHECK:  encoding: [0x62,0xb3,0xad,0x23,0x3f,0xda,0xab]
+          vpcmpw $171, %ymm18, %ymm26, %k3 {%k3}
+
+// CHECK: vpcmpw $123, %ymm18, %ymm26, %k3
+// CHECK:  encoding: [0x62,0xb3,0xad,0x20,0x3f,0xda,0x7b]
+          vpcmpw $123, %ymm18, %ymm26, %k3
+
+// CHECK: vpcmpw $123, (%rcx), %ymm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x3f,0x19,0x7b]
+          vpcmpw $123, (%rcx), %ymm26, %k3
+
+// CHECK: vpcmpw $123, 291(%rax,%r14,8), %ymm26, %k3
+// CHECK:  encoding: [0x62,0xb3,0xad,0x20,0x3f,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpw $123, 291(%rax,%r14,8), %ymm26, %k3
+
+// CHECK: vpcmpw $123, 4064(%rdx), %ymm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x3f,0x5a,0x7f,0x7b]
+          vpcmpw $123, 4064(%rdx), %ymm26, %k3
+
+// CHECK: vpcmpw $123, 4096(%rdx), %ymm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x3f,0x9a,0x00,0x10,0x00,0x00,0x7b]
+          vpcmpw $123, 4096(%rdx), %ymm26, %k3
+
+// CHECK: vpcmpw $123, -4096(%rdx), %ymm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x3f,0x5a,0x80,0x7b]
+          vpcmpw $123, -4096(%rdx), %ymm26, %k3
+
+// CHECK: vpcmpw $123, -4128(%rdx), %ymm26, %k3
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x3f,0x9a,0xe0,0xef,0xff,0xff,0x7b]
+          vpcmpw $123, -4128(%rdx), %ymm26, %k3
+
+// CHECK: vpcmpub $171, %xmm21, %xmm22, %k5
+// CHECK:  encoding: [0x62,0xb3,0x4d,0x00,0x3e,0xed,0xab]
+          vpcmpub $171, %xmm21, %xmm22, %k5
+
+// CHECK: vpcmpub $171, %xmm21, %xmm22, %k5 {%k3}
+// CHECK:  encoding: [0x62,0xb3,0x4d,0x03,0x3e,0xed,0xab]
+          vpcmpub $171, %xmm21, %xmm22, %k5 {%k3}
+
+// CHECK: vpcmpub $123, %xmm21, %xmm22, %k5
+// CHECK:  encoding: [0x62,0xb3,0x4d,0x00,0x3e,0xed,0x7b]
+          vpcmpub $123, %xmm21, %xmm22, %k5
+
+// CHECK: vpcmpub $123, (%rcx), %xmm22, %k5
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x3e,0x29,0x7b]
+          vpcmpub $123, (%rcx), %xmm22, %k5
+
+// CHECK: vpcmpub $123, 291(%rax,%r14,8), %xmm22, %k5
+// CHECK:  encoding: [0x62,0xb3,0x4d,0x00,0x3e,0xac,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpub $123, 291(%rax,%r14,8), %xmm22, %k5
+
+// CHECK: vpcmpub $123, 2032(%rdx), %xmm22, %k5
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x3e,0x6a,0x7f,0x7b]
+          vpcmpub $123, 2032(%rdx), %xmm22, %k5
+
+// CHECK: vpcmpub $123, 2048(%rdx), %xmm22, %k5
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x3e,0xaa,0x00,0x08,0x00,0x00,0x7b]
+          vpcmpub $123, 2048(%rdx), %xmm22, %k5
+
+// CHECK: vpcmpub $123, -2048(%rdx), %xmm22, %k5
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x3e,0x6a,0x80,0x7b]
+          vpcmpub $123, -2048(%rdx), %xmm22, %k5
+
+// CHECK: vpcmpub $123, -2064(%rdx), %xmm22, %k5
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x3e,0xaa,0xf0,0xf7,0xff,0xff,0x7b]
+          vpcmpub $123, -2064(%rdx), %xmm22, %k5
+
+// CHECK: vpcmpub $171, %ymm21, %ymm23, %k2
+// CHECK:  encoding: [0x62,0xb3,0x45,0x20,0x3e,0xd5,0xab]
+          vpcmpub $171, %ymm21, %ymm23, %k2
+
+// CHECK: vpcmpub $171, %ymm21, %ymm23, %k2 {%k2}
+// CHECK:  encoding: [0x62,0xb3,0x45,0x22,0x3e,0xd5,0xab]
+          vpcmpub $171, %ymm21, %ymm23, %k2 {%k2}
+
+// CHECK: vpcmpub $123, %ymm21, %ymm23, %k2
+// CHECK:  encoding: [0x62,0xb3,0x45,0x20,0x3e,0xd5,0x7b]
+          vpcmpub $123, %ymm21, %ymm23, %k2
+
+// CHECK: vpcmpub $123, (%rcx), %ymm23, %k2
+// CHECK:  encoding: [0x62,0xf3,0x45,0x20,0x3e,0x11,0x7b]
+          vpcmpub $123, (%rcx), %ymm23, %k2
+
+// CHECK: vpcmpub $123, 291(%rax,%r14,8), %ymm23, %k2
+// CHECK:  encoding: [0x62,0xb3,0x45,0x20,0x3e,0x94,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpub $123, 291(%rax,%r14,8), %ymm23, %k2
+
+// CHECK: vpcmpub $123, 4064(%rdx), %ymm23, %k2
+// CHECK:  encoding: [0x62,0xf3,0x45,0x20,0x3e,0x52,0x7f,0x7b]
+          vpcmpub $123, 4064(%rdx), %ymm23, %k2
+
+// CHECK: vpcmpub $123, 4096(%rdx), %ymm23, %k2
+// CHECK:  encoding: [0x62,0xf3,0x45,0x20,0x3e,0x92,0x00,0x10,0x00,0x00,0x7b]
+          vpcmpub $123, 4096(%rdx), %ymm23, %k2
+
+// CHECK: vpcmpub $123, -4096(%rdx), %ymm23, %k2
+// CHECK:  encoding: [0x62,0xf3,0x45,0x20,0x3e,0x52,0x80,0x7b]
+          vpcmpub $123, -4096(%rdx), %ymm23, %k2
+
+// CHECK: vpcmpub $123, -4128(%rdx), %ymm23, %k2
+// CHECK:  encoding: [0x62,0xf3,0x45,0x20,0x3e,0x92,0xe0,0xef,0xff,0xff,0x7b]
+          vpcmpub $123, -4128(%rdx), %ymm23, %k2
+
+// CHECK: vpcmpuw $171, %xmm17, %xmm28, %k5
+// CHECK:  encoding: [0x62,0xb3,0x9d,0x00,0x3e,0xe9,0xab]
+          vpcmpuw $171, %xmm17, %xmm28, %k5
+
+// CHECK: vpcmpuw $171, %xmm17, %xmm28, %k5 {%k4}
+// CHECK:  encoding: [0x62,0xb3,0x9d,0x04,0x3e,0xe9,0xab]
+          vpcmpuw $171, %xmm17, %xmm28, %k5 {%k4}
+
+// CHECK: vpcmpuw $123, %xmm17, %xmm28, %k5
+// CHECK:  encoding: [0x62,0xb3,0x9d,0x00,0x3e,0xe9,0x7b]
+          vpcmpuw $123, %xmm17, %xmm28, %k5
+
+// CHECK: vpcmpuw $123, (%rcx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x3e,0x29,0x7b]
+          vpcmpuw $123, (%rcx), %xmm28, %k5
+
+// CHECK: vpcmpuw $123, 291(%rax,%r14,8), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xb3,0x9d,0x00,0x3e,0xac,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpuw $123, 291(%rax,%r14,8), %xmm28, %k5
+
+// CHECK: vpcmpuw $123, 2032(%rdx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x3e,0x6a,0x7f,0x7b]
+          vpcmpuw $123, 2032(%rdx), %xmm28, %k5
+
+// CHECK: vpcmpuw $123, 2048(%rdx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x3e,0xaa,0x00,0x08,0x00,0x00,0x7b]
+          vpcmpuw $123, 2048(%rdx), %xmm28, %k5
+
+// CHECK: vpcmpuw $123, -2048(%rdx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x3e,0x6a,0x80,0x7b]
+          vpcmpuw $123, -2048(%rdx), %xmm28, %k5
+
+// CHECK: vpcmpuw $123, -2064(%rdx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x3e,0xaa,0xf0,0xf7,0xff,0xff,0x7b]
+          vpcmpuw $123, -2064(%rdx), %xmm28, %k5
+
+// CHECK: vpcmpuw $171, %ymm28, %ymm27, %k4
+// CHECK:  encoding: [0x62,0x93,0xa5,0x20,0x3e,0xe4,0xab]
+          vpcmpuw $171, %ymm28, %ymm27, %k4
+
+// CHECK: vpcmpuw $171, %ymm28, %ymm27, %k4 {%k2}
+// CHECK:  encoding: [0x62,0x93,0xa5,0x22,0x3e,0xe4,0xab]
+          vpcmpuw $171, %ymm28, %ymm27, %k4 {%k2}
+
+// CHECK: vpcmpuw $123, %ymm28, %ymm27, %k4
+// CHECK:  encoding: [0x62,0x93,0xa5,0x20,0x3e,0xe4,0x7b]
+          vpcmpuw $123, %ymm28, %ymm27, %k4
+
+// CHECK: vpcmpuw $123, (%rcx), %ymm27, %k4
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x20,0x3e,0x21,0x7b]
+          vpcmpuw $123, (%rcx), %ymm27, %k4
+
+// CHECK: vpcmpuw $123, 291(%rax,%r14,8), %ymm27, %k4
+// CHECK:  encoding: [0x62,0xb3,0xa5,0x20,0x3e,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpuw $123, 291(%rax,%r14,8), %ymm27, %k4
+
+// CHECK: vpcmpuw $123, 4064(%rdx), %ymm27, %k4
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x20,0x3e,0x62,0x7f,0x7b]
+          vpcmpuw $123, 4064(%rdx), %ymm27, %k4
+
+// CHECK: vpcmpuw $123, 4096(%rdx), %ymm27, %k4
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x20,0x3e,0xa2,0x00,0x10,0x00,0x00,0x7b]
+          vpcmpuw $123, 4096(%rdx), %ymm27, %k4
+
+// CHECK: vpcmpuw $123, -4096(%rdx), %ymm27, %k4
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x20,0x3e,0x62,0x80,0x7b]
+          vpcmpuw $123, -4096(%rdx), %ymm27, %k4
+
+// CHECK: vpcmpuw $123, -4128(%rdx), %ymm27, %k4
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x20,0x3e,0xa2,0xe0,0xef,0xff,0xff,0x7b]
+          vpcmpuw $123, -4128(%rdx), %ymm27, %k4
+
+// CHECK: vpmaxsb %xmm17, %xmm24, %xmm23
+// CHECK:  encoding: [0x62,0xa2,0x3d,0x00,0x3c,0xf9]
+          vpmaxsb %xmm17, %xmm24, %xmm23
+
+// CHECK: vpmaxsb %xmm17, %xmm24, %xmm23 {%k3}
+// CHECK:  encoding: [0x62,0xa2,0x3d,0x03,0x3c,0xf9]
+          vpmaxsb %xmm17, %xmm24, %xmm23 {%k3}
+
+// CHECK: vpmaxsb %xmm17, %xmm24, %xmm23 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa2,0x3d,0x83,0x3c,0xf9]
+          vpmaxsb %xmm17, %xmm24, %xmm23 {%k3} {z}
+
+// CHECK: vpmaxsb (%rcx), %xmm24, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0x3d,0x00,0x3c,0x39]
+          vpmaxsb (%rcx), %xmm24, %xmm23
+
+// CHECK: vpmaxsb 291(%rax,%r14,8), %xmm24, %xmm23
+// CHECK:  encoding: [0x62,0xa2,0x3d,0x00,0x3c,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsb 291(%rax,%r14,8), %xmm24, %xmm23
+
+// CHECK: vpmaxsb 2032(%rdx), %xmm24, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0x3d,0x00,0x3c,0x7a,0x7f]
+          vpmaxsb 2032(%rdx), %xmm24, %xmm23
+
+// CHECK: vpmaxsb 2048(%rdx), %xmm24, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0x3d,0x00,0x3c,0xba,0x00,0x08,0x00,0x00]
+          vpmaxsb 2048(%rdx), %xmm24, %xmm23
+
+// CHECK: vpmaxsb -2048(%rdx), %xmm24, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0x3d,0x00,0x3c,0x7a,0x80]
+          vpmaxsb -2048(%rdx), %xmm24, %xmm23
+
+// CHECK: vpmaxsb -2064(%rdx), %xmm24, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0x3d,0x00,0x3c,0xba,0xf0,0xf7,0xff,0xff]
+          vpmaxsb -2064(%rdx), %xmm24, %xmm23
+
+// CHECK: vpmaxsb %ymm27, %ymm22, %ymm30
+// CHECK:  encoding: [0x62,0x02,0x4d,0x20,0x3c,0xf3]
+          vpmaxsb %ymm27, %ymm22, %ymm30
+
+// CHECK: vpmaxsb %ymm27, %ymm22, %ymm30 {%k5}
+// CHECK:  encoding: [0x62,0x02,0x4d,0x25,0x3c,0xf3]
+          vpmaxsb %ymm27, %ymm22, %ymm30 {%k5}
+
+// CHECK: vpmaxsb %ymm27, %ymm22, %ymm30 {%k5} {z}
+// CHECK:  encoding: [0x62,0x02,0x4d,0xa5,0x3c,0xf3]
+          vpmaxsb %ymm27, %ymm22, %ymm30 {%k5} {z}
+
+// CHECK: vpmaxsb (%rcx), %ymm22, %ymm30
+// CHECK:  encoding: [0x62,0x62,0x4d,0x20,0x3c,0x31]
+          vpmaxsb (%rcx), %ymm22, %ymm30
+
+// CHECK: vpmaxsb 291(%rax,%r14,8), %ymm22, %ymm30
+// CHECK:  encoding: [0x62,0x22,0x4d,0x20,0x3c,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsb 291(%rax,%r14,8), %ymm22, %ymm30
+
+// CHECK: vpmaxsb 4064(%rdx), %ymm22, %ymm30
+// CHECK:  encoding: [0x62,0x62,0x4d,0x20,0x3c,0x72,0x7f]
+          vpmaxsb 4064(%rdx), %ymm22, %ymm30
+
+// CHECK: vpmaxsb 4096(%rdx), %ymm22, %ymm30
+// CHECK:  encoding: [0x62,0x62,0x4d,0x20,0x3c,0xb2,0x00,0x10,0x00,0x00]
+          vpmaxsb 4096(%rdx), %ymm22, %ymm30
+
+// CHECK: vpmaxsb -4096(%rdx), %ymm22, %ymm30
+// CHECK:  encoding: [0x62,0x62,0x4d,0x20,0x3c,0x72,0x80]
+          vpmaxsb -4096(%rdx), %ymm22, %ymm30
+
+// CHECK: vpmaxsb -4128(%rdx), %ymm22, %ymm30
+// CHECK:  encoding: [0x62,0x62,0x4d,0x20,0x3c,0xb2,0xe0,0xef,0xff,0xff]
+          vpmaxsb -4128(%rdx), %ymm22, %ymm30
+
+// CHECK: vpmaxsw %xmm28, %xmm24, %xmm18
+// CHECK:  encoding: [0x62,0x81,0x3d,0x00,0xee,0xd4]
+          vpmaxsw %xmm28, %xmm24, %xmm18
+
+// CHECK: vpmaxsw %xmm28, %xmm24, %xmm18 {%k3}
+// CHECK:  encoding: [0x62,0x81,0x3d,0x03,0xee,0xd4]
+          vpmaxsw %xmm28, %xmm24, %xmm18 {%k3}
+
+// CHECK: vpmaxsw %xmm28, %xmm24, %xmm18 {%k3} {z}
+// CHECK:  encoding: [0x62,0x81,0x3d,0x83,0xee,0xd4]
+          vpmaxsw %xmm28, %xmm24, %xmm18 {%k3} {z}
+
+// CHECK: vpmaxsw (%rcx), %xmm24, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x00,0xee,0x11]
+          vpmaxsw (%rcx), %xmm24, %xmm18
+
+// CHECK: vpmaxsw 291(%rax,%r14,8), %xmm24, %xmm18
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x00,0xee,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsw 291(%rax,%r14,8), %xmm24, %xmm18
+
+// CHECK: vpmaxsw 2032(%rdx), %xmm24, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x00,0xee,0x52,0x7f]
+          vpmaxsw 2032(%rdx), %xmm24, %xmm18
+
+// CHECK: vpmaxsw 2048(%rdx), %xmm24, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x00,0xee,0x92,0x00,0x08,0x00,0x00]
+          vpmaxsw 2048(%rdx), %xmm24, %xmm18
+
+// CHECK: vpmaxsw -2048(%rdx), %xmm24, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x00,0xee,0x52,0x80]
+          vpmaxsw -2048(%rdx), %xmm24, %xmm18
+
+// CHECK: vpmaxsw -2064(%rdx), %xmm24, %xmm18
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x00,0xee,0x92,0xf0,0xf7,0xff,0xff]
+          vpmaxsw -2064(%rdx), %xmm24, %xmm18
+
+// CHECK: vpmaxsw %ymm17, %ymm28, %ymm27
+// CHECK:  encoding: [0x62,0x21,0x1d,0x20,0xee,0xd9]
+          vpmaxsw %ymm17, %ymm28, %ymm27
+
+// CHECK: vpmaxsw %ymm17, %ymm28, %ymm27 {%k6}
+// CHECK:  encoding: [0x62,0x21,0x1d,0x26,0xee,0xd9]
+          vpmaxsw %ymm17, %ymm28, %ymm27 {%k6}
+
+// CHECK: vpmaxsw %ymm17, %ymm28, %ymm27 {%k6} {z}
+// CHECK:  encoding: [0x62,0x21,0x1d,0xa6,0xee,0xd9]
+          vpmaxsw %ymm17, %ymm28, %ymm27 {%k6} {z}
+
+// CHECK: vpmaxsw (%rcx), %ymm28, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x1d,0x20,0xee,0x19]
+          vpmaxsw (%rcx), %ymm28, %ymm27
+
+// CHECK: vpmaxsw 291(%rax,%r14,8), %ymm28, %ymm27
+// CHECK:  encoding: [0x62,0x21,0x1d,0x20,0xee,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsw 291(%rax,%r14,8), %ymm28, %ymm27
+
+// CHECK: vpmaxsw 4064(%rdx), %ymm28, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x1d,0x20,0xee,0x5a,0x7f]
+          vpmaxsw 4064(%rdx), %ymm28, %ymm27
+
+// CHECK: vpmaxsw 4096(%rdx), %ymm28, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x1d,0x20,0xee,0x9a,0x00,0x10,0x00,0x00]
+          vpmaxsw 4096(%rdx), %ymm28, %ymm27
+
+// CHECK: vpmaxsw -4096(%rdx), %ymm28, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x1d,0x20,0xee,0x5a,0x80]
+          vpmaxsw -4096(%rdx), %ymm28, %ymm27
+
+// CHECK: vpmaxsw -4128(%rdx), %ymm28, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x1d,0x20,0xee,0x9a,0xe0,0xef,0xff,0xff]
+          vpmaxsw -4128(%rdx), %ymm28, %ymm27
+
+// CHECK: vpmaxub %xmm23, %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0x75,0x00,0xde,0xcf]
+          vpmaxub %xmm23, %xmm17, %xmm17
+
+// CHECK: vpmaxub %xmm23, %xmm17, %xmm17 {%k5}
+// CHECK:  encoding: [0x62,0xa1,0x75,0x05,0xde,0xcf]
+          vpmaxub %xmm23, %xmm17, %xmm17 {%k5}
+
+// CHECK: vpmaxub %xmm23, %xmm17, %xmm17 {%k5} {z}
+// CHECK:  encoding: [0x62,0xa1,0x75,0x85,0xde,0xcf]
+          vpmaxub %xmm23, %xmm17, %xmm17 {%k5} {z}
+
+// CHECK: vpmaxub (%rcx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xde,0x09]
+          vpmaxub (%rcx), %xmm17, %xmm17
+
+// CHECK: vpmaxub 291(%rax,%r14,8), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0x75,0x00,0xde,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxub 291(%rax,%r14,8), %xmm17, %xmm17
+
+// CHECK: vpmaxub 2032(%rdx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xde,0x4a,0x7f]
+          vpmaxub 2032(%rdx), %xmm17, %xmm17
+
+// CHECK: vpmaxub 2048(%rdx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xde,0x8a,0x00,0x08,0x00,0x00]
+          vpmaxub 2048(%rdx), %xmm17, %xmm17
+
+// CHECK: vpmaxub -2048(%rdx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xde,0x4a,0x80]
+          vpmaxub -2048(%rdx), %xmm17, %xmm17
+
+// CHECK: vpmaxub -2064(%rdx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xde,0x8a,0xf0,0xf7,0xff,0xff]
+          vpmaxub -2064(%rdx), %xmm17, %xmm17
+
+// CHECK: vpmaxub %ymm24, %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x01,0x2d,0x20,0xde,0xe0]
+          vpmaxub %ymm24, %ymm26, %ymm28
+
+// CHECK: vpmaxub %ymm24, %ymm26, %ymm28 {%k6}
+// CHECK:  encoding: [0x62,0x01,0x2d,0x26,0xde,0xe0]
+          vpmaxub %ymm24, %ymm26, %ymm28 {%k6}
+
+// CHECK: vpmaxub %ymm24, %ymm26, %ymm28 {%k6} {z}
+// CHECK:  encoding: [0x62,0x01,0x2d,0xa6,0xde,0xe0]
+          vpmaxub %ymm24, %ymm26, %ymm28 {%k6} {z}
+
+// CHECK: vpmaxub (%rcx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xde,0x21]
+          vpmaxub (%rcx), %ymm26, %ymm28
+
+// CHECK: vpmaxub 291(%rax,%r14,8), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x21,0x2d,0x20,0xde,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxub 291(%rax,%r14,8), %ymm26, %ymm28
+
+// CHECK: vpmaxub 4064(%rdx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xde,0x62,0x7f]
+          vpmaxub 4064(%rdx), %ymm26, %ymm28
+
+// CHECK: vpmaxub 4096(%rdx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xde,0xa2,0x00,0x10,0x00,0x00]
+          vpmaxub 4096(%rdx), %ymm26, %ymm28
+
+// CHECK: vpmaxub -4096(%rdx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xde,0x62,0x80]
+          vpmaxub -4096(%rdx), %ymm26, %ymm28
+
+// CHECK: vpmaxub -4128(%rdx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xde,0xa2,0xe0,0xef,0xff,0xff]
+          vpmaxub -4128(%rdx), %ymm26, %ymm28
+
+// CHECK: vpmaxuw %xmm20, %xmm18, %xmm24
+// CHECK:  encoding: [0x62,0x22,0x6d,0x00,0x3e,0xc4]
+          vpmaxuw %xmm20, %xmm18, %xmm24
+
+// CHECK: vpmaxuw %xmm20, %xmm18, %xmm24 {%k7}
+// CHECK:  encoding: [0x62,0x22,0x6d,0x07,0x3e,0xc4]
+          vpmaxuw %xmm20, %xmm18, %xmm24 {%k7}
+
+// CHECK: vpmaxuw %xmm20, %xmm18, %xmm24 {%k7} {z}
+// CHECK:  encoding: [0x62,0x22,0x6d,0x87,0x3e,0xc4]
+          vpmaxuw %xmm20, %xmm18, %xmm24 {%k7} {z}
+
+// CHECK: vpmaxuw (%rcx), %xmm18, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x6d,0x00,0x3e,0x01]
+          vpmaxuw (%rcx), %xmm18, %xmm24
+
+// CHECK: vpmaxuw 291(%rax,%r14,8), %xmm18, %xmm24
+// CHECK:  encoding: [0x62,0x22,0x6d,0x00,0x3e,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxuw 291(%rax,%r14,8), %xmm18, %xmm24
+
+// CHECK: vpmaxuw 2032(%rdx), %xmm18, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x6d,0x00,0x3e,0x42,0x7f]
+          vpmaxuw 2032(%rdx), %xmm18, %xmm24
+
+// CHECK: vpmaxuw 2048(%rdx), %xmm18, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x6d,0x00,0x3e,0x82,0x00,0x08,0x00,0x00]
+          vpmaxuw 2048(%rdx), %xmm18, %xmm24
+
+// CHECK: vpmaxuw -2048(%rdx), %xmm18, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x6d,0x00,0x3e,0x42,0x80]
+          vpmaxuw -2048(%rdx), %xmm18, %xmm24
+
+// CHECK: vpmaxuw -2064(%rdx), %xmm18, %xmm24
+// CHECK:  encoding: [0x62,0x62,0x6d,0x00,0x3e,0x82,0xf0,0xf7,0xff,0xff]
+          vpmaxuw -2064(%rdx), %xmm18, %xmm24
+
+// CHECK: vpmaxuw %ymm19, %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xa2,0x4d,0x20,0x3e,0xdb]
+          vpmaxuw %ymm19, %ymm22, %ymm19
+
+// CHECK: vpmaxuw %ymm19, %ymm22, %ymm19 {%k7}
+// CHECK:  encoding: [0x62,0xa2,0x4d,0x27,0x3e,0xdb]
+          vpmaxuw %ymm19, %ymm22, %ymm19 {%k7}
+
+// CHECK: vpmaxuw %ymm19, %ymm22, %ymm19 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa2,0x4d,0xa7,0x3e,0xdb]
+          vpmaxuw %ymm19, %ymm22, %ymm19 {%k7} {z}
+
+// CHECK: vpmaxuw (%rcx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3e,0x19]
+          vpmaxuw (%rcx), %ymm22, %ymm19
+
+// CHECK: vpmaxuw 291(%rax,%r14,8), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xa2,0x4d,0x20,0x3e,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxuw 291(%rax,%r14,8), %ymm22, %ymm19
+
+// CHECK: vpmaxuw 4064(%rdx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3e,0x5a,0x7f]
+          vpmaxuw 4064(%rdx), %ymm22, %ymm19
+
+// CHECK: vpmaxuw 4096(%rdx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3e,0x9a,0x00,0x10,0x00,0x00]
+          vpmaxuw 4096(%rdx), %ymm22, %ymm19
+
+// CHECK: vpmaxuw -4096(%rdx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3e,0x5a,0x80]
+          vpmaxuw -4096(%rdx), %ymm22, %ymm19
+
+// CHECK: vpmaxuw -4128(%rdx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3e,0x9a,0xe0,0xef,0xff,0xff]
+          vpmaxuw -4128(%rdx), %ymm22, %ymm19
+
+// CHECK: vpminsb %xmm27, %xmm28, %xmm25
+// CHECK:  encoding: [0x62,0x02,0x1d,0x00,0x38,0xcb]
+          vpminsb %xmm27, %xmm28, %xmm25
+
+// CHECK: vpminsb %xmm27, %xmm28, %xmm25 {%k1}
+// CHECK:  encoding: [0x62,0x02,0x1d,0x01,0x38,0xcb]
+          vpminsb %xmm27, %xmm28, %xmm25 {%k1}
+
+// CHECK: vpminsb %xmm27, %xmm28, %xmm25 {%k1} {z}
+// CHECK:  encoding: [0x62,0x02,0x1d,0x81,0x38,0xcb]
+          vpminsb %xmm27, %xmm28, %xmm25 {%k1} {z}
+
+// CHECK: vpminsb (%rcx), %xmm28, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x1d,0x00,0x38,0x09]
+          vpminsb (%rcx), %xmm28, %xmm25
+
+// CHECK: vpminsb 291(%rax,%r14,8), %xmm28, %xmm25
+// CHECK:  encoding: [0x62,0x22,0x1d,0x00,0x38,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpminsb 291(%rax,%r14,8), %xmm28, %xmm25
+
+// CHECK: vpminsb 2032(%rdx), %xmm28, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x1d,0x00,0x38,0x4a,0x7f]
+          vpminsb 2032(%rdx), %xmm28, %xmm25
+
+// CHECK: vpminsb 2048(%rdx), %xmm28, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x1d,0x00,0x38,0x8a,0x00,0x08,0x00,0x00]
+          vpminsb 2048(%rdx), %xmm28, %xmm25
+
+// CHECK: vpminsb -2048(%rdx), %xmm28, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x1d,0x00,0x38,0x4a,0x80]
+          vpminsb -2048(%rdx), %xmm28, %xmm25
+
+// CHECK: vpminsb -2064(%rdx), %xmm28, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x1d,0x00,0x38,0x8a,0xf0,0xf7,0xff,0xff]
+          vpminsb -2064(%rdx), %xmm28, %xmm25
+
+// CHECK: vpminsb %ymm27, %ymm20, %ymm27
+// CHECK:  encoding: [0x62,0x02,0x5d,0x20,0x38,0xdb]
+          vpminsb %ymm27, %ymm20, %ymm27
+
+// CHECK: vpminsb %ymm27, %ymm20, %ymm27 {%k6}
+// CHECK:  encoding: [0x62,0x02,0x5d,0x26,0x38,0xdb]
+          vpminsb %ymm27, %ymm20, %ymm27 {%k6}
+
+// CHECK: vpminsb %ymm27, %ymm20, %ymm27 {%k6} {z}
+// CHECK:  encoding: [0x62,0x02,0x5d,0xa6,0x38,0xdb]
+          vpminsb %ymm27, %ymm20, %ymm27 {%k6} {z}
+
+// CHECK: vpminsb (%rcx), %ymm20, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x5d,0x20,0x38,0x19]
+          vpminsb (%rcx), %ymm20, %ymm27
+
+// CHECK: vpminsb 291(%rax,%r14,8), %ymm20, %ymm27
+// CHECK:  encoding: [0x62,0x22,0x5d,0x20,0x38,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpminsb 291(%rax,%r14,8), %ymm20, %ymm27
+
+// CHECK: vpminsb 4064(%rdx), %ymm20, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x5d,0x20,0x38,0x5a,0x7f]
+          vpminsb 4064(%rdx), %ymm20, %ymm27
+
+// CHECK: vpminsb 4096(%rdx), %ymm20, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x5d,0x20,0x38,0x9a,0x00,0x10,0x00,0x00]
+          vpminsb 4096(%rdx), %ymm20, %ymm27
+
+// CHECK: vpminsb -4096(%rdx), %ymm20, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x5d,0x20,0x38,0x5a,0x80]
+          vpminsb -4096(%rdx), %ymm20, %ymm27
+
+// CHECK: vpminsb -4128(%rdx), %ymm20, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x5d,0x20,0x38,0x9a,0xe0,0xef,0xff,0xff]
+          vpminsb -4128(%rdx), %ymm20, %ymm27
+
+// CHECK: vpminsw %xmm19, %xmm26, %xmm27
+// CHECK:  encoding: [0x62,0x21,0x2d,0x00,0xea,0xdb]
+          vpminsw %xmm19, %xmm26, %xmm27
+
+// CHECK: vpminsw %xmm19, %xmm26, %xmm27 {%k2}
+// CHECK:  encoding: [0x62,0x21,0x2d,0x02,0xea,0xdb]
+          vpminsw %xmm19, %xmm26, %xmm27 {%k2}
+
+// CHECK: vpminsw %xmm19, %xmm26, %xmm27 {%k2} {z}
+// CHECK:  encoding: [0x62,0x21,0x2d,0x82,0xea,0xdb]
+          vpminsw %xmm19, %xmm26, %xmm27 {%k2} {z}
+
+// CHECK: vpminsw (%rcx), %xmm26, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xea,0x19]
+          vpminsw (%rcx), %xmm26, %xmm27
+
+// CHECK: vpminsw 291(%rax,%r14,8), %xmm26, %xmm27
+// CHECK:  encoding: [0x62,0x21,0x2d,0x00,0xea,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpminsw 291(%rax,%r14,8), %xmm26, %xmm27
+
+// CHECK: vpminsw 2032(%rdx), %xmm26, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xea,0x5a,0x7f]
+          vpminsw 2032(%rdx), %xmm26, %xmm27
+
+// CHECK: vpminsw 2048(%rdx), %xmm26, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xea,0x9a,0x00,0x08,0x00,0x00]
+          vpminsw 2048(%rdx), %xmm26, %xmm27
+
+// CHECK: vpminsw -2048(%rdx), %xmm26, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xea,0x5a,0x80]
+          vpminsw -2048(%rdx), %xmm26, %xmm27
+
+// CHECK: vpminsw -2064(%rdx), %xmm26, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xea,0x9a,0xf0,0xf7,0xff,0xff]
+          vpminsw -2064(%rdx), %xmm26, %xmm27
+
+// CHECK: vpminsw %ymm27, %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0x81,0x45,0x20,0xea,0xeb]
+          vpminsw %ymm27, %ymm23, %ymm21
+
+// CHECK: vpminsw %ymm27, %ymm23, %ymm21 {%k2}
+// CHECK:  encoding: [0x62,0x81,0x45,0x22,0xea,0xeb]
+          vpminsw %ymm27, %ymm23, %ymm21 {%k2}
+
+// CHECK: vpminsw %ymm27, %ymm23, %ymm21 {%k2} {z}
+// CHECK:  encoding: [0x62,0x81,0x45,0xa2,0xea,0xeb]
+          vpminsw %ymm27, %ymm23, %ymm21 {%k2} {z}
+
+// CHECK: vpminsw (%rcx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x45,0x20,0xea,0x29]
+          vpminsw (%rcx), %ymm23, %ymm21
+
+// CHECK: vpminsw 291(%rax,%r14,8), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xa1,0x45,0x20,0xea,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpminsw 291(%rax,%r14,8), %ymm23, %ymm21
+
+// CHECK: vpminsw 4064(%rdx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x45,0x20,0xea,0x6a,0x7f]
+          vpminsw 4064(%rdx), %ymm23, %ymm21
+
+// CHECK: vpminsw 4096(%rdx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x45,0x20,0xea,0xaa,0x00,0x10,0x00,0x00]
+          vpminsw 4096(%rdx), %ymm23, %ymm21
+
+// CHECK: vpminsw -4096(%rdx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x45,0x20,0xea,0x6a,0x80]
+          vpminsw -4096(%rdx), %ymm23, %ymm21
+
+// CHECK: vpminsw -4128(%rdx), %ymm23, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x45,0x20,0xea,0xaa,0xe0,0xef,0xff,0xff]
+          vpminsw -4128(%rdx), %ymm23, %ymm21
+
+// CHECK: vpminub %xmm27, %xmm27, %xmm22
+// CHECK:  encoding: [0x62,0x81,0x25,0x00,0xda,0xf3]
+          vpminub %xmm27, %xmm27, %xmm22
+
+// CHECK: vpminub %xmm27, %xmm27, %xmm22 {%k5}
+// CHECK:  encoding: [0x62,0x81,0x25,0x05,0xda,0xf3]
+          vpminub %xmm27, %xmm27, %xmm22 {%k5}
+
+// CHECK: vpminub %xmm27, %xmm27, %xmm22 {%k5} {z}
+// CHECK:  encoding: [0x62,0x81,0x25,0x85,0xda,0xf3]
+          vpminub %xmm27, %xmm27, %xmm22 {%k5} {z}
+
+// CHECK: vpminub (%rcx), %xmm27, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x25,0x00,0xda,0x31]
+          vpminub (%rcx), %xmm27, %xmm22
+
+// CHECK: vpminub 291(%rax,%r14,8), %xmm27, %xmm22
+// CHECK:  encoding: [0x62,0xa1,0x25,0x00,0xda,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vpminub 291(%rax,%r14,8), %xmm27, %xmm22
+
+// CHECK: vpminub 2032(%rdx), %xmm27, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x25,0x00,0xda,0x72,0x7f]
+          vpminub 2032(%rdx), %xmm27, %xmm22
+
+// CHECK: vpminub 2048(%rdx), %xmm27, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x25,0x00,0xda,0xb2,0x00,0x08,0x00,0x00]
+          vpminub 2048(%rdx), %xmm27, %xmm22
+
+// CHECK: vpminub -2048(%rdx), %xmm27, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x25,0x00,0xda,0x72,0x80]
+          vpminub -2048(%rdx), %xmm27, %xmm22
+
+// CHECK: vpminub -2064(%rdx), %xmm27, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x25,0x00,0xda,0xb2,0xf0,0xf7,0xff,0xff]
+          vpminub -2064(%rdx), %xmm27, %xmm22
+
+// CHECK: vpminub %ymm27, %ymm25, %ymm27
+// CHECK:  encoding: [0x62,0x01,0x35,0x20,0xda,0xdb]
+          vpminub %ymm27, %ymm25, %ymm27
+
+// CHECK: vpminub %ymm27, %ymm25, %ymm27 {%k3}
+// CHECK:  encoding: [0x62,0x01,0x35,0x23,0xda,0xdb]
+          vpminub %ymm27, %ymm25, %ymm27 {%k3}
+
+// CHECK: vpminub %ymm27, %ymm25, %ymm27 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0x35,0xa3,0xda,0xdb]
+          vpminub %ymm27, %ymm25, %ymm27 {%k3} {z}
+
+// CHECK: vpminub (%rcx), %ymm25, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x35,0x20,0xda,0x19]
+          vpminub (%rcx), %ymm25, %ymm27
+
+// CHECK: vpminub 291(%rax,%r14,8), %ymm25, %ymm27
+// CHECK:  encoding: [0x62,0x21,0x35,0x20,0xda,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpminub 291(%rax,%r14,8), %ymm25, %ymm27
+
+// CHECK: vpminub 4064(%rdx), %ymm25, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x35,0x20,0xda,0x5a,0x7f]
+          vpminub 4064(%rdx), %ymm25, %ymm27
+
+// CHECK: vpminub 4096(%rdx), %ymm25, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x35,0x20,0xda,0x9a,0x00,0x10,0x00,0x00]
+          vpminub 4096(%rdx), %ymm25, %ymm27
+
+// CHECK: vpminub -4096(%rdx), %ymm25, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x35,0x20,0xda,0x5a,0x80]
+          vpminub -4096(%rdx), %ymm25, %ymm27
+
+// CHECK: vpminub -4128(%rdx), %ymm25, %ymm27
+// CHECK:  encoding: [0x62,0x61,0x35,0x20,0xda,0x9a,0xe0,0xef,0xff,0xff]
+          vpminub -4128(%rdx), %ymm25, %ymm27
+
+// CHECK: vpminuw %xmm23, %xmm17, %xmm28
+// CHECK:  encoding: [0x62,0x22,0x75,0x00,0x3a,0xe7]
+          vpminuw %xmm23, %xmm17, %xmm28
+
+// CHECK: vpminuw %xmm23, %xmm17, %xmm28 {%k2}
+// CHECK:  encoding: [0x62,0x22,0x75,0x02,0x3a,0xe7]
+          vpminuw %xmm23, %xmm17, %xmm28 {%k2}
+
+// CHECK: vpminuw %xmm23, %xmm17, %xmm28 {%k2} {z}
+// CHECK:  encoding: [0x62,0x22,0x75,0x82,0x3a,0xe7]
+          vpminuw %xmm23, %xmm17, %xmm28 {%k2} {z}
+
+// CHECK: vpminuw (%rcx), %xmm17, %xmm28
+// CHECK:  encoding: [0x62,0x62,0x75,0x00,0x3a,0x21]
+          vpminuw (%rcx), %xmm17, %xmm28
+
+// CHECK: vpminuw 291(%rax,%r14,8), %xmm17, %xmm28
+// CHECK:  encoding: [0x62,0x22,0x75,0x00,0x3a,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpminuw 291(%rax,%r14,8), %xmm17, %xmm28
+
+// CHECK: vpminuw 2032(%rdx), %xmm17, %xmm28
+// CHECK:  encoding: [0x62,0x62,0x75,0x00,0x3a,0x62,0x7f]
+          vpminuw 2032(%rdx), %xmm17, %xmm28
+
+// CHECK: vpminuw 2048(%rdx), %xmm17, %xmm28
+// CHECK:  encoding: [0x62,0x62,0x75,0x00,0x3a,0xa2,0x00,0x08,0x00,0x00]
+          vpminuw 2048(%rdx), %xmm17, %xmm28
+
+// CHECK: vpminuw -2048(%rdx), %xmm17, %xmm28
+// CHECK:  encoding: [0x62,0x62,0x75,0x00,0x3a,0x62,0x80]
+          vpminuw -2048(%rdx), %xmm17, %xmm28
+
+// CHECK: vpminuw -2064(%rdx), %xmm17, %xmm28
+// CHECK:  encoding: [0x62,0x62,0x75,0x00,0x3a,0xa2,0xf0,0xf7,0xff,0xff]
+          vpminuw -2064(%rdx), %xmm17, %xmm28
+
+// CHECK: vpminuw %ymm18, %ymm27, %ymm20
+// CHECK:  encoding: [0x62,0xa2,0x25,0x20,0x3a,0xe2]
+          vpminuw %ymm18, %ymm27, %ymm20
+
+// CHECK: vpminuw %ymm18, %ymm27, %ymm20 {%k7}
+// CHECK:  encoding: [0x62,0xa2,0x25,0x27,0x3a,0xe2]
+          vpminuw %ymm18, %ymm27, %ymm20 {%k7}
+
+// CHECK: vpminuw %ymm18, %ymm27, %ymm20 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa2,0x25,0xa7,0x3a,0xe2]
+          vpminuw %ymm18, %ymm27, %ymm20 {%k7} {z}
+
+// CHECK: vpminuw (%rcx), %ymm27, %ymm20
+// CHECK:  encoding: [0x62,0xe2,0x25,0x20,0x3a,0x21]
+          vpminuw (%rcx), %ymm27, %ymm20
+
+// CHECK: vpminuw 291(%rax,%r14,8), %ymm27, %ymm20
+// CHECK:  encoding: [0x62,0xa2,0x25,0x20,0x3a,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpminuw 291(%rax,%r14,8), %ymm27, %ymm20
+
+// CHECK: vpminuw 4064(%rdx), %ymm27, %ymm20
+// CHECK:  encoding: [0x62,0xe2,0x25,0x20,0x3a,0x62,0x7f]
+          vpminuw 4064(%rdx), %ymm27, %ymm20
+
+// CHECK: vpminuw 4096(%rdx), %ymm27, %ymm20
+// CHECK:  encoding: [0x62,0xe2,0x25,0x20,0x3a,0xa2,0x00,0x10,0x00,0x00]
+          vpminuw 4096(%rdx), %ymm27, %ymm20
+
+// CHECK: vpminuw -4096(%rdx), %ymm27, %ymm20
+// CHECK:  encoding: [0x62,0xe2,0x25,0x20,0x3a,0x62,0x80]
+          vpminuw -4096(%rdx), %ymm27, %ymm20
+
+// CHECK: vpminuw -4128(%rdx), %ymm27, %ymm20
+// CHECK:  encoding: [0x62,0xe2,0x25,0x20,0x3a,0xa2,0xe0,0xef,0xff,0xff]
+          vpminuw -4128(%rdx), %ymm27, %ymm20
+
+// CHECK: vpmullw %xmm26, %xmm19, %xmm29
+// CHECK:  encoding: [0x62,0x01,0x65,0x00,0xd5,0xea]
+          vpmullw %xmm26, %xmm19, %xmm29
+
+// CHECK: vpmullw %xmm26, %xmm19, %xmm29 {%k7}
+// CHECK:  encoding: [0x62,0x01,0x65,0x07,0xd5,0xea]
+          vpmullw %xmm26, %xmm19, %xmm29 {%k7}
+
+// CHECK: vpmullw %xmm26, %xmm19, %xmm29 {%k7} {z}
+// CHECK:  encoding: [0x62,0x01,0x65,0x87,0xd5,0xea]
+          vpmullw %xmm26, %xmm19, %xmm29 {%k7} {z}
+
+// CHECK: vpmullw (%rcx), %xmm19, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x65,0x00,0xd5,0x29]
+          vpmullw (%rcx), %xmm19, %xmm29
+
+// CHECK: vpmullw 291(%rax,%r14,8), %xmm19, %xmm29
+// CHECK:  encoding: [0x62,0x21,0x65,0x00,0xd5,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpmullw 291(%rax,%r14,8), %xmm19, %xmm29
+
+// CHECK: vpmullw 2032(%rdx), %xmm19, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x65,0x00,0xd5,0x6a,0x7f]
+          vpmullw 2032(%rdx), %xmm19, %xmm29
+
+// CHECK: vpmullw 2048(%rdx), %xmm19, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x65,0x00,0xd5,0xaa,0x00,0x08,0x00,0x00]
+          vpmullw 2048(%rdx), %xmm19, %xmm29
+
+// CHECK: vpmullw -2048(%rdx), %xmm19, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x65,0x00,0xd5,0x6a,0x80]
+          vpmullw -2048(%rdx), %xmm19, %xmm29
+
+// CHECK: vpmullw -2064(%rdx), %xmm19, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x65,0x00,0xd5,0xaa,0xf0,0xf7,0xff,0xff]
+          vpmullw -2064(%rdx), %xmm19, %xmm29
+
+// CHECK: vpmullw %ymm20, %ymm24, %ymm17
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x20,0xd5,0xcc]
+          vpmullw %ymm20, %ymm24, %ymm17
+
+// CHECK: vpmullw %ymm20, %ymm24, %ymm17 {%k5}
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x25,0xd5,0xcc]
+          vpmullw %ymm20, %ymm24, %ymm17 {%k5}
+
+// CHECK: vpmullw %ymm20, %ymm24, %ymm17 {%k5} {z}
+// CHECK:  encoding: [0x62,0xa1,0x3d,0xa5,0xd5,0xcc]
+          vpmullw %ymm20, %ymm24, %ymm17 {%k5} {z}
+
+// CHECK: vpmullw (%rcx), %ymm24, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x20,0xd5,0x09]
+          vpmullw (%rcx), %ymm24, %ymm17
+
+// CHECK: vpmullw 291(%rax,%r14,8), %ymm24, %ymm17
+// CHECK:  encoding: [0x62,0xa1,0x3d,0x20,0xd5,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpmullw 291(%rax,%r14,8), %ymm24, %ymm17
+
+// CHECK: vpmullw 4064(%rdx), %ymm24, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x20,0xd5,0x4a,0x7f]
+          vpmullw 4064(%rdx), %ymm24, %ymm17
+
+// CHECK: vpmullw 4096(%rdx), %ymm24, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x20,0xd5,0x8a,0x00,0x10,0x00,0x00]
+          vpmullw 4096(%rdx), %ymm24, %ymm17
+
+// CHECK: vpmullw -4096(%rdx), %ymm24, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x20,0xd5,0x4a,0x80]
+          vpmullw -4096(%rdx), %ymm24, %ymm17
+
+// CHECK: vpmullw -4128(%rdx), %ymm24, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x3d,0x20,0xd5,0x8a,0xe0,0xef,0xff,0xff]
+          vpmullw -4128(%rdx), %ymm24, %ymm17
+
+// CHECK: vpsubb %xmm28, %xmm29, %xmm27
+// CHECK:  encoding: [0x62,0x01,0x15,0x00,0xf8,0xdc]
+          vpsubb %xmm28, %xmm29, %xmm27
+
+// CHECK: vpsubb %xmm28, %xmm29, %xmm27 {%k3}
+// CHECK:  encoding: [0x62,0x01,0x15,0x03,0xf8,0xdc]
+          vpsubb %xmm28, %xmm29, %xmm27 {%k3}
+
+// CHECK: vpsubb %xmm28, %xmm29, %xmm27 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0x15,0x83,0xf8,0xdc]
+          vpsubb %xmm28, %xmm29, %xmm27 {%k3} {z}
+
+// CHECK: vpsubb (%rcx), %xmm29, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xf8,0x19]
+          vpsubb (%rcx), %xmm29, %xmm27
+
+// CHECK: vpsubb 291(%rax,%r14,8), %xmm29, %xmm27
+// CHECK:  encoding: [0x62,0x21,0x15,0x00,0xf8,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpsubb 291(%rax,%r14,8), %xmm29, %xmm27
+
+// CHECK: vpsubb 2032(%rdx), %xmm29, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xf8,0x5a,0x7f]
+          vpsubb 2032(%rdx), %xmm29, %xmm27
+
+// CHECK: vpsubb 2048(%rdx), %xmm29, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xf8,0x9a,0x00,0x08,0x00,0x00]
+          vpsubb 2048(%rdx), %xmm29, %xmm27
+
+// CHECK: vpsubb -2048(%rdx), %xmm29, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xf8,0x5a,0x80]
+          vpsubb -2048(%rdx), %xmm29, %xmm27
+
+// CHECK: vpsubb -2064(%rdx), %xmm29, %xmm27
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xf8,0x9a,0xf0,0xf7,0xff,0xff]
+          vpsubb -2064(%rdx), %xmm29, %xmm27
+
+// CHECK: vpsubb %ymm28, %ymm20, %ymm20
+// CHECK:  encoding: [0x62,0x81,0x5d,0x20,0xf8,0xe4]
+          vpsubb %ymm28, %ymm20, %ymm20
+
+// CHECK: vpsubb %ymm28, %ymm20, %ymm20 {%k2}
+// CHECK:  encoding: [0x62,0x81,0x5d,0x22,0xf8,0xe4]
+          vpsubb %ymm28, %ymm20, %ymm20 {%k2}
+
+// CHECK: vpsubb %ymm28, %ymm20, %ymm20 {%k2} {z}
+// CHECK:  encoding: [0x62,0x81,0x5d,0xa2,0xf8,0xe4]
+          vpsubb %ymm28, %ymm20, %ymm20 {%k2} {z}
+
+// CHECK: vpsubb (%rcx), %ymm20, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0x5d,0x20,0xf8,0x21]
+          vpsubb (%rcx), %ymm20, %ymm20
+
+// CHECK: vpsubb 291(%rax,%r14,8), %ymm20, %ymm20
+// CHECK:  encoding: [0x62,0xa1,0x5d,0x20,0xf8,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpsubb 291(%rax,%r14,8), %ymm20, %ymm20
+
+// CHECK: vpsubb 4064(%rdx), %ymm20, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0x5d,0x20,0xf8,0x62,0x7f]
+          vpsubb 4064(%rdx), %ymm20, %ymm20
+
+// CHECK: vpsubb 4096(%rdx), %ymm20, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0x5d,0x20,0xf8,0xa2,0x00,0x10,0x00,0x00]
+          vpsubb 4096(%rdx), %ymm20, %ymm20
+
+// CHECK: vpsubb -4096(%rdx), %ymm20, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0x5d,0x20,0xf8,0x62,0x80]
+          vpsubb -4096(%rdx), %ymm20, %ymm20
+
+// CHECK: vpsubb -4128(%rdx), %ymm20, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0x5d,0x20,0xf8,0xa2,0xe0,0xef,0xff,0xff]
+          vpsubb -4128(%rdx), %ymm20, %ymm20
+
+// CHECK: vpsubw %xmm18, %xmm22, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0x4d,0x00,0xf9,0xca]
+          vpsubw %xmm18, %xmm22, %xmm17
+
+// CHECK: vpsubw %xmm18, %xmm22, %xmm17 {%k6}
+// CHECK:  encoding: [0x62,0xa1,0x4d,0x06,0xf9,0xca]
+          vpsubw %xmm18, %xmm22, %xmm17 {%k6}
+
+// CHECK: vpsubw %xmm18, %xmm22, %xmm17 {%k6} {z}
+// CHECK:  encoding: [0x62,0xa1,0x4d,0x86,0xf9,0xca]
+          vpsubw %xmm18, %xmm22, %xmm17 {%k6} {z}
+
+// CHECK: vpsubw (%rcx), %xmm22, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x4d,0x00,0xf9,0x09]
+          vpsubw (%rcx), %xmm22, %xmm17
+
+// CHECK: vpsubw 291(%rax,%r14,8), %xmm22, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0x4d,0x00,0xf9,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpsubw 291(%rax,%r14,8), %xmm22, %xmm17
+
+// CHECK: vpsubw 2032(%rdx), %xmm22, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x4d,0x00,0xf9,0x4a,0x7f]
+          vpsubw 2032(%rdx), %xmm22, %xmm17
+
+// CHECK: vpsubw 2048(%rdx), %xmm22, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x4d,0x00,0xf9,0x8a,0x00,0x08,0x00,0x00]
+          vpsubw 2048(%rdx), %xmm22, %xmm17
+
+// CHECK: vpsubw -2048(%rdx), %xmm22, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x4d,0x00,0xf9,0x4a,0x80]
+          vpsubw -2048(%rdx), %xmm22, %xmm17
+
+// CHECK: vpsubw -2064(%rdx), %xmm22, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x4d,0x00,0xf9,0x8a,0xf0,0xf7,0xff,0xff]
+          vpsubw -2064(%rdx), %xmm22, %xmm17
+
+// CHECK: vpsubw %ymm24, %ymm29, %ymm17
+// CHECK:  encoding: [0x62,0x81,0x15,0x20,0xf9,0xc8]
+          vpsubw %ymm24, %ymm29, %ymm17
+
+// CHECK: vpsubw %ymm24, %ymm29, %ymm17 {%k2}
+// CHECK:  encoding: [0x62,0x81,0x15,0x22,0xf9,0xc8]
+          vpsubw %ymm24, %ymm29, %ymm17 {%k2}
+
+// CHECK: vpsubw %ymm24, %ymm29, %ymm17 {%k2} {z}
+// CHECK:  encoding: [0x62,0x81,0x15,0xa2,0xf9,0xc8]
+          vpsubw %ymm24, %ymm29, %ymm17 {%k2} {z}
+
+// CHECK: vpsubw (%rcx), %ymm29, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xf9,0x09]
+          vpsubw (%rcx), %ymm29, %ymm17
+
+// CHECK: vpsubw 291(%rax,%r14,8), %ymm29, %ymm17
+// CHECK:  encoding: [0x62,0xa1,0x15,0x20,0xf9,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpsubw 291(%rax,%r14,8), %ymm29, %ymm17
+
+// CHECK: vpsubw 4064(%rdx), %ymm29, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xf9,0x4a,0x7f]
+          vpsubw 4064(%rdx), %ymm29, %ymm17
+
+// CHECK: vpsubw 4096(%rdx), %ymm29, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xf9,0x8a,0x00,0x10,0x00,0x00]
+          vpsubw 4096(%rdx), %ymm29, %ymm17
+
+// CHECK: vpsubw -4096(%rdx), %ymm29, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xf9,0x4a,0x80]
+          vpsubw -4096(%rdx), %ymm29, %ymm17
+
+// CHECK: vpsubw -4128(%rdx), %ymm29, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xf9,0x8a,0xe0,0xef,0xff,0xff]
+          vpsubw -4128(%rdx), %ymm29, %ymm17
+
+// CHECK: vmovdqu8 %xmm23, %xmm26
+// CHECK:  encoding: [0x62,0x21,0x7f,0x08,0x6f,0xd7]
+          vmovdqu8 %xmm23, %xmm26
+
+// CHECK: vmovdqu8 %xmm23, %xmm26 {%k2}
+// CHECK:  encoding: [0x62,0x21,0x7f,0x0a,0x6f,0xd7]
+          vmovdqu8 %xmm23, %xmm26 {%k2}
+
+// CHECK: vmovdqu8 %xmm23, %xmm26 {%k2} {z}
+// CHECK:  encoding: [0x62,0x21,0x7f,0x8a,0x6f,0xd7]
+          vmovdqu8 %xmm23, %xmm26 {%k2} {z}
+
+// CHECK: vmovdqu8 (%rcx), %xmm26
+// CHECK:  encoding: [0x62,0x61,0x7f,0x08,0x6f,0x11]
+          vmovdqu8 (%rcx), %xmm26
+
+// CHECK: vmovdqu8 291(%rax,%r14,8), %xmm26
+// CHECK:  encoding: [0x62,0x21,0x7f,0x08,0x6f,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu8 291(%rax,%r14,8), %xmm26
+
+// CHECK: vmovdqu8 2032(%rdx), %xmm26
+// CHECK:  encoding: [0x62,0x61,0x7f,0x08,0x6f,0x52,0x7f]
+          vmovdqu8 2032(%rdx), %xmm26
+
+// CHECK: vmovdqu8 2048(%rdx), %xmm26
+// CHECK:  encoding: [0x62,0x61,0x7f,0x08,0x6f,0x92,0x00,0x08,0x00,0x00]
+          vmovdqu8 2048(%rdx), %xmm26
+
+// CHECK: vmovdqu8 -2048(%rdx), %xmm26
+// CHECK:  encoding: [0x62,0x61,0x7f,0x08,0x6f,0x52,0x80]
+          vmovdqu8 -2048(%rdx), %xmm26
+
+// CHECK: vmovdqu8 -2064(%rdx), %xmm26
+// CHECK:  encoding: [0x62,0x61,0x7f,0x08,0x6f,0x92,0xf0,0xf7,0xff,0xff]
+          vmovdqu8 -2064(%rdx), %xmm26
+
+// CHECK: vmovdqu8 %ymm29, %ymm18
+// CHECK:  encoding: [0x62,0x81,0x7f,0x28,0x6f,0xd5]
+          vmovdqu8 %ymm29, %ymm18
+
+// CHECK: vmovdqu8 %ymm29, %ymm18 {%k7}
+// CHECK:  encoding: [0x62,0x81,0x7f,0x2f,0x6f,0xd5]
+          vmovdqu8 %ymm29, %ymm18 {%k7}
+
+// CHECK: vmovdqu8 %ymm29, %ymm18 {%k7} {z}
+// CHECK:  encoding: [0x62,0x81,0x7f,0xaf,0x6f,0xd5]
+          vmovdqu8 %ymm29, %ymm18 {%k7} {z}
+
+// CHECK: vmovdqu8 (%rcx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x6f,0x11]
+          vmovdqu8 (%rcx), %ymm18
+
+// CHECK: vmovdqu8 291(%rax,%r14,8), %ymm18
+// CHECK:  encoding: [0x62,0xa1,0x7f,0x28,0x6f,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu8 291(%rax,%r14,8), %ymm18
+
+// CHECK: vmovdqu8 4064(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x6f,0x52,0x7f]
+          vmovdqu8 4064(%rdx), %ymm18
+
+// CHECK: vmovdqu8 4096(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x6f,0x92,0x00,0x10,0x00,0x00]
+          vmovdqu8 4096(%rdx), %ymm18
+
+// CHECK: vmovdqu8 -4096(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x6f,0x52,0x80]
+          vmovdqu8 -4096(%rdx), %ymm18
+
+// CHECK: vmovdqu8 -4128(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x6f,0x92,0xe0,0xef,0xff,0xff]
+          vmovdqu8 -4128(%rdx), %ymm18
+
+// CHECK: vmovdqu16 %xmm24, %xmm29
+// CHECK:  encoding: [0x62,0x01,0xff,0x08,0x6f,0xe8]
+          vmovdqu16 %xmm24, %xmm29
+
+// CHECK: vmovdqu16 %xmm24, %xmm29 {%k6}
+// CHECK:  encoding: [0x62,0x01,0xff,0x0e,0x6f,0xe8]
+          vmovdqu16 %xmm24, %xmm29 {%k6}
+
+// CHECK: vmovdqu16 %xmm24, %xmm29 {%k6} {z}
+// CHECK:  encoding: [0x62,0x01,0xff,0x8e,0x6f,0xe8]
+          vmovdqu16 %xmm24, %xmm29 {%k6} {z}
+
+// CHECK: vmovdqu16 (%rcx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xff,0x08,0x6f,0x29]
+          vmovdqu16 (%rcx), %xmm29
+
+// CHECK: vmovdqu16 291(%rax,%r14,8), %xmm29
+// CHECK:  encoding: [0x62,0x21,0xff,0x08,0x6f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu16 291(%rax,%r14,8), %xmm29
+
+// CHECK: vmovdqu16 2032(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xff,0x08,0x6f,0x6a,0x7f]
+          vmovdqu16 2032(%rdx), %xmm29
+
+// CHECK: vmovdqu16 2048(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xff,0x08,0x6f,0xaa,0x00,0x08,0x00,0x00]
+          vmovdqu16 2048(%rdx), %xmm29
+
+// CHECK: vmovdqu16 -2048(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xff,0x08,0x6f,0x6a,0x80]
+          vmovdqu16 -2048(%rdx), %xmm29
+
+// CHECK: vmovdqu16 -2064(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xff,0x08,0x6f,0xaa,0xf0,0xf7,0xff,0xff]
+          vmovdqu16 -2064(%rdx), %xmm29
+
+// CHECK: vmovdqu16 %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0x81,0xff,0x28,0x6f,0xf8]
+          vmovdqu16 %ymm24, %ymm23
+
+// CHECK: vmovdqu16 %ymm24, %ymm23 {%k3}
+// CHECK:  encoding: [0x62,0x81,0xff,0x2b,0x6f,0xf8]
+          vmovdqu16 %ymm24, %ymm23 {%k3}
+
+// CHECK: vmovdqu16 %ymm24, %ymm23 {%k3} {z}
+// CHECK:  encoding: [0x62,0x81,0xff,0xab,0x6f,0xf8]
+          vmovdqu16 %ymm24, %ymm23 {%k3} {z}
+
+// CHECK: vmovdqu16 (%rcx), %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xff,0x28,0x6f,0x39]
+          vmovdqu16 (%rcx), %ymm23
+
+// CHECK: vmovdqu16 291(%rax,%r14,8), %ymm23
+// CHECK:  encoding: [0x62,0xa1,0xff,0x28,0x6f,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu16 291(%rax,%r14,8), %ymm23
+
+// CHECK: vmovdqu16 4064(%rdx), %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xff,0x28,0x6f,0x7a,0x7f]
+          vmovdqu16 4064(%rdx), %ymm23
+
+// CHECK: vmovdqu16 4096(%rdx), %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xff,0x28,0x6f,0xba,0x00,0x10,0x00,0x00]
+          vmovdqu16 4096(%rdx), %ymm23
+
+// CHECK: vmovdqu16 -4096(%rdx), %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xff,0x28,0x6f,0x7a,0x80]
+          vmovdqu16 -4096(%rdx), %ymm23
+
+// CHECK: vmovdqu16 -4128(%rdx), %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xff,0x28,0x6f,0xba,0xe0,0xef,0xff,0xff]
+          vmovdqu16 -4128(%rdx), %ymm23
+
+// CHECK: vmovdqu8 %xmm17, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x08,0x7f,0x09]
+          vmovdqu8 %xmm17, (%rcx)
+
+// CHECK: vmovdqu8 %xmm17, (%rcx) {%k4}
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x0c,0x7f,0x09]
+          vmovdqu8 %xmm17, (%rcx) {%k4}
+
+// CHECK: vmovdqu8 %xmm17, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7f,0x08,0x7f,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu8 %xmm17, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu8 %xmm17, 2032(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x08,0x7f,0x4a,0x7f]
+          vmovdqu8 %xmm17, 2032(%rdx)
+
+// CHECK: vmovdqu8 %xmm17, 2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x08,0x7f,0x8a,0x00,0x08,0x00,0x00]
+          vmovdqu8 %xmm17, 2048(%rdx)
+
+// CHECK: vmovdqu8 %xmm17, -2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x08,0x7f,0x4a,0x80]
+          vmovdqu8 %xmm17, -2048(%rdx)
+
+// CHECK: vmovdqu8 %xmm17, -2064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x08,0x7f,0x8a,0xf0,0xf7,0xff,0xff]
+          vmovdqu8 %xmm17, -2064(%rdx)
+
+// CHECK: vmovdqu8 %ymm21, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x7f,0x29]
+          vmovdqu8 %ymm21, (%rcx)
+
+// CHECK: vmovdqu8 %ymm21, (%rcx) {%k1}
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x29,0x7f,0x29]
+          vmovdqu8 %ymm21, (%rcx) {%k1}
+
+// CHECK: vmovdqu8 %ymm21, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7f,0x28,0x7f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu8 %ymm21, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu8 %ymm21, 4064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x7f,0x6a,0x7f]
+          vmovdqu8 %ymm21, 4064(%rdx)
+
+// CHECK: vmovdqu8 %ymm21, 4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x7f,0xaa,0x00,0x10,0x00,0x00]
+          vmovdqu8 %ymm21, 4096(%rdx)
+
+// CHECK: vmovdqu8 %ymm21, -4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x7f,0x6a,0x80]
+          vmovdqu8 %ymm21, -4096(%rdx)
+
+// CHECK: vmovdqu8 %ymm21, -4128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7f,0x28,0x7f,0xaa,0xe0,0xef,0xff,0xff]
+          vmovdqu8 %ymm21, -4128(%rdx)
+
+// CHECK: vmovdqu16 %xmm23, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xff,0x08,0x7f,0x39]
+          vmovdqu16 %xmm23, (%rcx)
+
+// CHECK: vmovdqu16 %xmm23, (%rcx) {%k7}
+// CHECK:  encoding: [0x62,0xe1,0xff,0x0f,0x7f,0x39]
+          vmovdqu16 %xmm23, (%rcx) {%k7}
+
+// CHECK: vmovdqu16 %xmm23, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xff,0x08,0x7f,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu16 %xmm23, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu16 %xmm23, 2032(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xff,0x08,0x7f,0x7a,0x7f]
+          vmovdqu16 %xmm23, 2032(%rdx)
+
+// CHECK: vmovdqu16 %xmm23, 2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xff,0x08,0x7f,0xba,0x00,0x08,0x00,0x00]
+          vmovdqu16 %xmm23, 2048(%rdx)
+
+// CHECK: vmovdqu16 %xmm23, -2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xff,0x08,0x7f,0x7a,0x80]
+          vmovdqu16 %xmm23, -2048(%rdx)
+
+// CHECK: vmovdqu16 %xmm23, -2064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xff,0x08,0x7f,0xba,0xf0,0xf7,0xff,0xff]
+          vmovdqu16 %xmm23, -2064(%rdx)
+
+// CHECK: vmovdqu16 %ymm29, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x28,0x7f,0x29]
+          vmovdqu16 %ymm29, (%rcx)
+
+// CHECK: vmovdqu16 %ymm29, (%rcx) {%k6}
+// CHECK:  encoding: [0x62,0x61,0xff,0x2e,0x7f,0x29]
+          vmovdqu16 %ymm29, (%rcx) {%k6}
+
+// CHECK: vmovdqu16 %ymm29, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0xff,0x28,0x7f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu16 %ymm29, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu16 %ymm29, 4064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x28,0x7f,0x6a,0x7f]
+          vmovdqu16 %ymm29, 4064(%rdx)
+
+// CHECK: vmovdqu16 %ymm29, 4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x28,0x7f,0xaa,0x00,0x10,0x00,0x00]
+          vmovdqu16 %ymm29, 4096(%rdx)
+
+// CHECK: vmovdqu16 %ymm29, -4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x28,0x7f,0x6a,0x80]
+          vmovdqu16 %ymm29, -4096(%rdx)
+
+// CHECK: vmovdqu16 %ymm29, -4128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xff,0x28,0x7f,0xaa,0xe0,0xef,0xff,0xff]
+          vmovdqu16 %ymm29, -4128(%rdx)

diff --git a/test/MC/X86/x86-64-avx512dq.s b/test/MC/X86/x86-64-avx512dq.s
new file mode 100644
index 0000000..aac1765
--- /dev/null
+++ b/test/MC/X86/x86-64-avx512dq.s

@@ -0,0 +1,129 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq  --show-encoding %s | FileCheck %s
+
+// CHECK: vpmullq %zmm18, %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xa2,0xbd,0x40,0x40,0xd2]
+          vpmullq %zmm18, %zmm24, %zmm18
+
+// CHECK: vpmullq %zmm18, %zmm24, %zmm18 {%k2}
+// CHECK:  encoding: [0x62,0xa2,0xbd,0x42,0x40,0xd2]
+          vpmullq %zmm18, %zmm24, %zmm18 {%k2}
+
+// CHECK: vpmullq %zmm18, %zmm24, %zmm18 {%k2} {z}
+// CHECK:  encoding: [0x62,0xa2,0xbd,0xc2,0x40,0xd2]
+          vpmullq %zmm18, %zmm24, %zmm18 {%k2} {z}
+
+// CHECK: vpmullq (%rcx), %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x40,0x40,0x11]
+          vpmullq (%rcx), %zmm24, %zmm18
+
+// CHECK: vpmullq 291(%rax,%r14,8), %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xa2,0xbd,0x40,0x40,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpmullq 291(%rax,%r14,8), %zmm24, %zmm18
+
+// CHECK: vpmullq (%rcx){1to8}, %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x50,0x40,0x11]
+          vpmullq (%rcx){1to8}, %zmm24, %zmm18
+
+// CHECK: vpmullq 8128(%rdx), %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x40,0x40,0x52,0x7f]
+          vpmullq 8128(%rdx), %zmm24, %zmm18
+
+// CHECK: vpmullq 8192(%rdx), %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x40,0x40,0x92,0x00,0x20,0x00,0x00]
+          vpmullq 8192(%rdx), %zmm24, %zmm18
+
+// CHECK: vpmullq -8192(%rdx), %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x40,0x40,0x52,0x80]
+          vpmullq -8192(%rdx), %zmm24, %zmm18
+
+// CHECK: vpmullq -8256(%rdx), %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x40,0x40,0x92,0xc0,0xdf,0xff,0xff]
+          vpmullq -8256(%rdx), %zmm24, %zmm18
+
+// CHECK: vpmullq 1016(%rdx){1to8}, %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x50,0x40,0x52,0x7f]
+          vpmullq 1016(%rdx){1to8}, %zmm24, %zmm18
+
+// CHECK: vpmullq 1024(%rdx){1to8}, %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x50,0x40,0x92,0x00,0x04,0x00,0x00]
+          vpmullq 1024(%rdx){1to8}, %zmm24, %zmm18
+
+// CHECK: vpmullq -1024(%rdx){1to8}, %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x50,0x40,0x52,0x80]
+          vpmullq -1024(%rdx){1to8}, %zmm24, %zmm18
+
+// CHECK: vpmullq -1032(%rdx){1to8}, %zmm24, %zmm18
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x50,0x40,0x92,0xf8,0xfb,0xff,0xff]
+          vpmullq -1032(%rdx){1to8}, %zmm24, %zmm18
+
+// CHECK: kandb  %k6, %k5, %k2
+// CHECK:  encoding: [0xc5,0xd5,0x41,0xd6]
+          kandb  %k6, %k5, %k2
+
+// CHECK: kandnb %k4, %k6, %k5
+// CHECK:  encoding: [0xc5,0xcd,0x42,0xec]
+          kandnb %k4, %k6, %k5
+
+// CHECK: korb   %k5, %k4, %k4
+// CHECK:  encoding: [0xc5,0xdd,0x45,0xe5]
+          korb   %k5, %k4, %k4
+
+// CHECK: kxnorb %k7, %k6, %k4
+// CHECK:  encoding: [0xc5,0xcd,0x46,0xe7]
+          kxnorb %k7, %k6, %k4
+
+// CHECK: kxorb  %k5, %k6, %k4
+// CHECK:  encoding: [0xc5,0xcd,0x47,0xe5]
+          kxorb  %k5, %k6, %k4
+
+// CHECK: knotb  %k4, %k5
+// CHECK:  encoding: [0xc5,0xf9,0x44,0xec]
+          knotb  %k4, %k5
+
+// CHECK: knotb  %k3, %k3
+// CHECK:  encoding: [0xc5,0xf9,0x44,0xdb]
+          knotb  %k3, %k3
+
+// CHECK: kmovb  %k3, %k5
+// CHECK:  encoding: [0xc5,0xf9,0x90,0xeb]
+          kmovb  %k3, %k5
+
+// CHECK: kmovb  (%rcx), %k5
+// CHECK:  encoding: [0xc5,0xf9,0x90,0x29]
+          kmovb  (%rcx), %k5
+
+// CHECK: kmovb  4660(%rax,%r14,8), %k5
+// CHECK:  encoding: [0xc4,0xa1,0x79,0x90,0xac,0xf0,0x34,0x12,0x00,0x00]
+          kmovb  4660(%rax,%r14,8), %k5
+
+// CHECK: kmovb  %k2, (%rcx)
+// CHECK:  encoding: [0xc5,0xf9,0x91,0x11]
+          kmovb  %k2, (%rcx)
+
+// CHECK: kmovb  %k2, 4660(%rax,%r14,8)
+// CHECK:  encoding: [0xc4,0xa1,0x79,0x91,0x94,0xf0,0x34,0x12,0x00,0x00]
+          kmovb  %k2, 4660(%rax,%r14,8)
+
+// CHECK: kmovb  %eax, %k2
+// CHECK:  encoding: [0xc5,0xf9,0x92,0xd0]
+          kmovb  %eax, %k2
+
+// CHECK: kmovb  %ebp, %k2
+// CHECK:  encoding: [0xc5,0xf9,0x92,0xd5]
+          kmovb  %ebp, %k2
+
+// CHECK: kmovb  %r13d, %k2
+// CHECK:  encoding: [0xc4,0xc1,0x79,0x92,0xd5]
+          kmovb  %r13d, %k2
+
+// CHECK: kmovb  %k3, %eax
+// CHECK:  encoding: [0xc5,0xf9,0x93,0xc3]
+          kmovb  %k3, %eax
+
+// CHECK: kmovb  %k3, %ebp
+// CHECK:  encoding: [0xc5,0xf9,0x93,0xeb]
+          kmovb  %k3, %ebp
+
+// CHECK: kmovb  %k3, %r13d
+// CHECK:  encoding: [0xc5,0x79,0x93,0xeb]
+          kmovb  %k3, %r13d

diff --git a/test/MC/X86/x86-64-avx512dq_vl.s b/test/MC/X86/x86-64-avx512dq_vl.s
new file mode 100644
index 0000000..38aab78
--- /dev/null
+++ b/test/MC/X86/x86-64-avx512dq_vl.s

@@ -0,0 +1,113 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq -mattr=+avx512vl  --show-encoding %s | FileCheck %s
+
+// CHECK: vpmullq %xmm22, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x22,0xf5,0x00,0x40,0xd6]
+          vpmullq %xmm22, %xmm17, %xmm26
+
+// CHECK: vpmullq %xmm22, %xmm17, %xmm26 {%k6}
+// CHECK:  encoding: [0x62,0x22,0xf5,0x06,0x40,0xd6]
+          vpmullq %xmm22, %xmm17, %xmm26 {%k6}
+
+// CHECK: vpmullq %xmm22, %xmm17, %xmm26 {%k6} {z}
+// CHECK:  encoding: [0x62,0x22,0xf5,0x86,0x40,0xd6]
+          vpmullq %xmm22, %xmm17, %xmm26 {%k6} {z}
+
+// CHECK: vpmullq (%rcx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x00,0x40,0x11]
+          vpmullq (%rcx), %xmm17, %xmm26
+
+// CHECK: vpmullq 291(%rax,%r14,8), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x22,0xf5,0x00,0x40,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpmullq 291(%rax,%r14,8), %xmm17, %xmm26
+
+// CHECK: vpmullq (%rcx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x10,0x40,0x11]
+          vpmullq (%rcx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpmullq 2032(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x00,0x40,0x52,0x7f]
+          vpmullq 2032(%rdx), %xmm17, %xmm26
+
+// CHECK: vpmullq 2048(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x00,0x40,0x92,0x00,0x08,0x00,0x00]
+          vpmullq 2048(%rdx), %xmm17, %xmm26
+
+// CHECK: vpmullq -2048(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x00,0x40,0x52,0x80]
+          vpmullq -2048(%rdx), %xmm17, %xmm26
+
+// CHECK: vpmullq -2064(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x00,0x40,0x92,0xf0,0xf7,0xff,0xff]
+          vpmullq -2064(%rdx), %xmm17, %xmm26
+
+// CHECK: vpmullq 1016(%rdx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x10,0x40,0x52,0x7f]
+          vpmullq 1016(%rdx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpmullq 1024(%rdx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x10,0x40,0x92,0x00,0x04,0x00,0x00]
+          vpmullq 1024(%rdx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpmullq -1024(%rdx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x10,0x40,0x52,0x80]
+          vpmullq -1024(%rdx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpmullq -1032(%rdx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xf5,0x10,0x40,0x92,0xf8,0xfb,0xff,0xff]
+          vpmullq -1032(%rdx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpmullq %ymm25, %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x02,0xb5,0x20,0x40,0xc9]
+          vpmullq %ymm25, %ymm25, %ymm25
+
+// CHECK: vpmullq %ymm25, %ymm25, %ymm25 {%k3}
+// CHECK:  encoding: [0x62,0x02,0xb5,0x23,0x40,0xc9]
+          vpmullq %ymm25, %ymm25, %ymm25 {%k3}
+
+// CHECK: vpmullq %ymm25, %ymm25, %ymm25 {%k3} {z}
+// CHECK:  encoding: [0x62,0x02,0xb5,0xa3,0x40,0xc9]
+          vpmullq %ymm25, %ymm25, %ymm25 {%k3} {z}
+
+// CHECK: vpmullq (%rcx), %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x20,0x40,0x09]
+          vpmullq (%rcx), %ymm25, %ymm25
+
+// CHECK: vpmullq 291(%rax,%r14,8), %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x22,0xb5,0x20,0x40,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpmullq 291(%rax,%r14,8), %ymm25, %ymm25
+
+// CHECK: vpmullq (%rcx){1to4}, %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x30,0x40,0x09]
+          vpmullq (%rcx){1to4}, %ymm25, %ymm25
+
+// CHECK: vpmullq 4064(%rdx), %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x20,0x40,0x4a,0x7f]
+          vpmullq 4064(%rdx), %ymm25, %ymm25
+
+// CHECK: vpmullq 4096(%rdx), %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x20,0x40,0x8a,0x00,0x10,0x00,0x00]
+          vpmullq 4096(%rdx), %ymm25, %ymm25
+
+// CHECK: vpmullq -4096(%rdx), %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x20,0x40,0x4a,0x80]
+          vpmullq -4096(%rdx), %ymm25, %ymm25
+
+// CHECK: vpmullq -4128(%rdx), %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x20,0x40,0x8a,0xe0,0xef,0xff,0xff]
+          vpmullq -4128(%rdx), %ymm25, %ymm25
+
+// CHECK: vpmullq 1016(%rdx){1to4}, %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x30,0x40,0x4a,0x7f]
+          vpmullq 1016(%rdx){1to4}, %ymm25, %ymm25
+
+// CHECK: vpmullq 1024(%rdx){1to4}, %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x30,0x40,0x8a,0x00,0x04,0x00,0x00]
+          vpmullq 1024(%rdx){1to4}, %ymm25, %ymm25
+
+// CHECK: vpmullq -1024(%rdx){1to4}, %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x30,0x40,0x4a,0x80]
+          vpmullq -1024(%rdx){1to4}, %ymm25, %ymm25
+
+// CHECK: vpmullq -1032(%rdx){1to4}, %ymm25, %ymm25
+// CHECK:  encoding: [0x62,0x62,0xb5,0x30,0x40,0x8a,0xf8,0xfb,0xff,0xff]
+          vpmullq -1032(%rdx){1to4}, %ymm25, %ymm25

diff --git a/test/MC/X86/x86-64-avx512f_vl.s b/test/MC/X86/x86-64-avx512f_vl.s
new file mode 100644
index 0000000..973a553
--- /dev/null
+++ b/test/MC/X86/x86-64-avx512f_vl.s

@@ -0,0 +1,6581 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=knl -mattr=+avx512vl --show-encoding %s | FileCheck %s
+
+// CHECK: vaddpd %xmm19, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xa1,0x95,0x00,0x58,0xe3]
+          vaddpd %xmm19, %xmm29, %xmm20
+
+// CHECK: vaddpd %xmm19, %xmm29, %xmm20 {%k7}
+// CHECK:  encoding: [0x62,0xa1,0x95,0x07,0x58,0xe3]
+          vaddpd %xmm19, %xmm29, %xmm20 {%k7}
+
+// CHECK: vaddpd %xmm19, %xmm29, %xmm20 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa1,0x95,0x87,0x58,0xe3]
+          vaddpd %xmm19, %xmm29, %xmm20 {%k7} {z}
+
+// CHECK: vaddpd (%rcx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x58,0x21]
+          vaddpd (%rcx), %xmm29, %xmm20
+
+// CHECK: vaddpd 291(%rax,%r14,8), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xa1,0x95,0x00,0x58,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vaddpd 291(%rax,%r14,8), %xmm29, %xmm20
+
+// CHECK: vaddpd (%rcx){1to2}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x58,0x21]
+          vaddpd (%rcx){1to2}, %xmm29, %xmm20
+
+// CHECK: vaddpd 2032(%rdx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x58,0x62,0x7f]
+          vaddpd 2032(%rdx), %xmm29, %xmm20
+
+// CHECK: vaddpd 2048(%rdx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x58,0xa2,0x00,0x08,0x00,0x00]
+          vaddpd 2048(%rdx), %xmm29, %xmm20
+
+// CHECK: vaddpd -2048(%rdx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x58,0x62,0x80]
+          vaddpd -2048(%rdx), %xmm29, %xmm20
+
+// CHECK: vaddpd -2064(%rdx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x58,0xa2,0xf0,0xf7,0xff,0xff]
+          vaddpd -2064(%rdx), %xmm29, %xmm20
+
+// CHECK: vaddpd 1016(%rdx){1to2}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x58,0x62,0x7f]
+          vaddpd 1016(%rdx){1to2}, %xmm29, %xmm20
+
+// CHECK: vaddpd 1024(%rdx){1to2}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x58,0xa2,0x00,0x04,0x00,0x00]
+          vaddpd 1024(%rdx){1to2}, %xmm29, %xmm20
+
+// CHECK: vaddpd -1024(%rdx){1to2}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x58,0x62,0x80]
+          vaddpd -1024(%rdx){1to2}, %xmm29, %xmm20
+
+// CHECK: vaddpd -1032(%rdx){1to2}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x58,0xa2,0xf8,0xfb,0xff,0xff]
+          vaddpd -1032(%rdx){1to2}, %xmm29, %xmm20
+
+// CHECK: vaddpd %ymm26, %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x01,0xad,0x20,0x58,0xe2]
+          vaddpd %ymm26, %ymm26, %ymm28
+
+// CHECK: vaddpd %ymm26, %ymm26, %ymm28 {%k1}
+// CHECK:  encoding: [0x62,0x01,0xad,0x21,0x58,0xe2]
+          vaddpd %ymm26, %ymm26, %ymm28 {%k1}
+
+// CHECK: vaddpd %ymm26, %ymm26, %ymm28 {%k1} {z}
+// CHECK:  encoding: [0x62,0x01,0xad,0xa1,0x58,0xe2]
+          vaddpd %ymm26, %ymm26, %ymm28 {%k1} {z}
+
+// CHECK: vaddpd (%rcx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0x58,0x21]
+          vaddpd (%rcx), %ymm26, %ymm28
+
+// CHECK: vaddpd 291(%rax,%r14,8), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x21,0xad,0x20,0x58,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vaddpd 291(%rax,%r14,8), %ymm26, %ymm28
+
+// CHECK: vaddpd (%rcx){1to4}, %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0x58,0x21]
+          vaddpd (%rcx){1to4}, %ymm26, %ymm28
+
+// CHECK: vaddpd 4064(%rdx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0x58,0x62,0x7f]
+          vaddpd 4064(%rdx), %ymm26, %ymm28
+
+// CHECK: vaddpd 4096(%rdx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0x58,0xa2,0x00,0x10,0x00,0x00]
+          vaddpd 4096(%rdx), %ymm26, %ymm28
+
+// CHECK: vaddpd -4096(%rdx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0x58,0x62,0x80]
+          vaddpd -4096(%rdx), %ymm26, %ymm28
+
+// CHECK: vaddpd -4128(%rdx), %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0x58,0xa2,0xe0,0xef,0xff,0xff]
+          vaddpd -4128(%rdx), %ymm26, %ymm28
+
+// CHECK: vaddpd 1016(%rdx){1to4}, %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0x58,0x62,0x7f]
+          vaddpd 1016(%rdx){1to4}, %ymm26, %ymm28
+
+// CHECK: vaddpd 1024(%rdx){1to4}, %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0x58,0xa2,0x00,0x04,0x00,0x00]
+          vaddpd 1024(%rdx){1to4}, %ymm26, %ymm28
+
+// CHECK: vaddpd -1024(%rdx){1to4}, %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0x58,0x62,0x80]
+          vaddpd -1024(%rdx){1to4}, %ymm26, %ymm28
+
+// CHECK: vaddpd -1032(%rdx){1to4}, %ymm26, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0x58,0xa2,0xf8,0xfb,0xff,0xff]
+          vaddpd -1032(%rdx){1to4}, %ymm26, %ymm28
+
+// CHECK: vaddps %xmm27, %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x01,0x64,0x00,0x58,0xc3]
+          vaddps %xmm27, %xmm19, %xmm24
+
+// CHECK: vaddps %xmm27, %xmm19, %xmm24 {%k4}
+// CHECK:  encoding: [0x62,0x01,0x64,0x04,0x58,0xc3]
+          vaddps %xmm27, %xmm19, %xmm24 {%k4}
+
+// CHECK: vaddps %xmm27, %xmm19, %xmm24 {%k4} {z}
+// CHECK:  encoding: [0x62,0x01,0x64,0x84,0x58,0xc3]
+          vaddps %xmm27, %xmm19, %xmm24 {%k4} {z}
+
+// CHECK: vaddps (%rcx), %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x00,0x58,0x01]
+          vaddps (%rcx), %xmm19, %xmm24
+
+// CHECK: vaddps 291(%rax,%r14,8), %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x21,0x64,0x00,0x58,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vaddps 291(%rax,%r14,8), %xmm19, %xmm24
+
+// CHECK: vaddps (%rcx){1to4}, %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x10,0x58,0x01]
+          vaddps (%rcx){1to4}, %xmm19, %xmm24
+
+// CHECK: vaddps 2032(%rdx), %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x00,0x58,0x42,0x7f]
+          vaddps 2032(%rdx), %xmm19, %xmm24
+
+// CHECK: vaddps 2048(%rdx), %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x00,0x58,0x82,0x00,0x08,0x00,0x00]
+          vaddps 2048(%rdx), %xmm19, %xmm24
+
+// CHECK: vaddps -2048(%rdx), %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x00,0x58,0x42,0x80]
+          vaddps -2048(%rdx), %xmm19, %xmm24
+
+// CHECK: vaddps -2064(%rdx), %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x00,0x58,0x82,0xf0,0xf7,0xff,0xff]
+          vaddps -2064(%rdx), %xmm19, %xmm24
+
+// CHECK: vaddps 508(%rdx){1to4}, %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x10,0x58,0x42,0x7f]
+          vaddps 508(%rdx){1to4}, %xmm19, %xmm24
+
+// CHECK: vaddps 512(%rdx){1to4}, %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x10,0x58,0x82,0x00,0x02,0x00,0x00]
+          vaddps 512(%rdx){1to4}, %xmm19, %xmm24
+
+// CHECK: vaddps -512(%rdx){1to4}, %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x10,0x58,0x42,0x80]
+          vaddps -512(%rdx){1to4}, %xmm19, %xmm24
+
+// CHECK: vaddps -516(%rdx){1to4}, %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x64,0x10,0x58,0x82,0xfc,0xfd,0xff,0xff]
+          vaddps -516(%rdx){1to4}, %xmm19, %xmm24
+
+// CHECK: vaddps %ymm20, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x21,0x2c,0x20,0x58,0xcc]
+          vaddps %ymm20, %ymm26, %ymm25
+
+// CHECK: vaddps %ymm20, %ymm26, %ymm25 {%k4}
+// CHECK:  encoding: [0x62,0x21,0x2c,0x24,0x58,0xcc]
+          vaddps %ymm20, %ymm26, %ymm25 {%k4}
+
+// CHECK: vaddps %ymm20, %ymm26, %ymm25 {%k4} {z}
+// CHECK:  encoding: [0x62,0x21,0x2c,0xa4,0x58,0xcc]
+          vaddps %ymm20, %ymm26, %ymm25 {%k4} {z}
+
+// CHECK: vaddps (%rcx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x58,0x09]
+          vaddps (%rcx), %ymm26, %ymm25
+
+// CHECK: vaddps 291(%rax,%r14,8), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x21,0x2c,0x20,0x58,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vaddps 291(%rax,%r14,8), %ymm26, %ymm25
+
+// CHECK: vaddps (%rcx){1to8}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x58,0x09]
+          vaddps (%rcx){1to8}, %ymm26, %ymm25
+
+// CHECK: vaddps 4064(%rdx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x58,0x4a,0x7f]
+          vaddps 4064(%rdx), %ymm26, %ymm25
+
+// CHECK: vaddps 4096(%rdx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x58,0x8a,0x00,0x10,0x00,0x00]
+          vaddps 4096(%rdx), %ymm26, %ymm25
+
+// CHECK: vaddps -4096(%rdx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x58,0x4a,0x80]
+          vaddps -4096(%rdx), %ymm26, %ymm25
+
+// CHECK: vaddps -4128(%rdx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x58,0x8a,0xe0,0xef,0xff,0xff]
+          vaddps -4128(%rdx), %ymm26, %ymm25
+
+// CHECK: vaddps 508(%rdx){1to8}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x58,0x4a,0x7f]
+          vaddps 508(%rdx){1to8}, %ymm26, %ymm25
+
+// CHECK: vaddps 512(%rdx){1to8}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x58,0x8a,0x00,0x02,0x00,0x00]
+          vaddps 512(%rdx){1to8}, %ymm26, %ymm25
+
+// CHECK: vaddps -512(%rdx){1to8}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x58,0x4a,0x80]
+          vaddps -512(%rdx){1to8}, %ymm26, %ymm25
+
+// CHECK: vaddps -516(%rdx){1to8}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x58,0x8a,0xfc,0xfd,0xff,0xff]
+          vaddps -516(%rdx){1to8}, %ymm26, %ymm25
+
+// CHECK: vbroadcastsd (%rcx), %ymm22
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x19,0x31]
+          vbroadcastsd (%rcx), %ymm22
+
+// CHECK: vbroadcastsd (%rcx), %ymm22 {%k5}
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x2d,0x19,0x31]
+          vbroadcastsd (%rcx), %ymm22 {%k5}
+
+// CHECK: vbroadcastsd (%rcx), %ymm22 {%k5} {z}
+// CHECK:  encoding: [0x62,0xe2,0xfd,0xad,0x19,0x31]
+          vbroadcastsd (%rcx), %ymm22 {%k5} {z}
+
+// CHECK: vbroadcastsd 291(%rax,%r14,8), %ymm22
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x28,0x19,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vbroadcastsd 291(%rax,%r14,8), %ymm22
+
+// CHECK: vbroadcastsd 1016(%rdx), %ymm22
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x19,0x72,0x7f]
+          vbroadcastsd 1016(%rdx), %ymm22
+
+// CHECK: vbroadcastsd 1024(%rdx), %ymm22
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x19,0xb2,0x00,0x04,0x00,0x00]
+          vbroadcastsd 1024(%rdx), %ymm22
+
+// CHECK: vbroadcastsd -1024(%rdx), %ymm22
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x19,0x72,0x80]
+          vbroadcastsd -1024(%rdx), %ymm22
+
+// CHECK: vbroadcastsd -1032(%rdx), %ymm22
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x19,0xb2,0xf8,0xfb,0xff,0xff]
+          vbroadcastsd -1032(%rdx), %ymm22
+
+// CHECK: vbroadcastsd %xmm17, %ymm19
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x28,0x19,0xd9]
+          vbroadcastsd %xmm17, %ymm19
+
+// CHECK: vbroadcastsd %xmm17, %ymm19 {%k6}
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x2e,0x19,0xd9]
+          vbroadcastsd %xmm17, %ymm19 {%k6}
+
+// CHECK: vbroadcastsd %xmm17, %ymm19 {%k6} {z}
+// CHECK:  encoding: [0x62,0xa2,0xfd,0xae,0x19,0xd9]
+          vbroadcastsd %xmm17, %ymm19 {%k6} {z}
+
+// CHECK: vbroadcastss (%rcx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x18,0x29]
+          vbroadcastss (%rcx), %xmm21
+
+// CHECK: vbroadcastss (%rcx), %xmm21 {%k2}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x0a,0x18,0x29]
+          vbroadcastss (%rcx), %xmm21 {%k2}
+
+// CHECK: vbroadcastss (%rcx), %xmm21 {%k2} {z}
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x8a,0x18,0x29]
+          vbroadcastss (%rcx), %xmm21 {%k2} {z}
+
+// CHECK: vbroadcastss 291(%rax,%r14,8), %xmm21
+// CHECK:  encoding: [0x62,0xa2,0x7d,0x08,0x18,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vbroadcastss 291(%rax,%r14,8), %xmm21
+
+// CHECK: vbroadcastss 508(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x18,0x6a,0x7f]
+          vbroadcastss 508(%rdx), %xmm21
+
+// CHECK: vbroadcastss 512(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x18,0xaa,0x00,0x02,0x00,0x00]
+          vbroadcastss 512(%rdx), %xmm21
+
+// CHECK: vbroadcastss -512(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x18,0x6a,0x80]
+          vbroadcastss -512(%rdx), %xmm21
+
+// CHECK: vbroadcastss -516(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x18,0xaa,0xfc,0xfd,0xff,0xff]
+          vbroadcastss -516(%rdx), %xmm21
+
+// CHECK: vbroadcastss (%rcx), %ymm30
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x18,0x31]
+          vbroadcastss (%rcx), %ymm30
+
+// CHECK: vbroadcastss (%rcx), %ymm30 {%k1}
+// CHECK:  encoding: [0x62,0x62,0x7d,0x29,0x18,0x31]
+          vbroadcastss (%rcx), %ymm30 {%k1}
+
+// CHECK: vbroadcastss (%rcx), %ymm30 {%k1} {z}
+// CHECK:  encoding: [0x62,0x62,0x7d,0xa9,0x18,0x31]
+          vbroadcastss (%rcx), %ymm30 {%k1} {z}
+
+// CHECK: vbroadcastss 291(%rax,%r14,8), %ymm30
+// CHECK:  encoding: [0x62,0x22,0x7d,0x28,0x18,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vbroadcastss 291(%rax,%r14,8), %ymm30
+
+// CHECK: vbroadcastss 508(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x18,0x72,0x7f]
+          vbroadcastss 508(%rdx), %ymm30
+
+// CHECK: vbroadcastss 512(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x18,0xb2,0x00,0x02,0x00,0x00]
+          vbroadcastss 512(%rdx), %ymm30
+
+// CHECK: vbroadcastss -512(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x18,0x72,0x80]
+          vbroadcastss -512(%rdx), %ymm30
+
+// CHECK: vbroadcastss -516(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x18,0xb2,0xfc,0xfd,0xff,0xff]
+          vbroadcastss -516(%rdx), %ymm30
+
+// CHECK: vbroadcastss %xmm24, %xmm24
+// CHECK:  encoding: [0x62,0x02,0x7d,0x08,0x18,0xc0]
+          vbroadcastss %xmm24, %xmm24
+
+// CHECK: vbroadcastss %xmm24, %xmm24 {%k2}
+// CHECK:  encoding: [0x62,0x02,0x7d,0x0a,0x18,0xc0]
+          vbroadcastss %xmm24, %xmm24 {%k2}
+
+// CHECK: vbroadcastss %xmm24, %xmm24 {%k2} {z}
+// CHECK:  encoding: [0x62,0x02,0x7d,0x8a,0x18,0xc0]
+          vbroadcastss %xmm24, %xmm24 {%k2} {z}
+
+// CHECK: vbroadcastss %xmm28, %ymm24
+// CHECK:  encoding: [0x62,0x02,0x7d,0x28,0x18,0xc4]
+          vbroadcastss %xmm28, %ymm24
+
+// CHECK: vbroadcastss %xmm28, %ymm24 {%k6}
+// CHECK:  encoding: [0x62,0x02,0x7d,0x2e,0x18,0xc4]
+          vbroadcastss %xmm28, %ymm24 {%k6}
+
+// CHECK: vbroadcastss %xmm28, %ymm24 {%k6} {z}
+// CHECK:  encoding: [0x62,0x02,0x7d,0xae,0x18,0xc4]
+          vbroadcastss %xmm28, %ymm24 {%k6} {z}
+
+// CHECK: vdivpd %xmm27, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0x81,0xed,0x00,0x5e,0xdb]
+          vdivpd %xmm27, %xmm18, %xmm19
+
+// CHECK: vdivpd %xmm27, %xmm18, %xmm19 {%k3}
+// CHECK:  encoding: [0x62,0x81,0xed,0x03,0x5e,0xdb]
+          vdivpd %xmm27, %xmm18, %xmm19 {%k3}
+
+// CHECK: vdivpd %xmm27, %xmm18, %xmm19 {%k3} {z}
+// CHECK:  encoding: [0x62,0x81,0xed,0x83,0x5e,0xdb]
+          vdivpd %xmm27, %xmm18, %xmm19 {%k3} {z}
+
+// CHECK: vdivpd (%rcx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x00,0x5e,0x19]
+          vdivpd (%rcx), %xmm18, %xmm19
+
+// CHECK: vdivpd 291(%rax,%r14,8), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xa1,0xed,0x00,0x5e,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vdivpd 291(%rax,%r14,8), %xmm18, %xmm19
+
+// CHECK: vdivpd (%rcx){1to2}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x10,0x5e,0x19]
+          vdivpd (%rcx){1to2}, %xmm18, %xmm19
+
+// CHECK: vdivpd 2032(%rdx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x00,0x5e,0x5a,0x7f]
+          vdivpd 2032(%rdx), %xmm18, %xmm19
+
+// CHECK: vdivpd 2048(%rdx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x00,0x5e,0x9a,0x00,0x08,0x00,0x00]
+          vdivpd 2048(%rdx), %xmm18, %xmm19
+
+// CHECK: vdivpd -2048(%rdx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x00,0x5e,0x5a,0x80]
+          vdivpd -2048(%rdx), %xmm18, %xmm19
+
+// CHECK: vdivpd -2064(%rdx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x00,0x5e,0x9a,0xf0,0xf7,0xff,0xff]
+          vdivpd -2064(%rdx), %xmm18, %xmm19
+
+// CHECK: vdivpd 1016(%rdx){1to2}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x10,0x5e,0x5a,0x7f]
+          vdivpd 1016(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: vdivpd 1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x10,0x5e,0x9a,0x00,0x04,0x00,0x00]
+          vdivpd 1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: vdivpd -1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x10,0x5e,0x5a,0x80]
+          vdivpd -1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: vdivpd -1032(%rdx){1to2}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0xed,0x10,0x5e,0x9a,0xf8,0xfb,0xff,0xff]
+          vdivpd -1032(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: vdivpd %ymm28, %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0x81,0xbd,0x20,0x5e,0xfc]
+          vdivpd %ymm28, %ymm24, %ymm23
+
+// CHECK: vdivpd %ymm28, %ymm24, %ymm23 {%k6}
+// CHECK:  encoding: [0x62,0x81,0xbd,0x26,0x5e,0xfc]
+          vdivpd %ymm28, %ymm24, %ymm23 {%k6}
+
+// CHECK: vdivpd %ymm28, %ymm24, %ymm23 {%k6} {z}
+// CHECK:  encoding: [0x62,0x81,0xbd,0xa6,0x5e,0xfc]
+          vdivpd %ymm28, %ymm24, %ymm23 {%k6} {z}
+
+// CHECK: vdivpd (%rcx), %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5e,0x39]
+          vdivpd (%rcx), %ymm24, %ymm23
+
+// CHECK: vdivpd 291(%rax,%r14,8), %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xa1,0xbd,0x20,0x5e,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vdivpd 291(%rax,%r14,8), %ymm24, %ymm23
+
+// CHECK: vdivpd (%rcx){1to4}, %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5e,0x39]
+          vdivpd (%rcx){1to4}, %ymm24, %ymm23
+
+// CHECK: vdivpd 4064(%rdx), %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5e,0x7a,0x7f]
+          vdivpd 4064(%rdx), %ymm24, %ymm23
+
+// CHECK: vdivpd 4096(%rdx), %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5e,0xba,0x00,0x10,0x00,0x00]
+          vdivpd 4096(%rdx), %ymm24, %ymm23
+
+// CHECK: vdivpd -4096(%rdx), %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5e,0x7a,0x80]
+          vdivpd -4096(%rdx), %ymm24, %ymm23
+
+// CHECK: vdivpd -4128(%rdx), %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5e,0xba,0xe0,0xef,0xff,0xff]
+          vdivpd -4128(%rdx), %ymm24, %ymm23
+
+// CHECK: vdivpd 1016(%rdx){1to4}, %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5e,0x7a,0x7f]
+          vdivpd 1016(%rdx){1to4}, %ymm24, %ymm23
+
+// CHECK: vdivpd 1024(%rdx){1to4}, %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5e,0xba,0x00,0x04,0x00,0x00]
+          vdivpd 1024(%rdx){1to4}, %ymm24, %ymm23
+
+// CHECK: vdivpd -1024(%rdx){1to4}, %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5e,0x7a,0x80]
+          vdivpd -1024(%rdx){1to4}, %ymm24, %ymm23
+
+// CHECK: vdivpd -1032(%rdx){1to4}, %ymm24, %ymm23
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5e,0xba,0xf8,0xfb,0xff,0xff]
+          vdivpd -1032(%rdx){1to4}, %ymm24, %ymm23
+
+// CHECK: vdivps %xmm26, %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0x81,0x3c,0x00,0x5e,0xca]
+          vdivps %xmm26, %xmm24, %xmm17
+
+// CHECK: vdivps %xmm26, %xmm24, %xmm17 {%k2}
+// CHECK:  encoding: [0x62,0x81,0x3c,0x02,0x5e,0xca]
+          vdivps %xmm26, %xmm24, %xmm17 {%k2}
+
+// CHECK: vdivps %xmm26, %xmm24, %xmm17 {%k2} {z}
+// CHECK:  encoding: [0x62,0x81,0x3c,0x82,0x5e,0xca]
+          vdivps %xmm26, %xmm24, %xmm17 {%k2} {z}
+
+// CHECK: vdivps (%rcx), %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x00,0x5e,0x09]
+          vdivps (%rcx), %xmm24, %xmm17
+
+// CHECK: vdivps 291(%rax,%r14,8), %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0x3c,0x00,0x5e,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vdivps 291(%rax,%r14,8), %xmm24, %xmm17
+
+// CHECK: vdivps (%rcx){1to4}, %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x10,0x5e,0x09]
+          vdivps (%rcx){1to4}, %xmm24, %xmm17
+
+// CHECK: vdivps 2032(%rdx), %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x00,0x5e,0x4a,0x7f]
+          vdivps 2032(%rdx), %xmm24, %xmm17
+
+// CHECK: vdivps 2048(%rdx), %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x00,0x5e,0x8a,0x00,0x08,0x00,0x00]
+          vdivps 2048(%rdx), %xmm24, %xmm17
+
+// CHECK: vdivps -2048(%rdx), %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x00,0x5e,0x4a,0x80]
+          vdivps -2048(%rdx), %xmm24, %xmm17
+
+// CHECK: vdivps -2064(%rdx), %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x00,0x5e,0x8a,0xf0,0xf7,0xff,0xff]
+          vdivps -2064(%rdx), %xmm24, %xmm17
+
+// CHECK: vdivps 508(%rdx){1to4}, %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x10,0x5e,0x4a,0x7f]
+          vdivps 508(%rdx){1to4}, %xmm24, %xmm17
+
+// CHECK: vdivps 512(%rdx){1to4}, %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x10,0x5e,0x8a,0x00,0x02,0x00,0x00]
+          vdivps 512(%rdx){1to4}, %xmm24, %xmm17
+
+// CHECK: vdivps -512(%rdx){1to4}, %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x10,0x5e,0x4a,0x80]
+          vdivps -512(%rdx){1to4}, %xmm24, %xmm17
+
+// CHECK: vdivps -516(%rdx){1to4}, %xmm24, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x3c,0x10,0x5e,0x8a,0xfc,0xfd,0xff,0xff]
+          vdivps -516(%rdx){1to4}, %xmm24, %xmm17
+
+// CHECK: vdivps %ymm17, %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xa1,0x24,0x20,0x5e,0xc9]
+          vdivps %ymm17, %ymm27, %ymm17
+
+// CHECK: vdivps %ymm17, %ymm27, %ymm17 {%k6}
+// CHECK:  encoding: [0x62,0xa1,0x24,0x26,0x5e,0xc9]
+          vdivps %ymm17, %ymm27, %ymm17 {%k6}
+
+// CHECK: vdivps %ymm17, %ymm27, %ymm17 {%k6} {z}
+// CHECK:  encoding: [0x62,0xa1,0x24,0xa6,0x5e,0xc9]
+          vdivps %ymm17, %ymm27, %ymm17 {%k6} {z}
+
+// CHECK: vdivps (%rcx), %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x20,0x5e,0x09]
+          vdivps (%rcx), %ymm27, %ymm17
+
+// CHECK: vdivps 291(%rax,%r14,8), %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xa1,0x24,0x20,0x5e,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vdivps 291(%rax,%r14,8), %ymm27, %ymm17
+
+// CHECK: vdivps (%rcx){1to8}, %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x30,0x5e,0x09]
+          vdivps (%rcx){1to8}, %ymm27, %ymm17
+
+// CHECK: vdivps 4064(%rdx), %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x20,0x5e,0x4a,0x7f]
+          vdivps 4064(%rdx), %ymm27, %ymm17
+
+// CHECK: vdivps 4096(%rdx), %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x20,0x5e,0x8a,0x00,0x10,0x00,0x00]
+          vdivps 4096(%rdx), %ymm27, %ymm17
+
+// CHECK: vdivps -4096(%rdx), %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x20,0x5e,0x4a,0x80]
+          vdivps -4096(%rdx), %ymm27, %ymm17
+
+// CHECK: vdivps -4128(%rdx), %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x20,0x5e,0x8a,0xe0,0xef,0xff,0xff]
+          vdivps -4128(%rdx), %ymm27, %ymm17
+
+// CHECK: vdivps 508(%rdx){1to8}, %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x30,0x5e,0x4a,0x7f]
+          vdivps 508(%rdx){1to8}, %ymm27, %ymm17
+
+// CHECK: vdivps 512(%rdx){1to8}, %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x30,0x5e,0x8a,0x00,0x02,0x00,0x00]
+          vdivps 512(%rdx){1to8}, %ymm27, %ymm17
+
+// CHECK: vdivps -512(%rdx){1to8}, %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x30,0x5e,0x4a,0x80]
+          vdivps -512(%rdx){1to8}, %ymm27, %ymm17
+
+// CHECK: vdivps -516(%rdx){1to8}, %ymm27, %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x24,0x30,0x5e,0x8a,0xfc,0xfd,0xff,0xff]
+          vdivps -516(%rdx){1to8}, %ymm27, %ymm17
+
+// CHECK: vmaxpd %xmm23, %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0x95,0x00,0x5f,0xcf]
+          vmaxpd %xmm23, %xmm29, %xmm17
+
+// CHECK: vmaxpd %xmm23, %xmm29, %xmm17 {%k6}
+// CHECK:  encoding: [0x62,0xa1,0x95,0x06,0x5f,0xcf]
+          vmaxpd %xmm23, %xmm29, %xmm17 {%k6}
+
+// CHECK: vmaxpd %xmm23, %xmm29, %xmm17 {%k6} {z}
+// CHECK:  encoding: [0x62,0xa1,0x95,0x86,0x5f,0xcf]
+          vmaxpd %xmm23, %xmm29, %xmm17 {%k6} {z}
+
+// CHECK: vmaxpd (%rcx), %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x5f,0x09]
+          vmaxpd (%rcx), %xmm29, %xmm17
+
+// CHECK: vmaxpd 291(%rax,%r14,8), %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0x95,0x00,0x5f,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmaxpd 291(%rax,%r14,8), %xmm29, %xmm17
+
+// CHECK: vmaxpd (%rcx){1to2}, %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x5f,0x09]
+          vmaxpd (%rcx){1to2}, %xmm29, %xmm17
+
+// CHECK: vmaxpd 2032(%rdx), %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x5f,0x4a,0x7f]
+          vmaxpd 2032(%rdx), %xmm29, %xmm17
+
+// CHECK: vmaxpd 2048(%rdx), %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x5f,0x8a,0x00,0x08,0x00,0x00]
+          vmaxpd 2048(%rdx), %xmm29, %xmm17
+
+// CHECK: vmaxpd -2048(%rdx), %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x5f,0x4a,0x80]
+          vmaxpd -2048(%rdx), %xmm29, %xmm17
+
+// CHECK: vmaxpd -2064(%rdx), %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x00,0x5f,0x8a,0xf0,0xf7,0xff,0xff]
+          vmaxpd -2064(%rdx), %xmm29, %xmm17
+
+// CHECK: vmaxpd 1016(%rdx){1to2}, %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x5f,0x4a,0x7f]
+          vmaxpd 1016(%rdx){1to2}, %xmm29, %xmm17
+
+// CHECK: vmaxpd 1024(%rdx){1to2}, %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x5f,0x8a,0x00,0x04,0x00,0x00]
+          vmaxpd 1024(%rdx){1to2}, %xmm29, %xmm17
+
+// CHECK: vmaxpd -1024(%rdx){1to2}, %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x5f,0x4a,0x80]
+          vmaxpd -1024(%rdx){1to2}, %xmm29, %xmm17
+
+// CHECK: vmaxpd -1032(%rdx){1to2}, %xmm29, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x95,0x10,0x5f,0x8a,0xf8,0xfb,0xff,0xff]
+          vmaxpd -1032(%rdx){1to2}, %xmm29, %xmm17
+
+// CHECK: vmaxpd %ymm24, %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0x81,0xbd,0x20,0x5f,0xe8]
+          vmaxpd %ymm24, %ymm24, %ymm21
+
+// CHECK: vmaxpd %ymm24, %ymm24, %ymm21 {%k1}
+// CHECK:  encoding: [0x62,0x81,0xbd,0x21,0x5f,0xe8]
+          vmaxpd %ymm24, %ymm24, %ymm21 {%k1}
+
+// CHECK: vmaxpd %ymm24, %ymm24, %ymm21 {%k1} {z}
+// CHECK:  encoding: [0x62,0x81,0xbd,0xa1,0x5f,0xe8]
+          vmaxpd %ymm24, %ymm24, %ymm21 {%k1} {z}
+
+// CHECK: vmaxpd (%rcx), %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5f,0x29]
+          vmaxpd (%rcx), %ymm24, %ymm21
+
+// CHECK: vmaxpd 291(%rax,%r14,8), %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xa1,0xbd,0x20,0x5f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmaxpd 291(%rax,%r14,8), %ymm24, %ymm21
+
+// CHECK: vmaxpd (%rcx){1to4}, %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5f,0x29]
+          vmaxpd (%rcx){1to4}, %ymm24, %ymm21
+
+// CHECK: vmaxpd 4064(%rdx), %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5f,0x6a,0x7f]
+          vmaxpd 4064(%rdx), %ymm24, %ymm21
+
+// CHECK: vmaxpd 4096(%rdx), %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5f,0xaa,0x00,0x10,0x00,0x00]
+          vmaxpd 4096(%rdx), %ymm24, %ymm21
+
+// CHECK: vmaxpd -4096(%rdx), %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5f,0x6a,0x80]
+          vmaxpd -4096(%rdx), %ymm24, %ymm21
+
+// CHECK: vmaxpd -4128(%rdx), %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x20,0x5f,0xaa,0xe0,0xef,0xff,0xff]
+          vmaxpd -4128(%rdx), %ymm24, %ymm21
+
+// CHECK: vmaxpd 1016(%rdx){1to4}, %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5f,0x6a,0x7f]
+          vmaxpd 1016(%rdx){1to4}, %ymm24, %ymm21
+
+// CHECK: vmaxpd 1024(%rdx){1to4}, %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5f,0xaa,0x00,0x04,0x00,0x00]
+          vmaxpd 1024(%rdx){1to4}, %ymm24, %ymm21
+
+// CHECK: vmaxpd -1024(%rdx){1to4}, %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5f,0x6a,0x80]
+          vmaxpd -1024(%rdx){1to4}, %ymm24, %ymm21
+
+// CHECK: vmaxpd -1032(%rdx){1to4}, %ymm24, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xbd,0x30,0x5f,0xaa,0xf8,0xfb,0xff,0xff]
+          vmaxpd -1032(%rdx){1to4}, %ymm24, %ymm21
+
+// CHECK: vmaxps %xmm19, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x21,0x5c,0x00,0x5f,0xc3]
+          vmaxps %xmm19, %xmm20, %xmm24
+
+// CHECK: vmaxps %xmm19, %xmm20, %xmm24 {%k6}
+// CHECK:  encoding: [0x62,0x21,0x5c,0x06,0x5f,0xc3]
+          vmaxps %xmm19, %xmm20, %xmm24 {%k6}
+
+// CHECK: vmaxps %xmm19, %xmm20, %xmm24 {%k6} {z}
+// CHECK:  encoding: [0x62,0x21,0x5c,0x86,0x5f,0xc3]
+          vmaxps %xmm19, %xmm20, %xmm24 {%k6} {z}
+
+// CHECK: vmaxps (%rcx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x00,0x5f,0x01]
+          vmaxps (%rcx), %xmm20, %xmm24
+
+// CHECK: vmaxps 291(%rax,%r14,8), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x21,0x5c,0x00,0x5f,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmaxps 291(%rax,%r14,8), %xmm20, %xmm24
+
+// CHECK: vmaxps (%rcx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x10,0x5f,0x01]
+          vmaxps (%rcx){1to4}, %xmm20, %xmm24
+
+// CHECK: vmaxps 2032(%rdx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x00,0x5f,0x42,0x7f]
+          vmaxps 2032(%rdx), %xmm20, %xmm24
+
+// CHECK: vmaxps 2048(%rdx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x00,0x5f,0x82,0x00,0x08,0x00,0x00]
+          vmaxps 2048(%rdx), %xmm20, %xmm24
+
+// CHECK: vmaxps -2048(%rdx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x00,0x5f,0x42,0x80]
+          vmaxps -2048(%rdx), %xmm20, %xmm24
+
+// CHECK: vmaxps -2064(%rdx), %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x00,0x5f,0x82,0xf0,0xf7,0xff,0xff]
+          vmaxps -2064(%rdx), %xmm20, %xmm24
+
+// CHECK: vmaxps 508(%rdx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x10,0x5f,0x42,0x7f]
+          vmaxps 508(%rdx){1to4}, %xmm20, %xmm24
+
+// CHECK: vmaxps 512(%rdx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x10,0x5f,0x82,0x00,0x02,0x00,0x00]
+          vmaxps 512(%rdx){1to4}, %xmm20, %xmm24
+
+// CHECK: vmaxps -512(%rdx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x10,0x5f,0x42,0x80]
+          vmaxps -512(%rdx){1to4}, %xmm20, %xmm24
+
+// CHECK: vmaxps -516(%rdx){1to4}, %xmm20, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x5c,0x10,0x5f,0x82,0xfc,0xfd,0xff,0xff]
+          vmaxps -516(%rdx){1to4}, %xmm20, %xmm24
+
+// CHECK: vmaxps %ymm17, %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xa1,0x74,0x20,0x5f,0xd9]
+          vmaxps %ymm17, %ymm17, %ymm19
+
+// CHECK: vmaxps %ymm17, %ymm17, %ymm19 {%k3}
+// CHECK:  encoding: [0x62,0xa1,0x74,0x23,0x5f,0xd9]
+          vmaxps %ymm17, %ymm17, %ymm19 {%k3}
+
+// CHECK: vmaxps %ymm17, %ymm17, %ymm19 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa1,0x74,0xa3,0x5f,0xd9]
+          vmaxps %ymm17, %ymm17, %ymm19 {%k3} {z}
+
+// CHECK: vmaxps (%rcx), %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x20,0x5f,0x19]
+          vmaxps (%rcx), %ymm17, %ymm19
+
+// CHECK: vmaxps 291(%rax,%r14,8), %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xa1,0x74,0x20,0x5f,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vmaxps 291(%rax,%r14,8), %ymm17, %ymm19
+
+// CHECK: vmaxps (%rcx){1to8}, %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x30,0x5f,0x19]
+          vmaxps (%rcx){1to8}, %ymm17, %ymm19
+
+// CHECK: vmaxps 4064(%rdx), %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x20,0x5f,0x5a,0x7f]
+          vmaxps 4064(%rdx), %ymm17, %ymm19
+
+// CHECK: vmaxps 4096(%rdx), %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x20,0x5f,0x9a,0x00,0x10,0x00,0x00]
+          vmaxps 4096(%rdx), %ymm17, %ymm19
+
+// CHECK: vmaxps -4096(%rdx), %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x20,0x5f,0x5a,0x80]
+          vmaxps -4096(%rdx), %ymm17, %ymm19
+
+// CHECK: vmaxps -4128(%rdx), %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x20,0x5f,0x9a,0xe0,0xef,0xff,0xff]
+          vmaxps -4128(%rdx), %ymm17, %ymm19
+
+// CHECK: vmaxps 508(%rdx){1to8}, %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x30,0x5f,0x5a,0x7f]
+          vmaxps 508(%rdx){1to8}, %ymm17, %ymm19
+
+// CHECK: vmaxps 512(%rdx){1to8}, %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x30,0x5f,0x9a,0x00,0x02,0x00,0x00]
+          vmaxps 512(%rdx){1to8}, %ymm17, %ymm19
+
+// CHECK: vmaxps -512(%rdx){1to8}, %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x30,0x5f,0x5a,0x80]
+          vmaxps -512(%rdx){1to8}, %ymm17, %ymm19
+
+// CHECK: vmaxps -516(%rdx){1to8}, %ymm17, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x74,0x30,0x5f,0x9a,0xfc,0xfd,0xff,0xff]
+          vmaxps -516(%rdx){1to8}, %ymm17, %ymm19
+
+// CHECK: vminpd %xmm19, %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x21,0xe5,0x00,0x5d,0xdb]
+          vminpd %xmm19, %xmm19, %xmm27
+
+// CHECK: vminpd %xmm19, %xmm19, %xmm27 {%k6}
+// CHECK:  encoding: [0x62,0x21,0xe5,0x06,0x5d,0xdb]
+          vminpd %xmm19, %xmm19, %xmm27 {%k6}
+
+// CHECK: vminpd %xmm19, %xmm19, %xmm27 {%k6} {z}
+// CHECK:  encoding: [0x62,0x21,0xe5,0x86,0x5d,0xdb]
+          vminpd %xmm19, %xmm19, %xmm27 {%k6} {z}
+
+// CHECK: vminpd (%rcx), %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x00,0x5d,0x19]
+          vminpd (%rcx), %xmm19, %xmm27
+
+// CHECK: vminpd 291(%rax,%r14,8), %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x21,0xe5,0x00,0x5d,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vminpd 291(%rax,%r14,8), %xmm19, %xmm27
+
+// CHECK: vminpd (%rcx){1to2}, %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x10,0x5d,0x19]
+          vminpd (%rcx){1to2}, %xmm19, %xmm27
+
+// CHECK: vminpd 2032(%rdx), %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x00,0x5d,0x5a,0x7f]
+          vminpd 2032(%rdx), %xmm19, %xmm27
+
+// CHECK: vminpd 2048(%rdx), %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x00,0x5d,0x9a,0x00,0x08,0x00,0x00]
+          vminpd 2048(%rdx), %xmm19, %xmm27
+
+// CHECK: vminpd -2048(%rdx), %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x00,0x5d,0x5a,0x80]
+          vminpd -2048(%rdx), %xmm19, %xmm27
+
+// CHECK: vminpd -2064(%rdx), %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x00,0x5d,0x9a,0xf0,0xf7,0xff,0xff]
+          vminpd -2064(%rdx), %xmm19, %xmm27
+
+// CHECK: vminpd 1016(%rdx){1to2}, %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x10,0x5d,0x5a,0x7f]
+          vminpd 1016(%rdx){1to2}, %xmm19, %xmm27
+
+// CHECK: vminpd 1024(%rdx){1to2}, %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x10,0x5d,0x9a,0x00,0x04,0x00,0x00]
+          vminpd 1024(%rdx){1to2}, %xmm19, %xmm27
+
+// CHECK: vminpd -1024(%rdx){1to2}, %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x10,0x5d,0x5a,0x80]
+          vminpd -1024(%rdx){1to2}, %xmm19, %xmm27
+
+// CHECK: vminpd -1032(%rdx){1to2}, %xmm19, %xmm27
+// CHECK:  encoding: [0x62,0x61,0xe5,0x10,0x5d,0x9a,0xf8,0xfb,0xff,0xff]
+          vminpd -1032(%rdx){1to2}, %xmm19, %xmm27
+
+// CHECK: vminpd %ymm23, %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x21,0x95,0x20,0x5d,0xc7]
+          vminpd %ymm23, %ymm29, %ymm24
+
+// CHECK: vminpd %ymm23, %ymm29, %ymm24 {%k6}
+// CHECK:  encoding: [0x62,0x21,0x95,0x26,0x5d,0xc7]
+          vminpd %ymm23, %ymm29, %ymm24 {%k6}
+
+// CHECK: vminpd %ymm23, %ymm29, %ymm24 {%k6} {z}
+// CHECK:  encoding: [0x62,0x21,0x95,0xa6,0x5d,0xc7]
+          vminpd %ymm23, %ymm29, %ymm24 {%k6} {z}
+
+// CHECK: vminpd (%rcx), %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x20,0x5d,0x01]
+          vminpd (%rcx), %ymm29, %ymm24
+
+// CHECK: vminpd 291(%rax,%r14,8), %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x21,0x95,0x20,0x5d,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vminpd 291(%rax,%r14,8), %ymm29, %ymm24
+
+// CHECK: vminpd (%rcx){1to4}, %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x30,0x5d,0x01]
+          vminpd (%rcx){1to4}, %ymm29, %ymm24
+
+// CHECK: vminpd 4064(%rdx), %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x20,0x5d,0x42,0x7f]
+          vminpd 4064(%rdx), %ymm29, %ymm24
+
+// CHECK: vminpd 4096(%rdx), %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x20,0x5d,0x82,0x00,0x10,0x00,0x00]
+          vminpd 4096(%rdx), %ymm29, %ymm24
+
+// CHECK: vminpd -4096(%rdx), %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x20,0x5d,0x42,0x80]
+          vminpd -4096(%rdx), %ymm29, %ymm24
+
+// CHECK: vminpd -4128(%rdx), %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x20,0x5d,0x82,0xe0,0xef,0xff,0xff]
+          vminpd -4128(%rdx), %ymm29, %ymm24
+
+// CHECK: vminpd 1016(%rdx){1to4}, %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x30,0x5d,0x42,0x7f]
+          vminpd 1016(%rdx){1to4}, %ymm29, %ymm24
+
+// CHECK: vminpd 1024(%rdx){1to4}, %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x30,0x5d,0x82,0x00,0x04,0x00,0x00]
+          vminpd 1024(%rdx){1to4}, %ymm29, %ymm24
+
+// CHECK: vminpd -1024(%rdx){1to4}, %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x30,0x5d,0x42,0x80]
+          vminpd -1024(%rdx){1to4}, %ymm29, %ymm24
+
+// CHECK: vminpd -1032(%rdx){1to4}, %ymm29, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x95,0x30,0x5d,0x82,0xf8,0xfb,0xff,0xff]
+          vminpd -1032(%rdx){1to4}, %ymm29, %ymm24
+
+// CHECK: vminps %xmm23, %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0x5c,0x00,0x5d,0xcf]
+          vminps %xmm23, %xmm20, %xmm17
+
+// CHECK: vminps %xmm23, %xmm20, %xmm17 {%k1}
+// CHECK:  encoding: [0x62,0xa1,0x5c,0x01,0x5d,0xcf]
+          vminps %xmm23, %xmm20, %xmm17 {%k1}
+
+// CHECK: vminps %xmm23, %xmm20, %xmm17 {%k1} {z}
+// CHECK:  encoding: [0x62,0xa1,0x5c,0x81,0x5d,0xcf]
+          vminps %xmm23, %xmm20, %xmm17 {%k1} {z}
+
+// CHECK: vminps (%rcx), %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x00,0x5d,0x09]
+          vminps (%rcx), %xmm20, %xmm17
+
+// CHECK: vminps 291(%rax,%r14,8), %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0x5c,0x00,0x5d,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vminps 291(%rax,%r14,8), %xmm20, %xmm17
+
+// CHECK: vminps (%rcx){1to4}, %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x10,0x5d,0x09]
+          vminps (%rcx){1to4}, %xmm20, %xmm17
+
+// CHECK: vminps 2032(%rdx), %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x00,0x5d,0x4a,0x7f]
+          vminps 2032(%rdx), %xmm20, %xmm17
+
+// CHECK: vminps 2048(%rdx), %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x00,0x5d,0x8a,0x00,0x08,0x00,0x00]
+          vminps 2048(%rdx), %xmm20, %xmm17
+
+// CHECK: vminps -2048(%rdx), %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x00,0x5d,0x4a,0x80]
+          vminps -2048(%rdx), %xmm20, %xmm17
+
+// CHECK: vminps -2064(%rdx), %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x00,0x5d,0x8a,0xf0,0xf7,0xff,0xff]
+          vminps -2064(%rdx), %xmm20, %xmm17
+
+// CHECK: vminps 508(%rdx){1to4}, %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x10,0x5d,0x4a,0x7f]
+          vminps 508(%rdx){1to4}, %xmm20, %xmm17
+
+// CHECK: vminps 512(%rdx){1to4}, %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x10,0x5d,0x8a,0x00,0x02,0x00,0x00]
+          vminps 512(%rdx){1to4}, %xmm20, %xmm17
+
+// CHECK: vminps -512(%rdx){1to4}, %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x10,0x5d,0x4a,0x80]
+          vminps -512(%rdx){1to4}, %xmm20, %xmm17
+
+// CHECK: vminps -516(%rdx){1to4}, %xmm20, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0x5c,0x10,0x5d,0x8a,0xfc,0xfd,0xff,0xff]
+          vminps -516(%rdx){1to4}, %xmm20, %xmm17
+
+// CHECK: vminps %ymm21, %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x21,0x3c,0x20,0x5d,0xc5]
+          vminps %ymm21, %ymm24, %ymm24
+
+// CHECK: vminps %ymm21, %ymm24, %ymm24 {%k3}
+// CHECK:  encoding: [0x62,0x21,0x3c,0x23,0x5d,0xc5]
+          vminps %ymm21, %ymm24, %ymm24 {%k3}
+
+// CHECK: vminps %ymm21, %ymm24, %ymm24 {%k3} {z}
+// CHECK:  encoding: [0x62,0x21,0x3c,0xa3,0x5d,0xc5]
+          vminps %ymm21, %ymm24, %ymm24 {%k3} {z}
+
+// CHECK: vminps (%rcx), %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x20,0x5d,0x01]
+          vminps (%rcx), %ymm24, %ymm24
+
+// CHECK: vminps 291(%rax,%r14,8), %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x21,0x3c,0x20,0x5d,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vminps 291(%rax,%r14,8), %ymm24, %ymm24
+
+// CHECK: vminps (%rcx){1to8}, %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x30,0x5d,0x01]
+          vminps (%rcx){1to8}, %ymm24, %ymm24
+
+// CHECK: vminps 4064(%rdx), %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x20,0x5d,0x42,0x7f]
+          vminps 4064(%rdx), %ymm24, %ymm24
+
+// CHECK: vminps 4096(%rdx), %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x20,0x5d,0x82,0x00,0x10,0x00,0x00]
+          vminps 4096(%rdx), %ymm24, %ymm24
+
+// CHECK: vminps -4096(%rdx), %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x20,0x5d,0x42,0x80]
+          vminps -4096(%rdx), %ymm24, %ymm24
+
+// CHECK: vminps -4128(%rdx), %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x20,0x5d,0x82,0xe0,0xef,0xff,0xff]
+          vminps -4128(%rdx), %ymm24, %ymm24
+
+// CHECK: vminps 508(%rdx){1to8}, %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x30,0x5d,0x42,0x7f]
+          vminps 508(%rdx){1to8}, %ymm24, %ymm24
+
+// CHECK: vminps 512(%rdx){1to8}, %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x30,0x5d,0x82,0x00,0x02,0x00,0x00]
+          vminps 512(%rdx){1to8}, %ymm24, %ymm24
+
+// CHECK: vminps -512(%rdx){1to8}, %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x30,0x5d,0x42,0x80]
+          vminps -512(%rdx){1to8}, %ymm24, %ymm24
+
+// CHECK: vminps -516(%rdx){1to8}, %ymm24, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x3c,0x30,0x5d,0x82,0xfc,0xfd,0xff,0xff]
+          vminps -516(%rdx){1to8}, %ymm24, %ymm24
+
+// CHECK: vmovapd %xmm21, %xmm21
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x08,0x28,0xed]
+          vmovapd %xmm21, %xmm21
+
+// CHECK: vmovapd %xmm21, %xmm21 {%k3}
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x0b,0x28,0xed]
+          vmovapd %xmm21, %xmm21 {%k3}
+
+// CHECK: vmovapd %xmm21, %xmm21 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x8b,0x28,0xed]
+          vmovapd %xmm21, %xmm21 {%k3} {z}
+
+// CHECK: vmovapd (%rcx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x28,0x29]
+          vmovapd (%rcx), %xmm21
+
+// CHECK: vmovapd 291(%rax,%r14,8), %xmm21
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x08,0x28,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovapd 291(%rax,%r14,8), %xmm21
+
+// CHECK: vmovapd 2032(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x28,0x6a,0x7f]
+          vmovapd 2032(%rdx), %xmm21
+
+// CHECK: vmovapd 2048(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x28,0xaa,0x00,0x08,0x00,0x00]
+          vmovapd 2048(%rdx), %xmm21
+
+// CHECK: vmovapd -2048(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x28,0x6a,0x80]
+          vmovapd -2048(%rdx), %xmm21
+
+// CHECK: vmovapd -2064(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x28,0xaa,0xf0,0xf7,0xff,0xff]
+          vmovapd -2064(%rdx), %xmm21
+
+// CHECK: vmovapd %ymm17, %ymm18
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x28,0x28,0xd1]
+          vmovapd %ymm17, %ymm18
+
+// CHECK: vmovapd %ymm17, %ymm18 {%k2}
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x2a,0x28,0xd1]
+          vmovapd %ymm17, %ymm18 {%k2}
+
+// CHECK: vmovapd %ymm17, %ymm18 {%k2} {z}
+// CHECK:  encoding: [0x62,0xa1,0xfd,0xaa,0x28,0xd1]
+          vmovapd %ymm17, %ymm18 {%k2} {z}
+
+// CHECK: vmovapd (%rcx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x28,0x11]
+          vmovapd (%rcx), %ymm18
+
+// CHECK: vmovapd 291(%rax,%r14,8), %ymm18
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x28,0x28,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovapd 291(%rax,%r14,8), %ymm18
+
+// CHECK: vmovapd 4064(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x28,0x52,0x7f]
+          vmovapd 4064(%rdx), %ymm18
+
+// CHECK: vmovapd 4096(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x28,0x92,0x00,0x10,0x00,0x00]
+          vmovapd 4096(%rdx), %ymm18
+
+// CHECK: vmovapd -4096(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x28,0x52,0x80]
+          vmovapd -4096(%rdx), %ymm18
+
+// CHECK: vmovapd -4128(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x28,0x92,0xe0,0xef,0xff,0xff]
+          vmovapd -4128(%rdx), %ymm18
+
+// CHECK: vmovaps %xmm29, %xmm22
+// CHECK:  encoding: [0x62,0x81,0x7c,0x08,0x28,0xf5]
+          vmovaps %xmm29, %xmm22
+
+// CHECK: vmovaps %xmm29, %xmm22 {%k1}
+// CHECK:  encoding: [0x62,0x81,0x7c,0x09,0x28,0xf5]
+          vmovaps %xmm29, %xmm22 {%k1}
+
+// CHECK: vmovaps %xmm29, %xmm22 {%k1} {z}
+// CHECK:  encoding: [0x62,0x81,0x7c,0x89,0x28,0xf5]
+          vmovaps %xmm29, %xmm22 {%k1} {z}
+
+// CHECK: vmovaps (%rcx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x28,0x31]
+          vmovaps (%rcx), %xmm22
+
+// CHECK: vmovaps 291(%rax,%r14,8), %xmm22
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x08,0x28,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovaps 291(%rax,%r14,8), %xmm22
+
+// CHECK: vmovaps 2032(%rdx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x28,0x72,0x7f]
+          vmovaps 2032(%rdx), %xmm22
+
+// CHECK: vmovaps 2048(%rdx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x28,0xb2,0x00,0x08,0x00,0x00]
+          vmovaps 2048(%rdx), %xmm22
+
+// CHECK: vmovaps -2048(%rdx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x28,0x72,0x80]
+          vmovaps -2048(%rdx), %xmm22
+
+// CHECK: vmovaps -2064(%rdx), %xmm22
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x28,0xb2,0xf0,0xf7,0xff,0xff]
+          vmovaps -2064(%rdx), %xmm22
+
+// CHECK: vmovaps %ymm28, %ymm25
+// CHECK:  encoding: [0x62,0x01,0x7c,0x28,0x28,0xcc]
+          vmovaps %ymm28, %ymm25
+
+// CHECK: vmovaps %ymm28, %ymm25 {%k3}
+// CHECK:  encoding: [0x62,0x01,0x7c,0x2b,0x28,0xcc]
+          vmovaps %ymm28, %ymm25 {%k3}
+
+// CHECK: vmovaps %ymm28, %ymm25 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0x7c,0xab,0x28,0xcc]
+          vmovaps %ymm28, %ymm25 {%k3} {z}
+
+// CHECK: vmovaps (%rcx), %ymm25
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x28,0x09]
+          vmovaps (%rcx), %ymm25
+
+// CHECK: vmovaps 291(%rax,%r14,8), %ymm25
+// CHECK:  encoding: [0x62,0x21,0x7c,0x28,0x28,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovaps 291(%rax,%r14,8), %ymm25
+
+// CHECK: vmovaps 4064(%rdx), %ymm25
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x28,0x4a,0x7f]
+          vmovaps 4064(%rdx), %ymm25
+
+// CHECK: vmovaps 4096(%rdx), %ymm25
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x28,0x8a,0x00,0x10,0x00,0x00]
+          vmovaps 4096(%rdx), %ymm25
+
+// CHECK: vmovaps -4096(%rdx), %ymm25
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x28,0x4a,0x80]
+          vmovaps -4096(%rdx), %ymm25
+
+// CHECK: vmovaps -4128(%rdx), %ymm25
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x28,0x8a,0xe0,0xef,0xff,0xff]
+          vmovaps -4128(%rdx), %ymm25
+
+// CHECK: vmovdqa32 %xmm24, %xmm21
+// CHECK:  encoding: [0x62,0x81,0x7d,0x08,0x6f,0xe8]
+          vmovdqa32 %xmm24, %xmm21
+
+// CHECK: vmovdqa32 %xmm24, %xmm21 {%k6}
+// CHECK:  encoding: [0x62,0x81,0x7d,0x0e,0x6f,0xe8]
+          vmovdqa32 %xmm24, %xmm21 {%k6}
+
+// CHECK: vmovdqa32 %xmm24, %xmm21 {%k6} {z}
+// CHECK:  encoding: [0x62,0x81,0x7d,0x8e,0x6f,0xe8]
+          vmovdqa32 %xmm24, %xmm21 {%k6} {z}
+
+// CHECK: vmovdqa32 (%rcx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0x6f,0x29]
+          vmovdqa32 (%rcx), %xmm21
+
+// CHECK: vmovdqa32 291(%rax,%r14,8), %xmm21
+// CHECK:  encoding: [0x62,0xa1,0x7d,0x08,0x6f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa32 291(%rax,%r14,8), %xmm21
+
+// CHECK: vmovdqa32 2032(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0x6f,0x6a,0x7f]
+          vmovdqa32 2032(%rdx), %xmm21
+
+// CHECK: vmovdqa32 2048(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0x6f,0xaa,0x00,0x08,0x00,0x00]
+          vmovdqa32 2048(%rdx), %xmm21
+
+// CHECK: vmovdqa32 -2048(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0x6f,0x6a,0x80]
+          vmovdqa32 -2048(%rdx), %xmm21
+
+// CHECK: vmovdqa32 -2064(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0x6f,0xaa,0xf0,0xf7,0xff,0xff]
+          vmovdqa32 -2064(%rdx), %xmm21
+
+// CHECK: vmovdqa32 %ymm28, %ymm24
+// CHECK:  encoding: [0x62,0x01,0x7d,0x28,0x6f,0xc4]
+          vmovdqa32 %ymm28, %ymm24
+
+// CHECK: vmovdqa32 %ymm28, %ymm24 {%k5}
+// CHECK:  encoding: [0x62,0x01,0x7d,0x2d,0x6f,0xc4]
+          vmovdqa32 %ymm28, %ymm24 {%k5}
+
+// CHECK: vmovdqa32 %ymm28, %ymm24 {%k5} {z}
+// CHECK:  encoding: [0x62,0x01,0x7d,0xad,0x6f,0xc4]
+          vmovdqa32 %ymm28, %ymm24 {%k5} {z}
+
+// CHECK: vmovdqa32 (%rcx), %ymm24
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x6f,0x01]
+          vmovdqa32 (%rcx), %ymm24
+
+// CHECK: vmovdqa32 291(%rax,%r14,8), %ymm24
+// CHECK:  encoding: [0x62,0x21,0x7d,0x28,0x6f,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa32 291(%rax,%r14,8), %ymm24
+
+// CHECK: vmovdqa32 4064(%rdx), %ymm24
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x6f,0x42,0x7f]
+          vmovdqa32 4064(%rdx), %ymm24
+
+// CHECK: vmovdqa32 4096(%rdx), %ymm24
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x6f,0x82,0x00,0x10,0x00,0x00]
+          vmovdqa32 4096(%rdx), %ymm24
+
+// CHECK: vmovdqa32 -4096(%rdx), %ymm24
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x6f,0x42,0x80]
+          vmovdqa32 -4096(%rdx), %ymm24
+
+// CHECK: vmovdqa32 -4128(%rdx), %ymm24
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x6f,0x82,0xe0,0xef,0xff,0xff]
+          vmovdqa32 -4128(%rdx), %ymm24
+
+// CHECK: vmovdqa64 %xmm24, %xmm27
+// CHECK:  encoding: [0x62,0x01,0xfd,0x08,0x6f,0xd8]
+          vmovdqa64 %xmm24, %xmm27
+
+// CHECK: vmovdqa64 %xmm24, %xmm27 {%k5}
+// CHECK:  encoding: [0x62,0x01,0xfd,0x0d,0x6f,0xd8]
+          vmovdqa64 %xmm24, %xmm27 {%k5}
+
+// CHECK: vmovdqa64 %xmm24, %xmm27 {%k5} {z}
+// CHECK:  encoding: [0x62,0x01,0xfd,0x8d,0x6f,0xd8]
+          vmovdqa64 %xmm24, %xmm27 {%k5} {z}
+
+// CHECK: vmovdqa64 (%rcx), %xmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x6f,0x19]
+          vmovdqa64 (%rcx), %xmm27
+
+// CHECK: vmovdqa64 291(%rax,%r14,8), %xmm27
+// CHECK:  encoding: [0x62,0x21,0xfd,0x08,0x6f,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa64 291(%rax,%r14,8), %xmm27
+
+// CHECK: vmovdqa64 2032(%rdx), %xmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x6f,0x5a,0x7f]
+          vmovdqa64 2032(%rdx), %xmm27
+
+// CHECK: vmovdqa64 2048(%rdx), %xmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x6f,0x9a,0x00,0x08,0x00,0x00]
+          vmovdqa64 2048(%rdx), %xmm27
+
+// CHECK: vmovdqa64 -2048(%rdx), %xmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x6f,0x5a,0x80]
+          vmovdqa64 -2048(%rdx), %xmm27
+
+// CHECK: vmovdqa64 -2064(%rdx), %xmm27
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x6f,0x9a,0xf0,0xf7,0xff,0xff]
+          vmovdqa64 -2064(%rdx), %xmm27
+
+// CHECK: vmovdqa64 %ymm29, %ymm30
+// CHECK:  encoding: [0x62,0x01,0xfd,0x28,0x6f,0xf5]
+          vmovdqa64 %ymm29, %ymm30
+
+// CHECK: vmovdqa64 %ymm29, %ymm30 {%k3}
+// CHECK:  encoding: [0x62,0x01,0xfd,0x2b,0x6f,0xf5]
+          vmovdqa64 %ymm29, %ymm30 {%k3}
+
+// CHECK: vmovdqa64 %ymm29, %ymm30 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0xfd,0xab,0x6f,0xf5]
+          vmovdqa64 %ymm29, %ymm30 {%k3} {z}
+
+// CHECK: vmovdqa64 (%rcx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x6f,0x31]
+          vmovdqa64 (%rcx), %ymm30
+
+// CHECK: vmovdqa64 291(%rax,%r14,8), %ymm30
+// CHECK:  encoding: [0x62,0x21,0xfd,0x28,0x6f,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa64 291(%rax,%r14,8), %ymm30
+
+// CHECK: vmovdqa64 4064(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x6f,0x72,0x7f]
+          vmovdqa64 4064(%rdx), %ymm30
+
+// CHECK: vmovdqa64 4096(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x6f,0xb2,0x00,0x10,0x00,0x00]
+          vmovdqa64 4096(%rdx), %ymm30
+
+// CHECK: vmovdqa64 -4096(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x6f,0x72,0x80]
+          vmovdqa64 -4096(%rdx), %ymm30
+
+// CHECK: vmovdqa64 -4128(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x6f,0xb2,0xe0,0xef,0xff,0xff]
+          vmovdqa64 -4128(%rdx), %ymm30
+
+// CHECK: vmovdqu32 %xmm19, %xmm29
+// CHECK:  encoding: [0x62,0x21,0x7e,0x08,0x6f,0xeb]
+          vmovdqu32 %xmm19, %xmm29
+
+// CHECK: vmovdqu32 %xmm19, %xmm29 {%k6}
+// CHECK:  encoding: [0x62,0x21,0x7e,0x0e,0x6f,0xeb]
+          vmovdqu32 %xmm19, %xmm29 {%k6}
+
+// CHECK: vmovdqu32 %xmm19, %xmm29 {%k6} {z}
+// CHECK:  encoding: [0x62,0x21,0x7e,0x8e,0x6f,0xeb]
+          vmovdqu32 %xmm19, %xmm29 {%k6} {z}
+
+// CHECK: vmovdqu32 (%rcx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0x7e,0x08,0x6f,0x29]
+          vmovdqu32 (%rcx), %xmm29
+
+// CHECK: vmovdqu32 291(%rax,%r14,8), %xmm29
+// CHECK:  encoding: [0x62,0x21,0x7e,0x08,0x6f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu32 291(%rax,%r14,8), %xmm29
+
+// CHECK: vmovdqu32 2032(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0x7e,0x08,0x6f,0x6a,0x7f]
+          vmovdqu32 2032(%rdx), %xmm29
+
+// CHECK: vmovdqu32 2048(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0x7e,0x08,0x6f,0xaa,0x00,0x08,0x00,0x00]
+          vmovdqu32 2048(%rdx), %xmm29
+
+// CHECK: vmovdqu32 -2048(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0x7e,0x08,0x6f,0x6a,0x80]
+          vmovdqu32 -2048(%rdx), %xmm29
+
+// CHECK: vmovdqu32 -2064(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0x7e,0x08,0x6f,0xaa,0xf0,0xf7,0xff,0xff]
+          vmovdqu32 -2064(%rdx), %xmm29
+
+// CHECK: vmovdqu32 %ymm18, %ymm17
+// CHECK:  encoding: [0x62,0xa1,0x7e,0x28,0x6f,0xca]
+          vmovdqu32 %ymm18, %ymm17
+
+// CHECK: vmovdqu32 %ymm18, %ymm17 {%k3}
+// CHECK:  encoding: [0x62,0xa1,0x7e,0x2b,0x6f,0xca]
+          vmovdqu32 %ymm18, %ymm17 {%k3}
+
+// CHECK: vmovdqu32 %ymm18, %ymm17 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa1,0x7e,0xab,0x6f,0xca]
+          vmovdqu32 %ymm18, %ymm17 {%k3} {z}
+
+// CHECK: vmovdqu32 (%rcx), %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x6f,0x09]
+          vmovdqu32 (%rcx), %ymm17
+
+// CHECK: vmovdqu32 291(%rax,%r14,8), %ymm17
+// CHECK:  encoding: [0x62,0xa1,0x7e,0x28,0x6f,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu32 291(%rax,%r14,8), %ymm17
+
+// CHECK: vmovdqu32 4064(%rdx), %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x6f,0x4a,0x7f]
+          vmovdqu32 4064(%rdx), %ymm17
+
+// CHECK: vmovdqu32 4096(%rdx), %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x6f,0x8a,0x00,0x10,0x00,0x00]
+          vmovdqu32 4096(%rdx), %ymm17
+
+// CHECK: vmovdqu32 -4096(%rdx), %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x6f,0x4a,0x80]
+          vmovdqu32 -4096(%rdx), %ymm17
+
+// CHECK: vmovdqu32 -4128(%rdx), %ymm17
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x6f,0x8a,0xe0,0xef,0xff,0xff]
+          vmovdqu32 -4128(%rdx), %ymm17
+
+// CHECK: vmovdqu64 %xmm19, %xmm24
+// CHECK:  encoding: [0x62,0x21,0xfe,0x08,0x6f,0xc3]
+          vmovdqu64 %xmm19, %xmm24
+
+// CHECK: vmovdqu64 %xmm19, %xmm24 {%k5}
+// CHECK:  encoding: [0x62,0x21,0xfe,0x0d,0x6f,0xc3]
+          vmovdqu64 %xmm19, %xmm24 {%k5}
+
+// CHECK: vmovdqu64 %xmm19, %xmm24 {%k5} {z}
+// CHECK:  encoding: [0x62,0x21,0xfe,0x8d,0x6f,0xc3]
+          vmovdqu64 %xmm19, %xmm24 {%k5} {z}
+
+// CHECK: vmovdqu64 (%rcx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfe,0x08,0x6f,0x01]
+          vmovdqu64 (%rcx), %xmm24
+
+// CHECK: vmovdqu64 291(%rax,%r14,8), %xmm24
+// CHECK:  encoding: [0x62,0x21,0xfe,0x08,0x6f,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu64 291(%rax,%r14,8), %xmm24
+
+// CHECK: vmovdqu64 2032(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfe,0x08,0x6f,0x42,0x7f]
+          vmovdqu64 2032(%rdx), %xmm24
+
+// CHECK: vmovdqu64 2048(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfe,0x08,0x6f,0x82,0x00,0x08,0x00,0x00]
+          vmovdqu64 2048(%rdx), %xmm24
+
+// CHECK: vmovdqu64 -2048(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfe,0x08,0x6f,0x42,0x80]
+          vmovdqu64 -2048(%rdx), %xmm24
+
+// CHECK: vmovdqu64 -2064(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfe,0x08,0x6f,0x82,0xf0,0xf7,0xff,0xff]
+          vmovdqu64 -2064(%rdx), %xmm24
+
+// CHECK: vmovdqu64 %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x21,0xfe,0x28,0x6f,0xed]
+          vmovdqu64 %ymm21, %ymm29
+
+// CHECK: vmovdqu64 %ymm21, %ymm29 {%k3}
+// CHECK:  encoding: [0x62,0x21,0xfe,0x2b,0x6f,0xed]
+          vmovdqu64 %ymm21, %ymm29 {%k3}
+
+// CHECK: vmovdqu64 %ymm21, %ymm29 {%k3} {z}
+// CHECK:  encoding: [0x62,0x21,0xfe,0xab,0x6f,0xed]
+          vmovdqu64 %ymm21, %ymm29 {%k3} {z}
+
+// CHECK: vmovdqu64 (%rcx), %ymm29
+// CHECK:  encoding: [0x62,0x61,0xfe,0x28,0x6f,0x29]
+          vmovdqu64 (%rcx), %ymm29
+
+// CHECK: vmovdqu64 291(%rax,%r14,8), %ymm29
+// CHECK:  encoding: [0x62,0x21,0xfe,0x28,0x6f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu64 291(%rax,%r14,8), %ymm29
+
+// CHECK: vmovdqu64 4064(%rdx), %ymm29
+// CHECK:  encoding: [0x62,0x61,0xfe,0x28,0x6f,0x6a,0x7f]
+          vmovdqu64 4064(%rdx), %ymm29
+
+// CHECK: vmovdqu64 4096(%rdx), %ymm29
+// CHECK:  encoding: [0x62,0x61,0xfe,0x28,0x6f,0xaa,0x00,0x10,0x00,0x00]
+          vmovdqu64 4096(%rdx), %ymm29
+
+// CHECK: vmovdqu64 -4096(%rdx), %ymm29
+// CHECK:  encoding: [0x62,0x61,0xfe,0x28,0x6f,0x6a,0x80]
+          vmovdqu64 -4096(%rdx), %ymm29
+
+// CHECK: vmovdqu64 -4128(%rdx), %ymm29
+// CHECK:  encoding: [0x62,0x61,0xfe,0x28,0x6f,0xaa,0xe0,0xef,0xff,0xff]
+          vmovdqu64 -4128(%rdx), %ymm29
+
+// CHECK: vmovntdq %xmm22, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0xe7,0x31]
+          vmovntdq %xmm22, (%rcx)
+
+// CHECK: vmovntdq %xmm22, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7d,0x08,0xe7,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovntdq %xmm22, 291(%rax,%r14,8)
+
+// CHECK: vmovntdq %xmm22, 2032(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0xe7,0x72,0x7f]
+          vmovntdq %xmm22, 2032(%rdx)
+
+// CHECK: vmovntdq %xmm22, 2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0xe7,0xb2,0x00,0x08,0x00,0x00]
+          vmovntdq %xmm22, 2048(%rdx)
+
+// CHECK: vmovntdq %xmm22, -2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0xe7,0x72,0x80]
+          vmovntdq %xmm22, -2048(%rdx)
+
+// CHECK: vmovntdq %xmm22, -2064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x08,0xe7,0xb2,0xf0,0xf7,0xff,0xff]
+          vmovntdq %xmm22, -2064(%rdx)
+
+// CHECK: vmovntdq %ymm19, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x19]
+          vmovntdq %ymm19, (%rcx)
+
+// CHECK: vmovntdq %ymm19, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7d,0x28,0xe7,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vmovntdq %ymm19, 291(%rax,%r14,8)
+
+// CHECK: vmovntdq %ymm19, 4064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x5a,0x7f]
+          vmovntdq %ymm19, 4064(%rdx)
+
+// CHECK: vmovntdq %ymm19, 4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x9a,0x00,0x10,0x00,0x00]
+          vmovntdq %ymm19, 4096(%rdx)
+
+// CHECK: vmovntdq %ymm19, -4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x5a,0x80]
+          vmovntdq %ymm19, -4096(%rdx)
+
+// CHECK: vmovntdq %ymm19, -4128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x9a,0xe0,0xef,0xff,0xff]
+          vmovntdq %ymm19, -4128(%rdx)
+
+// CHECK: vmovntdqa (%rcx), %xmm24
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x2a,0x01]
+          vmovntdqa (%rcx), %xmm24
+
+// CHECK: vmovntdqa 291(%rax,%r14,8), %xmm24
+// CHECK:  encoding: [0x62,0x22,0x7d,0x08,0x2a,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmovntdqa 291(%rax,%r14,8), %xmm24
+
+// CHECK: vmovntdqa 2032(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x2a,0x42,0x7f]
+          vmovntdqa 2032(%rdx), %xmm24
+
+// CHECK: vmovntdqa 2048(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x2a,0x82,0x00,0x08,0x00,0x00]
+          vmovntdqa 2048(%rdx), %xmm24
+
+// CHECK: vmovntdqa -2048(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x2a,0x42,0x80]
+          vmovntdqa -2048(%rdx), %xmm24
+
+// CHECK: vmovntdqa -2064(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x2a,0x82,0xf0,0xf7,0xff,0xff]
+          vmovntdqa -2064(%rdx), %xmm24
+
+// CHECK: vmovntdqa (%rcx), %ymm28
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x2a,0x21]
+          vmovntdqa (%rcx), %ymm28
+
+// CHECK: vmovntdqa 291(%rax,%r14,8), %ymm28
+// CHECK:  encoding: [0x62,0x22,0x7d,0x28,0x2a,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vmovntdqa 291(%rax,%r14,8), %ymm28
+
+// CHECK: vmovntdqa 4064(%rdx), %ymm28
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x2a,0x62,0x7f]
+          vmovntdqa 4064(%rdx), %ymm28
+
+// CHECK: vmovntdqa 4096(%rdx), %ymm28
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x2a,0xa2,0x00,0x10,0x00,0x00]
+          vmovntdqa 4096(%rdx), %ymm28
+
+// CHECK: vmovntdqa -4096(%rdx), %ymm28
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x2a,0x62,0x80]
+          vmovntdqa -4096(%rdx), %ymm28
+
+// CHECK: vmovntdqa -4128(%rdx), %ymm28
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x2a,0xa2,0xe0,0xef,0xff,0xff]
+          vmovntdqa -4128(%rdx), %ymm28
+
+// CHECK: vmovntpd %xmm17, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x09]
+          vmovntpd %xmm17, (%rcx)
+
+// CHECK: vmovntpd %xmm17, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x08,0x2b,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovntpd %xmm17, 291(%rax,%r14,8)
+
+// CHECK: vmovntpd %xmm17, 2032(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x4a,0x7f]
+          vmovntpd %xmm17, 2032(%rdx)
+
+// CHECK: vmovntpd %xmm17, 2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x8a,0x00,0x08,0x00,0x00]
+          vmovntpd %xmm17, 2048(%rdx)
+
+// CHECK: vmovntpd %xmm17, -2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x4a,0x80]
+          vmovntpd %xmm17, -2048(%rdx)
+
+// CHECK: vmovntpd %xmm17, -2064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x8a,0xf0,0xf7,0xff,0xff]
+          vmovntpd %xmm17, -2064(%rdx)
+
+// CHECK: vmovntpd %ymm27, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x2b,0x19]
+          vmovntpd %ymm27, (%rcx)
+
+// CHECK: vmovntpd %ymm27, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0xfd,0x28,0x2b,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vmovntpd %ymm27, 291(%rax,%r14,8)
+
+// CHECK: vmovntpd %ymm27, 4064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x2b,0x5a,0x7f]
+          vmovntpd %ymm27, 4064(%rdx)
+
+// CHECK: vmovntpd %ymm27, 4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x2b,0x9a,0x00,0x10,0x00,0x00]
+          vmovntpd %ymm27, 4096(%rdx)
+
+// CHECK: vmovntpd %ymm27, -4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x2b,0x5a,0x80]
+          vmovntpd %ymm27, -4096(%rdx)
+
+// CHECK: vmovntpd %ymm27, -4128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x2b,0x9a,0xe0,0xef,0xff,0xff]
+          vmovntpd %ymm27, -4128(%rdx)
+
+// CHECK: vmovntps %xmm26, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x2b,0x11]
+          vmovntps %xmm26, (%rcx)
+
+// CHECK: vmovntps %xmm26, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0x7c,0x08,0x2b,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovntps %xmm26, 291(%rax,%r14,8)
+
+// CHECK: vmovntps %xmm26, 2032(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x2b,0x52,0x7f]
+          vmovntps %xmm26, 2032(%rdx)
+
+// CHECK: vmovntps %xmm26, 2048(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x2b,0x92,0x00,0x08,0x00,0x00]
+          vmovntps %xmm26, 2048(%rdx)
+
+// CHECK: vmovntps %xmm26, -2048(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x2b,0x52,0x80]
+          vmovntps %xmm26, -2048(%rdx)
+
+// CHECK: vmovntps %xmm26, -2064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x2b,0x92,0xf0,0xf7,0xff,0xff]
+          vmovntps %xmm26, -2064(%rdx)
+
+// CHECK: vmovntps %ymm28, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x2b,0x21]
+          vmovntps %ymm28, (%rcx)
+
+// CHECK: vmovntps %ymm28, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0x7c,0x28,0x2b,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vmovntps %ymm28, 291(%rax,%r14,8)
+
+// CHECK: vmovntps %ymm28, 4064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x2b,0x62,0x7f]
+          vmovntps %ymm28, 4064(%rdx)
+
+// CHECK: vmovntps %ymm28, 4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x2b,0xa2,0x00,0x10,0x00,0x00]
+          vmovntps %ymm28, 4096(%rdx)
+
+// CHECK: vmovntps %ymm28, -4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x2b,0x62,0x80]
+          vmovntps %ymm28, -4096(%rdx)
+
+// CHECK: vmovntps %ymm28, -4128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x2b,0xa2,0xe0,0xef,0xff,0xff]
+          vmovntps %ymm28, -4128(%rdx)
+
+// CHECK: vmovupd %xmm22, %xmm24
+// CHECK:  encoding: [0x62,0x21,0xfd,0x08,0x10,0xc6]
+          vmovupd %xmm22, %xmm24
+
+// CHECK: vmovupd %xmm22, %xmm24 {%k6}
+// CHECK:  encoding: [0x62,0x21,0xfd,0x0e,0x10,0xc6]
+          vmovupd %xmm22, %xmm24 {%k6}
+
+// CHECK: vmovupd %xmm22, %xmm24 {%k6} {z}
+// CHECK:  encoding: [0x62,0x21,0xfd,0x8e,0x10,0xc6]
+          vmovupd %xmm22, %xmm24 {%k6} {z}
+
+// CHECK: vmovupd (%rcx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x10,0x01]
+          vmovupd (%rcx), %xmm24
+
+// CHECK: vmovupd 291(%rax,%r14,8), %xmm24
+// CHECK:  encoding: [0x62,0x21,0xfd,0x08,0x10,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmovupd 291(%rax,%r14,8), %xmm24
+
+// CHECK: vmovupd 2032(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x10,0x42,0x7f]
+          vmovupd 2032(%rdx), %xmm24
+
+// CHECK: vmovupd 2048(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x10,0x82,0x00,0x08,0x00,0x00]
+          vmovupd 2048(%rdx), %xmm24
+
+// CHECK: vmovupd -2048(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x10,0x42,0x80]
+          vmovupd -2048(%rdx), %xmm24
+
+// CHECK: vmovupd -2064(%rdx), %xmm24
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x10,0x82,0xf0,0xf7,0xff,0xff]
+          vmovupd -2064(%rdx), %xmm24
+
+// CHECK: vmovupd %ymm25, %ymm30
+// CHECK:  encoding: [0x62,0x01,0xfd,0x28,0x10,0xf1]
+          vmovupd %ymm25, %ymm30
+
+// CHECK: vmovupd %ymm25, %ymm30 {%k7}
+// CHECK:  encoding: [0x62,0x01,0xfd,0x2f,0x10,0xf1]
+          vmovupd %ymm25, %ymm30 {%k7}
+
+// CHECK: vmovupd %ymm25, %ymm30 {%k7} {z}
+// CHECK:  encoding: [0x62,0x01,0xfd,0xaf,0x10,0xf1]
+          vmovupd %ymm25, %ymm30 {%k7} {z}
+
+// CHECK: vmovupd (%rcx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x10,0x31]
+          vmovupd (%rcx), %ymm30
+
+// CHECK: vmovupd 291(%rax,%r14,8), %ymm30
+// CHECK:  encoding: [0x62,0x21,0xfd,0x28,0x10,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovupd 291(%rax,%r14,8), %ymm30
+
+// CHECK: vmovupd 4064(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x10,0x72,0x7f]
+          vmovupd 4064(%rdx), %ymm30
+
+// CHECK: vmovupd 4096(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x10,0xb2,0x00,0x10,0x00,0x00]
+          vmovupd 4096(%rdx), %ymm30
+
+// CHECK: vmovupd -4096(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x10,0x72,0x80]
+          vmovupd -4096(%rdx), %ymm30
+
+// CHECK: vmovupd -4128(%rdx), %ymm30
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x10,0xb2,0xe0,0xef,0xff,0xff]
+          vmovupd -4128(%rdx), %ymm30
+
+// CHECK: vmovups %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0x81,0x7c,0x08,0x10,0xe5]
+          vmovups %xmm29, %xmm20
+
+// CHECK: vmovups %xmm29, %xmm20 {%k6}
+// CHECK:  encoding: [0x62,0x81,0x7c,0x0e,0x10,0xe5]
+          vmovups %xmm29, %xmm20 {%k6}
+
+// CHECK: vmovups %xmm29, %xmm20 {%k6} {z}
+// CHECK:  encoding: [0x62,0x81,0x7c,0x8e,0x10,0xe5]
+          vmovups %xmm29, %xmm20 {%k6} {z}
+
+// CHECK: vmovups (%rcx), %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x10,0x21]
+          vmovups (%rcx), %xmm20
+
+// CHECK: vmovups 291(%rax,%r14,8), %xmm20
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x08,0x10,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vmovups 291(%rax,%r14,8), %xmm20
+
+// CHECK: vmovups 2032(%rdx), %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x10,0x62,0x7f]
+          vmovups 2032(%rdx), %xmm20
+
+// CHECK: vmovups 2048(%rdx), %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x10,0xa2,0x00,0x08,0x00,0x00]
+          vmovups 2048(%rdx), %xmm20
+
+// CHECK: vmovups -2048(%rdx), %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x10,0x62,0x80]
+          vmovups -2048(%rdx), %xmm20
+
+// CHECK: vmovups -2064(%rdx), %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x10,0xa2,0xf0,0xf7,0xff,0xff]
+          vmovups -2064(%rdx), %xmm20
+
+// CHECK: vmovups %ymm26, %ymm21
+// CHECK:  encoding: [0x62,0x81,0x7c,0x28,0x10,0xea]
+          vmovups %ymm26, %ymm21
+
+// CHECK: vmovups %ymm26, %ymm21 {%k6}
+// CHECK:  encoding: [0x62,0x81,0x7c,0x2e,0x10,0xea]
+          vmovups %ymm26, %ymm21 {%k6}
+
+// CHECK: vmovups %ymm26, %ymm21 {%k6} {z}
+// CHECK:  encoding: [0x62,0x81,0x7c,0xae,0x10,0xea]
+          vmovups %ymm26, %ymm21 {%k6} {z}
+
+// CHECK: vmovups (%rcx), %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x10,0x29]
+          vmovups (%rcx), %ymm21
+
+// CHECK: vmovups 291(%rax,%r14,8), %ymm21
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x28,0x10,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovups 291(%rax,%r14,8), %ymm21
+
+// CHECK: vmovups 4064(%rdx), %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x10,0x6a,0x7f]
+          vmovups 4064(%rdx), %ymm21
+
+// CHECK: vmovups 4096(%rdx), %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x10,0xaa,0x00,0x10,0x00,0x00]
+          vmovups 4096(%rdx), %ymm21
+
+// CHECK: vmovups -4096(%rdx), %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x10,0x6a,0x80]
+          vmovups -4096(%rdx), %ymm21
+
+// CHECK: vmovups -4128(%rdx), %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x10,0xaa,0xe0,0xef,0xff,0xff]
+          vmovups -4128(%rdx), %ymm21
+
+// CHECK: vmulpd %xmm26, %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0x81,0xf5,0x00,0x59,0xca]
+          vmulpd %xmm26, %xmm17, %xmm17
+
+// CHECK: vmulpd %xmm26, %xmm17, %xmm17 {%k7}
+// CHECK:  encoding: [0x62,0x81,0xf5,0x07,0x59,0xca]
+          vmulpd %xmm26, %xmm17, %xmm17 {%k7}
+
+// CHECK: vmulpd %xmm26, %xmm17, %xmm17 {%k7} {z}
+// CHECK:  encoding: [0x62,0x81,0xf5,0x87,0x59,0xca]
+          vmulpd %xmm26, %xmm17, %xmm17 {%k7} {z}
+
+// CHECK: vmulpd (%rcx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x00,0x59,0x09]
+          vmulpd (%rcx), %xmm17, %xmm17
+
+// CHECK: vmulpd 291(%rax,%r14,8), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xa1,0xf5,0x00,0x59,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmulpd 291(%rax,%r14,8), %xmm17, %xmm17
+
+// CHECK: vmulpd (%rcx){1to2}, %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x10,0x59,0x09]
+          vmulpd (%rcx){1to2}, %xmm17, %xmm17
+
+// CHECK: vmulpd 2032(%rdx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x00,0x59,0x4a,0x7f]
+          vmulpd 2032(%rdx), %xmm17, %xmm17
+
+// CHECK: vmulpd 2048(%rdx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x00,0x59,0x8a,0x00,0x08,0x00,0x00]
+          vmulpd 2048(%rdx), %xmm17, %xmm17
+
+// CHECK: vmulpd -2048(%rdx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x00,0x59,0x4a,0x80]
+          vmulpd -2048(%rdx), %xmm17, %xmm17
+
+// CHECK: vmulpd -2064(%rdx), %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x00,0x59,0x8a,0xf0,0xf7,0xff,0xff]
+          vmulpd -2064(%rdx), %xmm17, %xmm17
+
+// CHECK: vmulpd 1016(%rdx){1to2}, %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x10,0x59,0x4a,0x7f]
+          vmulpd 1016(%rdx){1to2}, %xmm17, %xmm17
+
+// CHECK: vmulpd 1024(%rdx){1to2}, %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x10,0x59,0x8a,0x00,0x04,0x00,0x00]
+          vmulpd 1024(%rdx){1to2}, %xmm17, %xmm17
+
+// CHECK: vmulpd -1024(%rdx){1to2}, %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x10,0x59,0x4a,0x80]
+          vmulpd -1024(%rdx){1to2}, %xmm17, %xmm17
+
+// CHECK: vmulpd -1032(%rdx){1to2}, %xmm17, %xmm17
+// CHECK:  encoding: [0x62,0xe1,0xf5,0x10,0x59,0x8a,0xf8,0xfb,0xff,0xff]
+          vmulpd -1032(%rdx){1to2}, %xmm17, %xmm17
+
+// CHECK: vmulpd %ymm27, %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x01,0xa5,0x20,0x59,0xcb]
+          vmulpd %ymm27, %ymm27, %ymm25
+
+// CHECK: vmulpd %ymm27, %ymm27, %ymm25 {%k3}
+// CHECK:  encoding: [0x62,0x01,0xa5,0x23,0x59,0xcb]
+          vmulpd %ymm27, %ymm27, %ymm25 {%k3}
+
+// CHECK: vmulpd %ymm27, %ymm27, %ymm25 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0xa5,0xa3,0x59,0xcb]
+          vmulpd %ymm27, %ymm27, %ymm25 {%k3} {z}
+
+// CHECK: vmulpd (%rcx), %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x20,0x59,0x09]
+          vmulpd (%rcx), %ymm27, %ymm25
+
+// CHECK: vmulpd 291(%rax,%r14,8), %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x21,0xa5,0x20,0x59,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmulpd 291(%rax,%r14,8), %ymm27, %ymm25
+
+// CHECK: vmulpd (%rcx){1to4}, %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x30,0x59,0x09]
+          vmulpd (%rcx){1to4}, %ymm27, %ymm25
+
+// CHECK: vmulpd 4064(%rdx), %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x20,0x59,0x4a,0x7f]
+          vmulpd 4064(%rdx), %ymm27, %ymm25
+
+// CHECK: vmulpd 4096(%rdx), %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x20,0x59,0x8a,0x00,0x10,0x00,0x00]
+          vmulpd 4096(%rdx), %ymm27, %ymm25
+
+// CHECK: vmulpd -4096(%rdx), %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x20,0x59,0x4a,0x80]
+          vmulpd -4096(%rdx), %ymm27, %ymm25
+
+// CHECK: vmulpd -4128(%rdx), %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x20,0x59,0x8a,0xe0,0xef,0xff,0xff]
+          vmulpd -4128(%rdx), %ymm27, %ymm25
+
+// CHECK: vmulpd 1016(%rdx){1to4}, %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x30,0x59,0x4a,0x7f]
+          vmulpd 1016(%rdx){1to4}, %ymm27, %ymm25
+
+// CHECK: vmulpd 1024(%rdx){1to4}, %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x30,0x59,0x8a,0x00,0x04,0x00,0x00]
+          vmulpd 1024(%rdx){1to4}, %ymm27, %ymm25
+
+// CHECK: vmulpd -1024(%rdx){1to4}, %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x30,0x59,0x4a,0x80]
+          vmulpd -1024(%rdx){1to4}, %ymm27, %ymm25
+
+// CHECK: vmulpd -1032(%rdx){1to4}, %ymm27, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xa5,0x30,0x59,0x8a,0xf8,0xfb,0xff,0xff]
+          vmulpd -1032(%rdx){1to4}, %ymm27, %ymm25
+
+// CHECK: vmulps %xmm21, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x21,0x74,0x00,0x59,0xed]
+          vmulps %xmm21, %xmm17, %xmm29
+
+// CHECK: vmulps %xmm21, %xmm17, %xmm29 {%k2}
+// CHECK:  encoding: [0x62,0x21,0x74,0x02,0x59,0xed]
+          vmulps %xmm21, %xmm17, %xmm29 {%k2}
+
+// CHECK: vmulps %xmm21, %xmm17, %xmm29 {%k2} {z}
+// CHECK:  encoding: [0x62,0x21,0x74,0x82,0x59,0xed]
+          vmulps %xmm21, %xmm17, %xmm29 {%k2} {z}
+
+// CHECK: vmulps (%rcx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x00,0x59,0x29]
+          vmulps (%rcx), %xmm17, %xmm29
+
+// CHECK: vmulps 291(%rax,%r14,8), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x21,0x74,0x00,0x59,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmulps 291(%rax,%r14,8), %xmm17, %xmm29
+
+// CHECK: vmulps (%rcx){1to4}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x10,0x59,0x29]
+          vmulps (%rcx){1to4}, %xmm17, %xmm29
+
+// CHECK: vmulps 2032(%rdx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x00,0x59,0x6a,0x7f]
+          vmulps 2032(%rdx), %xmm17, %xmm29
+
+// CHECK: vmulps 2048(%rdx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x00,0x59,0xaa,0x00,0x08,0x00,0x00]
+          vmulps 2048(%rdx), %xmm17, %xmm29
+
+// CHECK: vmulps -2048(%rdx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x00,0x59,0x6a,0x80]
+          vmulps -2048(%rdx), %xmm17, %xmm29
+
+// CHECK: vmulps -2064(%rdx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x00,0x59,0xaa,0xf0,0xf7,0xff,0xff]
+          vmulps -2064(%rdx), %xmm17, %xmm29
+
+// CHECK: vmulps 508(%rdx){1to4}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x10,0x59,0x6a,0x7f]
+          vmulps 508(%rdx){1to4}, %xmm17, %xmm29
+
+// CHECK: vmulps 512(%rdx){1to4}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x10,0x59,0xaa,0x00,0x02,0x00,0x00]
+          vmulps 512(%rdx){1to4}, %xmm17, %xmm29
+
+// CHECK: vmulps -512(%rdx){1to4}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x10,0x59,0x6a,0x80]
+          vmulps -512(%rdx){1to4}, %xmm17, %xmm29
+
+// CHECK: vmulps -516(%rdx){1to4}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0x74,0x10,0x59,0xaa,0xfc,0xfd,0xff,0xff]
+          vmulps -516(%rdx){1to4}, %xmm17, %xmm29
+
+// CHECK: vmulps %ymm28, %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x01,0x2c,0x20,0x59,0xf4]
+          vmulps %ymm28, %ymm26, %ymm30
+
+// CHECK: vmulps %ymm28, %ymm26, %ymm30 {%k3}
+// CHECK:  encoding: [0x62,0x01,0x2c,0x23,0x59,0xf4]
+          vmulps %ymm28, %ymm26, %ymm30 {%k3}
+
+// CHECK: vmulps %ymm28, %ymm26, %ymm30 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0x2c,0xa3,0x59,0xf4]
+          vmulps %ymm28, %ymm26, %ymm30 {%k3} {z}
+
+// CHECK: vmulps (%rcx), %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x59,0x31]
+          vmulps (%rcx), %ymm26, %ymm30
+
+// CHECK: vmulps 291(%rax,%r14,8), %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x21,0x2c,0x20,0x59,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmulps 291(%rax,%r14,8), %ymm26, %ymm30
+
+// CHECK: vmulps (%rcx){1to8}, %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x59,0x31]
+          vmulps (%rcx){1to8}, %ymm26, %ymm30
+
+// CHECK: vmulps 4064(%rdx), %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x59,0x72,0x7f]
+          vmulps 4064(%rdx), %ymm26, %ymm30
+
+// CHECK: vmulps 4096(%rdx), %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x59,0xb2,0x00,0x10,0x00,0x00]
+          vmulps 4096(%rdx), %ymm26, %ymm30
+
+// CHECK: vmulps -4096(%rdx), %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x59,0x72,0x80]
+          vmulps -4096(%rdx), %ymm26, %ymm30
+
+// CHECK: vmulps -4128(%rdx), %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x20,0x59,0xb2,0xe0,0xef,0xff,0xff]
+          vmulps -4128(%rdx), %ymm26, %ymm30
+
+// CHECK: vmulps 508(%rdx){1to8}, %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x59,0x72,0x7f]
+          vmulps 508(%rdx){1to8}, %ymm26, %ymm30
+
+// CHECK: vmulps 512(%rdx){1to8}, %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x59,0xb2,0x00,0x02,0x00,0x00]
+          vmulps 512(%rdx){1to8}, %ymm26, %ymm30
+
+// CHECK: vmulps -512(%rdx){1to8}, %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x59,0x72,0x80]
+          vmulps -512(%rdx){1to8}, %ymm26, %ymm30
+
+// CHECK: vmulps -516(%rdx){1to8}, %ymm26, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x2c,0x30,0x59,0xb2,0xfc,0xfd,0xff,0xff]
+          vmulps -516(%rdx){1to8}, %ymm26, %ymm30
+
+// CHECK: vpaddd %xmm26, %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0x81,0x65,0x00,0xfe,0xea]
+          vpaddd %xmm26, %xmm19, %xmm21
+
+// CHECK: vpaddd %xmm26, %xmm19, %xmm21 {%k5}
+// CHECK:  encoding: [0x62,0x81,0x65,0x05,0xfe,0xea]
+          vpaddd %xmm26, %xmm19, %xmm21 {%k5}
+
+// CHECK: vpaddd %xmm26, %xmm19, %xmm21 {%k5} {z}
+// CHECK:  encoding: [0x62,0x81,0x65,0x85,0xfe,0xea]
+          vpaddd %xmm26, %xmm19, %xmm21 {%k5} {z}
+
+// CHECK: vpaddd (%rcx), %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x00,0xfe,0x29]
+          vpaddd (%rcx), %xmm19, %xmm21
+
+// CHECK: vpaddd 291(%rax,%r14,8), %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xa1,0x65,0x00,0xfe,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpaddd 291(%rax,%r14,8), %xmm19, %xmm21
+
+// CHECK: vpaddd (%rcx){1to4}, %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x10,0xfe,0x29]
+          vpaddd (%rcx){1to4}, %xmm19, %xmm21
+
+// CHECK: vpaddd 2032(%rdx), %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x00,0xfe,0x6a,0x7f]
+          vpaddd 2032(%rdx), %xmm19, %xmm21
+
+// CHECK: vpaddd 2048(%rdx), %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x00,0xfe,0xaa,0x00,0x08,0x00,0x00]
+          vpaddd 2048(%rdx), %xmm19, %xmm21
+
+// CHECK: vpaddd -2048(%rdx), %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x00,0xfe,0x6a,0x80]
+          vpaddd -2048(%rdx), %xmm19, %xmm21
+
+// CHECK: vpaddd -2064(%rdx), %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x00,0xfe,0xaa,0xf0,0xf7,0xff,0xff]
+          vpaddd -2064(%rdx), %xmm19, %xmm21
+
+// CHECK: vpaddd 508(%rdx){1to4}, %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x10,0xfe,0x6a,0x7f]
+          vpaddd 508(%rdx){1to4}, %xmm19, %xmm21
+
+// CHECK: vpaddd 512(%rdx){1to4}, %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x10,0xfe,0xaa,0x00,0x02,0x00,0x00]
+          vpaddd 512(%rdx){1to4}, %xmm19, %xmm21
+
+// CHECK: vpaddd -512(%rdx){1to4}, %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x10,0xfe,0x6a,0x80]
+          vpaddd -512(%rdx){1to4}, %xmm19, %xmm21
+
+// CHECK: vpaddd -516(%rdx){1to4}, %xmm19, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x65,0x10,0xfe,0xaa,0xfc,0xfd,0xff,0xff]
+          vpaddd -516(%rdx){1to4}, %xmm19, %xmm21
+
+// CHECK: vpaddd %ymm17, %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x21,0x45,0x20,0xfe,0xe9]
+          vpaddd %ymm17, %ymm23, %ymm29
+
+// CHECK: vpaddd %ymm17, %ymm23, %ymm29 {%k5}
+// CHECK:  encoding: [0x62,0x21,0x45,0x25,0xfe,0xe9]
+          vpaddd %ymm17, %ymm23, %ymm29 {%k5}
+
+// CHECK: vpaddd %ymm17, %ymm23, %ymm29 {%k5} {z}
+// CHECK:  encoding: [0x62,0x21,0x45,0xa5,0xfe,0xe9]
+          vpaddd %ymm17, %ymm23, %ymm29 {%k5} {z}
+
+// CHECK: vpaddd (%rcx), %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x20,0xfe,0x29]
+          vpaddd (%rcx), %ymm23, %ymm29
+
+// CHECK: vpaddd 291(%rax,%r14,8), %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x21,0x45,0x20,0xfe,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpaddd 291(%rax,%r14,8), %ymm23, %ymm29
+
+// CHECK: vpaddd (%rcx){1to8}, %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x30,0xfe,0x29]
+          vpaddd (%rcx){1to8}, %ymm23, %ymm29
+
+// CHECK: vpaddd 4064(%rdx), %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x20,0xfe,0x6a,0x7f]
+          vpaddd 4064(%rdx), %ymm23, %ymm29
+
+// CHECK: vpaddd 4096(%rdx), %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x20,0xfe,0xaa,0x00,0x10,0x00,0x00]
+          vpaddd 4096(%rdx), %ymm23, %ymm29
+
+// CHECK: vpaddd -4096(%rdx), %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x20,0xfe,0x6a,0x80]
+          vpaddd -4096(%rdx), %ymm23, %ymm29
+
+// CHECK: vpaddd -4128(%rdx), %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x20,0xfe,0xaa,0xe0,0xef,0xff,0xff]
+          vpaddd -4128(%rdx), %ymm23, %ymm29
+
+// CHECK: vpaddd 508(%rdx){1to8}, %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x30,0xfe,0x6a,0x7f]
+          vpaddd 508(%rdx){1to8}, %ymm23, %ymm29
+
+// CHECK: vpaddd 512(%rdx){1to8}, %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x30,0xfe,0xaa,0x00,0x02,0x00,0x00]
+          vpaddd 512(%rdx){1to8}, %ymm23, %ymm29
+
+// CHECK: vpaddd -512(%rdx){1to8}, %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x30,0xfe,0x6a,0x80]
+          vpaddd -512(%rdx){1to8}, %ymm23, %ymm29
+
+// CHECK: vpaddd -516(%rdx){1to8}, %ymm23, %ymm29
+// CHECK:  encoding: [0x62,0x61,0x45,0x30,0xfe,0xaa,0xfc,0xfd,0xff,0xff]
+          vpaddd -516(%rdx){1to8}, %ymm23, %ymm29
+
+// CHECK: vpaddq %xmm26, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x01,0xf5,0x00,0xd4,0xea]
+          vpaddq %xmm26, %xmm17, %xmm29
+
+// CHECK: vpaddq %xmm26, %xmm17, %xmm29 {%k2}
+// CHECK:  encoding: [0x62,0x01,0xf5,0x02,0xd4,0xea]
+          vpaddq %xmm26, %xmm17, %xmm29 {%k2}
+
+// CHECK: vpaddq %xmm26, %xmm17, %xmm29 {%k2} {z}
+// CHECK:  encoding: [0x62,0x01,0xf5,0x82,0xd4,0xea]
+          vpaddq %xmm26, %xmm17, %xmm29 {%k2} {z}
+
+// CHECK: vpaddq (%rcx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xd4,0x29]
+          vpaddq (%rcx), %xmm17, %xmm29
+
+// CHECK: vpaddq 291(%rax,%r14,8), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x21,0xf5,0x00,0xd4,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpaddq 291(%rax,%r14,8), %xmm17, %xmm29
+
+// CHECK: vpaddq (%rcx){1to2}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xd4,0x29]
+          vpaddq (%rcx){1to2}, %xmm17, %xmm29
+
+// CHECK: vpaddq 2032(%rdx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xd4,0x6a,0x7f]
+          vpaddq 2032(%rdx), %xmm17, %xmm29
+
+// CHECK: vpaddq 2048(%rdx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xd4,0xaa,0x00,0x08,0x00,0x00]
+          vpaddq 2048(%rdx), %xmm17, %xmm29
+
+// CHECK: vpaddq -2048(%rdx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xd4,0x6a,0x80]
+          vpaddq -2048(%rdx), %xmm17, %xmm29
+
+// CHECK: vpaddq -2064(%rdx), %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xd4,0xaa,0xf0,0xf7,0xff,0xff]
+          vpaddq -2064(%rdx), %xmm17, %xmm29
+
+// CHECK: vpaddq 1016(%rdx){1to2}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xd4,0x6a,0x7f]
+          vpaddq 1016(%rdx){1to2}, %xmm17, %xmm29
+
+// CHECK: vpaddq 1024(%rdx){1to2}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xd4,0xaa,0x00,0x04,0x00,0x00]
+          vpaddq 1024(%rdx){1to2}, %xmm17, %xmm29
+
+// CHECK: vpaddq -1024(%rdx){1to2}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xd4,0x6a,0x80]
+          vpaddq -1024(%rdx){1to2}, %xmm17, %xmm29
+
+// CHECK: vpaddq -1032(%rdx){1to2}, %xmm17, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xd4,0xaa,0xf8,0xfb,0xff,0xff]
+          vpaddq -1032(%rdx){1to2}, %xmm17, %xmm29
+
+// CHECK: vpaddq %ymm18, %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xa1,0xb5,0x20,0xd4,0xe2]
+          vpaddq %ymm18, %ymm25, %ymm20
+
+// CHECK: vpaddq %ymm18, %ymm25, %ymm20 {%k6}
+// CHECK:  encoding: [0x62,0xa1,0xb5,0x26,0xd4,0xe2]
+          vpaddq %ymm18, %ymm25, %ymm20 {%k6}
+
+// CHECK: vpaddq %ymm18, %ymm25, %ymm20 {%k6} {z}
+// CHECK:  encoding: [0x62,0xa1,0xb5,0xa6,0xd4,0xe2]
+          vpaddq %ymm18, %ymm25, %ymm20 {%k6} {z}
+
+// CHECK: vpaddq (%rcx), %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x20,0xd4,0x21]
+          vpaddq (%rcx), %ymm25, %ymm20
+
+// CHECK: vpaddq 291(%rax,%r14,8), %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xa1,0xb5,0x20,0xd4,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpaddq 291(%rax,%r14,8), %ymm25, %ymm20
+
+// CHECK: vpaddq (%rcx){1to4}, %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x30,0xd4,0x21]
+          vpaddq (%rcx){1to4}, %ymm25, %ymm20
+
+// CHECK: vpaddq 4064(%rdx), %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x20,0xd4,0x62,0x7f]
+          vpaddq 4064(%rdx), %ymm25, %ymm20
+
+// CHECK: vpaddq 4096(%rdx), %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x20,0xd4,0xa2,0x00,0x10,0x00,0x00]
+          vpaddq 4096(%rdx), %ymm25, %ymm20
+
+// CHECK: vpaddq -4096(%rdx), %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x20,0xd4,0x62,0x80]
+          vpaddq -4096(%rdx), %ymm25, %ymm20
+
+// CHECK: vpaddq -4128(%rdx), %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x20,0xd4,0xa2,0xe0,0xef,0xff,0xff]
+          vpaddq -4128(%rdx), %ymm25, %ymm20
+
+// CHECK: vpaddq 1016(%rdx){1to4}, %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x30,0xd4,0x62,0x7f]
+          vpaddq 1016(%rdx){1to4}, %ymm25, %ymm20
+
+// CHECK: vpaddq 1024(%rdx){1to4}, %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x30,0xd4,0xa2,0x00,0x04,0x00,0x00]
+          vpaddq 1024(%rdx){1to4}, %ymm25, %ymm20
+
+// CHECK: vpaddq -1024(%rdx){1to4}, %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x30,0xd4,0x62,0x80]
+          vpaddq -1024(%rdx){1to4}, %ymm25, %ymm20
+
+// CHECK: vpaddq -1032(%rdx){1to4}, %ymm25, %ymm20
+// CHECK:  encoding: [0x62,0xe1,0xb5,0x30,0xd4,0xa2,0xf8,0xfb,0xff,0xff]
+          vpaddq -1032(%rdx){1to4}, %ymm25, %ymm20
+
+// CHECK: vpandd %xmm18, %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x21,0x2d,0x00,0xdb,0xc2]
+          vpandd %xmm18, %xmm26, %xmm24
+
+// CHECK: vpandd %xmm18, %xmm26, %xmm24 {%k2}
+// CHECK:  encoding: [0x62,0x21,0x2d,0x02,0xdb,0xc2]
+          vpandd %xmm18, %xmm26, %xmm24 {%k2}
+
+// CHECK: vpandd %xmm18, %xmm26, %xmm24 {%k2} {z}
+// CHECK:  encoding: [0x62,0x21,0x2d,0x82,0xdb,0xc2]
+          vpandd %xmm18, %xmm26, %xmm24 {%k2} {z}
+
+// CHECK: vpandd (%rcx), %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xdb,0x01]
+          vpandd (%rcx), %xmm26, %xmm24
+
+// CHECK: vpandd 291(%rax,%r14,8), %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x21,0x2d,0x00,0xdb,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vpandd 291(%rax,%r14,8), %xmm26, %xmm24
+
+// CHECK: vpandd (%rcx){1to4}, %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x10,0xdb,0x01]
+          vpandd (%rcx){1to4}, %xmm26, %xmm24
+
+// CHECK: vpandd 2032(%rdx), %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xdb,0x42,0x7f]
+          vpandd 2032(%rdx), %xmm26, %xmm24
+
+// CHECK: vpandd 2048(%rdx), %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xdb,0x82,0x00,0x08,0x00,0x00]
+          vpandd 2048(%rdx), %xmm26, %xmm24
+
+// CHECK: vpandd -2048(%rdx), %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xdb,0x42,0x80]
+          vpandd -2048(%rdx), %xmm26, %xmm24
+
+// CHECK: vpandd -2064(%rdx), %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x00,0xdb,0x82,0xf0,0xf7,0xff,0xff]
+          vpandd -2064(%rdx), %xmm26, %xmm24
+
+// CHECK: vpandd 508(%rdx){1to4}, %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x10,0xdb,0x42,0x7f]
+          vpandd 508(%rdx){1to4}, %xmm26, %xmm24
+
+// CHECK: vpandd 512(%rdx){1to4}, %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x10,0xdb,0x82,0x00,0x02,0x00,0x00]
+          vpandd 512(%rdx){1to4}, %xmm26, %xmm24
+
+// CHECK: vpandd -512(%rdx){1to4}, %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x10,0xdb,0x42,0x80]
+          vpandd -512(%rdx){1to4}, %xmm26, %xmm24
+
+// CHECK: vpandd -516(%rdx){1to4}, %xmm26, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x10,0xdb,0x82,0xfc,0xfd,0xff,0xff]
+          vpandd -516(%rdx){1to4}, %xmm26, %xmm24
+
+// CHECK: vpandd %ymm20, %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xa1,0x55,0x20,0xdb,0xd4]
+          vpandd %ymm20, %ymm21, %ymm18
+
+// CHECK: vpandd %ymm20, %ymm21, %ymm18 {%k3}
+// CHECK:  encoding: [0x62,0xa1,0x55,0x23,0xdb,0xd4]
+          vpandd %ymm20, %ymm21, %ymm18 {%k3}
+
+// CHECK: vpandd %ymm20, %ymm21, %ymm18 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa1,0x55,0xa3,0xdb,0xd4]
+          vpandd %ymm20, %ymm21, %ymm18 {%k3} {z}
+
+// CHECK: vpandd (%rcx), %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xdb,0x11]
+          vpandd (%rcx), %ymm21, %ymm18
+
+// CHECK: vpandd 291(%rax,%r14,8), %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xa1,0x55,0x20,0xdb,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpandd 291(%rax,%r14,8), %ymm21, %ymm18
+
+// CHECK: vpandd (%rcx){1to8}, %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x30,0xdb,0x11]
+          vpandd (%rcx){1to8}, %ymm21, %ymm18
+
+// CHECK: vpandd 4064(%rdx), %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xdb,0x52,0x7f]
+          vpandd 4064(%rdx), %ymm21, %ymm18
+
+// CHECK: vpandd 4096(%rdx), %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xdb,0x92,0x00,0x10,0x00,0x00]
+          vpandd 4096(%rdx), %ymm21, %ymm18
+
+// CHECK: vpandd -4096(%rdx), %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xdb,0x52,0x80]
+          vpandd -4096(%rdx), %ymm21, %ymm18
+
+// CHECK: vpandd -4128(%rdx), %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x20,0xdb,0x92,0xe0,0xef,0xff,0xff]
+          vpandd -4128(%rdx), %ymm21, %ymm18
+
+// CHECK: vpandd 508(%rdx){1to8}, %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x30,0xdb,0x52,0x7f]
+          vpandd 508(%rdx){1to8}, %ymm21, %ymm18
+
+// CHECK: vpandd 512(%rdx){1to8}, %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x30,0xdb,0x92,0x00,0x02,0x00,0x00]
+          vpandd 512(%rdx){1to8}, %ymm21, %ymm18
+
+// CHECK: vpandd -512(%rdx){1to8}, %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x30,0xdb,0x52,0x80]
+          vpandd -512(%rdx){1to8}, %ymm21, %ymm18
+
+// CHECK: vpandd -516(%rdx){1to8}, %ymm21, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0x55,0x30,0xdb,0x92,0xfc,0xfd,0xff,0xff]
+          vpandd -516(%rdx){1to8}, %ymm21, %ymm18
+
+// CHECK: vpandnd %xmm22, %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x21,0x75,0x00,0xdf,0xc6]
+          vpandnd %xmm22, %xmm17, %xmm24
+
+// CHECK: vpandnd %xmm22, %xmm17, %xmm24 {%k2}
+// CHECK:  encoding: [0x62,0x21,0x75,0x02,0xdf,0xc6]
+          vpandnd %xmm22, %xmm17, %xmm24 {%k2}
+
+// CHECK: vpandnd %xmm22, %xmm17, %xmm24 {%k2} {z}
+// CHECK:  encoding: [0x62,0x21,0x75,0x82,0xdf,0xc6]
+          vpandnd %xmm22, %xmm17, %xmm24 {%k2} {z}
+
+// CHECK: vpandnd (%rcx), %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xdf,0x01]
+          vpandnd (%rcx), %xmm17, %xmm24
+
+// CHECK: vpandnd 291(%rax,%r14,8), %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x21,0x75,0x00,0xdf,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vpandnd 291(%rax,%r14,8), %xmm17, %xmm24
+
+// CHECK: vpandnd (%rcx){1to4}, %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x10,0xdf,0x01]
+          vpandnd (%rcx){1to4}, %xmm17, %xmm24
+
+// CHECK: vpandnd 2032(%rdx), %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xdf,0x42,0x7f]
+          vpandnd 2032(%rdx), %xmm17, %xmm24
+
+// CHECK: vpandnd 2048(%rdx), %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xdf,0x82,0x00,0x08,0x00,0x00]
+          vpandnd 2048(%rdx), %xmm17, %xmm24
+
+// CHECK: vpandnd -2048(%rdx), %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xdf,0x42,0x80]
+          vpandnd -2048(%rdx), %xmm17, %xmm24
+
+// CHECK: vpandnd -2064(%rdx), %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x00,0xdf,0x82,0xf0,0xf7,0xff,0xff]
+          vpandnd -2064(%rdx), %xmm17, %xmm24
+
+// CHECK: vpandnd 508(%rdx){1to4}, %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x10,0xdf,0x42,0x7f]
+          vpandnd 508(%rdx){1to4}, %xmm17, %xmm24
+
+// CHECK: vpandnd 512(%rdx){1to4}, %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x10,0xdf,0x82,0x00,0x02,0x00,0x00]
+          vpandnd 512(%rdx){1to4}, %xmm17, %xmm24
+
+// CHECK: vpandnd -512(%rdx){1to4}, %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x10,0xdf,0x42,0x80]
+          vpandnd -512(%rdx){1to4}, %xmm17, %xmm24
+
+// CHECK: vpandnd -516(%rdx){1to4}, %xmm17, %xmm24
+// CHECK:  encoding: [0x62,0x61,0x75,0x10,0xdf,0x82,0xfc,0xfd,0xff,0xff]
+          vpandnd -516(%rdx){1to4}, %xmm17, %xmm24
+
+// CHECK: vpandnd %ymm17, %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xa1,0x25,0x20,0xdf,0xd9]
+          vpandnd %ymm17, %ymm27, %ymm19
+
+// CHECK: vpandnd %ymm17, %ymm27, %ymm19 {%k2}
+// CHECK:  encoding: [0x62,0xa1,0x25,0x22,0xdf,0xd9]
+          vpandnd %ymm17, %ymm27, %ymm19 {%k2}
+
+// CHECK: vpandnd %ymm17, %ymm27, %ymm19 {%k2} {z}
+// CHECK:  encoding: [0x62,0xa1,0x25,0xa2,0xdf,0xd9]
+          vpandnd %ymm17, %ymm27, %ymm19 {%k2} {z}
+
+// CHECK: vpandnd (%rcx), %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x20,0xdf,0x19]
+          vpandnd (%rcx), %ymm27, %ymm19
+
+// CHECK: vpandnd 291(%rax,%r14,8), %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xa1,0x25,0x20,0xdf,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpandnd 291(%rax,%r14,8), %ymm27, %ymm19
+
+// CHECK: vpandnd (%rcx){1to8}, %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x30,0xdf,0x19]
+          vpandnd (%rcx){1to8}, %ymm27, %ymm19
+
+// CHECK: vpandnd 4064(%rdx), %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x20,0xdf,0x5a,0x7f]
+          vpandnd 4064(%rdx), %ymm27, %ymm19
+
+// CHECK: vpandnd 4096(%rdx), %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x20,0xdf,0x9a,0x00,0x10,0x00,0x00]
+          vpandnd 4096(%rdx), %ymm27, %ymm19
+
+// CHECK: vpandnd -4096(%rdx), %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x20,0xdf,0x5a,0x80]
+          vpandnd -4096(%rdx), %ymm27, %ymm19
+
+// CHECK: vpandnd -4128(%rdx), %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x20,0xdf,0x9a,0xe0,0xef,0xff,0xff]
+          vpandnd -4128(%rdx), %ymm27, %ymm19
+
+// CHECK: vpandnd 508(%rdx){1to8}, %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x30,0xdf,0x5a,0x7f]
+          vpandnd 508(%rdx){1to8}, %ymm27, %ymm19
+
+// CHECK: vpandnd 512(%rdx){1to8}, %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x30,0xdf,0x9a,0x00,0x02,0x00,0x00]
+          vpandnd 512(%rdx){1to8}, %ymm27, %ymm19
+
+// CHECK: vpandnd -512(%rdx){1to8}, %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x30,0xdf,0x5a,0x80]
+          vpandnd -512(%rdx){1to8}, %ymm27, %ymm19
+
+// CHECK: vpandnd -516(%rdx){1to8}, %ymm27, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x25,0x30,0xdf,0x9a,0xfc,0xfd,0xff,0xff]
+          vpandnd -516(%rdx){1to8}, %ymm27, %ymm19
+
+// CHECK: vpandnq %xmm20, %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xa1,0x9d,0x00,0xdf,0xfc]
+          vpandnq %xmm20, %xmm28, %xmm23
+
+// CHECK: vpandnq %xmm20, %xmm28, %xmm23 {%k7}
+// CHECK:  encoding: [0x62,0xa1,0x9d,0x07,0xdf,0xfc]
+          vpandnq %xmm20, %xmm28, %xmm23 {%k7}
+
+// CHECK: vpandnq %xmm20, %xmm28, %xmm23 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa1,0x9d,0x87,0xdf,0xfc]
+          vpandnq %xmm20, %xmm28, %xmm23 {%k7} {z}
+
+// CHECK: vpandnq (%rcx), %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xdf,0x39]
+          vpandnq (%rcx), %xmm28, %xmm23
+
+// CHECK: vpandnq 291(%rax,%r14,8), %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xa1,0x9d,0x00,0xdf,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vpandnq 291(%rax,%r14,8), %xmm28, %xmm23
+
+// CHECK: vpandnq (%rcx){1to2}, %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xdf,0x39]
+          vpandnq (%rcx){1to2}, %xmm28, %xmm23
+
+// CHECK: vpandnq 2032(%rdx), %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xdf,0x7a,0x7f]
+          vpandnq 2032(%rdx), %xmm28, %xmm23
+
+// CHECK: vpandnq 2048(%rdx), %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xdf,0xba,0x00,0x08,0x00,0x00]
+          vpandnq 2048(%rdx), %xmm28, %xmm23
+
+// CHECK: vpandnq -2048(%rdx), %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xdf,0x7a,0x80]
+          vpandnq -2048(%rdx), %xmm28, %xmm23
+
+// CHECK: vpandnq -2064(%rdx), %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xdf,0xba,0xf0,0xf7,0xff,0xff]
+          vpandnq -2064(%rdx), %xmm28, %xmm23
+
+// CHECK: vpandnq 1016(%rdx){1to2}, %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xdf,0x7a,0x7f]
+          vpandnq 1016(%rdx){1to2}, %xmm28, %xmm23
+
+// CHECK: vpandnq 1024(%rdx){1to2}, %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xdf,0xba,0x00,0x04,0x00,0x00]
+          vpandnq 1024(%rdx){1to2}, %xmm28, %xmm23
+
+// CHECK: vpandnq -1024(%rdx){1to2}, %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xdf,0x7a,0x80]
+          vpandnq -1024(%rdx){1to2}, %xmm28, %xmm23
+
+// CHECK: vpandnq -1032(%rdx){1to2}, %xmm28, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xdf,0xba,0xf8,0xfb,0xff,0xff]
+          vpandnq -1032(%rdx){1to2}, %xmm28, %xmm23
+
+// CHECK: vpandnq %ymm28, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x01,0xdd,0x20,0xdf,0xec]
+          vpandnq %ymm28, %ymm20, %ymm29
+
+// CHECK: vpandnq %ymm28, %ymm20, %ymm29 {%k1}
+// CHECK:  encoding: [0x62,0x01,0xdd,0x21,0xdf,0xec]
+          vpandnq %ymm28, %ymm20, %ymm29 {%k1}
+
+// CHECK: vpandnq %ymm28, %ymm20, %ymm29 {%k1} {z}
+// CHECK:  encoding: [0x62,0x01,0xdd,0xa1,0xdf,0xec]
+          vpandnq %ymm28, %ymm20, %ymm29 {%k1} {z}
+
+// CHECK: vpandnq (%rcx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xdf,0x29]
+          vpandnq (%rcx), %ymm20, %ymm29
+
+// CHECK: vpandnq 291(%rax,%r14,8), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x21,0xdd,0x20,0xdf,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpandnq 291(%rax,%r14,8), %ymm20, %ymm29
+
+// CHECK: vpandnq (%rcx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xdf,0x29]
+          vpandnq (%rcx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpandnq 4064(%rdx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xdf,0x6a,0x7f]
+          vpandnq 4064(%rdx), %ymm20, %ymm29
+
+// CHECK: vpandnq 4096(%rdx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xdf,0xaa,0x00,0x10,0x00,0x00]
+          vpandnq 4096(%rdx), %ymm20, %ymm29
+
+// CHECK: vpandnq -4096(%rdx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xdf,0x6a,0x80]
+          vpandnq -4096(%rdx), %ymm20, %ymm29
+
+// CHECK: vpandnq -4128(%rdx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xdf,0xaa,0xe0,0xef,0xff,0xff]
+          vpandnq -4128(%rdx), %ymm20, %ymm29
+
+// CHECK: vpandnq 1016(%rdx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xdf,0x6a,0x7f]
+          vpandnq 1016(%rdx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpandnq 1024(%rdx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xdf,0xaa,0x00,0x04,0x00,0x00]
+          vpandnq 1024(%rdx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpandnq -1024(%rdx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xdf,0x6a,0x80]
+          vpandnq -1024(%rdx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpandnq -1032(%rdx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xdf,0xaa,0xf8,0xfb,0xff,0xff]
+          vpandnq -1032(%rdx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpandq %xmm25, %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0x81,0xe5,0x00,0xdb,0xf1]
+          vpandq %xmm25, %xmm19, %xmm22
+
+// CHECK: vpandq %xmm25, %xmm19, %xmm22 {%k5}
+// CHECK:  encoding: [0x62,0x81,0xe5,0x05,0xdb,0xf1]
+          vpandq %xmm25, %xmm19, %xmm22 {%k5}
+
+// CHECK: vpandq %xmm25, %xmm19, %xmm22 {%k5} {z}
+// CHECK:  encoding: [0x62,0x81,0xe5,0x85,0xdb,0xf1]
+          vpandq %xmm25, %xmm19, %xmm22 {%k5} {z}
+
+// CHECK: vpandq (%rcx), %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x00,0xdb,0x31]
+          vpandq (%rcx), %xmm19, %xmm22
+
+// CHECK: vpandq 291(%rax,%r14,8), %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xa1,0xe5,0x00,0xdb,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vpandq 291(%rax,%r14,8), %xmm19, %xmm22
+
+// CHECK: vpandq (%rcx){1to2}, %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x10,0xdb,0x31]
+          vpandq (%rcx){1to2}, %xmm19, %xmm22
+
+// CHECK: vpandq 2032(%rdx), %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x00,0xdb,0x72,0x7f]
+          vpandq 2032(%rdx), %xmm19, %xmm22
+
+// CHECK: vpandq 2048(%rdx), %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x00,0xdb,0xb2,0x00,0x08,0x00,0x00]
+          vpandq 2048(%rdx), %xmm19, %xmm22
+
+// CHECK: vpandq -2048(%rdx), %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x00,0xdb,0x72,0x80]
+          vpandq -2048(%rdx), %xmm19, %xmm22
+
+// CHECK: vpandq -2064(%rdx), %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x00,0xdb,0xb2,0xf0,0xf7,0xff,0xff]
+          vpandq -2064(%rdx), %xmm19, %xmm22
+
+// CHECK: vpandq 1016(%rdx){1to2}, %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x10,0xdb,0x72,0x7f]
+          vpandq 1016(%rdx){1to2}, %xmm19, %xmm22
+
+// CHECK: vpandq 1024(%rdx){1to2}, %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x10,0xdb,0xb2,0x00,0x04,0x00,0x00]
+          vpandq 1024(%rdx){1to2}, %xmm19, %xmm22
+
+// CHECK: vpandq -1024(%rdx){1to2}, %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x10,0xdb,0x72,0x80]
+          vpandq -1024(%rdx){1to2}, %xmm19, %xmm22
+
+// CHECK: vpandq -1032(%rdx){1to2}, %xmm19, %xmm22
+// CHECK:  encoding: [0x62,0xe1,0xe5,0x10,0xdb,0xb2,0xf8,0xfb,0xff,0xff]
+          vpandq -1032(%rdx){1to2}, %xmm19, %xmm22
+
+// CHECK: vpandq %ymm24, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x01,0xad,0x20,0xdb,0xc8]
+          vpandq %ymm24, %ymm26, %ymm25
+
+// CHECK: vpandq %ymm24, %ymm26, %ymm25 {%k7}
+// CHECK:  encoding: [0x62,0x01,0xad,0x27,0xdb,0xc8]
+          vpandq %ymm24, %ymm26, %ymm25 {%k7}
+
+// CHECK: vpandq %ymm24, %ymm26, %ymm25 {%k7} {z}
+// CHECK:  encoding: [0x62,0x01,0xad,0xa7,0xdb,0xc8]
+          vpandq %ymm24, %ymm26, %ymm25 {%k7} {z}
+
+// CHECK: vpandq (%rcx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0xdb,0x09]
+          vpandq (%rcx), %ymm26, %ymm25
+
+// CHECK: vpandq 291(%rax,%r14,8), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x21,0xad,0x20,0xdb,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpandq 291(%rax,%r14,8), %ymm26, %ymm25
+
+// CHECK: vpandq (%rcx){1to4}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0xdb,0x09]
+          vpandq (%rcx){1to4}, %ymm26, %ymm25
+
+// CHECK: vpandq 4064(%rdx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0xdb,0x4a,0x7f]
+          vpandq 4064(%rdx), %ymm26, %ymm25
+
+// CHECK: vpandq 4096(%rdx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0xdb,0x8a,0x00,0x10,0x00,0x00]
+          vpandq 4096(%rdx), %ymm26, %ymm25
+
+// CHECK: vpandq -4096(%rdx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0xdb,0x4a,0x80]
+          vpandq -4096(%rdx), %ymm26, %ymm25
+
+// CHECK: vpandq -4128(%rdx), %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x20,0xdb,0x8a,0xe0,0xef,0xff,0xff]
+          vpandq -4128(%rdx), %ymm26, %ymm25
+
+// CHECK: vpandq 1016(%rdx){1to4}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0xdb,0x4a,0x7f]
+          vpandq 1016(%rdx){1to4}, %ymm26, %ymm25
+
+// CHECK: vpandq 1024(%rdx){1to4}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0xdb,0x8a,0x00,0x04,0x00,0x00]
+          vpandq 1024(%rdx){1to4}, %ymm26, %ymm25
+
+// CHECK: vpandq -1024(%rdx){1to4}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0xdb,0x4a,0x80]
+          vpandq -1024(%rdx){1to4}, %ymm26, %ymm25
+
+// CHECK: vpandq -1032(%rdx){1to4}, %ymm26, %ymm25
+// CHECK:  encoding: [0x62,0x61,0xad,0x30,0xdb,0x8a,0xf8,0xfb,0xff,0xff]
+          vpandq -1032(%rdx){1to4}, %ymm26, %ymm25
+
+// CHECK: vpcmpd $171, %xmm20, %xmm23, %k4
+// CHECK:  encoding: [0x62,0xb3,0x45,0x00,0x1f,0xe4,0xab]
+          vpcmpd $171, %xmm20, %xmm23, %k4
+
+// CHECK: vpcmpd $171, %xmm20, %xmm23, %k4 {%k1}
+// CHECK:  encoding: [0x62,0xb3,0x45,0x01,0x1f,0xe4,0xab]
+          vpcmpd $171, %xmm20, %xmm23, %k4 {%k1}
+
+// CHECK: vpcmpd $123, %xmm20, %xmm23, %k4
+// CHECK:  encoding: [0x62,0xb3,0x45,0x00,0x1f,0xe4,0x7b]
+          vpcmpd $123, %xmm20, %xmm23, %k4
+
+// CHECK: vpcmpd $123, (%rcx), %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x00,0x1f,0x21,0x7b]
+          vpcmpd $123, (%rcx), %xmm23, %k4
+
+// CHECK: vpcmpd $123, 291(%rax,%r14,8), %xmm23, %k4
+// CHECK:  encoding: [0x62,0xb3,0x45,0x00,0x1f,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpd $123, 291(%rax,%r14,8), %xmm23, %k4
+
+// CHECK: vpcmpd $123, (%rcx){1to4}, %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x10,0x1f,0x21,0x7b]
+          vpcmpd $123, (%rcx){1to4}, %xmm23, %k4
+
+// CHECK: vpcmpd $123, 2032(%rdx), %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x00,0x1f,0x62,0x7f,0x7b]
+          vpcmpd $123, 2032(%rdx), %xmm23, %k4
+
+// CHECK: vpcmpd $123, 2048(%rdx), %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x00,0x1f,0xa2,0x00,0x08,0x00,0x00,0x7b]
+          vpcmpd $123, 2048(%rdx), %xmm23, %k4
+
+// CHECK: vpcmpd $123, -2048(%rdx), %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x00,0x1f,0x62,0x80,0x7b]
+          vpcmpd $123, -2048(%rdx), %xmm23, %k4
+
+// CHECK: vpcmpd $123, -2064(%rdx), %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x00,0x1f,0xa2,0xf0,0xf7,0xff,0xff,0x7b]
+          vpcmpd $123, -2064(%rdx), %xmm23, %k4
+
+// CHECK: vpcmpd $123, 508(%rdx){1to4}, %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x10,0x1f,0x62,0x7f,0x7b]
+          vpcmpd $123, 508(%rdx){1to4}, %xmm23, %k4
+
+// CHECK: vpcmpd $123, 512(%rdx){1to4}, %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x10,0x1f,0xa2,0x00,0x02,0x00,0x00,0x7b]
+          vpcmpd $123, 512(%rdx){1to4}, %xmm23, %k4
+
+// CHECK: vpcmpd $123, -512(%rdx){1to4}, %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x10,0x1f,0x62,0x80,0x7b]
+          vpcmpd $123, -512(%rdx){1to4}, %xmm23, %k4
+
+// CHECK: vpcmpd $123, -516(%rdx){1to4}, %xmm23, %k4
+// CHECK:  encoding: [0x62,0xf3,0x45,0x10,0x1f,0xa2,0xfc,0xfd,0xff,0xff,0x7b]
+          vpcmpd $123, -516(%rdx){1to4}, %xmm23, %k4
+
+// CHECK: vpcmpd $171, %ymm19, %ymm24, %k4
+// CHECK:  encoding: [0x62,0xb3,0x3d,0x20,0x1f,0xe3,0xab]
+          vpcmpd $171, %ymm19, %ymm24, %k4
+
+// CHECK: vpcmpd $171, %ymm19, %ymm24, %k4 {%k3}
+// CHECK:  encoding: [0x62,0xb3,0x3d,0x23,0x1f,0xe3,0xab]
+          vpcmpd $171, %ymm19, %ymm24, %k4 {%k3}
+
+// CHECK: vpcmpd $123, %ymm19, %ymm24, %k4
+// CHECK:  encoding: [0x62,0xb3,0x3d,0x20,0x1f,0xe3,0x7b]
+          vpcmpd $123, %ymm19, %ymm24, %k4
+
+// CHECK: vpcmpd $123, (%rcx), %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x20,0x1f,0x21,0x7b]
+          vpcmpd $123, (%rcx), %ymm24, %k4
+
+// CHECK: vpcmpd $123, 291(%rax,%r14,8), %ymm24, %k4
+// CHECK:  encoding: [0x62,0xb3,0x3d,0x20,0x1f,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpd $123, 291(%rax,%r14,8), %ymm24, %k4
+
+// CHECK: vpcmpd $123, (%rcx){1to8}, %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x30,0x1f,0x21,0x7b]
+          vpcmpd $123, (%rcx){1to8}, %ymm24, %k4
+
+// CHECK: vpcmpd $123, 4064(%rdx), %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x20,0x1f,0x62,0x7f,0x7b]
+          vpcmpd $123, 4064(%rdx), %ymm24, %k4
+
+// CHECK: vpcmpd $123, 4096(%rdx), %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x20,0x1f,0xa2,0x00,0x10,0x00,0x00,0x7b]
+          vpcmpd $123, 4096(%rdx), %ymm24, %k4
+
+// CHECK: vpcmpd $123, -4096(%rdx), %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x20,0x1f,0x62,0x80,0x7b]
+          vpcmpd $123, -4096(%rdx), %ymm24, %k4
+
+// CHECK: vpcmpd $123, -4128(%rdx), %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x20,0x1f,0xa2,0xe0,0xef,0xff,0xff,0x7b]
+          vpcmpd $123, -4128(%rdx), %ymm24, %k4
+
+// CHECK: vpcmpd $123, 508(%rdx){1to8}, %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x30,0x1f,0x62,0x7f,0x7b]
+          vpcmpd $123, 508(%rdx){1to8}, %ymm24, %k4
+
+// CHECK: vpcmpd $123, 512(%rdx){1to8}, %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x30,0x1f,0xa2,0x00,0x02,0x00,0x00,0x7b]
+          vpcmpd $123, 512(%rdx){1to8}, %ymm24, %k4
+
+// CHECK: vpcmpd $123, -512(%rdx){1to8}, %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x30,0x1f,0x62,0x80,0x7b]
+          vpcmpd $123, -512(%rdx){1to8}, %ymm24, %k4
+
+// CHECK: vpcmpd $123, -516(%rdx){1to8}, %ymm24, %k4
+// CHECK:  encoding: [0x62,0xf3,0x3d,0x30,0x1f,0xa2,0xfc,0xfd,0xff,0xff,0x7b]
+          vpcmpd $123, -516(%rdx){1to8}, %ymm24, %k4
+
+// CHECK: vpcmpeqd %xmm24, %xmm29, %k3
+// CHECK:  encoding: [0x62,0x91,0x15,0x00,0x76,0xd8]
+          vpcmpeqd %xmm24, %xmm29, %k3
+
+// CHECK: vpcmpeqd %xmm24, %xmm29, %k3 {%k5}
+// CHECK:  encoding: [0x62,0x91,0x15,0x05,0x76,0xd8]
+          vpcmpeqd %xmm24, %xmm29, %k3 {%k5}
+
+// CHECK: vpcmpeqd (%rcx), %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x76,0x19]
+          vpcmpeqd (%rcx), %xmm29, %k3
+
+// CHECK: vpcmpeqd 291(%rax,%r14,8), %xmm29, %k3
+// CHECK:  encoding: [0x62,0xb1,0x15,0x00,0x76,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqd 291(%rax,%r14,8), %xmm29, %k3
+
+// CHECK: vpcmpeqd (%rcx){1to4}, %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x76,0x19]
+          vpcmpeqd (%rcx){1to4}, %xmm29, %k3
+
+// CHECK: vpcmpeqd 2032(%rdx), %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x76,0x5a,0x7f]
+          vpcmpeqd 2032(%rdx), %xmm29, %k3
+
+// CHECK: vpcmpeqd 2048(%rdx), %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x76,0x9a,0x00,0x08,0x00,0x00]
+          vpcmpeqd 2048(%rdx), %xmm29, %k3
+
+// CHECK: vpcmpeqd -2048(%rdx), %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x76,0x5a,0x80]
+          vpcmpeqd -2048(%rdx), %xmm29, %k3
+
+// CHECK: vpcmpeqd -2064(%rdx), %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x76,0x9a,0xf0,0xf7,0xff,0xff]
+          vpcmpeqd -2064(%rdx), %xmm29, %k3
+
+// CHECK: vpcmpeqd 508(%rdx){1to4}, %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x76,0x5a,0x7f]
+          vpcmpeqd 508(%rdx){1to4}, %xmm29, %k3
+
+// CHECK: vpcmpeqd 512(%rdx){1to4}, %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x76,0x9a,0x00,0x02,0x00,0x00]
+          vpcmpeqd 512(%rdx){1to4}, %xmm29, %k3
+
+// CHECK: vpcmpeqd -512(%rdx){1to4}, %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x76,0x5a,0x80]
+          vpcmpeqd -512(%rdx){1to4}, %xmm29, %k3
+
+// CHECK: vpcmpeqd -516(%rdx){1to4}, %xmm29, %k3
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x76,0x9a,0xfc,0xfd,0xff,0xff]
+          vpcmpeqd -516(%rdx){1to4}, %xmm29, %k3
+
+// CHECK: vpcmpeqd %ymm20, %ymm26, %k5
+// CHECK:  encoding: [0x62,0xb1,0x2d,0x20,0x76,0xec]
+          vpcmpeqd %ymm20, %ymm26, %k5
+
+// CHECK: vpcmpeqd %ymm20, %ymm26, %k5 {%k5}
+// CHECK:  encoding: [0x62,0xb1,0x2d,0x25,0x76,0xec]
+          vpcmpeqd %ymm20, %ymm26, %k5 {%k5}
+
+// CHECK: vpcmpeqd (%rcx), %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x20,0x76,0x29]
+          vpcmpeqd (%rcx), %ymm26, %k5
+
+// CHECK: vpcmpeqd 291(%rax,%r14,8), %ymm26, %k5
+// CHECK:  encoding: [0x62,0xb1,0x2d,0x20,0x76,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqd 291(%rax,%r14,8), %ymm26, %k5
+
+// CHECK: vpcmpeqd (%rcx){1to8}, %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x30,0x76,0x29]
+          vpcmpeqd (%rcx){1to8}, %ymm26, %k5
+
+// CHECK: vpcmpeqd 4064(%rdx), %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x20,0x76,0x6a,0x7f]
+          vpcmpeqd 4064(%rdx), %ymm26, %k5
+
+// CHECK: vpcmpeqd 4096(%rdx), %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x20,0x76,0xaa,0x00,0x10,0x00,0x00]
+          vpcmpeqd 4096(%rdx), %ymm26, %k5
+
+// CHECK: vpcmpeqd -4096(%rdx), %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x20,0x76,0x6a,0x80]
+          vpcmpeqd -4096(%rdx), %ymm26, %k5
+
+// CHECK: vpcmpeqd -4128(%rdx), %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x20,0x76,0xaa,0xe0,0xef,0xff,0xff]
+          vpcmpeqd -4128(%rdx), %ymm26, %k5
+
+// CHECK: vpcmpeqd 508(%rdx){1to8}, %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x30,0x76,0x6a,0x7f]
+          vpcmpeqd 508(%rdx){1to8}, %ymm26, %k5
+
+// CHECK: vpcmpeqd 512(%rdx){1to8}, %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x30,0x76,0xaa,0x00,0x02,0x00,0x00]
+          vpcmpeqd 512(%rdx){1to8}, %ymm26, %k5
+
+// CHECK: vpcmpeqd -512(%rdx){1to8}, %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x30,0x76,0x6a,0x80]
+          vpcmpeqd -512(%rdx){1to8}, %ymm26, %k5
+
+// CHECK: vpcmpeqd -516(%rdx){1to8}, %ymm26, %k5
+// CHECK:  encoding: [0x62,0xf1,0x2d,0x30,0x76,0xaa,0xfc,0xfd,0xff,0xff]
+          vpcmpeqd -516(%rdx){1to8}, %ymm26, %k5
+
+// CHECK: vpcmpeqq %xmm29, %xmm20, %k3
+// CHECK:  encoding: [0x62,0x92,0xdd,0x00,0x29,0xdd]
+          vpcmpeqq %xmm29, %xmm20, %k3
+
+// CHECK: vpcmpeqq %xmm29, %xmm20, %k3 {%k3}
+// CHECK:  encoding: [0x62,0x92,0xdd,0x03,0x29,0xdd]
+          vpcmpeqq %xmm29, %xmm20, %k3 {%k3}
+
+// CHECK: vpcmpeqq (%rcx), %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x00,0x29,0x19]
+          vpcmpeqq (%rcx), %xmm20, %k3
+
+// CHECK: vpcmpeqq 291(%rax,%r14,8), %xmm20, %k3
+// CHECK:  encoding: [0x62,0xb2,0xdd,0x00,0x29,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqq 291(%rax,%r14,8), %xmm20, %k3
+
+// CHECK: vpcmpeqq (%rcx){1to2}, %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x10,0x29,0x19]
+          vpcmpeqq (%rcx){1to2}, %xmm20, %k3
+
+// CHECK: vpcmpeqq 2032(%rdx), %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x00,0x29,0x5a,0x7f]
+          vpcmpeqq 2032(%rdx), %xmm20, %k3
+
+// CHECK: vpcmpeqq 2048(%rdx), %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x00,0x29,0x9a,0x00,0x08,0x00,0x00]
+          vpcmpeqq 2048(%rdx), %xmm20, %k3
+
+// CHECK: vpcmpeqq -2048(%rdx), %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x00,0x29,0x5a,0x80]
+          vpcmpeqq -2048(%rdx), %xmm20, %k3
+
+// CHECK: vpcmpeqq -2064(%rdx), %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x00,0x29,0x9a,0xf0,0xf7,0xff,0xff]
+          vpcmpeqq -2064(%rdx), %xmm20, %k3
+
+// CHECK: vpcmpeqq 1016(%rdx){1to2}, %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x10,0x29,0x5a,0x7f]
+          vpcmpeqq 1016(%rdx){1to2}, %xmm20, %k3
+
+// CHECK: vpcmpeqq 1024(%rdx){1to2}, %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x10,0x29,0x9a,0x00,0x04,0x00,0x00]
+          vpcmpeqq 1024(%rdx){1to2}, %xmm20, %k3
+
+// CHECK: vpcmpeqq -1024(%rdx){1to2}, %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x10,0x29,0x5a,0x80]
+          vpcmpeqq -1024(%rdx){1to2}, %xmm20, %k3
+
+// CHECK: vpcmpeqq -1032(%rdx){1to2}, %xmm20, %k3
+// CHECK:  encoding: [0x62,0xf2,0xdd,0x10,0x29,0x9a,0xf8,0xfb,0xff,0xff]
+          vpcmpeqq -1032(%rdx){1to2}, %xmm20, %k3
+
+// CHECK: vpcmpeqq %ymm23, %ymm30, %k5
+// CHECK:  encoding: [0x62,0xb2,0x8d,0x20,0x29,0xef]
+          vpcmpeqq %ymm23, %ymm30, %k5
+
+// CHECK: vpcmpeqq %ymm23, %ymm30, %k5 {%k6}
+// CHECK:  encoding: [0x62,0xb2,0x8d,0x26,0x29,0xef]
+          vpcmpeqq %ymm23, %ymm30, %k5 {%k6}
+
+// CHECK: vpcmpeqq (%rcx), %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x20,0x29,0x29]
+          vpcmpeqq (%rcx), %ymm30, %k5
+
+// CHECK: vpcmpeqq 291(%rax,%r14,8), %ymm30, %k5
+// CHECK:  encoding: [0x62,0xb2,0x8d,0x20,0x29,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpeqq 291(%rax,%r14,8), %ymm30, %k5
+
+// CHECK: vpcmpeqq (%rcx){1to4}, %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x30,0x29,0x29]
+          vpcmpeqq (%rcx){1to4}, %ymm30, %k5
+
+// CHECK: vpcmpeqq 4064(%rdx), %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x20,0x29,0x6a,0x7f]
+          vpcmpeqq 4064(%rdx), %ymm30, %k5
+
+// CHECK: vpcmpeqq 4096(%rdx), %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x20,0x29,0xaa,0x00,0x10,0x00,0x00]
+          vpcmpeqq 4096(%rdx), %ymm30, %k5
+
+// CHECK: vpcmpeqq -4096(%rdx), %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x20,0x29,0x6a,0x80]
+          vpcmpeqq -4096(%rdx), %ymm30, %k5
+
+// CHECK: vpcmpeqq -4128(%rdx), %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x20,0x29,0xaa,0xe0,0xef,0xff,0xff]
+          vpcmpeqq -4128(%rdx), %ymm30, %k5
+
+// CHECK: vpcmpeqq 1016(%rdx){1to4}, %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x30,0x29,0x6a,0x7f]
+          vpcmpeqq 1016(%rdx){1to4}, %ymm30, %k5
+
+// CHECK: vpcmpeqq 1024(%rdx){1to4}, %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x30,0x29,0xaa,0x00,0x04,0x00,0x00]
+          vpcmpeqq 1024(%rdx){1to4}, %ymm30, %k5
+
+// CHECK: vpcmpeqq -1024(%rdx){1to4}, %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x30,0x29,0x6a,0x80]
+          vpcmpeqq -1024(%rdx){1to4}, %ymm30, %k5
+
+// CHECK: vpcmpeqq -1032(%rdx){1to4}, %ymm30, %k5
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x30,0x29,0xaa,0xf8,0xfb,0xff,0xff]
+          vpcmpeqq -1032(%rdx){1to4}, %ymm30, %k5
+
+// CHECK: vpcmpgtd %xmm20, %xmm29, %k4
+// CHECK:  encoding: [0x62,0xb1,0x15,0x00,0x66,0xe4]
+          vpcmpgtd %xmm20, %xmm29, %k4
+
+// CHECK: vpcmpgtd %xmm20, %xmm29, %k4 {%k2}
+// CHECK:  encoding: [0x62,0xb1,0x15,0x02,0x66,0xe4]
+          vpcmpgtd %xmm20, %xmm29, %k4 {%k2}
+
+// CHECK: vpcmpgtd (%rcx), %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x66,0x21]
+          vpcmpgtd (%rcx), %xmm29, %k4
+
+// CHECK: vpcmpgtd 291(%rax,%r14,8), %xmm29, %k4
+// CHECK:  encoding: [0x62,0xb1,0x15,0x00,0x66,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtd 291(%rax,%r14,8), %xmm29, %k4
+
+// CHECK: vpcmpgtd (%rcx){1to4}, %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x66,0x21]
+          vpcmpgtd (%rcx){1to4}, %xmm29, %k4
+
+// CHECK: vpcmpgtd 2032(%rdx), %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x66,0x62,0x7f]
+          vpcmpgtd 2032(%rdx), %xmm29, %k4
+
+// CHECK: vpcmpgtd 2048(%rdx), %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x66,0xa2,0x00,0x08,0x00,0x00]
+          vpcmpgtd 2048(%rdx), %xmm29, %k4
+
+// CHECK: vpcmpgtd -2048(%rdx), %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x66,0x62,0x80]
+          vpcmpgtd -2048(%rdx), %xmm29, %k4
+
+// CHECK: vpcmpgtd -2064(%rdx), %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x00,0x66,0xa2,0xf0,0xf7,0xff,0xff]
+          vpcmpgtd -2064(%rdx), %xmm29, %k4
+
+// CHECK: vpcmpgtd 508(%rdx){1to4}, %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x66,0x62,0x7f]
+          vpcmpgtd 508(%rdx){1to4}, %xmm29, %k4
+
+// CHECK: vpcmpgtd 512(%rdx){1to4}, %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x66,0xa2,0x00,0x02,0x00,0x00]
+          vpcmpgtd 512(%rdx){1to4}, %xmm29, %k4
+
+// CHECK: vpcmpgtd -512(%rdx){1to4}, %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x66,0x62,0x80]
+          vpcmpgtd -512(%rdx){1to4}, %xmm29, %k4
+
+// CHECK: vpcmpgtd -516(%rdx){1to4}, %xmm29, %k4
+// CHECK:  encoding: [0x62,0xf1,0x15,0x10,0x66,0xa2,0xfc,0xfd,0xff,0xff]
+          vpcmpgtd -516(%rdx){1to4}, %xmm29, %k4
+
+// CHECK: vpcmpgtd %ymm17, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xb1,0x4d,0x20,0x66,0xd1]
+          vpcmpgtd %ymm17, %ymm22, %k2
+
+// CHECK: vpcmpgtd %ymm17, %ymm22, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xb1,0x4d,0x21,0x66,0xd1]
+          vpcmpgtd %ymm17, %ymm22, %k2 {%k1}
+
+// CHECK: vpcmpgtd (%rcx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x20,0x66,0x11]
+          vpcmpgtd (%rcx), %ymm22, %k2
+
+// CHECK: vpcmpgtd 291(%rax,%r14,8), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xb1,0x4d,0x20,0x66,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtd 291(%rax,%r14,8), %ymm22, %k2
+
+// CHECK: vpcmpgtd (%rcx){1to8}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x30,0x66,0x11]
+          vpcmpgtd (%rcx){1to8}, %ymm22, %k2
+
+// CHECK: vpcmpgtd 4064(%rdx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x20,0x66,0x52,0x7f]
+          vpcmpgtd 4064(%rdx), %ymm22, %k2
+
+// CHECK: vpcmpgtd 4096(%rdx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x20,0x66,0x92,0x00,0x10,0x00,0x00]
+          vpcmpgtd 4096(%rdx), %ymm22, %k2
+
+// CHECK: vpcmpgtd -4096(%rdx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x20,0x66,0x52,0x80]
+          vpcmpgtd -4096(%rdx), %ymm22, %k2
+
+// CHECK: vpcmpgtd -4128(%rdx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x20,0x66,0x92,0xe0,0xef,0xff,0xff]
+          vpcmpgtd -4128(%rdx), %ymm22, %k2
+
+// CHECK: vpcmpgtd 508(%rdx){1to8}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x30,0x66,0x52,0x7f]
+          vpcmpgtd 508(%rdx){1to8}, %ymm22, %k2
+
+// CHECK: vpcmpgtd 512(%rdx){1to8}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x30,0x66,0x92,0x00,0x02,0x00,0x00]
+          vpcmpgtd 512(%rdx){1to8}, %ymm22, %k2
+
+// CHECK: vpcmpgtd -512(%rdx){1to8}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x30,0x66,0x52,0x80]
+          vpcmpgtd -512(%rdx){1to8}, %ymm22, %k2
+
+// CHECK: vpcmpgtd -516(%rdx){1to8}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf1,0x4d,0x30,0x66,0x92,0xfc,0xfd,0xff,0xff]
+          vpcmpgtd -516(%rdx){1to8}, %ymm22, %k2
+
+// CHECK: vpcmpgtq %xmm25, %xmm30, %k3
+// CHECK:  encoding: [0x62,0x92,0x8d,0x00,0x37,0xd9]
+          vpcmpgtq %xmm25, %xmm30, %k3
+
+// CHECK: vpcmpgtq %xmm25, %xmm30, %k3 {%k6}
+// CHECK:  encoding: [0x62,0x92,0x8d,0x06,0x37,0xd9]
+          vpcmpgtq %xmm25, %xmm30, %k3 {%k6}
+
+// CHECK: vpcmpgtq (%rcx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x00,0x37,0x19]
+          vpcmpgtq (%rcx), %xmm30, %k3
+
+// CHECK: vpcmpgtq 291(%rax,%r14,8), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xb2,0x8d,0x00,0x37,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtq 291(%rax,%r14,8), %xmm30, %k3
+
+// CHECK: vpcmpgtq (%rcx){1to2}, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x10,0x37,0x19]
+          vpcmpgtq (%rcx){1to2}, %xmm30, %k3
+
+// CHECK: vpcmpgtq 2032(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x00,0x37,0x5a,0x7f]
+          vpcmpgtq 2032(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpgtq 2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x00,0x37,0x9a,0x00,0x08,0x00,0x00]
+          vpcmpgtq 2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpgtq -2048(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x00,0x37,0x5a,0x80]
+          vpcmpgtq -2048(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpgtq -2064(%rdx), %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x00,0x37,0x9a,0xf0,0xf7,0xff,0xff]
+          vpcmpgtq -2064(%rdx), %xmm30, %k3
+
+// CHECK: vpcmpgtq 1016(%rdx){1to2}, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x10,0x37,0x5a,0x7f]
+          vpcmpgtq 1016(%rdx){1to2}, %xmm30, %k3
+
+// CHECK: vpcmpgtq 1024(%rdx){1to2}, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x10,0x37,0x9a,0x00,0x04,0x00,0x00]
+          vpcmpgtq 1024(%rdx){1to2}, %xmm30, %k3
+
+// CHECK: vpcmpgtq -1024(%rdx){1to2}, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x10,0x37,0x5a,0x80]
+          vpcmpgtq -1024(%rdx){1to2}, %xmm30, %k3
+
+// CHECK: vpcmpgtq -1032(%rdx){1to2}, %xmm30, %k3
+// CHECK:  encoding: [0x62,0xf2,0x8d,0x10,0x37,0x9a,0xf8,0xfb,0xff,0xff]
+          vpcmpgtq -1032(%rdx){1to2}, %xmm30, %k3
+
+// CHECK: vpcmpgtq %ymm20, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xb2,0xcd,0x20,0x37,0xd4]
+          vpcmpgtq %ymm20, %ymm22, %k2
+
+// CHECK: vpcmpgtq %ymm20, %ymm22, %k2 {%k1}
+// CHECK:  encoding: [0x62,0xb2,0xcd,0x21,0x37,0xd4]
+          vpcmpgtq %ymm20, %ymm22, %k2 {%k1}
+
+// CHECK: vpcmpgtq (%rcx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x20,0x37,0x11]
+          vpcmpgtq (%rcx), %ymm22, %k2
+
+// CHECK: vpcmpgtq 291(%rax,%r14,8), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xb2,0xcd,0x20,0x37,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpcmpgtq 291(%rax,%r14,8), %ymm22, %k2
+
+// CHECK: vpcmpgtq (%rcx){1to4}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x30,0x37,0x11]
+          vpcmpgtq (%rcx){1to4}, %ymm22, %k2
+
+// CHECK: vpcmpgtq 4064(%rdx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x20,0x37,0x52,0x7f]
+          vpcmpgtq 4064(%rdx), %ymm22, %k2
+
+// CHECK: vpcmpgtq 4096(%rdx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x20,0x37,0x92,0x00,0x10,0x00,0x00]
+          vpcmpgtq 4096(%rdx), %ymm22, %k2
+
+// CHECK: vpcmpgtq -4096(%rdx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x20,0x37,0x52,0x80]
+          vpcmpgtq -4096(%rdx), %ymm22, %k2
+
+// CHECK: vpcmpgtq -4128(%rdx), %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x20,0x37,0x92,0xe0,0xef,0xff,0xff]
+          vpcmpgtq -4128(%rdx), %ymm22, %k2
+
+// CHECK: vpcmpgtq 1016(%rdx){1to4}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x30,0x37,0x52,0x7f]
+          vpcmpgtq 1016(%rdx){1to4}, %ymm22, %k2
+
+// CHECK: vpcmpgtq 1024(%rdx){1to4}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x30,0x37,0x92,0x00,0x04,0x00,0x00]
+          vpcmpgtq 1024(%rdx){1to4}, %ymm22, %k2
+
+// CHECK: vpcmpgtq -1024(%rdx){1to4}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x30,0x37,0x52,0x80]
+          vpcmpgtq -1024(%rdx){1to4}, %ymm22, %k2
+
+// CHECK: vpcmpgtq -1032(%rdx){1to4}, %ymm22, %k2
+// CHECK:  encoding: [0x62,0xf2,0xcd,0x30,0x37,0x92,0xf8,0xfb,0xff,0xff]
+          vpcmpgtq -1032(%rdx){1to4}, %ymm22, %k2
+
+// CHECK: vpcmpq $171, %xmm24, %xmm27, %k5
+// CHECK:  encoding: [0x62,0x93,0xa5,0x00,0x1f,0xe8,0xab]
+          vpcmpq $171, %xmm24, %xmm27, %k5
+
+// CHECK: vpcmpq $171, %xmm24, %xmm27, %k5 {%k7}
+// CHECK:  encoding: [0x62,0x93,0xa5,0x07,0x1f,0xe8,0xab]
+          vpcmpq $171, %xmm24, %xmm27, %k5 {%k7}
+
+// CHECK: vpcmpq $123, %xmm24, %xmm27, %k5
+// CHECK:  encoding: [0x62,0x93,0xa5,0x00,0x1f,0xe8,0x7b]
+          vpcmpq $123, %xmm24, %xmm27, %k5
+
+// CHECK: vpcmpq $123, (%rcx), %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x00,0x1f,0x29,0x7b]
+          vpcmpq $123, (%rcx), %xmm27, %k5
+
+// CHECK: vpcmpq $123, 291(%rax,%r14,8), %xmm27, %k5
+// CHECK:  encoding: [0x62,0xb3,0xa5,0x00,0x1f,0xac,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpq $123, 291(%rax,%r14,8), %xmm27, %k5
+
+// CHECK: vpcmpq $123, (%rcx){1to2}, %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x10,0x1f,0x29,0x7b]
+          vpcmpq $123, (%rcx){1to2}, %xmm27, %k5
+
+// CHECK: vpcmpq $123, 2032(%rdx), %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x00,0x1f,0x6a,0x7f,0x7b]
+          vpcmpq $123, 2032(%rdx), %xmm27, %k5
+
+// CHECK: vpcmpq $123, 2048(%rdx), %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x00,0x1f,0xaa,0x00,0x08,0x00,0x00,0x7b]
+          vpcmpq $123, 2048(%rdx), %xmm27, %k5
+
+// CHECK: vpcmpq $123, -2048(%rdx), %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x00,0x1f,0x6a,0x80,0x7b]
+          vpcmpq $123, -2048(%rdx), %xmm27, %k5
+
+// CHECK: vpcmpq $123, -2064(%rdx), %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x00,0x1f,0xaa,0xf0,0xf7,0xff,0xff,0x7b]
+          vpcmpq $123, -2064(%rdx), %xmm27, %k5
+
+// CHECK: vpcmpq $123, 1016(%rdx){1to2}, %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x10,0x1f,0x6a,0x7f,0x7b]
+          vpcmpq $123, 1016(%rdx){1to2}, %xmm27, %k5
+
+// CHECK: vpcmpq $123, 1024(%rdx){1to2}, %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x10,0x1f,0xaa,0x00,0x04,0x00,0x00,0x7b]
+          vpcmpq $123, 1024(%rdx){1to2}, %xmm27, %k5
+
+// CHECK: vpcmpq $123, -1024(%rdx){1to2}, %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x10,0x1f,0x6a,0x80,0x7b]
+          vpcmpq $123, -1024(%rdx){1to2}, %xmm27, %k5
+
+// CHECK: vpcmpq $123, -1032(%rdx){1to2}, %xmm27, %k5
+// CHECK:  encoding: [0x62,0xf3,0xa5,0x10,0x1f,0xaa,0xf8,0xfb,0xff,0xff,0x7b]
+          vpcmpq $123, -1032(%rdx){1to2}, %xmm27, %k5
+
+// CHECK: vpcmpq $171, %ymm19, %ymm26, %k4
+// CHECK:  encoding: [0x62,0xb3,0xad,0x20,0x1f,0xe3,0xab]
+          vpcmpq $171, %ymm19, %ymm26, %k4
+
+// CHECK: vpcmpq $171, %ymm19, %ymm26, %k4 {%k6}
+// CHECK:  encoding: [0x62,0xb3,0xad,0x26,0x1f,0xe3,0xab]
+          vpcmpq $171, %ymm19, %ymm26, %k4 {%k6}
+
+// CHECK: vpcmpq $123, %ymm19, %ymm26, %k4
+// CHECK:  encoding: [0x62,0xb3,0xad,0x20,0x1f,0xe3,0x7b]
+          vpcmpq $123, %ymm19, %ymm26, %k4
+
+// CHECK: vpcmpq $123, (%rcx), %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x1f,0x21,0x7b]
+          vpcmpq $123, (%rcx), %ymm26, %k4
+
+// CHECK: vpcmpq $123, 291(%rax,%r14,8), %ymm26, %k4
+// CHECK:  encoding: [0x62,0xb3,0xad,0x20,0x1f,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpq $123, 291(%rax,%r14,8), %ymm26, %k4
+
+// CHECK: vpcmpq $123, (%rcx){1to4}, %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x30,0x1f,0x21,0x7b]
+          vpcmpq $123, (%rcx){1to4}, %ymm26, %k4
+
+// CHECK: vpcmpq $123, 4064(%rdx), %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x1f,0x62,0x7f,0x7b]
+          vpcmpq $123, 4064(%rdx), %ymm26, %k4
+
+// CHECK: vpcmpq $123, 4096(%rdx), %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x1f,0xa2,0x00,0x10,0x00,0x00,0x7b]
+          vpcmpq $123, 4096(%rdx), %ymm26, %k4
+
+// CHECK: vpcmpq $123, -4096(%rdx), %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x1f,0x62,0x80,0x7b]
+          vpcmpq $123, -4096(%rdx), %ymm26, %k4
+
+// CHECK: vpcmpq $123, -4128(%rdx), %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x20,0x1f,0xa2,0xe0,0xef,0xff,0xff,0x7b]
+          vpcmpq $123, -4128(%rdx), %ymm26, %k4
+
+// CHECK: vpcmpq $123, 1016(%rdx){1to4}, %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x30,0x1f,0x62,0x7f,0x7b]
+          vpcmpq $123, 1016(%rdx){1to4}, %ymm26, %k4
+
+// CHECK: vpcmpq $123, 1024(%rdx){1to4}, %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x30,0x1f,0xa2,0x00,0x04,0x00,0x00,0x7b]
+          vpcmpq $123, 1024(%rdx){1to4}, %ymm26, %k4
+
+// CHECK: vpcmpq $123, -1024(%rdx){1to4}, %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x30,0x1f,0x62,0x80,0x7b]
+          vpcmpq $123, -1024(%rdx){1to4}, %ymm26, %k4
+
+// CHECK: vpcmpq $123, -1032(%rdx){1to4}, %ymm26, %k4
+// CHECK:  encoding: [0x62,0xf3,0xad,0x30,0x1f,0xa2,0xf8,0xfb,0xff,0xff,0x7b]
+          vpcmpq $123, -1032(%rdx){1to4}, %ymm26, %k4
+
+// CHECK: vpcmpud $171, %xmm21, %xmm22, %k3
+// CHECK:  encoding: [0x62,0xb3,0x4d,0x00,0x1e,0xdd,0xab]
+          vpcmpud $171, %xmm21, %xmm22, %k3
+
+// CHECK: vpcmpud $171, %xmm21, %xmm22, %k3 {%k1}
+// CHECK:  encoding: [0x62,0xb3,0x4d,0x01,0x1e,0xdd,0xab]
+          vpcmpud $171, %xmm21, %xmm22, %k3 {%k1}
+
+// CHECK: vpcmpud $123, %xmm21, %xmm22, %k3
+// CHECK:  encoding: [0x62,0xb3,0x4d,0x00,0x1e,0xdd,0x7b]
+          vpcmpud $123, %xmm21, %xmm22, %k3
+
+// CHECK: vpcmpud $123, (%rcx), %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x1e,0x19,0x7b]
+          vpcmpud $123, (%rcx), %xmm22, %k3
+
+// CHECK: vpcmpud $123, 291(%rax,%r14,8), %xmm22, %k3
+// CHECK:  encoding: [0x62,0xb3,0x4d,0x00,0x1e,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpud $123, 291(%rax,%r14,8), %xmm22, %k3
+
+// CHECK: vpcmpud $123, (%rcx){1to4}, %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x10,0x1e,0x19,0x7b]
+          vpcmpud $123, (%rcx){1to4}, %xmm22, %k3
+
+// CHECK: vpcmpud $123, 2032(%rdx), %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x1e,0x5a,0x7f,0x7b]
+          vpcmpud $123, 2032(%rdx), %xmm22, %k3
+
+// CHECK: vpcmpud $123, 2048(%rdx), %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x1e,0x9a,0x00,0x08,0x00,0x00,0x7b]
+          vpcmpud $123, 2048(%rdx), %xmm22, %k3
+
+// CHECK: vpcmpud $123, -2048(%rdx), %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x1e,0x5a,0x80,0x7b]
+          vpcmpud $123, -2048(%rdx), %xmm22, %k3
+
+// CHECK: vpcmpud $123, -2064(%rdx), %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x00,0x1e,0x9a,0xf0,0xf7,0xff,0xff,0x7b]
+          vpcmpud $123, -2064(%rdx), %xmm22, %k3
+
+// CHECK: vpcmpud $123, 508(%rdx){1to4}, %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x10,0x1e,0x5a,0x7f,0x7b]
+          vpcmpud $123, 508(%rdx){1to4}, %xmm22, %k3
+
+// CHECK: vpcmpud $123, 512(%rdx){1to4}, %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x10,0x1e,0x9a,0x00,0x02,0x00,0x00,0x7b]
+          vpcmpud $123, 512(%rdx){1to4}, %xmm22, %k3
+
+// CHECK: vpcmpud $123, -512(%rdx){1to4}, %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x10,0x1e,0x5a,0x80,0x7b]
+          vpcmpud $123, -512(%rdx){1to4}, %xmm22, %k3
+
+// CHECK: vpcmpud $123, -516(%rdx){1to4}, %xmm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0x4d,0x10,0x1e,0x9a,0xfc,0xfd,0xff,0xff,0x7b]
+          vpcmpud $123, -516(%rdx){1to4}, %xmm22, %k3
+
+// CHECK: vpcmpud $171, %ymm20, %ymm30, %k3
+// CHECK:  encoding: [0x62,0xb3,0x0d,0x20,0x1e,0xdc,0xab]
+          vpcmpud $171, %ymm20, %ymm30, %k3
+
+// CHECK: vpcmpud $171, %ymm20, %ymm30, %k3 {%k6}
+// CHECK:  encoding: [0x62,0xb3,0x0d,0x26,0x1e,0xdc,0xab]
+          vpcmpud $171, %ymm20, %ymm30, %k3 {%k6}
+
+// CHECK: vpcmpud $123, %ymm20, %ymm30, %k3
+// CHECK:  encoding: [0x62,0xb3,0x0d,0x20,0x1e,0xdc,0x7b]
+          vpcmpud $123, %ymm20, %ymm30, %k3
+
+// CHECK: vpcmpud $123, (%rcx), %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x20,0x1e,0x19,0x7b]
+          vpcmpud $123, (%rcx), %ymm30, %k3
+
+// CHECK: vpcmpud $123, 291(%rax,%r14,8), %ymm30, %k3
+// CHECK:  encoding: [0x62,0xb3,0x0d,0x20,0x1e,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpud $123, 291(%rax,%r14,8), %ymm30, %k3
+
+// CHECK: vpcmpud $123, (%rcx){1to8}, %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x30,0x1e,0x19,0x7b]
+          vpcmpud $123, (%rcx){1to8}, %ymm30, %k3
+
+// CHECK: vpcmpud $123, 4064(%rdx), %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x20,0x1e,0x5a,0x7f,0x7b]
+          vpcmpud $123, 4064(%rdx), %ymm30, %k3
+
+// CHECK: vpcmpud $123, 4096(%rdx), %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x20,0x1e,0x9a,0x00,0x10,0x00,0x00,0x7b]
+          vpcmpud $123, 4096(%rdx), %ymm30, %k3
+
+// CHECK: vpcmpud $123, -4096(%rdx), %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x20,0x1e,0x5a,0x80,0x7b]
+          vpcmpud $123, -4096(%rdx), %ymm30, %k3
+
+// CHECK: vpcmpud $123, -4128(%rdx), %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x20,0x1e,0x9a,0xe0,0xef,0xff,0xff,0x7b]
+          vpcmpud $123, -4128(%rdx), %ymm30, %k3
+
+// CHECK: vpcmpud $123, 508(%rdx){1to8}, %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x30,0x1e,0x5a,0x7f,0x7b]
+          vpcmpud $123, 508(%rdx){1to8}, %ymm30, %k3
+
+// CHECK: vpcmpud $123, 512(%rdx){1to8}, %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x30,0x1e,0x9a,0x00,0x02,0x00,0x00,0x7b]
+          vpcmpud $123, 512(%rdx){1to8}, %ymm30, %k3
+
+// CHECK: vpcmpud $123, -512(%rdx){1to8}, %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x30,0x1e,0x5a,0x80,0x7b]
+          vpcmpud $123, -512(%rdx){1to8}, %ymm30, %k3
+
+// CHECK: vpcmpud $123, -516(%rdx){1to8}, %ymm30, %k3
+// CHECK:  encoding: [0x62,0xf3,0x0d,0x30,0x1e,0x9a,0xfc,0xfd,0xff,0xff,0x7b]
+          vpcmpud $123, -516(%rdx){1to8}, %ymm30, %k3
+
+// CHECK: vpcmpuq $171, %xmm28, %xmm28, %k5
+// CHECK:  encoding: [0x62,0x93,0x9d,0x00,0x1e,0xec,0xab]
+          vpcmpuq $171, %xmm28, %xmm28, %k5
+
+// CHECK: vpcmpuq $171, %xmm28, %xmm28, %k5 {%k4}
+// CHECK:  encoding: [0x62,0x93,0x9d,0x04,0x1e,0xec,0xab]
+          vpcmpuq $171, %xmm28, %xmm28, %k5 {%k4}
+
+// CHECK: vpcmpuq $123, %xmm28, %xmm28, %k5
+// CHECK:  encoding: [0x62,0x93,0x9d,0x00,0x1e,0xec,0x7b]
+          vpcmpuq $123, %xmm28, %xmm28, %k5
+
+// CHECK: vpcmpuq $123, (%rcx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x1e,0x29,0x7b]
+          vpcmpuq $123, (%rcx), %xmm28, %k5
+
+// CHECK: vpcmpuq $123, 291(%rax,%r14,8), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xb3,0x9d,0x00,0x1e,0xac,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpuq $123, 291(%rax,%r14,8), %xmm28, %k5
+
+// CHECK: vpcmpuq $123, (%rcx){1to2}, %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x10,0x1e,0x29,0x7b]
+          vpcmpuq $123, (%rcx){1to2}, %xmm28, %k5
+
+// CHECK: vpcmpuq $123, 2032(%rdx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x1e,0x6a,0x7f,0x7b]
+          vpcmpuq $123, 2032(%rdx), %xmm28, %k5
+
+// CHECK: vpcmpuq $123, 2048(%rdx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x1e,0xaa,0x00,0x08,0x00,0x00,0x7b]
+          vpcmpuq $123, 2048(%rdx), %xmm28, %k5
+
+// CHECK: vpcmpuq $123, -2048(%rdx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x1e,0x6a,0x80,0x7b]
+          vpcmpuq $123, -2048(%rdx), %xmm28, %k5
+
+// CHECK: vpcmpuq $123, -2064(%rdx), %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x00,0x1e,0xaa,0xf0,0xf7,0xff,0xff,0x7b]
+          vpcmpuq $123, -2064(%rdx), %xmm28, %k5
+
+// CHECK: vpcmpuq $123, 1016(%rdx){1to2}, %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x10,0x1e,0x6a,0x7f,0x7b]
+          vpcmpuq $123, 1016(%rdx){1to2}, %xmm28, %k5
+
+// CHECK: vpcmpuq $123, 1024(%rdx){1to2}, %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x10,0x1e,0xaa,0x00,0x04,0x00,0x00,0x7b]
+          vpcmpuq $123, 1024(%rdx){1to2}, %xmm28, %k5
+
+// CHECK: vpcmpuq $123, -1024(%rdx){1to2}, %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x10,0x1e,0x6a,0x80,0x7b]
+          vpcmpuq $123, -1024(%rdx){1to2}, %xmm28, %k5
+
+// CHECK: vpcmpuq $123, -1032(%rdx){1to2}, %xmm28, %k5
+// CHECK:  encoding: [0x62,0xf3,0x9d,0x10,0x1e,0xaa,0xf8,0xfb,0xff,0xff,0x7b]
+          vpcmpuq $123, -1032(%rdx){1to2}, %xmm28, %k5
+
+// CHECK: vpcmpuq $171, %ymm26, %ymm22, %k3
+// CHECK:  encoding: [0x62,0x93,0xcd,0x20,0x1e,0xda,0xab]
+          vpcmpuq $171, %ymm26, %ymm22, %k3
+
+// CHECK: vpcmpuq $171, %ymm26, %ymm22, %k3 {%k3}
+// CHECK:  encoding: [0x62,0x93,0xcd,0x23,0x1e,0xda,0xab]
+          vpcmpuq $171, %ymm26, %ymm22, %k3 {%k3}
+
+// CHECK: vpcmpuq $123, %ymm26, %ymm22, %k3
+// CHECK:  encoding: [0x62,0x93,0xcd,0x20,0x1e,0xda,0x7b]
+          vpcmpuq $123, %ymm26, %ymm22, %k3
+
+// CHECK: vpcmpuq $123, (%rcx), %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x20,0x1e,0x19,0x7b]
+          vpcmpuq $123, (%rcx), %ymm22, %k3
+
+// CHECK: vpcmpuq $123, 291(%rax,%r14,8), %ymm22, %k3
+// CHECK:  encoding: [0x62,0xb3,0xcd,0x20,0x1e,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+          vpcmpuq $123, 291(%rax,%r14,8), %ymm22, %k3
+
+// CHECK: vpcmpuq $123, (%rcx){1to4}, %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x30,0x1e,0x19,0x7b]
+          vpcmpuq $123, (%rcx){1to4}, %ymm22, %k3
+
+// CHECK: vpcmpuq $123, 4064(%rdx), %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x20,0x1e,0x5a,0x7f,0x7b]
+          vpcmpuq $123, 4064(%rdx), %ymm22, %k3
+
+// CHECK: vpcmpuq $123, 4096(%rdx), %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x20,0x1e,0x9a,0x00,0x10,0x00,0x00,0x7b]
+          vpcmpuq $123, 4096(%rdx), %ymm22, %k3
+
+// CHECK: vpcmpuq $123, -4096(%rdx), %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x20,0x1e,0x5a,0x80,0x7b]
+          vpcmpuq $123, -4096(%rdx), %ymm22, %k3
+
+// CHECK: vpcmpuq $123, -4128(%rdx), %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x20,0x1e,0x9a,0xe0,0xef,0xff,0xff,0x7b]
+          vpcmpuq $123, -4128(%rdx), %ymm22, %k3
+
+// CHECK: vpcmpuq $123, 1016(%rdx){1to4}, %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x30,0x1e,0x5a,0x7f,0x7b]
+          vpcmpuq $123, 1016(%rdx){1to4}, %ymm22, %k3
+
+// CHECK: vpcmpuq $123, 1024(%rdx){1to4}, %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x30,0x1e,0x9a,0x00,0x04,0x00,0x00,0x7b]
+          vpcmpuq $123, 1024(%rdx){1to4}, %ymm22, %k3
+
+// CHECK: vpcmpuq $123, -1024(%rdx){1to4}, %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x30,0x1e,0x5a,0x80,0x7b]
+          vpcmpuq $123, -1024(%rdx){1to4}, %ymm22, %k3
+
+// CHECK: vpcmpuq $123, -1032(%rdx){1to4}, %ymm22, %k3
+// CHECK:  encoding: [0x62,0xf3,0xcd,0x30,0x1e,0x9a,0xf8,0xfb,0xff,0xff,0x7b]
+          vpcmpuq $123, -1032(%rdx){1to4}, %ymm22, %k3
+
+// CHECK: vpmaxsd %xmm26, %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0x82,0x4d,0x00,0x3d,0xda]
+          vpmaxsd %xmm26, %xmm22, %xmm19
+
+// CHECK: vpmaxsd %xmm26, %xmm22, %xmm19 {%k2}
+// CHECK:  encoding: [0x62,0x82,0x4d,0x02,0x3d,0xda]
+          vpmaxsd %xmm26, %xmm22, %xmm19 {%k2}
+
+// CHECK: vpmaxsd %xmm26, %xmm22, %xmm19 {%k2} {z}
+// CHECK:  encoding: [0x62,0x82,0x4d,0x82,0x3d,0xda]
+          vpmaxsd %xmm26, %xmm22, %xmm19 {%k2} {z}
+
+// CHECK: vpmaxsd (%rcx), %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x00,0x3d,0x19]
+          vpmaxsd (%rcx), %xmm22, %xmm19
+
+// CHECK: vpmaxsd 291(%rax,%r14,8), %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xa2,0x4d,0x00,0x3d,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsd 291(%rax,%r14,8), %xmm22, %xmm19
+
+// CHECK: vpmaxsd (%rcx){1to4}, %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x10,0x3d,0x19]
+          vpmaxsd (%rcx){1to4}, %xmm22, %xmm19
+
+// CHECK: vpmaxsd 2032(%rdx), %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x00,0x3d,0x5a,0x7f]
+          vpmaxsd 2032(%rdx), %xmm22, %xmm19
+
+// CHECK: vpmaxsd 2048(%rdx), %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x00,0x3d,0x9a,0x00,0x08,0x00,0x00]
+          vpmaxsd 2048(%rdx), %xmm22, %xmm19
+
+// CHECK: vpmaxsd -2048(%rdx), %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x00,0x3d,0x5a,0x80]
+          vpmaxsd -2048(%rdx), %xmm22, %xmm19
+
+// CHECK: vpmaxsd -2064(%rdx), %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x00,0x3d,0x9a,0xf0,0xf7,0xff,0xff]
+          vpmaxsd -2064(%rdx), %xmm22, %xmm19
+
+// CHECK: vpmaxsd 508(%rdx){1to4}, %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x10,0x3d,0x5a,0x7f]
+          vpmaxsd 508(%rdx){1to4}, %xmm22, %xmm19
+
+// CHECK: vpmaxsd 512(%rdx){1to4}, %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x10,0x3d,0x9a,0x00,0x02,0x00,0x00]
+          vpmaxsd 512(%rdx){1to4}, %xmm22, %xmm19
+
+// CHECK: vpmaxsd -512(%rdx){1to4}, %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x10,0x3d,0x5a,0x80]
+          vpmaxsd -512(%rdx){1to4}, %xmm22, %xmm19
+
+// CHECK: vpmaxsd -516(%rdx){1to4}, %xmm22, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x10,0x3d,0x9a,0xfc,0xfd,0xff,0xff]
+          vpmaxsd -516(%rdx){1to4}, %xmm22, %xmm19
+
+// CHECK: vpmaxsd %ymm23, %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x22,0x3d,0x20,0x3d,0xdf]
+          vpmaxsd %ymm23, %ymm24, %ymm27
+
+// CHECK: vpmaxsd %ymm23, %ymm24, %ymm27 {%k6}
+// CHECK:  encoding: [0x62,0x22,0x3d,0x26,0x3d,0xdf]
+          vpmaxsd %ymm23, %ymm24, %ymm27 {%k6}
+
+// CHECK: vpmaxsd %ymm23, %ymm24, %ymm27 {%k6} {z}
+// CHECK:  encoding: [0x62,0x22,0x3d,0xa6,0x3d,0xdf]
+          vpmaxsd %ymm23, %ymm24, %ymm27 {%k6} {z}
+
+// CHECK: vpmaxsd (%rcx), %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x20,0x3d,0x19]
+          vpmaxsd (%rcx), %ymm24, %ymm27
+
+// CHECK: vpmaxsd 291(%rax,%r14,8), %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x22,0x3d,0x20,0x3d,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsd 291(%rax,%r14,8), %ymm24, %ymm27
+
+// CHECK: vpmaxsd (%rcx){1to8}, %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x30,0x3d,0x19]
+          vpmaxsd (%rcx){1to8}, %ymm24, %ymm27
+
+// CHECK: vpmaxsd 4064(%rdx), %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x20,0x3d,0x5a,0x7f]
+          vpmaxsd 4064(%rdx), %ymm24, %ymm27
+
+// CHECK: vpmaxsd 4096(%rdx), %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x20,0x3d,0x9a,0x00,0x10,0x00,0x00]
+          vpmaxsd 4096(%rdx), %ymm24, %ymm27
+
+// CHECK: vpmaxsd -4096(%rdx), %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x20,0x3d,0x5a,0x80]
+          vpmaxsd -4096(%rdx), %ymm24, %ymm27
+
+// CHECK: vpmaxsd -4128(%rdx), %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x20,0x3d,0x9a,0xe0,0xef,0xff,0xff]
+          vpmaxsd -4128(%rdx), %ymm24, %ymm27
+
+// CHECK: vpmaxsd 508(%rdx){1to8}, %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x30,0x3d,0x5a,0x7f]
+          vpmaxsd 508(%rdx){1to8}, %ymm24, %ymm27
+
+// CHECK: vpmaxsd 512(%rdx){1to8}, %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x30,0x3d,0x9a,0x00,0x02,0x00,0x00]
+          vpmaxsd 512(%rdx){1to8}, %ymm24, %ymm27
+
+// CHECK: vpmaxsd -512(%rdx){1to8}, %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x30,0x3d,0x5a,0x80]
+          vpmaxsd -512(%rdx){1to8}, %ymm24, %ymm27
+
+// CHECK: vpmaxsd -516(%rdx){1to8}, %ymm24, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x3d,0x30,0x3d,0x9a,0xfc,0xfd,0xff,0xff]
+          vpmaxsd -516(%rdx){1to8}, %ymm24, %ymm27
+
+// CHECK: vpmaxsq %xmm25, %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0x82,0xc5,0x00,0x3d,0xf9]
+          vpmaxsq %xmm25, %xmm23, %xmm23
+
+// CHECK: vpmaxsq %xmm25, %xmm23, %xmm23 {%k2}
+// CHECK:  encoding: [0x62,0x82,0xc5,0x02,0x3d,0xf9]
+          vpmaxsq %xmm25, %xmm23, %xmm23 {%k2}
+
+// CHECK: vpmaxsq %xmm25, %xmm23, %xmm23 {%k2} {z}
+// CHECK:  encoding: [0x62,0x82,0xc5,0x82,0x3d,0xf9]
+          vpmaxsq %xmm25, %xmm23, %xmm23 {%k2} {z}
+
+// CHECK: vpmaxsq (%rcx), %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x00,0x3d,0x39]
+          vpmaxsq (%rcx), %xmm23, %xmm23
+
+// CHECK: vpmaxsq 291(%rax,%r14,8), %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xa2,0xc5,0x00,0x3d,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsq 291(%rax,%r14,8), %xmm23, %xmm23
+
+// CHECK: vpmaxsq (%rcx){1to2}, %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x10,0x3d,0x39]
+          vpmaxsq (%rcx){1to2}, %xmm23, %xmm23
+
+// CHECK: vpmaxsq 2032(%rdx), %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x00,0x3d,0x7a,0x7f]
+          vpmaxsq 2032(%rdx), %xmm23, %xmm23
+
+// CHECK: vpmaxsq 2048(%rdx), %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x00,0x3d,0xba,0x00,0x08,0x00,0x00]
+          vpmaxsq 2048(%rdx), %xmm23, %xmm23
+
+// CHECK: vpmaxsq -2048(%rdx), %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x00,0x3d,0x7a,0x80]
+          vpmaxsq -2048(%rdx), %xmm23, %xmm23
+
+// CHECK: vpmaxsq -2064(%rdx), %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x00,0x3d,0xba,0xf0,0xf7,0xff,0xff]
+          vpmaxsq -2064(%rdx), %xmm23, %xmm23
+
+// CHECK: vpmaxsq 1016(%rdx){1to2}, %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x10,0x3d,0x7a,0x7f]
+          vpmaxsq 1016(%rdx){1to2}, %xmm23, %xmm23
+
+// CHECK: vpmaxsq 1024(%rdx){1to2}, %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x10,0x3d,0xba,0x00,0x04,0x00,0x00]
+          vpmaxsq 1024(%rdx){1to2}, %xmm23, %xmm23
+
+// CHECK: vpmaxsq -1024(%rdx){1to2}, %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x10,0x3d,0x7a,0x80]
+          vpmaxsq -1024(%rdx){1to2}, %xmm23, %xmm23
+
+// CHECK: vpmaxsq -1032(%rdx){1to2}, %xmm23, %xmm23
+// CHECK:  encoding: [0x62,0xe2,0xc5,0x10,0x3d,0xba,0xf8,0xfb,0xff,0xff]
+          vpmaxsq -1032(%rdx){1to2}, %xmm23, %xmm23
+
+// CHECK: vpmaxsq %ymm25, %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x02,0xdd,0x20,0x3d,0xf1]
+          vpmaxsq %ymm25, %ymm20, %ymm30
+
+// CHECK: vpmaxsq %ymm25, %ymm20, %ymm30 {%k6}
+// CHECK:  encoding: [0x62,0x02,0xdd,0x26,0x3d,0xf1]
+          vpmaxsq %ymm25, %ymm20, %ymm30 {%k6}
+
+// CHECK: vpmaxsq %ymm25, %ymm20, %ymm30 {%k6} {z}
+// CHECK:  encoding: [0x62,0x02,0xdd,0xa6,0x3d,0xf1]
+          vpmaxsq %ymm25, %ymm20, %ymm30 {%k6} {z}
+
+// CHECK: vpmaxsq (%rcx), %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x20,0x3d,0x31]
+          vpmaxsq (%rcx), %ymm20, %ymm30
+
+// CHECK: vpmaxsq 291(%rax,%r14,8), %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x22,0xdd,0x20,0x3d,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxsq 291(%rax,%r14,8), %ymm20, %ymm30
+
+// CHECK: vpmaxsq (%rcx){1to4}, %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x30,0x3d,0x31]
+          vpmaxsq (%rcx){1to4}, %ymm20, %ymm30
+
+// CHECK: vpmaxsq 4064(%rdx), %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x20,0x3d,0x72,0x7f]
+          vpmaxsq 4064(%rdx), %ymm20, %ymm30
+
+// CHECK: vpmaxsq 4096(%rdx), %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x20,0x3d,0xb2,0x00,0x10,0x00,0x00]
+          vpmaxsq 4096(%rdx), %ymm20, %ymm30
+
+// CHECK: vpmaxsq -4096(%rdx), %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x20,0x3d,0x72,0x80]
+          vpmaxsq -4096(%rdx), %ymm20, %ymm30
+
+// CHECK: vpmaxsq -4128(%rdx), %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x20,0x3d,0xb2,0xe0,0xef,0xff,0xff]
+          vpmaxsq -4128(%rdx), %ymm20, %ymm30
+
+// CHECK: vpmaxsq 1016(%rdx){1to4}, %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x30,0x3d,0x72,0x7f]
+          vpmaxsq 1016(%rdx){1to4}, %ymm20, %ymm30
+
+// CHECK: vpmaxsq 1024(%rdx){1to4}, %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x30,0x3d,0xb2,0x00,0x04,0x00,0x00]
+          vpmaxsq 1024(%rdx){1to4}, %ymm20, %ymm30
+
+// CHECK: vpmaxsq -1024(%rdx){1to4}, %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x30,0x3d,0x72,0x80]
+          vpmaxsq -1024(%rdx){1to4}, %ymm20, %ymm30
+
+// CHECK: vpmaxsq -1032(%rdx){1to4}, %ymm20, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xdd,0x30,0x3d,0xb2,0xf8,0xfb,0xff,0xff]
+          vpmaxsq -1032(%rdx){1to4}, %ymm20, %ymm30
+
+// CHECK: vpmaxud %xmm19, %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xa2,0x75,0x00,0x3f,0xeb]
+          vpmaxud %xmm19, %xmm17, %xmm21
+
+// CHECK: vpmaxud %xmm19, %xmm17, %xmm21 {%k7}
+// CHECK:  encoding: [0x62,0xa2,0x75,0x07,0x3f,0xeb]
+          vpmaxud %xmm19, %xmm17, %xmm21 {%k7}
+
+// CHECK: vpmaxud %xmm19, %xmm17, %xmm21 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa2,0x75,0x87,0x3f,0xeb]
+          vpmaxud %xmm19, %xmm17, %xmm21 {%k7} {z}
+
+// CHECK: vpmaxud (%rcx), %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x00,0x3f,0x29]
+          vpmaxud (%rcx), %xmm17, %xmm21
+
+// CHECK: vpmaxud 291(%rax,%r14,8), %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xa2,0x75,0x00,0x3f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxud 291(%rax,%r14,8), %xmm17, %xmm21
+
+// CHECK: vpmaxud (%rcx){1to4}, %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x10,0x3f,0x29]
+          vpmaxud (%rcx){1to4}, %xmm17, %xmm21
+
+// CHECK: vpmaxud 2032(%rdx), %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x00,0x3f,0x6a,0x7f]
+          vpmaxud 2032(%rdx), %xmm17, %xmm21
+
+// CHECK: vpmaxud 2048(%rdx), %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x00,0x3f,0xaa,0x00,0x08,0x00,0x00]
+          vpmaxud 2048(%rdx), %xmm17, %xmm21
+
+// CHECK: vpmaxud -2048(%rdx), %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x00,0x3f,0x6a,0x80]
+          vpmaxud -2048(%rdx), %xmm17, %xmm21
+
+// CHECK: vpmaxud -2064(%rdx), %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x00,0x3f,0xaa,0xf0,0xf7,0xff,0xff]
+          vpmaxud -2064(%rdx), %xmm17, %xmm21
+
+// CHECK: vpmaxud 508(%rdx){1to4}, %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x10,0x3f,0x6a,0x7f]
+          vpmaxud 508(%rdx){1to4}, %xmm17, %xmm21
+
+// CHECK: vpmaxud 512(%rdx){1to4}, %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x10,0x3f,0xaa,0x00,0x02,0x00,0x00]
+          vpmaxud 512(%rdx){1to4}, %xmm17, %xmm21
+
+// CHECK: vpmaxud -512(%rdx){1to4}, %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x10,0x3f,0x6a,0x80]
+          vpmaxud -512(%rdx){1to4}, %xmm17, %xmm21
+
+// CHECK: vpmaxud -516(%rdx){1to4}, %xmm17, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0x75,0x10,0x3f,0xaa,0xfc,0xfd,0xff,0xff]
+          vpmaxud -516(%rdx){1to4}, %xmm17, %xmm21
+
+// CHECK: vpmaxud %ymm23, %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x22,0x45,0x20,0x3f,0xe7]
+          vpmaxud %ymm23, %ymm23, %ymm28
+
+// CHECK: vpmaxud %ymm23, %ymm23, %ymm28 {%k7}
+// CHECK:  encoding: [0x62,0x22,0x45,0x27,0x3f,0xe7]
+          vpmaxud %ymm23, %ymm23, %ymm28 {%k7}
+
+// CHECK: vpmaxud %ymm23, %ymm23, %ymm28 {%k7} {z}
+// CHECK:  encoding: [0x62,0x22,0x45,0xa7,0x3f,0xe7]
+          vpmaxud %ymm23, %ymm23, %ymm28 {%k7} {z}
+
+// CHECK: vpmaxud (%rcx), %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x20,0x3f,0x21]
+          vpmaxud (%rcx), %ymm23, %ymm28
+
+// CHECK: vpmaxud 291(%rax,%r14,8), %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x22,0x45,0x20,0x3f,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxud 291(%rax,%r14,8), %ymm23, %ymm28
+
+// CHECK: vpmaxud (%rcx){1to8}, %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x30,0x3f,0x21]
+          vpmaxud (%rcx){1to8}, %ymm23, %ymm28
+
+// CHECK: vpmaxud 4064(%rdx), %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x20,0x3f,0x62,0x7f]
+          vpmaxud 4064(%rdx), %ymm23, %ymm28
+
+// CHECK: vpmaxud 4096(%rdx), %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x20,0x3f,0xa2,0x00,0x10,0x00,0x00]
+          vpmaxud 4096(%rdx), %ymm23, %ymm28
+
+// CHECK: vpmaxud -4096(%rdx), %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x20,0x3f,0x62,0x80]
+          vpmaxud -4096(%rdx), %ymm23, %ymm28
+
+// CHECK: vpmaxud -4128(%rdx), %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x20,0x3f,0xa2,0xe0,0xef,0xff,0xff]
+          vpmaxud -4128(%rdx), %ymm23, %ymm28
+
+// CHECK: vpmaxud 508(%rdx){1to8}, %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x30,0x3f,0x62,0x7f]
+          vpmaxud 508(%rdx){1to8}, %ymm23, %ymm28
+
+// CHECK: vpmaxud 512(%rdx){1to8}, %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x30,0x3f,0xa2,0x00,0x02,0x00,0x00]
+          vpmaxud 512(%rdx){1to8}, %ymm23, %ymm28
+
+// CHECK: vpmaxud -512(%rdx){1to8}, %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x30,0x3f,0x62,0x80]
+          vpmaxud -512(%rdx){1to8}, %ymm23, %ymm28
+
+// CHECK: vpmaxud -516(%rdx){1to8}, %ymm23, %ymm28
+// CHECK:  encoding: [0x62,0x62,0x45,0x30,0x3f,0xa2,0xfc,0xfd,0xff,0xff]
+          vpmaxud -516(%rdx){1to8}, %ymm23, %ymm28
+
+// CHECK: vpmaxuq %xmm25, %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x02,0xad,0x00,0x3f,0xd1]
+          vpmaxuq %xmm25, %xmm26, %xmm26
+
+// CHECK: vpmaxuq %xmm25, %xmm26, %xmm26 {%k7}
+// CHECK:  encoding: [0x62,0x02,0xad,0x07,0x3f,0xd1]
+          vpmaxuq %xmm25, %xmm26, %xmm26 {%k7}
+
+// CHECK: vpmaxuq %xmm25, %xmm26, %xmm26 {%k7} {z}
+// CHECK:  encoding: [0x62,0x02,0xad,0x87,0x3f,0xd1]
+          vpmaxuq %xmm25, %xmm26, %xmm26 {%k7} {z}
+
+// CHECK: vpmaxuq (%rcx), %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x00,0x3f,0x11]
+          vpmaxuq (%rcx), %xmm26, %xmm26
+
+// CHECK: vpmaxuq 291(%rax,%r14,8), %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x22,0xad,0x00,0x3f,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxuq 291(%rax,%r14,8), %xmm26, %xmm26
+
+// CHECK: vpmaxuq (%rcx){1to2}, %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x10,0x3f,0x11]
+          vpmaxuq (%rcx){1to2}, %xmm26, %xmm26
+
+// CHECK: vpmaxuq 2032(%rdx), %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x00,0x3f,0x52,0x7f]
+          vpmaxuq 2032(%rdx), %xmm26, %xmm26
+
+// CHECK: vpmaxuq 2048(%rdx), %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x00,0x3f,0x92,0x00,0x08,0x00,0x00]
+          vpmaxuq 2048(%rdx), %xmm26, %xmm26
+
+// CHECK: vpmaxuq -2048(%rdx), %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x00,0x3f,0x52,0x80]
+          vpmaxuq -2048(%rdx), %xmm26, %xmm26
+
+// CHECK: vpmaxuq -2064(%rdx), %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x00,0x3f,0x92,0xf0,0xf7,0xff,0xff]
+          vpmaxuq -2064(%rdx), %xmm26, %xmm26
+
+// CHECK: vpmaxuq 1016(%rdx){1to2}, %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x10,0x3f,0x52,0x7f]
+          vpmaxuq 1016(%rdx){1to2}, %xmm26, %xmm26
+
+// CHECK: vpmaxuq 1024(%rdx){1to2}, %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x10,0x3f,0x92,0x00,0x04,0x00,0x00]
+          vpmaxuq 1024(%rdx){1to2}, %xmm26, %xmm26
+
+// CHECK: vpmaxuq -1024(%rdx){1to2}, %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x10,0x3f,0x52,0x80]
+          vpmaxuq -1024(%rdx){1to2}, %xmm26, %xmm26
+
+// CHECK: vpmaxuq -1032(%rdx){1to2}, %xmm26, %xmm26
+// CHECK:  encoding: [0x62,0x62,0xad,0x10,0x3f,0x92,0xf8,0xfb,0xff,0xff]
+          vpmaxuq -1032(%rdx){1to2}, %xmm26, %xmm26
+
+// CHECK: vpmaxuq %ymm20, %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x22,0xf5,0x20,0x3f,0xf4]
+          vpmaxuq %ymm20, %ymm17, %ymm30
+
+// CHECK: vpmaxuq %ymm20, %ymm17, %ymm30 {%k4}
+// CHECK:  encoding: [0x62,0x22,0xf5,0x24,0x3f,0xf4]
+          vpmaxuq %ymm20, %ymm17, %ymm30 {%k4}
+
+// CHECK: vpmaxuq %ymm20, %ymm17, %ymm30 {%k4} {z}
+// CHECK:  encoding: [0x62,0x22,0xf5,0xa4,0x3f,0xf4]
+          vpmaxuq %ymm20, %ymm17, %ymm30 {%k4} {z}
+
+// CHECK: vpmaxuq (%rcx), %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x20,0x3f,0x31]
+          vpmaxuq (%rcx), %ymm17, %ymm30
+
+// CHECK: vpmaxuq 291(%rax,%r14,8), %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x22,0xf5,0x20,0x3f,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vpmaxuq 291(%rax,%r14,8), %ymm17, %ymm30
+
+// CHECK: vpmaxuq (%rcx){1to4}, %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x30,0x3f,0x31]
+          vpmaxuq (%rcx){1to4}, %ymm17, %ymm30
+
+// CHECK: vpmaxuq 4064(%rdx), %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x20,0x3f,0x72,0x7f]
+          vpmaxuq 4064(%rdx), %ymm17, %ymm30
+
+// CHECK: vpmaxuq 4096(%rdx), %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x20,0x3f,0xb2,0x00,0x10,0x00,0x00]
+          vpmaxuq 4096(%rdx), %ymm17, %ymm30
+
+// CHECK: vpmaxuq -4096(%rdx), %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x20,0x3f,0x72,0x80]
+          vpmaxuq -4096(%rdx), %ymm17, %ymm30
+
+// CHECK: vpmaxuq -4128(%rdx), %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x20,0x3f,0xb2,0xe0,0xef,0xff,0xff]
+          vpmaxuq -4128(%rdx), %ymm17, %ymm30
+
+// CHECK: vpmaxuq 1016(%rdx){1to4}, %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x30,0x3f,0x72,0x7f]
+          vpmaxuq 1016(%rdx){1to4}, %ymm17, %ymm30
+
+// CHECK: vpmaxuq 1024(%rdx){1to4}, %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x30,0x3f,0xb2,0x00,0x04,0x00,0x00]
+          vpmaxuq 1024(%rdx){1to4}, %ymm17, %ymm30
+
+// CHECK: vpmaxuq -1024(%rdx){1to4}, %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x30,0x3f,0x72,0x80]
+          vpmaxuq -1024(%rdx){1to4}, %ymm17, %ymm30
+
+// CHECK: vpmaxuq -1032(%rdx){1to4}, %ymm17, %ymm30
+// CHECK:  encoding: [0x62,0x62,0xf5,0x30,0x3f,0xb2,0xf8,0xfb,0xff,0xff]
+          vpmaxuq -1032(%rdx){1to4}, %ymm17, %ymm30
+
+// CHECK: vpminsd %xmm17, %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x22,0x55,0x00,0x39,0xe9]
+          vpminsd %xmm17, %xmm21, %xmm29
+
+// CHECK: vpminsd %xmm17, %xmm21, %xmm29 {%k4}
+// CHECK:  encoding: [0x62,0x22,0x55,0x04,0x39,0xe9]
+          vpminsd %xmm17, %xmm21, %xmm29 {%k4}
+
+// CHECK: vpminsd %xmm17, %xmm21, %xmm29 {%k4} {z}
+// CHECK:  encoding: [0x62,0x22,0x55,0x84,0x39,0xe9]
+          vpminsd %xmm17, %xmm21, %xmm29 {%k4} {z}
+
+// CHECK: vpminsd (%rcx), %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x00,0x39,0x29]
+          vpminsd (%rcx), %xmm21, %xmm29
+
+// CHECK: vpminsd 291(%rax,%r14,8), %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x22,0x55,0x00,0x39,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpminsd 291(%rax,%r14,8), %xmm21, %xmm29
+
+// CHECK: vpminsd (%rcx){1to4}, %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x10,0x39,0x29]
+          vpminsd (%rcx){1to4}, %xmm21, %xmm29
+
+// CHECK: vpminsd 2032(%rdx), %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x00,0x39,0x6a,0x7f]
+          vpminsd 2032(%rdx), %xmm21, %xmm29
+
+// CHECK: vpminsd 2048(%rdx), %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x00,0x39,0xaa,0x00,0x08,0x00,0x00]
+          vpminsd 2048(%rdx), %xmm21, %xmm29
+
+// CHECK: vpminsd -2048(%rdx), %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x00,0x39,0x6a,0x80]
+          vpminsd -2048(%rdx), %xmm21, %xmm29
+
+// CHECK: vpminsd -2064(%rdx), %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x00,0x39,0xaa,0xf0,0xf7,0xff,0xff]
+          vpminsd -2064(%rdx), %xmm21, %xmm29
+
+// CHECK: vpminsd 508(%rdx){1to4}, %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x10,0x39,0x6a,0x7f]
+          vpminsd 508(%rdx){1to4}, %xmm21, %xmm29
+
+// CHECK: vpminsd 512(%rdx){1to4}, %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x10,0x39,0xaa,0x00,0x02,0x00,0x00]
+          vpminsd 512(%rdx){1to4}, %xmm21, %xmm29
+
+// CHECK: vpminsd -512(%rdx){1to4}, %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x10,0x39,0x6a,0x80]
+          vpminsd -512(%rdx){1to4}, %xmm21, %xmm29
+
+// CHECK: vpminsd -516(%rdx){1to4}, %xmm21, %xmm29
+// CHECK:  encoding: [0x62,0x62,0x55,0x10,0x39,0xaa,0xfc,0xfd,0xff,0xff]
+          vpminsd -516(%rdx){1to4}, %xmm21, %xmm29
+
+// CHECK: vpminsd %ymm25, %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0x82,0x2d,0x20,0x39,0xd9]
+          vpminsd %ymm25, %ymm26, %ymm19
+
+// CHECK: vpminsd %ymm25, %ymm26, %ymm19 {%k7}
+// CHECK:  encoding: [0x62,0x82,0x2d,0x27,0x39,0xd9]
+          vpminsd %ymm25, %ymm26, %ymm19 {%k7}
+
+// CHECK: vpminsd %ymm25, %ymm26, %ymm19 {%k7} {z}
+// CHECK:  encoding: [0x62,0x82,0x2d,0xa7,0x39,0xd9]
+          vpminsd %ymm25, %ymm26, %ymm19 {%k7} {z}
+
+// CHECK: vpminsd (%rcx), %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x39,0x19]
+          vpminsd (%rcx), %ymm26, %ymm19
+
+// CHECK: vpminsd 291(%rax,%r14,8), %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xa2,0x2d,0x20,0x39,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpminsd 291(%rax,%r14,8), %ymm26, %ymm19
+
+// CHECK: vpminsd (%rcx){1to8}, %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x39,0x19]
+          vpminsd (%rcx){1to8}, %ymm26, %ymm19
+
+// CHECK: vpminsd 4064(%rdx), %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x39,0x5a,0x7f]
+          vpminsd 4064(%rdx), %ymm26, %ymm19
+
+// CHECK: vpminsd 4096(%rdx), %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x39,0x9a,0x00,0x10,0x00,0x00]
+          vpminsd 4096(%rdx), %ymm26, %ymm19
+
+// CHECK: vpminsd -4096(%rdx), %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x39,0x5a,0x80]
+          vpminsd -4096(%rdx), %ymm26, %ymm19
+
+// CHECK: vpminsd -4128(%rdx), %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x39,0x9a,0xe0,0xef,0xff,0xff]
+          vpminsd -4128(%rdx), %ymm26, %ymm19
+
+// CHECK: vpminsd 508(%rdx){1to8}, %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x39,0x5a,0x7f]
+          vpminsd 508(%rdx){1to8}, %ymm26, %ymm19
+
+// CHECK: vpminsd 512(%rdx){1to8}, %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x39,0x9a,0x00,0x02,0x00,0x00]
+          vpminsd 512(%rdx){1to8}, %ymm26, %ymm19
+
+// CHECK: vpminsd -512(%rdx){1to8}, %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x39,0x5a,0x80]
+          vpminsd -512(%rdx){1to8}, %ymm26, %ymm19
+
+// CHECK: vpminsd -516(%rdx){1to8}, %ymm26, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x39,0x9a,0xfc,0xfd,0xff,0xff]
+          vpminsd -516(%rdx){1to8}, %ymm26, %ymm19
+
+// CHECK: vpminsq %xmm18, %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xa2,0xbd,0x00,0x39,0xda]
+          vpminsq %xmm18, %xmm24, %xmm19
+
+// CHECK: vpminsq %xmm18, %xmm24, %xmm19 {%k6}
+// CHECK:  encoding: [0x62,0xa2,0xbd,0x06,0x39,0xda]
+          vpminsq %xmm18, %xmm24, %xmm19 {%k6}
+
+// CHECK: vpminsq %xmm18, %xmm24, %xmm19 {%k6} {z}
+// CHECK:  encoding: [0x62,0xa2,0xbd,0x86,0x39,0xda]
+          vpminsq %xmm18, %xmm24, %xmm19 {%k6} {z}
+
+// CHECK: vpminsq (%rcx), %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x00,0x39,0x19]
+          vpminsq (%rcx), %xmm24, %xmm19
+
+// CHECK: vpminsq 291(%rax,%r14,8), %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xa2,0xbd,0x00,0x39,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpminsq 291(%rax,%r14,8), %xmm24, %xmm19
+
+// CHECK: vpminsq (%rcx){1to2}, %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x10,0x39,0x19]
+          vpminsq (%rcx){1to2}, %xmm24, %xmm19
+
+// CHECK: vpminsq 2032(%rdx), %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x00,0x39,0x5a,0x7f]
+          vpminsq 2032(%rdx), %xmm24, %xmm19
+
+// CHECK: vpminsq 2048(%rdx), %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x00,0x39,0x9a,0x00,0x08,0x00,0x00]
+          vpminsq 2048(%rdx), %xmm24, %xmm19
+
+// CHECK: vpminsq -2048(%rdx), %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x00,0x39,0x5a,0x80]
+          vpminsq -2048(%rdx), %xmm24, %xmm19
+
+// CHECK: vpminsq -2064(%rdx), %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x00,0x39,0x9a,0xf0,0xf7,0xff,0xff]
+          vpminsq -2064(%rdx), %xmm24, %xmm19
+
+// CHECK: vpminsq 1016(%rdx){1to2}, %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x10,0x39,0x5a,0x7f]
+          vpminsq 1016(%rdx){1to2}, %xmm24, %xmm19
+
+// CHECK: vpminsq 1024(%rdx){1to2}, %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x10,0x39,0x9a,0x00,0x04,0x00,0x00]
+          vpminsq 1024(%rdx){1to2}, %xmm24, %xmm19
+
+// CHECK: vpminsq -1024(%rdx){1to2}, %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x10,0x39,0x5a,0x80]
+          vpminsq -1024(%rdx){1to2}, %xmm24, %xmm19
+
+// CHECK: vpminsq -1032(%rdx){1to2}, %xmm24, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0xbd,0x10,0x39,0x9a,0xf8,0xfb,0xff,0xff]
+          vpminsq -1032(%rdx){1to2}, %xmm24, %xmm19
+
+// CHECK: vpminsq %ymm28, %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x02,0x95,0x20,0x39,0xcc]
+          vpminsq %ymm28, %ymm29, %ymm25
+
+// CHECK: vpminsq %ymm28, %ymm29, %ymm25 {%k6}
+// CHECK:  encoding: [0x62,0x02,0x95,0x26,0x39,0xcc]
+          vpminsq %ymm28, %ymm29, %ymm25 {%k6}
+
+// CHECK: vpminsq %ymm28, %ymm29, %ymm25 {%k6} {z}
+// CHECK:  encoding: [0x62,0x02,0x95,0xa6,0x39,0xcc]
+          vpminsq %ymm28, %ymm29, %ymm25 {%k6} {z}
+
+// CHECK: vpminsq (%rcx), %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x20,0x39,0x09]
+          vpminsq (%rcx), %ymm29, %ymm25
+
+// CHECK: vpminsq 291(%rax,%r14,8), %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x22,0x95,0x20,0x39,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpminsq 291(%rax,%r14,8), %ymm29, %ymm25
+
+// CHECK: vpminsq (%rcx){1to4}, %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x30,0x39,0x09]
+          vpminsq (%rcx){1to4}, %ymm29, %ymm25
+
+// CHECK: vpminsq 4064(%rdx), %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x20,0x39,0x4a,0x7f]
+          vpminsq 4064(%rdx), %ymm29, %ymm25
+
+// CHECK: vpminsq 4096(%rdx), %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x20,0x39,0x8a,0x00,0x10,0x00,0x00]
+          vpminsq 4096(%rdx), %ymm29, %ymm25
+
+// CHECK: vpminsq -4096(%rdx), %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x20,0x39,0x4a,0x80]
+          vpminsq -4096(%rdx), %ymm29, %ymm25
+
+// CHECK: vpminsq -4128(%rdx), %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x20,0x39,0x8a,0xe0,0xef,0xff,0xff]
+          vpminsq -4128(%rdx), %ymm29, %ymm25
+
+// CHECK: vpminsq 1016(%rdx){1to4}, %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x30,0x39,0x4a,0x7f]
+          vpminsq 1016(%rdx){1to4}, %ymm29, %ymm25
+
+// CHECK: vpminsq 1024(%rdx){1to4}, %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x30,0x39,0x8a,0x00,0x04,0x00,0x00]
+          vpminsq 1024(%rdx){1to4}, %ymm29, %ymm25
+
+// CHECK: vpminsq -1024(%rdx){1to4}, %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x30,0x39,0x4a,0x80]
+          vpminsq -1024(%rdx){1to4}, %ymm29, %ymm25
+
+// CHECK: vpminsq -1032(%rdx){1to4}, %ymm29, %ymm25
+// CHECK:  encoding: [0x62,0x62,0x95,0x30,0x39,0x8a,0xf8,0xfb,0xff,0xff]
+          vpminsq -1032(%rdx){1to4}, %ymm29, %ymm25
+
+// CHECK: vpminud %xmm17, %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xa2,0x45,0x00,0x3b,0xd1]
+          vpminud %xmm17, %xmm23, %xmm18
+
+// CHECK: vpminud %xmm17, %xmm23, %xmm18 {%k3}
+// CHECK:  encoding: [0x62,0xa2,0x45,0x03,0x3b,0xd1]
+          vpminud %xmm17, %xmm23, %xmm18 {%k3}
+
+// CHECK: vpminud %xmm17, %xmm23, %xmm18 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa2,0x45,0x83,0x3b,0xd1]
+          vpminud %xmm17, %xmm23, %xmm18 {%k3} {z}
+
+// CHECK: vpminud (%rcx), %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x00,0x3b,0x11]
+          vpminud (%rcx), %xmm23, %xmm18
+
+// CHECK: vpminud 291(%rax,%r14,8), %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xa2,0x45,0x00,0x3b,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpminud 291(%rax,%r14,8), %xmm23, %xmm18
+
+// CHECK: vpminud (%rcx){1to4}, %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x10,0x3b,0x11]
+          vpminud (%rcx){1to4}, %xmm23, %xmm18
+
+// CHECK: vpminud 2032(%rdx), %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x00,0x3b,0x52,0x7f]
+          vpminud 2032(%rdx), %xmm23, %xmm18
+
+// CHECK: vpminud 2048(%rdx), %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x00,0x3b,0x92,0x00,0x08,0x00,0x00]
+          vpminud 2048(%rdx), %xmm23, %xmm18
+
+// CHECK: vpminud -2048(%rdx), %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x00,0x3b,0x52,0x80]
+          vpminud -2048(%rdx), %xmm23, %xmm18
+
+// CHECK: vpminud -2064(%rdx), %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x00,0x3b,0x92,0xf0,0xf7,0xff,0xff]
+          vpminud -2064(%rdx), %xmm23, %xmm18
+
+// CHECK: vpminud 508(%rdx){1to4}, %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x10,0x3b,0x52,0x7f]
+          vpminud 508(%rdx){1to4}, %xmm23, %xmm18
+
+// CHECK: vpminud 512(%rdx){1to4}, %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x10,0x3b,0x92,0x00,0x02,0x00,0x00]
+          vpminud 512(%rdx){1to4}, %xmm23, %xmm18
+
+// CHECK: vpminud -512(%rdx){1to4}, %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x10,0x3b,0x52,0x80]
+          vpminud -512(%rdx){1to4}, %xmm23, %xmm18
+
+// CHECK: vpminud -516(%rdx){1to4}, %xmm23, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0x45,0x10,0x3b,0x92,0xfc,0xfd,0xff,0xff]
+          vpminud -516(%rdx){1to4}, %xmm23, %xmm18
+
+// CHECK: vpminud %ymm19, %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xa2,0x4d,0x20,0x3b,0xdb]
+          vpminud %ymm19, %ymm22, %ymm19
+
+// CHECK: vpminud %ymm19, %ymm22, %ymm19 {%k3}
+// CHECK:  encoding: [0x62,0xa2,0x4d,0x23,0x3b,0xdb]
+          vpminud %ymm19, %ymm22, %ymm19 {%k3}
+
+// CHECK: vpminud %ymm19, %ymm22, %ymm19 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa2,0x4d,0xa3,0x3b,0xdb]
+          vpminud %ymm19, %ymm22, %ymm19 {%k3} {z}
+
+// CHECK: vpminud (%rcx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3b,0x19]
+          vpminud (%rcx), %ymm22, %ymm19
+
+// CHECK: vpminud 291(%rax,%r14,8), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xa2,0x4d,0x20,0x3b,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpminud 291(%rax,%r14,8), %ymm22, %ymm19
+
+// CHECK: vpminud (%rcx){1to8}, %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x30,0x3b,0x19]
+          vpminud (%rcx){1to8}, %ymm22, %ymm19
+
+// CHECK: vpminud 4064(%rdx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3b,0x5a,0x7f]
+          vpminud 4064(%rdx), %ymm22, %ymm19
+
+// CHECK: vpminud 4096(%rdx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3b,0x9a,0x00,0x10,0x00,0x00]
+          vpminud 4096(%rdx), %ymm22, %ymm19
+
+// CHECK: vpminud -4096(%rdx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3b,0x5a,0x80]
+          vpminud -4096(%rdx), %ymm22, %ymm19
+
+// CHECK: vpminud -4128(%rdx), %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x20,0x3b,0x9a,0xe0,0xef,0xff,0xff]
+          vpminud -4128(%rdx), %ymm22, %ymm19
+
+// CHECK: vpminud 508(%rdx){1to8}, %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x30,0x3b,0x5a,0x7f]
+          vpminud 508(%rdx){1to8}, %ymm22, %ymm19
+
+// CHECK: vpminud 512(%rdx){1to8}, %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x30,0x3b,0x9a,0x00,0x02,0x00,0x00]
+          vpminud 512(%rdx){1to8}, %ymm22, %ymm19
+
+// CHECK: vpminud -512(%rdx){1to8}, %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x30,0x3b,0x5a,0x80]
+          vpminud -512(%rdx){1to8}, %ymm22, %ymm19
+
+// CHECK: vpminud -516(%rdx){1to8}, %ymm22, %ymm19
+// CHECK:  encoding: [0x62,0xe2,0x4d,0x30,0x3b,0x9a,0xfc,0xfd,0xff,0xff]
+          vpminud -516(%rdx){1to8}, %ymm22, %ymm19
+
+// CHECK: vpminuq %xmm23, %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x22,0xcd,0x00,0x3b,0xe7]
+          vpminuq %xmm23, %xmm22, %xmm28
+
+// CHECK: vpminuq %xmm23, %xmm22, %xmm28 {%k4}
+// CHECK:  encoding: [0x62,0x22,0xcd,0x04,0x3b,0xe7]
+          vpminuq %xmm23, %xmm22, %xmm28 {%k4}
+
+// CHECK: vpminuq %xmm23, %xmm22, %xmm28 {%k4} {z}
+// CHECK:  encoding: [0x62,0x22,0xcd,0x84,0x3b,0xe7]
+          vpminuq %xmm23, %xmm22, %xmm28 {%k4} {z}
+
+// CHECK: vpminuq (%rcx), %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x00,0x3b,0x21]
+          vpminuq (%rcx), %xmm22, %xmm28
+
+// CHECK: vpminuq 291(%rax,%r14,8), %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x22,0xcd,0x00,0x3b,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpminuq 291(%rax,%r14,8), %xmm22, %xmm28
+
+// CHECK: vpminuq (%rcx){1to2}, %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x10,0x3b,0x21]
+          vpminuq (%rcx){1to2}, %xmm22, %xmm28
+
+// CHECK: vpminuq 2032(%rdx), %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x00,0x3b,0x62,0x7f]
+          vpminuq 2032(%rdx), %xmm22, %xmm28
+
+// CHECK: vpminuq 2048(%rdx), %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x00,0x3b,0xa2,0x00,0x08,0x00,0x00]
+          vpminuq 2048(%rdx), %xmm22, %xmm28
+
+// CHECK: vpminuq -2048(%rdx), %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x00,0x3b,0x62,0x80]
+          vpminuq -2048(%rdx), %xmm22, %xmm28
+
+// CHECK: vpminuq -2064(%rdx), %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x00,0x3b,0xa2,0xf0,0xf7,0xff,0xff]
+          vpminuq -2064(%rdx), %xmm22, %xmm28
+
+// CHECK: vpminuq 1016(%rdx){1to2}, %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x10,0x3b,0x62,0x7f]
+          vpminuq 1016(%rdx){1to2}, %xmm22, %xmm28
+
+// CHECK: vpminuq 1024(%rdx){1to2}, %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x10,0x3b,0xa2,0x00,0x04,0x00,0x00]
+          vpminuq 1024(%rdx){1to2}, %xmm22, %xmm28
+
+// CHECK: vpminuq -1024(%rdx){1to2}, %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x10,0x3b,0x62,0x80]
+          vpminuq -1024(%rdx){1to2}, %xmm22, %xmm28
+
+// CHECK: vpminuq -1032(%rdx){1to2}, %xmm22, %xmm28
+// CHECK:  encoding: [0x62,0x62,0xcd,0x10,0x3b,0xa2,0xf8,0xfb,0xff,0xff]
+          vpminuq -1032(%rdx){1to2}, %xmm22, %xmm28
+
+// CHECK: vpminuq %ymm23, %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x22,0xd5,0x20,0x3b,0xef]
+          vpminuq %ymm23, %ymm21, %ymm29
+
+// CHECK: vpminuq %ymm23, %ymm21, %ymm29 {%k5}
+// CHECK:  encoding: [0x62,0x22,0xd5,0x25,0x3b,0xef]
+          vpminuq %ymm23, %ymm21, %ymm29 {%k5}
+
+// CHECK: vpminuq %ymm23, %ymm21, %ymm29 {%k5} {z}
+// CHECK:  encoding: [0x62,0x22,0xd5,0xa5,0x3b,0xef]
+          vpminuq %ymm23, %ymm21, %ymm29 {%k5} {z}
+
+// CHECK: vpminuq (%rcx), %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x3b,0x29]
+          vpminuq (%rcx), %ymm21, %ymm29
+
+// CHECK: vpminuq 291(%rax,%r14,8), %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x22,0xd5,0x20,0x3b,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpminuq 291(%rax,%r14,8), %ymm21, %ymm29
+
+// CHECK: vpminuq (%rcx){1to4}, %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x3b,0x29]
+          vpminuq (%rcx){1to4}, %ymm21, %ymm29
+
+// CHECK: vpminuq 4064(%rdx), %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x3b,0x6a,0x7f]
+          vpminuq 4064(%rdx), %ymm21, %ymm29
+
+// CHECK: vpminuq 4096(%rdx), %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x3b,0xaa,0x00,0x10,0x00,0x00]
+          vpminuq 4096(%rdx), %ymm21, %ymm29
+
+// CHECK: vpminuq -4096(%rdx), %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x3b,0x6a,0x80]
+          vpminuq -4096(%rdx), %ymm21, %ymm29
+
+// CHECK: vpminuq -4128(%rdx), %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x20,0x3b,0xaa,0xe0,0xef,0xff,0xff]
+          vpminuq -4128(%rdx), %ymm21, %ymm29
+
+// CHECK: vpminuq 1016(%rdx){1to4}, %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x3b,0x6a,0x7f]
+          vpminuq 1016(%rdx){1to4}, %ymm21, %ymm29
+
+// CHECK: vpminuq 1024(%rdx){1to4}, %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x3b,0xaa,0x00,0x04,0x00,0x00]
+          vpminuq 1024(%rdx){1to4}, %ymm21, %ymm29
+
+// CHECK: vpminuq -1024(%rdx){1to4}, %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x3b,0x6a,0x80]
+          vpminuq -1024(%rdx){1to4}, %ymm21, %ymm29
+
+// CHECK: vpminuq -1032(%rdx){1to4}, %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x62,0xd5,0x30,0x3b,0xaa,0xf8,0xfb,0xff,0xff]
+          vpminuq -1032(%rdx){1to4}, %ymm21, %ymm29
+
+// CHECK: vpmulld %xmm24, %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x02,0x65,0x00,0x40,0xc8]
+          vpmulld %xmm24, %xmm19, %xmm25
+
+// CHECK: vpmulld %xmm24, %xmm19, %xmm25 {%k6}
+// CHECK:  encoding: [0x62,0x02,0x65,0x06,0x40,0xc8]
+          vpmulld %xmm24, %xmm19, %xmm25 {%k6}
+
+// CHECK: vpmulld %xmm24, %xmm19, %xmm25 {%k6} {z}
+// CHECK:  encoding: [0x62,0x02,0x65,0x86,0x40,0xc8]
+          vpmulld %xmm24, %xmm19, %xmm25 {%k6} {z}
+
+// CHECK: vpmulld (%rcx), %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x00,0x40,0x09]
+          vpmulld (%rcx), %xmm19, %xmm25
+
+// CHECK: vpmulld 291(%rax,%r14,8), %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x22,0x65,0x00,0x40,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpmulld 291(%rax,%r14,8), %xmm19, %xmm25
+
+// CHECK: vpmulld (%rcx){1to4}, %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x10,0x40,0x09]
+          vpmulld (%rcx){1to4}, %xmm19, %xmm25
+
+// CHECK: vpmulld 2032(%rdx), %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x00,0x40,0x4a,0x7f]
+          vpmulld 2032(%rdx), %xmm19, %xmm25
+
+// CHECK: vpmulld 2048(%rdx), %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x00,0x40,0x8a,0x00,0x08,0x00,0x00]
+          vpmulld 2048(%rdx), %xmm19, %xmm25
+
+// CHECK: vpmulld -2048(%rdx), %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x00,0x40,0x4a,0x80]
+          vpmulld -2048(%rdx), %xmm19, %xmm25
+
+// CHECK: vpmulld -2064(%rdx), %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x00,0x40,0x8a,0xf0,0xf7,0xff,0xff]
+          vpmulld -2064(%rdx), %xmm19, %xmm25
+
+// CHECK: vpmulld 508(%rdx){1to4}, %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x10,0x40,0x4a,0x7f]
+          vpmulld 508(%rdx){1to4}, %xmm19, %xmm25
+
+// CHECK: vpmulld 512(%rdx){1to4}, %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x10,0x40,0x8a,0x00,0x02,0x00,0x00]
+          vpmulld 512(%rdx){1to4}, %xmm19, %xmm25
+
+// CHECK: vpmulld -512(%rdx){1to4}, %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x10,0x40,0x4a,0x80]
+          vpmulld -512(%rdx){1to4}, %xmm19, %xmm25
+
+// CHECK: vpmulld -516(%rdx){1to4}, %xmm19, %xmm25
+// CHECK:  encoding: [0x62,0x62,0x65,0x10,0x40,0x8a,0xfc,0xfd,0xff,0xff]
+          vpmulld -516(%rdx){1to4}, %xmm19, %xmm25
+
+// CHECK: vpmulld %ymm26, %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0x82,0x2d,0x20,0x40,0xfa]
+          vpmulld %ymm26, %ymm26, %ymm23
+
+// CHECK: vpmulld %ymm26, %ymm26, %ymm23 {%k2}
+// CHECK:  encoding: [0x62,0x82,0x2d,0x22,0x40,0xfa]
+          vpmulld %ymm26, %ymm26, %ymm23 {%k2}
+
+// CHECK: vpmulld %ymm26, %ymm26, %ymm23 {%k2} {z}
+// CHECK:  encoding: [0x62,0x82,0x2d,0xa2,0x40,0xfa]
+          vpmulld %ymm26, %ymm26, %ymm23 {%k2} {z}
+
+// CHECK: vpmulld (%rcx), %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x40,0x39]
+          vpmulld (%rcx), %ymm26, %ymm23
+
+// CHECK: vpmulld 291(%rax,%r14,8), %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xa2,0x2d,0x20,0x40,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vpmulld 291(%rax,%r14,8), %ymm26, %ymm23
+
+// CHECK: vpmulld (%rcx){1to8}, %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x40,0x39]
+          vpmulld (%rcx){1to8}, %ymm26, %ymm23
+
+// CHECK: vpmulld 4064(%rdx), %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x40,0x7a,0x7f]
+          vpmulld 4064(%rdx), %ymm26, %ymm23
+
+// CHECK: vpmulld 4096(%rdx), %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x40,0xba,0x00,0x10,0x00,0x00]
+          vpmulld 4096(%rdx), %ymm26, %ymm23
+
+// CHECK: vpmulld -4096(%rdx), %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x40,0x7a,0x80]
+          vpmulld -4096(%rdx), %ymm26, %ymm23
+
+// CHECK: vpmulld -4128(%rdx), %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x20,0x40,0xba,0xe0,0xef,0xff,0xff]
+          vpmulld -4128(%rdx), %ymm26, %ymm23
+
+// CHECK: vpmulld 508(%rdx){1to8}, %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x40,0x7a,0x7f]
+          vpmulld 508(%rdx){1to8}, %ymm26, %ymm23
+
+// CHECK: vpmulld 512(%rdx){1to8}, %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x40,0xba,0x00,0x02,0x00,0x00]
+          vpmulld 512(%rdx){1to8}, %ymm26, %ymm23
+
+// CHECK: vpmulld -512(%rdx){1to8}, %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x40,0x7a,0x80]
+          vpmulld -512(%rdx){1to8}, %ymm26, %ymm23
+
+// CHECK: vpmulld -516(%rdx){1to8}, %ymm26, %ymm23
+// CHECK:  encoding: [0x62,0xe2,0x2d,0x30,0x40,0xba,0xfc,0xfd,0xff,0xff]
+          vpmulld -516(%rdx){1to8}, %ymm26, %ymm23
+
+// CHECK: vpord  %xmm28, %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x01,0x15,0x00,0xeb,0xe4]
+          vpord  %xmm28, %xmm29, %xmm28
+
+// CHECK: vpord  %xmm28, %xmm29, %xmm28 {%k2}
+// CHECK:  encoding: [0x62,0x01,0x15,0x02,0xeb,0xe4]
+          vpord  %xmm28, %xmm29, %xmm28 {%k2}
+
+// CHECK: vpord  %xmm28, %xmm29, %xmm28 {%k2} {z}
+// CHECK:  encoding: [0x62,0x01,0x15,0x82,0xeb,0xe4]
+          vpord  %xmm28, %xmm29, %xmm28 {%k2} {z}
+
+// CHECK: vpord  (%rcx), %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xeb,0x21]
+          vpord  (%rcx), %xmm29, %xmm28
+
+// CHECK: vpord  291(%rax,%r14,8), %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x21,0x15,0x00,0xeb,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpord  291(%rax,%r14,8), %xmm29, %xmm28
+
+// CHECK: vpord  (%rcx){1to4}, %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x10,0xeb,0x21]
+          vpord  (%rcx){1to4}, %xmm29, %xmm28
+
+// CHECK: vpord  2032(%rdx), %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xeb,0x62,0x7f]
+          vpord  2032(%rdx), %xmm29, %xmm28
+
+// CHECK: vpord  2048(%rdx), %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xeb,0xa2,0x00,0x08,0x00,0x00]
+          vpord  2048(%rdx), %xmm29, %xmm28
+
+// CHECK: vpord  -2048(%rdx), %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xeb,0x62,0x80]
+          vpord  -2048(%rdx), %xmm29, %xmm28
+
+// CHECK: vpord  -2064(%rdx), %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x00,0xeb,0xa2,0xf0,0xf7,0xff,0xff]
+          vpord  -2064(%rdx), %xmm29, %xmm28
+
+// CHECK: vpord  508(%rdx){1to4}, %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x10,0xeb,0x62,0x7f]
+          vpord  508(%rdx){1to4}, %xmm29, %xmm28
+
+// CHECK: vpord  512(%rdx){1to4}, %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x10,0xeb,0xa2,0x00,0x02,0x00,0x00]
+          vpord  512(%rdx){1to4}, %xmm29, %xmm28
+
+// CHECK: vpord  -512(%rdx){1to4}, %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x10,0xeb,0x62,0x80]
+          vpord  -512(%rdx){1to4}, %xmm29, %xmm28
+
+// CHECK: vpord  -516(%rdx){1to4}, %xmm29, %xmm28
+// CHECK:  encoding: [0x62,0x61,0x15,0x10,0xeb,0xa2,0xfc,0xfd,0xff,0xff]
+          vpord  -516(%rdx){1to4}, %xmm29, %xmm28
+
+// CHECK: vpord  %ymm22, %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xa1,0x35,0x20,0xeb,0xee]
+          vpord  %ymm22, %ymm25, %ymm21
+
+// CHECK: vpord  %ymm22, %ymm25, %ymm21 {%k6}
+// CHECK:  encoding: [0x62,0xa1,0x35,0x26,0xeb,0xee]
+          vpord  %ymm22, %ymm25, %ymm21 {%k6}
+
+// CHECK: vpord  %ymm22, %ymm25, %ymm21 {%k6} {z}
+// CHECK:  encoding: [0x62,0xa1,0x35,0xa6,0xeb,0xee]
+          vpord  %ymm22, %ymm25, %ymm21 {%k6} {z}
+
+// CHECK: vpord  (%rcx), %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x20,0xeb,0x29]
+          vpord  (%rcx), %ymm25, %ymm21
+
+// CHECK: vpord  291(%rax,%r14,8), %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xa1,0x35,0x20,0xeb,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpord  291(%rax,%r14,8), %ymm25, %ymm21
+
+// CHECK: vpord  (%rcx){1to8}, %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x30,0xeb,0x29]
+          vpord  (%rcx){1to8}, %ymm25, %ymm21
+
+// CHECK: vpord  4064(%rdx), %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x20,0xeb,0x6a,0x7f]
+          vpord  4064(%rdx), %ymm25, %ymm21
+
+// CHECK: vpord  4096(%rdx), %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x20,0xeb,0xaa,0x00,0x10,0x00,0x00]
+          vpord  4096(%rdx), %ymm25, %ymm21
+
+// CHECK: vpord  -4096(%rdx), %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x20,0xeb,0x6a,0x80]
+          vpord  -4096(%rdx), %ymm25, %ymm21
+
+// CHECK: vpord  -4128(%rdx), %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x20,0xeb,0xaa,0xe0,0xef,0xff,0xff]
+          vpord  -4128(%rdx), %ymm25, %ymm21
+
+// CHECK: vpord  508(%rdx){1to8}, %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x30,0xeb,0x6a,0x7f]
+          vpord  508(%rdx){1to8}, %ymm25, %ymm21
+
+// CHECK: vpord  512(%rdx){1to8}, %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x30,0xeb,0xaa,0x00,0x02,0x00,0x00]
+          vpord  512(%rdx){1to8}, %ymm25, %ymm21
+
+// CHECK: vpord  -512(%rdx){1to8}, %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x30,0xeb,0x6a,0x80]
+          vpord  -512(%rdx){1to8}, %ymm25, %ymm21
+
+// CHECK: vpord  -516(%rdx){1to8}, %ymm25, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x35,0x30,0xeb,0xaa,0xfc,0xfd,0xff,0xff]
+          vpord  -516(%rdx){1to8}, %ymm25, %ymm21
+
+// CHECK: vporq  %xmm20, %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xa1,0x9d,0x00,0xeb,0xec]
+          vporq  %xmm20, %xmm28, %xmm21
+
+// CHECK: vporq  %xmm20, %xmm28, %xmm21 {%k7}
+// CHECK:  encoding: [0x62,0xa1,0x9d,0x07,0xeb,0xec]
+          vporq  %xmm20, %xmm28, %xmm21 {%k7}
+
+// CHECK: vporq  %xmm20, %xmm28, %xmm21 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa1,0x9d,0x87,0xeb,0xec]
+          vporq  %xmm20, %xmm28, %xmm21 {%k7} {z}
+
+// CHECK: vporq  (%rcx), %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xeb,0x29]
+          vporq  (%rcx), %xmm28, %xmm21
+
+// CHECK: vporq  291(%rax,%r14,8), %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xa1,0x9d,0x00,0xeb,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vporq  291(%rax,%r14,8), %xmm28, %xmm21
+
+// CHECK: vporq  (%rcx){1to2}, %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xeb,0x29]
+          vporq  (%rcx){1to2}, %xmm28, %xmm21
+
+// CHECK: vporq  2032(%rdx), %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xeb,0x6a,0x7f]
+          vporq  2032(%rdx), %xmm28, %xmm21
+
+// CHECK: vporq  2048(%rdx), %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xeb,0xaa,0x00,0x08,0x00,0x00]
+          vporq  2048(%rdx), %xmm28, %xmm21
+
+// CHECK: vporq  -2048(%rdx), %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xeb,0x6a,0x80]
+          vporq  -2048(%rdx), %xmm28, %xmm21
+
+// CHECK: vporq  -2064(%rdx), %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x00,0xeb,0xaa,0xf0,0xf7,0xff,0xff]
+          vporq  -2064(%rdx), %xmm28, %xmm21
+
+// CHECK: vporq  1016(%rdx){1to2}, %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xeb,0x6a,0x7f]
+          vporq  1016(%rdx){1to2}, %xmm28, %xmm21
+
+// CHECK: vporq  1024(%rdx){1to2}, %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xeb,0xaa,0x00,0x04,0x00,0x00]
+          vporq  1024(%rdx){1to2}, %xmm28, %xmm21
+
+// CHECK: vporq  -1024(%rdx){1to2}, %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xeb,0x6a,0x80]
+          vporq  -1024(%rdx){1to2}, %xmm28, %xmm21
+
+// CHECK: vporq  -1032(%rdx){1to2}, %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0xe1,0x9d,0x10,0xeb,0xaa,0xf8,0xfb,0xff,0xff]
+          vporq  -1032(%rdx){1to2}, %xmm28, %xmm21
+
+// CHECK: vporq  %ymm24, %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x01,0xe5,0x20,0xeb,0xe0]
+          vporq  %ymm24, %ymm19, %ymm28
+
+// CHECK: vporq  %ymm24, %ymm19, %ymm28 {%k4}
+// CHECK:  encoding: [0x62,0x01,0xe5,0x24,0xeb,0xe0]
+          vporq  %ymm24, %ymm19, %ymm28 {%k4}
+
+// CHECK: vporq  %ymm24, %ymm19, %ymm28 {%k4} {z}
+// CHECK:  encoding: [0x62,0x01,0xe5,0xa4,0xeb,0xe0]
+          vporq  %ymm24, %ymm19, %ymm28 {%k4} {z}
+
+// CHECK: vporq  (%rcx), %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x20,0xeb,0x21]
+          vporq  (%rcx), %ymm19, %ymm28
+
+// CHECK: vporq  291(%rax,%r14,8), %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x21,0xe5,0x20,0xeb,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vporq  291(%rax,%r14,8), %ymm19, %ymm28
+
+// CHECK: vporq  (%rcx){1to4}, %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x30,0xeb,0x21]
+          vporq  (%rcx){1to4}, %ymm19, %ymm28
+
+// CHECK: vporq  4064(%rdx), %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x20,0xeb,0x62,0x7f]
+          vporq  4064(%rdx), %ymm19, %ymm28
+
+// CHECK: vporq  4096(%rdx), %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x20,0xeb,0xa2,0x00,0x10,0x00,0x00]
+          vporq  4096(%rdx), %ymm19, %ymm28
+
+// CHECK: vporq  -4096(%rdx), %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x20,0xeb,0x62,0x80]
+          vporq  -4096(%rdx), %ymm19, %ymm28
+
+// CHECK: vporq  -4128(%rdx), %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x20,0xeb,0xa2,0xe0,0xef,0xff,0xff]
+          vporq  -4128(%rdx), %ymm19, %ymm28
+
+// CHECK: vporq  1016(%rdx){1to4}, %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x30,0xeb,0x62,0x7f]
+          vporq  1016(%rdx){1to4}, %ymm19, %ymm28
+
+// CHECK: vporq  1024(%rdx){1to4}, %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x30,0xeb,0xa2,0x00,0x04,0x00,0x00]
+          vporq  1024(%rdx){1to4}, %ymm19, %ymm28
+
+// CHECK: vporq  -1024(%rdx){1to4}, %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x30,0xeb,0x62,0x80]
+          vporq  -1024(%rdx){1to4}, %ymm19, %ymm28
+
+// CHECK: vporq  -1032(%rdx){1to4}, %ymm19, %ymm28
+// CHECK:  encoding: [0x62,0x61,0xe5,0x30,0xeb,0xa2,0xf8,0xfb,0xff,0xff]
+          vporq  -1032(%rdx){1to4}, %ymm19, %ymm28
+
+// CHECK: vpsubd %xmm26, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0x81,0x6d,0x00,0xfa,0xda]
+          vpsubd %xmm26, %xmm18, %xmm19
+
+// CHECK: vpsubd %xmm26, %xmm18, %xmm19 {%k2}
+// CHECK:  encoding: [0x62,0x81,0x6d,0x02,0xfa,0xda]
+          vpsubd %xmm26, %xmm18, %xmm19 {%k2}
+
+// CHECK: vpsubd %xmm26, %xmm18, %xmm19 {%k2} {z}
+// CHECK:  encoding: [0x62,0x81,0x6d,0x82,0xfa,0xda]
+          vpsubd %xmm26, %xmm18, %xmm19 {%k2} {z}
+
+// CHECK: vpsubd (%rcx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x00,0xfa,0x19]
+          vpsubd (%rcx), %xmm18, %xmm19
+
+// CHECK: vpsubd 291(%rax,%r14,8), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xa1,0x6d,0x00,0xfa,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vpsubd 291(%rax,%r14,8), %xmm18, %xmm19
+
+// CHECK: vpsubd (%rcx){1to4}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x10,0xfa,0x19]
+          vpsubd (%rcx){1to4}, %xmm18, %xmm19
+
+// CHECK: vpsubd 2032(%rdx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x00,0xfa,0x5a,0x7f]
+          vpsubd 2032(%rdx), %xmm18, %xmm19
+
+// CHECK: vpsubd 2048(%rdx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x00,0xfa,0x9a,0x00,0x08,0x00,0x00]
+          vpsubd 2048(%rdx), %xmm18, %xmm19
+
+// CHECK: vpsubd -2048(%rdx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x00,0xfa,0x5a,0x80]
+          vpsubd -2048(%rdx), %xmm18, %xmm19
+
+// CHECK: vpsubd -2064(%rdx), %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x00,0xfa,0x9a,0xf0,0xf7,0xff,0xff]
+          vpsubd -2064(%rdx), %xmm18, %xmm19
+
+// CHECK: vpsubd 508(%rdx){1to4}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x10,0xfa,0x5a,0x7f]
+          vpsubd 508(%rdx){1to4}, %xmm18, %xmm19
+
+// CHECK: vpsubd 512(%rdx){1to4}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x10,0xfa,0x9a,0x00,0x02,0x00,0x00]
+          vpsubd 512(%rdx){1to4}, %xmm18, %xmm19
+
+// CHECK: vpsubd -512(%rdx){1to4}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x10,0xfa,0x5a,0x80]
+          vpsubd -512(%rdx){1to4}, %xmm18, %xmm19
+
+// CHECK: vpsubd -516(%rdx){1to4}, %xmm18, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x6d,0x10,0xfa,0x9a,0xfc,0xfd,0xff,0xff]
+          vpsubd -516(%rdx){1to4}, %xmm18, %xmm19
+
+// CHECK: vpsubd %ymm21, %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x21,0x2d,0x20,0xfa,0xc5]
+          vpsubd %ymm21, %ymm26, %ymm24
+
+// CHECK: vpsubd %ymm21, %ymm26, %ymm24 {%k1}
+// CHECK:  encoding: [0x62,0x21,0x2d,0x21,0xfa,0xc5]
+          vpsubd %ymm21, %ymm26, %ymm24 {%k1}
+
+// CHECK: vpsubd %ymm21, %ymm26, %ymm24 {%k1} {z}
+// CHECK:  encoding: [0x62,0x21,0x2d,0xa1,0xfa,0xc5]
+          vpsubd %ymm21, %ymm26, %ymm24 {%k1} {z}
+
+// CHECK: vpsubd (%rcx), %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xfa,0x01]
+          vpsubd (%rcx), %ymm26, %ymm24
+
+// CHECK: vpsubd 291(%rax,%r14,8), %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x21,0x2d,0x20,0xfa,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vpsubd 291(%rax,%r14,8), %ymm26, %ymm24
+
+// CHECK: vpsubd (%rcx){1to8}, %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x30,0xfa,0x01]
+          vpsubd (%rcx){1to8}, %ymm26, %ymm24
+
+// CHECK: vpsubd 4064(%rdx), %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xfa,0x42,0x7f]
+          vpsubd 4064(%rdx), %ymm26, %ymm24
+
+// CHECK: vpsubd 4096(%rdx), %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xfa,0x82,0x00,0x10,0x00,0x00]
+          vpsubd 4096(%rdx), %ymm26, %ymm24
+
+// CHECK: vpsubd -4096(%rdx), %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xfa,0x42,0x80]
+          vpsubd -4096(%rdx), %ymm26, %ymm24
+
+// CHECK: vpsubd -4128(%rdx), %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x20,0xfa,0x82,0xe0,0xef,0xff,0xff]
+          vpsubd -4128(%rdx), %ymm26, %ymm24
+
+// CHECK: vpsubd 508(%rdx){1to8}, %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x30,0xfa,0x42,0x7f]
+          vpsubd 508(%rdx){1to8}, %ymm26, %ymm24
+
+// CHECK: vpsubd 512(%rdx){1to8}, %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x30,0xfa,0x82,0x00,0x02,0x00,0x00]
+          vpsubd 512(%rdx){1to8}, %ymm26, %ymm24
+
+// CHECK: vpsubd -512(%rdx){1to8}, %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x30,0xfa,0x42,0x80]
+          vpsubd -512(%rdx){1to8}, %ymm26, %ymm24
+
+// CHECK: vpsubd -516(%rdx){1to8}, %ymm26, %ymm24
+// CHECK:  encoding: [0x62,0x61,0x2d,0x30,0xfa,0x82,0xfc,0xfd,0xff,0xff]
+          vpsubd -516(%rdx){1to8}, %ymm26, %ymm24
+
+// CHECK: vpsubq %xmm27, %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0x81,0xa5,0x00,0xfb,0xe3]
+          vpsubq %xmm27, %xmm27, %xmm20
+
+// CHECK: vpsubq %xmm27, %xmm27, %xmm20 {%k2}
+// CHECK:  encoding: [0x62,0x81,0xa5,0x02,0xfb,0xe3]
+          vpsubq %xmm27, %xmm27, %xmm20 {%k2}
+
+// CHECK: vpsubq %xmm27, %xmm27, %xmm20 {%k2} {z}
+// CHECK:  encoding: [0x62,0x81,0xa5,0x82,0xfb,0xe3]
+          vpsubq %xmm27, %xmm27, %xmm20 {%k2} {z}
+
+// CHECK: vpsubq (%rcx), %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x00,0xfb,0x21]
+          vpsubq (%rcx), %xmm27, %xmm20
+
+// CHECK: vpsubq 291(%rax,%r14,8), %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xa1,0xa5,0x00,0xfb,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpsubq 291(%rax,%r14,8), %xmm27, %xmm20
+
+// CHECK: vpsubq (%rcx){1to2}, %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x10,0xfb,0x21]
+          vpsubq (%rcx){1to2}, %xmm27, %xmm20
+
+// CHECK: vpsubq 2032(%rdx), %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x00,0xfb,0x62,0x7f]
+          vpsubq 2032(%rdx), %xmm27, %xmm20
+
+// CHECK: vpsubq 2048(%rdx), %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x00,0xfb,0xa2,0x00,0x08,0x00,0x00]
+          vpsubq 2048(%rdx), %xmm27, %xmm20
+
+// CHECK: vpsubq -2048(%rdx), %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x00,0xfb,0x62,0x80]
+          vpsubq -2048(%rdx), %xmm27, %xmm20
+
+// CHECK: vpsubq -2064(%rdx), %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x00,0xfb,0xa2,0xf0,0xf7,0xff,0xff]
+          vpsubq -2064(%rdx), %xmm27, %xmm20
+
+// CHECK: vpsubq 1016(%rdx){1to2}, %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x10,0xfb,0x62,0x7f]
+          vpsubq 1016(%rdx){1to2}, %xmm27, %xmm20
+
+// CHECK: vpsubq 1024(%rdx){1to2}, %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x10,0xfb,0xa2,0x00,0x04,0x00,0x00]
+          vpsubq 1024(%rdx){1to2}, %xmm27, %xmm20
+
+// CHECK: vpsubq -1024(%rdx){1to2}, %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x10,0xfb,0x62,0x80]
+          vpsubq -1024(%rdx){1to2}, %xmm27, %xmm20
+
+// CHECK: vpsubq -1032(%rdx){1to2}, %xmm27, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0xa5,0x10,0xfb,0xa2,0xf8,0xfb,0xff,0xff]
+          vpsubq -1032(%rdx){1to2}, %xmm27, %xmm20
+
+// CHECK: vpsubq %ymm28, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x01,0xdd,0x20,0xfb,0xec]
+          vpsubq %ymm28, %ymm20, %ymm29
+
+// CHECK: vpsubq %ymm28, %ymm20, %ymm29 {%k5}
+// CHECK:  encoding: [0x62,0x01,0xdd,0x25,0xfb,0xec]
+          vpsubq %ymm28, %ymm20, %ymm29 {%k5}
+
+// CHECK: vpsubq %ymm28, %ymm20, %ymm29 {%k5} {z}
+// CHECK:  encoding: [0x62,0x01,0xdd,0xa5,0xfb,0xec]
+          vpsubq %ymm28, %ymm20, %ymm29 {%k5} {z}
+
+// CHECK: vpsubq (%rcx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xfb,0x29]
+          vpsubq (%rcx), %ymm20, %ymm29
+
+// CHECK: vpsubq 291(%rax,%r14,8), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x21,0xdd,0x20,0xfb,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpsubq 291(%rax,%r14,8), %ymm20, %ymm29
+
+// CHECK: vpsubq (%rcx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xfb,0x29]
+          vpsubq (%rcx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpsubq 4064(%rdx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xfb,0x6a,0x7f]
+          vpsubq 4064(%rdx), %ymm20, %ymm29
+
+// CHECK: vpsubq 4096(%rdx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xfb,0xaa,0x00,0x10,0x00,0x00]
+          vpsubq 4096(%rdx), %ymm20, %ymm29
+
+// CHECK: vpsubq -4096(%rdx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xfb,0x6a,0x80]
+          vpsubq -4096(%rdx), %ymm20, %ymm29
+
+// CHECK: vpsubq -4128(%rdx), %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x20,0xfb,0xaa,0xe0,0xef,0xff,0xff]
+          vpsubq -4128(%rdx), %ymm20, %ymm29
+
+// CHECK: vpsubq 1016(%rdx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xfb,0x6a,0x7f]
+          vpsubq 1016(%rdx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpsubq 1024(%rdx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xfb,0xaa,0x00,0x04,0x00,0x00]
+          vpsubq 1024(%rdx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpsubq -1024(%rdx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xfb,0x6a,0x80]
+          vpsubq -1024(%rdx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpsubq -1032(%rdx){1to4}, %ymm20, %ymm29
+// CHECK:  encoding: [0x62,0x61,0xdd,0x30,0xfb,0xaa,0xf8,0xfb,0xff,0xff]
+          vpsubq -1032(%rdx){1to4}, %ymm20, %ymm29
+
+// CHECK: vpxord %xmm25, %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0x81,0x75,0x00,0xef,0xf9]
+          vpxord %xmm25, %xmm17, %xmm23
+
+// CHECK: vpxord %xmm25, %xmm17, %xmm23 {%k3}
+// CHECK:  encoding: [0x62,0x81,0x75,0x03,0xef,0xf9]
+          vpxord %xmm25, %xmm17, %xmm23 {%k3}
+
+// CHECK: vpxord %xmm25, %xmm17, %xmm23 {%k3} {z}
+// CHECK:  encoding: [0x62,0x81,0x75,0x83,0xef,0xf9]
+          vpxord %xmm25, %xmm17, %xmm23 {%k3} {z}
+
+// CHECK: vpxord (%rcx), %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xef,0x39]
+          vpxord (%rcx), %xmm17, %xmm23
+
+// CHECK: vpxord 291(%rax,%r14,8), %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xa1,0x75,0x00,0xef,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vpxord 291(%rax,%r14,8), %xmm17, %xmm23
+
+// CHECK: vpxord (%rcx){1to4}, %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x10,0xef,0x39]
+          vpxord (%rcx){1to4}, %xmm17, %xmm23
+
+// CHECK: vpxord 2032(%rdx), %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xef,0x7a,0x7f]
+          vpxord 2032(%rdx), %xmm17, %xmm23
+
+// CHECK: vpxord 2048(%rdx), %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xef,0xba,0x00,0x08,0x00,0x00]
+          vpxord 2048(%rdx), %xmm17, %xmm23
+
+// CHECK: vpxord -2048(%rdx), %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xef,0x7a,0x80]
+          vpxord -2048(%rdx), %xmm17, %xmm23
+
+// CHECK: vpxord -2064(%rdx), %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x00,0xef,0xba,0xf0,0xf7,0xff,0xff]
+          vpxord -2064(%rdx), %xmm17, %xmm23
+
+// CHECK: vpxord 508(%rdx){1to4}, %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x10,0xef,0x7a,0x7f]
+          vpxord 508(%rdx){1to4}, %xmm17, %xmm23
+
+// CHECK: vpxord 512(%rdx){1to4}, %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x10,0xef,0xba,0x00,0x02,0x00,0x00]
+          vpxord 512(%rdx){1to4}, %xmm17, %xmm23
+
+// CHECK: vpxord -512(%rdx){1to4}, %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x10,0xef,0x7a,0x80]
+          vpxord -512(%rdx){1to4}, %xmm17, %xmm23
+
+// CHECK: vpxord -516(%rdx){1to4}, %xmm17, %xmm23
+// CHECK:  encoding: [0x62,0xe1,0x75,0x10,0xef,0xba,0xfc,0xfd,0xff,0xff]
+          vpxord -516(%rdx){1to4}, %xmm17, %xmm23
+
+// CHECK: vpxord %ymm22, %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xa1,0x15,0x20,0xef,0xf6]
+          vpxord %ymm22, %ymm29, %ymm22
+
+// CHECK: vpxord %ymm22, %ymm29, %ymm22 {%k4}
+// CHECK:  encoding: [0x62,0xa1,0x15,0x24,0xef,0xf6]
+          vpxord %ymm22, %ymm29, %ymm22 {%k4}
+
+// CHECK: vpxord %ymm22, %ymm29, %ymm22 {%k4} {z}
+// CHECK:  encoding: [0x62,0xa1,0x15,0xa4,0xef,0xf6]
+          vpxord %ymm22, %ymm29, %ymm22 {%k4} {z}
+
+// CHECK: vpxord (%rcx), %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xef,0x31]
+          vpxord (%rcx), %ymm29, %ymm22
+
+// CHECK: vpxord 291(%rax,%r14,8), %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xa1,0x15,0x20,0xef,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vpxord 291(%rax,%r14,8), %ymm29, %ymm22
+
+// CHECK: vpxord (%rcx){1to8}, %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x30,0xef,0x31]
+          vpxord (%rcx){1to8}, %ymm29, %ymm22
+
+// CHECK: vpxord 4064(%rdx), %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xef,0x72,0x7f]
+          vpxord 4064(%rdx), %ymm29, %ymm22
+
+// CHECK: vpxord 4096(%rdx), %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xef,0xb2,0x00,0x10,0x00,0x00]
+          vpxord 4096(%rdx), %ymm29, %ymm22
+
+// CHECK: vpxord -4096(%rdx), %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xef,0x72,0x80]
+          vpxord -4096(%rdx), %ymm29, %ymm22
+
+// CHECK: vpxord -4128(%rdx), %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x20,0xef,0xb2,0xe0,0xef,0xff,0xff]
+          vpxord -4128(%rdx), %ymm29, %ymm22
+
+// CHECK: vpxord 508(%rdx){1to8}, %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x30,0xef,0x72,0x7f]
+          vpxord 508(%rdx){1to8}, %ymm29, %ymm22
+
+// CHECK: vpxord 512(%rdx){1to8}, %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x30,0xef,0xb2,0x00,0x02,0x00,0x00]
+          vpxord 512(%rdx){1to8}, %ymm29, %ymm22
+
+// CHECK: vpxord -512(%rdx){1to8}, %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x30,0xef,0x72,0x80]
+          vpxord -512(%rdx){1to8}, %ymm29, %ymm22
+
+// CHECK: vpxord -516(%rdx){1to8}, %ymm29, %ymm22
+// CHECK:  encoding: [0x62,0xe1,0x15,0x30,0xef,0xb2,0xfc,0xfd,0xff,0xff]
+          vpxord -516(%rdx){1to8}, %ymm29, %ymm22
+
+// CHECK: vpxorq %xmm18, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x21,0xf5,0x00,0xef,0xd2]
+          vpxorq %xmm18, %xmm17, %xmm26
+
+// CHECK: vpxorq %xmm18, %xmm17, %xmm26 {%k2}
+// CHECK:  encoding: [0x62,0x21,0xf5,0x02,0xef,0xd2]
+          vpxorq %xmm18, %xmm17, %xmm26 {%k2}
+
+// CHECK: vpxorq %xmm18, %xmm17, %xmm26 {%k2} {z}
+// CHECK:  encoding: [0x62,0x21,0xf5,0x82,0xef,0xd2]
+          vpxorq %xmm18, %xmm17, %xmm26 {%k2} {z}
+
+// CHECK: vpxorq (%rcx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xef,0x11]
+          vpxorq (%rcx), %xmm17, %xmm26
+
+// CHECK: vpxorq 291(%rax,%r14,8), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x21,0xf5,0x00,0xef,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vpxorq 291(%rax,%r14,8), %xmm17, %xmm26
+
+// CHECK: vpxorq (%rcx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xef,0x11]
+          vpxorq (%rcx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpxorq 2032(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xef,0x52,0x7f]
+          vpxorq 2032(%rdx), %xmm17, %xmm26
+
+// CHECK: vpxorq 2048(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xef,0x92,0x00,0x08,0x00,0x00]
+          vpxorq 2048(%rdx), %xmm17, %xmm26
+
+// CHECK: vpxorq -2048(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xef,0x52,0x80]
+          vpxorq -2048(%rdx), %xmm17, %xmm26
+
+// CHECK: vpxorq -2064(%rdx), %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x00,0xef,0x92,0xf0,0xf7,0xff,0xff]
+          vpxorq -2064(%rdx), %xmm17, %xmm26
+
+// CHECK: vpxorq 1016(%rdx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xef,0x52,0x7f]
+          vpxorq 1016(%rdx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpxorq 1024(%rdx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xef,0x92,0x00,0x04,0x00,0x00]
+          vpxorq 1024(%rdx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpxorq -1024(%rdx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xef,0x52,0x80]
+          vpxorq -1024(%rdx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpxorq -1032(%rdx){1to2}, %xmm17, %xmm26
+// CHECK:  encoding: [0x62,0x61,0xf5,0x10,0xef,0x92,0xf8,0xfb,0xff,0xff]
+          vpxorq -1032(%rdx){1to2}, %xmm17, %xmm26
+
+// CHECK: vpxorq %ymm19, %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xa1,0xed,0x20,0xef,0xeb]
+          vpxorq %ymm19, %ymm18, %ymm21
+
+// CHECK: vpxorq %ymm19, %ymm18, %ymm21 {%k7}
+// CHECK:  encoding: [0x62,0xa1,0xed,0x27,0xef,0xeb]
+          vpxorq %ymm19, %ymm18, %ymm21 {%k7}
+
+// CHECK: vpxorq %ymm19, %ymm18, %ymm21 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa1,0xed,0xa7,0xef,0xeb]
+          vpxorq %ymm19, %ymm18, %ymm21 {%k7} {z}
+
+// CHECK: vpxorq (%rcx), %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x20,0xef,0x29]
+          vpxorq (%rcx), %ymm18, %ymm21
+
+// CHECK: vpxorq 291(%rax,%r14,8), %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xa1,0xed,0x20,0xef,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vpxorq 291(%rax,%r14,8), %ymm18, %ymm21
+
+// CHECK: vpxorq (%rcx){1to4}, %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x30,0xef,0x29]
+          vpxorq (%rcx){1to4}, %ymm18, %ymm21
+
+// CHECK: vpxorq 4064(%rdx), %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x20,0xef,0x6a,0x7f]
+          vpxorq 4064(%rdx), %ymm18, %ymm21
+
+// CHECK: vpxorq 4096(%rdx), %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x20,0xef,0xaa,0x00,0x10,0x00,0x00]
+          vpxorq 4096(%rdx), %ymm18, %ymm21
+
+// CHECK: vpxorq -4096(%rdx), %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x20,0xef,0x6a,0x80]
+          vpxorq -4096(%rdx), %ymm18, %ymm21
+
+// CHECK: vpxorq -4128(%rdx), %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x20,0xef,0xaa,0xe0,0xef,0xff,0xff]
+          vpxorq -4128(%rdx), %ymm18, %ymm21
+
+// CHECK: vpxorq 1016(%rdx){1to4}, %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x30,0xef,0x6a,0x7f]
+          vpxorq 1016(%rdx){1to4}, %ymm18, %ymm21
+
+// CHECK: vpxorq 1024(%rdx){1to4}, %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x30,0xef,0xaa,0x00,0x04,0x00,0x00]
+          vpxorq 1024(%rdx){1to4}, %ymm18, %ymm21
+
+// CHECK: vpxorq -1024(%rdx){1to4}, %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x30,0xef,0x6a,0x80]
+          vpxorq -1024(%rdx){1to4}, %ymm18, %ymm21
+
+// CHECK: vpxorq -1032(%rdx){1to4}, %ymm18, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0xed,0x30,0xef,0xaa,0xf8,0xfb,0xff,0xff]
+          vpxorq -1032(%rdx){1to4}, %ymm18, %ymm21
+
+// CHECK: vrcp14pd %xmm29, %xmm18
+// CHECK:  encoding: [0x62,0x82,0xfd,0x08,0x4c,0xd5]
+          vrcp14pd %xmm29, %xmm18
+
+// CHECK: vrcp14pd %xmm29, %xmm18 {%k4}
+// CHECK:  encoding: [0x62,0x82,0xfd,0x0c,0x4c,0xd5]
+          vrcp14pd %xmm29, %xmm18 {%k4}
+
+// CHECK: vrcp14pd %xmm29, %xmm18 {%k4} {z}
+// CHECK:  encoding: [0x62,0x82,0xfd,0x8c,0x4c,0xd5]
+          vrcp14pd %xmm29, %xmm18 {%k4} {z}
+
+// CHECK: vrcp14pd (%rcx), %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4c,0x11]
+          vrcp14pd (%rcx), %xmm18
+
+// CHECK: vrcp14pd 291(%rax,%r14,8), %xmm18
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x08,0x4c,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vrcp14pd 291(%rax,%r14,8), %xmm18
+
+// CHECK: vrcp14pd (%rcx){1to2}, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4c,0x11]
+          vrcp14pd (%rcx){1to2}, %xmm18
+
+// CHECK: vrcp14pd 2032(%rdx), %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4c,0x52,0x7f]
+          vrcp14pd 2032(%rdx), %xmm18
+
+// CHECK: vrcp14pd 2048(%rdx), %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4c,0x92,0x00,0x08,0x00,0x00]
+          vrcp14pd 2048(%rdx), %xmm18
+
+// CHECK: vrcp14pd -2048(%rdx), %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4c,0x52,0x80]
+          vrcp14pd -2048(%rdx), %xmm18
+
+// CHECK: vrcp14pd -2064(%rdx), %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4c,0x92,0xf0,0xf7,0xff,0xff]
+          vrcp14pd -2064(%rdx), %xmm18
+
+// CHECK: vrcp14pd 1016(%rdx){1to2}, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4c,0x52,0x7f]
+          vrcp14pd 1016(%rdx){1to2}, %xmm18
+
+// CHECK: vrcp14pd 1024(%rdx){1to2}, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4c,0x92,0x00,0x04,0x00,0x00]
+          vrcp14pd 1024(%rdx){1to2}, %xmm18
+
+// CHECK: vrcp14pd -1024(%rdx){1to2}, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4c,0x52,0x80]
+          vrcp14pd -1024(%rdx){1to2}, %xmm18
+
+// CHECK: vrcp14pd -1032(%rdx){1to2}, %xmm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4c,0x92,0xf8,0xfb,0xff,0xff]
+          vrcp14pd -1032(%rdx){1to2}, %xmm18
+
+// CHECK: vrcp14pd %ymm29, %ymm17
+// CHECK:  encoding: [0x62,0x82,0xfd,0x28,0x4c,0xcd]
+          vrcp14pd %ymm29, %ymm17
+
+// CHECK: vrcp14pd %ymm29, %ymm17 {%k4}
+// CHECK:  encoding: [0x62,0x82,0xfd,0x2c,0x4c,0xcd]
+          vrcp14pd %ymm29, %ymm17 {%k4}
+
+// CHECK: vrcp14pd %ymm29, %ymm17 {%k4} {z}
+// CHECK:  encoding: [0x62,0x82,0xfd,0xac,0x4c,0xcd]
+          vrcp14pd %ymm29, %ymm17 {%k4} {z}
+
+// CHECK: vrcp14pd (%rcx), %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4c,0x09]
+          vrcp14pd (%rcx), %ymm17
+
+// CHECK: vrcp14pd 291(%rax,%r14,8), %ymm17
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x28,0x4c,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vrcp14pd 291(%rax,%r14,8), %ymm17
+
+// CHECK: vrcp14pd (%rcx){1to4}, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4c,0x09]
+          vrcp14pd (%rcx){1to4}, %ymm17
+
+// CHECK: vrcp14pd 4064(%rdx), %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4c,0x4a,0x7f]
+          vrcp14pd 4064(%rdx), %ymm17
+
+// CHECK: vrcp14pd 4096(%rdx), %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4c,0x8a,0x00,0x10,0x00,0x00]
+          vrcp14pd 4096(%rdx), %ymm17
+
+// CHECK: vrcp14pd -4096(%rdx), %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4c,0x4a,0x80]
+          vrcp14pd -4096(%rdx), %ymm17
+
+// CHECK: vrcp14pd -4128(%rdx), %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4c,0x8a,0xe0,0xef,0xff,0xff]
+          vrcp14pd -4128(%rdx), %ymm17
+
+// CHECK: vrcp14pd 1016(%rdx){1to4}, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4c,0x4a,0x7f]
+          vrcp14pd 1016(%rdx){1to4}, %ymm17
+
+// CHECK: vrcp14pd 1024(%rdx){1to4}, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4c,0x8a,0x00,0x04,0x00,0x00]
+          vrcp14pd 1024(%rdx){1to4}, %ymm17
+
+// CHECK: vrcp14pd -1024(%rdx){1to4}, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4c,0x4a,0x80]
+          vrcp14pd -1024(%rdx){1to4}, %ymm17
+
+// CHECK: vrcp14pd -1032(%rdx){1to4}, %ymm17
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4c,0x8a,0xf8,0xfb,0xff,0xff]
+          vrcp14pd -1032(%rdx){1to4}, %ymm17
+
+// CHECK: vrcp14ps %xmm28, %xmm27
+// CHECK:  encoding: [0x62,0x02,0x7d,0x08,0x4c,0xdc]
+          vrcp14ps %xmm28, %xmm27
+
+// CHECK: vrcp14ps %xmm28, %xmm27 {%k4}
+// CHECK:  encoding: [0x62,0x02,0x7d,0x0c,0x4c,0xdc]
+          vrcp14ps %xmm28, %xmm27 {%k4}
+
+// CHECK: vrcp14ps %xmm28, %xmm27 {%k4} {z}
+// CHECK:  encoding: [0x62,0x02,0x7d,0x8c,0x4c,0xdc]
+          vrcp14ps %xmm28, %xmm27 {%k4} {z}
+
+// CHECK: vrcp14ps (%rcx), %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x4c,0x19]
+          vrcp14ps (%rcx), %xmm27
+
+// CHECK: vrcp14ps 291(%rax,%r14,8), %xmm27
+// CHECK:  encoding: [0x62,0x22,0x7d,0x08,0x4c,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vrcp14ps 291(%rax,%r14,8), %xmm27
+
+// CHECK: vrcp14ps (%rcx){1to4}, %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x18,0x4c,0x19]
+          vrcp14ps (%rcx){1to4}, %xmm27
+
+// CHECK: vrcp14ps 2032(%rdx), %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x4c,0x5a,0x7f]
+          vrcp14ps 2032(%rdx), %xmm27
+
+// CHECK: vrcp14ps 2048(%rdx), %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x4c,0x9a,0x00,0x08,0x00,0x00]
+          vrcp14ps 2048(%rdx), %xmm27
+
+// CHECK: vrcp14ps -2048(%rdx), %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x4c,0x5a,0x80]
+          vrcp14ps -2048(%rdx), %xmm27
+
+// CHECK: vrcp14ps -2064(%rdx), %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x08,0x4c,0x9a,0xf0,0xf7,0xff,0xff]
+          vrcp14ps -2064(%rdx), %xmm27
+
+// CHECK: vrcp14ps 508(%rdx){1to4}, %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x18,0x4c,0x5a,0x7f]
+          vrcp14ps 508(%rdx){1to4}, %xmm27
+
+// CHECK: vrcp14ps 512(%rdx){1to4}, %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x18,0x4c,0x9a,0x00,0x02,0x00,0x00]
+          vrcp14ps 512(%rdx){1to4}, %xmm27
+
+// CHECK: vrcp14ps -512(%rdx){1to4}, %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x18,0x4c,0x5a,0x80]
+          vrcp14ps -512(%rdx){1to4}, %xmm27
+
+// CHECK: vrcp14ps -516(%rdx){1to4}, %xmm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x18,0x4c,0x9a,0xfc,0xfd,0xff,0xff]
+          vrcp14ps -516(%rdx){1to4}, %xmm27
+
+// CHECK: vrcp14ps %ymm21, %ymm29
+// CHECK:  encoding: [0x62,0x22,0x7d,0x28,0x4c,0xed]
+          vrcp14ps %ymm21, %ymm29
+
+// CHECK: vrcp14ps %ymm21, %ymm29 {%k7}
+// CHECK:  encoding: [0x62,0x22,0x7d,0x2f,0x4c,0xed]
+          vrcp14ps %ymm21, %ymm29 {%k7}
+
+// CHECK: vrcp14ps %ymm21, %ymm29 {%k7} {z}
+// CHECK:  encoding: [0x62,0x22,0x7d,0xaf,0x4c,0xed]
+          vrcp14ps %ymm21, %ymm29 {%k7} {z}
+
+// CHECK: vrcp14ps (%rcx), %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4c,0x29]
+          vrcp14ps (%rcx), %ymm29
+
+// CHECK: vrcp14ps 291(%rax,%r14,8), %ymm29
+// CHECK:  encoding: [0x62,0x22,0x7d,0x28,0x4c,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vrcp14ps 291(%rax,%r14,8), %ymm29
+
+// CHECK: vrcp14ps (%rcx){1to8}, %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4c,0x29]
+          vrcp14ps (%rcx){1to8}, %ymm29
+
+// CHECK: vrcp14ps 4064(%rdx), %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4c,0x6a,0x7f]
+          vrcp14ps 4064(%rdx), %ymm29
+
+// CHECK: vrcp14ps 4096(%rdx), %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4c,0xaa,0x00,0x10,0x00,0x00]
+          vrcp14ps 4096(%rdx), %ymm29
+
+// CHECK: vrcp14ps -4096(%rdx), %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4c,0x6a,0x80]
+          vrcp14ps -4096(%rdx), %ymm29
+
+// CHECK: vrcp14ps -4128(%rdx), %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4c,0xaa,0xe0,0xef,0xff,0xff]
+          vrcp14ps -4128(%rdx), %ymm29
+
+// CHECK: vrcp14ps 508(%rdx){1to8}, %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4c,0x6a,0x7f]
+          vrcp14ps 508(%rdx){1to8}, %ymm29
+
+// CHECK: vrcp14ps 512(%rdx){1to8}, %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4c,0xaa,0x00,0x02,0x00,0x00]
+          vrcp14ps 512(%rdx){1to8}, %ymm29
+
+// CHECK: vrcp14ps -512(%rdx){1to8}, %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4c,0x6a,0x80]
+          vrcp14ps -512(%rdx){1to8}, %ymm29
+
+// CHECK: vrcp14ps -516(%rdx){1to8}, %ymm29
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4c,0xaa,0xfc,0xfd,0xff,0xff]
+          vrcp14ps -516(%rdx){1to8}, %ymm29
+
+// CHECK: vrsqrt14pd %xmm28, %xmm21
+// CHECK:  encoding: [0x62,0x82,0xfd,0x08,0x4e,0xec]
+          vrsqrt14pd %xmm28, %xmm21
+
+// CHECK: vrsqrt14pd %xmm28, %xmm21 {%k1}
+// CHECK:  encoding: [0x62,0x82,0xfd,0x09,0x4e,0xec]
+          vrsqrt14pd %xmm28, %xmm21 {%k1}
+
+// CHECK: vrsqrt14pd %xmm28, %xmm21 {%k1} {z}
+// CHECK:  encoding: [0x62,0x82,0xfd,0x89,0x4e,0xec]
+          vrsqrt14pd %xmm28, %xmm21 {%k1} {z}
+
+// CHECK: vrsqrt14pd (%rcx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4e,0x29]
+          vrsqrt14pd (%rcx), %xmm21
+
+// CHECK: vrsqrt14pd 291(%rax,%r14,8), %xmm21
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x08,0x4e,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vrsqrt14pd 291(%rax,%r14,8), %xmm21
+
+// CHECK: vrsqrt14pd (%rcx){1to2}, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4e,0x29]
+          vrsqrt14pd (%rcx){1to2}, %xmm21
+
+// CHECK: vrsqrt14pd 2032(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4e,0x6a,0x7f]
+          vrsqrt14pd 2032(%rdx), %xmm21
+
+// CHECK: vrsqrt14pd 2048(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4e,0xaa,0x00,0x08,0x00,0x00]
+          vrsqrt14pd 2048(%rdx), %xmm21
+
+// CHECK: vrsqrt14pd -2048(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4e,0x6a,0x80]
+          vrsqrt14pd -2048(%rdx), %xmm21
+
+// CHECK: vrsqrt14pd -2064(%rdx), %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x08,0x4e,0xaa,0xf0,0xf7,0xff,0xff]
+          vrsqrt14pd -2064(%rdx), %xmm21
+
+// CHECK: vrsqrt14pd 1016(%rdx){1to2}, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4e,0x6a,0x7f]
+          vrsqrt14pd 1016(%rdx){1to2}, %xmm21
+
+// CHECK: vrsqrt14pd 1024(%rdx){1to2}, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4e,0xaa,0x00,0x04,0x00,0x00]
+          vrsqrt14pd 1024(%rdx){1to2}, %xmm21
+
+// CHECK: vrsqrt14pd -1024(%rdx){1to2}, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4e,0x6a,0x80]
+          vrsqrt14pd -1024(%rdx){1to2}, %xmm21
+
+// CHECK: vrsqrt14pd -1032(%rdx){1to2}, %xmm21
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x18,0x4e,0xaa,0xf8,0xfb,0xff,0xff]
+          vrsqrt14pd -1032(%rdx){1to2}, %xmm21
+
+// CHECK: vrsqrt14pd %ymm19, %ymm18
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x28,0x4e,0xd3]
+          vrsqrt14pd %ymm19, %ymm18
+
+// CHECK: vrsqrt14pd %ymm19, %ymm18 {%k4}
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x2c,0x4e,0xd3]
+          vrsqrt14pd %ymm19, %ymm18 {%k4}
+
+// CHECK: vrsqrt14pd %ymm19, %ymm18 {%k4} {z}
+// CHECK:  encoding: [0x62,0xa2,0xfd,0xac,0x4e,0xd3]
+          vrsqrt14pd %ymm19, %ymm18 {%k4} {z}
+
+// CHECK: vrsqrt14pd (%rcx), %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4e,0x11]
+          vrsqrt14pd (%rcx), %ymm18
+
+// CHECK: vrsqrt14pd 291(%rax,%r14,8), %ymm18
+// CHECK:  encoding: [0x62,0xa2,0xfd,0x28,0x4e,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vrsqrt14pd 291(%rax,%r14,8), %ymm18
+
+// CHECK: vrsqrt14pd (%rcx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4e,0x11]
+          vrsqrt14pd (%rcx){1to4}, %ymm18
+
+// CHECK: vrsqrt14pd 4064(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4e,0x52,0x7f]
+          vrsqrt14pd 4064(%rdx), %ymm18
+
+// CHECK: vrsqrt14pd 4096(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4e,0x92,0x00,0x10,0x00,0x00]
+          vrsqrt14pd 4096(%rdx), %ymm18
+
+// CHECK: vrsqrt14pd -4096(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4e,0x52,0x80]
+          vrsqrt14pd -4096(%rdx), %ymm18
+
+// CHECK: vrsqrt14pd -4128(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x28,0x4e,0x92,0xe0,0xef,0xff,0xff]
+          vrsqrt14pd -4128(%rdx), %ymm18
+
+// CHECK: vrsqrt14pd 1016(%rdx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4e,0x52,0x7f]
+          vrsqrt14pd 1016(%rdx){1to4}, %ymm18
+
+// CHECK: vrsqrt14pd 1024(%rdx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4e,0x92,0x00,0x04,0x00,0x00]
+          vrsqrt14pd 1024(%rdx){1to4}, %ymm18
+
+// CHECK: vrsqrt14pd -1024(%rdx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4e,0x52,0x80]
+          vrsqrt14pd -1024(%rdx){1to4}, %ymm18
+
+// CHECK: vrsqrt14pd -1032(%rdx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe2,0xfd,0x38,0x4e,0x92,0xf8,0xfb,0xff,0xff]
+          vrsqrt14pd -1032(%rdx){1to4}, %ymm18
+
+// CHECK: vrsqrt14ps %xmm20, %xmm19
+// CHECK:  encoding: [0x62,0xa2,0x7d,0x08,0x4e,0xdc]
+          vrsqrt14ps %xmm20, %xmm19
+
+// CHECK: vrsqrt14ps %xmm20, %xmm19 {%k7}
+// CHECK:  encoding: [0x62,0xa2,0x7d,0x0f,0x4e,0xdc]
+          vrsqrt14ps %xmm20, %xmm19 {%k7}
+
+// CHECK: vrsqrt14ps %xmm20, %xmm19 {%k7} {z}
+// CHECK:  encoding: [0x62,0xa2,0x7d,0x8f,0x4e,0xdc]
+          vrsqrt14ps %xmm20, %xmm19 {%k7} {z}
+
+// CHECK: vrsqrt14ps (%rcx), %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x4e,0x19]
+          vrsqrt14ps (%rcx), %xmm19
+
+// CHECK: vrsqrt14ps 291(%rax,%r14,8), %xmm19
+// CHECK:  encoding: [0x62,0xa2,0x7d,0x08,0x4e,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vrsqrt14ps 291(%rax,%r14,8), %xmm19
+
+// CHECK: vrsqrt14ps (%rcx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x18,0x4e,0x19]
+          vrsqrt14ps (%rcx){1to4}, %xmm19
+
+// CHECK: vrsqrt14ps 2032(%rdx), %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x4e,0x5a,0x7f]
+          vrsqrt14ps 2032(%rdx), %xmm19
+
+// CHECK: vrsqrt14ps 2048(%rdx), %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x4e,0x9a,0x00,0x08,0x00,0x00]
+          vrsqrt14ps 2048(%rdx), %xmm19
+
+// CHECK: vrsqrt14ps -2048(%rdx), %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x4e,0x5a,0x80]
+          vrsqrt14ps -2048(%rdx), %xmm19
+
+// CHECK: vrsqrt14ps -2064(%rdx), %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x08,0x4e,0x9a,0xf0,0xf7,0xff,0xff]
+          vrsqrt14ps -2064(%rdx), %xmm19
+
+// CHECK: vrsqrt14ps 508(%rdx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x18,0x4e,0x5a,0x7f]
+          vrsqrt14ps 508(%rdx){1to4}, %xmm19
+
+// CHECK: vrsqrt14ps 512(%rdx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x18,0x4e,0x9a,0x00,0x02,0x00,0x00]
+          vrsqrt14ps 512(%rdx){1to4}, %xmm19
+
+// CHECK: vrsqrt14ps -512(%rdx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x18,0x4e,0x5a,0x80]
+          vrsqrt14ps -512(%rdx){1to4}, %xmm19
+
+// CHECK: vrsqrt14ps -516(%rdx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe2,0x7d,0x18,0x4e,0x9a,0xfc,0xfd,0xff,0xff]
+          vrsqrt14ps -516(%rdx){1to4}, %xmm19
+
+// CHECK: vrsqrt14ps %ymm18, %ymm27
+// CHECK:  encoding: [0x62,0x22,0x7d,0x28,0x4e,0xda]
+          vrsqrt14ps %ymm18, %ymm27
+
+// CHECK: vrsqrt14ps %ymm18, %ymm27 {%k7}
+// CHECK:  encoding: [0x62,0x22,0x7d,0x2f,0x4e,0xda]
+          vrsqrt14ps %ymm18, %ymm27 {%k7}
+
+// CHECK: vrsqrt14ps %ymm18, %ymm27 {%k7} {z}
+// CHECK:  encoding: [0x62,0x22,0x7d,0xaf,0x4e,0xda]
+          vrsqrt14ps %ymm18, %ymm27 {%k7} {z}
+
+// CHECK: vrsqrt14ps (%rcx), %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4e,0x19]
+          vrsqrt14ps (%rcx), %ymm27
+
+// CHECK: vrsqrt14ps 291(%rax,%r14,8), %ymm27
+// CHECK:  encoding: [0x62,0x22,0x7d,0x28,0x4e,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vrsqrt14ps 291(%rax,%r14,8), %ymm27
+
+// CHECK: vrsqrt14ps (%rcx){1to8}, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4e,0x19]
+          vrsqrt14ps (%rcx){1to8}, %ymm27
+
+// CHECK: vrsqrt14ps 4064(%rdx), %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4e,0x5a,0x7f]
+          vrsqrt14ps 4064(%rdx), %ymm27
+
+// CHECK: vrsqrt14ps 4096(%rdx), %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4e,0x9a,0x00,0x10,0x00,0x00]
+          vrsqrt14ps 4096(%rdx), %ymm27
+
+// CHECK: vrsqrt14ps -4096(%rdx), %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4e,0x5a,0x80]
+          vrsqrt14ps -4096(%rdx), %ymm27
+
+// CHECK: vrsqrt14ps -4128(%rdx), %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x28,0x4e,0x9a,0xe0,0xef,0xff,0xff]
+          vrsqrt14ps -4128(%rdx), %ymm27
+
+// CHECK: vrsqrt14ps 508(%rdx){1to8}, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4e,0x5a,0x7f]
+          vrsqrt14ps 508(%rdx){1to8}, %ymm27
+
+// CHECK: vrsqrt14ps 512(%rdx){1to8}, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4e,0x9a,0x00,0x02,0x00,0x00]
+          vrsqrt14ps 512(%rdx){1to8}, %ymm27
+
+// CHECK: vrsqrt14ps -512(%rdx){1to8}, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4e,0x5a,0x80]
+          vrsqrt14ps -512(%rdx){1to8}, %ymm27
+
+// CHECK: vrsqrt14ps -516(%rdx){1to8}, %ymm27
+// CHECK:  encoding: [0x62,0x62,0x7d,0x38,0x4e,0x9a,0xfc,0xfd,0xff,0xff]
+          vrsqrt14ps -516(%rdx){1to8}, %ymm27
+
+// CHECK: vsqrtpd %xmm26, %xmm29
+// CHECK:  encoding: [0x62,0x01,0xfd,0x08,0x51,0xea]
+          vsqrtpd %xmm26, %xmm29
+
+// CHECK: vsqrtpd %xmm26, %xmm29 {%k3}
+// CHECK:  encoding: [0x62,0x01,0xfd,0x0b,0x51,0xea]
+          vsqrtpd %xmm26, %xmm29 {%k3}
+
+// CHECK: vsqrtpd %xmm26, %xmm29 {%k3} {z}
+// CHECK:  encoding: [0x62,0x01,0xfd,0x8b,0x51,0xea]
+          vsqrtpd %xmm26, %xmm29 {%k3} {z}
+
+// CHECK: vsqrtpd (%rcx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x51,0x29]
+          vsqrtpd (%rcx), %xmm29
+
+// CHECK: vsqrtpd 291(%rax,%r14,8), %xmm29
+// CHECK:  encoding: [0x62,0x21,0xfd,0x08,0x51,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vsqrtpd 291(%rax,%r14,8), %xmm29
+
+// CHECK: vsqrtpd (%rcx){1to2}, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x18,0x51,0x29]
+          vsqrtpd (%rcx){1to2}, %xmm29
+
+// CHECK: vsqrtpd 2032(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x51,0x6a,0x7f]
+          vsqrtpd 2032(%rdx), %xmm29
+
+// CHECK: vsqrtpd 2048(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x51,0xaa,0x00,0x08,0x00,0x00]
+          vsqrtpd 2048(%rdx), %xmm29
+
+// CHECK: vsqrtpd -2048(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x51,0x6a,0x80]
+          vsqrtpd -2048(%rdx), %xmm29
+
+// CHECK: vsqrtpd -2064(%rdx), %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x08,0x51,0xaa,0xf0,0xf7,0xff,0xff]
+          vsqrtpd -2064(%rdx), %xmm29
+
+// CHECK: vsqrtpd 1016(%rdx){1to2}, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x18,0x51,0x6a,0x7f]
+          vsqrtpd 1016(%rdx){1to2}, %xmm29
+
+// CHECK: vsqrtpd 1024(%rdx){1to2}, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x18,0x51,0xaa,0x00,0x04,0x00,0x00]
+          vsqrtpd 1024(%rdx){1to2}, %xmm29
+
+// CHECK: vsqrtpd -1024(%rdx){1to2}, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x18,0x51,0x6a,0x80]
+          vsqrtpd -1024(%rdx){1to2}, %xmm29
+
+// CHECK: vsqrtpd -1032(%rdx){1to2}, %xmm29
+// CHECK:  encoding: [0x62,0x61,0xfd,0x18,0x51,0xaa,0xf8,0xfb,0xff,0xff]
+          vsqrtpd -1032(%rdx){1to2}, %xmm29
+
+// CHECK: vsqrtpd %ymm20, %ymm18
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x28,0x51,0xd4]
+          vsqrtpd %ymm20, %ymm18
+
+// CHECK: vsqrtpd %ymm20, %ymm18 {%k3}
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x2b,0x51,0xd4]
+          vsqrtpd %ymm20, %ymm18 {%k3}
+
+// CHECK: vsqrtpd %ymm20, %ymm18 {%k3} {z}
+// CHECK:  encoding: [0x62,0xa1,0xfd,0xab,0x51,0xd4]
+          vsqrtpd %ymm20, %ymm18 {%k3} {z}
+
+// CHECK: vsqrtpd (%rcx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x51,0x11]
+          vsqrtpd (%rcx), %ymm18
+
+// CHECK: vsqrtpd 291(%rax,%r14,8), %ymm18
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x28,0x51,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vsqrtpd 291(%rax,%r14,8), %ymm18
+
+// CHECK: vsqrtpd (%rcx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x38,0x51,0x11]
+          vsqrtpd (%rcx){1to4}, %ymm18
+
+// CHECK: vsqrtpd 4064(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x51,0x52,0x7f]
+          vsqrtpd 4064(%rdx), %ymm18
+
+// CHECK: vsqrtpd 4096(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x51,0x92,0x00,0x10,0x00,0x00]
+          vsqrtpd 4096(%rdx), %ymm18
+
+// CHECK: vsqrtpd -4096(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x51,0x52,0x80]
+          vsqrtpd -4096(%rdx), %ymm18
+
+// CHECK: vsqrtpd -4128(%rdx), %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x51,0x92,0xe0,0xef,0xff,0xff]
+          vsqrtpd -4128(%rdx), %ymm18
+
+// CHECK: vsqrtpd 1016(%rdx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x38,0x51,0x52,0x7f]
+          vsqrtpd 1016(%rdx){1to4}, %ymm18
+
+// CHECK: vsqrtpd 1024(%rdx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x38,0x51,0x92,0x00,0x04,0x00,0x00]
+          vsqrtpd 1024(%rdx){1to4}, %ymm18
+
+// CHECK: vsqrtpd -1024(%rdx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x38,0x51,0x52,0x80]
+          vsqrtpd -1024(%rdx){1to4}, %ymm18
+
+// CHECK: vsqrtpd -1032(%rdx){1to4}, %ymm18
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x38,0x51,0x92,0xf8,0xfb,0xff,0xff]
+          vsqrtpd -1032(%rdx){1to4}, %ymm18
+
+// CHECK: vsqrtps %xmm28, %xmm19
+// CHECK:  encoding: [0x62,0x81,0x7c,0x08,0x51,0xdc]
+          vsqrtps %xmm28, %xmm19
+
+// CHECK: vsqrtps %xmm28, %xmm19 {%k7}
+// CHECK:  encoding: [0x62,0x81,0x7c,0x0f,0x51,0xdc]
+          vsqrtps %xmm28, %xmm19 {%k7}
+
+// CHECK: vsqrtps %xmm28, %xmm19 {%k7} {z}
+// CHECK:  encoding: [0x62,0x81,0x7c,0x8f,0x51,0xdc]
+          vsqrtps %xmm28, %xmm19 {%k7} {z}
+
+// CHECK: vsqrtps (%rcx), %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x51,0x19]
+          vsqrtps (%rcx), %xmm19
+
+// CHECK: vsqrtps 291(%rax,%r14,8), %xmm19
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x08,0x51,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vsqrtps 291(%rax,%r14,8), %xmm19
+
+// CHECK: vsqrtps (%rcx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x18,0x51,0x19]
+          vsqrtps (%rcx){1to4}, %xmm19
+
+// CHECK: vsqrtps 2032(%rdx), %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x51,0x5a,0x7f]
+          vsqrtps 2032(%rdx), %xmm19
+
+// CHECK: vsqrtps 2048(%rdx), %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x51,0x9a,0x00,0x08,0x00,0x00]
+          vsqrtps 2048(%rdx), %xmm19
+
+// CHECK: vsqrtps -2048(%rdx), %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x51,0x5a,0x80]
+          vsqrtps -2048(%rdx), %xmm19
+
+// CHECK: vsqrtps -2064(%rdx), %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x08,0x51,0x9a,0xf0,0xf7,0xff,0xff]
+          vsqrtps -2064(%rdx), %xmm19
+
+// CHECK: vsqrtps 508(%rdx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x18,0x51,0x5a,0x7f]
+          vsqrtps 508(%rdx){1to4}, %xmm19
+
+// CHECK: vsqrtps 512(%rdx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x18,0x51,0x9a,0x00,0x02,0x00,0x00]
+          vsqrtps 512(%rdx){1to4}, %xmm19
+
+// CHECK: vsqrtps -512(%rdx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x18,0x51,0x5a,0x80]
+          vsqrtps -512(%rdx){1to4}, %xmm19
+
+// CHECK: vsqrtps -516(%rdx){1to4}, %xmm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x18,0x51,0x9a,0xfc,0xfd,0xff,0xff]
+          vsqrtps -516(%rdx){1to4}, %xmm19
+
+// CHECK: vsqrtps %ymm25, %ymm19
+// CHECK:  encoding: [0x62,0x81,0x7c,0x28,0x51,0xd9]
+          vsqrtps %ymm25, %ymm19
+
+// CHECK: vsqrtps %ymm25, %ymm19 {%k2}
+// CHECK:  encoding: [0x62,0x81,0x7c,0x2a,0x51,0xd9]
+          vsqrtps %ymm25, %ymm19 {%k2}
+
+// CHECK: vsqrtps %ymm25, %ymm19 {%k2} {z}
+// CHECK:  encoding: [0x62,0x81,0x7c,0xaa,0x51,0xd9]
+          vsqrtps %ymm25, %ymm19 {%k2} {z}
+
+// CHECK: vsqrtps (%rcx), %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x51,0x19]
+          vsqrtps (%rcx), %ymm19
+
+// CHECK: vsqrtps 291(%rax,%r14,8), %ymm19
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x28,0x51,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vsqrtps 291(%rax,%r14,8), %ymm19
+
+// CHECK: vsqrtps (%rcx){1to8}, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x38,0x51,0x19]
+          vsqrtps (%rcx){1to8}, %ymm19
+
+// CHECK: vsqrtps 4064(%rdx), %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x51,0x5a,0x7f]
+          vsqrtps 4064(%rdx), %ymm19
+
+// CHECK: vsqrtps 4096(%rdx), %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x51,0x9a,0x00,0x10,0x00,0x00]
+          vsqrtps 4096(%rdx), %ymm19
+
+// CHECK: vsqrtps -4096(%rdx), %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x51,0x5a,0x80]
+          vsqrtps -4096(%rdx), %ymm19
+
+// CHECK: vsqrtps -4128(%rdx), %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x51,0x9a,0xe0,0xef,0xff,0xff]
+          vsqrtps -4128(%rdx), %ymm19
+
+// CHECK: vsqrtps 508(%rdx){1to8}, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x38,0x51,0x5a,0x7f]
+          vsqrtps 508(%rdx){1to8}, %ymm19
+
+// CHECK: vsqrtps 512(%rdx){1to8}, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x38,0x51,0x9a,0x00,0x02,0x00,0x00]
+          vsqrtps 512(%rdx){1to8}, %ymm19
+
+// CHECK: vsqrtps -512(%rdx){1to8}, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x38,0x51,0x5a,0x80]
+          vsqrtps -512(%rdx){1to8}, %ymm19
+
+// CHECK: vsqrtps -516(%rdx){1to8}, %ymm19
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x38,0x51,0x9a,0xfc,0xfd,0xff,0xff]
+          vsqrtps -516(%rdx){1to8}, %ymm19
+
+// CHECK: vsubpd %xmm18, %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x21,0xbd,0x00,0x5c,0xe2]
+          vsubpd %xmm18, %xmm24, %xmm28
+
+// CHECK: vsubpd %xmm18, %xmm24, %xmm28 {%k3}
+// CHECK:  encoding: [0x62,0x21,0xbd,0x03,0x5c,0xe2]
+          vsubpd %xmm18, %xmm24, %xmm28 {%k3}
+
+// CHECK: vsubpd %xmm18, %xmm24, %xmm28 {%k3} {z}
+// CHECK:  encoding: [0x62,0x21,0xbd,0x83,0x5c,0xe2]
+          vsubpd %xmm18, %xmm24, %xmm28 {%k3} {z}
+
+// CHECK: vsubpd (%rcx), %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x00,0x5c,0x21]
+          vsubpd (%rcx), %xmm24, %xmm28
+
+// CHECK: vsubpd 291(%rax,%r14,8), %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x21,0xbd,0x00,0x5c,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vsubpd 291(%rax,%r14,8), %xmm24, %xmm28
+
+// CHECK: vsubpd (%rcx){1to2}, %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x10,0x5c,0x21]
+          vsubpd (%rcx){1to2}, %xmm24, %xmm28
+
+// CHECK: vsubpd 2032(%rdx), %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x00,0x5c,0x62,0x7f]
+          vsubpd 2032(%rdx), %xmm24, %xmm28
+
+// CHECK: vsubpd 2048(%rdx), %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x00,0x5c,0xa2,0x00,0x08,0x00,0x00]
+          vsubpd 2048(%rdx), %xmm24, %xmm28
+
+// CHECK: vsubpd -2048(%rdx), %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x00,0x5c,0x62,0x80]
+          vsubpd -2048(%rdx), %xmm24, %xmm28
+
+// CHECK: vsubpd -2064(%rdx), %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x00,0x5c,0xa2,0xf0,0xf7,0xff,0xff]
+          vsubpd -2064(%rdx), %xmm24, %xmm28
+
+// CHECK: vsubpd 1016(%rdx){1to2}, %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x10,0x5c,0x62,0x7f]
+          vsubpd 1016(%rdx){1to2}, %xmm24, %xmm28
+
+// CHECK: vsubpd 1024(%rdx){1to2}, %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x10,0x5c,0xa2,0x00,0x04,0x00,0x00]
+          vsubpd 1024(%rdx){1to2}, %xmm24, %xmm28
+
+// CHECK: vsubpd -1024(%rdx){1to2}, %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x10,0x5c,0x62,0x80]
+          vsubpd -1024(%rdx){1to2}, %xmm24, %xmm28
+
+// CHECK: vsubpd -1032(%rdx){1to2}, %xmm24, %xmm28
+// CHECK:  encoding: [0x62,0x61,0xbd,0x10,0x5c,0xa2,0xf8,0xfb,0xff,0xff]
+          vsubpd -1032(%rdx){1to2}, %xmm24, %xmm28
+
+// CHECK: vsubpd %ymm25, %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x01,0x9d,0x20,0x5c,0xf1]
+          vsubpd %ymm25, %ymm28, %ymm30
+
+// CHECK: vsubpd %ymm25, %ymm28, %ymm30 {%k7}
+// CHECK:  encoding: [0x62,0x01,0x9d,0x27,0x5c,0xf1]
+          vsubpd %ymm25, %ymm28, %ymm30 {%k7}
+
+// CHECK: vsubpd %ymm25, %ymm28, %ymm30 {%k7} {z}
+// CHECK:  encoding: [0x62,0x01,0x9d,0xa7,0x5c,0xf1]
+          vsubpd %ymm25, %ymm28, %ymm30 {%k7} {z}
+
+// CHECK: vsubpd (%rcx), %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x20,0x5c,0x31]
+          vsubpd (%rcx), %ymm28, %ymm30
+
+// CHECK: vsubpd 291(%rax,%r14,8), %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x21,0x9d,0x20,0x5c,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vsubpd 291(%rax,%r14,8), %ymm28, %ymm30
+
+// CHECK: vsubpd (%rcx){1to4}, %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x30,0x5c,0x31]
+          vsubpd (%rcx){1to4}, %ymm28, %ymm30
+
+// CHECK: vsubpd 4064(%rdx), %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x20,0x5c,0x72,0x7f]
+          vsubpd 4064(%rdx), %ymm28, %ymm30
+
+// CHECK: vsubpd 4096(%rdx), %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x20,0x5c,0xb2,0x00,0x10,0x00,0x00]
+          vsubpd 4096(%rdx), %ymm28, %ymm30
+
+// CHECK: vsubpd -4096(%rdx), %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x20,0x5c,0x72,0x80]
+          vsubpd -4096(%rdx), %ymm28, %ymm30
+
+// CHECK: vsubpd -4128(%rdx), %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x20,0x5c,0xb2,0xe0,0xef,0xff,0xff]
+          vsubpd -4128(%rdx), %ymm28, %ymm30
+
+// CHECK: vsubpd 1016(%rdx){1to4}, %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x30,0x5c,0x72,0x7f]
+          vsubpd 1016(%rdx){1to4}, %ymm28, %ymm30
+
+// CHECK: vsubpd 1024(%rdx){1to4}, %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x30,0x5c,0xb2,0x00,0x04,0x00,0x00]
+          vsubpd 1024(%rdx){1to4}, %ymm28, %ymm30
+
+// CHECK: vsubpd -1024(%rdx){1to4}, %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x30,0x5c,0x72,0x80]
+          vsubpd -1024(%rdx){1to4}, %ymm28, %ymm30
+
+// CHECK: vsubpd -1032(%rdx){1to4}, %ymm28, %ymm30
+// CHECK:  encoding: [0x62,0x61,0x9d,0x30,0x5c,0xb2,0xf8,0xfb,0xff,0xff]
+          vsubpd -1032(%rdx){1to4}, %ymm28, %ymm30
+
+// CHECK: vsubps %xmm25, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0x81,0x14,0x00,0x5c,0xe1]
+          vsubps %xmm25, %xmm29, %xmm20
+
+// CHECK: vsubps %xmm25, %xmm29, %xmm20 {%k3}
+// CHECK:  encoding: [0x62,0x81,0x14,0x03,0x5c,0xe1]
+          vsubps %xmm25, %xmm29, %xmm20 {%k3}
+
+// CHECK: vsubps %xmm25, %xmm29, %xmm20 {%k3} {z}
+// CHECK:  encoding: [0x62,0x81,0x14,0x83,0x5c,0xe1]
+          vsubps %xmm25, %xmm29, %xmm20 {%k3} {z}
+
+// CHECK: vsubps (%rcx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x00,0x5c,0x21]
+          vsubps (%rcx), %xmm29, %xmm20
+
+// CHECK: vsubps 291(%rax,%r14,8), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xa1,0x14,0x00,0x5c,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vsubps 291(%rax,%r14,8), %xmm29, %xmm20
+
+// CHECK: vsubps (%rcx){1to4}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x10,0x5c,0x21]
+          vsubps (%rcx){1to4}, %xmm29, %xmm20
+
+// CHECK: vsubps 2032(%rdx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x00,0x5c,0x62,0x7f]
+          vsubps 2032(%rdx), %xmm29, %xmm20
+
+// CHECK: vsubps 2048(%rdx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x00,0x5c,0xa2,0x00,0x08,0x00,0x00]
+          vsubps 2048(%rdx), %xmm29, %xmm20
+
+// CHECK: vsubps -2048(%rdx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x00,0x5c,0x62,0x80]
+          vsubps -2048(%rdx), %xmm29, %xmm20
+
+// CHECK: vsubps -2064(%rdx), %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x00,0x5c,0xa2,0xf0,0xf7,0xff,0xff]
+          vsubps -2064(%rdx), %xmm29, %xmm20
+
+// CHECK: vsubps 508(%rdx){1to4}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x10,0x5c,0x62,0x7f]
+          vsubps 508(%rdx){1to4}, %xmm29, %xmm20
+
+// CHECK: vsubps 512(%rdx){1to4}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x10,0x5c,0xa2,0x00,0x02,0x00,0x00]
+          vsubps 512(%rdx){1to4}, %xmm29, %xmm20
+
+// CHECK: vsubps -512(%rdx){1to4}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x10,0x5c,0x62,0x80]
+          vsubps -512(%rdx){1to4}, %xmm29, %xmm20
+
+// CHECK: vsubps -516(%rdx){1to4}, %xmm29, %xmm20
+// CHECK:  encoding: [0x62,0xe1,0x14,0x10,0x5c,0xa2,0xfc,0xfd,0xff,0xff]
+          vsubps -516(%rdx){1to4}, %xmm29, %xmm20
+
+// CHECK: vsubps %ymm22, %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xa1,0x54,0x20,0x5c,0xee]
+          vsubps %ymm22, %ymm21, %ymm21
+
+// CHECK: vsubps %ymm22, %ymm21, %ymm21 {%k4}
+// CHECK:  encoding: [0x62,0xa1,0x54,0x24,0x5c,0xee]
+          vsubps %ymm22, %ymm21, %ymm21 {%k4}
+
+// CHECK: vsubps %ymm22, %ymm21, %ymm21 {%k4} {z}
+// CHECK:  encoding: [0x62,0xa1,0x54,0xa4,0x5c,0xee]
+          vsubps %ymm22, %ymm21, %ymm21 {%k4} {z}
+
+// CHECK: vsubps (%rcx), %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x20,0x5c,0x29]
+          vsubps (%rcx), %ymm21, %ymm21
+
+// CHECK: vsubps 291(%rax,%r14,8), %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xa1,0x54,0x20,0x5c,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vsubps 291(%rax,%r14,8), %ymm21, %ymm21
+
+// CHECK: vsubps (%rcx){1to8}, %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x30,0x5c,0x29]
+          vsubps (%rcx){1to8}, %ymm21, %ymm21
+
+// CHECK: vsubps 4064(%rdx), %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x20,0x5c,0x6a,0x7f]
+          vsubps 4064(%rdx), %ymm21, %ymm21
+
+// CHECK: vsubps 4096(%rdx), %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x20,0x5c,0xaa,0x00,0x10,0x00,0x00]
+          vsubps 4096(%rdx), %ymm21, %ymm21
+
+// CHECK: vsubps -4096(%rdx), %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x20,0x5c,0x6a,0x80]
+          vsubps -4096(%rdx), %ymm21, %ymm21
+
+// CHECK: vsubps -4128(%rdx), %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x20,0x5c,0xaa,0xe0,0xef,0xff,0xff]
+          vsubps -4128(%rdx), %ymm21, %ymm21
+
+// CHECK: vsubps 508(%rdx){1to8}, %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x30,0x5c,0x6a,0x7f]
+          vsubps 508(%rdx){1to8}, %ymm21, %ymm21
+
+// CHECK: vsubps 512(%rdx){1to8}, %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x30,0x5c,0xaa,0x00,0x02,0x00,0x00]
+          vsubps 512(%rdx){1to8}, %ymm21, %ymm21
+
+// CHECK: vsubps -512(%rdx){1to8}, %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x30,0x5c,0x6a,0x80]
+          vsubps -512(%rdx){1to8}, %ymm21, %ymm21
+
+// CHECK: vsubps -516(%rdx){1to8}, %ymm21, %ymm21
+// CHECK:  encoding: [0x62,0xe1,0x54,0x30,0x5c,0xaa,0xfc,0xfd,0xff,0xff]
+          vsubps -516(%rdx){1to8}, %ymm21, %ymm21
+
+// CHECK: vmovapd %xmm22, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x29,0x31]
+          vmovapd %xmm22, (%rcx)
+
+// CHECK: vmovapd %xmm22, (%rcx) {%k2}
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x0a,0x29,0x31]
+          vmovapd %xmm22, (%rcx) {%k2}
+
+// CHECK: vmovapd %xmm22, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x08,0x29,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovapd %xmm22, 291(%rax,%r14,8)
+
+// CHECK: vmovapd %xmm22, 2032(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x29,0x72,0x7f]
+          vmovapd %xmm22, 2032(%rdx)
+
+// CHECK: vmovapd %xmm22, 2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x29,0xb2,0x00,0x08,0x00,0x00]
+          vmovapd %xmm22, 2048(%rdx)
+
+// CHECK: vmovapd %xmm22, -2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x29,0x72,0x80]
+          vmovapd %xmm22, -2048(%rdx)
+
+// CHECK: vmovapd %xmm22, -2064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x29,0xb2,0xf0,0xf7,0xff,0xff]
+          vmovapd %xmm22, -2064(%rdx)
+
+// CHECK: vmovapd %ymm17, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x29,0x09]
+          vmovapd %ymm17, (%rcx)
+
+// CHECK: vmovapd %ymm17, (%rcx) {%k6}
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x2e,0x29,0x09]
+          vmovapd %ymm17, (%rcx) {%k6}
+
+// CHECK: vmovapd %ymm17, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x28,0x29,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovapd %ymm17, 291(%rax,%r14,8)
+
+// CHECK: vmovapd %ymm17, 4064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x29,0x4a,0x7f]
+          vmovapd %ymm17, 4064(%rdx)
+
+// CHECK: vmovapd %ymm17, 4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x29,0x8a,0x00,0x10,0x00,0x00]
+          vmovapd %ymm17, 4096(%rdx)
+
+// CHECK: vmovapd %ymm17, -4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x29,0x4a,0x80]
+          vmovapd %ymm17, -4096(%rdx)
+
+// CHECK: vmovapd %ymm17, -4128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x28,0x29,0x8a,0xe0,0xef,0xff,0xff]
+          vmovapd %ymm17, -4128(%rdx)
+
+// CHECK: vmovaps %xmm29, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x29,0x29]
+          vmovaps %xmm29, (%rcx)
+
+// CHECK: vmovaps %xmm29, (%rcx) {%k5}
+// CHECK:  encoding: [0x62,0x61,0x7c,0x0d,0x29,0x29]
+          vmovaps %xmm29, (%rcx) {%k5}
+
+// CHECK: vmovaps %xmm29, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0x7c,0x08,0x29,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovaps %xmm29, 291(%rax,%r14,8)
+
+// CHECK: vmovaps %xmm29, 2032(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x29,0x6a,0x7f]
+          vmovaps %xmm29, 2032(%rdx)
+
+// CHECK: vmovaps %xmm29, 2048(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x29,0xaa,0x00,0x08,0x00,0x00]
+          vmovaps %xmm29, 2048(%rdx)
+
+// CHECK: vmovaps %xmm29, -2048(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x29,0x6a,0x80]
+          vmovaps %xmm29, -2048(%rdx)
+
+// CHECK: vmovaps %xmm29, -2064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x29,0xaa,0xf0,0xf7,0xff,0xff]
+          vmovaps %xmm29, -2064(%rdx)
+
+// CHECK: vmovaps %ymm28, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x29,0x21]
+          vmovaps %ymm28, (%rcx)
+
+// CHECK: vmovaps %ymm28, (%rcx) {%k6}
+// CHECK:  encoding: [0x62,0x61,0x7c,0x2e,0x29,0x21]
+          vmovaps %ymm28, (%rcx) {%k6}
+
+// CHECK: vmovaps %ymm28, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0x7c,0x28,0x29,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vmovaps %ymm28, 291(%rax,%r14,8)
+
+// CHECK: vmovaps %ymm28, 4064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x29,0x62,0x7f]
+          vmovaps %ymm28, 4064(%rdx)
+
+// CHECK: vmovaps %ymm28, 4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x29,0xa2,0x00,0x10,0x00,0x00]
+          vmovaps %ymm28, 4096(%rdx)
+
+// CHECK: vmovaps %ymm28, -4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x29,0x62,0x80]
+          vmovaps %ymm28, -4096(%rdx)
+
+// CHECK: vmovaps %ymm28, -4128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x28,0x29,0xa2,0xe0,0xef,0xff,0xff]
+          vmovaps %ymm28, -4128(%rdx)
+
+// CHECK: vmovdqa32 %xmm24, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x08,0x7f,0x01]
+          vmovdqa32 %xmm24, (%rcx)
+
+// CHECK: vmovdqa32 %xmm24, (%rcx) {%k7}
+// CHECK:  encoding: [0x62,0x61,0x7d,0x0f,0x7f,0x01]
+          vmovdqa32 %xmm24, (%rcx) {%k7}
+
+// CHECK: vmovdqa32 %xmm24, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0x7d,0x08,0x7f,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa32 %xmm24, 291(%rax,%r14,8)
+
+// CHECK: vmovdqa32 %xmm24, 2032(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x08,0x7f,0x42,0x7f]
+          vmovdqa32 %xmm24, 2032(%rdx)
+
+// CHECK: vmovdqa32 %xmm24, 2048(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x08,0x7f,0x82,0x00,0x08,0x00,0x00]
+          vmovdqa32 %xmm24, 2048(%rdx)
+
+// CHECK: vmovdqa32 %xmm24, -2048(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x08,0x7f,0x42,0x80]
+          vmovdqa32 %xmm24, -2048(%rdx)
+
+// CHECK: vmovdqa32 %xmm24, -2064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x08,0x7f,0x82,0xf0,0xf7,0xff,0xff]
+          vmovdqa32 %xmm24, -2064(%rdx)
+
+// CHECK: vmovdqa32 %ymm29, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x7f,0x29]
+          vmovdqa32 %ymm29, (%rcx)
+
+// CHECK: vmovdqa32 %ymm29, (%rcx) {%k7}
+// CHECK:  encoding: [0x62,0x61,0x7d,0x2f,0x7f,0x29]
+          vmovdqa32 %ymm29, (%rcx) {%k7}
+
+// CHECK: vmovdqa32 %ymm29, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0x7d,0x28,0x7f,0xac,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa32 %ymm29, 291(%rax,%r14,8)
+
+// CHECK: vmovdqa32 %ymm29, 4064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x7f,0x6a,0x7f]
+          vmovdqa32 %ymm29, 4064(%rdx)
+
+// CHECK: vmovdqa32 %ymm29, 4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x7f,0xaa,0x00,0x10,0x00,0x00]
+          vmovdqa32 %ymm29, 4096(%rdx)
+
+// CHECK: vmovdqa32 %ymm29, -4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x7f,0x6a,0x80]
+          vmovdqa32 %ymm29, -4096(%rdx)
+
+// CHECK: vmovdqa32 %ymm29, -4128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7d,0x28,0x7f,0xaa,0xe0,0xef,0xff,0xff]
+          vmovdqa32 %ymm29, -4128(%rdx)
+
+// CHECK: vmovdqa64 %xmm17, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x7f,0x09]
+          vmovdqa64 %xmm17, (%rcx)
+
+// CHECK: vmovdqa64 %xmm17, (%rcx) {%k7}
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x0f,0x7f,0x09]
+          vmovdqa64 %xmm17, (%rcx) {%k7}
+
+// CHECK: vmovdqa64 %xmm17, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x08,0x7f,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa64 %xmm17, 291(%rax,%r14,8)
+
+// CHECK: vmovdqa64 %xmm17, 2032(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x7f,0x4a,0x7f]
+          vmovdqa64 %xmm17, 2032(%rdx)
+
+// CHECK: vmovdqa64 %xmm17, 2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x7f,0x8a,0x00,0x08,0x00,0x00]
+          vmovdqa64 %xmm17, 2048(%rdx)
+
+// CHECK: vmovdqa64 %xmm17, -2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x7f,0x4a,0x80]
+          vmovdqa64 %xmm17, -2048(%rdx)
+
+// CHECK: vmovdqa64 %xmm17, -2064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x7f,0x8a,0xf0,0xf7,0xff,0xff]
+          vmovdqa64 %xmm17, -2064(%rdx)
+
+// CHECK: vmovdqa64 %ymm24, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x7f,0x01]
+          vmovdqa64 %ymm24, (%rcx)
+
+// CHECK: vmovdqa64 %ymm24, (%rcx) {%k2}
+// CHECK:  encoding: [0x62,0x61,0xfd,0x2a,0x7f,0x01]
+          vmovdqa64 %ymm24, (%rcx) {%k2}
+
+// CHECK: vmovdqa64 %ymm24, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0xfd,0x28,0x7f,0x84,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqa64 %ymm24, 291(%rax,%r14,8)
+
+// CHECK: vmovdqa64 %ymm24, 4064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x7f,0x42,0x7f]
+          vmovdqa64 %ymm24, 4064(%rdx)
+
+// CHECK: vmovdqa64 %ymm24, 4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x7f,0x82,0x00,0x10,0x00,0x00]
+          vmovdqa64 %ymm24, 4096(%rdx)
+
+// CHECK: vmovdqa64 %ymm24, -4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x7f,0x42,0x80]
+          vmovdqa64 %ymm24, -4096(%rdx)
+
+// CHECK: vmovdqa64 %ymm24, -4128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x7f,0x82,0xe0,0xef,0xff,0xff]
+          vmovdqa64 %ymm24, -4128(%rdx)
+
+// CHECK: vmovdqu32 %xmm17, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x08,0x7f,0x09]
+          vmovdqu32 %xmm17, (%rcx)
+
+// CHECK: vmovdqu32 %xmm17, (%rcx) {%k4}
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x0c,0x7f,0x09]
+          vmovdqu32 %xmm17, (%rcx) {%k4}
+
+// CHECK: vmovdqu32 %xmm17, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7e,0x08,0x7f,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu32 %xmm17, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu32 %xmm17, 2032(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x08,0x7f,0x4a,0x7f]
+          vmovdqu32 %xmm17, 2032(%rdx)
+
+// CHECK: vmovdqu32 %xmm17, 2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x08,0x7f,0x8a,0x00,0x08,0x00,0x00]
+          vmovdqu32 %xmm17, 2048(%rdx)
+
+// CHECK: vmovdqu32 %xmm17, -2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x08,0x7f,0x4a,0x80]
+          vmovdqu32 %xmm17, -2048(%rdx)
+
+// CHECK: vmovdqu32 %xmm17, -2064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x08,0x7f,0x8a,0xf0,0xf7,0xff,0xff]
+          vmovdqu32 %xmm17, -2064(%rdx)
+
+// CHECK: vmovdqu32 %ymm20, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x7f,0x21]
+          vmovdqu32 %ymm20, (%rcx)
+
+// CHECK: vmovdqu32 %ymm20, (%rcx) {%k1}
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x29,0x7f,0x21]
+          vmovdqu32 %ymm20, (%rcx) {%k1}
+
+// CHECK: vmovdqu32 %ymm20, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7e,0x28,0x7f,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu32 %ymm20, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu32 %ymm20, 4064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x7f,0x62,0x7f]
+          vmovdqu32 %ymm20, 4064(%rdx)
+
+// CHECK: vmovdqu32 %ymm20, 4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x7f,0xa2,0x00,0x10,0x00,0x00]
+          vmovdqu32 %ymm20, 4096(%rdx)
+
+// CHECK: vmovdqu32 %ymm20, -4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x7f,0x62,0x80]
+          vmovdqu32 %ymm20, -4096(%rdx)
+
+// CHECK: vmovdqu32 %ymm20, -4128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7e,0x28,0x7f,0xa2,0xe0,0xef,0xff,0xff]
+          vmovdqu32 %ymm20, -4128(%rdx)
+
+// CHECK: vmovdqu64 %xmm20, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x08,0x7f,0x21]
+          vmovdqu64 %xmm20, (%rcx)
+
+// CHECK: vmovdqu64 %xmm20, (%rcx) {%k6}
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x0e,0x7f,0x21]
+          vmovdqu64 %xmm20, (%rcx) {%k6}
+
+// CHECK: vmovdqu64 %xmm20, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfe,0x08,0x7f,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu64 %xmm20, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu64 %xmm20, 2032(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x08,0x7f,0x62,0x7f]
+          vmovdqu64 %xmm20, 2032(%rdx)
+
+// CHECK: vmovdqu64 %xmm20, 2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x08,0x7f,0xa2,0x00,0x08,0x00,0x00]
+          vmovdqu64 %xmm20, 2048(%rdx)
+
+// CHECK: vmovdqu64 %xmm20, -2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x08,0x7f,0x62,0x80]
+          vmovdqu64 %xmm20, -2048(%rdx)
+
+// CHECK: vmovdqu64 %xmm20, -2064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x08,0x7f,0xa2,0xf0,0xf7,0xff,0xff]
+          vmovdqu64 %xmm20, -2064(%rdx)
+
+// CHECK: vmovdqu64 %ymm19, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x28,0x7f,0x19]
+          vmovdqu64 %ymm19, (%rcx)
+
+// CHECK: vmovdqu64 %ymm19, (%rcx) {%k7}
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x2f,0x7f,0x19]
+          vmovdqu64 %ymm19, (%rcx) {%k7}
+
+// CHECK: vmovdqu64 %ymm19, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfe,0x28,0x7f,0x9c,0xf0,0x23,0x01,0x00,0x00]
+          vmovdqu64 %ymm19, 291(%rax,%r14,8)
+
+// CHECK: vmovdqu64 %ymm19, 4064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x28,0x7f,0x5a,0x7f]
+          vmovdqu64 %ymm19, 4064(%rdx)
+
+// CHECK: vmovdqu64 %ymm19, 4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x28,0x7f,0x9a,0x00,0x10,0x00,0x00]
+          vmovdqu64 %ymm19, 4096(%rdx)
+
+// CHECK: vmovdqu64 %ymm19, -4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x28,0x7f,0x5a,0x80]
+          vmovdqu64 %ymm19, -4096(%rdx)
+
+// CHECK: vmovdqu64 %ymm19, -4128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfe,0x28,0x7f,0x9a,0xe0,0xef,0xff,0xff]
+          vmovdqu64 %ymm19, -4128(%rdx)
+
+// CHECK: vmovupd %xmm22, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x11,0x31]
+          vmovupd %xmm22, (%rcx)
+
+// CHECK: vmovupd %xmm22, (%rcx) {%k7}
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x0f,0x11,0x31]
+          vmovupd %xmm22, (%rcx) {%k7}
+
+// CHECK: vmovupd %xmm22, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0xfd,0x08,0x11,0xb4,0xf0,0x23,0x01,0x00,0x00]
+          vmovupd %xmm22, 291(%rax,%r14,8)
+
+// CHECK: vmovupd %xmm22, 2032(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x11,0x72,0x7f]
+          vmovupd %xmm22, 2032(%rdx)
+
+// CHECK: vmovupd %xmm22, 2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x11,0xb2,0x00,0x08,0x00,0x00]
+          vmovupd %xmm22, 2048(%rdx)
+
+// CHECK: vmovupd %xmm22, -2048(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x11,0x72,0x80]
+          vmovupd %xmm22, -2048(%rdx)
+
+// CHECK: vmovupd %xmm22, -2064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0xfd,0x08,0x11,0xb2,0xf0,0xf7,0xff,0xff]
+          vmovupd %xmm22, -2064(%rdx)
+
+// CHECK: vmovupd %ymm28, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x11,0x21]
+          vmovupd %ymm28, (%rcx)
+
+// CHECK: vmovupd %ymm28, (%rcx) {%k1}
+// CHECK:  encoding: [0x62,0x61,0xfd,0x29,0x11,0x21]
+          vmovupd %ymm28, (%rcx) {%k1}
+
+// CHECK: vmovupd %ymm28, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0xfd,0x28,0x11,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vmovupd %ymm28, 291(%rax,%r14,8)
+
+// CHECK: vmovupd %ymm28, 4064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x11,0x62,0x7f]
+          vmovupd %ymm28, 4064(%rdx)
+
+// CHECK: vmovupd %ymm28, 4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x11,0xa2,0x00,0x10,0x00,0x00]
+          vmovupd %ymm28, 4096(%rdx)
+
+// CHECK: vmovupd %ymm28, -4096(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x11,0x62,0x80]
+          vmovupd %ymm28, -4096(%rdx)
+
+// CHECK: vmovupd %ymm28, -4128(%rdx)
+// CHECK:  encoding: [0x62,0x61,0xfd,0x28,0x11,0xa2,0xe0,0xef,0xff,0xff]
+          vmovupd %ymm28, -4128(%rdx)
+
+// CHECK: vmovups %xmm26, (%rcx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x11,0x11]
+          vmovups %xmm26, (%rcx)
+
+// CHECK: vmovups %xmm26, (%rcx) {%k5}
+// CHECK:  encoding: [0x62,0x61,0x7c,0x0d,0x11,0x11]
+          vmovups %xmm26, (%rcx) {%k5}
+
+// CHECK: vmovups %xmm26, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0x21,0x7c,0x08,0x11,0x94,0xf0,0x23,0x01,0x00,0x00]
+          vmovups %xmm26, 291(%rax,%r14,8)
+
+// CHECK: vmovups %xmm26, 2032(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x11,0x52,0x7f]
+          vmovups %xmm26, 2032(%rdx)
+
+// CHECK: vmovups %xmm26, 2048(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x11,0x92,0x00,0x08,0x00,0x00]
+          vmovups %xmm26, 2048(%rdx)
+
+// CHECK: vmovups %xmm26, -2048(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x11,0x52,0x80]
+          vmovups %xmm26, -2048(%rdx)
+
+// CHECK: vmovups %xmm26, -2064(%rdx)
+// CHECK:  encoding: [0x62,0x61,0x7c,0x08,0x11,0x92,0xf0,0xf7,0xff,0xff]
+          vmovups %xmm26, -2064(%rdx)
+
+// CHECK: vmovups %ymm23, (%rcx)
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x11,0x39]
+          vmovups %ymm23, (%rcx)
+
+// CHECK: vmovups %ymm23, (%rcx) {%k3}
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x2b,0x11,0x39]
+          vmovups %ymm23, (%rcx) {%k3}
+
+// CHECK: vmovups %ymm23, 291(%rax,%r14,8)
+// CHECK:  encoding: [0x62,0xa1,0x7c,0x28,0x11,0xbc,0xf0,0x23,0x01,0x00,0x00]
+          vmovups %ymm23, 291(%rax,%r14,8)
+
+// CHECK: vmovups %ymm23, 4064(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x11,0x7a,0x7f]
+          vmovups %ymm23, 4064(%rdx)
+
+// CHECK: vmovups %ymm23, 4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x11,0xba,0x00,0x10,0x00,0x00]
+          vmovups %ymm23, 4096(%rdx)
+
+// CHECK: vmovups %ymm23, -4096(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x11,0x7a,0x80]
+          vmovups %ymm23, -4096(%rdx)
+
+// CHECK: vmovups %ymm23, -4128(%rdx)
+// CHECK:  encoding: [0x62,0xe1,0x7c,0x28,0x11,0xba,0xe0,0xef,0xff,0xff]
+          vmovups %ymm23, -4128(%rdx)

diff --git a/test/MC/X86/x86-itanium.ll b/test/MC/X86/x86-itanium.ll
new file mode 100644
index 0000000..1d8308d
--- /dev/null
+++ b/test/MC/X86/x86-itanium.ll

@@ -0,0 +1,6 @@
+; RUN: llc -mtriple i686-windows-itanium -filetype asm -o - %s | FileCheck %s
+
+@var = common global i32 0, align 4
+
+; CHECK-NOT: .type  _var,@object
+

diff --git a/test/MC/X86/x86-windows-itanium-libcalls.ll b/test/MC/X86/x86-windows-itanium-libcalls.ll
new file mode 100644
index 0000000..773d03b
--- /dev/null
+++ b/test/MC/X86/x86-windows-itanium-libcalls.ll

@@ -0,0 +1,16 @@
+; RUN: opt -mtriple i686-windows-itanium -O2 -o - %s | llvm-dis | FileCheck %s
+
+target triple = "i686-windows-itanium"
+
+declare dllimport double @floor(double)
+
+define dllexport float @test(float %f) {
+  %conv = fpext float %f to double
+  %call = tail call double @floor(double %conv)
+  %cast = fptrunc double %call to float
+  ret float %cast
+}
+
+; CHECK-NOT: floorf
+; CHECK: floor
+

diff --git a/test/MC/X86/x86_errors.s b/test/MC/X86/x86_errors.s
index 51f2e8e..0b3bc7f 100644
--- a/test/MC/X86/x86_errors.s
+++ b/test/MC/X86/x86_errors.s

@@ -46,3 +46,7 @@
 
 // 32: error: invalid operand for instruction
 outb al, 4
+
+// 32: error: invalid segment register
+// 64: error: invalid segment register
+movl %eax:0x00, %ebx

diff --git a/test/MC/X86/x86_operands.s b/test/MC/X86/x86_operands.s
index b34713d..2258a95 100644
--- a/test/MC/X86/x86_operands.s
+++ b/test/MC/X86/x86_operands.s

@@ -52,6 +52,11 @@
         call *%eax
 # CHECK: calll *4(%eax)
         call *4(%eax)
+foo:
+	calll foo()
+# CHECK: calll foo{{$}}
+	calll foo(,)
+# CHECK: calll foo{{$}}
 
 # CHECK: movl	%gs:8, %eax
 movl %gs:8, %eax

diff --git a/test/Makefile b/test/Makefile
index c78c256..38aba65 100644
--- a/test/Makefile
+++ b/test/Makefile

@@ -57,7 +57,7 @@
 
 # Force creation of Clang Tools' lit.site.cfg.
 clang-tools-site-cfg: FORCE
-	$(MAKE) -C $(PROJ_OBJ_DIR)/../tools/clang/tools/extra/test lit.site.cfg
+	$(MAKE) -C $(PROJ_OBJ_DIR)/../tools/clang/tools/extra/test lit.site.cfg Unit/lit.site.cfg
 extra-site-cfgs:: clang-tools-site-cfg
 endif
 
@@ -83,19 +83,15 @@
 
 # ulimits like these are redundantly enforced by the buildbots, so
 # just removing them here won't work.
-# Both AuroraUX & Solaris do not have the -m flag for ulimit
+# Solaris does not have the -m flag for ulimit
 ifeq ($(HOST_OS),SunOS)
 ULIMIT=ulimit -t 600 ; ulimit -d 512000 ; ulimit -v 512000 ;
 else # !SunOS
-ifeq ($(HOST_OS),AuroraUX)
-ULIMIT=ulimit -t 600 ; ulimit -d 512000 ; ulimit -v 512000 ;
-else # !AuroraUX
 # Newer versions of python try to allocate an insane amount of address space for
 # its thread-local storage, don't set a limit here.
 # When -v is not used, then -s has to be used to limit the stack size.
 # FIXME: Those limits should be enforced by lit instead of globally.
 ULIMIT=ulimit -t 600 ; ulimit -d 512000 ; ulimit -m 512000 ; ulimit -s 8192 ;
-endif # AuroraUX
 endif # SunOS
 
 check-local:: lit.site.cfg Unit/lit.site.cfg
@@ -112,11 +108,6 @@
 clean::
 	$(RM) -rf `find $(LLVM_OBJ_ROOT)/test -name Output -type d -print`
 
-ifneq ($(OCAMLOPT),)
-CC_FOR_OCAMLOPT := $(shell $(OCAMLOPT) -config | grep native_c_compiler | sed -e 's/native_c_compiler: //')
-CXX_FOR_OCAMLOPT := $(subst gcc,g++,$(CC_FOR_OCAMLOPT))
-endif
-
 FORCE:
 
 ifeq ($(DISABLE_ASSERTIONS),1)
@@ -132,11 +123,19 @@
 	@$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g >> lit.tmp
 	@$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> lit.tmp
 	@$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> lit.tmp
+	@$(ECHOPATH) s=@LIBDIR@=$(LibDir)=g >> lit.tmp
 	@$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> lit.tmp
 	@$(ECHOPATH) s=@SHLIBEXT@=$(SHLIBEXT)=g >> lit.tmp
 	@$(ECHOPATH) s=@EXEEXT@=$(EXEEXT)=g >> lit.tmp
 	@$(ECHOPATH) s=@PYTHON_EXECUTABLE@=$(PYTHON)=g >> lit.tmp
-	@$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -cclib -L$(LibDir) -I $(LibDir)/ocaml=g >> lit.tmp
+	@$(ECHOPATH) s=@OCAMLFIND@=$(OCAMLFIND)=g >> lit.tmp
+	@$(ECHOPATH) s!@OCAMLFLAGS@!$(addprefix -cclib ,$(LDFLAGS))!g >> lit.tmp
+	@$(ECHOPATH) s=@HAVE_OCAMLOPT@=$(HAVE_OCAMLOPT)=g >> lit.tmp
+	@$(ECHOPATH) s=@HAVE_OCAML_OUNIT@=$(HAVE_OCAML_OUNIT)=g >> lit.tmp
+	@$(ECHOPATH) s=@GO_EXECUTABLE@=$(GO)=g >> lit.tmp
+	@$(ECHOPATH) s!@HOST_CC@!$(CC)!g >> lit.tmp
+	@$(ECHOPATH) s!@HOST_CXX@!$(CXX)!g >> lit.tmp
+	@$(ECHOPATH) s!@HOST_LDFLAGS@!$(LDFLAGS)!g >> lit.tmp
 	@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp
 	@$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp
 	@$(ECHOPATH) s=@TARGETS_TO_BUILD@=$(TARGETS_TO_BUILD)=g >> lit.tmp

diff --git a/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml b/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml
new file mode 100644
index 0000000..6147025
--- /dev/null
+++ b/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml

@@ -0,0 +1,47 @@
+# RUN: yaml2obj -format=elf %s > %t
+# RUN: obj2yaml  %t | FileCheck %s
+
+# CHECK:      - Name:            .rela.text
+# CHECK-NEXT:   Type:            SHT_RELA
+# CHECK-NEXT:    Link:            .symtab
+# CHECK-NEXT:    AddressAlign:    0x0000000000000008
+# CHECK-NEXT:    Info:            .text
+# CHECK-NEXT:    Relocations:     
+# CHECK-NEXT:      - Offset:          0x0000000000000000
+# CHECK-NEXT:        Symbol:          main
+# CHECK-NEXT:        Type:            R_AARCH64_ABS64
+# CHECK-NEXT:        Addend:          0
+
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_AARCH64
+Sections:
+  - Type:            SHT_PROGBITS
+    Name:            .text
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x04
+    Content:         0000000000000000
+  - Type:            SHT_RELA
+    Name:            .rela.text
+    Link:            .symtab
+    Info:            .text
+    AddressAlign:    0x08
+    Relocations:
+      - Offset:          0
+        Symbol:          main
+        Type:            R_AARCH64_ABS64
+        Addend:          0
+
+Symbols:
+  Local:
+    - Name:            .text
+      Type:            STT_SECTION
+      Section:         .text
+
+  Global:
+    - Name:            main
+      Type:            STT_FUNC
+      Section:         .text
+      Size:            0x08

diff --git a/test/Object/ARM/macho-data-in-code.test b/test/Object/ARM/macho-data-in-code.test
index dca084c..2bfb6c1 100644
--- a/test/Object/ARM/macho-data-in-code.test
+++ b/test/Object/ARM/macho-data-in-code.test

@@ -3,5 +3,5 @@
 CHECK:      12:	80 bd                                        	pop	{r7, pc}
 
 CHECK:      14:	38 00 00 00                                  	.long 56	@ KIND_DATA
-CHECK:      16:	00 00                                        	movs	r0, r0
+CHECK:      18:	70 47                                        	bx	lr
 

diff --git a/test/Object/Inputs/COFF/long-section-name.yaml b/test/Object/Inputs/COFF/long-section-name.yaml
new file mode 100644
index 0000000..a86f901
--- /dev/null
+++ b/test/Object/Inputs/COFF/long-section-name.yaml

@@ -0,0 +1,11 @@
+---
+header:
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [ IMAGE_FILE_RELOCS_STRIPPED, IMAGE_FILE_LINE_NUMS_STRIPPED, IMAGE_FILE_LOCAL_SYMS_STRIPPED, IMAGE_FILE_32BIT_MACHINE ]
+sections:
+  - Name:            .long_section_name
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     ''
+symbols:
+...

diff --git a/test/Object/Inputs/COFF/section-aux-symbol.yaml b/test/Object/Inputs/COFF/section-aux-symbol.yaml
new file mode 100644
index 0000000..623af55
--- /dev/null
+++ b/test/Object/Inputs/COFF/section-aux-symbol.yaml

@@ -0,0 +1,167 @@
+---
+header:          
+  Machine:         IMAGE_FILE_MACHINE_I386
+  Characteristics: [ IMAGE_FILE_RELOCS_STRIPPED, IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_LINE_NUMS_STRIPPED, IMAGE_FILE_32BIT_MACHINE, IMAGE_FILE_DEBUG_STRIPPED ]
+sections:        
+  - Name:            .CRT
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       4
+    SectionData:     0000000030114000000000000000000010104000401640000000000000000000B015400060154000000000000000000000000000
+symbols:         
+  - Name:            '.CRT$XCAA'
+    Value:           4
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 1
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XIAA'
+    Value:           16
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 1
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XLD'
+    Value:           36
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 1
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XLC'
+    Value:           32
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 1
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XDZ'
+    Value:           48
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XDA'
+    Value:           44
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XLZ'
+    Value:           40
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XLA'
+    Value:           28
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XIC'
+    Value:           20
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 1
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XCZ'
+    Value:           8
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XCA'
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XIZ'
+    Value:           24
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+  - Name:            '.CRT$XIA'
+    Value:           12
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition: 
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+...

diff --git a/test/Object/Inputs/macho-archive-unsorted-x86_64.a b/test/Object/Inputs/macho-archive-unsorted-x86_64.a
new file mode 100644
index 0000000..6a2b570
--- /dev/null
+++ b/test/Object/Inputs/macho-archive-unsorted-x86_64.a
Binary files differ

diff --git a/test/Object/Inputs/macho-hello-g.macho-x86_64 b/test/Object/Inputs/macho-hello-g.macho-x86_64
new file mode 100755
index 0000000..41be03a
--- /dev/null
+++ b/test/Object/Inputs/macho-hello-g.macho-x86_64
Binary files differ

diff --git a/test/Object/Inputs/mri-crlf.mri b/test/Object/Inputs/mri-crlf.mri
new file mode 100644
index 0000000..b854030
--- /dev/null
+++ b/test/Object/Inputs/mri-crlf.mri

@@ -0,0 +1,2 @@
+; this file intentionally has crlf line endings

+end


diff --git a/test/Object/Inputs/trivial-label-test.elf-x86-64 b/test/Object/Inputs/trivial-label-test.elf-x86-64
new file mode 100644
index 0000000..76f4499
--- /dev/null
+++ b/test/Object/Inputs/trivial-label-test.elf-x86-64
Binary files differ

diff --git a/test/Object/Inputs/trivial-object-test.elf-mipsel b/test/Object/Inputs/trivial-object-test.elf-mipsel
index 2910a16..e72e02b 100644
--- a/test/Object/Inputs/trivial-object-test.elf-mipsel
+++ b/test/Object/Inputs/trivial-object-test.elf-mipsel
Binary files differ

diff --git a/test/Object/Inputs/unwind-section.elf-x86-64 b/test/Object/Inputs/unwind-section.elf-x86-64
new file mode 100644
index 0000000..3a84508
--- /dev/null
+++ b/test/Object/Inputs/unwind-section.elf-x86-64
Binary files differ

diff --git a/test/Object/X86/nm-ir.ll b/test/Object/X86/nm-ir.ll
index 6bb7e23..881397c 100644
--- a/test/Object/X86/nm-ir.ll
+++ b/test/Object/X86/nm-ir.ll

@@ -28,7 +28,7 @@
 @g4 = private global i32 42
 
 @a1 = alias i32* @g1
-@a2 = alias internal i32* @g1
+@a2 = internal alias i32* @g1
 
 define void @f1() {
   ret void

diff --git a/test/Object/X86/objdump-cfg-invalid-opcode.yaml b/test/Object/X86/objdump-cfg-invalid-opcode.yaml
deleted file mode 100644
index d0a29be..0000000
--- a/test/Object/X86/objdump-cfg-invalid-opcode.yaml
+++ /dev/null

@@ -1,58 +0,0 @@
-# RUN: yaml2obj -format=elf %s | llvm-objdump -d -yaml-cfg=%t - && FileCheck --check-prefix=CFG < %t %s
-# REQUIRES: shell
-#
-# Generated from:
-# main:
-# .LBL0_1:
-# 	movq	8(%rsi), %rax
-# 	<invalid opcode: 06>
-# 	nop
-
-!ELF
-FileHeader:
-  Class: ELFCLASS64
-  Data: ELFDATA2LSB
-  Type: ET_REL
-  Machine: EM_X86_64
-Sections:
-  - Name: .text
-    Type: SHT_PROGBITS
-    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
-    Content: "488B46080690"
-
-## 0000000000000000 <main>:
-
-#CFG: Atoms:
-#CFG:   - StartAddress:    0x0000000000000000
-#CFG:     Size:            4
-#CFG:     Type:            Text
-
-##    0:   48 8b 46 08             mov    0x8(%rsi),%rax
-#CFG:       - Inst:            MOV64rm
-#CFG:         Size:            4
-#CFG:         Ops:             [ RRAX, RRSI, I1, R, I8, R ]
-
-
-#CFG:   - StartAddress:    0x0000000000000004
-#CFG:     Size:            1
-#CFG:     Type:            Data
-
-##    4:   06                      (bad)
-#CFG:     Content:         '06'
-
-#CFG:   - StartAddress:    0x0000000000000005
-#CFG:     Size:            1
-#CFG:     Type:            Text
-
-##    5:   90                      nop
-#CFG:       - Inst:            NOOP
-#CFG:         Size:            1
-#CFG:         Ops:             [  ]
-
-Symbols:
-  Global:
-    - Name: main
-      Type: STT_FUNC
-      Section: .text
-      Value: 0x0
-      Size: 6

diff --git a/test/Object/X86/objdump-cfg-textatomsize.yaml b/test/Object/X86/objdump-cfg-textatomsize.yaml
deleted file mode 100644
index 87cb4e1..0000000
--- a/test/Object/X86/objdump-cfg-textatomsize.yaml
+++ /dev/null

@@ -1,39 +0,0 @@
-# RUN: yaml2obj -format=elf %s | llvm-objdump -d -yaml-cfg=%t - && FileCheck --check-prefix=CFG < %t %s
-# REQUIRES: shell
-#
-# Generated from:
-# main:
-# .LBL0_1:
-# 	jmp	.LBL0_1
-#
-
-!ELF
-FileHeader:
-  Class: ELFCLASS64
-  Data: ELFDATA2LSB
-  Type: ET_REL
-  Machine: EM_X86_64
-Sections:
-  - Name: .text
-    Type: SHT_PROGBITS
-    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
-    Content: "EBFE"
-
-## 0000000000000000 <main>:
-
-#CFG: Atoms:
-#CFG:   - StartAddress:    0x0000000000000000
-#CFG:     Size:            2
-
-##    0:   eb fe          jmp $-2
-#CFG:       - Inst:            JMP_1
-#CFG:         Size:            2
-#CFG:         Ops:             [ I-2 ]
-
-Symbols:
-  Global:
-    - Name: main
-      Type: STT_FUNC
-      Section: .text
-      Value: 0x0
-      Size: 2

diff --git a/test/Object/X86/objdump-cfg.yaml b/test/Object/X86/objdump-cfg.yaml
deleted file mode 100644
index c5bff03..0000000
--- a/test/Object/X86/objdump-cfg.yaml
+++ /dev/null

@@ -1,86 +0,0 @@
-# RUN: yaml2obj -format=elf %s | llvm-objdump -d -yaml-cfg=%t - && FileCheck --check-prefix=CFG < %t %s
-# REQUIRES: shell
-#
-# Generated from:
-# main:
-# 	movl	$48, %eax
-# 	cmpl	$3, %edi
-# 	jl	.LBB0_2
-# 	movq	8(%rsi), %rax
-# 	movsbl	(%rax), %eax
-# .LBB0_2:
-# 	ret
-#
-
-!ELF
-FileHeader:
-  Class: ELFCLASS64
-  Data: ELFDATA2LSB
-  Type: ET_REL
-  Machine: EM_X86_64
-Sections:
-  - Name: .text
-    Type: SHT_PROGBITS
-    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
-    Content: "B83000000083FF037C07488B46080FBE00C3"
-
-## 0000000000000000 <main>:
-
-#CFG: Atoms:
-#CFG:   - StartAddress:    0x0000000000000000
-#CFG:     Size:            10
-
-##    0:   b8 30 00 00 00          mov    $0x30,%eax
-#CFG:       - Inst:            MOV32ri
-#CFG:         Size:            5
-#CFG:         Ops:             [ REAX, I48 ]
-
-##    5:   83 ff 03                cmp    $0x3,%edi
-#CFG:       - Inst:            CMP32ri8
-#CFG:         Size:            3
-#CFG:         Ops:             [ REDI, I3 ]
-
-##    8:   7c 07                   jl     11 <main+0x11>
-#CFG:       - Inst:            JL_1
-#CFG:         Size:            2
-#CFG:         Ops:             [ I7 ]
-
-#CFG:   - StartAddress:    0x000000000000000A
-#CFG:     Size:            7
-
-##    a:   48 8b 46 08             mov    0x8(%rsi),%rax
-#CFG:       - Inst:            MOV64rm
-#CFG:         Size:            4
-#CFG:         Ops:             [ RRAX, RRSI, I1, R, I8, R ]
-
-##    e:   0f be 00                movsbl (%rax),%eax
-#CFG:       - Inst:            MOVSX32rm8
-#CFG:         Size:            3
-#CFG:         Ops:             [ REAX, RRAX, I1, R, I0, R ]
-#CFG:   - StartAddress:    0x0000000000000011
-#CFG:     Size:            1
-
-##   11:   c3                      retq
-#CFG:       - Inst:            RET
-#CFG:         Size:            1
-#CFG:         Ops:             [  ]
-
-Symbols:
-  Global:
-    - Name: main
-      Type: STT_FUNC
-      Section: .text
-      Value: 0x0
-      Size: 18
-
-#CFG: Functions:
-#CFG:     BasicBlocks:
-#CFG:       - Address:         0x0000000000000000
-#CFG:         Preds:           [  ]
-#CFG:         Succs:           [ 0x0000000000000011, 0x000000000000000A ]
-#CFG:       - Address:         0x0000000000000011
-#CFG:         Preds:           [ 0x0000000000000000, 0x000000000000000A ]
-#CFG:         Succs:           [  ]
-#CFG:       - Address:         0x000000000000000A
-#CFG:         Preds:           [ 0x0000000000000000 ]
-#CFG:         Succs:           [ 0x0000000000000011 ]

diff --git a/test/Object/X86/objdump-disassembly-inline-relocations.test b/test/Object/X86/objdump-disassembly-inline-relocations.test
index 7861576..3871bcb 100644
--- a/test/Object/X86/objdump-disassembly-inline-relocations.test
+++ b/test/Object/X86/objdump-disassembly-inline-relocations.test

@@ -76,11 +76,11 @@
 ELF-i386:        0:     83 ec 0c                                        subl    $12, %esp
 ELF-i386:        3:     c7 44 24 08 00 00 00 00                         movl    $0, 8(%esp)
 ELF-i386:        b:     c7 04 24 00 00 00 00                            movl    $0, (%esp)
-ELF-i386:                              e: R_386_32      Unknown
+ELF-i386:                              e: R_386_32      .rodata.str1.1
 ELF-i386:       12:     e8 fc ff ff ff                                  calll   -4
-ELF-i386:                             13: R_386_PC32    Unknown
+ELF-i386:                             13: R_386_PC32    puts
 ELF-i386:       17:     e8 fc ff ff ff                                  calll   -4
-ELF-i386:                             18: R_386_PC32    Unknown
+ELF-i386:                             18: R_386_PC32    SomeOtherFunction
 ELF-i386:       1c:     8b 44 24 08                                     movl    8(%esp), %eax
 ELF-i386:       20:     83 c4 0c                                        addl    $12, %esp
 ELF-i386:       23:     c3                                              ret

diff --git a/test/Object/X86/objdump-disassembly-symbolic.test b/test/Object/X86/objdump-disassembly-symbolic.test
deleted file mode 100644
index 95a5fc8..0000000
--- a/test/Object/X86/objdump-disassembly-symbolic.test
+++ /dev/null

@@ -1,68 +0,0 @@
-RUN: llvm-objdump -d -symbolize %p/../Inputs/trivial-object-test.elf-x86-64 \
-RUN:              | FileCheck %s -check-prefix ELF-x86-64
-RUN: llvm-objdump -d -symbolize %p/../Inputs/trivial-object-test.macho-x86-64 \
-RUN:              | FileCheck %s -check-prefix MACHO-x86-64
-
-# Generate this using:
-#   ld trivial-object-test.macho-x86-64 -undefined dynamic_lookup
-RUN: llvm-objdump -d -symbolize %p/../Inputs/trivial-executable-test.macho-x86-64 \
-RUN:              | FileCheck %s -check-prefix MACHO-STUBS-x86-64
-
-ELF-x86-64: file format ELF64-x86-64
-ELF-x86-64: Disassembly of section .text:
-ELF-x86-64: main:
-ELF-x86-64:        0:	48 83 ec 08                                  	subq	$8, %rsp
-ELF-x86-64:        4:	c7 44 24 04 00 00 00 00                      	movl	$0, 4(%rsp)
-ELF-x86-64:        c:	bf 00 00 00 00                               	movl	$.rodata.str1.1, %edi
-ELF-x86-64:       11:	e8 00 00 00 00                               	callq	puts-4
-ELF-x86-64:       16:	30 c0                                        	xorb	%al, %al
-ELF-x86-64:       18:	e8 00 00 00 00                               	callq	SomeOtherFunction-4
-ELF-x86-64:       1d:	8b 44 24 04                                  	movl	4(%rsp), %eax
-ELF-x86-64:       21:	48 83 c4 08                                  	addq	$8, %rsp
-ELF-x86-64:       25:	c3                                           	ret
-
-MACHO-x86-64: file format Mach-O 64-bit x86-64
-MACHO-x86-64: Disassembly of section __TEXT,__text:
-MACHO-x86-64: _main:
-MACHO-x86-64:        0:	48 83 ec 08                                  	subq	$8, %rsp
-MACHO-x86-64:        4:	c7 44 24 04 00 00 00 00                      	movl	$0, 4(%rsp)
-MACHO-x86-64:        c:	48 8d 3d 00 00 00 00                         	leaq	L_.str(%rip), %rdi ## literal pool for: Hello World!
-MACHO-x86-64:       13:	e8 00 00 00 00                               	callq	_puts
-MACHO-x86-64:       18:	30 c0                                        	xorb	%al, %al
-MACHO-x86-64:       1a:	e8 00 00 00 00                               	callq	_SomeOtherFunction
-MACHO-x86-64:       1f:	8b 44 24 04                                  	movl	4(%rsp), %eax
-MACHO-x86-64:       23:	48 83 c4 08                                  	addq	$8, %rsp
-MACHO-x86-64:       27:	c3                                           	ret
-
-MACHO-STUBS-x86-64: file format Mach-O 64-bit x86-64
-MACHO-STUBS-x86-64: Disassembly of section __TEXT,__text:
-MACHO-STUBS-x86-64: _main:
-MACHO-STUBS-x86-64:     1f90:       48 83 ec 08                                     subq    $8, %rsp
-MACHO-STUBS-x86-64:     1f94:       c7 44 24 04 00 00 00 00                         movl    $0, 4(%rsp)
-MACHO-STUBS-x86-64:     1f9c:       48 8d 3d 45 00 00 00                            leaq    69(%rip), %rdi ## literal pool for: Hello World!
-MACHO-STUBS-x86-64:     1fa3:       e8 16 00 00 00                                  callq   puts
-MACHO-STUBS-x86-64:     1fa8:       30 c0                                           xorb    %al, %al
-MACHO-STUBS-x86-64:     1faa:       e8 09 00 00 00                                  callq   SomeOtherFunction
-MACHO-STUBS-x86-64:     1faf:       8b 44 24 04                                     movl    4(%rsp), %eax
-MACHO-STUBS-x86-64:     1fb3:       48 83 c4 08                                     addq    $8, %rsp
-MACHO-STUBS-x86-64:     1fb7:       c3                                              ret
-
-
-RUN: llvm-objdump -d -symbolize %p/../Inputs/relocation-relocatable.elf-i386 \
-RUN:              | FileCheck %s -check-prefix ELF-i386-REL
-
-ELF-i386-REL: Disassembly of section .text:
-ELF-i386-REL-NEXT: f:
-ELF-i386-REL-NEXT:       0:	e9 fc ff ff ff                	jmp	h
-ELF-i386-REL:      g:
-ELF-i386-REL-NEXT:       5:	e9 fc ff ff ff                 	jmp	f
-
-
-RUN: llvm-objdump -d -symbolize %p/../Inputs/relocation-dynamic.elf-i386 \
-RUN:              | FileCheck %s -check-prefix ELF-i386-DYN
-
-ELF-i386-DYN: Disassembly of section .text:
-ELF-i386-DYN-NEXT: f:
-ELF-i386-DYN-NEXT:      1a4:	e9 fc ff ff ff                 	jmp	h
-ELF-i386-DYN:      g:
-ELF-i386-DYN-NEXT:      1a9:	e9 fc ff ff ff                 	jmp	f

diff --git a/test/Object/X86/objdump-label.test b/test/Object/X86/objdump-label.test
new file mode 100644
index 0000000..f8b9334
--- /dev/null
+++ b/test/Object/X86/objdump-label.test

@@ -0,0 +1,10 @@
+RUN: llvm-objdump -d %p/../Inputs/trivial-label-test.elf-x86-64 \
+RUN:              | FileCheck %s -check-prefix ELF-x86-64
+
+ELF-x86-64: file format ELF64-x86-64
+ELF-x86-64: Disassembly of section .text:
+ELF-x86-64: foo:
+ELF-x86-64:        0:	90                                           	nop
+ELF-x86-64: bum:
+ELF-x86-64:        1:	90                                           	nop
+

diff --git a/test/Object/archive-error-tmp.txt b/test/Object/archive-error-tmp.txt
index 0618986..ed3b145 100644
--- a/test/Object/archive-error-tmp.txt
+++ b/test/Object/archive-error-tmp.txt

@@ -1,5 +1,3 @@
-REQUIRES: shell
-
 Test that no temporary file is left behind on error.
 
 RUN: rm -rf %t

diff --git a/test/Object/archive-symtab.test b/test/Object/archive-symtab.test
index 0899828..01f17bc 100644
--- a/test/Object/archive-symtab.test
+++ b/test/Object/archive-symtab.test

@@ -61,6 +61,7 @@
 RUN: llvm-nm -M %t.a | FileCheck %s
 
 RUN: llvm-nm -M %p/Inputs/macho-archive-x86_64.a | FileCheck %s --check-prefix=BSD-MachO
+RUN: llvm-nm -M %p/Inputs/macho-archive-unsorted-x86_64.a | FileCheck %s --check-prefix=BSD-MachO
 
 BSD-MachO: Archive map
 BSD-MachO: _bar in bar.o

diff --git a/test/Object/coff-archive-short.test b/test/Object/coff-archive-short.test
index 2aee956..9f7165b 100644
--- a/test/Object/coff-archive-short.test
+++ b/test/Object/coff-archive-short.test

@@ -5,7 +5,7 @@
 # than 15 characters, thus, unlike coff_archive.lib, it has no string
 # table as the third member.
 #
-RUN: llvm-nm --numeric-sort -M %p/Inputs/coff_archive_short.lib | FileCheck -check-prefix=CHECKIDX %s
+RUN: llvm-nm -a --numeric-sort -M %p/Inputs/coff_archive_short.lib | FileCheck -check-prefix=CHECKIDX %s
 
 CHECKIDX: Archive map
 CHECKIDX: _shortfn1 in short1.obj

diff --git a/test/Object/coff-archive.test b/test/Object/coff-archive.test
index 3b0aa0c..239a96b 100644
--- a/test/Object/coff-archive.test
+++ b/test/Object/coff-archive.test

@@ -1,7 +1,7 @@
 #
 # Check if the index is appearing properly in the output file 
 #
-RUN: llvm-nm --numeric-sort -M %p/Inputs/coff_archive.lib | FileCheck -check-prefix=CHECKIDX %s
+RUN: llvm-nm -a --numeric-sort -M %p/Inputs/coff_archive.lib | FileCheck -check-prefix=CHECKIDX %s
 
 CHECKIDX: Archive map
 CHECKIDX: ??0invalid_argument@std@@QAE@PBD@Z in Debug\mymath.obj

diff --git a/test/Object/mri-addlib.test b/test/Object/mri-addlib.test
new file mode 100644
index 0000000..745bcf6
--- /dev/null
+++ b/test/Object/mri-addlib.test

@@ -0,0 +1,14 @@
+; RUN: echo create %t.a > %t.mri
+; RUN: echo addlib %p/Inputs/GNU.a >> %t.mri
+; RUN: echo addlib %p/Inputs/archive-test.a-gnu-minimal >> %t.mri
+; RUN: echo save >> %t.mri
+; RUN: echo end >> %t.mri
+
+; RUN: llvm-ar -M  < %t.mri
+; RUN: llvm-ar t %t.a | FileCheck %s
+
+; CHECK: evenlen
+; CHECK-NEXT: oddlen
+; CHECK-NEXT: very_long_bytecode_file_name.bc
+; CHECK-NEXT: IsNAN.o
+; CHECK-NEXT: test

diff --git a/test/Object/mri-addmod.test b/test/Object/mri-addmod.test
new file mode 100644
index 0000000..f104848
--- /dev/null
+++ b/test/Object/mri-addmod.test

@@ -0,0 +1,33 @@
+; RUN: echo create %t.a > %t.mri
+; RUN: echo "addmod  \"%p/Inputs/trivial-object-test.elf-x86-64\" " >> %t.mri
+; RUN: echo save >> %t.mri
+; RUN: echo end >> %t.mri
+
+; RUN: llvm-ar -M  < %t.mri
+; RUN: llvm-nm -M %t.a | FileCheck %s
+
+; CHECK:      Archive map
+; CHECK-NEXT: main in trivial-object-test.elf-x86-64
+
+; CHECK:      trivial-object-test.elf-x86-64:
+; CHECK-NEXT:                  U SomeOtherFunction
+; CHECK-NEXT: 0000000000000000 T main
+; CHECK-NEXT:                  U puts
+
+; Now test that CREATE overwrites an existing file.
+; RUN: echo create %t.a > %t2.mri
+; RUN: echo addmod %p/Inputs/trivial-object-test2.elf-x86-64 >> %t2.mri
+; RUN: echo save >> %t2.mri
+; RUN: echo end >> %t2.mri
+
+; RUN: llvm-ar -M  < %t2.mri
+; RUN: llvm-nm -M %t.a | FileCheck --check-prefix=NEW %s
+
+; NEW: Archive map
+; NEW-NEXT: foo in trivial-object-test2.elf-x86-64
+; NEW-NEXT: main in trivial-object-test2.elf-x86-64
+
+; NEW: trivial-object-test2.elf-x86-64:
+; NEW-NEXT: 0000000000000000 t bar
+; NEW-NEXT: 0000000000000006 T foo
+; NEW-NEXT: 0000000000000016 T main

diff --git a/test/Object/mri-crlf.test b/test/Object/mri-crlf.test
new file mode 100644
index 0000000..3411b55
--- /dev/null
+++ b/test/Object/mri-crlf.test

@@ -0,0 +1 @@
+; RUN: llvm-ar -M  < %S/Inputs/mri-crlf.mri

diff --git a/test/Object/mri1.test b/test/Object/mri1.test
new file mode 100644
index 0000000..3d27db7
--- /dev/null
+++ b/test/Object/mri1.test

@@ -0,0 +1,6 @@
+; RUN: echo create %t.a > %t.mri
+; RUN: echo save >> %t.mri
+; RUN: echo end >> %t.mri
+
+; RUN: llvm-ar -M  < %t.mri
+; RUN: llvm-ar t %t.a

diff --git a/test/Object/mri2.test b/test/Object/mri2.test
new file mode 100644
index 0000000..0c24179
--- /dev/null
+++ b/test/Object/mri2.test

@@ -0,0 +1,7 @@
+; RUN: echo create %t.a > %t.mri
+; RUN: echo create %t.a >> %t.mri
+; RUN: echo save >> %t.mri
+; RUN: echo end >> %t.mri
+
+; RUN: not llvm-ar -M  < %t.mri 2>&1 | FileCheck %s
+; CHECK: Editing multiple archives not supported

diff --git a/test/Object/mri3.test b/test/Object/mri3.test
new file mode 100644
index 0000000..bdc5399
--- /dev/null
+++ b/test/Object/mri3.test

@@ -0,0 +1,6 @@
+; RUN: echo save > %t.mri
+; RUN: echo create %t.a >> %t.mri
+; RUN: echo end >> %t.mri
+
+; RUN: not llvm-ar -M  < %t.mri 2>&1 | FileCheck %s
+; CHECK: File already saved.

diff --git a/test/Object/mri4.test b/test/Object/mri4.test
new file mode 100644
index 0000000..a24c14d
--- /dev/null
+++ b/test/Object/mri4.test

@@ -0,0 +1,4 @@
+; RUN: echo abc > %t.mri
+
+; RUN: not llvm-ar -M  < %t.mri 2>&1 | FileCheck %s
+; CHECK: Unknown command: abc.

diff --git a/test/Object/mri5.test b/test/Object/mri5.test
new file mode 100644
index 0000000..9811424
--- /dev/null
+++ b/test/Object/mri5.test

@@ -0,0 +1,2 @@
+; RUN: not llvm-ar -M t < %s 2>&1 | FileCheck %s
+; CHECK: Cannot mix -M and other options.

diff --git a/test/Object/nm-archive.test b/test/Object/nm-archive.test
index 7dbc22a..a9ae9cb 100644
--- a/test/Object/nm-archive.test
+++ b/test/Object/nm-archive.test

@@ -1,4 +1,4 @@
-RUN: llvm-nm %p/Inputs/archive-test.a-coff-i386 \
+RUN: llvm-nm -a %p/Inputs/archive-test.a-coff-i386 \
 RUN:         | FileCheck %s -check-prefix COFF
 
 COFF: trivial-object-test.coff-i386:
@@ -9,6 +9,15 @@
 COFF-NEXT: 00000000 T _main
 COFF-NEXT:          U _puts
 
+RUN: llvm-nm -a -o %p/Inputs/archive-test.a-coff-i386 \
+RUN:         | FileCheck %s -check-prefix COFF-o
+
+COFF-o: {{.*}}/archive-test.a-coff-i386:trivial-object-test.coff-i386: 00000000 d .data
+COFF-o: {{.*}}/archive-test.a-coff-i386:trivial-object-test.coff-i386: 00000000 t .text
+COFF-o: {{.*}}/archive-test.a-coff-i386:trivial-object-test.coff-i386: 00000000 d L_.str
+COFF-o: {{.*}}/archive-test.a-coff-i386:trivial-object-test.coff-i386:          U _SomeOtherFunction
+COFF-o: {{.*}}/archive-test.a-coff-i386:trivial-object-test.coff-i386: 00000000 T _main
+COFF-o: {{.*}}/archive-test.a-coff-i386:trivial-object-test.coff-i386:          U _puts
 
 RUN: llvm-as %p/Inputs/trivial.ll -o=%t1
 RUN: rm -f %t2

diff --git a/test/Object/nm-trivial-object.test b/test/Object/nm-trivial-object.test
index 656d6b0..0135f2d 100644
--- a/test/Object/nm-trivial-object.test
+++ b/test/Object/nm-trivial-object.test

@@ -1,9 +1,13 @@
-RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-nm - \
+RUN: yaml2obj %p/Inputs/COFF/i386.yaml | llvm-nm -a -S - \
 RUN:         | FileCheck %s -check-prefix COFF
-RUN: yaml2obj %p/Inputs/COFF/x86-64.yaml | llvm-nm - \
+RUN: yaml2obj %p/Inputs/COFF/x86-64.yaml | llvm-nm -a -S - \
 RUN:         | FileCheck %s -check-prefix COFF
 RUN: llvm-nm %p/Inputs/trivial-object-test.elf-i386 \
 RUN:         | FileCheck %s -check-prefix ELF
+RUN: llvm-nm -o %p/Inputs/trivial-object-test.elf-i386 \
+RUN:         | FileCheck %s -check-prefix ELF-o
+RUN: llvm-nm -u %p/Inputs/trivial-object-test.elf-i386 \
+RUN:         | FileCheck %s -check-prefix ELF-u
 RUN: llvm-nm %p/Inputs/trivial-object-test.elf-x86-64 \
 RUN:         | FileCheck %s -check-prefix ELF64
 RUN: llvm-nm %p/Inputs/weak.elf-x86-64 \
@@ -22,7 +26,17 @@
 RUN:         | FileCheck %s -check-prefix macho-j
 RUN: llvm-nm -r %p/Inputs/macho-text-data-bss.macho-x86_64 \
 RUN:         | FileCheck %s -check-prefix macho-r
-RUN: llvm-nm %p/Inputs/common.coff-i386 \
+RUN: llvm-nm %p/Inputs/macho-text-data-bss.macho-x86_64 -s __DATA __data \
+RUN:         | FileCheck %s -check-prefix macho-s
+RUN: llvm-nm -x %p/Inputs/macho-text-data-bss.macho-x86_64 \
+RUN:         | FileCheck %s -check-prefix macho-x
+RUN: llvm-nm -o %p/Inputs/macho-text-data-bss.macho-x86_64 \
+RUN:         | FileCheck %s -check-prefix macho-o
+RUN: llvm-nm -p -a %p/Inputs/macho-hello-g.macho-x86_64 \
+RUN:         | FileCheck %s -check-prefix macho-pa
+RUN: llvm-nm -u %p/Inputs/macho-hello-g.macho-x86_64 \
+RUN:         | FileCheck %s -check-prefix macho-u
+RUN: llvm-nm -S -a %p/Inputs/common.coff-i386 \
 RUN:         | FileCheck %s -check-prefix COFF-COMMON
 RUN: llvm-nm %p/Inputs/relocatable-with-section-address.elf-x86-64 \
 RUN:         | FileCheck %s -check-prefix ELF-SEC-ADDR64
@@ -35,20 +49,20 @@
 REQUIRES: shell
 
 
-COFF: 00000000 d .data
-COFF: 00000000 t .text
-COFF: 00000000 d L{{_?}}.str
+COFF: 00000000 {{.*}} d .data
+COFF: 00000000 {{.*}} t .text
+COFF: 00000000 0000000d d L{{_?}}.str
 COFF:          U {{_?}}SomeOtherFunction
-COFF: 00000000 T {{_?}}main
+COFF: 00000000 {{.*}} T {{_?}}main
 COFF:          U {{_?}}puts
 
-COFF-COMMON: 00000000 b .bss
-COFF-COMMON-NEXT: 00000000 d .data
-COFF-COMMON-NEXT: 00000000 d .drectve
-COFF-COMMON-NEXT: 00000000 n .file
-COFF-COMMON-NEXT: 00000000 r .rdata$zzz
-COFF-COMMON-NEXT: 00000000 t .text
-COFF-COMMON-NEXT:          C _a
+COFF-COMMON: 00000000 00000000 b .bss
+COFF-COMMON-NEXT: 00000000 00000000 d .data
+COFF-COMMON-NEXT: 00000000 00000014 d .drectve
+COFF-COMMON-NEXT: 00000000 00000000 n .file
+COFF-COMMON-NEXT: 00000000 00000014 r .rdata$zzz
+COFF-COMMON-NEXT: 00000000 00000000 t .text
+COFF-COMMON-NEXT:          00000004 C _a
 
 
 ELF-NOT:      U
@@ -56,6 +70,13 @@
 ELF: 00000000 T main
 ELF:          U puts
 
+ELF-o: {{.*}}/trivial-object-test.elf-i386:          U SomeOtherFunction
+ELF-o: {{.*}}/trivial-object-test.elf-i386: 00000000 T main
+ELF-o: {{.*}}/trivial-object-test.elf-i386:          U puts
+
+ELF-u:          U SomeOtherFunction
+ELF-u:          U puts
+
 ELF64:                  U SomeOtherFunction
 ELF64: 0000000000000000 T main
 ELF64:                  U puts
@@ -99,6 +120,41 @@
 macho-r-NEXT: 0000000000000070 b _b
 macho-r-NEXT: 0000000000000030 s EH_frame0
 
+macho-s: 000000000000000c D _d
+macho-s-NOT: 0000000000000048 S _t.eh
+macho-s-NOT: 0000000000000000 T _t
+macho-s-NOT: 0000000000000070 b _b
+macho-s-NOT: 0000000000000030 s EH_frame0
+
+macho-x: 0000000000000030 0e 05 0000 00000010 EH_frame0
+macho-x: 0000000000000070 0e 03 0000 0000000d _b
+macho-x: 000000000000000c 0f 02 0000 00000004 _d
+macho-x: 0000000000000000 0f 01 0000 00000001 _t
+macho-x: 0000000000000048 0f 05 0000 00000007 _t.eh
+
+
+macho-o: {{.*}}/macho-text-data-bss.macho-x86_64: 0000000000000030 s EH_frame0
+macho-o: {{.*}}/macho-text-data-bss.macho-x86_64: 0000000000000070 b _b
+macho-o: {{.*}}/macho-text-data-bss.macho-x86_64: 000000000000000c D _d
+macho-o: {{.*}}/macho-text-data-bss.macho-x86_64: 0000000000000000 T _t
+macho-o: {{.*}}/macho-text-data-bss.macho-x86_64: 0000000000000048 S _t.eh
+
+macho-pa: 0000000000000000 - 00 0000    SO /Volumes/SandBox/
+macho-pa: 0000000000000000 - 00 0000    SO hello.c
+macho-pa: 0000000053c8408d - 03 0001   OSO /Volumes/SandBox/hello.o
+macho-pa: 0000000100000f30 - 01 0000 BNSYM 
+macho-pa: 0000000100000f30 - 01 0000   FUN _main
+macho-pa: 000000000000003b - 00 0000   FUN 
+macho-pa: 000000000000003b - 01 0000 ENSYM 
+macho-pa: 0000000000000000 - 01 0000    SO 
+macho-pa: 0000000100000000 T __mh_execute_header
+macho-pa: 0000000100000f30 T _main
+macho-pa:                  U _printf
+macho-pa:                  U dyld_stub_binder
+
+macho-u: _printf
+macho-u: dyld_stub_binder
+
 Test that nm uses addresses even with ELF .o files.
 ELF-SEC-ADDR64:      0000000000000058 D a
 ELF-SEC-ADDR64-NEXT: 000000000000005c D b

diff --git a/test/Object/nm-universal-binary.test b/test/Object/nm-universal-binary.test
index 889377b..0cced18 100644
--- a/test/Object/nm-universal-binary.test
+++ b/test/Object/nm-universal-binary.test

@@ -2,10 +2,16 @@
 RUN:         | FileCheck %s -check-prefix CHECK-OBJ
 RUN: llvm-nm -arch x86_64 %p/Inputs/macho-universal.x86_64.i386 \
 RUN:         | FileCheck %s -check-prefix CHECK-OBJ-x86_64
+RUN: not llvm-nm -arch armv7m %p/Inputs/macho-universal.x86_64.i386 2>&1 \
+RUN:         | FileCheck %s -check-prefix CHECK-OBJ-armv7m
+RUN: not llvm-nm -arch foobar %p/Inputs/macho-universal.x86_64.i386 2>&1 \
+RUN:         | FileCheck %s -check-prefix CHECK-OBJ-foobar
 RUN: llvm-nm -arch all %p/Inputs/macho-universal-archive.x86_64.i386 \
 RUN:         | FileCheck %s -check-prefix CHECK-AR
 RUN: llvm-nm -arch i386 %p/Inputs/macho-universal-archive.x86_64.i386 \
 RUN:         | FileCheck %s -check-prefix CHECK-AR-i386
+RUN: llvm-nm -o -arch all %p/Inputs/macho-universal-archive.x86_64.i386 \
+RUN:         | FileCheck %s -check-prefix CHECK-AR-o
 
 CHECK-OBJ: macho-universal.x86_64.i386 (for architecture x86_64):
 CHECK-OBJ: 0000000100000f60 T _main
@@ -16,6 +22,12 @@
 CHECK-OBJ-x86_64: 0000000100000f60 T _main
 CHECK-OBJ-x86_64:                  U dyld_stub_binder
 
+CHECK-OBJ-armv7m-NOT: Unknown architecture named
+CHECK-OBJ-armv7m: does not contain architecture
+
+CHECK-OBJ-foobar: Unknown architecture named
+CHECK-OBJ-foobar: does not contain architecture
+
 CHECK-AR: macho-universal-archive.x86_64.i386(hello.o) (for architecture x86_64):
 CHECK-AR: 0000000000000068 s EH_frame0
 CHECK-AR: 000000000000003b s L_.str
@@ -29,3 +41,11 @@
 CHECK-AR-i386: macho-universal-archive.x86_64.i386(foo.o):
 CHECK-AR-i386: 00000008 D _bar
 CHECK-AR-i386: 00000000 T _foo
+
+CHECK-AR-o: (for architecture x86_64):{{.*}}/macho-universal-archive.x86_64.i386:hello.o: 0000000000000068 s EH_frame0
+CHECK-AR-o: (for architecture x86_64):{{.*}}/macho-universal-archive.x86_64.i386:hello.o: 000000000000003b s L_.str
+CHECK-AR-o: (for architecture x86_64):{{.*}}/macho-universal-archive.x86_64.i386:hello.o: 0000000000000000 T _main
+CHECK-AR-o: (for architecture x86_64):{{.*}}/macho-universal-archive.x86_64.i386:hello.o: 0000000000000080 S _main.eh
+CHECK-AR-o: (for architecture x86_64):{{.*}}/macho-universal-archive.x86_64.i386:hello.o:                  U _printf
+CHECK-AR-o: (for architecture i386):{{.*}}/macho-universal-archive.x86_64.i386:foo.o: 00000008 D _bar
+CHECK-AR-o: (for architecture i386):{{.*}}/macho-universal-archive.x86_64.i386:foo.o: 00000000 T _foo

diff --git a/test/Object/obj2yaml-coff-long-section-name.test b/test/Object/obj2yaml-coff-long-section-name.test
new file mode 100644
index 0000000..5457aef
--- /dev/null
+++ b/test/Object/obj2yaml-coff-long-section-name.test

@@ -0,0 +1,3 @@
+RUN: yaml2obj %p/Inputs/COFF/long-section-name.yaml | obj2yaml | FileCheck %s --check-prefix COFF-I386
+
+COFF-I386: Name:            .long_section_name

diff --git a/test/Object/obj2yaml-coff-section-aux-symbol.test b/test/Object/obj2yaml-coff-section-aux-symbol.test
new file mode 100644
index 0000000..55ce5f0
--- /dev/null
+++ b/test/Object/obj2yaml-coff-section-aux-symbol.test

@@ -0,0 +1,96 @@
+RUN: yaml2obj %p/Inputs/COFF/section-aux-symbol.yaml | obj2yaml | FileCheck %s --check-prefix COFF-I386
+
+COFF-I386:     sections:
+COFF-I386-NEXT:  - Name:            .CRT
+COFF-I386:     symbols:
+COFF-I386:       - Name:            '.CRT$XCAA'
+COFF-I386-NEXT:    Value:           4
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 1
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XIAA'
+COFF-I386-NEXT:    Value:           16
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 1
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XLD'
+COFF-I386-NEXT:    Value:           36
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 1
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XLC'
+COFF-I386-NEXT:    Value:           32
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 1
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XDZ'
+COFF-I386-NEXT:    Value:           48
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 0
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XDA'
+COFF-I386-NEXT:    Value:           44
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 0
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XLZ'
+COFF-I386-NEXT:    Value:           40
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 0
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XLA'
+COFF-I386-NEXT:    Value:           28
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 0
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XIC'
+COFF-I386-NEXT:    Value:           20
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 1
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XCZ'
+COFF-I386-NEXT:    Value:           8
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 0
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XCA'
+COFF-I386-NEXT:    Value:           0
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 0
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XIZ'
+COFF-I386-NEXT:    Value:           24
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 0
+COFF-I386-NEXT:      NumberOfLinenumbers: 0
+COFF-I386:       - Name:            '.CRT$XIA'
+COFF-I386-NEXT:    Value:           12
+COFF-I386:         StorageClass:    IMAGE_SYM_CLASS_STATIC
+COFF-I386-NEXT:    SectionDefinition:
+COFF-I386-NEXT:      Length:          4
+COFF-I386-NEXT:      NumberOfRelocations: 0
+COFF-I386-NEXT:      NumberOfLinenumbers: 0

diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index 98b40d5..1c79e98 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test

@@ -3,6 +3,8 @@
 RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mipsel | FileCheck %s --check-prefix ELF-MIPSEL
 RUN: obj2yaml %p/Inputs/trivial-object-test.elf-mips64el | FileCheck %s --check-prefix ELF-MIPS64EL
 RUN: obj2yaml %p/Inputs/trivial-object-test.elf-x86-64 | FileCheck %s --check-prefix ELF-X86-64
+RUN: obj2yaml %p/Inputs/unwind-section.elf-x86-64 \
+RUN:   | FileCheck %s --check-prefix ELF-X86-64-UNWIND
 
 COFF-I386: header:
 COFF-I386-NEXT:  Machine: IMAGE_FILE_MACHINE_I386
@@ -189,6 +191,7 @@
 ELF-MIPSEL:      FileHeader:
 ELF-MIPSEL-NEXT:   Class:           ELFCLASS32
 ELF-MIPSEL-NEXT:   Data:            ELFDATA2LSB
+ELF-MIPSEL-NEXT:   OSABI:           ELFOSABI_GNU
 ELF-MIPSEL-NEXT:   Type:            ET_REL
 ELF-MIPSEL-NEXT:   Machine:         EM_MIPS
 ELF-MIPSEL-NEXT:   Flags:           [ EF_MIPS_NOREORDER, EF_MIPS_PIC, EF_MIPS_CPIC, EF_MIPS_ABI_O32, EF_MIPS_ARCH_32 ]
@@ -204,7 +207,7 @@
 ELF-MIPSEL-NEXT:     AddressAlign:    0x0000000000000004
 ELF-MIPSEL-NEXT:     Info:            .text
 ELF-MIPSEL-NEXT:     Relocations:
-ELF-MIPSEL-NEXT:       - Offset:          0
+ELF-MIPSEL-NEXT:       - Offset:          0x0000000000000000
 ELF-MIPSEL-NEXT:         Symbol:          _gp_disp
 ELF-MIPSEL-NEXT:         Type:            R_MIPS_HI16
 ELF-MIPSEL-NEXT:         Addend:          0
@@ -237,6 +240,10 @@
 ELF-MIPSEL-NEXT:     Type:            SHT_NOBITS
 ELF-MIPSEL-NEXT:     Flags:           [ SHF_WRITE, SHF_ALLOC ]
 ELF-MIPSEL-NEXT:     AddressAlign:    0x0000000000000004
+ELF-MIPSEL-NEXT:     Content:         48656C6C
+ELF-MIPSEL-NEXT:   - Name:            .mdebug.abi32
+ELF-MIPSEL-NEXT:     Type:            SHT_PROGBITS
+ELF-MIPSEL-NEXT:     AddressAlign:    0x0000000000000001
 ELF-MIPSEL-NEXT:     Content:         ''
 ELF-MIPSEL-NEXT:   - Name:            .rodata.str1.1
 ELF-MIPSEL-NEXT:     Type:            SHT_PROGBITS
@@ -248,6 +255,11 @@
 ELF-MIPSEL-NEXT:     Flags:           [ SHF_ALLOC ]
 ELF-MIPSEL-NEXT:     AddressAlign:    0x0000000000000001
 ELF-MIPSEL-NEXT:     Content:         '000000000000000000000000000000000000000000000000'
+ELF-MIPSEL-NEXT:   - Name:            .MIPS.abiflags
+ELF-MIPSEL-NEXT:     Type:            SHT_MIPS_ABIFLAGS
+ELF-MIPSEL-NEXT:     Flags:           [ SHF_ALLOC ]
+ELF-MIPSEL-NEXT:     AddressAlign:    0x0000000000000008
+ELF-MIPSEL-NEXT:     Content:         '000020010101000100000000000000000100000000000000'
 ELF-MIPSEL-NEXT: Symbols:
 ELF-MIPSEL-NEXT:   Local:
 ELF-MIPSEL-NEXT:     - Name:            trivial.ll
@@ -265,17 +277,27 @@
 ELF-MIPSEL-NEXT:     - Name:            .bss
 ELF-MIPSEL-NEXT:       Type:            STT_SECTION
 ELF-MIPSEL-NEXT:       Section:         .bss
+ELF-MIPSEL-NEXT:     - Name:            .mdebug.abi32
+ELF-MIPSEL-NEXT:       Type:            STT_SECTION
+ELF-MIPSEL-NEXT:       Section:         .mdebug.abi32
 ELF-MIPSEL-NEXT:     - Name:            .rodata.str1.1
 ELF-MIPSEL-NEXT:       Type:            STT_SECTION
 ELF-MIPSEL-NEXT:       Section:         .rodata.str1.1
 ELF-MIPSEL-NEXT:     - Name:            .reginfo
 ELF-MIPSEL-NEXT:       Type:            STT_SECTION
 ELF-MIPSEL-NEXT:       Section:         .reginfo
+ELF-MIPSEL-NEXT:     - Name:            .MIPS.abiflags
+ELF-MIPSEL-NEXT:       Type:            STT_SECTION
+ELF-MIPSEL-NEXT:       Section:         .MIPS.abiflags
 ELF-MIPSEL-NEXT:   Global:
 ELF-MIPSEL-NEXT:     - Name:            main
 ELF-MIPSEL-NEXT:       Type:            STT_FUNC
 ELF-MIPSEL-NEXT:       Section:         .text
 ELF-MIPSEL-NEXT:       Size:            0x000000000000004C
+ELF-MIPSEL-NEXT:     - Name:            var
+ELF-MIPSEL-NEXT:       Type:            STT_OBJECT
+ELF-MIPSEL-NEXT:       Section:         .bss
+ELF-MIPSEL-NEXT:       Size:            0x0000000000000004
 ELF-MIPSEL-NEXT:     - Name:            SomeOtherFunction
 ELF-MIPSEL-NEXT:     - Name:            _gp_disp
 ELF-MIPSEL-NEXT:     - Name:            puts
@@ -405,3 +427,9 @@
 ELF-X86-64-NEXT:       Size:            0x0000000000000026
 ELF-X86-64-NEXT:     - Name:            SomeOtherFunction
 ELF-X86-64-NEXT:     - Name:            puts
+
+ELF-X86-64-UNWIND:      - Name:            .eh_frame
+ELF-X86-64-UNWIND-NEXT:   Type:            SHT_X86_64_UNWIND
+ELF-X86-64-UNWIND-NEXT:   Flags:           [ SHF_ALLOC ]
+ELF-X86-64-UNWIND-NEXT:   AddressAlign:    0x0000000000000001
+ELF-X86-64-UNWIND-NEXT:   Content:         ''

diff --git a/test/Object/objdump-reloc-shared.test b/test/Object/objdump-reloc-shared.test
new file mode 100644
index 0000000..d899ffb
--- /dev/null
+++ b/test/Object/objdump-reloc-shared.test

@@ -0,0 +1,5 @@
+RUN: llvm-objdump -r %p/Inputs/elf-reloc-no-sym.x86_64 \
+RUN:              | FileCheck %s
+
+; CHECK: elf-reloc-no-sym.x86_64:       file format ELF64-x86-64
+; CHECK-NOT: {{.}}

diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test
index 28cac10..1e41f78 100644
--- a/test/Object/objdump-relocations.test
+++ b/test/Object/objdump-relocations.test

@@ -27,9 +27,9 @@
 COFF-x86-64: IMAGE_REL_AMD64_REL32 SomeOtherFunction
 
 ELF-i386: .text
-ELF-i386: R_386_32
-ELF-i386: R_386_PC32
-ELF-i386: R_386_PC32
+ELF-i386: R_386_32 .rodata.str1.1
+ELF-i386: R_386_PC32 puts
+ELF-i386: R_386_PC32 SomeOtherFunction
 
 ELF-x86-64: .text
 ELF-x86-64: R_X86_64_32S .rodata.str1.1

diff --git a/test/Object/yaml2obj-elf-file-headers-with-e_flags.yaml b/test/Object/yaml2obj-elf-file-headers-with-e_flags.yaml
index 7d09807..dddc7d9 100644
--- a/test/Object/yaml2obj-elf-file-headers-with-e_flags.yaml
+++ b/test/Object/yaml2obj-elf-file-headers-with-e_flags.yaml

@@ -5,13 +5,15 @@
   Data: ELFDATA2LSB
   Type: ET_REL
   Machine: EM_MIPS
-  Flags: [ EF_MIPS_NOREORDER, EF_MIPS_ABI_O32, EF_MIPS_ARCH_32R2 ]
+  Flags: [ EF_MIPS_NOREORDER, EF_MIPS_ABI_O32, EF_MIPS_ARCH_32R2,
+           EF_MIPS_NAN2008 ]
 
 # CHECK: Format: ELF32-mips
 # CHECK: Arch: mipsel
 # CHECK: Machine: EM_MIPS
-# CHECK: Flags [ (0x70001001)
+# CHECK: Flags [ (0x70001401)
 # CHECK-NEXT: EF_MIPS_ABI_O32 (0x1000)
 # CHECK-NEXT: EF_MIPS_ARCH_32R2 (0x70000000)
+# CHECK-NEXT: EF_MIPS_NAN2008 (0x400)
 # CHECK-NEXT: EF_MIPS_NOREORDER (0x1)
 # CHECK-NEXT: ]

diff --git a/test/Object/yaml2obj-elf-symbol-visibility.yaml b/test/Object/yaml2obj-elf-symbol-visibility.yaml
index 113354a..6c4037c 100644
--- a/test/Object/yaml2obj-elf-symbol-visibility.yaml
+++ b/test/Object/yaml2obj-elf-symbol-visibility.yaml

@@ -44,7 +44,7 @@
 # OBJ-NEXT:   Size: 4
 # OBJ-NEXT:   Binding: Global (0x1)
 # OBJ-NEXT:   Type: Object (0x1)
-# OBJ-NEXT:   Other: 3
+# OBJ-NEXT:   Other: 163
 # OBJ-NEXT:   Section: .data (0x1)
 # OBJ-NEXT: }
 
@@ -77,6 +77,7 @@
 # YAML-NEXT:       Value:           0x0000000000000010
 # YAML-NEXT:       Size:            0x0000000000000004
 # YAML-NEXT:       Visibility:      STV_PROTECTED
+# YAML-NEXT:       Other:           [ STO_MIPS_PIC, STO_MIPS_MICROMIPS ]
 
 ---
 FileHeader:
@@ -121,6 +122,7 @@
     - Name:            protected
       Type:            STT_OBJECT
       Visibility:      STV_PROTECTED
+      Other:           [ STO_MIPS_MICROMIPS, STO_MIPS_PIC ]
       Section:         .data
       Value:           0x10
       Size:            0x04

diff --git a/test/Other/Inputs/block-info-only.bc b/test/Other/Inputs/block-info-only.bc
new file mode 100755
index 0000000..e30ca5f
--- /dev/null
+++ b/test/Other/Inputs/block-info-only.bc
Binary files differ

diff --git a/test/Other/Inputs/has-block-info.bc b/test/Other/Inputs/has-block-info.bc
new file mode 100644
index 0000000..1815db6
--- /dev/null
+++ b/test/Other/Inputs/has-block-info.bc
Binary files differ

diff --git a/test/Other/Inputs/no-block-info.bc b/test/Other/Inputs/no-block-info.bc
new file mode 100755
index 0000000..e79c276
--- /dev/null
+++ b/test/Other/Inputs/no-block-info.bc
Binary files differ

diff --git a/test/Other/bcanalyzer-block-info.txt b/test/Other/bcanalyzer-block-info.txt
new file mode 100644
index 0000000..e660312
--- /dev/null
+++ b/test/Other/bcanalyzer-block-info.txt

@@ -0,0 +1,32 @@
+RUN: llvm-bcanalyzer -dump %S/Inputs/has-block-info.bc | FileCheck -check-prefix=CHECK -check-prefix=DATA %s
+RUN: llvm-bcanalyzer -dump %S/Inputs/no-block-info.bc | FileCheck -check-prefix=UNKNOWN -check-prefix=DATA %s
+RUN: llvm-bcanalyzer -dump %S/Inputs/no-block-info.bc -block-info %S/Inputs/block-info-only.bc | FileCheck -check-prefix=CHECK -check-prefix=DATA %s
+
+  CHECK: <ABC
+UNKNOWN: <UnknownBlock8
+   DATA:   NumWords=4 BlockCodeSize=2>
+  CHECK:   <AAA 
+UNKNOWN:   <UnknownCode0
+   DATA:     op0=42 op1=43 op2=44/>
+  CHECK:   <BBB
+UNKNOWN:   <UnknownCode1
+   DATA:     op0=42/>
+  CHECK:   <AAA
+UNKNOWN:   <UnknownCode0
+   DATA:     op0=42/>
+  CHECK: </ABC>
+UNKNOWN: </UnknownBlock8>
+  CHECK: <XYZ
+UNKNOWN: <UnknownBlock9
+   DATA:   NumWords=3 BlockCodeSize=3>
+  CHECK:   <XXX
+UNKNOWN:   <UnknownCode0
+   DATA:     abbrevid=4 op0=50 op1=4/>
+  CHECK:   <YYY
+UNKNOWN:   <UnknownCode1
+   DATA:     op0=42/>
+  CHECK:   <XXX
+UNKNOWN:   <UnknownCode0
+   DATA:     abbrevid=4 op0=50 op1=5/>
+  CHECK: </XYZ>
+UNKNOWN: </UnknownBlock9>

diff --git a/test/Other/link-opts.ll b/test/Other/link-opts.ll
deleted file mode 100644
index 8e58ac8..0000000
--- a/test/Other/link-opts.ll
+++ /dev/null

@@ -1,13 +0,0 @@
-;RUN: opt -S -std-link-opts < %s | FileCheck %s
-; Simple test to check that -std-link-opts keeps only the main function.
-
-; CHECK-NOT: define
-; CHECK: define void @main
-; CHECK-NOT: define
-define void @main() {
-  ret void
-}
-
-define void @foo() {
-  ret void
-}

diff --git a/test/Other/lit-unicode.txt b/test/Other/lit-unicode.txt
new file mode 100644
index 0000000..ca92c99
--- /dev/null
+++ b/test/Other/lit-unicode.txt

@@ -0,0 +1,3 @@
+REQUIRES: shell
+RUN: echo "ようこそ" | FileCheck %s
+CHECK: {{^}}ようこそ{{$}}

diff --git a/test/SymbolRewriter/rewrite.ll b/test/SymbolRewriter/rewrite.ll
new file mode 100644
index 0000000..716fff9
--- /dev/null
+++ b/test/SymbolRewriter/rewrite.ll

@@ -0,0 +1,59 @@
+; RUN: opt -mtriple i686-win32 -rewrite-symbols -rewrite-map-file %p/rewrite.map \
+; RUN:   %s -o - | llvm-dis | FileCheck %s
+
+declare void @source_function()
+@source_variable = external global i32
+declare void @source_function_pattern_function()
+declare void @source_function_pattern_multiple_function_matches()
+@source_variable_pattern_variable = external global i32
+@source_variable_pattern_multiple_variable_matches = external global i32
+declare void @"\01naked_source_function"()
+declare void @"\01__imp_missing_global_leader_prefix"()
+
+declare i32 @first_callee()
+declare i32 @second_callee()
+define i32 @caller() {
+  %rhs = call i32 @first_callee()
+  %lhs = call i32 @second_callee()
+  %res = add i32 %rhs, %lhs
+  ret i32 %res
+}
+
+%struct.S = type { i8 }
+@_ZN1SC1Ev = alias void (%struct.S*)* @_ZN1SC2Ev
+define void @_ZN1SC2Ev(%struct.S* %this) unnamed_addr align 2 {
+entry:
+  %this.addr = alloca %struct.S*, align 4
+  store %struct.S* %this, %struct.S** %this.addr, align 4
+  ret void
+}
+
+; CHECK: @target_variable = external global i32
+; CHECK-NOT: @source_variable = external global i32
+; CHECK: @target_pattern_variable = external global i32
+; CHECK-NOT: @source_variable_pattern_variable = external global i32
+; CHECK: @target_pattern_multiple_variable_matches = external global i32
+; CHECK-NOT: @source_variable_pattern_multiple_variable_matches = external global i32
+; CHECK: declare void @target_function()
+; CHECK-NOT: declare void @source_function()
+; CHECK: declare void @target_pattern_function()
+; CHECK-NOT: declare void @source_function_pattern_function()
+; CHECK: declare void @target_pattern_multiple_function_matches()
+; CHECK-NOT: declare void @source_function_pattern_multiple_function_matches()
+; CHECK: declare void @naked_target_function()
+; CHECK-NOT: declare void @"\01naked_source_function"()
+; CHECK-NOT: declare void @"\01__imp__imported_function"()
+; CHECK: declare void @"\01__imp_missing_global_leader_prefix"()
+; CHECK-NOT: declare void @"\01__imp_DO_NOT_REWRITE"()
+
+; CHECK: declare i32 @renamed_callee()
+; CHECK-NOT: declare i32 @first_callee()
+; CHECK: declare i32 @second_callee()
+; CHECK: define i32 @caller() {
+; CHECK:   %rhs = call i32 @renamed_callee()
+; CHECK-NOT: %rhs = call i32 @first_callee()
+; CHECK:   %lhs = call i32 @second_callee()
+; CHECK:   %res = add i32 %rhs, %lhs
+; CHECK:   ret i32 %res
+; CHECK: }
+

diff --git a/test/SymbolRewriter/rewrite.map b/test/SymbolRewriter/rewrite.map
new file mode 100644
index 0000000..ef6dfc8
--- /dev/null
+++ b/test/SymbolRewriter/rewrite.map

@@ -0,0 +1,46 @@
+function: {
+  source: source_function,
+  target: target_function,
+}
+
+global variable: {
+  source: source_variable,
+  target: target_variable,
+}
+
+function: {
+  source: source_function_(.*),
+  transform: target_\1,
+}
+
+global variable: {
+  source: source_variable_(.*),
+  transform: target_\1,
+}
+
+function: {
+  source: naked_source_function,
+  target: naked_target_function,
+  naked: true,
+}
+
+function: {
+  source: imported_function,
+  target: exported_function,
+}
+
+function: {
+  source: missing_global_leader_prefix,
+  target: DO_NOT_REWRITE,
+}
+
+function: {
+  source: first_callee,
+  target: renamed_callee,
+}
+
+global alias: {
+  source: _ZN1SC1Ev,
+  target: _ZN1SD1Ev,
+}
+

diff --git a/test/TableGen/BitOffsetDecoder.td b/test/TableGen/BitOffsetDecoder.td
new file mode 100644
index 0000000..ec0ceee
--- /dev/null
+++ b/test/TableGen/BitOffsetDecoder.td

@@ -0,0 +1,74 @@
+// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+def archInstrInfo : InstrInfo { }
+
+def arch : Target {
+    let InstructionSet = archInstrInfo;
+}
+
+def  Myi32  : Operand<i32> {
+  let DecoderMethod = "DecodeMyi32";
+}
+
+
+let OutOperandList = (outs), Size = 2 in {
+
+def foo : Instruction {
+    let InOperandList = (ins i32imm:$factor);
+    field bits<16> Inst;
+    bits<32> factor;
+    let Inst{7-0} = 0xAA;
+    let Inst{14-8} = factor{6-0}; // no offset
+    let AsmString = "foo  $factor";
+    field bits<16> SoftFail = 0;
+    }
+
+def bar : Instruction {
+    let InOperandList = (ins i32imm:$factor);
+    field bits<16> Inst;
+    bits<32> factor;
+    let Inst{7-0} = 0xBB;
+    let Inst{15-8} = factor{10-3}; // offset by 3
+    let AsmString = "bar  $factor";
+    field bits<16> SoftFail = 0;
+    }
+
+def biz : Instruction {
+    let InOperandList = (ins i32imm:$factor);
+    field bits<16> Inst;
+    bits<32> factor;
+    let Inst{7-0} = 0xCC;
+    let Inst{11-8,15-12} = factor{10-3}; // offset by 3, multipart
+    let AsmString = "biz  $factor";
+    field bits<16> SoftFail = 0;
+    }
+
+def baz : Instruction {
+    let InOperandList = (ins Myi32:$factor);
+    field bits<16> Inst;
+    bits<32> factor;
+    let Inst{7-0} = 0xDD;
+    let Inst{15-8} = factor{11-4}; // offset by 4 + custom decode
+    let AsmString = "baz  $factor";
+    field bits<16> SoftFail = 0;
+    }
+
+def bum : Instruction {
+    let InOperandList = (ins i32imm:$factor);
+    field bits<16> Inst;
+    bits<32> factor;
+    let Inst{7-0} = 0xEE;
+    let Inst{15-8} = !srl(factor,5);
+    let AsmString = "bum  $factor";
+    field bits<16> SoftFail = 0;
+    }
+}
+
+
+// CHECK: tmp = fieldFromInstruction(insn, 8, 7);
+// CHECK: tmp = fieldFromInstruction(insn, 8, 8) << 3;
+// CHECK: tmp |= fieldFromInstruction(insn, 8, 4) << 7;
+// CHECK: tmp |= fieldFromInstruction(insn, 12, 4) << 3;
+// CHECK: tmp = fieldFromInstruction(insn, 8, 8) << 4;

diff --git a/test/TableGen/BitsInit.td b/test/TableGen/BitsInit.td
new file mode 100644
index 0000000..6aac3e4
--- /dev/null
+++ b/test/TableGen/BitsInit.td

@@ -0,0 +1,85 @@
+
+// RUN: not llvm-tblgen %s 2>&1 > %t
+// RUN: FileCheck %s < %t
+
+def a {
+  bits<2> opc = { 0, 1 };
+  bits<2> opc2 = { 1, 0 };
+  bits<1> opc3 = { 1 };
+  bits<2> a = { opc, opc2 }; // error!
+  bits<2> b = { opc{0}, opc2{0} };
+  bits<2> c = { opc{1}, opc2{1} };
+  bits<2> c = { opc3{0}, opc3 };
+}
+
+// CHECK: def a {
+// CHECK:   bits<2> opc = { 0, 1 };
+// CHECK:   bits<2> opc2 = { 1, 0 };
+// CHECK:   bits<1> opc3 = { 1 };
+// CHECK:   bits<2> a;
+// CHECK:   bits<2> b = { 1, 0 };
+// CHECK:   bits<2> c = { 1, 1 };
+// CHECK: }
+
+def {
+  bits<2> B1 = 0b011;  // bitfield is too small, reject
+  bits<3> B2 = 0b011;  // ok
+
+  bits<2> C1 = 0b111;  // bitfield is too small, reject
+  bits<3> C2 = 0b111;  // ok
+
+  bits<2> D1 = { 0, 0 }; // ok
+  bits<2> D2 = { 0b00 }; // ok
+  bits<3> D3 = { 0, 0 }; // type mismatch.  RHS doesn't have enough bits
+  bits<3> D4 = { 0b00 }; // type mismatch.  RHS doesn't have enough bits
+  bits<1> D5 = { 0 };    // ok
+  bits<1> D6 = { 1 };    // ok
+  bits<1> D7 = { 3 };    // type mismatch.  LHS doesn't have enough bits
+  bits<2> D8 = { 0 };    // type mismatch.  RHS doesn't have enough bits
+
+  bits<8> E;
+  let E{7-0} = {0,0,1,?,?,?,?,?};
+  let E{3-0} = 0b0010;
+
+  bits<8> F1 = { 0, 1, 0b1001, 0, 0b0 }; // ok
+  bits<7> F2 = { 0, 1, 0b1001, 0, 0b0 }; // LHS doesn't have enough bits
+  bits<9> F3 = { 0, 1, 0b1001, 0, 0b0 }; // RHS doesn't have enough bits
+
+  bits<8> G1 = { 0, { 1, 0b1001, 0 }, 0b0 }; // ok
+  bits<8> G2 = { 0, { 1, 0b1001 }, 0, 0b0 }; // ok
+  bits<8> G3 = { 0, 1, { 0b1001 }, 0, 0b0 }; // ok
+
+  bits<16> H;
+  let H{15-0} = { { 0b11001100 }, 0b00110011 };
+  bits<16> I = { G1, G2 };
+
+  // Make sure we can initialise ints with bits<> values.
+  int J = H;
+  int K = { 0, 1 };
+}
+
+// CHECK: def {{.*}} {
+// CHECK: bits<2> B1;
+// CHECK: bits<3> B2 = { 0, 1, 1 };
+// CHECK: bits<2> C1;
+// CHECK: bits<3> C2 = { 1, 1, 1 };
+// CHECK: bits<2> D1 = { 0, 0 };
+// CHECK: bits<2> D2 = { 0, 0 };
+// CHECK: bits<3> D3;
+// CHECK: bits<3> D4;
+// CHECK: bits<1> D5 = { 0 };
+// CHECK: bits<1> D6 = { 1 };
+// CHECK: bits<1> D7 = { ? };
+// CHECK: bits<2> D8;
+// CHECK: bits<8> E = { 0, 0, 1, ?, 0, 0, 1, 0 };
+// CHECK: bits<8> F1 = { 0, 1, 1, 0, 0, 1, 0, 0 };
+// CHECK: bits<7> F2;
+// CHECK: bits<9> F3;
+// CHECK: bits<8> G1 = { 0, 1, 1, 0, 0, 1, 0, 0 };
+// CHECK: bits<8> G2 = { 0, 1, 1, 0, 0, 1, 0, 0 };
+// CHECK: bits<8> G3 = { 0, 1, 1, 0, 0, 1, 0, 0 };
+// CHECK: bits<16> H = { 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1 };
+// CHECK: bits<16> I = { 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0 };
+// CHECK: int J = 52275;
+// CHECK: int K = 1;
+// CHECK: }

diff --git a/test/TableGen/ClassInstanceValue.td b/test/TableGen/ClassInstanceValue.td
new file mode 100644
index 0000000..b6c4c93
--- /dev/null
+++ b/test/TableGen/ClassInstanceValue.td

@@ -0,0 +1,19 @@
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+class Struct<int i> {
+  int I = !shl(i, 1);
+  int J = !shl(I, 1);
+}
+
+class Class<Struct s> {
+    int Class_J = s.J;
+}
+
+multiclass MultiClass<int i> {
+  def Def : Class<Struct<i>>;
+// CHECK: Class_J = 8
+// CHECK-NOT: Class_J = !shl(I, 1)
+}
+
+defm Defm : MultiClass<2>;

diff --git a/test/TableGen/ForeachList.td b/test/TableGen/ForeachList.td
index 99b7e14..9bc76e0 100644
--- a/test/TableGen/ForeachList.td
+++ b/test/TableGen/ForeachList.td

@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Register<string name, int idx> {
   string Name = name;

diff --git a/test/TableGen/ForeachLoop.td b/test/TableGen/ForeachLoop.td
index 25208fa..ce8d44c 100644
--- a/test/TableGen/ForeachLoop.td
+++ b/test/TableGen/ForeachLoop.td

@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Register<string name, int idx> {
   string Name = name;

diff --git a/test/TableGen/NestedForeach.td b/test/TableGen/NestedForeach.td
index e8c16f7..5b63175 100644
--- a/test/TableGen/NestedForeach.td
+++ b/test/TableGen/NestedForeach.td

@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Droid<string series, int release, string model, int patchlevel> {
   string Series = series;

diff --git a/test/TableGen/SiblingForeach.td b/test/TableGen/SiblingForeach.td
index a11f6f8..e4c4704 100644
--- a/test/TableGen/SiblingForeach.td
+++ b/test/TableGen/SiblingForeach.td

@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s | FileCheck %s
-// XFAIL: vg_leak
 
 class Set<int i = 0, int j = 0, int k = 0> {
   int I = i;

diff --git a/test/TableGen/if.td b/test/TableGen/if.td
index 1d8d623..05a2d99 100644
--- a/test/TableGen/if.td
+++ b/test/TableGen/if.td

@@ -3,7 +3,7 @@
 
 // Support for an `!if' operator as part of a `let' statement.
 // CHECK:      class C
-// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, !if({ C:y{3} }, 1, !if({ C:y{2} }, { C:x{0} }, !if({ C:y{1} }, { C:x{1} }, !if({ C:y{0} }, { C:x{2} }, ?)))){0}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){1}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){0}, !if({ C:x{2} }, 2, 6){2}, !if({ C:x{2} }, 2, 6){1}, !if({ C:x{2} }, 2, 6){0}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){1}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){0}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){3}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){2}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){1}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){0} };
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, !if({ C:y{3} }, 1, !if({ C:y{2} }, { C:x{0} }, !if({ C:y{1} }, { C:x{1} }, !if({ C:y{0} }, { C:x{2} }, ?)))){0}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){1}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){0}, !if({ C:x{2} }, { 0, 1, 0 }, { 1, 1, 0 }){2}, !if({ C:x{2} }, { 0, 1, 0 }, { 1, 1, 0 }){1}, !if({ C:x{2} }, { 0, 1, 0 }, { 1, 1, 0 }){0}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){1}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){0}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){3}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){2}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){1}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){0} };
 class C<bits<3> x, bits<4> y, bit z> {
   bits<16> n;
 

diff --git a/test/TableGen/ifbit.td b/test/TableGen/ifbit.td
index 88f575e..18797ca 100644
--- a/test/TableGen/ifbit.td
+++ b/test/TableGen/ifbit.td

@@ -5,6 +5,8 @@
 
 class A<bit b = 1> {
   int a = !if(b, 5, 6);
+  bit c = !if(b, 0, 1);
+  bits<1> d = !if(b, 0, 1);
 }
 
 def X : A<0>;

diff --git a/test/TableGen/intrinsic-long-name.td b/test/TableGen/intrinsic-long-name.td
new file mode 100644
index 0000000..6b9ba01
--- /dev/null
+++ b/test/TableGen/intrinsic-long-name.td

@@ -0,0 +1,32 @@
+// RUN: llvm-tblgen -gen-intrinsic %s | FileCheck %s
+// XFAIL: vg_leak
+
+class IntrinsicProperty;
+
+class ValueType<int size, int value> {
+  string Namespace = "MVT";
+  int Size = size;
+  int Value = value;
+}
+
+class LLVMType<ValueType vt> {
+  ValueType VT = vt;
+}
+
+class Intrinsic<string name, list<LLVMType> param_types = []> {
+  string LLVMName = name;
+  bit isTarget = 0;
+  string TargetPrefix = "";
+  list<LLVMType> RetTypes = [];
+  list<LLVMType> ParamTypes = param_types;
+  list<IntrinsicProperty> Properties = [];
+}
+
+def iAny : ValueType<0, 254>;
+def llvm_anyint_ty : LLVMType<iAny>;
+
+// Make sure we generate the long name without crashing
+// CHECK: this_is_a_really_long_intrinsic_name_but_we_should_still_not_crash   // llvm.this.is.a.really.long.intrinsic.name.but.we.should.still.not.crash
+def int_foo : Intrinsic<"llvm.foo", [llvm_anyint_ty]>;
+def int_this_is_a_really_long_intrinsic_name_but_we_should_still_not_crash : Intrinsic<"llvm.this.is.a.really.long.intrinsic.name.but.we.should.still.not.crash", [llvm_anyint_ty]>;
+

diff --git a/test/TableGen/intrinsic-varargs.td b/test/TableGen/intrinsic-varargs.td
index 3e48f8d..935a625 100644
--- a/test/TableGen/intrinsic-varargs.td
+++ b/test/TableGen/intrinsic-varargs.td

@@ -26,5 +26,5 @@
 def isVoid : ValueType<0, 56>;   // Produces no value
 def llvm_vararg_ty : LLVMType<isVoid>;   // this means vararg here
 
-// CHECK: /* 0 */ 0, 27, 0,
+// CHECK: /* 0 */ 0, 28, 0,
 def int_foo : Intrinsic<"llvm.foo", [llvm_vararg_ty]>;

diff --git a/test/TableGen/list-element-bitref.td b/test/TableGen/list-element-bitref.td
index 4622f28..0f59b53 100644
--- a/test/TableGen/list-element-bitref.td
+++ b/test/TableGen/list-element-bitref.td

@@ -1,7 +1,7 @@
 // RUN: llvm-tblgen %s | FileCheck %s
 // XFAIL: vg_leak
 
-class C<list<bits<8>> L> {
+class C<list<bits<4>> L> {
   bits<2> V0 = L[0]{1-0};
   bits<2> V1 = L[1]{3-2};
   string V2 = !if(L[0]{0}, "Odd", "Even");

diff --git a/test/TableGen/math.td b/test/TableGen/math.td
index 59d16ae..d966346 100644
--- a/test/TableGen/math.td
+++ b/test/TableGen/math.td

@@ -1,10 +1,26 @@
 // RUN: llvm-tblgen %s | FileCheck %s
 // XFAIL: vg_leak
 
+def shifts {
+    bits<2> b = 0b10;
+    int i = 2;
+    int shifted_b = !shl(b, 2);
+    int shifted_i = !shl(i, 2);
+}
+// CHECK: def shifts
+// CHECK: shifted_b = 8
+// CHECK: shifted_i = 8
+
 class Int<int value> {
   int Value = value;
 }
 
+// CHECK: def v0
+// CHECK: Value = 0
+
+// CHECK: def v1
+// CHECK: Value = 1
+
 def v1024   : Int<1024>;
 // CHECK: def v1024
 // CHECK: Value = 1024
@@ -17,3 +33,5 @@
 // CHECK: def v2048
 // CHECK: Value = 2048
 
+def v0 : Int<!and(v1024.Value, v2048.Value)>;
+def v1 : Int<!and(v1025.Value, 1)>;

diff --git a/test/Transforms/AddDiscriminators/basic.ll b/test/Transforms/AddDiscriminators/basic.ll
index b12cbee..6c1e532 100644
--- a/test/Transforms/AddDiscriminators/basic.ll
+++ b/test/Transforms/AddDiscriminators/basic.ll

@@ -40,20 +40,20 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [basic.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [basic.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"basic.c", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [basic.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [basic.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5 "}
 !10 = metadata !{i32 3, i32 0, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [basic.c]
+!11 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [basic.c]
 !12 = metadata !{i32 4, i32 0, metadata !4, null}
 
 ; CHECK: !12 = metadata !{i32 3, i32 0, metadata !13, null}
-; CHECK: !13 = metadata !{i32 786443, metadata !1, metadata !11, i32 3, i32 0, i32 1, i32 0} ; [ DW_TAG_lexical_block ] [./basic.c]
+; CHECK: !13 = metadata !{metadata !"0xb\001", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ] [./basic.c]
 ; CHECK: !14 = metadata !{i32 4, i32 0, metadata !4, null}

diff --git a/test/Transforms/AddDiscriminators/first-only.ll b/test/Transforms/AddDiscriminators/first-only.ll
index f3b0357..e15a80a 100644
--- a/test/Transforms/AddDiscriminators/first-only.ll
+++ b/test/Transforms/AddDiscriminators/first-only.ll

@@ -50,28 +50,28 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 (trunk 199750) (llvm/trunk 199751)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [first-only.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 (trunk 199750) (llvm/trunk 199751)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [first-only.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"first-only.c", metadata !"."}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [first-only.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [first-only.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5 (trunk 199750) (llvm/trunk 199751)"}
 !10 = metadata !{i32 3, i32 0, metadata !11, null}
 
-!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [first-only.c]
-; CHECK: !11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 0, i32 0}
+!11 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [first-only.c]
+; CHECK: !11 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4}
 
 !12 = metadata !{i32 3, i32 0, metadata !13, null}
 
-!13 = metadata !{i32 786443, metadata !1, metadata !11, i32 3, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [first-only.c]
-; CHECK: !13 = metadata !{i32 786443, metadata !1, metadata !14, i32 3, i32 0, i32 1, i32 0} ; [ DW_TAG_lexical_block ] [./first-only.c]
+!13 = metadata !{metadata !"0xb\003\000\001", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ] [first-only.c]
+; CHECK: !13 = metadata !{metadata !"0xb\001", metadata !1, metadata !14} ; [ DW_TAG_lexical_block ] [./first-only.c]
 
 !14 = metadata !{i32 4, i32 0, metadata !13, null}
-; CHECK: !14 = metadata !{i32 786443, metadata !1, metadata !11, i32 3, i32 0, i32 1}
+; CHECK: !14 = metadata !{metadata !"0xb\003\000\001", metadata !1, metadata !11}
 
 !15 = metadata !{i32 5, i32 0, metadata !13, null}
 ; CHECK: !15 = metadata !{i32 4, i32 0, metadata !14, null}

diff --git a/test/Transforms/AddDiscriminators/multiple.ll b/test/Transforms/AddDiscriminators/multiple.ll
index 0241a0c..8418c9e 100644
--- a/test/Transforms/AddDiscriminators/multiple.ll
+++ b/test/Transforms/AddDiscriminators/multiple.ll

@@ -51,21 +51,21 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 (trunk 199750) (llvm/trunk 199751)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [multiple.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 (trunk 199750) (llvm/trunk 199751)\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [multiple.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"multiple.c", metadata !"."}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [multiple.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, void (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [multiple.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5 (trunk 199750) (llvm/trunk 199751)"}
 !10 = metadata !{i32 3, i32 0, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [multiple.c]
+!11 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [multiple.c]
 !12 = metadata !{i32 4, i32 0, metadata !4, null}
 
 ; CHECK: !12 = metadata !{i32 3, i32 0, metadata !13, null}
-; CHECK: !13 = metadata !{i32 786443, metadata !1, metadata !11, i32 3, i32 0, i32 1, i32 0} ; [ DW_TAG_lexical_block ] [./multiple.c]
+; CHECK: !13 = metadata !{metadata !"0xb\001", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ] [./multiple.c]
 ; CHECK: !14 = metadata !{i32 3, i32 0, metadata !15, null}
-; CHECK: !15 = metadata !{i32 786443, metadata !1, metadata !11, i32 3, i32 0, i32 2, i32 1} ; [ DW_TAG_lexical_block ] [./multiple.c]
+; CHECK: !15 = metadata !{metadata !"0xb\002", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ] [./multiple.c]

diff --git a/test/Transforms/AddDiscriminators/no-discriminators.ll b/test/Transforms/AddDiscriminators/no-discriminators.ll
index f7b45e29..66a2c4e 100644
--- a/test/Transforms/AddDiscriminators/no-discriminators.ll
+++ b/test/Transforms/AddDiscriminators/no-discriminators.ll

@@ -17,7 +17,7 @@
   %retval = alloca i32, align 4
   %i.addr = alloca i64, align 8
   store i64 %i, i64* %i.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %i.addr}, metadata !13), !dbg !14
+  call void @llvm.dbg.declare(metadata !{i64* %i.addr}, metadata !13, metadata !{}), !dbg !14
   %0 = load i64* %i.addr, align 8, !dbg !15
 ; CHECK:  %0 = load i64* %i.addr, align 8, !dbg !15
   %cmp = icmp slt i64 %0, 5, !dbg !15
@@ -39,7 +39,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -48,24 +48,24 @@
 !llvm.module.flags = !{!10, !11}
 !llvm.ident = !{!12}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [./no-discriminators] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./no-discriminators] [DW_LANG_C99]
 !1 = metadata !{metadata !"no-discriminators", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i64)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [./no-discriminators]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i64)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./no-discriminators]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !9}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786468, null, null, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", null, null} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
 ; CHECK: !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !12 = metadata !{metadata !"clang version 3.5.0 "}
-!13 = metadata !{i32 786689, metadata !4, metadata !"i", metadata !5, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [i] [line 1]
+!13 = metadata !{metadata !"0x101\00i\0016777217\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [i] [line 1]
 !14 = metadata !{i32 1, i32 0, metadata !4, null}
 !15 = metadata !{i32 2, i32 0, metadata !16, null}
 ; CHECK: !15 = metadata !{i32 2, i32 0, metadata !16, null}
-!16 = metadata !{i32 786443, metadata !1, metadata !4, i32 2, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./no-discriminators]
-; CHECK: !16 = metadata !{i32 786443, metadata !1, metadata !4, i32 2, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./no-discriminators]
+!16 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./no-discriminators]
+; CHECK: !16 = metadata !{metadata !"0xb\002\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./no-discriminators]
 !17 = metadata !{i32 3, i32 0, metadata !4, null}

diff --git a/test/Transforms/AlignmentFromAssumptions/simple.ll b/test/Transforms/AlignmentFromAssumptions/simple.ll
new file mode 100644
index 0000000..884c8ba
--- /dev/null
+++ b/test/Transforms/AlignmentFromAssumptions/simple.ll

@@ -0,0 +1,215 @@
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+; RUN: opt < %s -alignment-from-assumptions -S | FileCheck %s
+
+define i32 @foo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %0 = load i32* %a, align 4
+  ret i32 %0
+; The assume states that the low 5 bits of %a are zero, so the load's alignment is expected to rise from 4 to 32.
+; CHECK-LABEL: @foo
+; CHECK: load i32* {{[^,]+}}, align 32
+; CHECK: ret i32
+}
+
+define i32 @foo2(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %offsetptr = add i64 %ptrint, 24
+  %maskedptr = and i64 %offsetptr, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds i32* %a, i64 2
+  %0 = load i32* %arrayidx, align 4
+  ret i32 %0
+; The assume gives (%a + 24) % 32 == 0; the load is at %a + 8 bytes, i.e. 16 bytes below that point, so only align 16 is expected.
+; CHECK-LABEL: @foo2
+; CHECK: load i32* {{[^,]+}}, align 16
+; CHECK: ret i32
+}
+
+define i32 @foo2a(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %offsetptr = add i64 %ptrint, 28
+  %maskedptr = and i64 %offsetptr, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds i32* %a, i64 -1
+  %0 = load i32* %arrayidx, align 4
+  ret i32 %0
+; The assume gives (%a + 28) % 32 == 0; the load is at %a - 4 bytes, exactly 32 bytes below that point, so align 32 is expected.
+; CHECK-LABEL: @foo2a
+; CHECK: load i32* {{[^,]+}}, align 32
+; CHECK: ret i32
+}
+
+define i32 @goo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %0 = load i32* %a, align 4
+  ret i32 %0
+; %a is assumed 32-byte aligned (low 5 bits zero), so the direct load is expected to become align 32.
+; CHECK-LABEL: @goo
+; CHECK: load i32* {{[^,]+}}, align 32
+; CHECK: ret i32
+}
+
+define i32 @hoo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %r.06
+  %indvars.iv.next = add i64 %indvars.iv, 8
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, 2048
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+; %a is assumed 32-byte aligned; the index starts at 0 and steps by 8 i32s (32 bytes), so every load is expected to be align 32.
+; CHECK-LABEL: @hoo
+; CHECK: load i32* %arrayidx, align 32
+; CHECK: ret i32 %add.lcssa
+}
+
+define i32 @joo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 4, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %r.06
+  %indvars.iv.next = add i64 %indvars.iv, 8
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, 2048
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+; %a is assumed 32-byte aligned; the index starts at 4 (16 bytes) and steps by 8 i32s (32 bytes), so only align 16 is expected.
+; CHECK-LABEL: @joo
+; CHECK: load i32* %arrayidx, align 16
+; CHECK: ret i32 %add.lcssa
+}
+
+define i32 @koo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %r.06
+  %indvars.iv.next = add i64 %indvars.iv, 4
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, 2048
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+; %a is assumed 32-byte aligned; the index starts at 0 but steps by 4 i32s (16 bytes), so only align 16 is expected.
+; CHECK-LABEL: @koo
+; CHECK: load i32* %arrayidx, align 16
+; CHECK: ret i32 %add.lcssa
+}
+
+define i32 @koo2(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ -4, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %r.06
+  %indvars.iv.next = add i64 %indvars.iv, 4
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, 2048
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+; %a is assumed 32-byte aligned; the index starts at -4 (-16 bytes) and steps by 4 i32s (16 bytes), so only align 16 is expected.
+; CHECK-LABEL: @koo2
+; CHECK: load i32* %arrayidx, align 16
+; CHECK: ret i32 %add.lcssa
+}
+
+define i32 @moo(i32* nocapture %a) nounwind uwtable {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %0 = bitcast i32* %a to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 64, i32 4, i1 false)
+  ret i32 undef
+; %a is assumed 32-byte aligned, so the memset's alignment argument is expected to rise from 4 to 32.
+; CHECK-LABEL: @moo
+; CHECK: @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 64, i32 32, i1 false)
+; CHECK: ret i32 undef
+}
+
+define i32 @moo2(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %ptrint1 = ptrtoint i32* %b to i64
+  %maskedptr3 = and i64 %ptrint1, 127
+  %maskcond4 = icmp eq i64 %maskedptr3, 0
+  tail call void @llvm.assume(i1 %maskcond4)
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast i32* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 64, i32 4, i1 false)
+  ret i32 undef
+; %a is assumed 32-byte aligned and %b 128-byte aligned; the memcpy alignment is expected to rise from 4 to 32, the smaller of the two.
+; CHECK-LABEL: @moo2
+; CHECK: @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 64, i32 32, i1 false)
+; CHECK: ret i32 undef
+}
+
+declare void @llvm.assume(i1) nounwind
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+

diff --git a/test/Transforms/AlignmentFromAssumptions/simple32.ll b/test/Transforms/AlignmentFromAssumptions/simple32.ll
new file mode 100644
index 0000000..166e7ef
--- /dev/null
+++ b/test/Transforms/AlignmentFromAssumptions/simple32.ll

@@ -0,0 +1,215 @@
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+; RUN: opt < %s -alignment-from-assumptions -S | FileCheck %s
+
+define i32 @foo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %0 = load i32* %a, align 4
+  ret i32 %0
+; The assume states that the low 5 bits of %a are zero, so the load's alignment is expected to rise from 4 to 32.
+; CHECK-LABEL: @foo
+; CHECK: load i32* {{[^,]+}}, align 32
+; CHECK: ret i32
+}
+
+define i32 @foo2(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %offsetptr = add i64 %ptrint, 24
+  %maskedptr = and i64 %offsetptr, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds i32* %a, i64 2
+  %0 = load i32* %arrayidx, align 4
+  ret i32 %0
+; The assume gives (%a + 24) % 32 == 0; the load is at %a + 8 bytes, i.e. 16 bytes below that point, so only align 16 is expected.
+; CHECK-LABEL: @foo2
+; CHECK: load i32* {{[^,]+}}, align 16
+; CHECK: ret i32
+}
+
+define i32 @foo2a(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %offsetptr = add i64 %ptrint, 28
+  %maskedptr = and i64 %offsetptr, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds i32* %a, i64 -1
+  %0 = load i32* %arrayidx, align 4
+  ret i32 %0
+; The assume gives (%a + 28) % 32 == 0; the load is at %a - 4 bytes, exactly 32 bytes below that point, so align 32 is expected.
+; CHECK-LABEL: @foo2a
+; CHECK: load i32* {{[^,]+}}, align 32
+; CHECK: ret i32
+}
+
+define i32 @goo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %0 = load i32* %a, align 4
+  ret i32 %0
+; %a is assumed 32-byte aligned (low 5 bits zero), so the direct load is expected to become align 32.
+; CHECK-LABEL: @goo
+; CHECK: load i32* {{[^,]+}}, align 32
+; CHECK: ret i32
+}
+
+define i32 @hoo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %r.06
+  %indvars.iv.next = add i64 %indvars.iv, 8
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, 2048
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+; %a is assumed 32-byte aligned; the index starts at 0 and steps by 8 i32s (32 bytes), so every load is expected to be align 32.
+; CHECK-LABEL: @hoo
+; CHECK: load i32* %arrayidx, align 32
+; CHECK: ret i32 %add.lcssa
+}
+
+define i32 @joo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 4, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %r.06
+  %indvars.iv.next = add i64 %indvars.iv, 8
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, 2048
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+; %a is assumed 32-byte aligned; the index starts at 4 (16 bytes) and steps by 8 i32s (32 bytes), so only align 16 is expected.
+; CHECK-LABEL: @joo
+; CHECK: load i32* %arrayidx, align 16
+; CHECK: ret i32 %add.lcssa
+}
+
+define i32 @koo(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %r.06
+  %indvars.iv.next = add i64 %indvars.iv, 4
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, 2048
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+; %a is assumed 32-byte aligned; the index starts at 0 but steps by 4 i32s (16 bytes), so only align 16 is expected.
+; CHECK-LABEL: @koo
+; CHECK: load i32* %arrayidx, align 16
+; CHECK: ret i32 %add.lcssa
+}
+
+define i32 @koo2(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ -4, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %r.06
+  %indvars.iv.next = add i64 %indvars.iv, 4
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, 2048
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+; %a is assumed 32-byte aligned; the index starts at -4 (-16 bytes) and steps by 4 i32s (16 bytes), so only align 16 is expected.
+; CHECK-LABEL: @koo2
+; CHECK: load i32* %arrayidx, align 16
+; CHECK: ret i32 %add.lcssa
+}
+
+define i32 @moo(i32* nocapture %a) nounwind uwtable {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %0 = bitcast i32* %a to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 64, i32 4, i1 false)
+  ret i32 undef
+; %a is assumed 32-byte aligned, so the memset's alignment argument is expected to rise from 4 to 32.
+; CHECK-LABEL: @moo
+; CHECK: @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 64, i32 32, i1 false)
+; CHECK: ret i32 undef
+}
+
+define i32 @moo2(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %ptrint1 = ptrtoint i32* %b to i64
+  %maskedptr3 = and i64 %ptrint1, 127
+  %maskcond4 = icmp eq i64 %maskedptr3, 0
+  tail call void @llvm.assume(i1 %maskcond4)
+  %0 = bitcast i32* %a to i8*
+  %1 = bitcast i32* %b to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 64, i32 4, i1 false)
+  ret i32 undef
+; %a is assumed 32-byte aligned and %b 128-byte aligned; the memcpy alignment is expected to rise from 4 to 32, the smaller of the two.
+; CHECK-LABEL: @moo2
+; CHECK: @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 64, i32 32, i1 false)
+; CHECK: ret i32 undef
+}
+
+declare void @llvm.assume(i1) nounwind
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+

diff --git a/test/Transforms/AlignmentFromAssumptions/start-unk.ll b/test/Transforms/AlignmentFromAssumptions/start-unk.ll
new file mode 100644
index 0000000..b7fe249
--- /dev/null
+++ b/test/Transforms/AlignmentFromAssumptions/start-unk.ll

@@ -0,0 +1,154 @@
+; RUN: opt -alignment-from-assumptions -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%type1 = type { %type2 }
+%type2 = type { [4 x i8] }
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #0
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.bswap.i32(i32) #1
+
+; Function Attrs: nounwind uwtable
+define void @test1() unnamed_addr #2 align 2 {
+
+; CHECK-LABEL: @test1
+
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  unreachable
+
+if.end:                                           ; preds = %entry
+  br i1 undef, label %return, label %if.end8
+
+if.end8:                                          ; preds = %if.end
+  br i1 undef, label %if.then13, label %if.end14
+
+if.then13:                                        ; preds = %if.end8
+  unreachable
+
+if.end14:                                         ; preds = %if.end8
+  br i1 undef, label %cond.false.i129, label %cond.end.i136
+
+cond.false.i129:                                  ; preds = %if.end14
+  unreachable
+
+cond.end.i136:                                    ; preds = %if.end14
+  br i1 undef, label %land.lhs.true.i, label %if.end.i145
+
+land.lhs.true.i:                                  ; preds = %cond.end.i136
+  br i1 undef, label %if.end.i145, label %if.then.i137
+
+if.then.i137:                                     ; preds = %land.lhs.true.i
+  br i1 undef, label %cond.false8.i, label %cond.end9.i
+
+cond.false8.i:                                    ; preds = %if.then.i137
+  unreachable
+
+cond.end9.i:                                      ; preds = %if.then.i137
+  br i1 undef, label %if.then23, label %if.end24
+
+if.end.i145:                                      ; preds = %land.lhs.true.i, %cond.end.i136
+  unreachable
+
+if.then23:                                        ; preds = %cond.end9.i
+  unreachable
+
+if.end24:                                         ; preds = %cond.end9.i
+  br i1 undef, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %if.end24
+  unreachable
+
+for.end:                                          ; preds = %if.end24
+  br i1 undef, label %if.end123, label %if.then121
+
+if.then121:                                       ; preds = %for.end
+  unreachable
+
+if.end123:                                        ; preds = %for.end
+  br i1 undef, label %if.end150, label %if.then126
+
+if.then126:                                       ; preds = %if.end123
+  %ptrint.i.i185 = ptrtoint %type1* undef to i64
+  %maskedptr.i.i186 = and i64 %ptrint.i.i185, 1
+  %maskcond.i.i187 = icmp eq i64 %maskedptr.i.i186, 0
+  tail call void @llvm.assume(i1 %maskcond.i.i187) #0
+  %ret.0..sroa_cast.i.i188 = bitcast %type1* undef to i32*
+  %ret.0.copyload.i.i189 = load i32* %ret.0..sroa_cast.i.i188, align 2
+
+; CHECK: load {{.*}} align 2
+
+  %0 = tail call i32 @llvm.bswap.i32(i32 %ret.0.copyload.i.i189) #0
+  %conv131 = zext i32 %0 to i64
+  %add.ptr132 = getelementptr inbounds i8* undef, i64 %conv131
+  %1 = bitcast i8* %add.ptr132 to %type1*
+  br i1 undef, label %if.end150, label %if.end.i173
+
+if.end.i173:                                      ; preds = %if.then126
+  br i1 undef, label %test1.exit, label %cond.false.i.i.i.i174
+
+cond.false.i.i.i.i174:                            ; preds = %if.end.i173
+  unreachable
+
+test1.exit: ; preds = %if.end.i173
+  br i1 undef, label %test1a.exit, label %if.end.i124
+
+if.end.i124:                                      ; preds = %test1.exit
+  unreachable
+
+test1a.exit: ; preds = %test1.exit
+  br i1 undef, label %if.end150, label %for.body137.lr.ph
+
+for.body137.lr.ph:                                ; preds = %test1a.exit
+  br label %for.body137
+
+for.body137:                                      ; preds = %test1b.exit, %for.body137.lr.ph
+  %ShndxTable.0309 = phi %type1* [ %1, %for.body137.lr.ph ], [ %incdec.ptr, %test1b.exit ]
+  %ret.0..sroa_cast.i.i106 = bitcast %type1* %ShndxTable.0309 to i32*
+  br i1 undef, label %for.body137.if.end146_crit_edge, label %if.then140
+
+for.body137.if.end146_crit_edge:                  ; preds = %for.body137
+  %incdec.ptr = getelementptr inbounds %type1* %ShndxTable.0309, i64 1
+  br i1 undef, label %cond.false.i70, label %cond.end.i
+
+if.then140:                                       ; preds = %for.body137
+  %ret.0.copyload.i.i102 = load i32* %ret.0..sroa_cast.i.i106, align 2
+
+; CHECK: load {{.*}} align 2
+
+  unreachable
+
+cond.false.i70:                                   ; preds = %for.body137.if.end146_crit_edge
+  unreachable
+
+cond.end.i:                                       ; preds = %for.body137.if.end146_crit_edge
+  br i1 undef, label %test1b.exit, label %cond.false.i.i
+
+cond.false.i.i:                                   ; preds = %cond.end.i
+  unreachable
+
+test1b.exit: ; preds = %cond.end.i
+  br i1 undef, label %if.end150, label %for.body137
+
+if.end150:                                        ; preds = %test1b.exit, %test1a.exit, %if.then126, %if.end123
+  br i1 undef, label %for.end176, label %for.body155.lr.ph
+
+for.body155.lr.ph:                                ; preds = %if.end150
+  unreachable
+
+for.end176:                                       ; preds = %if.end150
+  unreachable
+
+return:                                           ; preds = %if.end
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind uwtable }
+

diff --git a/test/Transforms/ArgumentPromotion/dbg.ll b/test/Transforms/ArgumentPromotion/dbg.ll
index 70503af..d155750 100644
--- a/test/Transforms/ArgumentPromotion/dbg.ll
+++ b/test/Transforms/ArgumentPromotion/dbg.ll

@@ -1,22 +1,26 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
-; CHECK: call void @test(), !dbg [[DBG_LOC:![0-9]]]
-; CHECK: [[TEST_FN:.*]] = {{.*}} void ()* @test
-; CHECK: [[DBG_LOC]] = metadata !{i32 8, i32 0, metadata [[TEST_FN]], null}
+; CHECK: call void @test(i32 %
+; CHECK: void (i32)* @test, {{.*}} ; [ DW_TAG_subprogram ] {{.*}} [test]
 
-define internal void @test(i32* %X) {
+declare void @sink(i32)
+
+define internal void @test(i32** %X) {
+  %1 = load i32** %X, align 8
+  %2 = load i32* %1, align 8
+  call void @sink(i32 %2)
   ret void
 }
 
-define void @caller() {
-  call void @test(i32* null), !dbg !1
+define void @caller(i32** %Y) {
+  call void @test(i32** %Y)
   ret void
 }
 
 !llvm.module.flags = !{!0}
 !llvm.dbg.cu = !{!3}
 
-!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !1 = metadata !{i32 8, i32 0, metadata !2, null}
-!2 = metadata !{i32 786478, null, null, metadata !"test", metadata !"test", metadata !"", i32 3, null, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*)* @test, null, null, null, i32 3}
-!3 = metadata !{i32 786449, null, i32 4, metadata !"clang version 3.5.0 ", i1 false, metadata !"", i32 0, null, null, metadata !4, null, null, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/pr20038/reduce/<stdin>] [DW_LANG_C_plus_plus]
+!2 = metadata !{metadata !"0x2e\00test\00test\00\003\001\001\000\006\00256\000\003", null, null, null, null, void (i32**)* @test, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = metadata !{metadata !"0x11\004\00clang version 3.5.0 \000\00\000\00\002", null, null, null, metadata !4, null, null} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/pr20038/reduce/<stdin>] [DW_LANG_C_plus_plus]
 !4 = metadata !{metadata !2}

diff --git a/test/Transforms/ArgumentPromotion/fp80.ll b/test/Transforms/ArgumentPromotion/fp80.ll
new file mode 100644
index 0000000..a770d60
--- /dev/null
+++ b/test/Transforms/ArgumentPromotion/fp80.ll

@@ -0,0 +1,58 @@
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%union.u = type { x86_fp80 }
+%struct.s = type { double, i16, i8, [5 x i8] }
+
+@b = internal global %struct.s { double 3.14, i16 9439, i8 25, [5 x i8] undef }, align 16
+
+%struct.Foo = type { i32, i64 }
+@a = internal global %struct.Foo { i32 1, i64 2 }, align 8
+
+define void @run() {
+entry:
+  tail call i8 @UseLongDoubleUnsafely(%union.u* byval align 16 bitcast (%struct.s* @b to %union.u*))
+  tail call x86_fp80 @UseLongDoubleSafely(%union.u* byval align 16 bitcast (%struct.s* @b to %union.u*))
+  call i64 @AccessPaddingOfStruct(%struct.Foo* @a)
+  call i64 @CaptureAStruct(%struct.Foo* @a)
+  ret void
+}
+
+; CHECK: internal i8 @UseLongDoubleUnsafely(%union.u* byval align 16 %arg) {
+define internal i8 @UseLongDoubleUnsafely(%union.u* byval align 16 %arg) {
+entry:
+  %bitcast = bitcast %union.u* %arg to %struct.s*
+  %gep = getelementptr inbounds %struct.s* %bitcast, i64 0, i32 2
+  %result = load i8* %gep
+  ret i8 %result
+}
+
+; CHECK: internal x86_fp80 @UseLongDoubleSafely(x86_fp80 {{%.*}}) {
+define internal x86_fp80 @UseLongDoubleSafely(%union.u* byval align 16 %arg) {
+  %gep = getelementptr inbounds %union.u* %arg, i64 0, i32 0
+  %fp80 = load x86_fp80* %gep
+  ret x86_fp80 %fp80
+}
+
+; CHECK: define internal i64 @AccessPaddingOfStruct(%struct.Foo* byval %a) {
+define internal i64 @AccessPaddingOfStruct(%struct.Foo* byval %a) {
+  %p = bitcast %struct.Foo* %a to i64*
+  %v = load i64* %p
+  ret i64 %v
+}
+
+; CHECK: define internal i64 @CaptureAStruct(%struct.Foo* byval %a) {
+define internal i64 @CaptureAStruct(%struct.Foo* byval %a) {
+entry:
+  %a_ptr = alloca %struct.Foo*
+  br label %loop
+
+loop:
+  %phi = phi %struct.Foo* [ null, %entry ], [ %gep, %loop ]
+  %0   = phi %struct.Foo* [ %a, %entry ],   [ %0, %loop ]
+  store %struct.Foo* %phi, %struct.Foo** %a_ptr
+  %gep = getelementptr %struct.Foo* %a, i64 0
+  br label %loop
+}

diff --git a/test/Transforms/ArgumentPromotion/tail.ll b/test/Transforms/ArgumentPromotion/tail.ll
index 43b8996..2ea387c 100644
--- a/test/Transforms/ArgumentPromotion/tail.ll
+++ b/test/Transforms/ArgumentPromotion/tail.ll

@@ -1,6 +1,8 @@
 ; RUN: opt %s -argpromotion -S -o - | FileCheck %s
 ; PR14710
 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
 %pair = type { i32, i32 }
 
 declare i8* @foo(%pair*)

diff --git a/test/Transforms/ArgumentPromotion/variadic.ll b/test/Transforms/ArgumentPromotion/variadic.ll
new file mode 100644
index 0000000..0ae52b3
--- /dev/null
+++ b/test/Transforms/ArgumentPromotion/variadic.ll

@@ -0,0 +1,28 @@
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+
+; Unused arguments from variadic functions cannot be eliminated as that changes
+; their classification according to the SysV amd64 ABI. Clang and other frontends
+; bake in the classification when they use things like byval, as in this test.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.tt0 = type { i64, i64 }
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+@t45 = internal global %struct.tt0 { i64 1335139741, i64 438042995 }, align 8
+
+; Function Attrs: nounwind uwtable
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
+entry:
+  tail call void (i8*, i8*, i8*, i8*, i8*, ...)* @callee_t0f(i8* undef, i8* undef, i8* undef, i8* undef, i8* undef, %struct.tt0* byval align 8 @t45)
+  ret i32 0
+}
+
+; Function Attrs: nounwind uwtable
+define internal void @callee_t0f(i8* nocapture readnone %tp13, i8* nocapture readnone %tp14, i8* nocapture readnone %tp15, i8* nocapture readnone %tp16, i8* nocapture readnone %tp17, ...) {
+entry:
+  ret void
+}
+
+; CHECK-LABEL: define internal void @callee_t0f(i8* nocapture readnone %tp13, i8* nocapture readnone %tp14, i8* nocapture readnone %tp15, i8* nocapture readnone %tp16, i8* nocapture readnone %tp17, ...)

diff --git a/test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll b/test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll
new file mode 100644
index 0000000..282d42f
--- /dev/null
+++ b/test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll

@@ -0,0 +1,364 @@
+; RUN: opt -S -o - -mtriple=armv7-apple-ios7.0 -atomic-expand %s | FileCheck %s
+
+define i8 @test_atomic_xchg_i8(i8* %ptr, i8 %xchgend) {
+; CHECK-LABEL: @test_atomic_xchg_i8
+; CHECK-NOT: dmb
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[NEWVAL32:%.*]] = zext i8 %xchgend to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK-NOT: dmb
+; CHECK: ret i8 [[OLDVAL]]
+  %res = atomicrmw xchg i8* %ptr, i8 %xchgend monotonic
+  ret i8 %res
+}
+
+define i16 @test_atomic_add_i16(i16* %ptr, i16 %addend) {
+; CHECK-LABEL: @test_atomic_add_i16
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i16
+; CHECK: [[NEWVAL:%.*]] = add i16 [[OLDVAL]], %addend
+; CHECK: [[NEWVAL32:%.*]] = zext i16 [[NEWVAL]] to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: ret i16 [[OLDVAL]]
+  %res = atomicrmw add i16* %ptr, i16 %addend seq_cst
+  ret i16 %res
+}
+
+define i32 @test_atomic_sub_i32(i32* %ptr, i32 %subend) {
+; CHECK-LABEL: @test_atomic_sub_i32
+; CHECK-NOT: dmb
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %ptr)
+; CHECK: [[NEWVAL:%.*]] = sub i32 [[OLDVAL]], %subend
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[NEWVAL]], i32* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: ret i32 [[OLDVAL]]
+  %res = atomicrmw sub i32* %ptr, i32 %subend acquire
+  ret i32 %res
+}
+
+define i8 @test_atomic_and_i8(i8* %ptr, i8 %andend) {
+; CHECK-LABEL: @test_atomic_and_i8
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[NEWVAL:%.*]] = and i8 [[OLDVAL]], %andend
+; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK-NOT: dmb
+; CHECK: ret i8 [[OLDVAL]]
+  %res = atomicrmw and i8* %ptr, i8 %andend release
+  ret i8 %res
+}
+
+define i16 @test_atomic_nand_i16(i16* %ptr, i16 %nandend) {
+; CHECK-LABEL: @test_atomic_nand_i16
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i16
+; CHECK: [[NEWVAL_TMP:%.*]] = and i16 [[OLDVAL]], %nandend
+; CHECK: [[NEWVAL:%.*]] = xor i16 [[NEWVAL_TMP]], -1
+; CHECK: [[NEWVAL32:%.*]] = zext i16 [[NEWVAL]] to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: ret i16 [[OLDVAL]]
+  %res = atomicrmw nand i16* %ptr, i16 %nandend seq_cst
+  ret i16 %res
+}
+
+define i64 @test_atomic_or_i64(i64* %ptr, i64 %orend) {
+; CHECK-LABEL: @test_atomic_or_i64
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
+; CHECK: [[LOHI:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[PTR8]])
+; CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+; CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+; CHECK: [[LO64:%.*]] = zext i32 [[LO]] to i64
+; CHECK: [[HI64_TMP:%.*]] = zext i32 [[HI]] to i64
+; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
+; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
+; CHECK: [[NEWVAL:%.*]] = or i64 [[OLDVAL]], %orend
+; CHECK: [[NEWLO:%.*]] = trunc i64 [[NEWVAL]] to i32
+; CHECK: [[NEWHI_TMP:%.*]] = lshr i64 [[NEWVAL]], 32
+; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
+; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: ret i64 [[OLDVAL]]
+  %res = atomicrmw or i64* %ptr, i64 %orend seq_cst
+  ret i64 %res
+}
+
+define i8 @test_atomic_xor_i8(i8* %ptr, i8 %xorend) {
+; CHECK-LABEL: @test_atomic_xor_i8
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[NEWVAL:%.*]] = xor i8 [[OLDVAL]], %xorend
+; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: ret i8 [[OLDVAL]]
+  %res = atomicrmw xor i8* %ptr, i8 %xorend seq_cst
+  ret i8 %res
+}
+
+define i8 @test_atomic_max_i8(i8* %ptr, i8 %maxend) {
+; CHECK-LABEL: @test_atomic_max_i8
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[WANT_OLD:%.*]] = icmp sgt i8 [[OLDVAL]], %maxend
+; CHECK: [[NEWVAL:%.*]] = select i1 [[WANT_OLD]], i8 [[OLDVAL]], i8 %maxend
+; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: ret i8 [[OLDVAL]]
+  %res = atomicrmw max i8* %ptr, i8 %maxend seq_cst
+  ret i8 %res
+}
+
+define i8 @test_atomic_min_i8(i8* %ptr, i8 %minend) {
+; CHECK-LABEL: @test_atomic_min_i8
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[WANT_OLD:%.*]] = icmp sle i8 [[OLDVAL]], %minend
+; CHECK: [[NEWVAL:%.*]] = select i1 [[WANT_OLD]], i8 [[OLDVAL]], i8 %minend
+; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: ret i8 [[OLDVAL]]
+  %res = atomicrmw min i8* %ptr, i8 %minend seq_cst
+  ret i8 %res
+}
+
+define i8 @test_atomic_umax_i8(i8* %ptr, i8 %umaxend) {
+; CHECK-LABEL: @test_atomic_umax_i8
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[WANT_OLD:%.*]] = icmp ugt i8 [[OLDVAL]], %umaxend
+; CHECK: [[NEWVAL:%.*]] = select i1 [[WANT_OLD]], i8 [[OLDVAL]], i8 %umaxend
+; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: ret i8 [[OLDVAL]]
+  %res = atomicrmw umax i8* %ptr, i8 %umaxend seq_cst
+  ret i8 %res
+}
+
+define i8 @test_atomic_umin_i8(i8* %ptr, i8 %uminend) {
+; CHECK-LABEL: @test_atomic_umin_i8
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[WANT_OLD:%.*]] = icmp ule i8 [[OLDVAL]], %uminend
+; CHECK: [[NEWVAL:%.*]] = select i1 [[WANT_OLD]], i8 [[OLDVAL]], i8 %uminend
+; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: ret i8 [[OLDVAL]]
+  %res = atomicrmw umin i8* %ptr, i8 %uminend seq_cst
+  ret i8 %res
+}
+
+define i8 @test_cmpxchg_i8_seqcst_seqcst(i8* %ptr, i8 %desired, i8 %newval) {
+; CHECK-LABEL: @test_cmpxchg_i8_seqcst_seqcst
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i8 [[OLDVAL]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[NEWVAL32:%.*]] = zext i8 %newval to i32
+; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[DONE:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[DONE]]
+
+; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i8 [[OLDVAL]]
+
+  %pairold = cmpxchg i8* %ptr, i8 %desired, i8 %newval seq_cst seq_cst
+  %old = extractvalue { i8, i1 } %pairold, 0
+  ret i8 %old
+}
+
+define i16 @test_cmpxchg_i16_seqcst_monotonic(i16* %ptr, i16 %desired, i16 %newval) {
+; CHECK-LABEL: @test_cmpxchg_i16_seqcst_monotonic
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[LOOP:.*]]
+
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i16
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i16 [[OLDVAL]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[NEWVAL32:%.*]] = zext i16 %newval to i32
+; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.strex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[DONE:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: dmb
+; CHECK: br label %[[DONE]]
+
+; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i16 [[OLDVAL]]
+
+  %pairold = cmpxchg i16* %ptr, i16 %desired, i16 %newval seq_cst monotonic
+  %old = extractvalue { i16, i1 } %pairold, 0
+  ret i16 %old
+}
+
+define i32 @test_cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %desired, i32 %newval) {
+; CHECK-LABEL: @test_cmpxchg_i32_acquire_acquire
+; CHECK-NOT: dmb
+; CHECK: br label %[[LOOP:.*]]
+
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %ptr)
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i32 [[OLDVAL]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.strex.p0i32(i32 %newval, i32* %ptr)
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[DONE:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK: call void @llvm.arm.dmb(i32 11)
+; CHECK: br label %[[DONE]]
+
+; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i32 [[OLDVAL]]
+
+  %pairold = cmpxchg i32* %ptr, i32 %desired, i32 %newval acquire acquire
+  %old = extractvalue { i32, i1 } %pairold, 0
+  ret i32 %old
+}
+
+define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %newval) {
+; CHECK-LABEL: @test_cmpxchg_i64_monotonic_monotonic
+; CHECK-NOT: dmb
+; CHECK: br label %[[LOOP:.*]]
+
+; CHECK: [[LOOP]]:
+; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
+; CHECK: [[LOHI:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[PTR8]])
+; CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+; CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+; CHECK: [[LO64:%.*]] = zext i32 [[LO]] to i64
+; CHECK: [[HI64_TMP:%.*]] = zext i32 [[HI]] to i64
+; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
+; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i64 [[OLDVAL]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[NEWLO:%.*]] = trunc i64 %newval to i32
+; CHECK: [[NEWHI_TMP:%.*]] = lshr i64 %newval, 32
+; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
+; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: dmb
+; CHECK: br label %[[DONE:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: dmb
+; CHECK: br label %[[DONE]]
+
+; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i64 [[OLDVAL]]
+
+  %pairold = cmpxchg i64* %ptr, i64 %desired, i64 %newval monotonic monotonic
+  %old = extractvalue { i64, i1 } %pairold, 0
+  ret i64 %old
+}

diff --git a/test/Transforms/AtomicExpand/ARM/atomic-expansion-v8.ll b/test/Transforms/AtomicExpand/ARM/atomic-expansion-v8.ll
new file mode 100644
index 0000000..42d7b78
--- /dev/null
+++ b/test/Transforms/AtomicExpand/ARM/atomic-expansion-v8.ll

@@ -0,0 +1,226 @@
+; RUN: opt -S -o - -mtriple=armv8-linux-gnueabihf -atomic-expand %s | FileCheck %s
+
+define i8 @test_atomic_xchg_i8(i8* %ptr, i8 %xchgend) {
+; CHECK-LABEL: @test_atomic_xchg_i8
+; CHECK-NOT: fence
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[NEWVAL32:%.*]] = zext i8 %xchgend to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK-NOT: fence
+; CHECK: ret i8 [[OLDVAL]]
+  %res = atomicrmw xchg i8* %ptr, i8 %xchgend monotonic
+  ret i8 %res
+}
+
+define i16 @test_atomic_add_i16(i16* %ptr, i16 %addend) {
+; CHECK-LABEL: @test_atomic_add_i16
+; CHECK-NOT: fence
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i16
+; CHECK: [[NEWVAL:%.*]] = add i16 [[OLDVAL]], %addend
+; CHECK: [[NEWVAL32:%.*]] = zext i16 [[NEWVAL]] to i32
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.stlex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK-NOT: fence
+; CHECK: ret i16 [[OLDVAL]]
+  %res = atomicrmw add i16* %ptr, i16 %addend seq_cst
+  ret i16 %res
+}
+
+define i32 @test_atomic_sub_i32(i32* %ptr, i32 %subend) {
+; CHECK-LABEL: @test_atomic_sub_i32
+; CHECK-NOT: fence
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* %ptr)
+; CHECK: [[NEWVAL:%.*]] = sub i32 [[OLDVAL]], %subend
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[NEWVAL]], i32* %ptr)
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK-NOT: fence
+; CHECK: ret i32 [[OLDVAL]]
+  %res = atomicrmw sub i32* %ptr, i32 %subend acquire
+  ret i32 %res
+}
+
+define i64 @test_atomic_or_i64(i64* %ptr, i64 %orend) {
+; CHECK-LABEL: @test_atomic_or_i64
+; CHECK-NOT: fence
+; CHECK: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
+; CHECK: [[LOHI:%.*]] = call { i32, i32 } @llvm.arm.ldaexd(i8* [[PTR8]])
+; CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+; CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+; CHECK: [[LO64:%.*]] = zext i32 [[LO]] to i64
+; CHECK: [[HI64_TMP:%.*]] = zext i32 [[HI]] to i64
+; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
+; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
+; CHECK: [[NEWVAL:%.*]] = or i64 [[OLDVAL]], %orend
+; CHECK: [[NEWLO:%.*]] = trunc i64 [[NEWVAL]] to i32
+; CHECK: [[NEWHI_TMP:%.*]] = lshr i64 [[NEWVAL]], 32
+; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
+; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.stlexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
+; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
+; CHECK: [[END]]:
+; CHECK-NOT: fence
+; CHECK: ret i64 [[OLDVAL]]
+  %res = atomicrmw or i64* %ptr, i64 %orend seq_cst
+  ret i64 %res
+}
+
+define i8 @test_cmpxchg_i8_seqcst_seqcst(i8* %ptr, i8 %desired, i8 %newval) {
+; CHECK-LABEL: @test_cmpxchg_i8_seqcst_seqcst
+; CHECK-NOT: fence
+; CHECK: br label %[[LOOP:.*]]
+
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldaex.p0i8(i8* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i8 [[OLDVAL]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[NEWVAL32:%.*]] = zext i8 %newval to i32
+; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.stlex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
+; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i8 [[OLDVAL]]
+
+  %pairold = cmpxchg i8* %ptr, i8 %desired, i8 %newval seq_cst seq_cst
+  %old = extractvalue { i8, i1 } %pairold, 0
+  ret i8 %old
+}
+
+define i16 @test_cmpxchg_i16_seqcst_monotonic(i16* %ptr, i16 %desired, i16 %newval) {
+; CHECK-LABEL: @test_cmpxchg_i16_seqcst_monotonic
+; CHECK-NOT: fence
+; CHECK: br label %[[LOOP:.*]]
+
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* %ptr)
+; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i16
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i16 [[OLDVAL]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[NEWVAL32:%.*]] = zext i16 %newval to i32
+; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.stlex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
+; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i16 [[OLDVAL]]
+
+  %pairold = cmpxchg i16* %ptr, i16 %desired, i16 %newval seq_cst monotonic
+  %old = extractvalue { i16, i1 } %pairold, 0
+  ret i16 %old
+}
+
+define i32 @test_cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %desired, i32 %newval) {
+; CHECK-LABEL: @test_cmpxchg_i32_acquire_acquire
+; CHECK-NOT: fence
+; CHECK: br label %[[LOOP:.*]]
+
+; CHECK: [[LOOP]]:
+; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* %ptr)
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i32 [[OLDVAL]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.strex.p0i32(i32 %newval, i32* %ptr)
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
+; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i32 [[OLDVAL]]
+
+  %pairold = cmpxchg i32* %ptr, i32 %desired, i32 %newval acquire acquire
+  %old = extractvalue { i32, i1 } %pairold, 0
+  ret i32 %old
+}
+
+define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %newval) {
+; CHECK-LABEL: @test_cmpxchg_i64_monotonic_monotonic
+; CHECK-NOT: fence
+; CHECK: br label %[[LOOP:.*]]
+
+; CHECK: [[LOOP]]:
+; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
+; CHECK: [[LOHI:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[PTR8]])
+; CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+; CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+; CHECK: [[LO64:%.*]] = zext i32 [[LO]] to i64
+; CHECK: [[HI64_TMP:%.*]] = zext i32 [[HI]] to i64
+; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
+; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
+; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i64 [[OLDVAL]], %desired
+; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK: [[NEWLO:%.*]] = trunc i64 %newval to i32
+; CHECK: [[NEWHI_TMP:%.*]] = lshr i64 %newval, 32
+; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
+; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
+; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
+; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
+; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: fence
+; CHECK: br label %[[DONE]]
+
+; CHECK: [[DONE]]:
+; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK: ret i64 [[OLDVAL]]
+
+  %pairold = cmpxchg i64* %ptr, i64 %desired, i64 %newval monotonic monotonic
+  %old = extractvalue { i64, i1 } %pairold, 0
+  ret i64 %old
+}

diff --git a/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll b/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll
new file mode 100644
index 0000000..5465300
--- /dev/null
+++ b/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll

@@ -0,0 +1,98 @@
+; RUN: opt -atomic-expand -S -mtriple=thumbv7s-apple-ios7.0 %s | FileCheck %s
+
+define i32 @test_cmpxchg_seq_cst(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: @test_cmpxchg_seq_cst
+; Intrinsic for "dmb ishst" is then expected
+; CHECK:     call void @llvm.arm.dmb(i32 10)
+; CHECK:     br label %[[START:.*]]
+
+; CHECK: [[START]]:
+; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
+; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
+; CHECK:     br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %addr)
+; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
+; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK:     call void @llvm.arm.dmb(i32 11)
+; CHECK:     br label %[[END:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK:     call void @llvm.arm.dmb(i32 11)
+; CHECK:     br label %[[END]]
+
+; CHECK: [[END]]:
+; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK:     ret i32 [[LOADED]]
+
+  %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+  %oldval = extractvalue { i32, i1 } %pair, 0
+  ret i32 %oldval
+}
+
+define i1 @test_cmpxchg_weak_fail(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: @test_cmpxchg_weak_fail
+; CHECK:     call void @llvm.arm.dmb(i32 10)
+; CHECK:     br label %[[START:.*]]
+
+; CHECK: [[START]]:
+; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
+; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
+; CHECK:     br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %addr)
+; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
+; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK:     call void @llvm.arm.dmb(i32 11)
+; CHECK:     br label %[[END:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: dmb
+; CHECK:     br label %[[END]]
+
+; CHECK: [[END]]:
+; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK:     ret i1 [[SUCCESS]]
+
+  %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
+  %oldval = extractvalue { i32, i1 } %pair, 1
+  ret i1 %oldval
+}
+
+define i32 @test_cmpxchg_monotonic(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: @test_cmpxchg_monotonic
+; CHECK-NOT: dmb
+; CHECK:     br label %[[START:.*]]
+
+; CHECK: [[START]]:
+; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
+; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
+; CHECK:     br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
+
+; CHECK: [[TRY_STORE]]:
+; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %addr)
+; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
+; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB]]
+
+; CHECK: [[SUCCESS_BB]]:
+; CHECK-NOT: dmb
+; CHECK:     br label %[[END:.*]]
+
+; CHECK: [[FAILURE_BB]]:
+; CHECK-NOT: dmb
+; CHECK:     br label %[[END]]
+
+; CHECK: [[END]]:
+; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
+; CHECK:     ret i32 [[LOADED]]
+
+  %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new monotonic monotonic
+  %oldval = extractvalue { i32, i1 } %pair, 0
+  ret i32 %oldval
+}

diff --git a/test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg b/test/Transforms/AtomicExpand/ARM/lit.local.cfg
similarity index 100%
rename from test/Transforms/AtomicExpandLoadLinked/ARM/lit.local.cfg
rename to test/Transforms/AtomicExpand/ARM/lit.local.cfg


diff --git a/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v7.ll b/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v7.ll
deleted file mode 100644
index 6a93016..0000000
--- a/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v7.ll
+++ /dev/null

@@ -1,364 +0,0 @@
-; RUN: opt -S -o - -mtriple=armv7-apple-ios7.0 -atomic-ll-sc %s | FileCheck %s
-
-define i8 @test_atomic_xchg_i8(i8* %ptr, i8 %xchgend) {
-; CHECK-LABEL: @test_atomic_xchg_i8
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
-; CHECK: [[NEWVAL32:%.*]] = zext i8 %xchgend to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK-NOT: fence
-; CHECK: ret i8 [[OLDVAL]]
-  %res = atomicrmw xchg i8* %ptr, i8 %xchgend monotonic
-  ret i8 %res
-}
-
-define i16 @test_atomic_add_i16(i16* %ptr, i16 %addend) {
-; CHECK-LABEL: @test_atomic_add_i16
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i16
-; CHECK: [[NEWVAL:%.*]] = add i16 [[OLDVAL]], %addend
-; CHECK: [[NEWVAL32:%.*]] = zext i16 [[NEWVAL]] to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK: fence seq_cst
-; CHECK: ret i16 [[OLDVAL]]
-  %res = atomicrmw add i16* %ptr, i16 %addend seq_cst
-  ret i16 %res
-}
-
-define i32 @test_atomic_sub_i32(i32* %ptr, i32 %subend) {
-; CHECK-LABEL: @test_atomic_sub_i32
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %ptr)
-; CHECK: [[NEWVAL:%.*]] = sub i32 [[OLDVAL]], %subend
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[NEWVAL]], i32* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK: fence acquire
-; CHECK: ret i32 [[OLDVAL]]
-  %res = atomicrmw sub i32* %ptr, i32 %subend acquire
-  ret i32 %res
-}
-
-define i8 @test_atomic_and_i8(i8* %ptr, i8 %andend) {
-; CHECK-LABEL: @test_atomic_and_i8
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
-; CHECK: [[NEWVAL:%.*]] = and i8 [[OLDVAL]], %andend
-; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK-NOT: fence
-; CHECK: ret i8 [[OLDVAL]]
-  %res = atomicrmw and i8* %ptr, i8 %andend release
-  ret i8 %res
-}
-
-define i16 @test_atomic_nand_i16(i16* %ptr, i16 %nandend) {
-; CHECK-LABEL: @test_atomic_nand_i16
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i16
-; CHECK: [[NEWVAL_TMP:%.*]] = and i16 [[OLDVAL]], %nandend
-; CHECK: [[NEWVAL:%.*]] = xor i16 [[NEWVAL_TMP]], -1
-; CHECK: [[NEWVAL32:%.*]] = zext i16 [[NEWVAL]] to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK: fence seq_cst
-; CHECK: ret i16 [[OLDVAL]]
-  %res = atomicrmw nand i16* %ptr, i16 %nandend seq_cst
-  ret i16 %res
-}
-
-define i64 @test_atomic_or_i64(i64* %ptr, i64 %orend) {
-; CHECK-LABEL: @test_atomic_or_i64
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
-; CHECK: [[LOHI:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[PTR8]])
-; CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
-; CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
-; CHECK: [[LO64:%.*]] = zext i32 [[LO]] to i64
-; CHECK: [[HI64_TMP:%.*]] = zext i32 [[HI]] to i64
-; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
-; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
-; CHECK: [[NEWVAL:%.*]] = or i64 [[OLDVAL]], %orend
-; CHECK: [[NEWLO:%.*]] = trunc i64 [[NEWVAL]] to i32
-; CHECK: [[NEWHI_TMP:%.*]] = lshr i64 [[NEWVAL]], 32
-; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
-; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK: fence seq_cst
-; CHECK: ret i64 [[OLDVAL]]
-  %res = atomicrmw or i64* %ptr, i64 %orend seq_cst
-  ret i64 %res
-}
-
-define i8 @test_atomic_xor_i8(i8* %ptr, i8 %xorend) {
-; CHECK-LABEL: @test_atomic_xor_i8
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
-; CHECK: [[NEWVAL:%.*]] = xor i8 [[OLDVAL]], %xorend
-; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK: fence seq_cst
-; CHECK: ret i8 [[OLDVAL]]
-  %res = atomicrmw xor i8* %ptr, i8 %xorend seq_cst
-  ret i8 %res
-}
-
-define i8 @test_atomic_max_i8(i8* %ptr, i8 %maxend) {
-; CHECK-LABEL: @test_atomic_max_i8
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
-; CHECK: [[WANT_OLD:%.*]] = icmp sgt i8 [[OLDVAL]], %maxend
-; CHECK: [[NEWVAL:%.*]] = select i1 [[WANT_OLD]], i8 [[OLDVAL]], i8 %maxend
-; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK: fence seq_cst
-; CHECK: ret i8 [[OLDVAL]]
-  %res = atomicrmw max i8* %ptr, i8 %maxend seq_cst
-  ret i8 %res
-}
-
-define i8 @test_atomic_min_i8(i8* %ptr, i8 %minend) {
-; CHECK-LABEL: @test_atomic_min_i8
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
-; CHECK: [[WANT_OLD:%.*]] = icmp sle i8 [[OLDVAL]], %minend
-; CHECK: [[NEWVAL:%.*]] = select i1 [[WANT_OLD]], i8 [[OLDVAL]], i8 %minend
-; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK: fence seq_cst
-; CHECK: ret i8 [[OLDVAL]]
-  %res = atomicrmw min i8* %ptr, i8 %minend seq_cst
-  ret i8 %res
-}
-
-define i8 @test_atomic_umax_i8(i8* %ptr, i8 %umaxend) {
-; CHECK-LABEL: @test_atomic_umax_i8
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
-; CHECK: [[WANT_OLD:%.*]] = icmp ugt i8 [[OLDVAL]], %umaxend
-; CHECK: [[NEWVAL:%.*]] = select i1 [[WANT_OLD]], i8 [[OLDVAL]], i8 %umaxend
-; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK: fence seq_cst
-; CHECK: ret i8 [[OLDVAL]]
-  %res = atomicrmw umax i8* %ptr, i8 %umaxend seq_cst
-  ret i8 %res
-}
-
-define i8 @test_atomic_umin_i8(i8* %ptr, i8 %uminend) {
-; CHECK-LABEL: @test_atomic_umin_i8
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
-; CHECK: [[WANT_OLD:%.*]] = icmp ule i8 [[OLDVAL]], %uminend
-; CHECK: [[NEWVAL:%.*]] = select i1 [[WANT_OLD]], i8 [[OLDVAL]], i8 %uminend
-; CHECK: [[NEWVAL32:%.*]] = zext i8 [[NEWVAL]] to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK: fence seq_cst
-; CHECK: ret i8 [[OLDVAL]]
-  %res = atomicrmw umin i8* %ptr, i8 %uminend seq_cst
-  ret i8 %res
-}
-
-define i8 @test_cmpxchg_i8_seqcst_seqcst(i8* %ptr, i8 %desired, i8 %newval) {
-; CHECK-LABEL: @test_cmpxchg_i8_seqcst_seqcst
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 %1 to i8
-; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i8 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK: [[NEWVAL32:%.*]] = zext i8 %newval to i32
-; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK: fence seq_cst
-; CHECK: br label %[[DONE:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK: fence seq_cst
-; CHECK: br label %[[DONE]]
-
-; CHECK: [[DONE]]:
-; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK: ret i8 [[OLDVAL]]
-
-  %pairold = cmpxchg i8* %ptr, i8 %desired, i8 %newval seq_cst seq_cst
-  %old = extractvalue { i8, i1 } %pairold, 0
-  ret i8 %old
-}
-
-define i16 @test_cmpxchg_i16_seqcst_monotonic(i16* %ptr, i16 %desired, i16 %newval) {
-; CHECK-LABEL: @test_cmpxchg_i16_seqcst_monotonic
-; CHECK: fence release
-; CHECK: br label %[[LOOP:.*]]
-
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 %1 to i16
-; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i16 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK: [[NEWVAL32:%.*]] = zext i16 %newval to i32
-; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.strex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
-; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK: fence seq_cst
-; CHECK: br label %[[DONE:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK-NOT: fence
-; CHECK: br label %[[DONE]]
-
-; CHECK: [[DONE]]:
-; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK: ret i16 [[OLDVAL]]
-
-  %pairold = cmpxchg i16* %ptr, i16 %desired, i16 %newval seq_cst monotonic
-  %old = extractvalue { i16, i1 } %pairold, 0
-  ret i16 %old
-}
-
-define i32 @test_cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %desired, i32 %newval) {
-; CHECK-LABEL: @test_cmpxchg_i32_acquire_acquire
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %ptr)
-; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i32 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.strex.p0i32(i32 %newval, i32* %ptr)
-; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK: fence acquire
-; CHECK: br label %[[DONE:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK: fence acquire
-; CHECK: br label %[[DONE]]
-
-; CHECK: [[DONE]]:
-; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK: ret i32 [[OLDVAL]]
-
-  %pairold = cmpxchg i32* %ptr, i32 %desired, i32 %newval acquire acquire
-  %old = extractvalue { i32, i1 } %pairold, 0
-  ret i32 %old
-}
-
-define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %newval) {
-; CHECK-LABEL: @test_cmpxchg_i64_monotonic_monotonic
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-
-; CHECK: [[LOOP]]:
-; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
-; CHECK: [[LOHI:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[PTR8]])
-; CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
-; CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
-; CHECK: [[LO64:%.*]] = zext i32 [[LO]] to i64
-; CHECK: [[HI64_TMP:%.*]] = zext i32 [[HI]] to i64
-; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
-; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
-; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i64 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK: [[NEWLO:%.*]] = trunc i64 %newval to i32
-; CHECK: [[NEWHI_TMP:%.*]] = lshr i64 %newval, 32
-; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
-; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
-; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK-NOT: fence
-; CHECK: br label %[[DONE:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK-NOT: fence
-; CHECK: br label %[[DONE]]
-
-; CHECK: [[DONE]]:
-; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK: ret i64 [[OLDVAL]]
-
-  %pairold = cmpxchg i64* %ptr, i64 %desired, i64 %newval monotonic monotonic
-  %old = extractvalue { i64, i1 } %pairold, 0
-  ret i64 %old
-}
\ No newline at end of file

diff --git a/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v8.ll b/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v8.ll
deleted file mode 100644
index 8092c10..0000000
--- a/test/Transforms/AtomicExpandLoadLinked/ARM/atomic-expansion-v8.ll
+++ /dev/null

@@ -1,226 +0,0 @@
-; RUN: opt -S -o - -mtriple=armv8-linux-gnueabihf -atomic-ll-sc %s | FileCheck %s
-
-define i8 @test_atomic_xchg_i8(i8* %ptr, i8 %xchgend) {
-; CHECK-LABEL: @test_atomic_xchg_i8
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i8
-; CHECK: [[NEWVAL32:%.*]] = zext i8 %xchgend to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK-NOT: fence
-; CHECK: ret i8 [[OLDVAL]]
-  %res = atomicrmw xchg i8* %ptr, i8 %xchgend monotonic
-  ret i8 %res
-}
-
-define i16 @test_atomic_add_i16(i16* %ptr, i16 %addend) {
-; CHECK-LABEL: @test_atomic_add_i16
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 [[OLDVAL32]] to i16
-; CHECK: [[NEWVAL:%.*]] = add i16 [[OLDVAL]], %addend
-; CHECK: [[NEWVAL32:%.*]] = zext i16 [[NEWVAL]] to i32
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.stlex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK-NOT: fence
-; CHECK: ret i16 [[OLDVAL]]
-  %res = atomicrmw add i16* %ptr, i16 %addend seq_cst
-  ret i16 %res
-}
-
-define i32 @test_atomic_sub_i32(i32* %ptr, i32 %subend) {
-; CHECK-LABEL: @test_atomic_sub_i32
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* %ptr)
-; CHECK: [[NEWVAL:%.*]] = sub i32 [[OLDVAL]], %subend
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[NEWVAL]], i32* %ptr)
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK-NOT: fence
-; CHECK: ret i32 [[OLDVAL]]
-  %res = atomicrmw sub i32* %ptr, i32 %subend acquire
-  ret i32 %res
-}
-
-define i64 @test_atomic_or_i64(i64* %ptr, i64 %orend) {
-; CHECK-LABEL: @test_atomic_or_i64
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
-; CHECK: [[LOHI:%.*]] = call { i32, i32 } @llvm.arm.ldaexd(i8* [[PTR8]])
-; CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
-; CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
-; CHECK: [[LO64:%.*]] = zext i32 [[LO]] to i64
-; CHECK: [[HI64_TMP:%.*]] = zext i32 [[HI]] to i64
-; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
-; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
-; CHECK: [[NEWVAL:%.*]] = or i64 [[OLDVAL]], %orend
-; CHECK: [[NEWLO:%.*]] = trunc i64 [[NEWVAL]] to i32
-; CHECK: [[NEWHI_TMP:%.*]] = lshr i64 [[NEWVAL]], 32
-; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
-; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.stlexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
-; CHECK: [[TST:%.*]] = icmp ne i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[LOOP]], label %[[END:.*]]
-; CHECK: [[END]]:
-; CHECK-NOT: fence
-; CHECK: ret i64 [[OLDVAL]]
-  %res = atomicrmw or i64* %ptr, i64 %orend seq_cst
-  ret i64 %res
-}
-
-define i8 @test_cmpxchg_i8_seqcst_seqcst(i8* %ptr, i8 %desired, i8 %newval) {
-; CHECK-LABEL: @test_cmpxchg_i8_seqcst_seqcst
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldaex.p0i8(i8* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 %1 to i8
-; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i8 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK: [[NEWVAL32:%.*]] = zext i8 %newval to i32
-; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.stlex.p0i8(i32 [[NEWVAL32]], i8* %ptr)
-; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK-NOT: fence_cst
-; CHECK: br label %[[DONE:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK-NOT: fence_cst
-; CHECK: br label %[[DONE]]
-
-; CHECK: [[DONE]]:
-; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK: ret i8 [[OLDVAL]]
-
-  %pairold = cmpxchg i8* %ptr, i8 %desired, i8 %newval seq_cst seq_cst
-  %old = extractvalue { i8, i1 } %pairold, 0
-  ret i8 %old
-}
-
-define i16 @test_cmpxchg_i16_seqcst_monotonic(i16* %ptr, i16 %desired, i16 %newval) {
-; CHECK-LABEL: @test_cmpxchg_i16_seqcst_monotonic
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL32:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* %ptr)
-; CHECK: [[OLDVAL:%.*]] = trunc i32 %1 to i16
-; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i16 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK: [[NEWVAL32:%.*]] = zext i16 %newval to i32
-; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.stlex.p0i16(i32 [[NEWVAL32]], i16* %ptr)
-; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK-NOT: fence
-; CHECK: br label %[[DONE:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK-NOT: fence
-; CHECK: br label %[[DONE]]
-
-; CHECK: [[DONE]]:
-; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK: ret i16 [[OLDVAL]]
-
-  %pairold = cmpxchg i16* %ptr, i16 %desired, i16 %newval seq_cst monotonic
-  %old = extractvalue { i16, i1 } %pairold, 0
-  ret i16 %old
-}
-
-define i32 @test_cmpxchg_i32_acquire_acquire(i32* %ptr, i32 %desired, i32 %newval) {
-; CHECK-LABEL: @test_cmpxchg_i32_acquire_acquire
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-
-; CHECK: [[LOOP]]:
-; CHECK: [[OLDVAL:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* %ptr)
-; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i32 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK: [[TRYAGAIN:%.*]] =  call i32 @llvm.arm.strex.p0i32(i32 %newval, i32* %ptr)
-; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK-NOT: fence_cst
-; CHECK: br label %[[DONE:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK-NOT: fence_cst
-; CHECK: br label %[[DONE]]
-
-; CHECK: [[DONE]]:
-; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK: ret i32 [[OLDVAL]]
-
-  %pairold = cmpxchg i32* %ptr, i32 %desired, i32 %newval acquire acquire
-  %old = extractvalue { i32, i1 } %pairold, 0
-  ret i32 %old
-}
-
-define i64 @test_cmpxchg_i64_monotonic_monotonic(i64* %ptr, i64 %desired, i64 %newval) {
-; CHECK-LABEL: @test_cmpxchg_i64_monotonic_monotonic
-; CHECK-NOT: fence
-; CHECK: br label %[[LOOP:.*]]
-
-; CHECK: [[LOOP]]:
-; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
-; CHECK: [[LOHI:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[PTR8]])
-; CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
-; CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
-; CHECK: [[LO64:%.*]] = zext i32 [[LO]] to i64
-; CHECK: [[HI64_TMP:%.*]] = zext i32 [[HI]] to i64
-; CHECK: [[HI64:%.*]] = shl i64 [[HI64_TMP]], 32
-; CHECK: [[OLDVAL:%.*]] = or i64 [[LO64]], [[HI64]]
-; CHECK: [[SHOULD_STORE:%.*]] = icmp eq i64 [[OLDVAL]], %desired
-; CHECK: br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK: [[NEWLO:%.*]] = trunc i64 %newval to i32
-; CHECK: [[NEWHI_TMP:%.*]] = lshr i64 %newval, 32
-; CHECK: [[NEWHI:%.*]] = trunc i64 [[NEWHI_TMP]] to i32
-; CHECK: [[PTR8:%.*]] = bitcast i64* %ptr to i8*
-; CHECK: [[TRYAGAIN:%.*]] = call i32 @llvm.arm.strexd(i32 [[NEWLO]], i32 [[NEWHI]], i8* [[PTR8]])
-; CHECK: [[TST:%.*]] = icmp eq i32 [[TRYAGAIN]], 0
-; CHECK: br i1 [[TST]], label %[[SUCCESS_BB:.*]], label %[[LOOP]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK-NOT: fence_cst
-; CHECK: br label %[[DONE:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK-NOT: fence_cst
-; CHECK: br label %[[DONE]]
-
-; CHECK: [[DONE]]:
-; CHECK: [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK: ret i64 [[OLDVAL]]
-
-  %pairold = cmpxchg i64* %ptr, i64 %desired, i64 %newval monotonic monotonic
-  %old = extractvalue { i64, i1 } %pairold, 0
-  ret i64 %old
-}
\ No newline at end of file

diff --git a/test/Transforms/AtomicExpandLoadLinked/ARM/cmpxchg-weak.ll b/test/Transforms/AtomicExpandLoadLinked/ARM/cmpxchg-weak.ll
deleted file mode 100644
index 07a4a7f..0000000
--- a/test/Transforms/AtomicExpandLoadLinked/ARM/cmpxchg-weak.ll
+++ /dev/null

@@ -1,97 +0,0 @@
-; RUN: opt -atomic-ll-sc -S -mtriple=thumbv7s-apple-ios7.0 %s | FileCheck %s
-
-define i32 @test_cmpxchg_seq_cst(i32* %addr, i32 %desired, i32 %new) {
-; CHECK-LABEL: @test_cmpxchg_seq_cst
-; CHECK:     fence release
-; CHECK:     br label %[[START:.*]]
-
-; CHECK: [[START]]:
-; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
-; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
-; CHECK:     br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %addr)
-; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
-; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK:     fence seq_cst
-; CHECK:     br label %[[END:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK:     fence seq_cst
-; CHECK:     br label %[[END]]
-
-; CHECK: [[END]]:
-; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK:     ret i32 [[LOADED]]
-
-  %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
-  %oldval = extractvalue { i32, i1 } %pair, 0
-  ret i32 %oldval
-}
-
-define i1 @test_cmpxchg_weak_fail(i32* %addr, i32 %desired, i32 %new) {
-; CHECK-LABEL: @test_cmpxchg_weak_fail
-; CHECK:     fence release
-; CHECK:     br label %[[START:.*]]
-
-; CHECK: [[START]]:
-; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
-; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
-; CHECK:     br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %addr)
-; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
-; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK:     fence seq_cst
-; CHECK:     br label %[[END:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK-NOT: fence
-; CHECK:     br label %[[END]]
-
-; CHECK: [[END]]:
-; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK:     ret i1 [[SUCCESS]]
-
-  %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic
-  %oldval = extractvalue { i32, i1 } %pair, 1
-  ret i1 %oldval
-}
-
-define i32 @test_cmpxchg_monotonic(i32* %addr, i32 %desired, i32 %new) {
-; CHECK-LABEL: @test_cmpxchg_monotonic
-; CHECK-NOT: fence
-; CHECK:     br label %[[START:.*]]
-
-; CHECK: [[START]]:
-; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
-; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
-; CHECK:     br i1 [[SHOULD_STORE]], label %[[TRY_STORE:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 %new, i32* %addr)
-; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
-; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK-NOT: fence
-; CHECK:     br label %[[END:.*]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK-NOT: fence
-; CHECK:     br label %[[END]]
-
-; CHECK: [[END]]:
-; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK:     ret i32 [[LOADED]]
-
-  %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new monotonic monotonic
-  %oldval = extractvalue { i32, i1 } %pair, 0
-  ret i32 %oldval
-}

diff --git a/test/Transforms/BranchFolding/2007-10-19-InlineAsmDirectives.ll b/test/Transforms/BranchFolding/2007-10-19-InlineAsmDirectives.ll
index 598ea0e..d4b94fe 100644
--- a/test/Transforms/BranchFolding/2007-10-19-InlineAsmDirectives.ll
+++ b/test/Transforms/BranchFolding/2007-10-19-InlineAsmDirectives.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts -o - | llc -no-integrated-as -o - | grep bork_directive | wc -l | grep 2
+; RUN: opt < %s -O3 -o - | llc -no-integrated-as -o - | grep bork_directive | wc -l | grep 2
 
 ;; We don't want branch folding to fold asm directives.
 

diff --git a/test/Transforms/CodeGenPrepare/AArch64/lit.local.cfg b/test/Transforms/CodeGenPrepare/AArch64/lit.local.cfg
new file mode 100644
index 0000000..cec29af
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/AArch64/lit.local.cfg

@@ -0,0 +1,3 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
+

diff --git a/test/Transforms/CodeGenPrepare/AArch64/trunc-weird-user.ll b/test/Transforms/CodeGenPrepare/AArch64/trunc-weird-user.ll
new file mode 100644
index 0000000..b4e6a40
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/AArch64/trunc-weird-user.ll

@@ -0,0 +1,36 @@
+; RUN: opt -S -codegenprepare -mtriple=arm64-apple-ios7.0 %s | FileCheck %s
+
+%foo = type { i8 }
+
+define %foo @test_merge(i32 %in) {
+; CHECK-LABEL: @test_merge
+
+  ; CodeGenPrepare was requesting the EVT for { i8 } to determine
+  ; whether the insertvalue user of the trunc was legal. This
+  ; asserted.
+
+; CHECK: insertvalue %foo undef, i8 %byte, 0
+  %lobit = lshr i32 %in, 31
+  %byte = trunc i32 %lobit to i8
+  %struct = insertvalue %foo undef, i8 %byte, 0
+  ret %"foo" %struct
+}
+
+define i64* @test_merge_PR21548(i32 %a, i64* %p1, i64* %p2, i64* %p3) {
+; CHECK-LABEL: @test_merge_PR21548
+  %as = lshr i32 %a, 3
+  %Tr = trunc i32 %as to i1
+  br i1 %Tr, label %BB2, label %BB3
+
+BB2:
+  ; Similarly to above:
+  ; CodeGenPrepare was requesting the EVT for i64* to determine
+  ; whether the select user of the trunc was legal. This asserted.
+
+; CHECK: select i1 {{%.*}}, i64* %p1, i64* %p2
+  %p = select i1 %Tr, i64* %p1, i64* %p2
+  ret i64* %p
+
+BB3:
+  ret i64* %p3
+}

diff --git a/test/Transforms/ConstProp/loads.ll b/test/Transforms/ConstProp/loads.ll
index 0ea9c47..5a23dad 100644
--- a/test/Transforms/ConstProp/loads.ll
+++ b/test/Transforms/ConstProp/loads.ll

@@ -36,6 +36,19 @@
 ; BE: ret i16 -8531
 }
 
+define i16 @test2_addrspacecast() {
+  %r = load i16 addrspace(1)* addrspacecast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16 addrspace(1)*)
+  ret i16 %r
+
+; 0xBEEF
+; LE-LABEL: @test2_addrspacecast(
+; LE: ret i16 -16657
+
+; 0xDEAD
+; BE-LABEL: @test2_addrspacecast(
+; BE: ret i16 -8531
+}
+
 ; Load of second 16 bits of 32-bit value.
 define i16 @test3() {
   %r = load i16* getelementptr(i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*), i32 1)

diff --git a/test/Transforms/ConstProp/trunc_vec.ll b/test/Transforms/ConstProp/trunc_vec.ll
new file mode 100644
index 0000000..99db329
--- /dev/null
+++ b/test/Transforms/ConstProp/trunc_vec.ll

@@ -0,0 +1,9 @@
+; RUN: opt -constprop < %s
+
+; Make sure constant-folding a trunc of a bitcast constant vector does not crash.
+
+define <8 x i8> @test_truc_vec() {
+  %x = bitcast <2 x i64> <i64 1, i64 2> to <8 x i16>
+  %y = trunc <8 x i16> %x to <8 x i8>
+  ret <8 x i8> %y
+}

diff --git a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
index 26982db..077394f 100644
--- a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
+++ b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll

@@ -4,24 +4,24 @@
 
 define i8* @vfs_addname(i8* %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp {
 entry:
-  call void @llvm.dbg.value(metadata !{i8* %name}, i64 0, metadata !0)
-  call void @llvm.dbg.value(metadata !{i32 %len}, i64 0, metadata !10)
-  call void @llvm.dbg.value(metadata !{i32 %hash}, i64 0, metadata !11)
-  call void @llvm.dbg.value(metadata !{i32 %flags}, i64 0, metadata !12)
+  call void @llvm.dbg.value(metadata !{i8* %name}, i64 0, metadata !0, metadata !{})
+  call void @llvm.dbg.value(metadata !{i32 %len}, i64 0, metadata !10, metadata !{})
+  call void @llvm.dbg.value(metadata !{i32 %hash}, i64 0, metadata !11, metadata !{})
+  call void @llvm.dbg.value(metadata !{i32 %flags}, i64 0, metadata !12, metadata !{})
 ; CHECK:  call fastcc i8* @add_name_internal(i8* %name, i32 %hash) [[NUW:#[0-9]+]], !dbg !{{[0-9]+}}
   %0 = call fastcc i8* @add_name_internal(i8* %name, i32 %len, i32 %hash, i8 zeroext 0, i32 %flags) nounwind, !dbg !13 ; <i8*> [#uses=1]
   ret i8* %0, !dbg !13
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define internal fastcc i8* @add_name_internal(i8* %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp {
 entry:
-  call void @llvm.dbg.value(metadata !{i8* %name}, i64 0, metadata !15)
-  call void @llvm.dbg.value(metadata !{i32 %len}, i64 0, metadata !20)
-  call void @llvm.dbg.value(metadata !{i32 %hash}, i64 0, metadata !21)
-  call void @llvm.dbg.value(metadata !{i8 %extra}, i64 0, metadata !22)
-  call void @llvm.dbg.value(metadata !{i32 %flags}, i64 0, metadata !23)
+  call void @llvm.dbg.value(metadata !{i8* %name}, i64 0, metadata !15, metadata !{})
+  call void @llvm.dbg.value(metadata !{i32 %len}, i64 0, metadata !20, metadata !{})
+  call void @llvm.dbg.value(metadata !{i32 %hash}, i64 0, metadata !21, metadata !{})
+  call void @llvm.dbg.value(metadata !{i8 %extra}, i64 0, metadata !22, metadata !{})
+  call void @llvm.dbg.value(metadata !{i32 %flags}, i64 0, metadata !23, metadata !{})
   %0 = icmp eq i32 %hash, 0, !dbg !24             ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !24
 
@@ -36,7 +36,7 @@
   ret i8* %.0, !dbg !27
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 ; CHECK: attributes #0 = { nounwind ssp }
 ; CHECK: attributes #1 = { nounwind readnone }
@@ -45,34 +45,34 @@
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!30}
-!0 = metadata !{i32 524545, metadata !1, metadata !"name", metadata !2, i32 8, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 524334, metadata !28, metadata !2, metadata !"vfs_addname", metadata !"vfs_addname", metadata !"vfs_addname", i32 12, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 524329, metadata !28} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 524305, metadata !28, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !29, metadata !29, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !28, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00name\008\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00vfs_addname\00vfs_addname\00vfs_addname\0012\000\001\000\006\000\000\000", metadata !28, metadata !2, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", metadata !28, metadata !29, metadata !29, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !28, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !6, metadata !9, metadata !9, metadata !9}
-!6 = metadata !{i32 524303, metadata !28, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 524326, metadata !28, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ]
-!8 = metadata !{i32 524324, metadata !28, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 524324, metadata !28, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 524545, metadata !1, metadata !"len", metadata !2, i32 9, metadata !9} ; [ DW_TAG_arg_variable ]
-!11 = metadata !{i32 524545, metadata !1, metadata !"hash", metadata !2, i32 10, metadata !9} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 524545, metadata !1, metadata !"flags", metadata !2, i32 11, metadata !9} ; [ DW_TAG_arg_variable ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !28, metadata !2, metadata !7} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0x26\00\000\008\008\000\000", metadata !28, metadata !2, metadata !8} ; [ DW_TAG_const_type ]
+!8 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", metadata !28, metadata !2} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", metadata !28, metadata !2} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x101\00len\009\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!11 = metadata !{metadata !"0x101\00hash\0010\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{metadata !"0x101\00flags\0011\000", metadata !1, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
 !13 = metadata !{i32 13, i32 0, metadata !14, null}
-!14 = metadata !{i32 524299, metadata !28, metadata !1, i32 12, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 524545, metadata !16, metadata !"name", metadata !2, i32 17, metadata !6} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 524334, metadata !28, metadata !2, metadata !"add_name_internal", metadata !"add_name_internal", metadata !"add_name_internal", i32 22, metadata !17, i1 true, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 524309, metadata !28, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0xb\0012\000\000", metadata !28, metadata !1} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0x101\00name\0017\000", metadata !16, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x2e\00add_name_internal\00add_name_internal\00add_name_internal\0022\001\001\000\006\000\000\000", metadata !28, metadata !2, metadata !17, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !28, metadata !2, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{metadata !6, metadata !6, metadata !9, metadata !9, metadata !19, metadata !9}
-!19 = metadata !{i32 524324, metadata !28, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!20 = metadata !{i32 524545, metadata !16, metadata !"len", metadata !2, i32 18, metadata !9} ; [ DW_TAG_arg_variable ]
-!21 = metadata !{i32 524545, metadata !16, metadata !"hash", metadata !2, i32 19, metadata !9} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{i32 524545, metadata !16, metadata !"extra", metadata !2, i32 20, metadata !19} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{i32 524545, metadata !16, metadata !"flags", metadata !2, i32 21, metadata !9} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{metadata !"0x24\00unsigned char\000\008\008\000\000\008", metadata !28, metadata !2} ; [ DW_TAG_base_type ]
+!20 = metadata !{metadata !"0x101\00len\0018\000", metadata !16, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{metadata !"0x101\00hash\0019\000", metadata !16, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{metadata !"0x101\00extra\0020\000", metadata !16, metadata !2, metadata !19} ; [ DW_TAG_arg_variable ]
+!23 = metadata !{metadata !"0x101\00flags\0021\000", metadata !16, metadata !2, metadata !9} ; [ DW_TAG_arg_variable ]
 !24 = metadata !{i32 23, i32 0, metadata !25, null}
-!25 = metadata !{i32 524299, metadata !28, metadata !16, i32 22, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!25 = metadata !{metadata !"0xb\0022\000\000", metadata !28, metadata !16} ; [ DW_TAG_lexical_block ]
 !26 = metadata !{i32 24, i32 0, metadata !25, null}
 !27 = metadata !{i32 26, i32 0, metadata !25, null}
 !28 = metadata !{metadata !"tail.c", metadata !"/Users/echeng/LLVM/radars/r7927803/"}
 !29 = metadata !{i32 0}
-!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/DeadArgElim/dbginfo.ll b/test/Transforms/DeadArgElim/dbginfo.ll
index 7bdcbf5..b457f01 100644
--- a/test/Transforms/DeadArgElim/dbginfo.ll
+++ b/test/Transforms/DeadArgElim/dbginfo.ll

@@ -1,65 +1,70 @@
 ; RUN: opt -deadargelim -S < %s | FileCheck %s
 ; PR14016
 
+; Built with clang (then manually running -mem2reg with opt) from the following source:
+; static void f1(int, ...) {
+; }
+;
+; void f2() {
+;   f1(1);
+; }
+
+; Test both varargs removal and removal of a traditional dead arg together, to
+; test both the basic functionality and a particular wrinkle: the
+; function->debug info mapping must be updated after the first removal so that
+; it is still accurate when it is used again for the next removal.
+
+; CHECK: void ()* @_ZL2f1iz, {{.*}} ; [ DW_TAG_subprogram ] {{.*}} [f1]
+
 ; Check that debug info metadata for subprograms stores pointers to
 ; updated LLVM functions.
 
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@x = global i32 0, align 4
-
-define void @_Z3runv() uwtable {
+; Function Attrs: uwtable
+define void @_Z2f2v() #0 {
 entry:
-  call void @_ZN12_GLOBAL__N_18dead_argEPv(i8* null), !dbg !10
-  call void (...)* @_ZN12_GLOBAL__N_111dead_varargEz(), !dbg !12
-  ret void, !dbg !13
-}
-
-; Argument will be deleted
-define internal void @_ZN12_GLOBAL__N_18dead_argEPv(i8* %foo) nounwind uwtable {
-entry:
-  %0 = load i32* @x, align 4, !dbg !14
-  %inc = add nsw i32 %0, 1, !dbg !14
-  store i32 %inc, i32* @x, align 4, !dbg !14
+  call void (i32, ...)* @_ZL2f1iz(i32 1), !dbg !15
   ret void, !dbg !16
 }
 
-; Vararg will be deleted
-define internal void @_ZN12_GLOBAL__N_111dead_varargEz(...) nounwind uwtable {
+; Function Attrs: nounwind uwtable
+define internal void @_ZL2f1iz(i32, ...) #1 {
 entry:
-  %0 = load i32* @x, align 4, !dbg !17
-  %inc = add nsw i32 %0, 1, !dbg !17
-  store i32 %inc, i32* @x, align 4, !dbg !17
-  ret void, !dbg !19
+  call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !17, metadata !18), !dbg !19
+  ret void, !dbg !20
 }
 
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!21}
+!llvm.module.flags = !{!12, !13}
+!llvm.ident = !{!14}
 
-!0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.2 (trunk 165305)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/samsonov/tmp/clang-di/test.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5, metadata !8, metadata !9}
-!5 = metadata !{i32 786478, metadata !20, metadata !6, metadata !"run", metadata !"run", metadata !"", i32 8, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [run]
-!6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !1, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{i32 786478, metadata !20, metadata !6, metadata !"dead_vararg", metadata !"dead_vararg", metadata !"", i32 5, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (...)* @_ZN12_GLOBAL__N_111dead_varargEz, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [dead_vararg]
-
-; CHECK: metadata !"dead_vararg"{{.*}}void ()* @_ZN12_GLOBAL__N_111dead_varargEz
-
-!9 = metadata !{i32 786478, metadata !20, metadata !6, metadata !"dead_arg", metadata !"dead_arg", metadata !"", i32 4, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*)* @_ZN12_GLOBAL__N_18dead_argEPv, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [dead_arg]
-
-; CHECK: metadata !"dead_arg"{{.*}}void ()* @_ZN12_GLOBAL__N_18dead_argEPv
-
-!10 = metadata !{i32 8, i32 14, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !20, metadata !5, i32 8, i32 12, i32 0} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
-!12 = metadata !{i32 8, i32 27, metadata !11, null}
-!13 = metadata !{i32 8, i32 42, metadata !11, null}
-!14 = metadata !{i32 4, i32 28, metadata !15, null}
-!15 = metadata !{i32 786443, metadata !20, metadata !9, i32 4, i32 26, i32 2} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
-!16 = metadata !{i32 4, i32 33, metadata !15, null}
-!17 = metadata !{i32 5, i32 25, metadata !18, null}
-!18 = metadata !{i32 786443, metadata !20, metadata !8, i32 5, i32 23, i32 1} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
-!19 = metadata !{i32 5, i32 30, metadata !18, null}
-!20 = metadata !{metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!0 = metadata !{metadata !"0x11\004\00clang version 3.6.0 \000\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"dbg.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !8}
+!4 = metadata !{metadata !"0x2e\00f2\00f2\00_Z2f2v\004\000\001\000\000\00256\000\004", metadata !1, metadata !5, metadata !6, null, void ()* @_Z2f2v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 4] [def] [f2]
+!5 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null}
+!8 = metadata !{metadata !"0x2e\00f1\00f1\00_ZL2f1iz\001\001\001\000\000\00256\000\001", metadata !1, metadata !5, metadata !9, null, void (i32, ...)* @_ZL2f1iz, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [local] [def] [f1]
+!9 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{null, metadata !11, null}
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!13 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!14 = metadata !{metadata !"clang version 3.6.0 "}
+!15 = metadata !{i32 5, i32 3, metadata !4, null}
+!16 = metadata !{i32 6, i32 1, metadata !4, null}
+!17 = metadata !{metadata !"0x101\00\0016777217\000", metadata !8, metadata !5, metadata !11} ; [ DW_TAG_arg_variable ] [line 1]
+!18 = metadata !{metadata !"0x102"}               ; [ DW_TAG_expression ]
+!19 = metadata !{i32 1, i32 19, metadata !8, null}
+!20 = metadata !{i32 2, i32 1, metadata !8, null}

diff --git a/test/Transforms/DeadArgElim/dead_vaargs.ll b/test/Transforms/DeadArgElim/dead_vaargs.ll
index db3135c..c8189c6 100644
--- a/test/Transforms/DeadArgElim/dead_vaargs.ll
+++ b/test/Transforms/DeadArgElim/dead_vaargs.ll

@@ -1,12 +1,36 @@
-; RUN: opt < %s -deadargelim -S | not grep 47 
-; RUN: opt < %s -deadargelim -S | not grep 1.0
+; RUN: opt < %s -deadargelim -S | FileCheck %s
 
 define i32 @bar(i32 %A) {
-        %tmp4 = tail call i32 (i32, ...)* @foo( i32 %A, i32 %A, i32 %A, i32 %A, i64 47, double 1.000000e+00 )   ; <i32> [#uses=1]
-        ret i32 %tmp4
+  call void (i32, ...)* @thunk(i32 %A, i64 47, double 1.000000e+00)
+  %a = call i32 (i32, ...)* @has_vastart(i32 %A, i64 47, double 1.000000e+00)
+  %b = call i32 (i32, ...)* @no_vastart( i32 %A, i32 %A, i32 %A, i32 %A, i64 47, double 1.000000e+00 )
+  %c = add i32 %a, %b
+  ret i32 %c
 }
+; CHECK-LABEL: define i32 @bar
+; CHECK: call void (i32, ...)* @thunk(i32 %A, i64 47, double 1.000000e+00)
+; CHECK: call i32 (i32, ...)* @has_vastart(i32 %A, i64 47, double 1.000000e+00)
+; CHECK: call i32 @no_vastart(i32 %A)
 
-define internal i32 @foo(i32 %X, ...) {
-        ret i32 %X
+declare void @thunk_target(i32 %X, ...)
+
+define internal void @thunk(i32 %X, ...) {
+  musttail call void(i32, ...)* @thunk_target(i32 %X, ...)
+  ret void
 }
+; CHECK-LABEL: define internal void @thunk(i32 %X, ...)
+; CHECK: musttail call void (i32, ...)* @thunk_target(i32 %X, ...)
 
+define internal i32 @has_vastart(i32 %X, ...) {
+  %valist = alloca i8
+  call void @llvm.va_start(i8* %valist)
+  ret i32 %X
+}
+; CHECK-LABEL: define internal i32 @has_vastart(i32 %X, ...)
+
+declare void @llvm.va_start(i8*)
+
+define internal i32 @no_vastart(i32 %X, ...) {
+  ret i32 %X
+}
+; CHECK-LABEL: define internal i32 @no_vastart(i32 %X)

diff --git a/test/Transforms/DeadStoreElimination/PartialStore.ll b/test/Transforms/DeadStoreElimination/PartialStore.ll
index 4799ef3..80c2bfa 100644
--- a/test/Transforms/DeadStoreElimination/PartialStore.ll
+++ b/test/Transforms/DeadStoreElimination/PartialStore.ll

@@ -45,9 +45,9 @@
 
   store i8 19, i8* %P  ;; dead
   %A = getelementptr i8* %P, i32 3
-  
+
   store i8 42, i8* %A  ;; dead
-  
+
   %Q = bitcast i8* %P to double*
   store double 0.0, double* %Q
   ret void
@@ -61,7 +61,7 @@
   %C = getelementptr i8* %B, i32 %i
   store i8 10, i8* %C        ;; Dead store to variable index.
   store i32 20, i32* %A
-  
+
   call void @test5a(i32* %A)
   ret void
 ; CHECK-LABEL: @test5(
@@ -69,3 +69,19 @@
 ; CHECK-NEXT: store i32 20
 ; CHECK-NEXT: call void @test5a
 }
+
+declare void @test5a_as1(i32*)
+define void @test5_addrspacecast(i32 %i) nounwind ssp {
+  %A = alloca i32
+  %B = addrspacecast i32* %A to i8 addrspace(1)*
+  %C = getelementptr i8 addrspace(1)* %B, i32 %i
+  store i8 10, i8 addrspace(1)* %C        ;; Dead store to variable index.
+  store i32 20, i32* %A
+
+  call void @test5a(i32* %A)
+  ret void
+; CHECK-LABEL: @test5_addrspacecast(
+; CHECK-NEXT: alloca
+; CHECK-NEXT: store i32 20
+; CHECK-NEXT: call void @test5a
+}

diff --git a/test/Transforms/DeadStoreElimination/atomic.ll b/test/Transforms/DeadStoreElimination/atomic.ll
index 2e84298..af303fa 100644
--- a/test/Transforms/DeadStoreElimination/atomic.ll
+++ b/test/Transforms/DeadStoreElimination/atomic.ll

@@ -5,7 +5,7 @@
 
 ; Sanity tests for atomic stores.
 ; Note that it turns out essentially every transformation DSE does is legal on
-; atomic ops, just some transformations are not allowed across them. 
+; atomic ops, just some transformations are not allowed across release-acquire pairs.
 
 @x = common global i32 0, align 4
 @y = common global i32 0, align 4
@@ -13,35 +13,32 @@
 declare void @randomop(i32*)
 
 ; DSE across unordered store (allowed)
-define void @test1()  nounwind uwtable ssp {
-; CHECK: test1
+define void @test1() {
+; CHECK-LABEL: test1
 ; CHECK-NOT: store i32 0
 ; CHECK: store i32 1
-entry:
   store i32 0, i32* @x
   store atomic i32 0, i32* @y unordered, align 4
   store i32 1, i32* @x
   ret void
 }
 
-; DSE across seq_cst load (allowed in theory; not implemented ATM)
-define i32 @test2()  nounwind uwtable ssp {
-; CHECK: test2
-; CHECK: store i32 0
+; DSE across seq_cst load (allowed)
+define i32 @test2() {
+; CHECK-LABEL: test2
+; CHECK-NOT: store i32 0
 ; CHECK: store i32 1
-entry:
   store i32 0, i32* @x
   %x = load atomic i32* @y seq_cst, align 4
   store i32 1, i32* @x
   ret i32 %x
 }
 
-; DSE across seq_cst store (store before atomic store must not be removed)
-define void @test3()  nounwind uwtable ssp {
-; CHECK: test3
-; CHECK: store i32
+; DSE across seq_cst store (allowed)
+define void @test3() {
+; CHECK-LABEL: test3
+; CHECK-NOT: store i32 0
 ; CHECK: store atomic i32 2
-entry:
   store i32 0, i32* @x
   store atomic i32 2, i32* @y seq_cst, align 4
   store i32 1, i32* @x
@@ -49,32 +46,29 @@
 }
 
 ; DSE remove unordered store (allowed)
-define void @test4()  nounwind uwtable ssp {
-; CHECK: test4
+define void @test4() {
+; CHECK-LABEL: test4
 ; CHECK-NOT: store atomic
 ; CHECK: store i32 1
-entry:
   store atomic i32 0, i32* @x unordered, align 4
   store i32 1, i32* @x
   ret void
 }
 
 ; DSE unordered store overwriting non-atomic store (allowed)
-define void @test5()  nounwind uwtable ssp {
-; CHECK: test5
+define void @test5() {
+; CHECK-LABEL: test5
 ; CHECK: store atomic i32 1
-entry:
   store i32 0, i32* @x
   store atomic i32 1, i32* @x unordered, align 4
   ret void
 }
 
 ; DSE no-op unordered atomic store (allowed)
-define void @test6()  nounwind uwtable ssp {
-; CHECK: test6
+define void @test6() {
+; CHECK-LABEL: test6
 ; CHECK-NOT: store
 ; CHECK: ret void
-entry:
   %x = load atomic i32* @x unordered, align 4
   store atomic i32 %x, i32* @x unordered, align 4
   ret void
@@ -82,10 +76,9 @@
 
 ; DSE seq_cst store (be conservative; DSE doesn't have infrastructure
 ; to reason about atomic operations).
-define void @test7()  nounwind uwtable ssp {
-; CHECK: test7
-; CHECK: store atomic 
-entry:
+define void @test7() {
+; CHECK-LABEL: test7
+; CHECK: store atomic
   %a = alloca i32
   store atomic i32 0, i32* %a seq_cst, align 4
   ret void
@@ -93,11 +86,10 @@
 
 ; DSE and seq_cst load (be conservative; DSE doesn't have infrastructure
 ; to reason about atomic operations).
-define i32 @test8()  nounwind uwtable ssp {
-; CHECK: test8
+define i32 @test8() {
+; CHECK-LABEL: test8
 ; CHECK: store
-; CHECK: load atomic 
-entry:
+; CHECK: load atomic
   %a = alloca i32
   call void @randomop(i32* %a)
   store i32 0, i32* %a, align 4
@@ -105,3 +97,82 @@
   ret i32 %x
 }
 
+; DSE across monotonic load (allowed as long as the eliminated store isUnordered)
+define i32 @test9() {
+; CHECK-LABEL: test9
+; CHECK-NOT: store i32 0
+; CHECK: store i32 1
+  store i32 0, i32* @x
+  %x = load atomic i32* @y monotonic, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}
+
+; DSE across monotonic store (allowed as long as the eliminated store isUnordered)
+define void @test10() {
+; CHECK-LABEL: test10
+; CHECK-NOT: store i32 0
+; CHECK: store i32 1
+  store i32 0, i32* @x
+  store atomic i32 42, i32* @y monotonic, align 4
+  store i32 1, i32* @x
+  ret void
+}
+
+; DSE across monotonic load (forbidden since the eliminated store is monotonic, i.e. not unordered)
+define i32 @test11() {
+; CHECK-LABEL: test11
+; CHECK: store atomic i32 0
+; CHECK: store atomic i32 1
+  store atomic i32 0, i32* @x monotonic, align 4
+  %x = load atomic i32* @y monotonic, align 4
+  store atomic i32 1, i32* @x monotonic, align 4
+  ret i32 %x
+}
+
+; DSE across monotonic store (forbidden since the eliminated store is monotonic, i.e. not unordered)
+define void @test12() {
+; CHECK-LABEL: test12
+; CHECK: store atomic i32 0
+; CHECK: store atomic i32 1
+  store atomic i32 0, i32* @x monotonic, align 4
+  store atomic i32 42, i32* @y monotonic, align 4
+  store atomic i32 1, i32* @x monotonic, align 4
+  ret void
+}
+
+; DSE is allowed across an atomic load followed by an atomic store.
+define i32 @test13() {
+; CHECK-LABEL: test13
+; CHECK-NOT: store i32 0
+; CHECK: store i32 1
+  store i32 0, i32* @x
+  %x = load atomic i32* @y seq_cst, align 4
+  store atomic i32 %x, i32* @y seq_cst, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}
+
+; Same if it is acquire-release instead of seq_cst/seq_cst
+define i32 @test14() {
+; CHECK-LABEL: test14
+; CHECK-NOT: store i32 0
+; CHECK: store i32 1
+  store i32 0, i32* @x
+  %x = load atomic i32* @y acquire, align 4
+  store atomic i32 %x, i32* @y release, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}
+
+; But DSE is not allowed across a release-acquire pair.
+define i32 @test15() {
+; CHECK-LABEL: test15
+; CHECK: store i32 0
+; CHECK: store i32 1
+  store i32 0, i32* @x
+  store atomic i32 0, i32* @y release, align 4
+  %x = load atomic i32* @y acquire, align 4
+  store i32 1, i32* @x
+  ret i32 %x
+}

diff --git a/test/Transforms/DeadStoreElimination/const-pointers.ll b/test/Transforms/DeadStoreElimination/const-pointers.ll
index c90d824..3e772d7 100644
--- a/test/Transforms/DeadStoreElimination/const-pointers.ll
+++ b/test/Transforms/DeadStoreElimination/const-pointers.ll

@@ -1,4 +1,5 @@
 ; RUN: opt -basicaa -dse -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 %t = type { i32 }
 

diff --git a/test/Transforms/DeadStoreElimination/cs-cs-aliasing.ll b/test/Transforms/DeadStoreElimination/cs-cs-aliasing.ll
new file mode 100644
index 0000000..8953f9c
--- /dev/null
+++ b/test/Transforms/DeadStoreElimination/cs-cs-aliasing.ll

@@ -0,0 +1,74 @@
+; RUN: opt -basicaa -dse -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%class.basic_string = type { %"class.__gnu_cxx::__versa_string" }
+%"class.__gnu_cxx::__versa_string" = type { %"class.__gnu_cxx::__sso_string_base" }
+%"class.__gnu_cxx::__sso_string_base" = type { %"struct.__gnu_cxx::__vstring_utility<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider", i64, %union.anon }
+%"struct.__gnu_cxx::__vstring_utility<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* }
+%union.anon = type { i64, [8 x i8] }
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #0
+
+; Function Attrs: noinline nounwind readonly uwtable
+declare zeroext i1 @callee_takes_string(%class.basic_string* nonnull) #1 align 2
+
+; Function Attrs: nounwind uwtable
+define weak_odr zeroext i1 @test() #2 align 2 {
+
+; CHECK-LABEL: @test
+
+bb:
+  %tmp = alloca %class.basic_string, align 8
+  %tmp1 = alloca %class.basic_string, align 8
+  %tmp3 = getelementptr inbounds %class.basic_string* %tmp, i64 0, i32 0, i32 0, i32 2
+  %tmp4 = bitcast %union.anon* %tmp3 to i8*
+  %tmp5 = getelementptr inbounds %class.basic_string* %tmp, i64 0, i32 0, i32 0, i32 0, i32 0
+  %tmp6 = getelementptr inbounds %class.basic_string* %tmp, i64 0, i32 0, i32 0, i32 1
+  %tmp7 = getelementptr inbounds i8* %tmp4, i64 1
+  %tmp8 = bitcast %class.basic_string* %tmp to i8*
+  %tmp9 = bitcast i64 0 to i64
+  %tmp10 = getelementptr inbounds %class.basic_string* %tmp1, i64 0, i32 0, i32 0, i32 2
+  %tmp11 = bitcast %union.anon* %tmp10 to i8*
+  %tmp12 = getelementptr inbounds %class.basic_string* %tmp1, i64 0, i32 0, i32 0, i32 0, i32 0
+  %tmp13 = getelementptr inbounds %class.basic_string* %tmp1, i64 0, i32 0, i32 0, i32 1
+  %tmp14 = getelementptr inbounds i8* %tmp11, i64 1
+  %tmp15 = bitcast %class.basic_string* %tmp1 to i8*
+  br label %_ZN12basic_stringIcSt11char_traitsIcESaIcEEC2EPKcRKS2_.exit
+
+_ZN12basic_stringIcSt11char_traitsIcESaIcEEC2EPKcRKS2_.exit: ; preds = %bb
+  store i8* %tmp4, i8** %tmp5, align 8
+  store i8 62, i8* %tmp4, align 8
+  store i64 1, i64* %tmp6, align 8
+  store i8 0, i8* %tmp7, align 1
+  %tmp16 = call zeroext i1 @callee_takes_string(%class.basic_string* nonnull %tmp)
+  br label %_ZN9__gnu_cxx17__sso_string_baseIcSt11char_traitsIcESaIcEED2Ev.exit3
+
+_ZN9__gnu_cxx17__sso_string_baseIcSt11char_traitsIcESaIcEED2Ev.exit3: ; preds = %_ZN12basic_stringIcSt11char_traitsIcESaIcEEC2EPKcRKS2_.exit
+
+; CHECK: _ZN9__gnu_cxx17__sso_string_baseIcSt11char_traitsIcESaIcEED2Ev.exit3:
+
+; The following stores can be read through the call %tmp17, so they must be kept:
+  store i8* %tmp11, i8** %tmp12, align 8
+  store i8 125, i8* %tmp11, align 8
+  store i64 1, i64* %tmp13, align 8
+  store i8 0, i8* %tmp14, align 1
+
+; CHECK: store i8* %tmp11, i8** %tmp12, align 8
+; CHECK: store i8 125, i8* %tmp11, align 8
+; CHECK: store i64 1, i64* %tmp13, align 8
+; CHECK: store i8 0, i8* %tmp14, align 1
+
+  %tmp17 = call zeroext i1 @callee_takes_string(%class.basic_string* nonnull %tmp1)
+  call void @llvm.memset.p0i8.i64(i8* %tmp11, i8 -51, i64 16, i32 8, i1 false) #0
+  call void @llvm.memset.p0i8.i64(i8* %tmp15, i8 -51, i64 32, i32 8, i1 false) #0
+  call void @llvm.memset.p0i8.i64(i8* %tmp4, i8 -51, i64 16, i32 8, i1 false) #0
+  call void @llvm.memset.p0i8.i64(i8* %tmp8, i8 -51, i64 32, i32 8, i1 false) #0
+  ret i1 %tmp17
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { noinline nounwind readonly uwtable }
+attributes #2 = { nounwind uwtable }
+

diff --git a/test/Transforms/DeadStoreElimination/inst-limits.ll b/test/Transforms/DeadStoreElimination/inst-limits.ll
index 9df8801..3d78bb5 100644
--- a/test/Transforms/DeadStoreElimination/inst-limits.ll
+++ b/test/Transforms/DeadStoreElimination/inst-limits.ll

@@ -1,4 +1,5 @@
 ; RUN: opt -S -dse < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; If there are two stores to the same location, DSE should be able to remove
 ; the first store if the two stores are separated by no more than 98
@@ -117,7 +118,7 @@
 
   ; Insert a meaningless dbg.value intrinsic; it should have no
   ; effect on the working of DSE in any way.
-  call void @llvm.dbg.value(metadata !12, i64 0, metadata !10)
+  call void @llvm.dbg.value(metadata !12, i64 0, metadata !10, metadata !{})
 
   ; CHECK:  store i32 -1, i32* @x, align 4
   store i32 -1, i32* @x, align 4
@@ -239,23 +240,23 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!11, !13}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/tmp/test.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2} ; [ DW_TAG_compile_unit ] [/home/tmp/test.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"test.c", metadata !"/home/tmp"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_within_limit", metadata !"test_within_limit", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @test_within_limit, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [test]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/tmp/test.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00test_within_limit\00test_within_limit\00\003\000\001\000\006\00256\000\004", metadata !1, metadata !5, metadata !6, null, i32 ()* @test_within_limit, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [test]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/tmp/test.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
-!10 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !5, i32 1, metadata !8, i32 0, i32 1, i32* @x, null} ; [ DW_TAG_variable ] [x] [line 1] [def]
+!10 = metadata !{metadata !"0x34\00x\00x\00\001\000\001", null, metadata !5, metadata !8, i32* @x, null} ; [ DW_TAG_variable ] [x] [line 1] [def]
 !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !12 = metadata !{i32* undef}
 
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/DeadStoreElimination/no-targetdata.ll b/test/Transforms/DeadStoreElimination/no-targetdata.ll
index c0c7c58..2539533 100644
--- a/test/Transforms/DeadStoreElimination/no-targetdata.ll
+++ b/test/Transforms/DeadStoreElimination/no-targetdata.ll

@@ -1,15 +1,21 @@
 ; RUN: opt -basicaa -dse -S < %s | FileCheck %s
 
-declare void @test1f()
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
 
-define void @test1(i32* noalias %p) {
-       store i32 1, i32* %p
-       call void @test1f()
-       store i32 2, i32 *%p
-       ret void
-; CHECK-LABEL: define void @test1(
-; CHECK-NOT: store
-; CHECK-NEXT: call void
-; CHECK-NEXT: store i32 2
-; CHECK-NEXT: ret void
+define void @fn(i8* nocapture %buf) #0 {
+entry:
+
+; We would not eliminate the first memcpy with data layout, and we should not
+; eliminate it without data layout.
+; CHECK-LABEL: @fn
+; CHECK: tail call void @llvm.memcpy.p0i8.p0i8.i64
+; CHECK: tail call void @llvm.memcpy.p0i8.p0i8.i64
+; CHECK: ret void
+
+  %arrayidx = getelementptr i8* %buf, i64 18
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arrayidx, i8* %buf, i64 18, i32 1, i1 false)
+  store i8 1, i8* %arrayidx, align 1
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %buf, i8* %arrayidx, i64 18, i32 1, i1 false)
+  ret void
 }
+

diff --git a/test/Transforms/DeadStoreElimination/simple.ll b/test/Transforms/DeadStoreElimination/simple.ll
index cdfe226..1e81385 100644
--- a/test/Transforms/DeadStoreElimination/simple.ll
+++ b/test/Transforms/DeadStoreElimination/simple.ll

@@ -172,6 +172,23 @@
 ; CHECK-NEXT: call void
 }
 
+define i32 addrspace(1)* @test13_addrspacecast() {
+  %p = tail call i8* @malloc(i32 4)
+  %p.bc = bitcast i8* %p to i32*
+  %P = addrspacecast i32* %p.bc to i32 addrspace(1)*
+  %DEAD = load i32 addrspace(1)* %P
+  %DEAD2 = add i32 %DEAD, 1
+  store i32 %DEAD2, i32 addrspace(1)* %P
+  call void @test13f( )
+  store i32 0, i32 addrspace(1)* %P
+  ret i32 addrspace(1)* %P
+; CHECK: @test13_addrspacecast()
+; CHECK-NEXT: malloc
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: addrspacecast
+; CHECK-NEXT: call void
+}
+
 declare noalias i8* @malloc(i32)
 declare noalias i8* @calloc(i32, i32)
 

diff --git a/test/Transforms/DebugIR/simple-addrspace.ll b/test/Transforms/DebugIR/simple-addrspace.ll
index 6bea9b2..6539c8a 100644
--- a/test/Transforms/DebugIR/simple-addrspace.ll
+++ b/test/Transforms/DebugIR/simple-addrspace.ll

@@ -8,6 +8,4 @@
 
 ; Make sure the pointer size is 16
 
-; CHECK: metadata !"i32 addrspace(1)*", i32 0, i64 16, i64 2, i64 0, i32 0
-
-
+; CHECK: metadata !"0xf\00i32 addrspace(1)*\000\0016\002\000\000"

diff --git a/test/Transforms/EarlyCSE/basic.ll b/test/Transforms/EarlyCSE/basic.ll
index 80704df..155d36f 100644
--- a/test/Transforms/EarlyCSE/basic.ll
+++ b/test/Transforms/EarlyCSE/basic.ll

@@ -1,5 +1,6 @@
 ; RUN: opt < %s -S -early-cse | FileCheck %s
 
+declare void @llvm.assume(i1) nounwind
 
 ; CHECK-LABEL: @test1(
 define void @test1(i8 %V, i32 *%P) {
@@ -42,6 +43,16 @@
   ; CHECK: ret i32 0
 }
 
+; CHECK-LABEL: @test2a(
+define i32 @test2a(i32 *%P, i1 %b) {
+  %V1 = load i32* %P
+  tail call void @llvm.assume(i1 %b)
+  %V2 = load i32* %P
+  %Diff = sub i32 %V1, %V2
+  ret i32 %Diff
+  ; CHECK: ret i32 0
+}
+
 ;; Cross block load value numbering.
 ; CHECK-LABEL: @test3(
 define i32 @test3(i32 *%P, i1 %Cond) {
@@ -58,6 +69,22 @@
   ; CHECK: ret i32 0
 }
 
+; CHECK-LABEL: @test3a(
+define i32 @test3a(i32 *%P, i1 %Cond, i1 %b) {
+  %V1 = load i32* %P
+  br i1 %Cond, label %T, label %F
+T:
+  store i32 4, i32* %P
+  ret i32 42
+F:
+  tail call void @llvm.assume(i1 %b)
+  %V2 = load i32* %P
+  %Diff = sub i32 %V1, %V2
+  ret i32 %Diff
+  ; CHECK: F:
+  ; CHECK: ret i32 0
+}
+
 ;; Cross block load value numbering stops when stores happen.
 ; CHECK-LABEL: @test4(
 define i32 @test4(i32 *%P, i1 %Cond) {
@@ -97,6 +124,15 @@
   ; CHECK: ret i32 42
 }
 
+; CHECK-LABEL: @test6a(
+define i32 @test6a(i32 *%P, i1 %b) {
+  store i32 42, i32* %P
+  tail call void @llvm.assume(i1 %b)
+  %V1 = load i32* %P
+  ret i32 %V1
+  ; CHECK: ret i32 42
+}
+
 ;; Trivial dead store elimination.
 ; CHECK-LABEL: @test7(
 define void @test7(i32 *%P) {
@@ -118,4 +154,42 @@
   ; CHECK: ret i32 0
 }
 
+;; Trivial DSE can't be performed across a readonly call.  The call
+;; can observe the earlier write.
+; CHECK-LABEL: @test9(
+define i32 @test9(i32 *%P) {
+  store i32 4, i32* %P
+  %V1 = call i32 @func(i32* %P) readonly
+  store i32 5, i32* %P        
+  ret i32 %V1
+  ; CHECK: store i32 4, i32* %P        
+  ; CHECK-NEXT: %V1 = call i32 @func(i32* %P)
+  ; CHECK-NEXT: store i32 5, i32* %P        
+  ; CHECK-NEXT: ret i32 %V1
+}
+
+;; Trivial DSE can be performed across a readnone call.
+; CHECK-LABEL: @test10
+define i32 @test10(i32 *%P) {
+  store i32 4, i32* %P
+  %V1 = call i32 @func(i32* %P) readnone
+  store i32 5, i32* %P        
+  ret i32 %V1
+  ; CHECK-NEXT: %V1 = call i32 @func(i32* %P)
+  ; CHECK-NEXT: store i32 5, i32* %P        
+  ; CHECK-NEXT: ret i32 %V1
+}
+
+;; Trivial dead store elimination - should work for an entire series of dead stores too.
+; CHECK-LABEL: @test11(
+define void @test11(i32 *%P) {
+  store i32 42, i32* %P
+  store i32 43, i32* %P
+  store i32 44, i32* %P
+  store i32 45, i32* %P
+  ret void
+  ; CHECK-NEXT: store i32 45
+  ; CHECK-NEXT: ret void
+}
+
 

diff --git a/test/Transforms/FunctionAttrs/optnone-simple.ll b/test/Transforms/FunctionAttrs/optnone-simple.ll
new file mode 100644
index 0000000..9d0f8e3
--- /dev/null
+++ b/test/Transforms/FunctionAttrs/optnone-simple.ll

@@ -0,0 +1,135 @@
+; RUN: opt -O3 -S < %s | FileCheck %s
+; Show 'optnone' suppresses optimizations.
+
+; Two attribute groups that differ only by 'optnone'.
+; 'optnone' requires 'noinline' so #0 is 'noinline' by itself,
+; even though it would otherwise be irrelevant to this example.
+attributes #0 = { noinline }
+attributes #1 = { noinline optnone }
+
+; int iadd(int a, int b){ return a + b; }
+
+define i32 @iadd_optimize(i32 %a, i32 %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 %b, i32* %b.addr, align 4
+  %0 = load i32* %a.addr, align 4
+  %1 = load i32* %b.addr, align 4
+  %add = add nsw i32 %0, %1
+  ret i32 %add
+}
+
+; CHECK-LABEL: @iadd_optimize
+; CHECK-NOT: alloca
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: ret
+
+define i32 @iadd_optnone(i32 %a, i32 %b) #1 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 %b, i32* %b.addr, align 4
+  %0 = load i32* %a.addr, align 4
+  %1 = load i32* %b.addr, align 4
+  %add = add nsw i32 %0, %1
+  ret i32 %add
+}
+
+; CHECK-LABEL: @iadd_optnone
+; CHECK: alloca i32
+; CHECK: alloca i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: add nsw i32
+; CHECK: ret i32
+
+; float fsub(float a, float b){ return a - b; }
+
+define float @fsub_optimize(float %a, float %b) #0 {
+entry:
+  %a.addr = alloca float, align 4
+  %b.addr = alloca float, align 4
+  store float %a, float* %a.addr, align 4
+  store float %b, float* %b.addr, align 4
+  %0 = load float* %a.addr, align 4
+  %1 = load float* %b.addr, align 4
+  %sub = fsub float %0, %1
+  ret float %sub
+}
+
+; CHECK-LABEL: @fsub_optimize
+; CHECK-NOT: alloca
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: ret
+
+define float @fsub_optnone(float %a, float %b) #1 {
+entry:
+  %a.addr = alloca float, align 4
+  %b.addr = alloca float, align 4
+  store float %a, float* %a.addr, align 4
+  store float %b, float* %b.addr, align 4
+  %0 = load float* %a.addr, align 4
+  %1 = load float* %b.addr, align 4
+  %sub = fsub float %0, %1
+  ret float %sub
+}
+
+; CHECK-LABEL: @fsub_optnone
+; CHECK: alloca float
+; CHECK: alloca float
+; CHECK: store float
+; CHECK: store float
+; CHECK: load float
+; CHECK: load float
+; CHECK: fsub float
+; CHECK: ret float
+
+; typedef float __attribute__((ext_vector_type(4))) float4;
+; float4 vmul(float4 a, float4 b){ return a * b; }
+
+define <4 x float> @vmul_optimize(<4 x float> %a, <4 x float> %b) #0 {
+entry:
+  %a.addr = alloca <4 x float>, align 16
+  %b.addr = alloca <4 x float>, align 16
+  store <4 x float> %a, <4 x float>* %a.addr, align 16
+  store <4 x float> %b, <4 x float>* %b.addr, align 16
+  %0 = load <4 x float>* %a.addr, align 16
+  %1 = load <4 x float>* %b.addr, align 16
+  %mul = fmul <4 x float> %0, %1
+  ret <4 x float> %mul
+}
+
+; CHECK-LABEL: @vmul_optimize
+; CHECK-NOT: alloca
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: ret
+
+define <4 x float> @vmul_optnone(<4 x float> %a, <4 x float> %b) #1 {
+entry:
+  %a.addr = alloca <4 x float>, align 16
+  %b.addr = alloca <4 x float>, align 16
+  store <4 x float> %a, <4 x float>* %a.addr, align 16
+  store <4 x float> %b, <4 x float>* %b.addr, align 16
+  %0 = load <4 x float>* %a.addr, align 16
+  %1 = load <4 x float>* %b.addr, align 16
+  %mul = fmul <4 x float> %0, %1
+  ret <4 x float> %mul
+}
+
+; CHECK-LABEL: @vmul_optnone
+; CHECK: alloca <4 x float>
+; CHECK: alloca <4 x float>
+; CHECK: store <4 x float>
+; CHECK: store <4 x float>
+; CHECK: load <4 x float>
+; CHECK: load <4 x float>
+; CHECK: fmul <4 x float>
+; CHECK: ret

diff --git a/test/Transforms/FunctionAttrs/optnone.ll b/test/Transforms/FunctionAttrs/optnone.ll
new file mode 100644
index 0000000..7694bfe
--- /dev/null
+++ b/test/Transforms/FunctionAttrs/optnone.ll

@@ -0,0 +1,24 @@
+; RUN: opt < %s -functionattrs -S | FileCheck %s
+
+@x = global i32 0
+
+define void @test_opt(i8* %p) {
+; CHECK-LABEL: @test_opt
+; CHECK: (i8* nocapture readnone %p) #0 {
+  ret void
+}
+
+define void @test_optnone(i8* %p) noinline optnone {
+; CHECK-LABEL: @test_optnone
+; CHECK: (i8* %p) #1 {
+  ret void
+}
+
+declare i8 @strlen(i8*) noinline optnone
+; CHECK-LABEL: @strlen
+; CHECK: (i8*) #1
+
+; CHECK-LABEL: attributes #0
+; CHECK: = { readnone }
+; CHECK-LABEL: attributes #1
+; CHECK: = { noinline optnone }

diff --git a/test/Transforms/GCOVProfiling/function-numbering.ll b/test/Transforms/GCOVProfiling/function-numbering.ll
new file mode 100644
index 0000000..2480820
--- /dev/null
+++ b/test/Transforms/GCOVProfiling/function-numbering.ll

@@ -0,0 +1,56 @@
+; Test that GCOV instrumentation numbers functions correctly when some
+; functions aren't emitted.
+
+; Inject metadata to set the .gcno file location
+; RUN: echo '!14 = metadata !{metadata !"%/T/function-numbering.ll", metadata !0}' > %t1
+; RUN: cat %s %t1 > %t2
+
+; RUN: opt -insert-gcov-profiling -S < %t2 | FileCheck --check-prefix GCDA %s
+; RUN: llvm-cov -n -dump %T/function-numbering.gcno 2>&1 | FileCheck --check-prefix GCNO %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; GCDA: @[[FOO:[0-9]+]] = private unnamed_addr constant [4 x i8] c"foo\00"
+; GCDA-NOT: @{{[0-9]+}} = private unnamed_addr constant .* c"bar\00"
+; GCDA: @[[BAZ:[0-9]+]] = private unnamed_addr constant [4 x i8] c"baz\00"
+; GCDA: define internal void @__llvm_gcov_writeout()
+; GCDA: call void @llvm_gcda_emit_function(i32 0, i8* getelementptr inbounds ([4 x i8]* @[[FOO]]
+; GCDA: call void @llvm_gcda_emit_function(i32 1, i8* getelementptr inbounds ([4 x i8]* @[[BAZ]]
+
+; GCNO: == foo (0) @
+; GCNO-NOT: == bar ({{[0-9]+}}) @
+; GCNO: == baz (1) @
+
+define void @foo() {
+  ret void, !dbg !12
+}
+
+define void @bar() {
+  ; This function is referenced by the debug info, but no lines have locations.
+  ret void
+}
+
+define void @baz() {
+  ret void, !dbg !13
+}
+
+!llvm.gcov = !{!14}
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0 \000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [function-numbering.ll] [DW_LANG_C99]
+!1 = metadata !{metadata !".../llvm/test/Transforms/GCOVProfiling/function-numbering.ll", metadata !""}
+!2 = metadata !{}
+!3 = metadata !{metadata !4, metadata !7, metadata !8}
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\000\000\000\001", metadata !1, metadata !5, metadata !6, null, void ()* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}    ; [ DW_TAG_file_type ] [/Users/bogner/build/llvm-debug//tmp/foo.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00bar\00bar\00\002\000\001\000\000\000\000\002", metadata !1, metadata !5, metadata !6, null, void ()* @bar, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
+!8 = metadata !{metadata !"0x2e\00baz\00baz\00\003\000\001\000\000\000\000\003", metadata !1, metadata !5, metadata !6, null, void ()* @baz, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [baz]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!11 = metadata !{metadata !"clang version 3.6.0 "}
+!12 = metadata !{i32 1, i32 13, metadata !4, null}
+!13 = metadata !{i32 3, i32 13, metadata !8, null}

diff --git a/test/Transforms/GCOVProfiling/global-ctor.ll b/test/Transforms/GCOVProfiling/global-ctor.ll
index 722a096..1dff3f0 100644
--- a/test/Transforms/GCOVProfiling/global-ctor.ll
+++ b/test/Transforms/GCOVProfiling/global-ctor.ll

@@ -1,11 +1,9 @@
-; RUN: echo '!16 = metadata !{metadata !"%T/global-ctor.ll", metadata !0}' > %t1
+; RUN: echo '!16 = metadata !{metadata !"%/T/global-ctor.ll", metadata !0}' > %t1
 ; RUN: cat %s %t1 > %t2
 ; RUN: opt -insert-gcov-profiling -disable-output < %t2
 ; RUN: not grep '_GLOBAL__sub_I_global-ctor' %T/global-ctor.gcno
 ; RUN: rm %T/global-ctor.gcno
 
-; REQUIRES: shell
-
 @x = global i32 0, align 4
 @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_global-ctor.ll, i8* null }]
 
@@ -40,19 +38,19 @@
 !llvm.gcov = !{!16}
 !llvm.ident = !{!12}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (trunk 210217)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2} ; [ DW_TAG_compile_unit ] [/home/nlewycky/<stdin>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 210217)\000\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/home/nlewycky/<stdin>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<stdin>", metadata !"/home/nlewycky"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !8}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 2, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [__cxx_global_var_init]
+!4 = metadata !{metadata !"0x2e\00__cxx_global_var_init\00__cxx_global_var_init\00\002\001\001\000\006\00256\000\002", metadata !5, metadata !6, metadata !7, null, void ()* @__cxx_global_var_init, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [__cxx_global_var_init]
 !5 = metadata !{metadata !"global-ctor.ll", metadata !"/home/nlewycky"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/home/nlewycky/global-ctor.ll]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"", metadata !"", metadata !"_GLOBAL__sub_I_global-ctor.ll", i32 0, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__sub_I_global-ctor.ll, null, null, metadata !2, i32 0} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
-!9 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/nlewycky/<stdin>]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/home/nlewycky/global-ctor.ll]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !"0x2e\00\00\00_GLOBAL__sub_I_global-ctor.ll\000\001\001\000\006\0064\000\000", metadata !1, metadata !9, metadata !7, null, void ()* @_GLOBAL__sub_I_global-ctor.ll, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 0] [local] [def]
+!9 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/nlewycky/<stdin>]
 !10 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!11 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!11 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !12 = metadata !{metadata !"clang version 3.5.0 (trunk 210217)"}
 !13 = metadata !{i32 2, i32 0, metadata !4, null}
 !14 = metadata !{i32 0, i32 0, metadata !15, null}
-!15 = metadata !{i32 786443, metadata !5, metadata !8} ; [ DW_TAG_lexical_block ] [/home/nlewycky/global-ctor.ll]
+!15 = metadata !{metadata !"0xb\000", metadata !5, metadata !8} ; [ DW_TAG_lexical_block ] [/home/nlewycky/global-ctor.ll]

diff --git a/test/Transforms/GCOVProfiling/linezero.ll b/test/Transforms/GCOVProfiling/linezero.ll
index e2f8324..50e026c 100644
--- a/test/Transforms/GCOVProfiling/linezero.ll
+++ b/test/Transforms/GCOVProfiling/linezero.ll

@@ -1,7 +1,6 @@
-; RUN: sed -e 's@PATTERN@\%T@g' < %s > %t1
+; RUN: sed -e 's|PATTERN|%/T|g' < %s > %t1
 ; RUN: opt -insert-gcov-profiling -disable-output < %t1
 ; RUN: rm %T/linezero.gcno %t1
-; REQUIRES: shell
 
 ; This is a crash test.
 
@@ -20,17 +19,17 @@
   %__begin = alloca i8*, align 8
   %__end = alloca i8*, align 8
   %spec = alloca i8, align 1
-  call void @llvm.dbg.declare(metadata !{%struct.vector** %__range}, metadata !27), !dbg !30
+  call void @llvm.dbg.declare(metadata !{%struct.vector** %__range}, metadata !27, metadata !{}), !dbg !30
   br label %0
 
 ; <label>:0                                       ; preds = %entry
   call void @_Z13TagFieldSpecsv(), !dbg !31
   store %struct.vector* %ref.tmp, %struct.vector** %__range, align 8, !dbg !31
-  call void @llvm.dbg.declare(metadata !{i8** %__begin}, metadata !32), !dbg !30
+  call void @llvm.dbg.declare(metadata !{i8** %__begin}, metadata !32, metadata !{}), !dbg !30
   %1 = load %struct.vector** %__range, align 8, !dbg !31
   %call = call i8* @_ZN6vector5beginEv(%struct.vector* %1), !dbg !31
   store i8* %call, i8** %__begin, align 8, !dbg !31
-  call void @llvm.dbg.declare(metadata !{i8** %__end}, metadata !33), !dbg !30
+  call void @llvm.dbg.declare(metadata !{i8** %__end}, metadata !33, metadata !{}), !dbg !30
   %2 = load %struct.vector** %__range, align 8, !dbg !31
   %call1 = call i8* @_ZN6vector3endEv(%struct.vector* %2), !dbg !31
   store i8* %call1, i8** %__end, align 8, !dbg !31
@@ -43,7 +42,7 @@
   br i1 %cmp, label %for.body, label %for.end, !dbg !34
 
 for.body:                                         ; preds = %for.cond
-  call void @llvm.dbg.declare(metadata !{i8* %spec}, metadata !37), !dbg !31
+  call void @llvm.dbg.declare(metadata !{i8* %spec}, metadata !37, metadata !{}), !dbg !31
   %5 = load i8** %__begin, align 8, !dbg !38
   %6 = load i8* %5, align 1, !dbg !38
   store i8 %6, i8* %spec, align 1, !dbg !38
@@ -65,7 +64,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 declare void @_Z13TagFieldSpecsv() #2
 
@@ -95,49 +94,49 @@
 !llvm.gcov = !{!25}
 !llvm.ident = !{!26}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0 (trunk 209871)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0 (trunk 209871)\000\00\000\00\001", metadata !1, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [<stdin>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<stdin>", metadata !"PATTERN"}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, null, metadata !"vector", i32 21, i64 8, i64 8, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS6vector"} ; [ DW_TAG_structure_type ] [vector] [line 21, size 8, align 8, offset 0] [def] [from ]
+!4 = metadata !{metadata !"0x13\00vector\0021\008\008\000\000\000", metadata !5, null, null, metadata !6, null, null, metadata !"_ZTS6vector"} ; [ DW_TAG_structure_type ] [vector] [line 21, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{metadata !"linezero.cc", metadata !"PATTERN"}
 !6 = metadata !{metadata !7, metadata !13}
-!7 = metadata !{i32 786478, metadata !5, metadata !"_ZTS6vector", metadata !"begin", metadata !"begin", metadata !"_ZN6vector5beginEv", i32 25, metadata !8, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 25} ; [ DW_TAG_subprogram ] [line 25] [begin]
-!8 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00begin\00begin\00_ZN6vector5beginEv\0025\000\000\000\006\00256\000\0025", metadata !5, metadata !"_ZTS6vector", metadata !8, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 25] [begin]
+!8 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !9 = metadata !{metadata !10, metadata !12}
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!11 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS6vector"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS6vector]
-!13 = metadata !{i32 786478, metadata !5, metadata !"_ZTS6vector", metadata !"end", metadata !"end", metadata !"_ZN6vector3endEv", i32 26, metadata !8, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, null, i32 26} ; [ DW_TAG_subprogram ] [line 26] [end]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!12 = metadata !{metadata !"0xf\00\000\0064\0064\000\001088", null, null, metadata !"_ZTS6vector"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS6vector]
+!13 = metadata !{metadata !"0x2e\00end\00end\00_ZN6vector3endEv\0026\000\000\000\006\00256\000\0026", metadata !5, metadata !"_ZTS6vector", metadata !8, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 26] [end]
 !14 = metadata !{metadata !15, metadata !20}
-!15 = metadata !{i32 786478, metadata !5, metadata !16, metadata !"test", metadata !"test", metadata !"_Z4testv", i32 50, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z4testv, null, null, metadata !2, i32 50} ; [ DW_TAG_subprogram ] [line 50] [def] [test]
-!16 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ] [./linezero.cc]
-!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0x2e\00test\00test\00_Z4testv\0050\000\001\000\006\00256\000\0050", metadata !5, metadata !16, metadata !17, null, i32 ()* @_Z4testv, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 50] [def] [test]
+!16 = metadata !{metadata !"0x29", metadata !5}         ; [ DW_TAG_file_type ] [./linezero.cc]
+!17 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !18, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{metadata !19}
-!19 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!20 = metadata !{i32 786478, metadata !5, metadata !16, metadata !"f1", metadata !"f1", metadata !"_Z2f1v", i32 54, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z2f1v, null, null, metadata !2, i32 54} ; [ DW_TAG_subprogram ] [line 54] [def] [f1]
-!21 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!19 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!20 = metadata !{metadata !"0x2e\00f1\00f1\00_Z2f1v\0054\000\001\000\006\00256\000\0054", metadata !5, metadata !16, metadata !21, null, void ()* @_Z2f1v, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 54] [def] [f1]
+!21 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{null}
 !23 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!24 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!24 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !25 = metadata !{metadata !"PATTERN/linezero.o", metadata !0}
 !26 = metadata !{metadata !"clang version 3.5.0 (trunk 209871)"}
-!27 = metadata !{i32 786688, metadata !28, metadata !"__range", null, i32 0, metadata !29, i32 64, i32 0} ; [ DW_TAG_auto_variable ] [__range] [line 0]
-!28 = metadata !{i32 786443, metadata !5, metadata !15, i32 51, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./linezero.cc]
-!29 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS6vector"} ; [ DW_TAG_rvalue_reference_type ] [line 0, size 0, align 0, offset 0] [from _ZTS6vector]
+!27 = metadata !{metadata !"0x100\00__range\000\0064", metadata !28, null, metadata !29} ; [ DW_TAG_auto_variable ] [__range] [line 0]
+!28 = metadata !{metadata !"0xb\0051\000\000", metadata !5, metadata !15} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!29 = metadata !{metadata !"0x42\00\000\000\000\000\000", null, null, metadata !"_ZTS6vector"} ; [ DW_TAG_rvalue_reference_type ] [line 0, size 0, align 0, offset 0] [from _ZTS6vector]
 !30 = metadata !{i32 0, i32 0, metadata !28, null}
 !31 = metadata !{i32 51, i32 0, metadata !28, null}
-!32 = metadata !{i32 786688, metadata !28, metadata !"__begin", null, i32 0, metadata !10, i32 64, i32 0} ; [ DW_TAG_auto_variable ] [__begin] [line 0]
-!33 = metadata !{i32 786688, metadata !28, metadata !"__end", null, i32 0, metadata !10, i32 64, i32 0} ; [ DW_TAG_auto_variable ] [__end] [line 0]
+!32 = metadata !{metadata !"0x100\00__begin\000\0064", metadata !28, null, metadata !10} ; [ DW_TAG_auto_variable ] [__begin] [line 0]
+!33 = metadata !{metadata !"0x100\00__end\000\0064", metadata !28, null, metadata !10} ; [ DW_TAG_auto_variable ] [__end] [line 0]
 !34 = metadata !{i32 51, i32 0, metadata !35, null}
-!35 = metadata !{i32 786443, metadata !5, metadata !36, i32 51, i32 0, i32 5, i32 5} ; [ DW_TAG_lexical_block ] [./linezero.cc]
-!36 = metadata !{i32 786443, metadata !5, metadata !28, i32 51, i32 0, i32 1, i32 1} ; [ DW_TAG_lexical_block ] [./linezero.cc]
-!37 = metadata !{i32 786688, metadata !28, metadata !"spec", metadata !16, i32 51, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [spec] [line 51]
+!35 = metadata !{metadata !"0xb\0051\000\005", metadata !5, metadata !36} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!36 = metadata !{metadata !"0xb\0051\000\001", metadata !5, metadata !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!37 = metadata !{metadata !"0x100\00spec\0051\000", metadata !28, metadata !16, metadata !11} ; [ DW_TAG_auto_variable ] [spec] [line 51]
 !38 = metadata !{i32 51, i32 0, metadata !39, null}
-!39 = metadata !{i32 786443, metadata !5, metadata !28, i32 51, i32 0, i32 2, i32 2} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!39 = metadata !{metadata !"0xb\0051\000\002", metadata !5, metadata !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
 !40 = metadata !{i32 51, i32 0, metadata !41, null}
-!41 = metadata !{i32 786443, metadata !5, metadata !28, i32 51, i32 0, i32 4, i32 4} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!41 = metadata !{metadata !"0xb\0051\000\004", metadata !5, metadata !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
 !42 = metadata !{i32 51, i32 0, metadata !43, null}
-!43 = metadata !{i32 786443, metadata !5, metadata !28, i32 51, i32 0, i32 3, i32 3} ; [ DW_TAG_lexical_block ] [./linezero.cc]
+!43 = metadata !{metadata !"0xb\0051\000\003", metadata !5, metadata !28} ; [ DW_TAG_lexical_block ] [./linezero.cc]
 !44 = metadata !{i32 52, i32 0, metadata !15, null}
 !45 = metadata !{i32 54, i32 0, metadata !20, null}

diff --git a/test/Transforms/GCOVProfiling/linkagename.ll b/test/Transforms/GCOVProfiling/linkagename.ll
index ed3a5bd..04281b2 100644
--- a/test/Transforms/GCOVProfiling/linkagename.ll
+++ b/test/Transforms/GCOVProfiling/linkagename.ll

@@ -1,11 +1,9 @@
-; RUN: echo '!9 = metadata !{metadata !"%T/linkagename.ll", metadata !0}' > %t1
+; RUN: echo '!9 = metadata !{metadata !"%/T/linkagename.ll", metadata !0}' > %t1
 ; RUN: cat %s %t1 > %t2
 ; RUN: opt -insert-gcov-profiling -disable-output < %t2
 ; RUN: grep _Z3foov %T/linkagename.gcno
 ; RUN: rm %T/linkagename.gcno
 
-; REQUIRES: shell
-
 define void @_Z3foov() {
 entry:
   ret void, !dbg !8
@@ -15,15 +13,15 @@
 !llvm.module.flags = !{!10}
 !llvm.gcov = !{!9}
 
-!0 = metadata !{i32 786449, metadata !2, i32 4, metadata !"clang version 3.3 (trunk 177323)", i1 false, metadata !"", i32 0, metadata !3, metadata !3, metadata !4, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ] [/home/nlewycky/hello.cc] [DW_LANG_C_plus_plus]
-!1 = metadata !{i32 786473, metadata !2}          ; [ DW_TAG_file_type ] [/home/nlewycky/hello.cc]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 177323)\000\00\000\00\000", metadata !2, metadata !3, metadata !3, metadata !4, metadata !3,  metadata !3} ; [ DW_TAG_compile_unit ] [/home/nlewycky/hello.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"0x29", metadata !2}          ; [ DW_TAG_file_type ] [/home/nlewycky/hello.cc]
 !2 = metadata !{metadata !"hello.cc", metadata !"/home/nlewycky"}
 !3 = metadata !{i32 0}
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3foov, null, null, metadata !3, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!6 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00foo\00foo\00_Z3foov\001\000\001\000\006\00256\000\001", metadata !1, metadata !1, metadata !6, null, void ()* @_Z3foov, null, null, metadata !3} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 1, i32 0, metadata !5, null}
 
 
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/GCOVProfiling/version.ll b/test/Transforms/GCOVProfiling/version.ll
index 04f3f99..1af684e 100644
--- a/test/Transforms/GCOVProfiling/version.ll
+++ b/test/Transforms/GCOVProfiling/version.ll

@@ -1,32 +1,30 @@
-; RUN: echo '!9 = metadata !{metadata !"%T/version.ll", metadata !0}' > %t1
+; RUN: echo '!9 = metadata !{metadata !"%/T/version.ll", metadata !0}' > %t1
 ; RUN: cat %s %t1 > %t2
 ; RUN: opt -insert-gcov-profiling -disable-output < %t2
-; RUN: head -c8 %T/version.gcno | grep '^oncg\*204'
+; RUN: head -c8 %T/version.gcno | grep '^oncg.204'
 ; RUN: rm %T/version.gcno
 ; RUN: not opt -insert-gcov-profiling -default-gcov-version=asdfasdf -disable-output < %t2
 ; RUN: opt -insert-gcov-profiling -default-gcov-version=407* -disable-output < %t2
-; RUN: head -c8 %T/version.gcno | grep '^oncg\*704'
+; RUN: head -c8 %T/version.gcno | grep '^oncg.704'
 ; RUN: rm %T/version.gcno
 
 define void @test() {
   ret void, !dbg !8
 }
 
-; REQUIRES: shell
-
 !llvm.gcov = !{!9}
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!12}
 
-!0 = metadata !{i32 786449, metadata !11, i32 4, metadata !"clang version 3.3 (trunk 176994)", i1 false, metadata !"", i32 0, metadata !3, metadata !3, metadata !4, metadata !3, null, metadata !""} ; [ DW_TAG_compile_unit ] [./version] [DW_LANG_C_plus_plus]
-!2 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.3 (trunk 176994)\000\00\000\00\000", metadata !11, metadata !3, metadata !3, metadata !4, metadata !3, null} ; [ DW_TAG_compile_unit ] [./version] [DW_LANG_C_plus_plus]
+!2 = metadata !{metadata !"0x29", metadata !11} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 0}
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !10, metadata !6, metadata !"test", metadata !"test", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @test, null, null, metadata !3, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [test]
-!6 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !3, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00test\00test\00\001\000\001\000\006\00256\000\001", metadata !10, metadata !6, metadata !7, null, void ()* @test, null, null, metadata !3} ; [ DW_TAG_subprogram ] [line 1] [def] [test]
+!6 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !3, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{i32 1, i32 0, metadata !5, null}
 ;; !9 is added through the echo line at the top.
 !10 = metadata !{metadata !"<stdin>", metadata !"."}
 !11 = metadata !{metadata !"version", metadata !"/usr/local/google/home/nlewycky"}
-!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/GVN/2009-03-10-PREOnVoid.ll b/test/Transforms/GVN/2009-03-10-PREOnVoid.ll
index fd31fce..fdf17e0 100644
--- a/test/Transforms/GVN/2009-03-10-PREOnVoid.ll
+++ b/test/Transforms/GVN/2009-03-10-PREOnVoid.ll

@@ -17,20 +17,20 @@
 	%"struct.std::pair<std::_Rb_tree_iterator<std::pair<void* const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >,bool>" = type { %"struct.std::_Rb_tree_iterator<std::pair<void* const, std::vector<ShadowInfo, std::allocator<ShadowInfo> > > >", i8 }
 	%"struct.std::pair<void* const,void*>" = type { i8*, i8* }
 
-@_ZL20__gthrw_pthread_oncePiPFvvE = alias weak i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
-@_ZL27__gthrw_pthread_getspecificj = alias weak i8* (i32)* @pthread_getspecific		; <i8* (i32)*> [#uses=0]
-@_ZL27__gthrw_pthread_setspecificjPKv = alias weak i32 (i32, i8*)* @pthread_setspecific		; <i32 (i32, i8*)*> [#uses=0]
-@_ZL22__gthrw_pthread_createPmPK16__pthread_attr_sPFPvS3_ES3_ = alias weak i32 (i32*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create		; <i32 (i32*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)*> [#uses=0]
-@_ZL22__gthrw_pthread_cancelm = alias weak i32 (i32)* @pthread_cancel		; <i32 (i32)*> [#uses=0]
-@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = alias weak i32 (%struct.pthread_mutex_t*)* @pthread_mutex_lock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
-@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = alias weak i32 (%struct.pthread_mutex_t*)* @pthread_mutex_trylock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
-@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = alias weak i32 (%struct.pthread_mutex_t*)* @pthread_mutex_unlock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
-@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = alias weak i32 (%struct.pthread_mutex_t*, %struct.__sched_param*)* @pthread_mutex_init		; <i32 (%struct.pthread_mutex_t*, %struct.__sched_param*)*> [#uses=0]
-@_ZL26__gthrw_pthread_key_createPjPFvPvE = alias weak i32 (i32*, void (i8*)*)* @pthread_key_create		; <i32 (i32*, void (i8*)*)*> [#uses=0]
-@_ZL26__gthrw_pthread_key_deletej = alias weak i32 (i32)* @pthread_key_delete		; <i32 (i32)*> [#uses=0]
-@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = alias weak i32 (%struct.__sched_param*)* @pthread_mutexattr_init		; <i32 (%struct.__sched_param*)*> [#uses=0]
-@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = alias weak i32 (%struct.__sched_param*, i32)* @pthread_mutexattr_settype		; <i32 (%struct.__sched_param*, i32)*> [#uses=0]
-@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = alias weak i32 (%struct.__sched_param*)* @pthread_mutexattr_destroy		; <i32 (%struct.__sched_param*)*> [#uses=0]
+@_ZL20__gthrw_pthread_oncePiPFvvE = weak alias i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
+@_ZL27__gthrw_pthread_getspecificj = weak alias i8* (i32)* @pthread_getspecific		; <i8* (i32)*> [#uses=0]
+@_ZL27__gthrw_pthread_setspecificjPKv = weak alias i32 (i32, i8*)* @pthread_setspecific		; <i32 (i32, i8*)*> [#uses=0]
+@_ZL22__gthrw_pthread_createPmPK16__pthread_attr_sPFPvS3_ES3_ = weak alias i32 (i32*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)* @pthread_create		; <i32 (i32*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)*> [#uses=0]
+@_ZL22__gthrw_pthread_cancelm = weak alias i32 (i32)* @pthread_cancel		; <i32 (i32)*> [#uses=0]
+@_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t = weak alias i32 (%struct.pthread_mutex_t*)* @pthread_mutex_lock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
+@_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t = weak alias i32 (%struct.pthread_mutex_t*)* @pthread_mutex_trylock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
+@_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t = weak alias i32 (%struct.pthread_mutex_t*)* @pthread_mutex_unlock		; <i32 (%struct.pthread_mutex_t*)*> [#uses=0]
+@_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t = weak alias i32 (%struct.pthread_mutex_t*, %struct.__sched_param*)* @pthread_mutex_init		; <i32 (%struct.pthread_mutex_t*, %struct.__sched_param*)*> [#uses=0]
+@_ZL26__gthrw_pthread_key_createPjPFvPvE = weak alias i32 (i32*, void (i8*)*)* @pthread_key_create		; <i32 (i32*, void (i8*)*)*> [#uses=0]
+@_ZL26__gthrw_pthread_key_deletej = weak alias i32 (i32)* @pthread_key_delete		; <i32 (i32)*> [#uses=0]
+@_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t = weak alias i32 (%struct.__sched_param*)* @pthread_mutexattr_init		; <i32 (%struct.__sched_param*)*> [#uses=0]
+@_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti = weak alias i32 (%struct.__sched_param*, i32)* @pthread_mutexattr_settype		; <i32 (%struct.__sched_param*, i32)*> [#uses=0]
+@_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t = weak alias i32 (%struct.__sched_param*)* @pthread_mutexattr_destroy		; <i32 (%struct.__sched_param*)*> [#uses=0]
 
 declare fastcc void @_ZNSt10_Select1stISt4pairIKPvS1_EEC1Ev() nounwind readnone
 

diff --git a/test/Transforms/GVN/atomic.ll b/test/Transforms/GVN/atomic.ll
index 094e22b..8c13d20 100644
--- a/test/Transforms/GVN/atomic.ll
+++ b/test/Transforms/GVN/atomic.ll

@@ -8,7 +8,7 @@
 
 ; GVN across unordered store (allowed)
 define i32 @test1() nounwind uwtable ssp {
-; CHECK: test1
+; CHECK-LABEL: test1
 ; CHECK: add i32 %x, %x
 entry:
   %x = load i32* @y
@@ -18,10 +18,10 @@
   ret i32 %z
 }
 
-; GVN across seq_cst store (allowed in theory; not implemented ATM)
+; GVN across seq_cst store (allowed)
 define i32 @test2() nounwind uwtable ssp {
-; CHECK: test2
-; CHECK: add i32 %x, %y
+; CHECK-LABEL: test2
+; CHECK: add i32 %x, %x
 entry:
   %x = load i32* @y
   store atomic i32 %x, i32* @x seq_cst, align 4
@@ -32,7 +32,7 @@
 
 ; GVN across unordered load (allowed)
 define i32 @test3() nounwind uwtable ssp {
-; CHECK: test3
+; CHECK-LABEL: test3
 ; CHECK: add i32 %x, %x
 entry:
   %x = load i32* @y
@@ -43,11 +43,11 @@
   ret i32 %b
 }
 
-; GVN across acquire load (load after atomic load must not be removed)
+; GVN across acquire load (allowed as the original load was not atomic)
 define i32 @test4() nounwind uwtable ssp {
-; CHECK: test4
+; CHECK-LABEL: test4
 ; CHECK: load atomic i32* @x
-; CHECK: load i32* @y
+; CHECK-NOT: load i32* @y
 entry:
   %x = load i32* @y
   %y = load atomic i32* @x seq_cst, align 4
@@ -59,7 +59,7 @@
 
 ; GVN load to unordered load (allowed)
 define i32 @test5() nounwind uwtable ssp {
-; CHECK: test5
+; CHECK-LABEL: test5
 ; CHECK: add i32 %x, %x
 entry:
   %x = load atomic i32* @x unordered, align 4
@@ -70,7 +70,7 @@
 
 ; GVN unordered load to load (unordered load must not be removed)
 define i32 @test6() nounwind uwtable ssp {
-; CHECK: test6
+; CHECK-LABEL: test6
 ; CHECK: load atomic i32* @x unordered
 entry:
   %x = load i32* @x
@@ -78,3 +78,54 @@
   %x3 = add i32 %x, %x2
   ret i32 %x3
 }
+
+; GVN across release-acquire pair (forbidden)
+define i32 @test7() nounwind uwtable ssp {
+; CHECK-LABEL: test7
+; CHECK: add i32 %x, %y
+entry:
+  %x = load i32* @y
+  store atomic i32 %x, i32* @x release, align 4
+  %w = load atomic i32* @x acquire, align 4
+  %y = load i32* @y
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+; GVN across acquire-release pair (allowed)
+define i32 @test8() nounwind uwtable ssp {
+; CHECK-LABEL: test8
+; CHECK: add i32 %x, %x
+entry:
+  %x = load i32* @y
+  %w = load atomic i32* @x acquire, align 4
+  store atomic i32 %x, i32* @x release, align 4
+  %y = load i32* @y
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+; GVN across monotonic store (allowed)
+define i32 @test9() nounwind uwtable ssp {
+; CHECK-LABEL: test9
+; CHECK: add i32 %x, %x
+entry:
+  %x = load i32* @y
+  store atomic i32 %x, i32* @x monotonic, align 4
+  %y = load i32* @y
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+; GVN of an unordered load across a monotonic load (not allowed)
+define i32 @test10() nounwind uwtable ssp {
+; CHECK-LABEL: test10
+; CHECK: add i32 %x, %y
+entry:
+  %x = load atomic i32* @y unordered, align 4
+  %clobber = load atomic i32* @x monotonic, align 4
+  %y = load atomic i32* @y monotonic, align 4
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+

diff --git a/test/Transforms/GVN/noalias.ll b/test/Transforms/GVN/noalias.ll
new file mode 100644
index 0000000..a774f38
--- /dev/null
+++ b/test/Transforms/GVN/noalias.ll

@@ -0,0 +1,43 @@
+; RUN: opt -scoped-noalias -basicaa -gvn -S < %s | FileCheck %s
+
+define i32 @test1(i32* %p, i32* %q) {
+; CHECK-LABEL: @test1(i32* %p, i32* %q)
+; CHECK: load i32* %p
+; CHECK-NOT: noalias
+; CHECK: %c = add i32 %a, %a
+  %a = load i32* %p, !noalias !0
+  %b = load i32* %p
+  %c = add i32 %a, %b
+  ret i32 %c
+}
+
+define i32 @test2(i32* %p, i32* %q) {
+; CHECK-LABEL: @test2(i32* %p, i32* %q)
+; CHECK: load i32* %p, !alias.scope !0
+; CHECK: %c = add i32 %a, %a
+  %a = load i32* %p, !alias.scope !0
+  %b = load i32* %p, !alias.scope !0
+  %c = add i32 %a, %b
+  ret i32 %c
+}
+
+; FIXME: In this case we can do better than intersecting the scopes, and can
+; concatenate them instead. Both loads are in the same basic block, the first
+; makes the second safe to speculatively execute, and there are no calls that may
+; throw in between.
+define i32 @test3(i32* %p, i32* %q) {
+; CHECK-LABEL: @test3(i32* %p, i32* %q)
+; CHECK: load i32* %p, !alias.scope !1
+; CHECK: %c = add i32 %a, %a
+  %a = load i32* %p, !alias.scope !1
+  %b = load i32* %p, !alias.scope !2
+  %c = add i32 %a, %b
+  ret i32 %c
+}
+
+declare i32 @foo(i32*) readonly
+
+!0 = metadata !{metadata !0}
+!1 = metadata !{metadata !1}
+!2 = metadata !{metadata !0, metadata !1}
+

diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll
index 8d289b0..6aac93e 100644
--- a/test/Transforms/GVN/rle.ll
+++ b/test/Transforms/GVN/rle.ll

@@ -318,6 +318,19 @@
 ; CHECK: ret i8
 }
 
+define i8 @coerce_offset0_addrspacecast(i32 %V, i32* %P) {
+  store i32 %V, i32* %P
+
+  %P2 = addrspacecast i32* %P to i8 addrspace(1)*
+  %P3 = getelementptr i8 addrspace(1)* %P2, i32 2
+
+  %A = load i8 addrspace(1)* %P3
+  ret i8 %A
+; CHECK-LABEL: @coerce_offset0_addrspacecast(
+; CHECK-NOT: load
+; CHECK: ret i8
+}
+
 ;; non-local i32/float -> i8 load forwarding.
 define i8 @coerce_offset_nonlocal0(i32* %P, i1 %cond) {
   %P2 = bitcast i32* %P to float*

diff --git a/test/Transforms/GlobalDCE/2009-01-05-DeadAliases.ll b/test/Transforms/GlobalDCE/2009-01-05-DeadAliases.ll
index 0bdced5..584f0bf 100644
--- a/test/Transforms/GlobalDCE/2009-01-05-DeadAliases.ll
+++ b/test/Transforms/GlobalDCE/2009-01-05-DeadAliases.ll

@@ -5,14 +5,14 @@
 @A = global i32 0
 ; CHECK: @A = global i32 0
 
-@D = alias internal i32* @A
+@D = internal alias i32* @A
 ; DEAD-NOT: @D
 
 @L1 = alias i32* @A
 ; CHECK: @L1 = alias i32* @A
 
-@L2 = alias internal i32* @L1
-; CHECK: @L2 = alias internal i32* @L1
+@L2 = internal alias i32* @L1
+; CHECK: @L2 = internal alias i32* @L1
 
 @L3 = alias i32* @L2
 ; CHECK: @L3 = alias i32* @L2

diff --git a/test/Transforms/GlobalDCE/2009-02-17-AliasUsesAliasee.ll b/test/Transforms/GlobalDCE/2009-02-17-AliasUsesAliasee.ll
index 68933c6..5fb4444 100644
--- a/test/Transforms/GlobalDCE/2009-02-17-AliasUsesAliasee.ll
+++ b/test/Transforms/GlobalDCE/2009-02-17-AliasUsesAliasee.ll

@@ -1,4 +1,4 @@
 ; RUN: opt < %s -globaldce
 
-@A = alias internal void ()* @F
+@A = internal alias void ()* @F
 define internal void @F() { ret void }

diff --git a/test/Transforms/GlobalDCE/deadblockaddr.ll b/test/Transforms/GlobalDCE/deadblockaddr.ll
new file mode 100644
index 0000000..1ec5994
--- /dev/null
+++ b/test/Transforms/GlobalDCE/deadblockaddr.ll

@@ -0,0 +1,16 @@
+; RUN: opt -globaldce -simplifycfg -S < %s | FileCheck %s
+
+; Tests whether globaldce does the right cleanup while removing @bar
+; so that a dead BlockAddress reference to foo won't prevent other passes
+; from working properly, e.g. simplifycfg
+@bar = internal unnamed_addr constant i8* blockaddress(@foo, %L1)
+
+; CHECK-LABEL: foo
+; CHECK-NOT: br label %L1
+; CHECK: ret void
+define void @foo() {
+entry:
+  br label %L1
+L1:
+  ret void
+}

diff --git a/test/Transforms/GlobalDCE/pr20981.ll b/test/Transforms/GlobalDCE/pr20981.ll
new file mode 100644
index 0000000..92d2840
--- /dev/null
+++ b/test/Transforms/GlobalDCE/pr20981.ll

@@ -0,0 +1,17 @@
+; RUN: opt < %s -globaldce -S | FileCheck %s
+
+$c1 = comdat any
+; CHECK: $c1 = comdat any
+
+@a1 = linkonce_odr alias void ()* @f1
+; CHECK: @a1 = linkonce_odr alias void ()* @f1
+
+define linkonce_odr void @f1() comdat $c1 {
+  ret void
+}
+; CHECK: define linkonce_odr void @f1() comdat $c1
+
+define void @g() {
+  call void @f1()
+  ret void
+}

diff --git a/test/Transforms/GlobalOpt/2009-02-15-ResolveAlias.ll b/test/Transforms/GlobalOpt/2009-02-15-ResolveAlias.ll
index b98faca..8efd018 100644
--- a/test/Transforms/GlobalOpt/2009-02-15-ResolveAlias.ll
+++ b/test/Transforms/GlobalOpt/2009-02-15-ResolveAlias.ll

@@ -13,7 +13,7 @@
 	ret void
 }
 
-@b = alias internal void ()* @g
+@b = internal alias  void ()* @g
 ; CHECK-NOT: @b
 
 define void @h() {

diff --git a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
index 0108960..0513829 100644
--- a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
+++ b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll

@@ -6,14 +6,14 @@
 define i32 @foo(i32 %i) nounwind ssp {
 entry:
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !3)
+  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !3, metadata !{})
   %0 = icmp eq i32 %i, 1, !dbg !7                 ; <i1> [#uses=1]
   br i1 %0, label %bb, label %bb1, !dbg !7
 
 bb:                                               ; preds = %entry
   store i32 0, i32* @Stop, align 4, !dbg !9
   %1 = mul nsw i32 %i, 42, !dbg !10               ; <i32> [#uses=1]
-  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !3), !dbg !10
+  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !3, metadata !{}), !dbg !10
   br label %bb2, !dbg !10
 
 bb1:                                              ; preds = %entry
@@ -28,7 +28,7 @@
   ret i32 %i_addr.0, !dbg !12
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 define i32 @bar() nounwind ssp {
 entry:
@@ -51,27 +51,27 @@
   ret i32 %.0, !dbg !19
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.gv = !{!0}
 
-!0 = metadata !{i32 458804, i32 0, metadata !1, metadata !"Stop", metadata !"Stop", metadata !"", metadata !1, i32 2, metadata !2, i1 true, i1 true, i32* @Stop} ; [ DW_TAG_variable ]
-!1 = metadata !{i32 458769, metadata !20, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{i32 458788, null, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!3 = metadata !{i32 459009, metadata !4, metadata !"i", metadata !1, i32 4, metadata !2} ; [ DW_TAG_arg_variable ]
-!4 = metadata !{i32 458798, i32 0, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 4, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!5 = metadata !{i32 458773, metadata !1, null, metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x34\00Stop\00Stop\00\002\001\001", metadata !1, metadata !1, metadata !2, i32* @Stop} ; [ DW_TAG_variable ]
+!1 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !1} ; [ DW_TAG_base_type ]
+!3 = metadata !{metadata !"0x101\00i\004\000", metadata !4, metadata !1, metadata !2} ; [ DW_TAG_arg_variable ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, metadata !1, metadata !5, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !2, metadata !2}
 !7 = metadata !{i32 5, i32 0, metadata !8, null}
-!8 = metadata !{i32 458763, metadata !20, metadata !4, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!8 = metadata !{metadata !"0xb\000\000\000", metadata !20, metadata !4} ; [ DW_TAG_lexical_block ]
 !9 = metadata !{i32 6, i32 0, metadata !8, null}
 !10 = metadata !{i32 7, i32 0, metadata !8, null}
 !11 = metadata !{i32 9, i32 0, metadata !8, null}
 !12 = metadata !{i32 11, i32 0, metadata !8, null}
 !13 = metadata !{i32 14, i32 0, metadata !14, null}
-!14 = metadata !{i32 458763, metadata !20, metadata !15, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 458798, i32 0, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 13, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!16 = metadata !{i32 458773, metadata !1, null, metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{metadata !"0xb\000\000\000", metadata !20, metadata !15} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0x2e\00bar\00bar\00bar\0013\000\001\000\006\000\000\000", i32 0, metadata !1, metadata !16, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!16 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, null, null, metadata !17, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{metadata !2}
 !18 = metadata !{i32 15, i32 0, metadata !14, null}
 !19 = metadata !{i32 16, i32 0, metadata !14, null}

diff --git a/test/Transforms/GlobalOpt/alias-resolve.ll b/test/Transforms/GlobalOpt/alias-resolve.ll
index 9d70c70..ebc20c6 100644
--- a/test/Transforms/GlobalOpt/alias-resolve.ll
+++ b/test/Transforms/GlobalOpt/alias-resolve.ll

@@ -9,12 +9,12 @@
 @bar1  = alias void ()* @bar2
 ; CHECK: @bar1 = alias void ()* @bar2
 
-@weak1 = alias weak void ()* @bar2
-; CHECK: @weak1 = alias weak void ()* @bar2
+@weak1 = weak alias void ()* @bar2
+; CHECK: @weak1 = weak alias void ()* @bar2
 
 @bar4 = private unnamed_addr constant [2 x i8*] zeroinitializer
-@foo4 = unnamed_addr alias linkonce_odr getelementptr inbounds ([2 x i8*]* @bar4, i32 0, i32 1)
-; CHECK: @foo4 = unnamed_addr alias linkonce_odr getelementptr inbounds ([2 x i8*]* @bar4, i32 0, i32 1)
+@foo4 = linkonce_odr unnamed_addr alias getelementptr inbounds ([2 x i8*]* @bar4, i32 0, i32 1)
+; CHECK: @foo4 = linkonce_odr unnamed_addr alias getelementptr inbounds ([2 x i8*]* @bar4, i32 0, i32 1)
 
 define void @bar2() {
   ret void

diff --git a/test/Transforms/GlobalOpt/alias-used-address-space.ll b/test/Transforms/GlobalOpt/alias-used-address-space.ll
index 633cd34..62e74ba 100644
--- a/test/Transforms/GlobalOpt/alias-used-address-space.ll
+++ b/test/Transforms/GlobalOpt/alias-used-address-space.ll

@@ -7,7 +7,7 @@
 @i = internal addrspace(1) global i8 42
 
 ; CHECK: @ia = internal addrspace(1) global i8 42
-@ia = alias internal i8 addrspace(1)* @i
+@ia = internal alias i8 addrspace(1)* @i
 
 @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* @ca to i8*)], section "llvm.metadata"
 ; CHECK-DAG: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* @ca to i8*)], section "llvm.metadata"
@@ -18,8 +18,8 @@
 @sameAsUsed = global [1 x i8*] [i8* addrspacecast(i8 addrspace(1)* @ca to i8*)]
 ; CHECK-DAG: @sameAsUsed = global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* @c to i8*)]
 
-@ca = alias internal i8 addrspace(1)* @c
-; CHECK: @ca = alias internal i8 addrspace(1)* @c
+@ca = internal alias i8 addrspace(1)* @c
+; CHECK: @ca = internal alias i8 addrspace(1)* @c
 
 define i8 addrspace(1)* @h() {
   ret i8 addrspace(1)* @ca

diff --git a/test/Transforms/GlobalOpt/alias-used-section.ll b/test/Transforms/GlobalOpt/alias-used-section.ll
index 1217937..4dab2f5 100644
--- a/test/Transforms/GlobalOpt/alias-used-section.ll
+++ b/test/Transforms/GlobalOpt/alias-used-section.ll

@@ -1,7 +1,7 @@
 ; RUN: opt -S -globalopt < %s | FileCheck %s
 
 @_Z17in_custom_section = internal global i8 42, section "CUSTOM"
-@in_custom_section = dllexport alias internal i8* @_Z17in_custom_section
+@in_custom_section = internal dllexport alias i8* @_Z17in_custom_section
 
 ; CHECK: @in_custom_section = internal dllexport global i8 42, section "CUSTOM"
 

diff --git a/test/Transforms/GlobalOpt/alias-used.ll b/test/Transforms/GlobalOpt/alias-used.ll
index 05ac7f9..21f06b7 100644
--- a/test/Transforms/GlobalOpt/alias-used.ll
+++ b/test/Transforms/GlobalOpt/alias-used.ll

@@ -4,7 +4,7 @@
 
 @i = internal global i8 42
 ; CHECK: @ia = internal global i8 42
-@ia = alias internal i8* @i
+@ia = internal alias i8* @i
 
 @llvm.used = appending global [3 x i8*] [i8* bitcast (void ()* @fa to i8*), i8* bitcast (void ()* @f to i8*), i8* @ca], section "llvm.metadata"
 ; CHECK-DAG: @llvm.used = appending global [3 x i8*] [i8* bitcast (void ()* @fa to i8*), i8* bitcast (void ()* @f to i8*), i8* @ca], section "llvm.metadata"
@@ -18,17 +18,17 @@
 @other = global i32* bitcast (void ()* @fa to i32*)
 ; CHECK-DAG: @other = global i32* bitcast (void ()* @f to i32*)
 
-@fa = alias internal void ()* @f
-; CHECK: @fa = alias internal void ()* @f
+@fa = internal alias void ()* @f
+; CHECK: @fa = internal alias void ()* @f
 
-@fa2 = alias internal void ()* @f
+@fa2 = internal alias void ()* @f
 ; CHECK-NOT: @fa2
 
-@fa3 = alias internal void ()* @f
+@fa3 = internal alias void ()* @f
 ; CHECK: @fa3
 
-@ca = alias internal i8* @c
-; CHECK: @ca = alias internal i8* @c
+@ca = internal alias i8* @c
+; CHECK: @ca = internal alias i8* @c
 
 define void @f() {
   ret void

diff --git a/test/Transforms/GlobalOpt/constantfold-initializers.ll b/test/Transforms/GlobalOpt/constantfold-initializers.ll
index 4a25d66..36de19c 100644
--- a/test/Transforms/GlobalOpt/constantfold-initializers.ll
+++ b/test/Transforms/GlobalOpt/constantfold-initializers.ll

@@ -81,10 +81,23 @@
   ret void
 }
 
+@test6_v1 = internal global { i32, i32 } { i32 42, i32 0 }, align 8
+@test6_v2 = global i32 0, align 4
+; CHECK: @test6_v2 = global i32 42, align 4
+define internal void @test6() {
+  %load = load { i32, i32 }* @test6_v1, align 8
+  %xv0 = extractvalue { i32, i32 } %load, 0
+  %iv = insertvalue { i32, i32 } %load, i32 %xv0, 1
+  %xv1 = extractvalue { i32, i32 } %iv, 1
+  store i32 %xv1, i32* @test6_v2, align 4
+  ret void
+}
+
 @llvm.global_ctors = appending constant
-  [5 x { i32, void ()* }]
+  [6 x { i32, void ()* }]
   [{ i32, void ()* } { i32 65535, void ()* @test1 },
    { i32, void ()* } { i32 65535, void ()* @test2 },
    { i32, void ()* } { i32 65535, void ()* @test3 },
    { i32, void ()* } { i32 65535, void ()* @test4 },
-   { i32, void ()* } { i32 65535, void ()* @test5 }]
+   { i32, void ()* } { i32 65535, void ()* @test5 },
+   { i32, void ()* } { i32 65535, void ()* @test6 }]

diff --git a/test/Transforms/GlobalOpt/pr21191.ll b/test/Transforms/GlobalOpt/pr21191.ll
new file mode 100644
index 0000000..39b8eee
--- /dev/null
+++ b/test/Transforms/GlobalOpt/pr21191.ll

@@ -0,0 +1,19 @@
+; RUN: opt < %s -globalopt -S | FileCheck %s
+
+$c = comdat any
+; CHECK: $c = comdat any
+
+define linkonce_odr void @foo() comdat $c {
+  ret void
+}
+; CHECK: define linkonce_odr void @foo() comdat $c
+
+define linkonce_odr void @bar() comdat $c {
+  ret void
+}
+; CHECK: define linkonce_odr void @bar() comdat $c
+
+define void @zed()  {
+  call void @foo()
+  ret void
+}

diff --git a/test/Transforms/GlobalOpt/preserve-comdats.ll b/test/Transforms/GlobalOpt/preserve-comdats.ll
new file mode 100644
index 0000000..08188b9
--- /dev/null
+++ b/test/Transforms/GlobalOpt/preserve-comdats.ll

@@ -0,0 +1,37 @@
+; RUN: opt -globalopt -S < %s | FileCheck %s
+
+$comdat_global = comdat any
+
+@comdat_global = weak_odr global i8 0, comdat $comdat_global
+@simple_global = internal global i8 0
+; CHECK: @comdat_global = weak_odr global i8 0, comdat $comdat_global
+; CHECK: @simple_global = internal global i8 42
+
+@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [
+    { i32, void ()*, i8* } { i32 65535, void ()* @init_comdat_global, i8* @comdat_global },
+    { i32, void ()*, i8* } { i32 65535, void ()* @init_simple_global, i8* null }
+]
+; CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }]
+; CHECK: [{ i32, void ()*, i8* } { i32 65535, void ()* @init_comdat_global, i8* @comdat_global }]
+
+define void @init_comdat_global() {
+  store i8 42, i8* @comdat_global
+  ret void
+}
+; CHECK: define void @init_comdat_global()
+
+define internal void @init_simple_global() comdat $comdat_global {
+  store i8 42, i8* @simple_global
+  ret void
+}
+; CHECK-NOT: @init_simple_global()
+
+define i8* @use_simple() {
+  ret i8* @simple_global
+}
+; CHECK: define i8* @use_simple()
+
+define i8* @use_comdat() {
+  ret i8* @comdat_global
+}
+; CHECK: define i8* @use_comdat()

diff --git a/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll b/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll
index af9f1b3..64fef10 100644
--- a/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll
+++ b/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll

@@ -1,6 +1,6 @@
 ; RUN: opt < %s -indvars -S | FileCheck %s
 ; Test WidenIV::GetExtendedOperandRecurrence.
-; add219 should be extended to i64 because it is nsw, even though its
+; %add, %sub and %mul should be extended to i64 because they are nsw, even though their
 ; sext cannot be hoisted outside the loop.
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@@ -18,13 +18,26 @@
   br i1 undef, label %for.body170, label %for.body153
 
 ; CHECK: add nsw i64 %indvars.iv, 1
+; CHECK: sub nsw i64 %indvars.iv, 2
+; CHECK: sub nsw i64 4, %indvars.iv
+; CHECK: mul nsw i64 %indvars.iv, 8
 for.body170:                                      ; preds = %for.body170, %for.body153
   %i2.19 = phi i32 [ %add249, %for.body170 ], [ 0, %for.body153 ]
-  %add219 = add nsw i32 %i2.19, 1
-  %idxprom220 = sext i32 %add219 to i64
+
+  %add = add nsw i32 %i2.19, 1
+  %add.idxprom = sext i32 %add to i64
+
+  %sub = sub nsw i32 %i2.19, 2
+  %sub.idxprom = sext i32 %sub to i64
+
+  %sub.neg = sub nsw i32 4, %i2.19
+  %sub.neg.idxprom = sext i32 %sub.neg to i64
+
+  %mul = mul nsw i32 %i2.19, 8
+  %mul.idxprom = sext i32 %mul to i64
+
   %add249 = add nsw i32 %i2.19, %shl132
   br label %for.body170
-
 for.end285:                                       ; preds = %entry
   ret void
 }

diff --git a/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll b/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll
index a8020e6..e462712 100644
--- a/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll
+++ b/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll

@@ -6,7 +6,7 @@
 
 ; CHECK-LABEL: @test(
 ; CHECK: if.end.i126:
-; CHECK: %exitcond = icmp ne i8* %incdec.ptr.i, getelementptr (i8* null, i32 undef)
+; CHECK: %exitcond = icmp ne i8* %destYPixelPtr.010.i, getelementptr (i8* null, i32 undef)
 define void @test() nounwind {
 entry:
   br label %while.cond

diff --git a/test/Transforms/IndVarSimplify/NVPTX/lit.local.cfg b/test/Transforms/IndVarSimplify/NVPTX/lit.local.cfg
new file mode 100644
index 0000000..2cb98eb
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/NVPTX/lit.local.cfg

@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True

diff --git a/test/Transforms/IndVarSimplify/NVPTX/no-widen-expensive.ll b/test/Transforms/IndVarSimplify/NVPTX/no-widen-expensive.ll
new file mode 100644
index 0000000..8744b19
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/NVPTX/no-widen-expensive.ll

@@ -0,0 +1,37 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+target triple = "nvptx64-unknown-unknown"
+
+; On the nvptx64 architecture, an arithmetic instruction on a 64-bit integer
+; costs twice as much as the same instruction on a 32-bit integer, because the
+; hardware needs to simulate a 64-bit integer using two 32-bit integers.
+; Therefore, on this particular architecture, we should not widen induction
+; variables to 64-bit integers even though i64 is a legal type in the 64-bit
+; PTX ISA.
+
+define void @indvar_32_bit(i32 %n, i32* nocapture %output) {
+; CHECK-LABEL: @indvar_32_bit
+entry:
+  %cmp5 = icmp sgt i32 %n, 0
+  br i1 %cmp5, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.06 = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ]
+; CHECK: phi i32
+  %mul = mul nsw i32 %i.06, %i.06
+  %0 = sext i32 %i.06 to i64
+  %arrayidx = getelementptr inbounds i32* %output, i64 %0
+  store i32 %mul, i32* %arrayidx, align 4
+  %add = add nsw i32 %i.06, 3
+  %cmp = icmp slt i32 %add, %n
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}

diff --git a/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll b/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll
index e4c31d1..9e55a17 100644
--- a/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll
+++ b/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll

@@ -11,7 +11,7 @@
   br i1 %cmp1, label %for.body, label %for.end
 
 ; Make sure the added GEP has the right index type
-; CHECK: %lftr.limit = getelementptr i8 addrspace(2)* %base, i8 %0
+; CHECK: %lftr.limit = getelementptr i8 addrspace(2)* %base, i8
 
 ; CHECK: for.body:
 ; CHECK: phi i8 addrspace(2)*
@@ -43,7 +43,7 @@
   br i1 %cmp1, label %for.body, label %for.end
 
 ; Make sure the added GEP has the right index type
-; CHECK: %lftr.limit = getelementptr i8 addrspace(3)* %base, i16 %0
+; CHECK: %lftr.limit = getelementptr i8 addrspace(3)* %base, i16
 
 ; CHECK: for.body:
 ; CHECK: phi i8 addrspace(3)*

diff --git a/test/Transforms/IndVarSimplify/lftr-extend-const.ll b/test/Transforms/IndVarSimplify/lftr-extend-const.ll
index 4736f85..f12c68c 100644
--- a/test/Transforms/IndVarSimplify/lftr-extend-const.ll
+++ b/test/Transforms/IndVarSimplify/lftr-extend-const.ll

@@ -21,7 +21,7 @@
 
 ; Check that post-incrementing the backedge taken count does not overflow.
 ; CHECK-LABEL: @postinc(
-; CHECK: icmp eq i32 %indvars.iv.next, 256
+; CHECK: icmp eq i32 %indvars.iv, 255
 define i32 @postinc() #0 {
 entry:
   br label %do.body

diff --git a/test/Transforms/IndVarSimplify/lftr-reuse.ll b/test/Transforms/IndVarSimplify/lftr-reuse.ll
index 1fdcdd1..efb96bd 100644
--- a/test/Transforms/IndVarSimplify/lftr-reuse.ll
+++ b/test/Transforms/IndVarSimplify/lftr-reuse.ll

@@ -82,15 +82,23 @@
 ; Perform LFTR without generating extra preheader code.
 define void @guardedloop([0 x double]* %matrix, [0 x double]* %vector,
                          i32 %irow, i32 %ilead) nounwind {
-; CHECK: entry:
-; CHECK-NOT: zext
-; CHECK-NOT: add
-; CHECK: loop:
-; CHECK: phi i64
-; CHECK: phi i64
+; CHECK-LABEL: @guardedloop(
+; CHECK-LABEL: entry:
+; CHECK-NEXT: %[[cmp:.*]] = icmp slt i32 1, %irow
+; CHECK-NEXT: br i1 %[[cmp]], label %[[loop_preheader:.*]], label %[[return:.*]]
+
+; CHECK: [[loop_preheader]]:
+; CHECK-NEXT: %[[sext:.*]] = sext i32 %ilead to i64
+; CHECK-NEXT: %[[add:.*]] = add i32 %irow, -1
+; CHECK-NEXT: br label %[[loop:.*]]
+
+; CHECK: [[loop]]:
+; CHECK-NEXT: %[[indvars_iv2:.*]] = phi i64
+; CHECK-NEXT: phi i64
 ; CHECK-NOT: phi
-; CHECK: icmp ne
-; CHECK: br i1
+; CHECK: %[[lftr_wideiv:.*]] = trunc i64 %[[indvars_iv2]] to i32
+; CHECK-NEXT: %[[exitcond:.*]] = icmp ne i32 %[[lftr_wideiv]], %[[add]]
+; CHECK-NEXT: br i1 %[[exitcond]], label %[[loop]], label
 entry:
   %cmp = icmp slt i32 1, %irow
   br i1 %cmp, label %loop, label %return

diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
index 0576692..a7023f2 100644
--- a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
+++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll

@@ -229,10 +229,11 @@
 ; loop and the OR instruction is replaced by an ADD keeping the result
 ; equivalent.
 ;
+; CHECK: sext
 ; CHECK: loop:
 ; CHECK: phi i64
 ; CHECK-NOT: sext
-; CHECK: icmp slt i32
+; CHECK: icmp slt i64
 ; CHECK: exit:
 ; CHECK: add i64
 loop:

diff --git a/test/Transforms/IndVarSimplify/pr20680.ll b/test/Transforms/IndVarSimplify/pr20680.ll
new file mode 100644
index 0000000..88a7fd7
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/pr20680.ll

@@ -0,0 +1,219 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+@a = common global i32 0, align 4
+@c = common global i32 0, align 4
+@b = common global i32 0, align 4
+
+define void @f() {
+; CHECK-LABEL: @f(
+; CHECK-LABEL: entry:
+; CHECK: br label %[[for_cond2_preheader:.*]]
+
+; CHECK: [[for_cond2_preheader]]:
+; CHECK-NEXT: %[[indvars_iv:.*]] = phi i32 [ %[[indvars_iv_next:.*]], %[[for_inc13:.*]] ], [ -14, %entry ]
+; CHECK: br i1 {{.*}}, label %[[for_inc13]], label %
+entry:
+  %0 = load i32* @a, align 4
+  %tobool2 = icmp eq i32 %0, 0
+  %1 = load i32* @a, align 4
+  %tobool = icmp eq i32 %1, 0
+  br label %for.cond2.preheader
+
+for.cond2.preheader:                              ; preds = %for.inc13, %entry
+  %storemerge15 = phi i8 [ -14, %entry ], [ %inc14, %for.inc13 ]
+  br i1 %tobool2, label %for.inc13, label %for.body3.lr.ph
+
+for.body3.lr.ph:                                  ; preds = %for.cond2.preheader
+  %tobool5 = icmp eq i8 %storemerge15, 0
+  %conv7 = sext i8 %storemerge15 to i32
+  %2 = add nsw i32 %conv7, 1
+  %3 = icmp ult i32 %2, 3
+  %div = select i1 %3, i32 %conv7, i32 0
+  br i1 %tobool5, label %for.body3.lr.ph.split.us, label %for.body3.lr.ph.for.body3.lr.ph.split_crit_edge
+
+for.body3.lr.ph.for.body3.lr.ph.split_crit_edge:  ; preds = %for.body3.lr.ph
+  br label %for.body3.lr.ph.split
+
+for.body3.lr.ph.split.us:                         ; preds = %for.body3.lr.ph
+  br i1 %tobool, label %for.body3.lr.ph.split.us.split.us, label %for.body3.lr.ph.split.us.for.body3.lr.ph.split.us.split_crit_edge
+
+for.body3.lr.ph.split.us.for.body3.lr.ph.split.us.split_crit_edge: ; preds = %for.body3.lr.ph.split.us
+  br label %for.body3.lr.ph.split.us.split
+
+for.body3.lr.ph.split.us.split.us:                ; preds = %for.body3.lr.ph.split.us
+  br label %for.body3.us.us
+
+for.body3.us.us:                                  ; preds = %for.cond2.loopexit.us.us, %for.body3.lr.ph.split.us.split.us
+  br i1 true, label %cond.false.us.us, label %cond.end.us.us
+
+cond.false.us.us:                                 ; preds = %for.body3.us.us
+  br label %cond.end.us.us
+
+cond.end.us.us:                                   ; preds = %cond.false.us.us, %for.body3.us.us
+  %cond.us.us = phi i32 [ %div, %cond.false.us.us ], [ %conv7, %for.body3.us.us ]
+  %4 = load i32* @b, align 4
+  %cmp91.us.us = icmp slt i32 %4, 1
+  br i1 %cmp91.us.us, label %for.inc.lr.ph.us.us, label %for.cond2.loopexit.us.us
+
+for.cond2.loopexit.us.us:                         ; preds = %for.cond8.for.cond2.loopexit_crit_edge.us.us, %cond.end.us.us
+  br i1 true, label %for.cond2.for.inc13_crit_edge.us-lcssa.us.us-lcssa.us, label %for.body3.us.us
+
+for.inc.lr.ph.us.us:                              ; preds = %cond.end.us.us
+  br label %for.inc.us.us
+
+for.cond8.for.cond2.loopexit_crit_edge.us.us:     ; preds = %for.inc.us.us
+  %inc.lcssa.us.us = phi i32 [ %inc.us.us, %for.inc.us.us ]
+  store i32 %inc.lcssa.us.us, i32* @b, align 4
+  br label %for.cond2.loopexit.us.us
+
+for.inc.us.us:                                    ; preds = %for.inc.us.us, %for.inc.lr.ph.us.us
+  %5 = phi i32 [ %4, %for.inc.lr.ph.us.us ], [ %inc.us.us, %for.inc.us.us ]
+  %inc.us.us = add nsw i32 %5, 1
+  %cmp9.us.us = icmp slt i32 %inc.us.us, 1
+  br i1 %cmp9.us.us, label %for.inc.us.us, label %for.cond8.for.cond2.loopexit_crit_edge.us.us
+
+for.cond2.for.inc13_crit_edge.us-lcssa.us.us-lcssa.us: ; preds = %for.cond2.loopexit.us.us
+  %cond.lcssa.ph.us.ph.us = phi i32 [ %cond.us.us, %for.cond2.loopexit.us.us ]
+  br label %for.cond2.for.inc13_crit_edge.us-lcssa.us
+
+for.body3.lr.ph.split.us.split:                   ; preds = %for.body3.lr.ph.split.us.for.body3.lr.ph.split.us.split_crit_edge
+  br label %for.body3.us
+
+for.body3.us:                                     ; preds = %for.cond2.loopexit.us, %for.body3.lr.ph.split.us.split
+  br i1 true, label %cond.false.us, label %cond.end.us
+
+cond.false.us:                                    ; preds = %for.body3.us
+  br label %cond.end.us
+
+cond.end.us:                                      ; preds = %cond.false.us, %for.body3.us
+  %cond.us = phi i32 [ %div, %cond.false.us ], [ %conv7, %for.body3.us ]
+  %6 = load i32* @b, align 4
+  %cmp91.us = icmp slt i32 %6, 1
+  br i1 %cmp91.us, label %for.inc.lr.ph.us, label %for.cond2.loopexit.us
+
+for.inc.us:                                       ; preds = %for.inc.lr.ph.us, %for.inc.us
+  %7 = phi i32 [ %6, %for.inc.lr.ph.us ], [ %inc.us, %for.inc.us ]
+  %inc.us = add nsw i32 %7, 1
+  %cmp9.us = icmp slt i32 %inc.us, 1
+  br i1 %cmp9.us, label %for.inc.us, label %for.cond8.for.cond2.loopexit_crit_edge.us
+
+for.cond2.loopexit.us:                            ; preds = %for.cond8.for.cond2.loopexit_crit_edge.us, %cond.end.us
+  br i1 false, label %for.cond2.for.inc13_crit_edge.us-lcssa.us.us-lcssa, label %for.body3.us
+
+for.inc.lr.ph.us:                                 ; preds = %cond.end.us
+  br label %for.inc.us
+
+for.cond8.for.cond2.loopexit_crit_edge.us:        ; preds = %for.inc.us
+  %inc.lcssa.us = phi i32 [ %inc.us, %for.inc.us ]
+  store i32 %inc.lcssa.us, i32* @b, align 4
+  br label %for.cond2.loopexit.us
+
+for.cond2.for.inc13_crit_edge.us-lcssa.us.us-lcssa: ; preds = %for.cond2.loopexit.us
+  %cond.lcssa.ph.us.ph = phi i32 [ %cond.us, %for.cond2.loopexit.us ]
+  br label %for.cond2.for.inc13_crit_edge.us-lcssa.us
+
+for.cond2.for.inc13_crit_edge.us-lcssa.us:        ; preds = %for.cond2.for.inc13_crit_edge.us-lcssa.us.us-lcssa, %for.cond2.for.inc13_crit_edge.us-lcssa.us.us-lcssa.us
+  %cond.lcssa.ph.us = phi i32 [ %cond.lcssa.ph.us.ph, %for.cond2.for.inc13_crit_edge.us-lcssa.us.us-lcssa ], [ %cond.lcssa.ph.us.ph.us, %for.cond2.for.inc13_crit_edge.us-lcssa.us.us-lcssa.us ]
+  br label %for.cond2.for.inc13_crit_edge
+
+for.body3.lr.ph.split:                            ; preds = %for.body3.lr.ph.for.body3.lr.ph.split_crit_edge
+  br i1 %tobool, label %for.body3.lr.ph.split.split.us, label %for.body3.lr.ph.split.for.body3.lr.ph.split.split_crit_edge
+
+for.body3.lr.ph.split.for.body3.lr.ph.split.split_crit_edge: ; preds = %for.body3.lr.ph.split
+  br label %for.body3.lr.ph.split.split
+
+for.body3.lr.ph.split.split.us:                   ; preds = %for.body3.lr.ph.split
+  br label %for.body3.us3
+
+for.body3.us3:                                    ; preds = %for.cond2.loopexit.us11, %for.body3.lr.ph.split.split.us
+  br i1 false, label %cond.false.us4, label %cond.end.us5
+
+cond.false.us4:                                   ; preds = %for.body3.us3
+  br label %cond.end.us5
+
+cond.end.us5:                                     ; preds = %cond.false.us4, %for.body3.us3
+  %cond.us6 = phi i32 [ %div, %cond.false.us4 ], [ %conv7, %for.body3.us3 ]
+  %8 = load i32* @b, align 4
+  %cmp91.us7 = icmp slt i32 %8, 1
+  br i1 %cmp91.us7, label %for.inc.lr.ph.us12, label %for.cond2.loopexit.us11
+
+for.inc.us8:                                      ; preds = %for.inc.lr.ph.us12, %for.inc.us8
+  %9 = phi i32 [ %8, %for.inc.lr.ph.us12 ], [ %inc.us9, %for.inc.us8 ]
+  %inc.us9 = add nsw i32 %9, 1
+  %cmp9.us10 = icmp slt i32 %inc.us9, 1
+  br i1 %cmp9.us10, label %for.inc.us8, label %for.cond8.for.cond2.loopexit_crit_edge.us13
+
+for.cond2.loopexit.us11:                          ; preds = %for.cond8.for.cond2.loopexit_crit_edge.us13, %cond.end.us5
+  br i1 true, label %for.cond2.for.inc13_crit_edge.us-lcssa.us-lcssa.us, label %for.body3.us3
+
+for.inc.lr.ph.us12:                               ; preds = %cond.end.us5
+  br label %for.inc.us8
+
+for.cond8.for.cond2.loopexit_crit_edge.us13:      ; preds = %for.inc.us8
+  %inc.lcssa.us14 = phi i32 [ %inc.us9, %for.inc.us8 ]
+  store i32 %inc.lcssa.us14, i32* @b, align 4
+  br label %for.cond2.loopexit.us11
+
+for.cond2.for.inc13_crit_edge.us-lcssa.us-lcssa.us: ; preds = %for.cond2.loopexit.us11
+  %cond.lcssa.ph.ph.us = phi i32 [ %cond.us6, %for.cond2.loopexit.us11 ]
+  br label %for.cond2.for.inc13_crit_edge.us-lcssa
+
+for.body3.lr.ph.split.split:                      ; preds = %for.body3.lr.ph.split.for.body3.lr.ph.split.split_crit_edge
+  br label %for.body3
+
+for.cond8.for.cond2.loopexit_crit_edge:           ; preds = %for.inc
+  %inc.lcssa = phi i32 [ %inc, %for.inc ]
+  store i32 %inc.lcssa, i32* @b, align 4
+  br label %for.cond2.loopexit
+
+for.cond2.loopexit:                               ; preds = %cond.end, %for.cond8.for.cond2.loopexit_crit_edge
+  br i1 false, label %for.cond2.for.inc13_crit_edge.us-lcssa.us-lcssa, label %for.body3
+
+for.body3:                                        ; preds = %for.cond2.loopexit, %for.body3.lr.ph.split.split
+  br i1 false, label %cond.false, label %cond.end
+
+cond.false:                                       ; preds = %for.body3
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %for.body3
+  %cond = phi i32 [ %div, %cond.false ], [ %conv7, %for.body3 ]
+  %10 = load i32* @b, align 4
+  %cmp91 = icmp slt i32 %10, 1
+  br i1 %cmp91, label %for.inc.lr.ph, label %for.cond2.loopexit
+
+for.inc.lr.ph:                                    ; preds = %cond.end
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc, %for.inc.lr.ph
+  %11 = phi i32 [ %10, %for.inc.lr.ph ], [ %inc, %for.inc ]
+  %inc = add nsw i32 %11, 1
+  %cmp9 = icmp slt i32 %inc, 1
+  br i1 %cmp9, label %for.inc, label %for.cond8.for.cond2.loopexit_crit_edge
+
+for.cond2.for.inc13_crit_edge.us-lcssa.us-lcssa:  ; preds = %for.cond2.loopexit
+  %cond.lcssa.ph.ph = phi i32 [ %cond, %for.cond2.loopexit ]
+  br label %for.cond2.for.inc13_crit_edge.us-lcssa
+
+for.cond2.for.inc13_crit_edge.us-lcssa:           ; preds = %for.cond2.for.inc13_crit_edge.us-lcssa.us-lcssa, %for.cond2.for.inc13_crit_edge.us-lcssa.us-lcssa.us
+  %cond.lcssa.ph = phi i32 [ %cond.lcssa.ph.ph, %for.cond2.for.inc13_crit_edge.us-lcssa.us-lcssa ], [ %cond.lcssa.ph.ph.us, %for.cond2.for.inc13_crit_edge.us-lcssa.us-lcssa.us ]
+  br label %for.cond2.for.inc13_crit_edge
+
+for.cond2.for.inc13_crit_edge:                    ; preds = %for.cond2.for.inc13_crit_edge.us-lcssa, %for.cond2.for.inc13_crit_edge.us-lcssa.us
+  %cond.lcssa = phi i32 [ %cond.lcssa.ph, %for.cond2.for.inc13_crit_edge.us-lcssa ], [ %cond.lcssa.ph.us, %for.cond2.for.inc13_crit_edge.us-lcssa.us ]
+  store i32 %cond.lcssa, i32* @c, align 4
+  br label %for.inc13
+
+; CHECK: [[for_inc13]]:
+; CHECK-NEXT: %[[indvars_iv_next]] = add nuw nsw i32 %[[indvars_iv]], 1
+; CHECK-NEXT: %[[exitcond4:.*]] = icmp ne i32 %[[indvars_iv]], -1
+; CHECK-NEXT: br i1 %[[exitcond4]], label %[[for_cond2_preheader]], label %[[for_end15:.*]]
+for.inc13:                                        ; preds = %for.cond2.for.inc13_crit_edge, %for.cond2.preheader
+  %inc14 = add i8 %storemerge15, 1
+  %cmp = icmp ugt i8 %inc14, 50
+  br i1 %cmp, label %for.cond2.preheader, label %for.end15
+
+; CHECK: [[for_end15]]:
+; CHECK-NEXT: ret void
+for.end15:                                        ; preds = %for.inc13
+  ret void
+}

diff --git a/test/Transforms/IndVarSimplify/sharpen-range.ll b/test/Transforms/IndVarSimplify/sharpen-range.ll
new file mode 100644
index 0000000..6a9d352
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/sharpen-range.ll

@@ -0,0 +1,113 @@
+;; RUN: opt -S < %s -indvars | FileCheck %s
+
+;; Check that LLVM can narrow !range metadata based on loop entry
+;; predicates.
+
+declare void @abort()
+
+define i1 @bounded_below_slt(i32* nocapture readonly %buffer) {
+; CHECK-LABEL: bounded_below_slt
+entry:
+  %length = load i32* %buffer, !range !0
+  %entry.pred = icmp eq i32 %length, 0
+  br i1 %entry.pred, label %abort, label %loop.preheader
+
+loop.preheader:
+  br label %loop
+
+loop:
+; CHECK: loop
+  %idx = phi i32 [ %idx.inc, %loop.next ], [ 0, %loop.preheader ]
+  %oob.pred = icmp slt i32 %idx, %length
+  br i1 %oob.pred, label %loop.next, label %oob
+; CHECK: br i1 true, label %loop.next, label %oob
+
+loop.next:
+; CHECK: loop.next
+  %idx.inc = add i32 %idx, 1
+  %exit.pred = icmp slt i32 %idx.inc, %length
+  br i1 %exit.pred, label %loop, label %abort.loopexit
+
+abort.loopexit:
+  br label %abort
+
+abort:
+  ret i1 false
+
+oob:
+  tail call void @abort()
+  ret i1 false
+}
+
+define i1 @bounded_below_sle(i32* nocapture readonly %buffer) {
+; CHECK-LABEL: bounded_below_sle
+entry:
+  %length = load i32* %buffer, !range !0
+  %entry.pred = icmp eq i32 %length, 0
+  br i1 %entry.pred, label %abort, label %loop.preheader
+
+loop.preheader:
+  br label %loop
+
+loop:
+; CHECK: loop
+  %idx = phi i32 [ %idx.inc, %loop.next ], [ 0, %loop.preheader ]
+  %oob.pred = icmp sle i32 %idx, %length
+  br i1 %oob.pred, label %loop.next, label %oob
+; CHECK: br i1 true, label %loop.next, label %oob
+
+loop.next:
+; CHECK: loop.next
+  %idx.inc = add i32 %idx, 1
+  %exit.pred = icmp sle i32 %idx.inc, %length
+  br i1 %exit.pred, label %loop, label %abort.loopexit
+
+abort.loopexit:
+  br label %abort
+
+abort:
+  ret i1 false
+
+oob:
+  tail call void @abort()
+  ret i1 false
+}
+
+;; Assert that we're not making an incorrect transform.
+
+declare i32 @check(i8*)
+
+define void @NoChange() {
+; CHECK-LABEL: NoChange
+entry:
+  br label %loop.begin
+
+loop.begin:
+; CHECK: loop.begin:
+  %i.01 = phi i64 [ 2, %entry ], [ %add, %loop.end ]
+  %cmp = icmp ugt i64 %i.01, 1
+; CHECK: %cmp = icmp ugt i64 %i.01, 1
+  br i1 %cmp, label %loop, label %loop.end
+
+loop:
+; CHECK: loop
+  %.sum = add i64 %i.01, -2
+  %v = getelementptr inbounds i8* null, i64 %.sum
+  %r = tail call i32 @check(i8* %v)
+  %c = icmp eq i32 %r, 0
+  br i1 %c, label %loop.end, label %abort.now
+
+abort.now:
+  tail call void @abort()
+  unreachable
+
+loop.end:
+  %add = add i64 %i.01, -1
+  %eq = icmp eq i64 %add, 0
+  br i1 %eq, label %exit, label %loop.begin
+
+exit:
+  ret void
+}
+
+!0 = metadata !{i32 0, i32 100}

diff --git a/test/Transforms/IndVarSimplify/use-range-metadata.ll b/test/Transforms/IndVarSimplify/use-range-metadata.ll
new file mode 100644
index 0000000..7ac4f11
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/use-range-metadata.ll

@@ -0,0 +1,37 @@
+;; RUN: opt -S < %s -indvars | FileCheck %s
+
+;; Check if IndVarSimplify understands !range metadata.
+
+declare void @abort()
+
+define i1 @iterate(i32* nocapture readonly %buffer) {
+entry:
+  %length = load i32* %buffer, !range !0
+  br label %loop.preheader
+
+loop.preheader:
+  br label %loop
+
+loop:
+  %idx = phi i32 [ %idx.inc, %loop.next ], [ 0, %loop.preheader ]
+  %oob.pred = icmp slt i32 %idx, %length
+  br i1 %oob.pred, label %loop.next, label %oob
+; CHECK: br i1 true, label %loop.next, label %oob
+
+loop.next:
+  %idx.inc = add i32 %idx, 1
+  %exit.pred = icmp slt i32 %idx.inc, %length
+  br i1 %exit.pred, label %loop, label %abort.loopexit
+
+abort.loopexit:
+  br label %abort
+
+abort:
+  ret i1 false
+
+oob:
+  tail call void @abort()
+  ret i1 false
+}
+
+!0 = metadata !{i32 1, i32 100}

diff --git a/test/Transforms/IndVarSimplify/verify-scev.ll b/test/Transforms/IndVarSimplify/verify-scev.ll
index 019f583..b9ce3d6 100644
--- a/test/Transforms/IndVarSimplify/verify-scev.ll
+++ b/test/Transforms/IndVarSimplify/verify-scev.ll

@@ -380,11 +380,11 @@
 
 for.body65.lr.ph:                                 ; preds = %for.body48
   %0 = load i32* undef, align 4
+  %1 = sext i32 %0 to i64
   br label %for.body65.us
 
 for.body65.us:                                    ; preds = %for.inc219.us, %for.body65.lr.ph
-  %k.09.us = phi i32 [ %inc.us, %for.inc219.us ], [ 1, %for.body65.lr.ph ]
-  %idxprom66.us = sext i32 %k.09.us to i64
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc219.us ], [ 1, %for.body65.lr.ph ]
   br i1 undef, label %for.inc219.us, label %if.end72.us
 
 if.end72.us:                                      ; preds = %for.body65.us
@@ -406,8 +406,8 @@
   br i1 undef, label %for.cond139.loopexit.us, label %for.cond152.us
 
 for.inc219.us:                                    ; preds = %for.cond139.loopexit.us, %if.end110.us, %if.end93.us, %for.body65.us
-  %inc.us = add nsw i32 %k.09.us, 1
-  %cmp64.us = icmp sgt i32 %inc.us, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp64.us = icmp sgt i64 %indvars.iv.next, %1
   br i1 %cmp64.us, label %for.inc221, label %for.body65.us
 
 for.cond139.loopexit.us:                          ; preds = %for.cond152.us

diff --git a/test/Transforms/IndVarSimplify/widen-loop-comp.ll b/test/Transforms/IndVarSimplify/widen-loop-comp.ll
new file mode 100644
index 0000000..0930a0c
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/widen-loop-comp.ll

@@ -0,0 +1,191 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+target triple = "aarch64--linux-gnu"
+
+; Check that the loop exit i32 compare instruction and its operand are widened
+; to i64 instead of truncating the IV before its use in the i32 compare.
+
+@idx = common global i32 0, align 4
+@e = common global i32 0, align 4
+@ptr = common global i32* null, align 8
+
+; CHECK-LABEL: @test1
+; CHECK: for.body.lr.ph:
+; CHECK: sext i32
+; CHECK: for.cond:
+; CHECK: icmp slt i64
+; CHECK: for.body:
+; CHECK: phi i64
+
+define i32 @test1() {
+entry:
+  store i32 -1, i32* @idx, align 4
+  %0 = load i32* @e, align 4
+  %cmp4 = icmp slt i32 %0, 0
+  br i1 %cmp4, label %for.end.loopexit, label %for.body.lr.ph
+
+for.body.lr.ph:
+  %1 = load i32** @ptr, align 8
+  %2 = load i32* @e, align 4
+  br label %for.body
+
+for.cond:
+  %inc = add nsw i32 %i.05, 1
+  %cmp = icmp slt i32 %i.05, %2
+  br i1 %cmp, label %for.body, label %for.cond.for.end.loopexit_crit_edge
+
+for.body:
+  %i.05 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.cond ]
+  %idxprom = sext i32 %i.05 to i64
+  %arrayidx = getelementptr inbounds i32* %1, i64 %idxprom
+  %3 = load i32* %arrayidx, align 4
+  %tobool = icmp eq i32 %3, 0
+  br i1 %tobool, label %if.then, label %for.cond
+
+if.then:
+  %i.05.lcssa = phi i32 [ %i.05, %for.body ]
+  store i32 %i.05.lcssa, i32* @idx, align 4
+  br label %for.end
+
+for.cond.for.end.loopexit_crit_edge:
+  br label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  %4 = load i32* @idx, align 4
+  ret i32 %4
+}
+
+; CHECK-LABEL: @test2
+; CHECK: for.body4.us
+; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: %cmp2.us = icmp slt i64
+; CHECK-NOT: %2 = trunc i64 %indvars.iv.next to i32
+; CHECK-NOT: %cmp2.us = icmp slt i32
+
+define void @test2([8 x i8]* %a, i8* %b, i8 %limit) {
+entry:
+  %conv = zext i8 %limit to i32
+  br i1 undef, label %for.cond1.preheader, label %for.cond1.preheader.us
+
+for.cond1.preheader.us:
+  %storemerge5.us = phi i32 [ 0, %entry ], [ %inc14.us, %for.inc13.us ]
+  br i1 true, label %for.body4.lr.ph.us, label %for.inc13.us
+
+for.inc13.us:
+  %inc14.us = add nsw i32 %storemerge5.us, 1
+  %cmp.us = icmp slt i32 %inc14.us, 4
+  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.end
+
+for.body4.us:
+  %storemerge14.us = phi i32 [ 0, %for.body4.lr.ph.us ], [ %inc.us, %for.body4.us ]
+  %idxprom.us = sext i32 %storemerge14.us to i64
+  %arrayidx6.us = getelementptr inbounds [8 x i8]* %a, i64 %idxprom5.us, i64 %idxprom.us
+  %0 = load i8* %arrayidx6.us, align 1
+  %idxprom7.us = zext i8 %0 to i64
+  %arrayidx8.us = getelementptr inbounds i8* %b, i64 %idxprom7.us
+  %1 = load i8* %arrayidx8.us, align 1
+  store i8 %1, i8* %arrayidx6.us, align 1
+  %inc.us = add nsw i32 %storemerge14.us, 1
+  %cmp2.us = icmp slt i32 %inc.us, %conv
+  br i1 %cmp2.us, label %for.body4.us, label %for.inc13.us
+
+for.body4.lr.ph.us:
+  %idxprom5.us = sext i32 %storemerge5.us to i64
+  br label %for.body4.us
+
+for.cond1.preheader:
+  %storemerge5 = phi i32 [ 0, %entry ], [ %inc14, %for.inc13 ]
+  br i1 false, label %for.inc13, label %for.inc13
+
+for.inc13:
+  %inc14 = add nsw i32 %storemerge5, 1
+  %cmp = icmp slt i32 %inc14, 4
+  br i1 %cmp, label %for.cond1.preheader, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @test3
+; CHECK: sext i32 %b
+; CHECK: for.cond:
+; CHECK: phi i64
+; CHECK: icmp slt i64
+
+define i32 @test3(i32* %a, i32 %b) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %sum.0 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, %b
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32* %a, i64 %idxprom
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %sum.0, %0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret i32 %sum.0
+}
+
+declare i32 @fn1(i8 signext)
+
+; PR21030
+; CHECK-LABEL: @test4
+; CHECK: for.body:
+; CHECK: phi i32
+; CHECK: icmp sgt i8
+
+define i32 @test4(i32 %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %c.07 = phi i8 [ -3, %entry ], [ %dec, %for.body ]
+  %conv6 = zext i8 %c.07 to i32
+  %or = or i32 %a, %conv6
+  %conv3 = trunc i32 %or to i8
+  %call = call i32 @fn1(i8 signext %conv3)
+  %dec = add i8 %c.07, -1
+  %cmp = icmp sgt i8 %dec, -14
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret i32 0
+}
+
+; CHECK-LABEL: @test5
+; CHECK: zext i32 %b
+; CHECK: for.cond:
+; CHECK: phi i64
+; CHECK: icmp ule i64
+
+define i32 @test5(i32* %a, i32 %b) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %sum.0 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp ule i32 %i.0, %b
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %idxprom = zext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32* %a, i64 %idxprom
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %sum.0, %0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret i32 %sum.0
+}

diff --git a/test/Transforms/Inline/align.ll b/test/Transforms/Inline/align.ll
new file mode 100644
index 0000000..9ac6d54
--- /dev/null
+++ b/test/Transforms/Inline/align.ll

@@ -0,0 +1,98 @@
+; RUN: opt -inline -preserve-alignment-assumptions-during-inlining -S < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @hello(float* align 128 nocapture %a, float* nocapture readonly %c) #0 {
+entry:
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 5
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+define void @foo(float* nocapture %a, float* nocapture readonly %c) #0 {
+entry:
+  tail call void @hello(float* %a, float* %c)
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+; CHECK: define void @foo(float* nocapture %a, float* nocapture readonly %c) #0 {
+; CHECK: entry:
+; CHECK:   %ptrint = ptrtoint float* %a to i64
+; CHECK:   %maskedptr = and i64 %ptrint, 127
+; CHECK:   %maskcond = icmp eq i64 %maskedptr, 0
+; CHECK:   call void @llvm.assume(i1 %maskcond)
+; CHECK:   %0 = load float* %c, align 4
+; CHECK:   %arrayidx.i = getelementptr inbounds float* %a, i64 5
+; CHECK:   store float %0, float* %arrayidx.i, align 4
+; CHECK:   %1 = load float* %c, align 4
+; CHECK:   %arrayidx = getelementptr inbounds float* %a, i64 7
+; CHECK:   store float %1, float* %arrayidx, align 4
+; CHECK:   ret void
+; CHECK: }
+
+define void @fooa(float* nocapture align 128 %a, float* nocapture readonly %c) #0 {
+entry:
+  tail call void @hello(float* %a, float* %c)
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+; CHECK: define void @fooa(float* nocapture align 128 %a, float* nocapture readonly %c) #0 {
+; CHECK: entry:
+; CHECK:   %0 = load float* %c, align 4
+; CHECK:   %arrayidx.i = getelementptr inbounds float* %a, i64 5
+; CHECK:   store float %0, float* %arrayidx.i, align 4
+; CHECK:   %1 = load float* %c, align 4
+; CHECK:   %arrayidx = getelementptr inbounds float* %a, i64 7
+; CHECK:   store float %1, float* %arrayidx, align 4
+; CHECK:   ret void
+; CHECK: }
+
+define void @hello2(float* align 128 nocapture %a, float* align 128 nocapture %b, float* nocapture readonly %c) #0 {
+entry:
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 5
+  store float %0, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float* %b, i64 8
+  store float %0, float* %arrayidx1, align 4
+  ret void
+}
+
+define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+entry:
+  tail call void @hello2(float* %a, float* %b, float* %c)
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+; CHECK: define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+; CHECK: entry:
+; CHECK:   %ptrint = ptrtoint float* %a to i64
+; CHECK:   %maskedptr = and i64 %ptrint, 127
+; CHECK:   %maskcond = icmp eq i64 %maskedptr, 0
+; CHECK:   call void @llvm.assume(i1 %maskcond)
+; CHECK:   %ptrint1 = ptrtoint float* %b to i64
+; CHECK:   %maskedptr2 = and i64 %ptrint1, 127
+; CHECK:   %maskcond3 = icmp eq i64 %maskedptr2, 0
+; CHECK:   call void @llvm.assume(i1 %maskcond3)
+; CHECK:   %0 = load float* %c, align 4
+; CHECK:   %arrayidx.i = getelementptr inbounds float* %a, i64 5
+; CHECK:   store float %0, float* %arrayidx.i, align 4
+; CHECK:   %arrayidx1.i = getelementptr inbounds float* %b, i64 8
+; CHECK:   store float %0, float* %arrayidx1.i, align 4
+; CHECK:   %1 = load float* %c, align 4
+; CHECK:   %arrayidx = getelementptr inbounds float* %a, i64 7
+; CHECK:   store float %1, float* %arrayidx, align 4
+; CHECK:   ret void
+; CHECK: }
+
+attributes #0 = { nounwind uwtable }
+

diff --git a/test/Transforms/Inline/byval-tail-call.ll b/test/Transforms/Inline/byval-tail-call.ll
index 3a8906a..154f397 100644
--- a/test/Transforms/Inline/byval-tail-call.ll
+++ b/test/Transforms/Inline/byval-tail-call.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -tailcallelim -inline -instcombine -dse -S | FileCheck %s
+; RUN: opt < %s -basicaa -tailcallelim -inline -instcombine -dse -S | FileCheck %s
 ; PR7272
 
 ; Calls that capture byval parameters cannot be marked as tail calls. Other
@@ -27,10 +27,13 @@
   tail call void @ext(i32* null)
   ret void
 }
+
 define void @frob(i32* %x) {
 ; CHECK-LABEL: define void @frob(
-; CHECK: alloca i32
-; CHECK: {{^ *}}call void @ext(
+; CHECK: %[[POS:.*]] = alloca i32
+; CHECK: %[[VAL:.*]] = load i32* %x
+; CHECK: store i32 %[[VAL]], i32* %[[POS]]
+; CHECK: {{^ *}}call void @ext(i32* %[[POS]]
 ; CHECK: tail call void @ext(i32* null)
 ; CHECK: ret void
   tail call void @qux(i32* byval %x)

diff --git a/test/Transforms/Inline/debug-invoke.ll b/test/Transforms/Inline/debug-invoke.ll
index 41d6074..0de2d22 100644
--- a/test/Transforms/Inline/debug-invoke.ll
+++ b/test/Transforms/Inline/debug-invoke.ll

@@ -31,7 +31,7 @@
 }
 
 !llvm.module.flags = !{!1}
-!1 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!1 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !2 = metadata !{}
 !3 = metadata !{i32 1, i32 0, metadata !2, null}
 !4 = metadata !{i32 2, i32 0, metadata !2, null}

diff --git a/test/Transforms/Inline/ephemeral.ll b/test/Transforms/Inline/ephemeral.ll
new file mode 100644
index 0000000..d1135c6
--- /dev/null
+++ b/test/Transforms/Inline/ephemeral.ll

@@ -0,0 +1,32 @@
+; RUN: opt -S -Oz %s | FileCheck %s
+
+@a = global i32 4
+
+define i1 @inner() {
+  %a1 = load volatile i32* @a
+  %x1 = add i32 %a1, %a1
+  %c = icmp eq i32 %x1, 0
+
+  ; Here are enough instructions to prevent inlining, but because they are used
+  ; only by the @llvm.assume intrinsic, they're free (and, thus, inlining will
+  ; still happen).
+  %a2 = mul i32 %a1, %a1
+  %a3 = sub i32 %a1, 5
+  %a4 = udiv i32 %a3, -13
+  %a5 = mul i32 %a4, %a4
+  %a6 = add i32 %a5, %x1
+  %ca = icmp sgt i32 %a6, -7
+  tail call void @llvm.assume(i1 %ca)
+
+  ret i1 %c
+}
+
+; @inner() should be inlined for -Oz.
+; CHECK-NOT: call i1 @inner
+define i1 @outer() optsize {
+   %r = call i1 @inner()
+   ret i1 %r
+}
+
+declare void @llvm.assume(i1) nounwind
+

diff --git a/test/Transforms/Inline/ignore-debug-info.ll b/test/Transforms/Inline/ignore-debug-info.ll
index 543a89b..428b5d5 100644
--- a/test/Transforms/Inline/ignore-debug-info.ll
+++ b/test/Transforms/Inline/ignore-debug-info.ll

@@ -7,16 +7,16 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-declare void @llvm.dbg.declare(metadata, metadata) #1
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 define <4 x float> @inner_vectors(<4 x float> %a, <4 x float> %b) {
 entry:
-  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{})
+  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{}, metadata !{})
   %mul = fmul <4 x float> %a, <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{})
+  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{}, metadata !{})
   %mul1 = fmul <4 x float> %b, <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>
-  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{})
+  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{}, metadata !{})
   %add = fadd <4 x float> %mul, %mul1
   ret <4 x float> %add
 }
@@ -27,10 +27,10 @@
 ; CHECK: ret float
 
 entry:
-  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{})
-  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{})
+  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{}, metadata !{})
+  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{}, metadata !{})
   %call = call <4 x float> @inner_vectors(<4 x float> %a, <4 x float> %b)
-  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{})
+  call void @llvm.dbg.value(metadata !{}, i64 0, metadata !{}, metadata !{})
   %vecext = extractelement <4 x float> %call, i32 0
   %vecext1 = extractelement <4 x float> %call, i32 1
   %add = fadd float %vecext, %vecext1
@@ -47,9 +47,9 @@
 !llvm.module.flags = !{!3, !4}
 !llvm.ident = !{!5}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !{}, metadata !2, metadata !2, metadata !""}
+!0 = metadata !{metadata !"0x11\004\00\000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !{}, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"", metadata !""}
 !2 = metadata !{i32 0}
 !3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !5 = metadata !{metadata !""}

diff --git a/test/Transforms/Inline/inline-musttail-varargs.ll b/test/Transforms/Inline/inline-musttail-varargs.ll
new file mode 100644
index 0000000..7a89574
--- /dev/null
+++ b/test/Transforms/Inline/inline-musttail-varargs.ll

@@ -0,0 +1,22 @@
+; RUN: opt < %s -inline -instcombine -S | FileCheck %s
+
+; We can't inline this thunk yet, but one day we will be able to.  And when we
+; do, this test case will be ready.
+
+declare void @ext_method(i8*, i32)
+
+define linkonce_odr void @thunk(i8* %this, ...) {
+  %this_adj = getelementptr i8* %this, i32 4
+  musttail call void (i8*, ...)* bitcast (void (i8*, i32)* @ext_method to void (i8*, ...)*)(i8* %this_adj, ...)
+  ret void
+}
+
+define void @thunk_caller(i8* %p) {
+  call void (i8*, ...)* @thunk(i8* %p, i32 42)
+  ret void
+}
+; CHECK-LABEL: define void @thunk_caller(i8* %p)
+; CHECK: call void (i8*, ...)* @thunk(i8* %p, i32 42)
+
+; FIXME: Inline the thunk. This should be significantly easier than inlining
+; general varargs functions.

diff --git a/test/Transforms/Inline/noalias-calls.ll b/test/Transforms/Inline/noalias-calls.ll
new file mode 100644
index 0000000..13408e4
--- /dev/null
+++ b/test/Transforms/Inline/noalias-calls.ll

@@ -0,0 +1,44 @@
+; RUN: opt -basicaa -inline -enable-noalias-to-md-conversion -S < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+declare void @hey() #0
+
+define void @hello(i8* noalias nocapture %a, i8* noalias nocapture readonly %c, i8* nocapture %b) #1 {
+entry:
+  %l = alloca i8, i32 512, align 1
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i32 16, i1 0)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b, i8* %c, i64 16, i32 16, i1 0)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %c, i64 16, i32 16, i1 0)
+  call void @hey()
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %l, i8* %c, i64 16, i32 16, i1 0)
+  ret void
+}
+
+define void @foo(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %b) #1 {
+entry:
+  tail call void @hello(i8* %a, i8* %c, i8* %b)
+  ret void
+}
+
+; CHECK: define void @foo(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %b) #1 {
+; CHECK: entry:
+; CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i32 16, i1 false) #0, !noalias !0
+; CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b, i8* %c, i64 16, i32 16, i1 false) #0, !noalias !3
+; CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %c, i64 16, i32 16, i1 false) #0, !alias.scope !5
+; CHECK:   call void @hey() #0, !noalias !5
+; CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %c, i64 16, i32 16, i1 false) #0, !noalias !3
+; CHECK:   ret void
+; CHECK: }
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind uwtable }
+
+; CHECK: !0 = metadata !{metadata !1}
+; CHECK: !1 = metadata !{metadata !1, metadata !2, metadata !"hello: %c"}
+; CHECK: !2 = metadata !{metadata !2, metadata !"hello"}
+; CHECK: !3 = metadata !{metadata !4}
+; CHECK: !4 = metadata !{metadata !4, metadata !2, metadata !"hello: %a"}
+; CHECK: !5 = metadata !{metadata !4, metadata !1}
+

diff --git a/test/Transforms/Inline/noalias-cs.ll b/test/Transforms/Inline/noalias-cs.ll
new file mode 100644
index 0000000..acd9021
--- /dev/null
+++ b/test/Transforms/Inline/noalias-cs.ll

@@ -0,0 +1,84 @@
+; RUN: opt -inline -enable-noalias-to-md-conversion -S < %s | FileCheck %s
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+entry:
+  %0 = load float* %c, align 4, !noalias !3
+  %arrayidx.i = getelementptr inbounds float* %a, i64 5
+  store float %0, float* %arrayidx.i, align 4, !alias.scope !7, !noalias !8
+  %arrayidx1.i = getelementptr inbounds float* %b, i64 8
+  store float %0, float* %arrayidx1.i, align 4, !alias.scope !8, !noalias !7
+  %1 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %1, float* %arrayidx, align 4
+  ret void
+}
+
+define void @foo(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+entry:
+  call void @foo2(float* %a, float* %b, float* %c), !noalias !0
+  call void @foo2(float* %b, float* %b, float* %a), !alias.scope !0
+  ret void
+}
+
+; CHECK: define void @foo(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+; CHECK: entry:
+; CHECK:   %0 = load float* %c, align 4, !noalias !6
+; CHECK:   %arrayidx.i.i = getelementptr inbounds float* %a, i64 5
+; CHECK:   store float %0, float* %arrayidx.i.i, align 4, !alias.scope !12, !noalias !13
+; CHECK:   %arrayidx1.i.i = getelementptr inbounds float* %b, i64 8
+; CHECK:   store float %0, float* %arrayidx1.i.i, align 4, !alias.scope !14, !noalias !15
+; CHECK:   %1 = load float* %c, align 4, !noalias !16
+; CHECK:   %arrayidx.i = getelementptr inbounds float* %a, i64 7
+; CHECK:   store float %1, float* %arrayidx.i, align 4, !noalias !16
+; CHECK:   %2 = load float* %a, align 4, !alias.scope !16, !noalias !17
+; CHECK:   %arrayidx.i.i1 = getelementptr inbounds float* %b, i64 5
+; CHECK:   store float %2, float* %arrayidx.i.i1, align 4, !alias.scope !21, !noalias !22
+; CHECK:   %arrayidx1.i.i2 = getelementptr inbounds float* %b, i64 8
+; CHECK:   store float %2, float* %arrayidx1.i.i2, align 4, !alias.scope !23, !noalias !24
+; CHECK:   %3 = load float* %a, align 4, !alias.scope !16
+; CHECK:   %arrayidx.i3 = getelementptr inbounds float* %b, i64 7
+; CHECK:   store float %3, float* %arrayidx.i3, align 4, !alias.scope !16
+; CHECK:   ret void
+; CHECK: }
+
+attributes #0 = { nounwind uwtable }
+
+!0 = metadata !{metadata !1}
+!1 = metadata !{metadata !1, metadata !2, metadata !"hello: %a"}
+!2 = metadata !{metadata !2, metadata !"hello"}
+!3 = metadata !{metadata !4, metadata !6}
+!4 = metadata !{metadata !4, metadata !5, metadata !"hello2: %a"}
+!5 = metadata !{metadata !5, metadata !"hello2"}
+!6 = metadata !{metadata !6, metadata !5, metadata !"hello2: %b"}
+!7 = metadata !{metadata !4}
+!8 = metadata !{metadata !6}
+
+; CHECK: !0 = metadata !{metadata !1, metadata !3}
+; CHECK: !1 = metadata !{metadata !1, metadata !2, metadata !"hello2: %a"}
+; CHECK: !2 = metadata !{metadata !2, metadata !"hello2"}
+; CHECK: !3 = metadata !{metadata !3, metadata !2, metadata !"hello2: %b"}
+; CHECK: !4 = metadata !{metadata !1}
+; CHECK: !5 = metadata !{metadata !3}
+; CHECK: !6 = metadata !{metadata !7, metadata !9, metadata !10}
+; CHECK: !7 = metadata !{metadata !7, metadata !8, metadata !"hello2: %a"}
+; CHECK: !8 = metadata !{metadata !8, metadata !"hello2"}
+; CHECK: !9 = metadata !{metadata !9, metadata !8, metadata !"hello2: %b"}
+; CHECK: !10 = metadata !{metadata !10, metadata !11, metadata !"hello: %a"}
+; CHECK: !11 = metadata !{metadata !11, metadata !"hello"}
+; CHECK: !12 = metadata !{metadata !7}
+; CHECK: !13 = metadata !{metadata !9, metadata !10}
+; CHECK: !14 = metadata !{metadata !9}
+; CHECK: !15 = metadata !{metadata !7, metadata !10}
+; CHECK: !16 = metadata !{metadata !10}
+; CHECK: !17 = metadata !{metadata !18, metadata !20}
+; CHECK: !18 = metadata !{metadata !18, metadata !19, metadata !"hello2: %a"}
+; CHECK: !19 = metadata !{metadata !19, metadata !"hello2"}
+; CHECK: !20 = metadata !{metadata !20, metadata !19, metadata !"hello2: %b"}
+; CHECK: !21 = metadata !{metadata !18, metadata !10}
+; CHECK: !22 = metadata !{metadata !20}
+; CHECK: !23 = metadata !{metadata !20, metadata !10}
+; CHECK: !24 = metadata !{metadata !18}
+

diff --git a/test/Transforms/Inline/noalias.ll b/test/Transforms/Inline/noalias.ll
new file mode 100644
index 0000000..7a54d5d
--- /dev/null
+++ b/test/Transforms/Inline/noalias.ll

@@ -0,0 +1,76 @@
+; RUN: opt -inline -enable-noalias-to-md-conversion -S < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @hello(float* noalias nocapture %a, float* nocapture readonly %c) #0 {
+entry:
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 5
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+define void @foo(float* nocapture %a, float* nocapture readonly %c) #0 {
+entry:
+  tail call void @hello(float* %a, float* %c)
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+; CHECK: define void @foo(float* nocapture %a, float* nocapture readonly %c) #0 {
+; CHECK: entry:
+; CHECK:   %0 = load float* %c, align 4, !noalias !0
+; CHECK:   %arrayidx.i = getelementptr inbounds float* %a, i64 5
+; CHECK:   store float %0, float* %arrayidx.i, align 4, !alias.scope !0
+; CHECK:   %1 = load float* %c, align 4
+; CHECK:   %arrayidx = getelementptr inbounds float* %a, i64 7
+; CHECK:   store float %1, float* %arrayidx, align 4
+; CHECK:   ret void
+; CHECK: }
+
+define void @hello2(float* noalias nocapture %a, float* noalias nocapture %b, float* nocapture readonly %c) #0 {
+entry:
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 5
+  store float %0, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float* %b, i64 8
+  store float %0, float* %arrayidx1, align 4
+  ret void
+}
+
+define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+entry:
+  tail call void @hello2(float* %a, float* %b, float* %c)
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+; CHECK: define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+; CHECK: entry:
+; CHECK:   %0 = load float* %c, align 4, !noalias !3
+; CHECK:   %arrayidx.i = getelementptr inbounds float* %a, i64 5
+; CHECK:   store float %0, float* %arrayidx.i, align 4, !alias.scope !7, !noalias !8
+; CHECK:   %arrayidx1.i = getelementptr inbounds float* %b, i64 8
+; CHECK:   store float %0, float* %arrayidx1.i, align 4, !alias.scope !8, !noalias !7
+; CHECK:   %1 = load float* %c, align 4
+; CHECK:   %arrayidx = getelementptr inbounds float* %a, i64 7
+; CHECK:   store float %1, float* %arrayidx, align 4
+; CHECK:   ret void
+; CHECK: }
+
+attributes #0 = { nounwind uwtable }
+
+; CHECK: !0 = metadata !{metadata !1}
+; CHECK: !1 = metadata !{metadata !1, metadata !2, metadata !"hello: %a"}
+; CHECK: !2 = metadata !{metadata !2, metadata !"hello"}
+; CHECK: !3 = metadata !{metadata !4, metadata !6}
+; CHECK: !4 = metadata !{metadata !4, metadata !5, metadata !"hello2: %a"}
+; CHECK: !5 = metadata !{metadata !5, metadata !"hello2"}
+; CHECK: !6 = metadata !{metadata !6, metadata !5, metadata !"hello2: %b"}
+; CHECK: !7 = metadata !{metadata !4}
+; CHECK: !8 = metadata !{metadata !6}
+

diff --git a/test/Transforms/Inline/noalias2.ll b/test/Transforms/Inline/noalias2.ll
new file mode 100644
index 0000000..a4b38b0
--- /dev/null
+++ b/test/Transforms/Inline/noalias2.ll

@@ -0,0 +1,97 @@
+; RUN: opt -inline -enable-noalias-to-md-conversion -S < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @hello(float* noalias nocapture %a, float* noalias nocapture readonly %c) #0 {
+entry:
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 5
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+define void @foo(float* noalias nocapture %a, float* noalias nocapture readonly %c) #0 {
+entry:
+  tail call void @hello(float* %a, float* %c)
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+; CHECK: define void @foo(float* noalias nocapture %a, float* noalias nocapture readonly %c) #0 {
+; CHECK: entry:
+; CHECK:   %0 = load float* %c, align 4, !alias.scope !0, !noalias !3
+; CHECK:   %arrayidx.i = getelementptr inbounds float* %a, i64 5
+; CHECK:   store float %0, float* %arrayidx.i, align 4, !alias.scope !3, !noalias !0
+; CHECK:   %1 = load float* %c, align 4
+; CHECK:   %arrayidx = getelementptr inbounds float* %a, i64 7
+; CHECK:   store float %1, float* %arrayidx, align 4
+; CHECK:   ret void
+; CHECK: }
+
+define void @hello2(float* noalias nocapture %a, float* noalias nocapture %b, float* nocapture readonly %c) #0 {
+entry:
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 6
+  store float %0, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float* %b, i64 8
+  store float %0, float* %arrayidx1, align 4
+  ret void
+}
+
+; Check that when hello() is inlined into foo(), and then foo() is inlined into
+; foo2(), the noalias scopes are properly concatenated.
+define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+entry:
+  tail call void @foo(float* %a, float* %c)
+  tail call void @hello2(float* %a, float* %b, float* %c)
+  %0 = load float* %c, align 4
+  %arrayidx = getelementptr inbounds float* %a, i64 7
+  store float %0, float* %arrayidx, align 4
+  ret void
+}
+
+; CHECK: define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture readonly %c) #0 {
+; CHECK: entry:
+; CHECK:   %0 = load float* %c, align 4, !alias.scope !5, !noalias !10
+; CHECK:   %arrayidx.i.i = getelementptr inbounds float* %a, i64 5
+; CHECK:   store float %0, float* %arrayidx.i.i, align 4, !alias.scope !10, !noalias !5
+; CHECK:   %1 = load float* %c, align 4, !alias.scope !13, !noalias !14
+; CHECK:   %arrayidx.i = getelementptr inbounds float* %a, i64 7
+; CHECK:   store float %1, float* %arrayidx.i, align 4, !alias.scope !14, !noalias !13
+; CHECK:   %2 = load float* %c, align 4, !noalias !15
+; CHECK:   %arrayidx.i1 = getelementptr inbounds float* %a, i64 6
+; CHECK:   store float %2, float* %arrayidx.i1, align 4, !alias.scope !19, !noalias !20
+; CHECK:   %arrayidx1.i = getelementptr inbounds float* %b, i64 8
+; CHECK:   store float %2, float* %arrayidx1.i, align 4, !alias.scope !20, !noalias !19
+; CHECK:   %3 = load float* %c, align 4
+; CHECK:   %arrayidx = getelementptr inbounds float* %a, i64 7
+; CHECK:   store float %3, float* %arrayidx, align 4
+; CHECK:   ret void
+; CHECK: }
+
+; CHECK: !0 = metadata !{metadata !1}
+; CHECK: !1 = metadata !{metadata !1, metadata !2, metadata !"hello: %c"}
+; CHECK: !2 = metadata !{metadata !2, metadata !"hello"}
+; CHECK: !3 = metadata !{metadata !4}
+; CHECK: !4 = metadata !{metadata !4, metadata !2, metadata !"hello: %a"}
+; CHECK: !5 = metadata !{metadata !6, metadata !8}
+; CHECK: !6 = metadata !{metadata !6, metadata !7, metadata !"hello: %c"}
+; CHECK: !7 = metadata !{metadata !7, metadata !"hello"}
+; CHECK: !8 = metadata !{metadata !8, metadata !9, metadata !"foo: %c"}
+; CHECK: !9 = metadata !{metadata !9, metadata !"foo"}
+; CHECK: !10 = metadata !{metadata !11, metadata !12}
+; CHECK: !11 = metadata !{metadata !11, metadata !7, metadata !"hello: %a"}
+; CHECK: !12 = metadata !{metadata !12, metadata !9, metadata !"foo: %a"}
+; CHECK: !13 = metadata !{metadata !8}
+; CHECK: !14 = metadata !{metadata !12}
+; CHECK: !15 = metadata !{metadata !16, metadata !18}
+; CHECK: !16 = metadata !{metadata !16, metadata !17, metadata !"hello2: %a"}
+; CHECK: !17 = metadata !{metadata !17, metadata !"hello2"}
+; CHECK: !18 = metadata !{metadata !18, metadata !17, metadata !"hello2: %b"}
+; CHECK: !19 = metadata !{metadata !16}
+; CHECK: !20 = metadata !{metadata !18}
+
+attributes #0 = { nounwind uwtable }
+

diff --git a/test/Transforms/Inline/pr21206.ll b/test/Transforms/Inline/pr21206.ll
new file mode 100644
index 0000000..1a4366e
--- /dev/null
+++ b/test/Transforms/Inline/pr21206.ll

@@ -0,0 +1,18 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+
+$c = comdat any
+; CHECK: $c = comdat any
+
+define linkonce_odr void @foo() comdat $c {
+  ret void
+}
+; CHECK: define linkonce_odr void @foo() comdat $c
+
+define linkonce_odr void @bar() comdat $c {
+  ret void
+}
+; CHECK: define linkonce_odr void @bar() comdat $c
+
+define void()* @zed()  {
+  ret void()* @foo
+}

diff --git a/test/Transforms/InstCombine/2007-09-10-AliasConstFold.ll b/test/Transforms/InstCombine/2007-09-10-AliasConstFold.ll
index 7f9bd9e..6259893 100644
--- a/test/Transforms/InstCombine/2007-09-10-AliasConstFold.ll
+++ b/test/Transforms/InstCombine/2007-09-10-AliasConstFold.ll

@@ -1,7 +1,7 @@
 ; RUN: opt < %s -instcombine -S | grep icmp
 ; PR1646
 
-@__gthrw_pthread_cancel = alias weak i32 (i32)* @pthread_cancel		; <i32 (i32)*> [#uses=1]
+@__gthrw_pthread_cancel = weak alias i32 (i32)* @pthread_cancel		; <i32 (i32)*> [#uses=1]
 @__gthread_active_ptr.5335 = internal constant i8* bitcast (i32 (i32)* @__gthrw_pthread_cancel to i8*)		; <i8**> [#uses=1]
 define weak i32 @pthread_cancel(i32) {
        ret i32 0

diff --git a/test/Transforms/InstCombine/2007-09-17-AliasConstFold2.ll b/test/Transforms/InstCombine/2007-09-17-AliasConstFold2.ll
index c7cef75..3793a86 100644
--- a/test/Transforms/InstCombine/2007-09-17-AliasConstFold2.ll
+++ b/test/Transforms/InstCombine/2007-09-17-AliasConstFold2.ll

@@ -1,7 +1,7 @@
 ; RUN: opt < %s -instcombine -S | grep icmp
 ; PR1678
 
-@A = alias weak void ()* @B		; <void ()*> [#uses=1]
+@A = weak alias void ()* @B		; <void ()*> [#uses=1]
 
 define weak void @B() {
        ret void

diff --git a/test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll b/test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll
index fe935f9..656fb34 100644
--- a/test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll
+++ b/test/Transforms/InstCombine/2007-10-10-EliminateMemCpy.ll

@@ -1,5 +1,5 @@
 ; RUN: opt < %s -instcombine -S | not grep call
-; RUN: opt < %s -std-compile-opts -S | not grep xyz
+; RUN: opt < %s -O3 -S | not grep xyz
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
 @.str = internal constant [4 x i8] c"xyz\00"		; <[4 x i8]*> [#uses=1]

diff --git a/test/Transforms/InstCombine/2008-02-16-SDivOverflow.ll b/test/Transforms/InstCombine/2008-02-16-SDivOverflow.ll
deleted file mode 100644
index 917d3d9..0000000
--- a/test/Transforms/InstCombine/2008-02-16-SDivOverflow.ll
+++ /dev/null

@@ -1,14 +0,0 @@
-; RUN: opt < %s -instcombine -S | grep "ret i.* 0" | count 2
-; PR2048
-
-define i32 @i(i32 %a) {
-  %tmp1 = sdiv i32 %a, -1431655765
-  %tmp2 = sdiv i32 %tmp1, 3
-  ret i32 %tmp2
-}
-
-define i8 @j(i8 %a) {
-  %tmp1 = sdiv i8 %a, 64
-  %tmp2 = sdiv i8 %tmp1, 3
-  ret i8 %tmp2
-}

diff --git a/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll b/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
index a75a465..895b260 100644
--- a/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll
+++ b/test/Transforms/InstCombine/2011-06-13-nsw-alloca.ll

@@ -15,7 +15,7 @@
 
 ; <label>:4                                       ; preds = %0
   %5 = load i32* %1, align 4
-  %6 = mul nsw i32 %5, 8
+  %6 = shl nsw i32 %5, 3
 ; With "nsw", the alloca and its bitcast can be fused:
   %7 = add nsw i32 %6, 2048
 ;  CHECK: alloca double

diff --git a/test/Transforms/InstCombine/add2.ll b/test/Transforms/InstCombine/add2.ll
index d7eac4b..a166e5f 100644
--- a/test/Transforms/InstCombine/add2.ll
+++ b/test/Transforms/InstCombine/add2.ll

@@ -219,7 +219,7 @@
  %add2 = add nsw i16 %x, %mul1
  ret i16 %add2
 ; CHECK-LABEL: @mul_add_to_mul_1(
-; CHECK-NEXT: %add2 = mul nsw i16 %x, 9
+; CHECK-NEXT: %add2 = mul i16 %x, 9
 ; CHECK-NEXT: ret i16 %add2
 }
 
@@ -228,7 +228,7 @@
  %add2 = add nsw i16 %mul1, %x
  ret i16 %add2
 ; CHECK-LABEL: @mul_add_to_mul_2(
-; CHECK-NEXT: %add2 = mul nsw i16 %x, 9
+; CHECK-NEXT: %add2 = mul i16 %x, 9
 ; CHECK-NEXT: ret i16 %add2
 }
 
@@ -248,7 +248,7 @@
  %add = add nsw i16 %mul1, %mul2
  ret i16 %add
 ; CHECK-LABEL: @mul_add_to_mul_4(
-; CHECK-NEXT: %add = mul nsw i16 %a, 9
+; CHECK-NEXT: %add = mul i16 %a, 9
 ; CHECK-NEXT: ret i16 %add
 }
 
@@ -313,3 +313,43 @@
   ret i16 %b
 }
 !1 = metadata !{i16 0, i16 32}
+
+define i32 @add_or_and(i32 %x, i32 %y) {
+  %or = or i32 %x, %y
+  %and = and i32 %x, %y
+  %add = add i32 %or, %and
+  ret i32 %add
+; CHECK-LABEL: @add_or_and(
+; CHECK-NEXT: add i32 %x, %y
+; CHECK-NEXT: ret i32
+}
+
+define i32 @add_nsw_or_and(i32 %x, i32 %y) {
+  %or = or i32 %x, %y
+  %and = and i32 %x, %y
+  %add = add nsw i32 %or, %and
+  ret i32 %add
+; CHECK-LABEL: @add_nsw_or_and(
+; CHECK-NEXT: add nsw i32 %x, %y
+; CHECK-NEXT: ret i32
+}
+
+define i32 @add_nuw_or_and(i32 %x, i32 %y) {
+  %or = or i32 %x, %y
+  %and = and i32 %x, %y
+  %add = add nuw i32 %or, %and
+  ret i32 %add
+; CHECK-LABEL: @add_nuw_or_and(
+; CHECK-NEXT: add nuw i32 %x, %y
+; CHECK-NEXT: ret i32
+}
+
+define i32 @add_nuw_nsw_or_and(i32 %x, i32 %y) {
+  %or = or i32 %x, %y
+  %and = and i32 %x, %y
+  %add = add nsw nuw i32 %or, %and
+  ret i32 %add
+; CHECK-LABEL: @add_nuw_nsw_or_and(
+; CHECK-NEXT: add nuw nsw i32 %x, %y
+; CHECK-NEXT: ret i32
+}

diff --git a/test/Transforms/InstCombine/add4.ll b/test/Transforms/InstCombine/add4.ll
deleted file mode 100644
index f9b7e3b..0000000
--- a/test/Transforms/InstCombine/add4.ll
+++ /dev/null

@@ -1,102 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-define float @test1(float %A, float %B, i1 %C) {
-EntryBlock:
-  ;; A*(1 - uitofp i1 C) -> select C, 0, A
-  %cf = uitofp i1 %C to float
-  %mc = fsub float 1.000000e+00, %cf
-  %p1 = fmul fast float %A, %mc
-  ret float %p1
-; CHECK-LABEL: @test1(
-; CHECK: select i1 %C, float -0.000000e+00, float %A
-}
-
-define float @test2(float %A, float %B, i1 %C) {
-EntryBlock:
-  ;; B*(uitofp i1 C) -> select C, B, 0
-  %cf = uitofp i1 %C to float
-  %p2 = fmul fast float %B, %cf
-  ret float %p2
-; CHECK-LABEL: @test2(
-; CHECK: select i1 %C, float %B, float -0.000000e+00
-}
-
-define float @test3(float %A, float %B, i1 %C) {
-EntryBlock:
-  ;;  select C, 0, B + select C, A, 0 -> select C, A, B
-  %cf = uitofp i1 %C to float
-  %s1 = select i1 %C, float 0.000000e+00, float %B
-  %s2 = select i1 %C, float %A, float 0.000000e+00
-  %sum = fadd fast float %s1, %s2
-  ret float %sum
-; CHECK-LABEL: @test3(
-; CHECK: select i1 %C, float %A, float %B
-}
-
-define float @test4(float %A, float %B, i1 %C) {
-EntryBlock:
-  ;;  B*(uitofp i1 C) + A*(1 - uitofp i1 C) -> select C, A, B
-  %cf = uitofp i1 %C to float
-  %mc = fsub fast float 1.000000e+00, %cf
-  %p1 = fmul fast float %A, %mc
-  %p2 = fmul fast float %B, %cf
-  %s1 = fadd fast float %p2, %p1
-  ret float %s1
-; CHECK-LABEL: @test4(
-; CHECK: select i1 %C, float %B, float %A
-}
-
-define float @test5(float %A, float %B, i1 %C) {
-EntryBlock:
-  ;; A*(1 - uitofp i1 C) + B*(uitofp i1 C) -> select C, A, B
-  %cf = uitofp i1 %C to float
-  %mc = fsub fast float 1.000000e+00, %cf
-  %p1 = fmul fast float %A, %mc
-  %p2 = fmul fast float %B, %cf
-  %s1 = fadd fast float %p1, %p2
-  ret float %s1
-; CHECK-LABEL: @test5(
-; CHECK: select i1 %C, float %B, float %A
-}
-
-; PR15952
-define float @test6(float %A, float %B, i32 %C) {
-  %cf = uitofp i32 %C to float
-  %mc = fsub float 1.000000e+00, %cf
-  %p1 = fmul fast float %A, %mc
-  ret float %p1
-; CHECK-LABEL: @test6(
-; CHECK: uitofp
-}
-
-define float @test7(float %A, float %B, i32 %C) {
-  %cf = uitofp i32 %C to float
-  %p2 = fmul fast float %B, %cf
-  ret float %p2
-; CHECK-LABEL: @test7(
-; CHECK: uitofp
-}
-
-define <4 x float> @test8(<4 x float> %A, <4 x float> %B, <4 x i1> %C) {
-  ;;  B*(uitofp i1 C) + A*(1 - uitofp i1 C) -> select C, A, B
-  %cf = uitofp <4 x i1> %C to <4 x float>
-  %mc = fsub fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %cf
-  %p1 = fmul fast <4 x float> %A, %mc
-  %p2 = fmul fast <4 x float> %B, %cf
-  %s1 = fadd fast <4 x float> %p2, %p1
-  ret <4 x float> %s1
-; CHECK-LABEL: @test8(
-; CHECK: select <4 x i1> %C, <4 x float> %B, <4 x float> %A
-}
-
-define <4 x float> @test9(<4 x float> %A, <4 x float> %B, <4 x i1> %C) {
-  ;; A*(1 - uitofp i1 C) + B*(uitofp i1 C) -> select C, A, B
-  %cf = uitofp <4 x i1> %C to <4 x float>
-  %mc = fsub fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %cf
-  %p1 = fmul fast <4 x float> %A, %mc
-  %p2 = fmul fast <4 x float> %B, %cf
-  %s1 = fadd fast <4 x float> %p1, %p2
-  ret <4 x float> %s1
-; CHECK-LABEL: @test9
-; CHECK: select <4 x i1> %C, <4 x float> %B, <4 x float> %A
-}

diff --git a/test/Transforms/InstCombine/align-attr.ll b/test/Transforms/InstCombine/align-attr.ll
new file mode 100644
index 0000000..9f366bf
--- /dev/null
+++ b/test/Transforms/InstCombine/align-attr.ll

@@ -0,0 +1,15 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define i32 @foo1(i32* align 32 %a) #0 {
+entry:
+  %0 = load i32* %a, align 4
+  ret i32 %0
+
+; CHECK-LABEL: @foo1
+; CHECK-DAG: load i32* %a, align 32
+; CHECK: ret i32
+}
+

diff --git a/test/Transforms/InstCombine/and2.ll b/test/Transforms/InstCombine/and2.ll
index e88fd59..96b535d 100644
--- a/test/Transforms/InstCombine/and2.ll
+++ b/test/Transforms/InstCombine/and2.ll

@@ -45,7 +45,7 @@
 
 ; Check that we combine "if x!=0 && x!=-1" into "if x+1u>1"
 define i32 @test6(i64 %x) nounwind {
-; CHECK: @test6
+; CHECK-LABEL: @test6(
 ; CHECK-NEXT: add i64 %x, 1
 ; CHECK-NEXT: icmp ugt i64 %x.off, 1
   %cmp1 = icmp ne i64 %x, -1
@@ -54,3 +54,26 @@
   %land.ext = zext i1 %.cmp1 to i32
   ret i32 %land.ext
 }
+
+define i1 @test7(i32 %i, i1 %b) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 %i, 0
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], %b
+; CHECK-NEXT: ret i1 [[AND]]
+  %cmp1 = icmp slt i32 %i, 1
+  %cmp2 = icmp sgt i32 %i, -1
+  %and1 = and i1 %cmp1, %b
+  %and2 = and i1 %and1, %cmp2
+  ret i1 %and2
+}
+
+define i1 @test8(i32 %i) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT: [[DEC:%.*]] = add i32 %i, -1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[DEC]], 13
+; CHECK-NEXT: ret i1 [[CMP]]
+  %cmp1 = icmp ne i32 %i, 0
+  %cmp2 = icmp ult i32 %i, 14
+  %cond = and i1 %cmp1, %cmp2
+  ret i1 %cond
+}

diff --git a/test/Transforms/InstCombine/apint-sub.ll b/test/Transforms/InstCombine/apint-sub.ll
index df8ec52..3b69c17 100644
--- a/test/Transforms/InstCombine/apint-sub.ll
+++ b/test/Transforms/InstCombine/apint-sub.ll

@@ -95,12 +95,6 @@
 	ret i1024 %D
 }
 
-define i14 @test15(i14 %A, i14 %B) {
-	%C = sub i14 0, %A		; <i14> [#uses=1]
-	%D = srem i14 %B, %C		; <i14> [#uses=1]
-	ret i14 %D
-}
-
 define i51 @test16(i51 %A) {
 	%X = sdiv i51 %A, 1123		; <i51> [#uses=1]
 	%Y = sub i51 0, %X		; <i51> [#uses=1]

diff --git a/test/Transforms/InstCombine/ashr-nop.ll b/test/Transforms/InstCombine/ashr-nop.ll
deleted file mode 100644
index 870ede3..0000000
--- a/test/Transforms/InstCombine/ashr-nop.ll
+++ /dev/null

@@ -1,8 +0,0 @@
-; RUN: opt < %s -instcombine -S | not grep ashr
-
-define i32 @foo(i32 %x) {
-  %o = and i32 %x, 1
-  %n = add i32 %o, -1
-  %t = ashr i32 %n, 17
-  ret i32 %t
-}

diff --git a/test/Transforms/InstCombine/assume-loop-align.ll b/test/Transforms/InstCombine/assume-loop-align.ll
new file mode 100644
index 0000000..19190de
--- /dev/null
+++ b/test/Transforms/InstCombine/assume-loop-align.ll

@@ -0,0 +1,47 @@
+; RUN: opt -domtree -instcombine -loops -S < %s | FileCheck %s
+; Note: The -loops above can be anything that requires the domtree, and is
+; necessary to work around a pass-manager bug.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define void @foo(i32* %a, i32* %b) #0 {
+entry:
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 63
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  %ptrint1 = ptrtoint i32* %b to i64
+  %maskedptr2 = and i64 %ptrint1, 63
+  %maskcond3 = icmp eq i64 %maskedptr2, 0
+  tail call void @llvm.assume(i1 %maskcond3)
+  br label %for.body
+
+; CHECK-LABEL: @foo
+; CHECK: load i32* {{.*}} align 64
+; CHECK: store i32 {{.*}}  align 64
+; CHECK: ret
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
+  %arrayidx5 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
+  %1 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %1, 1648
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #1
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+

diff --git a/test/Transforms/InstCombine/assume-redundant.ll b/test/Transforms/InstCombine/assume-redundant.ll
new file mode 100644
index 0000000..81fe094
--- /dev/null
+++ b/test/Transforms/InstCombine/assume-redundant.ll

@@ -0,0 +1,55 @@
+; RUN: opt -domtree -instcombine -loops -S < %s | FileCheck %s
+; Note: The -loops above can be anything that requires the domtree, and is
+; necessary to work around a pass-manager bug.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.s = type { double* }
+
+; Function Attrs: nounwind uwtable
+define void @_Z3fooR1s(%struct.s* nocapture readonly dereferenceable(8) %x) #0 {
+
+; CHECK-LABEL: @_Z3fooR1s
+; CHECK: call void @llvm.assume
+; CHECK-NOT: call void @llvm.assume
+
+entry:
+  %a = getelementptr inbounds %struct.s* %x, i64 0, i32 0
+  %0 = load double** %a, align 8
+  %ptrint = ptrtoint double* %0 to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ]
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds double* %0, i64 %indvars.iv
+  %1 = load double* %arrayidx, align 16
+  %add = fadd double %1, 1.000000e+00
+  tail call void @llvm.assume(i1 %maskcond)
+  %mul = fmul double %add, 2.000000e+00
+  store double %mul, double* %arrayidx, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx.1 = getelementptr inbounds double* %0, i64 %indvars.iv.next
+  %2 = load double* %arrayidx.1, align 8
+  %add.1 = fadd double %2, 1.000000e+00
+  tail call void @llvm.assume(i1 %maskcond)
+  %mul.1 = fmul double %add.1, 2.000000e+00
+  store double %mul.1, double* %arrayidx.1, align 8
+  %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv.next, 1
+  %exitcond.1 = icmp eq i64 %indvars.iv.next, 1599
+  br i1 %exitcond.1, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #1
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+

diff --git a/test/Transforms/InstCombine/assume.ll b/test/Transforms/InstCombine/assume.ll
new file mode 100644
index 0000000..7e45c04
--- /dev/null
+++ b/test/Transforms/InstCombine/assume.ll

@@ -0,0 +1,265 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define i32 @foo1(i32* %a) #0 {
+entry:
+  %0 = load i32* %a, align 4
+
+; Check that the alignment has been upgraded and that the assume has not
+; been removed:
+; CHECK-LABEL: @foo1
+; CHECK-DAG: load i32* %a, align 32
+; CHECK-DAG: call void @llvm.assume
+; CHECK: ret i32
+
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+
+  ret i32 %0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @foo2(i32* %a) #0 {
+entry:
+; Same check as in @foo1, but make sure it works if the assume is first too.
+; CHECK-LABEL: @foo2
+; CHECK-DAG: load i32* %a, align 32
+; CHECK-DAG: call void @llvm.assume
+; CHECK: ret i32
+
+  %ptrint = ptrtoint i32* %a to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+
+  %0 = load i32* %a, align 4
+  ret i32 %0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #1
+
+define i32 @simple(i32 %a) #1 {
+entry:
+
+; CHECK-LABEL: @simple
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 4
+
+  %cmp = icmp eq i32 %a, 4
+  tail call void @llvm.assume(i1 %cmp)
+  ret i32 %a
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @can1(i1 %a, i1 %b, i1 %c) {
+entry:
+  %and1 = and i1 %a, %b
+  %and  = and i1 %and1, %c
+  tail call void @llvm.assume(i1 %and)
+
+; CHECK-LABEL: @can1
+; CHECK: call void @llvm.assume(i1 %a)
+; CHECK: call void @llvm.assume(i1 %b)
+; CHECK: call void @llvm.assume(i1 %c)
+; CHECK: ret i32
+
+  ret i32 5
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @can2(i1 %a, i1 %b, i1 %c) {
+entry:
+  %v = or i1 %a, %b
+  %w = xor i1 %v, 1
+  tail call void @llvm.assume(i1 %w)
+
+; CHECK-LABEL: @can2
+; CHECK: %[[V1:[^ ]+]] = xor i1 %a, true
+; CHECK: call void @llvm.assume(i1 %[[V1]])
+; CHECK: %[[V2:[^ ]+]] = xor i1 %b, true
+; CHECK: call void @llvm.assume(i1 %[[V2]])
+; CHECK: ret i32
+
+  ret i32 5
+}
+
+define i32 @bar1(i32 %a) #0 {
+entry:
+  %and1 = and i32 %a, 3
+
+; CHECK-LABEL: @bar1
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 1
+
+  %and = and i32 %a, 7
+  %cmp = icmp eq i32 %and, 1
+  tail call void @llvm.assume(i1 %cmp)
+
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @bar2(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @bar2
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 1
+
+  %and = and i32 %a, 7
+  %cmp = icmp eq i32 %and, 1
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 3
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @bar3(i32 %a, i1 %x, i1 %y) #0 {
+entry:
+  %and1 = and i32 %a, 3
+
+; Don't be fooled by other assumes around.
+; CHECK-LABEL: @bar3
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 1
+
+  tail call void @llvm.assume(i1 %x)
+
+  %and = and i32 %a, 7
+  %cmp = icmp eq i32 %and, 1
+  tail call void @llvm.assume(i1 %cmp)
+
+  tail call void @llvm.assume(i1 %y)
+
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @bar4(i32 %a, i32 %b) {
+entry:
+  %and1 = and i32 %b, 3
+
+; CHECK-LABEL: @bar4
+; CHECK: call void @llvm.assume
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 1
+
+  %and = and i32 %a, 7
+  %cmp = icmp eq i32 %and, 1
+  tail call void @llvm.assume(i1 %cmp)
+
+  %cmp2 = icmp eq i32 %a, %b
+  tail call void @llvm.assume(i1 %cmp2)
+
+  ret i32 %and1
+}
+
+define i32 @icmp1(i32 %a) #0 {
+entry:
+  %cmp = icmp sgt i32 %a, 5
+  tail call void @llvm.assume(i1 %cmp)
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+
+; CHECK-LABEL: @icmp1
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 1
+
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @icmp2(i32 %a) #0 {
+entry:
+  %cmp = icmp sgt i32 %a, 5
+  tail call void @llvm.assume(i1 %cmp)
+  %0 = zext i1 %cmp to i32
+  %lnot.ext = xor i32 %0, 1
+  ret i32 %lnot.ext
+
+; CHECK-LABEL: @icmp2
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 0
+}
+
+declare void @escape(i32* %a)
+
+; Do we canonicalize a nonnull assumption on a load into
+; metadata form?
+define i1 @nonnull1(i32** %a) {
+entry:
+  %load = load i32** %a
+  %cmp = icmp ne i32* %load, null
+  tail call void @llvm.assume(i1 %cmp)
+  tail call void @escape(i32* %load)
+  %rval = icmp eq i32* %load, null
+  ret i1 %rval
+
+; CHECK-LABEL: @nonnull1
+; CHECK: !nonnull
+; CHECK-NOT: call void @llvm.assume
+; CHECK: ret i1 false
+}
+
+; Make sure the above canonicalization applies only
+; to pointer types.  Doing otherwise would be illegal.
+define i1 @nonnull2(i32* %a) {
+entry:
+  %load = load i32* %a
+  %cmp = icmp ne i32 %load, 0
+  tail call void @llvm.assume(i1 %cmp)
+  %rval = icmp eq i32 %load, 0
+  ret i1 %rval
+
+; CHECK-LABEL: @nonnull2
+; CHECK-NOT: !nonnull
+; CHECK: call void @llvm.assume
+}
+
+; Make sure the above canonicalization does not trigger
+; if the assume is control dependent on something else
+define i1 @nonnull3(i32** %a, i1 %control) {
+entry:
+  %load = load i32** %a
+  %cmp = icmp ne i32* %load, null
+  br i1 %control, label %taken, label %not_taken
+taken:
+  tail call void @llvm.assume(i1 %cmp)
+  %rval = icmp eq i32* %load, null
+  ret i1 %rval
+not_taken:
+  ret i1 true
+
+; CHECK-LABEL: @nonnull3
+; CHECK-NOT: !nonnull
+; CHECK: call void @llvm.assume
+}
+
+; Make sure the above canonicalization does not trigger
+; if the path from the load to the assume is potentially 
+; interrupted by an exception being thrown
+define i1 @nonnull4(i32** %a) {
+entry:
+  %load = load i32** %a
+  ;; This call may throw!
+  tail call void @escape(i32* %load)
+  %cmp = icmp ne i32* %load, null
+  tail call void @llvm.assume(i1 %cmp)
+  %rval = icmp eq i32* %load, null
+  ret i1 %rval
+
+; CHECK-LABEL: @nonnull4
+; CHECK-NOT: !nonnull
+; CHECK: call void @llvm.assume
+}
+
+
+
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+

diff --git a/test/Transforms/InstCombine/assume2.ll b/test/Transforms/InstCombine/assume2.ll
new file mode 100644
index 0000000..c41bbaa
--- /dev/null
+++ b/test/Transforms/InstCombine/assume2.ll

@@ -0,0 +1,174 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @test1(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test1
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 5
+
+  %and = and i32 %a, 15
+  %cmp = icmp eq i32 %and, 5
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 7
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test2(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test2
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 2
+
+  %and = and i32 %a, 15
+  %nand = xor i32 %and, -1
+  %cmp = icmp eq i32 %nand, 4294967285
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 7
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test3(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test3
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 5
+
+  %v = or i32 %a, 4294967280
+  %cmp = icmp eq i32 %v, 4294967285
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 7
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test4(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test4
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 2
+
+  %v = or i32 %a, 4294967280
+  %nv = xor i32 %v, -1
+  %cmp = icmp eq i32 %nv, 5
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 7
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test5(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test5
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 4
+
+  %v = xor i32 %a, 1
+  %cmp = icmp eq i32 %v, 5
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 7
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test6(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test6
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 5
+
+  %v = shl i32 %a, 2
+  %cmp = icmp eq i32 %v, 20
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 63
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test7(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test7
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 20
+
+  %v = lshr i32 %a, 2
+  %cmp = icmp eq i32 %v, 5
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 252
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test8(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test8
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 20
+
+  %v = lshr i32 %a, 2
+  %cmp = icmp eq i32 %v, 5
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 252
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test9(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test9
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 0
+
+  %cmp = icmp sgt i32 %a, 5
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 2147483648
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test10(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test10
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 -2147483648
+
+  %cmp = icmp sle i32 %a, -2
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 2147483648
+  ret i32 %and1
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test11(i32 %a) #0 {
+entry:
+; CHECK-LABEL: @test11
+; CHECK: call void @llvm.assume
+; CHECK: ret i32 0
+
+  %cmp = icmp ule i32 %a, 256
+  tail call void @llvm.assume(i1 %cmp)
+
+  %and1 = and i32 %a, 3072
+  ret i32 %and1
+}
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+

diff --git a/test/Transforms/InstCombine/atomic.ll b/test/Transforms/InstCombine/atomic.ll
index ccee874..98cecef 100644
--- a/test/Transforms/InstCombine/atomic.ll
+++ b/test/Transforms/InstCombine/atomic.ll

@@ -5,14 +5,6 @@
 
 ; Check transforms involving atomic operations
 
-define i32* @test1(i8** %p) {
-; CHECK-LABEL: define i32* @test1(
-; CHECK: load atomic i8** %p monotonic, align 8
-  %c = bitcast i8** %p to i32**
-  %r = load atomic i32** %c monotonic, align 8
-  ret i32* %r
-}
-
 define i32 @test2(i32* %p) {
 ; CHECK-LABEL: define i32 @test2(
 ; CHECK: %x = load atomic i32* %p seq_cst, align 4

diff --git a/test/Transforms/InstCombine/bitcast-alias-function.ll b/test/Transforms/InstCombine/bitcast-alias-function.ll
index a6b56f9..bc36b25 100644
--- a/test/Transforms/InstCombine/bitcast-alias-function.ll
+++ b/test/Transforms/InstCombine/bitcast-alias-function.ll

@@ -90,7 +90,8 @@
 define void @bitcast_alias_scalar(float* noalias %source, float* noalias %dest) nounwind {
 entry:
 ; CHECK-LABEL: @bitcast_alias_scalar
-; CHECK: bitcast float %tmp to i32
+; CHECK: bitcast float* %source to i32*
+; CHECK: load i32*
 ; CHECK-NOT: fptoui
 ; CHECK-NOT: uitofp
 ; CHECK: bitcast i32 %call to float
@@ -104,7 +105,8 @@
 define void @bitcast_alias_vector(<2 x float>* noalias %source, <2 x float>* noalias %dest) nounwind {
 entry:
 ; CHECK-LABEL: @bitcast_alias_vector
-; CHECK: bitcast <2 x float> %tmp to <2 x i32>
+; CHECK: bitcast <2 x float>* %source to <2 x i32>*
+; CHECK: load <2 x i32>*
 ; CHECK-NOT: fptoui
 ; CHECK-NOT: uitofp
 ; CHECK: bitcast <2 x i32> %call to <2 x float>
@@ -118,7 +120,8 @@
 define void @bitcast_alias_vector_scalar_same_size(<2 x float>* noalias %source, <2 x float>* noalias %dest) nounwind {
 entry:
 ; CHECK-LABEL: @bitcast_alias_vector_scalar_same_size
-; CHECK: bitcast <2 x float> %tmp to i64
+; CHECK: bitcast <2 x float>* %source to i64*
+; CHECK: load i64*
 ; CHECK: %call = call i64 @func_i64
 ; CHECK: bitcast i64 %call to <2 x float>
   %tmp = load <2 x float>* %source, align 8
@@ -130,7 +133,8 @@
 define void @bitcast_alias_scalar_vector_same_size(i64* noalias %source, i64* noalias %dest) nounwind {
 entry:
 ; CHECK-LABEL: @bitcast_alias_scalar_vector_same_size
-; CHECK: bitcast i64 %tmp to <2 x float>
+; CHECK: bitcast i64* %source to <2 x float>*
+; CHECK: load <2 x float>*
 ; CHECK: call <2 x float> @func_v2f32
 ; CHECK: bitcast <2 x float> %call to i64
   %tmp = load i64* %source, align 8
@@ -142,7 +146,8 @@
 define void @bitcast_alias_vector_ptrs_same_size(<2 x i64*>* noalias %source, <2 x i64*>* noalias %dest) nounwind {
 entry:
 ; CHECK-LABEL: @bitcast_alias_vector_ptrs_same_size
-; CHECK: bitcast <2 x i64*> %tmp to <2 x i32*>
+; CHECK: bitcast <2 x i64*>* %source to <2 x i32*>*
+; CHECK: load <2 x i32*>*
 ; CHECK: call <2 x i32*> @func_v2i32p
 ; CHECK: bitcast <2 x i32*> %call to <2 x i64*>
   %tmp = load <2 x i64*>* %source, align 8

diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index 0cbfbb0..578b16d 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll

@@ -354,6 +354,24 @@
 ; CHECK: ret i32* %tmp1
 }
 
+define i32 addrspace(1)* @test41_addrspacecast_smaller(i32* %tmp1) {
+  %tmp64 = addrspacecast i32* %tmp1 to { i32 } addrspace(1)*
+  %tmp65 = getelementptr { i32 } addrspace(1)* %tmp64, i32 0, i32 0
+  ret i32 addrspace(1)* %tmp65
+; CHECK-LABEL: @test41_addrspacecast_smaller(
+; CHECK: addrspacecast i32* %tmp1 to i32 addrspace(1)*
+; CHECK-NEXT: ret i32 addrspace(1)*
+}
+
+define i32* @test41_addrspacecast_larger(i32 addrspace(1)* %tmp1) {
+  %tmp64 = addrspacecast i32 addrspace(1)* %tmp1 to { i32 }*
+  %tmp65 = getelementptr { i32 }* %tmp64, i32 0, i32 0
+  ret i32* %tmp65
+; CHECK-LABEL: @test41_addrspacecast_larger(
+; CHECK: addrspacecast i32 addrspace(1)* %tmp1 to i32*
+; CHECK-NEXT: ret i32*
+}
+
 define i32 @test42(i32 %X) {
         %Y = trunc i32 %X to i8         ; <i8> [#uses=1]
         %Z = zext i8 %Y to i32          ; <i32> [#uses=1]
@@ -792,7 +810,7 @@
 
 define double @test72(double *%p, i32 %i) {
 ; CHECK-LABEL: @test72(
-  %so = mul nsw i32 %i, 8
+  %so = shl nsw i32 %i, 3
   %o = sext i32 %so to i64
 ; CHECK-NEXT: sext i32 %i to i64
   %q = bitcast double* %p to i8*
@@ -807,7 +825,7 @@
 
 define double @test73(double *%p, i128 %i) {
 ; CHECK-LABEL: @test73(
-  %lo = mul nsw i128 %i, 8
+  %lo = shl nsw i128 %i, 3
   %o = trunc i128 %lo to i64
 ; CHECK-NEXT: trunc i128 %i to i64
   %q = bitcast double* %p to i8*
@@ -919,7 +937,7 @@
 
 define double @test80([100 x double]* %p, i32 %i) {
 ; CHECK-LABEL: @test80(
-  %tmp = mul nsw i32 %i, 8
+  %tmp = shl nsw i32 %i, 3
 ; CHECK-NEXT: sext i32 %i to i64
   %q = bitcast [100 x double]* %p to i8*
   %pp = getelementptr i8* %q, i32 %tmp
@@ -936,7 +954,7 @@
 ; CHECK-NEXT: getelementptr [100 x double] addrspace(1)* %p
 ; CHECK-NEXT: load double addrspace(1)*
 ; CHECK-NEXT: ret double
-  %tmp = mul nsw i32 %i, 8
+  %tmp = shl nsw i32 %i, 3
   %q = addrspacecast [100 x double] addrspace(1)* %p to i8 addrspace(2)*
   %pp = getelementptr i8 addrspace(2)* %q, i32 %tmp
   %r = addrspacecast i8 addrspace(2)* %pp to double addrspace(1)*
@@ -950,7 +968,7 @@
 ; CHECK-NEXT: addrspacecast double addrspace(1)*
 ; CHECK-NEXT: load double addrspace(3)*
 ; CHECK-NEXT: ret double
-  %tmp = mul nsw i32 %i, 8
+  %tmp = shl nsw i32 %i, 3
   %q = addrspacecast [100 x double] addrspace(1)* %p to i8 addrspace(2)*
   %pp = getelementptr i8 addrspace(2)* %q, i32 %tmp
   %r = addrspacecast i8 addrspace(2)* %pp to double addrspace(3)*
@@ -960,7 +978,7 @@
 
 define double @test80_as1([100 x double] addrspace(1)* %p, i16 %i) {
 ; CHECK-LABEL: @test80_as1(
-  %tmp = mul nsw i16 %i, 8
+  %tmp = shl nsw i16 %i, 3
 ; CHECK-NEXT: sext i16 %i to i32
   %q = bitcast [100 x double] addrspace(1)* %p to i8 addrspace(1)*
   %pp = getelementptr i8 addrspace(1)* %q, i16 %tmp
@@ -1004,7 +1022,74 @@
   ret i64 %sh_prom1
 
 ; CHECK-LABEL: @test83(
-; CHECK: %sub = add nsw i64 %k, 4294967295
+; CHECK: %sub = add i64 %k, 4294967295
 ; CHECK: %sh_prom = trunc i64 %sub to i32
 ; CHECK: %shl = shl i32 %conv, %sh_prom
 }
+
+define i8 @test84(i32 %a) {
+  %add = add nsw i32 %a, -16777216
+  %shr = lshr exact i32 %add, 23
+  %trunc = trunc i32 %shr to i8
+  ret i8 %trunc
+
+; CHECK-LABEL: @test84(
+; CHECK: [[ADD:%.*]] = add i32 %a, 2130706432
+; CHECK: [[SHR:%.*]] = lshr exact i32 [[ADD]], 23
+; CHECK: [[CST:%.*]] = trunc i32 [[SHR]] to i8
+}
+
+define i8 @test85(i32 %a) {
+  %add = add nuw i32 %a, -16777216
+  %shr = lshr exact i32 %add, 23
+  %trunc = trunc i32 %shr to i8
+  ret i8 %trunc
+
+; CHECK-LABEL: @test85(
+; CHECK: [[ADD:%.*]] = add i32 %a, 2130706432
+; CHECK: [[SHR:%.*]] = lshr exact i32 [[ADD]], 23
+; CHECK: [[CST:%.*]] = trunc i32 [[SHR]] to i8
+}
+
+; Overflow on a float to int or int to float conversion is undefined (PR21130).
+
+define i8 @overflow_fptosi() {
+  %i = fptosi double 1.56e+02 to i8
+  ret i8 %i
+; CHECK-LABEL: @overflow_fptosi(
+; CHECK-NEXT: ret i8 undef 
+}
+
+define i8 @overflow_fptoui() {
+  %i = fptoui double 2.56e+02 to i8
+  ret i8 %i
+; CHECK-LABEL: @overflow_fptoui(
+; CHECK-NEXT: ret i8 undef 
+}
+
+; The maximum float is approximately 2 ** 128 which is 3.4E38. 
+; The constant below is 4E38. Use a 130 bit integer to hold that
+; number; 129-bits for the value + 1 bit for the sign.
+define float @overflow_uitofp() {
+  %i = uitofp i130 400000000000000000000000000000000000000 to float
+  ret float %i
+; CHECK-LABEL: @overflow_uitofp(
+; CHECK-NEXT: ret float undef 
+}
+
+define float @overflow_sitofp() {
+  %i = sitofp i130 400000000000000000000000000000000000000 to float
+  ret float %i
+; CHECK-LABEL: @overflow_sitofp(
+; CHECK-NEXT: ret float undef 
+}
+
+define i32 @PR21388(i32* %v) {
+  %icmp = icmp slt i32* %v, null
+  %sext = sext i1 %icmp to i32
+  ret i32 %sext
+; CHECK-LABEL: @PR21388(
+; CHECK-NEXT: %[[icmp:.*]] = icmp slt i32* %v, null
+; CHECK-NEXT: %[[sext:.*]] = sext i1 %[[icmp]] to i32
+; CHECK-NEXT: ret i32 %[[sext]]
+}

diff --git a/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll b/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
index 7fac78a..bb61f02 100644
--- a/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
+++ b/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll

@@ -161,12 +161,11 @@
   ret i32 %a
 }
 
-define <4 x i32> @constant_fold_bitcast_vector_as() {
+define <4 x float> @constant_fold_bitcast_vector_as() {
 ; CHECK-LABEL: @constant_fold_bitcast_vector_as(
 ; CHECK: load <4 x float> addrspace(3)* @g_v4f_as3, align 16
-; CHECK: bitcast <4 x float> %1 to <4 x i32>
-  %a = load <4 x i32> addrspace(3)* bitcast (<4 x float> addrspace(3)* @g_v4f_as3 to <4 x i32> addrspace(3)*), align 4
-  ret <4 x i32> %a
+  %a = load <4 x float> addrspace(3)* bitcast (<4 x i32> addrspace(3)* bitcast (<4 x float> addrspace(3)* @g_v4f_as3 to <4 x i32> addrspace(3)*) to <4 x float> addrspace(3)*), align 4
+  ret <4 x float> %a
 }
 
 @i32_array_as3 = addrspace(3) global [10 x i32] zeroinitializer

diff --git a/test/Transforms/InstCombine/constant-fold-alias.ll b/test/Transforms/InstCombine/constant-fold-alias.ll
new file mode 100644
index 0000000..13da0f4
--- /dev/null
+++ b/test/Transforms/InstCombine/constant-fold-alias.ll

@@ -0,0 +1,40 @@
+; RUN: opt -S < %s -instcombine | FileCheck %s
+
+target datalayout = "e-p1:16:16-p2:32:32-p3:64:64"
+
+@G1 = global i32 42, align 1
+@G2 = global i32 42
+@G3 = global [4 x i8] zeroinitializer, align 1
+
+@A1 = alias bitcast (i8* getelementptr inbounds ([4 x i8]* @G3, i32 0, i32 2) to i32*)
+@A2 = alias inttoptr (i64 and (i64 ptrtoint (i8* getelementptr inbounds ([4 x i8]* @G3, i32 0, i32 3) to i64), i64 -4) to i32*)
+
+define i64 @f1() {
+; This cannot be constant folded because G1 is underaligned.
+; CHECK-LABEL: @f1(
+; CHECK: ret i64 and
+  ret i64 and (i64 ptrtoint (i32* @G1 to i64), i64 1)
+}
+
+define i64 @f2() {
+; The preferred alignment for G2 allows this one to be folded to zero.
+; CHECK-LABEL: @f2(
+; CHECK: ret i64 0
+  ret i64 and (i64 ptrtoint (i32* @G2 to i64), i64 1)
+}
+
+define i64 @g1() {
+; This cannot be constant folded because A1 aliases G3 which is underaligned.
+; CHECK-LABEL: @g1(
+; CHECK: ret i64 and
+  ret i64 and (i64 ptrtoint (i32* @A1 to i64), i64 1)
+}
+
+define i64 @g2() {
+; While A2 also aliases G3 which is underaligned, the math of A2 forces a
+; certain alignment allowing this to fold to zero.
+; CHECK-LABEL: @g2(
+; CHECK: ret i64 0
+  ret i64 and (i64 ptrtoint (i32* @A2 to i64), i64 1)
+}
+

diff --git a/test/Transforms/InstCombine/constant-fold-math.ll b/test/Transforms/InstCombine/constant-fold-math.ll
index 14377df..ce8d337 100644
--- a/test/Transforms/InstCombine/constant-fold-math.ll
+++ b/test/Transforms/InstCombine/constant-fold-math.ll

@@ -7,6 +7,7 @@
 declare double @llvm.fma.f64(double, double, double) #0
 declare double @llvm.fmuladd.f64(double, double, double) #0
 
+declare double @llvm.sqrt.f64(double) #0
 
 
 ; CHECK-LABEL: @constant_fold_fma_f32
@@ -44,4 +45,12 @@
   ret double %x
 }
 
+; The sqrt intrinsic is undefined for negative inputs besides -0.0.
+; CHECK-LABEL: @bad_sqrt
+; CHECK-NEXT: ret double undef
+define double @bad_sqrt() {
+  %x = call double @llvm.sqrt.f64(double -2.000000e+00)
+  ret double %x
+}
+
 attributes #0 = { nounwind readnone }

diff --git a/test/Transforms/InstCombine/debug-line.ll b/test/Transforms/InstCombine/debug-line.ll
index 2e3785f..309843f 100644
--- a/test/Transforms/InstCombine/debug-line.ll
+++ b/test/Transforms/InstCombine/debug-line.ll

@@ -15,14 +15,14 @@
 !llvm.module.flags = !{!10}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !8, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 589865, metadata !8} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 589841, metadata !8, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !8, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\004\000\001\000\006\000\000\000", metadata !8, metadata !1, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang\001\00\000\00\000", metadata !8, metadata !4, metadata !4, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 5, i32 2, metadata !6, null}
-!6 = metadata !{i32 589835, metadata !8, metadata !0, i32 4, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0xb\004\0012\000", metadata !8, metadata !0} ; [ DW_TAG_lexical_block ]
 !7 = metadata !{i32 6, i32 1, metadata !6, null}
 !8 = metadata !{metadata !"m.c", metadata !"/private/tmp"}
 !9 = metadata !{metadata !0}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/InstCombine/debuginfo.ll b/test/Transforms/InstCombine/debuginfo.ll
index 75082dc..a7a491e 100644
--- a/test/Transforms/InstCombine/debuginfo.ll
+++ b/test/Transforms/InstCombine/debuginfo.ll

@@ -1,6 +1,6 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) nounwind readnone
 
@@ -14,11 +14,11 @@
   store i8* %__dest, i8** %__dest.addr, align 8
 ; CHECK-NOT: call void @llvm.dbg.declare
 ; CHECK: call void @llvm.dbg.value
-  call void @llvm.dbg.declare(metadata !{i8** %__dest.addr}, metadata !0), !dbg !16
+  call void @llvm.dbg.declare(metadata !{i8** %__dest.addr}, metadata !0, metadata !{}), !dbg !16
   store i32 %__val, i32* %__val.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %__val.addr}, metadata !7), !dbg !18
+  call void @llvm.dbg.declare(metadata !{i32* %__val.addr}, metadata !7, metadata !{}), !dbg !18
   store i64 %__len, i64* %__len.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i64* %__len.addr}, metadata !9), !dbg !20
+  call void @llvm.dbg.declare(metadata !{i64* %__len.addr}, metadata !9, metadata !{}), !dbg !20
   %tmp = load i8** %__dest.addr, align 8, !dbg !21
   %tmp1 = load i32* %__val.addr, align 4, !dbg !21
   %tmp2 = load i64* %__len.addr, align 8, !dbg !21
@@ -31,29 +31,29 @@
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!30}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"__dest", metadata !2, i32 16777294, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !27, metadata !2, metadata !"foobar", metadata !"foobar", metadata !"", i32 79, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8* (i8*, i32, i64)* @foobar, null, null, metadata !25, i32 79} ; [ DW_TAG_subprogram ] [line 79] [local] [def] [foobar]
-!2 = metadata !{i32 786473, metadata !27} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !28, i32 12, metadata !"clang version 3.0 (trunk 127710)", i1 true, metadata !"", i32 0, metadata !29, metadata !29, metadata !24, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !27, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00__dest\0016777294\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00foobar\00foobar\00\0079\001\001\000\006\00256\001\0079", metadata !27, metadata !2, metadata !4, null, i8* (i8*, i32, i64)* @foobar, null, null, metadata !25} ; [ DW_TAG_subprogram ] [line 79] [local] [def] [foobar]
+!2 = metadata !{metadata !"0x29", metadata !27} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 127710)\001\00\000\00\000", metadata !28, metadata !29, metadata !29, metadata !24, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !27, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
-!6 = metadata !{i32 786447, null, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"__val", metadata !2, i32 33554510, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786468, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786689, metadata !1, metadata !"__len", metadata !2, i32 50331726, metadata !10, i32 0, null} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{i32 589846, metadata !27, metadata !3, metadata !"size_t", i32 80, i64 0, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_typedef ]
-!11 = metadata !{i32 589846, metadata !27, metadata !3, metadata !"__darwin_size_t", i32 90, i64 0, i64 0, i64 0, i32 0, metadata !12} ; [ DW_TAG_typedef ]
-!12 = metadata !{i32 786468, null, metadata !3, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !3, null} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0x101\00__val\0033554510\000", metadata !1, metadata !2, metadata !8} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !3} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !"0x101\00__len\0050331726\000", metadata !1, metadata !2, metadata !10} ; [ DW_TAG_arg_variable ]
+!10 = metadata !{metadata !"0x16\00size_t\0080\000\000\000\000", metadata !27, metadata !3, metadata !11} ; [ DW_TAG_typedef ]
+!11 = metadata !{metadata !"0x16\00__darwin_size_t\0090\000\000\000\000", metadata !27, metadata !3, metadata !12} ; [ DW_TAG_typedef ]
+!12 = metadata !{metadata !"0x24\00long unsigned int\000\0064\0064\000\000\007", null, metadata !3} ; [ DW_TAG_base_type ]
 !16 = metadata !{i32 78, i32 28, metadata !1, null}
 !18 = metadata !{i32 78, i32 40, metadata !1, null}
 !20 = metadata !{i32 78, i32 54, metadata !1, null}
 !21 = metadata !{i32 80, i32 3, metadata !22, null}
-!22 = metadata !{i32 786443, metadata !27, metadata !23, i32 80, i32 3, i32 7} ; [ DW_TAG_lexical_block ]
-!23 = metadata !{i32 786443, metadata !27, metadata !1, i32 79, i32 1, i32 6} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\0080\003\007", metadata !27, metadata !23} ; [ DW_TAG_lexical_block ]
+!23 = metadata !{metadata !"0xb\0079\001\006", metadata !27, metadata !1} ; [ DW_TAG_lexical_block ]
 !24 = metadata !{metadata !1}
 !25 = metadata !{metadata !0, metadata !7, metadata !9}
-!26 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
+!26 = metadata !{metadata !"0x29", metadata !28} ; [ DW_TAG_file_type ]
 !27 = metadata !{metadata !"string.h", metadata !"Game"}
 !28 = metadata !{metadata !"bits.c", metadata !"Game"}
 !29 = metadata !{i32 0}
-!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/InstCombine/descale-zero.ll b/test/Transforms/InstCombine/descale-zero.ll
index 7990fdb..4656837 100644
--- a/test/Transforms/InstCombine/descale-zero.ll
+++ b/test/Transforms/InstCombine/descale-zero.ll

@@ -5,8 +5,7 @@
 
 define internal i8* @descale_zero() {
 entry:
-; CHECK: load i16** inttoptr (i64 48 to i16**), align 16
-; CHECK-NEXT: bitcast i16*
+; CHECK: load i8** inttoptr (i64 48 to i8**), align 16
 ; CHECK-NEXT: ret i8*
   %i16_ptr = load i16** inttoptr (i64 48 to i16**), align 16
   %num = load i64* inttoptr (i64 64 to i64*), align 64

diff --git a/test/Transforms/InstCombine/devirt.ll b/test/Transforms/InstCombine/devirt.ll
deleted file mode 100644
index 9c7cf5d..0000000
--- a/test/Transforms/InstCombine/devirt.ll
+++ /dev/null

@@ -1,39 +0,0 @@
-; RUN: opt -instcombine -S < %s | FileCheck %s
-
-; CHECK-NOT: getelementptr
-; CHECK-NOT: ptrtoint
-; CHECK: bitcast i8*
-%struct.S = type { i32 (...)** }
-
-@_ZL1p = internal constant { i64, i64 } { i64 1, i64 0 }, align 8
-
-define void @_Z1g1S(%struct.S* %s) nounwind {
-entry:
-  %tmp = load { i64, i64 }* @_ZL1p, align 8
-  %memptr.adj = extractvalue { i64, i64 } %tmp, 1
-  %0 = bitcast %struct.S* %s to i8*
-  %1 = getelementptr inbounds i8* %0, i64 %memptr.adj
-  %this.adjusted = bitcast i8* %1 to %struct.S*
-  %memptr.ptr = extractvalue { i64, i64 } %tmp, 0
-  %2 = and i64 %memptr.ptr, 1
-  %memptr.isvirtual = icmp ne i64 %2, 0
-  br i1 %memptr.isvirtual, label %memptr.virtual, label %memptr.nonvirtual
-
-memptr.virtual:                                   ; preds = %entry
-  %3 = bitcast %struct.S* %this.adjusted to i8**
-  %memptr.vtable = load i8** %3
-  %4 = sub i64 %memptr.ptr, 1
-  %5 = getelementptr i8* %memptr.vtable, i64 %4
-  %6 = bitcast i8* %5 to void (%struct.S*)**
-  %memptr.virtualfn = load void (%struct.S*)** %6
-  br label %memptr.end
-
-memptr.nonvirtual:                                ; preds = %entry
-  %memptr.nonvirtualfn = inttoptr i64 %memptr.ptr to void (%struct.S*)*
-  br label %memptr.end
-
-memptr.end:                                       ; preds = %memptr.nonvirtual, %memptr.virtual
-  %7 = phi void (%struct.S*)* [ %memptr.virtualfn, %memptr.virtual ], [ %memptr.nonvirtualfn, %memptr.nonvirtual ]
-  call void %7(%struct.S* %this.adjusted)
-  ret void
-}

diff --git a/test/Transforms/InstCombine/div.ll b/test/Transforms/InstCombine/div.ll
index 9c7ba9b..2841043 100644
--- a/test/Transforms/InstCombine/div.ll
+++ b/test/Transforms/InstCombine/div.ll

@@ -132,11 +132,11 @@
 }
 
 define <2 x i64> @test16(<2 x i64> %x) nounwind {
-  %shr = lshr <2 x i64> %x, <i64 3, i64 5>
-  %div = udiv <2 x i64> %shr, <i64 4, i64 6>
+  %shr = lshr <2 x i64> %x, <i64 5, i64 5>
+  %div = udiv <2 x i64> %shr, <i64 6, i64 6>
   ret <2 x i64> %div
 ; CHECK-LABEL: @test16(
-; CHECK-NEXT: udiv <2 x i64> %x, <i64 32, i64 192>
+; CHECK-NEXT: udiv <2 x i64> %x, <i64 192, i64 192>
 ; CHECK-NEXT: ret <2 x i64>
 }
 
@@ -175,3 +175,114 @@
 ; CHECK-NEXT: select i1 %{{.*}}, i32 %x, i32 {{.*}}
 ; CHECK-NEXT: ret i32
 }
+
+define i32 @test21(i32 %a) {
+  %shl = shl nsw i32 %a, 2
+  %div = sdiv i32 %shl, 12
+  ret i32 %div
+; CHECK-LABEL: @test21(
+; CHECK-NEXT: %div = sdiv i32 %a, 3
+; CHECK-NEXT: ret i32 %div
+}
+
+define i32 @test22(i32 %a) {
+  %mul = mul nsw i32 %a, 3
+  %div = sdiv i32 %mul, 12
+  ret i32 %div
+; CHECK-LABEL: @test22(
+; CHECK-NEXT: %div = sdiv i32 %a, 4
+; CHECK-NEXT: ret i32 %div
+}
+
+define i32 @test23(i32 %a) {
+  %shl = shl nuw i32 %a, 2
+  %div = udiv i32 %shl, 12
+  ret i32 %div
+; CHECK-LABEL: @test23(
+; CHECK-NEXT: %div = udiv i32 %a, 3
+; CHECK-NEXT: ret i32 %div
+}
+
+define i32 @test24(i32 %a) {
+  %mul = mul nuw i32 %a, 3
+  %div = udiv i32 %mul, 12
+  ret i32 %div
+; CHECK-LABEL: @test24(
+; CHECK-NEXT: %div = lshr i32 %a, 2
+; CHECK-NEXT: ret i32 %div
+}
+
+define i32 @test25(i32 %a) {
+  %shl = shl nsw i32 %a, 2
+  %div = sdiv i32 %shl, 2
+  ret i32 %div
+; CHECK-LABEL: @test25(
+; CHECK-NEXT: %div = shl i32 %a, 1
+; CHECK-NEXT: ret i32 %div
+}
+
+define i32 @test26(i32 %a) {
+  %mul = mul nsw i32 %a, 12
+  %div = sdiv i32 %mul, 3
+  ret i32 %div
+; CHECK-LABEL: @test26(
+; CHECK-NEXT: %div = shl i32 %a, 2
+; CHECK-NEXT: ret i32 %div
+}
+
+define i32 @test27(i32 %a) {
+  %shl = shl nuw i32 %a, 2
+  %div = udiv i32 %shl, 2
+  ret i32 %div
+; CHECK-LABEL: @test27(
+; CHECK-NEXT: %div = shl nuw i32 %a, 1
+; CHECK-NEXT: ret i32 %div
+}
+
+define i32 @test28(i32 %a) {
+  %mul = mul nuw i32 %a, 36
+  %div = udiv i32 %mul, 3
+  ret i32 %div
+; CHECK-LABEL: @test28(
+; CHECK-NEXT: %div = mul nuw i32 %a, 12
+; CHECK-NEXT: ret i32 %div
+}
+
+define i32 @test29(i32 %a) {
+  %mul = shl nsw i32 %a, 31
+  %div = sdiv i32 %mul, -2147483648
+  ret i32 %div
+; CHECK-LABEL: @test29(
+; CHECK-NEXT: %[[and:.*]] = and i32 %a, 1
+; CHECK-NEXT: ret i32 %[[and]]
+}
+
+define i32 @test30(i32 %a) {
+  %mul = shl nuw i32 %a, 31
+  %div = udiv i32 %mul, -2147483648
+  ret i32 %div
+; CHECK-LABEL: @test30(
+; CHECK-NEXT: ret i32 %a
+}
+
+define <2 x i32> @test31(<2 x i32> %x) {
+  %shr = lshr <2 x i32> %x, <i32 31, i32 31>
+  %div = udiv <2 x i32> %shr, <i32 2147483647, i32 2147483647>
+  ret <2 x i32> %div
+; CHECK-LABEL: @test31(
+; CHECK-NEXT: %[[shr:.*]] = lshr <2 x i32> %x, <i32 31, i32 31>
+; CHECK-NEXT: udiv <2 x i32> %[[shr]], <i32 2147483647, i32 2147483647>
+; CHECK-NEXT: ret <2 x i32>
+}
+
+define i32 @test32(i32 %a, i32 %b) {
+  %shl = shl i32 2, %b
+  %div = lshr i32 %shl, 2
+  %div2 = udiv i32 %a, %div
+  ret i32 %div2
+; CHECK-LABEL: @test32(
+; CHECK-NEXT: %[[shl:.*]] = shl i32 2, %b
+; CHECK-NEXT: %[[shr:.*]] = lshr i32 %[[shl]], 2
+; CHECK-NEXT: %[[div:.*]] = udiv i32 %a, %[[shr]]
+; CHECK-NEXT: ret i32
+}

diff --git a/test/Transforms/InstCombine/double-float-shrink-1.ll b/test/Transforms/InstCombine/double-float-shrink-1.ll
index d958470..63a02bb 100644
--- a/test/Transforms/InstCombine/double-float-shrink-1.ll
+++ b/test/Transforms/InstCombine/double-float-shrink-1.ll

@@ -1,349 +1,366 @@
-; RUN: opt < %s -instcombine -enable-double-float-shrink -S | FileCheck %s
+; RUN: opt < %s -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define float @acos_test(float %f) nounwind readnone {
-; CHECK: acos_test
+; Check for and against shrinkage when using the
+; unsafe-fp-math function attribute on a math lib
+; function. This optimization may be overridden by
+; the -enable-double-float-shrink option.
+; PR17850: http://llvm.org/bugs/show_bug.cgi?id=17850
+
+define float @acos_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @acos(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: acos_test
 ; CHECK: call float @acosf(float %f)
 }
 
-define double @acos_test2(float %f) nounwind readnone {
-; CHECK: acos_test2
+define double @acos_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @acos(double %conv)
    ret double %call
+; CHECK-LABEL: acos_test2
 ; CHECK: call double @acos(double %conv)
 }
 
-define float @acosh_test(float %f) nounwind readnone {
-; CHECK: acosh_test
+define float @acosh_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @acosh(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: acosh_test
 ; CHECK: call float @acoshf(float %f)
 }
 
-define double @acosh_test2(float %f) nounwind readnone {
-; CHECK: acosh_test2
+define double @acosh_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @acosh(double %conv)
    ret double %call
+; CHECK-LABEL: acosh_test2
 ; CHECK: call double @acosh(double %conv)
 }
 
-define float @asin_test(float %f) nounwind readnone {
-; CHECK: asin_test
+define float @asin_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @asin(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: asin_test
 ; CHECK: call float @asinf(float %f)
 }
 
-define double @asin_test2(float %f) nounwind readnone {
-; CHECK: asin_test2
+define double @asin_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @asin(double %conv)
    ret double %call
+; CHECK-LABEL: asin_test2
 ; CHECK: call double @asin(double %conv)
 }
 
-define float @asinh_test(float %f) nounwind readnone {
-; CHECK: asinh_test
+define float @asinh_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @asinh(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: asinh_test
 ; CHECK: call float @asinhf(float %f)
 }
 
-define double @asinh_test2(float %f) nounwind readnone {
-; CHECK: asinh_test2
+define double @asinh_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @asinh(double %conv)
    ret double %call
+; CHECK-LABEL: asinh_test2
 ; CHECK: call double @asinh(double %conv)
 }
 
-define float @atan_test(float %f) nounwind readnone {
-; CHECK: atan_test
+define float @atan_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @atan(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: atan_test
 ; CHECK: call float @atanf(float %f)
 }
 
-define double @atan_test2(float %f) nounwind readnone {
-; CHECK: atan_test2
+define double @atan_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @atan(double %conv)
    ret double %call
+; CHECK-LABEL: atan_test2
 ; CHECK: call double @atan(double %conv)
 }
-define float @atanh_test(float %f) nounwind readnone {
-; CHECK: atanh_test
+define float @atanh_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @atanh(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: atanh_test
 ; CHECK: call float @atanhf(float %f)
 }
 
-define double @atanh_test2(float %f) nounwind readnone {
-; CHECK: atanh_test2
+define double @atanh_test2(float %f)   {
     %conv = fpext float %f to double
     %call = call double @atanh(double %conv)
     ret double %call
+; CHECK-LABEL: atanh_test2
 ; CHECK: call double @atanh(double %conv)
 }
-define float @cbrt_test(float %f) nounwind readnone {
-; CHECK: cbrt_test
+define float @cbrt_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @cbrt(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: cbrt_test
 ; CHECK: call float @cbrtf(float %f)
 }
 
-define double @cbrt_test2(float %f) nounwind readnone {
-; CHECK: cbrt_test2
+define double @cbrt_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @cbrt(double %conv)
    ret double %call
+; CHECK-LABEL: cbrt_test2
 ; CHECK: call double @cbrt(double %conv)
 }
-define float @exp_test(float %f) nounwind readnone {
-; CHECK: exp_test
+define float @exp_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @exp(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: exp_test
 ; CHECK: call float @expf(float %f)
 }
 
-define double @exp_test2(float %f) nounwind readnone {
-; CHECK: exp_test2
+define double @exp_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @exp(double %conv)
    ret double %call
+; CHECK-LABEL: exp_test2
 ; CHECK: call double @exp(double %conv)
 }
-define float @expm1_test(float %f) nounwind readnone {
-; CHECK: expm1_test
+define float @expm1_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @expm1(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: expm1_test
 ; CHECK: call float @expm1f(float %f)
 }
 
-define double @expm1_test2(float %f) nounwind readnone {
-; CHECK: expm1_test2
+define double @expm1_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @expm1(double %conv)
    ret double %call
+; CHECK-LABEL: expm1_test2
 ; CHECK: call double @expm1(double %conv)
 }
-define float @exp10_test(float %f) nounwind readnone {
-; CHECK: exp10_test
+define float @exp10_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @exp10(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
-; FIXME: Re-enable this when Linux allows transforming this again, or when we
-; can use builtin attributes to test the transform regardless of OS.
-; DISABLED-CHECK: call float @exp10f(float %f)
+; CHECK-LABEL: exp10_test
 ; CHECK: call double @exp10(double %conv)
 }
 
-define double @exp10_test2(float %f) nounwind readnone {
-; CHECK: exp10_test2
+define double @exp10_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @exp10(double %conv)
    ret double %call
+; CHECK-LABEL: exp10_test2
 ; CHECK: call double @exp10(double %conv)
 }
-define float @log_test(float %f) nounwind readnone {
-; CHECK: log_test
+define float @log_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @log(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: log_test
 ; CHECK: call float @logf(float %f)
 }
 
-define double @log_test2(float %f) nounwind readnone {
-; CHECK: log_test2
+define double @log_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @log(double %conv)
    ret double %call
+; CHECK-LABEL: log_test2
 ; CHECK: call double @log(double %conv)
 }
-define float @log10_test(float %f) nounwind readnone {
-; CHECK: log10_test
+define float @log10_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @log10(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: log10_test
 ; CHECK: call float @log10f(float %f)
 }
 
-define double @log10_test2(float %f) nounwind readnone {
-; CHECK: log10_test2
+define double @log10_test2(float %f) {
    %conv = fpext float %f to double
    %call = call double @log10(double %conv)
    ret double %call
+; CHECK-LABEL: log10_test2
 ; CHECK: call double @log10(double %conv)
 }
-define float @log1p_test(float %f) nounwind readnone {
-; CHECK: log1p_test
+define float @log1p_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @log1p(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: log1p_test
 ; CHECK: call float @log1pf(float %f)
 }
 
-define double @log1p_test2(float %f) nounwind readnone {
-; CHECK: log1p_test2
+define double @log1p_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @log1p(double %conv)
    ret double %call
+; CHECK-LABEL: log1p_test2
 ; CHECK: call double @log1p(double %conv)
 }
-define float @log2_test(float %f) nounwind readnone {
-; CHECK: log2_test
+define float @log2_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @log2(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: log2_test
 ; CHECK: call float @log2f(float %f)
 }
 
-define double @log2_test2(float %f) nounwind readnone {
-; CHECK: log2_test2
+define double @log2_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @log2(double %conv)
    ret double %call
+; CHECK-LABEL: log2_test2
 ; CHECK: call double @log2(double %conv)
 }
-define float @logb_test(float %f) nounwind readnone {
-; CHECK: logb_test
+define float @logb_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @logb(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: logb_test
 ; CHECK: call float @logbf(float %f)
 }
 
-define double @logb_test2(float %f) nounwind readnone {
-; CHECK: logb_test2
+define double @logb_test2(float %f)   {
    %conv = fpext float %f to double
    %call = call double @logb(double %conv)
    ret double %call
+; CHECK-LABEL: logb_test2
 ; CHECK: call double @logb(double %conv)
 }
-define float @sin_test(float %f) nounwind readnone {
-; CHECK: sin_test
+define float @sin_test(float %f)   {
    %conv = fpext float %f to double
    %call = call double @sin(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: sin_test
 ; CHECK: call float @sinf(float %f)
 }
 
-define double @sin_test2(float %f) nounwind readnone {
-; CHECK: sin_test2
+define double @sin_test2(float %f) {
    %conv = fpext float %f to double
    %call = call double @sin(double %conv)
    ret double %call
+; CHECK-LABEL: sin_test2
 ; CHECK: call double @sin(double %conv)
 }
 
-define float @sqrt_test(float %f) nounwind readnone {
-; CHECK: sqrt_test
+define float @sqrt_test(float %f) {
    %conv = fpext float %f to double
    %call = call double @sqrt(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: sqrt_test
 ; CHECK: call float @sqrtf(float %f)
 }
 
-define float @sqrt_int_test(float %f) nounwind readnone {
-; CHECK: sqrt_int_test
+define double @sqrt_test2(float %f) {
+   %conv = fpext float %f to double
+   %call = call double @sqrt(double %conv)
+   ret double %call
+; CHECK-LABEL: sqrt_test2
+; CHECK: call double @sqrt(double %conv)
+}
+
+define float @sqrt_int_test(float %f) {
    %conv = fpext float %f to double
    %call = call double @llvm.sqrt.f64(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: sqrt_int_test
 ; CHECK: call float @llvm.sqrt.f32(float %f)
 }
 
-define double @sqrt_test2(float %f) nounwind readnone {
-; CHECK: sqrt_test2
+define double @sqrt_int_test2(float %f) {
    %conv = fpext float %f to double
-   %call = call double @sqrt(double %conv)
+   %call = call double @llvm.sqrt.f64(double %conv)
    ret double %call
-; CHECK: call double @sqrt(double %conv)
+; CHECK-LABEL: sqrt_int_test2
+; CHECK: call double @llvm.sqrt.f64(double %conv)
 }
-define float @tan_test(float %f) nounwind readnone {
-; CHECK: tan_test
+
+define float @tan_test(float %f) {
    %conv = fpext float %f to double
    %call = call double @tan(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: tan_test
 ; CHECK: call float @tanf(float %f)
 }
 
-define double @tan_test2(float %f) nounwind readnone {
-; CHECK: tan_test2
+define double @tan_test2(float %f) {
    %conv = fpext float %f to double
    %call = call double @tan(double %conv)
    ret double %call
+; CHECK-LABEL: tan_test2
 ; CHECK: call double @tan(double %conv)
 }
-define float @tanh_test(float %f) nounwind readnone {
-; CHECK: tanh_test
+define float @tanh_test(float %f) {
    %conv = fpext float %f to double
    %call = call double @tanh(double %conv)
    %conv1 = fptrunc double %call to float
    ret float %conv1
+; CHECK-LABEL: tanh_test
 ; CHECK: call float @tanhf(float %f)
 }
 
-define double @tanh_test2(float %f) nounwind readnone {
-; CHECK: tanh_test2
+define double @tanh_test2(float %f) {
    %conv = fpext float %f to double
    %call = call double @tanh(double %conv)
    ret double %call
+; CHECK-LABEL: tanh_test2
 ; CHECK: call double @tanh(double %conv)
 }
 
-declare double @tanh(double) nounwind readnone
-declare double @tan(double) nounwind readnone
-declare double @sqrt(double) nounwind readnone
-declare double @sin(double) nounwind readnone
-declare double @log2(double) nounwind readnone
-declare double @log1p(double) nounwind readnone
-declare double @log10(double) nounwind readnone
-declare double @log(double) nounwind readnone
-declare double @logb(double) nounwind readnone
-declare double @exp10(double) nounwind readnone
-declare double @expm1(double) nounwind readnone
-declare double @exp(double) nounwind readnone
-declare double @cbrt(double) nounwind readnone
-declare double @atanh(double) nounwind readnone
-declare double @atan(double) nounwind readnone
-declare double @acos(double) nounwind readnone
-declare double @acosh(double) nounwind readnone
-declare double @asin(double) nounwind readnone
-declare double @asinh(double) nounwind readnone
+declare double @tanh(double) #1
+declare double @tan(double) #1
 
-declare double @llvm.sqrt.f64(double) nounwind readnone
+; sqrt is a special case: the shrinking optimization 
+; is valid even without unsafe-fp-math.
+declare double @sqrt(double) 
+declare double @llvm.sqrt.f64(double) 
+
+declare double @sin(double) #1
+declare double @log2(double) #1
+declare double @log1p(double) #1
+declare double @log10(double) #1
+declare double @log(double) #1
+declare double @logb(double) #1
+declare double @exp10(double) #1
+declare double @expm1(double) #1
+declare double @exp(double) #1
+declare double @cbrt(double) #1
+declare double @atanh(double) #1
+declare double @atan(double) #1
+declare double @acos(double) #1
+declare double @acosh(double) #1
+declare double @asin(double) #1
+declare double @asinh(double) #1
+
+attributes #1 = { "unsafe-fp-math"="true" }
 

diff --git a/test/Transforms/InstCombine/fabs.ll b/test/Transforms/InstCombine/fabs.ll
new file mode 100644
index 0000000..0479549
--- /dev/null
+++ b/test/Transforms/InstCombine/fabs.ll

@@ -0,0 +1,100 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Make sure all library calls are eliminated when the input is known positive.
+
+declare float @fabsf(float)
+declare double @fabs(double)
+declare fp128 @fabsl(fp128)
+
+define float @square_fabs_call_f32(float %x) {
+  %mul = fmul float %x, %x
+  %fabsf = tail call float @fabsf(float %mul)
+  ret float %fabsf
+
+; CHECK-LABEL: square_fabs_call_f32(
+; CHECK-NEXT: %mul = fmul float %x, %x
+; CHECK-NEXT: ret float %mul
+}
+
+define double @square_fabs_call_f64(double %x) {
+  %mul = fmul double %x, %x
+  %fabs = tail call double @fabs(double %mul)
+  ret double %fabs
+
+; CHECK-LABEL: square_fabs_call_f64(
+; CHECK-NEXT: %mul = fmul double %x, %x
+; CHECK-NEXT: ret double %mul
+}
+
+define fp128 @square_fabs_call_f128(fp128 %x) {
+  %mul = fmul fp128 %x, %x
+  %fabsl = tail call fp128 @fabsl(fp128 %mul)
+  ret fp128 %fabsl
+
+; CHECK-LABEL: square_fabs_call_f128(
+; CHECK-NEXT: %mul = fmul fp128 %x, %x
+; CHECK-NEXT: ret fp128 %mul
+}
+
+; Make sure all intrinsic calls are eliminated when the input is known positive.
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare fp128 @llvm.fabs.f128(fp128)
+
+define float @square_fabs_intrinsic_f32(float %x) {
+  %mul = fmul float %x, %x
+  %fabsf = tail call float @llvm.fabs.f32(float %mul)
+  ret float %fabsf
+
+; CHECK-LABEL: square_fabs_intrinsic_f32(
+; CHECK-NEXT: %mul = fmul float %x, %x
+; CHECK-NEXT: ret float %mul
+}
+
+define double @square_fabs_intrinsic_f64(double %x) {
+  %mul = fmul double %x, %x
+  %fabs = tail call double @llvm.fabs.f64(double %mul)
+  ret double %fabs
+
+; CHECK-LABEL: square_fabs_intrinsic_f64(
+; CHECK-NEXT: %mul = fmul double %x, %x
+; CHECK-NEXT: ret double %mul
+}
+
+define fp128 @square_fabs_intrinsic_f128(fp128 %x) {
+  %mul = fmul fp128 %x, %x
+  %fabsl = tail call fp128 @llvm.fabs.f128(fp128 %mul)
+  ret fp128 %fabsl
+
+; CHECK-LABEL: square_fabs_intrinsic_f128(
+; CHECK-NEXT: %mul = fmul fp128 %x, %x
+; CHECK-NEXT: ret fp128 %mul
+}
+
+; Shrinking a library call to a smaller type should not be inhibited by nor inhibit the square optimization.
+
+define float @square_fabs_shrink_call1(float %x) {
+  %ext = fpext float %x to double
+  %sq = fmul double %ext, %ext
+  %fabs = call double @fabs(double %sq)
+  %trunc = fptrunc double %fabs to float
+  ret float %trunc
+
+; CHECK-LABEL: square_fabs_shrink_call1(
+; CHECK-NEXT: %trunc = fmul float %x, %x
+; CHECK-NEXT: ret float %trunc
+}
+
+define float @square_fabs_shrink_call2(float %x) {
+  %sq = fmul float %x, %x
+  %ext = fpext float %sq to double
+  %fabs = call double @fabs(double %ext)
+  %trunc = fptrunc double %fabs to float
+  ret float %trunc
+
+; CHECK-LABEL: square_fabs_shrink_call2(
+; CHECK-NEXT: %sq = fmul float %x, %x
+; CHECK-NEXT: ret float %sq
+}
+

diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll
index 2ee4b0f..b0ec895 100644
--- a/test/Transforms/InstCombine/fast-math.ll
+++ b/test/Transforms/InstCombine/fast-math.ll

@@ -530,3 +530,173 @@
 ; CHECK: fact_div6
 ; CHECK: %t3 = fsub fast float %t1, %t2
 }
+
+; =========================================================================
+;
+;   Test-cases for square root
+;
+; =========================================================================
+
+; A squared factor fed into a square root intrinsic should be hoisted out
+; as a fabs() value.
+; We have to rely on a function-level attribute to enable this optimization
+; because intrinsics don't currently have access to IR-level fast-math
+; flags. If that changes, we can relax the requirement on all of these
+; tests to just specify 'fast' on the sqrt.
+
+attributes #0 = { "unsafe-fp-math" = "true" }
+
+declare double @llvm.sqrt.f64(double)
+
+define double @sqrt_intrinsic_arg_squared(double %x) #0 {
+  %mul = fmul fast double %x, %x
+  %sqrt = call double @llvm.sqrt.f64(double %mul)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_intrinsic_arg_squared(
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x)
+; CHECK-NEXT: ret double %fabs
+}
+
+; Check all 6 combinations of a 3-way multiplication tree where
+; one factor is repeated.
+
+define double @sqrt_intrinsic_three_args1(double %x, double %y) #0 {
+  %mul = fmul fast double %y, %x
+  %mul2 = fmul fast double %mul, %x
+  %sqrt = call double @llvm.sqrt.f64(double %mul2)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_intrinsic_three_args1(
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x)
+; CHECK-NEXT: %sqrt1 = call double @llvm.sqrt.f64(double %y)
+; CHECK-NEXT: %1 = fmul fast double %fabs, %sqrt1
+; CHECK-NEXT: ret double %1
+}
+
+define double @sqrt_intrinsic_three_args2(double %x, double %y) #0 {
+  %mul = fmul fast double %x, %y
+  %mul2 = fmul fast double %mul, %x
+  %sqrt = call double @llvm.sqrt.f64(double %mul2)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_intrinsic_three_args2(
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x)
+; CHECK-NEXT: %sqrt1 = call double @llvm.sqrt.f64(double %y)
+; CHECK-NEXT: %1 = fmul fast double %fabs, %sqrt1
+; CHECK-NEXT: ret double %1
+}
+
+define double @sqrt_intrinsic_three_args3(double %x, double %y) #0 {
+  %mul = fmul fast double %x, %x
+  %mul2 = fmul fast double %mul, %y
+  %sqrt = call double @llvm.sqrt.f64(double %mul2)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_intrinsic_three_args3(
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x)
+; CHECK-NEXT: %sqrt1 = call double @llvm.sqrt.f64(double %y)
+; CHECK-NEXT: %1 = fmul fast double %fabs, %sqrt1
+; CHECK-NEXT: ret double %1
+}
+
+define double @sqrt_intrinsic_three_args4(double %x, double %y) #0 {
+  %mul = fmul fast double %y, %x
+  %mul2 = fmul fast double %x, %mul
+  %sqrt = call double @llvm.sqrt.f64(double %mul2)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_intrinsic_three_args4(
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x)
+; CHECK-NEXT: %sqrt1 = call double @llvm.sqrt.f64(double %y)
+; CHECK-NEXT: %1 = fmul fast double %fabs, %sqrt1
+; CHECK-NEXT: ret double %1
+}
+
+define double @sqrt_intrinsic_three_args5(double %x, double %y) #0 {
+  %mul = fmul fast double %x, %y
+  %mul2 = fmul fast double %x, %mul
+  %sqrt = call double @llvm.sqrt.f64(double %mul2)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_intrinsic_three_args5(
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x)
+; CHECK-NEXT: %sqrt1 = call double @llvm.sqrt.f64(double %y)
+; CHECK-NEXT: %1 = fmul fast double %fabs, %sqrt1
+; CHECK-NEXT: ret double %1
+}
+
+define double @sqrt_intrinsic_three_args6(double %x, double %y) #0 {
+  %mul = fmul fast double %x, %x
+  %mul2 = fmul fast double %y, %mul
+  %sqrt = call double @llvm.sqrt.f64(double %mul2)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_intrinsic_three_args6(
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x)
+; CHECK-NEXT: %sqrt1 = call double @llvm.sqrt.f64(double %y)
+; CHECK-NEXT: %1 = fmul fast double %fabs, %sqrt1
+; CHECK-NEXT: ret double %1
+}
+
+define double @sqrt_intrinsic_arg_4th(double %x) #0 {
+  %mul = fmul fast double %x, %x
+  %mul2 = fmul fast double %mul, %mul
+  %sqrt = call double @llvm.sqrt.f64(double %mul2)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_intrinsic_arg_4th(
+; CHECK-NEXT: %mul = fmul fast double %x, %x
+; CHECK-NEXT: ret double %mul
+}
+
+define double @sqrt_intrinsic_arg_5th(double %x) #0 {
+  %mul = fmul fast double %x, %x
+  %mul2 = fmul fast double %mul, %x
+  %mul3 = fmul fast double %mul2, %mul
+  %sqrt = call double @llvm.sqrt.f64(double %mul3)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_intrinsic_arg_5th(
+; CHECK-NEXT: %mul = fmul fast double %x, %x
+; CHECK-NEXT: %sqrt1 = call double @llvm.sqrt.f64(double %x)
+; CHECK-NEXT: %1 = fmul fast double %mul, %sqrt1
+; CHECK-NEXT: ret double %1
+}
+
+; Check that square root calls have the same behavior.
+
+declare float @sqrtf(float)
+declare double @sqrt(double)
+declare fp128 @sqrtl(fp128)
+
+define float @sqrt_call_squared_f32(float %x) #0 {
+  %mul = fmul fast float %x, %x
+  %sqrt = call float @sqrtf(float %mul)
+  ret float %sqrt
+
+; CHECK-LABEL: sqrt_call_squared_f32(
+; CHECK-NEXT: %fabs = call float @llvm.fabs.f32(float %x)
+; CHECK-NEXT: ret float %fabs
+}
+
+define double @sqrt_call_squared_f64(double %x) #0 {
+  %mul = fmul fast double %x, %x
+  %sqrt = call double @sqrt(double %mul)
+  ret double %sqrt
+
+; CHECK-LABEL: sqrt_call_squared_f64(
+; CHECK-NEXT: %fabs = call double @llvm.fabs.f64(double %x)
+; CHECK-NEXT: ret double %fabs
+}
+
+define fp128 @sqrt_call_squared_f128(fp128 %x) #0 {
+  %mul = fmul fast fp128 %x, %x
+  %sqrt = call fp128 @sqrtl(fp128 %mul)
+  ret fp128 %sqrt
+
+; CHECK-LABEL: sqrt_call_squared_f128(
+; CHECK-NEXT: %fabs = call fp128 @llvm.fabs.f128(fp128 %x)
+; CHECK-NEXT: ret fp128 %fabs
+}
+

diff --git a/test/Transforms/InstCombine/fmul.ll b/test/Transforms/InstCombine/fmul.ll
index 18cbf9d..a776765 100644
--- a/test/Transforms/InstCombine/fmul.ll
+++ b/test/Transforms/InstCombine/fmul.ll

@@ -123,3 +123,32 @@
 ; CHECK-NOT: fadd float
 ; CHECK: fadd fast float
 }
+
+; PR21126: http://llvm.org/bugs/show_bug.cgi?id=21126
+; With unsafe/fast math, sqrt(X) * sqrt(X) is just X.
+declare double @llvm.sqrt.f64(double)
+
+define double @sqrt_squared1(double %f) {
+  %sqrt = call double @llvm.sqrt.f64(double %f)
+  %mul = fmul fast double %sqrt, %sqrt
+  ret double %mul
+; CHECK-LABEL: @sqrt_squared1(
+; CHECK-NEXT: ret double %f
+}
+
+; With unsafe/fast math, sqrt(X) * sqrt(X) is just X,
+; but make sure that another use of the sqrt is left intact.
+; Note that the remaining fmul is altered but is not 'fast'
+; itself because it was not marked 'fast' originally.
+; Thus, we have an overall fast result, but no remaining indication
+; of 'fast'ness in the code.
+define double @sqrt_squared2(double %f) {
+  %sqrt = call double @llvm.sqrt.f64(double %f)
+  %mul1 = fmul fast double %sqrt, %sqrt
+  %mul2 = fmul double %mul1, %sqrt
+  ret double %mul2
+; CHECK-LABEL: @sqrt_squared2(
+; CHECK-NEXT: %sqrt = call double @llvm.sqrt.f64(double %f)
+; CHECK-NEXT: %mul2 = fmul double %sqrt, %f
+; CHECK-NEXT: ret double %mul2
+}

diff --git a/test/Transforms/InstCombine/fold-phi.ll b/test/Transforms/InstCombine/fold-phi.ll
index bd01d58..c6bb1b3 100644
--- a/test/Transforms/InstCombine/fold-phi.ll
+++ b/test/Transforms/InstCombine/fold-phi.ll

@@ -17,23 +17,23 @@
   ret float %add5
 }
 
-; CHECK: fold_phi
-define float @fold_phi(float %a) nounwind {
+; CHECK-LABEL: @pr21377(
+define void @pr21377(i32) {
 entry:
-  br label %for.body
+  br label %while.body
 
-for.body:
-; CHECK: phi float
-; CHECK-NEXT: br i1 undef
-  %sum.057 = phi float [ 0.000000e+00, %entry ], [ %add5, %bb0 ]
-  %add5 = fadd float %sum.057, 1.0 ;; Should be moved to the latch!
-  br i1 undef, label %bb0, label %end
+while.body:                                       ; preds = %if.end, %entry
+  %phi1 = phi i64 [ undef, %entry ], [ %or2, %if.end ]
+  %zext = zext i32 %0 to i64
+  br i1 undef, label %if.end, label %if.else
 
-; CHECK: bb0:
-bb0:
-; CHECK: fadd float
-  br label %for.body
+if.else:                                          ; preds = %while.body
+  %or1 = or i64 %phi1, %zext
+  %and = and i64 %or1, 4294967295
+  br label %if.end
 
-end:
-  ret float %add5
+if.end:                                           ; preds = %if.else, %while.body
+  %phi2 = phi i64 [ %and, %if.else ], [ undef, %while.body ]
+  %or2 = or i64 %phi2, %zext
+  br label %while.body
 }

diff --git a/test/Transforms/InstCombine/fpcast.ll b/test/Transforms/InstCombine/fpcast.ll
index 9be66fd..ac03402 100644
--- a/test/Transforms/InstCombine/fpcast.ll
+++ b/test/Transforms/InstCombine/fpcast.ll

@@ -53,3 +53,23 @@
 }
 
 declare float @llvm.fabs.f32(float) nounwind readonly
+
+define <1 x float> @test6(<1 x double> %V) { ; the frem + fptrunc pair is expected to come out unchanged
+  %frem = frem <1 x double> %V, %V
+  %trunc = fptrunc <1 x double> %frem to <1 x float>
+  ret <1 x float> %trunc
+; CHECK-LABEL: @test6(
+; CHECK-NEXT: %[[frem:.*]]  = frem <1 x double> %V, %V
+; CHECK-NEXT: %[[trunc:.*]] = fptrunc <1 x double> %[[frem]] to <1 x float>
+; CHECK-NEXT: ret <1 x float> %[[trunc]]
+}
+
+define float @test7(double %V) { ; the frem + fptrunc pair is expected to come out unchanged
+  %frem = frem double %V, 1.000000e+00
+  %trunc = fptrunc double %frem to float
+  ret float %trunc
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: %[[frem:.*]]  = frem double %V, 1.000000e+00
+; CHECK-NEXT: %[[trunc:.*]] = fptrunc double %[[frem]] to float
+; CHECK-NEXT: ret float %[[trunc]]
+}

diff --git a/test/Transforms/InstCombine/getelementptr.ll b/test/Transforms/InstCombine/getelementptr.ll
index 3240c6d..bb46662 100644
--- a/test/Transforms/InstCombine/getelementptr.ll
+++ b/test/Transforms/InstCombine/getelementptr.ll

@@ -6,6 +6,7 @@
 %pair = type { i32, i32 }
 %struct.B = type { double }
 %struct.A = type { %struct.B, i32, i32 }
+%struct.C = type { [7 x i8] }
 
 
 @Global = constant [10 x i8] c"helloworld"
@@ -580,6 +581,16 @@
   ret i32 addrspace(1)* %C
 }
 
+define i32 addrspace(1)* @test33_addrspacecast(%struct.Key* %A) {
+; CHECK-LABEL: @test33_addrspacecast(
+; CHECK: %C = getelementptr %struct.Key* %A, i64 0, i32 0, i32 1
+; CHECK-NEXT: addrspacecast i32* %C to i32 addrspace(1)*
+; CHECK-NEXT: ret
+  %B = addrspacecast %struct.Key* %A to %struct.anon addrspace(1)*
+  %C = getelementptr %struct.anon addrspace(1)* %B, i32 0, i32 2
+  ret i32 addrspace(1)* %C
+}
+
 	%T2 = type { i8*, i8 }
 define i8* @test34(i8* %Val, i64 %V) nounwind {
 entry:
@@ -692,7 +703,7 @@
 
 ; CHECK-LABEL: @test39(
 ; CHECK: getelementptr inbounds %struct.ham* %arg, i64 0, i32 2
-; CHECK: getelementptr inbounds i8* %tmp3, i64 -8
+; CHECK: getelementptr inbounds i8* %{{.+}}, i64 -8
 }
 
 define i1 @pr16483([1 x i8]* %a, [1 x i8]* %b) {
@@ -803,6 +814,78 @@
 ; CHECK-NEXT: ret i16 8
 }
 
+define i8* @test42(i8* %c1, i8* %c2) {
+  %ptrtoint = ptrtoint i8* %c1 to i64
+  %sub = sub i64 0, %ptrtoint
+  %gep = getelementptr inbounds i8* %c2, i64 %sub
+  ret i8* %gep
+
+; CHECK-LABEL: @test42(
+; CHECK-NEXT:  [[PTRTOINT1:%.*]] = ptrtoint i8* %c1 to i64
+; CHECK-NEXT:  [[PTRTOINT2:%.*]] = ptrtoint i8* %c2 to i64
+; CHECK-NEXT:  [[SUB:%.*]] = sub i64 [[PTRTOINT2]], [[PTRTOINT1]]
+; CHECK-NEXT:  [[INTTOPTR:%.*]] = inttoptr i64 [[SUB]] to i8*
+; CHECK-NEXT:  ret i8* [[INTTOPTR]]
+}
+
+define i16* @test43(i16* %c1, i16* %c2) {
+  %ptrtoint = ptrtoint i16* %c1 to i64
+  %sub = sub i64 0, %ptrtoint
+  %shr = ashr i64 %sub, 1
+  %gep = getelementptr inbounds i16* %c2, i64 %shr
+  ret i16* %gep
+
+; CHECK-LABEL: @test43(
+; CHECK-NEXT:  [[PTRTOINT1:%.*]] = ptrtoint i16* %c1 to i64
+; CHECK-NEXT:  [[PTRTOINT2:%.*]] = ptrtoint i16* %c2 to i64
+; CHECK-NEXT:  [[SUB:%.*]] = sub i64 [[PTRTOINT2]], [[PTRTOINT1]]
+; CHECK-NEXT:  [[INTTOPTR:%.*]] = inttoptr i64 [[SUB]] to i16*
+; CHECK-NEXT:  ret i16* [[INTTOPTR]]
+}
+
+define %struct.C* @test44(%struct.C* %c1, %struct.C* %c2) { ; index is -c1 divided by 7, the size of %struct.C
+  %c1.int = ptrtoint %struct.C* %c1 to i64
+  %neg = sub i64 0, %c1.int
+  %idx = sdiv i64 %neg, 7
+  %res = getelementptr inbounds %struct.C* %c2, i64 %idx
+  ret %struct.C* %res
+
+; CHECK-LABEL: @test44(
+; CHECK-NEXT:  [[PTRTOINT1:%.*]] = ptrtoint %struct.C* %c1 to i64
+; CHECK-NEXT:  [[PTRTOINT2:%.*]] = ptrtoint %struct.C* %c2 to i64
+; CHECK-NEXT:  [[SUB:%.*]] = sub i64 [[PTRTOINT2]], [[PTRTOINT1]]
+; CHECK-NEXT:  [[INTTOPTR:%.*]] = inttoptr i64 [[SUB]] to %struct.C*
+; CHECK-NEXT:  ret %struct.C* [[INTTOPTR]]
+}
+
+define %struct.C* @test45(%struct.C* %c1, %struct.C** %c2) {
+  %c1.int = ptrtoint %struct.C* %c1 to i64
+  %c2.int = ptrtoint %struct.C** %c2 to i64
+  %diff = sub i64 %c2.int, %c1.int ; byte distance C2 - C1
+  %idx = sdiv i64 %diff, 7 ; distance in units of %struct.C (7 bytes)
+  %res = getelementptr inbounds %struct.C* %c1, i64 %idx ; C1 + (C2 - C1)
+  ret %struct.C* %res
+
+; CHECK-LABEL: @test45(
+; CHECK-NEXT:  [[BITCAST:%.*]] = bitcast %struct.C** %c2 to %struct.C*
+; CHECK-NEXT:  ret %struct.C* [[BITCAST]]
+}
+
+define %struct.C* @test46(%struct.C* %c1, %struct.C* %c2, i64 %N) { ; divisor is a variable: the sequence is expected to stay as written
+  %ptrtoint = ptrtoint %struct.C* %c1 to i64
+  %sub = sub i64 0, %ptrtoint
+  %sdiv = sdiv i64 %sub, %N
+  %gep = getelementptr inbounds %struct.C* %c2, i64 %sdiv
+  ret %struct.C* %gep
+
+; CHECK-LABEL: @test46(
+; CHECK-NEXT:  [[PTRTOINT:%.*]] = ptrtoint %struct.C* %c1 to i64
+; CHECK-NEXT:  [[SUB:%.*]] = sub i64 0, [[PTRTOINT]]
+; CHECK-NEXT:  [[SDIV:%.*]] = sdiv i64 [[SUB]], %N
+; CHECK-NEXT:  [[GEP:%.*]] = getelementptr inbounds %struct.C* %c2, i64 [[SDIV]]
+; CHECK-NEXT:  ret %struct.C* [[GEP]]
+}
+
 define i32 addrspace(1)* @ascast_0_gep(i32* %p) nounwind {
 ; CHECK-LABEL: @ascast_0_gep(
 ; CHECK-NOT: getelementptr

diff --git a/test/Transforms/InstCombine/icmp-logical.ll b/test/Transforms/InstCombine/icmp-logical.ll
index d5d8cbc..faae201 100644
--- a/test/Transforms/InstCombine/icmp-logical.ll
+++ b/test/Transforms/InstCombine/icmp-logical.ll

@@ -150,3 +150,23 @@
   %val = or i1 %tst1, %tst2
   ret i1 %val
 }
+
+define i1 @fold_mask_cmps_to_false(i32 %x) {
+; CHECK-LABEL: @fold_mask_cmps_to_false
+; CHECK: ret i1 false
+  %1 = and i32 %x, 2147483647
+  %2 = icmp eq i32 %1, 0
+  %3 = icmp eq i32 %x, 2147483647
+  %4 = and i1 %3, %2
+  ret i1 %4
+}
+
+define i1 @fold_mask_cmps_to_true(i32 %x) {
+; CHECK-LABEL: @fold_mask_cmps_to_true
+; CHECK: ret i1 true
+  %1 = and i32 %x, 2147483647
+  %2 = icmp ne i32 %1, 0
+  %3 = icmp ne i32 %x, 2147483647
+  %4 = or i1 %3, %2
+  ret i1 %4
+}

diff --git a/test/Transforms/InstCombine/icmp-range.ll b/test/Transforms/InstCombine/icmp-range.ll
new file mode 100644
index 0000000..97d231f
--- /dev/null
+++ b/test/Transforms/InstCombine/icmp-range.ll

@@ -0,0 +1,61 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; These should be InstSimplify checks, but most of the supporting code
+; currently lives only in InstCombine.  TODO: move it to InstSimplify.
+
+; Definitely out of range
+define i1 @test_nonzero(i32* nocapture readonly %arg) {
+; CHECK-LABEL: @test_nonzero(
+; CHECK: ret i1 true
+  %val = load i32* %arg, !range !0 ; !0 is [1, 6), so %val is never 0
+  %rval = icmp ne i32 %val, 0
+  ret i1 %rval
+}
+define i1 @test_nonzero2(i32* nocapture readonly %arg) {
+; CHECK-LABEL: @test_nonzero2(
+; CHECK: ret i1 false
+  %val = load i32* %arg, !range !0 ; !0 is [1, 6), so %val is never 0
+  %rval = icmp eq i32 %val, 0
+  ret i1 %rval
+}
+
+; Potentially in range
+define i1 @test_nonzero3(i32* nocapture readonly %arg) {
+; CHECK-LABEL: @test_nonzero3(
+; Check that this does not trigger - it wouldn't be legal
+; CHECK: icmp
+  %val = load i32* %arg, !range !1 ; !1 is [0, 6), so %val may be 0
+  %rval = icmp ne i32 %val, 0
+  ret i1 %rval
+}
+
+; Definitely in range
+define i1 @test_nonzero4(i8* nocapture readonly %arg) {
+; CHECK-LABEL: @test_nonzero4(
+; CHECK: ret i1 false
+  %val = load i8* %arg, !range !2 ; !2 is [0, 1), so %val is always 0
+  %rval = icmp ne i8 %val, 0
+  ret i1 %rval
+}
+
+define i1 @test_nonzero5(i8* nocapture readonly %arg) {
+; CHECK-LABEL: @test_nonzero5(
+; CHECK: ret i1 false
+  %val = load i8* %arg, !range !2 ; !2 is [0, 1), so %val is always 0
+  %rval = icmp ugt i8 %val, 0
+  ret i1 %rval
+}
+
+; Cheaper checks: every value in the range is non-negative, so 'sgt 0' can become 'ne 0'
+define i1 @test_nonzero6(i8* %argw) {
+; CHECK-LABEL: @test_nonzero6(
+; CHECK: icmp ne i8 %val, 0
+  %val = load i8* %argw, !range !3 ; !3 is [0, 6): non-negative, but may be 0
+  %rval = icmp sgt i8 %val, 0
+  ret i1 %rval
+}
+
+
+!0 = metadata !{i32 1, i32 6} ; half-open range [1, 6): never 0
+!1 = metadata !{i32 0, i32 6} ; half-open range [0, 6): may be 0
+!2 = metadata !{i8 0, i8 1} ; half-open range [0, 1): always 0
+!3 = metadata !{i8 0, i8 6} ; half-open range [0, 6): non-negative, may be 0

diff --git a/test/Transforms/InstCombine/icmp-shr.ll b/test/Transforms/InstCombine/icmp-shr.ll
new file mode 100644
index 0000000..52414b9
--- /dev/null
+++ b/test/Transforms/InstCombine/icmp-shr.ll

@@ -0,0 +1,378 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-p1:16:16:16-p2:32:32:32-p3:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; CHECK-LABEL: @lshr_eq_msb_low_last_zero
+; CHECK-NEXT: icmp ugt i8 %a, 6
+define i1 @lshr_eq_msb_low_last_zero(i8 %a) {
+ %shr = lshr i8 127, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_eq_msb_low_second_zero
+; CHECK-NEXT: icmp ugt i8 %a, 6
+define i1 @ashr_eq_msb_low_second_zero(i8 %a) {
+ %shr = ashr i8 127, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @lshr_ne_msb_low_last_zero
+; CHECK-NEXT: icmp ult i8 %a, 7
+define i1 @lshr_ne_msb_low_last_zero(i8 %a) {
+ %shr = lshr i8 127, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_ne_msb_low_second_zero
+; CHECK-NEXT: icmp ult i8 %a, 7
+define i1 @ashr_ne_msb_low_second_zero(i8 %a) {
+ %shr = ashr i8 127, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_eq_both_equal
+; CHECK-NEXT: icmp eq i8 %a, 0
+define i1 @ashr_eq_both_equal(i8 %a) {
+ %shr = ashr i8 128, %a
+ %cmp = icmp eq i8 %shr, 128
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_ne_both_equal
+; CHECK-NEXT: icmp ne i8 %a, 0
+define i1 @ashr_ne_both_equal(i8 %a) {
+ %shr = ashr i8 128, %a
+ %cmp = icmp ne i8 %shr, 128
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @lshr_eq_both_equal
+; CHECK-NEXT: icmp eq i8 %a, 0
+define i1 @lshr_eq_both_equal(i8 %a) {
+ %shr = lshr i8 127, %a
+ %cmp = icmp eq i8 %shr, 127
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @lshr_ne_both_equal
+; CHECK-NEXT: icmp ne i8 %a, 0
+define i1 @lshr_ne_both_equal(i8 %a) {
+ %shr = lshr i8 127, %a
+ %cmp = icmp ne i8 %shr, 127
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq_both_equal
+; CHECK-NEXT: icmp eq i8 %a, 0
+define i1 @exact_ashr_eq_both_equal(i8 %a) {
+ %shr = ashr exact i8 128, %a
+ %cmp = icmp eq i8 %shr, 128
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne_both_equal
+; CHECK-NEXT: icmp ne i8 %a, 0
+define i1 @exact_ashr_ne_both_equal(i8 %a) {
+ %shr = ashr exact i8 128, %a
+ %cmp = icmp ne i8 %shr, 128
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_eq_both_equal
+; CHECK-NEXT: icmp eq i8 %a, 0
+define i1 @exact_lshr_eq_both_equal(i8 %a) {
+ %shr = lshr exact i8 126, %a
+ %cmp = icmp eq i8 %shr, 126
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_ne_both_equal
+; CHECK-NEXT: icmp ne i8 %a, 0
+define i1 @exact_lshr_ne_both_equal(i8 %a) {
+ %shr = lshr exact i8 126, %a
+ %cmp = icmp ne i8 %shr, 126
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_eq_opposite_msb
+; CHECK-NEXT: icmp eq i8 %a, 7
+define i1 @exact_lshr_eq_opposite_msb(i8 %a) {
+ %shr = lshr exact i8 -128, %a
+ %cmp = icmp eq i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @lshr_eq_opposite_msb
+; CHECK-NEXT: icmp eq i8 %a, 7
+define i1 @lshr_eq_opposite_msb(i8 %a) {
+ %shr = lshr i8 -128, %a
+ %cmp = icmp eq i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_ne_opposite_msb
+; CHECK-NEXT: icmp ne i8 %a, 7
+define i1 @exact_lshr_ne_opposite_msb(i8 %a) {
+ %shr = lshr exact i8 -128, %a
+ %cmp = icmp ne i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @lshr_ne_opposite_msb
+; CHECK-NEXT: icmp ne i8 %a, 7
+define i1 @lshr_ne_opposite_msb(i8 %a) {
+ %shr = lshr i8 -128, %a
+ %cmp = icmp ne i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq(
+; CHECK-NEXT: icmp eq i8 %a, 7
+define i1 @exact_ashr_eq(i8 %a) {
+ %shr = ashr exact i8 -128, %a ; -128 >> 7 == -1
+ %cmp = icmp eq i8 %shr, -1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne(
+; CHECK-NEXT: icmp ne i8 %a, 7
+define i1 @exact_ashr_ne(i8 %a) {
+ %shr = ashr exact i8 -128, %a ; -128 >> 7 == -1
+ %cmp = icmp ne i8 %shr, -1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_eq(
+; CHECK-NEXT: icmp eq i8 %a, 2
+define i1 @exact_lshr_eq(i8 %a) {
+ %shr = lshr exact i8 4, %a ; 4 >> 2 == 1
+ %cmp = icmp eq i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_ne(
+; CHECK-NEXT: icmp ne i8 %a, 2
+define i1 @exact_lshr_ne(i8 %a) {
+ %shr = lshr exact i8 4, %a ; 4 >> 2 == 1
+ %cmp = icmp ne i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_eq(
+; CHECK-NEXT: icmp eq i8 %a, 7
+define i1 @nonexact_ashr_eq(i8 %a) {
+ %shr = ashr i8 -128, %a ; -128 >> 7 == -1
+ %cmp = icmp eq i8 %shr, -1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_ne(
+; CHECK-NEXT: icmp ne i8 %a, 7
+define i1 @nonexact_ashr_ne(i8 %a) {
+ %shr = ashr i8 -128, %a ; -128 >> 7 == -1
+ %cmp = icmp ne i8 %shr, -1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_eq(
+; CHECK-NEXT: icmp eq i8 %a, 2
+define i1 @nonexact_lshr_eq(i8 %a) {
+ %shr = lshr i8 4, %a ; 4 >> 2 == 1
+ %cmp = icmp eq i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_ne(
+; CHECK-NEXT: icmp ne i8 %a, 2
+define i1 @nonexact_lshr_ne(i8 %a) {
+ %shr = lshr i8 4, %a ; 4 >> 2 == 1
+ %cmp = icmp ne i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_eq_exactdiv
+; CHECK-NEXT: icmp eq i8 %a, 4
+define i1 @exact_lshr_eq_exactdiv(i8 %a) {
+ %shr = lshr exact i8 80, %a
+ %cmp = icmp eq i8 %shr, 5
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_ne_exactdiv
+; CHECK-NEXT: icmp ne i8 %a, 4
+define i1 @exact_lshr_ne_exactdiv(i8 %a) {
+ %shr = lshr exact i8 80, %a
+ %cmp = icmp ne i8 %shr, 5
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_eq_exactdiv
+; CHECK-NEXT: icmp eq i8 %a, 4
+define i1 @nonexact_lshr_eq_exactdiv(i8 %a) {
+ %shr = lshr i8 80, %a
+ %cmp = icmp eq i8 %shr, 5
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_ne_exactdiv
+; CHECK-NEXT: icmp ne i8 %a, 4
+define i1 @nonexact_lshr_ne_exactdiv(i8 %a) {
+ %shr = lshr i8 80, %a
+ %cmp = icmp ne i8 %shr, 5
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq_exactdiv
+; CHECK-NEXT: icmp eq i8 %a, 4
+define i1 @exact_ashr_eq_exactdiv(i8 %a) {
+ %shr = ashr exact i8 -80, %a
+ %cmp = icmp eq i8 %shr, -5
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne_exactdiv
+; CHECK-NEXT: icmp ne i8 %a, 4
+define i1 @exact_ashr_ne_exactdiv(i8 %a) {
+ %shr = ashr exact i8 -80, %a
+ %cmp = icmp ne i8 %shr, -5
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_eq_exactdiv
+; CHECK-NEXT: icmp eq i8 %a, 4
+define i1 @nonexact_ashr_eq_exactdiv(i8 %a) {
+ %shr = ashr i8 -80, %a
+ %cmp = icmp eq i8 %shr, -5
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_ne_exactdiv
+; CHECK-NEXT: icmp ne i8 %a, 4
+define i1 @nonexact_ashr_ne_exactdiv(i8 %a) {
+ %shr = ashr i8 -80, %a
+ %cmp = icmp ne i8 %shr, -5
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_eq_noexactdiv
+; CHECK-NEXT: ret i1 false
+define i1 @exact_lshr_eq_noexactdiv(i8 %a) {
+ %shr = lshr exact i8 80, %a
+ %cmp = icmp eq i8 %shr, 31
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_ne_noexactdiv
+; CHECK-NEXT: ret i1 true
+define i1 @exact_lshr_ne_noexactdiv(i8 %a) {
+ %shr = lshr exact i8 80, %a
+ %cmp = icmp ne i8 %shr, 31
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_eq_noexactdiv
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_lshr_eq_noexactdiv(i8 %a) {
+ %shr = lshr i8 80, %a
+ %cmp = icmp eq i8 %shr, 31
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_ne_noexactdiv
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_lshr_ne_noexactdiv(i8 %a) {
+ %shr = lshr i8 80, %a
+ %cmp = icmp ne i8 %shr, 31
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq_noexactdiv
+; CHECK-NEXT: ret i1 false
+define i1 @exact_ashr_eq_noexactdiv(i8 %a) {
+ %shr = ashr exact i8 -80, %a
+ %cmp = icmp eq i8 %shr, -31
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne_noexactdiv
+; CHECK-NEXT: ret i1 true
+define i1 @exact_ashr_ne_noexactdiv(i8 %a) {
+ %shr = ashr exact i8 -80, %a
+ %cmp = icmp ne i8 %shr, -31
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_eq_noexactdiv
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_ashr_eq_noexactdiv(i8 %a) {
+ %shr = ashr i8 -80, %a
+ %cmp = icmp eq i8 %shr, -31
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_ne_noexactdiv
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_ashr_ne_noexactdiv(i8 %a) {
+ %shr = ashr i8 -80, %a
+ %cmp = icmp ne i8 %shr, -31
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_eq_noexactlog
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_lshr_eq_noexactlog(i8 %a) {
+ %shr = lshr i8 90, %a
+ %cmp = icmp eq i8 %shr, 30
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_ne_noexactlog
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_lshr_ne_noexactlog(i8 %a) {
+ %shr = lshr i8 90, %a
+ %cmp = icmp ne i8 %shr, 30
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_eq_noexactlog
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_ashr_eq_noexactlog(i8 %a) {
+ %shr = ashr i8 -90, %a
+ %cmp = icmp eq i8 %shr, -30
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_ne_noexactlog
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_ashr_ne_noexactlog(i8 %a) {
+ %shr = ashr i8 -90, %a
+ %cmp = icmp ne i8 %shr, -30
+ ret i1 %cmp
+}
+
+; Don't try to fold the entire body of function @PR20945 into a
+; single `ret i1 true` statement.
+; If %B is equal to 1, then this function would return false.
+; As a consequence, the instruction combiner is not allowed to fold %cmp
+; to 'true'. Instead, it should replace %cmp with a simpler comparison
+; between %B and 1.
+
+; CHECK-LABEL: @PR20945(
+; CHECK: icmp ne i32 %B, 1
+define i1 @PR20945(i32 %B) {
+  %shr = ashr i32 -9, %B
+  %cmp = icmp ne i32 %shr, -5
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @PR21222(
+; CHECK: icmp eq i32 %B, 6
+define i1 @PR21222(i32 %B) {
+  %shr = ashr i32 -93, %B ; -93 >> 6 == -2
+  %cmp = icmp eq i32 %shr, -2
+  ret i1 %cmp
+}

diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 26e144f..279d86d 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll

@@ -1148,22 +1148,6 @@
   ret i1 %cmp
 }
 
-; CHECK-LABEL: @icmp_shl_1_V_eq_31(
-; CHECK-NEXT: ret i1 false
-define i1 @icmp_shl_1_V_eq_31(i32 %V) {
-  %shl = shl i32 1, %V
-  %cmp = icmp eq i32 %shl, 31
-  ret i1 %cmp
-}
-
-; CHECK-LABEL: @icmp_shl_1_V_ne_31(
-; CHECK-NEXT: ret i1 true
-define i1 @icmp_shl_1_V_ne_31(i32 %V) {
-  %shl = shl i32 1, %V
-  %cmp = icmp ne i32 %shl, 31
-  ret i1 %cmp
-}
-
 ; CHECK-LABEL: @icmp_shl_1_V_ult_30(
 ; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ult i32 %V, 5
 ; CHECK-NEXT: ret i1 [[CMP]]
@@ -1209,22 +1193,6 @@
   ret i1 %cmp
 }
 
-; CHECK-LABEL: @icmp_shl_1_V_ugt_2147483648(
-; CHECK-NEXT: ret i1 false
-define i1 @icmp_shl_1_V_ugt_2147483648(i32 %V) {
-  %shl = shl i32 1, %V
-  %cmp = icmp ugt i32 %shl, 2147483648
-  ret i1 %cmp
-}
-
-; CHECK-LABEL: @icmp_shl_1_V_ule_2147483648(
-; CHECK-NEXT: ret i1 true
-define i1 @icmp_shl_1_V_ule_2147483648(i32 %V) {
-  %shl = shl i32 1, %V
-  %cmp = icmp ule i32 %shl, 2147483648
-  ret i1 %cmp
-}
-
 ; CHECK-LABEL: @icmp_shl_1_V_ult_2147483648(
 ; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ne i32 %V, 31
 ; CHECK-NEXT: ret i1 [[CMP]]
@@ -1424,3 +1392,133 @@
   %2 = icmp slt i32 %1, -10
   ret i1 %2
 }
+
+; CHECK-LABEL: @icmp_and_or_lshr(
+; CHECK-NEXT: [[SHL:%[a-z0-9]+]] = shl nuw i32 1, %y
+; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 [[SHL]], 1
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 [[OR]], %x
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+define i1 @icmp_and_or_lshr(i32 %x, i32 %y) { ; bit 0 of ((x >> y) | x) is rewritten as x & ((1 << y) | 1)
+  %shf = lshr i32 %x, %y
+  %or = or i32 %shf, %x
+  %and = and i32 %or, 1
+  %ret = icmp ne i32 %and, 0
+  ret i1 %ret
+}
+
+; CHECK-LABEL: @icmp_and_or_lshr_cst
+; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 %x, 3
+; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT: ret i1 [[CMP]]
+define i1 @icmp_and_or_lshr_cst(i32 %x) {
+  %shf = lshr i32 %x, 1
+  %or = or i32 %shf, %x
+  %and = and i32 %or, 1
+  %ret = icmp ne i32 %and, 0
+  ret i1 %ret
+}
+
+; CHECK-LABEL: @shl_ap1_zero_ap2_non_zero_2
+; CHECK-NEXT: %cmp = icmp ugt i32 %a, 29
+; CHECK-NEXT: ret i1 %cmp
+define i1 @shl_ap1_zero_ap2_non_zero_2(i32 %a) {
+ %shl = shl i32 4, %a
+ %cmp = icmp eq i32 %shl, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @shl_ap1_zero_ap2_non_zero_4
+; CHECK-NEXT: %cmp = icmp ugt i32 %a, 30
+; CHECK-NEXT: ret i1 %cmp
+define i1 @shl_ap1_zero_ap2_non_zero_4(i32 %a) {
+ %shl = shl i32 -2, %a
+ %cmp = icmp eq i32 %shl, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @shl_ap1_non_zero_ap2_non_zero_both_positive
+; CHECK-NEXT: %cmp = icmp eq i32 %a, 0
+; CHECK-NEXT: ret i1 %cmp
+define i1 @shl_ap1_non_zero_ap2_non_zero_both_positive(i32 %a) {
+ %shl = shl i32 50, %a
+ %cmp = icmp eq i32 %shl, 50
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @shl_ap1_non_zero_ap2_non_zero_both_negative
+; CHECK-NEXT: %cmp = icmp eq i32 %a, 0
+; CHECK-NEXT: ret i1 %cmp
+define i1 @shl_ap1_non_zero_ap2_non_zero_both_negative(i32 %a) {
+ %shl = shl i32 -50, %a
+ %cmp = icmp eq i32 %shl, -50
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @shl_ap1_non_zero_ap2_non_zero_ap1_1
+; CHECK-NEXT: ret i1 false
+define i1 @shl_ap1_non_zero_ap2_non_zero_ap1_1(i32 %a) {
+ %shl = shl i32 50, %a
+ %cmp = icmp eq i32 %shl, 25
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @shl_ap1_non_zero_ap2_non_zero_ap1_2
+; CHECK-NEXT: %cmp = icmp eq i32 %a, 1
+; CHECK-NEXT: ret i1 %cmp
+define i1 @shl_ap1_non_zero_ap2_non_zero_ap1_2(i32 %a) {
+ %shl = shl i32 25, %a
+ %cmp = icmp eq i32 %shl, 50
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @shl_ap1_non_zero_ap2_non_zero_ap1_3
+; CHECK-NEXT: ret i1 false
+define i1 @shl_ap1_non_zero_ap2_non_zero_ap1_3(i32 %a) {
+ %shl = shl i32 26, %a
+ %cmp = icmp eq i32 %shl, 50
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_sgt_zero_add_nsw
+; CHECK-NEXT: icmp sgt i32 %a, -1
+define i1 @icmp_sgt_zero_add_nsw(i32 %a) {
+ %add = add nsw i32 %a, 1
+ %cmp = icmp sgt i32 %add, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_sge_zero_add_nsw
+; CHECK-NEXT: icmp sgt i32 %a, -2
+define i1 @icmp_sge_zero_add_nsw(i32 %a) {
+ %add = add nsw i32 %a, 1
+ %cmp = icmp sge i32 %add, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_slt_zero_add_nsw
+; CHECK-NEXT: icmp slt i32 %a, -1
+define i1 @icmp_slt_zero_add_nsw(i32 %a) {
+ %add = add nsw i32 %a, 1
+ %cmp = icmp slt i32 %add, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_sle_zero_add_nsw
+; CHECK-NEXT: icmp slt i32 %a, 0
+define i1 @icmp_sle_zero_add_nsw(i32 %a) {
+ %add = add nsw i32 %a, 1
+ %cmp = icmp sle i32 %add, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_cmpxchg_strong
+; CHECK-NEXT: %[[xchg:.*]] = cmpxchg i32* %sc, i32 %old_val, i32 %new_val seq_cst seq_cst
+; CHECK-NEXT: %[[icmp:.*]] = extractvalue { i32, i1 } %[[xchg]], 1
+; CHECK-NEXT: ret i1 %[[icmp]]
+define zeroext i1 @icmp_cmpxchg_strong(i32* %sc, i32 %old_val, i32 %new_val) {
+  %xchg = cmpxchg i32* %sc, i32 %old_val, i32 %new_val seq_cst seq_cst
+  %xtrc = extractvalue { i32, i1 } %xchg, 0
+  %icmp = icmp eq i32 %xtrc, %old_val
+  ret i1 %icmp
+}

diff --git a/test/Transforms/InstCombine/load-addrspace-cast.ll b/test/Transforms/InstCombine/load-addrspace-cast.ll
deleted file mode 100644
index fd6339c..0000000
--- a/test/Transforms/InstCombine/load-addrspace-cast.ll
+++ /dev/null

@@ -1,12 +0,0 @@
-; RUN: opt -instcombine -S < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-n8:16:32:64"
-
-define i32* @pointer_to_addrspace_pointer(i32 addrspace(1)** %x) nounwind {
-; CHECK-LABEL: @pointer_to_addrspace_pointer(
-; CHECK: load
-; CHECK: addrspacecast
-  %y = bitcast i32 addrspace(1)** %x to i32**
-  %z = load i32** %y
-  ret i32* %z
-}
-

diff --git a/test/Transforms/InstCombine/load.ll b/test/Transforms/InstCombine/load.ll
index d11e08e..b4b7558 100644
--- a/test/Transforms/InstCombine/load.ll
+++ b/test/Transforms/InstCombine/load.ll

@@ -1,6 +1,8 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
 ; This test makes sure that these instructions are properly eliminated.
-;
-; RUN: opt < %s -instcombine -S | not grep load
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 @X = constant i32 42		; <i32*> [#uses=2]
 @X2 = constant i32 47		; <i32*> [#uses=1]
@@ -10,47 +12,63 @@
 @GLOBAL = internal constant [4 x i32] zeroinitializer
 
 
+; CHECK-LABEL: @test1(
+; CHECK-NOT: load
 define i32 @test1() {
 	%B = load i32* @X		; <i32> [#uses=1]
 	ret i32 %B
 }
 
+; CHECK-LABEL: @test2(
+; CHECK-NOT: load
 define float @test2() {
 	%A = getelementptr [2 x { i32, float }]* @Y, i64 0, i64 1, i32 1		; <float*> [#uses=1]
 	%B = load float* %A		; <float> [#uses=1]
 	ret float %B
 }
 
+; CHECK-LABEL: @test3(
+; CHECK-NOT: load
 define i32 @test3() {
 	%A = getelementptr [2 x { i32, float }]* @Y, i64 0, i64 0, i32 0		; <i32*> [#uses=1]
 	%B = load i32* %A		; <i32> [#uses=1]
 	ret i32 %B
 }
 
+; CHECK-LABEL: @test4(
+; CHECK-NOT: load
 define i32 @test4() {
 	%A = getelementptr [2 x { i32, float }]* @Z, i64 0, i64 1, i32 0		; <i32*> [#uses=1]
 	%B = load i32* %A		; <i32> [#uses=1]
 	ret i32 %B
 }
 
+; CHECK-LABEL: @test5(
+; CHECK-NOT: load
 define i32 @test5(i1 %C) {
 	%Y = select i1 %C, i32* @X, i32* @X2		; <i32*> [#uses=1]
 	%Z = load i32* %Y		; <i32> [#uses=1]
 	ret i32 %Z
 }
 
+; CHECK-LABEL: @test7(
+; CHECK-NOT: load
 define i32 @test7(i32 %X) {
 	%V = getelementptr i32* null, i32 %X		; <i32*> [#uses=1]
 	%R = load i32* %V		; <i32> [#uses=1]
 	ret i32 %R
 }
 
+; CHECK-LABEL: @test8(
+; CHECK-NOT: load
 define i32 @test8(i32* %P) {
 	store i32 1, i32* %P
 	%X = load i32* %P		; <i32> [#uses=1]
 	ret i32 %X
 }
 
+; CHECK-LABEL: @test9(
+; CHECK-NOT: load
 define i32 @test9(i32* %P) {
 	%X = load i32* %P		; <i32> [#uses=1]
 	%Y = load i32* %P		; <i32> [#uses=1]
@@ -58,6 +76,8 @@
 	ret i32 %Z
 }
 
+; CHECK-LABEL: @test10(
+; CHECK-NOT: load
 define i32 @test10(i1 %C.upgrd.1, i32* %P, i32* %Q) {
 	br i1 %C.upgrd.1, label %T, label %F
 T:		; preds = %0
@@ -72,6 +92,8 @@
 	ret i32 %V
 }
 
+; CHECK-LABEL: @test11(
+; CHECK-NOT: load
 define double @test11(double* %p) {
   %t0 = getelementptr double* %p, i32 1
   store double 2.0, double* %t0
@@ -80,19 +102,51 @@
   ret double %x
 }
 
+; CHECK-LABEL: @test12(
+; CHECK-NOT: load
 define i32 @test12(i32* %P) {
-        %A = alloca i32
-        store i32 123, i32* %A
-        ; Cast the result of the load not the source
-        %Q = bitcast i32* %A to i32*
-        %V = load i32* %Q
-        ret i32 %V
+  %A = alloca i32
+  store i32 123, i32* %A
+  ; Cast the result of the load not the source
+  %Q = bitcast i32* %A to i32*
+  %V = load i32* %Q
+  ret i32 %V
 }
 
+; CHECK-LABEL: @test13(
+; CHECK-NOT: load
 define <16 x i8> @test13(<2 x i64> %x) {
-entry:
-	%tmp = load <16 x i8> * bitcast ([4 x i32]* @GLOBAL to <16 x i8>*)
-	ret <16 x i8> %tmp
+  %tmp = load <16 x i8>* bitcast ([4 x i32]* @GLOBAL to <16 x i8>*)
+  ret <16 x i8> %tmp
 }
 
+define i8 @test14(i8 %x, i32 %y) {
+; This test must not have the store of %x forwarded to the load -- there is an
+; intervening store of %y. However, the intervening store occurs with a different
+; type and size and to a different pointer value. This is ensuring that none of
+; those confuse the analysis into thinking that the second store does not alias
+; the first.
+; CHECK-LABEL: @test14(
+; CHECK:         %[[R:.*]] = load i8*
+; CHECK-NEXT:    ret i8 %[[R]]
+  %a = alloca i32
+  %a.i8 = bitcast i32* %a to i8*
+  store i8 %x, i8* %a.i8
+  store i32 %y, i32* %a
+  %r = load i8* %a.i8
+  ret i8 %r
+}
 
+@test15_global = external global i32
+
+define i8 @test15(i8 %x, i32 %y) {
+; Same test as @test14 essentially, but using a global instead of an alloca.
+; CHECK-LABEL: @test15(
+; CHECK:         %[[R:.*]] = load i8*
+; CHECK-NEXT:    ret i8 %[[R]]
+  %g.i8 = bitcast i32* @test15_global to i8*
+  store i8 %x, i8* %g.i8
+  store i32 %y, i32* @test15_global
+  %r = load i8* %g.i8
+  ret i8 %r
+}

diff --git a/test/Transforms/InstCombine/loadstore-alignment.ll b/test/Transforms/InstCombine/loadstore-alignment.ll
index 2263cb2..e90bdb7 100644
--- a/test/Transforms/InstCombine/loadstore-alignment.ll
+++ b/test/Transforms/InstCombine/loadstore-alignment.ll

@@ -1,67 +1,117 @@
-; RUN: opt < %s -instcombine -S | grep ", align 16" | count 14
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+; RUN: opt -instcombine -S < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-p1:64:64:64-p2:32:32:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
 @x = external global <2 x i64>, align 16
 @xx = external global [13 x <2 x i64>], align 16
 
+@x.as2 = external addrspace(2) global <2 x i64>, align 16
+
+; CHECK-LABEL: @static_hem(
+; CHECK: , align 16
 define <2 x i64> @static_hem() {
-	%t = getelementptr <2 x i64>* @x, i32 7
-	%tmp1 = load <2 x i64>* %t, align 1
-	ret <2 x i64> %tmp1
+  %t = getelementptr <2 x i64>* @x, i32 7
+  %tmp1 = load <2 x i64>* %t, align 1
+  ret <2 x i64> %tmp1
 }
 
+; CHECK-LABEL: @static_hem_addrspacecast(
+; CHECK: , align 16
+define <2 x i64> @static_hem_addrspacecast() {
+  %t = getelementptr <2 x i64>* @x, i32 7
+  %t.asc = addrspacecast <2 x i64>* %t to <2 x i64> addrspace(1)*
+  %tmp1 = load <2 x i64> addrspace(1)* %t.asc, align 1
+  ret <2 x i64> %tmp1
+}
+
+; CHECK-LABEL: @static_hem_addrspacecast_smaller_ptr(
+; CHECK: , align 16
+define <2 x i64> @static_hem_addrspacecast_smaller_ptr() {
+  %t = getelementptr <2 x i64>* @x, i32 7
+  %t.asc = addrspacecast <2 x i64>* %t to <2 x i64> addrspace(2)*
+  %tmp1 = load <2 x i64> addrspace(2)* %t.asc, align 1
+  ret <2 x i64> %tmp1
+}
+
+; CHECK-LABEL: @static_hem_addrspacecast_larger_ptr(
+; CHECK: , align 16
+define <2 x i64> @static_hem_addrspacecast_larger_ptr() {
+  %t = getelementptr <2 x i64> addrspace(2)* @x.as2, i32 7
+  %t.asc = addrspacecast <2 x i64> addrspace(2)* %t to <2 x i64> addrspace(1)*
+  %tmp1 = load <2 x i64> addrspace(1)* %t.asc, align 1
+  ret <2 x i64> %tmp1
+}
+
+; CHECK-LABEL: @hem(
+; CHECK: , align 16
 define <2 x i64> @hem(i32 %i) {
-	%t = getelementptr <2 x i64>* @x, i32 %i
-	%tmp1 = load <2 x i64>* %t, align 1
-	ret <2 x i64> %tmp1
+  %t = getelementptr <2 x i64>* @x, i32 %i
+  %tmp1 = load <2 x i64>* %t, align 1
+  ret <2 x i64> %tmp1
 }
 
+; CHECK-LABEL: @hem_2d(
+; CHECK: , align 16
 define <2 x i64> @hem_2d(i32 %i, i32 %j) {
-	%t = getelementptr [13 x <2 x i64>]* @xx, i32 %i, i32 %j
-	%tmp1 = load <2 x i64>* %t, align 1
-	ret <2 x i64> %tmp1
+  %t = getelementptr [13 x <2 x i64>]* @xx, i32 %i, i32 %j
+  %tmp1 = load <2 x i64>* %t, align 1
+  ret <2 x i64> %tmp1
 }
 
+; CHECK-LABEL: @foo(
+; CHECK: , align 16
 define <2 x i64> @foo() {
-	%tmp1 = load <2 x i64>* @x, align 1
-	ret <2 x i64> %tmp1
+  %tmp1 = load <2 x i64>* @x, align 1
+  ret <2 x i64> %tmp1
 }
 
+; CHECK-LABEL: @bar(
+; CHECK: , align 16
+; CHECK: , align 16
 define <2 x i64> @bar() {
-	%t = alloca <2 x i64>
-        call void @kip(<2 x i64>* %t)
-	%tmp1 = load <2 x i64>* %t, align 1
-	ret <2 x i64> %tmp1
+  %t = alloca <2 x i64>
+  call void @kip(<2 x i64>* %t)
+  %tmp1 = load <2 x i64>* %t, align 1
+  ret <2 x i64> %tmp1
 }
 
+; CHECK-LABEL: @static_hem_store(
+; CHECK: , align 16
 define void @static_hem_store(<2 x i64> %y) {
-	%t = getelementptr <2 x i64>* @x, i32 7
-	store <2 x i64> %y, <2 x i64>* %t, align 1
-        ret void
+  %t = getelementptr <2 x i64>* @x, i32 7
+  store <2 x i64> %y, <2 x i64>* %t, align 1
+  ret void
 }
 
+; CHECK-LABEL: @hem_store(
+; CHECK: , align 16
 define void @hem_store(i32 %i, <2 x i64> %y) {
-	%t = getelementptr <2 x i64>* @x, i32 %i
-	store <2 x i64> %y, <2 x i64>* %t, align 1
-        ret void
+  %t = getelementptr <2 x i64>* @x, i32 %i
+  store <2 x i64> %y, <2 x i64>* %t, align 1
+  ret void
 }
 
+; CHECK-LABEL: @hem_2d_store(
+; CHECK: , align 16
 define void @hem_2d_store(i32 %i, i32 %j, <2 x i64> %y) {
-	%t = getelementptr [13 x <2 x i64>]* @xx, i32 %i, i32 %j
-	store <2 x i64> %y, <2 x i64>* %t, align 1
-        ret void
+  %t = getelementptr [13 x <2 x i64>]* @xx, i32 %i, i32 %j
+  store <2 x i64> %y, <2 x i64>* %t, align 1
+  ret void
 }
 
+; CHECK-LABEL: @foo_store(
+; CHECK: , align 16
 define void @foo_store(<2 x i64> %y) {
-	store <2 x i64> %y, <2 x i64>* @x, align 1
-        ret void
+  store <2 x i64> %y, <2 x i64>* @x, align 1
+  ret void
 }
 
+; CHECK-LABEL: @bar_store(
+; CHECK: , align 16
 define void @bar_store(<2 x i64> %y) {
-	%t = alloca <2 x i64>
-        call void @kip(<2 x i64>* %t)
-	store <2 x i64> %y, <2 x i64>* %t, align 1
-        ret void
+  %t = alloca <2 x i64>
+  call void @kip(<2 x i64>* %t)
+  store <2 x i64> %y, <2 x i64>* %t, align 1
+  ret void
 }
 
 declare void @kip(<2 x i64>* %t)

diff --git a/test/Transforms/InstCombine/loadstore-metadata.ll b/test/Transforms/InstCombine/loadstore-metadata.ll
new file mode 100644
index 0000000..863edae
--- /dev/null
+++ b/test/Transforms/InstCombine/loadstore-metadata.ll

@@ -0,0 +1,86 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+define i32 @test_load_cast_combine_tbaa(float* %ptr) {
+; Ensure (cast (load (...))) -> (load (cast (...))) preserves TBAA.
+; CHECK-LABEL: @test_load_cast_combine_tbaa(
+; CHECK: load i32* %{{.*}}, !tbaa !0
+entry:
+  %l = load float* %ptr, !tbaa !0
+  %c = bitcast float %l to i32
+  ret i32 %c
+}
+
+define i32 @test_load_cast_combine_noalias(float* %ptr) {
+; Ensure (cast (load (...))) -> (load (cast (...))) preserves no-alias metadata.
+; CHECK-LABEL: @test_load_cast_combine_noalias(
+; CHECK: load i32* %{{.*}}, !alias.scope !2, !noalias !1
+entry:
+  %l = load float* %ptr, !alias.scope !2, !noalias !1
+  %c = bitcast float %l to i32
+  ret i32 %c
+}
+
+define float @test_load_cast_combine_range(i32* %ptr) {
+; Ensure (cast (load (...))) -> (load (cast (...))) drops range metadata. It
+; would be nice to preserve or update it somehow but this is hard when moving
+; between types.
+; CHECK-LABEL: @test_load_cast_combine_range(
+; CHECK: load float* %{{.*}}
+; CHECK-NOT: !range
+; CHECK: ret float
+entry:
+  %l = load i32* %ptr, !range !5
+  %c = bitcast i32 %l to float
+  ret float %c
+}
+
+define i32 @test_load_cast_combine_invariant(float* %ptr) {
+; Ensure (cast (load (...))) -> (load (cast (...))) preserves invariant metadata.
+; CHECK-LABEL: @test_load_cast_combine_invariant(
+; CHECK: load i32* %{{.*}}, !invariant.load !3
+entry:
+  %l = load float* %ptr, !invariant.load !3
+  %c = bitcast float %l to i32
+  ret i32 %c
+}
+
+define i32 @test_load_cast_combine_nontemporal(float* %ptr) {
+; Ensure (cast (load (...))) -> (load (cast (...))) preserves nontemporal
+; metadata.
+; CHECK-LABEL: @test_load_cast_combine_nontemporal(
+; CHECK: load i32* %{{.*}}, !nontemporal !4
+entry:
+  %l = load float* %ptr, !nontemporal !4
+  %c = bitcast float %l to i32
+  ret i32 %c
+}
+
+define void @test_load_cast_combine_loop(float* %src, i32* %dst, i32 %n) {
+; Ensure (cast (load (...))) -> (load (cast (...))) preserves loop access
+; metadata.
+; CHECK-LABEL: @test_load_cast_combine_loop(
+; CHECK: load i32* %{{.*}}, !llvm.mem.parallel_loop_access !1
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+  %src.gep = getelementptr inbounds float* %src, i32 %i
+  %dst.gep = getelementptr inbounds i32* %dst, i32 %i
+  %l = load float* %src.gep, !llvm.mem.parallel_loop_access !1
+  %c = bitcast float %l to i32
+  store i32 %c, i32* %dst.gep
+  %i.next = add i32 %i, 1
+  %cmp = icmp slt i32 %i.next, %n
+  br i1 %cmp, label %loop, label %exit, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+!0 = metadata !{ metadata !1, metadata !1, i64 0 }
+!1 = metadata !{ metadata !1 }
+!2 = metadata !{ metadata !2, metadata !1 }
+!3 = metadata !{ }
+!4 = metadata !{ i32 1 }
+!5 = metadata !{ i32 0, i32 42 }

diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll
index 2085206..ed25e4e 100644
--- a/test/Transforms/InstCombine/malloc-free-delete.ll
+++ b/test/Transforms/InstCombine/malloc-free-delete.ll

@@ -144,3 +144,26 @@
   call void @_ZdlPvRKSt9nothrow_t(i8* %call.i, i8* %nt) builtin nounwind
   resume { i8*, i32 } %0
 }
+
+declare i8* @_Znwm(i64) nobuiltin
+declare void @_ZdlPvm(i8*, i64) nobuiltin
+declare i8* @_Znwj(i32) nobuiltin
+declare void @_ZdlPvj(i8*, i32) nobuiltin
+declare i8* @_Znam(i64) nobuiltin
+declare void @_ZdaPvm(i8*, i64) nobuiltin
+declare i8* @_Znaj(i32) nobuiltin
+declare void @_ZdaPvj(i8*, i32) nobuiltin
+
+; CHECK-LABEL: @test8(
+define void @test8() {
+  ; CHECK-NOT: call
+  %nwm = call i8* @_Znwm(i64 32) builtin
+  call void @_ZdlPvm(i8* %nwm, i64 32) builtin
+  %nwj = call i8* @_Znwj(i32 32) builtin
+  call void @_ZdlPvj(i8* %nwj, i32 32) builtin
+  %nam = call i8* @_Znam(i64 32) builtin
+  call void @_ZdaPvm(i8* %nam, i64 32) builtin
+  %naj = call i8* @_Znaj(i32 32) builtin
+  call void @_ZdaPvj(i8* %naj, i32 32) builtin
+  ret void
+}

diff --git a/test/Transforms/InstCombine/maxnum.ll b/test/Transforms/InstCombine/maxnum.ll
new file mode 100644
index 0000000..585d9f4
--- /dev/null
+++ b/test/Transforms/InstCombine/maxnum.ll

@@ -0,0 +1,222 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+declare float @llvm.maxnum.f32(float, float) #0
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
+
+declare double @llvm.maxnum.f64(double, double) #0
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0
+
+; CHECK-LABEL: @constant_fold_maxnum_f32
+; CHECK-NEXT: ret float 2.000000e+00
+define float @constant_fold_maxnum_f32() #0 {
+  %x = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f32_inv
+; CHECK-NEXT: ret float 2.000000e+00
+define float @constant_fold_maxnum_f32_inv() #0 {
+  %x = call float @llvm.maxnum.f32(float 2.0, float 1.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f32_nan0
+; CHECK-NEXT: ret float 2.000000e+00
+define float @constant_fold_maxnum_f32_nan0() #0 {
+  %x = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 2.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f32_nan1
+; CHECK-NEXT: ret float 2.000000e+00
+define float @constant_fold_maxnum_f32_nan1() #0 {
+  %x = call float @llvm.maxnum.f32(float 2.0, float 0x7FF8000000000000) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f32_nan_nan
+; CHECK-NEXT: ret float 0x7FF8000000000000
+define float @constant_fold_maxnum_f32_nan_nan() #0 {
+  %x = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f32_p0_p0
+; CHECK-NEXT: ret float 0.000000e+00
+define float @constant_fold_maxnum_f32_p0_p0() #0 {
+  %x = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f32_p0_n0
+; CHECK-NEXT: ret float 0.000000e+00
+define float @constant_fold_maxnum_f32_p0_n0() #0 {
+  %x = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f32_n0_p0
+; CHECK-NEXT: ret float -0.000000e+00
+define float @constant_fold_maxnum_f32_n0_p0() #0 {
+  %x = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f32_n0_n0
+; CHECK-NEXT: ret float -0.000000e+00
+define float @constant_fold_maxnum_f32_n0_n0() #0 {
+  %x = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_v4f32
+; CHECK-NEXT: ret <4 x float> <float 2.000000e+00, float 8.000000e+00, float 1.000000e+01, float 9.000000e+00>
+define <4 x float> @constant_fold_maxnum_v4f32() #0 {
+  %x = call <4 x float> @llvm.maxnum.v4f32(<4 x float> <float 1.0, float 8.0, float 3.0, float 9.0>, <4 x float> <float 2.0, float 2.0, float 10.0, float 5.0>)
+  ret <4 x float> %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f64
+; CHECK-NEXT: ret double 2.000000e+00
+define double @constant_fold_maxnum_f64() #0 {
+  %x = call double @llvm.maxnum.f64(double 1.0, double 2.0) #0
+  ret double %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f64_nan0
+; CHECK-NEXT: ret double 2.000000e+00
+define double @constant_fold_maxnum_f64_nan0() #0 {
+  %x = call double @llvm.maxnum.f64(double 0x7FF8000000000000, double 2.0) #0
+  ret double %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f64_nan1
+; CHECK-NEXT: ret double 2.000000e+00
+define double @constant_fold_maxnum_f64_nan1() #0 {
+  %x = call double @llvm.maxnum.f64(double 2.0, double 0x7FF8000000000000) #0
+  ret double %x
+}
+
+; CHECK-LABEL: @constant_fold_maxnum_f64_nan_nan
+; CHECK-NEXT: ret double 0x7FF8000000000000
+define double @constant_fold_maxnum_f64_nan_nan() #0 {
+  %x = call double @llvm.maxnum.f64(double 0x7FF8000000000000, double 0x7FF8000000000000) #0
+  ret double %x
+}
+
+; CHECK-LABEL: @canonicalize_constant_maxnum_f32
+; CHECK: call float @llvm.maxnum.f32(float %x, float 1.000000e+00)
+define float @canonicalize_constant_maxnum_f32(float %x) #0 {
+  %y = call float @llvm.maxnum.f32(float 1.0, float %x) #0
+  ret float %y
+}
+
+; CHECK-LABEL: @noop_maxnum_f32
+; CHECK-NEXT: ret float %x
+define float @noop_maxnum_f32(float %x) #0 {
+  %y = call float @llvm.maxnum.f32(float %x, float %x) #0
+  ret float %y
+}
+
+; CHECK-LABEL: @maxnum_f32_nan_val
+; CHECK-NEXT: ret float %x
+define float @maxnum_f32_nan_val(float %x) #0 {
+  %y = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float %x) #0
+  ret float %y
+}
+
+; CHECK-LABEL: @maxnum_f32_val_nan
+; CHECK-NEXT: ret float %x
+define float @maxnum_f32_val_nan(float %x) #0 {
+  %y = call float @llvm.maxnum.f32(float %x, float 0x7FF8000000000000) #0
+  ret float %y
+}
+
+; CHECK-LABEL: @fold_maxnum_f32_undef_undef
+; CHECK-NEXT: ret float undef
+define float @fold_maxnum_f32_undef_undef(float %x) nounwind {
+  %val = call float @llvm.maxnum.f32(float undef, float undef) #0
+  ret float %val
+}
+
+; CHECK-LABEL: @fold_maxnum_f32_val_undef
+; CHECK-NEXT: ret float %x
+define float @fold_maxnum_f32_val_undef(float %x) nounwind {
+  %val = call float @llvm.maxnum.f32(float %x, float undef) #0
+  ret float %val
+}
+
+; CHECK-LABEL: @fold_maxnum_f32_undef_val
+; CHECK-NEXT: ret float %x
+define float @fold_maxnum_f32_undef_val(float %x) nounwind {
+  %val = call float @llvm.maxnum.f32(float undef, float %x) #0
+  ret float %val
+}
+
+; CHECK-LABEL: @maxnum_x_maxnum_x_y
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %x, float %y)
+; CHECK-NEXT: ret float
+define float @maxnum_x_maxnum_x_y(float %x, float %y) #0 {
+  %a = call float @llvm.maxnum.f32(float %x, float %y) #0
+  %b = call float @llvm.maxnum.f32(float %x, float %a) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @maxnum_y_maxnum_x_y
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %x, float %y)
+; CHECK-NEXT: ret float
+define float @maxnum_y_maxnum_x_y(float %x, float %y) #0 {
+  %a = call float @llvm.maxnum.f32(float %x, float %y) #0
+  %b = call float @llvm.maxnum.f32(float %y, float %a) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @maxnum_z_maxnum_x_y
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %x, float %y)
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %z, float %a)
+; CHECK-NEXT: ret float
+define float @maxnum_z_maxnum_x_y(float %x, float %y, float %z) #0 {
+  %a = call float @llvm.maxnum.f32(float %x, float %y) #0
+  %b = call float @llvm.maxnum.f32(float %z, float %a) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @maxnum_maxnum_x_y_z
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %x, float %y)
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %a, float %z)
+; CHECK-NEXT: ret float
+define float @maxnum_maxnum_x_y_z(float %x, float %y, float %z) #0 {
+  %a = call float @llvm.maxnum.f32(float %x, float %y) #0
+  %b = call float @llvm.maxnum.f32(float %a, float %z) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @maxnum4
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %x, float %y)
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %z, float %w)
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %a, float %b)
+; CHECK-NEXT: ret float
+define float @maxnum4(float %x, float %y, float %z, float %w) #0 {
+  %a = call float @llvm.maxnum.f32(float %x, float %y) #0
+  %b = call float @llvm.maxnum.f32(float %z, float %w) #0
+  %c = call float @llvm.maxnum.f32(float %a, float %b) #0
+  ret float %c
+}
+
+; CHECK-LABEL: @fold_maxnum_f32_inf_val
+; CHECK-NEXT: ret float 0x7FF0000000000000
+define float @fold_maxnum_f32_inf_val(float %x) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0x7FF0000000000000, float %x) #0
+  ret float %val
+}
+
+; CHECK-LABEL: @fold_maxnum_f32_neginf_val
+; CHECK-NEXT: call float @llvm.maxnum.f32(float %x, float 0xFFF0000000000000)
+; CHECK-NEXT: ret float
+define float @fold_maxnum_f32_neginf_val(float %x) nounwind {
+  %val = call float @llvm.maxnum.f32(float 0xFFF0000000000000, float %x) #0
+  ret float %val
+}
+
+attributes #0 = { nounwind readnone }

diff --git a/test/Transforms/InstCombine/memcmp-1.ll b/test/Transforms/InstCombine/memcmp-1.ll
index 65349c6..d960693 100644
--- a/test/Transforms/InstCombine/memcmp-1.ll
+++ b/test/Transforms/InstCombine/memcmp-1.ll

@@ -37,7 +37,7 @@
 ; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
 ; CHECK: [[LOAD2:%[a-z]+]] = load i8* %mem2, align 1
 ; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
-; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+; CHECK: [[RET:%[a-z]+]] = sub nsw i32 [[ZEXT1]], [[ZEXT2]]
   ret i32 %ret
 ; CHECK: ret i32 [[RET]]
 }

diff --git a/test/Transforms/InstCombine/minnum.ll b/test/Transforms/InstCombine/minnum.ll
new file mode 100644
index 0000000..57d6e16
--- /dev/null
+++ b/test/Transforms/InstCombine/minnum.ll

@@ -0,0 +1,244 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+declare float @llvm.minnum.f32(float, float) #0
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0
+
+declare double @llvm.minnum.f64(double, double) #0
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0
+
+declare float @llvm.fmax.f32(float, float) #0
+
+; CHECK-LABEL: @constant_fold_minnum_f32
+; CHECK-NEXT: ret float 1.000000e+00
+define float @constant_fold_minnum_f32() #0 {
+  %x = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f32_inv
+; CHECK-NEXT: ret float 1.000000e+00
+define float @constant_fold_minnum_f32_inv() #0 {
+  %x = call float @llvm.minnum.f32(float 2.0, float 1.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f32_nan0
+; CHECK-NEXT: ret float 2.000000e+00
+define float @constant_fold_minnum_f32_nan0() #0 {
+  %x = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 2.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f32_nan1
+; CHECK-NEXT: ret float 2.000000e+00
+define float @constant_fold_minnum_f32_nan1() #0 {
+  %x = call float @llvm.minnum.f32(float 2.0, float 0x7FF8000000000000) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f32_nan_nan
+; CHECK-NEXT: ret float 0x7FF8000000000000
+define float @constant_fold_minnum_f32_nan_nan() #0 {
+  %x = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f32_p0_p0
+; CHECK-NEXT: ret float 0.000000e+00
+define float @constant_fold_minnum_f32_p0_p0() #0 {
+  %x = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f32_p0_n0
+; CHECK-NEXT: ret float 0.000000e+00
+define float @constant_fold_minnum_f32_p0_n0() #0 {
+  %x = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f32_n0_p0
+; CHECK-NEXT: ret float -0.000000e+00
+define float @constant_fold_minnum_f32_n0_p0() #0 {
+  %x = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f32_n0_n0
+; CHECK-NEXT: ret float -0.000000e+00
+define float @constant_fold_minnum_f32_n0_n0() #0 {
+  %x = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
+  ret float %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_v4f32
+; CHECK-NEXT: ret <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 5.000000e+00>
+define <4 x float> @constant_fold_minnum_v4f32() #0 {
+  %x = call <4 x float> @llvm.minnum.v4f32(<4 x float> <float 1.0, float 8.0, float 3.0, float 9.0>, <4 x float> <float 2.0, float 2.0, float 10.0, float 5.0>)
+  ret <4 x float> %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f64
+; CHECK-NEXT: ret double 1.000000e+00
+define double @constant_fold_minnum_f64() #0 {
+  %x = call double @llvm.minnum.f64(double 1.0, double 2.0) #0
+  ret double %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f64_nan0
+; CHECK-NEXT: ret double 2.000000e+00
+define double @constant_fold_minnum_f64_nan0() #0 {
+  %x = call double @llvm.minnum.f64(double 0x7FF8000000000000, double 2.0) #0
+  ret double %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f64_nan1
+; CHECK-NEXT: ret double 2.000000e+00
+define double @constant_fold_minnum_f64_nan1() #0 {
+  %x = call double @llvm.minnum.f64(double 2.0, double 0x7FF8000000000000) #0
+  ret double %x
+}
+
+; CHECK-LABEL: @constant_fold_minnum_f64_nan_nan
+; CHECK-NEXT: ret double 0x7FF8000000000000
+define double @constant_fold_minnum_f64_nan_nan() #0 {
+  %x = call double @llvm.minnum.f64(double 0x7FF8000000000000, double 0x7FF8000000000000) #0
+  ret double %x
+}
+
+; CHECK-LABEL: @canonicalize_constant_minnum_f32
+; CHECK: call float @llvm.minnum.f32(float %x, float 1.000000e+00)
+define float @canonicalize_constant_minnum_f32(float %x) #0 {
+  %y = call float @llvm.minnum.f32(float 1.0, float %x) #0
+  ret float %y
+}
+
+; CHECK-LABEL: @noop_minnum_f32
+; CHECK-NEXT: ret float %x
+define float @noop_minnum_f32(float %x) #0 {
+  %y = call float @llvm.minnum.f32(float %x, float %x) #0
+  ret float %y
+}
+
+; CHECK-LABEL: @minnum_f32_nan_val
+; CHECK-NEXT: ret float %x
+define float @minnum_f32_nan_val(float %x) #0 {
+  %y = call float @llvm.minnum.f32(float 0x7FF8000000000000, float %x) #0
+  ret float %y
+}
+
+; CHECK-LABEL: @minnum_f32_val_nan
+; CHECK-NEXT: ret float %x
+define float @minnum_f32_val_nan(float %x) #0 {
+  %y = call float @llvm.minnum.f32(float %x, float 0x7FF8000000000000) #0
+  ret float %y
+}
+
+; CHECK-LABEL: @fold_minnum_f32_undef_undef
+; CHECK-NEXT: ret float undef
+define float @fold_minnum_f32_undef_undef(float %x) nounwind {
+  %val = call float @llvm.minnum.f32(float undef, float undef) #0
+  ret float %val
+}
+
+; CHECK-LABEL: @fold_minnum_f32_val_undef
+; CHECK-NEXT: ret float %x
+define float @fold_minnum_f32_val_undef(float %x) nounwind {
+  %val = call float @llvm.minnum.f32(float %x, float undef) #0
+  ret float %val
+}
+
+; CHECK-LABEL: @fold_minnum_f32_undef_val
+; CHECK-NEXT: ret float %x
+define float @fold_minnum_f32_undef_val(float %x) nounwind {
+  %val = call float @llvm.minnum.f32(float undef, float %x) #0
+  ret float %val
+}
+
+; CHECK-LABEL: @minnum_x_minnum_x_y
+; CHECK-NEXT: call float @llvm.minnum.f32(float %x, float %y)
+; CHECK-NEXT: ret float
+define float @minnum_x_minnum_x_y(float %x, float %y) #0 {
+  %a = call float @llvm.minnum.f32(float %x, float %y) #0
+  %b = call float @llvm.minnum.f32(float %x, float %a) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @minnum_y_minnum_x_y
+; CHECK-NEXT: call float @llvm.minnum.f32(float %x, float %y)
+; CHECK-NEXT: ret float
+define float @minnum_y_minnum_x_y(float %x, float %y) #0 {
+  %a = call float @llvm.minnum.f32(float %x, float %y) #0
+  %b = call float @llvm.minnum.f32(float %y, float %a) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @minnum_z_minnum_x_y
+; CHECK-NEXT: call float @llvm.minnum.f32(float %x, float %y)
+; CHECK-NEXT: call float @llvm.minnum.f32(float %z, float %a)
+; CHECK-NEXT: ret float
+define float @minnum_z_minnum_x_y(float %x, float %y, float %z) #0 {
+  %a = call float @llvm.minnum.f32(float %x, float %y) #0
+  %b = call float @llvm.minnum.f32(float %z, float %a) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @minnum_minnum_x_y_z
+; CHECK-NEXT: call float @llvm.minnum.f32(float %x, float %y)
+; CHECK-NEXT: call float @llvm.minnum.f32(float %a, float %z)
+; CHECK-NEXT: ret float
+define float @minnum_minnum_x_y_z(float %x, float %y, float %z) #0 {
+  %a = call float @llvm.minnum.f32(float %x, float %y) #0
+  %b = call float @llvm.minnum.f32(float %a, float %z) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @minnum4
+; CHECK-NEXT: call float @llvm.minnum.f32(float %x, float %y)
+; CHECK-NEXT: call float @llvm.minnum.f32(float %z, float %w)
+; CHECK-NEXT: call float @llvm.minnum.f32(float %a, float %b)
+; CHECK-NEXT: ret float
+define float @minnum4(float %x, float %y, float %z, float %w) #0 {
+  %a = call float @llvm.minnum.f32(float %x, float %y) #0
+  %b = call float @llvm.minnum.f32(float %z, float %w) #0
+  %c = call float @llvm.minnum.f32(float %a, float %b) #0
+  ret float %c
+}
+
+; CHECK-LABEL: @minnum_x_fmax_x_y
+; CHECK-NEXT: call float @llvm.fmax.f32
+; CHECK-NEXT: call float @llvm.minnum.f32
+; CHECK-NEXT: ret float
+define float @minnum_x_fmax_x_y(float %x, float %y) #0 {
+  %a = call float @llvm.fmax.f32(float %x, float %y) #0
+  %b = call float @llvm.minnum.f32(float %x, float %a) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @fmax_x_minnum_x_y
+; CHECK-NEXT: call float @llvm.minnum.f32
+; CHECK-NEXT: call float @llvm.fmax.f32
+; CHECK-NEXT: ret float
+define float @fmax_x_minnum_x_y(float %x, float %y) #0 {
+  %a = call float @llvm.minnum.f32(float %x, float %y) #0
+  %b = call float @llvm.fmax.f32(float %x, float %a) #0
+  ret float %b
+}
+
+; CHECK-LABEL: @fold_minnum_f32_inf_val
+; CHECK-NEXT: call float @llvm.minnum.f32(float %x, float 0x7FF0000000000000)
+; CHECK-NEXT: ret float
+define float @fold_minnum_f32_inf_val(float %x) nounwind {
+  %val = call float @llvm.minnum.f32(float 0x7FF0000000000000, float %x) #0
+  ret float %val
+}
+
+; CHECK-LABEL: @fold_minnum_f32_minf_val
+; CHECK-NEXT: ret float 0xFFF0000000000000
+define float @fold_minnum_f32_minf_val(float %x) nounwind {
+  %val = call float @llvm.minnum.f32(float 0xFFF0000000000000, float %x) #0
+  ret float %val
+}
+
+attributes #0 = { nounwind readnone }

diff --git a/test/Transforms/InstCombine/narrow-switch.ll b/test/Transforms/InstCombine/narrow-switch.ll
new file mode 100644
index 0000000..7646189
--- /dev/null
+++ b/test/Transforms/InstCombine/narrow-switch.ll

@@ -0,0 +1,93 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+
+; CHECK-LABEL: define i32 @positive1
+; CHECK: switch i32
+; CHECK: i32 10, label
+; CHECK: i32 100, label
+; CHECK: i32 1001, label
+
+define i32 @positive1(i64 %a) {
+entry:
+  %and = and i64 %a, 4294967295
+  switch i64 %and, label %sw.default [
+    i64 10, label %return
+    i64 100, label %sw.bb1
+    i64 1001, label %sw.bb2
+  ]
+
+sw.bb1:
+  br label %return
+
+sw.bb2:
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 24, %sw.default ], [ 123, %sw.bb2 ], [ 213, %sw.bb1 ], [ 231, %entry ]
+  ret i32 %retval.0
+}
+
+; CHECK-LABEL: define i32 @negative1
+; CHECK: switch i32
+; CHECK: i32 -10, label
+; CHECK: i32 -100, label
+; CHECK: i32 -1001, label
+
+define i32 @negative1(i64 %a) {
+entry:
+  %or = or i64 %a, -4294967296
+  switch i64 %or, label %sw.default [
+    i64 -10, label %return
+    i64 -100, label %sw.bb1
+    i64 -1001, label %sw.bb2
+  ]
+
+sw.bb1:
+  br label %return
+
+sw.bb2:
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 24, %sw.default ], [ 123, %sw.bb2 ], [ 213, %sw.bb1 ], [ 231, %entry ]
+  ret i32 %retval.0
+}
+
+; Make sure truncating a constant int larger than 64-bit doesn't trigger an
+; assertion.
+
+; CHECK-LABEL: define i32 @trunc72to68
+; CHECK: switch i68
+; CHECK: i68 10, label
+; CHECK: i68 100, label
+; CHECK: i68 1001, label
+
+define i32 @trunc72to68(i72 %a) {
+entry:
+  %and = and i72 %a, 295147905179352825855
+  switch i72 %and, label %sw.default [
+    i72 10, label %return
+    i72 100, label %sw.bb1
+    i72 1001, label %sw.bb2
+  ]
+
+sw.bb1:
+  br label %return
+
+sw.bb2:
+  br label %return
+
+sw.default:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 24, %sw.default ], [ 123, %sw.bb2 ], [ 213, %sw.bb1 ], [ 231, %entry ]
+  ret i32 %retval.0
+}

diff --git a/test/Transforms/InstCombine/no_cgscc_assert.ll b/test/Transforms/InstCombine/no_cgscc_assert.ll
new file mode 100644
index 0000000..cec5297
--- /dev/null
+++ b/test/Transforms/InstCombine/no_cgscc_assert.ll

@@ -0,0 +1,19 @@
+; RUN: opt < %s -inline -instcombine -S | FileCheck %s
+
+; PR21403: http://llvm.org/bugs/show_bug.cgi?id=21403
+; When the call to sqrtf is replaced by an intrinsic call to fabs,
+; it should not cause a problem in CGSCC. 
+
+define float @bar(float %f) #0 {
+  %mul = fmul fast float %f, %f
+  %call1 = call float @sqrtf(float %mul) #0
+  ret float %call1
+
+; CHECK-LABEL: @bar(
+; CHECK-NEXT: call float @llvm.fabs.f32
+; CHECK-NEXT: ret float
+}
+
+declare float @sqrtf(float) #0
+
+attributes #0 = { readnone "unsafe-fp-math"="true" }

diff --git a/test/Transforms/InstCombine/objsize-address-space.ll b/test/Transforms/InstCombine/objsize-address-space.ll
index 9cb6884..a971c91 100644
--- a/test/Transforms/InstCombine/objsize-address-space.ll
+++ b/test/Transforms/InstCombine/objsize-address-space.ll

@@ -32,7 +32,7 @@
   ret i16 %1
 }
 
-@a_alias = alias weak [60 x i8] addrspace(3)* @a_as3
+@a_alias = weak alias [60 x i8] addrspace(3)* @a_as3
 define i32 @foo_alias() nounwind {
   %1 = call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* getelementptr inbounds ([60 x i8] addrspace(3)* @a_alias, i32 0, i32 0), i1 false)
   ret i32 %1

diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll
index 6459032..1285b1c 100644
--- a/test/Transforms/InstCombine/objsize.ll
+++ b/test/Transforms/InstCombine/objsize.ll

@@ -256,7 +256,7 @@
   ret i32 7
 }
 
-@globalalias = alias internal [60 x i8]* @a
+@globalalias = internal alias [60 x i8]* @a
 
 ; CHECK-LABEL: @test18(
 ; CHECK-NEXT: ret i32 60
@@ -266,7 +266,7 @@
   ret i32 %1
 }
 
-@globalalias2 = alias weak [60 x i8]* @a
+@globalalias2 = weak alias [60 x i8]* @a
 
 ; CHECK-LABEL: @test19(
 ; CHECK: llvm.objectsize

diff --git a/test/Transforms/InstCombine/or-xor.ll b/test/Transforms/InstCombine/or-xor.ll
index cec36f1..670e3e0 100644
--- a/test/Transforms/InstCombine/or-xor.ll
+++ b/test/Transforms/InstCombine/or-xor.ll

@@ -92,3 +92,92 @@
 ; CHECK-NEXT: %z = or i32 %y.not, %x
 ; CHECK-NEXT: ret i32 %z
 }
+
+define i32 @test10(i32 %A, i32 %B) {
+  %xor1 = xor i32 %B, %A
+  %not = xor i32 %A, -1
+  %xor2 = xor i32 %not, %B
+  %or = or i32 %xor1, %xor2
+  ret i32 %or
+; CHECK-LABEL: @test10(
+; CHECK-NEXT: ret i32 -1
+}
+
+define i32 @test11(i32 %A, i32 %B) {
+  %xor1 = xor i32 %B, %A
+  %not = xor i32 %A, -1
+  %xor2 = xor i32 %not, %B
+  %or = or i32 %xor1, %xor2
+  ret i32 %or
+; CHECK-LABEL: @test11(
+; CHECK-NEXT: ret i32 -1
+}
+
+; (x | y) & ((~x) ^ y) -> (x & y)
+define i32 @test12(i32 %x, i32 %y) {
+ %or = or i32 %x, %y
+ %neg = xor i32 %x, -1
+ %xor = xor i32 %neg, %y
+ %and = and i32 %or, %xor
+ ret i32 %and
+; CHECK-LABEL: @test12(
+; CHECK-NEXT: %and = and i32 %x, %y
+; CHECK-NEXT: ret i32 %and
+}
+
+; ((~x) ^ y) & (x | y) -> (x & y)
+define i32 @test13(i32 %x, i32 %y) {
+ %neg = xor i32 %x, -1
+ %xor = xor i32 %neg, %y
+ %or = or i32 %x, %y
+ %and = and i32 %xor, %or
+ ret i32 %and
+; CHECK-LABEL: @test13(
+; CHECK-NEXT: %and = and i32 %x, %y
+; CHECK-NEXT: ret i32 %and
+}
+
+; ((x | y) ^ (x ^ y)) -> (x & y)
+define i32 @test15(i32 %x, i32 %y) {
+  %1 = xor i32 %y, %x
+  %2 = or i32 %y, %x
+  %3 = xor i32 %2, %1
+  ret i32 %3
+; CHECK-LABEL: @test15(
+; CHECK-NEXT: %1 = and i32 %y, %x
+; CHECK-NEXT: ret i32 %1
+}
+
+; ((x | ~y) ^ (~x | y)) -> x ^ y
+define i32 @test16(i32 %x, i32 %y) {
+  %noty = xor i32 %y, -1
+  %notx = xor i32 %x, -1
+  %or1 = or i32 %x, %noty
+  %or2 = or i32 %notx, %y
+  %xor = xor i32 %or1, %or2
+  ret i32 %xor
+; CHECK-LABEL: @test16(
+; CHECK-NEXT: %xor = xor i32 %x, %y
+; CHECK-NEXT: ret i32 %xor
+}
+
+; ((x & ~y) ^ (~x & y)) -> x ^ y
+define i32 @test17(i32 %x, i32 %y) {
+  %noty = xor i32 %y, -1
+  %notx = xor i32 %x, -1
+  %and1 = and i32 %x, %noty
+  %and2 = and i32 %notx, %y
+  %xor = xor i32 %and1, %and2
+  ret i32 %xor
+; CHECK-LABEL: @test17(
+; CHECK-NEXT: %xor = xor i32 %x, %y
+; CHECK-NEXT: ret i32 %xor
+}
+
+define i32 @test18(i32 %a, i32 %b) {
+  %or = xor i32 %a, %b
+  %and1 = and i32 %or, 1
+  %and2 = and i32 %b, -2
+  %xor = or i32 %and1, %and2
+  ret i32 %xor
+}

diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll
index 1cd897e..23dad21 100644
--- a/test/Transforms/InstCombine/or.ll
+++ b/test/Transforms/InstCombine/or.ll

@@ -408,3 +408,101 @@
   %or = or i32 %x, %sext
   ret i32 %or
 }
+
+define i32 @test39(i32 %a, i32 %b) {
+; CHECK-LABEL: test39(
+; CHECK-NEXT: %or = or i32 %a, %b
+ %xor = xor i32 %a, -1
+ %and = and i32 %xor, %b
+ %or = or i32 %and, %a
+ ret i32 %or
+}
+
+define i32 @test40(i32 %a, i32 %b) {
+; CHECK-LABEL: test40(
+; CHECK-NEXT:   %1 = xor i32 %a, -1 
+; CHECK-NEXT: %or = or i32 %1, %b
+ %and = and i32 %a, %b
+ %xor = xor i32 %a, -1
+ %or = or i32 %and, %xor
+ ret i32 %or
+}
+
+define i32 @test41(i32 %a, i32 %b) {
+; CHECK-LABEL: test41(
+; CHECK-NEXT: %1 = xor i32 %a, -1
+; CHECK-NEXT: %or = xor i32 %1, %b
+ %and = and i32 %a, %b
+ %nega = xor i32 %a, -1
+ %xor = xor i32 %nega, %b
+ %or = or i32 %and, %xor
+ ret i32 %or
+}
+
+define i32 @test42(i32 %a, i32 %b) {
+; CHECK-LABEL: test42(
+; CHECK-NEXT: %1 = xor i32 %a, -1
+; CHECK-NEXT: %or = xor i32 %1, %b
+ %nega = xor i32 %a, -1
+ %xor = xor i32 %nega, %b
+ %and = and i32 %a, %b
+ %or = or i32 %xor, %and
+ ret i32 %or
+}
+
+define i32 @test43(i32 %a, i32 %b) {
+; CHECK-LABEL: test43(
+; CHECK-NEXT: %or = xor i32 %a, %b
+ %neg = xor i32 %b, -1
+ %and = and i32 %a, %neg
+ %xor = xor i32 %a, %b
+ %or = or i32 %and, %xor
+ ret i32 %or
+}
+
+define i32 @test44(i32 %a, i32 %b) {
+; CHECK-LABEL: test44(
+; CHECK-NEXT: %or = xor i32 %a, %b
+ %xor = xor i32 %a, %b
+ %neg = xor i32 %b, -1
+ %and = and i32 %a, %neg
+ %or = or i32 %xor, %and
+ ret i32 %or
+}
+
+define i32 @test45(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: test45(
+; CHECK-NEXT: %1 = and i32 %x, %z
+; CHECK-NEXT: %or1 = or i32 %1, %y
+; CHECK-NEXT: ret i32 %or1
+  %or = or i32 %y, %z
+  %and = and i32 %x, %or
+  %or1 = or i32 %and, %y
+  ret i32 %or1
+}
+
+define i1 @test46(i8 signext %c)  {
+  %c.off = add i8 %c, -97
+  %cmp1 = icmp ult i8 %c.off, 26
+  %c.off17 = add i8 %c, -65
+  %cmp2 = icmp ult i8 %c.off17, 26
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+; CHECK-LABEL: @test46(
+; CHECK-NEXT:  and i8 %c, -33
+; CHECK-NEXT:  add i8 %1, -65
+; CHECK-NEXT:  icmp ult i8 %2, 26
+}
+
+define i1 @test47(i8 signext %c)  {
+  %c.off = add i8 %c, -65
+  %cmp1 = icmp ule i8 %c.off, 26
+  %c.off17 = add i8 %c, -97
+  %cmp2 = icmp ule i8 %c.off17, 26
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+; CHECK-LABEL: @test47(
+; CHECK-NEXT:  and i8 %c, -33
+; CHECK-NEXT:  add i8 %1, -65
+; CHECK-NEXT:  icmp ult i8 %2, 27
+}

diff --git a/test/Transforms/InstCombine/overflow-mul.ll b/test/Transforms/InstCombine/overflow-mul.ll
index cbb2f5f..6d8d40b 100644
--- a/test/Transforms/InstCombine/overflow-mul.ll
+++ b/test/Transforms/InstCombine/overflow-mul.ll

@@ -173,3 +173,16 @@
   %vcgez.i = sext <4 x i1> %tmp to <4 x i32>
   ret <4 x i32> %vcgez.i
 }
+
+@pr21445_data = external global i32
+define i1 @pr21445(i8 %a) {
+; CHECK-LABEL: @pr21445(
+; CHECK-NEXT:  %[[umul:.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %a, i8 ptrtoint (i32* @pr21445_data to i8))
+; CHECK-NEXT:  %[[cmp:.*]] = extractvalue { i8, i1 } %[[umul]], 1
+; CHECK-NEXT:  ret i1 %[[cmp]]
+  %ext = zext i8 %a to i32
+  %mul = mul i32 %ext, zext (i8 ptrtoint (i32* @pr21445_data to i8) to i32)
+  %and = and i32 %mul, 255
+  %cmp = icmp ne i32 %mul, %and
+  ret i1 %cmp
+}

diff --git a/test/Transforms/InstCombine/pr12338.ll b/test/Transforms/InstCombine/pr12338.ll
index d34600f..614387a 100644
--- a/test/Transforms/InstCombine/pr12338.ll
+++ b/test/Transforms/InstCombine/pr12338.ll

@@ -6,7 +6,6 @@
 
 for.cond:
   %local = phi <1 x i32> [ <i32 0>, %entry ], [ %phi2, %cond.end47 ]
-; CHECK: sub <1 x i32> <i32 92>, %local
   %phi3 = sub <1 x i32> zeroinitializer, %local
   br label %cond.end
 
@@ -19,6 +18,7 @@
 
 cond.end47:
   %sum = add <1 x i32> %cond, <i32 92>
+; CHECK: sub <1 x i32> <i32 -92>, %cond
   %phi2 = sub <1 x i32> zeroinitializer, %sum
   br label %for.cond
 }

diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index d625f3b..6cf9f0f 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll

@@ -1,7 +1,8 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
 ; This test makes sure that these instructions are properly eliminated.
 ; PR1822
 
-; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-p:64:64-p1:16:16-p2:32:32:32-p3:64:64:64"
 
 define i32 @test1(i32 %A, i32 %B) {
         %C = select i1 false, i32 %A, i32 %B            
@@ -916,9 +917,9 @@
 }
 
 ; CHECK-LABEL: @select_icmp_eq_0_and_1_or_1(
-; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i64 %x, 1
-; CHECK-NEXT: [[ZEXT:%[a-z0-9]+]] = trunc i64 [[AND]] to i32
-; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i32 [[XOR]], %y
+; CHECK-NEXT: [[TRUNC:%.+]] = trunc i64 %x to i32
+; CHECK-NEXT: [[AND:%.+]] = and i32 [[TRUNC]], 1
+; CHECK-NEXT: [[OR:%.+]] = or i32 [[XOR]], %y
 ; CHECK-NEXT: ret i32 [[OR]]
 define i32 @select_icmp_eq_0_and_1_or_1(i64 %x, i32 %y) {
   %and = and i64 %x, 1
@@ -957,11 +958,11 @@
 }
 
 ; CHECK-LABEL: @select_icmp_ne_0_and_1073741824_or_8(
-; CHECK-NEXT: [[LSHR:%[a-z0-9]+]] = lshr i32 %x, 27
-; CHECK-NEXT: [[AND:%[a-z0-9]+]] = and i32 [[LSHR]], 8
-; CHECK-NEXT: [[TRUNC:%[a-z0-9]+]] = trunc i32 [[AND]] to i8
-; CHECK-NEXT: [[XOR:%[a-z0-9]+]] = xor i8 [[TRUNC]], 8
-; CHECK-NEXT: [[OR:%[a-z0-9]+]] = or i8 [[XOR]], %y
+; CHECK-NEXT: [[LSHR:%.+]] = lshr i32 %x, 27
+; CHECK-NEXT: [[TRUNC:%.+]] = trunc i32 [[LSHR]] to i8
+; CHECK-NEXT: [[AND:%.+]] = and i8 [[TRUNC]], 8
+; CHECK-NEXT: [[XOR:%.+]] = xor i8 [[AND]], 8
+; CHECK-NEXT: [[OR:%.+]] = or i8 [[XOR]], %y
 ; CHECK-NEXT: ret i8 [[OR]]
 define i8 @select_icmp_ne_0_and_1073741824_or_8(i32 %x, i8 %y) {
   %and = and i32 %x, 1073741824
@@ -1108,10 +1109,11 @@
   ret i32 %3
 
 ; CHECK-LABEL: @test65(
-; CHECK: and i64 %x, 16
-; CHECK: trunc i64 %1 to i32
-; CHECK: lshr exact i32 %2, 3
-; CHECK: xor i32 %3, 42
+; CHECK: %[[TRUNC:.*]] = trunc i64 %x to i32
+; CHECK: %[[LSHR:.*]] = lshr i32 %[[TRUNC]], 3
+; CHECK: %[[AND:.*]] = and i32 %[[LSHR]], 2
+; CHECK: %[[XOR:.*]] = xor i32 %[[AND]], 42
+; CHECK: ret i32 %[[XOR]]
 }
 
 define i32 @test66(i64 %x) {
@@ -1236,3 +1238,150 @@
 ; CHECK-NEXT: [[SEL:%[a-z0-9]+]] = select i1 [[CMP]], i32 68, i32 %x
 ; CHECK-NEXT: ret i32 [[SEL]]
 }
+
+@under_aligned = external global i32, align 1
+
+define i32 @test76(i1 %flag, i32* %x) {
+; The load here must not be speculated around the select. One side of the
+; select is trivially dereferenceable but may have a lower alignment than the
+; load does.
+; CHECK-LABEL: @test76(
+; CHECK: store i32 0, i32* %x
+; CHECK: %[[P:.*]] = select i1 %flag, i32* @under_aligned, i32* %x
+; CHECK: load i32* %[[P]]
+
+  store i32 0, i32* %x
+  %p = select i1 %flag, i32* @under_aligned, i32* %x
+  %v = load i32* %p
+  ret i32 %v
+}
+
+declare void @scribble_on_memory(i32*)
+
+define i32 @test77(i1 %flag, i32* %x) {
+; The load here must not be speculated around the select. One side of the
+; select is trivially dereferenceable but may have a lower alignment than the
+; load does.
+; CHECK-LABEL: @test77(
+; CHECK: %[[A:.*]] = alloca i32, align 1
+; CHECK: call void @scribble_on_memory(i32* %[[A]])
+; CHECK: store i32 0, i32* %x
+; CHECK: %[[P:.*]] = select i1 %flag, i32* %[[A]], i32* %x
+; CHECK: load i32* %[[P]]
+
+  %under_aligned = alloca i32, align 1
+  call void @scribble_on_memory(i32* %under_aligned)
+  store i32 0, i32* %x
+  %p = select i1 %flag, i32* %under_aligned, i32* %x
+  %v = load i32* %p
+  ret i32 %v
+}
+
+define i32 @test78(i1 %flag, i32* %x, i32* %y, i32* %z) {
+; Test that we can speculate the loads around the select even when we can't
+; fold the load completely away.
+; CHECK-LABEL: @test78(
+; CHECK:         %[[V1:.*]] = load i32* %x
+; CHECK-NEXT:    %[[V2:.*]] = load i32* %y
+; CHECK-NEXT:    %[[S:.*]] = select i1 %flag, i32 %[[V1]], i32 %[[V2]]
+; CHECK-NEXT:    ret i32 %[[S]]
+entry:
+  store i32 0, i32* %x
+  store i32 0, i32* %y
+  ; Block forwarding by storing to %z which could alias either %x or %y.
+  store i32 42, i32* %z
+  %p = select i1 %flag, i32* %x, i32* %y
+  %v = load i32* %p
+  ret i32 %v
+}
+
+define float @test79(i1 %flag, float* %x, i32* %y, i32* %z) {
+; Test that we can speculate the loads around the select even when we can't
+; fold the load completely away.
+; CHECK-LABEL: @test79(
+; CHECK:         %[[V1:.*]] = load float* %x
+; CHECK-NEXT:    %[[V2:.*]] = load float* %y
+; CHECK-NEXT:    %[[S:.*]] = select i1 %flag, float %[[V1]], float %[[V2]]
+; CHECK-NEXT:    ret float %[[S]]
+entry:
+  %x1 = bitcast float* %x to i32*
+  %y1 = bitcast i32* %y to float*
+  store i32 0, i32* %x1
+  store i32 0, i32* %y
+  ; Block forwarding by storing to %z which could alias either %x or %y.
+  store i32 42, i32* %z
+  %p = select i1 %flag, float* %x, float* %y1
+  %v = load float* %p
+  ret float %v
+}
+
+define i32 @test80(i1 %flag) {
+; Test that when we speculate the loads around the select they fold through
+; load->load folding and load->store folding.
+; CHECK-LABEL: @test80(
+; CHECK:         %[[X:.*]] = alloca i32
+; CHECK-NEXT:    %[[Y:.*]] = alloca i32
+; CHECK:         %[[V:.*]] = load i32* %[[X]]
+; CHECK-NEXT:    store i32 %[[V]], i32* %[[Y]]
+; CHECK-NEXT:    ret i32 %[[V]]
+entry:
+  %x = alloca i32
+  %y = alloca i32
+  call void @scribble_on_memory(i32* %x)
+  call void @scribble_on_memory(i32* %y)
+  %tmp = load i32* %x
+  store i32 %tmp, i32* %y
+  %p = select i1 %flag, i32* %x, i32* %y
+  %v = load i32* %p
+  ret i32 %v
+}
+
+define float @test81(i1 %flag) {
+; Test that we can speculate the load around the select even though they use
+; differently typed pointers.
+; CHECK-LABEL: @test81(
+; CHECK:         %[[X:.*]] = alloca i32
+; CHECK-NEXT:    %[[Y:.*]] = alloca i32
+; CHECK:         %[[V:.*]] = load i32* %[[X]]
+; CHECK-NEXT:    store i32 %[[V]], i32* %[[Y]]
+; CHECK-NEXT:    %[[C:.*]] = bitcast i32 %[[V]] to float
+; CHECK-NEXT:    ret float %[[C]]
+entry:
+  %x = alloca float
+  %y = alloca i32
+  %x1 = bitcast float* %x to i32*
+  %y1 = bitcast i32* %y to float*
+  call void @scribble_on_memory(i32* %x1)
+  call void @scribble_on_memory(i32* %y)
+  %tmp = load i32* %x1
+  store i32 %tmp, i32* %y
+  %p = select i1 %flag, float* %x, float* %y1
+  %v = load float* %p
+  ret float %v
+}
+
+define i32 @test82(i1 %flag) {
+; Test that we can speculate the load around the select even though they use
+; differently typed pointers.
+; CHECK-LABEL: @test82(
+; CHECK:         %[[X:.*]] = alloca float
+; CHECK-NEXT:    %[[Y:.*]] = alloca i32
+; CHECK-NEXT:    %[[X1:.*]] = bitcast float* %[[X]] to i32*
+; CHECK-NEXT:    %[[Y1:.*]] = bitcast i32* %[[Y]] to float*
+; CHECK:         %[[V:.*]] = load float* %[[X]]
+; CHECK-NEXT:    store float %[[V]], float* %[[Y1]]
+; CHECK-NEXT:    %[[C:.*]] = bitcast float %[[V]] to i32
+; CHECK-NEXT:    ret i32 %[[C]]
+entry:
+  %x = alloca float
+  %y = alloca i32
+  %x1 = bitcast float* %x to i32*
+  %y1 = bitcast i32* %y to float*
+  call void @scribble_on_memory(i32* %x1)
+  call void @scribble_on_memory(i32* %y)
+  %tmp = load float* %x
+  store float %tmp, float* %y1
+  %p = select i1 %flag, i32* %x1, i32* %y
+  %v = load i32* %p
+  ret i32 %v
+}

diff --git a/test/Transforms/InstCombine/strcmp-1.ll b/test/Transforms/InstCombine/strcmp-1.ll
index fc58ffc..9bbd7db 100644
--- a/test/Transforms/InstCombine/strcmp-1.ll
+++ b/test/Transforms/InstCombine/strcmp-1.ll

@@ -15,7 +15,7 @@
 ; CHECK-LABEL: @test1(
 ; CHECK: %strcmpload = load i8* %str
 ; CHECK: %1 = zext i8 %strcmpload to i32
-; CHECK: %2 = sub i32 0, %1
+; CHECK: %2 = sub nsw i32 0, %1
 ; CHECK: ret i32 %2
 
   %str1 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0

diff --git a/test/Transforms/InstCombine/strncmp-1.ll b/test/Transforms/InstCombine/strncmp-1.ll
index df30dd1..49b0955 100644
--- a/test/Transforms/InstCombine/strncmp-1.ll
+++ b/test/Transforms/InstCombine/strncmp-1.ll

@@ -15,7 +15,7 @@
 ; CHECK-LABEL: @test1(
 ; CHECK: %strcmpload = load i8* %str
 ; CHECK: %1 = zext i8 %strcmpload to i32
-; CHECK: %2 = sub i32 0, %1
+; CHECK: %2 = sub nsw i32 0, %1
 ; CHECK: ret i32 %2
 
   %str1 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
@@ -73,7 +73,7 @@
 ; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
 ; CHECK: [[LOAD2:%[a-z]+]] = load i8* %str2, align 1
 ; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
-; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+; CHECK: [[RET:%[a-z]+]] = sub nsw i32 [[ZEXT1]], [[ZEXT2]]
 ; CHECK: ret i32 [[RET]]
 
   %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 1)

diff --git a/test/Transforms/InstCombine/sub-xor.ll b/test/Transforms/InstCombine/sub-xor.ll
index e7aff00..3a24074 100644
--- a/test/Transforms/InstCombine/sub-xor.ll
+++ b/test/Transforms/InstCombine/sub-xor.ll

@@ -32,7 +32,7 @@
 
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT: and i32 %x, 31
-; CHECK-NEXT: sub i32 73, %and
+; CHECK-NEXT: sub nsw i32 73, %and
 ; CHECK-NEXT: ret
 }
 

diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll
index 67b7c49..0e421f7 100644
--- a/test/Transforms/InstCombine/sub.ll
+++ b/test/Transforms/InstCombine/sub.ll

@@ -142,8 +142,9 @@
 	%D = srem i32 %B, %C
 	ret i32 %D
 ; CHECK-LABEL: @test15(
-; CHECK: %D = srem i32 %B, %A
-; CHECK: ret i32 %D
+; CHECK:      %[[sub:.*]] = sub i32 0, %A
+; CHECK-NEXT: %[[rem:.*]] = srem i32 %B, %[[sub]]
+; CHECK: ret i32 %[[rem]]
 }
 
 define i32 @test16(i32 %A) {
@@ -464,3 +465,88 @@
 ; CHECK-NEXT: [[SEXT:%.*]] = sext i1 [[ICMP]] to i32
 ; CHECK-NEXT: ret i32 [[SEXT]]
 }
+
+define i32 @test39(i32 %A, i32 %x) {
+  %B = sub i32 0, %A
+  %C = sub nsw i32 %x, %B
+  ret i32 %C
+; CHECK-LABEL: @test39(
+; CHECK: %C = add i32 %x, %A
+; CHECK: ret i32 %C
+}
+
+define i16 @test40(i16 %a, i16 %b) {
+  %ashr = ashr i16 %a, 1
+  %ashr1 = ashr i16 %b, 1
+  %sub = sub i16 %ashr, %ashr1
+  ret i16 %sub
+; CHECK-LABEL: @test40(
+; CHECK-NEXT: [[ASHR:%.*]] = ashr i16 %a, 1
+; CHECK-NEXT: [[ASHR1:%.*]] = ashr i16 %b, 1
+; CHECK-NEXT: [[RET:%.*]] = sub nsw i16 [[ASHR]], [[ASHR1]]
+; CHECK: ret i16 [[RET]]
+}
+
+define i32 @test41(i16 %a, i16 %b) {
+  %conv = sext i16 %a to i32
+  %conv1 = sext i16 %b to i32
+  %sub = sub i32 %conv, %conv1
+  ret i32 %sub
+; CHECK-LABEL: @test41(
+; CHECK-NEXT: [[SEXT:%.*]] = sext i16 %a to i32
+; CHECK-NEXT: [[SEXT1:%.*]] = sext i16 %b to i32
+; CHECK-NEXT: [[RET:%.*]] = sub nsw i32 [[SEXT]], [[SEXT1]]
+; CHECK: ret i32 [[RET]]
+}
+
+define i4 @test42(i4 %x, i4 %y) {
+  %a = and i4 %y, 7
+  %b = and i4 %x, 7
+  %c = sub i4 %a, %b
+  ret i4 %c
+; CHECK-LABEL: @test42(
+; CHECK-NEXT: [[AND:%.*]] = and i4 %y, 7
+; CHECK-NEXT: [[AND1:%.*]] = and i4 %x, 7
+; CHECK-NEXT: [[RET:%.*]] = sub nsw i4 [[AND]], [[AND1]]
+; CHECK: ret i4 [[RET]]
+}
+
+define i4 @test43(i4 %x, i4 %y) {
+  %a = or i4 %x, -8
+  %b = and i4 %y, 7
+  %c = sub i4 %a, %b
+  ret i4 %c
+; CHECK-LABEL: @test43(
+; CHECK-NEXT: [[OR:%.*]] = or i4 %x, -8
+; CHECK-NEXT: [[AND:%.*]] = and i4 %y, 7
+; CHECK-NEXT: [[RET:%.*]] = sub nuw i4 [[OR]], [[AND]]
+; CHECK: ret i4 [[RET]]
+}
+
+define i32 @test44(i32 %x) {
+  %sub = sub nsw i32 %x, 32768
+  ret i32 %sub
+; CHECK-LABEL: @test44(
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 %x, -32768
+; CHECK: ret i32 [[ADD]]
+}
+
+define i32 @test45(i32 %x, i32 %y) {
+  %or = or i32 %x, %y
+  %xor = xor i32 %x, %y
+  %sub = sub i32 %or, %xor
+  ret i32 %sub
+; CHECK-LABEL: @test45(
+; CHECK-NEXT: %sub = and i32 %x, %y
+; CHECK: ret i32 %sub
+}
+
+define i32 @test46(i32 %x, i32 %y) {
+ %or = or i32 %x, %y
+ %sub = sub i32 %or, %x
+ ret i32 %sub
+; CHECK-LABEL: @test46(
+; CHECK-NEXT: %x.not = xor i32 %x, -1
+; CHECK-NEXT: %sub = and i32 %y, %x.not
+; CHECK: ret i32 %sub
+}

diff --git a/test/Transforms/InstCombine/vsx-unaligned.ll b/test/Transforms/InstCombine/vsx-unaligned.ll
new file mode 100644
index 0000000..26e0426
--- /dev/null
+++ b/test/Transforms/InstCombine/vsx-unaligned.ll

@@ -0,0 +1,44 @@
+; Verify that we can create unaligned loads and stores from VSX intrinsics.
+
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+@vf = common global <4 x float> zeroinitializer, align 1
+@res_vf = common global <4 x float> zeroinitializer, align 1
+@vd = common global <2 x double> zeroinitializer, align 1
+@res_vd = common global <2 x double> zeroinitializer, align 1
+
+define void @test1() {
+entry:
+  %t1 = alloca <4 x float>*, align 8
+  %t2 = alloca <2 x double>*, align 8
+  store <4 x float>* @vf, <4 x float>** %t1, align 8
+  %0 = load <4 x float>** %t1, align 8
+  %1 = bitcast <4 x float>* %0 to i8*
+  %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* %1)
+  store <4 x float>* @res_vf, <4 x float>** %t1, align 8
+  %3 = load <4 x float>** %t1, align 8
+  %4 = bitcast <4 x float>* %3 to i8*
+  call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %2, i8* %4)
+  store <2 x double>* @vd, <2 x double>** %t2, align 8
+  %5 = load <2 x double>** %t2, align 8
+  %6 = bitcast <2 x double>* %5 to i8*
+  %7 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* %6)
+  store <2 x double>* @res_vd, <2 x double>** %t2, align 8
+  %8 = load <2 x double>** %t2, align 8
+  %9 = bitcast <2 x double>* %8 to i8*
+  call void @llvm.ppc.vsx.stxvd2x(<2 x double> %7, i8* %9)
+  ret void
+}
+
+; CHECK-LABEL: @test1
+; CHECK: %0 = load <4 x i32>* bitcast (<4 x float>* @vf to <4 x i32>*), align 1
+; CHECK: store <4 x i32> %0, <4 x i32>* bitcast (<4 x float>* @res_vf to <4 x i32>*), align 1
+; CHECK: %1 = load <2 x double>* @vd, align 1
+; CHECK: store <2 x double> %1, <2 x double>* @res_vd, align 1
+
+declare <4 x i32> @llvm.ppc.vsx.lxvw4x(i8*)
+declare void @llvm.ppc.vsx.stxvw4x(<4 x i32>, i8*)
+declare <2 x double> @llvm.ppc.vsx.lxvd2x(i8*)
+declare void @llvm.ppc.vsx.stxvd2x(<2 x double>, i8*)

diff --git a/test/Transforms/InstCombine/xor2.ll b/test/Transforms/InstCombine/xor2.ll
index d153e03..797c8f3 100644
--- a/test/Transforms/InstCombine/xor2.ll
+++ b/test/Transforms/InstCombine/xor2.ll

@@ -82,3 +82,93 @@
 ; CHECK: lshr i32 %x, 16
 ; CHECK: ret
 }
+
+
+; (A | B) ^ (~A) -> (A | ~B)
+define i32 @test7(i32 %a, i32 %b) {
+ %or = or i32 %a, %b
+ %neg = xor i32 %a, -1
+ %xor = xor i32 %or, %neg
+ ret i32 %xor
+; CHECK-LABEL: @test7(
+; CHECK-NEXT: %[[b_not:.*]] = xor i32 %b, -1
+; CHECK-NEXT: %[[or:.*]] = or i32 %a, %[[b_not]]
+; CHECK-NEXT: ret i32 %[[or]]
+}
+
+; (~A) ^ (A | B) -> (A | ~B)
+define i32 @test8(i32 %a, i32 %b) {
+ %neg = xor i32 %a, -1
+ %or = or i32 %a, %b
+ %xor = xor i32 %neg, %or
+ ret i32 %xor
+; CHECK-LABEL: @test8(
+; CHECK-NEXT: %[[b_not:.*]] = xor i32 %b, -1
+; CHECK-NEXT: %[[or:.*]] = or i32 %a, %[[b_not]]
+; CHECK-NEXT: ret i32 %[[or]]
+}
+
+; (A & B) ^ (A ^ B) -> (A | B)
+define i32 @test9(i32 %b, i32 %c) {
+ %and = and i32 %b, %c
+ %xor = xor i32 %b, %c
+ %xor2 = xor i32 %and, %xor
+ ret i32 %xor2
+; CHECK-LABEL: @test9(
+; CHECK-NEXT: %xor2 = or i32 %b, %c
+}
+
+; (A ^ B) ^ (A & B) -> (A | B)
+define i32 @test10(i32 %b, i32 %c) {
+ %xor = xor i32 %b, %c
+ %and = and i32 %b, %c
+ %xor2 = xor i32 %xor, %and
+ ret i32 %xor2
+; CHECK-LABEL: @test10(
+; CHECK-NEXT: %xor2 = or i32 %b, %c
+}
+
+define i32 @test11(i32 %A, i32 %B) {
+  %xor1 = xor i32 %B, %A
+  %not = xor i32 %A, -1
+  %xor2 = xor i32 %not, %B
+  %and = and i32 %xor1, %xor2
+  ret i32 %and
+; CHECK-LABEL: @test11(
+; CHECK-NEXT: ret i32 0
+}
+
+define i32 @test12(i32 %a, i32 %b) {
+ %negb = xor i32 %b, -1
+ %and = and i32 %a, %negb
+ %nega = xor i32 %a, -1
+ %xor = xor i32 %and, %nega
+ ret i32 %xor
+; CHECK-LABEL: @test12(
+; CHECK-NEXT: %1 = and i32 %a, %b
+; CHECK-NEXT: %xor = xor i32 %1, -1
+}
+
+define i32 @test13(i32 %a, i32 %b) {
+ %nega = xor i32 %a, -1
+ %negb = xor i32 %b, -1
+ %and = and i32 %a, %negb
+ %xor = xor i32 %nega, %and
+ ret i32 %xor
+; CHECK-LABEL: @test13(
+; CHECK-NEXT: %1 = and i32 %a, %b
+; CHECK-NEXT: %xor = xor i32 %1, -1
+}
+
+; (A ^ C) ^ (A | B) -> ((~A) & B) ^ C
+define i32 @test14(i32 %a, i32 %b, i32 %c) {
+ %neg = xor i32 %a, %c
+ %or = or i32 %a, %b
+ %xor = xor i32 %neg, %or
+ ret i32 %xor
+; CHECK-LABEL: @test14(
+; CHECK-NEXT: %[[not:.*]] = xor i32 %a, -1
+; CHECK-NEXT: %[[and:.*]] = and i32 %[[not]], %b
+; CHECK-NEXT: %[[xor:.*]] = xor i32 %[[and]], %c
+; CHECK-NEXT: ret i32 %[[xor]]
+}

diff --git a/test/Transforms/InstMerge/ld_hoist1.ll b/test/Transforms/InstMerge/ld_hoist1.ll
new file mode 100644
index 0000000..715f1b8
--- /dev/null
+++ b/test/Transforms/InstMerge/ld_hoist1.ll

@@ -0,0 +1,64 @@
+; Test load hoist
+; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc_linux"
+
+; Function Attrs: nounwind uwtable
+define float* @foo(i32* noalias nocapture readonly %in, float* noalias %out, i32 %size, i32* nocapture readonly %trigger)  {
+entry:
+  %cmp11 = icmp eq i32 %size, 0
+  br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = add i32 %size, -1
+  br label %for.body
+
+; CHECK-LABEL: for.body
+; CHECK: load
+; CHECK:  %2 = getelementptr inbounds i32* %in, i64 %indvars.iv
+; CHECK:  %3 = load i32* %2, align 4
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i32* %trigger, i64 %indvars.iv
+  %1 = load i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %1, 0
+  br i1 %cmp1, label %if.then, label %if.else
+
+; CHECK-LABEL: if.then
+if.then:                                          ; preds = %for.body
+; This load should be hoisted
+  %arrayidx3 = getelementptr inbounds i32* %in, i64 %indvars.iv
+  %2 = load i32* %arrayidx3, align 4
+  %conv = sitofp i32 %2 to float
+  %add = fadd float %conv, 5.000000e-01
+  %arrayidx5 = getelementptr inbounds float* %out, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %arrayidx7 = getelementptr inbounds float* %out, i64 %indvars.iv
+  %3 = load float* %arrayidx7, align 4
+  %div = fdiv float %3, 3.000000e+00
+  store float %div, float* %arrayidx7, align 4
+; This load should be hoisted in spite of store 
+  %arrayidx9 = getelementptr inbounds i32* %in, i64 %indvars.iv
+  %4 = load i32* %arrayidx9, align 4
+  %conv10 = sitofp i32 %4 to float
+  %add13 = fadd float %div, %conv10
+  store float %add13, float* %arrayidx7, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.else
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %0
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.inc
+  br label %for.end
+
+for.end:                                          ; preds = %entry, %for.cond.for.end_crit_edge
+  ret float* %out
+}
+

diff --git a/test/Transforms/InstMerge/ld_hoist_st_sink.ll b/test/Transforms/InstMerge/ld_hoist_st_sink.ll
new file mode 100644
index 0000000..978160a
--- /dev/null
+++ b/test/Transforms/InstMerge/ld_hoist_st_sink.ll

@@ -0,0 +1,84 @@
+; Tests to make sure that loads and stores in a diamond get merged
+; Loads are hoisted into the header. Stores are sunk into the footer.
+; RUN: opt -basicaa -memdep -mldst-motion -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+%struct.node = type { i64, %struct.node*, %struct.node*, %struct.node*, i64, %struct.arc*, i64, i64, i64 }
+%struct.arc = type { i64, i64, i64 }
+
+define i64 @foo(%struct.node* nocapture readonly %r) nounwind {
+entry:
+  %node.0.in16 = getelementptr inbounds %struct.node* %r, i64 0, i32 2
+  %node.017 = load %struct.node** %node.0.in16, align 8
+  %tobool18 = icmp eq %struct.node* %node.017, null
+  br i1 %tobool18, label %while.end, label %while.body.preheader
+
+; CHECK-LABEL: while.body.preheader
+while.body.preheader:                             ; preds = %entry
+; CHECK: load
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %if.end
+  %node.020 = phi %struct.node* [ %node.0, %if.end ], [ %node.017, %while.body.preheader ]
+  %sum.019 = phi i64 [ %inc, %if.end ], [ 0, %while.body.preheader ]
+  %orientation = getelementptr inbounds %struct.node* %node.020, i64 0, i32 4
+  %0 = load i64* %orientation, align 8
+  %cmp = icmp eq i64 %0, 1
+  br i1 %cmp, label %if.then, label %if.else
+; CHECK: if.then
+if.then:                                          ; preds = %while.body
+  %a = getelementptr inbounds %struct.node* %node.020, i64 0, i32 5
+; CHECK-NOT: load %struct.arc
+  %1 = load %struct.arc** %a, align 8
+  %cost = getelementptr inbounds %struct.arc* %1, i64 0, i32 0
+; CHECK-NOT: load i64*
+  %2 = load i64* %cost, align 8
+  %pred = getelementptr inbounds %struct.node* %node.020, i64 0, i32 1
+; CHECK-NOT: load %struct.node**
+  %3 = load %struct.node** %pred, align 8
+  %p = getelementptr inbounds %struct.node* %3, i64 0, i32 6
+; CHECK-NOT: load i64*
+  %4 = load i64* %p, align 8
+  %add = add nsw i64 %4, %2
+  %p1 = getelementptr inbounds %struct.node* %node.020, i64 0, i32 6
+; CHECK-NOT: store i64
+  store i64 %add, i64* %p1, align 8
+  br label %if.end
+
+; CHECK: if.else
+if.else:                                          ; preds = %while.body
+  %pred2 = getelementptr inbounds %struct.node* %node.020, i64 0, i32 1
+; CHECK-NOT: load %struct.node**
+  %5 = load %struct.node** %pred2, align 8
+  %p3 = getelementptr inbounds %struct.node* %5, i64 0, i32 6
+; CHECK-NOT: load i64*
+  %6 = load i64* %p3, align 8
+  %a4 = getelementptr inbounds %struct.node* %node.020, i64 0, i32 5
+; CHECK-NOT: load %struct.arc**
+  %7 = load %struct.arc** %a4, align 8
+  %cost5 = getelementptr inbounds %struct.arc* %7, i64 0, i32 0
+; CHECK-NOT: load i64*
+  %8 = load i64* %cost5, align 8
+  %sub = sub nsw i64 %6, %8
+  %p6 = getelementptr inbounds %struct.node* %node.020, i64 0, i32 6
+; CHECK-NOT: store i64
+  store i64 %sub, i64* %p6, align 8
+  br label %if.end
+
+; CHECK: if.end
+if.end:                                           ; preds = %if.else, %if.then
+; CHECK: store
+  %inc = add nsw i64 %sum.019, 1
+  %node.0.in = getelementptr inbounds %struct.node* %node.020, i64 0, i32 2
+  %node.0 = load %struct.node** %node.0.in, align 8
+  %tobool = icmp eq %struct.node* %node.0, null
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %if.end
+  %inc.lcssa = phi i64 [ %inc, %if.end ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+  ret i64 %sum.0.lcssa
+}

diff --git a/test/Transforms/InstSimplify/AndOrXor.ll b/test/Transforms/InstSimplify/AndOrXor.ll
index c59d6c9..8ed06e8 100644
--- a/test/Transforms/InstSimplify/AndOrXor.ll
+++ b/test/Transforms/InstSimplify/AndOrXor.ll

@@ -20,3 +20,131 @@
   ret i64 %e2
 ; CHECK: ret i64 %e
 }
+
+define i32 @sub_neg_nuw(i32 %x, i32 %y) {
+; CHECK-LABEL: @sub_neg_nuw(
+  %neg = sub nuw i32 0, %y
+  %sub = sub i32 %x, %neg
+  ret i32 %sub
+; CHECK: ret i32 %x
+}
+
+define i1 @and_of_icmps0(i32 %b) {
+; CHECK-LABEL: @and_of_icmps0(
+  %1 = add i32 %b, 2
+  %2 = icmp ult i32 %1, 4
+  %cmp3 = icmp sgt i32 %b, 2
+  %cmp = and i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 false
+}
+
+define i1 @and_of_icmps1(i32 %b) {
+; CHECK-LABEL: @and_of_icmps1(
+  %1 = add nsw i32 %b, 2
+  %2 = icmp slt i32 %1, 4
+  %cmp3 = icmp sgt i32 %b, 2
+  %cmp = and i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 false
+}
+
+define i1 @and_of_icmps2(i32 %b) {
+; CHECK-LABEL: @and_of_icmps2(
+  %1 = add i32 %b, 2
+  %2 = icmp ule i32 %1, 3
+  %cmp3 = icmp sgt i32 %b, 2
+  %cmp = and i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 false
+}
+
+define i1 @and_of_icmps3(i32 %b) {
+; CHECK-LABEL: @and_of_icmps3(
+  %1 = add nsw i32 %b, 2
+  %2 = icmp sle i32 %1, 3
+  %cmp3 = icmp sgt i32 %b, 2
+  %cmp = and i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 false
+}
+
+define i1 @and_of_icmps4(i32 %b) {
+; CHECK-LABEL: @and_of_icmps4(
+  %1 = add nuw i32 %b, 2
+  %2 = icmp ult i32 %1, 4
+  %cmp3 = icmp ugt i32 %b, 2
+  %cmp = and i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 false
+}
+
+define i1 @and_of_icmps5(i32 %b) {
+; CHECK-LABEL: @and_of_icmps5(
+  %1 = add nuw i32 %b, 2
+  %2 = icmp ule i32 %1, 3
+  %cmp3 = icmp ugt i32 %b, 2
+  %cmp = and i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 false
+}
+
+define i1 @or_of_icmps0(i32 %b) {
+; CHECK-LABEL: @or_of_icmps0(
+  %1 = add i32 %b, 2
+  %2 = icmp uge i32 %1, 4
+  %cmp3 = icmp sle i32 %b, 2
+  %cmp = or i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 true
+}
+
+define i1 @or_of_icmps1(i32 %b) {
+; CHECK-LABEL: @or_of_icmps1(
+  %1 = add nsw i32 %b, 2
+  %2 = icmp sge i32 %1, 4
+  %cmp3 = icmp sle i32 %b, 2
+  %cmp = or i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 true
+}
+
+define i1 @or_of_icmps2(i32 %b) {
+; CHECK-LABEL: @or_of_icmps2(
+  %1 = add i32 %b, 2
+  %2 = icmp ugt i32 %1, 3
+  %cmp3 = icmp sle i32 %b, 2
+  %cmp = or i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 true
+}
+
+define i1 @or_of_icmps3(i32 %b) {
+; CHECK-LABEL: @or_of_icmps3(
+  %1 = add nsw i32 %b, 2
+  %2 = icmp sgt i32 %1, 3
+  %cmp3 = icmp sle i32 %b, 2
+  %cmp = or i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 true
+}
+
+define i1 @or_of_icmps4(i32 %b) {
+; CHECK-LABEL: @or_of_icmps4(
+  %1 = add nuw i32 %b, 2
+  %2 = icmp uge i32 %1, 4
+  %cmp3 = icmp ule i32 %b, 2
+  %cmp = or i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 true
+}
+
+define i1 @or_of_icmps5(i32 %b) {
+; CHECK-LABEL: @or_of_icmps5(
+  %1 = add nuw i32 %b, 2
+  %2 = icmp ugt i32 %1, 3
+  %cmp3 = icmp ule i32 %b, 2
+  %cmp = or i1 %2, %cmp3
+  ret i1 %cmp
+; CHECK: ret i1 true
+}

diff --git a/test/Transforms/InstSimplify/assume.ll b/test/Transforms/InstSimplify/assume.ll
new file mode 100644
index 0000000..4dd0a8f
--- /dev/null
+++ b/test/Transforms/InstSimplify/assume.ll

@@ -0,0 +1,13 @@
+; RUN: opt -instsimplify -S < %s | FileCheck %s
+
+define void @test1() {
+  call void @llvm.assume(i1 1)
+  ret void
+
+; CHECK-LABEL: @test1
+; CHECK-NOT: llvm.assume
+; CHECK: ret void
+}
+
+declare void @llvm.assume(i1) nounwind
+

diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll
index 7d0cd9c..38fd747 100644
--- a/test/Transforms/InstSimplify/compare.ll
+++ b/test/Transforms/InstSimplify/compare.ll

@@ -333,14 +333,46 @@
 ; CHECK: ret i1 false
 }
 
-define i1 @shl(i32 %x) {
-; CHECK-LABEL: @shl(
+define i1 @shl1(i32 %x) {
+; CHECK-LABEL: @shl1(
   %s = shl i32 1, %x
   %c = icmp eq i32 %s, 0
   ret i1 %c
 ; CHECK: ret i1 false
 }
 
+define i1 @shl2(i32 %X) {
+; CHECK: @shl2
+  %sub = shl nsw i32 -1, %X
+  %cmp = icmp eq i32 %sub, 31
+  ret i1 %cmp
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @shl3(i32 %X) {
+; CHECK: @shl3
+  %sub = shl nuw i32 4, %X
+  %cmp = icmp eq i32 %sub, 31
+  ret i1 %cmp
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @shl4(i32 %X) {
+; CHECK: @shl4
+  %sub = shl nsw i32 -1, %X
+  %cmp = icmp sle i32 %sub, -1
+  ret i1 %cmp
+; CHECK-NEXT: ret i1 true
+}
+
+define i1 @shl5(i32 %X) {
+; CHECK: @shl5
+  %sub = shl nuw i32 4, %X
+  %cmp = icmp ugt i32 %sub, 3
+  ret i1 %cmp
+; CHECK-NEXT: ret i1 true
+}
+
 define i1 @lshr1(i32 %x) {
 ; CHECK-LABEL: @lshr1(
   %s = lshr i32 -1, %x
@@ -874,6 +906,21 @@
 ; CHECK: ret i1 false
 }
 
+define i1 @nonnull_deref_arg(i32* dereferenceable(4) %i) {
+  %cmp = icmp eq i32* %i, null
+  ret i1 %cmp
+; CHECK-LABEL: @nonnull_deref_arg
+; CHECK: ret i1 false
+}
+
+define i1 @nonnull_deref_as_arg(i32 addrspace(1)* dereferenceable(4) %i) {
+  %cmp = icmp eq i32 addrspace(1)* %i, null
+  ret i1 %cmp
+; CHECK-LABEL: @nonnull_deref_as_arg
+; CHECK: icmp
+; CHECK: ret
+}
+
 declare nonnull i32* @returns_nonnull_helper()
 define i1 @returns_nonnull() {
   %call = call nonnull i32* @returns_nonnull_helper()
@@ -883,6 +930,48 @@
 ; CHECK: ret i1 false
 }
 
+declare dereferenceable(4) i32* @returns_nonnull_deref_helper()
+define i1 @returns_nonnull_deref() {
+  %call = call dereferenceable(4) i32* @returns_nonnull_deref_helper()
+  %cmp = icmp eq i32* %call, null
+  ret i1 %cmp
+; CHECK-LABEL: @returns_nonnull_deref
+; CHECK: ret i1 false
+}
+
+declare dereferenceable(4) i32 addrspace(1)* @returns_nonnull_deref_as_helper()
+define i1 @returns_nonnull_as_deref() {
+  %call = call dereferenceable(4) i32 addrspace(1)* @returns_nonnull_deref_as_helper()
+  %cmp = icmp eq i32 addrspace(1)* %call, null
+  ret i1 %cmp
+; CHECK-LABEL: @returns_nonnull_as_deref
+; CHECK: icmp
+; CHECK: ret
+}
+
+define i1 @nonnull_load(i32** %addr) {
+  %ptr = load i32** %addr, !nonnull !{}
+  %cmp = icmp eq i32* %ptr, null
+  ret i1 %cmp
+; CHECK-LABEL: @nonnull_load
+; CHECK: ret i1 false
+}
+
+define i1 @nonnull_load_as_outer(i32* addrspace(1)* %addr) {
+  %ptr = load i32* addrspace(1)* %addr, !nonnull !{}
+  %cmp = icmp eq i32* %ptr, null
+  ret i1 %cmp
+; CHECK-LABEL: @nonnull_load_as_outer
+; CHECK: ret i1 false
+}
+define i1 @nonnull_load_as_inner(i32 addrspace(1)** %addr) {
+  %ptr = load i32 addrspace(1)** %addr, !nonnull !{}
+  %cmp = icmp eq i32 addrspace(1)* %ptr, null
+  ret i1 %cmp
+; CHECK-LABEL: @nonnull_load_as_inner
+; CHECK: ret i1 false
+}
+
 ; If a bit is known to be zero for A and known to be one for B,
 ; then A and B cannot be equal.
 define i1 @icmp_eq_const(i32 %a) nounwind {
@@ -913,3 +1002,101 @@
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[DIV]], -1073741824
 ; CHECK-NEXT: ret i1 [[CMP]]
 }
+
+define i1 @icmp_sdiv_pr20288(i64 %a) {
+   %div = sdiv i64 %a, -8589934592
+   %cmp = icmp ne i64 %div, 1073741824
+   ret i1 %cmp
+
+; CHECK-LABEL: @icmp_sdiv_pr20288
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 %a, -8589934592
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[DIV]], 1073741824
+; CHECK-NEXT: ret i1 [[CMP]]
+}
+
+define i1 @icmp_sdiv_neg1(i64 %a) {
+ %div = sdiv i64 %a, -1
+ %cmp = icmp ne i64 %div, 1073741824
+ ret i1 %cmp
+
+; CHECK-LABEL: @icmp_sdiv_neg1
+; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 %a, -1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[DIV]], 1073741824
+; CHECK-NEXT: ret i1 [[CMP]]
+}
+
+define i1 @icmp_known_bits(i4 %x, i4 %y) {
+  %and1 = and i4 %y, -7
+  %and2 = and i4 %x, -7
+  %or1 = or i4 %and1, 2
+  %or2 = or i4 %and2, 2
+  %add = add i4 %or1, %or2
+  %cmp = icmp eq i4 %add, 0
+  ret i1 %cmp
+
+; CHECK-LABEL: @icmp_known_bits
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @icmp_shl_nuw_1(i64 %a) {
+ %shl = shl nuw i64 1, %a
+ %cmp = icmp ne i64 %shl, 0
+ ret i1 %cmp
+
+; CHECK-LABEL: @icmp_shl_nuw_1
+; CHECK-NEXT: ret i1 true
+}
+
+define i1 @icmp_shl_nsw_neg1(i64 %a) {
+ %shl = shl nsw i64 -1, %a
+ %cmp = icmp sge i64 %shl, 3
+ ret i1 %cmp
+
+; CHECK-LABEL: @icmp_shl_nsw_neg1
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @icmp_shl_nsw_1(i64 %a) {
+ %shl = shl nsw i64 1, %a
+ %cmp = icmp sge i64 %shl, 0
+ ret i1 %cmp
+
+; CHECK-LABEL: @icmp_shl_nsw_1
+; CHECK-NEXT: ret i1 true
+}
+
+define i1 @icmp_shl_1_V_ugt_2147483648(i32 %V) {
+  %shl = shl i32 1, %V
+  %cmp = icmp ugt i32 %shl, 2147483648
+  ret i1 %cmp
+
+; CHECK-LABEL: @icmp_shl_1_V_ugt_2147483648(
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @icmp_shl_1_V_ule_2147483648(i32 %V) {
+  %shl = shl i32 1, %V
+  %cmp = icmp ule i32 %shl, 2147483648
+  ret i1 %cmp
+
+; CHECK-LABEL: @icmp_shl_1_V_ule_2147483648(
+; CHECK-NEXT: ret i1 true
+}
+
+define i1 @icmp_shl_1_V_eq_31(i32 %V) {
+  %shl = shl i32 1, %V
+  %cmp = icmp eq i32 %shl, 31
+  ret i1 %cmp
+
+; CHECK-LABEL: @icmp_shl_1_V_eq_31(
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @icmp_shl_1_V_ne_31(i32 %V) {
+  %shl = shl i32 1, %V
+  %cmp = icmp ne i32 %shl, 31
+  ret i1 %cmp
+
+; CHECK-LABEL: @icmp_shl_1_V_ne_31(
+; CHECK-NEXT: ret i1 true
+}

diff --git a/test/Transforms/InstSimplify/exact-nsw-nuw.ll b/test/Transforms/InstSimplify/exact-nsw-nuw.ll
index a0e326b..5ccc808 100644
--- a/test/Transforms/InstSimplify/exact-nsw-nuw.ll
+++ b/test/Transforms/InstSimplify/exact-nsw-nuw.ll

@@ -42,3 +42,19 @@
   %D = ashr i32 %C, %B
   ret i32 %D
 }
+
+; CHECK-LABEL: @div1(
+; CHECK: ret i32 0
+define i32 @div1(i32 %V) {
+  %A = udiv i32 %V, -2147483648
+  %B = udiv i32 %A, -2147483648
+  ret i32 %B
+}
+
+; CHECK-LABEL: @div2(
+; CHECK-NOT: ret i32 0
+define i32 @div2(i32 %V) {
+  %A = sdiv i32 %V, -1
+  %B = sdiv i32 %A, -2147483648
+  ret i32 %B
+}

diff --git a/test/Transforms/InstSimplify/fold-builtin-fma.ll b/test/Transforms/InstSimplify/fold-builtin-fma.ll
new file mode 100644
index 0000000..6331b8c
--- /dev/null
+++ b/test/Transforms/InstSimplify/fold-builtin-fma.ll

@@ -0,0 +1,119 @@
+; RUN: opt -instsimplify -S < %s | FileCheck %s
+
+; Fixes PR20832
+; Make sure that we correctly fold a fused multiply-add where operands
+; are all finite constants and addend is zero.
+
+declare double @llvm.fma.f64(double, double, double)
+
+
+define double @PR20832()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 8.0, double 0.0)
+  ret double %1
+}
+; CHECK-LABEL: @PR20832(
+; CHECK: ret double 5.600000e+01
+
+; Test builtin fma with all finite non-zero constants.
+define double @test_all_finite()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 8.0, double 5.0)
+  ret double %1
+}
+; CHECK-LABEL: @test_all_finite(
+; CHECK: ret double 6.100000e+01
+
+; Test builtin fma with a +/-NaN addend.
+define double @test_NaN_addend()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 8.0, double 0x7FF8000000000000)
+  ret double %1
+}
+; CHECK-LABEL: @test_NaN_addend(
+; CHECK: ret double 0x7FF8000000000000
+
+define double @test_NaN_addend_2()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 8.0, double 0xFFF8000000000000)
+  ret double %1
+}
+; CHECK-LABEL: @test_NaN_addend_2(
+; CHECK: ret double 0xFFF8000000000000
+
+; Test builtin fma with a +/-Inf addend.
+define double @test_Inf_addend()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 8.0, double 0x7FF0000000000000)
+  ret double %1
+}
+; CHECK-LABEL: @test_Inf_addend(
+; CHECK: ret double 0x7FF0000000000000
+
+define double @test_Inf_addend_2()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 8.0, double 0xFFF0000000000000)
+  ret double %1
+}
+; CHECK-LABEL: @test_Inf_addend_2(
+; CHECK: ret double 0xFFF0000000000000
+
+; Test builtin fma with one of the operands to the multiply being +/-NaN.
+define double @test_NaN_1()  {
+  %1 = call double @llvm.fma.f64(double 0x7FF8000000000000, double 8.0, double 0.0)
+  ret double %1
+}
+; CHECK-LABEL: @test_NaN_1(
+; CHECK: ret double 0x7FF8000000000000
+
+
+define double @test_NaN_2()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 0x7FF8000000000000, double 0.0)
+  ret double %1
+}
+; CHECK-LABEL: @test_NaN_2(
+; CHECK: ret double 0x7FF8000000000000
+
+
+define double @test_NaN_3()  {
+  %1 = call double @llvm.fma.f64(double 0xFFF8000000000000, double 8.0, double 0.0)
+  ret double %1
+}
+; CHECK-LABEL: @test_NaN_3(
+; CHECK: ret double 0x7FF8000000000000
+
+
+define double @test_NaN_4()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 0xFFF8000000000000, double 0.0)
+  ret double %1
+}
+; CHECK-LABEL: @test_NaN_4(
+; CHECK: ret double 0x7FF8000000000000
+
+
+; Test builtin fma with one of the operands to the multiply being +/-Inf.
+define double @test_Inf_1()  {
+  %1 = call double @llvm.fma.f64(double 0x7FF0000000000000, double 8.0, double 0.0)
+  ret double %1
+}
+; CHECK-LABEL: @test_Inf_1(
+; CHECK: ret double 0x7FF0000000000000
+
+
+define double @test_Inf_2()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 0x7FF0000000000000, double 0.0)
+  ret double %1
+}
+; CHECK-LABEL: @test_Inf_2(
+; CHECK: ret double 0x7FF0000000000000
+
+
+define double @test_Inf_3()  {
+  %1 = call double @llvm.fma.f64(double 0xFFF0000000000000, double 8.0, double 0.0)
+  ret double %1
+}
+; CHECK-LABEL: @test_Inf_3(
+; CHECK: ret double 0xFFF0000000000000
+
+
+define double @test_Inf_4()  {
+  %1 = call double @llvm.fma.f64(double 7.0, double 0xFFF0000000000000, double 0.0)
+  ret double %1
+}
+; CHECK-LABEL: @test_Inf_4(
+; CHECK: ret double 0xFFF0000000000000
+

diff --git a/test/Transforms/InstSimplify/gep.ll b/test/Transforms/InstSimplify/gep.ll
new file mode 100644
index 0000000..49a97f1
--- /dev/null
+++ b/test/Transforms/InstSimplify/gep.ll

@@ -0,0 +1,80 @@
+; RUN: opt -S -instsimplify < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.A = type { [7 x i8] }
+
+define %struct.A* @test1(%struct.A* %b, %struct.A* %e) {
+  %e_ptr = ptrtoint %struct.A* %e to i64
+  %b_ptr = ptrtoint %struct.A* %b to i64
+  %sub = sub i64 %e_ptr, %b_ptr
+  %sdiv = sdiv exact i64 %sub, 7
+  %gep = getelementptr inbounds %struct.A* %b, i64 %sdiv
+  ret %struct.A* %gep
+; CHECK-LABEL: @test1
+; CHECK-NEXT: ret %struct.A* %e
+}
+
+define i8* @test2(i8* %b, i8* %e) {
+  %e_ptr = ptrtoint i8* %e to i64
+  %b_ptr = ptrtoint i8* %b to i64
+  %sub = sub i64 %e_ptr, %b_ptr
+  %gep = getelementptr inbounds i8* %b, i64 %sub
+  ret i8* %gep
+; CHECK-LABEL: @test2
+; CHECK-NEXT: ret i8* %e
+}
+
+define i64* @test3(i64* %b, i64* %e) {
+  %e_ptr = ptrtoint i64* %e to i64
+  %b_ptr = ptrtoint i64* %b to i64
+  %sub = sub i64 %e_ptr, %b_ptr
+  %ashr = ashr exact i64 %sub, 3
+  %gep = getelementptr inbounds i64* %b, i64 %ashr
+  ret i64* %gep
+; CHECK-LABEL: @test3
+; CHECK-NEXT: ret i64* %e
+}
+
+define %struct.A* @test4(%struct.A* %b) {
+  %b_ptr = ptrtoint %struct.A* %b to i64
+  %sub = sub i64 0, %b_ptr
+  %sdiv = sdiv exact i64 %sub, 7
+  %gep = getelementptr inbounds %struct.A* %b, i64 %sdiv
+  ret %struct.A* %gep
+; CHECK-LABEL: @test4
+; CHECK-NEXT: ret %struct.A* null
+}
+
+define i8* @test5(i8* %b) {
+  %b_ptr = ptrtoint i8* %b to i64
+  %sub = sub i64 0, %b_ptr
+  %gep = getelementptr inbounds i8* %b, i64 %sub
+  ret i8* %gep
+; CHECK-LABEL: @test5
+; CHECK-NEXT: ret i8* null
+}
+
+define i64* @test6(i64* %b) {
+  %b_ptr = ptrtoint i64* %b to i64
+  %sub = sub i64 0, %b_ptr
+  %ashr = ashr exact i64 %sub, 3
+  %gep = getelementptr inbounds i64* %b, i64 %ashr
+  ret i64* %gep
+; CHECK-LABEL: @test6
+; CHECK-NEXT: ret i64* null
+}
+
+define i8* @test7(i8* %b, i8** %e) {
+  %e_ptr = ptrtoint i8** %e to i64
+  %b_ptr = ptrtoint i8* %b to i64
+  %sub = sub i64 %e_ptr, %b_ptr
+  %gep = getelementptr inbounds i8* %b, i64 %sub
+  ret i8* %gep
+; CHECK-LABEL: @test7
+; CHECK-NEXT: ptrtoint
+; CHECK-NEXT: ptrtoint
+; CHECK-NEXT: sub
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: ret
+}

diff --git a/test/Transforms/InstSimplify/rem.ll b/test/Transforms/InstSimplify/rem.ll
index 80fa8e7..f5ea451 100644
--- a/test/Transforms/InstSimplify/rem.ll
+++ b/test/Transforms/InstSimplify/rem.ll

@@ -15,3 +15,31 @@
   ret i32 %rem
 ; CHECK: ret i32 0
 }
+
+define i32 @rem1(i32 %x, i32 %n) {
+; CHECK-LABEL: @rem1(
+; CHECK-NEXT: %mod = srem i32 %x, %n
+; CHECK-NEXT: ret i32 %mod
+ %mod = srem i32 %x, %n
+ %mod1 = srem i32 %mod, %n
+ ret i32 %mod1
+}
+
+define i32 @rem2(i32 %x, i32 %n) {
+; CHECK-LABEL: @rem2(
+; CHECK-NEXT: %mod = urem i32 %x, %n
+; CHECK-NEXT: ret i32 %mod
+ %mod = urem i32 %x, %n
+ %mod1 = urem i32 %mod, %n
+ ret i32 %mod1
+}
+
+define i32 @rem3(i32 %x, i32 %n) {
+; CHECK-LABEL: @rem3(
+; CHECK-NEXT: %[[srem:.*]] = srem i32 %x, %n
+; CHECK-NEXT: %[[urem:.*]] = urem i32 %[[srem]], %n
+; CHECK-NEXT: ret i32 %[[urem]]
+ %mod = srem i32 %x, %n
+ %mod1 = urem i32 %mod, %n
+ ret i32 %mod1
+}

diff --git a/test/Transforms/InstSimplify/shr-nop.ll b/test/Transforms/InstSimplify/shr-nop.ll
new file mode 100644
index 0000000..b0dc873
--- /dev/null
+++ b/test/Transforms/InstSimplify/shr-nop.ll

@@ -0,0 +1,346 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; CHECK-LABEL: @foo
+; CHECK:      %[[and:.*]] = and i32 %x, 1
+; CHECK-NEXT: %[[add:.*]] = add i32 %[[and]], -1
+; CHECK-NEXT: ret i32 %[[add]]
+define i32 @foo(i32 %x) {
+ %o = and i32 %x, 1
+ %n = add i32 %o, -1
+ %t = ashr i32 %n, 17
+ ret i32 %t
+}
+
+; CHECK-LABEL: @exact_lshr_eq_both_zero
+; CHECK-NEXT: ret i1 true
+define i1 @exact_lshr_eq_both_zero(i8 %a) {
+ %shr = lshr exact i8 0, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq_both_zero
+; CHECK-NEXT: ret i1 true
+define i1 @exact_ashr_eq_both_zero(i8 %a) {
+ %shr = ashr exact i8 0, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_eq_both_zero
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_ashr_eq_both_zero(i8 %a) {
+ %shr = ashr i8 0, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_ne_both_zero
+; CHECK-NEXT: ret i1 false
+define i1 @exact_lshr_ne_both_zero(i8 %a) {
+ %shr = lshr exact i8 0, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne_both_zero
+; CHECK-NEXT: ret i1 false
+define i1 @exact_ashr_ne_both_zero(i8 %a) {
+ %shr = ashr exact i8 0, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_ne_both_zero
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_lshr_ne_both_zero(i8 %a) {
+ %shr = lshr i8 0, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_ne_both_zero
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_ashr_ne_both_zero(i8 %a) {
+ %shr = ashr i8 0, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_eq_last_zero
+; CHECK-NEXT: ret i1 false
+define i1 @exact_lshr_eq_last_zero(i8 %a) {
+ %shr = lshr exact i8 128, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq_last_zero
+; CHECK-NEXT: ret i1 false
+define i1 @exact_ashr_eq_last_zero(i8 %a) {
+ %shr = ashr exact i8 -128, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_eq_both_zero
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_lshr_eq_both_zero(i8 %a) {
+ %shr = lshr i8 0, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_ne_last_zero
+; CHECK-NEXT: ret i1 true
+define i1 @exact_lshr_ne_last_zero(i8 %a) {
+ %shr = lshr exact i8 128, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne_last_zero
+; CHECK-NEXT: ret i1 true
+define i1 @exact_ashr_ne_last_zero(i8 %a) {
+ %shr = ashr exact i8 -128, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_eq_last_zero
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_lshr_eq_last_zero(i8 %a) {
+ %shr = lshr i8 128, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_eq_last_zero
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_ashr_eq_last_zero(i8 %a) {
+ %shr = ashr i8 -128, %a
+ %cmp = icmp eq i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_ne_last_zero
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_lshr_ne_last_zero(i8 %a) {
+ %shr = lshr i8 128, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_ne_last_zero
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_ashr_ne_last_zero(i8 %a) {
+ %shr = ashr i8 -128, %a
+ %cmp = icmp ne i8 %shr, 0
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @lshr_eq_first_zero
+; CHECK-NEXT: ret i1 false
+define i1 @lshr_eq_first_zero(i8 %a) {
+ %shr = lshr i8 0, %a
+ %cmp = icmp eq i8 %shr, 2
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_eq_first_zero
+; CHECK-NEXT: ret i1 false
+define i1 @ashr_eq_first_zero(i8 %a) {
+ %shr = ashr i8 0, %a
+ %cmp = icmp eq i8 %shr, 2
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @lshr_ne_first_zero
+; CHECK-NEXT: ret i1 true
+define i1 @lshr_ne_first_zero(i8 %a) {
+ %shr = lshr i8 0, %a
+ %cmp = icmp ne i8 %shr, 2
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_ne_first_zero
+; CHECK-NEXT: ret i1 true
+define i1 @ashr_ne_first_zero(i8 %a) {
+ %shr = ashr i8 0, %a
+ %cmp = icmp ne i8 %shr, 2
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_eq_both_minus1
+; CHECK-NEXT: ret i1 true
+define i1 @ashr_eq_both_minus1(i8 %a) {
+ %shr = ashr i8 -1, %a
+ %cmp = icmp eq i8 %shr, -1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_ne_both_minus1
+; CHECK-NEXT: ret i1 false
+define i1 @ashr_ne_both_minus1(i8 %a) {
+ %shr = ashr i8 -1, %a
+ %cmp = icmp ne i8 %shr, -1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq_both_minus1
+; CHECK-NEXT: ret i1 true
+define i1 @exact_ashr_eq_both_minus1(i8 %a) {
+ %shr = ashr exact i8 -1, %a
+ %cmp = icmp eq i8 %shr, -1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne_both_minus1
+; CHECK-NEXT: ret i1 false
+define i1 @exact_ashr_ne_both_minus1(i8 %a) {
+ %shr = ashr exact i8 -1, %a
+ %cmp = icmp ne i8 %shr, -1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq_opposite_msb
+; CHECK-NEXT: ret i1 false
+define i1 @exact_ashr_eq_opposite_msb(i8 %a) {
+ %shr = ashr exact i8 -128, %a
+ %cmp = icmp eq i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq_noexactlog
+; CHECK-NEXT: ret i1 false
+define i1 @exact_ashr_eq_noexactlog(i8 %a) {
+ %shr = ashr exact i8 -90, %a
+ %cmp = icmp eq i8 %shr, -30
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne_opposite_msb
+; CHECK-NEXT: ret i1 true
+define i1 @exact_ashr_ne_opposite_msb(i8 %a) {
+ %shr = ashr exact i8 -128, %a
+ %cmp = icmp ne i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_eq_opposite_msb
+; CHECK-NEXT: ret i1 false
+define i1 @ashr_eq_opposite_msb(i8 %a) {
+ %shr = ashr i8 -128, %a
+ %cmp = icmp eq i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @ashr_ne_opposite_msb
+; CHECK-NEXT: ret i1 true
+define i1 @ashr_ne_opposite_msb(i8 %a) {
+ %shr = ashr i8 -128, %a
+ %cmp = icmp ne i8 %shr, 1
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_eq_shift_gt
+; CHECK-NEXT: ret i1 false
+define i1 @exact_ashr_eq_shift_gt(i8 %a) {
+ %shr = ashr exact i8 -2, %a
+ %cmp = icmp eq i8 %shr, -8
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne_shift_gt
+; CHECK-NEXT: ret i1 true
+define i1 @exact_ashr_ne_shift_gt(i8 %a) {
+ %shr = ashr exact i8 -2, %a
+ %cmp = icmp ne i8 %shr, -8
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_eq_shift_gt
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_ashr_eq_shift_gt(i8 %a) {
+ %shr = ashr i8 -2, %a
+ %cmp = icmp eq i8 %shr, -8
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_ashr_ne_shift_gt
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_ashr_ne_shift_gt(i8 %a) {
+ %shr = ashr i8 -2, %a
+ %cmp = icmp ne i8 %shr, -8
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_eq_shift_gt
+; CHECK-NEXT: ret i1 false
+define i1 @exact_lshr_eq_shift_gt(i8 %a) {
+ %shr = lshr exact i8 2, %a
+ %cmp = icmp eq i8 %shr, 8
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_ne_shift_gt
+; CHECK-NEXT: ret i1 true
+define i1 @exact_lshr_ne_shift_gt(i8 %a) {
+ %shr = lshr exact i8 2, %a
+ %cmp = icmp ne i8 %shr, 8
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_eq_shift_gt
+; CHECK-NEXT: ret i1 false
+define i1 @nonexact_lshr_eq_shift_gt(i8 %a) {
+ %shr = lshr i8 2, %a
+ %cmp = icmp eq i8 %shr, 8
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @nonexact_lshr_ne_shift_gt
+; CHECK-NEXT: ret i1 true
+define i1 @nonexact_lshr_ne_shift_gt(i8 %a) {
+ %shr = lshr i8 2, %a       ; logical shift right of the constant 2, matching the test name
+ %cmp = icmp ne i8 %shr, 8  ; a right shift of 2 cannot produce 8, so 'ne' should fold to true
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_ashr_ne_noexactlog
+; CHECK-NEXT: ret i1 true
+define i1 @exact_ashr_ne_noexactlog(i8 %a) {
+ %shr = ashr exact i8 -90, %a
+ %cmp = icmp ne i8 %shr, -30
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_eq_noexactlog
+; CHECK-NEXT: ret i1 false
+define i1 @exact_lshr_eq_noexactlog(i8 %a) {
+ %shr = lshr exact i8 90, %a
+ %cmp = icmp eq i8 %shr, 30
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_ne_noexactlog
+; CHECK-NEXT: ret i1 true
+define i1 @exact_lshr_ne_noexactlog(i8 %a) {
+ %shr = lshr exact i8 90, %a
+ %cmp = icmp ne i8 %shr, 30
+ ret i1 %cmp
+}
+
+; CHECK-LABEL: @exact_lshr_lowbit
+; CHECK-NEXT: ret i32 7
+define i32 @exact_lshr_lowbit(i32 %shiftval) {
+  %shr = lshr exact i32 7, %shiftval
+  ret i32 %shr
+}
+
+; CHECK-LABEL: @exact_ashr_lowbit
+; CHECK-NEXT: ret i32 7
+define i32 @exact_ashr_lowbit(i32 %shiftval) {
+  %shr = ashr exact i32 7, %shiftval
+  ret i32 %shr
+}

diff --git a/test/Transforms/InstSimplify/vector_ptr_bitcast.ll b/test/Transforms/InstSimplify/vector_ptr_bitcast.ll
new file mode 100644
index 0000000..607892a
--- /dev/null
+++ b/test/Transforms/InstSimplify/vector_ptr_bitcast.ll

@@ -0,0 +1,35 @@
+; RUN: opt -S -instsimplify < %s | FileCheck %s
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+
+%mst = type { i8*, i8* }
+%mst2 = type { i32*, i32*, i32*, i32* }
+
+@a = private unnamed_addr constant %mst { i8* inttoptr (i64 -1 to i8*),
+                                          i8* inttoptr (i64 -1 to i8*)},
+                                          align 8
+@b = private unnamed_addr constant %mst2 { i32* inttoptr (i64 42 to i32*),
+                                           i32* inttoptr (i64 67 to i32*),
+                                           i32* inttoptr (i64 33 to i32*),
+                                           i32* inttoptr (i64 58 to i32*)},
+                                          align 8
+
+define i64 @fn() {
+  %x = load <2 x i8*>* bitcast (%mst* @a to <2 x i8*>*), align 8
+  %b = extractelement <2 x i8*> %x, i32 0
+  %c = ptrtoint i8* %b to i64
+  ; CHECK-LABEL: @fn
+  ; CHECK-NEXT: ret i64 -1
+  ret i64 %c
+}
+
+define i64 @fn2() {
+  %x = load <4 x i32*>* bitcast (%mst2* @b to <4 x i32*>*), align 8
+  %b = extractelement <4 x i32*> %x, i32 0
+  %c = extractelement <4 x i32*> %x, i32 3
+  %d = ptrtoint i32* %b to i64
+  %e = ptrtoint i32* %c to i64
+  %r = add i64 %d, %e
+  ; CHECK-LABEL: @fn2
+  ; CHECK-NEXT: ret i64 100
+  ret i64 %r
+}

diff --git a/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll b/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll
index 16bfe2a..1652388 100644
--- a/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll
+++ b/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll

@@ -4,10 +4,10 @@
 ; CHECK: @A = internal global i32 0
 
 @B = alias i32* @A
-; CHECK: @B = alias internal i32* @A
+; CHECK: @B = internal alias i32* @A
 
 @C = alias i32* @A
-; CHECK: @C = alias internal i32* @A
+; CHECK: @C = internal alias i32* @A
 
 define i32 @main() {
 	%tmp = load i32* @C

diff --git a/test/Transforms/Internalize/local-visibility.ll b/test/Transforms/Internalize/local-visibility.ll
index c24d4b7..b09a136 100644
--- a/test/Transforms/Internalize/local-visibility.ll
+++ b/test/Transforms/Internalize/local-visibility.ll

@@ -10,9 +10,9 @@
 ; CHECK: @protected.variable = internal global i32 0
 @protected.variable = protected global i32 0
 
-; CHECK: @hidden.alias = alias internal i32* @global
+; CHECK: @hidden.alias = internal alias i32* @global
 @hidden.alias = hidden alias i32* @global
-; CHECK: @protected.alias = alias internal i32* @global
+; CHECK: @protected.alias = internal alias i32* @global
 @protected.alias = protected alias i32* @global
 
 ; CHECK: define internal void @hidden.function() {

diff --git a/test/Transforms/JumpThreading/assume-edge-dom.ll b/test/Transforms/JumpThreading/assume-edge-dom.ll
new file mode 100644
index 0000000..f1d0f41
--- /dev/null
+++ b/test/Transforms/JumpThreading/assume-edge-dom.ll

@@ -0,0 +1,39 @@
+; RUN: opt -S -jump-threading < %s | FileCheck %s
+
+declare i8* @escape()
+declare void @llvm.assume(i1)
+
+define i1 @test1(i1 %cond) {
+entry:
+    br i1 %cond, label %taken, label %not_taken
+
+; CHECK-LABEL: @test1
+; CHECK: br i1 %cond, label %no, label %yes
+; CHECK: ret i1 true
+
+taken:
+    %res1 = call i8* @escape()
+    %a = icmp eq i8* %res1, null
+    tail call void @llvm.assume(i1 %a)
+    br label %done
+not_taken:
+    %res2 = call i8* @escape()
+    %b = icmp ne i8* %res2, null
+    tail call void @llvm.assume(i1 %b)
+    br label %done
+
+; An assume that can be used to simplify this comparison dominates each
+; predecessor branch (although no assume dominates the cmp itself). Make sure
+; this still can be simplified.
+
+done:
+    %res = phi i8* [ %res1, %taken ], [ %res2, %not_taken ]
+    %cnd = icmp ne i8* %res, null
+    br i1 %cnd, label %yes, label %no
+
+yes:
+    ret i1 true
+no:
+    ret i1 false
+}
+

diff --git a/test/Transforms/JumpThreading/assume.ll b/test/Transforms/JumpThreading/assume.ll
new file mode 100644
index 0000000..89dd0a9
--- /dev/null
+++ b/test/Transforms/JumpThreading/assume.ll

@@ -0,0 +1,68 @@
+; RUN: opt -S -jump-threading -dce < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define i32 @test1(i32 %a, i32 %b) #0 {
+entry:
+  %cmp = icmp sgt i32 %a, 5
+  tail call void @llvm.assume(i1 %cmp)
+  %cmp1 = icmp sgt i32 %b, 1234
+  br i1 %cmp1, label %if.then, label %if.else
+
+; CHECK-LABEL: @test1
+; CHECK: icmp sgt i32 %a, 5
+; CHECK: call void @llvm.assume
+; CHECK-NOT: icmp sgt i32 %a, 3
+; CHECK: ret i32
+
+if.then:                                          ; preds = %entry
+  %cmp2 = icmp sgt i32 %a, 3
+  br i1 %cmp2, label %if.then3, label %return
+
+if.then3:                                         ; preds = %if.then
+  tail call void (...)* @bar() #1
+  br label %return
+
+if.else:                                          ; preds = %entry
+  tail call void (...)* @car() #1
+  br label %return
+
+return:                                           ; preds = %if.else, %if.then, %if.then3
+  %retval.0 = phi i32 [ 1, %if.then3 ], [ 0, %if.then ], [ 0, %if.else ]
+  ret i32 %retval.0
+}
+
+define i32 @test2(i32 %a) #0 {
+entry:
+  %cmp = icmp sgt i32 %a, 5
+  tail call void @llvm.assume(i1 %cmp)
+  %cmp1 = icmp sgt i32 %a, 3
+  br i1 %cmp1, label %if.then, label %return
+
+; CHECK-LABEL: @test2
+; CHECK: icmp sgt i32 %a, 5
+; CHECK: tail call void @llvm.assume
+; CHECK: tail call void (...)* @bar()
+; CHECK: ret i32 1
+
+
+if.then:                                          ; preds = %entry
+  tail call void (...)* @bar() #1
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ 1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #1
+
+declare void @bar(...)
+
+declare void @car(...)
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+

diff --git a/test/Transforms/JumpThreading/thread-loads.ll b/test/Transforms/JumpThreading/thread-loads.ll
index e5bf64b..b13b767 100644
--- a/test/Transforms/JumpThreading/thread-loads.ll
+++ b/test/Transforms/JumpThreading/thread-loads.ll

@@ -75,6 +75,37 @@
 	ret i32 %res.0
 }
 
+define i32 @test3(i8** %x, i1 %f) {
+; Correctly thread loads of different (but compatible) types, placing bitcasts
+; as necessary in the predecessors. This is especially tricky because the same
+; predecessor ends up with two entries in the PHI node and they must share
+; a single cast.
+; CHECK-LABEL: @test3(
+entry:
+  %0 = bitcast i8** %x to i32**
+  %1 = load i32** %0, align 8
+  br i1 %f, label %if.end57, label %if.then56
+; CHECK: %[[LOAD:.*]] = load i32**
+; CHECK: %[[CAST:.*]] = bitcast i32* %[[LOAD]] to i8*
+
+if.then56:
+  br label %if.end57
+
+if.end57:
+  %2 = load i8** %x, align 8
+  %tobool59 = icmp eq i8* %2, null
+  br i1 %tobool59, label %return, label %if.then60
+; CHECK: %[[PHI:.*]] = phi i8* [ %[[CAST]], %[[PRED:[^ ]+]] ], [ %[[CAST]], %[[PRED]] ]
+; CHECK-NEXT: %[[CMP:.*]] = icmp eq i8* %[[PHI]], null
+; CHECK-NEXT: br i1 %[[CMP]]
+
+if.then60:
+  ret i32 42
+
+return:
+  ret i32 13
+}
+
 !0 = metadata !{metadata !3, metadata !3, i64 0}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}

diff --git a/test/Transforms/LICM/2014-09-10-doFinalizationAssert.ll b/test/Transforms/LICM/2014-09-10-doFinalizationAssert.ll
new file mode 100644
index 0000000..17ae716
--- /dev/null
+++ b/test/Transforms/LICM/2014-09-10-doFinalizationAssert.ll

@@ -0,0 +1,30 @@
+; RUN: opt < %s -scalar-evolution -licm -loop-unroll -disable-output
+; This test used to trigger an assertion in doFinalization(): loop unroll
+; deleted the inner loop, so that loop was never removed from the
+; LoopToAliasSetMap.
+; Test case taken from test/Transforms/LoopUnroll/unloop.ll.
+
+declare i1 @check() nounwind
+define void @skiplevelexit() nounwind {
+entry:
+  br label %outer
+
+outer:
+  br label %inner
+
+inner:
+  %iv = phi i32 [ 0, %outer ], [ %inc, %tail ]
+  %inc = add i32 %iv, 1
+  call zeroext i1 @check()
+  br i1 true, label %outer.backedge, label %tail
+
+tail:
+  br i1 false, label %inner, label %exit
+
+outer.backedge:
+  br label %outer
+
+exit:
+  ret void
+}
+

diff --git a/test/Transforms/LICM/PR19798.ll b/test/Transforms/LICM/PR19798.ll
new file mode 100644
index 0000000..82befb0
--- /dev/null
+++ b/test/Transforms/LICM/PR19798.ll

@@ -0,0 +1,22 @@
+; RUN: opt -licm -S < %s | FileCheck %s
+
+define void @f() {
+; CHECK-LABEL: @f(
+entry:
+  br label %bb0
+
+bb0:
+  %tobool7 = icmp eq i1 undef, undef
+  br label %bb1
+
+bb1:
+  br i1 undef, label %bb0, label %bb0
+
+unreachable:
+; CHECK-LABEL: unreachable:
+; CHECK:   br i1 undef, label %unreachable, label %unreachable
+  br i1 %tobool7, label %unreachable, label %unreachable
+
+bb3:
+  unreachable
+}

diff --git a/test/Transforms/LICM/PR21582.ll b/test/Transforms/LICM/PR21582.ll
new file mode 100644
index 0000000..c068c2f
--- /dev/null
+++ b/test/Transforms/LICM/PR21582.ll

@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -licm -S | FileCheck %s
+@b = external global i32, align 4
+@fn3.i = external global i32, align 4
+
+declare i32 @g() nounwind
+
+define i32 @f() {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.end, %entry
+; CHECK-LABEL: for.cond:
+; CHECK: store i32 0, i32* @b
+  store i32 0, i32* @b, align 4
+  br i1 true, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %for.cond
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %g.15 = phi i32 [ undef, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx2 = getelementptr inbounds i32* @fn3.i, i64 0
+  %0 = load i32* %arrayidx2, align 4
+  %call = call i32 @g()
+  br i1 false, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %for.cond
+  %whatever = phi i32 [ %call, %for.end.loopexit ], [ undef, %for.cond ]
+  br i1 false, label %for.cond, label %if.then
+
+if.then:                                          ; preds = %for.end
+; CHECK-LABEL: if.then:
+; CHECK: phi i32 [ {{.*}}, %for.end ]
+; CHECK-NOT: store i32 0, i32* @b
+; CHECK: ret i32
+  ret i32 %whatever
+}

diff --git a/test/Transforms/LICM/debug-value.ll b/test/Transforms/LICM/debug-value.ll
index e5c774f..0e0cd39 100644
--- a/test/Transforms/LICM/debug-value.ll
+++ b/test/Transforms/LICM/debug-value.ll

@@ -15,7 +15,7 @@
 
 if.then27:                                        ; preds = %if.then
 ; CHECK: tail call void @llvm.dbg.value
-  tail call void @llvm.dbg.value(metadata !18, i64 0, metadata !19), !dbg !21
+  tail call void @llvm.dbg.value(metadata !18, i64 0, metadata !19, metadata !{}), !dbg !21
   br label %for.body61.us
 
 if.end.if.end.split_crit_edge.critedge:           ; preds = %if.then
@@ -31,35 +31,35 @@
   ret void, !dbg !24
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.module.flags = !{!26}
 !llvm.dbg.sp = !{!0, !6, !9, !10}
 
-!0 = metadata !{i32 589870, metadata !25, metadata !1, metadata !"idamax", metadata !"idamax", metadata !"", i32 112, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 589865, metadata !25} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 589841, metadata !25, i32 12, metadata !"clang version 2.9 (trunk 127169)", i1 true, metadata !"", i32 0, metadata !8, metadata !8, metadata !8, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !25, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!0 = metadata !{metadata !"0x2e\00idamax\00idamax\00\00112\000\001\000\006\00256\000\000", metadata !25, metadata !1, metadata !3, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !25} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 127169)\001\00\000\00\000", metadata !25, metadata !8, metadata !8, metadata !8, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !25, metadata !1, null, metadata !4, i32 0} ; [ DW_TAG_subroutine_type ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 589860, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 589870, metadata !25, metadata !1, metadata !"dscal", metadata !"dscal", metadata !"", i32 206, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 589845, metadata !25, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x2e\00dscal\00dscal\00\00206\000\001\000\006\00256\000\000", metadata !25, metadata !1, metadata !7, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !25, metadata !1, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
-!9 = metadata !{i32 589870, metadata !25, metadata !1, metadata !"daxpy", metadata !"daxpy", metadata !"", i32 230, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 589870, metadata !25, metadata !1, metadata !"dgefa", metadata !"dgefa", metadata !"", i32 267, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 267] [def] [scope 0] [dgefa]
+!9 = metadata !{metadata !"0x2e\00daxpy\00daxpy\00\00230\000\001\000\006\00256\000\000", metadata !25, metadata !1, metadata !7, i32 0, null, null, null, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{metadata !"0x2e\00dgefa\00dgefa\00\00267\000\001\000\006\00256\000\000", metadata !25, metadata !1, metadata !7, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 267] [def] [scope 0] [dgefa]
 !11 = metadata !{i32 281, i32 9, metadata !12, null}
-!12 = metadata !{i32 589835, metadata !25, metadata !13, i32 272, i32 5, i32 32} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{i32 589835, metadata !25, metadata !14, i32 271, i32 5, i32 31} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{i32 589835, metadata !25, metadata !10, i32 267, i32 1, i32 30} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{metadata !"0xb\00272\005\0032", metadata !25, metadata !13} ; [ DW_TAG_lexical_block ]
+!13 = metadata !{metadata !"0xb\00271\005\0031", metadata !25, metadata !14} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\00267\001\0030", metadata !25, metadata !10} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 271, i32 5, metadata !14, null}
 !16 = metadata !{i32 284, i32 10, metadata !17, null}
-!17 = metadata !{i32 589835, metadata !25, metadata !12, i32 282, i32 9, i32 33} ; [ DW_TAG_lexical_block ]
+!17 = metadata !{metadata !"0xb\00282\009\0033", metadata !25, metadata !12} ; [ DW_TAG_lexical_block ]
 !18 = metadata !{double undef}
-!19 = metadata !{i32 590080, metadata !14, metadata !"temp", metadata !1, i32 268, metadata !20, i32 0} ; [ DW_TAG_auto_variable ]
-!20 = metadata !{i32 589860, null, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!19 = metadata !{metadata !"0x100\00temp\00268\000", metadata !14, metadata !1, metadata !20} ; [ DW_TAG_auto_variable ]
+!20 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
 !21 = metadata !{i32 286, i32 14, metadata !22, null}
-!22 = metadata !{i32 589835, metadata !25, metadata !17, i32 285, i32 13, i32 34} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !"0xb\00285\0013\0034", metadata !25, metadata !17} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{i32 296, i32 13, metadata !17, null}
 !24 = metadata !{i32 313, i32 1, metadata !14, null}
 !25 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/Benchmarks/CoyoteBench/lpbench.c", metadata !"/private/tmp"}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/LICM/hoist-bitcast-load.ll b/test/Transforms/LICM/hoist-bitcast-load.ll
index 639dca5..fa61eaf 100644
--- a/test/Transforms/LICM/hoist-bitcast-load.ll
+++ b/test/Transforms/LICM/hoist-bitcast-load.ll

@@ -78,6 +78,44 @@
   ret void
 }
 
+; Make sure the basic alloca pointer hoisting works through an addrspacecast
+; CHECK-LABEL: @test2_addrspacecast
+; CHECK: load i32 addrspace(1)* %c, align 4
+; CHECK: for.body:
+
+; Function Attrs: nounwind uwtable
+define void @test2_addrspacecast(i32 addrspace(1)* nocapture %a, i32 addrspace(1)* nocapture readonly %b, i32 %n) #0 {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  %ca = alloca i64
+  %c = addrspacecast i64* %ca to i32 addrspace(1)*
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %a, i64 %indvars.iv
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32 addrspace(1)* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32 addrspace(1)* %b, i64 %indvars.iv
+  %2 = load i32 addrspace(1)* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32 addrspace(1)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
 ; Make sure the basic alloca pointer hoisting works through a bitcast to a
 ; pointer to a smaller type (where the bitcast also needs to be hoisted):
 ; CHECK-LABEL: @test3

diff --git a/test/Transforms/LICM/hoist-deref-load.ll b/test/Transforms/LICM/hoist-deref-load.ll
new file mode 100644
index 0000000..c230d1d
--- /dev/null
+++ b/test/Transforms/LICM/hoist-deref-load.ll

@@ -0,0 +1,168 @@
+; RUN: opt -S -basicaa -licm < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test represents the following function:
+; void test1(int * __restrict__ a, int * __restrict__ b, int &c, int n) {
+;   for (int i = 0; i < n; ++i)
+;     if (a[i] > 0)
+;       a[i] = c*b[i];
+; }
+; and we want to hoist the load of %c out of the loop. This can be done only
+; because the dereferenceable attribute is on %c.
+
+; CHECK-LABEL: @test1
+; CHECK: load i32* %c, align 4
+; CHECK: for.body:
+
+define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* nocapture readonly nonnull dereferenceable(4) %c, i32 %n) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i64 %indvars.iv
+  %2 = load i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; This is the same as @test1, but without the dereferenceable attribute on %c.
+; Without this attribute, we should not hoist the load of %c.
+
+; CHECK-LABEL: @test2
+; CHECK: if.then:
+; CHECK: load i32* %c, align 4
+
+define void @test2(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* nocapture readonly nonnull %c, i32 %n) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %1 = load i32* %c, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i64 %indvars.iv
+  %2 = load i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; This test represents the following function:
+; void test3(int * restrict a, int * restrict b, int c[static 3], int n) {
+;   for (int i = 0; i < n; ++i)
+;     if (a[i] > 0)
+;       a[i] = c[2]*b[i];
+; }
+; and we want to hoist the load of c[2] out of the loop. This can be done only
+; because %c is marked dereferenceable(12), which covers c[2] (bytes 8-11).
+
+; CHECK-LABEL: @test3
+; CHECK: load i32* %c2, align 4
+; CHECK: for.body:
+
+define void @test3(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* nocapture readonly dereferenceable(12) %c, i32 %n) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %c2 = getelementptr inbounds i32* %c, i64 2
+  %1 = load i32* %c2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i64 %indvars.iv
+  %2 = load i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+; This is the same as @test3, but with a dereferenceable attribute on %c with a
+; size too small to cover c[2] (and so we should not hoist it).
+
+; CHECK-LABEL: @test4
+; CHECK: if.then:
+; CHECK: load i32* %c2, align 4
+
+define void @test4(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* nocapture readonly dereferenceable(11) %c, i32 %n) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.inc
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %c2 = getelementptr inbounds i32* %c, i64 2
+  %1 = load i32* %c2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i64 %indvars.iv
+  %2 = load i32* %arrayidx3, align 4
+  %mul = mul nsw i32 %2, %1
+  store i32 %mul, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+

diff --git a/test/Transforms/LICM/speculate.ll b/test/Transforms/LICM/speculate.ll
index 4244f15..6926669 100644
--- a/test/Transforms/LICM/speculate.ll
+++ b/test/Transforms/LICM/speculate.ll

@@ -3,12 +3,11 @@
 ; UDiv is safe to speculate if the denominator is known non-zero.
 
 ; CHECK-LABEL: @safe_udiv(
-; CHECK:      %div = udiv i64 %x, %or
+; CHECK:      %div = udiv i64 %x, 2
 ; CHECK-NEXT: br label %for.body
 
 define void @safe_udiv(i64 %x, i64 %m, i64 %n, i32* %p, i64* %q) nounwind {
 entry:
-  %or = or i64 %m, 1
   br label %for.body
 
 for.body:                                         ; preds = %entry, %for.inc
@@ -19,7 +18,7 @@
   br i1 %tobool, label %for.inc, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %div = udiv i64 %x, %or
+  %div = udiv i64 %x, 2
   %arrayidx1 = getelementptr inbounds i64* %q, i64 %i.02
   store i64 %div, i64* %arrayidx1, align 8
   br label %for.inc
@@ -69,13 +68,12 @@
 ; known to have at least one zero bit.
 
 ; CHECK-LABEL: @safe_sdiv(
-; CHECK:      %div = sdiv i64 %x, %or
+; CHECK:      %div = sdiv i64 %x, 2
 ; CHECK-NEXT: br label %for.body
 
 define void @safe_sdiv(i64 %x, i64 %m, i64 %n, i32* %p, i64* %q) nounwind {
 entry:
   %and = and i64 %m, -3
-  %or = or i64 %and, 1
   br label %for.body
 
 for.body:                                         ; preds = %entry, %for.inc
@@ -86,7 +84,7 @@
   br i1 %tobool, label %for.inc, label %if.then
 
 if.then:                                          ; preds = %for.body
-  %div = sdiv i64 %x, %or
+  %div = sdiv i64 %x, 2
   %arrayidx1 = getelementptr inbounds i64* %q, i64 %i.02
   store i64 %div, i64* %arrayidx1, align 8
   br label %for.inc

diff --git a/test/Transforms/LoadCombine/load-combine-aa.ll b/test/Transforms/LoadCombine/load-combine-aa.ll
new file mode 100644
index 0000000..3542dce
--- /dev/null
+++ b/test/Transforms/LoadCombine/load-combine-aa.ll

@@ -0,0 +1,39 @@
+; RUN: opt -basicaa -load-combine -instcombine -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i64 @test1(i32* nocapture readonly noalias %a, i32* nocapture readonly noalias %b) {
+; CHECK-LABEL: @test1
+
+; CHECK: load i64*
+; CHECK: ret i64
+
+  %load1 = load i32* %a, align 4
+  %conv = zext i32 %load1 to i64
+  %arrayidx1 = getelementptr inbounds i32* %a, i64 1
+  store i32 %load1, i32* %b, align 4
+  %load2 = load i32* %arrayidx1, align 4
+  %conv2 = zext i32 %load2 to i64
+  %shl = shl nuw i64 %conv2, 32
+  %add = or i64 %shl, %conv
+  ret i64 %add
+}
+
+define i64 @test2(i32* nocapture readonly %a, i32* nocapture readonly %b) {
+; CHECK-LABEL: @test2
+
+; CHECK: load i32*
+; CHECK: load i32*
+; CHECK: ret i64
+
+  %load1 = load i32* %a, align 4
+  %conv = zext i32 %load1 to i64
+  %arrayidx1 = getelementptr inbounds i32* %a, i64 1
+  store i32 %load1, i32* %b, align 4
+  %load2 = load i32* %arrayidx1, align 4
+  %conv2 = zext i32 %load2 to i64
+  %shl = shl nuw i64 %conv2, 32
+  %add = or i64 %shl, %conv
+  ret i64 %add
+}
+

diff --git a/test/Transforms/LoadCombine/load-combine-assume.ll b/test/Transforms/LoadCombine/load-combine-assume.ll
new file mode 100644
index 0000000..94f6300
--- /dev/null
+++ b/test/Transforms/LoadCombine/load-combine-assume.ll

@@ -0,0 +1,44 @@
+; RUN: opt -basicaa -load-combine -instcombine -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.assume(i1) nounwind
+
+; 'load' before the 'call' gets optimized:
+define i64 @test1(i32* nocapture readonly %a, i1 %b) {
+; CHECK-LABEL: @test1
+
+; CHECK-DAG: load i64* %1, align 4
+; CHECK-DAG: tail call void @llvm.assume(i1 %b)
+; CHECK: ret i64
+
+  %load1 = load i32* %a, align 4
+  %conv = zext i32 %load1 to i64
+  %arrayidx1 = getelementptr inbounds i32* %a, i64 1
+  %load2 = load i32* %arrayidx1, align 4
+  tail call void @llvm.assume(i1 %b)
+  %conv2 = zext i32 %load2 to i64
+  %shl = shl nuw i64 %conv2, 32
+  %add = or i64 %shl, %conv
+  ret i64 %add
+}
+
+; 'call' between the two loads: the CHECK lines still expect a combined i64 load:
+define i64 @test2(i32* nocapture readonly %a, i1 %b) {
+; CHECK-LABEL: @test2
+
+; CHECK-DAG: load i64* %1, align 4
+; CHECK-DAG: tail call void @llvm.assume(i1 %b)
+; CHECK: ret i64
+
+  %load1 = load i32* %a, align 4
+  %conv = zext i32 %load1 to i64
+  %arrayidx1 = getelementptr inbounds i32* %a, i64 1
+  tail call void @llvm.assume(i1 %b)
+  %load2 = load i32* %arrayidx1, align 4
+  %conv2 = zext i32 %load2 to i64
+  %shl = shl nuw i64 %conv2, 32
+  %add = or i64 %shl, %conv
+  ret i64 %add
+}
+

diff --git a/test/Transforms/LoopIdiom/R600/lit.local.cfg b/test/Transforms/LoopIdiom/R600/lit.local.cfg
new file mode 100644
index 0000000..4086e8d
--- /dev/null
+++ b/test/Transforms/LoopIdiom/R600/lit.local.cfg

@@ -0,0 +1,3 @@
+if not 'R600' in config.root.targets:
+    config.unsupported = True
+

diff --git a/test/Transforms/LoopIdiom/R600/popcnt.ll b/test/Transforms/LoopIdiom/R600/popcnt.ll
new file mode 100644
index 0000000..e4301bb
--- /dev/null
+++ b/test/Transforms/LoopIdiom/R600/popcnt.ll

@@ -0,0 +1,104 @@
+; RUN: opt -loop-idiom -mtriple=r600-- -mcpu=SI -S < %s | FileCheck %s
+
+; Mostly copied from x86 version.
+
+; To recognize this pattern:
+;int popcount(unsigned long long a) {
+;    int c = 0;
+;    while (a) {
+;        c++;
+;        a &= a - 1;
+;    }
+;    return c;
+;}
+;
+
+; CHECK-LABEL: @popcount_i64
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount_i64(i64 %a) nounwind uwtable readnone ssp {
+entry:
+  %tobool3 = icmp eq i64 %a, 0
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+  %inc = add nsw i32 %c.05, 1
+  %sub = add i64 %a.addr.04, -1
+  %and = and i64 %sub, %a.addr.04
+  %tobool = icmp eq i64 %and, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  ret i32 %c.0.lcssa
+}
+
+; CHECK-LABEL: @popcount_i32
+; CHECK: entry
+; CHECK: llvm.ctpop.i32
+; CHECK: ret
+define i32 @popcount_i32(i32 %a) nounwind uwtable readnone ssp {
+entry:
+  %tobool3 = icmp eq i32 %a, 0
+  br i1 %tobool3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %a.addr.04 = phi i32 [ %and, %while.body ], [ %a, %entry ]
+  %inc = add nsw i32 %c.05, 1
+  %sub = add i32 %a.addr.04, -1
+  %and = and i32 %sub, %a.addr.04
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  ret i32 %c.0.lcssa
+}
+
+; To recognize this pattern:
+;int popcount(unsigned long long a, int mydata1, int mydata2) {
+;    int c = 0;
+;    while (a) {
+;        c++;
+;        a &= a - 1;
+;        mydata1 *= c;
+;        mydata2 *= (int)a;
+;    }
+;    return c + mydata1 + mydata2;
+;}
+
+; CHECK-LABEL: @popcount2
+; CHECK: entry
+; CHECK: llvm.ctpop.i64
+; CHECK: ret
+define i32 @popcount2(i64 %a, i32 %mydata1, i32 %mydata2) nounwind uwtable readnone ssp {
+entry:
+  %tobool9 = icmp eq i64 %a, 0
+  br i1 %tobool9, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.013 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %mydata2.addr.012 = phi i32 [ %mul1, %while.body ], [ %mydata2, %entry ]
+  %mydata1.addr.011 = phi i32 [ %mul, %while.body ], [ %mydata1, %entry ]
+  %a.addr.010 = phi i64 [ %and, %while.body ], [ %a, %entry ]
+  %inc = add nsw i32 %c.013, 1
+  %sub = add i64 %a.addr.010, -1
+  %and = and i64 %sub, %a.addr.010
+  %mul = mul nsw i32 %inc, %mydata1.addr.011
+  %conv = trunc i64 %and to i32
+  %mul1 = mul nsw i32 %conv, %mydata2.addr.012
+  %tobool = icmp eq i64 %and, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  %mydata2.addr.0.lcssa = phi i32 [ %mydata2, %entry ], [ %mul1, %while.body ]
+  %mydata1.addr.0.lcssa = phi i32 [ %mydata1, %entry ], [ %mul, %while.body ]
+  %add = add i32 %mydata2.addr.0.lcssa, %mydata1.addr.0.lcssa
+  %add2 = add i32 %add, %c.0.lcssa
+  ret i32 %add2
+}

diff --git a/test/Transforms/LoopIdiom/debug-line.ll b/test/Transforms/LoopIdiom/debug-line.ll
index ef4a478..ea3c4de 100644
--- a/test/Transforms/LoopIdiom/debug-line.ll
+++ b/test/Transforms/LoopIdiom/debug-line.ll

@@ -5,8 +5,8 @@
 
 define void @foo(double* nocapture %a) nounwind ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{double* %a}, i64 0, metadata !5), !dbg !8
-  tail call void @llvm.dbg.value(metadata !9, i64 0, metadata !10), !dbg !14
+  tail call void @llvm.dbg.value(metadata !{double* %a}, i64 0, metadata !5, metadata !{}), !dbg !8
+  tail call void @llvm.dbg.value(metadata !9, i64 0, metadata !10, metadata !{}), !dbg !14
   br label %for.body
 
 for.body:                                         ; preds = %entry, %for.body
@@ -19,34 +19,34 @@
   br i1 %exitcond, label %for.body, label %for.end, !dbg !14
 
 for.end:                                          ; preds = %for.body
-  tail call void @llvm.dbg.value(metadata !{null}, i64 0, metadata !10), !dbg !16
+  tail call void @llvm.dbg.value(metadata !{null}, i64 0, metadata !10, metadata !{}), !dbg !16
   ret void, !dbg !17
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.module.flags = !{!19}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !18, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (double*)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!1 = metadata !{i32 589865, metadata !18} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 589841, metadata !18, i32 12, metadata !"clang version 2.9 (trunk 127165:127174)", i1 true, metadata !"", i32 0, metadata !9, metadata !9, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !18, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\000\000", metadata !18, metadata !1, metadata !3, null, void (double*)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!1 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 127165:127174)\001\00\000\00\000", metadata !18, metadata !9, metadata !9, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 590081, metadata !0, metadata !"a", metadata !1, i32 16777218, metadata !6, i32 0} ; [ DW_TAG_arg_variable ]
-!6 = metadata !{i32 589839, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 589860, null, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!5 = metadata !{metadata !"0x101\00a\0016777218\000", metadata !0, metadata !1, metadata !6} ; [ DW_TAG_arg_variable ]
+!6 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, metadata !2, metadata !7} ; [ DW_TAG_pointer_type ]
+!7 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, metadata !2} ; [ DW_TAG_base_type ]
 !8 = metadata !{i32 2, i32 18, metadata !0, null}
 !9 = metadata !{i32 0}
-!10 = metadata !{i32 590080, metadata !11, metadata !"i", metadata !1, i32 3, metadata !13, i32 0} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 589835, metadata !18, metadata !12, i32 3, i32 3, i32 1} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 589835, metadata !18, metadata !0, i32 2, i32 21, i32 0} ; [ DW_TAG_lexical_block ]
-!13 = metadata !{i32 589860, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!10 = metadata !{metadata !"0x100\00i\003\000", metadata !11, metadata !1, metadata !13} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\003\003\001", metadata !18, metadata !12} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{metadata !"0xb\002\0021\000", metadata !18, metadata !0} ; [ DW_TAG_lexical_block ]
+!13 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
 !14 = metadata !{i32 3, i32 3, metadata !12, null}
 !15 = metadata !{i32 4, i32 5, metadata !11, null}
 !16 = metadata !{i32 3, i32 29, metadata !11, null}
 !17 = metadata !{i32 5, i32 1, metadata !12, null}
 !18 = metadata !{metadata !"li.c", metadata !"/private/tmp"}
-!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/LoopRotate/dbgvalue.ll b/test/Transforms/LoopRotate/dbgvalue.ll
index 50fc965..4da0776 100644
--- a/test/Transforms/LoopRotate/dbgvalue.ll
+++ b/test/Transforms/LoopRotate/dbgvalue.ll

@@ -1,7 +1,7 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define i32 @tak(i32 %x, i32 %y, i32 %z) nounwind ssp {
 ; CHECK-LABEL: define i32 @tak(
@@ -15,9 +15,9 @@
   %x.tr = phi i32 [ %x, %entry ], [ %call, %if.then ]
   %y.tr = phi i32 [ %y, %entry ], [ %call9, %if.then ]
   %z.tr = phi i32 [ %z, %entry ], [ %call14, %if.then ]
-  tail call void @llvm.dbg.value(metadata !{i32 %x.tr}, i64 0, metadata !6), !dbg !7
-  tail call void @llvm.dbg.value(metadata !{i32 %y.tr}, i64 0, metadata !8), !dbg !9
-  tail call void @llvm.dbg.value(metadata !{i32 %z.tr}, i64 0, metadata !10), !dbg !11
+  tail call void @llvm.dbg.value(metadata !{i32 %x.tr}, i64 0, metadata !6, metadata !{}), !dbg !7
+  tail call void @llvm.dbg.value(metadata !{i32 %y.tr}, i64 0, metadata !8, metadata !{}), !dbg !9
+  tail call void @llvm.dbg.value(metadata !{i32 %z.tr}, i64 0, metadata !10, metadata !{}), !dbg !11
   %cmp = icmp slt i32 %y.tr, %x.tr, !dbg !12
   br i1 %cmp, label %if.then, label %if.end, !dbg !12
 
@@ -46,9 +46,9 @@
 ; CHECK-LABEL: define void @FindFreeHorzSeg(
 ; CHECK: %dec = add
 ; CHECK-NEXT: tail call void @llvm.dbg.value
-; CHECK-NEXT: br i1 %tobool, label %for.cond, label %[[LOOP_EXIT:[^,]*]]
-; CHECK: [[LOOP_EXIT]]:
-; CHECK-NEXT: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ]
+; CHECK: %cmp = icmp
+; CHECK: br i1 %cmp
+; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ]
 ; CHECK-NEXT: br label %for.end
 
 
@@ -72,7 +72,7 @@
 
 for.inc:
   %dec = add i64 %i.0, -1
-  tail call void @llvm.dbg.value(metadata !{i64 %dec}, i64 0, metadata !{metadata !"undef"})
+  tail call void @llvm.dbg.value(metadata !{i64 %dec}, i64 0, metadata !{metadata !"undef"}, metadata !{})
   br label %for.cond
 
 for.end:
@@ -84,24 +84,24 @@
 !llvm.module.flags = !{!20}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !18, metadata !1, metadata !"tak", metadata !"tak", metadata !"", i32 32, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i32, i32)* @tak, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 0] [tak]
-!1 = metadata !{i32 589865, metadata !18} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 589841, metadata !18, i32 12, metadata !"clang version 2.9 (trunk 125492)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !18, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00tak\00tak\00\0032\000\001\000\006\00256\000\000", metadata !18, metadata !1, metadata !3, null, i32 (i32, i32, i32)* @tak, null, null, null} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 0] [tak]
+!1 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.9 (trunk 125492)\001\00\000\00\000", metadata !18, metadata !19, metadata !19, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 589860, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 590081, metadata !0, metadata !"x", metadata !1, i32 32, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00x\0032\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
 !7 = metadata !{i32 32, i32 13, metadata !0, null}
-!8 = metadata !{i32 590081, metadata !0, metadata !"y", metadata !1, i32 32, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0x101\00y\0032\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
 !9 = metadata !{i32 32, i32 20, metadata !0, null}
-!10 = metadata !{i32 590081, metadata !0, metadata !"z", metadata !1, i32 32, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!10 = metadata !{metadata !"0x101\00z\0032\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
 !11 = metadata !{i32 32, i32 27, metadata !0, null}
 !12 = metadata !{i32 33, i32 3, metadata !13, null}
-!13 = metadata !{i32 589835, metadata !18, metadata !0, i32 32, i32 30, i32 6} ; [ DW_TAG_lexical_block ]
+!13 = metadata !{metadata !"0xb\0032\0030\006", metadata !18, metadata !0} ; [ DW_TAG_lexical_block ]
 !14 = metadata !{i32 34, i32 5, metadata !15, null}
-!15 = metadata !{i32 589835, metadata !18, metadata !13, i32 33, i32 14, i32 7} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{metadata !"0xb\0033\0014\007", metadata !18, metadata !13} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{i32 36, i32 3, metadata !13, null}
 !17 = metadata !{i32 37, i32 1, metadata !13, null}
 !18 = metadata !{metadata !"/Volumes/Lalgate/cj/llvm/projects/llvm-test/SingleSource/Benchmarks/BenchmarkGame/recursive.c", metadata !"/Volumes/Lalgate/cj/D/projects/llvm-test/SingleSource/Benchmarks/BenchmarkGame"}
 !19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/LoopRotate/nosimplifylatch.ll b/test/Transforms/LoopRotate/nosimplifylatch.ll
new file mode 100644
index 0000000..8e858b4
--- /dev/null
+++ b/test/Transforms/LoopRotate/nosimplifylatch.ll

@@ -0,0 +1,34 @@
+; RUN: opt -S < %s -loop-rotate -licm -verify-dom-info -verify-loop-info | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios8.0.0"
+
+;CHECK: for.inc:
+;CHECK-NEXT: %incdec.ptr.i = getelementptr 
+
+; Function Attrs: alwaysinline inlinehint nounwind readonly ssp
+define linkonce_odr hidden i64 @_ZNSt3__14findINS_11__wrap_iterIPiEEiEET_S4_S4_RKT0_(i64 %__first.coerce, i64 %__last.coerce, i32* nocapture readonly dereferenceable(4) %__value_) {
+entry:
+  %coerce.val.ip = inttoptr i64 %__first.coerce to i32*
+  %coerce.val.ip2 = inttoptr i64 %__last.coerce to i32*
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %coerce.val.ip9 = phi i32* [ %incdec.ptr.i, %for.inc ], [ %coerce.val.ip, %entry ]
+  %lnot.i = icmp eq i32* %coerce.val.ip9, %coerce.val.ip2
+  br i1 %lnot.i, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  %0 = load i32* %coerce.val.ip9, align 4
+  %1 = load i32* %__value_, align 4
+  %cmp = icmp eq i32 %0, %1
+  br i1 %cmp, label %for.end, label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %incdec.ptr.i = getelementptr inbounds i32* %coerce.val.ip9, i64 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond, %for.body
+  %coerce.val.ip9.lcssa = phi i32* [ %coerce.val.ip9, %for.cond ], [ %coerce.val.ip9, %for.body ]
+  %coerce.val.pi = ptrtoint i32* %coerce.val.ip9.lcssa to i64
+  ret i64 %coerce.val.pi
+}

diff --git a/test/Transforms/LoopRotate/simplifylatch.ll b/test/Transforms/LoopRotate/simplifylatch.ll
index d646cb9..62e5b1a 100644
--- a/test/Transforms/LoopRotate/simplifylatch.ll
+++ b/test/Transforms/LoopRotate/simplifylatch.ll

@@ -4,7 +4,7 @@
 @mode_table = global [4 x i32] zeroinitializer		; <[4 x i32]*> [#uses=1]
 
 ; CHECK-LABEL: @f(
-; CHECK-NOT: bb4
+; CHECK-NOT: bb:
 define i8 @f() {
 entry:
 	tail call i32 @fegetround( )		; <i32>:0 [#uses=1]

diff --git a/test/Transforms/LoopSimplify/merge-exits.ll b/test/Transforms/LoopSimplify/merge-exits.ll
index 8de5938..9678148 100644
--- a/test/Transforms/LoopSimplify/merge-exits.ll
+++ b/test/Transforms/LoopSimplify/merge-exits.ll

@@ -1,6 +1,4 @@
-; RUN: opt < %s -loop-simplify -loop-rotate -instcombine -indvars -S -verify-loop-info -verify-dom-info > %t
-; RUN: not grep sext %t
-; RUN: grep "phi i64" %t | count 1
+; RUN: opt < %s -loop-simplify -loop-rotate -instcombine -indvars -S -verify-loop-info -verify-dom-info | FileCheck %s
 
 ; Loopsimplify should be able to merge the two loop exits
 ; into one, so that loop rotate can rotate the loop, so
@@ -9,36 +7,42 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n32:64"
 
-define float @t(float* %pTmp1, float* %peakWeight, i32 %bandEdgeIndex) nounwind {
+; CHECK-LABEL: @test1
+; CHECK: bb:
+; CHECK: phi i64
+; CHECK-NOT: phi i64
+; CHECK-NOT: sext
+
+define float @test1(float* %pTmp1, float* %peakWeight, i32 %bandEdgeIndex) nounwind {
 entry:
-	%t0 = load float* %peakWeight, align 4		; <float> [#uses=1]
+	%t0 = load float* %peakWeight, align 4
 	br label %bb1
 
 bb:		; preds = %bb2
-	%t1 = sext i32 %hiPart.0 to i64		; <i64> [#uses=1]
-	%t2 = getelementptr float* %pTmp1, i64 %t1		; <float*> [#uses=1]
-	%t3 = load float* %t2, align 4		; <float> [#uses=1]
-	%t4 = fadd float %t3, %distERBhi.0		; <float> [#uses=1]
-	%t5 = add i32 %hiPart.0, 1		; <i32> [#uses=2]
-	%t6 = sext i32 %t5 to i64		; <i64> [#uses=1]
-	%t7 = getelementptr float* %peakWeight, i64 %t6		; <float*> [#uses=1]
-	%t8 = load float* %t7, align 4		; <float> [#uses=1]
-	%t9 = fadd float %t8, %peakCount.0		; <float> [#uses=1]
+	%t1 = sext i32 %hiPart.0 to i64
+	%t2 = getelementptr float* %pTmp1, i64 %t1
+	%t3 = load float* %t2, align 4
+	%t4 = fadd float %t3, %distERBhi.0
+	%t5 = add i32 %hiPart.0, 1
+	%t6 = sext i32 %t5 to i64
+	%t7 = getelementptr float* %peakWeight, i64 %t6
+	%t8 = load float* %t7, align 4
+	%t9 = fadd float %t8, %peakCount.0
 	br label %bb1
 
 bb1:		; preds = %bb, %entry
-	%peakCount.0 = phi float [ %t0, %entry ], [ %t9, %bb ]		; <float> [#uses=2]
-	%hiPart.0 = phi i32 [ 0, %entry ], [ %t5, %bb ]		; <i32> [#uses=3]
-	%distERBhi.0 = phi float [ 0.000000e+00, %entry ], [ %t4, %bb ]		; <float> [#uses=3]
-	%t10 = fcmp uge float %distERBhi.0, 2.500000e+00		; <i1> [#uses=1]
+	%peakCount.0 = phi float [ %t0, %entry ], [ %t9, %bb ]
+	%hiPart.0 = phi i32 [ 0, %entry ], [ %t5, %bb ]
+	%distERBhi.0 = phi float [ 0.000000e+00, %entry ], [ %t4, %bb ]
+	%t10 = fcmp uge float %distERBhi.0, 2.500000e+00
 	br i1 %t10, label %bb3, label %bb2
 
 bb2:		; preds = %bb1
-	%t11 = add i32 %bandEdgeIndex, -1		; <i32> [#uses=1]
-	%t12 = icmp sgt i32 %t11, %hiPart.0		; <i1> [#uses=1]
+	%t11 = add i32 %bandEdgeIndex, -1
+	%t12 = icmp sgt i32 %t11, %hiPart.0
 	br i1 %t12, label %bb, label %bb3
 
 bb3:		; preds = %bb2, %bb1
-	%t13 = fdiv float %peakCount.0, %distERBhi.0		; <float> [#uses=1]
+	%t13 = fdiv float %peakCount.0, %distERBhi.0
 	ret float %t13
 }

diff --git a/test/Transforms/LoopStrengthReduce/pr12018.ll b/test/Transforms/LoopStrengthReduce/pr12018.ll
index ee7b1e8..1e3df6c 100644
--- a/test/Transforms/LoopStrengthReduce/pr12018.ll
+++ b/test/Transforms/LoopStrengthReduce/pr12018.ll

@@ -16,7 +16,7 @@
   %tmp = bitcast %struct.nsTArrayHeader* %add.ptr.i to %struct.nsTArray*
   %arrayidx = getelementptr inbounds %struct.nsTArray* %tmp, i32 %i.06
   %add = add nsw i32 %i.06, 1
-  call void @llvm.dbg.value(metadata !{%struct.nsTArray* %aValues}, i64 0, metadata !0) nounwind
+  call void @llvm.dbg.value(metadata !{%struct.nsTArray* %aValues}, i64 0, metadata !0, metadata !{}) nounwind
   br label %_ZN8nsTArray9ElementAtEi.exit
 
 _ZN8nsTArray9ElementAtEi.exit:                    ; preds = %for.body
@@ -33,6 +33,6 @@
 
 declare %struct.nsTArrayHeader* @_ZN8nsTArray4Hdr2Ev()
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
-!0 = metadata !{i32 786689}                       ; [ DW_TAG_arg_variable ]
+!0 = metadata !{metadata !"0x101"}                       ; [ DW_TAG_arg_variable ]

diff --git a/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll b/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll
index 17c91e5..aae79cb 100644
--- a/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll
+++ b/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll

@@ -41,8 +41,7 @@
 }
 
 ; CHECK-LABEL: @test
-; CHECK: unr.cmp{{.*}}:
-; CHECK: for.body.unr{{.*}}:
+; CHECK: for.body.prol{{.*}}:
 ; CHECK: for.body:
 ; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body
 

diff --git a/test/Transforms/LoopUnroll/ephemeral.ll b/test/Transforms/LoopUnroll/ephemeral.ll
new file mode 100644
index 0000000..9d40613
--- /dev/null
+++ b/test/Transforms/LoopUnroll/ephemeral.ll

@@ -0,0 +1,44 @@
+; RUN: opt < %s -S -loop-unroll -unroll-threshold=50 | FileCheck %s
+
+; Make sure this loop is completely unrolled...
+; CHECK-LABEL: @test1
+; CHECK: for.body:
+; CHECK-NOT: for.end:
+
+define i32 @test1(i32* nocapture %a) nounwind uwtable readonly {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.01 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+
+  ; This loop will be completely unrolled, even with these extra instructions,
+  ; but only because they're ephemeral (and, thus, free).
+  %1 = add nsw i32 %0, 2
+  %2 = add nsw i32 %1, 4
+  %3 = add nsw i32 %2, 4
+  %4 = add nsw i32 %3, 4
+  %5 = add nsw i32 %4, 4
+  %6 = add nsw i32 %5, 4
+  %7 = add nsw i32 %6, 4
+  %8 = add nsw i32 %7, 4
+  %9 = add nsw i32 %8, 4
+  %10 = add nsw i32 %9, 4
+  %ca = icmp sgt i32 %10, -7
+  call void @llvm.assume(i1 %ca)
+
+  %add = add nsw i32 %0, %sum.01
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 5
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add
+}
+
+declare void @llvm.assume(i1) nounwind
+

diff --git a/test/Transforms/LoopUnroll/ignore-annotation-intrinsic-cost.ll b/test/Transforms/LoopUnroll/ignore-annotation-intrinsic-cost.ll
new file mode 100644
index 0000000..dcb5d1c
--- /dev/null
+++ b/test/Transforms/LoopUnroll/ignore-annotation-intrinsic-cost.ll

@@ -0,0 +1,133 @@
+; REQUIRES: asserts
+; RUN: opt < %s -disable-output -stats -loop-unroll -info-output-file - | FileCheck %s --check-prefix=STATS
+; STATS: 1 loop-unroll - Number of loops unrolled (completely or otherwise)
+; Test that llvm.annotation intrinsics do not count against the loop body size
+; and prevent unrolling.
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+@B = common global i32 0, align 4
+
+define void @foo(i32* noalias %A, i32 %B, i32 %C) {
+entry:
+  br label %for.body
+
+; A loop that has a small loop body (except for the annotations) that should be
+; unrolled with the default heuristic. Make sure the extra annotations do not
+; prevent unrolling.
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  ; The real loop.
+  %mul = mul nsw i32 %B, %C
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.01
+  store i32 %mul, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %exitcond = icmp ne i32 %inc, 4
+
+  ; A bunch of annotations
+  %annot.0 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.1 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.2 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.3 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.4 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.5 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.6 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.7 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.8 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.9 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.10 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.11 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.12 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.13 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.14 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.15 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.16 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.17 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.18 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.19 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.20 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.21 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.22 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.23 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.24 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.25 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.26 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.27 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.28 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.29 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.30 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.31 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.32 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.33 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.34 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.35 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.36 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.37 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.38 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.39 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.40 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.41 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.42 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.43 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.44 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.45 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.46 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.47 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.48 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.49 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.50 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.51 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.52 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.53 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.54 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.55 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.56 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.57 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.58 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.59 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.60 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.61 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.62 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.63 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.64 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.65 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.66 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.67 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.68 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.69 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.70 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.71 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.72 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.73 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.74 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.75 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.76 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.77 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.78 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.79 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.80 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.81 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.82 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.83 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.84 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.85 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.86 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.87 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.88 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.89 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.90 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.91 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.92 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.93 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.94 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.95 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.96 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.97 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.98 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  %annot.99 = tail call i32 @llvm.annotation.i32(i32 %i.01, i8* null, i8* null, i32 0)
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32)

diff --git a/test/Transforms/LoopUnroll/nsw-tripcount.ll b/test/Transforms/LoopUnroll/nsw-tripcount.ll
new file mode 100644
index 0000000..98cab32
--- /dev/null
+++ b/test/Transforms/LoopUnroll/nsw-tripcount.ll

@@ -0,0 +1,32 @@
+; RUN: opt -loop-unroll -S %s | FileCheck %s
+
+; extern void f(int);
+; void test1(int v) {
+;   for (int i=v; i<=v+1; ++i)
+;     f(i);
+; }
+;
+; We can use the nsw information to see that the trip count will be 2, so the
+; loop should be unrolled, as this is always beneficial.
+
+declare void @f(i32)
+
+; CHECK-LABEL: @test1
+define void @test1(i32 %v) {
+entry:
+  %add = add nsw i32 %v, 1
+  br label %for.body
+
+for.body:
+  %i.04 = phi i32 [ %v, %entry ], [ %inc, %for.body ]
+  tail call void @f(i32 %i.04)
+  %inc = add nsw i32 %i.04, 1
+  %cmp = icmp slt i32 %i.04, %add
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK: call void @f
+; CHECK-NOT: br i1
+; CHECK: call void @f
+for.end:
+  ret void
+}

diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll
index a14087d..05d03f2 100644
--- a/test/Transforms/LoopUnroll/runtime-loop.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop.ll

@@ -3,15 +3,16 @@
 ; Tests for unrolling loops with run-time trip counts
 
 ; CHECK: %xtraiter = and i32 %n
-; CHECK: %lcmp.mod = icmp ne i32 %xtraiter, 0
-; CHECK: %lcmp.overflow = icmp eq i32 %n, 0
-; CHECK: %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod
-; CHECK: br i1 %lcmp.or, label %unr.cmp
+; CHECK:  %lcmp.mod = icmp ne i32 %xtraiter, 0
+; CHECK:  %lcmp.overflow = icmp eq i32 %n, 0
+; CHECK:  %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod
+; CHECK:  br i1 %lcmp.or, label %for.body.prol, label %for.body.preheader.split
 
-; CHECK: unr.cmp{{.*}}:
-; CHECK: for.body.unr{{.*}}:
-; CHECK: for.body:
-; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body
+; CHECK: for.body.prol:
+; CHECK: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.preheader ]
+; CHECK:  %prol.iter.sub = sub i32 %prol.iter, 1
+; CHECK:  %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0
+; CHECK:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split, !llvm.loop !0
 
 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
 entry:
@@ -39,7 +40,7 @@
 ; even if the -unroll-runtime is specified
 
 ; CHECK: for.body:
-; CHECK-NOT: for.body.unr:
+; CHECK-NOT: for.body.prol:
 
 define i32 @test1(i32* nocapture %a) nounwind uwtable readonly {
 entry:
@@ -85,8 +86,8 @@
 
 ; Test run-time unrolling for a loop that counts down by -2.
 
-; CHECK: for.body.unr:
-; CHECK: br i1 %cmp.7, label %for.cond.for.end_crit_edge{{.*}}, label %for.body
+; CHECK: for.body.prol:
+; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split
 
 define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
 entry:
@@ -113,3 +114,7 @@
   %res.0.lcssa = phi i16 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
   ret i16 %res.0.lcssa
 }
+
+; CHECK: !0 = metadata !{metadata !0, metadata !1}
+; CHECK: !1 = metadata !{metadata !"llvm.loop.unroll.disable"}
+

diff --git a/test/Transforms/LoopUnroll/runtime-loop1.ll b/test/Transforms/LoopUnroll/runtime-loop1.ll
index ad99b8c..38b4f32 100644
--- a/test/Transforms/LoopUnroll/runtime-loop1.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop1.ll

@@ -1,11 +1,11 @@
-; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=4 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-count=2 | FileCheck %s
 
 ; This tests that setting the unroll count works
 
-; CHECK: unr.cmp:
-; CHECK: for.body.unr:
+; CHECK: for.body.prol:
+; CHECK: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split
 ; CHECK: for.body:
-; CHECK: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body
+; CHECK: br i1 %exitcond.1, label %for.end.loopexit.unr-lcssa, label %for.body
 ; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body
 
 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {

diff --git a/test/Transforms/LoopUnroll/runtime-loop2.ll b/test/Transforms/LoopUnroll/runtime-loop2.ll
index cbc7af5..7205c68 100644
--- a/test/Transforms/LoopUnroll/runtime-loop2.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop2.ll

@@ -3,8 +3,7 @@
 ; Choose a smaller, power-of-two, unroll count if the loop is too large.
 ; This test makes sure we're not unrolling 'odd' counts
 
-; CHECK: unr.cmp:
-; CHECK: for.body.unr:
+; CHECK: for.body.prol:
 ; CHECK: for.body:
 ; CHECK: br i1 %exitcond.3, label %for.end.loopexit{{.*}}, label %for.body
 ; CHECK-NOT: br i1 %exitcond.4, label %for.end.loopexit{{.*}}, label %for.body

diff --git a/test/Transforms/LoopUnroll/scevunroll.ll b/test/Transforms/LoopUnroll/scevunroll.ll
index c3086e8..20161d7 100644
--- a/test/Transforms/LoopUnroll/scevunroll.ll
+++ b/test/Transforms/LoopUnroll/scevunroll.ll

@@ -66,16 +66,13 @@
 
 ; SCEV properly unrolls multi-exit loops.
 ;
-; SCEV cannot currently unroll this loop.
-; It should ideally detect a trip count of 5.
-; rdar:14038809 [SCEV]: Optimize trip count computation for multi-exit loops.
 ; CHECK-LABEL: @multiExit(
-; CHECKFIXME: getelementptr i32* %base, i32 10
-; CHECKFIXME-NEXT: load i32*
-; CHECKFIXME: br i1 false, label %l2.10, label %exit1
-; CHECKFIXME: l2.10:
-; CHECKFIXME-NOT: br
-; CHECKFIXME: ret i32
+; CHECK: getelementptr i32* %base, i32 10
+; CHECK-NEXT: load i32*
+; CHECK: br i1 false, label %l2.10, label %exit1
+; CHECK: l2.10:
+; CHECK-NOT: br
+; CHECK: ret i32
 define i32 @multiExit(i32* %base) nounwind {
 entry:
   br label %l1

diff --git a/test/Transforms/LoopUnroll/tripcount-overflow.ll b/test/Transforms/LoopUnroll/tripcount-overflow.ll
new file mode 100644
index 0000000..d593685
--- /dev/null
+++ b/test/Transforms/LoopUnroll/tripcount-overflow.ll

@@ -0,0 +1,30 @@
+; RUN: opt < %s -S -unroll-runtime -unroll-count=2 -loop-unroll | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; When the prologue is fully unrolled, the branch at its end is unconditional.
+; Unrolling it is illegal if we can't prove that trip-count+1 doesn't overflow,
+; like in this example, where it comes from an argument.
+;
+; This test is based on an example from here:
+; http://stackoverflow.com/questions/23838661/why-is-clang-optimizing-this-code-out
+;
+; CHECK: while.body.prol:
+; CHECK: br i1
+; CHECK: entry.split:
+
+; Function Attrs: nounwind readnone ssp uwtable
+define i32 @foo(i32 %N) #0 {
+entry:
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %entry
+  %i = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  %cmp = icmp eq i32 %i, %N
+  %inc = add i32 %i, 1
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body
+  ret i32 %i
+}
+
+attributes #0 = { nounwind readnone ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/test/Transforms/LoopUnroll/unroll-pragmas-disabled.ll b/test/Transforms/LoopUnroll/unroll-pragmas-disabled.ll
new file mode 100644
index 0000000..db18f25
--- /dev/null
+++ b/test/Transforms/LoopUnroll/unroll-pragmas-disabled.ll

@@ -0,0 +1,149 @@
+; RUN: opt < %s -loop-unroll -S | FileCheck %s
+;
+; Verify that the unrolling pass removes existing unroll count metadata
+; and adds a disable unrolling node after unrolling is complete.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; #pragma clang loop  vectorize(enable) unroll_count(4) vectorize_width(8)
+;
+; Unroll count metadata should be replaced with unroll(disable).  Vectorize
+; metadata should be untouched.
+;
+; CHECK-LABEL: @unroll_count_4(
+; CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[LOOP_1:.*]]
+define void @unroll_count_4(i32* nocapture %a) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+!1 = metadata !{metadata !1, metadata !2, metadata !3, metadata !4}
+!2 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
+!3 = metadata !{metadata !"llvm.loop.unroll.count", i32 4}
+!4 = metadata !{metadata !"llvm.loop.vectorize.width", i32 8}
+
+; #pragma clang loop unroll(full)
+;
+; An unroll disable metadata node is only added for the unroll count case.
+; In this case, the loop has a full unroll metadata but can't be fully unrolled
+; because the trip count is dynamic.  The full unroll metadata should remain
+; after unrolling.
+;
+; CHECK-LABEL: @unroll_full(
+; CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[LOOP_2:.*]]
+define void @unroll_full(i32* nocapture %a, i32 %b) {
+entry:
+  %cmp3 = icmp sgt i32 %b, 0
+  br i1 %cmp3, label %for.body, label %for.end, !llvm.loop !5
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %b
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+!5 = metadata !{metadata !5, metadata !6}
+!6 = metadata !{metadata !"llvm.loop.unroll.full"}
+
+; #pragma clang loop unroll(disable)
+;
+; Unroll metadata should not change.
+;
+; CHECK-LABEL: @unroll_disable(
+; CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[LOOP_3:.*]]
+define void @unroll_disable(i32* nocapture %a) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !7
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+!7 = metadata !{metadata !7, metadata !8}
+!8 = metadata !{metadata !"llvm.loop.unroll.disable"}
+
+; This function contains two loops which share the same llvm.loop metadata node
+; with an llvm.loop.unroll.count 2 hint.  Both loops should be unrolled.  This
+; verifies that adding disable metadata to a loop after unrolling doesn't affect
+; other loops which previously shared the same llvm.loop metadata.
+;
+; CHECK-LABEL: @shared_metadata(
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[LOOP_4:.*]]
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[LOOP_5:.*]]
+define void @shared_metadata(i32* nocapture %List) #0 {
+entry:
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx = getelementptr inbounds i32* %List, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add4 = add nsw i32 %0, 10
+  store i32 %add4, i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 4
+  br i1 %exitcond, label %for.body3.1.preheader, label %for.body3, !llvm.loop !9
+
+for.body3.1.preheader:                            ; preds = %for.body3
+  br label %for.body3.1
+
+for.body3.1:                                      ; preds = %for.body3.1.preheader, %for.body3.1
+  %indvars.iv.1 = phi i64 [ %1, %for.body3.1 ], [ 0, %for.body3.1.preheader ]
+  %1 = add nsw i64 %indvars.iv.1, 1
+  %arrayidx.1 = getelementptr inbounds i32* %List, i64 %1
+  %2 = load i32* %arrayidx.1, align 4
+  %add4.1 = add nsw i32 %2, 10
+  store i32 %add4.1, i32* %arrayidx.1, align 4
+  %exitcond.1 = icmp eq i64 %1, 4
+  br i1 %exitcond.1, label %for.inc5.1, label %for.body3.1, !llvm.loop !9
+
+for.inc5.1:                                       ; preds = %for.body3.1
+  ret void
+}
+!9 = metadata !{metadata !9, metadata !10}
+!10 = metadata !{metadata !"llvm.loop.unroll.count", i32 2}
+
+
+; CHECK: ![[LOOP_1]] = metadata !{metadata ![[LOOP_1]], metadata ![[VEC_ENABLE:.*]], metadata ![[WIDTH_8:.*]], metadata ![[UNROLL_DISABLE:.*]]}
+; CHECK: ![[VEC_ENABLE]] = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
+; CHECK: ![[WIDTH_8]] = metadata !{metadata !"llvm.loop.vectorize.width", i32 8}
+; CHECK: ![[UNROLL_DISABLE]] = metadata !{metadata !"llvm.loop.unroll.disable"}
+; CHECK: ![[LOOP_2]] = metadata !{metadata ![[LOOP_2]], metadata ![[UNROLL_FULL:.*]]}
+; CHECK: ![[UNROLL_FULL]] = metadata !{metadata !"llvm.loop.unroll.full"}
+; CHECK: ![[LOOP_3]] = metadata !{metadata ![[LOOP_3]], metadata ![[UNROLL_DISABLE:.*]]}
+; CHECK: ![[LOOP_4]] = metadata !{metadata ![[LOOP_4]], metadata ![[UNROLL_DISABLE:.*]]}
+; CHECK: ![[LOOP_5]] = metadata !{metadata ![[LOOP_5]], metadata ![[UNROLL_DISABLE:.*]]}

diff --git a/test/Transforms/LoopUnroll/unroll-pragmas.ll b/test/Transforms/LoopUnroll/unroll-pragmas.ll
index 5e45a2d..1ca249d 100644
--- a/test/Transforms/LoopUnroll/unroll-pragmas.ll
+++ b/test/Transforms/LoopUnroll/unroll-pragmas.ll

@@ -1,4 +1,8 @@
 ; RUN: opt < %s -loop-unroll -S | FileCheck %s
+; RUN: opt < %s -loop-unroll -loop-unroll -S | FileCheck %s
+;
+; Run loop unrolling twice to verify that loop unrolling metadata is properly
+; removed and further unrolling is disabled after the pass is run once.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -51,11 +55,11 @@
   ret void
 }
 !1 = metadata !{metadata !1, metadata !2}
-!2 = metadata !{metadata !"llvm.loop.unroll.enable", i1 false}
+!2 = metadata !{metadata !"llvm.loop.unroll.disable"}
 
 ; loop64 has a high enough count that it should *not* be unrolled by
 ; the default unrolling heuristic.  It serves as the control for the
-; unroll(enable) pragma test loop64_with_.* tests below.
+; unroll(full) pragma test loop64_with_.* tests below.
 ;
 ; CHECK-LABEL: @loop64(
 ; CHECK: store i32
@@ -79,7 +83,7 @@
   ret void
 }
 
-; #pragma clang loop unroll(enable)
+; #pragma clang loop unroll(full)
 ; Loop should be fully unrolled.
 ;
 ; CHECK-LABEL: @loop64_with_enable(
@@ -102,7 +106,7 @@
   ret void
 }
 !3 = metadata !{metadata !3, metadata !4}
-!4 = metadata !{metadata !"llvm.loop.unroll.enable", i1 true}
+!4 = metadata !{metadata !"llvm.loop.unroll.full"}
 
 ; #pragma clang loop unroll_count(4)
 ; Loop should be unrolled 4 times.
@@ -134,37 +138,7 @@
 !5 = metadata !{metadata !5, metadata !6}
 !6 = metadata !{metadata !"llvm.loop.unroll.count", i32 4}
 
-
-; #pragma clang loop unroll_count(enable) unroll_count(4)
-; Loop should be unrolled 4 times.
-;
-; CHECK-LABEL: @loop64_with_enable_and_count4(
-; CHECK: store i32
-; CHECK: store i32
-; CHECK: store i32
-; CHECK: store i32
-; CHECK-NOT: store i32
-; CHECK: br i1
-define void @loop64_with_enable_and_count4(i32* nocapture %a) {
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
-  %inc = add nsw i32 %0, 1
-  store i32 %inc, i32* %arrayidx, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 64
-  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !7
-
-for.end:                                          ; preds = %for.body
-  ret void
-}
-!7 = metadata !{metadata !7, metadata !6, metadata !4}
-
-; #pragma clang loop unroll_count(enable)
+; #pragma clang loop unroll(full)
 ; Full unrolling is requested, but loop has a dynamic trip count so
 ; no unrolling should occur.
 ;
@@ -257,7 +231,7 @@
 !10 = metadata !{metadata !10, metadata !11}
 !11 = metadata !{metadata !"llvm.loop.unroll.count", i32 1}
 
-; #pragma clang loop unroll(enable)
+; #pragma clang loop unroll(full)
 ; Loop has very high loop count (1 million) and full unrolling was requested.
 ; Loop should unrolled up to the pragma threshold, but not completely.
 ;

diff --git a/test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll b/test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll
new file mode 100644
index 0000000..adbf47d
--- /dev/null
+++ b/test/Transforms/LoopUnroll/update-loop-info-in-subloops.ll

@@ -0,0 +1,35 @@
+; RUN: opt -S < %s -loop-unroll -block-freq | FileCheck %s
+; Crasher from PR20987.
+
+; CHECK: define void @update_loop_info_in_subloops
+; CHECK: entry:
+; CHECK: L:
+; CHECK: L.inner:
+; CHECK: L.inner.latch:
+; CHECK: L.latch:
+; CHECK: L.inner.1:
+; CHECK: L.inner.latch.1:
+; CHECK: L.latch.1:
+
+define void @update_loop_info_in_subloops() {
+entry:
+  br label %L
+
+L:
+  %0 = phi i64 [ 1, %entry ], [ %1, %L.latch ]
+  br label %L.inner
+
+L.inner:
+  br label %L.inner.latch
+
+L.inner.latch:
+  br i1 false, label %L.latch, label %L.inner
+
+L.latch:
+  %1 = add i64 %0, 1
+  %2 = icmp eq i64 %1, 3
+  br i1 %2, label %exit, label %L
+
+exit:
+  ret void
+}

diff --git a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
index 1e1396f..a292afb 100644
--- a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
+++ b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
index aa7cc0e..b3eae69 100644
--- a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
+++ b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce
 
 ; Check that we don't fall into an infinite loop.
 define void @test() nounwind {

diff --git a/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
index ae9f998..16d64ea 100644
--- a/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
+++ b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -dce -force-vector-unroll=1 -force-vector-width=4 
+; RUN: opt < %s  -loop-vectorize -dce -force-vector-interleave=1 -force-vector-width=4 
 
 ; Check that we don't crash.
 

diff --git a/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll b/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll
new file mode 100644
index 0000000..a01d543
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll

@@ -0,0 +1,31 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%struct.anon = type { [100 x i32], i32, [100 x i32] }
+
+@Foo = common global %struct.anon zeroinitializer, align 4
+
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x i32>*
+; CHECK: sdiv <4 x i32>
+; CHECK: store <4 x i32>
+
+define void @foo(){
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %div = sdiv i32 %0, 2
+  %arrayidx2 = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
+  store i32 %div, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

diff --git a/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
index fce3b70..9c69ba8 100644
--- a/test/Transforms/LoopVectorize/X86/already-vectorized.ll
+++ b/test/Transforms/LoopVectorize/X86/already-vectorized.ll

@@ -41,6 +41,6 @@
 ; Now, we check for the Hint metadata
 ; CHECK: [[vect]] = metadata !{metadata [[vect]], metadata [[width:![0-9]+]], metadata [[unroll:![0-9]+]]}
 ; CHECK: [[width]] = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
-; CHECK: [[unroll]] = metadata !{metadata !"llvm.loop.vectorize.unroll", i32 1}
+; CHECK: [[unroll]] = metadata !{metadata !"llvm.loop.interleave.count", i32 1}
 ; CHECK: [[scalar]] = metadata !{metadata [[scalar]], metadata [[width]], metadata [[unroll]]}
 

diff --git a/test/Transforms/LoopVectorize/X86/assume.ll b/test/Transforms/LoopVectorize/X86/assume.ll
new file mode 100644
index 0000000..a94e24d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/assume.ll

@@ -0,0 +1,100 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: for.body:
+; CHECK: ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+02
+  tail call void @llvm.assume(i1 %cmp1)
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #1
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
+%struct.data = type { float*, float* }
+
+; Function Attrs: nounwind uwtable
+define void @test2(%struct.data* nocapture readonly %d) #0 {
+entry:
+  %b = getelementptr inbounds %struct.data* %d, i64 0, i32 1
+  %0 = load float** %b, align 8
+  %ptrint = ptrtoint float* %0 to i64
+  %maskedptr = and i64 %ptrint, 31
+  %maskcond = icmp eq i64 %maskedptr, 0
+  %a = getelementptr inbounds %struct.data* %d, i64 0, i32 0
+  %1 = load float** %a, align 8
+  %ptrint2 = ptrtoint float* %1 to i64
+  %maskedptr3 = and i64 %ptrint2, 31
+  %maskcond4 = icmp eq i64 %maskedptr3, 0
+  br label %for.body
+
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: for.body:
+; CHECK: ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds float* %0, i64 %indvars.iv
+  %2 = load float* %arrayidx, align 4
+  %add = fadd float %2, 1.000000e+00
+  tail call void @llvm.assume(i1 %maskcond4)
+  %arrayidx5 = getelementptr inbounds float* %1, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+

diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
index e1113fd..05403cd 100644
--- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll

@@ -1,5 +1,5 @@
 ; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -S | FileCheck %s
-; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-unroll=0 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-interleave=0 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index d6120e7..0650d94 100644
--- a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll b/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll
index 8716cff..fd69dc4 100644
--- a/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll
+++ b/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -vectorizer-min-trip-count=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -vectorizer-min-trip-count=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
index 2c47fcb..0b542a9 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
index 7e156a9..b580d73 100644
--- a/test/Transforms/LoopVectorize/X86/parallel-loops.ll
+++ b/test/Transforms/LoopVectorize/X86/parallel-loops.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/test/Transforms/LoopVectorize/X86/powof2div.ll b/test/Transforms/LoopVectorize/X86/powof2div.ll
new file mode 100644
index 0000000..054da8e
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/powof2div.ll

@@ -0,0 +1,32 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.anon = type { [100 x i32], i32, [100 x i32] }
+
+@Foo = common global %struct.anon zeroinitializer, align 4
+
+;CHECK-LABEL: @foo(
+;CHECK: load <4 x i32>*
+;CHECK: sdiv <4 x i32>
+;CHECK: store <4 x i32>
+
+define void @foo(){
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %div = sdiv i32 %0, 2
+  %arrayidx2 = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
+  store i32 %div, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+

diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll
index dfa4faa..f9a0281 100644
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/X86/tripcount.ll b/test/Transforms/LoopVectorize/X86/tripcount.ll
index 6b38bac..a4ec694 100644
--- a/test/Transforms/LoopVectorize/X86/tripcount.ll
+++ b/test/Transforms/LoopVectorize/X86/tripcount.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -mcpu=prescott < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -mcpu=prescott < %s | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
 target triple = "i386-unknown-freebsd11.0"

diff --git a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
index d5024bb..716dc08 100644
--- a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
+++ b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll

@@ -1,6 +1,6 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S \
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -dce -S \
 ; RUN:   | FileCheck %s --check-prefix=CHECK-VECTOR
-; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-unroll=0 -dce -S \
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-interleave=0 -dce -S \
 ; RUN:   | FileCheck %s --check-prefix=CHECK-SCALAR
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

diff --git a/test/Transforms/LoopVectorize/X86/unroll_selection.ll b/test/Transforms/LoopVectorize/X86/unroll_selection.ll
index 2d7b663..c684b4e 100644
--- a/test/Transforms/LoopVectorize/X86/unroll_selection.ll
+++ b/test/Transforms/LoopVectorize/X86/unroll_selection.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -dce -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
index 59bb8d0..e57cfef 100644
--- a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
+++ b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll

@@ -1,4 +1,4 @@
-; RUN: opt -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s
+; RUN: opt -basicaa -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index 6cdd29b..7bce11d 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll

@@ -24,10 +24,11 @@
 
 ; File, line, and column should match those specified in the metadata
 ; CHECK: remark: source.cpp:4:5: loop not vectorized: could not determine number of loop iterations
-; CHECK: remark: source.cpp:4:5: loop not vectorized: vectorization was not specified
+; CHECK: remark: source.cpp:4:5: loop not vectorized: use -Rpass-analysis=loop-vectorize for more info
 ; CHECK: remark: source.cpp:13:5: loop not vectorized: vector width and interleave count are explicitly set to 1
 ; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds
-; CHECK: remark: source.cpp:19:5: loop not vectorized: vectorization is explicitly enabled
+; CHECK: remark: source.cpp:19:5: loop not vectorized: use -Rpass-analysis=loop-vectorize for more info
+; CHECK: warning: source.cpp:19:5: loop not vectorized: failed explicitly specified loop vectorization
 
 ; CHECK: _Z4testPii
 ; CHECK-NOT: x i32>
@@ -121,40 +122,40 @@
 !llvm.module.flags = !{!9, !10}
 !llvm.ident = !{!11}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2}
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0\001\00\006\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"source.cpp", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !7, metadata !8}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32)* @_Z4testPii, null, null, metadata !2, i32 1}
-!5 = metadata !{i32 786473, metadata !1}
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null}
-!7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_disabled", metadata !"test_disabled", metadata !"", i32 10, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32)* @_Z13test_disabledPii, null, null, metadata !2, i32 10}
-!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_array_bounds", metadata !"test_array_bounds", metadata !"", i32 16, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*, i32)* @_Z17test_array_boundsPiS_i, null, null, metadata !2, i32 16}
+!4 = metadata !{metadata !"0x2e\00test\00test\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*, i32)* @_Z4testPii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [test]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./source.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00test_disabled\00test_disabled\00\0010\000\001\000\006\00256\001\0010", metadata !1, metadata !5, metadata !6, null, void (i32*, i32)* @_Z13test_disabledPii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 10] [def] [test_disabled]
+!8 = metadata !{metadata !"0x2e\00test_array_bounds\00test_array_bounds\00\0016\000\001\000\006\00256\001\0016", metadata !1, metadata !5, metadata !6, null, void (i32*, i32*, i32)* @_Z17test_array_boundsPiS_i, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 16] [def] [test_array_bounds]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !11 = metadata !{metadata !"clang version 3.5.0"}
 !12 = metadata !{i32 3, i32 8, metadata !13, null}
-!13 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 3, i32 0, i32 0}
+!13 = metadata !{metadata !"0xb\003\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
 !14 = metadata !{metadata !14, metadata !15, metadata !15}
 !15 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
 !16 = metadata !{i32 4, i32 5, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !1, metadata !13, i32 3, i32 36, i32 0, i32 1}
+!17 = metadata !{metadata !"0xb\003\0036\000", metadata !1, metadata !13} ; [ DW_TAG_lexical_block ]
 !18 = metadata !{metadata !19, metadata !19, i64 0}
 !19 = metadata !{metadata !"int", metadata !20, i64 0}
 !20 = metadata !{metadata !"omnipotent char", metadata !21, i64 0}
 !21 = metadata !{metadata !"Simple C/C++ TBAA"}
 !22 = metadata !{i32 5, i32 9, metadata !23, null}
-!23 = metadata !{i32 786443, metadata !1, metadata !17, i32 5, i32 9, i32 0, i32 2}
+!23 = metadata !{metadata !"0xb\005\009\000", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ]
 !24 = metadata !{i32 8, i32 1, metadata !4, null}
 !25 = metadata !{i32 12, i32 8, metadata !26, null}
-!26 = metadata !{i32 786443, metadata !1, metadata !7, i32 12, i32 3, i32 0, i32 3}
+!26 = metadata !{metadata !"0xb\0012\003\000", metadata !1, metadata !7} ; [ DW_TAG_lexical_block ]
 !27 = metadata !{metadata !27, metadata !28, metadata !29}
-!28 = metadata !{metadata !"llvm.loop.vectorize.unroll", i32 1}
+!28 = metadata !{metadata !"llvm.loop.interleave.count", i32 1}
 !29 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
 !30 = metadata !{i32 13, i32 5, metadata !26, null}
 !31 = metadata !{i32 14, i32 1, metadata !7, null}
 !32 = metadata !{i32 18, i32 8, metadata !33, null}
-!33 = metadata !{i32 786443, metadata !1, metadata !8, i32 18, i32 3, i32 0, i32 4}
+!33 = metadata !{metadata !"0xb\0018\003\000", metadata !1, metadata !8} ; [ DW_TAG_lexical_block ]
 !34 = metadata !{metadata !34, metadata !15}
 !35 = metadata !{i32 19, i32 5, metadata !33, null}
 !36 = metadata !{i32 20, i32 1, metadata !8, null}

diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
index f683447..14e541a 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll

@@ -1,6 +1,6 @@
 ; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-unroll=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
 
 ; This code has all the !dbg annotations needed to track source line information,
 ; but is missing the llvm.dbg.cu annotation. This prevents code generation from
@@ -52,23 +52,23 @@
 !1 = metadata !{metadata !"vectorization-remarks.c", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @foo, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [./vectorization-remarks.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\005\000\001\000\006\00256\001\006", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./vectorization-remarks.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5.0 "}
-!10 = metadata !{i32 8, i32 3, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!10 = metadata !{i32 8, i32 3, metadata !4, null}
 !11 = metadata !{metadata !12, metadata !12, i64 0}
 !12 = metadata !{metadata !"int", metadata !13, i64 0}
 !13 = metadata !{metadata !"omnipotent char", metadata !14, i64 0}
 !14 = metadata !{metadata !"Simple C/C++ TBAA"}
 !15 = metadata !{i32 17, i32 8, metadata !16, null}
-!16 = metadata !{i32 786443, metadata !1, metadata !17, i32 17, i32 8, i32 2, i32 3} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
-!17 = metadata !{i32 786443, metadata !1, metadata !18, i32 17, i32 8, i32 1, i32 2} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
-!18 = metadata !{i32 786443, metadata !1, metadata !4, i32 17, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!16 = metadata !{metadata !"0xb\0017\008\002", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!17 = metadata !{metadata !"0xb\0017\008\001", metadata !1, metadata !18} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!18 = metadata !{metadata !"0xb\0017\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
 !19 = metadata !{i32 18, i32 5, metadata !20, null}
-!20 = metadata !{i32 786443, metadata !1, metadata !18, i32 17, i32 27, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
+!20 = metadata !{metadata !"0xb\0017\0027\000", metadata !1, metadata !18} ; [ DW_TAG_lexical_block ] [./vectorization-remarks.c]
 !21 = metadata !{metadata !13, metadata !13, i64 0}
 !22 = metadata !{i32 20, i32 3, metadata !4, null}
 !23 = metadata !{i32 21, i32 3, metadata !4, null}

diff --git a/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
index efc93d9..d8e5403 100644
--- a/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
+++ b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll

@@ -1,4 +1,4 @@
-; RUN: opt -O3 -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 -S < %s | FileCheck %s
+; RUN: opt -O3 -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.7.0"

diff --git a/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll b/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll
index a099daa..cab333d 100644
--- a/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll
+++ b/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=2 -S -mtriple=xcore | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S -mtriple=xcore | FileCheck %s
 
 target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32"
 target triple = "xcore"

diff --git a/test/Transforms/LoopVectorize/align.ll b/test/Transforms/LoopVectorize/align.ll
index 84b0361..f2fb8b9 100644
--- a/test/Transforms/LoopVectorize/align.ll
+++ b/test/Transforms/LoopVectorize/align.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/bsd_regex.ll b/test/Transforms/LoopVectorize/bsd_regex.ll
index 7b71272..7a3e798 100644
--- a/test/Transforms/LoopVectorize/bsd_regex.ll
+++ b/test/Transforms/LoopVectorize/bsd_regex.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-unroll=2 < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-interleave=2 < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/bzip_reverse_loops.ll b/test/Transforms/LoopVectorize/bzip_reverse_loops.ll
index 2648bbe..d7cbad0 100644
--- a/test/Transforms/LoopVectorize/bzip_reverse_loops.ll
+++ b/test/Transforms/LoopVectorize/bzip_reverse_loops.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/calloc.ll b/test/Transforms/LoopVectorize/calloc.ll
index 7e79916..5f441f3 100644
--- a/test/Transforms/LoopVectorize/calloc.ll
+++ b/test/Transforms/LoopVectorize/calloc.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"

diff --git a/test/Transforms/LoopVectorize/cast-induction.ll b/test/Transforms/LoopVectorize/cast-induction.ll
index 255ce9c..4f92d33 100644
--- a/test/Transforms/LoopVectorize/cast-induction.ll
+++ b/test/Transforms/LoopVectorize/cast-induction.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 ; rdar://problem/12848162
 

diff --git a/test/Transforms/LoopVectorize/conditional-assignment.ll b/test/Transforms/LoopVectorize/conditional-assignment.ll
new file mode 100644
index 0000000..50fa329
--- /dev/null
+++ b/test/Transforms/LoopVectorize/conditional-assignment.ll

@@ -0,0 +1,58 @@
+; RUN: opt < %s -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+
+; CHECK: remark: source.c:2:8: loop not vectorized: store that is conditionally executed prevents vectorization
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @conditional_store(i32* noalias nocapture %indices) #0 {
+entry:
+  br label %for.body, !dbg !10
+
+for.body:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i32* %indices, i64 %indvars.iv, !dbg !12
+  %0 = load i32* %arrayidx, align 4, !dbg !12, !tbaa !14
+  %cmp1 = icmp eq i32 %0, 1024, !dbg !12
+  br i1 %cmp1, label %if.then, label %for.inc, !dbg !12
+
+if.then:                                          ; preds = %for.body
+  store i32 0, i32* %arrayidx, align 4, !dbg !18, !tbaa !14
+  br label %for.inc, !dbg !18
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096, !dbg !10
+  br i1 %exitcond, label %for.end, label %for.body, !dbg !10
+
+for.end:                                          ; preds = %for.inc
+  ret void, !dbg !19
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.6.0\001\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"source.c", metadata !"."}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00conditional_store\00conditional_store\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*)* @conditional_store, null, null, metadata !2} ; [ DW_TAG_subprogram ]
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!9 = metadata !{metadata !"clang version 3.6.0"}
+!10 = metadata !{i32 2, i32 8, metadata !11, null}
+!11 = metadata !{metadata !"0xb\002\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{i32 3, i32 9, metadata !13, null}
+!13 = metadata !{metadata !"0xb\003\009\000", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !15, metadata !15, i64 0}
+!15 = metadata !{metadata !"int", metadata !16, i64 0}
+!16 = metadata !{metadata !"omnipotent char", metadata !17, i64 0}
+!17 = metadata !{metadata !"Simple C/C++ TBAA"}
+!18 = metadata !{i32 3, i32 29, metadata !13, null}
+!19 = metadata !{i32 4, i32 1, metadata !4, null}

diff --git a/test/Transforms/LoopVectorize/control-flow.ll b/test/Transforms/LoopVectorize/control-flow.ll
index e4ba77f..452b7ae 100644
--- a/test/Transforms/LoopVectorize/control-flow.ll
+++ b/test/Transforms/LoopVectorize/control-flow.ll

@@ -11,7 +11,7 @@
 ; }
 
 ; CHECK: remark: source.cpp:5:9: loop not vectorized: loop control flow is not understood by vectorizer
-; CHECK: remark: source.cpp:5:9: loop not vectorized: vectorization was not specified
+; CHECK: remark: source.cpp:5:9: loop not vectorized: use -Rpass-analysis=loop-vectorize for more info
 
 ; CHECK: _Z4testPii
 ; CHECK-NOT: x i32>
@@ -55,21 +55,21 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2}
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0\001\00\006\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"source.cpp", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test", metadata !"test", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32*, i32)* @_Z4testPii, null, null, metadata !2, i32 2}
-!5 = metadata !{i32 786473, metadata !1}
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null}
+!4 = metadata !{metadata !"0x2e\00test\00test\00\001\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, i32 (i32*, i32)* @_Z4testPii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [test]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./source.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5.0"}
 !10 = metadata !{i32 3, i32 8, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 3, i32 0, i32 0}
+!11 = metadata !{metadata !"0xb\003\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{i32 5, i32 9, metadata !13, null}
-!13 = metadata !{i32 786443, metadata !1, metadata !14, i32 5, i32 9, i32 0, i32 2}
-!14 = metadata !{i32 786443, metadata !1, metadata !11, i32 4, i32 3, i32 0, i32 1}
+!13 = metadata !{metadata !"0xb\005\009\000", metadata !1, metadata !14} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\004\003\000", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{metadata !16, metadata !16, i64 0}
 !16 = metadata !{metadata !"int", metadata !17, i64 0}
 !17 = metadata !{metadata !"omnipotent char", metadata !18, i64 0}

diff --git a/test/Transforms/LoopVectorize/cpp-new-array.ll b/test/Transforms/LoopVectorize/cpp-new-array.ll
index c8215a1..f32f610 100644
--- a/test/Transforms/LoopVectorize/cpp-new-array.ll
+++ b/test/Transforms/LoopVectorize/cpp-new-array.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/dbg.value.ll b/test/Transforms/LoopVectorize/dbg.value.ll
index 2497b25..91d07d4 100644
--- a/test/Transforms/LoopVectorize/dbg.value.ll
+++ b/test/Transforms/LoopVectorize/dbg.value.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine | FileCheck %s
+; RUN: opt < %s -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine | FileCheck %s
 ; Make sure we vectorize with debugging turned on.
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -11,7 +11,7 @@
 ; CHECK-LABEL: @test(
 define i32 @test() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !9), !dbg !18
+  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !9, metadata !{}), !dbg !18
   br label %for.body, !dbg !18
 
 for.body:
@@ -25,7 +25,7 @@
   %arrayidx4 = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv, !dbg !19
   store i32 %add, i32* %arrayidx4, align 4, !dbg !19
   %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !18
-  tail call void @llvm.dbg.value(metadata !{null}, i64 0, metadata !9), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{null}, i64 0, metadata !9, metadata !{}), !dbg !18
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !18
   %exitcond = icmp ne i32 %lftr.wideiv, 1024, !dbg !18
   br i1 %exitcond, label %for.body, label %for.end, !dbg !18
@@ -34,9 +34,9 @@
   ret i32 0, !dbg !24
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "relocation-model"="pic" "ssp-buffers-size"="8" }
 attributes #1 = { nounwind readnone }
@@ -44,27 +44,27 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!26}
 
-!0 = metadata !{i32 786449, metadata !25, i32 4, metadata !"clang", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !11, null, metadata !""}
+!0 = metadata !{metadata !"0x11\004\00clang\001\00\000\00\000", metadata !25, metadata !1, metadata !1, metadata !2, metadata !11, null} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !2 = metadata !{metadata !3}
-!3 = metadata !{i32 786478, metadata !25, metadata !4, metadata !"test", metadata !"test", metadata !"test", i32 5, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @test, null, null, metadata !8, i32 5}
-!4 = metadata !{i32 786473, metadata !25}
-!5 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!3 = metadata !{metadata !"0x2e\00test\00test\00test\005\000\001\000\006\00256\001\005", metadata !25, metadata !4, metadata !5, null, i32 ()* @test, null, null, metadata !8} ; [ DW_TAG_subprogram ]
+!4 = metadata !{metadata !"0x29", metadata !25} ; [ DW_TAG_file_type ]
+!5 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !6, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
-!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786688, metadata !10, metadata !"i", metadata !4, i32 6, metadata !7, i32 0, i32 0}
-!10 = metadata !{i32 786443, metadata !25, metadata !3, i32 6, i32 0, i32 0}
+!9 = metadata !{metadata !"0x100\00i\006\000", metadata !10, metadata !4, metadata !7} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{metadata !"0xb\006\000\000", metadata !25, metadata !3} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{metadata !12, metadata !16, metadata !17}
-!12 = metadata !{i32 786484, i32 0, null, metadata !"A", metadata !"A", metadata !"", metadata !4, i32 1, metadata !13, i32 0, i32 1, [1024 x i32]* @A, null}
-!13 = metadata !{i32 786433, null, null, null, i32 0, i64 32768, i64 32, i32 0, i32 0, metadata !7, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32768, align 32, offset 0] [from int]
+!12 = metadata !{metadata !"0x34\00A\00A\00\001\000\001", null, metadata !4, metadata !13, [1024 x i32]* @A, null} ; [ DW_TAG_variable ]
+!13 = metadata !{metadata !"0x1\00\000\0032768\0032\000\000", null, null, metadata !7, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32768, align 32, offset 0] [from int]
 !14 = metadata !{metadata !15}
 !15 = metadata !{i32 786465, i64 0, i64 1024}
-!16 = metadata !{i32 786484, i32 0, null, metadata !"B", metadata !"B", metadata !"", metadata !4, i32 2, metadata !13, i32 0, i32 1, [1024 x i32]* @B, null}
-!17 = metadata !{i32 786484, i32 0, null, metadata !"C", metadata !"C", metadata !"", metadata !4, i32 3, metadata !13, i32 0, i32 1, [1024 x i32]* @C, null} 
+!16 = metadata !{metadata !"0x34\00B\00B\00\002\000\001", null, metadata !4, metadata !13, [1024 x i32]* @B, null} ; [ DW_TAG_variable ]
+!17 = metadata !{metadata !"0x34\00C\00C\00\003\000\001", null, metadata !4, metadata !13, [1024 x i32]* @C, null} ; [ DW_TAG_variable ]
 !18 = metadata !{i32 6, i32 0, metadata !10, null}
 !19 = metadata !{i32 7, i32 0, metadata !20, null}
-!20 = metadata !{i32 786443, metadata !25, metadata !10, i32 6, i32 0, i32 1}
+!20 = metadata !{metadata !"0xb\006\000\001", metadata !25, metadata !10} ; [ DW_TAG_lexical_block ]
 !24 = metadata !{i32 9, i32 0, metadata !3, null}
 !25 = metadata !{metadata !"test", metadata !"/path/to/somewhere"}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/LoopVectorize/debugloc.ll b/test/Transforms/LoopVectorize/debugloc.ll
index bf0b418..6350296 100644
--- a/test/Transforms/LoopVectorize/debugloc.ll
+++ b/test/Transforms/LoopVectorize/debugloc.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 | FileCheck %s
+; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -19,10 +19,10 @@
 
 define i32 @f(i32* nocapture %a, i32 %size) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13), !dbg !19
-  tail call void @llvm.dbg.value(metadata !{i32 %size}, i64 0, metadata !14), !dbg !19
-  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !15), !dbg !20
-  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !16), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13, metadata !{}), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{i32 %size}, i64 0, metadata !14, metadata !{}), !dbg !19
+  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !15, metadata !{}), !dbg !20
+  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !16, metadata !{}), !dbg !21
   %cmp4 = icmp eq i32 %size, 0, !dbg !21
   br i1 %cmp4, label %for.end, label %for.body.lr.ph, !dbg !21
 
@@ -35,9 +35,9 @@
   %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv, !dbg !22
   %0 = load i32* %arrayidx, align 4, !dbg !22
   %add = add i32 %0, %sum.05, !dbg !22
-  tail call void @llvm.dbg.value(metadata !{i32 %add.lcssa}, i64 0, metadata !15), !dbg !22
+  tail call void @llvm.dbg.value(metadata !{i32 %add.lcssa}, i64 0, metadata !15, metadata !{}), !dbg !22
   %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !21
-  tail call void @llvm.dbg.value(metadata !{null}, i64 0, metadata !16), !dbg !21
+  tail call void @llvm.dbg.value(metadata !{null}, i64 0, metadata !16, metadata !{}), !dbg !21
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21
   %exitcond = icmp ne i32 %lftr.wideiv, %size, !dbg !21
   br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge, !dbg !21
@@ -52,10 +52,10 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata) #1
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind readonly ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -63,28 +63,28 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18, !27}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 185038) (llvm/trunk 185097)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Data/backedup/dev/os/llvm/debug/-] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 185038) (llvm/trunk 185097)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Volumes/Data/backedup/dev/os/llvm/debug/-] [DW_LANG_C99]
 !1 = metadata !{metadata !"-", metadata !"/Volumes/Data/backedup/dev/os/llvm/debug"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32*, i32)* @f, null, null, metadata !12, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!4 = metadata !{metadata !"0x2e\00f\00f\00\003\000\001\000\006\00256\001\003", metadata !5, metadata !6, metadata !7, null, i32 (i32*, i32)* @f, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
 !5 = metadata !{metadata !"<stdin>", metadata !"/Volumes/Data/backedup/dev/os/llvm/debug"}
-!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/backedup/dev/os/llvm/debug/<stdin>]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{metadata !"0x29", metadata !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/backedup/dev/os/llvm/debug/<stdin>]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10, metadata !11}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!11 = metadata !{i32 786468, null, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!11 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
 !12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16}
-!13 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !6, i32 16777219, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 3]
-!14 = metadata !{i32 786689, metadata !4, metadata !"size", metadata !6, i32 33554435, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [size] [line 3]
-!15 = metadata !{i32 786688, metadata !4, metadata !"sum", metadata !6, i32 4, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [sum] [line 4]
-!16 = metadata !{i32 786688, metadata !17, metadata !"i", metadata !6, i32 5, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 5]
-!17 = metadata !{i32 786443, metadata !5, metadata !4, i32 5, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Volumes/Data/backedup/dev/os/llvm/debug/<stdin>]
+!13 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !4, metadata !6, metadata !10} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!14 = metadata !{metadata !"0x101\00size\0033554435\000", metadata !4, metadata !6, metadata !11} ; [ DW_TAG_arg_variable ] [size] [line 3]
+!15 = metadata !{metadata !"0x100\00sum\004\000", metadata !4, metadata !6, metadata !11} ; [ DW_TAG_auto_variable ] [sum] [line 4]
+!16 = metadata !{metadata !"0x100\00i\005\000", metadata !17, metadata !6, metadata !11} ; [ DW_TAG_auto_variable ] [i] [line 5]
+!17 = metadata !{metadata !"0xb\005\000\000", metadata !5, metadata !4} ; [ DW_TAG_lexical_block ] [/Volumes/Data/backedup/dev/os/llvm/debug/<stdin>]
 !18 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !19 = metadata !{i32 3, i32 0, metadata !4, null}
 !20 = metadata !{i32 4, i32 0, metadata !4, null}
 !21 = metadata !{i32 5, i32 0, metadata !17, null}
 !22 = metadata !{i32 6, i32 0, metadata !17, null}
 !26 = metadata !{i32 7, i32 0, metadata !4, null}
-!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/LoopVectorize/duplicated-metadata.ll b/test/Transforms/LoopVectorize/duplicated-metadata.ll
new file mode 100644
index 0000000..8353dca
--- /dev/null
+++ b/test/Transforms/LoopVectorize/duplicated-metadata.ll

@@ -0,0 +1,30 @@
+; RUN: opt < %s -loop-vectorize -S 2>&1 | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test makes sure we don't duplicate the loop vectorizer's metadata
+; while marking loops as already vectorized (by setting width = 1), even
+; at lower optimization levels, where no extra cleanup is done.
+
+define void @_Z3fooPf(float* %a) { ; doubles each of the 1024 floats at %a in place
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds float* %a, i64 %indvars.iv
+  %p = load float* %arrayidx, align 4
+  %mul = fmul float %p, 2.000000e+00
+  store float %mul, float* %arrayidx, align 4 ; written back to the slot it was loaded from
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 ; !0 carries the vectorize-width hint defined below
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+!0 = metadata !{metadata !0, metadata !1}
+!1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 4}
+; CHECK-NOT: !{metadata !"llvm.loop.vectorize.width", i32 4}
+; CHECK: !{metadata !"llvm.loop.interleave.count", i32 1}

diff --git a/test/Transforms/LoopVectorize/ee-crash.ll b/test/Transforms/LoopVectorize/ee-crash.ll
index 8a4f8ce..a3c0bb8 100644
--- a/test/Transforms/LoopVectorize/ee-crash.ll
+++ b/test/Transforms/LoopVectorize/ee-crash.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/exact.ll b/test/Transforms/LoopVectorize/exact.ll
new file mode 100644
index 0000000..0a8fbf3
--- /dev/null
+++ b/test/Transforms/LoopVectorize/exact.ll

@@ -0,0 +1,24 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; CHECK-LABEL: @lshr_exact(
+; CHECK: lshr exact <4 x i32>
+define void @lshr_exact(i32* %x) { ; shifts each of the 256 i32 elements at %x right by one bit, in place
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ; element index, starts at 0
+  %arrayidx = getelementptr inbounds i32* %x, i64 %iv
+  %0 = load i32* %arrayidx, align 4
+  %conv1 = lshr exact i32 %0, 1 ; the 'exact' flag is what the vector form above is expected to keep
+  store i32 %conv1, i32* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

diff --git a/test/Transforms/LoopVectorize/flags.ll b/test/Transforms/LoopVectorize/flags.ll
index 21d0937..0fc55c8 100644
--- a/test/Transforms/LoopVectorize/flags.ll
+++ b/test/Transforms/LoopVectorize/flags.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/float-reduction.ll b/test/Transforms/LoopVectorize/float-reduction.ll
index 0dfbab0..0f064ee 100644
--- a/test/Transforms/LoopVectorize/float-reduction.ll
+++ b/test/Transforms/LoopVectorize/float-reduction.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -23,3 +23,25 @@
 for.end:                                          ; preds = %for.body
   ret float %add
 }
+
+;CHECK-LABEL: @foosub(
+;CHECK: fsub fast <4 x float>
+;CHECK: ret
+define float @foosub(float* nocapture %A, i32* nocapture %n) nounwind uwtable readonly ssp { ; subtracts the first 200 floats of %A from 0.0; %n is not read
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.04 = phi float [ 0.000000e+00, %entry ], [ %sub, %for.body ] ; accumulator, seeded with 0.0
+  %arrayidx = getelementptr inbounds float* %A, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %sub = fsub fast float %sum.04, %0 ; fast-math subtraction feeding back into the accumulator
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 200
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret float %sub
+}

diff --git a/test/Transforms/LoopVectorize/funcall.ll b/test/Transforms/LoopVectorize/funcall.ll
index f1f068c..e03534f 100644
--- a/test/Transforms/LoopVectorize/funcall.ll
+++ b/test/Transforms/LoopVectorize/funcall.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll
index d8959d4..6c8af0b 100644
--- a/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll

@@ -1,5 +1,5 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine -S | FileCheck %s
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/global_alias.ll b/test/Transforms/LoopVectorize/global_alias.ll
index d64d67f..3f11ce8 100644
--- a/test/Transforms/LoopVectorize/global_alias.ll
+++ b/test/Transforms/LoopVectorize/global_alias.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -O1 -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -O1 -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
 

diff --git a/test/Transforms/LoopVectorize/hoist-loads.ll b/test/Transforms/LoopVectorize/hoist-loads.ll
index 765e14d..d0b27f1 100644
--- a/test/Transforms/LoopVectorize/hoist-loads.ll
+++ b/test/Transforms/LoopVectorize/hoist-loads.ll

@@ -1,4 +1,4 @@
-; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S < %s | FileCheck %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/i8-induction.ll b/test/Transforms/LoopVectorize/i8-induction.ll
index 2a0e826..90e3ec0 100644
--- a/test/Transforms/LoopVectorize/i8-induction.ll
+++ b/test/Transforms/LoopVectorize/i8-induction.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/if-conv-crash.ll b/test/Transforms/LoopVectorize/if-conv-crash.ll
index f8f2cf1..67910bf 100644
--- a/test/Transforms/LoopVectorize/if-conv-crash.ll
+++ b/test/Transforms/LoopVectorize/if-conv-crash.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/if-conversion-nest.ll b/test/Transforms/LoopVectorize/if-conversion-nest.ll
index 92cb06e..b5ac8fc 100644
--- a/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/test/Transforms/LoopVectorize/if-conversion-nest.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/if-conversion-reduction.ll b/test/Transforms/LoopVectorize/if-conversion-reduction.ll
index 8cb703c..455699c 100644
--- a/test/Transforms/LoopVectorize/if-conversion-reduction.ll
+++ b/test/Transforms/LoopVectorize/if-conversion-reduction.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"

diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll
index 6e3e8ed..9e18528 100644
--- a/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/test/Transforms/LoopVectorize/if-conversion.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"

diff --git a/test/Transforms/LoopVectorize/if-pred-stores.ll b/test/Transforms/LoopVectorize/if-pred-stores.ll
index 7b0e181..c6067e0 100644
--- a/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/test/Transforms/LoopVectorize/if-pred-stores.ll

@@ -1,5 +1,5 @@
-; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-unroll=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL
-; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-unroll=1 -loop-vectorize -enable-cond-stores-vec < %s | FileCheck %s --check-prefix=VEC
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec < %s | FileCheck %s --check-prefix=VEC
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
 

diff --git a/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/test/Transforms/LoopVectorize/incorrect-dom-info.ll
new file mode 100644
index 0000000..624ee7e
--- /dev/null
+++ b/test/Transforms/LoopVectorize/incorrect-dom-info.ll

@@ -0,0 +1,142 @@
+; This test is based on one of the benchmarks from SPEC2006. It exposes a bug
+; with incorrect updating of the dom-tree.
+; RUN: opt < %s  -loop-vectorize -verify-dom-info
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@PL_utf8skip = external constant [0 x i8]
+
+; Function Attrs: nounwind ssp uwtable
+define void @Perl_pp_quotemeta() #0 { ; most conditional branches in this function test undef
+  %len = alloca i64, align 8
+  br i1 undef, label %2, label %1
+
+; <label>:1                                       ; preds = %0
+  br label %3
+
+; <label>:2                                       ; preds = %0
+  br label %3
+
+; <label>:3                                       ; preds = %2, %1
+  br i1 undef, label %34, label %4
+
+; <label>:4                                       ; preds = %3
+  br i1 undef, label %5, label %6
+
+; <label>:5                                       ; preds = %4
+  br label %6
+
+; <label>:6                                       ; preds = %5, %4
+  br i1 undef, label %7, label %8
+
+; <label>:7                                       ; preds = %6
+  br label %8
+
+; <label>:8                                       ; preds = %7, %6
+  br i1 undef, label %.preheader, label %9
+
+.preheader:                                       ; preds = %9, %8
+  br i1 undef, label %.loopexit, label %.lr.ph
+
+; <label>:9                                       ; preds = %8
+  br i1 undef, label %thread-pre-split.preheader, label %.preheader
+
+thread-pre-split.preheader:                       ; preds = %9
+  br i1 undef, label %thread-pre-split._crit_edge, label %.lr.ph21
+
+.thread-pre-split.loopexit_crit_edge:             ; preds = %19
+  %scevgep.sum = xor i64 %umax, -1
+  %scevgep45 = getelementptr i8* %d.020, i64 %scevgep.sum
+  br label %thread-pre-split.loopexit
+
+thread-pre-split.loopexit:                        ; preds = %11, %.thread-pre-split.loopexit_crit_edge
+  %d.1.lcssa = phi i8* [ %scevgep45, %.thread-pre-split.loopexit_crit_edge ], [ %d.020, %11 ]
+  br i1 false, label %thread-pre-split._crit_edge, label %.lr.ph21 ; constant-false condition: always continues to %.lr.ph21
+
+.lr.ph21:                                         ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader
+  %d.020 = phi i8* [ undef, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
+  %10 = phi i64 [ %28, %26 ], [ undef, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
+  br i1 undef, label %11, label %22
+
+; <label>:11                                      ; preds = %.lr.ph21
+  %12 = getelementptr inbounds [0 x i8]* @PL_utf8skip, i64 0, i64 undef
+  %13 = load i8* %12, align 1
+  %14 = zext i8 %13 to i64
+  %15 = icmp ugt i64 %14, %10
+  %. = select i1 %15, i64 %10, i64 %14
+  br i1 undef, label %thread-pre-split.loopexit, label %.lr.ph28
+
+.lr.ph28:                                         ; preds = %11
+  %16 = xor i64 %10, -1
+  %17 = xor i64 %14, -1
+  %18 = icmp ugt i64 %16, %17
+  %umax = select i1 %18, i64 %16, i64 %17
+  br label %19
+
+; <label>:19                                      ; preds = %19, %.lr.ph28
+  %ulen.126 = phi i64 [ %., %.lr.ph28 ], [ %20, %19 ]
+  %20 = add i64 %ulen.126, -1 ; count down by one per iteration
+  %21 = icmp eq i64 %20, 0
+  br i1 %21, label %.thread-pre-split.loopexit_crit_edge, label %19 ; leave the loop once the counter reaches zero
+
+; <label>:22                                      ; preds = %.lr.ph21
+  br i1 undef, label %26, label %23
+
+; <label>:23                                      ; preds = %22
+  br i1 undef, label %26, label %24
+
+; <label>:24                                      ; preds = %23
+  br i1 undef, label %26, label %25
+
+; <label>:25                                      ; preds = %24
+  br label %26
+
+; <label>:26                                      ; preds = %25, %24, %23, %22
+  %27 = load i64* %len, align 8
+  %28 = add i64 %27, -1
+  br i1 undef, label %thread-pre-split._crit_edge, label %.lr.ph21
+
+thread-pre-split._crit_edge:                      ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader
+  br label %.loopexit
+
+.lr.ph:                                           ; preds = %33, %.preheader
+  br i1 undef, label %29, label %thread-pre-split5
+
+; <label>:29                                      ; preds = %.lr.ph
+  br i1 undef, label %33, label %30
+
+; <label>:30                                      ; preds = %29
+  br i1 undef, label %33, label %31
+
+thread-pre-split5:                                ; preds = %.lr.ph
+  br i1 undef, label %33, label %31
+
+; <label>:31                                      ; preds = %thread-pre-split5, %30
+  br i1 undef, label %33, label %32
+
+; <label>:32                                      ; preds = %31
+  br label %33
+
+; <label>:33                                      ; preds = %32, %31, %thread-pre-split5, %30, %29
+  br i1 undef, label %.loopexit, label %.lr.ph
+
+.loopexit:                                        ; preds = %33, %thread-pre-split._crit_edge, %.preheader
+  br label %35
+
+; <label>:34                                      ; preds = %3
+  br label %35
+
+; <label>:35                                      ; preds = %34, %.loopexit
+  br i1 undef, label %37, label %36
+
+; <label>:36                                      ; preds = %35
+  br label %37
+
+; <label>:37                                      ; preds = %36, %35
+  ret void
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.6.0 "}

diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll
index 71bedb7..067a76b 100644
--- a/test/Transforms/LoopVectorize/increment.ll
+++ b/test/Transforms/LoopVectorize/increment.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll
index 7dabcb2..3f34918 100644
--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -28,7 +28,7 @@
   ret void
 }
 
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
 
 ; Make sure we remove unneeded vectorization of induction variables.
 ; In order for instcombine to cleanup the vectorized induction variables that we

diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll
index 9c8201a..ce64c5b 100644
--- a/test/Transforms/LoopVectorize/induction_plus.ll
+++ b/test/Transforms/LoopVectorize/induction_plus.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/intrinsic.ll b/test/Transforms/LoopVectorize/intrinsic.ll
index 7dfaf03..d48731a 100644
--- a/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/test/Transforms/LoopVectorize/intrinsic.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -1192,3 +1192,59 @@
 for.end:                                          ; preds = %for.body, %entry
   ret void
 }
+
+declare float @llvm.minnum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @minnum_f32(
+;CHECK: llvm.minnum.v4f32
+;CHECK: ret void
+define void @minnum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable { ; x[i] = minnum(y[i], z[i]) for i in [0, n)
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end ; skip the loop entirely when n <= 0
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float* %y, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float* %z, i64 %indvars.iv
+  %1 = load float* %arrayidx2, align 4
+  %call = tail call float @llvm.minnum.f32(float %0, float %1) nounwind readnone ; scalar intrinsic call the vectorizer is expected to widen
+  %arrayidx4 = getelementptr inbounds float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.maxnum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @maxnum_f32(
+;CHECK: llvm.maxnum.v4f32
+;CHECK: ret void
+define void @maxnum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable { ; x[i] = maxnum(y[i], z[i]) for i in [0, n)
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end ; skip the loop entirely when n <= 0
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float* %y, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float* %z, i64 %indvars.iv
+  %1 = load float* %arrayidx2, align 4
+  %call = tail call float @llvm.maxnum.f32(float %0, float %1) nounwind readnone ; scalar intrinsic call the vectorizer is expected to widen
+  %arrayidx4 = getelementptr inbounds float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}

diff --git a/test/Transforms/LoopVectorize/lcssa-crash.ll b/test/Transforms/LoopVectorize/lcssa-crash.ll
index de6be54..68cc74e 100644
--- a/test/Transforms/LoopVectorize/lcssa-crash.ll
+++ b/test/Transforms/LoopVectorize/lcssa-crash.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/test/Transforms/LoopVectorize/lifetime.ll b/test/Transforms/LoopVectorize/lifetime.ll
index 4f6f3b8..ba36cc4 100644
--- a/test/Transforms/LoopVectorize/lifetime.ll
+++ b/test/Transforms/LoopVectorize/lifetime.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/memdep.ll b/test/Transforms/LoopVectorize/memdep.ll
index 21cb703..f857e80 100644
--- a/test/Transforms/LoopVectorize/memdep.ll
+++ b/test/Transforms/LoopVectorize/memdep.ll

@@ -1,5 +1,5 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -S | FileCheck %s -check-prefix=WIDTH
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s -check-prefix=WIDTH
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/metadata-unroll.ll b/test/Transforms/LoopVectorize/metadata-unroll.ll
index 2fcc53a..848f1f9 100644
--- a/test/Transforms/LoopVectorize/metadata-unroll.ll
+++ b/test/Transforms/LoopVectorize/metadata-unroll.ll

@@ -38,4 +38,4 @@
 }
 
 !0 = metadata !{metadata !0, metadata !1}
-!1 = metadata !{metadata !"llvm.loop.vectorize.unroll", i32 2}
+!1 = metadata !{metadata !"llvm.loop.interleave.count", i32 2}

diff --git a/test/Transforms/LoopVectorize/metadata-width.ll b/test/Transforms/LoopVectorize/metadata-width.ll
index 87de655..da0c622 100644
--- a/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/test/Transforms/LoopVectorize/metadata-width.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/test/Transforms/LoopVectorize/metadata.ll b/test/Transforms/LoopVectorize/metadata.ll
new file mode 100644
index 0000000..14f60b3
--- /dev/null
+++ b/test/Transforms/LoopVectorize/metadata.ll

@@ -0,0 +1,44 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4, !tbaa !0
+  %conv = fptosi float %0 to i32
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  store i32 %conv, i32* %arrayidx2, align 4, !tbaa !4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+; CHECK-LABEL: @test1
+; CHECK: load <4 x float>* %{{.*}}, align 4, !tbaa ![[TFLT:[0-9]+]]
+; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa ![[TINT:[0-9]+]]
+; CHECK: ret i32 0
+
+; CHECK-DAG: ![[TFLT]] = metadata !{metadata ![[TFLT1:[0-9]+]]
+; CHECK-DAG: ![[TFLT1]] = metadata !{metadata !"float"
+
+; CHECK-DAG: ![[TINT]] = metadata !{metadata ![[TINT1:[0-9]+]]
+; CHECK-DAG: ![[TINT1]] = metadata !{metadata !"int"
+
+attributes #0 = { nounwind uwtable }
+
+!0 = metadata !{metadata !1, metadata !1, i64 0}
+!1 = metadata !{metadata !"float", metadata !2, i64 0}
+!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
+!3 = metadata !{metadata !"Simple C/C++ TBAA"}
+!4 = metadata !{metadata !5, metadata !5, i64 0}
+!5 = metadata !{metadata !"int", metadata !2, i64 0}
+

diff --git a/test/Transforms/LoopVectorize/minmax_reduction.ll b/test/Transforms/LoopVectorize/minmax_reduction.ll
index 0e47260..e73e69d 100644
--- a/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/test/Transforms/LoopVectorize/minmax_reduction.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-unroll=1  < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-interleave=1  < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll b/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
index 88a29c5..cd022ad 100644
--- a/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
+++ b/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll

@@ -1,4 +1,4 @@
-; RUN: opt -indvars -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S < %s | FileCheck %s
+; RUN: opt -indvars -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"

diff --git a/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/test/Transforms/LoopVectorize/multiple-address-spaces.ll
index 7d836de..bb2af1e 100644
--- a/test/Transforms/LoopVectorize/multiple-address-spaces.ll
+++ b/test/Transforms/LoopVectorize/multiple-address-spaces.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 ; From a simple program with two address spaces:
 ; char Y[4*10000] __attribute__((address_space(1)));

diff --git a/test/Transforms/LoopVectorize/no_array_bounds.ll b/test/Transforms/LoopVectorize/no_array_bounds.ll
new file mode 100644
index 0000000..a39b44f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/no_array_bounds.ll

@@ -0,0 +1,101 @@
+; RUN: opt < %s -loop-vectorize -S 2>&1 | FileCheck %s
+
+; Verify that a warning is generated when vectorization/interleaving is explicitly specified and fails to occur.
+; CHECK: warning: no_array_bounds.cpp:5:5: loop not vectorized: failed explicitly specified loop vectorization
+; CHECK: warning: no_array_bounds.cpp:10:5: loop not interleaved: failed explicitly specified loop interleaving
+
+;  #pragma clang loop vectorize(enable)
+;  for (int i = 0; i < number; i++) {
+;    A[B[i]]++;
+;  }
+
+;  #pragma clang loop vectorize(disable) interleave(enable)
+;  for (int i = 0; i < number; i++) {
+;    B[A[i]]++;
+;  }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z4testPiS_i(i32* nocapture %A, i32* nocapture %B, i32 %number) #0 {
+entry:
+  %cmp25 = icmp sgt i32 %number, 0, !dbg !10
+  br i1 %cmp25, label %for.body.preheader, label %for.end15, !dbg !10, !llvm.loop !12
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !14
+
+for.cond5.preheader:                              ; preds = %for.body
+  br i1 %cmp25, label %for.body7.preheader, label %for.end15, !dbg !16, !llvm.loop !18
+
+for.body7.preheader:                              ; preds = %for.cond5.preheader
+  br label %for.body7, !dbg !20
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv27 = phi i64 [ %indvars.iv.next28, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv27, !dbg !14
+  %0 = load i32* %arrayidx, align 4, !dbg !14, !tbaa !22
+  %idxprom1 = sext i32 %0 to i64, !dbg !14
+  %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1, !dbg !14
+  %1 = load i32* %arrayidx2, align 4, !dbg !14, !tbaa !22
+  %inc = add nsw i32 %1, 1, !dbg !14
+  store i32 %inc, i32* %arrayidx2, align 4, !dbg !14, !tbaa !22
+  %indvars.iv.next28 = add nuw nsw i64 %indvars.iv27, 1, !dbg !10
+  %lftr.wideiv29 = trunc i64 %indvars.iv.next28 to i32, !dbg !10
+  %exitcond30 = icmp eq i32 %lftr.wideiv29, %number, !dbg !10
+  br i1 %exitcond30, label %for.cond5.preheader, label %for.body, !dbg !10, !llvm.loop !12
+
+for.body7:                                        ; preds = %for.body7.preheader, %for.body7
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body7 ], [ 0, %for.body7.preheader ]
+  %arrayidx9 = getelementptr inbounds i32* %A, i64 %indvars.iv, !dbg !20
+  %2 = load i32* %arrayidx9, align 4, !dbg !20, !tbaa !22
+  %idxprom10 = sext i32 %2 to i64, !dbg !20
+  %arrayidx11 = getelementptr inbounds i32* %B, i64 %idxprom10, !dbg !20
+  %3 = load i32* %arrayidx11, align 4, !dbg !20, !tbaa !22
+  %inc12 = add nsw i32 %3, 1, !dbg !20
+  store i32 %inc12, i32* %arrayidx11, align 4, !dbg !20, !tbaa !22
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !16
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !16
+  %exitcond = icmp eq i32 %lftr.wideiv, %number, !dbg !16
+  br i1 %exitcond, label %for.end15.loopexit, label %for.body7, !dbg !16, !llvm.loop !18
+
+for.end15.loopexit:                               ; preds = %for.body7
+  br label %for.end15
+
+for.end15:                                        ; preds = %for.end15.loopexit, %entry, %for.cond5.preheader
+  ret void, !dbg !26
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0\001\00\000\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"no_array_bounds.cpp", metadata !"."}
+!2 = metadata !{}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !"0x2e\00test\00test\00\001\000\001\000\006\00256\001\002", metadata !1, metadata !5, metadata !6, null, void (i32*, i32*, i32)* @_Z4testPiS_i, null, null, metadata !2} ; [ DW_TAG_subprogram ]
+!5 = metadata !{metadata !"0x29", metadata !1} ; [ DW_TAG_file_type ]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!9 = metadata !{metadata !"clang version 3.5.0"}
+!10 = metadata !{i32 4, i32 8, metadata !11, null}
+!11 = metadata !{metadata !"0xb\004\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{metadata !12, metadata !13}
+!13 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
+!14 = metadata !{i32 5, i32 5, metadata !15, null}
+!15 = metadata !{metadata !"0xb\004\0036\000", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{i32 9, i32 8, metadata !17, null}
+!17 = metadata !{metadata !"0xb\009\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{metadata !18, metadata !13, metadata !19}
+!19 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
+!20 = metadata !{i32 10, i32 5, metadata !21, null}
+!21 = metadata !{metadata !"0xb\009\0036\000", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{metadata !23, metadata !23, i64 0}
+!23 = metadata !{metadata !"int", metadata !24, i64 0}
+!24 = metadata !{metadata !"omnipotent char", metadata !25, i64 0}
+!25 = metadata !{metadata !"Simple C/C++ TBAA"}
+!26 = metadata !{i32 12, i32 1, metadata !4, null}

diff --git a/test/Transforms/LoopVectorize/no_idiv_reduction.ll b/test/Transforms/LoopVectorize/no_idiv_reduction.ll
index 295fcab..5c721a680 100644
--- a/test/Transforms/LoopVectorize/no_idiv_reduction.ll
+++ b/test/Transforms/LoopVectorize/no_idiv_reduction.ll

@@ -1,4 +1,4 @@
-; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S < %s | FileCheck %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
 @a = common global [128 x i32] zeroinitializer, align 16
 
 ;; Must not vectorize division reduction. Division is lossy.

diff --git a/test/Transforms/LoopVectorize/no_int_induction.ll b/test/Transforms/LoopVectorize/no_int_induction.ll
index e572d1a..1275915 100644
--- a/test/Transforms/LoopVectorize/no_int_induction.ll
+++ b/test/Transforms/LoopVectorize/no_int_induction.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 ; int __attribute__((noinline)) sum_array(int *A, int n) {
 ;  return std::accumulate(A, A + n, 0);

diff --git a/test/Transforms/LoopVectorize/no_outside_user.ll b/test/Transforms/LoopVectorize/no_outside_user.ll
index 1f891ad..bcd29c1 100644
--- a/test/Transforms/LoopVectorize/no_outside_user.ll
+++ b/test/Transforms/LoopVectorize/no_outside_user.ll

@@ -1,4 +1,7 @@
-; RUN: opt -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -pass-remarks-analysis=loop-vectorize < %s 2>&1 | FileCheck %s
+
+; CHECK: remark: {{.*}}: loop not vectorized: value could not be identified as an induction or reduction variable
+; CHECK: remark: {{.*}}: loop not vectorized: use of induction value outside of the loop is not handled by vectorizer
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
 

diff --git a/test/Transforms/LoopVectorize/no_switch.ll b/test/Transforms/LoopVectorize/no_switch.ll
index 52b4285..c989c6b 100644
--- a/test/Transforms/LoopVectorize/no_switch.ll
+++ b/test/Transforms/LoopVectorize/no_switch.ll

@@ -1,7 +1,8 @@
 ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
 
 ; CHECK: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
-; CHECK: remark: source.cpp:4:5: loop not vectorized: vectorization is explicitly enabled with width 4
+; CHECK: remark: source.cpp:4:5: loop not vectorized: use -Rpass-analysis=loop-vectorize for more info (Force=true, Vector Width=4)
+; CHECK: warning: source.cpp:4:5: loop not vectorized: failed explicitly specified loop vectorization
 
 ; CHECK: _Z11test_switchPii
 ; CHECK-NOT: x i32>
@@ -58,28 +59,28 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5.0", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"", i32 2}
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5.0\001\00\006\00\002", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./source.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"source.cpp", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_switch", metadata !"test_switch", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32)* @_Z11test_switchPii, null, null, metadata !2, i32 1}
-!5 = metadata !{i32 786473, metadata !1}
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null}
+!4 = metadata !{metadata !"0x2e\00test_switch\00test_switch\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, void (i32*, i32)* @_Z11test_switchPii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [test_switch]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./source.cpp]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
-!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5.0"}
 !10 = metadata !{i32 3, i32 8, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 3, i32 0, i32 0}
+!11 = metadata !{metadata !"0xb\003\003\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{metadata !12, metadata !13, metadata !13}
 !13 = metadata !{metadata !"llvm.loop.vectorize.enable", i1 true}
 !14 = metadata !{i32 4, i32 5, metadata !15, null}
-!15 = metadata !{i32 786443, metadata !1, metadata !11, i32 3, i32 36, i32 0, i32 1}
+!15 = metadata !{metadata !"0xb\003\0036\000", metadata !1, metadata !11} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{metadata !17, metadata !17, i64 0}
 !17 = metadata !{metadata !"int", metadata !18, i64 0}
 !18 = metadata !{metadata !"omnipotent char", metadata !19, i64 0}
 !19 = metadata !{metadata !"Simple C/C++ TBAA"}
 !20 = metadata !{i32 6, i32 7, metadata !21, null}
-!21 = metadata !{i32 786443, metadata !1, metadata !15, i32 4, i32 18, i32 0, i32 2}
+!21 = metadata !{metadata !"0xb\004\0018\000", metadata !1, metadata !15} ; [ DW_TAG_lexical_block ]
 !22 = metadata !{i32 7, i32 5, metadata !21, null}
 !23 = metadata !{i32 9, i32 7, metadata !21, null}
 !24 = metadata !{i32 14, i32 1, metadata !4, null}

diff --git a/test/Transforms/LoopVectorize/nofloat.ll b/test/Transforms/LoopVectorize/nofloat.ll
index c3c81b6..e9f4c5f 100644
--- a/test/Transforms/LoopVectorize/nofloat.ll
+++ b/test/Transforms/LoopVectorize/nofloat.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 ; Make sure that we don't vectorize functions with 'noimplicitfloat' attributes.
 

diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll
index 0c54a2b..b03d4f0 100644
--- a/test/Transforms/LoopVectorize/non-const-n.ll
+++ b/test/Transforms/LoopVectorize/non-const-n.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/nsw-crash.ll b/test/Transforms/LoopVectorize/nsw-crash.ll
index e5fad14..68d9933 100644
--- a/test/Transforms/LoopVectorize/nsw-crash.ll
+++ b/test/Transforms/LoopVectorize/nsw-crash.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4
 
 target datalayout =
 "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

diff --git a/test/Transforms/LoopVectorize/opt.ll b/test/Transforms/LoopVectorize/opt.ll
index 27030a2..a9be80f 100644
--- a/test/Transforms/LoopVectorize/opt.ll
+++ b/test/Transforms/LoopVectorize/opt.ll

@@ -1,5 +1,5 @@
-; RUN: opt -S -O3 -force-vector-width=2 -force-vector-unroll=1 < %s | FileCheck --check-prefix=LOOPVEC %s
-; RUN: opt -S -O3 -disable-loop-vectorization -force-vector-width=2 -force-vector-unroll=1 < %s | FileCheck --check-prefix=NOLOOPVEC %s
+; RUN: opt -S -O3 -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck --check-prefix=LOOPVEC %s
+; RUN: opt -S -O3 -disable-loop-vectorization -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck --check-prefix=NOLOOPVEC %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/ptr_loops.ll b/test/Transforms/LoopVectorize/ptr_loops.ll
index 15983f0..3fb38fe 100644
--- a/test/Transforms/LoopVectorize/ptr_loops.ll
+++ b/test/Transforms/LoopVectorize/ptr_loops.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll
index fc8f0a5..2f7a96a 100644
--- a/test/Transforms/LoopVectorize/read-only.ll
+++ b/test/Transforms/LoopVectorize/read-only.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll
index 791fce1..5e6b7fa 100644
--- a/test/Transforms/LoopVectorize/reduction.ll
+++ b/test/Transforms/LoopVectorize/reduction.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/reverse_induction.ll b/test/Transforms/LoopVectorize/reverse_induction.ll
index 65ef95d..da02d01 100644
--- a/test/Transforms/LoopVectorize/reverse_induction.ll
+++ b/test/Transforms/LoopVectorize/reverse_induction.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=2 -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/reverse_iter.ll b/test/Transforms/LoopVectorize/reverse_iter.ll
index f803120..13172bb 100644
--- a/test/Transforms/LoopVectorize/reverse_iter.ll
+++ b/test/Transforms/LoopVectorize/reverse_iter.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/test/Transforms/LoopVectorize/runtime-check-address-space.ll
index 6c86561..34bbe52 100644
--- a/test/Transforms/LoopVectorize/runtime-check-address-space.ll
+++ b/test/Transforms/LoopVectorize/runtime-check-address-space.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
 
 ; Check vectorization that would ordinarily require a runtime bounds
 ; check on the pointers when mixing address spaces. For now we cannot

diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll b/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
index 212b37c..56f1f99 100644
--- a/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
+++ b/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
 
 ; Artificial datalayout
 target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"

diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/test/Transforms/LoopVectorize/runtime-check-readonly.ll
index 01e28bc..9d02a6a 100644
--- a/test/Transforms/LoopVectorize/runtime-check-readonly.ll
+++ b/test/Transforms/LoopVectorize/runtime-check-readonly.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -7,7 +7,7 @@
 ;CHECK: br
 ;CHECK: br
 ;CHECK: getelementptr
-;CHECK-NEXT: getelementptr
+;CHECK-DAG: getelementptr
 ;CHECK-DAG: icmp uge
 ;CHECK-DAG: icmp uge
 ;CHECK-DAG: icmp uge

diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll
index d15479d..1edafb4 100644
--- a/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/test/Transforms/LoopVectorize/runtime-check.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"

diff --git a/test/Transforms/LoopVectorize/runtime-limit.ll b/test/Transforms/LoopVectorize/runtime-limit.ll
index 7370a6f..324949d 100644
--- a/test/Transforms/LoopVectorize/runtime-limit.ll
+++ b/test/Transforms/LoopVectorize/runtime-limit.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/safegep.ll b/test/Transforms/LoopVectorize/safegep.ll
index c950860..f853afd 100644
--- a/test/Transforms/LoopVectorize/safegep.ll
+++ b/test/Transforms/LoopVectorize/safegep.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-unroll=1  < %s |  FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1  < %s |  FileCheck %s
 target datalayout = "e-p:32:32:32-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
 
 

diff --git a/test/Transforms/LoopVectorize/same-base-access.ll b/test/Transforms/LoopVectorize/same-base-access.ll
index d623a34..d19458f 100644
--- a/test/Transforms/LoopVectorize/same-base-access.ll
+++ b/test/Transforms/LoopVectorize/same-base-access.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"

diff --git a/test/Transforms/LoopVectorize/scalar-select.ll b/test/Transforms/LoopVectorize/scalar-select.ll
index 257c7be..6b37cc2 100644
--- a/test/Transforms/LoopVectorize/scalar-select.ll
+++ b/test/Transforms/LoopVectorize/scalar-select.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
index 683621a..1bce3f8 100644
--- a/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
+++ b/test/Transforms/LoopVectorize/scev-exitlim-crash.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=2 -force-vector-width=8 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=8 -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"

diff --git a/test/Transforms/LoopVectorize/simple-unroll.ll b/test/Transforms/LoopVectorize/simple-unroll.ll
index 83f35ff..8bf680a 100644
--- a/test/Transforms/LoopVectorize/simple-unroll.ll
+++ b/test/Transforms/LoopVectorize/simple-unroll.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/small-loop.ll b/test/Transforms/LoopVectorize/small-loop.ll
index 49ce5c5..1d30102 100644
--- a/test/Transforms/LoopVectorize/small-loop.ll
+++ b/test/Transforms/LoopVectorize/small-loop.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/start-non-zero.ll b/test/Transforms/LoopVectorize/start-non-zero.ll
index 8f675af..cc47494 100644
--- a/test/Transforms/LoopVectorize/start-non-zero.ll
+++ b/test/Transforms/LoopVectorize/start-non-zero.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/store-shuffle-bug.ll b/test/Transforms/LoopVectorize/store-shuffle-bug.ll
index e53c120..6d3d113 100644
--- a/test/Transforms/LoopVectorize/store-shuffle-bug.ll
+++ b/test/Transforms/LoopVectorize/store-shuffle-bug.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+; RUN: opt -S -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/struct_access.ll b/test/Transforms/LoopVectorize/struct_access.ll
index 75beae8..cf6f325 100644
--- a/test/Transforms/LoopVectorize/struct_access.ll
+++ b/test/Transforms/LoopVectorize/struct_access.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"

diff --git a/test/Transforms/LoopVectorize/tbaa-nodep.ll b/test/Transforms/LoopVectorize/tbaa-nodep.ll
new file mode 100644
index 0000000..5cd104c
--- /dev/null
+++ b/test/Transforms/LoopVectorize/tbaa-nodep.ll

@@ -0,0 +1,102 @@
+; RUN: opt < %s  -tbaa -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -simplifycfg -S | FileCheck %s
+; RUN: opt < %s  -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -simplifycfg -S | FileCheck %s --check-prefix=CHECK-NOTBAA
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4, !tbaa !0
+  %conv = fptosi float %0 to i32
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  store i32 %conv, i32* %arrayidx2, align 4, !tbaa !4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+
+; TBAA partitions the accesses in this loop, so it can be vectorized without
+; runtime checks.
+
+; CHECK-LABEL: @test1
+; CHECK: entry:
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+
+; CHECK: load <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
+
+; CHECK: ret i32 0
+
+; CHECK-NOTBAA-LABEL: @test1
+; CHECK-NOTBAA: icmp uge i32*
+
+; CHECK-NOTBAA: load <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
+
+; CHECK-NOTBAA: ret i32 0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test2(i32* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4, !tbaa !0
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %1 = load i32* %arrayidx2, align 4, !tbaa !4
+  %conv = sitofp i32 %1 to float
+  %mul = fmul float %0, %conv
+  %arrayidx4 = getelementptr inbounds float* %c, i64 %indvars.iv
+  store float %mul, float* %arrayidx4, align 4, !tbaa !0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+
+; This test is like the first, except here there is still one runtime check
+; required. Without TBAA, however, two checks are required.
+
+; CHECK-LABEL: @test2
+; CHECK: icmp uge float*
+; CHECK: icmp uge float*
+; CHECK-NOT: icmp uge i32*
+
+; CHECK: load <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
+
+; CHECK: ret i32 0
+
+; CHECK-NOTBAA-LABEL: @test2
+; CHECK-NOTBAA: icmp uge float*
+; CHECK-NOTBAA: icmp uge float*
+; CHECK-NOTBAA-DAG: icmp uge float*
+; CHECK-NOTBAA-DAG: icmp uge i32*
+
+; CHECK-NOTBAA: load <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
+
+; CHECK-NOTBAA: ret i32 0
+}
+
+attributes #0 = { nounwind uwtable }
+
+!0 = metadata !{metadata !1, metadata !1, i64 0}
+!1 = metadata !{metadata !"float", metadata !2, i64 0}
+!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
+!3 = metadata !{metadata !"Simple C/C++ TBAA"}
+!4 = metadata !{metadata !5, metadata !5, i64 0}
+!5 = metadata !{metadata !"int", metadata !2, i64 0}
+

diff --git a/test/Transforms/LoopVectorize/undef-inst-bug.ll b/test/Transforms/LoopVectorize/undef-inst-bug.ll
index ed60e80..0444fe8 100644
--- a/test/Transforms/LoopVectorize/undef-inst-bug.ll
+++ b/test/Transforms/LoopVectorize/undef-inst-bug.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/unroll_novec.ll b/test/Transforms/LoopVectorize/unroll_novec.ll
index 89f4678..257b4e6 100644
--- a/test/Transforms/LoopVectorize/unroll_novec.ll
+++ b/test/Transforms/LoopVectorize/unroll_novec.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-width=1 -force-target-num-scalar-regs=16 -force-target-max-scalar-unroll=8 -force-target-instruction-cost=1 -small-loop-cost=40 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-width=1 -force-target-num-scalar-regs=16 -force-target-max-scalar-interleave=8 -force-target-instruction-cost=1 -small-loop-cost=40 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LoopVectorize/unsized-pointee-crash.ll b/test/Transforms/LoopVectorize/unsized-pointee-crash.ll
new file mode 100644
index 0000000..5cc9837
--- /dev/null
+++ b/test/Transforms/LoopVectorize/unsized-pointee-crash.ll

@@ -0,0 +1,24 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: @fn1
+define void @fn1() {
+entry:
+  br label %for.body
+
+for.body:
+  %b.05 = phi i32 (...)* [ undef, %entry ], [ %1, %for.body ]
+  %a.04 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = bitcast i32 (...)* %b.05 to i8*
+  %add.ptr = getelementptr i8* %0, i64 1
+  %1 = bitcast i8* %add.ptr to i32 (...)*
+; CHECK:      %[[cst:.*]] = bitcast i32 (...)* {{.*}} to i8*
+; CHECK-NEXT: %[[gep:.*]] = getelementptr i8* %[[cst]], i64 1
+  %inc = add nsw i32 %a.04, 1
+  %exitcond = icmp eq i32 %a.04, 63
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}

diff --git a/test/Transforms/LoopVectorize/value-ptr-bug.ll b/test/Transforms/LoopVectorize/value-ptr-bug.ll
index 6b06afa..7fb9095 100644
--- a/test/Transforms/LoopVectorize/value-ptr-bug.ll
+++ b/test/Transforms/LoopVectorize/value-ptr-bug.ll

@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/vect.omp.persistence.ll b/test/Transforms/LoopVectorize/vect.omp.persistence.ll
index f646567..b0fe7a5 100644
--- a/test/Transforms/LoopVectorize/vect.omp.persistence.ll
+++ b/test/Transforms/LoopVectorize/vect.omp.persistence.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -O2 -force-vector-unroll=2 -force-vector-width=4 -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
+; RUN: opt < %s -O2 -force-vector-interleave=2 -force-vector-width=4 -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; Loop from "rotated"

diff --git a/test/Transforms/LoopVectorize/vect.stats.ll b/test/Transforms/LoopVectorize/vect.stats.ll
index 92ec24f..556da45 100644
--- a/test/Transforms/LoopVectorize/vect.stats.ll
+++ b/test/Transforms/LoopVectorize/vect.stats.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=4 -force-vector-width=4 -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ;
@@ -62,4 +62,4 @@
 
 for.end:
   ret void
-}
\ No newline at end of file
+}

diff --git a/test/Transforms/LoopVectorize/vectorize-once.ll b/test/Transforms/LoopVectorize/vectorize-once.ll
index 47de13d..cee4b16 100644
--- a/test/Transforms/LoopVectorize/vectorize-once.ll
+++ b/test/Transforms/LoopVectorize/vectorize-once.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -simplifycfg | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S -simplifycfg | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -70,7 +70,7 @@
 
 ; CHECK: !0 = metadata !{metadata !0, metadata !1, metadata !2}
 ; CHECK: !1 = metadata !{metadata !"llvm.loop.vectorize.width", i32 1}
-; CHECK: !2 = metadata !{metadata !"llvm.loop.vectorize.unroll", i32 1}
+; CHECK: !2 = metadata !{metadata !"llvm.loop.interleave.count", i32 1}
 ; CHECK: !3 = metadata !{metadata !3, metadata !1, metadata !2}
 
 !0 = metadata !{metadata !0, metadata !1}

diff --git a/test/Transforms/LoopVectorize/version-mem-access.ll b/test/Transforms/LoopVectorize/version-mem-access.ll
index 51d20e2..7ac2fca 100644
--- a/test/Transforms/LoopVectorize/version-mem-access.ll
+++ b/test/Transforms/LoopVectorize/version-mem-access.ll

@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -loop-vectorize -enable-mem-access-versioning -force-vector-width=2 -force-vector-unroll=1 < %s -S | FileCheck %s
+; RUN: opt -basicaa -loop-vectorize -enable-mem-access-versioning -force-vector-width=2 -force-vector-interleave=1 < %s -S | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 

diff --git a/test/Transforms/LoopVectorize/write-only.ll b/test/Transforms/LoopVectorize/write-only.ll
index 71a9cd0..2f100de 100644
--- a/test/Transforms/LoopVectorize/write-only.ll
+++ b/test/Transforms/LoopVectorize/write-only.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"

diff --git a/test/Transforms/LowerSwitch/2014-06-23-PHIlowering.ll b/test/Transforms/LowerSwitch/2014-06-23-PHIlowering.ll
new file mode 100644
index 0000000..c6cddf6
--- /dev/null
+++ b/test/Transforms/LowerSwitch/2014-06-23-PHIlowering.ll

@@ -0,0 +1,40 @@
+; RUN: opt < %s -lowerswitch -S | FileCheck %s
+
+define i32 @test(i32 %arg) #0 {
+; CHECK-LABEL: @test
+; CHECK: ; <label>:2
+; CHECK-NEXT:  %res.0 = phi i32 [ 1, %NodeBlock ], [ 2, %1 ]
+; CHECK-NEXT:  br label %3
+; CHECK: ; <label>:5
+; CHECK-NEXT:   %res.3 = phi i32 [ 0, %NewDefault ], [ %res.2, %4 ]
+; CHECK-NEXT:   %6 = add nsw i32 %res.3, 1
+; CHECK-NEXT:   ret i32 %6
+
+  switch i32 %arg, label %5 [
+    i32 1, label %1
+    i32 2, label %2
+    i32 3, label %3
+    i32 4, label %4
+  ]
+
+; <label>:1
+  br label %2
+
+; <label>:2
+  %res.0 = phi i32 [ 1, %0 ], [ 2, %1 ]
+  br label %3
+
+; <label>:3
+  %res.1 = phi i32 [ 0, %0 ], [ %res.0, %2 ]
+  %phitmp = add nsw i32 %res.1, 2
+  br label %4
+
+; <label>:4
+  %res.2 = phi i32 [ 1, %0 ], [ %phitmp, %3 ]
+  br label %5
+
+; <label>:5
+  %res.3 = phi i32 [ 0, %0 ], [ %res.2, %4 ]
+  %6 = add nsw i32 %res.3, 1
+  ret i32 %6
+}

diff --git a/test/Transforms/Mem2Reg/2007-08-27-VolatileLoadsStores.ll b/test/Transforms/Mem2Reg/2007-08-27-VolatileLoadsStores.ll
index ea0d515..ea581d1 100644
--- a/test/Transforms/Mem2Reg/2007-08-27-VolatileLoadsStores.ll
+++ b/test/Transforms/Mem2Reg/2007-08-27-VolatileLoadsStores.ll

@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts -S | grep volatile | count 3
+; RUN: opt < %s -O3 -S | grep volatile | count 3
 ; PR1520
 ; Don't promote load volatiles/stores. This is really needed to handle setjmp/lonjmp properly.
 

diff --git a/test/Transforms/Mem2Reg/ConvertDebugInfo.ll b/test/Transforms/Mem2Reg/ConvertDebugInfo.ll
index 33eaed6..b2d094f 100644
--- a/test/Transforms/Mem2Reg/ConvertDebugInfo.ll
+++ b/test/Transforms/Mem2Reg/ConvertDebugInfo.ll

@@ -7,13 +7,13 @@
   %retval = alloca double                         ; <double*> [#uses=2]
   %0 = alloca double                              ; <double*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{i32* %i_addr}, metadata !0), !dbg !8
-; CHECK: call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata ![[IVAR:[0-9]*]])
-; CHECK: call void @llvm.dbg.value(metadata !{double %j}, i64 0, metadata ![[JVAR:[0-9]*]])
+  call void @llvm.dbg.declare(metadata !{i32* %i_addr}, metadata !0, metadata !{}), !dbg !8
+; CHECK: call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata ![[IVAR:[0-9]*]], metadata {{.*}})
+; CHECK: call void @llvm.dbg.value(metadata !{double %j}, i64 0, metadata ![[JVAR:[0-9]*]], metadata {{.*}})
 ; CHECK: ![[IVAR]] = {{.*}} ; [ DW_TAG_arg_variable ] [i]
 ; CHECK: ![[JVAR]] = {{.*}} ; [ DW_TAG_arg_variable ] [j]
   store i32 %i, i32* %i_addr
-  call void @llvm.dbg.declare(metadata !{double* %j_addr}, metadata !9), !dbg !8
+  call void @llvm.dbg.declare(metadata !{double* %j_addr}, metadata !9, metadata !{}), !dbg !8
   store double %j, double* %j_addr
   %1 = load i32* %i_addr, align 4, !dbg !10       ; <i32> [#uses=1]
   %2 = add nsw i32 %1, 1, !dbg !10                ; <i32> [#uses=1]
@@ -30,23 +30,23 @@
   ret double %retval1, !dbg !10
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!14}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 2, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !12, metadata !2, metadata !"testfunc", metadata !"testfunc", metadata !"testfunc", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (i32, double)* @testfunc, null, null, null, i32 2} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !12, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !13, metadata !13, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !12, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00i\002\000", metadata !1, metadata !2, metadata !7} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00testfunc\00testfunc\00testfunc\002\000\001\000\006\000\000\002", metadata !12, metadata !2, metadata !4, null, double (i32, double)* @testfunc, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !12, metadata !13, metadata !13, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !7, metadata !6}
-!6 = metadata !{i32 786468, metadata !12, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786468, metadata !12, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !2} ; [ DW_TAG_base_type ]
 !8 = metadata !{i32 2, i32 0, metadata !1, null}
-!9 = metadata !{i32 786689, metadata !1, metadata !"j", metadata !2, i32 2, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{metadata !"0x101\00j\002\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
 !10 = metadata !{i32 3, i32 0, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !12, metadata !1, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{metadata !"0xb\002\000\000", metadata !12, metadata !1} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{metadata !"testfunc.c", metadata !"/tmp"}
 !13 = metadata !{i32 0}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll b/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll
index 32acdd6..b7b9dc7 100644
--- a/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll
+++ b/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll

@@ -1,6 +1,6 @@
 ; RUN: opt -mem2reg < %s | llvm-dis | grep ".dbg " | count 7
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare void @foo(i32, i64, i8*)
 
@@ -11,14 +11,14 @@
   %z_addr.i = alloca i8*                          ; <i8**> [#uses=2]
   %a_addr = alloca i32                            ; <i32*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{i32* %a_addr}, metadata !0), !dbg !7
+  call void @llvm.dbg.declare(metadata !{i32* %a_addr}, metadata !0, metadata !{}), !dbg !7
   store i32 %a, i32* %a_addr
   %0 = load i32* %a_addr, align 4, !dbg !8        ; <i32> [#uses=1]
-  call void @llvm.dbg.declare(metadata !{i32* %x_addr.i}, metadata !9) nounwind, !dbg !15
+  call void @llvm.dbg.declare(metadata !{i32* %x_addr.i}, metadata !9, metadata !{}) nounwind, !dbg !15
   store i32 %0, i32* %x_addr.i
-  call void @llvm.dbg.declare(metadata !{i64* %y_addr.i}, metadata !16) nounwind, !dbg !15
+  call void @llvm.dbg.declare(metadata !{i64* %y_addr.i}, metadata !16, metadata !{}) nounwind, !dbg !15
   store i64 55, i64* %y_addr.i
-  call void @llvm.dbg.declare(metadata !{i8** %z_addr.i}, metadata !17) nounwind, !dbg !15
+  call void @llvm.dbg.declare(metadata !{i8** %z_addr.i}, metadata !17, metadata !{}) nounwind, !dbg !15
   store i8* bitcast (void (i32)* @baz to i8*), i8** %z_addr.i
   %1 = load i32* %x_addr.i, align 4, !dbg !18     ; <i32> [#uses=1]
   %2 = load i64* %y_addr.i, align 8, !dbg !18     ; <i64> [#uses=1]
@@ -32,26 +32,26 @@
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!22}
-!0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 8, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"baz", metadata !"baz", metadata !"baz", i32 8, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void (i32)* @baz, null, null, null, i32 8} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !20, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x101\00a\008\000", metadata !1, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{metadata !"0x2e\00baz\00baz\00baz\008\000\001\000\006\000\000\008", metadata !20, metadata !2, metadata !4, null, void (i32)* @baz, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !20, metadata !21, metadata !21, null, null, null} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{null, metadata !6}
-!6 = metadata !{i32 786468, metadata !20, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !20, metadata !2} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 8, i32 0, metadata !1, null}
 !8 = metadata !{i32 9, i32 0, metadata !1, null}
-!9 = metadata !{i32 786689, metadata !10, metadata !"x", metadata !2, i32 4, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, metadata !11, i1 true, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 4} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x101\00x\004\000", metadata !10, metadata !2, metadata !6} ; [ DW_TAG_arg_variable ]
+!10 = metadata !{metadata !"0x2e\00bar\00bar\00bar\004\001\001\000\006\000\000\004", metadata !20, metadata !2, metadata !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !2, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !6, metadata !13, metadata !14}
-!13 = metadata !{i32 786468, metadata !20, metadata !2, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786447, metadata !20, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
+!13 = metadata !{metadata !"0x24\00long int\000\0064\0064\000\000\005", metadata !20, metadata !2} ; [ DW_TAG_base_type ]
+!14 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !20, metadata !2, null} ; [ DW_TAG_pointer_type ]
 !15 = metadata !{i32 4, i32 0, metadata !10, metadata !8}
-!16 = metadata !{i32 786689, metadata !10, metadata !"y", metadata !2, i32 4, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786689, metadata !10, metadata !"z", metadata !2, i32 4, metadata !14, i32 0, null} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x101\00y\004\000", metadata !10, metadata !2, metadata !13} ; [ DW_TAG_arg_variable ]
+!17 = metadata !{metadata !"0x101\00z\004\000", metadata !10, metadata !2, metadata !14} ; [ DW_TAG_arg_variable ]
 !18 = metadata !{i32 5, i32 0, metadata !10, metadata !8}
 !19 = metadata !{i32 10, i32 0, metadata !1, null}
 !20 = metadata !{metadata !"bar.c", metadata !"/tmp/"}
 !21 = metadata !{i32 0}
-!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll b/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll
index d124be5..00ac34d 100644
--- a/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll
+++ b/test/Transforms/MemCpyOpt/2008-02-24-MultipleUseofSRet.ll

@@ -6,7 +6,7 @@
 
 %0 = type { x86_fp80, x86_fp80 }
 
-define internal fastcc void @initialize(%0* noalias sret %agg.result) nounwind {
+define internal fastcc void @initialize(%0* noalias nocapture sret %agg.result) nounwind {
 entry:
   %agg.result.03 = getelementptr %0* %agg.result, i32 0, i32 0
   store x86_fp80 0xK00000000000000000000, x86_fp80* %agg.result.03
@@ -15,7 +15,7 @@
   ret void
 }
 
-declare fastcc x86_fp80 @passed_uninitialized(%0*) nounwind
+declare fastcc x86_fp80 @passed_uninitialized(%0* nocapture) nounwind
 
 define fastcc void @badly_optimized() nounwind {
 entry:

diff --git a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
index 597b69d..6982c8b 100644
--- a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
+++ b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll

@@ -4,7 +4,7 @@
 %a = type { i32 }
 %b = type { float }
 
-declare void @g(%a*)
+declare void @g(%a* nocapture)
 
 define float @f() {
 entry:

diff --git a/test/Transforms/MemCpyOpt/callslot_deref.ll b/test/Transforms/MemCpyOpt/callslot_deref.ll
new file mode 100644
index 0000000..4d51552
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/callslot_deref.ll

@@ -0,0 +1,29 @@
+; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) unnamed_addr nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+; all bytes of %dst that are touched by the memset are dereferenceable
+define void @must_remove_memcpy(i8* noalias nocapture dereferenceable(4096) %dst) {
+; CHECK-LABEL: @must_remove_memcpy(
+; CHECK: call void @llvm.memset.p0i8.i64
+; CHECK-NOT: call void @llvm.memcpy.p0i8.p0i8.i64
+  %src = alloca [4096 x i8], align 1
+  %p = getelementptr inbounds [4096 x i8]* %src, i64 0, i64 0
+  call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 4096, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %p, i64 4096, i32 1, i1 false) #2
+  ret void
+}
+
+; memset touches more bytes than those guaranteed to be dereferenceable
+define void @must_not_remove_memcpy(i8* noalias nocapture dereferenceable(1024) %dst) {
+; CHECK-LABEL: @must_not_remove_memcpy(
+; CHECK: call void @llvm.memset.p0i8.i64
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+  %src = alloca [4096 x i8], align 1
+  %p = getelementptr inbounds [4096 x i8]* %src, i64 0, i64 0
+  call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 4096, i32 1, i1 false)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %p, i64 4096, i32 1, i1 false) #2
+  ret void
+}

diff --git a/test/Transforms/MemCpyOpt/capturing-func.ll b/test/Transforms/MemCpyOpt/capturing-func.ll
new file mode 100644
index 0000000..17614fd
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/capturing-func.ll

@@ -0,0 +1,22 @@
+; RUN: opt < %s -basicaa -memcpyopt -S | FileCheck %s
+
+target datalayout = "e"
+
+declare void @foo(i8*)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define void @test() {
+  %ptr1 = alloca i8
+  %ptr2 = alloca i8
+  call void @foo(i8* %ptr2)
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr1, i8* %ptr2, i32 1, i32 1, i1 false)
+  call void @foo(i8* %ptr1)
+  ret void
+
+  ; Check that the transformation isn't applied if the called function can
+  ; capture the pointer argument (i.e. the nocapture attribute isn't present)
+  ; CHECK-LABEL: @test(
+  ; CHECK: call void @foo(i8* %ptr2)
+  ; CHECK-NEXT: call void @llvm.memcpy
+  ; CHECK-NEXT: call void @foo(i8* %ptr1)
+}

diff --git a/test/Transforms/MemCpyOpt/loadstore-sret.ll b/test/Transforms/MemCpyOpt/loadstore-sret.ll
index 89eabca..d4a700d 100644
--- a/test/Transforms/MemCpyOpt/loadstore-sret.ll
+++ b/test/Transforms/MemCpyOpt/loadstore-sret.ll

@@ -22,4 +22,4 @@
   ret void
 }
 
-declare void @_Z3barv(%"class.std::auto_ptr"* sret)
+declare void @_Z3barv(%"class.std::auto_ptr"* nocapture sret)

diff --git a/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll b/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
new file mode 100644
index 0000000..6263176
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll

@@ -0,0 +1,55 @@
+; RUN: opt -basicaa -memcpyopt -instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo([8 x i64]* noalias nocapture sret dereferenceable(64) %sret) {
+entry-block:
+  %a = alloca [8 x i64], align 8
+  %a.cast = bitcast [8 x i64]* %a to i8*
+  call void @llvm.lifetime.start(i64 64, i8* %a.cast)
+  call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 0, i64 64, i32 8, i1 false)
+  %sret.cast = bitcast [8 x i64]* %sret to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %sret.cast, i8* %a.cast, i64 64, i32 8, i1 false)
+  call void @llvm.lifetime.end(i64 64, i8* %a.cast)
+  ret void
+
+; CHECK-LABEL: @foo(
+; CHECK:         %[[sret_cast:[^=]+]] = bitcast [8 x i64]* %sret to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* %[[sret_cast]], i8 0, i64 64
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
+}
+
+define void @bar([8 x i64]* noalias nocapture sret dereferenceable(64) %sret, [8 x i64]* noalias nocapture dereferenceable(64) %out) {
+entry-block:
+  %a = alloca [8 x i64], align 8
+  %a.cast = bitcast [8 x i64]* %a to i8*
+  call void @llvm.lifetime.start(i64 64, i8* %a.cast)
+  call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 0, i64 64, i32 8, i1 false)
+  %sret.cast = bitcast [8 x i64]* %sret to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %sret.cast, i8* %a.cast, i64 64, i32 8, i1 false)
+  call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 42, i64 32, i32 8, i1 false)
+  %out.cast = bitcast [8 x i64]* %out to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out.cast, i8* %a.cast, i64 64, i32 8, i1 false)
+  call void @llvm.lifetime.end(i64 64, i8* %a.cast)
+  ret void
+
+; CHECK-LABEL: @bar(
+; CHECK:         %[[a:[^=]+]] = alloca [8 x i64]
+; CHECK:         %[[a_cast:[^=]+]] = bitcast [8 x i64]* %[[a]] to i8*
+; CHECK:         call void @llvm.memset.p0i8.i64(i8* %[[a_cast]], i8 0, i64 64
+; CHECK:         %[[sret_cast:[^=]+]] = bitcast [8 x i64]* %sret to i8*
+; CHECK:         call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[sret_cast]], i8* %[[a_cast]], i64 64
+; CHECK:         call void @llvm.memset.p0i8.i64(i8* %[[a_cast]], i8 42, i64 32
+; CHECK:         %[[out_cast:[^=]+]] = bitcast [8 x i64]* %out to i8*
+; CHECK:         call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[out_cast]], i8* %[[a_cast]], i64 64
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind

diff --git a/test/Transforms/MemCpyOpt/memcpy.ll b/test/Transforms/MemCpyOpt/memcpy.ll
index 492c453..ee04f19 100644
--- a/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/test/Transforms/MemCpyOpt/memcpy.ll

@@ -29,7 +29,7 @@
 ; CHECK: ret void
 }
 
-declare void @ccoshl(%0* sret , x86_fp80, x86_fp80) nounwind 
+declare void @ccoshl(%0* nocapture sret, x86_fp80, x86_fp80) nounwind 
 
 
 ; The intermediate alloca and one of the memcpy's should be eliminated, the
@@ -202,7 +202,7 @@
   ret void
 }
 
-declare void @f1(%struct.big* sret)
+declare void @f1(%struct.big* nocapture sret)
 declare void @f2(%struct.big*)
 
 ; CHECK: attributes [[NUW]] = { nounwind }

diff --git a/test/Transforms/MemCpyOpt/sret.ll b/test/Transforms/MemCpyOpt/sret.ll
index 1bbb5fe..bfe5e0f 100644
--- a/test/Transforms/MemCpyOpt/sret.ll
+++ b/test/Transforms/MemCpyOpt/sret.ll

@@ -25,6 +25,6 @@
   ret void
 }
 
-declare void @ccoshl(%0* noalias sret, %0* byval) nounwind
+declare void @ccoshl(%0* noalias nocapture sret, %0* byval) nounwind
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind

diff --git a/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll b/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
new file mode 100644
index 0000000..9878b47
--- /dev/null
+++ b/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll

@@ -0,0 +1,91 @@
+; RUN: opt -mergefunc -S < %s | FileCheck %s
+
+define i8 @call_with_range() {
+  bitcast i8 0 to i8 ; dummy to make the function large enough
+  %out = call i8 @dummy(), !range !0
+  ret i8 %out
+}
+
+define i8 @call_no_range() {
+; CHECK-LABEL: @call_no_range
+; CHECK-NEXT: bitcast i8 0 to i8
+; CHECK-NEXT: %out = call i8 @dummy()
+; CHECK-NEXT: ret i8 %out
+  bitcast i8 0 to i8
+  %out = call i8 @dummy()
+  ret i8 %out
+}
+
+define i8 @call_different_range() {
+; CHECK-LABEL: @call_different_range
+; CHECK-NEXT: bitcast i8 0 to i8
+; CHECK-NEXT: %out = call i8 @dummy(), !range !1
+; CHECK-NEXT: ret i8 %out
+  bitcast i8 0 to i8
+  %out = call i8 @dummy(), !range !1
+  ret i8 %out
+}
+
+define i8 @invoke_with_range() {
+  %out = invoke i8 @dummy() to label %next unwind label %lpad, !range !0
+
+next:
+  ret i8 %out
+
+lpad:
+  %pad = landingpad { i8*, i32 } personality i8* undef cleanup
+  resume { i8*, i32 } zeroinitializer
+}
+
+define i8 @invoke_no_range() {
+; CHECK-LABEL: @invoke_no_range()
+; CHECK-NEXT: invoke i8 @dummy
+  %out = invoke i8 @dummy() to label %next unwind label %lpad
+
+next:
+  ret i8 %out
+
+lpad:
+  %pad = landingpad { i8*, i32 } personality i8* undef cleanup
+  resume { i8*, i32 } zeroinitializer
+}
+
+define i8 @invoke_different_range() {
+; CHECK-LABEL: @invoke_different_range()
+; CHECK-NEXT: invoke i8 @dummy
+  %out = invoke i8 @dummy() to label %next unwind label %lpad, !range !1
+
+next:
+  ret i8 %out
+
+lpad:
+  %pad = landingpad { i8*, i32 } personality i8* undef cleanup
+  resume { i8*, i32 } zeroinitializer
+}
+
+define i8 @call_same_range() {
+; CHECK-LABEL: @call_same_range
+; CHECK: tail call i8 @call_with_range
+  bitcast i8 0 to i8
+  %out = call i8 @dummy(), !range !0
+  ret i8 %out
+}
+
+define i8 @invoke_same_range() {
+; CHECK-LABEL: @invoke_same_range()
+; CHECK: tail call i8 @invoke_with_range()
+  %out = invoke i8 @dummy() to label %next unwind label %lpad, !range !0
+
+next:
+  ret i8 %out
+
+lpad:
+  %pad = landingpad { i8*, i32 } personality i8* undef cleanup
+  resume { i8*, i32 } zeroinitializer
+}
+
+declare i8 @dummy();
+declare i32 @__gxx_personality_v0(...)
+
+!0 = metadata !{i8 0, i8 2}
+!1 = metadata !{i8 5, i8 7}
\ No newline at end of file

diff --git a/test/Transforms/MergeFunc/vector-GEP-crash.ll b/test/Transforms/MergeFunc/vector-GEP-crash.ll
new file mode 100644
index 0000000..a1eefa0
--- /dev/null
+++ b/test/Transforms/MergeFunc/vector-GEP-crash.ll

@@ -0,0 +1,12 @@
+; RUN: opt -mergefunc -disable-output < %s
+; This used to cause a crash when comparing the GEPs
+
+define void @foo(<2 x i64*>) { ; takes one unnamed argument (%0): a vector of two i64 pointers
+  %tmp = getelementptr <2 x i64*> %0, <2 x i64> <i64 0, i64 0> ; GEP on a vector-of-pointers base with a vector index
+  ret void
+}
+
+define void @bar(<2 x i64*>) { ; same signature and body as @foo, giving -mergefunc two equal functions to compare
+  %tmp = getelementptr <2 x i64*> %0, <2 x i64> <i64 0, i64 0> ; GEP on a vector-of-pointers base with a vector index
+  ret void
+}

diff --git a/test/Transforms/MetaRenamer/metarenamer.ll b/test/Transforms/MetaRenamer/metarenamer.ll
index 6297af6..4010f31 100644
--- a/test/Transforms/MetaRenamer/metarenamer.ll
+++ b/test/Transforms/MetaRenamer/metarenamer.ll

@@ -12,7 +12,7 @@
 @func_5_xxx.static_local_3_xxx = internal global i32 3, align 4
 @global_3_xxx = common global i32 0, align 4
 
-@func_7_xxx = alias weak i32 (...)* @aliased_func_7_xxx
+@func_7_xxx = weak alias i32 (...)* @aliased_func_7_xxx
 
 define i32 @aliased_func_7_xxx(...) {
   ret i32 0

diff --git a/test/Transforms/ObjCARC/allocas.ll b/test/Transforms/ObjCARC/allocas.ll
index 7347a8f..d2e7841 100644
--- a/test/Transforms/ObjCARC/allocas.ll
+++ b/test/Transforms/ObjCARC/allocas.ll

@@ -23,7 +23,7 @@
 declare void @bar(i32 ()*)
 declare void @use_alloca(i8**)
 
-declare void @llvm.dbg.value(metadata, i64, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 declare i8* @objc_msgSend(i8*, i8*, ...)
 

diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index 885935c..a1ee956 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll

@@ -22,7 +22,7 @@
 declare i8* @returner()
 declare void @bar(i32 ()*)
 
-declare void @llvm.dbg.value(metadata, i64, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 declare i8* @objc_msgSend(i8*, i8*, ...)
 
@@ -2679,8 +2679,8 @@
 invoke.cont:
   %0 = bitcast {}* %self to i8*
   %1 = tail call i8* @objc_retain(i8* %0) nounwind
-  tail call void @llvm.dbg.value(metadata !{{}* %self}, i64 0, metadata !0)
-  tail call void @llvm.dbg.value(metadata !{{}* %self}, i64 0, metadata !0)
+  tail call void @llvm.dbg.value(metadata !{{}* %self}, i64 0, metadata !0, metadata !{})
+  tail call void @llvm.dbg.value(metadata !{{}* %self}, i64 0, metadata !0, metadata !{})
   %ivar = load i64* @"OBJC_IVAR_$_A.myZ", align 8
   %add.ptr = getelementptr i8* %0, i64 %ivar
   %tmp1 = bitcast i8* %add.ptr to float*
@@ -3012,7 +3012,7 @@
 !llvm.module.flags = !{!1}
 
 !0 = metadata !{}
-!1 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!1 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 
 ; CHECK: attributes #0 = { nounwind readnone }
 ; CHECK: attributes [[NUW]] = { nounwind }

diff --git a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
index 79e300c..03af93e 100644
--- a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
+++ b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll

@@ -41,10 +41,10 @@
   %tmp2 = bitcast %struct._class_t* %tmp to i8*, !dbg !37
 ; CHECK: call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1)
   %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1), !dbg !37, !clang.arc.no_objc_arc_exceptions !38
-  call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !12), !dbg !37
+  call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !12, metadata !{}), !dbg !37
 ; CHECK: call i8* @objc_retain(i8* %call) [[NUW:#[0-9]+]]
   %tmp3 = call i8* @objc_retain(i8* %call) nounwind, !dbg !39
-  call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !25), !dbg !39
+  call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !25, metadata !{}), !dbg !39
   invoke fastcc void @ThrowFunc(i8* %call)
           to label %eh.cont unwind label %lpad, !dbg !40, !clang.arc.no_objc_arc_exceptions !38
 
@@ -58,7 +58,7 @@
           catch i8* null, !dbg !40
   %tmp5 = extractvalue { i8*, i32 } %tmp4, 0, !dbg !40
   %exn.adjusted = call i8* @objc_begin_catch(i8* %tmp5) nounwind, !dbg !44
-  call void @llvm.dbg.value(metadata !45, i64 0, metadata !21), !dbg !46
+  call void @llvm.dbg.value(metadata !45, i64 0, metadata !21, metadata !{}), !dbg !46
   call void @objc_end_catch(), !dbg !49, !clang.arc.no_objc_arc_exceptions !38
 ; CHECK: call void @objc_release(i8* %call)
   call void @objc_release(i8* %call) nounwind, !dbg !42, !clang.imprecise_release !38
@@ -72,7 +72,7 @@
   ret i32 0, !dbg !54
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
 
@@ -87,7 +87,7 @@
 define internal fastcc void @ThrowFunc(i8* %obj) uwtable noinline ssp {
 entry:
   %tmp = call i8* @objc_retain(i8* %obj) nounwind
-  call void @llvm.dbg.value(metadata !{i8* %obj}, i64 0, metadata !32), !dbg !55
+  call void @llvm.dbg.value(metadata !{i8* %obj}, i64 0, metadata !32, metadata !{}), !dbg !55
   %tmp1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_1", align 8, !dbg !56
   %tmp2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_5", align 8, !dbg !56, !invariant.load !38
   %tmp3 = bitcast %struct._class_t* %tmp1 to i8*, !dbg !56
@@ -102,7 +102,7 @@
 
 declare void @NSLog(i8*, ...)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 ; CHECK: attributes #0 = { ssp uwtable }
 ; CHECK: attributes #1 = { nounwind readnone }
@@ -113,37 +113,37 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!33, !34, !35, !36, !61}
 
-!0 = metadata !{i32 786449, metadata !60, i32 16, metadata !"clang version 3.3 ", i1 true, metadata !"", i32 2, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] [DW_LANG_ObjC]
+!0 = metadata !{metadata !"0x11\0016\00clang version 3.3 \001\00\002\00\000", metadata !60, metadata !1, metadata !1, metadata !3, metadata !1, null} ; [ DW_TAG_compile_unit ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] [DW_LANG_ObjC]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !27}
-!5 = metadata !{i32 786478, metadata !60, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 9, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !10, i32 10} ; [ DW_TAG_subprogram ] [line 9] [def] [scope 10] [main]
-!6 = metadata !{i32 786473, metadata !60} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x2e\00main\00main\00\009\000\001\000\006\000\001\0010", metadata !60, metadata !6, metadata !7, null, i32 ()* @main, null, null, metadata !10} ; [ DW_TAG_subprogram ] [line 9] [def] [scope 10] [main]
+!6 = metadata !{metadata !"0x29", metadata !60} ; [ DW_TAG_file_type ]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !11}
 !11 = metadata !{metadata !12, metadata !21, metadata !25}
-!12 = metadata !{i32 786688, metadata !13, metadata !"obj", metadata !6, i32 11, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [obj] [line 11]
-!13 = metadata !{i32 786443, metadata !60, metadata !5, i32 10, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!14 = metadata !{i32 786454, metadata !60, null, metadata !"id", i32 11, i64 0, i64 0, i64 0, i32 0, metadata !15} ; [ DW_TAG_typedef ] [id] [line 11, size 0, align 0, offset 0] [from ]
-!15 = metadata !{i32 786447, metadata !60, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
-!16 = metadata !{i32 786451, metadata !60, null, metadata !"objc_object", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
+!12 = metadata !{metadata !"0x100\00obj\0011\000", metadata !13, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [obj] [line 11]
+!13 = metadata !{metadata !"0xb\0010\000\000", metadata !60, metadata !5} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!14 = metadata !{metadata !"0x16\00id\0011\000\000\000\000", metadata !60, null, metadata !15} ; [ DW_TAG_typedef ] [id] [line 11, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", metadata !60, null, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
+!16 = metadata !{metadata !"0x13\00objc_object\000\000\000\000\000\000", metadata !60, null, null, metadata !17, null, i32 0, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
 !17 = metadata !{metadata !18}
-!18 = metadata !{i32 786445, metadata !60, metadata !16, metadata !"isa", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !19} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
-!19 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
-!20 = metadata !{i32 786451, metadata !60, null, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
-!21 = metadata !{i32 786688, metadata !22, metadata !"ok", metadata !6, i32 13, metadata !23, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ok] [line 13]
-!22 = metadata !{i32 786443, metadata !60, metadata !13, i32 12, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!23 = metadata !{i32 786454, metadata !60, null, metadata !"BOOL", i32 62, i64 0, i64 0, i64 0, i32 0, metadata !24} ; [ DW_TAG_typedef ] [BOOL] [line 62, size 0, align 0, offset 0] [from signed char]
-!24 = metadata !{i32 786468, null, null, metadata !"signed char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
-!25 = metadata !{i32 786688, metadata !26, metadata !"obj2", metadata !6, i32 15, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [obj2] [line 15]
-!26 = metadata !{i32 786443, metadata !60, metadata !22, i32 14, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!27 = metadata !{i32 786478, metadata !60, metadata !6, metadata !"ThrowFunc", metadata !"ThrowFunc", metadata !"", i32 4, metadata !28, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i8*)* @ThrowFunc, null, null, metadata !30, i32 5} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [scope 5] [ThrowFunc]
-!28 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{metadata !"0xd\00isa\000\0064\000\000\000", metadata !60, metadata !16, metadata !19} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
+!19 = metadata !{metadata !"0xf\00\000\0064\000\000\000", null, null, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
+!20 = metadata !{metadata !"0x13\00objc_class\000\000\000\000\004\000", metadata !60, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!21 = metadata !{metadata !"0x100\00ok\0013\000", metadata !22, metadata !6, metadata !23} ; [ DW_TAG_auto_variable ] [ok] [line 13]
+!22 = metadata !{metadata !"0xb\0012\000\001", metadata !60, metadata !13} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!23 = metadata !{metadata !"0x16\00BOOL\0062\000\000\000\000", metadata !60, null, metadata !24} ; [ DW_TAG_typedef ] [BOOL] [line 62, size 0, align 0, offset 0] [from signed char]
+!24 = metadata !{metadata !"0x24\00signed char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!25 = metadata !{metadata !"0x100\00obj2\0015\000", metadata !26, metadata !6, metadata !14} ; [ DW_TAG_auto_variable ] [obj2] [line 15]
+!26 = metadata !{metadata !"0xb\0014\000\002", metadata !60, metadata !22} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!27 = metadata !{metadata !"0x2e\00ThrowFunc\00ThrowFunc\00\004\001\001\000\006\00256\001\005", metadata !60, metadata !6, metadata !28, null, void (i8*)* @ThrowFunc, null, null, metadata !30} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [scope 5] [ThrowFunc]
+!28 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !29, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !29 = metadata !{null, metadata !14}
 !30 = metadata !{metadata !31}
 !31 = metadata !{metadata !32}
-!32 = metadata !{i32 786689, metadata !27, metadata !"obj", metadata !6, i32 16777220, metadata !14, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [obj] [line 4]
+!32 = metadata !{metadata !"0x101\00obj\0016777220\000", metadata !27, metadata !6, metadata !14} ; [ DW_TAG_arg_variable ] [obj] [line 4]
 !33 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
 !34 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
 !35 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
@@ -152,23 +152,23 @@
 !38 = metadata !{}
 !39 = metadata !{i32 15, i32 0, metadata !26, null}
 !40 = metadata !{i32 17, i32 0, metadata !41, null}
-!41 = metadata !{i32 786443, metadata !60, metadata !26, i32 16, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!41 = metadata !{metadata !"0xb\0016\000\003", metadata !60, metadata !26} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
 !42 = metadata !{i32 22, i32 0, metadata !26, null}
 !43 = metadata !{i32 23, i32 0, metadata !22, null}
 !44 = metadata !{i32 19, i32 0, metadata !41, null}
 !45 = metadata !{i8 0}
 !46 = metadata !{i32 20, i32 0, metadata !47, null}
-!47 = metadata !{i32 786443, metadata !60, metadata !48, i32 19, i32 0, i32 5} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
-!48 = metadata !{i32 786443, metadata !60, metadata !26, i32 19, i32 0, i32 4} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!47 = metadata !{metadata !"0xb\0019\000\005", metadata !60, metadata !48} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!48 = metadata !{metadata !"0xb\0019\000\004", metadata !60, metadata !26} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
 !49 = metadata !{i32 21, i32 0, metadata !47, null}
 !50 = metadata !{i32 24, i32 0, metadata !51, null}
-!51 = metadata !{i32 786443, metadata !60, metadata !22, i32 23, i32 0, i32 6} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!51 = metadata !{metadata !"0xb\0023\000\006", metadata !60, metadata !22} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
 !52 = metadata !{i32 25, i32 0, metadata !51, null}
 !53 = metadata !{i32 27, i32 0, metadata !13, null}
 !54 = metadata !{i32 28, i32 0, metadata !13, null}
 !55 = metadata !{i32 4, i32 0, metadata !27, null}
 !56 = metadata !{i32 6, i32 0, metadata !57, null}
-!57 = metadata !{i32 786443, metadata !60, metadata !27, i32 5, i32 0, i32 7} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
+!57 = metadata !{metadata !"0xb\005\000\007", metadata !60, metadata !27} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
 !58 = metadata !{i32 7, i32 0, metadata !57, null}
 !60 = metadata !{metadata !"test.m", metadata !"/Volumes/Files/gottesmmcab/Radar/12906997"}
-!61 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!61 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/ObjCARC/provenance.ll b/test/Transforms/ObjCARC/provenance.ll
new file mode 100644
index 0000000..937c689
--- /dev/null
+++ b/test/Transforms/ObjCARC/provenance.ll

@@ -0,0 +1,52 @@
+; RUN: opt -disable-output -pa-eval %s 2>&1 | FileCheck %s
+
+@"\01l_objc_msgSend_fixup_" = global i8 0
+@g1 = global i8 0, section "__OBJC,__message_refs,literal_pointers,no_dead_strip"
+@g2 = global i8 0, section "__DATA, __objc_classrefs, regular, no_dead_strip"
+@g3 = global i8 0, section "__DATA, __objc_superrefs, regular, no_dead_strip"
+@g4 = global i8 0, section "__TEXT,__objc_methname,cstring_literals"
+@g5 = global i8 0, section "__TEXT,__cstring,cstring_literals"
+
+declare void @g(i8)
+
+define void @f(i8* %a, i8** %b, i8** %c) {
+  %y1 = load i8* %a
+  call void @g(i8 %y1)
+
+  %y2 = load i8** %b
+  %y3 = load i8** %c
+
+  %x0 = load i8* @"\01l_objc_msgSend_fixup_"
+  call void @g(i8 %x0)
+
+  %x1 = load i8* @g1
+  call void @g(i8 %x1)
+
+  %x2 = load i8* @g2
+  call void @g(i8 %x2)
+
+  %x3 = load i8* @g3
+  call void @g(i8 %x3)
+
+  %x4 = load i8* @g4
+  call void @g(i8 %x4)
+
+  %x5 = load i8* @g5
+  call void @g(i8 %x5)
+  ret void
+}
+
+; CHECK: y1 and y2 are related.
+; CHECK: y1 and y3 are related.
+; CHECK: y2 and y3 are related.
+; CHECK: x0 and y1 are not related.
+; CHECK: x0 and y2 are not related.
+; CHECK: x0 and y3 are not related.
+; CHECK: l_objc_msgSend_fixup_ and y1 are not related.
+; CHECK: l_objc_msgSend_fixup_ and y2 are not related.
+; CHECK: l_objc_msgSend_fixup_ and y3 are not related.
+; CHECK: x1 and y1 are not related.
+; CHECK: x2 and y1 are not related.
+; CHECK: x3 and y1 are not related.
+; CHECK: x4 and y1 are not related.
+; CHECK: x5 and y1 are not related.

diff --git a/test/Transforms/PartiallyInlineLibCalls/bad-prototype.ll b/test/Transforms/PartiallyInlineLibCalls/bad-prototype.ll
new file mode 100644
index 0000000..34cd672
--- /dev/null
+++ b/test/Transforms/PartiallyInlineLibCalls/bad-prototype.ll

@@ -0,0 +1,13 @@
+; RUN: opt -S -partially-inline-libcalls < %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @sqrt()
+
+; CHECK-LABEL: @foo
+define i32 @foo() {
+  ; CHECK: call{{.*}}@sqrt
+  ; CHECK-NOT: call{{.*}}@sqrt
+  %r = call i32 @sqrt()
+  ret i32 %r
+}

diff --git a/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll b/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll
index f783955..ea86984 100644
--- a/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll
+++ b/test/Transforms/Reassociate/2006-04-27-ReassociateVector.ll

@@ -3,7 +3,7 @@
 define <4 x float> @test1() {
 ; CHECK-LABEL: test1
 ; CHECK-NEXT: %tmp1 = fsub <4 x float> zeroinitializer, zeroinitializer
-; CHECK-NEXT: %tmp2 = fmul <4 x float> zeroinitializer, %tmp1
+; CHECK-NEXT: %tmp2 = fmul <4 x float> %tmp1, zeroinitializer
 ; CHECK-NEXT: ret <4 x float> %tmp2
 
   %tmp1 = fsub <4 x float> zeroinitializer, zeroinitializer

diff --git a/test/Transforms/Reassociate/basictest.ll b/test/Transforms/Reassociate/basictest.ll
index d70bfcb..0194ce2 100644
--- a/test/Transforms/Reassociate/basictest.ll
+++ b/test/Transforms/Reassociate/basictest.ll

@@ -203,7 +203,7 @@
 
 ; CHECK-LABEL: @test14
 ; CHECK-NEXT: sub i32 %X1, %X2
-; CHECK-NEXT: mul i32 %tmp, 47
+; CHECK-NEXT: mul i32 %B2, 47
 ; CHECK-NEXT: ret i32
 }
 

diff --git a/test/Transforms/Reassociate/canonicalize-neg-const.ll b/test/Transforms/Reassociate/canonicalize-neg-const.ll
new file mode 100644
index 0000000..e85a963
--- /dev/null
+++ b/test/Transforms/Reassociate/canonicalize-neg-const.ll

@@ -0,0 +1,158 @@
+; RUN: opt -reassociate -gvn -S < %s | FileCheck %s
+
+; (x + 0.1234 * y) * (x + -0.1234 * y) -> (x + 0.1234 * y) * (x - 0.1234 * y)
+define double @test1(double %x, double %y) {
+; CHECK-LABEL: @test1
+; CHECK-NEXT: fmul double %y, 1.234000e-01
+; CHECK-NEXT: fadd double %x, %mul
+; CHECK-NEXT: fsub double %x, %mul
+; CHECK-NEXT: fmul double %add{{.*}}, %add{{.*}}
+; CHECK-NEXT: ret double %mul
+
+  %mul = fmul double 1.234000e-01, %y
+  %add = fadd double %mul, %x
+  %mul1 = fmul double -1.234000e-01, %y
+  %add2 = fadd double %mul1, %x
+  %mul3 = fmul double %add, %add2
+  ret double %mul3
+}
+
+; (x + -0.1234 * y) * (x + -0.1234 * y) -> (x - 0.1234 * y) * (x - 0.1234 * y)
+define double @test2(double %x, double %y) {
+; CHECK-LABEL: @test2
+; CHECK-NEXT: fmul double %y, 1.234000e-01
+; CHECK-NEXT: fsub double %x, %mul
+; CHECK-NEXT: fmul double %add{{.*}}, %add{{.*}}
+; CHECK-NEXT: ret double %mul
+
+  %mul = fmul double %y, -1.234000e-01
+  %add = fadd double %mul, %x
+  %mul1 = fmul double %y, -1.234000e-01
+  %add2 = fadd double %mul1, %x
+  %mul3 = fmul double %add, %add2
+  ret double %mul3
+}
+
+; (x + 0.1234 * y) * (x - -0.1234 * y) -> (x + 0.1234 * y) * (x + 0.1234 * y)
+define double @test3(double %x, double %y) {
+; CHECK-LABEL: @test3
+; CHECK-NEXT: fmul double %y, 1.234000e-01
+; CHECK-NEXT: fadd double %x, %mul
+; CHECK-NEXT: fmul double %add{{.*}}, %add{{.*}}
+; CHECK-NEXT: ret double
+
+  %mul = fmul double %y, 1.234000e-01
+  %add = fadd double %mul, %x
+  %mul1 = fmul double %y, -1.234000e-01
+  %add2 = fsub double %x, %mul1
+  %mul3 = fmul double %add, %add2
+  ret double %mul3
+}
+
+; Canonicalize (x - -1234 * y)
+define i64 @test4(i64 %x, i64 %y) {
+; CHECK-LABEL: @test4
+; CHECK-NEXT: mul i64 %y, 1234
+; CHECK-NEXT: add i64 %mul, %x
+; CHECK-NEXT: ret i64 %sub
+
+  %mul = mul i64 %y, -1234
+  %sub = sub i64 %x, %mul
+  ret i64 %sub
+}
+
+; Canonicalize (x - -0.1234 * y)
+define double @test5(double %x, double %y) {
+; CHECK-LABEL: @test5
+; CHECK-NEXT: fmul double %y, 1.234000e-01
+; CHECK-NEXT: fadd double %x, %mul
+; CHECK-NEXT: ret double
+
+  %mul = fmul double -1.234000e-01, %y
+  %sub = fsub double %x, %mul
+  ret double %sub
+}
+
+; Don't modify (-0.1234 * y - x)
+define double @test6(double %x, double %y) {
+; CHECK-LABEL: @test6
+; CHECK-NEXT: fmul double %y, -1.234000e-01
+; CHECK-NEXT: fsub double %mul, %x
+; CHECK-NEXT: ret double %sub
+
+  %mul = fmul double -1.234000e-01, %y
+  %sub = fsub double %mul, %x
+  ret double %sub
+}
+
+; Canonicalize (-0.1234 * y + x) -> (x - 0.1234 * y)
+define double @test7(double %x, double %y) {
+; CHECK-LABEL: @test7
+; CHECK-NEXT: fmul double %y, 1.234000e-01
+; CHECK-NEXT: fsub double %x, %mul
+; CHECK-NEXT: ret double %add
+
+  %mul = fmul double -1.234000e-01, %y
+  %add = fadd double %mul, %x
+  ret double %add
+}
+
+; Canonicalize (y * -0.1234 + x) -> (x - 0.1234 * y)
+define double @test8(double %x, double %y) {
+; CHECK-LABEL: @test8
+; CHECK-NEXT: fmul double %y, 1.234000e-01
+; CHECK-NEXT: fsub double %x, %mul
+; CHECK-NEXT: ret double %add
+
+  %mul = fmul double %y, -1.234000e-01
+  %add = fadd double %mul, %x
+  ret double %add
+}
+
+; Canonicalize (x - -0.1234 / y)
+define double @test9(double %x, double %y) {
+; CHECK-LABEL: @test9
+; CHECK-NEXT: fdiv double 1.234000e-01, %y
+; CHECK-NEXT: fadd double %x, %div
+; CHECK-NEXT: ret double
+
+  %div = fdiv double -1.234000e-01, %y
+  %sub = fsub double %x, %div
+  ret double %sub
+}
+
+; Don't modify (-0.1234 / y - x)
+define double @test10(double %x, double %y) {
+; CHECK-LABEL: @test10
+; CHECK-NEXT: fdiv double -1.234000e-01, %y
+; CHECK-NEXT: fsub double %div, %x
+; CHECK-NEXT: ret double %sub
+
+  %div = fdiv double -1.234000e-01, %y
+  %sub = fsub double %div, %x
+  ret double %sub
+}
+
+; Canonicalize (-0.1234 / y + x) -> (x - 0.1234 / y)
+define double @test11(double %x, double %y) {
+; CHECK-LABEL: @test11
+; CHECK-NEXT: fdiv double 1.234000e-01, %y
+; CHECK-NEXT: fsub double %x, %div
+; CHECK-NEXT: ret double %add
+
+  %div = fdiv double -1.234000e-01, %y
+  %add = fadd double %div, %x
+  ret double %add
+}
+
+; Canonicalize (y / -0.1234 + x) -> (x - y / 0.1234)
+define double @test12(double %x, double %y) {
+; CHECK-LABEL: @test12
+; CHECK-NEXT: fdiv double %y, 1.234000e-01
+; CHECK-NEXT: fsub double %x, %div
+; CHECK-NEXT: ret double %add
+
+  %div = fdiv double %y, -1.234000e-01
+  %add = fadd double %div, %x
+  ret double %add
+}

diff --git a/test/Transforms/Reassociate/commute.ll b/test/Transforms/Reassociate/commute.ll
new file mode 100644
index 0000000..760e51b
--- /dev/null
+++ b/test/Transforms/Reassociate/commute.ll

@@ -0,0 +1,19 @@
+; RUN: opt -reassociate -S < %s | FileCheck %s
+
+declare void @use(i32)
+
+define void @test1(i32 %x, i32 %y) {
+; CHECK-LABEL: test1
+; CHECK: mul i32 %y, %x
+; CHECK: mul i32 %y, %x
+; CHECK: sub i32 %1, %2
+; CHECK: call void @use(i32 %{{.*}})
+; CHECK: call void @use(i32 %{{.*}})
+
+  %1 = mul i32 %x, %y
+  %2 = mul i32 %y, %x
+  %3 = sub i32 %1, %2
+  call void @use(i32 %1)
+  call void @use(i32 %3)
+  ret void
+}

diff --git a/test/Transforms/Reassociate/fast-AgressiveSubMove.ll b/test/Transforms/Reassociate/fast-AgressiveSubMove.ll
new file mode 100644
index 0000000..0c28ed1
--- /dev/null
+++ b/test/Transforms/Reassociate/fast-AgressiveSubMove.ll

@@ -0,0 +1,24 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+define float @test1(float %A) {
+; CHECK-LABEL: test1
+; CHECK-NEXT: %X = fadd float %A, 1.000000e+00
+; CHECK-NEXT: %Y = fadd float %A, 1.000000e+00
+; CHECK-NEXT: %r = fsub float %X, %Y
+; CHECK-NEXT: ret float %r
+
+  %X = fadd float %A, 1.000000e+00
+  %Y = fadd float %A, 1.000000e+00
+  %r = fsub float %X, %Y
+  ret float %r
+}
+
+define float @test2(float %A) {
+; CHECK-LABEL: test2
+; CHECK-NEXT: ret float 0.000000e+00
+
+  %X = fadd fast float 1.000000e+00, %A
+  %Y = fadd fast float 1.000000e+00, %A
+  %r = fsub fast float %X, %Y
+  ret float %r
+}

diff --git a/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll b/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll
new file mode 100644
index 0000000..0109e4f
--- /dev/null
+++ b/test/Transforms/Reassociate/fast-ArrayOutOfBounds.ll

@@ -0,0 +1,65 @@
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+; Not marked as fast, so must not be reassociated (only operand order within an instruction may change).
+define float @test1(float %a0, float %a1, float %a2, float %a3, float %a4) {
+; CHECK-LABEL: test1
+; CHECK-NEXT: %tmp.2 = fadd float %a3, %a4
+; CHECK-NEXT: %tmp.4 = fadd float %tmp.2, %a2
+; CHECK-NEXT: %tmp.6 = fadd float %tmp.4, %a1
+; CHECK-NEXT: %tmp.8 = fadd float %tmp.6, %a0
+; CHECK-NEXT: %tmp.11 = fadd float %a2, %a3
+; CHECK-NEXT: %tmp.13 = fadd float %tmp.11, %a1
+; CHECK-NEXT: %tmp.15 = fadd float %tmp.13, %a0
+; CHECK-NEXT: %tmp.18 = fadd float %a1, %a2
+; CHECK-NEXT: %tmp.20 = fadd float %tmp.18, %a0
+; CHECK-NEXT: %tmp.23 = fadd float %a0, %a1
+; CHECK-NEXT: %tmp.26 = fsub float %tmp.8, %tmp.15
+; CHECK-NEXT: %tmp.28 = fadd float %tmp.20, %tmp.26
+; CHECK-NEXT: %tmp.30 = fsub float %tmp.28, %tmp.23
+; CHECK-NEXT: %tmp.32 = fsub float %tmp.30, %a4
+; CHECK-NEXT: %tmp.34 = fsub float %tmp.32, %a2
+; CHECK-NEXT: %T = fmul float %tmp.34, %tmp.34
+; CHECK-NEXT: ret float %T
+
+  %tmp.2 = fadd float %a4, %a3
+  %tmp.4 = fadd float %tmp.2, %a2
+  %tmp.6 = fadd float %tmp.4, %a1
+  %tmp.8 = fadd float %tmp.6, %a0
+  %tmp.11 = fadd float %a3, %a2
+  %tmp.13 = fadd float %tmp.11, %a1
+  %tmp.15 = fadd float %tmp.13, %a0
+  %tmp.18 = fadd float %a2, %a1
+  %tmp.20 = fadd float %tmp.18, %a0
+  %tmp.23 = fadd float %a1, %a0
+  %tmp.26 = fsub float %tmp.8, %tmp.15
+  %tmp.28 = fadd float %tmp.26, %tmp.20
+  %tmp.30 = fsub float %tmp.28, %tmp.23
+  %tmp.32 = fsub float %tmp.30, %a4
+  %tmp.34 = fsub float %tmp.32, %a2
+  %T = fmul float %tmp.34, %tmp.34
+  ret float %T
+}
+
+; Should be able to eliminate everything.
+define float @test2(float %a0, float %a1, float %a2, float %a3, float %a4) {
+; CHECK-LABEL: test2
+; CHECK: ret float 0.000000e+00
+
+  %tmp.2 = fadd fast float %a4, %a3
+  %tmp.4 = fadd fast float %tmp.2, %a2
+  %tmp.6 = fadd fast float %tmp.4, %a1
+  %tmp.8 = fadd fast float %tmp.6, %a0
+  %tmp.11 = fadd fast float %a3, %a2
+  %tmp.13 = fadd fast float %tmp.11, %a1
+  %tmp.15 = fadd fast float %tmp.13, %a0
+  %tmp.18 = fadd fast float %a2, %a1
+  %tmp.20 = fadd fast float %tmp.18, %a0
+  %tmp.23 = fadd fast float %a1, %a0
+  %tmp.26 = fsub fast float %tmp.8, %tmp.15
+  %tmp.28 = fadd fast float %tmp.26, %tmp.20
+  %tmp.30 = fsub fast float %tmp.28, %tmp.23
+  %tmp.32 = fsub fast float %tmp.30, %a4
+  %tmp.34 = fsub fast float %tmp.32, %a2
+  %T = fmul fast float %tmp.34, %tmp.34
+  ret float %T
+}

diff --git a/test/Transforms/Reassociate/fast-MissedTree.ll b/test/Transforms/Reassociate/fast-MissedTree.ll
new file mode 100644
index 0000000..689fd6c
--- /dev/null
+++ b/test/Transforms/Reassociate/fast-MissedTree.ll

@@ -0,0 +1,11 @@
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+define float @test1(float %A, float %B) {
+; CHECK-LABEL: test1
+; CHECK: %Z = fadd fast float %A, %B
+; CHECK: ret float %Z
+	%W = fadd fast float %B, -5.0
+	%Y = fadd fast float %A, 5.0
+	%Z = fadd fast float %W, %Y
+	ret float %Z
+}

diff --git a/test/Transforms/Reassociate/fast-ReassociateVector.ll b/test/Transforms/Reassociate/fast-ReassociateVector.ll
new file mode 100644
index 0000000..eeae096
--- /dev/null
+++ b/test/Transforms/Reassociate/fast-ReassociateVector.ll

@@ -0,0 +1,73 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+; Canonicalize operands, but don't optimize floating point vector operations.
+define <4 x float> @test1() {
+; CHECK-LABEL: test1
+; CHECK-NEXT: %tmp1 = fsub fast <4 x float> zeroinitializer, zeroinitializer
+; CHECK-NEXT: %tmp2 = fmul fast <4 x float> %tmp1, zeroinitializer
+
+  %tmp1 = fsub fast <4 x float> zeroinitializer, zeroinitializer
+  %tmp2 = fmul fast <4 x float> zeroinitializer, %tmp1
+  ret <4 x float> %tmp2
+}
+
+; Commute integer vector operations.
+define <2 x i32> @test2(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: test2
+; CHECK-NEXT: %tmp1 = add <2 x i32> %x, %y
+; CHECK-NEXT: %tmp2 = add <2 x i32> %x, %y
+; CHECK-NEXT: %tmp3 = add <2 x i32> %tmp1, %tmp2
+
+  %tmp1 = add <2 x i32> %x, %y
+  %tmp2 = add <2 x i32> %y, %x
+  %tmp3 = add <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test3(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: test3
+; CHECK-NEXT: %tmp1 = mul <2 x i32> %x, %y
+; CHECK-NEXT: %tmp2 = mul <2 x i32> %x, %y
+; CHECK-NEXT: %tmp3 = mul <2 x i32> %tmp1, %tmp2
+
+  %tmp1 = mul <2 x i32> %x, %y
+  %tmp2 = mul <2 x i32> %y, %x
+  %tmp3 = mul <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test4(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: test4
+; CHECK-NEXT: %tmp1 = and <2 x i32> %x, %y
+; CHECK-NEXT: %tmp2 = and <2 x i32> %x, %y
+; CHECK-NEXT: %tmp3 = and <2 x i32> %tmp1, %tmp2
+
+  %tmp1 = and <2 x i32> %x, %y
+  %tmp2 = and <2 x i32> %y, %x
+  %tmp3 = and <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test5(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: test5
+; CHECK-NEXT: %tmp1 = or <2 x i32> %x, %y
+; CHECK-NEXT: %tmp2 = or <2 x i32> %x, %y
+; CHECK-NEXT: %tmp3 = or <2 x i32> %tmp1, %tmp2
+
+  %tmp1 = or <2 x i32> %x, %y
+  %tmp2 = or <2 x i32> %y, %x
+  %tmp3 = or <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test6(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: test6
+; CHECK-NEXT: %tmp1 = xor <2 x i32> %x, %y
+; CHECK-NEXT: %tmp2 = xor <2 x i32> %x, %y
+; CHECK-NEXT: %tmp3 = xor <2 x i32> %tmp1, %tmp2
+
+  %tmp1 = xor <2 x i32> %x, %y
+  %tmp2 = xor <2 x i32> %y, %x
+  %tmp3 = xor <2 x i32> %tmp1, %tmp2
+  ret <2 x i32> %tmp3
+}

diff --git a/test/Transforms/Reassociate/fast-SubReassociate.ll b/test/Transforms/Reassociate/fast-SubReassociate.ll
new file mode 100644
index 0000000..db4191a
--- /dev/null
+++ b/test/Transforms/Reassociate/fast-SubReassociate.ll

@@ -0,0 +1,70 @@
+; RUN: opt < %s -reassociate -constprop -instcombine -S | FileCheck %s
+
+define float @test1(float %A, float %B) {
+; CHECK-LABEL: test1
+; CHECK-NEXT: %W = fadd float %B, 5.000000e+00
+; CHECK-NEXT: %X = fadd float %A, -7.000000e+00
+; CHECK-NEXT: %Y = fsub float %X, %W
+; CHECK-NEXT: %Z = fadd float %Y, 1.200000e+01
+; CHECK-NEXT: ret float %Z
+
+  %W = fadd float 5.0, %B
+  %X = fadd float -7.0, %A
+  %Y = fsub float %X, %W
+  %Z = fadd float %Y, 12.0
+  ret float %Z
+}
+
+; With sub reassociation, constant folding can eliminate all of the constants.
+define float @test2(float %A, float %B) {
+; CHECK-LABEL: test2
+; CHECK-NEXT: %Z = fsub fast float %A, %B
+; CHECK-NEXT: ret float %Z
+
+  %W = fadd fast float %B, 5.000000e+00
+  %X = fadd fast float %A, -7.000000e+00
+  %Y = fsub fast float %X, %W
+  %Z = fadd fast float %Y, 1.200000e+01
+  ret float %Z
+
+}
+
+define float @test3(float %A, float %B, float %C, float %D) {
+; CHECK-LABEL: test3
+; CHECK-NEXT: %M = fadd float %A, 1.200000e+01
+; CHECK-NEXT: %N = fadd float %M, %B
+; CHECK-NEXT: %O = fadd float %N, %C
+; CHECK-NEXT: %P = fsub float %D, %O
+; CHECK-NEXT: %Q = fadd float %P, 1.200000e+01
+; CHECK-NEXT: ret float %Q
+
+  %M = fadd float %A, 1.200000e+01
+  %N = fadd float %M, %B
+  %O = fadd float %N, %C
+  %P = fsub float %D, %O
+  %Q = fadd float %P, 1.200000e+01
+  ret float %Q
+}
+
+; With sub reassociation, constant folding can eliminate the two 12 constants.
+define float @test4(float %A, float %B, float %C, float %D) {
+; CHECK-LABEL: test4
+; CHECK-NEXT: %B.neg = fsub fast float -0.000000e+00, %B
+; CHECK-NEXT: %O.neg = fsub fast float %B.neg, %A
+; CHECK-NEXT: %P = fsub fast float %O.neg, %C
+; CHECK-NEXT: %Q = fadd fast float %P, %D
+; CHECK-NEXT: ret float %Q
+
+; FIXME: InstCombine should be able to get us to the following:
+; %sum = fadd fast float %B, %A
+; %sum1 = fadd fast float %sum, %C
+; %Q = fsub fast float %D, %sum1
+; ret float %Q
+
+  %M = fadd fast float 1.200000e+01, %A
+  %N = fadd fast float %M, %B
+  %O = fadd fast float %N, %C
+  %P = fsub fast float %D, %O
+  %Q = fadd fast float 1.200000e+01, %P
+  ret float %Q
+}

diff --git a/test/Transforms/Reassociate/fast-basictest.ll b/test/Transforms/Reassociate/fast-basictest.ll
new file mode 100644
index 0000000..67b07f4
--- /dev/null
+++ b/test/Transforms/Reassociate/fast-basictest.ll

@@ -0,0 +1,285 @@
+; RUN: opt < %s -reassociate -gvn -instcombine -S | FileCheck %s
+
+; With reassociation, constant folding can eliminate the 12 and -12 constants.
+define float @test1(float %arg) {
+; CHECK-LABEL: @test1
+; CHECK-NEXT: fsub fast float -0.000000e+00, %arg
+; CHECK-NEXT: ret float
+
+  %tmp1 = fsub fast float -1.200000e+01, %arg
+  %tmp2 = fadd fast float %tmp1, 1.200000e+01
+  ret float %tmp2
+}
+
+define float @test2(float %reg109, float %reg1111) {
+; CHECK-LABEL: @test2
+; CHECK-NEXT: fadd float %reg109, -3.000000e+01
+; CHECK-NEXT: fadd float %reg115, %reg1111
+; CHECK-NEXT: fadd float %reg116, 3.000000e+01
+; CHECK-NEXT: ret float
+
+  %reg115 = fadd float %reg109, -3.000000e+01
+  %reg116 = fadd float %reg115, %reg1111
+  %reg117 = fadd float %reg116, 3.000000e+01
+  ret float %reg117
+}
+
+define float @test3(float %reg109, float %reg1111) {
+; CHECK-LABEL: @test3
+; CHECK-NEXT: %reg117 = fadd fast float %reg109, %reg1111
+; CHECK-NEXT:  ret float %reg117
+
+  %reg115 = fadd fast float %reg109, -3.000000e+01
+  %reg116 = fadd fast float %reg115, %reg1111
+  %reg117 = fadd fast float %reg116, 3.000000e+01
+  ret float %reg117
+}
+
+@fe = external global float
+@fa = external global float
+@fb = external global float
+@fc = external global float
+@ff = external global float
+
+define void @test4() {
+; CHECK-LABEL: @test4
+; CHECK: fadd fast float
+; CHECK: fadd fast float
+; CHECK-NOT: fadd fast float
+; CHECK: ret void
+
+  %A = load float* @fa
+  %B = load float* @fb
+  %C = load float* @fc
+  %t1 = fadd fast float %A, %B
+  %t2 = fadd fast float %t1, %C
+  %t3 = fadd fast float %C, %A
+  %t4 = fadd fast float %t3, %B
+  ; e = (a+b)+c;
+  store float %t2, float* @fe
+  ; f = (a+c)+b
+  store float %t4, float* @ff
+  ret void
+}
+
+define void @test5() {
+; CHECK-LABEL: @test5
+; CHECK: fadd fast float
+; CHECK: fadd fast float
+; CHECK-NOT: fadd
+; CHECK: ret void
+
+  %A = load float* @fa
+  %B = load float* @fb
+  %C = load float* @fc
+  %t1 = fadd fast float %A, %B
+  %t2 = fadd fast float %t1, %C
+  %t3 = fadd fast float %C, %A
+  %t4 = fadd fast float %t3, %B
+  ; e = c+(a+b)
+  store float %t2, float* @fe
+  ; f = (c+a)+b
+  store float %t4, float* @ff
+  ret void
+}
+
+define void @test6() {
+; CHECK-LABEL: @test6
+; CHECK: fadd fast float
+; CHECK: fadd fast float
+; CHECK-NOT: fadd
+; CHECK: ret void
+
+  %A = load float* @fa
+  %B = load float* @fb
+  %C = load float* @fc
+  %t1 = fadd fast float %B, %A
+  %t2 = fadd fast float %t1, %C
+  %t3 = fadd fast float %C, %A
+  %t4 = fadd fast float %t3, %B
+  ; e = c+(b+a)
+  store float %t2, float* @fe
+  ; f = (c+a)+b
+  store float %t4, float* @ff
+  ret void
+}
+
+define float @test7(float %A, float %B, float %C) {
+; CHECK-LABEL: @test7
+; CHECK-NEXT: fadd fast float %C, %B
+; CHECK-NEXT: fmul fast float %A, %A
+; CHECK-NEXT: fmul fast float %1, %tmp2
+; CHECK-NEXT: ret float
+
+  %aa = fmul fast float %A, %A
+  %aab = fmul fast float %aa, %B
+  %ac = fmul fast float %A, %C
+  %aac = fmul fast float %ac, %A
+  %r = fadd fast float %aab, %aac
+  ret float %r
+}
+
+define float @test8(float %X, float %Y, float %Z) {
+; CHECK-LABEL: @test8
+; CHECK-NEXT: fmul fast float %Y, %X
+; CHECK-NEXT: fsub fast float %Z
+; CHECK-NEXT: ret float
+
+  %A = fsub fast float 0.0, %X
+  %B = fmul fast float %A, %Y
+  ; (-X)*Y + Z -> Z-X*Y
+  %C = fadd fast float %B, %Z
+  ret float %C
+}
+
+define float @test9(float %X) {
+; CHECK-LABEL: @test9
+; CHECK-NEXT: fmul fast float %X, 9.400000e+01
+; CHECK-NEXT: ret float
+
+  %Y = fmul fast float %X, 4.700000e+01
+  %Z = fadd fast float %Y, %Y
+  ret float %Z
+}
+
+define float @test10(float %X) {
+; CHECK-LABEL: @test10
+; CHECK-NEXT: fmul fast float %X, 3.000000e+00
+; CHECK-NEXT: ret float
+
+  %Y = fadd fast float %X ,%X
+  %Z = fadd fast float %Y, %X
+  ret float %Z
+}
+
+define float @test11(float %W) {
+; CHECK-LABEL: test11
+; CHECK-NEXT: fmul fast float %W, 3.810000e+02
+; CHECK-NEXT: ret float
+
+  %X = fmul fast float %W, 127.0
+  %Y = fadd fast float %X ,%X
+  %Z = fadd fast float %Y, %X
+  ret float %Z
+}
+
+define float @test12(float %X) {
+; CHECK-LABEL: @test12
+; CHECK-NEXT: fmul fast float %X, -3.000000e+00
+; CHECK-NEXT: fadd fast float %factor, 6.000000e+00
+; CHECK-NEXT: ret float
+
+  %A = fsub fast float 1.000000e+00, %X
+  %B = fsub fast float 2.000000e+00, %X
+  %C = fsub fast float 3.000000e+00, %X
+  %Y = fadd fast float %A ,%B
+  %Z = fadd fast float %Y, %C
+  ret float %Z
+}
+
+define float @test13(float %X1, float %X2, float %X3) {
+; CHECK-LABEL: @test13
+; CHECK-NEXT: fsub fast float %X3, %X2
+; CHECK-NEXT: fmul fast float {{.*}}, %X1
+; CHECK-NEXT: ret float
+
+  %A = fsub fast float 0.000000e+00, %X1
+  %B = fmul fast float %A, %X2   ; -X1*X2
+  %C = fmul fast float %X1, %X3  ; X1*X3
+  %D = fadd fast float %B, %C    ; -X1*X2 + X1*X3 -> X1*(X3-X2)
+  ret float %D
+}
+
+define float @test14(float %X1, float %X2) {
+; CHECK-LABEL: @test14
+; CHECK-NEXT: fsub fast float %X1, %X2
+; CHECK-NEXT: fmul fast float %1, 4.700000e+01
+; CHECK-NEXT: ret float
+
+  %B = fmul fast float %X1, 47.   ; X1*47
+  %C = fmul fast float %X2, -47.  ; X2*-47
+  %D = fadd fast float %B, %C    ; X1*47 + X2*-47 -> 47*(X1-X2)
+  ret float %D
+}
+
+define float @test15(float %arg) {
+; CHECK-LABEL: test15
+; CHECK-NEXT: fmul fast float %arg, 1.440000e+02
+; CHECK-NEXT: ret float %tmp2
+
+  %tmp1 = fmul fast float 1.200000e+01, %arg
+  %tmp2 = fmul fast float %tmp1, 1.200000e+01
+  ret float %tmp2
+}
+
+; (b+(a+1234))+-a -> b+1234
+define float @test16(float %b, float %a) {
+; CHECK-LABEL: @test16
+; CHECK-NEXT: fadd fast float %b, 1.234000e+03
+; CHECK-NEXT: ret float
+
+  %1 = fadd fast float %a, 1234.0
+  %2 = fadd fast float %b, %1
+  %3 = fsub fast float 0.0, %a
+  %4 = fadd fast float %2, %3
+  ret float %4
+}
+
+; Test that we can turn things like X*-(Y*Z) -> X*-1*Y*Z.
+
+define float @test17(float %a, float %b, float %z) {
+; CHECK-LABEL: test17
+; CHECK-NEXT: fmul fast float %a, 1.234500e+04
+; CHECK-NEXT: fmul fast float %e, %b
+; CHECK-NEXT: fmul fast float %f, %z
+; CHECK-NEXT: ret float
+
+  %c = fsub fast float 0.000000e+00, %z
+  %d = fmul fast float %a, %b
+  %e = fmul fast float %c, %d
+  %f = fmul fast float %e, 1.234500e+04
+  %g = fsub fast float 0.000000e+00, %f
+  ret float %g
+}
+
+define float @test18(float %a, float %b, float %z) {
+; CHECK-LABEL: test18
+; CHECK-NEXT: fmul fast float %a, 4.000000e+01
+; CHECK-NEXT: fmul fast float %e, %z
+; CHECK-NEXT: ret float
+
+  %d = fmul fast float %z, 4.000000e+01
+  %c = fsub fast float 0.000000e+00, %d
+  %e = fmul fast float %a, %c
+  %f = fsub fast float 0.000000e+00, %e
+  ret float %f
+}
+
+; With sub reassociation, constant folding can eliminate the 12 and -12 constants.
+define float @test19(float %A, float %B) {
+; CHECK-LABEL: @test19
+; CHECK-NEXT: fsub fast float %A, %B
+; CHECK-NEXT: ret float
+  %X = fadd fast float -1.200000e+01, %A
+  %Y = fsub fast float %X, %B
+  %Z = fadd fast float %Y, 1.200000e+01
+  ret float %Z
+}
+
+; With sub reassociation, constant folding can eliminate the uses of %a.
+define float @test20(float %a, float %b, float %c) nounwind  {
+; CHECK-LABEL: @test20
+; CHECK-NEXT: fsub fast float -0.000000e+00, %b
+; CHECK-NEXT: fsub fast float %b.neg, %c
+; CHECK-NEXT: ret float
+
+; FIXME: Should be able to generate the below, which may expose more
+;        opportunities for FAdd reassociation.
+; %sum = fadd fast float %c, %b
+; %tmp7 = fsub fast float 0, %sum
+
+  %tmp3 = fsub fast float %a, %b
+  %tmp5 = fsub fast float %tmp3, %c
+  %tmp7 = fsub fast float %tmp5, %a
+  ret float %tmp7
+}

diff --git a/test/Transforms/Reassociate/fast-fp-commute.ll b/test/Transforms/Reassociate/fast-fp-commute.ll
new file mode 100644
index 0000000..ad89607
--- /dev/null
+++ b/test/Transforms/Reassociate/fast-fp-commute.ll

@@ -0,0 +1,44 @@
+; RUN: opt -reassociate -S < %s | FileCheck %s
+
+declare void @use(float)
+
+define void @test1(float %x, float %y) {
+; CHECK-LABEL: test1
+; CHECK: fmul fast float %y, %x
+; CHECK: fmul fast float %y, %x
+; CHECK: fsub fast float %1, %2
+; CHECK: call void @use(float %{{.*}})
+; CHECK: call void @use(float %{{.*}})
+
+  %1 = fmul fast float %x, %y
+  %2 = fmul fast float %y, %x
+  %3 = fsub fast float %1, %2
+  call void @use(float %1)
+  call void @use(float %3)
+  ret void
+}
+
+define float @test2(float %x, float %y) {
+; CHECK-LABEL: test2
+; CHECK-NEXT: fmul fast float %y, %x
+; CHECK-NEXT: fmul fast float %y, %x
+; CHECK-NEXT: fsub fast float %1, %2
+; CHECK-NEXT: ret float %3
+
+  %1 = fmul fast float %x, %y
+  %2 = fmul fast float %y, %x
+  %3 = fsub fast float %1, %2
+  ret float %3
+}
+
+define float @test3(float %x, float %y) {
+; CHECK-LABEL: test3
+; CHECK-NEXT: %factor = fmul fast float %y, 2.000000e+00
+; CHECK-NEXT: %tmp1 = fmul fast float %factor, %x
+; CHECK-NEXT: ret float %tmp1
+
+  %1 = fmul fast float %x, %y
+  %2 = fmul fast float %y, %x
+  %3 = fadd fast float %1, %2
+  ret float %3
+}

diff --git a/test/Transforms/Reassociate/fast-mightymul.ll b/test/Transforms/Reassociate/fast-mightymul.ll
new file mode 100644
index 0000000..98bdf7a
--- /dev/null
+++ b/test/Transforms/Reassociate/fast-mightymul.ll

@@ -0,0 +1,35 @@
+; RUN: opt < %s -reassociate -disable-output
+; PR13021
+
+define float @test2(float %x) {
+  %t0 = fmul fast float %x, %x
+  %t1 = fmul fast float %t0, %t0
+  %t2 = fmul fast float %t1, %t1
+  %t3 = fmul fast float %t2, %t2
+  %t4 = fmul fast float %t3, %t3
+  %t5 = fmul fast float %t4, %t4
+  %t6 = fmul fast float %t5, %t5
+  %t7 = fmul fast float %t6, %t6
+  %t8 = fmul fast float %t7, %t7
+  %t9 = fmul fast float %t8, %t8
+  %t10 = fmul fast float %t9, %t9
+  %t11 = fmul fast float %t10, %t10
+  %t12 = fmul fast float %t11, %t11
+  %t13 = fmul fast float %t12, %t12
+  %t14 = fmul fast float %t13, %t13
+  %t15 = fmul fast float %t14, %t14
+  %t16 = fmul fast float %t15, %t15
+  %t17 = fmul fast float %t16, %t16
+  %t18 = fmul fast float %t17, %t17
+  %t19 = fmul fast float %t18, %t18
+  %t20 = fmul fast float %t19, %t19
+  %t21 = fmul fast float %t20, %t20
+  %t22 = fmul fast float %t21, %t21
+  %t23 = fmul fast float %t22, %t22
+  %t24 = fmul fast float %t23, %t23
+  %t25 = fmul fast float %t24, %t24
+  %t26 = fmul fast float %t25, %t25
+  %t27 = fmul fast float %t26, %t26
+  %t28 = fmul fast float %t27, %t27
+  ret float %t28
+}

diff --git a/test/Transforms/Reassociate/fast-multistep.ll b/test/Transforms/Reassociate/fast-multistep.ll
new file mode 100644
index 0000000..45e15c7
--- /dev/null
+++ b/test/Transforms/Reassociate/fast-multistep.ll

@@ -0,0 +1,32 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+
+define float @fmultistep1(float %a, float %b, float %c) {
+; Check that a*a*b+a*a*c is turned into a*(a*(b+c)).
+; CHECK-LABEL: @fmultistep1
+; CHECK-NEXT: fadd fast float %c, %b
+; CHECK-NEXT: fmul fast float %a, %tmp2
+; CHECK-NEXT: fmul fast float %tmp3, %a
+; CHECK-NEXT: ret float
+
+  %t0 = fmul fast float %a, %b
+  %t1 = fmul fast float %a, %t0 ; a*(a*b)
+  %t2 = fmul fast float %a, %c
+  %t3 = fmul fast float %a, %t2 ; a*(a*c)
+  %t4 = fadd fast float %t1, %t3
+  ret float %t4
+}
+
+define float @fmultistep2(float %a, float %b, float %c, float %d) {
+; Check that a*b+a*c+d is turned into a*(b+c)+d.
+; CHECK-LABEL: @fmultistep2
+; CHECK-NEXT: fadd fast float %c, %b
+; CHECK-NEXT: fmul fast float %tmp, %a
+; CHECK-NEXT: fadd fast float %tmp1, %d
+; CHECK-NEXT: ret float
+
+  %t0 = fmul fast float %a, %b
+  %t1 = fmul fast float %a, %c
+  %t2 = fadd fast float %t1, %d ; a*c+d
+  %t3 = fadd fast float %t0, %t2 ; a*b+(a*c+d)
+  ret float %t3
+}

diff --git a/test/Transforms/Reassociate/mixed-fast-nonfast-fp.ll b/test/Transforms/Reassociate/mixed-fast-nonfast-fp.ll
new file mode 100644
index 0000000..f51c0c1
--- /dev/null
+++ b/test/Transforms/Reassociate/mixed-fast-nonfast-fp.ll

@@ -0,0 +1,18 @@
+; RUN: opt -reassociate %s -S | FileCheck %s
+
+define float @foo(float %a,float %b, float %c) {
+; CHECK: %mul3 = fmul float %a, %b
+; CHECK-NEXT: fmul fast float %c, 2.000000e+00
+; CHECK-NEXT: fadd fast float %factor, %b
+; CHECK-NEXT: fmul fast float %tmp1, %a
+; CHECK-NEXT: fadd fast float %tmp2, %mul3
+; CHECK-NEXT: ret float
+  %mul1 = fmul fast float %a, %c
+  %mul2 = fmul fast float %a, %b
+  %mul3 = fmul float %a, %b
+  %mul4 = fmul fast float %a, %c
+  %add1 = fadd fast  float %mul1, %mul3
+  %add2 = fadd  fast float %mul4, %mul2
+  %add3 = fadd fast float %add1, %add2
+  ret float %add3
+}

diff --git a/test/Transforms/Reassociate/multistep.ll b/test/Transforms/Reassociate/multistep.ll
index 12eaeee..c499646 100644
--- a/test/Transforms/Reassociate/multistep.ll
+++ b/test/Transforms/Reassociate/multistep.ll

@@ -9,7 +9,7 @@
   %t3 = mul i64 %a, %t2 ; a*(a*c)
   %t4 = add i64 %t1, %t3
 ; CHECK-NEXT: add i64 %c, %b
-; CHECK-NEXT: mul i64 %tmp{{.*}}, %a
+; CHECK-NEXT: mul i64 %a, %tmp{{.*}}
 ; CHECK-NEXT: mul i64 %tmp{{.*}}, %a
 ; CHECK-NEXT: ret
   ret i64 %t4

diff --git a/test/Transforms/Reassociate/negation1.ll b/test/Transforms/Reassociate/negation1.ll
new file mode 100644
index 0000000..34b943c
--- /dev/null
+++ b/test/Transforms/Reassociate/negation1.ll

@@ -0,0 +1,15 @@
+; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
+
+; Test that we can turn things like A*B + X - A*B -> X.
+
+define i32 @test1(i32 %a, i32 %b, i32 %x) {
+; CHECK-LABEL: test1
+; CHECK: ret i32 %x
+
+  %c = mul i32 %a, %b
+  %d = add i32 %c, %x
+  %c1 = mul i32 %a, %b
+  %f = sub i32 %d, %c1
+  ret i32 %f
+}
+

diff --git a/test/Transforms/Reassociate/pr21205.ll b/test/Transforms/Reassociate/pr21205.ll
new file mode 100644
index 0000000..fcc7150
--- /dev/null
+++ b/test/Transforms/Reassociate/pr21205.ll

@@ -0,0 +1,21 @@
+; RUN: opt -reassociate -S < %s | FileCheck %s
+; PR21205
+
+@a = common global i32 0, align 4
+@b = common global i32 0, align 4
+
+; Don't canonicalize %conv - undef into %conv + (-undef).
+; CHECK-LABEL: @test1
+; CHECK: %sub = fsub fast float %conv, undef
+; CHECK: %sub1 = fadd fast float %sub, -1.000000e+00
+
+define i32 @test1() {
+entry:
+  %0 = load i32* @a, align 4
+  %conv = sitofp i32 %0 to float
+  %sub = fsub fast float %conv, undef
+  %sub1 = fadd fast float %sub, -1.000000e+00
+  %conv2 = fptosi float %sub1 to i32
+  store i32 %conv2, i32* @b, align 4
+  ret i32 undef
+}

diff --git a/test/Transforms/Reassociate/wrap-flags.ll b/test/Transforms/Reassociate/wrap-flags.ll
new file mode 100644
index 0000000..e3304b6
--- /dev/null
+++ b/test/Transforms/Reassociate/wrap-flags.ll

@@ -0,0 +1,34 @@
+; RUN: opt < %s -reassociate -dce -S | FileCheck %s
+; PR12985
+
+; Verify the nuw/nsw flags are kept only when valid when converting shl to mul.
+
+; CHECK-LABEL: @shl_to_mul_nsw(
+; CHECK: %mul = mul i32 %i, -2147483648
+; CHECK: add i32 %mul, 1
+define i32 @shl_to_mul_nsw(i32 %i) {
+entry:
+  %mul = shl nsw i32 %i, 31
+  %mul2 = add i32 %mul, 1
+  ret i32 %mul2
+}
+
+; CHECK-LABEL: @shl_to_mul_nuw(
+; CHECK: %mul = mul nuw i32 %i, 4
+; CHECK: add i32 %mul, 1
+define i32 @shl_to_mul_nuw(i32 %i) {
+entry:
+  %mul = shl nuw i32 %i, 2
+  %mul2 = add i32 %mul, 1
+  ret i32 %mul2
+}
+
+; CHECK-LABEL: @shl_to_mul_nuw_nsw(
+; CHECK: %mul = mul nuw nsw i32 %i, 4
+; CHECK: add i32 %mul, 1
+define i32 @shl_to_mul_nuw_nsw(i32 %i) {
+entry:
+  %mul = shl nuw nsw i32 %i, 2
+  %mul2 = add i32 %mul, 1
+  ret i32 %mul2
+}

diff --git a/test/Transforms/SCCP/ipsccp-basic.ll b/test/Transforms/SCCP/ipsccp-basic.ll
index c1c6c92..107b7af 100644
--- a/test/Transforms/SCCP/ipsccp-basic.ll
+++ b/test/Transforms/SCCP/ipsccp-basic.ll

@@ -227,3 +227,23 @@
 ; CHECK-LABEL: define internal i32 @test10b(
 ; CHECK: ret i32 undef
 }
+
+;;======================== test11
+
+define i64 @test11a() {
+  %xor = xor i64 undef, undef
+  ret i64 %xor
+; CHECK-LABEL: define i64 @test11a
+; CHECK: ret i64 0
+}
+
+define void @test11b() {
+  %call1 = call i64 @test11a()
+  %call2 = call i64 @llvm.ctpop.i64(i64 %call1)
+  ret void
+; CHECK-LABEL: define void @test11b
+; CHECK: %[[call1:.*]] = call i64 @test11a()
+; CHECK: %[[call2:.*]] = call i64 @llvm.ctpop.i64(i64 0)
+}
+
+declare i64 @llvm.ctpop.i64(i64)

diff --git a/test/Transforms/SLPVectorizer/AArch64/commute.ll b/test/Transforms/SLPVectorizer/AArch64/commute.ll
new file mode 100644
index 0000000..4ee91a5
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/AArch64/commute.ll

@@ -0,0 +1,75 @@
+; RUN: opt -S -slp-vectorizer %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%structA = type { [2 x float] }
+
+define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
+; CHECK-LABEL: test1
+; CHECK: %arrayidx4 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 0
+; CHECK: %arrayidx9 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 1
+; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>*
+; CHECK: %4 = load <2 x float>* %3, align 4
+; CHECK: %5 = fsub fast <2 x float> %2, %4
+; CHECK: %6 = fmul fast <2 x float> %5, %5
+; CHECK: %7 = extractelement <2 x float> %6, i32 0
+; CHECK: %8 = extractelement <2 x float> %6, i32 1
+; CHECK: %add = fadd fast float %7, %8
+; CHECK: %cmp = fcmp oeq float %add, 0.000000e+00
+
+entry:
+  br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+  %conv5 = sitofp i32 %ymin to float
+  %conv = sitofp i32 %xmin to float
+  %arrayidx4 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 0
+  %0 = load float* %arrayidx4, align 4
+  %sub = fsub fast float %conv, %0
+  %arrayidx9 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 1
+  %1 = load float* %arrayidx9, align 4
+  %sub10 = fsub fast float %conv5, %1
+  %mul11 = fmul fast float %sub, %sub
+  %mul12 = fmul fast float %sub10, %sub10
+  %add = fadd fast float %mul11, %mul12
+  %cmp = fcmp oeq float %add, 0.000000e+00
+  br i1 %cmp, label %for.body3.lr.ph, label %for.end27
+
+for.end27:
+  ret void
+}
+
+define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
+; CHECK-LABEL: test2
+; CHECK: %arrayidx4 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 0
+; CHECK: %arrayidx9 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 1
+; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>*
+; CHECK: %4 = load <2 x float>* %3, align 4
+; CHECK: %5 = fsub fast <2 x float> %2, %4
+; CHECK: %6 = fmul fast <2 x float> %5, %5
+; CHECK: %7 = extractelement <2 x float> %6, i32 0
+; CHECK: %8 = extractelement <2 x float> %6, i32 1
+; CHECK: %add = fadd fast float %8, %7
+; CHECK: %cmp = fcmp oeq float %add, 0.000000e+00
+
+entry:
+  br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+  %conv5 = sitofp i32 %ymin to float
+  %conv = sitofp i32 %xmin to float
+  %arrayidx4 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 0
+  %0 = load float* %arrayidx4, align 4
+  %sub = fsub fast float %conv, %0
+  %arrayidx9 = getelementptr inbounds %structA* %J, i64 0, i32 0, i64 1
+  %1 = load float* %arrayidx9, align 4
+  %sub10 = fsub fast float %conv5, %1
+  %mul11 = fmul fast float %sub, %sub
+  %mul12 = fmul fast float %sub10, %sub10
+  %add = fadd fast float %mul12, %mul11         ;;;<---- Operands commuted!!
+  %cmp = fcmp oeq float %add, 0.000000e+00
+  br i1 %cmp, label %for.body3.lr.ph, label %for.end27
+
+for.end27:
+  ret void
+}

diff --git a/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll b/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll
new file mode 100644
index 0000000..45fa2f9
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll

@@ -0,0 +1,46 @@
+; RUN: opt -S -basicaa -slp-vectorizer < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+; Holding a value live over a call boundary may require
+; spills and fills. This is the case for <2 x double>,
+; as it occupies a Q register of which there are no
+; callee-saves.
+ 
+; CHECK: load double
+; CHECK: load double
+; CHECK: call void @g
+; CHECK: store double
+; CHECK: store double
+define void @f(double* %p, double* %q) {
+  %addr2 = getelementptr double* %q, i32 1
+  %addr = getelementptr double* %p, i32 1
+  %x = load double* %p
+  %y = load double* %addr
+  call void @g()
+  store double %x, double* %q
+  store double %y, double* %addr2
+  ret void
+}
+declare void @g()
+
+; Check we deal with loops correctly.
+;
+; CHECK: store <2 x double>
+; CHECK: load <2 x double>
+define void @f2(double* %p, double* %q) {
+entry:
+  br label %loop
+
+loop:
+  %p1 = phi double [0.0, %entry], [%x, %loop]
+  %p2 = phi double [0.0, %entry], [%y, %loop]
+  %addr2 = getelementptr double* %q, i32 1
+  %addr = getelementptr double* %p, i32 1
+  store double %p1, double* %q
+  store double %p2, double* %addr2
+
+  %x = load double* %p
+  %y = load double* %addr
+  br label %loop
+}

diff --git a/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll b/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
new file mode 100644
index 0000000..e49c7ad
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll

@@ -0,0 +1,42 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @test1
+; CHECK: load <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: sdiv <4 x i32>
+
+define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c) {
+entry:
+  %0 = load i32* %b, align 4
+  %1 = load i32* %c, align 4
+  %add = add nsw i32 %1, %0
+  %div = sdiv i32 %add, 2
+  store i32 %div, i32* %a, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i64 1
+  %2 = load i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32* %c, i64 1
+  %3 = load i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %3, %2
+  %div6 = sdiv i32 %add5, 2
+  %arrayidx7 = getelementptr inbounds i32* %a, i64 1
+  store i32 %div6, i32* %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds i32* %b, i64 2
+  %4 = load i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32* %c, i64 2
+  %5 = load i32* %arrayidx9, align 4
+  %add10 = add nsw i32 %5, %4
+  %div11 = sdiv i32 %add10, 2
+  %arrayidx12 = getelementptr inbounds i32* %a, i64 2
+  store i32 %div11, i32* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32* %b, i64 3
+  %6 = load i32* %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i32* %c, i64 3
+  %7 = load i32* %arrayidx14, align 4
+  %add15 = add nsw i32 %7, %6
+  %div16 = sdiv i32 %add15, 2
+  %arrayidx17 = getelementptr inbounds i32* %a, i64 3
+  store i32 %div16, i32* %arrayidx17, align 4
+  ret void
+}

diff --git a/test/Transforms/SLPVectorizer/ARM/sroa.ll b/test/Transforms/SLPVectorizer/ARM/sroa.ll
index e0c75b1..899cfb1 100644
--- a/test/Transforms/SLPVectorizer/ARM/sroa.ll
+++ b/test/Transforms/SLPVectorizer/ARM/sroa.ll

@@ -5,11 +5,11 @@
 %class.Complex = type { double, double }
 
 ; Code like this is the result of SROA. Make sure we don't vectorize this
-; because the in the scalar version of this the shl/or are handled by the
+; because the scalar versions of the shl/or are handled by the
 ; backend and disappear, the vectorized code stays.
 
 ; CHECK-LABEL: SROAed
-; CHECK-NOT: shl <2 x i64>
+; CHECK-NOT: shl nuw <2 x i64>
 ; CHECK-NOT: or <2 x i64>
 
 define void @SROAed(%class.Complex* noalias nocapture sret %agg.result, [4 x i32] %a.coerce, [4 x i32] %b.coerce) {

diff --git a/test/Transforms/SLPVectorizer/X86/addsub.ll b/test/Transforms/SLPVectorizer/X86/addsub.ll
index 8303bc8..174d400 100644
--- a/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/test/Transforms/SLPVectorizer/X86/addsub.ll

@@ -12,9 +12,9 @@
 @fa = common global [4 x float] zeroinitializer, align 16
 
 ; CHECK-LABEL: @addsub
-; CHECK: %5 = add <4 x i32> %3, %4
-; CHECK: %6 = add <4 x i32> %2, %5
-; CHECK: %7 = sub <4 x i32> %2, %5
+; CHECK: %5 = add nsw <4 x i32> %3, %4
+; CHECK: %6 = add nsw <4 x i32> %2, %5
+; CHECK: %7 = sub nsw <4 x i32> %2, %5
 ; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 
 ; Function Attrs: nounwind uwtable
@@ -56,9 +56,9 @@
 }
 
 ; CHECK-LABEL: @subadd
-; CHECK:  %5 = add <4 x i32> %3, %4
-; CHECK:  %6 = sub <4 x i32> %2, %5
-; CHECK:  %7 = add <4 x i32> %2, %5
+; CHECK:  %5 = add nsw <4 x i32> %3, %4
+; CHECK:  %6 = sub nsw <4 x i32> %2, %5
+; CHECK:  %7 = add nsw <4 x i32> %2, %5
 ; CHECK:  %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
 
 ; Function Attrs: nounwind uwtable

diff --git a/test/Transforms/SLPVectorizer/X86/align.ll b/test/Transforms/SLPVectorizer/X86/align.ll
index f586573..ce80620 100644
--- a/test/Transforms/SLPVectorizer/X86/align.ll
+++ b/test/Transforms/SLPVectorizer/X86/align.ll

@@ -4,7 +4,7 @@
 target triple = "x86_64-apple-macosx10.8.0"
 
 ; Simple 3-pair chain with loads and stores
-; CHECK: test1
+; CHECK-LABEL: @test1
 define void @test1(double* %a, double* %b, double* %c) {
 entry:
   %agg.tmp.i.i.sroa.0 = alloca [3 x double], align 16
@@ -25,3 +25,31 @@
 ; CHECK: ret
   ret void
 }
+
+; Float has 4 byte ABI alignment on x86_64. We must use the alignment of the
+; value being loaded/stored, not the alignment of the pointer type.
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: align 8
+; CHECK: load <4 x float>{{.*}}, align 4
+; CHECK: store <4 x float>{{.*}}, align 4
+; CHECK: ret
+
+define void @test2(float * %a, float * %b) {
+entry:
+  %l0 = load float* %a
+  %a1 = getelementptr inbounds float* %a, i64 1
+  %l1 = load float* %a1
+  %a2 = getelementptr inbounds float* %a, i64 2
+  %l2 = load float* %a2
+  %a3 = getelementptr inbounds float* %a, i64 3
+  %l3 = load float* %a3
+  store float %l0, float* %b
+  %b1 = getelementptr inbounds float* %b, i64 1
+  store float %l1, float* %b1
+  %b2 = getelementptr inbounds float* %b, i64 2
+  store float %l2, float* %b2
+  %b3 = getelementptr inbounds float* %b, i64 3
+  store float %l3, float* %b3
+  ret void
+}

diff --git a/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll b/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll
new file mode 100644
index 0000000..dc99366
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll

@@ -0,0 +1,41 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin13.3.0"
+
+@a = common global double 0.000000e+00, align 8
+
+define i32 @fn1() {
+entry:
+  %init = load double* @a, align 8
+  br label %loop
+
+loop:
+  %phi = phi double [ %add2, %loop ], [ %init, %entry ]
+  %postadd1_phi = phi double [ %postadd1, %loop ], [ %init, %entry ]
+  %postadd2_phi = phi double [ %postadd2, %loop ], [ %init, %entry ]
+  %add1 = fadd double %postadd1_phi, undef
+  %add2 = fadd double %postadd2_phi, %phi
+  %mul2 = fmul double %add2, 0.000000e+00
+  %binaryop_B = fadd double %postadd1_phi, %mul2
+  %mul1 = fmul double %add1, 0.000000e+00
+  %tmp = fadd double %postadd2_phi, 0.000000e+00
+
+  ; tryToVectorize() starts with this binary instruction.
+  ; At the same time vectorization wraps around the loop, vectorizes
+  ; postadd1/2 and eventually binary_V and tmp. So binary_V itself is replaced
+  ; with a vector instruction.
+  ; The SLPVectorizer crashed because it tried to use binary_V
+  ; after vectorization to re-arrange instructions.
+  %binary_V = fadd double %mul1, %binaryop_B
+
+  %postadd1 = fadd double %binary_V, 0.000000e+00
+  %postadd2 = fadd double %tmp, 1.000000e+00
+  %tobool = fcmp une double %postadd1, 0.000000e+00
+  br i1 %tobool, label %exit, label %loop
+
+exit:
+  ret i32 1
+}
+
+

diff --git a/test/Transforms/SLPVectorizer/X86/crash_gep.ll b/test/Transforms/SLPVectorizer/X86/crash_gep.ll
new file mode 100644
index 0000000..dd4034c
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_gep.ll

@@ -0,0 +1,19 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-unknown-linux-gnu
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common global i64* null, align 8
+
+; Function Attrs: nounwind uwtable
+define i32 @fn1() {
+entry:
+  %0 = load i64** @a, align 8
+  %add.ptr = getelementptr inbounds i64* %0, i64 1
+  %1 = ptrtoint i64* %add.ptr to i64
+  %arrayidx = getelementptr inbounds i64* %0, i64 2
+  store i64 %1, i64* %arrayidx, align 8
+  %2 = ptrtoint i64* %arrayidx to i64
+  store i64 %2, i64* %add.ptr, align 8
+  ret i32 undef
+}

diff --git a/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll b/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll
new file mode 100644
index 0000000..dddc1be
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll

@@ -0,0 +1,47 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin13.3.0"
+
+define void @_foo(double %p1, double %p2, double %p3) #0 {
+entry:
+  %tab1 = alloca [256 x i32], align 16
+  %tab2 = alloca [256 x i32], align 16
+  br label %bb1
+
+
+bb1:
+  %mul19 = fmul double %p1, 1.638400e+04
+  %mul20 = fmul double %p3, 1.638400e+04
+  %add = fadd double %mul20, 8.192000e+03
+  %mul21 = fmul double %p2, 1.638400e+04
+  ; The SLPVectorizer crashed when scheduling this block after it inserted an
+  ; insertelement instruction (during vectorizing the for.body block) at this position.
+  br label %for.body
+
+for.body:
+  %indvars.iv266 = phi i64 [ 0, %bb1 ], [ %indvars.iv.next267, %for.body ]
+  %t.0259 = phi double [ 0.000000e+00, %bb1 ], [ %add27, %for.body ]
+  %p3.addr.0258 = phi double [ %add, %bb1 ], [ %add28, %for.body ]
+  %vecinit.i.i237 = insertelement <2 x double> undef, double %t.0259, i32 0
+  %x13 = tail call i32 @_xfn(<2 x double> %vecinit.i.i237) #2
+  %arrayidx = getelementptr inbounds [256 x i32]* %tab1, i64 0, i64 %indvars.iv266
+  store i32 %x13, i32* %arrayidx, align 4, !tbaa !4
+  %vecinit.i.i = insertelement <2 x double> undef, double %p3.addr.0258, i32 0
+  %x14 = tail call i32 @_xfn(<2 x double> %vecinit.i.i) #2
+  %arrayidx26 = getelementptr inbounds [256 x i32]* %tab2, i64 0, i64 %indvars.iv266
+  store i32 %x14, i32* %arrayidx26, align 4, !tbaa !4
+  %add27 = fadd double %mul19, %t.0259
+  %add28 = fadd double %mul21, %p3.addr.0258
+  %indvars.iv.next267 = add nuw nsw i64 %indvars.iv266, 1
+  %exitcond = icmp eq i64 %indvars.iv.next267, 256
+  br i1 %exitcond, label %return, label %for.body
+
+return:
+  ret void
+}
+
+declare i32 @_xfn(<2 x double>) #4
+
+!3 = metadata !{metadata !"int", metadata !4, i64 0}
+!4 = metadata !{metadata !3, metadata !3, i64 0}

diff --git a/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll b/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
index c7ec98a..9f1fb71 100644
--- a/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
+++ b/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll

@@ -1,4 +1,4 @@
-; RUN: opt -slp-vectorizer -mtriple=x86_64-apple-macosx10.9.0 -mcpu=corei7-avx -S < %s | FileCheck %s
+; RUN: opt -basicaa -slp-vectorizer -mtriple=x86_64-apple-macosx10.9.0 -mcpu=corei7-avx -S < %s | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.9.0"
 

diff --git a/test/Transforms/SLPVectorizer/X86/cycle_dup.ll b/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
index fba3549..bac2c3c 100644
--- a/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
+++ b/test/Transforms/SLPVectorizer/X86/cycle_dup.ll

@@ -15,7 +15,7 @@
 ;CHECK: bitcast i32* %A to <4 x i32>*
 ;CHECK-NEXT: load <4 x i32>
 ;CHECK: phi <4 x i32>
-;CHECK-NEXT: mul <4 x i32>
+;CHECK-NEXT: mul nsw <4 x i32>
 ;CHECK-NOT: mul
 ;CHECK: phi <4 x i32>
 ;CHECK: bitcast i32* %A to <4 x i32>*

diff --git a/test/Transforms/SLPVectorizer/X86/debug_info.ll b/test/Transforms/SLPVectorizer/X86/debug_info.ll
index f4e68f2..1046087 100644
--- a/test/Transforms/SLPVectorizer/X86/debug_info.ll
+++ b/test/Transforms/SLPVectorizer/X86/debug_info.ll

@@ -23,11 +23,11 @@
 
 define i32 @depth(double* nocapture %A, i32 %m) #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{double* %A}, i64 0, metadata !12), !dbg !19
-  tail call void @llvm.dbg.value(metadata !{i32 %m}, i64 0, metadata !13), !dbg !19
-  tail call void @llvm.dbg.value(metadata !20, i64 0, metadata !14), !dbg !21
-  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !15), !dbg !21
-  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !16), !dbg !23
+  tail call void @llvm.dbg.value(metadata !{double* %A}, i64 0, metadata !12, metadata !{}), !dbg !19
+  tail call void @llvm.dbg.value(metadata !{i32 %m}, i64 0, metadata !13, metadata !{}), !dbg !19
+  tail call void @llvm.dbg.value(metadata !20, i64 0, metadata !14, metadata !{}), !dbg !21
+  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !15, metadata !{}), !dbg !21
+  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !16, metadata !{}), !dbg !23
   %cmp8 = icmp sgt i32 %m, 0, !dbg !23
   br i1 %cmp8, label %for.body.lr.ph, label %for.end, !dbg !23
 
@@ -49,7 +49,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -57,24 +57,24 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!18, !32}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 187335) (llvm/trunk 187335:187340M)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/nadav/file.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 187335) (llvm/trunk 187335:187340M)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/Users/nadav/file.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"file.c", metadata !"/Users/nadav"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"depth", metadata !"depth", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (double*, i32)* @depth, null, null, metadata !11, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [depth]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/nadav/file.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00depth\00depth\00\001\000\001\000\006\00256\001\001", metadata !1, metadata !5, metadata !6, null, i32 (double*, i32)* @depth, null, null, metadata !11} ; [ DW_TAG_subprogram ] [line 1] [def] [depth]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/Users/nadav/file.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !9, metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
-!10 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
+!10 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
 !11 = metadata !{metadata !12, metadata !13, metadata !14, metadata !15, metadata !16}
-!12 = metadata !{i32 786689, metadata !4, metadata !"A", metadata !5, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [A] [line 1]
-!13 = metadata !{i32 786689, metadata !4, metadata !"m", metadata !5, i32 33554433, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [m] [line 1]
-!14 = metadata !{i32 786688, metadata !4, metadata !"y0", metadata !5, i32 2, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [y0] [line 2]
-!15 = metadata !{i32 786688, metadata !4, metadata !"y1", metadata !5, i32 2, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [y1] [line 2]
-!16 = metadata !{i32 786688, metadata !17, metadata !"i", metadata !5, i32 3, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3]
-!17 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
+!12 = metadata !{metadata !"0x101\00A\0016777217\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [A] [line 1]
+!13 = metadata !{metadata !"0x101\00m\0033554433\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [m] [line 1]
+!14 = metadata !{metadata !"0x100\00y0\002\000", metadata !4, metadata !5, metadata !10} ; [ DW_TAG_auto_variable ] [y0] [line 2]
+!15 = metadata !{metadata !"0x100\00y1\002\000", metadata !4, metadata !5, metadata !10} ; [ DW_TAG_auto_variable ] [y1] [line 2]
+!16 = metadata !{metadata !"0x100\00i\003\000", metadata !17, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [i] [line 3]
+!17 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
 !18 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
 !19 = metadata !{i32 1, i32 0, metadata !4, null}
 !20 = metadata !{double 0.000000e+00}
@@ -82,8 +82,8 @@
 !22 = metadata !{double 1.000000e+00}
 !23 = metadata !{i32 3, i32 0, metadata !17, null}
 !24 = metadata !{i32 4, i32 0, metadata !25, null}
-!25 = metadata !{i32 786443, metadata !1, metadata !17, i32 3, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
+!25 = metadata !{metadata !"0xb\003\000\001", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
 !29 = metadata !{i32 5, i32 0, metadata !25, null}
 !30 = metadata !{i32 7, i32 0, metadata !4, null}
-!31 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
-!32 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!31 = metadata !{i32 8, i32 0, metadata !4, null}
+!32 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
new file mode 100644
index 0000000..3628042
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll

@@ -0,0 +1,70 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.9.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@a = common global i64* null, align 8
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @fn1() {
+entry:
+  %0 = load i64** @a, align 8
+  %add.ptr = getelementptr inbounds i64* %0, i64 11
+  %1 = ptrtoint i64* %add.ptr to i64
+  store i64 %1, i64* %add.ptr, align 8
+  %add.ptr1 = getelementptr inbounds i64* %0, i64 56
+  %2 = ptrtoint i64* %add.ptr1 to i64
+  %arrayidx2 = getelementptr inbounds i64* %0, i64 12
+  store i64 %2, i64* %arrayidx2, align 8
+  ret i32 undef
+; CHECK-LABEL: @fn1(
+; CHECK: extractelement <2 x i64*>
+; CHECK: ret
+}
+
+
+declare float @llvm.powi.f32(float, i32)
+define void @fn2(i32* %a, i32* %b, float* %c) {
+entry:
+  %i0 = load i32* %a, align 4
+  %i1 = load i32* %b, align 4
+  %add1 = add i32 %i0, %i1
+  %fp1 = sitofp i32 %add1 to float
+  %call1 = tail call float @llvm.powi.f32(float %fp1,i32 %add1) nounwind readnone
+
+  %arrayidx2 = getelementptr inbounds i32* %a, i32 1
+  %i2 = load i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i32 1
+  %i3 = load i32* %arrayidx3, align 4
+  %add2 = add i32 %i2, %i3
+  %fp2 = sitofp i32 %add2 to float
+  %call2 = tail call float @llvm.powi.f32(float %fp2,i32 %add1) nounwind readnone
+
+  %arrayidx4 = getelementptr inbounds i32* %a, i32 2
+  %i4 = load i32* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32* %b, i32 2
+  %i5 = load i32* %arrayidx5, align 4
+  %add3 = add i32 %i4, %i5
+  %fp3 = sitofp i32 %add3 to float
+  %call3 = tail call float @llvm.powi.f32(float %fp3,i32 %add1) nounwind readnone
+
+  %arrayidx6 = getelementptr inbounds i32* %a, i32 3
+  %i6 = load i32* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32* %b, i32 3
+  %i7 = load i32* %arrayidx7, align 4
+  %add4 = add i32 %i6, %i7
+  %fp4 = sitofp i32 %add4 to float
+  %call4 = tail call float @llvm.powi.f32(float %fp4,i32 %add1) nounwind readnone
+
+  store float %call1, float* %c, align 4
+  %arrayidx8 = getelementptr inbounds float* %c, i32 1
+  store float %call2, float* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds float* %c, i32 2
+  store float %call3, float* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds float* %c, i32 3
+  store float %call4, float* %arrayidx10, align 4
+  ret void
+
+; CHECK-LABEL: @fn2(
+; CHECK: extractelement <4 x i32>
+; CHECK: ret
+}

diff --git a/test/Transforms/SLPVectorizer/X86/hoist.ll b/test/Transforms/SLPVectorizer/X86/hoist.ll
index 5074cea..78c58f1 100644
--- a/test/Transforms/SLPVectorizer/X86/hoist.ll
+++ b/test/Transforms/SLPVectorizer/X86/hoist.ll

@@ -21,7 +21,7 @@
 ; loop body:
 ;CHECK: phi
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {

diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll
index 8f91951..1836047 100644
--- a/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll

@@ -148,7 +148,7 @@
 ; }
 
 ; CHECK-LABEL: long_red
-; CHECK: fmul <4 x float>
+; CHECK: fmul fast <4 x float>
 ; CHECK: shufflevector <4 x float>
 
 define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
@@ -250,7 +250,7 @@
 ; }
 
 ; CHECK-LABEL: chain_red
-; CHECK: fmul <4 x float>
+; CHECK: fmul fast <4 x float>
 ; CHECK: shufflevector <4 x float>
 
 define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
@@ -317,7 +317,7 @@
 ; }
 
 ; CHECK-LABEL: store_red
-; CHECK: fmul <4 x float>
+; CHECK: fmul fast <4 x float>
 ; CHECK: shufflevector <4 x float>
 
 define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
@@ -379,7 +379,7 @@
 ; }
 
 ; STORE-LABEL: store_red_double
-; STORE: fmul <2 x double>
+; STORE: fmul fast <2 x double>
 ; STORE: extractelement <2 x double>
 ; STORE: extractelement <2 x double>
 

diff --git a/test/Transforms/SLPVectorizer/X86/in-tree-user.ll b/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
index 3115232..194a0fb 100644
--- a/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
+++ b/test/Transforms/SLPVectorizer/X86/in-tree-user.ll

@@ -5,9 +5,11 @@
 
 @.str = private unnamed_addr constant [6 x i8] c"bingo\00", align 1
 
-; We can't vectorize when the roots are used inside the tree.
+; Uses inside the tree must be scheduled after the corresponding tree bundle.
 ;CHECK-LABEL: @in_tree_user(
-;CHECK-NOT: load <2 x double>
+;CHECK: load <2 x double>
+;CHECK: fadd <2 x double>
+;CHECK: InTreeUser = fadd
 ;CHECK: ret
 define void @in_tree_user(double* nocapture %A, i32 %n) {
 entry:
@@ -22,7 +24,7 @@
   %mul1 = fmul double %conv, %1
   %mul2 = fmul double %mul1, 7.000000e+00
   %add = fadd double %mul2, 5.000000e+00
-  %BadValue = fadd double %add, %add    ; <------------------ In tree user.
+  %InTreeUser = fadd double %add, %add    ; <------------------ In tree user.
   %2 = or i64 %0, 1
   %arrayidx6 = getelementptr inbounds double* %A, i64 %2
   %3 = load double* %arrayidx6, align 8
@@ -43,6 +45,7 @@
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.inc
+  store double %InTreeUser, double* %A, align 8   ; Avoid dead code elimination of the InTreeUser.
   ret void
 }
 

diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 9eda29f..0221613 100644
--- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll

@@ -35,6 +35,49 @@
   ret <4 x float> %rd
 }
 
+declare void @llvm.assume(i1) nounwind
+
+; This entire tree is ephemeral, don't vectorize any of it.
+define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_eph(
+; CHECK-NOT: icmp ne <4 x i32>
+; CHECK-NOT: select <4 x i1>
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> undef, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  %q0 = extractelement <4 x float> %rd, i32 0
+  %q1 = extractelement <4 x float> %rd, i32 1
+  %q2 = extractelement <4 x float> %rd, i32 2
+  %q3 = extractelement <4 x float> %rd, i32 3
+  %q4 = fadd float %q0, %q1
+  %q5 = fadd float %q2, %q3
+  %q6 = fadd float %q4, %q5
+  %qi = fcmp olt float %q6, %q5
+  call void @llvm.assume(i1 %qi)
+  ret <4 x float> undef
+}
+
 ; Insert in an order different from the vector indices to make sure it
 ; doesn't matter
 define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {

diff --git a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
index aef2479..bc12926 100644
--- a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
+++ b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll

@@ -5,10 +5,10 @@
 
 ;CHECK-LABEL: @foo(
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n) #0 {

diff --git a/test/Transforms/SLPVectorizer/X86/multi_user.ll b/test/Transforms/SLPVectorizer/X86/multi_user.ll
index cab9994..63a77e4 100644
--- a/test/Transforms/SLPVectorizer/X86/multi_user.ll
+++ b/test/Transforms/SLPVectorizer/X86/multi_user.ll

@@ -14,7 +14,7 @@
 ;CHECK-LABEL: @foo(
 ;CHECK: insertelement <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret
 define i32 @foo(i32* nocapture %A, i32 %n) {

diff --git a/test/Transforms/SLPVectorizer/X86/powof2div.ll b/test/Transforms/SLPVectorizer/X86/powof2div.ll
new file mode 100644
index 0000000..7aa1efd
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/powof2div.ll

@@ -0,0 +1,43 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;CHECK-LABEL: @powof2div(
+;CHECK: load <4 x i32>*
+;CHECK: add nsw <4 x i32>
+;CHECK: sdiv <4 x i32>
+define void @powof2div(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){
+entry:
+  %0 = load i32* %b, align 4
+  %1 = load i32* %c, align 4
+  %add = add nsw i32 %1, %0
+  %div = sdiv i32 %add, 2
+  store i32 %div, i32* %a, align 4
+  %arrayidx3 = getelementptr inbounds i32* %b, i64 1
+  %2 = load i32* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32* %c, i64 1
+  %3 = load i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %3, %2
+  %div6 = sdiv i32 %add5, 2
+  %arrayidx7 = getelementptr inbounds i32* %a, i64 1
+  store i32 %div6, i32* %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds i32* %b, i64 2
+  %4 = load i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32* %c, i64 2
+  %5 = load i32* %arrayidx9, align 4
+  %add10 = add nsw i32 %5, %4
+  %div11 = sdiv i32 %add10, 2
+  %arrayidx12 = getelementptr inbounds i32* %a, i64 2
+  store i32 %div11, i32* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32* %b, i64 3
+  %6 = load i32* %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i32* %c, i64 3
+  %7 = load i32* %arrayidx14, align 4
+  %add15 = add nsw i32 %7, %6
+  %div16 = sdiv i32 %add15, 2
+  %arrayidx17 = getelementptr inbounds i32* %a, i64 3
+  store i32 %div16, i32* %arrayidx17, align 4
+  ret void
+}
+

diff --git a/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll b/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
new file mode 100644
index 0000000..3843ef7
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll

@@ -0,0 +1,350 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+
+; Check propagation of optional IR flags (PR20802). For a flag to
+; propagate from scalar instructions to their vector replacement,
+; *all* scalar instructions must have the flag.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; CHECK-LABEL: @exact(
+; CHECK: lshr exact <4 x i32>
+define void @exact(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = lshr exact i32 %load1, 1
+  %op2 = lshr exact i32 %load2, 1
+  %op3 = lshr exact i32 %load3, 1
+  %op4 = lshr exact i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_exact(
+; CHECK: lshr <4 x i32>
+define void @not_exact(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = lshr exact i32 %load1, 1
+  %op2 = lshr i32 %load2, 1
+  %op3 = lshr exact i32 %load3, 1
+  %op4 = lshr exact i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @nsw(
+; CHECK: add nsw <4 x i32>
+define void @nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = add nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = add nsw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @not_nsw(
+; CHECK: add <4 x i32>
+define void @not_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = add nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = add i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @nuw(
+; CHECK: add nuw <4 x i32>
+define void @nuw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nuw i32 %load1, 1
+  %op2 = add nuw i32 %load2, 1
+  %op3 = add nuw i32 %load3, 1
+  %op4 = add nuw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+ 
+; CHECK-LABEL: @not_nuw(
+; CHECK: add <4 x i32>
+define void @not_nuw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nuw i32 %load1, 1
+  %op2 = add i32 %load2, 1
+  %op3 = add i32 %load3, 1
+  %op4 = add nuw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+ 
+; CHECK-LABEL: @nnan(
+; CHECK: fadd nnan <4 x float>
+define void @nnan(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd fast nnan float %load1, 1.0
+  %op2 = fadd nnan ninf float %load2, 1.0
+  %op3 = fadd nsz nnan float %load3, 1.0
+  %op4 = fadd arcp nnan float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+ 
+; CHECK-LABEL: @not_nnan(
+; CHECK: fadd <4 x float>
+define void @not_nnan(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd nnan float %load1, 1.0
+  %op2 = fadd ninf float %load2, 1.0
+  %op3 = fadd nsz float %load3, 1.0
+  %op4 = fadd arcp float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+ 
+; CHECK-LABEL: @only_fast(
+; CHECK: fadd fast <4 x float>
+define void @only_fast(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd fast nnan float %load1, 1.0
+  %op2 = fadd fast nnan ninf float %load2, 1.0
+  %op3 = fadd fast nsz nnan float %load3, 1.0
+  %op4 = fadd arcp nnan fast float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+ 
+; CHECK-LABEL: @only_arcp(
+; CHECK: fadd arcp <4 x float>
+define void @only_arcp(float* %x) {
+  %idx1 = getelementptr inbounds float* %x, i64 0
+  %idx2 = getelementptr inbounds float* %x, i64 1
+  %idx3 = getelementptr inbounds float* %x, i64 2
+  %idx4 = getelementptr inbounds float* %x, i64 3
+
+  %load1 = load float* %idx1, align 4
+  %load2 = load float* %idx2, align 4
+  %load3 = load float* %idx3, align 4
+  %load4 = load float* %idx4, align 4
+
+  %op1 = fadd fast float %load1, 1.0
+  %op2 = fadd fast float %load2, 1.0
+  %op3 = fadd fast float %load3, 1.0
+  %op4 = fadd arcp float %load4, 1.0
+
+  store float %op1, float* %idx1, align 4
+  store float %op2, float* %idx2, align 4
+  store float %op3, float* %idx3, align 4
+  store float %op4, float* %idx4, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @addsub_all_nsw
+; CHECK: add nsw <4 x i32>
+; CHECK: sub nsw <4 x i32>
+define void @addsub_all_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub nsw i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+ 
+; CHECK-LABEL: @addsub_some_nsw
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+define void @addsub_some_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add nsw i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+ 
+; CHECK-LABEL: @addsub_no_nsw
+; CHECK: add <4 x i32>
+; CHECK: sub <4 x i32>
+define void @addsub_no_nsw(i32* %x) {
+  %idx1 = getelementptr inbounds i32* %x, i64 0
+  %idx2 = getelementptr inbounds i32* %x, i64 1
+  %idx3 = getelementptr inbounds i32* %x, i64 2
+  %idx4 = getelementptr inbounds i32* %x, i64 3
+
+  %load1 = load i32* %idx1, align 4
+  %load2 = load i32* %idx2, align 4
+  %load3 = load i32* %idx3, align 4
+  %load4 = load i32* %idx4, align 4
+
+  %op1 = add i32 %load1, 1
+  %op2 = sub nsw i32 %load2, 1
+  %op3 = add nsw i32 %load3, 1
+  %op4 = sub i32 %load4, 1
+
+  store i32 %op1, i32* %idx1, align 4
+  store i32 %op2, i32* %idx2, align 4
+  store i32 %op3, i32* %idx3, align 4
+  store i32 %op4, i32* %idx4, align 4
+
+  ret void
+}
+ 

diff --git a/test/Transforms/SLPVectorizer/X86/return.ll b/test/Transforms/SLPVectorizer/X86/return.ll
new file mode 100644
index 0000000..1a81c23
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/return.ll

@@ -0,0 +1,54 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "x86_64--linux-gnu"
+
+@a = common global [4 x double] zeroinitializer, align 8
+@b = common global [4 x double] zeroinitializer, align 8
+
+; double a[4], b[4];
+; double foo() {
+;  double sum =0;
+;  sum = (a[0]+b[0]) + (a[1]+b[1]);
+;  return sum;
+; }
+
+; CHECK-LABEL: @return1
+; CHECK: %0 = load <2 x double>*
+; CHECK: %1 = load <2 x double>*
+; CHECK: %2 = fadd <2 x double>
+
+define double @return1() {
+entry:
+  %a0 = load double* getelementptr inbounds ([4 x double]* @a, i32 0, i32 0), align 8
+  %b0 = load double* getelementptr inbounds ([4 x double]* @b, i32 0, i32 0), align 8
+  %add0 = fadd double %a0, %b0
+  %a1 = load double* getelementptr inbounds ([4 x double]* @a, i32 0, i32 1), align 8
+  %b1 = load double* getelementptr inbounds ([4 x double]* @b, i32 0, i32 1), align 8
+  %add1 = fadd double %a1, %b1
+  %add2 = fadd double %add0, %add1
+  ret double %add2
+}
+
+; double hadd(double *x) {
+;   return ((x[0] + x[2]) + (x[1] + x[3]));
+; }
+
+; CHECK-LABEL: @return2
+; CHECK: %1 = load <2 x double>*
+; CHECK: %3 = load <2 x double>* %2
+; CHECK: %4 = fadd <2 x double> %1, %3
+
+define double @return2(double* nocapture readonly %x) {
+entry:
+  %x0 = load double* %x, align 4
+  %arrayidx1 = getelementptr inbounds double* %x, i32 2
+  %x2 = load double* %arrayidx1, align 4
+  %add3 = fadd double %x0, %x2
+  %arrayidx2 = getelementptr inbounds double* %x, i32 1
+  %x1 = load double* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds double* %x, i32 3
+  %x3 = load double* %arrayidx3, align 4
+  %add4 = fadd double %x1, %x3
+  %add5 = fadd double %add3, %add4
+  ret double %add5
+}

diff --git a/test/Transforms/SLPVectorizer/X86/saxpy.ll b/test/Transforms/SLPVectorizer/X86/saxpy.ll
index 4626341..4b39d46 100644
--- a/test/Transforms/SLPVectorizer/X86/saxpy.ll
+++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll

@@ -5,7 +5,7 @@
 
 ; SLP vectorization example from http://cs.stanford.edu/people/eschkufz/research/asplos291-schkufza.pdf
 ;CHECK: SAXPY
-;CHECK: mul <4 x i32>
+;CHECK: mul nsw <4 x i32>
 ;CHECK: ret
 
 define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a, i64 %i) {

diff --git a/test/Transforms/SLPVectorizer/X86/scheduling.ll b/test/Transforms/SLPVectorizer/X86/scheduling.ll
new file mode 100644
index 0000000..3b3bd80
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/scheduling.ll

@@ -0,0 +1,78 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+;CHECK-LABEL: @foo
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: %[[S1:.+]] = add nsw <4 x i32>
+;CHECK-DAG: store <4 x i32> %[[S1]]
+;CHECK-DAG: %[[A1:.+]] = add nsw i32
+;CHECK-DAG: %[[A2:.+]] = add nsw i32 %[[A1]]
+;CHECK-DAG: %[[A3:.+]] = add nsw i32 %[[A2]]
+;CHECK-DAG: %[[A4:.+]] = add nsw i32 %[[A3]]
+;CHECK: ret i32 %[[A4]] 
+
+define i32 @foo(i32* nocapture readonly %diff) #0 {
+entry:
+  %m2 = alloca [8 x [8 x i32]], align 16
+  %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %a.088 = phi i32 [ 0, %entry ], [ %add52, %for.body ]
+  %1 = shl i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32* %diff, i64 %1
+  %2 = load i32* %arrayidx, align 4
+  %3 = or i64 %1, 4
+  %arrayidx2 = getelementptr inbounds i32* %diff, i64 %3
+  %4 = load i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %4, %2
+  %arrayidx6 = getelementptr inbounds [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+  store i32 %add3, i32* %arrayidx6, align 16
+  %add10 = add nsw i32 %add3, %a.088
+  %5 = or i64 %1, 1
+  %arrayidx13 = getelementptr inbounds i32* %diff, i64 %5
+  %6 = load i32* %arrayidx13, align 4
+  %7 = or i64 %1, 5
+  %arrayidx16 = getelementptr inbounds i32* %diff, i64 %7
+  %8 = load i32* %arrayidx16, align 4
+  %add17 = add nsw i32 %8, %6
+  %arrayidx20 = getelementptr inbounds [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+  store i32 %add17, i32* %arrayidx20, align 4
+  %add24 = add nsw i32 %add10, %add17
+  %9 = or i64 %1, 2
+  %arrayidx27 = getelementptr inbounds i32* %diff, i64 %9
+  %10 = load i32* %arrayidx27, align 4
+  %11 = or i64 %1, 6
+  %arrayidx30 = getelementptr inbounds i32* %diff, i64 %11
+  %12 = load i32* %arrayidx30, align 4
+  %add31 = add nsw i32 %12, %10
+  %arrayidx34 = getelementptr inbounds [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 2
+  store i32 %add31, i32* %arrayidx34, align 8
+  %add38 = add nsw i32 %add24, %add31
+  %13 = or i64 %1, 3
+  %arrayidx41 = getelementptr inbounds i32* %diff, i64 %13
+  %14 = load i32* %arrayidx41, align 4
+  %15 = or i64 %1, 7
+  %arrayidx44 = getelementptr inbounds i32* %diff, i64 %15
+  %16 = load i32* %arrayidx44, align 4
+  %add45 = add nsw i32 %16, %14
+  %arrayidx48 = getelementptr inbounds [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 3
+  store i32 %add45, i32* %arrayidx48, align 4
+  %add52 = add nsw i32 %add38, %add45
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %arraydecay = getelementptr inbounds [8 x [8 x i32]]* %m2, i64 0, i64 0
+  call void @ff([8 x i32]* %arraydecay) #1
+  ret i32 %add52
+}
+
+declare void @ff([8 x i32]*) #2
+
+

diff --git a/test/Transforms/SLPVectorizer/X86/unreachable.ll b/test/Transforms/SLPVectorizer/X86/unreachable.ll
new file mode 100644
index 0000000..8d60957
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/unreachable.ll

@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+; Check that the SLPVectorizer does not crash when handling
+; unreachable blocks with unschedulable instructions.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+define void @foo(i32* nocapture %x) #0 {
+entry:
+  br label %bb2
+
+bb1:                                    ; an unreachable block
+  %t3 = getelementptr inbounds i32* %x, i64 4
+  %t4 = load i32* %t3, align 4
+  %t5 = getelementptr inbounds i32* %x, i64 5
+  %t6 = load i32* %t5, align 4
+  %bad = fadd float %bad, 0.000000e+00  ; <- an instruction with self dependency,
+                                        ;    but legal in unreachable code
+  %t7 = getelementptr inbounds i32* %x, i64 6
+  %t8 = load i32* %t7, align 4
+  %t9 = getelementptr inbounds i32* %x, i64 7
+  %t10 = load i32* %t9, align 4
+  br label %bb2
+
+bb2:
+  %t1.0 = phi i32 [ %t4, %bb1 ], [ 2, %entry ]
+  %t2.0 = phi i32 [ %t6, %bb1 ], [ 2, %entry ]
+  %t3.0 = phi i32 [ %t8, %bb1 ], [ 2, %entry ]
+  %t4.0 = phi i32 [ %t10, %bb1 ], [ 2, %entry ]
+  store i32 %t1.0, i32* %x, align 4
+  %t12 = getelementptr inbounds i32* %x, i64 1
+  store i32 %t2.0, i32* %t12, align 4
+  %t13 = getelementptr inbounds i32* %x, i64 2
+  store i32 %t3.0, i32* %t13, align 4
+  %t14 = getelementptr inbounds i32* %x, i64 3
+  store i32 %t4.0, i32* %t14, align 4
+  ret void
+}
+

diff --git a/test/Transforms/SROA/phi-and-select.ll b/test/Transforms/SROA/phi-and-select.ll
index 8d82964..f287012 100644
--- a/test/Transforms/SROA/phi-and-select.ll
+++ b/test/Transforms/SROA/phi-and-select.ll

@@ -501,3 +501,102 @@
 ; CHECK-NOT: load
 ; CHECK: ret float %[[phi]]
 }
+
+; Verifies we fixed PR20425. We should be able to promote all allocas to
+; registers in this test.
+;
+; %0 = slice
+; %1 = slice
+; %2 = phi(%0, %1) // == slice
+define float @simplify_phi_nodes_that_equal_slice(i1 %cond, float* %temp) {
+; CHECK-LABEL: @simplify_phi_nodes_that_equal_slice(
+entry:
+  %arr = alloca [4 x float], align 4
+; CHECK-NOT: alloca
+  br i1 %cond, label %then, label %else
+
+then:
+  %0 = getelementptr inbounds [4 x float]* %arr, i64 0, i64 3
+  store float 1.000000e+00, float* %0, align 4
+  br label %merge
+
+else:
+  %1 = getelementptr inbounds [4 x float]* %arr, i64 0, i64 3
+  store float 2.000000e+00, float* %1, align 4
+  br label %merge
+
+merge:
+  %2 = phi float* [ %0, %then ], [ %1, %else ]
+  store float 0.000000e+00, float* %temp, align 4
+  %3 = load float* %2, align 4
+  ret float %3
+}
+
+; A slightly complicated example for PR20425.
+;
+; %0 = slice
+; %1 = phi(%0) // == slice
+; %2 = slice
+; %3 = phi(%1, %2) // == slice
+define float @simplify_phi_nodes_that_equal_slice_2(i1 %cond, float* %temp) {
+; CHECK-LABEL: @simplify_phi_nodes_that_equal_slice_2(
+entry:
+  %arr = alloca [4 x float], align 4
+; CHECK-NOT: alloca
+  br i1 %cond, label %then, label %else
+
+then:
+  %0 = getelementptr inbounds [4 x float]* %arr, i64 0, i64 3
+  store float 1.000000e+00, float* %0, align 4
+  br label %then2
+
+then2:
+  %1 = phi float* [ %0, %then ]
+  store float 2.000000e+00, float* %1, align 4
+  br label %merge
+
+else:
+  %2 = getelementptr inbounds [4 x float]* %arr, i64 0, i64 3
+  store float 3.000000e+00, float* %2, align 4
+  br label %merge
+
+merge:
+  %3 = phi float* [ %1, %then2 ], [ %2, %else ]
+  store float 0.000000e+00, float* %temp, align 4
+  %4 = load float* %3, align 4
+  ret float %4
+}
+
+%struct.S = type { i32 }
+
+; Verifies we fixed PR20822. We have a foldable PHI feeding a speculatable PHI
+; which requires the rewriting of the speculated PHI to handle insertion
+; when the incoming pointer is itself from a PHI node. We would previously
+; insert a bitcast instruction *before* a PHI, producing an invalid module;
+; make sure we insert *after* the first non-PHI instruction.
+define void @PR20822() {
+; CHECK-LABEL: @PR20822(
+entry:
+  %f = alloca %struct.S, align 4
+; CHECK: %[[alloca:.*]] = alloca
+  br i1 undef, label %if.end, label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  br label %if.end
+
+if.end:                                           ; preds = %for.cond, %entry
+  %f2 = phi %struct.S* [ %f, %entry ], [ %f, %for.cond ]
+; CHECK: phi i32
+; CHECK: %[[cast:.*]] = bitcast i32* %[[alloca]] to %struct.S*
+  phi i32 [ undef, %entry ], [ undef, %for.cond ]
+  br i1 undef, label %if.then5, label %if.then2
+
+if.then2:                                         ; preds = %if.end
+  br label %if.then5
+
+if.then5:                                         ; preds = %if.then2, %if.end
+  %f1 = phi %struct.S* [ undef, %if.then2 ], [ %f2, %if.end ]
+; CHECK: phi {{.*}} %[[cast]]
+  store %struct.S undef, %struct.S* %f1, align 4
+  ret void
+}

diff --git a/test/Transforms/SROA/slice-width.ll b/test/Transforms/SROA/slice-width.ll
index 179780b..ff66dcc 100644
--- a/test/Transforms/SROA/slice-width.ll
+++ b/test/Transforms/SROA/slice-width.ll

@@ -1,7 +1,8 @@
 ; RUN: opt < %s -sroa -S | FileCheck %s
-target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-f80:128-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
 
 define void @no_split_on_non_byte_width(i32) {
 ; This tests that allocas are not split into slices that are not byte width multiple
@@ -23,3 +24,83 @@
   %t1 = load i1* %p1
   ret void
 }
+
+; PR18726: Check that we use memcpy and memset to fill out padding when we have
+; a slice with a simple single type whose store size is smaller than the slice
+; size.
+
+%union.Foo = type { x86_fp80, i64, i64 }
+
+@foo_copy_source = external constant %union.Foo
+@i64_sink = global i64 0
+
+define void @memcpy_fp80_padding() {
+  %x = alloca %union.Foo
+
+  ; Copy from a global.
+  %x_i8 = bitcast %union.Foo* %x to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %x_i8, i8* bitcast (%union.Foo* @foo_copy_source to i8*), i32 32, i32 16, i1 false)
+
+  ; Access a slice of the alloca to trigger SROA.
+  %mid_p = getelementptr %union.Foo* %x, i32 0, i32 1
+  %elt = load i64* %mid_p
+  store i64 %elt, i64* @i64_sink
+  ret void
+}
+; CHECK-LABEL: define void @memcpy_fp80_padding
+; CHECK: alloca x86_fp80
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32
+; CHECK: load i64* getelementptr inbounds (%union.Foo* @foo_copy_source, i64 0, i32 1)
+; CHECK: load i64* getelementptr inbounds (%union.Foo* @foo_copy_source, i64 0, i32 2)
+
+define void @memset_fp80_padding() {
+  %x = alloca %union.Foo
+
+  ; Set to all ones.
+  %x_i8 = bitcast %union.Foo* %x to i8*
+  call void @llvm.memset.p0i8.i32(i8* %x_i8, i8 -1, i32 32, i32 16, i1 false)
+
+  ; Access a slice of the alloca to trigger SROA.
+  %mid_p = getelementptr %union.Foo* %x, i32 0, i32 1
+  %elt = load i64* %mid_p
+  store i64 %elt, i64* @i64_sink
+  ret void
+}
+; CHECK-LABEL: define void @memset_fp80_padding
+; CHECK: alloca x86_fp80
+; CHECK: call void @llvm.memset.p0i8.i32(i8* %{{.*}}, i8 -1, i32 16, i32 16, i1 false)
+; CHECK: store i64 -1, i64* @i64_sink
+
+%S.vec3float = type { float, float, float }
+%U.vec3float = type { <4 x float> }
+
+declare i32 @memcpy_vec3float_helper(%S.vec3float*)
+
+define i32 @memcpy_vec3float_widening(%S.vec3float* %x) {
+; CHECK-LABEL: @memcpy_vec3float_widening(
+; PR18726: Check that SROA does not rewrite a 12-byte memcpy into a 16-byte
+; vector store, hence accidentally putting gibberish onto the stack.
+entry:
+  ; Create a temporary variable %tmp1 and copy %x[0] into it
+  %tmp1 = alloca %S.vec3float, align 4
+  %0 = bitcast %S.vec3float* %tmp1 to i8*
+  %1 = bitcast %S.vec3float* %x to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 12, i32 4, i1 false)
+
+  ; The following block does nothing, but appears to confuse SROA
+  %unused1 = bitcast %S.vec3float* %tmp1 to %U.vec3float*
+  %unused2 = getelementptr inbounds %U.vec3float* %unused1, i32 0, i32 0
+  %unused3 = load <4 x float>* %unused2, align 1
+
+  ; Create a second temporary and copy %tmp1 into it
+  %tmp2 = alloca %S.vec3float, align 4
+  %2 = bitcast %S.vec3float* %tmp2 to i8*
+  %3 = bitcast %S.vec3float* %tmp1 to i8*
+; CHECK: alloca
+; CHECK-NOT: store <4 x float>
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %2, i8* %3, i32 12, i32 4, i1 false)
+
+  %result = call i32 @memcpy_vec3float_helper(%S.vec3float* %tmp2)
+  ret i32 %result
+; CHECK: ret i32 %result
+}

diff --git a/test/Transforms/SROA/vector-lifetime-intrinsic.ll b/test/Transforms/SROA/vector-lifetime-intrinsic.ll
new file mode 100644
index 0000000..30c93b0
--- /dev/null
+++ b/test/Transforms/SROA/vector-lifetime-intrinsic.ll

@@ -0,0 +1,31 @@
+; RUN: opt -sroa -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:32-i64:32-v32:32-n32-S64"
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; CHECK: @wombat
+; CHECK-NOT: alloca
+; CHECK: ret void
+define void @wombat(<4 x float> %arg1) {
+bb:
+  %tmp = alloca <4 x float>, align 16
+  %tmp8 = bitcast <4 x float>* %tmp to i8*
+  call void @llvm.lifetime.start(i64 16, i8* %tmp8)
+  store <4 x float> %arg1, <4 x float>* %tmp, align 16
+  %tmp17 = bitcast <4 x float>* %tmp to <3 x float>*
+  %tmp18 = load <3 x float>* %tmp17
+  %tmp20 = bitcast <4 x float>* %tmp to i8*
+  call void @llvm.lifetime.end(i64 16, i8* %tmp20)
+  call void @wombat3(<3 x float> %tmp18)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @wombat3(<3 x float>) #0
+
+attributes #0 = { nounwind }

diff --git a/test/Transforms/SROA/vector-promotion.ll b/test/Transforms/SROA/vector-promotion.ll
index 9c9f6a1..830a22a 100644
--- a/test/Transforms/SROA/vector-promotion.ll
+++ b/test/Transforms/SROA/vector-promotion.ll

@@ -468,3 +468,139 @@
 ; CHECK: %[[insert:.*]] = or i32 %{{.*}}, %[[trunc]]
 ; CHECK: ret i32 %[[insert]]
 }
+
+define i32 @test7(<2 x i32> %x, <2 x i32> %y) {
+; Test that we can promote to vectors when the alloca doesn't mention any vector types.
+; CHECK-LABEL: @test7(
+entry:
+	%a = alloca [2 x i64]
+  %a.cast = bitcast [2 x i64]* %a to [2 x <2 x i32>]*
+; CHECK-NOT: alloca
+
+  %a.x = getelementptr inbounds [2 x <2 x i32>]* %a.cast, i64 0, i64 0
+  store <2 x i32> %x, <2 x i32>* %a.x
+  %a.y = getelementptr inbounds [2 x <2 x i32>]* %a.cast, i64 0, i64 1
+  store <2 x i32> %y, <2 x i32>* %a.y
+; CHECK-NOT: store
+
+  %a.tmp1 = getelementptr inbounds [2 x <2 x i32>]* %a.cast, i64 0, i64 0, i64 1
+  %tmp1 = load i32* %a.tmp1
+  %a.tmp2 = getelementptr inbounds [2 x <2 x i32>]* %a.cast, i64 0, i64 1, i64 1
+  %tmp2 = load i32* %a.tmp2
+  %a.tmp3 = getelementptr inbounds [2 x <2 x i32>]* %a.cast, i64 0, i64 1, i64 0
+  %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: load
+; CHECK:      extractelement <2 x i32> %x, i32 1
+; CHECK-NEXT: extractelement <2 x i32> %y, i32 1
+; CHECK-NEXT: extractelement <2 x i32> %y, i32 0
+
+  %tmp4 = add i32 %tmp1, %tmp2
+  %tmp5 = add i32 %tmp3, %tmp4
+  ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test8(<2 x i32> %x) {
+; Ensure that we can promote an alloca that doesn't mention a vector type based
+; on a single store with a vector type.
+; CHECK-LABEL: @test8(
+entry:
+	%a = alloca i64
+  %a.vec = bitcast i64* %a to <2 x i32>*
+  %a.i32 = bitcast i64* %a to i32*
+; CHECK-NOT: alloca
+
+  store <2 x i32> %x, <2 x i32>* %a.vec
+; CHECK-NOT: store
+
+  %tmp1 = load i32* %a.i32
+  %a.tmp2 = getelementptr inbounds i32* %a.i32, i64 1
+  %tmp2 = load i32* %a.tmp2
+; CHECK-NOT: load
+; CHECK:      extractelement <2 x i32> %x, i32 0
+; CHECK-NEXT: extractelement <2 x i32> %x, i32 1
+
+  %tmp4 = add i32 %tmp1, %tmp2
+  ret i32 %tmp4
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define <2 x i32> @test9(i32 %x, i32 %y) {
+; Ensure that we can promote an alloca that doesn't mention a vector type based
+; on a single load with a vector type.
+; CHECK-LABEL: @test9(
+entry:
+	%a = alloca i64
+  %a.vec = bitcast i64* %a to <2 x i32>*
+  %a.i32 = bitcast i64* %a to i32*
+; CHECK-NOT: alloca
+
+  store i32 %x, i32* %a.i32
+  %a.tmp2 = getelementptr inbounds i32* %a.i32, i64 1
+  store i32 %y, i32* %a.tmp2
+; CHECK-NOT: store
+; CHECK:      %[[V1:.*]] = insertelement <2 x i32> undef, i32 %x, i32 0
+; CHECK-NEXT: %[[V2:.*]] = insertelement <2 x i32> %[[V1]], i32 %y, i32 1
+
+  %result = load <2 x i32>* %a.vec
+; CHECK-NOT:  load
+
+  ret <2 x i32> %result
+; CHECK-NEXT: ret <2 x i32> %[[V2]]
+}
+
+define <2 x i32> @test10(<4 x i16> %x, i32 %y) {
+; If there are multiple different vector types used, we should select the one
+; with the widest elements.
+; CHECK-LABEL: @test10(
+entry:
+	%a = alloca i64
+  %a.vec1 = bitcast i64* %a to <2 x i32>*
+  %a.vec2 = bitcast i64* %a to <4 x i16>*
+  %a.i32 = bitcast i64* %a to i32*
+; CHECK-NOT: alloca
+
+  store <4 x i16> %x, <4 x i16>* %a.vec2
+  %a.tmp2 = getelementptr inbounds i32* %a.i32, i64 1
+  store i32 %y, i32* %a.tmp2
+; CHECK-NOT: store
+; CHECK:      %[[V1:.*]] = bitcast <4 x i16> %x to <2 x i32>
+; CHECK-NEXT: %[[V2:.*]] = insertelement <2 x i32> %[[V1]], i32 %y, i32 1
+
+  %result = load <2 x i32>* %a.vec1
+; CHECK-NOT:  load
+
+  ret <2 x i32> %result
+; CHECK-NEXT: ret <2 x i32> %[[V2]]
+}
+
+define <2 x float> @test11(<4 x i16> %x, i32 %y) {
+; If there are multiple different element types for different vector types,
+; pick the integer types. This isn't really important, but seems like the best
+; heuristic for making a deterministic decision.
+; CHECK-LABEL: @test11(
+entry:
+	%a = alloca i64
+  %a.vec1 = bitcast i64* %a to <2 x float>*
+  %a.vec2 = bitcast i64* %a to <4 x i16>*
+  %a.i32 = bitcast i64* %a to i32*
+; CHECK-NOT: alloca
+
+  store <4 x i16> %x, <4 x i16>* %a.vec2
+  %a.tmp2 = getelementptr inbounds i32* %a.i32, i64 1
+  store i32 %y, i32* %a.tmp2
+; CHECK-NOT: store
+; CHECK:      %[[V1:.*]] = bitcast i32 %y to <2 x i16>
+; CHECK-NEXT: %[[V2:.*]] = shufflevector <2 x i16> %[[V1]], <2 x i16> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+; CHECK-NEXT: %[[V3:.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i16> %[[V2]], <4 x i16> %x
+; CHECK-NEXT: %[[V4:.*]] = bitcast <4 x i16> %[[V3]] to <2 x float>
+
+  %result = load <2 x float>* %a.vec1
+; CHECK-NOT:  load
+
+  ret <2 x float> %result
+; CHECK-NEXT: ret <2 x float> %[[V4]]
+}

diff --git a/test/Transforms/SampleProfile/Inputs/fnptr.binprof b/test/Transforms/SampleProfile/Inputs/fnptr.binprof
new file mode 100644
index 0000000..14d7fd5
--- /dev/null
+++ b/test/Transforms/SampleProfile/Inputs/fnptr.binprof
Binary files differ

diff --git a/test/Transforms/SampleProfile/Inputs/fnptr.prof b/test/Transforms/SampleProfile/Inputs/fnptr.prof
new file mode 100644
index 0000000..6a3b4e2
--- /dev/null
+++ b/test/Transforms/SampleProfile/Inputs/fnptr.prof

@@ -0,0 +1,12 @@
+_Z3fooi:7711:610
+1: 610
+_Z3bari:20301:1437
+1: 1437
+main:184019:0
+4: 534
+6: 2080
+9: 2064 _Z3bari:1471 _Z3fooi:631
+5.1: 1075
+5: 1075
+7: 534
+4.2: 534

diff --git a/test/Transforms/SampleProfile/branch.ll b/test/Transforms/SampleProfile/branch.ll
index 65f1f17..e646609 100644
--- a/test/Transforms/SampleProfile/branch.ll
+++ b/test/Transforms/SampleProfile/branch.ll

@@ -32,8 +32,8 @@
 ; CHECK: Printing analysis 'Branch Probability Analysis' for function 'main':
 
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !13), !dbg !27
-  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !14), !dbg !27
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !13, metadata !{}), !dbg !27
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !14, metadata !{}), !dbg !27
   %cmp = icmp slt i32 %argc, 2, !dbg !28
   br i1 %cmp, label %return, label %if.end, !dbg !28
 ; CHECK: edge entry -> return probability is 1 / 2 = 50%
@@ -43,7 +43,7 @@
   %arrayidx = getelementptr inbounds i8** %argv, i64 1, !dbg !30
   %0 = load i8** %arrayidx, align 8, !dbg !30, !tbaa !31
   %call = tail call i32 @atoi(i8* %0) #4, !dbg !30
-  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !17), !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !17, metadata !{}), !dbg !30
   %cmp1 = icmp sgt i32 %call, 100, !dbg !35
   br i1 %cmp1, label %for.body, label %if.end6, !dbg !35
 ; CHECK: edge if.end -> for.body probability is 1 / 2 = 50%
@@ -55,14 +55,14 @@
   %add = fadd double %s.015, 3.049000e+00, !dbg !36
   %conv = sitofp i32 %u.016 to double, !dbg !36
   %add4 = fadd double %add, %conv, !dbg !36
-  tail call void @llvm.dbg.value(metadata !{double %add4}, i64 0, metadata !18), !dbg !36
+  tail call void @llvm.dbg.value(metadata !{double %add4}, i64 0, metadata !18, metadata !{}), !dbg !36
   %div = fdiv double 3.940000e+00, %s.015, !dbg !37
   %mul = fmul double %div, 3.200000e-01, !dbg !37
   %add5 = fadd double %add4, %mul, !dbg !37
   %sub = fsub double %add4, %add5, !dbg !37
-  tail call void @llvm.dbg.value(metadata !{double %sub}, i64 0, metadata !18), !dbg !37
+  tail call void @llvm.dbg.value(metadata !{double %sub}, i64 0, metadata !18, metadata !{}), !dbg !37
   %inc = add nsw i32 %u.016, 1, !dbg !38
-  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !21), !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !21, metadata !{}), !dbg !38
   %exitcond = icmp eq i32 %inc, %call, !dbg !38
   br i1 %exitcond, label %if.end6, label %for.body, !dbg !38
 ; CHECK: edge for.body -> if.end6 probability is 1 / 10227 = 0.00977804
@@ -86,7 +86,7 @@
 declare i32 @printf(i8* nocapture readonly, ...) #2
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #3
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -98,37 +98,37 @@
 !llvm.module.flags = !{!25, !42}
 !llvm.ident = !{!26}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 192896) (llvm/trunk 192895)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [./branch.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.4 (trunk 192896) (llvm/trunk 192895)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./branch.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"branch.cc", metadata !"."}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 4, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !12, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [./branch.cc]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00main\00main\00\004\000\001\000\006\00256\001\004", metadata !1, metadata !5, metadata !6, null, i32 (i32, i8**)* @main, null, null, metadata !12} ; [ DW_TAG_subprogram ] [line 4] [def] [main]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./branch.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8, metadata !9}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
-!11 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!8 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!10 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = metadata !{metadata !"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !17, metadata !18, metadata !21, metadata !23}
-!13 = metadata !{i32 786689, metadata !4, metadata !"argc", metadata !5, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 4]
-!14 = metadata !{i32 786689, metadata !4, metadata !"argv", metadata !5, i32 33554436, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 4]
-!15 = metadata !{i32 786688, metadata !4, metadata !"result", metadata !5, i32 7, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 7]
-!16 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
-!17 = metadata !{i32 786688, metadata !4, metadata !"limit", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [limit] [line 8]
-!18 = metadata !{i32 786688, metadata !19, metadata !"s", metadata !5, i32 10, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [s] [line 10]
-!19 = metadata !{i32 786443, metadata !1, metadata !20, i32 9, i32 0, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [./branch.cc]
-!20 = metadata !{i32 786443, metadata !1, metadata !4, i32 9, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [./branch.cc]
-!21 = metadata !{i32 786688, metadata !22, metadata !"u", metadata !5, i32 11, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [u] [line 11]
-!22 = metadata !{i32 786443, metadata !1, metadata !19, i32 11, i32 0, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [./branch.cc]
-!23 = metadata !{i32 786688, metadata !24, metadata !"x", metadata !5, i32 12, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [x] [line 12]
-!24 = metadata !{i32 786443, metadata !1, metadata !22, i32 11, i32 0, i32 0, i32 4} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!13 = metadata !{metadata !"0x101\00argc\0016777220\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [argc] [line 4]
+!14 = metadata !{metadata !"0x101\00argv\0033554436\000", metadata !4, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ] [argv] [line 4]
+!15 = metadata !{metadata !"0x100\00result\007\000", metadata !4, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [result] [line 7]
+!16 = metadata !{metadata !"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!17 = metadata !{metadata !"0x100\00limit\008\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [limit] [line 8]
+!18 = metadata !{metadata !"0x100\00s\0010\000", metadata !19, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [s] [line 10]
+!19 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !20} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!20 = metadata !{metadata !"0xb\009\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!21 = metadata !{metadata !"0x100\00u\0011\000", metadata !22, metadata !5, metadata !8} ; [ DW_TAG_auto_variable ] [u] [line 11]
+!22 = metadata !{metadata !"0xb\0011\000\000", metadata !1, metadata !19} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!23 = metadata !{metadata !"0x100\00x\0012\000", metadata !24, metadata !5, metadata !16} ; [ DW_TAG_auto_variable ] [x] [line 12]
+!24 = metadata !{metadata !"0xb\0011\000\000", metadata !1, metadata !22} ; [ DW_TAG_lexical_block ] [./branch.cc]
 !25 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !26 = metadata !{metadata !"clang version 3.4 (trunk 192896) (llvm/trunk 192895)"}
 !27 = metadata !{i32 4, i32 0, metadata !4, null}
 !28 = metadata !{i32 5, i32 0, metadata !29, null}
-!29 = metadata !{i32 786443, metadata !1, metadata !4, i32 5, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./branch.cc]
-!30 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!29 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!30 = metadata !{i32 8, i32 0, metadata !4, null}
 !31 = metadata !{metadata !32, metadata !32, i64 0}
 !32 = metadata !{metadata !"any pointer", metadata !33, i64 0}
 !33 = metadata !{metadata !"omnipotent char", metadata !34, i64 0}
@@ -140,4 +140,4 @@
 !39 = metadata !{i32 20, i32 0, metadata !4, null}
 !40 = metadata !{i32 21, i32 0, metadata !4, null}
 !41 = metadata !{i32 22, i32 0, metadata !4, null}
-!42 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!42 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/SampleProfile/calls.ll b/test/Transforms/SampleProfile/calls.ll
index 381be87..c39472b 100644
--- a/test/Transforms/SampleProfile/calls.ll
+++ b/test/Transforms/SampleProfile/calls.ll

@@ -15,7 +15,12 @@
 ;   printf("sum is %d\n", s);
 ;   return 0;
 ; }
-
+;
+; Note that this test is missing the llvm.dbg.cu annotation. This emulates
+; the effect of the user having only used -fprofile-sample-use without
+; -gmlt when invoking the driver. In those cases, we need to track source
+; location information but we do not have to generate debug info in the
+; final binary.
 @.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
 
 ; Function Attrs: nounwind uwtable
@@ -84,33 +89,32 @@
 
 declare i32 @printf(i8*, ...) #2
 
-!llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [./calls.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [./calls.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"calls.cc", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"sum", metadata !"sum", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i32)* @_Z3sumii, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [sum]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [./calls.cc]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!4 = metadata !{metadata !"0x2e\00sum\00sum\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, i32 (i32, i32)* @_Z3sumii, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [sum]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [./calls.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00main\00main\00\007\000\001\000\006\00256\000\007", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5 "}
 !11 = metadata !{i32 4, i32 0, metadata !4, null}
-!12 = metadata !{i32 8, i32 0, metadata !7, null} ; [ DW_TAG_imported_declaration ]
+!12 = metadata !{i32 8, i32 0, metadata !7, null}
 !13 = metadata !{i32 9, i32 0, metadata !7, null}
 !14 = metadata !{i32 9, i32 0, metadata !15, null}
-!15 = metadata !{i32 786443, metadata !1, metadata !7, i32 9, i32 0, i32 1, i32 1} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!15 = metadata !{metadata !"0xb\001", metadata !1, metadata !7} ; [ DW_TAG_lexical_block ] [./calls.cc]
 !16 = metadata !{i32 10, i32 0, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !1, metadata !7, i32 10, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!17 = metadata !{metadata !"0xb\0010\000\000", metadata !1, metadata !7} ; [ DW_TAG_lexical_block ] [./calls.cc]
 !18 = metadata !{i32 10, i32 0, metadata !19, null}
-!19 = metadata !{i32 786443, metadata !1, metadata !17, i32 10, i32 0, i32 1, i32 2} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!19 = metadata !{metadata !"0xb\001", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [./calls.cc]
 !20 = metadata !{i32 10, i32 0, metadata !21, null}
-!21 = metadata !{i32 786443, metadata !1, metadata !17, i32 10, i32 0, i32 2, i32 3} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!21 = metadata !{metadata !"0xb\002", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [./calls.cc]
 !22 = metadata !{i32 10, i32 0, metadata !23, null}
-!23 = metadata !{i32 786443, metadata !1, metadata !17, i32 10, i32 0, i32 3, i32 4} ; [ DW_TAG_lexical_block ] [./calls.cc]
+!23 = metadata !{metadata !"0xb\003", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [./calls.cc]
 !24 = metadata !{i32 11, i32 0, metadata !7, null}
 !25 = metadata !{i32 12, i32 0, metadata !7, null}

diff --git a/test/Transforms/SampleProfile/discriminator.ll b/test/Transforms/SampleProfile/discriminator.ll
index 0f773a5..73c73d1 100644
--- a/test/Transforms/SampleProfile/discriminator.ll
+++ b/test/Transforms/SampleProfile/discriminator.ll

@@ -66,25 +66,25 @@
 !llvm.module.flags = !{!7, !8}
 !llvm.ident = !{!9}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [discriminator.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [discriminator.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"discriminator.c", metadata !"."}
 !2 = metadata !{}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [discriminator.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\001\000\001\000\006\00256\000\001", metadata !1, metadata !5, metadata !6, null, i32 (i32)* @foo, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [discriminator.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !9 = metadata !{metadata !"clang version 3.5 "}
 !10 = metadata !{i32 2, i32 0, metadata !4, null}
 !11 = metadata !{i32 3, i32 0, metadata !4, null}
 !12 = metadata !{i32 3, i32 0, metadata !13, null}
-!13 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 0, i32 1, i32 2} ; [ DW_TAG_lexical_block ] [discriminator.c]
+!13 = metadata !{metadata !"0xb\001", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [discriminator.c]
 !14 = metadata !{i32 4, i32 0, metadata !15, null}
-!15 = metadata !{i32 786443, metadata !1, metadata !16, i32 4, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [discriminator.c]
-!16 = metadata !{i32 786443, metadata !1, metadata !4, i32 3, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [discriminator.c]
+!15 = metadata !{metadata !"0xb\004\000\001", metadata !1, metadata !16} ; [ DW_TAG_lexical_block ] [discriminator.c]
+!16 = metadata !{metadata !"0xb\003\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [discriminator.c]
 !17 = metadata !{i32 4, i32 0, metadata !18, null}
-!18 = metadata !{i32 786443, metadata !1, metadata !15, i32 4, i32 0, i32 1, i32 3} ; [ DW_TAG_lexical_block ] [discriminator.c]
+!18 = metadata !{metadata !"0xb\001", metadata !1, metadata !15} ; [ DW_TAG_lexical_block ] [discriminator.c]
 !19 = metadata !{i32 5, i32 0, metadata !16, null}
 !20 = metadata !{i32 6, i32 0, metadata !16, null}
 !21 = metadata !{i32 7, i32 0, metadata !4, null}

diff --git a/test/Transforms/SampleProfile/fnptr.ll b/test/Transforms/SampleProfile/fnptr.ll
new file mode 100644
index 0000000..f78123c
--- /dev/null
+++ b/test/Transforms/SampleProfile/fnptr.ll

@@ -0,0 +1,155 @@
+; The two profiles used in this test are the same but encoded in different
+; formats. This checks that we produce the same profile annotations regardless
+; of the profile format.
+;
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/fnptr.prof | opt -analyze -branch-prob | FileCheck %s
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/fnptr.binprof | opt -analyze -branch-prob | FileCheck %s
+
+; CHECK:   edge for.body3 -> if.then probability is 534 / 2598 = 20.5543%
+; CHECK:   edge for.body3 -> if.else probability is 2064 / 2598 = 79.4457%
+; CHECK:   edge for.inc -> for.inc12 probability is 1052 / 2598 = 40.4927%
+; CHECK:   edge for.inc -> for.body3 probability is 1546 / 2598 = 59.5073%
+; CHECK:   edge for.inc12 -> for.end14 probability is 518 / 1052 = 49.2395%
+; CHECK:   edge for.inc12 -> for.cond1.preheader probability is 534 / 1052 = 50.7605%
+
+; Original C++ test case.
+;
+; #include <stdlib.h>
+; #include <math.h>
+; #include <stdio.h>
+;
+; #define N 10000
+; #define M 6000
+;
+; double foo(int x) {
+;   return x * sin((double)x);
+; }
+;
+; double bar(int x) {
+;   return x - cos((double)x);
+; }
+;
+; int main() {
+;   double (*fptr)(int);
+;   double S = 0;
+;   for (int i = 0; i < N; i++)
+;     for (int j = 0; j < M; j++) {
+;       fptr = (rand() % 100 < 30) ? foo : bar;
+;       if (rand() % 100 < 10)
+;         S += (*fptr)(i + j * 300);
+;       else
+;         S += (*fptr)(i - j / 840);
+;     }
+;   printf("S = %lf\n", S);
+;   return 0;
+; }
+
+@.str = private unnamed_addr constant [9 x i8] c"S = %lf\0A\00", align 1
+
+define double @_Z3fooi(i32 %x) #0 {
+entry:
+  %conv = sitofp i32 %x to double, !dbg !2
+  %call = tail call double @sin(double %conv) #3, !dbg !8
+  %mul = fmul double %conv, %call, !dbg !8
+  ret double %mul, !dbg !8
+}
+
+declare double @sin(double) #1
+
+define double @_Z3bari(i32 %x) #0 {
+entry:
+  %conv = sitofp i32 %x to double, !dbg !9
+  %call = tail call double @cos(double %conv) #3, !dbg !11
+  %sub = fsub double %conv, %call, !dbg !11
+  ret double %sub, !dbg !11
+}
+
+declare double @cos(double) #1
+
+define i32 @main() #2 {
+entry:
+  br label %for.cond1.preheader, !dbg !12
+
+for.cond1.preheader:                              ; preds = %for.inc12, %entry
+  %i.025 = phi i32 [ 0, %entry ], [ %inc13, %for.inc12 ]
+  %S.024 = phi double [ 0.000000e+00, %entry ], [ %S.2.lcssa, %for.inc12 ]
+  br label %for.body3, !dbg !14
+
+for.body3:                                        ; preds = %for.inc, %for.cond1.preheader
+  %j.023 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %for.inc ]
+  %S.122 = phi double [ %S.024, %for.cond1.preheader ], [ %S.2, %for.inc ]
+  %call = tail call i32 @rand() #3, !dbg !15
+  %rem = srem i32 %call, 100, !dbg !15
+  %cmp4 = icmp slt i32 %rem, 30, !dbg !15
+  %_Z3fooi._Z3bari = select i1 %cmp4, double (i32)* @_Z3fooi, double (i32)* @_Z3bari, !dbg !15
+  %call5 = tail call i32 @rand() #3, !dbg !16
+  %rem6 = srem i32 %call5, 100, !dbg !16
+  %cmp7 = icmp slt i32 %rem6, 10, !dbg !16
+  br i1 %cmp7, label %if.then, label %if.else, !dbg !16, !prof !17
+
+if.then:                                          ; preds = %for.body3
+  %mul = mul nsw i32 %j.023, 300, !dbg !18
+  %add = add nsw i32 %mul, %i.025, !dbg !18
+  %call8 = tail call double %_Z3fooi._Z3bari(i32 %add), !dbg !18
+  br label %for.inc, !dbg !18
+
+if.else:                                          ; preds = %for.body3
+  %div = sdiv i32 %j.023, 840, !dbg !19
+  %sub = sub nsw i32 %i.025, %div, !dbg !19
+  %call10 = tail call double %_Z3fooi._Z3bari(i32 %sub), !dbg !19
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.else
+  %call8.pn = phi double [ %call8, %if.then ], [ %call10, %if.else ]
+  %S.2 = fadd double %S.122, %call8.pn, !dbg !18
+  %inc = add nsw i32 %j.023, 1, !dbg !20
+  %exitcond = icmp eq i32 %j.023, 5999, !dbg !14
+  br i1 %exitcond, label %for.inc12, label %for.body3, !dbg !14, !prof !21
+
+for.inc12:                                        ; preds = %for.inc
+  %S.2.lcssa = phi double [ %S.2, %for.inc ]
+  %inc13 = add nsw i32 %i.025, 1, !dbg !22
+  %exitcond26 = icmp eq i32 %i.025, 9999, !dbg !12
+  br i1 %exitcond26, label %for.end14, label %for.cond1.preheader, !dbg !12, !prof !23
+
+for.end14:                                        ; preds = %for.inc12
+  %S.2.lcssa.lcssa = phi double [ %S.2.lcssa, %for.inc12 ]
+  %call15 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i64 0, i64 0), double %S.2.lcssa.lcssa), !dbg !24
+  ret i32 0, !dbg !25
+}
+
+; Function Attrs: nounwind
+declare i32 @rand() #1
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture readonly, ...) #1
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = metadata !{i32 2, metadata !"Debug Info Version", i32 2}
+!1 = metadata !{metadata !"clang version 3.6.0 "}
+!2 = metadata !{i32 9, i32 3, metadata !3, null}
+!3 = metadata !{metadata !"0x2e\00foo\00foo\00\008\000\001\000\000\00256\001\008", metadata !4, metadata !5, metadata !6, null, double (i32)* @_Z3fooi, null, null, metadata !7} ; [ DW_TAG_subprogram ] [line 8] [def] [foo]
+!4 = metadata !{metadata !"fnptr.cc", metadata !"."}
+!5 = metadata !{metadata !"0x29", metadata !4}    ; [ DW_TAG_file_type ] [./fnptr.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", null, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{}
+!8 = metadata !{i32 9, i32 14, metadata !3, null}
+!9 = metadata !{i32 13, i32 3, metadata !10, null}
+!10 = metadata !{metadata !"0x2e\00bar\00bar\00\0012\000\001\000\000\00256\001\0012", metadata !4, metadata !5, metadata !6, null, double (i32)* @_Z3bari, null, null, metadata !7} ; [ DW_TAG_subprogram ] [line 12] [def] [bar]
+!11 = metadata !{i32 13, i32 14, metadata !10, null}
+!12 = metadata !{i32 19, i32 3, metadata !13, null}
+!13 = metadata !{metadata !"0x2e\00main\00main\00\0016\000\001\000\000\00256\001\0016", metadata !4, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !7} ; [ DW_TAG_subprogram ] [line 16] [def] [main]
+!14 = metadata !{i32 20, i32 5, metadata !13, null}
+!15 = metadata !{i32 21, i32 15, metadata !13, null}
+!16 = metadata !{i32 22, i32 11, metadata !13, null}
+!17 = metadata !{metadata !"branch_weights", i32 534, i32 2064}
+!18 = metadata !{i32 23, i32 14, metadata !13, null}
+!19 = metadata !{i32 25, i32 14, metadata !13, null}
+!20 = metadata !{i32 20, i32 28, metadata !13, null}
+!21 = metadata !{metadata !"branch_weights", i32 0, i32 1075}
+!22 = metadata !{i32 19, i32 26, metadata !13, null}
+!23 = metadata !{metadata !"branch_weights", i32 0, i32 534}
+!24 = metadata !{i32 27, i32 3, metadata !13, null}
+!25 = metadata !{i32 28, i32 3, metadata !13, null}

diff --git a/test/Transforms/SampleProfile/propagate.ll b/test/Transforms/SampleProfile/propagate.ll
index 939361b..9ee8ec5 100644
--- a/test/Transforms/SampleProfile/propagate.ll
+++ b/test/Transforms/SampleProfile/propagate.ll

@@ -198,39 +198,39 @@
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.5 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [propagate.cc] [DW_LANG_C_plus_plus]
+!0 = metadata !{metadata !"0x11\004\00clang version 3.5 \000\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [propagate.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"propagate.cc", metadata !"."}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4, metadata !7}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i64 (i32, i32, i64)* @_Z3fooiil, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [propagate.cc]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 24, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [main]
+!4 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\00256\000\003", metadata !1, metadata !5, metadata !6, null, i64 (i32, i32, i64)* @_Z3fooiil, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 3] [def] [foo]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [propagate.cc]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !"0x2e\00main\00main\00\0024\000\001\000\006\00256\000\0024", metadata !1, metadata !5, metadata !6, null, i32 ()* @main, null, null, metadata !2} ; [ DW_TAG_subprogram ] [line 24] [def] [main]
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
-!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}
 !10 = metadata !{metadata !"clang version 3.5 "}
 !11 = metadata !{i32 4, i32 0, metadata !12, null}
-!12 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!12 = metadata !{metadata !"0xb\004\000\000", metadata !1, metadata !4} ; [ DW_TAG_lexical_block ] [propagate.cc]
 !13 = metadata !{i32 5, i32 0, metadata !14, null}
-!14 = metadata !{i32 786443, metadata !1, metadata !12, i32 4, i32 0, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!14 = metadata !{metadata !"0xb\004\000\000", metadata !1, metadata !12} ; [ DW_TAG_lexical_block ] [propagate.cc]
 !15 = metadata !{i32 7, i32 0, metadata !16, null}
-!16 = metadata !{i32 786443, metadata !1, metadata !17, i32 7, i32 0, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!17 = metadata !{i32 786443, metadata !1, metadata !12, i32 6, i32 0, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!18 = metadata !{i32 8, i32 0, metadata !19, null} ; [ DW_TAG_imported_declaration ]
-!19 = metadata !{i32 786443, metadata !1, metadata !20, i32 8, i32 0, i32 0, i32 5} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!20 = metadata !{i32 786443, metadata !1, metadata !16, i32 7, i32 0, i32 0, i32 4} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!16 = metadata !{metadata !"0xb\007\000\000", metadata !1, metadata !17} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!17 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !12} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!18 = metadata !{i32 8, i32 0, metadata !19, null}
+!19 = metadata !{metadata !"0xb\008\000\000", metadata !1, metadata !20} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!20 = metadata !{metadata !"0xb\007\000\000", metadata !1, metadata !16} ; [ DW_TAG_lexical_block ] [propagate.cc]
 !21 = metadata !{i32 9, i32 0, metadata !19, null}
 !22 = metadata !{i32 10, i32 0, metadata !23, null}
-!23 = metadata !{i32 786443, metadata !1, metadata !20, i32 10, i32 0, i32 0, i32 6} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!23 = metadata !{metadata !"0xb\0010\000\000", metadata !1, metadata !20} ; [ DW_TAG_lexical_block ] [propagate.cc]
 !24 = metadata !{i32 11, i32 0, metadata !25, null}
-!25 = metadata !{i32 786443, metadata !1, metadata !23, i32 10, i32 0, i32 0, i32 7} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!25 = metadata !{metadata !"0xb\0010\000\000", metadata !1, metadata !23} ; [ DW_TAG_lexical_block ] [propagate.cc]
 !26 = metadata !{i32 12, i32 0, metadata !25, null}
 !27 = metadata !{i32 13, i32 0, metadata !25, null}
 !28 = metadata !{i32 14, i32 0, metadata !29, null}
-!29 = metadata !{i32 786443, metadata !1, metadata !30, i32 14, i32 0, i32 0, i32 9} ; [ DW_TAG_lexical_block ] [propagate.cc]
-!30 = metadata !{i32 786443, metadata !1, metadata !23, i32 13, i32 0, i32 0, i32 8} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!29 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !30} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!30 = metadata !{metadata !"0xb\0013\000\000", metadata !1, metadata !23} ; [ DW_TAG_lexical_block ] [propagate.cc]
 !31 = metadata !{i32 15, i32 0, metadata !32, null}
-!32 = metadata !{i32 786443, metadata !1, metadata !29, i32 14, i32 0, i32 0, i32 10} ; [ DW_TAG_lexical_block ] [propagate.cc]
+!32 = metadata !{metadata !"0xb\0014\000\000", metadata !1, metadata !29} ; [ DW_TAG_lexical_block ] [propagate.cc]
 !33 = metadata !{i32 16, i32 0, metadata !32, null}
 !34 = metadata !{i32 17, i32 0, metadata !32, null}
 !35 = metadata !{i32 19, i32 0, metadata !20, null}

diff --git a/test/Transforms/SampleProfile/syntax.ll b/test/Transforms/SampleProfile/syntax.ll
index 53c65f4..ed38a17 100644
--- a/test/Transforms/SampleProfile/syntax.ll
+++ b/test/Transforms/SampleProfile/syntax.ll

@@ -1,4 +1,4 @@
-; RUN: not opt < %s -sample-profile -sample-profile-file=%S/Inputs/syntax.prof 2>&1 | FileCheck -check-prefix=NO-DEBUG %s
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/syntax.prof 2>&1 | FileCheck -check-prefix=NO-DEBUG %s
 ; RUN: not opt < %s -sample-profile -sample-profile-file=missing.prof 2>&1 | FileCheck -check-prefix=MISSING-FILE %s
 ; RUN: not opt < %s -sample-profile -sample-profile-file=%S/Inputs/bad_fn_header.prof 2>&1 | FileCheck -check-prefix=BAD-FN-HEADER %s
 ; RUN: not opt < %s -sample-profile -sample-profile-file=%S/Inputs/bad_sample_line.prof 2>&1 | FileCheck -check-prefix=BAD-SAMPLE-LINE %s
@@ -11,8 +11,8 @@
 entry:
   ret void
 }
-; NO-DEBUG: error: No debug information found in function empty
-; MISSING-FILE: error: missing.prof:
+; NO-DEBUG: warning: No debug information found in function empty: Function profile not used
+; MISSING-FILE: missing.prof: Could not open profile:
 ; BAD-FN-HEADER: error: {{.*}}bad_fn_header.prof:1: Expected 'mangled_name:NUM:NUM', found 3empty:100:BAD
 ; BAD-SAMPLE-LINE: error: {{.*}}bad_sample_line.prof:3: Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found 1: BAD
 ; BAD-LINE-VALUES: error: {{.*}}bad_line_values.prof:2: Expected 'mangled_name:NUM:NUM', found -1: 10

diff --git a/test/Transforms/ScalarRepl/debuginfo-preserved.ll b/test/Transforms/ScalarRepl/debuginfo-preserved.ll
index 71bf22a..eb660d2 100644
--- a/test/Transforms/ScalarRepl/debuginfo-preserved.ll
+++ b/test/Transforms/ScalarRepl/debuginfo-preserved.ll

@@ -17,10 +17,10 @@
   %b.addr = alloca i32, align 4
   %c = alloca i32, align 4
   store i32 %a, i32* %a.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !6), !dbg !7
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !6, metadata !{}), !dbg !7
   store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !8), !dbg !9
-  call void @llvm.dbg.declare(metadata !{i32* %c}, metadata !10), !dbg !12
+  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !8, metadata !{}), !dbg !9
+  call void @llvm.dbg.declare(metadata !{i32* %c}, metadata !10, metadata !{}), !dbg !12
   %tmp = load i32* %a.addr, align 4, !dbg !13
   store i32 %tmp, i32* %c, align 4, !dbg !13
   %tmp1 = load i32* %a.addr, align 4, !dbg !14
@@ -37,23 +37,23 @@
   ret i32 %add7, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{i32 786449, metadata !18, i32 12, metadata !"clang version 3.0 (trunk 131941)", i1 false, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i32)* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
-!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.0 (trunk 131941)\000\00\000\00\000", metadata !18, metadata !19, metadata !19, metadata !17, null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"0x2e\00f\00f\00\001\000\001\000\006\00256\000\001", metadata !18, metadata !2, metadata !3, null, i32 (i32, i32)* @f, null, null, null} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!2 = metadata !{metadata !"0x29", metadata !18} ; [ DW_TAG_file_type ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !18, metadata !2, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 16777217, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !0} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00a\0016777217\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
 !7 = metadata !{i32 1, i32 11, metadata !1, null}
-!8 = metadata !{i32 786689, metadata !1, metadata !"b", metadata !2, i32 33554433, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{metadata !"0x101\00b\0033554433\000", metadata !1, metadata !2, metadata !5} ; [ DW_TAG_arg_variable ]
 !9 = metadata !{i32 1, i32 18, metadata !1, null}
-!10 = metadata !{i32 786688, metadata !11, metadata !"c", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !18, metadata !1, i32 1, i32 21, i32 0} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{metadata !"0x100\00c\002\000", metadata !11, metadata !2, metadata !5} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{metadata !"0xb\001\0021\000", metadata !18, metadata !1} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{i32 2, i32 9, metadata !11, null}
 !13 = metadata !{i32 2, i32 14, metadata !11, null}
 !14 = metadata !{i32 3, i32 5, metadata !11, null}
@@ -62,4 +62,4 @@
 !17 = metadata !{metadata !1}
 !18 = metadata !{metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b"}
 !19 = metadata !{i32 0}
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/Scalarizer/dbginfo.ll b/test/Transforms/Scalarizer/dbginfo.ll
index 546e89d..ee7182b 100644
--- a/test/Transforms/Scalarizer/dbginfo.ll
+++ b/test/Transforms/Scalarizer/dbginfo.ll

@@ -16,9 +16,9 @@
 ; CHECK: %b.i1 = getelementptr i32* %b.i0, i32 1
 ; CHECK: %b.i2 = getelementptr i32* %b.i0, i32 2
 ; CHECK: %b.i3 = getelementptr i32* %b.i0, i32 3
-; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %a}, i64 0, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}}
-; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %b}, i64 0, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}}
-; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %c}, i64 0, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}}
+; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %a}, i64 0, metadata !{{[0-9]+}}, metadata {{.*}}), !dbg !{{[0-9]+}}
+; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %b}, i64 0, metadata !{{[0-9]+}}, metadata {{.*}}), !dbg !{{[0-9]+}}
+; CHECK: tail call void @llvm.dbg.value(metadata !{<4 x i32>* %c}, i64 0, metadata !{{[0-9]+}}, metadata {{.*}}), !dbg !{{[0-9]+}}
 ; CHECK: %bval.i0 = load i32* %b.i0, align 16, !dbg ![[TAG1:[0-9]+]], !tbaa ![[TAG2:[0-9]+]]
 ; CHECK: %bval.i1 = load i32* %b.i1, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]]
 ; CHECK: %bval.i2 = load i32* %b.i2, align 8, !dbg ![[TAG1]], !tbaa ![[TAG2]]
@@ -37,9 +37,9 @@
 ; CHECK: store i32 %add.i3, i32* %a.i3, align 4, !dbg ![[TAG1]], !tbaa ![[TAG2]]
 ; CHECK: ret void
 entry:
-  tail call void @llvm.dbg.value(metadata !{<4 x i32>* %a}, i64 0, metadata !15), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{<4 x i32>* %b}, i64 0, metadata !16), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{<4 x i32>* %c}, i64 0, metadata !17), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{<4 x i32>* %a}, i64 0, metadata !15, metadata !{}), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{<4 x i32>* %b}, i64 0, metadata !16, metadata !{}), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{<4 x i32>* %c}, i64 0, metadata !17, metadata !{}), !dbg !20
   %bval = load <4 x i32>* %b, align 16, !dbg !21, !tbaa !22
   %cval = load <4 x i32>* %c, align 16, !dbg !21, !tbaa !22
   %add = add <4 x i32> %bval, %cval, !dbg !21
@@ -48,7 +48,7 @@
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
@@ -57,24 +57,24 @@
 !llvm.module.flags = !{!18, !26}
 !llvm.ident = !{!19}
 
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 194134) (llvm/trunk 194126)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/richards/llvm/build//tmp/add.c] [DW_LANG_C99]
+!0 = metadata !{metadata !"0x11\0012\00clang version 3.4 (trunk 194134) (llvm/trunk 194126)\001\00\000\00\000", metadata !1, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2} ; [ DW_TAG_compile_unit ] [/home/richards/llvm/build//tmp/add.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"/tmp/add.c", metadata !"/home/richards/llvm/build"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f1", metadata !"f1", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @f1, null, null, metadata !14, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [f]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/richards/llvm/build//tmp/add.c]
-!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !"0x2e\00f1\00f1\00\003\000\001\000\006\00256\001\004", metadata !1, metadata !5, metadata !6, null, void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @f1, null, null, metadata !14} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [f]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/home/richards/llvm/build//tmp/add.c]
+!6 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", i32 0, null, null, metadata !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8, metadata !8, metadata !8}
-!8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from V4SI]
-!9 = metadata !{i32 786454, metadata !1, null, metadata !"V4SI", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ] [V4SI] [line 1, size 0, align 0, offset 0] [from ]
-!10 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 2048, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{metadata !"0xf\00\000\0064\0064\000\000", null, null, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from V4SI]
+!9 = metadata !{metadata !"0x16\00V4SI\001\000\000\000\000", metadata !1, null, metadata !10} ; [ DW_TAG_typedef ] [V4SI] [line 1, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !"0x1\00\000\00128\00128\000\002048", null, null, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
+!11 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13}
-!13 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
+!13 = metadata !{metadata !"0x21\000\004"}        ; [ DW_TAG_subrange_type ] [0, 3]
 !14 = metadata !{metadata !15, metadata !16, metadata !17}
-!15 = metadata !{i32 786689, metadata !4, metadata !"a", metadata !5, i32 16777219, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 3]
-!16 = metadata !{i32 786689, metadata !4, metadata !"b", metadata !5, i32 33554435, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 3]
-!17 = metadata !{i32 786689, metadata !4, metadata !"c", metadata !5, i32 50331651, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 3]
+!15 = metadata !{metadata !"0x101\00a\0016777219\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!16 = metadata !{metadata !"0x101\00b\0033554435\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [b] [line 3]
+!17 = metadata !{metadata !"0x101\00c\0050331651\000", metadata !4, metadata !5, metadata !8} ; [ DW_TAG_arg_variable ] [c] [line 3]
 !18 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
 !19 = metadata !{metadata !"clang version 3.4 (trunk 194134) (llvm/trunk 194126)"}
 !20 = metadata !{i32 3, i32 0, metadata !4, null}
@@ -83,4 +83,4 @@
 !23 = metadata !{metadata !"omnipotent char", metadata !24, i64 0}
 !24 = metadata !{metadata !"Simple C/C++ TBAA"}
 !25 = metadata !{i32 6, i32 0, metadata !4, null}
-!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
index c07440c..d054a3b 100644
--- a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll
+++ b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll

@@ -45,7 +45,7 @@
   ret void
 }
 ; PTX-LABEL: sum_of_array(
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rl|r)[0-9]+]]{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
 ; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
 ; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
 ; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
@@ -88,7 +88,7 @@
   ret void
 }
 ; PTX-LABEL: sum_of_array2(
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rl|r)[0-9]+]]{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
 ; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
 ; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
 ; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
@@ -99,8 +99,17 @@
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33
 
-; Similar to @sum_of_array3, but extends array indices using zext instead of
-; sext. e.g., array[zext(x + 1)][zext(y + 1)].
+
+; This function loads
+;   array[zext(x)][zext(y)]
+;   array[zext(x)][zext(y +nuw 1)]
+;   array[zext(x +nuw 1)][zext(y)]
+;   array[zext(x +nuw 1)][zext(y +nuw 1)].
+;
+; This function is similar to @sum_of_array, but it
+; 1) extends array indices using zext instead of sext;
+; 2) annotates the addition with "nuw"; otherwise, zext(x + 1) => zext(x) + 1
+;    may be invalid.
 define void @sum_of_array3(i32 %x, i32 %y, float* nocapture %output) {
 .preheader:
   %0 = zext i32 %y to i64
@@ -109,13 +118,13 @@
   %3 = addrspacecast float addrspace(3)* %2 to float*
   %4 = load float* %3, align 4
   %5 = fadd float %4, 0.000000e+00
-  %6 = add i32 %y, 1
+  %6 = add nuw i32 %y, 1
   %7 = zext i32 %6 to i64
   %8 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %7
   %9 = addrspacecast float addrspace(3)* %8 to float*
   %10 = load float* %9, align 4
   %11 = fadd float %5, %10
-  %12 = add i32 %x, 1
+  %12 = add nuw i32 %x, 1
   %13 = zext i32 %12 to i64
   %14 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %0
   %15 = addrspacecast float addrspace(3)* %14 to float*
@@ -129,7 +138,7 @@
   ret void
 }
 ; PTX-LABEL: sum_of_array3(
-; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rl|r)[0-9]+]]{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
 ; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
 ; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
 ; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
@@ -139,3 +148,49 @@
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 1
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32
 ; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33
+
+
+; This function loads
+;   array[zext(x)][zext(y)]
+;   array[zext(x)][zext(y) + 1]
+;   array[zext(x) + 1][zext(y)]
+;   array[zext(x) + 1][zext(y) + 1].
+;
+; We expect the generated code to reuse the computation of
+; &array[zext(x)][zext(y)]. See the expected IR and PTX for details.
+define void @sum_of_array4(i32 %x, i32 %y, float* nocapture %output) {
+.preheader:
+  %0 = zext i32 %y to i64
+  %1 = zext i32 %x to i64
+  %2 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0
+  %3 = addrspacecast float addrspace(3)* %2 to float*
+  %4 = load float* %3, align 4
+  %5 = fadd float %4, 0.000000e+00
+  %6 = add i64 %0, 1
+  %7 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %6
+  %8 = addrspacecast float addrspace(3)* %7 to float*
+  %9 = load float* %8, align 4
+  %10 = fadd float %5, %9
+  %11 = add i64 %1, 1
+  %12 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %0
+  %13 = addrspacecast float addrspace(3)* %12 to float*
+  %14 = load float* %13, align 4
+  %15 = fadd float %10, %14
+  %16 = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %6
+  %17 = addrspacecast float addrspace(3)* %16 to float*
+  %18 = load float* %17, align 4
+  %19 = fadd float %15, %18
+  store float %19, float* %output, align 4
+  ret void
+}
+; PTX-LABEL: sum_of_array4(
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}}
+; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}}
+
+; IR-LABEL: @sum_of_array4(
+; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
+; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 1
+; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 32
+; IR: getelementptr float addrspace(3)* [[BASE_PTR]], i64 33

diff --git a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
index ed40c7e..ea0d3f5 100644
--- a/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
+++ b/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll

@@ -235,27 +235,45 @@
 ; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array
 ; CHECK-NOT: getelementptr
 
-; if zext(a + b) <= max signed value of typeof(a + b), then we can prove
-; a + b >= 0 and zext(a + b) == sext(a + b). If we can prove further a or b is
-; non-negative, we have zext(a + b) == sext(a) + sext(b).
-define float* @inbounds_zext_add(i32 %i, i4 %j) {
+; The code that rebuilds an OR expression used to be buggy, and failed on this
+; test.
+define float* @shl_add_or(i64 %a, float* %ptr) {
+; CHECK-LABEL: @shl_add_or(
 entry:
-  %0 = add i32 %i, 1
-  %1 = zext i32 %0 to i64
-  ; Because zext(i + 1) is an index of an in bounds GEP based on
-  ; float_2d_array, zext(i + 1) <= sizeof(float_2d_array) = 4096.
-  ; Furthermore, since typeof(i + 1) is i32 and 4096 < 2^31, we are sure the
-  ; sign bit of i + 1 is 0. This implies zext(i + 1) = sext(i + 1).
-  %2 = add i4 %j, 2
-  %3 = zext i4 %2 to i64
-  ; In this case, typeof(j + 2) is i4, so zext(j + 2) <= 4096 does not imply
-  ; the sign bit of j + 2 is 0.
-  %p = getelementptr inbounds [32 x [32 x float]]* @float_2d_array, i64 0, i64 %1, i64 %3
+  %shl = shl i64 %a, 2
+  %add = add i64 %shl, 12
+  %or = or i64 %add, 1
+; CHECK: [[OR:%or[0-9]*]] = add i64 %shl, 1
+  ; ((a << 2) + 12) and 1 have no common bits. Therefore,
+  ; SeparateConstOffsetFromGEP is able to extract the 12.
+  ; TODO(jingyue): We could reassociate the expression to combine 12 and 1.
+  %p = getelementptr float* %ptr, i64 %or
+; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr float* %ptr, i64 [[OR]]
+; CHECK: getelementptr float* [[PTR]], i64 12
   ret float* %p
+; CHECK-NEXT: ret
 }
-; CHECK-LABEL: @inbounds_zext_add(
+
+; The source code used to be buggy in checking
+; (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0)
+; where AccumulativeByteOffset is signed but ElementTypeSizeOfGEP is unsigned.
+; The compiler would promote AccumulativeByteOffset to unsigned, causing
+; unexpected results. For example, while -64 % (int64_t)24 != 0,
+; -64 % (uint64_t)24 == 0.
+%struct3 = type { i64, i32 }
+%struct2 = type { %struct3, i32 }
+%struct1 = type { i64, %struct2 }
+%struct0 = type { i32, i32, i64*, [100 x %struct1] }
+define %struct2* @sign_mod_unsign(%struct0* %ptr, i64 %idx) {
+; CHECK-LABEL: @sign_mod_unsign(
+entry:
+  %arrayidx = add nsw i64 %idx, -2
 ; CHECK-NOT: add
-; CHECK: add i4 %j, 2
-; CHECK: sext
-; CHECK: getelementptr [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-; CHECK: getelementptr float* %{{[a-zA-Z0-9]+}}, i64 32
+  %ptr2 = getelementptr inbounds %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1
+; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr %struct0* %ptr, i64 0, i32 3, i64 %idx, i32 1
+; CHECK: [[PTR1:%[a-zA-Z0-9]+]] = bitcast %struct2* [[PTR]] to i8*
+; CHECK: getelementptr i8* [[PTR1]], i64 -64
+; CHECK: bitcast
+  ret %struct2* %ptr2
+; CHECK-NEXT: ret
+}

diff --git a/test/Transforms/SimplifyCFG/UnreachableEliminate.ll b/test/Transforms/SimplifyCFG/UnreachableEliminate.ll
index e1635f4..21428c6 100644
--- a/test/Transforms/SimplifyCFG/UnreachableEliminate.ll
+++ b/test/Transforms/SimplifyCFG/UnreachableEliminate.ll

@@ -47,7 +47,7 @@
 }
 
 ; PR9450
-define i32 @test4(i32 %v) {
+define i32 @test4(i32 %v, i32 %w) {
 ; CHECK: entry:
 ; CHECK-NEXT:  switch i32 %v, label %T [
 ; CHECK-NEXT:    i32 3, label %V
@@ -67,7 +67,54 @@
 default:
         unreachable
 U:
-        ret i32 1
+        ret i32 %w
 T:
         ret i32 2
 }
+
+
+;; We can either convert the following control flow to a select or remove the
+;; unreachable control flow, because storing through a null pointer is undefined
+;; behavior. Make sure we do the latter.
+
+define void @test5(i1 %cond, i8* %ptr) {
+
+; CHECK-LABEL: test5
+; CHECK: entry:
+; CHECK-NOT: select
+; CHECK:  store i8 2, i8* %ptr
+; CHECK:  ret
+
+entry:
+  br i1 %cond, label %bb1, label %bb3
+
+bb3:
+ br label %bb2
+
+bb1:
+ br label %bb2
+
+bb2:
+  %ptr.2 = phi i8* [ %ptr, %bb3 ], [ null, %bb1 ]
+  store i8 2, i8* %ptr.2, align 8
+  ret void
+}
+
+; CHECK-LABEL: test6
+; CHECK: entry:
+; CHECK-NOT: select
+; CHECK:  store i8 2, i8* %ptr
+; CHECK:  ret
+
+define void @test6(i1 %cond, i8* %ptr) {
+entry:
+  br i1 %cond, label %bb1, label %bb2
+
+bb1:
+  br label %bb2
+
+bb2:
+  %ptr.2 = phi i8* [ %ptr, %entry ], [ null, %bb1 ]
+  store i8 2, i8* %ptr.2, align 8
+  ret void
+}

diff --git a/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll b/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll
new file mode 100644
index 0000000..22599b3
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll

@@ -0,0 +1,50 @@
+; RUN: opt -S -simplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s
+; rdar://17887153
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin12.0.0"
+
+; When we have a covered lookup table, make sure we don't delete PHINodes that
+; are cached in PHIs.
+; CHECK-LABEL: @test
+; CHECK: entry:
+; CHECK-NEXT: sub i3 %arg, -4
+; CHECK-NEXT: zext i3 %switch.tableidx to i4
+; CHECK-NEXT: getelementptr inbounds [8 x i64]* @switch.table, i32 0, i4 %switch.tableidx.zext
+; CHECK-NEXT: load i64* %switch.gep
+; CHECK-NEXT: add i64
+; CHECK-NEXT: ret i64
+define i64 @test(i3 %arg) {
+entry:
+  switch i3 %arg, label %Default [
+    i3 -2, label %Label6
+    i3 1, label %Label1
+    i3 2, label %Label2
+    i3 3, label %Label3
+    i3 -4, label %Label4
+    i3 -3, label %Label5
+  ]
+
+Default:
+  %v1 = phi i64 [ 7, %Label6 ], [ 11, %Label5 ], [ 6, %Label4 ], [ 13, %Label3 ], [ 9, %Label2 ], [ 15, %Label1 ], [ 8, %entry ]
+  %v2 = phi i64 [ 0, %Label6 ], [ 0, %Label5 ], [ 0, %Label4 ], [ 0, %Label3 ], [ 0, %Label2 ], [ 0, %Label1 ], [ 0, %entry ]
+  %v3 = add i64 %v1, %v2
+  ret i64 %v3
+
+Label1:
+  br label %Default
+
+Label2:
+  br label %Default
+
+Label3:
+  br label %Default
+
+Label4:
+  br label %Default
+
+Label5:
+  br label %Default
+
+Label6:
+  br label %Default
+}

diff --git a/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll b/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll
new file mode 100644
index 0000000..d0b8ab2
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll

@@ -0,0 +1,41 @@
+; RUN: opt -S -simplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s
+; rdar://17735071
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin12.0.0"
+
+; When tableindex can't fit into i2, we should extend the type to i3.
+; CHECK-LABEL: @_TFO6reduce1E5toRawfS0_FT_Si
+; CHECK: entry:
+; CHECK-NEXT: sub i2 %0, -2
+; CHECK-NEXT: zext i2 %switch.tableidx to i3
+; CHECK-NEXT: getelementptr inbounds [4 x i64]* @switch.table, i32 0, i3 %switch.tableidx.zext
+; CHECK-NEXT: load i64* %switch.gep
+; CHECK-NEXT: ret i64 %switch.load
+define i64 @_TFO6reduce1E5toRawfS0_FT_Si(i2) {
+entry:
+  switch i2 %0, label %1 [
+    i2 0, label %2
+    i2 1, label %3
+    i2 -2, label %4
+    i2 -1, label %5
+  ]
+
+; <label>:1                                       ; preds = %entry
+  unreachable
+
+; <label>:2                                       ; preds = %entry
+  br label %6
+
+; <label>:3                                       ; preds = %entry
+  br label %6
+
+; <label>:4                                       ; preds = %entry
+  br label %6
+
+; <label>:5                                       ; preds = %entry
+  br label %6
+
+; <label>:6                                      ; preds = %5, %4, %3, %2
+  %7 = phi i64 [ 3, %5 ], [ 2, %4 ], [ 1, %3 ], [ 0, %2 ]
+  ret i64 %7
+}

diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index 51ced40..fc22e7e 100644
--- a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll

@@ -856,10 +856,10 @@
 ; CHECK: entry:
 ; CHECK: br i1 %{{.*}}, label %switch.hole_check, label %sw.default
 ; CHECK: switch.hole_check:
-; CHECK-NEXT: %switch.maskindex = trunc i32 %switch.tableidx to i6
-; CHECK-NEXT: %switch.shifted = lshr i6 -17, %switch.maskindex
+; CHECK-NEXT: %switch.maskindex = trunc i32 %switch.tableidx to i8
+; CHECK-NEXT: %switch.shifted = lshr i8 47, %switch.maskindex
 ; The mask is binary 101111.
-; CHECK-NEXT: %switch.lobit = trunc i6 %switch.shifted to i1
+; CHECK-NEXT: %switch.lobit = trunc i8 %switch.shifted to i1
 ; CHECK-NEXT: br i1 %switch.lobit, label %switch.lookup, label %sw.default
 ; CHECK-NOT: switch i32
 }
@@ -895,7 +895,7 @@
 sw.bb2: br label %return
 sw.default: br label %return
 return:
-  %x = phi i32 [ 3, %sw.default ], [ 5, %sw.bb2 ], [ 7, %sw.bb1 ], [ 9, %entry ]
+  %x = phi i32 [ 3, %sw.default ], [ 5, %sw.bb2 ], [ 7, %sw.bb1 ], [ 10, %entry ]
   ret i32 %x
 ; CHECK-LABEL: @threecases(
 ; CHECK-NOT: switch i32
@@ -915,8 +915,12 @@
   %x = phi i32 [ 3, %sw.default ], [ 7, %sw.bb1 ], [ 9, %entry ]
   ret i32 %x
 ; CHECK-LABEL: @twocases(
-; CHECK: switch i32
+; CHECK-NOT: switch i32
 ; CHECK-NOT: @switch.table
+; CHECK: %switch.selectcmp
+; CHECK-NEXT: %switch.select
+; CHECK-NEXT: %switch.selectcmp1
+; CHECK-NEXT: %switch.select2
 }
 
 ; Don't build tables for switches with TLS variables.
@@ -973,3 +977,104 @@
 ; CHECK: switch i32
 ; CHECK-NOT: @switch.table
 }
+
+; We can use linear mapping.
+define i8 @linearmap1(i32 %c) {
+entry:
+  switch i32 %c, label %sw.default [
+    i32 10, label %return
+    i32 11, label %sw.bb1
+    i32 12, label %sw.bb2
+    i32 13, label %sw.bb3
+  ]
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.default: br label %return
+return:
+  %x = phi i8 [ 3, %sw.default ], [ 3, %sw.bb3 ], [ 8, %sw.bb2 ], [ 13, %sw.bb1 ], [ 18, %entry ]
+  ret i8 %x
+; CHECK-LABEL: @linearmap1(
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %c, 10
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.idx.cast = trunc i32 %switch.tableidx to i8
+; CHECK-NEXT: %switch.idx.mult = mul i8 %switch.idx.cast, -5
+; CHECK-NEXT: %switch.offset = add i8 %switch.idx.mult, 18
+; CHECK-NEXT: ret i8 %switch.offset
+}
+
+; Linear mapping in a different configuration.
+define i32 @linearmap2(i8 %c) {
+entry:
+  switch i8 %c, label %sw.default [
+    i8 -10, label %return
+    i8 -11, label %sw.bb1
+    i8 -12, label %sw.bb2
+    i8 -13, label %sw.bb3
+  ]
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.default: br label %return
+return:
+  %x = phi i32 [ 3, %sw.default ], [ 18, %sw.bb3 ], [ 19, %sw.bb2 ], [ 20, %sw.bb1 ], [ 21, %entry ]
+  ret i32 %x
+; CHECK-LABEL: @linearmap2(
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i8 %c, -13
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.idx.cast = zext i8 %switch.tableidx to i32
+; CHECK-NEXT: %switch.offset = add i32 %switch.idx.cast, 18
+; CHECK-NEXT: ret i32 %switch.offset
+}
+
+; Linear mapping with overflows.
+define i8 @linearmap3(i32 %c) {
+entry:
+  switch i32 %c, label %sw.default [
+    i32 10, label %return
+    i32 11, label %sw.bb1
+    i32 12, label %sw.bb2
+    i32 13, label %sw.bb3
+  ]
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.default: br label %return
+return:
+  %x = phi i8 [ 3, %sw.default ], [ 44, %sw.bb3 ], [ -56, %sw.bb2 ], [ 100, %sw.bb1 ], [ 0, %entry ]
+  ret i8 %x
+; CHECK-LABEL: @linearmap3(
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %c, 10
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.idx.cast = trunc i32 %switch.tableidx to i8
+; CHECK-NEXT: %switch.idx.mult = mul i8 %switch.idx.cast, 100
+; CHECK-NEXT: ret i8 %switch.idx.mult
+}
+
+; Linear mapping with multiplier 1 and offset 0.
+define i8 @linearmap4(i32 %c) {
+entry:
+  switch i32 %c, label %sw.default [
+    i32 -2, label %return
+    i32 -1, label %sw.bb1
+    i32 0, label %sw.bb2
+    i32 1, label %sw.bb3
+  ]
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.default: br label %return
+return:
+  %x = phi i8 [ 3, %sw.default ], [ 3, %sw.bb3 ], [ 2, %sw.bb2 ], [ 1, %sw.bb1 ], [ 0, %entry ]
+  ret i8 %x
+; CHECK-LABEL: @linearmap4(
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %c, -2
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.idx.cast = trunc i32 %switch.tableidx to i8
+; CHECK-NEXT: ret i8 %switch.idx.cast
+}
+

diff --git a/test/Transforms/SimplifyCFG/assume.ll b/test/Transforms/SimplifyCFG/assume.ll
new file mode 100644
index 0000000..1d1b96a
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/assume.ll

@@ -0,0 +1,22 @@
+; RUN: opt -simplifycfg -S < %s | FileCheck %s
+
+define void @test1() {
+        call void @llvm.assume(i1 0)
+        ret void
+
+; CHECK-LABEL: @test1
+; CHECK-NOT: llvm.assume
+; CHECK: unreachable
+}
+
+define void @test2() {
+        call void @llvm.assume(i1 undef)
+        ret void
+
+; CHECK-LABEL: @test2
+; CHECK-NOT: llvm.assume
+; CHECK: unreachable
+}
+
+declare void @llvm.assume(i1) nounwind
+

diff --git a/test/Transforms/SimplifyCFG/branch-fold-dbg.ll b/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
index 9d8086c..9235f62 100644
--- a/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
+++ b/test/Transforms/SimplifyCFG/branch-fold-dbg.ll

@@ -25,7 +25,7 @@
 
 BB3:                                              ; preds = %BB2
   %6 = getelementptr inbounds [5 x %0]* @0, i32 0, i32 %0, !dbg !6
-  call void @llvm.dbg.value(metadata !{%0* %6}, i64 0, metadata !7), !dbg !12
+  call void @llvm.dbg.value(metadata !{%0* %6}, i64 0, metadata !7, metadata !{}), !dbg !12
   %7 = icmp eq %0* %6, null, !dbg !13
   br i1 %7, label %BB5, label %BB4, !dbg !13
 
@@ -37,22 +37,22 @@
   ret void, !dbg !14
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !15, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 231, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 231] [def] [scope 0] [foo]
-!1 = metadata !{i32 589865, metadata !15} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 589841, metadata !15, i32 12, metadata !"clang (trunk 129006)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !15, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\00231\000\001\000\006\00256\000\000", metadata !15, metadata !1, metadata !3, null, void (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 231] [def] [scope 0] [foo]
+!1 = metadata !{metadata !"0x29", metadata !15} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang (trunk 129006)\001\00\000\00\000", metadata !15, metadata !4, metadata !4, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !15, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 131, i32 2, metadata !0, null}
 !6 = metadata !{i32 134, i32 2, metadata !0, null}
-!7 = metadata !{i32 590080, metadata !8, metadata !"bar", metadata !1, i32 232, metadata !9, i32 0} ; [ DW_TAG_auto_variable ]
-!8 = metadata !{i32 589835, metadata !15, metadata !0, i32 231, i32 1, i32 3} ; [ DW_TAG_lexical_block ]
-!9 = metadata !{i32 589839, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 589862, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_const_type ]
-!11 = metadata !{i32 589860, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!7 = metadata !{metadata !"0x100\00bar\00232\000", metadata !8, metadata !1, metadata !9} ; [ DW_TAG_auto_variable ]
+!8 = metadata !{metadata !"0xb\00231\001\003", metadata !15, metadata !0} ; [ DW_TAG_lexical_block ]
+!9 = metadata !{metadata !"0xf\00\000\0032\0032\000\000", null, metadata !2, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{metadata !"0x26\00\000\000\000\000\000", null, metadata !2, metadata !11} ; [ DW_TAG_const_type ]
+!11 = metadata !{metadata !"0x24\00unsigned int\000\0032\0032\000\000\007", null, metadata !2} ; [ DW_TAG_base_type ]
 !12 = metadata !{i32 232, i32 40, metadata !8, null}
 !13 = metadata !{i32 234, i32 2, metadata !8, null}
 !14 = metadata !{i32 274, i32 1, metadata !8, null}

diff --git a/test/Transforms/SimplifyCFG/branch-fold-threshold.ll b/test/Transforms/SimplifyCFG/branch-fold-threshold.ll
new file mode 100644
index 0000000..878c0a4
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/branch-fold-threshold.ll

@@ -0,0 +1,28 @@
+; RUN: opt %s -simplifycfg -S | FileCheck %s --check-prefix=NORMAL
+; RUN: opt %s -simplifycfg -S -bonus-inst-threshold=2 | FileCheck %s --check-prefix=AGGRESSIVE
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32* %input) {
+; NORMAL-LABEL: @foo(
+; AGGRESSIVE-LABEL: @foo(
+entry:
+  %cmp = icmp sgt i32 %d, 3
+  br i1 %cmp, label %cond.end, label %lor.lhs.false
+; NORMAL: br i1
+; AGGRESSIVE: br i1
+
+lor.lhs.false:
+  %mul = shl i32 %c, 1
+  %add = add nsw i32 %mul, %a
+  %cmp1 = icmp slt i32 %add, %b
+  br i1 %cmp1, label %cond.false, label %cond.end
+; NORMAL: br i1
+; AGGRESSIVE-NOT: br i1
+
+cond.false:
+  %0 = load i32* %input, align 4
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ %0, %cond.false ], [ 0, %lor.lhs.false ], [ 0, %entry ]
+  ret i32 %cond
+}

diff --git a/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll b/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
index 0547fa9..cc382be 100644
--- a/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
+++ b/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll

@@ -1,8 +1,8 @@
 ; RUN: opt -simplifycfg -S < %s | FileCheck %s
 
 define i32 @foo(i32 %i) nounwind ssp {
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6), !dbg !7
-  call void @llvm.dbg.value(metadata !8, i64 0, metadata !9), !dbg !11
+  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6, metadata !{}), !dbg !7
+  call void @llvm.dbg.value(metadata !8, i64 0, metadata !9, metadata !{}), !dbg !11
   %1 = icmp ne i32 %i, 0, !dbg !12
 ;CHECK: call i32 (...)* @bar()
 ;CHECK-NEXT: llvm.dbg.value
@@ -10,12 +10,12 @@
 
 ; <label>:2                                       ; preds = %0
   %3 = call i32 (...)* @bar(), !dbg !13
-  call void @llvm.dbg.value(metadata !{i32 %3}, i64 0, metadata !9), !dbg !13
+  call void @llvm.dbg.value(metadata !{i32 %3}, i64 0, metadata !9, metadata !{}), !dbg !13
   br label %6, !dbg !15
 
 ; <label>:4                                       ; preds = %0
   %5 = call i32 (...)* @bar(), !dbg !16
-  call void @llvm.dbg.value(metadata !{i32 %5}, i64 0, metadata !9), !dbg !16
+  call void @llvm.dbg.value(metadata !{i32 %5}, i64 0, metadata !9, metadata !{}), !dbg !16
   br label %6, !dbg !18
 
 ; <label>:6                                       ; preds = %4, %2
@@ -23,34 +23,34 @@
   ret i32 %k.0, !dbg !19
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @bar(...)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.module.flags = !{!21}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !20, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
-!1 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !8, metadata !8, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !20, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\002\000\001\000\006\00256\000\000", metadata !20, metadata !1, metadata !3, null, i32 (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!1 = metadata !{metadata !"0x29", metadata !20} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang\001\00\000\00\000", metadata !20, metadata !8, metadata !8, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !20, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 589860, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 590081, metadata !0, metadata !"i", metadata !1, i32 16777218, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", null, metadata !2} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x101\00i\0016777218\000", metadata !0, metadata !1, metadata !5} ; [ DW_TAG_arg_variable ]
 !7 = metadata !{i32 2, i32 13, metadata !0, null}
 !8 = metadata !{i32 0}
-!9 = metadata !{i32 590080, metadata !10, metadata !"k", metadata !1, i32 3, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{i32 589835, metadata !20, metadata !0, i32 2, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
+!9 = metadata !{metadata !"0x100\00k\003\000", metadata !10, metadata !1, metadata !5} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{metadata !"0xb\002\0016\000", metadata !20, metadata !0} ; [ DW_TAG_lexical_block ]
 !11 = metadata !{i32 3, i32 12, metadata !10, null}
 !12 = metadata !{i32 4, i32 3, metadata !10, null}
 !13 = metadata !{i32 5, i32 5, metadata !14, null}
-!14 = metadata !{i32 589835, metadata !20, metadata !10, i32 4, i32 10, i32 1} ; [ DW_TAG_lexical_block ]
+!14 = metadata !{metadata !"0xb\004\0010\001", metadata !20, metadata !10} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 6, i32 3, metadata !14, null}
 !16 = metadata !{i32 7, i32 5, metadata !17, null}
-!17 = metadata !{i32 589835, metadata !20, metadata !10, i32 6, i32 10, i32 2} ; [ DW_TAG_lexical_block ]
+!17 = metadata !{metadata !"0xb\006\0010\002", metadata !20, metadata !10} ; [ DW_TAG_lexical_block ]
 !18 = metadata !{i32 8, i32 3, metadata !17, null}
 !19 = metadata !{i32 9, i32 3, metadata !10, null}
 !20 = metadata !{metadata !"b.c", metadata !"/private/tmp"}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/SimplifyCFG/hoist-with-range.ll b/test/Transforms/SimplifyCFG/hoist-with-range.ll
new file mode 100644
index 0000000..362aa9a
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/hoist-with-range.ll

@@ -0,0 +1,20 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+define void @foo(i1 %c, i8* %p) {
+; CHECK: if:
+; CHECK-NEXT: load i8* %p, !range !0
+; CHECK: !0 = metadata !{i8 0, i8 1, i8 3, i8 5}
+if:
+  br i1 %c, label %then, label %else
+then:
+  %t = load i8* %p, !range !0
+  br label %out
+else:
+  %e = load i8* %p, !range !1
+  br label %out
+out:
+  ret void
+}
+
+!0 = metadata !{ i8 0, i8 1 }
+!1 = metadata !{ i8 3, i8 5 }

diff --git a/test/Transforms/SimplifyCFG/lifetime.ll b/test/Transforms/SimplifyCFG/lifetime.ll
index b794221..7c66be5 100644
--- a/test/Transforms/SimplifyCFG/lifetime.ll
+++ b/test/Transforms/SimplifyCFG/lifetime.ll

@@ -1,11 +1,11 @@
 ; RUN: opt < %s -simplifycfg -S | FileCheck %s
 
-; Test that a lifetime intrinsic doesn't prevent us from simplifying this.
+; Test that a lifetime intrinsic isn't removed, because removing it would change semantics.
 
 ; CHECK: foo
 ; CHECK: entry:
-; CHECK-NOT: bb0:
-; CHECK-NOT: bb1:
+; CHECK: bb0:
+; CHECK: bb1:
 ; CHECK: ret
 define void @foo(i1 %x) {
 entry:

diff --git a/test/Transforms/SimplifyCFG/speculate-math.ll b/test/Transforms/SimplifyCFG/speculate-math.ll
index fa7976d..0ba93d2 100644
--- a/test/Transforms/SimplifyCFG/speculate-math.ll
+++ b/test/Transforms/SimplifyCFG/speculate-math.ll

@@ -3,6 +3,9 @@
 declare float @llvm.sqrt.f32(float) nounwind readonly
 declare float @llvm.fma.f32(float, float, float) nounwind readonly
 declare float @llvm.fmuladd.f32(float, float, float) nounwind readonly
+declare float @llvm.fabs.f32(float) nounwind readonly
+declare float @llvm.minnum.f32(float, float) nounwind readonly
+declare float @llvm.maxnum.f32(float, float) nounwind readonly
 
 ; CHECK-LABEL: @sqrt_test(
 ; CHECK: select
@@ -21,6 +24,22 @@
   ret void
 }
 
+; CHECK-LABEL: @fabs_test(
+; CHECK: select
+define void @fabs_test(float addrspace(1)* noalias nocapture %out, float %a) nounwind {
+entry:
+  %cmp.i = fcmp olt float %a, 0.000000e+00
+  br i1 %cmp.i, label %test_fabs.exit, label %cond.else.i
+
+cond.else.i:                                      ; preds = %entry
+  %0 = tail call float @llvm.fabs.f32(float %a) nounwind readnone
+  br label %test_fabs.exit
+
+test_fabs.exit:                                   ; preds = %cond.else.i, %entry
+  %cond.i = phi float [ %0, %cond.else.i ], [ 0x7FF8000000000000, %entry ]
+  store float %cond.i, float addrspace(1)* %out, align 4
+  ret void
+}
 
 ; CHECK-LABEL: @fma_test(
 ; CHECK: select
@@ -56,3 +75,36 @@
   ret void
 }
 
+; CHECK-LABEL: @minnum_test(
+; CHECK: select
+define void @minnum_test(float addrspace(1)* noalias nocapture %out, float %a, float %b) nounwind {
+entry:
+  %cmp.i = fcmp olt float %a, 0.000000e+00
+  br i1 %cmp.i, label %test_minnum.exit, label %cond.else.i
+
+cond.else.i:                                      ; preds = %entry
+  %0 = tail call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
+  br label %test_minnum.exit
+
+test_minnum.exit:                                   ; preds = %cond.else.i, %entry
+  %cond.i = phi float [ %0, %cond.else.i ], [ 0x7FF8000000000000, %entry ]
+  store float %cond.i, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @maxnum_test(
+; CHECK: select
+define void @maxnum_test(float addrspace(1)* noalias nocapture %out, float %a, float %b) nounwind {
+entry:
+  %cmp.i = fcmp olt float %a, 0.000000e+00
+  br i1 %cmp.i, label %test_maxnum.exit, label %cond.else.i
+
+cond.else.i:                                      ; preds = %entry
+  %0 = tail call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
+  br label %test_maxnum.exit
+
+test_maxnum.exit:                                   ; preds = %cond.else.i, %entry
+  %cond.i = phi float [ %0, %cond.else.i ], [ 0x7FF8000000000000, %entry ]
+  store float %cond.i, float addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/test/Transforms/SimplifyCFG/switch-to-select-multiple-edge-per-block-phi.ll b/test/Transforms/SimplifyCFG/switch-to-select-multiple-edge-per-block-phi.ll
new file mode 100644
index 0000000..ddf5d1f
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/switch-to-select-multiple-edge-per-block-phi.ll

@@ -0,0 +1,40 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+; a, b;
+; fn1() {
+;   if (b)
+;     if (a == 0 || a == 5)
+;       return a;
+;   return 0;
+; }
+
+; Check that we correctly handle the case where a switch branches to the
+; same block multiple times.
+
+@b = common global i32 0, align 4
+@a = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define i32 @fn1() {
+; CHECK-LABEL: @fn1
+; CHECK: %switch.selectcmp1 = icmp eq i32 %1, 5
+; CHECK: %switch.select2 = select i1 %switch.selectcmp1, i32 5, i32 %switch.select
+entry:
+  %0 = load i32* @b, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %if.end3, label %if.then
+
+if.then:
+  %1 = load i32* @a, align 4
+  switch i32 %1, label %if.end3 [
+    i32 5, label %return
+    i32 0, label %return
+  ]
+
+if.end3:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 0, %if.end3 ], [ %1, %if.then ], [ %1, %if.then ]
+  ret i32 %retval.0
+}

diff --git a/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll b/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
new file mode 100644
index 0000000..69f97e5
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll

@@ -0,0 +1,72 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+; int foo1_with_default(int a) {
+;   switch(a) {
+;     case 10:
+;       return 10;
+;     case 20:
+;       return 2;
+;   }
+;   return 4;
+; }
+
+define i32 @foo1_with_default(i32 %a) {
+; CHECK-LABEL: @foo1_with_default
+; CHECK: %switch.selectcmp = icmp eq i32 %a, 20
+; CHECK-NEXT: %switch.select = select i1 %switch.selectcmp, i32 2, i32 4
+; CHECK-NEXT: %switch.selectcmp1 = icmp eq i32 %a, 10
+; CHECK-NEXT: %switch.select2 = select i1 %switch.selectcmp1, i32 10, i32 %switch.select
+entry:
+  switch i32 %a, label %sw.epilog [
+    i32 10, label %sw.bb
+    i32 20, label %sw.bb1
+  ]
+
+sw.bb:
+  br label %return
+
+sw.bb1:
+  br label %return
+
+sw.epilog:
+  br label %return
+
+return:
+  %retval.0 = phi i32 [ 4, %sw.epilog ], [ 2, %sw.bb1 ], [ 10, %sw.bb ]
+  ret i32 %retval.0
+}
+
+; int foo1_without_default(int a) {
+;   switch(a) {
+;     case 10:
+;       return 10;
+;     case 20:
+;       return 2;
+;   }
+;   __builtin_unreachable();
+; }
+
+define i32 @foo1_without_default(i32 %a) {
+; CHECK-LABEL: @foo1_without_default
+; CHECK: %switch.selectcmp = icmp eq i32 %a, 10
+; CHECK-NEXT: %switch.select = select i1 %switch.selectcmp, i32 10, i32 2
+; CHECK-NOT: %switch.selectcmp1
+entry:
+  switch i32 %a, label %sw.epilog [
+    i32 10, label %sw.bb
+    i32 20, label %sw.bb1
+  ]
+
+sw.bb:
+  br label %return
+
+sw.bb1:
+  br label %return
+
+sw.epilog:
+  unreachable
+
+return:
+  %retval.0 = phi i32 [ 2, %sw.bb1 ], [ 10, %sw.bb ]
+  ret i32 %retval.0
+}

diff --git a/test/Transforms/SimplifyCFG/trap-debugloc.ll b/test/Transforms/SimplifyCFG/trap-debugloc.ll
index 3b449cb..adf4215 100644
--- a/test/Transforms/SimplifyCFG/trap-debugloc.ll
+++ b/test/Transforms/SimplifyCFG/trap-debugloc.ll

@@ -11,14 +11,14 @@
 !llvm.module.flags = !{!10}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !8, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
-!1 = metadata !{i32 589865, metadata !8} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 589841, metadata !8, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-206.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !8, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00\003\000\001\000\006\000\000\000", metadata !8, metadata !1, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
+!1 = metadata !{metadata !"0x29", metadata !8} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-206.1) (based on LLVM 3.0svn)\001\00\000\00\000", metadata !8, metadata !4, metadata !4, metadata !9, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !8, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 4, i32 2, metadata !6, null}
-!6 = metadata !{i32 589835, metadata !8, metadata !0, i32 3, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
+!6 = metadata !{metadata !"0xb\003\0012\000", metadata !8, metadata !0} ; [ DW_TAG_lexical_block ]
 !7 = metadata !{i32 5, i32 1, metadata !6, null}
 !8 = metadata !{metadata !"foo.c", metadata !"/private/tmp"}
 !9 = metadata !{metadata !0}
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll b/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll
index 5353744..6100a6a 100644
--- a/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll
+++ b/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll

@@ -6,11 +6,11 @@
 
 define void @foo() nounwind readnone optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !9, i64 0, metadata !5), !dbg !10
+  tail call void @llvm.dbg.value(metadata !9, i64 0, metadata !5, metadata !{}), !dbg !10
   ret void, !dbg !11
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!13}
@@ -18,17 +18,17 @@
 !llvm.dbg.lv.foo = !{!5}
 !llvm.dbg.gv = !{!8}
 
-!0 = metadata !{i32 524334, metadata !12, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 524329, metadata !12} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 524305, metadata !12, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 524309, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\000", metadata !12, metadata !1, metadata !3, null, void ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !12} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", metadata !12, metadata !4, metadata !4, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !12, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
-!5 = metadata !{i32 524544, metadata !6, metadata !"y", metadata !1, i32 3, metadata !7} ; [ DW_TAG_auto_variable ]
-!6 = metadata !{i32 524299, metadata !12, metadata !0, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!7 = metadata !{i32 524324, metadata !12, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 524340, i32 0, metadata !1, metadata !"x", metadata !"x", metadata !"", metadata !1, i32 1, metadata !7, i1 false, i1 true, i32* @x} ; [ DW_TAG_variable ]
+!5 = metadata !{metadata !"0x100\00y\003\000", metadata !6, metadata !1, metadata !7} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{metadata !"0xb\002\000\000", metadata !12, metadata !0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !12, metadata !1} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !"0x34\00x\00x\00\001\000\001", metadata !1, metadata !1, metadata !7, i32* @x} ; [ DW_TAG_variable ]
 !9 = metadata !{i32 0}
 !10 = metadata !{i32 3, i32 0, metadata !6, null}
 !11 = metadata !{i32 4, i32 0, metadata !6, null}
 !12 = metadata !{metadata !"b.c", metadata !"/tmp"}
-!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/StripSymbols/2010-08-25-crash.ll b/test/Transforms/StripSymbols/2010-08-25-crash.ll
index b55ac3c..c211dc1 100644
--- a/test/Transforms/StripSymbols/2010-08-25-crash.ll
+++ b/test/Transforms/StripSymbols/2010-08-25-crash.ll

@@ -7,18 +7,18 @@
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!14}
 
-!0 = metadata !{i32 524334, metadata !10, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 524329, metadata !10} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 524305, metadata !10, i32 12, metadata !"clang version 2.8 (trunk 112062)", i1 true, metadata !"", i32 0, metadata !11, metadata !11, metadata !12, metadata !13, null, metadata !"", i32 1} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 524309, metadata !10, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!0 = metadata !{metadata !"0x2e\00foo\00foo\00foo\003\000\001\000\006\000\000\000", metadata !10, metadata !1, metadata !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{metadata !"0x29", metadata !10} ; [ DW_TAG_file_type ]
+!2 = metadata !{metadata !"0x11\0012\00clang version 2.8 (trunk 112062)\001\00\000\00\001", metadata !10, metadata !11, metadata !11, metadata !12, metadata !13, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !10, metadata !1, null, metadata !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
-!5 = metadata !{i32 524324, metadata !10, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 524340, i32 0, metadata !1, metadata !"i", metadata !"i", metadata !"i", metadata !1, i32 2, metadata !7, i1 true, i1 true, i32 0, null} ; [ DW_TAG_variable ]
-!7 = metadata !{i32 524326, metadata !10, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !5} ; [ DW_TAG_const_type ]
+!5 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !10, metadata !1} ; [ DW_TAG_base_type ]
+!6 = metadata !{metadata !"0x34\00i\00i\00i\002\001\001", metadata !1, metadata !1, metadata !7, i32 0, null} ; [ DW_TAG_variable ]
+!7 = metadata !{metadata !"0x26\00\000\000\000\000\000", metadata !10, metadata !1, metadata !5} ; [ DW_TAG_const_type ]
 !8 = metadata !{i32 3, i32 13, metadata !9, null}
-!9 = metadata !{i32 524299, metadata !10, metadata !0, i32 3, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!9 = metadata !{metadata !"0xb\003\0011\000", metadata !10, metadata !0} ; [ DW_TAG_lexical_block ]
 !10 = metadata !{metadata !"/tmp/a.c", metadata !"/Volumes/Lalgate/clean/D.CW"}
 !11 = metadata !{i32 0}
 !12 = metadata !{metadata !0}
 !13 = metadata !{metadata !6}
-!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/StripSymbols/strip-dead-debug-info.ll b/test/Transforms/StripSymbols/strip-dead-debug-info.ll
index 8ce7b87..04a3f32 100644
--- a/test/Transforms/StripSymbols/strip-dead-debug-info.ll
+++ b/test/Transforms/StripSymbols/strip-dead-debug-info.ll

@@ -7,7 +7,7 @@
 @xyz = global i32 2
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata) #0
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #0
 
 ; Function Attrs: nounwind readnone ssp
 define i32 @fn() #1 {
@@ -18,7 +18,7 @@
 ; Function Attrs: nounwind readonly ssp
 define i32 @foo(i32 %i) #2 {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !15), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !15, metadata !{}), !dbg !20
   %.0 = load i32* @xyz, align 4
   ret i32 %.0, !dbg !21
 }
@@ -30,29 +30,29 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!25}
 
-!0 = metadata !{i32 524305, metadata !1, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !23, metadata !24, null, metadata !"", i32 1} ; [ DW_TAG_compile_unit ] [/tmp//g.c] [DW_LANG_C89]
+!0 = metadata !{metadata !"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", metadata !1, metadata !2, metadata !2, metadata !23, metadata !24, null} ; [ DW_TAG_compile_unit ] [/tmp//g.c] [DW_LANG_C89]
 !1 = metadata !{metadata !"g.c", metadata !"/tmp/"}
 !2 = metadata !{null}
-!3 = metadata !{i32 524334, metadata !1, null, metadata !"bar", metadata !"bar", metadata !"", i32 5, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [scope 0] [bar]
-!4 = metadata !{i32 524309, metadata !1, metadata !5, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{i32 524329, metadata !1}          ; [ DW_TAG_file_type ] [/tmp//g.c]
-!6 = metadata !{i32 524334, metadata !1, null, metadata !"fn", metadata !"fn", metadata !"fn", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @fn, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [fn]
-!7 = metadata !{i32 524309, metadata !1, metadata !5, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!3 = metadata !{metadata !"0x2e\00bar\00bar\00\005\001\001\000\006\000\001\000", metadata !1, null, metadata !4, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [scope 0] [bar]
+!4 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, metadata !5, null, metadata !2, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !"0x29", metadata !1}          ; [ DW_TAG_file_type ] [/tmp//g.c]
+!6 = metadata !{metadata !"0x2e\00fn\00fn\00fn\006\000\001\000\006\000\001\000", metadata !1, null, metadata !7, null, i32 ()* @fn, null, null, null} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [fn]
+!7 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, metadata !5, null, metadata !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
-!9 = metadata !{i32 524324, metadata !1, metadata !5, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 524334, metadata !1, null, metadata !"foo", metadata !"foo", metadata !"foo", i32 7, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 0] [foo]
-!11 = metadata !{i32 524309, metadata !1, metadata !5, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{metadata !"0x24\00int\000\0032\0032\000\000\005", metadata !1, metadata !5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !"0x2e\00foo\00foo\00foo\007\000\001\000\006\000\001\000", metadata !1, null, metadata !11, null, i32 (i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 0] [foo]
+!11 = metadata !{metadata !"0x15\00\000\000\000\000\000\000", metadata !1, metadata !5, null, metadata !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !9, metadata !9}
-!13 = metadata !{i32 524544, metadata !14, metadata !"bb", metadata !5, i32 5, metadata !9}
-!14 = metadata !{i32 524299, metadata !1, metadata !3, i32 5, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
-!15 = metadata !{i32 524545, metadata !10, metadata !"i", metadata !5, i32 7, metadata !9}
-!16 = metadata !{i32 524340, i32 0, metadata !5, metadata !"abcd", metadata !"abcd", metadata !"", metadata !5, i32 2, metadata !9, i1 true, i1 true, null, null}
-!17 = metadata !{i32 524340, i32 0, metadata !5, metadata !"xyz", metadata !"xyz", metadata !"", metadata !5, i32 3, metadata !9, i1 false, i1 true, i32* @xyz, null}
+!13 = metadata !{metadata !"0x100\00bb\005\000", metadata !14, metadata !5, metadata !9} ; [ DW_TAG_auto_variable ]
+!14 = metadata !{metadata !"0xb\005\000\000", metadata !1, metadata !3} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
+!15 = metadata !{metadata !"0x101\00i\007\000", metadata !10, metadata !5, metadata !9} ; [ DW_TAG_arg_variable ]
+!16 = metadata !{metadata !"0x34\00abcd\00abcd\00\002\001\001", metadata !5, metadata !5, metadata !9, null, null} ; [ DW_TAG_variable ]
+!17 = metadata !{metadata !"0x34\00xyz\00xyz\00\003\000\001", metadata !5, metadata !5, metadata !9, i32* @xyz, null} ; [ DW_TAG_variable ]
 !18 = metadata !{i32 6, i32 0, metadata !19, null}
-!19 = metadata !{i32 524299, metadata !1, metadata !6, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
+!19 = metadata !{metadata !"0xb\006\000\000", metadata !1, metadata !6} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
 !20 = metadata !{i32 7, i32 0, metadata !10, null}
 !21 = metadata !{i32 10, i32 0, metadata !22, null}
-!22 = metadata !{i32 524299, metadata !1, metadata !10, i32 7, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
+!22 = metadata !{metadata !"0xb\007\000\000", metadata !1, metadata !10} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
 !23 = metadata !{metadata !3, metadata !6, metadata !10}
 !24 = metadata !{metadata !16, metadata !17}
-!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 2}

diff --git a/test/Transforms/TailCallElim/EraseBB.ll b/test/Transforms/TailCallElim/EraseBB.ll
new file mode 100644
index 0000000..c8290d7
--- /dev/null
+++ b/test/Transforms/TailCallElim/EraseBB.ll

@@ -0,0 +1,26 @@
+; RUN: opt -tailcallelim -S < %s 2>&1 | FileCheck %s
+
+; CHECK: add nsw i32
+; CHECK-NEXT: br label
+; CHECK: add nsw i32
+; CHECK-NEXT: br label
+; CHECK-NOT: Uses remain when a value is destroyed
+define i32 @test(i32 %n) {
+entry:
+  %cmp = icmp slt i32 %n, 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %v1 = add nsw i32 %n, -2
+  %call1 = tail call i32 @test(i32 %v1)
+  br label %return
+
+if.else:                                          ; preds = %entry
+  %v2 = add nsw i32 %n, 4
+  %call2 = tail call i32 @test(i32 %v2)
+  br label %return
+
+return:                                           ; preds = %if.then, %if.else
+  %retval = phi i32 [ %call1, %if.then ], [ %call2, %if.else ]
+  ret i32 %retval
+}

diff --git a/test/Transforms/TailCallElim/basic.ll b/test/Transforms/TailCallElim/basic.ll
index 341736d..8e9814b 100644
--- a/test/Transforms/TailCallElim/basic.ll
+++ b/test/Transforms/TailCallElim/basic.ll

@@ -174,3 +174,17 @@
 return:
   ret void
 }
+
+declare void @test11_helper1(i8** nocapture, i8*)
+declare void @test11_helper2(i8*)
+define void @test11() {
+; CHECK-LABEL: @test11
+; CHECK-NOT: tail
+  %a = alloca i8*
+  %b = alloca i8
+  call void @test11_helper1(i8** %a, i8* %b)  ; a = &b
+  %c = load i8** %a
+  call void @test11_helper2(i8* %c)
+; CHECK: call void @test11_helper2
+  ret void
+}

diff --git a/test/Transforms/TailCallElim/reorder_load.ll b/test/Transforms/TailCallElim/reorder_load.ll
index 53c65da..2e350d6 100644
--- a/test/Transforms/TailCallElim/reorder_load.ll
+++ b/test/Transforms/TailCallElim/reorder_load.ll

@@ -1,6 +1,8 @@
 ; RUN: opt < %s -tailcallelim -S | FileCheck %s
 ; PR4323
 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
 ; Several cases where tail call elimination should move the load above the call,
 ; then eliminate the tail recursion.
 
@@ -12,6 +14,11 @@
 ; This load can be moved above the call because the function won't write to it
 ; and the call has no side effects.
 define fastcc i32 @raise_load_1(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind readonly {
+; CHECK-LABEL: @raise_load_1(
+; CHECK-NOT: call
+; CHECK: load i32*
+; CHECK-NOT: call
+; CHECK: }
 entry:
 	%tmp2 = icmp sge i32 %start_arg, %a_len_arg		; <i1> [#uses=1]
 	br i1 %tmp2, label %if, label %else
@@ -21,7 +28,6 @@
 
 else:		; preds = %entry
 	%tmp7 = add i32 %start_arg, 1		; <i32> [#uses=1]
-; CHECK-NOT: call
 	%tmp8 = call fastcc i32 @raise_load_1(i32* %a_arg, i32 %a_len_arg, i32 %tmp7)		; <i32> [#uses=1]
 	%tmp9 = load i32* %a_arg		; <i32> [#uses=1]
 	%tmp10 = add i32 %tmp9, %tmp8		; <i32> [#uses=1]
@@ -32,6 +38,11 @@
 ; This load can be moved above the call because the function won't write to it
 ; and the load provably can't trap.
 define fastcc i32 @raise_load_2(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) readonly {
+; CHECK-LABEL: @raise_load_2(
+; CHECK-NOT: call
+; CHECK: load i32*
+; CHECK-NOT: call
+; CHECK: }
 entry:
 	%tmp2 = icmp sge i32 %start_arg, %a_len_arg		; <i1> [#uses=1]
 	br i1 %tmp2, label %if, label %else
@@ -48,7 +59,6 @@
 
 recurse:		; preds = %else
 	%tmp7 = add i32 %start_arg, 1		; <i32> [#uses=1]
-; CHECK-NOT: call
 	%tmp8 = call fastcc i32 @raise_load_2(i32* %a_arg, i32 %a_len_arg, i32 %tmp7)		; <i32> [#uses=1]
 	%tmp9 = load i32* @global		; <i32> [#uses=1]
 	%tmp10 = add i32 %tmp9, %tmp8		; <i32> [#uses=1]
@@ -59,6 +69,11 @@
 ; This load can be safely moved above the call (even though it's from an
 ; extern_weak global) because the call has no side effects.
 define fastcc i32 @raise_load_3(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) nounwind readonly {
+; CHECK-LABEL: @raise_load_3(
+; CHECK-NOT: call
+; CHECK: load i32*
+; CHECK-NOT: call
+; CHECK: }
 entry:
 	%tmp2 = icmp sge i32 %start_arg, %a_len_arg		; <i1> [#uses=1]
 	br i1 %tmp2, label %if, label %else
@@ -68,7 +83,6 @@
 
 else:		; preds = %entry
 	%tmp7 = add i32 %start_arg, 1		; <i32> [#uses=1]
-; CHECK-NOT: call
 	%tmp8 = call fastcc i32 @raise_load_3(i32* %a_arg, i32 %a_len_arg, i32 %tmp7)		; <i32> [#uses=1]
 	%tmp9 = load i32* @extern_weak_global		; <i32> [#uses=1]
 	%tmp10 = add i32 %tmp9, %tmp8		; <i32> [#uses=1]
@@ -80,6 +94,12 @@
 ; unknown pointer (which normally means it might trap) because the first load
 ; proves it doesn't trap.
 define fastcc i32 @raise_load_4(i32* %a_arg, i32 %a_len_arg, i32 %start_arg) readonly {
+; CHECK-LABEL: @raise_load_4(
+; CHECK-NOT: call
+; CHECK: load i32*
+; CHECK-NEXT: load i32*
+; CHECK-NOT: call
+; CHECK: }
 entry:
 	%tmp2 = icmp sge i32 %start_arg, %a_len_arg		; <i1> [#uses=1]
 	br i1 %tmp2, label %if, label %else
@@ -97,7 +117,6 @@
 recurse:		; preds = %else
 	%tmp7 = add i32 %start_arg, 1		; <i32> [#uses=1]
 	%first = load i32* %a_arg		; <i32> [#uses=1]
-; CHECK-NOT: call
 	%tmp8 = call fastcc i32 @raise_load_4(i32* %a_arg, i32 %first, i32 %tmp7)		; <i32> [#uses=1]
 	%second = load i32* %a_arg		; <i32> [#uses=1]
 	%tmp10 = add i32 %second, %tmp8		; <i32> [#uses=1]

diff --git a/test/Transforms/Util/flattencfg.ll b/test/Transforms/Util/flattencfg.ll
new file mode 100644
index 0000000..4fcb77a
--- /dev/null
+++ b/test/Transforms/Util/flattencfg.ll

@@ -0,0 +1,26 @@
+; RUN: opt -flattencfg -S < %s | FileCheck %s
+
+
+; This test checks whether the pass completes without a crash.
+; The code is not transformed in any way
+;
+; CHECK-LABEL: @test_not_crash
+define void @test_not_crash(i32 %in_a) #0 {
+entry:
+  %cmp0 = icmp eq i32 %in_a, -1
+  %cmp1 = icmp ne i32 %in_a, 0
+  %cond0 = and i1 %cmp0, %cmp1
+  br i1 %cond0, label %b0, label %b1
+
+b0:                                ; preds = %entry
+  %cmp2 = icmp eq i32 %in_a, 0
+  %cmp3 = icmp ne i32 %in_a, 1
+  %cond1 = or i1 %cmp2, %cmp3
+  br i1 %cond1, label %exit, label %b1
+
+b1:                                       ; preds = %entry, %b0
+  br label %exit
+
+exit:                               ; preds = %entry, %b0, %b1
+  ret void
+}

diff --git a/test/Transforms/Util/lowerswitch.ll b/test/Transforms/Util/lowerswitch.ll
new file mode 100644
index 0000000..06bd4cc
--- /dev/null
+++ b/test/Transforms/Util/lowerswitch.ll

@@ -0,0 +1,22 @@
+; RUN: opt -lowerswitch -S < %s | FileCheck %s
+
+; Test that we don't crash and have a different basic block for each incoming edge.
+define void @test_lower_switch() {
+; CHECK-LABEL: @test_lower_switch
+; CHECK: %merge = phi i64 [ 1, %BB3 ], [ 0, %NewDefault ], [ 0, %NodeBlock5 ], [ 0, %LeafBlock1 ]
+BB1:
+  switch i32 undef, label %BB2 [
+    i32 3, label %BB2
+    i32 5, label %BB2
+    i32 0, label %BB3
+    i32 2, label %BB3
+    i32 4, label %BB3
+  ]
+
+BB2:
+  %merge = phi i64 [ 1, %BB3 ], [ 0, %BB1 ], [ 0, %BB1 ], [ 0, %BB1 ]
+  ret void
+
+BB3:
+  br label %BB2
+}

diff --git a/test/Verifier/alias.ll b/test/Verifier/alias.ll
index ff02a37..dd04ae0 100644
--- a/test/Verifier/alias.ll
+++ b/test/Verifier/alias.ll

@@ -21,7 +21,7 @@
 
 
 @test3_a = global i32 42
-@test3_b = alias weak i32* @test3_a
+@test3_b = weak alias i32* @test3_a
 @test3_c = alias i32* @test3_b
 ; CHECK: Alias cannot point to a weak alias
 ; CHECK-NEXT: i32* @test3_c

diff --git a/test/Verifier/comdat2.ll b/test/Verifier/comdat2.ll
index 23b6cee..d78030c 100644
--- a/test/Verifier/comdat2.ll
+++ b/test/Verifier/comdat2.ll

@@ -2,4 +2,4 @@
 
 $v = comdat any
 @v = private global i32 0, comdat $v
-; CHECK: comdat global value has local linkage
+; CHECK: comdat global value has private linkage

diff --git a/test/Verifier/invoke.ll b/test/Verifier/invoke.ll
index c2750bb..e80cfcf 100644
--- a/test/Verifier/invoke.ll
+++ b/test/Verifier/invoke.ll

@@ -46,7 +46,7 @@
 
 define i8 @f2() {
 entry:
-; CHECK: Cannot invoke an intrinsinc other than donothing
+; CHECK: Cannot invoke an intrinsinc other than donothing or patchpoint
   invoke void @llvm.trap()
   to label %cont unwind label %lpad
 

diff --git a/test/Verifier/jumptable.ll b/test/Verifier/jumptable.ll
index 5f4cd3f..81984ee 100644
--- a/test/Verifier/jumptable.ll
+++ b/test/Verifier/jumptable.ll

@@ -1,4 +1,4 @@
-; RUN: not llc <%s 2>&1 | FileCheck %s
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
 
 define i32 @f() jumptable {
   ret i32 0
@@ -6,4 +6,3 @@
 
 ; CHECK: Attribute 'jumptable' requires 'unnamed_addr'
 ; CHECK: i32 ()* @f
-; CHECK: LLVM ERROR: Broken function found, compilation aborted!

diff --git a/test/Verifier/musttail-valid.ll b/test/Verifier/musttail-valid.ll
index 815d77a..bdc0c8c 100644
--- a/test/Verifier/musttail-valid.ll
+++ b/test/Verifier/musttail-valid.ll

@@ -14,3 +14,26 @@
   %w = bitcast i8* %v to i32*
   ret i32* %w
 }
+
+declare x86_thiscallcc void @varargs_thiscall(i8*, ...)
+define x86_thiscallcc void @varargs_thiscall_thunk(i8* %this, ...) {
+  musttail call x86_thiscallcc void (i8*, ...)* @varargs_thiscall(i8* %this, ...)
+  ret void
+}
+
+declare x86_fastcallcc void @varargs_fastcall(i8*, ...)
+define x86_fastcallcc void @varargs_fastcall_thunk(i8* %this, ...) {
+  musttail call x86_fastcallcc void (i8*, ...)* @varargs_fastcall(i8* %this, ...)
+  ret void
+}
+
+define x86_thiscallcc void @varargs_thiscall_unreachable(i8* %this, ...) {
+  unreachable
+}
+
+define x86_thiscallcc void @varargs_thiscall_ret_unreachable(i8* %this, ...) {
+  musttail call x86_thiscallcc void (i8*, ...)* @varargs_thiscall(i8* %this, ...)
+  ret void
+bb1:
+  ret void
+}

diff --git a/test/Verifier/range-1.ll b/test/Verifier/range-1.ll
index f15ca3f..0b20ca2 100644
--- a/test/Verifier/range-1.ll
+++ b/test/Verifier/range-1.ll

@@ -48,7 +48,7 @@
   ret i8 %y
 }
 !5 = metadata !{i32 0, i8 0}
-; CHECK: Range types must match load type!
+; CHECK: Range types must match instruction type!
 ; CHECK:  %y = load
 
 define i8 @f7(i8* %x) {
@@ -57,7 +57,7 @@
   ret i8 %y
 }
 !6 = metadata !{i8 0, i32 0}
-; CHECK: Range types must match load type!
+; CHECK: Range types must match instruction type!
 ; CHECK:  %y = load
 
 define i8 @f8(i8* %x) {
@@ -66,7 +66,7 @@
   ret i8 %y
 }
 !7 = metadata !{i32 0, i32 0}
-; CHECK: Range types must match load type!
+; CHECK: Range types must match instruction type!
 ; CHECK:  %y = load
 
 define i8 @f9(i8* %x) {
@@ -140,3 +140,12 @@
 }
 !17 = metadata !{i8 1, i8 3, i8 4, i8 5, i8 6, i8 1}
 ; CHECK: Intervals are contiguous
+
+define i8 @f18() {
+entry:
+  %y = call i8 undef(), !range !18
+  ret i8 %y
+}
+!18 = metadata !{}
+; CHECK: It should have at least one range!
+; CHECK-NEXT: metadata

diff --git a/test/lit.cfg b/test/lit.cfg
index 664d55f..372e091 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg

@@ -95,10 +95,30 @@
     if symbolizer in os.environ:
         config.environment[symbolizer] = os.environ[symbolizer]
 
-# Propagate options for sanitizers.
-for options in ['ASAN_OPTIONS']:
-    if options in os.environ:
-        config.environment[options] = os.environ[options]
+# Set up OCAMLPATH to include newly built OCaml libraries.
+llvm_lib_dir = getattr(config, 'llvm_lib_dir', None)
+if llvm_lib_dir is None:
+    if llvm_obj_root is not None:
+        llvm_lib_dir = os.path.join(llvm_obj_root, 'lib')
+
+if llvm_lib_dir is not None:
+    llvm_ocaml_lib = os.path.join(llvm_lib_dir, 'ocaml')
+    if llvm_ocaml_lib is not None:
+        if 'OCAMLPATH' in os.environ:
+            ocamlpath = os.path.pathsep.join((llvm_ocaml_lib, os.environ['OCAMLPATH']))
+            config.environment['OCAMLPATH'] = ocamlpath
+        else:
+            config.environment['OCAMLPATH'] = llvm_ocaml_lib
+
+        if 'CAML_LD_LIBRARY_PATH' in os.environ:
+            caml_ld_library_path = os.path.pathsep.join((llvm_ocaml_lib,
+                                        os.environ['CAML_LD_LIBRARY_PATH']))
+            config.environment['CAML_LD_LIBRARY_PATH'] = caml_ld_library_path
+        else:
+            config.environment['CAML_LD_LIBRARY_PATH'] = llvm_ocaml_lib
+
+# Set up OCAMLRUNPARAM to enable backtraces in OCaml tests.
+config.environment['OCAMLRUNPARAM'] = 'b'
 
 ###
 
@@ -150,16 +170,15 @@
 
 ###
 
-# Provide a command line for mcjit tests
-lli_mcjit = 'lli -use-mcjit'
+lli = 'lli'
 # The target triple used by default by lli is the process target triple (some
 # triple appropriate for generating code for the current process) but because
 # we don't support COFF in MCJIT well enough for the tests, force ELF format on
 # Windows.  FIXME: the process target triple should be used here, but this is
 # difficult to obtain on Windows.
 if re.search(r'cygwin|mingw32|win32', config.host_triple):
-  lli_mcjit += ' -mtriple='+config.host_triple+'-elf'
-config.substitutions.append( ('%lli_mcjit', lli_mcjit) )
+  lli += ' -mtriple='+config.host_triple+'-elf'
+config.substitutions.append( ('%lli', lli ) )
 
 # Similarly, have a macro to use llc with DWARF even when the host is win32.
 llc_dwarf = 'llc'
@@ -167,39 +186,23 @@
   llc_dwarf += ' -mtriple='+config.target_triple.replace('-win32', '-mingw32')
 config.substitutions.append( ('%llc_dwarf', llc_dwarf) )
 
-# Provide a substition for those tests that need to run the jit to obtain data
-# but simply want use the currently considered most reliable jit for platform
-# FIXME: ppc32 is not ready for mcjit.
-if 'arm' in config.target_triple \
-   or 'aarch64' in config.target_triple \
-   or 'powerpc64' in config.target_triple \
-   or 's390x' in config.target_triple:
-    defaultIsMCJIT = 'true'
-else:
-    defaultIsMCJIT = 'false'
-config.substitutions.append( ('%defaultjit', '-use-mcjit='+defaultIsMCJIT) )
-
-# Process jit implementation option
-jit_impl_cfg = lit_config.params.get('jit_impl', None)
-if jit_impl_cfg == 'mcjit':
-  # When running with mcjit, mangle -mcjit into target triple
-  # and add -use-mcjit flag to lli invocation
-  if 'i386' in config.target_triple or 'i686' in config.target_triple:
-    config.target_triple += jit_impl_cfg + '-ia32'
-  elif 'x86_64' in config.target_triple:
-    config.target_triple += jit_impl_cfg + '-ia64'
-  else:
-    config.target_triple += jit_impl_cfg
-
-  config.substitutions.append( ('%lli', 'lli -use-mcjit') )
-else:
-  config.substitutions.append( ('%lli', 'lli') )
-
 # Add site-specific substitutions.
-config.substitutions.append( ('%ocamlopt', config.ocamlopt_executable) )
+config.substitutions.append( ('%go', config.go_executable) )
 config.substitutions.append( ('%llvmshlibdir', config.llvm_shlib_dir) )
 config.substitutions.append( ('%shlibext', config.llvm_shlib_ext) )
 config.substitutions.append( ('%exeext', config.llvm_exe_ext) )
+config.substitutions.append( ('%python', config.python_executable) )
+
+# OCaml substitutions.
+# Support tests for both native and bytecode builds.
+config.substitutions.append( ('%ocamlc',
+    "%s ocamlc %s" % (config.ocamlfind_executable, config.ocaml_flags)) )
+if config.have_ocamlopt in ('1', 'TRUE'):
+    config.substitutions.append( ('%ocamlopt',
+        "%s ocamlopt -cclib -L%s -cclib -Wl,-rpath,%s %s" %
+            (config.ocamlfind_executable, llvm_lib_dir, llvm_lib_dir, config.ocaml_flags)) )
+else:
+    config.substitutions.append( ('%ocamlopt', "true" ) )
 
 # For each occurrence of an llvm tool name as its own word, replace it
 # with the full path to the build directory holding that tool.  This
@@ -227,6 +230,7 @@
                 r"\bllvm-dis\b",
                 r"\bllvm-dwarfdump\b",
                 r"\bllvm-extract\b",
+                r"\bllvm-go\b",
                 r"\bllvm-link\b",
                 r"\bllvm-lto\b",
                 r"\bllvm-mc\b",
@@ -239,12 +243,14 @@
                 r"\bllvm-rtdyld\b",
                 r"\bllvm-size\b",
                 r"\bllvm-tblgen\b",
+                r"\bllvm-vtabledump\b",
                 r"\bllvm-c-test\b",
                 r"\bmacho-dump\b",
                 NOJUNK + r"\bopt\b",
                 r"\bFileCheck\b",
                 r"\bobj2yaml\b",
                 r"\byaml2obj\b",
+                r"\bverify-uselistorder\b",
                 # Handle these specially as they are strings searched
                 # for during testing.
                 r"\| \bcount\b",
@@ -294,9 +300,14 @@
 if (config.llvm_use_sanitizer == "Memory" or
         config.llvm_use_sanitizer == "MemoryWithOrigins"):
     config.available_features.add("msan")
+if config.llvm_use_sanitizer == "Undefined":
+    config.available_features.add("ubsan")
+else:
+    config.available_features.add("not_ubsan")
 
 # Direct object generation
-if not 'hexagon' in config.target_triple:
+# Suppress x86_64-mingw32 while investigating since r219108.
+if not 'hexagon' in config.target_triple and not re.match(r'^x86_64.*-(mingw32|win32)', config.target_triple):
     config.available_features.add("object-emission")
 
 if config.have_zlib == "1":
@@ -310,12 +321,47 @@
 if config.host_triple == config.target_triple:
     config.available_features.add("native")
 
-# Ask llvm-config about assertion mode.
 import subprocess
+
+def have_ld_plugin_support():
+    """Return True if LLVMgold.so exists in config.llvm_shlib_dir and the
+    system 'ld' is GNU gold with -plugin, elf32ppc and elf_x86_64 support."""
+    if not os.path.exists(os.path.join(config.llvm_shlib_dir, 'LLVMgold.so')):
+        return False
+
+    def ld_output(flag):
+        # Capture stdout of 'ld <flag>' as text; '' if 'ld' cannot be run.
+        try:
+            ld_cmd = subprocess.Popen(['ld', flag], stdout = subprocess.PIPE)
+        except OSError:
+            return ''
+        # Reap the child and decode: Python 3 pipes yield bytes, not str.
+        return ld_cmd.communicate()[0].decode('utf-8', 'replace')
+
+    ld_out = ld_output('--help')
+    if not '-plugin' in ld_out:
+        return False
+    # Both emulations requested via '-m' by the gold tests must be supported.
+    emu_line = [l for l in ld_out.split('\n') if 'supported emulations' in l]
+    if len(emu_line) != 1:
+        return False
+    fields = emu_line[0].split(':')
+    if len(fields) != 3:
+        return False
+    emulations = fields[2].split()
+    if 'elf32ppc' not in emulations or 'elf_x86_64' not in emulations:
+        return False
+    return 'GNU gold' in ld_output('--version')
+
+if have_ld_plugin_support():
+    config.available_features.add('ld_plugin')
+
+# Ask llvm-config about assertion mode.
 try:
     llvm_config_cmd = subprocess.Popen(
         [os.path.join(llvm_tools_dir, 'llvm-config'), '--assertion-mode'],
-        stdout = subprocess.PIPE)
+        stdout = subprocess.PIPE,
+        env=config.environment)
 except OSError:
     print("Could not find llvm-config in " + llvm_tools_dir)
     exit(42)

diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 4c0bb2e..7d2c833 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in

@@ -7,21 +7,30 @@
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+config.llvm_lib_dir = "@LIBDIR@"
 config.llvm_shlib_dir = "@SHLIBDIR@"
 config.llvm_shlib_ext = "@SHLIBEXT@"
 config.llvm_exe_ext = "@EXEEXT@"
 config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
 config.python_executable = "@PYTHON_EXECUTABLE@"
-config.ocamlopt_executable = "@OCAMLOPT@"
+config.ocamlfind_executable = "@OCAMLFIND@"
+config.have_ocamlopt = "@HAVE_OCAMLOPT@"
+config.have_ocaml_ounit = "@HAVE_OCAML_OUNIT@"
+config.ocaml_flags = "@OCAMLFLAGS@"
+config.go_executable = "@GO_EXECUTABLE@"
 config.enable_shared = @ENABLE_SHARED@
 config.enable_assertions = @ENABLE_ASSERTIONS@
 config.targets_to_build = "@TARGETS_TO_BUILD@"
-config.llvm_bindings = "@LLVM_BINDINGS@"
+config.llvm_bindings = "@LLVM_BINDINGS@".split(' ')
 config.host_os = "@HOST_OS@"
 config.host_arch = "@HOST_ARCH@"
+config.host_cc = "@HOST_CC@"
+config.host_cxx = "@HOST_CXX@"
+config.host_ldflags = "@HOST_LDFLAGS@"
 config.llvm_use_intel_jitevents = "@LLVM_USE_INTEL_JITEVENTS@"
 config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
 config.have_zlib = "@HAVE_LIBZ@"
+config.enable_ffi = "@LLVM_ENABLE_FFI@"
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.

diff --git a/test/tools/gold/Inputs/alias-1.ll b/test/tools/gold/Inputs/alias-1.ll
new file mode 100644
index 0000000..96183aa
--- /dev/null
+++ b/test/tools/gold/Inputs/alias-1.ll

@@ -0,0 +1 @@
+@a = global i32 42

diff --git a/test/tools/gold/Inputs/bcsection.s b/test/tools/gold/Inputs/bcsection.s
new file mode 100644
index 0000000..ede1e5c
--- /dev/null
+++ b/test/tools/gold/Inputs/bcsection.s

@@ -0,0 +1,2 @@
+.section .llvmbc
+.incbin "bcsection.bc"

diff --git a/test/tools/gold/Inputs/comdat.ll b/test/tools/gold/Inputs/comdat.ll
new file mode 100644
index 0000000..e9e4704
--- /dev/null
+++ b/test/tools/gold/Inputs/comdat.ll

@@ -0,0 +1,20 @@
+$c2 = comdat any
+
+@v1 = weak_odr global i32 41, comdat $c2
+define weak_odr protected i32 @f1(i8* %this) comdat $c2 {
+bb20:
+  store i8* %this, i8** null
+  br label %bb21
+bb21:
+  ret i32 41
+}
+
+@r21 = global i32* @v1
+@r22 = global i32(i8*)* @f1
+
+@a21 = alias i32* @v1
+@a22 = alias bitcast (i32* @v1 to i16*)
+
+@a23 = alias i32(i8*)* @f1
+@a24 = alias bitcast (i32(i8*)* @f1 to i16*)
+@a25 = alias i16* @a24

diff --git a/test/tools/gold/Inputs/common.ll b/test/tools/gold/Inputs/common.ll
new file mode 100644
index 0000000..46f199e
--- /dev/null
+++ b/test/tools/gold/Inputs/common.ll

@@ -0,0 +1 @@
+@a = common global i16 0, align 4

diff --git a/test/tools/gold/Inputs/invalid.bc b/test/tools/gold/Inputs/invalid.bc
new file mode 100644
index 0000000..2e7ca8d
--- /dev/null
+++ b/test/tools/gold/Inputs/invalid.bc
Binary files differ

diff --git a/test/tools/gold/Inputs/linker-script.export b/test/tools/gold/Inputs/linker-script.export
new file mode 100644
index 0000000..2062a08
--- /dev/null
+++ b/test/tools/gold/Inputs/linker-script.export

@@ -0,0 +1,5 @@
+{
+  global:
+    f;
+  local: *;
+};

diff --git a/test/tools/gold/Inputs/linkonce-weak.ll b/test/tools/gold/Inputs/linkonce-weak.ll
new file mode 100644
index 0000000..f42af8f
--- /dev/null
+++ b/test/tools/gold/Inputs/linkonce-weak.ll

@@ -0,0 +1,3 @@
+define weak_odr void @f() {
+  ret void
+}

diff --git a/test/tools/gold/Inputs/pr19901-1.ll b/test/tools/gold/Inputs/pr19901-1.ll
new file mode 100644
index 0000000..2f71532
--- /dev/null
+++ b/test/tools/gold/Inputs/pr19901-1.ll

@@ -0,0 +1,4 @@
+target triple = "x86_64-unknown-linux-gnu"
+define linkonce_odr hidden void @f() {
+  ret void
+}

diff --git a/test/tools/gold/Inputs/weak.ll b/test/tools/gold/Inputs/weak.ll
new file mode 100644
index 0000000..53b1d16
--- /dev/null
+++ b/test/tools/gold/Inputs/weak.ll

@@ -0,0 +1,2 @@
+@a = weak global i32 41
+@c = global i32* @a

diff --git a/test/tools/gold/alias.ll b/test/tools/gold/alias.ll
new file mode 100644
index 0000000..dbf3af5
--- /dev/null
+++ b/test/tools/gold/alias.ll

@@ -0,0 +1,13 @@
+; RUN: llvm-as %s -o %t.o
+; RUN: llvm-as %p/Inputs/alias-1.ll -o %t2.o
+; RUN: ld -shared -o %t3.o -plugin %llvmshlibdir/LLVMgold.so %t2.o %t.o \
+; RUN:  -plugin-opt=emit-llvm
+; RUN: llvm-dis < %t3.o -o - | FileCheck %s
+
+; CHECK-NOT: alias
+; CHECK: @a = global i32 42
+; CHECK-NEXT: @b = global i32 1
+; CHECK-NOT: alias
+
+@a = weak alias i32* @b
+@b = global i32 1

diff --git a/test/tools/gold/bad-alias.ll b/test/tools/gold/bad-alias.ll
new file mode 100644
index 0000000..e0fc788
--- /dev/null
+++ b/test/tools/gold/bad-alias.ll

@@ -0,0 +1,13 @@
+; RUN: llvm-as %s -o %t.o
+
+; RUN: not ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm \
+; RUN:    -shared %t.o -o %t2.o 2>&1 | FileCheck %s
+
+; CHECK: Unable to determine comdat of alias!
+
+@g1 = global i32 1
+@g2 = global i32 2
+
+@a = alias inttoptr(i32 sub (i32 ptrtoint (i32* @g1 to i32),
+                             i32 ptrtoint (i32* @g2 to i32)) to i32*)

diff --git a/test/tools/gold/bcsection.ll b/test/tools/gold/bcsection.ll
new file mode 100644
index 0000000..8565d9d
--- /dev/null
+++ b/test/tools/gold/bcsection.ll

@@ -0,0 +1,11 @@
+; RUN: llvm-as -o %T/bcsection.bc %s
+
+; RUN: llvm-mc -I=%T -filetype=obj -o %T/bcsection.bco %p/Inputs/bcsection.s
+; RUN: llvm-nm -no-llvm-bc %T/bcsection.bco | count 0
+; RUN: ld -r -o %T/bcsection.o -plugin %llvmshlibdir/LLVMgold.so %T/bcsection.bco
+; RUN: llvm-nm -no-llvm-bc %T/bcsection.o | FileCheck %s
+
+; CHECK: main
+define i32 @main() {
+  ret i32 0
+}

diff --git a/test/tools/gold/comdat.ll b/test/tools/gold/comdat.ll
new file mode 100644
index 0000000..ba3abce
--- /dev/null
+++ b/test/tools/gold/comdat.ll

@@ -0,0 +1,65 @@
+; RUN: llvm-as %s -o %t.o
+; RUN: llvm-as %p/Inputs/comdat.ll -o %t2.o
+; RUN: ld -shared -o %t3.o -plugin %llvmshlibdir/LLVMgold.so %t.o %t2.o \
+; RUN:  -plugin-opt=emit-llvm
+; RUN: llvm-dis %t3.o -o - | FileCheck %s
+
+$c1 = comdat any
+
+@v1 = weak_odr global i32 42, comdat $c1
+define weak_odr i32 @f1(i8*) comdat $c1 {
+bb10:
+  br label %bb11
+bb11:
+  ret i32 42
+}
+
+@r11 = global i32* @v1
+@r12 = global i32 (i8*)* @f1
+
+@a11 = alias i32* @v1
+@a12 = alias bitcast (i32* @v1 to i16*)
+
+@a13 = alias i32 (i8*)* @f1
+@a14 = alias bitcast (i32 (i8*)* @f1 to i16*)
+@a15 = alias i16* @a14
+
+; CHECK: $c1 = comdat any
+; CHECK: $c2 = comdat any
+
+; CHECK: @v1 = weak_odr global i32 42, comdat $c1
+
+; CHECK: @r11 = global i32* @v1{{$}}
+; CHECK: @r12 = global i32 (i8*)* @f1{{$}}
+
+; CHECK: @r21 = global i32* @v1{{$}}
+; CHECK: @r22 = global i32 (i8*)* @f1{{$}}
+
+; CHECK: @v11 = internal global i32 41, comdat $c2
+
+; CHECK: @a11 = alias i32* @v1{{$}}
+; CHECK: @a12 = alias bitcast (i32* @v1 to i16*)
+
+; CHECK: @a13 = alias i32 (i8*)* @f1{{$}}
+; CHECK: @a14 = alias bitcast (i32 (i8*)* @f1 to i16*)
+
+; CHECK: @a21 = alias i32* @v11{{$}}
+; CHECK: @a22 = alias bitcast (i32* @v11 to i16*)
+
+; CHECK: @a23 = alias i32 (i8*)* @f12{{$}}
+; CHECK: @a24 = alias bitcast (i32 (i8*)* @f12 to i16*)
+
+; CHECK:      define weak_odr protected i32 @f1(i8*) comdat $c1 {
+; CHECK-NEXT: bb10:
+; CHECK-NEXT:   br label %bb11{{$}}
+; CHECK:      bb11:
+; CHECK-NEXT:   ret i32 42
+; CHECK-NEXT: }
+
+; CHECK:      define internal i32 @f12(i8* %this) comdat $c2 {
+; CHECK-NEXT: bb20:
+; CHECK-NEXT:   store i8* %this, i8** null
+; CHECK-NEXT:   br label %bb21
+; CHECK:      bb21:
+; CHECK-NEXT:   ret i32 41
+; CHECK-NEXT: }

diff --git a/test/tools/gold/common.ll b/test/tools/gold/common.ll
new file mode 100644
index 0000000..f309231
--- /dev/null
+++ b/test/tools/gold/common.ll

@@ -0,0 +1,29 @@
+; RUN: llvm-as %s -o %t1.o
+; RUN: llvm-as %p/Inputs/common.ll -o %t2.o
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm \
+; RUN:    -shared %t1.o %t2.o -o %t3.o
+; RUN: llvm-dis %t3.o -o - | FileCheck %s
+
+@a = common global i8 0, align 8
+
+; Shared library case, we merge @a as common and keep it for the symbol table.
+; CHECK: @a = common global i16 0, align 8
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm \
+; RUN:    %t1.o %t2.o -o %t3.o
+; RUN: llvm-dis %t3.o -o - | FileCheck --check-prefix=EXEC %s
+
+; All IR case, we internalize @a after merging.
+; EXEC: @a = internal global i16 0, align 8
+
+; RUN: llc %p/Inputs/common.ll -o %t2.o -filetype=obj
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm \
+; RUN:    %t1.o %t2.o -o %t3.o
+; RUN: llvm-dis %t3.o -o - | FileCheck --check-prefix=MIXED %s
+
+; Mixed ELF and IR. We keep ours as common so the linker will finish the merge.
+; MIXED: @a = common global i8 0, align 8

diff --git a/test/tools/gold/emit-llvm.ll b/test/tools/gold/emit-llvm.ll
new file mode 100644
index 0000000..0a6dcfc
--- /dev/null
+++ b/test/tools/gold/emit-llvm.ll

@@ -0,0 +1,67 @@
+; RUN: llvm-as %s -o %t.o
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm \
+; RUN:    --plugin-opt=generate-api-file \
+; RUN:    -shared %t.o -o %t2.o
+; RUN: llvm-dis %t2.o -o - | FileCheck %s
+; RUN: FileCheck --check-prefix=API %s < %T/../apifile.txt
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:     -m elf_x86_64 --plugin-opt=save-temps \
+; RUN:    -shared %t.o -o %t3.o
+; RUN: llvm-dis %t3.o.bc -o - | FileCheck %s
+; RUN: llvm-dis %t3.o.opt.bc -o - | FileCheck --check-prefix=OPT %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: define internal void @f1()
+; OPT-NOT: @f1
+define hidden void @f1() {
+  ret void
+}
+
+; CHECK: define hidden void @f2()
+; OPT: define hidden void @f2()
+define hidden void @f2() {
+  ret void
+}
+
+@llvm.used = appending global [1 x i8*] [ i8* bitcast (void ()* @f2 to i8*)]
+
+; CHECK: define void @f3()
+; OPT: define void @f3()
+define void @f3() {
+  call void @f4()
+  ret void
+}
+
+; CHECK: define internal void @f4()
+; OPT-NOT: @f4
+define linkonce_odr void @f4() {
+  ret void
+}
+
+; CHECK: define linkonce_odr void @f5()
+; OPT: define linkonce_odr void @f5()
+define linkonce_odr void @f5() {
+  ret void
+}
+@g5 = global void()* @f5
+
+; CHECK: define internal void @f6() unnamed_addr
+; OPT: define internal void @f6() unnamed_addr
+define linkonce_odr void @f6() unnamed_addr {
+  ret void
+}
+@g6 = global void()* @f6
+
+
+; API: f1 PREVAILING_DEF_IRONLY
+; API: f2 PREVAILING_DEF_IRONLY
+; API: f3 PREVAILING_DEF_IRONLY_EXP
+; API: f4 PREVAILING_DEF_IRONLY_EXP
+; API: f5 PREVAILING_DEF_IRONLY_EXP
+; API: f6 PREVAILING_DEF_IRONLY_EXP
+; API: g5 PREVAILING_DEF_IRONLY_EXP
+; API: g6 PREVAILING_DEF_IRONLY_EXP

diff --git a/test/tools/gold/invalid.ll b/test/tools/gold/invalid.ll
new file mode 100644
index 0000000..8db7644
--- /dev/null
+++ b/test/tools/gold/invalid.ll

@@ -0,0 +1,7 @@
+; RUN: not ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    %p/Inputs/invalid.bc -o %t2 2>&1 | FileCheck %s
+
+; test that only one error gets printed
+
+; CHECK: error: LLVM gold plugin has failed to create LTO module: Malformed block
+; CHECK-NOT: error

diff --git a/test/tools/gold/linker-script.ll b/test/tools/gold/linker-script.ll
new file mode 100644
index 0000000..35a7694
--- /dev/null
+++ b/test/tools/gold/linker-script.ll

@@ -0,0 +1,17 @@
+; RUN: llvm-as %s -o %t.o
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm \
+; RUN:    -shared %t.o -o %t2.o \
+; RUN:    -version-script=%p/Inputs/linker-script.export
+; RUN: llvm-dis %t2.o -o - | FileCheck %s
+
+; CHECK: define void @f()
+define void @f() {
+  ret void
+}
+
+; CHECK: define internal void @g()
+define void @g() {
+  ret void
+}

diff --git a/test/tools/gold/linkonce-weak.ll b/test/tools/gold/linkonce-weak.ll
new file mode 100644
index 0000000..765275b
--- /dev/null
+++ b/test/tools/gold/linkonce-weak.ll

@@ -0,0 +1,19 @@
+; RUN: llvm-as %s -o %t.o
+; RUN: llvm-as %p/Inputs/linkonce-weak.ll -o %t2.o
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm \
+; RUN:    -shared %t.o %t2.o -o %t3.o
+; RUN: llvm-dis %t3.o -o - | FileCheck %s
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm \
+; RUN:    -shared %t2.o %t.o -o %t3.o
+; RUN: llvm-dis %t3.o -o - | FileCheck %s
+
+define linkonce_odr void @f() {
+  ret void
+}
+
+; Test that we get a weak_odr regardless of the order of the files
+; CHECK: define weak_odr void @f() {

diff --git a/test/tools/gold/lit.local.cfg b/test/tools/gold/lit.local.cfg
new file mode 100644
index 0000000..a59549d
--- /dev/null
+++ b/test/tools/gold/lit.local.cfg

@@ -0,0 +1,4 @@
+# The gold tests need the ld_plugin feature plus the X86 and PowerPC targets.
+if not ('ld_plugin' in config.available_features and
+        'X86' in config.root.targets and 'PowerPC' in config.root.targets):
+    config.unsupported = True

diff --git a/test/tools/gold/mtriple.ll b/test/tools/gold/mtriple.ll
new file mode 100644
index 0000000..6395af6
--- /dev/null
+++ b/test/tools/gold/mtriple.ll

@@ -0,0 +1,13 @@
+; RUN: llvm-as %s -o %t.o
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so -m elf32ppc \
+; RUN:    -plugin-opt=mtriple=powerpc-linux-gnu \
+; RUN:    -plugin-opt=obj-path=%t3.o \
+; RUN:    -shared %t.o -o %t2
+; RUN: llvm-readobj --file-headers %t2 | FileCheck  --check-prefix=DSO %s
+; RUN: llvm-readobj --file-headers %t3.o | FileCheck --check-prefix=REL %s
+
+; REL:       Type: Relocatable
+; REL-NEXT:  Machine: EM_PPC
+
+; DSO:       Type: SharedObject
+; DSO-NEXT:  Machine: EM_PPC

diff --git a/test/tools/gold/option.ll b/test/tools/gold/option.ll
new file mode 100644
index 0000000..8154e43
--- /dev/null
+++ b/test/tools/gold/option.ll

@@ -0,0 +1,39 @@
+; RUN: llvm-as %s -o %t.o
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so -m elf_x86_64 \
+; RUN:    --plugin-opt=-jump-table-type=arity \
+; RUN:    --plugin-opt=-mattr=+aes \
+; RUN:    --plugin-opt=mcpu=core-avx2 \
+; RUN:    -shared %t.o -o %t2.o
+; RUN: llvm-nm %t2.o | FileCheck %s
+
+; CHECK: t __llvm_jump_instr_table_0_1
+; CHECK: t __llvm_jump_instr_table_1_1
+
+target triple = "x86_64-unknown-linux-gnu"
+define i32 @g(i32 %a) unnamed_addr jumptable {
+  ret i32 %a
+}
+
+define i32 (i32)* @get_g() {
+  ret i32 (i32)* @g
+}
+
+define i32 @f() unnamed_addr jumptable {
+  ret i32 0
+}
+
+define i32 ()* @get_f() {
+  ret i32 ()* @f
+}
+
+define <2 x i64> @test_aes(<2 x i64> %a0, <2 x i64> %a1) {
+  %res = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <32 x i8> @test_avx2(<16 x i16> %a0, <16 x i16> %a1) {
+  %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

diff --git a/test/tools/gold/pr19901.ll b/test/tools/gold/pr19901.ll
new file mode 100644
index 0000000..304246b
--- /dev/null
+++ b/test/tools/gold/pr19901.ll

@@ -0,0 +1,23 @@
+; RUN: llc %s -o %t.o -filetype=obj -relocation-model=pic
+; RUN: llvm-as %p/Inputs/pr19901-1.ll -o %t2.o
+; RUN: ld -shared -o %t.so -plugin %llvmshlibdir/LLVMgold.so %t2.o %t.o
+; RUN: llvm-readobj -t %t.so | FileCheck %s
+
+; CHECK:       Symbol {
+; CHECK:         Name: f
+; CHECK-NEXT:    Value:
+; CHECK-NEXT:    Size:
+; CHECK-NEXT:    Binding: Local
+; CHECK-NEXT:    Type: Function
+; CHECK-NEXT:    Other: 2
+; CHECK-NEXT:    Section: .text
+; CHECK-NEXT:  }
+
+target triple = "x86_64-unknown-linux-gnu"
+define i32 @g() {
+  call void @f()
+  ret i32 0
+}
+define linkonce_odr hidden void @f() {
+  ret void
+}

diff --git a/test/tools/gold/slp-vectorize.ll b/test/tools/gold/slp-vectorize.ll
new file mode 100644
index 0000000..d378902
--- /dev/null
+++ b/test/tools/gold/slp-vectorize.ll

@@ -0,0 +1,30 @@
+; RUN: llvm-as %s -o %t.o
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=save-temps \
+; RUN:    -shared %t.o -o %t2.o
+; RUN: llvm-dis %t2.o.opt.bc -o - | FileCheck %s
+
+; Test that the SLP vectorizer is run.
+; CHECK: fadd <4 x float>
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* nocapture %x) {
+  %tmp = load float* %x, align 4
+  %add = fadd float %tmp, 1.000000e+00
+  store float %add, float* %x, align 4
+  %arrayidx1 = getelementptr inbounds float* %x, i64 1
+  %tmp1 = load float* %arrayidx1, align 4
+  %add2 = fadd float %tmp1, 1.000000e+00
+  store float %add2, float* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds float* %x, i64 2
+  %tmp2 = load float* %arrayidx3, align 4
+  %add4 = fadd float %tmp2, 1.000000e+00
+  store float %add4, float* %arrayidx3, align 4
+  %arrayidx5 = getelementptr inbounds float* %x, i64 3
+  %tmp3 = load float* %arrayidx5, align 4
+  %add6 = fadd float %tmp3, 1.000000e+00
+  store float %add6, float* %arrayidx5, align 4
+  ret void
+}

diff --git a/test/tools/gold/vectorize.ll b/test/tools/gold/vectorize.ll
new file mode 100644
index 0000000..3d305db
--- /dev/null
+++ b/test/tools/gold/vectorize.ll

@@ -0,0 +1,30 @@
+; RUN: llvm-as %s -o %t.o
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=save-temps \
+; RUN:    -shared %t.o -o %t2.o
+; RUN: llvm-dis %t2.o.opt.bc -o - | FileCheck %s
+
+; Test that the loop vectorizer is run.
+; CHECK: fadd <4 x float>
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f(float* nocapture %x, i64 %n) {
+bb:
+  br label %bb1
+
+bb1:
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp4, %bb1 ]
+  %tmp = getelementptr inbounds float* %x, i64 %i.0
+  %tmp2 = load float* %tmp, align 4
+  %tmp3 = fadd float %tmp2, 1.000000e+00
+  store float %tmp3, float* %tmp, align 4
+  %tmp4 = add nsw i64 %i.0, 1
+  %tmp5 = icmp slt i64 %tmp4, %n
+  br i1 %tmp5, label %bb1, label %bb6
+
+bb6:
+  ret void
+}

diff --git a/test/tools/gold/weak.ll b/test/tools/gold/weak.ll
new file mode 100644
index 0000000..e05e905
--- /dev/null
+++ b/test/tools/gold/weak.ll

@@ -0,0 +1,16 @@
+; RUN: llvm-as %s -o %t.o
+; RUN: llvm-as %p/Inputs/weak.ll -o %t2.o
+
+; RUN: ld -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --plugin-opt=emit-llvm \
+; RUN:    -shared %t.o %t2.o -o %t3.o
+; RUN: llvm-dis %t3.o -o - | FileCheck %s
+
+@a = weak global i32 42
+@b = global i32* @a
+
+; Test that @b and @c end up pointing to the same variable.
+
+; CHECK: @a = weak global i32 42
+; CHECK: @b = global i32* @a{{$}}
+; CHECK: @c = global i32* @a{{$}}

diff --git a/test/tools/llvm-cov/Inputs/README b/test/tools/llvm-cov/Inputs/README
index 2cfb191..3773ba3 100644
--- a/test/tools/llvm-cov/Inputs/README
+++ b/test/tools/llvm-cov/Inputs/README

@@ -1,7 +1,21 @@
 These inputs were pre-generated to allow for easier testing of llvm-cov.
 
-test.gcno and test.gcda were create by running clang:
-  clang++ -g -ftest-coverage -fprofile-arcs test.cpp
+The files used to test the gcov compatible code coverage tool were generated
+using the following method:
 
-test.cpp.gcov was created by running gcov 4.2.1:
-  gcov test.cpp
+  test.gcno and test.gcda were created by running clang:
+    clang++ -g -ftest-coverage -fprofile-arcs test.cpp
+
+  test.cpp.gcov was created by running gcov 4.2.1:
+    gcov test.cpp
+
+The 'covmapping' files that are used to test llvm-cov contain raw sections
+with the coverage mapping data generated by the compiler and linker. They are
+created by running clang and llvm-cov:
+  clang++ -fprofile-instr-generate -fcoverage-mapping -o test test.cpp
+  llvm-cov convert-for-testing -o test.covmapping test
+
+The 'profdata' files were generated by running an instrumented version of the
+program and merging the raw profile data using llvm-profdata:
+  ./test
+  llvm-profdata merge -o test.profdata default.profraw

diff --git a/test/tools/llvm-cov/Inputs/highlightedRanges.covmapping b/test/tools/llvm-cov/Inputs/highlightedRanges.covmapping
new file mode 100644
index 0000000..20eb0d7
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/highlightedRanges.covmapping
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/highlightedRanges.profdata b/test/tools/llvm-cov/Inputs/highlightedRanges.profdata
new file mode 100644
index 0000000..b465b00
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/highlightedRanges.profdata
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/lineExecutionCounts.covmapping b/test/tools/llvm-cov/Inputs/lineExecutionCounts.covmapping
new file mode 100644
index 0000000..9774b89
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/lineExecutionCounts.covmapping
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/lineExecutionCounts.profdata b/test/tools/llvm-cov/Inputs/lineExecutionCounts.profdata
new file mode 100644
index 0000000..8712227
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/lineExecutionCounts.profdata
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/regionMarkers.covmapping b/test/tools/llvm-cov/Inputs/regionMarkers.covmapping
new file mode 100644
index 0000000..3ebcb07
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/regionMarkers.covmapping
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/regionMarkers.profdata b/test/tools/llvm-cov/Inputs/regionMarkers.profdata
new file mode 100644
index 0000000..8712227
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/regionMarkers.profdata
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/report.covmapping b/test/tools/llvm-cov/Inputs/report.covmapping
new file mode 100644
index 0000000..32d84bc
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/report.covmapping
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/report.profdata b/test/tools/llvm-cov/Inputs/report.profdata
new file mode 100644
index 0000000..aa47be0
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/report.profdata
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/showExpansions.covmapping b/test/tools/llvm-cov/Inputs/showExpansions.covmapping
new file mode 100644
index 0000000..b8c7d97
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/showExpansions.covmapping
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/showExpansions.profdata b/test/tools/llvm-cov/Inputs/showExpansions.profdata
new file mode 100644
index 0000000..7925c60
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/showExpansions.profdata
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/templateInstantiations.covmapping b/test/tools/llvm-cov/Inputs/templateInstantiations.covmapping
new file mode 100644
index 0000000..d243736
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/templateInstantiations.covmapping
Binary files differ

diff --git a/test/tools/llvm-cov/Inputs/templateInstantiations.profdata b/test/tools/llvm-cov/Inputs/templateInstantiations.profdata
new file mode 100644
index 0000000..6ccf526
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/templateInstantiations.profdata
Binary files differ

diff --git a/test/tools/llvm-cov/lit.local.cfg b/test/tools/llvm-cov/lit.local.cfg
index 56c6f1f..650bc02 100644
--- a/test/tools/llvm-cov/lit.local.cfg
+++ b/test/tools/llvm-cov/lit.local.cfg

@@ -1 +1,5 @@
 config.suffixes = ['.test', '.m', '.cpp']
+
+# http://llvm.org/bugs/show_bug.cgi?id=20979
+if 'ubsan' in config.available_features:
+  config.unsupported = True

diff --git a/test/tools/llvm-cov/report.cpp b/test/tools/llvm-cov/report.cpp
new file mode 100644
index 0000000..297322a
--- /dev/null
+++ b/test/tools/llvm-cov/report.cpp

@@ -0,0 +1,24 @@
+// RUN: llvm-cov report %S/Inputs/report.covmapping -instr-profile %S/Inputs/report.profdata -no-colors 2>&1 | FileCheck %s
+
+// CHECK: Filename                    Regions    Miss   Cover Functions  Executed
+// CHECK: TOTAL                             5       2  60.00%         4    75.00%
+
+void foo(bool cond) {
+  if (cond) {
+  }
+}
+
+void bar() {
+}
+
+void func() {
+}
+
+int main() {
+  foo(false);
+  bar();
+  return 0;
+}
+
+// llvm-cov doesn't work on big endian yet
+// XFAIL: powerpc64-, s390x, mips-, mips64-, sparc

diff --git a/test/tools/llvm-cov/showExpansions.cpp b/test/tools/llvm-cov/showExpansions.cpp
new file mode 100644
index 0000000..30edd90
--- /dev/null
+++ b/test/tools/llvm-cov/showExpansions.cpp

@@ -0,0 +1,29 @@
+// RUN: llvm-cov show %S/Inputs/showExpansions.covmapping -instr-profile %S/Inputs/showExpansions.profdata -dump -show-expansions -filename-equivalence %s 2>&1 | FileCheck %s
+
+#define DO_SOMETHING_ELSE() \
+  do {                      \
+  } while (0)
+#define ANOTHER_THING() \
+  do {                  \
+    if (0) {            \
+    }                   \
+  } while (0)
+
+#define DO_SOMETHING(x)    \
+  do {                     \
+    if (x)                 \
+      DO_SOMETHING_ELSE(); \
+    else                   \
+      ANOTHER_THING();     \
+  } while (0)
+// CHECK-DAG: Expansion at line [[@LINE-4]], 7 -> 24
+// CHECK-DAG: Expansion at line [[@LINE-3]], 7 -> 20
+
+int main(int argc, const char *argv[]) {
+  for (int i = 0; i < 100; ++i)
+    DO_SOMETHING(i); // CHECK-DAG: Expansion at line [[@LINE]], 5 -> 17
+  return 0;
+}
+
+// llvm-cov doesn't work on big endian yet
+// XFAIL: powerpc64-, s390x, mips-, mips64-, sparc

diff --git a/test/tools/llvm-cov/showHighlightedRanges.cpp b/test/tools/llvm-cov/showHighlightedRanges.cpp
new file mode 100644
index 0000000..cec7308
--- /dev/null
+++ b/test/tools/llvm-cov/showHighlightedRanges.cpp

@@ -0,0 +1,48 @@
+// RUN: llvm-cov show %S/Inputs/highlightedRanges.covmapping -instr-profile %S/Inputs/highlightedRanges.profdata -dump -filename-equivalence %s 2>&1 | FileCheck %s
+
+void func() {
+  return;
+  int i = 0;                     // CHECK: Highlighted line [[@LINE]], 3 -> 12
+}
+
+void func2(int x) {
+  if(x > 5) {
+    while(x >= 9) {
+      return;
+      --x;                       // CHECK: Highlighted line [[@LINE]], 7 -> 10
+    }
+    int i = 0;                   // CHECK: Highlighted line [[@LINE]], 5 -> 14
+  }
+}
+
+void test() {
+  int x = 0;
+
+  if (x) {                       // CHECK: Highlighted line [[@LINE]], 10 -> ?
+    x = 0;                       // CHECK: Highlighted line [[@LINE]], 1 -> ?
+  } else {                       // CHECK: Highlighted line [[@LINE]], 1 -> 4
+    x = 1;
+  }
+
+                                  // CHECK: Highlighted line [[@LINE+1]], 26 -> 29
+  for (int i = 0; i < 0; ++i) {   // CHECK: Highlighted line [[@LINE]], 31 -> ?
+    x = 1;                        // CHECK: Highlighted line [[@LINE]], 1 -> ?
+  }                               // CHECK: Highlighted line [[@LINE]], 1 -> 4
+
+  x = x < 10 ? x +
+               1
+             : x - 1;             // CHECK: Highlighted line [[@LINE]], 16 -> 21
+  x = x > 10 ? x +                // CHECK: Highlighted line [[@LINE]], 16 -> ?
+               1                  // CHECK: Highlighted line [[@LINE]], 1 -> 17
+             : x - 1;
+}
+
+int main() {
+  test();
+  func();
+  func2(9);
+  return 0;
+}
+
+// llvm-cov doesn't work on big endian yet
+// XFAIL: powerpc64-, s390x, mips-, mips64-, sparc

diff --git a/test/tools/llvm-cov/showLineExecutionCounts.cpp b/test/tools/llvm-cov/showLineExecutionCounts.cpp
new file mode 100644
index 0000000..34baa57
--- /dev/null
+++ b/test/tools/llvm-cov/showLineExecutionCounts.cpp

@@ -0,0 +1,30 @@
+// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %S/Inputs/lineExecutionCounts.profdata -no-colors -filename-equivalence %s | FileCheck -check-prefix=CHECK -check-prefix=WHOLE-FILE %s
+// RUN: llvm-cov show %S/Inputs/lineExecutionCounts.covmapping -instr-profile %S/Inputs/lineExecutionCounts.profdata -no-colors -filename-equivalence -name=main %s | FileCheck -check-prefix=CHECK -check-prefix=FILTER %s
+
+// before any coverage              // WHOLE-FILE:    | [[@LINE]]|// before
+                                    // FILTER-NOT:    | [[@LINE-1]]|// before
+int main() {                             // CHECK:   1| [[@LINE]]|int main(
+  int x = 0;                             // CHECK:   1| [[@LINE]]|  int x
+                                         // CHECK:   1| [[@LINE]]|
+  if (x) {                               // CHECK:   0| [[@LINE]]|  if (x)
+    x = 0;                               // CHECK:   0| [[@LINE]]|    x = 0
+  } else {                               // CHECK:   1| [[@LINE]]|  } else
+    x = 1;                               // CHECK:   1| [[@LINE]]|    x = 1
+  }                                      // CHECK:   1| [[@LINE]]|  }
+                                         // CHECK:   1| [[@LINE]]|
+  for (int i = 0; i < 100; ++i) {        // CHECK: 100| [[@LINE]]|  for (
+    x = 1;                               // CHECK: 100| [[@LINE]]|    x = 1
+  }                                      // CHECK: 100| [[@LINE]]|  }
+                                         // CHECK:   1| [[@LINE]]|
+  x = x < 10 ? x + 1 : x - 1;            // CHECK:   0| [[@LINE]]|  x =
+  x = x > 10 ?                           // CHECK:   1| [[@LINE]]|  x =
+        x - 1:                           // CHECK:   0| [[@LINE]]|        x
+        x + 1;                           // CHECK:   1| [[@LINE]]|        x
+                                         // CHECK:   1| [[@LINE]]|
+  return 0;                              // CHECK:   1| [[@LINE]]|  return
+}                                        // CHECK:   1| [[@LINE]]|}
+// after coverage                   // WHOLE-FILE:    | [[@LINE]]|// after
+                                    // FILTER-NOT:    | [[@LINE-1]]|// after
+
+// llvm-cov doesn't work on big endian yet
+// XFAIL: powerpc64-, s390x, mips-, mips64-, sparc

diff --git a/test/tools/llvm-cov/showRegionMarkers.cpp b/test/tools/llvm-cov/showRegionMarkers.cpp
new file mode 100644
index 0000000..136c3bf
--- /dev/null
+++ b/test/tools/llvm-cov/showRegionMarkers.cpp

@@ -0,0 +1,26 @@
+// RUN: llvm-cov show %S/Inputs/regionMarkers.covmapping -instr-profile %S/Inputs/regionMarkers.profdata -show-regions -dump -filename-equivalence %s 2>&1 | FileCheck %s
+
+int main() {                      // CHECK: Marker at [[@LINE]]:12 = 1
+  int x = 0;
+
+  if (x) {                        // CHECK: Marker at [[@LINE]]:10 = 0
+    x = 0;
+  } else {                        // CHECK: Marker at [[@LINE]]:10 = 1
+    x = 1;
+  }
+                                  // CHECK: Marker at [[@LINE+2]]:19 = 101
+                                  // CHECK: Marker at [[@LINE+1]]:28 = 100
+  for (int i = 0; i < 100; ++i) { // CHECK: Marker at [[@LINE]]:33 = 100
+    x = 1;
+  }
+                                  // CHECK: Marker at [[@LINE+1]]:16 = 1
+  x = x < 10 ? x + 1 : x - 1;     // CHECK: Marker at [[@LINE]]:24 = 0
+  x = x > 10 ?
+        x - 1:                    // CHECK: Marker at [[@LINE]]:9 = 0
+        x + 1;                    // CHECK: Marker at [[@LINE]]:9 = 1
+
+  return 0;
+}
+
+// llvm-cov doesn't work on big endian yet
+// XFAIL: powerpc64-, s390x, mips-, mips64-, sparc

diff --git a/test/tools/llvm-cov/showTemplateInstantiations.cpp b/test/tools/llvm-cov/showTemplateInstantiations.cpp
new file mode 100644
index 0000000..2b72d83
--- /dev/null
+++ b/test/tools/llvm-cov/showTemplateInstantiations.cpp

@@ -0,0 +1,43 @@
+// RUN: llvm-cov show %S/Inputs/templateInstantiations.covmapping -instr-profile %S/Inputs/templateInstantiations.profdata -no-colors -filename-equivalence %s | FileCheck -check-prefix=CHECK -check-prefix=ALL %s
+// RUN: llvm-cov show %S/Inputs/templateInstantiations.covmapping -instr-profile %S/Inputs/templateInstantiations.profdata -no-colors -filename-equivalence -name=_Z4funcIbEiT_ %s | FileCheck -check-prefix=CHECK -check-prefix=FILTER %s
+
+// before coverage   // ALL:          | [[@LINE]]|// before
+                     // FILTER-NOT:   | [[@LINE-1]]|// before
+template<typename T> // ALL:          | [[@LINE]]|template<typename T>
+int func(T x) {      // ALL-NEXT:    2| [[@LINE]]|int func(T x) {
+  if(x)              // ALL-NEXT:    2| [[@LINE]]|  if(x)
+    return 0;        // ALL-NEXT:    1| [[@LINE]]|    return 0;
+  else               // ALL-NEXT:    1| [[@LINE]]|  else
+    return 1;        // ALL-NEXT:    1| [[@LINE]]|    return 1;
+  int j = 1;         // ALL-NEXT:    0| [[@LINE]]|  int j = 1;
+}                    // ALL-NEXT:    1| [[@LINE]]|}
+
+                     // CHECK:       {{^ *(\| )?}}_Z4funcIbEiT_:
+                     // CHECK-NEXT:  1| [[@LINE-9]]|int func(T x) {
+                     // CHECK-NEXT:  1| [[@LINE-9]]|  if(x)
+                     // CHECK-NEXT:  1| [[@LINE-9]]|    return 0;
+                     // CHECK-NEXT:  1| [[@LINE-9]]|  else
+                     // CHECK-NEXT:  0| [[@LINE-9]]|    return 1;
+                     // CHECK-NEXT:  0| [[@LINE-9]]|  int j = 1;
+                     // CHECK-NEXT:  1| [[@LINE-9]]|}
+
+                     // ALL:         {{^ *}}| _Z4funcIiEiT_:
+                     // FILTER-NOT:  {{^ *(\| )?}} _Z4funcIiEiT_:
+                     // ALL-NEXT:    1| [[@LINE-19]]|int func(T x) {
+                     // ALL-NEXT:    1| [[@LINE-19]]|  if(x)
+                     // ALL-NEXT:    0| [[@LINE-19]]|    return 0;
+                     // ALL-NEXT:    1| [[@LINE-19]]|  else
+                     // ALL-NEXT:    1| [[@LINE-19]]|    return 1;
+                     // ALL-NEXT:    0| [[@LINE-19]]|  int j = 1;
+                     // ALL-NEXT:    1| [[@LINE-19]]|}
+
+int main() {         // ALL:         1| [[@LINE]]|int main() {
+  func<int>(0);      // ALL-NEXT:    1| [[@LINE]]|  func<int>(0);
+  func<bool>(true);  // ALL-NEXT:    1| [[@LINE]]|  func<bool>(true);
+  return 0;          // ALL-NEXT:    1| [[@LINE]]|  return 0;
+}                    // ALL-NEXT:    1| [[@LINE]]|}
+// after coverage    // ALL-NEXT:     | [[@LINE]]|// after
+                     // FILTER-NOT:   | [[@LINE-1]]|// after
+
+// llvm-cov doesn't work on big endian yet
+// XFAIL: powerpc64-, s390x, mips-, mips64-, sparc

diff --git a/test/tools/llvm-mc/line_end_with_space.test b/test/tools/llvm-mc/line_end_with_space.test
new file mode 100644
index 0000000..2ce3139
--- /dev/null
+++ b/test/tools/llvm-mc/line_end_with_space.test

@@ -0,0 +1,2 @@
+RUN: llvm-mc -disassemble %s
+ 
\ No newline at end of file

diff --git a/test/tools/llvm-objdump/AArch64/Inputs/ObjC.exe.macho-aarch64 b/test/tools/llvm-objdump/AArch64/Inputs/ObjC.exe.macho-aarch64
new file mode 100755
index 0000000..d28cbcb
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/Inputs/ObjC.exe.macho-aarch64
Binary files differ

diff --git a/test/tools/llvm-objdump/AArch64/Inputs/ObjC.obj.macho-aarch64 b/test/tools/llvm-objdump/AArch64/Inputs/ObjC.obj.macho-aarch64
new file mode 100644
index 0000000..8366076
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/Inputs/ObjC.obj.macho-aarch64
Binary files differ

diff --git a/test/tools/llvm-objdump/AArch64/Inputs/hello.exe.macho-aarch64 b/test/tools/llvm-objdump/AArch64/Inputs/hello.exe.macho-aarch64
new file mode 100755
index 0000000..c30d358
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/Inputs/hello.exe.macho-aarch64
Binary files differ

diff --git a/test/tools/llvm-objdump/AArch64/Inputs/hello.obj.macho-aarch64 b/test/tools/llvm-objdump/AArch64/Inputs/hello.obj.macho-aarch64
new file mode 100644
index 0000000..704dbab
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/Inputs/hello.obj.macho-aarch64
Binary files differ

diff --git a/test/tools/llvm-objdump/AArch64/lit.local.cfg b/test/tools/llvm-objdump/AArch64/lit.local.cfg
new file mode 100644
index 0000000..7184443
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/lit.local.cfg

@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True

diff --git a/test/tools/llvm-objdump/AArch64/macho-symbolized-disassembly.test b/test/tools/llvm-objdump/AArch64/macho-symbolized-disassembly.test
new file mode 100644
index 0000000..311ff51
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/macho-symbolized-disassembly.test

@@ -0,0 +1,23 @@
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.obj.macho-aarch64 | FileCheck %s -check-prefix=OBJ
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.exe.macho-aarch64 | FileCheck %s -check-prefix=EXE
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/ObjC.obj.macho-aarch64 | FileCheck %s -check-prefix=ObjC-OBJ
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/ObjC.exe.macho-aarch64 | FileCheck %s -check-prefix=ObjC-EXE
+
+OBJ: 000000000000001c	adrp	x0, L_.str@PAGE
+OBJ: 0000000000000020	add	x0, x0, L_.str@PAGEOFF
+OBJ: 0000000000000024	bl	_printf
+
+EXE: 0000000100007f58	add	x0, x0, #4008 ; literal pool for: "Hello world
+"
+EXE: 0000000100007f5c	bl	0x100007f78 ; symbol stub for: _printf
+
+ObjC-OBJ: 000000000000000c	adrp	x8, L_OBJC_SELECTOR_REFERENCES_3@PAGE
+ObjC-OBJ: 0000000000000010	add	x8, x8, L_OBJC_SELECTOR_REFERENCES_3@PAGEOFF
+ObjC-OBJ: 0000000000000044	bl	_objc_msgSend
+
+ObjC-EXE: 0000000100007ed0	add	x8, x8, #80 ; Objc selector ref: date
+ObjC-EXE: 0000000100007ed8	add	x9, x9, #96 ; Objc class ref: _OBJC_CLASS_$_NSDate
+ObjC-EXE: 0000000100007f04	bl	0x100007f50 ; Objc message: +[NSObject new]
+ObjC-EXE: 0000000100007f1c	bl	0x100007f50 ; Objc message: -[x0 new]
+ObjC-EXE: 0000000100007f2c	add	x0, x0, #32 ; Objc cfstring ref: @"The current date and time is: %@"
+ObjC-EXE: 0000000100007f30	bl	0x100007f44 ; symbol stub for: _NSLog

diff --git a/test/tools/llvm-objdump/ARM/Inputs/hello.exe.macho-arm b/test/tools/llvm-objdump/ARM/Inputs/hello.exe.macho-arm
new file mode 100755
index 0000000..40d657b
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/Inputs/hello.exe.macho-arm
Binary files differ

diff --git a/test/tools/llvm-objdump/ARM/Inputs/hello.obj.macho-arm b/test/tools/llvm-objdump/ARM/Inputs/hello.obj.macho-arm
new file mode 100644
index 0000000..fb8706b
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/Inputs/hello.obj.macho-arm
Binary files differ

diff --git a/test/tools/llvm-objdump/ARM/lit.local.cfg b/test/tools/llvm-objdump/ARM/lit.local.cfg
new file mode 100644
index 0000000..236e1d3
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/lit.local.cfg

@@ -0,0 +1,2 @@
+if not 'ARM' in config.root.targets:
+    config.unsupported = True

diff --git a/test/tools/llvm-objdump/ARM/macho-arm-and-thumb.test b/test/tools/llvm-objdump/ARM/macho-arm-and-thumb.test
new file mode 100644
index 0000000..720b78f
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/macho-arm-and-thumb.test

@@ -0,0 +1,15 @@
+@ RUN: llvm-mc < %s -triple armv7-apple-darwin -filetype=obj | llvm-objdump -m -d - | FileCheck %s
+
+.thumb
+.thumb_func _t
+_t:
+nop
+nop
+.align 2
+.arm
+_a:
+nop
+
+@ CHECK: 00 bf nop
+@ CHECK-NEXT: 00 bf nop
+@ CHECK: 00 f0 20 e3 nop

diff --git a/test/tools/llvm-objdump/ARM/macho-mattr-arm.test b/test/tools/llvm-objdump/ARM/macho-mattr-arm.test
new file mode 100644
index 0000000..1b17146
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/macho-mattr-arm.test

@@ -0,0 +1,5 @@
+@ RUN: llvm-mc < %s -triple armv8-apple-darwin10 -mattr=+fp-armv8 -filetype=obj -o - | llvm-objdump -d -m -mattr=+fp-armv8 - | FileCheck %s
+
+vcvtt.f64.f16 d3, s1
+
+@ CHECK: e0 3b b2 ee vcvtt.f64.f16 d3, s1

diff --git a/test/tools/llvm-objdump/ARM/macho-mcpu-arm.test b/test/tools/llvm-objdump/ARM/macho-mcpu-arm.test
new file mode 100644
index 0000000..7a3432d
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/macho-mcpu-arm.test

@@ -0,0 +1,10 @@
+@ RUN: llvm-mc < %s -triple thumbv7-apple-darwin -mcpu=cortex-a7 -filetype=obj | llvm-objdump -triple thumbv7-apple-darwin10 -m -d -mcpu=cortex-a7 - | FileCheck %s
+
+.thumb
+.thumb_func _t
+_t:
+sdiv r1, r2, r3
+udiv r1, r2, r3
+
+@ CHECK: 92 fb f3 f1 sdiv r1, r2, r3
+@ CHECK: b2 fb f3 f1 udiv r1, r2, r3

diff --git a/test/tools/llvm-objdump/ARM/macho-symbolized-disassembly.test b/test/tools/llvm-objdump/ARM/macho-symbolized-disassembly.test
new file mode 100644
index 0000000..eeeab52
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/macho-symbolized-disassembly.test

@@ -0,0 +1,8 @@
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.obj.macho-arm | FileCheck %s -check-prefix=OBJ
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.exe.macho-arm | FileCheck %s -check-prefix=EXE
+
+OBJ: 00000006	movw	r3, :lower16:((54-14)-4)
+OBJ: 0000000a	movt	r3, :upper16:((54-14)-4)
+OBJ: 00000024	bl	_printf
+
+EXE: 0000bfa8	blx	0xbffc @ symbol stub for: _printf

diff --git a/test/tools/llvm-objdump/ARM/macho-symbolized-subtractor.test b/test/tools/llvm-objdump/ARM/macho-symbolized-subtractor.test
new file mode 100644
index 0000000..65df2a9
--- /dev/null
+++ b/test/tools/llvm-objdump/ARM/macho-symbolized-subtractor.test

@@ -0,0 +1,15 @@
+# RUN: llvm-mc < %s -triple armv7-apple-darwin -filetype=obj | llvm-objdump -m -d - | FileCheck %s
+	.thumb
+	.thumb_func	_t
+_t:
+	movw	r3, :lower16:(Str-(PCinst+4))
+	movt	r3, :upper16:(Str-(PCinst+4))
+	.thumb_func	PCinst
+PCinst:
+	add	r3, pc
+
+	.section	__TEXT,__cstring,cstring_literals
+Str: 
+	.asciz	"Hello world\n"
+# CHECK: movw	r3, :lower16:((Str-PCinst)-4)
+# CHECK: movt	r3, :upper16:((Str-PCinst)-4)

diff --git a/test/tools/llvm-objdump/Inputs/bad-ordinal.macho-x86_64 b/test/tools/llvm-objdump/Inputs/bad-ordinal.macho-x86_64
new file mode 100755
index 0000000..3ab6227
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/bad-ordinal.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/bind.macho-x86_64 b/test/tools/llvm-objdump/Inputs/bind.macho-x86_64
new file mode 100755
index 0000000..51a58a7
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/bind.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/bind2.macho-x86_64 b/test/tools/llvm-objdump/Inputs/bind2.macho-x86_64
new file mode 100755
index 0000000..f756fbb
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/bind2.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/compact-unwind.macho-i386 b/test/tools/llvm-objdump/Inputs/compact-unwind.macho-i386
new file mode 100644
index 0000000..174d383
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/compact-unwind.macho-i386
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/compact-unwind.macho-x86_64 b/test/tools/llvm-objdump/Inputs/compact-unwind.macho-x86_64
new file mode 100644
index 0000000..fde1bb5
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/compact-unwind.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/exports-trie.macho-x86_64 b/test/tools/llvm-objdump/Inputs/exports-trie.macho-x86_64
new file mode 100755
index 0000000..5d75060
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/exports-trie.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/large-bss.obj.coff-i386 b/test/tools/llvm-objdump/Inputs/large-bss.obj.coff-i386
new file mode 100644
index 0000000..79311d3
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/large-bss.obj.coff-i386
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/lazy-bind.macho-x86_64 b/test/tools/llvm-objdump/Inputs/lazy-bind.macho-x86_64
new file mode 100755
index 0000000..02a4d12
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/lazy-bind.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/rebase.macho-x86_64 b/test/tools/llvm-objdump/Inputs/rebase.macho-x86_64
new file mode 100755
index 0000000..05062d8
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/rebase.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/unwind-info-no-relocs.macho-x86_64 b/test/tools/llvm-objdump/Inputs/unwind-info-no-relocs.macho-x86_64
new file mode 100755
index 0000000..a1fd687
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/unwind-info-no-relocs.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/unwind-info.macho-arm64 b/test/tools/llvm-objdump/Inputs/unwind-info.macho-arm64
new file mode 100755
index 0000000..5b9ce9c
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/unwind-info.macho-arm64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/unwind-info.macho-x86_64 b/test/tools/llvm-objdump/Inputs/unwind-info.macho-x86_64
new file mode 100755
index 0000000..9e6ad6b
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/unwind-info.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/weak-bind.macho-x86_64 b/test/tools/llvm-objdump/Inputs/weak-bind.macho-x86_64
new file mode 100755
index 0000000..6534116
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/weak-bind.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/X86/Inputs/ObjC.exe.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/ObjC.exe.macho-x86_64
new file mode 100755
index 0000000..4de8a1f
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/ObjC.exe.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/X86/Inputs/ObjC.obj.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/ObjC.obj.macho-x86_64
new file mode 100644
index 0000000..66edb3c
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/ObjC.obj.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/X86/Inputs/dylibLoadKinds.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/dylibLoadKinds.macho-x86_64
new file mode 100755
index 0000000..87d1f8c
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/dylibLoadKinds.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/X86/Inputs/hello.exe.macho-i386 b/test/tools/llvm-objdump/X86/Inputs/hello.exe.macho-i386
new file mode 100755
index 0000000..b1f7bd8
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/hello.exe.macho-i386
Binary files differ

diff --git a/test/tools/llvm-objdump/X86/Inputs/hello.exe.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/hello.exe.macho-x86_64
new file mode 100755
index 0000000..d004bed
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/hello.exe.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/X86/Inputs/hello.obj.macho-i386 b/test/tools/llvm-objdump/X86/Inputs/hello.obj.macho-i386
new file mode 100644
index 0000000..b69d4be
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/hello.obj.macho-i386
Binary files differ

diff --git a/test/tools/llvm-objdump/X86/Inputs/hello.obj.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/hello.obj.macho-x86_64
new file mode 100644
index 0000000..2b59a1c
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/hello.obj.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/X86/Inputs/hello_cpp.exe.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/hello_cpp.exe.macho-x86_64
new file mode 100755
index 0000000..6b54b15
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/Inputs/hello_cpp.exe.macho-x86_64
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/out-of-section-sym.elf-i386 b/test/tools/llvm-objdump/X86/Inputs/out-of-section-sym.elf-i386
similarity index 100%
rename from test/tools/llvm-objdump/Inputs/out-of-section-sym.elf-i386
rename to test/tools/llvm-objdump/X86/Inputs/out-of-section-sym.elf-i386
Binary files differ

diff --git a/test/tools/llvm-objdump/Inputs/trivial.obj.elf-i386 b/test/tools/llvm-objdump/X86/Inputs/trivial.obj.elf-i386
similarity index 100%
rename from test/tools/llvm-objdump/Inputs/trivial.obj.elf-i386
rename to test/tools/llvm-objdump/X86/Inputs/trivial.obj.elf-i386
Binary files differ

diff --git a/test/tools/llvm-objdump/disassembly-show-raw.test b/test/tools/llvm-objdump/X86/disassembly-show-raw.test
similarity index 100%
rename from test/tools/llvm-objdump/disassembly-show-raw.test
rename to test/tools/llvm-objdump/X86/disassembly-show-raw.test


diff --git a/test/tools/llvm-objdump/lit.local.cfg b/test/tools/llvm-objdump/X86/lit.local.cfg
similarity index 100%
rename from test/tools/llvm-objdump/lit.local.cfg
rename to test/tools/llvm-objdump/X86/lit.local.cfg


diff --git a/test/tools/llvm-objdump/X86/macho-private-headers.test b/test/tools/llvm-objdump/X86/macho-private-headers.test
new file mode 100644
index 0000000..685b4f7
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-private-headers.test

@@ -0,0 +1,368 @@
+// RUN: llvm-objdump -p %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s
+// RUN: llvm-objdump -p %p/Inputs/hello.exe.macho-x86_64 \
+// RUN:     | FileCheck %s -check-prefix=EXE
+// RUN: llvm-objdump -p %p/Inputs/dylibLoadKinds.macho-x86_64 \
+// RUN:     | FileCheck %s -check-prefix=LOAD
+
+CHECK: Mach header
+CHECK:       magic cputype cpusubtype  caps    filetype ncmds sizeofcmds      flags
+CHECK: MH_MAGIC_64  X86_64        ALL  0x00      OBJECT     3        496 SUBSECTIONS_VIA_SYMBOLS
+CHECK: Load command 0
+CHECK:       cmd LC_SEGMENT_64
+CHECK:   cmdsize 392
+CHECK:   segname 
+CHECK:    vmaddr 0x0000000000000000
+CHECK:    vmsize 0x00000000000000a8
+CHECK:   fileoff 528
+CHECK:  filesize 168
+CHECK:   maxprot rwx
+CHECK:  initprot rwx
+CHECK:    nsects 4
+CHECK:     flags (none)
+CHECK: Section
+CHECK:   sectname __text
+CHECK:    segname __TEXT
+CHECK:       addr 0x0000000000000000
+CHECK:       size 0x000000000000003b
+CHECK:     offset 528
+CHECK:      align 2^4 (16)
+CHECK:     reloff 696
+CHECK:     nreloc 2
+CHECK:       type S_REGULAR
+CHECK: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Section
+CHECK:   sectname __cstring
+CHECK:    segname __TEXT
+CHECK:       addr 0x000000000000003b
+CHECK:       size 0x000000000000000d
+CHECK:     offset 587
+CHECK:      align 2^0 (1)
+CHECK:     reloff 0
+CHECK:     nreloc 0
+CHECK:       type S_CSTRING_LITERALS
+CHECK: attributes (none)
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Section
+CHECK:   sectname __compact_unwind
+CHECK:    segname __LD
+CHECK:       addr 0x0000000000000048
+CHECK:       size 0x0000000000000020
+CHECK:     offset 600
+CHECK:      align 2^3 (8)
+CHECK:     reloff 712
+CHECK:     nreloc 1
+CHECK:       type S_REGULAR
+CHECK: attributes DEBUG
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Section
+CHECK:   sectname __eh_frame
+CHECK:    segname __TEXT
+CHECK:       addr 0x0000000000000068
+CHECK:       size 0x0000000000000040
+CHECK:     offset 632
+CHECK:      align 2^3 (8)
+CHECK:     reloff 0
+CHECK:     nreloc 0
+CHECK:       type S_COALESCED
+CHECK: attributes NO_TOC STRIP_STATIC_SYMS LIVE_SUPPORT
+CHECK:  reserved1 0
+CHECK:  reserved2 0
+CHECK: Load command 1
+CHECK:      cmd LC_SYMTAB
+CHECK:  cmdsize 24
+CHECK:   symoff 720
+CHECK:    nsyms 5
+CHECK:   stroff 800
+CHECK:  strsize 44
+CHECK: Load command 2
+CHECK:             cmd LC_DYSYMTAB
+CHECK:         cmdsize 80
+CHECK:       ilocalsym 0
+CHECK:       nlocalsym 2
+CHECK:      iextdefsym 2
+CHECK:      nextdefsym 2
+CHECK:       iundefsym 4
+CHECK:       nundefsym 1
+CHECK:          tocoff 0
+CHECK:            ntoc 0
+CHECK:       modtaboff 0
+CHECK:         nmodtab 0
+CHECK:    extrefsymoff 0
+CHECK:     nextrefsyms 0
+CHECK:  indirectsymoff 0
+CHECK:   nindirectsyms 0
+CHECK:       extreloff 0
+CHECK:         nextrel 0
+CHECK:       locreloff 0
+CHECK:         nlocrel 0
+
+EXE: Mach header
+EXE:       magic cputype cpusubtype  caps    filetype ncmds sizeofcmds      flags
+EXE: MH_MAGIC_64  X86_64        ALL LIB64     EXECUTE    16       1296   NOUNDEFS DYLDLINK TWOLEVEL PIE
+EXE: Load command 0
+EXE:       cmd LC_SEGMENT_64
+EXE:   cmdsize 72
+EXE:   segname __PAGEZERO
+EXE:    vmaddr 0x0000000000000000
+EXE:    vmsize 0x0000000100000000
+EXE:   fileoff 0
+EXE:  filesize 0
+EXE:   maxprot ---
+EXE:  initprot ---
+EXE:    nsects 0
+EXE:     flags (none)
+EXE: Load command 1
+EXE:       cmd LC_SEGMENT_64
+EXE:   cmdsize 552
+EXE:   segname __TEXT
+EXE:    vmaddr 0x0000000100000000
+EXE:    vmsize 0x0000000000001000
+EXE:   fileoff 0
+EXE:  filesize 4096
+EXE:   maxprot rwx
+EXE:  initprot r-x
+EXE:    nsects 6
+EXE:     flags (none)
+EXE: Section
+EXE:   sectname __text
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100000f30
+EXE:       size 0x000000000000003b
+EXE:     offset 3888
+EXE:      align 2^4 (16)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_REGULAR
+EXE: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __stubs
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100000f6c
+EXE:       size 0x0000000000000006
+EXE:     offset 3948
+EXE:      align 2^1 (2)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_SYMBOL_STUBS
+EXE: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+EXE:  reserved1 0 (index into indirect symbol table)
+EXE:  reserved2 6 (size of stubs)
+EXE: Section
+EXE:   sectname __stub_helper
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100000f74
+EXE:       size 0x000000000000001a
+EXE:     offset 3956
+EXE:      align 2^2 (4)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_REGULAR
+EXE: attributes PURE_INSTRUCTIONS SOME_INSTRUCTIONS
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __cstring
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100000f8e
+EXE:       size 0x000000000000000d
+EXE:     offset 3982
+EXE:      align 2^0 (1)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_CSTRING_LITERALS
+EXE: attributes (none)
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __unwind_info
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100000f9b
+EXE:       size 0x0000000000000048
+EXE:     offset 3995
+EXE:      align 2^0 (1)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_REGULAR
+EXE: attributes (none)
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __eh_frame
+EXE:    segname __TEXT
+EXE:       addr 0x0000000100000fe8
+EXE:       size 0x0000000000000018
+EXE:     offset 4072
+EXE:      align 2^3 (8)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_REGULAR
+EXE: attributes (none)
+EXE:  reserved1 0
+EXE:  reserved2 0
+EXE: Load command 2
+EXE:       cmd LC_SEGMENT_64
+EXE:   cmdsize 232
+EXE:   segname __DATA
+EXE:    vmaddr 0x0000000100001000
+EXE:    vmsize 0x0000000000001000
+EXE:   fileoff 4096
+EXE:  filesize 4096
+EXE:   maxprot rwx
+EXE:  initprot rw-
+EXE:    nsects 2
+EXE:     flags (none)
+EXE: Section
+EXE:   sectname __nl_symbol_ptr
+EXE:    segname __DATA
+EXE:       addr 0x0000000100001000
+EXE:       size 0x0000000000000010
+EXE:     offset 4096
+EXE:      align 2^3 (8)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_NON_LAZY_SYMBOL_POINTERS
+EXE: attributes (none)
+EXE:  reserved1 1 (index into indirect symbol table)
+EXE:  reserved2 0
+EXE: Section
+EXE:   sectname __la_symbol_ptr
+EXE:    segname __DATA
+EXE:       addr 0x0000000100001010
+EXE:       size 0x0000000000000008
+EXE:     offset 4112
+EXE:      align 2^3 (8)
+EXE:     reloff 0
+EXE:     nreloc 0
+EXE:       type S_LAZY_SYMBOL_POINTERS
+EXE: attributes (none)
+EXE:  reserved1 3 (index into indirect symbol table)
+EXE:  reserved2 0
+EXE: Load command 3
+EXE:       cmd LC_SEGMENT_64
+EXE:   cmdsize 72
+EXE:   segname __LINKEDIT
+EXE:    vmaddr 0x0000000100002000
+EXE:    vmsize 0x0000000000001000
+EXE:   fileoff 8192
+EXE:  filesize 304
+EXE:   maxprot rwx
+EXE:  initprot r--
+EXE:    nsects 0
+EXE:     flags (none)
+EXE: Load command 4
+EXE:             cmd LC_DYLD_INFO_ONLY
+EXE:         cmdsize 48
+EXE:      rebase_off 8192
+EXE:     rebase_size 8
+EXE:        bind_off 8200
+EXE:       bind_size 24
+EXE:   weak_bind_off 0
+EXE:  weak_bind_size 0
+EXE:   lazy_bind_off 8224
+EXE:  lazy_bind_size 16
+EXE:      export_off 8240
+EXE:     export_size 48
+EXE: Load command 5
+EXE:      cmd LC_SYMTAB
+EXE:  cmdsize 24
+EXE:   symoff 8360
+EXE:    nsyms 4
+EXE:   stroff 8440
+EXE:  strsize 56
+EXE: Load command 6
+EXE:             cmd LC_DYSYMTAB
+EXE:         cmdsize 80
+EXE:       ilocalsym 0
+EXE:       nlocalsym 0
+EXE:      iextdefsym 0
+EXE:      nextdefsym 2
+EXE:       iundefsym 2
+EXE:       nundefsym 2
+EXE:          tocoff 0
+EXE:            ntoc 0
+EXE:       modtaboff 0
+EXE:         nmodtab 0
+EXE:    extrefsymoff 0
+EXE:     nextrefsyms 0
+EXE:  indirectsymoff 8424
+EXE:   nindirectsyms 4
+EXE:       extreloff 0
+EXE:         nextrel 0
+EXE:       locreloff 0
+EXE:         nlocrel 0
+EXE: Load command 7
+EXE:           cmd LC_LOAD_DYLINKER
+EXE:       cmdsize 32
+EXE:          name /usr/lib/dyld (offset 12)
+EXE: Load command 8
+EXE:      cmd LC_UUID
+EXE:  cmdsize 24
+EXE:     uuid 65C2DD41-79B0-3B34-871B-8CB3446AB762
+EXE: Load command 9
+EXE:       cmd LC_VERSION_MIN_MACOSX
+EXE:   cmdsize 16
+EXE:   version 10.9
+EXE:       sdk 10.9
+EXE: Load command 10
+EXE:       cmd LC_SOURCE_VERSION
+EXE:   cmdsize 16
+EXE:   version 0.0
+EXE: Load command 11
+EXE:        cmd LC_MAIN
+EXE:    cmdsize 24
+EXE:   entryoff 3888
+EXE:  stacksize 0
+EXE: Load command 12
+EXE:           cmd LC_LOAD_DYLIB
+EXE:       cmdsize 56
+EXE:          name /usr/lib/libSystem.B.dylib (offset 24)
+EXE:       current version 1197.1.1
+EXE: compatibility version 1.0.0
+EXE: Load command 13
+EXE:       cmd LC_FUNCTION_STARTS
+EXE:   cmdsize 16
+EXE:   dataoff 8288
+EXE:  datasize 8
+EXE: Load command 14
+EXE:       cmd LC_DATA_IN_CODE
+EXE:   cmdsize 16
+EXE:   dataoff 8296
+EXE:  datasize 0
+EXE: Load command 15
+EXE:       cmd LC_DYLIB_CODE_SIGN_DRS
+EXE:   cmdsize 16
+EXE:   dataoff 8296
+EXE:  datasize 64
+
+
+LOAD: Load command 10
+LOAD:           cmd LC_LOAD_DYLIB
+LOAD:       cmdsize 48
+LOAD:          name /usr/lib/foo1.dylib (offset 24)
+LOAD:       current version 0.0.0
+LOAD: compatibility version 0.0.0
+LOAD: Load command 11
+LOAD:           cmd LC_LOAD_WEAK_DYLIB
+LOAD:       cmdsize 48
+LOAD:          name /usr/lib/foo2.dylib (offset 24)
+LOAD:       current version 0.0.0
+LOAD: compatibility version 0.0.0
+LOAD: Load command 12
+LOAD:           cmd LC_REEXPORT_DYLIB
+LOAD:       cmdsize 48
+LOAD:          name /usr/lib/foo3.dylib (offset 24)
+LOAD:       current version 0.0.0
+LOAD: compatibility version 0.0.0
+LOAD: Load command 13
+LOAD:           cmd LC_LAZY_LOAD_DYLIB
+LOAD:       cmdsize 48
+LOAD:          name /usr/lib/foo4.dylib (offset 24)
+LOAD:       current version 0.0.0
+LOAD: compatibility version 0.0.0
+

diff --git a/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test b/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test
new file mode 100644
index 0000000..1e1080a
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-symbolized-disassembly.test

@@ -0,0 +1,38 @@
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.obj.macho-x86_64 | FileCheck %s -check-prefix=OBJ
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.exe.macho-x86_64 | FileCheck %s -check-prefix=EXE
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/ObjC.obj.macho-x86_64 | FileCheck %s -check-prefix=ObjC-OBJ
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/ObjC.exe.macho-x86_64 | FileCheck %s -check-prefix=ObjC-EXE
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello_cpp.exe.macho-x86_64 | FileCheck %s -check-prefix=CXX-EXE
+
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.obj.macho-i386 | FileCheck %s -check-prefix=i386-OBJ
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/hello.exe.macho-i386 | FileCheck %s -check-prefix=i386-EXE
+
+OBJ: 0000000000000008	leaq	L_.str(%rip), %rax      ## literal pool for: "Hello world\n"
+OBJ: 0000000000000026	callq	_printf
+
+EXE: 0000000100000f38	leaq	0x4f(%rip), %rax        ## literal pool for: "Hello world\n"
+EXE: 0000000100000f56	callq	0x100000f6c             ## symbol stub for: _printf
+
+ObjC-OBJ: 0000000000000008	leaq	0xb1(%rip), %rax        ## Objc cfstring ref: @"The current date and time is: %@"
+ObjC-OBJ: 0000000000000016	movq	0x4b(%rip), %rcx        ## Objc class ref: NSObject
+ObjC-OBJ: 000000000000001d	movq	0x64(%rip), %rsi        ## Objc selector ref: new
+ObjC-OBJ: 0000000000000034	movq	0x35(%rip), %rax        ## Objc class ref: NSDate
+ObjC-OBJ: 000000000000003b	movq	0x4e(%rip), %rsi        ## Objc selector ref: date
+
+ObjC-EXE: 0000000100000ee8	leaq	0x159(%rip), %rax       ## Objc cfstring ref: @"The current date and time is: %@"
+ObjC-EXE: 0000000100000ef6	movq	0x13b(%rip), %rcx       ## Objc class ref: _OBJC_CLASS_$_NSObject
+ObjC-EXE: 0000000100000efd	movq	0x124(%rip), %rsi       ## Objc selector ref: new
+ObjC-EXE: 0000000100000f0b	callq	0x100000f4a             ## Objc message: +[NSObject new]
+ObjC-EXE: 0000000100000f14	movq	0x125(%rip), %rax       ## Objc class ref: _OBJC_CLASS_$_NSDate
+ObjC-EXE: 0000000100000f1b	movq	0x10e(%rip), %rsi       ## Objc selector ref: date
+ObjC-EXE: 0000000100000f25	callq	0x100000f4a             ## Objc message: +[NSDate date]
+ObjC-EXE: 0000000100000f33	callq	0x100000f44             ## symbol stub for: _NSLog
+
+CXX-EXE: 00000001000014cb	callq	__ZNSt3__116__pad_and_outputIcNS_11char_traitsIcEEEENS_19ostreambuf_iteratorIT_T0_EES6_PKS4_S8_S8_RNS_8ios_baseES4_
+
+// FIXME: Demangler depends on host's <cxxabi.h>.
+// std::__1::ostreambuf_iterator<char, std::__1::char_traits<char> > std::__1::__pad_and_output<char, std::__1::char_traits<char> >(std::__1::ostreambuf_iterator<char, std::__1::char_traits<char> >, char const*, char const*, char const*, std::__1::ios_base&, char)
+
+i386-OBJ: 0000002f	calll	_printf
+
+i386-EXE: 00001f6f	calll	0x1f84                  ## symbol stub for: _printf

diff --git a/test/tools/llvm-objdump/X86/macho-symbolized-subtractor-i386.test b/test/tools/llvm-objdump/X86/macho-symbolized-subtractor-i386.test
new file mode 100644
index 0000000..a0f753b
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-symbolized-subtractor-i386.test

@@ -0,0 +1,10 @@
+# RUN: llvm-mc < %s -triple i386-apple-darwin -filetype=obj | llvm-objdump -m -d - | FileCheck %s
+
+nop
+x:
+leal	x-y(%eax), %ebx
+.data
+y:
+.quad 0
+
+# CHECK: leal	x-y(%eax), %ebx

diff --git a/test/tools/llvm-objdump/X86/macho-symbolized-subtractor.test b/test/tools/llvm-objdump/X86/macho-symbolized-subtractor.test
new file mode 100644
index 0000000..a730b5c
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-symbolized-subtractor.test

@@ -0,0 +1,10 @@
+# RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -d - | FileCheck %s
+
+nop
+x:
+leaq	x-y(%rax), %rbx
+.data
+y:
+.quad 0
+
+# CHECK: leaq	x-y(%rax), %rbx

diff --git a/test/tools/llvm-objdump/out-of-section-sym.test b/test/tools/llvm-objdump/X86/out-of-section-sym.test
similarity index 100%
rename from test/tools/llvm-objdump/out-of-section-sym.test
rename to test/tools/llvm-objdump/X86/out-of-section-sym.test


diff --git a/test/tools/llvm-objdump/coff-large-bss.test b/test/tools/llvm-objdump/coff-large-bss.test
new file mode 100644
index 0000000..dc0fc67
--- /dev/null
+++ b/test/tools/llvm-objdump/coff-large-bss.test

@@ -0,0 +1,3 @@
+RUN: llvm-objdump -s %p/Inputs/large-bss.obj.coff-i386 | FileCheck %s
+
+CHECK: <skipping contents of bss section at [0000, 010f)>

diff --git a/test/tools/llvm-objdump/macho-bad-ordinal.test b/test/tools/llvm-objdump/macho-bad-ordinal.test
new file mode 100644
index 0000000..16badcc
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-bad-ordinal.test

@@ -0,0 +1,6 @@
+# RUN: llvm-objdump -macho -bind -lazy-bind %p/Inputs/bad-ordinal.macho-x86_64 \
+# RUN:   | FileCheck %s 
+
+
+# CHECK: __DATA   __nl_symbol_ptr    0x100001000 pointer         0 <<bad library ordinal>> dyld_stub_binder
+# CHECK: __DATA   __la_symbol_ptr    0x100001010 <<bad library ordinal>> _printf

diff --git a/test/tools/llvm-objdump/macho-bind.test b/test/tools/llvm-objdump/macho-bind.test
new file mode 100644
index 0000000..5527bfa
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-bind.test

@@ -0,0 +1,10 @@
+# RUN: llvm-objdump -macho -bind %p/Inputs/bind.macho-x86_64 \
+# RUN:   | FileCheck %s  
+
+
+# CHECK:__DATA   __data             0x00001028  pointer  0        flat-namespace      _any
+# CHECK:__DATA   __data             0x00001020  pointer  0        main-executable     _fromApp
+# CHECK:__DATA   __data             0x00001018  pointer  0        this-image          _myfunc
+# CHECK:__DATA   __data             0x00001000  pointer  0        libfoo              _foo
+# CHECK:__DATA   __data             0x00001008  pointer  0        libbar              _bar
+# CHECK:__DATA   __data             0x00001010  pointer  0        libSystem           _malloc

diff --git a/test/tools/llvm-objdump/macho-bind2.test b/test/tools/llvm-objdump/macho-bind2.test
new file mode 100644
index 0000000..2eee2fc
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-bind2.test

@@ -0,0 +1,5 @@
+# RUN: llvm-objdump -macho -bind %p/Inputs/bind2.macho-x86_64 | FileCheck %s  
+
+# CHECK: __DATA   __data             0x00001008 pointer         0 libSystem        _malloc
+# CHECK: __DATA   __data             0x00001050 pointer         0 libSystem        _malloc
+# CHECK: __DATA   __data             0x00001458 pointer         0 libSystem        _malloc

diff --git a/test/tools/llvm-objdump/macho-compact-unwind-i386.test b/test/tools/llvm-objdump/macho-compact-unwind-i386.test
new file mode 100644
index 0000000..9a14c20
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-compact-unwind-i386.test

@@ -0,0 +1,27 @@
+# RUN: llvm-objdump -unwind-info %p/Inputs/compact-unwind.macho-i386 | FileCheck %s
+
+# CHECK: Contents of __compact_unwind section:
+# CHECK:   Entry at offset 0x0:
+# CHECK:     start:                0x0 __Z10test_throwv
+# CHECK:     length:               0x55
+# CHECK:     compact encoding:     0x01010005
+# CHECK-NOT: personality function
+# CHECK-NOT: LSDA
+# CHECK:   Entry at offset 0x14:
+# CHECK:     start:                0x60 __Z11test_catch1v
+# CHECK:     length:               0x6f
+# CHECK:     compact encoding:     0x41000000
+# CHECK:     personality function: 0x288 __pointers + 0x8
+# CHECK:     LSDA:                 0x180 GCC_except_table1
+# CHECK:   Entry at offset 0x28:
+# CHECK:     start:                0xd0 __Z11test_catch2v
+# CHECK:     length:               0x75
+# CHECK:     compact encoding:     0x41000000
+# CHECK:     personality function: 0x288 __pointers + 0x8
+# CHECK:     LSDA:                 0x1a8 GCC_except_table2
+# CHECK:   Entry at offset 0x3c:
+# CHECK:     start:                0x150 __Z3foov
+# CHECK:     length:               0x22
+# CHECK:     compact encoding:     0x01000000
+# CHECK-NOT: personality function
+# CHECK-NOT: LSDA

diff --git a/test/tools/llvm-objdump/macho-compact-unwind-x86_64.test b/test/tools/llvm-objdump/macho-compact-unwind-x86_64.test
new file mode 100644
index 0000000..852800d
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-compact-unwind-x86_64.test

@@ -0,0 +1,27 @@
+# RUN: llvm-objdump -unwind-info %p/Inputs/compact-unwind.macho-x86_64 | FileCheck %s
+
+# CHECK: Contents of __compact_unwind section:
+# CHECK:   Entry at offset 0x0:
+# CHECK:     start:                0x1 __Z10test_throwv + 0x1
+# CHECK:     length:               0x44
+# CHECK:     compact encoding:     0x01000000
+# CHECK-NOT: personality function
+# CHECK-NOT: LSDA
+# CHECK:   Entry at offset 0x20:
+# CHECK:     start:                0x50 __Z11test_catch1v
+# CHECK:     length:               0x71
+# CHECK:     compact encoding:     0x41000000
+# CHECK:     personality function: 0x0 ___gxx_personality_v0
+# CHECK:     LSDA:                 0x180 GCC_except_table1
+# CHECK:   Entry at offset 0x40:
+# CHECK:     start:                0xd0 __Z11test_catch2v
+# CHECK:     length:               0x77
+# CHECK:     compact encoding:     0x41000000
+# CHECK:     personality function: 0x0 ___gxx_personality_v0
+# CHECK:     LSDA:                 0x1a8 GCC_except_table2
+# CHECK:   Entry at offset 0x60:
+# CHECK:     start:                0x150 __Z3foov
+# CHECK:     length:               0x25
+# CHECK:     compact encoding:     0x01000000
+# CHECK-NOT: personality function
+# CHECK-NOT: LSDA

diff --git a/test/tools/llvm-objdump/macho-exports-trie.test b/test/tools/llvm-objdump/macho-exports-trie.test
new file mode 100644
index 0000000..473c7cb
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-exports-trie.test

@@ -0,0 +1,11 @@
+# RUN: llvm-objdump -macho -exports-trie -arch x86_64 \
+# RUN:   %p/Inputs/exports-trie.macho-x86_64 2>/dev/null | FileCheck %s
+
+
+# CHECK:[re-export] _malloc (from libSystem)
+# CHECK:[re-export] _myfree (_free from libSystem)
+# CHECK:0x00000F70  _myWeak [weak_def]
+# CHECK:0x00001018  _myTLV [per-thread]
+# CHECK:0x12345678  _myAbs [absolute]
+# CHECK:0x00000F60  _foo
+

diff --git a/test/tools/llvm-objdump/macho-lazy-bind.test b/test/tools/llvm-objdump/macho-lazy-bind.test
new file mode 100644
index 0000000..088ea06
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-lazy-bind.test

@@ -0,0 +1,7 @@
+# RUN: llvm-objdump -macho -lazy-bind %p/Inputs/lazy-bind.macho-x86_64 \
+# RUN:   | FileCheck %s 
+
+
+# CHECK: __DATA   __la_symbol_ptr    0x100001010   libfoo          _foo
+# CHECK: __DATA   __la_symbol_ptr    0x100001018   libbar          _bar
+# CHECK: __DATA   __la_symbol_ptr    0x100001020   libSystem       _malloc

diff --git a/test/tools/llvm-objdump/macho-rebase.test b/test/tools/llvm-objdump/macho-rebase.test
new file mode 100644
index 0000000..96df390
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-rebase.test

@@ -0,0 +1,15 @@
+# RUN: llvm-objdump -macho -rebase -arch x86_64 \
+# RUN:   %p/Inputs/rebase.macho-x86_64 | FileCheck %s
+
+
+# CHECK: segment  section            address     type
+# CHECK: __DATA   __data             0x00001010  pointer
+# CHECK: __DATA   __data             0x00001028  pointer
+# CHECK: __DATA   __data             0x00001030  pointer
+# CHECK: __DATA   __data             0x00001038  pointer
+# CHECK: __DATA   __data             0x00001040  pointer
+# CHECK: __DATA   __data             0x00001258  pointer
+# CHECK: __DATA   __mystuff          0x00001278  pointer
+# CHECK: __DATA   __mystuff          0x00001288  pointer
+# CHECK: __DATA   __mystuff          0x00001298  pointer
+# CHECK: __DATA   __mystuff          0x000012A8  pointer

diff --git a/test/tools/llvm-objdump/macho-unwind-info-arm64.test b/test/tools/llvm-objdump/macho-unwind-info-arm64.test
new file mode 100644
index 0000000..712edef
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-unwind-info-arm64.test

@@ -0,0 +1,28 @@
+# RUN: llvm-objdump -unwind-info %p/Inputs/unwind-info.macho-arm64 2>/dev/null | FileCheck %s
+
+# The 2nd level index here is "regular", including all offsets & encodings in
+# full.
+
+# CHECK: Contents of __unwind_info section:
+# CHECK:   Version:                                   0x1
+# CHECK:   Common encodings array section offset:     0x1c
+# CHECK:   Number of common encodings in array:       0x2
+# CHECK:   Personality function array section offset: 0x24
+# CHECK:   Number of personality functions in array:  0x1
+# CHECK:   Index array section offset:                0x28
+# CHECK:   Number of indices in array:                0x2
+# CHECK:   Common encodings: (count = 2)
+# CHECK:     encoding[0]: 0x04000000
+# CHECK:     encoding[1]: 0x54000000
+# CHECK:   Personality functions: (count = 1)
+# CHECK:     personality[1]: 0x00008008
+# CHECK:   Top level indices: (count = 2)
+# CHECK:     [0]: function offset=0x00007d64, 2nd level page offset=0x00000050, LSDA offset=0x00000040
+# CHECK:     [1]: function offset=0x00007eb5, 2nd level page offset=0x00000000, LSDA offset=0x00000050
+# CHECK:   LSDA descriptors:
+# CHECK:     [0]: function offset=0x00007d90, LSDA offset=0x00007f44
+# CHECK:     [1]: function offset=0x00007e10, LSDA offset=0x00007f6c
+# CHECK:   Second level indices:
+# CHECK:     Second level index[0]: offset in section=0x00000050, base function offset=0x00007d64
+# CHECK:       [0]: function offset=0x00007d90, encoding=0x78563412
+# CHECK:       [1]: function offset=0x00007e10, encoding=0x21436587

diff --git a/test/tools/llvm-objdump/macho-unwind-info-no-relocs.test b/test/tools/llvm-objdump/macho-unwind-info-no-relocs.test
new file mode 100644
index 0000000..3adad65
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-unwind-info-no-relocs.test

@@ -0,0 +1,8 @@
+# RUN: llvm-objdump -unwind-info %p/Inputs/unwind-info-no-relocs.macho-x86_64 2>/dev/null | FileCheck %s
+
+# Make sure we can deal with __compact_unwind sections that don't have helpful
+# relocations.
+
+# CHECK: Contents of __compact_unwind section:
+# CHECK:   Entry at offset 0x0:
+# CHECK:     start: 0x100000f7e

diff --git a/test/tools/llvm-objdump/macho-unwind-info-x86_64.test b/test/tools/llvm-objdump/macho-unwind-info-x86_64.test
new file mode 100644
index 0000000..1333d9a
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-unwind-info-x86_64.test

@@ -0,0 +1,29 @@
+# RUN: llvm-objdump -unwind-info %p/Inputs/unwind-info.macho-x86_64 2>/dev/null | FileCheck %s
+
+# The 2nd level index in this file is in compressed form, referring to both
+# common and packed encodings.
+
+# CHECK:Contents of __unwind_info section:
+# CHECK:  Version:                                   0x1
+# CHECK:  Common encodings array section offset:     0x1c
+# CHECK:  Number of common encodings in array:       0x2
+# CHECK:  Personality function array section offset: 0x24
+# CHECK:  Number of personality functions in array:  0x1
+# CHECK:  Index array section offset:                0x28
+# CHECK:  Number of indices in array:                0x2
+# CHECK:  Common encodings: (count = 2)
+# CHECK:    encoding[0]: 0x01000000
+# CHECK:    encoding[1]: 0x51000000
+# CHECK:  Personality functions: (count = 1)
+# CHECK:    personality[1]: 0x00001018
+# CHECK:  Top level indices: (count = 2)
+# CHECK:    [0]: function offset=0x00000d70, 2nd level page offset=0x00000050, LSDA offset=0x00000040
+# CHECK:    [1]: function offset=0x00000eab, 2nd level page offset=0x00000000, LSDA offset=0x00000050
+# CHECK:  LSDA descriptors:
+# CHECK:    [0]: function offset=0x00000db0, LSDA offset=0x00000f0c
+# CHECK:    [1]: function offset=0x00000e20, LSDA offset=0x00000f34
+# CHECK:  Second level indices:
+# CHECK:    Second level index[0]: offset in section=0x00000050, base function offset=0x00000d70
+# CHECK:      [0]: function offset=0x00000d70, encoding[0]=0x01000000
+# CHECK:      [1]: function offset=0x00000db0, encoding[1]=0x51000000
+# CHECK:      [2]: function offset=0x00000e20, encoding[2]=0x01234567

diff --git a/test/tools/llvm-objdump/macho-weak-bind.test b/test/tools/llvm-objdump/macho-weak-bind.test
new file mode 100644
index 0000000..1013132
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-weak-bind.test

@@ -0,0 +1,10 @@
+# RUN: llvm-objdump -macho -weak-bind  %p/Inputs/weak-bind.macho-x86_64 \
+# RUN:   | FileCheck %s
+
+
+# CHECK: __DATA   __data             0x100001018   pointer  0        __ZTISt12out_of_range
+# CHECK: __DATA   __data             0x100001020   pointer  0        __ZTISt12out_of_range
+# CHECK: __DATA   __data             0x100001028   pointer  0        __ZTISt12out_of_range
+# CHECK:                                           strong            __ZdlPv
+# CHECK: __DATA   __data             0x100001018   pointer  0        __Znam
+# CHECK:                                           strong            __Znwm

diff --git a/test/tools/llvm-profdata/Inputs/bad-hash.profdata b/test/tools/llvm-profdata/Inputs/bad-hash.proftext
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/bad-hash.profdata
rename to test/tools/llvm-profdata/Inputs/bad-hash.proftext


diff --git a/test/tools/llvm-profdata/Inputs/bar3-1.profdata b/test/tools/llvm-profdata/Inputs/bar3-1.proftext
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/bar3-1.profdata
rename to test/tools/llvm-profdata/Inputs/bar3-1.proftext


diff --git a/test/tools/llvm-profdata/Inputs/c-general.profdata b/test/tools/llvm-profdata/Inputs/c-general.profraw
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/c-general.profdata
rename to test/tools/llvm-profdata/Inputs/c-general.profraw
Binary files differ

diff --git a/test/tools/llvm-profdata/Inputs/compat.profdata.v1 b/test/tools/llvm-profdata/Inputs/compat.profdata.v1
new file mode 100644
index 0000000..fd17459
--- /dev/null
+++ b/test/tools/llvm-profdata/Inputs/compat.profdata.v1
Binary files differ

diff --git a/test/tools/llvm-profdata/Inputs/empty.profdata b/test/tools/llvm-profdata/Inputs/empty.proftext
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/empty.profdata
rename to test/tools/llvm-profdata/Inputs/empty.proftext


diff --git a/test/tools/llvm-profdata/Inputs/extra-word.profdata b/test/tools/llvm-profdata/Inputs/extra-word.proftext
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/extra-word.profdata
rename to test/tools/llvm-profdata/Inputs/extra-word.proftext


diff --git a/test/tools/llvm-profdata/Inputs/foo3-1.profdata b/test/tools/llvm-profdata/Inputs/foo3-1.proftext
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/foo3-1.profdata
rename to test/tools/llvm-profdata/Inputs/foo3-1.proftext


diff --git a/test/tools/llvm-profdata/Inputs/foo3-2.profdata b/test/tools/llvm-profdata/Inputs/foo3-2.proftext
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/foo3-2.profdata
rename to test/tools/llvm-profdata/Inputs/foo3-2.proftext


diff --git a/test/tools/llvm-profdata/Inputs/foo3bar3-1.profdata b/test/tools/llvm-profdata/Inputs/foo3bar3-1.proftext
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/foo3bar3-1.profdata
rename to test/tools/llvm-profdata/Inputs/foo3bar3-1.proftext


diff --git a/test/tools/llvm-profdata/Inputs/foo3bar3-2.profdata b/test/tools/llvm-profdata/Inputs/foo3bar3-2.profdata
deleted file mode 100644
index f1f10bd..0000000
--- a/test/tools/llvm-profdata/Inputs/foo3bar3-2.profdata
+++ /dev/null

@@ -1,13 +0,0 @@
-foo
-3
-3
-17
-19
-23
-
-bar
-3
-3
-29
-31
-37

diff --git a/test/tools/llvm-profdata/Inputs/foo4-1.profdata b/test/tools/llvm-profdata/Inputs/foo4-1.profdata
deleted file mode 100644
index 31d2a2c..0000000
--- a/test/tools/llvm-profdata/Inputs/foo4-1.profdata
+++ /dev/null

@@ -1,7 +0,0 @@
-foo
-4
-4
-11
-22
-33
-44

diff --git a/test/tools/llvm-profdata/Inputs/foo4-2.profdata b/test/tools/llvm-profdata/Inputs/foo4-2.profdata
deleted file mode 100644
index 01d8309..0000000
--- a/test/tools/llvm-profdata/Inputs/foo4-2.profdata
+++ /dev/null

@@ -1,7 +0,0 @@
-foo
-4
-4
-7
-6
-5
-4

diff --git a/test/tools/llvm-profdata/Inputs/invalid-count-later.profdata b/test/tools/llvm-profdata/Inputs/invalid-count-later.proftext
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/invalid-count-later.profdata
rename to test/tools/llvm-profdata/Inputs/invalid-count-later.proftext


diff --git a/test/tools/llvm-profdata/Inputs/no-counts.profdata b/test/tools/llvm-profdata/Inputs/no-counts.proftext
similarity index 100%
rename from test/tools/llvm-profdata/Inputs/no-counts.profdata
rename to test/tools/llvm-profdata/Inputs/no-counts.proftext


diff --git a/test/tools/llvm-profdata/Inputs/overflow.profdata b/test/tools/llvm-profdata/Inputs/overflow.profdata
deleted file mode 100644
index c9a9d69..0000000
--- a/test/tools/llvm-profdata/Inputs/overflow.profdata
+++ /dev/null

@@ -1,4 +0,0 @@
-overflow
-1
-1
-9223372036854775808

diff --git a/test/tools/llvm-profdata/Inputs/sample-profile.proftext b/test/tools/llvm-profdata/Inputs/sample-profile.proftext
new file mode 100644
index 0000000..9dc6d43
--- /dev/null
+++ b/test/tools/llvm-profdata/Inputs/sample-profile.proftext

@@ -0,0 +1,12 @@
+_Z3bari:20301:1437
+1: 1437
+_Z3fooi:7711:610
+1: 610
+main:184019:0
+4: 534
+4.2: 534
+5: 1075
+5.1: 1075
+6: 2080
+7: 534
+9: 2064  _Z3bari:1471  _Z3fooi:631

diff --git a/test/tools/llvm-profdata/c-general.test b/test/tools/llvm-profdata/c-general.test
index 9b6cd7f..0143530 100644
--- a/test/tools/llvm-profdata/c-general.test
+++ b/test/tools/llvm-profdata/c-general.test

@@ -7,10 +7,10 @@
 REGENERATE: $ TESTDIR=$SRC/test/tools/llvm-profdata
 REGENERATE: $ CFE_TESTDIR=$CFE/test/Profile
 REGENERATE: $ clang -o a.out -fprofile-instr-generate $CFE_TESTDIR/test/Profile/c-general.c
-REGENERATE: $ LLVM_PROFILE_FILE=$TESTDIR/Inputs/c-general.profdata ./a.out
+REGENERATE: $ LLVM_PROFILE_FILE=$TESTDIR/Inputs/c-general.profraw ./a.out
 
-RUN: llvm-profdata show %p/Inputs/c-general.profdata -o - | FileCheck %s -check-prefix=CHECK
-RUN: llvm-profdata show %p/Inputs/c-general.profdata -o - --function=switches | FileCheck %s -check-prefix=SWITCHES -check-prefix=CHECK
+RUN: llvm-profdata show %p/Inputs/c-general.profraw -o - | FileCheck %s -check-prefix=CHECK
+RUN: llvm-profdata show %p/Inputs/c-general.profraw -o - --function=switches | FileCheck %s -check-prefix=SWITCHES -check-prefix=CHECK
 
 SWITCHES-LABEL: Counters:
 SWITCHES-NEXT:   switches:

diff --git a/test/tools/llvm-profdata/compat.proftext b/test/tools/llvm-profdata/compat.proftext
new file mode 100644
index 0000000..14da337
--- /dev/null
+++ b/test/tools/llvm-profdata/compat.proftext

@@ -0,0 +1,47 @@
+# Compatibility tests for older profile format versions. These ensure
+# that we don't break compatibility with an older profile version
+# without noticing it.
+
+# The input file at %S/Inputs/compat.profdata.v1 was generated with
+# llvm-profdata merge from r214548.
+
+# RUN: llvm-profdata show %S/Inputs/compat.profdata.v1 --function function_count_only --counts | FileCheck %s -check-prefix=FUNC_COUNT_ONLY
+function_count_only
+0
+1
+97531
+# FUNC_COUNT_ONLY:      Hash: 0x{{0+$}}
+# FUNC_COUNT_ONLY-NEXT: Counters: 1
+# FUNC_COUNT_ONLY-NEXT: Function count: 97531
+# FUNC_COUNT_ONLY-NEXT: Block counts: []
+
+# RUN: llvm-profdata show %S/Inputs/compat.profdata.v1 --function "name with spaces" --counts | FileCheck %s -check-prefix=SPACES
+name with spaces
+1024
+2
+0
+0
+# SPACES:      Hash: 0x{{0+}}400
+# SPACES-NEXT: Counters: 2
+# SPACES-NEXT: Function count: 0
+# SPACES-NEXT: Block counts: [0]
+
+# RUN: llvm-profdata show %S/Inputs/compat.profdata.v1 --function large_numbers --counts | FileCheck %s -check-prefix=LARGENUM
+large_numbers
+4611686018427387903
+6
+2305843009213693952
+1152921504606846976
+576460752303423488
+288230376151711744
+144115188075855872
+72057594037927936
+# LARGENUM:      Hash: 0x3fffffffffffffff
+# LARGENUM-NEXT: Counters: 6
+# LARGENUM-NEXT: Function count: 2305843009213693952
+# LARGENUM-NEXT: Block counts: [1152921504606846976, 576460752303423488, 288230376151711744, 144115188075855872, 72057594037927936]
+
+# RUN: llvm-profdata show %S/Inputs/compat.profdata.v1 | FileCheck %s -check-prefix=SUMMARY
+# SUMMARY: Total functions: 3
+# SUMMARY: Maximum function count: 2305843009213693952
+# SUMMARY: Maximum internal block count: 1152921504606846976

diff --git a/test/tools/llvm-profdata/count-mismatch.proftext b/test/tools/llvm-profdata/count-mismatch.proftext
new file mode 100644
index 0000000..1a2e73f
--- /dev/null
+++ b/test/tools/llvm-profdata/count-mismatch.proftext

@@ -0,0 +1,40 @@
+# Make sure we don't try to combine counters with the same function
+# name and a matching hash if the number of counters differs
+
+# RUN: llvm-profdata merge %s -o %t.profdata 2>&1 | FileCheck -check-prefix=MERGE_ERRS %s
+# RUN: llvm-profdata show %t.profdata -all-functions -counts > %t.out
+# RUN: FileCheck %s -input-file %t.out
+foo
+1024
+4
+1
+2
+4
+8
+
+# The hash matches, but we can't combine these because the number of
+# counters differs.
+# MERGE_ERRS: count-mismatch.proftext: foo: Function count mismatch
+foo
+1024
+3
+2
+4
+8
+
+# This one does match, so it should combine with the first just fine.
+# CHECK: Hash: 0x{{0+}}400
+# CHECK-NEXT: Counters: 4
+# CHECK-NEXT: Function count: 5
+# CHECK-NEXT: Block counts: [10, 20, 40]
+foo
+1024
+4
+4
+8
+16
+32
+
+# CHECK: Total functions: 1
+# CHECK: Maximum function count: 5
+# CHECK: Maximum internal block count: 40

diff --git a/test/tools/llvm-profdata/errors.test b/test/tools/llvm-profdata/errors.test
deleted file mode 100644
index 28262ef..0000000
--- a/test/tools/llvm-profdata/errors.test
+++ /dev/null

@@ -1,16 +0,0 @@
-RUN: llvm-profdata merge %p/Inputs/foo3-1.profdata %p/Inputs/foo4-1.profdata -o %t.out 2>&1 | FileCheck %s --check-prefix=HASH
-HASH: foo4-1.profdata: foo: Function hash mismatch
-
-RUN: llvm-profdata merge %p/Inputs/overflow.profdata %p/Inputs/overflow.profdata -o %t.out 2>&1 | FileCheck %s --check-prefix=OVERFLOW
-OVERFLOW: overflow.profdata: overflow: Counter overflow
-
-RUN: not llvm-profdata show %p/Inputs/invalid-count-later.profdata 2>&1 | FileCheck %s --check-prefix=INVALID-COUNT-LATER
-RUN: not llvm-profdata merge %p/Inputs/invalid-count-later.profdata %p/Inputs/invalid-count-later.profdata -o %t.out 2>&1 | FileCheck %s --check-prefix=INVALID-COUNT-LATER
-INVALID-COUNT-LATER: error: {{.*}}invalid-count-later.profdata: Malformed profile data
-
-RUN: not llvm-profdata show %p/Inputs/bad-hash.profdata 2>&1 | FileCheck %s --check-prefix=BAD-HASH
-RUN: not llvm-profdata merge %p/Inputs/bad-hash.profdata %p/Inputs/bad-hash.profdata -o %t.out 2>&1 | FileCheck %s --check-prefix=BAD-HASH
-BAD-HASH: error: {{.*}}bad-hash.profdata: Malformed profile data
-
-RUN: not llvm-profdata show %p/Inputs/no-counts.profdata 2>&1 | FileCheck %s --check-prefix=NO-COUNTS
-NO-COUNTS: error: {{.*}}no-counts.profdata: Malformed profile data

diff --git a/test/tools/llvm-profdata/general.proftext b/test/tools/llvm-profdata/general.proftext
new file mode 100644
index 0000000..591d262
--- /dev/null
+++ b/test/tools/llvm-profdata/general.proftext

@@ -0,0 +1,56 @@
+
+
+# RUN: llvm-profdata merge %s -o %t.profdata
+
+# RUN: llvm-profdata show %t.profdata --function function_count_only --counts | FileCheck %s -check-prefix=FUNC_COUNT_ONLY
+function_count_only
+0
+1
+97531
+# FUNC_COUNT_ONLY:      Hash: 0x{{0+$}}
+# FUNC_COUNT_ONLY-NEXT: Counters: 1
+# FUNC_COUNT_ONLY-NEXT: Function count: 97531
+# FUNC_COUNT_ONLY-NEXT: Block counts: []
+
+# RUN: llvm-profdata show %t.profdata --function "name with spaces" --counts | FileCheck %s -check-prefix=SPACES
+name with spaces
+1024
+2
+0
+0
+# SPACES:      Hash: 0x{{0+}}400
+# SPACES-NEXT: Counters: 2
+# SPACES-NEXT: Function count: 0
+# SPACES-NEXT: Block counts: [0]
+
+# RUN: llvm-profdata show %t.profdata --function large_numbers --counts | FileCheck %s -check-prefix=LARGENUM
+large_numbers
+4611686018427387903
+6
+2305843009213693952
+1152921504606846976
+576460752303423488
+288230376151711744
+144115188075855872
+72057594037927936
+# LARGENUM:      Hash: 0x3fffffffffffffff
+# LARGENUM-NEXT: Counters: 6
+# LARGENUM-NEXT: Function count: 2305843009213693952
+# LARGENUM-NEXT: Block counts: [1152921504606846976, 576460752303423488, 288230376151711744, 144115188075855872, 72057594037927936]
+
+# RUN: llvm-profdata show %t.profdata --function NOSUCHFUNC | FileCheck %s -check-prefix=NOSUCHFUNC
+# NOSUCHFUNC-NOT: Counters:
+# NOSUCHFUNC: Functions shown: 0
+
+# RUN: llvm-profdata show %t.profdata --function _ | FileCheck %s -check-prefix=SOMEFUNCS
+# SOMEFUNCS: Counters:
+# SOMEFUNCS: function_count_only:
+# SOMEFUNCS: large_numbers:
+# SOMEFUNCS: Functions shown: 2
+
+# RUN: llvm-profdata show %t.profdata | FileCheck %s -check-prefix=SUMMARY
+# SUMMARY-NOT: Counters:
+# SUMMARY-NOT: Functions shown:
+# SUMMARY: Total functions: 3
+# SUMMARY: Maximum function count: 2305843009213693952
+# SUMMARY: Maximum internal block count: 1152921504606846976

diff --git a/test/tools/llvm-profdata/hash-mismatch.proftext b/test/tools/llvm-profdata/hash-mismatch.proftext
new file mode 100644
index 0000000..fe0d4fb
--- /dev/null
+++ b/test/tools/llvm-profdata/hash-mismatch.proftext

@@ -0,0 +1,37 @@
+# If we see the same function name, but with different hashes, make
+# sure we keep both.
+
+# RUN: llvm-profdata merge %s -o %t 2>&1
+# RUN: llvm-profdata show %t -all-functions -counts > %t.out
+
+# The function ordering is non-deterministic, so we need to do our
+# checks in multiple runs.
+# RUN: FileCheck -check-prefix=FOO3 -check-prefix=BOTH %s -input-file %t.out
+# RUN: FileCheck -check-prefix=FOO4 -check-prefix=BOTH %s -input-file %t.out
+
+# FOO3: Hash: 0x{{0+}}3
+# FOO3-NEXT: Counters: 3
+# FOO3-NEXT: Function count: 1
+# FOO3-NEXT: Block counts: [2, 3]
+foo
+3
+3
+1
+2
+3
+
+# FOO4: Hash: 0x{{0+}}4
+# FOO4-NEXT: Counters: 4
+# FOO4-NEXT: Function count: 11
+# FOO4-NEXT: Block counts: [22, 33, 44]
+foo
+4
+4
+11
+22
+33
+44
+
+# BOTH: Total functions: 2
+# BOTH: Maximum function count: 11
+# BOTH: Maximum internal block count: 44

diff --git a/test/tools/llvm-profdata/lit.local.cfg b/test/tools/llvm-profdata/lit.local.cfg
new file mode 100644
index 0000000..d44913a
--- /dev/null
+++ b/test/tools/llvm-profdata/lit.local.cfg

@@ -0,0 +1 @@
+config.suffixes.add('.proftext')

diff --git a/test/tools/llvm-profdata/multiple-inputs.test b/test/tools/llvm-profdata/multiple-inputs.test
new file mode 100644
index 0000000..616efe9
--- /dev/null
+++ b/test/tools/llvm-profdata/multiple-inputs.test

@@ -0,0 +1,51 @@
+Some very basic tests for the multiple input cases.
+
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3-2.proftext -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3
+RUN: llvm-profdata merge %p/Inputs/foo3-2.proftext %p/Inputs/foo3-1.proftext -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3
+FOO3: foo:
+FOO3: Counters: 3
+FOO3: Function count: 8
+FOO3: Block counts: [7, 6]
+FOO3: Total functions: 1
+FOO3: Maximum function count: 8
+FOO3: Maximum internal block count: 7
+
+RUN: llvm-profdata merge %p/Inputs/empty.proftext %p/Inputs/foo3-1.proftext -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3EMPTY
+FOO3EMPTY: foo:
+FOO3EMPTY: Counters: 3
+FOO3EMPTY: Function count: 1
+FOO3EMPTY: Block counts: [2, 3]
+FOO3EMPTY: Total functions: 1
+FOO3EMPTY: Maximum function count: 1
+FOO3EMPTY: Maximum internal block count: 3
+
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/foo3bar3-1.proftext -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3FOO3BAR3
+FOO3FOO3BAR3: foo:
+FOO3FOO3BAR3: Counters: 3
+FOO3FOO3BAR3: Function count: 3
+FOO3FOO3BAR3: Block counts: [5, 8]
+FOO3FOO3BAR3: bar:
+FOO3FOO3BAR3: Counters: 3
+FOO3FOO3BAR3: Function count: 7
+FOO3FOO3BAR3: Block counts: [11, 13]
+FOO3FOO3BAR3: Total functions: 2
+FOO3FOO3BAR3: Maximum function count: 7
+FOO3FOO3BAR3: Maximum internal block count: 13
+
+RUN: llvm-profdata merge %p/Inputs/foo3-1.proftext %p/Inputs/bar3-1.proftext -o %t
+RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=DISJOINT
+DISJOINT: foo:
+DISJOINT: Counters: 3
+DISJOINT: Function count: 1
+DISJOINT: Block counts: [2, 3]
+DISJOINT: bar:
+DISJOINT: Counters: 3
+DISJOINT: Function count: 1
+DISJOINT: Block counts: [2, 3]
+DISJOINT: Total functions: 2
+DISJOINT: Maximum function count: 1
+DISJOINT: Maximum internal block count: 3

diff --git a/test/tools/llvm-profdata/overflow.proftext b/test/tools/llvm-profdata/overflow.proftext
new file mode 100644
index 0000000..cbf3bf1
--- /dev/null
+++ b/test/tools/llvm-profdata/overflow.proftext

@@ -0,0 +1,12 @@
+# RUN: llvm-profdata merge %s -o %t.out 2>&1 | FileCheck %s
+# CHECK: overflow.proftext: overflow: Counter overflow
+
+overflow
+1
+1
+9223372036854775808
+
+overflow
+1
+1
+9223372036854775808

diff --git a/test/tools/llvm-profdata/raw-two-profiles.test b/test/tools/llvm-profdata/raw-two-profiles.test
index 3260836..be78793 100644
--- a/test/tools/llvm-profdata/raw-two-profiles.test
+++ b/test/tools/llvm-profdata/raw-two-profiles.test

@@ -39,11 +39,9 @@
 RUN: cat %t-bar.profraw > %t-bar-padded.profraw
 RUN: printf '\0\0\0\0\0' >> %t-bar-padded.profraw
 
-RUN: cat %t-foo.profraw %t-bar.profraw > %t-nopad.profraw
 RUN: cat %t-foo-padded.profraw %t-bar.profraw > %t-pad-between.profraw
 RUN: cat %t-foo-padded.profraw %t-bar-padded.profraw > %t-pad.profraw
 
-RUN: llvm-profdata show %t-nopad.profraw -all-functions -counts | FileCheck %s
 RUN: llvm-profdata show %t-pad-between.profraw -all-functions -counts | FileCheck %s
 RUN: llvm-profdata show %t-pad.profraw -all-functions -counts | FileCheck %s
 

diff --git a/test/tools/llvm-profdata/sample-profile-basic.test b/test/tools/llvm-profdata/sample-profile-basic.test
new file mode 100644
index 0000000..0651c51
--- /dev/null
+++ b/test/tools/llvm-profdata/sample-profile-basic.test

@@ -0,0 +1,30 @@
+Basic tests for sample profiles.
+
+1- Show all functions
+RUN: llvm-profdata show --sample %p/Inputs/sample-profile.proftext | FileCheck %s --check-prefix=SHOW1
+SHOW1: Function: main: 184019, 0, 7 sampled lines
+SHOW1: line offset: 9, discriminator: 0, number of samples: 2064, calls: _Z3fooi:631 _Z3bari:1471
+SHOW1: Function: _Z3fooi: 7711, 610, 1 sampled lines
+SHOW1: Function: _Z3bari: 20301, 1437, 1 sampled lines
+SHOW1: line offset: 1, discriminator: 0, number of samples: 1437
+
+2- Show only bar
+RUN: llvm-profdata show --sample --function=_Z3bari %p/Inputs/sample-profile.proftext | FileCheck %s --check-prefix=SHOW2
+SHOW2: Function: _Z3bari: 20301, 1437, 1 sampled lines
+SHOW2: line offset: 1, discriminator: 0, number of samples: 1437
+SHOW2-NOT: Function: main: 184019, 0, 7 sampled lines
+SHOW2-NOT: Function: _Z3fooi: 7711, 610, 1 sampled lines
+
+3- Convert the profile to binary encoding and check that showing the binary
+   and text encodings produces identical output.
+RUN: llvm-profdata merge --sample %p/Inputs/sample-profile.proftext --binary -o - | llvm-profdata show --sample - -o %t-binary
+RUN: llvm-profdata show --sample %p/Inputs/sample-profile.proftext -o %t-text
+RUN: diff %t-binary %t-text
+
+4- Merge the binary and text encodings of the profile and check that the
+   counters have doubled.
+RUN: llvm-profdata merge --sample %p/Inputs/sample-profile.proftext -o %t-binprof
+RUN: llvm-profdata merge --sample --text %p/Inputs/sample-profile.proftext %t-binprof -o - | FileCheck %s --check-prefix=MERGE1
+MERGE1: main:368038:0
+MERGE1: 9: 4128 _Z3fooi:1262 _Z3bari:2942
+MERGE1: _Z3fooi:15422:1220

diff --git a/test/tools/llvm-profdata/simple.test b/test/tools/llvm-profdata/simple.test
deleted file mode 100644
index 18741dd..0000000
--- a/test/tools/llvm-profdata/simple.test
+++ /dev/null

@@ -1,77 +0,0 @@
-RUN: llvm-profdata merge %p/Inputs/foo3-1.profdata %p/Inputs/foo3-2.profdata -o %t
-RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3
-RUN: llvm-profdata merge %p/Inputs/foo3-2.profdata %p/Inputs/foo3-1.profdata -o %t
-RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3
-FOO3: foo:
-FOO3: Counters: 3
-FOO3: Function count: 8
-FOO3: Block counts: [7, 6]
-FOO3: Total functions: 1
-FOO3: Maximum function count: 8
-FOO3: Maximum internal block count: 7
-
-RUN: llvm-profdata merge %p/Inputs/foo4-1.profdata %p/Inputs/foo4-2.profdata -o %t
-RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
-RUN: llvm-profdata merge %p/Inputs/foo4-2.profdata %p/Inputs/foo4-1.profdata -o %t
-RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO4
-FOO4: foo:
-FOO4: Counters: 4
-FOO4: Function count: 18
-FOO4: Block counts: [28, 38, 48]
-FOO4: Total functions: 1
-FOO4: Maximum function count: 18
-FOO4: Maximum internal block count: 48
-
-RUN: llvm-profdata merge %p/Inputs/foo3bar3-1.profdata %p/Inputs/foo3bar3-2.profdata -o %t
-RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3BAR3
-RUN: llvm-profdata merge %p/Inputs/foo3bar3-2.profdata %p/Inputs/foo3bar3-1.profdata -o %t
-RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3BAR3
-FOO3BAR3: foo:
-FOO3BAR3: Counters: 3
-FOO3BAR3: Function count: 19
-FOO3BAR3: Block counts: [22, 28]
-FOO3BAR3: bar:
-FOO3BAR3: Counters: 3
-FOO3BAR3: Function count: 36
-FOO3BAR3: Block counts: [42, 50]
-FOO3BAR3: Total functions: 2
-FOO3BAR3: Maximum function count: 36
-FOO3BAR3: Maximum internal block count: 50
-
-RUN: llvm-profdata merge %p/Inputs/empty.profdata %p/Inputs/foo3-1.profdata -o %t
-RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3EMPTY
-FOO3EMPTY: foo:
-FOO3EMPTY: Counters: 3
-FOO3EMPTY: Function count: 1
-FOO3EMPTY: Block counts: [2, 3]
-FOO3EMPTY: Total functions: 1
-FOO3EMPTY: Maximum function count: 1
-FOO3EMPTY: Maximum internal block count: 3
-
-RUN: llvm-profdata merge %p/Inputs/foo3-1.profdata %p/Inputs/foo3bar3-1.profdata -o %t
-RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=FOO3FOO3BAR3
-FOO3FOO3BAR3: foo:
-FOO3FOO3BAR3: Counters: 3
-FOO3FOO3BAR3: Function count: 3
-FOO3FOO3BAR3: Block counts: [5, 8]
-FOO3FOO3BAR3: bar:
-FOO3FOO3BAR3: Counters: 3
-FOO3FOO3BAR3: Function count: 7
-FOO3FOO3BAR3: Block counts: [11, 13]
-FOO3FOO3BAR3: Total functions: 2
-FOO3FOO3BAR3: Maximum function count: 7
-FOO3FOO3BAR3: Maximum internal block count: 13
-
-RUN: llvm-profdata merge %p/Inputs/foo3-1.profdata %p/Inputs/bar3-1.profdata -o %t
-RUN: llvm-profdata show %t -all-functions -counts | FileCheck %s --check-prefix=DISJOINT
-DISJOINT: foo:
-DISJOINT: Counters: 3
-DISJOINT: Function count: 1
-DISJOINT: Block counts: [2, 3]
-DISJOINT: bar:
-DISJOINT: Counters: 3
-DISJOINT: Function count: 1
-DISJOINT: Block counts: [2, 3]
-DISJOINT: Total functions: 2
-DISJOINT: Maximum function count: 1
-DISJOINT: Maximum internal block count: 3

diff --git a/test/tools/llvm-profdata/text-format-errors.test b/test/tools/llvm-profdata/text-format-errors.test
new file mode 100644
index 0000000..01513e4
--- /dev/null
+++ b/test/tools/llvm-profdata/text-format-errors.test

@@ -0,0 +1,10 @@
+RUN: not llvm-profdata show %p/Inputs/invalid-count-later.proftext 2>&1 | FileCheck %s --check-prefix=INVALID-COUNT-LATER
+RUN: not llvm-profdata merge %p/Inputs/invalid-count-later.proftext %p/Inputs/invalid-count-later.proftext -o %t.out 2>&1 | FileCheck %s --check-prefix=INVALID-COUNT-LATER
+INVALID-COUNT-LATER: error: {{.*}}invalid-count-later.proftext: Malformed profile data
+
+RUN: not llvm-profdata show %p/Inputs/bad-hash.proftext 2>&1 | FileCheck %s --check-prefix=BAD-HASH
+RUN: not llvm-profdata merge %p/Inputs/bad-hash.proftext %p/Inputs/bad-hash.proftext -o %t.out 2>&1 | FileCheck %s --check-prefix=BAD-HASH
+BAD-HASH: error: {{.*}}bad-hash.proftext: Malformed profile data
+
+RUN: not llvm-profdata show %p/Inputs/no-counts.proftext 2>&1 | FileCheck %s --check-prefix=NO-COUNTS
+NO-COUNTS: error: {{.*}}no-counts.proftext: Malformed profile data

diff --git a/test/tools/llvm-readobj/Inputs/bad-relocs.obj.coff-i386 b/test/tools/llvm-readobj/Inputs/bad-relocs.obj.coff-i386
new file mode 100644
index 0000000..06ec471
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/bad-relocs.obj.coff-i386
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/basereloc.obj.coff-i386 b/test/tools/llvm-readobj/Inputs/basereloc.obj.coff-i386
new file mode 100644
index 0000000..0aeed44
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/basereloc.obj.coff-i386
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/bigobj.coff-x86-64 b/test/tools/llvm-readobj/Inputs/bigobj.coff-x86-64
new file mode 100644
index 0000000..fdfda5e
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/bigobj.coff-x86-64
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/comdat-function-linetables.obj.coff-2012-i386 b/test/tools/llvm-readobj/Inputs/comdat-function-linetables.obj.coff-2012-i386
new file mode 100755
index 0000000..4a72304
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/comdat-function-linetables.obj.coff-2012-i386
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/comdat-function-linetables.obj.coff-2013-i386 b/test/tools/llvm-readobj/Inputs/comdat-function-linetables.obj.coff-2013-i386
new file mode 100755
index 0000000..4adaf2e
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/comdat-function-linetables.obj.coff-2013-i386
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/directives.obj.coff-x86_64 b/test/tools/llvm-readobj/Inputs/directives.obj.coff-x86_64
new file mode 100644
index 0000000..4c156dc
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/directives.obj.coff-x86_64
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/file-aux-record.yaml b/test/tools/llvm-readobj/Inputs/file-aux-record.yaml
index d19afaf..89d6761 100644
--- a/test/tools/llvm-readobj/Inputs/file-aux-record.yaml
+++ b/test/tools/llvm-readobj/Inputs/file-aux-record.yaml

@@ -6,7 +6,7 @@
   - !Symbol
     Name: .file
     Value: 0
-    SectionNumber: 65534
+    SectionNumber: -2
     SimpleType: IMAGE_SYM_TYPE_NULL
     ComplexType: IMAGE_SYM_DTYPE_NULL
     StorageClass: IMAGE_SYM_CLASS_FILE
@@ -14,7 +14,7 @@
   - !Symbol
     Name: '@comp.id'
     Value: 13485607
-    SectionNumber: 65535
+    SectionNumber: -1
     SimpleType: IMAGE_SYM_TYPE_NULL
     ComplexType: IMAGE_SYM_DTYPE_NULL
     StorageClass: IMAGE_SYM_CLASS_STATIC

diff --git a/test/tools/llvm-readobj/Inputs/file-multiple-aux-records.yaml b/test/tools/llvm-readobj/Inputs/file-multiple-aux-records.yaml
index 8d8f684..d5b1eec 100644
--- a/test/tools/llvm-readobj/Inputs/file-multiple-aux-records.yaml
+++ b/test/tools/llvm-readobj/Inputs/file-multiple-aux-records.yaml

@@ -6,7 +6,7 @@
   - !Symbol
     Name: .file
     Value: 0
-    SectionNumber: 65534
+    SectionNumber: -2
     SimpleType: IMAGE_SYM_TYPE_NULL
     ComplexType: IMAGE_SYM_DTYPE_NULL
     StorageClass: IMAGE_SYM_CLASS_FILE
@@ -14,7 +14,7 @@
   - !Symbol
     Name: '@comp.id'
     Value: 13485607
-    SectionNumber: 65535
+    SectionNumber: -1
     SimpleType: IMAGE_SYM_TYPE_NULL
     ComplexType: IMAGE_SYM_DTYPE_NULL
     StorageClass: IMAGE_SYM_CLASS_STATIC

diff --git a/test/tools/llvm-readobj/Inputs/imports.exe.coff-i386 b/test/tools/llvm-readobj/Inputs/imports.exe.coff-i386
new file mode 100644
index 0000000..72077ad
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/imports.exe.coff-i386
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/imports.exe.coff-x86-64 b/test/tools/llvm-readobj/Inputs/imports.exe.coff-x86-64
new file mode 100644
index 0000000..5ee198e
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/imports.exe.coff-x86-64
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2012-i368 b/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2012-i368
old mode 100644
new mode 100755
index 1672d3a..213331f
--- a/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2012-i368
+++ b/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2012-i368
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2012-x86_64 b/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2012-x86_64
old mode 100644
new mode 100755
index 30bfe79..a35cc11
--- a/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2012-x86_64
+++ b/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2012-x86_64
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2013-i368 b/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2013-i368
new file mode 100644
index 0000000..8a901f6
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2013-i368
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2013-x86_64 b/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2013-x86_64
new file mode 100644
index 0000000..f1c2e1f
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/multifile-linetables.obj.coff-2013-x86_64
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2012-i368 b/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2012-i368
old mode 100644
new mode 100755
index a0196ff..41479ed
--- a/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2012-i368
+++ b/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2012-i368
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2012-x86_64 b/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2012-x86_64
old mode 100644
new mode 100755
index 14f65ab..8d30f32
--- a/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2012-x86_64
+++ b/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2012-x86_64
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2013-i368 b/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2013-i368
new file mode 100644
index 0000000..4269b21
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2013-i368
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2013-x86_64 b/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2013-x86_64
new file mode 100644
index 0000000..65a1af0
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/multifunction-linetables.obj.coff-2013-x86_64
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/relocs.obj.coff-i386 b/test/tools/llvm-readobj/Inputs/relocs.obj.coff-i386
index 15e43ef..9c24ac8 100644
--- a/test/tools/llvm-readobj/Inputs/relocs.obj.coff-i386
+++ b/test/tools/llvm-readobj/Inputs/relocs.obj.coff-i386
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/relocs.obj.coff-x86_64 b/test/tools/llvm-readobj/Inputs/relocs.obj.coff-x86_64
index cd63173..356437e 100644
--- a/test/tools/llvm-readobj/Inputs/relocs.obj.coff-x86_64
+++ b/test/tools/llvm-readobj/Inputs/relocs.obj.coff-x86_64
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/relocs.obj.elf-aarch64 b/test/tools/llvm-readobj/Inputs/relocs.obj.elf-aarch64
index d39e60c..a1034cb 100644
--- a/test/tools/llvm-readobj/Inputs/relocs.obj.elf-aarch64
+++ b/test/tools/llvm-readobj/Inputs/relocs.obj.elf-aarch64
Binary files differ

diff --git a/test/tools/llvm-readobj/Inputs/relocs.py b/test/tools/llvm-readobj/Inputs/relocs.py
index af9459d..ffddf3d 100644
--- a/test/tools/llvm-readobj/Inputs/relocs.py
+++ b/test/tools/llvm-readobj/Inputs/relocs.py

@@ -327,6 +327,10 @@
 
   machine_type            = f.uint16()
   section_count           = f.uint16()
+
+  # Zero out timestamp to prevent churn when regenerating COFF files.
+  f.writeUInt32(0)
+
   f.seek(20)
   sections = [CoffSection(f) for idx in range(section_count)]
 
@@ -617,6 +621,8 @@
   R_AARCH64_LDST32_ABS_LO12_NC          = 0x11d
   R_AARCH64_LDST64_ABS_LO12_NC          = 0x11e
   R_AARCH64_LDST128_ABS_LO12_NC         = 0x12b
+  R_AARCH64_GOTREL64                    = 0x133
+  R_AARCH64_GOTREL32                    = 0x134
   R_AARCH64_ADR_GOT_PAGE                = 0x137
   R_AARCH64_LD64_GOT_LO12_NC            = 0x138
   R_AARCH64_TLSLD_MOVW_DTPREL_G2        = 0x20b
@@ -660,6 +666,15 @@
   R_AARCH64_TLSDESC_LD64_LO12_NC        = 0x233
   R_AARCH64_TLSDESC_ADD_LO12_NC         = 0x234
   R_AARCH64_TLSDESC_CALL                = 0x239
+  R_AARCH64_COPY                        = 0x400
+  R_AARCH64_GLOB_DAT                    = 0x401
+  R_AARCH64_JUMP_SLOT                   = 0x402
+  R_AARCH64_RELATIVE                    = 0x403
+  R_AARCH64_TLS_DTPREL64                = 0x404
+  R_AARCH64_TLS_DTPMOD64                = 0x405
+  R_AARCH64_TLS_TPREL64                 = 0x406
+  R_AARCH64_TLSDESC                     = 0x407
+  R_AARCH64_IRELATIVE                   = 0x408
 
 class Relocs_Elf_ARM(Enum):
   R_ARM_NONE                  = 0x00

diff --git a/test/tools/llvm-readobj/Inputs/trivial.obj.elf-mipsel b/test/tools/llvm-readobj/Inputs/trivial.obj.elf-mipsel
new file mode 100644
index 0000000..c523908
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/trivial.obj.elf-mipsel
Binary files differ

diff --git a/test/tools/llvm-readobj/bigobj.test b/test/tools/llvm-readobj/bigobj.test
new file mode 100644
index 0000000..1097214
--- /dev/null
+++ b/test/tools/llvm-readobj/bigobj.test

@@ -0,0 +1,139 @@
+RUN: llvm-readobj --file-headers --sections --symbols --relocations %p/Inputs/bigobj.coff-x86-64 | FileCheck %s
+
+CHECK:      File: {{(.*[/\\])?}}bigobj.coff-x86-64
+CHECK-NEXT: Format: COFF-x86-64
+CHECK-NEXT: Arch: x86_64
+CHECK-NEXT: AddressSize: 64bit
+CHECK-NEXT: ImageFileHeader {
+CHECK-NEXT:   Machine: IMAGE_FILE_MACHINE_AMD64 (0x8664)
+CHECK-NEXT:   SectionCount: 3
+CHECK-NEXT:   TimeDateStamp: 1970-01-01 00:00:00 (0x0)
+CHECK-NEXT:   PointerToSymbolTable: 0xB0
+CHECK-NEXT:   SymbolCount: 8
+CHECK-NEXT:   OptionalHeaderSize: 0
+CHECK-NEXT:   Characteristics [ (0x0)
+CHECK-NEXT:   ]
+CHECK-NEXT: }
+CHECK-NEXT: Sections [
+CHECK-NEXT:   Section {
+CHECK-NEXT:     Number: 1
+CHECK-NEXT:     Name: .text (2E 74 65 78 74 00 00 00)
+CHECK-NEXT:     VirtualSize: 0x0
+CHECK-NEXT:     VirtualAddress: 0x0
+CHECK-NEXT:     RawDataSize: 0
+CHECK-NEXT:     PointerToRawData: 0x0
+CHECK-NEXT:     PointerToRelocations: 0x0
+CHECK-NEXT:     PointerToLineNumbers: 0x0
+CHECK-NEXT:     RelocationCount: 0
+CHECK-NEXT:     LineNumberCount: 0
+CHECK-NEXT:     Characteristics [ (0x60500020)
+CHECK-NEXT:       IMAGE_SCN_ALIGN_16BYTES (0x500000)
+CHECK-NEXT:       IMAGE_SCN_CNT_CODE (0x20)
+CHECK-NEXT:       IMAGE_SCN_MEM_EXECUTE (0x20000000)
+CHECK-NEXT:       IMAGE_SCN_MEM_READ (0x40000000)
+CHECK-NEXT:     ]
+CHECK-NEXT:   }
+CHECK-NEXT:   Section {
+CHECK-NEXT:     Number: 2
+CHECK-NEXT:     Name: .data (2E 64 61 74 61 00 00 00)
+CHECK-NEXT:     VirtualSize: 0x0
+CHECK-NEXT:     VirtualAddress: 0x0
+CHECK-NEXT:     RawDataSize: 0
+CHECK-NEXT:     PointerToRawData: 0x0
+CHECK-NEXT:     PointerToRelocations: 0x0
+CHECK-NEXT:     PointerToLineNumbers: 0x0
+CHECK-NEXT:     RelocationCount: 0
+CHECK-NEXT:     LineNumberCount: 0
+CHECK-NEXT:     Characteristics [ (0xC0500040)
+CHECK-NEXT:       IMAGE_SCN_ALIGN_16BYTES (0x500000)
+CHECK-NEXT:       IMAGE_SCN_CNT_INITIALIZED_DATA (0x40)
+CHECK-NEXT:       IMAGE_SCN_MEM_READ (0x40000000)
+CHECK-NEXT:       IMAGE_SCN_MEM_WRITE (0x80000000)
+CHECK-NEXT:     ]
+CHECK-NEXT:   }
+CHECK-NEXT:   Section {
+CHECK-NEXT:     Number: 3
+CHECK-NEXT:     Name: .bss (2E 62 73 73 00 00 00 00)
+CHECK-NEXT:     VirtualSize: 0x0
+CHECK-NEXT:     VirtualAddress: 0x0
+CHECK-NEXT:     RawDataSize: 0
+CHECK-NEXT:     PointerToRawData: 0x0
+CHECK-NEXT:     PointerToRelocations: 0x0
+CHECK-NEXT:     PointerToLineNumbers: 0x0
+CHECK-NEXT:     RelocationCount: 0
+CHECK-NEXT:     LineNumberCount: 0
+CHECK-NEXT:     Characteristics [ (0xC0500080)
+CHECK-NEXT:       IMAGE_SCN_ALIGN_16BYTES (0x500000)
+CHECK-NEXT:       IMAGE_SCN_CNT_UNINITIALIZED_DATA (0x80)
+CHECK-NEXT:       IMAGE_SCN_MEM_READ (0x40000000)
+CHECK-NEXT:       IMAGE_SCN_MEM_WRITE (0x80000000)
+CHECK-NEXT:     ]
+CHECK-NEXT:   }
+CHECK-NEXT: ]
+CHECK-NEXT: Relocations [
+CHECK-NEXT: ]
+CHECK-NEXT: Symbols [
+CHECK-NEXT:   Symbol {
+CHECK-NEXT:     Name: .file
+CHECK-NEXT:     Value: 0
+CHECK-NEXT:     Section:  IMAGE_SYM_DEBUG (-2)
+CHECK-NEXT:     BaseType: Null (0x0)
+CHECK-NEXT:     ComplexType: Null (0x0)
+CHECK-NEXT:     StorageClass: File (0x67)
+CHECK-NEXT:     AuxSymbolCount: 1
+CHECK-NEXT:     AuxFileRecord {
+CHECK-NEXT:       FileName: fake
+CHECK-NEXT:     }
+CHECK-NEXT:   }
+CHECK-NEXT:   Symbol {
+CHECK-NEXT:     Name: .text
+CHECK-NEXT:     Value: 0
+CHECK-NEXT:     Section: .text (1)
+CHECK-NEXT:     BaseType: Null (0x0)
+CHECK-NEXT:     ComplexType: Null (0x0)
+CHECK-NEXT:     StorageClass: Static (0x3)
+CHECK-NEXT:     AuxSymbolCount: 1
+CHECK-NEXT:     AuxSectionDef {
+CHECK-NEXT:       Length: 0
+CHECK-NEXT:       RelocationCount: 0
+CHECK-NEXT:       LineNumberCount: 0
+CHECK-NEXT:       Checksum: 0x0
+CHECK-NEXT:       Number: 0
+CHECK-NEXT:       Selection: 0x0
+CHECK-NEXT:     }
+CHECK-NEXT:   }
+CHECK-NEXT:   Symbol {
+CHECK-NEXT:     Name: .data
+CHECK-NEXT:     Value: 0
+CHECK-NEXT:     Section: .data (2)
+CHECK-NEXT:     BaseType: Null (0x0)
+CHECK-NEXT:     ComplexType: Null (0x0)
+CHECK-NEXT:     StorageClass: Static (0x3)
+CHECK-NEXT:     AuxSymbolCount: 1
+CHECK-NEXT:     AuxSectionDef {
+CHECK-NEXT:       Length: 0
+CHECK-NEXT:       RelocationCount: 0
+CHECK-NEXT:       LineNumberCount: 0
+CHECK-NEXT:       Checksum: 0x0
+CHECK-NEXT:       Number: 0
+CHECK-NEXT:       Selection: 0x0
+CHECK-NEXT:     }
+CHECK-NEXT:   }
+CHECK-NEXT:   Symbol {
+CHECK-NEXT:     Name: .bss
+CHECK-NEXT:     Value: 0
+CHECK-NEXT:     Section: .bss (3)
+CHECK-NEXT:     BaseType: Null (0x0)
+CHECK-NEXT:     ComplexType: Null (0x0)
+CHECK-NEXT:     StorageClass: Static (0x3)
+CHECK-NEXT:     AuxSymbolCount: 1
+CHECK-NEXT:     AuxSectionDef {
+CHECK-NEXT:       Length: 0
+CHECK-NEXT:       RelocationCount: 0
+CHECK-NEXT:       LineNumberCount: 0
+CHECK-NEXT:       Checksum: 0x0
+CHECK-NEXT:       Number: 0
+CHECK-NEXT:       Selection: 0x0
+CHECK-NEXT:     }
+CHECK-NEXT:   }
+CHECK-NEXT: ]

diff --git a/test/tools/llvm-readobj/codeview-linetables.test b/test/tools/llvm-readobj/codeview-linetables.test
index 4854d7a..e5e344b 100644
--- a/test/tools/llvm-readobj/codeview-linetables.test
+++ b/test/tools/llvm-readobj/codeview-linetables.test

@@ -1,11 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; The following four object files were generated using the following command:
+;   D:\> cl /Z7 /c source.c
+; with the following contents of D:\source.c:
+;   void z(void);
+;
+;   void x(void) {
+;     z();
+;   }
+;
+;   void y(void) {
+;     z();
+;   }
+;
+;   void f(void) {
+;     x();
+;     y();
+;     z();
+;   }
+; using 32-/64-bit versions of CL v17.00.61030 and v18.00.21005.1 respectively.
 RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifunction-linetables.obj.coff-2012-i368 \
 RUN:   | FileCheck %s -check-prefix MFUN32
+RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifunction-linetables.obj.coff-2013-i368 \
+RUN:   | FileCheck %s -check-prefix MFUN32
 RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifunction-linetables.obj.coff-2012-x86_64 \
 RUN:   | FileCheck %s -check-prefix MFUN64
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2012-i368 \
-RUN:   | FileCheck %s -check-prefix MFILE32
-RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2012-x86_64 \
-RUN:   | FileCheck %s -check-prefix MFILE64
+RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifunction-linetables.obj.coff-2013-x86_64 \
+RUN:   | FileCheck %s -check-prefix MFUN64
 
 MFUN32:      CodeViewLineTables [
 MFUN32-NEXT:   Magic: 0x4
@@ -20,6 +40,12 @@
 MFUN32-NEXT:   Subsection [
 MFUN32-NEXT:     Type: 0xF1
 MFUN32-NEXT:     PayloadSize: 0x4B
+MFUN32:          ProcStart {
+MFUN32-NEXT:       DisplayName: x
+MFUN32-NEXT:       Section: _x
+MFUN32-NEXT:       CodeSize: 0xA
+MFUN32-NEXT:     }
+MFUN32-NEXT:     ProcEnd
 MFUN32:        ]
 MFUN32-NEXT:   Subsection [
 MFUN32-NEXT:     Type: 0xF2
@@ -33,6 +59,12 @@
 MFUN32-NEXT:   Subsection [
 MFUN32-NEXT:     Type: 0xF1
 MFUN32-NEXT:     PayloadSize: 0x4B
+MFUN32:          ProcStart {
+MFUN32-NEXT:       DisplayName: y
+MFUN32-NEXT:       Section: _y
+MFUN32-NEXT:       CodeSize: 0xA
+MFUN32-NEXT:     }
+MFUN32-NEXT:     ProcEnd
 MFUN32:        ]
 MFUN32-NEXT:   Subsection [
 MFUN32-NEXT:     Type: 0xF2
@@ -46,6 +78,12 @@
 MFUN32-NEXT:   Subsection [
 MFUN32-NEXT:     Type: 0xF1
 MFUN32-NEXT:     PayloadSize: 0x4B
+MFUN32:          ProcStart {
+MFUN32-NEXT:       DisplayName: f
+MFUN32-NEXT:       Section: _f
+MFUN32-NEXT:       CodeSize: 0x14
+MFUN32-NEXT:     }
+MFUN32-NEXT:     ProcEnd
 MFUN32:        ]
 MFUN32-NEXT:   Subsection [
 MFUN32-NEXT:     Type: 0xF2
@@ -107,6 +145,12 @@
 MFUN64-NEXT:   Subsection [
 MFUN64-NEXT:     Type: 0xF1
 MFUN64-NEXT:     PayloadSize: 0x4B
+MFUN64:          ProcStart {
+MFUN64-NEXT:       DisplayName: x
+MFUN64-NEXT:       Section: x
+MFUN64-NEXT:       CodeSize: 0xE
+MFUN64-NEXT:     }
+MFUN64-NEXT:     ProcEnd
 MFUN64:        ]
 MFUN64-NEXT:   Subsection [
 MFUN64-NEXT:     Type: 0xF2
@@ -116,6 +160,12 @@
 MFUN64-NEXT:   Subsection [
 MFUN64-NEXT:     Type: 0xF1
 MFUN64-NEXT:     PayloadSize: 0x4B
+MFUN64:          ProcStart {
+MFUN64-NEXT:       DisplayName: y
+MFUN64-NEXT:       Section: y
+MFUN64-NEXT:       CodeSize: 0xE
+MFUN64-NEXT:     }
+MFUN64-NEXT:     ProcEnd
 MFUN64:        ]
 MFUN64-NEXT:   Subsection [
 MFUN64-NEXT:     Type: 0xF2
@@ -125,6 +175,12 @@
 MFUN64-NEXT:   Subsection [
 MFUN64-NEXT:     Type: 0xF1
 MFUN64-NEXT:     PayloadSize: 0x4B
+MFUN64:          ProcStart {
+MFUN64-NEXT:       DisplayName: f
+MFUN64-NEXT:       Section: f
+MFUN64-NEXT:       CodeSize: 0x18
+MFUN64-NEXT:     }
+MFUN64-NEXT:     ProcEnd
 MFUN64:        ]
 MFUN64-NEXT:   Subsection [
 MFUN64-NEXT:     Type: 0xF2
@@ -177,6 +233,30 @@
 MFUN64-NEXT:   ]
 MFUN64-NEXT: ]
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; The following four object files were generated using the following command:
+;   D:\> cl /Z7 /c input.c
+; with the following contents of D:\input.c:
+;   void g(void);
+;
+;   void f(void) {
+;   #line 1 "one.c"
+;     g();
+;   #line 2 "two.c"
+;     g();
+;   #line 7 "one.c"
+;     g();
+;   }
+; using 32-/64-bit versions of CL v17.00.61030 and v18.00.21005.1 respectively.
+RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2012-i368 \
+RUN:   | FileCheck %s -check-prefix MFILE32
+RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2013-i368 \
+RUN:   | FileCheck %s -check-prefix MFILE32
+RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2012-x86_64 \
+RUN:   | FileCheck %s -check-prefix MFILE64
+RUN: llvm-readobj -s -codeview-linetables %p/Inputs/multifile-linetables.obj.coff-2013-x86_64 \
+RUN:   | FileCheck %s -check-prefix MFILE64
+
 MFILE32:      CodeViewLineTables [
 MFILE32-NEXT:   Magic: 0x4
 MFILE32-NEXT:   Subsection [
@@ -190,6 +270,12 @@
 MFILE32-NEXT:   Subsection [
 MFILE32-NEXT:     Type: 0xF1
 MFILE32-NEXT:     PayloadSize: 0x4B
+MFILE32:          ProcStart {
+MFILE32-NEXT:       DisplayName: f
+MFILE32-NEXT:       Section: _f
+MFILE32-NEXT:       CodeSize: 0x14
+MFILE32-NEXT:     }
+MFILE32-NEXT:     ProcEnd
 MFILE32:        ]
 MFILE32-NEXT:   Subsection [
 MFILE32-NEXT:     Type: 0xF2
@@ -240,6 +326,12 @@
 MFILE64-NEXT:   Subsection [
 MFILE64-NEXT:     Type: 0xF1
 MFILE64-NEXT:     PayloadSize: 0x4B
+MFILE64:          ProcStart {
+MFILE64-NEXT:       DisplayName: f
+MFILE64-NEXT:       Section: f
+MFILE64-NEXT:       CodeSize: 0x18
+MFILE64-NEXT:     }
+MFILE64-NEXT:     ProcEnd
 MFILE64:        ]
 MFILE64-NEXT:   Subsection [
 MFILE64-NEXT:     Type: 0xF2
@@ -280,3 +372,53 @@
 MFILE64-NEXT:     ]
 MFILE64-NEXT:   ]
 MFILE64-NEXT: ]
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; The following object files were generated using the following command:
+;   C:\src> cl /Z7 /Gy /c test.cc
+; with the following contents of C:\src\test.cc:
+;   int f()
+;   {
+;     return 0;
+;   }
+;
+;   int g()
+;   {
+;     return 0;
+;   }
+; using 32-bit versions of CL v17.00.61030 and v18.00.21005.1 respectively.
+RUN: llvm-readobj -s -codeview-linetables %p/Inputs/comdat-function-linetables.obj.coff-2012-i386 \
+RUN:   | FileCheck %s -check-prefix MCOMDAT
+RUN: llvm-readobj -s -codeview-linetables %p/Inputs/comdat-function-linetables.obj.coff-2013-i386 \
+RUN:   | FileCheck %s -check-prefix MCOMDAT
+
+MCOMDAT:      ProcStart {
+MCOMDAT-NEXT:   DisplayName: f
+MCOMDAT-NEXT:   Section: ?f@@YAHXZ
+MCOMDAT-NEXT:   CodeSize: 0x7
+MCOMDAT-NEXT: }
+MCOMDAT:      FunctionLineTable [
+MCOMDAT-NEXT:   FunctionName: ?f@@YAHXZ
+MCOMDAT-NEXT:   CodeSize: 0x7
+MCOMDAT-NEXT:   FilenameSegment [
+MCOMDAT-NEXT:     Filename: c:\src\test.cc
+MCOMDAT-NEXT:     +0x0: 2
+MCOMDAT-NEXT:     +0x3: 3
+MCOMDAT-NEXT:     +0x5: 4
+MCOMDAT-NEXT:   ]
+MCOMDAT-NEXT: ]
+MCOMDAT:      ProcStart {
+MCOMDAT-NEXT:   DisplayName: g
+MCOMDAT-NEXT:   Section: ?g@@YAHXZ
+MCOMDAT-NEXT:   CodeSize: 0x7
+MCOMDAT-NEXT: }
+MCOMDAT:      FunctionLineTable [
+MCOMDAT-NEXT:   FunctionName: ?g@@YAHXZ
+MCOMDAT-NEXT:   CodeSize: 0x7
+MCOMDAT-NEXT:   FilenameSegment [
+MCOMDAT-NEXT:     Filename: c:\src\test.cc
+MCOMDAT-NEXT:     +0x0: 7
+MCOMDAT-NEXT:     +0x3: 8
+MCOMDAT-NEXT:     +0x5: 9
+MCOMDAT-NEXT:   ]
+MCOMDAT-NEXT: ]

diff --git a/test/tools/llvm-readobj/coff-basereloc.test b/test/tools/llvm-readobj/coff-basereloc.test
new file mode 100644
index 0000000..cd6687c
--- /dev/null
+++ b/test/tools/llvm-readobj/coff-basereloc.test

@@ -0,0 +1,24 @@
+RUN: llvm-readobj -coff-basereloc %p/Inputs/basereloc.obj.coff-i386 | FileCheck %s
+
+CHECK:      Format: COFF-i386
+CHECK-NEXT: Arch: i386
+CHECK-NEXT: AddressSize: 32bit
+CHECK-NEXT: BaseReloc [
+CHECK-NEXT:   Entry {
+CHECK-NEXT:     Type: HIGHLOW
+CHECK-NEXT:     Address: 0x1004
+CHECK-NEXT:   }
+CHECK-NEXT:   Entry {
+CHECK-NEXT:     Type: HIGHLOW
+CHECK-NEXT:     Address: 0x100A
+CHECK-NEXT:   }
+CHECK-NEXT:   Entry {
+CHECK-NEXT:     Type: HIGHLOW
+CHECK-NEXT:     Address: 0x1010
+CHECK-NEXT:   }
+CHECK-NEXT:   Entry {
+CHECK-NEXT:     Type: ABSOLUTE
+CHECK-NEXT:     Address: 0x1000
+CHECK-NEXT:   }
+CHECK-NEXT: ]
+

diff --git a/test/tools/llvm-readobj/coff-directives.test b/test/tools/llvm-readobj/coff-directives.test
new file mode 100644
index 0000000..83efffc
--- /dev/null
+++ b/test/tools/llvm-readobj/coff-directives.test

@@ -0,0 +1,2 @@
+RUN: llvm-readobj -coff-directives %p/Inputs/directives.obj.coff-x86_64 | FileCheck %s
+CHECK: Directive(s): /DEFAULTLIB:"LIBCMT" /DEFAULTLIB:"OLDNAMES" 

diff --git a/test/tools/llvm-readobj/coff-file-sections-reading.test b/test/tools/llvm-readobj/coff-file-sections-reading.test
index 5c44c16..c2f02d4 100644
--- a/test/tools/llvm-readobj/coff-file-sections-reading.test
+++ b/test/tools/llvm-readobj/coff-file-sections-reading.test

@@ -4,7 +4,7 @@
 CHECK:   Symbol {
 CHECK:     Name: .file
 CHECK:     Value: 0
-CHECK:     Section:  (65534)
+CHECK:     Section:  IMAGE_SYM_DEBUG (-2)
 CHECK:     BaseType: Null (0x0)
 CHECK:     ComplexType: Null (0x0)
 CHECK:     StorageClass: File (0x67)

diff --git a/test/tools/llvm-readobj/cxx-cli-aux.test b/test/tools/llvm-readobj/cxx-cli-aux.test
index 90e73c0..0b68779 100644
--- a/test/tools/llvm-readobj/cxx-cli-aux.test
+++ b/test/tools/llvm-readobj/cxx-cli-aux.test

@@ -9,7 +9,7 @@
 CHECK:        Symbol {
 CHECK:          Name: ?PerAppDomain@@$$Q3HA
 CHECK-NEXT:     Value: 4
-CHECK-NEXT:     Section:  (65535)
+CHECK-NEXT:     Section:  IMAGE_SYM_ABSOLUTE (-1)
 CHECK-NEXT:     BaseType: Null (0x0)
 CHECK-NEXT:     ComplexType: Null (0x0)
 CHECK-NEXT:     StorageClass: External (0x2)
@@ -21,14 +21,13 @@
 CHECK-NEXT:       Checksum: 0x0
 CHECK-NEXT:       Number: 0
 CHECK-NEXT:       Selection: NoDuplicates (0x1)
-CHECK-NEXT:       Unused: (00 00 00)
 CHECK-NEXT:     }
 CHECK-NEXT:   }
 
 CHECK:        Symbol {
 CHECK:          Name: 04000001
 CHECK-NEXT:     Value: 4
-CHECK-NEXT:     Section:  (65535)
+CHECK-NEXT:     Section:  IMAGE_SYM_ABSOLUTE (-1)
 CHECK-NEXT:     BaseType: Null (0x0)
 CHECK-NEXT:     ComplexType: Null (0x0)
 CHECK-NEXT:     StorageClass: CLRToken (0x6B)
@@ -37,6 +36,5 @@
 CHECK-NEXT:       AuxType: 1
 CHECK-NEXT:       Reserved: 0
 CHECK-NEXT:       SymbolTableIndex: ?PerAppDomain@@$$Q3HA (19)
-CHECK-NEXT:       Unused: (00 00 00 00 00 00 00 00 00 00 00 00)
 CHECK-NEXT:     }
 CHECK-NEXT:   }

diff --git a/test/tools/llvm-readobj/file-headers.test b/test/tools/llvm-readobj/file-headers.test
index 39a8c0e..fd030ef 100644
--- a/test/tools/llvm-readobj/file-headers.test
+++ b/test/tools/llvm-readobj/file-headers.test

@@ -10,6 +10,16 @@
 RUN:   | FileCheck %s -check-prefix ELF32
 RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-x86-64 \
 RUN:   | FileCheck %s -check-prefix ELF64
+RUN: llvm-readobj -h %p/Inputs/trivial.obj.macho-i386 \
+RUN:   | FileCheck %s -check-prefix MACHO32
+RUN: llvm-readobj -h %p/Inputs/trivial.obj.macho-x86-64 \
+RUN:   | FileCheck %s -check-prefix MACHO64
+RUN: llvm-readobj -h %p/Inputs/trivial.obj.macho-ppc \
+RUN:   | FileCheck %s -check-prefix MACHO-PPC
+RUN: llvm-readobj -h %p/Inputs/trivial.obj.macho-ppc64 \
+RUN:   | FileCheck %s -check-prefix MACHO-PPC64
+RUN: llvm-readobj -h %p/Inputs/trivial.obj.macho-arm \
+RUN:   | FileCheck %s -check-prefix MACHO-ARM
 RUN: llvm-readobj -h %p/Inputs/magic.coff-unknown \
 RUN:   | FileCheck %s -check-prefix COFF-UNKNOWN
 RUN: llvm-readobj -h %p/Inputs/magic.coff-importlib \
@@ -122,6 +132,88 @@
 ELF64-NEXT:   StringTableSectionIndex: 7
 ELF64-NEXT: }
 
+MACHO32:      File: {{(.*[/\\])?}}trivial.obj.macho-i386
+MACHO32-NEXT: Format: Mach-O 32-bit i386
+MACHO32-NEXT: Arch: i386
+MACHO32-NEXT: AddressSize: 32bit
+MACHO32-NEXT: MachHeader {
+MACHO32-NEXT:   Magic: Magic (0xFEEDFACE)
+MACHO32-NEXT:   CpuType: X86 (0x7)
+MACHO32-NEXT:   CpuSubType: CPU_SUBTYPE_I386_ALL (0x3)
+MACHO32-NEXT:   FileType: Relocatable (0x1)
+MACHO32-NEXT:   NumOfLoadCommands: 3
+MACHO32-NEXT:   SizeOfLoadCommands: 296
+MACHO32-NEXT:   Flags [ (0x2000)
+MACHO32-NEXT:     MH_SUBSECTIONS_VIA_SYMBOLS (0x2000)
+MACHO32-NEXT:   ]
+MACHO32-NEXT: }
+
+MACHO64:      File: {{(.*[/\\])?}}trivial.obj.macho-x86-64
+MACHO64-NEXT: Format: Mach-O 64-bit x86-64
+MACHO64-NEXT: Arch: x86_64
+MACHO64-NEXT: AddressSize: 64bit
+MACHO64-NEXT: MachHeader {
+MACHO64-NEXT:   Magic: Magic64 (0xFEEDFACF)
+MACHO64-NEXT:   CpuType: X86-64 (0x1000007)
+MACHO64-NEXT:   CpuSubType: CPU_SUBTYPE_X86_64_ALL (0x3)
+MACHO64-NEXT:   FileType: Relocatable (0x1)
+MACHO64-NEXT:   NumOfLoadCommands: 3
+MACHO64-NEXT:   SizeOfLoadCommands: 336
+MACHO64-NEXT:   Flags [ (0x2000)
+MACHO64-NEXT:     MH_SUBSECTIONS_VIA_SYMBOLS (0x2000)
+MACHO64-NEXT:   ]
+MACHO64-NEXT:   Reserved: 0x0
+MACHO64-NEXT: }
+
+MACHO-PPC:      File: {{(.*[/\\])?}}trivial.obj.macho-ppc
+MACHO-PPC-NEXT: Format: Mach-O 32-bit ppc
+MACHO-PPC-NEXT: Arch: powerpc
+MACHO-PPC-NEXT: AddressSize: 32bit
+MACHO-PPC-NEXT: MachHeader {
+MACHO-PPC-NEXT:   Magic: Magic (0xFEEDFACE)
+MACHO-PPC-NEXT:   CpuType: PowerPC (0x12)
+MACHO-PPC-NEXT:   CpuSubType: CPU_SUBTYPE_POWERPC_ALL (0x0)
+MACHO-PPC-NEXT:   FileType: Relocatable (0x1)
+MACHO-PPC-NEXT:   NumOfLoadCommands: 3
+MACHO-PPC-NEXT:   SizeOfLoadCommands: 500
+MACHO-PPC-NEXT:   Flags [ (0x2000)
+MACHO-PPC-NEXT:     MH_SUBSECTIONS_VIA_SYMBOLS (0x2000)
+MACHO-PPC-NEXT:   ]
+MACHO-PPC-NEXT: }
+
+MACHO-PPC64:      File: {{(.*[/\\])?}}trivial.obj.macho-ppc64
+MACHO-PPC64-NEXT: Format: Mach-O 64-bit ppc64
+MACHO-PPC64-NEXT: Arch: powerpc64
+MACHO-PPC64-NEXT: AddressSize: 64bit
+MACHO-PPC64-NEXT: MachHeader {
+MACHO-PPC64-NEXT:   Magic: Magic64 (0xFEEDFACF)
+MACHO-PPC64-NEXT:   CpuType: PowerPC64 (0x1000012)
+MACHO-PPC64-NEXT:   CpuSubtype: 0x0
+MACHO-PPC64-NEXT:   FileType: Relocatable (0x1)
+MACHO-PPC64-NEXT:   NumOfLoadCommands: 3
+MACHO-PPC64-NEXT:   SizeOfLoadCommands: 576
+MACHO-PPC64-NEXT:   Flags [ (0x2000)
+MACHO-PPC64-NEXT:     MH_SUBSECTIONS_VIA_SYMBOLS (0x2000)
+MACHO-PPC64-NEXT:   ]
+MACHO-PPC64-NEXT:   Reserved: 0x0
+MACHO-PPC64-NEXT: }
+
+MACHO-ARM:      File: {{(.*[/\\])?}}trivial.obj.macho-arm
+MACHO-ARM-NEXT: Format: Mach-O arm
+MACHO-ARM-NEXT: Arch: arm
+MACHO-ARM-NEXT: AddressSize: 32bit
+MACHO-ARM-NEXT: MachHeader {
+MACHO-ARM-NEXT:   Magic: Magic (0xFEEDFACE)
+MACHO-ARM-NEXT:   CpuType: Arm (0xC)
+MACHO-ARM-NEXT:   CpuSubType: CPU_SUBTYPE_ARM_V7 (0x9)
+MACHO-ARM-NEXT:   FileType: Relocatable (0x1)
+MACHO-ARM-NEXT:   NumOfLoadCommands: 3
+MACHO-ARM-NEXT:   SizeOfLoadCommands: 636
+MACHO-ARM-NEXT:   Flags [ (0x2000)
+MACHO-ARM-NEXT:     MH_SUBSECTIONS_VIA_SYMBOLS (0x2000)
+MACHO-ARM-NEXT:   ]
+MACHO-ARM-NEXT: }
+
 PE32:      File: {{(.*[/\\])?}}trivial.exe.coff-i386
 PE32-NEXT: Format: COFF-i386
 PE32-NEXT: Arch: i386
@@ -159,7 +251,7 @@
 PE32-NEXT:   SizeOfImage: 16384
 PE32-NEXT:   SizeOfHeaders: 1024
 PE32-NEXT:   Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI (0x3)
-PE32-NEXT:   Subsystem [ (0x8140)
+PE32-NEXT:   Characteristics [ (0x8140)
 PE32-NEXT:     IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE (0x40)
 PE32-NEXT:     IMAGE_DLL_CHARACTERISTICS_NX_COMPAT (0x100)
 PE32-NEXT:     IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE (0x8000)
@@ -204,6 +296,25 @@
 PE32-NEXT:     ReservedSize: 0x0
 PE32-NEXT:   }
 PE32-NEXT: }
+PE32-NEXT: DOSHeader {
+PE32-NEXT:   Magic: MZ
+PE32-NEXT:   UsedBytesInTheLastPage: 144
+PE32-NEXT:   FileSizeInPages: 3
+PE32-NEXT:   NumberOfRelocationItems: 0
+PE32-NEXT:   HeaderSizeInParagraphs: 4
+PE32-NEXT:   MinimumExtraParagraphs: 0
+PE32-NEXT:   MaximumExtraParagraphs: 65535
+PE32-NEXT:   InitialRelativeSS: 0
+PE32-NEXT:   InitialSP: 184
+PE32-NEXT:   Checksum: 0
+PE32-NEXT:   InitialIP: 0
+PE32-NEXT:   InitialRelativeCS: 0
+PE32-NEXT:   AddressOfRelocationTable: 64
+PE32-NEXT:   OverlayNumber: 0
+PE32-NEXT:   OEMid: 0
+PE32-NEXT:   OEMinfo: 0
+PE32-NEXT:   AddressOfNewExeHeader: 176
+PE32-NEXT: }
 
 COFF-UNKNOWN:      Format: COFF-<unknown arch>
 COFF-UNKNOWN-NEXT: Arch: unknown
@@ -224,12 +335,11 @@
 COFF-IMPORTLIB-NEXT: AddressSize: 32bit
 COFF-IMPORTLIB-NEXT: ImageFileHeader {
 COFF-IMPORTLIB-NEXT:   Machine: IMAGE_FILE_MACHINE_UNKNOWN (0x0)
-COFF-IMPORTLIB-NEXT:   SectionCount: 65535
+COFF-IMPORTLIB-NEXT:   SectionCount: 0
 COFF-IMPORTLIB-NEXT:   TimeDateStamp: 1970-09-09 19:52:32 (0x14C0000)
-COFF-IMPORTLIB-NEXT:   PointerToSymbolTable: 0x528542EB
-COFF-IMPORTLIB-NEXT:   SymbolCount: 20
+COFF-IMPORTLIB-NEXT:   PointerToSymbolTable: 0x0
+COFF-IMPORTLIB-NEXT:   SymbolCount: 0
 COFF-IMPORTLIB-NEXT:   OptionalHeaderSize: 0
-COFF-IMPORTLIB-NEXT:   Characteristics [ (0x8)
-COFF-IMPORTLIB-NEXT:     IMAGE_FILE_LOCAL_SYMS_STRIPPED (0x8)
+COFF-IMPORTLIB-NEXT:   Characteristics [ (0x0)
 COFF-IMPORTLIB-NEXT:   ]
 COFF-IMPORTLIB-NEXT: }

diff --git a/test/tools/llvm-readobj/imports.test b/test/tools/llvm-readobj/imports.test
new file mode 100644
index 0000000..58512f4
--- /dev/null
+++ b/test/tools/llvm-readobj/imports.test

@@ -0,0 +1,88 @@
+RUN: llvm-readobj --coff-imports %p/Inputs/imports.exe.coff-i386 | FileCheck -check-prefix=X86 %s
+RUN: llvm-readobj --coff-imports %p/Inputs/imports.exe.coff-x86-64 | FileCheck -check-prefix=X64  %s
+
+X86:      Import {
+X86-NEXT:   Name: KERNEL32.dll
+X86-NEXT:   ImportLookupTableRVA: 0x2108
+X86-NEXT:   ImportAddressTableRVA: 0x2000
+X86-NEXT:   Symbol: ExitProcess (337)
+X86-NEXT:   Symbol: GetProcAddress (669)
+X86-NEXT:   Symbol: FreeLibrary (414)
+X86-NEXT:   Symbol: GetLastError (592)
+X86-NEXT:   Symbol: RaiseException (1087)
+X86-NEXT:   Symbol: LoadLibraryExA (934)
+X86-NEXT: }
+X86-NEXT: Import {
+X86-NEXT:   Name: USER32.dll
+X86-NEXT:   ImportLookupTableRVA: 0x2124
+X86-NEXT:   ImportAddressTableRVA: 0x201C
+X86-NEXT:   Symbol: MessageBoxA (582)
+X86-NEXT: }
+X86-NEXT: Import {
+X86-NEXT:   Name: mydll.dll
+X86-NEXT:   ImportLookupTableRVA: 0x212C
+X86-NEXT:   ImportAddressTableRVA: 0x2024
+X86-NEXT:   Symbol: Func1 (0)
+X86-NEXT:   Symbol: Func2 (1)
+X86-NEXT:   Symbol:  (3)
+X86-NEXT: }
+X86-NEXT: DelayImport {
+X86-NEXT:   Name: lazyload.dll
+X86-NEXT:   Attributes: 0x1
+X86-NEXT:   ModuleHandle: 0x301C
+X86-NEXT:   ImportAddressTable: 0x3010
+X86-NEXT:   ImportNameTable: 0x2090
+X86-NEXT:   BoundDelayImportTable: 0x20AC
+X86-NEXT:   UnloadDelayImportTable: 0x0
+X86-NEXT:   Import {
+X86-NEXT:     Symbol: Func5 (0)
+X86-NEXT:     Address: 0x401073
+X86-NEXT:   }
+X86-NEXT:   Import {
+X86-NEXT:     Symbol: Func4 (0)
+X86-NEXT:     Address: 0x401052
+X86-NEXT:   }
+X86-NEXT: }
+
+X64:      Import {
+X64-NEXT:   Name: KERNEL32.dll
+X64-NEXT:   ImportLookupTableRVA: 0x2170
+X64-NEXT:   ImportAddressTableRVA: 0x2000
+X64-NEXT:   Symbol: ExitProcess (343)
+X64-NEXT:   Symbol: GetProcAddress (676)
+X64-NEXT:   Symbol: FreeLibrary (420)
+X64-NEXT:   Symbol: GetLastError (598)
+X64-NEXT:   Symbol: RaiseException (1091)
+X64-NEXT:   Symbol: LoadLibraryExA (937)
+X64-NEXT: }
+X64-NEXT: Import {
+X64-NEXT:   Name: USER32.dll
+X64-NEXT:   ImportLookupTableRVA: 0x21A8
+X64-NEXT:   ImportAddressTableRVA: 0x2038
+X64-NEXT:   Symbol: MessageBoxA (586)
+X64-NEXT: }
+X64-NEXT: Import {
+X64-NEXT:   Name: mydll.dll
+X64-NEXT:   ImportLookupTableRVA: 0x21B8
+X64-NEXT:   ImportAddressTableRVA: 0x2048
+X64-NEXT:   Symbol: Func1 (0)
+X64-NEXT:   Symbol: Func2 (1)
+X64-NEXT:   Symbol:  (3)
+X64-NEXT: }
+X64-NEXT: DelayImport {
+X64-NEXT:   Name: lazyload.dll
+X64-NEXT:   Attributes: 0x1
+X64-NEXT:   ModuleHandle: 0x3028
+X64-NEXT:   ImportAddressTable: 0x3010
+X64-NEXT:   ImportNameTable: 0x20E0
+X64-NEXT:   BoundDelayImportTable: 0x2108
+X64-NEXT:   UnloadDelayImportTable: 0x0
+X64-NEXT:   Import {
+X64-NEXT:     Symbol: Func5 (0)
+X64-NEXT:     Address: 0x1400010F1
+X64-NEXT:   }
+X64-NEXT:   Import {
+X64-NEXT:     Symbol: Func4 (0)
+X64-NEXT:     Address: 0x140001066
+X64-NEXT:   }
+X64-NEXT: }

diff --git a/test/tools/llvm-readobj/peplus.test b/test/tools/llvm-readobj/peplus.test
index 8e6f550..4d8d25d 100644
--- a/test/tools/llvm-readobj/peplus.test
+++ b/test/tools/llvm-readobj/peplus.test

@@ -35,7 +35,7 @@
 CHECK:   SizeOfImage: 8192
 CHECK:   SizeOfHeaders: 512
 CHECK:   Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI (0x3)
-CHECK:   Subsystem [ (0x8160)
+CHECK:   Characteristics [ (0x8160)
 CHECK:     IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE (0x40)
 CHECK:     IMAGE_DLL_CHARACTERISTICS_HIGH_ENTROPY_VA (0x20)
 CHECK:     IMAGE_DLL_CHARACTERISTICS_NX_COMPAT (0x100)

diff --git a/test/tools/llvm-readobj/reloc-types.test b/test/tools/llvm-readobj/reloc-types.test
index 0c8b54d..20c2538 100644
--- a/test/tools/llvm-readobj/reloc-types.test
+++ b/test/tools/llvm-readobj/reloc-types.test

@@ -179,6 +179,8 @@
 ELF-AARCH64: Type: R_AARCH64_LDST32_ABS_LO12_NC (285)
 ELF-AARCH64: Type: R_AARCH64_LDST64_ABS_LO12_NC (286)
 ELF-AARCH64: Type: R_AARCH64_LDST128_ABS_LO12_NC (299)
+ELF-AARCH64: Type: R_AARCH64_GOTREL64 (307)
+ELF-AARCH64: Type: R_AARCH64_GOTREL32 (308)
 ELF-AARCH64: Type: R_AARCH64_ADR_GOT_PAGE (311)
 ELF-AARCH64: Type: R_AARCH64_LD64_GOT_LO12_NC (312)
 ELF-AARCH64: Type: R_AARCH64_TLSLD_MOVW_DTPREL_G2 (523)
@@ -222,6 +224,15 @@
 ELF-AARCH64: Type: R_AARCH64_TLSDESC_LD64_LO12_NC (563)
 ELF-AARCH64: Type: R_AARCH64_TLSDESC_ADD_LO12_NC (564)
 ELF-AARCH64: Type: R_AARCH64_TLSDESC_CALL (569)
+ELF-AARCH64: Type: R_AARCH64_COPY (1024)
+ELF-AARCH64: Type: R_AARCH64_GLOB_DAT (1025)
+ELF-AARCH64: Type: R_AARCH64_JUMP_SLOT (1026)
+ELF-AARCH64: Type: R_AARCH64_RELATIVE (1027)
+ELF-AARCH64: Type: R_AARCH64_TLS_DTPREL64 (1028)
+ELF-AARCH64: Type: R_AARCH64_TLS_DTPMOD64 (1029)
+ELF-AARCH64: Type: R_AARCH64_TLS_TPREL64 (1030)
+ELF-AARCH64: Type: R_AARCH64_TLSDESC (1031)
+ELF-AARCH64: Type: R_AARCH64_IRELATIVE (1032)
 
 ELF-ARM: Type: R_ARM_NONE (0)
 ELF-ARM: Type: R_ARM_PC24 (1)
@@ -250,7 +261,6 @@
 ELF-ARM: Type: R_ARM_GOTOFF32 (24)
 ELF-ARM: Type: R_ARM_BASE_PREL (25)
 ELF-ARM: Type: R_ARM_GOT_BREL (26)
-ELF-ARM: Type: R_ARM_PLT32 (27)
 ELF-ARM: Type: R_ARM_CALL (28)
 ELF-ARM: Type: R_ARM_JUMP24 (29)
 ELF-ARM: Type: R_ARM_THM_JUMP24 (30)

diff --git a/test/tools/llvm-readobj/relocations.test b/test/tools/llvm-readobj/relocations.test
index 864ded3..222dcf1 100644
--- a/test/tools/llvm-readobj/relocations.test
+++ b/test/tools/llvm-readobj/relocations.test

@@ -1,5 +1,7 @@
 RUN: llvm-readobj -r %p/Inputs/trivial.obj.coff-i386 \
 RUN:   | FileCheck %s -check-prefix COFF
+RUN: llvm-readobj -r %p/Inputs/bad-relocs.obj.coff-i386 \
+RUN:   | FileCheck %s -check-prefix BAD-COFF-RELOCS
 RUN: llvm-readobj -r %p/Inputs/trivial.obj.elf-i386 \
 RUN:   | FileCheck %s -check-prefix ELF
 RUN: llvm-readobj -r %p/Inputs/trivial.obj.macho-i386 \
@@ -21,6 +23,12 @@
 COFF-NEXT:   }
 COFF-NEXT: ]
 
+BAD-COFF-RELOCS:      Relocations [
+BAD-COFF-RELOCS-NEXT:   Section (1) sec {
+BAD-COFF-RELOCS-NEXT:     0xDEADBEEF IMAGE_REL_I386_ABSOLUTE -
+BAD-COFF-RELOCS-NEXT:   }
+BAD-COFF-RELOCS-NEXT: ]
+
 ELF:      Relocations [
 ELF-NEXT:   Section (2) .rel.text {
 ELF-NEXT:     0xC R_386_GOTPC _GLOBAL_OFFSET_TABLE_ 0x0

diff --git a/test/tools/llvm-readobj/sections-ext.test b/test/tools/llvm-readobj/sections-ext.test
index 972d8e6..4024878 100644
--- a/test/tools/llvm-readobj/sections-ext.test
+++ b/test/tools/llvm-readobj/sections-ext.test

@@ -52,7 +52,6 @@
 COFF-NEXT:           Checksum: 0x0
 COFF-NEXT:           Number: 1
 COFF-NEXT:           Selection: 0x0
-COFF-NEXT:           Unused: (00 00 00)
 COFF-NEXT:         }
 COFF-NEXT:       }
 COFF-NEXT:       Symbol {

diff --git a/test/tools/llvm-readobj/sections.test b/test/tools/llvm-readobj/sections.test
index 16f1131..fe734d7 100644
--- a/test/tools/llvm-readobj/sections.test
+++ b/test/tools/llvm-readobj/sections.test

@@ -2,6 +2,8 @@
 RUN:   | FileCheck %s -check-prefix COFF
 RUN: llvm-readobj -s %p/Inputs/trivial.obj.elf-i386 \
 RUN:   | FileCheck %s -check-prefix ELF
+RUN: llvm-readobj -s %p/Inputs/trivial.obj.elf-mipsel \
+RUN:   | FileCheck %s -check-prefix ELF-MIPSEL
 RUN: llvm-readobj -s %p/Inputs/trivial.obj.macho-i386 \
 RUN:   | FileCheck %s -check-prefix MACHO-I386
 RUN: llvm-readobj -s %p/Inputs/trivial.obj.macho-x86-64 \
@@ -84,6 +86,37 @@
 ELF-NEXT:     EntrySize: 0
 ELF-NEXT:   }
 
+ELF-MIPSEL:      Section {
+ELF-MIPSEL:        Index: 4
+ELF-MIPSEL-NEXT:   Name: .reginfo (27)
+ELF-MIPSEL-NEXT:   Type: SHT_MIPS_REGINFO (0x70000006)
+ELF-MIPSEL-NEXT:   Flags [ (0x2)
+ELF-MIPSEL-NEXT:     SHF_ALLOC (0x2)
+ELF-MIPSEL-NEXT:   ]
+ELF-MIPSEL-NEXT:   Address: 0x0
+ELF-MIPSEL-NEXT:   Offset: 0x34
+ELF-MIPSEL-NEXT:   Size: 24
+ELF-MIPSEL-NEXT:   Link: 0
+ELF-MIPSEL-NEXT:   Info: 0
+ELF-MIPSEL-NEXT:   AddressAlignment: 1
+ELF-MIPSEL-NEXT:   EntrySize: 0
+ELF-MIPSEL-NEXT: }
+ELF-MIPSEL-NEXT: Section {
+ELF-MIPSEL-NEXT:   Index: 5
+ELF-MIPSEL-NEXT:   Name: .MIPS.abiflags (12)
+ELF-MIPSEL-NEXT:   Type: SHT_MIPS_ABIFLAGS (0x7000002A)
+ELF-MIPSEL-NEXT:   Flags [ (0x2)
+ELF-MIPSEL-NEXT:     SHF_ALLOC (0x2)
+ELF-MIPSEL-NEXT:   ]
+ELF-MIPSEL-NEXT:   Address: 0x0
+ELF-MIPSEL-NEXT:   Offset: 0x50
+ELF-MIPSEL-NEXT:   Size: 24
+ELF-MIPSEL-NEXT:   Link: 0
+ELF-MIPSEL-NEXT:   Info: 0
+ELF-MIPSEL-NEXT:   AddressAlignment: 8
+ELF-MIPSEL-NEXT:   EntrySize: 0
+ELF-MIPSEL-NEXT: }
+
 MACHO-I386:      Sections [
 MACHO-I386-NEXT:   Section {
 MACHO-I386-NEXT:     Index: 0

diff --git a/test/tools/llvm-readobj/symbols.test b/test/tools/llvm-readobj/symbols.test
index 26830ac..71955e0 100644
--- a/test/tools/llvm-readobj/symbols.test
+++ b/test/tools/llvm-readobj/symbols.test

@@ -7,7 +7,7 @@
 COFF-NEXT:   Symbol {
 COFF-NEXT:     Name: @comp.id
 COFF-NEXT:    Value: 14766605
-COFF-NEXT:    Section:  (65535)
+COFF-NEXT:    Section:  IMAGE_SYM_ABSOLUTE (-1)
 COFF-NEXT:     BaseType: Null (0x0)
 COFF-NEXT:     ComplexType: Null (0x0)
 COFF-NEXT:    StorageClass: Static (0x3)
@@ -16,7 +16,7 @@
 COFF-NEXT:   Symbol {
 COFF-NEXT:    Name: @feat.00
 COFF-NEXT:    Value: 2147484049
-COFF-NEXT:     Section:  (65535)
+COFF-NEXT:     Section:  IMAGE_SYM_ABSOLUTE (-1)
 COFF-NEXT:     BaseType: Null (0x0)
 COFF-NEXT:    ComplexType: Null (0x0)
 COFF-NEXT:    StorageClass: Static (0x3)
@@ -37,7 +37,6 @@
 COFF-NEXT:       Checksum: 0x0
 COFF-NEXT:       Number: 1
 COFF-NEXT:       Selection: 0x0
-COFF-NEXT:       Unused: (00 00 00)
 COFF-NEXT:     }
 COFF-NEXT:   }
 

diff --git a/test/tools/llvm-symbolizer/Inputs/dsym-test-exe b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe
new file mode 100755
index 0000000..ba3154c
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe
Binary files differ

diff --git a/test/tools/llvm-symbolizer/Inputs/dsym-test-exe-differentname.dSYM/Contents/Info.plist b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe-differentname.dSYM/Contents/Info.plist
new file mode 100644
index 0000000..4e84ad0
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe-differentname.dSYM/Contents/Info.plist

@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+	<dict>
+		<key>CFBundleDevelopmentRegion</key>
+		<string>English</string>
+		<key>CFBundleIdentifier</key>
+		<string>com.apple.xcode.dsym.dsym-test-exe-differentname</string>
+		<key>CFBundleInfoDictionaryVersion</key>
+		<string>6.0</string>
+		<key>CFBundlePackageType</key>
+		<string>dSYM</string>
+		<key>CFBundleSignature</key>
+		<string>????</string>
+		<key>CFBundleShortVersionString</key>
+		<string>1.0</string>
+		<key>CFBundleVersion</key>
+		<string>1</string>
+	</dict>
+</plist>

diff --git a/test/tools/llvm-symbolizer/Inputs/dsym-test-exe-differentname.dSYM/Contents/Resources/DWARF/dsym-test-exe-second b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe-differentname.dSYM/Contents/Resources/DWARF/dsym-test-exe-second
new file mode 100644
index 0000000..c30dba3
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe-differentname.dSYM/Contents/Resources/DWARF/dsym-test-exe-second
Binary files differ

diff --git a/test/tools/llvm-symbolizer/Inputs/dsym-test-exe-second b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe-second
new file mode 100755
index 0000000..ba3154c
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe-second
Binary files differ

diff --git a/test/tools/llvm-symbolizer/Inputs/dsym-test-exe.dSYM/Contents/Info.plist b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe.dSYM/Contents/Info.plist
new file mode 100644
index 0000000..35b1c11
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe.dSYM/Contents/Info.plist

@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+	<dict>
+		<key>CFBundleDevelopmentRegion</key>
+		<string>English</string>
+		<key>CFBundleIdentifier</key>
+		<string>com.apple.xcode.dsym.dsym-test-exe</string>
+		<key>CFBundleInfoDictionaryVersion</key>
+		<string>6.0</string>
+		<key>CFBundlePackageType</key>
+		<string>dSYM</string>
+		<key>CFBundleSignature</key>
+		<string>????</string>
+		<key>CFBundleShortVersionString</key>
+		<string>1.0</string>
+		<key>CFBundleVersion</key>
+		<string>1</string>
+	</dict>
+</plist>

diff --git a/test/tools/llvm-symbolizer/Inputs/dsym-test-exe.dSYM/Contents/Resources/DWARF/dsym-test-exe b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe.dSYM/Contents/Resources/DWARF/dsym-test-exe
new file mode 100644
index 0000000..c30dba3
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/dsym-test-exe.dSYM/Contents/Resources/DWARF/dsym-test-exe
Binary files differ

diff --git a/test/tools/llvm-symbolizer/Inputs/dsym-test.c b/test/tools/llvm-symbolizer/Inputs/dsym-test.c
new file mode 100644
index 0000000..84d5ad9
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/dsym-test.c

@@ -0,0 +1,8 @@
+// clang -c dsym-test.c -g
+// clang dsym-test.o -g -o dsym-test-exe
+// dsymutil dsym-test-exe
+// clang dsym-test.o -g -o dsym-test-exe-second
+// dsymutil dsym-test-exe-second -o dsym-test-exe-differentname.dSYM
+int main() {
+  return 0;
+}

diff --git a/test/tools/llvm-symbolizer/Inputs/ppc64 b/test/tools/llvm-symbolizer/Inputs/ppc64
new file mode 100755
index 0000000..2356e43
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/ppc64
Binary files differ

diff --git a/test/tools/llvm-symbolizer/dsym.test b/test/tools/llvm-symbolizer/dsym.test
new file mode 100644
index 0000000..326602d
--- /dev/null
+++ b/test/tools/llvm-symbolizer/dsym.test

@@ -0,0 +1,14 @@
+RUN: echo "%p/Inputs/dsym-test-exe 0x0000000100000f90" > %t.input
+RUN: echo "%p/Inputs/dsym-test-exe-second 0x0000000100000f90" >> %t.input
+RUN: llvm-symbolizer < %t.input | FileCheck %s --check-prefix=CHECK-NOHINT
+RUN: llvm-symbolizer -dsym-hint=%p/Inputs/dsym-test-exe-differentname.dSYM < %t.input | FileCheck %s --check-prefix=CHECK-HINT
+
+CHECK-NOHINT: main
+CHECK-NOHINT: dsym-test.c
+CHECK-NOHINT: main
+CHECK-NOHINT: ??:0:0
+
+CHECK-HINT: main
+CHECK-HINT: dsym-test.c
+CHECK-HINT: main
+CHECK-HINT: dsym-test.c

diff --git a/test/tools/llvm-symbolizer/ppc64.test b/test/tools/llvm-symbolizer/ppc64.test
new file mode 100644
index 0000000..fc8e4ff
--- /dev/null
+++ b/test/tools/llvm-symbolizer/ppc64.test

@@ -0,0 +1,11 @@
+// ppc64 was compiled from this source on a big-endian 64-bit PowerPC box
+// with just "clang -nostdlib":
+int foo() { return 0; }
+int bar() { return foo(); }
+int _start() { return bar(); }
+
+RUN: %python -c "print('0x1000014c\n0x1000018c\n0x100001cc')" | llvm-symbolizer -obj=%p/Inputs/ppc64 | FileCheck %s
+
+CHECK: foo
+CHECK: bar
+CHECK: _start

diff --git a/test/tools/llvm-vtabledump/Inputs/trivial.obj.coff-i386 b/test/tools/llvm-vtabledump/Inputs/trivial.obj.coff-i386
new file mode 100644
index 0000000..3b93955
--- /dev/null
+++ b/test/tools/llvm-vtabledump/Inputs/trivial.obj.coff-i386
Binary files differ

diff --git a/test/tools/llvm-vtabledump/Inputs/trivial.obj.elf-i386 b/test/tools/llvm-vtabledump/Inputs/trivial.obj.elf-i386
new file mode 100644
index 0000000..1a5c929
--- /dev/null
+++ b/test/tools/llvm-vtabledump/Inputs/trivial.obj.elf-i386
Binary files differ

diff --git a/test/tools/llvm-vtabledump/trivial.test b/test/tools/llvm-vtabledump/trivial.test
new file mode 100644
index 0000000..92bd058
--- /dev/null
+++ b/test/tools/llvm-vtabledump/trivial.test

@@ -0,0 +1,58 @@
+RUN: llvm-vtabledump %p/Inputs/trivial.obj.coff-i386 \
+RUN:   | FileCheck %s --check-prefix=COFF-I386
+
+RUN: llvm-vtabledump %p/Inputs/trivial.obj.elf-i386 \
+RUN:   | FileCheck %s --check-prefix=ELF-I386
+
+COFF-I386:      ??_7S@@6B@[0]: ??_R4S@@6B@
+COFF-I386-NEXT: ??_7S@@6B@[4]: ??_GS@@UAEPAXI@Z
+COFF-I386-NEXT: ??_8S@@7B@[0]: -4
+COFF-I386-NEXT: ??_8S@@7B@[4]: 4
+COFF-I386-NEXT: ??_R4S@@6B@[IsImageRelative]: 0
+COFF-I386-NEXT: ??_R4S@@6B@[OffsetToTop]: 0
+COFF-I386-NEXT: ??_R4S@@6B@[VFPtrOffset]: 0
+COFF-I386-NEXT: ??_R4S@@6B@[TypeDescriptor]: ??_R0?AUS@@@8
+COFF-I386-NEXT: ??_R4S@@6B@[ClassHierarchyDescriptor]: ??_R3S@@8
+COFF-I386-NEXT: ??_R3A@@8[AlwaysZero]: 0
+COFF-I386-NEXT: ??_R3A@@8[Flags]: 0
+COFF-I386-NEXT: ??_R3A@@8[NumClasses]: 1
+COFF-I386-NEXT: ??_R3A@@8[BaseClassArray]: ??_R2A@@8
+COFF-I386-NEXT: ??_R3S@@8[AlwaysZero]: 0
+COFF-I386-NEXT: ??_R3S@@8[Flags]: 0
+COFF-I386-NEXT: ??_R3S@@8[NumClasses]: 2
+COFF-I386-NEXT: ??_R3S@@8[BaseClassArray]: ??_R2S@@8
+COFF-I386-NEXT: ??_R2A@@8[0]: ??_R1A@?0A@EA@A@@8
+COFF-I386-NEXT: ??_R2S@@8[0]: ??_R1A@?0A@EA@S@@8
+COFF-I386-NEXT: ??_R2S@@8[4]: ??_R1A@33FA@A@@8
+COFF-I386-NEXT: ??_R1A@33FA@A@@8[TypeDescriptor]: ??_R0?AUA@@@8
+COFF-I386-NEXT: ??_R1A@33FA@A@@8[NumBases]: 0
+COFF-I386-NEXT: ??_R1A@33FA@A@@8[OffsetInVBase]: 0
+COFF-I386-NEXT: ??_R1A@33FA@A@@8[VBPtrOffset]: 4
+COFF-I386-NEXT: ??_R1A@33FA@A@@8[OffsetInVBTable]: 4
+COFF-I386-NEXT: ??_R1A@33FA@A@@8[Flags]: 80
+COFF-I386-NEXT: ??_R1A@33FA@A@@8[ClassHierarchyDescriptor]: ??_R3A@@8
+COFF-I386-NEXT: ??_R1A@?0A@EA@A@@8[TypeDescriptor]: ??_R0?AUA@@@8
+COFF-I386-NEXT: ??_R1A@?0A@EA@A@@8[NumBases]: 0
+COFF-I386-NEXT: ??_R1A@?0A@EA@A@@8[OffsetInVBase]: 0
+COFF-I386-NEXT: ??_R1A@?0A@EA@A@@8[VBPtrOffset]: -1
+COFF-I386-NEXT: ??_R1A@?0A@EA@A@@8[OffsetInVBTable]: 0
+COFF-I386-NEXT: ??_R1A@?0A@EA@A@@8[Flags]: 64
+COFF-I386-NEXT: ??_R1A@?0A@EA@A@@8[ClassHierarchyDescriptor]: ??_R3A@@8
+COFF-I386-NEXT: ??_R1A@?0A@EA@S@@8[TypeDescriptor]: ??_R0?AUS@@@8
+COFF-I386-NEXT: ??_R1A@?0A@EA@S@@8[NumBases]: 1
+COFF-I386-NEXT: ??_R1A@?0A@EA@S@@8[OffsetInVBase]: 0
+COFF-I386-NEXT: ??_R1A@?0A@EA@S@@8[VBPtrOffset]: -1
+COFF-I386-NEXT: ??_R1A@?0A@EA@S@@8[OffsetInVBTable]: 0
+COFF-I386-NEXT: ??_R1A@?0A@EA@S@@8[Flags]: 64
+COFF-I386-NEXT: ??_R1A@?0A@EA@S@@8[ClassHierarchyDescriptor]: ??_R3S@@8
+COFF-I386-NEXT: ??_R0?AUA@@@8[VFPtr]: ??_7type_info@@6B@
+COFF-I386-NEXT: ??_R0?AUA@@@8[AlwaysZero]: 0
+COFF-I386-NEXT: ??_R0?AUA@@@8[MangledName]: .?AUA@@
+COFF-I386-NEXT: ??_R0?AUS@@@8[VFPtr]: ??_7type_info@@6B@
+COFF-I386-NEXT: ??_R0?AUS@@@8[AlwaysZero]: 0
+COFF-I386-NEXT: ??_R0?AUS@@@8[MangledName]: .?AUS@@
+
+ELF-I386:      _ZTS1A: 1A
+ELF-I386-NEXT: _ZTV1A[0]: 0
+ELF-I386-NEXT: _ZTV1A[4]: _ZTI1A
+ELF-I386-NEXT: _ZTV1A[8]: _ZN1A1fEv

diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 846ad1e..fd761ec 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt

@@ -8,6 +8,12 @@
   list(APPEND LLVM_IMPLICIT_PROJECT_IGNORE "${LLVM_MAIN_SRC_DIR}/tools/polly")
 endif(WITH_POLLY)
 
+if( LLVM_BUILD_LLVM_DYLIB )
+  add_llvm_tool_subdirectory(llvm-shlib)
+else()
+  ignore_llvm_tool_subdirectory(llvm-shlib)
+endif()
+
 add_llvm_tool_subdirectory(opt)
 add_llvm_tool_subdirectory(llvm-as)
 add_llvm_tool_subdirectory(llvm-dis)
@@ -30,6 +36,7 @@
 add_llvm_tool_subdirectory(llvm-readobj)
 add_llvm_tool_subdirectory(llvm-rtdyld)
 add_llvm_tool_subdirectory(llvm-dwarfdump)
+add_llvm_tool_subdirectory(llvm-vtabledump)
 if( LLVM_USE_INTEL_JITEVENTS )
   add_llvm_tool_subdirectory(llvm-jitlistener)
 else()
@@ -42,6 +49,8 @@
 add_llvm_tool_subdirectory(llvm-stress)
 add_llvm_tool_subdirectory(llvm-mcmarkup)
 
+add_llvm_tool_subdirectory(verify-uselistorder)
+
 add_llvm_tool_subdirectory(llvm-symbolizer)
 
 add_llvm_tool_subdirectory(llvm-c-test)
@@ -49,6 +58,8 @@
 add_llvm_tool_subdirectory(obj2yaml)
 add_llvm_tool_subdirectory(yaml2obj)
 
+add_llvm_tool_subdirectory(llvm-go)
+
 if(NOT CYGWIN AND LLVM_ENABLE_PIC)
   add_llvm_tool_subdirectory(lto)
   add_llvm_tool_subdirectory(llvm-lto)

diff --git a/tools/LLVMBuild.txt b/tools/LLVMBuild.txt
index 1b537a3..13a08b2 100644
--- a/tools/LLVMBuild.txt
+++ b/tools/LLVMBuild.txt

@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-jitlistener llvm-link llvm-lto llvm-mc llvm-nm llvm-objdump llvm-profdata llvm-rtdyld llvm-size macho-dump opt llvm-mcmarkup
+subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-jitlistener llvm-link llvm-lto llvm-mc llvm-nm llvm-objdump llvm-profdata llvm-rtdyld llvm-size macho-dump opt llvm-mcmarkup verify-uselistorder
 
 [component_0]
 type = Group

diff --git a/tools/Makefile b/tools/Makefile
index 97ad99a..4b8923a 100644
--- a/tools/Makefile
+++ b/tools/Makefile

@@ -21,7 +21,8 @@
 
 # Build LLD and LLDB if present. Note LLDB must be built last as it depends on
 # the wider LLVM infrastructure (including Clang).
-OPTIONAL_DIRS := lld lldb
+OPTIONAL_PARALLEL_DIRS += lld
+OPTIONAL_DIRS := lldb
 
 # NOTE: The tools are organized into five groups of four consisting of one
 # large and three small executables. This is done to minimize memory load
@@ -31,7 +32,8 @@
                  lli llvm-extract llvm-mc bugpoint llvm-bcanalyzer llvm-diff \
                  macho-dump llvm-objdump llvm-readobj llvm-rtdyld \
                  llvm-dwarfdump llvm-cov llvm-size llvm-stress llvm-mcmarkup \
-                 llvm-profdata llvm-symbolizer obj2yaml yaml2obj llvm-c-test
+                 llvm-profdata llvm-symbolizer obj2yaml yaml2obj llvm-c-test \
+                 llvm-vtabledump verify-uselistorder
 
 # If Intel JIT Events support is configured, build an extra tool to test it.
 ifeq ($(USE_INTEL_JITEVENTS), 1)
@@ -72,4 +74,8 @@
   endif
 endif
 
+ifneq (,$(filter go,$(BINDINGS_TO_BUILD)))
+  PARALLEL_DIRS += llvm-go
+endif
+
 include $(LEVEL)/Makefile.common

diff --git a/tools/bugpoint-passes/CMakeLists.txt b/tools/bugpoint-passes/CMakeLists.txt
index b7ee626..de68bb5 100644
--- a/tools/bugpoint-passes/CMakeLists.txt
+++ b/tools/bugpoint-passes/CMakeLists.txt

@@ -10,6 +10,10 @@
   endif()
 endif()
 
+if(WIN32 OR CYGWIN)
+  set(LLVM_LINK_COMPONENTS Core)
+endif()
+
 add_llvm_loadable_module( BugpointPasses
   TestPasses.cpp
   )

diff --git a/tools/bugpoint/Android.mk b/tools/bugpoint/Android.mk
index 78f3eff..512a91f 100644
--- a/tools/bugpoint/Android.mk
+++ b/tools/bugpoint/Android.mk

@@ -35,6 +35,7 @@
   libLLVMTarget \
   libLLVMCore \
   libLLVMMC \
+  libLLVMProfileData \
   libLLVMTransformUtils \
   libLLVMVectorize \
   libLLVMSupport \

diff --git a/tools/bugpoint/BugDriver.cpp b/tools/bugpoint/BugDriver.cpp
index cecccbe..b8be17e 100644
--- a/tools/bugpoint/BugDriver.cpp
+++ b/tools/bugpoint/BugDriver.cpp

@@ -82,14 +82,10 @@
   delete gcc;
 }
 
-
-/// ParseInputFile - Given a bitcode or assembly input filename, parse and
-/// return it, or return null if not possible.
-///
-Module *llvm::ParseInputFile(const std::string &Filename,
-                             LLVMContext& Ctxt) {
+std::unique_ptr<Module> llvm::parseInputFile(StringRef Filename,
+                                             LLVMContext &Ctxt) {
   SMDiagnostic Err;
-  Module *Result = ParseIRFile(Filename, Err, Ctxt);
+  std::unique_ptr<Module> Result = parseIRFile(Filename, Err, Ctxt);
   if (!Result)
     Err.print("bugpoint", errs());
 
@@ -120,23 +116,18 @@
   assert(!Filenames.empty() && "Must specify at least on input filename!");
 
   // Load the first input file.
-  Program = ParseInputFile(Filenames[0], Context);
+  Program = parseInputFile(Filenames[0], Context).release();
   if (!Program) return true;
 
   outs() << "Read input file      : '" << Filenames[0] << "'\n";
 
   for (unsigned i = 1, e = Filenames.size(); i != e; ++i) {
-    std::unique_ptr<Module> M(ParseInputFile(Filenames[i], Context));
+    std::unique_ptr<Module> M = parseInputFile(Filenames[i], Context);
     if (!M.get()) return true;
 
     outs() << "Linking in input file: '" << Filenames[i] << "'\n";
-    std::string ErrorMessage;
-    if (Linker::LinkModules(Program, M.get(), Linker::DestroySource,
-                            &ErrorMessage)) {
-      errs() << ToolName << ": error linking in '" << Filenames[i] << "': "
-             << ErrorMessage << '\n';
+    if (Linker::LinkModules(Program, M.get()))
       return true;
-    }
   }
 
   outs() << "*** All input ok\n";

diff --git a/tools/bugpoint/BugDriver.h b/tools/bugpoint/BugDriver.h
index 3169d29..5797812 100644
--- a/tools/bugpoint/BugDriver.h
+++ b/tools/bugpoint/BugDriver.h

@@ -13,11 +13,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef BUGDRIVER_H
-#define BUGDRIVER_H
+#ifndef LLVM_TOOLS_BUGPOINT_BUGDRIVER_H
+#define LLVM_TOOLS_BUGPOINT_BUGDRIVER_H
 
 #include "llvm/IR/ValueMap.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -210,41 +211,46 @@
   void EmitProgressBitcode(const Module *M, const std::string &ID,
                            bool NoFlyer = false) const;
 
-  /// deleteInstructionFromProgram - This method clones the current Program and
-  /// deletes the specified instruction from the cloned module.  It then runs a
-  /// series of cleanup passes (ADCE and SimplifyCFG) to eliminate any code
-  /// which depends on the value.  The modified module is then returned.
+  /// This method clones the current Program and deletes the specified
+  /// instruction from the cloned module.  It then runs a series of cleanup
+  /// passes (ADCE and SimplifyCFG) to eliminate any code which depends on the
+  /// value. The modified module is then returned.
   ///
-  Module *deleteInstructionFromProgram(const Instruction *I, unsigned Simp);
+  std::unique_ptr<Module> deleteInstructionFromProgram(const Instruction *I,
+                                                       unsigned Simp);
 
-  /// performFinalCleanups - This method clones the current Program and performs
-  /// a series of cleanups intended to get rid of extra cruft on the module.  If
-  /// the MayModifySemantics argument is true, then the cleanups is allowed to
+  /// This method clones the current Program and performs a series of cleanups
+  /// intended to get rid of extra cruft on the module. If the
+  /// MayModifySemantics argument is true, then the cleanups are allowed to
   /// modify how the code behaves.
   ///
-  Module *performFinalCleanups(Module *M, bool MayModifySemantics = false);
+  std::unique_ptr<Module> performFinalCleanups(Module *M,
+                                               bool MayModifySemantics = false);
 
-  /// ExtractLoop - Given a module, extract up to one loop from it into a new
-  /// function.  This returns null if there are no extractable loops in the
-  /// program or if the loop extractor crashes.
-  Module *ExtractLoop(Module *M);
+  /// Given a module, extract up to one loop from it into a new function. This
+  /// returns null if there are no extractable loops in the program or if the
+  /// loop extractor crashes.
+  std::unique_ptr<Module> extractLoop(Module *M);
 
-  /// ExtractMappedBlocksFromModule - Extract all but the specified basic blocks
-  /// into their own functions.  The only detail is that M is actually a module
-  /// cloned from the one the BBs are in, so some mapping needs to be performed.
-  /// If this operation fails for some reason (ie the implementation is buggy),
-  /// this function should return null, otherwise it returns a new Module.
-  Module *ExtractMappedBlocksFromModule(const std::vector<BasicBlock*> &BBs,
-                                        Module *M);
+  /// Extract all but the specified basic blocks into their own functions. The
+  /// only detail is that M is actually a module cloned from the one the BBs are
+  /// in, so some mapping needs to be performed. If this operation fails for
+  /// some reason (ie the implementation is buggy), this function should return
+  /// null, otherwise it returns a new Module.
+  std::unique_ptr<Module>
+  extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
+                                Module *M);
 
-  /// runPassesOn - Carefully run the specified set of pass on the specified
-  /// module, returning the transformed module on success, or a null pointer on
-  /// failure.  If AutoDebugCrashes is set to true, then bugpoint will
-  /// automatically attempt to track down a crashing pass if one exists, and
-  /// this method will never return null.
-  Module *runPassesOn(Module *M, const std::vector<std::string> &Passes,
-                      bool AutoDebugCrashes = false, unsigned NumExtraArgs = 0,
-                      const char * const *ExtraArgs = nullptr);
+  /// Carefully run the specified set of passes on the specified module,
+  /// returning the transformed module on success, or a null pointer on failure.
+  /// If AutoDebugCrashes is set to true, then bugpoint will automatically
+  /// attempt to track down a crashing pass if one exists, and this method will
+  /// never return null.
+  std::unique_ptr<Module> runPassesOn(Module *M,
+                                      const std::vector<std::string> &Passes,
+                                      bool AutoDebugCrashes = false,
+                                      unsigned NumExtraArgs = 0,
+                                      const char *const *ExtraArgs = nullptr);
 
   /// runPasses - Run the specified passes on Program, outputting a bitcode
   /// file and writting the filename into OutputFile if successful.  If the
@@ -296,12 +302,11 @@
   bool initializeExecutionEnvironment();
 };
 
-/// ParseInputFile - Given a bitcode or assembly input filename, parse and
-/// return it, or return null if not possible.
+/// Given a bitcode or assembly input filename, parse and return it, or return
+/// null if not possible.
 ///
-Module *ParseInputFile(const std::string &InputFilename,
-                       LLVMContext& ctxt);
-
+std::unique_ptr<Module> parseInputFile(StringRef InputFilename,
+                                       LLVMContext &ctxt);
 
 /// getPassesString - Turn a list of passes into a string which indicates the
 /// command line options that must be passed to add the passes.

diff --git a/tools/bugpoint/CrashDebugger.cpp b/tools/bugpoint/CrashDebugger.cpp
index 8bd61b3..bac948a 100644
--- a/tools/bugpoint/CrashDebugger.cpp
+++ b/tools/bugpoint/CrashDebugger.cpp

@@ -72,7 +72,7 @@
 
     OrigProgram = BD.Program;
 
-    BD.Program = ParseInputFile(PrefixOutput, BD.getContext());
+    BD.Program = parseInputFile(PrefixOutput, BD.getContext()).release();
     if (BD.Program == nullptr) {
       errs() << BD.getToolName() << ": Error reading bitcode file '"
              << PrefixOutput << "'!\n";
@@ -312,22 +312,21 @@
   // have to take.
   std::vector<std::pair<std::string, std::string> > BlockInfo;
 
-  for (SmallPtrSet<BasicBlock*, 8>::iterator I = Blocks.begin(),
-         E = Blocks.end(); I != E; ++I)
-    BlockInfo.push_back(std::make_pair((*I)->getParent()->getName(),
-                                       (*I)->getName()));
+  for (BasicBlock *BB : Blocks)
+    BlockInfo.push_back(std::make_pair(BB->getParent()->getName(),
+                                       BB->getName()));
 
   // Now run the CFG simplify pass on the function...
   std::vector<std::string> Passes;
   Passes.push_back("simplifycfg");
   Passes.push_back("verify");
-  Module *New = BD.runPassesOn(M, Passes);
+  std::unique_ptr<Module> New = BD.runPassesOn(M, Passes);
   delete M;
   if (!New) {
     errs() << "simplifycfg failed!\n";
     exit(1);
   }
-  M = New;
+  M = New.release();
 
   // Try running on the hacked up program...
   if (TestFn(BD, M)) {
@@ -420,9 +419,8 @@
     // Make sure to use instruction pointers that point into the now-current
     // module, and that they don't include any deleted blocks.
     Insts.clear();
-    for (SmallPtrSet<Instruction*, 64>::const_iterator I = Instructions.begin(),
-             E = Instructions.end(); I != E; ++I)
-      Insts.push_back(*I);
+    for (Instruction *Inst : Instructions)
+      Insts.push_back(Inst);
     return true;
   }
   delete M;  // It didn't crash, try something else.
@@ -578,20 +576,17 @@
                 continue;
 
               outs() << "Checking instruction: " << *I;
-              Module *M = BD.deleteInstructionFromProgram(I, Simplification);
+              std::unique_ptr<Module> M =
+                  BD.deleteInstructionFromProgram(I, Simplification);
 
               // Find out if the pass still crashes on this pass...
-              if (TestFn(BD, M)) {
+              if (TestFn(BD, M.get())) {
                 // Yup, it does, we delete the old module, and continue trying
                 // to reduce the testcase...
-                BD.setNewProgram(M);
+                BD.setNewProgram(M.release());
                 InstructionsToSkipBeforeDeleting = CurInstructionNum;
                 goto TryAgain;  // I wish I had a multi-level break here!
               }
-
-              // This pass didn't crash without this instruction, try the next
-              // one.
-              delete M;
             }
           }
 
@@ -607,7 +602,7 @@
   if (!BugpointIsInterrupted) {
     outs() << "\n*** Attempting to perform final cleanups: ";
     Module *M = CloneModule(BD.getProgram());
-    M = BD.performFinalCleanups(M, true);
+    M = BD.performFinalCleanups(M, true).release();
 
     // Find out if the pass still crashes on the cleaned up program...
     if (TestFn(BD, M)) {

diff --git a/tools/bugpoint/ExtractFunction.cpp b/tools/bugpoint/ExtractFunction.cpp
index 4fb6856..34fe53c 100644
--- a/tools/bugpoint/ExtractFunction.cpp
+++ b/tools/bugpoint/ExtractFunction.cpp

@@ -82,13 +82,9 @@
   }
 }  // end anonymous namespace
 
-/// deleteInstructionFromProgram - This method clones the current Program and
-/// deletes the specified instruction from the cloned module.  It then runs a
-/// series of cleanup passes (ADCE and SimplifyCFG) to eliminate any code which
-/// depends on the value.  The modified module is then returned.
-///
-Module *BugDriver::deleteInstructionFromProgram(const Instruction *I,
-                                                unsigned Simplification) {
+std::unique_ptr<Module>
+BugDriver::deleteInstructionFromProgram(const Instruction *I,
+                                        unsigned Simplification) {
   // FIXME, use vmap?
   Module *Clone = CloneModule(Program);
 
@@ -123,7 +119,7 @@
     Passes.push_back("simplifycfg");      // Delete dead control flow
 
   Passes.push_back("verify");
-  Module *New = runPassesOn(Clone, Passes);
+  std::unique_ptr<Module> New = runPassesOn(Clone, Passes);
   delete Clone;
   if (!New) {
     errs() << "Instruction removal failed.  Sorry. :(  Please report a bug!\n";
@@ -132,11 +128,8 @@
   return New;
 }
 
-/// performFinalCleanups - This method clones the current Program and performs
-/// a series of cleanups intended to get rid of extra cruft on the module
-/// before handing it to the user.
-///
-Module *BugDriver::performFinalCleanups(Module *M, bool MayModifySemantics) {
+std::unique_ptr<Module>
+BugDriver::performFinalCleanups(Module *M, bool MayModifySemantics) {
   // Make all functions external, so GlobalDCE doesn't delete them...
   for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I)
     I->setLinkage(GlobalValue::ExternalLinkage);
@@ -149,24 +142,20 @@
   else
     CleanupPasses.push_back("deadargelim");
 
-  Module *New = runPassesOn(M, CleanupPasses);
+  std::unique_ptr<Module> New = runPassesOn(M, CleanupPasses);
   if (!New) {
     errs() << "Final cleanups failed.  Sorry. :(  Please report a bug!\n";
-    return M;
+    return nullptr;
   }
   delete M;
   return New;
 }
 
-
-/// ExtractLoop - Given a module, extract up to one loop from it into a new
-/// function.  This returns null if there are no extractable loops in the
-/// program or if the loop extractor crashes.
-Module *BugDriver::ExtractLoop(Module *M) {
+std::unique_ptr<Module> BugDriver::extractLoop(Module *M) {
   std::vector<std::string> LoopExtractPasses;
   LoopExtractPasses.push_back("loop-extract-single");
 
-  Module *NewM = runPassesOn(M, LoopExtractPasses);
+  std::unique_ptr<Module> NewM = runPassesOn(M, LoopExtractPasses);
   if (!NewM) {
     outs() << "*** Loop extraction failed: ";
     EmitProgressBitcode(M, "loopextraction", true);
@@ -179,7 +168,6 @@
   // to avoid taking forever.
   static unsigned NumExtracted = 32;
   if (M->size() == NewM->size() || --NumExtracted == 0) {
-    delete NewM;
     return nullptr;
   } else {
     assert(M->size() < NewM->size() && "Loop extract removed functions?");
@@ -356,14 +344,9 @@
 // Basic Block Extraction Code
 //===----------------------------------------------------------------------===//
 
-/// ExtractMappedBlocksFromModule - Extract all but the specified basic blocks
-/// into their own functions.  The only detail is that M is actually a module
-/// cloned from the one the BBs are in, so some mapping needs to be performed.
-/// If this operation fails for some reason (ie the implementation is buggy),
-/// this function should return null, otherwise it returns a new Module.
-Module *BugDriver::ExtractMappedBlocksFromModule(const
-                                                 std::vector<BasicBlock*> &BBs,
-                                                 Module *M) {
+std::unique_ptr<Module>
+BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
+                                         Module *M) {
   SmallString<128> Filename;
   int FD;
   std::error_code EC = sys::fs::createUniqueFile(
@@ -401,7 +384,7 @@
 
   std::vector<std::string> PI;
   PI.push_back("extract-blocks");
-  Module *Ret = runPassesOn(M, PI, false, 1, &ExtraArg);
+  std::unique_ptr<Module> Ret = runPassesOn(M, PI, false, 1, &ExtraArg);
 
   sys::fs::remove(Filename.c_str());
 

diff --git a/tools/bugpoint/ListReducer.h b/tools/bugpoint/ListReducer.h
index 8083e2d..a0bb570 100644
--- a/tools/bugpoint/ListReducer.h
+++ b/tools/bugpoint/ListReducer.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef BUGPOINT_LIST_REDUCER_H
-#define BUGPOINT_LIST_REDUCER_H
+#ifndef LLVM_TOOLS_BUGPOINT_LISTREDUCER_H
+#define LLVM_TOOLS_BUGPOINT_LISTREDUCER_H
 
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"

diff --git a/tools/bugpoint/Miscompilation.cpp b/tools/bugpoint/Miscompilation.cpp
index 3f1f84e..8cb4583 100644
--- a/tools/bugpoint/Miscompilation.cpp
+++ b/tools/bugpoint/Miscompilation.cpp

@@ -128,8 +128,8 @@
   // Ok, so now we know that the prefix passes work, try running the suffix
   // passes on the result of the prefix passes.
   //
-  std::unique_ptr<Module> PrefixOutput(
-      ParseInputFile(BitcodeResult, BD.getContext()));
+  std::unique_ptr<Module> PrefixOutput =
+      parseInputFile(BitcodeResult, BD.getContext());
   if (!PrefixOutput) {
     errs() << BD.getToolName() << ": Error reading bitcode file '"
            << BitcodeResult << "'!\n";
@@ -218,16 +218,12 @@
                                  bool DeleteInputs, std::string &Error,
                                  bool &Broken) {
   // Link the two portions of the program back to together.
-  std::string ErrorMsg;
   if (!DeleteInputs) {
     M1 = CloneModule(M1);
     M2 = CloneModule(M2);
   }
-  if (Linker::LinkModules(M1, M2, Linker::DestroySource, &ErrorMsg)) {
-    errs() << BD.getToolName() << ": Error linking modules together:"
-           << ErrorMsg << '\n';
+  if (Linker::LinkModules(M1, M2))
     exit(1);
-  }
   delete M2;   // We are done with this module.
 
   // Execute the program.
@@ -316,7 +312,7 @@
     Module *ToOptimize = SplitFunctionsOutOfModule(ToNotOptimize,
                                                    MiscompiledFunctions,
                                                    VMap);
-    Module *ToOptimizeLoopExtracted = BD.ExtractLoop(ToOptimize);
+    Module *ToOptimizeLoopExtracted = BD.extractLoop(ToOptimize).release();
     if (!ToOptimizeLoopExtracted) {
       // If the loop extractor crashed or if there were no extractible loops,
       // then this chapter of our odyssey is over with.
@@ -334,8 +330,8 @@
     // extraction.
     AbstractInterpreter *AI = BD.switchToSafeInterpreter();
     bool Failure;
-    Module *New = TestMergedProgram(BD, ToOptimizeLoopExtracted, ToNotOptimize,
-                                    false, Error, Failure);
+    Module *New = TestMergedProgram(BD, ToOptimizeLoopExtracted,
+                                    ToNotOptimize, false, Error, Failure);
     if (!New)
       return false;
 
@@ -364,7 +360,6 @@
              << OutputPrefix << "-loop-extract-fail-*.bc files.\n";
       delete ToOptimize;
       delete ToNotOptimize;
-      delete ToOptimizeLoopExtracted;
       return MadeChange;
     }
     delete ToOptimize;
@@ -397,13 +392,8 @@
                                                   F->getFunctionType()));
       }
 
-      std::string ErrorMsg;
-      if (Linker::LinkModules(ToNotOptimize, ToOptimizeLoopExtracted, 
-                              Linker::DestroySource, &ErrorMsg)){
-        errs() << BD.getToolName() << ": Error linking modules together:"
-               << ErrorMsg << '\n';
+      if (Linker::LinkModules(ToNotOptimize, ToOptimizeLoopExtracted))
         exit(1);
-      }
 
       MiscompiledFunctions.clear();
       for (unsigned i = 0, e = MisCompFunctions.size(); i != e; ++i) {
@@ -431,13 +421,9 @@
     // extraction both didn't break the program, and didn't mask the problem.
     // Replace the current program with the loop extracted version, and try to
     // extract another loop.
-    std::string ErrorMsg;
-    if (Linker::LinkModules(ToNotOptimize, ToOptimizeLoopExtracted, 
-                            Linker::DestroySource, &ErrorMsg)){
-      errs() << BD.getToolName() << ": Error linking modules together:"
-             << ErrorMsg << '\n';
+    if (Linker::LinkModules(ToNotOptimize, ToOptimizeLoopExtracted))
       exit(1);
-    }
+
     delete ToOptimizeLoopExtracted;
 
     // All of the Function*'s in the MiscompiledFunctions list are in the old
@@ -533,11 +519,12 @@
 
   // Try the extraction.  If it doesn't work, then the block extractor crashed
   // or something, in which case bugpoint can't chase down this possibility.
-  if (Module *New = BD.ExtractMappedBlocksFromModule(BBsOnClone, ToOptimize)) {
+  if (std::unique_ptr<Module> New =
+          BD.extractMappedBlocksFromModule(BBsOnClone, ToOptimize)) {
     delete ToOptimize;
     // Run the predicate,
     // note that the predicate will delete both input modules.
-    bool Ret = TestFn(BD, New, ToNotOptimize, Error);
+    bool Ret = TestFn(BD, New.get(), ToNotOptimize, Error);
     delete BD.swapProgramIn(Orig);
     return Ret;
   }
@@ -591,7 +578,8 @@
   Module *ToExtract = SplitFunctionsOutOfModule(ProgClone,
                                                 MiscompiledFunctions,
                                                 VMap);
-  Module *Extracted = BD.ExtractMappedBlocksFromModule(Blocks, ToExtract);
+  std::unique_ptr<Module> Extracted =
+      BD.extractMappedBlocksFromModule(Blocks, ToExtract);
   if (!Extracted) {
     // Weird, extraction should have worked.
     errs() << "Nondeterministic problem extracting blocks??\n";
@@ -611,14 +599,8 @@
       MisCompFunctions.push_back(std::make_pair(I->getName(),
                                                 I->getFunctionType()));
 
-  std::string ErrorMsg;
-  if (Linker::LinkModules(ProgClone, Extracted, Linker::DestroySource, 
-                          &ErrorMsg)) {
-    errs() << BD.getToolName() << ": Error linking modules together:"
-           << ErrorMsg << '\n';
+  if (Linker::LinkModules(ProgClone, Extracted.get()))
     exit(1);
-  }
-  delete Extracted;
 
   // Set the new program and delete the old one.
   BD.setNewProgram(ProgClone);
@@ -730,14 +712,15 @@
   // Run the optimization passes on ToOptimize, producing a transformed version
   // of the functions being tested.
   outs() << "  Optimizing functions being tested: ";
-  Module *Optimized = BD.runPassesOn(Test, BD.getPassesToRun(),
-                                     /*AutoDebugCrashes*/true);
+  std::unique_ptr<Module> Optimized = BD.runPassesOn(Test, BD.getPassesToRun(),
+                                                     /*AutoDebugCrashes*/ true);
   outs() << "done.\n";
   delete Test;
 
   outs() << "  Checking to see if the merged program executes correctly: ";
   bool Broken;
-  Module *New = TestMergedProgram(BD, Optimized, Safe, true, Error, Broken);
+  Module *New =
+      TestMergedProgram(BD, Optimized.get(), Safe, true, Error, Broken);
   if (New) {
     outs() << (Broken ? " nope.\n" : " yup.\n");
     // Delete the original and set the new program.
@@ -796,7 +779,7 @@
 static void CleanupAndPrepareModules(BugDriver &BD, Module *&Test,
                                      Module *Safe) {
   // Clean up the modules, removing extra cruft that we don't need anymore...
-  Test = BD.performFinalCleanups(Test);
+  Test = BD.performFinalCleanups(Test).release();
 
   // If we are executing the JIT, we have several nasty issues to take care of.
   if (!BD.isExecutingJIT()) return;

diff --git a/tools/bugpoint/OptimizerDriver.cpp b/tools/bugpoint/OptimizerDriver.cpp
index d452fd9..f197cc5 100644
--- a/tools/bugpoint/OptimizerDriver.cpp
+++ b/tools/bugpoint/OptimizerDriver.cpp

@@ -66,15 +66,15 @@
 
 bool BugDriver::writeProgramToFile(const std::string &Filename, int FD,
                                    const Module *M) const {
-  tool_output_file Out(Filename.c_str(), FD);
+  tool_output_file Out(Filename, FD);
   return writeProgramToFileAux(Out, M);
 }
 
 bool BugDriver::writeProgramToFile(const std::string &Filename,
                                    const Module *M) const {
-  std::string ErrInfo;
-  tool_output_file Out(Filename.c_str(), ErrInfo, sys::fs::F_None);
-  if (ErrInfo.empty())
+  std::error_code EC;
+  tool_output_file Out(Filename, EC, sys::fs::F_None);
+  if (!EC)
     return writeProgramToFileAux(Out, M);
   return true;
 }
@@ -149,7 +149,7 @@
     return 1;
   }
 
-  tool_output_file InFile(InputFilename.c_str(), InputFD);
+  tool_output_file InFile(InputFilename, InputFD);
 
   WriteBitcodeToFile(Program, InFile.os());
   InFile.os().close();
@@ -159,12 +159,31 @@
     return 1;
   }
 
-  std::string tool = OptCmd.empty()? sys::FindProgramByName("opt") : OptCmd;
+  std::string tool = OptCmd;
+  if (OptCmd.empty()) {
+    if (ErrorOr<std::string> Path = sys::findProgramByName("opt"))
+      tool = *Path;
+    else
+      errs() << Path.getError().message() << "\n";
+  }
   if (tool.empty()) {
     errs() << "Cannot find `opt' in PATH!\n";
     return 1;
   }
 
+  std::string Prog;
+  if (UseValgrind) {
+    if (ErrorOr<std::string> Path = sys::findProgramByName("valgrind"))
+      Prog = *Path;
+    else
+      errs() << Path.getError().message() << "\n";
+  } else
+    Prog = tool;
+  if (Prog.empty()) {
+    errs() << "Cannot find `valgrind' in PATH!\n";
+    return 1;
+  }
+
   // Ok, everything that could go wrong before running opt is done.
   InFile.keep();
 
@@ -204,12 +223,6 @@
         errs() << "\n";
         );
 
-  std::string Prog;
-  if (UseValgrind)
-    Prog = sys::FindProgramByName("valgrind");
-  else
-    Prog = tool;
-
   // Redirect stdout and stderr to nowhere if SilencePasses is given
   StringRef Nowhere;
   const StringRef *Redirects[3] = {nullptr, &Nowhere, &Nowhere};
@@ -247,13 +260,10 @@
 }
 
 
-/// runPassesOn - Carefully run the specified set of pass on the specified
-/// module, returning the transformed module on success, or a null pointer on
-/// failure.
-Module *BugDriver::runPassesOn(Module *M,
-                               const std::vector<std::string> &Passes,
-                               bool AutoDebugCrashes, unsigned NumExtraArgs,
-                               const char * const *ExtraArgs) {
+std::unique_ptr<Module>
+BugDriver::runPassesOn(Module *M, const std::vector<std::string> &Passes,
+                       bool AutoDebugCrashes, unsigned NumExtraArgs,
+                       const char *const *ExtraArgs) {
   std::string BitcodeResult;
   if (runPasses(M, Passes, BitcodeResult, false/*delete*/, true/*quiet*/,
                 NumExtraArgs, ExtraArgs)) {
@@ -267,7 +277,7 @@
     return nullptr;
   }
 
-  Module *Ret = ParseInputFile(BitcodeResult, Context);
+  std::unique_ptr<Module> Ret = parseInputFile(BitcodeResult, Context);
   if (!Ret) {
     errs() << getToolName() << ": Error reading bitcode file '"
            << BitcodeResult << "'!\n";

diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp
index 4a2401b..51091e2 100644
--- a/tools/bugpoint/ToolRunner.cpp
+++ b/tools/bugpoint/ToolRunner.cpp

@@ -141,13 +141,13 @@
 
   // Rerun the compiler, capturing any error messages to print them.
   SmallString<128> ErrorFilename;
-  int ErrorFD;
   std::error_code EC = sys::fs::createTemporaryFile(
-      "bugpoint.program_error_messages", "", ErrorFD, ErrorFilename);
+      "bugpoint.program_error_messages", "", ErrorFilename);
   if (EC) {
     errs() << "Error making unique filename: " << EC.message() << "\n";
     exit(1);
   }
+
   RunProgramWithTimeout(ProgPath, Args, "", ErrorFilename.str(),
                         ErrorFilename.str(), Timeout, MemoryLimit);
   // FIXME: check return code ?
@@ -427,13 +427,14 @@
     pos = CommandLine.find_first_of(delimiters, lastPos);
   }
 
-  CmdPath = sys::FindProgramByName(Command);
-  if (CmdPath.empty()) {
+  auto Path = sys::findProgramByName(Command);
+  if (!Path) {
     Message =
       std::string("Cannot find '") + Command +
-      "' in PATH!\n";
+      "' in PATH: " + Path.getError().message() + "\n";
     return;
   }
+  CmdPath = *Path;
 
   Message = "Found command in: " + CmdPath + "\n";
 }
@@ -907,16 +908,24 @@
 GCC *GCC::create(std::string &Message,
                  const std::string &GCCBinary,
                  const std::vector<std::string> *Args) {
-  std::string GCCPath = sys::FindProgramByName(GCCBinary);
-  if (GCCPath.empty()) {
-    Message = "Cannot find `"+ GCCBinary +"' in PATH!\n";
+  auto GCCPath = sys::findProgramByName(GCCBinary);
+  if (!GCCPath) {
+    Message = "Cannot find `" + GCCBinary + "' in PATH: " +
+              GCCPath.getError().message() + "\n";
     return nullptr;
   }
 
   std::string RemoteClientPath;
-  if (!RemoteClient.empty())
-    RemoteClientPath = sys::FindProgramByName(RemoteClient);
+  if (!RemoteClient.empty()) {
+    auto Path = sys::findProgramByName(RemoteClient);
+    if (!Path) {
+      Message = "Cannot find `" + RemoteClient + "' in PATH: " +
+                Path.getError().message() + "\n";
+      return nullptr;
+    }
+    RemoteClientPath = *Path;
+  }
 
-  Message = "Found gcc: " + GCCPath + "\n";
-  return new GCC(GCCPath, RemoteClientPath, Args);
+  Message = "Found gcc: " + *GCCPath + "\n";
+  return new GCC(*GCCPath, RemoteClientPath, Args);
 }

diff --git a/tools/bugpoint/ToolRunner.h b/tools/bugpoint/ToolRunner.h
index 6e7b95c..454724a 100644
--- a/tools/bugpoint/ToolRunner.h
+++ b/tools/bugpoint/ToolRunner.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef BUGPOINT_TOOLRUNNER_H
-#define BUGPOINT_TOOLRUNNER_H
+#ifndef LLVM_TOOLS_BUGPOINT_TOOLRUNNER_H
+#define LLVM_TOOLS_BUGPOINT_TOOLRUNNER_H
 
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/CommandLine.h"

diff --git a/tools/bugpoint/bugpoint.cpp b/tools/bugpoint/bugpoint.cpp
index c7dae0f..d0bade5 100644
--- a/tools/bugpoint/bugpoint.cpp
+++ b/tools/bugpoint/bugpoint.cpp

@@ -63,10 +63,6 @@
 PassList(cl::desc("Passes available:"), cl::ZeroOrMore);
 
 static cl::opt<bool>
-StandardCompileOpts("std-compile-opts",
-                   cl::desc("Include the standard compile time optimizations"));
-
-static cl::opt<bool>
 StandardLinkOpts("std-link-opts",
                  cl::desc("Include the standard link time optimizations"));
 
@@ -170,17 +166,11 @@
   if (D.addSources(InputFilenames)) return 1;
 
   AddToDriver PM(D);
-  if (StandardCompileOpts) {
-    PassManagerBuilder Builder;
-    Builder.OptLevel = 3;
-    Builder.Inliner = createFunctionInliningPass();
-    Builder.populateModulePassManager(PM);
-  }
 
   if (StandardLinkOpts) {
     PassManagerBuilder Builder;
-    Builder.populateLTOPassManager(PM, /*Internalize=*/true,
-                                   /*RunInliner=*/true);
+    Builder.Inliner = createFunctionInliningPass();
+    Builder.populateLTOPassManager(PM);
   }
 
   if (OptLevelO1 || OptLevelO2 || OptLevelO3) {

diff --git a/tools/gold/CMakeLists.txt b/tools/gold/CMakeLists.txt
index 3864e15..3033010 100644
--- a/tools/gold/CMakeLists.txt
+++ b/tools/gold/CMakeLists.txt

@@ -16,7 +16,9 @@
 
   set(LLVM_LINK_COMPONENTS
      ${LLVM_TARGETS_TO_BUILD}
-     LTO
+     Linker
+     BitWriter
+     IPO
      )
 
   add_llvm_loadable_module(LLVMgold

diff --git a/tools/gold/Makefile b/tools/gold/Makefile
index 593d8ea..aa006b0 100644
--- a/tools/gold/Makefile
+++ b/tools/gold/Makefile

@@ -20,7 +20,7 @@
 # early so we can set up LINK_COMPONENTS before including Makefile.rules
 include $(LEVEL)/Makefile.config
 
-LINK_COMPONENTS := $(TARGETS_TO_BUILD) LTO
+LINK_COMPONENTS := $(TARGETS_TO_BUILD) Linker BitWriter IPO
 
 # Because off_t is used in the public API, the largefile parts are required for
 # ABI compatibility.

diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index b908510..cfda6d2 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp

@@ -13,33 +13,35 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Config/config.h" // plugin-api.h requires HAVE_STDINT_H
-#include "llvm-c/lto.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringSet.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CommandFlags.h"
-#include "llvm/LTO/LTOCodeGenerator.h"
-#include "llvm/LTO/LTOModule.h"
-#include "llvm/Support/Errno.h"
-#include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Object/IRObjectFile.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Host.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Program.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include <cerrno>
-#include <cstdlib>
-#include <cstring>
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 #include <list>
 #include <plugin-api.h>
 #include <system_error>
 #include <vector>
 
-// Support Windows/MinGW crazyness.
-#ifdef _WIN32
-# include <io.h>
-# define lseek _lseek
-# define read _read
-#endif
-
 #ifndef LDPO_PIE
 // FIXME: remove this declaration when we stop maintaining Ubuntu Quantal and
 // Precise and Debian Wheezy (binutils 2.23 is required)
@@ -61,25 +63,24 @@
   abort();
 }
 
-static ld_plugin_add_symbols add_symbols = NULL;
-static ld_plugin_get_symbols get_symbols = NULL;
-static ld_plugin_add_input_file add_input_file = NULL;
-static ld_plugin_set_extra_library_path set_extra_library_path = NULL;
-static ld_plugin_get_view get_view = NULL;
+static ld_plugin_get_input_file get_input_file = nullptr;
+static ld_plugin_release_input_file release_input_file = nullptr;
+static ld_plugin_add_symbols add_symbols = nullptr;
+static ld_plugin_get_symbols get_symbols = nullptr;
+static ld_plugin_add_input_file add_input_file = nullptr;
+static ld_plugin_set_extra_library_path set_extra_library_path = nullptr;
+static ld_plugin_get_view get_view = nullptr;
 static ld_plugin_message message = discard_message;
-static lto_codegen_model output_type = LTO_CODEGEN_PIC_MODEL_STATIC;
+static Reloc::Model RelocationModel = Reloc::Default;
 static std::string output_name = "";
 static std::list<claimed_file> Modules;
 static std::vector<std::string> Cleanup;
-static LTOCodeGenerator *CodeGen = nullptr;
-static StringSet<> CannotBeHidden;
 static llvm::TargetOptions TargetOpts;
 
 namespace options {
-  enum generate_bc { BC_NO, BC_ALSO, BC_ONLY };
+  enum generate_bc { BC_NO, BC_ONLY, BC_SAVE_TEMPS };
   static bool generate_api_file = false;
   static generate_bc generate_bc_file = BC_NO;
-  static std::string bc_path;
   static std::string obj_path;
   static std::string extra_library_path;
   static std::string triple;
@@ -89,11 +90,11 @@
   // as plugin exclusive to pass to the code generator.
   // For example, "generate-api-file" and "as"options are for the plugin
   // use only and will not be passed.
-  static std::vector<std::string> extra;
+  static std::vector<const char *> extra;
 
   static void process_plugin_option(const char* opt_)
   {
-    if (opt_ == NULL)
+    if (opt_ == nullptr)
       return;
     llvm::StringRef opt = opt_;
 
@@ -109,20 +110,16 @@
       obj_path = opt.substr(strlen("obj-path="));
     } else if (opt == "emit-llvm") {
       generate_bc_file = BC_ONLY;
-    } else if (opt == "also-emit-llvm") {
-      generate_bc_file = BC_ALSO;
-    } else if (opt.startswith("also-emit-llvm=")) {
-      llvm::StringRef path = opt.substr(strlen("also-emit-llvm="));
-      generate_bc_file = BC_ALSO;
-      if (!bc_path.empty()) {
-        (*message)(LDPL_WARNING, "Path to the output IL file specified twice. "
-                   "Discarding %s", opt_);
-      } else {
-        bc_path = path;
-      }
+    } else if (opt == "save-temps") {
+      generate_bc_file = BC_SAVE_TEMPS;
     } else {
       // Save this option to pass to the code generator.
-      extra.push_back(opt);
+      // ParseCommandLineOptions() expects argv[0] to be program name. Lazily
+      // add that.
+      if (extra.empty())
+        extra.push_back("LLVMgold");
+
+      extra.push_back(opt_);
     }
   }
 }
@@ -159,14 +156,13 @@
           case LDPO_REL:  // .o
           case LDPO_DYN:  // .so
           case LDPO_PIE:  // position independent executable
-            output_type = LTO_CODEGEN_PIC_MODEL_DYNAMIC;
+            RelocationModel = Reloc::PIC_;
             break;
           case LDPO_EXEC:  // .exe
-            output_type = LTO_CODEGEN_PIC_MODEL_STATIC;
+            RelocationModel = Reloc::Static;
             break;
           default:
-            (*message)(LDPL_ERROR, "Unknown output file type %d",
-                       tv->tv_u.tv_val);
+            message(LDPL_ERROR, "Unknown output file type %d", tv->tv_u.tv_val);
             return LDPS_ERR;
         }
         break;
@@ -177,7 +173,7 @@
         ld_plugin_register_claim_file callback;
         callback = tv->tv_u.tv_register_claim_file;
 
-        if ((*callback)(claim_file_hook) != LDPS_OK)
+        if (callback(claim_file_hook) != LDPS_OK)
           return LDPS_ERR;
 
         registeredClaimFile = true;
@@ -186,7 +182,7 @@
         ld_plugin_register_all_symbols_read callback;
         callback = tv->tv_u.tv_register_all_symbols_read;
 
-        if ((*callback)(all_symbols_read_hook) != LDPS_OK)
+        if (callback(all_symbols_read_hook) != LDPS_OK)
           return LDPS_ERR;
 
         RegisteredAllSymbolsRead = true;
@@ -195,9 +191,15 @@
         ld_plugin_register_cleanup callback;
         callback = tv->tv_u.tv_register_cleanup;
 
-        if ((*callback)(cleanup_hook) != LDPS_OK)
+        if (callback(cleanup_hook) != LDPS_OK)
           return LDPS_ERR;
       } break;
+      case LDPT_GET_INPUT_FILE:
+        get_input_file = tv->tv_u.tv_get_input_file;
+        break;
+      case LDPT_RELEASE_INPUT_FILE:
+        release_input_file = tv->tv_u.tv_release_input_file;
+        break;
       case LDPT_ADD_SYMBOLS:
         add_symbols = tv->tv_u.tv_add_symbols;
         break;
@@ -222,56 +224,50 @@
   }
 
   if (!registeredClaimFile) {
-    (*message)(LDPL_ERROR, "register_claim_file not passed to LLVMgold.");
+    message(LDPL_ERROR, "register_claim_file not passed to LLVMgold.");
     return LDPS_ERR;
   }
   if (!add_symbols) {
-    (*message)(LDPL_ERROR, "add_symbols not passed to LLVMgold.");
+    message(LDPL_ERROR, "add_symbols not passed to LLVMgold.");
     return LDPS_ERR;
   }
 
   if (!RegisteredAllSymbolsRead)
     return LDPS_OK;
 
-  CodeGen = new LTOCodeGenerator();
-
-  // Pass through extra options to the code generator.
-  if (!options::extra.empty()) {
-    for (std::vector<std::string>::iterator it = options::extra.begin();
-         it != options::extra.end(); ++it) {
-      CodeGen->setCodeGenDebugOptions((*it).c_str());
-    }
+  if (!get_input_file) {
+    message(LDPL_ERROR, "get_input_file not passed to LLVMgold.");
+    return LDPS_ERR;
   }
-
-  CodeGen->parseCodeGenDebugOptions();
-  if (MAttrs.size()) {
-    std::string Attrs;
-    for (unsigned I = 0; I < MAttrs.size(); ++I) {
-      if (I > 0)
-        Attrs.append(",");
-      Attrs.append(MAttrs[I]);
-    }
-    CodeGen->setAttr(Attrs.c_str());
+  if (!release_input_file) {
+    message(LDPL_ERROR, "release_input_file not passed to LLVMgold.");
+    return LDPS_ERR;
   }
 
-  TargetOpts = InitTargetOptionsFromCodeGenFlags();
-  CodeGen->setTargetOptions(TargetOpts);
-
   return LDPS_OK;
 }
 
+static const GlobalObject *getBaseObject(const GlobalValue &GV) {
+  if (auto *GA = dyn_cast<GlobalAlias>(&GV))
+    return GA->getBaseObject();
+  return cast<GlobalObject>(&GV);
+}
+
 /// Called by gold to see whether this file is one that our plugin can handle.
 /// We'll try to open it and register all the symbols with add_symbol if
 /// possible.
 static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
                                         int *claimed) {
-  const void *view;
-  std::unique_ptr<MemoryBuffer> buffer;
+  LLVMContext Context;
+  MemoryBufferRef BufferRef;
+  std::unique_ptr<MemoryBuffer> Buffer;
   if (get_view) {
+    const void *view;
     if (get_view(file->handle, &view) != LDPS_OK) {
-      (*message)(LDPL_ERROR, "Failed to get a view of %s", file->name);
+      message(LDPL_ERROR, "Failed to get a view of %s", file->name);
       return LDPS_ERR;
     }
+    BufferRef = MemoryBufferRef(StringRef((const char *)view, file->filesize), "");
   } else {
     int64_t offset = 0;
     // Gold has found what might be IR part-way inside of a file, such as
@@ -283,225 +279,567 @@
         MemoryBuffer::getOpenFileSlice(file->fd, file->name, file->filesize,
                                        offset);
     if (std::error_code EC = BufferOrErr.getError()) {
-      (*message)(LDPL_ERROR, EC.message().c_str());
+      message(LDPL_ERROR, EC.message().c_str());
       return LDPS_ERR;
     }
-    buffer = std::move(BufferOrErr.get());
-    view = buffer->getBufferStart();
+    Buffer = std::move(BufferOrErr.get());
+    BufferRef = Buffer->getMemBufferRef();
   }
 
-  if (!LTOModule::isBitcodeFile(view, file->filesize))
+  ErrorOr<std::unique_ptr<object::IRObjectFile>> ObjOrErr =
+      object::IRObjectFile::createIRObjectFile(BufferRef, Context);
+  std::error_code EC = ObjOrErr.getError();
+  if (EC == BitcodeError::InvalidBitcodeSignature ||
+      EC == object::object_error::invalid_file_type ||
+      EC == object::object_error::bitcode_section_not_found)
     return LDPS_OK;
 
-  std::string Error;
-  LTOModule *M =
-      LTOModule::createFromBuffer(view, file->filesize, TargetOpts, Error);
-  if (!M) {
-    (*message)(LDPL_ERROR,
-               "LLVM gold plugin has failed to create LTO module: %s",
-               Error.c_str());
-    return LDPS_OK;
-  }
-
   *claimed = 1;
+
+  if (EC) {
+    message(LDPL_ERROR, "LLVM gold plugin has failed to create LTO module: %s",
+            EC.message().c_str());
+    return LDPS_ERR;
+  }
+  std::unique_ptr<object::IRObjectFile> Obj = std::move(*ObjOrErr);
+
   Modules.resize(Modules.size() + 1);
   claimed_file &cf = Modules.back();
 
-  if (!options::triple.empty())
-    M->setTargetTriple(options::triple.c_str());
-
   cf.handle = file->handle;
-  unsigned sym_count = M->getSymbolCount();
-  cf.syms.reserve(sym_count);
 
-  for (unsigned i = 0; i != sym_count; ++i) {
-    lto_symbol_attributes attrs = M->getSymbolAttributes(i);
-    if ((attrs & LTO_SYMBOL_SCOPE_MASK) == LTO_SYMBOL_SCOPE_INTERNAL)
+  for (auto &Sym : Obj->symbols()) {
+    uint32_t Symflags = Sym.getFlags();
+    if (!(Symflags & object::BasicSymbolRef::SF_Global))
+      continue;
+
+    if (Symflags & object::BasicSymbolRef::SF_FormatSpecific)
       continue;
 
     cf.syms.push_back(ld_plugin_symbol());
     ld_plugin_symbol &sym = cf.syms.back();
-    sym.name = strdup(M->getSymbolName(i));
-    sym.version = NULL;
+    sym.version = nullptr;
 
-    int scope = attrs & LTO_SYMBOL_SCOPE_MASK;
-    bool CanBeHidden = scope == LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN;
-    if (!CanBeHidden)
-      CannotBeHidden.insert(sym.name);
-    switch (scope) {
-      case LTO_SYMBOL_SCOPE_HIDDEN:
-        sym.visibility = LDPV_HIDDEN;
-        break;
-      case LTO_SYMBOL_SCOPE_PROTECTED:
-        sym.visibility = LDPV_PROTECTED;
-        break;
-      case 0: // extern
-      case LTO_SYMBOL_SCOPE_DEFAULT:
-      case LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN:
+    SmallString<64> Name;
+    {
+      raw_svector_ostream OS(Name);
+      Sym.printName(OS);
+    }
+    sym.name = strdup(Name.c_str());
+
+    const GlobalValue *GV = Obj->getSymbolGV(Sym.getRawDataRefImpl());
+
+    sym.visibility = LDPV_DEFAULT;
+    if (GV) {
+      switch (GV->getVisibility()) {
+      case GlobalValue::DefaultVisibility:
         sym.visibility = LDPV_DEFAULT;
         break;
-      default:
-        (*message)(LDPL_ERROR, "Unknown scope attribute: %d", scope);
-        return LDPS_ERR;
+      case GlobalValue::HiddenVisibility:
+        sym.visibility = LDPV_HIDDEN;
+        break;
+      case GlobalValue::ProtectedVisibility:
+        sym.visibility = LDPV_PROTECTED;
+        break;
+      }
     }
 
-    int definition = attrs & LTO_SYMBOL_DEFINITION_MASK;
-    sym.comdat_key = NULL;
-    switch (definition) {
-      case LTO_SYMBOL_DEFINITION_REGULAR:
-        sym.def = LDPK_DEF;
-        break;
-      case LTO_SYMBOL_DEFINITION_UNDEFINED:
-        sym.def = LDPK_UNDEF;
-        break;
-      case LTO_SYMBOL_DEFINITION_TENTATIVE:
-        sym.def = LDPK_COMMON;
-        break;
-      case LTO_SYMBOL_DEFINITION_WEAK:
-        sym.comdat_key = sym.name;
-        sym.def = LDPK_WEAKDEF;
-        break;
-      case LTO_SYMBOL_DEFINITION_WEAKUNDEF:
+    if (Symflags & object::BasicSymbolRef::SF_Undefined) {
+      sym.def = LDPK_UNDEF;
+      if (GV && GV->hasExternalWeakLinkage())
         sym.def = LDPK_WEAKUNDEF;
-        break;
-      default:
-        (*message)(LDPL_ERROR, "Unknown definition attribute: %d", definition);
-        return LDPS_ERR;
+    } else {
+      sym.def = LDPK_DEF;
+      if (GV) {
+        assert(!GV->hasExternalWeakLinkage() &&
+               !GV->hasAvailableExternallyLinkage() && "Not a declaration!");
+        if (GV->hasCommonLinkage())
+          sym.def = LDPK_COMMON;
+        else if (GV->isWeakForLinker())
+          sym.def = LDPK_WEAKDEF;
+      }
     }
 
     sym.size = 0;
+    sym.comdat_key = nullptr;
+    if (GV) {
+      const GlobalObject *Base = getBaseObject(*GV);
+      if (!Base)
+        message(LDPL_FATAL, "Unable to determine comdat of alias!");
+      const Comdat *C = Base->getComdat();
+      if (C)
+        sym.comdat_key = strdup(C->getName().str().c_str());
+      else if (Base->hasWeakLinkage() || Base->hasLinkOnceLinkage())
+        sym.comdat_key = strdup(sym.name);
+    }
 
     sym.resolution = LDPR_UNKNOWN;
   }
 
-  cf.syms.reserve(cf.syms.size());
-
   if (!cf.syms.empty()) {
-    if ((*add_symbols)(cf.handle, cf.syms.size(), &cf.syms[0]) != LDPS_OK) {
-      (*message)(LDPL_ERROR, "Unable to add symbols!");
+    if (add_symbols(cf.handle, cf.syms.size(), &cf.syms[0]) != LDPS_OK) {
+      message(LDPL_ERROR, "Unable to add symbols!");
       return LDPS_ERR;
     }
   }
 
-  if (CodeGen) {
-    std::string Error;
-    if (!CodeGen->addModule(M, Error)) {
-      (*message)(LDPL_ERROR, "Error linking module: %s", Error.c_str());
-      return LDPS_ERR;
-    }
-  }
-
-  delete M;
-
   return LDPS_OK;
 }
 
-static bool mustPreserve(const claimed_file &F, int i) {
-  if (F.syms[i].resolution == LDPR_PREVAILING_DEF)
-    return true;
-  if (F.syms[i].resolution == LDPR_PREVAILING_DEF_IRONLY_EXP)
-    return CannotBeHidden.count(F.syms[i].name);
-  return false;
+static void keepGlobalValue(GlobalValue &GV,
+                            std::vector<GlobalAlias *> &KeptAliases) {
+  assert(!GV.hasLocalLinkage());
+
+  if (auto *GA = dyn_cast<GlobalAlias>(&GV))
+    KeptAliases.push_back(GA);
+
+  switch (GV.getLinkage()) {
+  default:
+    break;
+  case GlobalValue::LinkOnceAnyLinkage:
+    GV.setLinkage(GlobalValue::WeakAnyLinkage);
+    break;
+  case GlobalValue::LinkOnceODRLinkage:
+    GV.setLinkage(GlobalValue::WeakODRLinkage);
+    break;
+  }
+
+  assert(!GV.isDiscardableIfUnused());
 }
 
-/// all_symbols_read_hook - gold informs us that all symbols have been read.
-/// At this point, we use get_symbols to see if any of our definitions have
-/// been overridden by a native object file. Then, perform optimization and
-/// codegen.
-static ld_plugin_status all_symbols_read_hook(void) {
-  // FIXME: raw_fd_ostream should be able to represent an unopened file.
-  std::unique_ptr<raw_fd_ostream> api_file;
+static void internalize(GlobalValue &GV) {
+  if (GV.isDeclarationForLinker())
+    return; // We get here if there is a matching asm definition.
+  if (!GV.hasLocalLinkage())
+    GV.setLinkage(GlobalValue::InternalLinkage);
+}
 
-  assert(CodeGen);
-
-  if (options::generate_api_file) {
-    std::string Error;
-    api_file.reset(new raw_fd_ostream("apifile.txt", Error, sys::fs::F_None));
-    if (!Error.empty())
-      (*message)(LDPL_FATAL, "Unable to open apifile.txt for writing: %s",
-                 Error.c_str());
+static void drop(GlobalValue &GV) {
+  if (auto *F = dyn_cast<Function>(&GV)) {
+    F->deleteBody();
+    F->setComdat(nullptr); // Should deleteBody do this?
+    return;
   }
 
-  for (std::list<claimed_file>::iterator I = Modules.begin(),
-         E = Modules.end(); I != E; ++I) {
-    if (I->syms.empty())
-      continue;
-    (*get_symbols)(I->handle, I->syms.size(), &I->syms[0]);
-    for (unsigned i = 0, e = I->syms.size(); i != e; i++) {
-      if (mustPreserve(*I, i)) {
-        CodeGen->addMustPreserveSymbol(I->syms[i].name);
+  if (auto *Var = dyn_cast<GlobalVariable>(&GV)) {
+    Var->setInitializer(nullptr);
+    Var->setLinkage(
+        GlobalValue::ExternalLinkage); // Should setInitializer do this?
+    Var->setComdat(nullptr); // and this?
+    return;
+  }
 
-        if (options::generate_api_file)
-          (*api_file) << I->syms[i].name << "\n";
-      }
+  auto &Alias = cast<GlobalAlias>(GV);
+  Module &M = *Alias.getParent();
+  PointerType &Ty = *cast<PointerType>(Alias.getType());
+  GlobalValue::LinkageTypes L = Alias.getLinkage();
+  auto *Var =
+      new GlobalVariable(M, Ty.getElementType(), /*isConstant*/ false, L,
+                         /*Initializer*/ nullptr);
+  Var->takeName(&Alias);
+  Alias.replaceAllUsesWith(Var);
+  Alias.eraseFromParent();
+}
+
+static const char *getResolutionName(ld_plugin_symbol_resolution R) {
+  switch (R) {
+  case LDPR_UNKNOWN:
+    return "UNKNOWN";
+  case LDPR_UNDEF:
+    return "UNDEF";
+  case LDPR_PREVAILING_DEF:
+    return "PREVAILING_DEF";
+  case LDPR_PREVAILING_DEF_IRONLY:
+    return "PREVAILING_DEF_IRONLY";
+  case LDPR_PREEMPTED_REG:
+    return "PREEMPTED_REG";
+  case LDPR_PREEMPTED_IR:
+    return "PREEMPTED_IR";
+  case LDPR_RESOLVED_IR:
+    return "RESOLVED_IR";
+  case LDPR_RESOLVED_EXEC:
+    return "RESOLVED_EXEC";
+  case LDPR_RESOLVED_DYN:
+    return "RESOLVED_DYN";
+  case LDPR_PREVAILING_DEF_IRONLY_EXP:
+    return "PREVAILING_DEF_IRONLY_EXP";
+  }
+  llvm_unreachable("Unknown resolution");
+}
+
+static GlobalObject *makeInternalReplacement(GlobalObject *GO) {
+  Module *M = GO->getParent();
+  GlobalObject *Ret;
+  if (auto *F = dyn_cast<Function>(GO)) {
+    if (F->materialize())
+      message(LDPL_FATAL, "LLVM gold plugin has failed to read a function");
+
+    auto *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
+                                  F->getName(), M);
+
+    ValueToValueMapTy VM;
+    Function::arg_iterator NewI = NewF->arg_begin();
+    for (auto &Arg : F->args()) {
+      NewI->setName(Arg.getName());
+      VM[&Arg] = NewI;
+      ++NewI;
     }
+
+    NewF->getBasicBlockList().splice(NewF->end(), F->getBasicBlockList());
+    for (auto &BB : *NewF) {
+      for (auto &Inst : BB)
+        RemapInstruction(&Inst, VM, RF_IgnoreMissingEntries);
+    }
+
+    Ret = NewF;
+    F->deleteBody();
+  } else {
+    auto *Var = cast<GlobalVariable>(GO);
+    Ret = new GlobalVariable(
+        *M, Var->getType()->getElementType(), Var->isConstant(),
+        Var->getLinkage(), Var->getInitializer(), Var->getName(),
+        nullptr, Var->getThreadLocalMode(), Var->getType()->getAddressSpace(),
+        Var->isExternallyInitialized());
+    Var->setInitializer(nullptr);
+  }
+  Ret->copyAttributesFrom(GO);
+  Ret->setLinkage(GlobalValue::InternalLinkage);
+  Ret->setComdat(GO->getComdat());
+
+  return Ret;
+}
+
+namespace {
+class LocalValueMaterializer : public ValueMaterializer {
+  DenseSet<GlobalValue *> &Dropped;
+
+public:
+  LocalValueMaterializer(DenseSet<GlobalValue *> &Dropped) : Dropped(Dropped) {}
+  Value *materializeValueFor(Value *V) override;
+};
+}
+
+Value *LocalValueMaterializer::materializeValueFor(Value *V) {
+  auto *GV = dyn_cast<GlobalValue>(V);
+  if (!GV)
+    return nullptr;
+  if (!Dropped.count(GV))
+    return nullptr;
+  assert(!isa<GlobalAlias>(GV) && "Found alias point to weak alias.");
+  return makeInternalReplacement(cast<GlobalObject>(GV));
+}
+
+static Constant *mapConstantToLocalCopy(Constant *C, ValueToValueMapTy &VM,
+                                        LocalValueMaterializer *Materializer) {
+  return MapValue(C, VM, RF_IgnoreMissingEntries, nullptr, Materializer);
+}
+
+static std::unique_ptr<Module>
+getModuleForFile(LLVMContext &Context, claimed_file &F, raw_fd_ostream *ApiFile,
+                 StringSet<> &Internalize, StringSet<> &Maybe) {
+  ld_plugin_input_file File;
+  if (get_input_file(F.handle, &File) != LDPS_OK)
+    message(LDPL_FATAL, "Failed to get file information");
+
+  if (get_symbols(F.handle, F.syms.size(), &F.syms[0]) != LDPS_OK)
+    message(LDPL_FATAL, "Failed to get symbol information");
+
+  const void *View;
+  if (get_view(F.handle, &View) != LDPS_OK)
+    message(LDPL_FATAL, "Failed to get a view of file");
+
+  llvm::ErrorOr<MemoryBufferRef> MBOrErr =
+      object::IRObjectFile::findBitcodeInMemBuffer(
+          MemoryBufferRef(StringRef((const char *)View, File.filesize), ""));
+  if (std::error_code EC = MBOrErr.getError())
+    message(LDPL_FATAL, "Could not read bitcode from file : %s",
+            EC.message().c_str());
+
+  std::unique_ptr<MemoryBuffer> Buffer =
+      MemoryBuffer::getMemBuffer(MBOrErr->getBuffer(), "", false);
+
+  if (release_input_file(F.handle) != LDPS_OK)
+    message(LDPL_FATAL, "Failed to release file information");
+
+  ErrorOr<Module *> MOrErr = getLazyBitcodeModule(std::move(Buffer), Context);
+
+  if (std::error_code EC = MOrErr.getError())
+    message(LDPL_FATAL, "Could not read bitcode from file : %s",
+            EC.message().c_str());
+
+  std::unique_ptr<Module> M(MOrErr.get());
+
+  SmallPtrSet<GlobalValue *, 8> Used;
+  collectUsedGlobalVariables(*M, Used, /*CompilerUsed*/ false);
+
+  DenseSet<GlobalValue *> Drop;
+  std::vector<GlobalAlias *> KeptAliases;
+  for (ld_plugin_symbol &Sym : F.syms) {
+    ld_plugin_symbol_resolution Resolution =
+        (ld_plugin_symbol_resolution)Sym.resolution;
+
+    if (options::generate_api_file)
+      *ApiFile << Sym.name << ' ' << getResolutionName(Resolution) << '\n';
+
+    GlobalValue *GV = M->getNamedValue(Sym.name);
+    if (!GV)
+      continue; // Asm symbol.
+
+    if (Resolution != LDPR_PREVAILING_DEF_IRONLY && GV->hasCommonLinkage()) {
+      // Common linkage is special. There is no single symbol that wins the
+      // resolution. Instead we have to collect the maximum alignment and size.
+      // The IR linker does that for us if we just pass it every common GV.
+      // We still have to keep track of LDPR_PREVAILING_DEF_IRONLY so we
+      // internalize once the IR linker has done its job.
+      continue;
+    }
+
+    switch (Resolution) {
+    case LDPR_UNKNOWN:
+      llvm_unreachable("Unexpected resolution");
+
+    case LDPR_RESOLVED_IR:
+    case LDPR_RESOLVED_EXEC:
+    case LDPR_RESOLVED_DYN:
+    case LDPR_UNDEF:
+      assert(GV->isDeclarationForLinker());
+      break;
+
+    case LDPR_PREVAILING_DEF_IRONLY: {
+      keepGlobalValue(*GV, KeptAliases);
+      if (!Used.count(GV)) {
+        // Since we use the regular lib/Linker, we cannot just internalize GV
+        // now or it will not be copied to the merged module. Instead we force
+        // it to be copied and then internalize it.
+        Internalize.insert(Sym.name);
+      }
+      break;
+    }
+
+    case LDPR_PREVAILING_DEF:
+      keepGlobalValue(*GV, KeptAliases);
+      break;
+
+    case LDPR_PREEMPTED_IR:
+      // Gold might have selected a linkonce_odr and preempted a weak_odr.
+      // In that case we have to make sure we don't end up internalizing it.
+      if (!GV->isDiscardableIfUnused())
+        Maybe.erase(Sym.name);
+
+      // fall-through
+    case LDPR_PREEMPTED_REG:
+      Drop.insert(GV);
+      break;
+
+    case LDPR_PREVAILING_DEF_IRONLY_EXP: {
+      // We can only check for address uses after we merge the modules. The
+      // reason is that this GV might have a copy in another module
+      // and in that module the address might be significant, but that
+      // copy will be LDPR_PREEMPTED_IR.
+      if (GV->hasLinkOnceODRLinkage())
+        Maybe.insert(Sym.name);
+      keepGlobalValue(*GV, KeptAliases);
+      break;
+    }
+    }
+
+    free(Sym.name);
+    free(Sym.comdat_key);
+    Sym.name = nullptr;
+    Sym.comdat_key = nullptr;
   }
 
-  CodeGen->setCodePICModel(output_type);
-  CodeGen->setDebugInfo(LTO_DEBUG_MODEL_DWARF);
-  if (!options::mcpu.empty())
-    CodeGen->setCpu(options::mcpu.c_str());
+  ValueToValueMapTy VM;
+  LocalValueMaterializer Materializer(Drop);
+  for (GlobalAlias *GA : KeptAliases) {
+    // Gold told us to keep GA. It is possible that a GV used in the aliasee
+    // expression is being dropped. If that is the case, that GV must be copied.
+    Constant *Aliasee = GA->getAliasee();
+    Constant *Replacement = mapConstantToLocalCopy(Aliasee, VM, &Materializer);
+    if (Aliasee != Replacement)
+      GA->setAliasee(Replacement);
+  }
+
+  for (auto *GV : Drop)
+    drop(*GV);
+
+  return M;
+}
+
+static void runLTOPasses(Module &M, TargetMachine &TM) {
+  PassManager passes;
+  PassManagerBuilder PMB;
+  PMB.LibraryInfo = new TargetLibraryInfo(Triple(TM.getTargetTriple()));
+  PMB.Inliner = createFunctionInliningPass();
+  PMB.VerifyInput = true;
+  PMB.VerifyOutput = true;
+  PMB.LoopVectorize = true;
+  PMB.SLPVectorize = true;
+  PMB.populateLTOPassManager(passes, &TM);
+  passes.run(M);
+}
+
+static void saveBCFile(StringRef Path, Module &M) {
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::F_None);
+  if (EC)
+    message(LDPL_FATAL, "Failed to write the output file.");
+  WriteBitcodeToFile(&M, OS);
+}
+
+static void codegen(Module &M) {
+  const std::string &TripleStr = M.getTargetTriple();
+  Triple TheTriple(TripleStr);
+
+  std::string ErrMsg;
+  const Target *TheTarget = TargetRegistry::lookupTarget(TripleStr, ErrMsg);
+  if (!TheTarget)
+    message(LDPL_FATAL, "Target not found: %s", ErrMsg.c_str());
+
+  if (unsigned NumOpts = options::extra.size())
+    cl::ParseCommandLineOptions(NumOpts, &options::extra[0]);
+
+  SubtargetFeatures Features;
+  Features.getDefaultSubtargetFeatures(TheTriple);
+  for (const std::string &A : MAttrs)
+    Features.AddFeature(A);
+
+  TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
+  std::unique_ptr<TargetMachine> TM(TheTarget->createTargetMachine(
+      TripleStr, options::mcpu, Features.getString(), Options, RelocationModel,
+      CodeModel::Default, CodeGenOpt::Aggressive));
+
+  runLTOPasses(M, *TM);
+
+  if (options::generate_bc_file == options::BC_SAVE_TEMPS)
+    saveBCFile(output_name + ".opt.bc", M);
+
+  PassManager CodeGenPasses;
+  CodeGenPasses.add(new DataLayoutPass());
+
+  SmallString<128> Filename;
+  int FD;
+  if (options::obj_path.empty()) {
+    std::error_code EC =
+        sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename);
+    if (EC)
+      message(LDPL_FATAL, "Could not create temporary file: %s",
+              EC.message().c_str());
+  } else {
+    Filename = options::obj_path;
+    std::error_code EC =
+        sys::fs::openFileForWrite(Filename.c_str(), FD, sys::fs::F_None);
+    if (EC)
+      message(LDPL_FATAL, "Could not open file: %s", EC.message().c_str());
+  }
+
+  {
+    raw_fd_ostream OS(FD, true);
+    formatted_raw_ostream FOS(OS);
+
+    if (TM->addPassesToEmitFile(CodeGenPasses, FOS,
+                                TargetMachine::CGFT_ObjectFile))
+      message(LDPL_FATAL, "Failed to setup codegen");
+    CodeGenPasses.run(M);
+  }
+
+  if (add_input_file(Filename.c_str()) != LDPS_OK)
+    message(LDPL_FATAL,
+            "Unable to add .o file to the link. File left behind in: %s",
+            Filename.c_str());
+
+  if (options::obj_path.empty())
+    Cleanup.push_back(Filename.c_str());
+}
+
+/// gold informs us that all symbols have been read. At this point, we use
+/// get_symbols to see if any of our definitions have been overridden by a
+/// native object file. Then, perform optimization and codegen.
+static ld_plugin_status allSymbolsReadHook(raw_fd_ostream *ApiFile) {
+  if (Modules.empty())
+    return LDPS_OK;
+
+  LLVMContext Context;
+  std::unique_ptr<Module> Combined(new Module("ld-temp.o", Context));
+  Linker L(Combined.get());
+
+  std::string DefaultTriple = sys::getDefaultTargetTriple();
+
+  StringSet<> Internalize;
+  StringSet<> Maybe;
+  for (claimed_file &F : Modules) {
+    std::unique_ptr<Module> M =
+        getModuleForFile(Context, F, ApiFile, Internalize, Maybe);
+    if (!options::triple.empty())
+      M->setTargetTriple(options::triple.c_str());
+    else if (M->getTargetTriple().empty()) {
+      M->setTargetTriple(DefaultTriple);
+    }
+
+    if (L.linkInModule(M.get()))
+      message(LDPL_FATAL, "Failed to link module");
+  }
+
+  for (const auto &Name : Internalize) {
+    GlobalValue *GV = Combined->getNamedValue(Name.first());
+    if (GV)
+      internalize(*GV);
+  }
+
+  for (const auto &Name : Maybe) {
+    GlobalValue *GV = Combined->getNamedValue(Name.first());
+    if (!GV)
+      continue;
+    GV->setLinkage(GlobalValue::LinkOnceODRLinkage);
+    if (canBeOmittedFromSymbolTable(GV))
+      internalize(*GV);
+  }
 
   if (options::generate_bc_file != options::BC_NO) {
     std::string path;
     if (options::generate_bc_file == options::BC_ONLY)
       path = output_name;
-    else if (!options::bc_path.empty())
-      path = options::bc_path;
     else
       path = output_name + ".bc";
-    std::string Error;
-    if (!CodeGen->writeMergedModules(path.c_str(), Error))
-      (*message)(LDPL_FATAL, "Failed to write the output file.");
-    if (options::generate_bc_file == options::BC_ONLY) {
-      delete CodeGen;
-      exit(0);
-    }
+    saveBCFile(path, *L.getModule());
+    if (options::generate_bc_file == options::BC_ONLY)
+      return LDPS_OK;
   }
 
-  std::string ObjPath;
-  {
-    const char *Temp;
-    std::string Error;
-    if (!CodeGen->compile_to_file(&Temp, /*DisableOpt*/ false, /*DisableInline*/
-                                  false, /*DisableGVNLoadPRE*/ false, Error))
-      (*message)(LDPL_ERROR, "Could not produce a combined object file\n");
-    ObjPath = Temp;
-  }
-
-  delete CodeGen;
-  for (std::list<claimed_file>::iterator I = Modules.begin(),
-         E = Modules.end(); I != E; ++I) {
-    for (unsigned i = 0; i != I->syms.size(); ++i) {
-      ld_plugin_symbol &sym = I->syms[i];
-      free(sym.name);
-    }
-  }
-
-  if ((*add_input_file)(ObjPath.c_str()) != LDPS_OK) {
-    (*message)(LDPL_ERROR, "Unable to add .o file to the link.");
-    (*message)(LDPL_ERROR, "File left behind in: %s", ObjPath.c_str());
-    return LDPS_ERR;
-  }
+  codegen(*L.getModule());
 
   if (!options::extra_library_path.empty() &&
-      set_extra_library_path(options::extra_library_path.c_str()) != LDPS_OK) {
-    (*message)(LDPL_ERROR, "Unable to set the extra library path.");
-    return LDPS_ERR;
-  }
-
-  if (options::obj_path.empty())
-    Cleanup.push_back(ObjPath);
+      set_extra_library_path(options::extra_library_path.c_str()) != LDPS_OK)
+    message(LDPL_FATAL, "Unable to set the extra library path.");
 
   return LDPS_OK;
 }
 
-static ld_plugin_status cleanup_hook(void) {
-  for (int i = 0, e = Cleanup.size(); i != e; ++i) {
-    std::error_code EC = sys::fs::remove(Cleanup[i]);
+static ld_plugin_status all_symbols_read_hook(void) {
+  ld_plugin_status Ret;
+  if (!options::generate_api_file) {
+    Ret = allSymbolsReadHook(nullptr);
+  } else {
+    std::error_code EC;
+    raw_fd_ostream ApiFile("apifile.txt", EC, sys::fs::F_None);
     if (EC)
-      (*message)(LDPL_ERROR, "Failed to delete '%s': %s", Cleanup[i].c_str(),
-                 EC.message().c_str());
+      message(LDPL_FATAL, "Unable to open apifile.txt for writing: %s",
+              EC.message().c_str());
+    Ret = allSymbolsReadHook(&ApiFile);
+  }
+
+  if (options::generate_bc_file == options::BC_ONLY)
+    exit(0);
+
+  return Ret;
+}
+
+static ld_plugin_status cleanup_hook(void) {
+  for (std::string &Name : Cleanup) {
+    std::error_code EC = sys::fs::remove(Name);
+    if (EC)
+      message(LDPL_ERROR, "Failed to delete '%s': %s", Name.c_str(),
+              EC.message().c_str());
   }
 
   return LDPS_OK;

diff --git a/tools/llc/Android.mk b/tools/llc/Android.mk
index a25cf5c..a98cd8b 100644
--- a/tools/llc/Android.mk
+++ b/tools/llc/Android.mk

@@ -48,6 +48,7 @@
   libLLVMipo \
   libLLVMipa \
   libLLVMLinker \
+  libLLVMMCDisassembler \
   libLLVMMC \
   libLLVMMCParser \
   libLLVMScalarOpts \

diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 09ff461..fe4d9ac 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp

@@ -41,6 +41,7 @@
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <memory>
 using namespace llvm;
 
@@ -94,23 +95,6 @@
 
 static int compileModule(char **, LLVMContext &);
 
-// GetFileNameRoot - Helper function to get the basename of a filename.
-static inline std::string
-GetFileNameRoot(const std::string &InputFilename) {
-  std::string IFN = InputFilename;
-  std::string outputFilename;
-  int Len = IFN.length();
-  if ((Len > 2) &&
-      IFN[Len-3] == '.' &&
-      ((IFN[Len-2] == 'b' && IFN[Len-1] == 'c') ||
-       (IFN[Len-2] == 'l' && IFN[Len-1] == 'l'))) {
-    outputFilename = std::string(IFN.begin(), IFN.end()-3); // s/.bc/.s/
-  } else {
-    outputFilename = IFN;
-  }
-  return outputFilename;
-}
-
 static tool_output_file *GetOutputStream(const char *TargetName,
                                          Triple::OSType OS,
                                          const char *ProgName) {
@@ -119,7 +103,12 @@
     if (InputFilename == "-")
       OutputFilename = "-";
     else {
-      OutputFilename = GetFileNameRoot(InputFilename);
+      // If InputFilename ends in .bc or .ll, remove it.
+      StringRef IFN = InputFilename;
+      if (IFN.endswith(".bc") || IFN.endswith(".ll"))
+        OutputFilename = IFN.drop_back(3);
+      else
+        OutputFilename = IFN;
 
       switch (FileType) {
       case TargetMachine::CGFT_AssemblyFile:
@@ -158,14 +147,13 @@
   }
 
   // Open the file.
-  std::string error;
+  std::error_code EC;
   sys::fs::OpenFlags OpenFlags = sys::fs::F_None;
   if (!Binary)
     OpenFlags |= sys::fs::F_Text;
-  tool_output_file *FDOut = new tool_output_file(OutputFilename.c_str(), error,
-                                                 OpenFlags);
-  if (!error.empty()) {
-    errs() << error << '\n';
+  tool_output_file *FDOut = new tool_output_file(OutputFilename, EC, OpenFlags);
+  if (EC) {
+    errs() << EC.message() << '\n';
     delete FDOut;
     return nullptr;
   }
@@ -231,7 +219,7 @@
 
   // If user just wants to list available options, skip module loading
   if (!SkipModule) {
-    M.reset(ParseIRFile(InputFilename, Err, Context));
+    M = parseIRFile(InputFilename, Err, Context);
     mod = M.get();
     if (mod == nullptr) {
       Err.print(argv[0], errs());
@@ -317,9 +305,9 @@
   PM.add(TLI);
 
   // Add the target data from the target machine, if it exists, or the module.
-  if (const DataLayout *DL = Target.getDataLayout())
+  if (const DataLayout *DL = Target.getSubtargetImpl()->getDataLayout())
     mod->setDataLayout(DL);
-  PM.add(new DataLayoutPass(mod));
+  PM.add(new DataLayoutPass());
 
   if (RelaxAll.getNumOccurrences() > 0 &&
       FileType != TargetMachine::CGFT_ObjectFile)

diff --git a/tools/lli/Android.mk b/tools/lli/Android.mk
index f550f93..1b09102 100644
--- a/tools/lli/Android.mk
+++ b/tools/lli/Android.mk

@@ -44,7 +44,6 @@
   libLLVMX86Disassembler \
   libLLVMAsmPrinter \
   libLLVMSelectionDAG \
-  libLLVMJIT \
   libLLVMCodeGen \
   libLLVMInstrumentation \
   libLLVMExecutionEngine \

diff --git a/tools/lli/CMakeLists.txt b/tools/lli/CMakeLists.txt
index 731b61a..3610d76 100644
--- a/tools/lli/CMakeLists.txt
+++ b/tools/lli/CMakeLists.txt

@@ -7,8 +7,9 @@
   IRReader
   Instrumentation
   Interpreter
-  JIT
+  MC
   MCJIT
+  Object
   SelectionDAG
   Support
   native

diff --git a/tools/lli/LLVMBuild.txt b/tools/lli/LLVMBuild.txt
index aab2a20..4c14c47 100644
--- a/tools/lli/LLVMBuild.txt
+++ b/tools/lli/LLVMBuild.txt

@@ -22,4 +22,4 @@
 type = Tool
 name = lli
 parent = Tools
-required_libraries = AsmParser BitReader IRReader Instrumentation Interpreter JIT MCJIT NativeCodeGen SelectionDAG Native
+required_libraries = AsmParser BitReader IRReader Instrumentation Interpreter MCJIT NativeCodeGen SelectionDAG Native

diff --git a/tools/lli/Makefile b/tools/lli/Makefile
index eca5d83..94d6f06 100644
--- a/tools/lli/Makefile
+++ b/tools/lli/Makefile

@@ -14,7 +14,7 @@
 
 include $(LEVEL)/Makefile.config
 
-LINK_COMPONENTS := mcjit jit instrumentation interpreter nativecodegen bitreader asmparser irreader selectiondag native
+LINK_COMPONENTS := mcjit instrumentation interpreter nativecodegen bitreader asmparser irreader selectiondag native
 
 # If Intel JIT Events support is confiured, link against the LLVM Intel JIT
 # Events interface library

diff --git a/tools/lli/RPCChannel.h b/tools/lli/RPCChannel.h
index 2d8c708..ebd3c65 100644
--- a/tools/lli/RPCChannel.h
+++ b/tools/lli/RPCChannel.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLI_RPCCHANNEL_H
-#define LLI_RPCCHANNEL_H
+#ifndef LLVM_TOOLS_LLI_RPCCHANNEL_H
+#define LLVM_TOOLS_LLI_RPCCHANNEL_H
 
 #include <stdlib.h>
 #include <string>
@@ -46,4 +46,4 @@
 
 } // end namespace llvm
 
-#endif // LLI_RPCCHANNEL_H
+#endif

diff --git a/tools/lli/RemoteMemoryManager.cpp b/tools/lli/RemoteMemoryManager.cpp
index 4816517..5a135ea 100644
--- a/tools/lli/RemoteMemoryManager.cpp
+++ b/tools/lli/RemoteMemoryManager.cpp

@@ -172,36 +172,3 @@
 
   return false;
 }
-
-void RemoteMemoryManager::setMemoryWritable() { llvm_unreachable("Unexpected!"); }
-void RemoteMemoryManager::setMemoryExecutable() { llvm_unreachable("Unexpected!"); }
-void RemoteMemoryManager::setPoisonMemory(bool poison) { llvm_unreachable("Unexpected!"); }
-void RemoteMemoryManager::AllocateGOT() { llvm_unreachable("Unexpected!"); }
-uint8_t *RemoteMemoryManager::getGOTBase() const {
-  llvm_unreachable("Unexpected!");
-  return nullptr;
-}
-uint8_t *RemoteMemoryManager::startFunctionBody(const Function *F, uintptr_t &ActualSize){
-  llvm_unreachable("Unexpected!");
-  return nullptr;
-}
-uint8_t *RemoteMemoryManager::allocateStub(const GlobalValue* F, unsigned StubSize,
-                                              unsigned Alignment) {
-  llvm_unreachable("Unexpected!");
-  return nullptr;
-}
-void RemoteMemoryManager::endFunctionBody(const Function *F, uint8_t *FunctionStart,
-                                             uint8_t *FunctionEnd) {
-  llvm_unreachable("Unexpected!");
-}
-uint8_t *RemoteMemoryManager::allocateSpace(intptr_t Size, unsigned Alignment) {
-  llvm_unreachable("Unexpected!");
-  return nullptr;
-}
-uint8_t *RemoteMemoryManager::allocateGlobal(uintptr_t Size, unsigned Alignment) {
-  llvm_unreachable("Unexpected!");
-  return nullptr;
-}
-void RemoteMemoryManager::deallocateFunctionBody(void *Body) {
-  llvm_unreachable("Unexpected!");
-}

diff --git a/tools/lli/RemoteMemoryManager.h b/tools/lli/RemoteMemoryManager.h
index cf5d7c6..0bdb4e2 100644
--- a/tools/lli/RemoteMemoryManager.h
+++ b/tools/lli/RemoteMemoryManager.h

@@ -12,20 +12,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef REMOTEMEMORYMANAGER_H
-#define REMOTEMEMORYMANAGER_H
+#ifndef LLVM_TOOLS_LLI_REMOTEMEMORYMANAGER_H
+#define LLVM_TOOLS_LLI_REMOTEMEMORYMANAGER_H
 
 #include "RemoteTarget.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Memory.h"
 #include <utility>
 
 namespace llvm {
 
-class RemoteMemoryManager : public JITMemoryManager {
+class RemoteMemoryManager : public RTDyldMemoryManager {
 public:
   // Notice that this structure takes ownership of the memory allocated.
   struct Allocation {
@@ -93,22 +93,6 @@
 
   // This is a non-interface function used by lli
   void setRemoteTarget(RemoteTarget *T) { Target = T; }
-
-  // The following obsolete JITMemoryManager calls are stubbed out for
-  // this model.
-  void setMemoryWritable() override;
-  void setMemoryExecutable() override;
-  void setPoisonMemory(bool poison) override;
-  void AllocateGOT() override;
-  uint8_t *getGOTBase() const override;
-  uint8_t *startFunctionBody(const Function *F, uintptr_t &ActualSize) override;
-  uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
-                        unsigned Alignment) override;
-  void endFunctionBody(const Function *F, uint8_t *FunctionStart,
-                       uint8_t *FunctionEnd) override;
-  uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) override;
-  uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment) override;
-  void deallocateFunctionBody(void *Body) override;
 };
 
 } // end namespace llvm

diff --git a/tools/lli/RemoteTarget.h b/tools/lli/RemoteTarget.h
index 73e8ae2..ee758a2 100644
--- a/tools/lli/RemoteTarget.h
+++ b/tools/lli/RemoteTarget.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef REMOTEPROCESS_H
-#define REMOTEPROCESS_H
+#ifndef LLVM_TOOLS_LLI_REMOTETARGET_H
+#define LLVM_TOOLS_LLI_REMOTETARGET_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"

diff --git a/tools/lli/RemoteTargetExternal.h b/tools/lli/RemoteTargetExternal.h
index f87fc61..bb621f5 100644
--- a/tools/lli/RemoteTargetExternal.h
+++ b/tools/lli/RemoteTargetExternal.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLI_REMOTETARGETEXTERNAL_H
-#define LLI_REMOTETARGETEXTERNAL_H
+#ifndef LLVM_TOOLS_LLI_REMOTETARGETEXTERNAL_H
+#define LLVM_TOOLS_LLI_REMOTETARGETEXTERNAL_H
 
 #include "RPCChannel.h"
 #include "RemoteTarget.h"
@@ -140,4 +140,4 @@
 
 } // end namespace llvm
 
-#endif // LLI_REMOTETARGETEXTERNAL_H
+#endif

diff --git a/tools/lli/RemoteTargetMessage.h b/tools/lli/RemoteTargetMessage.h
index cb934a1..c210e4b 100644
--- a/tools/lli/RemoteTargetMessage.h
+++ b/tools/lli/RemoteTargetMessage.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLI_REMOTETARGETMESSAGE_H
-#define LLI_REMOTETARGETMESSAGE_H
+#ifndef LLVM_TOOLS_LLI_REMOTETARGETMESSAGE_H
+#define LLVM_TOOLS_LLI_REMOTETARGETMESSAGE_H
 
 namespace llvm {
 

diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 48828c1..276740b 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp

@@ -22,9 +22,7 @@
 #include "llvm/CodeGen/LinkAllCodegenComponents.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
 #include "llvm/ExecutionEngine/Interpreter.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/ExecutionEngine/ObjectCache.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
@@ -76,10 +74,6 @@
                                  cl::desc("Force interpretation: disable JIT"),
                                  cl::init(false));
 
-  cl::opt<bool> UseMCJIT(
-    "use-mcjit", cl::desc("Enable use of the MC-based JIT (if available)"),
-    cl::init(false));
-
   cl::opt<bool> DebugIR(
     "debug-ir", cl::desc("Generate debug information to allow debugging IR."),
     cl::init(false));
@@ -263,23 +257,23 @@
   }
   virtual ~LLIObjectCache() {}
 
-  void notifyObjectCompiled(const Module *M, const MemoryBuffer *Obj) override {
+  void notifyObjectCompiled(const Module *M, MemoryBufferRef Obj) override {
     const std::string ModuleID = M->getModuleIdentifier();
     std::string CacheName;
     if (!getCacheFilename(ModuleID, CacheName))
       return;
-    std::string errStr;
     if (!CacheDir.empty()) { // Create user-defined cache dir.
       SmallString<128> dir(CacheName);
       sys::path::remove_filename(dir);
       sys::fs::create_directories(Twine(dir));
     }
-    raw_fd_ostream outfile(CacheName.c_str(), errStr, sys::fs::F_None);
-    outfile.write(Obj->getBufferStart(), Obj->getBufferSize());
+    std::error_code EC;
+    raw_fd_ostream outfile(CacheName, EC, sys::fs::F_None);
+    outfile.write(Obj.getBufferStart(), Obj.getBufferSize());
     outfile.close();
   }
 
-  MemoryBuffer* getObject(const Module* M) override {
+  std::unique_ptr<MemoryBuffer> getObject(const Module* M) override {
     const std::string ModuleID = M->getModuleIdentifier();
     std::string CacheName;
     if (!getCacheFilename(ModuleID, CacheName))
@@ -345,7 +339,7 @@
   Triple TargetTriple(TargetTripleStr);
 
   // Create a new module.
-  Module *M = new Module("CygMingHelper", Context);
+  std::unique_ptr<Module> M = make_unique<Module>("CygMingHelper", Context);
   M->setTargetTriple(TargetTripleStr);
 
   // Create an empty function named "__main".
@@ -353,11 +347,11 @@
   if (TargetTriple.isArch64Bit()) {
     Result = Function::Create(
       TypeBuilder<int64_t(void), false>::get(Context),
-      GlobalValue::ExternalLinkage, "__main", M);
+      GlobalValue::ExternalLinkage, "__main", M.get());
   } else {
     Result = Function::Create(
       TypeBuilder<int32_t(void), false>::get(Context),
-      GlobalValue::ExternalLinkage, "__main", M);
+      GlobalValue::ExternalLinkage, "__main", M.get());
   }
   BasicBlock *BB = BasicBlock::Create(Context, "__main", Result);
   Builder.SetInsertPoint(BB);
@@ -369,7 +363,7 @@
   Builder.CreateRet(ReturnVal);
 
   // Add this new module to the ExecutionEngine.
-  EE->addModule(M);
+  EE->addModule(std::move(M));
 }
 
 
@@ -398,19 +392,17 @@
 
   // Load the bitcode...
   SMDiagnostic Err;
-  Module *Mod = ParseIRFile(InputFile, Err, Context);
+  std::unique_ptr<Module> Owner = parseIRFile(InputFile, Err, Context);
+  Module *Mod = Owner.get();
   if (!Mod) {
     Err.print(argv[0], errs());
     return 1;
   }
 
   if (EnableCacheManager) {
-    if (UseMCJIT) {
-      std::string CacheName("file:");
-      CacheName.append(InputFile);
-      Mod->setModuleIdentifier(CacheName);
-    } else
-      errs() << "warning: -enable-cache-manager can only be used with MCJIT.";
+    std::string CacheName("file:");
+    CacheName.append(InputFile);
+    Mod->setModuleIdentifier(CacheName);
   }
 
   // If not jitting lazily, load the whole bitcode file eagerly too.
@@ -423,18 +415,12 @@
   }
 
   if (DebugIR) {
-    if (!UseMCJIT) {
-      errs() << "warning: -debug-ir used without -use-mcjit. Only partial debug"
-        << " information will be emitted by the non-MC JIT engine. To see full"
-        << " source debug information, enable the flag '-use-mcjit'.\n";
-
-    }
     ModulePass *DebugIRPass = createDebugIRPass();
     DebugIRPass->runOnModule(*Mod);
   }
 
   std::string ErrorMsg;
-  EngineBuilder builder(Mod);
+  EngineBuilder builder(std::move(Owner));
   builder.setMArch(MArch);
   builder.setMCPU(MCPU);
   builder.setMAttrs(MAttrs);
@@ -451,20 +437,16 @@
 
   // Enable MCJIT if desired.
   RTDyldMemoryManager *RTDyldMM = nullptr;
-  if (UseMCJIT && !ForceInterpreter) {
-    builder.setUseMCJIT(true);
+  if (!ForceInterpreter) {
     if (RemoteMCJIT)
       RTDyldMM = new RemoteMemoryManager();
     else
       RTDyldMM = new SectionMemoryManager();
     builder.setMCJITMemoryManager(RTDyldMM);
-  } else {
-    if (RemoteMCJIT) {
-      errs() << "error: Remote process execution requires -use-mcjit\n";
-      exit(1);
-    }
-    builder.setJITMemoryManager(ForceInterpreter ? nullptr :
-                                JITMemoryManager::CreateDefaultMemManager());
+  } else if (RemoteMCJIT) {
+    errs() << "error: Remote process execution does not work with the "
+              "interpreter.\n";
+    exit(1);
   }
 
   CodeGenOpt::Level OLvl = CodeGenOpt::Default;
@@ -511,46 +493,50 @@
 
   // Load any additional modules specified on the command line.
   for (unsigned i = 0, e = ExtraModules.size(); i != e; ++i) {
-    Module *XMod = ParseIRFile(ExtraModules[i], Err, Context);
+    std::unique_ptr<Module> XMod = parseIRFile(ExtraModules[i], Err, Context);
     if (!XMod) {
       Err.print(argv[0], errs());
       return 1;
     }
     if (EnableCacheManager) {
-      if (UseMCJIT) {
-        std::string CacheName("file:");
-        CacheName.append(ExtraModules[i]);
-        XMod->setModuleIdentifier(CacheName);
-      }
-      // else, we already printed a warning above.
+      std::string CacheName("file:");
+      CacheName.append(ExtraModules[i]);
+      XMod->setModuleIdentifier(CacheName);
     }
-    EE->addModule(XMod);
+    EE->addModule(std::move(XMod));
   }
 
   for (unsigned i = 0, e = ExtraObjects.size(); i != e; ++i) {
-    ErrorOr<object::ObjectFile *> Obj =
+    ErrorOr<object::OwningBinary<object::ObjectFile>> Obj =
         object::ObjectFile::createObjectFile(ExtraObjects[i]);
     if (!Obj) {
       Err.print(argv[0], errs());
       return 1;
     }
-    EE->addObjectFile(std::unique_ptr<object::ObjectFile>(Obj.get()));
+    object::OwningBinary<object::ObjectFile> &O = Obj.get();
+    EE->addObjectFile(std::move(O));
   }
 
   for (unsigned i = 0, e = ExtraArchives.size(); i != e; ++i) {
-    ErrorOr<std::unique_ptr<MemoryBuffer>> ArBuf =
+    ErrorOr<std::unique_ptr<MemoryBuffer>> ArBufOrErr =
         MemoryBuffer::getFileOrSTDIN(ExtraArchives[i]);
-    if (!ArBuf) {
+    if (!ArBufOrErr) {
       Err.print(argv[0], errs());
       return 1;
     }
-    std::error_code EC;
-    object::Archive *Ar = new object::Archive(std::move(ArBuf.get()), EC);
-    if (EC || !Ar) {
-      Err.print(argv[0], errs());
+    std::unique_ptr<MemoryBuffer> &ArBuf = ArBufOrErr.get();
+
+    ErrorOr<std::unique_ptr<object::Archive>> ArOrErr =
+        object::Archive::create(ArBuf->getMemBufferRef());
+    if (std::error_code EC = ArOrErr.getError()) {
+      errs() << EC.message();
       return 1;
     }
-    EE->addArchive(Ar);
+    std::unique_ptr<object::Archive> &Ar = ArOrErr.get();
+
+    object::OwningBinary<object::Archive> OB(std::move(Ar), std::move(ArBuf));
+
+    EE->addArchive(std::move(OB));
   }
 
   // If the target is Cygwin/MingW and we are generating remote code, we
@@ -610,20 +596,12 @@
                                                       NULL);
 
     // Run static constructors.
-    if (UseMCJIT && !ForceInterpreter) {
+    if (!ForceInterpreter) {
       // Give MCJIT a chance to apply relocations and set page permissions.
       EE->finalizeObject();
     }
     EE->runStaticConstructorsDestructors(false);
 
-    if (!UseMCJIT && NoLazyCompilation) {
-      for (Module::iterator I = Mod->begin(), E = Mod->end(); I != E; ++I) {
-        Function *Fn = &*I;
-        if (Fn != EntryFn && !Fn->isDeclaration())
-          EE->getPointerToFunction(Fn);
-      }
-    }
-
     // Trigger compilation separately so code regions that need to be
     // invalidated will be known.
     (void)EE->getPointerToFunction(EntryFn);

diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt
index 0e809a7..5193def 100644
--- a/tools/llvm-ar/CMakeLists.txt
+++ b/tools/llvm-ar/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  Core
   Object
   Support
   )

diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index f638e55..8ee66f6 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp

@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Object/Archive.h"
@@ -20,6 +21,7 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/LineIterator.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/PrettyStackTrace.h"
@@ -45,7 +47,7 @@
 static const char *TemporaryOutput;
 static int TmpArchiveFD = -1;
 
-// fail - Show the error message and exit.
+// Show the error message and exit.
 LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) {
   outs() << ToolName << ": " << Error << ".\n";
   if (TmpArchiveFD != -1)
@@ -67,14 +69,16 @@
 
 // llvm-ar/llvm-ranlib remaining positional arguments.
 static cl::list<std::string>
-RestOfArgs(cl::Positional, cl::OneOrMore,
-    cl::desc("[relpos] [count] <archive-file> [members]..."));
+    RestOfArgs(cl::Positional, cl::ZeroOrMore,
+               cl::desc("[relpos] [count] <archive-file> [members]..."));
+
+static cl::opt<bool> MRI("M", cl::desc(""));
 
 std::string Options;
 
-// MoreHelp - Provide additional help output explaining the operations and
-// modifiers of llvm-ar. This object instructs the CommandLine library
-// to print the text of the constructor when the --help option is given.
+// Provide additional help output explaining the operations and modifiers of
+// llvm-ar. This object instructs the CommandLine library to print the text of
+// the constructor when the --help option is given.
 static cl::extrahelp MoreHelp(
   "\nOPERATIONS:\n"
   "  d[NsS]       - delete file(s) from the archive\n"
@@ -132,9 +136,9 @@
 
 // This variable holds the list of member files to proecess, as given
 // on the command line.
-static std::vector<std::string> Members;
+static std::vector<StringRef> Members;
 
-// show_help - Show the error message, the help message and exit.
+// Show the error message, the help message and exit.
 LLVM_ATTRIBUTE_NORETURN static void
 show_help(const std::string &msg) {
   errs() << ToolName << ": " << msg << "\n\n";
@@ -142,8 +146,8 @@
   std::exit(1);
 }
 
-// getRelPos - Extract the member filename from the command line for
-// the [relpos] argument associated with a, b, and i modifiers
+// Extract the member filename from the command line for the [relpos] argument
+// associated with a, b, and i modifiers
 static void getRelPos() {
   if(RestOfArgs.size() == 0)
     show_help("Expected [relpos] for a, b, or i modifier");
@@ -158,7 +162,7 @@
   RestOfArgs.erase(RestOfArgs.begin());
 }
 
-// getArchive - Get the archive file name from the command line
+// Get the archive file name from the command line
 static void getArchive() {
   if(RestOfArgs.size() == 0)
     show_help("An archive name must be specified");
@@ -166,17 +170,24 @@
   RestOfArgs.erase(RestOfArgs.begin());
 }
 
-// getMembers - Copy over remaining items in RestOfArgs to our Members vector
-// This is just for clarity.
+// Copy over remaining items in RestOfArgs to our Members vector
 static void getMembers() {
-  if(RestOfArgs.size() > 0)
-    Members = std::vector<std::string>(RestOfArgs);
+  for (auto &Arg : RestOfArgs)
+    Members.push_back(Arg);
 }
 
-// parseCommandLine - Parse the command line options as presented and return the
-// operation specified. Process all modifiers and check to make sure that
-// constraints on modifier/operation pairs have not been violated.
+static void runMRIScript();
+
+// Parse the command line options as presented and return the operation
+// specified. Process all modifiers and check to make sure that constraints on
+// modifier/operation pairs have not been violated.
 static ArchiveOperation parseCommandLine() {
+  if (MRI) {
+    if (!RestOfArgs.empty())
+      fail("Cannot mix -M and other options");
+    runMRIScript();
+  }
+
   getOptions();
 
   // Keep track of number of operations. We can only specify one
@@ -279,8 +290,8 @@
   outs().write(Data.data(), Data.size());
 }
 
-// putMode - utility function for printing out the file mode when the 't'
-// operation is in verbose mode.
+// Utility function for printing out the file mode when the 't' operation is in
+// verbose mode.
 static void printMode(unsigned mode) {
   if (mode & 004)
     outs() << "r";
@@ -401,20 +412,20 @@
 
   object::Archive::child_iterator OldI;
 
-  std::string NewFilename;
+  StringRef NewFilename;
   mutable int NewFD;
   mutable sys::fs::file_status NewStatus;
 
 public:
   NewArchiveIterator(object::Archive::child_iterator I, StringRef Name);
-  NewArchiveIterator(std::string *I, StringRef Name);
+  NewArchiveIterator(StringRef I, StringRef Name);
   NewArchiveIterator();
   bool isNewMember() const;
   StringRef getName() const;
 
   object::Archive::child_iterator getOld() const;
 
-  const char *getNew() const;
+  StringRef getNew() const;
   int getFD() const;
   const sys::fs::file_status &getStatus() const;
 };
@@ -426,8 +437,8 @@
                                        StringRef Name)
     : IsNewMember(false), Name(Name), OldI(I) {}
 
-NewArchiveIterator::NewArchiveIterator(std::string *NewFilename, StringRef Name)
-    : IsNewMember(true), Name(Name), NewFilename(*NewFilename), NewFD(-1) {}
+NewArchiveIterator::NewArchiveIterator(StringRef NewFilename, StringRef Name)
+    : IsNewMember(true), Name(Name), NewFilename(NewFilename), NewFD(-1) {}
 
 StringRef NewArchiveIterator::getName() const { return Name; }
 
@@ -438,9 +449,9 @@
   return OldI;
 }
 
-const char *NewArchiveIterator::getNew() const {
+StringRef NewArchiveIterator::getNew() const {
   assert(IsNewMember);
-  return NewFilename.c_str();
+  return NewFilename;
 }
 
 int NewArchiveIterator::getFD() const {
@@ -485,16 +496,17 @@
   IA_MoveNewMember
 };
 
-static InsertAction
-computeInsertAction(ArchiveOperation Operation,
-                    object::Archive::child_iterator I, StringRef Name,
-                    std::vector<std::string>::iterator &Pos) {
+static InsertAction computeInsertAction(ArchiveOperation Operation,
+                                        object::Archive::child_iterator I,
+                                        StringRef Name,
+                                        std::vector<StringRef>::iterator &Pos) {
   if (Operation == QuickAppend || Members.empty())
     return IA_AddOldMember;
 
-  std::vector<std::string>::iterator MI = std::find_if(
-      Members.begin(), Members.end(),
-      [Name](StringRef Path) { return Name == sys::path::filename(Path); });
+  auto MI =
+      std::find_if(Members.begin(), Members.end(), [Name](StringRef Path) {
+        return Name == sys::path::filename(Path);
+      });
 
   if (MI == Members.end())
     return IA_AddOldMember;
@@ -542,11 +554,9 @@
   int InsertPos = -1;
   StringRef PosName = sys::path::filename(RelPos);
   if (OldArchive) {
-    for (object::Archive::child_iterator I = OldArchive->child_begin(),
-                                         E = OldArchive->child_end();
-         I != E; ++I) {
+    for (auto &Child : OldArchive->children()) {
       int Pos = Ret.size();
-      ErrorOr<StringRef> NameOrErr = I->getName();
+      ErrorOr<StringRef> NameOrErr = Child.getName();
       failIfError(NameOrErr.getError());
       StringRef Name = NameOrErr.get();
       if (Name == PosName) {
@@ -557,22 +567,23 @@
           InsertPos = Pos + 1;
       }
 
-      std::vector<std::string>::iterator MemberI = Members.end();
-      InsertAction Action = computeInsertAction(Operation, I, Name, MemberI);
+      std::vector<StringRef>::iterator MemberI = Members.end();
+      InsertAction Action =
+          computeInsertAction(Operation, Child, Name, MemberI);
       switch (Action) {
       case IA_AddOldMember:
-        addMember(Ret, I, Name);
+        addMember(Ret, Child, Name);
         break;
       case IA_AddNewMeber:
-        addMember(Ret, &*MemberI, Name);
+        addMember(Ret, *MemberI, Name);
         break;
       case IA_Delete:
         break;
       case IA_MoveOldMember:
-        addMember(Moved, I, Name);
+        addMember(Moved, Child, Name);
         break;
       case IA_MoveNewMember:
-        addMember(Moved, &*MemberI, Name);
+        addMember(Moved, *MemberI, Name);
         break;
       }
       if (MemberI != Members.end())
@@ -594,11 +605,10 @@
 
   Ret.insert(Ret.begin() + InsertPos, Members.size(), NewArchiveIterator());
   int Pos = InsertPos;
-  for (std::vector<std::string>::iterator I = Members.begin(),
-         E = Members.end();
-       I != E; ++I, ++Pos) {
-    StringRef Name = sys::path::filename(*I);
-    addMember(Ret, &*I, Name, Pos);
+  for (auto &Member : Members) {
+    StringRef Name = sys::path::filename(Member);
+    addMember(Ret, Member, Name, Pos);
+    ++Pos;
   }
 
   return Ret;
@@ -686,7 +696,7 @@
 
 static void
 writeSymbolTable(raw_fd_ostream &Out, ArrayRef<NewArchiveIterator> Members,
-                 MutableArrayRef<std::unique_ptr<MemoryBuffer>> Buffers,
+                 ArrayRef<MemoryBufferRef> Buffers,
                  std::vector<std::pair<unsigned, unsigned>> &MemberOffsetRefs) {
   unsigned StartOffset = 0;
   unsigned MemberNum = 0;
@@ -697,13 +707,13 @@
   for (ArrayRef<NewArchiveIterator>::iterator I = Members.begin(),
                                               E = Members.end();
        I != E; ++I, ++MemberNum) {
-    std::unique_ptr<MemoryBuffer> &MemberBuffer = Buffers[MemberNum];
-    ErrorOr<object::SymbolicFile *> ObjOrErr =
+    MemoryBufferRef MemberBuffer = Buffers[MemberNum];
+    ErrorOr<std::unique_ptr<object::SymbolicFile>> ObjOrErr =
         object::SymbolicFile::createSymbolicFile(
             MemberBuffer, sys::fs::file_magic::unknown, &Context);
     if (!ObjOrErr)
       continue;  // FIXME: check only for "not an object file" errors.
-    std::unique_ptr<object::SymbolicFile> Obj(ObjOrErr.get());
+    object::SymbolicFile &Obj = *ObjOrErr.get();
 
     if (!StartOffset) {
       printMemberHeader(Out, "", sys::TimeValue::now(), 0, 0, 0, 0);
@@ -711,7 +721,7 @@
       print32BE(Out, 0);
     }
 
-    for (const object::BasicSymbolRef &S : Obj->symbols()) {
+    for (const object::BasicSymbolRef &S : Obj.symbols()) {
       uint32_t Symflags = S.getFlags();
       if (Symflags & object::SymbolRef::SF_FormatSpecific)
         continue;
@@ -725,7 +735,6 @@
       MemberOffsetRefs.push_back(std::make_pair(Out.tell(), MemberNum));
       print32BE(Out, 0);
     }
-    MemberBuffer.reset(Obj->releaseBuffer());
   }
   Out << NameOS.str();
 
@@ -743,8 +752,9 @@
   Out.seek(Pos);
 }
 
-static void performWriteOperation(ArchiveOperation Operation,
-                                  object::Archive *OldArchive) {
+static void
+performWriteOperation(ArchiveOperation Operation, object::Archive *OldArchive,
+                      std::vector<NewArchiveIterator> &NewMembers) {
   SmallString<128> TmpArchive;
   failIfError(sys::fs::createUniqueFile(ArchiveName + ".temp-archive-%%%%%%%.a",
                                         TmpArchiveFD, TmpArchive));
@@ -754,38 +764,36 @@
   raw_fd_ostream &Out = Output.os();
   Out << "!<arch>\n";
 
-  std::vector<NewArchiveIterator> NewMembers =
-      computeNewArchiveMembers(Operation, OldArchive);
-
   std::vector<std::pair<unsigned, unsigned> > MemberOffsetRefs;
 
-  std::vector<std::unique_ptr<MemoryBuffer>> MemberBuffers;
-  MemberBuffers.resize(NewMembers.size());
+  std::vector<std::unique_ptr<MemoryBuffer>> Buffers;
+  std::vector<MemoryBufferRef> Members;
 
   for (unsigned I = 0, N = NewMembers.size(); I < N; ++I) {
-    std::unique_ptr<MemoryBuffer> MemberBuffer;
     NewArchiveIterator &Member = NewMembers[I];
+    MemoryBufferRef MemberRef;
 
     if (Member.isNewMember()) {
-      const char *Filename = Member.getNew();
+      StringRef Filename = Member.getNew();
       int FD = Member.getFD();
       const sys::fs::file_status &Status = Member.getStatus();
       ErrorOr<std::unique_ptr<MemoryBuffer>> MemberBufferOrErr =
           MemoryBuffer::getOpenFile(FD, Filename, Status.getSize(), false);
       failIfError(MemberBufferOrErr.getError(), Filename);
-      MemberBuffer = std::move(MemberBufferOrErr.get());
+      Buffers.push_back(std::move(MemberBufferOrErr.get()));
+      MemberRef = Buffers.back()->getMemBufferRef();
     } else {
       object::Archive::child_iterator OldMember = Member.getOld();
-      ErrorOr<std::unique_ptr<MemoryBuffer>> MemberBufferOrErr =
-          OldMember->getMemoryBuffer();
+      ErrorOr<MemoryBufferRef> MemberBufferOrErr =
+          OldMember->getMemoryBufferRef();
       failIfError(MemberBufferOrErr.getError());
-      MemberBuffer = std::move(MemberBufferOrErr.get());
+      MemberRef = MemberBufferOrErr.get();
     }
-    MemberBuffers[I].reset(MemberBuffer.release());
+    Members.push_back(MemberRef);
   }
 
   if (Symtab) {
-    writeSymbolTable(Out, NewMembers, MemberBuffers, MemberOffsetRefs);
+    writeSymbolTable(Out, NewMembers, Members, MemberOffsetRefs);
   }
 
   std::vector<unsigned> StringMapIndexes;
@@ -809,9 +817,9 @@
     }
     Out.seek(Pos);
 
-    const MemoryBuffer *File = MemberBuffers[MemberNum].get();
+    MemoryBufferRef File = Members[MemberNum];
     if (I->isNewMember()) {
-      const char *FileName = I->getNew();
+      StringRef FileName = I->getNew();
       const sys::fs::file_status &Status = I->getStatus();
 
       StringRef Name = sys::path::filename(FileName);
@@ -839,7 +847,7 @@
                           OldMember->getSize());
     }
 
-    Out << File->getBuffer();
+    Out << File.getBuffer();
 
     if (Out.tell() % 2)
       Out << '\n';
@@ -851,6 +859,18 @@
   TemporaryOutput = nullptr;
 }
 
+static void
+performWriteOperation(ArchiveOperation Operation, object::Archive *OldArchive,
+                      std::vector<NewArchiveIterator> *NewMembersP) {
+  if (NewMembersP) {
+    performWriteOperation(Operation, OldArchive, *NewMembersP);
+    return;
+  }
+  std::vector<NewArchiveIterator> NewMembers =
+      computeNewArchiveMembers(Operation, OldArchive);
+  performWriteOperation(Operation, OldArchive, NewMembers);
+}
+
 static void createSymbolTable(object::Archive *OldArchive) {
   // When an archive is created or modified, if the s option is given, the
   // resulting archive will have a current symbol table. If the S option
@@ -861,11 +881,12 @@
   if (OldArchive->hasSymbolTable())
     return;
 
-  performWriteOperation(CreateSymTab, OldArchive);
+  performWriteOperation(CreateSymTab, OldArchive, nullptr);
 }
 
 static void performOperation(ArchiveOperation Operation,
-                             object::Archive *OldArchive) {
+                             object::Archive *OldArchive,
+                             std::vector<NewArchiveIterator> *NewMembers) {
   switch (Operation) {
   case Print:
   case DisplayTable:
@@ -877,7 +898,7 @@
   case Move:
   case QuickAppend:
   case ReplaceOrInsert:
-    performWriteOperation(Operation, OldArchive);
+    performWriteOperation(Operation, OldArchive, NewMembers);
     return;
   case CreateSymTab:
     createSymbolTable(OldArchive);
@@ -886,10 +907,129 @@
   llvm_unreachable("Unknown operation.");
 }
 
-static int ar_main(char **argv);
-static int ranlib_main();
+static int performOperation(ArchiveOperation Operation,
+                            std::vector<NewArchiveIterator> *NewMembers) {
+  // Create or open the archive object.
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+      MemoryBuffer::getFile(ArchiveName, -1, false);
+  std::error_code EC = Buf.getError();
+  if (EC && EC != errc::no_such_file_or_directory) {
+    errs() << ToolName << ": error opening '" << ArchiveName
+           << "': " << EC.message() << "!\n";
+    return 1;
+  }
 
-// main - main program for llvm-ar .. see comments in the code
+  if (!EC) {
+    object::Archive Archive(Buf.get()->getMemBufferRef(), EC);
+
+    if (EC) {
+      errs() << ToolName << ": error loading '" << ArchiveName
+             << "': " << EC.message() << "!\n";
+      return 1;
+    }
+    performOperation(Operation, &Archive, NewMembers);
+    return 0;
+  }
+
+  assert(EC == errc::no_such_file_or_directory);
+
+  if (!shouldCreateArchive(Operation)) {
+    failIfError(EC, Twine("error loading '") + ArchiveName + "'");
+  } else {
+    if (!Create) {
+      // Produce a warning if we should and we're creating the archive
+      errs() << ToolName << ": creating " << ArchiveName << "\n";
+    }
+  }
+
+  performOperation(Operation, nullptr, NewMembers);
+  return 0;
+}
+
+static void runMRIScript() {
+  enum class MRICommand { AddLib, AddMod, Create, Save, End, Invalid };
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf = MemoryBuffer::getSTDIN();
+  failIfError(Buf.getError());
+  const MemoryBuffer &Ref = *Buf.get();
+  bool Saved = false;
+  std::vector<NewArchiveIterator> NewMembers;
+  std::vector<std::unique_ptr<MemoryBuffer>> ArchiveBuffers;
+  std::vector<std::unique_ptr<object::Archive>> Archives;
+
+  for (line_iterator I(Ref, /*SkipBlanks*/ true, ';'), E; I != E; ++I) {
+    StringRef Line = *I;
+    StringRef CommandStr, Rest;
+    std::tie(CommandStr, Rest) = Line.split(' ');
+    Rest = Rest.trim();
+    if (!Rest.empty() && Rest.front() == '"' && Rest.back() == '"')
+      Rest = Rest.drop_front().drop_back();
+    auto Command = StringSwitch<MRICommand>(CommandStr.lower())
+                       .Case("addlib", MRICommand::AddLib)
+                       .Case("addmod", MRICommand::AddMod)
+                       .Case("create", MRICommand::Create)
+                       .Case("save", MRICommand::Save)
+                       .Case("end", MRICommand::End)
+                       .Default(MRICommand::Invalid);
+
+    switch (Command) {
+    case MRICommand::AddLib: {
+      auto BufOrErr = MemoryBuffer::getFile(Rest, -1, false);
+      failIfError(BufOrErr.getError(), "Could not open library");
+      ArchiveBuffers.push_back(std::move(*BufOrErr));
+      auto LibOrErr =
+          object::Archive::create(ArchiveBuffers.back()->getMemBufferRef());
+      failIfError(LibOrErr.getError(), "Could not parse library");
+      Archives.push_back(std::move(*LibOrErr));
+      object::Archive &Lib = *Archives.back();
+      for (auto &Member : Lib.children()) {
+        ErrorOr<StringRef> NameOrErr = Member.getName();
+        failIfError(NameOrErr.getError());
+        addMember(NewMembers, Member, *NameOrErr);
+      }
+      break;
+    }
+    case MRICommand::AddMod:
+      addMember(NewMembers, Rest, sys::path::filename(Rest));
+      break;
+    case MRICommand::Create:
+      Create = true;
+      if (!ArchiveName.empty())
+        fail("Editing multiple archives not supported");
+      if (Saved)
+        fail("File already saved");
+      ArchiveName = Rest;
+      break;
+    case MRICommand::Save:
+      Saved = true;
+      break;
+    case MRICommand::End:
+      break;
+    case MRICommand::Invalid:
+      fail("Unknown command: " + CommandStr);
+    }
+  }
+
+  // Nothing to do if not saved.
+  if (Saved)
+    performOperation(ReplaceOrInsert, &NewMembers);
+  exit(0);
+}
+
+static int ar_main() {
+  // Do our own parsing of the command line because the CommandLine utility
+  // can't handle the grouped positional parameters without a dash.
+  ArchiveOperation Operation = parseCommandLine();
+  return performOperation(Operation, nullptr);
+}
+
+static int ranlib_main() {
+  if (RestOfArgs.size() != 1)
+    fail(ToolName + "takes just one archive as argument");
+  ArchiveName = RestOfArgs[0];
+  return performOperation(CreateSymTab, nullptr);
+}
+
 int main(int argc, char **argv) {
   ToolName = argv[0];
   // Print a stack trace if we signal out.
@@ -910,62 +1050,8 @@
 
   StringRef Stem = sys::path::stem(ToolName);
   if (Stem.find("ar") != StringRef::npos)
-    return ar_main(argv);
+    return ar_main();
   if (Stem.find("ranlib") != StringRef::npos)
     return ranlib_main();
   fail("Not ranlib or ar!");
 }
-
-static int performOperation(ArchiveOperation Operation);
-
-int ranlib_main() {
-  if (RestOfArgs.size() != 1)
-    fail(ToolName + "takes just one archive as argument");
-  ArchiveName = RestOfArgs[0];
-  return performOperation(CreateSymTab);
-}
-
-int ar_main(char **argv) {
-  // Do our own parsing of the command line because the CommandLine utility
-  // can't handle the grouped positional parameters without a dash.
-  ArchiveOperation Operation = parseCommandLine();
-  return performOperation(Operation);
-}
-
-static int performOperation(ArchiveOperation Operation) {
-  // Create or open the archive object.
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
-      MemoryBuffer::getFile(ArchiveName, -1, false);
-  std::error_code EC = Buf.getError();
-  if (EC && EC != errc::no_such_file_or_directory) {
-    errs() << ToolName << ": error opening '" << ArchiveName
-           << "': " << EC.message() << "!\n";
-    return 1;
-  }
-
-  if (!EC) {
-    object::Archive Archive(std::move(Buf.get()), EC);
-
-    if (EC) {
-      errs() << ToolName << ": error loading '" << ArchiveName
-             << "': " << EC.message() << "!\n";
-      return 1;
-    }
-    performOperation(Operation, &Archive);
-    return 0;
-  }
-
-  assert(EC == errc::no_such_file_or_directory);
-
-  if (!shouldCreateArchive(Operation)) {
-    failIfError(EC, Twine("error loading '") + ArchiveName + "'");
-  } else {
-    if (!Create) {
-      // Produce a warning if we should and we're creating the archive
-      errs() << ToolName << ": creating " << ArchiveName << "\n";
-    }
-  }
-
-  performOperation(Operation, nullptr);
-  return 0;
-}

diff --git a/tools/llvm-as/llvm-as.cpp b/tools/llvm-as/llvm-as.cpp
index 007241c..5ccf505 100644
--- a/tools/llvm-as/llvm-as.cpp
+++ b/tools/llvm-as/llvm-as.cpp

@@ -69,11 +69,11 @@
     }
   }
 
-  std::string ErrorInfo;
+  std::error_code EC;
   std::unique_ptr<tool_output_file> Out(
-      new tool_output_file(OutputFilename.c_str(), ErrorInfo, sys::fs::F_None));
-  if (!ErrorInfo.empty()) {
-    errs() << ErrorInfo << '\n';
+      new tool_output_file(OutputFilename, EC, sys::fs::F_None));
+  if (EC) {
+    errs() << EC.message() << '\n';
     exit(1);
   }
 
@@ -94,7 +94,7 @@
 
   // Parse the file now...
   SMDiagnostic Err;
-  std::unique_ptr<Module> M(ParseAssemblyFile(InputFilename, Err, Context));
+  std::unique_ptr<Module> M = parseAssemblyFile(InputFilename, Err, Context);
   if (!M.get()) {
     Err.print(argv[0], errs());
     return 1;

diff --git a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index 15567cf..f95b272 100644
--- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp

@@ -31,6 +31,7 @@
 #include "llvm/Bitcode/LLVMBitCodes.h"
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/ManagedStatic.h"
@@ -61,6 +62,10 @@
             cl::desc("Emit numeric info in dump even if"
                      " symbolic info is available"));
 
+static cl::opt<std::string>
+  BlockInfoFilename("block-info",
+                    cl::desc("Use the BLOCK_INFO from the given file"));
+
 namespace {
 
 /// CurStreamTypeType - A type for CurStreamType
@@ -71,15 +76,11 @@
 
 }
 
-/// CurStreamType - If we can sniff the flavor of this stream, we can produce
-/// better dump info.
-static CurStreamTypeType CurStreamType;
-
-
 /// GetBlockName - Return a symbolic block name if known, otherwise return
 /// null.
 static const char *GetBlockName(unsigned BlockID,
-                                const BitstreamReader &StreamFile) {
+                                const BitstreamReader &StreamFile,
+                                CurStreamTypeType CurStreamType) {
   // Standard blocks for all bitcode files.
   if (BlockID < bitc::FIRST_APPLICATION_BLOCKID) {
     if (BlockID == bitc::BLOCKINFO_BLOCK_ID)
@@ -115,7 +116,8 @@
 /// GetCodeName - Return a symbolic code name if known, otherwise return
 /// null.
 static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
-                               const BitstreamReader &StreamFile) {
+                               const BitstreamReader &StreamFile,
+                               CurStreamTypeType CurStreamType) {
   // Standard blocks for all bitcode files.
   if (BlockID < bitc::FIRST_APPLICATION_BLOCKID) {
     if (BlockID == bitc::BLOCKINFO_BLOCK_ID) {
@@ -271,7 +273,8 @@
   case bitc::USELIST_BLOCK_ID:
     switch(CodeID) {
     default:return nullptr;
-    case bitc::USELIST_CODE_ENTRY:   return "USELIST_CODE_ENTRY";
+    case bitc::USELIST_CODE_DEFAULT: return "USELIST_CODE_DEFAULT";
+    case bitc::USELIST_CODE_BB:      return "USELIST_CODE_BB";
     }
   }
 }
@@ -315,14 +318,14 @@
 
 /// Error - All bitcode analysis errors go through this function, making this a
 /// good place to breakpoint if debugging.
-static bool Error(const std::string &Err) {
+static bool Error(const Twine &Err) {
   errs() << Err << "\n";
   return true;
 }
 
 /// ParseBlock - Read a block, updating statistics, etc.
 static bool ParseBlock(BitstreamCursor &Stream, unsigned BlockID,
-                       unsigned IndentLevel) {
+                       unsigned IndentLevel, CurStreamTypeType CurStreamType) {
   std::string Indent(IndentLevel*2, ' ');
   uint64_t BlockBitStart = Stream.GetCurrentBitNo();
 
@@ -348,7 +351,8 @@
   const char *BlockName = nullptr;
   if (Dump) {
     outs() << Indent << "<";
-    if ((BlockName = GetBlockName(BlockID, *Stream.getBitStreamReader())))
+    if ((BlockName = GetBlockName(BlockID, *Stream.getBitStreamReader(),
+                                  CurStreamType)))
       outs() << BlockName;
     else
       outs() << "UnknownBlock" << BlockID;
@@ -390,7 +394,7 @@
         
     case BitstreamEntry::SubBlock: {
       uint64_t SubBlockBitStart = Stream.GetCurrentBitNo();
-      if (ParseBlock(Stream, Entry.ID, IndentLevel+1))
+      if (ParseBlock(Stream, Entry.ID, IndentLevel+1, CurStreamType))
         return true;
       ++BlockStats.NumSubBlocks;
       uint64_t SubBlockBitEnd = Stream.GetCurrentBitNo();
@@ -431,12 +435,14 @@
     if (Dump) {
       outs() << Indent << "  <";
       if (const char *CodeName =
-            GetCodeName(Code, BlockID, *Stream.getBitStreamReader()))
+            GetCodeName(Code, BlockID, *Stream.getBitStreamReader(),
+                        CurStreamType))
         outs() << CodeName;
       else
         outs() << "UnknownCode" << Code;
       if (NonSymbolic &&
-          GetCodeName(Code, BlockID, *Stream.getBitStreamReader()))
+          GetCodeName(Code, BlockID, *Stream.getBitStreamReader(),
+                      CurStreamType))
         outs() << " codeid=" << Code;
       if (Entry.ID != bitc::UNABBREV_RECORD)
         outs() << " abbrevid=" << Entry.ID;
@@ -474,21 +480,23 @@
                    (double)Bits/8, (unsigned long)(Bits/32));
 }
 
-
-/// AnalyzeBitcode - Analyze the bitcode file specified by InputFilename.
-static int AnalyzeBitcode() {
+static bool openBitcodeFile(StringRef Path,
+                            std::unique_ptr<MemoryBuffer> &MemBuf,
+                            BitstreamReader &StreamFile,
+                            BitstreamCursor &Stream,
+                            CurStreamTypeType &CurStreamType) {
   // Read the input file.
   ErrorOr<std::unique_ptr<MemoryBuffer>> MemBufOrErr =
-      MemoryBuffer::getFileOrSTDIN(InputFilename);
+      MemoryBuffer::getFileOrSTDIN(Path);
   if (std::error_code EC = MemBufOrErr.getError())
-    return Error("Error reading '" + InputFilename + "': " + EC.message());
-  std::unique_ptr<MemoryBuffer> MemBuf = std::move(MemBufOrErr.get());
+    return Error(Twine("Error reading '") + Path + "': " + EC.message());
+  MemBuf = std::move(MemBufOrErr.get());
 
   if (MemBuf->getBufferSize() & 3)
     return Error("Bitcode stream should be a multiple of 4 bytes in length");
 
   const unsigned char *BufPtr = (const unsigned char *)MemBuf->getBufferStart();
-  const unsigned char *EndBufPtr = BufPtr+MemBuf->getBufferSize();
+  const unsigned char *EndBufPtr = BufPtr + MemBuf->getBufferSize();
 
   // If we have a wrapper header, parse it and ignore the non-bc file contents.
   // The magic number is 0x0B17C0DE stored in little endian.
@@ -496,8 +504,8 @@
     if (SkipBitcodeWrapperHeader(BufPtr, EndBufPtr, true))
       return Error("Invalid bitcode wrapper header");
 
-  BitstreamReader StreamFile(BufPtr, EndBufPtr);
-  BitstreamCursor Stream(StreamFile);
+  StreamFile = BitstreamReader(BufPtr, EndBufPtr);
+  Stream = BitstreamCursor(StreamFile);
   StreamFile.CollectBlockInfoNames();
 
   // Read the stream signature.
@@ -516,6 +524,48 @@
       Signature[4] == 0xE && Signature[5] == 0xD)
     CurStreamType = LLVMIRBitstream;
 
+  return false;
+}
+
+/// AnalyzeBitcode - Analyze the bitcode file specified by InputFilename.
+static int AnalyzeBitcode() {
+  std::unique_ptr<MemoryBuffer> StreamBuffer;
+  BitstreamReader StreamFile;
+  BitstreamCursor Stream;
+  CurStreamTypeType CurStreamType;
+  if (openBitcodeFile(InputFilename, StreamBuffer, StreamFile, Stream,
+                      CurStreamType))
+    return true;
+
+  // Read block info from BlockInfoFilename, if specified.
+  // The block info must be a top-level block.
+  if (!BlockInfoFilename.empty()) {
+    std::unique_ptr<MemoryBuffer> BlockInfoBuffer;
+    BitstreamReader BlockInfoFile;
+    BitstreamCursor BlockInfoCursor;
+    CurStreamTypeType BlockInfoStreamType;
+    if (openBitcodeFile(BlockInfoFilename, BlockInfoBuffer, BlockInfoFile,
+                        BlockInfoCursor, BlockInfoStreamType))
+      return true;
+
+    while (!BlockInfoCursor.AtEndOfStream()) {
+      unsigned Code = BlockInfoCursor.ReadCode();
+      if (Code != bitc::ENTER_SUBBLOCK)
+        return Error("Invalid record at top-level in block info file");
+
+      unsigned BlockID = BlockInfoCursor.ReadSubBlockID();
+      if (BlockID == bitc::BLOCKINFO_BLOCK_ID) {
+        if (BlockInfoCursor.ReadBlockInfoBlock())
+          return Error("Malformed BlockInfoBlock in block info file");
+        break;
+      }
+
+      BlockInfoCursor.SkipBlock();
+    }
+
+    StreamFile.takeBlockInfo(std::move(BlockInfoFile));
+  }
+
   unsigned NumTopBlocks = 0;
 
   // Parse the top-level structure.  We only allow blocks at the top-level.
@@ -526,14 +576,14 @@
 
     unsigned BlockID = Stream.ReadSubBlockID();
 
-    if (ParseBlock(Stream, BlockID, 0))
+    if (ParseBlock(Stream, BlockID, 0, CurStreamType))
       return true;
     ++NumTopBlocks;
   }
 
   if (Dump) outs() << "\n\n";
 
-  uint64_t BufferSizeBits = (EndBufPtr-BufPtr)*CHAR_BIT;
+  uint64_t BufferSizeBits = StreamFile.getBitcodeBytes().getExtent() * CHAR_BIT;
   // Print a summary of the read file.
   outs() << "Summary of " << InputFilename << ":\n";
   outs() << "         Total size: ";
@@ -552,7 +602,8 @@
   for (std::map<unsigned, PerBlockIDStats>::iterator I = BlockIDStats.begin(),
        E = BlockIDStats.end(); I != E; ++I) {
     outs() << "  Block ID #" << I->first;
-    if (const char *BlockName = GetBlockName(I->first, StreamFile))
+    if (const char *BlockName = GetBlockName(I->first, StreamFile,
+                                             CurStreamType))
       outs() << " (" << BlockName << ")";
     outs() << ":\n";
 
@@ -610,7 +661,8 @@
           outs() << "         ";
 
         if (const char *CodeName =
-              GetCodeName(FreqPairs[i].second, I->first, StreamFile))
+              GetCodeName(FreqPairs[i].second, I->first, StreamFile,
+                          CurStreamType))
           outs() << CodeName << "\n";
         else
           outs() << "UnknownCode" << FreqPairs[i].second << "\n";

diff --git a/tools/llvm-c-test/CMakeLists.txt b/tools/llvm-c-test/CMakeLists.txt
index 34fea3d..989678b 100644
--- a/tools/llvm-c-test/CMakeLists.txt
+++ b/tools/llvm-c-test/CMakeLists.txt

@@ -7,6 +7,10 @@
   Target
   )
 
+if(TARGET LLVM)
+  set(LLVM_LINK_COMPONENTS)
+endif()
+
 if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99 -Wstrict-prototypes")
 endif ()
@@ -21,3 +25,8 @@
   object.c
   targets.c
   )
+
+# Use libLLVM.so if it is available.
+if(TARGET LLVM)
+  target_link_libraries(llvm-c-test LLVM)
+endif()

diff --git a/tools/llvm-c-test/disassemble.c b/tools/llvm-c-test/disassemble.c
index eb40bf3..05a9218 100644
--- a/tools/llvm-c-test/disassemble.c
+++ b/tools/llvm-c-test/disassemble.c

@@ -18,6 +18,7 @@
 #include "llvm-c/Target.h"
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 static void pprint(int pos, unsigned char *buf, int len, const char *disasm) {
   int i;
@@ -33,13 +34,15 @@
   printf("   %s\n", disasm);
 }
 
-static void do_disassemble(const char *triple, unsigned char *buf, int siz) {
-  LLVMDisasmContextRef D = LLVMCreateDisasm(triple, NULL, 0, NULL, NULL);
+static void do_disassemble(const char *triple, const char *features,
+                           unsigned char *buf, int siz) {
+  LLVMDisasmContextRef D = LLVMCreateDisasmCPUFeatures(triple, "", features,
+                                                       NULL, 0, NULL, NULL);
   char outline[1024];
   int pos;
 
   if (!D) {
-    printf("ERROR: Couldn't create disassebler for triple %s\n", triple);
+    printf("ERROR: Couldn't create disassembler for triple %s\n", triple);
     return;
   }
 
@@ -62,19 +65,22 @@
 static void handle_line(char **tokens, int ntokens) {
   unsigned char disbuf[128];
   size_t disbuflen = 0;
-  char *triple = tokens[0];
+  const char *triple = tokens[0];
+  const char *features = tokens[1];
   int i;
 
-  printf("triple: %s\n", triple);
+  printf("triple: %s, features: %s\n", triple, features);
+  if (!strcmp(features, "NULL"))
+    features = "";
 
-  for (i = 1; i < ntokens; i++) {
+  for (i = 2; i < ntokens; i++) {
     disbuf[disbuflen++] = strtol(tokens[i], NULL, 16);
     if (disbuflen >= sizeof(disbuf)) {
       fprintf(stderr, "Warning: Too long line, truncating\n");
       break;
     }
   }
-  do_disassemble(triple, disbuf, disbuflen);
+  do_disassemble(triple, features, disbuf, disbuflen);
 }
 
 int disassemble(void) {

diff --git a/tools/llvm-config/CMakeLists.txt b/tools/llvm-config/CMakeLists.txt
index 8d83762..50c84e6 100644
--- a/tools/llvm-config/CMakeLists.txt
+++ b/tools/llvm-config/CMakeLists.txt

@@ -33,3 +33,18 @@
 
 # Add the dependency on the generation step.
 add_file_dependencies(${CMAKE_CURRENT_SOURCE_DIR}/llvm-config.cpp ${BUILDVARIABLES_OBJPATH})
+
+if(CMAKE_CROSSCOMPILING)
+  set(${project}_LLVM_CONFIG_EXE "${LLVM_NATIVE_BUILD}/bin/llvm-config")
+  set(${project}_LLVM_CONFIG_EXE ${${project}_LLVM_CONFIG_EXE} PARENT_SCOPE)
+
+  add_custom_command(OUTPUT "${${project}_LLVM_CONFIG_EXE}"
+    COMMAND ${CMAKE_COMMAND} --build . --target llvm-config --config $<CONFIGURATION>
+    DEPENDS ${LLVM_NATIVE_BUILD}/CMakeCache.txt
+    WORKING_DIRECTORY ${LLVM_NATIVE_BUILD}
+    COMMENT "Building native llvm-config...")
+  add_custom_target(${project}NativeLLVMConfig DEPENDS ${${project}_LLVM_CONFIG_EXE})
+  add_dependencies(${project}NativeLLVMConfig ConfigureNativeLLVM)
+
+  add_dependencies(llvm-config ${project}NativeLLVMConfig)
+endif(CMAKE_CROSSCOMPILING)

diff --git a/tools/llvm-cov/Android.mk b/tools/llvm-cov/Android.mk
index dae211f..d76c940 100644
--- a/tools/llvm-cov/Android.mk
+++ b/tools/llvm-cov/Android.mk

@@ -8,9 +8,22 @@
 #===---------------------------------------------------------------===
 
 llvm_cov_SRC_FILES := \
-  llvm-cov.cpp
+  CodeCoverage.cpp \
+  CoverageFilters.cpp \
+  CoverageReport.cpp \
+  CoverageSummary.cpp \
+  CoverageSummaryInfo.cpp \
+  gcov.cpp \
+  llvm-cov.cpp \
+  SourceCoverageView.cpp \
+  TestingSupport.cpp
 
 llvm_cov_STATIC_LIBRARIES := \
+  libLLVMObject              \
+  libLLVMProfileData         \
+  libLLVMMC                  \
+  libLLVMMCParser            \
+  libLLVMBitReader           \
   libLLVMCore                \
   libLLVMSupport             \
 

diff --git a/tools/llvm-cov/CMakeLists.txt b/tools/llvm-cov/CMakeLists.txt
index 67cea71..b2d2b89 100644
--- a/tools/llvm-cov/CMakeLists.txt
+++ b/tools/llvm-cov/CMakeLists.txt

@@ -1,5 +1,13 @@
-set(LLVM_LINK_COMPONENTS core support )
+set(LLVM_LINK_COMPONENTS core support object profiledata)
 
 add_llvm_tool(llvm-cov
   llvm-cov.cpp
+  gcov.cpp
+  CodeCoverage.cpp
+  CoverageFilters.cpp
+  CoverageReport.cpp
+  CoverageSummary.cpp
+  CoverageSummaryInfo.cpp
+  SourceCoverageView.cpp
+  TestingSupport.cpp
   )

diff --git a/tools/llvm-cov/CodeCoverage.cpp b/tools/llvm-cov/CodeCoverage.cpp
new file mode 100644
index 0000000..093525e
--- /dev/null
+++ b/tools/llvm-cov/CodeCoverage.cpp

@@ -0,0 +1,484 @@
+//===- CodeCoverage.cpp - Coverage tool based on profiling instrumentation-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The 'CodeCoverageTool' class implements a command line tool to analyze and
+// report coverage information using the profiling instrumentation and code
+// coverage mapping.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RenderingSupport.h"
+#include "CoverageViewOptions.h"
+#include "CoverageFilters.h"
+#include "SourceCoverageView.h"
+#include "CoverageSummary.h"
+#include "CoverageReport.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/CoverageMapping.h"
+#include "llvm/ProfileData/CoverageMappingReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include <functional>
+#include <system_error>
+
+using namespace llvm;
+using namespace coverage;
+
+namespace {
+/// \brief The implementation of the coverage tool.
+class CodeCoverageTool {
+public:
+  enum Command {
+    /// \brief The show command.
+    Show,
+    /// \brief The report command.
+    Report
+  };
+
+  /// \brief Print the error message to the error output stream.
+  void error(const Twine &Message, StringRef Whence = "");
+
+  /// \brief Return a memory buffer for the given source file.
+  ErrorOr<const MemoryBuffer &> getSourceFile(StringRef SourceFile);
+
+  /// \brief Create source views for the expansions of the view.
+  void attachExpansionSubViews(SourceCoverageView &View,
+                               ArrayRef<ExpansionRecord> Expansions,
+                               CoverageMapping &Coverage);
+
+  /// \brief Create the source view of a particular function.
+  std::unique_ptr<SourceCoverageView>
+  createFunctionView(const FunctionRecord &Function, CoverageMapping &Coverage);
+
+  /// \brief Create the main source view of a particular source file.
+  std::unique_ptr<SourceCoverageView>
+  createSourceFileView(StringRef SourceFile, CoverageMapping &Coverage);
+
+  /// \brief Load the coverage mapping data. Return null if an error occurred.
+  std::unique_ptr<CoverageMapping> load();
+
+  int run(Command Cmd, int argc, const char **argv);
+
+  typedef std::function<int(int, const char **)> CommandLineParserType;
+
+  int show(int argc, const char **argv,
+           CommandLineParserType commandLineParser);
+
+  int report(int argc, const char **argv,
+             CommandLineParserType commandLineParser);
+
+  std::string ObjectFilename;
+  CoverageViewOptions ViewOpts;
+  std::string PGOFilename;
+  CoverageFiltersMatchAll Filters;
+  std::vector<std::string> SourceFiles;
+  std::vector<std::pair<std::string, std::unique_ptr<MemoryBuffer>>>
+      LoadedSourceFiles;
+  bool CompareFilenamesOnly;
+  StringMap<std::string> RemappedFilenames;
+};
+}
+
+void CodeCoverageTool::error(const Twine &Message, StringRef Whence) {
+  errs() << "error: ";
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+}
+
+ErrorOr<const MemoryBuffer &>
+CodeCoverageTool::getSourceFile(StringRef SourceFile) {
+  // If we've remapped filenames, look up the real location for this file.
+  if (!RemappedFilenames.empty()) {
+    auto Loc = RemappedFilenames.find(SourceFile);
+    if (Loc != RemappedFilenames.end())
+      SourceFile = Loc->second;
+  }
+  for (const auto &Files : LoadedSourceFiles)
+    if (sys::fs::equivalent(SourceFile, Files.first))
+      return *Files.second;
+  auto Buffer = MemoryBuffer::getFile(SourceFile);
+  if (auto EC = Buffer.getError()) {
+    error(EC.message(), SourceFile);
+    return EC;
+  }
+  LoadedSourceFiles.push_back(
+      std::make_pair(SourceFile, std::move(Buffer.get())));
+  return *LoadedSourceFiles.back().second;
+}
+
+void
+CodeCoverageTool::attachExpansionSubViews(SourceCoverageView &View,
+                                          ArrayRef<ExpansionRecord> Expansions,
+                                          CoverageMapping &Coverage) {
+  if (!ViewOpts.ShowExpandedRegions)
+    return;
+  for (const auto &Expansion : Expansions) {
+    auto ExpansionCoverage = Coverage.getCoverageForExpansion(Expansion);
+    if (ExpansionCoverage.empty())
+      continue;
+    auto SourceBuffer = getSourceFile(ExpansionCoverage.getFilename());
+    if (!SourceBuffer)
+      continue;
+
+    auto SubViewExpansions = ExpansionCoverage.getExpansions();
+    auto SubView = llvm::make_unique<SourceCoverageView>(
+        SourceBuffer.get(), ViewOpts, std::move(ExpansionCoverage));
+    attachExpansionSubViews(*SubView, SubViewExpansions, Coverage);
+    View.addExpansion(Expansion.Region, std::move(SubView));
+  }
+}
+
+std::unique_ptr<SourceCoverageView>
+CodeCoverageTool::createFunctionView(const FunctionRecord &Function,
+                                     CoverageMapping &Coverage) {
+  auto FunctionCoverage = Coverage.getCoverageForFunction(Function);
+  if (FunctionCoverage.empty())
+    return nullptr;
+  auto SourceBuffer = getSourceFile(FunctionCoverage.getFilename());
+  if (!SourceBuffer)
+    return nullptr;
+
+  auto Expansions = FunctionCoverage.getExpansions();
+  auto View = llvm::make_unique<SourceCoverageView>(
+      SourceBuffer.get(), ViewOpts, std::move(FunctionCoverage));
+  attachExpansionSubViews(*View, Expansions, Coverage);
+
+  return View;
+}
+
+std::unique_ptr<SourceCoverageView>
+CodeCoverageTool::createSourceFileView(StringRef SourceFile,
+                                       CoverageMapping &Coverage) {
+  auto SourceBuffer = getSourceFile(SourceFile);
+  if (!SourceBuffer)
+    return nullptr;
+  auto FileCoverage = Coverage.getCoverageForFile(SourceFile);
+  if (FileCoverage.empty())
+    return nullptr;
+
+  auto Expansions = FileCoverage.getExpansions();
+  auto View = llvm::make_unique<SourceCoverageView>(
+      SourceBuffer.get(), ViewOpts, std::move(FileCoverage));
+  attachExpansionSubViews(*View, Expansions, Coverage);
+
+  for (auto Function : Coverage.getInstantiations(SourceFile)) {
+    auto SubViewCoverage = Coverage.getCoverageForFunction(*Function);
+    auto SubViewExpansions = SubViewCoverage.getExpansions();
+    auto SubView = llvm::make_unique<SourceCoverageView>(
+        SourceBuffer.get(), ViewOpts, std::move(SubViewCoverage));
+    attachExpansionSubViews(*SubView, SubViewExpansions, Coverage);
+
+    if (SubView) {
+      unsigned FileID = Function->CountedRegions.front().FileID;
+      unsigned Line = 0;
+      for (const auto &CR : Function->CountedRegions)
+        if (CR.FileID == FileID)
+          Line = std::max(CR.LineEnd, Line);
+      View->addInstantiation(Function->Name, Line, std::move(SubView));
+    }
+  }
+  return View;
+}
+
+std::unique_ptr<CoverageMapping> CodeCoverageTool::load() {
+  auto CoverageOrErr = CoverageMapping::load(ObjectFilename, PGOFilename);
+  if (std::error_code EC = CoverageOrErr.getError()) {
+    colored_ostream(errs(), raw_ostream::RED)
+        << "error: Failed to load coverage: " << EC.message();
+    errs() << "\n";
+    return nullptr;
+  }
+  auto Coverage = std::move(CoverageOrErr.get());
+  unsigned Mismatched = Coverage->getMismatchedCount();
+  if (Mismatched) {
+    colored_ostream(errs(), raw_ostream::RED)
+        << "warning: " << Mismatched << " functions have mismatched data. ";
+    errs() << "\n";
+  }
+
+  if (CompareFilenamesOnly) {
+    auto CoveredFiles = Coverage.get()->getUniqueSourceFiles();
+    for (auto &SF : SourceFiles) {
+      StringRef SFBase = sys::path::filename(SF);
+      for (const auto &CF : CoveredFiles)
+        if (SFBase == sys::path::filename(CF)) {
+          RemappedFilenames[CF] = SF;
+          SF = CF;
+          break;
+        }
+    }
+  }
+
+  return Coverage;
+}
+
+int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
+  // Print a stack trace if we signal out.
+  sys::PrintStackTraceOnErrorSignal();
+  PrettyStackTraceProgram X(argc, argv);
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+  cl::opt<std::string, true> ObjectFilename(
+      cl::Positional, cl::Required, cl::location(this->ObjectFilename),
+      cl::desc("Covered executable or object file."));
+
+  cl::list<std::string> InputSourceFiles(
+      cl::Positional, cl::desc("<Source files>"), cl::ZeroOrMore);
+
+  cl::opt<std::string, true> PGOFilename(
+      "instr-profile", cl::Required, cl::location(this->PGOFilename),
+      cl::desc(
+          "File with the profile data obtained after an instrumented run"));
+
+  cl::opt<bool> DebugDump("dump", cl::Optional,
+                          cl::desc("Show internal debug dump"));
+
+  cl::opt<bool> FilenameEquivalence(
+      "filename-equivalence", cl::Optional,
+      cl::desc("Treat source files as equivalent to paths in the coverage data "
+               "when the file names match, even if the full paths do not"));
+
+  cl::OptionCategory FilteringCategory("Function filtering options");
+
+  cl::list<std::string> NameFilters(
+      "name", cl::Optional,
+      cl::desc("Show code coverage only for functions with the given name"),
+      cl::ZeroOrMore, cl::cat(FilteringCategory));
+
+  cl::list<std::string> NameRegexFilters(
+      "name-regex", cl::Optional,
+      cl::desc("Show code coverage only for functions that match the given "
+               "regular expression"),
+      cl::ZeroOrMore, cl::cat(FilteringCategory));
+
+  cl::opt<double> RegionCoverageLtFilter(
+      "region-coverage-lt", cl::Optional,
+      cl::desc("Show code coverage only for functions with region coverage "
+               "less than the given threshold"),
+      cl::cat(FilteringCategory));
+
+  cl::opt<double> RegionCoverageGtFilter(
+      "region-coverage-gt", cl::Optional,
+      cl::desc("Show code coverage only for functions with region coverage "
+               "greater than the given threshold"),
+      cl::cat(FilteringCategory));
+
+  cl::opt<double> LineCoverageLtFilter(
+      "line-coverage-lt", cl::Optional,
+      cl::desc("Show code coverage only for functions with line coverage less "
+               "than the given threshold"),
+      cl::cat(FilteringCategory));
+
+  cl::opt<double> LineCoverageGtFilter(
+      "line-coverage-gt", cl::Optional,
+      cl::desc("Show code coverage only for functions with line coverage "
+               "greater than the given threshold"),
+      cl::cat(FilteringCategory));
+
+  auto commandLineParser = [&, this](int argc, const char **argv) -> int {
+    cl::ParseCommandLineOptions(argc, argv, "LLVM code coverage tool\n");
+    ViewOpts.Debug = DebugDump;
+    CompareFilenamesOnly = FilenameEquivalence;
+
+    // Name filters form one match-any group, appended to the tool's Filters.
+    if (!NameFilters.empty() || !NameRegexFilters.empty()) {
+      auto NameFilterer = llvm::make_unique<CoverageFilters>();
+      for (const auto &Name : NameFilters)
+        NameFilterer->push_back(llvm::make_unique<NameCoverageFilter>(Name));
+      for (const auto &Regex : NameRegexFilters)
+        NameFilterer->push_back(
+            llvm::make_unique<NameRegexCoverageFilter>(Regex));
+      Filters.push_back(std::move(NameFilterer));
+    }
+    if (RegionCoverageLtFilter.getNumOccurrences() ||
+        RegionCoverageGtFilter.getNumOccurrences() ||
+        LineCoverageLtFilter.getNumOccurrences() ||
+        LineCoverageGtFilter.getNumOccurrences()) {
+      auto StatFilterer = llvm::make_unique<CoverageFilters>();
+      if (RegionCoverageLtFilter.getNumOccurrences())
+        StatFilterer->push_back(llvm::make_unique<RegionCoverageFilter>(
+            RegionCoverageFilter::LessThan, RegionCoverageLtFilter));
+      if (RegionCoverageGtFilter.getNumOccurrences())
+        StatFilterer->push_back(llvm::make_unique<RegionCoverageFilter>(
+            RegionCoverageFilter::GreaterThan, RegionCoverageGtFilter));
+      if (LineCoverageLtFilter.getNumOccurrences())
+        StatFilterer->push_back(llvm::make_unique<LineCoverageFilter>(
+            LineCoverageFilter::LessThan, LineCoverageLtFilter));
+      if (LineCoverageGtFilter.getNumOccurrences())
+        StatFilterer->push_back(llvm::make_unique<LineCoverageFilter>(
+            LineCoverageFilter::GreaterThan, LineCoverageGtFilter));
+      Filters.push_back(std::move(StatFilterer));
+    }
+
+    for (const auto &File : InputSourceFiles) {
+      SmallString<128> Path(File);
+      if (std::error_code EC = sys::fs::make_absolute(Path)) {
+        errs() << "error: " << File << ": " << EC.message() << "\n";
+        return 1;
+      }
+      SourceFiles.push_back(Path.str());
+    }
+    return 0;
+  };
+
+  switch (Cmd) {
+  case Show:
+    return show(argc, argv, commandLineParser);
+  case Report:
+    return report(argc, argv, commandLineParser);
+  }
+  return 0; // Only reached if Cmd is not one of the known commands.
+}
+
+int CodeCoverageTool::show(int argc, const char **argv,
+                           CommandLineParserType commandLineParser) {
+
+  cl::OptionCategory ViewCategory("Viewing options");
+
+  cl::opt<bool> ShowLineExecutionCounts(
+      "show-line-counts", cl::Optional,
+      cl::desc("Show the execution counts for each line"), cl::init(true),
+      cl::cat(ViewCategory));
+
+  cl::opt<bool> ShowRegions(
+      "show-regions", cl::Optional,
+      cl::desc("Show the execution counts for each region"),
+      cl::cat(ViewCategory));
+
+  cl::opt<bool> ShowBestLineRegionsCounts(
+      "show-line-counts-or-regions", cl::Optional,
+      cl::desc("Show the execution counts for each line, or the execution "
+               "counts for each region on lines that have multiple regions"),
+      cl::cat(ViewCategory));
+
+  cl::opt<bool> ShowExpansions("show-expansions", cl::Optional,
+                               cl::desc("Show expanded source regions"),
+                               cl::cat(ViewCategory));
+
+  cl::opt<bool> ShowInstantiations("show-instantiations", cl::Optional,
+                                   cl::desc("Show function instantiations"),
+                                   cl::cat(ViewCategory));
+
+  cl::opt<bool> NoColors("no-colors", cl::Optional,
+                         cl::desc("Don't show text colors"), cl::init(false),
+                         cl::cat(ViewCategory));
+
+  auto Err = commandLineParser(argc, argv);
+  if (Err)
+    return Err;
+
+  ViewOpts.Colors = !NoColors;
+  ViewOpts.ShowLineNumbers = true;
+  ViewOpts.ShowLineStats = ShowLineExecutionCounts.getNumOccurrences() != 0 ||
+                           !ShowRegions || ShowBestLineRegionsCounts;
+  ViewOpts.ShowRegionMarkers = ShowRegions || ShowBestLineRegionsCounts;
+  ViewOpts.ShowLineStatsOrRegionMarkers = ShowBestLineRegionsCounts;
+  ViewOpts.ShowExpandedRegions = ShowExpansions;
+  ViewOpts.ShowFunctionInstantiations = ShowInstantiations;
+
+  auto Coverage = load();
+  if (!Coverage)
+    return 1;
+
+  if (!Filters.empty()) {
+    // Filters were given: render one view per function that matches them.
+    for (const auto &Function : Coverage->getCoveredFunctions()) {
+      if (!Filters.matches(Function))
+        continue;
+
+      auto mainView = createFunctionView(Function, *Coverage);
+      if (!mainView) {
+        ViewOpts.colored_ostream(outs(), raw_ostream::RED)
+            << "warning: Could not read coverage for '" << Function.Name << "'";
+        outs() << "\n";
+        continue;
+      }
+      ViewOpts.colored_ostream(outs(), raw_ostream::CYAN) << Function.Name
+                                                          << ":";
+      outs() << "\n";
+      mainView->render(outs(), /*WholeFile=*/false);
+      outs() << "\n";
+    }
+    return 0;
+  }
+
+  // No filters: render whole-file views. Size is sampled before the fallback.
+  bool ShowFilenames = SourceFiles.size() != 1;
+
+  if (SourceFiles.empty())
+    // Get the source files from the function coverage mapping
+    for (StringRef Filename : Coverage->getUniqueSourceFiles())
+      SourceFiles.push_back(Filename);
+
+  for (const auto &SourceFile : SourceFiles) {
+    auto mainView = createSourceFileView(SourceFile, *Coverage);
+    if (!mainView) {
+      ViewOpts.colored_ostream(outs(), raw_ostream::RED)
+          << "warning: The file '" << SourceFile << "' isn't covered.";
+      outs() << "\n";
+      continue;
+    }
+
+    if (ShowFilenames) {
+      ViewOpts.colored_ostream(outs(), raw_ostream::CYAN) << SourceFile << ":";
+      outs() << "\n";
+    }
+    mainView->render(outs(), /*WholeFile=*/true);
+    if (SourceFiles.size() > 1)
+      outs() << "\n";
+  }
+
+  return 0;
+}
+
+int CodeCoverageTool::report(int argc, const char **argv,
+                             CommandLineParserType commandLineParser) {
+  cl::opt<bool> NoColors("no-colors", cl::Optional,
+                         cl::desc("Don't show text colors"), cl::init(false));
+
+  auto Err = commandLineParser(argc, argv);
+  if (Err)
+    return Err;
+
+  ViewOpts.Colors = !NoColors;
+
+  auto Coverage = load();
+  if (!Coverage)
+    return 1;
+
+  CoverageSummary Summarizer;
+  Summarizer.createSummaries(*Coverage);
+  CoverageReport Report(ViewOpts, Summarizer);
+  if (SourceFiles.empty() && Filters.empty()) {
+    Report.renderFileReports(llvm::outs());
+    return 0;
+  }
+
+  Report.renderFunctionReports(llvm::outs());
+  return 0;
+}
+
+int showMain(int argc, const char *argv[]) {
+  CodeCoverageTool Tool;
+  return Tool.run(CodeCoverageTool::Show, argc, argv);
+}
+
+int reportMain(int argc, const char *argv[]) {
+  CodeCoverageTool Tool;
+  return Tool.run(CodeCoverageTool::Report, argc, argv);
+}

diff --git a/tools/llvm-cov/CoverageFilters.cpp b/tools/llvm-cov/CoverageFilters.cpp
new file mode 100644
index 0000000..325dd72
--- /dev/null
+++ b/tools/llvm-cov/CoverageFilters.cpp

@@ -0,0 +1,59 @@
+//===- CoverageFilters.cpp - Function coverage mapping filters ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// These classes provide filtering for function coverage mapping records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CoverageFilters.h"
+#include "CoverageSummaryInfo.h"
+#include "llvm/Support/Regex.h"
+
+using namespace llvm;
+
+bool NameCoverageFilter::matches(const coverage::FunctionRecord &Function) {
+  StringRef FuncName = Function.Name;
+  return FuncName.find(Name) != StringRef::npos;
+}
+
+bool
+NameRegexCoverageFilter::matches(const coverage::FunctionRecord &Function) {
+  return llvm::Regex(Regex).match(Function.Name);
+}
+
+bool RegionCoverageFilter::matches(const coverage::FunctionRecord &Function) {
+  return PassesThreshold(FunctionCoverageSummary::get(Function)
+                             .RegionCoverage.getPercentCovered());
+}
+
+bool LineCoverageFilter::matches(const coverage::FunctionRecord &Function) {
+  return PassesThreshold(
+      FunctionCoverageSummary::get(Function).LineCoverage.getPercentCovered());
+}
+
+void CoverageFilters::push_back(std::unique_ptr<CoverageFilter> Filter) {
+  Filters.push_back(std::move(Filter));
+}
+
+bool CoverageFilters::matches(const coverage::FunctionRecord &Function) {
+  for (const auto &Filter : Filters) {
+    if (Filter->matches(Function))
+      return true;
+  }
+  return false;
+}
+
+bool
+CoverageFiltersMatchAll::matches(const coverage::FunctionRecord &Function) {
+  for (const auto &Filter : Filters) {
+    if (!Filter->matches(Function))
+      return false;
+  }
+  return true;
+}

diff --git a/tools/llvm-cov/CoverageFilters.h b/tools/llvm-cov/CoverageFilters.h
new file mode 100644
index 0000000..e543005
--- /dev/null
+++ b/tools/llvm-cov/CoverageFilters.h

@@ -0,0 +1,127 @@
+//===- CoverageFilters.h - Function coverage mapping filters --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// These classes provide filtering for function coverage mapping records.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_COVERAGEFILTERS_H
+#define LLVM_COV_COVERAGEFILTERS_H
+
+#include "llvm/ProfileData/CoverageMapping.h"
+#include <vector>
+#include <memory>
+
+namespace llvm {
+
+/// \brief Matches specific functions that pass the requirement of this filter.
+class CoverageFilter {
+public:
+  virtual ~CoverageFilter() {}
+
+  /// \brief Return true if the function passes the requirements of this filter.
+  virtual bool matches(const coverage::FunctionRecord &Function) {
+    return true;
+  }
+};
+
+/// \brief Matches functions that contain a specific string in their name.
+class NameCoverageFilter : public CoverageFilter {
+  StringRef Name;
+
+public:
+  NameCoverageFilter(StringRef Name) : Name(Name) {}
+
+  bool matches(const coverage::FunctionRecord &Function) override;
+};
+
+/// \brief Matches functions whose name matches a certain regular expression.
+class NameRegexCoverageFilter : public CoverageFilter {
+  StringRef Regex;
+
+public:
+  NameRegexCoverageFilter(StringRef Regex) : Regex(Regex) {}
+
+  bool matches(const coverage::FunctionRecord &Function) override;
+};
+
+/// \brief Matches numbers that pass a certain threshold.
+template <typename T> class StatisticThresholdFilter {
+public:
+  enum Operation { LessThan, GreaterThan };
+
+protected:
+  Operation Op;
+  T Threshold;
+
+  StatisticThresholdFilter(Operation Op, T Threshold)
+      : Op(Op), Threshold(Threshold) {}
+
+  /// \brief Return true if the given number is less than
+  /// or greater than the certain threshold.
+  bool PassesThreshold(T Value) const {
+    switch (Op) {
+    case LessThan:
+      return Value < Threshold;
+    case GreaterThan:
+      return Value > Threshold;
+    }
+    return false;
+  }
+};
+
+/// \brief Matches functions whose region coverage percentage
+/// is above/below a certain percentage.
+class RegionCoverageFilter : public CoverageFilter,
+                             public StatisticThresholdFilter<double> {
+public:
+  RegionCoverageFilter(Operation Op, double Threshold)
+      : StatisticThresholdFilter(Op, Threshold) {}
+
+  bool matches(const coverage::FunctionRecord &Function) override;
+};
+
+/// \brief Matches functions whose line coverage percentage
+/// is above/below a certain percentage.
+class LineCoverageFilter : public CoverageFilter,
+                           public StatisticThresholdFilter<double> {
+public:
+  LineCoverageFilter(Operation Op, double Threshold)
+      : StatisticThresholdFilter(Op, Threshold) {}
+
+  bool matches(const coverage::FunctionRecord &Function) override;
+};
+
+/// \brief A collection of filters.
+/// Matches functions that match any filters contained
+/// in an instance of this class.
+class CoverageFilters : public CoverageFilter {
+protected:
+  std::vector<std::unique_ptr<CoverageFilter>> Filters;
+
+public:
+  /// \brief Append a filter to this collection.
+  void push_back(std::unique_ptr<CoverageFilter> Filter);
+
+  bool empty() const { return Filters.empty(); }
+
+  bool matches(const coverage::FunctionRecord &Function) override;
+};
+
+/// \brief A collection of filters.
+/// Matches functions that match all of the filters contained
+/// in an instance of this class.
+class CoverageFiltersMatchAll : public CoverageFilters {
+public:
+  bool matches(const coverage::FunctionRecord &Function) override;
+};
+
+} // namespace llvm
+
+#endif // LLVM_COV_COVERAGEFILTERS_H

diff --git a/tools/llvm-cov/CoverageReport.cpp b/tools/llvm-cov/CoverageReport.cpp
new file mode 100644
index 0000000..7ac9355
--- /dev/null
+++ b/tools/llvm-cov/CoverageReport.cpp

@@ -0,0 +1,202 @@
+//===- CoverageReport.cpp - Code coverage report -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements rendering of a code coverage report.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CoverageReport.h"
+#include "CoverageSummary.h"
+#include "RenderingSupport.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FileSystem.h"
+
+using namespace llvm;
+namespace {
+/// \brief Helper struct which prints trimmed and aligned columns.
+struct Column {
+  enum TrimKind { NoTrim, LeftTrim, RightTrim };
+
+  enum AlignmentKind { LeftAlignment, RightAlignment };
+
+  StringRef Str;
+  unsigned Width;
+  TrimKind Trim;
+  AlignmentKind Alignment;
+
+  Column(StringRef Str, unsigned Width)
+      : Str(Str), Width(Width), Trim(NoTrim), Alignment(LeftAlignment) {}
+
+  Column &set(TrimKind Value) {
+    Trim = Value;
+    return *this;
+  }
+
+  Column &set(AlignmentKind Value) {
+    Alignment = Value;
+    return *this;
+  }
+
+  void render(raw_ostream &OS) const;
+};
+raw_ostream &operator<<(raw_ostream &OS, const Column &Value) {
+  Value.render(OS);
+  return OS;
+}
+}
+
+void Column::render(raw_ostream &OS) const {
+  if (Str.size() <= Width) {
+    if (Alignment == RightAlignment) {
+      OS.indent(Width - Str.size());
+      OS << Str;
+      return;
+    }
+    OS << Str;
+    OS.indent(Width - Str.size());
+    return;
+  }
+
+  switch (Trim) {
+  case NoTrim:
+    OS << Str.substr(0, Width);
+    break;
+  case LeftTrim:
+    OS << "..." << Str.substr(Str.size() - Width + 3);
+    break;
+  case RightTrim:
+    OS << Str.substr(0, Width - 3) << "...";
+    break;
+  }
+}
+
+static Column column(StringRef Str, unsigned Width) {
+  return Column(Str, Width);
+}
+
+template <typename T>
+static Column column(StringRef Str, unsigned Width, const T &Value) {
+  return Column(Str, Width).set(Value);
+}
+
+static const unsigned FileReportColumns[] = {25, 10, 8, 8, 10, 10};
+static const unsigned FunctionReportColumns[] = {25, 10, 8, 8, 10, 8, 8};
+
+/// \brief Prints a horizontal divider which spans across the given columns.
+template <typename T, size_t N>
+static void renderDivider(T (&Columns)[N], raw_ostream &OS) {
+  unsigned Length = 0;
+  for (unsigned I = 0; I < N; ++I)
+    Length += Columns[I];
+  for (unsigned I = 0; I < Length; ++I)
+    OS << '-';
+}
+
+/// \brief Return the color which corresponds to the coverage
+/// percentage of a certain metric.
+template <typename T>
+static raw_ostream::Colors determineCoveragePercentageColor(const T &Info) {
+  if (Info.isFullyCovered())
+    return raw_ostream::GREEN;
+  return Info.getPercentCovered() >= 80.0 ? raw_ostream::YELLOW
+                                          : raw_ostream::RED;
+}
+
+void CoverageReport::render(const FileCoverageSummary &File, raw_ostream &OS) {
+  OS << column(File.Name, FileReportColumns[0], Column::LeftTrim)
+     << format("%*u", FileReportColumns[1], (unsigned)File.RegionCoverage.NumRegions);
+  Options.colored_ostream(OS, File.RegionCoverage.isFullyCovered()
+                                  ? raw_ostream::GREEN
+                                  : raw_ostream::RED)
+      << format("%*u", FileReportColumns[2], (unsigned)File.RegionCoverage.NotCovered);
+  Options.colored_ostream(OS,
+                          determineCoveragePercentageColor(File.RegionCoverage))
+      << format("%*.2f", FileReportColumns[3] - 1,
+                File.RegionCoverage.getPercentCovered()) << '%';
+  OS << format("%*u", FileReportColumns[4],
+               (unsigned)File.FunctionCoverage.NumFunctions);
+  Options.colored_ostream(
+      OS, determineCoveragePercentageColor(File.FunctionCoverage))
+      << format("%*.2f", FileReportColumns[5] - 1,
+                File.FunctionCoverage.getPercentCovered()) << '%';
+  OS << "\n";
+}
+
+void CoverageReport::render(const FunctionCoverageSummary &Function,
+                            raw_ostream &OS) {
+  OS << column(Function.Name, FunctionReportColumns[0], Column::RightTrim)
+     << format("%*u", FunctionReportColumns[1],
+               (unsigned)Function.RegionCoverage.NumRegions);
+  Options.colored_ostream(OS, Function.RegionCoverage.isFullyCovered()
+                                  ? raw_ostream::GREEN
+                                  : raw_ostream::RED)
+      << format("%*u", FunctionReportColumns[2],
+                (unsigned)Function.RegionCoverage.NotCovered);
+  Options.colored_ostream(
+      OS, determineCoveragePercentageColor(Function.RegionCoverage))
+      << format("%*.2f", FunctionReportColumns[3] - 1,
+                Function.RegionCoverage.getPercentCovered()) << '%';
+  OS << format("%*u", FunctionReportColumns[4],
+               (unsigned)Function.LineCoverage.NumLines);
+  Options.colored_ostream(OS, Function.LineCoverage.isFullyCovered()
+                                  ? raw_ostream::GREEN
+                                  : raw_ostream::RED)
+      << format("%*u", FunctionReportColumns[5],
+                (unsigned)Function.LineCoverage.NotCovered);
+  Options.colored_ostream(
+      OS, determineCoveragePercentageColor(Function.LineCoverage))
+      << format("%*.2f", FunctionReportColumns[6] - 1,
+                Function.LineCoverage.getPercentCovered()) << '%';
+  OS << "\n";
+}
+
+void CoverageReport::renderFunctionReports(raw_ostream &OS) {
+  bool isFirst = true;
+  for (const auto &File : Summary.getFileSummaries()) {
+    if (isFirst)
+      isFirst = false;
+    else
+      OS << "\n";
+    OS << "File '" << File.Name << "':\n";
+    OS << column("Name", FunctionReportColumns[0])
+       << column("Regions", FunctionReportColumns[1], Column::RightAlignment)
+       << column("Miss", FunctionReportColumns[2], Column::RightAlignment)
+       << column("Cover", FunctionReportColumns[3], Column::RightAlignment)
+       << column("Lines", FunctionReportColumns[4], Column::RightAlignment)
+       << column("Miss", FunctionReportColumns[5], Column::RightAlignment)
+       << column("Cover", FunctionReportColumns[6], Column::RightAlignment);
+    OS << "\n";
+    renderDivider(FunctionReportColumns, OS);
+    OS << "\n";
+    for (const auto &Function : File.FunctionSummaries)
+      render(Function, OS);
+    renderDivider(FunctionReportColumns, OS);
+    OS << "\n";
+    render(FunctionCoverageSummary("TOTAL", /*ExecutionCount=*/0,
+                                   File.RegionCoverage, File.LineCoverage),
+           OS);
+  }
+}
+
+void CoverageReport::renderFileReports(raw_ostream &OS) {
+  OS << column("Filename", FileReportColumns[0])
+     << column("Regions", FileReportColumns[1], Column::RightAlignment)
+     << column("Miss", FileReportColumns[2], Column::RightAlignment)
+     << column("Cover", FileReportColumns[3], Column::RightAlignment)
+     << column("Functions", FileReportColumns[4], Column::RightAlignment)
+     << column("Executed", FileReportColumns[5], Column::RightAlignment)
+     << "\n";
+  renderDivider(FileReportColumns, OS);
+  OS << "\n";
+  for (const auto &File : Summary.getFileSummaries())
+    render(File, OS);
+  renderDivider(FileReportColumns, OS);
+  OS << "\n";
+  render(Summary.getCombinedFileSummaries(), OS);
+}

diff --git a/tools/llvm-cov/CoverageReport.h b/tools/llvm-cov/CoverageReport.h
new file mode 100644
index 0000000..e8d34f2
--- /dev/null
+++ b/tools/llvm-cov/CoverageReport.h

@@ -0,0 +1,40 @@
+//===- CoverageReport.h - Code coverage report ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements rendering of a code coverage report.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_COVERAGEREPORT_H
+#define LLVM_COV_COVERAGEREPORT_H
+
+#include "CoverageViewOptions.h"
+#include "CoverageSummary.h"
+
+namespace llvm {
+
+/// \brief Displays the code coverage report.
+///
+/// The object only stores references to the view options and to the coverage
+/// summary, so both must outlive it. All output goes to the raw_ostream that
+/// the caller passes to the render* entry points.
+class CoverageReport {
+  const CoverageViewOptions &Options;
+  CoverageSummary &Summary;
+
+  /// \brief Render the report entry for a single file summary.
+  void render(const FileCoverageSummary &File, raw_ostream &OS);
+  /// \brief Render the report entry for a single function summary.
+  void render(const FunctionCoverageSummary &Function, raw_ostream &OS);
+
+public:
+  /// \brief Bind the report to \p Options and \p Summary without copying them.
+  CoverageReport(const CoverageViewOptions &Options, CoverageSummary &Summary)
+      : Options(Options), Summary(Summary) {}
+
+  /// \brief Write, for every file, a table of its function summaries to \p OS.
+  void renderFunctionReports(raw_ostream &OS);
+
+  /// \brief Write one table with an entry per file plus a combined entry to
+  /// \p OS.
+  void renderFileReports(raw_ostream &OS);
+};
+}
+
+#endif // LLVM_COV_COVERAGEREPORT_H

diff --git a/tools/llvm-cov/CoverageSummary.cpp b/tools/llvm-cov/CoverageSummary.cpp
new file mode 100644
index 0000000..059c8c8
--- /dev/null
+++ b/tools/llvm-cov/CoverageSummary.cpp

@@ -0,0 +1,64 @@
+//===- CoverageSummary.cpp - Code coverage summary ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements data management and rendering for the code coverage
+// summaries of all files and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CoverageSummary.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+/// \brief Return the index of \p Filename in Filenames. If no registered name
+/// is equivalent to it according to sys::fs::equivalent, the name is appended
+/// first and the index of the new entry is returned.
+unsigned CoverageSummary::getFileID(StringRef Filename) {
+  unsigned Index = 0;
+  const unsigned NumKnown = Filenames.size();
+  while (Index != NumKnown) {
+    if (sys::fs::equivalent(Filenames[Index], Filename))
+      return Index;
+    ++Index;
+  }
+  // Not seen before: register it; its ID is the slot it now occupies.
+  Filenames.push_back(Filename);
+  return Filenames.size() - 1;
+}
+
+/// \brief Build the function and file summaries for every source file in
+/// \p Coverage that has at least one covered function.
+///
+/// Each FileCoverageSummary keeps an ArrayRef into FunctionSummaries. Those
+/// views are therefore created only after the vector has stopped growing:
+/// taking `data()` while later files still push_back would leave earlier
+/// views dangling as soon as the vector reallocates.
+///
+/// NOTE(review): file summaries produced by an earlier call still point into
+/// the old storage, so this looks intended to be called once per object;
+/// confirm against the callers.
+void
+CoverageSummary::createSummaries(const coverage::CoverageMapping &Coverage) {
+  // Where one file's function summaries live inside FunctionSummaries.
+  struct FileSpan {
+    StringRef Filename;
+    size_t Begin;
+    size_t Count;
+  };
+  std::vector<FileSpan> Spans;
+
+  // Pass 1: collect all function summaries and remember each file's range by
+  // index (indices stay valid across reallocation, pointers do not).
+  for (StringRef Filename : Coverage.getUniqueSourceFiles()) {
+    size_t PrevSize = FunctionSummaries.size();
+    for (const auto &F : Coverage.getCoveredFunctions(Filename))
+      FunctionSummaries.push_back(FunctionCoverageSummary::get(F));
+    size_t Count = FunctionSummaries.size() - PrevSize;
+    // Files without any covered function get no file summary.
+    if (Count == 0)
+      continue;
+    FileSpan Span = {Filename, PrevSize, Count};
+    Spans.push_back(Span);
+  }
+
+  // Pass 2: FunctionSummaries is final now, so views into it are stable.
+  for (const FileSpan &Span : Spans)
+    FileSummaries.push_back(FileCoverageSummary::get(
+        Span.Filename,
+        makeArrayRef(FunctionSummaries.data() + Span.Begin, Span.Count)));
+}
+
+/// \brief Add up the region, line and function counters of every file summary
+/// and return them as one summary named "TOTAL" with no function list.
+FileCoverageSummary CoverageSummary::getCombinedFileSummaries() {
+  size_t TotalRegions = 0, TotalCoveredRegions = 0;
+  size_t TotalLines = 0, TotalNonCodeLines = 0, TotalCoveredLines = 0;
+  size_t TotalFunctions = 0, TotalExecutedFunctions = 0;
+
+  for (const FileCoverageSummary &FS : FileSummaries) {
+    const RegionCoverageInfo &Regions = FS.RegionCoverage;
+    TotalCoveredRegions += Regions.Covered;
+    TotalRegions += Regions.NumRegions;
+
+    const LineCoverageInfo &Lines = FS.LineCoverage;
+    TotalCoveredLines += Lines.Covered;
+    TotalNonCodeLines += Lines.NonCodeLines;
+    TotalLines += Lines.NumLines;
+
+    const FunctionCoverageInfo &Functions = FS.FunctionCoverage;
+    TotalExecutedFunctions += Functions.Executed;
+    TotalFunctions += Functions.NumFunctions;
+  }
+
+  RegionCoverageInfo RegionTotals(TotalCoveredRegions, TotalRegions);
+  LineCoverageInfo LineTotals(TotalCoveredLines, TotalNonCodeLines,
+                              TotalLines);
+  FunctionCoverageInfo FunctionTotals(TotalExecutedFunctions, TotalFunctions);
+  return FileCoverageSummary("TOTAL", RegionTotals, LineTotals,
+                             FunctionTotals, None);
+}

diff --git a/tools/llvm-cov/CoverageSummary.h b/tools/llvm-cov/CoverageSummary.h
new file mode 100644
index 0000000..9dbebde
--- /dev/null
+++ b/tools/llvm-cov/CoverageSummary.h

@@ -0,0 +1,45 @@
+//===- CoverageSummary.h - Code coverage summary --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements data management and rendering for the code coverage
+// summaries of all files and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_COVERAGESUMMARY_H
+#define LLVM_COV_COVERAGESUMMARY_H
+
+#include "CoverageSummaryInfo.h"
+#include <vector>
+
+namespace llvm {
+
+/// \brief Manager for the function and file code coverage summaries.
+///
+/// Owns the storage for all function summaries; the file summaries refer to
+/// ranges of that storage through ArrayRef, so they are only valid while this
+/// object is alive.
+class CoverageSummary {
+  /// File names registered through getFileID, indexed by file ID.
+  std::vector<StringRef> Filenames;
+  /// Function summaries of all files, stored back to back.
+  std::vector<FunctionCoverageSummary> FunctionSummaries;
+  // NOTE(review): not referenced by any member function defined in
+  // CoverageSummary.cpp -- possibly a leftover; confirm before removing.
+  std::vector<std::pair<unsigned, unsigned>> FunctionSummariesFileIDs;
+  /// One summary per source file that has at least one covered function.
+  std::vector<FileCoverageSummary> FileSummaries;
+
+  /// \brief Return the ID of \p Filename, registering it if it is new.
+  unsigned getFileID(StringRef Filename);
+
+public:
+  /// \brief Populate the function and file summaries from \p Coverage.
+  void createSummaries(const coverage::CoverageMapping &Coverage);
+
+  /// \brief View of the per-file summaries built by createSummaries.
+  ArrayRef<FileCoverageSummary> getFileSummaries() { return FileSummaries; }
+
+  /// \brief Return a single "TOTAL" summary that adds up all file summaries.
+  FileCoverageSummary getCombinedFileSummaries();
+
+  // NOTE(review): neither render overload below is defined in
+  // CoverageSummary.cpp; presumably rendering lives in CoverageReport --
+  // verify that nothing calls these.
+  void render(const FunctionCoverageSummary &Summary, raw_ostream &OS);
+
+  void render(raw_ostream &OS);
+};
+}
+
+#endif // LLVM_COV_COVERAGESUMMARY_H

diff --git a/tools/llvm-cov/CoverageSummaryInfo.cpp b/tools/llvm-cov/CoverageSummaryInfo.cpp
new file mode 100644
index 0000000..dd78ace
--- /dev/null
+++ b/tools/llvm-cov/CoverageSummaryInfo.cpp

@@ -0,0 +1,96 @@
+//===- CoverageSummaryInfo.cpp - Coverage summary for function/file -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// These structures are used to represent code coverage metrics
+// for functions/files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CoverageSummaryInfo.h"
+
+using namespace llvm;
+using namespace coverage;
+
+/// \brief Compute the region and line coverage summary of \p Function.
+///
+/// Region coverage counts only regions of kind CodeRegion. Line coverage is
+/// computed separately for each file the function has regions in: the lines
+/// between the lowest region start and the highest region end are considered,
+/// lines inside skipped regions are excluded from the total, and a line is
+/// covered when the count last written to it is non-zero.
+FunctionCoverageSummary
+FunctionCoverageSummary::get(const coverage::FunctionRecord &Function) {
+  // Compute the region coverage
+  size_t NumCodeRegions = 0, CoveredRegions = 0;
+  for (auto &CR : Function.CountedRegions) {
+    if (CR.Kind != CounterMappingRegion::CodeRegion)
+      continue;
+    ++NumCodeRegions;
+    if (CR.ExecutionCount != 0)
+      ++CoveredRegions;
+  }
+
+  // Compute the line coverage
+  size_t NumLines = 0, CoveredLines = 0;
+  for (unsigned FileID = 0, E = Function.Filenames.size(); FileID < E;
+       ++FileID) {
+    // Find the line start and end of the function's source code
+    // in that particular file
+    unsigned LineStart = std::numeric_limits<unsigned>::max();
+    unsigned LineEnd = 0;
+    for (auto &CR : Function.CountedRegions) {
+      if (CR.FileID != FileID)
+        continue;
+      LineStart = std::min(LineStart, CR.LineStart);
+      LineEnd = std::max(LineEnd, CR.LineEnd);
+    }
+    // No region of this function lies in this file. Without this check the
+    // unsigned expression LineEnd - LineStart + 1 below wraps around to 2 and
+    // two phantom uncovered lines would be added to the totals.
+    if (LineStart > LineEnd)
+      continue;
+    unsigned LineCount = LineEnd - LineStart + 1;
+
+    // Get counters
+    llvm::SmallVector<uint64_t, 16> ExecutionCounts;
+    ExecutionCounts.resize(LineCount, 0);
+    for (auto &CR : Function.CountedRegions) {
+      if (CR.FileID != FileID)
+        continue;
+      // Ignore the lines that were skipped by the preprocessor: remove them
+      // from the total and mark them non-zero so that the zero count below
+      // does not report them as uncovered.
+      auto ExecutionCount = CR.ExecutionCount;
+      if (CR.Kind == CounterMappingRegion::SkippedRegion) {
+        LineCount -= CR.LineEnd - CR.LineStart + 1;
+        ExecutionCount = 1;
+      }
+      // Later regions overwrite the counts written by earlier ones.
+      for (unsigned I = CR.LineStart; I <= CR.LineEnd; ++I)
+        ExecutionCounts[I - LineStart] = ExecutionCount;
+    }
+    // Covered lines = counted lines minus the lines whose count stayed zero.
+    CoveredLines += LineCount - std::count(ExecutionCounts.begin(),
+                                           ExecutionCounts.end(), 0);
+    NumLines += LineCount;
+  }
+  // Non-code lines are not tracked per function, hence the 0.
+  return FunctionCoverageSummary(
+      Function.Name, Function.ExecutionCount,
+      RegionCoverageInfo(CoveredRegions, NumCodeRegions),
+      LineCoverageInfo(CoveredLines, 0, NumLines));
+}
+
+/// \brief Build the summary of the file \p Name by adding up the region and
+/// line counters of \p FunctionSummaries. A function counts as executed when
+/// its execution count is non-zero. The returned summary keeps a view of
+/// \p FunctionSummaries rather than a copy.
+FileCoverageSummary
+FileCoverageSummary::get(StringRef Name,
+                         ArrayRef<FunctionCoverageSummary> FunctionSummaries) {
+  size_t RegionTotal = 0, RegionHits = 0;
+  size_t LineTotal = 0, LineNonCode = 0, LineHits = 0;
+  size_t ExecutedFunctions = 0;
+
+  for (const FunctionCoverageSummary &FS : FunctionSummaries) {
+    const RegionCoverageInfo &Regions = FS.RegionCoverage;
+    RegionTotal += Regions.NumRegions;
+    RegionHits += Regions.Covered;
+
+    const LineCoverageInfo &Lines = FS.LineCoverage;
+    LineTotal += Lines.NumLines;
+    LineNonCode += Lines.NonCodeLines;
+    LineHits += Lines.Covered;
+
+    ExecutedFunctions += (FS.ExecutionCount != 0) ? 1 : 0;
+  }
+
+  RegionCoverageInfo RegionInfo(RegionHits, RegionTotal);
+  LineCoverageInfo LineInfo(LineHits, LineNonCode, LineTotal);
+  FunctionCoverageInfo FunctionInfo(ExecutedFunctions,
+                                    FunctionSummaries.size());
+  return FileCoverageSummary(Name, RegionInfo, LineInfo, FunctionInfo,
+                             FunctionSummaries);
+}

diff --git a/tools/llvm-cov/CoverageSummaryInfo.h b/tools/llvm-cov/CoverageSummaryInfo.h
new file mode 100644
index 0000000..0036032
--- /dev/null
+++ b/tools/llvm-cov/CoverageSummaryInfo.h

@@ -0,0 +1,133 @@
+//===- CoverageSummaryInfo.h - Coverage summary for function/file ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// These structures are used to represent code coverage metrics
+// for functions/files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_COVERAGESUMMARYINFO_H
+#define LLVM_COV_COVERAGESUMMARYINFO_H
+
+#include "llvm/ProfileData/CoverageMapping.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// \brief Provides information about region coverage for a function/file.
+struct RegionCoverageInfo {
+  /// \brief The number of regions that were executed at least once.
+  size_t Covered;
+
+  /// \brief The number of regions that weren't executed.
+  size_t NotCovered;
+
+  /// \brief The total number of regions in a function/file.
+  size_t NumRegions;
+
+  /// \brief Store the counters; NotCovered is derived as
+  /// \p NumRegions - \p Covered.
+  RegionCoverageInfo(size_t Covered, size_t NumRegions)
+      : Covered(Covered), NotCovered(NumRegions - Covered),
+        NumRegions(NumRegions) {}
+
+  /// \brief True when every region was executed (also when there are none).
+  bool isFullyCovered() const { return Covered == NumRegions; }
+
+  /// \brief Percentage of regions that were executed, in the range [0, 100].
+  /// Returns 0 when there are no regions, instead of the NaN that 0/0 yields.
+  double getPercentCovered() const {
+    if (NumRegions == 0)
+      return 0.0;
+    return double(Covered) / double(NumRegions) * 100.0;
+  }
+};
+
+/// \brief Provides information about line coverage for a function/file.
+struct LineCoverageInfo {
+  /// \brief The number of lines that were executed at least once.
+  size_t Covered;
+
+  /// \brief The number of lines that weren't executed.
+  size_t NotCovered;
+
+  /// \brief The number of lines that aren't code.
+  size_t NonCodeLines;
+
+  /// \brief The total number of lines in a function/file.
+  size_t NumLines;
+
+  /// \brief Store the counters; NotCovered is derived as
+  /// \p NumLines - \p NumNonCodeLines - \p Covered.
+  LineCoverageInfo(size_t Covered, size_t NumNonCodeLines, size_t NumLines)
+      : Covered(Covered), NotCovered(NumLines - NumNonCodeLines - Covered),
+        NonCodeLines(NumNonCodeLines), NumLines(NumLines) {}
+
+  /// \brief True when every code line was executed (also when there are none).
+  bool isFullyCovered() const { return Covered == (NumLines - NonCodeLines); }
+
+  /// \brief Percentage of code lines (total minus non-code lines) that were
+  /// executed. Returns 0 when there are no code lines, instead of the NaN
+  /// that 0/0 yields.
+  double getPercentCovered() const {
+    const size_t CodeLines = NumLines - NonCodeLines;
+    if (CodeLines == 0)
+      return 0.0;
+    return double(Covered) / double(CodeLines) * 100.0;
+  }
+};
+
+/// \brief Provides information about function coverage for a file.
+struct FunctionCoverageInfo {
+  /// \brief The number of functions that were executed.
+  size_t Executed;
+
+  /// \brief The total number of functions in this file.
+  size_t NumFunctions;
+
+  /// \brief Store the executed and total function counts.
+  FunctionCoverageInfo(size_t Executed, size_t NumFunctions)
+      : Executed(Executed), NumFunctions(NumFunctions) {}
+
+  /// \brief True when every function was executed (also when there are none).
+  bool isFullyCovered() const { return Executed == NumFunctions; }
+
+  /// \brief Percentage of functions that were executed. Returns 0 when there
+  /// are no functions, instead of the NaN that 0/0 yields.
+  double getPercentCovered() const {
+    if (NumFunctions == 0)
+      return 0.0;
+    return double(Executed) / double(NumFunctions) * 100.0;
+  }
+};
+
+/// \brief A summary of function's code coverage.
+///
+/// Name is a StringRef, so the summary does not own the function name; the
+/// string it refers to must outlive the summary.
+struct FunctionCoverageSummary {
+  /// The function's name.
+  StringRef Name;
+  /// The function's execution count; zero means it was never executed.
+  uint64_t ExecutionCount;
+  /// Region counters of this function.
+  RegionCoverageInfo RegionCoverage;
+  /// Line counters of this function.
+  LineCoverageInfo LineCoverage;
+
+  /// \brief Member-wise constructor; the coverage infos are copied.
+  FunctionCoverageSummary(StringRef Name, uint64_t ExecutionCount,
+                          const RegionCoverageInfo &RegionCoverage,
+                          const LineCoverageInfo &LineCoverage)
+      : Name(Name), ExecutionCount(ExecutionCount),
+        RegionCoverage(RegionCoverage), LineCoverage(LineCoverage) {
+  }
+
+  /// \brief Compute the code coverage summary for the given function coverage
+  /// mapping record.
+  static FunctionCoverageSummary
+  get(const coverage::FunctionRecord &Function);
+};
+
+/// \brief A summary of file's code coverage.
+///
+/// Both Name and FunctionSummaries are non-owning views (StringRef and
+/// ArrayRef); whoever creates the summary must keep the underlying string and
+/// array alive for as long as the summary is used.
+struct FileCoverageSummary {
+  /// The file's name.
+  StringRef Name;
+  /// Region counters of this file.
+  RegionCoverageInfo RegionCoverage;
+  /// Line counters of this file.
+  LineCoverageInfo LineCoverage;
+  /// Executed and total function counts of this file.
+  FunctionCoverageInfo FunctionCoverage;
+  /// \brief The summary of every function
+  /// in this file.
+  ArrayRef<FunctionCoverageSummary> FunctionSummaries;
+
+  /// \brief Member-wise constructor; the coverage infos are copied, the
+  /// function summaries are only referenced.
+  FileCoverageSummary(StringRef Name, const RegionCoverageInfo &RegionCoverage,
+                      const LineCoverageInfo &LineCoverage,
+                      const FunctionCoverageInfo &FunctionCoverage,
+                      ArrayRef<FunctionCoverageSummary> FunctionSummaries)
+      : Name(Name), RegionCoverage(RegionCoverage), LineCoverage(LineCoverage),
+        FunctionCoverage(FunctionCoverage),
+        FunctionSummaries(FunctionSummaries) {}
+
+  /// \brief Compute the code coverage summary for a file.
+  static FileCoverageSummary
+  get(StringRef Name, ArrayRef<FunctionCoverageSummary> FunctionSummaries);
+};
+
+} // namespace llvm
+
+#endif // LLVM_COV_COVERAGESUMMARYINFO_H

diff --git a/tools/llvm-cov/CoverageViewOptions.h b/tools/llvm-cov/CoverageViewOptions.h
new file mode 100644
index 0000000..94b55fe
--- /dev/null
+++ b/tools/llvm-cov/CoverageViewOptions.h

@@ -0,0 +1,36 @@
+//===- CoverageViewOptions.h - Code coverage display options -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_COVERAGEVIEWOPTIONS_H
+#define LLVM_COV_COVERAGEVIEWOPTIONS_H
+
+#include "RenderingSupport.h"
+
+namespace llvm {
+
+/// \brief The options for displaying the code coverage information.
+///
+/// Plain aggregate: the struct declares no constructor or default member
+/// values, so every flag must be set by whoever creates it.
+struct CoverageViewOptions {
+  /// Print additional diagnostics (highlighted ranges, markers, expansions)
+  /// to errs() while rendering.
+  bool Debug;
+  /// Allow the output stream's color to be changed.
+  bool Colors;
+  /// Render the line number column.
+  bool ShowLineNumbers;
+  /// Render the per-line execution count column.
+  bool ShowLineStats;
+  /// Render the region marker line below source lines.
+  bool ShowRegionMarkers;
+  /// When set together with ShowRegionMarkers, markers are rendered only for
+  /// lines that have more than one region.
+  bool ShowLineStatsOrRegionMarkers;
+  // NOTE(review): the two flags below are not read by the rendering code;
+  // presumably the tool driver uses them to decide which subviews to create
+  // -- confirm.
+  bool ShowExpandedRegions;
+  bool ShowFunctionInstantiations;
+
+  /// \brief Change the output's stream color if the colors are enabled.
+  /// The returned object resets the color again when it is destroyed.
+  ColoredRawOstream colored_ostream(raw_ostream &OS,
+                                    raw_ostream::Colors Color) const {
+    return llvm::colored_ostream(OS, Color, Colors);
+  }
+};
+}
+
+#endif // LLVM_COV_COVERAGEVIEWOPTIONS_H

diff --git a/tools/llvm-cov/LLVMBuild.txt b/tools/llvm-cov/LLVMBuild.txt
index 87e00d1..d6eb74d 100644
--- a/tools/llvm-cov/LLVMBuild.txt
+++ b/tools/llvm-cov/LLVMBuild.txt

@@ -19,4 +19,4 @@
 type = Tool
 name = llvm-cov
 parent = Tools
-required_libraries = Instrumentation
+required_libraries = ProfileData Support Instrumentation

diff --git a/tools/llvm-cov/Makefile b/tools/llvm-cov/Makefile
index efed6cc..6e32b4d 100644
--- a/tools/llvm-cov/Makefile
+++ b/tools/llvm-cov/Makefile

@@ -9,7 +9,7 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-cov
-LINK_COMPONENTS := core support
+LINK_COMPONENTS := core support profiledata object
 
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS := 1

diff --git a/tools/llvm-cov/RenderingSupport.h b/tools/llvm-cov/RenderingSupport.h
new file mode 100644
index 0000000..0271329
--- /dev/null
+++ b/tools/llvm-cov/RenderingSupport.h

@@ -0,0 +1,60 @@
+//===- RenderingSupport.h - output stream rendering support functions  ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_RENDERINGSUPPORT_H
+#define LLVM_COV_RENDERINGSUPPORT_H
+
+#include "llvm/Support/raw_ostream.h"
+#include <utility>
+
+namespace llvm {
+
+/// \brief A helper class that resets the output stream's color if needed
+/// when destroyed.
+///
+/// The class is move-only: copying is deleted, and moving transfers the
+/// responsibility for the reset to the new object.
+class ColoredRawOstream {
+  // Copying is disabled so that only one object resets the color.
+  ColoredRawOstream(const ColoredRawOstream &OS) LLVM_DELETED_FUNCTION;
+
+public:
+  /// The wrapped stream; it is referenced, not owned.
+  raw_ostream &OS;
+  /// Whether the destructor has to call resetColor() on the stream.
+  bool IsColorUsed;
+
+  /// \brief Wrap \p OS. This constructor does not change the stream's color;
+  /// it only records whether a reset is needed on destruction.
+  ColoredRawOstream(raw_ostream &OS, bool IsColorUsed)
+      : OS(OS), IsColorUsed(IsColorUsed) {}
+
+  /// \brief Take over the stream and the reset duty from \p Other.
+  ColoredRawOstream(ColoredRawOstream &&Other)
+      : OS(Other.OS), IsColorUsed(Other.IsColorUsed) {
+    // Reset the other IsColorUsed so that the other object won't reset the
+    // color when destroyed.
+    Other.IsColorUsed = false;
+  }
+
+  /// \brief Reset the stream's color if this object is responsible for it.
+  ~ColoredRawOstream() {
+    if (IsColorUsed)
+      OS.resetColor();
+  }
+};
+
+/// \brief Forward \p Value to the stream wrapped by \p OS.
+///
+/// The result is the underlying raw_ostream, so chained insertions write to
+/// it directly. Taking the wrapper by const reference lets the operator be
+/// applied to the temporary returned by colored_ostream().
+template <typename T>
+inline raw_ostream &operator<<(const ColoredRawOstream &OS, T &&Value) {
+  return OS.OS << std::forward<T>(Value);
+}
+
+/// \brief Wrap \p OS in a ColoredRawOstream. When \p IsColorUsed is true the
+/// stream is first switched to \p Color (with the given \p Bold and \p BG
+/// settings) and the returned object resets the color when it is destroyed;
+/// otherwise the stream is left untouched.
+inline ColoredRawOstream colored_ostream(raw_ostream &OS,
+                                         raw_ostream::Colors Color,
+                                         bool IsColorUsed = true,
+                                         bool Bold = false, bool BG = false) {
+  // Nothing to change and nothing to undo later.
+  if (!IsColorUsed)
+    return ColoredRawOstream(OS, false);
+
+  OS.changeColor(Color, Bold, BG);
+  return ColoredRawOstream(OS, true);
+}
+}
+
+#endif // LLVM_COV_RENDERINGSUPPORT_H

diff --git a/tools/llvm-cov/SourceCoverageView.cpp b/tools/llvm-cov/SourceCoverageView.cpp
new file mode 100644
index 0000000..015099c
--- /dev/null
+++ b/tools/llvm-cov/SourceCoverageView.cpp

@@ -0,0 +1,260 @@
+//===- SourceCoverageView.cpp - Code coverage view for source code --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements rendering for code coverage of source code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SourceCoverageView.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/LineIterator.h"
+
+using namespace llvm;
+
+/// Render one source line followed by a newline, optionally highlighting parts
+/// of it with a background color.
+///
+/// \param Line the text of the line.
+/// \param LineNumber only used in the debug output.
+/// \param WrappedSegment the segment still in effect at the start of the
+///        line, or null; a zero count starts the line highlighted in red.
+/// \param Segments the segments that start on this line; each one ends the
+///        previous highlight at its column and selects the next one.
+/// \param ExpansionCol a segment column equal to this value selects the cyan
+///        highlight instead.
+void SourceCoverageView::renderLine(
+    raw_ostream &OS, StringRef Line, int64_t LineNumber,
+    const coverage::CoverageSegment *WrappedSegment,
+    ArrayRef<const coverage::CoverageSegment *> Segments,
+    unsigned ExpansionCol) {
+  Optional<raw_ostream::Colors> Highlight;
+  SmallVector<std::pair<unsigned, unsigned>, 2> HighlightedRanges;
+
+  // The first segment overlaps from a previous line, so we treat it specially.
+  if (WrappedSegment && WrappedSegment->HasCount && WrappedSegment->Count == 0)
+    Highlight = raw_ostream::RED;
+
+  // Output each segment of the line, possibly highlighted.
+  // Col is the 1-based column of the next character to print.
+  unsigned Col = 1;
+  for (const auto *S : Segments) {
+    // Clamp the segment column to one past the end of the line.
+    unsigned End = std::min(S->Col, static_cast<unsigned>(Line.size()) + 1);
+    // Print the text before this segment using the highlight chosen so far.
+    colored_ostream(OS, Highlight ? *Highlight : raw_ostream::SAVEDCOLOR,
+                    Options.Colors && Highlight, /*Bold=*/false, /*BG=*/true)
+        << Line.substr(Col - 1, End - Col);
+    if (Options.Debug && Highlight)
+      HighlightedRanges.push_back(std::make_pair(Col, End));
+    Col = End;
+    // Choose the highlight for the text that follows this segment's column.
+    if (Col == ExpansionCol)
+      Highlight = raw_ostream::CYAN;
+    else if (S->HasCount && S->Count == 0)
+      Highlight = raw_ostream::RED;
+    else
+      Highlight = None;
+  }
+
+  // Show the rest of the line
+  colored_ostream(OS, Highlight ? *Highlight : raw_ostream::SAVEDCOLOR,
+                  Options.Colors && Highlight, /*Bold=*/false, /*BG=*/true)
+      << Line.substr(Col - 1, Line.size() - Col + 1);
+  OS << "\n";
+
+  // In debug mode, report every highlighted range on errs().
+  if (Options.Debug) {
+    for (const auto &Range : HighlightedRanges)
+      errs() << "Highlighted line " << LineNumber << ", " << Range.first
+             << " -> " << Range.second << "\n";
+    if (Highlight)
+      errs() << "Highlighted line " << LineNumber << ", " << Col << " -> ?\n";
+  }
+}
+
+/// Write the nesting prefix "  |" to \p OS once per indentation level.
+void SourceCoverageView::renderIndent(raw_ostream &OS, unsigned Level) {
+  unsigned Remaining = Level;
+  while (Remaining != 0) {
+    OS << "  |";
+    --Remaining;
+  }
+}
+
+/// Write a horizontal divider for a subview at nesting level \p Level: the
+/// indentation of the parent level, two spaces, then \p Length dashes. No
+/// newline is written. \p Level must not be zero.
+void SourceCoverageView::renderViewDivider(unsigned Level, unsigned Length,
+                                           raw_ostream &OS) {
+  assert(Level != 0 && "Cannot render divider at top level");
+  const unsigned ParentLevel = Level - 1;
+  renderIndent(OS, ParentLevel);
+  OS.indent(2);
+  unsigned Drawn = 0;
+  while (Drawn < Length) {
+    OS << "-";
+    ++Drawn;
+  }
+}
+
+/// Write the execution count column for one line, terminated by '|'.
+/// Unmapped lines get a blank column. Otherwise the count is cut to the column
+/// width, right-aligned, and printed in magenta when the line has multiple
+/// regions and colors are enabled.
+void
+SourceCoverageView::renderLineCoverageColumn(raw_ostream &OS,
+                                             const LineCoverageInfo &Line) {
+  const size_t Width = LineCoverageColumnWidth;
+  if (Line.isMapped()) {
+    SmallString<32> CountBuffer;
+    raw_svector_ostream CountOS(CountBuffer);
+    CountOS << Line.ExecutionCount;
+    StringRef Count = CountOS.str();
+    // Keep at most Width leading characters of the count.
+    Count = Count.substr(0, std::min(Count.size(), Width));
+    // Pad on the left so the count ends at the column's right edge.
+    OS.indent(Width - Count.size());
+    const bool UseColor = Line.hasMultipleRegions() && Options.Colors;
+    colored_ostream(OS, raw_ostream::MAGENTA, UseColor) << Count;
+    OS << '|';
+  } else {
+    OS.indent(Width) << '|';
+  }
+}
+
+/// Write the line number column: \p LineNo cut to the column width,
+/// right-aligned, and terminated by '|'.
+void SourceCoverageView::renderLineNumberColumn(raw_ostream &OS,
+                                                unsigned LineNo) {
+  SmallString<32> Digits;
+  raw_svector_ostream DigitsOS(Digits);
+  DigitsOS << LineNo;
+  StringRef Text = DigitsOS.str();
+
+  const size_t Width = LineNumberColumnWidth;
+  // Keep at most Width leading characters of the number.
+  Text = Text.substr(0, std::min(Text.size(), Width));
+  // Pad on the left so the number ends at the column's right edge.
+  OS.indent(Width - Text.size());
+  OS << Text << '|';
+}
+
+/// Render the marker line shown below a source line: for every segment that
+/// is a region entry, a '^' followed by the segment's count (cut to seven
+/// characters), positioned relative to the segment's column. The line is
+/// always terminated with a newline. In debug mode every segment, region entry
+/// or not, is also reported on errs().
+void SourceCoverageView::renderRegionMarkers(
+    raw_ostream &OS, ArrayRef<const coverage::CoverageSegment *> Segments) {
+  SmallString<32> Buffer;
+  raw_svector_ostream BufferOS(Buffer);
+
+  // 1-based column just past what has been written so far.
+  unsigned PrevColumn = 1;
+  for (const auto *S : Segments) {
+    if (!S->IsRegionEntry)
+      continue;
+    // Skip to the new region
+    if (S->Col > PrevColumn)
+      OS.indent(S->Col - PrevColumn);
+    // Account for the '^' character written below.
+    PrevColumn = S->Col + 1;
+    BufferOS << S->Count;
+    StringRef Str = BufferOS.str();
+    // Trim the execution count
+    Str = Str.substr(0, std::min(Str.size(), (size_t)7));
+    PrevColumn += Str.size();
+    OS << '^' << Str;
+    // Empty the buffer so the next count starts from scratch.
+    Buffer.clear();
+  }
+  OS << "\n";
+
+  if (Options.Debug)
+    for (const auto *S : Segments)
+      errs() << "Marker at " << S->Line << ":" << S->Col << " = " << S->Count
+             << (S->IsRegionEntry ? "\n" : " (pop)\n");
+}
+
+/// Render this view to \p OS, line by line.
+///
+/// For every rendered source line the function prints the indentation, the
+/// optional count and line number columns, the (possibly highlighted) source
+/// text, the optional region markers, and finally any expansion and
+/// instantiation subviews attached to that line, each rendered recursively
+/// one indentation level deeper.
+///
+/// \param WholeFile when false, lines before the first coverage segment are
+///        skipped and rendering stops once all segments have been consumed.
+/// \param IndentLevel nesting depth of this view; 0 for a top-level view.
+void SourceCoverageView::render(raw_ostream &OS, bool WholeFile,
+                                unsigned IndentLevel) {
+  // The width of the leading columns
+  unsigned CombinedColumnWidth =
+      (Options.ShowLineStats ? LineCoverageColumnWidth + 1 : 0) +
+      (Options.ShowLineNumbers ? LineNumberColumnWidth + 1 : 0);
+  // The width of the line that is used to divide between the view and the
+  // subviews.
+  unsigned DividerWidth = CombinedColumnWidth + 4;
+
+  // We need the expansions and instantiations sorted so we can go through them
+  // while we iterate lines.
+  std::sort(ExpansionSubViews.begin(), ExpansionSubViews.end());
+  std::sort(InstantiationSubViews.begin(), InstantiationSubViews.end());
+  auto NextESV = ExpansionSubViews.begin();
+  auto EndESV = ExpansionSubViews.end();
+  auto NextISV = InstantiationSubViews.begin();
+  auto EndISV = InstantiationSubViews.end();
+
+  // Get the coverage information for the file.
+  auto NextSegment = CoverageInfo.begin();
+  auto EndSegment = CoverageInfo.end();
+
+  // Line of the first segment, or 0 when there is no coverage data at all.
+  unsigned FirstLine = NextSegment != EndSegment ? NextSegment->Line : 0;
+  const coverage::CoverageSegment *WrappedSegment = nullptr;
+  SmallVector<const coverage::CoverageSegment *, 8> LineSegments;
+  for (line_iterator LI(File, /*SkipBlanks=*/false); !LI.is_at_eof(); ++LI) {
+    // If we aren't rendering the whole file, we need to filter out the prologue
+    // and epilogue.
+    if (!WholeFile) {
+      if (NextSegment == EndSegment)
+        break;
+      else if (LI.line_number() < FirstLine)
+        continue;
+    }
+
+    // Collect the coverage information relevant to this line.
+    // The last segment of the most recent line that had segments is still in
+    // effect at the start of this line.
+    if (LineSegments.size())
+      WrappedSegment = LineSegments.back();
+    LineSegments.clear();
+    while (NextSegment != EndSegment && NextSegment->Line == LI.line_number())
+      LineSegments.push_back(&*NextSegment++);
+
+    // Calculate a count to use for the line as a whole.
+    LineCoverageInfo LineCount;
+    if (WrappedSegment && WrappedSegment->HasCount)
+      LineCount.addRegionCount(WrappedSegment->Count);
+    for (const auto *S : LineSegments)
+      if (S->HasCount && S->IsRegionEntry)
+          LineCount.addRegionStartCount(S->Count);
+
+    // Render the line prefix.
+    renderIndent(OS, IndentLevel);
+    if (Options.ShowLineStats)
+      renderLineCoverageColumn(OS, LineCount);
+    if (Options.ShowLineNumbers)
+      renderLineNumberColumn(OS, LI.line_number());
+
+    // If there are expansion subviews, we want to highlight the first one.
+    unsigned ExpansionColumn = 0;
+    if (NextESV != EndESV && NextESV->getLine() == LI.line_number() &&
+        Options.Colors)
+      ExpansionColumn = NextESV->getStartCol();
+
+    // Display the source code for the current line.
+    renderLine(OS, *LI, LI.line_number(), WrappedSegment, LineSegments,
+               ExpansionColumn);
+
+    // Show the region markers.
+    if (Options.ShowRegionMarkers && (!Options.ShowLineStatsOrRegionMarkers ||
+                                      LineCount.hasMultipleRegions()) &&
+        !LineSegments.empty()) {
+      renderIndent(OS, IndentLevel);
+      OS.indent(CombinedColumnWidth);
+      renderRegionMarkers(OS, LineSegments);
+    }
+
+    // Show the expansions and instantiations for this line.
+    unsigned NestedIndent = IndentLevel + 1;
+    bool RenderedSubView = false;
+    for (; NextESV != EndESV && NextESV->getLine() == LI.line_number();
+         ++NextESV) {
+      renderViewDivider(NestedIndent, DividerWidth, OS);
+      OS << "\n";
+      if (RenderedSubView) {
+        // Re-render the current line and highlight the expansion range for
+        // this subview.
+        ExpansionColumn = NextESV->getStartCol();
+        renderIndent(OS, IndentLevel);
+        OS.indent(CombinedColumnWidth + (IndentLevel == 0 ? 0 : 1));
+        renderLine(OS, *LI, LI.line_number(), WrappedSegment, LineSegments,
+                   ExpansionColumn);
+        renderViewDivider(NestedIndent, DividerWidth, OS);
+        OS << "\n";
+      }
+      // Render the child subview
+      if (Options.Debug)
+        errs() << "Expansion at line " << NextESV->getLine() << ", "
+               << NextESV->getStartCol() << " -> " << NextESV->getEndCol()
+               << "\n";
+      NextESV->View->render(OS, false, NestedIndent);
+      RenderedSubView = true;
+    }
+    // Instantiation subviews follow, each introduced by the function name.
+    for (; NextISV != EndISV && NextISV->Line == LI.line_number(); ++NextISV) {
+      renderViewDivider(NestedIndent, DividerWidth, OS);
+      OS << "\n";
+      renderIndent(OS, NestedIndent);
+      OS << ' ';
+      Options.colored_ostream(OS, raw_ostream::CYAN) << NextISV->FunctionName
+                                                     << ":";
+      OS << "\n";
+      NextISV->View->render(OS, false, NestedIndent);
+      RenderedSubView = true;
+    }
+    // Close the last subview of this line with a final divider.
+    if (RenderedSubView) {
+      renderViewDivider(NestedIndent, DividerWidth, OS);
+      OS << "\n";
+    }
+  }
+}

diff --git a/tools/llvm-cov/SourceCoverageView.h b/tools/llvm-cov/SourceCoverageView.h
new file mode 100644
index 0000000..d92a748
--- /dev/null
+++ b/tools/llvm-cov/SourceCoverageView.h

@@ -0,0 +1,162 @@
+//===- SourceCoverageView.h - Code coverage view for source code ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements rendering for code coverage of source code.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_COV_SOURCECOVERAGEVIEW_H
+#define LLVM_COV_SOURCECOVERAGEVIEW_H
+
+#include "CoverageViewOptions.h"
+#include "llvm/ProfileData/CoverageMapping.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <vector>
+
+namespace llvm {
+
+class SourceCoverageView;
+
+/// \brief A view that represents a macro or include expansion
+///
+/// Pairs the mapping region of the expansion with the subview that renders
+/// the expanded code. The struct owns the subview and is move-only because of
+/// the unique_ptr member.
+struct ExpansionView {
+  /// The region that locates the expansion in the parent view.
+  coverage::CounterMappingRegion Region;
+  /// The subview rendered for this expansion.
+  std::unique_ptr<SourceCoverageView> View;
+
+  /// \brief Copy \p Region and take ownership of \p View.
+  ExpansionView(const coverage::CounterMappingRegion &Region,
+                std::unique_ptr<SourceCoverageView> View)
+      : Region(Region), View(std::move(View)) {}
+  // Move operations are spelled out explicitly.
+  // NOTE(review): presumably because a supported compiler does not generate
+  // them implicitly -- confirm before replacing with `= default`.
+  ExpansionView(ExpansionView &&RHS)
+      : Region(std::move(RHS.Region)), View(std::move(RHS.View)) {}
+  ExpansionView &operator=(ExpansionView &&RHS) {
+    Region = std::move(RHS.Region);
+    View = std::move(RHS.View);
+    return *this;
+  }
+
+  /// Line on which the expansion region starts.
+  unsigned getLine() const { return Region.LineStart; }
+  /// Column at which the expansion region starts.
+  unsigned getStartCol() const { return Region.ColumnStart; }
+  /// Column at which the expansion region ends.
+  unsigned getEndCol() const { return Region.ColumnEnd; }
+
+  /// \brief Order expansions by the start location of their regions.
+  friend bool operator<(const ExpansionView &LHS, const ExpansionView &RHS) {
+    return LHS.Region.startLoc() < RHS.Region.startLoc();
+  }
+};
+
+/// \brief A view that represents a function instantiation
+///
+/// Pairs the name of the instantiated function and the line it is attached to
+/// with the subview that renders it. The struct owns the subview and is
+/// move-only because of the unique_ptr member. FunctionName is a StringRef,
+/// so the name itself is not owned.
+struct InstantiationView {
+  /// Name printed above the subview.
+  StringRef FunctionName;
+  /// Line of the parent view after which the subview is rendered.
+  unsigned Line;
+  /// The subview rendered for this instantiation.
+  std::unique_ptr<SourceCoverageView> View;
+
+  /// \brief Store the name and line and take ownership of \p View.
+  InstantiationView(StringRef FunctionName, unsigned Line,
+                    std::unique_ptr<SourceCoverageView> View)
+      : FunctionName(FunctionName), Line(Line), View(std::move(View)) {}
+  // Move operations are spelled out explicitly.
+  // NOTE(review): presumably because a supported compiler does not generate
+  // them implicitly -- confirm before replacing with `= default`.
+  InstantiationView(InstantiationView &&RHS)
+      : FunctionName(std::move(RHS.FunctionName)), Line(std::move(RHS.Line)),
+        View(std::move(RHS.View)) {}
+  InstantiationView &operator=(InstantiationView &&RHS) {
+    FunctionName = std::move(RHS.FunctionName);
+    Line = std::move(RHS.Line);
+    View = std::move(RHS.View);
+    return *this;
+  }
+
+  /// \brief Order instantiations by the line they are attached to.
+  friend bool operator<(const InstantiationView &LHS,
+                        const InstantiationView &RHS) {
+    return LHS.Line < RHS.Line;
+  }
+};
+
+/// \brief A code coverage view of a specific source file.
+/// It can have embedded coverage views.
+///
+/// The view references the file buffer and the options without owning them,
+/// so both must outlive it; the coverage data and the subviews are owned.
+class SourceCoverageView {
+private:
+  /// \brief Coverage information for a single line.
+  struct LineCoverageInfo {
+    /// Count displayed for the line.
+    uint64_t ExecutionCount;
+    /// Number of regions that start on the line.
+    unsigned RegionCount;
+    /// Whether any count has been recorded for the line.
+    bool Mapped;
+
+    LineCoverageInfo() : ExecutionCount(0), RegionCount(0), Mapped(false) {}
+
+    bool isMapped() const { return Mapped; }
+
+    bool hasMultipleRegions() const { return RegionCount > 1; }
+
+    /// \brief Record a region that starts on this line; its count replaces
+    /// the line's count.
+    void addRegionStartCount(uint64_t Count) {
+      Mapped = true;
+      ExecutionCount = Count;
+      ++RegionCount;
+    }
+
+    /// \brief Record a region that covers this line without starting on it;
+    /// its count is used only while no region start has been recorded.
+    void addRegionCount(uint64_t Count) {
+      Mapped = true;
+      if (!RegionCount)
+        ExecutionCount = Count;
+    }
+  };
+
+  /// The source text that is rendered.
+  const MemoryBuffer &File;
+  /// Display options shared with the creator of the view.
+  const CoverageViewOptions &Options;
+  /// The coverage segments for the rendered source.
+  coverage::CoverageData CoverageInfo;
+  /// Subviews for expansions; sorted by render() before use.
+  std::vector<ExpansionView> ExpansionSubViews;
+  /// Subviews for function instantiations; sorted by render() before use.
+  std::vector<InstantiationView> InstantiationSubViews;
+
+  /// \brief Render a source line with highlighting.
+  void renderLine(raw_ostream &OS, StringRef Line, int64_t LineNumber,
+                  const coverage::CoverageSegment *WrappedSegment,
+                  ArrayRef<const coverage::CoverageSegment *> Segments,
+                  unsigned ExpansionCol);
+
+  /// \brief Render the nesting prefix for the given indentation level.
+  void renderIndent(raw_ostream &OS, unsigned Level);
+
+  /// \brief Render a divider of \p Length dashes for a subview at nesting
+  /// level \p Level, which must not be zero.
+  void renderViewDivider(unsigned Level, unsigned Length, raw_ostream &OS);
+
+  /// \brief Render the line's execution count column.
+  void renderLineCoverageColumn(raw_ostream &OS, const LineCoverageInfo &Line);
+
+  /// \brief Render the line number column.
+  void renderLineNumberColumn(raw_ostream &OS, unsigned LineNo);
+
+  /// \brief Render all the region's execution counts on a line.
+  void
+  renderRegionMarkers(raw_ostream &OS,
+                      ArrayRef<const coverage::CoverageSegment *> Segments);
+
+  /// Widths, in characters, of the count and line number columns (each is
+  /// followed by a '|' separator when rendered).
+  static const unsigned LineCoverageColumnWidth = 7;
+  static const unsigned LineNumberColumnWidth = 5;
+
+public:
+  /// \brief Create a view of \p File; \p CoverageInfo is moved into the view.
+  SourceCoverageView(const MemoryBuffer &File,
+                     const CoverageViewOptions &Options,
+                     coverage::CoverageData &&CoverageInfo)
+      : File(File), Options(Options), CoverageInfo(std::move(CoverageInfo)) {}
+
+  const CoverageViewOptions &getOptions() const { return Options; }
+
+  /// \brief Add an expansion subview to this view.
+  void addExpansion(const coverage::CounterMappingRegion &Region,
+                    std::unique_ptr<SourceCoverageView> View) {
+    ExpansionSubViews.emplace_back(Region, std::move(View));
+  }
+
+  /// \brief Add a function instantiation subview to this view.
+  void addInstantiation(StringRef FunctionName, unsigned Line,
+                        std::unique_ptr<SourceCoverageView> View) {
+    InstantiationSubViews.emplace_back(FunctionName, Line, std::move(View));
+  }
+
+  /// \brief Print the code coverage information for a specific
+  /// portion of a source file to the output stream.
+  void render(raw_ostream &OS, bool WholeFile, unsigned IndentLevel = 0);
+};
+
+} // namespace llvm
+
+#endif // LLVM_COV_SOURCECOVERAGEVIEW_H

diff --git a/tools/llvm-cov/TestingSupport.cpp b/tools/llvm-cov/TestingSupport.cpp
new file mode 100644
index 0000000..537f133
--- /dev/null
+++ b/tools/llvm-cov/TestingSupport.cpp

@@ -0,0 +1,91 @@
+//===- TestingSupport.cpp - Convert object files into test files ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include <system_error>
+#include <functional>
+
+using namespace llvm;
+using namespace object;
+
+/// \brief Entry point for the 'convert-for-testing' subcommand.
+///
+/// Opens the object file named by the positional argument, locates its
+/// "__llvm_prf_names" and "__llvm_covmap" sections and writes them to the
+/// file given with -o in this layout:
+///   - the magic string "llvmcovmtestdata",
+///   - the size of the profile names data (ULEB128),
+///   - the address of the profile names section (ULEB128),
+///   - the profile names data followed by the coverage mapping data.
+///
+/// \returns 0 on success, or 1 after printing a diagnostic to stderr.
+int convertForTestingMain(int argc, const char *argv[]) {
+  sys::PrintStackTraceOnErrorSignal();
+  PrettyStackTraceProgram X(argc, argv);
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+  cl::opt<std::string> InputSourceFile(cl::Positional, cl::Required,
+                                       cl::desc("<Source file>"));
+
+  // The -o file is what this command produces, not a profile it consumes.
+  cl::opt<std::string> OutputFilename(
+      "o", cl::Required,
+      cl::desc("File to write the converted test data to"));
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM code coverage tool\n");
+
+  auto ObjErr = llvm::object::ObjectFile::createObjectFile(InputSourceFile);
+  if (auto Err = ObjErr.getError()) {
+    errs() << "error: " << Err.message() << "\n";
+    return 1;
+  }
+  ObjectFile *OF = ObjErr.get().getBinary();
+  auto BytesInAddress = OF->getBytesInAddress();
+  if (BytesInAddress != 8) {
+    errs() << "error: 64 bit binary expected\n";
+    return 1;
+  }
+
+  // Look for the sections that we are interested in. Each one is counted
+  // separately so that a duplicate of one section cannot stand in for a
+  // missing other one and leave a default-constructed SectionRef in use.
+  unsigned ProfileNamesCount = 0, CoverageMappingCount = 0;
+  SectionRef ProfileNames, CoverageMapping;
+  for (const auto &Section : OF->sections()) {
+    StringRef Name;
+    if (auto Err = Section.getName(Name)) {
+      errs() << "error: " << Err.message() << "\n";
+      return 1;
+    }
+    if (Name == "__llvm_prf_names") {
+      ProfileNames = Section;
+      ++ProfileNamesCount;
+    } else if (Name == "__llvm_covmap") {
+      CoverageMapping = Section;
+      ++CoverageMappingCount;
+    }
+  }
+  if (ProfileNamesCount != 1 || CoverageMappingCount != 1) {
+    errs() << "error: expected exactly one __llvm_prf_names and one "
+              "__llvm_covmap section\n";
+    return 1;
+  }
+
+  // Get the contents of the given sections.
+  uint64_t ProfileNamesAddress = ProfileNames.getAddress();
+  StringRef CoverageMappingData;
+  StringRef ProfileNamesData;
+  if (auto Err = CoverageMapping.getContents(CoverageMappingData)) {
+    errs() << "error: " << Err.message() << "\n";
+    return 1;
+  }
+  if (auto Err = ProfileNames.getContents(ProfileNamesData)) {
+    errs() << "error: " << Err.message() << "\n";
+    return 1;
+  }
+
+  int FD;
+  if (auto Err =
+          sys::fs::openFileForWrite(OutputFilename, FD, sys::fs::F_None)) {
+    errs() << "error: " << Err.message() << "\n";
+    return 1;
+  }
+
+  // The stream owns FD (second argument) and closes it on destruction.
+  raw_fd_ostream OS(FD, true);
+  OS << "llvmcovmtestdata";
+  encodeULEB128(ProfileNamesData.size(), OS);
+  encodeULEB128(ProfileNamesAddress, OS);
+  OS << ProfileNamesData << CoverageMappingData;
+
+  return 0;
+}

diff --git a/tools/llvm-cov/gcov.cpp b/tools/llvm-cov/gcov.cpp
new file mode 100644
index 0000000..4c9195a
--- /dev/null
+++ b/tools/llvm-cov/gcov.cpp

@@ -0,0 +1,153 @@
+//===- gcov.cpp - GCOV compatible LLVM coverage tool ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-cov is a command line tool to analyze and report coverage information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/GCOV.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include <system_error>
+using namespace llvm;
+
+/// \brief Load the .gcno/.gcda pair for \p SourceFile and print its coverage.
+///
+/// The coverage file names are \p InputGCNO / \p InputGCDA when those are
+/// non-empty; otherwise they are inferred from \p ObjectDir and
+/// \p SourceFile. A missing .gcda file is tolerated (its name is reported as
+/// "-"); any other read or parse failure prints a message to stderr and
+/// returns without printing coverage. When \p DumpGCOV is set the parsed data
+/// is dumped before the report is printed.
+void reportCoverage(StringRef SourceFile, StringRef ObjectDir,
+                    const std::string &InputGCNO, const std::string &InputGCDA,
+                    bool DumpGCOV, const GCOVOptions &Options) {
+  // Work out the path stem shared by the inferred .gcno and .gcda names.
+  SmallString<128> Stem(ObjectDir);
+  if (Stem.empty()) {
+    // Nothing was given with -o: the files sit next to the source file.
+    Stem = sys::path::parent_path(SourceFile);
+    sys::path::append(Stem, sys::path::stem(SourceFile));
+  } else if (sys::fs::is_directory(ObjectDir)) {
+    // A directory was given: combine it with the source file's stem.
+    sys::path::append(Stem, sys::path::stem(SourceFile));
+  } else {
+    // A file was given: strip its extension and ignore the source file name.
+    sys::path::replace_extension(Stem, "");
+  }
+
+  // Explicit overrides win over the inferred names.
+  std::string GCNO = InputGCNO;
+  if (GCNO.empty())
+    GCNO = std::string(Stem.str()) + ".gcno";
+  std::string GCDA = InputGCDA;
+  if (GCDA.empty())
+    GCDA = std::string(Stem.str()) + ".gcda";
+
+  GCOVFile GF;
+
+  // The notes file is mandatory.
+  ErrorOr<std::unique_ptr<MemoryBuffer>> NotesOrErr =
+      MemoryBuffer::getFileOrSTDIN(GCNO);
+  std::error_code NotesEC = NotesOrErr.getError();
+  if (NotesEC) {
+    errs() << GCNO << ": " << NotesEC.message() << "\n";
+    return;
+  }
+  GCOVBuffer NotesGB(NotesOrErr.get().get());
+  if (!GF.readGCNO(NotesGB)) {
+    errs() << "Invalid .gcno File!\n";
+    return;
+  }
+
+  // The data file is optional, but only "file not found" is forgiven.
+  ErrorOr<std::unique_ptr<MemoryBuffer>> DataOrErr =
+      MemoryBuffer::getFileOrSTDIN(GCDA);
+  std::error_code DataEC = DataOrErr.getError();
+  if (!DataEC) {
+    GCOVBuffer DataGB(DataOrErr.get().get());
+    if (!GF.readGCDA(DataGB)) {
+      errs() << "Invalid .gcda File!\n";
+      return;
+    }
+  } else if (DataEC != errc::no_such_file_or_directory) {
+    errs() << GCDA << ": " << DataEC.message() << "\n";
+    return;
+  } else {
+    // Replace the filename to make it clear we didn't read anything.
+    GCDA = "-";
+  }
+
+  if (DumpGCOV)
+    GF.dump();
+
+  FileInfo FI(Options);
+  GF.collectLineCounts(FI);
+  FI.print(SourceFile, GCNO, GCDA);
+}
+
+/// \brief Entry point for the gcov compatible mode of llvm-cov.
+///
+/// Declares the gcov-style command line options, parses \p argv and calls
+/// reportCoverage() once for every positional source file.
+///
+/// \returns 0 unconditionally: reportCoverage() prints its own diagnostics
+/// and returns nothing, so failures there do not change the exit status.
+int gcovMain(int argc, const char *argv[]) {
+  // Print a stack trace if we signal out.
+  sys::PrintStackTraceOnErrorSignal();
+  PrettyStackTraceProgram X(argc, argv);
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
+  // All options below are function-local objects.
+  // NOTE(review): presumably so they are only registered when this mode is
+  // the one being run -- confirm against the other subcommands.
+  cl::list<std::string> SourceFiles(cl::Positional, cl::OneOrMore,
+                                    cl::desc("SOURCEFILE"));
+
+  // Each single-letter flag is paired with a long-name alias.
+  cl::opt<bool> AllBlocks("a", cl::Grouping, cl::init(false),
+                          cl::desc("Display all basic blocks"));
+  cl::alias AllBlocksA("all-blocks", cl::aliasopt(AllBlocks));
+
+  cl::opt<bool> BranchProb("b", cl::Grouping, cl::init(false),
+                           cl::desc("Display branch probabilities"));
+  cl::alias BranchProbA("branch-probabilities", cl::aliasopt(BranchProb));
+
+  cl::opt<bool> BranchCount("c", cl::Grouping, cl::init(false),
+                            cl::desc("Display branch counts instead "
+                                     "of percentages (requires -b)"));
+  cl::alias BranchCountA("branch-counts", cl::aliasopt(BranchCount));
+
+  cl::opt<bool> LongNames("l", cl::Grouping, cl::init(false),
+                          cl::desc("Prefix filenames with the main file"));
+  cl::alias LongNamesA("long-file-names", cl::aliasopt(LongNames));
+
+  cl::opt<bool> FuncSummary("f", cl::Grouping, cl::init(false),
+                            cl::desc("Show coverage for each function"));
+  cl::alias FuncSummaryA("function-summaries", cl::aliasopt(FuncSummary));
+
+  cl::opt<bool> NoOutput("n", cl::Grouping, cl::init(false),
+                         cl::desc("Do not output any .gcov files"));
+  cl::alias NoOutputA("no-output", cl::aliasopt(NoOutput));
+
+  // -o has two long aliases because it accepts either a directory or a file.
+  cl::opt<std::string> ObjectDir(
+      "o", cl::value_desc("DIR|FILE"), cl::init(""),
+      cl::desc("Find objects in DIR or based on FILE's path"));
+  cl::alias ObjectDirA("object-directory", cl::aliasopt(ObjectDir));
+  cl::alias ObjectDirB("object-file", cl::aliasopt(ObjectDir));
+
+  cl::opt<bool> PreservePaths("p", cl::Grouping, cl::init(false),
+                              cl::desc("Preserve path components"));
+  cl::alias PreservePathsA("preserve-paths", cl::aliasopt(PreservePaths));
+
+  cl::opt<bool> UncondBranch("u", cl::Grouping, cl::init(false),
+                             cl::desc("Display unconditional branch info "
+                                      "(requires -b)"));
+  cl::alias UncondBranchA("unconditional-branches", cl::aliasopt(UncondBranch));
+
+  // Options grouped under their own category in the help output.
+  cl::OptionCategory DebugCat("Internal and debugging options");
+  cl::opt<bool> DumpGCOV("dump", cl::init(false), cl::cat(DebugCat),
+                         cl::desc("Dump the gcov file to stderr"));
+  cl::opt<std::string> InputGCNO("gcno", cl::cat(DebugCat), cl::init(""),
+                                 cl::desc("Override inferred gcno file"));
+  cl::opt<std::string> InputGCDA("gcda", cl::cat(DebugCat), cl::init(""),
+                                 cl::desc("Override inferred gcda file"));
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM code coverage tool\n");
+
+  // Note that the argument order here differs from the declaration order
+  // above: LongNames and NoOutput are passed last.
+  GCOVOptions Options(AllBlocks, BranchProb, BranchCount, FuncSummary,
+                      PreservePaths, UncondBranch, LongNames, NoOutput);
+
+  // The same -o/-gcno/-gcda values are applied to every source file.
+  for (const auto &SourceFile : SourceFiles)
+    reportCoverage(SourceFile, ObjectDir, InputGCNO, InputGCDA, DumpGCOV,
+                   Options);
+  return 0;
+}

diff --git a/tools/llvm-cov/llvm-cov.cpp b/tools/llvm-cov/llvm-cov.cpp
index 18cc1b1..a67859e 100644
--- a/tools/llvm-cov/llvm-cov.cpp
+++ b/tools/llvm-cov/llvm-cov.cpp

@@ -11,140 +11,68 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/SmallString.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/GCOV.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MemoryObject.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
-#include <system_error>
+#include <string>
+
 using namespace llvm;
 
-static cl::list<std::string> SourceFiles(cl::Positional, cl::OneOrMore,
-                                         cl::desc("SOURCEFILE"));
+/// \brief The main entry point for the 'show' subcommand.
+int showMain(int argc, const char *argv[]);
 
-static cl::opt<bool> AllBlocks("a", cl::Grouping, cl::init(false),
-                               cl::desc("Display all basic blocks"));
-static cl::alias AllBlocksA("all-blocks", cl::aliasopt(AllBlocks));
+/// \brief The main entry point for the 'report' subcommand.
+int reportMain(int argc, const char *argv[]);
 
-static cl::opt<bool> BranchProb("b", cl::Grouping, cl::init(false),
-                                cl::desc("Display branch probabilities"));
-static cl::alias BranchProbA("branch-probabilities", cl::aliasopt(BranchProb));
+/// \brief The main entry point for the 'convert-for-testing' subcommand.
+int convertForTestingMain(int argc, const char *argv[]);
 
-static cl::opt<bool> BranchCount("c", cl::Grouping, cl::init(false),
-                                 cl::desc("Display branch counts instead "
-                                           "of percentages (requires -b)"));
-static cl::alias BranchCountA("branch-counts", cl::aliasopt(BranchCount));
+/// \brief The main entry point for the gcov compatible coverage tool.
+int gcovMain(int argc, const char *argv[]);
 
-static cl::opt<bool> LongNames("l", cl::Grouping, cl::init(false),
-                               cl::desc("Prefix filenames with the main file"));
-static cl::alias LongNamesA("long-file-names", cl::aliasopt(LongNames));
-
-static cl::opt<bool> FuncSummary("f", cl::Grouping, cl::init(false),
-                                 cl::desc("Show coverage for each function"));
-static cl::alias FuncSummaryA("function-summaries", cl::aliasopt(FuncSummary));
-
-static cl::opt<bool> NoOutput("n", cl::Grouping, cl::init(false),
-                              cl::desc("Do not output any .gcov files"));
-static cl::alias NoOutputA("no-output", cl::aliasopt(NoOutput));
-
-static cl::opt<std::string>
-ObjectDir("o", cl::value_desc("DIR|FILE"), cl::init(""),
-          cl::desc("Find objects in DIR or based on FILE's path"));
-static cl::alias ObjectDirA("object-directory", cl::aliasopt(ObjectDir));
-static cl::alias ObjectDirB("object-file", cl::aliasopt(ObjectDir));
-
-static cl::opt<bool> PreservePaths("p", cl::Grouping, cl::init(false),
-                                   cl::desc("Preserve path components"));
-static cl::alias PreservePathsA("preserve-paths", cl::aliasopt(PreservePaths));
-
-static cl::opt<bool> UncondBranch("u", cl::Grouping, cl::init(false),
-                                  cl::desc("Display unconditional branch info "
-                                           "(requires -b)"));
-static cl::alias UncondBranchA("unconditional-branches",
-                               cl::aliasopt(UncondBranch));
-
-static cl::OptionCategory DebugCat("Internal and debugging options");
-static cl::opt<bool> DumpGCOV("dump", cl::init(false), cl::cat(DebugCat),
-                              cl::desc("Dump the gcov file to stderr"));
-static cl::opt<std::string> InputGCNO("gcno", cl::cat(DebugCat), cl::init(""),
-                                      cl::desc("Override inferred gcno file"));
-static cl::opt<std::string> InputGCDA("gcda", cl::cat(DebugCat), cl::init(""),
-                                      cl::desc("Override inferred gcda file"));
-
-void reportCoverage(StringRef SourceFile) {
-  SmallString<128> CoverageFileStem(ObjectDir);
-  if (CoverageFileStem.empty()) {
-    // If no directory was specified with -o, look next to the source file.
-    CoverageFileStem = sys::path::parent_path(SourceFile);
-    sys::path::append(CoverageFileStem, sys::path::stem(SourceFile));
-  } else if (sys::fs::is_directory(ObjectDir))
-    // A directory name was given. Use it and the source file name.
-    sys::path::append(CoverageFileStem, sys::path::stem(SourceFile));
-  else
-    // A file was given. Ignore the source file and look next to this file.
-    sys::path::replace_extension(CoverageFileStem, "");
-
-  std::string GCNO = InputGCNO.empty()
-                         ? std::string(CoverageFileStem.str()) + ".gcno"
-                         : InputGCNO;
-  std::string GCDA = InputGCDA.empty()
-                         ? std::string(CoverageFileStem.str()) + ".gcda"
-                         : InputGCDA;
-  GCOVFile GF;
-
-  ErrorOr<std::unique_ptr<MemoryBuffer>> GCNO_Buff =
-      MemoryBuffer::getFileOrSTDIN(GCNO);
-  if (std::error_code EC = GCNO_Buff.getError()) {
-    errs() << GCNO << ": " << EC.message() << "\n";
-    return;
-  }
-  GCOVBuffer GCNO_GB(GCNO_Buff.get().get());
-  if (!GF.readGCNO(GCNO_GB)) {
-    errs() << "Invalid .gcno File!\n";
-    return;
-  }
-
-  ErrorOr<std::unique_ptr<MemoryBuffer>> GCDA_Buff =
-      MemoryBuffer::getFileOrSTDIN(GCDA);
-  if (std::error_code EC = GCDA_Buff.getError()) {
-    if (EC != errc::no_such_file_or_directory) {
-      errs() << GCDA << ": " << EC.message() << "\n";
-      return;
-    }
-    // Clear the filename to make it clear we didn't read anything.
-    GCDA = "-";
-  } else {
-    GCOVBuffer GCDA_GB(GCDA_Buff.get().get());
-    if (!GF.readGCDA(GCDA_GB)) {
-      errs() << "Invalid .gcda File!\n";
-      return;
-    }
-  }
-
-  if (DumpGCOV)
-    GF.dump();
-
-  GCOVOptions Options(AllBlocks, BranchProb, BranchCount, FuncSummary,
-                      PreservePaths, UncondBranch, LongNames, NoOutput);
-  FileInfo FI(Options);
-  GF.collectLineCounts(FI);
-  FI.print(SourceFile, GCNO, GCDA);
+/// \brief Top level help.
+int helpMain(int argc, const char *argv[]) {
+  errs() << "OVERVIEW: LLVM code coverage tool\n\n"
+         << "USAGE: llvm-cov {gcov|report|show}\n";
+  return 0;
 }
 
-int main(int argc, char **argv) {
-  // Print a stack trace if we signal out.
-  sys::PrintStackTraceOnErrorSignal();
-  PrettyStackTraceProgram X(argc, argv);
-  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+int main(int argc, const char **argv) {
+  // If argv[0] is or ends with 'gcov', always be gcov compatible
+  if (sys::path::stem(argv[0]).endswith_lower("gcov"))
+    return gcovMain(argc, argv);
 
-  cl::ParseCommandLineOptions(argc, argv, "LLVM code coverage tool\n");
+  // Check if we are invoking a specific tool command.
+  if (argc > 1) {
+    typedef int (*MainFunction)(int, const char *[]);
+    MainFunction Func = StringSwitch<MainFunction>(argv[1])
+                            .Case("convert-for-testing", convertForTestingMain)
+                            .Case("gcov", gcovMain)
+                            .Case("report", reportMain)
+                            .Case("show", showMain)
+                            .Cases("-h", "-help", "--help", helpMain)
+                            .Default(nullptr);
 
-  for (const auto &SourceFile : SourceFiles)
-    reportCoverage(SourceFile);
-  return 0;
+    if (Func) {
+      std::string Invocation = std::string(argv[0]) + " " + argv[1];
+      argv[1] = Invocation.c_str();
+      return Func(argc - 1, argv + 1);
+    }
+  }
+
+  // Give a warning and fall back to gcov
+  errs().changeColor(raw_ostream::RED);
+  errs() << "warning:";
+  // Assume that argv[1] wasn't a command when it starts with a '-' or is a
+  // filename (i.e. contains a '.')
+  if (argc > 1 && !StringRef(argv[1]).startswith("-") &&
+      StringRef(argv[1]).find(".") == StringRef::npos)
+    errs() << " Unrecognized command '" << argv[1] << "'.";
+  errs() << " Using the gcov compatible mode "
+            "(this behaviour may be dropped in the future).";
+  errs().resetColor();
+  errs() << "\n";
+
+  return gcovMain(argc, argv);
 }

diff --git a/tools/llvm-diff/DiffConsumer.h b/tools/llvm-diff/DiffConsumer.h
index ac13a5e..855f688 100644
--- a/tools/llvm-diff/DiffConsumer.h
+++ b/tools/llvm-diff/DiffConsumer.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LLVM_DIFFCONSUMER_H_
-#define _LLVM_DIFFCONSUMER_H_
+#ifndef LLVM_TOOLS_LLVM_DIFF_DIFFCONSUMER_H
+#define LLVM_TOOLS_LLVM_DIFF_DIFFCONSUMER_H
 
 #include "DiffLog.h"
 #include "llvm/ADT/DenseMap.h"

diff --git a/tools/llvm-diff/DiffLog.h b/tools/llvm-diff/DiffLog.h
index 43e318a..8eb53ff 100644
--- a/tools/llvm-diff/DiffLog.h
+++ b/tools/llvm-diff/DiffLog.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LLVM_DIFFLOG_H_
-#define _LLVM_DIFFLOG_H_
+#ifndef LLVM_TOOLS_LLVM_DIFF_DIFFLOG_H
+#define LLVM_TOOLS_LLVM_DIFF_DIFFLOG_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"

diff --git a/tools/llvm-diff/DifferenceEngine.h b/tools/llvm-diff/DifferenceEngine.h
index 4470968..f0d8311 100644
--- a/tools/llvm-diff/DifferenceEngine.h
+++ b/tools/llvm-diff/DifferenceEngine.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LLVM_DIFFERENCE_ENGINE_H_
-#define _LLVM_DIFFERENCE_ENGINE_H_
+#ifndef LLVM_TOOLS_LLVM_DIFF_DIFFERENCEENGINE_H
+#define LLVM_TOOLS_LLVM_DIFF_DIFFERENCEENGINE_H
 
 #include "DiffConsumer.h"
 #include "DiffLog.h"

diff --git a/tools/llvm-diff/llvm-diff.cpp b/tools/llvm-diff/llvm-diff.cpp
index f70219e..ae58f5c 100644
--- a/tools/llvm-diff/llvm-diff.cpp
+++ b/tools/llvm-diff/llvm-diff.cpp

@@ -32,21 +32,22 @@
 
 /// Reads a module from a file.  On error, messages are written to stderr
 /// and null is returned.
-static Module *ReadModule(LLVMContext &Context, StringRef Name) {
+static std::unique_ptr<Module> readModule(LLVMContext &Context,
+                                          StringRef Name) {
   SMDiagnostic Diag;
-  Module *M = ParseIRFile(Name, Diag, Context);
+  std::unique_ptr<Module> M = parseIRFile(Name, Diag, Context);
   if (!M)
     Diag.print("llvm-diff", errs());
   return M;
 }
 
-static void diffGlobal(DifferenceEngine &Engine, Module *L, Module *R,
+static void diffGlobal(DifferenceEngine &Engine, Module &L, Module &R,
                        StringRef Name) {
   // Drop leading sigils from the global name.
   if (Name.startswith("@")) Name = Name.substr(1);
 
-  Function *LFn = L->getFunction(Name);
-  Function *RFn = R->getFunction(Name);
+  Function *LFn = L.getFunction(Name);
+  Function *RFn = R.getFunction(Name);
   if (LFn && RFn)
     Engine.diff(LFn, RFn);
   else if (!LFn && !RFn)
@@ -72,8 +73,8 @@
   LLVMContext Context;
 
   // Load both modules.  Die if that fails.
-  Module *LModule = ReadModule(Context, LeftFilename);
-  Module *RModule = ReadModule(Context, RightFilename);
+  std::unique_ptr<Module> LModule = readModule(Context, LeftFilename);
+  std::unique_ptr<Module> RModule = readModule(Context, RightFilename);
   if (!LModule || !RModule) return 1;
 
   DiffConsumer Consumer;
@@ -82,15 +83,12 @@
   // If any global names were given, just diff those.
   if (!GlobalsToCompare.empty()) {
     for (unsigned I = 0, E = GlobalsToCompare.size(); I != E; ++I)
-      diffGlobal(Engine, LModule, RModule, GlobalsToCompare[I]);
+      diffGlobal(Engine, *LModule, *RModule, GlobalsToCompare[I]);
 
   // Otherwise, diff everything in the module.
   } else {
-    Engine.diff(LModule, RModule);
+    Engine.diff(LModule.get(), RModule.get());
   }
 
-  delete LModule;
-  delete RModule;
-
   return Consumer.hadDifferences();
 }

diff --git a/tools/llvm-dis/llvm-dis.cpp b/tools/llvm-dis/llvm-dis.cpp
index 3b0f838..fb73717 100644
--- a/tools/llvm-dis/llvm-dis.cpp
+++ b/tools/llvm-dis/llvm-dis.cpp

@@ -171,11 +171,11 @@
     }
   }
 
-  std::string ErrorInfo;
+  std::error_code EC;
   std::unique_ptr<tool_output_file> Out(
-      new tool_output_file(OutputFilename.c_str(), ErrorInfo, sys::fs::F_None));
-  if (!ErrorInfo.empty()) {
-    errs() << ErrorInfo << '\n';
+      new tool_output_file(OutputFilename, EC, sys::fs::F_None));
+  if (EC) {
+    errs() << EC.message() << '\n';
     return 1;
   }
 

diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index f44b0e3..1c540c9 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp

@@ -45,6 +45,10 @@
         clEnumValN(DIDT_All, "all", "Dump all debug sections"),
         clEnumValN(DIDT_Abbrev, "abbrev", ".debug_abbrev"),
         clEnumValN(DIDT_AbbrevDwo, "abbrev.dwo", ".debug_abbrev.dwo"),
+        clEnumValN(DIDT_AppleNames, "apple_names", ".apple_names"),
+        clEnumValN(DIDT_AppleTypes, "apple_types", ".apple_types"),
+        clEnumValN(DIDT_AppleNamespaces, "apple_namespaces", ".apple_namespaces"),
+        clEnumValN(DIDT_AppleObjC, "apple_objc", ".apple_objc"),
         clEnumValN(DIDT_Aranges, "aranges", ".debug_aranges"),
         clEnumValN(DIDT_Info, "info", ".debug_info"),
         clEnumValN(DIDT_InfoDwo, "info.dwo", ".debug_info.dwo"),
@@ -65,26 +69,28 @@
         clEnumValN(DIDT_StrOffsetsDwo, "str_offsets.dwo", ".debug_str_offsets.dwo"),
         clEnumValEnd));
 
-static void DumpInput(const StringRef &Filename) {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buff =
+static void DumpInput(StringRef Filename) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr =
       MemoryBuffer::getFileOrSTDIN(Filename);
 
-  if (std::error_code EC = Buff.getError()) {
+  if (std::error_code EC = BuffOrErr.getError()) {
     errs() << Filename << ": " << EC.message() << "\n";
     return;
   }
+  std::unique_ptr<MemoryBuffer> Buff = std::move(BuffOrErr.get());
 
-  ErrorOr<ObjectFile *> ObjOrErr(ObjectFile::createObjectFile(Buff.get()));
+  ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr =
+      ObjectFile::createObjectFile(Buff->getMemBufferRef());
   if (std::error_code EC = ObjOrErr.getError()) {
     errs() << Filename << ": " << EC.message() << '\n';
     return;
   }
-  std::unique_ptr<ObjectFile> Obj(ObjOrErr.get());
+  ObjectFile &Obj = *ObjOrErr.get();
 
-  std::unique_ptr<DIContext> DICtx(DIContext::getDWARFContext(Obj.get()));
+  std::unique_ptr<DIContext> DICtx(DIContext::getDWARFContext(Obj));
 
   outs() << Filename
-         << ":\tfile format " << Obj->getFileFormatName() << "\n\n";
+         << ":\tfile format " << Obj.getFileFormatName() << "\n\n";
   // Dump the complete DWARF structure.
   DICtx->dump(outs(), DumpType);
 }

diff --git a/tools/llvm-extract/llvm-extract.cpp b/tools/llvm-extract/llvm-extract.cpp
index 0f70868..53b2f0d 100644
--- a/tools/llvm-extract/llvm-extract.cpp
+++ b/tools/llvm-extract/llvm-extract.cpp

@@ -101,8 +101,7 @@
 
   // Use lazy loading, since we only care about selected global values.
   SMDiagnostic Err;
-  std::unique_ptr<Module> M;
-  M.reset(getLazyIRFileModule(InputFilename, Err, Context));
+  std::unique_ptr<Module> M = getLazyIRFileModule(InputFilename, Err, Context);
 
   if (!M.get()) {
     Err.print(argv[0], errs());
@@ -217,31 +216,28 @@
   if (!DeleteFn)
     for (size_t i = 0, e = GVs.size(); i != e; ++i) {
       GlobalValue *GV = GVs[i];
-      if (GV->isMaterializable()) {
-        std::string ErrInfo;
-        if (GV->Materialize(&ErrInfo)) {
-          errs() << argv[0] << ": error reading input: " << ErrInfo << "\n";
-          return 1;
-        }
+      if (std::error_code EC = GV->materialize()) {
+        errs() << argv[0] << ": error reading input: " << EC.message() << "\n";
+        return 1;
       }
     }
   else {
     // Deleting. Materialize every GV that's *not* in GVs.
     SmallPtrSet<GlobalValue *, 8> GVSet(GVs.begin(), GVs.end());
     for (auto &G : M->globals()) {
-      if (!GVSet.count(&G) && G.isMaterializable()) {
-        std::string ErrInfo;
-        if (G.Materialize(&ErrInfo)) {
-          errs() << argv[0] << ": error reading input: " << ErrInfo << "\n";
+      if (!GVSet.count(&G)) {
+        if (std::error_code EC = G.materialize()) {
+          errs() << argv[0] << ": error reading input: " << EC.message()
+                 << "\n";
           return 1;
         }
       }
     }
     for (auto &F : *M) {
-      if (!GVSet.count(&F) && F.isMaterializable()) {
-        std::string ErrInfo;
-        if (F.Materialize(&ErrInfo)) {
-          errs() << argv[0] << ": error reading input: " << ErrInfo << "\n";
+      if (!GVSet.count(&F)) {
+        if (std::error_code EC = F.materialize()) {
+          errs() << argv[0] << ": error reading input: " << EC.message()
+                 << "\n";
           return 1;
         }
       }
@@ -251,7 +247,7 @@
   // In addition to deleting all other functions, we also want to spiff it
   // up a little bit.  Do this now.
   PassManager Passes;
-  Passes.add(new DataLayoutPass(M.get())); // Use correct DataLayout
+  Passes.add(new DataLayoutPass()); // Use correct DataLayout
 
   std::vector<GlobalValue*> Gvs(GVs.begin(), GVs.end());
 
@@ -261,10 +257,10 @@
   Passes.add(createStripDeadDebugInfoPass());    // Remove dead debug info
   Passes.add(createStripDeadPrototypesPass());   // Remove dead func decls
 
-  std::string ErrorInfo;
-  tool_output_file Out(OutputFilename.c_str(), ErrorInfo, sys::fs::F_None);
-  if (!ErrorInfo.empty()) {
-    errs() << ErrorInfo << '\n';
+  std::error_code EC;
+  tool_output_file Out(OutputFilename, EC, sys::fs::F_None);
+  if (EC) {
+    errs() << EC.message() << '\n';
     return 1;
   }
 

diff --git a/tools/llvm-go/CMakeLists.txt b/tools/llvm-go/CMakeLists.txt
new file mode 100644
index 0000000..20393f7
--- /dev/null
+++ b/tools/llvm-go/CMakeLists.txt

@@ -0,0 +1,9 @@
+# Build the llvm-go helper with the Go toolchain when the Go bindings are
+# enabled. The executable is written to <build dir>/bin and is attached to the
+# default build through the llvm-go target.
+if(LLVM_BINDINGS MATCHES "go")
+  set(binpath "${CMAKE_BINARY_DIR}/bin/llvm-go${CMAKE_EXECUTABLE_SUFFIX}")
+  # The command runs from the source directory, so the bare llvm-go.go
+  # argument resolves there. VERBATIM keeps argument escaping identical across
+  # generators and platforms.
+  add_custom_command(OUTPUT "${binpath}"
+    COMMAND ${GO_EXECUTABLE} build -o "${binpath}" llvm-go.go
+    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/llvm-go.go"
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+    COMMENT "Building Go executable llvm-go"
+    VERBATIM)
+  add_custom_target(llvm-go ALL DEPENDS "${binpath}")
+endif()

diff --git a/tools/llvm-go/Makefile b/tools/llvm-go/Makefile
new file mode 100644
index 0000000..4465b2a
--- /dev/null
+++ b/tools/llvm-go/Makefile

@@ -0,0 +1,16 @@
+##===- tools/llvm-go/Makefile ------------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+
+# Relative path from this directory (tools/llvm-go) to the directory that
+# holds Makefile.common.
+LEVEL := ../..
+include $(LEVEL)/Makefile.common
+
+# Add the llvm-go executable to the prerequisites of the "all" target.
+all:: $(ToolDir)/llvm-go$(EXEEXT)
+
+# Build the executable from its single Go source file: $@ is the output
+# binary and $< is llvm-go.go.
+$(ToolDir)/llvm-go$(EXEEXT): $(PROJ_SRC_DIR)/llvm-go.go
+	$(GO) build -o $@ $<

diff --git a/tools/llvm-go/llvm-go.go b/tools/llvm-go/llvm-go.go
new file mode 100644
index 0000000..47f9481
--- /dev/null
+++ b/tools/llvm-go/llvm-go.go

@@ -0,0 +1,261 @@
+//===-- llvm-go.go - go tool wrapper for LLVM -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This tool lets us build LLVM components within the tree by setting up a
+// $GOPATH that resembles a tree fetched in the normal way with "go get".
+//
+//===----------------------------------------------------------------------===//
+
+package main
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+)
+
+type pkg struct {
+	llvmpath, pkgpath string
+}
+
+var packages = []pkg{
+	{"bindings/go/llvm", "llvm.org/llvm/bindings/go/llvm"},
+}
+
+type compilerFlags struct {
+	cpp, cxx, ld string
+}
+
+var components = []string{
+	"all-targets",
+	"analysis",
+	"asmparser",
+	"asmprinter",
+	"bitreader",
+	"bitwriter",
+	"codegen",
+	"core",
+	"debuginfo",
+	"executionengine",
+	"instrumentation",
+	"interpreter",
+	"ipo",
+	"irreader",
+	"linker",
+	"mc",
+	"mcjit",
+	"objcarcopts",
+	"option",
+	"profiledata",
+	"scalaropts",
+	"support",
+	"target",
+}
+
+// llvmConfig runs llvm-config with the given arguments and returns its
+// standard output as one line: the trailing newline is removed and every
+// remaining newline is replaced by a space.
+//
+// The executable is taken from $LLVM_CONFIG when that is set; otherwise it is
+// the "llvm-config" located next to this program, derived from os.Args[0].
+// It panics if the command cannot be run or exits unsuccessfully.
+func llvmConfig(args ...string) string {
+	configpath := os.Getenv("LLVM_CONFIG")
+	if configpath == "" {
+		// Swap the final path element of our own name for llvm-config.
+		// Splitting at the separator (rather than chopping a fixed seven
+		// bytes) cannot slice out of range and still works when the
+		// program is called something other than exactly "llvm-go".
+		dir, file := filepath.Split(os.Args[0])
+		configpath = dir + "llvm-config"
+		if runtime.GOOS == "windows" {
+			// Carry over an explicit ".exe" so the sibling tool is found.
+			configpath += filepath.Ext(file)
+		}
+	}
+
+	cmd := exec.Command(configpath, args...)
+	out, err := cmd.Output()
+	if err != nil {
+		panic(err.Error())
+	}
+
+	outstr := string(out)
+	outstr = strings.TrimSuffix(outstr, "\n")
+	return strings.Replace(outstr, "\n", " ", -1)
+}
+
+func llvmFlags() compilerFlags {
+	ldflags := llvmConfig(append([]string{"--ldflags", "--libs", "--system-libs"}, components...)...)
+	if runtime.GOOS != "darwin" {
+		// OS X doesn't like -rpath with cgo. See:
+		// https://code.google.com/p/go/issues/detail?id=7293
+		ldflags = "-Wl,-rpath," + llvmConfig("--libdir") + " " + ldflags
+	}
+	return compilerFlags{
+		cpp: llvmConfig("--cppflags"),
+		cxx: "-std=c++11",
+		ld:  ldflags,
+	}
+}
+
+// addTag returns a copy of args in which tag has been added to the go tool's
+// build tags; args itself is left untouched.
+//
+// An existing "-tags=..." argument, or the value following a bare "-tags",
+// gets " "+tag appended.  When neither form is present, "-tags", tag is
+// inserted directly after the first argument.
+func addTag(args []string, tag string) []string {
+	out := make([]string, len(args))
+	copy(out, args)
+
+	found := false
+	for i := 0; i < len(out); i++ {
+		switch cur := out[i]; {
+		case strings.HasPrefix(cur, "-tags="):
+			out[i] = cur + " " + tag
+			found = true
+		case cur == "-tags" && i+1 < len(out):
+			out[i+1] += " " + tag
+			found = true
+		}
+	}
+	if found {
+		return out
+	}
+
+	// No tags flag yet: splice one in right behind the first argument.
+	withFlag := make([]string, 0, len(out)+2)
+	withFlag = append(withFlag, out[0], "-tags", tag)
+	return append(withFlag, out[1:]...)
+}
+
+// printComponents writes the LLVM component names to standard output as a
+// single space-separated line.
+func printComponents() {
+	line := strings.Join(components, " ")
+	fmt.Println(line)
+}
+
+func printConfig() {
+	flags := llvmFlags()
+
+	fmt.Printf(`// +build !byollvm
+
+// This file is generated by llvm-go, do not edit.
+
+package llvm
+
+/*
+#cgo CPPFLAGS: %s
+#cgo CXXFLAGS: %s
+#cgo LDFLAGS: %s
+*/
+import "C"
+
+type (run_build_sh int)
+`, flags.cpp, flags.cxx, flags.ld)
+}
+
+// runGoWithLLVMEnv runs "go args..." with an environment prepared for
+// building against the in-tree LLVM.
+//
+// It adds the "byollvm" build tag to args, creates a temporary GOPATH whose
+// src tree symlinks every entry of packages to its directory under the LLVM
+// source root, and puts that GOPATH in front of the existing one.  CC and CXX
+// are set to cc and cxx; CGO_CPPFLAGS, CGO_CXXFLAGS and CGO_LDFLAGS are the
+// LLVM flags followed by cppflags, cxxflags and ldflags.  All other
+// environment variables are passed through unchanged.
+//
+// The temporary GOPATH is removed on every way out of the function, including
+// panics.  If the go command does not succeed the process exits with status 1.
+func runGoWithLLVMEnv(args []string, cc, cxx, cppflags, cxxflags, ldflags string) {
+	args = addTag(args, "byollvm")
+
+	srcdir := llvmConfig("--src-root")
+
+	tmpgopath, err := ioutil.TempDir("", "gopath")
+	if err != nil {
+		panic(err.Error())
+	}
+	// Clean up even when one of the steps below panics; previously the
+	// directory was only removed after the go command had been waited for.
+	defer os.RemoveAll(tmpgopath)
+
+	for _, p := range packages {
+		path := filepath.Join(tmpgopath, "src", p.pkgpath)
+		err := os.MkdirAll(filepath.Dir(path), os.ModePerm)
+		if err != nil {
+			panic(err.Error())
+		}
+
+		err = os.Symlink(filepath.Join(srcdir, p.llvmpath), path)
+		if err != nil {
+			panic(err.Error())
+		}
+	}
+
+	// The temporary tree goes first so it wins over the user's GOPATH.
+	newgopathlist := []string{tmpgopath}
+	newgopathlist = append(newgopathlist, filepath.SplitList(os.Getenv("GOPATH"))...)
+	newgopath := strings.Join(newgopathlist, string(filepath.ListSeparator))
+
+	flags := llvmFlags()
+
+	newenv := []string{
+		"CC=" + cc,
+		"CXX=" + cxx,
+		"CGO_CPPFLAGS=" + flags.cpp + " " + cppflags,
+		"CGO_CXXFLAGS=" + flags.cxx + " " + cxxflags,
+		"CGO_LDFLAGS=" + flags.ld + " " + ldflags,
+		"GOPATH=" + newgopath,
+	}
+	// Copy the rest of the environment, dropping the variables set above.
+	for _, v := range os.Environ() {
+		if !strings.HasPrefix(v, "CC=") &&
+			!strings.HasPrefix(v, "CXX=") &&
+			!strings.HasPrefix(v, "CGO_CPPFLAGS=") &&
+			!strings.HasPrefix(v, "CGO_CXXFLAGS=") &&
+			!strings.HasPrefix(v, "CGO_LDFLAGS=") &&
+			!strings.HasPrefix(v, "GOPATH=") {
+			newenv = append(newenv, v)
+		}
+	}
+
+	gocmdpath, err := exec.LookPath("go")
+	if err != nil {
+		panic(err.Error())
+	}
+
+	proc, err := os.StartProcess(gocmdpath, append([]string{"go"}, args...),
+		&os.ProcAttr{
+			Env:   newenv,
+			Files: []*os.File{os.Stdin, os.Stdout, os.Stderr},
+		})
+	if err != nil {
+		panic(err.Error())
+	}
+	ps, err := proc.Wait()
+	if err != nil {
+		panic(err.Error())
+	}
+
+	if !ps.Success() {
+		// os.Exit does not run deferred calls, so clean up by hand first.
+		os.RemoveAll(tmpgopath)
+		os.Exit(1)
+	}
+}
+
+func usage() {
+	fmt.Println(`Usage: llvm-go subcommand [flags]
+
+Available subcommands: build get install run test print-components print-config`)
+	os.Exit(0)
+}
+
+// main reads the compiler settings from the environment (CC, CXX,
+// CGO_CPPFLAGS, CGO_CXXFLAGS, CGO_LDFLAGS), lets leading "cc=", "cxx=",
+// "cppflags=", "cxxflags=" and "ldflags=" arguments override them, and then
+// dispatches on the subcommand that follows.  With no subcommand, or an
+// unknown one, it prints the usage text.
+func main() {
+	cc := os.Getenv("CC")
+	cxx := os.Getenv("CXX")
+	cppflags := os.Getenv("CGO_CPPFLAGS")
+	cxxflags := os.Getenv("CGO_CXXFLAGS")
+	ldflags := os.Getenv("CGO_LDFLAGS")
+
+	overrides := []struct {
+		prefix string
+		target *string
+	}{
+		{"cc=", &cc},
+		{"cxx=", &cxx},
+		{"cppflags=", &cppflags},
+		{"cxxflags=", &cxxflags},
+		{"ldflags=", &ldflags},
+	}
+
+	// Consume override arguments until the first one that is not an override.
+	args := os.Args[1:]
+	for consumed := true; consumed; {
+		if len(args) == 0 {
+			usage()
+		}
+		consumed = false
+		for _, o := range overrides {
+			if strings.HasPrefix(args[0], o.prefix) {
+				*o.target = args[0][len(o.prefix):]
+				args = args[1:]
+				consumed = true
+				break
+			}
+		}
+	}
+
+	switch subcommand := args[0]; subcommand {
+	case "print-components":
+		printComponents()
+	case "print-config":
+		printConfig()
+	case "build", "get", "install", "run", "test":
+		runGoWithLLVMEnv(args, cc, cxx, cppflags, cxxflags, ldflags)
+	default:
+		usage()
+	}
+}

diff --git a/tools/llvm-jitlistener/CMakeLists.txt b/tools/llvm-jitlistener/CMakeLists.txt
index c9704fb..68a4303 100644
--- a/tools/llvm-jitlistener/CMakeLists.txt
+++ b/tools/llvm-jitlistener/CMakeLists.txt

@@ -1,22 +1,21 @@
-# This tool is excluded from the CMake build if Intel JIT events are disabled.

-

-link_directories( ${LLVM_INTEL_JITEVENTS_LIBDIR} )

-include_directories( ${LLVM_INTEL_JITEVENTS_INCDIR} )

-

-set(LLVM_LINK_COMPONENTS

-  asmparser

-  bitreader

-  debuginfo

-  inteljitevents

-  interpreter

-  irreader

-  jit

-  mcjit

-  nativecodegen

-  object

-  selectiondag

-  )

-

-add_llvm_tool(llvm-jitlistener

-  llvm-jitlistener.cpp

-  )

+# This tool is excluded from the CMake build if Intel JIT events are disabled.
+
+link_directories( ${LLVM_INTEL_JITEVENTS_LIBDIR} )
+include_directories( ${LLVM_INTEL_JITEVENTS_INCDIR} )
+
+set(LLVM_LINK_COMPONENTS
+  asmparser
+  bitreader
+  debuginfo
+  inteljitevents
+  interpreter
+  irreader
+  mcjit
+  nativecodegen
+  object
+  selectiondag
+  )
+
+add_llvm_tool(llvm-jitlistener
+  llvm-jitlistener.cpp
+  )

diff --git a/tools/llvm-jitlistener/LLVMBuild.txt b/tools/llvm-jitlistener/LLVMBuild.txt
index 1ce78ac..e6ed20b 100644
--- a/tools/llvm-jitlistener/LLVMBuild.txt
+++ b/tools/llvm-jitlistener/LLVMBuild.txt

@@ -19,4 +19,4 @@
 type = Tool
 name = llvm-jitlistener
 parent = Tools
-required_libraries = AsmParser BitReader IRReader Interpreter JIT MCJIT NativeCodeGen Object SelectionDAG Native
+required_libraries = AsmParser BitReader IRReader Interpreter MCJIT NativeCodeGen Object SelectionDAG Native

diff --git a/tools/llvm-jitlistener/Makefile b/tools/llvm-jitlistener/Makefile
index b132227..6d72427 100644
--- a/tools/llvm-jitlistener/Makefile
+++ b/tools/llvm-jitlistener/Makefile

@@ -12,7 +12,7 @@
 

 include $(LEVEL)/Makefile.config

 

-LINK_COMPONENTS := mcjit jit interpreter nativecodegen bitreader asmparser irreader selectiondag Object

+LINK_COMPONENTS := mcjit interpreter nativecodegen bitreader asmparser irreader selectiondag Object

 

 # If Intel JIT Events support is configured, link against the LLVM Intel JIT

 # Events interface library.  If not, this tool will do nothing useful, but it


diff --git a/tools/llvm-jitlistener/llvm-jitlistener.cpp b/tools/llvm-jitlistener/llvm-jitlistener.cpp
index c159aa5..0bb6e8b 100644
--- a/tools/llvm-jitlistener/llvm-jitlistener.cpp
+++ b/tools/llvm-jitlistener/llvm-jitlistener.cpp

@@ -17,7 +17,7 @@
 #include "../../lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/ExecutionEngine/ObjectImage.h"
 #include "llvm/IR/Module.h"
@@ -113,26 +113,18 @@
 
     // Parse the bitcode...
     SMDiagnostic Err;
-    TheModule = ParseIRFile(IRFile, Err, Context);
+    std::unique_ptr<Module> TheModule(parseIRFile(IRFile, Err, Context));
     if (!TheModule) {
       errs() << Err.getMessage();
       return;
     }
 
-    // FIXME: This is using the default legacy JITMemoryManager because it
-    // supports poison memory.  At some point, we'll need to update this to
-    // use an MCJIT-specific memory manager.  It might be nice to have the
-    // poison memory option there too.
-    JITMemoryManager *MemMgr = JITMemoryManager::CreateDefaultMemManager();
+    RTDyldMemoryManager *MemMgr = new SectionMemoryManager();
     if (!MemMgr) {
       errs() << "Unable to create memory manager.";
       return;
     }
 
-    // Tell the memory manager to poison freed memory so that accessing freed
-    // memory is more easily tested.
-    MemMgr->setPoisonMemory(true);
-
     // Override the triple to generate ELF on Windows since that's supported
     Triple Tuple(TheModule->getTargetTriple());
     if (Tuple.getTriple().empty())
@@ -145,11 +137,10 @@
 
     // Compile the IR
     std::string Error;
-    TheJIT.reset(EngineBuilder(TheModule)
+    TheJIT.reset(EngineBuilder(std::move(TheModule))
       .setEngineKind(EngineKind::JIT)
       .setErrorStr(&Error)
-      .setJITMemoryManager(MemMgr)
-      .setUseMCJIT(true)
+      .setMCJITMemoryManager(MemMgr)
       .create());
     if (Error.empty() == false)
       errs() << Error;
@@ -160,8 +151,6 @@
   }
 
   LLVMContext Context; // Global ownership
-  Module *TheModule; // Owned by ExecutionEngine.
-  JITMemoryManager *JMM; // Owned by ExecutionEngine.
   std::unique_ptr<ExecutionEngine> TheJIT;
 
 public:

diff --git a/tools/llvm-link/llvm-link.cpp b/tools/llvm-link/llvm-link.cpp
index ed8c06e..828b9bb 100644
--- a/tools/llvm-link/llvm-link.cpp
+++ b/tools/llvm-link/llvm-link.cpp

@@ -14,6 +14,8 @@
 
 #include "llvm/Linker/Linker.h"
 #include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
@@ -55,20 +57,39 @@
 SuppressWarnings("suppress-warnings", cl::desc("Suppress all linking warnings"),
                  cl::init(false));
 
-// LoadFile - Read the specified bitcode file in and return it.  This routine
-// searches the link path for the specified file to try to find it...
+// Read the specified bitcode file in and return it. This routine searches the
+// link path for the specified file to try to find it...
 //
-static inline Module *LoadFile(const char *argv0, const std::string &FN,
-                               LLVMContext& Context) {
+static std::unique_ptr<Module>
+loadFile(const char *argv0, const std::string &FN, LLVMContext &Context) {
   SMDiagnostic Err;
   if (Verbose) errs() << "Loading '" << FN << "'\n";
-  Module* Result = nullptr;
+  std::unique_ptr<Module> Result = getLazyIRFileModule(FN, Err, Context);
+  if (!Result)
+    Err.print(argv0, errs());
 
-  Result = ParseIRFile(FN, Err, Context);
-  if (Result) return Result;   // Load successful!
+  return Result;
+}
 
-  Err.print(argv0, errs());
-  return nullptr;
+// Diagnostic callback handed to the Linker: writes the diagnostic to errs()
+// with an "ERROR: " or "WARNING: " prefix, followed by a newline.  Warnings
+// are dropped entirely when SuppressWarnings is set.
+static void diagnosticHandler(const DiagnosticInfo &DI) {
+  unsigned Severity = DI.getSeverity();
+  switch (Severity) {
+  case DS_Error:
+    errs() << "ERROR: ";
+    break;
+  case DS_Warning:
+    if (SuppressWarnings)
+      return;
+    errs() << "WARNING: ";
+    break;
+  case DS_Remark:
+  case DS_Note:
+    // Remarks and notes are not expected to reach this handler.
+    llvm_unreachable("Only expecting warnings and errors");
+  }
+
+  // Let the diagnostic render itself onto errs().
+  DiagnosticPrinterRawOStream DP(errs());
+  DI.print(DP);
+  errs() << '\n';
 }
 
 int main(int argc, char **argv) {
@@ -80,20 +101,11 @@
   llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
   cl::ParseCommandLineOptions(argc, argv, "llvm linker\n");
 
-  unsigned BaseArg = 0;
-  std::string ErrorMessage;
+  auto Composite = make_unique<Module>("llvm-link", Context);
+  Linker L(Composite.get(), diagnosticHandler);
 
-  std::unique_ptr<Module> Composite(
-      LoadFile(argv[0], InputFilenames[BaseArg], Context));
-  if (!Composite.get()) {
-    errs() << argv[0] << ": error loading file '"
-           << InputFilenames[BaseArg] << "'\n";
-    return 1;
-  }
-
-  Linker L(Composite.get(), SuppressWarnings);
-  for (unsigned i = BaseArg+1; i < InputFilenames.size(); ++i) {
-    std::unique_ptr<Module> M(LoadFile(argv[0], InputFilenames[i], Context));
+  for (unsigned i = 0; i < InputFilenames.size(); ++i) {
+    std::unique_ptr<Module> M = loadFile(argv[0], InputFilenames[i], Context);
     if (!M.get()) {
       errs() << argv[0] << ": error loading file '" <<InputFilenames[i]<< "'\n";
       return 1;
@@ -101,19 +113,16 @@
 
     if (Verbose) errs() << "Linking in '" << InputFilenames[i] << "'\n";
 
-    if (L.linkInModule(M.get(), &ErrorMessage)) {
-      errs() << argv[0] << ": link error in '" << InputFilenames[i]
-             << "': " << ErrorMessage << "\n";
+    if (L.linkInModule(M.get()))
       return 1;
-    }
   }
 
   if (DumpAsm) errs() << "Here's the assembly:\n" << *Composite;
 
-  std::string ErrorInfo;
-  tool_output_file Out(OutputFilename.c_str(), ErrorInfo, sys::fs::F_None);
-  if (!ErrorInfo.empty()) {
-    errs() << ErrorInfo << '\n';
+  std::error_code EC;
+  tool_output_file Out(OutputFilename, EC, sys::fs::F_None);
+  if (EC) {
+    errs() << EC.message() << '\n';
     return 1;
   }
 

diff --git a/tools/llvm-lto/CMakeLists.txt b/tools/llvm-lto/CMakeLists.txt
index 485b03d..9adf629 100644
--- a/tools/llvm-lto/CMakeLists.txt
+++ b/tools/llvm-lto/CMakeLists.txt

@@ -1,6 +1,5 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
-  Core
   LTO
   MC
   Support

diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp
index 8b39f12..3c950ba 100644
--- a/tools/llvm-lto/llvm-lto.cpp
+++ b/tools/llvm-lto/llvm-lto.cpp

@@ -38,6 +38,14 @@
 DisableGVNLoadPRE("disable-gvn-loadpre", cl::init(false),
   cl::desc("Do not run the GVN load PRE pass"));
 
+static cl::opt<bool>
+DisableLTOVectorization("disable-lto-vectorization", cl::init(false),
+  cl::desc("Do not run loop or slp vectorization during LTO"));
+
+static cl::opt<bool>
+UseDiagnosticHandler("use-diagnostic-handler", cl::init(false),
+  cl::desc("Use a diagnostic handler to test the handler interface"));
+
 static cl::list<std::string>
 InputFilenames(cl::Positional, cl::OneOrMore,
   cl::desc("<input bitcode files>"));
@@ -63,6 +71,25 @@
 };
 }
 
+// Diagnostic callback for the LTO code generator: prints Msg to errs() on its
+// own line, prefixed with the lower-case name of the severity.  A severity
+// outside the four known values is printed without a prefix.
+void handleDiagnostics(lto_codegen_diagnostic_severity_t Severity,
+                       const char *Msg, void *) {
+  const char *Prefix = "";
+  switch (Severity) {
+  case LTO_DS_ERROR:
+    Prefix = "error: ";
+    break;
+  case LTO_DS_WARNING:
+    Prefix = "warning: ";
+    break;
+  case LTO_DS_REMARK:
+    Prefix = "remark: ";
+    break;
+  case LTO_DS_NOTE:
+    Prefix = "note: ";
+    break;
+  }
+  errs() << Prefix << Msg << "\n";
+}
+
 int main(int argc, char **argv) {
   // Print a stack trace if we signal out.
   sys::PrintStackTraceOnErrorSignal();
@@ -84,6 +111,9 @@
 
   LTOCodeGenerator CodeGen;
 
+  if (UseDiagnosticHandler)
+    CodeGen.setDiagnosticHandler(handleDiagnostics, nullptr);
+
   switch (RelocModel) {
   case Reloc::Static:
     CodeGen.setCodePICModel(LTO_CODEGEN_PIC_MODEL_STATIC);
@@ -117,12 +147,8 @@
       return 1;
     }
 
-
-    if (!CodeGen.addModule(Module.get(), error)) {
-      errs() << argv[0] << ": error adding file '" << InputFilenames[i]
-             << "': " << error << "\n";
+    if (!CodeGen.addModule(Module.get()))
       return 1;
-    }
 
     unsigned NumSyms = Module->getSymbolCount();
     for (unsigned I = 0; I < NumSyms; ++I) {
@@ -157,19 +183,20 @@
   if (!OutputFilename.empty()) {
     size_t len = 0;
     std::string ErrorInfo;
-    const void *Code = CodeGen.compile(&len, DisableOpt, DisableInline,
-                                       DisableGVNLoadPRE, ErrorInfo);
+    const void *Code =
+        CodeGen.compile(&len, DisableOpt, DisableInline, DisableGVNLoadPRE,
+                        DisableLTOVectorization, ErrorInfo);
     if (!Code) {
       errs() << argv[0]
              << ": error compiling the code: " << ErrorInfo << "\n";
       return 1;
     }
 
-    raw_fd_ostream FileStream(OutputFilename.c_str(), ErrorInfo,
-                              sys::fs::F_None);
-    if (!ErrorInfo.empty()) {
+    std::error_code EC;
+    raw_fd_ostream FileStream(OutputFilename, EC, sys::fs::F_None);
+    if (EC) {
       errs() << argv[0] << ": error opening the file '" << OutputFilename
-             << "': " << ErrorInfo << "\n";
+             << "': " << EC.message() << "\n";
       return 1;
     }
 
@@ -178,7 +205,8 @@
     std::string ErrorInfo;
     const char *OutputName = nullptr;
     if (!CodeGen.compile_to_file(&OutputName, DisableOpt, DisableInline,
-                                 DisableGVNLoadPRE, ErrorInfo)) {
+                                 DisableGVNLoadPRE, DisableLTOVectorization,
+                                 ErrorInfo)) {
       errs() << argv[0]
              << ": error compiling the code: " << ErrorInfo
              << "\n";

diff --git a/tools/llvm-mc/Android.mk b/tools/llvm-mc/Android.mk
index d794c8a..e6de9eb 100644
--- a/tools/llvm-mc/Android.mk
+++ b/tools/llvm-mc/Android.mk

@@ -34,7 +34,10 @@
   libLLVMX86AsmPrinter \
   libLLVMX86Utils \
   libLLVMX86Disassembler \
+  libLLVMX86CodeGen \
   libLLVMAsmPrinter \
+  libLLVMCodeGen \
+  libLLVMAnalysis \
   libLLVMTarget \
   libLLVMMC \
   libLLVMObject \

diff --git a/tools/llvm-mc/Disassembler.cpp b/tools/llvm-mc/Disassembler.cpp
index 9367590..95d146a 100644
--- a/tools/llvm-mc/Disassembler.cpp
+++ b/tools/llvm-mc/Disassembler.cpp

@@ -22,33 +22,14 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
-typedef std::vector<std::pair<unsigned char, const char*> > ByteArrayTy;
-
-namespace {
-class VectorMemoryObject : public MemoryObject {
-private:
-  const ByteArrayTy &Bytes;
-public:
-  VectorMemoryObject(const ByteArrayTy &bytes) : Bytes(bytes) {}
-
-  uint64_t getBase() const override { return 0; }
-  uint64_t getExtent() const override { return Bytes.size(); }
-
-  int readByte(uint64_t Addr, uint8_t *Byte) const override {
-    if (Addr >= getExtent())
-      return -1;
-    *Byte = Bytes[Addr].first;
-    return 0;
-  }
-};
-}
+typedef std::pair<std::vector<unsigned char>, std::vector<const char *>>
+    ByteArrayTy;
 
 static bool PrintInsts(const MCDisassembler &DisAsm,
                        const ByteArrayTy &Bytes,
@@ -56,21 +37,21 @@
                        MCStreamer &Streamer, bool InAtomicBlock,
                        const MCSubtargetInfo &STI) {
   // Wrap the vector in a MemoryObject.
-  VectorMemoryObject memoryObject(Bytes);
+  ArrayRef<uint8_t> Data(Bytes.first.data(), Bytes.first.size());
 
   // Disassemble it to strings.
   uint64_t Size;
   uint64_t Index;
 
-  for (Index = 0; Index < Bytes.size(); Index += Size) {
+  for (Index = 0; Index < Bytes.first.size(); Index += Size) {
     MCInst Inst;
 
     MCDisassembler::DecodeStatus S;
-    S = DisAsm.getInstruction(Inst, Size, memoryObject, Index,
+    S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index,
                               /*REMOVE*/ nulls(), nulls());
     switch (S) {
     case MCDisassembler::Fail:
-      SM.PrintMessage(SMLoc::getFromPointer(Bytes[Index].second),
+      SM.PrintMessage(SMLoc::getFromPointer(Bytes.second[Index]),
                       SourceMgr::DK_Warning,
                       "invalid instruction encoding");
       // Don't try to resynchronise the stream in a block
@@ -83,7 +64,7 @@
       break;
 
     case MCDisassembler::SoftFail:
-      SM.PrintMessage(SMLoc::getFromPointer(Bytes[Index].second),
+      SM.PrintMessage(SMLoc::getFromPointer(Bytes.second[Index]),
                       SourceMgr::DK_Warning,
                       "potentially undefined instruction encoding");
       // Fall through
@@ -98,29 +79,23 @@
 }
 
 static bool SkipToToken(StringRef &Str) {
-  while (!Str.empty() && Str.find_first_not_of(" \t\r\n#,") != 0) {
-    // Strip horizontal whitespace and commas.
-    if (size_t Pos = Str.find_first_not_of(" \t\r,")) {
-      Str = Str.substr(Pos);
-    }
+  for (;;) {
+    if (Str.empty())
+      return false;
 
-    // If this is the end of a line or start of a comment, remove the rest of
-    // the line.
-    if (Str[0] == '\n' || Str[0] == '#') {
-      // Strip to the end of line if we already processed any bytes on this
-      // line.  This strips the comment and/or the \n.
-      if (Str[0] == '\n') {
-        Str = Str.substr(1);
-      } else {
-        Str = Str.substr(Str.find_first_of('\n'));
-        if (!Str.empty())
-          Str = Str.substr(1);
-      }
+    // Strip horizontal whitespace and commas.
+    if (size_t Pos = Str.find_first_not_of(" \t\r\n,")) {
+      Str = Str.substr(Pos);
       continue;
     }
-  }
 
-  return !Str.empty();
+    // If this is the start of a comment, remove the rest of the line.
+    if (Str[0] == '#') {
+        Str = Str.substr(Str.find_first_of('\n'));
+      continue;
+    }
+    return true;
+  }
 }
 
 
@@ -143,11 +118,13 @@
       SM.PrintMessage(SMLoc::getFromPointer(Value.data()), SourceMgr::DK_Error,
                       "invalid input token");
       Str = Str.substr(Str.find('\n'));
-      ByteArray.clear();
+      ByteArray.first.clear();
+      ByteArray.second.clear();
       continue;
     }
 
-    ByteArray.push_back(std::make_pair((unsigned char)ByteVal, Value.data()));
+    ByteArray.first.push_back(ByteVal);
+    ByteArray.second.push_back(Value.data());
     Str = Str.substr(Next);
   }
 
@@ -185,7 +162,7 @@
   }
 
   // Set up initial section manually here
-  Streamer.InitSections();
+  Streamer.InitSections(false);
 
   bool ErrorOccurred = false;
 
@@ -195,7 +172,8 @@
   bool InAtomicBlock = false;
 
   while (SkipToToken(Str)) {
-    ByteArray.clear();
+    ByteArray.first.clear();
+    ByteArray.second.clear();
 
     if (Str[0] == '[') {
       if (InAtomicBlock) {
@@ -220,7 +198,7 @@
     // It's a real token, get the bytes and emit them
     ErrorOccurred |= ByteArrayFromString(ByteArray, Str, SM);
 
-    if (!ByteArray.empty())
+    if (!ByteArray.first.empty())
       ErrorOccurred |= PrintInsts(*DisAsm, ByteArray, SM, Out, Streamer,
                                   InAtomicBlock, STI);
   }

diff --git a/tools/llvm-mc/Disassembler.h b/tools/llvm-mc/Disassembler.h
index 5615da8..1f18ac0 100644
--- a/tools/llvm-mc/Disassembler.h
+++ b/tools/llvm-mc/Disassembler.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef DISASSEMBLER_H
-#define DISASSEMBLER_H
+#ifndef LLVM_TOOLS_LLVM_MC_DISASSEMBLER_H
+#define LLVM_TOOLS_LLVM_MC_DISASSEMBLER_H
 
 #include <string>
 

diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 4c5b230..5da9e86 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp

@@ -208,11 +208,11 @@
   if (OutputFilename == "")
     OutputFilename = "-";
 
-  std::string Err;
+  std::error_code EC;
   tool_output_file *Out =
-      new tool_output_file(OutputFilename.c_str(), Err, sys::fs::F_None);
-  if (!Err.empty()) {
-    errs() << Err << '\n';
+      new tool_output_file(OutputFilename, EC, sys::fs::F_None);
+  if (EC) {
+    errs() << EC.message() << '\n';
     delete Out;
     return nullptr;
   }
@@ -373,12 +373,12 @@
     errs() << ProgName << ": " << EC.message() << '\n';
     return 1;
   }
-  MemoryBuffer *Buffer = BufferPtr->release();
+  MemoryBuffer *Buffer = BufferPtr->get();
 
   SourceMgr SrcMgr;
 
   // Tell SrcMgr about this buffer, which is what the parser will pick up.
-  SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
+  SrcMgr.AddNewSourceBuffer(std::move(*BufferPtr), SMLoc());
 
   // Record the location of the include directories so that the lexer can find
   // it later.
@@ -471,9 +471,10 @@
     assert(FileType == OFT_ObjectFile && "Invalid file type!");
     MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
     MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, MCPU);
-    Str.reset(TheTarget->createMCObjectStreamer(TripleName, Ctx, *MAB,
-                                                FOS, CE, *STI, RelaxAll,
-                                                NoExecStack));
+    Str.reset(TheTarget->createMCObjectStreamer(TripleName, Ctx, *MAB, FOS, CE,
+                                                *STI, RelaxAll));
+    if (NoExecStack)
+      Str->InitSections(true);
   }
 
   int Res = 1;

diff --git a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
index a878f11..5654313 100644
--- a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
+++ b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp

@@ -141,14 +141,15 @@
     errs() << ToolName << ": " << EC.message() << '\n';
     return;
   }
-  MemoryBuffer *Buffer = BufferPtr->release();
+  std::unique_ptr<MemoryBuffer> &Buffer = BufferPtr.get();
 
   SourceMgr SrcMgr;
 
-  // Tell SrcMgr about this buffer, which is what the parser will pick up.
-  SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
-
   StringRef InputSource = Buffer->getBuffer();
+
+  // Tell SrcMgr about this buffer, which is what the parser will pick up.
+  SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
+
   MarkupLexer Lex(InputSource);
   MarkupParser Parser(Lex, SrcMgr);
 

diff --git a/tools/llvm-nm/Android.mk b/tools/llvm-nm/Android.mk
index 98e7ba9..a17ca4d 100644
--- a/tools/llvm-nm/Android.mk
+++ b/tools/llvm-nm/Android.mk

@@ -30,19 +30,23 @@
   libLLVMMipsDesc \
   libLLVMMipsAsmPrinter \
   libLLVMMipsDisassembler \
-  libLLVMX86CodeGen \
   libLLVMX86Info \
   libLLVMX86Desc \
   libLLVMX86AsmPrinter \
   libLLVMX86AsmParser \
+  libLLVMX86CodeGen \
   libLLVMX86Utils \
   libLLVMX86Disassembler \
+  libLLVMCodeGen \
+  libLLVMAnalysis \
+  libLLVMTarget \
   libLLVMObject             \
   libLLVMBitReader          \
   libLLVMMC                 \
   libLLVMMCParser           \
   libLLVMCore               \
   libLLVMSupport            \
+  libLLVMMCDisassembler \
 
 include $(CLEAR_VARS)
 

diff --git a/tools/llvm-nm/CMakeLists.txt b/tools/llvm-nm/CMakeLists.txt
index 1fe4a2d..20293bb 100644
--- a/tools/llvm-nm/CMakeLists.txt
+++ b/tools/llvm-nm/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  Core
   Object
   Support
   )

diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 3bd9ef9..be2c4fa 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp

@@ -87,8 +87,8 @@
 cl::opt<bool> DarwinFormat("m", cl::desc("Alias for --format=darwin"));
 
 static cl::list<std::string>
-ArchFlags("arch", cl::desc("architecture(s) from a Mach-O file to dump"),
-          cl::ZeroOrMore);
+    ArchFlags("arch", cl::desc("architecture(s) from a Mach-O file to dump"),
+              cl::ZeroOrMore);
 bool ArchAll = false;
 
 cl::opt<bool> PrintFileName(
@@ -136,6 +136,22 @@
                              cl::desc("Print just the symbol's name"));
 cl::alias JustSymbolNames("j", cl::desc("Alias for --just-symbol-name"),
                           cl::aliasopt(JustSymbolName));
+
+// FIXME: This option takes exactly two strings and should be allowed anywhere
+// on the command line.  Such that "llvm-nm -s __TEXT __text foo.o" would work.
+// But that does not as the CommandLine Library does not have a way to make
+// this work.  For now the "-s __TEXT __text" has to be last on the command
+// line.
+cl::list<std::string> SegSect("s", cl::Positional, cl::ZeroOrMore,
+                              cl::desc("Dump only symbols from this segment "
+                                       "and section name, Mach-O only"));
+
+cl::opt<bool> FormatMachOasHex("x", cl::desc("Print symbol entry in hex, "
+                                             "Mach-O only"));
+
+cl::opt<bool> NoLLVMBitcode("no-llvm-bc",
+                            cl::desc("Disable LLVM bitcode reader"));
+
 bool PrintAddress = true;
 
 bool MultipleFiles = false;
@@ -234,12 +250,12 @@
   }
 }
 
-static char isSymbolList64Bit(SymbolicFile *Obj) {
+static char isSymbolList64Bit(SymbolicFile &Obj) {
   if (isa<IRObjectFile>(Obj))
     return false;
   else if (isa<COFFObjectFile>(Obj))
     return false;
-  else if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(Obj))
+  else if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(&Obj))
     return MachO->is64Bit();
   else if (isa<ELF32LEObjectFile>(Obj))
     return false;
@@ -258,8 +274,10 @@
 static SymbolListT SymbolList;
 
 // darwinPrintSymbol() is used to print a symbol from a Mach-O file when the
-// the OutputFormat is darwin.  It produces the same output as darwin's nm(1) -m
-// output.
+// OutputFormat is darwin or we are printing Mach-O symbols in hex.  For
+// the darwin format it produces the same output as darwin's nm(1) -m output
+// and when printing Mach-O symbols in hex it produces the same output as
+// darwin's nm(1) -x format.
 static void darwinPrintSymbol(MachOObjectFile *MachO, SymbolListT::iterator I,
                               char *SymbolAddrStr, const char *printBlanks) {
   MachO::mach_header H;
@@ -268,7 +286,9 @@
   MachO::nlist_64 STE_64;
   MachO::nlist STE;
   uint8_t NType;
+  uint8_t NSect;
   uint16_t NDesc;
+  uint32_t NStrx;
   uint64_t NValue;
   if (MachO->is64Bit()) {
     H_64 = MachO->MachOObjectFile::getHeader64();
@@ -276,7 +296,9 @@
     Flags = H_64.flags;
     STE_64 = MachO->getSymbol64TableEntry(I->Symb);
     NType = STE_64.n_type;
+    NSect = STE_64.n_sect;
     NDesc = STE_64.n_desc;
+    NStrx = STE_64.n_strx;
     NValue = STE_64.n_value;
   } else {
     H = MachO->MachOObjectFile::getHeader();
@@ -284,10 +306,34 @@
     Flags = H.flags;
     STE = MachO->getSymbolTableEntry(I->Symb);
     NType = STE.n_type;
+    NSect = STE.n_sect;
     NDesc = STE.n_desc;
+    NStrx = STE.n_strx;
     NValue = STE.n_value;
   }
 
+  // If we are printing Mach-O symbols in hex do that and return.
+  if (FormatMachOasHex) {
+    char Str[18] = "";
+    const char *printFormat;
+    if (MachO->is64Bit())
+      printFormat = "%016" PRIx64;
+    else
+      printFormat = "%08" PRIx64;
+    format(printFormat, NValue).print(Str, sizeof(Str));
+    outs() << Str << ' ';
+    format("%02x", NType).print(Str, sizeof(Str));
+    outs() << Str << ' ';
+    format("%02x", NSect).print(Str, sizeof(Str));
+    outs() << Str << ' ';
+    format("%04x", NDesc).print(Str, sizeof(Str));
+    outs() << Str << ' ';
+    format("%08x", NStrx).print(Str, sizeof(Str));
+    outs() << Str << ' ';
+    outs() << I->Name << "\n";
+    return;
+  }
+
   if (PrintAddress) {
     if ((NType & MachO::N_TYPE) == MachO::N_INDR)
       strcpy(SymbolAddrStr, printBlanks);
@@ -414,7 +460,87 @@
   outs() << "\n";
 }
 
-static void sortAndPrintSymbolList(SymbolicFile *Obj, bool printName) {
+// Table that maps Darwin's Mach-O stab constants to strings to allow printing.
+struct DarwinStabName {
+  uint8_t NType;    // Stab n_type constant.
+  const char *Name; // Short name printed for that constant.
+};
+static const struct DarwinStabName DarwinStabNames[] = {
+    {MachO::N_GSYM, "GSYM"},
+    {MachO::N_FNAME, "FNAME"},
+    {MachO::N_FUN, "FUN"},
+    {MachO::N_STSYM, "STSYM"},
+    {MachO::N_LCSYM, "LCSYM"},
+    {MachO::N_BNSYM, "BNSYM"},
+    {MachO::N_PC, "PC"},
+    {MachO::N_AST, "AST"},
+    {MachO::N_OPT, "OPT"},
+    {MachO::N_RSYM, "RSYM"},
+    {MachO::N_SLINE, "SLINE"},
+    {MachO::N_ENSYM, "ENSYM"},
+    {MachO::N_SSYM, "SSYM"},
+    {MachO::N_SO, "SO"},
+    {MachO::N_OSO, "OSO"},
+    {MachO::N_LSYM, "LSYM"},
+    {MachO::N_BINCL, "BINCL"},
+    {MachO::N_SOL, "SOL"},
+    {MachO::N_PARAMS, "PARAM"},
+    {MachO::N_VERSION, "VERS"},
+    {MachO::N_OLEVEL, "OLEV"},
+    {MachO::N_PSYM, "PSYM"},
+    {MachO::N_EINCL, "EINCL"},
+    {MachO::N_ENTRY, "ENTRY"},
+    {MachO::N_LBRAC, "LBRAC"},
+    {MachO::N_EXCL, "EXCL"},
+    {MachO::N_RBRAC, "RBRAC"},
+    {MachO::N_BCOMM, "BCOMM"},
+    {MachO::N_ECOMM, "ECOMM"},
+    {MachO::N_ECOML, "ECOML"},
+    {MachO::N_LENG, "LENG"},
+    {0, 0}}; // Sentinel: the null Name marks the end of the table.
+static const char *getDarwinStabString(uint8_t NType) {
+  for (unsigned i = 0; DarwinStabNames[i].Name; i++) { // Stop at sentinel.
+    if (DarwinStabNames[i].NType == NType)
+      return DarwinStabNames[i].Name;
+  }
+  return 0; // Not in the table: NType has no symbolic name.
+}
+
+// darwinPrintStab() prints a stab symbol's n_sect and n_desc in hex, followed
+// by the symbolic name of its n_type (or the n_type in hex if it has no name).
+static void darwinPrintStab(MachOObjectFile *MachO, SymbolListT::iterator I) {
+  MachO::nlist_64 STE_64;
+  MachO::nlist STE;
+  uint8_t NType;
+  uint8_t NSect;
+  uint16_t NDesc;
+  if (MachO->is64Bit()) {
+    STE_64 = MachO->getSymbol64TableEntry(I->Symb);
+    NType = STE_64.n_type;
+    NSect = STE_64.n_sect;
+    NDesc = STE_64.n_desc;
+  } else {
+    STE = MachO->getSymbolTableEntry(I->Symb);
+    NType = STE.n_type;
+    NSect = STE.n_sect;
+    NDesc = STE.n_desc;
+  }
+
+  char Str[18] = ""; // Scratch buffer reused for each formatted field.
+  format("%02x", NSect).print(Str, sizeof(Str));
+  outs() << ' ' << Str << ' ';
+  format("%04x", NDesc).print(Str, sizeof(Str));
+  outs() << Str << ' ';
+  if (const char *stabString = getDarwinStabString(NType))
+    format("%5.5s", stabString).print(Str, sizeof(Str)); // Exactly 5 columns.
+  else
+    format("   %02x", NType).print(Str, sizeof(Str)); // Unnamed stab: raw hex.
+  outs() << Str;
+}
+
+static void sortAndPrintSymbolList(SymbolicFile &Obj, bool printName,
+                                   std::string ArchiveName,
+                                   std::string ArchitectureName) {
   if (!NoSort) {
     if (NumericSort)
       std::sort(SymbolList.begin(), SymbolList.end(), compareSymbolAddress);
@@ -424,14 +550,16 @@
       std::sort(SymbolList.begin(), SymbolList.end(), compareSymbolName);
   }
 
-  if (OutputFormat == posix && MultipleFiles && printName) {
-    outs() << '\n' << CurrentFilename << ":\n";
-  } else if (OutputFormat == bsd && MultipleFiles && printName) {
-    outs() << "\n" << CurrentFilename << ":\n";
-  } else if (OutputFormat == sysv) {
-    outs() << "\n\nSymbols from " << CurrentFilename << ":\n\n"
-           << "Name                  Value   Class        Type"
-           << "         Size   Line  Section\n";
+  if (!PrintFileName) {
+    if (OutputFormat == posix && MultipleFiles && printName) {
+      outs() << '\n' << CurrentFilename << ":\n";
+    } else if (OutputFormat == bsd && MultipleFiles && printName) {
+      outs() << "\n" << CurrentFilename << ":\n";
+    } else if (OutputFormat == sysv) {
+      outs() << "\n\nSymbols from " << CurrentFilename << ":\n\n"
+             << "Name                  Value   Class        Type"
+             << "         Size   Line  Section\n";
+    }
   }
 
   const char *printBlanks, *printFormat;
@@ -451,7 +579,14 @@
       continue;
     if (SizeSort && !PrintAddress && I->Size == UnknownAddressOrSize)
       continue;
-    if (JustSymbolName) {
+    if (PrintFileName) {
+      if (!ArchitectureName.empty())
+        outs() << "(for architecture " << ArchitectureName << "):";
+      if (!ArchiveName.empty())
+        outs() << ArchiveName << ":";
+      outs() << CurrentFilename << ": ";
+    }
+    if (JustSymbolName || (UndefinedOnly && isa<MachOObjectFile>(Obj))) {
       outs() << I->Name << "\n";
       continue;
     }
@@ -470,11 +605,13 @@
     if (I->Size != UnknownAddressOrSize)
       format(printFormat, I->Size).print(SymbolSizeStr, sizeof(SymbolSizeStr));
 
-    // If OutputFormat is darwin and we have a MachOObjectFile print as darwin's
-    // nm(1) -m output, else if OutputFormat is darwin and not a Mach-O object
-    // fall back to OutputFormat bsd (see below).
-    MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(Obj);
-    if (OutputFormat == darwin && MachO) {
+    // If OutputFormat is darwin or we are printing Mach-O symbols in hex, and
+    // we have a MachOObjectFile, call darwinPrintSymbol to print as darwin's
+    // nm(1) -m output or in hex.  Otherwise, if OutputFormat is darwin or we
+    // are printing Mach-O symbols in hex but this is not a Mach-O object, fall
+    // back to OutputFormat bsd (see below).
+    MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(&Obj);
+    if ((OutputFormat == darwin || FormatMachOasHex) && MachO) {
       darwinPrintSymbol(MachO, I, SymbolAddrStr, printBlanks);
     } else if (OutputFormat == posix) {
       outs() << I->Name << " " << I->TypeChar << " " << SymbolAddrStr
@@ -487,7 +624,10 @@
         if (I->Size != UnknownAddressOrSize)
           outs() << ' ';
       }
-      outs() << I->TypeChar << " " << I->Name << "\n";
+      outs() << I->TypeChar;
+      if (I->TypeChar == '-' && MachO)
+        darwinPrintStab(MachO, I);
+      outs() << " " << I->Name << "\n";
     } else if (OutputFormat == sysv) {
       std::string PaddedName(I->Name);
       while (PaddedName.length() < 20)
@@ -549,7 +689,7 @@
 }
 
 static char getSymbolNMTypeChar(COFFObjectFile &Obj, symbol_iterator I) {
-  const coff_symbol *Symb = Obj.getCOFFSymbol(*I);
+  COFFSymbolRef Symb = Obj.getCOFFSymbol(*I);
   // OK, this is COFF.
   symbol_iterator SymI(I);
 
@@ -566,7 +706,7 @@
     return Ret;
 
   uint32_t Characteristics = 0;
-  if (!COFF::isReservedSectionNumber(Symb->SectionNumber)) {
+  if (!COFF::isReservedSectionNumber(Symb.getSectionNumber())) {
     section_iterator SecI = Obj.section_end();
     if (error(SymI->getSection(SecI)))
       return '?';
@@ -574,25 +714,21 @@
     Characteristics = Section->Characteristics;
   }
 
-  switch (Symb->SectionNumber) {
+  switch (Symb.getSectionNumber()) {
   case COFF::IMAGE_SYM_DEBUG:
     return 'n';
   default:
     // Check section type.
     if (Characteristics & COFF::IMAGE_SCN_CNT_CODE)
       return 't';
-    else if (Characteristics & COFF::IMAGE_SCN_MEM_READ &&
-             ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
-      return 'r';
-    else if (Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
-      return 'd';
-    else if (Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
+    if (Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
+      return Characteristics & COFF::IMAGE_SCN_MEM_WRITE ? 'd' : 'r';
+    if (Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
       return 'b';
-    else if (Characteristics & COFF::IMAGE_SCN_LNK_INFO)
+    if (Characteristics & COFF::IMAGE_SCN_LNK_INFO)
       return 'i';
-
     // Check for section symbol.
-    else if (Symb->isSectionDefinition())
+    if (Symb.isSectionDefinition())
       return 's';
   }
 
@@ -612,6 +748,9 @@
   DataRefImpl Symb = I->getRawDataRefImpl();
   uint8_t NType = getNType(Obj, Symb);
 
+  if (NType & MachO::N_STAB)
+    return '-';
+
   switch (NType & MachO::N_TYPE) {
   case MachO::N_ABS:
     return 's';
@@ -654,7 +793,7 @@
 }
 
 template <class ELFT>
-static bool isObject(ELFObjectFile<ELFT> &Obj, symbol_iterator I) {
+static bool isELFObject(ELFObjectFile<ELFT> &Obj, symbol_iterator I) {
   typedef typename ELFObjectFile<ELFT>::Elf_Sym Elf_Sym;
 
   DataRefImpl Symb = I->getRawDataRefImpl();
@@ -663,19 +802,19 @@
   return ESym->getType() == ELF::STT_OBJECT;
 }
 
-static bool isObject(SymbolicFile *Obj, basic_symbol_iterator I) {
-  if (ELF32LEObjectFile *ELF = dyn_cast<ELF32LEObjectFile>(Obj))
-    return isObject(*ELF, I);
-  if (ELF64LEObjectFile *ELF = dyn_cast<ELF64LEObjectFile>(Obj))
-    return isObject(*ELF, I);
-  if (ELF32BEObjectFile *ELF = dyn_cast<ELF32BEObjectFile>(Obj))
-    return isObject(*ELF, I);
-  if (ELF64BEObjectFile *ELF = dyn_cast<ELF64BEObjectFile>(Obj))
-    return isObject(*ELF, I);
+static bool isObject(SymbolicFile &Obj, basic_symbol_iterator I) {
+  if (ELF32LEObjectFile *ELF = dyn_cast<ELF32LEObjectFile>(&Obj))
+    return isELFObject(*ELF, I);
+  if (ELF64LEObjectFile *ELF = dyn_cast<ELF64LEObjectFile>(&Obj))
+    return isELFObject(*ELF, I);
+  if (ELF32BEObjectFile *ELF = dyn_cast<ELF32BEObjectFile>(&Obj))
+    return isELFObject(*ELF, I);
+  if (ELF64BEObjectFile *ELF = dyn_cast<ELF64BEObjectFile>(&Obj))
+    return isELFObject(*ELF, I);
   return false;
 }
 
-static char getNMTypeChar(SymbolicFile *Obj, basic_symbol_iterator I) {
+static char getNMTypeChar(SymbolicFile &Obj, basic_symbol_iterator I) {
   uint32_t Symflags = I->getFlags();
   if ((Symflags & object::SymbolRef::SF_Weak) && !isa<MachOObjectFile>(Obj)) {
     char Ret = isObject(Obj, I) ? 'v' : 'w';
@@ -693,20 +832,20 @@
   char Ret = '?';
   if (Symflags & object::SymbolRef::SF_Absolute)
     Ret = 'a';
-  else if (IRObjectFile *IR = dyn_cast<IRObjectFile>(Obj))
+  else if (IRObjectFile *IR = dyn_cast<IRObjectFile>(&Obj))
     Ret = getSymbolNMTypeChar(*IR, I);
-  else if (COFFObjectFile *COFF = dyn_cast<COFFObjectFile>(Obj))
+  else if (COFFObjectFile *COFF = dyn_cast<COFFObjectFile>(&Obj))
     Ret = getSymbolNMTypeChar(*COFF, I);
-  else if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(Obj))
+  else if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(&Obj))
     Ret = getSymbolNMTypeChar(*MachO, I);
-  else if (ELF32LEObjectFile *ELF = dyn_cast<ELF32LEObjectFile>(Obj))
+  else if (ELF32LEObjectFile *ELF = dyn_cast<ELF32LEObjectFile>(&Obj))
     Ret = getSymbolNMTypeChar(*ELF, I);
-  else if (ELF64LEObjectFile *ELF = dyn_cast<ELF64LEObjectFile>(Obj))
+  else if (ELF64LEObjectFile *ELF = dyn_cast<ELF64LEObjectFile>(&Obj))
     Ret = getSymbolNMTypeChar(*ELF, I);
-  else if (ELF32BEObjectFile *ELF = dyn_cast<ELF32BEObjectFile>(Obj))
+  else if (ELF32BEObjectFile *ELF = dyn_cast<ELF32BEObjectFile>(&Obj))
     Ret = getSymbolNMTypeChar(*ELF, I);
   else
-    Ret = getSymbolNMTypeChar(*cast<ELF64BEObjectFile>(Obj), I);
+    Ret = getSymbolNMTypeChar(cast<ELF64BEObjectFile>(Obj), I);
 
   if (Symflags & object::SymbolRef::SF_Global)
     Ret = toupper(Ret);
@@ -714,32 +853,90 @@
   return Ret;
 }
 
-static void dumpSymbolNamesFromObject(SymbolicFile *Obj, bool printName) {
-  basic_symbol_iterator IBegin = Obj->symbol_begin();
-  basic_symbol_iterator IEnd = Obj->symbol_end();
+// getNsectForSegSect() is used to implement the Mach-O "-s segname sectname"
+// option to dump only those symbols from that section in a Mach-O file.
+// It is called once for each Mach-O file from dumpSymbolNamesFromObject()
+// to get the section number for that named section from the command line
+// arguments. It returns the section number for that section in the Mach-O
+// file or zero if it is not present.
+static unsigned getNsectForSegSect(MachOObjectFile *Obj) {
+  unsigned Nsect = 1; // Sections are counted from 1; 0 means "absent".
+  for (section_iterator I = Obj->section_begin(), E = Obj->section_end();
+       I != E; ++I) {
+    DataRefImpl Ref = I->getRawDataRefImpl();
+    StringRef SectionName;
+    Obj->getSectionName(Ref, SectionName); // NOTE(review): result unchecked.
+    StringRef SegmentName = Obj->getSectionFinalSegmentName(Ref);
+    if (SegmentName == SegSect[0] && SectionName == SegSect[1])
+      return Nsect;
+    Nsect++;
+  }
+  return 0;
+}
+
+// getNsectInMachO() is used to implement the Mach-O "-s segname sectname"
+// option to dump only those symbols from that section in a Mach-O file.
+// It is called once for each symbol in a Mach-O file from
+// dumpSymbolNamesFromObject() and returns the section number for that symbol
+// if it is in a section, else it returns 0.
+static unsigned getNsectInMachO(MachOObjectFile &Obj, basic_symbol_iterator I) {
+  DataRefImpl Symb = I->getRawDataRefImpl();
+  if (Obj.is64Bit()) { // 64-bit files are read through nlist_64 entries.
+    MachO::nlist_64 STE = Obj.getSymbol64TableEntry(Symb);
+    if ((STE.n_type & MachO::N_TYPE) == MachO::N_SECT)
+      return STE.n_sect;
+    return 0; // Symbol type is not N_SECT.
+  }
+  MachO::nlist STE = Obj.getSymbolTableEntry(Symb); // Non-64-bit entry.
+  if ((STE.n_type & MachO::N_TYPE) == MachO::N_SECT)
+    return STE.n_sect;
+  return 0; // Symbol type is not N_SECT.
+}
+
+static void dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
+                                      std::string ArchiveName = std::string(),
+                                      std::string ArchitectureName =
+                                        std::string()) {
+  basic_symbol_iterator IBegin = Obj.symbol_begin();
+  basic_symbol_iterator IEnd = Obj.symbol_end();
   if (DynamicSyms) {
-    if (!Obj->isELF()) {
-      error("File format has no dynamic symbol table", Obj->getFileName());
+    if (!Obj.isELF()) {
+      error("File format has no dynamic symbol table", Obj.getFileName());
       return;
     }
     std::pair<symbol_iterator, symbol_iterator> IDyn =
-        getELFDynamicSymbolIterators(Obj);
+        getELFDynamicSymbolIterators(&Obj);
     IBegin = IDyn.first;
     IEnd = IDyn.second;
   }
   std::string NameBuffer;
   raw_string_ostream OS(NameBuffer);
+  // If a "-s segname sectname" option was specified and this is a Mach-O
+  // file get the section number for that section in this object file.
+  unsigned int Nsect = 0;
+  MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(&Obj);
+  if (SegSect.size() != 0 && MachO) {
+    Nsect = getNsectForSegSect(MachO);
+    // If this section is not in the object file no symbols are printed.
+    if (Nsect == 0)
+      return;
+  }
   for (basic_symbol_iterator I = IBegin; I != IEnd; ++I) {
     uint32_t SymFlags = I->getFlags();
     if (!DebugSyms && (SymFlags & SymbolRef::SF_FormatSpecific))
       continue;
     if (WithoutAliases) {
-      if (IRObjectFile *IR = dyn_cast<IRObjectFile>(Obj)) {
+      if (IRObjectFile *IR = dyn_cast<IRObjectFile>(&Obj)) {
         const GlobalValue *GV = IR->getSymbolGV(I->getRawDataRefImpl());
         if (GV && isa<GlobalAlias>(GV))
           continue;
       }
     }
+    // If a "-s segname sectname" option was specified, this is a Mach-O file,
+    // and that section appears in this file, Nsect will be non-zero; in that
+    // case skip any symbol that is not from that section.
+    if (Nsect && Nsect != getNsectInMachO(*MachO, I))
+      continue;
     NMSymbol S;
     S.Size = UnknownAddressOrSize;
     S.Address = UnknownAddressOrSize;
@@ -766,8 +963,8 @@
     P += strlen(P) + 1;
   }
 
-  CurrentFilename = Obj->getFileName();
-  sortAndPrintSymbolList(Obj, printName);
+  CurrentFilename = Obj.getFileName();
+  sortAndPrintSymbolList(Obj, printName, ArchiveName, ArchitectureName);
 }
 
 // checkMachOAndArchFlags() checks to see if the SymbolicFile is a Mach-O file
@@ -809,16 +1006,15 @@
       MemoryBuffer::getFileOrSTDIN(Filename);
   if (error(BufferOrErr.getError(), Filename))
     return;
-  std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrErr.get());
 
   LLVMContext &Context = getGlobalContext();
-  ErrorOr<Binary *> BinaryOrErr = createBinary(Buffer, &Context);
+  ErrorOr<std::unique_ptr<Binary>> BinaryOrErr = createBinary(
+      BufferOrErr.get()->getMemBufferRef(), NoLLVMBitcode ? nullptr : &Context);
   if (error(BinaryOrErr.getError(), Filename))
     return;
-  Buffer.release();
-  std::unique_ptr<Binary> Bin(BinaryOrErr.get());
+  Binary &Bin = *BinaryOrErr.get();
 
-  if (Archive *A = dyn_cast<Archive>(Bin.get())) {
+  if (Archive *A = dyn_cast<Archive>(&Bin)) {
     if (ArchiveMap) {
       Archive::symbol_iterator I = A->symbol_begin();
       Archive::symbol_iterator E = A->symbol_end();
@@ -846,18 +1042,20 @@
       if (SymbolicFile *O = dyn_cast<SymbolicFile>(&*ChildOrErr.get())) {
         if (!checkMachOAndArchFlags(O, Filename))
           return;
-        outs() << "\n";
-        if (isa<MachOObjectFile>(O)) {
-          outs() << Filename << "(" << O->getFileName() << ")";
-        } else
-          outs() << O->getFileName();
-        outs() << ":\n";
-        dumpSymbolNamesFromObject(O, false);
+        if (!PrintFileName) {
+          outs() << "\n";
+          if (isa<MachOObjectFile>(O)) {
+            outs() << Filename << "(" << O->getFileName() << ")";
+          } else
+            outs() << O->getFileName();
+          outs() << ":\n";
+        }
+        dumpSymbolNamesFromObject(*O, false, Filename);
       }
     }
     return;
   }
-  if (MachOUniversalBinary *UB = dyn_cast<MachOUniversalBinary>(Bin.get())) {
+  if (MachOUniversalBinary *UB = dyn_cast<MachOUniversalBinary>(&Bin)) {
     // If we have a list of architecture flags specified dump only those.
     if (!ArchAll && ArchFlags.size() != 0) {
       // Look for a slice in the universal binary that matches each ArchFlag.
@@ -872,14 +1070,22 @@
             ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr =
                 I->getAsObjectFile();
             std::unique_ptr<Archive> A;
+            std::string ArchiveName;
+            std::string ArchitectureName;
+            ArchiveName.clear();
+            ArchitectureName.clear();
             if (ObjOrErr) {
-              std::unique_ptr<ObjectFile> Obj = std::move(ObjOrErr.get());
+              ObjectFile &Obj = *ObjOrErr.get();
               if (ArchFlags.size() > 1) {
-                outs() << "\n" << Obj->getFileName() << " (for architecture "
-                       << I->getArchTypeName() << ")"
-                       << ":\n";
+                if (PrintFileName)
+                  ArchitectureName = I->getArchTypeName();
+                else
+                  outs() << "\n" << Obj.getFileName() << " (for architecture "
+                         << I->getArchTypeName() << ")"
+                         << ":\n";
               }
-              dumpSymbolNamesFromObject(Obj.get(), false);
+              dumpSymbolNamesFromObject(Obj, false, ArchiveName,
+                                        ArchitectureName);
             } else if (!I->getAsArchive(A)) {
               for (Archive::child_iterator AI = A->child_begin(),
                                            AE = A->child_end();
@@ -890,14 +1096,21 @@
                   continue;
                 if (SymbolicFile *O =
                         dyn_cast<SymbolicFile>(&*ChildOrErr.get())) {
-                  outs() << "\n" << A->getFileName();
-                  outs() << "(" << O->getFileName() << ")";
-                  if (ArchFlags.size() > 1) {
-                    outs() << " (for architecture " << I->getArchTypeName()
-                           << ")";
+                  if (PrintFileName) {
+                    ArchiveName = A->getFileName();
+                    if (ArchFlags.size() > 1)
+                      ArchitectureName = I->getArchTypeName();
+                  } else {
+                    outs() << "\n" << A->getFileName();
+                    outs() << "(" << O->getFileName() << ")";
+                    if (ArchFlags.size() > 1) {
+                      outs() << " (for architecture " << I->getArchTypeName()
+                             << ")";
+                    }
+                    outs() << ":\n";
                   }
-                  outs() << ":\n";
-                  dumpSymbolNamesFromObject(O, false);
+                  dumpSymbolNamesFromObject(*O, false, ArchiveName,
+                                            ArchitectureName);
                 }
               }
             }
@@ -921,9 +1134,11 @@
         if (HostArchName == I->getArchTypeName()) {
           ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
           std::unique_ptr<Archive> A;
+          std::string ArchiveName;
+          ArchiveName.clear();
           if (ObjOrErr) {
-            std::unique_ptr<ObjectFile> Obj = std::move(ObjOrErr.get());
-            dumpSymbolNamesFromObject(Obj.get(), false);
+            ObjectFile &Obj = *ObjOrErr.get();
+            dumpSymbolNamesFromObject(Obj, false);
           } else if (!I->getAsArchive(A)) {
             for (Archive::child_iterator AI = A->child_begin(),
                                          AE = A->child_end();
@@ -934,10 +1149,13 @@
                 continue;
               if (SymbolicFile *O =
                       dyn_cast<SymbolicFile>(&*ChildOrErr.get())) {
-                outs() << "\n" << A->getFileName() << "(" << O->getFileName()
-                       << ")"
-                       << ":\n";
-                dumpSymbolNamesFromObject(O, false);
+                if (PrintFileName)
+                  ArchiveName = A->getFileName();
+                else
+                  outs() << "\n" << A->getFileName() << "(" << O->getFileName()
+                         << ")"
+                         << ":\n";
+                dumpSymbolNamesFromObject(*O, false, ArchiveName);
               }
             }
           }
@@ -953,15 +1171,24 @@
          I != E; ++I) {
       ErrorOr<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
       std::unique_ptr<Archive> A;
+      std::string ArchiveName;
+      std::string ArchitectureName;
+      ArchiveName.clear();
+      ArchitectureName.clear();
       if (ObjOrErr) {
-        std::unique_ptr<ObjectFile> Obj = std::move(ObjOrErr.get());
-        if (moreThanOneArch)
-          outs() << "\n";
-        outs() << Obj->getFileName();
-        if (isa<MachOObjectFile>(Obj.get()) && moreThanOneArch)
-          outs() << " (for architecture " << I->getArchTypeName() << ")";
-        outs() << ":\n";
-        dumpSymbolNamesFromObject(Obj.get(), false);
+        ObjectFile &Obj = *ObjOrErr.get();
+        if (PrintFileName) {
+          if (isa<MachOObjectFile>(Obj) && moreThanOneArch)
+            ArchitectureName = I->getArchTypeName();
+        } else {
+          if (moreThanOneArch)
+            outs() << "\n";
+          outs() << Obj.getFileName();
+          if (isa<MachOObjectFile>(Obj) && moreThanOneArch)
+            outs() << " (for architecture " << I->getArchTypeName() << ")";
+          outs() << ":\n";
+        }
+        dumpSymbolNamesFromObject(Obj, false, ArchiveName, ArchitectureName);
       } else if (!I->getAsArchive(A)) {
         for (Archive::child_iterator AI = A->child_begin(), AE = A->child_end();
              AI != AE; ++AI) {
@@ -970,25 +1197,32 @@
           if (ChildOrErr.getError())
             continue;
           if (SymbolicFile *O = dyn_cast<SymbolicFile>(&*ChildOrErr.get())) {
-            outs() << "\n" << A->getFileName();
-            if (isa<MachOObjectFile>(O)) {
-              outs() << "(" << O->getFileName() << ")";
-              if (moreThanOneArch)
-                outs() << " (for architecture " << I->getArchTypeName() << ")";
-            } else
-              outs() << ":" << O->getFileName();
-            outs() << ":\n";
-            dumpSymbolNamesFromObject(O, false);
+            if (PrintFileName) {
+              ArchiveName = A->getFileName();
+              if (isa<MachOObjectFile>(O) && moreThanOneArch)
+                ArchitectureName = I->getArchTypeName();
+            } else {
+              outs() << "\n" << A->getFileName();
+              if (isa<MachOObjectFile>(O)) {
+                outs() << "(" << O->getFileName() << ")";
+                if (moreThanOneArch)
+                  outs() << " (for architecture " << I->getArchTypeName()
+                         << ")";
+              } else
+                outs() << ":" << O->getFileName();
+              outs() << ":\n";
+            }
+            dumpSymbolNamesFromObject(*O, false, ArchiveName, ArchitectureName);
           }
         }
       }
     }
     return;
   }
-  if (SymbolicFile *O = dyn_cast<SymbolicFile>(Bin.get())) {
+  if (SymbolicFile *O = dyn_cast<SymbolicFile>(&Bin)) {
     if (!checkMachOAndArchFlags(O, Filename))
       return;
-    dumpSymbolNamesFromObject(O, true);
+    dumpSymbolNamesFromObject(*O, true);
     return;
   }
   error("unrecognizable file type", Filename);
@@ -1040,13 +1274,16 @@
     if (ArchFlags[i] == "all") {
       ArchAll = true;
     } else {
-      Triple T = MachOObjectFile::getArch(ArchFlags[i]);
-      if (T.getArch() == Triple::UnknownArch)
+      if (!MachOObjectFile::isValidArch(ArchFlags[i]))
         error("Unknown architecture named '" + ArchFlags[i] + "'",
               "for the -arch option");
     }
   }
 
+  if (SegSect.size() != 0 && SegSect.size() != 2)
+    error("bad number of arguments (must be two arguments)",
+          "for the -s option");
+
   std::for_each(InputFilenames.begin(), InputFilenames.end(),
                 dumpSymbolNamesFromFile);
 

diff --git a/tools/llvm-objdump/Android.mk b/tools/llvm-objdump/Android.mk
index 8105ebf..077e0ee 100644
--- a/tools/llvm-objdump/Android.mk
+++ b/tools/llvm-objdump/Android.mk

@@ -34,14 +34,16 @@
   libLLVMX86Info \
   libLLVMX86Desc \
   libLLVMX86AsmParser \
+  libLLVMX86CodeGen \
   libLLVMX86AsmPrinter \
   libLLVMX86Utils \
   libLLVMX86Disassembler \
   libLLVMAsmPrinter \
+  libLLVMCodeGen \
+  libLLVMAnalysis \
   libLLVMTarget \
   libLLVMObject \
   libLLVMMCParser \
-  libLLVMMCAnalysis \
   libLLVMMC \
   libLLVMMCDisassembler \
   libLLVMBitReader \

diff --git a/tools/llvm-objdump/CMakeLists.txt b/tools/llvm-objdump/CMakeLists.txt
index d63602b..61bf3b3 100644
--- a/tools/llvm-objdump/CMakeLists.txt
+++ b/tools/llvm-objdump/CMakeLists.txt

@@ -2,7 +2,7 @@
   ${LLVM_TARGETS_TO_BUILD}
   DebugInfo
   MC
-  MCAnalysis
+  MCDisassembler
   Object
   Support
   )

diff --git a/tools/llvm-objdump/COFFDump.cpp b/tools/llvm-objdump/COFFDump.cpp
index 39d8e8e..4a20b91 100644
--- a/tools/llvm-objdump/COFFDump.cpp
+++ b/tools/llvm-objdump/COFFDump.cpp

@@ -260,11 +260,8 @@
   if (!PE32Header)
     return;
 
-  const coff_file_header *Header;
-  if (error(Obj->getCOFFHeader(Header)))
-    return;
   // Currently only x86 is supported
-  if (Header->Machine != COFF::IMAGE_FILE_MACHINE_I386)
+  if (Obj->getMachine() != COFF::IMAGE_FILE_MACHINE_I386)
     return;
 
   const data_directory *DataDir;
@@ -325,7 +322,7 @@
     const import_lookup_table_entry32 *entry;
     if (I->getImportLookupEntry(entry))
       return;
-    for (; entry->data; ++entry) {
+    for (; entry->Data; ++entry) {
       if (entry->isOrdinal()) {
         outs() << format("      % 6d\n", entry->getOrdinal());
         continue;
@@ -518,11 +515,7 @@
 }
 
 void llvm::printCOFFUnwindInfo(const COFFObjectFile *Obj) {
-  const coff_file_header *Header;
-  if (error(Obj->getCOFFHeader(Header)))
-    return;
-
-  if (Header->Machine != COFF::IMAGE_FILE_MACHINE_AMD64) {
+  if (Obj->getMachine() != COFF::IMAGE_FILE_MACHINE_AMD64) {
     errs() << "Unsupported image machine type "
               "(currently only AMD64 is supported).\n";
     return;

diff --git a/tools/llvm-objdump/LLVMBuild.txt b/tools/llvm-objdump/LLVMBuild.txt
index d9c09b6..d16c501 100644
--- a/tools/llvm-objdump/LLVMBuild.txt
+++ b/tools/llvm-objdump/LLVMBuild.txt

@@ -19,4 +19,4 @@
 type = Tool
 name = llvm-objdump
 parent = Tools
-required_libraries = DebugInfo MC MCAnalysis MCDisassembler MCParser Object all-targets
+required_libraries = DebugInfo MC MCDisassembler MCParser Object all-targets

diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp
index 4b46ac4..3a28703 100644
--- a/tools/llvm-objdump/MachODump.cpp
+++ b/tools/llvm-objdump/MachODump.cpp

@@ -12,16 +12,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm-objdump.h"
+#include "llvm-c/Disassembler.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Config/config.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -30,41 +31,70 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/MachO.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cstring>
 #include <system_error>
+
+#if HAVE_CXXABI_H
+#include <cxxabi.h>
+#endif
+
 using namespace llvm;
 using namespace object;
 
 static cl::opt<bool>
-  UseDbg("g", cl::desc("Print line information from debug info if available"));
+    UseDbg("g",
+           cl::desc("Print line information from debug info if available"));
 
-static cl::opt<std::string>
-  DSYMFile("dsym", cl::desc("Use .dSYM file for debug info"));
+static cl::opt<std::string> DSYMFile("dsym",
+                                     cl::desc("Use .dSYM file for debug info"));
 
-static const Target *GetTarget(const MachOObjectFile *MachOObj) {
+static cl::opt<bool> FullLeadingAddr("full-leading-addr",
+                                     cl::desc("Print full leading address"));
+
+static cl::opt<bool>
+    PrintImmHex("print-imm-hex",
+                cl::desc("Use hex format for immediate values"));
+
+static std::string ThumbTripleName;
+
+static const Target *GetTarget(const MachOObjectFile *MachOObj,
+                               const char **McpuDefault,
+                               const Target **ThumbTarget) {
   // Figure out the target triple.
   if (TripleName.empty()) {
     llvm::Triple TT("unknown-unknown-unknown");
-    TT.setArch(Triple::ArchType(MachOObj->getArch()));
+    llvm::Triple ThumbTriple = Triple();
+    TT = MachOObj->getArch(McpuDefault, &ThumbTriple);
     TripleName = TT.str();
+    ThumbTripleName = ThumbTriple.str();
   }
 
   // Get the target specific parser.
   std::string Error;
   const Target *TheTarget = TargetRegistry::lookupTarget(TripleName, Error);
-  if (TheTarget)
+  if (TheTarget && ThumbTripleName.empty())
     return TheTarget;
 
-  errs() << "llvm-objdump: error: unable to get target for '" << TripleName
-         << "', see --version and --triple.\n";
+  *ThumbTarget = TargetRegistry::lookupTarget(ThumbTripleName, Error);
+  if (*ThumbTarget)
+    return TheTarget;
+
+  errs() << "llvm-objdump: error: unable to get target for '";
+  if (!TheTarget)
+    errs() << TripleName;
+  else
+    errs() << ThumbTripleName;
+  errs() << "', see --version and --triple.\n";
   return nullptr;
 }
 
@@ -93,58 +123,80 @@
 typedef std::vector<DiceTableEntry> DiceTable;
 typedef DiceTable::iterator dice_table_iterator;
 
-static bool
-compareDiceTableEntries(const DiceTableEntry i,
-                        const DiceTableEntry j) {
-  return i.first == j.first;
+// This is used to search for a data in code table entry for the PC being
+// disassembled.  The j parameter has the PC in j.first.  A single data in code
+// table entry can cover many bytes for each of its Kind's.  So if the offset,
+// aka the i.first value, of the data in code table entry plus its Length
+// covers the PC being searched for this will return true.  If not it will
+// return false.
+static bool compareDiceTableEntries(const DiceTableEntry &i,
+                                    const DiceTableEntry &j) {
+  uint16_t Length;
+  i.second.getLength(Length);
+
+  return j.first >= i.first && j.first < i.first + Length;
 }
 
-static void DumpDataInCode(const char *bytes, uint64_t Size,
-                           unsigned short Kind) {
-  uint64_t Value;
+static uint64_t DumpDataInCode(const char *bytes, uint64_t Length,
+                               unsigned short Kind) {
+  uint32_t Value, Size = 1;
 
   switch (Kind) {
+  default:
   case MachO::DICE_KIND_DATA:
-    switch (Size) {
-    case 4:
-      Value = bytes[3] << 24 |
-              bytes[2] << 16 |
-              bytes[1] << 8 |
-              bytes[0];
+    if (Length >= 4) {
+      if (!NoShowRawInsn)
+        DumpBytes(StringRef(bytes, 4));
+      Value = bytes[3] << 24 | bytes[2] << 16 | bytes[1] << 8 | bytes[0];
       outs() << "\t.long " << Value;
-      break;
-    case 2:
-      Value = bytes[1] << 8 |
-              bytes[0];
+      Size = 4;
+    } else if (Length >= 2) {
+      if (!NoShowRawInsn)
+        DumpBytes(StringRef(bytes, 2));
+      Value = bytes[1] << 8 | bytes[0];
       outs() << "\t.short " << Value;
-      break;
-    case 1:
+      Size = 2;
+    } else {
+      if (!NoShowRawInsn)
+        DumpBytes(StringRef(bytes, 2));
       Value = bytes[0];
       outs() << "\t.byte " << Value;
-      break;
+      Size = 1;
     }
-    outs() << "\t@ KIND_DATA\n";
+    if (Kind == MachO::DICE_KIND_DATA)
+      outs() << "\t@ KIND_DATA\n";
+    else
+      outs() << "\t@ data in code kind = " << Kind << "\n";
     break;
   case MachO::DICE_KIND_JUMP_TABLE8:
+    if (!NoShowRawInsn)
+      DumpBytes(StringRef(bytes, 1));
     Value = bytes[0];
-    outs() << "\t.byte " << Value << "\t@ KIND_JUMP_TABLE8";
+    outs() << "\t.byte " << format("%3u", Value) << "\t@ KIND_JUMP_TABLE8\n";
+    Size = 1;
     break;
   case MachO::DICE_KIND_JUMP_TABLE16:
-    Value = bytes[1] << 8 |
-            bytes[0];
-    outs() << "\t.short " << Value << "\t@ KIND_JUMP_TABLE16";
+    if (!NoShowRawInsn)
+      DumpBytes(StringRef(bytes, 2));
+    Value = bytes[1] << 8 | bytes[0];
+    outs() << "\t.short " << format("%5u", Value & 0xffff)
+           << "\t@ KIND_JUMP_TABLE16\n";
+    Size = 2;
     break;
   case MachO::DICE_KIND_JUMP_TABLE32:
-    Value = bytes[3] << 24 |
-            bytes[2] << 16 |
-            bytes[1] << 8 |
-            bytes[0];
-    outs() << "\t.long " << Value << "\t@ KIND_JUMP_TABLE32";
-    break;
-  default:
-    outs() << "\t@ data in code kind = " << Kind << "\n";
+  case MachO::DICE_KIND_ABS_JUMP_TABLE32:
+    if (!NoShowRawInsn)
+      DumpBytes(StringRef(bytes, 4));
+    Value = bytes[3] << 24 | bytes[2] << 16 | bytes[1] << 8 | bytes[0];
+    outs() << "\t.long " << Value;
+    if (Kind == MachO::DICE_KIND_JUMP_TABLE32)
+      outs() << "\t@ KIND_JUMP_TABLE32\n";
+    else
+      outs() << "\t@ KIND_ABS_JUMP_TABLE32\n";
+    Size = 4;
     break;
   }
+  return Size;
 }
 
 static void getSectionsAndSymbols(const MachO::mach_header Header,
@@ -165,20 +217,18 @@
   MachOObjectFile::LoadCommandInfo Command =
       MachOObj->getFirstLoadCommandInfo();
   bool BaseSegmentAddressSet = false;
-  for (unsigned i = 0; ; ++i) {
+  for (unsigned i = 0;; ++i) {
     if (Command.C.cmd == MachO::LC_FUNCTION_STARTS) {
       // We found a function starts segment, parse the addresses for later
       // consumption.
       MachO::linkedit_data_command LLC =
-        MachOObj->getLinkeditDataLoadCommand(Command);
+          MachOObj->getLinkeditDataLoadCommand(Command);
 
       MachOObj->ReadULEB128s(LLC.dataoff, FoundFns);
-    }
-    else if (Command.C.cmd == MachO::LC_SEGMENT) {
-      MachO::segment_command SLC =
-        MachOObj->getSegmentLoadCommand(Command);
+    } else if (Command.C.cmd == MachO::LC_SEGMENT) {
+      MachO::segment_command SLC = MachOObj->getSegmentLoadCommand(Command);
       StringRef SegName = SLC.segname;
-      if(!BaseSegmentAddressSet && SegName != "__PAGEZERO") {
+      if (!BaseSegmentAddressSet && SegName != "__PAGEZERO") {
         BaseSegmentAddressSet = true;
         BaseSegmentAddress = SLC.vmaddr;
       }
@@ -195,29 +245,1371 @@
                                    MachOObjectFile *MachOOF);
 
 void llvm::DisassembleInputMachO(StringRef Filename) {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buff =
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr =
       MemoryBuffer::getFileOrSTDIN(Filename);
-  if (std::error_code EC = Buff.getError()) {
+  if (std::error_code EC = BuffOrErr.getError()) {
     errs() << "llvm-objdump: " << Filename << ": " << EC.message() << "\n";
     return;
   }
+  std::unique_ptr<MemoryBuffer> Buff = std::move(BuffOrErr.get());
 
-  std::unique_ptr<MachOObjectFile> MachOOF(static_cast<MachOObjectFile *>(
-      ObjectFile::createMachOObjectFile(Buff.get()).get()));
+  std::unique_ptr<MachOObjectFile> MachOOF = std::move(
+      ObjectFile::createMachOObjectFile(Buff.get()->getMemBufferRef()).get());
 
   DisassembleInputMachO2(Filename, MachOOF.get());
 }
 
+typedef DenseMap<uint64_t, StringRef> SymbolAddressMap;
+typedef std::pair<uint64_t, const char *> BindInfoEntry;
+typedef std::vector<BindInfoEntry> BindTable;
+typedef BindTable::iterator bind_table_iterator;
+
+// The block of info used by the Symbolizer call backs.
+struct DisassembleInfo {
+  bool verbose;
+  MachOObjectFile *O;
+  SectionRef S;
+  SymbolAddressMap *AddrMap;
+  std::vector<SectionRef> *Sections;
+  const char *class_name;
+  const char *selector_name;
+  char *method;
+  char *demangled_name;
+  uint64_t adrp_addr;
+  uint32_t adrp_inst;
+  BindTable *bindtable;
+};
+
+// GuessSymbolName is passed the address of what might be a symbol and a
+// pointer to the DisassembleInfo struct.  It returns the name of a symbol
+// with that address or nullptr if no symbol is found with that address.
+static const char *GuessSymbolName(uint64_t value,
+                                   struct DisassembleInfo *info) {
+  const char *SymbolName = nullptr;
+  // A DenseMap can't look up its reserved (empty/tombstone) key values.
+  if (value != 0xffffffffffffffffULL && value != 0xfffffffffffffffeULL) {
+    StringRef name = info->AddrMap->lookup(value);
+    if (!name.empty())
+      SymbolName = name.data();
+  }
+  return SymbolName;
+}
+
+// SymbolizerGetOpInfo() is the operand information call back function.
+// This is called to get the symbolic information for operand(s) of an
+// instruction when it is being disassembled.  This routine does this from
+// the relocation information, symbol table, etc. That block of information
+// is a pointer to the struct DisassembleInfo that was passed when the
+// disassembler context was created and passed back to here when
+// called back by the disassembler for instruction operands that could have
+// relocation information. The address of the instruction containing operand is
+// at the Pc parameter.  The immediate value the operand has is passed in
+// op_info->Value and is at Offset past the start of the instruction and has a
+// byte Size of 1, 2 or 4. The symbolic information is returned in TagBuf, the
+// LLVMOpInfo1 struct defined in the header "llvm-c/Disassembler.h" as symbol
+// names and addends of the symbolic expression to add for the operand.  The
+// value of TagType is currently 1 (for the LLVMOpInfo1 struct). If symbolic
+// information is returned then this function returns 1 else it returns 0.
+int SymbolizerGetOpInfo(void *DisInfo, uint64_t Pc, uint64_t Offset,
+                        uint64_t Size, int TagType, void *TagBuf) {
+  struct DisassembleInfo *info = (struct DisassembleInfo *)DisInfo;
+  struct LLVMOpInfo1 *op_info = (struct LLVMOpInfo1 *)TagBuf;
+  uint64_t value = op_info->Value;
+
+  // Make sure all fields returned are zero if we don't set them.
+  memset((void *)op_info, '\0', sizeof(struct LLVMOpInfo1));
+  op_info->Value = value;
+
+  // If the TagType is not the value 1 which this code knows about or if no
+  // verbose symbolic information is wanted then just return 0, indicating no
+  // information is being returned.
+  if (TagType != 1 || info->verbose == false)
+    return 0;
+
+  unsigned int Arch = info->O->getArch();
+  if (Arch == Triple::x86) {
+    if (Size != 1 && Size != 2 && Size != 4 && Size != 0)
+      return 0;
+    // First search the section's relocation entries (if any) for an entry
+    // for this section offset.
+    uint32_t sect_addr = info->S.getAddress();
+    uint32_t sect_offset = (Pc + Offset) - sect_addr;
+    bool reloc_found = false;
+    DataRefImpl Rel;
+    MachO::any_relocation_info RE;
+    bool isExtern = false;
+    SymbolRef Symbol;
+    bool r_scattered = false;
+    uint32_t r_value, pair_r_value, r_type;
+    for (const RelocationRef &Reloc : info->S.relocations()) {
+      uint64_t RelocOffset;
+      Reloc.getOffset(RelocOffset);
+      if (RelocOffset == sect_offset) {
+        Rel = Reloc.getRawDataRefImpl();
+        RE = info->O->getRelocation(Rel);
+        r_type = info->O->getAnyRelocationType(RE);
+        r_scattered = info->O->isRelocationScattered(RE);
+        if (r_scattered) {
+          r_value = info->O->getScatteredRelocationValue(RE);
+          if (r_type == MachO::GENERIC_RELOC_SECTDIFF ||
+              r_type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) {
+            DataRefImpl RelNext = Rel;
+            info->O->moveRelocationNext(RelNext);
+            MachO::any_relocation_info RENext;
+            RENext = info->O->getRelocation(RelNext);
+            if (info->O->isRelocationScattered(RENext))
+              pair_r_value = info->O->getScatteredRelocationValue(RENext);
+            else
+              return 0;
+          }
+        } else {
+          isExtern = info->O->getPlainRelocationExternal(RE);
+          if (isExtern) {
+            symbol_iterator RelocSym = Reloc.getSymbol();
+            Symbol = *RelocSym;
+          }
+        }
+        reloc_found = true;
+        break;
+      }
+    }
+    if (reloc_found && isExtern) {
+      StringRef SymName;
+      Symbol.getName(SymName);
+      const char *name = SymName.data();
+      op_info->AddSymbol.Present = 1;
+      op_info->AddSymbol.Name = name;
+      // For i386 extern relocation entries the value in the instruction is
+      // the offset from the symbol, and value is already set in op_info->Value.
+      return 1;
+    }
+    if (reloc_found && (r_type == MachO::GENERIC_RELOC_SECTDIFF ||
+                        r_type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF)) {
+      const char *add = GuessSymbolName(r_value, info);
+      const char *sub = GuessSymbolName(pair_r_value, info);
+      uint32_t offset = value - (r_value - pair_r_value);
+      op_info->AddSymbol.Present = 1;
+      if (add != nullptr)
+        op_info->AddSymbol.Name = add;
+      else
+        op_info->AddSymbol.Value = r_value;
+      op_info->SubtractSymbol.Present = 1;
+      if (sub != nullptr)
+        op_info->SubtractSymbol.Name = sub;
+      else
+        op_info->SubtractSymbol.Value = pair_r_value;
+      op_info->Value = offset;
+      return 1;
+    }
+    // TODO:
+    // Second search the external relocation entries of a fully linked image
+    // (if any) for an entry that matches this segment offset.
+    // uint32_t seg_offset = (Pc + Offset);
+    return 0;
+  } else if (Arch == Triple::x86_64) {
+    if (Size != 1 && Size != 2 && Size != 4 && Size != 0)
+      return 0;
+    // First search the section's relocation entries (if any) for an entry
+    // for this section offset.
+    uint64_t sect_addr = info->S.getAddress();
+    uint64_t sect_offset = (Pc + Offset) - sect_addr;
+    bool reloc_found = false;
+    DataRefImpl Rel;
+    MachO::any_relocation_info RE;
+    bool isExtern = false;
+    SymbolRef Symbol;
+    for (const RelocationRef &Reloc : info->S.relocations()) {
+      uint64_t RelocOffset;
+      Reloc.getOffset(RelocOffset);
+      if (RelocOffset == sect_offset) {
+        Rel = Reloc.getRawDataRefImpl();
+        RE = info->O->getRelocation(Rel);
+        // NOTE: Scattered relocations don't exist on x86_64.
+        isExtern = info->O->getPlainRelocationExternal(RE);
+        if (isExtern) {
+          symbol_iterator RelocSym = Reloc.getSymbol();
+          Symbol = *RelocSym;
+        }
+        reloc_found = true;
+        break;
+      }
+    }
+    if (reloc_found && isExtern) {
+      // The Value passed in will be adjusted by the Pc if the instruction
+      // adds the Pc.  But for x86_64 external relocation entries the Value
+      // is the offset from the external symbol.
+      if (info->O->getAnyRelocationPCRel(RE))
+        op_info->Value -= Pc + Offset + Size;
+      StringRef SymName;
+      Symbol.getName(SymName);
+      const char *name = SymName.data();
+      unsigned Type = info->O->getAnyRelocationType(RE);
+      if (Type == MachO::X86_64_RELOC_SUBTRACTOR) {
+        DataRefImpl RelNext = Rel;
+        info->O->moveRelocationNext(RelNext);
+        MachO::any_relocation_info RENext = info->O->getRelocation(RelNext);
+        unsigned TypeNext = info->O->getAnyRelocationType(RENext);
+        bool isExternNext = info->O->getPlainRelocationExternal(RENext);
+        unsigned SymbolNum = info->O->getPlainRelocationSymbolNum(RENext);
+        if (TypeNext == MachO::X86_64_RELOC_UNSIGNED && isExternNext) {
+          op_info->SubtractSymbol.Present = 1;
+          op_info->SubtractSymbol.Name = name;
+          symbol_iterator RelocSymNext = info->O->getSymbolByIndex(SymbolNum);
+          Symbol = *RelocSymNext;
+          StringRef SymNameNext;
+          Symbol.getName(SymNameNext);
+          name = SymNameNext.data();
+        }
+      }
+      // TODO: add the VariantKinds to op_info->VariantKind for relocation types
+      // like: X86_64_RELOC_TLV, X86_64_RELOC_GOT_LOAD and X86_64_RELOC_GOT.
+      op_info->AddSymbol.Present = 1;
+      op_info->AddSymbol.Name = name;
+      return 1;
+    }
+    // TODO:
+    // Second search the external relocation entries of a fully linked image
+    // (if any) for an entry that matches this segment offset.
+    // uint64_t seg_offset = (Pc + Offset);
+    return 0;
+  } else if (Arch == Triple::arm) {
+    if (Offset != 0 || (Size != 4 && Size != 2))
+      return 0;
+    // First search the section's relocation entries (if any) for an entry
+    // for this section offset.
+    uint32_t sect_addr = info->S.getAddress();
+    uint32_t sect_offset = (Pc + Offset) - sect_addr;
+    bool reloc_found = false;
+    DataRefImpl Rel;
+    MachO::any_relocation_info RE;
+    bool isExtern = false;
+    SymbolRef Symbol;
+    bool r_scattered = false;
+    uint32_t r_value, pair_r_value, r_type, r_length, other_half;
+    for (const RelocationRef &Reloc : info->S.relocations()) {
+      uint64_t RelocOffset;
+      Reloc.getOffset(RelocOffset);
+      if (RelocOffset == sect_offset) {
+        Rel = Reloc.getRawDataRefImpl();
+        RE = info->O->getRelocation(Rel);
+        r_length = info->O->getAnyRelocationLength(RE);
+        r_scattered = info->O->isRelocationScattered(RE);
+        if (r_scattered) {
+          r_value = info->O->getScatteredRelocationValue(RE);
+          r_type = info->O->getScatteredRelocationType(RE);
+        } else {
+          r_type = info->O->getAnyRelocationType(RE);
+          isExtern = info->O->getPlainRelocationExternal(RE);
+          if (isExtern) {
+            symbol_iterator RelocSym = Reloc.getSymbol();
+            Symbol = *RelocSym;
+          }
+        }
+        if (r_type == MachO::ARM_RELOC_HALF ||
+            r_type == MachO::ARM_RELOC_SECTDIFF ||
+            r_type == MachO::ARM_RELOC_LOCAL_SECTDIFF ||
+            r_type == MachO::ARM_RELOC_HALF_SECTDIFF) {
+          DataRefImpl RelNext = Rel;
+          info->O->moveRelocationNext(RelNext);
+          MachO::any_relocation_info RENext;
+          RENext = info->O->getRelocation(RelNext);
+          other_half = info->O->getAnyRelocationAddress(RENext) & 0xffff;
+          if (info->O->isRelocationScattered(RENext))
+            pair_r_value = info->O->getScatteredRelocationValue(RENext);
+        }
+        reloc_found = true;
+        break;
+      }
+    }
+    if (reloc_found && isExtern) {
+      StringRef SymName;
+      Symbol.getName(SymName);
+      const char *name = SymName.data();
+      op_info->AddSymbol.Present = 1;
+      op_info->AddSymbol.Name = name;
+      if (value != 0) {
+        switch (r_type) {
+        case MachO::ARM_RELOC_HALF:
+          if ((r_length & 0x1) == 1) {
+            op_info->Value = value << 16 | other_half;
+            op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_HI16;
+          } else {
+            op_info->Value = other_half << 16 | value;
+            op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_LO16;
+          }
+          break;
+        default:
+          break;
+        }
+      } else {
+        switch (r_type) {
+        case MachO::ARM_RELOC_HALF:
+          if ((r_length & 0x1) == 1) {
+            op_info->Value = value << 16 | other_half;
+            op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_HI16;
+          } else {
+            op_info->Value = other_half << 16 | value;
+            op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_LO16;
+          }
+          break;
+        default:
+          break;
+        }
+      }
+      return 1;
+    }
+    // If we have a branch that is not an external relocation entry then
+    // return 0 so the code in tryAddingSymbolicOperand() can use the
+    // SymbolLookUp call back with the branch target address to look up the
+    // symbol and possibly add an annotation for a symbol stub.
+    if (reloc_found && isExtern == 0 && (r_type == MachO::ARM_RELOC_BR24 ||
+                                         r_type == MachO::ARM_THUMB_RELOC_BR22))
+      return 0;
+
+    uint32_t offset = 0;
+    if (reloc_found) {
+      if (r_type == MachO::ARM_RELOC_HALF ||
+          r_type == MachO::ARM_RELOC_HALF_SECTDIFF) {
+        if ((r_length & 0x1) == 1)
+          value = value << 16 | other_half;
+        else
+          value = other_half << 16 | value;
+      }
+      if (r_scattered && (r_type != MachO::ARM_RELOC_HALF &&
+                          r_type != MachO::ARM_RELOC_HALF_SECTDIFF)) {
+        offset = value - r_value;
+        value = r_value;
+      }
+    }
+
+    if (reloc_found && r_type == MachO::ARM_RELOC_HALF_SECTDIFF) {
+      if ((r_length & 0x1) == 1)
+        op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_HI16;
+      else
+        op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_LO16;
+      const char *add = GuessSymbolName(r_value, info);
+      const char *sub = GuessSymbolName(pair_r_value, info);
+      int32_t offset = value - (r_value - pair_r_value);
+      op_info->AddSymbol.Present = 1;
+      if (add != nullptr)
+        op_info->AddSymbol.Name = add;
+      else
+        op_info->AddSymbol.Value = r_value;
+      op_info->SubtractSymbol.Present = 1;
+      if (sub != nullptr)
+        op_info->SubtractSymbol.Name = sub;
+      else
+        op_info->SubtractSymbol.Value = pair_r_value;
+      op_info->Value = offset;
+      return 1;
+    }
+
+    if (reloc_found == false)
+      return 0;
+
+    op_info->AddSymbol.Present = 1;
+    op_info->Value = offset;
+    if (reloc_found) {
+      if (r_type == MachO::ARM_RELOC_HALF) {
+        if ((r_length & 0x1) == 1)
+          op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_HI16;
+        else
+          op_info->VariantKind = LLVMDisassembler_VariantKind_ARM_LO16;
+      }
+    }
+    const char *add = GuessSymbolName(value, info);
+    if (add != nullptr) {
+      op_info->AddSymbol.Name = add;
+      return 1;
+    }
+    op_info->AddSymbol.Value = value;
+    return 1;
+  } else if (Arch == Triple::aarch64) {
+    if (Offset != 0 || Size != 4)
+      return 0;
+    // First search the section's relocation entries (if any) for an entry
+    // for this section offset.
+    uint64_t sect_addr = info->S.getAddress();
+    uint64_t sect_offset = (Pc + Offset) - sect_addr;
+    bool reloc_found = false;
+    DataRefImpl Rel;
+    MachO::any_relocation_info RE;
+    bool isExtern = false;
+    SymbolRef Symbol;
+    uint32_t r_type = 0;
+    for (const RelocationRef &Reloc : info->S.relocations()) {
+      uint64_t RelocOffset;
+      Reloc.getOffset(RelocOffset);
+      if (RelocOffset == sect_offset) {
+        Rel = Reloc.getRawDataRefImpl();
+        RE = info->O->getRelocation(Rel);
+        r_type = info->O->getAnyRelocationType(RE);
+        if (r_type == MachO::ARM64_RELOC_ADDEND) {
+          DataRefImpl RelNext = Rel;
+          info->O->moveRelocationNext(RelNext);
+          MachO::any_relocation_info RENext = info->O->getRelocation(RelNext);
+          if (value == 0) {
+            value = info->O->getPlainRelocationSymbolNum(RENext);
+            op_info->Value = value;
+          }
+        }
+        // NOTE: Scattered relocations don't exist on arm64.
+        isExtern = info->O->getPlainRelocationExternal(RE);
+        if (isExtern) {
+          symbol_iterator RelocSym = Reloc.getSymbol();
+          Symbol = *RelocSym;
+        }
+        reloc_found = true;
+        break;
+      }
+    }
+    if (reloc_found && isExtern) {
+      StringRef SymName;
+      Symbol.getName(SymName);
+      const char *name = SymName.data();
+      op_info->AddSymbol.Present = 1;
+      op_info->AddSymbol.Name = name;
+
+      switch (r_type) {
+      case MachO::ARM64_RELOC_PAGE21:
+        /* @page */
+        op_info->VariantKind = LLVMDisassembler_VariantKind_ARM64_PAGE;
+        break;
+      case MachO::ARM64_RELOC_PAGEOFF12:
+        /* @pageoff */
+        op_info->VariantKind = LLVMDisassembler_VariantKind_ARM64_PAGEOFF;
+        break;
+      case MachO::ARM64_RELOC_GOT_LOAD_PAGE21:
+        /* @gotpage */
+        op_info->VariantKind = LLVMDisassembler_VariantKind_ARM64_GOTPAGE;
+        break;
+      case MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12:
+        /* @gotpageoff */
+        op_info->VariantKind = LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF;
+        break;
+      case MachO::ARM64_RELOC_TLVP_LOAD_PAGE21:
+        /* @tlvppage is not implemented in llvm-mc */
+        op_info->VariantKind = LLVMDisassembler_VariantKind_ARM64_TLVP;
+        break;
+      case MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12:
+        /* @tlvppageoff is not implemented in llvm-mc */
+        op_info->VariantKind = LLVMDisassembler_VariantKind_ARM64_TLVOFF;
+        break;
+      default:
+      case MachO::ARM64_RELOC_BRANCH26:
+        op_info->VariantKind = LLVMDisassembler_VariantKind_None;
+        break;
+      }
+      return 1;
+    }
+    return 0;
+  } else {
+    return 0;
+  }
+}
+
+// GuessCstringPointer is passed the address of what might be a pointer to a
+// literal string in a cstring section.  If that address is in a cstring section
+// it returns a pointer to that string.  Else it returns nullptr.
+const char *GuessCstringPointer(uint64_t ReferenceValue,
+                                struct DisassembleInfo *info) {
+  uint32_t LoadCommandCount = info->O->getHeader().ncmds;
+  MachOObjectFile::LoadCommandInfo Load = info->O->getFirstLoadCommandInfo();
+  for (unsigned I = 0;; ++I) {
+    if (Load.C.cmd == MachO::LC_SEGMENT_64) {
+      MachO::segment_command_64 Seg = info->O->getSegment64LoadCommand(Load);
+      for (unsigned J = 0; J < Seg.nsects; ++J) {
+        MachO::section_64 Sec = info->O->getSection64(Load, J);
+        uint32_t section_type = Sec.flags & MachO::SECTION_TYPE;
+        if (section_type == MachO::S_CSTRING_LITERALS &&
+            ReferenceValue >= Sec.addr &&
+            ReferenceValue < Sec.addr + Sec.size) {
+          uint64_t sect_offset = ReferenceValue - Sec.addr;
+          uint64_t object_offset = Sec.offset + sect_offset;
+          StringRef MachOContents = info->O->getData();
+          uint64_t object_size = MachOContents.size();
+          const char *object_addr = (const char *)MachOContents.data();
+          if (object_offset < object_size) {
+            const char *name = object_addr + object_offset;
+            return name;
+          } else {
+            return nullptr;
+          }
+        }
+      }
+    } else if (Load.C.cmd == MachO::LC_SEGMENT) {
+      MachO::segment_command Seg = info->O->getSegmentLoadCommand(Load);
+      for (unsigned J = 0; J < Seg.nsects; ++J) {
+        MachO::section Sec = info->O->getSection(Load, J);
+        uint32_t section_type = Sec.flags & MachO::SECTION_TYPE;
+        if (section_type == MachO::S_CSTRING_LITERALS &&
+            ReferenceValue >= Sec.addr &&
+            ReferenceValue < Sec.addr + Sec.size) {
+          uint64_t sect_offset = ReferenceValue - Sec.addr;
+          uint64_t object_offset = Sec.offset + sect_offset;
+          StringRef MachOContents = info->O->getData();
+          uint64_t object_size = MachOContents.size();
+          const char *object_addr = (const char *)MachOContents.data();
+          if (object_offset < object_size) {
+            const char *name = object_addr + object_offset;
+            return name;
+          } else {
+            return nullptr;
+          }
+        }
+      }
+    }
+    if (I == LoadCommandCount - 1)
+      break;
+    else
+      Load = info->O->getNextLoadCommandInfo(Load);
+  }
+  return nullptr;
+}
+
+// GuessIndirectSymbol returns the name of the indirect symbol for the
+// ReferenceValue passed in or nullptr.  This is used when ReferenceValue may be
+// an address of a symbol stub or a lazy or non-lazy pointer to associate the
+// symbol name being referenced by the stub or pointer.
+static const char *GuessIndirectSymbol(uint64_t ReferenceValue,
+                                       struct DisassembleInfo *info) {
+  uint32_t LoadCommandCount = info->O->getHeader().ncmds;
+  MachOObjectFile::LoadCommandInfo Load = info->O->getFirstLoadCommandInfo();
+  MachO::dysymtab_command Dysymtab = info->O->getDysymtabLoadCommand();
+  MachO::symtab_command Symtab = info->O->getSymtabLoadCommand();
+  for (unsigned I = 0;; ++I) {
+    if (Load.C.cmd == MachO::LC_SEGMENT_64) {
+      MachO::segment_command_64 Seg = info->O->getSegment64LoadCommand(Load);
+      for (unsigned J = 0; J < Seg.nsects; ++J) {
+        MachO::section_64 Sec = info->O->getSection64(Load, J);
+        uint32_t section_type = Sec.flags & MachO::SECTION_TYPE;
+        if ((section_type == MachO::S_NON_LAZY_SYMBOL_POINTERS ||
+             section_type == MachO::S_LAZY_SYMBOL_POINTERS ||
+             section_type == MachO::S_LAZY_DYLIB_SYMBOL_POINTERS ||
+             section_type == MachO::S_THREAD_LOCAL_VARIABLE_POINTERS ||
+             section_type == MachO::S_SYMBOL_STUBS) &&
+            ReferenceValue >= Sec.addr &&
+            ReferenceValue < Sec.addr + Sec.size) {
+          uint32_t stride;
+          if (section_type == MachO::S_SYMBOL_STUBS)
+            stride = Sec.reserved2;
+          else
+            stride = 8;
+          if (stride == 0)
+            return nullptr;
+          uint32_t index = Sec.reserved1 + (ReferenceValue - Sec.addr) / stride;
+          if (index < Dysymtab.nindirectsyms) {
+            uint32_t indirect_symbol =
+                info->O->getIndirectSymbolTableEntry(Dysymtab, index);
+            if (indirect_symbol < Symtab.nsyms) {
+              symbol_iterator Sym = info->O->getSymbolByIndex(indirect_symbol);
+              SymbolRef Symbol = *Sym;
+              StringRef SymName;
+              Symbol.getName(SymName);
+              const char *name = SymName.data();
+              return name;
+            }
+          }
+        }
+      }
+    } else if (Load.C.cmd == MachO::LC_SEGMENT) {
+      MachO::segment_command Seg = info->O->getSegmentLoadCommand(Load);
+      for (unsigned J = 0; J < Seg.nsects; ++J) {
+        MachO::section Sec = info->O->getSection(Load, J);
+        uint32_t section_type = Sec.flags & MachO::SECTION_TYPE;
+        if ((section_type == MachO::S_NON_LAZY_SYMBOL_POINTERS ||
+             section_type == MachO::S_LAZY_SYMBOL_POINTERS ||
+             section_type == MachO::S_LAZY_DYLIB_SYMBOL_POINTERS ||
+             section_type == MachO::S_THREAD_LOCAL_VARIABLE_POINTERS ||
+             section_type == MachO::S_SYMBOL_STUBS) &&
+            ReferenceValue >= Sec.addr &&
+            ReferenceValue < Sec.addr + Sec.size) {
+          uint32_t stride;
+          if (section_type == MachO::S_SYMBOL_STUBS)
+            stride = Sec.reserved2;
+          else
+            stride = 4;
+          if (stride == 0)
+            return nullptr;
+          uint32_t index = Sec.reserved1 + (ReferenceValue - Sec.addr) / stride;
+          if (index < Dysymtab.nindirectsyms) {
+            uint32_t indirect_symbol =
+                info->O->getIndirectSymbolTableEntry(Dysymtab, index);
+            if (indirect_symbol < Symtab.nsyms) {
+              symbol_iterator Sym = info->O->getSymbolByIndex(indirect_symbol);
+              SymbolRef Symbol = *Sym;
+              StringRef SymName;
+              Symbol.getName(SymName);
+              const char *name = SymName.data();
+              return name;
+            }
+          }
+        }
+      }
+    }
+    if (I == LoadCommandCount - 1)
+      break;
+    else
+      Load = info->O->getNextLoadCommandInfo(Load);
+  }
+  return nullptr;
+}
+
+// method_reference() is called passing it the ReferenceName that might be
+// a reference to an Objective-C method call.  If so then it allocates and
+// assembles a method call string with the values last seen and saved in
+// the DisassembleInfo's class_name and selector_name fields.  This is saved
+// into the method field of the info and any previous string is free'ed.
+// Then the class_name field in the info is set to nullptr.  The method call
+// string is set into ReferenceName and ReferenceType is set to
+// LLVMDisassembler_ReferenceType_Out_Objc_Message.  If this is not a method
+// call then both ReferenceType and ReferenceName are left unchanged.
+static void method_reference(struct DisassembleInfo *info,
+                             uint64_t *ReferenceType,
+                             const char **ReferenceName) {
+  unsigned int Arch = info->O->getArch();
+  if (*ReferenceName != nullptr) {
+    if (strcmp(*ReferenceName, "_objc_msgSend") == 0) {
+      if (info->selector_name != nullptr) {
+        if (info->method != nullptr)
+          free(info->method);
+        if (info->class_name != nullptr) {
+          info->method = (char *)malloc(5 + strlen(info->class_name) +
+                                        strlen(info->selector_name));
+          if (info->method != nullptr) {
+            strcpy(info->method, "+[");
+            strcat(info->method, info->class_name);
+            strcat(info->method, " ");
+            strcat(info->method, info->selector_name);
+            strcat(info->method, "]");
+            *ReferenceName = info->method;
+            *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message;
+          }
+        } else {
+          info->method = (char *)malloc(9 + strlen(info->selector_name));
+          if (info->method != nullptr) {
+            if (Arch == Triple::x86_64)
+              strcpy(info->method, "-[%rdi ");
+            else if (Arch == Triple::aarch64)
+              strcpy(info->method, "-[x0 ");
+            else
+              strcpy(info->method, "-[r? ");
+            strcat(info->method, info->selector_name);
+            strcat(info->method, "]");
+            *ReferenceName = info->method;
+            *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message;
+          }
+        }
+        info->class_name = nullptr;
+      }
+    } else if (strcmp(*ReferenceName, "_objc_msgSendSuper2") == 0) {
+      if (info->selector_name != nullptr) {
+        if (info->method != nullptr)
+          free(info->method);
+        info->method = (char *)malloc(17 + strlen(info->selector_name));
+        if (info->method != nullptr) {
+          if (Arch == Triple::x86_64)
+            strcpy(info->method, "-[[%rdi super] ");
+          else if (Arch == Triple::aarch64)
+            strcpy(info->method, "-[[x0 super] ");
+          else
+            strcpy(info->method, "-[[r? super] ");
+          strcat(info->method, info->selector_name);
+          strcat(info->method, "]");
+          *ReferenceName = info->method;
+          *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message;
+        }
+        info->class_name = nullptr;
+      }
+    }
+  }
+}
+
+// GuessPointerPointer() is passed the address of what might be a pointer to
+// a reference to an Objective-C class, selector, message ref or cfstring.
+// Only the sections of LC_SEGMENT_64 load commands are searched.  If
+// ReferenceValue lies inside an __objc_selrefs, __objc_classrefs,
+// __objc_superrefs, __objc_msgrefs or __cfstring section and the 64-bit value
+// stored there can be read from the file, that value (converted to host byte
+// order) is returned and the boolean matching the section is set to true.
+// For an __objc_msgrefs entry the value returned is the second 64-bit word of
+// the entry, provided that word is inside both the section and the file.
+// Otherwise zero is returned.  All the booleans are set to false on entry.
+static uint64_t GuessPointerPointer(uint64_t ReferenceValue,
+                                    struct DisassembleInfo *info,
+                                    bool &classref, bool &selref, bool &msgref,
+                                    bool &cfstring) {
+  classref = false;
+  selref = false;
+  msgref = false;
+  cfstring = false;
+  uint32_t LoadCommandCount = info->O->getHeader().ncmds;
+  // With no load commands there is nothing to search, and returning here
+  // keeps "LoadCommandCount - 1" below from wrapping around.
+  if (LoadCommandCount == 0)
+    return 0;
+  const bool NeedSwap = info->O->isLittleEndian() != sys::IsLittleEndianHost;
+  MachOObjectFile::LoadCommandInfo Load = info->O->getFirstLoadCommandInfo();
+  for (unsigned I = 0;; ++I) {
+    if (Load.C.cmd == MachO::LC_SEGMENT_64) {
+      MachO::segment_command_64 Seg = info->O->getSegment64LoadCommand(Load);
+      for (unsigned J = 0; J < Seg.nsects; ++J) {
+        MachO::section_64 Sec = info->O->getSection64(Load, J);
+        // sectname is a fixed 16 byte field that need not be NUL terminated.
+        const bool IsSelRefs =
+            strncmp(Sec.sectname, "__objc_selrefs", 16) == 0;
+        const bool IsClassRefs =
+            strncmp(Sec.sectname, "__objc_classrefs", 16) == 0 ||
+            strncmp(Sec.sectname, "__objc_superrefs", 16) == 0;
+        const bool IsMsgRefs =
+            strncmp(Sec.sectname, "__objc_msgrefs", 16) == 0;
+        const bool IsCFString = strncmp(Sec.sectname, "__cfstring", 16) == 0;
+        if (!(IsSelRefs || IsClassRefs || IsMsgRefs || IsCFString) ||
+            ReferenceValue < Sec.addr ||
+            ReferenceValue >= Sec.addr + Sec.size)
+          continue;
+
+        uint64_t sect_offset = ReferenceValue - Sec.addr;
+        uint64_t object_offset = Sec.offset + sect_offset;
+        StringRef MachOContents = info->O->getData();
+        uint64_t object_size = MachOContents.size();
+        const char *object_addr = (const char *)MachOContents.data();
+        // The whole 64-bit value must be inside the file, not just its
+        // first byte.
+        if (object_offset > object_size ||
+            object_size - object_offset < sizeof(uint64_t))
+          return 0;
+
+        uint64_t pointer_value;
+        memcpy(&pointer_value, object_addr + object_offset, sizeof(uint64_t));
+        if (NeedSwap)
+          sys::swapByteOrder(pointer_value);
+        if (IsSelRefs)
+          selref = true;
+        else if (IsClassRefs)
+          classref = true;
+        else if (IsMsgRefs && ReferenceValue + 8 < Sec.addr + Sec.size &&
+                 object_size - object_offset >= 2 * sizeof(uint64_t)) {
+          // The value wanted for a message ref is the second 64-bit word of
+          // the entry; it is only read when it is inside the section and
+          // the file.
+          msgref = true;
+          memcpy(&pointer_value, object_addr + object_offset + 8,
+                 sizeof(uint64_t));
+          if (NeedSwap)
+            sys::swapByteOrder(pointer_value);
+        } else if (IsCFString)
+          cfstring = true;
+        return pointer_value;
+      }
+    }
+    // TODO: Look for LC_SEGMENT for 32-bit Mach-O files.
+    if (I == LoadCommandCount - 1)
+      break;
+    else
+      Load = info->O->getNextLoadCommandInfo(Load);
+  }
+  return 0;
+}
+
+// get_pointer_64 returns a pointer to the bytes in the object file at the
+// Address from a section in the Mach-O file.  And indirectly returns the
+// offset into the section, the number of bytes left in the section past the
+// offset and which section was being referenced.  If the Address is not in a
+// section, or the contents of the section holding it can't be obtained,
+// nullptr is returned with offset and left set to zero and S set to a default
+// constructed SectionRef.
+const char *get_pointer_64(uint64_t Address, uint32_t &offset, uint32_t &left,
+                           SectionRef &S, DisassembleInfo *info) {
+  offset = 0;
+  left = 0;
+  S = SectionRef();
+  for (unsigned SectIdx = 0; SectIdx != info->Sections->size(); SectIdx++) {
+    SectionRef Sect = (*(info->Sections))[SectIdx];
+    uint64_t SectAddress = Sect.getAddress();
+    uint64_t SectSize = Sect.getSize();
+    if (Address >= SectAddress && Address < SectAddress + SectSize) {
+      StringRef SectContents;
+      // getContents() reports failure through its return value; without
+      // this check a failure would hand back a pointer computed from null.
+      if (Sect.getContents(SectContents))
+        return nullptr;
+      uint64_t SectOffset = Address - SectAddress;
+      // Never claim more bytes than the contents actually provide.
+      uint64_t Available =
+          SectContents.size() < SectSize ? SectContents.size() : SectSize;
+      if (SectOffset >= Available)
+        return nullptr;
+      S = Sect;
+      offset = SectOffset;
+      left = Available - SectOffset;
+      return SectContents.data() + SectOffset;
+    }
+  }
+  return nullptr;
+}
+
+// get_symbol_64() looks in the relocation entries of section S for the first
+// non-scattered entry at sect_offset.  If that entry is external, the address
+// of the symbol it refers to is returned indirectly through n_value and the
+// symbol's name is returned, or nullptr if the name is empty.  In all other
+// cases, including when info->verbose is false, nullptr is returned and
+// n_value is zero.
+const char *get_symbol_64(uint32_t sect_offset, SectionRef S,
+                          DisassembleInfo *info, uint64_t &n_value) {
+  n_value = 0;
+  if (!info->verbose)
+    return nullptr;
+
+  // Search for a plain (non-scattered) relocation entry at sect_offset and
+  // note the symbol it refers to when the entry is external.
+  bool HaveExternSymbol = false;
+  SymbolRef RelocatedSymbol;
+  for (const RelocationRef &Reloc : S.relocations()) {
+    uint64_t RelocOffset;
+    Reloc.getOffset(RelocOffset);
+    if (RelocOffset != sect_offset)
+      continue;
+    MachO::any_relocation_info RE =
+        info->O->getRelocation(Reloc.getRawDataRefImpl());
+    if (info->O->isRelocationScattered(RE))
+      continue;
+    if (info->O->getPlainRelocationExternal(RE)) {
+      RelocatedSymbol = *Reloc.getSymbol();
+      HaveExternSymbol = true;
+    }
+    // Only the first plain entry at this offset is considered.
+    break;
+  }
+
+  // With an external relocation entry use that symbol's value for n_value
+  // and return its name.
+  if (HaveExternSymbol) {
+    RelocatedSymbol.getAddress(n_value);
+    StringRef name;
+    RelocatedSymbol.getName(name);
+    if (!name.empty())
+      return name.data();
+  }
+
+  // TODO: For fully linked images, look through the external relocation
+  // entries off the dynamic symtab command. For these the r_offset is from the
+  // start of the first writeable segment in the Mach-O file.  So the offset
+  // to this section from that segment is passed to this routine by the caller,
+  // as the database_offset. Which is the difference of the section's starting
+  // address and the first writable segment.
+  //
+  // NOTE: this needs the database_offset to be passed to this routine.
+
+  // TODO: We did not find an external relocation entry so look up the
+  // ReferenceValue as an address of a symbol and if found return that symbol's
+  // name.
+  //
+  // NOTE: this needs the ReferenceValue to be passed to this routine.  Then
+  // that code would simply be this:
+  // SymbolName = GuessSymbolName(ReferenceValue, info);
+
+  return nullptr;
+}
+
+// These are structs in the Objective-C meta data and are read to produce the
+// comments for disassembly.  While these are part of the ABI there are no
+// public definitions.  So they are here, not in include/llvm/Support/MachO.h .
+
+// The cfstring object in a 64-bit Mach-O file.  The bytes of the object are
+// copied directly from the file into this struct, so the fields must keep
+// this order and these sizes.
+struct cfstring64_t {
+  uint64_t isa;        // class64_t * (64-bit pointer)
+  uint64_t flags;      // flag bits
+  uint64_t characters; // char * (64-bit pointer)
+  uint64_t length;     // number of non-NULL characters in above
+};
+
+// The class object in a 64-bit Mach-O file.  The bytes of the object are
+// copied directly from the file into this struct, so the fields must keep
+// this order and these sizes.
+struct class64_t {
+  uint64_t isa;        // class64_t * (64-bit pointer)
+  uint64_t superclass; // class64_t * (64-bit pointer)
+  uint64_t cache;      // Cache (64-bit pointer)
+  uint64_t vtable;     // IMP * (64-bit pointer)
+  uint64_t data;       // class_ro64_t * (64-bit pointer)
+};
+
+// The object a class64_t's data field points to in a 64-bit Mach-O file.  The
+// bytes of the object are copied directly from the file into this struct, so
+// the fields must keep this order and these sizes.
+struct class_ro64_t {
+  uint32_t flags;
+  uint32_t instanceStart;
+  uint32_t instanceSize;
+  uint32_t reserved;
+  uint64_t ivarLayout;     // const uint8_t * (64-bit pointer)
+  uint64_t name;           // const char * (64-bit pointer)
+  uint64_t baseMethods;    // const method_list_t * (64-bit pointer)
+  uint64_t baseProtocols;  // const protocol_list_t * (64-bit pointer)
+  uint64_t ivars;          // const ivar_list_t * (64-bit pointer)
+  uint64_t weakIvarLayout; // const uint8_t * (64-bit pointer)
+  uint64_t baseProperties; // const struct objc_property_list (64-bit pointer)
+};
+
+// Byte swaps every field of a cfstring64_t in place.
+inline void swapStruct(cfstring64_t &cfs) {
+  sys::swapByteOrder(cfs.isa);
+  sys::swapByteOrder(cfs.flags);
+  sys::swapByteOrder(cfs.characters);
+  sys::swapByteOrder(cfs.length);
+}
+
+// Byte swaps every field of a class64_t in place.
+inline void swapStruct(class64_t &c) {
+  sys::swapByteOrder(c.isa);
+  sys::swapByteOrder(c.superclass);
+  sys::swapByteOrder(c.cache);
+  sys::swapByteOrder(c.vtable);
+  sys::swapByteOrder(c.data);
+}
+
+// Byte swaps every field of a class_ro64_t in place.
+inline void swapStruct(class_ro64_t &cro) {
+  // The four 32-bit fields.
+  sys::swapByteOrder(cro.flags);
+  sys::swapByteOrder(cro.instanceStart);
+  sys::swapByteOrder(cro.instanceSize);
+  sys::swapByteOrder(cro.reserved);
+  // The 64-bit pointer fields.
+  sys::swapByteOrder(cro.ivarLayout);
+  sys::swapByteOrder(cro.name);
+  sys::swapByteOrder(cro.baseMethods);
+  sys::swapByteOrder(cro.baseProtocols);
+  sys::swapByteOrder(cro.ivars);
+  sys::swapByteOrder(cro.weakIvarLayout);
+  sys::swapByteOrder(cro.baseProperties);
+}
+
+static const char *get_dyld_bind_info_symbolname(uint64_t ReferenceValue,
+                                                 struct DisassembleInfo *info);
+
+// get_objc2_64bit_class_name() is used for disassembly.  It is passed the
+// value of a pointer to an Objective-C class (pointer_value) together with
+// the address that pointer was read from (ReferenceValue) and returns the
+// class name, or nullptr if no name can be found.  When pointer_value is
+// zero, as it can be in a .o file, the name is taken from the symbol of the
+// external relocation entry at ReferenceValue: it is the text following "$_"
+// at the last '$' in the symbol name.  Otherwise the class64_t at
+// pointer_value and the class_ro64_t its data field points to are read to
+// get to the name.
+const char *get_objc2_64bit_class_name(uint64_t pointer_value,
+                                       uint64_t ReferenceValue,
+                                       struct DisassembleInfo *info) {
+  uint32_t offset, left;
+  SectionRef S;
+
+  if (pointer_value == 0) {
+    // The pointer itself is not filled in, so go by the relocation entry
+    // for the class symbol at the address of the pointer.
+    const char *slot = get_pointer_64(ReferenceValue, offset, left, S, info);
+    if (slot == nullptr || left < sizeof(uint64_t))
+      return nullptr;
+    uint64_t n_value;
+    const char *symbol_name = get_symbol_64(offset, S, info, n_value);
+    if (symbol_name == nullptr)
+      return nullptr;
+    const char *dollar = strrchr(symbol_name, '$');
+    if (dollar == nullptr || dollar[1] != '_' || dollar[2] == '\0')
+      return nullptr;
+    return dollar + 2;
+  }
+
+  // The pointer_value is non-zero and points to a class defined in this
+  // Mach-O file.  Read the class object first.
+  const char *class_bytes =
+      get_pointer_64(pointer_value, offset, left, S, info);
+  if (class_bytes == nullptr || left < sizeof(struct class64_t))
+    return nullptr;
+  const bool NeedSwap = info->O->isLittleEndian() != sys::IsLittleEndianHost;
+  struct class64_t c;
+  memcpy(&c, class_bytes, sizeof(struct class64_t));
+  if (NeedSwap)
+    swapStruct(c);
+  if (c.data == 0)
+    return nullptr;
+
+  // Then the class_ro64_t that its data field points to.
+  const char *ro_bytes = get_pointer_64(c.data, offset, left, S, info);
+  if (ro_bytes == nullptr || left < sizeof(struct class_ro64_t))
+    return nullptr;
+  struct class_ro64_t cro;
+  memcpy(&cro, ro_bytes, sizeof(struct class_ro64_t));
+  if (NeedSwap)
+    swapStruct(cro);
+  if (cro.name == 0)
+    return nullptr;
+
+  // And finally the name itself.
+  return get_pointer_64(cro.name, offset, left, S, info);
+}
+
+// get_objc2_64bit_cfstring_name is used for disassembly.  It is passed the
+// address of a cfstring object and returns a pointer to the string's
+// characters in the object file, or nullptr if they can't be found.  When
+// the characters field of the object is zero, the value of the symbol from
+// the relocation entry for that field is used in its place.
+const char *get_objc2_64bit_cfstring_name(uint64_t ReferenceValue,
+                                          struct DisassembleInfo *info) {
+  uint32_t offset, left;
+  SectionRef S;
+
+  const char *cfs_bytes = get_pointer_64(ReferenceValue, offset, left, S, info);
+  if (cfs_bytes == nullptr || left < sizeof(struct cfstring64_t))
+    return nullptr;
+
+  struct cfstring64_t cfs;
+  memcpy(&cfs, cfs_bytes, sizeof(struct cfstring64_t));
+  if (info->O->isLittleEndian() != sys::IsLittleEndianHost)
+    swapStruct(cfs);
+
+  uint64_t cfs_characters = cfs.characters;
+  if (cfs_characters == 0) {
+    // Look for a relocation entry at the characters field of the object.
+    uint64_t n_value;
+    const uint32_t field_offset =
+        offset + offsetof(struct cfstring64_t, characters);
+    if (get_symbol_64(field_offset, S, info, n_value) == nullptr)
+      return nullptr;
+    cfs_characters = n_value;
+  }
+
+  return get_pointer_64(cfs_characters, offset, left, S, info);
+}
+
+// get_objc2_64bit_selref() is used for disassembly.  It is passed the address
+// of a pointer to an Objective-C selector reference for the case where the
+// pointer value is zero, as in a .o file, and there is likely to be an
+// external relocation entry whose symbol's n_value is the real pointer to the
+// selector name.  If that is the case the real pointer to the selector name
+// is returned, else 0 is returned.
+uint64_t get_objc2_64bit_selref(uint64_t ReferenceValue,
+                                struct DisassembleInfo *info) {
+  uint32_t offset, left;
+  SectionRef S;
+
+  // The address must be in a section with room for a 64-bit pointer.
+  if (get_pointer_64(ReferenceValue, offset, left, S, info) == nullptr ||
+      left < sizeof(uint64_t))
+    return 0;
+
+  uint64_t n_value;
+  if (get_symbol_64(offset, S, info, n_value) == nullptr)
+    return 0;
+  return n_value;
+}
+
+// GuessLiteralPointer returns a string for the item in the Mach-O file at
+// the address passed in as ReferenceValue, for printing as a comment with
+// the instruction, and also returns the corresponding type of that item
+// indirectly through ReferenceType.
+//
+// If ReferenceValue is an address of a literal cstring then a pointer to the
+// cstring is returned and ReferenceType is set to
+// LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr .
+//
+// If ReferenceValue is an address of an Objective-C CFString, Selector ref,
+// Message ref or Class ref that name is returned and the ReferenceType is set
+// accordingly.  For these the class or selector name seen is also saved in
+// the info's class_name or selector_name field.
+//
+// Lastly, literals which are Symbol addresses in a literal pool are looked
+// for and if found the symbol name is returned and ReferenceType is set to
+// LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr .
+//
+// If there is no item in the Mach-O file for the address passed in as
+// ReferenceValue nullptr is returned.
+// NOTE(review): ReferenceType is not always left unchanged on a nullptr
+// return; the class ref and cfstring paths below set it before the name
+// lookup is known to have succeeded.
+const char *GuessLiteralPointer(uint64_t ReferenceValue, uint64_t ReferencePC,
+                                uint64_t *ReferenceType,
+                                struct DisassembleInfo *info) {
+  // First see if there is an external relocation entry at the ReferencePC.
+  uint64_t sect_addr = info->S.getAddress();
+  uint64_t sect_offset = ReferencePC - sect_addr;
+  bool reloc_found = false;
+  DataRefImpl Rel;
+  MachO::any_relocation_info RE;
+  bool isExtern = false;
+  SymbolRef Symbol;
+  for (const RelocationRef &Reloc : info->S.relocations()) {
+    uint64_t RelocOffset;
+    Reloc.getOffset(RelocOffset);
+    if (RelocOffset == sect_offset) {
+      Rel = Reloc.getRawDataRefImpl();
+      RE = info->O->getRelocation(Rel);
+      // Scattered entries are skipped; the search goes on to later entries.
+      if (info->O->isRelocationScattered(RE))
+        continue;
+      isExtern = info->O->getPlainRelocationExternal(RE);
+      if (isExtern) {
+        symbol_iterator RelocSym = Reloc.getSymbol();
+        Symbol = *RelocSym;
+      }
+      // Only the first plain entry at this offset is used.
+      reloc_found = true;
+      break;
+    }
+  }
+  // If there is an external relocation entry for a symbol in a section
+  // then use that symbol's value for the value of the reference.  This is
+  // only done for PC relative X86_64_RELOC_SIGNED entries.
+  if (reloc_found && isExtern) {
+    if (info->O->getAnyRelocationPCRel(RE)) {
+      unsigned Type = info->O->getAnyRelocationType(RE);
+      if (Type == MachO::X86_64_RELOC_SIGNED) {
+        Symbol.getAddress(ReferenceValue);
+      }
+    }
+  }
+
+  // Look for literals such as Objective-C CFStrings refs, Selector refs,
+  // Message refs and Class refs.
+  bool classref, selref, msgref, cfstring;
+  uint64_t pointer_value = GuessPointerPointer(ReferenceValue, info, classref,
+                                               selref, msgref, cfstring);
+  if (classref == true && pointer_value == 0) {
+    // Note the ReferenceValue is a pointer into the __objc_classrefs section.
+    // And the pointer_value in that section is typically zero as it will be
+    // set by dyld as part of the "bind information".
+    const char *name = get_dyld_bind_info_symbolname(ReferenceValue, info);
+    if (name != nullptr) {
+      *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref;
+      // The class name is the text after "$_" at the last '$' in the symbol
+      // name.  The whole symbol name is what gets returned.
+      const char *class_name = strrchr(name, '$');
+      if (class_name != nullptr && class_name[1] == '_' &&
+          class_name[2] != '\0') {
+        info->class_name = class_name + 2;
+        return name;
+      }
+    }
+    // No usable bind information, so fall through to the class ref handling
+    // below.
+  }
+
+  if (classref == true) {
+    *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref;
+    const char *name =
+        get_objc2_64bit_class_name(pointer_value, ReferenceValue, info);
+    if (name != nullptr)
+      info->class_name = name;
+    else
+      name = "bad class ref";
+    return name;
+  }
+
+  if (cfstring == true) {
+    *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref;
+    // The name returned here may be nullptr.
+    const char *name = get_objc2_64bit_cfstring_name(ReferenceValue, info);
+    return name;
+  }
+
+  // A zero selector ref pointer may have a relocation entry giving the real
+  // pointer to the selector name.
+  if (selref == true && pointer_value == 0)
+    pointer_value = get_objc2_64bit_selref(ReferenceValue, info);
+
+  // From here on look up what the pointer points at, when there is a pointer.
+  if (pointer_value != 0)
+    ReferenceValue = pointer_value;
+
+  const char *name = GuessCstringPointer(ReferenceValue, info);
+  if (name) {
+    if (pointer_value != 0 && selref == true) {
+      *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref;
+      info->selector_name = name;
+    } else if (pointer_value != 0 && msgref == true) {
+      // A message ref saves only a selector name, so any class name saved
+      // earlier is cleared.
+      info->class_name = nullptr;
+      *ReferenceType = LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref;
+      info->selector_name = name;
+    } else
+      *ReferenceType = LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr;
+    return name;
+  }
+
+  // Lastly look for an indirect symbol with this ReferenceValue which is in
+  // a literal pool.  If found return that symbol name.
+  name = GuessIndirectSymbol(ReferenceValue, info);
+  if (name) {
+    *ReferenceType = LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr;
+    return name;
+  }
+
+  return nullptr;
+}
+
+// SymbolizerSymbolLookUp is the symbol lookup function passed when creating
+// the Symbolizer.  It looks up the ReferenceValue using the info passed via the
+// pointer to the struct DisassembleInfo that was passed when MCSymbolizer
+// is created and returns the symbol name that matches the ReferenceValue or
+// nullptr if none.  The ReferenceType is passed in for the IN type of
+// reference the instruction is making from the values defined in the header
+// "llvm-c/Disassembler.h".  On return the ReferenceType can be set to a
+// specific Out type and the ReferenceName will also be set which is added as
+// a comment to the disassembled instruction.
+//
+#if HAVE_CXXABI_H
+// If the symbol name is a C++ mangled name then the demangled name is
+// returned through ReferenceName and ReferenceType is set to
+// LLVMDisassembler_ReferenceType_DeMangled_Name .
+#endif
+//
+// When this is called to get a symbol name for a branch target then the
+// ReferenceType will be LLVMDisassembler_ReferenceType_In_Branch and then
+// SymbolValue will be looked for in the indirect symbol table to determine if
+// it is an address for a symbol stub.  If so then the symbol name for that
+// stub is returned indirectly through ReferenceName and then ReferenceType is
+// set to LLVMDisassembler_ReferenceType_Out_SymbolStub.
+//
+// When this is called with a value loaded via a PC relative load then
+// ReferenceType will be LLVMDisassembler_ReferenceType_In_PCrel_Load then the
+// SymbolValue is checked to be an address of literal pointer, symbol pointer,
+// or an Objective-C meta data reference.  If so the output ReferenceType is
+// set to correspond to that as well as setting the ReferenceName.
+//
+// For arm64 an adrp instruction is remembered in the info so that an add
+// immediate or load register instruction directly following it, and using
+// the register the adrp set, can have the address the pair forms looked up
+// as a literal pointer.
+const char *SymbolizerSymbolLookUp(void *DisInfo, uint64_t ReferenceValue,
+                                   uint64_t *ReferenceType,
+                                   uint64_t ReferencePC,
+                                   const char **ReferenceName) {
+  struct DisassembleInfo *info = (struct DisassembleInfo *)DisInfo;
+  // If no verbose symbolic information is wanted then just return nullptr.
+  if (info->verbose == false) {
+    *ReferenceName = nullptr;
+    *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    return nullptr;
+  }
+
+  const char *SymbolName = GuessSymbolName(ReferenceValue, info);
+
+  if (*ReferenceType == LLVMDisassembler_ReferenceType_In_Branch) {
+    *ReferenceName = GuessIndirectSymbol(ReferenceValue, info);
+    if (*ReferenceName != nullptr) {
+      // A stub for one of the Objective-C message dispatch routines is shown
+      // as a method call, any other stub as a symbol stub.
+      method_reference(info, ReferenceType, ReferenceName);
+      if (*ReferenceType != LLVMDisassembler_ReferenceType_Out_Objc_Message)
+        *ReferenceType = LLVMDisassembler_ReferenceType_Out_SymbolStub;
+    } else
+#if HAVE_CXXABI_H
+        if (SymbolName != nullptr && strncmp(SymbolName, "__Z", 3) == 0) {
+      if (info->demangled_name != nullptr)
+        free(info->demangled_name);
+      int status;
+      // The leading underscore is skipped to get the mangled name proper.
+      info->demangled_name =
+          abi::__cxa_demangle(SymbolName + 1, nullptr, nullptr, &status);
+      if (info->demangled_name != nullptr) {
+        *ReferenceName = info->demangled_name;
+        *ReferenceType = LLVMDisassembler_ReferenceType_DeMangled_Name;
+      } else
+        *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    } else
+#endif
+      *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+  } else if (*ReferenceType == LLVMDisassembler_ReferenceType_In_PCrel_Load) {
+    *ReferenceName =
+        GuessLiteralPointer(ReferenceValue, ReferencePC, ReferenceType, info);
+    if (*ReferenceName)
+      method_reference(info, ReferenceType, ReferenceName);
+    else
+      *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    // If this is arm64 and the reference is an adrp instruction save the
+    // instruction, passed in ReferenceValue and the address of the instruction
+    // for use later if we see an add immediate instruction.
+  } else if (info->O->getArch() == Triple::aarch64 &&
+             *ReferenceType == LLVMDisassembler_ReferenceType_In_ARM64_ADRP) {
+    info->adrp_inst = ReferenceValue;
+    info->adrp_addr = ReferencePC;
+    SymbolName = nullptr;
+    *ReferenceName = nullptr;
+    *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    // If this is arm64 and the reference is an add immediate instruction and
+    // we have seen an adrp instruction just before it and the adrp's Xd
+    // register matches this add's Xn register reconstruct the value being
+    // referenced and look to see if it is a literal pointer.  Note the add
+    // immediate instruction is passed in ReferenceValue.
+  } else if (info->O->getArch() == Triple::aarch64 &&
+             *ReferenceType == LLVMDisassembler_ReferenceType_In_ARM64_ADDXri &&
+             ReferencePC - 4 == info->adrp_addr &&
+             (info->adrp_inst & 0x9f000000) == 0x90000000 &&
+             (info->adrp_inst & 0x1f) == ((ReferenceValue >> 5) & 0x1f)) {
+    uint32_t addxri_inst;
+    uint64_t adrp_imm, addxri_imm;
+
+    // The adrp immediate is the 21-bit value immhi:immlo, with immhi in bits
+    // 23:5 and immlo in bits 30:29 of the instruction.
+    adrp_imm =
+        ((info->adrp_inst & 0x00ffffe0) >> 3) | ((info->adrp_inst >> 29) & 0x3);
+    // Sign extend it from its sign bit, bit 20, to 64 bits.
+    if (adrp_imm & 0x100000)
+      adrp_imm |= 0xffffffffffe00000ULL;
+
+    // The add immediate is the 12-bit value in bits 21:10, optionally
+    // shifted left by 12.
+    addxri_inst = ReferenceValue;
+    addxri_imm = (addxri_inst >> 10) & 0xfff;
+    if (((addxri_inst >> 22) & 0x3) == 1)
+      addxri_imm <<= 12;
+
+    // The adrp result is the page of its own address plus the immediate in
+    // units of 4K pages.
+    ReferenceValue = (info->adrp_addr & 0xfffffffffffff000LL) +
+                     (adrp_imm << 12) + addxri_imm;
+
+    *ReferenceName =
+        GuessLiteralPointer(ReferenceValue, ReferencePC, ReferenceType, info);
+    if (*ReferenceName == nullptr)
+      *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    // If this is arm64 and the reference is a load register instruction and we
+    // have seen an adrp instruction just before it and the adrp's Xd register
+    // matches this load's Xn register reconstruct the value being referenced
+    // and look to see if it is a literal pointer.  Note the load register
+    // instruction is passed in ReferenceValue.
+  } else if (info->O->getArch() == Triple::aarch64 &&
+             *ReferenceType == LLVMDisassembler_ReferenceType_In_ARM64_LDRXui &&
+             ReferencePC - 4 == info->adrp_addr &&
+             (info->adrp_inst & 0x9f000000) == 0x90000000 &&
+             (info->adrp_inst & 0x1f) == ((ReferenceValue >> 5) & 0x1f)) {
+    uint32_t ldrxui_inst;
+    uint64_t adrp_imm, ldrxui_imm;
+
+    // The adrp immediate is the 21-bit value immhi:immlo, with immhi in bits
+    // 23:5 and immlo in bits 30:29 of the instruction.
+    adrp_imm =
+        ((info->adrp_inst & 0x00ffffe0) >> 3) | ((info->adrp_inst >> 29) & 0x3);
+    // Sign extend it from its sign bit, bit 20, to 64 bits.
+    if (adrp_imm & 0x100000)
+      adrp_imm |= 0xffffffffffe00000ULL;
+
+    // The load's unsigned immediate in bits 21:10 is in units of 8 bytes.
+    ldrxui_inst = ReferenceValue;
+    ldrxui_imm = (ldrxui_inst >> 10) & 0xfff;
+
+    ReferenceValue = (info->adrp_addr & 0xfffffffffffff000LL) +
+                     (adrp_imm << 12) + (ldrxui_imm << 3);
+
+    *ReferenceName =
+        GuessLiteralPointer(ReferenceValue, ReferencePC, ReferenceType, info);
+    if (*ReferenceName == nullptr)
+      *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+  }
+  // If this is arm64 and is a load register (PC-relative) or adr instruction
+  // the ReferenceValue is the PC plus the immediate value.
+  else if (info->O->getArch() == Triple::aarch64 &&
+           (*ReferenceType == LLVMDisassembler_ReferenceType_In_ARM64_LDRXl ||
+            *ReferenceType == LLVMDisassembler_ReferenceType_In_ARM64_ADR)) {
+    *ReferenceName =
+        GuessLiteralPointer(ReferenceValue, ReferencePC, ReferenceType, info);
+    if (*ReferenceName == nullptr)
+      *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+  }
+#if HAVE_CXXABI_H
+  else if (SymbolName != nullptr && strncmp(SymbolName, "__Z", 3) == 0) {
+    if (info->demangled_name != nullptr)
+      free(info->demangled_name);
+    int status;
+    // The leading underscore is skipped to get the mangled name proper.
+    info->demangled_name =
+        abi::__cxa_demangle(SymbolName + 1, nullptr, nullptr, &status);
+    if (info->demangled_name != nullptr) {
+      *ReferenceName = info->demangled_name;
+      *ReferenceType = LLVMDisassembler_ReferenceType_DeMangled_Name;
+    }
+  }
+#endif
+  else {
+    *ReferenceName = nullptr;
+    *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+  }
+
+  return SymbolName;
+}
+
+/// \brief Emits the comments that are stored in the CommentStream.
+/// Each comment in the CommentStream is expected to end with a newline.  A
+/// final comment without one is still emitted.
+///
+/// Every comment line is written to \p FormattedOS padded to the comment
+/// column of \p MAI and prefixed with its comment string, with a newline
+/// written between lines but not after the last one.  Afterwards
+/// \p CommentsToEmit is cleared and \p CommentStream is resynchronized with
+/// it.
+static void emitComments(raw_svector_ostream &CommentStream,
+                         SmallString<128> &CommentsToEmit,
+                         formatted_raw_ostream &FormattedOS,
+                         const MCAsmInfo &MAI) {
+  // Flush the stream before taking its content.
+  CommentStream.flush();
+  StringRef Comments = CommentsToEmit.str();
+  // Get the default information for printing a comment.
+  const char *CommentBegin = MAI.getCommentString();
+  unsigned CommentColumn = MAI.getCommentColumn();
+  bool IsFirst = true;
+  while (!Comments.empty()) {
+    if (!IsFirst)
+      FormattedOS << '\n';
+    // Emit a line of comments.
+    FormattedOS.PadToColumn(CommentColumn);
+    size_t Position = Comments.find('\n');
+    FormattedOS << CommentBegin << ' ' << Comments.substr(0, Position);
+    // With no newline left the rest of the text has just been emitted.
+    // Stop here, as Position + 1 would wrap around to 0 and the same text
+    // would be emitted forever.
+    if (Position == StringRef::npos)
+      break;
+    // Move after the newline character.
+    Comments = Comments.substr(Position + 1);
+    IsFirst = false;
+  }
+  FormattedOS.flush();
+
+  // Tell the comment stream that the vector changed underneath it.
+  CommentsToEmit.clear();
+  CommentStream.resync();
+}
+
 static void DisassembleInputMachO2(StringRef Filename,
                                    MachOObjectFile *MachOOF) {
-  const Target *TheTarget = GetTarget(MachOOF);
+  const char *McpuDefault = nullptr;
+  const Target *ThumbTarget = nullptr;
+  const Target *TheTarget = GetTarget(MachOOF, &McpuDefault, &ThumbTarget);
   if (!TheTarget) {
     // GetTarget prints out stuff.
     return;
   }
+  if (MCPU.empty() && McpuDefault)
+    MCPU = McpuDefault;
+
   std::unique_ptr<const MCInstrInfo> InstrInfo(TheTarget->createMCInstrInfo());
-  std::unique_ptr<MCInstrAnalysis> InstrAnalysis(
-      TheTarget->createMCInstrAnalysis(InstrInfo.get()));
+  std::unique_ptr<const MCInstrInfo> ThumbInstrInfo;
+  if (ThumbTarget)
+    ThumbInstrInfo.reset(ThumbTarget->createMCInstrInfo());
+
+  // Package up features to be passed to target/subtarget
+  std::string FeaturesStr;
+  if (MAttrs.size()) {
+    SubtargetFeatures Features;
+    for (unsigned i = 0; i != MAttrs.size(); ++i)
+      Features.AddFeature(MAttrs[i]);
+    FeaturesStr = Features.getString();
+  }
 
   // Set up disassembler.
   std::unique_ptr<const MCRegisterInfo> MRI(
@@ -225,26 +1617,80 @@
   std::unique_ptr<const MCAsmInfo> AsmInfo(
       TheTarget->createMCAsmInfo(*MRI, TripleName));
   std::unique_ptr<const MCSubtargetInfo> STI(
-      TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+      TheTarget->createMCSubtargetInfo(TripleName, MCPU, FeaturesStr));
   MCContext Ctx(AsmInfo.get(), MRI.get(), nullptr);
-  std::unique_ptr<const MCDisassembler> DisAsm(
-    TheTarget->createMCDisassembler(*STI, Ctx));
+  std::unique_ptr<MCDisassembler> DisAsm(
+      TheTarget->createMCDisassembler(*STI, Ctx));
+  std::unique_ptr<MCSymbolizer> Symbolizer;
+  struct DisassembleInfo SymbolizerInfo;
+  std::unique_ptr<MCRelocationInfo> RelInfo(
+      TheTarget->createMCRelocationInfo(TripleName, Ctx));
+  if (RelInfo) {
+    Symbolizer.reset(TheTarget->createMCSymbolizer(
+        TripleName, SymbolizerGetOpInfo, SymbolizerSymbolLookUp,
+        &SymbolizerInfo, &Ctx, RelInfo.release()));
+    DisAsm->setSymbolizer(std::move(Symbolizer));
+  }
   int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
   std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
       AsmPrinterVariant, *AsmInfo, *InstrInfo, *MRI, *STI));
+  // Set the display preference for hex vs. decimal immediates.
+  IP->setPrintImmHex(PrintImmHex);
+  // Comment stream and backing vector.
+  SmallString<128> CommentsToEmit;
+  raw_svector_ostream CommentStream(CommentsToEmit);
 
-  if (!InstrAnalysis || !AsmInfo || !STI || !DisAsm || !IP) {
+  if (!AsmInfo || !STI || !DisAsm || !IP) {
     errs() << "error: couldn't initialize disassembler for target "
            << TripleName << '\n';
     return;
   }
 
+  // Set up thumb disassembler.
+  std::unique_ptr<const MCRegisterInfo> ThumbMRI;
+  std::unique_ptr<const MCAsmInfo> ThumbAsmInfo;
+  std::unique_ptr<const MCSubtargetInfo> ThumbSTI;
+  std::unique_ptr<MCDisassembler> ThumbDisAsm;
+  std::unique_ptr<MCInstPrinter> ThumbIP;
+  std::unique_ptr<MCContext> ThumbCtx;
+  std::unique_ptr<MCSymbolizer> ThumbSymbolizer;
+  struct DisassembleInfo ThumbSymbolizerInfo;
+  std::unique_ptr<MCRelocationInfo> ThumbRelInfo;
+  if (ThumbTarget) {
+    ThumbMRI.reset(ThumbTarget->createMCRegInfo(ThumbTripleName));
+    ThumbAsmInfo.reset(
+        ThumbTarget->createMCAsmInfo(*ThumbMRI, ThumbTripleName));
+    ThumbSTI.reset(
+        ThumbTarget->createMCSubtargetInfo(ThumbTripleName, MCPU, FeaturesStr));
+    ThumbCtx.reset(new MCContext(ThumbAsmInfo.get(), ThumbMRI.get(), nullptr));
+    ThumbDisAsm.reset(ThumbTarget->createMCDisassembler(*ThumbSTI, *ThumbCtx));
+    MCContext *PtrThumbCtx = ThumbCtx.get();
+    ThumbRelInfo.reset(
+        ThumbTarget->createMCRelocationInfo(ThumbTripleName, *PtrThumbCtx));
+    if (ThumbRelInfo) {
+      ThumbSymbolizer.reset(ThumbTarget->createMCSymbolizer(
+          ThumbTripleName, SymbolizerGetOpInfo, SymbolizerSymbolLookUp,
+          &ThumbSymbolizerInfo, PtrThumbCtx, ThumbRelInfo.release()));
+      ThumbDisAsm->setSymbolizer(std::move(ThumbSymbolizer));
+    }
+    int ThumbAsmPrinterVariant = ThumbAsmInfo->getAssemblerDialect();
+    ThumbIP.reset(ThumbTarget->createMCInstPrinter(
+        ThumbAsmPrinterVariant, *ThumbAsmInfo, *ThumbInstrInfo, *ThumbMRI,
+        *ThumbSTI));
+    // Set the display preference for hex vs. decimal immediates.
+    ThumbIP->setPrintImmHex(PrintImmHex);
+  }
+
+  if (ThumbTarget && (!ThumbAsmInfo || !ThumbSTI || !ThumbDisAsm || !ThumbIP)) {
+    errs() << "error: couldn't initialize disassembler for target "
+           << ThumbTripleName << '\n';
+    return;
+  }
+
   outs() << '\n' << Filename << ":\n\n";
 
   MachO::mach_header Header = MachOOF->getHeader();
 
-  // FIXME: FoundFns isn't used anymore. Using symbols/LC_FUNCTION_STARTS to
-  // determine function locations will eventually go in MCObjectDisassembler.
   // FIXME: Using the -cfg command line option, this code used to be able to
   // annotate relocations with the referenced symbol's name, and if this was
   // inside a __[cf]string section, the data it points to. This is now replaced
@@ -263,7 +1709,7 @@
   // Build a data in code table that is sorted on by the address of each entry.
   uint64_t BaseAddress = 0;
   if (Header.filetype == MachO::MH_OBJECT)
-    Sections[0].getAddress(BaseAddress);
+    BaseAddress = Sections[0].getAddress();
   else
     BaseAddress = BaseSegmentAddress;
   DiceTable Dices;
@@ -288,29 +1734,30 @@
     // A separate DSym file path was specified, parse it as a macho file,
     // get the sections and supply it to the section name parsing machinery.
     if (!DSYMFile.empty()) {
-      ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+      ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
           MemoryBuffer::getFileOrSTDIN(DSYMFile);
-      if (std::error_code EC = Buf.getError()) {
+      if (std::error_code EC = BufOrErr.getError()) {
         errs() << "llvm-objdump: " << Filename << ": " << EC.message() << '\n';
         return;
       }
-      DbgObj = ObjectFile::createMachOObjectFile(Buf.get()).get();
+      DbgObj =
+          ObjectFile::createMachOObjectFile(BufOrErr.get()->getMemBufferRef())
+              .get()
+              .release();
     }
 
     // Setup the DIContext
-    diContext.reset(DIContext::getDWARFContext(DbgObj));
+    diContext.reset(DIContext::getDWARFContext(*DbgObj));
   }
 
   for (unsigned SectIdx = 0; SectIdx != Sections.size(); SectIdx++) {
 
-    bool SectIsText = false;
-    Sections[SectIdx].isText(SectIsText);
+    bool SectIsText = Sections[SectIdx].isText();
     if (SectIsText == false)
       continue;
 
     StringRef SectName;
-    if (Sections[SectIdx].getName(SectName) ||
-        SectName != "__text")
+    if (Sections[SectIdx].getName(SectName) || SectName != "__text")
       continue; // Skip non-text sections
 
     DataRefImpl DR = Sections[SectIdx].getRawDataRefImpl();
@@ -319,17 +1766,20 @@
     if (SegmentName != "__TEXT")
       continue;
 
-    StringRef Bytes;
-    Sections[SectIdx].getContents(Bytes);
-    StringRefMemoryObject memoryObject(Bytes);
+    StringRef BytesStr;
+    Sections[SectIdx].getContents(BytesStr);
+    ArrayRef<uint8_t> Bytes(reinterpret_cast<const uint8_t *>(BytesStr.data()),
+                            BytesStr.size());
+    uint64_t SectAddress = Sections[SectIdx].getAddress();
+
     bool symbolTableWorked = false;
 
     // Parse relocations.
     std::vector<std::pair<uint64_t, SymbolRef>> Relocs;
     for (const RelocationRef &Reloc : Sections[SectIdx].relocations()) {
-      uint64_t RelocOffset, SectionAddress;
+      uint64_t RelocOffset;
       Reloc.getOffset(RelocOffset);
-      Sections[SectIdx].getAddress(SectionAddress);
+      uint64_t SectionAddress = Sections[SectIdx].getAddress();
       RelocOffset -= SectionAddress;
 
       symbol_iterator RelocSym = Reloc.getSymbol();
@@ -338,6 +1788,48 @@
     }
     array_pod_sort(Relocs.begin(), Relocs.end());
 
+    // Create a map of symbol addresses to symbol names for use by
+    // the SymbolizerSymbolLookUp() routine.
+    SymbolAddressMap AddrMap;
+    for (const SymbolRef &Symbol : MachOOF->symbols()) {
+      SymbolRef::Type ST;
+      Symbol.getType(ST);
+      if (ST == SymbolRef::ST_Function || ST == SymbolRef::ST_Data ||
+          ST == SymbolRef::ST_Other) {
+        uint64_t Address;
+        Symbol.getAddress(Address);
+        StringRef SymName;
+        Symbol.getName(SymName);
+        AddrMap[Address] = SymName;
+      }
+    }
+    // Set up the block of info used by the Symbolizer call backs.
+    SymbolizerInfo.verbose = true;
+    SymbolizerInfo.O = MachOOF;
+    SymbolizerInfo.S = Sections[SectIdx];
+    SymbolizerInfo.AddrMap = &AddrMap;
+    SymbolizerInfo.Sections = &Sections;
+    SymbolizerInfo.class_name = nullptr;
+    SymbolizerInfo.selector_name = nullptr;
+    SymbolizerInfo.method = nullptr;
+    SymbolizerInfo.demangled_name = nullptr;
+    SymbolizerInfo.bindtable = nullptr;
+    SymbolizerInfo.adrp_addr = 0;
+    SymbolizerInfo.adrp_inst = 0;
+    // Same for the ThumbSymbolizer
+    ThumbSymbolizerInfo.verbose = true;
+    ThumbSymbolizerInfo.O = MachOOF;
+    ThumbSymbolizerInfo.S = Sections[SectIdx];
+    ThumbSymbolizerInfo.AddrMap = &AddrMap;
+    ThumbSymbolizerInfo.Sections = &Sections;
+    ThumbSymbolizerInfo.class_name = nullptr;
+    ThumbSymbolizerInfo.selector_name = nullptr;
+    ThumbSymbolizerInfo.method = nullptr;
+    ThumbSymbolizerInfo.demangled_name = nullptr;
+    ThumbSymbolizerInfo.bindtable = nullptr;
+    ThumbSymbolizerInfo.adrp_addr = 0;
+    ThumbSymbolizerInfo.adrp_inst = 0;
+
     // Disassemble symbol by symbol.
     for (unsigned SymIdx = 0; SymIdx != Symbols.size(); SymIdx++) {
       StringRef SymName;
@@ -349,15 +1841,13 @@
         continue;
 
       // Make sure the symbol is defined in this section.
-      bool containsSym = false;
-      Sections[SectIdx].containsSymbol(Symbols[SymIdx], containsSym);
+      bool containsSym = Sections[SectIdx].containsSymbol(Symbols[SymIdx]);
       if (!containsSym)
         continue;
 
       // Start at the address of the symbol relative to the section's address.
-      uint64_t SectionAddress = 0;
       uint64_t Start = 0;
-      Sections[SectIdx].getAddress(SectionAddress);
+      uint64_t SectionAddress = Sections[SectIdx].getAddress();
       Symbols[SymIdx].getAddress(Start);
       Start -= SectionAddress;
 
@@ -365,13 +1855,13 @@
       // the end of the section.
       bool containsNextSym = false;
       uint64_t NextSym = 0;
-      uint64_t NextSymIdx = SymIdx+1;
+      uint64_t NextSymIdx = SymIdx + 1;
       while (Symbols.size() > NextSymIdx) {
         SymbolRef::Type NextSymType;
         Symbols[NextSymIdx].getType(NextSymType);
         if (NextSymType == SymbolRef::ST_Function) {
-          Sections[SectIdx].containsSymbol(Symbols[NextSymIdx],
-                                           containsNextSym);
+          containsNextSym =
+              Sections[SectIdx].containsSymbol(Symbols[NextSymIdx]);
           Symbols[NextSymIdx].getAddress(NextSym);
           NextSym -= SectionAddress;
           break;
@@ -379,48 +1869,81 @@
         ++NextSymIdx;
       }
 
-      uint64_t SectSize;
-      Sections[SectIdx].getSize(SectSize);
-      uint64_t End = containsNextSym ?  NextSym : SectSize;
+      uint64_t SectSize = Sections[SectIdx].getSize();
+      uint64_t End = containsNextSym ? NextSym : SectSize;
       uint64_t Size;
 
       symbolTableWorked = true;
 
+      DataRefImpl Symb = Symbols[SymIdx].getRawDataRefImpl();
+      bool isThumb =
+          (MachOOF->getSymbolFlags(Symb) & SymbolRef::SF_Thumb) && ThumbTarget;
+
       outs() << SymName << ":\n";
       DILineInfo lastLine;
       for (uint64_t Index = Start; Index < End; Index += Size) {
         MCInst Inst;
 
-        uint64_t SectAddress = 0;
-        Sections[SectIdx].getAddress(SectAddress);
-        outs() << format("%8" PRIx64 ":\t", SectAddress + Index);
+        uint64_t PC = SectAddress + Index;
+        if (FullLeadingAddr) {
+          if (MachOOF->is64Bit())
+            outs() << format("%016" PRIx64, PC);
+          else
+            outs() << format("%08" PRIx64, PC);
+        } else {
+          outs() << format("%8" PRIx64 ":", PC);
+        }
+        if (!NoShowRawInsn)
+          outs() << "\t";
 
         // Check the data in code table here to see if this is data not an
         // instruction to be disassembled.
         DiceTable Dice;
-        Dice.push_back(std::make_pair(SectAddress + Index, DiceRef()));
-        dice_table_iterator DTI = std::search(Dices.begin(), Dices.end(),
-                                              Dice.begin(), Dice.end(),
-                                              compareDiceTableEntries);
-        if (DTI != Dices.end()){
+        Dice.push_back(std::make_pair(PC, DiceRef()));
+        dice_table_iterator DTI =
+            std::search(Dices.begin(), Dices.end(), Dice.begin(), Dice.end(),
+                        compareDiceTableEntries);
+        if (DTI != Dices.end()) {
           uint16_t Length;
           DTI->second.getLength(Length);
-          DumpBytes(StringRef(Bytes.data() + Index, Length));
           uint16_t Kind;
           DTI->second.getKind(Kind);
-          DumpDataInCode(Bytes.data() + Index, Length, Kind);
+          Size = DumpDataInCode(reinterpret_cast<const char *>(Bytes.data()) +
+                                    Index,
+                                Length, Kind);
+          if ((Kind == MachO::DICE_KIND_JUMP_TABLE8) &&
+              (PC == (DTI->first + Length - 1)) && (Length & 1))
+            Size++;
           continue;
         }
 
-        if (DisAsm->getInstruction(Inst, Size, memoryObject, Index,
-                                   DebugOut, nulls())) {
-          DumpBytes(StringRef(Bytes.data() + Index, Size));
-          IP->printInst(&Inst, outs(), "");
+        SmallVector<char, 64> AnnotationsBytes;
+        raw_svector_ostream Annotations(AnnotationsBytes);
+
+        bool gotInst;
+        if (isThumb)
+          gotInst = ThumbDisAsm->getInstruction(Inst, Size, Bytes.slice(Index),
+                                                PC, DebugOut, Annotations);
+        else
+          gotInst = DisAsm->getInstruction(Inst, Size, Bytes.slice(Index), PC,
+                                           DebugOut, Annotations);
+        if (gotInst) {
+          if (!NoShowRawInsn) {
+            DumpBytes(StringRef(
+                reinterpret_cast<const char *>(Bytes.data()) + Index, Size));
+          }
+          formatted_raw_ostream FormattedOS(outs());
+          Annotations.flush();
+          StringRef AnnotationsStr = Annotations.str();
+          if (isThumb)
+            ThumbIP->printInst(&Inst, FormattedOS, AnnotationsStr);
+          else
+            IP->printInst(&Inst, FormattedOS, AnnotationsStr);
+          emitComments(CommentStream, CommentsToEmit, FormattedOS, *AsmInfo);
 
           // Print debug info.
           if (diContext) {
-            DILineInfo dli =
-              diContext->getLineInfoForAddress(SectAddress + Index);
+            DILineInfo dli = diContext->getLineInfoForAddress(PC);
             // Print valid line info if it changed.
             if (dli != lastLine && dli.Line != 0)
               outs() << "\t## " << dli.FileName << ':' << dli.Line << ':'
@@ -429,34 +1952,1924 @@
           }
           outs() << "\n";
         } else {
-          errs() << "llvm-objdump: warning: invalid instruction encoding\n";
-          if (Size == 0)
-            Size = 1; // skip illegible bytes
+          unsigned int Arch = MachOOF->getArch();
+          if (Arch == Triple::x86_64 || Arch == Triple::x86) {
+            outs() << format("\t.byte 0x%02x #bad opcode\n",
+                             *(Bytes.data() + Index) & 0xff);
+            Size = 1; // skip exactly one illegible byte and move on.
+          } else if (Arch == Triple::aarch64) {
+            uint32_t opcode = (*(Bytes.data() + Index) & 0xff) |
+                              (*(Bytes.data() + Index + 1) & 0xff) << 8 |
+                              (*(Bytes.data() + Index + 2) & 0xff) << 16 |
+                              (*(Bytes.data() + Index + 3) & 0xff) << 24;
+            outs() << format("\t.long\t0x%08x\n", opcode);
+            Size = 4;
+          } else {
+            errs() << "llvm-objdump: warning: invalid instruction encoding\n";
+            if (Size == 0)
+              Size = 1; // skip illegible bytes
+          }
         }
       }
     }
     if (!symbolTableWorked) {
-      // Reading the symbol table didn't work, disassemble the whole section. 
-      uint64_t SectAddress;
-      Sections[SectIdx].getAddress(SectAddress);
-      uint64_t SectSize;
-      Sections[SectIdx].getSize(SectSize);
+      // Reading the symbol table didn't work, disassemble the whole section.
+      uint64_t SectAddress = Sections[SectIdx].getAddress();
+      uint64_t SectSize = Sections[SectIdx].getSize();
       uint64_t InstSize;
       for (uint64_t Index = 0; Index < SectSize; Index += InstSize) {
         MCInst Inst;
 
-        if (DisAsm->getInstruction(Inst, InstSize, memoryObject, Index,
+        uint64_t PC = SectAddress + Index;
+        if (DisAsm->getInstruction(Inst, InstSize, Bytes.slice(Index), PC,
                                    DebugOut, nulls())) {
-          outs() << format("%8" PRIx64 ":\t", SectAddress + Index);
-          DumpBytes(StringRef(Bytes.data() + Index, InstSize));
+          if (FullLeadingAddr) {
+            if (MachOOF->is64Bit())
+              outs() << format("%016" PRIx64, PC);
+            else
+              outs() << format("%08" PRIx64, PC);
+          } else {
+            outs() << format("%8" PRIx64 ":", PC);
+          }
+          if (!NoShowRawInsn) {
+            outs() << "\t";
+            DumpBytes(
+                StringRef(reinterpret_cast<const char *>(Bytes.data()) + Index,
+                          InstSize));
+          }
           IP->printInst(&Inst, outs(), "");
           outs() << "\n";
         } else {
-          errs() << "llvm-objdump: warning: invalid instruction encoding\n";
-          if (InstSize == 0)
-            InstSize = 1; // skip illegible bytes
+          unsigned int Arch = MachOOF->getArch();
+          if (Arch == Triple::x86_64 || Arch == Triple::x86) {
+            outs() << format("\t.byte 0x%02x #bad opcode\n",
+                             *(Bytes.data() + Index) & 0xff);
+            InstSize = 1; // skip exactly one illegible byte and move on.
+          } else {
+            errs() << "llvm-objdump: warning: invalid instruction encoding\n";
+            if (InstSize == 0)
+              InstSize = 1; // skip illegible bytes
+          }
         }
       }
     }
+    if (SymbolizerInfo.method != nullptr)
+      free(SymbolizerInfo.method);
+    if (SymbolizerInfo.demangled_name != nullptr)
+      free(SymbolizerInfo.demangled_name);
+    if (SymbolizerInfo.bindtable != nullptr)
+      delete SymbolizerInfo.bindtable;
+    if (ThumbSymbolizerInfo.method != nullptr)
+      free(ThumbSymbolizerInfo.method);
+    if (ThumbSymbolizerInfo.demangled_name != nullptr)
+      free(ThumbSymbolizerInfo.demangled_name);
+    if (ThumbSymbolizerInfo.bindtable != nullptr)
+      delete ThumbSymbolizerInfo.bindtable;
   }
 }
+
+//===----------------------------------------------------------------------===//
+// __compact_unwind section dumping
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Read one unaligned little-endian T from Buf and advance Buf past it.
+template <typename T> static uint64_t readNext(const char *&Buf) {
+  using llvm::support::little;
+  using llvm::support::unaligned;
+
+  uint64_t Val = support::endian::read<T, little, unaligned>(Buf);
+  Buf += sizeof(T);
+  return Val; // Always returned as uint64_t, whatever the width of T.
+}
+/// One decoded __compact_unwind entry plus relocations for its address fields.
+struct CompactUnwindEntry {
+  uint32_t OffsetInSection; // Offset of this entry within Contents.
+  // Raw field values, declared in the same order read() decodes them.
+  uint64_t FunctionAddr;
+  uint32_t Length;
+  uint32_t CompactEncoding;
+  uint64_t PersonalityAddr;
+  uint64_t LSDAAddr;
+  // Relocations for the address fields; not filled in by the constructor.
+  RelocationRef FunctionReloc;
+  RelocationRef PersonalityReloc;
+  RelocationRef LSDAReloc;
+  /// Decode the entry at Contents[Offset]; Is64 picks 8- or 4-byte addresses.
+  CompactUnwindEntry(StringRef Contents, unsigned Offset, bool Is64)
+      : OffsetInSection(Offset) {
+    if (Is64)
+      read<uint64_t>(Contents.data() + Offset);
+    else
+      read<uint32_t>(Contents.data() + Offset);
+  }
+
+private:
+  template <typename UIntPtr> void read(const char *Buf) {
+    FunctionAddr = readNext<UIntPtr>(Buf); // address-sized
+    Length = readNext<uint32_t>(Buf);      // always 32 bits
+    CompactEncoding = readNext<uint32_t>(Buf);
+    PersonalityAddr = readNext<UIntPtr>(Buf);
+    LSDAAddr = readNext<UIntPtr>(Buf);
+  }
+};
+} // end anonymous namespace
+
+/// Given a relocation from __compact_unwind, consisting of the RelocationRef
+/// and data being relocated, determine the best base Name and Addend to use for
+/// display purposes.
+///
+/// 1. An Extern relocation will directly reference a symbol (and the data is
+///    then already an addend), so use that.
+/// 2. Otherwise the data is an offset in the object file's layout; try to find
+///    a symbol before it in the same section, and use the offset from there.
+/// 3. Finally, if all that fails, fall back to an offset from the start of the
+///    referenced section.
+static void findUnwindRelocNameAddend(const MachOObjectFile *Obj,
+                                      std::map<uint64_t, SymbolRef> &Symbols,
+                                      const RelocationRef &Reloc, uint64_t Addr,
+                                      StringRef &Name, uint64_t &Addend) {
+  if (Reloc.getSymbol() != Obj->symbol_end()) {
+    Reloc.getSymbol()->getName(Name);
+    Addend = Addr;
+    return;
+  }
+  // The relocation names no symbol: work from the section it references.
+  auto RE = Obj->getRelocation(Reloc.getRawDataRefImpl());
+  SectionRef RelocSection = Obj->getRelocationSection(RE);
+
+  uint64_t SectionAddr = RelocSection.getAddress();
+  // upper_bound yields the first symbol whose address is greater than Addr.
+  auto Sym = Symbols.upper_bound(Addr);
+  if (Sym == Symbols.begin()) {
+    // The first symbol in the object is after this reference; the best we can
+    // do is section-relative notation.
+    RelocSection.getName(Name);
+    Addend = Addr - SectionAddr;
+    return;
+  }
+
+  // Go back one so that SymbolAddress <= Addr.
+  --Sym;
+
+  section_iterator SymSection = Obj->section_end();
+  Sym->second.getSection(SymSection);
+  if (RelocSection == *SymSection) {
+    // There's a valid symbol in the same section before this reference.
+    Sym->second.getName(Name);
+    Addend = Addr - Sym->first;
+    return;
+  }
+
+  // There is a symbol before this reference, but it's in a different
+  // section. Probably not helpful to mention it, so use the section name.
+  RelocSection.getName(Name);
+  Addend = Addr - SectionAddr;
+}
+
+static void printUnwindRelocDest(const MachOObjectFile *Obj,
+                                 std::map<uint64_t, SymbolRef> &Symbols,
+                                 const RelocationRef &Reloc, uint64_t Addr) {
+  StringRef Name;
+  uint64_t Addend;
+  // Print nothing when the relocation is not attached to an object file.
+  if (!Reloc.getObjectFile())
+    return;
+  // Resolve Addr to a base name (symbol or section) and an addend.
+  findUnwindRelocNameAddend(Obj, Symbols, Reloc, Addr, Name, Addend);
+
+  outs() << Name;
+  if (Addend) // A zero addend is not printed.
+    outs() << " + " << format("0x%" PRIx64, Addend);
+}
+
+static void
+printMachOCompactUnwindSection(const MachOObjectFile *Obj,
+                               std::map<uint64_t, SymbolRef> &Symbols,
+                               const SectionRef &CompactUnwind) {
+  // Decode every entry of a __compact_unwind section and print it to outs().
+  assert(Obj->isLittleEndian() &&
+         "There should not be a big-endian .o with __compact_unwind");
+  // An entry is three pointer-sized fields plus two 32-bit fields.
+  bool Is64 = Obj->is64Bit();
+  uint32_t PointerSize = Is64 ? sizeof(uint64_t) : sizeof(uint32_t);
+  uint32_t EntrySize = 3 * PointerSize + 2 * sizeof(uint32_t);
+
+  StringRef Contents;
+  CompactUnwind.getContents(Contents);
+
+  SmallVector<CompactUnwindEntry, 4> CompactUnwinds;
+
+  // First populate the initial raw offsets, encodings and so on from the entry.
+  for (unsigned Offset = 0; Offset < Contents.size(); Offset += EntrySize) {
+    CompactUnwindEntry Entry(Contents, Offset, Is64); // not .data(): no strlen
+    CompactUnwinds.push_back(Entry);
+  }
+
+  // Next we need to look at the relocations to find out what objects are
+  // actually being referred to.
+  for (const RelocationRef &Reloc : CompactUnwind.relocations()) {
+    uint64_t RelocAddress;
+    Reloc.getOffset(RelocAddress);
+    // Split the relocation offset into an entry index and an offset within it.
+    uint32_t EntryIdx = RelocAddress / EntrySize;
+    uint32_t OffsetInEntry = RelocAddress - EntryIdx * EntrySize;
+    CompactUnwindEntry &Entry = CompactUnwinds[EntryIdx];
+
+    if (OffsetInEntry == 0)
+      Entry.FunctionReloc = Reloc;
+    else if (OffsetInEntry == PointerSize + 2 * sizeof(uint32_t))
+      Entry.PersonalityReloc = Reloc;
+    else if (OffsetInEntry == 2 * PointerSize + 2 * sizeof(uint32_t))
+      Entry.LSDAReloc = Reloc;
+    else
+      llvm_unreachable("Unexpected relocation in __compact_unwind section");
+  }
+
+  // Finally, we're ready to print the data we've gathered.
+  outs() << "Contents of __compact_unwind section:\n";
+  for (auto &Entry : CompactUnwinds) {
+    outs() << "  Entry at offset "
+           << format("0x%" PRIx32, Entry.OffsetInSection) << ":\n";
+
+    // 1. Start of the region this entry applies to.
+    outs() << "    start:                " << format("0x%" PRIx64,
+                                                     Entry.FunctionAddr) << ' ';
+    printUnwindRelocDest(Obj, Symbols, Entry.FunctionReloc, Entry.FunctionAddr);
+    outs() << '\n';
+
+    // 2. Length of the region this entry applies to.
+    outs() << "    length:               " << format("0x%" PRIx32, Entry.Length)
+           << '\n';
+    // 3. The 32-bit compact encoding.
+    outs() << "    compact encoding:     "
+           << format("0x%08" PRIx32, Entry.CompactEncoding) << '\n';
+
+    // 4. The personality function, if present.
+    if (Entry.PersonalityReloc.getObjectFile()) {
+      outs() << "    personality function: "
+             << format("0x%" PRIx64, Entry.PersonalityAddr) << ' ';
+      printUnwindRelocDest(Obj, Symbols, Entry.PersonalityReloc,
+                           Entry.PersonalityAddr);
+      outs() << '\n';
+    }
+
+    // 5. This entry's language-specific data area.
+    if (Entry.LSDAReloc.getObjectFile()) {
+      outs() << "    LSDA:                 " << format("0x%" PRIx64,
+                                                       Entry.LSDAAddr) << ' ';
+      printUnwindRelocDest(Obj, Symbols, Entry.LSDAReloc, Entry.LSDAAddr);
+      outs() << '\n';
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// __unwind_info section dumping
+//===----------------------------------------------------------------------===//
+
+static void printRegularSecondLevelUnwindPage(const char *PageStart) {
+  const char *Pos = PageStart;
+  uint32_t Kind = readNext<uint32_t>(Pos);
+  (void)Kind; // Kind is only referenced by the assert.
+  assert(Kind == 2 && "kind for a regular 2nd level index should be 2");
+  // The header then gives the entries' offset from PageStart and their count.
+  uint16_t EntriesStart = readNext<uint16_t>(Pos);
+  uint16_t NumEntries = readNext<uint16_t>(Pos);
+  // Each entry is a 32-bit function offset followed by a 32-bit encoding.
+  Pos = PageStart + EntriesStart;
+  for (unsigned i = 0; i < NumEntries; ++i) {
+    uint32_t FunctionOffset = readNext<uint32_t>(Pos);
+    uint32_t Encoding = readNext<uint32_t>(Pos);
+
+    outs() << "      [" << i << "]: "
+           << "function offset=" << format("0x%08" PRIx32, FunctionOffset)
+           << ", "
+           << "encoding=" << format("0x%08" PRIx32, Encoding) << '\n';
+  }
+}
+
+static void printCompressedSecondLevelUnwindPage(
+    const char *PageStart, uint32_t FunctionBase,
+    const SmallVectorImpl<uint32_t> &CommonEncodings) {
+  const char *Pos = PageStart;
+  uint32_t Kind = readNext<uint32_t>(Pos);
+  (void)Kind; // Kind is only referenced by the assert.
+  assert(Kind == 3 && "kind for a compressed 2nd level index should be 3");
+
+  uint16_t EntriesStart = readNext<uint16_t>(Pos);
+  uint16_t NumEntries = readNext<uint16_t>(Pos);
+  // Offset of the page-local encodings; the field after it is read and dropped.
+  uint16_t EncodingsStart = readNext<uint16_t>(Pos);
+  readNext<uint16_t>(Pos); // Value unused; only advances Pos.
+  const auto *PageEncodings = reinterpret_cast<const support::ulittle32_t *>(
+      PageStart + EncodingsStart);
+  // An entry packs an encoding index (top 8 bits) above a 24-bit offset.
+  Pos = PageStart + EntriesStart;
+  for (unsigned i = 0; i < NumEntries; ++i) {
+    uint32_t Entry = readNext<uint32_t>(Pos);
+    uint32_t FunctionOffset = FunctionBase + (Entry & 0xffffff);
+    uint32_t EncodingIdx = Entry >> 24;
+    // NOTE(review): the page-local index below is not bounds-checked.
+    uint32_t Encoding; // Common table first, then the page-local table.
+    if (EncodingIdx < CommonEncodings.size())
+      Encoding = CommonEncodings[EncodingIdx];
+    else
+      Encoding = PageEncodings[EncodingIdx - CommonEncodings.size()];
+
+    outs() << "      [" << i << "]: "
+           << "function offset=" << format("0x%08" PRIx32, FunctionOffset)
+           << ", "
+           << "encoding[" << EncodingIdx
+           << "]=" << format("0x%08" PRIx32, Encoding) << '\n';
+  }
+}
+
+static void printMachOUnwindInfoSection(const MachOObjectFile *Obj,
+                                        std::map<uint64_t, SymbolRef> &Symbols,
+                                        const SectionRef &UnwindInfo) {
+
+  assert(Obj->isLittleEndian() &&
+         "There should not be a big-endian .o with __unwind_info");
+
+  outs() << "Contents of __unwind_info section:\n";
+
+  StringRef Contents;
+  UnwindInfo.getContents(Contents);
+  const char *Pos = Contents.data();
+
+  //===----------------------------------
+  // Section header
+  //===----------------------------------
+
+  uint32_t Version = readNext<uint32_t>(Pos);
+  outs() << "  Version:                                   "
+         << format("0x%" PRIx32, Version) << '\n';
+  assert(Version == 1 && "only understand version 1");
+
+  uint32_t CommonEncodingsStart = readNext<uint32_t>(Pos);
+  outs() << "  Common encodings array section offset:     "
+         << format("0x%" PRIx32, CommonEncodingsStart) << '\n';
+  uint32_t NumCommonEncodings = readNext<uint32_t>(Pos);
+  outs() << "  Number of common encodings in array:       "
+         << format("0x%" PRIx32, NumCommonEncodings) << '\n';
+
+  uint32_t PersonalitiesStart = readNext<uint32_t>(Pos);
+  outs() << "  Personality function array section offset: "
+         << format("0x%" PRIx32, PersonalitiesStart) << '\n';
+  uint32_t NumPersonalities = readNext<uint32_t>(Pos);
+  outs() << "  Number of personality functions in array:  "
+         << format("0x%" PRIx32, NumPersonalities) << '\n';
+
+  uint32_t IndicesStart = readNext<uint32_t>(Pos);
+  outs() << "  Index array section offset:                "
+         << format("0x%" PRIx32, IndicesStart) << '\n';
+  uint32_t NumIndices = readNext<uint32_t>(Pos);
+  outs() << "  Number of indices in array:                "
+         << format("0x%" PRIx32, NumIndices) << '\n';
+
+  //===----------------------------------
+  // A shared list of common encodings
+  //===----------------------------------
+
+  // These occupy indices in the range [0, N] whenever an encoding is referenced
+  // from a compressed 2nd level index table. In practice the linker only
+  // creates ~128 of these, so that indices are available to embed encodings in
+  // the 2nd level index.
+
+  SmallVector<uint32_t, 64> CommonEncodings;
+  outs() << "  Common encodings: (count = " << NumCommonEncodings << ")\n";
+  Pos = Contents.data() + CommonEncodingsStart;
+  for (unsigned i = 0; i < NumCommonEncodings; ++i) {
+    uint32_t Encoding = readNext<uint32_t>(Pos);
+    CommonEncodings.push_back(Encoding);
+
+    outs() << "    encoding[" << i << "]: " << format("0x%08" PRIx32, Encoding)
+           << '\n';
+  }
+
+  //===----------------------------------
+  // Personality functions used in this executable
+  //===----------------------------------
+
+  // There should be only a handful of these (one per source language,
+  // roughly). Particularly since they only get 2 bits in the compact encoding.
+
+  outs() << "  Personality functions: (count = " << NumPersonalities << ")\n";
+  Pos = Contents.data() + PersonalitiesStart;
+  for (unsigned i = 0; i < NumPersonalities; ++i) {
+    uint32_t PersonalityFn = readNext<uint32_t>(Pos);
+    outs() << "    personality[" << i + 1
+           << "]: " << format("0x%08" PRIx32, PersonalityFn) << '\n';
+  }
+
+  //===----------------------------------
+  // The level 1 index entries
+  //===----------------------------------
+
+  // These specify an approximate place to start searching for the more detailed
+  // information, sorted by PC.
+
+  struct IndexEntry {
+    uint32_t FunctionOffset;
+    uint32_t SecondLevelPageStart;
+    uint32_t LSDAStart;
+  };
+
+  SmallVector<IndexEntry, 4> IndexEntries;
+
+  outs() << "  Top level indices: (count = " << NumIndices << ")\n";
+  Pos = Contents.data() + IndicesStart;
+  for (unsigned i = 0; i < NumIndices; ++i) {
+    IndexEntry Entry;
+
+    Entry.FunctionOffset = readNext<uint32_t>(Pos);
+    Entry.SecondLevelPageStart = readNext<uint32_t>(Pos);
+    Entry.LSDAStart = readNext<uint32_t>(Pos);
+    IndexEntries.push_back(Entry);
+
+    outs() << "    [" << i << "]: "
+           << "function offset=" << format("0x%08" PRIx32, Entry.FunctionOffset)
+           << ", "
+           << "2nd level page offset="
+           << format("0x%08" PRIx32, Entry.SecondLevelPageStart) << ", "
+           << "LSDA offset=" << format("0x%08" PRIx32, Entry.LSDAStart) << '\n';
+  }
+
+  //===----------------------------------
+  // Next come the LSDA tables
+  //===----------------------------------
+
+  // The LSDA layout is rather implicit: it's a contiguous array of entries from
+  // the first top-level index's LSDAOffset to the last (sentinel).
+
+  outs() << "  LSDA descriptors:\n";
+  Pos = Contents.data() + IndexEntries[0].LSDAStart;
+  int NumLSDAs = (IndexEntries.back().LSDAStart - IndexEntries[0].LSDAStart) /
+                 (2 * sizeof(uint32_t));
+  for (int i = 0; i < NumLSDAs; ++i) {
+    uint32_t FunctionOffset = readNext<uint32_t>(Pos);
+    uint32_t LSDAOffset = readNext<uint32_t>(Pos);
+    outs() << "    [" << i << "]: "
+           << "function offset=" << format("0x%08" PRIx32, FunctionOffset)
+           << ", "
+           << "LSDA offset=" << format("0x%08" PRIx32, LSDAOffset) << '\n';
+  }
+
+  //===----------------------------------
+  // Finally, the 2nd level indices
+  //===----------------------------------
+
+  // Generally these are 4K in size, and have 2 possible forms:
+  //   + Regular stores up to 511 entries with disparate encodings
+  //   + Compressed stores up to 1021 entries if few enough compact encoding
+  //     values are used.
+  outs() << "  Second level indices:\n";
+  for (unsigned i = 0; i < IndexEntries.size() - 1; ++i) {
+    // The final sentinel top-level index has no associated 2nd level page
+    if (IndexEntries[i].SecondLevelPageStart == 0)
+      break;
+
+    outs() << "    Second level index[" << i << "]: "
+           << "offset in section="
+           << format("0x%08" PRIx32, IndexEntries[i].SecondLevelPageStart)
+           << ", "
+           << "base function offset="
+           << format("0x%08" PRIx32, IndexEntries[i].FunctionOffset) << '\n';
+
+    Pos = Contents.data() + IndexEntries[i].SecondLevelPageStart;
+    uint32_t Kind = *reinterpret_cast<const support::ulittle32_t *>(Pos);
+    if (Kind == 2)
+      printRegularSecondLevelUnwindPage(Pos);
+    else if (Kind == 3)
+      printCompressedSecondLevelUnwindPage(Pos, IndexEntries[i].FunctionOffset,
+                                           CommonEncodings);
+    else
+      llvm_unreachable("Do not know how to print this kind of 2nd level page");
+  }
+}
+
+// Dump the unwind-related sections of a Mach-O object file. A map from
+// address to symbol is built first and handed to the per-section printers,
+// which are invoked for "__compact_unwind" and "__unwind_info"; an
+// "__eh_frame" section only produces a warning.
+void llvm::printMachOUnwindInfo(const MachOObjectFile *Obj) {
+  std::map<uint64_t, SymbolRef> SymbolsByAddress;
+  for (const SymbolRef &Sym : Obj->symbols()) {
+    // Symbols that do not live in a section (undefined or absolute) cannot
+    // take part in the unwind-info lookup, so they are left out of the map.
+    section_iterator Owner = Obj->section_end();
+    Sym.getSection(Owner);
+    if (Owner == Obj->section_end())
+      continue;
+
+    uint64_t Address;
+    Sym.getAddress(Address);
+    SymbolsByAddress.insert(std::make_pair(Address, Sym));
+  }
+
+  for (const SectionRef &Sec : Obj->sections()) {
+    StringRef Name;
+    Sec.getName(Name);
+
+    if (Name == "__compact_unwind") {
+      printMachOCompactUnwindSection(Obj, SymbolsByAddress, Sec);
+      continue;
+    }
+    if (Name == "__unwind_info") {
+      printMachOUnwindInfoSection(Obj, SymbolsByAddress, Sec);
+      continue;
+    }
+    if (Name == "__eh_frame")
+      outs() << "llvm-objdump: warning: unhandled __eh_frame section\n";
+  }
+}
+
+// Print the fields of a Mach-O header as a single table row beneath a fixed
+// column heading.
+//
+// With `verbose` set, the magic, cputype, cpusubtype, capability bits,
+// filetype and flags are printed symbolically where a name is known and
+// numerically otherwise. Without it every field is printed as a raw number.
+static void PrintMachHeader(uint32_t magic, uint32_t cputype,
+                            uint32_t cpusubtype, uint32_t filetype,
+                            uint32_t ncmds, uint32_t sizeofcmds, uint32_t flags,
+                            bool verbose) {
+  outs() << "Mach header\n";
+  outs() << "      magic cputype cpusubtype  caps    filetype ncmds "
+            "sizeofcmds      flags\n";
+  if (verbose) {
+    if (magic == MachO::MH_MAGIC)
+      outs() << "   MH_MAGIC";
+    else if (magic == MachO::MH_MAGIC_64)
+      outs() << "MH_MAGIC_64";
+    else
+      outs() << format(" 0x%08" PRIx32, magic);
+    // The top byte of cpusubtype holds capability bits; mask them off before
+    // matching the subtype itself.
+    switch (cputype) {
+    case MachO::CPU_TYPE_I386:
+      outs() << "    I386";
+      switch (cpusubtype & ~MachO::CPU_SUBTYPE_MASK) {
+      case MachO::CPU_SUBTYPE_I386_ALL:
+        outs() << "        ALL";
+        break;
+      default:
+        outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+        break;
+      }
+      break;
+    case MachO::CPU_TYPE_X86_64:
+      outs() << "  X86_64";
+      // The subtype must be matched in its own nested switch; as labels of
+      // the cputype switch these cases were compared against cputype.
+      switch (cpusubtype & ~MachO::CPU_SUBTYPE_MASK) {
+      case MachO::CPU_SUBTYPE_X86_64_ALL:
+        outs() << "        ALL";
+        break;
+      case MachO::CPU_SUBTYPE_X86_64_H:
+        outs() << "    Haswell";
+        break;
+      default:
+        outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+        break;
+      }
+      break;
+    case MachO::CPU_TYPE_ARM:
+      outs() << "     ARM";
+      switch (cpusubtype & ~MachO::CPU_SUBTYPE_MASK) {
+      case MachO::CPU_SUBTYPE_ARM_ALL:
+        outs() << "        ALL";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_V4T:
+        outs() << "        V4T";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_V5TEJ:
+        outs() << "      V5TEJ";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_XSCALE:
+        outs() << "     XSCALE";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_V6:
+        outs() << "         V6";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_V6M:
+        outs() << "        V6M";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_V7:
+        outs() << "         V7";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_V7EM:
+        outs() << "       V7EM";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_V7K:
+        outs() << "        V7K";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_V7M:
+        outs() << "        V7M";
+        break;
+      case MachO::CPU_SUBTYPE_ARM_V7S:
+        outs() << "        V7S";
+        break;
+      default:
+        outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+        break;
+      }
+      break;
+    case MachO::CPU_TYPE_ARM64:
+      outs() << "   ARM64";
+      switch (cpusubtype & ~MachO::CPU_SUBTYPE_MASK) {
+      case MachO::CPU_SUBTYPE_ARM64_ALL:
+        outs() << "        ALL";
+        break;
+      default:
+        outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+        break;
+      }
+      break;
+    case MachO::CPU_TYPE_POWERPC:
+      outs() << "     PPC";
+      switch (cpusubtype & ~MachO::CPU_SUBTYPE_MASK) {
+      case MachO::CPU_SUBTYPE_POWERPC_ALL:
+        outs() << "        ALL";
+        break;
+      default:
+        outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+        break;
+      }
+      break;
+    case MachO::CPU_TYPE_POWERPC64:
+      outs() << "   PPC64";
+      switch (cpusubtype & ~MachO::CPU_SUBTYPE_MASK) {
+      case MachO::CPU_SUBTYPE_POWERPC_ALL:
+        outs() << "        ALL";
+        break;
+      default:
+        outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+        break;
+      }
+      break;
+    default:
+      // Unknown CPU type: still fill both columns so the rest of the row
+      // stays aligned with the heading.
+      outs() << format(" %7d", cputype);
+      outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+      break;
+    }
+    if ((cpusubtype & MachO::CPU_SUBTYPE_MASK) == MachO::CPU_SUBTYPE_LIB64) {
+      outs() << " LIB64";
+    } else {
+      outs() << format("  0x%02" PRIx32,
+                       (cpusubtype & MachO::CPU_SUBTYPE_MASK) >> 24);
+    }
+    switch (filetype) {
+    case MachO::MH_OBJECT:
+      outs() << "      OBJECT";
+      break;
+    case MachO::MH_EXECUTE:
+      outs() << "     EXECUTE";
+      break;
+    case MachO::MH_FVMLIB:
+      outs() << "      FVMLIB";
+      break;
+    case MachO::MH_CORE:
+      outs() << "        CORE";
+      break;
+    case MachO::MH_PRELOAD:
+      outs() << "     PRELOAD";
+      break;
+    case MachO::MH_DYLIB:
+      outs() << "       DYLIB";
+      break;
+    case MachO::MH_DYLIB_STUB:
+      outs() << "  DYLIB_STUB";
+      break;
+    case MachO::MH_DYLINKER:
+      outs() << "    DYLINKER";
+      break;
+    case MachO::MH_BUNDLE:
+      outs() << "      BUNDLE";
+      break;
+    case MachO::MH_DSYM:
+      outs() << "        DSYM";
+      break;
+    case MachO::MH_KEXT_BUNDLE:
+      outs() << "  KEXTBUNDLE";
+      break;
+    default:
+      outs() << format("  %10u", filetype);
+      break;
+    }
+    outs() << format(" %5u", ncmds);
+    outs() << format(" %10u", sizeofcmds);
+
+    // Known header flags in the order they are printed. Each name carries its
+    // own leading padding; the first entry is padded to start the column.
+    static const struct {
+      uint32_t Mask;
+      const char *Name;
+    } KnownFlags[] = {
+        {MachO::MH_NOUNDEFS, "   NOUNDEFS"},
+        {MachO::MH_INCRLINK, " INCRLINK"},
+        {MachO::MH_DYLDLINK, " DYLDLINK"},
+        {MachO::MH_BINDATLOAD, " BINDATLOAD"},
+        {MachO::MH_PREBOUND, " PREBOUND"},
+        {MachO::MH_SPLIT_SEGS, " SPLIT_SEGS"},
+        {MachO::MH_LAZY_INIT, " LAZY_INIT"},
+        {MachO::MH_TWOLEVEL, " TWOLEVEL"},
+        {MachO::MH_FORCE_FLAT, " FORCE_FLAT"},
+        {MachO::MH_NOMULTIDEFS, " NOMULTIDEFS"},
+        {MachO::MH_NOFIXPREBINDING, " NOFIXPREBINDING"},
+        {MachO::MH_PREBINDABLE, " PREBINDABLE"},
+        {MachO::MH_ALLMODSBOUND, " ALLMODSBOUND"},
+        {MachO::MH_SUBSECTIONS_VIA_SYMBOLS, " SUBSECTIONS_VIA_SYMBOLS"},
+        {MachO::MH_CANONICAL, " CANONICAL"},
+        {MachO::MH_WEAK_DEFINES, " WEAK_DEFINES"},
+        {MachO::MH_BINDS_TO_WEAK, " BINDS_TO_WEAK"},
+        {MachO::MH_ALLOW_STACK_EXECUTION, " ALLOW_STACK_EXECUTION"},
+        {MachO::MH_DEAD_STRIPPABLE_DYLIB, " DEAD_STRIPPABLE_DYLIB"},
+        {MachO::MH_PIE, " PIE"},
+        {MachO::MH_NO_REEXPORTED_DYLIBS, " NO_REEXPORTED_DYLIBS"},
+        {MachO::MH_HAS_TLV_DESCRIPTORS, " MH_HAS_TLV_DESCRIPTORS"},
+        {MachO::MH_NO_HEAP_EXECUTION, " MH_NO_HEAP_EXECUTION"},
+        {MachO::MH_APP_EXTENSION_SAFE, " APP_EXTENSION_SAFE"},
+    };
+    uint32_t f = flags;
+    for (const auto &Known : KnownFlags) {
+      if (f & Known.Mask) {
+        outs() << Known.Name;
+        f &= ~Known.Mask;
+      }
+    }
+    // Whatever bits remain have no name; print them in hex. A header with no
+    // flags at all still gets a value in the column.
+    if (f != 0 || flags == 0)
+      outs() << format(" 0x%08" PRIx32, f);
+  } else {
+    outs() << format(" 0x%08" PRIx32, magic);
+    outs() << format(" %7d", cputype);
+    outs() << format(" %10d", cpusubtype & ~MachO::CPU_SUBTYPE_MASK);
+    outs() << format("  0x%02" PRIx32,
+                     (cpusubtype & MachO::CPU_SUBTYPE_MASK) >> 24);
+    outs() << format("  %10u", filetype);
+    outs() << format(" %5u", ncmds);
+    outs() << format(" %10u", sizeofcmds);
+    outs() << format(" 0x%08" PRIx32, flags);
+  }
+  outs() << "\n";
+}
+
+// Print one LC_SEGMENT / LC_SEGMENT_64 load command.
+//
+// `cmd` selects the layout: LC_SEGMENT uses the 32-bit structures, any other
+// value is printed as LC_SEGMENT_64. `cmdsize` is checked against the size
+// implied by `nsects`, and `fileoff`/`filesize` are checked against
+// `object_size`. With `verbose` set, protections are printed as "rwx"-style
+// triples and segment flags by name; otherwise both are printed in hex.
+static void PrintSegmentCommand(uint32_t cmd, uint32_t cmdsize,
+                                StringRef SegName, uint64_t vmaddr,
+                                uint64_t vmsize, uint64_t fileoff,
+                                uint64_t filesize, uint32_t maxprot,
+                                uint32_t initprot, uint32_t nsects,
+                                uint32_t flags, uint32_t object_size,
+                                bool verbose) {
+  // Prints one protection line. `Label` already contains the padding and the
+  // trailing space that precede the value.
+  auto PrintProt = [verbose](const char *Label, uint32_t Prot) {
+    if (!verbose) {
+      outs() << Label << format("0x%08" PRIx32, Prot) << "\n";
+      return;
+    }
+    const uint32_t KnownBits =
+        MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE;
+    if ((Prot & ~KnownBits) != 0) {
+      // Bits outside r/w/x are set: show the raw value, marked with '?'.
+      outs() << Label << "?" << format("0x%08" PRIx32, Prot) << "\n";
+      return;
+    }
+    outs() << Label << ((Prot & MachO::VM_PROT_READ) ? "r" : "-")
+           << ((Prot & MachO::VM_PROT_WRITE) ? "w" : "-")
+           << ((Prot & MachO::VM_PROT_EXECUTE) ? "x\n" : "-\n");
+  };
+
+  uint64_t expected_cmdsize;
+  if (cmd == MachO::LC_SEGMENT) {
+    outs() << "      cmd LC_SEGMENT\n";
+    expected_cmdsize = nsects;
+    expected_cmdsize *= sizeof(struct MachO::section);
+    expected_cmdsize += sizeof(struct MachO::segment_command);
+  } else {
+    outs() << "      cmd LC_SEGMENT_64\n";
+    expected_cmdsize = nsects;
+    expected_cmdsize *= sizeof(struct MachO::section_64);
+    expected_cmdsize += sizeof(struct MachO::segment_command_64);
+  }
+  outs() << "  cmdsize " << cmdsize;
+  if (cmdsize != expected_cmdsize)
+    outs() << " Inconsistent size\n";
+  else
+    outs() << "\n";
+  outs() << "  segname " << SegName << "\n";
+  if (cmd == MachO::LC_SEGMENT_64) {
+    outs() << "   vmaddr " << format("0x%016" PRIx64, vmaddr) << "\n";
+    outs() << "   vmsize " << format("0x%016" PRIx64, vmsize) << "\n";
+  } else {
+    // The PRIx32 conversion consumes a 32-bit argument, so narrow the 64-bit
+    // values explicitly instead of passing a mismatched type through varargs.
+    outs() << "   vmaddr "
+           << format("0x%08" PRIx32, static_cast<uint32_t>(vmaddr)) << "\n";
+    outs() << "   vmsize "
+           << format("0x%08" PRIx32, static_cast<uint32_t>(vmsize)) << "\n";
+  }
+  outs() << "  fileoff " << fileoff;
+  if (fileoff > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  outs() << " filesize " << filesize;
+  // Same test as fileoff + filesize > object_size, written so that the sum
+  // cannot wrap around for very large values.
+  if (fileoff > object_size || filesize > object_size - fileoff)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  PrintProt("  maxprot ", maxprot);
+  PrintProt(" initprot ", initprot);
+  outs() << "   nsects " << nsects << "\n";
+  if (verbose) {
+    outs() << "    flags";
+    if (flags == 0)
+      outs() << " (none)\n";
+    else {
+      if (flags & MachO::SG_HIGHVM) {
+        outs() << " HIGHVM";
+        flags &= ~MachO::SG_HIGHVM;
+      }
+      if (flags & MachO::SG_FVMLIB) {
+        outs() << " FVMLIB";
+        flags &= ~MachO::SG_FVMLIB;
+      }
+      if (flags & MachO::SG_NORELOC) {
+        outs() << " NORELOC";
+        flags &= ~MachO::SG_NORELOC;
+      }
+      if (flags & MachO::SG_PROTECTED_VERSION_1) {
+        outs() << " PROTECTED_VERSION_1";
+        flags &= ~MachO::SG_PROTECTED_VERSION_1;
+      }
+      // Anything left over is a flag bit with no known name.
+      if (flags)
+        outs() << format(" 0x%08" PRIx32, flags) << " (unknown flags)\n";
+      else
+        outs() << "\n";
+    }
+  } else {
+    outs() << "    flags " << format("0x%" PRIx32, flags) << "\n";
+  }
+}
+
+// Print one section header belonging to a segment load command.
+//
+// `cmd` selects 32- or 64-bit formatting of addr/size. `sg_segname` is the
+// name of the enclosing segment and is compared with `segname` for anything
+// other than MH_OBJECT files. Offsets and counts are checked against
+// `object_size`. With `verbose` set, the section type and attributes are
+// printed by name; otherwise the raw flags word is printed.
+static void PrintSection(const char *sectname, const char *segname,
+                         uint64_t addr, uint64_t size, uint32_t offset,
+                         uint32_t align, uint32_t reloff, uint32_t nreloc,
+                         uint32_t flags, uint32_t reserved1, uint32_t reserved2,
+                         uint32_t cmd, const char *sg_segname,
+                         uint32_t filetype, uint32_t object_size,
+                         bool verbose) {
+  const uint32_t section_type = flags & MachO::SECTION_TYPE;
+
+  outs() << "Section\n";
+  // Names are fixed 16-byte fields that need not be NUL-terminated, hence the
+  // "%.16s" precision and the bounded comparison below.
+  outs() << "  sectname " << format("%.16s\n", sectname);
+  outs() << "   segname " << format("%.16s", segname);
+  if (filetype != MachO::MH_OBJECT && strncmp(sg_segname, segname, 16) != 0)
+    outs() << " (does not match segment)\n";
+  else
+    outs() << "\n";
+  if (cmd == MachO::LC_SEGMENT_64) {
+    outs() << "      addr " << format("0x%016" PRIx64, addr) << "\n";
+    outs() << "      size " << format("0x%016" PRIx64, size);
+  } else {
+    // The PRIx32 conversion consumes a 32-bit argument, so narrow the 64-bit
+    // values explicitly instead of passing a mismatched type through varargs.
+    outs() << "      addr "
+           << format("0x%08" PRIx32, static_cast<uint32_t>(addr)) << "\n";
+    outs() << "      size "
+           << format("0x%08" PRIx32, static_cast<uint32_t>(size));
+  }
+  // Only sections other than S_ZEROFILL are checked against the file size
+  // here; S_ZEROFILL is a type value, not a flag bit, so it has to be
+  // compared rather than masked.
+  if (section_type != MachO::S_ZEROFILL && offset + size > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  outs() << "    offset " << offset;
+  if (offset > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  // Shifting a 32-bit value by 32 or more is undefined; an alignment that
+  // large is reported as 0.
+  uint32_t align_shifted = align < 32 ? (static_cast<uint32_t>(1) << align) : 0;
+  outs() << "     align 2^" << align << " (" << align_shifted << ")\n";
+  outs() << "    reloff " << reloff;
+  if (reloff > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  outs() << "    nreloc " << nreloc;
+  // Compute the end of the relocation table in 64 bits so the product cannot
+  // wrap on hosts where size_t is 32 bits wide.
+  uint64_t reloc_end = nreloc;
+  reloc_end *= sizeof(struct MachO::relocation_info);
+  reloc_end += reloff;
+  if (reloc_end > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  if (verbose) {
+    outs() << "      type";
+    if (section_type == MachO::S_REGULAR)
+      outs() << " S_REGULAR\n";
+    else if (section_type == MachO::S_ZEROFILL)
+      outs() << " S_ZEROFILL\n";
+    else if (section_type == MachO::S_CSTRING_LITERALS)
+      outs() << " S_CSTRING_LITERALS\n";
+    else if (section_type == MachO::S_4BYTE_LITERALS)
+      outs() << " S_4BYTE_LITERALS\n";
+    else if (section_type == MachO::S_8BYTE_LITERALS)
+      outs() << " S_8BYTE_LITERALS\n";
+    else if (section_type == MachO::S_16BYTE_LITERALS)
+      outs() << " S_16BYTE_LITERALS\n";
+    else if (section_type == MachO::S_LITERAL_POINTERS)
+      outs() << " S_LITERAL_POINTERS\n";
+    else if (section_type == MachO::S_NON_LAZY_SYMBOL_POINTERS)
+      outs() << " S_NON_LAZY_SYMBOL_POINTERS\n";
+    else if (section_type == MachO::S_LAZY_SYMBOL_POINTERS)
+      outs() << " S_LAZY_SYMBOL_POINTERS\n";
+    else if (section_type == MachO::S_SYMBOL_STUBS)
+      outs() << " S_SYMBOL_STUBS\n";
+    else if (section_type == MachO::S_MOD_INIT_FUNC_POINTERS)
+      outs() << " S_MOD_INIT_FUNC_POINTERS\n";
+    else if (section_type == MachO::S_MOD_TERM_FUNC_POINTERS)
+      outs() << " S_MOD_TERM_FUNC_POINTERS\n";
+    else if (section_type == MachO::S_COALESCED)
+      outs() << " S_COALESCED\n";
+    else if (section_type == MachO::S_INTERPOSING)
+      outs() << " S_INTERPOSING\n";
+    else if (section_type == MachO::S_DTRACE_DOF)
+      outs() << " S_DTRACE_DOF\n";
+    else if (section_type == MachO::S_LAZY_DYLIB_SYMBOL_POINTERS)
+      outs() << " S_LAZY_DYLIB_SYMBOL_POINTERS\n";
+    else if (section_type == MachO::S_THREAD_LOCAL_REGULAR)
+      outs() << " S_THREAD_LOCAL_REGULAR\n";
+    else if (section_type == MachO::S_THREAD_LOCAL_ZEROFILL)
+      outs() << " S_THREAD_LOCAL_ZEROFILL\n";
+    else if (section_type == MachO::S_THREAD_LOCAL_VARIABLES)
+      outs() << " S_THREAD_LOCAL_VARIABLES\n";
+    else if (section_type == MachO::S_THREAD_LOCAL_VARIABLE_POINTERS)
+      outs() << " S_THREAD_LOCAL_VARIABLE_POINTERS\n";
+    else if (section_type == MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS)
+      outs() << " S_THREAD_LOCAL_INIT_FUNCTION_POINTERS\n";
+    else
+      // Unknown type: print it numerically, separated from the label like
+      // the named types are.
+      outs() << " " << format("0x%08" PRIx32, section_type) << "\n";
+    outs() << "attributes";
+    uint32_t section_attributes = flags & MachO::SECTION_ATTRIBUTES;
+    if (section_attributes & MachO::S_ATTR_PURE_INSTRUCTIONS)
+      outs() << " PURE_INSTRUCTIONS";
+    if (section_attributes & MachO::S_ATTR_NO_TOC)
+      outs() << " NO_TOC";
+    if (section_attributes & MachO::S_ATTR_STRIP_STATIC_SYMS)
+      outs() << " STRIP_STATIC_SYMS";
+    if (section_attributes & MachO::S_ATTR_NO_DEAD_STRIP)
+      outs() << " NO_DEAD_STRIP";
+    if (section_attributes & MachO::S_ATTR_LIVE_SUPPORT)
+      outs() << " LIVE_SUPPORT";
+    if (section_attributes & MachO::S_ATTR_SELF_MODIFYING_CODE)
+      outs() << " SELF_MODIFYING_CODE";
+    if (section_attributes & MachO::S_ATTR_DEBUG)
+      outs() << " DEBUG";
+    if (section_attributes & MachO::S_ATTR_SOME_INSTRUCTIONS)
+      outs() << " SOME_INSTRUCTIONS";
+    if (section_attributes & MachO::S_ATTR_EXT_RELOC)
+      outs() << " EXT_RELOC";
+    if (section_attributes & MachO::S_ATTR_LOC_RELOC)
+      outs() << " LOC_RELOC";
+    if (section_attributes == 0)
+      outs() << " (none)";
+    outs() << "\n";
+  } else
+    outs() << "     flags " << format("0x%08" PRIx32, flags) << "\n";
+  outs() << " reserved1 " << reserved1;
+  // For these section types reserved1 is annotated as an index into the
+  // indirect symbol table.
+  if (section_type == MachO::S_SYMBOL_STUBS ||
+      section_type == MachO::S_LAZY_SYMBOL_POINTERS ||
+      section_type == MachO::S_LAZY_DYLIB_SYMBOL_POINTERS ||
+      section_type == MachO::S_NON_LAZY_SYMBOL_POINTERS ||
+      section_type == MachO::S_THREAD_LOCAL_VARIABLE_POINTERS)
+    outs() << " (index into indirect symbol table)\n";
+  else
+    outs() << "\n";
+  outs() << " reserved2 " << reserved2;
+  if (section_type == MachO::S_SYMBOL_STUBS)
+    outs() << " (size of stubs)\n";
+  else
+    outs() << "\n";
+}
+
+// Print an LC_SYMTAB load command. Each offset, and the end of each table it
+// introduces, is compared with `object_size` and flagged when it lies beyond
+// the end of the file. `Is64Bit` selects the symbol entry size used for the
+// symbol table extent.
+static void PrintSymtabLoadCommand(MachO::symtab_command st, bool Is64Bit,
+                                   uint32_t object_size) {
+  // Finishes the current output line, noting when `End` exceeds the file.
+  auto FinishLine = [object_size](uint64_t End) {
+    outs() << (End > object_size ? " (past end of file)\n" : "\n");
+  };
+
+  outs() << "     cmd LC_SYMTAB\n";
+  outs() << " cmdsize " << st.cmdsize
+         << (st.cmdsize != sizeof(struct MachO::symtab_command)
+                 ? " Incorrect size\n"
+                 : "\n");
+
+  outs() << "  symoff " << st.symoff;
+  FinishLine(st.symoff);
+
+  outs() << "   nsyms " << st.nsyms;
+  const uint64_t EntrySize = Is64Bit ? sizeof(struct MachO::nlist_64)
+                                     : sizeof(struct MachO::nlist);
+  uint64_t SymtabEnd = st.nsyms;
+  SymtabEnd = SymtabEnd * EntrySize + st.symoff;
+  FinishLine(SymtabEnd);
+
+  outs() << "  stroff " << st.stroff;
+  FinishLine(st.stroff);
+
+  outs() << " strsize " << st.strsize;
+  uint64_t StrtabEnd = st.stroff;
+  StrtabEnd += st.strsize;
+  FinishLine(StrtabEnd);
+}
+
+// Print an LC_DYSYMTAB load command. Symbol indices and index ranges are
+// checked against `nsyms`; file offsets and the ends of the tables they
+// introduce are checked against `object_size`. `Is64Bit` selects the module
+// table entry size.
+static void PrintDysymtabLoadCommand(MachO::dysymtab_command dyst,
+                                     uint32_t nsyms, uint32_t object_size,
+                                     bool Is64Bit) {
+  // Each helper finishes the current output line with the matching warning
+  // when its argument is out of bounds.
+  auto FinishIndex = [nsyms](uint64_t Index) {
+    outs() << (Index > nsyms ? " (greater than the number of symbols)\n"
+                             : "\n");
+  };
+  auto FinishSymRange = [nsyms](uint64_t First, uint64_t Count) {
+    outs() << (First + Count > nsyms ? " (past the end of the symbol table)\n"
+                                     : "\n");
+  };
+  auto FinishFile = [object_size](uint64_t End) {
+    outs() << (End > object_size ? " (past end of file)\n" : "\n");
+  };
+  // End offset of a table of `Count` entries of `EntrySize` bytes at `Off`.
+  auto TableEnd = [](uint64_t Count, uint64_t EntrySize, uint64_t Off) {
+    return Count * EntrySize + Off;
+  };
+
+  outs() << "            cmd LC_DYSYMTAB\n";
+  outs() << "        cmdsize " << dyst.cmdsize
+         << (dyst.cmdsize != sizeof(struct MachO::dysymtab_command)
+                 ? " Incorrect size\n"
+                 : "\n");
+
+  outs() << "      ilocalsym " << dyst.ilocalsym;
+  FinishIndex(dyst.ilocalsym);
+  outs() << "      nlocalsym " << dyst.nlocalsym;
+  FinishSymRange(dyst.ilocalsym, dyst.nlocalsym);
+
+  outs() << "     iextdefsym " << dyst.iextdefsym;
+  FinishIndex(dyst.iextdefsym);
+  outs() << "     nextdefsym " << dyst.nextdefsym;
+  FinishSymRange(dyst.iextdefsym, dyst.nextdefsym);
+
+  outs() << "      iundefsym " << dyst.iundefsym;
+  FinishIndex(dyst.iundefsym);
+  outs() << "      nundefsym " << dyst.nundefsym;
+  FinishSymRange(dyst.iundefsym, dyst.nundefsym);
+
+  outs() << "         tocoff " << dyst.tocoff;
+  FinishFile(dyst.tocoff);
+  outs() << "           ntoc " << dyst.ntoc;
+  FinishFile(TableEnd(dyst.ntoc, sizeof(struct MachO::dylib_table_of_contents),
+                      dyst.tocoff));
+
+  outs() << "      modtaboff " << dyst.modtaboff;
+  FinishFile(dyst.modtaboff);
+  outs() << "        nmodtab " << dyst.nmodtab;
+  const uint64_t ModuleSize = Is64Bit ? sizeof(struct MachO::dylib_module_64)
+                                      : sizeof(struct MachO::dylib_module);
+  FinishFile(TableEnd(dyst.nmodtab, ModuleSize, dyst.modtaboff));
+
+  outs() << "   extrefsymoff " << dyst.extrefsymoff;
+  FinishFile(dyst.extrefsymoff);
+  outs() << "    nextrefsyms " << dyst.nextrefsyms;
+  FinishFile(TableEnd(dyst.nextrefsyms, sizeof(struct MachO::dylib_reference),
+                      dyst.extrefsymoff));
+
+  outs() << " indirectsymoff " << dyst.indirectsymoff;
+  FinishFile(dyst.indirectsymoff);
+  outs() << "  nindirectsyms " << dyst.nindirectsyms;
+  FinishFile(TableEnd(dyst.nindirectsyms, sizeof(uint32_t),
+                      dyst.indirectsymoff));
+
+  outs() << "      extreloff " << dyst.extreloff;
+  FinishFile(dyst.extreloff);
+  outs() << "        nextrel " << dyst.nextrel;
+  FinishFile(TableEnd(dyst.nextrel, sizeof(struct MachO::relocation_info),
+                      dyst.extreloff));
+
+  outs() << "      locreloff " << dyst.locreloff;
+  FinishFile(dyst.locreloff);
+  outs() << "        nlocrel " << dyst.nlocrel;
+  FinishFile(TableEnd(dyst.nlocrel, sizeof(struct MachO::relocation_info),
+                      dyst.locreloff));
+}
+
+// Print an LC_DYLD_INFO / LC_DYLD_INFO_ONLY load command. Every offset, and
+// the end of the region it starts (offset plus size), is compared with
+// `object_size` and flagged when it lies beyond the end of the file.
+static void PrintDyldInfoLoadCommand(MachO::dyld_info_command dc,
+                                     uint32_t object_size) {
+  // Finishes the current output line, noting when `End` exceeds the file.
+  auto FinishLine = [object_size](uint64_t End) {
+    outs() << (End > object_size ? " (past end of file)\n" : "\n");
+  };
+  // Prints the offset line followed by the size line of one region. The
+  // labels carry their own padding.
+  auto PrintRegion = [&FinishLine](const char *OffLabel, uint32_t Off,
+                                   const char *SizeLabel, uint32_t Size) {
+    outs() << OffLabel << Off;
+    FinishLine(Off);
+    outs() << SizeLabel << Size;
+    uint64_t End = Off;
+    End += Size;
+    FinishLine(End);
+  };
+
+  outs() << (dc.cmd == MachO::LC_DYLD_INFO
+                 ? "            cmd LC_DYLD_INFO\n"
+                 : "            cmd LC_DYLD_INFO_ONLY\n");
+  outs() << "        cmdsize " << dc.cmdsize
+         << (dc.cmdsize != sizeof(struct MachO::dyld_info_command)
+                 ? " Incorrect size\n"
+                 : "\n");
+
+  PrintRegion("     rebase_off ", dc.rebase_off, "    rebase_size ",
+              dc.rebase_size);
+  PrintRegion("       bind_off ", dc.bind_off, "      bind_size ",
+              dc.bind_size);
+  PrintRegion("  weak_bind_off ", dc.weak_bind_off, " weak_bind_size ",
+              dc.weak_bind_size);
+  PrintRegion("  lazy_bind_off ", dc.lazy_bind_off, " lazy_bind_size ",
+              dc.lazy_bind_size);
+  PrintRegion("     export_off ", dc.export_off, "    export_size ",
+              dc.export_size);
+}
+
+// Print an LC_ID_DYLINKER, LC_LOAD_DYLINKER or LC_DYLD_ENVIRONMENT load
+// command. `Ptr` is the base address that the command's `name` offset is
+// added to in order to locate the name string.
+static void PrintDyldLoadCommand(MachO::dylinker_command dyld,
+                                 const char *Ptr) {
+  switch (dyld.cmd) {
+  case MachO::LC_ID_DYLINKER:
+    outs() << "          cmd LC_ID_DYLINKER\n";
+    break;
+  case MachO::LC_LOAD_DYLINKER:
+    outs() << "          cmd LC_LOAD_DYLINKER\n";
+    break;
+  case MachO::LC_DYLD_ENVIRONMENT:
+    outs() << "          cmd LC_DYLD_ENVIRONMENT\n";
+    break;
+  default:
+    outs() << "          cmd ?(" << dyld.cmd << ")\n";
+    break;
+  }
+
+  // The command is followed by the name string, so only a size smaller than
+  // the fixed part is reported as incorrect.
+  outs() << "      cmdsize " << dyld.cmdsize
+         << (dyld.cmdsize < sizeof(struct MachO::dylinker_command)
+                 ? " Incorrect size\n"
+                 : "\n");
+
+  // A name offset at or beyond the end of the command cannot be followed.
+  if (dyld.name >= dyld.cmdsize) {
+    outs() << "         name ?(bad offset " << dyld.name << ")\n";
+    return;
+  }
+  const char *Name = Ptr + dyld.name;
+  outs() << "         name " << Name << " (offset " << dyld.name << ")\n";
+}
+
+// Print an LC_UUID load command. The 16 UUID bytes are written as upper-case
+// hex in 4-2-2-2-6 byte groups separated by dashes.
+static void PrintUuidLoadCommand(MachO::uuid_command uuid) {
+  outs() << "     cmd LC_UUID\n";
+  outs() << " cmdsize " << uuid.cmdsize
+         << (uuid.cmdsize != sizeof(struct MachO::uuid_command)
+                 ? " Incorrect size\n"
+                 : "\n");
+
+  outs() << "    uuid ";
+  for (unsigned Byte = 0; Byte != 16; ++Byte) {
+    outs() << format("%02" PRIX32, uuid.uuid[Byte]);
+    // A dash follows the 4th, 6th, 8th and 10th byte.
+    if (Byte == 3 || Byte == 5 || Byte == 7 || Byte == 9)
+      outs() << "-";
+  }
+  outs() << "\n";
+}
+
+// Print an LC_VERSION_MIN_MACOSX / LC_VERSION_MIN_IPHONEOS load command.
+//
+// `version` and `sdk` are decoded from a 32-bit word as major (upper 16
+// bits), minor (next 8 bits) and update (low 8 bits); the update component is
+// only printed when non-zero. An `sdk` of 0 is printed as "n/a".
+static void PrintVersionMinLoadCommand(MachO::version_min_command vd) {
+  if (vd.cmd == MachO::LC_VERSION_MIN_MACOSX)
+    outs() << "      cmd LC_VERSION_MIN_MACOSX\n";
+  else if (vd.cmd == MachO::LC_VERSION_MIN_IPHONEOS)
+    outs() << "      cmd LC_VERSION_MIN_IPHONEOS\n";
+  else
+    outs() << "      cmd " << vd.cmd << " (?)\n";
+  outs() << "  cmdsize " << vd.cmdsize;
+  if (vd.cmdsize != sizeof(struct MachO::version_min_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  outs() << "  version " << ((vd.version >> 16) & 0xffff) << "."
+         << ((vd.version >> 8) & 0xff);
+  if ((vd.version & 0xff) != 0)
+    outs() << "." << (vd.version & 0xff);
+  outs() << "\n";
+  if (vd.sdk == 0) {
+    // No newline here: the single line terminator below ends this line too,
+    // so "n/a" is not followed by a stray blank line.
+    outs() << "      sdk n/a";
+  } else {
+    outs() << "      sdk " << ((vd.sdk >> 16) & 0xffff) << "."
+           << ((vd.sdk >> 8) & 0xff);
+    if ((vd.sdk & 0xff) != 0)
+      outs() << "." << (vd.sdk & 0xff);
+  }
+  outs() << "\n";
+}
+
+// Print an LC_SOURCE_VERSION load command.  The 64-bit version word is split
+// into five fields A.B.C.D.E (24, 10, 10, 10 and 10 bits, most significant
+// first).  A and B are always printed; C, D and E are printed only up to the
+// last non-zero field.
+static void PrintSourceVersionCommand(MachO::source_version_command sd) {
+  outs() << "      cmd LC_SOURCE_VERSION\n";
+  outs() << "  cmdsize " << sd.cmdsize
+         << (sd.cmdsize != sizeof(struct MachO::source_version_command)
+                 ? " Incorrect size\n"
+                 : "\n");
+
+  const uint64_t FieldA = (sd.version >> 40) & 0xffffff;
+  const uint64_t FieldB = (sd.version >> 30) & 0x3ff;
+  const uint64_t FieldC = (sd.version >> 20) & 0x3ff;
+  const uint64_t FieldD = (sd.version >> 10) & 0x3ff;
+  const uint64_t FieldE = sd.version & 0x3ff;
+
+  outs() << "  version " << FieldA << "." << FieldB;
+  // Each trailing field is shown whenever it, or any field after it, is set.
+  if (FieldC != 0 || FieldD != 0 || FieldE != 0) {
+    outs() << "." << FieldC;
+    if (FieldD != 0 || FieldE != 0) {
+      outs() << "." << FieldD;
+      if (FieldE != 0)
+        outs() << "." << FieldE;
+    }
+  }
+  outs() << "\n";
+}
+
+// Print an LC_MAIN load command: command size (flagged when it differs from
+// sizeof(MachO::entry_point_command)), entry offset and stack size.
+static void PrintEntryPointCommand(MachO::entry_point_command ep) {
+  const bool SizeOK = ep.cmdsize == sizeof(struct MachO::entry_point_command);
+  outs() << "       cmd LC_MAIN\n"
+         << "   cmdsize " << ep.cmdsize << (SizeOK ? "\n" : " Incorrect size\n")
+         << "  entryoff " << ep.entryoff << "\n"
+         << " stacksize " << ep.stacksize << "\n";
+}
+
+// Print one of the dylib-style load commands (LC_ID_DYLIB, LC_LOAD_DYLIB,
+// LC_LOAD_WEAK_DYLIB, LC_REEXPORT_DYLIB, LC_LAZY_LOAD_DYLIB or
+// LC_LOAD_UPWARD_DYLIB).  Ptr points at the start of the load command; the
+// library name is read from Ptr + dl.dylib.name when that offset is smaller
+// than the command size.
+static void PrintDylibCommand(MachO::dylib_command dl, const char *Ptr) {
+  switch (dl.cmd) {
+  case MachO::LC_ID_DYLIB:
+    outs() << "          cmd LC_ID_DYLIB\n";
+    break;
+  case MachO::LC_LOAD_DYLIB:
+    outs() << "          cmd LC_LOAD_DYLIB\n";
+    break;
+  case MachO::LC_LOAD_WEAK_DYLIB:
+    outs() << "          cmd LC_LOAD_WEAK_DYLIB\n";
+    break;
+  case MachO::LC_REEXPORT_DYLIB:
+    outs() << "          cmd LC_REEXPORT_DYLIB\n";
+    break;
+  case MachO::LC_LAZY_LOAD_DYLIB:
+    outs() << "          cmd LC_LAZY_LOAD_DYLIB\n";
+    break;
+  case MachO::LC_LOAD_UPWARD_DYLIB:
+    outs() << "          cmd LC_LOAD_UPWARD_DYLIB\n";
+    break;
+  default:
+    outs() << "          cmd " << dl.cmd << " (unknown)\n";
+    break;
+  }
+
+  outs() << "      cmdsize " << dl.cmdsize
+         << (dl.cmdsize < sizeof(struct MachO::dylib_command)
+                 ? " Incorrect size\n"
+                 : "\n");
+
+  // The name is stored inside the command, at an offset from its start.
+  const uint32_t NameOffset = dl.dylib.name;
+  if (NameOffset >= dl.cmdsize) {
+    outs() << "         name ?(bad offset " << NameOffset << ")\n";
+  } else {
+    const char *LibName = (const char *)(Ptr) + NameOffset;
+    outs() << "         name " << LibName << " (offset " << NameOffset << ")\n";
+  }
+
+  // ctime() supplies the newline that ends the time stamp line.
+  time_t Stamp = dl.dylib.timestamp;
+  outs() << "   time stamp " << dl.dylib.timestamp << " ";
+  outs() << ctime(&Stamp);
+
+  // Versions are printed as X.Y.Z (16.8.8 bits); 0xffffffff means "n/a".
+  const uint32_t Current = dl.dylib.current_version;
+  outs() << "      current version ";
+  if (Current != 0xffffffff)
+    outs() << ((Current >> 16) & 0xffff) << "." << ((Current >> 8) & 0xff)
+           << "." << (Current & 0xff) << "\n";
+  else
+    outs() << "n/a\n";
+
+  const uint32_t Compat = dl.dylib.compatibility_version;
+  outs() << "compatibility version ";
+  if (Compat != 0xffffffff)
+    outs() << ((Compat >> 16) & 0xffff) << "." << ((Compat >> 8) & 0xff)
+           << "." << (Compat & 0xff) << "\n";
+  else
+    outs() << "n/a\n";
+}
+
+// Print a linkedit_data_command (LC_CODE_SIGNATURE, LC_SEGMENT_SPLIT_INFO,
+// LC_FUNCTION_STARTS, LC_DATA_IN_CODE, LC_DYLIB_CODE_SIGN_DRS or
+// LC_LINKER_OPTIMIZATION_HINT).
+//
+// object_size is the size of the whole object file; dataoff and
+// dataoff + datasize are flagged when they lie beyond it.
+static void PrintLinkEditDataCommand(MachO::linkedit_data_command ld,
+                                     uint32_t object_size) {
+  if (ld.cmd == MachO::LC_CODE_SIGNATURE)
+    outs() << "      cmd LC_CODE_SIGNATURE\n";
+  else if (ld.cmd == MachO::LC_SEGMENT_SPLIT_INFO)
+    outs() << "      cmd LC_SEGMENT_SPLIT_INFO\n";
+  else if (ld.cmd == MachO::LC_FUNCTION_STARTS)
+    outs() << "      cmd LC_FUNCTION_STARTS\n";
+  else if (ld.cmd == MachO::LC_DATA_IN_CODE)
+    outs() << "      cmd LC_DATA_IN_CODE\n";
+  else if (ld.cmd == MachO::LC_DYLIB_CODE_SIGN_DRS)
+    outs() << "      cmd LC_DYLIB_CODE_SIGN_DRS\n";
+  else if (ld.cmd == MachO::LC_LINKER_OPTIMIZATION_HINT)
+    outs() << "      cmd LC_LINKER_OPTIMIZATION_HINT\n";
+  else
+    outs() << "      cmd " << ld.cmd << " (?)\n";
+  outs() << "  cmdsize " << ld.cmdsize;
+  if (ld.cmdsize != sizeof(struct MachO::linkedit_data_command))
+    outs() << " Incorrect size\n";
+  else
+    outs() << "\n";
+  outs() << "  dataoff " << ld.dataoff;
+  if (ld.dataoff > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+  outs() << " datasize " << ld.datasize;
+  // Sum in 64 bits so that dataoff + datasize cannot wrap around.
+  uint64_t big_size = ld.dataoff;
+  big_size += ld.datasize;
+  if (big_size > object_size)
+    outs() << " (past end of file)\n";
+  else
+    outs() << "\n";
+}
+
+// Walk the ncmds load commands of Obj and print each one, dispatching on the
+// command type to the matching Print* helper.  Commands without a dedicated
+// printer are shown as a raw command number and size.
+//
+// filetype is forwarded to PrintSection; cputype is currently unused.
+static void PrintLoadCommands(const MachOObjectFile *Obj, uint32_t ncmds,
+                              uint32_t filetype, uint32_t cputype,
+                              bool verbose) {
+  // Nothing to print.  Without this guard "ncmds - 1" below would wrap and
+  // the loop would read load commands that do not exist.
+  if (ncmds == 0)
+    return;
+  StringRef Buf = Obj->getData();
+  MachOObjectFile::LoadCommandInfo Command = Obj->getFirstLoadCommandInfo();
+  for (unsigned i = 0;; ++i) {
+    outs() << "Load command " << i << "\n";
+    if (Command.C.cmd == MachO::LC_SEGMENT) {
+      MachO::segment_command SLC = Obj->getSegmentLoadCommand(Command);
+      const char *sg_segname = SLC.segname;
+      PrintSegmentCommand(SLC.cmd, SLC.cmdsize, SLC.segname, SLC.vmaddr,
+                          SLC.vmsize, SLC.fileoff, SLC.filesize, SLC.maxprot,
+                          SLC.initprot, SLC.nsects, SLC.flags, Buf.size(),
+                          verbose);
+      for (unsigned j = 0; j < SLC.nsects; j++) {
+        // A 32-bit segment is followed by 32-bit section records, so they
+        // must be read as MachO::section rather than MachO::section_64.
+        MachO::section S = Obj->getSection(Command, j);
+        PrintSection(S.sectname, S.segname, S.addr, S.size, S.offset, S.align,
+                     S.reloff, S.nreloc, S.flags, S.reserved1, S.reserved2,
+                     SLC.cmd, sg_segname, filetype, Buf.size(), verbose);
+      }
+    } else if (Command.C.cmd == MachO::LC_SEGMENT_64) {
+      MachO::segment_command_64 SLC_64 = Obj->getSegment64LoadCommand(Command);
+      const char *sg_segname = SLC_64.segname;
+      PrintSegmentCommand(SLC_64.cmd, SLC_64.cmdsize, SLC_64.segname,
+                          SLC_64.vmaddr, SLC_64.vmsize, SLC_64.fileoff,
+                          SLC_64.filesize, SLC_64.maxprot, SLC_64.initprot,
+                          SLC_64.nsects, SLC_64.flags, Buf.size(), verbose);
+      for (unsigned j = 0; j < SLC_64.nsects; j++) {
+        MachO::section_64 S_64 = Obj->getSection64(Command, j);
+        PrintSection(S_64.sectname, S_64.segname, S_64.addr, S_64.size,
+                     S_64.offset, S_64.align, S_64.reloff, S_64.nreloc,
+                     S_64.flags, S_64.reserved1, S_64.reserved2, SLC_64.cmd,
+                     sg_segname, filetype, Buf.size(), verbose);
+      }
+    } else if (Command.C.cmd == MachO::LC_SYMTAB) {
+      MachO::symtab_command Symtab = Obj->getSymtabLoadCommand();
+      PrintSymtabLoadCommand(Symtab, Obj->is64Bit(), Buf.size());
+    } else if (Command.C.cmd == MachO::LC_DYSYMTAB) {
+      MachO::dysymtab_command Dysymtab = Obj->getDysymtabLoadCommand();
+      MachO::symtab_command Symtab = Obj->getSymtabLoadCommand();
+      PrintDysymtabLoadCommand(Dysymtab, Symtab.nsyms, Buf.size(),
+                               Obj->is64Bit());
+    } else if (Command.C.cmd == MachO::LC_DYLD_INFO ||
+               Command.C.cmd == MachO::LC_DYLD_INFO_ONLY) {
+      MachO::dyld_info_command DyldInfo = Obj->getDyldInfoLoadCommand(Command);
+      PrintDyldInfoLoadCommand(DyldInfo, Buf.size());
+    } else if (Command.C.cmd == MachO::LC_LOAD_DYLINKER ||
+               Command.C.cmd == MachO::LC_ID_DYLINKER ||
+               Command.C.cmd == MachO::LC_DYLD_ENVIRONMENT) {
+      MachO::dylinker_command Dyld = Obj->getDylinkerCommand(Command);
+      PrintDyldLoadCommand(Dyld, Command.Ptr);
+    } else if (Command.C.cmd == MachO::LC_UUID) {
+      MachO::uuid_command Uuid = Obj->getUuidCommand(Command);
+      PrintUuidLoadCommand(Uuid);
+    } else if (Command.C.cmd == MachO::LC_VERSION_MIN_MACOSX) {
+      MachO::version_min_command Vd = Obj->getVersionMinLoadCommand(Command);
+      PrintVersionMinLoadCommand(Vd);
+    } else if (Command.C.cmd == MachO::LC_SOURCE_VERSION) {
+      MachO::source_version_command Sd = Obj->getSourceVersionCommand(Command);
+      PrintSourceVersionCommand(Sd);
+    } else if (Command.C.cmd == MachO::LC_MAIN) {
+      MachO::entry_point_command Ep = Obj->getEntryPointCommand(Command);
+      PrintEntryPointCommand(Ep);
+    } else if (Command.C.cmd == MachO::LC_LOAD_DYLIB ||
+               Command.C.cmd == MachO::LC_ID_DYLIB ||
+               Command.C.cmd == MachO::LC_LOAD_WEAK_DYLIB ||
+               Command.C.cmd == MachO::LC_REEXPORT_DYLIB ||
+               Command.C.cmd == MachO::LC_LAZY_LOAD_DYLIB ||
+               Command.C.cmd == MachO::LC_LOAD_UPWARD_DYLIB) {
+      MachO::dylib_command Dl = Obj->getDylibIDLoadCommand(Command);
+      PrintDylibCommand(Dl, Command.Ptr);
+    } else if (Command.C.cmd == MachO::LC_CODE_SIGNATURE ||
+               Command.C.cmd == MachO::LC_SEGMENT_SPLIT_INFO ||
+               Command.C.cmd == MachO::LC_FUNCTION_STARTS ||
+               Command.C.cmd == MachO::LC_DATA_IN_CODE ||
+               Command.C.cmd == MachO::LC_DYLIB_CODE_SIGN_DRS ||
+               Command.C.cmd == MachO::LC_LINKER_OPTIMIZATION_HINT) {
+      MachO::linkedit_data_command Ld =
+          Obj->getLinkeditDataLoadCommand(Command);
+      PrintLinkEditDataCommand(Ld, Buf.size());
+    } else {
+      outs() << "      cmd ?(" << format("0x%08" PRIx32, Command.C.cmd)
+             << ")\n";
+      outs() << "  cmdsize " << Command.C.cmdsize << "\n";
+      // TODO: get and print the raw bytes of the load command.
+    }
+    // TODO: print all the other kinds of load commands.
+    // Stop after the last command; only advance while more remain.
+    if (i == ncmds - 1)
+      break;
+    else
+      Command = Obj->getNextLoadCommandInfo(Command);
+  }
+}
+
+// Print the Mach-O header of Obj (32- or 64-bit, as reported by
+// Obj->is64Bit()) and hand the header's ncmds, filetype and cputype fields
+// back to the caller through the reference parameters.
+static void getAndPrintMachHeader(const MachOObjectFile *Obj, uint32_t &ncmds,
+                                  uint32_t &filetype, uint32_t &cputype,
+                                  bool verbose) {
+  if (!Obj->is64Bit()) {
+    const MachO::mach_header Hdr = Obj->getHeader();
+    PrintMachHeader(Hdr.magic, Hdr.cputype, Hdr.cpusubtype, Hdr.filetype,
+                    Hdr.ncmds, Hdr.sizeofcmds, Hdr.flags, verbose);
+    ncmds = Hdr.ncmds;
+    filetype = Hdr.filetype;
+    cputype = Hdr.cputype;
+    return;
+  }
+
+  const MachO::mach_header_64 Hdr64 = Obj->getHeader64();
+  PrintMachHeader(Hdr64.magic, Hdr64.cputype, Hdr64.cpusubtype, Hdr64.filetype,
+                  Hdr64.ncmds, Hdr64.sizeofcmds, Hdr64.flags, verbose);
+  ncmds = Hdr64.ncmds;
+  filetype = Hdr64.filetype;
+  cputype = Hdr64.cputype;
+}
+
+// Print the Mach-O header and all load commands of Obj in verbose form.
+// Does nothing when Obj is not a Mach-O object file.
+void llvm::printMachOFileHeader(const object::ObjectFile *Obj) {
+  const MachOObjectFile *file = dyn_cast<const MachOObjectFile>(Obj);
+  // dyn_cast yields null for any other object format; bail out rather than
+  // dereference a null pointer in the helpers below.
+  if (!file)
+    return;
+  uint32_t ncmds = 0;
+  uint32_t filetype = 0;
+  uint32_t cputype = 0;
+  getAndPrintMachHeader(file, ncmds, filetype, cputype, true);
+  PrintLoadCommands(file, ncmds, filetype, cputype, true);
+}
+
+//===----------------------------------------------------------------------===//
+// export trie dumping
+//===----------------------------------------------------------------------===//
+
+// Print one line per entry of Obj's export trie: the address (or
+// "[re-export]"), the symbol name, a bracketed list of attributes
+// (weak_def, per-thread, absolute, resolver=...) and, for re-exports, the
+// library the symbol comes from.
+void llvm::printMachOExportsTrie(const object::MachOObjectFile *Obj) {
+  for (const llvm::object::ExportEntry &Entry : Obj->exports()) {
+    uint64_t Flags = Entry.flags();
+    bool ReExport = (Flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT);
+    bool WeakDef = (Flags & MachO::EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION);
+    bool ThreadLocal = ((Flags & MachO::EXPORT_SYMBOL_FLAGS_KIND_MASK) ==
+                        MachO::EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL);
+    bool Abs = ((Flags & MachO::EXPORT_SYMBOL_FLAGS_KIND_MASK) ==
+                MachO::EXPORT_SYMBOL_FLAGS_KIND_ABSOLUTE);
+    bool Resolver = (Flags & MachO::EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER);
+    if (ReExport)
+      outs() << "[re-export] ";
+    else
+      // PRIX64 matches the 64-bit argument on every platform, unlike "%llX".
+      outs() << format("0x%08" PRIX64 "  ",
+                       Entry.address()); // FIXME:add in base address
+    outs() << Entry.name();
+    if (WeakDef || ThreadLocal || Resolver || Abs) {
+      bool NeedsComma = false;
+      outs() << " [";
+      if (WeakDef) {
+        outs() << "weak_def";
+        NeedsComma = true;
+      }
+      if (ThreadLocal) {
+        if (NeedsComma)
+          outs() << ", ";
+        outs() << "per-thread";
+        NeedsComma = true;
+      }
+      if (Abs) {
+        if (NeedsComma)
+          outs() << ", ";
+        outs() << "absolute";
+        NeedsComma = true;
+      }
+      if (Resolver) {
+        if (NeedsComma)
+          outs() << ", ";
+        outs() << format("resolver=0x%08" PRIX64, Entry.other());
+      }
+      outs() << "]";
+    }
+    if (ReExport) {
+      // For a re-export, other() is used as a 1-based library ordinal.  The
+      // name stays "unknown" unless the lookup overwrites it.
+      StringRef DylibName = "unknown";
+      int Ordinal = Entry.other() - 1;
+      Obj->getLibraryShortNameByIndex(Ordinal, DylibName);
+      if (Entry.otherName().empty())
+        outs() << " (from " << DylibName << ")";
+      else
+        outs() << " (" << Entry.otherName() << " from " << DylibName << ")";
+    }
+    outs() << "\n";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// rebase table dumping
+//===----------------------------------------------------------------------===//
+
+namespace {
+// Table of the sections of a Mach-O object file, used to translate the
+// (segment index, offset in segment) pairs found in the rebase and bind
+// tables into segment names, section names and addresses.
+class SegInfo {
+public:
+  // Builds the table from the sections of Obj.
+  SegInfo(const object::MachOObjectFile *Obj);
+
+  // Name of the segment with index SegIndex.
+  StringRef segmentName(uint32_t SegIndex);
+  // Name of the section of segment SegIndex that contains SegOffset.
+  StringRef sectionName(uint32_t SegIndex, uint64_t SegOffset);
+  // Start address of segment SegIndex plus SegOffset.
+  uint64_t address(uint32_t SegIndex, uint64_t SegOffset);
+
+private:
+  // One record per section, filled in by the constructor.
+  struct SectionInfo {
+    uint64_t Address;             // Address of the section.
+    uint64_t Size;                // Size of the section.
+    StringRef SectionName;
+    StringRef SegmentName;
+    uint64_t OffsetInSegment;     // Address - SegmentStartAddress.
+    uint64_t SegmentStartAddress; // Address of the segment's first section.
+    uint32_t SegmentIndex;
+  };
+  // Returns the record of the section of segment SegIndex whose range
+  // contains SegOffset.
+  const SectionInfo &findSection(uint32_t SegIndex, uint64_t SegOffset);
+  SmallVector<SectionInfo, 32> Sections;
+};
+} // end anonymous namespace
+
+// Record every section of Obj together with the index, name and start
+// address of the segment it belongs to.  A new segment is assumed to begin
+// whenever the segment name changes from one section to the next; its start
+// address is the address of its first section.  Construction stops early if
+// a section name cannot be read.
+SegInfo::SegInfo(const object::MachOObjectFile *Obj) {
+  // Build table of sections so segIndex/offset pairs can be translated.
+  // Segment numbering starts after the page-zero segment when there is one.
+  uint32_t CurSegIndex = Obj->hasPageZeroSegment() ? 1 : 0;
+  StringRef CurSegName;
+  uint64_t CurSegAddress = 0;
+  for (const SectionRef &Section : Obj->sections()) {
+    SectionInfo Info;
+    if (error(Section.getName(Info.SectionName)))
+      return;
+    Info.Address = Section.getAddress();
+    Info.Size = Section.getSize();
+    Info.SegmentName =
+        Obj->getSectionFinalSegmentName(Section.getRawDataRefImpl());
+    // The first section always opens a segment, even when its segment name
+    // is empty and so compares equal to the initial CurSegName.  Otherwise
+    // CurSegAddress would be used unset and CurSegIndex - 1 could wrap.
+    if (Sections.empty() || !Info.SegmentName.equals(CurSegName)) {
+      ++CurSegIndex;
+      CurSegName = Info.SegmentName;
+      CurSegAddress = Info.Address;
+    }
+    Info.SegmentIndex = CurSegIndex - 1;
+    Info.OffsetInSegment = Info.Address - CurSegAddress;
+    Info.SegmentStartAddress = CurSegAddress;
+    Sections.push_back(Info);
+  }
+}
+
+// Return the segment name recorded for the first section whose segment index
+// is SegIndex.  An index that matches no section is treated as unreachable.
+StringRef SegInfo::segmentName(uint32_t SegIndex) {
+  for (unsigned Idx = 0, End = Sections.size(); Idx != End; ++Idx)
+    if (Sections[Idx].SegmentIndex == SegIndex)
+      return Sections[Idx].SegmentName;
+  llvm_unreachable("invalid segIndex");
+}
+
+// Return the first recorded section that belongs to segment SegIndex and
+// whose range [OffsetInSegment, OffsetInSegment + Size) contains OffsetInSeg.
+// A pair that falls in no section is treated as unreachable.
+const SegInfo::SectionInfo &SegInfo::findSection(uint32_t SegIndex,
+                                                 uint64_t OffsetInSeg) {
+  for (const SectionInfo &Candidate : Sections) {
+    const bool SameSegment = Candidate.SegmentIndex == SegIndex;
+    const bool StartsAtOrBefore = Candidate.OffsetInSegment <= OffsetInSeg;
+    const bool EndsAfter =
+        OffsetInSeg < (Candidate.OffsetInSegment + Candidate.Size);
+    if (SameSegment && StartsAtOrBefore && EndsAfter)
+      return Candidate;
+  }
+  llvm_unreachable("segIndex and offset not in any section");
+}
+
+// Name of the section that contains OffsetInSeg within segment SegIndex.
+StringRef SegInfo::sectionName(uint32_t SegIndex, uint64_t OffsetInSeg) {
+  const SectionInfo &Found = findSection(SegIndex, OffsetInSeg);
+  return Found.SectionName;
+}
+
+// Address of OffsetInSeg within segment SegIndex: the start address of the
+// segment (taken from the containing section's record) plus the offset.
+uint64_t SegInfo::address(uint32_t SegIndex, uint64_t OffsetInSeg) {
+  return findSection(SegIndex, OffsetInSeg).SegmentStartAddress + OffsetInSeg;
+}
+
+// Print the rebase table of Obj, one line per entry, with the segment name,
+// section name, address and rebase type of each location.
+void llvm::printMachORebaseTable(const object::MachOObjectFile *Obj) {
+  // Section table used to turn segment index/offset pairs into names.
+  SegInfo Segments(Obj);
+
+  outs() << "segment  section            address     type\n";
+  for (const llvm::object::MachORebaseEntry &Rebase : Obj->rebaseTable()) {
+    const uint32_t SegIdx = Rebase.segmentIndex();
+    const uint64_t SegOff = Rebase.segmentOffset();
+    const StringRef SegName = Segments.segmentName(SegIdx);
+    const StringRef SectName = Segments.sectionName(SegIdx, SegOff);
+    const uint64_t Addr = Segments.address(SegIdx, SegOff);
+
+    // Table lines look like: __DATA  __nl_symbol_ptr  0x0000F00C  pointer
+    outs() << format("%-8s %-18s 0x%08" PRIX64 "  %s\n", SegName.str().c_str(),
+                     SectName.str().c_str(), Addr,
+                     Rebase.typeName().str().c_str());
+  }
+}
+
+// Map a bind-table library ordinal to a printable name.  The three special
+// ordinals get fixed names, a positive ordinal is looked up as the 1-based
+// index of a library of Obj, and anything else is reported as unknown.
+static StringRef ordinalName(const object::MachOObjectFile *Obj, int Ordinal) {
+  if (Ordinal == MachO::BIND_SPECIAL_DYLIB_SELF)
+    return "this-image";
+  if (Ordinal == MachO::BIND_SPECIAL_DYLIB_MAIN_EXECUTABLE)
+    return "main-executable";
+  if (Ordinal == MachO::BIND_SPECIAL_DYLIB_FLAT_LOOKUP)
+    return "flat-namespace";
+  if (Ordinal <= 0)
+    return "<<unknown special ordinal>>";
+
+  StringRef LibName;
+  if (Obj->getLibraryShortNameByIndex(Ordinal - 1, LibName))
+    return "<<bad library ordinal>>";
+  return LibName;
+}
+
+//===----------------------------------------------------------------------===//
+// bind table dumping
+//===----------------------------------------------------------------------===//
+
+// Print the bind table of Obj, one line per entry: segment, section,
+// address, bind type, addend, source library and symbol name, with
+// " (weak_import)" appended for weakly imported symbols.
+void llvm::printMachOBindTable(const object::MachOObjectFile *Obj) {
+  // Section table used to turn segment index/offset pairs into names.
+  SegInfo Segments(Obj);
+
+  outs() << "segment  section            address    type       "
+            "addend dylib            symbol\n";
+  for (const llvm::object::MachOBindEntry &Bind : Obj->bindTable()) {
+    const uint32_t SegIdx = Bind.segmentIndex();
+    const uint64_t SegOff = Bind.segmentOffset();
+    const StringRef SegName = Segments.segmentName(SegIdx);
+    const StringRef SectName = Segments.sectionName(SegIdx, SegOff);
+    const uint64_t Addr = Segments.address(SegIdx, SegOff);
+
+    const bool WeakImport =
+        (Bind.flags() & MachO::BIND_SYMBOL_FLAGS_WEAK_IMPORT) != 0;
+    const StringRef Suffix = WeakImport ? " (weak_import)" : "";
+    const StringRef Dylib = ordinalName(Obj, Bind.ordinal());
+
+    // Table lines look like:
+    //  __DATA  __got  0x00012010    pointer   0 libSystem ___stack_chk_guard
+    outs() << left_justify(SegName, 8) << " ";
+    outs() << left_justify(SectName, 18) << " ";
+    outs() << format_hex(Addr, 10, true) << " ";
+    outs() << left_justify(Bind.typeName(), 8) << " ";
+    outs() << format_decimal(Bind.addend(), 8) << " ";
+    outs() << left_justify(Dylib, 16) << " ";
+    outs() << Bind.symbolName() << Suffix << "\n";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// lazy bind table dumping
+//===----------------------------------------------------------------------===//
+
+// Print the lazy bind table of Obj, one line per entry: segment, section,
+// address, source library and symbol name.
+void llvm::printMachOLazyBindTable(const object::MachOObjectFile *Obj) {
+  // Section table used to turn segment index/offset pairs into names.
+  SegInfo Segments(Obj);
+
+  outs() << "segment  section            address     "
+            "dylib            symbol\n";
+  for (const llvm::object::MachOBindEntry &Bind : Obj->lazyBindTable()) {
+    const uint32_t SegIdx = Bind.segmentIndex();
+    const uint64_t SegOff = Bind.segmentOffset();
+    const StringRef SegName = Segments.segmentName(SegIdx);
+    const StringRef SectName = Segments.sectionName(SegIdx, SegOff);
+    const uint64_t Addr = Segments.address(SegIdx, SegOff);
+    const StringRef Dylib = ordinalName(Obj, Bind.ordinal());
+
+    // Table lines look like:
+    //  __DATA  __got  0x00012010 libSystem ___stack_chk_guard
+    outs() << left_justify(SegName, 8) << " ";
+    outs() << left_justify(SectName, 18) << " ";
+    outs() << format_hex(Addr, 10, true) << " ";
+    outs() << left_justify(Dylib, 16) << " ";
+    outs() << Bind.symbolName() << "\n";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// weak bind table dumping
+//===----------------------------------------------------------------------===//
+
+// Print the weak bind table of Obj.  Entries flagged as non-weak (strong)
+// definitions are listed by name only; every other entry gets its segment,
+// section, address, bind type, addend and symbol name.
+void llvm::printMachOWeakBindTable(const object::MachOObjectFile *Obj) {
+  // Section table used to turn segment index/offset pairs into names.
+  SegInfo Segments(Obj);
+
+  outs() << "segment  section            address     "
+            "type       addend   symbol\n";
+  for (const llvm::object::MachOBindEntry &Bind : Obj->weakBindTable()) {
+    const bool Strong =
+        (Bind.flags() & MachO::BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION) != 0;
+    if (!Strong) {
+      const uint32_t SegIdx = Bind.segmentIndex();
+      const uint64_t SegOff = Bind.segmentOffset();
+      const StringRef SegName = Segments.segmentName(SegIdx);
+      const StringRef SectName = Segments.sectionName(SegIdx, SegOff);
+      const uint64_t Addr = Segments.address(SegIdx, SegOff);
+
+      // Table lines look like:
+      // __DATA  __data  0x00001000  pointer    0   _foo
+      outs() << left_justify(SegName, 8) << " ";
+      outs() << left_justify(SectName, 18) << " ";
+      outs() << format_hex(Addr, 10, true) << " ";
+      outs() << left_justify(Bind.typeName(), 8) << " ";
+      outs() << format_decimal(Bind.addend(), 8) << "   ";
+      outs() << Bind.symbolName() << "\n";
+    } else {
+      // Strong symbols don't have a location to update.
+      outs() << "                                        strong              "
+             << Bind.symbolName() << "\n";
+    }
+  }
+}
+
+// get_dyld_bind_info_symbolname() is used during disassembly.  Given an
+// address in the Mach-O file, ReferenceValue, it searches the dyld bind
+// information for an entry at that address and returns the name of the
+// symbol bound there, or nullptr when there is none.  The lookup table is
+// built from the bind table on the first call and kept in info->bindtable.
+static const char *get_dyld_bind_info_symbolname(uint64_t ReferenceValue,
+                                                 struct DisassembleInfo *info) {
+  if (!info->bindtable) {
+    info->bindtable = new (BindTable);
+    SegInfo Segments(info->O);
+    for (const llvm::object::MachOBindEntry &Bind : info->O->bindTable()) {
+      const uint32_t SegIdx = Bind.segmentIndex();
+      const uint64_t SegOff = Bind.segmentOffset();
+      uint64_t BoundAddr = Segments.address(SegIdx, SegOff);
+      // An empty symbol name is stored as a null pointer.
+      StringRef Name = Bind.symbolName();
+      const char *CName = Name.empty() ? nullptr : Name.data();
+      info->bindtable->push_back(std::make_pair(BoundAddr, CName));
+    }
+  }
+
+  // The first entry with a matching address wins.
+  for (const auto &Bound : *info->bindtable)
+    if (Bound.first == ReferenceValue)
+      return Bound.second;
+  return nullptr;
+}

diff --git a/tools/llvm-objdump/Makefile b/tools/llvm-objdump/Makefile
index c3601eb..4616b78 100644
--- a/tools/llvm-objdump/Makefile
+++ b/tools/llvm-objdump/Makefile

@@ -9,7 +9,7 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-objdump
-LINK_COMPONENTS := all-targets DebugInfo MC MCAnalysis MCParser MCDisassembler Object
+LINK_COMPONENTS := all-targets DebugInfo MC MCParser MCDisassembler Object
 
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS := 1

diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 309bf23..8903bff 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp

@@ -20,10 +20,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCAnalysis/MCAtom.h"
-#include "llvm/MC/MCAnalysis/MCFunction.h"
-#include "llvm/MC/MCAnalysis/MCModule.h"
-#include "llvm/MC/MCAnalysis/MCModuleYAML.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
@@ -31,9 +27,7 @@
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCObjectDisassembler.h"
 #include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectSymbolizer.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCRelocationInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -85,6 +79,21 @@
 SymbolTable("t", cl::desc("Display the symbol table"));
 
 static cl::opt<bool>
+ExportsTrie("exports-trie", cl::desc("Display mach-o exported symbols"));
+
+static cl::opt<bool>
+Rebase("rebase", cl::desc("Display mach-o rebasing info"));
+
+static cl::opt<bool>
+Bind("bind", cl::desc("Display mach-o binding info"));
+
+static cl::opt<bool>
+LazyBind("lazy-bind", cl::desc("Display mach-o lazy binding info"));
+
+static cl::opt<bool>
+WeakBind("weak-bind", cl::desc("Display mach-o weak binding info"));
+
+static cl::opt<bool>
 MachOOpt("macho", cl::desc("Use MachO specific object file parser"));
 static cl::alias
 MachOm("m", cl::desc("Alias for --macho"), cl::aliasopt(MachOOpt));
@@ -94,6 +103,12 @@
                                     "see -version for available targets"));
 
 cl::opt<std::string>
+llvm::MCPU("mcpu",
+     cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+     cl::value_desc("cpu-name"),
+     cl::init(""));
+
+cl::opt<std::string>
 llvm::ArchName("arch", cl::desc("Target arch to disassemble for, "
                                 "see -version for available targets"));
 
@@ -107,15 +122,16 @@
 SectionHeadersShorter("h", cl::desc("Alias for --section-headers"),
                       cl::aliasopt(SectionHeaders));
 
-static cl::list<std::string>
-MAttrs("mattr",
+cl::list<std::string>
+llvm::MAttrs("mattr",
   cl::CommaSeparated,
   cl::desc("Target specific attributes"),
   cl::value_desc("a1,+a2,-a3,..."));
 
-static cl::opt<bool>
-NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling instructions, "
-                                           "do not print the instruction bytes."));
+cl::opt<bool>
+llvm::NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling "
+                                                 "instructions, do not print "
+                                                 "the instruction bytes."));
 
 static cl::opt<bool>
 UnwindInfo("unwind-info", cl::desc("Display unwind information"));
@@ -132,20 +148,6 @@
 PrivateHeadersShort("p", cl::desc("Alias for --private-headers"),
                     cl::aliasopt(PrivateHeaders));
 
-static cl::opt<bool>
-Symbolize("symbolize", cl::desc("When disassembling instructions, "
-                                "try to symbolize operands."));
-
-static cl::opt<bool>
-CFG("cfg", cl::desc("Create a CFG for every function found in the object"
-                      " and write it to a graphviz file"));
-
-// FIXME: Does it make sense to have a dedicated tool for yaml cfg output?
-static cl::opt<std::string>
-YAMLCFG("yaml-cfg",
-        cl::desc("Create a CFG and write it as a YAML MCModule."),
-        cl::value_desc("yaml output file"));
-
 static StringRef ToolName;
 
 bool llvm::error(std::error_code EC) {
@@ -191,53 +193,6 @@
   return TheTarget;
 }
 
-// Write a graphviz file for the CFG inside an MCFunction.
-// FIXME: Use GraphWriter
-static void emitDOTFile(const char *FileName, const MCFunction &f,
-                        MCInstPrinter *IP) {
-  // Start a new dot file.
-  std::string Error;
-  raw_fd_ostream Out(FileName, Error, sys::fs::F_Text);
-  if (!Error.empty()) {
-    errs() << "llvm-objdump: warning: " << Error << '\n';
-    return;
-  }
-
-  Out << "digraph \"" << f.getName() << "\" {\n";
-  Out << "graph [ rankdir = \"LR\" ];\n";
-  for (MCFunction::const_iterator i = f.begin(), e = f.end(); i != e; ++i) {
-    // Only print blocks that have predecessors.
-    bool hasPreds = (*i)->pred_begin() != (*i)->pred_end();
-
-    if (!hasPreds && i != f.begin())
-      continue;
-
-    Out << '"' << (*i)->getInsts()->getBeginAddr() << "\" [ label=\"<a>";
-    // Print instructions.
-    for (unsigned ii = 0, ie = (*i)->getInsts()->size(); ii != ie;
-        ++ii) {
-      if (ii != 0) // Not the first line, start a new row.
-        Out << '|';
-      if (ii + 1 == ie) // Last line, add an end id.
-        Out << "<o>";
-
-      // Escape special chars and print the instruction in mnemonic form.
-      std::string Str;
-      raw_string_ostream OS(Str);
-      IP->printInst(&(*i)->getInsts()->at(ii).Inst, OS, "");
-      Out << DOT::EscapeString(OS.str());
-    }
-    Out << "\" shape=\"record\" ];\n";
-
-    // Add edges.
-    for (MCBasicBlock::succ_const_iterator si = (*i)->succ_begin(),
-        se = (*i)->succ_end(); si != se; ++si)
-      Out << (*i)->getInsts()->getBeginAddr() << ":o -> "
-          << (*si)->getInsts()->getBeginAddr() << ":a\n";
-  }
-  Out << "}\n";
-}
-
 void llvm::DumpBytes(StringRef bytes) {
   static const char hex_rep[] = "0123456789abcdef";
   // FIXME: The real way to do this is to figure out the longest instruction
@@ -303,7 +258,7 @@
   }
 
   std::unique_ptr<const MCSubtargetInfo> STI(
-      TheTarget->createMCSubtargetInfo(TripleName, "", FeaturesStr));
+      TheTarget->createMCSubtargetInfo(TripleName, MCPU, FeaturesStr));
   if (!STI) {
     errs() << "error: no subtarget info for target " << TripleName << "\n";
     return;
@@ -326,19 +281,6 @@
     return;
   }
 
-
-  if (Symbolize) {
-    std::unique_ptr<MCRelocationInfo> RelInfo(
-        TheTarget->createMCRelocationInfo(TripleName, Ctx));
-    if (RelInfo) {
-      std::unique_ptr<MCSymbolizer> Symzer(
-        MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo),
-                                                   Obj));
-      if (Symzer)
-        DisAsm->setSymbolizer(std::move(Symzer));
-    }
-  }
-
   std::unique_ptr<const MCInstrAnalysis> MIA(
       TheTarget->createMCInstrAnalysis(MII.get()));
 
@@ -351,45 +293,6 @@
     return;
   }
 
-  if (CFG || !YAMLCFG.empty()) {
-    std::unique_ptr<MCObjectDisassembler> OD(
-        new MCObjectDisassembler(*Obj, *DisAsm, *MIA));
-    std::unique_ptr<MCModule> Mod(OD->buildModule(/* withCFG */ true));
-    for (MCModule::const_atom_iterator AI = Mod->atom_begin(),
-                                       AE = Mod->atom_end();
-                                       AI != AE; ++AI) {
-      outs() << "Atom " << (*AI)->getName() << ": \n";
-      if (const MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI)) {
-        for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
-             II != IE;
-             ++II) {
-          IP->printInst(&II->Inst, outs(), "");
-          outs() << "\n";
-        }
-      }
-    }
-    if (CFG) {
-      for (MCModule::const_func_iterator FI = Mod->func_begin(),
-                                         FE = Mod->func_end();
-                                         FI != FE; ++FI) {
-        static int filenum = 0;
-        emitDOTFile((Twine((*FI)->getName()) + "_" +
-                     utostr(filenum) + ".dot").str().c_str(),
-                      **FI, IP.get());
-        ++filenum;
-      }
-    }
-    if (!YAMLCFG.empty()) {
-      std::string Error;
-      raw_fd_ostream YAMLOut(YAMLCFG.c_str(), Error, sys::fs::F_Text);
-      if (!Error.empty()) {
-        errs() << ToolName << ": warning: " << Error << '\n';
-        return;
-      }
-      mcmodule2yaml(YAMLOut, *Mod, *MII, *MRI);
-    }
-  }
-
   StringRef Fmt = Obj->getBytesInAddress() > 4 ? "\t\t%016" PRIx64 ":  " :
                                                  "\t\t\t%08" PRIx64 ":  ";
 
@@ -404,25 +307,18 @@
   }
 
   for (const SectionRef &Section : Obj->sections()) {
-    bool Text;
-    if (error(Section.isText(Text)))
-      break;
-    if (!Text)
+    if (!Section.isText() || Section.isVirtual())
       continue;
 
-    uint64_t SectionAddr;
-    if (error(Section.getAddress(SectionAddr)))
-      break;
-
-    uint64_t SectSize;
-    if (error(Section.getSize(SectSize)))
-      break;
+    uint64_t SectionAddr = Section.getAddress();
+    uint64_t SectSize = Section.getSize();
+    if (!SectSize)
+      continue;
 
     // Make a list of all the symbols in this section.
     std::vector<std::pair<uint64_t, StringRef>> Symbols;
     for (const SymbolRef &Symbol : Obj->symbols()) {
-      bool contains;
-      if (!error(Section.containsSymbol(Symbol, contains)) && contains) {
+      if (Section.containsSymbol(Symbol)) {
         uint64_t Address;
         if (error(Symbol.getAddress(Address)))
           break;
@@ -477,10 +373,12 @@
     SmallString<40> Comments;
     raw_svector_ostream CommentStream(Comments);
 
-    StringRef Bytes;
-    if (error(Section.getContents(Bytes)))
+    StringRef BytesStr;
+    if (error(Section.getContents(BytesStr)))
       break;
-    StringRefMemoryObject memoryObject(Bytes, SectionAddr);
+    ArrayRef<uint8_t> Bytes(reinterpret_cast<const uint8_t *>(BytesStr.data()),
+                            BytesStr.size());
+
     uint64_t Size;
     uint64_t Index;
 
@@ -488,17 +386,12 @@
     std::vector<RelocationRef>::const_iterator rel_end = Rels.end();
     // Disassemble symbol by symbol.
     for (unsigned si = 0, se = Symbols.size(); si != se; ++si) {
+
       uint64_t Start = Symbols[si].first;
-      uint64_t End;
-      // The end is either the size of the section or the beginning of the next
-      // symbol.
-      if (si == se - 1)
-        End = SectSize;
-      // Make sure this symbol takes up space.
-      else if (Symbols[si + 1].first != Start)
-        End = Symbols[si + 1].first - 1;
-      else
-        // This symbol has the same address as the next symbol. Skip it.
+      // The end is either the section end or the beginning of the next symbol.
+      uint64_t End = (si == se - 1) ? SectSize : Symbols[si + 1].first;
+      // If this symbol has the same address as the next symbol, then skip it.
+      if (Start == End)
         continue;
 
       outs() << '\n' << Symbols[si].second << ":\n";
@@ -512,13 +405,14 @@
       for (Index = Start; Index < End; Index += Size) {
         MCInst Inst;
 
-        if (DisAsm->getInstruction(Inst, Size, memoryObject,
-                                   SectionAddr + Index,
-                                   DebugOut, CommentStream)) {
+        if (DisAsm->getInstruction(Inst, Size, Bytes.slice(Index),
+                                   SectionAddr + Index, DebugOut,
+                                   CommentStream)) {
           outs() << format("%8" PRIx64 ":", SectionAddr + Index);
           if (!NoShowRawInsn) {
             outs() << "\t";
-            DumpBytes(StringRef(Bytes.data() + Index, Size));
+            DumpBytes(StringRef(
+                reinterpret_cast<const char *>(Bytes.data()) + Index, Size));
           }
           IP->printInst(&Inst, outs(), "");
           outs() << CommentStream.str();
@@ -561,6 +455,11 @@
 static void PrintRelocations(const ObjectFile *Obj) {
   StringRef Fmt = Obj->getBytesInAddress() > 4 ? "%016" PRIx64 :
                                                  "%08" PRIx64;
+  // Regular objdump doesn't print relocations in non-relocatable object
+  // files.
+  if (!Obj->isRelocatableObject())
+    return;
+
   for (const SectionRef &Section : Obj->sections()) {
     if (Section.relocation_begin() == Section.relocation_end())
       continue;
@@ -598,19 +497,11 @@
     StringRef Name;
     if (error(Section.getName(Name)))
       return;
-    uint64_t Address;
-    if (error(Section.getAddress(Address)))
-      return;
-    uint64_t Size;
-    if (error(Section.getSize(Size)))
-      return;
-    bool Text, Data, BSS;
-    if (error(Section.isText(Text)))
-      return;
-    if (error(Section.isData(Data)))
-      return;
-    if (error(Section.isBSS(BSS)))
-      return;
+    uint64_t Address = Section.getAddress();
+    uint64_t Size = Section.getSize();
+    bool Text = Section.isText();
+    bool Data = Section.isData();
+    bool BSS = Section.isBSS();
     std::string Type = (std::string(Text ? "TEXT " : "") +
                         (Data ? "DATA " : "") + (BSS ? "BSS" : ""));
     outs() << format("%3d %-13s %08" PRIx64 " %016" PRIx64 " %s\n", i,
@@ -624,25 +515,24 @@
   for (const SectionRef &Section : Obj->sections()) {
     StringRef Name;
     StringRef Contents;
-    uint64_t BaseAddr;
-    bool BSS;
     if (error(Section.getName(Name)))
       continue;
-    if (error(Section.getContents(Contents)))
-      continue;
-    if (error(Section.getAddress(BaseAddr)))
-      continue;
-    if (error(Section.isBSS(BSS)))
+    uint64_t BaseAddr = Section.getAddress();
+    uint64_t Size = Section.getSize();
+    if (!Size)
       continue;
 
     outs() << "Contents of section " << Name << ":\n";
-    if (BSS) {
+    if (Section.isBSS()) {
       outs() << format("<skipping contents of bss section at [%04" PRIx64
-                       ", %04" PRIx64 ")>\n", BaseAddr,
-                       BaseAddr + Contents.size());
+                       ", %04" PRIx64 ")>\n",
+                       BaseAddr, BaseAddr + Size);
       continue;
     }
 
+    if (error(Section.getContents(Contents)))
+      continue;
+
     // Dump out the content as hex and printable ascii characters.
     for (std::size_t addr = 0, end = Contents.size(); addr < end; addr += 16) {
       outs() << format(" %04" PRIx64 " ", BaseAddr + addr);
@@ -670,34 +560,32 @@
 }
 
 static void PrintCOFFSymbolTable(const COFFObjectFile *coff) {
-  const coff_file_header *header;
-  if (error(coff->getHeader(header)))
-    return;
-
-  for (unsigned SI = 0, SE = header->NumberOfSymbols; SI != SE; ++SI) {
-    const coff_symbol *Symbol;
+  for (unsigned SI = 0, SE = coff->getNumberOfSymbols(); SI != SE; ++SI) {
+    ErrorOr<COFFSymbolRef> Symbol = coff->getSymbol(SI);
     StringRef Name;
-    if (error(coff->getSymbol(SI, Symbol)))
+    if (error(Symbol.getError()))
       return;
 
-    if (error(coff->getSymbolName(Symbol, Name)))
+    if (error(coff->getSymbolName(*Symbol, Name)))
       return;
 
     outs() << "[" << format("%2d", SI) << "]"
-           << "(sec " << format("%2d", int(Symbol->SectionNumber)) << ")"
+           << "(sec " << format("%2d", int(Symbol->getSectionNumber())) << ")"
            << "(fl 0x00)" // Flag bits, which COFF doesn't have.
-           << "(ty " << format("%3x", unsigned(Symbol->Type)) << ")"
-           << "(scl " << format("%3x", unsigned(Symbol->StorageClass)) << ") "
-           << "(nx " << unsigned(Symbol->NumberOfAuxSymbols) << ") "
-           << "0x" << format("%08x", unsigned(Symbol->Value)) << " "
+           << "(ty " << format("%3x", unsigned(Symbol->getType())) << ")"
+           << "(scl " << format("%3x", unsigned(Symbol->getStorageClass())) << ") "
+           << "(nx " << unsigned(Symbol->getNumberOfAuxSymbols()) << ") "
+           << "0x" << format("%08x", unsigned(Symbol->getValue())) << " "
            << Name << "\n";
 
-    for (unsigned AI = 0, AE = Symbol->NumberOfAuxSymbols; AI < AE; ++AI, ++SI) {
+    for (unsigned AI = 0, AE = Symbol->getNumberOfAuxSymbols(); AI < AE; ++AI, ++SI) {
       if (Symbol->isSectionDefinition()) {
         const coff_aux_section_definition *asd;
         if (error(coff->getAuxSymbol<coff_aux_section_definition>(SI + 1, asd)))
           return;
 
+        int32_t AuxNumber = asd->getNumber(Symbol->isBigObj());
+
         outs() << "AUX "
                << format("scnlen 0x%x nreloc %d nlnno %d checksum 0x%x "
                          , unsigned(asd->Length)
@@ -705,18 +593,18 @@
                          , unsigned(asd->NumberOfLinenumbers)
                          , unsigned(asd->CheckSum))
                << format("assoc %d comdat %d\n"
-                         , unsigned(asd->Number)
+                         , unsigned(AuxNumber)
                          , unsigned(asd->Selection));
       } else if (Symbol->isFileRecord()) {
-        const coff_aux_file *AF;
-        if (error(coff->getAuxSymbol<coff_aux_file>(SI + 1, AF)))
+        const char *FileName;
+        if (error(coff->getAuxSymbol<char>(SI + 1, FileName)))
           return;
 
-        StringRef Name(AF->FileName,
-                       Symbol->NumberOfAuxSymbols * COFF::SymbolSize);
+        StringRef Name(FileName, Symbol->getNumberOfAuxSymbols() *
+                                     coff->getSymbolTableEntrySize());
         outs() << "AUX " << Name.rtrim(StringRef("\0", 1))  << '\n';
 
-        SI = SI + Symbol->NumberOfAuxSymbols;
+        SI = SI + Symbol->getNumberOfAuxSymbols();
         break;
       } else {
         outs() << "AUX Unknown\n";
@@ -809,10 +697,67 @@
 
   if (const COFFObjectFile *coff = dyn_cast<COFFObjectFile>(o)) {
     printCOFFUnwindInfo(coff);
-  } else {
+  } else if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+    printMachOUnwindInfo(MachO);
+  else {
     // TODO: Extract DWARF dump tool to objdump.
     errs() << "This operation is only currently supported "
-              "for COFF object files.\n";
+              "for COFF and MachO object files.\n";
+    return;
+  }
+}
+
+static void printExportsTrie(const ObjectFile *o) {
+  outs() << "Exports trie:\n";
+  if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+    printMachOExportsTrie(MachO);
+  else {
+    errs() << "This operation is only currently supported "
+              "for Mach-O executable files.\n";
+    return;
+  }
+}
+
+static void printRebaseTable(const ObjectFile *o) {
+  outs() << "Rebase table:\n";
+  if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+    printMachORebaseTable(MachO);
+  else {
+    errs() << "This operation is only currently supported "
+              "for Mach-O executable files.\n";
+    return;
+  }
+}
+
+static void printBindTable(const ObjectFile *o) {
+  outs() << "Bind table:\n";
+  if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+    printMachOBindTable(MachO);
+  else {
+    errs() << "This operation is only currently supported "
+              "for Mach-O executable files.\n";
+    return;
+  }
+}
+
+static void printLazyBindTable(const ObjectFile *o) {
+  outs() << "Lazy bind table:\n";
+  if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+    printMachOLazyBindTable(MachO);
+  else {
+    errs() << "This operation is only currently supported "
+              "for Mach-O executable files.\n";
+    return;
+  }
+}
+
+static void printWeakBindTable(const ObjectFile *o) {
+  outs() << "Weak bind table:\n";
+  if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+    printMachOWeakBindTable(MachO);
+  else {
+    errs() << "This operation is only currently supported "
+              "for Mach-O executable files.\n";
     return;
   }
 }
@@ -822,6 +767,8 @@
     printELFFileHeader(o);
   } else if (o->isCOFF()) {
     printCOFFFileHeader(o);
+  } else if (o->isMachO()) {
+    printMachOFileHeader(o);
   }
 }
 
@@ -844,6 +791,16 @@
     PrintUnwindInfo(o);
   if (PrivateHeaders)
     printPrivateFileHeader(o);
+  if (ExportsTrie)
+    printExportsTrie(o);
+  if (Rebase)
+    printRebaseTable(o);
+  if (Bind)
+    printBindTable(o);
+  if (LazyBind)
+    printLazyBindTable(o);
+  if (WeakBind)
+    printWeakBindTable(o);
 }
 
 /// @brief Dump each object file in \a a;
@@ -880,16 +837,16 @@
   }
 
   // Attempt to open the binary.
-  ErrorOr<Binary *> BinaryOrErr = createBinary(file);
+  ErrorOr<OwningBinary<Binary>> BinaryOrErr = createBinary(file);
   if (std::error_code EC = BinaryOrErr.getError()) {
     errs() << ToolName << ": '" << file << "': " << EC.message() << ".\n";
     return;
   }
-  std::unique_ptr<Binary> binary(BinaryOrErr.get());
+  Binary &Binary = *BinaryOrErr.get().getBinary();
 
-  if (Archive *a = dyn_cast<Archive>(binary.get()))
+  if (Archive *a = dyn_cast<Archive>(&Binary))
     DumpArchive(a);
-  else if (ObjectFile *o = dyn_cast<ObjectFile>(binary.get()))
+  else if (ObjectFile *o = dyn_cast<ObjectFile>(&Binary))
     DumpObject(o);
   else
     errs() << ToolName << ": '" << file << "': " << "Unrecognized file type.\n";
@@ -925,7 +882,12 @@
       && !SectionContents
       && !SymbolTable
       && !UnwindInfo
-      && !PrivateHeaders) {
+      && !PrivateHeaders
+      && !ExportsTrie
+      && !Rebase
+      && !Bind
+      && !LazyBind
+      && !WeakBind) {
     cl::PrintHelpMessage();
     return 2;
   }

diff --git a/tools/llvm-objdump/llvm-objdump.h b/tools/llvm-objdump/llvm-objdump.h
index 80f8f58..ef1509f 100644
--- a/tools/llvm-objdump/llvm-objdump.h
+++ b/tools/llvm-objdump/llvm-objdump.h

@@ -7,23 +7,26 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_OBJDUMP_H
-#define LLVM_OBJDUMP_H
+#ifndef LLVM_TOOLS_LLVM_OBJDUMP_LLVM_OBJDUMP_H
+#define LLVM_TOOLS_LLVM_OBJDUMP_LLVM_OBJDUMP_H
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/DataTypes.h"
-#include "llvm/Support/StringRefMemoryObject.h"
 
 namespace llvm {
 namespace object {
   class COFFObjectFile;
+  class MachOObjectFile;
   class ObjectFile;
   class RelocationRef;
 }
 
 extern cl::opt<std::string> TripleName;
 extern cl::opt<std::string> ArchName;
+extern cl::opt<std::string> MCPU;
+extern cl::list<std::string> MAttrs;
+extern cl::opt<bool> NoShowRawInsn;
 
 // Various helper functions.
 bool error(std::error_code ec);
@@ -31,8 +34,15 @@
 void DumpBytes(StringRef bytes);
 void DisassembleInputMachO(StringRef Filename);
 void printCOFFUnwindInfo(const object::COFFObjectFile* o);
+void printMachOUnwindInfo(const object::MachOObjectFile* o);
+void printMachOExportsTrie(const object::MachOObjectFile* o);
+void printMachORebaseTable(const object::MachOObjectFile* o);
+void printMachOBindTable(const object::MachOObjectFile* o);
+void printMachOLazyBindTable(const object::MachOObjectFile* o);
+void printMachOWeakBindTable(const object::MachOObjectFile* o);
 void printELFFileHeader(const object::ObjectFile *o);
 void printCOFFFileHeader(const object::ObjectFile *o);
+void printMachOFileHeader(const object::ObjectFile *o);
 
 } // end namespace llvm
 

diff --git a/tools/llvm-profdata/CMakeLists.txt b/tools/llvm-profdata/CMakeLists.txt
index 3529114..0e330fd 100644
--- a/tools/llvm-profdata/CMakeLists.txt
+++ b/tools/llvm-profdata/CMakeLists.txt

@@ -1,4 +1,8 @@
-set(LLVM_LINK_COMPONENTS profiledata support)
+set(LLVM_LINK_COMPONENTS
+  Core
+  ProfileData
+  Support
+  )
 
 add_llvm_tool(llvm-profdata
   llvm-profdata.cpp

diff --git a/tools/llvm-profdata/llvm-profdata.cpp b/tools/llvm-profdata/llvm-profdata.cpp
index ba88aad..e977799 100644
--- a/tools/llvm-profdata/llvm-profdata.cpp
+++ b/tools/llvm-profdata/llvm-profdata.cpp

@@ -14,6 +14,9 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/ProfileData/InstrProfWriter.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/ProfileData/SampleProfWriter.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
@@ -33,32 +36,24 @@
   ::exit(1);
 }
 
-int merge_main(int argc, const char *argv[]) {
-  cl::list<std::string> Inputs(cl::Positional, cl::Required, cl::OneOrMore,
-                               cl::desc("<filenames...>"));
+enum ProfileKinds { instr, sample };
 
-  cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
-                                      cl::init("-"),
-                                      cl::desc("Output file"));
-  cl::alias OutputFilenameA("o", cl::desc("Alias for --output"), cl::Required,
-                            cl::aliasopt(OutputFilename));
-
-  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n");
-
+void mergeInstrProfile(cl::list<std::string> Inputs, StringRef OutputFilename) {
   if (OutputFilename.compare("-") == 0)
     exitWithError("Cannot write indexed profdata format to stdout.");
 
-  std::string ErrorInfo;
-  raw_fd_ostream Output(OutputFilename.data(), ErrorInfo, sys::fs::F_None);
-  if (!ErrorInfo.empty())
-    exitWithError(ErrorInfo, OutputFilename);
+  std::error_code EC;
+  raw_fd_ostream Output(OutputFilename.data(), EC, sys::fs::F_None);
+  if (EC)
+    exitWithError(EC.message(), OutputFilename);
 
   InstrProfWriter Writer;
   for (const auto &Filename : Inputs) {
-    std::unique_ptr<InstrProfReader> Reader;
-    if (std::error_code ec = InstrProfReader::create(Filename, Reader))
+    auto ReaderOrErr = InstrProfReader::create(Filename);
+    if (std::error_code ec = ReaderOrErr.getError())
       exitWithError(ec.message(), Filename);
 
+    auto Reader = std::move(ReaderOrErr.get());
     for (const auto &I : *Reader)
       if (std::error_code EC =
               Writer.addFunctionCounts(I.Name, I.Hash, I.Counts))
@@ -67,50 +62,86 @@
       exitWithError(Reader->getError().message(), Filename);
   }
   Writer.write(Output);
+}
+
+void mergeSampleProfile(cl::list<std::string> Inputs, StringRef OutputFilename,
+                        sampleprof::SampleProfileFormat OutputFormat) {
+  using namespace sampleprof;
+  auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat);
+  if (std::error_code EC = WriterOrErr.getError())
+    exitWithError(EC.message(), OutputFilename);
+
+  auto Writer = std::move(WriterOrErr.get());
+  StringMap<FunctionSamples> ProfileMap;
+  for (const auto &Filename : Inputs) {
+    auto ReaderOrErr =
+        SampleProfileReader::create(Filename, getGlobalContext());
+    if (std::error_code EC = ReaderOrErr.getError())
+      exitWithError(EC.message(), Filename);
+
+    auto Reader = std::move(ReaderOrErr.get());
+    if (std::error_code EC = Reader->read())
+      exitWithError(EC.message(), Filename);
+
+    StringMap<FunctionSamples> &Profiles = Reader->getProfiles();
+    for (StringMap<FunctionSamples>::iterator I = Profiles.begin(),
+                                              E = Profiles.end();
+         I != E; ++I) {
+      StringRef FName = I->first();
+      FunctionSamples &Samples = I->second;
+      ProfileMap[FName].merge(Samples);
+    }
+  }
+  Writer->write(ProfileMap);
+}
+
+int merge_main(int argc, const char *argv[]) {
+  cl::list<std::string> Inputs(cl::Positional, cl::Required, cl::OneOrMore,
+                               cl::desc("<filenames...>"));
+
+  cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                      cl::init("-"), cl::Required,
+                                      cl::desc("Output file"));
+  cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                            cl::aliasopt(OutputFilename));
+  cl::opt<ProfileKinds> ProfileKind(
+      cl::desc("Profile kind:"), cl::init(instr),
+      cl::values(clEnumVal(instr, "Instrumentation profile (default)"),
+                 clEnumVal(sample, "Sample profile"), clEnumValEnd));
+
+  cl::opt<sampleprof::SampleProfileFormat> OutputFormat(
+      cl::desc("Format of output profile (only meaningful with --sample)"),
+      cl::init(sampleprof::SPF_Binary),
+      cl::values(clEnumValN(sampleprof::SPF_Binary, "binary",
+                            "Binary encoding (default)"),
+                 clEnumValN(sampleprof::SPF_Text, "text", "Text encoding"),
+                 clEnumValN(sampleprof::SPF_GCC, "gcc", "GCC encoding"),
+                 clEnumValEnd));
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data merger\n");
+
+  if (ProfileKind == instr)
+    mergeInstrProfile(Inputs, OutputFilename);
+  else
+    mergeSampleProfile(Inputs, OutputFilename, OutputFormat);
 
   return 0;
 }
 
-int show_main(int argc, const char *argv[]) {
-  cl::opt<std::string> Filename(cl::Positional, cl::Required,
-                                cl::desc("<profdata-file>"));
-
-  cl::opt<bool> ShowCounts("counts", cl::init(false),
-                           cl::desc("Show counter values for shown functions"));
-  cl::opt<bool> ShowAllFunctions("all-functions", cl::init(false),
-                                 cl::desc("Details for every function"));
-  cl::opt<std::string> ShowFunction("function",
-                                    cl::desc("Details for matching functions"));
-
-  cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
-                                      cl::init("-"),
-                                      cl::desc("Output file"));
-  cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
-                            cl::aliasopt(OutputFilename));
-
-  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data summary\n");
-
-  std::unique_ptr<InstrProfReader> Reader;
-  if (std::error_code EC = InstrProfReader::create(Filename, Reader))
+int showInstrProfile(std::string Filename, bool ShowCounts,
+                     bool ShowAllFunctions, std::string ShowFunction,
+                     raw_fd_ostream &OS) {
+  auto ReaderOrErr = InstrProfReader::create(Filename);
+  if (std::error_code EC = ReaderOrErr.getError())
     exitWithError(EC.message(), Filename);
 
-  if (OutputFilename.empty())
-    OutputFilename = "-";
-
-  std::string ErrorInfo;
-  raw_fd_ostream OS(OutputFilename.data(), ErrorInfo, sys::fs::F_Text);
-  if (!ErrorInfo.empty())
-    exitWithError(ErrorInfo, OutputFilename);
-
-  if (ShowAllFunctions && !ShowFunction.empty())
-    errs() << "warning: -function argument ignored: showing all functions\n";
-
+  auto Reader = std::move(ReaderOrErr.get());
   uint64_t MaxFunctionCount = 0, MaxBlockCount = 0;
   size_t ShownFunctions = 0, TotalFunctions = 0;
   for (const auto &Func : *Reader) {
-    bool Show = ShowAllFunctions ||
-                (!ShowFunction.empty() &&
-                 Func.Name.find(ShowFunction) != Func.Name.npos);
+    bool Show =
+        ShowAllFunctions || (!ShowFunction.empty() &&
+                             Func.Name.find(ShowFunction) != Func.Name.npos);
 
     ++TotalFunctions;
     assert(Func.Counts.size() > 0 && "function missing entry counter");
@@ -150,6 +181,65 @@
   return 0;
 }
 
+int showSampleProfile(std::string Filename, bool ShowCounts,
+                      bool ShowAllFunctions, std::string ShowFunction,
+                      raw_fd_ostream &OS) {
+  using namespace sampleprof;
+  auto ReaderOrErr = SampleProfileReader::create(Filename, getGlobalContext());
+  if (std::error_code EC = ReaderOrErr.getError())
+    exitWithError(EC.message(), Filename);
+
+  auto Reader = std::move(ReaderOrErr.get());
+  Reader->read();
+  if (ShowAllFunctions || ShowFunction.empty())
+    Reader->dump(OS);
+  else
+    Reader->dumpFunctionProfile(ShowFunction, OS);
+
+  return 0;
+}
+
+int show_main(int argc, const char *argv[]) {
+  cl::opt<std::string> Filename(cl::Positional, cl::Required,
+                                cl::desc("<profdata-file>"));
+
+  cl::opt<bool> ShowCounts("counts", cl::init(false),
+                           cl::desc("Show counter values for shown functions"));
+  cl::opt<bool> ShowAllFunctions("all-functions", cl::init(false),
+                                 cl::desc("Details for every function"));
+  cl::opt<std::string> ShowFunction("function",
+                                    cl::desc("Details for matching functions"));
+
+  cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                      cl::init("-"), cl::desc("Output file"));
+  cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                            cl::aliasopt(OutputFilename));
+  cl::opt<ProfileKinds> ProfileKind(
+      cl::desc("Profile kind:"), cl::init(instr),
+      cl::values(clEnumVal(instr, "Instrumentation profile (default)"),
+                 clEnumVal(sample, "Sample profile"), clEnumValEnd));
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data summary\n");
+
+  if (OutputFilename.empty())
+    OutputFilename = "-";
+
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::F_Text);
+  if (EC)
+    exitWithError(EC.message(), OutputFilename);
+
+  if (ShowAllFunctions && !ShowFunction.empty())
+    errs() << "warning: -function argument ignored: showing all functions\n";
+
+  if (ProfileKind == instr)
+    return showInstrProfile(Filename, ShowCounts, ShowAllFunctions,
+                            ShowFunction, OS);
+  else
+    return showSampleProfile(Filename, ShowCounts, ShowAllFunctions,
+                             ShowFunction, OS);
+}
+
 int main(int argc, const char *argv[]) {
   // Print a stack trace if we signal out.
   sys::PrintStackTraceOnErrorSignal();

diff --git a/tools/llvm-readobj/ARMAttributeParser.h b/tools/llvm-readobj/ARMAttributeParser.h
index c286251..f924c83 100644
--- a/tools/llvm-readobj/ARMAttributeParser.h
+++ b/tools/llvm-readobj/ARMAttributeParser.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_READOBJ_ARMATTRIBUTE_PARSER_H
-#define LLVM_READOBJ_ARMATTRIBUTE_PARSER_H
+#ifndef LLVM_TOOLS_LLVM_READOBJ_ARMATTRIBUTEPARSER_H
+#define LLVM_TOOLS_LLVM_READOBJ_ARMATTRIBUTEPARSER_H
 
 #include "StreamWriter.h"
 #include "llvm/Support/ARMBuildAttributes.h"

diff --git a/tools/llvm-readobj/ARMEHABIPrinter.h b/tools/llvm-readobj/ARMEHABIPrinter.h
index 7608cfb..b15421d 100644
--- a/tools/llvm-readobj/ARMEHABIPrinter.h
+++ b/tools/llvm-readobj/ARMEHABIPrinter.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_READOBJ_ARMEHABI_PRINTER_H
-#define LLVM_READOBJ_ARMEHABI_PRINTER_H
+#ifndef LLVM_TOOLS_LLVM_READOBJ_ARMEHABIPRINTER_H
+#define LLVM_TOOLS_LLVM_READOBJ_ARMEHABIPRINTER_H
 
 #include "Error.h"
 #include "StreamWriter.h"

diff --git a/tools/llvm-readobj/ARMWinEHPrinter.cpp b/tools/llvm-readobj/ARMWinEHPrinter.cpp
index b486e4a..ede36d1 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/tools/llvm-readobj/ARMWinEHPrinter.cpp

@@ -186,13 +186,8 @@
 ErrorOr<object::SectionRef>
 Decoder::getSectionContaining(const COFFObjectFile &COFF, uint64_t VA) {
   for (const auto &Section : COFF.sections()) {
-    uint64_t Address;
-    uint64_t Size;
-
-    if (std::error_code EC = Section.getAddress(Address))
-      return EC;
-    if (std::error_code EC = Section.getSize(Size))
-      return EC;
+    uint64_t Address = Section.getAddress();
+    uint64_t Size = Section.getSize();
 
     if (VA >= Address && (VA - Address) <= Size)
       return Section;
@@ -233,7 +228,7 @@
   return readobj_error::unknown_symbol;
 }
 
-bool Decoder::opcode_0xxxxxxx(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_0xxxxxxx(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   uint8_t Imm = OC[Offset] & 0x7f;
   SW.startLine() << format("0x%02x                ; %s sp, #(%u * 4)\n",
@@ -244,7 +239,7 @@
   return false;
 }
 
-bool Decoder::opcode_10Lxxxxx(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_10Lxxxxx(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   unsigned Link = (OC[Offset] & 0x20) >> 5;
   uint16_t RegisterMask = (Link << (Prologue ? 14 : 15))
@@ -263,7 +258,7 @@
   return false;
 }
 
-bool Decoder::opcode_1100xxxx(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_1100xxxx(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   if (Prologue)
     SW.startLine() << format("0x%02x                ; mov r%u, sp\n",
@@ -275,7 +270,7 @@
   return false;
 }
 
-bool Decoder::opcode_11010Lxx(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11010Lxx(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   unsigned Link = (OC[Offset] & 0x4) >> 3;
   unsigned Count = (OC[Offset] & 0x3);
@@ -292,7 +287,7 @@
   return false;
 }
 
-bool Decoder::opcode_11011Lxx(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11011Lxx(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   unsigned Link = (OC[Offset] & 0x4) >> 2;
   unsigned Count = (OC[Offset] & 0x3) + 4;
@@ -309,7 +304,7 @@
   return false;
 }
 
-bool Decoder::opcode_11100xxx(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11100xxx(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   unsigned High = (OC[Offset] & 0x7);
   uint32_t VFPMask = (((1 << (High + 1)) - 1) << 8);
@@ -323,7 +318,7 @@
   return false;
 }
 
-bool Decoder::opcode_111010xx(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_111010xx(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   uint16_t Imm = ((OC[Offset + 0] & 0x03) << 8) | ((OC[Offset + 1] & 0xff) << 0);
 
@@ -336,7 +331,7 @@
   return false;
 }
 
-bool Decoder::opcode_1110110L(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_1110110L(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   uint8_t GPRMask = ((OC[Offset + 0] & 0x01) << (Prologue ? 14 : 15))
                   | ((OC[Offset + 1] & 0xff) << 0);
@@ -350,7 +345,7 @@
   return false;
 }
 
-bool Decoder::opcode_11101110(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11101110(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   assert(!Prologue && "may not be used in prologue");
 
@@ -366,7 +361,7 @@
   return false;
 }
 
-bool Decoder::opcode_11101111(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11101111(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   assert(!Prologue && "may not be used in prologue");
 
@@ -382,7 +377,7 @@
   return false;
 }
 
-bool Decoder::opcode_11110101(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11110101(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   unsigned Start = (OC[Offset + 1] & 0xf0) >> 4;
   unsigned End = (OC[Offset + 1] & 0x0f) >> 0;
@@ -397,7 +392,7 @@
   return false;
 }
 
-bool Decoder::opcode_11110110(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11110110(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   unsigned Start = (OC[Offset + 1] & 0xf0) >> 4;
   unsigned End = (OC[Offset + 1] & 0x0f) >> 0;
@@ -412,7 +407,7 @@
   return false;
 }
 
-bool Decoder::opcode_11110111(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11110111(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   uint32_t Imm = (OC[Offset + 1] << 8) | (OC[Offset + 2] << 0);
 
@@ -425,7 +420,7 @@
   return false;
 }
 
-bool Decoder::opcode_11111000(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11111000(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   uint32_t Imm = (OC[Offset + 1] << 16)
                | (OC[Offset + 2] << 8)
@@ -440,7 +435,7 @@
   return false;
 }
 
-bool Decoder::opcode_11111001(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11111001(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   uint32_t Imm = (OC[Offset + 1] << 8) | (OC[Offset + 2] << 0);
 
@@ -453,7 +448,7 @@
   return false;
 }
 
-bool Decoder::opcode_11111010(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11111010(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   uint32_t Imm = (OC[Offset + 1] << 16)
                | (OC[Offset + 2] << 8)
@@ -468,41 +463,41 @@
   return false;
 }
 
-bool Decoder::opcode_11111011(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11111011(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   SW.startLine() << format("0x%02x                ; nop\n", OC[Offset]);
   ++Offset;
   return false;
 }
 
-bool Decoder::opcode_11111100(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11111100(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   SW.startLine() << format("0x%02x                ; nop.w\n", OC[Offset]);
   ++Offset;
   return false;
 }
 
-bool Decoder::opcode_11111101(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11111101(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   SW.startLine() << format("0x%02x                ; b\n", OC[Offset]);
   ++Offset;
   return true;
 }
 
-bool Decoder::opcode_11111110(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11111110(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   SW.startLine() << format("0x%02x                ; b.w\n", OC[Offset]);
   ++Offset;
   return true;
 }
 
-bool Decoder::opcode_11111111(const ulittle8_t *OC, unsigned &Offset,
+bool Decoder::opcode_11111111(const uint8_t *OC, unsigned &Offset,
                               unsigned Length, bool Prologue) {
   ++Offset;
   return true;
 }
 
-void Decoder::decodeOpcodes(ArrayRef<ulittle8_t> Opcodes, unsigned Offset,
+void Decoder::decodeOpcodes(ArrayRef<uint8_t> Opcodes, unsigned Offset,
                             bool Prologue) {
   assert((!Prologue || Offset == 0) && "prologue should always use offset 0");
 
@@ -525,10 +520,7 @@
   if (COFF.getSectionContents(COFF.getCOFFSection(Section), Contents))
     return false;
 
-  uint64_t SectionVA;
-  if (Section.getAddress(SectionVA))
-    return false;
-
+  uint64_t SectionVA = Section.getAddress();
   uint64_t Offset = VA - SectionVA;
   const ulittle32_t *Data =
     reinterpret_cast<const ulittle32_t *>(Contents.data() + Offset);
@@ -546,7 +538,7 @@
                  static_cast<uint64_t>(XData.CodeWords() * sizeof(uint32_t)));
 
   if (XData.E()) {
-    ArrayRef<ulittle8_t> UC = XData.UnwindByteCode();
+    ArrayRef<uint8_t> UC = XData.UnwindByteCode();
     if (!XData.F()) {
       ListScope PS(SW, "Prologue");
       decodeOpcodes(UC, 0, /*Prologue=*/true);
@@ -741,4 +733,3 @@
 }
 }
 }
-

diff --git a/tools/llvm-readobj/ARMWinEHPrinter.h b/tools/llvm-readobj/ARMWinEHPrinter.h
index 740c8b5..274ef11 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.h
+++ b/tools/llvm-readobj/ARMWinEHPrinter.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_READOBJ_ARMWINEHPRINTER_H
-#define LLVM_READOBJ_ARMWINEHPRINTER_H
+#ifndef LLVM_TOOLS_LLVM_READOBJ_ARMWINEHPRINTER_H
+#define LLVM_TOOLS_LLVM_READOBJ_ARMWINEHPRINTER_H
 
 #include "StreamWriter.h"
 #include "llvm/Object/COFF.h"
@@ -28,55 +28,54 @@
   struct RingEntry {
     uint8_t Mask;
     uint8_t Value;
-    bool (Decoder::*Routine)(const support::ulittle8_t *, unsigned &, unsigned,
-                             bool);
+    bool (Decoder::*Routine)(const uint8_t *, unsigned &, unsigned, bool);
   };
   static const RingEntry Ring[];
 
-  bool opcode_0xxxxxxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_0xxxxxxx(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_10Lxxxxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_10Lxxxxx(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_1100xxxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_1100xxxx(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11010Lxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11010Lxx(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11011Lxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11011Lxx(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11100xxx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11100xxx(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_111010xx(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_111010xx(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_1110110L(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_1110110L(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11101110(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11101110(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11101111(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11101111(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11110101(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11110101(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11110110(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11110110(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11110111(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11110111(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11111000(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11111000(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11111001(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11111001(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11111010(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11111010(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11111011(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11111011(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11111100(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11111100(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11111101(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11111101(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11111110(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11111110(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
-  bool opcode_11111111(const support::ulittle8_t *Opcodes, unsigned &Offset,
+  bool opcode_11111111(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
 
-  void decodeOpcodes(ArrayRef<support::ulittle8_t> Opcodes, unsigned Offset,
+  void decodeOpcodes(ArrayRef<uint8_t> Opcodes, unsigned Offset,
                      bool Prologue);
 
   void printRegisters(const std::pair<uint16_t, uint32_t> &RegisterMask);
@@ -116,4 +115,3 @@
 }
 
 #endif
-

diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index 7842cd4..5276428 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp

@@ -20,6 +20,7 @@
 #include "Win64EHDumper.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Object/COFF.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/COFF.h"
@@ -49,35 +50,49 @@
     cacheRelocations();
   }
 
-  virtual void printFileHeaders() override;
-  virtual void printSections() override;
-  virtual void printRelocations() override;
-  virtual void printSymbols() override;
-  virtual void printDynamicSymbols() override;
-  virtual void printUnwindInfo() override;
+  void printFileHeaders() override;
+  void printSections() override;
+  void printRelocations() override;
+  void printSymbols() override;
+  void printDynamicSymbols() override;
+  void printUnwindInfo() override;
+  void printCOFFImports() override;
+  void printCOFFDirectives() override;
+  void printCOFFBaseReloc() override;
 
 private:
   void printSymbol(const SymbolRef &Sym);
   void printRelocation(const SectionRef &Section, const RelocationRef &Reloc);
   void printDataDirectory(uint32_t Index, const std::string &FieldName);
 
+  void printDOSHeader(const dos_header *DH);
   template <class PEHeader> void printPEHeader(const PEHeader *Hdr);
   void printBaseOfDataField(const pe32_header *Hdr);
   void printBaseOfDataField(const pe32plus_header *Hdr);
 
   void printCodeViewLineTables(const SectionRef &Section);
 
+  void printCodeViewSymbolsSubsection(StringRef Subsection,
+                                      const SectionRef &Section,
+                                      uint32_t Offset);
+
   void cacheRelocations();
 
   std::error_code resolveSymbol(const coff_section *Section, uint64_t Offset,
                                 SymbolRef &Sym);
   std::error_code resolveSymbolName(const coff_section *Section,
                                     uint64_t Offset, StringRef &Name);
+  void printImportedSymbols(iterator_range<imported_symbol_iterator> Range);
+  void printDelayImportedSymbols(
+      const DelayImportDirectoryEntryRef &I,
+      iterator_range<imported_symbol_iterator> Range);
 
   typedef DenseMap<const coff_section*, std::vector<RelocationRef> > RelocMapTy;
 
   const llvm::object::COFFObjectFile *Obj;
   RelocMapTy RelocMap;
+  StringRef CVFileIndexToStringOffsetTable;
+  StringRef CVStringTable;
 };
 
 } // namespace
@@ -313,9 +328,10 @@
 
 template <typename T>
 static std::error_code getSymbolAuxData(const COFFObjectFile *Obj,
-                                        const coff_symbol *Symbol,
-                                        const T *&Aux) {
+                                        COFFSymbolRef Symbol,
+                                        uint8_t AuxSymbolIdx, const T *&Aux) {
   ArrayRef<uint8_t> AuxData = Obj->getSymbolAuxData(Symbol);
+  AuxData = AuxData.slice(AuxSymbolIdx * Obj->getSymbolTableEntrySize());
   Aux = reinterpret_cast<const T*>(AuxData.data());
   return readobj_error::success;
 }
@@ -342,25 +358,20 @@
 }
 
 void COFFDumper::printFileHeaders() {
-  // Print COFF header
-  const coff_file_header *COFFHeader = nullptr;
-  if (error(Obj->getCOFFHeader(COFFHeader)))
-    return;
-
-  time_t TDS = COFFHeader->TimeDateStamp;
+  time_t TDS = Obj->getTimeDateStamp();
   char FormattedTime[20] = { };
   strftime(FormattedTime, 20, "%Y-%m-%d %H:%M:%S", gmtime(&TDS));
 
   {
     DictScope D(W, "ImageFileHeader");
-    W.printEnum  ("Machine", COFFHeader->Machine,
+    W.printEnum  ("Machine", Obj->getMachine(),
                     makeArrayRef(ImageFileMachineType));
-    W.printNumber("SectionCount", COFFHeader->NumberOfSections);
-    W.printHex   ("TimeDateStamp", FormattedTime, COFFHeader->TimeDateStamp);
-    W.printHex   ("PointerToSymbolTable", COFFHeader->PointerToSymbolTable);
-    W.printNumber("SymbolCount", COFFHeader->NumberOfSymbols);
-    W.printNumber("OptionalHeaderSize", COFFHeader->SizeOfOptionalHeader);
-    W.printFlags ("Characteristics", COFFHeader->Characteristics,
+    W.printNumber("SectionCount", Obj->getNumberOfSections());
+    W.printHex   ("TimeDateStamp", FormattedTime, Obj->getTimeDateStamp());
+    W.printHex   ("PointerToSymbolTable", Obj->getPointerToSymbolTable());
+    W.printNumber("SymbolCount", Obj->getNumberOfSymbols());
+    W.printNumber("OptionalHeaderSize", Obj->getSizeOfOptionalHeader());
+    W.printFlags ("Characteristics", Obj->getCharacteristics(),
                     makeArrayRef(ImageFileCharacteristics));
   }
 
@@ -377,6 +388,30 @@
     return;
   if (PEPlusHeader)
     printPEHeader<pe32plus_header>(PEPlusHeader);
+
+  if (const dos_header *DH = Obj->getDOSHeader())
+    printDOSHeader(DH);
+}
+
+void COFFDumper::printDOSHeader(const dos_header *DH) { // Print each field of DH; DH is dereferenced unchecked.
+  DictScope D(W, "DOSHeader"); // all fields below are emitted inside this scope
+  W.printString("Magic", StringRef(DH->Magic, sizeof(DH->Magic))); // length is given explicitly
+  W.printNumber("UsedBytesInTheLastPage", DH->UsedBytesInTheLastPage);
+  W.printNumber("FileSizeInPages", DH->FileSizeInPages);
+  W.printNumber("NumberOfRelocationItems", DH->NumberOfRelocationItems);
+  W.printNumber("HeaderSizeInParagraphs", DH->HeaderSizeInParagraphs);
+  W.printNumber("MinimumExtraParagraphs", DH->MinimumExtraParagraphs);
+  W.printNumber("MaximumExtraParagraphs", DH->MaximumExtraParagraphs);
+  W.printNumber("InitialRelativeSS", DH->InitialRelativeSS);
+  W.printNumber("InitialSP", DH->InitialSP);
+  W.printNumber("Checksum", DH->Checksum);
+  W.printNumber("InitialIP", DH->InitialIP);
+  W.printNumber("InitialRelativeCS", DH->InitialRelativeCS);
+  W.printNumber("AddressOfRelocationTable", DH->AddressOfRelocationTable);
+  W.printNumber("OverlayNumber", DH->OverlayNumber);
+  W.printNumber("OEMid", DH->OEMid);
+  W.printNumber("OEMinfo", DH->OEMinfo);
+  W.printNumber("AddressOfNewExeHeader", DH->AddressOfNewExeHeader);
 }
 
 template <class PEHeader>
@@ -404,7 +439,7 @@
   W.printNumber("SizeOfImage", Hdr->SizeOfImage);
   W.printNumber("SizeOfHeaders", Hdr->SizeOfHeaders);
   W.printEnum  ("Subsystem", Hdr->Subsystem, makeArrayRef(PEWindowsSubsystem));
-  W.printFlags ("Subsystem", Hdr->DLLCharacteristics,
+  W.printFlags ("Characteristics", Hdr->DLLCharacteristics,
                 makeArrayRef(PEDLLCharacteristics));
   W.printNumber("SizeOfStackReserve", Hdr->SizeOfStackReserve);
   W.printNumber("SizeOfStackCommit", Hdr->SizeOfStackCommit);
@@ -440,11 +475,10 @@
 
   SmallVector<StringRef, 10> FunctionNames;
   StringMap<StringRef> FunctionLineTables;
-  StringRef FileIndexToStringOffsetTable;
-  StringRef StringTable;
 
   ListScope D(W, "CodeViewLineTables");
   {
+    // FIXME: Add more offset correctness checks.
     DataExtractor DE(Data, true, 4);
     uint32_t Offset = 0,
              Magic = DE.getU32(&Offset);
@@ -474,6 +508,9 @@
       W.printBinaryBlock("Contents", Contents);
 
       switch (SubSectionType) {
+      case COFF::DEBUG_SYMBOL_SUBSECTION:
+        printCodeViewSymbolsSubsection(Contents, Section, Offset);
+        break;
       case COFF::DEBUG_LINE_TABLE_SUBSECTION: {
         // Holds a PC to file:line table.  Some data to parse this subsection is
         // stored in the other subsections, so just check sanity and store the
@@ -502,25 +539,25 @@
         break;
       }
       case COFF::DEBUG_STRING_TABLE_SUBSECTION:
-        if (PayloadSize == 0 || StringTable.data() != nullptr ||
+        if (PayloadSize == 0 || CVStringTable.data() != nullptr ||
             Contents.back() != '\0') {
           // Empty or duplicate or non-null-terminated subsection.
           error(object_error::parse_failed);
           return;
         }
-        StringTable = Contents;
+        CVStringTable = Contents;
         break;
       case COFF::DEBUG_INDEX_SUBSECTION:
         // Holds the translation table from file indices
         // to offsets in the string table.
 
         if (PayloadSize == 0 ||
-            FileIndexToStringOffsetTable.data() != nullptr) {
+            CVFileIndexToStringOffsetTable.data() != nullptr) {
           // Empty or duplicate subsection.
           error(object_error::parse_failed);
           return;
         }
-        FileIndexToStringOffsetTable = Contents;
+        CVFileIndexToStringOffsetTable = Contents;
         break;
       }
       Offset += PayloadSize;
@@ -555,7 +592,7 @@
 
       uint32_t FilenameOffset;
       {
-        DataExtractor SDE(FileIndexToStringOffsetTable, true, 4);
+        DataExtractor SDE(CVFileIndexToStringOffsetTable, true, 4);
         uint32_t OffsetInSDE = OffsetInIndex;
         if (!SDE.isValidOffset(OffsetInSDE)) {
           error(object_error::parse_failed);
@@ -564,15 +601,15 @@
         FilenameOffset = SDE.getU32(&OffsetInSDE);
       }
 
-      if (FilenameOffset == 0 || FilenameOffset + 1 >= StringTable.size() ||
-          StringTable.data()[FilenameOffset - 1] != '\0') {
+      if (FilenameOffset == 0 || FilenameOffset + 1 >= CVStringTable.size() ||
+          CVStringTable.data()[FilenameOffset - 1] != '\0') {
         // Each string in an F3 subsection should be preceded by a null
         // character.
         error(object_error::parse_failed);
         return;
       }
 
-      StringRef Filename(StringTable.data() + FilenameOffset);
+      StringRef Filename(CVStringTable.data() + FilenameOffset);
       ListScope S(W, "FilenameSegment");
       W.printString("Filename", Filename);
       for (unsigned J = 0; J != SegmentLength && DE.isValidOffset(Offset);
@@ -593,6 +630,80 @@
   }
 }
 
+void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection,
+                                                const SectionRef &Section,
+                                                uint32_t OffsetInSection) { // Walk the records in Subsection, printing ProcStart/ProcEnd entries.
+  if (Subsection.size() == 0) { // an empty subsection is reported as a parse failure
+    error(object_error::parse_failed);
+    return;
+  }
+  DataExtractor DE(Subsection, true, 4); // little-endian reads, 4-byte address size
+  uint32_t Offset = 0; // read cursor, relative to the start of Subsection
+
+  // Function-level subsections have "procedure start" and "procedure end"
+  // commands that should come in pairs and surround relevant info.
+  bool InFunctionScope = false;
+  while (DE.isValidOffset(Offset)) {
+    // Read subsection segments one by one.
+    uint16_t Size = DE.getU16(&Offset);
+    // The section size includes the size of the type identifier.
+    if (Size < 2 || !DE.isValidOffsetForDataOfSize(Offset, Size)) {
+      error(object_error::parse_failed);
+      return;
+    }
+    Size -= 2; // from here on Size counts only the bytes after the 2-byte type
+    uint16_t Type = DE.getU16(&Offset);
+    switch (Type) {
+    case COFF::DEBUG_SYMBOL_TYPE_PROC_START: {
+      DictScope S(W, "ProcStart");
+      if (InFunctionScope || Size < 36) { // nested start, or fewer than the 35 fixed bytes read below plus one for the name
+        error(object_error::parse_failed);
+        return;
+      }
+      InFunctionScope = true;
+
+      // We're currently interested in a limited subset of fields in this
+      // segment, just ignore the rest of the fields for now.
+      uint8_t Unused[12]; // scratch buffer for bytes that are read and discarded
+      DE.getU8(&Offset, Unused, 12);
+      uint32_t CodeSize = DE.getU32(&Offset);
+      DE.getU8(&Offset, Unused, 12);
+      StringRef SectionName;
+      if (error(resolveSymbolName(Obj->getCOFFSection(Section),
+                                  OffsetInSection + Offset, SectionName))) // lookup uses the section-relative position of the next field
+        return;
+      Offset += 4; // step over the 4 bytes at the position just looked up
+      DE.getU8(&Offset, Unused, 3);
+      StringRef DisplayName = DE.getCStr(&Offset); // NOTE(review): Offset ends after the name; Size is not used to reach the record end - confirm
+      if (!DE.isValidOffset(Offset)) { // nothing may follow only if data ends here, which is rejected
+        error(object_error::parse_failed);
+        return;
+      }
+      W.printString("DisplayName", DisplayName);
+      W.printString("Section", SectionName);
+      W.printHex("CodeSize", CodeSize);
+
+      break;
+    }
+    case COFF::DEBUG_SYMBOL_TYPE_PROC_END: {
+      W.startLine() << "ProcEnd\n"; // printed before the record is validated
+      if (!InFunctionScope || Size > 0) { // end without a start, or an end record carrying a payload
+        error(object_error::parse_failed);
+        return;
+      }
+      InFunctionScope = false;
+      break;
+    }
+    default:
+      Offset += Size; // skip the payload of every other record type
+      break;
+    }
+  }
+
+  if (InFunctionScope) // a ProcStart was never matched by a ProcEnd
+    error(object_error::parse_failed);
+}
+
 void COFFDumper::printSections() {
   ListScope SectionsD(W, "Sections");
   int SectionNumber = 0;
@@ -628,8 +739,7 @@
     if (opts::SectionSymbols) {
       ListScope D(W, "Symbols");
       for (const SymbolRef &Symbol : Obj->symbols()) {
-        bool Contained = false;
-        if (Sec.containsSymbol(Symbol, Contained) || !Contained)
+        if (!Sec.containsSymbol(Symbol))
           continue;
 
         printSymbol(Symbol);
@@ -639,7 +749,8 @@
     if (Name == ".debug$S" && opts::CodeViewLineTables)
       printCodeViewLineTables(Sec);
 
-    if (opts::SectionData) {
+    if (opts::SectionData &&
+        !(Section->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)) {
       StringRef Data;
       if (error(Sec.getContents(Data)))
         break;
@@ -683,7 +794,6 @@
   uint64_t RelocType;
   SmallString<32> RelocName;
   StringRef SymbolName;
-  StringRef Contents;
   if (error(Reloc.getOffset(Offset)))
     return;
   if (error(Reloc.getType(RelocType)))
@@ -691,21 +801,19 @@
   if (error(Reloc.getTypeName(RelocName)))
     return;
   symbol_iterator Symbol = Reloc.getSymbol();
-  if (error(Symbol->getName(SymbolName)))
-    return;
-  if (error(Section.getContents(Contents)))
+  if (Symbol != Obj->symbol_end() && error(Symbol->getName(SymbolName)))
     return;
 
   if (opts::ExpandRelocs) {
     DictScope Group(W, "Relocation");
     W.printHex("Offset", Offset);
     W.printNumber("Type", RelocName, RelocType);
-    W.printString("Symbol", SymbolName.size() > 0 ? SymbolName : "-");
+    W.printString("Symbol", SymbolName.empty() ? "-" : SymbolName);
   } else {
     raw_ostream& OS = W.startLine();
     OS << W.hex(Offset)
        << " " << RelocName
-       << " " << (SymbolName.size() > 0 ? SymbolName : "-")
+       << " " << (SymbolName.empty() ? "-" : SymbolName)
        << "\n";
   }
 }
@@ -719,12 +827,30 @@
 
 void COFFDumper::printDynamicSymbols() { ListScope Group(W, "DynamicSymbols"); }
 
+static ErrorOr<StringRef>
+getSectionName(const llvm::object::COFFObjectFile *Obj, int32_t SectionNumber,
+               const coff_section *Section) { // Name of Section if non-null, else a label chosen from SectionNumber.
+  if (Section) {
+    StringRef SectionName;
+    if (std::error_code EC = Obj->getSectionName(Section, SectionName))
+      return EC; // forward the lookup failure to the caller
+    return SectionName;
+  }
+  if (SectionNumber == llvm::COFF::IMAGE_SYM_DEBUG) // no section: spell out the three constants compared here
+    return StringRef("IMAGE_SYM_DEBUG");
+  if (SectionNumber == llvm::COFF::IMAGE_SYM_ABSOLUTE)
+    return StringRef("IMAGE_SYM_ABSOLUTE");
+  if (SectionNumber == llvm::COFF::IMAGE_SYM_UNDEFINED)
+    return StringRef("IMAGE_SYM_UNDEFINED");
+  return StringRef(""); // null Section and none of the three constants above
+}
+
 void COFFDumper::printSymbol(const SymbolRef &Sym) {
   DictScope D(W, "Symbol");
 
-  const coff_symbol *Symbol = Obj->getCOFFSymbol(Sym);
+  COFFSymbolRef Symbol = Obj->getCOFFSymbol(Sym);
   const coff_section *Section;
-  if (std::error_code EC = Obj->getSection(Symbol->SectionNumber, Section)) {
+  if (std::error_code EC = Obj->getSection(Symbol.getSectionNumber(), Section)) {
     W.startLine() << "Invalid section number: " << EC.message() << "\n";
     W.flush();
     return;
@@ -735,23 +861,25 @@
     SymbolName = "";
 
   StringRef SectionName = "";
-  if (Section)
-    Obj->getSectionName(Section, SectionName);
+  ErrorOr<StringRef> Res =
+      getSectionName(Obj, Symbol.getSectionNumber(), Section);
+  if (Res)
+    SectionName = *Res;
 
   W.printString("Name", SymbolName);
-  W.printNumber("Value", Symbol->Value);
-  W.printNumber("Section", SectionName, Symbol->SectionNumber);
-  W.printEnum  ("BaseType", Symbol->getBaseType(), makeArrayRef(ImageSymType));
-  W.printEnum  ("ComplexType", Symbol->getComplexType(),
+  W.printNumber("Value", Symbol.getValue());
+  W.printNumber("Section", SectionName, Symbol.getSectionNumber());
+  W.printEnum  ("BaseType", Symbol.getBaseType(), makeArrayRef(ImageSymType));
+  W.printEnum  ("ComplexType", Symbol.getComplexType(),
                                                    makeArrayRef(ImageSymDType));
-  W.printEnum  ("StorageClass", Symbol->StorageClass,
+  W.printEnum  ("StorageClass", Symbol.getStorageClass(),
                                                    makeArrayRef(ImageSymClass));
-  W.printNumber("AuxSymbolCount", Symbol->NumberOfAuxSymbols);
+  W.printNumber("AuxSymbolCount", Symbol.getNumberOfAuxSymbols());
 
-  for (unsigned I = 0; I < Symbol->NumberOfAuxSymbols; ++I) {
-    if (Symbol->isFunctionDefinition()) {
+  for (uint8_t I = 0; I < Symbol.getNumberOfAuxSymbols(); ++I) {
+    if (Symbol.isFunctionDefinition()) {
       const coff_aux_function_definition *Aux;
-      if (error(getSymbolAuxData(Obj, Symbol + I, Aux)))
+      if (error(getSymbolAuxData(Obj, Symbol, I, Aux)))
         break;
 
       DictScope AS(W, "AuxFunctionDef");
@@ -759,18 +887,16 @@
       W.printNumber("TotalSize", Aux->TotalSize);
       W.printHex("PointerToLineNumber", Aux->PointerToLinenumber);
       W.printHex("PointerToNextFunction", Aux->PointerToNextFunction);
-      W.printBinary("Unused", makeArrayRef(Aux->Unused));
 
-    } else if (Symbol->isWeakExternal()) {
+    } else if (Symbol.isAnyUndefined()) {
       const coff_aux_weak_external *Aux;
-      if (error(getSymbolAuxData(Obj, Symbol + I, Aux)))
+      if (error(getSymbolAuxData(Obj, Symbol, I, Aux)))
         break;
 
-      const coff_symbol *Linked;
+      ErrorOr<COFFSymbolRef> Linked = Obj->getSymbol(Aux->TagIndex);
       StringRef LinkedName;
-      std::error_code EC;
-      if ((EC = Obj->getSymbol(Aux->TagIndex, Linked)) ||
-          (EC = Obj->getSymbolName(Linked, LinkedName))) {
+      std::error_code EC = Linked.getError();
+      if (EC || (EC = Obj->getSymbolName(*Linked, LinkedName))) {
         LinkedName = "";
         error(EC);
       }
@@ -779,56 +905,60 @@
       W.printNumber("Linked", LinkedName, Aux->TagIndex);
       W.printEnum  ("Search", Aux->Characteristics,
                     makeArrayRef(WeakExternalCharacteristics));
-      W.printBinary("Unused", makeArrayRef(Aux->Unused));
 
-    } else if (Symbol->isFileRecord()) {
-      const coff_aux_file *Aux;
-      if (error(getSymbolAuxData(Obj, Symbol + I, Aux)))
+    } else if (Symbol.isFileRecord()) {
+      const char *FileName;
+      if (error(getSymbolAuxData(Obj, Symbol, I, FileName)))
         break;
 
       DictScope AS(W, "AuxFileRecord");
 
-      StringRef Name(Aux->FileName,
-                     Symbol->NumberOfAuxSymbols * COFF::SymbolSize);
+      StringRef Name(FileName, Symbol.getNumberOfAuxSymbols() *
+                                   Obj->getSymbolTableEntrySize());
       W.printString("FileName", Name.rtrim(StringRef("\0", 1)));
       break;
-    } else if (Symbol->isSectionDefinition()) {
+    } else if (Symbol.isSectionDefinition()) {
       const coff_aux_section_definition *Aux;
-      if (error(getSymbolAuxData(Obj, Symbol + I, Aux)))
+      if (error(getSymbolAuxData(Obj, Symbol, I, Aux)))
         break;
 
+      int32_t AuxNumber = Aux->getNumber(Symbol.isBigObj());
+
       DictScope AS(W, "AuxSectionDef");
       W.printNumber("Length", Aux->Length);
       W.printNumber("RelocationCount", Aux->NumberOfRelocations);
       W.printNumber("LineNumberCount", Aux->NumberOfLinenumbers);
       W.printHex("Checksum", Aux->CheckSum);
-      W.printNumber("Number", Aux->Number);
+      W.printNumber("Number", AuxNumber);
       W.printEnum("Selection", Aux->Selection, makeArrayRef(ImageCOMDATSelect));
-      W.printBinary("Unused", makeArrayRef(Aux->Unused));
 
       if (Section && Section->Characteristics & COFF::IMAGE_SCN_LNK_COMDAT
           && Aux->Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
         const coff_section *Assoc;
-        StringRef AssocName;
-        std::error_code EC;
-        if ((EC = Obj->getSection(Aux->Number, Assoc)) ||
-            (EC = Obj->getSectionName(Assoc, AssocName))) {
+        StringRef AssocName = "";
+        std::error_code EC = Obj->getSection(AuxNumber, Assoc);
+        ErrorOr<StringRef> Res = getSectionName(Obj, AuxNumber, Assoc);
+        if (Res)
+          AssocName = *Res;
+        if (!EC)
+          EC = Res.getError();
+        if (EC) {
           AssocName = "";
           error(EC);
         }
 
-        W.printNumber("AssocSection", AssocName, Aux->Number);
+        W.printNumber("AssocSection", AssocName, AuxNumber);
       }
-    } else if (Symbol->isCLRToken()) {
+    } else if (Symbol.isCLRToken()) {
       const coff_aux_clr_token *Aux;
-      if (error(getSymbolAuxData(Obj, Symbol + I, Aux)))
+      if (error(getSymbolAuxData(Obj, Symbol, I, Aux)))
         break;
 
-      const coff_symbol *ReferredSym;
+      ErrorOr<COFFSymbolRef> ReferredSym =
+          Obj->getSymbol(Aux->SymbolTableIndex);
       StringRef ReferredName;
-      std::error_code EC;
-      if ((EC = Obj->getSymbol(Aux->SymbolTableIndex, ReferredSym)) ||
-          (EC = Obj->getSymbolName(ReferredSym, ReferredName))) {
+      std::error_code EC = ReferredSym.getError();
+      if (EC || (EC = Obj->getSymbolName(*ReferredSym, ReferredName))) {
         ReferredName = "";
         error(EC);
       }
@@ -837,7 +967,6 @@
       W.printNumber("AuxType", Aux->AuxType);
       W.printNumber("Reserved", Aux->Reserved);
       W.printNumber("SymbolTableIndex", ReferredName, Aux->SymbolTableIndex);
-      W.printBinary("Unused", makeArrayRef(Aux->Unused));
 
     } else {
       W.startLine() << "<unhandled auxiliary record>\n";
@@ -846,12 +975,8 @@
 }
 
 void COFFDumper::printUnwindInfo() {
-  const coff_file_header *Header;
-  if (error(Obj->getCOFFHeader(Header)))
-    return;
-
   ListScope D(W, "UnwindInformation");
-  switch (Header->Machine) {
+  switch (Obj->getMachine()) {
   case COFF::IMAGE_FILE_MACHINE_AMD64: {
     Win64EH::Dumper Dumper(W);
     Win64EH::Dumper::SymbolResolver
@@ -870,9 +995,113 @@
     break;
   }
   default:
-    W.printEnum("unsupported Image Machine", Header->Machine,
+    W.printEnum("unsupported Image Machine", Obj->getMachine(),
                 makeArrayRef(ImageFileMachineType));
     break;
   }
 }
 
+void COFFDumper::printImportedSymbols(
+    iterator_range<imported_symbol_iterator> Range) { // Print one Symbol line (name plus ordinal) per entry of Range.
+  for (const ImportedSymbolRef &I : Range) {
+    StringRef Sym;
+    if (error(I.getSymbolName(Sym))) return; // a flagged lookup ends the loop and the function
+    uint16_t Ordinal;
+    if (error(I.getOrdinal(Ordinal))) return;
+    W.printNumber("Symbol", Sym, Ordinal);
+  }
+}
+
+void COFFDumper::printDelayImportedSymbols(
+    const DelayImportDirectoryEntryRef &I,
+    iterator_range<imported_symbol_iterator> Range) { // Print an Import scope per entry of Range: symbol, ordinal, address.
+  int Index = 0; // position of the current symbol, handed to I.getImportAddress
+  for (const ImportedSymbolRef &S : Range) {
+    DictScope Import(W, "Import");
+    StringRef Sym;
+    if (error(S.getSymbolName(Sym))) return; // a flagged lookup ends the loop and the function
+    uint16_t Ordinal;
+    if (error(S.getOrdinal(Ordinal))) return;
+    W.printNumber("Symbol", Sym, Ordinal);
+    uint64_t Addr;
+    if (error(I.getImportAddress(Index++, Addr))) return; // Index advances once per symbol reaching this line
+    W.printHex("Address", Addr);
+  }
+}
+
+void COFFDumper::printCOFFImports() { // Dump every import directory, then every delay-import directory.
+  // Regular imports
+  for (const ImportDirectoryEntryRef &I : Obj->import_directories()) {
+    DictScope Import(W, "Import");
+    StringRef Name;
+    if (error(I.getName(Name))) return; // a flagged lookup abandons the whole dump, delay imports included
+    W.printString("Name", Name);
+    uint32_t Addr; // reused for both RVAs printed below
+    if (error(I.getImportLookupTableRVA(Addr))) return;
+    W.printHex("ImportLookupTableRVA", Addr);
+    if (error(I.getImportAddressTableRVA(Addr))) return;
+    W.printHex("ImportAddressTableRVA", Addr);
+    printImportedSymbols(I.imported_symbols());
+  }
+
+  // Delay imports
+  for (const DelayImportDirectoryEntryRef &I : Obj->delay_import_directories()) {
+    DictScope Import(W, "DelayImport");
+    StringRef Name;
+    if (error(I.getName(Name))) return;
+    W.printString("Name", Name);
+    const delay_import_directory_table_entry *Table;
+    if (error(I.getDelayImportTable(Table))) return; // Table is only dereferenced after this check passes
+    W.printHex("Attributes", Table->Attributes);
+    W.printHex("ModuleHandle", Table->ModuleHandle);
+    W.printHex("ImportAddressTable", Table->DelayImportAddressTable);
+    W.printHex("ImportNameTable", Table->DelayImportNameTable);
+    W.printHex("BoundDelayImportTable", Table->BoundDelayImportTable);
+    W.printHex("UnloadDelayImportTable", Table->UnloadDelayImportTable);
+    printDelayImportedSymbols(I, I.imported_symbols());
+  }
+}
+
+void COFFDumper::printCOFFDirectives() { // Print the contents of each section whose name is .drectve.
+  for (const SectionRef &Section : Obj->sections()) {
+    StringRef Contents;
+    StringRef Name;
+
+    if (error(Section.getName(Name)))
+      continue; // a section whose name lookup is flagged is skipped
+    if (Name != ".drectve")
+      continue; // every other section is ignored
+
+    if (error(Section.getContents(Contents)))
+      return; // unlike a name failure, a contents failure ends the scan
+
+    W.printString("Directive(s)", Contents);
+  }
+}
+
+static std::string getBaseRelocTypeName(uint8_t Type) { // Label for a base relocation type; by value because the fallback is built at run time.
+  switch (Type) {
+  case COFF::IMAGE_REL_BASED_ABSOLUTE: return "ABSOLUTE";
+  case COFF::IMAGE_REL_BASED_HIGH: return "HIGH";
+  case COFF::IMAGE_REL_BASED_LOW: return "LOW";
+  case COFF::IMAGE_REL_BASED_HIGHLOW: return "HIGHLOW";
+  case COFF::IMAGE_REL_BASED_HIGHADJ: return "HIGHADJ";
+  case COFF::IMAGE_REL_BASED_DIR64: return "DIR64";
+  default: return "unknown (" + llvm::utostr(Type) + ")"; // a temporary std::string: returning StringRef here would dangle
+  }
+}
+
+void COFFDumper::printCOFFBaseReloc() { // Print an Entry (type and address) for each base relocation of Obj.
+  ListScope D(W, "BaseReloc");
+  for (const BaseRelocRef &I : Obj->base_relocs()) {
+    uint8_t Type;
+    uint32_t RVA;
+    if (error(I.getRVA(RVA)))
+      continue; // an entry whose lookup is flagged is skipped; the loop carries on
+    if (error(I.getType(Type)))
+      continue;
+    DictScope Import(W, "Entry"); // opened only after both lookups succeeded
+    W.printString("Type", getBaseRelocTypeName(Type));
+    W.printHex("Address", RVA);
+  }
+}

diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 5df51e2..d68c786 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp

@@ -407,6 +407,7 @@
     switch (Type) {
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_MIPS_REGINFO);
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_MIPS_OPTIONS);
+    LLVM_READOBJ_ENUM_CASE(ELF, SHT_MIPS_ABIFLAGS);
     }
   }
 
@@ -603,7 +604,7 @@
       }
     }
 
-    if (opts::SectionData) {
+    if (opts::SectionData && Section->sh_type != ELF::SHT_NOBITS) {
       ArrayRef<uint8_t> Data = errorOrDefault(Obj->getSectionContents(Section));
       W.printBinaryBlock("SectionData",
                          StringRef((const char *)Data.data(), Data.size()));
@@ -675,7 +676,8 @@
     DictScope Group(W, "Relocation");
     W.printHex("Offset", Rel.r_offset);
     W.printNumber("Type", RelocName, (int)Rel.getType(Obj->isMips64EL()));
-    W.printString("Symbol", SymbolName.size() > 0 ? SymbolName : "-");
+    W.printNumber("Symbol", SymbolName.size() > 0 ? SymbolName : "-",
+                  Rel.getSymbol(Obj->isMips64EL()));
     W.printHex("Addend", Rel.r_addend);
   } else {
     raw_ostream& OS = W.startLine();

diff --git a/tools/llvm-readobj/Error.cpp b/tools/llvm-readobj/Error.cpp
index a078f5c..7e6f780 100644
--- a/tools/llvm-readobj/Error.cpp
+++ b/tools/llvm-readobj/Error.cpp

@@ -24,7 +24,7 @@
 };
 } // namespace
 
-const char *_readobj_error_category::name() const {
+const char *_readobj_error_category::name() const LLVM_NOEXCEPT {
   return "llvm.readobj";
 }
 

diff --git a/tools/llvm-readobj/Error.h b/tools/llvm-readobj/Error.h
index 81ce408..f3e24bb 100644
--- a/tools/llvm-readobj/Error.h
+++ b/tools/llvm-readobj/Error.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_READOBJ_ERROR_H
-#define LLVM_READOBJ_ERROR_H
+#ifndef LLVM_TOOLS_LLVM_READOBJ_ERROR_H
+#define LLVM_TOOLS_LLVM_READOBJ_ERROR_H
 
 #include <system_error>
 

diff --git a/tools/llvm-readobj/MachODumper.cpp b/tools/llvm-readobj/MachODumper.cpp
index a5e5cf8..7e8fdad 100644
--- a/tools/llvm-readobj/MachODumper.cpp
+++ b/tools/llvm-readobj/MachODumper.cpp

@@ -31,14 +31,17 @@
     : ObjDumper(Writer)
     , Obj(Obj) { }
 
-  virtual void printFileHeaders() override;
-  virtual void printSections() override;
-  virtual void printRelocations() override;
-  virtual void printSymbols() override;
-  virtual void printDynamicSymbols() override;
-  virtual void printUnwindInfo() override;
+  void printFileHeaders() override;
+  void printSections() override;
+  void printRelocations() override;
+  void printSymbols() override;
+  void printDynamicSymbols() override;
+  void printUnwindInfo() override;
 
 private:
+  template<class MachHeader>
+  void printFileHeaders(const MachHeader &Header);
+
   void printSymbol(const SymbolRef &Symbol);
 
   void printRelocation(const RelocationRef &Reloc);
@@ -68,6 +71,137 @@
 
 } // namespace llvm
 
+// Display names for the `magic` field of a Mach-O header. Passed to
+// printEnum("Magic", ...) by MachODumper::printFileHeaders.
+static const EnumEntry<uint32_t> MachOMagics[] = {
+  { "Magic",      MachO::MH_MAGIC    },
+  { "Cigam",      MachO::MH_CIGAM    },
+  { "Magic64",    MachO::MH_MAGIC_64 },
+  { "Cigam64",    MachO::MH_CIGAM_64 },
+  { "FatMagic",   MachO::FAT_MAGIC   },
+  { "FatCigam",   MachO::FAT_CIGAM   },
+};
+
+// Display names for the `filetype` field of a Mach-O header. Passed to
+// printEnum("FileType", ...) by MachODumper::printFileHeaders.
+static const EnumEntry<uint32_t> MachOHeaderFileTypes[] = {
+  { "Relocatable",          MachO::MH_OBJECT      },
+  { "Executable",           MachO::MH_EXECUTE     },
+  { "FixedVMLibrary",       MachO::MH_FVMLIB      },
+  { "Core",                 MachO::MH_CORE        },
+  { "PreloadedExecutable",  MachO::MH_PRELOAD     },
+  { "DynamicLibrary",       MachO::MH_DYLIB       },
+  { "DynamicLinker",        MachO::MH_DYLINKER    },
+  { "Bundle",               MachO::MH_BUNDLE      },
+  { "DynamicLibraryStub",   MachO::MH_DYLIB_STUB  },
+  { "DWARFSymbol",          MachO::MH_DSYM        },
+  { "KextBundle",           MachO::MH_KEXT_BUNDLE },
+};
+
+// Display names for the `cputype` field of a Mach-O header. Passed to
+// printEnum("CpuType", ...) by MachODumper::printFileHeaders.
+// NOTE(review): CPU_TYPE_ANY is the only entry that needs an explicit cast to
+// uint32_t, presumably because its value does not fit the unsigned type
+// without one -- confirm against the MachO definitions.
+static const EnumEntry<uint32_t> MachOHeaderCpuTypes[] = {
+  { "Any"       , static_cast<uint32_t>(MachO::CPU_TYPE_ANY) },
+  { "X86"       , MachO::CPU_TYPE_X86       },
+  { "X86-64"    , MachO::CPU_TYPE_X86_64    },
+  { "Mc98000"   , MachO::CPU_TYPE_MC98000   },
+  { "Arm"       , MachO::CPU_TYPE_ARM       },
+  { "Arm64"     , MachO::CPU_TYPE_ARM64     },
+  { "Sparc"     , MachO::CPU_TYPE_SPARC     },
+  { "PowerPC"   , MachO::CPU_TYPE_POWERPC   },
+  { "PowerPC64" , MachO::CPU_TYPE_POWERPC64 },
+};
+
+// CPU subtype names used by MachODumper::printFileHeaders when the header's
+// CPU type is MachO::CPU_TYPE_X86. The lookup value is `cpusubtype` with the
+// MachO::CPU_SUBTYPE_MASK bits cleared.
+static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesX86[] = {
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_I386_ALL),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_386),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_486),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_486SX),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_586),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTPRO),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTII_M3),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTII_M5),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_CELERON),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_CELERON_MOBILE),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_3),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_3_M),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_3_XEON),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_M),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_4),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_PENTIUM_4_M),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ITANIUM),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ITANIUM_2),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_XEON),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_XEON_MP),
+};
+
+// CPU subtype names used by MachODumper::printFileHeaders when the header's
+// CPU type is MachO::CPU_TYPE_X86_64.
+static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesX64[] = {
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_X86_64_ALL),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_X86_ARCH1),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_X86_64_H),
+};
+
+// CPU subtype names used by MachODumper::printFileHeaders when the header's
+// CPU type is MachO::CPU_TYPE_ARM.
+static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesARM[] = {
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_ALL),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V4T),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V6),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V5),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V5TEJ),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_XSCALE),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7S),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7K),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V6M),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7M),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM_V7EM),
+};
+
+// CPU subtype names used by MachODumper::printFileHeaders when the header's
+// CPU type is MachO::CPU_TYPE_ARM64.
+static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesARM64[] = {
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_ARM64_ALL),
+};
+
+// CPU subtype names used by MachODumper::printFileHeaders when the header's
+// CPU type is MachO::CPU_TYPE_SPARC.
+static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesSPARC[] = {
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_SPARC_ALL),
+};
+
+// CPU subtype names used by MachODumper::printFileHeaders when the header's
+// CPU type is MachO::CPU_TYPE_POWERPC. CPU_TYPE_POWERPC64 does not use this
+// table; its subtype is printed as a plain hex value.
+static const EnumEntry<uint32_t> MachOHeaderCpuSubtypesPPC[] = {
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_ALL),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_601),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_602),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_603),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_603e),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_603ev),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_604),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_604e),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_620),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_750),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_7400),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_7450),
+  LLVM_READOBJ_ENUM_ENT(MachO, CPU_SUBTYPE_POWERPC_970),
+};
+
+// Names for the individual bits of the Mach-O header `flags` field. Passed to
+// printFlags("Flags", ...) by MachODumper::printFileHeaders.
+static const EnumEntry<uint32_t> MachOHeaderFlags[] = {
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_NOUNDEFS),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_INCRLINK),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_DYLDLINK),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_BINDATLOAD),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_PREBOUND),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_SPLIT_SEGS),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_LAZY_INIT),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_TWOLEVEL),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_FORCE_FLAT),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_NOMULTIDEFS),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_NOFIXPREBINDING),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_PREBINDABLE),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_ALLMODSBOUND),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_SUBSECTIONS_VIA_SYMBOLS),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_CANONICAL),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_WEAK_DEFINES),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_BINDS_TO_WEAK),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_ALLOW_STACK_EXECUTION),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_ROOT_SAFE),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_SETUID_SAFE),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_NO_REEXPORTED_DYLIBS),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_PIE),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_DEAD_STRIPPABLE_DYLIB),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_HAS_TLV_DESCRIPTORS),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_NO_HEAP_EXECUTION),
+  LLVM_READOBJ_ENUM_ENT(MachO, MH_APP_EXTENSION_SAFE),
+};
 
 static const EnumEntry<unsigned> MachOSectionTypes[] = {
   { "Regular"                        , 0x00 },
@@ -205,7 +339,47 @@
 }
 
+// Prints the Mach-O header inside a "MachHeader" dictionary. The templated
+// overload prints the header fields; for a 64-bit object the `reserved` field
+// of the 64-bit header is printed afterwards.
 void MachODumper::printFileHeaders() {
-  W.startLine() << "FileHeaders not implemented.\n";
+  DictScope H(W, "MachHeader");
+  if (!Obj->is64Bit()) {
+    printFileHeaders(Obj->getHeader());
+  } else {
+    printFileHeaders(Obj->getHeader64());
+    W.printHex("Reserved", Obj->getHeader64().reserved);
+  }
+}
+
+// Prints the fields of a Mach-O header: magic, CPU type and subtype, file
+// type, load-command count and size, and flags. MachHeader is the type
+// returned by Obj->getHeader() or Obj->getHeader64().
+template<class MachHeader>
+void MachODumper::printFileHeaders(const MachHeader &Header) {
+  W.printEnum("Magic", Header.magic, makeArrayRef(MachOMagics));
+  W.printEnum("CpuType", Header.cputype, makeArrayRef(MachOHeaderCpuTypes));
+
+  // Select the subtype name table that belongs to the CPU type.
+  // HasSubtypeNames is cleared for CPU types without a table
+  // (CPU_TYPE_POWERPC64 and anything not listed).
+  uint32_t Subtype = Header.cpusubtype & ~MachO::CPU_SUBTYPE_MASK;
+  ArrayRef<EnumEntry<uint32_t>> SubtypeNames;
+  bool HasSubtypeNames = true;
+  switch (Header.cputype) {
+  case MachO::CPU_TYPE_X86:
+    SubtypeNames = makeArrayRef(MachOHeaderCpuSubtypesX86);
+    break;
+  case MachO::CPU_TYPE_X86_64:
+    SubtypeNames = makeArrayRef(MachOHeaderCpuSubtypesX64);
+    break;
+  case MachO::CPU_TYPE_ARM:
+    SubtypeNames = makeArrayRef(MachOHeaderCpuSubtypesARM);
+    break;
+  case MachO::CPU_TYPE_POWERPC:
+    SubtypeNames = makeArrayRef(MachOHeaderCpuSubtypesPPC);
+    break;
+  case MachO::CPU_TYPE_SPARC:
+    SubtypeNames = makeArrayRef(MachOHeaderCpuSubtypesSPARC);
+    break;
+  case MachO::CPU_TYPE_ARM64:
+    SubtypeNames = makeArrayRef(MachOHeaderCpuSubtypesARM64);
+    break;
+  default:
+    HasSubtypeNames = false;
+    break;
+  }
+
+  // The hex fallback label is spelled "CpuSubtype" (lower-case 't'), unlike
+  // "CpuSubType" for the named case. Both spellings are part of the tool's
+  // output and are reproduced as they are.
+  if (HasSubtypeNames)
+    W.printEnum("CpuSubType", Subtype, SubtypeNames);
+  else
+    W.printHex("CpuSubtype", Subtype);
+
+  W.printEnum("FileType", Header.filetype, makeArrayRef(MachOHeaderFileTypes));
+  W.printNumber("NumOfLoadCommands", Header.ncmds);
+  W.printNumber("SizeOfLoadCommands", Header.sizeofcmds);
+  W.printFlags("Flags", Header.flags, makeArrayRef(MachOHeaderFlags));
 }
 
 void MachODumper::printSections() {
@@ -257,8 +431,7 @@
     if (opts::SectionSymbols) {
       ListScope D(W, "Symbols");
       for (const SymbolRef &Symbol : Obj->symbols()) {
-        bool Contained = false;
-        if (Section.containsSymbol(Symbol, Contained) || !Contained)
+        if (!Section.containsSymbol(Symbol))
           continue;
 
         printSymbol(Symbol);
@@ -266,11 +439,14 @@
     }
 
     if (opts::SectionData) {
-      StringRef Data;
-      if (error(Section.getContents(Data)))
-        break;
+      bool IsBSS = Section.isBSS();
+      if (!IsBSS) {
+        StringRef Data;
+        if (error(Section.getContents(Data)))
+          break;
 
-      W.printBinaryBlock("SectionData", Data);
+        W.printBinaryBlock("SectionData", Data);
+      }
     }
   }
 }

diff --git a/tools/llvm-readobj/ObjDumper.h b/tools/llvm-readobj/ObjDumper.h
index f80a28b..a34e091 100644
--- a/tools/llvm-readobj/ObjDumper.h
+++ b/tools/llvm-readobj/ObjDumper.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_READOBJ_OBJDUMPER_H
-#define LLVM_READOBJ_OBJDUMPER_H
+#ifndef LLVM_TOOLS_LLVM_READOBJ_OBJDUMPER_H
+#define LLVM_TOOLS_LLVM_READOBJ_OBJDUMPER_H
 
 #include <memory>
 #include <system_error>
@@ -43,6 +43,11 @@
   // Only implemented for MIPS ELF at this time.
   virtual void printMipsPLTGOT() { }
 
+  // Only implemented for PE/COFF.
+  virtual void printCOFFImports() { }
+  virtual void printCOFFDirectives() { }
+  virtual void printCOFFBaseReloc() { }
+
 protected:
   StreamWriter& W;
 };

diff --git a/tools/llvm-readobj/StreamWriter.h b/tools/llvm-readobj/StreamWriter.h
index 04b38fb..2fc53ee 100644
--- a/tools/llvm-readobj/StreamWriter.h
+++ b/tools/llvm-readobj/StreamWriter.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_READOBJ_STREAMWRITER_H
-#define LLVM_READOBJ_STREAMWRITER_H
+#ifndef LLVM_TOOLS_LLVM_READOBJ_STREAMWRITER_H
+#define LLVM_TOOLS_LLVM_READOBJ_STREAMWRITER_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
@@ -214,8 +214,8 @@
   }
 
   void printBinary(StringRef Label, StringRef Str, ArrayRef<char> Value) {
-    ArrayRef<uint8_t> V(reinterpret_cast<const uint8_t*>(Value.data()),
-                        Value.size());
+    auto V = makeArrayRef(reinterpret_cast<const uint8_t*>(Value.data()),
+                          Value.size());
     printBinaryImpl(Label, Str, V, false);
   }
 
@@ -224,20 +224,20 @@
   }
 
   void printBinary(StringRef Label, ArrayRef<char> Value) {
-    ArrayRef<uint8_t> V(reinterpret_cast<const uint8_t*>(Value.data()),
-                        Value.size());
+    auto V = makeArrayRef(reinterpret_cast<const uint8_t*>(Value.data()),
+                          Value.size());
     printBinaryImpl(Label, StringRef(), V, false);
   }
 
   void printBinary(StringRef Label, StringRef Value) {
-    ArrayRef<uint8_t> V(reinterpret_cast<const uint8_t*>(Value.data()),
-                        Value.size());
+    auto V = makeArrayRef(reinterpret_cast<const uint8_t*>(Value.data()),
+                          Value.size());
     printBinaryImpl(Label, StringRef(), V, false);
   }
 
   void printBinaryBlock(StringRef Label, StringRef Value) {
-    ArrayRef<uint8_t> V(reinterpret_cast<const uint8_t*>(Value.data()),
-                        Value.size());
+    auto V = makeArrayRef(reinterpret_cast<const uint8_t*>(Value.data()),
+                          Value.size());
     printBinaryImpl(Label, StringRef(), V, true);
   }
 

diff --git a/tools/llvm-readobj/Win64EHDumper.h b/tools/llvm-readobj/Win64EHDumper.h
index 9ce4d39..a80df9c 100644
--- a/tools/llvm-readobj/Win64EHDumper.h
+++ b/tools/llvm-readobj/Win64EHDumper.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_READOBJ_WIN64EHPRINTER_H
-#define LLVM_TOOLS_READOBJ_WIN64EHPRINTER_H
+#ifndef LLVM_TOOLS_LLVM_READOBJ_WIN64EHDUMPER_H
+#define LLVM_TOOLS_LLVM_READOBJ_WIN64EHDUMPER_H
 
 #include "StreamWriter.h"
 #include "llvm/Support/Win64EH.h"

diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index 8d2a997..d08f186 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp

@@ -24,6 +24,7 @@
 #include "ObjDumper.h"
 #include "StreamWriter.h"
 #include "llvm/Object/Archive.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -140,6 +141,20 @@
   cl::opt<bool>
   MipsPLTGOT("mips-plt-got",
              cl::desc("Display the MIPS GOT and PLT GOT sections"));
+
+  // -coff-imports
+  cl::opt<bool>
+  COFFImports("coff-imports", cl::desc("Display the PE/COFF import table"));
+
+  // -coff-directives
+  cl::opt<bool>
+  COFFDirectives("coff-directives",
+                 cl::desc("Display the PE/COFF .drectve section"));
+
+  // -coff-basereloc
+  cl::opt<bool>
+  COFFBaseRelocs("coff-basereloc",
+                 cl::desc("Display the PE/COFF .reloc section"));
 } // namespace opts
 
 static int ReturnValue = EXIT_SUCCESS;
@@ -158,8 +173,8 @@
 
 bool relocAddressLess(RelocationRef a, RelocationRef b) {
   uint64_t a_addr, b_addr;
-  if (error(a.getOffset(a_addr))) return false;
-  if (error(b.getOffset(b_addr))) return false;
+  if (error(a.getOffset(a_addr))) exit(ReturnValue);
+  if (error(b.getOffset(b_addr))) exit(ReturnValue);
   return a_addr < b_addr;
 }
 
@@ -210,6 +225,17 @@
   return readobj_error::unsupported_obj_file_format;
 }
 
+/// Returns the load name of an ELF object file.
+///
+/// Obj is tried against each of the four ELF object file classes in turn
+/// (32-bit LE, 64-bit LE, 32-bit BE, 64-bit BE). Passing anything else reaches
+/// llvm_unreachable, so callers must check Obj->isELF() first.
+static StringRef getLoadName(const ObjectFile *Obj) {
+  if (const auto *Elf32LE = dyn_cast<ELF32LEObjectFile>(Obj))
+    return Elf32LE->getLoadName();
+  if (const auto *Elf64LE = dyn_cast<ELF64LEObjectFile>(Obj))
+    return Elf64LE->getLoadName();
+  if (const auto *Elf32BE = dyn_cast<ELF32BEObjectFile>(Obj))
+    return Elf32BE->getLoadName();
+  if (const auto *Elf64BE = dyn_cast<ELF64BEObjectFile>(Obj))
+    return Elf64BE->getLoadName();
+  llvm_unreachable("Not ELF");
+}
 
 /// @brief Dumps the specified object file.
 static void dumpObject(const ObjectFile *Obj) {
@@ -228,7 +254,7 @@
          << "\n";
   outs() << "AddressSize: " << (8*Obj->getBytesInAddress()) << "bit\n";
   if (Obj->isELF())
-    outs() << "LoadName: " << Obj->getLoadName() << "\n";
+    outs() << "LoadName: " << getLoadName(Obj) << "\n";
 
   if (opts::FileHeaders)
     Dumper->printFileHeaders();
@@ -254,6 +280,12 @@
   if (isMipsArch(Obj->getArch()) && Obj->isELF())
     if (opts::MipsPLTGOT)
       Dumper->printMipsPLTGOT();
+  if (opts::COFFImports)
+    Dumper->printCOFFImports();
+  if (opts::COFFDirectives)
+    Dumper->printCOFFDirectives();
+  if (opts::COFFBaseRelocs)
+    Dumper->printCOFFBaseReloc();
 }
 
 
@@ -287,16 +319,16 @@
   }
 
   // Attempt to open the binary.
-  ErrorOr<Binary *> BinaryOrErr = createBinary(File);
+  ErrorOr<OwningBinary<Binary>> BinaryOrErr = createBinary(File);
   if (std::error_code EC = BinaryOrErr.getError()) {
     reportError(File, EC);
     return;
   }
-  std::unique_ptr<Binary> Binary(BinaryOrErr.get());
+  Binary &Binary = *BinaryOrErr.get().getBinary();
 
-  if (Archive *Arc = dyn_cast<Archive>(Binary.get()))
+  if (Archive *Arc = dyn_cast<Archive>(&Binary))
     dumpArchive(Arc);
-  else if (ObjectFile *Obj = dyn_cast<ObjectFile>(Binary.get()))
+  else if (ObjectFile *Obj = dyn_cast<ObjectFile>(&Binary))
     dumpObject(Obj);
   else
     reportError(File, readobj_error::unrecognized_file_format);

diff --git a/tools/llvm-readobj/llvm-readobj.h b/tools/llvm-readobj/llvm-readobj.h
index 0413948..1c33417 100644
--- a/tools/llvm-readobj/llvm-readobj.h
+++ b/tools/llvm-readobj/llvm-readobj.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_READ_OBJ_H
-#define LLVM_TOOLS_READ_OBJ_H
+#ifndef LLVM_TOOLS_LLVM_READOBJ_LLVM_READOBJ_H
+#define LLVM_TOOLS_LLVM_READOBJ_LLVM_READOBJ_H
 
 #include "llvm/Support/CommandLine.h"
 #include <string>

diff --git a/tools/llvm-rtdyld/Android.mk b/tools/llvm-rtdyld/Android.mk
index 6f902d3..4f4fb4c 100644
--- a/tools/llvm-rtdyld/Android.mk
+++ b/tools/llvm-rtdyld/Android.mk

@@ -39,6 +39,7 @@
   libLLVMX86Disassembler \
   libLLVMDebugInfo          \
   libLLVMExecutionEngine    \
+  libLLVMCodeGen \
   libLLVMObject             \
   libLLVMMC                 \
   libLLVMMCParser           \
@@ -46,6 +47,7 @@
   libLLVMBitReader          \
   libLLVMCore               \
   libLLVMSupport            \
+  libLLVMMCDisassembler \
 
 include $(CLEAR_VARS)
 

diff --git a/tools/llvm-rtdyld/LLVMBuild.txt b/tools/llvm-rtdyld/LLVMBuild.txt
index b36d13c..c4ed49b 100644
--- a/tools/llvm-rtdyld/LLVMBuild.txt
+++ b/tools/llvm-rtdyld/LLVMBuild.txt

@@ -19,4 +19,4 @@
 type = Tool
 name = llvm-rtdyld
 parent = Tools
-required_libraries = JIT MC Object RuntimeDyld Support all-targets
+required_libraries = MC Object RuntimeDyld Support all-targets

diff --git a/tools/llvm-rtdyld/Makefile b/tools/llvm-rtdyld/Makefile
index fabdd68..9de753e 100644
--- a/tools/llvm-rtdyld/Makefile
+++ b/tools/llvm-rtdyld/Makefile

@@ -9,7 +9,7 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-rtdyld
-LINK_COMPONENTS := all-targets support MC object RuntimeDyld JIT debuginfo
+LINK_COMPONENTS := all-targets support MC object RuntimeDyld MCJIT debuginfo
 
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS := 1

diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 45734f4..87d381e 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp

@@ -34,6 +34,7 @@
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
+#include <list>
 #include <system_error>
 
 using namespace llvm;
@@ -78,6 +79,31 @@
            cl::desc("File containing RuntimeDyld verifier checks."),
            cl::ZeroOrMore);
 
+// -target-addr-start: first address of the phony target address range used
+// by -verify. Defaults to 4096.
+static cl::opt<uint64_t>
+TargetAddrStart("target-addr-start",
+                cl::desc("For -verify only: start of phony target address "
+                         "range."),
+                cl::init(4096), // Start at "page 1" - no allocating at "null".
+                cl::Hidden);
+
+// -target-addr-end: last address of the phony target address range used by
+// -verify. Defaults to the largest 64-bit value; remapSections() lowers it
+// for 16- and 32-bit triples when the option was not given explicitly.
+static cl::opt<uint64_t>
+TargetAddrEnd("target-addr-end",
+              cl::desc("For -verify only: end of phony target address range."),
+              cl::init(~0ULL),
+              cl::Hidden);
+
+// -target-section-sep: gap left between consecutive sections in the phony
+// target address space. Defaults to 0.
+static cl::opt<uint64_t>
+TargetSectionSep("target-section-sep",
+                 cl::desc("For -verify only: Separation between sections in "
+                          "phony target address space."),
+                 cl::init(0),
+                 cl::Hidden);
+
+// -map-section: explicit section placements, each written as
+// '<file name>,<section name>=<addr>'. May be given zero or more times.
+static cl::list<std::string>
+SpecificSectionMappings("map-section",
+                        cl::desc("Map a section to a specific address."),
+                        cl::ZeroOrMore);
+
 /* *** */
 
 // A trivial memory manager that doesn't do anything fancy, just uses the
@@ -183,8 +209,8 @@
 
     std::unique_ptr<ObjectImage> LoadedObject;
     // Load the object file
-    LoadedObject.reset(
-        Dyld.loadObject(new ObjectBuffer(InputBuffer.get().release())));
+    LoadedObject = Dyld.loadObject(
+        llvm::make_unique<ObjectBuffer>(std::move(*InputBuffer)));
     if (!LoadedObject) {
       return Error(Dyld.getErrorString());
     }
@@ -193,7 +219,7 @@
     Dyld.resolveRelocations();
 
     std::unique_ptr<DIContext> Context(
-        DIContext::getDWARFContext(LoadedObject->getObjectFile()));
+        DIContext::getDWARFContext(*LoadedObject->getObjectFile()));
 
     // Use symbol info to iterate functions in the object.
     for (object::symbol_iterator I = LoadedObject->begin_symbols(),
@@ -244,8 +270,8 @@
       return Error("unable to read input: '" + EC.message() + "'");
     std::unique_ptr<ObjectImage> LoadedObject;
     // Load the object file
-    LoadedObject.reset(
-        Dyld.loadObject(new ObjectBuffer(InputBuffer.get().release())));
+    LoadedObject = Dyld.loadObject(
+        llvm::make_unique<ObjectBuffer>(std::move(*InputBuffer)));
     if (!LoadedObject) {
       return Error(Dyld.getErrorString());
     }
@@ -300,6 +326,134 @@
   return 0;
 }
 
+// Applies every -map-section option given on the command line.
+//
+// Each option has the form '<file name>,<section name>=<addr>'. The section's
+// current address is looked up through Checker, the section is remapped to
+// <addr> on the checker's RuntimeDyld instance, and the (old address, new
+// address) pair is recorded.
+//
+// Returns the map from each remapped section's old address to its new target
+// address. A malformed option, an unknown section or an unparsable address
+// prints a message to errs() and terminates the tool with exit code 1.
+std::map<void*, uint64_t>
+applySpecificSectionMappings(RuntimeDyldChecker &Checker) {
+
+  std::map<void*, uint64_t> SpecificMappings;
+
+  for (StringRef Mapping : SpecificSectionMappings) {
+
+    // Split at the first '='. Everything before it identifies the section.
+    size_t EqualsIdx = Mapping.find_first_of("=");
+    StringRef SectionIDStr = Mapping.substr(0, EqualsIdx);
+    // Search for the separator inside the section ID only: ComaIdx is used
+    // as an index into SectionIDStr, and a ',' that appears after the '='
+    // must not be mistaken for the file/section separator.
+    size_t ComaIdx = SectionIDStr.find_first_of(",");
+
+    // Without an '=' there is no address part at all (EqualsIdx + 1 below
+    // would wrap around to 0 and re-read the whole option as the address).
+    if (EqualsIdx == StringRef::npos || ComaIdx == StringRef::npos) {
+      errs() << "Invalid section specification '" << Mapping
+             << "'. Should be '<file name>,<section name>=<addr>'\n";
+      exit(1);
+    }
+
+    StringRef FileName = SectionIDStr.substr(0, ComaIdx);
+    StringRef SectionName = SectionIDStr.substr(ComaIdx + 1);
+
+    // getSectionAddr returns the address together with an error message; a
+    // non-empty message means the lookup failed.
+    uint64_t OldAddrInt;
+    std::string ErrorMsg;
+    std::tie(OldAddrInt, ErrorMsg) =
+      Checker.getSectionAddr(FileName, SectionName, true);
+
+    if (ErrorMsg != "") {
+      errs() << ErrorMsg;
+      exit(1);
+    }
+
+    void* OldAddr = reinterpret_cast<void*>(static_cast<uintptr_t>(OldAddrInt));
+
+    StringRef NewAddrStr = Mapping.substr(EqualsIdx + 1);
+    uint64_t NewAddr;
+
+    // Radix 0 lets getAsInteger pick the base from the string's prefix.
+    if (NewAddrStr.getAsInteger(0, NewAddr)) {
+      errs() << "Invalid section address in mapping: " << Mapping << "\n";
+      exit(1);
+    }
+
+    Checker.getRTDyld().mapSectionAddress(OldAddr, NewAddr);
+    SpecificMappings[OldAddr] = NewAddr;
+  }
+
+  return SpecificMappings;
+}
+
+// Scatter sections in all directions!
+// Remaps section addresses for -verify mode. The following command line options
+// can be used to customize the layout of the memory within the phony target's
+// address space:
+// -target-addr-start <s> -- Specify where the phony target address range starts.
+// -target-addr-end   <e> -- Specify where the phony target address range ends.
+// -target-section-sep <d> -- Specify how big a gap should be left between the
+//                            end of one section and the start of the next.
+//                            Defaults to zero. Set to something big
+//                            (e.g. 1 << 32) to stress-test stubs, GOTs, etc.
+//
+// TargetTriple selects the default end of the address range when
+// -target-addr-end was not given. MemMgr supplies the code and data sections
+// to place. Checker is used to apply the -map-section options and to reach
+// the RuntimeDyld instance on which the new addresses are set.
+void remapSections(const llvm::Triple &TargetTriple,
+                   const TrivialMemoryManager &MemMgr,
+                   RuntimeDyldChecker &Checker) {
+
+  // Set up a work list (section addr/size pairs).
+  typedef std::list<std::pair<void*, uint64_t>> WorklistT;
+  WorklistT Worklist;
+
+  for (const auto& CodeSection : MemMgr.FunctionMemory)
+    Worklist.push_back(std::make_pair(CodeSection.base(), CodeSection.size()));
+  for (const auto& DataSection : MemMgr.DataMemory)
+    Worklist.push_back(std::make_pair(DataSection.base(), DataSection.size()));
+
+  // Apply any section-specific mappings that were requested on the command
+  // line.
+  typedef std::map<void*, uint64_t> AppliedMappingsT;
+  AppliedMappingsT AppliedMappings = applySpecificSectionMappings(Checker);
+
+  // Keep an "already allocated" mapping of section target addresses to sizes.
+  // Sections whose address mappings aren't specified on the command line will
+  // be allocated around the explicitly mapped sections while maintaining the
+  // minimum separation.
+  std::map<uint64_t, uint64_t> AlreadyAllocated;
+
+  // Move the previously applied mappings into the already-allocated map.
+  // The iterator is advanced before a possible erase so that erasing the
+  // current element does not invalidate the loop position.
+  for (WorklistT::iterator I = Worklist.begin(), E = Worklist.end();
+       I != E;) {
+    WorklistT::iterator Tmp = I;
+    ++I;
+    AppliedMappingsT::iterator AI = AppliedMappings.find(Tmp->first);
+
+    if (AI != AppliedMappings.end()) {
+      AlreadyAllocated[AI->second] = Tmp->second;
+      Worklist.erase(Tmp);
+    }
+  }
+
+  // If the -target-addr-end option wasn't explicitly passed, then set it to a
+  // sensible default based on the target triple.
+  // NOTE(review): within this function TargetAddrEnd is only assigned; it is
+  // never compared against the addresses handed out below. Confirm whether
+  // the end of the range is enforced elsewhere.
+  if (TargetAddrEnd.getNumOccurrences() == 0) {
+    if (TargetTriple.isArch16Bit())
+      TargetAddrEnd = (1ULL << 16) - 1;
+    else if (TargetTriple.isArch32Bit())
+      TargetAddrEnd = (1ULL << 32) - 1;
+    // TargetAddrEnd already has a sensible default for 64-bit systems, so
+    // there's nothing to do in the 64-bit case.
+  }
+
+  // Process any elements remaining in the worklist.
+  while (!Worklist.empty()) {
+    std::pair<void*, uint64_t> CurEntry = Worklist.front();
+    Worklist.pop_front();
+
+    uint64_t NextSectionAddr = TargetAddrStart;
+
+    // Walk the existing allocations in ascending address order. Stop at the
+    // first one that leaves room for this section plus the separation in
+    // front of it; otherwise move the candidate address past that allocation.
+    // NOTE(review): the else branch assigns unconditionally, so an allocation
+    // that ends below the current candidate (e.g. an explicit mapping below
+    // -target-addr-start) moves the candidate to a lower address. Confirm
+    // this is intended.
+    for (const auto &Alloc : AlreadyAllocated)
+      if (NextSectionAddr + CurEntry.second + TargetSectionSep <= Alloc.first)
+        break;
+      else
+        NextSectionAddr = Alloc.first + Alloc.second + TargetSectionSep;
+
+    AlreadyAllocated[NextSectionAddr] = CurEntry.second;
+    Checker.getRTDyld().mapSectionAddress(CurEntry.first, NextSectionAddr);
+  }
+
+}
+
+// Load and link the objects specified on the command line, but do not execute
+// anything. Instead, attach a RuntimeDyldChecker instance and call it to
+// verify the correctness of the linked memory.
 static int linkAndVerify() {
 
   // Check for missing triple.
@@ -347,6 +501,9 @@
   // Instantiate a dynamic linker.
   TrivialMemoryManager MemMgr;
   RuntimeDyld Dyld(&MemMgr);
+  Dyld.setProcessAllSections(true);
+  RuntimeDyldChecker Checker(Dyld, Disassembler.get(), InstPrinter.get(),
+                             llvm::dbgs());
 
   // If we don't have any input files, read from stdin.
   if (!InputFileList.size())
@@ -360,19 +517,30 @@
 
     std::unique_ptr<ObjectImage> LoadedObject;
     // Load the object file
-    LoadedObject.reset(
-        Dyld.loadObject(new ObjectBuffer(InputBuffer.get().release())));
+    LoadedObject = Dyld.loadObject(
+        llvm::make_unique<ObjectBuffer>(std::move(*InputBuffer)));
     if (!LoadedObject) {
       return Error(Dyld.getErrorString());
     }
   }
 
+  // Re-map the section addresses into the phony target address space.
+  remapSections(TheTriple, MemMgr, Checker);
+
   // Resolve all the relocations we can.
   Dyld.resolveRelocations();
 
-  RuntimeDyldChecker Checker(Dyld, Disassembler.get(), InstPrinter.get(),
-                             llvm::dbgs());
-  return checkAllExpressions(Checker);
+  // Register EH frames.
+  Dyld.registerEHFrames();
+
+  int ErrorCode = checkAllExpressions(Checker);
+  if (Dyld.hasError()) {
+    errs() << "RTDyld reported an error applying relocations:\n  "
+           << Dyld.getErrorString() << "\n";
+    ErrorCode = 1;
+  }
+
+  return ErrorCode;
 }
 
 int main(int argc, char **argv) {

diff --git a/tools/llvm-shlib/CMakeLists.txt b/tools/llvm-shlib/CMakeLists.txt
new file mode 100644
index 0000000..100c184
--- /dev/null
+++ b/tools/llvm-shlib/CMakeLists.txt

@@ -0,0 +1,112 @@
+# This tool creates a shared library from the LLVM libraries. Generating this
+# library is enabled by setting LLVM_BUILD_LLVM_DYLIB=yes on the CMake
+# commandline. By default the shared library only exports the LLVM C API.
+
+
+# You can configure which libraries from LLVM you want to include in the shared
+# library by setting LLVM_DYLIB_COMPONENTS to a semi-colon delimited list of
+# LLVM components. All component names handled by llvm-config are valid.
+
+if(NOT DEFINED LLVM_DYLIB_COMPONENTS)
+  set(LLVM_DYLIB_COMPONENTS
+    ${LLVM_TARGETS_TO_BUILD}
+    Analysis
+    BitReader
+    BitWriter
+    CodeGen
+    Core
+    ExecutionEngine
+    IPA
+    IPO
+    IRReader
+    InstCombine
+    Instrumentation
+    Interpreter
+    Linker
+    MCDisassembler
+    MCJIT
+    ObjCARCOpts
+    Object
+    ScalarOpts
+    Support
+    Target
+    TransformUtils
+    Vectorize
+    native
+    )
+endif()
+
+add_definitions( -DLLVM_VERSION_INFO=\"${PACKAGE_VERSION}\" )
+
+set(SOURCES
+  libllvm.cpp
+  )
+
+# Resolve the component list to library names up front: the names are linked
+# into the shared library below even when the caller supplies its own
+# LLVM_EXPORTED_SYMBOL_FILE.
+llvm_map_components_to_libnames(LIB_NAMES ${LLVM_DYLIB_COMPONENTS})
+
+if(NOT DEFINED LLVM_EXPORTED_SYMBOL_FILE)
+
+  if( WIN32 AND NOT CYGWIN )
+    message(FATAL_ERROR "Auto-generation not implemented for Win32 without GNU utils. Please specify LLVM_EXPORTED_SYMBOL_FILE.")
+  endif()
+
+  # To get the export list for a single llvm library:
+  # nm ${LIB_PATH} | awk "/T _LLVM/ { print $3 }" | sort -u | sed -e "s/^_//g" > ${LIB_PATH}.exports
+
+  set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_BINARY_DIR}/libllvm.exports)
+
+  # Directory that holds the static LLVM libraries (per configuration).
+  set(LIB_DIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib)
+
+  foreach (lib ${LIB_NAMES})
+
+    set(LIB_NAME ${LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${lib})
+    set(LIB_PATH ${LIB_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(LIB_EXPORTS_PATH ${LIB_NAME}.exports)
+
+    list(APPEND LLVM_DYLIB_REQUIRED_EXPORTS ${LIB_EXPORTS_PATH})
+
+    add_custom_command(OUTPUT ${LIB_EXPORTS_PATH}
+      COMMAND nm ${LIB_PATH} | awk "/T _LLVM/ || /T LLVM/ { print $3 }" | sort -u | sed -e "s/^_//g" > ${LIB_EXPORTS_PATH}
+      WORKING_DIRECTORY ${LIB_DIR}
+      DEPENDS ${lib}
+      COMMENT "Generating Export list for ${lib}..."
+      VERBATIM )
+  endforeach ()
+
+  add_custom_command(OUTPUT ${LLVM_EXPORTED_SYMBOL_FILE}
+    COMMAND cat ${LLVM_DYLIB_REQUIRED_EXPORTS} > ${LLVM_EXPORTED_SYMBOL_FILE}
+    WORKING_DIRECTORY ${LIB_DIR}
+    DEPENDS ${LLVM_DYLIB_REQUIRED_EXPORTS}
+    COMMENT "Generating combined export list...")
+
+  # add_dependencies() only accepts target names, so wrap the generated file
+  # in a custom target that the library can depend on.
+  add_custom_target(libLLVMExports DEPENDS ${LLVM_EXPORTED_SYMBOL_FILE})
+
+endif()
+
+add_llvm_library(LLVM SHARED ${SOURCES})
+
+if("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux") # FIXME: It should be "GNU ld for elf"
+  # GNU ld doesn't resolve symbols in the version script.
+  list(REMOVE_DUPLICATES LIB_NAMES)
+  set(LIB_NAMES -Wl,--whole-archive ${LIB_NAMES} -Wl,--no-whole-archive)
+endif()
+
+target_link_libraries(LLVM ${cmake_2_8_12_PRIVATE} ${LIB_NAMES})
+
+# Only present when the export list is generated by the rules above.
+if(TARGET libLLVMExports)
+  add_dependencies(LLVM libLLVMExports)
+endif()
+
+if (APPLE)
+  set_property(TARGET LLVM APPEND_STRING PROPERTY
+              LINK_FLAGS
+              " -compatibility_version ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR} -current_version ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}")
+endif()
+

diff --git a/tools/llvm-shlib/libllvm.cpp b/tools/llvm-shlib/libllvm.cpp
new file mode 100644
index 0000000..40b4f66
--- /dev/null
+++ b/tools/llvm-shlib/libllvm.cpp

@@ -0,0 +1,13 @@
+//===-libllvm.cpp - LLVM Shared Library -----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is empty and serves only the purpose of making CMake happy because
+// you can't define a target with no sources.
+//
+//===----------------------------------------------------------------------===//

diff --git a/tools/llvm-size/llvm-size.cpp b/tools/llvm-size/llvm-size.cpp
index 50b5220..59a5f20 100644
--- a/tools/llvm-size/llvm-size.cpp
+++ b/tools/llvm-size/llvm-size.cpp

@@ -297,17 +297,13 @@
     std::size_t max_size_len = strlen("size");
     std::size_t max_addr_len = strlen("addr");
     for (const SectionRef &Section : Obj->sections()) {
-      uint64_t size = 0;
-      if (error(Section.getSize(size)))
-        return;
+      uint64_t size = Section.getSize();
       total += size;
 
       StringRef name;
-      uint64_t addr = 0;
       if (error(Section.getName(name)))
         return;
-      if (error(Section.getAddress(addr)))
-        return;
+      uint64_t addr = Section.getAddress();
       max_name_len = std::max(max_name_len, name.size());
       max_size_len = std::max(max_size_len, getNumLengthAsString(size));
       max_addr_len = std::max(max_addr_len, getNumLengthAsString(addr));
@@ -337,14 +333,10 @@
     // Print each section.
     for (const SectionRef &Section : Obj->sections()) {
       StringRef name;
-      uint64_t size = 0;
-      uint64_t addr = 0;
       if (error(Section.getName(name)))
         return;
-      if (error(Section.getSize(size)))
-        return;
-      if (error(Section.getAddress(addr)))
-        return;
+      uint64_t size = Section.getSize();
+      uint64_t addr = Section.getAddress();
       std::string namestr = name;
 
       outs() << format(fmt.str().c_str(), namestr.c_str(), size, addr);
@@ -365,18 +357,10 @@
 
     // Make one pass over the section table to calculate sizes.
     for (const SectionRef &Section : Obj->sections()) {
-      uint64_t size = 0;
-      bool isText = false;
-      bool isData = false;
-      bool isBSS = false;
-      if (error(Section.getSize(size)))
-        return;
-      if (error(Section.isText(isText)))
-        return;
-      if (error(Section.isData(isData)))
-        return;
-      if (error(Section.isBSS(isBSS)))
-        return;
+      uint64_t size = Section.getSize();
+      bool isText = Section.isText();
+      bool isData = Section.isData();
+      bool isBSS = Section.isBSS();
       if (isText)
         total_text += size;
       else if (isData)
@@ -444,8 +428,7 @@
 static void PrintFileSectionSizes(StringRef file) {
   // If file is not stdin, check that it exists.
   if (file != "-") {
-    bool exists;
-    if (sys::fs::exists(file, exists) || !exists) {
+    if (!sys::fs::exists(file)) {
       errs() << ToolName << ": '" << file << "': "
              << "No such file\n";
       return;
@@ -453,14 +436,14 @@
   }
 
   // Attempt to open the binary.
-  ErrorOr<Binary *> BinaryOrErr = createBinary(file);
+  ErrorOr<OwningBinary<Binary>> BinaryOrErr = createBinary(file);
   if (std::error_code EC = BinaryOrErr.getError()) {
     errs() << ToolName << ": " << file << ": " << EC.message() << ".\n";
     return;
   }
-  std::unique_ptr<Binary> binary(BinaryOrErr.get());
+  Binary &Bin = *BinaryOrErr.get().getBinary();
 
-  if (Archive *a = dyn_cast<Archive>(binary.get())) {
+  if (Archive *a = dyn_cast<Archive>(&Bin)) {
     // This is an archive. Iterate over each member and display its sizes.
     for (object::Archive::child_iterator i = a->child_begin(),
                                          e = a->child_end();
@@ -488,7 +471,7 @@
       }
     }
   } else if (MachOUniversalBinary *UB =
-                 dyn_cast<MachOUniversalBinary>(binary.get())) {
+                 dyn_cast<MachOUniversalBinary>(&Bin)) {
     // If we have a list of architecture flags specified dump only those.
     if (!ArchAll && ArchFlags.size() != 0) {
       // Look for a slice in the universal binary that matches each ArchFlag.
@@ -692,7 +675,7 @@
         }
       }
     }
-  } else if (ObjectFile *o = dyn_cast<ObjectFile>(binary.get())) {
+  } else if (ObjectFile *o = dyn_cast<ObjectFile>(&Bin)) {
     if (!checkMachOAndArchFlags(o, file))
       return;
     if (OutputFormat == sysv)
@@ -731,8 +714,7 @@
     if (ArchFlags[i] == "all") {
       ArchAll = true;
     } else {
-      Triple T = MachOObjectFile::getArch(ArchFlags[i]);
-      if (T.getArch() == Triple::UnknownArch) {
+      if (!MachOObjectFile::isValidArch(ArchFlags[i])) {
         outs() << ToolName << ": for the -arch option: Unknown architecture "
                << "named '" << ArchFlags[i] << "'";
         return 1;

diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp
index 23d3b63..21a79e3 100644
--- a/tools/llvm-stress/llvm-stress.cpp
+++ b/tools/llvm-stress/llvm-stress.cpp

@@ -704,11 +704,10 @@
   if (OutputFilename.empty())
     OutputFilename = "-";
 
-  std::string ErrorInfo;
-  Out.reset(new tool_output_file(OutputFilename.c_str(), ErrorInfo,
-                                 sys::fs::F_None));
-  if (!ErrorInfo.empty()) {
-    errs() << ErrorInfo << '\n';
+  std::error_code EC;
+  Out.reset(new tool_output_file(OutputFilename, EC, sys::fs::F_None));
+  if (EC) {
+    errs() << EC.message() << '\n';
     return 1;
   }
 

diff --git a/tools/llvm-symbolizer/LLVMSymbolize.cpp b/tools/llvm-symbolizer/LLVMSymbolize.cpp
index c1d39ef..36061d7 100644
--- a/tools/llvm-symbolizer/LLVMSymbolize.cpp
+++ b/tools/llvm-symbolizer/LLVMSymbolize.cpp

@@ -45,8 +45,26 @@
 
 ModuleInfo::ModuleInfo(ObjectFile *Obj, DIContext *DICtx)
     : Module(Obj), DebugInfoContext(DICtx) {
+  std::unique_ptr<DataExtractor> OpdExtractor;
+  uint64_t OpdAddress = 0;
+  // Find the .opd (function descriptor) section if any, for big-endian
+  // PowerPC64 ELF.
+  if (Module->getArch() == Triple::ppc64) {
+    for (section_iterator Section : Module->sections()) {
+      StringRef Name;
+      if (!error(Section->getName(Name)) && Name == ".opd") {
+        StringRef Data;
+        if (!error(Section->getContents(Data))) {
+          OpdExtractor.reset(new DataExtractor(Data, Module->isLittleEndian(),
+                                               Module->getBytesInAddress()));
+          OpdAddress = Section->getAddress();
+        }
+        break;
+      }
+    }
+  }
   for (const SymbolRef &Symbol : Module->symbols()) {
-    addSymbol(Symbol);
+    addSymbol(Symbol, OpdExtractor.get(), OpdAddress);
   }
   bool NoSymbolTable = (Module->symbol_begin() == Module->symbol_end());
   if (NoSymbolTable && Module->isELF()) {
@@ -54,12 +72,13 @@
     std::pair<symbol_iterator, symbol_iterator> IDyn =
         getELFDynamicSymbolIterators(Module);
     for (symbol_iterator si = IDyn.first, se = IDyn.second; si != se; ++si) {
-      addSymbol(*si);
+      addSymbol(*si, OpdExtractor.get(), OpdAddress);
     }
   }
 }
 
-void ModuleInfo::addSymbol(const SymbolRef &Symbol) {
+void ModuleInfo::addSymbol(const SymbolRef &Symbol, DataExtractor *OpdExtractor,
+                           uint64_t OpdAddress) {
   SymbolRef::Type SymbolType;
   if (error(Symbol.getType(SymbolType)))
     return;
@@ -69,6 +88,18 @@
   if (error(Symbol.getAddress(SymbolAddress)) ||
       SymbolAddress == UnknownAddressOrSize)
     return;
+  if (OpdExtractor) {
+    // For big-endian PowerPC64 ELF, symbols in the .opd section refer to
+    // function descriptors. The first word of the descriptor is a pointer to
+    // the function's code.
+    // For the purposes of symbolization, pretend the symbol's address is that
+    // of the function's code, not the descriptor.
+    uint64_t OpdOffset = SymbolAddress - OpdAddress;
+    uint32_t OpdOffset32 = OpdOffset;
+    if (OpdOffset == OpdOffset32 && 
+        OpdExtractor->isValidOffsetForAddress(OpdOffset32))
+      SymbolAddress = OpdExtractor->getAddress(&OpdOffset32);
+  }
   uint64_t SymbolSize;
   // Getting symbol size is linear for Mach-O files, so assume that symbol
   // occupies the memory range up to the following symbol.
@@ -85,7 +116,7 @@
     SymbolName = SymbolName.drop_front();
   // FIXME: If a function has alias, there are two entries in symbol table
   // with same address size. Make sure we choose the correct one.
-  SymbolMapTy &M = SymbolType == SymbolRef::ST_Function ? Functions : Objects;
+  auto &M = SymbolType == SymbolRef::ST_Function ? Functions : Objects;
   SymbolDesc SD = { SymbolAddress, SymbolSize };
   M.insert(std::make_pair(SD, SymbolName));
 }
@@ -93,19 +124,20 @@
 bool ModuleInfo::getNameFromSymbolTable(SymbolRef::Type Type, uint64_t Address,
                                         std::string &Name, uint64_t &Addr,
                                         uint64_t &Size) const {
-  const SymbolMapTy &M = Type == SymbolRef::ST_Function ? Functions : Objects;
-  if (M.empty())
+  const auto &SymbolMap = Type == SymbolRef::ST_Function ? Functions : Objects;
+  if (SymbolMap.empty())
     return false;
   SymbolDesc SD = { Address, Address };
-  SymbolMapTy::const_iterator it = M.upper_bound(SD);
-  if (it == M.begin())
+  auto SymbolIterator = SymbolMap.upper_bound(SD);
+  if (SymbolIterator == SymbolMap.begin())
     return false;
-  --it;
-  if (it->first.Size != 0 && it->first.Addr + it->first.Size <= Address)
+  --SymbolIterator;
+  if (SymbolIterator->first.Size != 0 &&
+      SymbolIterator->first.Addr + SymbolIterator->first.Size <= Address)
     return false;
-  Name = it->second.str();
-  Addr = it->first.Addr;
-  Size = it->first.Size;
+  Name = SymbolIterator->second.str();
+  Addr = SymbolIterator->first.Addr;
+  Size = SymbolIterator->first.Size;
   return true;
 }
 
@@ -206,14 +238,21 @@
 
 void LLVMSymbolizer::flush() {
   DeleteContainerSeconds(Modules);
-  BinaryForPath.clear();
+  ObjectPairForPathArch.clear();
   ObjectFileForArch.clear();
 }
 
-static std::string getDarwinDWARFResourceForPath(const std::string &Path) {
-  StringRef Basename = sys::path::filename(Path);
-  const std::string &DSymDirectory = Path + ".dSYM";
-  SmallString<16> ResourceName = StringRef(DSymDirectory);
+// For Path="/path/to/foo" and Basename="foo" assume that debug info is in
+// /path/to/foo.dSYM/Contents/Resources/DWARF/foo.
+// For Path="/path/to/bar.dSYM" and Basename="foo" assume that debug info is in
+// /path/to/bar.dSYM/Contents/Resources/DWARF/foo.
+static
+std::string getDarwinDWARFResourceForPath(
+    const std::string &Path, const std::string &Basename) {
+  SmallString<16> ResourceName = StringRef(Path);
+  if (sys::path::extension(Path) != ".dSYM") {
+    ResourceName += ".dSYM";
+  }
   sys::path::append(ResourceName, "Contents", "Resources", "DWARF");
   sys::path::append(ResourceName, Basename);
   return ResourceName.str();
@@ -264,9 +303,8 @@
   return false;
 }
 
-static bool getGNUDebuglinkContents(const Binary *Bin, std::string &DebugName,
+static bool getGNUDebuglinkContents(const ObjectFile *Obj, std::string &DebugName,
                                     uint32_t &CRCHash) {
-  const ObjectFile *Obj = dyn_cast<ObjectFile>(Bin);
   if (!Obj)
     return false;
   for (const SectionRef &Section : Obj->sections()) {
@@ -293,60 +331,96 @@
   return false;
 }
 
-LLVMSymbolizer::BinaryPair
-LLVMSymbolizer::getOrCreateBinary(const std::string &Path) {
-  BinaryMapTy::iterator I = BinaryForPath.find(Path);
-  if (I != BinaryForPath.end())
-    return I->second;
-  Binary *Bin = nullptr;
-  Binary *DbgBin = nullptr;
-  ErrorOr<Binary *> BinaryOrErr = createBinary(Path);
-  if (!error(BinaryOrErr.getError())) {
-    std::unique_ptr<Binary> ParsedBinary(BinaryOrErr.get());
-    // Check if it's a universal binary.
-    Bin = ParsedBinary.get();
-    ParsedBinariesAndObjects.push_back(std::move(ParsedBinary));
-    if (Bin->isMachO() || Bin->isMachOUniversalBinary()) {
-      // On Darwin we may find DWARF in separate object file in
-      // resource directory.
-      const std::string &ResourcePath =
-          getDarwinDWARFResourceForPath(Path);
-      BinaryOrErr = createBinary(ResourcePath);
-      std::error_code EC = BinaryOrErr.getError();
-      if (EC != errc::no_such_file_or_directory && !error(EC)) {
-        DbgBin = BinaryOrErr.get();
-        ParsedBinariesAndObjects.push_back(std::unique_ptr<Binary>(DbgBin));
+// Returns true when the debug object \p DbgObj reports the same UUID as the
+// binary \p Obj. An empty UUID on either side is treated as a mismatch.
+static
+bool darwinDsymMatchesBinary(const MachOObjectFile *DbgObj,
+                             const MachOObjectFile *Obj) {
+  ArrayRef<uint8_t> dbg_uuid = DbgObj->getUuid();
+  ArrayRef<uint8_t> bin_uuid = Obj->getUuid();
+  if (dbg_uuid.empty() || bin_uuid.empty())
+    return false;
+  // ArrayRef::equals checks the lengths before the contents, so UUIDs of
+  // different sizes can never cause a read past the shorter buffer.
+  return dbg_uuid.equals(bin_uuid);
+}
+
+// Looks for a dSYM debug object that belongs to the Mach-O executable at
+// \p ExePath. Candidates are the dSYM resource path derived from \p ExePath
+// itself, followed by one path per entry of Opts.DsymHints. Returns the first
+// candidate whose UUID matches \p MachExeObj (taking ownership of its binary),
+// or nullptr when none matches.
+ObjectFile *LLVMSymbolizer::lookUpDsymFile(const std::string &ExePath,
+    const MachOObjectFile *MachExeObj, const std::string &ArchName) {
+  // On Darwin we may find DWARF in separate object file in
+  // resource directory.
+  std::vector<std::string> DsymPaths;
+  StringRef Filename = sys::path::filename(ExePath);
+  DsymPaths.push_back(getDarwinDWARFResourceForPath(ExePath, Filename));
+  for (const auto &Path : Opts.DsymHints) {
+    DsymPaths.push_back(getDarwinDWARFResourceForPath(Path, Filename));
+  }
+  for (const auto &path : DsymPaths) {
+    ErrorOr<OwningBinary<Binary>> BinaryOrErr = createBinary(path);
+    std::error_code EC = BinaryOrErr.getError();
+    // A missing candidate is expected and skipped silently; any other error
+    // goes through error().
+    if (EC != errc::no_such_file_or_directory && !error(EC)) {
+      OwningBinary<Binary> B = std::move(BinaryOrErr.get());
+      ObjectFile *DbgObj =
+          getObjectFileFromBinary(B.getBinary(), ArchName);
+      // getObjectFileFromBinary can return nullptr, and plain dyn_cast must
+      // not be handed a null pointer, so use the null-tolerant variant.
+      const MachOObjectFile *MachDbgObj =
+          dyn_cast_or_null<const MachOObjectFile>(DbgObj);
+      if (!MachDbgObj) continue;
+      if (darwinDsymMatchesBinary(MachDbgObj, MachExeObj)) {
+        addOwningBinary(std::move(B));
+        return DbgObj;
       }
     }
+  }
+  return nullptr;
+}
+
+LLVMSymbolizer::ObjectPair
+LLVMSymbolizer::getOrCreateObjects(const std::string &Path,
+                                   const std::string &ArchName) {
+  const auto &I = ObjectPairForPathArch.find(std::make_pair(Path, ArchName));
+  if (I != ObjectPairForPathArch.end())
+    return I->second;
+  ObjectFile *Obj = nullptr;
+  ObjectFile *DbgObj = nullptr;
+  ErrorOr<OwningBinary<Binary>> BinaryOrErr = createBinary(Path);
+  if (!error(BinaryOrErr.getError())) {
+    OwningBinary<Binary> &B = BinaryOrErr.get();
+    Obj = getObjectFileFromBinary(B.getBinary(), ArchName);
+    if (!Obj) {
+      ObjectPair Res = std::make_pair(nullptr, nullptr);
+      ObjectPairForPathArch[std::make_pair(Path, ArchName)] = Res;
+      return Res;
+    }
+    addOwningBinary(std::move(B));
+    if (auto MachObj = dyn_cast<const MachOObjectFile>(Obj))
+      DbgObj = lookUpDsymFile(Path, MachObj, ArchName);
     // Try to locate the debug binary using .gnu_debuglink section.
-    if (!DbgBin) {
+    if (!DbgObj) {
       std::string DebuglinkName;
       uint32_t CRCHash;
       std::string DebugBinaryPath;
-      if (getGNUDebuglinkContents(Bin, DebuglinkName, CRCHash) &&
+      if (getGNUDebuglinkContents(Obj, DebuglinkName, CRCHash) &&
           findDebugBinary(Path, DebuglinkName, CRCHash, DebugBinaryPath)) {
         BinaryOrErr = createBinary(DebugBinaryPath);
         if (!error(BinaryOrErr.getError())) {
-          DbgBin = BinaryOrErr.get();
-          ParsedBinariesAndObjects.push_back(std::unique_ptr<Binary>(DbgBin));
+          OwningBinary<Binary> B = std::move(BinaryOrErr.get());
+          DbgObj = getObjectFileFromBinary(B.getBinary(), ArchName);
+          addOwningBinary(std::move(B));
         }
       }
     }
   }
-  if (!DbgBin)
-    DbgBin = Bin;
-  BinaryPair Res = std::make_pair(Bin, DbgBin);
-  BinaryForPath[Path] = Res;
+  if (!DbgObj)
+    DbgObj = Obj;
+  ObjectPair Res = std::make_pair(Obj, DbgObj);
+  ObjectPairForPathArch[std::make_pair(Path, ArchName)] = Res;
   return Res;
 }
 
 ObjectFile *
-LLVMSymbolizer::getObjectFileFromBinary(Binary *Bin, const std::string &ArchName) {
+LLVMSymbolizer::getObjectFileFromBinary(Binary *Bin,
+                                        const std::string &ArchName) {
   if (!Bin)
     return nullptr;
   ObjectFile *Res = nullptr;
   if (MachOUniversalBinary *UB = dyn_cast<MachOUniversalBinary>(Bin)) {
-    ObjectFileForArchMapTy::iterator I = ObjectFileForArch.find(
+    const auto &I = ObjectFileForArch.find(
         std::make_pair(UB, ArchName));
     if (I != ObjectFileForArch.end())
       return I->second;
@@ -365,7 +439,7 @@
 
 ModuleInfo *
 LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
-  ModuleMapTy::iterator I = Modules.find(ModuleName);
+  const auto &I = Modules.find(ModuleName);
   if (I != Modules.end())
     return I->second;
   std::string BinaryName = ModuleName;
@@ -379,18 +453,16 @@
       ArchName = ArchStr;
     }
   }
-  BinaryPair Binaries = getOrCreateBinary(BinaryName);
-  ObjectFile *Obj = getObjectFileFromBinary(Binaries.first, ArchName);
-  ObjectFile *DbgObj = getObjectFileFromBinary(Binaries.second, ArchName);
+  ObjectPair Objects = getOrCreateObjects(BinaryName, ArchName);
 
-  if (!Obj) {
+  if (!Objects.first) {
     // Failed to find valid object file.
     Modules.insert(make_pair(ModuleName, (ModuleInfo *)nullptr));
     return nullptr;
   }
-  DIContext *Context = DIContext::getDWARFContext(DbgObj);
+  DIContext *Context = DIContext::getDWARFContext(*Objects.second);
   assert(Context);
-  ModuleInfo *Info = new ModuleInfo(Obj, Context);
+  ModuleInfo *Info = new ModuleInfo(Objects.first, Context);
   Modules.insert(make_pair(ModuleName, Info));
   return Info;
 }

diff --git a/tools/llvm-symbolizer/LLVMSymbolize.h b/tools/llvm-symbolizer/LLVMSymbolize.h
index 45febe0..ff848fc 100644
--- a/tools/llvm-symbolizer/LLVMSymbolize.h
+++ b/tools/llvm-symbolizer/LLVMSymbolize.h

@@ -10,13 +10,14 @@
 // Header for LLVM symbolization library.
 //
 //===----------------------------------------------------------------------===//
-#ifndef LLVM_SYMBOLIZE_H
-#define LLVM_SYMBOLIZE_H
+#ifndef LLVM_TOOLS_LLVM_SYMBOLIZER_LLVMSYMBOLIZE_H
+#define LLVM_TOOLS_LLVM_SYMBOLIZER_LLVMSYMBOLIZE_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/Object/MachOUniversal.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include <map>
 #include <memory>
@@ -39,13 +40,14 @@
     bool PrintInlining : 1;
     bool Demangle : 1;
     std::string DefaultArch;
+    std::vector<std::string> DsymHints;
     Options(bool UseSymbolTable = true,
             FunctionNameKind PrintFunctions = FunctionNameKind::LinkageName,
             bool PrintInlining = true, bool Demangle = true,
             std::string DefaultArch = "")
-        : UseSymbolTable(UseSymbolTable), PrintFunctions(PrintFunctions),
-          PrintInlining(PrintInlining), Demangle(Demangle),
-          DefaultArch(DefaultArch) {}
+        : UseSymbolTable(UseSymbolTable),
+          PrintFunctions(PrintFunctions), PrintInlining(PrintInlining),
+          Demangle(Demangle), DefaultArch(DefaultArch) {}
   };
 
   LLVMSymbolizer(const Options &Opts = Options()) : Opts(Opts) {}
@@ -62,11 +64,15 @@
   void flush();
   static std::string DemangleName(const std::string &Name);
 private:
-  typedef std::pair<Binary*, Binary*> BinaryPair;
+  typedef std::pair<ObjectFile*, ObjectFile*> ObjectPair;
 
   ModuleInfo *getOrCreateModuleInfo(const std::string &ModuleName);
-  /// \brief Returns pair of pointers to binary and debug binary.
-  BinaryPair getOrCreateBinary(const std::string &Path);
+  ObjectFile *lookUpDsymFile(const std::string &Path, const MachOObjectFile *ExeObj,
+                             const std::string &ArchName);
+
+  /// \brief Returns pair of pointers to object and debug object.
+  ObjectPair getOrCreateObjects(const std::string &Path,
+                                const std::string &ArchName);
   /// \brief Returns a parsed object file for a given architecture in a
   /// universal binary (or the binary itself if it is an object file).
   ObjectFile *getObjectFileFromBinary(Binary *Bin, const std::string &ArchName);
@@ -75,14 +81,21 @@
 
   // Owns all the parsed binaries and object files.
   SmallVector<std::unique_ptr<Binary>, 4> ParsedBinariesAndObjects;
+  SmallVector<std::unique_ptr<MemoryBuffer>, 4> MemoryBuffers;
+  // Splits OwningBin into its binary and its backing memory buffer and stores
+  // each in the matching owning container.
+  void addOwningBinary(OwningBinary<Binary> OwningBin) {
+    auto Parts = OwningBin.takeBinary();
+    ParsedBinariesAndObjects.push_back(std::move(Parts.first));
+    MemoryBuffers.push_back(std::move(Parts.second));
+  }
+
   // Owns module info objects.
-  typedef std::map<std::string, ModuleInfo *> ModuleMapTy;
-  ModuleMapTy Modules;
-  typedef std::map<std::string, BinaryPair> BinaryMapTy;
-  BinaryMapTy BinaryForPath;
-  typedef std::map<std::pair<MachOUniversalBinary *, std::string>, ObjectFile *>
-      ObjectFileForArchMapTy;
-  ObjectFileForArchMapTy ObjectFileForArch;
+  std::map<std::string, ModuleInfo *> Modules;
+  std::map<std::pair<MachOUniversalBinary *, std::string>, ObjectFile *>
+      ObjectFileForArch;
+  std::map<std::pair<std::string, std::string>, ObjectPair>
+      ObjectPairForPathArch;
 
   Options Opts;
   static const char kBadString[];
@@ -103,7 +116,11 @@
   bool getNameFromSymbolTable(SymbolRef::Type Type, uint64_t Address,
                               std::string &Name, uint64_t &Addr,
                               uint64_t &Size) const;
-  void addSymbol(const SymbolRef &Symbol);
+  // For big-endian PowerPC64 ELF, OpdAddress is the address of the .opd
+  // (function descriptor) section and OpdExtractor refers to its contents.
+  void addSymbol(const SymbolRef &Symbol,
+                 DataExtractor *OpdExtractor = nullptr,
+                 uint64_t OpdAddress = 0);
   ObjectFile *Module;
   std::unique_ptr<DIContext> DebugInfoContext;
 
@@ -116,12 +133,11 @@
       return s1.Addr < s2.Addr;
     }
   };
-  typedef std::map<SymbolDesc, StringRef> SymbolMapTy;
-  SymbolMapTy Functions;
-  SymbolMapTy Objects;
+  std::map<SymbolDesc, StringRef> Functions;
+  std::map<SymbolDesc, StringRef> Objects;
 };
 
 } // namespace symbolize
 } // namespace llvm
 
-#endif // LLVM_SYMBOLIZE_H
+#endif

diff --git a/tools/llvm-symbolizer/llvm-symbolizer.cpp b/tools/llvm-symbolizer/llvm-symbolizer.cpp
index 29db172..d554022 100644
--- a/tools/llvm-symbolizer/llvm-symbolizer.cpp
+++ b/tools/llvm-symbolizer/llvm-symbolizer.cpp

@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
@@ -61,6 +62,11 @@
              cl::desc("Path to object file to be symbolized (if not provided, "
                       "object file should be specified for each input line)"));
 
+static cl::list<std::string>
+ClDsymHint("dsym-hint", cl::ZeroOrMore,
+           cl::desc("Path to .dSYM bundles to search for debug info for the "
+                    "object files"));
+
 static bool parseCommand(bool &IsData, std::string &ModuleName,
                          uint64_t &ModuleOffset) {
   const char *kDataCmd = "DATA ";
@@ -119,6 +125,14 @@
   cl::ParseCommandLineOptions(argc, argv, "llvm-symbolizer\n");
   LLVMSymbolizer::Options Opts(ClUseSymbolTable, ClPrintFunctions,
                                ClPrintInlining, ClDemangle, ClDefaultArch);
+  for (const auto &hint : ClDsymHint) {
+    if (sys::path::extension(hint) == ".dSYM") {
+      Opts.DsymHints.push_back(hint);
+    } else {
+      errs() << "Warning: invalid dSYM hint: \"" << hint <<
+                "\" (must have the '.dSYM' extension).\n";
+    }
+  }
   LLVMSymbolizer Symbolizer(Opts);
 
   bool IsData = false;

diff --git a/tools/llvm-vtabledump/CMakeLists.txt b/tools/llvm-vtabledump/CMakeLists.txt
new file mode 100644
index 0000000..4fe205b
--- /dev/null
+++ b/tools/llvm-vtabledump/CMakeLists.txt

@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  Object
+  Support
+  )
+
+add_llvm_tool(llvm-vtabledump
+  llvm-vtabledump.cpp
+  Error.cpp
+  )

diff --git a/tools/llvm-vtabledump/Error.cpp b/tools/llvm-vtabledump/Error.cpp
new file mode 100644
index 0000000..c5de895
--- /dev/null
+++ b/tools/llvm-vtabledump/Error.cpp

@@ -0,0 +1,43 @@
+//===- Error.cpp - system_error extensions for llvm-vtabledump --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines a new error_category for the llvm-vtabledump tool.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Error.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+/// Error category that turns vtabledump_error values into readable text.
+class vtabledump_error_category : public std::error_category {
+public:
+  /// Identifier reported for this category.
+  const char *name() const LLVM_NOEXCEPT override { return "llvm.vtabledump"; }
+
+  /// Returns the message for the vtabledump_error enumerator encoded in \p ev.
+  std::string message(int ev) const override {
+    const vtabledump_error Code = static_cast<vtabledump_error>(ev);
+    switch (Code) {
+    case vtabledump_error::success:
+      return "Success";
+    case vtabledump_error::file_not_found:
+      return "No such file.";
+    case vtabledump_error::unrecognized_file_format:
+      return "Unrecognized file type.";
+    }
+    // Only reached when ev matches none of the enumerators handled above.
+    llvm_unreachable(
+        "An enumerator of vtabledump_error does not have a message defined.");
+  }
+};
+} // namespace
+
+namespace llvm {
+/// Returns the category used for vtabledump errors; every call yields the
+/// same function-local static instance.
+const std::error_category &vtabledump_category() {
+  static vtabledump_error_category Category;
+  return Category;
+}
+} // namespace llvm

diff --git a/tools/llvm-vtabledump/Error.h b/tools/llvm-vtabledump/Error.h
new file mode 100644
index 0000000..fd8bb18
--- /dev/null
+++ b/tools/llvm-vtabledump/Error.h

@@ -0,0 +1,39 @@
+//===- Error.h - system_error extensions for llvm-vtabledump ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This declares a new error_category for the llvm-vtabledump tool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_VTABLEDUMP_ERROR_H
+#define LLVM_TOOLS_LLVM_VTABLEDUMP_ERROR_H
+
+#include <system_error>
+
+namespace llvm {
+/// Returns the std::error_category used for vtabledump_error codes.
+const std::error_category &vtabledump_category();
+
+/// Error conditions reported by llvm-vtabledump.
+enum class vtabledump_error {
+  success = 0,
+  file_not_found,
+  unrecognized_file_format,
+};
+
+/// Wraps \p e in a std::error_code tagged with vtabledump_category().
+inline std::error_code make_error_code(vtabledump_error e) {
+  const int Value = static_cast<int>(e);
+  return std::error_code(Value, vtabledump_category());
+}
+
+} // namespace llvm
+
+namespace std {
+// Marks vtabledump_error as an error-code enum for <system_error>.
+template <>
+struct is_error_code_enum<llvm::vtabledump_error> : std::true_type {};
+}
+
+#endif

diff --git a/tools/llvm-vtabledump/LLVMBuild.txt b/tools/llvm-vtabledump/LLVMBuild.txt
new file mode 100644
index 0000000..6a3cbff
--- /dev/null
+++ b/tools/llvm-vtabledump/LLVMBuild.txt

@@ -0,0 +1,22 @@
+;===- ./tools/llvm-vtabledump/LLVMBuild.txt --------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = llvm-vtabledump
+parent = Tools
+required_libraries = all-targets BitReader Object

diff --git a/tools/llvm-vtabledump/Makefile b/tools/llvm-vtabledump/Makefile
new file mode 100644
index 0000000..596c64c
--- /dev/null
+++ b/tools/llvm-vtabledump/Makefile

@@ -0,0 +1,18 @@
+##===- tools/llvm-vtabledump/Makefile ----------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../..
+TOOLNAME := llvm-vtabledump
+LINK_COMPONENTS := bitreader object all-targets
+
+# This tool has no plugins, optimize startup time.
+TOOL_NO_EXPORTS := 1
+
+include $(LEVEL)/Makefile.common
+

diff --git a/tools/llvm-vtabledump/llvm-vtabledump.cpp b/tools/llvm-vtabledump/llvm-vtabledump.cpp
new file mode 100644
index 0000000..a21acae
--- /dev/null
+++ b/tools/llvm-vtabledump/llvm-vtabledump.cpp

@@ -0,0 +1,464 @@
+//===- llvm-vtabledump.cpp - Dump vtables in an Object File -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Dumps VTables resident in object files and archives.  Note, it currently only
+// supports MS-ABI style object files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-vtabledump.h"
+#include "Error.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include <map>
+#include <string>
+#include <system_error>
+
+using namespace llvm;
+using namespace llvm::object;
+using namespace llvm::support;
+
+namespace opts { // Command-line options of the tool.
+cl::list<std::string> InputFilenames(cl::Positional,
+                                     cl::desc("<input object files>"),
+                                     cl::ZeroOrMore); // May be empty.
+} // namespace opts
+
+static int ReturnValue = EXIT_SUCCESS;
+
+namespace llvm {
+
+/// Report \p EC on stdout if it represents a failure.
+///
+/// \returns true when \p EC holds an error (after recording EXIT_FAILURE as
+/// the process result and printing the message), false otherwise.
+bool error(std::error_code EC) {
+  const bool Failed = static_cast<bool>(EC);
+  if (Failed) {
+    ReturnValue = EXIT_FAILURE;
+    raw_ostream &OS = outs();
+    OS << "\nError reading file: " << EC.message() << ".\n";
+    OS.flush();
+  }
+  return Failed;
+}
+
+} // namespace llvm
+
+/// Print "<input>: <message>" to stderr and mark the run as failed.
+/// An input name of "-" is displayed as "<stdin>".
+static void reportError(StringRef Input, StringRef Message) {
+  const StringRef Shown = (Input == "-") ? StringRef("<stdin>") : Input;
+  raw_ostream &OS = errs();
+  OS << Shown << ": " << Message << "\n";
+  OS.flush();
+  ReturnValue = EXIT_FAILURE;
+}
+
+/// Overload that reports the textual description of \p EC for \p Input.
+static void reportError(StringRef Input, std::error_code EC) {
+  const std::string Text = EC.message();
+  reportError(Input, Text);
+}
+
+/// Return the sections of \p Obj whose relocations apply to \p Sec.
+///
+/// The list is recomputed from \p Obj on every call.  A cache that is filled
+/// only once per process would keep handing out the section mapping of the
+/// first object file when several inputs (or several archive members) are
+/// dumped in a single run.
+///
+/// The returned reference points at function-local static storage, so it is
+/// only valid until the next call to this function.
+static SmallVectorImpl<SectionRef> &getRelocSections(const ObjectFile *Obj,
+                                                     const SectionRef &Sec) {
+  static SmallVector<SectionRef, 1> RelocSections;
+  RelocSections.clear();
+  for (const SectionRef &Section : Obj->sections()) {
+    section_iterator Relocated = Section.getRelocatedSection();
+    // Skip sections that relocate nothing (reported as section_end()) and
+    // those that relocate some other section.
+    if (Relocated != Obj->section_end() && *Relocated == Sec)
+      RelocSections.push_back(Section);
+  }
+  return RelocSections;
+}
+
+/// Collect the names of the symbols referenced by relocations that land inside
+/// the symbol occupying [SymAddress, SymAddress + SymSize) of \p Sec.
+///
+/// Names are written to the output range [I, E) in the order the relocations
+/// are visited; once the range is full no further names are stored.
+///
+/// \returns true if reading a relocation's symbol name or offset failed.
+static bool collectRelocatedSymbols(const ObjectFile *Obj,
+                                    const SectionRef &Sec, uint64_t SecAddress,
+                                    uint64_t SymAddress, uint64_t SymSize,
+                                    StringRef *I, StringRef *E) {
+  const uint64_t RangeBegin = SymAddress - SecAddress;
+  const uint64_t RangeEnd = RangeBegin + SymSize;
+  for (const SectionRef &RelocSec : getRelocSections(Obj, Sec)) {
+    for (const object::RelocationRef &Reloc : RelocSec.relocations()) {
+      // Stop scanning this relocation section once every slot is filled.
+      if (I == E)
+        break;
+      const object::symbol_iterator Target = Reloc.getSymbol();
+      if (Target == Obj->symbol_end())
+        continue;
+      StringRef TargetName;
+      if (error(Target->getName(TargetName)))
+        return true;
+      uint64_t RelocOffset;
+      if (error(Reloc.getOffset(RelocOffset)))
+        return true;
+      const bool InRange = RelocOffset >= RangeBegin && RelocOffset < RangeEnd;
+      if (!InRange)
+        continue;
+      *I++ = TargetName;
+    }
+  }
+  return false;
+}
+
+/// Record, for every relocation that lands inside the symbol occupying
+/// [SymAddress, SymAddress + SymSize) of \p Sec, the name of the symbol the
+/// relocation refers to.
+///
+/// Entries are stored in \p Collection under the key
+/// (SymName, offset of the relocation relative to the start of the symbol).
+///
+/// \returns true if reading a relocation's symbol name or offset failed.
+static bool collectRelocationOffsets(
+    const ObjectFile *Obj, const SectionRef &Sec, uint64_t SecAddress,
+    uint64_t SymAddress, uint64_t SymSize, StringRef SymName,
+    std::map<std::pair<StringRef, uint64_t>, StringRef> &Collection) {
+  const uint64_t RangeBegin = SymAddress - SecAddress;
+  const uint64_t RangeEnd = RangeBegin + SymSize;
+  for (const SectionRef &RelocSec : getRelocSections(Obj, Sec)) {
+    for (const object::RelocationRef &Reloc : RelocSec.relocations()) {
+      const object::symbol_iterator Target = Reloc.getSymbol();
+      if (Target == Obj->symbol_end())
+        continue;
+      StringRef TargetName;
+      if (error(Target->getName(TargetName)))
+        return true;
+      uint64_t RelocOffset;
+      if (error(Reloc.getOffset(RelocOffset)))
+        return true;
+      if (RelocOffset < RangeBegin || RelocOffset >= RangeEnd)
+        continue;
+      const uint64_t OffsetInSymbol = RelocOffset - RangeBegin;
+      Collection[std::make_pair(SymName, OffsetInSymbol)] = TargetName;
+    }
+  }
+  return false;
+}
+
+static void dumpVTables(const ObjectFile *Obj) { // Gather, then print, the vtable/RTTI data of Obj.
+  struct CompleteObjectLocator { // Filled from '??_R4' symbols.
+    StringRef Symbols[2];
+    ArrayRef<little32_t> Data;
+  };
+  struct ClassHierarchyDescriptor { // Filled from '??_R3' symbols.
+    StringRef Symbols[1];
+    ArrayRef<little32_t> Data;
+  };
+  struct BaseClassDescriptor { // Filled from '??_R1' symbols.
+    StringRef Symbols[2];
+    ArrayRef<little32_t> Data;
+  };
+  struct TypeDescriptor { // Filled from '??_R0' symbols.
+    StringRef Symbols[1];
+    uint64_t AlwaysZero;
+    StringRef MangledName;
+  };
+  std::map<std::pair<StringRef, uint64_t>, StringRef> VFTableEntries;
+  std::map<StringRef, ArrayRef<little32_t>> VBTables;
+  std::map<StringRef, CompleteObjectLocator> COLs;
+  std::map<StringRef, ClassHierarchyDescriptor> CHDs;
+  std::map<std::pair<StringRef, uint64_t>, StringRef> BCAEntries;
+  std::map<StringRef, BaseClassDescriptor> BCDs;
+  std::map<StringRef, TypeDescriptor> TDs;
+
+  std::map<std::pair<StringRef, uint64_t>, StringRef> VTableSymEntries;
+  std::map<std::pair<StringRef, uint64_t>, int64_t> VTableDataEntries;
+  std::map<std::pair<StringRef, uint64_t>, StringRef> VTTEntries;
+  std::map<StringRef, StringRef> TINames;
+
+  uint8_t BytesInAddress = Obj->getBytesInAddress();
+
+  for (const object::SymbolRef &Sym : Obj->symbols()) { // Pass 1: classify every symbol by name prefix.
+    StringRef SymName;
+    if (error(Sym.getName(SymName)))
+      return;
+    object::section_iterator SecI(Obj->section_begin());
+    if (error(Sym.getSection(SecI)))
+      return;
+    // Skip external symbols.
+    if (SecI == Obj->section_end())
+      continue;
+    const SectionRef &Sec = *SecI;
+    // Skip virtual or BSS sections.
+    if (Sec.isBSS() || Sec.isVirtual())
+      continue;
+    StringRef SecContents;
+    if (error(Sec.getContents(SecContents)))
+      return;
+    uint64_t SymAddress, SymSize;
+    if (error(Sym.getAddress(SymAddress)) || error(Sym.getSize(SymSize)))
+      return;
+    uint64_t SecAddress = Sec.getAddress();
+    uint64_t SecSize = Sec.getSize();
+    uint64_t SymOffset = SymAddress - SecAddress;
+    StringRef SymContents = SecContents.substr(SymOffset, SymSize);
+
+    // VFTables in the MS-ABI start with '??_7' and are contained within their
+    // own COMDAT section.  We then determine the contents of the VFTable by
+    // looking at each relocation in the section.
+    if (SymName.startswith("??_7")) {
+      // Each relocation either names a virtual method or a thunk.  We note the
+      // offset into the section and the symbol used for the relocation.
+      // The whole section (SecAddress, SecSize) is scanned, not just the
+      // symbol; the result of the call is ignored.
+      collectRelocationOffsets(Obj, Sec, SecAddress, SecAddress, SecSize,
+                               SymName, VFTableEntries);
+    }
+    // VBTables in the MS-ABI start with '??_8' and are filled with 32-bit
+    // offsets of virtual bases.
+    else if (SymName.startswith("??_8")) {
+      ArrayRef<little32_t> VBTableData(
+          reinterpret_cast<const little32_t *>(SymContents.data()),
+          SymContents.size() / sizeof(little32_t));
+      VBTables[SymName] = VBTableData;
+    }
+    // Complete object locators in the MS-ABI start with '??_R4'
+    else if (SymName.startswith("??_R4")) {
+      CompleteObjectLocator COL;
+      COL.Data = ArrayRef<little32_t>(
+          reinterpret_cast<const little32_t *>(SymContents.data()), 3); // NOTE(review): 3 words assumed; size is not checked.
+      StringRef *I = std::begin(COL.Symbols), *E = std::end(COL.Symbols);
+      if (collectRelocatedSymbols(Obj, Sec, SecAddress, SymAddress, SymSize, I,
+                                  E))
+        return;
+      COLs[SymName] = COL;
+    }
+    // Class hierarchy descriptors in the MS-ABI start with '??_R3'
+    else if (SymName.startswith("??_R3")) {
+      ClassHierarchyDescriptor CHD;
+      CHD.Data = ArrayRef<little32_t>(
+          reinterpret_cast<const little32_t *>(SymContents.data()), 3); // NOTE(review): 3 words assumed; size is not checked.
+      StringRef *I = std::begin(CHD.Symbols), *E = std::end(CHD.Symbols);
+      if (collectRelocatedSymbols(Obj, Sec, SecAddress, SymAddress, SymSize, I,
+                                  E))
+        return;
+      CHDs[SymName] = CHD;
+    }
+    // Base class arrays in the MS-ABI start with '??_R2'
+    else if (SymName.startswith("??_R2")) {
+      // Each relocation names a base class descriptor.  We note the offset into
+      // the section and the symbol used for the relocation.
+      collectRelocationOffsets(Obj, Sec, SecAddress, SymAddress, SymSize,
+                               SymName, BCAEntries);
+    }
+    // Base class descriptors in the MS-ABI start with '??_R1'
+    else if (SymName.startswith("??_R1")) {
+      BaseClassDescriptor BCD;
+      BCD.Data = ArrayRef<little32_t>(
+          reinterpret_cast<const little32_t *>(SymContents.data()) + 1, 5); // NOTE(review): skips word 0, then 5 words assumed; size is not checked.
+      StringRef *I = std::begin(BCD.Symbols), *E = std::end(BCD.Symbols);
+      if (collectRelocatedSymbols(Obj, Sec, SecAddress, SymAddress, SymSize, I,
+                                  E))
+        return;
+      BCDs[SymName] = BCD;
+    }
+    // Type descriptors in the MS-ABI start with '??_R0'
+    else if (SymName.startswith("??_R0")) {
+      const char *DataPtr = SymContents.drop_front(BytesInAddress).data(); // Second address-sized field.
+      TypeDescriptor TD;
+      if (BytesInAddress == 8)
+        TD.AlwaysZero = *reinterpret_cast<const little64_t *>(DataPtr);
+      else
+        TD.AlwaysZero = *reinterpret_cast<const little32_t *>(DataPtr);
+      TD.MangledName = SymContents.drop_front(BytesInAddress * 2); // Everything after the two address-sized fields.
+      StringRef *I = std::begin(TD.Symbols), *E = std::end(TD.Symbols);
+      if (collectRelocatedSymbols(Obj, Sec, SecAddress, SymAddress, SymSize, I,
+                                  E))
+        return;
+      TDs[SymName] = TD;
+    }
+    // VTT structures in the Itanium ABI start with '_ZTT' or '__ZTT'.
+    else if (SymName.startswith("_ZTT") || SymName.startswith("__ZTT")) {
+      collectRelocationOffsets(Obj, Sec, SecAddress, SymAddress, SymSize,
+                               SymName, VTTEntries);
+    }
+    // Typeinfo names in the Itanium ABI start with '_ZTS' or '__ZTS'.
+    else if (SymName.startswith("_ZTS") || SymName.startswith("__ZTS")) {
+      TINames[SymName] = SymContents.slice(0, SymContents.find('\0'));
+    }
+    // Vtables in the Itanium ABI start with '_ZTV' or '__ZTV'.
+    else if (SymName.startswith("_ZTV") || SymName.startswith("__ZTV")) {
+      collectRelocationOffsets(Obj, Sec, SecAddress, SymAddress, SymSize,
+                               SymName, VTableSymEntries);
+      for (uint64_t SymOffI = 0; SymOffI < SymSize; SymOffI += BytesInAddress) {
+        auto Key = std::make_pair(SymName, SymOffI);
+        if (VTableSymEntries.count(Key))
+          continue; // This slot is described by a relocation.
+        const char *DataPtr = SymContents.substr(SymOffI, BytesInAddress).data();
+        int64_t VData;
+        if (BytesInAddress == 8)
+          VData = *reinterpret_cast<const little64_t *>(DataPtr);
+        else
+          VData = *reinterpret_cast<const little32_t *>(DataPtr);
+        VTableDataEntries[Key] = VData;
+      }
+    }
+    // Typeinfo structures in the Itanium ABI start with '_ZTI' or '__ZTI'.
+    else if (SymName.startswith("_ZTI") || SymName.startswith("__ZTI")) {
+      // FIXME: Do something with these!
+    }
+  }
+  for (const std::pair<std::pair<StringRef, uint64_t>, StringRef> &VFTableEntry : // Pass 2: print each collection.
+       VFTableEntries) {
+    StringRef VFTableName = VFTableEntry.first.first;
+    uint64_t Offset = VFTableEntry.first.second;
+    StringRef SymName = VFTableEntry.second;
+    outs() << VFTableName << '[' << Offset << "]: " << SymName << '\n';
+  }
+  for (const std::pair<StringRef, ArrayRef<little32_t>> &VBTable : VBTables) {
+    StringRef VBTableName = VBTable.first;
+    uint32_t Idx = 0;
+    for (little32_t Offset : VBTable.second) {
+      outs() << VBTableName << '[' << Idx << "]: " << Offset << '\n';
+      Idx += sizeof(Offset); // Idx is a byte offset, not an element index.
+    }
+  }
+  for (const std::pair<StringRef, CompleteObjectLocator> &COLPair : COLs) {
+    StringRef COLName = COLPair.first;
+    const CompleteObjectLocator &COL = COLPair.second;
+    outs() << COLName << "[IsImageRelative]: " << COL.Data[0] << '\n';
+    outs() << COLName << "[OffsetToTop]: " << COL.Data[1] << '\n';
+    outs() << COLName << "[VFPtrOffset]: " << COL.Data[2] << '\n';
+    outs() << COLName << "[TypeDescriptor]: " << COL.Symbols[0] << '\n';
+    outs() << COLName << "[ClassHierarchyDescriptor]: " << COL.Symbols[1] << '\n';
+  }
+  for (const std::pair<StringRef, ClassHierarchyDescriptor> &CHDPair : CHDs) {
+    StringRef CHDName = CHDPair.first;
+    const ClassHierarchyDescriptor &CHD = CHDPair.second;
+    outs() << CHDName << "[AlwaysZero]: " << CHD.Data[0] << '\n';
+    outs() << CHDName << "[Flags]: " << CHD.Data[1] << '\n';
+    outs() << CHDName << "[NumClasses]: " << CHD.Data[2] << '\n';
+    outs() << CHDName << "[BaseClassArray]: " << CHD.Symbols[0] << '\n';
+  }
+  for (const std::pair<std::pair<StringRef, uint64_t>, StringRef> &BCAEntry :
+       BCAEntries) {
+    StringRef BCAName = BCAEntry.first.first;
+    uint64_t Offset = BCAEntry.first.second;
+    StringRef SymName = BCAEntry.second;
+    outs() << BCAName << '[' << Offset << "]: " << SymName << '\n';
+  }
+  for (const std::pair<StringRef, BaseClassDescriptor> &BCDPair : BCDs) {
+    StringRef BCDName = BCDPair.first;
+    const BaseClassDescriptor &BCD = BCDPair.second;
+    outs() << BCDName << "[TypeDescriptor]: " << BCD.Symbols[0] << '\n';
+    outs() << BCDName << "[NumBases]: " << BCD.Data[0] << '\n';
+    outs() << BCDName << "[OffsetInVBase]: " << BCD.Data[1] << '\n';
+    outs() << BCDName << "[VBPtrOffset]: " << BCD.Data[2] << '\n';
+    outs() << BCDName << "[OffsetInVBTable]: " << BCD.Data[3] << '\n';
+    outs() << BCDName << "[Flags]: " << BCD.Data[4] << '\n';
+    outs() << BCDName << "[ClassHierarchyDescriptor]: " << BCD.Symbols[1] << '\n';
+  }
+  for (const std::pair<StringRef, TypeDescriptor> &TDPair : TDs) {
+    StringRef TDName = TDPair.first;
+    const TypeDescriptor &TD = TDPair.second;
+    outs() << TDName << "[VFPtr]: " << TD.Symbols[0] << '\n';
+    outs() << TDName << "[AlwaysZero]: " << TD.AlwaysZero << '\n';
+    outs() << TDName << "[MangledName]: ";
+    outs().write_escaped(TD.MangledName.rtrim(StringRef("\0", 1)), // Trailing NUL bytes are not printed.
+                         /*UseHexEscapes=*/true)
+        << '\n';
+  }
+  for (const std::pair<std::pair<StringRef, uint64_t>, StringRef> &VTTPair :
+       VTTEntries) {
+    StringRef VTTName = VTTPair.first.first;
+    uint64_t VTTOffset = VTTPair.first.second;
+    StringRef VTTEntry = VTTPair.second;
+    outs() << VTTName << '[' << VTTOffset << "]: " << VTTEntry << '\n';
+  }
+  for (const std::pair<StringRef, StringRef> &TIPair : TINames) {
+    StringRef TIName = TIPair.first;
+    outs() << TIName << ": " << TIPair.second << '\n';
+  }
+  auto VTableSymI = VTableSymEntries.begin();
+  auto VTableSymE = VTableSymEntries.end();
+  auto VTableDataI = VTableDataEntries.begin();
+  auto VTableDataE = VTableDataEntries.end();
+  for (;;) { // Interleave the symbol and data entries in ascending key order.
+    bool SymDone = VTableSymI == VTableSymE;
+    bool DataDone = VTableDataI == VTableDataE;
+    if (SymDone && DataDone)
+      break;
+    if (!SymDone && (DataDone || VTableSymI->first < VTableDataI->first)) {
+      StringRef VTableName = VTableSymI->first.first;
+      uint64_t Offset = VTableSymI->first.second;
+      StringRef VTableEntry = VTableSymI->second;
+      outs() << VTableName << '[' << Offset << "]: ";
+      outs() << VTableEntry;
+      outs() << '\n';
+      ++VTableSymI;
+      continue;
+    }
+    if (!DataDone && (SymDone || VTableDataI->first < VTableSymI->first)) {
+      StringRef VTableName = VTableDataI->first.first;
+      uint64_t Offset = VTableDataI->first.second;
+      int64_t VTableEntry = VTableDataI->second;
+      outs() << VTableName << '[' << Offset << "]: ";
+      outs() << VTableEntry;
+      outs() << '\n';
+      ++VTableDataI;
+      continue;
+    }
+  }
+}
+
+/// Dump the vtable data of every object-file member of \p Arc.
+///
+/// Failures are reported against the archive's file name.  A member that
+/// fails to open because it is not an object file at all is skipped silently;
+/// a member that opens as some other kind of binary is reported as an
+/// unrecognized file format.
+static void dumpArchive(const Archive *Arc) {
+  for (const Archive::Child &Member : Arc->children()) {
+    ErrorOr<std::unique_ptr<Binary>> MemberOrErr = Member.getAsBinary();
+    const std::error_code EC = MemberOrErr.getError();
+    if (EC) {
+      // Ignore non-object files.
+      if (EC != object_error::invalid_file_type)
+        reportError(Arc->getFileName(), EC.message());
+      continue;
+    }
+
+    Binary *MemberBin = &*MemberOrErr.get();
+    ObjectFile *Obj = dyn_cast<ObjectFile>(MemberBin);
+    if (!Obj) {
+      reportError(Arc->getFileName(),
+                  vtabledump_error::unrecognized_file_format);
+      continue;
+    }
+    dumpVTables(Obj);
+  }
+}
+
+/// Open \p File ("-" is treated as stdin) and dump the vtable data it holds.
+///
+/// Archives are dumped member by member and object files directly.  A missing
+/// file, a file that cannot be opened, or any other kind of binary is reported
+/// as an error.
+static void dumpInput(StringRef File) {
+  // Only real paths can be checked for existence; "-" denotes stdin.
+  const bool IsStdin = File == "-";
+  if (!IsStdin && !sys::fs::exists(File)) {
+    reportError(File, vtabledump_error::file_not_found);
+    return;
+  }
+
+  // Attempt to open the binary.
+  ErrorOr<OwningBinary<Binary>> OpenedOrErr = createBinary(File);
+  const std::error_code OpenEC = OpenedOrErr.getError();
+  if (OpenEC) {
+    reportError(File, OpenEC);
+    return;
+  }
+
+  // Dispatch on the concrete kind of binary that was opened.
+  object::Binary *Opened = &*OpenedOrErr.get().getBinary();
+  if (Archive *Arc = dyn_cast<Archive>(Opened)) {
+    dumpArchive(Arc);
+    return;
+  }
+  if (ObjectFile *Obj = dyn_cast<ObjectFile>(Opened)) {
+    dumpVTables(Obj);
+    return;
+  }
+  reportError(File, vtabledump_error::unrecognized_file_format);
+}
+
+/// Entry point: dump the vtable data of every input named on the command line.
+///
+/// \returns ReturnValue, which starts as EXIT_SUCCESS and is set to
+/// EXIT_FAILURE by the error-reporting helpers.
+int main(int argc, const char *argv[]) {
+  sys::PrintStackTraceOnErrorSignal();
+  PrettyStackTraceProgram X(argc, argv);
+  llvm_shutdown_obj Y;
+
+  // Initialize targets.
+  llvm::InitializeAllTargetInfos();
+
+  // Register the target printer for --version.
+  cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM VTable Dumper\n");
+
+  // Default to stdin if no filename is specified.
+  if (opts::InputFilenames.empty())
+    opts::InputFilenames.push_back("-");
+
+  // A range-based loop avoids std::for_each, whose header <algorithm> this
+  // file never includes directly.
+  for (const std::string &File : opts::InputFilenames)
+    dumpInput(File);
+
+  return ReturnValue;
+}

diff --git a/tools/llvm-vtabledump/llvm-vtabledump.h b/tools/llvm-vtabledump/llvm-vtabledump.h
new file mode 100644
index 0000000..62f7557
--- /dev/null
+++ b/tools/llvm-vtabledump/llvm-vtabledump.h

@@ -0,0 +1,23 @@
+//===-- llvm-vtabledump.h ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_VTABLEDUMP_LLVM_VTABLEDUMP_H
+#define LLVM_TOOLS_LLVM_VTABLEDUMP_LLVM_VTABLEDUMP_H
+
+#include "llvm/Support/CommandLine.h"
+#include <string>
+
+namespace opts {
+extern llvm::cl::list<std::string> InputFilenames;
+} // namespace opts
+
+#define LLVM_VTABLEDUMP_ENUM_ENT(ns, enum)                                     \
+  { #enum, ns::enum }
+
+#endif

diff --git a/tools/lto/CMakeLists.txt b/tools/lto/CMakeLists.txt
index 71391b7..559b22b 100644
--- a/tools/lto/CMakeLists.txt
+++ b/tools/lto/CMakeLists.txt

@@ -1,14 +1,11 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
-  Core
   LTO
   MC
   MCDisassembler
   Support
   )
 
-add_definitions( -DLLVM_VERSION_INFO=\"${PACKAGE_VERSION}\" )
-
 set(SOURCES
   LTODisassembler.cpp
   lto.cpp

diff --git a/tools/lto/Makefile b/tools/lto/Makefile
index a4fe9ac..530c05a 100644
--- a/tools/lto/Makefile
+++ b/tools/lto/Makefile

@@ -17,10 +17,6 @@
 
 include $(LEVEL)/Makefile.common
 
-ifdef LLVM_VERSION_INFO
-CXX.Flags += -DLLVM_VERSION_INFO='"$(LLVM_VERSION_INFO)"'
-endif
-
 ifeq ($(HOST_OS),Darwin)
     # Special hack to allow libLTO to have an offset version number.
     ifdef LLVM_LTO_VERSION_OFFSET

diff --git a/tools/lto/lto.cpp b/tools/lto/lto.cpp
index b401f9a..ef37c90 100644
--- a/tools/lto/lto.cpp
+++ b/tools/lto/lto.cpp

@@ -32,6 +32,10 @@
 DisableGVNLoadPRE("disable-gvn-loadpre", cl::init(false),
   cl::desc("Do not run the GVN load PRE pass"));
 
+static cl::opt<bool>
+DisableLTOVectorization("disable-lto-vectorization", cl::init(false),
+  cl::desc("Do not run loop or slp vectorization during LTO"));
+
 // Holds most recent error string.
 // *** Not thread safe ***
 static std::string sLastErrorString;
@@ -146,6 +150,24 @@
       LTOModule::createFromBuffer(mem, length, Options, sLastErrorString, path));
 }
 
+lto_module_t lto_module_create_in_local_context(const void *mem, size_t length,
+                                                const char *path) {
+  lto_initialize();
+  llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
+  return wrap(LTOModule::createInLocalContext(mem, length, Options, // Not thread safe:
+                                              sLastErrorString, path)); // uses the file-global sLastErrorString.
+}
+
+lto_module_t lto_module_create_in_codegen_context(const void *mem,
+                                                  size_t length,
+                                                  const char *path,
+                                                  lto_code_gen_t cg) {
+  lto_initialize();
+  llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
+  return wrap(LTOModule::createInContext(mem, length, Options, sLastErrorString, // Not thread safe: file-global error string.
+                                         path, &unwrap(cg)->getContext())); // Module is created in cg's context.
+}
+
 void lto_module_dispose(lto_module_t mod) { delete unwrap(mod); }
 
 const char* lto_module_get_target_triple(lto_module_t mod) {
@@ -205,7 +227,7 @@
 void lto_codegen_dispose(lto_code_gen_t cg) { delete unwrap(cg); }
 
 bool lto_codegen_add_module(lto_code_gen_t cg, lto_module_t mod) {
-  return !unwrap(cg)->addModule(unwrap(mod), sLastErrorString);
+  return !unwrap(cg)->addModule(unwrap(mod));
 }
 
 bool lto_codegen_set_debug_model(lto_code_gen_t cg, lto_debug_model debug) {
@@ -222,10 +244,6 @@
   return unwrap(cg)->setCpu(cpu);
 }
 
-void lto_codegen_set_attr(lto_code_gen_t cg, const char *attr) {
-  return unwrap(cg)->setAttr(attr);
-}
-
 void lto_codegen_set_assembler_path(lto_code_gen_t cg, const char *path) {
   // In here only for backwards compatibility. We use MC now.
 }
@@ -256,7 +274,8 @@
     parsedOptions = true;
   }
   return unwrap(cg)->compile(length, DisableOpt, DisableInline,
-                             DisableGVNLoadPRE, sLastErrorString);
+                             DisableGVNLoadPRE, DisableLTOVectorization,
+                             sLastErrorString);
 }
 
 bool lto_codegen_compile_to_file(lto_code_gen_t cg, const char **name) {
@@ -265,8 +284,9 @@
     lto_add_attrs(cg);
     parsedOptions = true;
   }
-  return !unwrap(cg)->compile_to_file(name, DisableOpt, DisableInline,
-                                      DisableGVNLoadPRE, sLastErrorString);
+  return !unwrap(cg)->compile_to_file(
+      name, DisableOpt, DisableInline, DisableGVNLoadPRE,
+      DisableLTOVectorization, sLastErrorString);
 }
 
 void lto_codegen_debug_options(lto_code_gen_t cg, const char *opt) {

diff --git a/tools/macho-dump/macho-dump.cpp b/tools/macho-dump/macho-dump.cpp
index 7600979..aac720d 100644
--- a/tools/macho-dump/macho-dump.cpp
+++ b/tools/macho-dump/macho-dump.cpp

@@ -324,7 +324,7 @@
                const MachOObjectFile::LoadCommandInfo &LCI) {
   MachO::version_min_command VMLC = Obj.getVersionMinLoadCommand(LCI);
   outs() << "  ('version, " << VMLC.version << ")\n"
-         << "  ('reserved, " << VMLC.reserved << ")\n";
+         << "  ('sdk, " << VMLC.sdk << ")\n";
   return 0;
 }
 
@@ -403,12 +403,12 @@
 
   cl::ParseCommandLineOptions(argc, argv, "llvm Mach-O dumping tool\n");
 
-  ErrorOr<Binary *> BinaryOrErr = createBinary(InputFile);
+  ErrorOr<OwningBinary<Binary>> BinaryOrErr = createBinary(InputFile);
   if (std::error_code EC = BinaryOrErr.getError())
     return Error("unable to read input: '" + EC.message() + "'");
-  std::unique_ptr<Binary> Binary(BinaryOrErr.get());
+  Binary &Binary = *BinaryOrErr.get().getBinary();
 
-  const MachOObjectFile *InputObject = dyn_cast<MachOObjectFile>(Binary.get());
+  const MachOObjectFile *InputObject = dyn_cast<MachOObjectFile>(&Binary);
   if (!InputObject)
     return Error("Not a MachO object");
 

diff --git a/tools/msbuild/CMakeLists.txt b/tools/msbuild/CMakeLists.txt
index b7be71d..4f471e5 100644
--- a/tools/msbuild/CMakeLists.txt
+++ b/tools/msbuild/CMakeLists.txt

@@ -10,6 +10,8 @@
     set(prop_file_v110_xp "Microsoft.Cpp.${platform}.LLVM-vs2012_xp.props")
     set(prop_file_v120 "toolset-vs2013.props")
     set(prop_file_v120_xp "toolset-vs2013_xp.props")
+    set(prop_file_v140 "toolset-vs2014.props")
+    set(prop_file_v140_xp "toolset-vs2014_xp.props")
 
     if (platform STREQUAL "Win32")
       set(mflag "m32")
@@ -29,6 +31,11 @@
     configure_file(${prop_file_in} ${platform}/${prop_file_v120})
     set(VS_VERSION "v120_xp")
     configure_file(${prop_file_in} ${platform}/${prop_file_v120_xp})
+    set(VS_VERSION "v140")
+    set(MSC_VERSION "1900")
+    configure_file(${prop_file_in} ${platform}/${prop_file_v140})
+    set(VS_VERSION "v140_xp")
+    configure_file(${prop_file_in} ${platform}/${prop_file_v140_xp})
     set(VS_VERSION)
     set(MSC_VERSION)
     set(mflag)
@@ -38,12 +45,16 @@
     install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${platform}/${prop_file_v110_xp}" DESTINATION tools/msbuild/${platform})
     install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${platform}/${prop_file_v120}" DESTINATION tools/msbuild/${platform})
     install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${platform}/${prop_file_v120_xp}" DESTINATION tools/msbuild/${platform})
+    install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${platform}/${prop_file_v140}" DESTINATION tools/msbuild/${platform})
+    install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${platform}/${prop_file_v140_xp}" DESTINATION tools/msbuild/${platform})
 
     install(FILES "Microsoft.Cpp.Win32.LLVM-vs2010.targets" DESTINATION "tools/msbuild/${platform}" RENAME "Microsoft.Cpp.${platform}.LLVM-vs2010.targets")
     install(FILES "Microsoft.Cpp.Win32.LLVM-vs2012.targets" DESTINATION "tools/msbuild/${platform}" RENAME "Microsoft.Cpp.${platform}.LLVM-vs2012.targets")
     install(FILES "Microsoft.Cpp.Win32.LLVM-vs2012_xp.targets" DESTINATION "tools/msbuild/${platform}" RENAME "Microsoft.Cpp.${platform}.LLVM-vs2012_xp.targets")
     install(FILES "toolset-vs2013.targets" DESTINATION "tools/msbuild/${platform}")
     install(FILES "toolset-vs2013_xp.targets" DESTINATION "tools/msbuild/${platform}")
+    install(FILES "toolset-vs2014.targets" DESTINATION "tools/msbuild/${platform}")
+    install(FILES "toolset-vs2014_xp.targets" DESTINATION "tools/msbuild/${platform}")
   endforeach()
 
   set(LIB_PATH_VERSION)

diff --git a/tools/msbuild/install.bat b/tools/msbuild/install.bat
index 9880fb2..6e321e3 100644
--- a/tools/msbuild/install.bat
+++ b/tools/msbuild/install.bat

@@ -6,13 +6,15 @@
 REM Change to the directory of this batch file.

 cd /d %~dp0

 

+REM Loop over the two platforms in awkward batch file fashion.

 set PLATFORM=None

-:START

-IF %PLATFORM% == x64 GOTO LOOPEND

+:PLATFORMLOOPHEAD

+IF %PLATFORM% == x64 GOTO PLATFORMLOOPEND

 IF %PLATFORM% == Win32 SET PLATFORM=x64

 IF %PLATFORM% == None SET PLATFORM=Win32

 

 REM Search for the MSBuild toolsets directory.

+

 SET D="%ProgramFiles%\MSBuild\Microsoft.Cpp\v4.0\Platforms\%PLATFORM%\PlatformToolsets"

 IF EXIST %D% GOTO FOUND_V100

 SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\Platforms\%PLATFORM%\PlatformToolsets"

@@ -30,13 +32,24 @@
 SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\V120\Platforms\%PLATFORM%\PlatformToolsets"

 IF EXIST %D% GOTO FOUND_V120

 

-:LOOPEND

+:TRY_V140

+SET D="%ProgramFiles%\MSBuild\Microsoft.Cpp\v4.0\V140\Platforms\%PLATFORM%\PlatformToolsets"

+IF EXIST %D% GOTO FOUND_V140

+SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\V140\Platforms\%PLATFORM%\PlatformToolsets"

+IF EXIST %D% GOTO FOUND_V140

+

+:TRY_V150

+

+GOTO PLATFORMLOOPHEAD

+

+:PLATFORMLOOPEND

 IF %SUCCESS% == 1 goto DONE

 echo Failed to find MSBuild toolsets directory.

 goto FAILED

 

 

 :FOUND_V100

+REM Routine for installing v100 toolchain.

 IF NOT EXIST %D%\LLVM-vs2010 mkdir %D%\LLVM-vs2010

 IF NOT %ERRORLEVEL% == 0 GOTO FAILED

 copy %PLATFORM%\Microsoft.Cpp.%PLATFORM%.LLVM-vs2010.props %D%\LLVM-vs2010

@@ -47,6 +60,7 @@
 GOTO TRY_V110

 

 :FOUND_V110

+REM Routine for installing v110 toolchain.

 IF NOT EXIST %D%\LLVM-vs2012 mkdir %D%\LLVM-vs2012

 IF NOT %ERRORLEVEL% == 0 GOTO FAILED

 copy %PLATFORM%\Microsoft.Cpp.%PLATFORM%.LLVM-vs2012.props %D%\LLVM-vs2012

@@ -63,6 +77,7 @@
 GOTO TRY_V120

 

 :FOUND_V120

+REM Routine for installing v120 toolchain.

 IF NOT EXIST %D%\LLVM-vs2013 mkdir %D%\LLVM-vs2013

 IF NOT %ERRORLEVEL% == 0 GOTO FAILED

 copy %PLATFORM%\toolset-vs2013.props %D%\LLVM-vs2013\toolset.props

@@ -76,7 +91,24 @@
 copy %PLATFORM%\toolset-vs2013_xp.targets %D%\LLVM-vs2013_xp\toolset.targets

 IF NOT %ERRORLEVEL% == 0 GOTO FAILED

 set SUCCESS=1

-GOTO START

+GOTO TRY_V140

+

+:FOUND_V140

+REM Routine for installing v140 toolchain.

+IF NOT EXIST %D%\LLVM-vs2014 mkdir %D%\LLVM-vs2014

+IF NOT %ERRORLEVEL% == 0 GOTO FAILED

+copy %PLATFORM%\toolset-vs2014.props %D%\LLVM-vs2014\toolset.props

+IF NOT %ERRORLEVEL% == 0 GOTO FAILED

+copy %PLATFORM%\toolset-vs2014.targets %D%\LLVM-vs2014\toolset.targets

+IF NOT %ERRORLEVEL% == 0 GOTO FAILED

+IF NOT EXIST %D%\LLVM-vs2014_xp mkdir %D%\LLVM-vs2014_xp

+IF NOT %ERRORLEVEL% == 0 GOTO FAILED

+copy %PLATFORM%\toolset-vs2014_xp.props %D%\LLVM-vs2014_xp\toolset.props

+IF NOT %ERRORLEVEL% == 0 GOTO FAILED

+copy %PLATFORM%\toolset-vs2014_xp.targets %D%\LLVM-vs2014_xp\toolset.targets

+IF NOT %ERRORLEVEL% == 0 GOTO FAILED

+set SUCCESS=1

+GOTO TRY_V150

 

 

 :DONE


diff --git a/tools/msbuild/toolset-vs2014.targets b/tools/msbuild/toolset-vs2014.targets
new file mode 100644
index 0000000..05b59a2
--- /dev/null
+++ b/tools/msbuild/toolset-vs2014.targets

@@ -0,0 +1,3 @@
+<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">

+  <Import Project="$(VCTargetsPath)\Microsoft.CppCommon.targets" />

+</Project>


diff --git a/tools/msbuild/toolset-vs2014_xp.targets b/tools/msbuild/toolset-vs2014_xp.targets
new file mode 100644
index 0000000..eec4f18
--- /dev/null
+++ b/tools/msbuild/toolset-vs2014_xp.targets

@@ -0,0 +1,21 @@
+<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">

+  <!-- Force TargetFrameworkVersion to v4.0 to support XP-->

+  <PropertyGroup>

+    <TargetFrameworkVersion>v4.0</TargetFrameworkVersion>

+    <BeforeClCompileTargets>NoSupportCodeAnalysisXP;$(BeforeClCompileTargets)</BeforeClCompileTargets>

+  </PropertyGroup>

+

+  <Import Project="$(VCTargetsPath)\Microsoft.CppCommon.targets" />

+

+  <Target Name="NoSupportCodeAnalysisXP" Condition="'$(ErrorNoSupportCodeAnalysisXP)' != 'false'">

+    <VCMessage Condition="'$(DesignTimeBuild)' != 'true' and '@(ClCompile->AnyHaveMetadataValue('EnablePREfast', 'true'))'=='true'" Code="MSB8026" Type="Error"/>

+  </Target>

+

+  <PropertyGroup>

+    <PrepareForBuildDependsOn>CheckWindowsSDK71A;$(PrepareForBuildDependsOn)</PrepareForBuildDependsOn>

+  </PropertyGroup>

+

+  <Target Name="CheckWindowsSDK71A">

+    <VCMessage Code="MSB8003" Type="Warning" Arguments="WindowsSdkDir_71A" Condition="'$(WindowsSdkDir_71A)'=='' and '$(UseEnv)' != 'true'" />

+  </Target>

+</Project>


diff --git a/tools/msbuild/uninstall.bat b/tools/msbuild/uninstall.bat
index b0bc943..c1afae2 100644
--- a/tools/msbuild/uninstall.bat
+++ b/tools/msbuild/uninstall.bat

@@ -6,8 +6,8 @@
 cd /d %~dp0

 

 set PLATFORM=None

-:START

-IF %PLATFORM% == x64 GOTO END

+:LOOPHEAD

+IF %PLATFORM% == x64 GOTO LOOPEND

 IF %PLATFORM% == Win32 SET PLATFORM=x64

 IF %PLATFORM% == None SET PLATFORM=Win32

 

@@ -51,8 +51,23 @@
 IF EXIST %D%\LLVM-vs2013_xp del %D%\LLVM-vs2013_xp\toolset.targets

 IF EXIST %D%\LLVM-vs2013_xp rmdir %D%\LLVM-vs2013_xp

 

+SET D="%ProgramFiles%\MSBuild\Microsoft.Cpp\v4.0\V140\Platforms\%PLATFORM%\PlatformToolsets"

+IF EXIST %D%\LLVM-vs2014 del %D%\LLVM-vs2014\toolset.props

+IF EXIST %D%\LLVM-vs2014 del %D%\LLVM-vs2014\toolset.targets

+IF EXIST %D%\LLVM-vs2014 rmdir %D%\LLVM-vs2014

+IF EXIST %D%\LLVM-vs2014_xp del %D%\LLVM-vs2014_xp\toolset.props

+IF EXIST %D%\LLVM-vs2014_xp del %D%\LLVM-vs2014_xp\toolset.targets

+IF EXIST %D%\LLVM-vs2014_xp rmdir %D%\LLVM-vs2014_xp

+SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\V140\Platforms\%PLATFORM%\PlatformToolsets"

+IF EXIST %D%\LLVM-vs2014 del %D%\LLVM-vs2014\toolset.props

+IF EXIST %D%\LLVM-vs2014 del %D%\LLVM-vs2014\toolset.targets

+IF EXIST %D%\LLVM-vs2014 rmdir %D%\LLVM-vs2014

+IF EXIST %D%\LLVM-vs2014_xp del %D%\LLVM-vs2014_xp\toolset.props

+IF EXIST %D%\LLVM-vs2014_xp del %D%\LLVM-vs2014_xp\toolset.targets

+IF EXIST %D%\LLVM-vs2014_xp rmdir %D%\LLVM-vs2014_xp

 

-GOTO START

 

-:END

+GOTO LOOPHEAD

+

+:LOOPEND

 echo Done!


diff --git a/tools/obj2yaml/CMakeLists.txt b/tools/obj2yaml/CMakeLists.txt
index f167ed5..3cdac5c 100644
--- a/tools/obj2yaml/CMakeLists.txt
+++ b/tools/obj2yaml/CMakeLists.txt

@@ -3,6 +3,6 @@
   Support
   )
 
-add_llvm_utility(obj2yaml
+add_llvm_tool(obj2yaml
   obj2yaml.cpp coff2yaml.cpp elf2yaml.cpp Error.cpp
   )

diff --git a/tools/obj2yaml/Error.cpp b/tools/obj2yaml/Error.cpp
index 0074128..abef8af 100644
--- a/tools/obj2yaml/Error.cpp
+++ b/tools/obj2yaml/Error.cpp

@@ -20,7 +20,9 @@
 };
 } // namespace
 
-const char *_obj2yaml_error_category::name() const { return "obj2yaml"; }
+const char *_obj2yaml_error_category::name() const LLVM_NOEXCEPT {
+  return "obj2yaml";
+}
 
 std::string _obj2yaml_error_category::message(int ev) const {
   switch (static_cast<obj2yaml_error>(ev)) {

diff --git a/tools/obj2yaml/Error.h b/tools/obj2yaml/Error.h
index 4657f0d..982f59e 100644
--- a/tools/obj2yaml/Error.h
+++ b/tools/obj2yaml/Error.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_ERROR_H
-#define LLVM_TOOLS_ERROR_H
+#ifndef LLVM_TOOLS_OBJ2YAML_ERROR_H
+#define LLVM_TOOLS_OBJ2YAML_ERROR_H
 
 #include <system_error>
 

diff --git a/tools/obj2yaml/Makefile b/tools/obj2yaml/Makefile
index 95f393d..6cbef69 100644
--- a/tools/obj2yaml/Makefile
+++ b/tools/obj2yaml/Makefile

@@ -14,7 +14,4 @@
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS = 1
 
-# Don't install this utility
-NO_INSTALL = 1
-
 include $(LEVEL)/Makefile.common

diff --git a/tools/obj2yaml/coff2yaml.cpp b/tools/obj2yaml/coff2yaml.cpp
index fed4533..5baa644 100644
--- a/tools/obj2yaml/coff2yaml.cpp
+++ b/tools/obj2yaml/coff2yaml.cpp

@@ -20,7 +20,9 @@
 class COFFDumper {
   const object::COFFObjectFile &Obj;
   COFFYAML::Object YAMLObj;
-  void dumpHeader(const object::coff_file_header *Header);
+  template <typename T>
+  void dumpOptionalHeader(T OptionalHeader);
+  void dumpHeader();
   void dumpSections(unsigned numSections);
   void dumpSymbols(unsigned numSymbols);
 
@@ -31,40 +33,90 @@
 
 }
 
-static void check(std::error_code ec) {
-  if (ec)
-    report_fatal_error(ec.message());
-}
-
 COFFDumper::COFFDumper(const object::COFFObjectFile &Obj) : Obj(Obj) {
-  const object::coff_file_header *Header;
-  check(Obj.getCOFFHeader(Header));
-  dumpHeader(Header);
-  dumpSections(Header->NumberOfSections);
-  dumpSymbols(Header->NumberOfSymbols);
+  const object::pe32_header *PE32Header = nullptr;
+  Obj.getPE32Header(PE32Header);
+  if (PE32Header) {
+    dumpOptionalHeader(PE32Header);
+  } else {
+    const object::pe32plus_header *PE32PlusHeader = nullptr;
+    Obj.getPE32PlusHeader(PE32PlusHeader);
+    if (PE32PlusHeader) {
+      dumpOptionalHeader(PE32PlusHeader);
+    }
+  }
+  dumpHeader();
+  dumpSections(Obj.getNumberOfSections());
+  dumpSymbols(Obj.getNumberOfSymbols());
 }
 
-void COFFDumper::dumpHeader(const object::coff_file_header *Header) {
-  YAMLObj.Header.Machine = Header->Machine;
-  YAMLObj.Header.Characteristics = Header->Characteristics;
+template <typename T> void COFFDumper::dumpOptionalHeader(T OptionalHeader) {
+  YAMLObj.OptionalHeader = COFFYAML::PEHeader();
+  YAMLObj.OptionalHeader->Header.AddressOfEntryPoint =
+      OptionalHeader->AddressOfEntryPoint;
+  YAMLObj.OptionalHeader->Header.AddressOfEntryPoint =
+      OptionalHeader->AddressOfEntryPoint;
+  YAMLObj.OptionalHeader->Header.ImageBase = OptionalHeader->ImageBase;
+  YAMLObj.OptionalHeader->Header.SectionAlignment =
+      OptionalHeader->SectionAlignment;
+  YAMLObj.OptionalHeader->Header.FileAlignment = OptionalHeader->FileAlignment;
+  YAMLObj.OptionalHeader->Header.MajorOperatingSystemVersion =
+      OptionalHeader->MajorOperatingSystemVersion;
+  YAMLObj.OptionalHeader->Header.MinorOperatingSystemVersion =
+      OptionalHeader->MinorOperatingSystemVersion;
+  YAMLObj.OptionalHeader->Header.MajorImageVersion =
+      OptionalHeader->MajorImageVersion;
+  YAMLObj.OptionalHeader->Header.MinorImageVersion =
+      OptionalHeader->MinorImageVersion;
+  YAMLObj.OptionalHeader->Header.MajorSubsystemVersion =
+      OptionalHeader->MajorSubsystemVersion;
+  YAMLObj.OptionalHeader->Header.MinorSubsystemVersion =
+      OptionalHeader->MinorSubsystemVersion;
+  YAMLObj.OptionalHeader->Header.Subsystem = OptionalHeader->Subsystem;
+  YAMLObj.OptionalHeader->Header.DLLCharacteristics =
+      OptionalHeader->DLLCharacteristics;
+  YAMLObj.OptionalHeader->Header.SizeOfStackReserve =
+      OptionalHeader->SizeOfStackReserve;
+  YAMLObj.OptionalHeader->Header.SizeOfStackCommit =
+      OptionalHeader->SizeOfStackCommit;
+  YAMLObj.OptionalHeader->Header.SizeOfHeapReserve =
+      OptionalHeader->SizeOfHeapReserve;
+  YAMLObj.OptionalHeader->Header.SizeOfHeapCommit =
+      OptionalHeader->SizeOfHeapCommit;
+  unsigned I = 0;
+  for (auto &DestDD : YAMLObj.OptionalHeader->DataDirectories) {
+    const object::data_directory *DD;
+    if (Obj.getDataDirectory(I++, DD))
+      continue;
+    DestDD = COFF::DataDirectory();
+    DestDD->RelativeVirtualAddress = DD->RelativeVirtualAddress;
+    DestDD->Size = DD->Size;
+  }
+}
+
+void COFFDumper::dumpHeader() {
+  YAMLObj.Header.Machine = Obj.getMachine();
+  YAMLObj.Header.Characteristics = Obj.getCharacteristics();
 }
 
 void COFFDumper::dumpSections(unsigned NumSections) {
-  std::vector<COFFYAML::Section> &Sections = YAMLObj.Sections;
-  for (const auto &Section : Obj.sections()) {
-    const object::coff_section *Sect = Obj.getCOFFSection(Section);
-    COFFYAML::Section Sec;
-    Sec.Name = Sect->Name; // FIXME: check the null termination!
-    uint32_t Characteristics = Sect->Characteristics;
-    Sec.Header.Characteristics = Characteristics;
-    Sec.Alignment = 1 << (((Characteristics >> 20) & 0xf) - 1);
+  std::vector<COFFYAML::Section> &YAMLSections = YAMLObj.Sections;
+  for (const auto &ObjSection : Obj.sections()) {
+    const object::coff_section *COFFSection = Obj.getCOFFSection(ObjSection);
+    COFFYAML::Section NewYAMLSection;
+    ObjSection.getName(NewYAMLSection.Name);
+    NewYAMLSection.Header.Characteristics = COFFSection->Characteristics;
+    NewYAMLSection.Header.VirtualAddress = ObjSection.getAddress();
+    NewYAMLSection.Header.VirtualSize = COFFSection->VirtualSize;
+    NewYAMLSection.Alignment = ObjSection.getAlignment();
 
     ArrayRef<uint8_t> sectionData;
-    Obj.getSectionContents(Sect, sectionData);
-    Sec.SectionData = yaml::BinaryRef(sectionData);
+    if (!ObjSection.isBSS())
+      Obj.getSectionContents(COFFSection, sectionData);
+    NewYAMLSection.SectionData = yaml::BinaryRef(sectionData);
 
     std::vector<COFFYAML::Relocation> Relocations;
-    for (const auto &Reloc : Section.relocations()) {
+    for (const auto &Reloc : ObjSection.relocations()) {
       const object::coff_relocation *reloc = Obj.getCOFFRelocation(Reloc);
       COFFYAML::Relocation Rel;
       object::symbol_iterator Sym = Reloc.getSymbol();
@@ -73,8 +125,8 @@
       Rel.Type = reloc->Type;
       Relocations.push_back(Rel);
     }
-    Sec.Relocations = Relocations;
-    Sections.push_back(Sec);
+    NewYAMLSection.Relocations = Relocations;
+    YAMLSections.push_back(NewYAMLSection);
   }
 }
 
@@ -111,13 +163,15 @@
 
 static void
 dumpSectionDefinition(COFFYAML::Symbol *Sym,
-                      const object::coff_aux_section_definition *ObjSD) {
+                      const object::coff_aux_section_definition *ObjSD,
+                      bool IsBigObj) {
   COFF::AuxiliarySectionDefinition YAMLASD;
+  int32_t AuxNumber = ObjSD->getNumber(IsBigObj);
   YAMLASD.Length = ObjSD->Length;
   YAMLASD.NumberOfRelocations = ObjSD->NumberOfRelocations;
   YAMLASD.NumberOfLinenumbers = ObjSD->NumberOfLinenumbers;
   YAMLASD.CheckSum = ObjSD->CheckSum;
-  YAMLASD.Number = ObjSD->Number;
+  YAMLASD.Number = AuxNumber;
   YAMLASD.Selection = ObjSD->Selection;
 
   Sym->SectionDefinition = YAMLASD;
@@ -136,63 +190,64 @@
 void COFFDumper::dumpSymbols(unsigned NumSymbols) {
   std::vector<COFFYAML::Symbol> &Symbols = YAMLObj.Symbols;
   for (const auto &S : Obj.symbols()) {
-    const object::coff_symbol *Symbol = Obj.getCOFFSymbol(S);
+    object::COFFSymbolRef Symbol = Obj.getCOFFSymbol(S);
     COFFYAML::Symbol Sym;
     Obj.getSymbolName(Symbol, Sym.Name);
-    Sym.SimpleType = COFF::SymbolBaseType(Symbol->getBaseType());
-    Sym.ComplexType = COFF::SymbolComplexType(Symbol->getComplexType());
-    Sym.Header.StorageClass = Symbol->StorageClass;
-    Sym.Header.Value = Symbol->Value;
-    Sym.Header.SectionNumber = Symbol->SectionNumber;
-    Sym.Header.NumberOfAuxSymbols = Symbol->NumberOfAuxSymbols;
+    Sym.SimpleType = COFF::SymbolBaseType(Symbol.getBaseType());
+    Sym.ComplexType = COFF::SymbolComplexType(Symbol.getComplexType());
+    Sym.Header.StorageClass = Symbol.getStorageClass();
+    Sym.Header.Value = Symbol.getValue();
+    Sym.Header.SectionNumber = Symbol.getSectionNumber();
+    Sym.Header.NumberOfAuxSymbols = Symbol.getNumberOfAuxSymbols();
 
-    if (Symbol->NumberOfAuxSymbols > 0) {
+    if (Symbol.getNumberOfAuxSymbols() > 0) {
       ArrayRef<uint8_t> AuxData = Obj.getSymbolAuxData(Symbol);
-      if (Symbol->isFunctionDefinition()) {
+      if (Symbol.isFunctionDefinition()) {
         // This symbol represents a function definition.
-        assert(Symbol->NumberOfAuxSymbols == 1 &&
+        assert(Symbol.getNumberOfAuxSymbols() == 1 &&
                "Expected a single aux symbol to describe this function!");
 
         const object::coff_aux_function_definition *ObjFD =
             reinterpret_cast<const object::coff_aux_function_definition *>(
                 AuxData.data());
         dumpFunctionDefinition(&Sym, ObjFD);
-      } else if (Symbol->isFunctionLineInfo()) {
+      } else if (Symbol.isFunctionLineInfo()) {
         // This symbol describes function line number information.
-        assert(Symbol->NumberOfAuxSymbols == 1 &&
-               "Exepected a single aux symbol to describe this section!");
+        assert(Symbol.getNumberOfAuxSymbols() == 1 &&
+               "Expected a single aux symbol to describe this function!");
 
         const object::coff_aux_bf_and_ef_symbol *ObjBES =
             reinterpret_cast<const object::coff_aux_bf_and_ef_symbol *>(
                 AuxData.data());
         dumpbfAndEfLineInfo(&Sym, ObjBES);
-      } else if (Symbol->isWeakExternal()) {
+      } else if (Symbol.isAnyUndefined()) {
         // This symbol represents a weak external definition.
-        assert(Symbol->NumberOfAuxSymbols == 1 &&
-               "Exepected a single aux symbol to describe this section!");
+        assert(Symbol.getNumberOfAuxSymbols() == 1 &&
+               "Expected a single aux symbol to describe this weak symbol!");
 
         const object::coff_aux_weak_external *ObjWE =
             reinterpret_cast<const object::coff_aux_weak_external *>(
                 AuxData.data());
         dumpWeakExternal(&Sym, ObjWE);
-      } else if (Symbol->isFileRecord()) {
+      } else if (Symbol.isFileRecord()) {
         // This symbol represents a file record.
         Sym.File = StringRef(reinterpret_cast<const char *>(AuxData.data()),
-                             Symbol->NumberOfAuxSymbols * COFF::SymbolSize)
+                             Symbol.getNumberOfAuxSymbols() *
+                                 Obj.getSymbolTableEntrySize())
                        .rtrim(StringRef("\0", /*length=*/1));
-      } else if (Symbol->isSectionDefinition()) {
+      } else if (Symbol.isSectionDefinition()) {
         // This symbol represents a section definition.
-        assert(Symbol->NumberOfAuxSymbols == 1 &&
+        assert(Symbol.getNumberOfAuxSymbols() == 1 &&
                "Expected a single aux symbol to describe this section!");
 
         const object::coff_aux_section_definition *ObjSD =
             reinterpret_cast<const object::coff_aux_section_definition *>(
                 AuxData.data());
-        dumpSectionDefinition(&Sym, ObjSD);
-      } else if (Symbol->isCLRToken()) {
+        dumpSectionDefinition(&Sym, ObjSD, Symbol.isBigObj());
+      } else if (Symbol.isCLRToken()) {
         // This symbol represents a CLR token definition.
-        assert(Symbol->NumberOfAuxSymbols == 1 &&
-               "Expected a single aux symbol to describe this CLR Token");
+        assert(Symbol.getNumberOfAuxSymbols() == 1 &&
+               "Expected a single aux symbol to describe this CLR Token!");
 
         const object::coff_aux_clr_token *ObjCLRToken =
             reinterpret_cast<const object::coff_aux_clr_token *>(

diff --git a/tools/obj2yaml/elf2yaml.cpp b/tools/obj2yaml/elf2yaml.cpp
index 8b53ee7..d770ce1 100644
--- a/tools/obj2yaml/elf2yaml.cpp
+++ b/tools/obj2yaml/elf2yaml.cpp

@@ -133,7 +133,7 @@
   S.Type = Sym->getType();
   S.Value = Sym->st_value;
   S.Size = Sym->st_size;
-  S.Visibility = Sym->st_other & 0x3;
+  S.Other = Sym->st_other;
 
   ErrorOr<StringRef> NameOrErr = Obj.getSymbolName(Sym);
   if (std::error_code EC = NameOrErr.getError())

diff --git a/tools/obj2yaml/obj2yaml.cpp b/tools/obj2yaml/obj2yaml.cpp
index 944314a..b64096d 100644
--- a/tools/obj2yaml/obj2yaml.cpp
+++ b/tools/obj2yaml/obj2yaml.cpp

@@ -32,13 +32,13 @@
   if (File != "-" && !sys::fs::exists(File))
     return obj2yaml_error::file_not_found;
 
-  ErrorOr<Binary *> BinaryOrErr = createBinary(File);
+  ErrorOr<OwningBinary<Binary>> BinaryOrErr = createBinary(File);
   if (std::error_code EC = BinaryOrErr.getError())
     return EC;
 
-  std::unique_ptr<Binary> Binary(BinaryOrErr.get());
+  Binary &Binary = *BinaryOrErr.get().getBinary();
   // TODO: If this is an archive, then burst it and dump each entry
-  if (ObjectFile *Obj = dyn_cast<ObjectFile>(Binary.get()))
+  if (ObjectFile *Obj = dyn_cast<ObjectFile>(&Binary))
     return dumpObject(*Obj);
 
   return obj2yaml_error::unrecognized_file_format;

diff --git a/tools/obj2yaml/obj2yaml.h b/tools/obj2yaml/obj2yaml.h
index 6d81110..643ab7b 100644
--- a/tools/obj2yaml/obj2yaml.h
+++ b/tools/obj2yaml/obj2yaml.h

@@ -10,8 +10,8 @@
 // source file, implement it.
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_OBJ2YAML_H
-#define LLVM_TOOLS_OBJ2YAML_H
+#ifndef LLVM_TOOLS_OBJ2YAML_OBJ2YAML_H
+#define LLVM_TOOLS_OBJ2YAML_OBJ2YAML_H
 
 #include "llvm/Object/COFF.h"
 #include "llvm/Support/raw_ostream.h"

diff --git a/tools/opt/Android.mk b/tools/opt/Android.mk
index 6f3f48d..fc3c8d0 100644
--- a/tools/opt/Android.mk
+++ b/tools/opt/Android.mk

@@ -60,6 +60,7 @@
   libLLVMMC \
   libLLVMMCParser \
   libLLVMObject \
+  libLLVMProfileData \
   libLLVMCore \
   libLLVMAsmParser \
   libLLVMOption \

diff --git a/tools/opt/BreakpointPrinter.cpp b/tools/opt/BreakpointPrinter.cpp
index 44f4a11..3cbc0ae 100644
--- a/tools/opt/BreakpointPrinter.cpp
+++ b/tools/opt/BreakpointPrinter.cpp

@@ -62,7 +62,7 @@
           continue;
         getContextName(SP.getContext().resolve(TypeIdentifierMap), Name);
         Name = Name + SP.getDisplayName().str();
-        if (!Name.empty() && Processed.insert(Name)) {
+        if (!Name.empty() && Processed.insert(Name).second) {
           Out << Name << "\n";
         }
       }

diff --git a/tools/opt/NewPMDriver.h b/tools/opt/NewPMDriver.h
index 3661d3e..f977bac 100644
--- a/tools/opt/NewPMDriver.h
+++ b/tools/opt/NewPMDriver.h

@@ -18,8 +18,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_OPT_NEW_PM_DRIVER_H
-#define LLVM_TOOLS_OPT_NEW_PM_DRIVER_H
+#ifndef LLVM_TOOLS_OPT_NEWPMDRIVER_H
+#define LLVM_TOOLS_OPT_NEWPMDRIVER_H
 
 #include "llvm/ADT/StringRef.h"
 

diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 6ba6340..cdd22e4 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp

@@ -109,14 +109,6 @@
                      cl::desc("Do not run any optimization passes"));
 
 static cl::opt<bool>
-DisableInternalize("disable-internalize",
-                   cl::desc("Do not mark all symbols as internal"));
-
-static cl::opt<bool>
-StandardCompileOpts("std-compile-opts",
-                   cl::desc("Include the standard compile time optimizations"));
-
-static cl::opt<bool>
 StandardLinkOpts("std-link-opts",
                  cl::desc("Include the standard link time optimizations"));
 
@@ -145,7 +137,7 @@
 
 static cl::opt<bool>
 UnitAtATime("funit-at-a-time",
-            cl::desc("Enable IPO. This is same as llvm-gcc's -funit-at-a-time"),
+            cl::desc("Enable IPO. This corresponds to gcc's -funit-at-a-time"),
             cl::init(true));
 
 static cl::opt<bool>
@@ -198,9 +190,8 @@
   }
 }
 
-/// AddOptimizationPasses - This routine adds optimization passes
-/// based on selected optimization level, OptLevel. This routine
-/// duplicates llvm-gcc behaviour.
+/// This routine adds optimization passes based on selected optimization level,
+/// OptLevel.
 ///
 /// OptLevel - Optimization Level
 static void AddOptimizationPasses(PassManagerBase &MPM,FunctionPassManager &FPM,
@@ -238,41 +229,16 @@
   Builder.populateModulePassManager(MPM);
 }
 
-static void AddStandardCompilePasses(PassManagerBase &PM) {
-  PM.add(createVerifierPass());                  // Verify that input is correct
-
-  // If the -strip-debug command line option was specified, do it.
-  if (StripDebug)
-    addPass(PM, createStripSymbolsPass(true));
-
-  // Verify debug info only after it's (possibly) stripped.
-  PM.add(createDebugInfoVerifierPass());
-
-  if (DisableOptimizations) return;
-
-  // -std-compile-opts adds the same module passes as -O3.
+static void AddStandardLinkPasses(PassManagerBase &PM) {
   PassManagerBuilder Builder;
+  Builder.VerifyInput = true;
+  Builder.StripDebug = StripDebug;
+  if (DisableOptimizations)
+    Builder.OptLevel = 0;
+
   if (!DisableInline)
     Builder.Inliner = createFunctionInliningPass();
-  Builder.OptLevel = 3;
-  Builder.populateModulePassManager(PM);
-}
-
-static void AddStandardLinkPasses(PassManagerBase &PM) {
-  PM.add(createVerifierPass());                  // Verify that input is correct
-
-  // If the -strip-debug command line option was specified, do it.
-  if (StripDebug)
-    addPass(PM, createStripSymbolsPass(true));
-
-  // Verify debug info only after it's (possibly) stripped.
-  PM.add(createDebugInfoVerifierPass());
-
-  if (DisableOptimizations) return;
-
-  PassManagerBuilder Builder;
-  Builder.populateLTOPassManager(PM, /*Internalize=*/ !DisableInternalize,
-                                 /*RunInliner=*/ !DisableInline);
+  Builder.populateLTOPassManager(PM);
 }
 
 //===----------------------------------------------------------------------===//
@@ -355,7 +321,8 @@
   // For codegen passes, only passes that do IR to IR transformation are
   // supported.
   initializeCodeGenPreparePass(Registry);
-  initializeAtomicExpandLoadLinkedPass(Registry);
+  initializeAtomicExpandPass(Registry);
+  initializeRewriteSymbolsPass(Registry);
 
 #ifdef LINK_POLLY_INTO_TOOLS
   polly::initializePollyPasses(Registry);
@@ -372,8 +339,7 @@
   SMDiagnostic Err;
 
   // Load the input module...
-  std::unique_ptr<Module> M;
-  M.reset(ParseIRFile(InputFilename, Err, Context));
+  std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
 
   if (!M.get()) {
     Err.print(argv[0], errs());
@@ -395,11 +361,10 @@
     if (OutputFilename.empty())
       OutputFilename = "-";
 
-    std::string ErrorInfo;
-    Out.reset(new tool_output_file(OutputFilename.c_str(), ErrorInfo,
-                                   sys::fs::F_None));
-    if (!ErrorInfo.empty()) {
-      errs() << ErrorInfo << '\n';
+    std::error_code EC;
+    Out.reset(new tool_output_file(OutputFilename, EC, sys::fs::F_None));
+    if (EC) {
+      errs() << EC.message() << '\n';
       return 1;
     }
   }
@@ -452,7 +417,7 @@
   }
 
   if (DL)
-    Passes.add(new DataLayoutPass(M.get()));
+    Passes.add(new DataLayoutPass());
 
   Triple ModuleTriple(M->getTargetTriple());
   TargetMachine *Machine = nullptr;
@@ -468,7 +433,7 @@
   if (OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || OptLevelO3) {
     FPasses.reset(new FunctionPassManager(M.get()));
     if (DL)
-      FPasses->add(new DataLayoutPass(M.get()));
+      FPasses->add(new DataLayoutPass());
     if (TM.get())
       TM->addAnalysisPasses(*FPasses);
 
@@ -480,11 +445,10 @@
       if (OutputFilename.empty())
         OutputFilename = "-";
 
-      std::string ErrorInfo;
-      Out.reset(new tool_output_file(OutputFilename.c_str(), ErrorInfo,
-                                     sys::fs::F_None));
-      if (!ErrorInfo.empty()) {
-        errs() << ErrorInfo << '\n';
+      std::error_code EC;
+      Out.reset(new tool_output_file(OutputFilename, EC, sys::fs::F_None));
+      if (EC) {
+        errs() << EC.message() << '\n';
         return 1;
       }
     }
@@ -492,21 +456,12 @@
     NoOutput = true;
   }
 
-  // If the -strip-debug command line option was specified, add it.  If
-  // -std-compile-opts was also specified, it will handle StripDebug.
-  if (StripDebug && !StandardCompileOpts)
+  // If the -strip-debug command line option was specified, add it.
+  if (StripDebug)
     addPass(Passes, createStripSymbolsPass(true));
 
   // Create a new optimization pass for each one specified on the command line
   for (unsigned i = 0; i < PassList.size(); ++i) {
-    // Check to see if -std-compile-opts was specified before this option.  If
-    // so, handle it.
-    if (StandardCompileOpts &&
-        StandardCompileOpts.getPosition() < PassList.getPosition(i)) {
-      AddStandardCompilePasses(Passes);
-      StandardCompileOpts = false;
-    }
-
     if (StandardLinkOpts &&
         StandardLinkOpts.getPosition() < PassList.getPosition(i)) {
       AddStandardLinkPasses(Passes);
@@ -579,12 +534,6 @@
       Passes.add(createPrintModulePass(errs()));
   }
 
-  // If -std-compile-opts was specified at the end of the pass list, add them.
-  if (StandardCompileOpts) {
-    AddStandardCompilePasses(Passes);
-    StandardCompileOpts = false;
-  }
-
   if (StandardLinkOpts) {
     AddStandardLinkPasses(Passes);
     StandardLinkOpts = false;

diff --git a/tools/verify-uselistorder/CMakeLists.txt b/tools/verify-uselistorder/CMakeLists.txt
new file mode 100644
index 0000000..260a95a
--- /dev/null
+++ b/tools/verify-uselistorder/CMakeLists.txt

@@ -0,0 +1,12 @@
+set(LLVM_LINK_COMPONENTS
+  AsmParser
+  BitReader
+  BitWriter
+  Core
+  IRReader
+  Support
+  )
+
+add_llvm_tool(verify-uselistorder
+  verify-uselistorder.cpp
+  )

diff --git a/tools/verify-uselistorder/LLVMBuild.txt b/tools/verify-uselistorder/LLVMBuild.txt
new file mode 100644
index 0000000..23957c1
--- /dev/null
+++ b/tools/verify-uselistorder/LLVMBuild.txt

@@ -0,0 +1,22 @@
+;===- ./tools/verify-uselistorder/LLVMBuild.txt ----------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = verify-uselistorder
+parent = Tools
+required_libraries = IRReader BitWriter Support

diff --git a/tools/verify-uselistorder/Makefile b/tools/verify-uselistorder/Makefile
new file mode 100644
index 0000000..90d2aa8
--- /dev/null
+++ b/tools/verify-uselistorder/Makefile

@@ -0,0 +1,17 @@
+##===- tools/verify-uselistorder/Makefile ------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../..
+TOOLNAME := verify-uselistorder
+LINK_COMPONENTS := AsmParser BitReader BitWriter Core IRReader Support
+
+# This tool has no plugins, optimize startup time.
+TOOL_NO_EXPORTS := 1
+
+include $(LEVEL)/Makefile.common

diff --git a/tools/verify-uselistorder/verify-uselistorder.cpp b/tools/verify-uselistorder/verify-uselistorder.cpp
new file mode 100644
index 0000000..992a5b0
--- /dev/null
+++ b/tools/verify-uselistorder/verify-uselistorder.cpp

@@ -0,0 +1,562 @@
+//===- verify-uselistorder.cpp - The LLVM Modular Optimizer ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Verify that use-list order can be serialized correctly.  After reading the
+// provided IR, this tool shuffles the use-lists and then writes and reads to a
+// separate Module whose use-list orders are compared to the original.
+//
+// The shuffles are deterministic, but guarantee that use-lists will change.
+// The algorithm per iteration is as follows:
+//
+//  1. Seed the random number generator.  The seed is different for each
+//     shuffle.  Shuffle 0 uses default+0, shuffle 1 uses default+1, and so on.
+//
+//  2. Visit every Value in a deterministic order.
+//
+//  3. Assign a random number to each Use in the Value's use-list in order.
+//
+//  4. If the numbers are already in order, reassign numbers until they aren't.
+//
+//  5. Sort the use-list using Value::sortUseList(), which is a stable sort.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/UseListOrder.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/SystemUtils.h"
+#include <random>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "use-list-order"
+
+static cl::opt<std::string> InputFilename(cl::Positional,
+                                          cl::desc("<input bitcode file>"),
+                                          cl::init("-"),
+                                          cl::value_desc("filename"));
+
+static cl::opt<bool> SaveTemps("save-temps", cl::desc("Save temp files"),
+                               cl::init(false));
+
+static cl::opt<unsigned>
+    NumShuffles("num-shuffles",
+                cl::desc("Number of times to shuffle and verify use-lists"),
+                cl::init(1));
+
+namespace {
+
+struct TempFile {
+  std::string Filename;
+  FileRemover Remover;
+  bool init(const std::string &Ext);
+  bool writeBitcode(const Module &M) const;
+  bool writeAssembly(const Module &M) const;
+  std::unique_ptr<Module> readBitcode(LLVMContext &Context) const;
+  std::unique_ptr<Module> readAssembly(LLVMContext &Context) const;
+};
+
+struct ValueMapping {
+  DenseMap<const Value *, unsigned> IDs;
+  std::vector<const Value *> Values;
+
+  /// \brief Construct a value mapping for module.
+  ///
+  /// Creates mapping from every value in \c M to an ID.  This mapping includes
+  /// un-referencable values.
+  ///
+  /// Every \a Value that gets serialized in some way should be represented
+  /// here.  The order needs to be deterministic, but it's unnecessary to match
+  /// the value-ids in the bitcode writer.
+  ///
+  /// All constants that are referenced by other values are included in the
+  /// mapping, but others -- which wouldn't be serialized -- are not.
+  ValueMapping(const Module &M);
+
+  /// \brief Map a value.
+  ///
+  /// Maps a value.  If it's a constant, maps all of its operands first.
+  void map(const Value *V);
+  unsigned lookup(const Value *V) const { return IDs.lookup(V); }
+};
+
+} // end namespace
+
+bool TempFile::init(const std::string &Ext) {
+  SmallVector<char, 64> Vector;
+  DEBUG(dbgs() << " - create-temp-file\n");
+  if (auto EC = sys::fs::createTemporaryFile("use-list-order", Ext, Vector)) {
+    (void)EC;
+    DEBUG(dbgs() << "error: " << EC.message() << "\n");
+    return true;
+  }
+  assert(!Vector.empty());
+
+  Filename.assign(Vector.data(), Vector.data() + Vector.size());
+  Remover.setFile(Filename, !SaveTemps);
+  DEBUG(dbgs() << " - filename = " << Filename << "\n");
+  return false;
+}
+
+bool TempFile::writeBitcode(const Module &M) const {
+  DEBUG(dbgs() << " - write bitcode\n");
+  std::error_code EC;
+  raw_fd_ostream OS(Filename, EC, sys::fs::F_None);
+  if (EC) {
+    DEBUG(dbgs() << "error: " << EC.message() << "\n");
+    return true;
+  }
+
+  WriteBitcodeToFile(&M, OS);
+  return false;
+}
+
+bool TempFile::writeAssembly(const Module &M) const {
+  DEBUG(dbgs() << " - write assembly\n");
+  std::error_code EC;
+  raw_fd_ostream OS(Filename, EC, sys::fs::F_Text);
+  if (EC) {
+    DEBUG(dbgs() << "error: " << EC.message() << "\n");
+    return true;
+  }
+
+  OS << M;
+  return false;
+}
+
+std::unique_ptr<Module> TempFile::readBitcode(LLVMContext &Context) const {
+  DEBUG(dbgs() << " - read bitcode\n");
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOr =
+      MemoryBuffer::getFile(Filename);
+  if (!BufferOr) {
+    DEBUG(dbgs() << "error: " << BufferOr.getError().message() << "\n");
+    return nullptr;
+  }
+
+  MemoryBuffer *Buffer = BufferOr.get().get();
+  ErrorOr<Module *> ModuleOr =
+      parseBitcodeFile(Buffer->getMemBufferRef(), Context);
+  if (!ModuleOr) {
+    DEBUG(dbgs() << "error: " << ModuleOr.getError().message() << "\n");
+    return nullptr;
+  }
+  return std::unique_ptr<Module>(ModuleOr.get());
+}
+
+std::unique_ptr<Module> TempFile::readAssembly(LLVMContext &Context) const {
+  DEBUG(dbgs() << " - read assembly\n");
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M = parseAssemblyFile(Filename, Err, Context);
+  if (!M.get())
+    DEBUG(dbgs() << "error: "; Err.print("verify-use-list-order", dbgs()));
+  return M;
+}
+
+ValueMapping::ValueMapping(const Module &M) {
+  // Every value should be mapped, including things like void instructions and
+  // basic blocks that are kept out of the ValueEnumerator.
+  //
+  // The current mapping order makes it easier to debug the tables.  It happens
+  // to be similar to the ID mapping when writing ValueEnumerator, but they
+  // aren't (and needn't be) in sync.
+
+  // Globals: variables first, then aliases, then functions.
+  for (const GlobalVariable &G : M.globals())
+    map(&G);
+  for (const GlobalAlias &A : M.aliases())
+    map(&A);
+  for (const Function &F : M)
+    map(&F);
+
+  // Constants used by globals: initializers, aliasees, and prefix data.
+  for (const GlobalVariable &G : M.globals())
+    if (G.hasInitializer())
+      map(G.getInitializer());
+  for (const GlobalAlias &A : M.aliases())
+    map(A.getAliasee());
+  for (const Function &F : M)
+    if (F.hasPrefixData())
+      map(F.getPrefixData());
+
+  // Function bodies: arguments, then all blocks, then all instructions.
+  for (const Function &F : M) {
+    for (const Argument &A : F.args())
+      map(&A);
+    for (const BasicBlock &BB : F)
+      map(&BB);
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB)
+        map(&I);
+
+    // Operands that are inline asm or constants other than global values.
+    for (const BasicBlock &BB : F)
+      for (const Instruction &I : BB)
+        for (const Value *Op : I.operands())
+          if ((isa<Constant>(Op) && !isa<GlobalValue>(*Op)) ||
+              isa<InlineAsm>(Op))
+            map(Op);
+  }
+}
+
+// Give \p V the next 1-based ID; a non-global constant's operands go first.
+void ValueMapping::map(const Value *V) {
+  if (IDs.lookup(V) != 0)
+    return; // Already mapped; an ID of 0 means "not seen yet".
+  const auto *AsConstant = dyn_cast<Constant>(V);
+  if (AsConstant && !isa<GlobalValue>(AsConstant))
+    for (const Value *Operand : AsConstant->operands())
+      map(Operand);
+
+  Values.push_back(V);
+  IDs[V] = Values.size();
+}
+
+#ifndef NDEBUG // Dump helpers; they are only invoked from within DEBUG(...).
+static void dumpMapping(const ValueMapping &VM) {
+  dbgs() << "value-mapping (size = " << VM.Values.size() << "):\n";
+  for (unsigned Idx = 0, End = VM.Values.size(); Idx != End; ++Idx) {
+    dbgs() << " - id = " << Idx << ", value = ";
+    VM.Values[Idx]->dump();
+  }
+}
+
+static void dumpBothMappings(const ValueMapping &L, const ValueMapping &R) {
+  dbgs() << "\nlhs-"; // Prefixes the "value-mapping" header of each table.
+  dumpMapping(L);
+  dbgs() << "\nrhs-";
+  dumpMapping(R);
+}
+
+static void debugValue(const ValueMapping &M, unsigned I, StringRef Desc) {
+  const Value *Subject = M.Values[I];
+  dbgs() << " - " << Desc << " value = ";
+  Subject->dump();
+  for (const Use &Edge : Subject->uses()) {
+    dbgs() << "   => use: op = " << Edge.getOperandNo()
+           << ", user-id = " << M.IDs.lookup(Edge.getUser()) << ", user = ";
+    Edge.getUser()->dump();
+  }
+}
+
+static void debugUserMismatch(const ValueMapping &L, const ValueMapping &R,
+                              unsigned I) { // I indexes Values in both maps.
+  dbgs() << " - fail: user mismatch: ID = " << I << "\n";
+  debugValue(L, I, "LHS");
+  debugValue(R, I, "RHS");
+  dumpBothMappings(L, R);
+}
+
+static void debugSizeMismatch(const ValueMapping &L, const ValueMapping &R) {
+  dbgs() << " - fail: map size: " << L.Values.size()
+         << " != " << R.Values.size() << "\n";
+  dumpBothMappings(L, R);
+}
+#endif // NDEBUG
+
+static bool matches(const ValueMapping &LM, const ValueMapping &RM) {
+  DEBUG(dbgs() << "compare value maps\n");
+  if (LM.Values.size() != RM.Values.size()) {
+    DEBUG(debugSizeMismatch(LM, RM));
+    return false; // The two modules map a different number of values.
+  }
+
+  // This mapping doesn't include dangling constant users, since those don't
+  // get serialized.  However, checking if users are constant and calling
+  // isConstantUsed() on every one is very expensive.  Instead, just check if
+  // the user is mapped.
+  auto skipUnmappedUsers =
+      [&](Value::const_use_iterator &U, Value::const_use_iterator E,
+          const ValueMapping &M) {
+    while (U != E && !M.lookup(U->getUser())) // Advances U in place.
+      ++U;
+  };
+
+  // Walk both tables in lockstep: the values at each index must have the same
+  // mapped users, in the same order and through the same operand slots.
+  for (unsigned I = 0, E = LM.Values.size(); I != E; ++I) {
+    const Value *L = LM.Values[I];
+    const Value *R = RM.Values[I];
+    auto LU = L->use_begin(), LE = L->use_end();
+    auto RU = R->use_begin(), RE = R->use_end();
+    skipUnmappedUsers(LU, LE, LM);
+    skipUnmappedUsers(RU, RE, RM);
+
+    while (LU != LE) {
+      if (RU == RE) {
+        DEBUG(debugUserMismatch(LM, RM, I));
+        return false; // The right side ran out of mapped users first.
+      }
+      if (LM.lookup(LU->getUser()) != RM.lookup(RU->getUser())) {
+        DEBUG(debugUserMismatch(LM, RM, I));
+        return false; // The users at this position have different IDs.
+      }
+      if (LU->getOperandNo() != RU->getOperandNo()) {
+        DEBUG(debugUserMismatch(LM, RM, I));
+        return false; // Same user ID, but a different operand slot.
+      }
+      skipUnmappedUsers(++LU, LE, LM);
+      skipUnmappedUsers(++RU, RE, RM);
+    }
+    if (RU != RE) {
+      DEBUG(debugUserMismatch(LM, RM, I));
+      return false; // The right side still has mapped users left over.
+    }
+  }
+
+  return true; // Every value's mapped uses agree between the two tables.
+}
+
+static void verifyAfterRoundTrip(const Module &M,
+                                 std::unique_ptr<Module> OtherM) {
+  if (!OtherM) // The TempFile readers return null when loading fails.
+    report_fatal_error("parsing failed");
+  if (verifyModule(*OtherM, &errs()))
+    report_fatal_error("verification failed");
+  if (!matches(ValueMapping(M), ValueMapping(*OtherM))) // Compare use lists.
+    report_fatal_error("use-list order changed");
+}
+
+static void verifyBitcodeUseListOrder(const Module &M) {
+  errs() << "*** verify-use-list-order: bitcode ***\n";
+  TempFile Scratch;
+  if (Scratch.init("bc"))
+    report_fatal_error("failed to initialize bitcode file");
+  if (Scratch.writeBitcode(M))
+    report_fatal_error("failed to write bitcode");
+
+  LLVMContext FreshContext; // The module is re-read into its own context.
+  verifyAfterRoundTrip(M, Scratch.readBitcode(FreshContext));
+}
+
+// Write M out as assembly, read it back, and check the use-list order.
+static void verifyAssemblyUseListOrder(const Module &M) {
+  errs() << "*** verify-use-list-order: assembly ***\n";
+  TempFile Scratch;
+  if (Scratch.init("ll"))
+    report_fatal_error("failed to initialize assembly file");
+  if (Scratch.writeAssembly(M))
+    report_fatal_error("failed to write assembly");
+
+  LLVMContext FreshContext;
+  verifyAfterRoundTrip(M, Scratch.readAssembly(FreshContext));
+}
+
+static void verifyUseListOrder(const Module &M) {
+  verifyBitcodeUseListOrder(M);  // Round trip through a bitcode temp file.
+  verifyAssemblyUseListOrder(M); // Round trip through an assembly temp file.
+}
+
+static void shuffleValueUseLists(Value *V, std::minstd_rand0 &Gen,
+                                 DenseSet<Value *> &Seen) {
+  if (!Seen.insert(V).second)
+    return; // V was already visited.
+
+  if (auto *C = dyn_cast<Constant>(V))
+    if (!isa<GlobalValue>(C))
+      for (Value *Op : C->operands())
+        shuffleValueUseLists(Op, Gen, Seen);
+
+  if (V->use_empty() || std::next(V->use_begin()) == V->use_end())
+    // Nothing to shuffle for 0 or 1 uses.
+    return;
+
+  // Generate random numbers between 10 and 99, which will line up nicely in
+  // debug output.  We're not worried about collisions here.
+  DEBUG(dbgs() << "V = "; V->dump());
+  std::uniform_int_distribution<short> Dist(10, 99);
+  SmallDenseMap<const Use *, short, 16> Order;
+  auto compareUses =
+      [&Order](const Use &L, const Use &R) { return Order[&L] < Order[&R]; };
+  do { // Redraw until the keys are not already sorted in use-list order.
+    for (const Use &U : V->uses()) {
+      auto I = Dist(Gen);
+      Order[&U] = I;
+      DEBUG(dbgs() << " - order: " << I << ", op = " << U.getOperandNo()
+                   << ", U = ";
+            U.getUser()->dump());
+    }
+  } while (std::is_sorted(V->use_begin(), V->use_end(), compareUses));
+
+  DEBUG(dbgs() << " => shuffle\n");
+  V->sortUseList(compareUses); // Reorder the uses by their random keys.
+
+  DEBUG({
+    for (const Use &U : V->uses()) {
+      dbgs() << " - order: " << Order.lookup(&U)
+             << ", op = " << U.getOperandNo() << ", U = ";
+      U.getUser()->dump();
+    }
+  });
+}
+
+static void reverseValueUseLists(Value *V, DenseSet<Value *> &Seen) {
+  if (!Seen.insert(V).second)
+    return; // V was already visited.
+
+  if (auto *C = dyn_cast<Constant>(V))
+    if (!isa<GlobalValue>(C))
+      for (Value *Op : C->operands())
+        reverseValueUseLists(Op, Seen);
+
+  if (V->use_empty() || std::next(V->use_begin()) == V->use_end())
+    // Nothing to reverse for 0 or 1 uses.
+    return;
+
+  DEBUG({
+    dbgs() << "V = ";
+    V->dump();
+    for (const Use &U : V->uses()) {
+      dbgs() << " - order: op = " << U.getOperandNo() << ", U = ";
+      U.getUser()->dump();
+    }
+    dbgs() << " => reverse\n";
+  });
+
+  V->reverseUseList(); // The debug dumps around this show before and after.
+
+  DEBUG({
+    for (const Use &U : V->uses()) {
+      dbgs() << " - order: op = " << U.getOperandNo() << ", U = ";
+      U.getUser()->dump();
+    }
+  });
+}
+
+template <class Changer>
+static void changeUseLists(Module &M, Changer changeValueUseList) {
+  // Visit every value that would be serialized to an IR file.
+  //
+  // Globals: variables first, then aliases, then functions.
+  for (GlobalVariable &G : M.globals())
+    changeValueUseList(&G);
+  for (GlobalAlias &A : M.aliases())
+    changeValueUseList(&A);
+  for (Function &F : M)
+    changeValueUseList(&F);
+
+  // Constants used by globals: initializers, aliasees, and prefix data.
+  for (GlobalVariable &G : M.globals())
+    if (G.hasInitializer())
+      changeValueUseList(G.getInitializer());
+  for (GlobalAlias &A : M.aliases())
+    changeValueUseList(A.getAliasee());
+  for (Function &F : M)
+    if (F.hasPrefixData())
+      changeValueUseList(F.getPrefixData());
+
+  // Function bodies: arguments, then all blocks, then all instructions.
+  for (Function &F : M) {
+    for (Argument &A : F.args())
+      changeValueUseList(&A);
+    for (BasicBlock &BB : F)
+      changeValueUseList(&BB);
+    for (BasicBlock &BB : F)
+      for (Instruction &I : BB)
+        changeValueUseList(&I);
+
+    // Operands that are inline asm or constants other than global values.
+    for (BasicBlock &BB : F)
+      for (Instruction &I : BB)
+        for (Value *Op : I.operands())
+          if ((isa<Constant>(Op) && !isa<GlobalValue>(*Op)) ||
+              isa<InlineAsm>(Op))
+            changeValueUseList(Op);
+  }
+
+  if (verifyModule(M, &errs())) // Re-check the module after the reordering.
+    report_fatal_error("verification failed");
+}
+
+static void shuffleUseLists(Module &M, unsigned SeedOffset) {
+  errs() << "*** shuffle-use-lists ***\n";
+  DenseSet<Value *> Visited; // Shared so each value is shuffled only once.
+  std::minstd_rand0 Rng(std::minstd_rand0::default_seed + SeedOffset);
+  changeUseLists(M, [&](Value *V) { shuffleValueUseLists(V, Rng, Visited); });
+  DEBUG(dbgs() << "\n");
+}
+
+static void reverseUseLists(Module &M) {
+  errs() << "*** reverse-use-lists ***\n";
+  DenseSet<Value *> Visited; // Shared so each value is reversed only once.
+  changeUseLists(M, [&Visited](Value *V) { reverseValueUseLists(V, Visited); });
+  DEBUG(dbgs() << "\n");
+}
+
+int main(int argc, char **argv) {
+  sys::PrintStackTraceOnErrorSignal();
+  llvm::PrettyStackTraceProgram X(argc, argv);
+
+  // Enable debug stream buffering.
+  EnableDebugBuffering = true;
+
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+  LLVMContext &Context = getGlobalContext();
+
+  cl::ParseCommandLineOptions(argc, argv,
+                              "llvm tool to verify use-list order\n");
+
+  SMDiagnostic Err;
+
+  // Load the input module; a parse failure is printed and exits with 1.
+  std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
+
+  if (!M.get()) {
+    Err.print(argv[0], errs());
+    return 1;
+  }
+  if (verifyModule(*M, &errs())) // Refuse to work on an invalid input module.
+    report_fatal_error("verification failed");
+
+  errs() << "*** verify-use-list-order ***\n";
+  // Can't verify if order isn't preserved, so force both options on.
+  if (!shouldPreserveBitcodeUseListOrder()) {
+    errs() << "warning: forcing -preserve-bc-use-list-order\n";
+    setPreserveBitcodeUseListOrder(true);
+  }
+  if (!shouldPreserveAssemblyUseListOrder()) {
+    errs() << "warning: forcing -preserve-ll-use-list-order\n";
+    setPreserveAssemblyUseListOrder(true);
+  }
+
+  // Verify the use lists now and after reversing them.
+  verifyUseListOrder(*M);
+  reverseUseLists(*M);
+  verifyUseListOrder(*M);
+
+  for (unsigned I = 0, E = NumShuffles; I != E; ++I) {
+    errs() << "*** shuffle iteration: " << I + 1 << " of " << E << " ***\n";
+
+    // Shuffle with a different (deterministic) seed each time: I is the offset.
+    shuffleUseLists(*M, I);
+
+    // Verify again before and after reversing.
+    verifyUseListOrder(*M);
+    reverseUseLists(*M);
+    verifyUseListOrder(*M);
+  }
+
+  return 0; // Any failed check above ends in report_fatal_error() instead.
+}

diff --git a/tools/yaml2obj/CMakeLists.txt b/tools/yaml2obj/CMakeLists.txt
index 5e63dfb..52e9df4 100644
--- a/tools/yaml2obj/CMakeLists.txt
+++ b/tools/yaml2obj/CMakeLists.txt

@@ -1,9 +1,10 @@
 set(LLVM_LINK_COMPONENTS
+  MC
   Object
   Support
   )
 
-add_llvm_utility(yaml2obj
+add_llvm_tool(yaml2obj
   yaml2obj.cpp
   yaml2coff.cpp
   yaml2elf.cpp

diff --git a/tools/yaml2obj/Makefile b/tools/yaml2obj/Makefile
index 8801795..912f0e3 100644
--- a/tools/yaml2obj/Makefile
+++ b/tools/yaml2obj/Makefile

@@ -14,7 +14,4 @@
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS = 1
 
-# Don't install this utility
-NO_INSTALL = 1
-
 include $(LEVEL)/Makefile.common

diff --git a/tools/yaml2obj/yaml2coff.cpp b/tools/yaml2obj/yaml2coff.cpp
index c772db9..6983e9d 100644
--- a/tools/yaml2obj/yaml2coff.cpp
+++ b/tools/yaml2obj/yaml2coff.cpp

@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Object/COFFYAML.h"
+#include "llvm/Object/COFF.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
@@ -30,12 +31,35 @@
 /// This parses a yaml stream that represents a COFF object file.
 /// See docs/yaml2obj for the yaml scheema.
 struct COFFParser {
-  COFFParser(COFFYAML::Object &Obj) : Obj(Obj) {
+  COFFParser(COFFYAML::Object &Obj)
+      : Obj(Obj), SectionTableStart(0), SectionTableSize(0) {
     // A COFF string table always starts with a 4 byte size field. Offsets into
     // it include this size, so allocate it now.
     StringTable.append(4, char(0));
   }
 
+  // True once the section count exceeds COFF::MaxNumberOfSections16.
+  bool useBigObj() const {
+    return COFF::MaxNumberOfSections16 < int32_t(Obj.Sections.size());
+  }
+
+  bool isPE() const { return Obj.OptionalHeader.hasValue(); } // PE if set.
+  bool is64Bit() const { // Only AMD64 is treated as 64-bit here.
+    return Obj.Header.Machine == COFF::IMAGE_FILE_MACHINE_AMD64;
+  }
+
+  uint32_t getFileAlignment() const {
+    return Obj.OptionalHeader->Header.FileAlignment; // Requires isPE().
+  }
+
+  unsigned getHeaderSize() const { // File header size; depends on bigobj.
+    return useBigObj() ? COFF::Header32Size : COFF::Header16Size;
+  }
+
+  unsigned getSymbolSize() const { // Symbol record size; depends on bigobj.
+    return useBigObj() ? COFF::Symbol32Size : COFF::Symbol16Size;
+  }
+
   bool parseSections() {
     for (std::vector<COFFYAML::Section>::iterator i = Obj.Sections.begin(),
            e = Obj.Sections.end(); i != e; ++i) {
@@ -111,39 +135,61 @@
 
   StringMap<unsigned> StringTableMap;
   std::string StringTable;
+  uint32_t SectionTableStart;
+  uint32_t SectionTableSize;
 };
 
 // Take a CP and assign addresses and sizes to everything. Returns false if the
 // layout is not valid to do.
-static bool layoutCOFF(COFFParser &CP) {
-  uint32_t SectionTableStart = 0;
-  uint32_t SectionTableSize  = 0;
+static bool layoutOptionalHeader(COFFParser &CP) {
+  if (CP.isPE()) {
+    unsigned FixedPart = CP.is64Bit() ? sizeof(object::pe32plus_header)
+                                      : sizeof(object::pe32_header);
+    size_t NumDirs = COFF::NUM_DATA_DIRECTORIES + 1; // One extra entry.
+    CP.Obj.Header.SizeOfOptionalHeader =
+        FixedPart + sizeof(object::data_directory) * NumDirs;
+  }
+  return true; // Never fails; non-PE objects are left untouched.
+}
 
+namespace {
+enum { DOSStubSize = 128 };
+}
+
+// Take a CP and assign addresses and sizes to everything. Returns false if the
+// layout is not valid to do.
+static bool layoutCOFF(COFFParser &CP) {
   // The section table starts immediately after the header, including the
   // optional header.
-  SectionTableStart = sizeof(COFF::header) + CP.Obj.Header.SizeOfOptionalHeader;
-  SectionTableSize = sizeof(COFF::section) * CP.Obj.Sections.size();
+  CP.SectionTableStart =
+      CP.getHeaderSize() + CP.Obj.Header.SizeOfOptionalHeader;
+  if (CP.isPE())
+    CP.SectionTableStart += DOSStubSize + sizeof(COFF::PEMagic);
+  CP.SectionTableSize = COFF::SectionSize * CP.Obj.Sections.size();
 
-  uint32_t CurrentSectionDataOffset = SectionTableStart + SectionTableSize;
+  uint32_t CurrentSectionDataOffset =
+      CP.SectionTableStart + CP.SectionTableSize;
 
   // Assign each section data address consecutively.
-  for (std::vector<COFFYAML::Section>::iterator i = CP.Obj.Sections.begin(),
-                                                e = CP.Obj.Sections.end();
-                                                i != e; ++i) {
-    if (i->SectionData.binary_size() > 0) {
-      i->Header.SizeOfRawData = i->SectionData.binary_size();
-      i->Header.PointerToRawData = CurrentSectionDataOffset;
-      CurrentSectionDataOffset += i->Header.SizeOfRawData;
-      if (!i->Relocations.empty()) {
-        i->Header.PointerToRelocations = CurrentSectionDataOffset;
-        i->Header.NumberOfRelocations = i->Relocations.size();
-        CurrentSectionDataOffset += i->Header.NumberOfRelocations *
-          COFF::RelocationSize;
+  for (COFFYAML::Section &S : CP.Obj.Sections) {
+    if (S.SectionData.binary_size() > 0) {
+      CurrentSectionDataOffset = RoundUpToAlignment(
+          CurrentSectionDataOffset, CP.isPE() ? CP.getFileAlignment() : 4);
+      S.Header.SizeOfRawData = S.SectionData.binary_size();
+      if (CP.isPE())
+        S.Header.SizeOfRawData =
+            RoundUpToAlignment(S.Header.SizeOfRawData, CP.getFileAlignment());
+      S.Header.PointerToRawData = CurrentSectionDataOffset;
+      CurrentSectionDataOffset += S.Header.SizeOfRawData;
+      if (!S.Relocations.empty()) {
+        S.Header.PointerToRelocations = CurrentSectionDataOffset;
+        S.Header.NumberOfRelocations = S.Relocations.size();
+        CurrentSectionDataOffset +=
+            S.Header.NumberOfRelocations * COFF::RelocationSize;
       }
-      // TODO: Handle alignment.
     } else {
-      i->Header.SizeOfRawData = 0;
-      i->Header.PointerToRawData = 0;
+      S.Header.SizeOfRawData = 0;
+      S.Header.PointerToRawData = 0;
     }
   }
 
@@ -163,7 +209,7 @@
       NumberOfAuxSymbols += 1;
     if (!i->File.empty())
       NumberOfAuxSymbols +=
-          (i->File.size() + COFF::SymbolSize - 1) / COFF::SymbolSize;
+          (i->File.size() + CP.getSymbolSize() - 1) / CP.getSymbolSize();
     if (i->SectionDefinition)
       NumberOfAuxSymbols += 1;
     if (i->CLRToken)
@@ -175,7 +221,10 @@
   // Store all the allocated start addresses in the header.
   CP.Obj.Header.NumberOfSections = CP.Obj.Sections.size();
   CP.Obj.Header.NumberOfSymbols = NumberOfSymbols;
-  CP.Obj.Header.PointerToSymbolTable = SymbolTableStart;
+  if (NumberOfSymbols > 0 || CP.StringTable.size() > 4)
+    CP.Obj.Header.PointerToSymbolTable = SymbolTableStart;
+  else
+    CP.Obj.Header.PointerToSymbolTable = 0;
 
   *reinterpret_cast<support::ulittle32_t *>(&CP.StringTable[0])
     = CP.StringTable.size();
@@ -222,15 +271,153 @@
   return zeros_impl<sizeof(T)>();
 }
 
-bool writeCOFF(COFFParser &CP, raw_ostream &OS) {
-  OS << binary_le(CP.Obj.Header.Machine)
-     << binary_le(CP.Obj.Header.NumberOfSections)
-     << binary_le(CP.Obj.Header.TimeDateStamp)
-     << binary_le(CP.Obj.Header.PointerToSymbolTable)
-     << binary_le(CP.Obj.Header.NumberOfSymbols)
-     << binary_le(CP.Obj.Header.SizeOfOptionalHeader)
-     << binary_le(CP.Obj.Header.Characteristics);
+struct num_zeros_impl { // Tag type carrying a zero-byte count for operator<<.
+  size_t N; // Number of zero bytes to emit.
+  num_zeros_impl(size_t N) : N(N) {}
+};
 
+raw_ostream &operator<<(raw_ostream &OS, const num_zeros_impl &NZI) {
+  for (size_t Remaining = NZI.N; Remaining != 0; --Remaining)
+    OS.write(0); // One zero byte per iteration.
+  return OS;
+}
+
+// Stream helper: `OS << num_zeros(N)` writes N zero bytes to OS.
+num_zeros_impl num_zeros(size_t N) {
+  return num_zeros_impl(N);
+}
+
+template <typename T>
+static uint32_t initializeOptionalHeader(COFFParser &CP, uint16_t Magic, T Header) {
+  memset(Header, 0, sizeof(*Header)); // Fields not assigned below stay zero.
+  Header->Magic = Magic;
+  Header->SectionAlignment = CP.Obj.OptionalHeader->Header.SectionAlignment;
+  Header->FileAlignment = CP.Obj.OptionalHeader->Header.FileAlignment;
+  uint32_t SizeOfCode = 0, SizeOfInitializedData = 0,
+           SizeOfUninitializedData = 0;
+  uint32_t SizeOfHeaders = RoundUpToAlignment(
+      CP.SectionTableStart + CP.SectionTableSize, Header->FileAlignment);
+  uint32_t SizeOfImage =
+      RoundUpToAlignment(SizeOfHeaders, Header->SectionAlignment);
+  uint32_t BaseOfData = 0;
+  for (const COFFYAML::Section &S : CP.Obj.Sections) {
+    if (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_CODE)
+      SizeOfCode += S.Header.SizeOfRawData;
+    if (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
+      SizeOfInitializedData += S.Header.SizeOfRawData;
+    if (S.Header.Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
+      SizeOfUninitializedData += S.Header.SizeOfRawData;
+    if (S.Name.equals(".text"))
+      Header->BaseOfCode = S.Header.VirtualAddress; // RVA
+    else if (S.Name.equals(".data"))
+      BaseOfData = S.Header.VirtualAddress; // RVA
+    if (S.Header.VirtualAddress) // Only sections with an address add size.
+      SizeOfImage +=
+          RoundUpToAlignment(S.Header.VirtualSize, Header->SectionAlignment);
+  }
+  Header->SizeOfCode = SizeOfCode;
+  Header->SizeOfInitializedData = SizeOfInitializedData;
+  Header->SizeOfUninitializedData = SizeOfUninitializedData;
+  Header->AddressOfEntryPoint =
+      CP.Obj.OptionalHeader->Header.AddressOfEntryPoint; // RVA
+  Header->ImageBase = CP.Obj.OptionalHeader->Header.ImageBase;
+  Header->MajorOperatingSystemVersion =
+      CP.Obj.OptionalHeader->Header.MajorOperatingSystemVersion;
+  Header->MinorOperatingSystemVersion =
+      CP.Obj.OptionalHeader->Header.MinorOperatingSystemVersion;
+  Header->MajorImageVersion =
+      CP.Obj.OptionalHeader->Header.MajorImageVersion;
+  Header->MinorImageVersion =
+      CP.Obj.OptionalHeader->Header.MinorImageVersion;
+  Header->MajorSubsystemVersion =
+      CP.Obj.OptionalHeader->Header.MajorSubsystemVersion;
+  Header->MinorSubsystemVersion =
+      CP.Obj.OptionalHeader->Header.MinorSubsystemVersion;
+  Header->SizeOfImage = SizeOfImage;
+  Header->SizeOfHeaders = SizeOfHeaders;
+  Header->Subsystem = CP.Obj.OptionalHeader->Header.Subsystem;
+  Header->DLLCharacteristics = CP.Obj.OptionalHeader->Header.DLLCharacteristics;
+  Header->SizeOfStackReserve = CP.Obj.OptionalHeader->Header.SizeOfStackReserve;
+  Header->SizeOfStackCommit = CP.Obj.OptionalHeader->Header.SizeOfStackCommit;
+  Header->SizeOfHeapReserve = CP.Obj.OptionalHeader->Header.SizeOfHeapReserve;
+  Header->SizeOfHeapCommit = CP.Obj.OptionalHeader->Header.SizeOfHeapCommit;
+  Header->NumberOfRvaAndSize = COFF::NUM_DATA_DIRECTORIES + 1;
+  return BaseOfData; // RVA of ".data", or 0 when no section has that name.
+}
+
+static bool writeCOFF(COFFParser &CP, raw_ostream &OS) {
+  if (CP.isPE()) {
+    // PE files start with a DOS stub.
+    object::dos_header DH;
+    memset(&DH, 0, sizeof(DH));
+
+    // DOS EXEs start with "MZ" magic.
+    DH.Magic[0] = 'M';
+    DH.Magic[1] = 'Z';
+    // Initializing the AddressOfRelocationTable is strictly optional but
+    // mollifies certain tools which expect it to have a value greater than
+    // 0x40.
+    DH.AddressOfRelocationTable = sizeof(DH);
+    // This is the address of the PE signature.
+    DH.AddressOfNewExeHeader = DOSStubSize;
+
+    // Write out our DOS stub.
+    OS.write(reinterpret_cast<char *>(&DH), sizeof(DH));
+    // Write padding until we reach the position of where our PE signature
+    // should live.
+    OS << num_zeros(DOSStubSize - sizeof(DH));
+    // Write out the PE signature.
+    OS.write(COFF::PEMagic, sizeof(COFF::PEMagic));
+  }
+  if (CP.useBigObj()) {
+    OS << binary_le(static_cast<uint16_t>(COFF::IMAGE_FILE_MACHINE_UNKNOWN))
+       << binary_le(static_cast<uint16_t>(0xffff))
+       << binary_le(static_cast<uint16_t>(COFF::BigObjHeader::MinBigObjectVersion))
+       << binary_le(CP.Obj.Header.Machine)
+       << binary_le(CP.Obj.Header.TimeDateStamp);
+    OS.write(COFF::BigObjMagic, sizeof(COFF::BigObjMagic));
+    OS << zeros(uint32_t(0))
+       << zeros(uint32_t(0))
+       << zeros(uint32_t(0))
+       << zeros(uint32_t(0))
+       << binary_le(CP.Obj.Header.NumberOfSections)
+       << binary_le(CP.Obj.Header.PointerToSymbolTable)
+       << binary_le(CP.Obj.Header.NumberOfSymbols);
+  } else {
+    OS << binary_le(CP.Obj.Header.Machine)
+       << binary_le(static_cast<int16_t>(CP.Obj.Header.NumberOfSections))
+       << binary_le(CP.Obj.Header.TimeDateStamp)
+       << binary_le(CP.Obj.Header.PointerToSymbolTable)
+       << binary_le(CP.Obj.Header.NumberOfSymbols)
+       << binary_le(CP.Obj.Header.SizeOfOptionalHeader)
+       << binary_le(CP.Obj.Header.Characteristics);
+  }
+  if (CP.isPE()) {
+    if (CP.is64Bit()) {
+      object::pe32plus_header PEH;
+      initializeOptionalHeader(CP, COFF::PE32Header::PE32_PLUS, &PEH);
+      OS.write(reinterpret_cast<char *>(&PEH), sizeof(PEH));
+    } else {
+      object::pe32_header PEH;
+      uint32_t BaseOfData = initializeOptionalHeader(CP, COFF::PE32Header::PE32, &PEH);
+      PEH.BaseOfData = BaseOfData;
+      OS.write(reinterpret_cast<char *>(&PEH), sizeof(PEH));
+    }
+    for (const Optional<COFF::DataDirectory> &DD :
+         CP.Obj.OptionalHeader->DataDirectories) {
+      if (!DD.hasValue()) {
+        OS << zeros(uint32_t(0));
+        OS << zeros(uint32_t(0));
+      } else {
+        OS << binary_le(DD->RelativeVirtualAddress);
+        OS << binary_le(DD->Size);
+      }
+    }
+    OS << zeros(uint32_t(0));
+    OS << zeros(uint32_t(0));
+  }
+
+  assert(OS.tell() == CP.SectionTableStart);
   // Output section table.
   for (std::vector<COFFYAML::Section>::iterator i = CP.Obj.Sections.begin(),
                                                 e = CP.Obj.Sections.end();
@@ -246,6 +433,7 @@
        << binary_le(i->Header.NumberOfLineNumbers)
        << binary_le(i->Header.Characteristics);
   }
+  assert(OS.tell() == CP.SectionTableStart + CP.SectionTableSize);
 
   unsigned CurSymbol = 0;
   StringMap<unsigned> SymbolTableIndexMap;
@@ -257,12 +445,15 @@
   }
 
   // Output section data.
-  for (std::vector<COFFYAML::Section>::iterator i = CP.Obj.Sections.begin(),
-                                                e = CP.Obj.Sections.end();
-                                                i != e; ++i) {
-    i->SectionData.writeAsBinary(OS);
-    for (unsigned I2 = 0, E2 = i->Relocations.size(); I2 != E2; ++I2) {
-      const COFFYAML::Relocation &R = i->Relocations[I2];
+  for (const COFFYAML::Section &S : CP.Obj.Sections) {
+    if (!S.Header.SizeOfRawData)
+      continue;
+    assert(S.Header.PointerToRawData >= OS.tell());
+    OS << num_zeros(S.Header.PointerToRawData - OS.tell());
+    S.SectionData.writeAsBinary(OS);
+    assert(S.Header.SizeOfRawData >= S.SectionData.binary_size());
+    OS << num_zeros(S.Header.SizeOfRawData - S.SectionData.binary_size());
+    for (const COFFYAML::Relocation &R : S.Relocations) {
       uint32_t SymbolTableIndex = SymbolTableIndexMap[R.SymbolName];
       OS << binary_le(R.VirtualAddress)
          << binary_le(SymbolTableIndex)
@@ -276,9 +467,12 @@
                                                      e = CP.Obj.Symbols.end();
                                                      i != e; ++i) {
     OS.write(i->Header.Name, COFF::NameSize);
-    OS << binary_le(i->Header.Value)
-       << binary_le(i->Header.SectionNumber)
-       << binary_le(i->Header.Type)
+    OS << binary_le(i->Header.Value);
+    if (CP.useBigObj())
+       OS << binary_le(i->Header.SectionNumber);
+    else
+       OS << binary_le(static_cast<int16_t>(i->Header.SectionNumber));
+    OS << binary_le(i->Header.Type)
        << binary_le(i->Header.StorageClass)
        << binary_le(i->Header.NumberOfAuxSymbols);
 
@@ -287,43 +481,50 @@
          << binary_le(i->FunctionDefinition->TotalSize)
          << binary_le(i->FunctionDefinition->PointerToLinenumber)
          << binary_le(i->FunctionDefinition->PointerToNextFunction)
-         << zeros(i->FunctionDefinition->unused);
+         << zeros(i->FunctionDefinition->unused)
+         << num_zeros(CP.getSymbolSize() - COFF::Symbol16Size);
     if (i->bfAndefSymbol)
       OS << zeros(i->bfAndefSymbol->unused1)
          << binary_le(i->bfAndefSymbol->Linenumber)
          << zeros(i->bfAndefSymbol->unused2)
          << binary_le(i->bfAndefSymbol->PointerToNextFunction)
-         << zeros(i->bfAndefSymbol->unused3);
+         << zeros(i->bfAndefSymbol->unused3)
+         << num_zeros(CP.getSymbolSize() - COFF::Symbol16Size);
     if (i->WeakExternal)
       OS << binary_le(i->WeakExternal->TagIndex)
          << binary_le(i->WeakExternal->Characteristics)
-         << zeros(i->WeakExternal->unused);
+         << zeros(i->WeakExternal->unused)
+         << num_zeros(CP.getSymbolSize() - COFF::Symbol16Size);
     if (!i->File.empty()) {
+      unsigned SymbolSize = CP.getSymbolSize();
       uint32_t NumberOfAuxRecords =
-          (i->File.size() + COFF::SymbolSize - 1) / COFF::SymbolSize;
-      uint32_t NumberOfAuxBytes = NumberOfAuxRecords * COFF::SymbolSize;
+          (i->File.size() + SymbolSize - 1) / SymbolSize;
+      uint32_t NumberOfAuxBytes = NumberOfAuxRecords * SymbolSize;
       uint32_t NumZeros = NumberOfAuxBytes - i->File.size();
       OS.write(i->File.data(), i->File.size());
-      for (uint32_t Padding = 0; Padding < NumZeros; ++Padding)
-        OS.write(0);
+      OS << num_zeros(NumZeros);
     }
     if (i->SectionDefinition)
       OS << binary_le(i->SectionDefinition->Length)
          << binary_le(i->SectionDefinition->NumberOfRelocations)
          << binary_le(i->SectionDefinition->NumberOfLinenumbers)
          << binary_le(i->SectionDefinition->CheckSum)
-         << binary_le(i->SectionDefinition->Number)
+         << binary_le(static_cast<int16_t>(i->SectionDefinition->Number))
          << binary_le(i->SectionDefinition->Selection)
-         << zeros(i->SectionDefinition->unused);
+         << zeros(i->SectionDefinition->unused)
+         << binary_le(static_cast<int16_t>(i->SectionDefinition->Number >> 16))
+         << num_zeros(CP.getSymbolSize() - COFF::Symbol16Size);
     if (i->CLRToken)
       OS << binary_le(i->CLRToken->AuxType)
          << zeros(i->CLRToken->unused1)
          << binary_le(i->CLRToken->SymbolTableIndex)
-         << zeros(i->CLRToken->unused2);
+         << zeros(i->CLRToken->unused2)
+         << num_zeros(CP.getSymbolSize() - COFF::Symbol16Size);
   }
 
   // Output string table.
-  OS.write(&CP.StringTable[0], CP.StringTable.size());
+  if (CP.Obj.Header.PointerToSymbolTable)
+    OS.write(&CP.StringTable[0], CP.StringTable.size());
   return true;
 }
 
@@ -341,6 +542,10 @@
     return 1;
   }
 
+  if (!layoutOptionalHeader(CP)) {
+    errs() << "yaml2obj: Failed to layout optional header for COFF file!\n";
+    return 1;
+  }
   if (!layoutCOFF(CP)) {
     errs() << "yaml2obj: Failed to layout COFF file!\n";
     return 1;

diff --git a/tools/yaml2obj/yaml2elf.cpp b/tools/yaml2obj/yaml2elf.cpp
index 6eeecae..44c8c12 100644
--- a/tools/yaml2obj/yaml2elf.cpp
+++ b/tools/yaml2obj/yaml2elf.cpp

@@ -62,11 +62,7 @@
 public:
   /// \returns true if name is already present in the map.
   bool addName(StringRef Name, unsigned i) {
-    StringMapEntry<int> &Entry = Map.GetOrCreateValue(Name, -1);
-    if (Entry.getValue() != -1)
-      return true;
-    Entry.setValue((int)i);
-    return false;
+    return !Map.insert(std::make_pair(Name, (int)i)).second;
   }
   /// \returns true if name is not present in the map
   bool lookup(StringRef Name, unsigned &Idx) const {
@@ -190,7 +186,7 @@
 
   for (const auto &Sec : Doc.Sections)
     DotShStrtab.add(Sec->Name);
-  DotShStrtab.finalize();
+  DotShStrtab.finalize(StringTableBuilder::ELF);
 
   for (const auto &Sec : Doc.Sections) {
     zero(SHeader);
@@ -261,7 +257,7 @@
     DotStrtab.add(Sym.Name);
   for (const auto &Sym : Doc.Symbols.Weak)
     DotStrtab.add(Sym.Name);
-  DotStrtab.finalize();
+  DotStrtab.finalize(StringTableBuilder::ELF);
 
   addSymbols(Doc.Symbols.Local, Syms, ELF::STB_LOCAL);
   addSymbols(Doc.Symbols.Global, Syms, ELF::STB_GLOBAL);
@@ -304,7 +300,7 @@
       Symbol.st_shndx = Index;
     } // else Symbol.st_shndex == SHN_UNDEF (== 0), since it was zero'd earlier.
     Symbol.st_value = Sym.Value;
-    Symbol.st_other = Sym.Visibility;
+    Symbol.st_other = Sym.Other;
     Symbol.st_size = Sym.Size;
     Syms.push_back(Symbol);
   }

diff --git a/tools/yaml2obj/yaml2obj.cpp b/tools/yaml2obj/yaml2obj.cpp
index 945fad1..375cd89 100644
--- a/tools/yaml2obj/yaml2obj.cpp
+++ b/tools/yaml2obj/yaml2obj.cpp

@@ -83,11 +83,11 @@
   if (OutputFilename.empty())
     OutputFilename = "-";
 
-  std::string ErrorInfo;
+  std::error_code EC;
   std::unique_ptr<tool_output_file> Out(
-      new tool_output_file(OutputFilename.c_str(), ErrorInfo, sys::fs::F_None));
-  if (!ErrorInfo.empty()) {
-    errs() << ErrorInfo << '\n';
+      new tool_output_file(OutputFilename, EC, sys::fs::F_None));
+  if (EC) {
+    errs() << EC.message() << '\n';
     return 1;
   }
 

diff --git a/tools/yaml2obj/yaml2obj.h b/tools/yaml2obj/yaml2obj.h
index 086f641..7290a9a 100644
--- a/tools/yaml2obj/yaml2obj.h
+++ b/tools/yaml2obj/yaml2obj.h

@@ -9,8 +9,8 @@
 /// \file
 /// \brief Common declarations for yaml2obj
 //===----------------------------------------------------------------------===//
-#ifndef LLVM_TOOLS_YAML2OBJ_H
-#define LLVM_TOOLS_YAML2OBJ_H
+#ifndef LLVM_TOOLS_YAML2OBJ_YAML2OBJ_H
+#define LLVM_TOOLS_YAML2OBJ_YAML2OBJ_H
 
 namespace llvm {
 class raw_ostream;

diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index 8f298cd..c7ec16b 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp

@@ -474,6 +474,40 @@
     f1.fusedMultiplyAdd(f2, f3, APFloat::rmNearestTiesToEven);
     EXPECT_EQ(12.0f, f1.convertToFloat());
   }
+
+  {
+    APFloat M1(APFloat::x87DoubleExtended, 1.0);
+    APFloat M2(APFloat::x87DoubleExtended, 1.0);
+    APFloat A(APFloat::x87DoubleExtended, 3.0);
+
+    bool losesInfo = false;
+    M1.fusedMultiplyAdd(M1, A, APFloat::rmNearestTiesToEven);
+    M1.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &losesInfo);
+    EXPECT_FALSE(losesInfo);
+    EXPECT_EQ(4.0f, M1.convertToFloat());
+  }
+}
+
+TEST(APFloatTest, MinNum) {
+  APFloat f1(1.0);
+  APFloat f2(2.0);
+  APFloat nan = APFloat::getNaN(APFloat::IEEEdouble);
+
+  EXPECT_EQ(1.0, minnum(f1, f2).convertToDouble());
+  EXPECT_EQ(1.0, minnum(f2, f1).convertToDouble());
+  EXPECT_EQ(1.0, minnum(f1, nan).convertToDouble());
+  EXPECT_EQ(1.0, minnum(nan, f1).convertToDouble());
+}
+
+TEST(APFloatTest, MaxNum) {
+  APFloat f1(1.0);
+  APFloat f2(2.0);
+  APFloat nan = APFloat::getNaN(APFloat::IEEEdouble);
+  // Larger operand wins; a NaN operand yields the other operand.
+  EXPECT_EQ(2.0, maxnum(f1, f2).convertToDouble());
+  EXPECT_EQ(2.0, maxnum(f2, f1).convertToDouble());
+  EXPECT_EQ(1.0, maxnum(f1, nan).convertToDouble());
+  EXPECT_EQ(1.0, maxnum(nan, f1).convertToDouble());
 }
 
 TEST(APFloatTest, Denormal) {
@@ -1342,6 +1376,17 @@
   }
 }
 
+TEST(APFloatTest, copySign) {
+  EXPECT_TRUE(APFloat(-42.0).bitwiseIsEqual(
+      APFloat::copySign(APFloat(42.0), APFloat(-1.0))));
+  EXPECT_TRUE(APFloat(42.0).bitwiseIsEqual(
+      APFloat::copySign(APFloat(-42.0), APFloat(1.0))));
+  EXPECT_TRUE(APFloat(-42.0).bitwiseIsEqual(
+      APFloat::copySign(APFloat(-42.0), APFloat(-1.0))));
+  EXPECT_TRUE(APFloat(42.0).bitwiseIsEqual(
+      APFloat::copySign(APFloat(42.0), APFloat(1.0))));
+}
+
 TEST(APFloatTest, convert) {
   bool losesInfo;
   APFloat test(APFloat::IEEEdouble, "1.0");
@@ -2671,4 +2716,123 @@
   }
 }
 
+TEST(APFloatTest, operatorOverloads) {
+  // This is mostly testing that these operator overloads compile.
+  APFloat One = APFloat(APFloat::IEEEsingle, "0x1p+0");
+  APFloat Two = APFloat(APFloat::IEEEsingle, "0x2p+0");
+  EXPECT_TRUE(Two.bitwiseIsEqual(One + One));
+  EXPECT_TRUE(One.bitwiseIsEqual(Two - One));
+  EXPECT_TRUE(Two.bitwiseIsEqual(One * Two));
+  EXPECT_TRUE(One.bitwiseIsEqual(Two / Two));
+}
+
+TEST(APFloatTest, abs) {
+  APFloat PInf = APFloat::getInf(APFloat::IEEEsingle, false);
+  APFloat MInf = APFloat::getInf(APFloat::IEEEsingle, true);
+  APFloat PZero = APFloat::getZero(APFloat::IEEEsingle, false);
+  APFloat MZero = APFloat::getZero(APFloat::IEEEsingle, true);
+  APFloat PQNaN = APFloat::getNaN(APFloat::IEEEsingle, false);
+  APFloat MQNaN = APFloat::getNaN(APFloat::IEEEsingle, true);
+  APFloat PSNaN = APFloat::getSNaN(APFloat::IEEEsingle, false);
+  APFloat MSNaN = APFloat::getSNaN(APFloat::IEEEsingle, true);
+  APFloat PNormalValue = APFloat(APFloat::IEEEsingle, "0x1p+0");
+  APFloat MNormalValue = APFloat(APFloat::IEEEsingle, "-0x1p+0");
+  APFloat PLargestValue = APFloat::getLargest(APFloat::IEEEsingle, false);
+  APFloat MLargestValue = APFloat::getLargest(APFloat::IEEEsingle, true);
+  APFloat PSmallestValue = APFloat::getSmallest(APFloat::IEEEsingle, false);
+  APFloat MSmallestValue = APFloat::getSmallest(APFloat::IEEEsingle, true);
+  APFloat PSmallestNormalized =
+    APFloat::getSmallestNormalized(APFloat::IEEEsingle, false);
+  APFloat MSmallestNormalized =
+    APFloat::getSmallestNormalized(APFloat::IEEEsingle, true);
+
+  EXPECT_TRUE(PInf.bitwiseIsEqual(abs(PInf)));
+  EXPECT_TRUE(PInf.bitwiseIsEqual(abs(MInf)));
+  EXPECT_TRUE(PZero.bitwiseIsEqual(abs(PZero)));
+  EXPECT_TRUE(PZero.bitwiseIsEqual(abs(MZero)));
+  EXPECT_TRUE(PQNaN.bitwiseIsEqual(abs(PQNaN)));
+  EXPECT_TRUE(PQNaN.bitwiseIsEqual(abs(MQNaN)));
+  EXPECT_TRUE(PSNaN.bitwiseIsEqual(abs(PSNaN)));
+  EXPECT_TRUE(PSNaN.bitwiseIsEqual(abs(MSNaN)));
+  EXPECT_TRUE(PNormalValue.bitwiseIsEqual(abs(PNormalValue)));
+  EXPECT_TRUE(PNormalValue.bitwiseIsEqual(abs(MNormalValue)));
+  EXPECT_TRUE(PLargestValue.bitwiseIsEqual(abs(PLargestValue)));
+  EXPECT_TRUE(PLargestValue.bitwiseIsEqual(abs(MLargestValue)));
+  EXPECT_TRUE(PSmallestValue.bitwiseIsEqual(abs(PSmallestValue)));
+  EXPECT_TRUE(PSmallestValue.bitwiseIsEqual(abs(MSmallestValue)));
+  EXPECT_TRUE(PSmallestNormalized.bitwiseIsEqual(abs(PSmallestNormalized)));
+  EXPECT_TRUE(PSmallestNormalized.bitwiseIsEqual(abs(MSmallestNormalized)));
+}
+
+TEST(APFloatTest, ilogb) {
+  EXPECT_EQ(0, ilogb(APFloat(APFloat::IEEEsingle, "0x1p+0")));
+  EXPECT_EQ(0, ilogb(APFloat(APFloat::IEEEsingle, "-0x1p+0")));
+  EXPECT_EQ(42, ilogb(APFloat(APFloat::IEEEsingle, "0x1p+42")));
+  EXPECT_EQ(-42, ilogb(APFloat(APFloat::IEEEsingle, "0x1p-42")));
+
+  EXPECT_EQ(APFloat::IEK_Inf,
+            ilogb(APFloat::getInf(APFloat::IEEEsingle, false)));
+  EXPECT_EQ(APFloat::IEK_Inf,
+            ilogb(APFloat::getInf(APFloat::IEEEsingle, true)));
+  EXPECT_EQ(APFloat::IEK_Zero,
+            ilogb(APFloat::getZero(APFloat::IEEEsingle, false)));
+  EXPECT_EQ(APFloat::IEK_Zero,
+            ilogb(APFloat::getZero(APFloat::IEEEsingle, true)));
+  EXPECT_EQ(APFloat::IEK_NaN,
+            ilogb(APFloat::getNaN(APFloat::IEEEsingle, false)));
+  EXPECT_EQ(APFloat::IEK_NaN,
+            ilogb(APFloat::getSNaN(APFloat::IEEEsingle, false)));
+
+  EXPECT_EQ(127, ilogb(APFloat::getLargest(APFloat::IEEEsingle, false)));
+  EXPECT_EQ(127, ilogb(APFloat::getLargest(APFloat::IEEEsingle, true)));
+  EXPECT_EQ(-126, ilogb(APFloat::getSmallest(APFloat::IEEEsingle, false)));
+  EXPECT_EQ(-126, ilogb(APFloat::getSmallest(APFloat::IEEEsingle, true)));
+  EXPECT_EQ(-126,
+            ilogb(APFloat::getSmallestNormalized(APFloat::IEEEsingle, false)));
+  EXPECT_EQ(-126,
+            ilogb(APFloat::getSmallestNormalized(APFloat::IEEEsingle, true)));
+}
+
+TEST(APFloatTest, scalbn) {
+  EXPECT_TRUE(
+      APFloat(APFloat::IEEEsingle, "0x1p+0")
+          .bitwiseIsEqual(scalbn(APFloat(APFloat::IEEEsingle, "0x1p+0"), 0)));
+  EXPECT_TRUE(
+      APFloat(APFloat::IEEEsingle, "0x1p+42")
+          .bitwiseIsEqual(scalbn(APFloat(APFloat::IEEEsingle, "0x1p+0"), 42)));
+  EXPECT_TRUE(
+      APFloat(APFloat::IEEEsingle, "0x1p-42")
+          .bitwiseIsEqual(scalbn(APFloat(APFloat::IEEEsingle, "0x1p+0"), -42)));
+
+  APFloat PInf = APFloat::getInf(APFloat::IEEEsingle, false);
+  APFloat MInf = APFloat::getInf(APFloat::IEEEsingle, true);
+  APFloat PZero = APFloat::getZero(APFloat::IEEEsingle, false);
+  APFloat MZero = APFloat::getZero(APFloat::IEEEsingle, true);
+  APFloat QPNaN = APFloat::getNaN(APFloat::IEEEsingle, false);
+  APFloat QMNaN = APFloat::getNaN(APFloat::IEEEsingle, true);
+  APFloat SNaN = APFloat::getSNaN(APFloat::IEEEsingle, false);
+
+  EXPECT_TRUE(PInf.bitwiseIsEqual(scalbn(PInf, 0)));
+  EXPECT_TRUE(MInf.bitwiseIsEqual(scalbn(MInf, 0)));
+  EXPECT_TRUE(PZero.bitwiseIsEqual(scalbn(PZero, 0)));
+  EXPECT_TRUE(MZero.bitwiseIsEqual(scalbn(MZero, 0)));
+  EXPECT_TRUE(QPNaN.bitwiseIsEqual(scalbn(QPNaN, 0)));
+  EXPECT_TRUE(QMNaN.bitwiseIsEqual(scalbn(QMNaN, 0)));
+  EXPECT_TRUE(SNaN.bitwiseIsEqual(scalbn(SNaN, 0)));
+  // Out-of-range results saturate to signed infinity or flush to signed zero.
+  EXPECT_TRUE(
+      PInf.bitwiseIsEqual(scalbn(APFloat(APFloat::IEEEsingle, "0x1p+0"), 128)));
+  EXPECT_TRUE(MInf.bitwiseIsEqual(
+      scalbn(APFloat(APFloat::IEEEsingle, "-0x1p+0"), 128)));
+  EXPECT_TRUE(
+      PInf.bitwiseIsEqual(scalbn(APFloat(APFloat::IEEEsingle, "0x1p+127"), 1)));
+  EXPECT_TRUE(PZero.bitwiseIsEqual(
+      scalbn(APFloat(APFloat::IEEEsingle, "0x1p+0"), -127)));
+  EXPECT_TRUE(MZero.bitwiseIsEqual(
+      scalbn(APFloat(APFloat::IEEEsingle, "-0x1p+0"), -127)));
+  EXPECT_TRUE(PZero.bitwiseIsEqual(
+      scalbn(APFloat(APFloat::IEEEsingle, "0x1p-126"), -1)));
+  EXPECT_TRUE(MZero.bitwiseIsEqual(
+      scalbn(APFloat(APFloat::IEEEsingle, "-0x1p-126"), -1)));
+}
 }

diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp
index 19c47ab..8198c71 100644
--- a/unittests/ADT/APIntTest.cpp
+++ b/unittests/ADT/APIntTest.cpp

@@ -614,7 +614,7 @@
     0x7E7FFA5EADD8846ULL,
     0x305F341CA00B613DULL
   };
-  APInt A2(integerPartWidth*4, ArrayRef<integerPart>(E2, 4));
+  APInt A2(integerPartWidth*4, E2);
   for (unsigned i = 0; i < 4; ++i) {
     for (unsigned j = 0; j < integerPartWidth; ++j) {
       EXPECT_EQ(bool(E2[i] & (1ULL << j)),
@@ -653,17 +653,17 @@
 
   // Test round up.
   integerPart I4[4] = {0x0, 0xF, 0x18, 0x0};
-  APInt A4(integerPartWidth*4, ArrayRef<integerPart>(I4, 4));
+  APInt A4(integerPartWidth*4, I4);
   EXPECT_EQ(A4.nearestLogBase2(), A4.ceilLogBase2());
 
   // Test round down.
   integerPart I5[4] = {0x0, 0xF, 0x10, 0x0};
-  APInt A5(integerPartWidth*4, ArrayRef<integerPart>(I5, 4));
+  APInt A5(integerPartWidth*4, I5);
   EXPECT_EQ(A5.nearestLogBase2(), A5.logBase2());
 
   // Test ties round up.
   uint64_t I6[4] = {0x0, 0x0, 0x0, 0x18};
-  APInt A6(integerPartWidth*4, ArrayRef<integerPart>(I6, 4));
+  APInt A6(integerPartWidth*4, I6);
   EXPECT_EQ(A6.nearestLogBase2(), A6.ceilLogBase2());
 
   // Test BitWidth == 1 special cases.
@@ -678,4 +678,21 @@
   EXPECT_EQ(A9.nearestLogBase2(), UINT32_MAX);
 }
 
+TEST(APIntTest, SelfMoveAssignment) {
+  APInt X(32, 0xdeadbeef);
+  X = std::move(X);
+  EXPECT_EQ(32u, X.getBitWidth());
+  EXPECT_EQ(0xdeadbeefULL, X.getLimitedValue());
+
+  uint64_t Bits[] = {0xdeadbeefdeadbeefULL, 0xdeadbeefdeadbeefULL};
+  APInt Y(128, Bits);
+  Y = std::move(Y);
+  EXPECT_EQ(128u, Y.getBitWidth());
+  EXPECT_EQ(~0ULL, Y.getLimitedValue());
+  const uint64_t *Raw = Y.getRawData();
+  EXPECT_EQ(2u, Y.getNumWords());
+  EXPECT_EQ(0xdeadbeefdeadbeefULL, Raw[0]);
+  EXPECT_EQ(0xdeadbeefdeadbeefULL, Raw[1]);
+}
+
 }

diff --git a/unittests/ADT/ArrayRefTest.cpp b/unittests/ADT/ArrayRefTest.cpp
index 293afc6..f9c98a5 100644
--- a/unittests/ADT/ArrayRefTest.cpp
+++ b/unittests/ADT/ArrayRefTest.cpp

@@ -13,6 +13,23 @@
 #include "gtest/gtest.h"
 using namespace llvm;
 
+// Check that the ArrayRef-of-pointer converting constructor only allows adding
+// cv qualifiers (not removing them, or otherwise changing the type)
+static_assert(
+    std::is_convertible<ArrayRef<int *>, ArrayRef<const int *>>::value,
+    "Adding const");
+static_assert(
+    std::is_convertible<ArrayRef<int *>, ArrayRef<volatile int *>>::value,
+    "Adding volatile");
+static_assert(!std::is_convertible<ArrayRef<int *>, ArrayRef<float *>>::value,
+              "Changing pointer of one type to a pointer of another");
+static_assert(
+    !std::is_convertible<ArrayRef<const int *>, ArrayRef<int *>>::value,
+    "Removing const");
+static_assert(
+    !std::is_convertible<ArrayRef<volatile int *>, ArrayRef<int *>>::value,
+    "Removing volatile");
+
 namespace llvm {
 
 TEST(ArrayRefTest, AllocatorCopy) {
@@ -36,5 +53,41 @@
   EXPECT_TRUE(AR1.drop_back().equals(AR2));
 }
 
+TEST(ArrayRefTest, Equals) {
+  static const int A1[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  ArrayRef<int> AR1(A1);
+  EXPECT_TRUE(AR1.equals(1, 2, 3, 4, 5, 6, 7, 8));
+  EXPECT_FALSE(AR1.equals(8, 1, 2, 4, 5, 6, 6, 7));
+  EXPECT_FALSE(AR1.equals(2, 4, 5, 6, 6, 7, 8, 1));
+  EXPECT_FALSE(AR1.equals(0, 1, 2, 4, 5, 6, 6, 7));
+  EXPECT_FALSE(AR1.equals(1, 2, 42, 4, 5, 6, 7, 8));
+  EXPECT_FALSE(AR1.equals(42, 2, 3, 4, 5, 6, 7, 8));
+  EXPECT_FALSE(AR1.equals(1, 2, 3, 4, 5, 6, 7, 42));
+  EXPECT_FALSE(AR1.equals(1, 2, 3, 4, 5, 6, 7));
+  EXPECT_FALSE(AR1.equals(1, 2, 3, 4, 5, 6, 7, 8, 9));
+
+  ArrayRef<int> AR1a = AR1.drop_back();
+  EXPECT_TRUE(AR1a.equals(1, 2, 3, 4, 5, 6, 7));
+  EXPECT_FALSE(AR1a.equals(1, 2, 3, 4, 5, 6, 7, 8));
+
+  ArrayRef<int> AR1b = AR1a.slice(2, 4);
+  EXPECT_TRUE(AR1b.equals(3, 4, 5, 6));
+  EXPECT_FALSE(AR1b.equals(2, 3, 4, 5, 6));
+  EXPECT_FALSE(AR1b.equals(3, 4, 5, 6, 7));
+}
+
+TEST(ArrayRefTest, EmptyEquals) {
+  EXPECT_TRUE(ArrayRef<unsigned>() == ArrayRef<unsigned>());
+}
+
+TEST(ArrayRefTest, ConstConvert) {
+  int buf[4];
+  for (int i = 0; i < 4; ++i)
+    buf[i] = i;
+
+  static int *A[] = {&buf[0], &buf[1], &buf[2], &buf[3]};
+  ArrayRef<const int *> a((ArrayRef<int *>(A)));
+  a = ArrayRef<int *>(A);
+}
 
 } // end anonymous namespace

diff --git a/unittests/ADT/CMakeLists.txt b/unittests/ADT/CMakeLists.txt
index 0f214f3..d899852 100644
--- a/unittests/ADT/CMakeLists.txt
+++ b/unittests/ADT/CMakeLists.txt

@@ -13,6 +13,7 @@
   DenseMapTest.cpp
   DenseSetTest.cpp
   FoldingSet.cpp
+  FunctionRefTest.cpp
   HashingTest.cpp
   ilistTest.cpp
   ImmutableMapTest.cpp
@@ -26,6 +27,7 @@
   PackedVectorTest.cpp
   PointerIntPairTest.cpp
   PointerUnionTest.cpp
+  PostOrderIteratorTest.cpp
   SCCIteratorTest.cpp
   SmallPtrSetTest.cpp
   SmallStringTest.cpp

diff --git a/unittests/ADT/DenseMapTest.cpp b/unittests/ADT/DenseMapTest.cpp
index 75a910a..f497983 100644
--- a/unittests/ADT/DenseMapTest.cpp
+++ b/unittests/ADT/DenseMapTest.cpp

@@ -244,6 +244,11 @@
 
   EXPECT_EQ(1u, copyMap.size());
   EXPECT_EQ(this->getValue(), copyMap[this->getKey()]);
+
+  // test self-assignment.
+  copyMap = copyMap;
+  EXPECT_EQ(1u, copyMap.size());
+  EXPECT_EQ(this->getValue(), copyMap[this->getKey()]);
 }
 
 // Test swap method

diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp
index 154c589..5952353 100644
--- a/unittests/ADT/DenseSetTest.cpp
+++ b/unittests/ADT/DenseSetTest.cpp

@@ -27,4 +27,42 @@
   EXPECT_EQ(0u, set.count(2));
 }
 
+struct TestDenseSetInfo {
+  static inline unsigned getEmptyKey() { return ~0; }
+  static inline unsigned getTombstoneKey() { return ~0U - 1; }
+  static unsigned getHashValue(const unsigned& Val) { return Val * 37U; }
+  static unsigned getHashValue(const char* Val) {
+    return (unsigned)(Val[0] - 'a') * 37U;
+  }
+  static bool isEqual(const unsigned& LHS, const unsigned& RHS) {
+    return LHS == RHS;
+  }
+  static bool isEqual(const char* LHS, const unsigned& RHS) {
+    return (unsigned)(LHS[0] - 'a') == RHS;
+  }
+};
+
+TEST(DenseSetCustomTest, FindAsTest) {
+  DenseSet<unsigned, TestDenseSetInfo> set;
+  set.insert(0);
+  set.insert(1);
+  set.insert(2);
+
+  // Size tests
+  EXPECT_EQ(3u, set.size());
+
+  // Normal lookup tests
+  EXPECT_EQ(1u, set.count(1));
+  EXPECT_EQ(0u, *set.find(0));
+  EXPECT_EQ(1u, *set.find(1));
+  EXPECT_EQ(2u, *set.find(2));
+  EXPECT_TRUE(set.find(3) == set.end());
+
+  // find_as() tests
+  EXPECT_EQ(0u, *set.find_as("a"));
+  EXPECT_EQ(1u, *set.find_as("b"));
+  EXPECT_EQ(2u, *set.find_as("c"));
+  EXPECT_TRUE(set.find_as("d") == set.end());
+}
+
 }

diff --git a/unittests/ADT/FunctionRefTest.cpp b/unittests/ADT/FunctionRefTest.cpp
new file mode 100644
index 0000000..075d9a0
--- /dev/null
+++ b/unittests/ADT/FunctionRefTest.cpp

@@ -0,0 +1,28 @@
+//===- llvm/unittest/ADT/FunctionRefTest.cpp - function_ref unit tests ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+// Ensure that copies of a function_ref copy the underlying state rather than
+// causing one function_ref to chain to the next.
+TEST(FunctionRefTest, Copy) {
+  auto A = [] { return 1; };
+  auto B = [] { return 2; };
+  function_ref<int()> X = A;
+  function_ref<int()> Y = X;
+  X = B;
+  EXPECT_EQ(1, Y());
+}
+
+}

diff --git a/unittests/ADT/MapVectorTest.cpp b/unittests/ADT/MapVectorTest.cpp
index 11178bc..8919799 100644
--- a/unittests/ADT/MapVectorTest.cpp
+++ b/unittests/ADT/MapVectorTest.cpp

@@ -9,6 +9,7 @@
 
 #include "gtest/gtest.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/iterator_range.h"
 #include <utility>
 
 using namespace llvm;
@@ -53,3 +54,71 @@
   EXPECT_EQ(MV[1], 2);
   EXPECT_EQ(MV[4], 7);
 }
+
+TEST(MapVectorTest, erase) {
+  MapVector<int, int> MV;
+
+  MV.insert(std::make_pair(1, 2));
+  MV.insert(std::make_pair(3, 4));
+  MV.insert(std::make_pair(5, 6));
+  ASSERT_EQ(MV.size(), 3u);
+
+  MV.erase(MV.find(1));
+  ASSERT_EQ(MV.size(), 2u);
+  ASSERT_EQ(MV.find(1), MV.end());
+  ASSERT_EQ(MV[3], 4);
+  ASSERT_EQ(MV[5], 6);
+
+  ASSERT_EQ(MV.erase(3), 1u);
+  ASSERT_EQ(MV.size(), 1u);
+  ASSERT_EQ(MV.find(3), MV.end());
+  ASSERT_EQ(MV[5], 6);
+
+  ASSERT_EQ(MV.erase(79), 0u);
+  ASSERT_EQ(MV.size(), 1u);
+}
+
+TEST(MapVectorTest, remove_if) {
+  MapVector<int, int> MV;
+
+  MV.insert(std::make_pair(1, 11));
+  MV.insert(std::make_pair(2, 12));
+  MV.insert(std::make_pair(3, 13));
+  MV.insert(std::make_pair(4, 14));
+  MV.insert(std::make_pair(5, 15));
+  MV.insert(std::make_pair(6, 16));
+  ASSERT_EQ(MV.size(), 6u);
+
+  MV.remove_if([](const std::pair<int, int> &Val) { return Val.second % 2; });
+  ASSERT_EQ(MV.size(), 3u);
+  ASSERT_EQ(MV.find(1), MV.end());
+  ASSERT_EQ(MV.find(3), MV.end());
+  ASSERT_EQ(MV.find(5), MV.end());
+  ASSERT_EQ(MV[2], 12);
+  ASSERT_EQ(MV[4], 14);
+  ASSERT_EQ(MV[6], 16);
+}
+
+TEST(MapVectorTest, iteration_test) {
+  MapVector<int, int> MV;
+
+  MV.insert(std::make_pair(1, 11));
+  MV.insert(std::make_pair(2, 12));
+  MV.insert(std::make_pair(3, 13));
+  MV.insert(std::make_pair(4, 14));
+  MV.insert(std::make_pair(5, 15));
+  MV.insert(std::make_pair(6, 16));
+  ASSERT_EQ(MV.size(), 6u);
+
+  int count = 1;
+  for (auto P : make_range(MV.begin(), MV.end())) {
+    ASSERT_EQ(P.first, count);
+    count++;
+  }
+
+  count = 6;
+  for (auto P : make_range(MV.rbegin(), MV.rend())) {
+    ASSERT_EQ(P.first, count);
+    count--;
+  }
+}

diff --git a/unittests/ADT/OptionalTest.cpp b/unittests/ADT/OptionalTest.cpp
index 2da408c..cadadce 100644
--- a/unittests/ADT/OptionalTest.cpp
+++ b/unittests/ADT/OptionalTest.cpp

@@ -169,6 +169,52 @@
   EXPECT_EQ(0u, NonDefaultConstructible::Destructions);
 }
 
+TEST_F(OptionalTest, GetValueOr) {
+  Optional<int> A;
+  EXPECT_EQ(42, A.getValueOr(42));
+
+  A = 5;
+  EXPECT_EQ(5, A.getValueOr(42));
+}
+
+struct MultiArgConstructor {
+  int x, y;
+  MultiArgConstructor(int x, int y) : x(x), y(y) {}
+  explicit MultiArgConstructor(int x, bool positive)
+    : x(x), y(positive ? x : -x) {}
+
+  MultiArgConstructor(const MultiArgConstructor &) LLVM_DELETED_FUNCTION;
+  MultiArgConstructor(MultiArgConstructor &&) LLVM_DELETED_FUNCTION;
+  MultiArgConstructor &operator=(const MultiArgConstructor &) LLVM_DELETED_FUNCTION;
+  MultiArgConstructor &operator=(MultiArgConstructor &&) LLVM_DELETED_FUNCTION;
+
+  static unsigned Destructions;
+  ~MultiArgConstructor() {
+    ++Destructions;
+  }
+  static void ResetCounts() {
+    Destructions = 0;
+  }
+};
+unsigned MultiArgConstructor::Destructions = 0;
+
+TEST_F(OptionalTest, Emplace) {
+  MultiArgConstructor::ResetCounts();
+  Optional<MultiArgConstructor> A;
+  
+  A.emplace(1, 2);
+  EXPECT_TRUE(A.hasValue());
+  EXPECT_EQ(1, A->x);
+  EXPECT_EQ(2, A->y);
+  EXPECT_EQ(0u, MultiArgConstructor::Destructions);
+
+  A.emplace(5, false);
+  EXPECT_TRUE(A.hasValue());
+  EXPECT_EQ(5, A->x);
+  EXPECT_EQ(-5, A->y);
+  EXPECT_EQ(1u, MultiArgConstructor::Destructions);
+}
+
 struct MoveOnly {
   static unsigned MoveConstructions;
   static unsigned Destructions;
@@ -278,5 +324,58 @@
   EXPECT_EQ(1u, MoveOnly::Destructions);
 }
 
+struct Immovable {
+  static unsigned Constructions;
+  static unsigned Destructions;
+  int val;
+  explicit Immovable(int val) : val(val) {
+    ++Constructions;
+  }
+  ~Immovable() {
+    ++Destructions;
+  }
+  static void ResetCounts() {
+    Constructions = 0;
+    Destructions = 0;
+  }
+private:
+  // This should disable all move/copy operations.
+  Immovable(Immovable&& other) LLVM_DELETED_FUNCTION;
+};
+
+unsigned Immovable::Constructions = 0;
+unsigned Immovable::Destructions = 0;
+
+TEST_F(OptionalTest, ImmovableEmplace) {
+  Optional<Immovable> A;
+  Immovable::ResetCounts();
+  A.emplace(4);
+  EXPECT_TRUE((bool)A);
+  EXPECT_EQ(4, A->val);
+  EXPECT_EQ(1u, Immovable::Constructions);
+  EXPECT_EQ(0u, Immovable::Destructions);
+}
+
+#if LLVM_HAS_RVALUE_REFERENCE_THIS
+
+TEST_F(OptionalTest, MoveGetValueOr) {
+  Optional<MoveOnly> A;
+
+  MoveOnly::ResetCounts();
+  EXPECT_EQ(42, std::move(A).getValueOr(MoveOnly(42)).val);
+  EXPECT_EQ(1u, MoveOnly::MoveConstructions);
+  EXPECT_EQ(0u, MoveOnly::MoveAssignments);
+  EXPECT_EQ(2u, MoveOnly::Destructions);
+
+  A = MoveOnly(5);
+  MoveOnly::ResetCounts();
+  EXPECT_EQ(5, std::move(A).getValueOr(MoveOnly(42)).val);
+  EXPECT_EQ(1u, MoveOnly::MoveConstructions);
+  EXPECT_EQ(0u, MoveOnly::MoveAssignments);
+  EXPECT_EQ(2u, MoveOnly::Destructions);
+}
+
+#endif // LLVM_HAS_RVALUE_REFERENCE_THIS
+
 } // end anonymous namespace
 

diff --git a/unittests/ADT/PostOrderIteratorTest.cpp b/unittests/ADT/PostOrderIteratorTest.cpp
new file mode 100644
index 0000000..1da1078
--- /dev/null
+++ b/unittests/ADT/PostOrderIteratorTest.cpp

@@ -0,0 +1,37 @@
+//===- PostOrderIteratorTest.cpp - PostOrderIterator unit tests -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "gtest/gtest.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+using namespace llvm;
+
+namespace {
+
+// Whether we're able to compile
+TEST(PostOrderIteratorTest, Compiles) {
+  typedef SmallPtrSet<void *, 4> ExtSetTy;
+
+  // Tests that template specializations are kept up to date
+  void *Null = nullptr;
+  po_iterator_storage<std::set<void *>, false> PIS;
+  PIS.insertEdge(Null, Null);
+  ExtSetTy Ext;
+  po_iterator_storage<ExtSetTy, true> PISExt(Ext);
+  PISExt.insertEdge(Null, Null);
+
+  // Test above, but going through po_iterator (which inherits from template
+  // base)
+  BasicBlock *NullBB = nullptr;
+  auto PI = po_end(NullBB);
+  PI.insertEdge(NullBB, NullBB);
+  auto PIExt = po_ext_end(NullBB, Ext);
+  PIExt.insertEdge(NullBB, NullBB);
+}
+}

diff --git a/unittests/ADT/StringMapTest.cpp b/unittests/ADT/StringMapTest.cpp
index 028375d..33d668f 100644
--- a/unittests/ADT/StringMapTest.cpp
+++ b/unittests/ADT/StringMapTest.cpp

@@ -250,15 +250,21 @@
 
 TEST_F(StringMapTest, NonDefaultConstructable) {
   StringMap<StringMapTestStruct> t;
-  t.GetOrCreateValue("Test", StringMapTestStruct(123));
+  t.insert(std::make_pair("Test", StringMapTestStruct(123)));
   StringMap<StringMapTestStruct>::iterator iter = t.find("Test");
   ASSERT_NE(iter, t.end());
   ASSERT_EQ(iter->second.i, 123);
 }
 
+struct Immovable {
+  Immovable() {}
+  Immovable(Immovable&&) LLVM_DELETED_FUNCTION; // will disable the other special members
+};
+
 struct MoveOnly {
   int i;
   MoveOnly(int i) : i(i) {}
+  MoveOnly(const Immovable&) : i(0) {}
   MoveOnly(MoveOnly &&RHS) : i(RHS.i) {}
   MoveOnly &operator=(MoveOnly &&RHS) {
     i = RHS.i;
@@ -270,17 +276,23 @@
   MoveOnly &operator=(const MoveOnly &) LLVM_DELETED_FUNCTION;
 };
 
-TEST_F(StringMapTest, MoveOnlyKey) {
+TEST_F(StringMapTest, MoveOnly) {
   StringMap<MoveOnly> t;
-  t.GetOrCreateValue("Test", MoveOnly(42));
+  t.insert(std::make_pair("Test", MoveOnly(42)));
   StringRef Key = "Test";
   StringMapEntry<MoveOnly>::Create(Key, MoveOnly(42))
       ->Destroy();
 }
 
+TEST_F(StringMapTest, CtorArg) {
+  StringRef Key = "Test";
+  StringMapEntry<MoveOnly>::Create(Key, Immovable())
+      ->Destroy();
+}
+
 TEST_F(StringMapTest, MoveConstruct) {
   StringMap<int> A;
-  A.GetOrCreateValue("x", 42);
+  A["x"] = 42;
   StringMap<int> B = std::move(A);
   ASSERT_EQ(A.size(), 0u);
   ASSERT_EQ(B.size(), 1u);
@@ -325,7 +337,7 @@
 TEST_F(StringMapTest, MoveDtor) {
   int InstanceCount = 0;
   StringMap<Countable> A;
-  A.GetOrCreateValue("x", Countable(42, InstanceCount));
+  A.insert(std::make_pair("x", Countable(42, InstanceCount)));
   ASSERT_EQ(InstanceCount, 1);
   auto I = A.find("x");
   ASSERT_NE(I, A.end());

diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp
index 2e9d585..cacbde6 100644
--- a/unittests/ADT/TripleTest.cpp
+++ b/unittests/ADT/TripleTest.cpp

@@ -129,6 +129,36 @@
   EXPECT_EQ(Triple::UnknownOS, T.getOS());
   EXPECT_EQ(Triple::EABI, T.getEnvironment());
 
+  T = Triple("amdil-unknown-unknown");
+  EXPECT_EQ(Triple::amdil, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::UnknownOS, T.getOS());
+
+  T = Triple("amdil64-unknown-unknown");
+  EXPECT_EQ(Triple::amdil64, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::UnknownOS, T.getOS());
+
+  T = Triple("hsail-unknown-unknown");
+  EXPECT_EQ(Triple::hsail, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::UnknownOS, T.getOS());
+
+  T = Triple("hsail64-unknown-unknown");
+  EXPECT_EQ(Triple::hsail64, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::UnknownOS, T.getOS());
+
+  T = Triple("spir-unknown-unknown");
+  EXPECT_EQ(Triple::spir, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::UnknownOS, T.getOS());
+
+  T = Triple("spir64-unknown-unknown");
+  EXPECT_EQ(Triple::spir64, T.getArch());
+  EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
+  EXPECT_EQ(Triple::UnknownOS, T.getOS());
+
   T = Triple("huh");
   EXPECT_EQ(Triple::UnknownArch, T.getArch());
 }
@@ -190,7 +220,7 @@
          ++Vendor) {
       C[1] = Triple::getVendorTypeName(Triple::VendorType(Vendor));
       for (int OS = 1+Triple::UnknownOS; OS <= Triple::Minix; ++OS) {
-        if (OS == Triple::Cygwin || OS == Triple::MinGW32 || OS == Triple::Win32)
+        if (OS == Triple::Win32)
           continue;
 
         C[2] = Triple::getOSTypeName(Triple::OSType(OS));
@@ -341,6 +371,36 @@
   EXPECT_FALSE(T.isArch16Bit());
   EXPECT_FALSE(T.isArch32Bit());
   EXPECT_TRUE(T.isArch64Bit());
+
+  T.setArch(Triple::amdil);
+  EXPECT_FALSE(T.isArch16Bit());
+  EXPECT_TRUE(T.isArch32Bit());
+  EXPECT_FALSE(T.isArch64Bit());
+
+  T.setArch(Triple::amdil64);
+  EXPECT_FALSE(T.isArch16Bit());
+  EXPECT_FALSE(T.isArch32Bit());
+  EXPECT_TRUE(T.isArch64Bit());
+
+  T.setArch(Triple::hsail);
+  EXPECT_FALSE(T.isArch16Bit());
+  EXPECT_TRUE(T.isArch32Bit());
+  EXPECT_FALSE(T.isArch64Bit());
+
+  T.setArch(Triple::hsail64);
+  EXPECT_FALSE(T.isArch16Bit());
+  EXPECT_FALSE(T.isArch32Bit());
+  EXPECT_TRUE(T.isArch64Bit());
+
+  T.setArch(Triple::spir);
+  EXPECT_FALSE(T.isArch16Bit());
+  EXPECT_TRUE(T.isArch32Bit());
+  EXPECT_FALSE(T.isArch64Bit());
+
+  T.setArch(Triple::spir64);
+  EXPECT_FALSE(T.isArch16Bit());
+  EXPECT_FALSE(T.isArch32Bit());
+  EXPECT_TRUE(T.isArch64Bit());
 }
 
 TEST(TripleTest, BitWidthArchVariants) {
@@ -399,6 +459,30 @@
   T.setArch(Triple::x86_64);
   EXPECT_EQ(Triple::x86, T.get32BitArchVariant().getArch());
   EXPECT_EQ(Triple::x86_64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::amdil);
+  EXPECT_EQ(Triple::amdil, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::amdil64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::amdil64);
+  EXPECT_EQ(Triple::amdil, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::amdil64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::hsail);
+  EXPECT_EQ(Triple::hsail, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::hsail64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::hsail64);
+  EXPECT_EQ(Triple::hsail, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::hsail64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::spir);
+  EXPECT_EQ(Triple::spir, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::spir64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::spir64);
+  EXPECT_EQ(Triple::spir, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::spir64, T.get64BitArchVariant().getArch());
 }
 
 TEST(TripleTest, getOSVersion) {
@@ -564,4 +648,20 @@
 
   EXPECT_EQ("i686-pc-windows-elf", Triple::normalize("i686-pc-windows-elf-elf"));
 }
+
+TEST(TripleTest, getARMCPUForArch) {
+  {
+    llvm::Triple Triple("armv6-unknown-freebsd");
+    EXPECT_STREQ("arm1176jzf-s", Triple.getARMCPUForArch());
+  }
+  {
+    llvm::Triple Triple("armv7s-apple-ios7");
+    EXPECT_STREQ("swift", Triple.getARMCPUForArch());
+  }
+  {
+    llvm::Triple Triple("armv7-apple-ios7");
+    EXPECT_STREQ("cortex-a8", Triple.getARMCPUForArch());
+    EXPECT_STREQ("swift", Triple.getARMCPUForArch("armv7s"));
+  }
+}
 }

diff --git a/unittests/Analysis/CFGTest.cpp b/unittests/Analysis/CFGTest.cpp
index ac5e710..dba9d49 100644
--- a/unittests/Analysis/CFGTest.cpp
+++ b/unittests/Analysis/CFGTest.cpp

@@ -30,20 +30,16 @@
 class IsPotentiallyReachableTest : public testing::Test {
 protected:
   void ParseAssembly(const char *Assembly) {
-    M.reset(new Module("Module", getGlobalContext()));
-
     SMDiagnostic Error;
-    bool Parsed = ParseAssemblyString(Assembly, M.get(),
-                                      Error, M->getContext()) == M.get();
+    M = parseAssemblyString(Assembly, Error, getGlobalContext());
 
     std::string errMsg;
     raw_string_ostream os(errMsg);
     Error.print("", os);
 
-    if (!Parsed) {
-      // A failure here means that the test itself is buggy.
+    // A failure here means that the test itself is buggy.
+    if (!M)
       report_fatal_error(os.str().c_str());
-    }
 
     Function *F = M->getFunction("test");
     if (F == nullptr)

diff --git a/unittests/Analysis/CMakeLists.txt b/unittests/Analysis/CMakeLists.txt
index 8454860..baf0c28 100644
--- a/unittests/Analysis/CMakeLists.txt
+++ b/unittests/Analysis/CMakeLists.txt

@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  IPA
   Analysis
   AsmParser
   Core
@@ -6,6 +7,7 @@
   )
 
 add_llvm_unittest(AnalysisTests
+  CallGraphTest.cpp
   CFGTest.cpp
   LazyCallGraphTest.cpp
   ScalarEvolutionTest.cpp

diff --git a/unittests/Analysis/CallGraphTest.cpp b/unittests/Analysis/CallGraphTest.cpp
new file mode 100644
index 0000000..777907a
--- /dev/null
+++ b/unittests/Analysis/CallGraphTest.cpp

@@ -0,0 +1,59 @@
+//=======- CallGraphTest.cpp - Unit tests for the CG analysis -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+template <typename Ty> void canSpecializeGraphTraitsIterators(Ty *G) {
+  typedef typename GraphTraits<Ty *>::NodeType NodeTy;
+
+  auto I = GraphTraits<Ty *>::nodes_begin(G);
+  auto E = GraphTraits<Ty *>::nodes_end(G);
+  auto X = ++I;
+
+  // Should be able to iterate over all nodes of the graph.
+  static_assert(std::is_same<decltype(*I), NodeTy &>::value,
+                "Node type does not match");
+  static_assert(std::is_same<decltype(*X), NodeTy &>::value,
+                "Node type does not match");
+  static_assert(std::is_same<decltype(*E), NodeTy &>::value,
+                "Node type does not match");
+
+  NodeTy *N = GraphTraits<Ty *>::getEntryNode(G);
+
+  auto S = GraphTraits<NodeTy *>::child_begin(N);
+  auto F = GraphTraits<NodeTy *>::child_end(N);
+
+  // Should be able to iterate over immediate successors of a node.
+  static_assert(std::is_same<decltype(*S), NodeTy *>::value,
+                "Node type does not match");
+  static_assert(std::is_same<decltype(*F), NodeTy *>::value,
+                "Node type does not match");
+}
+
+TEST(CallGraphTest, GraphTraitsSpecialization) {
+  Module M("", getGlobalContext());
+  CallGraph CG(M);
+
+  canSpecializeGraphTraitsIterators(&CG);
+}
+
+TEST(CallGraphTest, GraphTraitsConstSpecialization) {
+  Module M("", getGlobalContext());
+  CallGraph CG(M);
+
+  canSpecializeGraphTraitsIterators(const_cast<const CallGraph *>(&CG));
+}
+}

diff --git a/unittests/Analysis/LazyCallGraphTest.cpp b/unittests/Analysis/LazyCallGraphTest.cpp
index d7c7045..6caccb8 100644
--- a/unittests/Analysis/LazyCallGraphTest.cpp
+++ b/unittests/Analysis/LazyCallGraphTest.cpp

@@ -22,38 +22,38 @@
 namespace {
 
 std::unique_ptr<Module> parseAssembly(const char *Assembly) {
-  auto M = make_unique<Module>("Module", getGlobalContext());
-
   SMDiagnostic Error;
-  bool Parsed =
-      ParseAssemblyString(Assembly, M.get(), Error, M->getContext()) == M.get();
+  std::unique_ptr<Module> M =
+      parseAssemblyString(Assembly, Error, getGlobalContext());
 
   std::string ErrMsg;
   raw_string_ostream OS(ErrMsg);
   Error.print("", OS);
 
   // A failure here means that the test itself is buggy.
-  if (!Parsed)
+  if (!M)
     report_fatal_error(OS.str().c_str());
 
   return M;
 }
 
-// IR forming a call graph with a diamond of triangle-shaped SCCs:
-//
-//         d1
-//        /  \
-//       d3--d2
-//      /     \
-//     b1     c1
-//   /  \    /  \
-//  b3--b2  c3--c2
-//       \  /
-//        a1
-//       /  \
-//      a3--a2
-//
-// All call edges go up between SCCs, and clockwise around the SCC.
+/*
+   IR forming a call graph with a diamond of triangle-shaped SCCs:
+
+           d1
+          /  \
+         d3--d2
+        /     \
+       b1     c1
+     /  \    /  \
+    b3--b2  c3--c2
+         \  /
+          a1
+         /  \
+        a3--a2
+
+   All call edges go up between SCCs, and clockwise around the SCC.
+ */
 static const char DiamondOfTriangles[] =
      "define void @a1() {\n"
      "entry:\n"

diff --git a/unittests/Analysis/Makefile b/unittests/Analysis/Makefile
index 527f452..52296e7 100644
--- a/unittests/Analysis/Makefile
+++ b/unittests/Analysis/Makefile

@@ -9,7 +9,7 @@
 
 LEVEL = ../..
 TESTNAME = Analysis
-LINK_COMPONENTS := analysis asmparser
+LINK_COMPONENTS := ipa analysis asmparser
 
 include $(LEVEL)/Makefile.config
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest

diff --git a/unittests/Analysis/MixedTBAATest.cpp b/unittests/Analysis/MixedTBAATest.cpp
index 142e047..d7935e3 100644
--- a/unittests/Analysis/MixedTBAATest.cpp
+++ b/unittests/Analysis/MixedTBAATest.cpp

@@ -65,7 +65,7 @@
   // Run the TBAA eval pass on a mixture of path-aware and non-path-aware TBAA.
   // The order of the metadata (path-aware vs non-path-aware) is important,
   // because the AA eval pass only runs one test per store-pair.
-  const char* args[] = { "MixedTBAATest", "-evaluate-tbaa" };
+  const char* args[] = { "MixedTBAATest", "-evaluate-aa-metadata" };
   cl::ParseCommandLineOptions(sizeof(args) / sizeof(const char*), args);
   PM.add(createTypeBasedAliasAnalysisPass());
   PM.add(createAAEvalPass());

diff --git a/unittests/Bitcode/BitReaderTest.cpp b/unittests/Bitcode/BitReaderTest.cpp
index b6a3e9a..6eb40d6 100644
--- a/unittests/Bitcode/BitReaderTest.cpp
+++ b/unittests/Bitcode/BitReaderTest.cpp

@@ -10,58 +10,158 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Bitcode/BitstreamWriter.h"
 #include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/PassManager.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
-namespace llvm {
+using namespace llvm;
+
 namespace {
 
-static Module *makeLLVMModule() {
-  Module* Mod = new Module("test-mem", getGlobalContext());
+std::unique_ptr<Module> parseAssembly(const char *Assembly) {
+  SMDiagnostic Error;
+  std::unique_ptr<Module> M =
+      parseAssemblyString(Assembly, Error, getGlobalContext());
 
-  FunctionType* FuncTy =
-    FunctionType::get(Type::getVoidTy(Mod->getContext()), false);
-  Function* Func = Function::Create(FuncTy,GlobalValue::ExternalLinkage,
-                                    "func", Mod);
+  std::string ErrMsg;
+  raw_string_ostream OS(ErrMsg);
+  Error.print("", OS);
 
-  BasicBlock* Entry = BasicBlock::Create(Mod->getContext(), "entry", Func);
-  new UnreachableInst(Mod->getContext(), Entry);
+  // A failure here means that the test itself is buggy.
+  if (!M)
+    report_fatal_error(OS.str().c_str());
 
-  BasicBlock* BB = BasicBlock::Create(Mod->getContext(), "bb", Func);
-  new UnreachableInst(Mod->getContext(), BB);
-
-  PointerType* Int8Ptr = Type::getInt8PtrTy(Mod->getContext());
-  new GlobalVariable(*Mod, Int8Ptr, /*isConstant=*/true,
-                     GlobalValue::ExternalLinkage,
-                     BlockAddress::get(BB), "table");
-
-  return Mod;
+  return M;
 }
 
-static void writeModuleToBuffer(SmallVectorImpl<char> &Buffer) {
-  std::unique_ptr<Module> Mod(makeLLVMModule());
+static void writeModuleToBuffer(std::unique_ptr<Module> Mod,
+                                SmallVectorImpl<char> &Buffer) {
   raw_svector_ostream OS(Buffer);
   WriteBitcodeToFile(Mod.get(), OS);
 }
 
-TEST(BitReaderTest, MaterializeFunctionsForBlockAddr) { // PR11677
-  SmallString<1024> Mem;
-  writeModuleToBuffer(Mem);
-  MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(Mem.str(), "test", false);
+static std::unique_ptr<Module> getLazyModuleFromAssembly(LLVMContext &Context,
+                                                         SmallString<1024> &Mem,
+                                                         const char *Assembly) {
+  writeModuleToBuffer(parseAssembly(Assembly), Mem);
+  std::unique_ptr<MemoryBuffer> Buffer =
+      MemoryBuffer::getMemBuffer(Mem.str(), "test", false);
   ErrorOr<Module *> ModuleOrErr =
-      getLazyBitcodeModule(Buffer, getGlobalContext());
-  std::unique_ptr<Module> m(ModuleOrErr.get());
-  PassManager passes;
-  passes.add(createVerifierPass());
-  passes.add(createDebugInfoVerifierPass());
-  passes.run(*m);
+      getLazyBitcodeModule(std::move(Buffer), Context);
+  return std::unique_ptr<Module>(ModuleOrErr.get());
 }
 
+TEST(BitReaderTest, DematerializeFunctionPreservesLinkageType) {
+  SmallString<1024> Mem;
+
+  LLVMContext Context;
+  std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+      Context, Mem, "define internal i32 @func() {\n"
+                      "ret i32 0\n"
+                    "}\n");
+
+  EXPECT_FALSE(verifyModule(*M, &dbgs()));
+
+  M->getFunction("func")->materialize();
+  EXPECT_FALSE(M->getFunction("func")->empty());
+  EXPECT_TRUE(M->getFunction("func")->getLinkage() ==
+              GlobalValue::InternalLinkage);
+
+  // Check that the linkage type is preserved after dematerialization.
+  M->getFunction("func")->Dematerialize();
+  EXPECT_TRUE(M->getFunction("func")->empty());
+  EXPECT_TRUE(M->getFunction("func")->getLinkage() ==
+              GlobalValue::InternalLinkage);
+  EXPECT_FALSE(verifyModule(*M, &dbgs()));
 }
+
+TEST(BitReaderTest, MaterializeFunctionsForBlockAddr) { // PR11677
+  SmallString<1024> Mem;
+
+  LLVMContext Context;
+  std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+      Context, Mem, "@table = constant i8* blockaddress(@func, %bb)\n"
+                    "define void @func() {\n"
+                    "  unreachable\n"
+                    "bb:\n"
+                    "  unreachable\n"
+                    "}\n");
+  EXPECT_FALSE(verifyModule(*M, &dbgs()));
+
+  // Try (and fail) to dematerialize @func.
+  M->getFunction("func")->Dematerialize();
+  EXPECT_FALSE(M->getFunction("func")->empty());
 }
+
+TEST(BitReaderTest, MaterializeFunctionsForBlockAddrInFunctionBefore) {
+  SmallString<1024> Mem;
+
+  LLVMContext Context;
+  std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+      Context, Mem, "define i8* @before() {\n"
+                    "  ret i8* blockaddress(@func, %bb)\n"
+                    "}\n"
+                    "define void @other() {\n"
+                    "  unreachable\n"
+                    "}\n"
+                    "define void @func() {\n"
+                    "  unreachable\n"
+                    "bb:\n"
+                    "  unreachable\n"
+                    "}\n");
+  EXPECT_TRUE(M->getFunction("before")->empty());
+  EXPECT_TRUE(M->getFunction("func")->empty());
+  EXPECT_FALSE(verifyModule(*M, &dbgs()));
+
+  // Materialize @before, pulling in @func.
+  EXPECT_FALSE(M->getFunction("before")->materialize());
+  EXPECT_FALSE(M->getFunction("func")->empty());
+  EXPECT_TRUE(M->getFunction("other")->empty());
+  EXPECT_FALSE(verifyModule(*M, &dbgs()));
+
+  // Try (and fail) to dematerialize @func.
+  M->getFunction("func")->Dematerialize();
+  EXPECT_FALSE(M->getFunction("func")->empty());
+  EXPECT_FALSE(verifyModule(*M, &dbgs()));
+}
+
+TEST(BitReaderTest, MaterializeFunctionsForBlockAddrInFunctionAfter) {
+  SmallString<1024> Mem;
+
+  LLVMContext Context;
+  std::unique_ptr<Module> M = getLazyModuleFromAssembly(
+      Context, Mem, "define void @func() {\n"
+                    "  unreachable\n"
+                    "bb:\n"
+                    "  unreachable\n"
+                    "}\n"
+                    "define void @other() {\n"
+                    "  unreachable\n"
+                    "}\n"
+                    "define i8* @after() {\n"
+                    "  ret i8* blockaddress(@func, %bb)\n"
+                    "}\n");
+  EXPECT_TRUE(M->getFunction("after")->empty());
+  EXPECT_TRUE(M->getFunction("func")->empty());
+  EXPECT_FALSE(verifyModule(*M, &dbgs()));
+
+  // Materialize @after, pulling in @func.
+  EXPECT_FALSE(M->getFunction("after")->materialize());
+  EXPECT_FALSE(M->getFunction("func")->empty());
+  EXPECT_TRUE(M->getFunction("other")->empty());
+  EXPECT_FALSE(verifyModule(*M, &dbgs()));
+
+  // Try (and fail) to dematerialize @func.
+  M->getFunction("func")->Dematerialize();
+  EXPECT_FALSE(M->getFunction("func")->empty());
+  EXPECT_FALSE(verifyModule(*M, &dbgs()));
+}
+
+} // end namespace

diff --git a/unittests/Bitcode/BitstreamReaderTest.cpp b/unittests/Bitcode/BitstreamReaderTest.cpp
new file mode 100644
index 0000000..b11d7fd
--- /dev/null
+++ b/unittests/Bitcode/BitstreamReaderTest.cpp

@@ -0,0 +1,56 @@
+//===- BitstreamReaderTest.cpp - Tests for BitstreamReader ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/BitstreamReader.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(BitstreamReaderTest, AtEndOfStream) {
+  uint8_t Bytes[4] = {
+    0x00, 0x01, 0x02, 0x03
+  };
+  BitstreamReader Reader(std::begin(Bytes), std::end(Bytes));
+  BitstreamCursor Cursor(Reader);
+
+  EXPECT_FALSE(Cursor.AtEndOfStream());
+  (void)Cursor.Read(8);
+  EXPECT_FALSE(Cursor.AtEndOfStream());
+  (void)Cursor.Read(24);
+  EXPECT_TRUE(Cursor.AtEndOfStream());
+
+  Cursor.JumpToBit(0);
+  EXPECT_FALSE(Cursor.AtEndOfStream());
+
+  Cursor.JumpToBit(32);
+  EXPECT_TRUE(Cursor.AtEndOfStream());
+}
+
+TEST(BitstreamReaderTest, AtEndOfStreamJump) {
+  uint8_t Bytes[4] = {
+    0x00, 0x01, 0x02, 0x03
+  };
+  BitstreamReader Reader(std::begin(Bytes), std::end(Bytes));
+  BitstreamCursor Cursor(Reader);
+
+  Cursor.JumpToBit(32);
+  EXPECT_TRUE(Cursor.AtEndOfStream());
+}
+
+TEST(BitstreamReaderTest, AtEndOfStreamEmpty) {
+  uint8_t Dummy = 0xFF;
+  BitstreamReader Reader(&Dummy, &Dummy);
+  BitstreamCursor Cursor(Reader);
+
+  EXPECT_TRUE(Cursor.AtEndOfStream());
+}
+
+} // end anonymous namespace

diff --git a/unittests/Bitcode/CMakeLists.txt b/unittests/Bitcode/CMakeLists.txt
index 743ab18..09cbcdc 100644
--- a/unittests/Bitcode/CMakeLists.txt
+++ b/unittests/Bitcode/CMakeLists.txt

@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  AsmParser
   BitReader
   BitWriter
   Core
@@ -7,4 +8,5 @@
 
 add_llvm_unittest(BitcodeTests
   BitReaderTest.cpp
+  BitstreamReaderTest.cpp
   )

diff --git a/unittests/Bitcode/Makefile b/unittests/Bitcode/Makefile
index fcec879..33b09b9 100644
--- a/unittests/Bitcode/Makefile
+++ b/unittests/Bitcode/Makefile

@@ -9,7 +9,7 @@
 
 LEVEL = ../..
 TESTNAME = Bitcode
-LINK_COMPONENTS := bitreader bitwriter
+LINK_COMPONENTS := AsmParser BitReader BitWriter Core Support
 
 include $(LEVEL)/Makefile.config
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest

diff --git a/unittests/ExecutionEngine/CMakeLists.txt b/unittests/ExecutionEngine/CMakeLists.txt
index 489eaaf..783c9b5 100644
--- a/unittests/ExecutionEngine/CMakeLists.txt
+++ b/unittests/ExecutionEngine/CMakeLists.txt

@@ -2,6 +2,7 @@
   Core
   ExecutionEngine
   Interpreter
+  MC
   Support
   )
 
@@ -9,10 +10,9 @@
   ExecutionEngineTest.cpp
   )
 
-# Include JIT/MCJIT tests only if native arch is a built JIT target.
+# Include MCJIT tests only if native arch is a built JIT target.
 list(FIND LLVM_TARGETS_TO_BUILD "${LLVM_NATIVE_ARCH}" build_idx)
 list(FIND LLVM_TARGETS_WITH_JIT "${LLVM_NATIVE_ARCH}" jit_idx)
 if (NOT build_idx LESS 0 AND NOT jit_idx LESS 0)
-  add_subdirectory(JIT)
   add_subdirectory(MCJIT)
 endif()

diff --git a/unittests/ExecutionEngine/ExecutionEngineTest.cpp b/unittests/ExecutionEngine/ExecutionEngineTest.cpp
index f23745c..19917a4 100644
--- a/unittests/ExecutionEngine/ExecutionEngineTest.cpp
+++ b/unittests/ExecutionEngine/ExecutionEngineTest.cpp

@@ -8,10 +8,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/Interpreter.h"
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
@@ -19,10 +22,14 @@
 namespace {
 
 class ExecutionEngineTest : public testing::Test {
+private:
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+
 protected:
-  ExecutionEngineTest()
-    : M(new Module("<main>", getGlobalContext())), Error(""),
-      Engine(EngineBuilder(M).setErrorStr(&Error).create()) {
+  ExecutionEngineTest() {
+    auto Owner = make_unique<Module>("<main>", getGlobalContext());
+    M = Owner.get();
+    Engine.reset(EngineBuilder(std::move(Owner)).setErrorStr(&Error).create());
   }
 
   virtual void SetUp() {
@@ -35,9 +42,9 @@
                               GlobalValue::ExternalLinkage, nullptr, Name);
   }
 
-  Module *const M;
   std::string Error;
-  const std::unique_ptr<ExecutionEngine> Engine;
+  Module *M;  // Owned by ExecutionEngine.
+  std::unique_ptr<ExecutionEngine> Engine;
 };
 
 TEST_F(ExecutionEngineTest, ForwardGlobalMapping) {
@@ -127,4 +134,35 @@
   EXPECT_EQ(nullptr, Engine->getGlobalValueAtAddress(&Mem1));
 }
 
+TEST_F(ExecutionEngineTest, LookupWithMangledName) {
+  int x;
+  llvm::sys::DynamicLibrary::AddSymbol("x", &x);
+
+  // Demonstrate that getSymbolAddressInProcess accepts a mangled name: this
+  // test registers only "x", yet looking up "_x" must resolve to it.
+  EXPECT_EQ(reinterpret_cast<uint64_t>(&x),
+            RTDyldMemoryManager::getSymbolAddressInProcess("_x"));
+}
+
+TEST_F(ExecutionEngineTest, LookupWithMangledAndDemangledSymbol) {
+  int x;
+  int _x;
+  llvm::sys::DynamicLibrary::AddSymbol("x", &x);
+  llvm::sys::DynamicLibrary::AddSymbol("_x", &_x);
+
+  // The demangled name ("x") must win, even though a symbol that matches the
+  // input ("_x") verbatim is registered as well.
+  EXPECT_EQ(reinterpret_cast<uint64_t>(&x),
+            RTDyldMemoryManager::getSymbolAddressInProcess("_x"));
+}
+
+TEST_F(ExecutionEngineTest, LookupwithDemangledName) {
+  int _x;
+  llvm::sys::DynamicLibrary::AddSymbol("_x", &_x);
+
+  // Only "_x" is registered here, so the lookup falls back to the given name.
+  EXPECT_EQ(reinterpret_cast<uint64_t>(&_x),
+            RTDyldMemoryManager::getSymbolAddressInProcess("_x"));
+}
+
 }

diff --git a/unittests/ExecutionEngine/JIT/CMakeLists.txt b/unittests/ExecutionEngine/JIT/CMakeLists.txt
deleted file mode 100644
index 72c1df7..0000000
--- a/unittests/ExecutionEngine/JIT/CMakeLists.txt
+++ /dev/null

@@ -1,64 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  AsmParser
-  BitReader
-  BitWriter
-  Core
-  ExecutionEngine
-  JIT
-  Support
-  nativecodegen
-  )
-
-# HACK: Declare a couple of source files as optionally compiled to satisfy the
-# missing-file-checker in LLVM's weird CMake build.
-set(LLVM_OPTIONAL_SOURCES
-  IntelJITEventListenerTest.cpp
-  OProfileJITEventListenerTest.cpp
-  )
-
-if( LLVM_USE_INTEL_JITEVENTS )
-  set(ProfileTestSources
-    IntelJITEventListenerTest.cpp
-    )
-  set(LLVM_LINK_COMPONENTS
-    ${LLVM_LINK_COMPONENTS}
-    DebugInfo
-    IntelJITEvents
-    Object
-    ) 
-endif( LLVM_USE_INTEL_JITEVENTS )
-
-if( LLVM_USE_OPROFILE )
-  set(ProfileTestSources
-    ${ProfileTestSources}
-    OProfileJITEventListenerTest.cpp
-    )
-  set(LLVM_LINK_COMPONENTS
-    ${LLVM_LINK_COMPONENTS}
-    OProfileJIT
-    )
-endif( LLVM_USE_OPROFILE )
-
-set(JITTestsSources
-  JITEventListenerTest.cpp
-  JITMemoryManagerTest.cpp
-  JITTest.cpp
-  MultiJITTest.cpp
-  ${ProfileTestSources}
-  )
-
-if(MSVC)
-  list(APPEND JITTestsSources JITTests.def)
-endif()
-
-# The JIT tests need to dlopen things.
-set(LLVM_NO_DEAD_STRIP 1)
-
-add_llvm_unittest(JITTests
-  ${JITTestsSources}
-  )
-
-if(MINGW OR CYGWIN)
-  set_property(TARGET JITTests PROPERTY LINK_FLAGS -Wl,--export-all-symbols)
-endif()
-set_target_properties(JITTests PROPERTIES ENABLE_EXPORTS 1)

diff --git a/unittests/ExecutionEngine/JIT/IntelJITEventListenerTest.cpp b/unittests/ExecutionEngine/JIT/IntelJITEventListenerTest.cpp
deleted file mode 100644
index db90887..0000000
--- a/unittests/ExecutionEngine/JIT/IntelJITEventListenerTest.cpp
+++ /dev/null

@@ -1,113 +0,0 @@
-//===- JITEventListenerTest.cpp - Tests for Intel JITEventListener --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "JITEventListenerTestCommon.h"
-
-using namespace llvm;
-
-// Because we want to keep the implementation details of the Intel API used to
-// communicate with Amplifier out of the public header files, the header below
-// is included from the source tree instead.
-#include "../../../lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h"
-
-#include <map>
-#include <list>
-
-namespace {
-
-// map of function ("method") IDs to source locations
-NativeCodeMap ReportedDebugFuncs;
-
-} // namespace
-
-/// Mock implementaion of Intel JIT API jitprofiling library
-namespace test_jitprofiling {
-
-int NotifyEvent(iJIT_JVM_EVENT EventType, void *EventSpecificData) {
-  switch (EventType) {
-    case iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED: {
-      EXPECT_TRUE(0 != EventSpecificData);
-      iJIT_Method_Load* msg = static_cast<iJIT_Method_Load*>(EventSpecificData);
-
-      ReportedDebugFuncs[msg->method_id];
-
-      for(unsigned int i = 0; i < msg->line_number_size; ++i) {
-        EXPECT_TRUE(0 != msg->line_number_table);
-        std::pair<std::string, unsigned int> loc(
-          std::string(msg->source_file_name),
-          msg->line_number_table[i].LineNumber);
-        ReportedDebugFuncs[msg->method_id].push_back(loc);
-      }
-    }
-    break;
-    case iJVM_EVENT_TYPE_METHOD_UNLOAD_START: {
-      EXPECT_TRUE(0 != EventSpecificData);
-      unsigned int UnloadId
-        = *reinterpret_cast<unsigned int*>(EventSpecificData);
-      EXPECT_TRUE(1 == ReportedDebugFuncs.erase(UnloadId));
-    }
-    default:
-      break;
-  }
-  return 0;
-}
-
-iJIT_IsProfilingActiveFlags IsProfilingActive(void) {
-  // for testing, pretend we have an Intel Parallel Amplifier XE 2011
-  // instance attached
-  return iJIT_SAMPLING_ON;
-}
-
-unsigned int GetNewMethodID(void) {
-  static unsigned int id = 0;
-  return ++id;
-}
-
-} //namespace test_jitprofiling
-
-class IntelJITEventListenerTest
-  : public JITEventListenerTestBase<IntelJITEventsWrapper> {
-public:
-  IntelJITEventListenerTest()
-  : JITEventListenerTestBase<IntelJITEventsWrapper>(
-      new IntelJITEventsWrapper(test_jitprofiling::NotifyEvent, 0,
-        test_jitprofiling::IsProfilingActive, 0, 0,
-        test_jitprofiling::GetNewMethodID))
-  {
-    EXPECT_TRUE(0 != MockWrapper);
-
-    Listener.reset(JITEventListener::createIntelJITEventListener(
-      MockWrapper.release()));
-    EXPECT_TRUE(0 != Listener);
-    EE->RegisterJITEventListener(Listener.get());
-  }
-};
-
-TEST_F(IntelJITEventListenerTest, NoDebugInfo) {
-  TestNoDebugInfo(ReportedDebugFuncs);
-}
-
-TEST_F(IntelJITEventListenerTest, SingleLine) {
-  TestSingleLine(ReportedDebugFuncs);
-}
-
-TEST_F(IntelJITEventListenerTest, MultipleLines) {
-  TestMultipleLines(ReportedDebugFuncs);
-}
-
-// This testcase is disabled because the Intel JIT API does not support a single
-// JITted function with source lines associated with multiple files
-/*
-TEST_F(IntelJITEventListenerTest, MultipleFiles) {
-  TestMultipleFiles(ReportedDebugFuncs);
-}
-*/
-
-testing::Environment* const jit_env =
-  testing::AddGlobalTestEnvironment(new JITEnvironment);

diff --git a/unittests/ExecutionEngine/JIT/JITEventListenerTest.cpp b/unittests/ExecutionEngine/JIT/JITEventListenerTest.cpp
deleted file mode 100644
index 175b9fb..0000000
--- a/unittests/ExecutionEngine/JIT/JITEventListenerTest.cpp
+++ /dev/null

@@ -1,237 +0,0 @@
-//===- JITEventListenerTest.cpp - Unit tests for JITEventListeners --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/CodeGen/MachineCodeInfo.h"
-#include "llvm/ExecutionEngine/JIT.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/TypeBuilder.h"
-#include "llvm/Support/TargetSelect.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-using namespace llvm;
-
-namespace {
-
-struct FunctionEmittedEvent {
-  // Indices are local to the RecordingJITEventListener, since the
-  // JITEventListener interface makes no guarantees about the order of
-  // calls between Listeners.
-  unsigned Index;
-  const Function *F;
-  void *Code;
-  size_t Size;
-  JITEvent_EmittedFunctionDetails Details;
-};
-struct FunctionFreedEvent {
-  unsigned Index;
-  void *Code;
-};
-
-struct RecordingJITEventListener : public JITEventListener {
-  std::vector<FunctionEmittedEvent> EmittedEvents;
-  std::vector<FunctionFreedEvent> FreedEvents;
-
-  unsigned NextIndex;
-
-  RecordingJITEventListener() : NextIndex(0) {}
-
-  virtual void NotifyFunctionEmitted(const Function &F,
-                                     void *Code, size_t Size,
-                                     const EmittedFunctionDetails &Details) {
-    FunctionEmittedEvent Event = {NextIndex++, &F, Code, Size, Details};
-    EmittedEvents.push_back(Event);
-  }
-
-  virtual void NotifyFreeingMachineCode(void *OldPtr) {
-    FunctionFreedEvent Event = {NextIndex++, OldPtr};
-    FreedEvents.push_back(Event);
-  }
-};
-
-class JITEventListenerTest : public testing::Test {
- protected:
-  JITEventListenerTest()
-      : M(new Module("module", getGlobalContext())),
-        EE(EngineBuilder(M)
-           .setEngineKind(EngineKind::JIT)
-           .create()) {
-  }
-
-  Module *M;
-  const std::unique_ptr<ExecutionEngine> EE;
-};
-
-// Tests on SystemZ disabled as we're running the old JIT
-#if !defined(__s390__) && !defined(__aarch64__)
-Function *buildFunction(Module *M) {
-  Function *Result = Function::Create(
-      TypeBuilder<int32_t(int32_t), false>::get(getGlobalContext()),
-      GlobalValue::ExternalLinkage, "id", M);
-  Value *Arg = Result->arg_begin();
-  BasicBlock *BB = BasicBlock::Create(M->getContext(), "entry", Result);
-  ReturnInst::Create(M->getContext(), Arg, BB);
-  return Result;
-}
-
-// Tests that a single JITEventListener follows JIT events accurately.
-TEST_F(JITEventListenerTest, Simple) {
-  RecordingJITEventListener Listener;
-  EE->RegisterJITEventListener(&Listener);
-  Function *F1 = buildFunction(M);
-  Function *F2 = buildFunction(M);
-
-  void *F1_addr = EE->getPointerToFunction(F1);
-  void *F2_addr = EE->getPointerToFunction(F2);
-  EE->getPointerToFunction(F1);  // Should do nothing.
-  EE->freeMachineCodeForFunction(F1);
-  EE->freeMachineCodeForFunction(F2);
-
-  ASSERT_EQ(2U, Listener.EmittedEvents.size());
-  ASSERT_EQ(2U, Listener.FreedEvents.size());
-
-  EXPECT_EQ(0U, Listener.EmittedEvents[0].Index);
-  EXPECT_EQ(F1, Listener.EmittedEvents[0].F);
-  EXPECT_EQ(F1_addr, Listener.EmittedEvents[0].Code);
-  EXPECT_LT(0U, Listener.EmittedEvents[0].Size)
-      << "We don't know how big the function will be, but it had better"
-      << " contain some bytes.";
-
-  EXPECT_EQ(1U, Listener.EmittedEvents[1].Index);
-  EXPECT_EQ(F2, Listener.EmittedEvents[1].F);
-  EXPECT_EQ(F2_addr, Listener.EmittedEvents[1].Code);
-  EXPECT_LT(0U, Listener.EmittedEvents[1].Size)
-      << "We don't know how big the function will be, but it had better"
-      << " contain some bytes.";
-
-  EXPECT_EQ(2U, Listener.FreedEvents[0].Index);
-  EXPECT_EQ(F1_addr, Listener.FreedEvents[0].Code);
-
-  EXPECT_EQ(3U, Listener.FreedEvents[1].Index);
-  EXPECT_EQ(F2_addr, Listener.FreedEvents[1].Code);
-
-  F1->eraseFromParent();
-  F2->eraseFromParent();
-}
-
-// Tests that a single JITEventListener follows JIT events accurately.
-TEST_F(JITEventListenerTest, MultipleListenersDontInterfere) {
-  RecordingJITEventListener Listener1;
-  RecordingJITEventListener Listener2;
-  RecordingJITEventListener Listener3;
-  Function *F1 = buildFunction(M);
-  Function *F2 = buildFunction(M);
-
-  EE->RegisterJITEventListener(&Listener1);
-  EE->RegisterJITEventListener(&Listener2);
-  void *F1_addr = EE->getPointerToFunction(F1);
-  EE->RegisterJITEventListener(&Listener3);
-  EE->UnregisterJITEventListener(&Listener1);
-  void *F2_addr = EE->getPointerToFunction(F2);
-  EE->UnregisterJITEventListener(&Listener2);
-  EE->UnregisterJITEventListener(&Listener3);
-  EE->freeMachineCodeForFunction(F1);
-  EE->RegisterJITEventListener(&Listener2);
-  EE->RegisterJITEventListener(&Listener3);
-  EE->RegisterJITEventListener(&Listener1);
-  EE->freeMachineCodeForFunction(F2);
-  EE->UnregisterJITEventListener(&Listener1);
-  EE->UnregisterJITEventListener(&Listener2);
-  EE->UnregisterJITEventListener(&Listener3);
-
-  // Listener 1.
-  ASSERT_EQ(1U, Listener1.EmittedEvents.size());
-  ASSERT_EQ(1U, Listener1.FreedEvents.size());
-
-  EXPECT_EQ(0U, Listener1.EmittedEvents[0].Index);
-  EXPECT_EQ(F1, Listener1.EmittedEvents[0].F);
-  EXPECT_EQ(F1_addr, Listener1.EmittedEvents[0].Code);
-  EXPECT_LT(0U, Listener1.EmittedEvents[0].Size)
-      << "We don't know how big the function will be, but it had better"
-      << " contain some bytes.";
-
-  EXPECT_EQ(1U, Listener1.FreedEvents[0].Index);
-  EXPECT_EQ(F2_addr, Listener1.FreedEvents[0].Code);
-
-  // Listener 2.
-  ASSERT_EQ(2U, Listener2.EmittedEvents.size());
-  ASSERT_EQ(1U, Listener2.FreedEvents.size());
-
-  EXPECT_EQ(0U, Listener2.EmittedEvents[0].Index);
-  EXPECT_EQ(F1, Listener2.EmittedEvents[0].F);
-  EXPECT_EQ(F1_addr, Listener2.EmittedEvents[0].Code);
-  EXPECT_LT(0U, Listener2.EmittedEvents[0].Size)
-      << "We don't know how big the function will be, but it had better"
-      << " contain some bytes.";
-
-  EXPECT_EQ(1U, Listener2.EmittedEvents[1].Index);
-  EXPECT_EQ(F2, Listener2.EmittedEvents[1].F);
-  EXPECT_EQ(F2_addr, Listener2.EmittedEvents[1].Code);
-  EXPECT_LT(0U, Listener2.EmittedEvents[1].Size)
-      << "We don't know how big the function will be, but it had better"
-      << " contain some bytes.";
-
-  EXPECT_EQ(2U, Listener2.FreedEvents[0].Index);
-  EXPECT_EQ(F2_addr, Listener2.FreedEvents[0].Code);
-
-  // Listener 3.
-  ASSERT_EQ(1U, Listener3.EmittedEvents.size());
-  ASSERT_EQ(1U, Listener3.FreedEvents.size());
-
-  EXPECT_EQ(0U, Listener3.EmittedEvents[0].Index);
-  EXPECT_EQ(F2, Listener3.EmittedEvents[0].F);
-  EXPECT_EQ(F2_addr, Listener3.EmittedEvents[0].Code);
-  EXPECT_LT(0U, Listener3.EmittedEvents[0].Size)
-      << "We don't know how big the function will be, but it had better"
-      << " contain some bytes.";
-
-  EXPECT_EQ(1U, Listener3.FreedEvents[0].Index);
-  EXPECT_EQ(F2_addr, Listener3.FreedEvents[0].Code);
-
-  F1->eraseFromParent();
-  F2->eraseFromParent();
-}
-
-TEST_F(JITEventListenerTest, MatchesMachineCodeInfo) {
-  RecordingJITEventListener Listener;
-  MachineCodeInfo MCI;
-  Function *F = buildFunction(M);
-
-  EE->RegisterJITEventListener(&Listener);
-  EE->runJITOnFunction(F, &MCI);
-  void *F_addr = EE->getPointerToFunction(F);
-  EE->freeMachineCodeForFunction(F);
-
-  ASSERT_EQ(1U, Listener.EmittedEvents.size());
-  ASSERT_EQ(1U, Listener.FreedEvents.size());
-
-  EXPECT_EQ(0U, Listener.EmittedEvents[0].Index);
-  EXPECT_EQ(F, Listener.EmittedEvents[0].F);
-  EXPECT_EQ(F_addr, Listener.EmittedEvents[0].Code);
-  EXPECT_EQ(MCI.address(), Listener.EmittedEvents[0].Code);
-  EXPECT_EQ(MCI.size(), Listener.EmittedEvents[0].Size);
-
-  EXPECT_EQ(1U, Listener.FreedEvents[0].Index);
-  EXPECT_EQ(F_addr, Listener.FreedEvents[0].Code);
-}
-#endif
-
-class JITEnvironment : public testing::Environment {
-  virtual void SetUp() {
-    // Required to create a JIT.
-    InitializeNativeTarget();
-  }
-};
-testing::Environment* const jit_env =
-  testing::AddGlobalTestEnvironment(new JITEnvironment);
-
-}  // anonymous namespace

diff --git a/unittests/ExecutionEngine/JIT/JITEventListenerTestCommon.h b/unittests/ExecutionEngine/JIT/JITEventListenerTestCommon.h
deleted file mode 100644
index 61220f5..0000000
--- a/unittests/ExecutionEngine/JIT/JITEventListenerTestCommon.h
+++ /dev/null

@@ -1,207 +0,0 @@
-//===- JITEventListenerTestCommon.h - Helper for JITEventListener tests ------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===-------------------------------------------------------------------------------===//
-
-#ifndef JIT_EVENT_LISTENER_TEST_COMMON_H
-#define JIT_EVENT_LISTENER_TEST_COMMON_H
-
-#include "llvm/CodeGen/MachineCodeInfo.h"
-#include "llvm/Config/config.h"
-#include "llvm/ExecutionEngine/JIT.h"
-#include "llvm/ExecutionEngine/JITEventListener.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/TypeBuilder.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/TargetSelect.h"
-#include "gtest/gtest.h"
-#include <string>
-#include <utility>
-#include <vector>
-
-typedef std::vector<std::pair<std::string, unsigned int> > SourceLocations;
-typedef std::map<uint64_t, SourceLocations> NativeCodeMap;
-
-class JITEnvironment : public testing::Environment {
-  virtual void SetUp() {
-    // Required to create a JIT.
-    llvm::InitializeNativeTarget();
-  }
-};
-
-inline unsigned int getLine() {
-  return 12;
-}
-
-inline unsigned int getCol() {
-  return 0;
-}
-
-inline const char* getFilename() {
-  return "mock_source_file.cpp";
-}
-
-// Test fixture shared by tests for listener implementations
-template<typename WrapperT>
-class JITEventListenerTestBase : public testing::Test {
-protected:
-  std::unique_ptr<WrapperT> MockWrapper;
-  std::unique_ptr<llvm::JITEventListener> Listener;
-
-public:
-  llvm::Module* M;
-  llvm::MDNode* Scope;
-  llvm::ExecutionEngine* EE;
-  llvm::DIBuilder* DebugBuilder;
-  llvm::IRBuilder<> Builder;
-
-  JITEventListenerTestBase(WrapperT* w)
-  : MockWrapper(w)
-  , M(new llvm::Module("module", llvm::getGlobalContext()))
-  , EE(llvm::EngineBuilder(M)
-    .setEngineKind(llvm::EngineKind::JIT)
-    .setOptLevel(llvm::CodeGenOpt::None)
-    .create())
-  , DebugBuilder(new llvm::DIBuilder(*M))
-  , Builder(llvm::getGlobalContext())
-  {
-    DebugBuilder->createCompileUnit(llvm::dwarf::DW_LANG_C_plus_plus,
-                                    "JIT",
-                                    "JIT",
-                                    "JIT",
-                                    true,
-                                    "",
-                                    1);
-
-    Scope = DebugBuilder->createFile(getFilename(), ".");
-  }
-
-  llvm::Function *buildFunction(const SourceLocations& DebugLocations) {
-    using namespace llvm;
-
-    LLVMContext& GlobalContext = getGlobalContext();
-
-    SourceLocations::const_iterator CurrentDebugLocation
-      = DebugLocations.begin();
-
-    if (CurrentDebugLocation != DebugLocations.end()) {
-      DebugLoc DebugLocation = DebugLoc::get(getLine(), getCol(),
-          DebugBuilder->createFile(CurrentDebugLocation->first, "."));
-      Builder.SetCurrentDebugLocation(DebugLocation);
-      CurrentDebugLocation++;
-    }
-
-    Function *Result = Function::Create(
-        TypeBuilder<int32_t(int32_t), false>::get(GlobalContext),
-        GlobalValue::ExternalLinkage, "id", M);
-    Value *Arg = Result->arg_begin();
-    BasicBlock *BB = BasicBlock::Create(M->getContext(), "entry", Result);
-    Builder.SetInsertPoint(BB);
-    Value* one = ConstantInt::get(GlobalContext, APInt(32, 1));
-    for(; CurrentDebugLocation != DebugLocations.end();
-        ++CurrentDebugLocation) {
-      Arg = Builder.CreateMul(Arg, Builder.CreateAdd(Arg, one));
-      Builder.SetCurrentDebugLocation(
-        DebugLoc::get(CurrentDebugLocation->second, 0,
-                      DebugBuilder->createFile(CurrentDebugLocation->first, ".")));
-    }
-    Builder.CreateRet(Arg);
-    return Result;
-  }
-
-  void TestNoDebugInfo(NativeCodeMap& ReportedDebugFuncs) {
-    SourceLocations DebugLocations;
-    llvm::Function* f = buildFunction(DebugLocations);
-    EXPECT_TRUE(0 != f);
-
-    //Cause JITting and callbacks to our listener
-    EXPECT_TRUE(0 != EE->getPointerToFunction(f));
-    EXPECT_TRUE(1 == ReportedDebugFuncs.size());
-
-    EE->freeMachineCodeForFunction(f);
-    EXPECT_TRUE(ReportedDebugFuncs.size() == 0);
-  }
-
-  void TestSingleLine(NativeCodeMap& ReportedDebugFuncs) {
-    SourceLocations DebugLocations;
-    DebugLocations.push_back(std::make_pair(std::string(getFilename()),
-                                            getLine()));
-    llvm::Function* f = buildFunction(DebugLocations);
-    EXPECT_TRUE(0 != f);
-
-    EXPECT_TRUE(0 != EE->getPointerToFunction(f));
-    EXPECT_TRUE(1 == ReportedDebugFuncs.size());
-    EXPECT_STREQ(ReportedDebugFuncs.begin()->second.begin()->first.c_str(),
-                 getFilename());
-    EXPECT_EQ(ReportedDebugFuncs.begin()->second.begin()->second, getLine());
-
-    EE->freeMachineCodeForFunction(f);
-    EXPECT_TRUE(ReportedDebugFuncs.size() == 0);
-  }
-
-  void TestMultipleLines(NativeCodeMap& ReportedDebugFuncs) {
-    using namespace std;
-
-    SourceLocations DebugLocations;
-    unsigned int c = 5;
-    for(unsigned int i = 0; i < c; ++i) {
-      DebugLocations.push_back(make_pair(string(getFilename()), getLine() + i));
-    }
-
-    llvm::Function* f = buildFunction(DebugLocations);
-    EXPECT_TRUE(0 != f);
-
-    EXPECT_TRUE(0 != EE->getPointerToFunction(f));
-    EXPECT_TRUE(1 == ReportedDebugFuncs.size());
-    SourceLocations& FunctionInfo = ReportedDebugFuncs.begin()->second;
-    EXPECT_EQ(c, FunctionInfo.size());
-
-    int VerifyCount = 0;
-    for(SourceLocations::iterator i = FunctionInfo.begin();
-        i != FunctionInfo.end();
-        ++i) {
-      EXPECT_STREQ(i->first.c_str(), getFilename());
-      EXPECT_EQ(i->second, getLine() + VerifyCount);
-      VerifyCount++;
-    }
-
-    EE->freeMachineCodeForFunction(f);
-    EXPECT_TRUE(ReportedDebugFuncs.size() == 0);
-  }
-
-  void TestMultipleFiles(NativeCodeMap& ReportedDebugFuncs) {
-
-    std::string secondFilename("another_file.cpp");
-
-    SourceLocations DebugLocations;
-    DebugLocations.push_back(std::make_pair(std::string(getFilename()),
-                                            getLine()));
-    DebugLocations.push_back(std::make_pair(secondFilename, getLine()));
-    llvm::Function* f = buildFunction(DebugLocations);
-    EXPECT_TRUE(0 != f);
-
-    EXPECT_TRUE(0 != EE->getPointerToFunction(f));
-    EXPECT_TRUE(1 == ReportedDebugFuncs.size());
-    SourceLocations& FunctionInfo = ReportedDebugFuncs.begin()->second;
-    EXPECT_TRUE(2 == FunctionInfo.size());
-
-    EXPECT_STREQ(FunctionInfo.at(0).first.c_str(), getFilename());
-    EXPECT_STREQ(FunctionInfo.at(1).first.c_str(), secondFilename.c_str());
-
-    EXPECT_EQ(FunctionInfo.at(0).second, getLine());
-    EXPECT_EQ(FunctionInfo.at(1).second, getLine());
-
-    EE->freeMachineCodeForFunction(f);
-    EXPECT_TRUE(ReportedDebugFuncs.size() == 0);
-  }
-};
-
-#endif //JIT_EVENT_LISTENER_TEST_COMMON_H

diff --git a/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp b/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
deleted file mode 100644
index 296838d..0000000
--- a/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
+++ /dev/null

@@ -1,302 +0,0 @@
-//===- JITMemoryManagerTest.cpp - Unit tests for the JIT memory manager ---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/LLVMContext.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-
-namespace {
-
-Function *makeFakeFunction() {
-  std::vector<Type*> params;
-  FunctionType *FTy =
-      FunctionType::get(Type::getVoidTy(getGlobalContext()), params, false);
-  return Function::Create(FTy, GlobalValue::ExternalLinkage);
-}
-
-// Allocate three simple functions that fit in the initial slab.  This exercises
-// the code in the case that we don't have to allocate more memory to store the
-// function bodies.
-TEST(JITMemoryManagerTest, NoAllocations) {
-  std::unique_ptr<JITMemoryManager> MemMgr(
-      JITMemoryManager::CreateDefaultMemManager());
-  uintptr_t size;
-  std::string Error;
-
-  // Allocate the functions.
-  std::unique_ptr<Function> F1(makeFakeFunction());
-  size = 1024;
-  uint8_t *FunctionBody1 = MemMgr->startFunctionBody(F1.get(), size);
-  memset(FunctionBody1, 0xFF, 1024);
-  MemMgr->endFunctionBody(F1.get(), FunctionBody1, FunctionBody1 + 1024);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-
-  std::unique_ptr<Function> F2(makeFakeFunction());
-  size = 1024;
-  uint8_t *FunctionBody2 = MemMgr->startFunctionBody(F2.get(), size);
-  memset(FunctionBody2, 0xFF, 1024);
-  MemMgr->endFunctionBody(F2.get(), FunctionBody2, FunctionBody2 + 1024);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-
-  std::unique_ptr<Function> F3(makeFakeFunction());
-  size = 1024;
-  uint8_t *FunctionBody3 = MemMgr->startFunctionBody(F3.get(), size);
-  memset(FunctionBody3, 0xFF, 1024);
-  MemMgr->endFunctionBody(F3.get(), FunctionBody3, FunctionBody3 + 1024);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-
-  // Deallocate them out of order, in case that matters.
-  MemMgr->deallocateFunctionBody(FunctionBody2);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-  MemMgr->deallocateFunctionBody(FunctionBody1);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-  MemMgr->deallocateFunctionBody(FunctionBody3);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-}
-
-// Make three large functions that take up most of the space in the slab.  Then
-// try allocating three smaller functions that don't require additional slabs.
-TEST(JITMemoryManagerTest, TestCodeAllocation) {
-  std::unique_ptr<JITMemoryManager> MemMgr(
-      JITMemoryManager::CreateDefaultMemManager());
-  uintptr_t size;
-  std::string Error;
-
-  // Big functions are a little less than the largest block size.
-  const uintptr_t smallFuncSize = 1024;
-  const uintptr_t bigFuncSize = (MemMgr->GetDefaultCodeSlabSize() -
-                                 smallFuncSize * 2);
-
-  // Allocate big functions
-  std::unique_ptr<Function> F1(makeFakeFunction());
-  size = bigFuncSize;
-  uint8_t *FunctionBody1 = MemMgr->startFunctionBody(F1.get(), size);
-  ASSERT_LE(bigFuncSize, size);
-  memset(FunctionBody1, 0xFF, bigFuncSize);
-  MemMgr->endFunctionBody(F1.get(), FunctionBody1, FunctionBody1 + bigFuncSize);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-
-  std::unique_ptr<Function> F2(makeFakeFunction());
-  size = bigFuncSize;
-  uint8_t *FunctionBody2 = MemMgr->startFunctionBody(F2.get(), size);
-  ASSERT_LE(bigFuncSize, size);
-  memset(FunctionBody2, 0xFF, bigFuncSize);
-  MemMgr->endFunctionBody(F2.get(), FunctionBody2, FunctionBody2 + bigFuncSize);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-
-  std::unique_ptr<Function> F3(makeFakeFunction());
-  size = bigFuncSize;
-  uint8_t *FunctionBody3 = MemMgr->startFunctionBody(F3.get(), size);
-  ASSERT_LE(bigFuncSize, size);
-  memset(FunctionBody3, 0xFF, bigFuncSize);
-  MemMgr->endFunctionBody(F3.get(), FunctionBody3, FunctionBody3 + bigFuncSize);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-
-  // Check that each large function took it's own slab.
-  EXPECT_EQ(3U, MemMgr->GetNumCodeSlabs());
-
-  // Allocate small functions
-  std::unique_ptr<Function> F4(makeFakeFunction());
-  size = smallFuncSize;
-  uint8_t *FunctionBody4 = MemMgr->startFunctionBody(F4.get(), size);
-  ASSERT_LE(smallFuncSize, size);
-  memset(FunctionBody4, 0xFF, smallFuncSize);
-  MemMgr->endFunctionBody(F4.get(), FunctionBody4,
-                          FunctionBody4 + smallFuncSize);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-
-  std::unique_ptr<Function> F5(makeFakeFunction());
-  size = smallFuncSize;
-  uint8_t *FunctionBody5 = MemMgr->startFunctionBody(F5.get(), size);
-  ASSERT_LE(smallFuncSize, size);
-  memset(FunctionBody5, 0xFF, smallFuncSize);
-  MemMgr->endFunctionBody(F5.get(), FunctionBody5,
-                          FunctionBody5 + smallFuncSize);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-
-  std::unique_ptr<Function> F6(makeFakeFunction());
-  size = smallFuncSize;
-  uint8_t *FunctionBody6 = MemMgr->startFunctionBody(F6.get(), size);
-  ASSERT_LE(smallFuncSize, size);
-  memset(FunctionBody6, 0xFF, smallFuncSize);
-  MemMgr->endFunctionBody(F6.get(), FunctionBody6,
-                          FunctionBody6 + smallFuncSize);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-
-  // Check that the small functions didn't allocate any new slabs.
-  EXPECT_EQ(3U, MemMgr->GetNumCodeSlabs());
-
-  // Deallocate them out of order, in case that matters.
-  MemMgr->deallocateFunctionBody(FunctionBody2);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-  MemMgr->deallocateFunctionBody(FunctionBody1);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-  MemMgr->deallocateFunctionBody(FunctionBody4);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-  MemMgr->deallocateFunctionBody(FunctionBody3);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-  MemMgr->deallocateFunctionBody(FunctionBody5);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-  MemMgr->deallocateFunctionBody(FunctionBody6);
-  EXPECT_TRUE(MemMgr->CheckInvariants(Error)) << Error;
-}
-
-// Allocate five global ints of varying widths and alignment, and check their
-// alignment and overlap.
-TEST(JITMemoryManagerTest, TestSmallGlobalInts) {
-  std::unique_ptr<JITMemoryManager> MemMgr(
-      JITMemoryManager::CreateDefaultMemManager());
-  uint8_t  *a = (uint8_t *)MemMgr->allocateGlobal(8,  0);
-  uint16_t *b = (uint16_t*)MemMgr->allocateGlobal(16, 2);
-  uint32_t *c = (uint32_t*)MemMgr->allocateGlobal(32, 4);
-  uint64_t *d = (uint64_t*)MemMgr->allocateGlobal(64, 8);
-
-  // Check the alignment.
-  EXPECT_EQ(0U, ((uintptr_t)b) & 0x1);
-  EXPECT_EQ(0U, ((uintptr_t)c) & 0x3);
-  EXPECT_EQ(0U, ((uintptr_t)d) & 0x7);
-
-  // Initialize them each one at a time and make sure they don't overlap.
-  *a = 0xff;
-  *b = 0U;
-  *c = 0U;
-  *d = 0U;
-  EXPECT_EQ(0xffU, *a);
-  EXPECT_EQ(0U, *b);
-  EXPECT_EQ(0U, *c);
-  EXPECT_EQ(0U, *d);
-  *a = 0U;
-  *b = 0xffffU;
-  EXPECT_EQ(0U, *a);
-  EXPECT_EQ(0xffffU, *b);
-  EXPECT_EQ(0U, *c);
-  EXPECT_EQ(0U, *d);
-  *b = 0U;
-  *c = 0xffffffffU;
-  EXPECT_EQ(0U, *a);
-  EXPECT_EQ(0U, *b);
-  EXPECT_EQ(0xffffffffU, *c);
-  EXPECT_EQ(0U, *d);
-  *c = 0U;
-  *d = 0xffffffffffffffffULL;
-  EXPECT_EQ(0U, *a);
-  EXPECT_EQ(0U, *b);
-  EXPECT_EQ(0U, *c);
-  EXPECT_EQ(0xffffffffffffffffULL, *d);
-
-  // Make sure we didn't allocate any extra slabs for this tiny amount of data.
-  EXPECT_EQ(1U, MemMgr->GetNumDataSlabs());
-}
-
-// Allocate a small global, a big global, and a third global, and make sure we
-// only use two slabs for that.
-TEST(JITMemoryManagerTest, TestLargeGlobalArray) {
-  std::unique_ptr<JITMemoryManager> MemMgr(
-      JITMemoryManager::CreateDefaultMemManager());
-  size_t Size = 4 * MemMgr->GetDefaultDataSlabSize();
-  uint64_t *a = (uint64_t*)MemMgr->allocateGlobal(64, 8);
-  uint8_t *g = MemMgr->allocateGlobal(Size, 8);
-  uint64_t *b = (uint64_t*)MemMgr->allocateGlobal(64, 8);
-
-  // Check the alignment.
-  EXPECT_EQ(0U, ((uintptr_t)a) & 0x7);
-  EXPECT_EQ(0U, ((uintptr_t)g) & 0x7);
-  EXPECT_EQ(0U, ((uintptr_t)b) & 0x7);
-
-  // Initialize them to make sure we don't segfault and make sure they don't
-  // overlap.
-  memset(a, 0x1, 8);
-  memset(g, 0x2, Size);
-  memset(b, 0x3, 8);
-  EXPECT_EQ(0x0101010101010101ULL, *a);
-  // Just check the edges.
-  EXPECT_EQ(0x02U, g[0]);
-  EXPECT_EQ(0x02U, g[Size - 1]);
-  EXPECT_EQ(0x0303030303030303ULL, *b);
-
-  // Check the number of slabs.
-  EXPECT_EQ(2U, MemMgr->GetNumDataSlabs());
-}
-
-// Allocate lots of medium globals so that we can test moving the bump allocator
-// to a new slab.
-TEST(JITMemoryManagerTest, TestManyGlobals) {
-  std::unique_ptr<JITMemoryManager> MemMgr(
-      JITMemoryManager::CreateDefaultMemManager());
-  size_t SlabSize = MemMgr->GetDefaultDataSlabSize();
-  size_t Size = 128;
-  int Iters = (SlabSize / Size) + 1;
-
-  // We should start with no slabs.
-  EXPECT_EQ(0U, MemMgr->GetNumDataSlabs());
-
-  // After allocating a bunch of globals, we should have two.
-  for (int I = 0; I < Iters; ++I)
-    MemMgr->allocateGlobal(Size, 8);
-  EXPECT_EQ(2U, MemMgr->GetNumDataSlabs());
-
-  // And after much more, we should have three.
-  for (int I = 0; I < Iters; ++I)
-    MemMgr->allocateGlobal(Size, 8);
-  EXPECT_EQ(3U, MemMgr->GetNumDataSlabs());
-}
-
-// Allocate lots of function stubs so that we can test moving the stub bump
-// allocator to a new slab.
-TEST(JITMemoryManagerTest, TestManyStubs) {
-  std::unique_ptr<JITMemoryManager> MemMgr(
-      JITMemoryManager::CreateDefaultMemManager());
-  size_t SlabSize = MemMgr->GetDefaultStubSlabSize();
-  size_t Size = 128;
-  int Iters = (SlabSize / Size) + 1;
-
-  // We should start with no slabs.
-  EXPECT_EQ(0U, MemMgr->GetNumDataSlabs());
-
-  // After allocating a bunch of stubs, we should have two.
-  for (int I = 0; I < Iters; ++I)
-    MemMgr->allocateStub(nullptr, Size, 8);
-  EXPECT_EQ(2U, MemMgr->GetNumStubSlabs());
-
-  // And after much more, we should have three.
-  for (int I = 0; I < Iters; ++I)
-    MemMgr->allocateStub(nullptr, Size, 8);
-  EXPECT_EQ(3U, MemMgr->GetNumStubSlabs());
-}
-
-// Check section allocation and alignment
-TEST(JITMemoryManagerTest, AllocateSection) {
-  std::unique_ptr<JITMemoryManager> MemMgr(
-      JITMemoryManager::CreateDefaultMemManager());
-  uint8_t *code1 = MemMgr->allocateCodeSection(256, 0, 1, StringRef());
-  uint8_t *data1 = MemMgr->allocateDataSection(256, 16, 2, StringRef(), true);
-  uint8_t *code2 = MemMgr->allocateCodeSection(257, 32, 3, StringRef());
-  uint8_t *data2 = MemMgr->allocateDataSection(256, 64, 4, StringRef(), false);
-  uint8_t *code3 = MemMgr->allocateCodeSection(258, 64, 5, StringRef());
-
-  EXPECT_NE((uint8_t*)nullptr, code1);
-  EXPECT_NE((uint8_t*)nullptr, code2);
-  EXPECT_NE((uint8_t*)nullptr, data1);
-  EXPECT_NE((uint8_t*)nullptr, data2);
-
-  // Check alignment
-  EXPECT_EQ((uint64_t)code1 & 0xf, 0u);
-  EXPECT_EQ((uint64_t)code2 & 0x1f, 0u);
-  EXPECT_EQ((uint64_t)code3 & 0x3f, 0u);
-  EXPECT_EQ((uint64_t)data1 & 0xf, 0u);
-  EXPECT_EQ((uint64_t)data2 & 0x3f, 0u);
-}
-
-}

diff --git a/unittests/ExecutionEngine/JIT/JITTest.cpp b/unittests/ExecutionEngine/JIT/JITTest.cpp
deleted file mode 100644
index 817d207..0000000
--- a/unittests/ExecutionEngine/JIT/JITTest.cpp
+++ /dev/null

@@ -1,728 +0,0 @@
-//===- JITTest.cpp - Unit tests for the JIT -------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/JIT.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/AsmParser/Parser.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/TypeBuilder.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetSelect.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-using namespace llvm;
-
-// This variable is intentionally defined differently in the statically-compiled
-// program from the IR input to the JIT to assert that the JIT doesn't use its
-// definition.  Note that this variable must be defined even on platforms where
-// JIT tests are disabled as it is referenced from the .def file.
-extern "C" int32_t JITTest_AvailableExternallyGlobal;
-int32_t JITTest_AvailableExternallyGlobal LLVM_ATTRIBUTE_USED = 42;
-
-// This function is intentionally defined differently in the statically-compiled
-// program from the IR input to the JIT to assert that the JIT doesn't use its
-// definition.  Note that this function must be defined even on platforms where
-// JIT tests are disabled as it is referenced from the .def file.
-extern "C" int32_t JITTest_AvailableExternallyFunction() LLVM_ATTRIBUTE_USED;
-extern "C" int32_t JITTest_AvailableExternallyFunction() {
-  return 42;
-}
-
-namespace {
-
-// Tests on ARM, PowerPC and SystemZ disabled as we're running the old jit
-#if !defined(__arm__) && !defined(__powerpc__) && !defined(__s390__) \
-                      && !defined(__aarch64__)
-
-Function *makeReturnGlobal(std::string Name, GlobalVariable *G, Module *M) {
-  std::vector<Type*> params;
-  FunctionType *FTy = FunctionType::get(G->getType()->getElementType(),
-                                              params, false);
-  Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage, Name, M);
-  BasicBlock *Entry = BasicBlock::Create(M->getContext(), "entry", F);
-  IRBuilder<> builder(Entry);
-  Value *Load = builder.CreateLoad(G);
-  Type *GTy = G->getType()->getElementType();
-  Value *Add = builder.CreateAdd(Load, ConstantInt::get(GTy, 1LL));
-  builder.CreateStore(Add, G);
-  builder.CreateRet(Add);
-  return F;
-}
-
-std::string DumpFunction(const Function *F) {
-  std::string Result;
-  raw_string_ostream(Result) << "" << *F;
-  return Result;
-}
-
-class RecordingJITMemoryManager : public JITMemoryManager {
-  const std::unique_ptr<JITMemoryManager> Base;
-
-public:
-  RecordingJITMemoryManager()
-    : Base(JITMemoryManager::CreateDefaultMemManager()) {
-    stubsAllocated = 0;
-  }
-  virtual void *getPointerToNamedFunction(const std::string &Name,
-                                          bool AbortOnFailure = true) {
-    return Base->getPointerToNamedFunction(Name, AbortOnFailure);
-  }
-
-  virtual void setMemoryWritable() { Base->setMemoryWritable(); }
-  virtual void setMemoryExecutable() { Base->setMemoryExecutable(); }
-  virtual void setPoisonMemory(bool poison) { Base->setPoisonMemory(poison); }
-  virtual void AllocateGOT() { Base->AllocateGOT(); }
-  virtual uint8_t *getGOTBase() const { return Base->getGOTBase(); }
-  struct StartFunctionBodyCall {
-    StartFunctionBodyCall(uint8_t *Result, const Function *F,
-                          uintptr_t ActualSize, uintptr_t ActualSizeResult)
-      : Result(Result), F(F), F_dump(DumpFunction(F)),
-        ActualSize(ActualSize), ActualSizeResult(ActualSizeResult) {}
-    uint8_t *Result;
-    const Function *F;
-    std::string F_dump;
-    uintptr_t ActualSize;
-    uintptr_t ActualSizeResult;
-  };
-  std::vector<StartFunctionBodyCall> startFunctionBodyCalls;
-  virtual uint8_t *startFunctionBody(const Function *F,
-                                     uintptr_t &ActualSize) {
-    uintptr_t InitialActualSize = ActualSize;
-    uint8_t *Result = Base->startFunctionBody(F, ActualSize);
-    startFunctionBodyCalls.push_back(
-      StartFunctionBodyCall(Result, F, InitialActualSize, ActualSize));
-    return Result;
-  }
-  int stubsAllocated;
-  uint8_t *allocateStub(const GlobalValue *F, unsigned StubSize,
-                        unsigned Alignment) override {
-    stubsAllocated++;
-    return Base->allocateStub(F, StubSize, Alignment);
-  }
-  struct EndFunctionBodyCall {
-    EndFunctionBodyCall(const Function *F, uint8_t *FunctionStart,
-                        uint8_t *FunctionEnd)
-      : F(F), F_dump(DumpFunction(F)),
-        FunctionStart(FunctionStart), FunctionEnd(FunctionEnd) {}
-    const Function *F;
-    std::string F_dump;
-    uint8_t *FunctionStart;
-    uint8_t *FunctionEnd;
-  };
-  std::vector<EndFunctionBodyCall> endFunctionBodyCalls;
-  virtual void endFunctionBody(const Function *F, uint8_t *FunctionStart,
-                               uint8_t *FunctionEnd) {
-    endFunctionBodyCalls.push_back(
-      EndFunctionBodyCall(F, FunctionStart, FunctionEnd));
-    Base->endFunctionBody(F, FunctionStart, FunctionEnd);
-  }
-  virtual uint8_t *allocateDataSection(
-    uintptr_t Size, unsigned Alignment, unsigned SectionID,
-    StringRef SectionName, bool IsReadOnly) {
-    return Base->allocateDataSection(
-      Size, Alignment, SectionID, SectionName, IsReadOnly);
-  }
-  virtual uint8_t *allocateCodeSection(
-    uintptr_t Size, unsigned Alignment, unsigned SectionID,
-    StringRef SectionName) {
-    return Base->allocateCodeSection(
-      Size, Alignment, SectionID, SectionName);
-  }
-  virtual bool finalizeMemory(std::string *ErrMsg) { return false; }
-  virtual uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) {
-    return Base->allocateSpace(Size, Alignment);
-  }
-  virtual uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment) {
-    return Base->allocateGlobal(Size, Alignment);
-  }
-  struct DeallocateFunctionBodyCall {
-    DeallocateFunctionBodyCall(const void *Body) : Body(Body) {}
-    const void *Body;
-  };
-  std::vector<DeallocateFunctionBodyCall> deallocateFunctionBodyCalls;
-  virtual void deallocateFunctionBody(void *Body) {
-    deallocateFunctionBodyCalls.push_back(DeallocateFunctionBodyCall(Body));
-    Base->deallocateFunctionBody(Body);
-  }
-};
-
-bool LoadAssemblyInto(Module *M, const char *assembly) {
-  SMDiagnostic Error;
-  bool success =
-    nullptr != ParseAssemblyString(assembly, M, Error, M->getContext());
-  std::string errMsg;
-  raw_string_ostream os(errMsg);
-  Error.print("", os);
-  EXPECT_TRUE(success) << os.str();
-  return success;
-}
-
-class JITTest : public testing::Test {
- protected:
-  virtual RecordingJITMemoryManager *createMemoryManager() {
-    return new RecordingJITMemoryManager;
-  }
-
-  virtual void SetUp() {
-    M = new Module("<main>", Context);
-    RJMM = createMemoryManager();
-    RJMM->setPoisonMemory(true);
-    std::string Error;
-    TargetOptions Options;
-    TheJIT.reset(EngineBuilder(M).setEngineKind(EngineKind::JIT)
-                 .setJITMemoryManager(RJMM)
-                 .setErrorStr(&Error)
-                 .setTargetOptions(Options).create());
-    ASSERT_TRUE(TheJIT.get() != nullptr) << Error;
-  }
-
-  void LoadAssembly(const char *assembly) {
-    LoadAssemblyInto(M, assembly);
-  }
-
-  LLVMContext Context;
-  Module *M;  // Owned by ExecutionEngine.
-  RecordingJITMemoryManager *RJMM;
-  std::unique_ptr<ExecutionEngine> TheJIT;
-};
-
-// Regression test for a bug.  The JIT used to allocate globals inside the same
-// memory block used for the function, and when the function code was freed,
-// the global was left in the same place.  This test allocates a function
-// that uses and global, deallocates it, and then makes sure that the global
-// stays alive after that.
-TEST(JIT, GlobalInFunction) {
-  LLVMContext context;
-  Module *M = new Module("<main>", context);
-
-  JITMemoryManager *MemMgr = JITMemoryManager::CreateDefaultMemManager();
-  // Tell the memory manager to poison freed memory so that accessing freed
-  // memory is more easily tested.
-  MemMgr->setPoisonMemory(true);
-  std::string Error;
-  std::unique_ptr<ExecutionEngine> JIT(EngineBuilder(M)
-                                           .setEngineKind(EngineKind::JIT)
-                                           .setErrorStr(&Error)
-                                           .setJITMemoryManager(MemMgr)
-                                           // The next line enables the fix:
-                                           .setAllocateGVsWithCode(false)
-                                           .create());
-  ASSERT_EQ(Error, "");
-
-  // Create a global variable.
-  Type *GTy = Type::getInt32Ty(context);
-  GlobalVariable *G = new GlobalVariable(
-      *M,
-      GTy,
-      false,  // Not constant.
-      GlobalValue::InternalLinkage,
-      Constant::getNullValue(GTy),
-      "myglobal");
-
-  // Make a function that points to a global.
-  Function *F1 = makeReturnGlobal("F1", G, M);
-
-  // Get the pointer to the native code to force it to JIT the function and
-  // allocate space for the global.
-  void (*F1Ptr)() =
-      reinterpret_cast<void(*)()>((intptr_t)JIT->getPointerToFunction(F1));
-
-  // Since F1 was codegen'd, a pointer to G should be available.
-  int32_t *GPtr = (int32_t*)JIT->getPointerToGlobalIfAvailable(G);
-  ASSERT_NE((int32_t*)nullptr, GPtr);
-  EXPECT_EQ(0, *GPtr);
-
-  // F1() should increment G.
-  F1Ptr();
-  EXPECT_EQ(1, *GPtr);
-
-  // Make a second function identical to the first, referring to the same
-  // global.
-  Function *F2 = makeReturnGlobal("F2", G, M);
-  void (*F2Ptr)() =
-      reinterpret_cast<void(*)()>((intptr_t)JIT->getPointerToFunction(F2));
-
-  // F2() should increment G.
-  F2Ptr();
-  EXPECT_EQ(2, *GPtr);
-
-  // Deallocate F1.
-  JIT->freeMachineCodeForFunction(F1);
-
-  // F2() should *still* increment G.
-  F2Ptr();
-  EXPECT_EQ(3, *GPtr);
-}
-
-int PlusOne(int arg) {
-  return arg + 1;
-}
-
-TEST_F(JITTest, FarCallToKnownFunction) {
-  // x86-64 can only make direct calls to functions within 32 bits of
-  // the current PC.  To call anything farther away, we have to load
-  // the address into a register and call through the register.  The
-  // current JIT does this by allocating a stub for any far call.
-  // There was a bug in which the JIT tried to emit a direct call when
-  // the target was already in the JIT's global mappings and lazy
-  // compilation was disabled.
-
-  Function *KnownFunction = Function::Create(
-      TypeBuilder<int(int), false>::get(Context),
-      GlobalValue::ExternalLinkage, "known", M);
-  TheJIT->addGlobalMapping(KnownFunction, (void*)(intptr_t)PlusOne);
-
-  // int test() { return known(7); }
-  Function *TestFunction = Function::Create(
-      TypeBuilder<int(), false>::get(Context),
-      GlobalValue::ExternalLinkage, "test", M);
-  BasicBlock *Entry = BasicBlock::Create(Context, "entry", TestFunction);
-  IRBuilder<> Builder(Entry);
-  Value *result = Builder.CreateCall(
-      KnownFunction,
-      ConstantInt::get(TypeBuilder<int, false>::get(Context), 7));
-  Builder.CreateRet(result);
-
-  TheJIT->DisableLazyCompilation(true);
-  int (*TestFunctionPtr)() = reinterpret_cast<int(*)()>(
-      (intptr_t)TheJIT->getPointerToFunction(TestFunction));
-  // This used to crash in trying to call PlusOne().
-  EXPECT_EQ(8, TestFunctionPtr());
-}
-
-// Test a function C which calls A and B which call each other.
-TEST_F(JITTest, NonLazyCompilationStillNeedsStubs) {
-  TheJIT->DisableLazyCompilation(true);
-
-  FunctionType *Func1Ty =
-      cast<FunctionType>(TypeBuilder<void(void), false>::get(Context));
-  std::vector<Type*> arg_types;
-  arg_types.push_back(Type::getInt1Ty(Context));
-  FunctionType *FuncTy = FunctionType::get(
-      Type::getVoidTy(Context), arg_types, false);
-  Function *Func1 = Function::Create(Func1Ty, Function::ExternalLinkage,
-                                     "func1", M);
-  Function *Func2 = Function::Create(FuncTy, Function::InternalLinkage,
-                                     "func2", M);
-  Function *Func3 = Function::Create(FuncTy, Function::InternalLinkage,
-                                     "func3", M);
-  BasicBlock *Block1 = BasicBlock::Create(Context, "block1", Func1);
-  BasicBlock *Block2 = BasicBlock::Create(Context, "block2", Func2);
-  BasicBlock *True2 = BasicBlock::Create(Context, "cond_true", Func2);
-  BasicBlock *False2 = BasicBlock::Create(Context, "cond_false", Func2);
-  BasicBlock *Block3 = BasicBlock::Create(Context, "block3", Func3);
-  BasicBlock *True3 = BasicBlock::Create(Context, "cond_true", Func3);
-  BasicBlock *False3 = BasicBlock::Create(Context, "cond_false", Func3);
-
-  // Make Func1 call Func2(0) and Func3(0).
-  IRBuilder<> Builder(Block1);
-  Builder.CreateCall(Func2, ConstantInt::getTrue(Context));
-  Builder.CreateCall(Func3, ConstantInt::getTrue(Context));
-  Builder.CreateRetVoid();
-
-  // void Func2(bool b) { if (b) { Func3(false); return; } return; }
-  Builder.SetInsertPoint(Block2);
-  Builder.CreateCondBr(Func2->arg_begin(), True2, False2);
-  Builder.SetInsertPoint(True2);
-  Builder.CreateCall(Func3, ConstantInt::getFalse(Context));
-  Builder.CreateRetVoid();
-  Builder.SetInsertPoint(False2);
-  Builder.CreateRetVoid();
-
-  // void Func3(bool b) { if (b) { Func2(false); return; } return; }
-  Builder.SetInsertPoint(Block3);
-  Builder.CreateCondBr(Func3->arg_begin(), True3, False3);
-  Builder.SetInsertPoint(True3);
-  Builder.CreateCall(Func2, ConstantInt::getFalse(Context));
-  Builder.CreateRetVoid();
-  Builder.SetInsertPoint(False3);
-  Builder.CreateRetVoid();
-
-  // Compile the function to native code
-  void (*F1Ptr)() =
-     reinterpret_cast<void(*)()>((intptr_t)TheJIT->getPointerToFunction(Func1));
-
-  F1Ptr();
-}
-
-// Regression test for PR5162.  This used to trigger an AssertingVH inside the
-// JIT's Function to stub mapping.
-TEST_F(JITTest, NonLazyLeaksNoStubs) {
-  TheJIT->DisableLazyCompilation(true);
-
-  // Create two functions with a single basic block each.
-  FunctionType *FuncTy =
-      cast<FunctionType>(TypeBuilder<int(), false>::get(Context));
-  Function *Func1 = Function::Create(FuncTy, Function::ExternalLinkage,
-                                     "func1", M);
-  Function *Func2 = Function::Create(FuncTy, Function::InternalLinkage,
-                                     "func2", M);
-  BasicBlock *Block1 = BasicBlock::Create(Context, "block1", Func1);
-  BasicBlock *Block2 = BasicBlock::Create(Context, "block2", Func2);
-
-  // The first function calls the second and returns the result
-  IRBuilder<> Builder(Block1);
-  Value *Result = Builder.CreateCall(Func2);
-  Builder.CreateRet(Result);
-
-  // The second function just returns a constant
-  Builder.SetInsertPoint(Block2);
-  Builder.CreateRet(ConstantInt::get(TypeBuilder<int, false>::get(Context),42));
-
-  // Compile the function to native code
-  (void)TheJIT->getPointerToFunction(Func1);
-
-  // Free the JIT state for the functions
-  TheJIT->freeMachineCodeForFunction(Func1);
-  TheJIT->freeMachineCodeForFunction(Func2);
-
-  // Delete the first function (and show that is has no users)
-  EXPECT_EQ(Func1->getNumUses(), 0u);
-  Func1->eraseFromParent();
-
-  // Delete the second function (and show that it has no users - it had one,
-  // func1 but that's gone now)
-  EXPECT_EQ(Func2->getNumUses(), 0u);
-  Func2->eraseFromParent();
-}
-
-TEST_F(JITTest, ModuleDeletion) {
-  TheJIT->DisableLazyCompilation(false);
-  LoadAssembly("define void @main() { "
-               "  call i32 @computeVal() "
-               "  ret void "
-               "} "
-               " "
-               "define internal i32 @computeVal()  { "
-               "  ret i32 0 "
-               "} ");
-  Function *func = M->getFunction("main");
-  TheJIT->getPointerToFunction(func);
-  TheJIT->removeModule(M);
-  delete M;
-
-  SmallPtrSet<const void*, 2> FunctionsDeallocated;
-  for (unsigned i = 0, e = RJMM->deallocateFunctionBodyCalls.size();
-       i != e; ++i) {
-    FunctionsDeallocated.insert(RJMM->deallocateFunctionBodyCalls[i].Body);
-  }
-  for (unsigned i = 0, e = RJMM->startFunctionBodyCalls.size(); i != e; ++i) {
-    EXPECT_TRUE(FunctionsDeallocated.count(
-                  RJMM->startFunctionBodyCalls[i].Result))
-      << "Function leaked: \n" << RJMM->startFunctionBodyCalls[i].F_dump;
-  }
-  EXPECT_EQ(RJMM->startFunctionBodyCalls.size(),
-            RJMM->deallocateFunctionBodyCalls.size());
-}
-
-// ARM, MIPS and PPC still emit stubs for calls since the target may be
-// too far away to call directly.  This #if can probably be removed when
-// http://llvm.org/PR5201 is fixed.
-#if !defined(__arm__) && !defined(__mips__) && \
-    !defined(__powerpc__) && !defined(__ppc__) && !defined(__aarch64__)
-typedef int (*FooPtr) ();
-
-TEST_F(JITTest, NoStubs) {
-  LoadAssembly("define void @bar() {"
-	       "entry: "
-	       "ret void"
-	       "}"
-	       " "
-	       "define i32 @foo() {"
-	       "entry:"
-	       "call void @bar()"
-	       "ret i32 undef"
-	       "}"
-	       " "
-	       "define i32 @main() {"
-	       "entry:"
-	       "%0 = call i32 @foo()"
-	       "call void @bar()"
-	       "ret i32 undef"
-	       "}");
-  Function *foo = M->getFunction("foo");
-  uintptr_t tmp = (uintptr_t)(TheJIT->getPointerToFunction(foo));
-  FooPtr ptr = (FooPtr)(tmp);
-
-  (ptr)();
-
-  // We should now allocate no more stubs, we have the code to foo
-  // and the existing stub for bar.
-  int stubsBefore = RJMM->stubsAllocated;
-  Function *func = M->getFunction("main");
-  TheJIT->getPointerToFunction(func);
-
-  Function *bar = M->getFunction("bar");
-  TheJIT->getPointerToFunction(bar);
-
-  ASSERT_EQ(stubsBefore, RJMM->stubsAllocated);
-}
-#endif  // !ARM && !PPC
-
-TEST_F(JITTest, FunctionPointersOutliveTheirCreator) {
-  TheJIT->DisableLazyCompilation(true);
-  LoadAssembly("define i8()* @get_foo_addr() { "
-               "  ret i8()* @foo "
-               "} "
-               " "
-               "define i8 @foo() { "
-               "  ret i8 42 "
-               "} ");
-  Function *F_get_foo_addr = M->getFunction("get_foo_addr");
-
-  typedef char(*fooT)();
-  fooT (*get_foo_addr)() = reinterpret_cast<fooT(*)()>(
-      (intptr_t)TheJIT->getPointerToFunction(F_get_foo_addr));
-  fooT foo_addr = get_foo_addr();
-
-  // Now free get_foo_addr.  This should not free the machine code for foo or
-  // any call stub returned as foo's canonical address.
-  TheJIT->freeMachineCodeForFunction(F_get_foo_addr);
-
-  // Check by calling the reported address of foo.
-  EXPECT_EQ(42, foo_addr());
-
-  // The reported address should also be the same as the result of a subsequent
-  // getPointerToFunction(foo).
-#if 0
-  // Fails until PR5126 is fixed:
-  Function *F_foo = M->getFunction("foo");
-  fooT foo = reinterpret_cast<fooT>(
-      (intptr_t)TheJIT->getPointerToFunction(F_foo));
-  EXPECT_EQ((intptr_t)foo, (intptr_t)foo_addr);
-#endif
-}
-
-// ARM does not have an implementation of replaceMachineCodeForFunction(),
-// so recompileAndRelinkFunction doesn't work.
-#if !defined(__arm__) && !defined(__aarch64__)
-TEST_F(JITTest, FunctionIsRecompiledAndRelinked) {
-  Function *F = Function::Create(TypeBuilder<int(void), false>::get(Context),
-                                 GlobalValue::ExternalLinkage, "test", M);
-  BasicBlock *Entry = BasicBlock::Create(Context, "entry", F);
-  IRBuilder<> Builder(Entry);
-  Value *Val = ConstantInt::get(TypeBuilder<int, false>::get(Context), 1);
-  Builder.CreateRet(Val);
-
-  TheJIT->DisableLazyCompilation(true);
-  // Compile the function once, and make sure it works.
-  int (*OrigFPtr)() = reinterpret_cast<int(*)()>(
-    (intptr_t)TheJIT->recompileAndRelinkFunction(F));
-  EXPECT_EQ(1, OrigFPtr());
-
-  // Now change the function to return a different value.
-  Entry->eraseFromParent();
-  BasicBlock *NewEntry = BasicBlock::Create(Context, "new_entry", F);
-  Builder.SetInsertPoint(NewEntry);
-  Val = ConstantInt::get(TypeBuilder<int, false>::get(Context), 2);
-  Builder.CreateRet(Val);
-  // Recompile it, which should produce a new function pointer _and_ update the
-  // old one.
-  int (*NewFPtr)() = reinterpret_cast<int(*)()>(
-    (intptr_t)TheJIT->recompileAndRelinkFunction(F));
-
-  EXPECT_EQ(2, NewFPtr())
-    << "The new pointer should call the new version of the function";
-  EXPECT_EQ(2, OrigFPtr())
-    << "The old pointer's target should now jump to the new version";
-}
-#endif  // !defined(__arm__)
-
-TEST_F(JITTest, AvailableExternallyGlobalIsntEmitted) {
-  TheJIT->DisableLazyCompilation(true);
-  LoadAssembly("@JITTest_AvailableExternallyGlobal = "
-               "  available_externally global i32 7 "
-               " "
-               "define i32 @loader() { "
-               "  %result = load i32* @JITTest_AvailableExternallyGlobal "
-               "  ret i32 %result "
-               "} ");
-  Function *loaderIR = M->getFunction("loader");
-
-  int32_t (*loader)() = reinterpret_cast<int32_t(*)()>(
-    (intptr_t)TheJIT->getPointerToFunction(loaderIR));
-  EXPECT_EQ(42, loader()) << "func should return 42 from the external global,"
-                          << " not 7 from the IR version.";
-}
-
-TEST_F(JITTest, AvailableExternallyFunctionIsntCompiled) {
-  TheJIT->DisableLazyCompilation(true);
-  LoadAssembly("define available_externally i32 "
-               "    @JITTest_AvailableExternallyFunction() { "
-               "  ret i32 7 "
-               "} "
-               " "
-               "define i32 @func() { "
-               "  %result = tail call i32 "
-               "    @JITTest_AvailableExternallyFunction() "
-               "  ret i32 %result "
-               "} ");
-  Function *funcIR = M->getFunction("func");
-
-  int32_t (*func)() = reinterpret_cast<int32_t(*)()>(
-    (intptr_t)TheJIT->getPointerToFunction(funcIR));
-  EXPECT_EQ(42, func()) << "func should return 42 from the static version,"
-                        << " not 7 from the IR version.";
-}
-
-TEST_F(JITTest, EscapedLazyStubStillCallable) {
-  TheJIT->DisableLazyCompilation(false);
-  LoadAssembly("define internal i32 @stubbed() { "
-               "  ret i32 42 "
-               "} "
-               " "
-               "define i32()* @get_stub() { "
-               "  ret i32()* @stubbed "
-               "} ");
-  typedef int32_t(*StubTy)();
-
-  // Call get_stub() to get the address of @stubbed without actually JITting it.
-  Function *get_stubIR = M->getFunction("get_stub");
-  StubTy (*get_stub)() = reinterpret_cast<StubTy(*)()>(
-    (intptr_t)TheJIT->getPointerToFunction(get_stubIR));
-  StubTy stubbed = get_stub();
-  // Now get_stubIR is the only reference to stubbed's stub.
-  get_stubIR->eraseFromParent();
-  // Now there are no references inside the JIT, but we've got a pointer outside
-  // it.  The stub should be callable and return the right value.
-  EXPECT_EQ(42, stubbed());
-}
-
-// Converts the LLVM assembly to bitcode and returns it in a std::string.  An
-// empty string indicates an error.
-std::string AssembleToBitcode(LLVMContext &Context, const char *Assembly) {
-  Module TempModule("TempModule", Context);
-  if (!LoadAssemblyInto(&TempModule, Assembly)) {
-    return "";
-  }
-
-  std::string Result;
-  raw_string_ostream OS(Result);
-  WriteBitcodeToFile(&TempModule, OS);
-  OS.flush();
-  return Result;
-}
-
-// Returns a newly-created ExecutionEngine that reads the bitcode in 'Bitcode'
-// lazily.  The associated Module (owned by the ExecutionEngine) is returned in
-// M.  Both will be NULL on an error.  Bitcode must live at least as long as the
-// ExecutionEngine.
-ExecutionEngine *getJITFromBitcode(
-  LLVMContext &Context, const std::string &Bitcode, Module *&M) {
-  // c_str() is null-terminated like MemoryBuffer::getMemBuffer requires.
-  MemoryBuffer *BitcodeBuffer =
-    MemoryBuffer::getMemBuffer(Bitcode, "Bitcode for test");
-  ErrorOr<Module*> ModuleOrErr = getLazyBitcodeModule(BitcodeBuffer, Context);
-  if (std::error_code EC = ModuleOrErr.getError()) {
-    ADD_FAILURE() << EC.message();
-    delete BitcodeBuffer;
-    return nullptr;
-  }
-  M = ModuleOrErr.get();
-  std::string errMsg;
-  ExecutionEngine *TheJIT = EngineBuilder(M)
-    .setEngineKind(EngineKind::JIT)
-    .setErrorStr(&errMsg)
-    .create();
-  if (TheJIT == nullptr) {
-    ADD_FAILURE() << errMsg;
-    delete M;
-    M = nullptr;
-    return nullptr;
-  }
-  return TheJIT;
-}
-
-TEST(LazyLoadedJITTest, MaterializableAvailableExternallyFunctionIsntCompiled) {
-  LLVMContext Context;
-  const std::string Bitcode =
-    AssembleToBitcode(Context,
-                      "define available_externally i32 "
-                      "    @JITTest_AvailableExternallyFunction() { "
-                      "  ret i32 7 "
-                      "} "
-                      " "
-                      "define i32 @func() { "
-                      "  %result = tail call i32 "
-                      "    @JITTest_AvailableExternallyFunction() "
-                      "  ret i32 %result "
-                      "} ");
-  ASSERT_FALSE(Bitcode.empty()) << "Assembling failed";
-  Module *M;
-  std::unique_ptr<ExecutionEngine> TheJIT(
-      getJITFromBitcode(Context, Bitcode, M));
-  ASSERT_TRUE(TheJIT.get()) << "Failed to create JIT.";
-  TheJIT->DisableLazyCompilation(true);
-
-  Function *funcIR = M->getFunction("func");
-  Function *availableFunctionIR =
-    M->getFunction("JITTest_AvailableExternallyFunction");
-
-  // Double-check that the available_externally function is still unmaterialized
-  // when getPointerToFunction needs to find out if it's available_externally.
-  EXPECT_TRUE(availableFunctionIR->isMaterializable());
-
-  int32_t (*func)() = reinterpret_cast<int32_t(*)()>(
-    (intptr_t)TheJIT->getPointerToFunction(funcIR));
-  EXPECT_EQ(42, func()) << "func should return 42 from the static version,"
-                        << " not 7 from the IR version.";
-}
-
-TEST(LazyLoadedJITTest, EagerCompiledRecursionThroughGhost) {
-  LLVMContext Context;
-  const std::string Bitcode =
-    AssembleToBitcode(Context,
-                      "define i32 @recur1(i32 %a) { "
-                      "  %zero = icmp eq i32 %a, 0 "
-                      "  br i1 %zero, label %done, label %notdone "
-                      "done: "
-                      "  ret i32 3 "
-                      "notdone: "
-                      "  %am1 = sub i32 %a, 1 "
-                      "  %result = call i32 @recur2(i32 %am1) "
-                      "  ret i32 %result "
-                      "} "
-                      " "
-                      "define i32 @recur2(i32 %b) { "
-                      "  %result = call i32 @recur1(i32 %b) "
-                      "  ret i32 %result "
-                      "} ");
-  ASSERT_FALSE(Bitcode.empty()) << "Assembling failed";
-  Module *M;
-  std::unique_ptr<ExecutionEngine> TheJIT(
-      getJITFromBitcode(Context, Bitcode, M));
-  ASSERT_TRUE(TheJIT.get()) << "Failed to create JIT.";
-  TheJIT->DisableLazyCompilation(true);
-
-  Function *recur1IR = M->getFunction("recur1");
-  Function *recur2IR = M->getFunction("recur2");
-  EXPECT_TRUE(recur1IR->isMaterializable());
-  EXPECT_TRUE(recur2IR->isMaterializable());
-
-  int32_t (*recur1)(int32_t) = reinterpret_cast<int32_t(*)(int32_t)>(
-    (intptr_t)TheJIT->getPointerToFunction(recur1IR));
-  EXPECT_EQ(3, recur1(4));
-}
-#endif // !defined(__arm__) && !defined(__powerpc__) && !defined(__s390__)
-
-}

diff --git a/unittests/ExecutionEngine/JIT/JITTests.def b/unittests/ExecutionEngine/JIT/JITTests.def
deleted file mode 100644
index 17c91e8..0000000
--- a/unittests/ExecutionEngine/JIT/JITTests.def
+++ /dev/null

@@ -1,4 +0,0 @@
-EXPORTS
-getPointerToNamedFunction
-JITTest_AvailableExternallyFunction
-JITTest_AvailableExternallyGlobal

diff --git a/unittests/ExecutionEngine/JIT/Makefile b/unittests/ExecutionEngine/JIT/Makefile
deleted file mode 100644
index d86c03b..0000000
--- a/unittests/ExecutionEngine/JIT/Makefile
+++ /dev/null

@@ -1,52 +0,0 @@
-##===- unittests/ExecutionEngine/JIT/Makefile --------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-TESTNAME = JIT
-LINK_COMPONENTS := asmparser bitreader bitwriter jit native
-
-# The JIT tests need to dlopen things.
-NO_DEAD_STRIP := 1
-
-include $(LEVEL)/Makefile.config
-
-SOURCES := JITEventListenerTest.cpp JITMemoryManagerTest.cpp JITTest.cpp MultiJITTest.cpp
-
-
-ifeq ($(USE_INTEL_JITEVENTS), 1)
-  # Build the Intel JIT Events interface tests
-  SOURCES += IntelJITEventListenerTest.cpp
-
-  # Add the Intel JIT Events include directory
-  CPPFLAGS += -I$(INTEL_JITEVENTS_INCDIR)
-
-  # Link against the LLVM Intel JIT Evens interface library
-  LINK_COMPONENTS += debuginfo inteljitevents object
-endif
-
-ifeq ($(USE_OPROFILE), 1)
-  # Build the OProfile JIT interface tests
-  SOURCES += OProfileJITEventListenerTest.cpp
-
-  # Link against the LLVM oprofile interface library
-  LINK_COMPONENTS += oprofilejit
-endif
-
-EXPORTED_SYMBOL_FILE = $(PROJ_OBJ_DIR)/JITTests.exports
-
-include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
-
-# Permit these tests to use the JIT's symbolic lookup.
-LD.Flags += $(RDYNAMIC)
-
-# Symbol exports are necessary (at least for now) when building with LTO.
-$(LLVMUnitTestExe): $(NativeExportsFile)
-$(PROJ_OBJ_DIR)/JITTests.exports: $(PROJ_SRC_DIR)/JITTests.def $(PROJ_OBJ_DIR)/.dir
-	tail -n +2 $< > $@
-

diff --git a/unittests/ExecutionEngine/JIT/MultiJITTest.cpp b/unittests/ExecutionEngine/JIT/MultiJITTest.cpp
deleted file mode 100644
index f530e0d..0000000
--- a/unittests/ExecutionEngine/JIT/MultiJITTest.cpp
+++ /dev/null

@@ -1,190 +0,0 @@
-//===- MultiJITTest.cpp - Unit tests for instantiating multiple JITs ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/JIT.h"
-#include "llvm/AsmParser/Parser.h"
-#include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/SourceMgr.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-using namespace llvm;
-
-namespace {
-
-// ARM, PowerPC and SystemZ tests disabled pending fix for PR10783.
-#if !defined(__arm__) && !defined(__powerpc__) && !defined(__s390__) \
-                      && !defined(__aarch64__)
-
-bool LoadAssemblyInto(Module *M, const char *assembly) {
-  SMDiagnostic Error;
-  bool success =
-    nullptr != ParseAssemblyString(assembly, M, Error, M->getContext());
-  std::string errMsg;
-  raw_string_ostream os(errMsg);
-  Error.print("", os);
-  EXPECT_TRUE(success) << os.str();
-  return success;
-}
-
-void createModule1(LLVMContext &Context1, Module *&M1, Function *&FooF1) {
-  M1 = new Module("test1", Context1);
-  LoadAssemblyInto(M1,
-                   "define i32 @add1(i32 %ArgX1) { "
-                   "entry: "
-                   "  %addresult = add i32 1, %ArgX1 "
-                   "  ret i32 %addresult "
-                   "} "
-                   " "
-                   "define i32 @foo1() { "
-                   "entry: "
-                   "  %add1 = call i32 @add1(i32 10) "
-                   "  ret i32 %add1 "
-                   "} ");
-  FooF1 = M1->getFunction("foo1");
-}
-
-void createModule2(LLVMContext &Context2, Module *&M2, Function *&FooF2) {
-  M2 = new Module("test2", Context2);
-  LoadAssemblyInto(M2,
-                   "define i32 @add2(i32 %ArgX2) { "
-                   "entry: "
-                   "  %addresult = add i32 2, %ArgX2 "
-                   "  ret i32 %addresult "
-                   "} "
-                   " "
-                   "define i32 @foo2() { "
-                   "entry: "
-                   "  %add2 = call i32 @add2(i32 10) "
-                   "  ret i32 %add2 "
-                   "} ");
-  FooF2 = M2->getFunction("foo2");
-}
-
-TEST(MultiJitTest, EagerMode) {
-  LLVMContext Context1;
-  Module *M1 = nullptr;
-  Function *FooF1 = nullptr;
-  createModule1(Context1, M1, FooF1);
-
-  LLVMContext Context2;
-  Module *M2 = nullptr;
-  Function *FooF2 = nullptr;
-  createModule2(Context2, M2, FooF2);
-
-  // Now we create the JIT in eager mode
-  std::unique_ptr<ExecutionEngine> EE1(EngineBuilder(M1).create());
-  EE1->DisableLazyCompilation(true);
-  std::unique_ptr<ExecutionEngine> EE2(EngineBuilder(M2).create());
-  EE2->DisableLazyCompilation(true);
-
-  // Call the `foo' function with no arguments:
-  std::vector<GenericValue> noargs;
-  GenericValue gv1 = EE1->runFunction(FooF1, noargs);
-  GenericValue gv2 = EE2->runFunction(FooF2, noargs);
-
-  // Import result of execution:
-  EXPECT_EQ(gv1.IntVal, 11);
-  EXPECT_EQ(gv2.IntVal, 12);
-
-  EE1->freeMachineCodeForFunction(FooF1);
-  EE2->freeMachineCodeForFunction(FooF2);
-}
-
-TEST(MultiJitTest, LazyMode) {
-  LLVMContext Context1;
-  Module *M1 = nullptr;
-  Function *FooF1 = nullptr;
-  createModule1(Context1, M1, FooF1);
-
-  LLVMContext Context2;
-  Module *M2 = nullptr;
-  Function *FooF2 = nullptr;
-  createModule2(Context2, M2, FooF2);
-
-  // Now we create the JIT in lazy mode
-  std::unique_ptr<ExecutionEngine> EE1(EngineBuilder(M1).create());
-  EE1->DisableLazyCompilation(false);
-  std::unique_ptr<ExecutionEngine> EE2(EngineBuilder(M2).create());
-  EE2->DisableLazyCompilation(false);
-
-  // Call the `foo' function with no arguments:
-  std::vector<GenericValue> noargs;
-  GenericValue gv1 = EE1->runFunction(FooF1, noargs);
-  GenericValue gv2 = EE2->runFunction(FooF2, noargs);
-
-  // Import result of execution:
-  EXPECT_EQ(gv1.IntVal, 11);
-  EXPECT_EQ(gv2.IntVal, 12);
-
-  EE1->freeMachineCodeForFunction(FooF1);
-  EE2->freeMachineCodeForFunction(FooF2);
-}
-
-extern "C" {
-  extern void *getPointerToNamedFunction(const char *Name);
-}
-
-TEST(MultiJitTest, JitPool) {
-  LLVMContext Context1;
-  Module *M1 = nullptr;
-  Function *FooF1 = nullptr;
-  createModule1(Context1, M1, FooF1);
-
-  LLVMContext Context2;
-  Module *M2 = nullptr;
-  Function *FooF2 = nullptr;
-  createModule2(Context2, M2, FooF2);
-
-  // Now we create two JITs
-  std::unique_ptr<ExecutionEngine> EE1(EngineBuilder(M1).create());
-  std::unique_ptr<ExecutionEngine> EE2(EngineBuilder(M2).create());
-
-  Function *F1 = EE1->FindFunctionNamed("foo1");
-  void *foo1 = EE1->getPointerToFunction(F1);
-
-  Function *F2 = EE2->FindFunctionNamed("foo2");
-  void *foo2 = EE2->getPointerToFunction(F2);
-
-  // Function in M1
-  EXPECT_EQ(getPointerToNamedFunction("foo1"), foo1);
-
-  // Function in M2
-  EXPECT_EQ(getPointerToNamedFunction("foo2"), foo2);
-
-  // Symbol search
-  intptr_t
-    sa = (intptr_t)getPointerToNamedFunction("getPointerToNamedFunction");
-  EXPECT_TRUE(sa != 0);
-  intptr_t fa = (intptr_t)&getPointerToNamedFunction;
-  EXPECT_TRUE(fa != 0);
-#ifdef __i386__
-  // getPointerToNamedFunction might be indirect jump on Win32 --enable-shared.
-  // FF 25 <disp32>: jmp *(pointer to IAT)
-  if (sa != fa && memcmp((char *)fa, "\xFF\x25", 2) == 0) {
-    fa = *(intptr_t *)(fa + 2); // Address to IAT
-    EXPECT_TRUE(fa != 0);
-    fa = *(intptr_t *)fa;       // Bound value of IAT
-  }
-#elif defined(__x86_64__)
-  // getPointerToNamedFunction might be indirect jump
-  // on Win32 x64 --enable-shared.
-  // FF 25 <pcrel32>: jmp *(RIP + pointer to IAT)
-  if (sa != fa && memcmp((char *)fa, "\xFF\x25", 2) == 0) {
-    fa += *(int32_t *)(fa + 2) + 6;     // Address to IAT(RIP)
-    fa = *(intptr_t *)fa;               // Bound value of IAT
-  }
-#endif
-  EXPECT_TRUE(sa == fa);
-}
-#endif  // !defined(__arm__) && !defined(__powerpc__) && !defined(__s390__)
-
-}  // anonymous namespace

diff --git a/unittests/ExecutionEngine/JIT/OProfileJITEventListenerTest.cpp b/unittests/ExecutionEngine/JIT/OProfileJITEventListenerTest.cpp
deleted file mode 100644
index 7057fca..0000000
--- a/unittests/ExecutionEngine/JIT/OProfileJITEventListenerTest.cpp
+++ /dev/null

@@ -1,165 +0,0 @@
-//===- OProfileJITEventListenerTest.cpp - Unit tests for OProfileJITEventsListener --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===--------------------------------------------------------------------------------------===//
-
-#include "llvm/ExecutionEngine/OProfileWrapper.h"
-#include "JITEventListenerTestCommon.h"
-#include "llvm/ExecutionEngine/JITEventListener.h"
-#include <list>
-#include <map>
-
-using namespace llvm;
-
-namespace {
-
-struct OprofileNativeFunction {
-  const char* Name;
-  uint64_t Addr;
-  const void* CodePtr;
-  unsigned int CodeSize;
-
-  OprofileNativeFunction(const char* name,
-                         uint64_t addr,
-                         const void* code,
-                         unsigned int size)
-  : Name(name)
-  , Addr(addr)
-  , CodePtr(code)
-  , CodeSize(size) {
-  }
-};
-
-typedef std::list<OprofileNativeFunction> NativeFunctionList;
-typedef std::list<debug_line_info> NativeDebugList;
-NativeFunctionList NativeFunctions;
-
-NativeCodeMap ReportedDebugFuncs;
-
-} // namespace
-
-/// Mock implementaion of opagent library
-namespace test_opagent {
-
-op_agent_t globalAgent = reinterpret_cast<op_agent_t>(42);
-
-op_agent_t open_agent()
-{
-  // return non-null op_agent_t
-  return globalAgent;
-}
-
-int close_agent(op_agent_t agent)
-{
-  EXPECT_EQ(globalAgent, agent);
-  return 0;
-}
-
-int write_native_code(op_agent_t agent,
-                      const char* name,
-                      uint64_t addr,
-                      void const* code,
-                      unsigned int size)
-{
-  EXPECT_EQ(globalAgent, agent);
-  OprofileNativeFunction func(name, addr, code, size);
-  NativeFunctions.push_back(func);
-
-  // Verify no other registration has take place for the same address
-  EXPECT_TRUE(ReportedDebugFuncs.find(addr) == ReportedDebugFuncs.end());
-
-  ReportedDebugFuncs[addr];
-  return 0;
-}
-
-int write_debug_line_info(op_agent_t agent,
-                          void const* code,
-                          size_t num_entries,
-                          struct debug_line_info const* info)
-{
-  EXPECT_EQ(globalAgent, agent);
-
-  //verify code has been loaded first
-  uint64_t addr = reinterpret_cast<uint64_t>(code);
-  NativeCodeMap::iterator i = ReportedDebugFuncs.find(addr);
-  EXPECT_TRUE(i != ReportedDebugFuncs.end());
-
-  NativeDebugList NativeInfo(info, info + num_entries);
-
-  SourceLocations locs;
-  for(NativeDebugList::iterator i = NativeInfo.begin();
-      i != NativeInfo.end();
-      ++i) {
-    locs.push_back(std::make_pair(std::string(i->filename), i->lineno));
-  }
-  ReportedDebugFuncs[addr] = locs;
-
-  return 0;
-}
-
-int unload_native_code(op_agent_t agent, uint64_t addr) {
-  EXPECT_EQ(globalAgent, agent);
-
-  //verify that something for the given JIT addr has been loaded first
-  NativeCodeMap::iterator i = ReportedDebugFuncs.find(addr);
-  EXPECT_TRUE(i != ReportedDebugFuncs.end());
-  ReportedDebugFuncs.erase(i);
-  return 0;
-}
-
-int version() {
-  return 1;
-}
-
-bool is_oprofile_running() {
-  return true;
-}
-
-} //namespace test_opagent
-
-class OProfileJITEventListenerTest
-: public JITEventListenerTestBase<OProfileWrapper>
-{
-public:
-  OProfileJITEventListenerTest()
-  : JITEventListenerTestBase<OProfileWrapper>(
-    new OProfileWrapper(test_opagent::open_agent,
-      test_opagent::close_agent,
-      test_opagent::write_native_code,
-      test_opagent::write_debug_line_info,
-      test_opagent::unload_native_code,
-      test_opagent::version,
-      test_opagent::version,
-      test_opagent::is_oprofile_running))
-  {
-    EXPECT_TRUE(0 != MockWrapper);
-
-    Listener.reset(JITEventListener::createOProfileJITEventListener(
-      MockWrapper.get()));
-    EXPECT_TRUE(0 != Listener);
-    EE->RegisterJITEventListener(Listener.get());
-  }
-};
-
-TEST_F(OProfileJITEventListenerTest, NoDebugInfo) {
-  TestNoDebugInfo(ReportedDebugFuncs);
-}
-
-TEST_F(OProfileJITEventListenerTest, SingleLine) {
-  TestSingleLine(ReportedDebugFuncs);
-}
-
-TEST_F(OProfileJITEventListenerTest, MultipleLines) {
-  TestMultipleLines(ReportedDebugFuncs);
-}
-
-TEST_F(OProfileJITEventListenerTest, MultipleFiles) {
-  TestMultipleFiles(ReportedDebugFuncs);
-}
-
-testing::Environment* const jit_env =
-  testing::AddGlobalTestEnvironment(new JITEnvironment);

diff --git a/unittests/ExecutionEngine/MCJIT/CMakeLists.txt b/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
index afa3f2a..b10cbb4 100644
--- a/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
+++ b/unittests/ExecutionEngine/MCJIT/CMakeLists.txt

@@ -3,7 +3,7 @@
   Core
   ExecutionEngine
   IPO
-  JIT
+  MC
   MCJIT
   ScalarOpts
   Support

diff --git a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
index d03de89..c80b88b 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp

@@ -139,8 +139,6 @@
 
     // The operating systems below are known to be sufficiently incompatible
     // that they will fail the MCJIT C API tests.
-    UnsupportedOSs.push_back(Triple::Cygwin);
-
     UnsupportedEnvironments.push_back(Triple::Cygnus);
   }
   

diff --git a/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp
index 98587f7..0582c92 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp

@@ -8,7 +8,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;

diff --git a/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp
index c5ca36e..b0d1bb3 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp

@@ -94,8 +94,8 @@
   Function *FA, *FB;
   createTwoModuleCase(A, FA, B, FB);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
   checkAdd(ptr);
@@ -114,8 +114,8 @@
   Function *FA, *FB;
   createTwoModuleCase(A, FA, B, FB);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FB->getName().str());
   TheJIT->finalizeObject();
@@ -135,8 +135,8 @@
   Function *FA, *FB;
   createTwoModuleExternCase(A, FA, B, FB);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FB->getName().str());
   TheJIT->finalizeObject();
@@ -156,8 +156,8 @@
   Function *FA, *FB;
   createTwoModuleExternCase(A, FA, B, FB);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
   checkAdd(ptr);
@@ -177,8 +177,8 @@
   createTwoModuleExternCase(A, FA1, B, FB);
   FA2 = insertSimpleCallFunction<int32_t(int32_t, int32_t)>(A.get(), FA1);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FB->getName().str());
   TheJIT->finalizeObject();
@@ -213,8 +213,8 @@
   FB = startFunction<int32_t(void)>(B.get(), "FB");
   endFunctionWithRet(FB, Builder.CreateLoad(GVB));
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
 
   uint64_t FBPtr = TheJIT->getFunctionAddress(FB->getName().str());
   TheJIT->finalizeObject();
@@ -241,9 +241,9 @@
   Function *FA, *FB, *FC;
   createThreeModuleCase(A, FA, B, FB, C, FC);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
-  TheJIT->addModule(C.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
+  TheJIT->addModule(std::move(C));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FC->getName().str());
   checkAdd(ptr);
@@ -266,9 +266,9 @@
   Function *FA, *FB, *FC;
   createThreeModuleCase(A, FA, B, FB, C, FC);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
-  TheJIT->addModule(C.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
+  TheJIT->addModule(std::move(C));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
   checkAdd(ptr);
@@ -291,9 +291,9 @@
   Function *FA, *FB, *FC;
   createThreeModuleChainedCallsCase(A, FA, B, FB, C, FC);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
-  TheJIT->addModule(C.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
+  TheJIT->addModule(std::move(C));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FC->getName().str());
   checkAdd(ptr);
@@ -316,9 +316,9 @@
   Function *FA, *FB, *FC;
   createThreeModuleChainedCallsCase(A, FA, B, FB, C, FC);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
-  TheJIT->addModule(C.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
+  TheJIT->addModule(std::move(C));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
   checkAdd(ptr);
@@ -341,8 +341,8 @@
   Function *FA, *FB1, *FB2;
   createCrossModuleRecursiveCase(A, FA, B, FB1, FB2);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
   checkAccumulate(ptr);
@@ -362,8 +362,8 @@
   Function *FA, *FB1, *FB2;
   createCrossModuleRecursiveCase(A, FA, B, FB1, FB2);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FB1->getName().str());
   checkAccumulate(ptr);
@@ -383,8 +383,8 @@
   Function *FA, *FB1, *FB2;
   createCrossModuleRecursiveCase(A, FA, B, FB1, FB2);
 
-  createJIT(A.release());
-  TheJIT->addModule(B.release());
+  createJIT(std::move(A));
+  TheJIT->addModule(std::move(B));
 
   uint64_t ptr = TheJIT->getFunctionAddress(FB1->getName().str());
   checkAccumulate(ptr);

diff --git a/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp
index fbbab42..2736383 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITObjectCacheTest.cpp

@@ -11,7 +11,6 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/ExecutionEngine/JIT.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/ExecutionEngine/ObjectCache.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
@@ -25,17 +24,7 @@
 public:
   TestObjectCache() : DuplicateInserted(false) { }
 
-  virtual ~TestObjectCache() {
-    // Free any buffers we've allocated.
-    SmallVectorImpl<MemoryBuffer *>::iterator it, end;
-    end = AllocatedBuffers.end();
-    for (it = AllocatedBuffers.begin(); it != end; ++it) {
-      delete *it;
-    }
-    AllocatedBuffers.clear();
-  }
-
-  virtual void notifyObjectCompiled(const Module *M, const MemoryBuffer *Obj) {
+  void notifyObjectCompiled(const Module *M, MemoryBufferRef Obj) override {
     // If we've seen this module before, note that.
     const std::string ModuleID = M->getModuleIdentifier();
     if (ObjMap.find(ModuleID) != ObjMap.end())
@@ -44,7 +33,7 @@
     ObjMap[ModuleID] = copyBuffer(Obj);
   }
 
-  virtual MemoryBuffer* getObject(const Module* M) {
+  virtual std::unique_ptr<MemoryBuffer> getObject(const Module* M) override {
     const MemoryBuffer* BufferFound = getObjectInternal(M);
     ModulesLookedUp.insert(M->getModuleIdentifier());
     if (!BufferFound)
@@ -72,16 +61,18 @@
   }
 
 private:
-  MemoryBuffer *copyBuffer(const MemoryBuffer *Buf) {
+  MemoryBuffer *copyBuffer(MemoryBufferRef Buf) {
     // Create a local copy of the buffer.
-    MemoryBuffer *NewBuffer = MemoryBuffer::getMemBufferCopy(Buf->getBuffer());
-    AllocatedBuffers.push_back(NewBuffer);
-    return NewBuffer;
+    std::unique_ptr<MemoryBuffer> NewBuffer =
+        MemoryBuffer::getMemBufferCopy(Buf.getBuffer());
+    MemoryBuffer *Ret = NewBuffer.get();
+    AllocatedBuffers.push_back(std::move(NewBuffer));
+    return Ret;
   }
 
   StringMap<const MemoryBuffer *> ObjMap;
   StringSet<>                     ModulesLookedUp;
-  SmallVector<MemoryBuffer *, 2>  AllocatedBuffers;
+  SmallVector<std::unique_ptr<MemoryBuffer>, 2> AllocatedBuffers;
   bool                            DuplicateInserted;
 };
 
@@ -121,7 +112,7 @@
 TEST_F(MCJITObjectCacheTest, SetNullObjectCache) {
   SKIP_UNSUPPORTED_PLATFORM;
 
-  createJIT(M.release());
+  createJIT(std::move(M));
 
   TheJIT->setObjectCache(nullptr);
 
@@ -137,7 +128,7 @@
   // Save a copy of the module pointer before handing it off to MCJIT.
   const Module * SavedModulePointer = M.get();
 
-  createJIT(M.release());
+  createJIT(std::move(M));
 
   TheJIT->setObjectCache(Cache.get());
 
@@ -164,7 +155,7 @@
   std::unique_ptr<TestObjectCache> Cache(new TestObjectCache);
 
   // Compile this module with an MCJIT engine
-  createJIT(M.release());
+  createJIT(std::move(M));
   TheJIT->setObjectCache(Cache.get());
   TheJIT->finalizeObject();
 
@@ -181,7 +172,7 @@
   const Module * SecondModulePointer = M.get();
 
   // Create a new MCJIT instance to load this module then execute it.
-  createJIT(M.release());
+  createJIT(std::move(M));
   TheJIT->setObjectCache(Cache.get());
   compileAndRun();
 
@@ -198,7 +189,7 @@
   std::unique_ptr<TestObjectCache> Cache(new TestObjectCache);
 
   // Compile this module with an MCJIT engine
-  createJIT(M.release());
+  createJIT(std::move(M));
   TheJIT->setObjectCache(Cache.get());
   TheJIT->finalizeObject();
 
@@ -216,7 +207,7 @@
   const Module * SecondModulePointer = M.get();
 
   // Create a new MCJIT instance to load this module then execute it.
-  createJIT(M.release());
+  createJIT(std::move(M));
   TheJIT->setObjectCache(Cache.get());
 
   // Verify that our object cache does not contain the module yet.

diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
index c37c1d1..64d8c2f 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp

@@ -49,7 +49,7 @@
 
   int initialValue = 5;
   GlobalValue *Global = insertGlobalInt32(M.get(), "test_global", initialValue);
-  createJIT(M.release());
+  createJIT(std::move(M));
   void *globalPtr =  TheJIT->getPointerToGlobal(Global);
   EXPECT_TRUE(nullptr != globalPtr)
     << "Unable to get pointer to global value from JIT";
@@ -62,7 +62,7 @@
   SKIP_UNSUPPORTED_PLATFORM;
 
   Function *F = insertAddFunction(M.get());
-  createJIT(M.release());
+  createJIT(std::move(M));
   uint64_t addPtr = TheJIT->getFunctionAddress(F->getName().str());
   EXPECT_TRUE(0 != addPtr)
     << "Unable to get pointer to function from JIT";
@@ -83,7 +83,7 @@
 
   int rc = 6;
   Function *Main = insertMainFunction(M.get(), 6);
-  createJIT(M.release());
+  createJIT(std::move(M));
   uint64_t ptr = TheJIT->getFunctionAddress(Main->getName().str());
   EXPECT_TRUE(0 != ptr)
     << "Unable to get pointer to main() from JIT";
@@ -104,7 +104,7 @@
   Value *ReadGlobal = Builder.CreateLoad(GV);
   endFunctionWithRet(ReturnGlobal, ReadGlobal);
 
-  createJIT(M.release());
+  createJIT(std::move(M));
   uint64_t rgvPtr = TheJIT->getFunctionAddress(ReturnGlobal->getName().str());
   EXPECT_TRUE(0 != rgvPtr);
 
@@ -175,7 +175,7 @@
     Inner = Outer;
   }
 
-  createJIT(M.release());
+  createJIT(std::move(M));
   uint64_t ptr = TheJIT->getFunctionAddress(Outer->getName().str());
   EXPECT_TRUE(0 != ptr)
     << "Unable to get pointer to outer function from JIT";
@@ -187,4 +187,16 @@
 
 #endif /*!defined(__arm__)*/
 
+TEST_F(MCJITTest, multiple_decl_lookups) {
+  SKIP_UNSUPPORTED_PLATFORM;
+  // Declare (but do not define) _exit, then look the declaration up twice.
+  Function *Foo =
+      insertExternalReferenceToFunction<void(void)>(M.get(), "_exit");
+  createJIT(std::move(M));
+  void *A = TheJIT->getPointerToFunction(Foo);
+  void *B = TheJIT->getPointerToFunction(Foo);
+  EXPECT_TRUE(A != nullptr) << "Failed lookup - test not correctly configured.";
+  EXPECT_EQ(A, B) << "Repeat calls to getPointerToFunction fail.";
+}
+
 }

diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTestAPICommon.h b/unittests/ExecutionEngine/MCJIT/MCJITTestAPICommon.h
index a48c071..7d704de 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITTestAPICommon.h
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTestAPICommon.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MCJIT_TEST_API_COMMON_H
-#define MCJIT_TEST_API_COMMON_H
+#ifndef LLVM_UNITTESTS_EXECUTIONENGINE_MCJIT_MCJITTESTAPICOMMON_H
+#define LLVM_UNITTESTS_EXECUTIONENGINE_MCJIT_MCJITTESTAPICOMMON_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Triple.h"
@@ -93,5 +93,5 @@
 
 } // namespace llvm
 
-#endif // MCJIT_TEST_API_COMMON_H
+#endif
 

diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
index 25de312..eea88bb 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h

@@ -14,8 +14,8 @@
 //===----------------------------------------------------------------------===//
 
 
-#ifndef MCJIT_TEST_BASE_H
-#define MCJIT_TEST_BASE_H
+#ifndef LLVM_UNITTESTS_EXECUTIONENGINE_MCJIT_MCJITTESTBASE_H
+#define LLVM_UNITTESTS_EXECUTIONENGINE_MCJIT_MCJITTESTBASE_H
 
 #include "MCJITTestAPICommon.h"
 #include "llvm/Config/config.h"
@@ -107,6 +107,15 @@
     return Result;
   }
 
+  // Adds to M a declaration of a function named Name that is defined
+  // elsewhere. The declaration has external linkage and its type is built
+  // from the FuncType template argument.
+  template <typename FuncType>
+  Function *insertExternalReferenceToFunction(Module *M, StringRef Name) {
+    FunctionType *FTy = TypeBuilder<FuncType, false>::get(Context);
+    return Function::Create(FTy, GlobalValue::ExternalLinkage, Name, M);
+  }
+
   // Inserts an declaration to a function defined elsewhere
   Function *insertExternalReferenceToFunction(Module *M, StringRef Name,
                                               FunctionType *FuncTy) {
@@ -302,26 +311,23 @@
     // The operating systems below are known to be incompatible with MCJIT as
     // they are copied from the test/ExecutionEngine/MCJIT/lit.local.cfg and
     // should be kept in sync.
-    UnsupportedOSs.push_back(Triple::Cygwin);
     UnsupportedOSs.push_back(Triple::Darwin);
 
     UnsupportedEnvironments.push_back(Triple::Cygnus);
   }
 
-  void createJIT(Module *M) {
+  void createJIT(std::unique_ptr<Module> M) {
 
     // Due to the EngineBuilder constructor, it is required to have a Module
     // in order to construct an ExecutionEngine (i.e. MCJIT)
     assert(M != 0 && "a non-null Module must be provided to create MCJIT");
 
-    EngineBuilder EB(M);
+    EngineBuilder EB(std::move(M));
     std::string Error;
     TheJIT.reset(EB.setEngineKind(EngineKind::JIT)
-                 .setUseMCJIT(true) /* can this be folded into the EngineKind enum? */
                  .setMCJITMemoryManager(MM)
                  .setErrorStr(&Error)
                  .setOptLevel(CodeGenOpt::None)
-                 .setAllocateGVsWithCode(false) /*does this do anything?*/
                  .setCodeModel(CodeModel::JITDefault)
                  .setRelocationModel(Reloc::Default)
                  .setMArch(MArch)
@@ -345,4 +351,4 @@
 
 } // namespace llvm
 
-#endif // MCJIT_TEST_H
+#endif

diff --git a/unittests/ExecutionEngine/MCJIT/Makefile b/unittests/ExecutionEngine/MCJIT/Makefile
index c4dd740..2822b20 100644
--- a/unittests/ExecutionEngine/MCJIT/Makefile
+++ b/unittests/ExecutionEngine/MCJIT/Makefile

@@ -9,7 +9,7 @@
 
 LEVEL = ../../..
 TESTNAME = MCJIT
-LINK_COMPONENTS := core ipo jit mcjit native support
+LINK_COMPONENTS := core ipo mcjit native support
 
 include $(LEVEL)/Makefile.config
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest

diff --git a/unittests/ExecutionEngine/Makefile b/unittests/ExecutionEngine/Makefile
index 38e667f..8ecb883 100644
--- a/unittests/ExecutionEngine/Makefile
+++ b/unittests/ExecutionEngine/Makefile

@@ -14,7 +14,7 @@
 include $(LEVEL)/Makefile.config
 
 ifeq ($(TARGET_HAS_JIT),1)
-  PARALLEL_DIRS = JIT MCJIT
+  PARALLEL_DIRS = MCJIT
 endif
 
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest

diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt
index b439d59..a046209 100644
--- a/unittests/IR/CMakeLists.txt
+++ b/unittests/IR/CMakeLists.txt

@@ -10,6 +10,7 @@
   AttributesTest.cpp
   ConstantRangeTest.cpp
   ConstantsTest.cpp
+  DebugInfoTest.cpp
   DominatorTreeTest.cpp
   IRBuilderTest.cpp
   InstructionsTest.cpp
@@ -21,6 +22,7 @@
   PatternMatch.cpp
   TypeBuilderTest.cpp
   TypesTest.cpp
+  UseTest.cpp
   UserTest.cpp
   ValueHandleTest.cpp
   ValueMapTest.cpp

diff --git a/unittests/IR/ConstantsTest.cpp b/unittests/IR/ConstantsTest.cpp
index 0cd8549..5414b25 100644
--- a/unittests/IR/ConstantsTest.cpp
+++ b/unittests/IR/ConstantsTest.cpp

@@ -274,5 +274,78 @@
 
 #undef CHECK
 
+TEST(ConstantsTest, ConstantArrayReplaceWithConstant) {
+  LLVMContext Context;
+  std::unique_ptr<Module> M(new Module("MyModule", Context));
+  // Checks RAUW on one element of a constant array. A01 is the array [0, 1].
+  Type *IntTy = Type::getInt8Ty(Context);
+  ArrayType *ArrayTy = ArrayType::get(IntTy, 2);
+  Constant *A01Vals[2] = {ConstantInt::get(IntTy, 0),
+                          ConstantInt::get(IntTy, 1)};
+  Constant *A01 = ConstantArray::get(ArrayTy, A01Vals);
+  // A0G is [0, ptrtoint(Global)], so it must be a different constant than A01.
+  Constant *Global = new GlobalVariable(*M, IntTy, false,
+                                        GlobalValue::ExternalLinkage, nullptr);
+  Constant *GlobalInt = ConstantExpr::getPtrToInt(Global, IntTy);
+  Constant *A0GVals[2] = {ConstantInt::get(IntTy, 0), GlobalInt};
+  Constant *A0G = ConstantArray::get(ArrayTy, A0GVals);
+  ASSERT_NE(A01, A0G);
+  // RefArray starts out with A0G as its initializer.
+  GlobalVariable *RefArray =
+      new GlobalVariable(*M, ArrayTy, false, GlobalValue::ExternalLinkage, A0G);
+  ASSERT_EQ(A0G, RefArray->getInitializer());
+  // Replacing the ptrtoint with 1 must make the initializer A01 itself.
+  GlobalInt->replaceAllUsesWith(ConstantInt::get(IntTy, 1));
+  ASSERT_EQ(A01, RefArray->getInitializer());
+}
+
+TEST(ConstantsTest, ConstantExprReplaceWithConstant) {
+  LLVMContext Context;
+  std::unique_ptr<Module> M(new Module("MyModule", Context));
+  // Checks RAUW on the operand of a constant expression. G1, G2 are distinct.
+  Type *IntTy = Type::getInt8Ty(Context);
+  Constant *G1 = new GlobalVariable(*M, IntTy, false,
+                                    GlobalValue::ExternalLinkage, nullptr);
+  Constant *G2 = new GlobalVariable(*M, IntTy, false,
+                                    GlobalValue::ExternalLinkage, nullptr);
+  ASSERT_NE(G1, G2);
+  // Int1 and Int2 are the ptrtoint expressions of the two globals.
+  Constant *Int1 = ConstantExpr::getPtrToInt(G1, IntTy);
+  Constant *Int2 = ConstantExpr::getPtrToInt(G2, IntTy);
+  ASSERT_NE(Int1, Int2);
+  // Ref starts out initialized with ptrtoint(G1).
+  GlobalVariable *Ref =
+      new GlobalVariable(*M, IntTy, false, GlobalValue::ExternalLinkage, Int1);
+  ASSERT_EQ(Int1, Ref->getInitializer());
+  // After G1 -> G2 the initializer must be the already existing ptrtoint(G2).
+  G1->replaceAllUsesWith(G2);
+  ASSERT_EQ(Int2, Ref->getInitializer());
+}
+
+TEST(ConstantsTest, GEPReplaceWithConstant) {
+  LLVMContext Context;
+  std::unique_ptr<Module> M(new Module("MyModule", Context));
+  // Checks RAUW on the base operand of a constant GEP.
+  Type *IntTy = Type::getInt32Ty(Context);
+  Type *PtrTy = PointerType::get(IntTy, 0);
+  auto *C1 = ConstantInt::get(IntTy, 1);
+  auto *Placeholder = new GlobalVariable(
+      *M, IntTy, false, GlobalValue::ExternalWeakLinkage, nullptr);
+  auto *GEP = ConstantExpr::getGetElementPtr(Placeholder, C1);
+  ASSERT_EQ(GEP->getOperand(0), Placeholder);
+  // Ref starts out initialized with the GEP.
+  auto *Ref =
+      new GlobalVariable(*M, PtrTy, false, GlobalValue::ExternalLinkage, GEP);
+  ASSERT_EQ(GEP, Ref->getInitializer());
+  // After Placeholder -> Alias, Ref keeps the same GEP, now based on Alias.
+  auto *Global = new GlobalVariable(*M, PtrTy, false,
+                                    GlobalValue::ExternalLinkage, nullptr);
+  auto *Alias = GlobalAlias::create(IntTy, 0, GlobalValue::ExternalLinkage,
+                                    "alias", Global, M.get());
+  Placeholder->replaceAllUsesWith(Alias);
+  ASSERT_EQ(GEP, Ref->getInitializer());
+  ASSERT_EQ(GEP->getOperand(0), Alias);
+}
+
 }  // end anonymous namespace
 }  // end namespace llvm

diff --git a/unittests/IR/DebugInfoTest.cpp b/unittests/IR/DebugInfoTest.cpp
new file mode 100644
index 0000000..1fa851e
--- /dev/null
+++ b/unittests/IR/DebugInfoTest.cpp

@@ -0,0 +1,68 @@
+//===- llvm/unittest/IR/DebugInfo.cpp - DebugInfo tests -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/DebugInfo.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+// Prints S as "(data-pointer,size) = '<bytes>'", spelling each NUL as \00.
+static void PrintTo(const StringRef &S, ::std::ostream *os) {
+  const void *Data = S.data();
+  *os << "(" << Data << "," << S.size() << ") = '";
+  for (const char Byte : S) {
+    if (Byte == '\0') *os << "\\00"; else *os << Byte;
+  }
+  *os << "'";
+}
+static void PrintTo(const DIHeaderFieldIterator &I, ::std::ostream *os) {
+  PrintTo(I.getCurrent(), os); // output shape: "<current> in <header>"
+  *os << " in ";
+  PrintTo(I.getHeader(), os);
+}
+
+} // end namespace llvm
+
+namespace {
+
+#define MAKE_FIELD_ITERATOR(S)                                                 \
+  DIHeaderFieldIterator(StringRef(S, sizeof(S) - 1))
+TEST(DebugInfoTest, DIHeaderFieldIterator) { // NUL bytes separate the fields.
+  ASSERT_EQ(DIHeaderFieldIterator(), DIHeaderFieldIterator());
+  // sizeof(S) - 1 keeps embedded NULs. Empty header: a single empty field.
+  ASSERT_NE(DIHeaderFieldIterator(), MAKE_FIELD_ITERATOR(""));
+  ASSERT_EQ(DIHeaderFieldIterator(), ++MAKE_FIELD_ITERATOR(""));
+  ASSERT_EQ("", *DIHeaderFieldIterator(""));
+  // No NUL in the header: the whole string is the only field.
+  ASSERT_NE(DIHeaderFieldIterator(), MAKE_FIELD_ITERATOR("stuff"));
+  ASSERT_EQ(DIHeaderFieldIterator(), ++MAKE_FIELD_ITERATOR("stuff"));
+  ASSERT_EQ("stuff", *DIHeaderFieldIterator("stuff"));
+  // NUL in the middle: two fields, "st" and "uff".
+  ASSERT_NE(DIHeaderFieldIterator(), MAKE_FIELD_ITERATOR("st\0uff"));
+  ASSERT_NE(DIHeaderFieldIterator(), ++MAKE_FIELD_ITERATOR("st\0uff"));
+  ASSERT_EQ(DIHeaderFieldIterator(), ++++MAKE_FIELD_ITERATOR("st\0uff"));
+  ASSERT_EQ("st", *MAKE_FIELD_ITERATOR("st\0uff"));
+  ASSERT_EQ("uff", *++MAKE_FIELD_ITERATOR("st\0uff"));
+  // Trailing NUL: "stuff" followed by an empty field.
+  ASSERT_NE(DIHeaderFieldIterator(), MAKE_FIELD_ITERATOR("stuff\0"));
+  ASSERT_NE(DIHeaderFieldIterator(), ++MAKE_FIELD_ITERATOR("stuff\0"));
+  ASSERT_EQ(DIHeaderFieldIterator(), ++++MAKE_FIELD_ITERATOR("stuff\0"));
+  ASSERT_EQ("stuff", *MAKE_FIELD_ITERATOR("stuff\0"));
+  ASSERT_EQ("", *++MAKE_FIELD_ITERATOR("stuff\0"));
+  // Leading NUL: an empty field followed by "stuff".
+  ASSERT_NE(DIHeaderFieldIterator(), MAKE_FIELD_ITERATOR("\0stuff"));
+  ASSERT_NE(DIHeaderFieldIterator(), ++MAKE_FIELD_ITERATOR("\0stuff"));
+  ASSERT_EQ(DIHeaderFieldIterator(), ++++MAKE_FIELD_ITERATOR("\0stuff"));
+  ASSERT_EQ("", *MAKE_FIELD_ITERATOR("\0stuff"));
+  ASSERT_EQ("stuff", *++MAKE_FIELD_ITERATOR("\0stuff"));
+}
+
+} // end namespace

diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp
index ab43d1c..6c43d6f 100644
--- a/unittests/IR/DominatorTreeTest.cpp
+++ b/unittests/IR/DominatorTreeTest.cpp

@@ -186,8 +186,7 @@
     };
     char DPass::ID = 0;
 
-
-    Module* makeLLVMModule(DPass *P) {
+    std::unique_ptr<Module> makeLLVMModule(DPass *P) {
       const char *ModuleStrig =
         "declare i32 @g()\n" \
         "define void @f(i32 %x) {\n" \
@@ -213,12 +212,12 @@
         "}\n";
       LLVMContext &C = getGlobalContext();
       SMDiagnostic Err;
-      return ParseAssemblyString(ModuleStrig, nullptr, Err, C);
+      return parseAssemblyString(ModuleStrig, Err, C);
     }
 
     TEST(DominatorTree, Unreachable) {
       DPass *P = new DPass();
-      std::unique_ptr<Module> M(makeLLVMModule(P));
+      std::unique_ptr<Module> M = makeLLVMModule(P);
       PassManager Passes;
       Passes.add(P);
       Passes.run(*M);

diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index 2108575..df5c840 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp

@@ -189,12 +189,16 @@
 
   Builder.clearFastMathFlags();
 
+  // To test a copy, make sure that both a '0' and a '1' flag change state.
   F = Builder.CreateFDiv(F, F);
   ASSERT_TRUE(isa<Instruction>(F));
   FDiv = cast<Instruction>(F);
   EXPECT_FALSE(FDiv->getFastMathFlags().any());
+  FDiv->setHasAllowReciprocal(true);
+  FAdd->setHasAllowReciprocal(false);
   FDiv->copyFastMathFlags(FAdd);
   EXPECT_TRUE(FDiv->hasNoNaNs());
+  EXPECT_FALSE(FDiv->hasAllowReciprocal());
 
 }
 

diff --git a/unittests/IR/LegacyPassManagerTest.cpp b/unittests/IR/LegacyPassManagerTest.cpp
index 9c2a835..4efc2f5 100644
--- a/unittests/IR/LegacyPassManagerTest.cpp
+++ b/unittests/IR/LegacyPassManagerTest.cpp

@@ -54,11 +54,11 @@
       static char run;
       static char ID;
       ModuleNDNM() : ModulePass(ID) { }
-      virtual bool runOnModule(Module &M) {
+      bool runOnModule(Module &M) override {
         run++;
         return false;
       }
-      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      void getAnalysisUsage(AnalysisUsage &AU) const override {
         AU.setPreservesAll();
       }
     };
@@ -70,7 +70,7 @@
       static char run;
       static char ID;
       ModuleNDM() : ModulePass(ID) {}
-      virtual bool runOnModule(Module &M) {
+      bool runOnModule(Module &M) override {
         run++;
         return true;
       }
@@ -83,7 +83,7 @@
       static char run;
       static char ID;
       ModuleNDM2() : ModulePass(ID) {}
-      virtual bool runOnModule(Module &M) {
+      bool runOnModule(Module &M) override {
         run++;
         return true;
       }
@@ -98,12 +98,12 @@
       ModuleDNM() : ModulePass(ID) {
         initializeModuleNDMPass(*PassRegistry::getPassRegistry());
       }
-      virtual bool runOnModule(Module &M) {
+      bool runOnModule(Module &M) override {
         EXPECT_TRUE(getAnalysisIfAvailable<DataLayoutPass>());
         run++;
         return false;
       }
-      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      void getAnalysisUsage(AnalysisUsage &AU) const override {
         AU.addRequired<ModuleNDM>();
         AU.setPreservesAll();
       }
@@ -139,7 +139,7 @@
         runc = 0;
       }
 
-      virtual void releaseMemory() {
+      void releaseMemory() override {
         EXPECT_GT(runc, 0);
         EXPECT_GT(allocated, 0);
         allocated--;
@@ -157,12 +157,12 @@
       using llvm::Pass::doInitialization;
       using llvm::Pass::doFinalization;
 #endif
-      virtual bool doInitialization(T &t) {
+      bool doInitialization(T &t) override {
         EXPECT_FALSE(PassTestBase<P>::initialized);
         PassTestBase<P>::initialized = true;
         return false;
       }
-      virtual bool doFinalization(T &t) {
+      bool doFinalization(T &t) override {
         EXPECT_FALSE(PassTestBase<P>::finalized);
         PassTestBase<P>::finalized = true;
         EXPECT_EQ(0, PassTestBase<P>::allocated);
@@ -175,7 +175,7 @@
       CGPass() {
         initializeCGPassPass(*PassRegistry::getPassRegistry());
       }
-      virtual bool runOnSCC(CallGraphSCC &SCMM) {
+      bool runOnSCC(CallGraphSCC &SCMM) override {
         EXPECT_TRUE(getAnalysisIfAvailable<DataLayoutPass>());
         run();
         return false;
@@ -184,7 +184,7 @@
 
     struct FPass : public PassTest<Module, FunctionPass> {
     public:
-      virtual bool runOnFunction(Function &F) {
+      bool runOnFunction(Function &F) override {
         // FIXME: PR4112
         // EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
         run();
@@ -209,17 +209,17 @@
       }
       using llvm::Pass::doInitialization;
       using llvm::Pass::doFinalization;
-      virtual bool doInitialization(Loop* L, LPPassManager &LPM) {
+      bool doInitialization(Loop* L, LPPassManager &LPM) override {
         initialized = true;
         initcount++;
         return false;
       }
-      virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+      bool runOnLoop(Loop *L, LPPassManager &LPM) override {
         EXPECT_TRUE(getAnalysisIfAvailable<DataLayoutPass>());
         run();
         return false;
       }
-      virtual bool doFinalization() {
+      bool doFinalization() override {
         fincount++;
         finalized = true;
         return false;
@@ -242,25 +242,25 @@
         inited = 0;
         fin = 0;
       }
-      virtual bool doInitialization(Module &M) {
+      bool doInitialization(Module &M) override {
         EXPECT_FALSE(initialized);
         initialized = true;
         return false;
       }
-      virtual bool doInitialization(Function &F) {
+      bool doInitialization(Function &F) override {
         inited++;
         return false;
       }
-      virtual bool runOnBasicBlock(BasicBlock &BB) {
+      bool runOnBasicBlock(BasicBlock &BB) override {
         EXPECT_TRUE(getAnalysisIfAvailable<DataLayoutPass>());
         run();
         return false;
       }
-      virtual bool doFinalization(Function &F) {
+      bool doFinalization(Function &F) override {
         fin++;
         return false;
       }
-      virtual bool doFinalization(Module &M) {
+      bool doFinalization(Module &M) override {
         EXPECT_FALSE(finalized);
         finalized = true;
         EXPECT_EQ(0, allocated);
@@ -276,7 +276,7 @@
       OnTheFlyTest() : ModulePass(ID) {
         initializeFPassPass(*PassRegistry::getPassRegistry());
       }
-      virtual bool runOnModule(Module &M) {
+      bool runOnModule(Module &M) override {
         EXPECT_TRUE(getAnalysisIfAvailable<DataLayoutPass>());
         for (Module::iterator I=M.begin(),E=M.end(); I != E; ++I) {
           Function &F = *I;
@@ -287,7 +287,7 @@
         }
         return false;
       }
-      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      void getAnalysisUsage(AnalysisUsage &AU) const override {
         AU.addRequired<FPass>();
       }
     };
@@ -303,7 +303,7 @@
       mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
 
       PassManager Passes;
-      Passes.add(new DataLayoutPass(&M));
+      Passes.add(new DataLayoutPass());
       Passes.add(mNDM2);
       Passes.add(mNDM);
       Passes.add(mNDNM);
@@ -327,7 +327,7 @@
       mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
 
       PassManager Passes;
-      Passes.add(new DataLayoutPass(&M));
+      Passes.add(new DataLayoutPass());
       Passes.add(mNDM);
       Passes.add(mNDNM);
       Passes.add(mNDM2);// invalidates mNDM needed by mDNM
@@ -349,7 +349,7 @@
       std::unique_ptr<Module> M(makeLLVMModule());
       T *P = new T();
       PassManager Passes;
-      Passes.add(new DataLayoutPass(M.get()));
+      Passes.add(new DataLayoutPass());
       Passes.add(P);
       Passes.run(*M);
       T::finishedOK(run);
@@ -360,7 +360,7 @@
       Module *M = makeLLVMModule();
       T *P = new T();
       PassManager Passes;
-      Passes.add(new DataLayoutPass(M));
+      Passes.add(new DataLayoutPass());
       Passes.add(P);
       Passes.run(*M);
       T::finishedOK(run, N);
@@ -398,7 +398,7 @@
         SCOPED_TRACE("Running OnTheFlyTest");
         struct OnTheFlyTest *O = new OnTheFlyTest();
         PassManager Passes;
-        Passes.add(new DataLayoutPass(M));
+        Passes.add(new DataLayoutPass());
         Passes.add(O);
         Passes.run(*M);
 

diff --git a/unittests/IR/PassManagerTest.cpp b/unittests/IR/PassManagerTest.cpp
index 25037a7..d493156 100644
--- a/unittests/IR/PassManagerTest.cpp
+++ b/unittests/IR/PassManagerTest.cpp

@@ -168,7 +168,7 @@
 Module *parseIR(const char *IR) {
   LLVMContext &C = getGlobalContext();
   SMDiagnostic Err;
-  return ParseAssemblyString(IR, nullptr, Err, C);
+  return parseAssemblyString(IR, Err, C).release();
 }
 
 class PassManagerTest : public ::testing::Test {

diff --git a/unittests/IR/UseTest.cpp b/unittests/IR/UseTest.cpp
new file mode 100644
index 0000000..3f33ca6
--- /dev/null
+++ b/unittests/IR/UseTest.cpp

@@ -0,0 +1,112 @@
+//===- llvm/unittest/IR/UseTest.cpp - Use unit tests ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(UseTest, sort) { // sortUseList() must reorder the uses of %x.
+  LLVMContext C;
+  const char *ModuleString = "define void @f(i32 %x) {\n"
+                             "entry:\n"
+                             "  %v0 = add i32 %x, 0\n"
+                             "  %v2 = add i32 %x, 2\n"
+                             "  %v5 = add i32 %x, 5\n"
+                             "  %v1 = add i32 %x, 1\n"
+                             "  %v3 = add i32 %x, 3\n"
+                             "  %v7 = add i32 %x, 7\n"
+                             "  %v6 = add i32 %x, 6\n"
+                             "  %v4 = add i32 %x, 4\n"
+                             "  ret void\n"
+                             "}\n";
+  SMDiagnostic Err;
+  char vnbuf[8]; // Holds one expected name, "v0".."v7", plus the terminator.
+  std::unique_ptr<Module> M = parseAssemblyString(ModuleString, Err, C);
+  ASSERT_TRUE(M.get() != nullptr) << "ModuleString failed to parse";
+  Function *F = M->getFunction("f");
+  ASSERT_TRUE(F);
+  ASSERT_TRUE(F->arg_begin() != F->arg_end());
+  Argument &X = *F->arg_begin();
+  ASSERT_EQ("x", X.getName());
+  // Ascending by user name: the users must come back as v0, v1, ..., v7.
+  X.sortUseList([](const Use &L, const Use &R) {
+    return L.getUser()->getName() < R.getUser()->getName();
+  });
+  unsigned I = 0;
+  for (User *U : X.users()) {
+    format("v%u", I++).print(vnbuf, sizeof(vnbuf));
+    EXPECT_EQ(vnbuf, U->getName());
+  }
+  ASSERT_EQ(8u, I);
+  // Descending by user name: the users must come back as v7, v6, ..., v0.
+  X.sortUseList([](const Use &L, const Use &R) {
+    return L.getUser()->getName() > R.getUser()->getName();
+  });
+  I = 0;
+  for (User *U : X.users()) {
+    format("v%u", (7 - I++)).print(vnbuf, sizeof(vnbuf));
+    EXPECT_EQ(vnbuf, U->getName());
+  }
+  ASSERT_EQ(8u, I);
+}
+
+TEST(UseTest, reverse) { // reverseUseList() must flip the order of %x's uses.
+  LLVMContext C;
+  const char *ModuleString = "define void @f(i32 %x) {\n"
+                             "entry:\n"
+                             "  %v0 = add i32 %x, 0\n"
+                             "  %v2 = add i32 %x, 2\n"
+                             "  %v5 = add i32 %x, 5\n"
+                             "  %v1 = add i32 %x, 1\n"
+                             "  %v3 = add i32 %x, 3\n"
+                             "  %v7 = add i32 %x, 7\n"
+                             "  %v6 = add i32 %x, 6\n"
+                             "  %v4 = add i32 %x, 4\n"
+                             "  ret void\n"
+                             "}\n";
+  SMDiagnostic Err;
+  char vnbuf[8]; // Holds one expected name, "v0".."v7", plus the terminator.
+  std::unique_ptr<Module> M = parseAssemblyString(ModuleString, Err, C);
+  ASSERT_TRUE(M.get() != nullptr) << "ModuleString failed to parse";
+  Function *F = M->getFunction("f");
+  ASSERT_TRUE(F);
+  ASSERT_TRUE(F->arg_begin() != F->arg_end());
+  Argument &X = *F->arg_begin();
+  ASSERT_EQ("x", X.getName());
+  // Start from a known order: ascending by user name, v0, v1, ..., v7.
+  X.sortUseList([](const Use &L, const Use &R) {
+    return L.getUser()->getName() < R.getUser()->getName();
+  });
+  unsigned I = 0;
+  for (User *U : X.users()) {
+    format("v%u", I++).print(vnbuf, sizeof(vnbuf));
+    EXPECT_EQ(vnbuf, U->getName());
+  }
+  ASSERT_EQ(8u, I);
+  // After the reversal the users must come back as v7, v6, ..., v0.
+  X.reverseUseList();
+  I = 0;
+  for (User *U : X.users()) {
+    format("v%u", (7 - I++)).print(vnbuf, sizeof(vnbuf));
+    EXPECT_EQ(vnbuf, U->getName());
+  }
+  ASSERT_EQ(8u, I);
+}
+
+} // end anonymous namespace

diff --git a/unittests/IR/UserTest.cpp b/unittests/IR/UserTest.cpp
index eb07e82..5572424 100644
--- a/unittests/IR/UserTest.cpp
+++ b/unittests/IR/UserTest.cpp

@@ -65,7 +65,7 @@
                              "  ret void\n"
                              "}\n";
   SMDiagnostic Err;
-  Module *M = ParseAssemblyString(ModuleString, nullptr, Err, C);
+  std::unique_ptr<Module> M = parseAssemblyString(ModuleString, Err, C);
 
   Function *F = M->getFunction("f");
   BasicBlock &ExitBB = F->back();

diff --git a/unittests/IR/ValueMapTest.cpp b/unittests/IR/ValueMapTest.cpp
index 0b7198f..a6bad71 100644
--- a/unittests/IR/ValueMapTest.cpp
+++ b/unittests/IR/ValueMapTest.cpp

@@ -186,11 +186,11 @@
   };
   static void onRAUW(const ExtraData &Data, KeyT Old, KeyT New) {
     *Data.CalledRAUW = true;
-    EXPECT_FALSE(Data.M->tryacquire()) << "Mutex should already be locked.";
+    EXPECT_FALSE(Data.M->try_lock()) << "Mutex should already be locked.";
   }
   static void onDelete(const ExtraData &Data, KeyT Old) {
     *Data.CalledDeleted = true;
-    EXPECT_FALSE(Data.M->tryacquire()) << "Mutex should already be locked.";
+    EXPECT_FALSE(Data.M->try_lock()) << "Mutex should already be locked.";
   }
   static MutexT *getMutex(const ExtraData &Data) { return Data.M; }
 };

diff --git a/unittests/IR/ValueTest.cpp b/unittests/IR/ValueTest.cpp
index 61e44a9..4dd0c2c 100644
--- a/unittests/IR/ValueTest.cpp
+++ b/unittests/IR/ValueTest.cpp

@@ -34,7 +34,7 @@
                              "  ret void\n"
                              "}\n";
   SMDiagnostic Err;
-  Module *M = ParseAssemblyString(ModuleString, nullptr, Err, C);
+  std::unique_ptr<Module> M = parseAssemblyString(ModuleString, Err, C);
 
   Function *F = M->getFunction("f");
 
@@ -60,6 +60,10 @@
                          GlobalVariable::NotThreadLocal,
                          1);
 
+  EXPECT_TRUE(Value::MaximumAlignment == 536870912U);
+  Dummy0->setAlignment(536870912U);
+  EXPECT_EQ(Dummy0->getAlignment(), 536870912U);
+
   // Make sure the address space isn't dropped when returning this.
   Constant *Dummy1 = M->getOrInsertGlobal("dummy", Int32Ty);
   EXPECT_EQ(Dummy0, Dummy1);
@@ -83,4 +87,23 @@
   EXPECT_EQ(1u, DummyCast1->getType()->getPointerAddressSpace());
   EXPECT_NE(DummyCast0, DummyCast1) << *DummyCast1;
 }
+
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+TEST(GlobalTest, AlignDeath) {
+  LLVMContext &Ctx = getGlobalContext();
+  std::unique_ptr<Module> M(new Module("TestModule", Ctx));
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+  GlobalVariable *Var =
+      new GlobalVariable(*M, Int32Ty, true, GlobalValue::ExternalLinkage,
+                         Constant::getAllOnesValue(Int32Ty), "var", nullptr,
+                         GlobalVariable::NotThreadLocal, 1);
+
+  EXPECT_DEATH(Var->setAlignment(536870913U), "Alignment is not a power of 2");
+  EXPECT_DEATH(Var->setAlignment(1073741824U),
+               "Alignment is greater than MaximumAlignment");
+}
+#endif
+#endif
+
 } // end anonymous namespace

diff --git a/unittests/LineEditor/CMakeLists.txt b/unittests/LineEditor/CMakeLists.txt
index c6823d8..70d7497 100644
--- a/unittests/LineEditor/CMakeLists.txt
+++ b/unittests/LineEditor/CMakeLists.txt

@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   LineEditor
+  Support
   )
 
 add_llvm_unittest(LineEditorTests

diff --git a/unittests/Linker/CMakeLists.txt b/unittests/Linker/CMakeLists.txt
index c3dccb6..05f45c0 100644
--- a/unittests/Linker/CMakeLists.txt
+++ b/unittests/Linker/CMakeLists.txt

@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  AsmParser
   core
   linker
   )

diff --git a/unittests/Linker/LinkModulesTest.cpp b/unittests/Linker/LinkModulesTest.cpp
index 4ccced1..b15d180 100644
--- a/unittests/Linker/LinkModulesTest.cpp
+++ b/unittests/Linker/LinkModulesTest.cpp

@@ -7,12 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/AsmParser/Parser.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
@@ -88,7 +90,7 @@
   Builder.CreateRet(ConstantPointerNull::get(Type::getInt8PtrTy(Ctx)));
 
   Module *LinkedModule = new Module("MyModuleLinked", Ctx);
-  Linker::LinkModules(LinkedModule, M.get(), Linker::PreserveSource, nullptr);
+  Linker::LinkModules(LinkedModule, M.get());
 
   // Delete the original module.
   M.reset();
@@ -122,12 +124,13 @@
   delete LinkedModule;
 }
 
-TEST_F(LinkModuleTest, EmptyModule) {
+static Module *getInternal(LLVMContext &Ctx) {
   Module *InternalM = new Module("InternalModule", Ctx);
   FunctionType *FTy = FunctionType::get(
       Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx), false /*=isVarArgs*/);
 
-  F = Function::Create(FTy, Function::InternalLinkage, "bar", InternalM);
+  Function *F =
+      Function::Create(FTy, Function::InternalLinkage, "bar", InternalM);
   F->setCallingConv(CallingConv::C);
 
   BasicBlock *BB = BasicBlock::Create(Ctx, "", F);
@@ -141,16 +144,37 @@
                          GlobalValue::InternalLinkage, nullptr, "g");
 
   GV->setInitializer(ConstantStruct::get(STy, F));
+  return InternalM;
+}
 
-  Module *EmptyM = new Module("EmptyModule1", Ctx);
-  Linker::LinkModules(EmptyM, InternalM, Linker::PreserveSource, nullptr);
+TEST_F(LinkModuleTest, EmptyModule) {
+  std::unique_ptr<Module> InternalM(getInternal(Ctx));
+  std::unique_ptr<Module> EmptyM(new Module("EmptyModule1", Ctx));
+  Linker::LinkModules(EmptyM.get(), InternalM.get());
+}
 
-  delete EmptyM;
-  EmptyM = new Module("EmptyModule2", Ctx);
-  Linker::LinkModules(InternalM, EmptyM, Linker::PreserveSource, nullptr);
+TEST_F(LinkModuleTest, EmptyModule2) { // EmptyModule with arguments swapped
+  std::unique_ptr<Module> InternalM(getInternal(Ctx));
+  std::unique_ptr<Module> EmptyM(new Module("EmptyModule2", Ctx));
+  Linker::LinkModules(InternalM.get(), EmptyM.get());
+}
 
-  delete EmptyM;
-  delete InternalM;
+TEST_F(LinkModuleTest, TypeMerge) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  const char *M1Str = "%t = type {i32}\n"
+                      "@t1 = weak global %t zeroinitializer\n";
+  std::unique_ptr<Module> M1 = parseAssemblyString(M1Str, Err, C);
+
+  const char *M2Str = "%t = type {i32}\n"
+                      "@t2 = weak global %t zeroinitializer\n";
+  std::unique_ptr<Module> M2 = parseAssemblyString(M2Str, Err, C);
+
+  Linker::LinkModules(M1.get(), M2.get(), [](const llvm::DiagnosticInfo &){});
+
+  EXPECT_EQ(M1->getNamedGlobal("t1")->getType(),
+            M1->getNamedGlobal("t2")->getType());
 }
 
 } // end anonymous namespace

diff --git a/unittests/Linker/Makefile b/unittests/Linker/Makefile
index c6058c4..ddbce07 100644
--- a/unittests/Linker/Makefile
+++ b/unittests/Linker/Makefile

@@ -9,7 +9,7 @@
 
 LEVEL = ../..
 TESTNAME = Linker
-LINK_COMPONENTS := core linker
+LINK_COMPONENTS := core linker asmparser
 
 include $(LEVEL)/Makefile.config
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest

diff --git a/unittests/MC/CMakeLists.txt b/unittests/MC/CMakeLists.txt
index e2beab2..c82bcde 100644
--- a/unittests/MC/CMakeLists.txt
+++ b/unittests/MC/CMakeLists.txt

@@ -1,9 +1,18 @@
 set(LLVM_LINK_COMPONENTS
-  MCAnalysis
+  ${LLVM_TARGETS_TO_BUILD}
+  MC
+  MCDisassembler
+  Support
   )
 
 add_llvm_unittest(MCTests
-  MCAtomTest.cpp
+  Disassembler.cpp
   StringTableBuilderTest.cpp
   YAMLTest.cpp
   )
+
+foreach(t ${LLVM_TARGETS_TO_BUILD})
+  if (IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/${t}")  # skip targets with no dir
+    add_subdirectory(${t})
+  endif()
+endforeach()

diff --git a/unittests/MC/Disassembler.cpp b/unittests/MC/Disassembler.cpp
new file mode 100644
index 0000000..dd0f1ef
--- /dev/null
+++ b/unittests/MC/Disassembler.cpp

@@ -0,0 +1,64 @@
+//===- llvm/unittest/MC/Disassembler.cpp ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Disassembler.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+static const char *symbolLookupCallback(void *DisInfo, uint64_t ReferenceValue,
+                                        uint64_t *ReferenceType,
+                                        uint64_t ReferencePC,
+                                        const char **ReferenceName) {
+  *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None; // no ref kind
+  return nullptr; // stub: never supplies a symbol name
+}
+
+TEST(Disassembler, Test1) {
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllDisassemblers();
+  // Input bytes; the checks below expect them to decode as nop, nop, jmp.
+  uint8_t Bytes[] = {0x90, 0x90, 0xeb, 0xfd};
+  uint8_t *BytesP = Bytes;
+  const size_t OutStringSize = 100; // a byte count, so size_t rather than char
+  char OutString[OutStringSize];
+  LLVMDisasmContextRef DCR = LLVMCreateDisasm("x86_64-pc-linux", nullptr, 0,
+                                              nullptr, symbolLookupCallback);
+  if (!DCR)
+    return; // no disassembler available (presumably X86 is not built); skip
+
+  size_t InstSize;
+  unsigned NumBytes = sizeof(Bytes);
+  unsigned PC = 0;
+  // Decode one instruction at a time, advancing PC and the byte cursor.
+  InstSize = LLVMDisasmInstruction(DCR, BytesP, NumBytes, PC, OutString,
+                                   OutStringSize);
+  EXPECT_EQ(InstSize, 1U);
+  EXPECT_EQ(StringRef(OutString), "\tnop");
+  PC += InstSize;
+  BytesP += InstSize;
+  NumBytes -= InstSize;
+
+  InstSize = LLVMDisasmInstruction(DCR, BytesP, NumBytes, PC, OutString,
+                                   OutStringSize);
+  EXPECT_EQ(InstSize, 1U);
+  EXPECT_EQ(StringRef(OutString), "\tnop");
+  PC += InstSize;
+  BytesP += InstSize;
+  NumBytes -= InstSize;
+
+  InstSize = LLVMDisasmInstruction(DCR, BytesP, NumBytes, PC, OutString,
+                                   OutStringSize);
+  EXPECT_EQ(InstSize, 2U);
+  EXPECT_EQ(StringRef(OutString), "\tjmp\t0x1");
+
+  LLVMDisasmDispose(DCR);
+}

diff --git a/unittests/MC/Hexagon/CMakeLists.txt b/unittests/MC/Hexagon/CMakeLists.txt
new file mode 100644
index 0000000..6d4ee93
--- /dev/null
+++ b/unittests/MC/Hexagon/CMakeLists.txt

@@ -0,0 +1,14 @@
+set(LLVM_LINK_COMPONENTS
+  HexagonCodeGen
+  HexagonDesc
+  HexagonInfo
+  MC
+  Support
+  )
+
+include_directories (${LLVM_MAIN_SRC_DIR}/lib/Target/Hexagon)
+include_directories (${LLVM_BINARY_DIR}/lib/Target/Hexagon)
+
+add_llvm_unittest(HexagonTests
+  HexagonMCCodeEmitterTest.cpp
+  )

diff --git a/unittests/MC/Hexagon/HexagonMCCodeEmitterTest.cpp b/unittests/MC/Hexagon/HexagonMCCodeEmitterTest.cpp
new file mode 100644
index 0000000..958a21f
--- /dev/null
+++ b/unittests/MC/Hexagon/HexagonMCCodeEmitterTest.cpp

@@ -0,0 +1,53 @@
+#include "gtest/gtest.h"
+
+#include <memory>
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+
+namespace {
+class TestEmitter {
+public:
+  TestEmitter() : Triple("hexagon-unknown-elf") {
+    LLVMInitializeHexagonTargetInfo();
+    LLVMInitializeHexagonTarget();
+    LLVMInitializeHexagonTargetMC();
+    std::string error;
+    Target = llvm::TargetRegistry::lookupTarget("hexagon", error);
+    assert(Target != nullptr && "Expected to find target");
+    assert(error.empty() && "Error should be empty if we have a target");
+    RegisterInfo = Target->createMCRegInfo(Triple);
+    assert(RegisterInfo != nullptr && "Expecting to find register info");
+    AsmInfo = Target->createMCAsmInfo(*RegisterInfo, Triple);
+    assert(AsmInfo != nullptr && "Expecting to find asm info");
+    Context = new llvm::MCContext(AsmInfo, RegisterInfo, nullptr);
+    assert(Context != nullptr && "Expecting to create a context");
+    Subtarget = Target->createMCSubtargetInfo(Triple, "hexagonv4", "");
+    assert(Subtarget != nullptr && "Expecting to find a subtarget");
+    InstrInfo = Target->createMCInstrInfo();
+    assert(InstrInfo != nullptr && "Expecting to find instr info");
+    Emitter = Target->createMCCodeEmitter(*InstrInfo, *RegisterInfo, *Subtarget,
+                                          *Context);
+    assert(Emitter != nullptr);
+  }
+  std::string Triple;
+  llvm::Target const *Target;
+  llvm::MCRegisterInfo *RegisterInfo;
+  llvm::MCAsmInfo *AsmInfo;
+  llvm::MCContext *Context;
+  llvm::MCSubtargetInfo *Subtarget;
+  llvm::MCInstrInfo *InstrInfo;
+  llvm::MCCodeEmitter *Emitter;
+};
+TestEmitter Emitter;
+}
+
+TEST(HexagonMCCodeEmitter, emitter_creation) {
+  ASSERT_NE(nullptr, Emitter.Emitter);
+}

diff --git a/unittests/MC/MCAtomTest.cpp b/unittests/MC/MCAtomTest.cpp
deleted file mode 100644
index 16228b5..0000000
--- a/unittests/MC/MCAtomTest.cpp
+++ /dev/null

@@ -1,31 +0,0 @@
-//===- llvm/unittest/MC/MCAtomTest.cpp - Instructions unit tests ----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCAnalysis/MCAtom.h"
-#include "llvm/MC/MCAnalysis/MCModule.h"
-#include "gtest/gtest.h"
-
-namespace llvm {
-namespace {
-
-TEST(MCAtomTest, MCDataSize) {
-  MCModule M;
-  MCDataAtom *Atom = M.createDataAtom(0, 0);
-  EXPECT_EQ(uint64_t(0), Atom->getEndAddr());
-  Atom->addData(0);
-  EXPECT_EQ(uint64_t(0), Atom->getEndAddr());
-  Atom->addData(1);
-  EXPECT_EQ(uint64_t(1), Atom->getEndAddr());
-  Atom->addData(2);
-  EXPECT_EQ(uint64_t(2), Atom->getEndAddr());
-  EXPECT_EQ(size_t(3), Atom->getData().size());
-}
-
-}  // end anonymous namespace
-}  // end namespace llvm

diff --git a/unittests/MC/Makefile b/unittests/MC/Makefile
index 07a608e..3f8d1ef 100644
--- a/unittests/MC/Makefile
+++ b/unittests/MC/Makefile

@@ -9,7 +9,7 @@
 
 LEVEL = ../..
 TESTNAME = MC
-LINK_COMPONENTS := MCAnalysis
+LINK_COMPONENTS := all-targets MCDisassembler Object
 
 include $(LEVEL)/Makefile.config
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest

diff --git a/unittests/MC/StringTableBuilderTest.cpp b/unittests/MC/StringTableBuilderTest.cpp
index d30dc62..6b5185c 100644
--- a/unittests/MC/StringTableBuilderTest.cpp
+++ b/unittests/MC/StringTableBuilderTest.cpp

@@ -9,20 +9,21 @@
 
 #include "llvm/MC/StringTableBuilder.h"
 #include "gtest/gtest.h"
+#include "llvm/Support/Endian.h"
 #include <string>
 
 using namespace llvm;
 
 namespace {
 
-TEST(StringTableBuilderTest, Basic) {
+TEST(StringTableBuilderTest, BasicELF) {
   StringTableBuilder B;
 
   B.add("foo");
   B.add("bar");
   B.add("foobar");
 
-  B.finalize();
+  B.finalize(StringTableBuilder::ELF);
 
   std::string Expected;
   Expected += '\x00';
@@ -37,4 +38,34 @@
   EXPECT_EQ(8U, B.getOffset("foo"));
 }
 
+TEST(StringTableBuilderTest, BasicWinCOFF) {
+  StringTableBuilder B;
+
+  // Strings must be 9 chars or longer to go in the table.
+  B.add("hippopotamus");
+  B.add("pygmy hippopotamus");
+  B.add("river horse");
+
+  B.finalize(StringTableBuilder::WinCOFF);
+
+  // size_field + "pygmy hippopotamus\0" + "river horse\0"
+  uint32_t ExpectedSize = 4 + 19 + 12;
+  EXPECT_EQ(ExpectedSize, B.data().size());
+
+  std::string Expected;
+  // The table starts with its own total size as a 4-byte little-endian word.
+  ExpectedSize =
+      support::endian::byte_swap<uint32_t, support::little>(ExpectedSize);
+  Expected.append((const char*)&ExpectedSize, 4);
+  Expected += "pygmy hippopotamus";
+  Expected += '\x00';
+  Expected += "river horse";
+  Expected += '\x00';
+
+  EXPECT_EQ(Expected, B.data());
+  EXPECT_EQ(4U, B.getOffset("pygmy hippopotamus"));
+  EXPECT_EQ(10U, B.getOffset("hippopotamus")); // tail of "pygmy hippopotamus"
+  EXPECT_EQ(23U, B.getOffset("river horse"));
+}
+
 }

diff --git a/unittests/Support/AllocatorTest.cpp b/unittests/Support/AllocatorTest.cpp
index 0fc84c7..7f15776 100644
--- a/unittests/Support/AllocatorTest.cpp
+++ b/unittests/Support/AllocatorTest.cpp

@@ -17,9 +17,9 @@
 
 TEST(AllocatorTest, Basics) {
   BumpPtrAllocator Alloc;
-  int *a = (int*)Alloc.Allocate(sizeof(int), 0);
-  int *b = (int*)Alloc.Allocate(sizeof(int) * 10, 0);
-  int *c = (int*)Alloc.Allocate(sizeof(int), 0);
+  int *a = (int*)Alloc.Allocate(sizeof(int), 1);
+  int *b = (int*)Alloc.Allocate(sizeof(int) * 10, 1);
+  int *c = (int*)Alloc.Allocate(sizeof(int), 1);
   *a = 1;
   b[0] = 2;
   b[9] = 2;
@@ -49,11 +49,11 @@
 // Allocate enough bytes to create three slabs.
 TEST(AllocatorTest, ThreeSlabs) {
   BumpPtrAllocator Alloc;
-  Alloc.Allocate(3000, 0);
+  Alloc.Allocate(3000, 1);
   EXPECT_EQ(1U, Alloc.GetNumSlabs());
-  Alloc.Allocate(3000, 0);
+  Alloc.Allocate(3000, 1);
   EXPECT_EQ(2U, Alloc.GetNumSlabs());
-  Alloc.Allocate(3000, 0);
+  Alloc.Allocate(3000, 1);
   EXPECT_EQ(3U, Alloc.GetNumSlabs());
 }
 
@@ -61,15 +61,15 @@
 // again.
 TEST(AllocatorTest, TestReset) {
   BumpPtrAllocator Alloc;
-  Alloc.Allocate(3000, 0);
+  Alloc.Allocate(3000, 1);
   EXPECT_EQ(1U, Alloc.GetNumSlabs());
-  Alloc.Allocate(3000, 0);
+  Alloc.Allocate(3000, 1);
   EXPECT_EQ(2U, Alloc.GetNumSlabs());
   Alloc.Reset();
   EXPECT_EQ(1U, Alloc.GetNumSlabs());
-  Alloc.Allocate(3000, 0);
+  Alloc.Allocate(3000, 1);
   EXPECT_EQ(1U, Alloc.GetNumSlabs());
-  Alloc.Allocate(3000, 0);
+  Alloc.Allocate(3000, 1);
   EXPECT_EQ(2U, Alloc.GetNumSlabs());
 }
 
@@ -99,11 +99,11 @@
   BumpPtrAllocator Alloc;
 
   // Fill the slab right up until the end pointer.
-  Alloc.Allocate(4096, 0);
+  Alloc.Allocate(4096, 1);
   EXPECT_EQ(1U, Alloc.GetNumSlabs());
 
   // If we don't allocate a new slab, then we will have overflowed.
-  Alloc.Allocate(1, 0);
+  Alloc.Allocate(1, 1);
   EXPECT_EQ(2U, Alloc.GetNumSlabs());
 }
 
@@ -111,7 +111,20 @@
 TEST(AllocatorTest, TestSmallSlabSize) {
   BumpPtrAllocator Alloc;
 
-  Alloc.Allocate(8000, 0);
+  Alloc.Allocate(8000, 1);
+  EXPECT_EQ(1U, Alloc.GetNumSlabs());
+}
+
+// Test requesting alignment that goes past the end of the current slab.
+TEST(AllocatorTest, TestAlignmentPastSlab) {
+  BumpPtrAllocator Alloc;
+  Alloc.Allocate(4095, 1);
+
+  // Aligning the current slab pointer is likely to move it past the end of the
+  // slab, which would confuse any unsigned comparisons with the difference of
+  // the end pointer and the aligned pointer.
+  Alloc.Allocate(1024, 8192);
+
   EXPECT_EQ(2U, Alloc.GetNumSlabs());
 }
 
@@ -130,7 +143,7 @@
     void *MemBase = malloc(Size + Alignment - 1 + sizeof(void*));
 
     // Find the slab start.
-    void *Slab = alignPtr((char *)MemBase + sizeof(void *), Alignment);
+    void *Slab = (void *)alignAddr((char*)MemBase + sizeof(void *), Alignment);
 
     // Hold a pointer to the base so we can free the whole malloced block.
     ((void**)Slab)[-1] = MemBase;
@@ -155,7 +168,7 @@
   BumpPtrAllocatorImpl<MockSlabAllocator> Alloc;
 
   // First allocate a tiny bit to ensure we have to re-align things.
-  (void)Alloc.Allocate(1, 0);
+  (void)Alloc.Allocate(1, 1);
 
   // Now the big chunk with a big alignment.
   (void)Alloc.Allocate(3000, 2048);

diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
index 97c5c43..7abdd8a 100644
--- a/unittests/Support/CMakeLists.txt
+++ b/unittests/Support/CMakeLists.txt

@@ -42,3 +42,8 @@
   formatted_raw_ostream_test.cpp
   raw_ostream_test.cpp
   )
+
+# ManagedStatic.cpp uses <pthread>.
+if(LLVM_ENABLE_THREADS AND HAVE_LIBPTHREAD)
+  target_link_libraries(SupportTests pthread)
+endif()

diff --git a/unittests/Support/CommandLineTest.cpp b/unittests/Support/CommandLineTest.cpp
index b2d71ab..ac8d3d8 100644
--- a/unittests/Support/CommandLineTest.cpp
+++ b/unittests/Support/CommandLineTest.cpp

@@ -153,14 +153,14 @@
 };
 
 typedef void ParserFunction(StringRef Source, llvm::cl::StringSaver &Saver,
-                            SmallVectorImpl<const char *> &NewArgv);
-
+                            SmallVectorImpl<const char *> &NewArgv,
+                            bool MarkEOLs);
 
 void testCommandLineTokenizer(ParserFunction *parse, const char *Input,
                               const char *const Output[], size_t OutputSize) {
   SmallVector<const char *, 0> Actual;
   StrDupSaver Saver;
-  parse(Input, Saver, Actual);
+  parse(Input, Saver, Actual, /*MarkEOLs=*/false);
   EXPECT_EQ(OutputSize, Actual.size());
   for (unsigned I = 0, E = Actual.size(); I != E; ++I) {
     if (I < OutputSize)
@@ -212,4 +212,23 @@
   }
 }
 
+void testAliasRequired(int argc, const char *const *argv) {
+  StackOption<std::string> Option("option", cl::Required);
+  cl::alias Alias("o", llvm::cl::aliasopt(Option));
+
+  cl::ParseCommandLineOptions(argc, argv);
+  EXPECT_EQ("x", Option);
+  EXPECT_EQ(1, Option.getNumOccurrences());
+
+  Alias.removeArgument();
+}
+
+TEST(CommandLineTest, AliasRequired) {
+  const char *opts1[] = { "-tool", "-option=x" };
+  const char *opts2[] = { "-tool", "-o", "x" };
+  testAliasRequired(array_lengthof(opts1), opts1);
+  testAliasRequired(array_lengthof(opts2), opts2);
+}
+
+
 }  // anonymous namespace

diff --git a/unittests/Support/ConvertUTFTest.cpp b/unittests/Support/ConvertUTFTest.cpp
index 16c9beb..510b1da 100644
--- a/unittests/Support/ConvertUTFTest.cpp
+++ b/unittests/Support/ConvertUTFTest.cpp

@@ -39,30 +39,30 @@
 
 TEST(ConvertUTFTest, OddLengthInput) {
   std::string Result;
-  bool Success = convertUTF16ToUTF8String(ArrayRef<char>("xxxxx", 5), Result);
+  bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);
   EXPECT_FALSE(Success);
 }
 
 TEST(ConvertUTFTest, Empty) {
   std::string Result;
-  bool Success = convertUTF16ToUTF8String(ArrayRef<char>(), Result);
+  bool Success = convertUTF16ToUTF8String(None, Result);
   EXPECT_TRUE(Success);
   EXPECT_TRUE(Result.empty());
 }
 
 TEST(ConvertUTFTest, HasUTF16BOM) {
-  bool HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xff\xfe", 2));
+  bool HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xff\xfe", 2));
   EXPECT_TRUE(HasBOM);
-  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff", 2));
+  HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff", 2));
   EXPECT_TRUE(HasBOM);
-  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff ", 3));
+  HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff ", 3));
   EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
-  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff\x00asdf", 6));
+  HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe\xff\x00asdf", 6));
   EXPECT_TRUE(HasBOM);
 
-  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>());
+  HasBOM = hasUTF16ByteOrderMark(None);
   EXPECT_FALSE(HasBOM);
-  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe", 1));
+  HasBOM = hasUTF16ByteOrderMark(makeArrayRef("\xfe", 1));
   EXPECT_FALSE(HasBOM);
 }
 

diff --git a/unittests/Support/ErrorOrTest.cpp b/unittests/Support/ErrorOrTest.cpp
index d76e7d6..82bbe09 100644
--- a/unittests/Support/ErrorOrTest.cpp
+++ b/unittests/Support/ErrorOrTest.cpp

@@ -60,5 +60,36 @@
 
   ErrorOr<std::unique_ptr<B> > b1(ErrorOr<std::unique_ptr<D> >(nullptr));
   b1 = ErrorOr<std::unique_ptr<D> >(nullptr);
+
+  ErrorOr<std::unique_ptr<int>> b2(ErrorOr<int *>(nullptr));
+  ErrorOr<int *> b3(nullptr);
+  ErrorOr<std::unique_ptr<int>> b4(b3);
 }
+
+// ErrorOr<int*> x(nullptr);
+// ErrorOr<std::unique_ptr<int>> y = x; // invalid conversion
+static_assert(
+    !std::is_convertible<const ErrorOr<int *> &,
+                         ErrorOr<std::unique_ptr<int>>>::value,
+    "do not invoke explicit ctors in implicit conversion from lvalue");
+
+// ErrorOr<std::unique_ptr<int>> y = ErrorOr<int*>(nullptr); // invalid
+//                                                           // conversion
+static_assert(
+    !std::is_convertible<ErrorOr<int *> &&,
+                         ErrorOr<std::unique_ptr<int>>>::value,
+    "do not invoke explicit ctors in implicit conversion from rvalue");
+
+// ErrorOr<int*> x(nullptr);
+// ErrorOr<std::unique_ptr<int>> y;
+// y = x; // invalid conversion
+static_assert(!std::is_assignable<ErrorOr<std::unique_ptr<int>>,
+                                  const ErrorOr<int *> &>::value,
+              "do not invoke explicit ctors in assignment");
+
+// ErrorOr<std::unique_ptr<int>> x;
+// x = ErrorOr<int*>(nullptr); // invalid conversion
+static_assert(!std::is_assignable<ErrorOr<std::unique_ptr<int>>,
+                                  ErrorOr<int *> &&>::value,
+              "do not invoke explicit ctors in assignment");
 } // end anon namespace

diff --git a/unittests/Support/FileOutputBufferTest.cpp b/unittests/Support/FileOutputBufferTest.cpp
index b086f1e..911d516 100644
--- a/unittests/Support/FileOutputBufferTest.cpp
+++ b/unittests/Support/FileOutputBufferTest.cpp

@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/Errc.h"
 #include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
@@ -65,9 +66,8 @@
     // Do *not* commit buffer.
   }
   // Verify file does not exist (because buffer not committed).
-  bool Exists = false;
-  ASSERT_NO_ERROR(fs::exists(Twine(File2), Exists));
-  EXPECT_FALSE(Exists);
+  ASSERT_EQ(fs::access(Twine(File2), fs::AccessMode::Exist),
+            errc::no_such_file_or_directory);
   ASSERT_NO_ERROR(fs::remove(File2.str()));
 
   // TEST 3: Verify sizing down case.

diff --git a/unittests/Support/LEB128Test.cpp b/unittests/Support/LEB128Test.cpp
index b1ca13e..76b63e5 100644
--- a/unittests/Support/LEB128Test.cpp
+++ b/unittests/Support/LEB128Test.cpp

@@ -106,6 +106,7 @@
   EXPECT_DECODE_ULEB128_EQ(0xffu, "\xff\x01");
   EXPECT_DECODE_ULEB128_EQ(0x100u, "\x80\x02");
   EXPECT_DECODE_ULEB128_EQ(0x101u, "\x81\x02");
+  EXPECT_DECODE_ULEB128_EQ(4294975616ULL, "\x80\xc1\x80\x80\x10");
 
   // Decode ULEB128 with extra padding bytes
   EXPECT_DECODE_ULEB128_EQ(0u, "\x80\x00");
@@ -118,6 +119,42 @@
 #undef EXPECT_DECODE_ULEB128_EQ
 }
 
+TEST(LEB128Test, DecodeSLEB128) {
+#define EXPECT_DECODE_SLEB128_EQ(EXPECTED, VALUE) \
+  do { \
+    unsigned ActualSize = 0; \
+    int64_t Actual = decodeSLEB128(reinterpret_cast<const uint8_t *>(VALUE), \
+                                    &ActualSize); \
+    EXPECT_EQ(sizeof(VALUE) - 1, ActualSize); \
+    EXPECT_EQ(EXPECTED, Actual); \
+  } while (0)
+
+  // Decode SLEB128
+  EXPECT_DECODE_SLEB128_EQ(0L, "\x00");
+  EXPECT_DECODE_SLEB128_EQ(1L, "\x01");
+  EXPECT_DECODE_SLEB128_EQ(63L, "\x3f");
+  EXPECT_DECODE_SLEB128_EQ(-64L, "\x40");
+  EXPECT_DECODE_SLEB128_EQ(-63L, "\x41");
+  EXPECT_DECODE_SLEB128_EQ(-1L, "\x7f");
+  EXPECT_DECODE_SLEB128_EQ(128L, "\x80\x01");
+  EXPECT_DECODE_SLEB128_EQ(129L, "\x81\x01");
+  EXPECT_DECODE_SLEB128_EQ(-129L, "\xff\x7e");
+  EXPECT_DECODE_SLEB128_EQ(-128L, "\x80\x7f");
+  EXPECT_DECODE_SLEB128_EQ(-127L, "\x81\x7f");
+  EXPECT_DECODE_SLEB128_EQ(64L, "\xc0\x00");
+  EXPECT_DECODE_SLEB128_EQ(-12345L, "\xc7\x9f\x7f");
+
+  // Decode unnormalized SLEB128 with extra padding bytes.
+  EXPECT_DECODE_SLEB128_EQ(0L, "\x80\x00");
+  EXPECT_DECODE_SLEB128_EQ(0L, "\x80\x80\x00");
+  EXPECT_DECODE_SLEB128_EQ(0x7fL, "\xff\x00");
+  EXPECT_DECODE_SLEB128_EQ(0x7fL, "\xff\x80\x00");
+  EXPECT_DECODE_SLEB128_EQ(0x80L, "\x80\x81\x00");
+  EXPECT_DECODE_SLEB128_EQ(0x80L, "\x80\x81\x80\x00");
+
+#undef EXPECT_DECODE_SLEB128_EQ
+}
+
 TEST(LEB128Test, SLEB128Size) {
   // Positive Value Testing Plan:
   // (1) 128 ^ n - 1 ........ need (n+1) bytes

diff --git a/unittests/Support/LineIteratorTest.cpp b/unittests/Support/LineIteratorTest.cpp
index 18f3fa9..67f9d97 100644
--- a/unittests/Support/LineIteratorTest.cpp
+++ b/unittests/Support/LineIteratorTest.cpp

@@ -17,9 +17,9 @@
 namespace {
 
 TEST(LineIteratorTest, Basic) {
-  std::unique_ptr<MemoryBuffer> Buffer(MemoryBuffer::getMemBuffer("line 1\n"
-                                                                  "line 2\n"
-                                                                  "line 3"));
+  std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer("line 1\n"
+                                                                    "line 2\n"
+                                                                    "line 3");
 
   line_iterator I = line_iterator(*Buffer), E;
 
@@ -40,15 +40,17 @@
   EXPECT_EQ(E, I);
 }
 
-TEST(LineIteratorTest, CommentSkipping) {
+TEST(LineIteratorTest, CommentAndBlankSkipping) {
   std::unique_ptr<MemoryBuffer> Buffer(
       MemoryBuffer::getMemBuffer("line 1\n"
                                  "line 2\n"
                                  "# Comment 1\n"
-                                 "line 4\n"
+                                 "\n"
+                                 "line 5\n"
+                                 "\n"
                                  "# Comment 2"));
 
-  line_iterator I = line_iterator(*Buffer, '#'), E;
+  line_iterator I = line_iterator(*Buffer, true, '#'), E;
 
   EXPECT_FALSE(I.is_at_eof());
   EXPECT_NE(E, I);
@@ -59,20 +61,57 @@
   EXPECT_EQ("line 2", *I);
   EXPECT_EQ(2, I.line_number());
   ++I;
-  EXPECT_EQ("line 4", *I);
-  EXPECT_EQ(4, I.line_number());
+  EXPECT_EQ("line 5", *I);
+  EXPECT_EQ(5, I.line_number());
   ++I;
 
   EXPECT_TRUE(I.is_at_eof());
   EXPECT_EQ(E, I);
 }
 
+TEST(LineIteratorTest, CommentSkippingKeepBlanks) {
+  std::unique_ptr<MemoryBuffer> Buffer(
+      MemoryBuffer::getMemBuffer("line 1\n"
+                                 "line 2\n"
+                                 "# Comment 1\n"
+                                 "# Comment 2\n"
+                                 "\n"
+                                 "line 6\n"
+                                 "\n"
+                                 "# Comment 3"));
+
+  line_iterator I = line_iterator(*Buffer, false, '#'), E;
+
+  EXPECT_FALSE(I.is_at_eof());
+  EXPECT_NE(E, I);
+
+  EXPECT_EQ("line 1", *I);
+  EXPECT_EQ(1, I.line_number());
+  ++I;
+  EXPECT_EQ("line 2", *I);
+  EXPECT_EQ(2, I.line_number());
+  ++I;
+  EXPECT_EQ("", *I);
+  EXPECT_EQ(5, I.line_number());
+  ++I;
+  EXPECT_EQ("line 6", *I);
+  EXPECT_EQ(6, I.line_number());
+  ++I;
+  EXPECT_EQ("", *I);
+  EXPECT_EQ(7, I.line_number());
+  ++I;
+
+  EXPECT_TRUE(I.is_at_eof());
+  EXPECT_EQ(E, I);
+}
+
+
 TEST(LineIteratorTest, BlankSkipping) {
-  std::unique_ptr<MemoryBuffer> Buffer(MemoryBuffer::getMemBuffer("\n\n\n"
-                                                                  "line 1\n"
-                                                                  "\n\n\n"
-                                                                  "line 2\n"
-                                                                  "\n\n\n"));
+  std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer("\n\n\n"
+                                                                    "line 1\n"
+                                                                    "\n\n\n"
+                                                                    "line 2\n"
+                                                                    "\n\n\n");
 
   line_iterator I = line_iterator(*Buffer), E;
 
@@ -90,26 +129,65 @@
   EXPECT_EQ(E, I);
 }
 
+TEST(LineIteratorTest, BlankKeeping) {
+  std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer("\n\n"
+                                                                    "line 3\n"
+                                                                    "\n"
+                                                                    "line 5\n"
+                                                                    "\n\n");
+  line_iterator I = line_iterator(*Buffer, false), E;
+
+  EXPECT_FALSE(I.is_at_eof());
+  EXPECT_NE(E, I);
+
+  EXPECT_EQ("", *I);
+  EXPECT_EQ(1, I.line_number());
+  ++I;
+  EXPECT_EQ("", *I);
+  EXPECT_EQ(2, I.line_number());
+  ++I;
+  EXPECT_EQ("line 3", *I);
+  EXPECT_EQ(3, I.line_number());
+  ++I;
+  EXPECT_EQ("", *I);
+  EXPECT_EQ(4, I.line_number());
+  ++I;
+  EXPECT_EQ("line 5", *I);
+  EXPECT_EQ(5, I.line_number());
+  ++I;
+  EXPECT_EQ("", *I);
+  EXPECT_EQ(6, I.line_number());
+  ++I;
+  EXPECT_EQ("", *I);
+  EXPECT_EQ(7, I.line_number());
+  ++I;
+
+  EXPECT_TRUE(I.is_at_eof());
+  EXPECT_EQ(E, I);
+}
+
 TEST(LineIteratorTest, EmptyBuffers) {
-  std::unique_ptr<MemoryBuffer> Buffer(MemoryBuffer::getMemBuffer(""));
+  std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer("");
+  EXPECT_TRUE(line_iterator(*Buffer).is_at_eof());
+  EXPECT_EQ(line_iterator(), line_iterator(*Buffer));
+  EXPECT_TRUE(line_iterator(*Buffer, false).is_at_eof());
+  EXPECT_EQ(line_iterator(), line_iterator(*Buffer, false));
+
+  Buffer = MemoryBuffer::getMemBuffer("\n\n\n");
   EXPECT_TRUE(line_iterator(*Buffer).is_at_eof());
   EXPECT_EQ(line_iterator(), line_iterator(*Buffer));
 
-  Buffer.reset(MemoryBuffer::getMemBuffer("\n\n\n"));
-  EXPECT_TRUE(line_iterator(*Buffer).is_at_eof());
-  EXPECT_EQ(line_iterator(), line_iterator(*Buffer));
+  Buffer = MemoryBuffer::getMemBuffer("# foo\n"
+                                      "\n"
+                                      "# bar");
+  EXPECT_TRUE(line_iterator(*Buffer, true, '#').is_at_eof());
+  EXPECT_EQ(line_iterator(), line_iterator(*Buffer, true, '#'));
 
-  Buffer.reset(MemoryBuffer::getMemBuffer("# foo\n"
-                                          "\n"
-                                          "# bar"));
-  EXPECT_TRUE(line_iterator(*Buffer, '#').is_at_eof());
-  EXPECT_EQ(line_iterator(), line_iterator(*Buffer, '#'));
-
-  Buffer.reset(MemoryBuffer::getMemBuffer("\n"
-                                          "# baz\n"
-                                          "\n"));
-  EXPECT_TRUE(line_iterator(*Buffer, '#').is_at_eof());
-  EXPECT_EQ(line_iterator(), line_iterator(*Buffer, '#'));
+  Buffer = MemoryBuffer::getMemBuffer("\n"
+                                      "# baz\n"
+                                      "\n");
+  EXPECT_TRUE(line_iterator(*Buffer, true, '#').is_at_eof());
+  EXPECT_EQ(line_iterator(), line_iterator(*Buffer, true, '#'));
 }
 
 } // anonymous namespace

diff --git a/unittests/Support/LockFileManagerTest.cpp b/unittests/Support/LockFileManagerTest.cpp
index 885b7d6..efe3c30 100644
--- a/unittests/Support/LockFileManagerTest.cpp
+++ b/unittests/Support/LockFileManagerTest.cpp

@@ -95,7 +95,7 @@
 
   char PathBuf[1024];
   const char *OrigPath = getcwd(PathBuf, 1024);
-  chdir(TmpDir.c_str());
+  ASSERT_FALSE(chdir(TmpDir.c_str()));
 
   sys::fs::create_directory("inner");
   SmallString<64> LockedFile("inner");
@@ -118,7 +118,7 @@
   EC = sys::fs::remove("inner");
   ASSERT_FALSE(EC);
 
-  chdir(OrigPath);
+  ASSERT_FALSE(chdir(OrigPath));
 
   EC = sys::fs::remove(StringRef(TmpDir));
   ASSERT_FALSE(EC);

diff --git a/unittests/Support/MD5Test.cpp b/unittests/Support/MD5Test.cpp
index 7c1331b..c4fa5cd 100644
--- a/unittests/Support/MD5Test.cpp
+++ b/unittests/Support/MD5Test.cpp

@@ -41,19 +41,19 @@
 }
 
 TEST(MD5Test, MD5) {
-  TestMD5Sum(ArrayRef<uint8_t>((const uint8_t *)"", (size_t) 0),
+  TestMD5Sum(makeArrayRef((const uint8_t *)"", (size_t) 0),
              "d41d8cd98f00b204e9800998ecf8427e");
-  TestMD5Sum(ArrayRef<uint8_t>((const uint8_t *)"a", (size_t) 1),
+  TestMD5Sum(makeArrayRef((const uint8_t *)"a", (size_t) 1),
              "0cc175b9c0f1b6a831c399e269772661");
-  TestMD5Sum(ArrayRef<uint8_t>((const uint8_t *)"abcdefghijklmnopqrstuvwxyz",
-                               (size_t) 26),
+  TestMD5Sum(makeArrayRef((const uint8_t *)"abcdefghijklmnopqrstuvwxyz",
+                          (size_t) 26),
              "c3fcd3d76192e4007dfb496cca67e13b");
-  TestMD5Sum(ArrayRef<uint8_t>((const uint8_t *)"\0", (size_t) 1),
+  TestMD5Sum(makeArrayRef((const uint8_t *)"\0", (size_t) 1),
              "93b885adfe0da089cdf634904fd59f71");
-  TestMD5Sum(ArrayRef<uint8_t>((const uint8_t *)"a\0", (size_t) 2),
+  TestMD5Sum(makeArrayRef((const uint8_t *)"a\0", (size_t) 2),
              "4144e195f46de78a3623da7364d04f11");
-  TestMD5Sum(ArrayRef<uint8_t>((const uint8_t *)"abcdefghijklmnopqrstuvwxyz\0",
-                               (size_t) 27),
+  TestMD5Sum(makeArrayRef((const uint8_t *)"abcdefghijklmnopqrstuvwxyz\0",
+                          (size_t) 27),
              "81948d1f1554f58cd1a56ebb01f808cb");
   TestMD5Sum("abcdefghijklmnopqrstuvwxyz", "c3fcd3d76192e4007dfb496cca67e13b");
 }

diff --git a/unittests/Support/MemoryBufferTest.cpp b/unittests/Support/MemoryBufferTest.cpp
index 93bf301..1cdd6ad 100644
--- a/unittests/Support/MemoryBufferTest.cpp
+++ b/unittests/Support/MemoryBufferTest.cpp

@@ -169,4 +169,54 @@
   testGetOpenFileSlice(true);
 }
 
+
+TEST_F(MemoryBufferTest, slice) {
+  // Create a file that is six pages long with different data on each page.
+  int FD;
+  SmallString<64> TestPath;
+  sys::fs::createTemporaryFile("MemoryBufferTest_Slice", "temp", FD, TestPath);
+  raw_fd_ostream OF(FD, true, /*unbuffered=*/true);
+  for (unsigned i = 0; i < 0x2000 / 8; ++i) {
+    OF << "12345678";
+  }
+  for (unsigned i = 0; i < 0x2000 / 8; ++i) {
+    OF << "abcdefgh";
+  }
+  for (unsigned i = 0; i < 0x2000 / 8; ++i) {
+    OF << "ABCDEFGH";
+  }
+  OF.close();
+
+  // Try offset of one page.
+  ErrorOr<OwningBuffer> MB = MemoryBuffer::getFileSlice(TestPath.str(),
+                                                        0x4000, 0x1000);
+  std::error_code EC = MB.getError();
+  ASSERT_FALSE(EC);
+  EXPECT_EQ(0x4000UL, MB.get()->getBufferSize());
+ 
+  StringRef BufData = MB.get()->getBuffer();
+  EXPECT_TRUE(BufData.substr(0x0000,8).equals("12345678"));
+  EXPECT_TRUE(BufData.substr(0x0FF8,8).equals("12345678"));
+  EXPECT_TRUE(BufData.substr(0x1000,8).equals("abcdefgh"));
+  EXPECT_TRUE(BufData.substr(0x2FF8,8).equals("abcdefgh"));
+  EXPECT_TRUE(BufData.substr(0x3000,8).equals("ABCDEFGH"));
+  EXPECT_TRUE(BufData.substr(0x3FF8,8).equals("ABCDEFGH"));
+   
+  // Try non-page aligned.
+  ErrorOr<OwningBuffer> MB2 = MemoryBuffer::getFileSlice(TestPath.str(),
+                                                         0x3000, 0x0800);
+  EC = MB2.getError();
+  ASSERT_FALSE(EC);
+  EXPECT_EQ(0x3000UL, MB2.get()->getBufferSize());
+  
+  StringRef BufData2 = MB2.get()->getBuffer();
+  EXPECT_TRUE(BufData2.substr(0x0000,8).equals("12345678"));
+  EXPECT_TRUE(BufData2.substr(0x17F8,8).equals("12345678"));
+  EXPECT_TRUE(BufData2.substr(0x1800,8).equals("abcdefgh"));
+  EXPECT_TRUE(BufData2.substr(0x2FF8,8).equals("abcdefgh"));
+ 
+}
+
+
+
 }

diff --git a/unittests/Support/Path.cpp b/unittests/Support/Path.cpp
index cf2e1ee..502cda2 100644
--- a/unittests/Support/Path.cpp
+++ b/unittests/Support/Path.cpp

@@ -16,6 +16,7 @@
 #include "gtest/gtest.h"
 
 #ifdef LLVM_ON_WIN32
+#include <Windows.h>
 #include <winerror.h>
 #endif
 
@@ -91,6 +92,7 @@
   paths.push_back("c:\\foo/");
   paths.push_back("c:/foo\\bar");
 
+  SmallVector<StringRef, 5> ComponentStack;
   for (SmallVector<StringRef, 40>::const_iterator i = paths.begin(),
                                                   e = paths.end();
                                                   i != e;
@@ -100,18 +102,17 @@
                                    ci != ce;
                                    ++ci) {
       ASSERT_FALSE(ci->empty());
+      ComponentStack.push_back(*ci);
     }
 
-#if 0 // Valgrind is whining about this.
-    outs() << "    Reverse Iteration: [";
     for (sys::path::reverse_iterator ci = sys::path::rbegin(*i),
                                      ce = sys::path::rend(*i);
                                      ci != ce;
                                      ++ci) {
-      outs() << *ci << ',';
+      ASSERT_TRUE(*ci == ComponentStack.back());
+      ComponentStack.pop_back();
     }
-    outs() << "]\n";
-#endif
+    ASSERT_TRUE(ComponentStack.empty());
 
     path::has_root_path(*i);
     path::root_path(*i);
@@ -141,7 +142,7 @@
     StringRef filename(temp_store.begin(), temp_store.size()), stem, ext;
     stem = path::stem(filename);
     ext  = path::extension(filename);
-    EXPECT_EQ(*(--sys::path::end(filename)), (stem + ext).str());
+    EXPECT_EQ(*sys::path::rbegin(filename), (stem + ext).str());
 
     path::native(*i, temp_store);
   }
@@ -227,7 +228,7 @@
 #endif
 
   for (StringRef Path : Paths) {
-    StringRef LastComponent = *--path::end(Path);
+    StringRef LastComponent = *path::rbegin(Path);
     EXPECT_EQ(".", LastComponent);
   }
 
@@ -239,7 +240,7 @@
 #endif
 
   for (StringRef Path : RootPaths) {
-    StringRef LastComponent = *--path::end(Path);
+    StringRef LastComponent = *path::rbegin(Path);
     EXPECT_EQ(1u, LastComponent.size());
     EXPECT_TRUE(path::is_separator(LastComponent[0]));
   }
@@ -261,7 +262,7 @@
 class FileSystemTest : public testing::Test {
 protected:
   /// Unique temporary directory in which all created filesystem entities must
-  /// be placed. It is recursively removed at the end of each test.
+  /// be placed. It is removed at the end of each test (must be empty).
   SmallString<128> TestDirectory;
 
   virtual void SetUp() {
@@ -334,9 +335,7 @@
       fs::createTemporaryFile("prefix", "temp", FileDescriptor, TempPath));
 
   // Make sure it exists.
-  bool TempFileExists;
-  ASSERT_NO_ERROR(sys::fs::exists(Twine(TempPath), TempFileExists));
-  EXPECT_TRUE(TempFileExists);
+  ASSERT_TRUE(sys::fs::exists(Twine(TempPath)));
 
   // Create another temp tile.
   int FD2;
@@ -363,8 +362,8 @@
   EXPECT_EQ(B.type(), fs::file_type::file_not_found);
 
   // Make sure Temp2 doesn't exist.
-  ASSERT_NO_ERROR(fs::exists(Twine(TempPath2), TempFileExists));
-  EXPECT_FALSE(TempFileExists);
+  ASSERT_EQ(fs::access(Twine(TempPath2), sys::fs::AccessMode::Exist),
+            errc::no_such_file_or_directory);
 
   SmallString<64> TempPath3;
   ASSERT_NO_ERROR(fs::createTemporaryFile("prefix", "", TempPath3));
@@ -387,8 +386,8 @@
   ASSERT_NO_ERROR(fs::remove(Twine(TempPath2)));
 
   // Make sure Temp1 doesn't exist.
-  ASSERT_NO_ERROR(fs::exists(Twine(TempPath), TempFileExists));
-  EXPECT_FALSE(TempFileExists);
+  ASSERT_EQ(fs::access(Twine(TempPath), sys::fs::AccessMode::Exist),
+            errc::no_such_file_or_directory);
 
 #ifdef LLVM_ON_WIN32
   // Path name > 260 chars should get an error.
@@ -398,8 +397,16 @@
     "abcdefghijklmnopqrstuvwxyz5abcdefghijklmnopqrstuvwxyz4"
     "abcdefghijklmnopqrstuvwxyz3abcdefghijklmnopqrstuvwxyz2"
     "abcdefghijklmnopqrstuvwxyz1abcdefghijklmnopqrstuvwxyz0";
-  EXPECT_EQ(fs::createUniqueFile(Twine(Path270), FileDescriptor, TempPath),
-            errc::no_such_file_or_directory);
+  EXPECT_EQ(fs::createUniqueFile(Path270, FileDescriptor, TempPath),
+            errc::invalid_argument);
+  // Relative path < 247 chars, no problem.
+  const char *Path216 =
+    "abcdefghijklmnopqrstuvwxyz7abcdefghijklmnopqrstuvwxyz6"
+    "abcdefghijklmnopqrstuvwxyz5abcdefghijklmnopqrstuvwxyz4"
+    "abcdefghijklmnopqrstuvwxyz3abcdefghijklmnopqrstuvwxyz2"
+    "abcdefghijklmnopqrstuvwxyz1abcdefghijklmnopqrstuvwxyz0";
+  ASSERT_NO_ERROR(fs::createTemporaryFile(Path216, "", TempPath));
+  ASSERT_NO_ERROR(fs::remove(Twine(TempPath)));
 #endif
 }
 
@@ -409,6 +416,54 @@
   ASSERT_EQ(fs::create_directory(Twine(TestDirectory) + "foo", false),
             errc::file_exists);
   ASSERT_NO_ERROR(fs::remove(Twine(TestDirectory) + "foo"));
+
+#ifdef LLVM_ON_WIN32
+  // Prove that create_directories() can handle a pathname > 248 characters,
+  // which is the documented limit for CreateDirectory().
+  // (248 is MAX_PATH subtracting room for an 8.3 filename.)
+  // Generate a directory path guaranteed to fall into that range.
+  size_t TmpLen = TestDirectory.size();
+  const char *OneDir = "\\123456789";
+  size_t OneDirLen = strlen(OneDir);
+  ASSERT_LT(OneDirLen, 12U);
+  size_t NLevels = ((248 - TmpLen) / OneDirLen) + 1;
+  SmallString<260> LongDir(TestDirectory);
+  for (size_t I = 0; I < NLevels; ++I)
+    LongDir.append(OneDir);
+  ASSERT_NO_ERROR(fs::create_directories(Twine(LongDir)));
+  ASSERT_NO_ERROR(fs::create_directories(Twine(LongDir)));
+  ASSERT_EQ(fs::create_directories(Twine(LongDir), false),
+            errc::file_exists);
+  // Tidy up, "recursively" removing the directories.
+  StringRef ThisDir(LongDir);
+  for (size_t J = 0; J < NLevels; ++J) {
+    ASSERT_NO_ERROR(fs::remove(ThisDir));
+    ThisDir = path::parent_path(ThisDir);
+  }
+
+  // Similarly for a relative pathname.  Need to set the current directory to
+  // TestDirectory so that the one we create ends up in the right place.
+  char PreviousDir[260];
+  size_t PreviousDirLen = ::GetCurrentDirectoryA(260, PreviousDir);
+  ASSERT_GT(PreviousDirLen, 0U);
+  ASSERT_LT(PreviousDirLen, 260U);
+  ASSERT_NE(::SetCurrentDirectoryA(TestDirectory.c_str()), 0);
+  LongDir.clear();
+  // Generate a relative directory name with absolute length > 248.
+  size_t LongDirLen = 249 - TestDirectory.size();
+  LongDir.assign(LongDirLen, 'a');
+  ASSERT_NO_ERROR(fs::create_directory(Twine(LongDir)));
+  // While we're here, prove that .. and . handling works in these long paths.
+  const char *DotDotDirs = "\\..\\.\\b";
+  LongDir.append(DotDotDirs);
+  ASSERT_NO_ERROR(fs::create_directory("b"));
+  ASSERT_EQ(fs::create_directory(Twine(LongDir), false), errc::file_exists);
+  // And clean up.
+  ASSERT_NO_ERROR(fs::remove("b"));
+  ASSERT_NO_ERROR(fs::remove(
+    Twine(LongDir.substr(0, LongDir.size() - strlen(DotDotDirs)))));
+  ASSERT_NE(::SetCurrentDirectoryA(PreviousDir), 0);
+#endif
 }
 
 TEST_F(FileSystemTest, DirectoryIteration) {
@@ -485,6 +540,8 @@
 const char archive[] = "!<arch>\x0A";
 const char bitcode[] = "\xde\xc0\x17\x0b";
 const char coff_object[] = "\x00\x00......";
+const char coff_bigobj[] = "\x00\x00\xff\xff\x00\x02......"
+    "\xc7\xa1\xba\xd1\xee\xba\xa9\x4b\xaf\x20\xfa\xf6\x6a\xa4\xdc\xb8";
 const char coff_import_library[] = "\x00\x00\xff\xff....";
 const char elf_relocatable[] = { 0x7f, 'E', 'L', 'F', 1, 2, 1, 0, 0,
                                  0,    0,   0,   0,   0, 0, 0, 0, 1 };
@@ -501,6 +558,8 @@
 const char macho_bundle[] = "\xfe\xed\xfa\xce..........\x00\x08";
 const char macho_dsym_companion[] = "\xfe\xed\xfa\xce..........\x00\x0a";
 const char windows_resource[] = "\x00\x00\x00\x00\x020\x00\x00\x00\xff";
+const char macho_dynamically_linked_shared_lib_stub[] =
+    "\xfe\xed\xfa\xce..........\x00\x09";
 
 TEST_F(FileSystemTest, Magic) {
   struct type {
@@ -514,6 +573,7 @@
     DEFINE(archive),
     DEFINE(bitcode),
     DEFINE(coff_object),
+    { "coff_bigobj", coff_bigobj, sizeof(coff_bigobj), fs::file_magic::coff_object },
     DEFINE(coff_import_library),
     DEFINE(elf_relocatable),
     DEFINE(macho_universal_binary),
@@ -525,6 +585,7 @@
     DEFINE(macho_dynamically_linked_shared_lib),
     DEFINE(macho_dynamic_linker),
     DEFINE(macho_bundle),
+    DEFINE(macho_dynamically_linked_shared_lib_stub),
     DEFINE(macho_dsym_companion),
     DEFINE(windows_resource)
 #undef DEFINE
@@ -535,8 +596,8 @@
                                                                      ++i) {
     SmallString<128> file_pathname(TestDirectory);
     path::append(file_pathname, i->filename);
-    std::string ErrMsg;
-    raw_fd_ostream file(file_pathname.c_str(), ErrMsg, sys::fs::F_None);
+    std::error_code EC;
+    raw_fd_ostream file(file_pathname, EC, sys::fs::F_None);
     ASSERT_FALSE(file.has_error());
     StringRef magic(i->magic_str, i->magic_str_len);
     file << magic;
@@ -549,27 +610,27 @@
 #ifdef LLVM_ON_WIN32
 TEST_F(FileSystemTest, CarriageReturn) {
   SmallString<128> FilePathname(TestDirectory);
-  std::string ErrMsg;
+  std::error_code EC;
   path::append(FilePathname, "test");
 
   {
-    raw_fd_ostream File(FilePathname.c_str(), ErrMsg, sys::fs::F_Text);
-    EXPECT_EQ(ErrMsg, "");
+    raw_fd_ostream File(FilePathname, EC, sys::fs::F_Text);
+    ASSERT_NO_ERROR(EC);
     File << '\n';
   }
   {
-    auto Buf = MemoryBuffer::getFile(FilePathname.c_str());
+    auto Buf = MemoryBuffer::getFile(FilePathname.str());
     EXPECT_TRUE((bool)Buf);
     EXPECT_EQ(Buf.get()->getBuffer(), "\r\n");
   }
 
   {
-    raw_fd_ostream File(FilePathname.c_str(), ErrMsg, sys::fs::F_None);
-    EXPECT_EQ(ErrMsg, "");
+    raw_fd_ostream File(FilePathname, EC, sys::fs::F_None);
+    ASSERT_NO_ERROR(EC);
     File << '\n';
   }
   {
-    auto Buf = MemoryBuffer::getFile(FilePathname.c_str());
+    auto Buf = MemoryBuffer::getFile(FilePathname.str());
     EXPECT_TRUE((bool)Buf);
     EXPECT_EQ(Buf.get()->getBuffer(), "\n");
   }
@@ -640,22 +701,22 @@
   SmallString<64> Path5("\\a");
   SmallString<64> Path6("a\\");
 
-  ASSERT_NO_ERROR(fs::normalize_separators(Path1));
+  path::native(Path1);
   EXPECT_PATH_IS(Path1, "a", "a");
 
-  ASSERT_NO_ERROR(fs::normalize_separators(Path2));
-  EXPECT_PATH_IS(Path2, "a/b", "a/b");
+  path::native(Path2);
+  EXPECT_PATH_IS(Path2, "a\\b", "a/b");
 
-  ASSERT_NO_ERROR(fs::normalize_separators(Path3));
+  path::native(Path3);
   EXPECT_PATH_IS(Path3, "a\\b", "a/b");
 
-  ASSERT_NO_ERROR(fs::normalize_separators(Path4));
+  path::native(Path4);
   EXPECT_PATH_IS(Path4, "a\\\\b", "a\\\\b");
 
-  ASSERT_NO_ERROR(fs::normalize_separators(Path5));
+  path::native(Path5);
   EXPECT_PATH_IS(Path5, "\\a", "/a");
 
-  ASSERT_NO_ERROR(fs::normalize_separators(Path6));
+  path::native(Path6);
   EXPECT_PATH_IS(Path6, "a\\", "a/");
 
 #undef EXPECT_PATH_IS

diff --git a/unittests/Support/ProcessTest.cpp b/unittests/Support/ProcessTest.cpp
index f406072..3045c30 100644
--- a/unittests/Support/ProcessTest.cpp
+++ b/unittests/Support/ProcessTest.cpp

@@ -31,12 +31,12 @@
 
   EXPECT_LT(1u, process::get_self()->page_size());
 
-  EXPECT_LT(TimeValue::MinTime, process::get_self()->get_user_time());
-  EXPECT_GT(TimeValue::MaxTime, process::get_self()->get_user_time());
-  EXPECT_LT(TimeValue::MinTime, process::get_self()->get_system_time());
-  EXPECT_GT(TimeValue::MaxTime, process::get_self()->get_system_time());
-  EXPECT_LT(TimeValue::MinTime, process::get_self()->get_wall_time());
-  EXPECT_GT(TimeValue::MaxTime, process::get_self()->get_wall_time());
+  EXPECT_LT(TimeValue::MinTime(), process::get_self()->get_user_time());
+  EXPECT_GT(TimeValue::MaxTime(), process::get_self()->get_user_time());
+  EXPECT_LT(TimeValue::MinTime(), process::get_self()->get_system_time());
+  EXPECT_GT(TimeValue::MaxTime(), process::get_self()->get_system_time());
+  EXPECT_LT(TimeValue::MinTime(), process::get_self()->get_wall_time());
+  EXPECT_GT(TimeValue::MaxTime(), process::get_self()->get_wall_time());
 }
 
 TEST(ProcessTest, GetRandomNumberTest) {

diff --git a/unittests/Support/ProgramTest.cpp b/unittests/Support/ProgramTest.cpp
index 4e7316f..c0e6e80 100644
--- a/unittests/Support/ProgramTest.cpp
+++ b/unittests/Support/ProgramTest.cpp

@@ -34,6 +34,16 @@
 #error sleep_for is not implemented on your platform.
 #endif
 
+#define ASSERT_NO_ERROR(x)                                                     \
+  if (std::error_code ASSERT_NO_ERROR_ec = x) {                                \
+    SmallString<128> MessageStorage;                                           \
+    raw_svector_ostream Message(MessageStorage);                               \
+    Message << #x ": did not return errc::success.\n"                          \
+            << "error number: " << ASSERT_NO_ERROR_ec.value() << "\n"          \
+            << "error message: " << ASSERT_NO_ERROR_ec.message() << "\n";      \
+    GTEST_FATAL_FAILURE_(MessageStorage.c_str());                              \
+  } else {                                                                     \
+  }
 // From TestMain.cpp.
 extern const char *TestMainArgv0;
 
@@ -220,4 +230,44 @@
 
 }
 
+#ifdef LLVM_ON_WIN32
+const char utf16le_text[] =
+    "\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61\x00";
+const char utf16be_text[] =
+    "\x00\x6c\x00\x69\x00\x6e\x00\x67\x00\xfc\x00\x69\x00\xe7\x00\x61";
+#endif
+const char utf8_text[] = "\x6c\x69\x6e\x67\xc3\xbc\x69\xc3\xa7\x61";
+
+TEST(ProgramTest, TestWriteWithSystemEncoding) {
+  SmallString<128> TestDirectory;
+  ASSERT_NO_ERROR(fs::createUniqueDirectory("program-test", TestDirectory));
+  errs() << "Test Directory: " << TestDirectory << '\n';
+  errs().flush();
+  SmallString<128> file_pathname(TestDirectory);
+  path::append(file_pathname, "international-file.txt");
+  // Only on Windows we should encode in UTF16. For other systems, use UTF8
+  ASSERT_NO_ERROR(sys::writeFileWithEncoding(file_pathname.c_str(), utf8_text,
+                                             sys::WEM_UTF16));
+  int fd = 0;
+  ASSERT_NO_ERROR(fs::openFileForRead(file_pathname.c_str(), fd));
+#if defined(LLVM_ON_WIN32)
+  char buf[18];
+  ASSERT_EQ(::read(fd, buf, 18), 18);
+  if (strncmp(buf, "\xfe\xff", 2) == 0) { // UTF16-BE
+    ASSERT_EQ(strncmp(&buf[2], utf16be_text, 16), 0);
+  } else if (strncmp(buf, "\xff\xfe", 2) == 0) { // UTF16-LE
+    ASSERT_EQ(strncmp(&buf[2], utf16le_text, 16), 0);
+  } else {
+    FAIL() << "Invalid BOM in UTF-16 file";
+  }
+#else
+  char buf[10];
+  ASSERT_EQ(::read(fd, buf, 10), 10);
+  ASSERT_EQ(strncmp(buf, utf8_text, 10), 0);
+#endif
+  ::close(fd);
+  ASSERT_NO_ERROR(fs::remove(file_pathname.str()));
+  ASSERT_NO_ERROR(fs::remove(TestDirectory.str()));
+}
+
 } // end anonymous namespace

diff --git a/unittests/Support/SourceMgrTest.cpp b/unittests/Support/SourceMgrTest.cpp
index 2b69fe9..79c2d72 100644
--- a/unittests/Support/SourceMgrTest.cpp
+++ b/unittests/Support/SourceMgrTest.cpp

@@ -23,8 +23,9 @@
   std::string Output;
 
   void setMainBuffer(StringRef Text, StringRef BufferName) {
-    MemoryBuffer *MainBuffer = MemoryBuffer::getMemBuffer(Text, BufferName);
-    MainBufferID = SM.AddNewSourceBuffer(MainBuffer, llvm::SMLoc());
+    std::unique_ptr<MemoryBuffer> MainBuffer =
+        MemoryBuffer::getMemBuffer(Text, BufferName);
+    MainBufferID = SM.AddNewSourceBuffer(std::move(MainBuffer), llvm::SMLoc());
   }
 
   SMLoc getLoc(unsigned Offset) {

diff --git a/unittests/Support/SpecialCaseListTest.cpp b/unittests/Support/SpecialCaseListTest.cpp
index bb9c351..740dbfe 100644
--- a/unittests/Support/SpecialCaseListTest.cpp
+++ b/unittests/Support/SpecialCaseListTest.cpp

@@ -17,14 +17,15 @@
 
 class SpecialCaseListTest : public ::testing::Test {
 protected:
-  SpecialCaseList *makeSpecialCaseList(StringRef List, std::string &Error) {
-    std::unique_ptr<MemoryBuffer> MB(MemoryBuffer::getMemBuffer(List));
+  std::unique_ptr<SpecialCaseList> makeSpecialCaseList(StringRef List,
+                                                       std::string &Error) {
+    std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(List);
     return SpecialCaseList::create(MB.get(), Error);
   }
 
-  SpecialCaseList *makeSpecialCaseList(StringRef List) {
+  std::unique_ptr<SpecialCaseList> makeSpecialCaseList(StringRef List) {
     std::string Error;
-    SpecialCaseList *SCL = makeSpecialCaseList(List, Error);
+    auto SCL = makeSpecialCaseList(List, Error);
     assert(SCL);
     assert(Error == "");
     return SCL;
@@ -32,13 +33,13 @@
 };
 
 TEST_F(SpecialCaseListTest, Basic) {
-  std::unique_ptr<SpecialCaseList> SCL(
+  std::unique_ptr<SpecialCaseList> SCL =
       makeSpecialCaseList("# This is a comment.\n"
                           "\n"
                           "src:hello\n"
                           "src:bye\n"
                           "src:hi=category\n"
-                          "src:z*=category\n"));
+                          "src:z*=category\n");
   EXPECT_TRUE(SCL->inSection("src", "hello"));
   EXPECT_TRUE(SCL->inSection("src", "bye"));
   EXPECT_TRUE(SCL->inSection("src", "hi", "category"));
@@ -48,39 +49,21 @@
   EXPECT_FALSE(SCL->inSection("src", "hello", "category"));
 }
 
-TEST_F(SpecialCaseListTest, GlobalInitCompat) {
-  std::unique_ptr<SpecialCaseList> SCL(
-      makeSpecialCaseList("global:foo=init\n"));
+TEST_F(SpecialCaseListTest, GlobalInit) {
+  std::unique_ptr<SpecialCaseList> SCL =
+      makeSpecialCaseList("global:foo=init\n");
   EXPECT_FALSE(SCL->inSection("global", "foo"));
   EXPECT_FALSE(SCL->inSection("global", "bar"));
   EXPECT_TRUE(SCL->inSection("global", "foo", "init"));
   EXPECT_FALSE(SCL->inSection("global", "bar", "init"));
 
-  SCL.reset(makeSpecialCaseList("global-init:foo\n"));
-  EXPECT_FALSE(SCL->inSection("global", "foo"));
-  EXPECT_FALSE(SCL->inSection("global", "bar"));
-  EXPECT_TRUE(SCL->inSection("global", "foo", "init"));
-  EXPECT_FALSE(SCL->inSection("global", "bar", "init"));
-
-  SCL.reset(makeSpecialCaseList("type:t2=init\n"));
+  SCL = makeSpecialCaseList("type:t2=init\n");
   EXPECT_FALSE(SCL->inSection("type", "t1"));
   EXPECT_FALSE(SCL->inSection("type", "t2"));
   EXPECT_FALSE(SCL->inSection("type", "t1", "init"));
   EXPECT_TRUE(SCL->inSection("type", "t2", "init"));
 
-  SCL.reset(makeSpecialCaseList("global-init-type:t2\n"));
-  EXPECT_FALSE(SCL->inSection("type", "t1"));
-  EXPECT_FALSE(SCL->inSection("type", "t2"));
-  EXPECT_FALSE(SCL->inSection("type", "t1", "init"));
-  EXPECT_TRUE(SCL->inSection("type", "t2", "init"));
-
-  SCL.reset(makeSpecialCaseList("src:hello=init\n"));
-  EXPECT_FALSE(SCL->inSection("src", "hello"));
-  EXPECT_FALSE(SCL->inSection("src", "bye"));
-  EXPECT_TRUE(SCL->inSection("src", "hello", "init"));
-  EXPECT_FALSE(SCL->inSection("src", "bye", "init"));
-
-  SCL.reset(makeSpecialCaseList("global-init-src:hello\n"));
+  SCL = makeSpecialCaseList("src:hello=init\n");
   EXPECT_FALSE(SCL->inSection("src", "hello"));
   EXPECT_FALSE(SCL->inSection("src", "bye"));
   EXPECT_TRUE(SCL->inSection("src", "hello", "init"));
@@ -88,14 +71,14 @@
 }
 
 TEST_F(SpecialCaseListTest, Substring) {
-  std::unique_ptr<SpecialCaseList> SCL(makeSpecialCaseList("src:hello\n"
-                                                           "fun:foo\n"
-                                                           "global:bar\n"));
+  std::unique_ptr<SpecialCaseList> SCL = makeSpecialCaseList("src:hello\n"
+                                                             "fun:foo\n"
+                                                             "global:bar\n");
   EXPECT_FALSE(SCL->inSection("src", "othello"));
   EXPECT_FALSE(SCL->inSection("fun", "tomfoolery"));
   EXPECT_FALSE(SCL->inSection("global", "bartender"));
 
-  SCL.reset(makeSpecialCaseList("fun:*foo*\n"));
+  SCL = makeSpecialCaseList("fun:*foo*\n");
   EXPECT_TRUE(SCL->inSection("fun", "tomfoolery"));
   EXPECT_TRUE(SCL->inSection("fun", "foobar"));
 }
@@ -117,7 +100,7 @@
 }
 
 TEST_F(SpecialCaseListTest, EmptySpecialCaseList) {
-  std::unique_ptr<SpecialCaseList> SCL(makeSpecialCaseList(""));
+  std::unique_ptr<SpecialCaseList> SCL = makeSpecialCaseList("");
   EXPECT_FALSE(SCL->inSection("foo", "bar"));
 }
 

diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index 8aed980..074e27f 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp

@@ -84,6 +84,13 @@
   }
 }
 
+TEST(YAMLIO, TestMalformedMapRead) {
+  FooBar doc;
+  Input yin("{foo: 3; bar: 5}", nullptr, suppressErrorMessages);
+  yin >> doc;
+  EXPECT_TRUE(!!yin.error());
+}
+
 //
 // Test the reading of a yaml sequence of mappings
 //

diff --git a/unittests/Support/YAMLParserTest.cpp b/unittests/Support/YAMLParserTest.cpp
index e983935..823a0d6 100644
--- a/unittests/Support/YAMLParserTest.cpp
+++ b/unittests/Support/YAMLParserTest.cpp

@@ -18,7 +18,7 @@
 namespace llvm {
 
 static void SuppressDiagnosticsOutput(const SMDiagnostic &, void *) {
-  // Prevent SourceMgr from writing errors to stderr 
+  // Prevent SourceMgr from writing errors to stderr
   // to reduce noise in unit test runs.
 }
 
@@ -210,8 +210,9 @@
 
   // When we construct a YAML stream over a named buffer,
   // we get its ID as filename in diagnostics.
-  MemoryBuffer* Buffer = MemoryBuffer::getMemBuffer("[]", "buffername.yaml");
-  yaml::Stream Stream(Buffer, SM);
+  std::unique_ptr<MemoryBuffer> Buffer =
+      MemoryBuffer::getMemBuffer("[]", "buffername.yaml");
+  yaml::Stream Stream(Buffer->getMemBufferRef(), SM);
   Stream.printError(Stream.begin()->getRoot(), "Hello, World!");
   EXPECT_EQ("buffername.yaml", GeneratedDiag.getFilename());
 }

diff --git a/unittests/Support/raw_ostream_test.cpp b/unittests/Support/raw_ostream_test.cpp
index 44d27d0..39cfaf0 100644
--- a/unittests/Support/raw_ostream_test.cpp
+++ b/unittests/Support/raw_ostream_test.cpp

@@ -143,4 +143,41 @@
   EXPECT_EQ("\\001\\010\\200", Str);
 }
 
+TEST(raw_ostreamTest, Justify) {  
+  EXPECT_EQ("xyz   ", printToString(left_justify("xyz", 6), 6));
+  EXPECT_EQ("abc",    printToString(left_justify("abc", 3), 3));
+  EXPECT_EQ("big",    printToString(left_justify("big", 1), 3));
+  EXPECT_EQ("   xyz", printToString(right_justify("xyz", 6), 6));
+  EXPECT_EQ("abc",    printToString(right_justify("abc", 3), 3));
+  EXPECT_EQ("big",    printToString(right_justify("big", 1), 3));
+}
+
+TEST(raw_ostreamTest, FormatHex) {  
+  EXPECT_EQ("0x1234",     printToString(format_hex(0x1234, 6), 6));
+  EXPECT_EQ("0x001234",   printToString(format_hex(0x1234, 8), 8));
+  EXPECT_EQ("0x00001234", printToString(format_hex(0x1234, 10), 10));
+  EXPECT_EQ("0x1234",     printToString(format_hex(0x1234, 4), 6));
+  EXPECT_EQ("0xff",       printToString(format_hex(255, 4), 4));
+  EXPECT_EQ("0xFF",       printToString(format_hex(255, 4, true), 4));
+  EXPECT_EQ("0x1",        printToString(format_hex(1, 3), 3));
+  EXPECT_EQ("0x12",       printToString(format_hex(0x12, 3), 4));
+  EXPECT_EQ("0x123",      printToString(format_hex(0x123, 3), 5));
+  EXPECT_EQ("0xffffffffffffffff",     
+                          printToString(format_hex(UINT64_MAX, 18), 18));
+  EXPECT_EQ("0x8000000000000000",     
+                          printToString(format_hex((INT64_MIN), 18), 18));
+}
+
+TEST(raw_ostreamTest, FormatDecimal) {  
+  EXPECT_EQ("   0",        printToString(format_decimal(0, 4), 4));
+  EXPECT_EQ("  -1",        printToString(format_decimal(-1, 4), 4));
+  EXPECT_EQ("    -1",      printToString(format_decimal(-1, 6), 6));
+  EXPECT_EQ("1234567890",  printToString(format_decimal(1234567890, 10), 10));
+  EXPECT_EQ("  9223372036854775807", 
+                          printToString(format_decimal(INT64_MAX, 21), 21));
+  EXPECT_EQ(" -9223372036854775808", 
+                          printToString(format_decimal(INT64_MIN, 21), 21));
+}
+
+
 }

diff --git a/unittests/Transforms/Utils/Cloning.cpp b/unittests/Transforms/Utils/Cloning.cpp
index b3a1f5b..c779979 100644
--- a/unittests/Transforms/Utils/Cloning.cpp
+++ b/unittests/Transforms/Utils/Cloning.cpp

@@ -232,7 +232,7 @@
 
     // Function DI
     DIFile File = DBuilder.createFile("filename.c", "/file/dir/");
-    DIArray ParamTypes = DBuilder.getOrCreateArray(ArrayRef<Value*>());
+    DITypeArray ParamTypes = DBuilder.getOrCreateTypeArray(None);
     DICompositeType FuncType = DBuilder.createSubroutineType(File, ParamTypes);
     DICompileUnit CU = DBuilder.createCompileUnit(dwarf::DW_LANG_C99,
         "filename.c", "/file/dir", "CloneFunc", false, "", 0);
@@ -255,10 +255,11 @@
     // Create a local variable around the alloca
     DIType IntType = DBuilder.createBasicType("int", 32, 0,
         dwarf::DW_ATE_signed);
+    DIExpression E = DBuilder.createExpression();
     DIVariable Variable = DBuilder.createLocalVariable(
       dwarf::DW_TAG_auto_variable, Subprogram, "x", File, 5, IntType, true);
-    DBuilder.insertDeclare(Alloca, Variable, Store);
-    DBuilder.insertDbgValueIntrinsic(AllocaContent, 0, Variable, Terminator);
+    DBuilder.insertDeclare(Alloca, Variable, E, Store);
+    DBuilder.insertDbgValueIntrinsic(AllocaContent, 0, Variable, E, Terminator);
     // Finalize the debug info
     DBuilder.finalize();
 

diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index d88cf36..59affa1 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp

@@ -50,6 +50,18 @@
 NoCanonicalizeWhiteSpace("strict-whitespace",
               cl::desc("Do not treat all horizontal whitespace as equivalent"));
 
+static cl::list<std::string> ImplicitCheckNot(
+    "implicit-check-not",
+    cl::desc("Add an implicit negative check with this pattern to every\n"
+             "positive check. This can be used to ensure that no instances of\n"
+             "this pattern occur which are not matched by a positive pattern"),
+    cl::value_desc("pattern"));
+
+static cl::opt<bool> AllowEmptyInput(
+    "allow-empty", cl::init(false),
+    cl::desc("Allow the input file to be empty. This is useful when making\n"
+             "checks that some error message does not occur, for example."));
+
 typedef cl::list<std::string>::const_iterator prefix_iterator;
 
 //===----------------------------------------------------------------------===//
@@ -624,8 +636,9 @@
 ///
 /// \param PreserveHorizontal Don't squash consecutive horizontal whitespace
 /// characters to a single space.
-static MemoryBuffer *CanonicalizeInputFile(MemoryBuffer *MB,
-                                           bool PreserveHorizontal) {
+static std::unique_ptr<MemoryBuffer>
+CanonicalizeInputFile(std::unique_ptr<MemoryBuffer> MB,
+                      bool PreserveHorizontal) {
   SmallString<128> NewFile;
   NewFile.reserve(MB->getBufferSize());
 
@@ -650,12 +663,8 @@
       ++Ptr;
   }
 
-  // Free the old buffer and return a new one.
-  MemoryBuffer *MB2 =
-    MemoryBuffer::getMemBufferCopy(NewFile.str(), MB->getBufferIdentifier());
-
-  delete MB;
-  return MB2;
+  return std::unique_ptr<MemoryBuffer>(
+      MemoryBuffer::getMemBufferCopy(NewFile.str(), MB->getBufferIdentifier()));
 }
 
 static bool IsPartOfWord(char c) {
@@ -830,14 +839,34 @@
 
   // If we want to canonicalize whitespace, strip excess whitespace from the
   // buffer containing the CHECK lines. Remove DOS style line endings.
-  MemoryBuffer *F = CanonicalizeInputFile(FileOrErr.get().release(),
-                                          NoCanonicalizeWhiteSpace);
-
-  SM.AddNewSourceBuffer(F, SMLoc());
+  std::unique_ptr<MemoryBuffer> F = CanonicalizeInputFile(
+      std::move(FileOrErr.get()), NoCanonicalizeWhiteSpace);
 
   // Find all instances of CheckPrefix followed by : in the file.
   StringRef Buffer = F->getBuffer();
-  std::vector<Pattern> DagNotMatches;
+
+  SM.AddNewSourceBuffer(std::move(F), SMLoc());
+
+  std::vector<Pattern> ImplicitNegativeChecks;
+  for (const auto &PatternString : ImplicitCheckNot) {
+    // Create a buffer with fake command line content in order to display the
+    // command line option responsible for the specific implicit CHECK-NOT.
+    std::string Prefix = std::string("-") + ImplicitCheckNot.ArgStr + "='";
+    std::string Suffix = "'";
+    std::unique_ptr<MemoryBuffer> CmdLine = MemoryBuffer::getMemBufferCopy(
+        Prefix + PatternString + Suffix, "command line");
+
+    StringRef PatternInBuffer =
+        CmdLine->getBuffer().substr(Prefix.size(), PatternString.size());
+    SM.AddNewSourceBuffer(std::move(CmdLine), SMLoc());
+
+    ImplicitNegativeChecks.push_back(Pattern(Check::CheckNot));
+    ImplicitNegativeChecks.back().ParsePattern(PatternInBuffer,
+                                               "IMPLICIT-CHECK", SM, 0);
+  }
+
+
+  std::vector<Pattern> DagNotMatches = ImplicitNegativeChecks;
 
   // LineNumber keeps track of the line on which CheckPrefix instances are
   // found.
@@ -910,6 +939,7 @@
                                        PatternLoc,
                                        CheckTy));
     std::swap(DagNotMatches, CheckStrings.back().DagNotStrings);
+    DagNotMatches = ImplicitNegativeChecks;
   }
 
   // Add an EOF pattern for any trailing CHECK-DAG/-NOTs, and use the first
@@ -1185,7 +1215,11 @@
        I != E; ++I) {
     StringRef Prefix(*I);
 
-    if (!PrefixSet.insert(Prefix))
+    // Reject empty prefixes.
+    if (Prefix == "")
+      return false;
+
+    if (!PrefixSet.insert(Prefix).second)
       return false;
 
     if (!ValidateCheckPrefix(Prefix))
@@ -1231,27 +1265,27 @@
            << "': " << EC.message() << '\n';
     return 2;
   }
-  std::unique_ptr<MemoryBuffer> File = std::move(FileOrErr.get());
+  std::unique_ptr<MemoryBuffer> &File = FileOrErr.get();
 
-  if (File->getBufferSize() == 0) {
+  if (File->getBufferSize() == 0 && !AllowEmptyInput) {
     errs() << "FileCheck error: '" << InputFilename << "' is empty.\n";
     return 2;
   }
 
   // Remove duplicate spaces in the input file if requested.
   // Remove DOS style line endings.
-  MemoryBuffer *F =
-    CanonicalizeInputFile(File.release(), NoCanonicalizeWhiteSpace);
-
-  SM.AddNewSourceBuffer(F, SMLoc());
-
-  /// VariableTable - This holds all the current filecheck variables.
-  StringMap<StringRef> VariableTable;
+  std::unique_ptr<MemoryBuffer> F =
+      CanonicalizeInputFile(std::move(File), NoCanonicalizeWhiteSpace);
 
   // Check that we have all of the expected strings, in order, in the input
   // file.
   StringRef Buffer = F->getBuffer();
 
+  SM.AddNewSourceBuffer(std::move(F), SMLoc());
+
+  /// VariableTable - This holds all the current filecheck variables.
+  StringMap<StringRef> VariableTable;
+
   bool hasError = false;
 
   unsigned i = 0, j = 0, e = CheckStrings.size();

diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index 1277086..891328f 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp

@@ -448,7 +448,7 @@
   void formTwoOperandAlias(StringRef Constraint);
 
   void initialize(const AsmMatcherInfo &Info,
-                  SmallPtrSet<Record*, 16> &SingletonRegisters,
+                  SmallPtrSetImpl<Record*> &SingletonRegisters,
                   int AsmVariantNo, std::string &RegisterPrefix);
 
   /// validate - Return true if this matchable is a valid thing to match against
@@ -565,9 +565,9 @@
   Record *TheDef;
 
   /// \brief An unique index assigned to represent this feature.
-  unsigned Index;
+  uint64_t Index;
 
-  SubtargetFeatureInfo(Record *D, unsigned Idx) : TheDef(D), Index(Idx) {}
+  SubtargetFeatureInfo(Record *D, uint64_t Idx) : TheDef(D), Index(Idx) {}
 
   /// \brief The name of the enumerated constant identifying this feature.
   std::string getEnumName() const {
@@ -644,7 +644,7 @@
 
   /// buildRegisterClasses - Build the ClassInfo* instances for register
   /// classes.
-  void buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters);
+  void buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters);
 
   /// buildOperandClasses - Build the ClassInfo* instances for user defined
   /// operand classes.
@@ -766,7 +766,7 @@
 }
 
 void MatchableInfo::initialize(const AsmMatcherInfo &Info,
-                               SmallPtrSet<Record*, 16> &SingletonRegisters,
+                               SmallPtrSetImpl<Record*> &SingletonRegisters,
                                int AsmVariantNo, std::string &RegisterPrefix) {
   AsmVariantID = AsmVariantNo;
   AsmString =
@@ -1075,7 +1075,7 @@
 };
 
 void AsmMatcherInfo::
-buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
+buildRegisterClasses(SmallPtrSetImpl<Record*> &SingletonRegisters) {
   const std::vector<CodeGenRegister*> &Registers =
     Target.getRegBank().getRegisters();
   ArrayRef<CodeGenRegisterClass*> RegClassList =
@@ -1093,7 +1093,7 @@
         (*it)->getOrder().begin(), (*it)->getOrder().end()));
 
   // Add any required singleton sets.
-  for (SmallPtrSet<Record*, 16>::iterator it = SingletonRegisters.begin(),
+  for (SmallPtrSetImpl<Record*>::iterator it = SingletonRegisters.begin(),
        ie = SingletonRegisters.end(); it != ie; ++it) {
     Record *Rec = *it;
     RegisterSets.insert(RegisterSet(&Rec, &Rec + 1));
@@ -1191,7 +1191,7 @@
     RegisterClasses[it->first] = RegisterSetClasses[it->second];
 
   // Name the register classes which correspond to singleton registers.
-  for (SmallPtrSet<Record*, 16>::iterator it = SingletonRegisters.begin(),
+  for (SmallPtrSetImpl<Record*>::iterator it = SingletonRegisters.begin(),
          ie = SingletonRegisters.end(); it != ie; ++it) {
     Record *Rec = *it;
     ClassInfo *CI = RegisterClasses[Rec];
@@ -1327,10 +1327,10 @@
     if (Pred->getName().empty())
       PrintFatalError(Pred->getLoc(), "Predicate has no name!");
 
-    unsigned FeatureNo = SubtargetFeatures.size();
+    uint64_t FeatureNo = SubtargetFeatures.size();
     SubtargetFeatures[Pred] = new SubtargetFeatureInfo(Pred, FeatureNo);
     DEBUG(SubtargetFeatures[Pred]->dump());
-    assert(FeatureNo < 32 && "Too many subtarget features!");
+    assert(FeatureNo < 64 && "Too many subtarget features!");
   }
 
   // Parse the instructions; we need to do this first so that we can gather the
@@ -2205,7 +2205,9 @@
 }
 
 static const char *getMinimalTypeForRange(uint64_t Range) {
-  assert(Range <= 0xFFFFFFFFULL && "Enum too large");
+  assert(Range <= 0xFFFFFFFFFFFFFFFFULL && "Enum too large");
+  if (Range > 0xFFFFFFFFULL)
+    return "uint64_t";
   if (Range > 0xFFFF)
     return "uint32_t";
   if (Range > 0xFF)
@@ -2232,7 +2234,7 @@
          it = Info.SubtargetFeatures.begin(),
          ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
     SubtargetFeatureInfo &SFI = *it->second;
-    OS << "  " << SFI.getEnumName() << " = (1U << " << SFI.Index << "),\n";
+    OS << "  " << SFI.getEnumName() << " = (1ULL << " << SFI.Index << "),\n";
   }
   OS << "  Feature_None = 0\n";
   OS << "};\n\n";
@@ -2263,7 +2265,7 @@
 static void emitGetSubtargetFeatureName(AsmMatcherInfo &Info, raw_ostream &OS) {
   OS << "// User-level names for subtarget features that participate in\n"
      << "// instruction matching.\n"
-     << "static const char *getSubtargetFeatureName(unsigned Val) {\n";
+     << "static const char *getSubtargetFeatureName(uint64_t Val) {\n";
   if (!Info.SubtargetFeatures.empty()) {
     OS << "  switch(Val) {\n";
     typedef std::map<Record*, SubtargetFeatureInfo*, LessRecordByID> RecFeatMap;
@@ -2290,9 +2292,9 @@
   std::string ClassName =
     Info.AsmParser->getValueAsString("AsmParserClassName");
 
-  OS << "unsigned " << Info.Target.getName() << ClassName << "::\n"
+  OS << "uint64_t " << Info.Target.getName() << ClassName << "::\n"
      << "ComputeAvailableFeatures(uint64_t FB) const {\n";
-  OS << "  unsigned Features = 0;\n";
+  OS << "  uint64_t Features = 0;\n";
   for (std::map<Record*, SubtargetFeatureInfo*, LessRecordByID>::const_iterator
          it = Info.SubtargetFeatures.begin(),
          ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
@@ -2446,7 +2448,7 @@
   if (Aliases.empty()) return false;
 
   OS << "static void applyMnemonicAliases(StringRef &Mnemonic, "
-    "unsigned Features, unsigned VariantID) {\n";
+    "uint64_t Features, unsigned VariantID) {\n";
   OS << "  switch (VariantID) {\n";
   unsigned VariantCount = Target.getAsmParserVariantCount();
   for (unsigned VC = 0; VC != VariantCount; ++VC) {
@@ -2589,7 +2591,7 @@
 
   // Emit code to get the available features.
   OS << "  // Get the current feature set.\n";
-  OS << "  unsigned AvailableFeatures = getAvailableFeatures();\n\n";
+  OS << "  uint64_t AvailableFeatures = getAvailableFeatures();\n\n";
 
   OS << "  // Get the next operand index.\n";
   OS << "  unsigned NextOpNum = Operands.size()-1;\n";
@@ -2691,7 +2693,7 @@
   OS << "#undef GET_ASSEMBLER_HEADER\n";
   OS << "  // This should be included into the middle of the declaration of\n";
   OS << "  // your subclasses implementation of MCTargetAsmParser.\n";
-  OS << "  unsigned ComputeAvailableFeatures(uint64_t FeatureBits) const;\n";
+  OS << "  uint64_t ComputeAvailableFeatures(uint64_t FeatureBits) const;\n";
   OS << "  void convertToMCInst(unsigned Kind, MCInst &Inst, "
      << "unsigned Opcode,\n"
      << "                       const OperandVector "
@@ -2703,7 +2705,7 @@
   OS.indent(27);
   OS << "const OperandVector &Operands,\n"
      << "                                MCInst &Inst,\n"
-     << "                                unsigned &ErrorInfo,"
+     << "                                uint64_t &ErrorInfo,"
      << " bool matchingInlineAsm,\n"
      << "                                unsigned VariantID = 0);\n";
 
@@ -2912,7 +2914,7 @@
      << "MatchInstructionImpl(const OperandVector"
      << " &Operands,\n";
   OS << "                     MCInst &Inst,\n"
-     << "unsigned &ErrorInfo, bool matchingInlineAsm, unsigned VariantID) {\n";
+     << "uint64_t &ErrorInfo, bool matchingInlineAsm, unsigned VariantID) {\n";
 
   OS << "  // Eliminate obvious mismatches.\n";
   OS << "  if (Operands.size() > " << (MaxNumOperands+1) << ") {\n";
@@ -2922,7 +2924,7 @@
 
   // Emit code to get the available features.
   OS << "  // Get the current feature set.\n";
-  OS << "  unsigned AvailableFeatures = getAvailableFeatures();\n\n";
+  OS << "  uint64_t AvailableFeatures = getAvailableFeatures();\n\n";
 
   OS << "  // Get the instruction mnemonic, which is the first token.\n";
   OS << "  StringRef Mnemonic = ((" << Target.getName()
@@ -2938,7 +2940,7 @@
   OS << "  bool HadMatchOtherThanFeatures = false;\n";
   OS << "  bool HadMatchOtherThanPredicate = false;\n";
   OS << "  unsigned RetCode = Match_InvalidOperand;\n";
-  OS << "  unsigned MissingFeatures = ~0U;\n";
+  OS << "  uint64_t MissingFeatures = ~0ULL;\n";
   OS << "  // Set ErrorInfo to the operand that mismatches if it is\n";
   OS << "  // wrong for all instances of the instruction.\n";
   OS << "  ErrorInfo = ~0U;\n";
@@ -3014,10 +3016,10 @@
   OS << "    if ((AvailableFeatures & it->RequiredFeatures) "
      << "!= it->RequiredFeatures) {\n";
   OS << "      HadMatchOtherThanFeatures = true;\n";
-  OS << "      unsigned NewMissingFeatures = it->RequiredFeatures & "
+  OS << "      uint64_t NewMissingFeatures = it->RequiredFeatures & "
         "~AvailableFeatures;\n";
-  OS << "      if (CountPopulation_32(NewMissingFeatures) <=\n"
-        "          CountPopulation_32(MissingFeatures))\n";
+  OS << "      if (CountPopulation_64(NewMissingFeatures) <=\n"
+        "          CountPopulation_64(MissingFeatures))\n";
   OS << "        MissingFeatures = NewMissingFeatures;\n";
   OS << "      continue;\n";
   OS << "    }\n";
@@ -3055,7 +3057,7 @@
     OS << "    if (MII.get(Inst.getOpcode()).getDeprecatedInfo(Inst, STI, Info)) {\n";
     OS << "      SMLoc Loc = ((" << Target.getName()
        << "Operand&)*Operands[0]).getStartLoc();\n";
-    OS << "      Parser.Warning(Loc, Info, None);\n";
+    OS << "      getParser().Warning(Loc, Info, None);\n";
     OS << "    }\n";
   }
 

diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index c7fe9df..7ef70d3 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp

@@ -424,7 +424,7 @@
   }
 
   // Emit the string itself.
-  O << "  const char AsmStrs[] = {\n";
+  O << "  static const char AsmStrs[] = {\n";
   StringTable.emit(O, printChar);
   O << "  };\n\n";
 

diff --git a/utils/TableGen/AsmWriterInst.h b/utils/TableGen/AsmWriterInst.h
index 4cee352..fd77982 100644
--- a/utils/TableGen/AsmWriterInst.h
+++ b/utils/TableGen/AsmWriterInst.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ASMWRITER_INST_H
-#define ASMWRITER_INST_H
+#ifndef LLVM_UTILS_TABLEGEN_ASMWRITERINST_H
+#define LLVM_UTILS_TABLEGEN_ASMWRITERINST_H
 
 #include <string>
 #include <vector>

diff --git a/utils/TableGen/CTagsEmitter.cpp b/utils/TableGen/CTagsEmitter.cpp
index 5d6d6da..bbed92a 100644
--- a/utils/TableGen/CTagsEmitter.cpp
+++ b/utils/TableGen/CTagsEmitter.cpp

@@ -69,19 +69,15 @@
 }
 
 void CTagsEmitter::run(raw_ostream &OS) {
-  const std::map<std::string, Record *> &Classes = Records.getClasses();
-  const std::map<std::string, Record *> &Defs = Records.getDefs();
+  const auto &Classes = Records.getClasses();
+  const auto &Defs = Records.getDefs();
   std::vector<Tag> Tags;
   // Collect tags.
   Tags.reserve(Classes.size() + Defs.size());
-  for (std::map<std::string, Record *>::const_iterator I = Classes.begin(),
-                                                       E = Classes.end();
-       I != E; ++I)
-    Tags.push_back(Tag(I->first, locate(I->second)));
-  for (std::map<std::string, Record *>::const_iterator I = Defs.begin(),
-                                                       E = Defs.end();
-       I != E; ++I)
-    Tags.push_back(Tag(I->first, locate(I->second)));
+  for (const auto &C : Classes)
+    Tags.push_back(Tag(C.first, locate(C.second.get())));
+  for (const auto &D : Defs)
+    Tags.push_back(Tag(D.first, locate(D.second.get())));
   // Emit tags.
   std::sort(Tags.begin(), Tags.end());
   OS << "!_TAG_FILE_FORMAT\t1\t/original ctags format/\n";

diff --git a/utils/TableGen/CallingConvEmitter.cpp b/utils/TableGen/CallingConvEmitter.cpp
index 6d43e8e..6a65e5e 100644
--- a/utils/TableGen/CallingConvEmitter.cpp
+++ b/utils/TableGen/CallingConvEmitter.cpp

@@ -35,23 +35,26 @@
 } // End anonymous namespace
 
 void CallingConvEmitter::run(raw_ostream &O) {
-
   std::vector<Record*> CCs = Records.getAllDerivedDefinitions("CallingConv");
-  
-  // Emit prototypes for all of the CC's so that they can forward ref each
-  // other.
+
+  // Emit prototypes for all of the non-custom CC's so that they can forward ref
+  // each other.
   for (unsigned i = 0, e = CCs.size(); i != e; ++i) {
-    O << "static bool " << CCs[i]->getName()
-      << "(unsigned ValNo, MVT ValVT,\n"
-      << std::string(CCs[i]->getName().size()+13, ' ')
-      << "MVT LocVT, CCValAssign::LocInfo LocInfo,\n"
-      << std::string(CCs[i]->getName().size()+13, ' ')
-      << "ISD::ArgFlagsTy ArgFlags, CCState &State);\n";
+    if (!CCs[i]->getValueAsBit("Custom")) {
+      O << "static bool " << CCs[i]->getName()
+        << "(unsigned ValNo, MVT ValVT,\n"
+        << std::string(CCs[i]->getName().size() + 13, ' ')
+        << "MVT LocVT, CCValAssign::LocInfo LocInfo,\n"
+        << std::string(CCs[i]->getName().size() + 13, ' ')
+        << "ISD::ArgFlagsTy ArgFlags, CCState &State);\n";
+    }
   }
-  
-  // Emit each calling convention description in full.
-  for (unsigned i = 0, e = CCs.size(); i != e; ++i)
-    EmitCallingConv(CCs[i], O);
+
+  // Emit each non-custom calling convention description in full.
+  for (unsigned i = 0, e = CCs.size(); i != e; ++i) {
+    if (!CCs[i]->getValueAsBit("Custom"))
+      EmitCallingConv(CCs[i], O);
+  }
 }
 
 
@@ -178,13 +181,17 @@
       if (Size)
         O << Size << ", ";
       else
-        O << "\n" << IndentStr << "  State.getTarget().getDataLayout()"
-          "->getTypeAllocSize(EVT(LocVT).getTypeForEVT(State.getContext())), ";
+        O << "\n" << IndentStr
+          << "  State.getMachineFunction().getSubtarget().getDataLayout()"
+             "->getTypeAllocSize(EVT(LocVT).getTypeForEVT(State.getContext())),"
+             " ";
       if (Align)
         O << Align;
       else
-        O << "\n" << IndentStr << "  State.getTarget().getDataLayout()"
-          "->getABITypeAlignment(EVT(LocVT).getTypeForEVT(State.getContext()))";
+        O << "\n" << IndentStr
+          << "  State.getMachineFunction().getSubtarget().getDataLayout()"
+             "->getABITypeAlignment(EVT(LocVT).getTypeForEVT(State.getContext()"
+             "))";
       O << ");\n" << IndentStr
         << "State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset"
         << Counter << ", LocVT, LocInfo));\n";
@@ -227,6 +234,21 @@
           << IndentStr << "else\n"
           << IndentStr << IndentStr << "LocInfo = CCValAssign::AExt;\n";
       }
+    } else if (Action->isSubClassOf("CCPromoteToUpperBitsInType")) {
+      Record *DestTy = Action->getValueAsDef("DestTy");
+      MVT::SimpleValueType DestVT = getValueType(DestTy);
+      O << IndentStr << "LocVT = " << getEnumName(DestVT) << ";\n";
+      if (MVT(DestVT).isFloatingPoint()) {
+        PrintFatalError("CCPromoteToUpperBitsInType does not handle floating "
+                        "point");
+      } else {
+        O << IndentStr << "if (ArgFlags.isSExt())\n"
+          << IndentStr << IndentStr << "LocInfo = CCValAssign::SExtUpper;\n"
+          << IndentStr << "else if (ArgFlags.isZExt())\n"
+          << IndentStr << IndentStr << "LocInfo = CCValAssign::ZExtUpper;\n"
+          << IndentStr << "else\n"
+          << IndentStr << IndentStr << "LocInfo = CCValAssign::AExtUpper;\n";
+      }
     } else if (Action->isSubClassOf("CCBitConvertToType")) {
       Record *DestTy = Action->getValueAsDef("DestTy");
       O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n";

diff --git a/utils/TableGen/CodeEmitterGen.cpp b/utils/TableGen/CodeEmitterGen.cpp
index 4d0c0ca..11911b6 100644
--- a/utils/TableGen/CodeEmitterGen.cpp
+++ b/utils/TableGen/CodeEmitterGen.cpp

@@ -24,14 +24,6 @@
 #include <vector>
 using namespace llvm;
 
-// FIXME: Somewhat hackish to use a command line option for this. There should
-// be a CodeEmitter class in the Target.td that controls this sort of thing
-// instead.
-static cl::opt<bool>
-MCEmitter("mc-emitter",
-          cl::desc("Generate CodeEmitter for use with the MC library."),
-          cl::init(false));
-
 namespace {
 
 class CodeEmitterGen {
@@ -134,15 +126,13 @@
     if (SO.second == 0) {
       Case += "      // op: " + VarName + "\n" +
               "      op = " + EncoderMethodName + "(MI, " + utostr(OpIdx);
-      if (MCEmitter)
-        Case += ", Fixups, STI";
+      Case += ", Fixups, STI";
       Case += ");\n";
     }
   } else {
     Case += "      // op: " + VarName + "\n" +
       "      op = getMachineOpValue(MI, MI.getOperand(" + utostr(OpIdx) + ")";
-    if (MCEmitter)
-      Case += ", Fixups, STI";
+    Case += ", Fixups, STI";
     Case += ");\n";
   }
   
@@ -223,8 +213,7 @@
   std::string PostEmitter = R->getValueAsString("PostEncoderMethod");
   if (!PostEmitter.empty()) {
     Case += "      Value = " + PostEmitter + "(MI, Value";
-    if (MCEmitter)
-      Case += ", STI";
+    Case += ", STI";
     Case += ");\n";
   }
   
@@ -243,12 +232,9 @@
 
   // Emit function declaration
   o << "uint64_t " << Target.getName();
-  if (MCEmitter)
-    o << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n"
-      << "    SmallVectorImpl<MCFixup> &Fixups,\n"
-      << "    const MCSubtargetInfo &STI) const {\n";
-  else
-    o << "CodeEmitter::getBinaryCodeForInstr(const MachineInstr &MI) const {\n";
+  o << "MCCodeEmitter::getBinaryCodeForInstr(const MCInst &MI,\n"
+    << "    SmallVectorImpl<MCFixup> &Fixups,\n"
+    << "    const MCSubtargetInfo &STI) const {\n";
 
   // Emit instruction base values
   o << "  static const uint64_t InstBits[] = {\n";

diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 2602bbc..a750aa9 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp

@@ -771,7 +771,7 @@
 
 /// Compute the complexity metric for the input pattern.  This roughly
 /// corresponds to the number of nodes that are covered.
-unsigned PatternToMatch::
+int PatternToMatch::
 getPatternComplexity(const CodeGenDAGPatterns &CGP) const {
   return getPatternSize(getSrcPattern(), CGP) + getAddedComplexity();
 }
@@ -1387,7 +1387,7 @@
 
   if (R->isSubClassOf("SubRegIndex")) {
     assert(ResNo == 0 && "SubRegisterIndices only produce one result!");
-    return EEVT::TypeSet();
+    return EEVT::TypeSet(MVT::i32, TP);
   }
 
   if (R->isSubClassOf("ValueType")) {
@@ -1529,7 +1529,16 @@
   return false;
 }
 
+static bool isOperandClass(const TreePatternNode *N, StringRef Class) {
+  if (!N->isLeaf())
+    return N->getOperator()->isSubClassOf(Class);
 
+  DefInit *DI = dyn_cast<DefInit>(N->getLeafValue());
+  if (DI && DI->getDef()->isSubClassOf(Class))
+    return true;
+
+  return false;
+}
 /// ApplyTypeConstraints - Apply all of the type constraints relevant to
 /// this node and its children in the tree.  This returns true if it makes a
 /// change, false otherwise.  If a type contradiction is found, flag an error.
@@ -1689,6 +1698,34 @@
       assert(getChild(0)->getNumTypes() == 1 && "FIXME: Unhandled");
       MadeChange |= UpdateNodeType(0, getChild(0)->getExtType(0), TP);
       MadeChange |= getChild(0)->UpdateNodeType(0, getExtType(0), TP);
+    } else if (getOperator()->getName() == "REG_SEQUENCE") {
+      // We need to do extra, custom typechecking for REG_SEQUENCE since it is
+      // variadic.
+
+      unsigned NChild = getNumChildren();
+      if (NChild < 3) {
+        TP.error("REG_SEQUENCE requires at least 3 operands!");
+        return false;
+      }
+
+      if (NChild % 2 == 0) {
+        TP.error("REG_SEQUENCE requires an odd number of operands!");
+        return false;
+      }
+
+      if (!isOperandClass(getChild(0), "RegisterClass")) {
+        TP.error("REG_SEQUENCE requires a RegisterClass for first operand!");
+        return false;
+      }
+
+      for (unsigned I = 1; I < NChild; I += 2) {
+        TreePatternNode *SubIdxChild = getChild(I + 1);
+        if (!isOperandClass(SubIdxChild, "SubRegIndex")) {
+          TP.error("REG_SEQUENCE requires a SubRegIndex for operand " +
+                   itostr(I + 1) + "!");
+          return false;
+        }
+      }
     }
 
     unsigned ChildNo = 0;
@@ -1749,7 +1786,7 @@
       MadeChange |= Child->UpdateNodeTypeFromInst(ChildResNo, OperandNode, TP);
     }
 
-    if (ChildNo != getNumChildren()) {
+    if (!InstInfo.Operands.isVariadic && ChildNo != getNumChildren()) {
       TP.error("Instruction '" + getOperator()->getName() +
                "' was provided too many operands!");
       return false;
@@ -1871,7 +1908,7 @@
   Trees.push_back(Pat);
 }
 
-void TreePattern::error(const std::string &Msg) {
+void TreePattern::error(const Twine &Msg) {
   if (HasError)
     return;
   dump();
@@ -2226,13 +2263,6 @@
   VerifyInstructionFlags();
 }
 
-CodeGenDAGPatterns::~CodeGenDAGPatterns() {
-  for (pf_iterator I = PatternFragments.begin(),
-       E = PatternFragments.end(); I != E; ++I)
-    delete I->second;
-}
-
-
 Record *CodeGenDAGPatterns::getSDNodeNamed(const std::string &Name) const {
   Record *N = Records.getDef(Name);
   if (!N || !N->isSubClassOf("SDNode")) {
@@ -2294,9 +2324,9 @@
 
     DagInit *Tree = Fragments[i]->getValueAsDag("Fragment");
     TreePattern *P =
-      new TreePattern(Fragments[i], Tree,
-                      !Fragments[i]->isSubClassOf("OutPatFrag"), *this);
-    PatternFragments[Fragments[i]] = P;
+        (PatternFragments[Fragments[i]] = llvm::make_unique<TreePattern>(
+             Fragments[i], Tree, !Fragments[i]->isSubClassOf("OutPatFrag"),
+             *this)).get();
 
     // Validate the argument list, converting it to set, to discard duplicates.
     std::vector<std::string> &Args = P->getArgList();
@@ -2354,16 +2384,16 @@
     if (OutFrags != Fragments[i]->isSubClassOf("OutPatFrag"))
       continue;
 
-    TreePattern *ThePat = PatternFragments[Fragments[i]];
-    ThePat->InlinePatternFragments();
+    TreePattern &ThePat = *PatternFragments[Fragments[i]];
+    ThePat.InlinePatternFragments();
 
     // Infer as many types as possible.  Don't worry about it if we don't infer
     // all of them, some may depend on the inputs of the pattern.
-    ThePat->InferAllTypes();
-    ThePat->resetError();
+    ThePat.InferAllTypes();
+    ThePat.resetError();
 
     // If debugging, print out the pattern fragment result.
-    DEBUG(ThePat->dump());
+    DEBUG(ThePat.dump());
   }
 }
 
@@ -3274,14 +3304,14 @@
     if (LI->getSize() == 0) continue;  // no pattern.
 
     // Parse the instruction.
-    TreePattern *Result = new TreePattern(CurPattern, LI, false, *this);
+    TreePattern Result(CurPattern, LI, false, *this);
 
     // Inline pattern fragments into it.
-    Result->InlinePatternFragments();
+    Result.InlinePatternFragments();
 
-    if (Result->getNumTrees() != 1)
-      Result->error("Cannot handle instructions producing instructions "
-                    "with temporaries yet!");
+    if (Result.getNumTrees() != 1)
+      Result.error("Cannot handle instructions producing instructions "
+                   "with temporaries yet!");
 
     bool IterateInference;
     bool InferredAllPatternTypes, InferredAllResultTypes;
@@ -3294,7 +3324,7 @@
       // Infer as many types as possible.  If we cannot infer all of them, we
       // can never do anything with this pattern: report it to the user.
       InferredAllResultTypes =
-        Result->InferAllTypes(&Pattern->getNamedNodesMap());
+          Result.InferAllTypes(&Pattern->getNamedNodesMap());
 
       IterateInference = false;
 
@@ -3302,13 +3332,13 @@
       // resolve cases where the input type is known to be a pointer type (which
       // is considered resolved), but the result knows it needs to be 32- or
       // 64-bits.  Infer the other way for good measure.
-      for (unsigned i = 0, e = std::min(Result->getTree(0)->getNumTypes(),
+      for (unsigned i = 0, e = std::min(Result.getTree(0)->getNumTypes(),
                                         Pattern->getTree(0)->getNumTypes());
            i != e; ++i) {
-        IterateInference = Pattern->getTree(0)->
-          UpdateNodeType(i, Result->getTree(0)->getExtType(i), *Result);
-        IterateInference |= Result->getTree(0)->
-          UpdateNodeType(i, Pattern->getTree(0)->getExtType(i), *Result);
+        IterateInference = Pattern->getTree(0)->UpdateNodeType(
+            i, Result.getTree(0)->getExtType(i), Result);
+        IterateInference |= Result.getTree(0)->UpdateNodeType(
+            i, Pattern->getTree(0)->getExtType(i), Result);
       }
 
       // If our iteration has converged and the input pattern's types are fully
@@ -3322,8 +3352,8 @@
       // arbitrary types to the result pattern's nodes.
       if (!IterateInference && InferredAllPatternTypes &&
           !InferredAllResultTypes)
-        IterateInference = ForceArbitraryInstResultType(Result->getTree(0),
-                                                        *Result);
+        IterateInference =
+            ForceArbitraryInstResultType(Result.getTree(0), Result);
     } while (IterateInference);
 
     // Verify that we inferred enough types that we can do something with the
@@ -3332,7 +3362,7 @@
       Pattern->error("Could not infer all types in pattern!");
     if (!InferredAllResultTypes) {
       Pattern->dump();
-      Result->error("Could not infer all types in pattern result!");
+      Result.error("Could not infer all types in pattern result!");
     }
 
     // Validate that the input pattern is correct.
@@ -3345,7 +3375,7 @@
                                   InstImpResults);
 
     // Promote the xform function to be an explicit node if set.
-    TreePatternNode *DstPattern = Result->getOnlyTree();
+    TreePatternNode *DstPattern = Result.getOnlyTree();
     std::vector<TreePatternNode*> ResultNodeOperands;
     for (unsigned ii = 0, ee = DstPattern->getNumChildren(); ii != ee; ++ii) {
       TreePatternNode *OpNode = DstPattern->getChild(ii);
@@ -3357,16 +3387,16 @@
       }
       ResultNodeOperands.push_back(OpNode);
     }
-    DstPattern = Result->getOnlyTree();
+    DstPattern = Result.getOnlyTree();
     if (!DstPattern->isLeaf())
       DstPattern = new TreePatternNode(DstPattern->getOperator(),
                                        ResultNodeOperands,
                                        DstPattern->getNumTypes());
 
-    for (unsigned i = 0, e = Result->getOnlyTree()->getNumTypes(); i != e; ++i)
-      DstPattern->setType(i, Result->getOnlyTree()->getExtType(i));
+    for (unsigned i = 0, e = Result.getOnlyTree()->getNumTypes(); i != e; ++i)
+      DstPattern->setType(i, Result.getOnlyTree()->getExtType(i));
 
-    TreePattern Temp(Result->getRecord(), DstPattern, false, *this);
+    TreePattern Temp(Result.getRecord(), DstPattern, false, *this);
     Temp.InferAllTypes();
 
 

diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h
index fb30cdd..c0812cf 100644
--- a/utils/TableGen/CodeGenDAGPatterns.h
+++ b/utils/TableGen/CodeGenDAGPatterns.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_DAGPATTERNS_H
-#define CODEGEN_DAGPATTERNS_H
+#ifndef LLVM_UTILS_TABLEGEN_CODEGENDAGPATTERNS_H
+#define LLVM_UTILS_TABLEGEN_CODEGENDAGPATTERNS_H
 
 #include "CodeGenIntrinsics.h"
 #include "CodeGenTarget.h"
@@ -597,7 +597,7 @@
 
   /// error - If this is the first error in the current resolution step,
   /// print it and set the error flag.  Otherwise, continue silently.
-  void error(const std::string &Msg);
+  void error(const Twine &Msg);
   bool hasError() const {
     return HasError;
   }
@@ -667,7 +667,7 @@
   PatternToMatch(Record *srcrecord, ListInit *preds,
                  TreePatternNode *src, TreePatternNode *dst,
                  const std::vector<Record*> &dstregs,
-                 unsigned complexity, unsigned uid)
+                 int complexity, unsigned uid)
     : SrcRecord(srcrecord), Predicates(preds), SrcPattern(src), DstPattern(dst),
       Dstregs(dstregs), AddedComplexity(complexity), ID(uid) {}
 
@@ -676,7 +676,7 @@
   TreePatternNode *SrcPattern;  // Source pattern to match.
   TreePatternNode *DstPattern;  // Resulting pattern.
   std::vector<Record*> Dstregs; // Physical register defs being matched.
-  unsigned         AddedComplexity; // Add to matching pattern complexity.
+  int              AddedComplexity; // Add to matching pattern complexity.
   unsigned         ID;          // Unique ID for the record.
 
   Record          *getSrcRecord()  const { return SrcRecord; }
@@ -684,13 +684,13 @@
   TreePatternNode *getSrcPattern() const { return SrcPattern; }
   TreePatternNode *getDstPattern() const { return DstPattern; }
   const std::vector<Record*> &getDstRegs() const { return Dstregs; }
-  unsigned         getAddedComplexity() const { return AddedComplexity; }
+  int         getAddedComplexity() const { return AddedComplexity; }
 
   std::string getPredicateCheck() const;
 
   /// Compute the complexity metric for the input pattern.  This roughly
   /// corresponds to the number of nodes that are covered.
-  unsigned getPatternComplexity(const CodeGenDAGPatterns &CGP) const;
+  int getPatternComplexity(const CodeGenDAGPatterns &CGP) const;
 };
 
 class CodeGenDAGPatterns {
@@ -702,7 +702,8 @@
   std::map<Record*, SDNodeInfo, LessRecordByID> SDNodes;
   std::map<Record*, std::pair<Record*, std::string>, LessRecordByID> SDNodeXForms;
   std::map<Record*, ComplexPattern, LessRecordByID> ComplexPatterns;
-  std::map<Record*, TreePattern*, LessRecordByID> PatternFragments;
+  std::map<Record *, std::unique_ptr<TreePattern>, LessRecordByID>
+      PatternFragments;
   std::map<Record*, DAGDefaultOperand, LessRecordByID> DefaultOperands;
   std::map<Record*, DAGInstruction, LessRecordByID> Instructions;
 
@@ -716,7 +717,6 @@
   std::vector<PatternToMatch> PatternsToMatch;
 public:
   CodeGenDAGPatterns(RecordKeeper &R);
-  ~CodeGenDAGPatterns();
 
   CodeGenTarget &getTargetInfo() { return Target; }
   const CodeGenTarget &getTargetInfo() const { return Target; }
@@ -778,15 +778,16 @@
   // Pattern Fragment information.
   TreePattern *getPatternFragment(Record *R) const {
     assert(PatternFragments.count(R) && "Invalid pattern fragment request!");
-    return PatternFragments.find(R)->second;
+    return PatternFragments.find(R)->second.get();
   }
   TreePattern *getPatternFragmentIfRead(Record *R) const {
-    if (!PatternFragments.count(R)) return nullptr;
-    return PatternFragments.find(R)->second;
+    if (!PatternFragments.count(R))
+      return nullptr;
+    return PatternFragments.find(R)->second.get();
   }
 
-  typedef std::map<Record*, TreePattern*, LessRecordByID>::const_iterator
-          pf_iterator;
+  typedef std::map<Record *, std::unique_ptr<TreePattern>,
+                   LessRecordByID>::const_iterator pf_iterator;
   pf_iterator pf_begin() const { return PatternFragments.begin(); }
   pf_iterator pf_end() const { return PatternFragments.end(); }
 

diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index 2577ad4..d567dde 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp

@@ -314,6 +314,9 @@
   hasPostISelHook = R->getValueAsBit("hasPostISelHook");
   hasCtrlDep   = R->getValueAsBit("hasCtrlDep");
   isNotDuplicable = R->getValueAsBit("isNotDuplicable");
+  isRegSequence = R->getValueAsBit("isRegSequence");
+  isExtractSubreg = R->getValueAsBit("isExtractSubreg");
+  isInsertSubreg = R->getValueAsBit("isInsertSubreg");
 
   bool Unset;
   mayLoad      = R->getValueAsBitOrUnset("mayLoad", Unset);
@@ -520,6 +523,21 @@
     return true;
   }
 
+  // Bits<n> (also used for 0bxx literals)
+  if (BitsInit *BI = dyn_cast<BitsInit>(Arg)) {
+    if (hasSubOps || !InstOpRec->isSubClassOf("Operand"))
+      return false;
+    if (!BI->isComplete())
+      return false;
+    // Convert the bits init to an integer and use that for the result.
+    IntInit *II =
+      dyn_cast_or_null<IntInit>(BI->convertInitializerTo(IntRecTy::get()));
+    if (!II)
+      return false;
+    ResOp = ResultOperand(II->getValue());
+    return true;
+  }
+
   // If both are Operands with the same MVT, allow the conversion. It's
   // up to the user to make sure the values are appropriate, just like
   // for isel Pat's.

diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h
index f143875..92aac5f 100644
--- a/utils/TableGen/CodeGenInstruction.h
+++ b/utils/TableGen/CodeGenInstruction.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_INSTRUCTION_H
-#define CODEGEN_INSTRUCTION_H
+#ifndef LLVM_UTILS_TABLEGEN_CODEGENINSTRUCTION_H
+#define LLVM_UTILS_TABLEGEN_CODEGENINSTRUCTION_H
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineValueType.h"
@@ -253,6 +253,9 @@
     bool hasExtraDefRegAllocReq : 1;
     bool isCodeGenOnly : 1;
     bool isPseudo : 1;
+    bool isRegSequence : 1;
+    bool isExtractSubreg : 1;
+    bool isInsertSubreg : 1;
 
     std::string DeprecatedReason;
     bool HasComplexDeprecationPredicate;

diff --git a/utils/TableGen/CodeGenIntrinsics.h b/utils/TableGen/CodeGenIntrinsics.h
index a9ece01..1f1adf1 100644
--- a/utils/TableGen/CodeGenIntrinsics.h
+++ b/utils/TableGen/CodeGenIntrinsics.h

@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_INTRINSIC_H
-#define CODEGEN_INTRINSIC_H
+#ifndef LLVM_UTILS_TABLEGEN_CODEGENINTRINSICS_H
+#define LLVM_UTILS_TABLEGEN_CODEGENINTRINSICS_H
 
 #include "llvm/CodeGen/MachineValueType.h"
 #include <string>

diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index 8099f13..678222f 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp

@@ -901,9 +901,8 @@
     FindI = SuperRegClasses.find(SubIdx);
   if (FindI == SuperRegClasses.end())
     return;
-  for (SmallPtrSet<CodeGenRegisterClass*, 8>::const_iterator I =
-       FindI->second.begin(), E = FindI->second.end(); I != E; ++I)
-    Out.set((*I)->EnumValue);
+  for (CodeGenRegisterClass *RC : FindI->second)
+    Out.set(RC->EnumValue);
 }
 
 // Populate a unique sorted list of units from a register set.
@@ -967,9 +966,12 @@
 
   // Compute register name map.
   for (unsigned i = 0, e = Registers.size(); i != e; ++i)
-    RegistersByName.GetOrCreateValue(
-                       Registers[i]->TheDef->getValueAsString("AsmName"),
-                       Registers[i]);
+    // FIXME: This could just be RegistersByName[name] = register, except that
+    // causes some failures in MIPS - perhaps they have duplicate register name
+    // entries? (or maybe there's a reason for it - I don't know much about this
+    // code, just drive-by refactoring)
+    RegistersByName.insert(std::make_pair(
+        Registers[i]->TheDef->getValueAsString("AsmName"), Registers[i]));
 
   // Precompute all sub-register maps.
   // This will create Composite entries for all inferred sub-register indices.
@@ -1533,7 +1535,7 @@
   assert(RegUnitSets.empty() && "dirty RegUnitSets");
 
   // Compute a unique RegUnitSet for each RegClass.
-  const ArrayRef<CodeGenRegisterClass*> &RegClasses = getRegClasses();
+  ArrayRef<CodeGenRegisterClass*> RegClasses = getRegClasses();
   unsigned NumRegClasses = RegClasses.size();
   for (unsigned RCIdx = 0, RCEnd = NumRegClasses; RCIdx != RCEnd; ++RCIdx) {
     if (!RegClasses[RCIdx]->Allocatable)

diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index 278315b..c1e37fa 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_REGISTERS_H
-#define CODEGEN_REGISTERS_H
+#ifndef LLVM_UTILS_TABLEGEN_CODEGENREGISTERS_H
+#define LLVM_UTILS_TABLEGEN_CODEGENREGISTERS_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"

diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index 79d60ac..4cf7b5f 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp

@@ -182,7 +182,7 @@
 // Recursively find all reachable SchedReadWrite records.
 static void scanSchedRW(Record *RWDef, RecVec &RWDefs,
                         SmallPtrSet<Record*, 16> &RWSet) {
-  if (!RWSet.insert(RWDef))
+  if (!RWSet.insert(RWDef).second)
     return;
   RWDefs.push_back(RWDef);
   // Reads don't current have sequence records, but it can be added later.
@@ -751,7 +751,7 @@
     for (ArrayRef<Record*>::const_iterator
            II = InstDefs.begin(), IE = InstDefs.end(); II != IE; ++II) {
       unsigned OldSCIdx = InstrClassMap[*II];
-      if (OldSCIdx && RemappedClassIDs.insert(OldSCIdx)) {
+      if (OldSCIdx && RemappedClassIDs.insert(OldSCIdx).second) {
         for (RecIter RI = SchedClasses[OldSCIdx].InstRWs.begin(),
                RE = SchedClasses[OldSCIdx].InstRWs.end(); RI != RE; ++RI) {
           if ((*RI)->getValueAsDef("SchedModel") == RWModelDef) {

diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h
index 3fef8ad..e5241b9 100644
--- a/utils/TableGen/CodeGenSchedule.h
+++ b/utils/TableGen/CodeGenSchedule.h

@@ -12,8 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_SCHEDULE_H
-#define CODEGEN_SCHEDULE_H
+#ifndef LLVM_UTILS_TABLEGEN_CODEGENSCHEDULE_H
+#define LLVM_UTILS_TABLEGEN_CODEGENSCHEDULE_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"

diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index d1b5711..597da68 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp

@@ -301,7 +301,8 @@
       "GC_LABEL",     "KILL",          "EXTRACT_SUBREG",   "INSERT_SUBREG",
       "IMPLICIT_DEF", "SUBREG_TO_REG", "COPY_TO_REGCLASS", "DBG_VALUE",
       "REG_SEQUENCE", "COPY",          "BUNDLE",           "LIFETIME_START",
-      "LIFETIME_END", "STACKMAP",      "PATCHPOINT",       nullptr};
+      "LIFETIME_END", "STACKMAP",      "PATCHPOINT",       "LOAD_STACK_GUARD",
+      nullptr};
   const DenseMap<const Record*, CodeGenInstruction*> &Insts = getInstructions();
   for (const char *const *p = FixedInstrs; *p; ++p) {
     const CodeGenInstruction *Instr = GetInstByName(*p, Insts, Records);

diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h
index 5414310..f4e1b6a 100644
--- a/utils/TableGen/CodeGenTarget.h
+++ b/utils/TableGen/CodeGenTarget.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef CODEGEN_TARGET_H
-#define CODEGEN_TARGET_H
+#ifndef LLVM_UTILS_TABLEGEN_CODEGENTARGET_H
+#define LLVM_UTILS_TABLEGEN_CODEGENTARGET_H
 
 #include "CodeGenInstruction.h"
 #include "CodeGenRegisters.h"

diff --git a/utils/TableGen/DAGISelEmitter.cpp b/utils/TableGen/DAGISelEmitter.cpp
index 82682cd..e2e6ab1 100644
--- a/utils/TableGen/DAGISelEmitter.cpp
+++ b/utils/TableGen/DAGISelEmitter.cpp

@@ -94,8 +94,8 @@
     // Otherwise, if the patterns might both match, sort based on complexity,
     // which means that we prefer to match patterns that cover more nodes in the
     // input over nodes that cover fewer.
-    unsigned LHSSize = LHS->getPatternComplexity(CGP);
-    unsigned RHSSize = RHS->getPatternComplexity(CGP);
+    int LHSSize = LHS->getPatternComplexity(CGP);
+    int RHSSize = RHS->getPatternComplexity(CGP);
     if (LHSSize > RHSSize) return true;   // LHS -> bigger -> less cost
     if (LHSSize < RHSSize) return false;
 

diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h
index f8f6c54..b9cb267 100644
--- a/utils/TableGen/DAGISelMatcher.h
+++ b/utils/TableGen/DAGISelMatcher.h

@@ -1,4 +1,4 @@
-//===- DAGISelMatcher.h - Representation of DAG pattern matcher -----------===//
+//===- DAGISelMatcher.h - Representation of DAG pattern matcher -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TBLGEN_DAGISELMATCHER_H
-#define TBLGEN_DAGISELMATCHER_H
+#ifndef LLVM_UTILS_TABLEGEN_DAGISELMATCHER_H
+#define LLVM_UTILS_TABLEGEN_DAGISELMATCHER_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"

diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp
index 0059570..302f27b 100644
--- a/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/utils/TableGen/DAGISelMatcherEmitter.cpp

@@ -630,7 +630,7 @@
 
   for (CodeGenDAGPatterns::pf_iterator I = CGP.pf_begin(), E = CGP.pf_end();
        I != E; ++I)
-    PFsByName[I->first->getName()] = I->second;
+    PFsByName[I->first->getName()] = I->second.get();
 
   if (!NodePredicates.empty()) {
     OS << "bool CheckNodePredicate(SDNode *Node,\n";

diff --git a/utils/TableGen/DAGISelMatcherGen.cpp b/utils/TableGen/DAGISelMatcherGen.cpp
index 97e37ba..4a73b00 100644
--- a/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/utils/TableGen/DAGISelMatcherGen.cpp

@@ -718,7 +718,7 @@
   CodeGenInstruction &II = CGT.getInstruction(Op);
   const DAGInstruction &Inst = CGP.getInstruction(Op);
 
-  // If we can, get the pattern for the instruction we're generating.  We derive
+  // If we can, get the pattern for the instruction we're generating. We derive
   // a variety of information from this pattern, such as whether it has a chain.
   //
   // FIXME2: This is extremely dubious for several reasons, not the least of
@@ -755,16 +755,21 @@
   // the "outs" list.
   unsigned NumResults = Inst.getNumResults();
 
-  // Loop over all of the operands of the instruction pattern, emitting code
-  // to fill them all in.  The node 'N' usually has number children equal to
-  // the number of input operands of the instruction.  However, in cases
-  // where there are predicate operands for an instruction, we need to fill
-  // in the 'execute always' values.  Match up the node operands to the
-  // instruction operands to do this.
-  SmallVector<unsigned, 8> InstOps;
-  for (unsigned ChildNo = 0, InstOpNo = NumResults, e = II.Operands.size();
-       InstOpNo != e; ++InstOpNo) {
+  // Number of operands we know the output instruction must have. If it is
+  // variadic, we could have more operands.
+  unsigned NumFixedOperands = II.Operands.size();
 
+  SmallVector<unsigned, 8> InstOps;
+
+  // Loop over all of the fixed operands of the instruction pattern, emitting
+  // code to fill them all in. The node 'N' usually has a number of children
+  // equal to the number of input operands of the instruction. However, in
+  // cases where there are predicate operands for an instruction, we need to
+  // fill in the 'execute always' values. Match up the node operands to the
+  // instruction operands to do this.
+  unsigned ChildNo = 0;
+  for (unsigned InstOpNo = NumResults, e = NumFixedOperands;
+       InstOpNo != e; ++InstOpNo) {
     // Determine what to emit for this operand.
     Record *OperandNode = II.Operands[InstOpNo].Rec;
     if (OperandNode->isSubClassOf("OperandWithDefaultOps") &&
@@ -807,6 +812,16 @@
     }
   }
 
+  // If this is a variadic output instruction (i.e. REG_SEQUENCE), we can't
+  // expand suboperands, use default operands, or other features determined from
+  // the CodeGenInstruction after the fixed operands, which were handled
+  // above. Emit the remaining operands implicitly added by the use of
+  // variable_ops.
+  if (II.Operands.isVariadic) {
+    for (unsigned I = ChildNo, E = N->getNumChildren(); I < E; ++I)
+      EmitResultOperand(N->getChild(I), InstOps);
+  }
+
   // If this node has input glue or explicitly specified input physregs, we
   // need to add chained and glued copyfromreg nodes and materialize the glue
   // input.
@@ -852,7 +867,7 @@
   // gets the excess operands from the input DAG.
   int NumFixedArityOperands = -1;
   if (isRoot &&
-      (Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP)))
+      Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP))
     NumFixedArityOperands = Pattern.getSrcPattern()->getNumChildren();
 
   // If this is the root node and multiple matched nodes in the input pattern

diff --git a/utils/TableGen/DAGISelMatcherOpt.cpp b/utils/TableGen/DAGISelMatcherOpt.cpp
index 0b117eb..7a22764 100644
--- a/utils/TableGen/DAGISelMatcherOpt.cpp
+++ b/utils/TableGen/DAGISelMatcherOpt.cpp

@@ -185,7 +185,7 @@
 /// Conceptually, we'd like to sink these predicates all the way to the last
 /// matcher predicate in the series.  However, it turns out that some
 /// ComplexPatterns have side effects on the graph, so we really don't want to
-/// run a the complex pattern if the pattern predicate will fail.  For this
+/// run a complex pattern if the pattern predicate will fail.  For this
 /// reason, we refuse to sink the pattern predicate past a ComplexPattern.
 ///
 static void SinkPatternPredicates(std::unique_ptr<Matcher> &MatcherPtr) {
@@ -400,7 +400,7 @@
   }
   
   if (NewOptionsToMatch.empty()) {
-    MatcherPtr.reset(nullptr);
+    MatcherPtr.reset();
     return;
   }
   
@@ -454,7 +454,7 @@
     SmallVector<std::pair<const SDNodeInfo*, Matcher*>, 8> Cases;
     for (unsigned i = 0, e = NewOptionsToMatch.size(); i != e; ++i) {
       CheckOpcodeMatcher *COM = cast<CheckOpcodeMatcher>(NewOptionsToMatch[i]);
-      assert(Opcodes.insert(COM->getOpcode().getEnumName()) &&
+      assert(Opcodes.insert(COM->getOpcode().getEnumName()).second &&
              "Duplicate opcodes not factored?");
       Cases.push_back(std::make_pair(&COM->getOpcode(), COM->getNext()));
     }

diff --git a/utils/TableGen/FastISelEmitter.cpp b/utils/TableGen/FastISelEmitter.cpp
index 154f96d..748c923 100644
--- a/utils/TableGen/FastISelEmitter.cpp
+++ b/utils/TableGen/FastISelEmitter.cpp

@@ -19,6 +19,7 @@
 
 #include "CodeGenDAGPatterns.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TableGen/Error.h"
@@ -36,6 +37,7 @@
   const CodeGenRegisterClass *RC;
   std::string SubRegNo;
   std::vector<std::string>* PhysRegs;
+  std::string PredicateCheck;
 };
 } // End anonymous namespace
 
@@ -347,7 +349,7 @@
         // Implicit physical register operand. e.g. Instruction::Mul expect to
         // select to a binary op. On x86, mul may take a single operand with
         // the other operand being implicit. We must emit something that looks
-        // like a binary instruction except for the very inner FastEmitInst_*
+        // like a binary instruction except for the very inner fastEmitInst_*
         // call.
         continue;
       Operands[i].printManglingSuffix(OS, ImmPredicates, StripImmCodes);
@@ -364,7 +366,9 @@
 
 namespace {
 class FastISelMap {
-  typedef std::map<std::string, InstructionMemo> PredMap;
+  // A multimap is needed instead of a "plain" map because the key is
+  // the instruction's complexity (an int), which is not unique.
+  typedef std::multimap<int, InstructionMemo> PredMap;
   typedef std::map<MVT::SimpleValueType, PredMap> RetPredMap;
   typedef std::map<MVT::SimpleValueType, RetPredMap> TypeRetPredMap;
   typedef std::map<std::string, TypeRetPredMap> OpcodeTypeRetPredMap;
@@ -373,6 +377,16 @@
 
   OperandsOpcodeTypeRetPredMap SimplePatterns;
 
+  // This is used to check that there are no duplicate predicates.
+  typedef std::multimap<std::string, bool> PredCheckMap;
+  typedef std::map<MVT::SimpleValueType, PredCheckMap> RetPredCheckMap;
+  typedef std::map<MVT::SimpleValueType, RetPredCheckMap> TypeRetPredCheckMap;
+  typedef std::map<std::string, TypeRetPredCheckMap> OpcodeTypeRetPredCheckMap;
+  typedef std::map<OperandsSignature, OpcodeTypeRetPredCheckMap>
+            OperandsOpcodeTypeRetPredCheckMap;
+
+  OperandsOpcodeTypeRetPredCheckMap SimplePatternsCheck;
+
   std::map<OperandsSignature, std::vector<OperandsSignature> >
     SignaturesWithConstantForms;
 
@@ -384,6 +398,11 @@
   void collectPatterns(CodeGenDAGPatterns &CGP);
   void printImmediatePredicates(raw_ostream &OS);
   void printFunctionDefinitions(raw_ostream &OS);
+private:  
+  void emitInstructionCode(raw_ostream &OS, 
+                           const OperandsSignature &Operands,
+                           const PredMap &PM, 
+                           const std::string &RetVTName);
 };
 } // End anonymous namespace
 
@@ -541,6 +560,17 @@
         continue;
     }
 
+    // Check if the operands match one of the patterns handled by FastISel.
+    std::string ManglingSuffix;
+    raw_string_ostream SuffixOS(ManglingSuffix);
+    Operands.PrintManglingSuffix(SuffixOS, ImmediatePredicates, true);
+    SuffixOS.flush();
+    if (!StringSwitch<bool>(ManglingSuffix)
+        .Cases("", "r", "rr", "ri", "rf", true)
+        .Cases("rri", "i", "f", true)
+        .Default(false))
+      continue;
+
     // Get the predicate that guards this pattern.
     std::string PredicateCheck = Pattern.getPredicateCheck();
 
@@ -549,14 +579,24 @@
       Pattern.getDstPattern()->getOperator()->getName(),
       DstRC,
       SubRegNo,
-      PhysRegInputs
+      PhysRegInputs,
+      PredicateCheck
     };
+    
+    int complexity = Pattern.getPatternComplexity(CGP);
 
-    if (SimplePatterns[Operands][OpcodeName][VT][RetVT].count(PredicateCheck))
+    if (SimplePatternsCheck[Operands][OpcodeName][VT]
+         [RetVT].count(PredicateCheck)) {
       PrintFatalError(Pattern.getSrcRecord()->getLoc(),
-                    "Duplicate record in FastISel table!");
+                    "Duplicate predicate in FastISel table!");
+    }
+    SimplePatternsCheck[Operands][OpcodeName][VT][RetVT].insert(
+            std::make_pair(PredicateCheck, true));
 
-    SimplePatterns[Operands][OpcodeName][VT][RetVT][PredicateCheck] = Memo;
+    // Note: Instructions with the same complexity are stored in the order
+    // that they are encountered; emitInstructionCode walks them in reverse.
+    SimplePatterns[Operands][OpcodeName][VT][RetVT].insert(
+      std::make_pair(complexity, Memo));
 
     // If any of the operands were immediates with predicates on them, strip
     // them down to a signature that doesn't have predicates so that we can
@@ -582,6 +622,72 @@
   OS << "\n\n";
 }
 
+void FastISelMap::emitInstructionCode(raw_ostream &OS, 
+                                      const OperandsSignature &Operands,
+                                      const PredMap &PM, 
+                                      const std::string &RetVTName) {
+  // Emit the body (and closing brace) of one generated fastEmit_* function:
+  // one return per entry in PM. There may be multiple entries if there are
+  // subtarget concerns; rbegin() yields the highest complexity key first.
+
+  bool OneHadNoPredicate = false; // True once an unguarded return is emitted.
+  for (PredMap::const_reverse_iterator PI = PM.rbegin(), PE = PM.rend();
+       PI != PE; ++PI) {
+    const InstructionMemo &Memo = PI->second;
+    std::string PredicateCheck = Memo.PredicateCheck;
+
+    if (PredicateCheck.empty()) {
+      assert(!OneHadNoPredicate &&
+             "Multiple instructions match and more than one had "
+             "no predicate!");
+      OneHadNoPredicate = true;
+    } else {
+      if (OneHadNoPredicate) {
+        // FIXME: This should be a PrintError once the x86 target
+        // fixes PR21575.
+        PrintWarning("Multiple instructions match and one with no "
+                     "predicate came before one with a predicate!  "
+                     "name:" + Memo.Name + "  predicate: " + 
+                     PredicateCheck);
+      }
+      OS << "  if (" + PredicateCheck + ") {\n"; // Open the predicate guard.
+      OS << "  "; // Extra indent for the first line emitted inside the guard.
+    }
+
+    for (unsigned i = 0; i < Memo.PhysRegs->size(); ++i) {
+      if ((*Memo.PhysRegs)[i] != "") // Non-empty: COPY Op<i> into it.
+        OS << "  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, "
+           << "TII.get(TargetOpcode::COPY), "
+           << (*Memo.PhysRegs)[i] << ").addReg(Op" << i << ");\n";
+    }
+
+    OS << "  return fastEmitInst_";
+    if (Memo.SubRegNo.empty()) { // fastEmitInst_<suffix>(opcode, RC, ...)
+      Operands.PrintManglingSuffix(OS, *Memo.PhysRegs,
+     ImmediatePredicates, true);
+      OS << "(" << InstNS << Memo.Name << ", ";
+      OS << "&" << InstNS << Memo.RC->getName() << "RegClass";
+      if (!Operands.empty())
+        OS << ", ";
+      Operands.PrintArguments(OS, *Memo.PhysRegs);
+      OS << ");\n";
+    } else { // fastEmitInst_extractsubreg(RetVT, Op0, Op0IsKill, SubRegNo)
+      OS << "extractsubreg(" << RetVTName
+         << ", Op0, Op0IsKill, " << Memo.SubRegNo << ");\n";
+    }
+
+    if (!PredicateCheck.empty()) { // Close the predicate guard.
+      OS << "  }\n";
+    }
+  }
+  // If every candidate was predicated, the generated code can fall through
+  // all of the guards, so make it return 0 in that case.
+  if (!OneHadNoPredicate)
+    OS << "  return 0;\n";
+  OS << "}\n";
+  OS << "\n";
+}
+
 
 void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
   // Now emit code for all the patterns that we collected.
@@ -608,9 +714,8 @@
                RI != RE; ++RI) {
             MVT::SimpleValueType RetVT = RI->first;
             const PredMap &PM = RI->second;
-            bool HasPred = false;
 
-            OS << "unsigned FastEmit_"
+            OS << "unsigned fastEmit_"
                << getLegalCName(Opcode)
                << "_" << getLegalCName(getName(VT))
                << "_" << getLegalCName(getName(RetVT)) << "_";
@@ -619,58 +724,11 @@
             Operands.PrintParameters(OS);
             OS << ") {\n";
 
-            // Emit code for each possible instruction. There may be
-            // multiple if there are subtarget concerns.
-            for (PredMap::const_iterator PI = PM.begin(), PE = PM.end();
-                 PI != PE; ++PI) {
-              std::string PredicateCheck = PI->first;
-              const InstructionMemo &Memo = PI->second;
-
-              if (PredicateCheck.empty()) {
-                assert(!HasPred &&
-                       "Multiple instructions match, at least one has "
-                       "a predicate and at least one doesn't!");
-              } else {
-                OS << "  if (" + PredicateCheck + ") {\n";
-                OS << "  ";
-                HasPred = true;
-              }
-
-              for (unsigned i = 0; i < Memo.PhysRegs->size(); ++i) {
-                if ((*Memo.PhysRegs)[i] != "")
-                  OS << "  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, "
-                     << "TII.get(TargetOpcode::COPY), "
-                     << (*Memo.PhysRegs)[i] << ").addReg(Op" << i << ");\n";
-              }
-
-              OS << "  return FastEmitInst_";
-              if (Memo.SubRegNo.empty()) {
-                Operands.PrintManglingSuffix(OS, *Memo.PhysRegs,
-                                             ImmediatePredicates, true);
-                OS << "(" << InstNS << Memo.Name << ", ";
-                OS << "&" << InstNS << Memo.RC->getName() << "RegClass";
-                if (!Operands.empty())
-                  OS << ", ";
-                Operands.PrintArguments(OS, *Memo.PhysRegs);
-                OS << ");\n";
-              } else {
-                OS << "extractsubreg(" << getName(RetVT);
-                OS << ", Op0, Op0IsKill, " << Memo.SubRegNo << ");\n";
-              }
-
-              if (HasPred)
-                OS << "  }\n";
-
-            }
-            // Return 0 if none of the predicates were satisfied.
-            if (HasPred)
-              OS << "  return 0;\n";
-            OS << "}\n";
-            OS << "\n";
+            emitInstructionCode(OS, Operands, PM, getName(RetVT));
           }
 
           // Emit one function for the type that demultiplexes on return type.
-          OS << "unsigned FastEmit_"
+          OS << "unsigned fastEmit_"
              << getLegalCName(Opcode) << "_"
              << getLegalCName(getName(VT)) << "_";
           Operands.PrintManglingSuffix(OS, ImmediatePredicates);
@@ -682,7 +740,7 @@
           for (RetPredMap::const_iterator RI = RM.begin(), RE = RM.end();
                RI != RE; ++RI) {
             MVT::SimpleValueType RetVT = RI->first;
-            OS << "  case " << getName(RetVT) << ": return FastEmit_"
+            OS << "  case " << getName(RetVT) << ": return fastEmit_"
                << getLegalCName(Opcode) << "_" << getLegalCName(getName(VT))
                << "_" << getLegalCName(getName(RetVT)) << "_";
             Operands.PrintManglingSuffix(OS, ImmediatePredicates);
@@ -694,7 +752,7 @@
 
         } else {
           // Non-variadic return type.
-          OS << "unsigned FastEmit_"
+          OS << "unsigned fastEmit_"
              << getLegalCName(Opcode) << "_"
              << getLegalCName(getName(VT)) << "_";
           Operands.PrintManglingSuffix(OS, ImmediatePredicates);
@@ -708,63 +766,13 @@
              << ")\n    return 0;\n";
 
           const PredMap &PM = RM.begin()->second;
-          bool HasPred = false;
 
-          // Emit code for each possible instruction. There may be
-          // multiple if there are subtarget concerns.
-          for (PredMap::const_iterator PI = PM.begin(), PE = PM.end(); PI != PE;
-               ++PI) {
-            std::string PredicateCheck = PI->first;
-            const InstructionMemo &Memo = PI->second;
-
-            if (PredicateCheck.empty()) {
-              assert(!HasPred &&
-                     "Multiple instructions match, at least one has "
-                     "a predicate and at least one doesn't!");
-            } else {
-              OS << "  if (" + PredicateCheck + ") {\n";
-              OS << "  ";
-              HasPred = true;
-            }
-
-            for (unsigned i = 0; i < Memo.PhysRegs->size(); ++i) {
-              if ((*Memo.PhysRegs)[i] != "")
-                OS << "  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, "
-                   << "TII.get(TargetOpcode::COPY), "
-                   << (*Memo.PhysRegs)[i] << ").addReg(Op" << i << ");\n";
-            }
-
-            OS << "  return FastEmitInst_";
-
-            if (Memo.SubRegNo.empty()) {
-              Operands.PrintManglingSuffix(OS, *Memo.PhysRegs,
-                                           ImmediatePredicates, true);
-              OS << "(" << InstNS << Memo.Name << ", ";
-              OS << "&" << InstNS << Memo.RC->getName() << "RegClass";
-              if (!Operands.empty())
-                OS << ", ";
-              Operands.PrintArguments(OS, *Memo.PhysRegs);
-              OS << ");\n";
-            } else {
-              OS << "extractsubreg(RetVT, Op0, Op0IsKill, ";
-              OS << Memo.SubRegNo;
-              OS << ");\n";
-            }
-
-             if (HasPred)
-               OS << "  }\n";
-          }
-
-          // Return 0 if none of the predicates were satisfied.
-          if (HasPred)
-            OS << "  return 0;\n";
-          OS << "}\n";
-          OS << "\n";
+          emitInstructionCode(OS, Operands, PM, "RetVT");
         }
       }
 
       // Emit one function for the opcode that demultiplexes based on the type.
-      OS << "unsigned FastEmit_"
+      OS << "unsigned fastEmit_"
          << getLegalCName(Opcode) << "_";
       Operands.PrintManglingSuffix(OS, ImmediatePredicates);
       OS << "(MVT VT, MVT RetVT";
@@ -777,7 +785,7 @@
            TI != TE; ++TI) {
         MVT::SimpleValueType VT = TI->first;
         std::string TypeName = getName(VT);
-        OS << "  case " << TypeName << ": return FastEmit_"
+        OS << "  case " << TypeName << ": return fastEmit_"
            << getLegalCName(Opcode) << "_" << getLegalCName(TypeName) << "_";
         Operands.PrintManglingSuffix(OS, ImmediatePredicates);
         OS << "(RetVT";
@@ -797,13 +805,16 @@
 
     // Emit one function for the operand signature that demultiplexes based
     // on opcode and type.
-    OS << "unsigned FastEmit_";
+    OS << "unsigned fastEmit_";
     Operands.PrintManglingSuffix(OS, ImmediatePredicates);
     OS << "(MVT VT, MVT RetVT, unsigned Opcode";
     if (!Operands.empty())
       OS << ", ";
     Operands.PrintParameters(OS);
-    OS << ") {\n";
+    OS << ") ";
+    if (!Operands.hasAnyImmediateCodes())
+      OS << "override ";
+    OS << "{\n";
 
     // If there are any forms of this signature available that operate on
     // constrained forms of the immediate (e.g., 32-bit sext immediate in a
@@ -823,7 +834,7 @@
       for (unsigned i = 0, e = MI->second.size(); i != e; ++i) {
         OS << "  if (";
         MI->second[i].emitImmediatePredicate(OS, ImmediatePredicates);
-        OS << ")\n    if (unsigned Reg = FastEmit_";
+        OS << ")\n    if (unsigned Reg = fastEmit_";
         MI->second[i].PrintManglingSuffix(OS, ImmediatePredicates);
         OS << "(VT, RetVT, Opcode";
         if (!MI->second[i].empty())
@@ -841,7 +852,7 @@
          I != E; ++I) {
       const std::string &Opcode = I->first;
 
-      OS << "  case " << Opcode << ": return FastEmit_"
+      OS << "  case " << Opcode << ": return fastEmit_"
          << getLegalCName(Opcode) << "_";
       Operands.PrintManglingSuffix(OS, ImmediatePredicates);
       OS << "(VT, RetVT";

diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index 42639cc..bd83b6c 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp

@@ -230,7 +230,7 @@
   std::vector<unsigned> VariableInstructions;
 
   // Map of well-known segment value to its delegate.
-  std::map<unsigned, const FilterChooser*> FilterChooserMap;
+  std::map<unsigned, std::unique_ptr<const FilterChooser>> FilterChooserMap;
 
   // Number of instructions which fall under FilteredInstructions category.
   unsigned NumFiltered;
@@ -252,7 +252,7 @@
     return *(FilterChooserMap.find((unsigned)-1)->second);
   }
 
-  Filter(const Filter &f);
+  Filter(Filter &&f);
   Filter(FilterChooser &owner, unsigned startBit, unsigned numBits, bool mixed);
 
   ~Filter();
@@ -333,13 +333,9 @@
   // Parent emitter
   const FixedLenDecoderEmitter *Emitter;
 
+  FilterChooser(const FilterChooser &) LLVM_DELETED_FUNCTION;
+  void operator=(const FilterChooser &) LLVM_DELETED_FUNCTION;
 public:
-  FilterChooser(const FilterChooser &FC)
-    : AllInstructions(FC.AllInstructions), Opcodes(FC.Opcodes),
-      Operands(FC.Operands), Filters(FC.Filters),
-      FilterBitValues(FC.FilterBitValues), Parent(FC.Parent),
-      BestIndex(FC.BestIndex), BitWidth(FC.BitWidth),
-      Emitter(FC.Emitter) { }
 
   FilterChooser(const std::vector<const CodeGenInstruction*> &Insts,
                 const std::vector<unsigned> &IDs,
@@ -347,10 +343,8 @@
                 unsigned BW,
                 const FixedLenDecoderEmitter *E)
     : AllInstructions(Insts), Opcodes(IDs), Operands(Ops), Filters(),
-      Parent(nullptr), BestIndex(-1), BitWidth(BW), Emitter(E) {
-    for (unsigned i = 0; i < BitWidth; ++i)
-      FilterBitValues.push_back(BIT_UNFILTERED);
-
+      FilterBitValues(BW, BIT_UNFILTERED), Parent(nullptr), BestIndex(-1),
+      BitWidth(BW), Emitter(E) {
     doFilter();
   }
 
@@ -490,11 +484,11 @@
 //                       //
 ///////////////////////////
 
-Filter::Filter(const Filter &f)
+Filter::Filter(Filter &&f)
   : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), Mixed(f.Mixed),
-    FilteredInstructions(f.FilteredInstructions),
-    VariableInstructions(f.VariableInstructions),
-    FilterChooserMap(f.FilterChooserMap), NumFiltered(f.NumFiltered),
+    FilteredInstructions(std::move(f.FilteredInstructions)),
+    VariableInstructions(std::move(f.VariableInstructions)),
+    FilterChooserMap(std::move(f.FilterChooserMap)), NumFiltered(f.NumFiltered),
     LastOpcFiltered(f.LastOpcFiltered) {
 }
 
@@ -534,12 +528,6 @@
 }
 
 Filter::~Filter() {
-  std::map<unsigned, const FilterChooser*>::iterator filterIterator;
-  for (filterIterator = FilterChooserMap.begin();
-       filterIterator != FilterChooserMap.end();
-       filterIterator++) {
-    delete filterIterator->second;
-  }
 }
 
 // Divides the decoding task into sub tasks and delegates them to the
@@ -561,14 +549,10 @@
 
     // Delegates to an inferior filter chooser for further processing on this
     // group of instructions whose segment values are variable.
-    FilterChooserMap.insert(std::pair<unsigned, const FilterChooser*>(
-                              (unsigned)-1,
-                              new FilterChooser(Owner->AllInstructions,
-                                                VariableInstructions,
-                                                Owner->Operands,
-                                                BitValueArray,
-                                                *Owner)
-                              ));
+    FilterChooserMap.insert(
+        std::make_pair(-1U, llvm::make_unique<FilterChooser>(
+                                Owner->AllInstructions, VariableInstructions,
+                                Owner->Operands, BitValueArray, *Owner)));
   }
 
   // No need to recurse for a singleton filtered instruction.
@@ -594,14 +578,10 @@
 
     // Delegates to an inferior filter chooser for further processing on this
     // category of instructions.
-    FilterChooserMap.insert(std::pair<unsigned, const FilterChooser*>(
-                              mapIterator->first,
-                              new FilterChooser(Owner->AllInstructions,
-                                                mapIterator->second,
-                                                Owner->Operands,
-                                                BitValueArray,
-                                                *Owner)
-                              ));
+    FilterChooserMap.insert(std::make_pair(
+        mapIterator->first, llvm::make_unique<FilterChooser>(
+                                Owner->AllInstructions, mapIterator->second,
+                                Owner->Operands, BitValueArray, *Owner)));
   }
 }
 
@@ -636,7 +616,8 @@
   // A new filter entry begins a new scope for fixup resolution.
   TableInfo.FixupStack.push_back(FixupList());
 
-  std::map<unsigned, const FilterChooser*>::const_iterator filterIterator;
+  std::map<unsigned,
+           std::unique_ptr<const FilterChooser>>::const_iterator filterIterator;
 
   DecoderTable &Table = TableInfo.Table;
 
@@ -1066,19 +1047,17 @@
                                      const OperandInfo &OpInfo) const {
   const std::string &Decoder = OpInfo.Decoder;
 
-  if (OpInfo.numFields() == 1) {
-    OperandInfo::const_iterator OI = OpInfo.begin();
-    o.indent(Indentation) << "tmp = fieldFromInstruction"
-                          << "(insn, " << OI->Base << ", " << OI->Width
-                          << ");\n";
-  } else {
+  if (OpInfo.numFields() != 1)
     o.indent(Indentation) << "tmp = 0;\n";
-    for (OperandInfo::const_iterator OI = OpInfo.begin(), OE = OpInfo.end();
-         OI != OE; ++OI) {
-      o.indent(Indentation) << "tmp |= (fieldFromInstruction"
-                            << "(insn, " << OI->Base << ", " << OI->Width
-                            << ") << " << OI->Offset << ");\n";
-    }
+
+  for (const EncodingField &EF : OpInfo) {
+    o.indent(Indentation) << "tmp ";
+    if (OpInfo.numFields() != 1) o << '|';
+    o << "= fieldFromInstruction"
+      << "(insn, " << EF.Base << ", " << EF.Width << ')';
+    if (OpInfo.numFields() != 1 || EF.Offset != 0)
+      o << " << " << EF.Offset;
+    o << ";\n";
   }
 
   if (Decoder != "")
@@ -1384,8 +1363,7 @@
 void FilterChooser::runSingleFilter(unsigned startBit, unsigned numBit,
                                     bool mixed) {
   Filters.clear();
-  Filter F(*this, startBit, numBit, true);
-  Filters.push_back(F);
+  Filters.push_back(Filter(*this, startBit, numBit, true));
   BestIndex = 0; // Sole Filter instance to choose from.
   bestFilter().recurse();
 }

diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index 76f05ce..6fdf22d 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp

@@ -266,6 +266,7 @@
   OS << "#undef GET_INSTRINFO_NAMED_OPS\n";
   OS << "namespace llvm {";
   OS << "namespace " << Namespace << " {\n";
+  OS << "LLVM_READONLY\n";
   OS << "int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx) {\n";
   if (!Operands.empty()) {
     OS << "  static const int16_t OperandMap [][" << Operands.size()
@@ -504,6 +505,9 @@
   if (Inst.isAsCheapAsAMove)   OS << "|(1<<MCID::CheapAsAMove)";
   if (Inst.hasExtraSrcRegAllocReq) OS << "|(1<<MCID::ExtraSrcRegAllocReq)";
   if (Inst.hasExtraDefRegAllocReq) OS << "|(1<<MCID::ExtraDefRegAllocReq)";
+  if (Inst.isRegSequence) OS << "|(1<<MCID::RegSequence)";
+  if (Inst.isExtractSubreg) OS << "|(1<<MCID::ExtractSubreg)";
+  if (Inst.isInsertSubreg) OS << "|(1<<MCID::InsertSubreg)";
 
   // Emit all of the target-specific flags...
   BitsInit *TSF = Inst.TheDef->getValueAsBitsInit("TSFlags");

diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index 430ef32..37f6de0 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp

@@ -129,8 +129,9 @@
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     OS << "    " << Ints[i].EnumName;
     OS << ((i != e-1) ? ", " : "  ");
-    OS << std::string(40-Ints[i].EnumName.size(), ' ')
-      << "// " << Ints[i].Name << "\n";
+    if (Ints[i].EnumName.size() < 40)
+      OS << std::string(40-Ints[i].EnumName.size(), ' ');
+    OS << " // " << Ints[i].Name << "\n";
   }
   OS << "#endif\n\n";
 }
@@ -243,19 +244,20 @@
   IIT_ARG  = 15,
 
   // Values from 16+ are only encodable with the inefficient encoding.
-  IIT_MMX  = 16,
-  IIT_METADATA = 17,
-  IIT_EMPTYSTRUCT = 18,
-  IIT_STRUCT2 = 19,
-  IIT_STRUCT3 = 20,
-  IIT_STRUCT4 = 21,
-  IIT_STRUCT5 = 22,
-  IIT_EXTEND_ARG = 23,
-  IIT_TRUNC_ARG = 24,
-  IIT_ANYPTR = 25,
-  IIT_V1   = 26,
-  IIT_VARARG = 27,
-  IIT_HALF_VEC_ARG = 28
+  IIT_V64  = 16,
+  IIT_MMX  = 17,
+  IIT_METADATA = 18,
+  IIT_EMPTYSTRUCT = 19,
+  IIT_STRUCT2 = 20,
+  IIT_STRUCT3 = 21,
+  IIT_STRUCT4 = 22,
+  IIT_STRUCT5 = 23,
+  IIT_EXTEND_ARG = 24,
+  IIT_TRUNC_ARG = 25,
+  IIT_ANYPTR = 26,
+  IIT_V1   = 27,
+  IIT_VARARG = 28,
+  IIT_HALF_VEC_ARG = 29
 };
 
 
@@ -355,6 +357,7 @@
     case 8: Sig.push_back(IIT_V8); break;
     case 16: Sig.push_back(IIT_V16); break;
     case 32: Sig.push_back(IIT_V32); break;
+    case 64: Sig.push_back(IIT_V64); break;
     }
 
     return EncodeFixedValueType(VVT.getVectorElementType().SimpleTy, Sig);
@@ -679,8 +682,7 @@
 
   OS << "    }\n";
   OS << "  }\n";
-  OS << "  return AttributeSet::get(C, ArrayRef<AttributeSet>(AS, "
-             "NumAttrs));\n";
+  OS << "  return AttributeSet::get(C, makeArrayRef(AS, NumAttrs));\n";
   OS << "}\n";
   OS << "#endif // GET_INTRINSIC_ATTRIBUTES\n\n";
 }

diff --git a/utils/TableGen/OptParserEmitter.cpp b/utils/TableGen/OptParserEmitter.cpp
index c5fd7ee..9262d7c 100644
--- a/utils/TableGen/OptParserEmitter.cpp
+++ b/utils/TableGen/OptParserEmitter.cpp

@@ -221,9 +221,11 @@
 
     // The containing option group (if any).
     OS << ", ";
-    if (const DefInit *DI = dyn_cast<DefInit>(R.getValueInit("Group")))
+    const ListInit *GroupFlags = nullptr;
+    if (const DefInit *DI = dyn_cast<DefInit>(R.getValueInit("Group"))) {
+      GroupFlags = DI->getDef()->getValueAsListInit("Flags");
       OS << getOptionName(*DI->getDef());
-    else
+    } else
       OS << "INVALID";
 
     // The option alias (if any).
@@ -249,17 +251,19 @@
     }
 
     // The option flags.
+    OS << ", ";
+    int NumFlags = 0;
     const ListInit *LI = R.getValueAsListInit("Flags");
-    if (LI->empty()) {
-      OS << ", 0";
-    } else {
-      OS << ", ";
-      for (unsigned i = 0, e = LI->size(); i != e; ++i) {
-        if (i)
-          OS << " | ";
-        OS << cast<DefInit>(LI->getElement(i))->getDef()->getName();
-      }
+    for (Init *I : *LI)
+      OS << (NumFlags++ ? " | " : "")
+         << cast<DefInit>(I)->getDef()->getName();
+    if (GroupFlags) {
+      for (Init *I : *GroupFlags)
+        OS << (NumFlags++ ? " | " : "")
+           << cast<DefInit>(I)->getDef()->getName();
     }
+    if (NumFlags == 0)
+      OS << '0';
 
     // The option parameter field.
     OS << ", " << R.getValueAsInt("NumArgs");

diff --git a/utils/TableGen/PseudoLoweringEmitter.cpp b/utils/TableGen/PseudoLoweringEmitter.cpp
index 3b74ac4..ebb43f0 100644
--- a/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/utils/TableGen/PseudoLoweringEmitter.cpp

@@ -277,11 +277,10 @@
   assert(InstructionClass && "Instruction class definition missing!");
 
   std::vector<Record*> Insts;
-  for (std::map<std::string, Record*>::const_iterator I =
-         Records.getDefs().begin(), E = Records.getDefs().end(); I != E; ++I) {
-    if (I->second->isSubClassOf(ExpansionClass) &&
-        I->second->isSubClassOf(InstructionClass))
-      Insts.push_back(I->second);
+  for (const auto &D : Records.getDefs()) {
+    if (D.second->isSubClassOf(ExpansionClass) &&
+        D.second->isSubClassOf(InstructionClass))
+      Insts.push_back(D.second.get());
   }
 
   // Process the pseudo expansion definitions, validating them as we do so.

diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 573c37f..79d08a9 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp

@@ -379,9 +379,9 @@
       OS << "extern const unsigned " << Namespace
          << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i << "Dwarf2LSize";
       if (!isCtor)
-        OS << " = sizeof(" << Namespace
+        OS << " = array_lengthof(" << Namespace
            << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
-           << "Dwarf2L)/sizeof(MCRegisterInfo::DwarfLLVMRegPair);\n\n";
+           << "Dwarf2L);\n\n";
       else
         OS << ";\n\n";
     }
@@ -427,9 +427,8 @@
       OS << "extern const unsigned " << Namespace
          << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i << "L2DwarfSize";
       if (!isCtor)
-        OS << " = sizeof(" << Namespace
-           << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i
-           << "L2Dwarf)/sizeof(MCRegisterInfo::DwarfLLVMRegPair);\n\n";
+        OS << " = array_lengthof(" << Namespace
+           << (j == 0 ? "DwarfFlavour" : "EHFlavour") << i << "L2Dwarf);\n\n";
       else
         OS << ";\n\n";
     }
@@ -848,6 +847,8 @@
   // Loop over all of the register classes... emitting each one.
   OS << "namespace {     // Register classes...\n";
 
+  SequenceToOffsetTable<std::string> RegClassStrings;
+
   // Emit the register enum value arrays for each RegisterClass
   for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
     const CodeGenRegisterClass &RC = *RegisterClasses[rc];
@@ -856,6 +857,8 @@
     // Give the register class a legal C name if it's anonymous.
     std::string Name = RC.getName();
 
+    RegClassStrings.add(Name);
+
     // Emit the register list now.
     OS << "  // " << Name << " Register Class...\n"
        << "  const MCPhysReg " << Name
@@ -880,6 +883,11 @@
   }
   OS << "}\n\n";
 
+  RegClassStrings.layout();
+  OS << "extern const char " << TargetName << "RegClassStrings[] = {\n";
+  RegClassStrings.emit(OS, printChar);
+  OS << "};\n\n";
+
   OS << "extern const MCRegisterClass " << TargetName
      << "MCRegisterClasses[] = {\n";
 
@@ -892,8 +900,8 @@
     assert((RC.SpillAlignment/8) <= 0xffff && "SpillAlignment too large.");
     assert(RC.CopyCost >= -128 && RC.CopyCost <= 127 && "Copy cost too large.");
 
-    OS << "  { " << '\"' << RC.getName() << "\", "
-       << RC.getName() << ", " << RC.getName() << "Bits, "
+    OS << "  { " << RC.getName() << ", " << RC.getName() << "Bits, "
+       << RegClassStrings.get(RC.getName()) << ", "
        << RC.getOrder().size() << ", sizeof(" << RC.getName() << "Bits), "
        << RC.getQualifiedName() + "RegClassID" << ", "
        << RC.SpillSize/8 << ", "
@@ -934,10 +942,11 @@
      << RegBank.getNumNativeRegUnits() << ", "
      << TargetName << "RegDiffLists, "
      << TargetName << "RegStrings, "
+     << TargetName << "RegClassStrings, "
      << TargetName << "SubRegIdxLists, "
      << (SubRegIndices.size() + 1) << ",\n"
      << TargetName << "SubRegIdxRanges, "
-     << "  " << TargetName << "RegEncodingTable);\n\n";
+     << TargetName << "RegEncodingTable);\n\n";
 
   EmitRegMapping(OS, Regs, false);
 
@@ -1267,6 +1276,7 @@
   OS << "extern const MCRegisterDesc " << TargetName << "RegDesc[];\n";
   OS << "extern const MCPhysReg " << TargetName << "RegDiffLists[];\n";
   OS << "extern const char " << TargetName << "RegStrings[];\n";
+  OS << "extern const char " << TargetName << "RegClassStrings[];\n";
   OS << "extern const MCPhysReg " << TargetName << "RegUnitRoots[][2];\n";
   OS << "extern const uint16_t " << TargetName << "SubRegIdxLists[];\n";
   OS << "extern const MCRegisterInfo::SubRegCoveredBits "
@@ -1289,6 +1299,7 @@
      << "                     " << RegBank.getNumNativeRegUnits() << ",\n"
      << "                     " << TargetName << "RegDiffLists,\n"
      << "                     " << TargetName << "RegStrings,\n"
+     << "                     " << TargetName << "RegClassStrings,\n"
      << "                     " << TargetName << "SubRegIdxLists,\n"
      << "                     " << SubRegIndices.size() + 1 << ",\n"
      << "                     " << TargetName << "SubRegIdxRanges,\n"

diff --git a/utils/TableGen/SequenceToOffsetTable.h b/utils/TableGen/SequenceToOffsetTable.h
index e6ab664..b58de4c 100644
--- a/utils/TableGen/SequenceToOffsetTable.h
+++ b/utils/TableGen/SequenceToOffsetTable.h

@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TBLGEN_SEQUENCE_TO_OFFSET_TABLE_H
-#define TBLGEN_SEQUENCE_TO_OFFSET_TABLE_H
+#ifndef LLVM_UTILS_TABLEGEN_SEQUENCETOOFFSETTABLE_H
+#define LLVM_UTILS_TABLEGEN_SEQUENCETOOFFSETTABLE_H
 
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>

diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 06f8694..9f2fc92 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp

@@ -386,7 +386,7 @@
   for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
          PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
 
-    if (!ItinsDefSet.insert(PI->ItinsDef))
+    if (!ItinsDefSet.insert(PI->ItinsDef).second)
       continue;
 
     std::vector<Record*> FUs = PI->ItinsDef->getValueAsListOfDefs("FU");
@@ -565,7 +565,7 @@
          PE = SchedModels.procModelEnd(); PI != PE; ++PI, ++ProcItinListsIter) {
 
     Record *ItinsDef = PI->ItinsDef;
-    if (!ItinsDefSet.insert(ItinsDef))
+    if (!ItinsDefSet.insert(ItinsDef).second)
       continue;
 
     // Get processor itinerary name
@@ -575,12 +575,13 @@
     assert(ProcItinListsIter != ProcItinLists.end() && "bad iterator");
     std::vector<InstrItinerary> &ItinList = *ProcItinListsIter;
 
+    // Empty itineraries aren't referenced anywhere in the tablegen output
+    // so don't emit them.
+    if (ItinList.empty())
+      continue;
+
     OS << "\n";
     OS << "static const llvm::InstrItinerary ";
-    if (ItinList.empty()) {
-      OS << '*' << Name << " = nullptr;\n";
-      continue;
-    }
 
     // Begin processor itinerary table
     OS << Name << "[] = {\n";
@@ -1192,7 +1193,7 @@
 
     // Begin processor itinerary properties
     OS << "\n";
-    OS << "static const llvm::MCSchedModel " << PI->ModelName << "(\n";
+    OS << "static const llvm::MCSchedModel " << PI->ModelName << " = {\n";
     EmitProcessorProp(OS, PI->ModelDef, "IssueWidth", ',');
     EmitProcessorProp(OS, PI->ModelDef, "MicroOpBufferSize", ',');
     EmitProcessorProp(OS, PI->ModelDef, "LoopMicroOpBufferSize", ',');
@@ -1201,6 +1202,10 @@
     EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
 
     OS << "  " << (bool)(PI->ModelDef ?
+                         PI->ModelDef->getValueAsBit("PostRAScheduler") : 0)
+       << ", // " << "PostRAScheduler\n";
+
+    OS << "  " << (bool)(PI->ModelDef ?
                          PI->ModelDef->getValueAsBit("CompleteModel") : 0)
        << ", // " << "CompleteModel\n";
 
@@ -1213,10 +1218,10 @@
                      - SchedModels.schedClassBegin()) << ",\n";
     else
       OS << "  0, 0, 0, 0, // No instruction-level machine model.\n";
-    if (SchedModels.hasItineraries())
-      OS << "  " << PI->ItinsDef->getName() << ");\n";
+    if (PI->hasItineraries())
+      OS << "  " << PI->ItinsDef->getName() << "};\n";
     else
-      OS << "  0); // No Itinerary\n";
+      OS << "  nullptr}; // No Itinerary\n";
   }
 }
 

diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index 28b626e..2dc03ce 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h

@@ -13,6 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_UTILS_TABLEGEN_TABLEGENBACKENDS_H
+#define LLVM_UTILS_TABLEGEN_TABLEGENBACKENDS_H
 
 // A TableGen backend is a function that looks like
 //
@@ -78,3 +80,5 @@
 void EmitCTags(RecordKeeper &RK, raw_ostream &OS);
 
 } // End llvm namespace
+
+#endif

diff --git a/utils/TableGen/X86DisassemblerShared.h b/utils/TableGen/X86DisassemblerShared.h
index 9e79b9c..5895277 100644
--- a/utils/TableGen/X86DisassemblerShared.h
+++ b/utils/TableGen/X86DisassemblerShared.h

@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86DISASSEMBLERSHARED_H
-#define X86DISASSEMBLERSHARED_H
+#ifndef LLVM_UTILS_TABLEGEN_X86DISASSEMBLERSHARED_H
+#define LLVM_UTILS_TABLEGEN_X86DISASSEMBLERSHARED_H
 
 #include <string.h>
 #include <string>

diff --git a/utils/TableGen/X86DisassemblerTables.cpp b/utils/TableGen/X86DisassemblerTables.cpp
index fbcc6f2..d7e981c 100644
--- a/utils/TableGen/X86DisassemblerTables.cpp
+++ b/utils/TableGen/X86DisassemblerTables.cpp

@@ -171,12 +171,17 @@
   case IC_EVEX_OPSIZE:
     return inheritsFrom(child, IC_EVEX_W_OPSIZE) ||
            inheritsFrom(child, IC_EVEX_L_W_OPSIZE);
+  case IC_EVEX_B:
+    return false;
   case IC_EVEX_W:
   case IC_EVEX_W_XS:
   case IC_EVEX_W_XD:
   case IC_EVEX_W_OPSIZE:
     return false;
   case IC_EVEX_L:
+  case IC_EVEX_L_K_B:
+  case IC_EVEX_L_KZ_B:
+  case IC_EVEX_L_B:
   case IC_EVEX_L_XS:
   case IC_EVEX_L_XD:
   case IC_EVEX_L_OPSIZE:
@@ -205,38 +210,59 @@
   case IC_EVEX_XD_K:
     return inheritsFrom(child, IC_EVEX_W_XD_K) ||
            inheritsFrom(child, IC_EVEX_L_W_XD_K);
+  case IC_EVEX_K_B:
+  case IC_EVEX_KZ:
+    return false;
+  case IC_EVEX_XS_KZ:
+    return inheritsFrom(child, IC_EVEX_W_XS_KZ) ||
+           inheritsFrom(child, IC_EVEX_L_W_XS_KZ);
+  case IC_EVEX_XD_KZ:
+    return inheritsFrom(child, IC_EVEX_W_XD_KZ) ||
+           inheritsFrom(child, IC_EVEX_L_W_XD_KZ);
+  case IC_EVEX_KZ_B:
   case IC_EVEX_OPSIZE_K:
   case IC_EVEX_OPSIZE_B:
+  case IC_EVEX_OPSIZE_K_B:
+  case IC_EVEX_OPSIZE_KZ:
+  case IC_EVEX_OPSIZE_KZ_B:
     return false;
   case IC_EVEX_W_K:
   case IC_EVEX_W_XS_K:
   case IC_EVEX_W_XD_K:
   case IC_EVEX_W_OPSIZE_K:
   case IC_EVEX_W_OPSIZE_B:
+  case IC_EVEX_W_OPSIZE_K_B:
     return false;
   case IC_EVEX_L_K:
   case IC_EVEX_L_XS_K:
   case IC_EVEX_L_XD_K:
   case IC_EVEX_L_OPSIZE_K:
+  case IC_EVEX_L_OPSIZE_B:
+  case IC_EVEX_L_OPSIZE_K_B:
     return false;
   case IC_EVEX_W_KZ:
   case IC_EVEX_W_XS_KZ:
   case IC_EVEX_W_XD_KZ:
   case IC_EVEX_W_OPSIZE_KZ:
+  case IC_EVEX_W_OPSIZE_KZ_B:
     return false;
   case IC_EVEX_L_KZ:
   case IC_EVEX_L_XS_KZ:
   case IC_EVEX_L_XD_KZ:
   case IC_EVEX_L_OPSIZE_KZ:
+  case IC_EVEX_L_OPSIZE_KZ_B:
     return false;
   case IC_EVEX_L_W_K:
   case IC_EVEX_L_W_XS_K:
   case IC_EVEX_L_W_XD_K:
   case IC_EVEX_L_W_OPSIZE_K:
+  case IC_EVEX_L_W_OPSIZE_B:
+  case IC_EVEX_L_W_OPSIZE_K_B:
   case IC_EVEX_L_W_KZ:
   case IC_EVEX_L_W_XS_KZ:
   case IC_EVEX_L_W_XD_KZ:
   case IC_EVEX_L_W_OPSIZE_KZ:
+  case IC_EVEX_L_W_OPSIZE_KZ_B:
     return false;
   case IC_EVEX_L2_K:
   case IC_EVEX_L2_B:

diff --git a/utils/TableGen/X86DisassemblerTables.h b/utils/TableGen/X86DisassemblerTables.h
index 1327375..d86b926 100644
--- a/utils/TableGen/X86DisassemblerTables.h
+++ b/utils/TableGen/X86DisassemblerTables.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86DISASSEMBLERTABLES_H
-#define X86DISASSEMBLERTABLES_H
+#ifndef LLVM_UTILS_TABLEGEN_X86DISASSEMBLERTABLES_H
+#define LLVM_UTILS_TABLEGEN_X86DISASSEMBLERTABLES_H
 
 #include "X86DisassemblerShared.h"
 #include "X86ModRMFilters.h"

diff --git a/utils/TableGen/X86ModRMFilters.h b/utils/TableGen/X86ModRMFilters.h
index fac3838..d919c58 100644
--- a/utils/TableGen/X86ModRMFilters.h
+++ b/utils/TableGen/X86ModRMFilters.h

@@ -15,8 +15,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86MODRMFILTERS_H
-#define X86MODRMFILTERS_H
+#ifndef LLVM_UTILS_TABLEGEN_X86MODRMFILTERS_H
+#define LLVM_UTILS_TABLEGEN_X86MODRMFILTERS_H
 
 #include "llvm/Support/DataTypes.h"
 

diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index ead419e..9b8092f 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp

@@ -32,48 +32,50 @@
   MAP(C9, 38)           \
   MAP(CA, 39)           \
   MAP(CB, 40)           \
-  MAP(D0, 41)           \
-  MAP(D1, 42)           \
-  MAP(D4, 43)           \
-  MAP(D5, 44)           \
-  MAP(D6, 45)           \
-  MAP(D8, 46)           \
-  MAP(D9, 47)           \
-  MAP(DA, 48)           \
-  MAP(DB, 49)           \
-  MAP(DC, 50)           \
-  MAP(DD, 51)           \
-  MAP(DE, 52)           \
-  MAP(DF, 53)           \
-  MAP(E0, 54)           \
-  MAP(E1, 55)           \
-  MAP(E2, 56)           \
-  MAP(E3, 57)           \
-  MAP(E4, 58)           \
-  MAP(E5, 59)           \
-  MAP(E8, 60)           \
-  MAP(E9, 61)           \
-  MAP(EA, 62)           \
-  MAP(EB, 63)           \
-  MAP(EC, 64)           \
-  MAP(ED, 65)           \
-  MAP(EE, 66)           \
-  MAP(F0, 67)           \
-  MAP(F1, 68)           \
-  MAP(F2, 69)           \
-  MAP(F3, 70)           \
-  MAP(F4, 71)           \
-  MAP(F5, 72)           \
-  MAP(F6, 73)           \
-  MAP(F7, 74)           \
-  MAP(F8, 75)           \
-  MAP(F9, 76)           \
-  MAP(FA, 77)           \
-  MAP(FB, 78)           \
-  MAP(FC, 79)           \
-  MAP(FD, 80)           \
-  MAP(FE, 81)           \
-  MAP(FF, 82)
+  MAP(CF, 41)           \
+  MAP(D0, 42)           \
+  MAP(D1, 43)           \
+  MAP(D4, 44)           \
+  MAP(D5, 45)           \
+  MAP(D6, 46)           \
+  MAP(D7, 47)           \
+  MAP(D8, 48)           \
+  MAP(D9, 49)           \
+  MAP(DA, 50)           \
+  MAP(DB, 51)           \
+  MAP(DC, 52)           \
+  MAP(DD, 53)           \
+  MAP(DE, 54)           \
+  MAP(DF, 55)           \
+  MAP(E0, 56)           \
+  MAP(E1, 57)           \
+  MAP(E2, 58)           \
+  MAP(E3, 59)           \
+  MAP(E4, 60)           \
+  MAP(E5, 61)           \
+  MAP(E8, 62)           \
+  MAP(E9, 63)           \
+  MAP(EA, 64)           \
+  MAP(EB, 65)           \
+  MAP(EC, 66)           \
+  MAP(ED, 67)           \
+  MAP(EE, 68)           \
+  MAP(F0, 69)           \
+  MAP(F1, 70)           \
+  MAP(F2, 71)           \
+  MAP(F3, 72)           \
+  MAP(F4, 73)           \
+  MAP(F5, 74)           \
+  MAP(F6, 75)           \
+  MAP(F7, 76)           \
+  MAP(F8, 77)           \
+  MAP(F9, 78)           \
+  MAP(FA, 79)           \
+  MAP(FB, 80)           \
+  MAP(FC, 81)           \
+  MAP(FD, 82)           \
+  MAP(FE, 83)           \
+  MAP(FF, 84)
 
 // A clone of X86 since we can't depend on something that is generated.
 namespace X86Local {
@@ -205,6 +207,7 @@
   HasEVEX_B        = Rec->getValueAsBit("hasEVEX_B");
   IsCodeGenOnly    = Rec->getValueAsBit("isCodeGenOnly");
   ForceDisassemble = Rec->getValueAsBit("ForceDisassemble");
+  CD8_Scale        = byteFromRec(Rec, "CD8_Scale");
 
   Name      = Rec->getName();
   AsmString = Rec->getValueAsString("AsmString");
@@ -441,6 +444,16 @@
   return insnContext;
 }
 
+void RecognizableInstr::adjustOperandEncoding(OperandEncoding &encoding) {
+  // The scaling factor for AVX512 compressed displacement encoding is an
+  // instruction attribute.  Adjust the ModRM encoding type to include the
+  // scale for compressed displacement.
+  if (encoding != ENCODING_RM || CD8_Scale == 0)
+    return;
+  encoding = (OperandEncoding)(encoding + Log2_32(CD8_Scale));
+  assert(encoding <= ENCODING_RM_CD64 && "Invalid CDisp scaling");
+}
+
 void RecognizableInstr::handleOperand(bool optional, unsigned &operandIndex,
                                       unsigned &physicalOperandIndex,
                                       unsigned &numPhysicalOperands,
@@ -464,8 +477,10 @@
 
   const std::string &typeName = (*Operands)[operandIndex].Rec->getName();
 
-  Spec->operands[operandIndex].encoding = encodingFromString(typeName,
-                                                              OpSize);
+  OperandEncoding encoding = encodingFromString(typeName, OpSize);
+  // Adjust the encoding type for an operand based on the instruction.
+  adjustOperandEncoding(encoding);
+  Spec->operands[operandIndex].encoding = encoding;
   Spec->operands[operandIndex].type = typeFromString(typeName,
                                                      HasREX_WPrefix, OpSize);
 
@@ -526,6 +541,14 @@
   // physicalOperandIndex should always be < numPhysicalOperands
   unsigned physicalOperandIndex = 0;
 
+  // Given the set of prefix bits, how many additional operands does the
+  // instruction have?
+  unsigned additionalOperands = 0;
+  if (HasVEX_4V || HasVEX_4VOp3)
+    ++additionalOperands;
+  if (HasEVEX_K)
+    ++additionalOperands;
+
   switch (Form) {
   default: llvm_unreachable("Unhandled form");
   case X86Local::RawFrmSrc:
@@ -560,17 +583,17 @@
     break;
   case X86Local::MRMDestReg:
     // Operand 1 is a register operand in the R/M field.
+    // - In AVX512 there may be a mask operand here -
     // Operand 2 is a register operand in the Reg/Opcode field.
     // - In AVX, there is a register operand in the VEX.vvvv field here -
     // Operand 3 (optional) is an immediate.
-    if (HasVEX_4V)
-      assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 4 &&
-             "Unexpected number of operands for MRMDestRegFrm with VEX_4V");
-    else
-      assert(numPhysicalOperands >= 2 && numPhysicalOperands <= 3 &&
-             "Unexpected number of operands for MRMDestRegFrm");
+    assert(numPhysicalOperands >= 2 + additionalOperands &&
+           numPhysicalOperands <= 3 + additionalOperands &&
+           "Unexpected number of operands for MRMDestRegFrm");
 
     HANDLE_OPERAND(rmRegister)
+    if (HasEVEX_K)
+      HANDLE_OPERAND(writemaskRegister)
 
     if (HasVEX_4V)
       // FIXME: In AVX, the register below becomes the one encoded
@@ -585,12 +608,10 @@
     // Operand 2 is a register operand in the Reg/Opcode field.
     // - In AVX, there is a register operand in the VEX.vvvv field here -
     // Operand 3 (optional) is an immediate.
-    if (HasVEX_4V)
-      assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 4 &&
-             "Unexpected number of operands for MRMDestMemFrm with VEX_4V");
-    else
-      assert(numPhysicalOperands >= 2 && numPhysicalOperands <= 3 &&
-             "Unexpected number of operands for MRMDestMemFrm");
+    assert(numPhysicalOperands >= 2 + additionalOperands &&
+           numPhysicalOperands <= 3 + additionalOperands &&
+           "Unexpected number of operands for MRMDestMemFrm");
+
     HANDLE_OPERAND(memory)
 
     if (HasEVEX_K)
@@ -611,12 +632,9 @@
     // Operand 3 (optional) is an immediate.
     // Operand 4 (optional) is an immediate.
 
-    if (HasVEX_4V || HasVEX_4VOp3)
-      assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 5 &&
-             "Unexpected number of operands for MRMSrcRegFrm with VEX_4V");
-    else
-      assert(numPhysicalOperands >= 2 && numPhysicalOperands <= 4 &&
-             "Unexpected number of operands for MRMSrcRegFrm");
+    assert(numPhysicalOperands >= 2 + additionalOperands &&
+           numPhysicalOperands <= 4 + additionalOperands &&
+           "Unexpected number of operands for MRMSrcRegFrm");
 
     HANDLE_OPERAND(roRegister)
 
@@ -647,12 +665,9 @@
     // - In AVX, there is a register operand in the VEX.vvvv field here -
     // Operand 3 (optional) is an immediate.
 
-    if (HasVEX_4V || HasVEX_4VOp3)
-      assert(numPhysicalOperands >= 3 && numPhysicalOperands <= 5 &&
-             "Unexpected number of operands for MRMSrcMemFrm with VEX_4V");
-    else
-      assert(numPhysicalOperands >= 2 && numPhysicalOperands <= 3 &&
-             "Unexpected number of operands for MRMSrcMemFrm");
+    assert(numPhysicalOperands >= 2 + additionalOperands &&
+           numPhysicalOperands <= 4 + additionalOperands &&
+           "Unexpected number of operands for MRMSrcMemFrm");
 
     HANDLE_OPERAND(roRegister)
 
@@ -685,15 +700,13 @@
   case X86Local::MRM5r:
   case X86Local::MRM6r:
   case X86Local::MRM7r:
-    {
-      // Operand 1 is a register operand in the R/M field.
-      // Operand 2 (optional) is an immediate or relocation.
-      // Operand 3 (optional) is an immediate.
-      unsigned kOp = (HasEVEX_K) ? 1:0;
-      unsigned Op4v = (HasVEX_4V) ? 1:0;
-      if (numPhysicalOperands > 3 + kOp + Op4v)
-        llvm_unreachable("Unexpected number of operands for MRMnr");
-    }
+    // Operand 1 is a register operand in the R/M field.
+    // Operand 2 (optional) is an immediate or relocation.
+    // Operand 3 (optional) is an immediate.
+    assert(numPhysicalOperands >= 0 + additionalOperands &&
+           numPhysicalOperands <= 3 + additionalOperands &&
+           "Unexpected number of operands for MRMnr");
+
     if (HasVEX_4V)
       HANDLE_OPERAND(vvvvRegister)
 
@@ -712,15 +725,12 @@
   case X86Local::MRM5m:
   case X86Local::MRM6m:
   case X86Local::MRM7m:
-    {
-      // Operand 1 is a memory operand (possibly SIB-extended)
-      // Operand 2 (optional) is an immediate or relocation.
-      unsigned kOp = (HasEVEX_K) ? 1:0;
-      unsigned Op4v = (HasVEX_4V) ? 1:0;
-      if (numPhysicalOperands < 1 + kOp + Op4v ||
-          numPhysicalOperands > 2 + kOp + Op4v)
-        llvm_unreachable("Unexpected number of operands for MRMnm");
-    }
+    // Operand 1 is a memory operand (possibly SIB-extended)
+    // Operand 2 (optional) is an immediate or relocation.
+    assert(numPhysicalOperands >= 1 + additionalOperands &&
+           numPhysicalOperands <= 2 + additionalOperands &&
+           "Unexpected number of operands for MRMnm");
+
     if (HasVEX_4V)
       HANDLE_OPERAND(vvvvRegister)
     if (HasEVEX_K)
@@ -756,20 +766,21 @@
   case X86Local::MRM_C0: case X86Local::MRM_C1: case X86Local::MRM_C2:
   case X86Local::MRM_C3: case X86Local::MRM_C4: case X86Local::MRM_C8:
   case X86Local::MRM_C9: case X86Local::MRM_CA: case X86Local::MRM_CB:
-  case X86Local::MRM_D0: case X86Local::MRM_D1: case X86Local::MRM_D4:
-  case X86Local::MRM_D5: case X86Local::MRM_D6: case X86Local::MRM_D8:
-  case X86Local::MRM_D9: case X86Local::MRM_DA: case X86Local::MRM_DB:
-  case X86Local::MRM_DC: case X86Local::MRM_DD: case X86Local::MRM_DE:
-  case X86Local::MRM_DF: case X86Local::MRM_E0: case X86Local::MRM_E1:
-  case X86Local::MRM_E2: case X86Local::MRM_E3: case X86Local::MRM_E4:
-  case X86Local::MRM_E5: case X86Local::MRM_E8: case X86Local::MRM_E9:
-  case X86Local::MRM_EA: case X86Local::MRM_EB: case X86Local::MRM_EC:
-  case X86Local::MRM_ED: case X86Local::MRM_EE: case X86Local::MRM_F0:
-  case X86Local::MRM_F1: case X86Local::MRM_F2: case X86Local::MRM_F3:
-  case X86Local::MRM_F4: case X86Local::MRM_F5: case X86Local::MRM_F6:
-  case X86Local::MRM_F7: case X86Local::MRM_F9: case X86Local::MRM_FA:
-  case X86Local::MRM_FB: case X86Local::MRM_FC: case X86Local::MRM_FD:
-  case X86Local::MRM_FE: case X86Local::MRM_FF:
+  case X86Local::MRM_CF: case X86Local::MRM_D0: case X86Local::MRM_D1:
+  case X86Local::MRM_D4: case X86Local::MRM_D5: case X86Local::MRM_D6:
+  case X86Local::MRM_D7: case X86Local::MRM_D8: case X86Local::MRM_D9:
+  case X86Local::MRM_DA: case X86Local::MRM_DB: case X86Local::MRM_DC:
+  case X86Local::MRM_DD: case X86Local::MRM_DE: case X86Local::MRM_DF:
+  case X86Local::MRM_E0: case X86Local::MRM_E1: case X86Local::MRM_E2:
+  case X86Local::MRM_E3: case X86Local::MRM_E4: case X86Local::MRM_E5:
+  case X86Local::MRM_E8: case X86Local::MRM_E9: case X86Local::MRM_EA:
+  case X86Local::MRM_EB: case X86Local::MRM_EC: case X86Local::MRM_ED:
+  case X86Local::MRM_EE: case X86Local::MRM_F0: case X86Local::MRM_F1:
+  case X86Local::MRM_F2: case X86Local::MRM_F3: case X86Local::MRM_F4:
+  case X86Local::MRM_F5: case X86Local::MRM_F6: case X86Local::MRM_F7:
+  case X86Local::MRM_F9: case X86Local::MRM_FA: case X86Local::MRM_FB:
+  case X86Local::MRM_FC: case X86Local::MRM_FD: case X86Local::MRM_FE:
+  case X86Local::MRM_FF:
     // Ignored.
     break;
   }
@@ -896,7 +907,6 @@
   TYPE("i32mem",              TYPE_Mv)
   TYPE("i32imm",              TYPE_IMMv)
   TYPE("i32i8imm",            TYPE_IMM32)
-  TYPE("u32u8imm",            TYPE_IMM32)
   TYPE("GR32",                TYPE_R32)
   TYPE("GR32orGR64",          TYPE_R32)
   TYPE("i64mem",              TYPE_Mv)
@@ -962,10 +972,18 @@
   TYPE("VR512",               TYPE_XMM512)
   TYPE("VK1",                 TYPE_VK1)
   TYPE("VK1WM",               TYPE_VK1)
+  TYPE("VK2",                 TYPE_VK2)
+  TYPE("VK2WM",               TYPE_VK2)
+  TYPE("VK4",                 TYPE_VK4)
+  TYPE("VK4WM",               TYPE_VK4)
   TYPE("VK8",                 TYPE_VK8)
   TYPE("VK8WM",               TYPE_VK8)
   TYPE("VK16",                TYPE_VK16)
   TYPE("VK16WM",              TYPE_VK16)
+  TYPE("VK32",                TYPE_VK32)
+  TYPE("VK32WM",              TYPE_VK32)
+  TYPE("VK64",                TYPE_VK64)
+  TYPE("VK64WM",              TYPE_VK64)
   TYPE("GR16_NOAX",           TYPE_Rv)
   TYPE("GR32_NOAX",           TYPE_Rv)
   TYPE("GR64_NOAX",           TYPE_R64)
@@ -991,7 +1009,6 @@
     ENCODING("i16imm",        ENCODING_IW)
   }
   ENCODING("i32i8imm",        ENCODING_IB)
-  ENCODING("u32u8imm",        ENCODING_IB)
   ENCODING("SSECC",           ENCODING_IB)
   ENCODING("AVXCC",           ENCODING_IB)
   ENCODING("AVX512RC",        ENCODING_IB)
@@ -1038,6 +1055,8 @@
   ENCODING("VK1",             ENCODING_RM)
   ENCODING("VK8",             ENCODING_RM)
   ENCODING("VK16",            ENCODING_RM)
+  ENCODING("VK32",            ENCODING_RM)
+  ENCODING("VK64",            ENCODING_RM)
   errs() << "Unhandled R/M register encoding " << s << "\n";
   llvm_unreachable("Unhandled R/M register encoding");
 }
@@ -1064,8 +1083,12 @@
   ENCODING("FR32X",           ENCODING_REG)
   ENCODING("VR512",           ENCODING_REG)
   ENCODING("VK1",             ENCODING_REG)
+  ENCODING("VK2",             ENCODING_REG)
+  ENCODING("VK4",             ENCODING_REG)
   ENCODING("VK8",             ENCODING_REG)
   ENCODING("VK16",            ENCODING_REG)
+  ENCODING("VK32",            ENCODING_REG)
+  ENCODING("VK64",            ENCODING_REG)
   ENCODING("VK1WM",           ENCODING_REG)
   ENCODING("VK8WM",           ENCODING_REG)
   ENCODING("VK16WM",          ENCODING_REG)
@@ -1088,8 +1111,12 @@
   ENCODING("VR256X",          ENCODING_VVVV)
   ENCODING("VR512",           ENCODING_VVVV)
   ENCODING("VK1",             ENCODING_VVVV)
+  ENCODING("VK2",             ENCODING_VVVV)
+  ENCODING("VK4",             ENCODING_VVVV)
   ENCODING("VK8",             ENCODING_VVVV)
   ENCODING("VK16",            ENCODING_VVVV)
+  ENCODING("VK32",            ENCODING_VVVV)
+  ENCODING("VK64",            ENCODING_VVVV)
   errs() << "Unhandled VEX.vvvv register encoding " << s << "\n";
   llvm_unreachable("Unhandled VEX.vvvv register encoding");
 }
@@ -1098,8 +1125,12 @@
 RecognizableInstr::writemaskRegisterEncodingFromString(const std::string &s,
                                                        uint8_t OpSize) {
   ENCODING("VK1WM",           ENCODING_WRITEMASK)
+  ENCODING("VK2WM",           ENCODING_WRITEMASK)
+  ENCODING("VK4WM",           ENCODING_WRITEMASK)
   ENCODING("VK8WM",           ENCODING_WRITEMASK)
   ENCODING("VK16WM",          ENCODING_WRITEMASK)
+  ENCODING("VK32WM",          ENCODING_WRITEMASK)
+  ENCODING("VK64WM",          ENCODING_WRITEMASK)
   errs() << "Unhandled mask register encoding " << s << "\n";
   llvm_unreachable("Unhandled mask register encoding");
 }

diff --git a/utils/TableGen/X86RecognizableInstr.h b/utils/TableGen/X86RecognizableInstr.h
index 77286bc..95d7a40 100644
--- a/utils/TableGen/X86RecognizableInstr.h
+++ b/utils/TableGen/X86RecognizableInstr.h

@@ -14,8 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef X86RECOGNIZABLEINSTR_H
-#define X86RECOGNIZABLEINSTR_H
+#ifndef LLVM_UTILS_TABLEGEN_X86RECOGNIZABLEINSTR_H
+#define LLVM_UTILS_TABLEGEN_X86RECOGNIZABLEINSTR_H
 
 #include "CodeGenTarget.h"
 #include "X86DisassemblerTables.h"
@@ -78,6 +78,8 @@
   bool IsCodeGenOnly;
   /// The ForceDisassemble field from the record
   bool ForceDisassemble;
+  // The CD8_Scale field from the record
+  uint8_t CD8_Scale;
   // Whether the instruction has the predicate "In64BitMode"
   bool Is64Bit;
   // Whether the instruction has the predicate "In32BitMode"
@@ -153,6 +155,9 @@
   static OperandEncoding writemaskRegisterEncodingFromString(const std::string &s,
                                                              uint8_t OpSize);
 
+  /// \brief Adjust the encoding type for an operand based on the instruction.
+  void adjustOperandEncoding(OperandEncoding &encoding);
+
   /// handleOperand - Converts a single operand from the LLVM table format to
   ///   the emitted table format, handling any duplicate operands it encounters
   ///   and then one non-duplicate.

diff --git a/utils/bisect b/utils/bisect
new file mode 100755
index 0000000..d1b1257
--- /dev/null
+++ b/utils/bisect

@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+"""Bisect an integer count between --start and --end.
+
+Every command argument is %-formatted with a mapping that supplies 'count'
+(for example "--limit=%(count)d"), so a literal '%' must be written as '%%'.
+An exit status of 0 is treated as a pass and moves the lower bound up; any
+other status moves the upper bound down.
+"""
+
+import os
+import sys
+import argparse
+import subprocess
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('--start', type=int, default=0)
+parser.add_argument('--end', type=int, default=(1 << 32))
+parser.add_argument('command', nargs='+')
+
+args = parser.parse_args()
+
+start = args.start
+end = args.end
+
+print("Bisect Starting!")
+print("Start: %d" % start)
+print("End: %d" % end)
+
+while start != end and start != end-1:
+    # Floor division keeps the midpoint an integer on Python 3 as well.
+    count = start + (end - start)//2
+    print("Visiting Count: %d with (Start, End) = (%d,%d)" % (count, start, end))
+    cmd = [x % {'count':count} for x in args.command]
+    # Call form of print so the script also parses under Python 3.
+    print(cmd)
+    result = subprocess.call(cmd)
+    if result == 0:
+        print("    PASSES! Setting start to count")
+        start = count
+    else:
+        print("    FAILS! Setting end to count")
+        end = count
+
+print("Last good count: %d" % start)

diff --git a/utils/emacs/llvm-mode.el b/utils/emacs/llvm-mode.el
index 99d3294..61127b6 100644
--- a/utils/emacs/llvm-mode.el
+++ b/utils/emacs/llvm-mode.el

@@ -48,6 +48,8 @@
    `(,(regexp-opt '("extractelement" "insertelement" "shufflevector") 'words) . font-lock-keyword-face)
    ;; Aggregate ops
    `(,(regexp-opt '("extractvalue" "insertvalue") 'words) . font-lock-keyword-face)
+   ;; Use-list order directives
+   `(,(regexp-opt '("uselistorder" "uselistorder_bb") 'words) . font-lock-keyword-face)
 
    )
   "Syntax highlighting for LLVM"

diff --git a/utils/findmisopt b/utils/findmisopt
index 88f991a..2405220 100755
--- a/utils/findmisopt
+++ b/utils/findmisopt

@@ -74,8 +74,8 @@
 echo "  Optimized program: $optprog"
 
 # Define the list of optimizations to run. This comprises the same set of 
-# optimizations that opt -std-compile-opts and gccld run, in the same order.
-opt_switches=`llvm-as < /dev/null -o - | opt -std-compile-opts -disable-output -debug-pass=Arguments 2>&1 | sed 's/Pass Arguments: //'`
+# optimizations that opt -O3 runs, in the same order.
+opt_switches=`llvm-as < /dev/null -o - | opt -O3 -disable-output -debug-pass=Arguments 2>&1 | sed 's/Pass Arguments: //'`
 all_switches="$opt_switches"
 echo "Passes : $all_switches"
 

diff --git a/utils/git-svn/git-svnrevert b/utils/git-svn/git-svnrevert
index 06a9c44..f15e7ab 100755
--- a/utils/git-svn/git-svnrevert
+++ b/utils/git-svn/git-svnrevert

@@ -2,7 +2,7 @@
 
 if [ $# -ne 1 ]; then
     echo "Invalid arguments!"
-    echo "$0 <commit to revert>"
+    echo "$0 <rNNNNNN | git-hash>"
     exit 1
 fi
 
@@ -13,20 +13,27 @@
 fi
 
 COMMIT=$1
-
-SVN_REVISION=$(git svn find-rev "$COMMIT")
+OTHER=$(git svn find-rev "$COMMIT")
 if [ $? -ne 0 ]; then
-    echo "Error! Could not find an svn revision for commit $COMMIT!"
+    echo "Error! Could not find an svn/git revision for commit $COMMIT!"
     exit 1
 fi
 
+if [ -n "$(echo $COMMIT | grep '^r[0-9]\+')" ]; then
+  SVN=`echo $COMMIT | sed -e 's/^r//'`
+  GIT=$OTHER
+else
+  SVN=$OTHER
+  GIT=$COMMIT
+fi
+
 # Grab the one line message for our revert commit message.
-ONE_LINE_MSG=$(git log --oneline $COMMIT -1 | cut -f2- -d " ")
+ONE_LINE_MSG=$(git log --oneline $GIT -1 | cut -f2- -d " ")
 
 # Revert the commit.
-git revert --no-commit $COMMIT 2>/dev/null
+git revert --no-commit $GIT 2>/dev/null
 if [ $? -ne 0 ]; then
-    echo "Error! Failed to revert commit $COMMIT. Resetting to head."
+    echo "Error! Failed to revert commit r$SVN. Resetting to head."
     git reset --hard HEAD
     exit 1
 fi
@@ -36,13 +43,13 @@
 cat > $TEMPLATE <<EOF
 Revert "$ONE_LINE_MSG"
 
-This reverts commit r$SVN_REVISION.
+This reverts commit r$SVN.
 EOF
 
 # Begin the commit but give our user an opportunity to edit it.
 git commit --file="$TEMPLATE" --edit
 if [ $? -ne 0 ]; then
-    echo "Error! Failed to commit reverting commit for commit $COMMIT. Reverting to head."
+    echo "Error! Failed to commit reverting commit for commit r$SVN. Reverting to head."
     git reset --hard HEAD
     rm -rf $TEMPLATE
     exit 1

diff --git a/utils/lit/TODO b/utils/lit/TODO
index c1a60c6..90da327 100644
--- a/utils/lit/TODO
+++ b/utils/lit/TODO

@@ -156,8 +156,6 @@
 
 * Move temp directory name into local test config.
 
-* Add --show-unsupported, don't show by default?
-
 * Support valgrind in all configs, and LLVM style valgrind.
 
 * Support a timeout / ulimit.

diff --git a/utils/lit/lit/ProgressBar.py b/utils/lit/lit/ProgressBar.py
index e3644f1..3ad704d 100644
--- a/utils/lit/lit/ProgressBar.py
+++ b/utils/lit/lit/ProgressBar.py

@@ -6,8 +6,8 @@
 import sys, re, time
 
 def to_bytes(str):
-    # Encode to Latin1 to get binary data.
-    return str.encode('ISO-8859-1')
+    # Encode to UTF-8 to get binary data.
+    return str.encode('utf-8')
 
 class TerminalController:
     """
@@ -136,7 +136,7 @@
 
     def _tparm(self, arg, index):
         import curses
-        return curses.tparm(to_bytes(arg), index).decode('ascii') or ''
+        return curses.tparm(to_bytes(arg), index).decode('utf-8') or ''
 
     def _tigetstr(self, cap_name):
         # String capabilities can include "delays" of the form "$<2>".
@@ -147,7 +147,7 @@
         if cap is None:
             cap = ''
         else:
-            cap = cap.decode('ascii')
+            cap = cap.decode('utf-8')
         return re.sub(r'\$<\d+>[/*]?', '', cap)
 
     def render(self, template):

diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 9752417..1a2df20 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py

@@ -144,13 +144,16 @@
                     named_temp_files.append(f.name)
                     args[i] = f.name
 
-        procs.append(subprocess.Popen(args, cwd=cwd,
-                                      executable = executable,
-                                      stdin = stdin,
-                                      stdout = stdout,
-                                      stderr = stderr,
-                                      env = cfg.environment,
-                                      close_fds = kUseCloseFDs))
+        try:
+            procs.append(subprocess.Popen(args, cwd=cwd,
+                                          executable = executable,
+                                          stdin = stdin,
+                                          stdout = stdout,
+                                          stderr = stderr,
+                                          env = cfg.environment,
+                                          close_fds = kUseCloseFDs))
+        except OSError as e:
+            raise InternalShellError(j, 'Could not create process due to {}'.format(e))
 
         # Immediately close stdin for any process taking stdin from us.
         if stdin == subprocess.PIPE:
@@ -192,6 +195,11 @@
         f.seek(0, 0)
         procData[i] = (procData[i][0], f.read())
 
+    def to_string(bytes):
+        # Native str values pass through untouched; anything else (e.g. a
+        # Python 2 unicode object) is encoded as UTF-8.
+        return bytes if isinstance(bytes, str) else bytes.encode('utf-8')
+
     exitCode = None
     for i,(out,err) in enumerate(procData):
         res = procs[i].wait()
@@ -201,11 +209,11 @@
 
         # Ensure the resulting output is always of string type.
         try:
-            out = str(out.decode('ascii'))
+            out = to_string(out.decode('utf-8'))
         except:
             out = str(out)
         try:
-            err = str(err.decode('ascii'))
+            err = to_string(err.decode('utf-8'))
         except:
             err = str(err)
 
@@ -314,13 +322,18 @@
     # Python2 and bytes in Python3.
     #
     # Once we find a match, we do require each script line to be decodable to
-    # ascii, so we convert the outputs to ascii before returning. This way the
+    # UTF-8, so we convert the outputs to UTF-8 before returning. This way the
     # remaining code can work with "strings" agnostic of the executing Python
     # version.
     
     def to_bytes(str):
-        # Encode to Latin1 to get binary data.
-        return str.encode('ISO-8859-1')
+        # Encode to UTF-8 to get binary data.
+        return str.encode('utf-8')
+    def to_string(bytes):
+        # Values that are already str are returned as-is; anything else is
+        # run through to_bytes, i.e. encoded as UTF-8.
+        return bytes if isinstance(bytes, str) else to_bytes(bytes)
+        
     keywords = ('RUN:', 'XFAIL:', 'REQUIRES:', 'END.')
     keywords_re = re.compile(
         to_bytes("(%s)(.*)\n" % ("|".join(k for k in keywords),)))
@@ -330,6 +343,10 @@
         # Read the entire file contents.
         data = f.read()
 
+        # Ensure the data ends with a newline.
+        if not data.endswith(to_bytes('\n')):
+            data = data + to_bytes('\n')
+
         # Iterate over the matches.
         line_number = 1
         last_match_position = 0
@@ -341,13 +358,13 @@
                                       match_position)
             last_match_position = match_position
 
-            # Convert the keyword and line to ascii strings and yield the
+            # Convert the keyword and line to UTF-8 strings and yield the
             # command. Note that we take care to return regular strings in
             # Python 2, to avoid other code having to differentiate between the
             # str and unicode types.
             keyword,ln = match.groups()
-            yield (line_number, str(keyword[:-1].decode('ascii')),
-                   str(ln.decode('ascii')))
+            yield (line_number, to_string(keyword[:-1].decode('utf-8')),
+                   to_string(ln.decode('utf-8')))
     finally:
         f.close()
 

diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py
index eb89067..4cd9486 100644
--- a/utils/lit/lit/TestingConfig.py
+++ b/utils/lit/lit/TestingConfig.py

@@ -17,15 +17,16 @@
         """
         # Set the environment based on the command line arguments.
         environment = {
-            'LIBRARY_PATH' : os.environ.get('LIBRARY_PATH',''),
-            'LD_LIBRARY_PATH' : os.environ.get('LD_LIBRARY_PATH',''),
             'PATH' : os.pathsep.join(litConfig.path +
                                      [os.environ.get('PATH','')]),
-            'SYSTEMROOT' : os.environ.get('SYSTEMROOT',''),
-            'TERM' : os.environ.get('TERM',''),
             'LLVM_DISABLE_CRASH_REPORT' : '1',
             }
 
+        pass_vars = ['LIBRARY_PATH', 'LD_LIBRARY_PATH', 'SYSTEMROOT', 'TERM',
+                     'LD_PRELOAD', 'ASAN_OPTIONS', 'UBSAN_OPTIONS']
+        for var in pass_vars:
+            environment[var] = os.environ.get(var, '')
+
         if sys.platform == 'win32':
             environment.update({
                     'INCLUDE' : os.environ.get('INCLUDE',''),

diff --git a/utils/lit/lit/__init__.py b/utils/lit/lit/__init__.py
index 46fa82d..c1bd76b 100644
--- a/utils/lit/lit/__init__.py
+++ b/utils/lit/lit/__init__.py

@@ -5,7 +5,7 @@
 
 __author__ = 'Daniel Dunbar'
 __email__ = 'daniel@zuster.org'
-__versioninfo__ = (0, 4, 0)
+__versioninfo__ = (0, 5, 0)
 __version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev'
 
 __all__ = []

diff --git a/utils/lit/lit/discovery.py b/utils/lit/lit/discovery.py
index 876d4f3..4befe58 100644
--- a/utils/lit/lit/discovery.py
+++ b/utils/lit/lit/discovery.py

@@ -91,7 +91,7 @@
 
         # Otherwise, copy the current config and load the local configuration
         # file into it.
-        config = copy.copy(parent)
+        config = copy.deepcopy(parent)
         if litConfig.debug:
             litConfig.note('loading local config %r' % cfgpath)
         config.load_from_path(cfgpath, litConfig)

diff --git a/utils/lit/lit/formats/googletest.py b/utils/lit/lit/formats/googletest.py
index 3d14b729..1b5b785 100644
--- a/utils/lit/lit/formats/googletest.py
+++ b/utils/lit/lit/formats/googletest.py

@@ -31,7 +31,7 @@
         try:
             lines = lit.util.capture([path, '--gtest_list_tests'],
                                      env=localConfig.environment)
-            lines = lines.decode('ascii')
+            lines = lines.decode('utf-8')
             if kIsWindows:
               lines = lines.replace('\r', '')
             lines = lines.split('\n')

diff --git a/utils/lit/lit/main.py b/utils/lit/lit/main.py
index c59651a..7343d24 100755
--- a/utils/lit/lit/main.py
+++ b/utils/lit/lit/main.py

@@ -42,8 +42,9 @@
             self.progressBar.update(float(self.completed)/self.numTests,
                                     test.getFullName())
 
-        if not test.result.code.isFailure and \
-                (self.opts.quiet or self.opts.succinct):
+        shouldShow = test.result.code.isFailure or \
+            (not self.opts.quiet and not self.opts.succinct)
+        if not shouldShow:
             return
 
         if self.progressBar:
@@ -168,6 +169,12 @@
     group.add_option("", "--no-progress-bar", dest="useProgressBar",
                      help="Do not use curses based progress bar",
                      action="store_false", default=True)
+    group.add_option("", "--show-unsupported", dest="show_unsupported",
+                     help="Show unsupported tests",
+                     action="store_true", default=False)
+    group.add_option("", "--show-xfail", dest="show_xfail",
+                     help="Show tests that were expected to fail",
+                     action="store_true", default=False)
     parser.add_option_group(group)
 
     group = OptionGroup(parser, "Test Execution")
@@ -382,7 +389,12 @@
     # Print each test in any of the failing groups.
     for title,code in (('Unexpected Passing Tests', lit.Test.XPASS),
                        ('Failing Tests', lit.Test.FAIL),
-                       ('Unresolved Tests', lit.Test.UNRESOLVED)):
+                       ('Unresolved Tests', lit.Test.UNRESOLVED),
+                       ('Unsupported Tests', lit.Test.UNSUPPORTED),
+                       ('Expected Failing Tests', lit.Test.XFAIL)):
+        if (lit.Test.XFAIL == code and not opts.show_xfail) or \
+           (lit.Test.UNSUPPORTED == code and not opts.show_unsupported):
+            continue
         elts = byCode.get(code)
         if not elts:
             continue
@@ -403,7 +415,7 @@
                       ('Unsupported Tests  ', lit.Test.UNSUPPORTED),
                       ('Unresolved Tests   ', lit.Test.UNRESOLVED),
                       ('Unexpected Passes  ', lit.Test.XPASS),
-                      ('Unexpected Failures', lit.Test.FAIL),):
+                      ('Unexpected Failures', lit.Test.FAIL)):
         if opts.quiet and not code.isFailure:
             continue
         N = len(byCode.get(code,[]))

diff --git a/utils/lit/lit/util.py b/utils/lit/lit/util.py
index 72a8b48..cce620c 100644
--- a/utils/lit/lit/util.py
+++ b/utils/lit/lit/util.py

@@ -156,13 +156,18 @@
     if exitCode == -signal.SIGINT:
         raise KeyboardInterrupt
 
+    def to_string(bytes):
+        if not isinstance(bytes, str):
+            bytes = bytes.encode('utf-8')
+        return bytes
+
     # Ensure the resulting output is always of string type.
     try:
-        out = str(out.decode('ascii'))
+        out = to_string(out.decode('utf-8'))
     except:
         out = str(out)
     try:
-        err = str(err.decode('ascii'))
+        err = to_string(err.decode('utf-8'))
     except:
         err = str(err)
 

diff --git a/utils/lldbDataFormatters.py b/utils/lldbDataFormatters.py
index 352448d..f570fb4 100644
--- a/utils/lldbDataFormatters.py
+++ b/utils/lldbDataFormatters.py

@@ -12,6 +12,9 @@
     debugger.HandleCommand('type synthetic add -w llvm '
                            '-l lldbDataFormatters.SmallVectorSynthProvider '
                            '-x "^llvm::SmallVector<.+,.+>$"')
+    debugger.HandleCommand('type synthetic add -w llvm '
+                           '-l lldbDataFormatters.ArrayRefSynthProvider '
+                           '-x "^llvm::ArrayRef<.+>$"')
 
 # Pretty printer for llvm::SmallVector/llvm::SmallVectorImpl
 class SmallVectorSynthProvider:
@@ -53,3 +56,33 @@
         self.data_type = the_type.GetTemplateArgumentType(0)
         self.type_size = self.data_type.GetByteSize()
         assert self.type_size != 0
+
+class ArrayRefSynthProvider:
+    """Synthetic children provider exposing llvm::ArrayRef elements."""
+    def __init__(self, valobj, dict):
+        self.valobj = valobj
+        self.update()  # populate the data/length/type fields up front
+
+    def num_children(self):
+        return self.length
+
+    def get_child_index(self, name):
+        try:
+            return int(name.lstrip('[').rstrip(']'))
+        except:
+            return -1  # name was not a "[N]"-style index
+
+    def get_child_at_index(self, index):
+        if not (0 <= index < self.num_children()):
+            return None
+        child_name = '[' + str(index) + ']'
+        return self.data.CreateChildAtOffset(child_name, index * self.type_size,
+                                             self.data_type)
+
+    def update(self):
+        self.data = self.valobj.GetChildMemberWithName('Data')
+        length_member = self.valobj.GetChildMemberWithName('Length')
+        self.length = length_member.GetValueAsUnsigned(0)
+        self.data_type = self.data.GetType().GetPointeeType()
+        self.type_size = self.data_type.GetByteSize()
+        assert self.type_size != 0

diff --git a/utils/llvm-build/llvmbuild/main.py b/utils/llvm-build/llvmbuild/main.py
index 37aa5d8..353741f 100644
--- a/utils/llvm-build/llvmbuild/main.py
+++ b/utils/llvm-build/llvmbuild/main.py

@@ -1,4 +1,5 @@
 from __future__ import absolute_import
+import filecmp
 import os
 import sys
 
@@ -41,7 +42,7 @@
     """
     mk_quote_string_for_target(target_name) -> str
 
-    Return a quoted form of the given target_name suitable for including in a 
+    Return a quoted form of the given target_name suitable for including in a
     Makefile as a target name.
     """
 
@@ -340,7 +341,7 @@
             # Compute the llvm-config "component name". For historical reasons,
             # this is lowercased based on the library name.
             llvmconfig_component_name = c.get_llvmconfig_component_name()
-            
+
             # Get the library name, or None for LibraryGroups.
             if c.type_name == 'Library' or c.type_name == 'OptionalLibrary':
                 library_name = c.get_prefixed_library_name()
@@ -382,7 +383,7 @@
 
         # Write out the library table.
         make_install_dir(os.path.dirname(output_path))
-        f = open(output_path, 'w')
+        f = open(output_path+'.new', 'w')
         f.write("""\
 //===- llvm-build generated file --------------------------------*- C++ -*-===//
 //
@@ -420,6 +421,14 @@
         f.write('};\n')
         f.close()
 
+        if not os.path.isfile(output_path):
+            os.rename(output_path+'.new', output_path)
+        elif filecmp.cmp(output_path, output_path+'.new'):
+            os.remove(output_path+'.new')
+        else:
+            os.remove(output_path)
+            os.rename(output_path+'.new', output_path)
+
     def get_required_libraries_for_component(self, ci, traverse_groups = False):
         """
         get_required_libraries_for_component(component_info) -> iter
@@ -430,14 +439,14 @@
         traversed to include their required libraries.
         """
 
-        assert ci.type_name in ('Library', 'LibraryGroup', 'TargetGroup')
+        assert ci.type_name in ('Library', 'OptionalLibrary', 'LibraryGroup', 'TargetGroup')
 
         for name in ci.required_libraries:
             # Get the dependency info.
             dep = self.component_info_map[name]
 
             # If it is a library, yield it.
-            if dep.type_name == 'Library':
+            if dep.type_name == 'Library' or dep.type_name == 'OptionalLibrary':
                 yield dep
                 continue
 
@@ -492,7 +501,7 @@
             if (path.startswith(self.source_root) and os.path.exists(path)):
                 yield path
 
-    def write_cmake_fragment(self, output_path):
+    def write_cmake_fragment(self, output_path, enabled_optional_components):
         """
         write_cmake_fragment(output_path) -> None
 
@@ -561,8 +570,13 @@
 # names to required libraries, in a way that is easily accessed from CMake.
 """)
         for ci in self.ordered_component_infos:
-            # We only write the information for libraries currently.
-            if ci.type_name != 'Library':
+            # Skip optional components which are not enabled.
+            if ci.type_name == 'OptionalLibrary' \
+                and ci.name not in enabled_optional_components:
+                continue
+
+            # We only write the information for certain components currently.
+            if ci.type_name not in ('Library', 'OptionalLibrary'):
                 continue
 
             f.write("""\
@@ -573,7 +587,7 @@
 
         f.close()
 
-    def write_cmake_exports_fragment(self, output_path):
+    def write_cmake_exports_fragment(self, output_path, enabled_optional_components):
         """
         write_cmake_exports_fragment(output_path) -> None
 
@@ -595,8 +609,13 @@
 # dependencies of libraries imported from LLVM.
 """)
         for ci in self.ordered_component_infos:
+            # Skip optional components which are not enabled.
+            if ci.type_name == 'OptionalLibrary' \
+                and ci.name not in enabled_optional_components:
+                continue
+
             # We only write the information for libraries currently.
-            if ci.type_name != 'Library':
+            if ci.type_name not in ('Library', 'OptionalLibrary'):
                 continue
 
             # Skip disabled targets.
@@ -783,7 +802,7 @@
     # If we have a native target with a JIT, use that for the engine. Otherwise,
     # use the interpreter.
     if native_target and native_target.enabled and native_target.has_jit:
-        engine_group.required_libraries.append('JIT')
+        engine_group.required_libraries.append('MCJIT')
         engine_group.required_libraries.append(native_group.name)
     else:
         engine_group.required_libraries.append('Interpreter')
@@ -905,9 +924,11 @@
 
     # Write out the cmake fragment, if requested.
     if opts.write_cmake_fragment:
-        project_info.write_cmake_fragment(opts.write_cmake_fragment)
+        project_info.write_cmake_fragment(opts.write_cmake_fragment,
+                                          opts.optional_components)
     if opts.write_cmake_exports_fragment:
-        project_info.write_cmake_exports_fragment(opts.write_cmake_exports_fragment)
+        project_info.write_cmake_exports_fragment(opts.write_cmake_exports_fragment,
+                                                  opts.optional_components)
 
     # Configure target definition files, if requested.
     if opts.configure_target_def_files:

diff --git a/utils/not/not.cpp b/utils/not/not.cpp
index a5c7183..2adeded 100644
--- a/utils/not/not.cpp
+++ b/utils/not/not.cpp

@@ -27,10 +27,15 @@
   if (argc == 0)
     return 1;
 
-  std::string Program = sys::FindProgramByName(argv[0]);
+  auto Program = sys::findProgramByName(argv[0]);
+  if (!Program) {
+    errs() << "Error: Unable to find `" << argv[0]
+           << "' in PATH: " << Program.getError().message() << "\n";
+    return 1;
+  }
 
   std::string ErrMsg;
-  int Result = sys::ExecuteAndWait(Program, argv, nullptr, nullptr, 0, 0,
+  int Result = sys::ExecuteAndWait(*Program, argv, nullptr, nullptr, 0, 0,
                                    &ErrMsg);
 #ifdef _WIN32
   // Handle abort() in msvcrt -- It has exit code as 3.  abort(), aka

diff --git a/utils/release/export.sh b/utils/release/export.sh
index f25a193..38e5a81 100755
--- a/utils/release/export.sh
+++ b/utils/release/export.sh

@@ -14,7 +14,7 @@
 
 set -e
 
-projects="llvm cfe dragonegg test-suite compiler-rt libcxx clang-tools-extra polly lldb"
+projects="llvm cfe dragonegg test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp"
 base_url="https://llvm.org/svn/llvm-project"
 
 release=""
@@ -44,7 +44,7 @@
             $proj-$release$rc.src
 
         echo "Creating tarball ..."
-        tar cfz $proj-$release$rc.src.tar.gz $proj-$release$rc.src
+        tar cfJ $proj-$release$rc.src.tar.xz $proj-$release$rc.src
     done
 }
 

diff --git a/utils/release/tag.sh b/utils/release/tag.sh
index 6c5039d..390acaf 100755
--- a/utils/release/tag.sh
+++ b/utils/release/tag.sh

@@ -17,7 +17,7 @@
 release=""
 rc=""
 rebranch="no"
-projects="llvm cfe dragonegg test-suite compiler-rt libcxx clang-tools-extra polly lldb lld"
+projects="llvm cfe dragonegg test-suite compiler-rt libcxx libcxxabi clang-tools-extra polly lldb lld openmp"
 
 base_url="https://llvm.org/svn/llvm-project"
 
@@ -34,16 +34,16 @@
 function tag_version() {
     set -x
     for proj in  $projects; do
-        if svn ls $base_url/$proj/branches/release_$release > /dev/null 2>&1 ; then
+        if svn ls $base_url/$proj/branches/release_$branch_release > /dev/null 2>&1 ; then
             if [ $rebranch = "no" ]; then
                 continue
             fi
-            svn remove -m "Removing old release_$release branch for rebranching." \
-                $base_url/$proj/branches/release_$release
+            svn remove -m "Removing old release_$branch_release branch for rebranching." \
+                $base_url/$proj/branches/release_$branch_release
         fi
-        svn copy -m "Creating release_$release branch" \
+        svn copy -m "Creating release_$branch_release branch" \
             $base_url/$proj/trunk \
-            $base_url/$proj/branches/release_$release
+            $base_url/$proj/branches/release_$branch_release
     done
     set +x
 }
@@ -51,13 +51,13 @@
 function tag_release_candidate() {
     set -x
     for proj in $projects ; do
-        if ! svn ls $base_url/$proj/tags/RELEASE_$release > /dev/null 2>&1 ; then
-            svn mkdir -m "Creating release directory for release_$release." $base_url/$proj/tags/RELEASE_$release
+        if ! svn ls $base_url/$proj/tags/RELEASE_$tag_release > /dev/null 2>&1 ; then
+            svn mkdir -m "Creating release directory for release_$tag_release." $base_url/$proj/tags/RELEASE_$tag_release
         fi
-        if ! svn ls $base_url/$proj/tags/RELEASE_$release/$rc > /dev/null 2>&1 ; then
-            svn copy -m "Creating release candidate $rc from release_$release branch" \
-                $base_url/$proj/branches/release_$release \
-                $base_url/$proj/tags/RELEASE_$release/$rc
+        if ! svn ls $base_url/$proj/tags/RELEASE_$tag_release/$rc > /dev/null 2>&1 ; then
+            svn copy -m "Creating release candidate $rc from release_$tag_release branch" \
+                $base_url/$proj/branches/release_$branch_release \
+                $base_url/$proj/tags/RELEASE_$tag_release/$rc
         fi
     done
     set +x
@@ -99,7 +99,8 @@
     exit 1
 fi
 
-release=`echo $release | sed -e 's,\.,,g'`
+branch_release=`echo $release | sed -e 's,\([0-9]*\.[0-9]*\).*,\1,' | sed -e 's,\.,,g'`
+tag_release=`echo $release | sed -e 's,\.,,g'`
 
 if [ "x$rc" = "x" ]; then
     tag_version

diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index b4d7689..b028924 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh

@@ -18,7 +18,7 @@
     MAKE=make
 fi
 
-projects="llvm cfe dragonegg compiler-rt libcxx test-suite clang-tools-extra"
+projects="llvm cfe dragonegg compiler-rt libcxx libcxxabi test-suite clang-tools-extra"
 
 # Base SVN URL for the sources.
 Base_url="http://llvm.org/svn/llvm-project"
@@ -67,7 +67,7 @@
         -release | --release )
             shift
             Release="$1"
-            Release_no_dot="`echo $1 | sed -e 's,\.,,'`"
+            Release_no_dot="`echo $1 | sed -e 's,\.,,g'`"
             ;;
         -rc | --rc | -RC | --RC )
             shift
@@ -260,6 +260,9 @@
     if [ ! -h libcxx ]; then
         ln -s ../../libcxx.src libcxx
     fi
+    if [ ! -h libcxxabi ]; then
+        ln -s ../../libcxxabi.src libcxxabi
+    fi
     cd $BuildDir
 }
 

diff --git a/utils/shuffle_fuzz.py b/utils/shuffle_fuzz.py
new file mode 100755
index 0000000..384a93a
--- /dev/null
+++ b/utils/shuffle_fuzz.py

@@ -0,0 +1,256 @@
+#!/usr/bin/env python
+
+"""A shuffle vector fuzz tester.
+
+This is a python program to fuzz test the LLVM shufflevector instruction. It
+generates a function with a random sequence of shufflevectors, maintaining the
+element mapping accumulated across the function. It then generates a main
+function which calls it with a different value in each element and checks that
+the result matches the expected mapping.
+
+Take the output IR printed to stdout, compile it to an executable using whatever
+set of transforms you want to test, and run the program. If it crashes, it found
+a bug.
+"""
+
+import argparse
+import itertools
+import random
+import sys
+import uuid
+
+def main():
+  element_types=['i8', 'i16', 'i32', 'i64', 'f32', 'f64']
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument('-v', '--verbose', action='store_true',
+                      help='Show verbose output')
+  parser.add_argument('--seed', default=str(uuid.uuid4()),
+                      help='A string used to seed the RNG')
+  parser.add_argument('--max-shuffle-height', type=int, default=16,
+                      help='Specify a fixed height of shuffle tree to test')
+  parser.add_argument('--no-blends', dest='blends', action='store_false',
+                      help='Include blends of two input vectors')
+  parser.add_argument('--fixed-bit-width', type=int, choices=[128, 256],
+                      help='Specify a fixed bit width of vector to test')
+  parser.add_argument('--fixed-element-type', choices=element_types,
+                      help='Specify a fixed element type to test')
+  parser.add_argument('--triple',
+                      help='Specify a triple string to include in the IR')
+  args = parser.parse_args()
+
+  random.seed(args.seed)
+
+  if args.fixed_element_type is not None:
+    element_types=[args.fixed_element_type]
+
+  if args.fixed_bit_width is not None:
+    if args.fixed_bit_width == 128:
+      width_map={'i64': 2, 'i32': 4, 'i16': 8, 'i8': 16, 'f64': 2, 'f32': 4}
+      (width, element_type) = random.choice(
+          [(width_map[t], t) for t in element_types])
+    elif args.fixed_bit_width == 256:
+      width_map={'i64': 4, 'i32': 8, 'i16': 16, 'i8': 32, 'f64': 4, 'f32': 8}
+      (width, element_type) = random.choice(
+          [(width_map[t], t) for t in element_types])
+    else:
+      sys.exit(1) # Checked above by argument parsing.
+  else:
+    width = random.choice([2, 4, 8, 16, 32, 64])
+    element_type = random.choice(element_types)
+
+  element_modulus = {
+      'i8': 1 << 8, 'i16': 1 << 16, 'i32': 1 << 32, 'i64': 1 << 64,
+      'f32': 1 << 32, 'f64': 1 << 64}[element_type]
+
+  shuffle_range = (2 * width) if args.blends else width
+
+  # Because undef (-1) saturates and is indistinguishable when testing the
+  # correctness of a shuffle, we want to bias our fuzz toward having a decent
+  # mixture of non-undef lanes in the end. With a deep shuffle tree, the
+  # probabilities aren't good so we need to bias things. The math here is that
+  # if we uniformly select between -1 and the other inputs, each element of the
+  # result will have the following probability of being undef:
+  #
+  #   1 - (shuffle_range/(shuffle_range+1))^max_shuffle_height
+  #
+  # More generally, for any probability P of selecting a defined element in
+  # a single shuffle, the end result is:
+  #
+  #   1 - P^max_shuffle_height
+  #
+  # The power of the shuffle height is the real problem, as we want:
+  #
+  #   1 - shuffle_range/(shuffle_range+1)
+  #
+  # So we bias the selection of undef at any given node based on the tree
+  # height. Below, let 'A' be 'shuffle_range', 'C' be 'max_shuffle_height',
+  # and 'B' be the bias we use to compensate for 'C',
+  # '((A+1)*A^(1/C))/(A*(A+1)^(1/C))', which solves:
+  #
+  #   1 - ((B * A)/(A + 1))^C = 1 - A/(A + 1)
+  #
+  # So at each node we use:
+  #
+  #   1 - (B * A)/(A + 1)
+  # = 1 - ((A + 1) * A * A^(1/C))/(A * (A + 1) * (A + 1)^(1/C))
+  # = 1 - ((A + 1) * A^((C + 1)/C))/(A * (A + 1)^((C + 1)/C))
+  #
+  # This is the formula we use to select undef lanes in the shuffle.
+  A = float(shuffle_range)
+  C = float(args.max_shuffle_height)
+  undef_prob = 1.0 - (((A + 1.0) * pow(A, (C + 1.0)/C)) /
+                      (A * pow(A + 1.0, (C + 1.0)/C)))
+
+  shuffle_tree = [[[-1 if random.random() <= undef_prob
+                       else random.choice(range(shuffle_range))
+                    for _ in itertools.repeat(None, width)]
+                   for _ in itertools.repeat(None, args.max_shuffle_height - i)]
+                  for i in xrange(args.max_shuffle_height)]
+
+  if args.verbose:
+    # Print out the shuffle sequence in a compact form.
+    print >>sys.stderr, ('Testing shuffle sequence "%s" (v%d%s):' %
+                         (args.seed, width, element_type))
+    for i, shuffles in enumerate(shuffle_tree):
+      print >>sys.stderr, '  tree level %d:' % (i,)
+      for j, s in enumerate(shuffles):
+        print >>sys.stderr, '    shuffle %d: %s' % (j, s)
+    print >>sys.stderr, ''
+
+  # Symbolically evaluate the shuffle tree.
+  inputs = [[int(j % element_modulus)
+             for j in xrange(i * width + 1, (i + 1) * width + 1)]
+            for i in xrange(args.max_shuffle_height + 1)]
+  results = inputs
+  for shuffles in shuffle_tree:
+    results = [[((results[i] if j < width else results[i + 1])[j % width]
+                 if j != -1 else -1)
+                for j in s]
+               for i, s in enumerate(shuffles)]
+  if len(results) != 1:
+    print >>sys.stderr, 'ERROR: Bad results: %s' % (results,)
+    sys.exit(1)
+  result = results[0]
+
+  if args.verbose:
+    print >>sys.stderr, 'Which transforms:'
+    print >>sys.stderr, '  from: %s' % (inputs,)
+    print >>sys.stderr, '  into: %s' % (result,)
+    print >>sys.stderr, ''
+
+  # The IR uses silly names for floating point types. We also need a same-size
+  # integer type.
+  integral_element_type = element_type
+  if element_type == 'f32':
+    integral_element_type = 'i32'
+    element_type = 'float'
+  elif element_type == 'f64':
+    integral_element_type = 'i64'
+    element_type = 'double'
+
+  # Now we need to generate IR for the shuffle function.
+  subst = {'N': width, 'T': element_type, 'IT': integral_element_type}
+  print """
+define internal fastcc <%(N)d x %(T)s> @test(%(arguments)s) noinline nounwind {
+entry:""" % dict(subst,
+                 arguments=', '.join(
+                     ['<%(N)d x %(T)s> %%s.0.%(i)d' % dict(subst, i=i)
+                      for i in xrange(args.max_shuffle_height + 1)]))
+
+  for i, shuffles in enumerate(shuffle_tree):
+   for j, s in enumerate(shuffles):
+    print """
+  %%s.%(next_i)d.%(j)d = shufflevector <%(N)d x %(T)s> %%s.%(i)d.%(j)d, <%(N)d x %(T)s> %%s.%(i)d.%(next_j)d, <%(N)d x i32> <%(S)s>
+""".strip('\n') % dict(subst, i=i, next_i=i + 1, j=j, next_j=j + 1,
+                       S=', '.join(['i32 ' + (str(si) if si != -1 else 'undef')
+                                    for si in s]))
+
+  print """
+  ret <%(N)d x %(T)s> %%s.%(i)d.0
+}
+""" % dict(subst, i=len(shuffle_tree))
+
+  # Generate some string constants that we can use to report errors.
+  for i, r in enumerate(result):
+    if r != -1:
+      s = ('FAIL(%(seed)s): lane %(lane)d, expected %(result)d, found %%d\\0A' %
+           {'seed': args.seed, 'lane': i, 'result': r})
+      s += ''.join(['\\00' for _ in itertools.repeat(None, 128 - len(s) + 2)])
+      print """
+@error.%(i)d = private unnamed_addr global [128 x i8] c"%(s)s"
+""".strip() % {'i': i, 's': s}
+
+  # Define a wrapper function which is marked 'optnone' to prevent
+  # interprocedural optimizations from deleting the test.
+  print """
+define internal fastcc <%(N)d x %(T)s> @test_wrapper(%(arguments)s) optnone noinline {
+  %%result = call fastcc <%(N)d x %(T)s> @test(%(arguments)s)
+  ret <%(N)d x %(T)s> %%result
+}
+""" % dict(subst,
+           arguments=', '.join(['<%(N)d x %(T)s> %%s.%(i)d' % dict(subst, i=i)
+                                for i in xrange(args.max_shuffle_height + 1)]))
+
+  # Finally, generate a main function which will trap if any lanes are mapped
+  # incorrectly (in an observable way).
+  print """
+define i32 @main() {
+entry:
+  ; Create a scratch space to print error messages.
+  %%str = alloca [128 x i8]
+  %%str.ptr = getelementptr inbounds [128 x i8]* %%str, i32 0, i32 0
+
+  ; Build the input vector and call the test function.
+  %%v = call fastcc <%(N)d x %(T)s> @test_wrapper(%(inputs)s)
+  ; We need to cast this back to an integer type vector to easily check the
+  ; result.
+  %%v.cast = bitcast <%(N)d x %(T)s> %%v to <%(N)d x %(IT)s>
+  br label %%test.0
+""" % dict(subst,
+           inputs=', '.join(
+               [('<%(N)d x %(T)s> bitcast '
+                 '(<%(N)d x %(IT)s> <%(input)s> to <%(N)d x %(T)s>)' %
+                 dict(subst, input=', '.join(['%(IT)s %(i)d' % dict(subst, i=i)
+                                              for i in input])))
+                for input in inputs]))
+
+  # Test that each non-undef result lane contains the expected value.
+  for i, r in enumerate(result):
+    if r == -1:
+      print """
+test.%(i)d:
+  ; Skip this lane, its value is undef.
+  br label %%test.%(next_i)d
+""" % dict(subst, i=i, next_i=i + 1)
+    else:
+      print """
+test.%(i)d:
+  %%v.%(i)d = extractelement <%(N)d x %(IT)s> %%v.cast, i32 %(i)d
+  %%cmp.%(i)d = icmp ne %(IT)s %%v.%(i)d, %(r)d
+  br i1 %%cmp.%(i)d, label %%die.%(i)d, label %%test.%(next_i)d
+
+die.%(i)d:
+  ; Capture the actual value and print an error message.
+  %%tmp.%(i)d = zext %(IT)s %%v.%(i)d to i2048
+  %%bad.%(i)d = trunc i2048 %%tmp.%(i)d to i32
+  call i32 (i8*, i8*, ...)* @sprintf(i8* %%str.ptr, i8* getelementptr inbounds ([128 x i8]* @error.%(i)d, i32 0, i32 0), i32 %%bad.%(i)d)
+  %%length.%(i)d = call i32 @strlen(i8* %%str.ptr)
+  %%size.%(i)d = add i32 %%length.%(i)d, 1
+  call i32 @write(i32 2, i8* %%str.ptr, i32 %%size.%(i)d)
+  call void @llvm.trap()
+  unreachable
+""" % dict(subst, i=i, next_i=i + 1, r=r)
+
+  print """
+test.%d:
+  ret i32 0
+}
+
+declare i32 @strlen(i8*)
+declare i32 @write(i32, i8*, i32)
+declare i32 @sprintf(i8*, i8*, ...)
+declare void @llvm.trap() noreturn nounwind
+""" % (len(result),)
+
+if __name__ == '__main__':
+  main()

diff --git a/utils/valgrind/x86_64-pc-linux-gnu.supp b/utils/valgrind/x86_64-pc-linux-gnu.supp
index c8e5cd0..d6af2dd 100644
--- a/utils/valgrind/x86_64-pc-linux-gnu.supp
+++ b/utils/valgrind/x86_64-pc-linux-gnu.supp

@@ -33,6 +33,14 @@
 }
 
 {
+   We don't care if bash leaks
+   Memcheck:Leak
+   fun:malloc
+   fun:xmalloc
+   obj:/bin/bash
+}
+
+{
    We don't care of cmp
    Memcheck:Cond
    obj:/usr/bin/cmp
@@ -52,6 +60,14 @@
 }
 
 {
+   We don't care if sed leaks
+   Memcheck:Leak
+   fun:calloc
+   fun:malloc
+   obj:/bin/sed
+}
+
+{
    We don't care about anything ld.so does.
    Memcheck:Cond
    obj:/lib/ld*.so

diff --git a/utils/vim/llvm.vim b/utils/vim/llvm.vim
index 2b91823..e8273dd 100644
--- a/utils/vim/llvm.vim
+++ b/utils/vim/llvm.vim

@@ -57,6 +57,7 @@
 syn keyword llvmKeyword unordered uwtable volatile weak weak_odr
 syn keyword llvmKeyword x86_fastcallcc x86_stdcallcc x86_thiscallcc x86_64_sysvcc
 syn keyword llvmKeyword x86_64_win64cc zeroext
+syn keyword llvmKeyword uselistorder uselistorder_bb
 
 " Obsolete keywords.
 syn keyword llvmError  getresult begin end

diff --git a/utils/yaml-bench/YAMLBench.cpp b/utils/yaml-bench/YAMLBench.cpp
index e88ce5d..8bd1ea1 100644
--- a/utils/yaml-bench/YAMLBench.cpp
+++ b/utils/yaml-bench/YAMLBench.cpp

@@ -192,15 +192,15 @@
         MemoryBuffer::getFileOrSTDIN(Input);
     if (!BufOrErr)
       return 1;
-    std::unique_ptr<MemoryBuffer> Buf = std::move(BufOrErr.get());
+    MemoryBuffer &Buf = *BufOrErr.get();
 
     llvm::SourceMgr sm;
     if (DumpTokens) {
-      yaml::dumpTokens(Buf->getBuffer(), outs());
+      yaml::dumpTokens(Buf.getBuffer(), outs());
     }
 
     if (DumpCanonical) {
-      yaml::Stream stream(Buf->getBuffer(), sm);
+      yaml::Stream stream(Buf.getBuffer(), sm);
       dumpStream(stream);
     }
   }
commit	37ed9c199ca639565f6ce88105f9e39e898d82d0	[log] [tgz]
author	Stephen Hines <srhines@google.com>	Mon Dec 01 14:51:49 2014 -0800
committer	Stephen Hines <srhines@google.com>	Tue Dec 02 16:08:10 2014 -0800
tree	8fb36d3910e3ee4c4e1b7422f4f017108efc52f5
parent	d2327b22152ced7bc46dc629fc908959e8a52d03 [diff]